From dc22f7c5b705e2a53bd10cc52fdb3f62d609ab6d Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Thu, 25 Sep 2025 15:11:41 +0800 Subject: [PATCH 01/20] sync efficientnet_b4 test --- tests/executables/_utils/get_num_devices.sh | 12 + .../_utils/global_environment_variables.sh | 12 + .../_utils/init_classification_torch.sh | 14 + tests/executables/_utils/install_pip_pkgs.sh | 18 ++ tests/executables/efficientnet/init_torch.sh | 7 + ...ain_efficientb4_imagenet_amp_dist_torch.sh | 51 +++ tests/prepare_dataset.sh | 97 ++++++ tests/prepare_python_environment.sh | 37 +++ tests/prepare_system_environment.sh | 86 +++++ tests/quick_build_environment.sh | 26 ++ tests/tools/dltest/README.md | 121 +++++++ tests/tools/dltest/dltest/__init__.py | 1 + tests/tools/dltest/dltest/cli/__init__.py | 0 tests/tools/dltest/dltest/cli/assert_cli.py | 209 ++++++++++++ tests/tools/dltest/dltest/cli/check_cli.py | 56 ++++ tests/tools/dltest/dltest/cli/entry_points.py | 35 ++ .../tools/dltest/dltest/cli/fetch_log_cli.py | 48 +++ .../dltest/dltest/cli/log_comparator_cli.py | 69 ++++ .../tools/dltest/dltest/cli/log_parser_cli.py | 35 ++ .../dltest/dltest/cli/model_validator_cli.py | 153 +++++++++ tests/tools/dltest/dltest/log_comparator.py | 101 ++++++ tests/tools/dltest/dltest/log_parser.py | 172 ++++++++++ .../dltest/dltest/model_compare_config.py | 306 ++++++++++++++++++ tests/tools/dltest/dltest/utils/__init__.py | 0 tests/tools/dltest/dltest/utils/base_cli.py | 44 +++ tests/tools/dltest/dltest/utils/get_env.py | 65 ++++ tests/tools/dltest/dltest/utils/misc.py | 41 +++ .../dltest/dltest/utils/real_tempfile.py | 64 ++++ .../dltest/dltest/utils/subprocess_tools.py | 84 +++++ .../dltest/dltest/utils/training_args.py | 87 +++++ tests/tools/dltest/setup.py | 27 ++ 31 files changed, 2078 insertions(+) create mode 100644 tests/executables/_utils/get_num_devices.sh create mode 100644 tests/executables/_utils/global_environment_variables.sh create mode 100644 tests/executables/_utils/init_classification_torch.sh create mode 100644 tests/executables/_utils/install_pip_pkgs.sh create mode 100644 tests/executables/efficientnet/init_torch.sh create mode 100644 tests/executables/efficientnet/train_efficientb4_imagenet_amp_dist_torch.sh create mode 100644 tests/prepare_dataset.sh create mode 100644 tests/prepare_python_environment.sh create mode 100644 tests/prepare_system_environment.sh create mode 100644 tests/quick_build_environment.sh create mode 100644 tests/tools/dltest/README.md create mode 100644 tests/tools/dltest/dltest/__init__.py create mode 100644 tests/tools/dltest/dltest/cli/__init__.py create mode 100644 tests/tools/dltest/dltest/cli/assert_cli.py create mode 100644 tests/tools/dltest/dltest/cli/check_cli.py create mode 100644 tests/tools/dltest/dltest/cli/entry_points.py create mode 100644 tests/tools/dltest/dltest/cli/fetch_log_cli.py create mode 100644 tests/tools/dltest/dltest/cli/log_comparator_cli.py create mode 100644 tests/tools/dltest/dltest/cli/log_parser_cli.py create mode 100644 tests/tools/dltest/dltest/cli/model_validator_cli.py create mode 100644 tests/tools/dltest/dltest/log_comparator.py create mode 100644 tests/tools/dltest/dltest/log_parser.py create mode 100644 tests/tools/dltest/dltest/model_compare_config.py create mode 100644 tests/tools/dltest/dltest/utils/__init__.py create mode 100644 tests/tools/dltest/dltest/utils/base_cli.py create mode 100644 tests/tools/dltest/dltest/utils/get_env.py create mode 100644 tests/tools/dltest/dltest/utils/misc.py create mode 100644 
tests/tools/dltest/dltest/utils/real_tempfile.py create mode 100644 tests/tools/dltest/dltest/utils/subprocess_tools.py create mode 100644 tests/tools/dltest/dltest/utils/training_args.py create mode 100644 tests/tools/dltest/setup.py diff --git a/tests/executables/_utils/get_num_devices.sh b/tests/executables/_utils/get_num_devices.sh new file mode 100644 index 000000000..f0701ce8d --- /dev/null +++ b/tests/executables/_utils/get_num_devices.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +devices=$CUDA_VISIBLE_DEVICES +if [ -n "$devices" ]; then + _devices=(${devices//,/ }) + num_devices=${#_devices[@]} +else + num_devices=2 + export CUDA_VISIBLE_DEVICES=0,1 + echo "Not found CUDA_VISIBLE_DEVICES, set nproc_per_node = ${num_devices}" +fi +export IX_NUM_CUDA_VISIBLE_DEVICES=${num_devices} diff --git a/tests/executables/_utils/global_environment_variables.sh b/tests/executables/_utils/global_environment_variables.sh new file mode 100644 index 000000000..4be552ede --- /dev/null +++ b/tests/executables/_utils/global_environment_variables.sh @@ -0,0 +1,12 @@ +export PROJECT_DIR=../../ +export DRT_MEMCPYUSEKERNEL=20000000000 +: ${RUN_MODE:="strict"} +: ${NONSTRICT_EPOCH:=5} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} diff --git a/tests/executables/_utils/init_classification_torch.sh b/tests/executables/_utils/init_classification_torch.sh new file mode 100644 index 000000000..fce2ca6a9 --- /dev/null +++ b/tests/executables/_utils/init_classification_torch.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ -n "$1" ]; then + _UTILS_DIR=$1 +else + _UTILS_DIR='../_utils' +fi + +# Install packages +. $_UTILS_DIR/install_pip_pkgs.sh + +pkgs=('scipy' 'easydict' 'tqdm') + +install_pip_pkgs "${pkgs[@]}" \ No newline at end of file diff --git a/tests/executables/_utils/install_pip_pkgs.sh b/tests/executables/_utils/install_pip_pkgs.sh new file mode 100644 index 000000000..e49841657 --- /dev/null +++ b/tests/executables/_utils/install_pip_pkgs.sh @@ -0,0 +1,18 @@ +#!/bin/bash +PIPCMD=pip3 +: ${PKGS_CACHE_DIR:="__null__"} + +function install_pip_pkgs() { + for pkg in "$@" + do + if [ ! -d $PKGS_CACHE_DIR ]; then + $PIPCMD install $pkg + else + $PIPCMD install --no-index --find-links=$PKGS_CACHE_DIR $pkg + fi + done +} + +# Exeample +# pkgs=(1 2 3) +# install_pip_pkgs "${pkgs[@]}" \ No newline at end of file diff --git a/tests/executables/efficientnet/init_torch.sh b/tests/executables/efficientnet/init_torch.sh new file mode 100644 index 000000000..16ec233f5 --- /dev/null +++ b/tests/executables/efficientnet/init_torch.sh @@ -0,0 +1,7 @@ +bash ../_utils/init_classification_torch.sh ../_utils + +if [ "$?" 
!= "0" ]; then + exit 1 +fi + +exit 0 \ No newline at end of file diff --git a/tests/executables/efficientnet/train_efficientb4_imagenet_amp_dist_torch.sh b/tests/executables/efficientnet/train_efficientb4_imagenet_amp_dist_torch.sh new file mode 100644 index 000000000..382e3ec5f --- /dev/null +++ b/tests/executables/efficientnet/train_efficientb4_imagenet_amp_dist_torch.sh @@ -0,0 +1,51 @@ +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=128} + +IMAGENET_PATH="`pwd`/../../data/datasets/imagenet" + +OUTPUT_PATH="`pwd`/work_dir/efficient_b4" +mkdir -p ${OUTPUT_PATH} + +source ../_utils/get_num_devices.sh + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +cd ../../../cv/classification/efficientnet_b4/pytorch + +export PYTHONPATH=./:$PYTHONPATH + +: "${HOST_MASTER_ADDR:="127.0.0.1"}" +: "${HOST_MASTER_PORT:=20060}" +: "${HOST_NNODES:=1}" +: "${HOST_NODE_RANK:=0}" + +extra_params="--epoch ${NONSTRICT_EPOCH}" +if [ "${RUN_MODE}" == "strict" ]; then + extra_params="--acc-thresh 75.0" +fi + +python3 -m torch.distributed.launch --master_addr ${HOST_MASTER_ADDR} \ +--master_port ${HOST_MASTER_PORT} \ +--nproc_per_node=$IX_NUM_CUDA_VISIBLE_DEVICES \ +--nnodes ${HOST_NNODES} \ +--node_rank ${HOST_NODE_RANK} \ +--use_env \ +train.py \ +--model efficientnet_b4 \ +--data-path ${IMAGENET_PATH} \ +--batch-size ${BATCH_SIZE} \ +--acc-thresh 75.0 \ +--amp \ +--output-dir ${OUTPUT_PATH} ${extra_params} \ +"$@";check_status + + +exit ${EXIT_STATUS} + diff --git a/tests/prepare_dataset.sh b/tests/prepare_dataset.sh new file mode 100644 index 000000000..4826bce2b --- /dev/null +++ b/tests/prepare_dataset.sh @@ -0,0 +1,97 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + +PROJ_DIR=$(cd `dirname $0`; pwd) +DATASET_DIR="${PROJ_DIR}/data/datasets" +MODEL_ZOO_DIR="${PROJ_DIR}/data/model_zoo" + + +# Unarchive datas +if [ -f "datasets.tgz" ]; then + tar -zxf "datasets.tgz" +fi + +# Prepare coco +cd ${DATASET_DIR}/coco2017 +if [[ -f "annotations_trainval2017.zip" ]]; then + echo "Unarchive annotations_trainval2017.zip" + unzip -q annotations_trainval2017.zip +fi +if [[ -f "train2017.zip" ]]; then + if [[ ! -d "${DATASET_DIR}/coco2017/train2017" ]]; then + echo "Unarchive train2017.zip" + unzip -q train2017.zip + fi +fi +if [[ -f "val2017.zip" ]]; then + if [[ ! -d "${DATASET_DIR}/coco2017/val2017" ]]; then + echo "Unarchive val2017.zip" + unzip -q val2017.zip + fi +fi +if [[ -f "val2017_mini.zip" ]]; then + if [[ ! -d "${DATASET_DIR}/coco2017/val2017_mini" ]]; then + echo "Unarchive val2017_mini.zip" + unzip -q val2017_mini.zip + fi +fi + +cd ${DATASET_DIR} +# TGZS=`find . 
-iname 'CamVid*.tgz' -or -iname '*VOC*.tgz' -or -iname 'imagenet*.tgz' -or -iname 'coco*.tgz'`
+TGZS=$(ls -al | grep -oE 'CamVid[^ ]*.tgz|[^ ]*VOC[^ ]*.tgz|imagenet[^ ]*.tgz|coco[^ ]*.tgz')
+for path in $TGZS; do
+    data_name=`echo "${path}" | cut -f2 -d'/' | cut -f1 -d'.'`
+    if [[ -d "${data_name}" ]]; then
+        echo "Skip ${path}"
+        continue
+    fi
+
+    echo "Unarchive ${path}"
+    cd ${path%/*}
+    if [ -w "${path##*/}" ]; then
+        echo "The file is writable."
+    else
+        echo "The file is not writable, skip it."
+        continue
+    fi
+    tar zxf ${path##*/}
+    cd ${DATASET_DIR}
+done
+
+
+# Prepare pretrained data
+cd ${DATASET_DIR}/bert_mini
+if [[ ! -d "${DATASET_DIR}/bert_mini/2048_shards_uncompressed" ]]; then
+    echo "Unarchive 2048_shards_uncompressed_mini"
+    tar -xzf 2048_shards_uncompressed_mini.tar.gz
+fi
+if [[ ! -d "${DATASET_DIR}/bert_mini/eval_set_uncompressed" ]]; then
+    echo "Unarchive eval_set_uncompressed.tar.gz"
+    tar -xzf eval_set_uncompressed.tar.gz
+fi
+cd ../../../
+
+
+# Prepare model's checkpoint
+if [ ! -d "${HOME}/.cache/torch/hub/checkpoints/" ]; then
+    echo "Create checkpoints dir"
+    mkdir -p ${HOME}/.cache/torch/hub/checkpoints/
+fi
+
+
+if [ -d "${MODEL_ZOO_DIR}" ]; then
+    cd ${MODEL_ZOO_DIR}
+    checkpoints=`find . -name '*.pth' -or -name '*.pt'`
+    for cpt in $checkpoints; do
+        if [[ ! -f "${HOME}/.cache/torch/hub/checkpoints/${cpt}" ]]; then
+            echo "Copy $cpt to ${HOME}/.cache/torch/hub/checkpoints/"
+            cp $cpt ${HOME}/.cache/torch/hub/checkpoints/
+        fi
+    done
+fi
diff --git a/tests/prepare_python_environment.sh b/tests/prepare_python_environment.sh
new file mode 100644
index 000000000..0d7e86995
--- /dev/null
+++ b/tests/prepare_python_environment.sh
@@ -0,0 +1,37 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+PIPCMD=pip3
+
+PROJ_DIR=$(cd `dirname $0`; pwd)
+PACKAGES_DIR="${PROJ_DIR}/data/packages"
+
+if [ -d "${PACKAGES_DIR}" ]; then
+    #$PIPCMD install --no-index --find-links=./packages numpy==1.19.5
+    $PIPCMD install --no-index --find-links=${PACKAGES_DIR} scikit-build
+else
+    $PIPCMD install scikit-build
+    #$PIPCMD install numpy==1.19.5
+fi
+
+
+# Determine whether this script is executed as root; if not, prefix commands with sudo.
+prefix_sudo=""
+current_user=$(whoami)
+if [ "$current_user" != "root" ]; then
+    echo "User $current_user is not root, commands will be prefixed with sudo"
+    prefix_sudo="sudo"
+fi
+
+echo "prefix_sudo= $prefix_sudo"
+
+cd "${PROJ_DIR}/tools/dltest"
+# $prefix_sudo python3 setup.py install
+$prefix_sudo $PIPCMD install .
\ No newline at end of file
diff --git a/tests/prepare_system_environment.sh b/tests/prepare_system_environment.sh
new file mode 100644
index 000000000..7421376ad
--- /dev/null
+++ b/tests/prepare_system_environment.sh
@@ -0,0 +1,86 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+PIPCMD=pip3
+
+# Determine whether this script is executed as root; if not, prefix commands with sudo.
+prefix_sudo=""
+current_user=$(whoami)
+if [ "$current_user" != "root" ]; then
+    echo "User $current_user is not root, commands will be prefixed with sudo"
+    prefix_sudo="sudo"
+fi
+
+echo "prefix_sudo= $prefix_sudo"
+
+## Install packages
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    $prefix_sudo apt update
+    sudo_path=`command -v sudo`
+    if [ -z "${sudo_path}" ]; then
+        echo "Install sudo"
+        $prefix_sudo apt install -y sudo
+    fi
+    cmake_path=`command -v cmake`
+    if [ -z "${cmake_path}" ]; then
+        echo "Install cmake"
+        $prefix_sudo apt install -y cmake
+    fi
+    unzip_path=`command -v unzip`
+    if [ -z "${unzip_path}" ]; then
+        echo "Install unzip"
+        $prefix_sudo apt install -y unzip
+    fi
+    $prefix_sudo apt -y install libgl1-mesa-glx
+    pyver=`python3 -c 'import sys; print(sys.version_info[:][0])'`
+    pysubver=`python3 -c 'import sys; print(sys.version_info[:][1])'`
+    $prefix_sudo apt -y install python${pyver}.${pysubver}-dev
+elif [[ ${ID} == "centos" ]]; then
+    sudo_path=`command -v sudo`
+    if [ -z "${sudo_path}" ]; then
+        echo "Install sudo"
+        $prefix_sudo yum install -y sudo
+    fi
+    cmake_path=`command -v cmake`
+    if [ -z "${cmake_path}" ]; then
+        echo "Install cmake"
+        $prefix_sudo yum install -y cmake
+    fi
+    unzip_path=`command -v unzip`
+    if [ -z "${unzip_path}" ]; then
+        echo "Install unzip"
+        $prefix_sudo yum install -y unzip
+    fi
+    $prefix_sudo yum -y install mesa-libGL
+else
+    sudo_path=`command -v sudo`
+    if [ -z "${sudo_path}" ]; then
+        echo "Install sudo"
+        $prefix_sudo yum install -y sudo
+    fi
+    cmake_path=`command -v cmake`
+    if [ -z "${cmake_path}" ]; then
+        echo "Install cmake"
+        $prefix_sudo yum install -y cmake
+    fi
+    unzip_path=`command -v unzip`
+    if [ -z "${unzip_path}" ]; then
+        echo "Install unzip"
+        $prefix_sudo yum install -y unzip
+    fi
+    $prefix_sudo yum -y install mesa-libGL
+fi
+
+# Fix No module named 'urllib3.packages.six'
+sys_name_str=`uname -a`
+if [[ "${sys_name_str}" =~ "aarch64" ]]; then
+    pip3 install urllib3 requests --upgrade
+fi
diff --git a/tests/quick_build_environment.sh b/tests/quick_build_environment.sh
new file mode 100644
index 000000000..ba2b6991a
--- /dev/null
+++ b/tests/quick_build_environment.sh
@@ -0,0 +1,26 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+# check current directory
+: "${PROJ_DIR:=$(cd `dirname $0`; pwd)}"
+if [ ! -d "${PROJ_DIR}/executables" ]; then
+    echo "CurrentDirectory = ${PROJ_DIR}"
+    echo "ERROR: The executables directory was not found under ${PROJ_DIR}, exit 1."
+    echo "Please set PROJ_DIR to the deeplearningsamples directory, e.g. PROJ_DIR=/path/to/deeplearningsamples bash quick_build_environment.sh"
+    exit 1
+fi
+
+cd ${PROJ_DIR}
+
+echo "Current directory: `pwd`"
+
+bash ./prepare_dataset.sh
+bash ./prepare_system_environment.sh
+bash ./prepare_python_environment.sh
\ No newline at end of file
diff --git a/tests/tools/dltest/README.md b/tests/tools/dltest/README.md
new file mode 100644
index 000000000..0f89071b9
--- /dev/null
+++ b/tests/tools/dltest/README.md
@@ -0,0 +1,121 @@
+### 1. Install dltest tool
+
+    python setup.py develop
+
+### 2. Usage
+
+#### 2.1. Fetch log
+
+Command:
+
+```shell
+ixdltest-fetch args ${log_path}
+```
+
+Arguments:
+
+- p or patterns, The patterns to fetch from the log;
+- pn or pattern_names, The names of the patterns;
+- use_re, Whether to use regular expressions;
+- d or nearest_distance, default=10, The maximum distance (in characters) between a pattern and its matched value;
+- start_flag, The flag that starts recording the log;
+- end_flag, The flag that stops recording the log;
+- split_pattern, The pattern used to match a line; if the line matches, `split_sep` is used to split it;
+- split_sep, The separator used to split the line;
+- split_idx, The index of the field in the split line;
+- saved, Save the result to this path;
+- log, Log path.
+
+> Examples
+
+2.1.1. Fetch accuracy from a ResNet log by pattern matching.
+
+```shell
+ixdltest-fetch nv-train_resnet50_torch.sh.epoch_5.log -p "Acc@1" "Acc@5"
+
+# Output:
+# {'results': [{'Acc@1': [9.682], 'Acc@5': [50.293]}, {'Acc@1': [19.541], 'Acc@5': [61.096]},
+# {'Acc@1': [21.35], 'Acc@5': [67.338]}, {'Acc@1': [21.197], 'Acc@5': [67.083]},
+# {'Acc@1': [24.586], 'Acc@5': [67.949]}]}
+```
+
+2.1.2. Fetch mAP from a YoloV5 log.
+
+```shell
+ixdltest-fetch nv-train_yolov5s_coco_torch.sh.epoch_5.log \
+-p "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" \
+-pn "mAP"
+
+# Output:
+# {'results': [{'mAP': [0.359]}, {'mAP': [0.359]}, {'mAP': [0.359]}]}
```
+
+
+#### 2.2. Compare logs
+
+```shell
+ixdltest-compare --log1 ${log_path1} --log2 ${log_path2} args
+```
+
+Arguments:
+
+- log1, First log;
+- log2, Second log;
+- threshold, The comparison threshold;
+- only_last, Whether to compare only the last result;
+- print_result, Whether to print the result;
+- p or patterns, The patterns to fetch from the log;
+- pn or pattern_names, The names of the patterns;
+- use_re, Whether to use regular expressions;
+- d or nearest_distance, default=10, The maximum distance (in characters) between a pattern and its matched value;
+- start_flag, The flag that starts recording the log;
+- end_flag, The flag that stops recording the log;
+- split_pattern, The pattern used to match a line; if the line matches, `split_sep` is used to split it;
+- split_sep, The separator used to split the line;
+- split_idx, The index of the field in the split line;
+- saved, Save the result to this path;
+- log, Log path.
+
+> Examples
+
+2.2.1. Compare two logs.
+
+```shell
+ixdltest-compare \
+--log1 nv-train_resnet50_torch.sh.epoch_5.log \
+--log2 nv-train_resnet50_torch.sh.epoch_default.log -p "Acc@1" \
+--threshold 0.02
+
+# Output:
+# FAIL
+```
+
+#### 2.3.
Validate model + +```shell +ixdltest-validate args ${script} ${script_args} +``` + +Arguments: + +- l or compare_log, If None is given, a comparable log will be searched in `deeplearningsamples/runing_logs`; +- saved, Save result to path; +- with_exit_code, Add exit code for the result of compared; +- print_result, Whether print result; +- capture_output, optional values ['pipe', 'tempfile'], The method of capture output; +- run_script, A runable script with arguments. + + +> Examples + +2.3.1. Validate model + +```shell +ixdltest-validate bash train_shufflenetv2_x0_5_torch.sh --epochs 5 + +# Output: +# SUCCESS + +``` + + diff --git a/tests/tools/dltest/dltest/__init__.py b/tests/tools/dltest/dltest/__init__.py new file mode 100644 index 000000000..e6f4d6ab9 --- /dev/null +++ b/tests/tools/dltest/dltest/__init__.py @@ -0,0 +1 @@ +from .utils.training_args import show_training_arguments \ No newline at end of file diff --git a/tests/tools/dltest/dltest/cli/__init__.py b/tests/tools/dltest/dltest/cli/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/tools/dltest/dltest/cli/assert_cli.py b/tests/tools/dltest/dltest/cli/assert_cli.py new file mode 100644 index 000000000..27ae1b6bd --- /dev/null +++ b/tests/tools/dltest/dltest/cli/assert_cli.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + +import os +from typing import List, Iterable, Optional + +from dltest.cli.log_parser_cli import LogParserCLI +from dltest.log_parser import LogParser +from dltest.model_compare_config import get_compare_config_with_full_path +from dltest.utils.misc import get_full_path +from dltest.utils.subprocess_tools import get_output +from dltest.model_compare_config import ComparatorConfig + + +FRAMEWORKS = list(ComparatorConfig.get_frameworks()) + +REMAINDER = '...' 
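+# The keys below name eval()-able comparison expressions. Illustrative invocation (the
+# log file name is hypothetical): `ixdltest-assert --log train.log -p "Acc@1" --expr ge -b 75.0`
+# parses each value found near "Acc@1" as `a`, binds `b` to 75.0, evaluates "a >= b" for
+# every value, and exits with 0 if any comparison holds (see check_logs below).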
+ +assertion_expr_factory = dict( + eq = "a == b", + ne = "a != b", + ge = "a >= b", + le = "a <= b", + gt = "a > b", + lt = "a < b", +) + + +class AssertCLI(LogParserCLI): + + def command_name(self): + return "assert" + + def predefine_args(self): + super(AssertCLI, self).predefine_args() + self.parser.add_argument('-b', '--assertion_second_value', type=float, default=None, + help='It is used in assertion expression.') + self.parser.add_argument('--print_result', action="store_true", default=False, + help='Whether print result') + self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], + help='The method of capture output') + # FIXME: Using store_action to replase it + self.parser.add_argument('--only_last', type=int, default=0, + help='Whether use the last result to compare') + self.parser.add_argument('--expr', type=str, default="ge", + help=f"Assertion expression, option keys: {', '.join(assertion_expr_factory.keys())}" + + ", or a executable code, such as `a > b`, `a > 1`, ...") + self.parser.add_argument('--use_predefined_parser_rules', action="store_true", default=False, + help='Whether use predefined args of parser.') + self.parser.add_argument('--log', type=str, default=None, help="Log path") + self.parser.add_argument("--run_script", default=[], nargs=REMAINDER) + + def parse_args(self, *args, **kwargs): + args = super(AssertCLI, self).parse_args() + args.only_last = args.only_last > 0 + if len(args.run_script) == 0 and args.log is None: + raise ValueError("The one of `--run_script` or `--log` must be given.") + + if args.assertion_second_value is None: + if args.expr is None: + raise ValueError("The one of `--assertion_second_value` or `--expr` must be given.") + + if args.expr in assertion_expr_factory: + raise ValueError( + "The comparison operators depend on the argument `assertion_second_value`." 
+ ) + + return args + + def create_parser(self, args): + if args.use_predefined_parser_rules: + script_path = self._get_script_path(args.run_script) + config = get_compare_config_with_full_path(script_path, to_dict=False) + + return LogParser( + patterns=config.patterns, pattern_names=config.pattern_names, + use_re=config.use_re, nearest_distance=config.nearest_distance, + start_line_pattern_flag=config.start_line_pattern_flag, + end_line_pattern_flag=config.end_line_pattern_flag, + split_pattern=config.split_pattern, + split_sep=config.split_sep, + split_idx=config.split_idx + ) + + return LogParser( + patterns=args.patterns, pattern_names=args.pattern_names, + use_re=args.use_re, nearest_distance=args.nearest_distance, + start_line_pattern_flag=args.start_flag, + end_line_pattern_flag=args.end_flag, + split_pattern=args.split_pattern, + split_sep=args.split_sep, + split_idx=args.split_idx + ) + + def run(self): + args = self.parse_args() + parser = self.create_parser(args) + + if args.print_result: + print(args) + + output = self.get_log(args) + parsed_logs = self.parser_log(parser, output, args) + self.check_logs(parsed_logs, args) + + def get_log(self, args): + if len(args.run_script) == 0: + try: + with open(args.log) as f: + return f.readlines() + except: + print(f"ERROR: Read log fail in {args.log}") + exit(1) + else: + return get_output(args.run_script, capture_output_method=args.capture_output) + + def parser_log(self, parser, output, args) -> List[float]: + results = parser.parse(output) + if args.only_last: + results = results[-1:] + + if len(results) == 0: + raise ValueError("The parsed results is empty, please check patterns.") + if isinstance(results[0], dict): + if len(results[0]) == 0: + raise ValueError("The parsed results is empty, please check patterns.") + key = list(results[0].keys())[0] + results = [result[key] for result in results] + + if isinstance(results[0], Iterable): + results = [result[0] for result in results] + + return results + + def check_logs(self, parsed_logs, args): + if args.print_result: + print("Parsed result:", parsed_logs) + + assertion_expr = assertion_expr_factory.get(args.expr, args.expr) + + assert_results = [] + b = args.assertion_second_value + for a in parsed_logs: + assert_results.append(eval(assertion_expr)) + + if args.print_result: + print("The result of assertion expression:", assert_results) + + if any(assert_results): + print("SUCCESS") + exit(0) + print("FAIL") + exit(1) + + def _get_script_path(self, run_script: List[str]): + # Find shell script by current run_script + def _find_real_shell_script(cmd: List[str]): + for i, field in enumerate(cmd): + if field.endswith('.sh') and self._get_framework(field) in FRAMEWORKS: + return field + + real_shell_script = _find_real_shell_script(run_script) + + # Find shell script by parent process + if real_shell_script is None: + ppid = os.getppid() + import psutil + pproc = psutil.Process(ppid) + pproc_cmd = pproc.cmdline() + real_shell_script = _find_real_shell_script(pproc_cmd) + + if real_shell_script is not None: + real_shell_script = self._get_script_abs_path(real_shell_script) + return real_shell_script + + raise RuntimeError("The script is not named correctly, " + \ + "please use a script name ending with the framework, " + \ + f"got `{' '.join(run_script)}`, " + \ + "e.g. 
train_resnet50_torch.sh") + + def _get_framework(self, shell_script: str) -> Optional[str]: + try: + return shell_script.split('.')[-2].split('_')[-1] + except: + return None + + def _get_script_abs_path(self, run_script): + real_run_script = os.path.realpath(run_script) + if os.path.exists(real_run_script): + return real_run_script + + if "MODEL_DIR" in os.environ: + return os.path.join(os.environ["MODEL_DIR"], run_script) + + if "OLDPWD" in os.environ: + real_run_script = os.path.join(os.environ["OLDPWD"], run_script) + if os.path.exists(real_run_script): + return real_run_script + + raise FileNotFoundError("Not found running script path, " + \ + "please set environment variable `MODEL_DIR`, " + \ + "e.g /path/to/deeplearningsamples/executables/resnet.") + diff --git a/tests/tools/dltest/dltest/cli/check_cli.py b/tests/tools/dltest/dltest/cli/check_cli.py new file mode 100644 index 000000000..f2fc9f9ef --- /dev/null +++ b/tests/tools/dltest/dltest/cli/check_cli.py @@ -0,0 +1,56 @@ +import os + +from .assert_cli import AssertCLI +from ..utils.subprocess_tools import execute_shell + +RUN_MODE_KEY = "RUN_MODE" +RUN_MODE_STRICT = "strict" + + +class CheckCli(AssertCLI): + + def __init__(self, *args, **kwargs): + super(CheckCli, self).__init__(*args, **kwargs) + self.args = None + + def command_name(self): + return "check" + + def predefine_args(self): + self.parser.add_argument("--check_mode", type=str, default="no", + choices=["all", "strict", "nonstrict", "no"], + help="which running mode needs to be checked") + self.parser.add_argument("--nonstrict_mode_args", type=str, default="", + help="the arguments are used with nonstric testing") + super(CheckCli, self).predefine_args() + + def parse_args(self, *args, **kwargs): + if self.args is None: + args = super(CheckCli, self).parse_args(*args, **kwargs) + args.use_predefined_parser_rules = True + args.nonstrict_mode_args = args.nonstrict_mode_args.split(" ") + + if not self.is_strict_testing(): + args.run_script.extend(args.nonstrict_mode_args) + + if args.check_mode == "all": + args.check_mode = self.current_running_mode() + + self.args = args + return self.args + + def run(self): + args = self.parse_args() + if args.check_mode == self.current_running_mode(): + return super(CheckCli, self).run() + else: + res = execute_shell(args.run_script) + exit(res.returncode) + + def current_running_mode(self): + return os.environ.get(RUN_MODE_KEY, RUN_MODE_STRICT) + + def is_strict_testing(self): + return self.current_running_mode() == RUN_MODE_STRICT + + diff --git a/tests/tools/dltest/dltest/cli/entry_points.py b/tests/tools/dltest/dltest/cli/entry_points.py new file mode 100644 index 000000000..32225676d --- /dev/null +++ b/tests/tools/dltest/dltest/cli/entry_points.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+ + +from dltest.cli.assert_cli import AssertCLI +from dltest.cli.log_comparator_cli import LogComparatorCLI +from dltest.cli.model_validator_cli import ModelValidatorCLI +from dltest.cli.fetch_log_cli import FetchLog +from dltest.cli.check_cli import CheckCli + + +log_comparator_cli = LogComparatorCLI() +model_validator_cli = ModelValidatorCLI() +fetch_log_cli = FetchLog() +assert_cli = AssertCLI() +check_cli = CheckCli() + + +def make_execute_path(): + preffix = "dltest.cli.entry_points" + clis = [] + for cli_var in globals(): + if cli_var.endswith('_cli'): + cmd_name = globals()[cli_var].command_name() + clis.append(f"ixdltest-{cmd_name}={preffix}:{cli_var}") + + return clis + + diff --git a/tests/tools/dltest/dltest/cli/fetch_log_cli.py b/tests/tools/dltest/dltest/cli/fetch_log_cli.py new file mode 100644 index 000000000..196b982f4 --- /dev/null +++ b/tests/tools/dltest/dltest/cli/fetch_log_cli.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import json +from typing import Mapping + +from dltest.log_parser import LogParser +from dltest.cli.log_parser_cli import LogParserCLI + + +class FetchLog(LogParserCLI): + + def command_name(self): + return "fetch" + + def predefine_args(self): + super(FetchLog, self).predefine_args() + self.parser.add_argument('log', type=str, help="Log path") + self.parser.add_argument('--saved', type=str, default=None, help='Save to path') + + def run(self): + args = self.parse_args() + parser = LogParser( + patterns=args.patterns, pattern_names=args.pattern_names, + use_re=args.use_re, nearest_distance=args.nearest_distance, + start_line_pattern_flag=args.start_flag, + end_line_pattern_flag=args.end_flag, + split_pattern=args.split_pattern, + split_sep=args.split_sep, + split_idx=args.split_idx + ) + + results = parser.parse(args.log) + if not isinstance(results, Mapping): + results = dict(results=results) + print(results) + + if args.saved is not None: + with open(args.saved, 'w') as f: + json.dump(results, f) + diff --git a/tests/tools/dltest/dltest/cli/log_comparator_cli.py b/tests/tools/dltest/dltest/cli/log_comparator_cli.py new file mode 100644 index 000000000..d5befc704 --- /dev/null +++ b/tests/tools/dltest/dltest/cli/log_comparator_cli.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
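+# `ixdltest-compare` parses --log1 and --log2 with the shared LogParser options and, by
+# default (--only_last 1), compares only the last parsed record of each log. A record fails
+# when a metric in log1 falls below the log2 value by more than --threshold; since run()
+# passes allow_greater_than=True, log1 exceeding log2 is tolerated.
+# Illustrative invocation (hypothetical file names, see the README):
+#   ixdltest-compare --log1 a.log --log2 b.log -p "Acc@1" --threshold 0.02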
+ + +import json +from pprint import pprint + +from dltest.cli.log_parser_cli import LogParserCLI +from dltest.log_comparator import compare_logs_with_paths, DEFAULT_NEAREST_MATCH_CHARS + + +class LogComparatorCLI(LogParserCLI): + + def command_name(self): + return "compare" + + def predefine_args(self): + super(LogComparatorCLI, self).predefine_args() + self.parser.add_argument('--log1', type=str, help="First log") + self.parser.add_argument('--log2', type=str, help="Second log") + self.parser.add_argument('--threshold', type=float, default=0.0001, help="Threshold") + self.parser.add_argument('--only_last', type=int, default=1, help='Whether use the last result to compare') + self.parser.add_argument('--saved', type=str, default=None, help='Save to path') + self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') + self.parser.add_argument('--allow_greater_than', action="store_true", default=False, help='Allow log1 greater than log2') + + def parse_args(self, *args, **kwargs): + args = super(LogComparatorCLI, self).parse_args(*args, **kwargs) + args.only_last = args.only_last >= 1 + + return args + + def run(self): + args = self.parse_args() + satisfied, results = compare_logs_with_paths( + log1=args.log1, log2=args.log2, + threshold=args.threshold, + patterns=args.patterns, pattern_names=args.pattern_names, + use_re=args.use_re, nearest_distance=args.nearest_distance, + start_line_pattern_flag=args.start_flag, + end_line_pattern_flag=args.end_flag, + only_last=args.only_last, + split_pattern=args.split_pattern, + split_sep=args.split_sep, + split_idx=args.split_idx, + allow_greater_than=True + ) + + if args.print_result: + pprint(results) + + if satisfied: + print("SUCCESS") + else: + print("FAIL") + + if args.saved is not None: + with open(args.saved, 'w') as f: + json.dump(results, f) + + + + diff --git a/tests/tools/dltest/dltest/cli/log_parser_cli.py b/tests/tools/dltest/dltest/cli/log_parser_cli.py new file mode 100644 index 000000000..936a67606 --- /dev/null +++ b/tests/tools/dltest/dltest/cli/log_parser_cli.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
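+# LogParserCLI only declares the options shared by the fetch/compare/assert commands.
+# LogParser supports two extraction modes: pattern mode (-p/--patterns) picks the number(s)
+# appearing within --nearest_distance characters after each pattern, and split mode
+# (--split_pattern/--split_sep/--split_idx) matches a line and takes a field by index.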
+ + +import json +from typing import Mapping + +from dltest.log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS +from dltest.utils.base_cli import BaseCLI + + +class LogParserCLI(BaseCLI): + + def predefine_args(self): + self.parser.add_argument('-p', '--patterns', nargs="*", type=str, default=None, help='Fetched patterns') + self.parser.add_argument('-pn', '--pattern_names', nargs="*", type=str, default=None, help='The name of pattern') + self.parser.add_argument('--use_re', action="store_true", default=False, help='Whether use regular expression') + self.parser.add_argument('-d', '--nearest_distance', type=int, default=DEFAULT_NEAREST_MATCH_CHARS, help='The nearest distance of matched pattern') + self.parser.add_argument('--start_flag', type=str, default=None, help='The flag of start to record log') + self.parser.add_argument('--end_flag', type=str, default=None, help='The flag of stop to record log') + self.parser.add_argument('--split_pattern', type=str, default=None, help='The pattern is used to match line') + self.parser.add_argument('--split_sep', nargs="*", type=str, default=None, help='The seperator is used to split line') + self.parser.add_argument('--split_idx', nargs="*", type=int, default=None, help='The index of split line') + + def parse_args(self, *args, **kwargs): + args = super(LogParserCLI, self).parse_args(*args, **kwargs) + + return args + diff --git a/tests/tools/dltest/dltest/cli/model_validator_cli.py b/tests/tools/dltest/dltest/cli/model_validator_cli.py new file mode 100644 index 000000000..973142a03 --- /dev/null +++ b/tests/tools/dltest/dltest/cli/model_validator_cli.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import json +import os +import os.path as ospath +from pprint import pprint +from typing import List, Union + +from dltest.utils.base_cli import BaseCLI +from dltest.utils.get_env import get_gpu_type +from dltest.utils.misc import get_full_path +from dltest.model_compare_config import get_compare_config_with_full_path +from dltest.log_comparator import compare_logs_with_paths +from dltest.utils.subprocess_tools import get_output + + +REMAINDER = '...' 
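+# `ixdltest-validate` runs the given training script, captures its output, looks up the
+# predefined parse/compare rules for that script (model_compare_config), and compares the
+# result against a reference log; when -l/--compare_log is not given, a log is searched
+# under <project>/runing_logs/ for the other platform (nv vs. bi).
+# Illustrative invocation (see the README):
+#   ixdltest-validate bash train_shufflenetv2_x0_5_torch.sh --epochs 5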
+ + +class ModelValidatorCLI(BaseCLI): + + def command_name(self): + return "validate" + + def predefine_args(self): + super(ModelValidatorCLI, self).predefine_args() + self.parser.add_argument('-l', '--compare_log', type=str, default=None, help="Compare log") + self.parser.add_argument('--saved', type=str, default=None, help='Save to path') + self.parser.add_argument('--with_exit_code', type=int, default=1, help="Add exit code for the result of compared") + self.parser.add_argument('--print_result', action="store_true", default=False, help='Whether print result') + self.parser.add_argument('--capture_output', type=str, default='pipe', choices=['pipe', 'tempfile'], help='The method of capture output') + self.parser.add_argument("run_script", nargs=REMAINDER) + + def parse_args(self, *args, **kwargs): + args = super(ModelValidatorCLI, self).parse_args() + if len(args.run_script) == 0: + print("ERROR: Invalid run_script") + exit(1) + + return args + + def run(self): + args = self.parse_args() + output = self._run_script(args.run_script, capture_output_method=args.capture_output) + self.compare_logs( + output, args.compare_log, args.run_script, + args.saved, args.with_exit_code, + args.print_result + ) + + def compare_logs(self, output: List, compare_log: str, + run_script: List[str], saved: str=None, + with_exit_code: int=1, print_result=False): + script_path = self._get_script_path(run_script) + script_path = get_full_path(script_path) + compare_args = get_compare_config_with_full_path(script_path) + + if compare_log is None: + epoch = self._get_epoch(run_script) + script_name = ospath.basename(script_path) + dist_tag = self._get_dist_tag(script_name) + compare_log = self._find_comparable_log(script_path, epoch, dist_tag) + + if not ospath.exists(compare_log): + print(f"ERROR: {compare_log} not exist. 
Or please use argument `l` to locate log.") + exit(1) + + compare_args['log1'] = output + compare_args['log2'] = compare_log + + satisfied, results = compare_logs_with_paths(**compare_args) + + if print_result: + pprint(results) + + if satisfied: + print("SUCCESS") + else: + print("FAIL") + + if saved is not None: + with open(saved, 'w') as f: + json.dump(results, f) + + if with_exit_code: + if satisfied: + exit(0) + else: + exit(1) + + def _run_script(self, command: List, capture_output_method: str='tempfile'): + return get_output(command, capture_output_method=capture_output_method) + + def _get_script_path(self, run_script: List[str]): + for i, field in enumerate(run_script): + if field.endswith('.py') or field.endswith('.sh'): + return field + + raise RuntimeError("Not found the name of script, " + + "only support python or `sh` script, but got {}.".format(run_script)) + + def _find_comparable_log(self, script_path: str, epoch: Union[str, int], dist_tag: str): + gpu_type = get_gpu_type().lower() + + # Get the platform of trained log + if gpu_type == "nv": + gpu_type = 'bi' + else: + gpu_type = 'nv' + + script_path = get_full_path(script_path) + project_dir = self._get_project_dir(script_path) + script_name = ospath.basename(script_path) + + log_path = f"{project_dir}/runing_logs/{gpu_type}/{gpu_type}-{script_name}.epoch_{epoch}{dist_tag}.log" + + return log_path + + + def _get_epoch(self, run_script: List[str]): + for i, field in enumerate(run_script): + if "--epoch" in field: + if "=" in field: + return field.split("=")[1] + else: + return run_script[i + 1] + + return 'default' + + def _get_dist_tag(self, script_name: str): + try: + import torch + num_gpus = torch.cuda.device_count() + except: + num_gpus = os.environ.get("CUDA_VISIBLE_DEVICES", "all") + + if '_dist_' in script_name or '_multigpu_' in script_name: + return f".{num_gpus}card" + return "" + + def _get_project_dir(self, abs_path): + abs_path = ospath.abspath(abs_path) + script_dir = ospath.dirname(abs_path) + executables_dir = ospath.dirname(script_dir) + return ospath.dirname(executables_dir) diff --git a/tests/tools/dltest/dltest/log_comparator.py b/tests/tools/dltest/dltest/log_comparator.py new file mode 100644 index 000000000..0da94100e --- /dev/null +++ b/tests/tools/dltest/dltest/log_comparator.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
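+# A scalar threshold is broadcast to every metric key in _compute_errors(); errors are
+# computed as value1 - value2, and a record is unsatisfied when |error| > threshold,
+# except that positive errors (log1 better than log2) are tolerated when
+# allow_greater_than is set.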
+ + +from typing import List, Mapping, Union, Tuple +from .log_parser import LogParser, DEFAULT_NEAREST_MATCH_CHARS + +LogLines = List[Mapping] +CompareResult = Tuple[bool, Union[List, Mapping]] + + +def _compute_errors(value1: Mapping, value2: Mapping, threshold: Mapping, allow_greater_than=False) -> CompareResult: + if not isinstance(threshold, Mapping): + _thds = dict() + for key in value1.keys(): + _thds[key] = threshold + threshold = _thds + + result = dict() + satisfied = True + for key, _thd in threshold.items(): + v1, v2 = value1[key], value2[key] + origin_value_type = list + if not isinstance(v1, (tuple, list)): + origin_value_type = float + v1 = [v1] + v2 = [v2] + + real_errors = [] + for v1_i, v2_i in zip(v1, v2): + real_error = v1_i - v2_i + real_errors.append(real_error) + if satisfied and abs(real_error) > _thd: + if allow_greater_than and real_error > 0: + continue + satisfied = False + + if origin_value_type is float and len(real_errors) > 0: + real_errors = real_errors[0] + + result[key] = real_errors + + return satisfied, result + + +def compare_logs(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: + total_lines = len(log1[0]) + real_errors = [] + satisfied = True + for line_idx in range(total_lines): + _satisfied, _error = _compute_errors(log1[line_idx], log2[line_idx], threshold, allow_greater_than=allow_greater_than) + real_errors.append(_error) + if satisfied and not _satisfied: + satisfied = False + + return satisfied, real_errors + + +def compare_logs_by_last_result(log1: LogLines, log2: LogLines, threshold: Union[float, Mapping], allow_greater_than=False) -> CompareResult: + if len(log1) == 0 or len(log2) == 0: + return False, [] + return _compute_errors(log1[-1], log2[-1], threshold, allow_greater_than=allow_greater_than) + + +def compare_logs_with_paths(log1, log2, threshold: Union[float, Mapping], + patterns: List[str], + pattern_names: List[str] = None, + use_re: bool = False, + nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS, + start_line_pattern_flag: str = None, + end_line_pattern_flag: str = None, + only_last: bool=True, + split_pattern: Union[str, List] = None, + split_sep: List = None, + split_idx: List = None, + allow_greater_than: bool = False): + parser = LogParser( + patterns=patterns, pattern_names=pattern_names, + use_re=use_re, nearest_distance=nearest_distance, + start_line_pattern_flag=start_line_pattern_flag, + end_line_pattern_flag=end_line_pattern_flag, + split_pattern=split_pattern, + split_sep=split_sep, + split_idx=split_idx + ) + + log1 = parser.parse(log1) + log2 = parser.parse(log2) + + if only_last: + compare_result = compare_logs_by_last_result(log1, log2, threshold, allow_greater_than=allow_greater_than) + else: + compare_result = compare_logs(log1, log2, threshold, allow_greater_than=allow_greater_than) + + return compare_result[0], dict(log1=log1, log2=log2, errors=compare_result[-1]) diff --git a/tests/tools/dltest/dltest/log_parser.py b/tests/tools/dltest/dltest/log_parser.py new file mode 100644 index 000000000..e7a7a0db5 --- /dev/null +++ b/tests/tools/dltest/dltest/log_parser.py @@ -0,0 +1,172 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from typing import List, Optional, Union, Mapping +import re + + +DEFAULT_NEAREST_MATCH_CHARS = 10 + + +def postprocess_search_result(results: List[str]) -> List[float]: + if len(results) != 0: + results = list(map(float, results)) + return results + + +def extract_nearest_value_by_key_inline(content: str, key: str, + nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: + pattern = "%s[\s\S]{0,%d}?(\d+(?:\.\d+)?)" % (key, nearest_distance) + return extract_value_by_pattern_inline(content, pattern) + + +def extract_value_by_pattern_inline(content: str, pattern: str) -> List[float]: + results = re.findall(pattern, content) + return postprocess_search_result(results) + + +def extract_value(content: str, pattern: str, + inline=True, use_re=False, + nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS) -> List[float]: + if inline: + if use_re: + return extract_value_by_pattern_inline(content, pattern) + else: + return extract_nearest_value_by_key_inline(content, pattern, nearest_distance) + else: + raise NotImplementedError() + + +class LogParser: + + def __init__(self, + patterns: List[str]=None, + pattern_names: List[str]=None, + use_re: bool=False, + nearest_distance: int=DEFAULT_NEAREST_MATCH_CHARS, + start_line_pattern_flag: str=None, + end_line_pattern_flag: str=None, + split_pattern: Union[str, List]=None, + split_sep: List[str]=None, + split_idx: List[int]=None): + if patterns is None and split_sep is None: + raise ValueError("The one of argument `patterns` or `split_sep` must be given.") + + if pattern_names is not None: + if isinstance(patterns, (tuple, list)) and patterns is not None and len(patterns) != len(pattern_names): + raise ValueError("The length of `pattern_names` argument not equal to `patterns`.") + if isinstance(split_sep, (tuple, list)) and split_sep is not None and len(split_sep) != len(pattern_names): + raise ValueError("The length of `pattern_names` argument not equal to `split_sep`.") + + if split_sep is not None and (split_idx is None or not isinstance(split_idx, (int, tuple, list))): + raise ValueError("Invalid index to split text, got {}.".format(split_idx)) + + if split_sep is not None and split_pattern is None: + raise ValueError("Invalid pattern to split text, got {}.".format(split_pattern)) + + self.patterns = patterns + self.use_re = use_re + self.nearest_distance = nearest_distance + self.start_line_pattern_flag = start_line_pattern_flag + self.end_line_pattern_flag = end_line_pattern_flag + + if not isinstance(split_sep, (tuple, list)) and split_sep is not None: + split_sep = [split_sep] + + if not isinstance(split_idx, (tuple, list)): + split_idx = [split_idx] + + self.split_sep = split_sep + self.split_idx = split_idx + + if pattern_names is None: + if patterns is None: + pattern_names = split_idx + else: + pattern_names = patterns + self.pattern_names = pattern_names + + if not isinstance(split_pattern, (tuple, list)) and split_sep is not None: + split_pattern = [split_pattern] * len(split_sep) + self.split_pattern = split_pattern + + self.start_record = start_line_pattern_flag is None + + def parse(self, path_or_logs: Union[str, List]) -> List[dict]: 
+ """ + : return: [{matric_name: value}, ...] + """ + + if isinstance(path_or_logs, str): + with open(path_or_logs, encoding="utf8") as log_file: + path_or_logs = log_file.readlines() + + ret = [] + for line in path_or_logs: + result = self.parse_inline(line) + if len(result) == 0: + continue + ret.append(result) + return ret + + def parse_inline(self, line) -> dict: + if not self.can_record(line): + return {} + + if self.split_sep is None: + return self._parse_inline_by_match(line) + return self._parse_inline_by_split(line) + + def _parse_inline_by_match(self, line: str): + ret = {} + for name, pattern in zip(self.pattern_names, self.patterns): + result = extract_value( + line, pattern, inline=True, use_re=self.use_re, + nearest_distance=self.nearest_distance + ) + if len(result) == 0: + continue + ret[name] = result + return ret + + def _parse_inline_by_split(self, line: str, to_type=float): + ret = {} + for name, sep, idx, pattern in zip(self.pattern_names, + self.split_sep, + self.split_idx, + self.split_pattern): + if not self.can_matched(line, pattern): + continue + if '\t' in sep: + segs = line.strip().split(sep) + else: + segs = line.strip().replace('\t', ' ').split(sep) + segs = list(filter(lambda kv: kv.strip() not in ["", " ", None], segs)) + if len(segs) <= idx: + continue + ret[name] = to_type(segs[idx]) + return ret + + def can_record(self, line: str): + if self.start_line_pattern_flag is None: + self.start_record = True + elif not self.start_record: + self.start_record = self.can_matched(line, self.start_line_pattern_flag) + + if self.start_record: + if self.end_line_pattern_flag is not None and self.can_matched(line, self.end_line_pattern_flag): + self.start_record = False + + return self.start_record + + def can_matched(self, content: str, pattern: str): + result = re.findall(pattern, content) + return len(result) != 0 + diff --git a/tests/tools/dltest/dltest/model_compare_config.py b/tests/tools/dltest/dltest/model_compare_config.py new file mode 100644 index 000000000..4b6d9a471 --- /dev/null +++ b/tests/tools/dltest/dltest/model_compare_config.py @@ -0,0 +1,306 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+ + +import os.path as ospath + +from typing import NamedTuple, Union, List, Mapping + +from dltest.log_parser import DEFAULT_NEAREST_MATCH_CHARS + + +class LogComparatorArgs(NamedTuple): + threshold: Union[float, Mapping] + patterns: List[str] = None + pattern_names: List[str] = None + use_re: bool = False + nearest_distance: int = DEFAULT_NEAREST_MATCH_CHARS + start_line_pattern_flag: str = None + end_line_pattern_flag: str = None + split_pattern: Union[str, List] = None + split_sep: List = None + split_idx: List = None + only_last: bool = True + allow_greater_than: bool = True + + def to_dict(self): + return self._asdict() + + +class ArgsModelsTuple(NamedTuple): + + args: LogComparatorArgs + models: List[str] + + +class BaseConfig: + + def __getitem__(self, item): + return self.__class__.__dict__[item] + + def __getattr__(self, item): + return self.__class__.__dict__[item] + + def __iter__(self): + for attr, value in self.__class__.__dict__.items(): + if isinstance(value, ArgsModelsTuple): + yield attr + + def iter_items(self): + for attr, value in self.__class__.__dict__.items(): + if isinstance(value, ArgsModelsTuple): + yield attr, value + + +class _TFComparatorConfig(BaseConfig): + + cnn_benchmarks = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["Accuracy @ 1 =", "Accuracy @ 5 ="], + pattern_names=["Acc@1", "Acc@5"] + ), + models=["alexnet", "inceptionv3", "resnet50", "resnet101", "vgg16"] + ) + + dist_cnn_becnmarks = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_sep=[' ', ' '], + split_idx=[9, 10], + split_pattern="[\s\S]*?images/sec:[\s\S]*?jitter", + pattern_names=['Acc@1', 'Acc@5'] + ), + models=[ + "alexnet_dist", "inceptionv3_dist", "resnet50_dist", "resnet101_dist", "vgg16_dist" + ] + ) + + bert = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["eval_accuracy ="], + pattern_names=["Accuracy"] + ), + models=["bert"] + ) + + ssd = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["acc="], + pattern_names=["Acc@1"] + ), + models=["ssd"] + ) + + yolov3 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.8, + patterns=["mAP"] + ), + models=["yolov3"] + ) + + vnet = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["background_dice", "anterior_dice", "posterior_dice"] + ), + models=["vnet"] + ) + + +class _TorchComparatorConfig(BaseConfig): + classification = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=8.0, patterns=['Acc@1', 'Acc@5'], + start_line_pattern_flag="Start training", + ), + models=[ + 'googlenet', 'inceptionv3', 'mobilenetv3', 'resnet', 'shufflenetv2', + 'vgg', 'resnet50_dali', 'resnext', 'densenet' + ] + ) + + detection = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.03, + patterns=[ + "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" + ], + pattern_names=["mAP"], + start_line_pattern_flag="IoU metric: bbox", + end_line_pattern_flag="IoU metric: segm" + ), + models=[ + 'maskrcnn', 'retinanet', 'ssd' + ] + ) + + bert_cola = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=['mcc'] + ), + models=['bert_cola'] + ) + + bert_mrpc = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=['acc'] + ), + models=['bert_mrpc'] + ) + + bert_pretrain_apex = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=['eval_mlm_accaracy'] + ), + models=['bert_pretrain_apex'] + ) + + segmentation = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=8.0, + 
patterns=['mean IoU:'], + pattern_names=['mIoU'] + ), + models=[ + 'deeplabv3', 'fcn' + ] + ) + + t5 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=5.0, + split_pattern="eval_bleu[\s\S]*?=", + split_sep=["="], + split_idx=[1], + pattern_names=['EvalBleu'] + ), + models=['t5'] + ) + + yolov3 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=["mAP"] + ), + models=['yolov3'] + ) + + yolov5 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + patterns=[ + "Average Precision \(AP\) @\[ IoU=0.50:0.95 \| area= all \| maxDets=100 \] =" + ], + pattern_names=["mAP"], + ), + models=['yolov5'], + ) + + yolov5s_coco128 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", + split_sep=[" ", " "], + split_idx=[5, 6], + pattern_names=["AP50", "mAP"] + ), + models=['yolov5s_coco128'] + ) + + centernet_resnet18 = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", + split_sep=[" ", " "], + split_idx=[5, 6], + pattern_names=["AP50", "mAP"] + ), + models=['centernet_resnet18'] + ) + + fcos_resnet50_fpn = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.08, + split_pattern="[\s]+?all[\s\S]*?[1-9]\d*[\s]+?[1-9]\d*", + split_sep=[" ", " "], + split_idx=[5, 6], + pattern_names=["AP50", "mAP"] + ), + models=['fcos_resnet50_fpn'] + ) + + ocr_recognition = ArgsModelsTuple( + args=LogComparatorArgs( + threshold=0.5, patterns=["0_word_acc"], + ), + models=[ + "sar", "satrn" + ] + ) + + + +class ComparatorConfig: + + _configs = dict(tf=_TFComparatorConfig(), torch=_TorchComparatorConfig()) + + @classmethod + def get_frameworks(cls) -> List: + return list(cls._configs.keys()) + + @classmethod + def get(cls, tf_or_torch, name, default=None): + for model_kind, comb in cls._configs[tf_or_torch].iter_items(): + if name in comb.models: + return comb.args + if default is not None: + return default + raise KeyError("Not found config, but got {name} for {fw}".format(name=name, fw=tf_or_torch)) + + @classmethod + def find_config(cls, script_path: str) -> LogComparatorArgs: + tf_or_torch = script_path.split('.')[-2].split('_')[-1] + + # Find by the name of script + script_name = ospath.basename(script_path).rsplit('.', maxsplit=1)[0] + if script_name.startswith('train_'): + script_name = script_name.replace("train_", "", 1) + while script_name not in [None, "", "/", "\\"]: + try: + config = cls.get(tf_or_torch, script_name) + return config + except: + pass + script_name = script_name.rsplit('_', maxsplit=1) + if len(script_name) <= 1: + break + script_name = script_name[0] + + # Find by the name of model's dir + model_dir_name = ospath.basename(ospath.dirname(script_path)) + try: + config = cls.get(tf_or_torch, model_dir_name) + return config + except: + raise RuntimeError("Not found for", script_path) + + +def get_compare_config_with_full_path(script_path: str, to_dict=True): + config = ComparatorConfig.find_config(script_path) + if to_dict: + return config.to_dict() + return config + diff --git a/tests/tools/dltest/dltest/utils/__init__.py b/tests/tools/dltest/dltest/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/tools/dltest/dltest/utils/base_cli.py b/tests/tools/dltest/dltest/utils/base_cli.py new file mode 100644 index 000000000..faf71ef39 --- /dev/null +++ b/tests/tools/dltest/dltest/utils/base_cli.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. 
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from argparse import ArgumentParser
+from abc import abstractmethod
+
+
+class BaseCLI:
+
+    def __init__(self, parser=None, *args, **kwargs):
+        # Build a default parser when none is given; always keep a reference,
+        # otherwise parse_args() fails when a parser is passed in.
+        if parser is None:
+            parser = ArgumentParser(description=self.description, *args, **kwargs)
+        self.parser = parser
+
+    def __call__(self):
+        self.run()
+
+    @property
+    def description(self):
+        return None
+
+    @abstractmethod
+    def command_name(self):
+        pass
+
+    def predefine_args(self):
+        pass
+
+    def parse_args(self, *args, **kwargs):
+        self.predefine_args()
+        return self.parser.parse_args(*args, **kwargs)
+
+    @abstractmethod
+    def run(self):
+        pass
+
+
diff --git a/tests/tools/dltest/dltest/utils/get_env.py b/tests/tools/dltest/dltest/utils/get_env.py
new file mode 100644
index 000000000..673a17990
--- /dev/null
+++ b/tests/tools/dltest/dltest/utils/get_env.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
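+# Helpers for collecting the local Python/PyTorch/CUDA environment information
+# and for detecting whether the current device is an Iluvatar (BI) or NVIDIA (NV) GPU.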
+import os +from collections import defaultdict +import os.path as osp +import subprocess +import sys + + +def get_envinfo(): + import torch + env_info = {} + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = torch.cuda.is_available() + env_info['CUDA available'] = cuda_available + if cuda_available: + from torch.utils.cpp_extension import CUDA_HOME + env_info['CUDA_HOME'] = CUDA_HOME + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output( + f'"{nvcc}" -V | tail -n1', shell=True) + nvcc = nvcc.decode('utf-8').strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, devids in devices.items(): + env_info['GPU ' + ','.join(devids)] = name + + gcc = subprocess.check_output('gcc --version | head -n1', shell=True) + gcc = gcc.decode('utf-8').strip() + env_info['GCC'] = gcc + + env_info['PyTorch'] = torch.__version__ + + return env_info + + +def get_gpu_type(): + import torch + if "DEBUG_GPU_TYPE" in os.environ: + return os.environ["DEBUG_GPU_TYPE"] + + if not torch.cuda.is_available(): + return "BI" + dev_name = torch.cuda.get_device_name(0) + if 'IX BI' in dev_name or getattr(torch, "corex", False): + _type = "BI" + else: + _type = "NV" + + return _type diff --git a/tests/tools/dltest/dltest/utils/misc.py b/tests/tools/dltest/dltest/utils/misc.py new file mode 100644 index 000000000..accf259f7 --- /dev/null +++ b/tests/tools/dltest/dltest/utils/misc.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. +import copy +import os + + +def get_full_path(fname): + pwd = os.getcwd() + if fname.startswith('/'): + return fname + return os.path.join(pwd, fname) + + +def is_main_proc(rank): + return str(rank) in ["None", "-1", "0"] + + +def main_proc_print(*args, **kwargs): + if "RANK" in os.environ: + if is_main_proc(os.environ["RANK"]): + print(*args, **kwargs) + return + + if "LOCAL_RANK" in os.environ: + if is_main_proc(os.environ["LOCAL_RANK"]): + print(*args, **kwargs) + return + + print(*args, **kwargs) + + +def create_subproc_env(): + env = copy.copy(os.environ) + env["USE_DLTEST"] = "1" + return env \ No newline at end of file diff --git a/tests/tools/dltest/dltest/utils/real_tempfile.py b/tests/tools/dltest/dltest/utils/real_tempfile.py new file mode 100644 index 000000000..cace5d1e8 --- /dev/null +++ b/tests/tools/dltest/dltest/utils/real_tempfile.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import os +import os.path as ospath +from pathlib import Path +import tempfile + + +class TemporaryFile: + + def __init__(self, with_open=False, mode='r'): + self.name = None + self.with_open = with_open + self.mode = mode + + self.file = None + + def create(self): + self.name = tempfile.mktemp() + file_path = Path(self.name) + file_path.touch() + + def delete(self): + if self.name is not None and ospath.exists(self.name): + os.unlink(self.name) + + def read(self): + self._check_file_status() + return self.file.read() + + def readlines(self): + self._check_file_status() + return self.file.readlines() + + def _check_file_status(self): + if self.file is None: + raise RuntimeError("File is closed, please reopen it.") + + def __enter__(self): + self.create() + if self.with_open: + self.file = open(self.name, mode=self.mode) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.with_open: + self.file.close() + self.delete() + + + + + + + + diff --git a/tests/tools/dltest/dltest/utils/subprocess_tools.py b/tests/tools/dltest/dltest/utils/subprocess_tools.py new file mode 100644 index 000000000..2b3fe3e7f --- /dev/null +++ b/tests/tools/dltest/dltest/utils/subprocess_tools.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+
+
+import subprocess
+from typing import Callable, Union, List
+
+from dltest.utils.real_tempfile import TemporaryFile
+from dltest.utils import misc
+
+
+def get_output_with_pipe(command, shell=None, callback: Callable[[list], None]=None, *args, **kwargs):
+    if shell is None:
+        shell = True
+
+    if shell and not isinstance(command, str):
+        command = " ".join(command)
+
+    stream = subprocess.Popen(
+        command, shell=shell,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        *args, **kwargs
+    )
+    outputs = []
+    while 1:
+        exit_code = stream.poll()
+        if exit_code is None:
+            if stream.stdout.readable():
+                outputs.append(stream.stdout.readline().decode("utf8").rstrip())
+                if callback is not None:
+                    callback(outputs[-1:])
+                print(outputs[-1])
+        else:
+            if stream.stdout.readable():
+                lines = stream.stdout.readlines()
+                lines = [line.decode("utf8").rstrip() for line in lines]
+                outputs.extend(lines)
+                if callback is not None:
+                    callback(outputs[-1:])
+                print('\n'.join(lines))
+            break
+
+    return outputs
+
+
+def get_output_with_tempfile(command, *args, **kwargs):
+    if not isinstance(command, (list, tuple)):
+        command = [command]
+    stdout = None
+    with TemporaryFile(with_open=True) as file:
+        command.extend(['|', 'tee', file.name])
+        command = " ".join(command)
+
+        res = subprocess.run(command, stdout=stdout, stderr=subprocess.STDOUT, shell=True, *args, **kwargs)
+        output = file.readlines()
+
+    return output
+
+def execute_shell(command, *args, **kwargs):
+    if "env" not in kwargs:
+        kwargs["env"] = misc.create_subproc_env()
+
+    if not isinstance(command, (list, tuple)):
+        command = [command]
+
+    command = " ".join(command)
+    res = subprocess.run(command,
+                         shell=True, *args, **kwargs)
+    return res
+
+def get_output(command: List, capture_output_method: str = 'tempfile', *args, **kwargs):
+    if "env" not in kwargs:
+        kwargs["env"] = misc.create_subproc_env()
+
+    if capture_output_method == "tempfile":
+        return get_output_with_tempfile(command, *args, **kwargs)
+    return get_output_with_pipe(command, *args, **kwargs)
\ No newline at end of file
diff --git a/tests/tools/dltest/dltest/utils/training_args.py b/tests/tools/dltest/dltest/utils/training_args.py
new file mode 100644
index 000000000..f096c2462
--- /dev/null
+++ b/tests/tools/dltest/dltest/utils/training_args.py
@@ -0,0 +1,87 @@
+import os
+
+from typing import Union, List, Dict, Any, Mapping
+from argparse import Namespace, ArgumentParser
+import json
+
+
+def _obj_to_dict(obj) -> Dict:
+    if isinstance(obj, Mapping):
+        return obj
+
+    try:
+        from absl import flags
+        if isinstance(obj, flags.FlagValues):
+            return obj.flag_values_dict()
+    except:
+        pass
+    if isinstance(obj, Namespace):
+        return obj.__dict__
+    elif isinstance(obj, List):
+        new_obj = dict()
+        for _o in obj:
+            _o_dict = _obj_to_dict(_o)
+            new_obj.update(_o_dict)
+        return new_obj
+    elif not isinstance(obj, Dict):
+        if hasattr(obj, "__dict__"):
+            return obj.__dict__
+        try:
+            typename = type(obj).__name__
+        except:
+            typename = str(obj)
+        return {typename: str(obj)}
+
+
+def json_dump_obj(o):
+    if hasattr(o, "__name__"):
+        return o.__name__
+    return str(o)
+
+
+def show_training_arguments(args: Union[List, Dict, Any]):
+    """ print running arguments
+    Example 1: For ArgumentParser
+    >>> parser = ArgumentParser("Test")
+    >>> parser.add_argument("--arg0", type=str)
+    >>> args = parser.parse_args()
+    >>> show_training_arguments(args)
+
+    Example 2: For dict
+    >>> args = dict(arg=1)
+    >>> show_training_arguments(args)
+
+    Example 3: For custom object
+    >>> from collections import
namedtuple + >>> ArgsType = namedtuple("ArgsType", ["arg"]) + >>> args = ArgsType(arg=123) + >>> show_training_arguments(args) + + Example 4: For absl + >>> from absl import flags + >>> flags.DEFINE_string("arg", "123", "test") + >>> show_training_arguments(flags.FLAGS) + + Example 5: For multi args + >>> args1 = dict(a=1) + >>> args2 = dict(b=2) + >>> show_training_arguments([args1, args2]) + + """ + if not "SHOW_RUNNING_ARGS" in os.environ: + return + + if os.environ["SHOW_RUNNING_ARGS"].lower() in ["0", "f", "false"]: + return + + if "LOCAL_RANK" in os.environ: + if os.environ["LOCAL_RANK"] != "0": + return + args = _obj_to_dict(args) + args = json.dumps(args, default=json_dump_obj) + print("[RunningArguments]", args) + + +if __name__ == '__main__': + os.environ["SHOW_RUNNING_ARGS"] = "1" + show_training_arguments([dict(a=1), dict(b=1), object()]) \ No newline at end of file diff --git a/tests/tools/dltest/setup.py b/tests/tools/dltest/setup.py new file mode 100644 index 000000000..dc81c4daf --- /dev/null +++ b/tests/tools/dltest/setup.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from setuptools import setup, find_packages +from dltest.cli.entry_points import make_execute_path + +setup( + name="dltest", + version="0.1", + description='Iluvatar Corex AI Toolbox', + packages=find_packages(exclude=('examples')), + include_package_data=True, + zip_safe=False, + entry_points = { + 'console_scripts': make_execute_path(), + }, + install_requires=[ + 'psutil' + ] +) -- Gitee From 5a6bb123f9a80a54224b55543b9925f167a6e77c Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 10:49:44 +0800 Subject: [PATCH 02/20] sync fairmot --- tests/executables/fairmot/init_torch.sh | 14 +++++++++++ .../train_fairmot_hrnet32_dist_torch.sh | 23 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/executables/fairmot/init_torch.sh create mode 100644 tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh diff --git a/tests/executables/fairmot/init_torch.sh b/tests/executables/fairmot/init_torch.sh new file mode 100644 index 000000000..6ec2c63c7 --- /dev/null +++ b/tests/executables/fairmot/init_torch.sh @@ -0,0 +1,14 @@ +CURRENT_DIR=$(cd `dirname $0`; pwd) + +ROOT_DIR=${CURRENT_DIR}/../.. 
+ +cd ${ROOT_DIR}/data/datasets +unzip -q MOT17.zip +mkdir MOT17/images && mkdir MOT17/labels_with_ids +mv ./MOT17/train ./MOT17/images/ && mv ./MOT17/test ./MOT17/images/ + +cd ${ROOT_DIR}/cv/multi_object_tracking/fairmot/pytorch/ +pip3 install Cython +pip3 install -r requirements.txt + +python3 src/gen_labels_17.py \ No newline at end of file diff --git a/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh b/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh new file mode 100644 index 000000000..748020898 --- /dev/null +++ b/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh @@ -0,0 +1,23 @@ + +source ../_utils/get_num_devices.sh + +CURRENT_DIR=$(cd `dirname $0`; pwd) +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=3} + +nonstrict_mode_args="" +if [ "${RUN_MODE}" != "strict" ]; then + nonstrict_mode_args="--num_epochs 1 --num_iters 300" +fi + +ROOT_DIR=${CURRENT_DIR}/../.. +DADASAT_PATH=${ROOT_DIR}/data/datasets/MOT17 + +cd ${ROOT_DIR}/cv/multi_object_tracking/fairmot/pytorch/ + +bash train_hrnet32_mot17.sh --batch_size $((IX_NUM_CUDA_VISIBLE_DEVICES*BATCH_SIZE)) \ + --lr 0.001 \ + --gpus $(seq -s "," 0 $(($IX_NUM_CUDA_VISIBLE_DEVICES-1))) ${nonstrict_mode_args} --target_loss 20 "$@";check_status + +exit ${EXIT_STATUS} -- Gitee From 8ab3678e5269db592fde5e24f98781c1b488ecdf Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 10:56:30 +0800 Subject: [PATCH 03/20] sync maskrcnn --- tests/executables/maskrcnn/init_torch.sh | 17 +++++++++++++ .../train_maskrcnn_resnet50_amp_torch.sh | 25 +++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 tests/executables/maskrcnn/init_torch.sh create mode 100644 tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh diff --git a/tests/executables/maskrcnn/init_torch.sh b/tests/executables/maskrcnn/init_torch.sh new file mode 100644 index 000000000..0e6c0becd --- /dev/null +++ b/tests/executables/maskrcnn/init_torch.sh @@ -0,0 +1,17 @@ +bash ../_utils/init_detection_torch.sh ../_utils + +CURRENT_MODEL_DIR=$(cd `dirname $0`; pwd) +PROJ_DIR="${CURRENT_MODEL_DIR}/../../" +PROJECT_DATA="${PROJ_DIR}/data/datasets" + +if [[ ! 
-d "${PROJECT_DATA}/VOC2012_sample" ]]; then + tar zxf ${PROJECT_DATA}/VOC2012_sample.tgz -C ${PROJECT_DATA} +fi + +cd ${PROJ_DIR}/cv/detection/maskrcnn/pytorch/ +OSNAME=$(cat /proc/version) +# install the requirement +if [[ "${OSNAME}" == *"aarch64"* ]] +then + pip3 install -r requirements_aarch64.txt +fi \ No newline at end of file diff --git a/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh b/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh new file mode 100644 index 000000000..860d6a0a2 --- /dev/null +++ b/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh @@ -0,0 +1,25 @@ +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=1} + +export PYTORCH_DISABLE_VEC_KERNEL=1 +export PT_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT=1 + +OUTPUT_DIR=${PROJECT_DIR}/output/maskrcnn/$0 +if [[ -d ${OUTPUT_DIR} ]]; then + mkdir -p ${OUTPUT_DIR} +fi + + +ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 10 --run_script \ +python3 ${PROJECT_DIR}/cv/detection/maskrcnn/pytorch/train.py \ +--model maskrcnn_resnet50_fpn \ +--data-path ${PROJECT_DIR}/data/datasets/VOC2012_sample \ +--amp \ +--lr 0.001 \ +--batch-size ${BATCH_SIZE} \ +--output-dir ${OUTPUT_DIR} \ +"$@"; check_status + +rm -fr ${OUTPUT_DIR} +exit ${EXIT_STATUS} -- Gitee From 8434d476d2ae46d9a549f3776b3042a7d3ad0778 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 11:00:11 +0800 Subject: [PATCH 04/20] sync mobilenetv3 --- tests/executables/mobilenetv3/init_torch.sh | 15 +++++++++++ .../train_mobilenetv3_large_amp_torch.sh | 25 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 tests/executables/mobilenetv3/init_torch.sh create mode 100644 tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh diff --git a/tests/executables/mobilenetv3/init_torch.sh b/tests/executables/mobilenetv3/init_torch.sh new file mode 100644 index 000000000..e8d2163e7 --- /dev/null +++ b/tests/executables/mobilenetv3/init_torch.sh @@ -0,0 +1,15 @@ +bash ../_utils/init_classification_torch.sh ../_utils + +# determine whether the user is root mode to execute this script +prefix_sudo="" +current_user=$(whoami) +if [ "$current_user" != "root" ]; then + echo "User $current_user need to add sudo permission keywords" + prefix_sudo="sudo" +fi + +echo "prefix_sudo= $prefix_sudo" + +command -v yum >/dev/null && $prefix_sudo yum install -y numactl || $prefix_sudo apt install -y numactl + +pip3 install -r ../../cv/classification/mobilenetv3/pytorch/requirements.txt \ No newline at end of file diff --git a/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh b/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh new file mode 100644 index 000000000..13a46dc79 --- /dev/null +++ b/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh @@ -0,0 +1,25 @@ + +CURRENT_DIR=$(cd `dirname $0`; pwd) +source ../_utils/get_num_devices.sh +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=32} + +ROOT_DIR=${CURRENT_DIR}/../.. 
+DATA_DIR=${ROOT_DIR}/data/datasets/imagenette + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +cd $CURRENT_DIR/../../cv/classification/mobilenetv3/pytorch/ +ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 10 --run_script \ +python3 train.py --model mobilenet_v3_large --data-path "${DATA_DIR}" \ + --epochs 600 --batch-size ${BATCH_SIZE} --opt sgd --lr 0.1 \ + --wd 0.00001 --lr-step-size 2 --lr-gamma 0.973 --auto-augment imagenet --random-erase 0.2 \ + --output-dir . --amp "$@"; check_status +exit ${EXIT_STATUS} -- Gitee From e3d50a1f305c5c2a726fdc87e32d3cc96a159e47 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 11:22:45 +0800 Subject: [PATCH 05/20] sync resnet torch --- .../resnet50/pytorch/_torchvision/__init__.py | 11 - .../_internally_replaced_utils.py | 66 --- .../pytorch/_torchvision/models/__init__.py | 9 - .../pytorch/_torchvision/models/resnet.py | 421 ------------- .../pytorch/_torchvision/ops/__init__.py | 12 - .../resnet50/pytorch/_torchvision/ops/misc.py | 171 ------ .../_torchvision/ops/stochastic_depth.py | 74 --- .../resnet50/pytorch/_torchvision/utils.py | 556 ------------------ cv/classification/resnet50/pytorch/utils.py | 160 ----- tests/executables/resnet/init_paddle.sh | 29 + tests/executables/resnet/init_tf.sh | 25 + tests/executables/resnet/init_torch.sh | 7 + .../resnet/train_resnet50_amp_torch.sh | 21 + .../resnet/train_resnet50_dist_paddle.sh | 19 + .../resnet/train_resnet50_dist_tf.sh | 23 + 15 files changed, 124 insertions(+), 1480 deletions(-) delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/__init__.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/_internally_replaced_utils.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/models/__init__.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/models/resnet.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/ops/__init__.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/ops/misc.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/ops/stochastic_depth.py delete mode 100644 cv/classification/resnet50/pytorch/_torchvision/utils.py delete mode 100644 cv/classification/resnet50/pytorch/utils.py create mode 100644 tests/executables/resnet/init_paddle.sh create mode 100644 tests/executables/resnet/init_tf.sh create mode 100644 tests/executables/resnet/init_torch.sh create mode 100644 tests/executables/resnet/train_resnet50_amp_torch.sh create mode 100644 tests/executables/resnet/train_resnet50_dist_paddle.sh create mode 100644 tests/executables/resnet/train_resnet50_dist_tf.sh diff --git a/cv/classification/resnet50/pytorch/_torchvision/__init__.py b/cv/classification/resnet50/pytorch/_torchvision/__init__.py deleted file mode 100644 index 725db7dae..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. 
No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. -from . import models - -__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/cv/classification/resnet50/pytorch/_torchvision/_internally_replaced_utils.py b/cv/classification/resnet50/pytorch/_torchvision/_internally_replaced_utils.py deleted file mode 100644 index fce11d9b5..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/_internally_replaced_utils.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. -import importlib.machinery -import os - -from torch.hub import _get_torch_home - - -_HOME = os.path.join(_get_torch_home(), "datasets", "vision") -_USE_SHARDED_DATASETS = False - - -def _download_file_from_remote_location(fpath: str, url: str) -> None: - pass - - -def _is_remote_location_available() -> bool: - return False - - -try: - from torch.hub import load_state_dict_from_url # noqa: 401 -except ImportError: - from torch.utils.model_zoo import load_url as load_state_dict_from_url # noqa: 401 - - -def _get_extension_path(lib_name): - - lib_dir = os.path.dirname(__file__) - if os.name == "nt": - # Register the main torchvision library location on the default DLL path - import ctypes - import sys - - kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) - with_load_library_flags = hasattr(kernel32, "AddDllDirectory") - prev_error_mode = kernel32.SetErrorMode(0x0001) - - if with_load_library_flags: - kernel32.AddDllDirectory.restype = ctypes.c_void_p - - if sys.version_info >= (3, 8): - os.add_dll_directory(lib_dir) - elif with_load_library_flags: - res = kernel32.AddDllDirectory(lib_dir) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += f' Error adding "{lib_dir}" to the DLL directories.' - raise err - - kernel32.SetErrorMode(prev_error_mode) - - loader_details = (importlib.machinery.ExtensionFileLoader, importlib.machinery.EXTENSION_SUFFIXES) - - extfinder = importlib.machinery.FileFinder(lib_dir, loader_details) - ext_specs = extfinder.find_spec(lib_name) - if ext_specs is None: - raise ImportError - - return ext_specs.origin diff --git a/cv/classification/resnet50/pytorch/_torchvision/models/__init__.py b/cv/classification/resnet50/pytorch/_torchvision/models/__init__.py deleted file mode 100644 index b40d8f4b4..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/models/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. -from .resnet import * diff --git a/cv/classification/resnet50/pytorch/_torchvision/models/resnet.py b/cv/classification/resnet50/pytorch/_torchvision/models/resnet.py deleted file mode 100644 index 2a39be3bc..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/models/resnet.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. -from typing import Type, Any, Callable, Union, List, Optional - -import torch -import torch.nn as nn -from torch import Tensor - -from .._internally_replaced_utils import load_state_dict_from_url -from ..utils import _log_api_usage_once - - -__all__ = [ - "ResNet", - "resnet18", - "resnet34", - "resnet50", - "resnet101", - "resnet152", - "resnext50_32x4d", - "resnext101_32x8d", - "wide_resnet50_2", - "wide_resnet101_2", -] - - -model_urls = { - "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", - "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", - "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth", - "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", - "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", - "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", - "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", - "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", - "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", -} - - -def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: - """3x3 convolution with padding""" - return nn.Conv2d( - in_planes, - out_planes, - kernel_size=3, - stride=stride, - padding=dilation, - groups=groups, - bias=False, - dilation=dilation, - ) - - -def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -class BasicBlock(nn.Module): - expansion: int = 1 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError("BasicBlock only supports groups=1 
and base_width=64") - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. - # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. - - expansion: int = 4 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.0)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Module): - def __init__( - self, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - num_classes: int = 1000, - zero_init_residual: bool = False, - groups: int = 1, - width_per_group: int = 64, - replace_stride_with_dilation: Optional[List[bool]] = None, - norm_layer: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - _log_api_usage_once(self) - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError( - "replace_stride_with_dilation should be None " - f"or a 3-element tuple, got {replace_stride_with_dilation}" - ) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False) - self.bn1 = 
norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2]) - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. - # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] - - def _make_layer( - self, - block: Type[Union[BasicBlock, Bottleneck]], - planes: int, - blocks: int, - stride: int = 1, - dilate: bool = False, - ) -> nn.Sequential: - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - - layers = [] - layers.append( - block( - self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer - ) - ) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append( - block( - self.inplanes, - planes, - groups=self.groups, - base_width=self.base_width, - dilation=self.dilation, - norm_layer=norm_layer, - ) - ) - - return nn.Sequential(*layers) - - def _forward_impl(self, x: Tensor) -> Tensor: - # See note [TorchScript super()] - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool(x) - x = torch.flatten(x, 1) - x = self.fc(x) - - return x - - def forward(self, x: Tensor) -> Tensor: - return self._forward_impl(x) - - -def _resnet( - arch: str, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - pretrained: bool, - progress: bool, - **kwargs: Any, -) -> ResNet: - model = ResNet(block, layers, **kwargs) - if pretrained: - state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) - model.load_state_dict(state_dict) - return model - - -def resnet18(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-18 model from - `"Deep Residual Learning for Image Recognition" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet18", BasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs) - - -def resnet34(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-34 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet34", BasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs) - - -def resnet50(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-50 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) - - -def resnet101(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-101 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet101", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) - - -def resnet152(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNet-152 model from - `"Deep Residual Learning for Image Recognition" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet152", Bottleneck, [3, 8, 36, 3], pretrained, progress, **kwargs) - - -def resnext50_32x4d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNeXt-50 32x4d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs["groups"] = 32 - kwargs["width_per_group"] = 4 - return _resnet("resnext50_32x4d", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) - - -def resnext101_32x8d(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""ResNeXt-101 32x8d model from - `"Aggregated Residual Transformation for Deep Neural Networks" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs["groups"] = 32 - kwargs["width_per_group"] = 8 - return _resnet("resnext101_32x8d", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) - - -def wide_resnet50_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""Wide ResNet-50-2 model from - `"Wide Residual Networks" `_. - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs["width_per_group"] = 64 * 2 - return _resnet("wide_resnet50_2", Bottleneck, [3, 4, 6, 3], pretrained, progress, **kwargs) - - -def wide_resnet101_2(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> ResNet: - r"""Wide ResNet-101-2 model from - `"Wide Residual Networks" `_. - - The model is the same as ResNet except for the bottleneck number of channels - which is twice larger in every block. The number of channels in outer 1x1 - convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 - channels, and in Wide ResNet-50-2 has 2048-1024-2048. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - kwargs["width_per_group"] = 64 * 2 - return _resnet("wide_resnet101_2", Bottleneck, [3, 4, 23, 3], pretrained, progress, **kwargs) diff --git a/cv/classification/resnet50/pytorch/_torchvision/ops/__init__.py b/cv/classification/resnet50/pytorch/_torchvision/ops/__init__.py deleted file mode 100644 index 5fbaff88c..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/ops/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. -from .misc import FrozenBatchNorm2d, ConvNormActivation, SqueezeExcitation -from .stochastic_depth import stochastic_depth, StochasticDepth - -__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/cv/classification/resnet50/pytorch/_torchvision/ops/misc.py b/cv/classification/resnet50/pytorch/_torchvision/ops/misc.py deleted file mode 100644 index a2c5a37c9..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/ops/misc.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. 
-from typing import Callable, List, Optional - -import torch -from torch import Tensor - -from ..utils import _log_api_usage_once - - -interpolate = torch.nn.functional.interpolate - - -# This is not in nn -class FrozenBatchNorm2d(torch.nn.Module): - """ - BatchNorm2d where the batch statistics and the affine parameters are fixed - - Args: - num_features (int): Number of features ``C`` from an expected input of size ``(N, C, H, W)`` - eps (float): a value added to the denominator for numerical stability. Default: 1e-5 - """ - - def __init__( - self, - num_features: int, - eps: float = 1e-5, - ): - super().__init__() - _log_api_usage_once(self) - self.eps = eps - self.register_buffer("weight", torch.ones(num_features)) - self.register_buffer("bias", torch.zeros(num_features)) - self.register_buffer("running_mean", torch.zeros(num_features)) - self.register_buffer("running_var", torch.ones(num_features)) - - def _load_from_state_dict( - self, - state_dict: dict, - prefix: str, - local_metadata: dict, - strict: bool, - missing_keys: List[str], - unexpected_keys: List[str], - error_msgs: List[str], - ): - num_batches_tracked_key = prefix + "num_batches_tracked" - if num_batches_tracked_key in state_dict: - del state_dict[num_batches_tracked_key] - - super()._load_from_state_dict( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ) - - def forward(self, x: Tensor) -> Tensor: - # move reshapes to the beginning - # to make it fuser-friendly - w = self.weight.reshape(1, -1, 1, 1) - b = self.bias.reshape(1, -1, 1, 1) - rv = self.running_var.reshape(1, -1, 1, 1) - rm = self.running_mean.reshape(1, -1, 1, 1) - scale = w * (rv + self.eps).rsqrt() - bias = b - rm * scale - return x * scale + bias - - def __repr__(self) -> str: - return f"{self.__class__.__name__}({self.weight.shape[0]}, eps={self.eps})" - - -class ConvNormActivation(torch.nn.Sequential): - """ - Configurable block used for Convolution-Normalzation-Activation blocks. - - Args: - in_channels (int): Number of channels in the input image - out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block - kernel_size: (int, optional): Size of the convolving kernel. Default: 3 - stride (int, optional): Stride of the convolution. Default: 1 - padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` - groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 - norm_layer (Callable[..., torch.nn.Module], optional): Norm layer that will be stacked on top of the convolutiuon layer. If ``None`` this layer wont be used. Default: ``torch.nn.BatchNorm2d`` - activation_layer (Callable[..., torch.nn.Module], optinal): Activation function which will be stacked on top of the normalization layer (if not None), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``torch.nn.ReLU`` - dilation (int): Spacing between kernel elements. Default: 1 - inplace (bool): Parameter for the activation layer, which can optionally do the operation in-place. Default ``True`` - bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. 
- - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - kernel_size: int = 3, - stride: int = 1, - padding: Optional[int] = None, - groups: int = 1, - norm_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.BatchNorm2d, - activation_layer: Optional[Callable[..., torch.nn.Module]] = torch.nn.ReLU, - dilation: int = 1, - inplace: Optional[bool] = True, - bias: Optional[bool] = None, - ) -> None: - if padding is None: - padding = (kernel_size - 1) // 2 * dilation - if bias is None: - bias = norm_layer is None - layers = [ - torch.nn.Conv2d( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=dilation, - groups=groups, - bias=bias, - ) - ] - if norm_layer is not None: - layers.append(norm_layer(out_channels)) - if activation_layer is not None: - params = {} if inplace is None else {"inplace": inplace} - layers.append(activation_layer(**params)) - super().__init__(*layers) - _log_api_usage_once(self) - self.out_channels = out_channels - - -class SqueezeExcitation(torch.nn.Module): - """ - This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). - Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. - - Args: - input_channels (int): Number of channels in the input image - squeeze_channels (int): Number of squeeze channels - activation (Callable[..., torch.nn.Module], optional): ``delta`` activation. Default: ``torch.nn.ReLU`` - scale_activation (Callable[..., torch.nn.Module]): ``sigma`` activation. Default: ``torch.nn.Sigmoid`` - """ - - def __init__( - self, - input_channels: int, - squeeze_channels: int, - activation: Callable[..., torch.nn.Module] = torch.nn.ReLU, - scale_activation: Callable[..., torch.nn.Module] = torch.nn.Sigmoid, - ) -> None: - super().__init__() - _log_api_usage_once(self) - self.avgpool = torch.nn.AdaptiveAvgPool2d(1) - self.fc1 = torch.nn.Conv2d(input_channels, squeeze_channels, 1) - self.fc2 = torch.nn.Conv2d(squeeze_channels, input_channels, 1) - self.activation = activation() - self.scale_activation = scale_activation() - - def _scale(self, input: Tensor) -> Tensor: - scale = self.avgpool(input) - scale = self.fc1(scale) - scale = self.activation(scale) - scale = self.fc2(scale) - return self.scale_activation(scale) - - def forward(self, input: Tensor) -> Tensor: - scale = self._scale(input) - return scale * input diff --git a/cv/classification/resnet50/pytorch/_torchvision/ops/stochastic_depth.py b/cv/classification/resnet50/pytorch/_torchvision/ops/stochastic_depth.py deleted file mode 100644 index 167124cbf..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/ops/stochastic_depth.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. 
-import torch -import torch.fx -from torch import nn, Tensor - -from ..utils import _log_api_usage_once - - -def stochastic_depth(input: Tensor, p: float, mode: str, training: bool = True) -> Tensor: - """ - Implements the Stochastic Depth from `"Deep Networks with Stochastic Depth" - `_ used for randomly dropping residual - branches of residual architectures. - - Args: - input (Tensor[N, ...]): The input tensor or arbitrary dimensions with the first one - being its batch i.e. a batch with ``N`` rows. - p (float): probability of the input to be zeroed. - mode (str): ``"batch"`` or ``"row"``. - ``"batch"`` randomly zeroes the entire input, ``"row"`` zeroes - randomly selected rows from the batch. - training: apply stochastic depth if is ``True``. Default: ``True`` - - Returns: - Tensor[N, ...]: The randomly zeroed tensor. - """ - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(stochastic_depth) - if p < 0.0 or p > 1.0: - raise ValueError(f"drop probability has to be between 0 and 1, but got {p}") - if mode not in ["batch", "row"]: - raise ValueError(f"mode has to be either 'batch' or 'row', but got {mode}") - if not training or p == 0.0: - return input - - survival_rate = 1.0 - p - if mode == "row": - size = [input.shape[0]] + [1] * (input.ndim - 1) - else: - size = [1] * input.ndim - noise = torch.empty(size, dtype=input.dtype, device=input.device) - noise = noise.bernoulli_(survival_rate) - if survival_rate > 0.0: - noise.div_(survival_rate) - return input * noise - - -torch.fx.wrap("stochastic_depth") - - -class StochasticDepth(nn.Module): - """ - See :func:`stochastic_depth`. - """ - - def __init__(self, p: float, mode: str) -> None: - super().__init__() - _log_api_usage_once(self) - self.p = p - self.mode = mode - - def forward(self, input: Tensor) -> Tensor: - return stochastic_depth(input, self.p, self.mode, self.training) - - def __repr__(self) -> str: - s = f"{self.__class__.__name__}(p={self.p}, mode={self.mode})" - return s diff --git a/cv/classification/resnet50/pytorch/_torchvision/utils.py b/cv/classification/resnet50/pytorch/_torchvision/utils.py deleted file mode 100644 index 23bb8b54c..000000000 --- a/cv/classification/resnet50/pytorch/_torchvision/utils.py +++ /dev/null @@ -1,556 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. 
-import math -import pathlib -import warnings -from types import FunctionType -from typing import Any, BinaryIO, List, Optional, Tuple, Union - -import numpy as np -import torch -from PIL import Image, ImageColor, ImageDraw, ImageFont - -__all__ = [ - "make_grid", - "save_image", - "draw_bounding_boxes", - "draw_segmentation_masks", - "draw_keypoints", - "flow_to_image", -] - - -@torch.no_grad() -def make_grid( - tensor: Union[torch.Tensor, List[torch.Tensor]], - nrow: int = 8, - padding: int = 2, - normalize: bool = False, - value_range: Optional[Tuple[int, int]] = None, - scale_each: bool = False, - pad_value: float = 0.0, - **kwargs, -) -> torch.Tensor: - """ - Make a grid of images. - - Args: - tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W) - or a list of images all of the same size. - nrow (int, optional): Number of images displayed in each row of the grid. - The final grid size is ``(B / nrow, nrow)``. Default: ``8``. - padding (int, optional): amount of padding. Default: ``2``. - normalize (bool, optional): If True, shift the image to the range (0, 1), - by the min and max values specified by ``value_range``. Default: ``False``. - value_range (tuple, optional): tuple (min, max) where min and max are numbers, - then these numbers are used to normalize the image. By default, min and max - are computed from the tensor. - range (tuple. optional): - .. warning:: - This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``value_range`` - instead. - scale_each (bool, optional): If ``True``, scale each image in the batch of - images separately rather than the (min, max) over all images. Default: ``False``. - pad_value (float, optional): Value for the padded pixels. Default: ``0``. - - Returns: - grid (Tensor): the tensor containing grid of images. - """ - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(make_grid) - if not (torch.is_tensor(tensor) or (isinstance(tensor, list) and all(torch.is_tensor(t) for t in tensor))): - raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}") - - if "range" in kwargs.keys(): - warnings.warn( - "The parameter 'range' is deprecated since 0.12 and will be removed in 0.14. " - "Please use 'value_range' instead." - ) - value_range = kwargs["range"] - - # if list of tensors, convert to a 4D mini-batch Tensor - if isinstance(tensor, list): - tensor = torch.stack(tensor, dim=0) - - if tensor.dim() == 2: # single image H x W - tensor = tensor.unsqueeze(0) - if tensor.dim() == 3: # single image - if tensor.size(0) == 1: # if single-channel, convert to 3-channel - tensor = torch.cat((tensor, tensor, tensor), 0) - tensor = tensor.unsqueeze(0) - - if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images - tensor = torch.cat((tensor, tensor, tensor), 1) - - if normalize is True: - tensor = tensor.clone() # avoid modifying tensor in-place - if value_range is not None: - assert isinstance( - value_range, tuple - ), "value_range has to be a tuple (min, max) if specified. 
min and max are numbers" - - def norm_ip(img, low, high): - img.clamp_(min=low, max=high) - img.sub_(low).div_(max(high - low, 1e-5)) - - def norm_range(t, value_range): - if value_range is not None: - norm_ip(t, value_range[0], value_range[1]) - else: - norm_ip(t, float(t.min()), float(t.max())) - - if scale_each is True: - for t in tensor: # loop over mini-batch dimension - norm_range(t, value_range) - else: - norm_range(tensor, value_range) - - assert isinstance(tensor, torch.Tensor) - if tensor.size(0) == 1: - return tensor.squeeze(0) - - # make the mini-batch of images into a grid - nmaps = tensor.size(0) - xmaps = min(nrow, nmaps) - ymaps = int(math.ceil(float(nmaps) / xmaps)) - height, width = int(tensor.size(2) + padding), int(tensor.size(3) + padding) - num_channels = tensor.size(1) - grid = tensor.new_full((num_channels, height * ymaps + padding, width * xmaps + padding), pad_value) - k = 0 - for y in range(ymaps): - for x in range(xmaps): - if k >= nmaps: - break - # Tensor.copy_() is a valid method but seems to be missing from the stubs - # https://pytorch.org/docs/stable/tensors.html#torch.Tensor.copy_ - grid.narrow(1, y * height + padding, height - padding).narrow( # type: ignore[attr-defined] - 2, x * width + padding, width - padding - ).copy_(tensor[k]) - k = k + 1 - return grid - - -@torch.no_grad() -def save_image( - tensor: Union[torch.Tensor, List[torch.Tensor]], - fp: Union[str, pathlib.Path, BinaryIO], - format: Optional[str] = None, - **kwargs, -) -> None: - """ - Save a given Tensor into an image file. - - Args: - tensor (Tensor or list): Image to be saved. If given a mini-batch tensor, - saves the tensor as a grid of images by calling ``make_grid``. - fp (string or file object): A filename or a file object - format(Optional): If omitted, the format to use is determined from the filename extension. - If a file object was used instead of a filename, this parameter should always be used. - **kwargs: Other arguments are documented in ``make_grid``. - """ - - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(save_image) - grid = make_grid(tensor, **kwargs) - # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer - ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() - im = Image.fromarray(ndarr) - im.save(fp, format=format) - - -@torch.no_grad() -def draw_bounding_boxes( - image: torch.Tensor, - boxes: torch.Tensor, - labels: Optional[List[str]] = None, - colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None, - fill: Optional[bool] = False, - width: int = 1, - font: Optional[str] = None, - font_size: int = 10, -) -> torch.Tensor: - - """ - Draws bounding boxes on given image. - The values of the input image should be uint8 between 0 and 255. - If fill is True, Resulting Tensor should be saved as PNG image. - - Args: - image (Tensor): Tensor of shape (C x H x W) and dtype uint8. - boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax) format. Note that - the boxes are absolute coordinates with respect to the image. In other words: `0 <= xmin < xmax < W` and - `0 <= ymin < ymax < H`. - labels (List[str]): List containing the labels of bounding boxes. - colors (color or list of colors, optional): List containing the colors - of the boxes or single color for all boxes. The color can be represented as - PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. 
- By default, random colors are generated for boxes. - fill (bool): If `True` fills the bounding box with specified color. - width (int): Width of bounding box. - font (str): A filename containing a TrueType font. If the file is not found in this filename, the loader may - also search in other directories, such as the `fonts/` directory on Windows or `/Library/Fonts/`, - `/System/Library/Fonts/` and `~/Library/Fonts/` on macOS. - font_size (int): The requested font size in points. - - Returns: - img (Tensor[C, H, W]): Image Tensor of dtype uint8 with bounding boxes plotted. - """ - - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(draw_bounding_boxes) - if not isinstance(image, torch.Tensor): - raise TypeError(f"Tensor expected, got {type(image)}") - elif image.dtype != torch.uint8: - raise ValueError(f"Tensor uint8 expected, got {image.dtype}") - elif image.dim() != 3: - raise ValueError("Pass individual images, not batches") - elif image.size(0) not in {1, 3}: - raise ValueError("Only grayscale and RGB images are supported") - - num_boxes = boxes.shape[0] - - if labels is None: - labels: Union[List[str], List[None]] = [None] * num_boxes # type: ignore[no-redef] - elif len(labels) != num_boxes: - raise ValueError( - f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. Please specify labels for each box." - ) - - if colors is None: - colors = _generate_color_palette(num_boxes) - elif isinstance(colors, list): - if len(colors) < num_boxes: - raise ValueError(f"Number of colors ({len(colors)}) is less than number of boxes ({num_boxes}). ") - else: # colors specifies a single color for all boxes - colors = [colors] * num_boxes - - colors = [(ImageColor.getrgb(color) if isinstance(color, str) else color) for color in colors] - - # Handle Grayscale images - if image.size(0) == 1: - image = torch.tile(image, (3, 1, 1)) - - ndarr = image.permute(1, 2, 0).cpu().numpy() - img_to_draw = Image.fromarray(ndarr) - img_boxes = boxes.to(torch.int64).tolist() - - if fill: - draw = ImageDraw.Draw(img_to_draw, "RGBA") - else: - draw = ImageDraw.Draw(img_to_draw) - - txt_font = ImageFont.load_default() if font is None else ImageFont.truetype(font=font, size=font_size) - - for bbox, color, label in zip(img_boxes, colors, labels): # type: ignore[arg-type] - if fill: - fill_color = color + (100,) - draw.rectangle(bbox, width=width, outline=color, fill=fill_color) - else: - draw.rectangle(bbox, width=width, outline=color) - - if label is not None: - margin = width + 1 - draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=color, font=txt_font) - - return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8) - - -@torch.no_grad() -def draw_segmentation_masks( - image: torch.Tensor, - masks: torch.Tensor, - alpha: float = 0.8, - colors: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None, -) -> torch.Tensor: - - """ - Draws segmentation masks on given RGB image. - The values of the input image should be uint8 between 0 and 255. - - Args: - image (Tensor): Tensor of shape (3, H, W) and dtype uint8. - masks (Tensor): Tensor of shape (num_masks, H, W) or (H, W) and dtype bool. - alpha (float): Float number between 0 and 1 denoting the transparency of the masks. - 0 means full transparency, 1 means no transparency. - colors (color or list of colors, optional): List containing the colors - of the masks or single color for all masks. The color can be represented as - PIL strings e.g. 
"red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. - By default, random colors are generated for each mask. - - Returns: - img (Tensor[C, H, W]): Image Tensor, with segmentation masks drawn on top. - """ - - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(draw_segmentation_masks) - if not isinstance(image, torch.Tensor): - raise TypeError(f"The image must be a tensor, got {type(image)}") - elif image.dtype != torch.uint8: - raise ValueError(f"The image dtype must be uint8, got {image.dtype}") - elif image.dim() != 3: - raise ValueError("Pass individual images, not batches") - elif image.size()[0] != 3: - raise ValueError("Pass an RGB image. Other Image formats are not supported") - if masks.ndim == 2: - masks = masks[None, :, :] - if masks.ndim != 3: - raise ValueError("masks must be of shape (H, W) or (batch_size, H, W)") - if masks.dtype != torch.bool: - raise ValueError(f"The masks must be of dtype bool. Got {masks.dtype}") - if masks.shape[-2:] != image.shape[-2:]: - raise ValueError("The image and the masks must have the same height and width") - - num_masks = masks.size()[0] - if colors is not None and num_masks > len(colors): - raise ValueError(f"There are more masks ({num_masks}) than colors ({len(colors)})") - - if colors is None: - colors = _generate_color_palette(num_masks) - - if not isinstance(colors, list): - colors = [colors] - if not isinstance(colors[0], (tuple, str)): - raise ValueError("colors must be a tuple or a string, or a list thereof") - if isinstance(colors[0], tuple) and len(colors[0]) != 3: - raise ValueError("It seems that you passed a tuple of colors instead of a list of colors") - - out_dtype = torch.uint8 - - colors_ = [] - for color in colors: - if isinstance(color, str): - color = ImageColor.getrgb(color) - colors_.append(torch.tensor(color, dtype=out_dtype)) - - img_to_draw = image.detach().clone() - # TODO: There might be a way to vectorize this - for mask, color in zip(masks, colors_): - img_to_draw[:, mask] = color[:, None] - - out = image * (1 - alpha) + img_to_draw * alpha - return out.to(out_dtype) - - -@torch.no_grad() -def draw_keypoints( - image: torch.Tensor, - keypoints: torch.Tensor, - connectivity: Optional[List[Tuple[int, int]]] = None, - colors: Optional[Union[str, Tuple[int, int, int]]] = None, - radius: int = 2, - width: int = 3, -) -> torch.Tensor: - - """ - Draws Keypoints on given RGB image. - The values of the input image should be uint8 between 0 and 255. - - Args: - image (Tensor): Tensor of shape (3, H, W) and dtype uint8. - keypoints (Tensor): Tensor of shape (num_instances, K, 2) the K keypoints location for each of the N instances, - in the format [x, y]. - connectivity (List[Tuple[int, int]]]): A List of tuple where, - each tuple contains pair of keypoints to be connected. - colors (str, Tuple): The color can be represented as - PIL strings e.g. "red" or "#FF00FF", or as RGB tuples e.g. ``(240, 10, 157)``. - radius (int): Integer denoting radius of keypoint. - width (int): Integer denoting width of line connecting keypoints. - - Returns: - img (Tensor[C, H, W]): Image Tensor of dtype uint8 with keypoints drawn. 
- """ - - if not torch.jit.is_scripting() and not torch.jit.is_tracing(): - _log_api_usage_once(draw_keypoints) - if not isinstance(image, torch.Tensor): - raise TypeError(f"The image must be a tensor, got {type(image)}") - elif image.dtype != torch.uint8: - raise ValueError(f"The image dtype must be uint8, got {image.dtype}") - elif image.dim() != 3: - raise ValueError("Pass individual images, not batches") - elif image.size()[0] != 3: - raise ValueError("Pass an RGB image. Other Image formats are not supported") - - if keypoints.ndim != 3: - raise ValueError("keypoints must be of shape (num_instances, K, 2)") - - ndarr = image.permute(1, 2, 0).cpu().numpy() - img_to_draw = Image.fromarray(ndarr) - draw = ImageDraw.Draw(img_to_draw) - img_kpts = keypoints.to(torch.int64).tolist() - - for kpt_id, kpt_inst in enumerate(img_kpts): - for inst_id, kpt in enumerate(kpt_inst): - x1 = kpt[0] - radius - x2 = kpt[0] + radius - y1 = kpt[1] - radius - y2 = kpt[1] + radius - draw.ellipse([x1, y1, x2, y2], fill=colors, outline=None, width=0) - - if connectivity: - for connection in connectivity: - start_pt_x = kpt_inst[connection[0]][0] - start_pt_y = kpt_inst[connection[0]][1] - - end_pt_x = kpt_inst[connection[1]][0] - end_pt_y = kpt_inst[connection[1]][1] - - draw.line( - ((start_pt_x, start_pt_y), (end_pt_x, end_pt_y)), - width=width, - ) - - return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8) - - -# Flow visualization code adapted from https://github.com/tomrunia/OpticalFlow_Visualization -@torch.no_grad() -def flow_to_image(flow: torch.Tensor) -> torch.Tensor: - - """ - Converts a flow to an RGB image. - - Args: - flow (Tensor): Flow of shape (N, 2, H, W) or (2, H, W) and dtype torch.float. - - Returns: - img (Tensor): Image Tensor of dtype uint8 where each color corresponds - to a given flow direction. Shape is (N, 3, H, W) or (3, H, W) depending on the input. - """ - - if flow.dtype != torch.float: - raise ValueError(f"Flow should be of dtype torch.float, got {flow.dtype}.") - - orig_shape = flow.shape - if flow.ndim == 3: - flow = flow[None] # Add batch dim - - if flow.ndim != 4 or flow.shape[1] != 2: - raise ValueError(f"Input flow should have shape (2, H, W) or (N, 2, H, W), got {orig_shape}.") - - max_norm = torch.sum(flow ** 2, dim=1).sqrt().max() - epsilon = torch.finfo((flow).dtype).eps - normalized_flow = flow / (max_norm + epsilon) - img = _normalized_flow_to_image(normalized_flow) - - if len(orig_shape) == 3: - img = img[0] # Remove batch dim - return img - - -@torch.no_grad() -def _normalized_flow_to_image(normalized_flow: torch.Tensor) -> torch.Tensor: - - """ - Converts a batch of normalized flow to an RGB image. - - Args: - normalized_flow (torch.Tensor): Normalized flow tensor of shape (N, 2, H, W) - Returns: - img (Tensor(N, 3, H, W)): Flow visualization image of dtype uint8. 
- """ - - N, _, H, W = normalized_flow.shape - flow_image = torch.zeros((N, 3, H, W), dtype=torch.uint8) - colorwheel = _make_colorwheel() # shape [55x3] - num_cols = colorwheel.shape[0] - norm = torch.sum(normalized_flow ** 2, dim=1).sqrt() - a = torch.atan2(-normalized_flow[:, 1, :, :], -normalized_flow[:, 0, :, :]) / torch.pi - fk = (a + 1) / 2 * (num_cols - 1) - k0 = torch.floor(fk).to(torch.long) - k1 = k0 + 1 - k1[k1 == num_cols] = 0 - f = fk - k0 - - for c in range(colorwheel.shape[1]): - tmp = colorwheel[:, c] - col0 = tmp[k0] / 255.0 - col1 = tmp[k1] / 255.0 - col = (1 - f) * col0 + f * col1 - col = 1 - norm * (1 - col) - flow_image[:, c, :, :] = torch.floor(255 * col) - return flow_image - - -def _make_colorwheel() -> torch.Tensor: - """ - Generates a color wheel for optical flow visualization as presented in: - Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) - URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf. - - Returns: - colorwheel (Tensor[55, 3]): Colorwheel Tensor. - """ - - RY = 15 - YG = 6 - GC = 4 - CB = 11 - BM = 13 - MR = 6 - - ncols = RY + YG + GC + CB + BM + MR - colorwheel = torch.zeros((ncols, 3)) - col = 0 - - # RY - colorwheel[0:RY, 0] = 255 - colorwheel[0:RY, 1] = torch.floor(255 * torch.arange(0, RY) / RY) - col = col + RY - # YG - colorwheel[col : col + YG, 0] = 255 - torch.floor(255 * torch.arange(0, YG) / YG) - colorwheel[col : col + YG, 1] = 255 - col = col + YG - # GC - colorwheel[col : col + GC, 1] = 255 - colorwheel[col : col + GC, 2] = torch.floor(255 * torch.arange(0, GC) / GC) - col = col + GC - # CB - colorwheel[col : col + CB, 1] = 255 - torch.floor(255 * torch.arange(CB) / CB) - colorwheel[col : col + CB, 2] = 255 - col = col + CB - # BM - colorwheel[col : col + BM, 2] = 255 - colorwheel[col : col + BM, 0] = torch.floor(255 * torch.arange(0, BM) / BM) - col = col + BM - # MR - colorwheel[col : col + MR, 2] = 255 - torch.floor(255 * torch.arange(MR) / MR) - colorwheel[col : col + MR, 0] = 255 - return colorwheel - - -def _generate_color_palette(num_objects: int): - palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) - return [tuple((i * palette) % 255) for i in range(num_objects)] - - -def _log_api_usage_once(obj: Any) -> None: - - """ - Logs API usage(module and name) within an organization. - In a large ecosystem, it's often useful to track the PyTorch and - TorchVision APIs usage. This API provides the similar functionality to the - logging module in the Python stdlib. It can be used for debugging purpose - to log which methods are used and by default it is inactive, unless the user - manually subscribes a logger via the `SetAPIUsageLogger method `_. - Please note it is triggered only once for the same API call within a process. - It does not collect any data from open-source users since it is no-op by default. - For more information, please refer to - * PyTorch note: https://pytorch.org/docs/stable/notes/large_scale_deployments.html#api-usage-logging; - * Logging policy: https://github.com/pytorch/vision/issues/5052; - - Args: - obj (class instance or method): an object to extract info from. 
- """ - if not obj.__module__.startswith("torchvision"): - return - name = obj.__class__.__name__ - if isinstance(obj, FunctionType): - name = obj.__name__ - torch._C._log_api_usage_once(f"{obj.__module__}.{name}") diff --git a/cv/classification/resnet50/pytorch/utils.py b/cv/classification/resnet50/pytorch/utils.py deleted file mode 100644 index 08ff4fa53..000000000 --- a/cv/classification/resnet50/pytorch/utils.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. -# Copyright Declaration: This software, including all of its code and documentation, -# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX -# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright -# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar -# CoreX. No user of this software shall have any right, ownership or interest in this software and -# any use of this software shall be in compliance with the terms and conditions of the End User -# License Agreement. -from collections import defaultdict, deque, OrderedDict -import copy -import datetime -import hashlib -import time -import torch -import torch.distributed as dist - -import errno -import os - -from common_utils import * - - -def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" - with torch.no_grad(): - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target[None]) - - res = [] - for k in topk: - correct_k = correct[:k].flatten().sum(dtype=torch.float32) - res.append(correct_k * (100.0 / batch_size)) - return res - - -def average_checkpoints(inputs): - """Loads checkpoints from inputs and returns a model with averaged weights. Original implementation taken from: - https://github.com/pytorch/fairseq/blob/a48f235636557b8d3bc4922a6fa90f3a0fa57955/scripts/average_checkpoints.py#L16 - - Args: - inputs (List[str]): An iterable of string paths of checkpoints to load from. - Returns: - A dict of string keys mapping to various values. The 'model' key - from the returned dict should correspond to an OrderedDict mapping - string parameter names to torch Tensors. 
- """ - params_dict = OrderedDict() - params_keys = None - new_state = None - num_models = len(inputs) - for fpath in inputs: - with open(fpath, "rb") as f: - state = torch.load( - f, - map_location=( - lambda s, _: torch.serialization.default_restore_location(s, "cpu") - ), - ) - # Copies over the settings from the first checkpoint - if new_state is None: - new_state = state - model_params = state["model"] - model_params_keys = list(model_params.keys()) - if params_keys is None: - params_keys = model_params_keys - elif params_keys != model_params_keys: - raise KeyError( - "For checkpoint {}, expected list of params: {}, " - "but found: {}".format(f, params_keys, model_params_keys) - ) - for k in params_keys: - p = model_params[k] - if isinstance(p, torch.HalfTensor): - p = p.float() - if k not in params_dict: - params_dict[k] = p.clone() - # NOTE: clone() is needed in case of p is a shared parameter - else: - params_dict[k] += p - averaged_params = OrderedDict() - for k, v in params_dict.items(): - averaged_params[k] = v - if averaged_params[k].is_floating_point(): - averaged_params[k].div_(num_models) - else: - averaged_params[k] //= num_models - new_state["model"] = averaged_params - return new_state - - -def store_model_weights(model, checkpoint_path, checkpoint_key='model', strict=True): - """ - This method can be used to prepare weights files for new models. It receives as - input a model architecture and a checkpoint from the training script and produces - a file with the weights ready for release. - - Examples: - from torchvision import models as M - - # Classification - model = M.mobilenet_v3_large(pretrained=False) - print(store_model_weights(model, './class.pth')) - - # Quantized Classification - model = M.quantization.mobilenet_v3_large(pretrained=False, quantize=False) - model.fuse_model() - model.qconfig = torch.quantization.get_default_qat_qconfig('qnnpack') - _ = torch.quantization.prepare_qat(model, inplace=True) - print(store_model_weights(model, './qat.pth')) - - # Object Detection - model = M.detection.fasterrcnn_mobilenet_v3_large_fpn(pretrained=False, pretrained_backbone=False) - print(store_model_weights(model, './obj.pth')) - - # Segmentation - model = M.segmentation.deeplabv3_mobilenet_v3_large(pretrained=False, pretrained_backbone=False, aux_loss=True) - print(store_model_weights(model, './segm.pth', strict=False)) - - Args: - model (pytorch.nn.Module): The model on which the weights will be loaded for validation purposes. - checkpoint_path (str): The path of the checkpoint we will load. - checkpoint_key (str, optional): The key of the checkpoint where the model weights are stored. - Default: "model". - strict (bool): whether to strictly enforce that the keys - in :attr:`state_dict` match the keys returned by this module's - :meth:`~torch.nn.Module.state_dict` function. Default: ``True`` - - Returns: - output_path (str): The location where the weights are saved. - """ - # Store the new model next to the checkpoint_path - checkpoint_path = os.path.abspath(checkpoint_path) - output_dir = os.path.dirname(checkpoint_path) - - # Deep copy to avoid side-effects on the model object. 
- model = copy.deepcopy(model) - checkpoint = torch.load(checkpoint_path, map_location='cpu') - - # Load the weights to the model to validate that everything works - # and remove unnecessary weights (such as auxiliaries, etc) - model.load_state_dict(checkpoint[checkpoint_key], strict=strict) - - tmp_path = os.path.join(output_dir, str(model.__hash__())) - torch.save(model.state_dict(), tmp_path) - - sha256_hash = hashlib.sha256() - with open(tmp_path, "rb") as f: - # Read and update hash string value in blocks of 4K - for byte_block in iter(lambda: f.read(4096), b""): - sha256_hash.update(byte_block) - hh = sha256_hash.hexdigest() - - output_path = os.path.join(output_dir, "weights-" + str(hh[:8]) + ".pth") - os.replace(tmp_path, output_path) - - return output_path diff --git a/tests/executables/resnet/init_paddle.sh b/tests/executables/resnet/init_paddle.sh new file mode 100644 index 000000000..39528b7a4 --- /dev/null +++ b/tests/executables/resnet/init_paddle.sh @@ -0,0 +1,29 @@ +#!/bin/bash +bash ../_utils/init_classification_paddle.sh ../_utils + +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +pip3 install protobuf==3.20.3 +pip3 install pyyaml + +CUR_DIR=$(cd `dirname $0`; pwd) +PRJ_DIR=${CUR_DIR}/../.. +DATASET_DIR=${PRJ_DIR}/data/datasets + +if [ ! -d "${DATASET_DIR}/flowers102" ]; then + tar zxf ${DATASET_DIR}/flowers102.tgz -C ${DATASET_DIR} +fi + +RESNET_PADDLE_DIR=${PRJ_DIR}/official/cv/classification/resnet/paddle +cd ${RESNET_PADDLE_DIR} +pip3 install -r requirements.txt + +a=$(pip3 show paddlepaddle|awk '/Version:/ {print $NF}'); b=(${a//+/ }); c=(${b//./ }) +if [[ ${c[0]} -eq 2 && ${c[1]} -le 5 ]]; then + rm -rf ppcls && ln -s ppcls_2.5 ppcls + mkdir -p data/datasets + ln -s ${DATASET_DIR}/flowers102 ${RESNET_PADDLE_DIR}/data/datasets/flowers102 +else + rm -rf ppcls && ln -s ppcls_2.6 ppcls + mkdir -p dataset + ln -s ${DATASET_DIR}/flowers102 ${RESNET_PADDLE_DIR}/dataset/flowers102 +fi diff --git a/tests/executables/resnet/init_tf.sh b/tests/executables/resnet/init_tf.sh new file mode 100644 index 000000000..e03a185e2 --- /dev/null +++ b/tests/executables/resnet/init_tf.sh @@ -0,0 +1,25 @@ +bash ../_utils/init_tf_cnn_benckmark.sh ../_utils + +CURRENT_DIR=$(cd `dirname $0`; pwd) +ROOT_DIR=${CURRENT_DIR}/../.. +DATA_DIR=${ROOT_DIR}/data/packages + +# sys_name_str=`uname -a` +# if [[ "${sys_name_str}" =~ "aarch64" ]]; then +# pip3 install ${DATA_DIR}/addons/tensorflow_addons*.whl +# fi + +# pip3 install gin-config tensorflow_addons tensorflow_datasets tensorflow_model_optimization + +pip3 install gin-config tensorflow_datasets tensorflow_model_optimization + +pip3 uninstall -y protobuf +pip3 install "protobuf<4.0.0" + +python_version=$(python3 --version 2>&1 |awk '{print $2}'|awk -F '.' '{printf "%d.%d", $1,$2}') +if [ $python_version == 3.7 ]; then + pip3 install numpy==1.21.6 +else + pip3 install numpy==1.23.3 +fi + diff --git a/tests/executables/resnet/init_torch.sh b/tests/executables/resnet/init_torch.sh new file mode 100644 index 000000000..16ec233f5 --- /dev/null +++ b/tests/executables/resnet/init_torch.sh @@ -0,0 +1,7 @@ +bash ../_utils/init_classification_torch.sh ../_utils + +if [ "$?" 
!= "0" ]; then + exit 1 +fi + +exit 0 \ No newline at end of file diff --git a/tests/executables/resnet/train_resnet50_amp_torch.sh b/tests/executables/resnet/train_resnet50_amp_torch.sh new file mode 100644 index 000000000..65132746b --- /dev/null +++ b/tests/executables/resnet/train_resnet50_amp_torch.sh @@ -0,0 +1,21 @@ + +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=256} + +OUTPUT_DIR=${PROJECT_DIR}/output/resnet/$0 +if [[ -d ${OUTPUT_DIR} ]]; then + mkdir -p ${OUTPUT_DIR} +fi + +ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \ +python3 ${PROJECT_DIR}/cv/classification/resnet50/pytorch/train.py \ +--data-path ${PROJECT_DIR}/data/datasets/imagenette \ +--batch-size ${BATCH_SIZE} \ +--lr 0.01 \ +--amp \ +--output-dir ${OUTPUT_DIR} \ +"$@" ;check_status + +rm -fr ${OUTPUT_DIR} +exit ${EXIT_STATUS} diff --git a/tests/executables/resnet/train_resnet50_dist_paddle.sh b/tests/executables/resnet/train_resnet50_dist_paddle.sh new file mode 100644 index 000000000..e3aaf503c --- /dev/null +++ b/tests/executables/resnet/train_resnet50_dist_paddle.sh @@ -0,0 +1,19 @@ +#!/bin/bash +source ../_utils/global_environment_variables.sh +source ../_utils/set_paddle_environment_variables.sh +source ../_utils/get_num_devices.sh + +OUTPUT_DIR=${PROJECT_DIR}/output/resnet/$0 +if [[ -d ${OUTPUT_DIR} ]]; then + mkdir -p ${OUTPUT_DIR} +fi + +RESNET_PADDLE_DIR=${PROJECT_DIR}/official/cv/classification/resnet/paddle +cd ${RESNET_PADDLE_DIR} + +ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \ +bash run_resnet50_dist.sh \ +"$@" ;check_status + +rm -fr ${OUTPUT_DIR} +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/tests/executables/resnet/train_resnet50_dist_tf.sh b/tests/executables/resnet/train_resnet50_dist_tf.sh new file mode 100644 index 000000000..64fd864c3 --- /dev/null +++ b/tests/executables/resnet/train_resnet50_dist_tf.sh @@ -0,0 +1,23 @@ +CURRENT_DIR=$(cd `dirname $0`; pwd) + +source ../_utils/global_environment_variables.sh + +unset http_proxy +unset https_proxy + +ROOT_DIR=${CURRENT_DIR}/../.. 
+export DATA_DIR=${ROOT_DIR}/data/datasets + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +cd ${ROOT_DIR}/official/cv/classification/resnet/tensorflow +ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 0.01 --run_script \ +bash run_train_resnet50_distributed_imagenette.sh "$@"; check_status + +exit ${EXIT_STATUS} \ No newline at end of file -- Gitee From 7054f99c1643831b00cfe6ba08a143259d386ec7 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 11:25:28 +0800 Subject: [PATCH 06/20] sync resnet tf --- tests/executables/resnet/train_resnet50_dist_tf.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/executables/resnet/train_resnet50_dist_tf.sh b/tests/executables/resnet/train_resnet50_dist_tf.sh index 64fd864c3..e6b121f0a 100644 --- a/tests/executables/resnet/train_resnet50_dist_tf.sh +++ b/tests/executables/resnet/train_resnet50_dist_tf.sh @@ -16,7 +16,7 @@ check_status() fi } -cd ${ROOT_DIR}/official/cv/classification/resnet/tensorflow +cd ${ROOT_DIR}/cv/classification/resnet50/tensorflow/ ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 0.01 --run_script \ bash run_train_resnet50_distributed_imagenette.sh "$@"; check_status -- Gitee From 7500ddec38be9983a1cdd1c91a89092bd73d8d14 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 11:32:09 +0800 Subject: [PATCH 07/20] sync retinanet --- tests/executables/retinanet/init_torch.sh | 1 + .../retinanet/train_retinanet_amp_torch.sh | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 tests/executables/retinanet/init_torch.sh create mode 100644 tests/executables/retinanet/train_retinanet_amp_torch.sh diff --git a/tests/executables/retinanet/init_torch.sh b/tests/executables/retinanet/init_torch.sh new file mode 100644 index 000000000..1f8f08793 --- /dev/null +++ b/tests/executables/retinanet/init_torch.sh @@ -0,0 +1 @@ +bash ../_utils/init_detection_torch.sh ../_utils \ No newline at end of file diff --git a/tests/executables/retinanet/train_retinanet_amp_torch.sh b/tests/executables/retinanet/train_retinanet_amp_torch.sh new file mode 100644 index 000000000..a3431f8f7 --- /dev/null +++ b/tests/executables/retinanet/train_retinanet_amp_torch.sh @@ -0,0 +1,26 @@ +source ../_utils/global_environment_variables.sh +source ../_utils/get_num_devices.sh + +: ${BATCH_SIZE:=8} + +CURRENT_DIR=$(cd `dirname $0`; pwd) +ROOT_DIR=${CURRENT_DIR}/../.. 
+DATA_DIR=${ROOT_DIR}/data/datasets/VOC2012_sample + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 0 --run_script \ +python3 ../../cv/detection/retinanet/pytorch/train.py \ +--model retinanet_resnet50_fpn \ +--lr 0.01 \ +--data-path ${DATA_DIR} \ +--batch-size ${BATCH_SIZE} \ +--amp "$@"; check_status + +exit ${EXIT_STATUS} -- Gitee From 1011445b147e7507cab53fdf73acb839dc070006 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 11:40:48 +0800 Subject: [PATCH 08/20] sync ssd tf --- tests/executables/ssd/init_tf.sh | 67 +++++++++++++++++++++++ tests/executables/ssd/train_ssd_amp_tf.sh | 40 ++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 tests/executables/ssd/init_tf.sh create mode 100644 tests/executables/ssd/train_ssd_amp_tf.sh diff --git a/tests/executables/ssd/init_tf.sh b/tests/executables/ssd/init_tf.sh new file mode 100644 index 000000000..a82a9f8f9 --- /dev/null +++ b/tests/executables/ssd/init_tf.sh @@ -0,0 +1,67 @@ +#!/bin/bash +CUR_DIR=$(cd "$(dirname "$0")";pwd) +PROJECT_ROOT="${CUR_DIR}/../.." +DATASET_DIR="${PROJECT_ROOT}/data/datasets" +MODEL_CPT_DIR="${PROJECT_ROOT}/data/model_zoo/ssd_tf" +VOC_RECORD_DIR="${DATASET_DIR}/tf_ssd_voc_record" +SSD_ROOT="${PROJECT_ROOT}/cv/detection/ssd/tensorflow" + +# determine whether the user is root mode to execute this script +prefix_sudo="" +current_user=$(whoami) +if [ "$current_user" != "root" ]; then + echo "User $current_user need to add sudo permission keywords" + prefix_sudo="sudo" +fi + +echo "prefix_sudo= $prefix_sudo" + +# pip3 install --upgrade tf_slim +pip3 uninstall -y protobuf +pip3 install "protobuf<4.0.0" +source $(cd `dirname $0`; pwd)/../_utils/which_install_tool.sh +if command_exists apt; then + $prefix_sudo apt install -y git numactl +elif command_exists dnf; then + $prefix_sudo dnf install -y git numactl +else + $prefix_sudo yum install -y git numactl +fi + +# Prepare checkpoint +echo "Prepare SSD's checkpoint" +if [ -d "$MODEL_CPT_DIR" ]; then + rm -rf $MODEL_CPT_DIR +fi +mkdir -p $MODEL_CPT_DIR + +echo "Unarchive model checkpoint" +tar -xzvf "${MODEL_CPT_DIR}.tar" -C "${MODEL_CPT_DIR}/../" +if [ -d "$SSD_ROOT/model" ]; then + rm "$SSD_ROOT/model" +fi +ln -s ${MODEL_CPT_DIR} "$SSD_ROOT/model" +echo "Make soft link from ${MODEL_CPT_DIR} to $SSD_ROOT/model" + +# Prepare voc dataset +echo "Start make SSD's dataset" +if [ -d $VOC_RECORD_DIR ]; then + rm -rf $VOC_RECORD_DIR +fi + +mkdir $VOC_RECORD_DIR + +cd $SSD_ROOT +python3 dataset/convert_voc_sample_tfrecords.py \ +--dataset_directory=$DATASET_DIR \ +--output_directory=$VOC_RECORD_DIR \ +--train_splits VOC2012_sample \ +--validation_splits VOC2012_sample + +if [ -d "dataset/tfrecords" ]; then + rm "dataset/tfrecords" +fi + +ln -s $VOC_RECORD_DIR "./dataset/tfrecords" +echo "End make SSD's dataset" +cd $CUR_DIR diff --git a/tests/executables/ssd/train_ssd_amp_tf.sh b/tests/executables/ssd/train_ssd_amp_tf.sh new file mode 100644 index 000000000..e7aa985f3 --- /dev/null +++ b/tests/executables/ssd/train_ssd_amp_tf.sh @@ -0,0 +1,40 @@ +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=16} + +CURTIME=`date --utc +%Y%m%d%H%M%S` +CURRENT_DIR=$(cd `dirname $0`; pwd) +MODEL_NAME=`basename ${CURRENT_DIR}` + +ROOT_DIR="${CURRENT_DIR}/../.." 
+DATASET_PATH="${ROOT_DIR}/data/datasets/imagenette" +MODEL_ZOO="${ROOT_DIR}/data/model_zoo" +WORKSPACE="${ROOT_DIR}/output/${MODEL_NAME}/$0/${CURTIME}" +SRC_DIR="${ROOT_DIR}/cv/detection/ssd/tensorflow" + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +cd ${SRC_DIR} + + +if [[ -d "./logs" ]]; then + rm -rf ./logs +fi + +: ${CUDA_VISIBLE_DEVICES:="0"} +CUDA_VISIBLE_DEVICES=(${CUDA_VISIBLE_DEVICES//,/ }) +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES[0]} + +ixdltest-check --nonstrict_mode_args="--train_epochs ${NONSTRICT_EPOCH}" -b 0. --run_script \ +CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} PYTHONPATH=$PYTHONPATH:${SRC_DIR} python3 train_ssd.py --batch_size ${BATCH_SIZE} --multi_gpu=False \ + --use_amp "$@"; check_status + + +cd - +exit ${EXIT_STATUS} -- Gitee From f6dbca4b1ab1dc054d8b2c2d7026962a5a1800f6 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 11:44:26 +0800 Subject: [PATCH 09/20] sync unet3d --- tests/executables/unet3d/init_torch.sh | 1 + ...ain_unet3d_kits19_stage3_dist_1x8_torch.sh | 50 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 tests/executables/unet3d/init_torch.sh create mode 100644 tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh diff --git a/tests/executables/unet3d/init_torch.sh b/tests/executables/unet3d/init_torch.sh new file mode 100644 index 000000000..eb598caec --- /dev/null +++ b/tests/executables/unet3d/init_torch.sh @@ -0,0 +1 @@ +bash ../_utils/init_segmentation_torch.sh ../_utils \ No newline at end of file diff --git a/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh b/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh new file mode 100644 index 000000000..0172b5720 --- /dev/null +++ b/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh @@ -0,0 +1,50 @@ +# export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +source ../_utils/get_num_devices.sh + +CURRENT_DIR=$(cd `dirname $0`; pwd) +ROOT_DIR=${CURRENT_DIR}/../.. +DATA_DIR=${ROOT_DIR}/data/datasets/kits19/train +RESUME=${ROOT_DIR}/data/model_zoo/unet3d/model_3620_stage3_start.pth + +SEED=1234 +MAX_EPOCHS=4200 +QUALITY_THRESHOLD="0.908" +START_EVAL_AT=3640 +EVALUATE_EVERY=5 +LEARNING_RATE="0.8" +LR_WARMUP_EPOCHS=200 +: ${BATCH_SIZE:=4} +GRADIENT_ACCUMULATION_STEPS=1 +SAVE_CKPT="./ckpt_stage3" +LOG_NAME='train_log_stage3.json' + +cd ../../cv/semantic_segmentation/unet3d/pytorch +if [ ! -d ${SAVE_CKPT} ]; then + mkdir ${SAVE_CKPT}; +fi + +python3 -u -m torch.distributed.launch --nproc_per_node=$IX_NUM_CUDA_VISIBLE_DEVICES \ +main.py --data_dir ${DATA_DIR} \ +--epochs ${MAX_EPOCHS} \ +--evaluate_every ${EVALUATE_EVERY} \ +--start_eval_at ${START_EVAL_AT} \ +--quality_threshold ${QUALITY_THRESHOLD} \ +--batch_size ${BATCH_SIZE} \ +--optimizer sgd \ +--ga_steps ${GRADIENT_ACCUMULATION_STEPS} \ +--learning_rate ${LEARNING_RATE} \ +--seed ${SEED} \ +--lr_warmup_epochs ${LR_WARMUP_EPOCHS} \ +--output-dir ${SAVE_CKPT} \ +--log_name ${LOG_NAME} \ +--resume ${RESUME} +"$@" + +if [ $? 
-eq 0 ];then + echo 'converged to the target value 0.908 of epoch 3820 in full train, stage-wise training succeed' + exit 0 +else + echo 'not converged to the target value, training fail' + exit 1 +fi + -- Gitee From a2ea0478cbc846da24d6d5c63186efa0c715f8cf Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 13:59:21 +0800 Subject: [PATCH 10/20] sync conformer all --- .../conformer/pytorch/bind_pyt.py | 137 +++++ .../pytorch/configs/conformer_lstm.json | 98 ++++ .../conformer/pytorch/dataloader.py | 120 +++++ .../conformer/pytorch/openspeech/README.md | 248 +++++++++ .../conformer/pytorch/openspeech/__init__.py | 33 ++ .../pytorch/openspeech/criterion/__init__.py | 103 ++++ .../criterion/cross_entropy/__init__.py | 21 + .../criterion/cross_entropy/configuration.py | 48 ++ .../criterion/cross_entropy/cross_entropy.py | 126 +++++ .../openspeech/criterion/ctc/__init__.py | 21 + .../openspeech/criterion/ctc/configuration.py | 51 ++ .../pytorch/openspeech/criterion/ctc/ctc.py | 148 ++++++ .../joint_ctc_cross_entropy/__init__.py | 21 + .../joint_ctc_cross_entropy/configuration.py | 63 +++ .../joint_ctc_cross_entropy.py | 117 +++++ .../label_smoothed_cross_entropy/__init__.py | 21 + .../configuration.py | 51 ++ .../label_smoothed_cross_entropy.py | 94 ++++ .../criterion/perplexity/__init__.py | 21 + .../criterion/perplexity/configuration.py | 48 ++ .../criterion/perplexity/perplexity.py | 78 +++ .../criterion/transducer/__init__.py | 21 + .../criterion/transducer/configuration.py | 51 ++ .../criterion/transducer/transducer.py | 91 ++++ .../pytorch/openspeech/data/__init__.py | 71 +++ .../pytorch/openspeech/data/audio/__init__.py | 21 + .../pytorch/openspeech/data/audio/augment.py | 193 +++++++ .../openspeech/data/audio/data_loader.py | 130 +++++ .../pytorch/openspeech/data/audio/dataset.py | 219 ++++++++ .../data/audio/filter_bank/__init__.py | 21 + .../data/audio/filter_bank/configuration.py | 79 +++ .../data/audio/filter_bank/filter_bank.py | 71 +++ .../pytorch/openspeech/data/audio/load.py | 57 +++ .../data/audio/melspectrogram/__init__.py | 21 + .../audio/melspectrogram/configuration.py | 79 +++ .../audio/melspectrogram/melspectrogram.py | 72 +++ .../openspeech/data/audio/mfcc/__init__.py | 21 + .../data/audio/mfcc/configuration.py | 79 +++ .../openspeech/data/audio/mfcc/mfcc.py | 77 +++ .../data/audio/spectrogram/__init__.py | 21 + .../data/audio/spectrogram/configuration.py | 80 +++ .../data/audio/spectrogram/spectrogram.py | 65 +++ .../pytorch/openspeech/data/sampler.py | 95 ++++ .../openspeech/data/text/data_loader.py | 87 ++++ .../pytorch/openspeech/data/text/dataset.py | 77 +++ .../pytorch/openspeech/dataclass/__init__.py | 87 ++++ .../openspeech/dataclass/configurations.py | 481 ++++++++++++++++++ .../openspeech/dataclass/initialize.py | 96 ++++ .../pytorch/openspeech/datasets/__init__.py | 60 +++ .../datasets/librispeech/__init__.py | 21 + .../datasets/librispeech/lit_data_module.py | 169 ++++++ .../librispeech/preprocess/__init__.py | 0 .../librispeech/preprocess/character.py | 116 +++++ .../librispeech/preprocess/preprocess.py | 60 +++ .../librispeech/preprocess/subword.py | 92 ++++ .../pytorch/openspeech/decoders/__init__.py | 27 + .../decoders/lstm_attention_decoder.py | 251 +++++++++ .../openspeech/decoders/openspeech_decoder.py | 42 ++ .../decoders/rnn_transducer_decoder.py | 128 +++++ .../decoders/transformer_decoder.py | 275 ++++++++++ .../transformer_transducer_decoder.py | 156 ++++++ .../pytorch/openspeech/encoders/__init__.py | 32 ++ 
.../openspeech/encoders/conformer_encoder.py | 141 +++++ .../openspeech/encoders/contextnet_encoder.py | 123 +++++ .../encoders/convolutional_lstm_encoder.py | 134 +++++ .../convolutional_transformer_encoder.py | 147 ++++++ .../openspeech/encoders/deepspeech2.py | 123 +++++ .../pytorch/openspeech/encoders/jasper.py | 182 +++++++ .../openspeech/encoders/lstm_encoder.py | 128 +++++ .../openspeech/encoders/openspeech_encoder.py | 72 +++ .../pytorch/openspeech/encoders/quartznet.py | 120 +++++ .../encoders/rnn_transducer_encoder.py | 112 ++++ .../encoders/transformer_encoder.py | 205 ++++++++ .../transformer_transducer_encoder.py | 177 +++++++ .../pytorch/openspeech/lm/__init__.py | 24 + .../pytorch/openspeech/lm/lstm_lm.py | 158 ++++++ .../pytorch/openspeech/lm/openspeech_lm.py | 45 ++ .../pytorch/openspeech/lm/transformer_lm.py | 171 +++++++ .../conformer/pytorch/openspeech/metrics.py | 156 ++++++ .../pytorch/openspeech/models/README.md | 27 + .../pytorch/openspeech/models/__init__.py | 208 ++++++++ .../openspeech/models/conformer/__init__.py | 34 ++ .../models/conformer/configurations.py | 368 ++++++++++++++ .../openspeech/models/conformer/model.py | 328 ++++++++++++ .../openspeech/models/contextnet/__init__.py | 32 ++ .../models/contextnet/configurations.py | 215 ++++++++ .../openspeech/models/contextnet/model.py | 255 ++++++++++ .../openspeech/models/deepspeech2/__init__.py | 28 + .../models/deepspeech2/configurations.py | 71 +++ .../openspeech/models/deepspeech2/model.py | 116 +++++ .../openspeech/models/jasper/__init__.py | 30 ++ .../models/jasper/configurations.py | 135 +++++ .../pytorch/openspeech/models/jasper/model.py | 79 +++ .../models/listen_attend_spell/__init__.py | 36 ++ .../listen_attend_spell/configurations.py | 383 ++++++++++++++ .../models/listen_attend_spell/model.py | 324 ++++++++++++ .../openspeech/models/lstm_lm/__init__.py | 28 + .../models/lstm_lm/configurations.py | 71 +++ .../openspeech/models/lstm_lm/model.py | 62 +++ .../openspeech/models/openspeech_ctc_model.py | 174 +++++++ .../openspeech_encoder_decoder_model.py | 225 ++++++++ .../models/openspeech_language_model.py | 168 ++++++ .../openspeech/models/openspeech_model.py | 188 +++++++ .../models/openspeech_transducer_model.py | 287 +++++++++++ .../openspeech/models/quartznet/__init__.py | 32 ++ .../models/quartznet/configurations.py | 193 +++++++ .../openspeech/models/quartznet/model.py | 103 ++++ .../models/rnn_transducer/__init__.py | 24 + .../models/rnn_transducer/configurations.py | 83 +++ .../openspeech/models/rnn_transducer/model.py | 75 +++ .../models/transformer_lm/__init__.py | 24 + .../models/transformer_lm/configurations.py | 71 +++ .../openspeech/models/transformer_lm/model.py | 62 +++ .../models/transformer_transducer/__init__.py | 24 + .../transformer_transducer/configurations.py | 92 ++++ .../models/transformer_transducer/model.py | 119 +++++ .../pytorch/openspeech/modules/__init__.py | 97 ++++ .../openspeech/modules/add_normalization.py | 44 ++ .../openspeech/modules/additive_attention.py | 63 +++ .../openspeech/modules/batchnorm_relu_rnn.py | 84 +++ .../modules/conformer_attention_module.py | 81 +++ .../openspeech/modules/conformer_block.py | 110 ++++ .../modules/conformer_convolution_module.py | 83 +++ .../modules/conformer_feed_forward_module.py | 73 +++ .../openspeech/modules/contextnet_block.py | 199 ++++++++ .../openspeech/modules/contextnet_module.py | 182 +++++++ .../openspeech/modules/conv2d_extractor.py | 107 ++++ .../openspeech/modules/conv2d_subsampling.py | 71 +++ 
.../pytorch/openspeech/modules/conv_base.py | 38 ++ .../openspeech/modules/conv_group_shuffle.py | 42 ++ .../modules/deepspeech2_extractor.py | 73 +++ .../openspeech/modules/depthwise_conv1d.py | 74 +++ .../modules/dot_product_attention.py | 80 +++ .../pytorch/openspeech/modules/glu.py | 38 ++ .../openspeech/modules/jasper_block.py | 109 ++++ .../openspeech/modules/jasper_subblock.py | 115 +++++ .../modules/location_aware_attention.py | 92 ++++ .../pytorch/openspeech/modules/mask.py | 60 +++ .../pytorch/openspeech/modules/mask_conv1d.py | 89 ++++ .../pytorch/openspeech/modules/mask_conv2d.py | 98 ++++ .../modules/multi_head_attention.py | 89 ++++ .../openspeech/modules/pointwise_conv1d.py | 66 +++ .../openspeech/modules/positional_encoding.py | 50 ++ .../modules/positionwise_feed_forward.py | 47 ++ .../openspeech/modules/quartznet_block.py | 104 ++++ .../openspeech/modules/quartznet_subblock.py | 96 ++++ .../modules/relative_multi_head_attention.py | 121 +++++ .../modules/residual_connection_module.py | 48 ++ .../pytorch/openspeech/modules/swish.py | 37 ++ .../modules/time_channel_separable_conv1d.py | 61 +++ .../modules/transformer_embedding.py | 60 +++ .../openspeech/modules/vgg_extractor.py | 81 +++ .../pytorch/openspeech/modules/wrapper.py | 64 +++ .../pytorch/openspeech/optim/__init__.py | 44 ++ .../pytorch/openspeech/optim/adamp.py | 109 ++++ .../pytorch/openspeech/optim/novograd.py | 127 +++++ .../pytorch/openspeech/optim/optimizer.py | 83 +++ .../pytorch/openspeech/optim/radam.py | 118 +++++ .../openspeech/optim/scheduler/__init__.py | 59 +++ .../optim/scheduler/lr_scheduler.py | 47 ++ .../reduce_lr_on_plateau_scheduler.py | 82 +++ .../scheduler/transformer_lr_scheduler.py | 109 ++++ .../optim/scheduler/tri_stage_lr_scheduler.py | 154 ++++++ .../warmup_reduce_lr_on_plateau_scheduler.py | 102 ++++ .../optim/scheduler/warmup_scheduler.py | 82 +++ .../pytorch/openspeech/search/__init__.py | 28 + .../openspeech/search/beam_search_base.py | 134 +++++ .../openspeech/search/beam_search_ctc.py | 84 +++ .../openspeech/search/beam_search_lstm.py | 154 ++++++ .../search/beam_search_rnn_transducer.py | 156 ++++++ .../search/beam_search_transformer.py | 155 ++++++ .../beam_search_transformer_transducer.py | 156 ++++++ .../openspeech/search/ensemble_search.py | 96 ++++ .../pytorch/openspeech/tokenizers/__init__.py | 75 +++ .../openspeech/tokenizers/aishell/__init__.py | 21 + .../tokenizers/aishell/character.py | 138 +++++ .../tokenizers/ksponspeech/__init__.py | 21 + .../tokenizers/ksponspeech/character.py | 134 +++++ .../tokenizers/ksponspeech/grapheme.py | 134 +++++ .../tokenizers/ksponspeech/subword.py | 100 ++++ .../tokenizers/librispeech/__init__.py | 21 + .../tokenizers/librispeech/character.py | 135 +++++ .../tokenizers/librispeech/subword.py | 95 ++++ .../openspeech/tokenizers/tokenizer.py | 44 ++ .../conformer/pytorch/openspeech/utils.py | 211 ++++++++ .../conformer/pytorch/requirements.txt | 6 + .../conformer/pytorch/run_training.sh | 28 + .../pytorch/test/test_build_conformer.py | 32 ++ .../conformer/pytorch/test/test_dataloader.py | 43 ++ .../conformer/pytorch/train.py | 320 ++++++++++++ .../conformer/pytorch/utils/__init__.py | 24 + .../conformer/pytorch/utils/config.py | 19 + .../conformer/pytorch/utils/dist.py | 203 ++++++++ .../conformer/pytorch/utils/logger.py | 1 + .../conformer/pytorch/utils/misc.py | 20 + tests/executables/conformer/init_torch.sh | 50 ++ ...in_conformer_librispeech_dist_1x8_torch.sh | 31 ++ tests/executables/ssd/init_torch.sh | 63 +++ 
tests/executables/ssd/train_ssd_amp_torch.sh | 25 + 199 files changed, 19651 insertions(+) create mode 100644 audio/speech_recognition/conformer/pytorch/bind_pyt.py create mode 100644 audio/speech_recognition/conformer/pytorch/configs/conformer_lstm.json create mode 100644 audio/speech_recognition/conformer/pytorch/dataloader.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/README.md create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/cross_entropy.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/ctc.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/joint_ctc_cross_entropy.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/label_smoothed_cross_entropy.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/perplexity.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/transducer.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/augment.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/data_loader.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/dataset.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/filter_bank.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/load.py 
create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/melspectrogram.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/mfcc.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/configuration.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/spectrogram.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/sampler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/text/data_loader.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/data/text/dataset.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/dataclass/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/dataclass/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/dataclass/initialize.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/lit_data_module.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/character.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/preprocess.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/subword.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/decoders/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/decoders/lstm_attention_decoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/decoders/openspeech_decoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/decoders/rnn_transducer_decoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_decoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_transducer_decoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/conformer_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/contextnet_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_lstm_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_transformer_encoder.py create mode 100644 
audio/speech_recognition/conformer/pytorch/openspeech/encoders/deepspeech2.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/jasper.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/lstm_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/openspeech_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/quartznet.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/rnn_transducer_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_transducer_encoder.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/lm/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/lm/lstm_lm.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/lm/openspeech_lm.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/lm/transformer_lm.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/metrics.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/README.md create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_ctc_model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_encoder_decoder_model.py create mode 100644 
audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_language_model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_transducer_model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/configurations.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/model.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/add_normalization.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/additive_attention.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/batchnorm_relu_rnn.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_attention_module.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_block.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_convolution_module.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_feed_forward_module.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_block.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_module.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_extractor.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_subsampling.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_base.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_group_shuffle.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/deepspeech2_extractor.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/depthwise_conv1d.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/dot_product_attention.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/glu.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_block.py create 
mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_subblock.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/location_aware_attention.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/mask.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv1d.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv2d.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/multi_head_attention.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/pointwise_conv1d.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/positional_encoding.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/positionwise_feed_forward.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_block.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_subblock.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/relative_multi_head_attention.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/residual_connection_module.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/swish.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/time_channel_separable_conv1d.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/transformer_embedding.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/vgg_extractor.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/modules/wrapper.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/adamp.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/novograd.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/optimizer.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/radam.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/lr_scheduler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/reduce_lr_on_plateau_scheduler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/transformer_lr_scheduler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/tri_stage_lr_scheduler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_reduce_lr_on_plateau_scheduler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_scheduler.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_base.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_ctc.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_lstm.py create mode 100644 
audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_rnn_transducer.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer_transducer.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/search/ensemble_search.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/character.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/character.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/grapheme.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/subword.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/character.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/subword.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/tokenizer.py create mode 100644 audio/speech_recognition/conformer/pytorch/openspeech/utils.py create mode 100644 audio/speech_recognition/conformer/pytorch/requirements.txt create mode 100644 audio/speech_recognition/conformer/pytorch/run_training.sh create mode 100644 audio/speech_recognition/conformer/pytorch/test/test_build_conformer.py create mode 100644 audio/speech_recognition/conformer/pytorch/test/test_dataloader.py create mode 100644 audio/speech_recognition/conformer/pytorch/train.py create mode 100644 audio/speech_recognition/conformer/pytorch/utils/__init__.py create mode 100644 audio/speech_recognition/conformer/pytorch/utils/config.py create mode 100644 audio/speech_recognition/conformer/pytorch/utils/dist.py create mode 100644 audio/speech_recognition/conformer/pytorch/utils/logger.py create mode 100644 audio/speech_recognition/conformer/pytorch/utils/misc.py create mode 100644 tests/executables/conformer/init_torch.sh create mode 100644 tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh create mode 100644 tests/executables/ssd/init_torch.sh create mode 100644 tests/executables/ssd/train_ssd_amp_torch.sh diff --git a/audio/speech_recognition/conformer/pytorch/bind_pyt.py b/audio/speech_recognition/conformer/pytorch/bind_pyt.py new file mode 100644 index 000000000..1a4e44cfa --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/bind_pyt.py @@ -0,0 +1,137 @@ +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import sys +import subprocess +import os +from argparse import ArgumentParser + + +def parse_args(): + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utility that will spawn up " + "multiple distributed processes") + # Optional arguments for the launch helper + parser.add_argument("--nnodes", type=int, default=1, + help="The number of nodes to use for distributed " + "training") + parser.add_argument("--nproc_per_node", type=int, default=1, + help="The number of processes to launch on each node, " + "for GPU training, this is recommended to be set " + "to the number of GPUs in your system so that " + "each process can be bound to a single GPU.") + parser.add_argument("--node_rank", type=int, default=0, + help="The rank of the node for multi-node distributed " + "training") + parser.add_argument("--master_addr", default="127.0.0.1", type=str, + help="Master node (rank 0)'s address, should be either " + "the IP address or the hostname of node 0, for " + "single node multi-proc training, the " + "--master_addr can simply be 127.0.0.1") + parser.add_argument("--master_port", default=29500, type=int, + help="Master node (rank 0)'s free port that needs to " + "be used for communication during distributed " + "training") + parser.add_argument('--no_hyperthreads', action='store_true', + help='Flag to disable binding to hyperthreads') + parser.add_argument('--no_membind', action='store_true', + help='Flag to disable memory binding') + # non-optional arguments for binding + parser.add_argument("--nsockets_per_node", type=int, required=True, + help="Number of CPU sockets on a node") + parser.add_argument("--ncores_per_socket", type=int, required=True, + help="Number of CPU cores per socket") + # positional + parser.add_argument("training_script", type=str, + help="The full path to the single GPU training " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "training script") + + args, unparsed = parser.parse_known_args() + args.training_script_args = unparsed + + return args + + +def get_cuda_visible_devices(gpus=1): + if "CUDA_VISIBLE_DEVICES" in os.environ: + return os.environ['CUDA_VISIBLE_DEVICES'] + return ','.join([str(gpu_id) for gpu_id in range(gpus)]) + + +def main(): + args = parse_args() + + # set PyTorch distributed related environmental variables + current_env = os.environ.copy() + current_env["MASTER_ADDR"] = args.master_addr + current_env["MASTER_PORT"] = str(args.master_port) + current_env["NODE_RANK"] = str(args.node_rank) + current_env["CUDA_VISIBLE_DEVICES"] = get_cuda_visible_devices(args.nproc_per_node) + + gpu_ids = current_env["CUDA_VISIBLE_DEVICES"].split(',') + args.nproc_per_node = len(gpu_ids) + + # world size in terms of number of processes + dist_world_size = args.nproc_per_node * args.nnodes + current_env["WORLD_SIZE"] = str(dist_world_size) + + # variables for numactl binding + NSOCKETS = args.nsockets_per_node + NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + \ + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0) + NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET + + processes = [] + + for local_rank in range(0, args.nproc_per_node): + # each process's rank + dist_rank = args.nproc_per_node * args.node_rank + local_rank + current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) + + # form numactl binding command +
cpu_ranges = [local_rank * NCORES_PER_GPU, + (local_rank + 1) * NCORES_PER_GPU - 1, + local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS), + (local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1] + + numactlargs = [] + if args.no_hyperthreads: + numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ] + else: + numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ] + + if not args.no_membind: + memnode = local_rank // NGPUS_PER_SOCKET + numactlargs += [ "--membind={}".format(memnode) ] + + # spawn the processes + cmd = [ "/usr/bin/numactl" ] \ + + numactlargs \ + + [ sys.executable, + "-u", + args.training_script, + "--local_rank={}".format(local_rank) + ] \ + + args.training_script_args + + process = subprocess.Popen(cmd, env=current_env) + processes.append(process) + + for process in processes: + process.wait() + + +if __name__ == "__main__": + main() diff --git a/audio/speech_recognition/conformer/pytorch/configs/conformer_lstm.json b/audio/speech_recognition/conformer/pytorch/configs/conformer_lstm.json new file mode 100644 index 000000000..87b5a35a3 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/configs/conformer_lstm.json @@ -0,0 +1,98 @@ +{ + "audio": { + "name": "fbank", + "sample_rate": 16000, + "frame_length": 20.0, + "frame_shift": 10.0, + "del_silence": false, + "num_mels": 80, + "apply_spec_augment": true, + "apply_noise_augment": false, + "apply_time_stretch_augment": false, + "apply_joining_augment": false + }, + "augment": { + "apply_spec_augment": false, + "apply_noise_augment": false, + "apply_joining_augment": false, + "apply_time_stretch_augment": false, + "freq_mask_para": 27, + "freq_mask_num": 2, + "time_mask_num": 4, + "noise_dataset_dir": null, + "noise_level": 0.7, + "time_stretch_min_rate": 0.7, + "time_stretch_max_rate": 1.4 + }, + "dataset": { + "dataset": "librispeech", + "dataset_path": "datasets/LibriSpeech", + "train_parts": ["train-clean-100"], + "eval_parts": ["dev-clean"], + "dataset_download": false, + "train_manifest_file": "datasets/LibriSpeech/libri_subword_train_manifest.txt", + "eval_manifest_file": "datasets/LibriSpeech/libri_subword_dev_manifest.txt" + }, + "criterion": { + "criterion_name": "cross_entropy", + "reduction": "mean" + }, + "lr_scheduler": { + "lr": 0.0001, + "scheduler_name": "warmup_reduce_lr_on_plateau", + "lr_patience": 1, + "lr_factor": 0.3, + "peak_lr": 0.0001, + "init_lr": 1e-10, + "warmup_steps": 4000 + }, + "model": { + "model_name": "conformer_lstm", + "encoder_dim": 512, + "num_encoder_layers": 17, + "num_attention_heads": 8, + "feed_forward_expansion_factor": 4, + "conv_expansion_factor": 2, + "input_dropout_p": 0.1, + "feed_forward_dropout_p": 0.1, + "attention_dropout_p": 0.1, + "conv_dropout_p": 0.1, + "conv_kernel_size": 31, + "half_step_residual": true, + "num_decoder_layers": 2, + "decoder_dropout_p": 0.1, + "max_length": 128, + "teacher_forcing_ratio": 1.0, + "rnn_type": "lstm", + "decoder_attn_mechanism": "loc", + "optimizer": "adam" + }, + "trainer": { + "seed": 1, + "accelerator": "ddp", + "accumulate_grad_batches": 1, + "num_workers": 4, + "batch_size": 8, + "check_val_every_n_epoch": 1, + "gradient_clip_val": 5.0, + "logger": "wandb", + "max_epochs": 50, + "save_checkpoint_n_steps": 10000, + "auto_scale_batch_size": "binsearch", + "sampler": "random", + "name": "gpu", + "device": "gpu", + "use_cuda": true, + "auto_select_gpus": true + }, + "tokenizer": { + "sos_token": "", + "eos_token": "", + "pad_token": "", + 
"blank_token": "", + "encoding": "utf-8", + "unit": "libri_subword", + "vocab_size": 1023, + "vocab_path": "model_zoo/sentencepieces" + } +} diff --git a/audio/speech_recognition/conformer/pytorch/dataloader.py b/audio/speech_recognition/conformer/pytorch/dataloader.py new file mode 100644 index 000000000..d3829b44c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/dataloader.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + +import numpy as np +import random + +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler + + +def get_sampler(dataset, sampler_type): + return dict( + random=RandomSampler, + sequential=SequentialSampler, + distributed=DistributedSampler + )[sampler_type.lower()](dataset) + + +class WorkerInitializer(object): + + _instance = None + + def __init__(self, seed): + self.seed = seed + + def __call__(self, idx): + np.random.seed(seed=self.seed + idx) + random.seed(self.seed + idx) + + @classmethod + def default(cls, seed=0): + if cls._instance is None: + cls._instance = cls(seed) + return cls._instance + + +# sampler: Random | Sequential | Distributed +def create_dataloader( + dataset, + batch_size, + worker_init_fn: WorkerInitializer=None, + sampler_type='Random', + pin_memory=True +): + if worker_init_fn is None: + worker_init_fn = WorkerInitializer.default() + sampler = get_sampler(dataset, sampler_type) + dataloader = DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=0 if batch_size <= 8 else 4, + worker_init_fn=worker_init_fn, + pin_memory=pin_memory, + collate_fn=padding_collate_fn + ) + + return dataloader + + +def padding_collate_fn(batch, pad_id: int = 0): + r""" + Functions that pad to the maximum sequence length + + Args: + batch (tuple): tuple contains input and target tensors + pad_id (int): identification of pad token + + Returns: + seqs (torch.FloatTensor): tensor contains input sequences. + target (torch.IntTensor): tensor contains target sequences. 
+ seq_lengths (torch.IntTensor): tensor contains input sequence lengths + target_lengths (torch.IntTensor): tensor contains target sequence lengths + """ + def seq_length_(p): + return len(p[0]) + + def target_length_(p): + return len(p[1]) + + # sort by sequence length for rnn.pack_padded_sequence() + batch = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True) + + seq_lengths = [len(s[0]) for s in batch] + target_lengths = [len(s[1]) - 1 for s in batch] + + max_seq_sample = max(batch, key=seq_length_)[0] + max_target_sample = max(batch, key=target_length_)[1] + + max_seq_size = max_seq_sample.size(0) + max_target_size = len(max_target_sample) + + feat_size = max_seq_sample.size(1) + batch_size = len(batch) + + seqs = torch.zeros(batch_size, max_seq_size, feat_size) + + targets = torch.zeros(batch_size, max_target_size).to(torch.long) + targets.fill_(pad_id) + + for x in range(batch_size): + sample = batch[x] + tensor = sample[0] + target = sample[1] + seq_length = tensor.size(0) + + seqs[x].narrow(0, 0, seq_length).copy_(tensor) + targets[x].narrow(0, 0, len(target)).copy_(torch.LongTensor(target)) + + seq_lengths = torch.IntTensor(seq_lengths) + target_lengths = torch.IntTensor(target_lengths) + + return seqs, targets, seq_lengths, target_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/README.md b/audio/speech_recognition/conformer/pytorch/openspeech/README.md new file mode 100644 index 000000000..2abcfa9ca --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/README.md @@ -0,0 +1,248 @@ +## OpenSpeech Structure. + +``` +. +├── README.md +├── __init__.py +├── criterion +│   ├── __init__.py +│   ├── cross_entropy +│   │   ├── __init__.py +│   │   ├── configuration.py +│   │   └── cross_entropy.py +│   ├── ctc +│   │   ├── __init__.py +│   │   ├── configuration.py +│   │   └── ctc.py +│   ├── joint_ctc_cross_entropy +│   │   ├── __init__.py +│   │   ├── configuration.py +│   │   └── joint_ctc_cross_entropy.py +│   ├── label_smoothed_cross_entropy +│   │   ├── __init__.py +│   │   ├── configuration.py +│   │   └── label_smoothed_cross_entropy.py +│   ├── perplexity +│   │   ├── __init__.py +│   │   ├── configuration.py +│   │   └── perplexity.py +│   └── transducer +│   ├── __init__.py +│   ├── configuration.py +│   └── transducer.py +├── data +│   ├── __init__.py +│   ├── audio +│   │   ├── __init__.py +│   │   ├── augment.py +│   │   ├── data_loader.py +│   │   ├── dataset.py +│   │   ├── filter_bank +│   │   │   ├── __init__.py +│   │   │   ├── configuration.py +│   │   │   └── filter_bank.py +│   │   ├── load.py +│   │   ├── melspectrogram +│   │   │   ├── __init__.py +│   │   │   ├── configuration.py +│   │   │   └── melspectrogram.py +│   │   ├── mfcc +│   │   │   ├── __init__.py +│   │   │   ├── configuration.py +│   │   │   └── mfcc.py +│   │   └── spectrogram +│   │   ├── __init__.py +│   │   ├── configuration.py +│   │   └── spectrogram.py +│   ├── sampler.py +│   └── text +│   ├── data_loader.py +│   └── dataset.py +├── dataclass +│   ├── __init__.py +│   ├── configurations.py +│   └── initialize.py +├── datasets +│   ├── README.md +│   ├── __init__.py +│   ├── aishell +│   │   ├── __init__.py +│   │   ├── lit_data_module.py +│   │   └── preprocess.py +│   ├── ksponspeech +│   │   ├── __init__.py +│   │   ├── lit_data_module.py +│   │   └── preprocess +│   │   ├── __init__.py +│   │   ├── character.py +│   │   ├── grapheme.py +│   │   ├── preprocess.py +│   │   └── subword.py +│   ├── language_model +│   │   ├── 
__init__.py +│   │   └── lit_data_module.py +│   └── librispeech +│   ├── __init__.py +│   ├── lit_data_module.py +│   └── preprocess +│   ├── __init__.py +│   ├── character.py +│   ├── preprocess.py +│   └── subword.py +├── decoders +│   ├── __init__.py +│   ├── lstm_attention_decoder.py +│   ├── openspeech_decoder.py +│   ├── rnn_transducer_decoder.py +│   ├── transformer_decoder.py +│   └── transformer_transducer_decoder.py +├── encoders +│   ├── __init__.py +│   ├── conformer_encoder.py +│   ├── contextnet_encoder.py +│   ├── convolutional_lstm_encoder.py +│   ├── convolutional_transformer_encoder.py +│   ├── deepspeech2.py +│   ├── jasper.py +│   ├── lstm_encoder.py +│   ├── openspeech_encoder.py +│   ├── quartznet.py +│   ├── rnn_transducer_encoder.py +│   ├── transformer_encoder.py +│   └── transformer_transducer_encoder.py +├── lm +│   ├── __init__.py +│   ├── lstm_lm.py +│   ├── openspeech_lm.py +│   └── transformer_lm.py +├── metrics.py +├── models +│   ├── README.md +│   ├── __init__.py +│   ├── conformer +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── contextnet +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── deepspeech2 +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── jasper +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── listen_attend_spell +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── lstm_lm +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── openspeech_ctc_model.py +│   ├── openspeech_encoder_decoder_model.py +│   ├── openspeech_language_model.py +│   ├── openspeech_model.py +│   ├── openspeech_transducer_model.py +│   ├── quartznet +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── rnn_transducer +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── transformer +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   ├── transformer_lm +│   │   ├── __init__.py +│   │   ├── configurations.py +│   │   └── model.py +│   └── transformer_transducer +│   ├── __init__.py +│   ├── configurations.py +│   └── model.py +├── modules +│   ├── __init__.py +│   ├── add_normalization.py +│   ├── additive_attention.py +│   ├── batchnorm_relu_rnn.py +│   ├── conformer_attention_module.py +│   ├── conformer_block.py +│   ├── conformer_convolution_module.py +│   ├── conformer_feed_forward_module.py +│   ├── contextnet_block.py +│   ├── contextnet_module.py +│   ├── conv2d_extractor.py +│   ├── conv2d_subsampling.py +│   ├── conv_base.py +│   ├── conv_group_shuffle.py +│   ├── deepspeech2_extractor.py +│   ├── depthwise_conv1d.py +│   ├── dot_product_attention.py +│   ├── glu.py +│   ├── jasper_block.py +│   ├── jasper_subblock.py +│   ├── location_aware_attention.py +│   ├── mask.py +│   ├── mask_conv1d.py +│   ├── mask_conv2d.py +│   ├── multi_head_attention.py +│   ├── pointwise_conv1d.py +│   ├── positional_encoding.py +│   ├── positionwise_feed_forward.py +│   ├── quartznet_block.py +│   ├── quartznet_subblock.py +│   ├── relative_multi_head_attention.py +│   ├── residual_connection_module.py +│   ├── swish.py +│   ├── time_channel_separable_conv1d.py +│   ├── transformer_embedding.py +│   ├── vgg_extractor.py +│   └── wrapper.py +├── optim +│   ├── __init__.py +│   ├── adamp.py +│   ├── novograd.py +│   ├── optimizer.py +│   ├── radam.py +│   └── scheduler +│   ├── 
__init__.py +│   ├── lr_scheduler.py +│   ├── reduce_lr_on_plateau_scheduler.py +│   ├── transformer_lr_scheduler.py +│   ├── tri_stage_lr_scheduler.py +│   ├── warmup_reduce_lr_on_plateau_scheduler.py +│   └── warmup_scheduler.py +├── search +│   ├── __init__.py +│   ├── beam_search_base.py +│   ├── beam_search_ctc.py +│   ├── beam_search_lstm.py +│   ├── beam_search_rnn_transducer.py +│   ├── beam_search_transformer.py +│   ├── beam_search_transformer_transducer.py +│   └── ensemble_search.py +├── tokenizers +│   ├── __init__.py +│   ├── aishell +│   │   ├── __init__.py +│   │   └── character.py +│   ├── ksponspeech +│   │   ├── __init__.py +│   │   ├── character.py +│   │   ├── grapheme.py +│   │   └── subword.py +│   ├── librispeech +│   │   ├── __init__.py +│   │   ├── character.py +│   │   └── subword.py +│   └── tokenizer.py +└── utils.py +``` diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/__init__.py new file mode 100644 index 000000000..1b66c5495 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/__init__.py @@ -0,0 +1,33 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import openspeech.criterion +import openspeech.datasets +import openspeech.data +import openspeech.dataclass +import openspeech.encoders +import openspeech.decoders +import openspeech.models +import openspeech.search +import openspeech.optim +import openspeech.tokenizers +import openspeech.metrics diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/__init__.py new file mode 100644 index 000000000..0e9e9c734 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/__init__.py @@ -0,0 +1,103 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import importlib + +CRITERION_REGISTRY = dict() +CRITERION_DATACLASS_REGISTRY = dict() + + +def register_criterion(name: str, dataclass=None): + r""" + New criterion types can be added to OpenSpeech with the :func:`register_criterion` function decorator. + + For example:: + @register_criterion('label_smoothed_cross_entropy') + class LabelSmoothedCrossEntropyLoss(nn.Module): + (...) + + .. note:: All criterion must implement the :class:`cls.__name__` interface. 
+ + Args: + name (str): the name of the criterion + dataclass (Optional, str): the dataclass of the criterion (default: None) + """ + + def register_criterion_cls(cls): + if name in CRITERION_REGISTRY: + raise ValueError(f"Cannot register duplicate criterion ({name})") + + CRITERION_REGISTRY[name] = cls + + cls.__dataclass = dataclass + if dataclass is not None: + if name in CRITERION_DATACLASS_REGISTRY: + raise ValueError(f"Cannot register duplicate criterion ({name})") + CRITERION_DATACLASS_REGISTRY[name] = dataclass + + return cls + + return register_criterion_cls + + +criterion_dir = os.path.dirname(__file__) +for file in os.listdir(criterion_dir): + if os.path.isdir(os.path.join(criterion_dir, file)) and not file.startswith('__'): + for subfile in os.listdir(os.path.join(criterion_dir, file)): + path = os.path.join(criterion_dir, file, subfile) + if subfile.endswith(".py"): + python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile + module = importlib.import_module(f"openspeech.criterion.{file}.{python_file}") + continue + + path = os.path.join(criterion_dir, file) + if file.endswith(".py"): + criterion_name = file[: file.find(".py")] if file.endswith(".py") else file + module = importlib.import_module(f"openspeech.criterion.{criterion_name}") + + +from .cross_entropy.configuration import CrossEntropyLossConfigs +from .ctc.configuration import CTCLossConfigs +from .joint_ctc_cross_entropy.configuration import JointCTCCrossEntropyLossConfigs +from .label_smoothed_cross_entropy.configuration import LabelSmoothedCrossEntropyLossConfigs +from .transducer.configuration import TransducerLossConfigs +from .perplexity.perplexity import PerplexityLossConfigs +from .cross_entropy.cross_entropy import CrossEntropyLoss +from .ctc.ctc import CTCLoss +from .joint_ctc_cross_entropy.joint_ctc_cross_entropy import JointCTCCrossEntropyLoss +from .label_smoothed_cross_entropy.label_smoothed_cross_entropy import LabelSmoothedCrossEntropyLoss +from .transducer.transducer import TransducerLoss +from .perplexity.perplexity import Perplexity + +__all__ = [ + "CrossEntropyLossConfigs", + "CTCLossConfigs", + "JointCTCCrossEntropyLossConfigs", + "LabelSmoothedCrossEntropyLossConfigs", + "TransducerLossConfigs", + "CrossEntropyLoss", + "CTCLoss", + "JointCTCCrossEntropyLoss", + "LabelSmoothedCrossEntropyLoss", + "TransducerLoss", +] diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
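For reference, a minimal sketch (not part of the patch itself) of how a criterion registered through `register_criterion` can later be resolved from `CRITERION_REGISTRY`; the `configs`/`tokenizer` objects below are hypothetical stand-ins for the real hydra configuration and tokenizer used in training:

```python
from types import SimpleNamespace

# Assumes the openspeech package from this patch is importable; importing the
# criterion package runs the module scan above, which fills both registries.
from openspeech.criterion import CRITERION_REGISTRY, CRITERION_DATACLASS_REGISTRY

# Hypothetical stand-ins for the hydra config and tokenizer objects.
configs = SimpleNamespace(criterion=SimpleNamespace(reduction="mean"))
tokenizer = SimpleNamespace(pad_id=0)

criterion_cls = CRITERION_REGISTRY["cross_entropy"]            # -> CrossEntropyLoss
configs_cls = CRITERION_DATACLASS_REGISTRY["cross_entropy"]    # -> CrossEntropyLossConfigs
criterion = criterion_cls(configs, tokenizer)
```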
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/configuration.py new file mode 100644 index 000000000..39464bcbb --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/configuration.py @@ -0,0 +1,48 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ...dataclass.configurations import OpenspeechDataclass + + +@dataclass +class CrossEntropyLossConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of a + :class: `~openspeech.criterion.CrossEntropyLoss`. + + It is used to initiate a `CrossEntropyLoss` criterion. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
+ + Configurations: + criterion_name (str): name of criterion (default: cross_entropy) + reduction (str): reduction method of criterion (default: mean) + """ + criterion_name: str = field( + default="cross_entropy", metadata={"help": "Criterion name for training"} + ) + reduction: str = field( + default="mean", metadata={"help": "Reduction method of criterion"} + ) + diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/cross_entropy.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/cross_entropy.py new file mode 100644 index 000000000..0355c5672 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/cross_entropy/cross_entropy.py @@ -0,0 +1,126 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from .. import register_criterion +from ..cross_entropy.configuration import CrossEntropyLossConfigs +from ...tokenizers.tokenizer import Tokenizer + + +@register_criterion("cross_entropy", dataclass=CrossEntropyLossConfigs) +class CrossEntropyLoss(nn.Module): + r""" + The negative log likelihood loss. It is useful to train a classification + problem with `C` classes. + + If provided, the optional argument :attr:`weight` should be a 1D Tensor assigning + weight to each of the classes. This is particularly useful when you have an + unbalanced training set. + + The `input` given through a forward call is expected to contain + log-probabilities of each class. `input` has to be a Tensor of size either + :math:`(minibatch, C)` or :math:`(minibatch, C, d_1, d_2, ..., d_K)` + with :math:`K \geq 1` for the `K`-dimensional case (described later). + + Obtaining log-probabilities in a neural network is easily achieved by + adding a `LogSoftmax` layer in the last layer of your network. + You may use `CrossEntropyLoss` instead, if you prefer not to add an extra + layer. + + The `target` that this loss expects should be a class index in the range :math:`[0, C-1]` + where `C = number of classes`; if `ignore_index` is specified, this loss also accepts + this class index (this index may not necessarily be in the class range). + + The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. 
math:: + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad + l_n = - w_{y_n} x_{n,y_n}, \quad + w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, + + where :math:`x` is the input, :math:`y` is the target, :math:`w` is the weight, and + :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then + + .. math:: + \ell(x, y) = \begin{cases} + \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & + \text{if reduction} = \text{`mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{`sum'.} + \end{cases} + + Can also be used for higher dimension inputs, such as 2D images, by providing + an input of size :math:`(minibatch, C, d_1, d_2, ..., d_K)` with :math:`K \geq 1`, + where :math:`K` is the number of dimensions, and a target of appropriate shape + (see below). In the case of images, it computes NLL loss per-pixel. + + Args: + configs (DictConfig): hydra configuration set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: logits, targets + - logits (torch.FloatTensor): probability distribution value from model and it has a logarithm shape. + The `FloatTensor` of size ``(batch, seq_length, num_classes)`` + - targets (torch.LongTensor): ground-truth encoded to integers which directly point a word in label. + The `LongTensor` of size ``(batch, target_length)`` + + Returns: loss + * loss (float): loss for training + + Examples:: + + >>> B, T1, C, T2 = 3, 128, 4, 10 + >>> loss = CrossEntropyLoss() + >>> inputs = torch.randn(B, T1, C, requires_grad=True) + >>> targets = torch.empty(B, T2, dtype=torch.long).random_(T2) + >>> outputs = loss(inputs, targets) + >>> outputs.backward() + """ + def __init__( + self, + configs, + tokenizer: Tokenizer, + ) -> None: + super(CrossEntropyLoss, self).__init__() + self.cross_entropy_loss = nn.CrossEntropyLoss( + reduction=configs.criterion.reduction, + ignore_index=tokenizer.pad_id, + ) + + def forward(self, logits: Tensor, targets: Tensor) -> Tensor: + max_target_length = targets.size(1) + max_logits_length = logits.size(1) + + if max_logits_length > max_target_length: + logits = logits[:, :max_target_length, :] + elif max_target_length > max_logits_length: + targets = targets[:, :max_logits_length] + + logits = logits.contiguous().view(-1, logits.size(-1)) + + return self.cross_entropy_loss( + logits.contiguous().view(-1, logits.size(-1)), + targets.contiguous().view(-1), + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
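The `forward` of `CrossEntropyLoss` above first trims logits and targets to a common sequence length and then flattens both before calling `nn.CrossEntropyLoss`. A small self-contained sketch of that shape handling, with arbitrary sizes and pad id 0 standing in for `tokenizer.pad_id`:

```python
import torch
import torch.nn as nn

B, T_logits, T_target, C, pad_id = 3, 12, 10, 50, 0   # arbitrary illustration sizes
logits = torch.randn(B, T_logits, C)                  # decoder outputs (batch, seq, classes)
targets = torch.randint(1, C, (B, T_target))          # padded target indices

# Trim both tensors to the shorter of the two lengths, as the wrapper above does.
T = min(T_logits, T_target)
logits, targets = logits[:, :T, :], targets[:, :T]

loss_fn = nn.CrossEntropyLoss(reduction="mean", ignore_index=pad_id)
loss = loss_fn(logits.reshape(-1, C), targets.reshape(-1))  # flatten to (B*T, C) and (B*T,)
```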
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/configuration.py new file mode 100644 index 000000000..83cce6ee8 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/configuration.py @@ -0,0 +1,51 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ...dataclass.configurations import OpenspeechDataclass + + +@dataclass +class CTCLossConfigs(OpenspeechDataclass): + """ + This is the configuration class to store the configuration of + a :class:`~openspeech.criterion.CTCLoss`. + + It is used to initiate a `CTCLoss` criterion. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Configurations: + criterion_name (str): name of criterion. (default: ctc) + reduction (str): reduction method of criterion. (default: mean) + zero_infinity (bool): whether to zero infinite losses and the associated gradients.
(default: True) + """ + criterion_name: str = field( + default="ctc", metadata={"help": "Criterion name for training"} + ) + reduction: str = field( + default="mean", metadata={"help": "Reduction method of criterion"} + ) + zero_infinity: bool = field( + default=True, metadata={"help": "Whether to zero infinite losses and the associated gradients."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/ctc.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/ctc.py new file mode 100644 index 000000000..7e58a38d9 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/ctc/ctc.py @@ -0,0 +1,148 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from .. import register_criterion +from ..ctc.configuration import CTCLossConfigs +from ...tokenizers.tokenizer import Tokenizer + + +@register_criterion("ctc", dataclass=CTCLossConfigs) +class CTCLoss(nn.Module): + r""" + The Connectionist Temporal Classification loss. + + Calculates loss between a continuous (unsegmented) time series and a target sequence. CTCLoss sums over the + probability of possible alignments of input to target, producing a loss value which is differentiable + with respect to each input node. The alignment of input to target is assumed to be "many-to-one", which + limits the length of the target sequence such that it must be :math:`\leq` the input length. + + Args: + configs (DictConfig): hydra configuration set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: log_probs, targets, input_lengths, target_lengths + - Log_probs: Tensor of size :math:`(T, N, C)`, + where :math:`T = \text{input length}`, + :math:`N = \text{batch size}`, and + :math:`C = \text{number of classes (including blank)}`. + The logarithmized probabilities of the outputs (e.g. obtained with + :func:`torch.nn.functional.log_softmax`). + - Targets: Tensor of size :math:`(N, S)` or + :math:`(\operatorname{sum}(\text{target\_lengths}))`, + where :math:`N = \text{batch size}` and + :math:`S = \text{max target length, if shape is } (N, S)`. + It represent the target sequences. Each element in the target + sequence is a class index. And the target index cannot be blank (default=0). + In the :math:`(N, S)` form, targets are padded to the + length of the longest sequence, and stacked. 
+ In the :math:`(\operatorname{sum}(\text{target\_lengths}))` form, + the targets are assumed to be un-padded and + concatenated within 1 dimension. + - Input_lengths: Tuple or tensor of size :math:`(N)`, + where :math:`N = \text{batch size}`. It represent the lengths of the + inputs (must each be :math:`\leq T`). And the lengths are specified + for each sequence to achieve masking under the assumption that sequences + are padded to equal lengths. + - Target_lengths: Tuple or tensor of size :math:`(N)`, + where :math:`N = \text{batch size}`. It represent lengths of the targets. + Lengths are specified for each sequence to achieve masking under the + assumption that sequences are padded to equal lengths. If target shape is + :math:`(N,S)`, target_lengths are effectively the stop index + :math:`s_n` for each target sequence, such that ``target_n = targets[n,0:s_n]`` for + each target in a batch. Lengths must each be :math:`\leq S` + If the targets are given as a 1d tensor that is the concatenation of individual + targets, the target_lengths must add up to the total length of the tensor. + + Returns: loss + * loss (float): loss for training + + Examples:: + + >>> # Target are to be padded + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> N = 16 # Batch size + >>> S = 30 # Target sequence length of longest target in batch (padding length) + >>> S_min = 10 # Minimum target length, for demonstration purposes + >>> + >>> # Initialize random batch of input vectors, for *size = (T,N,C) + >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() + >>> + >>> # Initialize random batch of targets (0 = blank, 1:C = classes) + >>> target = torch.randint(low=1, high=C, size=(N, S), dtype=torch.long) + >>> + >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) + >>> target_lengths = torch.randint(low=S_min, high=S, size=(N,), dtype=torch.long) + >>> ctc_loss = nn.CTCLoss() + >>> loss = ctc_loss(input, target, input_lengths, target_lengths) + >>> loss.backward() + >>> + >>> + >>> # Target are to be un-padded + >>> T = 50 # Input sequence length + >>> C = 20 # Number of classes (including blank) + >>> N = 16 # Batch size + >>> + >>> # Initialize random batch of input vectors, for *size = (T,N,C) + >>> input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_() + >>> input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long) + >>> + >>> # Initialize random batch of targets (0 = blank, 1:C = classes) + >>> target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long) + >>> target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.long) + >>> ctc_loss = CTCLoss() + >>> loss = ctc_loss(input, target, input_lengths, target_lengths) + >>> loss.backward() + + Reference: + A. 
Graves et al.: Connectionist Temporal Classification: + Labelling Unsegmented Sequence Data with Recurrent Neural Networks: + https://www.cs.toronto.edu/~graves/icml_2006.pdf + """ + def __init__( + self, + configs, + tokenizer: Tokenizer, + ) -> None: + super(CTCLoss, self).__init__() + self.ctc_loss = nn.CTCLoss( + blank=tokenizer.blank_id, + reduction=configs.criterion.reduction, + zero_infinity=configs.criterion.zero_infinity, + ) + + def forward( + self, + log_probs: Tensor, + input_lengths: Tensor, + targets: Tensor, + target_lengths: Tensor, + ): + return self.ctc_loss( + log_probs, + targets, + input_lengths, + target_lengths, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/configuration.py new file mode 100644 index 000000000..123558ca5 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/configuration.py @@ -0,0 +1,63 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
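One detail of the `CTCLoss` wrapper above that is easy to miss: its `forward` takes `(log_probs, input_lengths, targets, target_lengths)` and reorders them into the `(log_probs, targets, input_lengths, target_lengths)` order that `nn.CTCLoss` expects. A minimal sketch with arbitrary shapes, using blank id 0 in place of `tokenizer.blank_id`:

```python
import torch
import torch.nn as nn

T, N, C, S = 50, 4, 20, 15                        # input length, batch, classes, max target length
log_probs = torch.randn(T, N, C).log_softmax(2)   # (T, N, C) log-probabilities
targets = torch.randint(1, C, (N, S), dtype=torch.long)   # class indices, blank (0) excluded
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.randint(5, S, (N,), dtype=torch.long)

ctc = nn.CTCLoss(blank=0, reduction="mean", zero_infinity=True)
# nn.CTCLoss argument order: (log_probs, targets, input_lengths, target_lengths)
loss = ctc(log_probs, targets, input_lengths, target_lengths)
```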
IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from dataclasses import dataclass, field
+
+from ...dataclass.configurations import OpenspeechDataclass
+
+
+@dataclass
+class JointCTCCrossEntropyLossConfigs(OpenspeechDataclass):
+    """
+    This is the configuration class to store the configuration of
+    a :class:`~openspeech.criterion.JointCTCCrossEntropyLoss`.
+
+    It is used to initialize a `JointCTCCrossEntropyLoss` criterion.
+
+    Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
+
+    Configurations:
+        criterion_name (str): name of criterion. (default: joint_ctc_cross_entropy)
+        reduction (str): reduction method of criterion. (default: mean)
+        ctc_weight (float): weight of ctc loss for training. (default: 0.3)
+        cross_entropy_weight (float): weight of cross entropy loss for training. (default: 0.7)
+        smoothing (float): ratio of smoothing loss (confidence = 1.0 - smoothing) (default: 0.0)
+        zero_infinity (bool): whether to zero infinite losses and the associated gradients. (default: True)
+    """
+    criterion_name: str = field(
+        default="joint_ctc_cross_entropy", metadata={"help": "Criterion name for training."}
+    )
+    reduction: str = field(
+        default="mean", metadata={"help": "Reduction method of criterion"}
+    )
+    ctc_weight: float = field(
+        default=0.3, metadata={"help": "Weight of ctc loss for training."}
+    )
+    cross_entropy_weight: float = field(
+        default=0.7, metadata={"help": "Weight of cross entropy loss for training."}
+    )
+    smoothing: float = field(
+        default=0.0, metadata={"help": "Ratio of smoothing loss (confidence = 1.0 - smoothing)"}
+    )
+    zero_infinity: bool = field(
+        default=True, metadata={"help": "Whether to zero infinite losses and the associated gradients."}
+    )
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/joint_ctc_cross_entropy.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/joint_ctc_cross_entropy.py
new file mode 100644
index 000000000..dce96ea9d
--- /dev/null
+++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/joint_ctc_cross_entropy/joint_ctc_cross_entropy.py
@@ -0,0 +1,117 @@
+# MIT License
+#
+# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch.nn as nn
+from typing import Tuple
+from torch import Tensor
+
+from .. import register_criterion
+from ..joint_ctc_cross_entropy.configuration import JointCTCCrossEntropyLossConfigs
+from ..label_smoothed_cross_entropy.label_smoothed_cross_entropy import LabelSmoothedCrossEntropyLoss
+from ...tokenizers.tokenizer import Tokenizer
+
+
+@register_criterion("joint_ctc_cross_entropy", dataclass=JointCTCCrossEntropyLossConfigs)
+class JointCTCCrossEntropyLoss(nn.Module):
+    r"""
+    Provides the Joint CTC-CrossEntropy loss function. CTC loss is applied to the logits from the encoder,
+    and cross entropy loss is applied to the logits from the decoder. This joint objective makes the
+    encoder more robust.
+
+    Args:
+        configs (DictConfig): hydra configuration set
+        num_classes (int): the number of classification classes
+        tokenizer (Tokenizer): tokenizer in charge of preparing the inputs for the model.
+
+    Inputs: encoder_logits, logits, output_lengths, targets, target_lengths
+        - encoder_logits (torch.FloatTensor): log-probability distribution produced by the encoder.
+            The `FloatTensor` of size ``(input_length, batch, num_classes)``
+        - logits (torch.FloatTensor): log-probability distribution produced by the model.
+            The `FloatTensor` of size ``(batch, seq_length, num_classes)``
+        - output_lengths (torch.LongTensor): lengths of the model's outputs.
+            The `LongTensor` of size ``(batch)``
+        - targets (torch.LongTensor): ground truth encoded as integers that index words in the label vocabulary.
+            The `LongTensor` of size ``(batch, target_length)``
+        - target_lengths (torch.LongTensor): lengths of the targets.
+ The `LongTensor` of size ``(batch)`` + + Returns: loss, ctc_loss, cross_entropy_loss + - loss (float): loss for training + - ctc_loss (float): ctc loss for training + - cross_entropy_loss (float): cross entropy loss for training + + Reference: + Suyoun Kim et al.: Joint CTC-Attention based End-to-End Speech Recognition using Multi-task Learning: + https://arxiv.org/abs/1609.06773 + """ + + def __init__( + self, + configs, + num_classes: int, + tokenizer: Tokenizer, + ) -> None: + super(JointCTCCrossEntropyLoss, self).__init__() + self.num_classes = num_classes + self.dim = -1 + self.ignore_index = tokenizer.pad_id + self.reduction = configs.criterion.reduction.lower() + self.ctc_weight = configs.criterion.ctc_weight + self.cross_entropy_weight = configs.criterion.cross_entropy_weight + self.ctc_loss = nn.CTCLoss( + blank=tokenizer.blank_id, + reduction=self.reduction, + zero_infinity=configs.criterion.zero_infinity, + ) + if configs.criterion.smoothing > 0.0: + self.cross_entropy_loss = LabelSmoothedCrossEntropyLoss( + configs=configs, + num_classes=num_classes, + tokenizer=tokenizer, + ) + else: + self.cross_entropy_loss = nn.CrossEntropyLoss(reduction=self.reduction, ignore_index=self.ignore_index) + + def forward( + self, + encoder_logits: Tensor, + logits: Tensor, + output_lengths: Tensor, + targets: Tensor, + target_lengths: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor]: + max_target_length = targets.size(1) + max_logits_length = logits.size(1) + + if max_logits_length > max_target_length: + logits = logits[:, :max_target_length, :] + cross_entropy_targets = targets.clone() + elif max_target_length > max_logits_length: + cross_entropy_targets = targets[:, :max_logits_length].clone() + else: + cross_entropy_targets = targets.clone() + + logits = logits.contiguous().view(-1, logits.size(-1)) + + ctc_loss = self.ctc_loss(encoder_logits, targets, output_lengths, target_lengths) + cross_entropy_loss = self.cross_entropy_loss(logits, cross_entropy_targets.contiguous().view(-1)) + loss = cross_entropy_loss * self.cross_entropy_weight + ctc_loss * self.ctc_weight + return loss, ctc_loss, cross_entropy_loss diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/configuration.py new file mode 100644 index 000000000..2ab6afab9 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/configuration.py @@ -0,0 +1,51 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ...dataclass.configurations import OpenspeechDataclass + + +@dataclass +class LabelSmoothedCrossEntropyLossConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.criterion.LabelSmoothedCrossEntropyLoss`. + + It is used to initiated an `LabelSmoothedCrossEntropyLoss` criterion. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Configurations: + criterion_name (str): name of criterion. (default: label_smoothed_cross_entropy) + reduction (str): reduction method of criterion. 
(default: mean) + smoothing (float): ratio of smoothing loss (confidence = 1.0 - smoothing) (default: 0.1) + """ + criterion_name: str = field( + default="label_smoothed_cross_entropy", metadata={"help": "Criterion name for training."} + ) + reduction: str = field( + default="mean", metadata={"help": "Reduction method of criterion"} + ) + smoothing: float = field( + default=0.1, metadata={"help": "Ratio of smoothing loss (confidence = 1.0 - smoothing)"} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/label_smoothed_cross_entropy.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/label_smoothed_cross_entropy.py new file mode 100644 index 000000000..28c9ae6ff --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/label_smoothed_cross_entropy/label_smoothed_cross_entropy.py @@ -0,0 +1,94 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from .. import register_criterion +from ..label_smoothed_cross_entropy.configuration import LabelSmoothedCrossEntropyLossConfigs +from ...tokenizers.tokenizer import Tokenizer + + +@register_criterion("label_smoothed_cross_entropy", dataclass=LabelSmoothedCrossEntropyLossConfigs) +class LabelSmoothedCrossEntropyLoss(nn.Module): + r""" + Label smoothed cross entropy loss function. + + Args: + configs (DictConfig): hydra configuration set + num_classes (int): the number of classfication + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: logits, targets + - **logits** (torch.FloatTensor): probability distribution value from model and it has a logarithm shape. 
+ The `FloatTensor` of size ``(batch, seq_length, num_classes)`` + - **targets** (torch.LongTensor): ground-truth encoded to integers which directly point a word in label + The `LongTensor` of size ``(batch, target_length)`` + + Returns: loss + * loss (float): loss for training + """ + def __init__( + self, + configs, + num_classes: int, + tokenizer: Tokenizer, + ) -> None: + super(LabelSmoothedCrossEntropyLoss, self).__init__() + self.confidence = 1.0 - configs.criterion.smoothing + self.smoothing = configs.criterion.smoothing + self.num_classes = num_classes + self.dim = -1 + self.ignore_index = tokenizer.pad_id + self.reduction = configs.criterion.reduction.lower() + + if self.reduction == 'sum': + self.reduction_method = torch.sum + elif self.reduction == 'mean': + self.reduction_method = torch.mean + else: + raise ValueError(f"Unsupported reduction method {configs.criterion.reduction}") + + def forward(self, logits: Tensor, targets: Tensor) -> Tensor: + # If predict longer than the target size, won't be able to calculate the cross entropy + max_target_length = targets.size(1) + max_logits_length = logits.size(1) + + if max_logits_length > max_target_length: + logits = logits[:, :max_target_length, :] + elif max_target_length > max_logits_length: + targets = targets[:, :max_logits_length] + + logits = logits.contiguous().view(-1, logits.size(-1)) + targets = targets.contiguous().view(-1) + + if self.smoothing > 0.0: + with torch.no_grad(): + label_smoothed = torch.zeros_like(logits) + label_smoothed.fill_(self.smoothing / (self.num_classes - 1)) + label_smoothed.scatter_(1, targets.data.unsqueeze(1), self.confidence) + label_smoothed[targets == self.ignore_index, :] = 0 + return self.reduction_method(-label_smoothed * logits) + + return F.cross_entropy(logits, targets, ignore_index=self.ignore_index, reduction=self.reduction) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
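A minimal, self-contained sketch of the label-smoothing target construction performed in `LabelSmoothedCrossEntropyLoss.forward` above; the class count, smoothing ratio, pad id, and tensor shapes below are assumed values chosen only for illustration.

```python
import torch

# Sketch of the smoothed target distribution built in LabelSmoothedCrossEntropyLoss.forward.
# All sizes below are assumptions for illustration.
num_classes, smoothing, pad_id = 5, 0.1, 0
confidence = 1.0 - smoothing

log_probs = torch.randn(3, num_classes).log_softmax(dim=-1)  # flattened (batch * seq_length, num_classes)
targets = torch.tensor([2, 0, 4])                            # 0 plays the role of the pad id here

label_smoothed = torch.full_like(log_probs, smoothing / (num_classes - 1))
label_smoothed.scatter_(1, targets.unsqueeze(1), confidence)  # put `confidence` on the gold class
label_smoothed[targets == pad_id, :] = 0                      # mask padded positions entirely

loss = torch.mean(-label_smoothed * log_probs)                # reduction="mean"
```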
\ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/configuration.py new file mode 100644 index 000000000..7ce7dfa6b --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/configuration.py @@ -0,0 +1,48 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ...dataclass.configurations import OpenspeechDataclass + + +@dataclass +class PerplexityLossConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of a + :class: `~openspeech.criterion.Perplexity`. + + It is used to initiated an `Perplexity` criterion. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Configurations: + criterion_name (str): name of criterion (default: perplexity) + reduction (str): reduction method of criterion (default: mean) + """ + criterion_name: str = field( + default="perplexity", metadata={"help": "Criterion name for training"} + ) + reduction: str = field( + default="mean", metadata={"help": "Reduction method of criterion"} + ) + diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/perplexity.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/perplexity.py new file mode 100644 index 000000000..f0fcbee8e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/perplexity/perplexity.py @@ -0,0 +1,78 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from torch import Tensor + +from .. import register_criterion +from ..perplexity.configuration import PerplexityLossConfigs +from ...tokenizers.tokenizer import Tokenizer + + +@register_criterion("perplexity", dataclass=PerplexityLossConfigs) +class Perplexity(nn.Module): + r""" + Language model perplexity loss. + Perplexity is the token averaged likelihood. When the averaging options are the + same, it is the exponential of negative log-likelihood. + + Args: + configs (DictConfig): hydra configuration set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: logits, targets + - **logits** (torch.FloatTensor): probability distribution value from model and it has a logarithm shape. + The `FloatTensor` of size ``(batch, seq_length, num_classes)`` + - **targets** (torch.LongTensor): ground-truth encoded to integers which directly point a word in label + The `LongTensor` of size ``(batch, target_length)`` + + Returns: loss + - loss (float): loss for training + """ + def __init__( + self, + configs, + tokenizer: Tokenizer, + ) -> None: + super(Perplexity, self).__init__() + self.cross_entropy_loss = nn.CrossEntropyLoss( + reduction=configs.criterion.reduction, + ignore_index=tokenizer.pad_id, + ) + + def forward(self, logits: Tensor, targets: Tensor) -> Tensor: + max_target_length = targets.size(1) + max_logits_length = logits.size(1) + + if max_logits_length > max_target_length: + logits = logits[:, :max_target_length, :] + elif max_target_length > max_logits_length: + targets = targets[:, :max_logits_length] + + logits = logits.contiguous().view(-1, logits.size(-1)) + + cross_entropy_loss = self.cross_entropy_loss( + logits.contiguous().view(-1, logits.size(-1)), + targets.contiguous().view(-1), + ) + return torch.exp(cross_entropy_loss) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
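To make the `Perplexity` criterion above concrete: it returns the exponential of the token-averaged cross entropy, as in the short sketch below (batch size, sequence length, vocabulary size, and pad id are assumed values).

```python
import torch
import torch.nn as nn

# Assumed toy dimensions: 2 sequences of length 4 over a 10-token vocabulary, pad id 0.
batch, seq_length, num_classes, pad_id = 2, 4, 10, 0
logits = torch.randn(batch, seq_length, num_classes)
targets = torch.randint(1, num_classes, (batch, seq_length))

cross_entropy = nn.CrossEntropyLoss(reduction="mean", ignore_index=pad_id)(
    logits.view(-1, num_classes),
    targets.view(-1),
)
perplexity = torch.exp(cross_entropy)  # what Perplexity.forward returns
```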
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
\ No newline at end of file
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/configuration.py
new file mode 100644
index 000000000..a217a3672
--- /dev/null
+++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/configuration.py
@@ -0,0 +1,51 @@
+# MIT License
+#
+# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from dataclasses import dataclass, field
+
+from ...dataclass.configurations import OpenspeechDataclass
+
+
+@dataclass
+class TransducerLossConfigs(OpenspeechDataclass):
+    """
+    This is the configuration class to store the configuration of
+    a :class:`~openspeech.criterion.TransducerLoss`.
+
+    It is used to initialize a `TransducerLoss` criterion.
+
+    Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`.
+
+    Configurations:
+        criterion_name (str): name of criterion. (default: transducer)
+        reduction (str): reduction method of criterion. (default: mean)
+        gather (bool): reduce memory consumption.
(default: True) + """ + criterion_name: str = field( + default="transducer", metadata={"help": "Criterion name for training."} + ) + reduction: str = field( + default="mean", metadata={"help": "Reduction method of criterion"} + ) + gather: bool = field( + default=True, metadata={"help": "Reduce memory consumption."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/transducer.py b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/transducer.py new file mode 100644 index 000000000..d6ff0535c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/criterion/transducer/transducer.py @@ -0,0 +1,91 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn + +from .. import register_criterion +from ..transducer.configuration import TransducerLossConfigs +from ...utils import WARPRNNT_IMPORT_ERROR +from ...tokenizers.tokenizer import Tokenizer + + +@register_criterion("transducer", dataclass=TransducerLossConfigs) +class TransducerLoss(nn.Module): + r""" + Compute path-aware regularization transducer loss. + + Args: + configs (DictConfig): hydra configuration set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + logits (torch.FloatTensor): Input tensor with shape (N, T, U, V) + where N is the minibatch size, T is the maximum number of + input frames, U is the maximum number of output labels and V is + the vocabulary of labels (including the blank). + targets (torch.IntTensor): Tensor with shape (N, U-1) representing the + reference labels for all samples in the minibatch. + input_lengths (torch.IntTensor): Tensor with shape (N,) representing the + number of frames for each sample in the minibatch. + target_lengths (torch.IntTensor): Tensor with shape (N,) representing the + length of the transcription for each sample in the minibatch. + + Returns: + - loss (torch.FloatTensor): transducer loss + + Reference: + A. 
Graves: Sequence Transduction with Recurrent Neural Networks: + https://arxiv.org/abs/1211.3711.pdf + """ + + def __init__( + self, + configs, + tokenizer: Tokenizer, + ) -> None: + super().__init__() + try: + from warp_rnnt import rnnt_loss + except ImportError: + raise ImportError(WARPRNNT_IMPORT_ERROR) + self.rnnt_loss = rnnt_loss + self.blank_id = tokenizer.blank_id + self.reduction = configs.criterion.reduction + self.gather = configs.criterion.gather + + def forward( + self, + logits: torch.FloatTensor, + targets: torch.IntTensor, + input_lengths: torch.IntTensor, + target_lengths: torch.IntTensor, + ) -> torch.FloatTensor: + return self.rnnt_loss( + logits, + targets, + input_lengths, + target_lengths, + reduction=self.reduction, + blank=self.blank_id, + gather=self.gather, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/__init__.py new file mode 100644 index 000000000..56d4c3f6e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/__init__.py @@ -0,0 +1,71 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import importlib + +AUDIO_FEATURE_TRANSFORM_REGISTRY = dict() +AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY = dict() + + +def register_audio_feature_transform(name: str, dataclass=None): + r""" + New dataset types can be added to OpenSpeech with the :func:`register_dataset` function decorator. + + For example:: + @register_audio_feature_transform("fbank", dataclass=FilterBankConfigs) + class FilterBankFeatureTransform(object): + (...) + + .. note:: All dataset must implement the :class:`cls.__name__` interface. 
+ + Args: + name (str): the name of the dataset + dataclass (Optional, str): the dataclass of the dataset (default: None) + """ + + def register_audio_feature_transform_cls(cls): + if name in AUDIO_FEATURE_TRANSFORM_REGISTRY: + raise ValueError(f"Cannot register duplicate audio ({name})") + + AUDIO_FEATURE_TRANSFORM_REGISTRY[name] = cls + + cls.__dataclass = dataclass + if dataclass is not None: + if name in AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY: + raise ValueError(f"Cannot register duplicate dataclass ({name})") + AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY[name] = dataclass + + return cls + + return register_audio_feature_transform_cls + + +data_dir = os.path.dirname(__file__) +for file in os.listdir(f"{data_dir}/audio"): + if os.path.isdir(f"{data_dir}/audio/{file}") and not file.startswith('__'): + path = f"{data_dir}/audio/{file}" + for module_file in os.listdir(path): + path = os.path.join(path, module_file) + if module_file.endswith(".py"): + module_name = module_file[: module_file.find(".py")] if module_file.endswith(".py") else module_file + module = importlib.import_module(f"openspeech.data.audio.{file}.{module_name}") diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
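The `register_audio_feature_transform` decorator defined above implements a plain name-to-class registry that the dataset later queries by `configs.audio.name`. The snippet below is a hypothetical illustration of that pattern; the `spectrogram_demo` name and the `DemoTransform`/`DemoTransformConfigs` classes are invented for this sketch and it assumes the package is importable as `openspeech`.

```python
from dataclasses import dataclass

from openspeech.data import AUDIO_FEATURE_TRANSFORM_REGISTRY, register_audio_feature_transform


@dataclass
class DemoTransformConfigs:
    name: str = "spectrogram_demo"


@register_audio_feature_transform("spectrogram_demo", dataclass=DemoTransformConfigs)
class DemoTransform:
    def __init__(self, configs) -> None:
        self.configs = configs

    def __call__(self, signal):
        return signal  # identity transform, illustration only


# The dataset resolves the transform class by its registered name.
transform_cls = AUDIO_FEATURE_TRANSFORM_REGISTRY["spectrogram_demo"]
transform = transform_cls(DemoTransformConfigs())
```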
\ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/augment.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/augment.py new file mode 100644 index 000000000..c8baabe04 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/augment.py @@ -0,0 +1,193 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import numpy as np +import random +import logging +import librosa +from torch import Tensor + +from ..audio.load import load_audio + +logger = logging.getLogger(__name__) + + +class SpecAugment(object): + """ + Provides Spec Augment. A simple data augmentation method for speech recognition. + This concept proposed in https://arxiv.org/abs/1904.08779 + + Args: + freq_mask_para (int): maximum frequency masking length + time_mask_num (int): how many times to apply time masking + freq_mask_num (int): how many times to apply frequency masking + + Inputs: feature_vector + - **feature_vector** (torch.FloatTensor): feature vector from audio file. + + Returns: feature_vector: + - **feature_vector**: masked feature vector. + """ + def __init__(self, freq_mask_para: int = 18, time_mask_num: int = 10, freq_mask_num: int = 2) -> None: + self.freq_mask_para = freq_mask_para + self.time_mask_num = time_mask_num + self.freq_mask_num = freq_mask_num + + def __call__(self, feature: Tensor) -> Tensor: + """ Provides SpecAugmentation for audio """ + time_axis_length = feature.size(0) + freq_axis_length = feature.size(1) + time_mask_para = time_axis_length / 20 # Refer to "Specaugment on large scale dataset" paper + + # time mask + for _ in range(self.time_mask_num): + t = int(np.random.uniform(low=0.0, high=time_mask_para)) + t0 = random.randint(0, time_axis_length - t) + feature[t0: t0 + t, :] = 0 + + # freq mask + for _ in range(self.freq_mask_num): + f = int(np.random.uniform(low=0.0, high=self.freq_mask_para)) + f0 = random.randint(0, freq_axis_length - f) + feature[:, f0: f0 + f] = 0 + + return feature + + +class NoiseInjector(object): + """ + Provides noise injection for noise augmentation. 
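A short, hypothetical usage example of the `SpecAugment` transform defined above; the `(time, num_mels)` feature shape is assumed for illustration and the parameter values mirror the class defaults.

```python
import torch

# Assumes the package is importable as `openspeech`; the module path mirrors the file added above.
from openspeech.data.audio.augment import SpecAugment

spec_augment = SpecAugment(freq_mask_para=18, time_mask_num=10, freq_mask_num=2)

feature = torch.randn(500, 80)  # (time, num_mels), e.g. an fbank feature
masked = spec_augment(feature)  # zeroes random time and frequency stripes in place and returns the tensor
```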
+ + The noise augmentation process is as follows: + 1: Randomly sample audios by `noise_size` from dataset + 2: Extract noise from `audio_paths` + 3: Add noise to sound + + Args: + noise_dataset_dir (str): path of noise dataset + sample_rate (int): sampling rate + noise_level (float): level of noise + + Inputs: signal + - **signal**: signal from audio file + + Returns: signal + - **signal**: noise added signal + """ + def __init__( + self, + noise_dataset_dir: str, + sample_rate: int = 16000, + noise_level: float = 0.7, + ) -> None: + if not os.path.exists(noise_dataset_dir): + logger.info("Directory doesn`t exist: {0}".format(noise_dataset_dir)) + raise IOError + + logger.info("Create Noise injector...") + + self.sample_rate = sample_rate + self.noise_level = noise_level + self._load_audio = load_audio + self.audio_paths = self.create_audio_paths(noise_dataset_dir) + self.dataset = self.create_noiseset(noise_dataset_dir) + + logger.info("Create Noise injector complete !!") + + def __call__(self, signal): + noise = np.random.choice(self.dataset) + noise_level = np.random.uniform(0, self.noise_level) + + signal_length = len(signal) + noise_length = len(noise) + + if signal_length >= noise_length: + noise_start = int(np.random.rand() * (signal_length - noise_length)) + noise_end = int(noise_start + noise_length) + signal[noise_start: noise_end] += noise * noise_level + + else: + signal += noise[:signal_length] * noise_level + + return signal + + def create_audio_paths(self, dataset_path) -> list: + audio_paths = list() + noise_audio_paths = os.listdir(dataset_path) + num_noise_audio_data = len(noise_audio_paths) + + for idx in range(num_noise_audio_data): + if noise_audio_paths[idx].endswith('.pcm') \ + or noise_audio_paths[idx].endswith('.wav') \ + or noise_audio_paths[idx].endswith('.flac'): + audio_paths.append(noise_audio_paths[idx]) + + return audio_paths + + def create_noiseset(self, dataset_path): + dataset = list() + + for audio_path in self.audio_paths: + audio_path = os.path.join(dataset_path, audio_path) + noise = self._load_audio(audio_path, self.sample_rate, del_silence=False) + + if noise is not None: + dataset.append(noise) + + return dataset + + +class TimeStretchAugment(object): + """ + Time-stretch an audio series by a fixed rate. 
+ + Inputs: + signal: np.ndarray [shape=(n,)] audio time series + + Returns: + y_stretch: np.ndarray [shape=(round(n/rate),)] audio time series stretched by the specified rate + """ + def __init__(self, min_rate: float = 0.7, max_rate: float = 1.4): + super(TimeStretchAugment, self).__init__() + self.min_rate = min_rate + self.max_rate = max_rate + + def __call__(self, signal: np.array): + return librosa.effects.time_stretch(signal, random.uniform(self.min_rate, self.max_rate)) + + +class JoiningAugment(object): + """ + Data augment by concatenating audio signals + + Inputs: + signal: np.ndarray [shape=(n,)] audio time series + + Returns: signal + - **signal**: concatenated signal + """ + def __init__(self): + super(JoiningAugment, self).__init__() + + def __call__(self, signals: tuple): + return np.concatenate([signal for signal in signals]) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/data_loader.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/data_loader.py new file mode 100644 index 000000000..45a26fa08 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/data_loader.py @@ -0,0 +1,130 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import numpy as np +from typing import Tuple +from torch.utils.data import DataLoader, Sampler + + +def _collate_fn(batch, pad_id: int = 0): + r""" + Functions that pad to the maximum sequence length + + Args: + batch (tuple): tuple contains input and target tensors + pad_id (int): identification of pad token + + Returns: + seqs (torch.FloatTensor): tensor contains input sequences. + target (torch.IntTensor): tensor contains target sequences. 
+ seq_lengths (torch.IntTensor): tensor contains input sequence lengths + target_lengths (torch.IntTensor): tensor contains target sequence lengths + """ + def seq_length_(p): + return len(p[0]) + + def target_length_(p): + return len(p[1]) + + # sort by sequence length for rnn.pack_padded_sequence() + batch = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True) + + seq_lengths = [len(s[0]) for s in batch] + target_lengths = [len(s[1]) - 1 for s in batch] + + max_seq_sample = max(batch, key=seq_length_)[0] + max_target_sample = max(batch, key=target_length_)[1] + + max_seq_size = max_seq_sample.size(0) + max_target_size = len(max_target_sample) + + feat_size = max_seq_sample.size(1) + batch_size = len(batch) + + seqs = torch.zeros(batch_size, max_seq_size, feat_size) + + targets = torch.zeros(batch_size, max_target_size).to(torch.long) + targets.fill_(pad_id) + + for x in range(batch_size): + sample = batch[x] + tensor = sample[0] + target = sample[1] + seq_length = tensor.size(0) + + seqs[x].narrow(0, 0, seq_length).copy_(tensor) + targets[x].narrow(0, 0, len(target)).copy_(torch.LongTensor(target)) + + seq_lengths = torch.IntTensor(seq_lengths) + target_lengths = torch.IntTensor(target_lengths) + + return seqs, targets, seq_lengths, target_lengths + + +class AudioDataLoader(DataLoader): + r""" + Audio Data Loader + + Args: + dataset (torch.utils.data.Dataset): dataset from which to load the data. + num_workers (int): how many subprocesses to use for data loading. + batch_sampler (torch.utils.data.sampler.Sampler): defines the strategy to draw samples from the dataset. + """ + def __init__( + self, + dataset: torch.utils.data.Dataset, + num_workers: int, + batch_sampler: torch.utils.data.sampler.Sampler, + **kwargs, + ) -> None: + super(AudioDataLoader, self).__init__( + dataset=dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + **kwargs, + ) + self.collate_fn = _collate_fn + + +def load_dataset(manifest_file_path: str) -> Tuple[list, list]: + """ + Provides dictionary of filename and labels. + + Args: + manifest_file_path (str): evaluation manifest file path. 
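To show the padding behaviour of `_collate_fn` above on a concrete case, the sketch below builds a tiny hypothetical batch; the feature sizes and token ids are made up, and the transcripts are assumed to already carry their sos/eos ids.

```python
import torch

# Assumes the package is importable as `openspeech`; _collate_fn is defined in data_loader.py above.
from openspeech.data.audio.data_loader import _collate_fn

batch = [
    (torch.randn(120, 80), [1, 5, 9, 2]),  # (feature of shape (time, num_mels), transcript ids)
    (torch.randn(80, 80), [1, 7, 2]),
]

seqs, targets, seq_lengths, target_lengths = _collate_fn(batch, pad_id=0)
# seqs:           (2, 120, 80), zero-padded to the longest feature
# targets:        (2, 4), padded with pad_id
# seq_lengths:    [120, 80]
# target_lengths: [3, 2]  (one less than the transcript length, as computed above)
```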
+ + Returns: target_dict + * target_dict (dict): dictionary of filename and labels + """ + audio_paths = list() + transcripts = list() + + with open(manifest_file_path) as f: + for idx, line in enumerate(f.readlines()): + audio_path, korean_transcript, transcript = line.split('\t') + transcript = transcript.replace('\n', '') + + audio_paths.append(audio_path) + transcripts.append(transcript) + + return audio_paths, transcripts diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/dataset.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/dataset.py new file mode 100644 index 000000000..7deeecf35 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/dataset.py @@ -0,0 +1,219 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import random +import torch +import numpy as np +import logging +from torch import Tensor +from torch.utils.data import Dataset + +from openspeech.data import AUDIO_FEATURE_TRANSFORM_REGISTRY +from openspeech.data.audio.augment import JoiningAugment, NoiseInjector, SpecAugment, TimeStretchAugment +from openspeech.data.audio.load import load_audio + +logger = logging.getLogger(__name__) + + +class SpeechToTextDataset(Dataset): + r""" + Dataset for audio & transcript matching + + Note: + Do not use this class directly, use one of the sub classes. 
+ + Args: + dataset_path (str): path of librispeech dataset + audio_paths (list): list of audio path + transcripts (list): list of transript + sos_id (int): identification of + eos_id (int): identification of + del_silence (bool): flag indication whether to apply delete silence or not + apply_spec_augment (bool): flag indication whether to apply spec augment or not + apply_noise_augment (bool): flag indication whether to apply noise augment or not + apply_time_stretch_augment (bool): flag indication whether to apply time stretch augment or not + apply_joining_augment (bool): flag indication whether to apply audio joining augment or not + """ + NONE_AUGMENT = 0 + SPEC_AUGMENT = 1 + NOISE_AUGMENT = 2 + TIME_STRETCH = 3 + AUDIO_JOINING = 4 + + def __init__( + self, + configs, + dataset_path: str, + audio_paths: list, + transcripts: list, + sos_id: int = 1, + eos_id: int = 2, + del_silence: bool = False, + apply_spec_augment: bool = False, + apply_noise_augment: bool = False, + apply_time_stretch_augment: bool = False, + apply_joining_augment: bool = False, + ) -> None: + super(SpeechToTextDataset, self).__init__() + self.dataset_path = dataset_path + self.audio_paths = list(audio_paths) + self.transcripts = list(transcripts) + self.augments = [self.NONE_AUGMENT] * len(self.audio_paths) + self.dataset_size = len(self.audio_paths) + self.sos_id = sos_id + self.eos_id = eos_id + self.sample_rate = configs.audio.sample_rate + self.num_mels = configs.audio.num_mels + self.del_silence = del_silence + self.apply_spec_augment = apply_spec_augment + self.apply_noise_augment = apply_noise_augment + self.apply_time_stretch_augment = apply_time_stretch_augment + self.apply_joining_augment = apply_joining_augment + self.transforms = AUDIO_FEATURE_TRANSFORM_REGISTRY[configs.audio.name](configs) + self._load_audio = load_audio + + if self.apply_spec_augment: + self._spec_augment = SpecAugment( + freq_mask_para=configs.augment.freq_mask_para, + freq_mask_num=configs.augment.freq_mask_num, + time_mask_num=configs.augment.time_mask_num, + ) + for idx in range(self.dataset_size): + self.audio_paths.append(self.audio_paths[idx]) + self.transcripts.append(self.transcripts[idx]) + self.augments.append(self.SPEC_AUGMENT) + + if self.apply_noise_augment: + if eval(configs.augment.noise_dataset_dir) is None: + raise ValueError("`noise_dataset_dir` should be contain audio files.") + + self._noise_injector = NoiseInjector( + noise_dataset_dir=configs.augment.noise_dataset_dir, + sample_rate=configs.augment.noise_sample_rate, + noise_level=configs.augment.noise_level, + ) + for idx in range(self.dataset_size): + self.audio_paths.append(self.audio_paths[idx]) + self.transcripts.append(self.transcripts[idx]) + self.augments.append(self.NONE_AUGMENT) + + if self.apply_time_stretch_augment: + self._time_stretch_augment = TimeStretchAugment( + min_rate=configs.time_stretch_min_rate, + max_rate=configs.time_stretch_max_rate, + ) + for idx in range(self.dataset_size): + self.audio_paths.append(self.audio_paths[idx]) + self.transcripts.append(self.transcripts[idx]) + self.augments.append(self.TIME_STRETCH) + + if self.apply_joining_augment: + self._joining_augment = JoiningAugment() + for idx in range(self.dataset_size): + self.audio_paths.append(self.audio_paths[idx]) + self.transcripts.append(self.transcripts[idx]) + self.augments.append(self.AUDIO_JOINING) + + self.total_size = len(self.audio_paths) + + tmp = list(zip(self.audio_paths, self.transcripts, self.augments)) + random.shuffle(tmp) + self.audio_paths, 
self.transcripts, self.augments = zip(*tmp) + + def _parse_audio(self, audio_path: str, augment: int = None, joining_idx: int = 0) -> Tensor: + """ + Parses audio. + + Args: + audio_path (str): path of audio file + augment (int): augmentation identification + + Returns: + feature (np.ndarray): feature extract by sub-class + """ + signal = self._load_audio(audio_path, sample_rate=self.sample_rate, del_silence=self.del_silence) + + if signal is None: + logger.warning(f"{audio_path} is not Valid!!") + return torch.zeros(1000, self.num_mels) + + if augment == self.AUDIO_JOINING: + joining_signal = self._load_audio(self.audio_paths[joining_idx], sample_rate=self.sample_rate) + signal = self._joining_augment((signal, joining_signal)) + + if augment == self.TIME_STRETCH: + signal = self._time_stretch_augment(signal) + + if augment == self.NOISE_AUGMENT: + signal = self._noise_injector(signal) + + feature = self.transforms(signal) + + feature -= feature.mean() + feature /= np.std(feature) + + feature = torch.FloatTensor(feature).transpose(0, 1) + + if augment == self.SPEC_AUGMENT: + feature = self._spec_augment(feature) + + return feature + + def _parse_transcript(self, transcript: str) -> list: + """ + Parses transcript + Args: + transcript (str): transcript of audio file + Returns + transcript (list): transcript that added and tokens + """ + tokens = transcript.split(' ') + transcript = list() + + transcript.append(int(self.sos_id)) + for token in tokens: + transcript.append(int(token)) + transcript.append(int(self.eos_id)) + + return transcript + + def __getitem__(self, idx): + """ Provides paif of audio & transcript """ + audio_path = os.path.join(self.dataset_path, self.audio_paths[idx]) + + if self.augments[idx] == self.AUDIO_JOINING: + joining_idx = random.randint(0, self.total_size) + feature = self._parse_audio(audio_path, self.augments[idx], joining_idx) + transcript = self._parse_transcript(f"{self.transcripts[idx]} {self.transcripts[joining_idx]}") + + else: + feature = self._parse_audio(audio_path, self.augments[idx]) + transcript = self._parse_transcript(self.transcripts[idx]) + + return feature, transcript + + def __len__(self): + return len(self.audio_paths) + + def count(self): + return len(self.audio_paths) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/configuration.py new file mode 100644 index 000000000..c60e5e1b6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/configuration.py @@ -0,0 +1,79 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ....dataclass.configurations import OpenspeechDataclass + + +@dataclass +class FilterBankConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.data.audio.FilterBankFeatureTransform`. + + It is used to initiated an `FilterBankFeatureTransform` feature transform. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + name (str): name of feature transform. (default: fbank) + sample_rate (int): sampling rate of audio (default: 16000) + frame_length (float): frame length for spectrogram (default: 20.0) + frame_shift (float): length of hop between STFT (default: 10.0) + del_silence (bool): flag indication whether to apply delete silence or not (default: False) + num_mels (int): the number of mfc coefficients to retain. 
(default: 80) + apply_spec_augment (bool): flag indication whether to apply spec augment or not (default: True) + apply_noise_augment (bool): flag indication whether to apply noise augment or not (default: False) + apply_time_stretch_augment (bool): flag indication whether to apply time stretch augment or not (default: False) + apply_joining_augment (bool): flag indication whether to apply audio joining augment or not (default: False) + """ + name: str = field( + default="fbank", metadata={"help": "Name of dataset."} + ) + sample_rate: int = field( + default=16000, metadata={"help": "Sampling rate of audio"} + ) + frame_length: float = field( + default=20.0, metadata={"help": "Frame length for spectrogram"} + ) + frame_shift: float = field( + default=10.0, metadata={"help": "Length of hop between STFT"} + ) + del_silence: bool = field( + default=False, metadata={"help": "Flag indication whether to apply delete silence or not"} + ) + num_mels: int = field( + default=80, metadata={"help": "The number of mfc coefficients to retain."} + ) + apply_spec_augment: bool = field( + default=True, metadata={"help": "Flag indication whether to apply spec augment or not"} + ) + apply_noise_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply noise augment or not"} + ) + apply_time_stretch_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply time stretch augment or not"} + ) + apply_joining_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply audio joining augment or not"} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/filter_bank.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/filter_bank.py new file mode 100644 index 000000000..8862117e7 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/filter_bank/filter_bank.py @@ -0,0 +1,71 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import numpy as np +from torch import Tensor + +from ... import register_audio_feature_transform +from ...audio.filter_bank.configuration import FilterBankConfigs +from ....utils import TORCHAUDIO_IMPORT_ERROR + + +@register_audio_feature_transform("fbank", dataclass=FilterBankConfigs) +class FilterBankFeatureTransform(object): + r""" + Create a fbank from a raw audio signal. 
This matches the input/output of Kaldi's compute-fbank-feats. + + Args: + configs (DictConfig): hydra configuraion set + + Inputs: + signal (np.ndarray): signal from audio file. + + Returns: + Tensor: A fbank identical to what Kaldi would output. The shape is ``(seq_length, num_mels)`` + """ + def __init__(self, configs) -> None: + super(FilterBankFeatureTransform, self).__init__() + try: + import torchaudio + except ImportError: + raise ImportError(TORCHAUDIO_IMPORT_ERROR) + self.num_mels = configs.audio.num_mels + self.frame_length = configs.audio.frame_length + self.frame_shift = configs.audio.frame_shift + self.function = torchaudio.compliance.kaldi.fbank + + def __call__(self, signal: np.ndarray) -> np.ndarray: + """ + Provides feature extraction + + Inputs: + signal (np.ndarray): audio signal + + Returns: + feature (np.ndarray): feature extract by sub-class + """ + return self.function( + Tensor(signal).unsqueeze(0), + num_mel_bins=self.num_mels, + frame_length=self.frame_length, + frame_shift=self.frame_shift, + ).transpose(0, 1).numpy() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/load.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/load.py new file mode 100644 index 000000000..032c57d40 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/load.py @@ -0,0 +1,57 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import numpy as np +import librosa +import logging + +logger = logging.getLogger(__name__) + + +def load_audio(audio_path: str, sample_rate: int, del_silence: bool = False) -> np.ndarray: + """ + Load audio file (PCM) to sound. if del_silence is True, Eliminate all sounds below 30dB. + If exception occurs in numpy.memmap(), return None. 
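As a usage sketch of the fbank transform above (torchaudio assumed installed; the one-second sine wave stands in for a decoded utterance), the __call__ path is a single torchaudio.compliance.kaldi.fbank call on a (1, time) tensor:

import numpy as np
import torch
import torchaudio

# Synthetic 16 kHz signal standing in for a loaded audio file.
signal = np.sin(2 * np.pi * 440.0 * np.arange(16000) / 16000).astype(np.float32)

# Equivalent of FilterBankFeatureTransform.__call__ with the default config:
# 80 mel bins, 20 ms frames, 10 ms shift; transposed to (num_mels, seq_length).
feature = torchaudio.compliance.kaldi.fbank(
    torch.Tensor(signal).unsqueeze(0),
    num_mel_bins=80,
    frame_length=20.0,
    frame_shift=10.0,
).transpose(0, 1).numpy()

print(feature.shape)  # roughly (80, 99) for one second of audio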
+ """ + try: + if audio_path.endswith('pcm'): + signal = np.memmap(audio_path, dtype='h', mode='r').astype('float32') + + if del_silence: + non_silence_indices = librosa.effects.split(signal, top_db=30) + signal = np.concatenate([signal[start:end] for start, end in non_silence_indices]) + + return signal / 32767 # normalize audio + + elif audio_path.endswith('wav') or audio_path.endswith('flac'): + signal, _ = librosa.load(audio_path, sr=sample_rate) + return signal + + except ValueError: + logger.debug('ValueError in {0}'.format(audio_path)) + return None + except RuntimeError: + logger.debug('RuntimeError in {0}'.format(audio_path)) + return None + except IOError: + logger.debug('IOError in {0}'.format(audio_path)) + return None diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/configuration.py new file mode 100644 index 000000000..61c04db7c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/configuration.py @@ -0,0 +1,79 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ....dataclass.configurations import OpenspeechDataclass + + +@dataclass +class MelSpectrogramConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.data.audio.MelSpectrogramFeatureTransform`. + + It is used to initiated an `MelSpectrogramFeatureTransform` feature transform. + + Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`. + + Args: + name (str): name of feature transform. (default: melspectrogram) + sample_rate (int): sampling rate of audio (default: 16000) + frame_length (float): frame length for spectrogram (default: 20.0) + frame_shift (float): length of hop between STFT (default: 10.0) + del_silence (bool): flag indication whether to apply delete silence or not (default: False) + num_mels (int): the number of mfc coefficients to retain. (default: 80) + apply_spec_augment (bool): flag indication whether to apply spec augment or not (default: True) + apply_noise_augment (bool): flag indication whether to apply noise augment or not (default: False) + apply_time_stretch_augment (bool): flag indication whether to apply time stretch augment or not (default: False) + apply_joining_augment (bool): flag indication whether to apply audio joining augment or not (default: False) + """ + name: str = field( + default="melspectrogram", metadata={"help": "Name of dataset."} + ) + sample_rate: int = field( + default=16000, metadata={"help": "Sampling rate of audio"} + ) + frame_length: float = field( + default=20.0, metadata={"help": "Frame length for spectrogram"} + ) + frame_shift: float = field( + default=10.0, metadata={"help": "Length of hop between STFT"} + ) + del_silence: bool = field( + default=False, metadata={"help": "Flag indication whether to apply delete silence or not"} + ) + num_mels: int = field( + default=80, metadata={"help": "The number of mfc coefficients to retain."} + ) + apply_spec_augment: bool = field( + default=True, metadata={"help": "Flag indication whether to apply spec augment or not"} + ) + apply_noise_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply noise augment or not"} + ) + apply_time_stretch_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply time stretch augment or not"} + ) + apply_joining_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply audio joining augment or not"} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/melspectrogram.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/melspectrogram.py new file mode 100644 index 000000000..219b9f40e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/melspectrogram/melspectrogram.py @@ -0,0 +1,72 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any 
person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import numpy as np + +from ... import register_audio_feature_transform +from ...audio.melspectrogram.configuration import MelSpectrogramConfigs +from ....utils import LIBROSA_IMPORT_ERROR + + +@register_audio_feature_transform("melspectrogram", dataclass=MelSpectrogramConfigs) +class MelSpectrogramFeatureTransform(object): + r""" + Create MelSpectrogram for a raw audio signal. This is a composition of Spectrogram and MelScale. + + Args: + configs (DictConfig): configuraion set + + Returns: + Tensor: A mel-spectrogram feature. The shape is ``(seq_length, num_mels)`` + """ + def __init__(self, configs) -> None: + super(MelSpectrogramFeatureTransform, self).__init__() + try: + import librosa + except ImportError: + raise ImportError(LIBROSA_IMPORT_ERROR) + self.sample_rate = configs.audio.sample_rate + self.num_mels = configs.audio.num_mels + self.n_fft = int(round(configs.audio.sample_rate * 0.001 * configs.audio.frame_length)) + self.hop_length = int(round(configs.audio.sample_rate * 0.001 * configs.audio.frame_shift)) + self.function = librosa.feature.melspectrogram + self.power_to_db = librosa.power_to_db + + def __call__(self, signal: np.ndarray) -> np.ndarray: + """ + Provides feature extraction + + Inputs: + signal (np.ndarray): audio signal + + Returns: + feature (np.ndarray): feature extract by sub-class + """ + melspectrogram = self.function( + y=signal, + sr=self.sample_rate, + n_mels=self.num_mels, + n_fft=self.n_fft, + hop_length=self.hop_length, + ) + melspectrogram = self.power_to_db(melspectrogram, ref=np.max) + return melspectrogram diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included 
in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/configuration.py new file mode 100644 index 000000000..d20882d8b --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/configuration.py @@ -0,0 +1,79 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ....dataclass.configurations import OpenspeechDataclass + + +@dataclass +class MFCCConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.data.audio.MFCCFeatureTransform`. + + It is used to initiated an `MFCCFeatureTransform` feature transform. + + Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`. + + Args: + name (str): name of feature transform. (default: mfcc) + sample_rate (int): sampling rate of audio (default: 16000) + frame_length (float): frame length for spectrogram (default: 20.0) + frame_shift (float): length of hop between STFT (default: 10.0) + del_silence (bool): flag indication whether to apply delete silence or not (default: False) + num_mels (int): the number of mfc coefficients to retain. 
(default: 40) + apply_spec_augment (bool): flag indication whether to apply spec augment or not (default: True) + apply_noise_augment (bool): flag indication whether to apply noise augment or not (default: False) + apply_time_stretch_augment (bool): flag indication whether to apply time stretch augment or not (default: False) + apply_joining_augment (bool): flag indication whether to apply audio joining augment or not (default: False) + """ + name: str = field( + default="mfcc", metadata={"help": "Name of dataset."} + ) + sample_rate: int = field( + default=16000, metadata={"help": "Sampling rate of audio"} + ) + frame_length: float = field( + default=20.0, metadata={"help": "Frame length for spectrogram"} + ) + frame_shift: float = field( + default=10.0, metadata={"help": "Length of hop between STFT"} + ) + del_silence: bool = field( + default=False, metadata={"help": "Flag indication whether to apply delete silence or not"} + ) + num_mels: int = field( + default=40, metadata={"help": "The number of mfc coefficients to retain."} + ) + apply_spec_augment: bool = field( + default=True, metadata={"help": "Flag indication whether to apply spec augment or not"} + ) + apply_noise_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply noise augment or not"} + ) + apply_time_stretch_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply time stretch augment or not"} + ) + apply_joining_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply audio joining augment or not"} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/mfcc.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/mfcc.py new file mode 100644 index 000000000..4cfd61003 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/mfcc/mfcc.py @@ -0,0 +1,77 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import numpy as np + +from ... import register_audio_feature_transform +from ...audio.mfcc.configuration import MFCCConfigs +from ....utils import LIBROSA_IMPORT_ERROR + + +@register_audio_feature_transform("mfcc", dataclass=MFCCConfigs) +class MFCCFeatureTransform(object): + r""" + Create the Mel-frequency cepstrum coefficients from an audio signal. + + By default, this calculates the MFCC on the DB-scaled Mel spectrogram. 
+ This is not the textbook implementation, but is implemented here to + give consistency with librosa. + + This output depends on the maximum value in the input spectrogram, and so + may return different values for an audio clip split into snippets vs. a + a full clip. + + Args: + configs (DictConfig): configuraion set + + Returns: + Tensor: A mfcc feature. The shape is ``(seq_length, num_mels)`` + """ + def __init__(self, configs) -> None: + super(MFCCFeatureTransform, self).__init__() + try: + import librosa + except ImportError: + raise ImportError(LIBROSA_IMPORT_ERROR) + self.sample_rate = configs.audio.sample_rate + self.num_mels = configs.audio.num_mels + self.n_fft = int(round(configs.audio.sample_rate * 0.001 * configs.audio.frame_length)) + self.hop_length = int(round(configs.audio.sample_rate * 0.001 * configs.audio.frame_shift)) + self.function = librosa.feature.mfcc + + def __call__(self, signal: np.ndarray) -> np.ndarray: + """ + Provides feature extraction + + Inputs: + signal (np.ndarray): audio signal + + Returns: + feature (np.ndarray): feature extract by sub-class + """ + return self.function( + y=signal, + sr=self.sample_rate, + n_mfcc=self.num_mels, + n_fft=self.n_fft, + hop_length=self.hop_length, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
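A quick usage sketch of the MFCC transform above (librosa assumed available; the signal is synthetic): with the default config it reduces to librosa.feature.mfcc with n_fft and hop_length derived from the 20 ms frame length and 10 ms frame shift.

import numpy as np
import librosa

sample_rate = 16000
# Synthetic one-second signal standing in for a decoded utterance.
signal = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate) / sample_rate).astype(np.float32)

# n_fft / hop_length derived from frame_length=20.0 ms and frame_shift=10.0 ms,
# exactly as in MFCCFeatureTransform.__init__.
n_fft = int(round(sample_rate * 0.001 * 20.0))       # 320 samples
hop_length = int(round(sample_rate * 0.001 * 10.0))  # 160 samples

mfcc = librosa.feature.mfcc(
    y=signal, sr=sample_rate, n_mfcc=40, n_fft=n_fft, hop_length=hop_length
)
print(mfcc.shape)  # (40, 101): (num_mels, seq_length) before any transpose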
\ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/configuration.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/configuration.py new file mode 100644 index 000000000..436c13e09 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/configuration.py @@ -0,0 +1,80 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from ....dataclass.configurations import OpenspeechDataclass + + +@dataclass +class SpectrogramConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.data.audio.SpectrogramTransform`. + + It is used to initiated an `SpectrogramTransform` feature transform. + + Configuration objects inherit from :class: `~openspeech.dataclass.OpenspeechDataclass`. + + Args: + name (str): name of feature transform. (default: spectrogram) + sample_rate (int): sampling rate of audio (default: 16000) + frame_length (float): frame length for spectrogram (default: 20.0) + frame_shift (float): length of hop between STFT (default: 10.0) + del_silence (bool): flag indication whether to apply delete silence or not (default: False) + num_mels (int): the number of mfc coefficients to retain. 
(default: 161) + apply_spec_augment (bool): flag indication whether to apply spec augment or not (default: True) + apply_noise_augment (bool): flag indication whether to apply noise augment or not (default: False) + apply_time_stretch_augment (bool): flag indication whether to apply time stretch augment or not (default: False) + apply_joining_augment (bool): flag indication whether to apply audio joining augment or not (default: False) + """ + name: str = field( + default="spectrogram", metadata={"help": "Name of dataset."} + ) + sample_rate: int = field( + default=16000, metadata={"help": "Sampling rate of audio"} + ) + frame_length: float = field( + default=20.0, metadata={"help": "Frame length for spectrogram"} + ) + frame_shift: float = field( + default=10.0, metadata={"help": "Length of hop between STFT"} + ) + del_silence: bool = field( + default=False, metadata={"help": "Flag indication whether to apply delete silence or not"} + ) + num_mels: int = field( + default=161, metadata={"help": "Spectrogram is independent of mel, but uses the 'num_mels' variable " + "to unify feature size variables "} + ) + apply_spec_augment: bool = field( + default=True, metadata={"help": "Flag indication whether to apply spec augment or not"} + ) + apply_noise_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply noise augment or not"} + ) + apply_time_stretch_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply time stretch augment or not"} + ) + apply_joining_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply audio joining augment or not"} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/spectrogram.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/spectrogram.py new file mode 100644 index 000000000..89525dc60 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/audio/spectrogram/spectrogram.py @@ -0,0 +1,65 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import numpy as np +import torch +from torch import Tensor + +from ... 
import register_audio_feature_transform +from ...audio.spectrogram.configuration import SpectrogramConfigs + + +@register_audio_feature_transform("spectrogram", dataclass=SpectrogramConfigs) +class SpectrogramFeatureTransform(object): + r""" + Create a spectrogram from a audio signal. + + Args: + configs (DictConfig): configuraion set + + Returns: + Tensor: A spectrogram feature. The shape is ``(seq_length, num_mels)`` + """ + def __init__(self, configs) -> None: + super(SpectrogramFeatureTransform, self).__init__() + self.n_fft = int(round(configs.audio.sample_rate * 0.001 * configs.audio.frame_length)) + self.hop_length = int(round(configs.audio.sample_rate * 0.001 * configs.audio.frame_shift)) + self.function = torch.stft + + def __call__(self, signal: np.ndarray) -> np.ndarray: + """ + Provides feature extraction + + Inputs: + signal (np.ndarray): audio signal + + Returns: + feature (np.ndarray): feature extract by sub-class + """ + spectrogram = self.function( + Tensor(signal), self.n_fft, hop_length=self.hop_length, + win_length=self.n_fft, window=torch.hamming_window(self.n_fft), + center=False, normalized=False, onesided=True + ) + spectrogram = (spectrogram[:, :, 0].pow(2) + spectrogram[:, :, 1].pow(2)).pow(0.5) + spectrogram = np.log1p(spectrogram.numpy()) + return spectrogram diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/sampler.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/sampler.py new file mode 100644 index 000000000..89c64d9d4 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/sampler.py @@ -0,0 +1,95 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import numpy as np +from torch.utils.data import Sampler + +from .audio.load import load_audio + + +class RandomSampler(Sampler): + r""" + Implementation of a Random Sampler for sampling the dataset. 
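A rough equivalent of the spectrogram transform above written against the current torch.stft API (return_complex=True plus .abs() replaces the manual real/imaginary power sum in the legacy code path); the input signal is synthetic:

import numpy as np
import torch

sample_rate = 16000
signal = np.sin(2 * np.pi * 440.0 * np.arange(sample_rate) / sample_rate).astype(np.float32)

n_fft = int(round(sample_rate * 0.001 * 20.0))       # 320
hop_length = int(round(sample_rate * 0.001 * 10.0))  # 160

# Magnitude spectrogram followed by log1p, mirroring SpectrogramFeatureTransform.
stft = torch.stft(
    torch.from_numpy(signal), n_fft, hop_length=hop_length,
    win_length=n_fft, window=torch.hamming_window(n_fft),
    center=False, normalized=False, onesided=True, return_complex=True,
)
spectrogram = np.log1p(stft.abs().numpy())
print(spectrogram.shape)  # (161, 99): n_fft // 2 + 1 frequency bins by frames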
+ + Args: + data_source (torch.utils.data.Dataset): dataset to sample from + batch_size (int): size of batch + drop_last (bool): flat indication whether to drop last batch or not + """ + def __init__(self, data_source, batch_size: int = 32, drop_last: bool = False) -> None: + super(RandomSampler, self).__init__(data_source) + self.batch_size = batch_size + self.data_source = data_source + ids = list(range(0, len(data_source))) + self.bins = [ids[i:i + batch_size] for i in range(0, len(ids), batch_size)] + self.drop_last = drop_last + + def __iter__(self): + for ids in self.bins: + np.random.shuffle(ids) + yield ids + + def __len__(self): + return len(self.bins) + + def shuffle(self, epoch): + np.random.shuffle(self.bins) + + +class SmartBatchingSampler(Sampler): + """ + Batching with similar sequence length. + + Args: + data_source (torch.utils.data.Dataset): dataset to sample from + batch_size (int): size of batch + drop_last (bool): flat indication whether to drop last batch or not + """ + def __init__(self, data_source, batch_size: int = 32, drop_last: bool = False) -> None: + super(SmartBatchingSampler, self).__init__(data_source) + self.batch_size = batch_size + self.data_source = data_source + + audio_lengths = [self._get_audio_length(audio_path) for audio_path in data_source.audio_paths] + audio_indices = [idx for idx in range(len(data_source.audio_paths))] + + pack_by_length = list(zip(audio_lengths, audio_indices)) + sort_by_length = sorted(pack_by_length) + audio_lengths, audio_indices = zip(*sort_by_length) + + self.bins = [audio_indices[i:i + batch_size] for i in range(0, len(audio_indices), batch_size)] + self.drop_last = drop_last + + def __iter__(self): + for ids in self.bins: + np.random.shuffle(list(ids)) + yield ids + + def _get_audio_length(self, audio_path): + return len(load_audio(os.path.join(self.data_source.dataset_path, audio_path), sample_rate=16000)) + + def __len__(self): + return len(self.bins) + + def shuffle(self, epoch): + np.random.shuffle(self.bins) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/text/data_loader.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/text/data_loader.py new file mode 100644 index 000000000..151af593e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/text/data_loader.py @@ -0,0 +1,87 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
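The core of SmartBatchingSampler above is "sort indices by audio length, then chunk into batches"; a minimal self-contained sketch with made-up lengths:

# Sort utterance indices by length so each batch holds similar-length audio,
# which keeps padding (and wasted compute) low. Lengths here are illustrative.
audio_lengths = [480, 120, 960, 150, 300, 720, 90, 610]
batch_size = 3

order = sorted(range(len(audio_lengths)), key=lambda i: audio_lengths[i])
bins = [order[i:i + batch_size] for i in range(0, len(order), batch_size)]

print(bins)  # [[6, 1, 3], [4, 0, 7], [5, 2]]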
+ +import torch +from torch.utils.data import DataLoader + + +def _collate_fn(batch, pad_id: int = 0): + r""" + Functions that pad to the maximum sequence length + + Args: + batch (tuple): tuple contains input and target tensors + pad_id (int): identification of pad token + + Returns: + inputs (torch.FloatTensor): tensor contains input sequences. + """ + def seq_length_(p): + return len(p[0]) + + batch = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True) + + seq_lengths = [len(s[0]) for s in batch] + + max_seq_sample = max(batch, key=seq_length_)[0] + max_seq_size = max_seq_sample.size(0) + + batch_size = len(batch) + + inputs = torch.zeros(batch_size, max_seq_size).fill_(pad_id).long() + targets = torch.zeros(batch_size, max_seq_size).fill_(pad_id).long() + + for x in range(batch_size): + sample = batch[x] + input_var = sample[0] + target = sample[1] + inputs[x].narrow(0, 0, len(input_var)).copy_(torch.LongTensor(input_var)) + targets[x].narrow(0, 0, len(target)).copy_(torch.LongTensor(target)) + + seq_lengths = torch.IntTensor(seq_lengths) + + return inputs, seq_lengths, targets + + +class TextDataLoader(DataLoader): + r""" + Text Data Loader + + Args: + dataset (torch.utils.data.Dataset): dataset from which to load the data. + num_workers (int): how many subprocesses to use for data loading. + batch_sampler (torch.utils.data.sampler.Sampler): defines the strategy to draw samples from the dataset. + """ + def __init__( + self, + dataset: torch.utils.data.Dataset, + num_workers: int, + batch_sampler: torch.utils.data.sampler.Sampler, + **kwargs, + ) -> None: + super(TextDataLoader, self).__init__( + dataset=dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + **kwargs, + ) + self.collate_fn = _collate_fn diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/data/text/dataset.py b/audio/speech_recognition/conformer/pytorch/openspeech/data/text/dataset.py new file mode 100644 index 000000000..b7e7f879f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/data/text/dataset.py @@ -0,0 +1,77 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import logging + +from torch.utils.data import Dataset + +logger = logging.getLogger(__name__) + + +class TextDataset(Dataset): + """ + Dataset for language modeling. 
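To make the padding logic of _collate_fn above concrete, here is a minimal sketch on two made-up token-id sequences with pad_id = 0 (it follows the same sort-then-copy pattern, not the exact function):

import torch

batch = [
    (torch.LongTensor([1, 11, 12, 13, 2]), torch.LongTensor([11, 12, 13, 2])),
    (torch.LongTensor([1, 21, 2]), torch.LongTensor([21, 2])),
]
pad_id = 0

# Sort by input length (longest first), then copy each sequence into a
# pad-filled (batch, max_len) tensor, as _collate_fn does.
batch = sorted(batch, key=lambda sample: sample[0].size(0), reverse=True)
seq_lengths = torch.IntTensor([s[0].size(0) for s in batch])
max_len = batch[0][0].size(0)

inputs = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
targets = torch.full((len(batch), max_len), pad_id, dtype=torch.long)
for i, (inp, tgt) in enumerate(batch):
    inputs[i, :inp.size(0)] = inp
    targets[i, :tgt.size(0)] = tgt

print(inputs)       # padded (2, 5) tensor
print(seq_lengths)  # tensor([5, 3], dtype=torch.int32)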
+ + Args: + transcripts (list): list of transcript + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + """ + def __init__(self, transcripts: list, tokenizer): + super(TextDataset, self).__init__() + self.transcripts = transcripts + self.tokenizer = tokenizer + self.sos_id = tokenizer.sos_id + self.eos_id = tokenizer.eos_id + + def _get_inputs(self, transcript): + tokens = transcript.split(' ') + transcript = [int(self.sos_id)] + + for token in tokens: + transcript.append(int(token)) + + return transcript + + def _get_targets(self, transcript): + tokens = transcript.split(' ') + transcript = list() + + for token in tokens: + transcript.append(int(token)) + + transcript.append(int(self.eos_id)) + + return transcript + + def __getitem__(self, idx): + transcript = self.tokenizer(self.transcripts[idx]) + inputs = torch.LongTensor(self._get_inputs(transcript)) + targets = torch.LongTensor(self._get_targets(transcript)) + return inputs, targets + + def __len__(self): + return len(self.transcripts) + + def count(self): + return len(self.transcripts) + diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/__init__.py new file mode 100644 index 000000000..4f160b274 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/__init__.py @@ -0,0 +1,87 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
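The input/target shift used by TextDataset above, sketched on a made-up tokenized transcript (sos = 1, eos = 2 are illustrative ids):

# Language-model style shift: inputs start with the sos id, targets end with
# the eos id, so position t of the inputs predicts position t of the targets.
sos_id, eos_id = 1, 2
transcript = "31 4 57 9"            # tokenizer output: space-separated token ids

tokens = [int(t) for t in transcript.split(' ')]
inputs = [sos_id] + tokens          # [1, 31, 4, 57, 9]
targets = tokens + [eos_id]         # [31, 4, 57, 9, 2]

print(inputs, targets)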
+ +from .configurations import ( + AugmentConfigs, + LibriSpeechConfigs, + KsponSpeechConfigs, + AIShellConfigs, + LMConfigs, + CPUTrainerConfigs, + GPUTrainerConfigs, + TPUTrainerConfigs, + Fp16GPUTrainerConfigs, + Fp16TPUTrainerConfigs, + Fp64CPUTrainerConfigs, + EvaluationConfigs, + EnsembleEvaluationConfigs, + CPUResumeTrainerConfigs, + GPUResumeTrainerConfigs, + TPUResumeTrainerConfigs, +) + +OPENSPEECH_TRAIN_CONFIGS = [ + "audio", + "augment", + "dataset", + "model", + "criterion", + "lr_scheduler", + "trainer", + "tokenizer", +] + +OPENSPEECH_LM_TRAIN_CONFIGS = [ + "dataset", + "model", + "criterion", + "lr_scheduler", + "trainer", + "tokenizer", +] + +DATASET_DATACLASS_REGISTRY = { + "kspon": KsponSpeechConfigs, + "libri": LibriSpeechConfigs, + "aishell": AIShellConfigs, + "ksponspeech": KsponSpeechConfigs, + "librispeech": LibriSpeechConfigs, + "lm": LMConfigs, +} +TRAINER_DATACLASS_REGISTRY = { + "cpu": CPUTrainerConfigs, + "gpu": GPUTrainerConfigs, + "tpu": TPUTrainerConfigs, + "gpu-fp16": Fp16GPUTrainerConfigs, + "tpu-fp16": Fp16TPUTrainerConfigs, + "cpu-fp64": Fp64CPUTrainerConfigs, + "cpu-resume": CPUResumeTrainerConfigs, + "gpu-resume": GPUResumeTrainerConfigs, + "tpu-resume": TPUResumeTrainerConfigs, +} +AUGMENT_DATACLASS_REGISTRY = { + "default": AugmentConfigs, +} +EVAL_DATACLASS_REGISTRY = { + "default": EvaluationConfigs, + "ensemble": EnsembleEvaluationConfigs, +} \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/configurations.py new file mode 100644 index 000000000..571197f4e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/configurations.py @@ -0,0 +1,481 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
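A sketch of the lookup pattern these *_DATACLASS_REGISTRY dicts support: resolve a user-chosen name to a config dataclass and instantiate it. The registry and dataclass below are stand-ins for illustration, not the real openspeech ones.

from dataclasses import dataclass, field

@dataclass
class DummyDatasetConfigs:
    dataset: str = field(default="dummy", metadata={"help": "Dataset name."})

# Stand-in for DATASET_DATACLASS_REGISTRY, which maps names such as
# "librispeech" to their config dataclasses.
REGISTRY = {"dummy": DummyDatasetConfigs}

choice = "dummy"
configs = REGISTRY[choice]()
print(configs.dataset)  # "dummy"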
+ +from dataclasses import dataclass, _MISSING_TYPE, field +from typing import List, Optional, Any + +MISSING = '' + + +@dataclass +class OpenspeechDataclass: + """ OpenSpeech base dataclass that supported fetching attributes and metas """ + + def _get_all_attributes(self) -> List[str]: + return [k for k in self.__dataclass_fields__.keys()] + + def _get_meta(self, attribute_name: str, meta: str, default: Optional[Any] = None) -> Any: + return self.__dataclass_fields__[attribute_name].metadata.get(meta, default) + + def _get_name(self, attribute_name: str) -> str: + return self.__dataclass_fields__[attribute_name].name + + def _get_default(self, attribute_name: str) -> Any: + if hasattr(self, attribute_name): + if str(getattr(self, attribute_name)).startswith("${"): + return str(getattr(self, attribute_name)) + elif str(self.__dataclass_fields__[attribute_name].default).startswith("${"): + return str(self.__dataclass_fields__[attribute_name].default) + elif getattr(self, attribute_name) != self.__dataclass_fields__[attribute_name].default: + return getattr(self, attribute_name) + + f = self.__dataclass_fields__[attribute_name] + if not isinstance(f.default_factory, _MISSING_TYPE): + return f.default_factory() + return f.default + + def _get_type(self, attribute_name: str) -> Any: + return self.__dataclass_fields__[attribute_name].type + + def _get_help(self, attribute_name: str) -> Any: + return self._get_meta(attribute_name, "help") + + +@dataclass +class LibriSpeechConfigs(OpenspeechDataclass): + """ Configuration dataclass that common used """ + dataset: str = field( + default="librispeech", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} + ) + dataset_path: str = field( + default=MISSING, metadata={"help": "Path of dataset"} + ) + dataset_download: bool = field( + default=True, metadata={"help": "Flag indication whether to download dataset or not."} + ) + manifest_file_path: str = field( + default="../../../LibriSpeech/libri_subword_manifest.txt", metadata={"help": "Path of manifest file"} + ) + + +@dataclass +class KsponSpeechConfigs(OpenspeechDataclass): + """ Configuration dataclass that common used """ + dataset: str = field( + default="ksponspeech", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} + ) + dataset_path: str = field( + default=MISSING, metadata={"help": "Path of dataset"} + ) + test_dataset_path: str = field( + default=MISSING, metadata={"help": "Path of evaluation dataset"} + ) + manifest_file_path: str = field( + default="../../../ksponspeech_manifest.txt", metadata={"help": "Path of manifest file"} + ) + test_manifest_dir: str = field( + default="../../../KsponSpeech_scripts", metadata={"help": "Path of directory contains test manifest files"} + ) + preprocess_mode: str = field( + default="phonetic", metadata={"help": "KsponSpeech preprocess mode {phonetic, spelling}"}, + ) + + +@dataclass +class AIShellConfigs(OpenspeechDataclass): + """ Configuration dataclass that common used """ + dataset: str = field( + default="aishell", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} + ) + dataset_path: str = field( + default=MISSING, metadata={"help": "Path of dataset"} + ) + dataset_download: bool = field( + default=True, metadata={"help": "Flag indication whether to download dataset or not."} + ) + manifest_file_path: str = field( + default="../../../data_aishell/aishell_manifest.txt", metadata={"help": "Path of manifest file"} + ) + + +@dataclass 
+class LMConfigs(OpenspeechDataclass): + dataset: str = field( + default="lm", metadata={"help": "Select dataset for training (librispeech, ksponspeech, aishell, lm)"} + ) + dataset_path: str = field( + default=MISSING, metadata={"help": "Path of dataset"} + ) + valid_ratio: float = field( + default=0.05, metadata={"help": "Ratio of validation data"} + ) + test_ratio: float = field( + default=0.05, metadata={"help": "Ratio of test data"} + ) + + +@dataclass +class AugmentConfigs(OpenspeechDataclass): + apply_spec_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply spec augment or not"} + ) + apply_noise_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply noise augment or not " + "Noise augment requires `noise_dataset_path`. " + "`noise_dataset_dir` should be contain audio files."} + ) + apply_joining_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply joining augment or not " + "If true, create a new audio file by connecting two audio randomly"} + ) + apply_time_stretch_augment: bool = field( + default=False, metadata={"help": "Flag indication whether to apply spec augment or not"} + ) + freq_mask_para: int = field( + default=27, metadata={"help": "Hyper Parameter for freq masking to limit freq masking length"} + ) + freq_mask_num: int = field( + default=2, metadata={"help": "How many freq-masked area to make"} + ) + time_mask_num: int = field( + default=4, metadata={"help": "How many time-masked area to make"} + ) + noise_dataset_dir: str = field( + default='None', metadata={"help": "How many time-masked area to make"} + ) + noise_level: float = field( + default=0.7, metadata={"help": "Noise adjustment level"} + ) + time_stretch_min_rate: float = field( + default=0.7, metadata={"help": "Minimum rate of audio time stretch"} + ) + time_stretch_max_rate: float = field( + default=1.4, metadata={"help": "Maximum rate of audio time stretch"} + ) + + +@dataclass +class BaseTrainerConfigs(OpenspeechDataclass): + """ Base trainer dataclass """ + seed: int = field( + default=1, metadata={"help": "Seed for training."} + ) + accelerator: str = field( + default="dp", metadata={"help": "Previously known as distributed_backend (dp, ddp, ddp2, etc…)."} + ) + accumulate_grad_batches: int = field( + default=1, metadata={"help": "Accumulates grads every k batches or as set up in the dict."} + ) + num_workers: int = field( + default=4, metadata={"help": "The number of cpu cores"} + ) + batch_size: int = field( + default=32, metadata={"help": "Size of batch"} + ) + check_val_every_n_epoch: int = field( + default=1, metadata={"help": "Check val every n train epochs."} + ) + gradient_clip_val: float = field( + default=5.0, metadata={"help": "0 means don’t clip."} + ) + logger: str = field( + default="wandb", metadata={"help": "Training logger. {wandb, tensorboard}"} + ) + max_epochs: int = field( + default=20, metadata={"help": "Stop training once this number of epochs is reached."} + ) + save_checkpoint_n_steps: int = field( + default=10000, metadata={"help": "Save a checkpoint every N steps."} + ) + auto_scale_batch_size: str = field( + default="binsearch", metadata={"help": "If set to True, will initially run a batch size finder trying to find " + "the largest batch size that fits into memory."} + ) + sampler: str = field( + default="smart", metadata={"help": "smart: batching with similar sequence length." 
+ "else: random batch"} + ) + + +@dataclass +class CPUResumeTrainerConfigs(BaseTrainerConfigs): + name: str = field( + default="cpu-resume", metadata={"help": "Trainer name"} + ) + checkpoint_path: str = field( + default=MISSING, metadata={"help": "Path of model checkpoint."} + ) + device: str = field( + default="cpu", metadata={"help": "Training device."} + ) + use_cuda: bool = field( + default=False, metadata={"help": "If set True, will train with GPU"} + ) + + +@dataclass +class GPUResumeTrainerConfigs(BaseTrainerConfigs): + name: str = field( + default="gpu-resume", metadata={"help": "Trainer name"} + ) + checkpoint_path: str = field( + default=MISSING, metadata={"help": "Path of model checkpoint."} + ) + device: str = field( + default="gpu", metadata={"help": "Training device."} + ) + use_cuda: bool = field( + default=True, metadata={"help": "If set True, will train with GPU"} + ) + auto_select_gpus: bool = field( + default=True, metadata={"help": "If enabled and gpus is an integer, pick available gpus automatically."} + ) + + +@dataclass +class TPUResumeTrainerConfigs(BaseTrainerConfigs): + name: str = field( + default="tpu-resume", metadata={"help": "Trainer name"} + ) + checkpoint_path: str = field( + default=MISSING, metadata={"help": "Path of model checkpoint."} + ) + device: str = field( + default="tpu", metadata={"help": "Training device."} + ) + use_cuda: bool = field( + default=False, metadata={"help": "If set True, will train with GPU"} + ) + use_tpu: bool = field( + default=True, metadata={"help": "If set True, will train with GPU"} + ) + tpu_cores: int = field( + default=8, metadata={"help": "Number of TPU cores"} + ) + + +@dataclass +class CPUTrainerConfigs(BaseTrainerConfigs): + name: str = field( + default="cpu", metadata={"help": "Trainer name"} + ) + device: str = field( + default="cpu", metadata={"help": "Training device."} + ) + use_cuda: bool = field( + default=False, metadata={"help": "If set True, will train with GPU"} + ) + + +@dataclass +class GPUTrainerConfigs(BaseTrainerConfigs): + """ GPU trainer dataclass """ + name: str = field( + default="gpu", metadata={"help": "Trainer name"} + ) + device: str = field( + default="gpu", metadata={"help": "Training device."} + ) + use_cuda: bool = field( + default=True, metadata={"help": "If set True, will train with GPU"} + ) + auto_select_gpus: bool = field( + default=True, metadata={"help": "If enabled and gpus is an integer, pick available gpus automatically."} + ) + + +@dataclass +class TPUTrainerConfigs(BaseTrainerConfigs): + name: str = field( + default="tpu", metadata={"help": "Trainer name"} + ) + device: str = field( + default="tpu", metadata={"help": "Training device."} + ) + use_cuda: bool = field( + default=False, metadata={"help": "If set True, will train with GPU"} + ) + use_tpu: bool = field( + default=True, metadata={"help": "If set True, will train with GPU"} + ) + tpu_cores: int = field( + default=8, metadata={"help": "Number of TPU cores"} + ) + + +@dataclass +class Fp16GPUTrainerConfigs(GPUTrainerConfigs): + name: str = field( + default="gpu-fp16", metadata={"help": "Trainer name"} + ) + precision: int = field( + default=16, metadata={"help": "Double precision (64), full precision (32) or half precision (16). 
" + "Can be used on CPU, GPU or TPUs."} + ) + amp_backend: str = field( + default="apex", metadata={"help": "The mixed precision backend to use (“native” or “apex”)"} + ) + + +@dataclass +class Fp16TPUTrainerConfigs(TPUTrainerConfigs): + name: str = field( + default="tpu-fp16", metadata={"help": "Trainer name"} + ) + precision: int = field( + default=16, metadata={"help": "Double precision (64), full precision (32) or half precision (16). " + "Can be used on CPU, GPU or TPUs."} + ) + amp_backend: str = field( + default="apex", metadata={"help": "The mixed precision backend to use (“native” or “apex”)"} + ) + + +@dataclass +class Fp64CPUTrainerConfigs(CPUTrainerConfigs): + name: str = field( + default="cpu-fp64", metadata={"help": "Trainer name"} + ) + precision: int = field( + default=64, metadata={"help": "Double precision (64), full precision (32) or half precision (16). " + "Can be used on CPU, GPU or TPUs."} + ) + amp_backend: str = field( + default="apex", metadata={"help": "The mixed precision backend to use (“native” or “apex”)"} + ) + + +@dataclass +class LearningRateSchedulerConfigs(OpenspeechDataclass): + """ Super class of learning rate dataclass """ + lr: float = field( + default=1e-04, metadata={"help": "Learning rate"} + ) + + +@dataclass +class TokenizerConfigs(OpenspeechDataclass): + """ Super class of tokenizer dataclass """ + sos_token: str = field( + default="", metadata={"help": "Start of sentence token"} + ) + eos_token: str = field( + default="", metadata={"help": "End of sentence token"} + ) + pad_token: str = field( + default="", metadata={"help": "Pad token"} + ) + blank_token: str = field( + default="", metadata={"help": "Blank token (for CTC training)"} + ) + encoding: str = field( + default="utf-8", metadata={"help": "Encoding of vocab"} + ) + + +@dataclass +class EvaluationConfigs(OpenspeechDataclass): + model_name: str = field( + default=MISSING, metadata={"help": "Model name."} + ) + dataset_path: str = field( + default=MISSING, metadata={"help": "Path of dataset."} + ) + checkpoint_path: str = field( + default=MISSING, metadata={"help": "Path of model checkpoint."} + ) + manifest_file_path: str = field( + default=MISSING, metadata={"help": "Path of evaluation manifest file."} + ) + num_workers: int = field( + default=4, metadata={"help": "Number of worker."} + ) + batch_size: int = field( + default=32, metadata={"help": "Batch size."} + ) + beam_size: int = field( + default=1, metadata={"help": "Beam size of beam search."} + ) + + +@dataclass +class EnsembleEvaluationConfigs(OpenspeechDataclass): + model_names: str = field( + default=MISSING, metadata={"help": "List of model name."} + ) + dataset_paths: str = field( + default=MISSING, metadata={"help": "Path of dataset."} + ) + checkpoint_paths: str = field( + default=MISSING, metadata={"help": "List of model checkpoint path."} + ) + manifest_file_path: str = field( + default=MISSING, metadata={"help": "Path of evaluation manifest file."} + ) + ensemble_method: str = field( + default="vanilla", metadata={"help": "Method of ensemble (vanilla, weighted)"} + ) + ensemble_weights: str = field( + default="(1.0, 1.0, 1.0 ..)", metadata={"help": "Weights of ensemble models."} + ) + num_workers: int = field( + default=4, metadata={"help": "Number of worker."} + ) + batch_size: int = field( + default=32, metadata={"help": "Batch size."} + ) + beam_size: int = field( + default=1, metadata={"help": "Beam size of beam search."} + ) + + +def generate_openspeech_configs_with_help(): + from openspeech.dataclass import 
OPENSPEECH_TRAIN_CONFIGS, TRAINER_DATACLASS_REGISTRY, AUGMENT_DATACLASS_REGISTRY, \ + DATASET_DATACLASS_REGISTRY + from openspeech.models import MODEL_DATACLASS_REGISTRY + from openspeech.criterion import CRITERION_DATACLASS_REGISTRY + from openspeech.data import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY + from openspeech.optim.scheduler import SCHEDULER_DATACLASS_REGISTRY + from openspeech.tokenizers import TOKENIZER_DATACLASS_REGISTRY + + registries = { + "audio": AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY, + "augment": AUGMENT_DATACLASS_REGISTRY, + "trainer": TRAINER_DATACLASS_REGISTRY, + "model": MODEL_DATACLASS_REGISTRY, + "criterion": CRITERION_DATACLASS_REGISTRY, + "dataset": DATASET_DATACLASS_REGISTRY, + "lr_scheduler": SCHEDULER_DATACLASS_REGISTRY, + "tokenizer": TOKENIZER_DATACLASS_REGISTRY, + } + + with open("configuration.md", "w") as f: + for group in OPENSPEECH_TRAIN_CONFIGS: + dataclass_registry = registries[group] + + f.write(f"## `{group}`\n") + + for k, v in dataclass_registry.items(): + f.write(f"### `{k}` \n") + v = v() + for kv in v.__dataclass_fields__: + f.write(f"- `{kv}` : {v._get_help(kv)}\n") diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/initialize.py b/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/initialize.py new file mode 100644 index 000000000..f3424ebc8 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/dataclass/initialize.py @@ -0,0 +1,96 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
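The configuration.md generator above only requires that every config be a dataclass whose fields carry a "help" entry in their metadata. Below is a minimal, self-contained sketch of that pattern; the `_get_help` here is a stand-in that simply reads `metadata["help"]` and may differ from the real `OpenspeechDataclass` helper.

from dataclasses import dataclass, field, fields

@dataclass
class DemoConfig:
    # Toy config: each field stores its documentation in metadata["help"].
    lr: float = field(default=1e-4, metadata={"help": "Learning rate"})
    batch_size: int = field(default=32, metadata={"help": "Size of batch"})

    def _get_help(self, name: str) -> str:
        # Stand-in for OpenspeechDataclass._get_help: read the help metadata.
        return self.__dataclass_fields__[name].metadata.get("help", "")

if __name__ == "__main__":
    cfg = DemoConfig()
    # Mirrors the configuration.md generation loop, printed to stdout instead.
    for f in fields(cfg):
        print(f"- `{f.name}` : {cfg._get_help(f.name)}")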
+ +from hydra.core.config_store import ConfigStore + + +def hydra_train_init() -> None: + r""" initialize ConfigStore for hydra-train """ + from openspeech.dataclass import OPENSPEECH_TRAIN_CONFIGS, DATASET_DATACLASS_REGISTRY, TRAINER_DATACLASS_REGISTRY + from openspeech.models import MODEL_DATACLASS_REGISTRY + from openspeech.criterion import CRITERION_DATACLASS_REGISTRY + from openspeech.data import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY + from openspeech.optim.scheduler import SCHEDULER_DATACLASS_REGISTRY + from openspeech.tokenizers import TOKENIZER_DATACLASS_REGISTRY + from openspeech.dataclass import AUGMENT_DATACLASS_REGISTRY + + registries = { + "audio": AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY, + "augment": AUGMENT_DATACLASS_REGISTRY, + "dataset": DATASET_DATACLASS_REGISTRY, + "trainer": TRAINER_DATACLASS_REGISTRY, + "model": MODEL_DATACLASS_REGISTRY, + "criterion": CRITERION_DATACLASS_REGISTRY, + "lr_scheduler": SCHEDULER_DATACLASS_REGISTRY, + "tokenizer": TOKENIZER_DATACLASS_REGISTRY, + } + + cs = ConfigStore.instance() + + for group in OPENSPEECH_TRAIN_CONFIGS: + dataclass_registry = registries[group] + + for k, v in dataclass_registry.items(): + cs.store(group=group, name=k, node=v, provider="openspeech") + + +def hydra_lm_train_init() -> None: + from openspeech.dataclass import OPENSPEECH_LM_TRAIN_CONFIGS, DATASET_DATACLASS_REGISTRY, TRAINER_DATACLASS_REGISTRY + from openspeech.models import MODEL_DATACLASS_REGISTRY + from openspeech.criterion import CRITERION_DATACLASS_REGISTRY + from openspeech.optim.scheduler import SCHEDULER_DATACLASS_REGISTRY + from openspeech.tokenizers import TOKENIZER_DATACLASS_REGISTRY + + registries = { + "dataset": DATASET_DATACLASS_REGISTRY, + "trainer": TRAINER_DATACLASS_REGISTRY, + "model": MODEL_DATACLASS_REGISTRY, + "criterion": CRITERION_DATACLASS_REGISTRY, + "lr_scheduler": SCHEDULER_DATACLASS_REGISTRY, + "tokenizer": TOKENIZER_DATACLASS_REGISTRY, + } + + cs = ConfigStore.instance() + + for group in OPENSPEECH_LM_TRAIN_CONFIGS: + dataclass_registry = registries[group] + + for k, v in dataclass_registry.items(): + cs.store(group=group, name=k, node=v, provider="openspeech") + + +def hydra_eval_init() -> None: + from openspeech.data import AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY + from openspeech.dataclass import EVAL_DATACLASS_REGISTRY + + registries = { + "audio": AUDIO_FEATURE_TRANSFORM_DATACLASS_REGISTRY, + "eval": EVAL_DATACLASS_REGISTRY, + } + + cs = ConfigStore.instance() + + for group in registries.keys(): + dataclass_registry = registries[group] + + for k, v in dataclass_registry.items(): + cs.store(group=group, name=k, node=v, provider="openspeech") \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/__init__.py new file mode 100644 index 000000000..52d38c845 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/__init__.py @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: 
+# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import importlib + +DATA_MODULE_REGISTRY = dict() + + +def register_data_module(name: str): + """ + New data module types can be added to OpenSpeech with the :func:`register_data_module` function decorator. + + For example:: + @register_data_module('ksponspeech') + class LightningKsponSpeechDataModule: + (...) + + .. note:: All vocabs must implement the :class:`cls.__name__` interface. + + Args: + name (str): the name of the vocab + """ + + def register_data_module_cls(cls): + if name in DATA_MODULE_REGISTRY: + raise ValueError(f"Cannot register duplicate data module ({name})") + DATA_MODULE_REGISTRY[name] = cls + return cls + + return register_data_module_cls + + +data_module_dir = os.path.dirname(__file__) +for file in os.listdir(data_module_dir): + if os.path.isdir(os.path.join(data_module_dir, file)) and file != '__pycache__': + for subfile in os.listdir(os.path.join(data_module_dir, file)): + path = os.path.join(data_module_dir, file, subfile) + if subfile.endswith(".py"): + data_module_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile + module = importlib.import_module(f"openspeech.datasets.{file}.{data_module_name}") diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/__init__.py new file mode 100644 index 000000000..9a083c67f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
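The registry/decorator pattern behind `register_data_module` above can be exercised in isolation. The sketch below uses a hypothetical `DummyDataModule` rather than a real OpenSpeech data module, but the registration and lookup flow is the same.

# Toy version of the decorator-based registry used by the datasets package.
DATA_MODULE_REGISTRY = {}

def register_data_module(name):
    def wrapper(cls):
        if name in DATA_MODULE_REGISTRY:
            raise ValueError(f"Cannot register duplicate data module ({name})")
        DATA_MODULE_REGISTRY[name] = cls
        return cls
    return wrapper

@register_data_module("dummy")
class DummyDataModule:
    def __init__(self, configs):
        self.configs = configs

# Consumers resolve the class by its config name, then instantiate it.
module_cls = DATA_MODULE_REGISTRY["dummy"]
dm = module_cls(configs={"dataset": "dummy"})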
\ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/lit_data_module.py b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/lit_data_module.py new file mode 100644 index 000000000..396cd964a --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/lit_data_module.py @@ -0,0 +1,169 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import logging +from typing import Tuple, Optional + +from openspeech.data.audio.dataset import SpeechToTextDataset +from openspeech.datasets import register_data_module +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_data_module('librispeech') +class LibriSpeechDataModule(object): + """ + Data Module for LibriSpeech Dataset. LibriSpeech is a corpus of approximately 1000 hours of read + English speech with sampling rate of 16 kHz, prepared by Vassil Panayotov with the assistance of Daniel Povey. + The data is derived from read audiobooks from the LibriVox project, and has been carefully segmented and aligned. + + Args: + configs (DictConfig): configuraion set + """ + #LIBRISPEECH_TRAIN_NUM = 281241 + #LIBRISPEECH_VALID_NUM = 5567 + #LIBRISPEECH_TEST_NUM = 5559 + LIBRISPEECH_PARTS = [ + 'dev-clean', + 'test-clean', + 'dev-other', + 'test-other', + 'train-clean-100', + 'train-clean-360', + 'train-other-500', + ] + + def __init__(self, configs) -> None: + super().__init__() + self.configs = configs + self.dataset = dict() + self.logger = logging.getLogger(__name__) + + def _parse_manifest_file(self, manifest_file_path: str) -> Tuple[list, list]: + """ Parsing manifest file """ + audio_paths = list() + transcripts = list() + + with open(manifest_file_path, encoding='utf-8') as f: + for idx, line in enumerate(f.readlines()): + audio_path, _, transcript = line.split('\t') + transcript = transcript.replace('\n', '') + + audio_paths.append(audio_path) + transcripts.append(transcript) + + return audio_paths, transcripts + + def prepare_data(self) -> Tokenizer: + """ + Prepare librispeech data + + Returns: + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. 
+ """ + if self.configs.tokenizer.unit == 'libri_subword': + from openspeech.datasets.librispeech.preprocess.subword import generate_manifest_files + elif self.configs.tokenizer.unit == 'libri_character': + from openspeech.datasets.librispeech.preprocess.character import generate_manifest_files + else: + raise ValueError(f"Unsupported vocabulary unit: {self.configs.tokenizer.unit}") + + if self.configs.dataset.dataset_download: + self._download_dataset() + + if not os.path.exists(self.configs.dataset.train_manifest_file): + self.logger.info("Manifest file is not exists !!\n" + "Generate manifest files..") + + if hasattr(self.configs.tokenizer, "vocab_size"): + generate_manifest_files( + dataset_path=self.configs.dataset.dataset_path, + manifest_file_path=self.configs.dataset.train_manifest_file, + vocab_path=self.configs.tokenizer.vocab_path, + vocab_size=self.configs.tokenizer.vocab_size, + librispeech_parts=self.configs.dataset.train_parts + ) + else: + generate_manifest_files( + dataset_path=self.configs.dataset.dataset_path, + manifest_file_path=self.configs.dataset.train_manifest_file, + vocab_path=self.configs.tokenizer.vocab_path, + librispeech_parts=self.configs.dataset.train_parts + ) + + if not os.path.exists(self.configs.dataset.eval_manifest_file): + self.logger.info("Manifest file is not exists !!\n" + "Generate manifest files..") + + if hasattr(self.configs.tokenizer, "vocab_size"): + generate_manifest_files( + dataset_path=self.configs.dataset.dataset_path, + manifest_file_path=self.configs.dataset.eval_manifest_file, + vocab_path=self.configs.tokenizer.vocab_path, + vocab_size=self.configs.tokenizer.vocab_size, + librispeech_parts=self.configs.dataset.eval_parts + ) + else: + generate_manifest_files( + dataset_path=self.configs.dataset.dataset_path, + manifest_file_path=self.configs.dataset.eval_manifest_file, + vocab_path=self.configs.tokenizer.vocab_path, + librispeech_parts=self.configs.dataset.eval_parts + ) + + def setup( + self, + stage: Optional[str] = None, + tokenizer: Tokenizer = None, + num_train_samples: int = None, + num_eval_samples: int = None + ) -> None: + train_audio_paths, train_transcripts = self._parse_manifest_file( + self.configs.dataset.train_manifest_file) + eval_audio_paths, eval_transcripts = self._parse_manifest_file( + self.configs.dataset.eval_manifest_file) + + if num_train_samples is None: + num_train_samples = len(train_audio_paths) + if num_eval_samples is None: + num_eval_samples = len(eval_audio_paths) + + audio_paths = { + "train": train_audio_paths[:num_train_samples], + "val": eval_audio_paths[:num_eval_samples] + } + transcripts = { + "train": train_transcripts[:num_train_samples], + "val": eval_transcripts[:num_eval_samples] + } + + for stage in audio_paths.keys(): + self.dataset[stage] = SpeechToTextDataset( + configs=self.configs, + dataset_path=self.configs.dataset.dataset_path, + audio_paths=audio_paths[stage], + transcripts=transcripts[stage], + sos_id=tokenizer.sos_id, + eos_id=tokenizer.eos_id, + apply_spec_augment=self.configs.audio.apply_spec_augment if stage == 'train' else False, + del_silence=self.configs.audio.del_silence if stage == 'train' else False, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/character.py 
b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/character.py new file mode 100644 index 000000000..3313188c6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/character.py @@ -0,0 +1,116 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import logging +import pandas as pd +import shutil +from typing import Tuple + +from openspeech.datasets.librispeech.preprocess.preprocess import collect_transcripts + + +logger = logging.getLogger(__name__) + + +def _generate_character_labels(labels_dest): + logger.info('create_char_labels started..') + + tokens = ' ABCDEFGHIJKLMNOPQRSTUVWXYZ' + tokens = list(tokens) + + special_tokens = ['', '', '', ''] + tokens.extend(special_tokens) + + # sort together Using zip + label = { + 'id': [x for x in range(len(tokens))], + 'char': tokens, + } + + label_df = pd.DataFrame(label) + label_df.to_csv(labels_dest, encoding="utf-8", index=False) + + +def _load_label(filepath): + char2id = dict() + id2char = dict() + + ch_labels = pd.read_csv(filepath, encoding="utf-8") + + id_list = ch_labels["id"] + char_list = ch_labels["char"] + + for (id_, char) in zip(id_list, char_list): + char2id[char] = id_ + id2char[id_] = char + return char2id, id2char + + +def sentence_to_target(sentence, char2id): + target = str() + + for ch in sentence: + try: + target += (str(char2id[ch]) + ' ') + except KeyError: + continue + + return target[:-1] + + +def generate_manifest_files( + dataset_path: str, + manifest_file_path: str, + vocab_path: str, + librispeech_parts: list = [ + 'train-clean-100', + 'train-clean-360', + 'train-other-500', + 'dev-clean', + 'dev-other', + 'test-clean', + 'test-other'] +) -> None: + """ + Generate manifest files. 
+ Format: {audio_path}\t{transcript}\t{numerical_label} + + Args: + vocab_size (int): size of subword vocab + + Returns: + None + """ + _generate_character_labels(vocab_path) + char2id, id2char = _load_label(vocab_path) + + transcripts_collection = collect_transcripts(dataset_path, librispeech_parts) + + with open(manifest_file_path, 'w') as f: + for idx, part in enumerate(librispeech_parts): + for transcript in transcripts_collection[idx]: + audio_path, transcript = transcript.split('|') + label = sentence_to_target(transcript, char2id) + f.write(f"{audio_path}\t{transcript}\t{label}\n") + + return diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/preprocess.py b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/preprocess.py new file mode 100644 index 000000000..df4d96383 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/preprocess.py @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
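To make the manifest format above concrete, here is a small worked example of `sentence_to_target` with a toy character vocabulary; the audio path and vocabulary are illustrative only.

# Tiny char2id map and one manifest line in the
# "{audio_path}\t{transcript}\t{numerical_label}" format produced above.
char2id = {' ': 0, 'A': 1, 'B': 2, 'C': 3}

def sentence_to_target(sentence, char2id):
    # Characters missing from the vocab are silently skipped, as in the code above.
    return ' '.join(str(char2id[ch]) for ch in sentence if ch in char2id)

audio_path = "train-clean-100/19/198/19-198-0001.flac"   # hypothetical path
transcript = "CAB"
label = sentence_to_target(transcript, char2id)           # "3 1 2"
line = f"{audio_path}\t{transcript}\t{label}\n"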
+ +import os + + +def collect_transcripts( + dataset_path, + librispeech_parts: list = [ + 'train-clean-100', + 'train-clean-360', + 'train-other-500', + 'dev-clean', + 'dev-other', + 'test-clean', + 'test-other'] +): + """ Collect librispeech transcripts """ + transcripts_collection = list() + + for dataset in librispeech_parts: + dataset_transcripts = list() + + for subfolder1 in os.listdir(os.path.join(dataset_path, dataset)): + for subfolder2 in os.listdir(os.path.join(dataset_path, dataset, subfolder1)): + for file in os.listdir(os.path.join(dataset_path, dataset, subfolder1, subfolder2)): + if file.endswith('txt'): + with open(os.path.join(dataset_path, dataset, subfolder1, subfolder2, file)) as f: + for line in f.readlines(): + tokens = line.split() + audio_path = os.path.join(dataset, subfolder1, subfolder2, tokens[0]) + audio_path = f"{audio_path}.flac" + transcript = " ".join(tokens[1:]) + dataset_transcripts.append('%s|%s' % (audio_path, transcript)) + + else: + continue + + transcripts_collection.append(dataset_transcripts) + + return transcripts_collection diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/subword.py b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/subword.py new file mode 100644 index 000000000..27c99098d --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/datasets/librispeech/preprocess/subword.py @@ -0,0 +1,92 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
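The directory walk above assumes the standard LibriSpeech layout `{part}/{speaker}/{chapter}/` with one transcript text file per chapter, where each line reads `<utterance-id> <TRANSCRIPT>`. A sketch of how one such line becomes an `audio_path|transcript` entry (the utterance id below is made up):

import os

# One transcript line from a LibriSpeech *.trans.txt file.
line = "19-198-0001 CHAPTER ONE MISSUS RACHEL LYNDE IS SURPRISED"
part, speaker, chapter = "train-clean-100", "19", "198"

tokens = line.split()
audio_path = os.path.join(part, speaker, chapter, tokens[0]) + ".flac"
transcript = " ".join(tokens[1:])
entry = f"{audio_path}|{transcript}"   # format consumed by generate_manifest_files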
+ +import os +import sentencepiece as spm +import shutil +from typing import Tuple + +from openspeech.datasets.librispeech.preprocess.preprocess import collect_transcripts + +SENTENCEPIECE_MODEL_NAME = "sp" + + +def _prepare_tokenizer(train_transcripts, vocab_size): + """ Prepare sentencepice tokenizer """ + input_file = 'spm_input.txt' + model_type = 'unigram' + + with open(input_file, 'w') as f: + for transcript in train_transcripts: + f.write(f"{transcript.split('|')[-1]}\n") + + spm.SentencePieceTrainer.Train(f"--input={input_file} " + f"--model_prefix={SENTENCEPIECE_MODEL_NAME} " + f"--vocab_size={vocab_size} " + f"--model_type={model_type} " + f"--pad_id=0 " + f"--bos_id=1 " + f"--eos_id=2 " + f"--unk_id=3 " + f"--user_defined_symbols=") + + +def generate_manifest_files( + dataset_path: str, + manifest_file_path: str, + vocab_path: str, + vocab_size: int, + librispeech_parts: list = [ + 'train-clean-100', + 'train-clean-360', + 'train-other-500', + 'dev-clean', + 'dev-other', + 'test-clean', + 'test-other'] +) -> None: + """ + Generate manifest files. + Format: {audio_path}\t{transcript}\t{numerical_label} + + Args: + vocab_size (int): size of subword vocab + + Returns: + None + """ + transcripts_collection = collect_transcripts(dataset_path, librispeech_parts) + _prepare_tokenizer(transcripts_collection[0], vocab_size) + + shutil.copy(f"{SENTENCEPIECE_MODEL_NAME}.model", os.path.join(vocab_path, f"{SENTENCEPIECE_MODEL_NAME}.model")) + shutil.copy(f"{SENTENCEPIECE_MODEL_NAME}.vocab", os.path.join(vocab_path, f"{SENTENCEPIECE_MODEL_NAME}.vocab")) + + sp = spm.SentencePieceProcessor() + sp.Load(os.path.join(vocab_path, f"{SENTENCEPIECE_MODEL_NAME}.model")) + + with open(manifest_file_path, 'w', encoding='utf-8') as f: + for idx, part in enumerate(librispeech_parts): + for transcript in transcripts_collection[idx]: + audio_path, transcript = transcript.split('|') + text = " ".join(sp.EncodeAsPieces(transcript)) + label = " ".join([str(item) for item in sp.EncodeAsIds(transcript)]) + f.write(f"{audio_path}\t{text}\t{label}\n") diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/decoders/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/__init__.py new file mode 100644 index 000000000..41bf8e369 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/__init__.py @@ -0,0 +1,27 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
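Once the SentencePiece model has been trained by `_prepare_tokenizer` above, turning a transcript into the two manifest columns is just `EncodeAsPieces` / `EncodeAsIds`. A minimal usage sketch, assuming a trained `sp.model` in the working directory (paths and commented outputs are illustrative):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("sp.model")   # assumed output of the SentencePieceTrainer call above

transcript = "HELLO WORLD"
pieces = " ".join(sp.EncodeAsPieces(transcript))              # subword column
ids = " ".join(str(i) for i in sp.EncodeAsIds(transcript))    # numeric label column
manifest_line = f"some/audio.flac\t{pieces}\t{ids}\n"         # hypothetical audio path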
+ +from .openspeech_decoder import OpenspeechDecoder +from .lstm_attention_decoder import LSTMAttentionDecoder +from .rnn_transducer_decoder import RNNTransducerDecoder +from .transformer_decoder import TransformerDecoder +from .transformer_transducer_decoder import TransformerTransducerDecoder diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/decoders/lstm_attention_decoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/lstm_attention_decoder.py new file mode 100644 index 000000000..40c7a784f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/lstm_attention_decoder.py @@ -0,0 +1,251 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import random +import torch +import torch.nn as nn +from typing import Optional, Tuple, Any + +from openspeech.decoders import OpenspeechDecoder +from openspeech.modules import ( + Linear, + View, + LocationAwareAttention, + MultiHeadAttention, + AdditiveAttention, + DotProductAttention, +) + + +class LSTMAttentionDecoder(OpenspeechDecoder): + r""" + Converts higher level features (from encoders) into output utterances + by specifying a probability distribution over sequences of characters. + + Args: + num_classes (int): number of classification + hidden_state_dim (int): the number of features in the decoders hidden state `h` + num_layers (int, optional): number of recurrent layers (default: 2) + rnn_type (str, optional): type of RNN cell (default: lstm) + pad_id (int, optional): index of the pad symbol (default: 0) + sos_id (int, optional): index of the start of sentence symbol (default: 1) + eos_id (int, optional): index of the end of sentence symbol (default: 2) + attn_mechanism (str, optional): type of attention mechanism (default: multi-head) + num_heads (int, optional): number of attention heads. (default: 4) + dropout_p (float, optional): dropout probability of decoders (default: 0.2) + + Inputs: inputs, encoder_outputs, teacher_forcing_ratio + - **inputs** (batch, seq_len, input_size): list of sequences, whose length is the batch size and within which + each sequence is a list of token IDs. It is used for teacher forcing when provided. (default `None`) + - **encoder_outputs** (batch, seq_len, hidden_state_dim): tensor with containing the outputs of the encoders. + Used for attention mechanism (default is `None`). 
+ - **teacher_forcing_ratio** (float): The probability that teacher forcing will be used. A random number is + drawn uniformly from 0-1 for every decoding token, and if the sample is smaller than the given value, + teacher forcing would be used (default is 0). + + Returns: logits + * logits (torch.FloatTensor) : log probabilities of model's prediction + """ + supported_rnns = { + 'lstm': nn.LSTM, + 'gru': nn.GRU, + 'rnn': nn.RNN, + } + + def __init__( + self, + num_classes: int, + max_length: int = 150, + hidden_state_dim: int = 1024, + pad_id: int = 0, + sos_id: int = 1, + eos_id: int = 2, + attn_mechanism: str = 'multi-head', + num_heads: int = 4, + num_layers: int = 2, + rnn_type: str = 'lstm', + dropout_p: float = 0.3, + ) -> None: + super(LSTMAttentionDecoder, self).__init__() + self.hidden_state_dim = hidden_state_dim + self.num_classes = num_classes + self.num_heads = num_heads + self.num_layers = num_layers + self.max_length = max_length + self.eos_id = eos_id + self.sos_id = sos_id + self.pad_id = pad_id + self.attn_mechanism = attn_mechanism.lower() + self.embedding = nn.Embedding(num_classes, hidden_state_dim) + self.input_dropout = nn.Dropout(dropout_p) + rnn_cell = self.supported_rnns[rnn_type.lower()] + self.rnn = rnn_cell( + input_size=hidden_state_dim, + hidden_size=hidden_state_dim, + num_layers=num_layers, + bias=True, + batch_first=True, + dropout=dropout_p, + bidirectional=False, + ) + + if self.attn_mechanism == 'loc': + self.attention = LocationAwareAttention(hidden_state_dim, attn_dim=hidden_state_dim, smoothing=False) + elif self.attn_mechanism == 'multi-head': + self.attention = MultiHeadAttention(hidden_state_dim, num_heads=num_heads) + elif self.attn_mechanism == 'additive': + self.attention = AdditiveAttention(hidden_state_dim) + elif self.attn_mechanism == 'dot': + self.attention = DotProductAttention(dim=hidden_state_dim) + elif self.attn_mechanism == 'scaled-dot': + self.attention = DotProductAttention(dim=hidden_state_dim, scale=True) + else: + raise ValueError("Unsupported attention: %s".format(attn_mechanism)) + + self.fc = nn.Sequential( + Linear(hidden_state_dim << 1, hidden_state_dim), + nn.Tanh(), + View(shape=(-1, self.hidden_state_dim), contiguous=True), + Linear(hidden_state_dim, num_classes), + ) + + def forward_step( + self, + input_var: torch.Tensor, + hidden_states: Optional[torch.Tensor], + encoder_outputs: torch.Tensor, + attn: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + batch_size, output_lengths = input_var.size(0), input_var.size(1) + + embedded = self.embedding(input_var) + embedded = self.input_dropout(embedded) + + if self.training: + self.rnn.flatten_parameters() + + outputs, hidden_states = self.rnn(embedded, hidden_states) + + if self.attn_mechanism == 'loc': + context, attn = self.attention(outputs, encoder_outputs, attn) + else: + context, attn = self.attention(outputs, encoder_outputs, encoder_outputs) + + outputs = torch.cat((outputs, context), dim=2) + + step_outputs = self.fc(outputs.view(-1, self.hidden_state_dim << 1)).log_softmax(dim=-1) + step_outputs = step_outputs.view(batch_size, output_lengths, -1).squeeze(1) + + return step_outputs, hidden_states, attn + + def forward( + self, + encoder_outputs: torch.Tensor, + targets: Optional[torch.Tensor] = None, + encoder_output_lengths: Optional[torch.Tensor] = None, + teacher_forcing_ratio: float = 1.0, + ) -> torch.Tensor: + """ + Forward propagate a `encoder_outputs` for training. 
+ + Args: + targets (torch.LongTensr): A target sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + encoder_output_lengths: The length of encoders outputs. ``(batch)`` + teacher_forcing_ratio (float): ratio of teacher forcing + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. + """ + logits = list() + hidden_states, attn = None, None + + targets, batch_size, max_length = self.validate_args(targets, encoder_outputs, teacher_forcing_ratio) + use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False + + if use_teacher_forcing: + targets = targets[targets != self.eos_id].view(batch_size, -1) + + if self.attn_mechanism == 'loc' or self.attn_mechanism == 'additive': + for di in range(targets.size(1)): + input_var = targets[:, di].unsqueeze(1) + step_outputs, hidden_states, attn = self.forward_step( + input_var=input_var, + hidden_states=hidden_states, + encoder_outputs=encoder_outputs, + attn=attn, + ) + logits.append(step_outputs) + + else: + step_outputs, hidden_states, attn = self.forward_step( + input_var=targets, + hidden_states=hidden_states, + encoder_outputs=encoder_outputs, + attn=attn, + ) + + for di in range(step_outputs.size(1)): + step_output = step_outputs[:, di, :] + logits.append(step_output) + + else: + input_var = targets[:, 0].unsqueeze(1) + + for di in range(max_length): + step_outputs, hidden_states, attn = self.forward_step( + input_var=input_var, + hidden_states=hidden_states, + encoder_outputs=encoder_outputs, + attn=attn, + ) + logits.append(step_outputs) + input_var = logits[-1].topk(1)[1] + + logits = torch.stack(logits, dim=1) + + return logits + + def validate_args( + self, + targets: Optional[Any] = None, + encoder_outputs: torch.Tensor = None, + teacher_forcing_ratio: float = 1.0, + ) -> Tuple[torch.Tensor, int, int]: + assert encoder_outputs is not None + batch_size = encoder_outputs.size(0) + + if targets is None: # inference + targets = torch.LongTensor([self.sos_id] * batch_size).view(batch_size, 1) + max_length = self.max_length + + if torch.cuda.is_available(): + targets = targets.cuda() + + if teacher_forcing_ratio > 0: + raise ValueError("Teacher forcing has to be disabled (set 0) when no targets is provided.") + + else: + max_length = targets.size(1) - 1 # minus the start of sequence symbol + + return targets, batch_size, max_length diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/decoders/openspeech_decoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/openspeech_decoder.py new file mode 100644 index 000000000..00a3f3d9d --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/openspeech_decoder.py @@ -0,0 +1,42 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch.nn as nn
+
+
+class OpenspeechDecoder(nn.Module):
+    r""" Interface of OpenSpeech decoder. """
+    def __init__(self):
+        super(OpenspeechDecoder, self).__init__()
+
+    def count_parameters(self) -> int:
+        r""" Count parameters of decoders """
+        return sum([p.numel() for p in self.parameters()])
+
+    def update_dropout(self, dropout_p: float) -> None:
+        r""" Update dropout probability of decoders """
+        for name, child in self.named_children():
+            if isinstance(child, nn.Dropout):
+                child.p = dropout_p
+
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/decoders/rnn_transducer_decoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/rnn_transducer_decoder.py
new file mode 100644
index 000000000..3d7ec21f5
--- /dev/null
+++ b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/rnn_transducer_decoder.py
@@ -0,0 +1,128 @@
+# MIT License
+#
+# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch
+import torch.nn as nn
+from typing import Tuple
+
+from openspeech.decoders import OpenspeechDecoder
+from openspeech.modules import Linear
+
+
+class RNNTransducerDecoder(OpenspeechDecoder):
+    r"""
+    Decoder of RNN-Transducer
+
+    Args:
+        num_classes (int): number of classes
+        hidden_state_dim (int, optional): hidden state dimension of decoders (default: 512)
+        output_dim (int, optional): output dimension of encoders and decoders (default: 512)
+        num_layers (int, optional): number of decoders layers (default: 1)
+        rnn_type (str, optional): type of rnn cell (default: lstm)
+        sos_id (int, optional): start of sentence identification
+        eos_id (int, optional): end of sentence identification
+        dropout_p (float, optional): dropout probability of decoders
+
+    Inputs: inputs, input_lengths
+        inputs (torch.LongTensor): A target sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)``
+        input_lengths (torch.LongTensor): The length of input tensor. ``(batch)``
+        hidden_states (torch.FloatTensor): A previous hidden state of decoders. `FloatTensor` of size ``(batch, seq_length, dimension)``
+
+    Returns:
+        (Tensor, Tensor):
+
+        * decoder_outputs (torch.FloatTensor): An output sequence of decoders. `FloatTensor` of size
+            ``(batch, seq_length, dimension)``
+        * hidden_states (torch.FloatTensor): A hidden state of decoders. `FloatTensor` of size
+            ``(batch, seq_length, dimension)``
+
+    Reference:
+        A Graves: Sequence Transduction with Recurrent Neural Networks
+        https://arxiv.org/abs/1211.3711.pdf
+    """
+    supported_rnns = {
+        'lstm': nn.LSTM,
+        'gru': nn.GRU,
+        'rnn': nn.RNN,
+    }
+
+    def __init__(
+            self,
+            num_classes: int,
+            hidden_state_dim: int,
+            output_dim: int,
+            num_layers: int,
+            rnn_type: str = 'lstm',
+            pad_id: int = 0,
+            sos_id: int = 1,
+            eos_id: int = 2,
+            dropout_p: float = 0.2,
+    ):
+        super(RNNTransducerDecoder, self).__init__()
+        self.hidden_state_dim = hidden_state_dim
+        self.pad_id = pad_id
+        self.sos_id = sos_id
+        self.eos_id = eos_id
+        self.embedding = nn.Embedding(num_classes, hidden_state_dim)
+        rnn_cell = self.supported_rnns[rnn_type.lower()]
+        self.rnn = rnn_cell(
+            input_size=hidden_state_dim,
+            hidden_size=hidden_state_dim,
+            num_layers=num_layers,
+            bias=True,
+            batch_first=True,
+            dropout=dropout_p,
+            bidirectional=False,
+        )
+        self.out_proj = Linear(hidden_state_dim, output_dim)
+
+    def forward(
+            self,
+            inputs: torch.Tensor,
+            input_lengths: torch.Tensor = None,
+            hidden_states: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward propagate `inputs` (targets) for training.
+
+        Inputs:
+            inputs (torch.LongTensor): An input sequence passed to the label encoder. Typically inputs will be a padded `LongTensor` of size ``(batch, target_length)``
+            input_lengths (torch.LongTensor): The length of input tensor. ``(batch)``
+            hidden_states (torch.FloatTensor): Previous hidden states.
+
+        Returns:
+            (Tensor, Tensor):
+
+            * outputs (torch.FloatTensor): An output sequence of decoders. `FloatTensor` of size
+                ``(batch, seq_length, dimension)``
+            * hidden_states (torch.FloatTensor): A hidden state of decoders.
`FloatTensor` of size + ``(batch, seq_length, dimension)`` + """ + embedded = self.embedding(inputs) + + if hidden_states is not None: + outputs, hidden_states = self.rnn(embedded, hidden_states) + else: + outputs, hidden_states = self.rnn(embedded) + + outputs = self.out_proj(outputs) + return outputs, hidden_states diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_decoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_decoder.py new file mode 100644 index 000000000..77bc3fc67 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_decoder.py @@ -0,0 +1,275 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import random +import torch +import torch.nn as nn +from torch import Tensor +from typing import Optional, Tuple + +from openspeech.decoders import OpenspeechDecoder +from openspeech.modules import ( + TransformerEmbedding, + PositionalEncoding, + Linear, + get_attn_pad_mask, + get_attn_subsequent_mask, + MultiHeadAttention, + PositionwiseFeedForward, +) + + +class TransformerDecoderLayer(nn.Module): + r""" + DecoderLayer is made up of self-attention, multi-head attention and feedforward network. + This standard decoders layer is based on the paper "Attention Is All You Need". 
+ + Args: + d_model: dimension of model (default: 512) + num_heads: number of attention heads (default: 8) + d_ff: dimension of feed forward network (default: 2048) + dropout_p: probability of dropout (default: 0.3) + + Inputs: + inputs (torch.FloatTensor): input sequence of transformer decoder layer + encoder_outputs (torch.FloatTensor): outputs of encoder + self_attn_mask (torch.BoolTensor): mask of self attention + encoder_output_mask (torch.BoolTensor): mask of encoder outputs + + Returns: + (Tensor, Tensor, Tensor) + + * outputs (torch.FloatTensor): output of transformer decoder layer + * self_attn (torch.FloatTensor): output of self attention + * encoder_attn (torch.FloatTensor): output of encoder attention + + Reference: + Ashish Vaswani et al.: Attention Is All You Need + https://arxiv.org/abs/1706.03762 + """ + def __init__( + self, + d_model: int = 512, + num_heads: int = 8, + d_ff: int = 2048, + dropout_p: float = 0.3, + ) -> None: + super(TransformerDecoderLayer, self).__init__() + self.self_attention_prenorm = nn.LayerNorm(d_model) + self.decoder_attention_prenorm = nn.LayerNorm(d_model) + self.feed_forward_prenorm = nn.LayerNorm(d_model) + self.self_attention = MultiHeadAttention(d_model, num_heads) + self.decoder_attention = MultiHeadAttention(d_model, num_heads) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_p) + + def forward( + self, + inputs: Tensor, + encoder_outputs: Tensor, + self_attn_mask: Optional[Tensor] = None, + encoder_attn_mask: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor, Tensor]: + r""" + Forward propagate transformer decoder layer. + + Inputs: + inputs (torch.FloatTensor): input sequence of transformer decoder layer + encoder_outputs (torch.FloatTensor): outputs of encoder + self_attn_mask (torch.BoolTensor): mask of self attention + encoder_output_mask (torch.BoolTensor): mask of encoder outputs + + Returns: + outputs (torch.FloatTensor): output of transformer decoder layer + self_attn (torch.FloatTensor): output of self attention + encoder_attn (torch.FloatTensor): output of encoder attention + """ + residual = inputs + inputs = self.self_attention_prenorm(inputs) + outputs, self_attn = self.self_attention(inputs, inputs, inputs, self_attn_mask) + outputs += residual + + residual = outputs + outputs = self.decoder_attention_prenorm(outputs) + outputs, encoder_attn = self.decoder_attention(outputs, encoder_outputs, encoder_outputs, encoder_attn_mask) + outputs += residual + + residual = outputs + outputs = self.feed_forward_prenorm(outputs) + outputs = self.feed_forward(outputs) + outputs += residual + + return outputs, self_attn, encoder_attn + + +class TransformerDecoder(OpenspeechDecoder): + r""" + The TransformerDecoder is composed of a stack of N identical layers. + Each layer has three sub-layers. The first is a multi-head self-attention mechanism, + and the second is a multi-head attention mechanism, third is a feed-forward network. 
+ + Args: + num_classes: umber of classes + d_model: dimension of model + d_ff: dimension of feed forward network + num_layers: number of layers + num_heads: number of attention heads + dropout_p: probability of dropout + pad_id (int, optional): index of the pad symbol (default: 0) + sos_id (int, optional): index of the start of sentence symbol (default: 1) + eos_id (int, optional): index of the end of sentence symbol (default: 2) + max_length (int): max decoding length + """ + + def __init__( + self, + num_classes: int, + d_model: int = 512, + d_ff: int = 512, + num_layers: int = 6, + num_heads: int = 8, + dropout_p: float = 0.3, + pad_id: int = 0, + sos_id: int = 1, + eos_id: int = 2, + max_length: int = 128, + ) -> None: + super(TransformerDecoder, self).__init__() + self.d_model = d_model + self.num_layers = num_layers + self.num_heads = num_heads + self.max_length = max_length + self.pad_id = pad_id + self.sos_id = sos_id + self.eos_id = eos_id + + self.embedding = TransformerEmbedding(num_classes, pad_id, d_model) + self.positional_encoding = PositionalEncoding(d_model) + self.input_dropout = nn.Dropout(p=dropout_p) + self.layers = nn.ModuleList([ + TransformerDecoderLayer( + d_model=d_model, + num_heads=num_heads, + d_ff=d_ff, + dropout_p=dropout_p, + ) for _ in range(num_layers) + ]) + self.fc = nn.Sequential( + nn.LayerNorm(d_model), + Linear(d_model, d_model, bias=False), + nn.Tanh(), + Linear(d_model, num_classes, bias=False), + ) + + def forward_step( + self, + decoder_inputs: torch.Tensor, + decoder_input_lengths: torch.Tensor, + encoder_outputs: torch.Tensor, + encoder_output_lengths: torch.Tensor, + positional_encoding_length: int, + ) -> torch.Tensor: + dec_self_attn_pad_mask = get_attn_pad_mask( + decoder_inputs, decoder_input_lengths, decoder_inputs.size(1) + ) + dec_self_attn_subsequent_mask = get_attn_subsequent_mask(decoder_inputs) + self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0) + + encoder_attn_mask = get_attn_pad_mask(encoder_outputs, encoder_output_lengths, decoder_inputs.size(1)) + + outputs = self.embedding(decoder_inputs) + self.positional_encoding(positional_encoding_length) + outputs = self.input_dropout(outputs) + + for layer in self.layers: + outputs, self_attn, memory_attn = layer( + inputs=outputs, + encoder_outputs=encoder_outputs, + self_attn_mask=self_attn_mask, + encoder_attn_mask=encoder_attn_mask, + ) + + return outputs + + def forward( + self, + encoder_outputs: torch.Tensor, + targets: Optional[torch.LongTensor] = None, + encoder_output_lengths: torch.Tensor = None, + target_lengths: torch.Tensor = None, + teacher_forcing_ratio: float = 1.0, + ) -> torch.Tensor: + r""" + Forward propagate a `encoder_outputs` for training. + + Args: + targets (torch.LongTensor): A target sequence passed to decoders. `IntTensor` of size + ``(batch, seq_length)`` + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + encoder_output_lengths (torch.LongTensor): The length of encoders outputs. ``(batch)`` + teacher_forcing_ratio (float): ratio of teacher forcing + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. 
+ """ + logits = list() + batch_size = encoder_outputs.size(0) + use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False + + if targets is not None and use_teacher_forcing: + targets = targets[targets != self.eos_id].view(batch_size, -1) + target_length = targets.size(1) + + step_outputs = self.forward_step( + decoder_inputs=targets, + decoder_input_lengths=target_lengths, + encoder_outputs=encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + positional_encoding_length=target_length, + ) + step_outputs = self.fc(step_outputs).log_softmax(dim=-1) + + for di in range(step_outputs.size(1)): + step_output = step_outputs[:, di, :] + logits.append(step_output) + + # Inference + else: + input_var = encoder_outputs.new_zeros(batch_size, self.max_length).long() + input_var = input_var.fill_(self.pad_id) + input_var[:, 0] = self.sos_id + + for di in range(1, self.max_length): + input_lengths = torch.IntTensor(batch_size).fill_(di) + + outputs = self.forward_step( + decoder_inputs=input_var[:, :di], + decoder_input_lengths=input_lengths, + encoder_outputs=encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + positional_encoding_length=di, + ) + step_output = self.fc(outputs).log_softmax(dim=-1) + + logits.append(step_output[:, -1, :]) + input_var[:, di] = logits[-1].topk(1)[1].squeeze() + + return torch.stack(logits, dim=1) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_transducer_decoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_transducer_decoder.py new file mode 100644 index 000000000..ee37cea45 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/decoders/transformer_transducer_decoder.py @@ -0,0 +1,156 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import torch +import torch.nn as nn +import numpy as np +from typing import Tuple + +from openspeech.decoders import OpenspeechDecoder +from openspeech.encoders.transformer_transducer_encoder import TransformerTransducerEncoderLayer +from openspeech.modules import ( + PositionalEncoding, + get_attn_pad_mask, + get_attn_subsequent_mask, +) + + +class TransformerTransducerDecoder(OpenspeechDecoder): + r""" + Converts the label to higher feature values + + Args: + num_classes (int): the number of vocabulary + model_dim (int): the number of features in the label encoder (default : 512) + d_ff (int): the number of features in the feed forward layers (default : 2048) + num_layers (int): the number of label encoder layers (default: 2) + num_heads (int): the number of heads in the multi-head attention (default: 8) + dropout (float): dropout probability of label encoder (default: 0.1) + max_positional_length (int): Maximum length to use for positional encoding (default : 5000) + pad_id (int): index of padding (default: 0) + sos_id (int): index of the start of sentence (default: 1) + eos_id (int): index of the end of sentence (default: 2) + + Inputs: inputs, inputs_lens + - **inputs**: Ground truth of batch size number + - **inputs_lens**: Tensor of target lengths + + Returns: + (torch.FloatTensor, torch.FloatTensor) + + * outputs (torch.FloatTensor): ``(batch, seq_length, dimension)`` + * input_lengths (torch.FloatTensor): ``(batch)`` + + Reference: + Qian Zhang et al.: Transformer Transducer: A Streamable Speech Recognition Model with Transformer Encoders and RNN-T Loss + https://arxiv.org/abs/2002.02562 + """ + def __init__( + self, + num_classes: int, + model_dim: int = 512, + d_ff: int = 2048, + num_layers: int = 2, + num_heads: int = 8, + dropout: float = 0.1, + max_positional_length: int = 5000, + pad_id: int = 0, + sos_id: int = 1, + eos_id: int = 2, + ) -> None: + super(TransformerTransducerDecoder, self).__init__() + self.embedding = nn.Embedding(num_classes, model_dim) + self.scale = np.sqrt(model_dim) + self.positional_encoding = PositionalEncoding(model_dim, max_positional_length) + self.input_dropout = nn.Dropout(p=dropout) + self.pad_id = pad_id + self.sos_id = sos_id + self.eos_id = eos_id + self.decoder_layers = nn.ModuleList([ + TransformerTransducerEncoderLayer( + model_dim, + d_ff, + num_heads, + dropout + ) for _ in range(num_layers) + ]) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for label encoder. + + Args: + inputs (torch.LongTensor): A input sequence passed to label encoder. Typically inputs will be a padded + `LongTensor` of size ``(batch, target_length)`` + input_lengths (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + * outputs (Tensor): ``(batch, seq_length, dimension)`` + * output_lengths (Tensor): ``(batch)`` + """ + batch = inputs.size(0) + + if len(inputs.size()) == 1: # validate, evaluation + inputs = inputs.unsqueeze(1) + target_lengths = inputs.size(1) + + outputs = self.forward_step( + decoder_inputs=inputs, + decoder_input_lengths=input_lengths, + positional_encoding_length=target_lengths, + ) + + else: # train + target_lengths = inputs.size(1) + + outputs = self.forward_step( + decoder_inputs=inputs, + decoder_input_lengths=input_lengths, + positional_encoding_length=target_lengths, + ) + + return outputs, input_lengths + + def forward_step( + self, + decoder_inputs: torch.Tensor, + decoder_input_lengths: torch.Tensor, + positional_encoding_length: int = 1, + ) -> torch.Tensor: + dec_self_attn_pad_mask = get_attn_pad_mask(decoder_inputs, decoder_input_lengths, decoder_inputs.size(1)) + dec_self_attn_subsequent_mask = get_attn_subsequent_mask(decoder_inputs) + self_attn_mask = torch.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0) + + embedding_output = self.embedding(decoder_inputs) * self.scale + positional_encoding_output = self.positional_encoding(positional_encoding_length) + inputs = embedding_output + positional_encoding_output + + outputs = self.input_dropout(inputs) + + for decoder_layer in self.decoder_layers: + outputs, _ = decoder_layer(outputs, self_attn_mask) + + return outputs diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/__init__.py new file mode 100644 index 000000000..4d942d7e0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/__init__.py @@ -0,0 +1,32 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
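Both decoders above build their self-attention mask in `forward_step` the same way: a padding mask computed from the token lengths is added to a subsequent (look-ahead) mask, and every position greater than zero is masked. The helpers below are simplified stand-ins for `get_attn_pad_mask` / `get_attn_subsequent_mask` from `openspeech.modules` (assumed shapes, not the patch's exact implementation), shown only to illustrate how the two masks are combined.

```python
import torch


def pad_mask(tokens: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """True where the attended key position is padding: (batch, seq_len, seq_len)."""
    batch, seq_len = tokens.size(0), tokens.size(1)
    positions = torch.arange(seq_len, device=tokens.device)
    key_is_pad = positions.unsqueeze(0) >= lengths.unsqueeze(1)          # (batch, seq_len)
    return key_is_pad.unsqueeze(1).expand(batch, seq_len, seq_len)


def subsequent_mask(tokens: torch.Tensor) -> torch.Tensor:
    """True above the diagonal, i.e. future positions: (batch, seq_len, seq_len)."""
    batch, seq_len = tokens.size(0), tokens.size(1)
    future = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=tokens.device), diagonal=1)
    return future.unsqueeze(0).expand(batch, seq_len, seq_len)


tokens = torch.tensor([[1, 5, 7, 0], [1, 9, 0, 0]])   # 0 = pad_id
lengths = torch.tensor([3, 2])
self_attn_mask = torch.gt(pad_mask(tokens, lengths).int() + subsequent_mask(tokens).int(), 0)
```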
+ +from .openspeech_encoder import OpenspeechEncoder +from .conformer_encoder import ConformerEncoder +from .contextnet_encoder import ContextNetEncoder +from .convolutional_lstm_encoder import ConvolutionalLSTMEncoder +from .convolutional_transformer_encoder import ConvolutionalTransformerEncoder +from .lstm_encoder import LSTMEncoder +from .rnn_transducer_encoder import RNNTransducerEncoder +from .transformer_encoder import TransformerEncoder +from .transformer_transducer_encoder import TransformerTransducerEncoder +from .jasper import Jasper diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/conformer_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/conformer_encoder.py new file mode 100644 index 000000000..25aabfdd0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/conformer_encoder.py @@ -0,0 +1,141 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.encoders.openspeech_encoder import OpenspeechEncoder +from openspeech.modules import Conv2dSubsampling, Linear, ConformerBlock, Transpose + + +class ConformerEncoder(OpenspeechEncoder): + r""" + Transformer models are good at capturing content-based global interactions, while CNNs exploit local features + effectively. Conformer achieves the best of both worlds by studying how to combine convolution neural + networks and transformers to model both local and global dependencies of an audio sequence + in a parameter-efficient way. 
+ + Args: + num_classes (int): Number of classification + input_dim (int, optional): Dimension of input vector + encoder_dim (int, optional): Dimension of conformer encoders + num_layers (int, optional): Number of conformer blocks + num_attention_heads (int, optional): Number of attention heads + feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module + conv_expansion_factor (int, optional): Expansion factor of conformer convolution module + feed_forward_dropout_p (float, optional): Probability of feed forward module dropout + attention_dropout_p (float, optional): Probability of attention module dropout + conv_dropout_p (float, optional): Probability of conformer convolution module dropout + conv_kernel_size (int or tuple, optional): Size of the convolving kernel + half_step_residual (bool): Flag indication whether to use half step residual or not + joint_ctc_attention (bool, optional): flag indication joint ctc attention or not + + Inputs: inputs, input_lengths + - **inputs** (batch, time, dim): Tensor containing input vector + - **input_lengths** (batch): list of sequence input lengths + + Returns: outputs, output_lengths + - **outputs** (batch, out_channels, time): Tensor produces by conformer encoders. + - **output_lengths** (batch): list of sequence output lengths + + Reference: + Anmol Gulati et al: Conformer: Convolution-augmented Transformer for Speech Recognition + https://arxiv.org/abs/2005.08100 + """ + def __init__( + self, + num_classes: int, + input_dim: int = 80, + encoder_dim: int = 512, + num_layers: int = 17, + num_attention_heads: int = 8, + feed_forward_expansion_factor: int = 4, + conv_expansion_factor: int = 2, + input_dropout_p: float = 0.1, + feed_forward_dropout_p: float = 0.1, + attention_dropout_p: float = 0.1, + conv_dropout_p: float = 0.1, + conv_kernel_size: int = 31, + half_step_residual: bool = True, + joint_ctc_attention: bool = True, + ) -> None: + super(ConformerEncoder, self).__init__() + self.joint_ctc_attention = joint_ctc_attention + self.conv_subsample = Conv2dSubsampling(input_dim, in_channels=1, out_channels=encoder_dim) + self.input_projection = nn.Sequential( + Linear(self.conv_subsample.get_output_dim(), encoder_dim), + nn.Dropout(p=input_dropout_p), + ) + self.layers = nn.ModuleList([ + ConformerBlock( + encoder_dim=encoder_dim, + num_attention_heads=num_attention_heads, + feed_forward_expansion_factor=feed_forward_expansion_factor, + conv_expansion_factor=conv_expansion_factor, + feed_forward_dropout_p=feed_forward_dropout_p, + attention_dropout_p=attention_dropout_p, + conv_dropout_p=conv_dropout_p, + conv_kernel_size=conv_kernel_size, + half_step_residual=half_step_residual, + ) for _ in range(num_layers) + ]) + if self.joint_ctc_attention: + self.fc = nn.Sequential( + Transpose(shape=(1, 2)), + nn.Dropout(feed_forward_dropout_p), + Linear(encoder_dim, num_classes, bias=False), + ) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for encoders training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor, Tensor) + + * outputs: A output sequence of encoders. 
`FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + * output_lengths: The length of encoders outputs. ``(batch)`` + """ + encoder_logits = None + + outputs, output_lengths = self.conv_subsample(inputs, input_lengths) + outputs = self.input_projection(outputs) + + for layer in self.layers: + outputs = layer(outputs) + + if self.joint_ctc_attention: + encoder_logits = self.fc(outputs.transpose(1, 2)).log_softmax(dim=2) + + return outputs, encoder_logits, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/contextnet_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/contextnet_encoder.py new file mode 100644 index 000000000..76de32883 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/contextnet_encoder.py @@ -0,0 +1,123 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from torch import Tensor +from typing import Tuple + +from openspeech.encoders.openspeech_encoder import OpenspeechEncoder +from openspeech.modules.contextnet_block import ContextNetBlock +from openspeech.modules import Conv2dSubsampling, Linear, ConformerBlock, Transpose + + +class ContextNetEncoder(OpenspeechEncoder): + r""" + ContextNetEncoder goes through 23 convolution blocks to convert to higher feature values. 
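Assuming the `openspeech` package added by this patch is importable, the `ConformerEncoder` above can be exercised end to end on random features. The configuration below is deliberately small for illustration (the defaults are `encoder_dim=512`, `num_layers=17`); the exact reduced time length depends on the `Conv2dSubsampling` front end, which roughly quarters the input length.

```python
import torch
from openspeech.encoders import ConformerEncoder

encoder = ConformerEncoder(
    num_classes=29,             # e.g. characters + blank for CTC
    input_dim=80,               # mel filterbank bins
    encoder_dim=144,
    num_layers=2,
    num_attention_heads=4,
    joint_ctc_attention=True,
)

inputs = torch.randn(4, 400, 80)                    # (batch, time, input_dim)
input_lengths = torch.tensor([400, 360, 320, 280])

outputs, encoder_logits, output_lengths = encoder(inputs, input_lengths)
# outputs:        (batch, reduced_time, encoder_dim)  - fed to an attention decoder / transducer
# encoder_logits: (batch, reduced_time, num_classes)  - log-probabilities for the auxiliary CTC loss
# output_lengths: (batch,)                            - frame counts after subsampling
```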
+ + Args: + num_classes (int): Number of classification + model_size (str, optional): Size of the model['small', 'medium', 'large'] (default : 'medium') + input_dim (int, optional): Dimension of input vector (default : 80) + num_layers (int, optional): The number of convolutional layers (default : 5) + kernel_size (int, optional): Value of convolution kernel size (default : 5) + num_channels (int, optional): The number of channels in the convolution filter (default: 256) + output_dim (int, optional): Dimension of encoder output vector (default: 640) + joint_ctc_attention (bool, optional): flag indication joint ctc attention or not + + Inputs: inputs, input_lengths + - **inputs**: Parsed audio of batch size number `FloatTensor` of size ``(batch, seq_length, dimension)`` + - **input_lengths**: Tensor representing the sequence length of the input ``(batch)`` + + Returns: output, output_lengths + - **output**: Tensor of encoder output `FloatTensor` of size + ``(batch, seq_length, dimension)`` + - **encoder_logits**: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + - **output_lengths**: Tensor representing the length of the encoder output ``(batch)`` + """ + supported_models = { + 'small': 0.5, + 'medium': 1, + 'large': 2, + } + + def __init__( + self, + num_classes: int, + model_size: str = 'medium', + input_dim: int = 80, + num_layers: int = 5, + kernel_size: int = 5, + num_channels: int = 256, + output_dim: int = 640, + joint_ctc_attention: bool = False, + ) -> None: + super(ContextNetEncoder, self).__init__() + assert model_size in ('small', 'medium', 'large'), f'{model_size} is not supported.' + + alpha = self.supported_models[model_size] + + num_channels = int(num_channels * alpha) + output_dim = int(output_dim * alpha) + + self.joint_ctc_attention = joint_ctc_attention + self.blocks = ContextNetBlock.make_conv_blocks(input_dim, num_layers, kernel_size, num_channels, output_dim) + if self.joint_ctc_attention: + self.fc = nn.Linear(output_dim, num_classes, bias=False) + + def forward( + self, + inputs: Tensor, + input_lengths: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor]: + r""" + Forward propagate a `inputs` for audio encoder. + + Args: + **inputs** (torch.FloatTensor): Parsed audio of batch size number `FloatTensor` of size + ``(batch, seq_length, dimension)`` + **input_lengths** (torch.LongTensor): Tensor representing the sequence length of the input + `LongTensor` of size ``(batch)`` + + Returns: + **output** (torch.FloatTensor): Tensor of encoder output `FloatTensor` of size + ``(batch, seq_length, dimension)`` + **encoder_logits** (torch.FloatTensor): Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. 
+ **output_lengths** (torch.LongTensor): Tensor representing the length of the encoder output + `LongTensor` of size ``(batch)`` + """ + encoder_logits = None + + output = inputs.transpose(1, 2) + output_lengths = input_lengths + + for block in self.blocks: + output, output_lengths = block(output, output_lengths) + + output = output.transpose(1, 2) + + if self.joint_ctc_attention: + encoder_logits = self.fc(output).log_softmax(dim=2) + + return output, encoder_logits, output_lengths \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_lstm_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_lstm_encoder.py new file mode 100644 index 000000000..74f212911 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_lstm_encoder.py @@ -0,0 +1,134 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple, Optional + +from openspeech.encoders import OpenspeechEncoder +from openspeech.modules import Transpose, Linear + + +class ConvolutionalLSTMEncoder(OpenspeechEncoder): + r""" + Converts low level speech signals into higher level features with convolutional extractor. 
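One detail worth noting in the `ContextNetEncoder` above: the `model_size` argument does not change the number of blocks, it only rescales the channel widths. The `alpha` looked up in `supported_models` multiplies both `num_channels` and `output_dim` before the convolution blocks are built. A small sketch of that arithmetic with the constructor defaults (`num_channels=256`, `output_dim=640`):

```python
# Width scaling applied by ContextNetEncoder.__init__ for each model_size.
supported_models = {'small': 0.5, 'medium': 1, 'large': 2}

for model_size, alpha in supported_models.items():
    num_channels = int(256 * alpha)
    output_dim = int(640 * alpha)
    print(f"{model_size:>6}: num_channels={num_channels:4d}, output_dim={output_dim:4d}")

#  small: num_channels= 128, output_dim= 320
# medium: num_channels= 256, output_dim= 640
#  large: num_channels= 512, output_dim=1280
```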
+ + Args: + input_dim (int): dimension of input vector + num_classes (int): number of classification + hidden_state_dim (int): the number of features in the encoders hidden state `h` + num_layers (int, optional): number of recurrent layers (default: 3) + bidirectional (bool, optional): if True, becomes a bidirectional encoders (default: False) + extractor (str): type of CNN extractor (default: vgg) + conv_activation (str): activation function of convolutional extractor (default: hardtanh) + rnn_type (str, optional): type of RNN cell (default: lstm) + dropout_p (float, optional): dropout probability of encoders (default: 0.2) + joint_ctc_attention (bool, optional): flag indication joint ctc attention or not + + Inputs: inputs, input_lengths + - **inputs**: list of sequences, whose length is the batch size and within which each sequence is list of tokens + - **input_lengths**: list of sequence lengths + + Returns: encoder_outputs, encoder_log__probs, output_lengths + - **encoder_outputs**: tensor containing the encoded features of the input sequence + - **encoder_log__probs**: tensor containing log probability for encoder_only loss + - **output_lengths**: list of sequence lengths produced by Listener + """ + supported_rnns = { + 'lstm': nn.LSTM, + 'gru': nn.GRU, + 'rnn': nn.RNN, + } + + def __init__( + self, + input_dim: int, + num_classes: int = None, + hidden_state_dim: int = 512, + dropout_p: float = 0.3, + num_layers: int = 3, + bidirectional: bool = True, + rnn_type: str = 'lstm', + extractor: str = 'vgg', + conv_activation: str = 'hardtanh', + joint_ctc_attention: bool = False, + ) -> None: + super(ConvolutionalLSTMEncoder, self).__init__() + extractor = self.supported_extractors[extractor.lower()] + self.conv = extractor(input_dim=input_dim, activation=conv_activation) + self.conv_output_dim = self.conv.get_output_dim() + + self.num_classes = num_classes + self.joint_ctc_attention = joint_ctc_attention + + self.hidden_state_dim = hidden_state_dim + self.rnn = self.supported_rnns[rnn_type.lower()]( + input_size=self.conv_output_dim, + hidden_size=hidden_state_dim, + num_layers=num_layers, + bias=True, + batch_first=True, + dropout=dropout_p, + bidirectional=bidirectional, + ) + + if self.joint_ctc_attention: + self.fc = nn.Sequential( + Transpose(shape=(1, 2)), + nn.Dropout(dropout_p), + Linear(hidden_state_dim << 1, num_classes, bias=False), + ) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + r""" + Forward propagate a `inputs` for encoders training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + * encoder_output_lengths: The length of encoders outputs. 
``(batch)`` + """ + encoder_logits = None + + conv_outputs, output_lengths = self.conv(inputs, input_lengths) + + conv_outputs = nn.utils.rnn.pack_padded_sequence(conv_outputs.transpose(0, 1), output_lengths.cpu()) + outputs, hidden_states = self.rnn(conv_outputs) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs) + outputs = outputs.transpose(0, 1) + + if self.joint_ctc_attention: + encoder_logits = self.fc(outputs.transpose(1, 2)).log_softmax(dim=2) + + return outputs, encoder_logits, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_transformer_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_transformer_encoder.py new file mode 100644 index 000000000..3993b5edf --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/convolutional_transformer_encoder.py @@ -0,0 +1,147 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.encoders import OpenspeechEncoder +from openspeech.encoders.transformer_encoder import TransformerEncoderLayer +from openspeech.modules import ( + Linear, + PositionalEncoding, + get_attn_pad_mask, Transpose, +) + + +class ConvolutionalTransformerEncoder(OpenspeechEncoder): + r""" + The TransformerEncoder is composed of a stack of N identical layers. + Each layer has two sub-layers. The first is a multi-head self-attention mechanism, + and the second is a simple, position-wise fully connected feed-forward network. + + Args: + input_dim: dimension of feature vector + extractor (str): convolutional extractor + d_model: dimension of model (default: 512) + d_ff: dimension of feed forward network (default: 2048) + num_layers: number of encoders layers (default: 6) + num_heads: number of attention heads (default: 8) + dropout_p (float, optional): probability of dropout (default: 0.3) + conv_activation (str, optional): activation function of convolutional extractor (default: hardtanh) + joint_ctc_attention (bool, optional): flag indication joint ctc attention or not (default: False) + + Inputs: + - **inputs**: list of sequences, whose length is the batch size and within which each sequence is list of tokens + - **input_lengths**: list of sequence lengths + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. 
`FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + * output_lengths: The length of encoders outputs. ``(batch)`` + """ + + def __init__( + self, + num_classes: int, + input_dim: int, + extractor: str = 'vgg', + d_model: int = 512, + d_ff: int = 2048, + num_layers: int = 6, + num_heads: int = 8, + dropout_p: float = 0.3, + conv_activation: str = "relu", + joint_ctc_attention: bool = False, + ) -> None: + super(ConvolutionalTransformerEncoder, self).__init__() + extractor = self.supported_extractors[extractor.lower()] + self.conv = extractor(input_dim=input_dim, activation=conv_activation) + self.conv_output_dim = self.conv.get_output_dim() + + self.num_classes = num_classes + self.joint_ctc_attention = joint_ctc_attention + + self.d_model = d_model + self.num_layers = num_layers + self.num_heads = num_heads + self.input_proj = Linear(self.conv_output_dim, d_model) + self.input_norm = nn.LayerNorm(d_model) + self.input_dropout = nn.Dropout(p=dropout_p) + self.positional_encoding = PositionalEncoding(d_model) + self.layers = nn.ModuleList([ + TransformerEncoderLayer( + d_model=d_model, + num_heads=num_heads, + d_ff=d_ff, + dropout_p=dropout_p, + ) for _ in range(num_layers) + ]) + + if self.joint_ctc_attention: + self.fc = nn.Sequential( + Transpose(shape=(1, 2)), + nn.Dropout(dropout_p), + Linear(d_model, num_classes, bias=False), + ) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for encoders training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + * output_lengths: The length of encoders outputs. 
``(batch)`` + """ + encoder_logits = None + + conv_outputs, output_lengths = self.conv(inputs, input_lengths) + + self_attn_mask = get_attn_pad_mask(conv_outputs, output_lengths, conv_outputs.size(1)) + + outputs = self.input_norm(self.input_proj(conv_outputs)) + outputs += self.positional_encoding(outputs.size(1)) + outputs = self.input_dropout(outputs) + + for layer in self.layers: + outputs, attn = layer(outputs, self_attn_mask) + + if self.joint_ctc_attention: + encoder_logits = self.fc(outputs.transpose(1, 2)).log_softmax(dim=-1) + + return outputs, encoder_logits, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/deepspeech2.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/deepspeech2.py new file mode 100644 index 000000000..358c4c0d9 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/deepspeech2.py @@ -0,0 +1,123 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor +from typing import Tuple + +from openspeech.modules import DeepSpeech2Extractor, BNReluRNN, Linear + + +class DeepSpeech2(nn.Module): + r""" + DeepSpeech2 is a set of speech recognition models based on Baidu DeepSpeech2. DeepSpeech2 is trained with CTC loss. + + Args: + input_dim (int): dimension of input vector + num_classes (int): number of classfication + rnn_type (str, optional): type of RNN cell (default: gru) + num_rnn_layers (int, optional): number of recurrent layers (default: 5) + rnn_hidden_dim (int): the number of features in the hidden state `h` + dropout_p (float, optional): dropout probability (default: 0.1) + bidirectional (bool, optional): if True, becomes a bidirectional encoders (defulat: True) + activation (str): type of activation function (default: hardtanh) + + Inputs: inputs, input_lengths + - **inputs**: list of sequences, whose length is the batch size and within which each sequence is list of tokens + - **input_lengths**: list of sequence lengths + + Returns: + (Tensor, Tensor): + + * predicted_log_prob (torch.FloatTensor)s: Log probability of model predictions. 
+ * output_lengths (torch.LongTensor): The length of output tensor ``(batch)`` + + Reference: + Dario Amodei et al.: Deep Speech 2: End-to-End Speech Recognition in English and Mandarin + https://arxiv.org/abs/1512.02595 + """ + def __init__( + self, + input_dim: int, + num_classes: int, + rnn_type='gru', + num_rnn_layers: int = 5, + rnn_hidden_dim: int = 512, + dropout_p: float = 0.1, + bidirectional: bool = True, + activation: str = 'hardtanh', + ) -> None: + super(DeepSpeech2, self).__init__() + self.conv = DeepSpeech2Extractor(input_dim, activation=activation) + self.rnn_layers = nn.ModuleList() + rnn_output_size = rnn_hidden_dim << 1 if bidirectional else rnn_hidden_dim + + for idx in range(num_rnn_layers): + self.rnn_layers.append( + BNReluRNN( + input_size=self.conv.get_output_dim() if idx == 0 else rnn_output_size, + hidden_state_dim=rnn_hidden_dim, + rnn_type=rnn_type, + bidirectional=bidirectional, + dropout_p=dropout_p, + ) + ) + + self.fc = nn.Sequential( + nn.LayerNorm(rnn_output_size), + Linear(rnn_output_size, num_classes, bias=False), + ) + + def count_parameters(self) -> int: + r""" Count parameters of encoders """ + return sum([p.numel for p in self.parameters()]) + + def update_dropout(self, dropout_p: float) -> None: + r""" Update dropout probability of encoders """ + for name, child in self.named_children(): + if isinstance(child, nn.Dropout): + child.p = dropout_p + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate a `inputs` for encoder_only training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor): + + * predicted_log_prob (torch.FloatTensor)s: Log probability of model predictions. + * output_lengths (torch.LongTensor): The length of output tensor ``(batch)`` + """ + outputs, output_lengths = self.conv(inputs, input_lengths) + outputs = outputs.permute(1, 0, 2).contiguous() + + for rnn_layer in self.rnn_layers: + outputs = rnn_layer(outputs, output_lengths) + + outputs = self.fc(outputs.transpose(0, 1)).log_softmax(dim=-1) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/jasper.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/jasper.py new file mode 100644 index 000000000..80de877b2 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/jasper.py @@ -0,0 +1,182 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple + +from openspeech.modules import JasperSubBlock, JasperBlock, MaskConv1d + + +class Jasper(nn.Module): + r""" + Jasper (Just Another Speech Recognizer), an ASR model comprised of 54 layers proposed by NVIDIA. + Jasper achieved sub 3 percent word error rate (WER) on the LibriSpeech dataset. + + Args: + num_classes (int): number of classification + version (str): version of jasper. Marked as BxR: B - number of blocks, R - number of sub-blocks + + Inputs: inputs, input_lengths, residual + - **inputs**: tensor contains input sequence vector + - **input_lengths**: tensor contains sequence lengths + + Returns: + (Tensor, Tensor): + + * outputs (torch.FloatTensor): Log probability of model predictions. ``(batch, seq_length, num_classes)`` + * output_lengths (torch.LongTensor): The length of output tensor ``(batch)`` + + Reference: + Jason Li. et al.: Jasper: An End-to-End Convolutional Neural Acoustic Model + https://arxiv.org/pdf/1904.03288.pdf + """ + + def __init__(self, configs, input_dim: int, num_classes: int) -> None: + super(Jasper, self).__init__() + self.configs = configs + self.layers = nn.ModuleList() + + in_channels = eval(self.configs.in_channels) + out_channels = eval(self.configs.out_channels) + kernel_size = eval(self.configs.kernel_size) + stride = eval(self.configs.stride) + dilation = eval(self.configs.dilation) + dropout_p = eval(self.configs.dropout_p) + + self.layers.append( + JasperSubBlock( + in_channels=input_dim, + out_channels=out_channels[0], + kernel_size=kernel_size[0], + stride=stride[0], + dilation=dilation[0], + dropout_p=dropout_p[0], + activation='relu', + bias=False, + ) + ) + self.layers.extend([ + JasperBlock( + num_sub_blocks=self.configs.num_sub_blocks, + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=kernel_size[i], + dilation=dilation[i], + dropout_p=dropout_p[i], + activation='relu', + bias=False, + ) for i in range(1, self.configs.num_blocks + 1) + ]) + self.postprocess_layers = nn.ModuleList([ + JasperSubBlock( + in_channels=in_channels[i], + out_channels=num_classes if out_channels[i] is None else out_channels[i], + kernel_size=kernel_size[i], + dilation=dilation[i], + dropout_p=dropout_p[i], + activation='relu', + bias=True if i == 2 else False, + ) for i in range(self.configs.num_blocks + 1, self.configs.num_blocks + 4) + ]) + + self.residual_connections = self._create_jasper_dense_residual_connections() + + def count_parameters(self) -> int: + r""" Count parameters of model """ + return sum([p.numel for p in self.parameters()]) + + def update_dropout(self, dropout_p: float) -> None: + r""" Update dropout probability of model """ + for name, child in self.named_children(): + if isinstance(child, nn.Dropout): + child.p = dropout_p + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for encoder_only training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + (Tensor, Tensor): + + * outputs (torch.FloatTensor): Log probability of model predictions. ``(batch, seq_length, num_classes)`` + * output_lengths (torch.LongTensor): The length of output tensor ``(batch)`` + """ + residual, prev_outputs, prev_output_lengths = None, list(), list() + inputs = inputs.transpose(1, 2) + + for i, layer in enumerate(self.layers[:-1]): + inputs, input_lengths = layer(inputs, input_lengths, residual) + prev_outputs.append(inputs) + prev_output_lengths.append(input_lengths) + residual = self._get_jasper_dencse_residual(prev_outputs, prev_output_lengths, i) + + outputs, output_lengths = self.layers[-1](inputs, input_lengths, residual) + + for i, layer in enumerate(self.postprocess_layers): + outputs, output_lengths = layer(outputs, output_lengths) + + outputs = F.log_softmax(outputs.transpose(1, 2), dim=-1) + + return outputs, output_lengths + + def _get_jasper_dencse_residual(self, prev_outputs: list, prev_output_lengths: list, index: int): + residual = None + + for item in zip(prev_outputs, prev_output_lengths, self.residual_connections[index]): + prev_output, prev_output_length, residual_modules = item + conv1x1, batch_norm = residual_modules + + if residual is None: + residual = conv1x1(prev_output, prev_output_length)[0] + else: + residual += conv1x1(prev_output, prev_output_length)[0] + + residual = batch_norm(residual) + + return residual + + def _create_jasper_dense_residual_connections(self) -> nn.ModuleList: + residual_connections = nn.ModuleList() + + for i in range(self.configs.num_blocks): + residual_modules = nn.ModuleList() + for j in range(1, i + 2): + residual_modules.append(nn.ModuleList([ + MaskConv1d( + self.configs.in_channels[j], self.configs.out_channels[j], kernel_size=1 + ), + nn.BatchNorm1d(self.configs.out_channels[i], eps=1e-03, momentum=0.1), + ])) + residual_connections.append(residual_modules) + + return residual_connections diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/lstm_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/lstm_encoder.py new file mode 100644 index 000000000..4109b0420 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/lstm_encoder.py @@ -0,0 +1,128 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import torch +import torch.nn as nn +from typing import Tuple, Optional + +from openspeech.encoders import OpenspeechEncoder +from openspeech.modules import Transpose, Linear + + +class LSTMEncoder(OpenspeechEncoder): + r""" + Converts low level speech signals into higher level features + + Args: + input_dim (int): dimension of input vector + num_classes (int): number of classification + hidden_state_dim (int): the number of features in the encoders hidden state `h` + num_layers (int, optional): number of recurrent layers (default: 3) + bidirectional (bool, optional): if True, becomes a bidirectional encoders (default: False) + rnn_type (str, optional): type of RNN cell (default: lstm) + dropout_p (float, optional): dropout probability of encoders (default: 0.2) + joint_ctc_attention (bool, optional): flag indication joint ctc attention or not + + Inputs: inputs, input_lengths + - **inputs**: list of sequences, whose length is the batch size and within which each sequence is list of tokens + - **input_lengths**: list of sequence lengths + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + * encoder_output_lengths: The length of encoders outputs. ``(batch)`` + """ + supported_rnns = { + 'lstm': nn.LSTM, + 'gru': nn.GRU, + 'rnn': nn.RNN, + } + + def __init__( + self, + input_dim: int, + num_classes: int = None, + hidden_state_dim: int = 512, + dropout_p: float = 0.3, + num_layers: int = 3, + bidirectional: bool = True, + rnn_type: str = 'lstm', + joint_ctc_attention: bool = False, + ) -> None: + super(LSTMEncoder, self).__init__() + + self.num_classes = num_classes + self.joint_ctc_attention = joint_ctc_attention + + self.hidden_state_dim = hidden_state_dim + self.rnn = self.supported_rnns[rnn_type.lower()]( + input_size=input_dim, + hidden_size=hidden_state_dim, + num_layers=num_layers, + bias=True, + batch_first=True, + dropout=dropout_p, + bidirectional=bidirectional, + ) + + if self.joint_ctc_attention: + self.fc = nn.Sequential( + Transpose(shape=(1, 2)), + nn.Dropout(dropout_p), + Linear(hidden_state_dim << 1, num_classes, bias=False), + ) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + r""" + Forward propagate a `inputs` for encoders training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. + * encoder_output_lengths: The length of encoders outputs. 
``(batch)`` + """ + encoder_logits = None + + conv_outputs = nn.utils.rnn.pack_padded_sequence(inputs.transpose(0, 1), input_lengths.cpu()) + outputs, hidden_states = self.rnn(conv_outputs) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs) + outputs = outputs.transpose(0, 1) + + if self.joint_ctc_attention: + encoder_logits = self.fc(outputs.transpose(1, 2)).log_softmax(dim=2) + + return outputs, encoder_logits, input_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/openspeech_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/openspeech_encoder.py new file mode 100644 index 000000000..4717d4967 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/openspeech_encoder.py @@ -0,0 +1,72 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from openspeech.modules import DeepSpeech2Extractor, VGGExtractor, Swish, Conv2dSubsampling + + +class OpenspeechEncoder(nn.Module): + r""" + Base Interface of Openspeech Encoder. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + """ + supported_activations = { + 'hardtanh': nn.Hardtanh(0, 20, inplace=True), + 'relu': nn.ReLU(inplace=True), + 'elu': nn.ELU(inplace=True), + 'leaky_relu': nn.LeakyReLU(inplace=True), + 'gelu': nn.GELU(), + 'swish': Swish(), + } + supported_extractors = { + 'ds2': DeepSpeech2Extractor, + 'vgg': VGGExtractor, + 'conv2d_subsample': Conv2dSubsampling, + } + + def __init__(self): + super(OpenspeechEncoder, self).__init__() + + def count_parameters(self) -> int: + r""" Count parameters of encoders """ + return sum([p.numel for p in self.parameters()]) + + def update_dropout(self, dropout_p: float) -> None: + r""" Update dropout probability of encoders """ + for name, child in self.named_children(): + if isinstance(child, nn.Dropout): + child.p = dropout_p + + def forward(self, inputs: Tensor, input_lengths: Tensor): + r""" + Forward propagate for encoders training. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. 
+ input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + """ + raise NotImplementedError diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/quartznet.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/quartznet.py new file mode 100644 index 000000000..bf181306a --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/quartznet.py @@ -0,0 +1,120 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.modules import QuartzNetBlock, JasperSubBlock + + +class QuartzNet(nn.Module): + r""" + QuartzNet is fully convolutional automatic speech recognition model. The model is composed of multiple + blocks with residual connections between them. Each block consists of one or more modules with + 1D time-channel separable convolutional layers, batch normalization, and ReLU layers. + It is trained with CTC loss. + + Args: + configs (DictConfig): hydra configuration set. + input_dim (int): dimension of input. + num_classes (int): number of classification. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor): + + * outputs (torch.FloatTensor): Log probability of model predictions. ``(batch, seq_length, num_classes)`` + * output_lengths (torch.LongTensor): The length of output tensor ``(batch)`` + + Reference: + Samuel Kriman et al.: QUARTZNET: DEEP AUTOMATIC SPEECH RECOGNITION WITH 1D TIME-CHANNEL SEPARABLE CONVOLUTIONS. 
+ https://arxiv.org/abs/1910.10261.pdf + """ + def __init__(self, configs, input_dim: int, num_classes: int) -> None: + super(QuartzNet, self).__init__() + self.configs = configs + + in_channels = eval(self.configs.model.in_channels) + out_channels = eval(self.configs.model.out_channels) + kernel_size = eval(self.configs.model.kernel_size) + dilation = eval(self.configs.model.dilation) + dropout_p = eval(self.configs.model.dropout_p) + + self.preprocess_layer = JasperSubBlock( + in_channels=input_dim, + out_channels=out_channels[0], + kernel_size=kernel_size[0], + dilation=dilation[0], + dropout_p=dropout_p[0], + activation='relu', + bias=False, + ) + self.layers = nn.ModuleList([ + QuartzNetBlock( + num_sub_blocks=self.configs.model.num_sub_blocks, + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=kernel_size[i], + bias=False, + ) for i in range(1, self.configs.model.num_blocks + 1) + ]) + self.postprocess_layers = nn.ModuleList([ + JasperSubBlock( + in_channels=in_channels[i], + out_channels=num_classes if out_channels[i] is None else out_channels[i], + kernel_size=kernel_size[i], + dilation=dilation[i], + dropout_p=dropout_p[i], + activation='relu', + bias=True if i == 2 else False, + ) for i in range(self.configs.model.num_blocks + 1, self.configs.model.num_blocks + 4) + ]) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for encoder_only training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor): + + * outputs (torch.FloatTensor): Log probability of model predictions. ``(batch, seq_length, num_classes)`` + * output_lengths (torch.LongTensor): The length of output tensor ``(batch)`` + """ + inputs = inputs.transpose(1, 2) + + outputs, output_lengths = self.preprocess_layer(inputs, input_lengths) + + for layer in self.layers: + outputs, output_lengths = layer(outputs, output_lengths) + + for layer in self.postprocess_layers: + outputs, output_lengths = layer(outputs, output_lengths) + + return outputs.transpose(1, 2), output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/rnn_transducer_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/rnn_transducer_encoder.py new file mode 100644 index 000000000..6a5b13802 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/rnn_transducer_encoder.py @@ -0,0 +1,112 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.encoders import OpenspeechEncoder +from openspeech.modules import Linear + + +class RNNTransducerEncoder(OpenspeechEncoder): + r""" + Encoder of RNN-Transducer. + + Args: + input_dim (int): dimension of input vector + hidden_state_dim (int, optional): hidden state dimension of encoders (default: 320) + output_dim (int, optional): output dimension of encoders and decoders (default: 512) + num_layers (int, optional): number of encoders layers (default: 4) + rnn_type (str, optional): type of rnn cell (default: lstm) + bidirectional (bool, optional): if True, becomes a bidirectional encoders (default: True) + + Inputs: inputs, input_lengths + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor) + + * outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + * hidden_states (torch.FloatTensor): A hidden state of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + + Reference: + A Graves: Sequence Transduction with Recurrent Neural Networks + https://arxiv.org/abs/1211.3711.pdf + """ + supported_rnns = { + 'lstm': nn.LSTM, + 'gru': nn.GRU, + 'rnn': nn.RNN, + } + + def __init__( + self, + input_dim: int, + hidden_state_dim: int, + output_dim: int, + num_layers: int, + rnn_type: str = 'lstm', + dropout_p: float = 0.2, + bidirectional: bool = True, + ): + super(RNNTransducerEncoder, self).__init__() + self.hidden_state_dim = hidden_state_dim + rnn_cell = self.supported_rnns[rnn_type.lower()] + self.rnn = rnn_cell( + input_size=input_dim, + hidden_size=hidden_state_dim, + num_layers=num_layers, + bias=True, + batch_first=True, + dropout=dropout_p, + bidirectional=bidirectional, + ) + self.fc = Linear(hidden_state_dim << 1 if bidirectional else hidden_state_dim, output_dim) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for encoders training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor) + + * outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + * output_lengths (torch.LongTensor): The length of output tensor. 
``(batch)`` + """ + inputs = nn.utils.rnn.pack_padded_sequence(inputs.transpose(0, 1), input_lengths.cpu()) + outputs, hidden_states = self.rnn(inputs) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs) + outputs = self.fc(outputs.transpose(0, 1)) + return outputs, input_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_encoder.py new file mode 100644 index 000000000..eececf6ec --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_encoder.py @@ -0,0 +1,205 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple +from torch import Tensor + +from openspeech.encoders import OpenspeechEncoder +from openspeech.modules import ( + Linear, + PositionalEncoding, + get_attn_pad_mask, + Transpose, + MultiHeadAttention, + PositionwiseFeedForward, +) + + +class TransformerEncoderLayer(nn.Module): + r""" + EncoderLayer is made up of self-attention and feedforward network. + This standard encoders layer is based on the paper "Attention Is All You Need". + + Args: + d_model: dimension of model (default: 512) + num_heads: number of attention heads (default: 8) + d_ff: dimension of feed forward network (default: 2048) + dropout_p: probability of dropout (default: 0.3) + + Inputs: + inputs (torch.FloatTensor): input sequence of transformer encoder layer + self_attn_mask (torch.BoolTensor): mask of self attention + + Returns: + (Tensor, Tensor) + + * outputs (torch.FloatTensor): output of transformer encoder layer + * attn (torch.FloatTensor): attention of transformer encoder layer + """ + + def __init__( + self, + d_model: int = 512, + num_heads: int = 8, + d_ff: int = 2048, + dropout_p: float = 0.3, + ) -> None: + super(TransformerEncoderLayer, self).__init__() + self.attention_prenorm = nn.LayerNorm(d_model) + self.feed_forward_prenorm = nn.LayerNorm(d_model) + self.self_attention = MultiHeadAttention(d_model, num_heads) + self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout_p) + + def forward(self, inputs: Tensor, self_attn_mask: Tensor = None) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate of transformer encoder layer. 
+ + Inputs: + inputs (torch.FloatTensor): input sequence of transformer encoder layer + self_attn_mask (torch.BoolTensor): mask of self attention + + Returns: + outputs (torch.FloatTensor): output of transformer encoder layer + attn (torch.FloatTensor): attention of transformer encoder layer + """ + residual = inputs + inputs = self.attention_prenorm(inputs) + outputs, attn = self.self_attention(inputs, inputs, inputs, self_attn_mask) + outputs += residual + + residual = outputs + outputs = self.feed_forward_prenorm(outputs) + outputs = self.feed_forward(outputs) + outputs += residual + + return outputs, attn + + +class TransformerEncoder(OpenspeechEncoder): + r""" + The TransformerEncoder is composed of a stack of N identical layers. + Each layer has two sub-layers. The first is a multi-head self-attention mechanism, + and the second is a simple, position-wise fully connected feed-forward network. + + Args: + input_dim: dimension of feature vector + d_model: dimension of model (default: 512) + d_ff: dimension of feed forward network (default: 2048) + num_layers: number of encoders layers (default: 6) + num_heads: number of attention heads (default: 8) + dropout_p: probability of dropout (default: 0.3) + joint_ctc_attention (bool, optional): flag indication joint ctc attention or not + + Inputs: + - **inputs**: list of sequences, whose length is the batch size and within which each sequence is list of tokens + - **input_lengths**: list of sequence lengths + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. ``(batch, seq_length, num_classes)`` + * output_lengths: The length of encoders outputs. ``(batch)`` + + Reference: + Ashish Vaswani et al.: Attention Is All You Need + https://arxiv.org/abs/1706.03762 + """ + + def __init__( + self, + num_classes: int, + input_dim: int = 80, + d_model: int = 512, + d_ff: int = 2048, + num_layers: int = 6, + num_heads: int = 8, + dropout_p: float = 0.3, + joint_ctc_attention: bool = False, + ) -> None: + super(TransformerEncoder, self).__init__() + + self.num_classes = num_classes + self.joint_ctc_attention = joint_ctc_attention + + self.d_model = d_model + self.num_layers = num_layers + self.num_heads = num_heads + self.input_proj = Linear(input_dim, d_model) + self.input_norm = nn.LayerNorm(d_model) + self.input_dropout = nn.Dropout(p=dropout_p) + self.positional_encoding = PositionalEncoding(d_model) + self.layers = nn.ModuleList([ + TransformerEncoderLayer( + d_model=d_model, + num_heads=num_heads, + d_ff=d_ff, + dropout_p=dropout_p, + ) for _ in range(num_layers) + ]) + + if self.joint_ctc_attention: + self.fc = nn.Sequential( + Transpose(shape=(1, 2)), + nn.Dropout(dropout_p), + Linear(d_model, num_classes, bias=False), + ) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for encoders training. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + (Tensor, Tensor, Tensor): + + * outputs: A output sequence of encoders. 
`FloatTensor` of size ``(batch, seq_length, dimension)`` + * encoder_logits: Log probability of encoders outputs will be passed to CTC Loss. + If joint_ctc_attention is False, return None. ``(batch, seq_length, num_classes)`` + * output_lengths: The length of encoders outputs. ``(batch)`` + """ + encoder_logits = None + + self_attn_mask = get_attn_pad_mask(inputs, input_lengths, inputs.size(1)) + + outputs = self.input_norm(self.input_proj(inputs)) + outputs += self.positional_encoding(outputs.size(1)) + outputs = self.input_dropout(outputs) + + for layer in self.layers: + outputs, attn = layer(outputs, self_attn_mask) + + if self.joint_ctc_attention: + encoder_logits = self.fc(outputs.transpose(1, 2)).log_softmax(dim=-1) + + return outputs, encoder_logits, input_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_transducer_encoder.py b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_transducer_encoder.py new file mode 100644 index 000000000..b02427282 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/encoders/transformer_transducer_encoder.py @@ -0,0 +1,177 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
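The following is a minimal usage sketch for the TransformerEncoder added above; it is illustrative only, the batch shape and constructor values are assumed rather than taken from the patch, and it presumes the openspeech package from this patch series is importable.

import torch
from openspeech.encoders.transformer_encoder import TransformerEncoder

# 80-dim filterbank features for 4 utterances of up to 100 frames, with joint CTC-attention enabled
encoder = TransformerEncoder(num_classes=10, input_dim=80, d_model=512, num_layers=6, joint_ctc_attention=True)
feats = torch.randn(4, 100, 80)                 # (batch, seq_length, input_dim)
feat_lengths = torch.tensor([100, 95, 90, 80])  # (batch)
outputs, encoder_logits, output_lengths = encoder(feats, feat_lengths)
# per the docstring: outputs (4, 100, 512), encoder_logits (4, 100, 10) log-probabilities for CTC,
# and output_lengths equal to feat_lengths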
+ +import torch +import torch.nn as nn +from typing import Tuple, Optional +from torch import Tensor + +from openspeech.encoders import OpenspeechEncoder +from openspeech.modules import ( + PositionalEncoding, + get_attn_pad_mask, + MultiHeadAttention, + PositionwiseFeedForward, +) + + +class TransformerTransducerEncoderLayer(nn.Module): + r""" + Repeated layers common to audio encoders and label encoders + + Args: + model_dim (int): the number of features in the encoder (default : 512) + d_ff (int): the number of features in the feed forward layers (default : 2048) + num_heads (int): the number of heads in the multi-head attention (default: 8) + dropout (float): dropout probability of encoder layer (default: 0.1) + + Inputs: inputs, self_attn_mask + - **inputs**: Audio feature or label feature + - **self_attn_mask**: Self attention mask to use in multi-head attention + + Returns: outputs, attn_distribution + (Tensor, Tensor) + + * outputs (torch.FloatTensor): Tensor containing higher (audio, label) feature values + * attn_distribution (torch.FloatTensor): Attention distribution in multi-head attention + """ + def __init__( + self, + model_dim: int = 512, + d_ff: int = 2048, + num_heads: int = 8, + dropout: float = 0.1, + ) -> None: + super(TransformerTransducerEncoderLayer, self).__init__() + self.layer_norm = nn.LayerNorm(model_dim) + self.self_attention = MultiHeadAttention(model_dim, num_heads) + self.encoder_dropout = nn.Dropout(p=dropout) + self.feed_forward = PositionwiseFeedForward(model_dim, d_ff, dropout) + + def forward( + self, + inputs: Tensor, + self_attn_mask: Optional[Tensor] = None + ) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate a `inputs` for label encoder. + + Args: + inputs : A input sequence passed to encoder layer. ``(batch, seq_length, dimension)`` + self_attn_mask : Self attention mask to cover up padding ``(batch, seq_length, seq_length)`` + + Returns: + **outputs** (Tensor): ``(batch, seq_length, dimension)`` + **attn_distribution** (Tensor): ``(batch, seq_length, seq_length)`` + """ + inputs = self.layer_norm(inputs) + self_attn_output, attn_distribution = self.self_attention(inputs, inputs, inputs, self_attn_mask) + self_attn_output += inputs + + self_attn_output = self.layer_norm(self_attn_output) + ff_output = self.feed_forward(self_attn_output) + output = self.encoder_dropout(ff_output + self_attn_output) + + return output, attn_distribution + + +class TransformerTransducerEncoder(OpenspeechEncoder): + r""" + Converts the audio signal to higher feature values + + Args: + input_size (int): dimension of input vector (default : 80) + model_dim (int): the number of features in the audio encoder (default : 512) + d_ff (int): the number of features in the feed forward layers (default : 2048) + num_layers (int): the number of audio encoder layers (default: 18) + num_heads (int): the number of heads in the multi-head attention (default: 8) + dropout (float): dropout probability of audio encoder (default: 0.1) + max_positional_length (int): Maximum length to use for positional encoding (default : 5000) + + Inputs: inputs, inputs_lens + - **inputs**: Parsed audio of batch size number + - **inputs_lens**: Tensor of sequence lengths + + Returns: + * outputs (torch.FloatTensor): ``(batch, seq_length, dimension)`` + * input_lengths (torch.LongTensor): ``(batch)`` + + Reference: + Qian Zhang et al.: Transformer Transducer: A Streamable Speech Recognition Model with Transformer Encoders and RNN-T Loss + https://arxiv.org/abs/2002.02562 + """ + def __init__( + self, 
+ input_size: int = 80, + model_dim: int = 512, + d_ff: int = 2048, + num_layers: int = 18, + num_heads: int = 8, + dropout: float = 0.1, + max_positional_length: int = 5000, + ) -> None: + super(TransformerTransducerEncoder, self).__init__() + self.input_size = input_size + self.model_dim = model_dim + self.num_layers = num_layers + self.num_heads = num_heads + self.input_dropout = nn.Dropout(p=dropout) + self.layer_norm = nn.LayerNorm(model_dim) + self.positional_encoding = PositionalEncoding(model_dim, max_positional_length) + self.input_fc = nn.Linear(input_size, model_dim) + self.encoder_layers = nn.ModuleList([ + TransformerTransducerEncoderLayer( + model_dim, + d_ff, + num_heads, + dropout + ) for _ in range(num_layers) + ]) + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Forward propagate a `inputs` for audio encoder. + + Args: + inputs (torch.FloatTensor): A input sequence passed to audio encoder. Typically inputs will be a padded + `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + **outputs** (Tensor): ``(batch, seq_length, dimension)`` + ** input_lengths**(Tensor): ``(batch)`` + """ + seq_len = inputs.size(1) + + self_attn_mask = get_attn_pad_mask(inputs, input_lengths, seq_len) + + inputs = self.input_fc(inputs) + self.positional_encoding(seq_len) + outputs = self.input_dropout(inputs) + + for encoder_layer in self.encoder_layers: + outputs, _ = encoder_layer(outputs, self_attn_mask) + + return outputs, input_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/lm/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/lm/__init__.py new file mode 100644 index 000000000..8f3de5ab0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/lm/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
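A similar sketch for the TransformerTransducerEncoder defined above; the tensor shapes and the reduced num_layers are assumptions made only to keep the example small.

import torch
from openspeech.encoders.transformer_transducer_encoder import TransformerTransducerEncoder

encoder = TransformerTransducerEncoder(input_size=80, model_dim=512, num_layers=2, num_heads=8)
feats = torch.randn(2, 50, 80)         # (batch, seq_length, input_size)
feat_lengths = torch.tensor([50, 35])  # (batch)
outputs, output_lengths = encoder(feats, feat_lengths)
# outputs: (2, 50, 512); input lengths are passed through unchanged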
+ +from .lstm_lm import LSTMForLanguageModel +from .transformer_lm import TransformerForLanguageModel diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/lm/lstm_lm.py b/audio/speech_recognition/conformer/pytorch/openspeech/lm/lstm_lm.py new file mode 100644 index 000000000..1eab3508c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/lm/lstm_lm.py @@ -0,0 +1,158 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +import random + +from openspeech.lm.openspeech_lm import OpenspeechLanguageModelBase +from openspeech.modules import Linear, View +from typing import Optional, Tuple + + +class LSTMForLanguageModel(OpenspeechLanguageModelBase): + """ + Language Modelling is the core problem for a number of of natural language processing tasks such as speech to text, + conversational system, and text summarization. A trained language model learns the likelihood of occurrence + of a word based on the previous sequence of words used in the text. + + Args: + num_classes (int): number of classification + max_length (int): max decoding length (default: 128) + hidden_state_dim (int): dimension of hidden state vector (default: 768) + rnn_type (str, optional): type of RNN cell (default: lstm) + pad_id (int, optional): index of the pad symbol (default: 0) + sos_id (int, optional): index of the start of sentence symbol (default: 1) + eos_id (int, optional): index of the end of sentence symbol (default: 2) + num_layers (int, optional): number of recurrent layers (default: 2) + dropout_p (float, optional): dropout probability of decoders (default: 0.2) + + Inputs: inputs, teacher_forcing_ratio + inputs (torch.LongTensr): A input sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + teacher_forcing_ratio (float): ratio of teacher forcing + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. 
+ """ + supported_rnns = { + 'lstm': nn.LSTM, + 'gru': nn.GRU, + 'rnn': nn.RNN, + } + + def __init__( + self, + num_classes: int, + max_length: int = 128, + hidden_state_dim: int = 768, + pad_id: int = 0, + sos_id: int = 1, + eos_id: int = 2, + num_layers: int = 2, + rnn_type: str = 'lstm', + dropout_p: float = 0.3, + ) -> None: + super(LSTMForLanguageModel, self).__init__() + self.hidden_state_dim = hidden_state_dim + self.num_classes = num_classes + self.num_layers = num_layers + self.max_length = max_length + self.eos_id = eos_id + self.sos_id = sos_id + self.pad_id = pad_id + self.embedding = nn.Embedding(num_classes, hidden_state_dim) + self.input_dropout = nn.Dropout(dropout_p) + rnn_cell = self.supported_rnns[rnn_type.lower()] + self.rnn = rnn_cell( + input_size=hidden_state_dim, + hidden_size=hidden_state_dim, + num_layers=num_layers, + bias=True, + batch_first=True, + dropout=dropout_p, + bidirectional=False, + ) + + self.fc = nn.Sequential( + Linear(hidden_state_dim, hidden_state_dim), + nn.Tanh(), + View(shape=(-1, self.hidden_state_dim), contiguous=True), + Linear(hidden_state_dim, num_classes), + ) + + def forward_step( + self, + input_var: torch.Tensor, + hidden_states: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size, output_lengths = input_var.size(0), input_var.size(1) + + embedded = self.embedding(input_var) + embedded = self.input_dropout(embedded) + + if self.training: + self.rnn.flatten_parameters() + + outputs, hidden_states = self.rnn(embedded, hidden_states) + + step_outputs = self.fc(outputs.reshape(-1, self.hidden_state_dim)).log_softmax(dim=-1) + step_outputs = step_outputs.view(batch_size, output_lengths, -1).squeeze(1) + + return step_outputs, hidden_states + + def forward( + self, + inputs: torch.Tensor, + teacher_forcing_ratio: float = 1.0, + ) -> torch.Tensor: + """ + Forward propagate a `encoder_outputs` for training. + + Args: + inputs (torch.LongTensr): A input sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + teacher_forcing_ratio (float): ratio of teacher forcing + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. 
+ """ + batch_size = inputs.size(0) + logits, hidden_states = list(), None + use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False + + if use_teacher_forcing: + inputs = inputs[inputs != self.eos_id].view(batch_size, -1) + step_outputs, hidden_states = self.forward_step(input_var=inputs, hidden_states=hidden_states) + + for di in range(step_outputs.size(1)): + step_output = step_outputs[:, di, :] + logits.append(step_output) + + else: + input_var = inputs[:, 0].unsqueeze(1) + for di in range(self.max_length): + step_output, hidden = self.forward_step(input_var=input_var, hidden_states=hidden_states) + + step_output = step_output.squeeze(1) + logits.append(step_output) + input_var = logits[-1].topk(1)[1] + + return torch.stack(logits, dim=1) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/lm/openspeech_lm.py b/audio/speech_recognition/conformer/pytorch/openspeech/lm/openspeech_lm.py new file mode 100644 index 000000000..61755396e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/lm/openspeech_lm.py @@ -0,0 +1,45 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn + + +class OpenspeechLanguageModelBase(nn.Module): + r""" Interface of OpenSpeech decoder. 
""" + def __init__(self): + super(OpenspeechLanguageModelBase, self).__init__() + + def count_parameters(self) -> int: + r""" Count parameters of decoders """ + return sum([p.numel for p in self.parameters()]) + + def update_dropout(self, dropout_p: float) -> None: + r""" Update dropout probability of decoders """ + for name, child in self.named_children(): + if isinstance(child, nn.Dropout): + child.p = dropout_p + + def forward_step(self, *args, **kwargs): + raise NotImplementedError + + def forward(self, *args, **kwargs): + raise NotImplementedError diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/lm/transformer_lm.py b/audio/speech_recognition/conformer/pytorch/openspeech/lm/transformer_lm.py new file mode 100644 index 000000000..205982311 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/lm/transformer_lm.py @@ -0,0 +1,171 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import torch +import torch.nn as nn +from typing import Optional, Tuple + +from openspeech.lm.openspeech_lm import OpenspeechLanguageModelBase +from openspeech.modules import ( + TransformerEmbedding, + PositionalEncoding, + Linear, + PositionwiseFeedForward, + MultiHeadAttention, + get_attn_pad_mask, + get_attn_subsequent_mask, +) + + +class TransformerForLanguageModelLayer(nn.Module): + def __init__( + self, + d_model: int = 768, + num_attention_heads: int = 8, + d_ff: int = 2048, + dropout_p: float = 0.3, + ) -> None: + super(TransformerForLanguageModelLayer, self).__init__() + self.attention_prenorm = nn.LayerNorm(d_model) + self.attention = MultiHeadAttention(d_model, num_attention_heads) + self.feed_forward_prenorm = nn.LayerNorm(d_model) + self.feed_forward = PositionwiseFeedForward(d_model=d_model, d_ff=d_ff, dropout_p=dropout_p) + + def forward( + self, + inputs: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + residual = inputs + inputs = self.attention_prenorm(inputs) + outputs, _ = self.attention(inputs, inputs, inputs, mask) + outputs += residual + + residual = outputs + outputs = self.feed_forward_prenorm(outputs) + outputs = self.feed_forward(outputs) + outputs += residual + + return outputs + + +class TransformerForLanguageModel(OpenspeechLanguageModelBase): + """ + Language Modelling is the core problem for a number of of natural language processing tasks such as speech to text, + conversational system, and text summarization. A trained language model learns the likelihood of occurrence + of a word based on the previous sequence of words used in the text. + + Args: + num_classes (int): number of classification + max_length (int): max decoding length (default: 128) + d_model (int): dimension of model (default: 768) + d_ff (int): dimension of feed forward network (default: 1536) + num_attention_heads (int): number of attention heads (default: 8) + pad_id (int, optional): index of the pad symbol (default: 0) + sos_id (int, optional): index of the start of sentence symbol (default: 1) + eos_id (int, optional): index of the end of sentence symbol (default: 2) + num_layers (int, optional): number of transformer layers (default: 2) + dropout_p (float, optional): dropout probability of decoders (default: 0.2) + + Inputs:, inputs, input_lengths + inputs (torch.LongTensor): A input sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. 
+ """ + def __init__( + self, + num_classes: int, + max_length: int = 128, + d_model: int = 768, + num_attention_heads: int = 8, + d_ff: int = 1536, + pad_id: int = 0, + sos_id: int = 1, + eos_id: int = 2, + num_layers: int = 2, + dropout_p: float = 0.3, + ): + super(TransformerForLanguageModel, self).__init__() + self.d_model = d_model + self.num_classes = num_classes + self.num_layers = num_layers + self.max_length = max_length + self.eos_id = eos_id + self.sos_id = sos_id + self.pad_id = pad_id + self.embedding = TransformerEmbedding(num_classes, pad_id, d_model) + self.positional_encoding = PositionalEncoding(d_model) + self.input_dropout = nn.Dropout(p=dropout_p) + self.layers = nn.ModuleList([ + TransformerForLanguageModelLayer( + d_model=d_model, + num_attention_heads=num_attention_heads, + d_ff=d_ff, + dropout_p=dropout_p, + ) for _ in range(num_layers) + ]) + self.fc = nn.Sequential( + nn.LayerNorm(d_model), + Linear(d_model, d_model, bias=False), + nn.Tanh(), + Linear(d_model, num_classes, bias=False), + ) + + def forward_step(self, inputs, input_lengths): + pad_mask = get_attn_pad_mask( + inputs, input_lengths, inputs.size(1) + ) + subsequent_mask = get_attn_subsequent_mask(inputs) + mask = torch.gt((pad_mask + subsequent_mask), 0) + + outputs = self.embedding(inputs) + self.positional_encoding(inputs.size(1)) + outputs = self.input_dropout(outputs) + + for layer in self.layers: + outputs = layer(inputs=outputs, mask=mask) + + step_outputs = self.fc(outputs).log_softmax(dim=-1) + + return step_outputs + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> torch.Tensor: + """ + Forward propagate a `encoder_outputs` for training. + + Args: + inputs (torch.LongTensor): A input sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. + """ + logits = list() + + step_outputs = self.forward_step(inputs, input_lengths) + + for di in range(step_outputs.size(1)): + step_output = step_outputs[:, di, :] + logits.append(step_output) + + return torch.stack(logits, dim=1) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/metrics.py b/audio/speech_recognition/conformer/pytorch/openspeech/metrics.py new file mode 100644 index 000000000..56543b678 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/metrics.py @@ -0,0 +1,156 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import Levenshtein as Lev +from typing import Tuple + + +class ErrorRate(object): + r""" + Provides inteface of error rate calcuation. + + Note: + Do not use this class directly, use one of the sub classes. + """ + + def __init__(self, tokenizer) -> None: + self.total_dist = 0.0 + self.total_length = 0.0 + self.tokenizer = tokenizer + + def __call__(self, targets, y_hats): + r""" + Calculating error rate. + + Args: + targets (torch.Tensor): set of ground truth + y_hats (torch.Tensor): predicted y values (y_hat) by the model + + Returns: + - **cer**: character error rate + """ + dist, length = self._get_distance(targets, y_hats) + self.total_dist += dist + self.total_length += length + return self.total_dist / self.total_length + + def _get_distance(self, targets: torch.Tensor, y_hats: torch.Tensor) -> Tuple[float, int]: + r""" + Provides total character distance between targets & y_hats + + Args: targets, y_hats + targets (torch.Tensor): set of ground truth + y_hats (torch.Tensor): predicted y values (y_hat) by the model + + Returns: total_dist, total_length + - **total_dist**: total distance between targets & y_hats + - **total_length**: total length of targets sequence + """ + total_dist = 0 + total_length = 0 + + for (target, y_hat) in zip(targets, y_hats): + s1 = self.tokenizer.decode(target) + s2 = self.tokenizer.decode(y_hat) + + dist, length = self.metric(s1, s2) + + total_dist += dist + total_length += length + + return total_dist, total_length + + def metric(self, *args, **kwargs) -> Tuple[float, int]: + raise NotImplementedError + + +class CharacterErrorRate(ErrorRate): + r""" + Computes the Character Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to characters. + """ + def __init__(self, tokenizer): + super(CharacterErrorRate, self).__init__(tokenizer) + + def metric(self, s1: str, s2: str) -> Tuple[float, int]: + r""" + Computes the Character Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to characters. + + Args: s1, s2 + s1 (string): space-separated sentence + s2 (string): space-separated sentence + + Returns: dist, length + - **dist**: distance between target & y_hat + - **length**: length of target sequence + """ + s1 = s1.replace(' ', '') + s2 = s2.replace(' ', '') + + # if '_' in sentence, means subword-unit, delete '_' + if '_' in s1: + s1 = s1.replace('_', '') + + if '_' in s2: + s2 = s2.replace('_', '') + + dist = Lev.distance(s2, s1) + length = len(s1.replace(' ', '')) + + return dist, length + + +class WordErrorRate(ErrorRate): + r""" + Computes the Word Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to words. + """ + def __init__(self, tokenizer): + super(WordErrorRate, self).__init__(tokenizer) + + def metric(self, s1: str, s2: str) -> Tuple[float, int]: + r""" + Computes the Word Error Rate, defined as the edit distance between the + two provided sentences after tokenizing to words. 
+ + Args: s1, s2 + s1 (string): space-separated sentence + s2 (string): space-separated sentence + + Returns: dist, length + - **dist**: distance between target & y_hat + - **length**: length of target sequence + """ + b = set(s1.split() + s2.split()) + word2char = dict(zip(b, range(len(b)))) + + # map the words to a char array (Levenshtein packages only accepts + # strings) + w1 = [chr(word2char[w]) for w in s1.split()] + w2 = [chr(word2char[w]) for w in s2.split()] + + dist = Lev.distance(''.join(w1), ''.join(w2)) + length = len(s1.split()) + + return dist, length diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/README.md b/audio/speech_recognition/conformer/pytorch/openspeech/models/README.md new file mode 100644 index 000000000..c71ea9977 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/README.md @@ -0,0 +1,27 @@ +## Model architectures + +We support all the models below. Note that, the important concepts of the model have been implemented to match, but the details of the implementation may vary. + +1. [**DeepSpeech2**](https://sooftware.github.io/openspeech/architectures/DeepSpeech2.html) (from Baidu Research) released with paper [Deep Speech 2: End-to-End Speech Recognition in +English and Mandarin](https://arxiv.org/abs/1512.02595.pdf), by Dario Amodei, Rishita Anubhai, Eric Battenberg, Carl Case, Jared Casper, Bryan Catanzaro, Jingdong Chen, Mike Chrzanowski, Adam Coates, Greg Diamos, Erich Elsen, Jesse Engel, Linxi Fan, Christopher Fougner, Tony Han, Awni Hannun, Billy Jun, Patrick LeGresley, Libby Lin, Sharan Narang, Andrew Ng, Sherjil Ozair, Ryan Prenger, Jonathan Raiman, Sanjeev Satheesh, David Seetapun, Shubho Sengupta, Yi Wang, Zhiqian Wang, Chong Wang, Bo Xiao, Dani Yogatama, Jun Zhan, Zhenyao Zhu. +2. [**RNN-Transducer**](https://sooftware.github.io/openspeech/architectures/RNN%20Transducer.html) (from University of Toronto) released with paper [Sequence Transduction with Recurrent Neural Networks](https://arxiv.org/abs/1211.3711.pdf), by Alex Graves. +3. [**LSTM Language Model**](https://sooftware.github.io/openspeech/architectures/LSTM%20LM.html) (from RWTH Aachen University) released with paper [LSTM Neural Networks for Language Modeling](http://www-i6.informatik.rwth-aachen.de/publications/download/820/Sundermeyer-2012.pdf), by Martin Sundermeyer, Ralf Schluter, and Hermann Ney. +3. [**Listen Attend Spell**](https://sooftware.github.io/openspeech/architectures/Listen%20Attend%20Spell.html) (from Carnegie Mellon University and Google Brain) released with paper [Listen, Attend and Spell](https://arxiv.org/abs/1508.01211), by William Chan, Navdeep Jaitly, Quoc V. Le, Oriol Vinyals. +4. [**Location-aware attention based Listen Attend Spell**](https://sooftware.github.io/openspeech/architectures/Listen%20Attend%20Spell.html) (from University of Wrocław and Jacobs University and Universite de Montreal) released with paper [Attention-Based Models for Speech Recognition](https://arxiv.org/abs/1506.07503), by Jan Chorowski, Dzmitry Bahdanau, Dmitriy Serdyuk, Kyunghyun Cho, Yoshua Bengio. +5. [**Joint CTC-Attention based Listen Attend Spell**](https://sooftware.github.io/openspeech/architectures/Listen%20Attend%20Spell.html) (from Mitsubishi Electric Research Laboratories and Carnegie Mellon University) released with paper [Joint CTC-Attention based End-to-End Speech Recognition using Multi-task Learning](https://arxiv.org/abs/1609.06773), by Suyoun Kim, Takaaki Hori, Shinji Watanabe. +6. 
[**Deep CNN Encoder with Joint CTC-Attention Listen Attend Spell**](https://sooftware.github.io/openspeech/architectures/Listen%20Attend%20Spell.html) (from Mitsubishi Electric Research Laboratories and Massachusetts Institute of Technology and Carnegie Mellon University) released with paper [Advances in Joint CTC-Attention based End-to-End Speech Recognition with a Deep CNN Encoder and RNN-LM](https://arxiv.org/abs/1706.02737), by Takaaki Hori, Shinji Watanabe, Yu Zhang, William Chan. +7. [**Multi-head attention based Listen Attend Spell**](https://sooftware.github.io/openspeech/architectures/Listen%20Attend%20Spell.html) (from Google) released with paper [State-of-the-art Speech Recognition With Sequence-to-Sequence Models](https://arxiv.org/abs/1712.01769), by Chung-Cheng Chiu, Tara N. Sainath, Yonghui Wu, Rohit Prabhavalkar, Patrick Nguyen, Zhifeng Chen, Anjuli Kannan, Ron J. Weiss, Kanishka Rao, Ekaterina Gonina, Navdeep Jaitly, Bo Li, Jan Chorowski, Michiel Bacchiani. +8. [**Speech-Transformer**](https://sooftware.github.io/openspeech/architectures/Transformer.html) (from University of Chinese Academy of Sciences and Institute of Automation and Chinese Academy of Sciences) released with paper [Speech-Transformer: A No-Recurrence Sequence-to-Sequence Model for Speech Recognition](https://ieeexplore.ieee.org/document/8462506), by Linhao Dong; Shuang Xu; Bo Xu. +9. [**VGG-Transformer**](https://sooftware.github.io/openspeech/architectures/Transformer.html) (from Facebook AI Research) released with paper [Transformers with convolutional context for ASR](https://arxiv.org/abs/1904.11660), by Abdelrahman Mohamed, Dmytro Okhonko, Luke Zettlemoyer. +10. [**Transformer with CTC**](https://sooftware.github.io/openspeech/architectures/Transformer.html) (from NTT Communication Science Laboratories, Waseda University, Center for Language and Speech Processing, Johns Hopkins University) released with paper [Improving Transformer-based End-to-End Speech Recognition with Connectionist Temporal Classification and Language Model Integration](https://www.isca-speech.org/archive/Interspeech_2019/pdfs/1938.pdf), by Shigeki Karita, Nelson Enrique Yalta Soplin, Shinji Watanabe, Marc Delcroix, Atsunori Ogawa, Tomohiro Nakatani. +11. [**Joint CTC-Attention based Transformer**](https://sooftware.github.io/openspeech/architectures/Transformer.html)(from NTT Corporation) released with paper [Self-Distillation for Improving CTC-Transformer-based ASR Systems](https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1223.pdf), by Takafumi Moriya, Tsubasa Ochiai, Shigeki Karita, Hiroshi Sato, Tomohiro Tanaka, Takanori Ashihara, Ryo Masumura, Yusuke Shinohara, Marc Delcroix. +12. [**Transformer Language Model**](https://sooftware.github.io/openspeech/architectures/Transformer%20LM.html) (from Amazon Web Services) released with paper [Language Models with Transformers](https://arxiv.org/abs/1904.09408), by Chenguang Wang, Mu Li, Alexander J. Smola. +12. [**Jasper**](https://sooftware.github.io/openspeech/modules/Encoders.html#module-openspeech.encoders.jasper) (from NVIDIA and New York University) released with paper [Jasper: An End-to-End Convolutional Neural Acoustic Model](https://arxiv.org/pdf/1904.03288.pdf), by Jason Li, Vitaly Lavrukhin, Boris Ginsburg, Ryan Leary, Oleksii Kuchaiev, Jonathan M. Cohen, Huyen Nguyen, Ravi Teja Gadde. +13. [**QuartzNet**](https://sooftware.github.io/openspeech/modules/Encoders.html#module-openspeech.encoders.quartznet) (from NVIDIA and Univ. of Illinois and Univ. 
of Saint Petersburg) released with paper [QuartzNet: Deep Automatic Speech Recognition with 1D Time-Channel Separable Convolutions](https://arxiv.org/abs/1910.10261.pdf), by Samuel Kriman, Stanislav Beliaev, Boris Ginsburg, Jocelyn Huang, Oleksii Kuchaiev, Vitaly Lavrukhin, Ryan Leary, Jason Li, Yang Zhang. +14. [**Transformer Transducer**](https://sooftware.github.io/openspeech/architectures/Transformer%20Transducer.html) (from Facebook AI) released with paper [Transformer-Transducer: +End-to-End Speech Recognition with Self-Attention](https://arxiv.org/abs/1910.12977.pdf), by Ching-Feng Yeh, Jay Mahadeokar, Kaustubh Kalgaonkar, Yongqiang Wang, Duc Le, Mahaveer Jain, Kjell Schubert, Christian Fuegen, Michael L. Seltzer. +15. [**Conformer**](https://sooftware.github.io/openspeech/architectures/Conformer.html) (from Google) released with paper [Conformer: Convolution-augmented Transformer for Speech Recognition](https://arxiv.org/abs/2005.08100), by Anmol Gulati, James Qin, Chung-Cheng Chiu, Niki Parmar, Yu Zhang, Jiahui Yu, Wei Han, Shibo Wang, Zhengdong Zhang, Yonghui Wu, Ruoming Pang. +16. [**Conformer with CTC**](https://sooftware.github.io/openspeech/architectures/Conformer.html) (from Northwestern Polytechnical University and University of Bordeaux and Johns Hopkins University and Human Dataware Lab and Kyoto University and NTT Corporation and Shanghai Jiao Tong University and Chinese Academy of Sciences) released with paper [Recent Developments on ESPNET Toolkit Boosted by Conformer](https://arxiv.org/abs/2010.13956.pdf), by Pengcheng Guo, Florian Boyer, Xuankai Chang, Tomoki Hayashi, Yosuke Higuchi, Hirofumi Inaguma, Naoyuki Kamo, Chenda Li, Daniel Garcia-Romero, Jiatong Shi, Jing Shi, Shinji Watanabe, Kun Wei, Wangyou Zhang, Yuekai Zhang. +17. [**Conformer with LSTM Decoder**](https://sooftware.github.io/openspeech/architectures/Conformer.html) (from IBM Research AI) released with paper [On the limit of English conversational speech recognition](https://arxiv.org/abs/2105.00982.pdf), by Zoltán Tüske, George Saon, Brian Kingsbury. +18. [**ContextNet**](https://sooftware.github.io/openspeech/architectures/ContextNet.html) (from Google) released with paper [ContextNet: Improving Convolutional Neural Networks for Automatic Speech Recognition with Global Context](https://arxiv.org/abs/2005.03191), by Wei Han, Zhengdong Zhang, Yu Zhang, Jiahui Yu, Chung-Cheng Chiu, James Qin, Anmol Gulati, Ruoming Pang, Yonghui Wu. + diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/__init__.py new file mode 100644 index 000000000..65f665c71 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/__init__.py @@ -0,0 +1,208 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import importlib +import os + +from .openspeech_model import OpenspeechModel +from .openspeech_encoder_decoder_model import OpenspeechEncoderDecoderModel +from .openspeech_transducer_model import OpenspeechTransducerModel +from .openspeech_ctc_model import OpenspeechCTCModel + +MODEL_REGISTRY = dict() +MODEL_DATACLASS_REGISTRY = dict() + + +def register_model(name: str, dataclass=None): + r""" + New model types can be added to OpenSpeech with the :func:`register_model` function decorator. + + For example:: + @register_model('conformer_lstm') + class ConformerLSTMModel(OpenspeechModel): + (...) + + .. note:: All models must implement the :class:`cls.__name__` interface. + + Args: + name (str): the name of the model + """ + + def register_model_cls(cls): + if name in MODEL_REGISTRY: + raise ValueError(f"Cannot register duplicate model ({name})") + if not issubclass(cls, OpenspeechModel): + raise ValueError(f"Model ({name}: {cls.__name__}) must extend OpenspeechModel") + + MODEL_REGISTRY[name] = cls + + cls.__dataclass = dataclass + if dataclass is not None: + if name in MODEL_DATACLASS_REGISTRY: + raise ValueError(f"Cannot register duplicate model ({name})") + MODEL_DATACLASS_REGISTRY[name] = dataclass + + return cls + + return register_model_cls + + +# automatically import any Python files in the models/ directory +models_dir = os.path.dirname(__file__) +for file in os.listdir(models_dir): + if os.path.isdir(os.path.join(models_dir, file)) and not file.startswith('__'): + for subfile in os.listdir(os.path.join(models_dir, file)): + path = os.path.join(models_dir, file, subfile) + if subfile.endswith(".py"): + python_file = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile + module = importlib.import_module(f"openspeech.models.{file}.{python_file}") + continue + + path = os.path.join(models_dir, file) + if file.endswith(".py"): + model_name = file[: file.find(".py")] if file.endswith(".py") else file + module = importlib.import_module(f"openspeech.models.{model_name}") + + +from .conformer import ( + ConformerLSTMConfigs, + ConformerTransducerConfigs, + ConformerConfigs, + JointCTCConformerLSTMConfigs, + ConformerModel, + ConformerLSTMModel, + ConformerTransducerModel, + JointCTCConformerLSTMModel, +) + +from .contextnet import ( + ContextNetTransducerConfigs, + ContextNetLSTMConfigs, + ContextNetConfigs, + ContextNetModel, + ContextNetTransducerModel, + ContextNetLSTMModel, +) +from .deepspeech2 import ( + DeepSpeech2Model, + DeepSpeech2Configs, +) +from .jasper import ( + Jasper5x3Config, + Jasper10x5Config, + Jasper5x3Model, + Jasper10x5Model, +) +from .listen_attend_spell import ( + ListenAttendSpellConfigs, + ListenAttendSpellWithMultiHeadConfigs, + ListenAttendSpellWithLocationAwareConfigs, + DeepCNNWithJointCTCListenAttendSpellConfigs, + JointCTCListenAttendSpellConfigs, + ListenAttendSpellModel, + ListenAttendSpellWithMultiHeadModel, + ListenAttendSpellWithLocationAwareModel, + JointCTCListenAttendSpellModel, + DeepCNNWithJointCTCListenAttendSpellModel +) +from .quartznet import ( + 
QuartzNet5x5Configs, + QuartzNet15x5Configs, + QuartzNet10x5Configs, + QuartzNet5x5Model, + QuartzNet15x5Model, + QuartzNet10x5Model, +) + +from .rnn_transducer import ( + RNNTransducerModel, + RNNTransducerConfigs, +) +from .lstm_lm import ( + LSTMLanguageModel, + LSTMLanguageModelConfigs, +) +from .transformer_lm import ( + TransformerLanguageModelConfigs, + TransformerLanguageModel +) +from .transformer_transducer import ( + TransformerTransducerModel, + TransformerTransducerConfigs, +) + +__all__ = [ + "OpenspeechModel", + "OpenspeechEncoderDecoderModel", + "OpenspeechTransducerModel", + "OpenspeechCTCModel", + "ConformerModel", + "ConformerConfigs", + "ConformerLSTMModel", + "ConformerLSTMConfigs", + "ConformerTransducerModel", + "ConformerTransducerConfigs", + "ContextNetModel", + "ContextNetConfigs", + "ContextNetLSTMModel", + "ContextNetLSTMConfigs", + "ContextNetTransducerModel", + "ContextNetTransducerConfigs", + "DeepCNNWithJointCTCListenAttendSpellModel", + "DeepCNNWithJointCTCListenAttendSpellConfigs", + "DeepSpeech2Model", + "DeepSpeech2Configs", + "Jasper5x3Model", + "Jasper5x3Config", + "Jasper10x5Model", + "Jasper10x5Config", + "JointCTCConformerLSTMModel", + "JointCTCConformerLSTMConfigs", + "JointCTCListenAttendSpellConfigs", + "JointCTCTransformerConfigs", + "JointCTCTransformerModel", + "JointCTCListenAttendSpellModel", + "ListenAttendSpellConfigs", + "ListenAttendSpellWithMultiHeadConfigs", + "ListenAttendSpellWithLocationAwareConfigs", + "ListenAttendSpellModel", + "ListenAttendSpellWithLocationAwareModel", + "ListenAttendSpellWithMultiHeadModel", + "VGGTransformerConfigs", + "VGGTransformerModel", + "QuartzNet15x5Configs", + "QuartzNet10x5Configs", + "QuartzNet5x5Configs", + "QuartzNet15x5Model", + "QuartzNet10x5Model", + "RNNTransducerConfigs", + "RNNTransducerModel", + "TransformerModel", + "TransformerConfigs", + "TransformerWithCTCConfigs", + "TransformerTransducerConfigs", + "TransformerTransducerModel", + "TransformerWithCTCModel", + "LSTMLanguageModel", + "LSTMLanguageModelConfigs", + "TransformerLanguageModelConfigs", +] diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/__init__.py new file mode 100644 index 000000000..81778cbfd --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/__init__.py @@ -0,0 +1,34 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .configurations import ( + ConformerConfigs, + ConformerLSTMConfigs, + ConformerTransducerConfigs, + JointCTCConformerLSTMConfigs, +) +from .model import ( + ConformerTransducerModel, + ConformerLSTMModel, + ConformerModel, + JointCTCConformerLSTMModel +) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/configurations.py new file mode 100644 index 000000000..46776922b --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/configurations.py @@ -0,0 +1,368 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class ConformerConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.Conformer`. + + It is used to initiated an `Conformer` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: conformer) + encoder_dim (int): Dimension of encoder. (default: 512) + num_encoder_layers (int): The number of encoder layers. (default: 17) + num_attention_heads (int): The number of attention heads. (default: 8) + feed_forward_expansion_factor (int): The expansion factor of feed forward module. (default: 4) + conv_expansion_factor (int): The expansion factor of convolution module. (default: 2) + input_dropout_p (float): The dropout probability of inputs. (default: 0.1) + feed_forward_dropout_p (float): The dropout probability of feed forward module. (default: 0.1) + attention_dropout_p (float): The dropout probability of attention module. (default: 0.1) + conv_dropout_p (float): The dropout probability of convolution module. (default: 0.1) + conv_kernel_size (int): The kernel size of convolution. (default: eq) + half_step_residual (bool): Flag indication whether to use half step residual or not (default: True) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="conformer", metadata={"help": "Model name"} + ) + encoder_dim: int = field( + default=512, metadata={"help": "Dimension of encoder."} + ) + num_encoder_layers: int = field( + default=17, metadata={"help": "The number of encoder layers."} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "The number of attention heads."} + ) + feed_forward_expansion_factor: int = field( + default=4, metadata={"help": "The expansion factor of feed forward module."} + ) + conv_expansion_factor: int = field( + default=2, metadata={"help": "The expansion factor of convolution module."} + ) + input_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of inputs."} + ) + feed_forward_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of feed forward module."} + ) + attention_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of attention module."} + ) + conv_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of convolution module."} + ) + conv_kernel_size: int = field( + default=31, metadata={"help": "The kernel size of convolution."} + ) + half_step_residual: bool = field( + default=True, metadata={"help": "Flag indication whether to use half step residual or not"} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class ConformerLSTMConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ConformerLSTM`. + + It is used to initiated an `ConformerLSTM` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: conformer_lstm) + encoder_dim (int): Dimension of encoder. (default: 512) + num_encoder_layers (int): The number of encoder layers. (default: 17) + num_attention_heads (int): The number of attention heads. (default: 8) + feed_forward_expansion_factor (int): The expansion factor of feed forward module. (default: 4) + conv_expansion_factor (int): The expansion factor of convolution module. (default: 2) + input_dropout_p (float): The dropout probability of inputs. (default: 0.1) + feed_forward_dropout_p (float): The dropout probability of feed forward module. (default: 0.1) + attention_dropout_p (float): The dropout probability of attention module. (default: 0.1) + conv_dropout_p (float): The dropout probability of convolution module. (default: 0.1) + conv_kernel_size (int): The kernel size of convolution. (default: eq) + half_step_residual (bool): Flag indication whether to use half step residual or not (default: True) + num_decoder_layers (int): The number of decoder layers. (default: 2) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.1) + max_length (int): Max decoding length. (default: 128) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="conformer_lstm", metadata={"help": "Model name"} + ) + encoder_dim: int = field( + default=512, metadata={"help": "Dimension of encoder."} + ) + num_encoder_layers: int = field( + default=17, metadata={"help": "The number of encoder layers."} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "The number of attention heads."} + ) + feed_forward_expansion_factor: int = field( + default=4, metadata={"help": "The expansion factor of feed forward module."} + ) + conv_expansion_factor: int = field( + default=2, metadata={"help": "The expansion factor of convolution module."} + ) + input_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of inputs."} + ) + feed_forward_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of feed forward module."} + ) + attention_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of attention module."} + ) + conv_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of convolution module."} + ) + conv_kernel_size: int = field( + default=31, metadata={"help": "The kernel size of convolution."} + ) + half_step_residual: bool = field( + default=True, metadata={"help": "Flag indication whether to use half step residual or not"} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + decoder_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of decoder."} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. "} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + decoder_attn_mechanism: str = field( + default="loc", metadata={"help": "The attention mechanism for decoder."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class ConformerTransducerConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ConformerTransducer`. + + It is used to initiated an `ConformerTransducer` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: conformer_transducer) + encoder_dim (int): Dimension of encoder. (default: 512) + num_encoder_layers (int): The number of encoder layers. (default: 17) + num_attention_heads (int): The number of attention heads. (default: 8) + feed_forward_expansion_factor (int): The expansion factor of feed forward module. (default: 4) + conv_expansion_factor (int): The expansion factor of convolution module. (default: 2) + input_dropout_p (float): The dropout probability of inputs. (default: 0.1) + feed_forward_dropout_p (float): The dropout probability of feed forward module. (default: 0.1) + attention_dropout_p (float): The dropout probability of attention module. (default: 0.1) + conv_dropout_p (float): The dropout probability of convolution module. (default: 0.1) + conv_kernel_size (int): The kernel size of convolution. (default: eq) + half_step_residual (bool): Flag indication whether to use half step residual or not (default: True) + num_decoder_layers (int): The number of decoder layers. 
(default: 1) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.1) + max_length (int): Max decoding length. (default: 128) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + decoder_hidden_state_dim (int): Hidden state dimension of decoder. (default: 640) + decoder_output_dim (int): Output dimension of decoder. (default: 640) + optimizer (str): Optimizer for training. (default: adam) + """ + model_name: str = field( + default="conformer_transducer", metadata={"help": "Model name"} + ) + encoder_dim: int = field( + default=512, metadata={"help": "Dimension of encoder."} + ) + num_encoder_layers: int = field( + default=17, metadata={"help": "The number of encoder layers."} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "The number of attention heads."} + ) + feed_forward_expansion_factor: int = field( + default=4, metadata={"help": "The expansion factor of feed forward module."} + ) + conv_expansion_factor: int = field( + default=2, metadata={"help": "The expansion factor of convolution module."} + ) + input_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of inputs."} + ) + feed_forward_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of feed forward module."} + ) + attention_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of attention module."} + ) + conv_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of convolution module."} + ) + conv_kernel_size: int = field( + default=31, metadata={"help": "The kernel size of convolution."} + ) + half_step_residual: bool = field( + default=True, metadata={"help": "Flag indication whether to use half step residual or not"} + ) + num_decoder_layers: int = field( + default=1, metadata={"help": "The number of decoder layers."} + ) + decoder_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of decoder."} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": " The ratio of teacher forcing. "} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + decoder_hidden_state_dim: int = field( + default=640, metadata={"help": "Hidden state dimension of decoder."} + ) + decoder_output_dim: int = field( + default=640, metadata={"help": "Output dimension of decoder."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class JointCTCConformerLSTMConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.JointCTCConformerLSTM`. + + It is used to initiated an `JointCTCConformerLSTM` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: joint_ctc_conformer_lstm) + encoder_dim (int): Dimension of encoder. (default: 512) + num_encoder_layers (int): The number of encoder layers. (default: 17) + num_attention_heads (int): The number of attention heads. (default: 8) + feed_forward_expansion_factor (int): The expansion factor of feed forward module. (default: 4) + conv_expansion_factor (int): The expansion factor of convolution module. 
(default: 2) + input_dropout_p (float): The dropout probability of inputs. (default: 0.1) + feed_forward_dropout_p (float): The dropout probability of feed forward module. (default: 0.1) + attention_dropout_p (float): The dropout probability of attention module. (default: 0.1) + conv_dropout_p (float): The dropout probability of convolution module. (default: 0.1) + conv_kernel_size (int): The kernel size of convolution. (default: eq) + half_step_residual (bool): Flag indication whether to use half step residual or not (default: True) + num_decoder_layers (int): The number of decoder layers. (default: 2) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.1) + max_length (int): Max decoding length. (default: 128) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc) + optimizer (str): Optimizer for training. (default: adam) + """ + model_name: str = field( + default="joint_ctc_conformer_lstm", metadata={"help": "Model name"} + ) + encoder_dim: int = field( + default=512, metadata={"help": "Dimension of encoder."} + ) + num_encoder_layers: int = field( + default=17, metadata={"help": "The number of encoder layers."} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "The number of attention heads."} + ) + feed_forward_expansion_factor: int = field( + default=4, metadata={"help": "The expansion factor of feed forward module."} + ) + conv_expansion_factor: int = field( + default=2, metadata={"help": "The expansion factor of convolution module."} + ) + input_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of inputs."} + ) + feed_forward_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of feed forward module."} + ) + attention_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of attention module."} + ) + conv_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of convolution module."} + ) + conv_kernel_size: int = field( + default=31, metadata={"help": "The kernel size of convolution."} + ) + half_step_residual: bool = field( + default=True, metadata={"help": "Flag indication whether to use half step residual or not"} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + decoder_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of decoder."} + ) + num_decoder_attention_heads: int = field( + default=1, metadata={"help": "The number of decoder attention heads."} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": " The ratio of teacher forcing. 
"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + decoder_attn_mechanism: str = field( + default="loc", metadata={"help": "The attention mechanism for decoder."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/model.py new file mode 100644 index 000000000..356d84535 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/conformer/model.py @@ -0,0 +1,328 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from torch import Tensor +from typing import Dict +from collections import OrderedDict + +from openspeech.decoders import LSTMAttentionDecoder, RNNTransducerDecoder +from openspeech.models import register_model, OpenspeechEncoderDecoderModel, OpenspeechTransducerModel +from openspeech.models import OpenspeechCTCModel +from openspeech.encoders import ConformerEncoder +from openspeech.modules.wrapper import Linear +from openspeech.tokenizers.tokenizer import Tokenizer +from openspeech.models.conformer.configurations import ( + ConformerConfigs, + ConformerLSTMConfigs, + ConformerTransducerConfigs, + JointCTCConformerLSTMConfigs, +) + + +@register_model('conformer', dataclass=ConformerConfigs) +class ConformerModel(OpenspeechCTCModel): + r""" + Conformer Encoder Only Model. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ConformerModel, self).__init__(configs, tokenizer) + self.fc = Linear(self.configs.model.encoder_dim, self.num_classes, bias=False) + + def build_model(self): + self.encoder = ConformerEncoder( + num_classes=self.num_classes, + input_dim=self.configs.audio.num_mels, + encoder_dim=self.configs.model.encoder_dim, + num_layers=self.configs.model.num_encoder_layers, + num_attention_heads=self.configs.model.num_attention_heads, + feed_forward_expansion_factor=self.configs.model.feed_forward_expansion_factor, + conv_expansion_factor=self.configs.model.conv_expansion_factor, + input_dropout_p=self.configs.model.input_dropout_p, + feed_forward_dropout_p=self.configs.model.feed_forward_dropout_p, + attention_dropout_p=self.configs.model.attention_dropout_p, + conv_dropout_p=self.configs.model.conv_dropout_p, + conv_kernel_size=self.configs.model.conv_kernel_size, + half_step_residual=self.configs.model.half_step_residual, + joint_ctc_attention=False, + ) + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + return super(ConformerModel, self).forward(inputs, input_lengths) + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + encoder_outputs, encoder_logits, output_lengths = self.encoder(inputs, input_lengths) + logits = self.fc(encoder_outputs).log_softmax(dim=-1) + return self.collect_outputs( + stage='train', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + encoder_outputs, encoder_logits, output_lengths = self.encoder(inputs, input_lengths) + logits = self.fc(encoder_outputs).log_softmax(dim=-1) + return self.collect_outputs( + stage='valid', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. 
+ + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + encoder_outputs, encoder_logits, output_lengths = self.encoder(inputs, input_lengths) + logits = self.fc(encoder_outputs).log_softmax(dim=-1) + return self.collect_outputs( + stage='test', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + +@register_model('conformer_lstm', dataclass=ConformerLSTMConfigs) +class ConformerLSTMModel(OpenspeechEncoderDecoderModel): + r""" + Conformer encoder + LSTM decoder. + + Args: + configs (DictConfig): configuraion set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, + `encoder_outputs`, `encoder_logits`, `encoder_output_lengths`. + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ConformerLSTMModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = ConformerEncoder( + num_classes=self.num_classes, + input_dim=self.configs.audio.num_mels, + encoder_dim=self.configs.model.encoder_dim, + num_layers=self.configs.model.num_encoder_layers, + num_attention_heads=self.configs.model.num_attention_heads, + feed_forward_expansion_factor=self.configs.model.feed_forward_expansion_factor, + conv_expansion_factor=self.configs.model.conv_expansion_factor, + input_dropout_p=self.configs.model.input_dropout_p, + feed_forward_dropout_p=self.configs.model.feed_forward_dropout_p, + attention_dropout_p=self.configs.model.attention_dropout_p, + conv_dropout_p=self.configs.model.conv_dropout_p, + conv_kernel_size=self.configs.model.conv_kernel_size, + half_step_residual=self.configs.model.half_step_residual, + joint_ctc_attention=False, + ) + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=self.configs.model.encoder_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) + + +@register_model('conformer_transducer', dataclass=ConformerTransducerConfigs) +class ConformerTransducerModel(OpenspeechTransducerModel): + r""" + Conformer: Convolution-augmented Transformer for Speech Recognition + Paper: https://arxiv.org/abs/2005.08100 + + Args: + configs (DictConfig): configuraion set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. 
Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ConformerTransducerModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = ConformerEncoder( + num_classes=self.num_classes, + input_dim=self.configs.audio.num_mels, + encoder_dim=self.configs.model.encoder_dim, + num_layers=self.configs.model.num_encoder_layers, + num_attention_heads=self.configs.model.num_attention_heads, + feed_forward_expansion_factor=self.configs.model.feed_forward_expansion_factor, + conv_expansion_factor=self.configs.model.conv_expansion_factor, + input_dropout_p=self.configs.model.input_dropout_p, + feed_forward_dropout_p=self.configs.model.feed_forward_dropout_p, + attention_dropout_p=self.configs.model.attention_dropout_p, + conv_dropout_p=self.configs.model.conv_dropout_p, + conv_kernel_size=self.configs.model.conv_kernel_size, + half_step_residual=self.configs.model.half_step_residual, + joint_ctc_attention=False, + ) + self.decoder = RNNTransducerDecoder( + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.decoder_hidden_state_dim, + output_dim=self.configs.model.decoder_output_dim, + num_layers=self.configs.model.num_decoder_layers, + rnn_type=self.configs.model.rnn_type, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + dropout_p=self.configs.model.decoder_dropout_p, + ) + + +@register_model('joint_ctc_conformer_lstm', dataclass=JointCTCConformerLSTMConfigs) +class JointCTCConformerLSTMModel(OpenspeechEncoderDecoderModel): + r""" + Conformer encoder + LSTM decoder. + + Args: + configs (DictConfig): configuraion set + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (torch.FloatTensor): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(JointCTCConformerLSTMModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = ConformerEncoder( + num_classes=self.num_classes, + input_dim=self.configs.audio.num_mels, + encoder_dim=self.configs.model.encoder_dim, + num_layers=self.configs.model.num_encoder_layers, + num_attention_heads=self.configs.model.num_attention_heads, + feed_forward_expansion_factor=self.configs.model.feed_forward_expansion_factor, + conv_expansion_factor=self.configs.model.conv_expansion_factor, + input_dropout_p=self.configs.model.input_dropout_p, + feed_forward_dropout_p=self.configs.model.feed_forward_dropout_p, + attention_dropout_p=self.configs.model.attention_dropout_p, + conv_dropout_p=self.configs.model.conv_dropout_p, + conv_kernel_size=self.configs.model.conv_kernel_size, + half_step_residual=self.configs.model.half_step_residual, + joint_ctc_attention=True, + ) + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=self.configs.model.encoder_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_decoder_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/__init__.py new file mode 100644 index 000000000..8d1ec5dd5 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/__init__.py @@ -0,0 +1,32 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from .configurations import ( + ContextNetTransducerConfigs, + ContextNetLSTMConfigs, + ContextNetConfigs, +) +from .model import ( + ContextNetTransducerModel, + ContextNetLSTMModel, + ContextNetModel, +) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/configurations.py new file mode 100644 index 000000000..99ee77b85 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/configurations.py @@ -0,0 +1,215 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class ContextNetConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ContextNet`. + + It is used to initiated an `ContextNet` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: contextnet) + model_size (str, optional): Size of the model['small', 'medium', 'large'] (default : 'medium') + input_dim (int, optional): Dimension of input vector (default : 80) + num_encoder_layers (int, optional): The number of convolution layers (default : 5) + kernel_size (int, optional): Value of convolution kernel size (default : 5) + num_channels (int, optional): The number of channels in the convolution filter (default: 256) + encoder_dim (int, optional): Dimension of encoder output vector (default: 640) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="contextnet", metadata={"help": "Model name"} + ) + model_size: str = field( + default="medium", metadata={"help": "Model size"} + ) + input_dim: int = field( + default=80, metadata={"help": "Dimension of input vector"} + ) + num_encoder_layers: int = field( + default=5, metadata={"help": "The number of convolution layers"} + ) + kernel_size: int = field( + default=5, metadata={"help": "Value of convolution kernel size"} + ) + num_channels: int = field( + default=256, metadata={"help": "The number of channels in the convolution filter"} + ) + encoder_dim: int = field( + default=640, metadata={"help": "Dimension of encoder output vector"} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training"} + ) + + +@dataclass +class ContextNetLSTMConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ContextNetLSTM`. + + It is used to initiated an `ContextNetLSTM` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: contextnet_lstm) + model_size (str, optional): Size of the model['small', 'medium', 'large'] (default : 'medium') + input_dim (int, optional): Dimension of input vector (default : 80) + num_encoder_layers (int, optional): The number of convolution layers (default : 5) + num_decoder_layers (int): The number of decoder layers. (default: 2) + kernel_size (int, optional): Value of convolution kernel size (default : 5) + num_channels (int, optional): The number of channels in the convolution filter (default: 256) + encoder_dim (int, optional): Dimension of encoder output vector (default: 640) + num_attention_heads (int): The number of attention heads. (default: 8) + attention_dropout_p (float): The dropout probability of attention module. (default: 0.1) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.1) + max_length (int): Max decoding length. (default: 128) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="contextnet_lstm", metadata={"help": "Model name"} + ) + model_size: str = field( + default="medium", metadata={"help": "Model size"} + ) + input_dim: int = field( + default=80, metadata={"help": "Dimension of input vector"} + ) + num_encoder_layers: int = field( + default=5, metadata={"help": "The number of convolution layers"} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + kernel_size: int = field( + default=5, metadata={"help": "Value of convolution kernel size"} + ) + num_channels: int = field( + default=256, metadata={"help": "The number of channels in the convolution filter"} + ) + encoder_dim: int = field( + default=640, metadata={"help": "Dimension of encoder output vector"} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "The number of attention heads."} + ) + attention_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of attention module."} + ) + decoder_dropout_p: float = field( + default=0.1, metadata={"help": "The dropout probability of decoder."} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. "} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + decoder_attn_mechanism: str = field( + default="loc", metadata={"help": "The attention mechanism for decoder."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class ContextNetTransducerConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ContextNetTransducer`. + + It is used to initiated an `ContextNetTransducer` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: contextnet_transducer) + model_size (str, optional): Size of the model['small', 'medium', 'large'] (default : 'medium') + input_dim (int, optional): Dimension of input vector (default : 80) + num_encoder_layers (int, optional): The number of convolution layers (default : 5) + num_decoder_layers (int, optional): The number of rnn layers (default : 1) + kernel_size (int, optional): Value of convolution kernel size (default : 5) + num_channels (int, optional): The number of channels in the convolution filter (default: 256) + hidden_dim (int, optional): The number of features in the decoder hidden state (default : 2048) + encoder_dim (int, optional): Dimension of encoder output vector (default: 640) + decoder_output_dim (int, optional): Dimension of decoder output vector (default: 640) + dropout (float, optional): Dropout probability of decoder (default: 0.1) + rnn_type (str, optional): Type of rnn cell (rnn, lstm, gru) (default: lstm) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="contextnet_transducer", metadata={"help": "Model name"} + ) + model_size: str = field( + default="medium", metadata={"help": "Model size"} + ) + input_dim: int = field( + default=80, metadata={"help": "Dimension of input vector"} + ) + num_encoder_layers: int = field( + default=5, metadata={"help": "The number of convolution layers"} + ) + num_decoder_layers: int = field( + default=1, metadata={"help": "The number of rnn layers"} + ) + kernel_size: int = field( + default=5, metadata={"help": "Value of convolution kernel size"} + ) + num_channels: int = field( + default=256, metadata={"help": "The number of channels in the convolution filter"} + ) + decoder_hidden_state_dim: int = field( + default=2048, metadata={"help": "The number of features in the decoder hidden state"} + ) + encoder_dim: int = field( + default=640, metadata={"help": "Dimension of encoder output vector"} + ) + decoder_output_dim: int = field( + default=640, metadata={"help": "Dimension of decoder output vector"} + ) + decoder_dropout_p: float = field( + default=0.1, metadata={"help": "Dropout probability of decoder"} + ) + rnn_type: str = field( + default='lstm', metadata={"help": "Type of rnn cell"} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training"} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/model.py new file mode 100644 index 000000000..2ea7a8778 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/contextnet/model.py @@ -0,0 +1,255 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
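# Illustrative sketch (not part of this patch): every option in the ContextNet config
# dataclasses above carries a `metadata={"help": ...}` string, so each dataclass doubles
# as self-documenting configuration. A quick way to inspect the available options and
# their defaults (the override values below are arbitrary examples):
from dataclasses import fields
from openspeech.models.contextnet.configurations import ContextNetLSTMConfigs

cfg = ContextNetLSTMConfigs(model_size="large", num_channels=512)  # override two defaults
for f in fields(cfg):
    print(f"{f.name:<28} default={f.default!r:<12} {f.metadata.get('help', '')}")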
+ +from torch import Tensor +from typing import Dict +from collections import OrderedDict + +from openspeech.decoders import RNNTransducerDecoder, LSTMAttentionDecoder +from openspeech.models import register_model, OpenspeechTransducerModel, OpenspeechEncoderDecoderModel +from openspeech.models import OpenspeechCTCModel +from openspeech.encoders import ContextNetEncoder +from openspeech.modules.wrapper import Linear +from openspeech.tokenizers.tokenizer import Tokenizer +from openspeech.models.contextnet.configurations import ( + ContextNetConfigs, + ContextNetTransducerConfigs, + ContextNetLSTMConfigs, +) + + +@register_model('contextnet', dataclass=ContextNetConfigs) +class ContextNetModel(OpenspeechCTCModel): + r""" + Conformer Encoder Only Model. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ContextNetModel, self).__init__(configs, tokenizer) + supported_models = { + 'small': 0.5, + 'medium': 1, + 'large': 2, + } + alpha = supported_models[self.configs.model.model_size] + self.fc = Linear(int(self.configs.model.encoder_dim * alpha), self.num_classes, bias=False) + + def build_model(self): + self.encoder = ContextNetEncoder( + num_classes=self.num_classes, + model_size=self.configs.model.model_size, + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + kernel_size=self.configs.model.kernel_size, + num_channels=self.configs.model.num_channels, + output_dim=self.configs.model.encoder_dim, + joint_ctc_attention=False, + ) + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + return super(ContextNetModel, self).forward(inputs, input_lengths) + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + encoder_outputs, encoder_logits, output_lengths = self.encoder(inputs, input_lengths) + logits = self.fc(encoder_outputs).log_softmax(dim=-1) + return self.collect_outputs( + stage='train', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. 
+ + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + encoder_outputs, encoder_logits, output_lengths = self.encoder(inputs, input_lengths) + logits = self.fc(encoder_outputs).log_softmax(dim=-1) + return self.collect_outputs( + stage='valid', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + encoder_outputs, encoder_logits, output_lengths = self.encoder(inputs, input_lengths) + logits = self.fc(encoder_outputs).log_softmax(dim=-1) + return self.collect_outputs( + stage='test', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + +@register_model('contextnet_lstm', dataclass=ContextNetLSTMConfigs) +class ContextNetLSTMModel(OpenspeechEncoderDecoderModel): + r""" + ContextNet encoder + LSTM decoder. + + Args: + configs (DictConfig): configuraion set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, + `encoder_outputs`, `encoder_logits`, `encoder_output_lengths`. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer, ) -> None: + super(ContextNetLSTMModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = ContextNetEncoder( + num_classes=self.num_classes, + model_size=self.configs.model.model_size, + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + kernel_size=self.configs.model.kernel_size, + num_channels=self.configs.model.num_channels, + output_dim=self.configs.model.encoder_dim, + joint_ctc_attention=False, + ) + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=self.configs.model.encoder_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) + + +@register_model('contextnet_transducer', dataclass=ContextNetTransducerConfigs) +class ContextNetTransducerModel(OpenspeechTransducerModel): + r""" + ContextNet: Improving Convolutional Neural Networks for Automatic Speech Recognition with Global Context + Paper: https://arxiv.org/abs/2005.03191 + + Args: + configs (DictConfig): configuraion set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer, ) -> None: + super(ContextNetTransducerModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = ContextNetEncoder( + num_classes=self.num_classes, + model_size=self.configs.model.model_size, + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + kernel_size=self.configs.model.kernel_size, + num_channels=self.configs.model.num_channels, + output_dim=self.configs.model.encoder_dim, + joint_ctc_attention=False, + ) + self.decoder = RNNTransducerDecoder( + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.decoder_hidden_state_dim, + output_dim=self.configs.model.decoder_output_dim, + num_layers=self.configs.model.num_decoder_layers, + rnn_type=self.configs.model.rnn_type, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + dropout_p=self.configs.model.decoder_dropout_p, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/__init__.py new file mode 100644 index 000000000..8c40ad5d8 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/__init__.py @@ -0,0 +1,28 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from .configurations import ( + DeepSpeech2Configs +) +from .model import ( + DeepSpeech2Model +) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/configurations.py new file mode 100644 index 000000000..e75414bf4 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/configurations.py @@ -0,0 +1,71 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class DeepSpeech2Configs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.DeepSpeech2`. + + It is used to initiated an `DeepSpeech2` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: deepspeech2) + num_rnn_layers (int): The number of rnn layers. (default: 5) + rnn_hidden_dim (int): The hidden state dimension of rnn. (default: 1024) + dropout_p (float): The dropout probability of model. (default: 0.3) + bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: gru) + activation (str): Type of activation function (default: str) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="deepspeech2", metadata={"help": "Model name"} + ) + rnn_type: str = field( + default="gru", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + num_rnn_layers: int = field( + default=5, metadata={"help": "The number of rnn layers"} + ) + rnn_hidden_dim: int = field( + default=1024, metadata={"help": "Hidden state dimenstion of RNN."} + ) + dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of model."} + ) + bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + activation: str = field( + default="hardtanh", metadata={"help": "Type of activation function"} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/model.py new file mode 100644 index 000000000..3de3c0de0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/deepspeech2/model.py @@ -0,0 +1,116 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from typing import Dict +from torch import Tensor +from collections import OrderedDict + +from openspeech.models import OpenspeechCTCModel, register_model +from openspeech.encoders.deepspeech2 import DeepSpeech2 +from openspeech.models.deepspeech2.configurations import DeepSpeech2Configs +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_model('deepspeech2', dataclass=DeepSpeech2Configs) +class DeepSpeech2Model(OpenspeechCTCModel): + r""" + Deep Speech2 model with configurable encoders and decoders. + Paper: https://arxiv.org/abs/1512.02595 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(DeepSpeech2Model, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = DeepSpeech2( + input_dim=self.configs.audio.num_mels, + num_classes=self.num_classes, + rnn_type=self.configs.model.rnn_type, + num_rnn_layers=self.configs.model.num_rnn_layers, + rnn_hidden_dim=self.configs.model.rnn_hidden_dim, + dropout_p=self.configs.model.dropout_p, + bidirectional=self.configs.model.bidirectional, + activation=self.configs.model.activation, + ) + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + return super(DeepSpeech2Model, self).forward(inputs, input_lengths) + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + return super(DeepSpeech2Model, self).training_step(batch, batch_idx) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + return super(DeepSpeech2Model, self).validation_step(batch, batch_idx) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + return super(DeepSpeech2Model, self).test_step(batch, batch_idx) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/__init__.py new file mode 100644 index 000000000..79ab64924 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/__init__.py @@ -0,0 +1,30 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .configurations import ( + Jasper5x3Config, + Jasper10x5Config, +) +from .model import ( + Jasper5x3Model, + Jasper10x5Model, +) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/configurations.py new file mode 100644 index 000000000..8fe2f8663 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/configurations.py @@ -0,0 +1,135 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class Jasper5x3Config(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.Jasper5x3`. + + It is used to initiated an `Jasper5x3` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: jasper5x3) + num_blocks (int): Number of jasper blocks (default: 5) + num_sub_blocks (int): Number of jasper sub blocks (default: 3) + in_channels (str): Output channels of jasper block's convolution + out_channels (str): Output channels of jasper block's convolution + kernel_size (str): Kernel size of jasper block's convolution + dilation (str): Dilation of jasper block's convolution + dropout_p (str): Dropout probability + optimizer (str): Optimizer for training. 
+ """ + model_name: str = field( + default="jasper5x3", metadata={"help": "Model name"} + ) + num_blocks: int = field( + default=5, metadata={"help": "Number of jasper blocks"} + ) + num_sub_blocks: int = field( + default=3, metadata={"help": "Number of jasper sub blocks"} + ) + in_channels: str = field( + default="(None, 256, 256, 256, 384, 384, 512, 512, 640, 640, 768, 768, 896, 1024)", + metadata={"help": "Input channels of jasper blocks"} + ) + out_channels: str = field( + default="(256, 256, 256, 384, 384, 512, 512, 640, 640, 768, 768, 896, 1024, None)", + metadata={"help": "Output channels of jasper block's convolution"} + ) + kernel_size: str = field( + default="(11, 11, 11, 13, 13, 17, 17, 21, 21, 25, 25, 29, 1, 1)", + metadata={"help": "Kernel size of jasper block's convolution"} + ) + dilation: str = field( + default="(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1)", + metadata={"help": "Dilation of jasper block's convolution"} + ) + dropout_p: str = field( + default="(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.0)", + metadata={"help": "Dropout probability"} + ) + optimizer: str = field( + default="novograd", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class Jasper10x5Config(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.Jasper10x5`. + + It is used to initiated an `Jasper10x5` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: jasper10x5) + num_blocks (int): Number of jasper blocks (default: 10) + num_sub_blocks (int): Number of jasper sub blocks (default: 5) + in_channels (str): Output channels of jasper block's convolution + out_channels (str): Output channels of jasper block's convolution + kernel_size (str): Kernel size of jasper block's convolution + dilation (str): Dilation of jasper block's convolution + dropout_p (str): Dropout probability + optimizer (str): Optimizer for training. 
+ """ + model_name: str = field( + default="jasper10x5", metadata={"help": "Model name"} + ) + num_blocks: int = field( + default=10, metadata={"help": "Number of jasper blocks"} + ) + num_sub_blocks: int = field( + default=5, metadata={"help": "Number of jasper sub blocks"} + ) + in_channels: str = field( + default="(None, 256, 256, 256, 384, 384, 512, 512, 640, 640, 768, 768, 896, 1024)", + metadata={"help": "Input channels of jasper blocks"} + ) + out_channels: str = field( + default="(256, 256, 256, 384, 384, 512, 512, 640, 640, 768, 768, 768, 896, 1024, None)", + metadata={"help": "Output channels of jasper block's convolution"} + ) + kernel_size: str = field( + default="(11, 11, 11, 13, 13, 17, 17, 21, 21, 25, 25, 29, 1, 1)", + metadata={"help": "Kernel size of jasper block's convolution"} + ) + dilation: str = field( + default="(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1)", + metadata={"help": "Dilation of jasper block's convolution"} + ) + dropout_p: str = field( + default="(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.0)", + metadata={"help": "Dropout probability"} + ) + optimizer: str = field( + default="novograd", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/model.py new file mode 100644 index 000000000..ea65b96c9 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/jasper/model.py @@ -0,0 +1,79 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from openspeech.models import register_model +from openspeech.models import OpenspeechCTCModel +from openspeech.encoders import Jasper +from openspeech.models.jasper.configurations import Jasper5x3Config, Jasper10x5Config +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_model('jasper5x3', dataclass=Jasper5x3Config) +class Jasper5x3Model(OpenspeechCTCModel): + r""" + Jasper: An End-to-End Convolutional Neural Acoustic Model + Paper: https://arxiv.org/pdf/1904.03288.pdf + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. 
+ input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(Jasper5x3Model, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = Jasper( + configs=self.configs, + input_dim=self.configs.audio.num_mels, + num_classes=self.num_classes, + ) + + +@register_model('jasper10x5', dataclass=Jasper10x5Config) +class Jasper10x5Model(Jasper5x3Model): + r""" + Jasper: An End-to-End Convolutional Neural Acoustic Model + Paper: https://arxiv.org/pdf/1904.03288.pdf + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(Jasper10x5Model, self).__init__(configs, tokenizer) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/__init__.py new file mode 100644 index 000000000..d74c1a04a --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/__init__.py @@ -0,0 +1,36 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
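+ +# Package exports for the Listen, Attend and Spell (LAS) model family. +# The configuration dataclasses and model classes imported below are +# registered under their model names via the @register_model decorator in +# model.py of this package.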
+ +from .configurations import ( + ListenAttendSpellWithLocationAwareConfigs, + ListenAttendSpellWithMultiHeadConfigs, + ListenAttendSpellConfigs, + JointCTCListenAttendSpellConfigs, + DeepCNNWithJointCTCListenAttendSpellConfigs, +) +from .model import ( + ListenAttendSpellWithLocationAwareModel, + ListenAttendSpellWithMultiHeadModel, + ListenAttendSpellModel, + JointCTCListenAttendSpellModel, + DeepCNNWithJointCTCListenAttendSpellModel, +) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/configurations.py new file mode 100644 index 000000000..884d02289 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/configurations.py @@ -0,0 +1,383 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class ListenAttendSpellConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ListenAttendSpell`. + + It is used to initiated an `ListenAttendSpell` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: listen_attend_spell) + num_encoder_layers (int): The number of encoder layers. (default: 3) + num_decoder_layers (int): The number of decoder layers. (default: 2) + hidden_state_dim (int): The hidden state dimension of encoder. (default: 512) + encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3) + encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: False) + max_length (int): Max decoding length. (default: 128) + num_attention_heads (int): The number of attention heads. (default: 1) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: dot) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="listen_attend_spell", metadata={"help": "Model name"} + ) + num_encoder_layers: int = field( + default=3, metadata={"help": "The number of encoder layers."} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + hidden_state_dim: int = field( + default=512, metadata={"help": "The hidden state dimension of encoder."} + ) + encoder_dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of encoder."} + ) + encoder_bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + joint_ctc_attention: bool = field( + default=False, metadata={"help": "Flag indication joint ctc attention or not"} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + num_attention_heads: int = field( + default=1, metadata={"help": "The number of attention heads."} + ) + decoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of decoder."} + ) + decoder_attn_mechanism: str = field( + default="dot", metadata={"help": "The attention mechanism for decoder."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. "} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class ListenAttendSpellWithLocationAwareConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ListenAttendSpellWithLocationAware`. + + It is used to initiated an `ListenAttendSpellWithLocationAware` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: listen_attend_spell_with_location_aware) + num_encoder_layers (int): The number of encoder layers. (default: 3) + num_decoder_layers (int): The number of decoder layers. (default: 2) + hidden_state_dim (int): The hidden state dimension of encoder. (default: 512) + encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3) + encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: False) + max_length (int): Max decoding length. (default: 128) + num_attention_heads (int): The number of attention heads. (default: 1) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="listen_attend_spell_with_location_aware", metadata={"help": "Model name"} + ) + num_encoder_layers: int = field( + default=3, metadata={"help": "The number of encoder layers."} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + hidden_state_dim: int = field( + default=512, metadata={"help": "The hidden state dimension of encoder."} + ) + encoder_dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of encoder."} + ) + encoder_bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + joint_ctc_attention: bool = field( + default=False, metadata={"help": "Flag indication joint ctc attention or not"} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + num_attention_heads: int = field( + default=1, metadata={"help": "The number of attention heads."} + ) + decoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of decoder."} + ) + decoder_attn_mechanism: str = field( + default="loc", metadata={"help": "The attention mechanism for decoder."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. "} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class ListenAttendSpellWithMultiHeadConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.ListenAttendSpellWithMultiHead`. + + It is used to initiated an `ListenAttendSpellWithMultiHead` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: listen_attend_spell_with_multi_head) + num_encoder_layers (int): The number of encoder layers. (default: 3) + num_decoder_layers (int): The number of decoder layers. (default: 2) + hidden_state_dim (int): The hidden state dimension of encoder. (default: 512) + encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3) + encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: False) + max_length (int): Max decoding length. (default: 128) + num_attention_heads (int): The number of attention heads. (default: 4) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: multi-head) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="listen_attend_spell_with_multi_head", metadata={"help": "Model name"} + ) + num_encoder_layers: int = field( + default=3, metadata={"help": "The number of encoder layers."} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + hidden_state_dim: int = field( + default=512, metadata={"help": "The hidden state dimension of encoder."} + ) + encoder_dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of encoder."} + ) + encoder_bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + joint_ctc_attention: bool = field( + default=False, metadata={"help": "Flag indication joint ctc attention or not"} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + num_attention_heads: int = field( + default=4, metadata={"help": "The number of attention heads."} + ) + decoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of decoder."} + ) + decoder_attn_mechanism: str = field( + default="multi-head", metadata={"help": "The attention mechanism for decoder."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. "} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class JointCTCListenAttendSpellConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.JointCTCListenAttendSpell`. + + It is used to initiated an `JointCTCListenAttendSpell` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: joint_ctc_listen_attend_spell) + num_encoder_layers (int): The number of encoder layers. (default: 3) + num_decoder_layers (int): The number of decoder layers. (default: 2) + hidden_state_dim (int): The hidden state dimension of encoder. (default: 768) + encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3) + encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: True) + max_length (int): Max decoding length. (default: 128) + num_attention_heads (int): The number of attention heads. (default: 1) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + optimizer (str): Optimizer for training. 
(default: adam) + """ + model_name: str = field( + default="joint_ctc_listen_attend_spell", metadata={"help": "Model name"} + ) + num_encoder_layers: int = field( + default=3, metadata={"help": "The number of encoder layers."} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + hidden_state_dim: int = field( + default=768, metadata={"help": "The hidden state dimension of encoder."} + ) + encoder_dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of encoder."} + ) + encoder_bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + joint_ctc_attention: bool = field( + default=True, metadata={"help": "Flag indication joint ctc attention or not"} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + num_attention_heads: int = field( + default=1, metadata={"help": "The number of attention heads."} + ) + decoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of decoder."} + ) + decoder_attn_mechanism: str = field( + default="loc", metadata={"help": "The attention mechanism for decoder."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. "} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class DeepCNNWithJointCTCListenAttendSpellConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.DeepCNNWithJointCTCListenAttendSpell`. + + It is used to initiate a `DeepCNNWithJointCTCListenAttendSpell` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: deep_cnn_with_joint_ctc_listen_attend_spell) + num_encoder_layers (int): The number of encoder layers. (default: 3) + num_decoder_layers (int): The number of decoder layers. (default: 2) + hidden_state_dim (int): The hidden state dimension of encoder. (default: 768) + encoder_dropout_p (float): The dropout probability of encoder. (default: 0.3) + encoder_bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + extractor (str): The CNN feature extractor. (default: vgg) + activation (str): Type of activation function (default: hardtanh) + joint_ctc_attention (bool): Flag indication joint ctc attention or not (default: True) + max_length (int): Max decoding length. (default: 128) + num_attention_heads (int): The number of attention heads. (default: 1) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2) + decoder_attn_mechanism (str): The attention mechanism for decoder. (default: loc) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + optimizer (str): Optimizer for training.
(default: adam) + """ + model_name: str = field( + default="deep_cnn_with_joint_ctc_listen_attend_spell", metadata={"help": "Model name"} + ) + num_encoder_layers: int = field( + default=3, metadata={"help": "The number of encoder layers."} + ) + num_decoder_layers: int = field( + default=2, metadata={"help": "The number of decoder layers."} + ) + hidden_state_dim: int = field( + default=768, metadata={"help": "The hidden state dimension of encoder."} + ) + encoder_dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of encoder."} + ) + encoder_bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + extractor: str = field( + default="vgg", metadata={"help": "The CNN feature extractor."} + ) + activation: str = field( + default="hardtanh", metadata={"help": "Type of activation function"} + ) + joint_ctc_attention: bool = field( + default=True, metadata={"help": "Flag indication joint ctc attention or not"} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + num_attention_heads: int = field( + default=1, metadata={"help": "The number of attention heads."} + ) + decoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of decoder."} + ) + decoder_attn_mechanism: str = field( + default="loc", metadata={"help": "The attention mechanism for decoder."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/model.py new file mode 100644 index 000000000..9d8b37ff8 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/listen_attend_spell/model.py @@ -0,0 +1,324 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
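+ +# Model definitions for the LAS family. Each variant pairs an LSTMEncoder +# (or a ConvolutionalLSTMEncoder for the deep CNN variant) with an +# LSTMAttentionDecoder; the variants differ mainly in the decoder attention +# mechanism (dot, location-aware, multi-head) and in whether joint +# CTC-attention is enabled.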
+ + +from openspeech.models import register_model, OpenspeechEncoderDecoderModel +from openspeech.decoders import LSTMAttentionDecoder +from openspeech.encoders import LSTMEncoder, ConvolutionalLSTMEncoder +from openspeech.tokenizers.tokenizer import Tokenizer +from openspeech.models.listen_attend_spell.configurations import ( + ListenAttendSpellConfigs, + JointCTCListenAttendSpellConfigs, + ListenAttendSpellWithLocationAwareConfigs, + ListenAttendSpellWithMultiHeadConfigs, + DeepCNNWithJointCTCListenAttendSpellConfigs, +) + + +@register_model('listen_attend_spell', dataclass=ListenAttendSpellConfigs) +class ListenAttendSpellModel(OpenspeechEncoderDecoderModel): + r""" + Listen, Attend and Spell model with configurable encoder and decoder. + Paper: https://arxiv.org/abs/1508.01211 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ListenAttendSpellModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = LSTMEncoder( + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.hidden_state_dim, + dropout_p=self.configs.model.encoder_dropout_p, + bidirectional=self.configs.model.encoder_bidirectional, + rnn_type=self.configs.model.rnn_type, + joint_ctc_attention=self.configs.model.joint_ctc_attention, + ) + decoder_hidden_state_dim = self.configs.model.hidden_state_dim << 1 \ + if self.configs.model.encoder_bidirectional \ + else self.configs.model.hidden_state_dim + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=decoder_hidden_state_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) + + +@register_model('listen_attend_spell_with_location_aware', dataclass=ListenAttendSpellWithLocationAwareConfigs) +class ListenAttendSpellWithLocationAwareModel(OpenspeechEncoderDecoderModel): + r""" + Listen, Attend and Spell model with configurable encoder and decoder. + Paper: https://arxiv.org/abs/1508.01211 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ListenAttendSpellWithLocationAwareModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = LSTMEncoder( + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.hidden_state_dim, + dropout_p=self.configs.model.encoder_dropout_p, + bidirectional=self.configs.model.encoder_bidirectional, + rnn_type=self.configs.model.rnn_type, + joint_ctc_attention=self.configs.model.joint_ctc_attention, + ) + decoder_hidden_state_dim = self.configs.model.hidden_state_dim << 1 \ + if self.configs.model.encoder_bidirectional \ + else self.configs.model.hidden_state_dim + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=decoder_hidden_state_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) + + +@register_model('listen_attend_spell_with_multi_head', dataclass=ListenAttendSpellWithMultiHeadConfigs) +class ListenAttendSpellWithMultiHeadModel(OpenspeechEncoderDecoderModel): + r""" + Listen, Attend and Spell model with configurable encoder and decoder. + Paper: https://arxiv.org/abs/1508.01211 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(ListenAttendSpellWithMultiHeadModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = LSTMEncoder( + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.hidden_state_dim, + dropout_p=self.configs.model.encoder_dropout_p, + bidirectional=self.configs.model.encoder_bidirectional, + rnn_type=self.configs.model.rnn_type, + joint_ctc_attention=self.configs.model.joint_ctc_attention, + ) + decoder_hidden_state_dim = self.configs.model.hidden_state_dim << 1 \ + if self.configs.model.encoder_bidirectional \ + else self.configs.model.hidden_state_dim + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=decoder_hidden_state_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) + + +@register_model('joint_ctc_listen_attend_spell', dataclass=JointCTCListenAttendSpellConfigs) +class JointCTCListenAttendSpellModel(OpenspeechEncoderDecoderModel): + r""" + Joint CTC-Attention Listen, Attend and Spell model with configurable encoder and decoder. + Paper: https://arxiv.org/abs/1609.06773 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(JointCTCListenAttendSpellModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = LSTMEncoder( + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.hidden_state_dim, + dropout_p=self.configs.model.encoder_dropout_p, + bidirectional=self.configs.model.encoder_bidirectional, + rnn_type=self.configs.model.rnn_type, + joint_ctc_attention=self.configs.model.joint_ctc_attention, + ) + decoder_hidden_state_dim = self.configs.model.hidden_state_dim << 1 \ + if self.configs.model.encoder_bidirectional \ + else self.configs.model.hidden_state_dim + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=decoder_hidden_state_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search.beam_search_lstm import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) + + +@register_model('deep_cnn_with_joint_ctc_listen_attend_spell', dataclass=DeepCNNWithJointCTCListenAttendSpellConfigs) +class DeepCNNWithJointCTCListenAttendSpellModel(OpenspeechEncoderDecoderModel): + r""" + Listen, Attend and Spell model with configurable encoder and decoder. + Paper: https://arxiv.org/abs/1508.01211 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(DeepCNNWithJointCTCListenAttendSpellModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = ConvolutionalLSTMEncoder( + input_dim=self.configs.audio.num_mels, + num_layers=self.configs.model.num_encoder_layers, + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.hidden_state_dim, + dropout_p=self.configs.model.encoder_dropout_p, + bidirectional=self.configs.model.encoder_bidirectional, + rnn_type=self.configs.model.rnn_type, + joint_ctc_attention=self.configs.model.joint_ctc_attention, + ) + decoder_hidden_state_dim = self.configs.model.hidden_state_dim << 1 \ + if self.configs.model.encoder_bidirectional \ + else self.configs.model.hidden_state_dim + self.decoder = LSTMAttentionDecoder( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=decoder_hidden_state_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + num_heads=self.configs.model.num_attention_heads, + dropout_p=self.configs.model.decoder_dropout_p, + num_layers=self.configs.model.num_decoder_layers, + attn_mechanism=self.configs.model.decoder_attn_mechanism, + rnn_type=self.configs.model.rnn_type, + ) + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchLSTM + self.decoder = BeamSearchLSTM( + decoder=self.decoder, + beam_size=beam_size, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/__init__.py new file mode 100644 index 000000000..045035ba4 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/__init__.py @@ -0,0 +1,28 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from .configurations import ( + LSTMLanguageModelConfigs +) +from .model import ( + LSTMLanguageModel +) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/configurations.py new file mode 100644 index 000000000..0370862c7 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/configurations.py @@ -0,0 +1,71 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class LSTMLanguageModelConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.LSTMLanguageModel`. + + It is used to initiate an `LSTMLanguageModel` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: lstm_lm) + num_layers (int): The number of lstm layers. (default: 3) + hidden_state_dim (int): The hidden state dimension of model. (default: 512) + dropout_p (float): The dropout probability of model. (default: 0.3) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + max_length (int): Max decoding length. (default: 128) + teacher_forcing_ratio (float): The ratio of teacher forcing. (default: 1.0) + optimizer (str): Optimizer for training. (default: adam) + """ + model_name: str = field( + default="lstm_lm", metadata={"help": "Model name"} + ) + num_layers: int = field( + default=3, metadata={"help": "The number of lstm layers."} + ) + hidden_state_dim: int = field( + default=512, metadata={"help": "The hidden state dimension of model."} + ) + dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of model."} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + teacher_forcing_ratio: float = field( + default=1.0, metadata={"help": "The ratio of teacher forcing. 
"} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/model.py new file mode 100644 index 000000000..f1a7fe36f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/lstm_lm/model.py @@ -0,0 +1,62 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from openspeech.lm.lstm_lm import LSTMForLanguageModel +from openspeech.models import register_model +from openspeech.models.lstm_lm.configurations import LSTMLanguageModelConfigs +from openspeech.models.openspeech_language_model import OpenspeechLanguageModel +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_model('lstm_lm', dataclass=LSTMLanguageModelConfigs) +class LSTMLanguageModel(OpenspeechLanguageModel): + r""" + LSTM language model. + Paper: http://www-i6.informatik.rwth-aachen.de/publications/download/820/Sundermeyer-2012.pdf + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be + a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + + Returns: + outputs (dict): Result of model predictions. 
+ """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(LSTMLanguageModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.lm = LSTMForLanguageModel( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + hidden_state_dim=self.configs.model.hidden_state_dim, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + dropout_p=self.configs.model.dropout_p, + num_layers=self.configs.model.num_layers, + rnn_type=self.configs.model.rnn_type, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_ctc_model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_ctc_model.py new file mode 100644 index 000000000..67eb6f365 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_ctc_model.py @@ -0,0 +1,174 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +from collections import OrderedDict +from typing import Dict + +from openspeech.models import OpenspeechModel +from openspeech.tokenizers.tokenizer import Tokenizer + + +class OpenspeechCTCModel(OpenspeechModel): + r""" + Base class for OpenSpeech's encoder-only models (ctc-model). + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + ouputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(OpenspeechCTCModel, self).__init__(configs, tokenizer) + self.encoder = None + self.decoder = None + + def set_beam_decoder(self, beam_size: int = 3): + """ Setting beam search decoder """ + from openspeech.search import BeamSearchCTC + self.decoder = BeamSearchCTC( + labels=self.tokenizer.labels, + blank_id=self.tokenizer.blank_id, + beam_size=beam_size, + ) + + def collect_outputs( + self, + stage: str, + logits: torch.FloatTensor, + output_lengths: torch.IntTensor, + targets: torch.IntTensor, + target_lengths: torch.IntTensor, + ) -> OrderedDict: + loss = self.criterion( + log_probs=logits.transpose(0, 1), + targets=targets[:, 1:], + input_lengths=output_lengths, + target_lengths=target_lengths, + ) + predictions = logits.max(-1)[1] + + wer = self.wer_metric(targets[:, 1:], predictions) + cer = self.cer_metric(targets[:, 1:], predictions) + + return OrderedDict({ + "loss": loss, + "wer": wer, + "cer": cer, + }) + + def forward(self, inputs: torch.FloatTensor, input_lengths: torch.IntTensor) -> Dict[str, torch.Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.IntTensor): The length of input tensor. ``(batch)`` + + Returns: + ouputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + outputs = self.encoder(inputs, input_lengths) + + if len(outputs) == 2: + logits, output_lengths = outputs + else: + logits, _, output_lengths = outputs + + if self.decoder is not None: + y_hats = self.decoder(logits) + else: + y_hats = logits.max(-1)[1] + return { + "predictions": y_hats, + "logits": logits, + "output_lengths": output_lengths, + } + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + logits, output_lengths = self.encoder(inputs, input_lengths) + return self.collect_outputs( + stage='train', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. + + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + logits, output_lengths = self.encoder(inputs, input_lengths) + return self.collect_outputs( + stage='val', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. 
+ + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + logits, output_lengths = self.encoder(inputs, input_lengths) + return self.collect_outputs( + stage='test', + logits=logits, + output_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_encoder_decoder_model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_encoder_decoder_model.py new file mode 100644 index 000000000..7ff4f7bcc --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_encoder_decoder_model.py @@ -0,0 +1,225 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from torch import Tensor +from collections import OrderedDict +from typing import Dict + +from openspeech.models import OpenspeechModel +from openspeech.utils import get_class_name +from openspeech.tokenizers.tokenizer import Tokenizer + + +class OpenspeechEncoderDecoderModel(OpenspeechModel): + r""" + Base class for OpenSpeech's encoder-decoder models. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `predictions`, `logits`, `encoder_outputs`, + `encoder_logits`, `encoder_output_lengths`. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer, ) -> None: + super(OpenspeechEncoderDecoderModel, self).__init__(configs, tokenizer) + self.teacher_forcing_ratio = configs.model.teacher_forcing_ratio + self.encoder = None + self.decoder = None + self.criterion = self.configure_criterion(self.configs.criterion.criterion_name) + + def set_beam_decoder(self, beam_size: int = 3): + raise NotImplementedError + + def collect_outputs( + self, + stage: str, + logits: Tensor, + encoder_logits: Tensor, + encoder_output_lengths: Tensor, + targets: Tensor, + target_lengths: Tensor, + ) -> OrderedDict: + cross_entropy_loss, ctc_loss = None, None + + if get_class_name(self.criterion) == "JointCTCCrossEntropyLoss": + loss, ctc_loss, cross_entropy_loss = self.criterion( + encoder_logits=encoder_logits.transpose(0, 1), + logits=logits, + output_lengths=encoder_output_lengths, + targets=targets[:, 1:], + target_lengths=target_lengths, + ) + elif get_class_name(self.criterion) == "LabelSmoothedCrossEntropyLoss" \ + or get_class_name(self.criterion) == "CrossEntropyLoss": + loss = self.criterion(logits, targets[:, 1:]) + else: + raise ValueError(f"Unsupported criterion: {self.criterion}") + + predictions = logits.max(-1)[1] + + wer = self.wer_metric(targets[:, 1:], predictions) + cer = self.cer_metric(targets[:, 1:], predictions) + + return OrderedDict({ + "loss": loss, + "cross_entropy_loss": cross_entropy_loss, + "ctc_loss": ctc_loss, + "wer": wer, + "cer": cer, + "predictions": predictions, + "targets": targets, + "logits": logits + }) + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `predictions`, `logits`, `encoder_outputs`, + `encoder_logits`, `encoder_output_lengths`. + """ + logits = None + encoder_outputs, encoder_logits, encoder_output_lengths = self.encoder(inputs, input_lengths) + + if get_class_name(self.decoder) in ("BeamSearchLSTM", "BeamSearchTransformer"): + predictions = self.decoder(encoder_outputs, encoder_output_lengths) + else: + logits = self.decoder( + encoder_outputs=encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + teacher_forcing_ratio=0.0, + ) + predictions = logits.max(-1)[1] + return { + "predictions": predictions, + "logits": logits, + "encoder_outputs": encoder_outputs, + "encoder_logits": encoder_logits, + "encoder_output_lengths": encoder_output_lengths, + } + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. 
+ + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + + encoder_outputs, encoder_logits, encoder_output_lengths = self.encoder(inputs, input_lengths) + if get_class_name(self.decoder) == "TransformerDecoder": + logits = self.decoder( + encoder_outputs=encoder_outputs, + targets=targets, + encoder_output_lengths=encoder_output_lengths, + target_lengths=target_lengths, + teacher_forcing_ratio=self.teacher_forcing_ratio, + ) + else: + logits = self.decoder( + encoder_outputs=encoder_outputs, + targets=targets, + encoder_output_lengths=encoder_output_lengths, + teacher_forcing_ratio=self.teacher_forcing_ratio, + ) + + return self.collect_outputs( + stage='train', + logits=logits, + encoder_logits=encoder_logits, + encoder_output_lengths=encoder_output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. + + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + + encoder_outputs, encoder_logits, encoder_output_lengths = self.encoder(inputs, input_lengths) + logits = self.decoder( + encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + teacher_forcing_ratio=0.0, + ) + return self.collect_outputs( + stage='val', + logits=logits, + encoder_logits=encoder_logits, + encoder_output_lengths=encoder_output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. 
+ + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + + encoder_outputs, encoder_logits, encoder_output_lengths = self.encoder(inputs, input_lengths) + logits = self.decoder( + encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + teacher_forcing_ratio=0.0, + ) + return self.collect_outputs( + stage='test', + logits=logits, + encoder_logits=encoder_logits, + encoder_output_lengths=encoder_output_lengths, + targets=targets, + target_lengths=target_lengths, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_language_model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_language_model.py new file mode 100644 index 000000000..b919bc76b --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_language_model.py @@ -0,0 +1,168 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +from collections import OrderedDict +from typing import Dict + +from openspeech.models import OpenspeechModel +from openspeech.tokenizers.tokenizer import Tokenizer +from openspeech.utils import get_class_name + + +class OpenspeechLanguageModel(OpenspeechModel): + r""" + Base class for OpenSpeech's language models. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `loss`, `logits`, `targets`, `predictions`. 
+ """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(OpenspeechLanguageModel, self).__init__(configs, tokenizer) + + def build_model(self): + raise NotImplementedError + + def collect_outputs( + self, + stage: str, + logits: torch.Tensor, + targets: torch.Tensor, + ) -> OrderedDict: + perplexity = self.criterion(logits, targets[:, 1:]) + predictions = logits.max(-1)[1] + + return OrderedDict({ + "loss": perplexity, + "logits": logits, + "targets": targets, + "predictions": predictions, + }) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Dict[str, torch.Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `loss`, `logits`, `targets`, `predictions`. + """ + if get_class_name(self.lm) == 'LSTMLanguageModel': + logits = self.lm(inputs, teacher_forcing_ratio=0.0) + elif get_class_name(self.lm) == 'TransformerLanguageModel': + logits = self.lm(inputs, input_lengths) + else: + raise ValueError(f"Unsupported language model class: {get_class_name(self.lm)}") + + predictions = logits.max(-1)[1] + return { + "predictions": predictions, + "logits": logits, + } + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + train_batch (tuple): A train batch contains `inputs` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, input_lengths, targets = batch + if get_class_name(self.lm) == 'LSTMLanguageModel': + logits = self.lm(inputs, teacher_forcing_ratio=self.teacher_forcing_ratio) + elif get_class_name(self.lm) == 'TransformerLanguageModel': + logits = self.lm(inputs, input_lengths) + else: + raise ValueError(f"Unsupported language model class: {get_class_name(self.lm)}") + + return self.collect_outputs( + stage='train', + logits=logits, + targets=targets, + ) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. + + Inputs: + train_batch (tuple): A train batch contains `inputs` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, input_lengths, targets = batch + + if get_class_name(self.lm) == 'LSTMLanguageModel': + logits = self.lm(inputs, teacher_forcing_ratio=0.0) + elif get_class_name(self.lm) == 'TransformerLanguageModel': + logits = self.lm(inputs, input_lengths) + else: + raise ValueError(f"Unsupported language model class: {get_class_name(self.lm)}") + + return self.collect_outputs( + stage='val', + logits=logits, + targets=targets, + ) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. 
+ + Inputs: + train_batch (tuple): A train batch contains `inputs` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, input_lengths, targets = batch + + if get_class_name(self.lm) == 'LSTMLanguageModel': + logits = self.lm(inputs, teacher_forcing_ratio=0.0) + elif get_class_name(self.lm) == 'TransformerLanguageModel': + logits = self.lm(inputs, input_lengths) + else: + raise ValueError(f"Unsupported language model class: {get_class_name(self.lm)}") + + return self.collect_outputs( + stage='test', + logits=logits, + targets=targets, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_model.py new file mode 100644 index 000000000..c6a6ade95 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_model.py @@ -0,0 +1,188 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Dict +from torch import Tensor +from torch.optim import Adam, Adagrad, Adadelta, Adamax, AdamW, SGD, ASGD + +from openspeech.optim import AdamP, RAdam, Novograd +from openspeech.criterion import CRITERION_REGISTRY +from openspeech.metrics import WordErrorRate, CharacterErrorRate +from openspeech.optim.scheduler import SCHEDULER_REGISTRY +from openspeech.tokenizers.tokenizer import Tokenizer + + +class OpenspeechModel(nn.Module): + r""" + Super class of openspeech models. + + Note: + Do not use this class directly, use one of the sub classes. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(OpenspeechModel, self).__init__() + self.configs = configs + self.num_classes = len(tokenizer) + self.gradient_clip_val = configs.trainer.gradient_clip_val + self.tokenizer = tokenizer + self.current_val_loss = 100.0 + self.wer_metric = WordErrorRate(tokenizer) + self.cer_metric = CharacterErrorRate(tokenizer) + self.tokenizer = tokenizer + self.criterion = self.configure_criterion(configs.criterion.criterion_name) + + def build_model(self): + raise NotImplementedError + + def set_beam_decoder(self, beam_size: int = 3): + raise NotImplementedError + + def info(self, dictionary: dict) -> None: + r""" + Logging information from dictionary. + + Args: + dictionary (dict): dictionary contains information. + """ + for key, value in dictionary.items(): + print(key, value) + + def forward(self, inputs: torch.FloatTensor, input_lengths: torch.LongTensor) -> Dict[str, Tensor]: + r""" + Forward propagate a `inputs` and `targets` pair for inference. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. + """ + raise NotImplementedError + + def training_step(self, batch: tuple, batch_idx: int): + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + raise NotImplementedError + + def validation_step(self, batch: tuple, batch_idx: int): + r""" + Forward propagate a `inputs` and `targets` pair for validation. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + raise NotImplementedError + + def test_step(self, batch: tuple, batch_idx: int): + r""" + Forward propagate a `inputs` and `targets` pair for test. + + Inputs: + batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + raise NotImplementedError + + def configure_optimizers(self): + r""" + Choose what optimizers and learning-rate schedulers to use in your optimization. + + + Returns: + - **Dictionary** - The first item has multiple optimizers, and the second has multiple LR schedulers (or multiple ``lr_dict``). + """ + SUPPORTED_OPTIMIZERS = { + "adam": Adam, + "adamp": AdamP, + "radam": RAdam, + "adagrad": Adagrad, + "adadelta": Adadelta, + "adamax": Adamax, + "adamw": AdamW, + "sgd": SGD, + "asgd": ASGD, + "novograd": Novograd, + } + + assert self.configs.model.optimizer in SUPPORTED_OPTIMIZERS.keys(), \ + f"Unsupported Optimizer: {self.configs.model.optimizer}\n" \ + f"Supported Optimizers: {SUPPORTED_OPTIMIZERS.keys()}" + + optimizer = SUPPORTED_OPTIMIZERS[self.configs.model.optimizer]( + self.parameters(), + lr=self.configs.lr_scheduler.lr, + ) + scheduler = SCHEDULER_REGISTRY[self.configs.lr_scheduler.scheduler_name]( + optimizer, self.configs) + + return optimizer, scheduler + + def configure_criterion(self, criterion_name: str) -> nn.Module: + r""" + Configure criterion for training. 
+ + Args: + criterion_name (str): name of criterion + + Returns: + criterion (nn.Module): criterion for training + """ + if criterion_name in ('joint_ctc_cross_entropy', 'label_smoothed_cross_entropy'): + return CRITERION_REGISTRY[criterion_name]( + configs=self.configs, + num_classes=self.num_classes, + tokenizer=self.tokenizer, + ) + else: + return CRITERION_REGISTRY[criterion_name]( + configs=self.configs, + tokenizer=self.tokenizer, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_transducer_model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_transducer_model.py new file mode 100644 index 000000000..0b74c7dc6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/openspeech_transducer_model.py @@ -0,0 +1,287 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +import warnings +from torch import Tensor +from collections import OrderedDict +from typing import Tuple, Dict + +from openspeech.models import OpenspeechModel +from openspeech.search import BeamSearchRNNTransducer +from openspeech.modules import Linear +from openspeech.utils import get_class_name +from openspeech.tokenizers.tokenizer import Tokenizer + + +class OpenspeechTransducerModel(OpenspeechModel): + r""" + Base class for OpenSpeech's transducer models. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + dict (dict): Result of model predictions that contains `predictions`, `logits`, `encoder_outputs`, `encoder_output_lengths` + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(OpenspeechTransducerModel, self).__init__(configs, tokenizer) + self.encoder = None + self.decoder = None + self.decode = self.greedy_decode + + if hasattr(self.configs.model, "encoder_dim"): + in_features = self.configs.model.encoder_dim + self.configs.model.decoder_output_dim + elif hasattr(self.configs.model, "output_dim"): + in_features = self.configs.model.output_dim << 1 + else: + raise ValueError("Transducer model must be contain `encoder_dim` or `encoder_hidden_state_dim` config.") + + self.fc = nn.Sequential( + Linear(in_features=in_features, out_features=in_features), + nn.Tanh(), + Linear(in_features=in_features, out_features=self.num_classes), + ) + + def set_beam_decode(self, beam_size: int = 3, expand_beam: float = 2.3, state_beam: float = 4.6): + """ Setting beam search decode """ + self.decode = BeamSearchRNNTransducer( + joint=self.joint, + decoder=self.decoder, + beam_size=beam_size, + expand_beam=expand_beam, + state_beam=state_beam, + blank_id=self.tokenizer.blank_id, + ) + + def collect_outputs( + self, + stage: str, + logits: torch.FloatTensor, + input_lengths: torch.IntTensor, + targets: torch.IntTensor, + target_lengths: torch.IntTensor, + ) -> OrderedDict: + predictions = logits.max(-1)[1] + + loss = self.criterion( + logits=logits, + targets=targets[:, 1:].contiguous().int(), + input_lengths=input_lengths.int(), + target_lengths=target_lengths.int(), + ) + + return OrderedDict({ + "loss": loss, + "predictions": predictions, + "targets": targets, + "logits": logits, + }) + + def _expand_for_joint(self, encoder_outputs: Tensor, decoder_outputs: Tensor) -> Tuple[Tensor, Tensor]: + input_length = encoder_outputs.size(1) + target_length = decoder_outputs.size(1) + + encoder_outputs = encoder_outputs.unsqueeze(2) + decoder_outputs = decoder_outputs.unsqueeze(1) + + encoder_outputs = encoder_outputs.repeat([1, 1, target_length, 1]) + decoder_outputs = decoder_outputs.repeat([1, input_length, 1, 1]) + return encoder_outputs, decoder_outputs + + def joint(self, encoder_outputs: Tensor, decoder_outputs: Tensor) -> Tensor: + r""" + Joint `encoder_outputs` and `decoder_outputs`. + + Args: + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + decoder_outputs (torch.FloatTensor): A output sequence of decoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + + Returns: + outputs (torch.FloatTensor): outputs of joint `encoder_outputs` and `decoder_outputs`.. + """ + if encoder_outputs.dim() == 3 and decoder_outputs.dim() == 3: + encoder_outputs, decoder_outputs = self._expand_for_joint(encoder_outputs, decoder_outputs) + else: + assert encoder_outputs.dim() == decoder_outputs.dim() + + outputs = torch.cat((encoder_outputs, decoder_outputs), dim=-1) + outputs = self.fc(outputs).log_softmax(dim=-1) + + return outputs + + def greedy_decode(self, encoder_outputs: Tensor, max_length: int) -> Tensor: + r""" + Decode `encoder_outputs`. + + Args: + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + max_length (int): max decoding time step + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. 
+ """ + outputs = list() + + for encoder_output in encoder_outputs: + pred_tokens = list() + decoder_input = encoder_output.new_zeros(1, 1).fill_(self.decoder.sos_id).long() + decoder_output, hidden_state = self.decoder(decoder_input) + + for t in range(max_length): + step_output = self.joint(encoder_output[t].view(-1), decoder_output.view(-1)) + + pred_token = step_output.argmax(dim=0) + pred_token = int(pred_token.item()) + pred_tokens.append(pred_token) + + decoder_input = torch.LongTensor([[pred_token]]) + if torch.cuda.is_available(): + decoder_input = decoder_input.cuda() + + decoder_output, hidden_state = self.decoder( + decoder_input, hidden_states=hidden_state + ) + + outputs.append(torch.LongTensor(pred_tokens)) + + return torch.stack(outputs, dim=0) + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Dict[str, Tensor]: + r""" + Decode `encoder_outputs`. + + Args: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + dict (dict): Result of model predictions that contains `predictions`, + `encoder_outputs`, `encoder_output_lengths` + """ + if get_class_name(self.encoder) in ["ConformerEncoder", "ContextNetEncoder"]: + encoder_outputs, _, output_lengths = self.encoder(inputs, input_lengths) + else: + encoder_outputs, output_lengths = self.encoder(inputs, input_lengths) + max_length = encoder_outputs.size(1) + + predictions = self.decode(encoder_outputs, max_length) + return { + "predictions": predictions, + "encoder_outputs": encoder_outputs, + "encoder_output_lengths": output_lengths, + } + + def training_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for training. + + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + + if get_class_name(self.encoder) in ["ConformerEncoder", "ContextNetEncoder"]: + encoder_outputs, _, output_lengths = self.encoder(inputs, input_lengths) + else: + encoder_outputs, output_lengths = self.encoder(inputs, input_lengths) + + decoder_outputs, _ = self.decoder(targets, target_lengths) + logits = self.joint(encoder_outputs, decoder_outputs) + + return self.collect_outputs( + 'train', + logits=logits, + input_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def validation_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for validation. 
+ + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + + if get_class_name(self.encoder) in ["ConformerEncoder", "ContextNetEncoder"]: + encoder_outputs, _, output_lengths = self.encoder(inputs, input_lengths) + else: + encoder_outputs, output_lengths = self.encoder(inputs, input_lengths) + + decoder_outputs, _ = self.decoder(targets, target_lengths) + logits = self.joint(encoder_outputs, decoder_outputs) + + return self.collect_outputs( + 'val', + logits=logits, + input_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) + + def test_step(self, batch: tuple, batch_idx: int) -> OrderedDict: + r""" + Forward propagate a `inputs` and `targets` pair for test. + + Inputs: + train_batch (tuple): A train batch contains `inputs`, `targets`, `input_lengths`, `target_lengths` + batch_idx (int): The index of batch + + Returns: + loss (torch.Tensor): loss for training + """ + inputs, targets, input_lengths, target_lengths = batch + + if get_class_name(self.encoder) in ["ConformerEncoder", "ContextNetEncoder"]: + encoder_outputs, _, output_lengths = self.encoder(inputs, input_lengths) + else: + encoder_outputs, output_lengths = self.encoder(inputs, input_lengths) + + decoder_outputs, _ = self.decoder(targets, target_lengths) + logits = self.joint(encoder_outputs, decoder_outputs) + + return self.collect_outputs( + 'test', + logits=logits, + input_lengths=output_lengths, + targets=targets, + target_lengths=target_lengths, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/__init__.py new file mode 100644 index 000000000..ca88df541 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/__init__.py @@ -0,0 +1,32 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
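A shape-only sketch of the `_expand_for_joint` step used by `OpenspeechTransducerModel.joint` above: both sequences are broadcast to a shared (batch, enc_len, dec_len, dim) grid before concatenation. The sizes below are illustrative assumptions, not values taken from this patch.

import torch

# Illustrative sizes only; real shapes come from the encoder/decoder outputs.
batch, enc_len, dec_len, dim = 2, 5, 3, 4
encoder_outputs = torch.randn(batch, enc_len, dim)
decoder_outputs = torch.randn(batch, dec_len, dim)

# Mirrors _expand_for_joint: unsqueeze + repeat so both tensors share the
# (batch, enc_len, dec_len, dim) layout expected by the joint network.
enc = encoder_outputs.unsqueeze(2).repeat(1, 1, dec_len, 1)
dec = decoder_outputs.unsqueeze(1).repeat(1, enc_len, 1, 1)

joint_inputs = torch.cat((enc, dec), dim=-1)
print(joint_inputs.shape)  # torch.Size([2, 5, 3, 8]), then fed through the fc stack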
+ +from .configurations import ( + QuartzNet15x5Configs, + QuartzNet5x5Configs, + QuartzNet10x5Configs +) +from .model import ( + QuartzNet10x5Model, + QuartzNet15x5Model, + QuartzNet5x5Model +) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/configurations.py new file mode 100644 index 000000000..451193691 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/configurations.py @@ -0,0 +1,193 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class QuartzNet5x5Configs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.QuartzNet5x5`. + + It is used to initiated an `QuartzNet5x5` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: quartznet5x5) + num_blocks (int): Number of quartznet blocks (default: 5) + num_sub_blocks (int): Number of quartznet sub blocks (default: 5) + in_channels (str): Output channels of jasper block's convolution + out_channels (str): Output channels of jasper block's convolution + kernel_size (str): Kernel size of jasper block's convolution + dilation (str): Dilation of jasper block's convolution + dropout_p (str): Dropout probability + optimizer (str): Optimizer for training. 
+ """ + model_name: str = field( + default="quartznet5x5", metadata={"help": "Model name"} + ) + num_blocks: int = field( + default=5, metadata={"help": "Number of quartznet blocks"} + ) + num_sub_blocks: int = field( + default=5, metadata={"help": "Number of quartznet sub blocks"} + ) + in_channels: str = field( + default="(None, 256, 256, 256, 512, 512, 512, 512, 1024)", + metadata={"help": "Input channels of jasper blocks"} + ) + out_channels: str = field( + default="(256, 256, 256, 512, 512, 512, 512, 1024, None)", + metadata={"help": "Output channels of jasper block's convolution"} + ) + kernel_size: str = field( + default="(33, 33, 39, 51, 63, 75, 87, 1, 1)", + metadata={"help": "Kernel size of jasper block's convolution"} + ) + dilation: str = field( + default="(1, 1, 1, 1, 1, 1, 1, 1, 2)", + metadata={"help": "Dilation of jasper block's convolution"} + ) + dropout_p: str = field( + default="(0.2, None, None, None, None, None, 0.2, 0.2, 0.2)", + metadata={"help": "Dropout probability"} + ) + optimizer: str = field( + default="novograd", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class QuartzNet10x5Configs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.QuartzNet10x5`. + + It is used to initiated an `QuartzNet10x5` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: quartznet5x5) + num_blocks (int): Number of quartznet blocks (default: 10) + num_sub_blocks (int): Number of quartznet sub blocks (default: 5) + in_channels (str): Output channels of jasper block's convolution + out_channels (str): Output channels of jasper block's convolution + kernel_size (str): Kernel size of jasper block's convolution + dilation (str): Dilation of jasper block's convolution + dropout_p (str): Dropout probability + optimizer (str): Optimizer for training. + """ + model_name: str = field( + default="quartznet10x5", metadata={"help": "Model name"} + ) + num_blocks: int = field( + default=10, metadata={"help": "Number of quartznet blocks"} + ) + num_sub_blocks: int = field( + default=5, metadata={"help": "Number of quartznet sub blocks"} + ) + in_channels: str = field( + default="(None, 256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 1024)", + metadata={"help": "Input channels of jasper blocks"} + ) + out_channels: str = field( + default="(256, 256, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 1024, None)", + metadata={"help": "Output channels of jasper block's convolution"} + ) + kernel_size: str = field( + default="(33, 33, 33, 39, 39, 51, 51, 63, 63, 75, 75, 87, 1, 1)", + metadata={"help": "Kernel size of jasper block's convolution"} + ) + dilation: str = field( + default="(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2)", + metadata={"help": "Dilation of jasper block's convolution"} + ) + dropout_p: str = field( + default="(0.2, None, None, None, None, None, None, None, None, None, None, 0.2, 0.2, 0.2)", + metadata={"help": "Dropout probability"} + ) + optimizer: str = field( + default="novograd", metadata={"help": "Optimizer for training."} + ) + + +@dataclass +class QuartzNet15x5Configs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.QuartzNet15x5`. + + It is used to initiated an `QuartzNet15x5` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. 
+ + Args: + model_name (str): Model name (default: quartznet15x5) + num_blocks (int): Number of quartznet blocks (default: 15) + num_sub_blocks (int): Number of quartznet sub blocks (default: 5) + in_channels (str): Output channels of jasper block's convolution + out_channels (str): Output channels of jasper block's convolution + kernel_size (str): Kernel size of jasper block's convolution + dilation (str): Dilation of jasper block's convolution + dropout_p (str): Dropout probability + optimizer (str): Optimizer for training. + """ + model_name: str = field( + default="quartznet15x5", metadata={"help": "Model name"} + ) + num_blocks: int = field( + default=15, metadata={"help": "Number of quartznet5x5 blocks"} + ) + num_sub_blocks: int = field( + default=5, metadata={"help": "Number of quartznet5x5 sub blocks"} + ) + in_channels: str = field( + default="(None, 256, 256, 256, 256, 256, 256, 256, " + "512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 1024)", + metadata={"help": "Input channels of jasper blocks"} + ) + out_channels: str = field( + default="(256, 256, 256, 256, 256, 256, 256, " + "512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 1024, None)", + metadata={"help": "Output channels of jasper block's convolution"} + ) + kernel_size: str = field( + default="(33, 33, 33, 33, 39, 39, 39, 51, 51, 51, 63, 63, 63, 75, 75, 75, 87, 1, 1)", + metadata={"help": "Kernel size of jasper block's convolution"} + ) + dilation: str = field( + default="(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2)", + metadata={"help": "Dilation of jasper block's convolution"} + ) + dropout_p: str = field( + default="(0.2, None, None, None, None, None, None, None, None, " + "None, None, None, None, None, None, None, 0.2, 0.2, 0.2)", + metadata={"help": "Dropout probability"} + ) + optimizer: str = field( + default="novograd", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/model.py new file mode 100644 index 000000000..b5673366b --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/quartznet/model.py @@ -0,0 +1,103 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
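The QuartzNet configuration dataclasses above encode per-block hyperparameters (channels, kernel sizes, dilations, dropout) as string-valued tuples. How the QuartzNet encoder parses them is not shown in this hunk; a plausible, self-contained way to recover the tuples is `ast.literal_eval`, sketched here with defaults copied from `QuartzNet5x5Configs`.

from ast import literal_eval

# Default strings copied from QuartzNet5x5Configs above.
kernel_size = literal_eval("(33, 33, 39, 51, 63, 75, 87, 1, 1)")
dropout_p = literal_eval("(0.2, None, None, None, None, None, 0.2, 0.2, 0.2)")

print(len(kernel_size), kernel_size[0])  # 9 33
print(dropout_p[1] is None)              # True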
+ + +from openspeech.models import register_model +from openspeech.models import OpenspeechCTCModel +from openspeech.encoders.quartznet import QuartzNet +from openspeech.tokenizers.tokenizer import Tokenizer +from openspeech.models.quartznet.configurations import ( + QuartzNet5x5Configs, + QuartzNet10x5Configs, + QuartzNet15x5Configs, +) + + +@register_model('quartznet5x5', dataclass=QuartzNet5x5Configs) +class QuartzNet5x5Model(OpenspeechCTCModel): + r""" + QUARTZNET: DEEP AUTOMATIC SPEECH RECOGNITION WITH 1D TIME-CHANNEL SEPARABLE CONVOLUTIONS + Paper: https://arxiv.org/abs/1910.10261.pdf + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(QuartzNet5x5Model, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = QuartzNet( + configs=self.configs, + input_dim=self.configs.audio.num_mels, + num_classes=self.num_classes, + ) + + +@register_model('quartznet10x5', dataclass=QuartzNet10x5Configs) +class QuartzNet10x5Model(QuartzNet5x5Model): + r""" + QUARTZNET: DEEP AUTOMATIC SPEECH RECOGNITION WITH 1D TIME-CHANNEL SEPARABLE CONVOLUTIONS + Paper: https://arxiv.org/abs/1910.10261.pdf + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(QuartzNet10x5Model, self).__init__(configs, tokenizer) + + +@register_model('quartznet15x5', dataclass=QuartzNet15x5Configs) +class QuartzNet15x5Model(QuartzNet5x5Model): + r""" + QUARTZNET: DEEP AUTOMATIC SPEECH RECOGNITION WITH 1D TIME-CHANNEL SEPARABLE CONVOLUTIONS + Paper: https://arxiv.org/abs/1910.10261.pdf + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokeizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. 
``(batch)`` + + Returns: + outputs (dict): Result of model predictions that contains `y_hats`, `logits`, `output_lengths` + """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(QuartzNet15x5Model, self).__init__(configs, tokenizer) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/__init__.py new file mode 100644 index 000000000..20514a8b2 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .configurations import RNNTransducerConfigs +from .model import RNNTransducerModel \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/configurations.py new file mode 100644 index 000000000..0efb842a4 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/configurations.py @@ -0,0 +1,83 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
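The model classes in this patch hook into openspeech through `@register_model(name, dataclass=...)`; the registry behind that decorator lives elsewhere in the package and is not part of this hunk. The toy sketch below only illustrates the general decorator-registry pattern, with hypothetical names rather than openspeech's actual implementation.

# Hypothetical, minimal registry; not openspeech's real register_model.
TOY_MODEL_REGISTRY = {}

def toy_register_model(name, dataclass=None):
    def wrapper(cls):
        TOY_MODEL_REGISTRY[name] = (cls, dataclass)
        return cls
    return wrapper

@toy_register_model("demo_transducer", dataclass=dict)
class DemoTransducer:
    pass

print(TOY_MODEL_REGISTRY["demo_transducer"])  # (<class 'DemoTransducer'>, <class 'dict'>)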
+ +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class RNNTransducerConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.RNNTransducer`. + + It is used to initiated an `RNNTransducer` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: transformer_transducer) + encoder_hidden_state_dim (int): Hidden state dimension of encoder (default: 312) + decoder_hidden_state_dim (int): Hidden state dimension of decoder (default: 512) + num_encoder_layers (int): The number of encoder layers. (default: 4) + num_decoder_layers (int): The number of decoder layers. (default: 1) + encoder_dropout_p (float): The dropout probability of encoder. (default: 0.2) + decoder_dropout_p (float): The dropout probability of decoder. (default: 0.2) + bidirectional (bool): If True, becomes a bidirectional encoders (default: True) + rnn_type (str): Type of rnn cell (rnn, lstm, gru) (default: lstm) + output_dim (int): dimension of model output. (default: 512) + optimizer (str): Optimizer for training. (default: adam) + """ + model_name: str = field( + default="rnn_transducer", metadata={"help": "Model name"} + ) + encoder_hidden_state_dim: int = field( + default=320, metadata={"help": "Dimension of encoder."} + ) + decoder_hidden_state_dim: int = field( + default=512, metadata={"help": "Dimension of decoder."} + ) + num_encoder_layers: int = field( + default=4, metadata={"help": "The number of encoder layers."} + ) + num_decoder_layers: int = field( + default=1, metadata={"help": "The number of decoder layers."} + ) + encoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of encoder."} + ) + decoder_dropout_p: float = field( + default=0.2, metadata={"help": "The dropout probability of decoder."} + ) + bidirectional: bool = field( + default=True, metadata={"help": "If True, becomes a bidirectional encoders"} + ) + rnn_type: str = field( + default="lstm", metadata={"help": "Type of rnn cell (rnn, lstm, gru)"} + ) + output_dim: int = field( + default=512, metadata={"help": "Dimension of outputs"} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/model.py new file mode 100644 index 000000000..f3aba75b9 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/rnn_transducer/model.py @@ -0,0 +1,75 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from openspeech.models import register_model +from openspeech.models import OpenspeechTransducerModel +from openspeech.decoders import RNNTransducerDecoder +from openspeech.encoders import RNNTransducerEncoder +from openspeech.models.rnn_transducer.configurations import RNNTransducerConfigs +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_model('rnn_transducer', dataclass=RNNTransducerConfigs) +class RNNTransducerModel(OpenspeechTransducerModel): + r""" + The RNN-Transducer is a form of sequence-to-sequence model that does not employ attention mechanisms. + Unlike most sequence-to-sequence models, which typically need to process the entire input sequence + (the waveform in our case) to produce an output (the sentence), the RNN-T continuously processes input samples and + streams output symbols, a property that is welcome for speech dictation. In our implementation, + the output symbols are the characters of the alphabet. + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions.
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(RNNTransducerModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = RNNTransducerEncoder( + input_dim=self.configs.audio.num_mels, + hidden_state_dim=self.configs.model.encoder_hidden_state_dim, + output_dim=self.configs.model.output_dim, + num_layers=self.configs.model.num_encoder_layers, + rnn_type=self.configs.model.rnn_type, + dropout_p=self.configs.model.encoder_dropout_p, + ) + self.decoder = RNNTransducerDecoder( + num_classes=self.num_classes, + hidden_state_dim=self.configs.model.decoder_hidden_state_dim, + output_dim=self.configs.model.output_dim, + num_layers=self.configs.model.num_decoder_layers, + rnn_type=self.configs.model.rnn_type, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + dropout_p=self.configs.model.decoder_dropout_p, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/__init__.py new file mode 100644 index 000000000..f1a4c1fad --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
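One detail worth noting in the `RNNTransducerModel.build_model` wiring above: encoder and decoder both project to `configs.model.output_dim`, which is what allows the transducer base class to size its joint network as `output_dim << 1` when no `encoder_dim` is configured. A quick check with the `RNNTransducerConfigs` defaults:

# Defaults taken from RNNTransducerConfigs above.
output_dim = 512

# The joint network concatenates encoder and decoder features of equal width,
# so its input dimension is twice output_dim (output_dim << 1 in the base class).
joint_in_features = output_dim << 1
assert joint_in_features == 1024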
+ +from .configurations import TransformerLanguageModelConfigs +from .model import TransformerLanguageModel diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/configurations.py new file mode 100644 index 000000000..48255d033 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/configurations.py @@ -0,0 +1,71 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class TransformerLanguageModelConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.TransformerLanguageModel`. + + It is used to initiate a `TransformerLanguageModel` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: transformer_lm) + num_layers (int): The number of transformer layers. (default: 6) + d_model (int): The dimension of model. (default: 768) + dropout_p (float): The dropout probability of encoder. (default: 0.3) + d_ff (int): Dimension of feed forward network. (default: 1536) + num_attention_heads (int): The number of attention heads. (default: 8) + max_length (int): Max decoding length. (default: 128) + optimizer (str): Optimizer for training.
(default: adam) + """ + model_name: str = field( + default="transformer_lm", metadata={"help": "Model name"} + ) + num_layers: int = field( + default=6, metadata={"help": "The number of encoder layers."} + ) + d_model: int = field( + default=768, metadata={"help": "The dimension of model."} + ) + d_ff: int = field( + default=1536, metadata={"help": "The dimenstion of feed forward network."} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "The number of attention heads."} + ) + dropout_p: float = field( + default=0.3, metadata={"help": "The dropout probability of encoder."} + ) + max_length: int = field( + default=128, metadata={"help": "Max decoding length."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/model.py new file mode 100644 index 000000000..39938ed84 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_lm/model.py @@ -0,0 +1,62 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from openspeech.lm.transformer_lm import TransformerForLanguageModel +from openspeech.models import register_model, OpenspeechModel +from openspeech.models.transformer_lm.configurations import TransformerLanguageModelConfigs +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_model('transformer_lm', dataclass=TransformerLanguageModelConfigs) +class TransformerLanguageModel(OpenspeechModel): + r""" + Transformer language model. + Paper: https://arxiv.org/abs/1904.09408 + + Args: + configs (DictConfig): configuration set. + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(TransformerLanguageModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.lm = TransformerForLanguageModel( + num_classes=self.num_classes, + max_length=self.configs.model.max_length, + d_model=self.configs.model.d_model, + d_ff=self.configs.model.d_ff, + num_attention_heads=self.configs.model.num_attention_heads, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + dropout_p=self.configs.model.dropout_p, + num_layers=self.configs.model.num_layers, + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/__init__.py new file mode 100644 index 000000000..dc26f6666 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/__init__.py @@ -0,0 +1,24 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .configurations import TransformerTransducerConfigs +from .model import TransformerTransducerModel diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/configurations.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/configurations.py new file mode 100644 index 000000000..b0796f196 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/configurations.py @@ -0,0 +1,92 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import OpenspeechDataclass + + +@dataclass +class TransformerTransducerConfigs(OpenspeechDataclass): + r""" + This is the configuration class to store the configuration of + a :class:`~openspeech.models.TransformerTransducer`. + + It is used to initiated an `TransformerTransducer` model. + + Configuration objects inherit from :class: `~openspeech.dataclass.configs.OpenspeechDataclass`. + + Args: + model_name (str): Model name (default: transformer_transducer) + extractor (str): The CNN feature extractor. (default: conv2d_subsample) + d_model (int): Dimension of model. (default: 512) + d_ff (int): Dimension of feed forward network. (default: 2048) + num_attention_heads (int): The number of attention heads. (default: 8) + num_audio_layers (int): The number of audio layers. (default: 18) + num_label_layers (int): The number of label layers. (default: 2) + audio_dropout_p (float): The dropout probability of encoder. (default: 0.1) + label_dropout_p (float): The dropout probability of decoder. (default: 0.1) + decoder_hidden_state_dim (int): Hidden state dimension of decoder (default: 512) + decoder_output_dim (int): dimension of model output. (default: 512) + conv_kernel_size (int): Kernel size of convolution layer. (default: 31) + max_positional_length (int): Max length of positional encoding. (default: 5000) + optimizer (str): Optimizer for training. (default: adam) + """ + model_name: str = field( + default="transformer_transducer", metadata={"help": "Model name"} + ) + encoder_dim: int = field( + default=512, metadata={"help": "Dimension of encoder name"} + ) + d_ff: int = field( + default=2048, metadata={"help": "Dimension of feed forward network"} + ) + num_audio_layers: int = field( + default=18, metadata={"help": "Number of audio layers"} + ) + num_label_layers: int = field( + default=2, metadata={"help": "Number of label layers"} + ) + num_attention_heads: int = field( + default=8, metadata={"help": "Number of attention heads"} + ) + audio_dropout_p: float = field( + default=0.1, metadata={"help": "Dropout probability of audio layer"} + ) + label_dropout_p: float = field( + default=0.1, metadata={"help": "Dropout probability of label layer"} + ) + decoder_hidden_state_dim: int = field( + default=512, metadata={"help": "Hidden state dimension of decoder"} + ) + decoder_output_dim: int = field( + default=512, metadata={"help": "Dimension of model output."} + ) + conv_kernel_size: int = field( + default=31, metadata={"help": "Kernel size of convolution layer."} + ) + max_positional_length: int = field( + default=5000, metadata={"help": "Max length of positional encoding."} + ) + optimizer: str = field( + default="adam", metadata={"help": "Optimizer for training."} + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/model.py b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/model.py new file mode 100644 index 000000000..95911e237 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/models/transformer_transducer/model.py @@ -0,0 +1,119 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# 
Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +from torch import Tensor +from typing import Dict +from collections import OrderedDict + +from openspeech.models import register_model, OpenspeechTransducerModel +from openspeech.decoders import TransformerTransducerDecoder +from openspeech.encoders import TransformerTransducerEncoder +from openspeech.search import BeamSearchTransformerTransducer +from openspeech.models.transformer_transducer.configurations import TransformerTransducerConfigs +from openspeech.tokenizers.tokenizer import Tokenizer + + +@register_model('transformer_transducer', dataclass=TransformerTransducerConfigs) +class TransformerTransducerModel(OpenspeechTransducerModel): + r""" + Transformer-Transducer is that every layer is identical for both audio and label encoders. + Unlike the basic transformer structure, the audio encoder and label encoder are separate. + So, the alignment is handled by a separate forward-backward process within the RNN-T architecture. + And we replace the LSTM encoders in RNN-T architecture with Transformer encoders. + + Args: + configs (DictConfig): configuraion set + tokenizer (Tokenizer): tokenizer is in charge of preparing the inputs for a model. + + Inputs: + inputs (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + input_lengths (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + outputs (dict): Result of model predictions. 
+ """ + + def __init__(self, configs, tokenizer: Tokenizer) -> None: + super(TransformerTransducerModel, self).__init__(configs, tokenizer) + + def build_model(self): + self.encoder = TransformerTransducerEncoder( + input_size=self.configs.audio.num_mels, + model_dim=self.configs.model.encoder_dim, + d_ff=self.configs.model.d_ff, + num_layers=self.configs.model.num_audio_layers, + num_heads=self.configs.model.num_attention_heads, + dropout=self.configs.model.audio_dropout_p, + max_positional_length=self.configs.model.max_positional_length, + ) + self.decoder = TransformerTransducerDecoder( + num_classes=self.num_classes, + model_dim=self.configs.model.encoder_dim, + d_ff=self.configs.model.d_ff, + num_layers=self.configs.model.num_label_layers, + num_heads=self.configs.model.num_attention_heads, + dropout=self.configs.model.label_dropout_p, + max_positional_length=self.configs.model.max_positional_length, + pad_id=self.tokenizer.pad_id, + sos_id=self.tokenizer.sos_id, + eos_id=self.tokenizer.eos_id, + ) + + def set_beam_decode(self, beam_size: int = 3, expand_beam: float = 2.3, state_beam: float = 4.6): + """ Setting beam search decode """ + self.decode = BeamSearchTransformerTransducer( + joint=self.joint, + decoder=self.decoder, + beam_size=beam_size, + expand_beam=expand_beam, + state_beam=state_beam, + blank_id=self.tokenizer.blank_id, + ) + + def greedy_decode(self, encoder_outputs: Tensor, max_length: int) -> Tensor: + r""" + Decode `encoder_outputs`. + + Args: + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size ``(seq_length, dimension)`` + max_length (int): max decoding time step + + Returns: + y_hats (torch.IntTensor): model's predictions. + """ + batch = encoder_outputs.size(0) + pred_tokens = list() + + targets = encoder_outputs.new_tensor([self.decoder.sos_id] * batch, dtype=torch.long) + + for i in range(max_length): + decoder_output, _ = self.decoder(targets, None) + decoder_output = decoder_output.squeeze(1) + encoder_output = encoder_outputs[:, i, :] + targets = self.joint(encoder_output, decoder_output) + targets = targets.max(1)[1] + pred_tokens.append(targets) + + pred_tokens = torch.stack(pred_tokens, dim=1) + + return torch.LongTensor(pred_tokens) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/__init__.py new file mode 100644 index 000000000..a8cfa9112 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/__init__.py @@ -0,0 +1,97 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .wrapper import Linear, View, Transpose +from .additive_attention import AdditiveAttention +from .add_normalization import AddNorm +from .batchnorm_relu_rnn import BNReluRNN +from .conformer_attention_module import MultiHeadedSelfAttentionModule +from .conformer_block import ConformerBlock +from .conformer_convolution_module import ConformerConvModule +from .conformer_feed_forward_module import FeedForwardModule +from .conv2d_extractor import Conv2dExtractor +from .conv2d_subsampling import Conv2dSubsampling +from .deepspeech2_extractor import DeepSpeech2Extractor +from .vgg_extractor import VGGExtractor +from .conv_base import BaseConv1d +from .conv_group_shuffle import ConvGroupShuffle +from .depthwise_conv1d import DepthwiseConv1d +from .glu import GLU +from .jasper_subblock import JasperSubBlock +from .jasper_block import JasperBlock +from .location_aware_attention import LocationAwareAttention +from .mask import get_attn_pad_mask, get_attn_subsequent_mask +from .mask_conv1d import MaskConv1d +from .mask_conv2d import MaskConv2d +from .multi_head_attention import MultiHeadAttention +from .pointwise_conv1d import PointwiseConv1d +from .positional_encoding import PositionalEncoding +from .positionwise_feed_forward import PositionwiseFeedForward +from .quartznet_subblock import QuartzNetSubBlock +from .quartznet_block import QuartzNetBlock +from .relative_multi_head_attention import RelativeMultiHeadAttention +from .residual_connection_module import ResidualConnectionModule +from .dot_product_attention import DotProductAttention +from .swish import Swish +from .time_channel_separable_conv1d import TimeChannelSeparableConv1d +from .transformer_embedding import TransformerEmbedding + + +__all__ = [ + "AdditiveAttention", + "AddNorm", + "BNReluRNN", + "MultiHeadAttention", + "ConformerBlock", + "ConformerConvModule", + "FeedForwardModule", + "Conv2dExtractor", + "Conv2dSubsampling", + "DeepSpeech2Extractor", + "VGGExtractor", + "BaseConv1d", + "ConvGroupShuffle", + "DepthwiseConv1d", + "GLU", + "JasperSubBlock", + "JasperBlock", + "LocationAwareAttention", + "get_attn_pad_mask", + "get_attn_subsequent_mask", + "MaskConv1d", + "MaskConv2d", + "MultiHeadAttention", + "PointwiseConv1d", + "PositionalEncoding", + "PositionwiseFeedForward", + "QuartzNetSubBlock", + "QuartzNetBlock", + "RelativeMultiHeadAttention", + "ResidualConnectionModule", + "DotProductAttention", + "Swish", + "TimeChannelSeparableConv1d", + "TransformerEmbedding", + "Linear", + "View", + "Transpose", +] diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/add_normalization.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/add_normalization.py new file mode 100644 index 000000000..6d2cd278a --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/add_normalization.py @@ -0,0 +1,44 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# 
copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn + + +class AddNorm(nn.Module): + """ + Add & Normalization layer proposed in "Attention Is All You Need". + Transformer employ a residual connection around each of the two sub-layers, + (Multi-Head Attention & Feed-Forward) followed by layer normalization. + """ + def __init__(self, sublayer: nn.Module, d_model: int = 512) -> None: + super(AddNorm, self).__init__() + self.sublayer = sublayer + self.layer_norm = nn.LayerNorm(d_model) + + def forward(self, *args): + residual = args[0] + outputs = self.sublayer(*args) + + if isinstance(outputs, tuple): + return self.layer_norm(outputs[0] + residual), outputs[1] + + return self.layer_norm(outputs + residual) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/additive_attention.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/additive_attention.py new file mode 100644 index 000000000..cb08716c8 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/additive_attention.py @@ -0,0 +1,63 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from typing import Tuple + +from openspeech.modules.wrapper import Linear + + +class AdditiveAttention(nn.Module): + r""" + Applies a additive attention (bahdanau) mechanism on the output features from the decoders. + Additive attention proposed in "Neural Machine Translation by Jointly Learning to Align and Translate" paper. + + Args: + dim (int): dimension of model + + Inputs: query, key, value + - **query** (batch_size, q_len, hidden_dim): tensor containing the output features from the decoders. 
+ - **key** (batch, k_len, d_model): tensor containing projection vector for encoders. + - **value** (batch_size, v_len, hidden_dim): tensor containing features of the encoded input sequence. + + Returns: context, attn + - **context**: tensor containing the context vector from attention mechanism. + - **attn**: tensor containing the alignment from the encoders outputs. + """ + def __init__(self, dim: int) -> None: + super(AdditiveAttention, self).__init__() + self.query_proj = Linear(dim, dim, bias=False) + self.key_proj = Linear(dim, dim, bias=False) + self.score_proj = Linear(dim, 1) + self.bias = nn.Parameter(torch.rand(dim).uniform_(-0.1, 0.1)) + + def forward(self, query: Tensor, key: Tensor, value: Tensor) -> Tuple[Tensor, Tensor]: + score = self.score_proj(torch.tanh(self.key_proj(key) + self.query_proj(query) + self.bias)).squeeze(-1) + attn = F.softmax(score, dim=-1) + context = torch.bmm(attn.unsqueeze(1), value) + + context += query + + return context, attn diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/batchnorm_relu_rnn.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/batchnorm_relu_rnn.py new file mode 100644 index 000000000..1bdc77a2e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/batchnorm_relu_rnn.py @@ -0,0 +1,84 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + + +class BNReluRNN(nn.Module): + r""" + Recurrent neural network with batch normalization layer & ReLU activation function. 
+ + Args: + input_size (int): size of input + hidden_state_dim (int): the number of features in the hidden state `h` + rnn_type (str, optional): type of RNN cell (default: gru) + bidirectional (bool, optional): if True, becomes a bidirectional encoders (defulat: True) + dropout_p (float, optional): dropout probability (default: 0.1) + + Inputs: inputs, input_lengths + - **inputs** (batch, time, dim): Tensor containing input vectors + - **input_lengths**: Tensor containing containing sequence lengths + + Returns: outputs + - **outputs**: Tensor produced by the BNReluRNN module + """ + supported_rnns = { + 'lstm': nn.LSTM, + 'gru': nn.GRU, + 'rnn': nn.RNN, + } + + def __init__( + self, + input_size: int, + hidden_state_dim: int = 512, + rnn_type: str = 'gru', + bidirectional: bool = True, + dropout_p: float = 0.1, + ): + super(BNReluRNN, self).__init__() + self.hidden_state_dim = hidden_state_dim + self.batch_norm = nn.BatchNorm1d(input_size) + rnn_cell = self.supported_rnns[rnn_type] + self.rnn = rnn_cell( + input_size=input_size, + hidden_size=hidden_state_dim, + num_layers=1, + bias=True, + batch_first=True, + dropout=dropout_p, + bidirectional=bidirectional, + ) + + def forward(self, inputs: Tensor, input_lengths: Tensor): + total_length = inputs.size(0) + + inputs = F.relu(self.batch_norm(inputs.transpose(1, 2))) + inputs = inputs.transpose(1, 2) + + outputs = nn.utils.rnn.pack_padded_sequence(inputs, input_lengths.cpu()) + outputs, hidden_states = self.rnn(outputs) + outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, total_length=total_length) + + return outputs diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_attention_module.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_attention_module.py new file mode 100644 index 000000000..975c7f812 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_attention_module.py @@ -0,0 +1,81 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
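+
+# Editor's note: an illustrative usage sketch for the MultiHeadedSelfAttentionModule
+# defined below, not part of the upstream OpenSpeech code. The hyperparameters
+# (d_model=512, num_heads=8) are assumed for the example only; per the docstring
+# below, the module maps a (batch, time, dim) tensor to a tensor of the same shape.
+#
+#     import torch
+#     mhsa = MultiHeadedSelfAttentionModule(d_model=512, num_heads=8, dropout_p=0.1)
+#     x = torch.randn(2, 100, 512)   # (batch, time, dim)
+#     out = mhsa(x)                  # -> torch.Size([2, 100, 512])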
+
+import torch.nn as nn
+from torch import Tensor
+from typing import Optional
+
+from openspeech.modules.relative_multi_head_attention import RelativeMultiHeadAttention
+from openspeech.modules.positional_encoding import PositionalEncoding
+
+
+class MultiHeadedSelfAttentionModule(nn.Module):
+    r"""
+    Conformer employs multi-headed self-attention (MHSA) while integrating an important technique from Transformer-XL,
+    the relative sinusoidal positional encoding scheme. The relative positional encoding allows the self-attention
+    module to generalize better to different input lengths, and the resulting encoder is more robust to the variance of
+    the utterance length. Conformer uses pre-norm residual units with dropout, which helps training
+    and regularizing deeper models.
+
+    Args:
+        d_model (int): The dimension of model
+        num_heads (int): The number of attention heads.
+        dropout_p (float): probability of dropout
+
+    Inputs: inputs, mask
+        - **inputs** (batch, time, dim): Tensor containing input vector
+        - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
+
+    Returns:
+        - **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
+    """
+    def __init__(
+            self,
+            d_model: int,
+            num_heads: int,
+            dropout_p: float = 0.1,
+    ) -> None:
+        super(MultiHeadedSelfAttentionModule, self).__init__()
+        self.positional_encoding = PositionalEncoding(d_model)
+        self.layer_norm = nn.LayerNorm(d_model)
+        self.attention = RelativeMultiHeadAttention(d_model, num_heads, dropout_p)
+        self.dropout = nn.Dropout(p=dropout_p)
+
+    def forward(self, inputs: Tensor, mask: Optional[Tensor] = None) -> Tensor:
+        r"""
+        Forward propagation of the Conformer multi-headed self-attention module.
+
+        Inputs: inputs, mask
+            - **inputs** (batch, time, dim): Tensor containing input vector
+            - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked
+
+        Returns:
+            - **outputs** (batch, time, dim): Tensor produced by the relative multi-headed self-attention module.
+        """
+        batch_size, seq_length, _ = inputs.size()
+        pos_embedding = self.positional_encoding(seq_length)
+        pos_embedding = pos_embedding.repeat(batch_size, 1, 1)
+
+        inputs = self.layer_norm(inputs)
+        outputs = self.attention(inputs, inputs, inputs, pos_embedding=pos_embedding, mask=mask)
+
+        return self.dropout(outputs)
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_block.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_block.py
new file mode 100644
index 000000000..585f08286
--- /dev/null
+++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_block.py
@@ -0,0 +1,110 @@
+# MIT License
+#
+# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import torch.nn as nn
+from torch import Tensor
+
+from openspeech.modules.conformer_attention_module import MultiHeadedSelfAttentionModule
+from openspeech.modules.conformer_convolution_module import ConformerConvModule
+from openspeech.modules.conformer_feed_forward_module import FeedForwardModule
+from openspeech.modules.residual_connection_module import ResidualConnectionModule
+
+
+class ConformerBlock(nn.Module):
+    r"""
+    Conformer block contains two Feed Forward modules sandwiching the Multi-Headed Self-Attention module
+    and the Convolution module. This sandwich structure is inspired by Macaron-Net, which proposes replacing
+    the original feed-forward layer in the Transformer block with two half-step feed-forward layers,
+    one before the attention layer and one after.
+
+    Args:
+        encoder_dim (int, optional): Dimension of conformer encoders
+        num_attention_heads (int, optional): Number of attention heads
+        feed_forward_expansion_factor (int, optional): Expansion factor of feed forward module
+        conv_expansion_factor (int, optional): Expansion factor of conformer convolution module
+        feed_forward_dropout_p (float, optional): Probability of feed forward module dropout
+        attention_dropout_p (float, optional): Probability of attention module dropout
+        conv_dropout_p (float, optional): Probability of conformer convolution module dropout
+        conv_kernel_size (int or tuple, optional): Size of the convolving kernel
+        half_step_residual (bool): Flag indicating whether to use a half-step residual or not
+
+    Inputs: inputs
+        - **inputs** (batch, time, dim): Tensor containing input vector
+
+    Returns: outputs
+        - **outputs** (batch, time, dim): Tensor produced by the Conformer block.
+ """ + def __init__( + self, + encoder_dim: int = 512, + num_attention_heads: int = 8, + feed_forward_expansion_factor: int = 4, + conv_expansion_factor: int = 2, + feed_forward_dropout_p: float = 0.1, + attention_dropout_p: float = 0.1, + conv_dropout_p: float = 0.1, + conv_kernel_size: int = 31, + half_step_residual: bool = True, + ) -> None: + super(ConformerBlock, self).__init__() + if half_step_residual: + self.feed_forward_residual_factor = 0.5 + else: + self.feed_forward_residual_factor = 1 + + self.sequential = nn.Sequential( + ResidualConnectionModule( + module=FeedForwardModule( + encoder_dim=encoder_dim, + expansion_factor=feed_forward_expansion_factor, + dropout_p=feed_forward_dropout_p, + ), + module_factor=self.feed_forward_residual_factor, + ), + ResidualConnectionModule( + module=MultiHeadedSelfAttentionModule( + d_model=encoder_dim, + num_heads=num_attention_heads, + dropout_p=attention_dropout_p, + ), + ), + ResidualConnectionModule( + module=ConformerConvModule( + in_channels=encoder_dim, + kernel_size=conv_kernel_size, + expansion_factor=conv_expansion_factor, + dropout_p=conv_dropout_p, + ), + ), + ResidualConnectionModule( + module=FeedForwardModule( + encoder_dim=encoder_dim, + expansion_factor=feed_forward_expansion_factor, + dropout_p=feed_forward_dropout_p, + ), + module_factor=self.feed_forward_residual_factor, + ), + nn.LayerNorm(encoder_dim), + ) + + def forward(self, inputs: Tensor) -> Tensor: + return self.sequential(inputs) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_convolution_module.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_convolution_module.py new file mode 100644 index 000000000..4adf07fbc --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_convolution_module.py @@ -0,0 +1,83 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from openspeech.modules.glu import GLU +from openspeech.modules.swish import Swish +from openspeech.modules.pointwise_conv1d import PointwiseConv1d +from openspeech.modules.depthwise_conv1d import DepthwiseConv1d +from openspeech.modules.wrapper import Transpose + + +class ConformerConvModule(nn.Module): + r""" + Conformer convolution module starts with a pointwise convolution and a gated linear unit (GLU). + This is followed by a single 1-D depthwise convolution layer. 
Batchnorm is deployed just after the convolution + to aid training deep models. + + Args: + in_channels (int): Number of channels in the input + kernel_size (int or tuple, optional): Size of the convolving kernel Default: 31 + dropout_p (float, optional): probability of dropout + + Inputs: inputs + inputs (batch, time, dim): Tensor contains input sequences + + Outputs: outputs + outputs (batch, time, dim): Tensor produces by conformer convolution module. + """ + def __init__( + self, + in_channels: int, + kernel_size: int = 31, + expansion_factor: int = 2, + dropout_p: float = 0.1, + ) -> None: + super(ConformerConvModule, self).__init__() + assert (kernel_size - 1) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding" + assert expansion_factor == 2, "Currently, Only Supports expansion_factor 2" + + self.sequential = nn.Sequential( + nn.LayerNorm(in_channels), + Transpose(shape=(1, 2)), + PointwiseConv1d(in_channels, in_channels * expansion_factor, stride=1, padding=0, bias=True), + GLU(dim=1), + DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2), + nn.BatchNorm1d(in_channels), + Swish(), + PointwiseConv1d(in_channels, in_channels, stride=1, padding=0, bias=True), + nn.Dropout(p=dropout_p), + ) + + def forward(self, inputs: Tensor) -> Tensor: + r""" + Forward propagate of conformer's convolution module. + + Inputs: inputs + inputs (batch, time, dim): Tensor contains input sequences + + Outputs: outputs + outputs (batch, time, dim): Tensor produces by conformer convolution module. + """ + return self.sequential(inputs).transpose(1, 2) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_feed_forward_module.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_feed_forward_module.py new file mode 100644 index 000000000..d4382fdd6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conformer_feed_forward_module.py @@ -0,0 +1,73 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from openspeech.modules.swish import Swish +from openspeech.modules.wrapper import Linear + + +class FeedForwardModule(nn.Module): + r""" + Conformer Feed Forward Module follow pre-norm residual units and apply layer normalization within the residual unit + and on the input before the first linear layer. 
This module also apply Swish activation and dropout, which helps + regularizing the network. + + Args: + encoder_dim (int): Dimension of conformer encoders + expansion_factor (int): Expansion factor of feed forward module. + dropout_p (float): Ratio of dropout + + Inputs: inputs + - **inputs** (batch, time, dim): Tensor contains input sequences + + Outputs: outputs + - **outputs** (batch, time, dim): Tensor produces by feed forward module. + """ + def __init__( + self, + encoder_dim: int = 512, + expansion_factor: int = 4, + dropout_p: float = 0.1, + ) -> None: + super(FeedForwardModule, self).__init__() + self.sequential = nn.Sequential( + nn.LayerNorm(encoder_dim), + Linear(encoder_dim, encoder_dim * expansion_factor, bias=True), + Swish(), + nn.Dropout(p=dropout_p), + Linear(encoder_dim * expansion_factor, encoder_dim, bias=True), + nn.Dropout(p=dropout_p), + ) + + def forward(self, inputs: Tensor) -> Tensor: + r""" + Forward propagate of conformer's feed-forward module. + + Inputs: inputs + - **inputs** (batch, time, dim): Tensor contains input sequences + + Outputs: outputs + - **outputs** (batch, time, dim): Tensor produces by feed forward module. + """ + return self.sequential(inputs) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_block.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_block.py new file mode 100644 index 000000000..934e03b7d --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_block.py @@ -0,0 +1,199 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from typing import Tuple +from torch import Tensor +from openspeech.modules.contextnet_module import ContextNetConvModule, ContextNetSEModule +from openspeech.modules.swish import Swish +import torch.nn as nn + + +class ContextNetBlock(nn.Module): + r""" + Convolution block contains a number of convolutions, each followed by batch normalization and activation. + Squeeze-and-excitation (SE) block operates on the output of the last convolution layer. + Skip connection with projection is applied on the output of the squeeze-and-excitation block. 
+ + Args: + in_channels (int): Input channel in convolutional layer + out_channels (int): Output channel in convolutional layer + num_layers (int, optional): The number of convolutional layers (default : 5) + kernel_size (int, optional): Value of convolution kernel size (default : 5) + stride(int, optional): Value of stride (default : 1) + padding (int, optional): Value of padding (default: 0) + residual (bool, optional): Flag indication residual or not (default : True) + + Inputs: inputs, input_lengths + - **inputs**: Input of convolution block `FloatTensor` of size ``(batch, dimension, seq_length)`` + - **input_lengths**: The length of input tensor. ``(batch)`` + + Returns: output, output_lengths + - **output**: Output of convolution block `FloatTensor` of size + ``(batch, dimension, seq_length)`` + - **output_lengths**: The length of output tensor. ``(batch)`` + """ + def __init__( + self, + in_channels: int, + out_channels: int, + num_layers: int = 5, + kernel_size: int = 5, + stride: int = 1, + padding: int = 0, + residual: bool = True, + ) -> None: + super(ContextNetBlock, self).__init__() + self.num_layers = num_layers + self.swish = Swish() + self.se_layer = ContextNetSEModule(out_channels) + self.residual = None + + if residual: + self.residual = ContextNetConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + activation=False, + ) + + if self.num_layers == 1: + self.conv_layers = ContextNetConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ) + + else: + stride_list = [1 for _ in range(num_layers - 1)] + [stride] + in_channel_list = [in_channels] + [out_channels for _ in range(num_layers - 1)] + + self.conv_layers = nn.ModuleList(list()) + for in_channels, stride in zip(in_channel_list, stride_list): + self.conv_layers.append( + ContextNetConvModule( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ) + ) + + def forward( + self, + inputs: Tensor, + input_lengths: Tensor, + ) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate a `inputs` for convolution block. + + Args: + **inputs** (torch.FloatTensor): Input of convolution block `FloatTensor` of size + ``(batch, dimension, seq_length)`` + **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + **output** (torch.FloatTensor): Output of convolution block `FloatTensor` of size + ``(batch, dimension, seq_length)`` + **output_lengths** (torch.LongTensor): The length of output tensor. ``(batch)`` + """ + output = inputs + output_lengths = input_lengths + + if self.num_layers == 1: + output, output_lengths = self.conv_layers(output, output_lengths) + else: + for conv_layer in self.conv_layers: + output, output_lengths = conv_layer(output, output_lengths) + + output = self.se_layer(output, output_lengths) + + if self.residual is not None: + residual, _ = self.residual(inputs, input_lengths) + output += residual + + return self.swish(output), output_lengths + + @staticmethod + def make_conv_blocks( + input_dim: int = 80, + num_layers: int = 5, + kernel_size: int = 5, + num_channels: int = 256, + output_dim: int = 640, + ) -> nn.ModuleList: + r""" + Create 23 convolution blocks. 
+ + Args: + input_dim (int, optional): Dimension of input vector (default : 80) + num_layers (int, optional): The number of convolutional layers (default : 5) + kernel_size (int, optional): Value of convolution kernel size (default : 5) + num_channels (int, optional): The number of channels in the convolution filter (default: 256) + output_dim (int, optional): Dimension of encoder output vector (default: 640) + + Returns: + **conv_blocks** (nn.ModuleList): ModuleList with 23 convolution blocks + """ + conv_blocks = nn.ModuleList() + + # C0 : 1 conv layer, init_dim output channels, stride 1, no residual + conv_blocks.append(ContextNetBlock(input_dim, num_channels, 1, kernel_size, 1, 0, False)) + + # C1-2 : 5 conv layers, init_dim output channels, stride 1 + for _ in range(1, 2 + 1): + conv_blocks.append(ContextNetBlock(num_channels, num_channels, num_layers, kernel_size, 1, 0, True)) + + # C3 : 5 conv layer, init_dim output channels, stride 2 + conv_blocks.append(ContextNetBlock(num_channels, num_channels, num_layers, kernel_size, 2, 0, True)) + + # C4-6 : 5 conv layers, init_dim output channels, stride 1 + for _ in range(4, 6 + 1): + conv_blocks.append(ContextNetBlock(num_channels, num_channels, num_layers, kernel_size, 1, 0, True)) + + # C7 : 5 conv layers, init_dim output channels, stride 2 + conv_blocks.append(ContextNetBlock(num_channels, num_channels, num_layers, kernel_size, 2, 0, True)) + + # C8-10 : 5 conv layers, init_dim output channels, stride 1 + for _ in range(8, 10 + 1): + conv_blocks.append(ContextNetBlock(num_channels, num_channels, num_layers, kernel_size, 1, 0, True)) + + # C11-13 : 5 conv layers, middle_dim output channels, stride 1 + conv_blocks.append(ContextNetBlock(num_channels, num_channels << 1, num_layers, kernel_size, 1, 0, True)) + for _ in range(12, 13 + 1): + conv_blocks.append(ContextNetBlock(num_channels << 1, num_channels << 1, num_layers, kernel_size, 1, 0, True)) + + # C14 : 5 conv layers, middle_dim output channels, stride 2 + conv_blocks.append(ContextNetBlock(num_channels << 1, num_channels << 1, num_layers, kernel_size, 2, 0, True)) + + # C15-21 : 5 conv layers, middle_dim output channels, stride 1 + for i in range(15, 21 + 1): + conv_blocks.append(ContextNetBlock(num_channels << 1, num_channels << 1, num_layers, kernel_size, 1, 0, True)) + + # C22 : 1 conv layer, final_dim output channels, stride 1, no residual + conv_blocks.append(ContextNetBlock(num_channels << 1, output_dim, 1, kernel_size, 1, 0, False)) + + return conv_blocks \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_module.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_module.py new file mode 100644 index 000000000..0540dbb66 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/contextnet_module.py @@ -0,0 +1,182 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of 
the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from torch import Tensor +from typing import Tuple +from openspeech.modules.swish import Swish +import torch.nn as nn + + +class ContextNetSEModule(nn.Module): + r""" + Squeeze-and-excitation module. + + Args: + dim (int): Dimension to be used for two fully connected (FC) layers + + Inputs: inputs, input_lengths + - **inputs**: The output of the last convolution layer. `FloatTensor` of size + ``(batch, dimension, seq_length)`` + - **input_lengths**: The length of input tensor. ``(batch)`` + + Returns: output + - **output**: Output of SELayer `FloatTensor` of size + ``(batch, dimension, seq_length)`` + """ + def __init__(self, dim: int) -> None: + super(ContextNetSEModule, self).__init__() + assert dim % 8 == 0, 'Dimension should be divisible by 8.' + + self.dim = dim + self.sequential = nn.Sequential( + nn.Linear(dim, dim // 8), + Swish(), + nn.Linear(dim // 8, dim), + ) + + def forward( + self, + inputs: Tensor, + input_lengths: Tensor, + ) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate a `inputs` for SE Layer. + + Args: + **inputs** (torch.FloatTensor): The output of the last convolution layer. `FloatTensor` of size + ``(batch, dimension, seq_length)`` + **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + **output** (torch.FloatTensor): Output of SELayer `FloatTensor` of size + ``(batch, dimension, seq_length)`` + """ + residual = inputs + seq_lengths = inputs.size(2) + + inputs = inputs.sum(dim=2) / input_lengths.unsqueeze(1) + output = self.sequential(inputs) + + output = output.sigmoid().unsqueeze(2) + output = output.repeat(1, 1, seq_lengths) + + return output * residual + + +class ContextNetConvModule(nn.Module): + r""" + When the stride is 1, it pads the input so the output has the shape as the input. + And when the stride is 2, it does not pad the input. + + Args: + in_channels (int): Input channel in convolutional layer + out_channels (int): Output channel in convolutional layer + kernel_size (int, optional): Value of convolution kernel size (default : 5) + stride(int, optional): Value of stride (default : 1) + padding (int, optional): Value of padding (default: 0) + activation (bool, optional): Flag indication use activation function or not (default : True) + groups(int, optional): Value of groups (default : 1) + bias (bool, optional): Flag indication use bias or not (default : True) + + Inputs: inputs, input_lengths + - **inputs**: Input of convolution layer `FloatTensor` of size ``(batch, dimension, seq_length)`` + - **input_lengths**: The length of input tensor. ``(batch)`` + + Returns: output, output_lengths + - **output**: Output of convolution layer `FloatTensor` of size + ``(batch, dimension, seq_length)`` + - **output_lengths**: The length of output tensor. 
``(batch)`` + """ + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 5, + stride: int = 1, + padding: int = 0, + activation: bool = True, + groups: int = 1, + bias: bool = True, + ): + super(ContextNetConvModule, self).__init__() + assert kernel_size == 5, "The convolution layer in the ContextNet model has 5 kernels." + + if stride == 1: + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=1, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=bias, + ) + elif stride == 2: + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=1, + padding=padding, + groups=groups, + bias=bias, + ) + + self.batch_norm = nn.BatchNorm1d(num_features=out_channels) + self.activation = activation + + if self.activation: + self.swish = Swish() + + def forward( + self, + inputs: Tensor, + input_lengths: Tensor, + ) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate a `inputs` for convolution layer. + + Args: + **inputs** (torch.FloatTensor): Input of convolution layer `FloatTensor` of size + ``(batch, dimension, seq_length)`` + **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + **output** (torch.FloatTensor): Output of convolution layer `FloatTensor` of size + ``(batch, dimension, seq_length)`` + **output_lengths** (torch.LongTensor): The length of output tensor. ``(batch)`` + """ + outputs, output_lengths = self.conv(inputs), self._get_sequence_lengths(input_lengths) + outputs = self.batch_norm(outputs) + + if self.activation: + outputs = self.swish(outputs) + + return outputs, output_lengths + + def _get_sequence_lengths(self, seq_lengths): + return ( + (seq_lengths + 2 * self.conv.padding[0] + - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1) // self.conv.stride[0] + 1 + ) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_extractor.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_extractor.py new file mode 100644 index 000000000..fa604998c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_extractor.py @@ -0,0 +1,107 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
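+
+# Editor's note: an illustrative sketch (not upstream code) of the output-length
+# arithmetic shared by the ContextNet convolution module above and the Conv2dExtractor
+# defined below. Both apply the standard convolution length formula
+#
+#     L_out = floor((L_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride) + 1
+#
+# For example, with the kernel_size=3, stride=2, padding=0 setting used by Conv2dSubsampling:
+#
+#     def conv_out_len(length, kernel_size, stride, padding=0, dilation=1):
+#         return (length + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1
+#
+#     conv_out_len(100, 3, 2)                      # -> 49 after the first convolution
+#     conv_out_len(conv_out_len(100, 3, 2), 3, 2)  # -> 24, roughly 1/4 of the input length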
+ +import math +import torch +import torch.nn as nn +from torch import Tensor +from typing import Tuple + +from openspeech.modules.swish import Swish +from openspeech.utils import get_class_name + + +class Conv2dExtractor(nn.Module): + r""" + Provides inteface of convolutional extractor. + + Note: + Do not use this class directly, use one of the sub classes. + Define the 'self.conv' class variable. + + Inputs: inputs, input_lengths + - **inputs** (batch, time, dim): Tensor containing input vectors + - **input_lengths**: Tensor containing containing sequence lengths + + Returns: outputs, output_lengths + - **outputs**: Tensor produced by the convolution + - **output_lengths**: Tensor containing sequence lengths produced by the convolution + """ + supported_activations = { + 'hardtanh': nn.Hardtanh(0, 20, inplace=True), + 'relu': nn.ReLU(inplace=True), + 'elu': nn.ELU(inplace=True), + 'leaky_relu': nn.LeakyReLU(inplace=True), + 'gelu': nn.GELU(), + 'swish': Swish(), + } + + def __init__(self, input_dim: int, activation: str = 'hardtanh') -> None: + super(Conv2dExtractor, self).__init__() + self.input_dim = input_dim + self.activation = Conv2dExtractor.supported_activations[activation] + self.conv = None + + def get_output_lengths(self, seq_lengths: torch.Tensor): + assert self.conv is not None, "self.conv should be defined" + + for module in self.conv: + if isinstance(module, nn.Conv2d): + numerator = seq_lengths + 2 * module.padding[1] - module.dilation[1] * (module.kernel_size[1] - 1) - 1 + seq_lengths = numerator.float() / float(module.stride[1]) + seq_lengths = seq_lengths.int() + 1 + + elif isinstance(module, nn.MaxPool2d): + seq_lengths >>= 1 + + return seq_lengths.int() + + def get_output_dim(self): + if get_class_name(self) == "VGGExtractor": + output_dim = (self.input_dim - 1) << 5 if self.input_dim % 2 else self.input_dim << 5 + + elif get_class_name(self) == "DeepSpeech2Extractor": + output_dim = int(math.floor(self.input_dim + 2 * 20 - 41) / 2 + 1) + output_dim = int(math.floor(output_dim + 2 * 10 - 21) / 2 + 1) + output_dim <<= 5 + + elif get_class_name(self) == "Conv2dSubsampling": + factor = ((self.input_dim - 1) // 2 - 1) // 2 + output_dim = self.out_channels * factor + + else: + raise ValueError(f"Unsupported Extractor : {self.extractor}") + + return output_dim + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor]: + r""" + inputs: torch.FloatTensor (batch, time, dimension) + input_lengths: torch.IntTensor (batch) + """ + outputs, output_lengths = self.conv(inputs.unsqueeze(1).transpose(2, 3), input_lengths) + + batch_size, channels, dimension, seq_lengths = outputs.size() + outputs = outputs.permute(0, 3, 1, 2) + outputs = outputs.view(batch_size, seq_lengths, channels * dimension) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_subsampling.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_subsampling.py new file mode 100644 index 000000000..b5eac267f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv2d_subsampling.py @@ -0,0 +1,71 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.modules import Conv2dExtractor + + +class Conv2dSubsampling(Conv2dExtractor): + r""" + Convolutional 2D subsampling (to 1/4 length) + + Args: + input_dim (int): Dimension of input vector + in_channels (int): Number of channels in the input vector + out_channels (int): Number of channels produced by the convolution + activation (str): Activation function + + Inputs: inputs + - **inputs** (batch, time, dim): Tensor containing sequence of inputs + - **input_lengths** (batch): list of sequence input lengths + + Returns: outputs, output_lengths + - **outputs** (batch, time, dim): Tensor produced by the convolution + - **output_lengths** (batch): list of sequence output lengths + """ + def __init__( + self, + input_dim: int, + in_channels: int, + out_channels: int, + activation: str = 'relu', + ) -> None: + super(Conv2dSubsampling, self).__init__(input_dim, activation) + self.in_channels = in_channels + self.out_channels = out_channels + from openspeech.modules import MaskConv2d + self.conv = MaskConv2d( + nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2), + self.activation, + nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=2), + self.activation, + ) + ) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + outputs, output_lengths = super().forward(inputs, input_lengths) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_base.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_base.py new file mode 100644 index 000000000..2747e97d6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_base.py @@ -0,0 +1,38 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
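To see what the Conv2dSubsampling above does to tensor shapes, here is a rough sketch, assuming the openspeech package added by this patch is importable; the batch size, lengths, and feature dimensions are made up. The two stride-2 convolutions roughly quarter the time axis, and get_output_dim() works out to out_channels * (((input_dim - 1) // 2 - 1) // 2).

import torch
from openspeech.modules.conv2d_subsampling import Conv2dSubsampling

subsample = Conv2dSubsampling(input_dim=80, in_channels=1, out_channels=512)
inputs = torch.randn(4, 128, 80)                     # (batch, time, dim)
input_lengths = torch.tensor([128, 100, 90, 64])
outputs, output_lengths = subsample(inputs, input_lengths)
# Time 128 -> 31, feature dim 512 * (((80 - 1) // 2 - 1) // 2) = 512 * 19 = 9728
print(outputs.shape, subsample.get_output_dim())     # torch.Size([4, 31, 9728]) 9728
print(output_lengths)                                # tensor([31, 24, 21, 15], dtype=torch.int32)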
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn + + +class BaseConv1d(nn.Module): + """ Base convolution module. """ + def __init__(self): + super(BaseConv1d, self).__init__() + + def _get_sequence_lengths(self, seq_lengths): + return ( + (seq_lengths + 2 * self.conv.padding[0] + - self.conv.dilation[0] * (self.conv.kernel_size[0] - 1) - 1) // self.conv.stride[0] + 1 + ) + + def forward(self, *args, **kwargs): + raise NotImplementedError diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_group_shuffle.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_group_shuffle.py new file mode 100644 index 000000000..704bd6c9e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/conv_group_shuffle.py @@ -0,0 +1,42 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from torch import Tensor + + +class ConvGroupShuffle(nn.Module): + """ Convolution group shuffle module. 
""" + def __init__(self, groups, channels): + super(ConvGroupShuffle, self).__init__() + self.groups = groups + self.channels_per_group = channels // groups + + def forward(self, x: Tensor): + dim = x.size(-1) + + x = x.view(-1, self.groups, self.channels_per_group, dim) + x = torch.transpose(x, 1, 2).contiguous() + y = x.view(-1, self.groups * self.channels_per_group, dim) + + return y diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/deepspeech2_extractor.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/deepspeech2_extractor.py new file mode 100644 index 000000000..2fcbba414 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/deepspeech2_extractor.py @@ -0,0 +1,73 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.modules import Conv2dExtractor + + +class DeepSpeech2Extractor(Conv2dExtractor): + r""" + DeepSpeech2 extractor for automatic speech recognition described in + "Deep Speech 2: End-to-End Speech Recognition in English and Mandarin" paper + - https://arxiv.org/abs/1512.02595 + + Args: + input_dim (int): Dimension of input vector + in_channels (int): Number of channels in the input vector + out_channels (int): Number of channels produced by the convolution + activation (str): Activation function + + Inputs: inputs, input_lengths + - **inputs** (batch, time, dim): Tensor containing input vectors + - **input_lengths**: Tensor containing containing sequence lengths + + Returns: outputs, output_lengths + - **outputs**: Tensor produced by the convolution + - **output_lengths**: Tensor containing sequence lengths produced by the convolution + """ + def __init__( + self, + input_dim: int, + in_channels: int = 1, + out_channels: int = 32, + activation: str = 'hardtanh', + ) -> None: + super(DeepSpeech2Extractor, self).__init__(input_dim=input_dim, activation=activation) + self.in_channels = in_channels + self.out_channels = out_channels + from openspeech.modules import MaskConv2d + self.conv = MaskConv2d( + nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=(41, 11), stride=(2, 2), padding=(20, 5), bias=False), + nn.BatchNorm2d(out_channels), + self.activation, + nn.Conv2d(out_channels, out_channels, kernel_size=(21, 11), stride=(2, 1), padding=(10, 5), bias=False), + nn.BatchNorm2d(out_channels), + self.activation, + ) + ) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + return super().forward(inputs, input_lengths) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/depthwise_conv1d.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/depthwise_conv1d.py new file mode 100644 index 000000000..37c0651c8 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/depthwise_conv1d.py @@ -0,0 +1,74 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
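A shape sketch for the DeepSpeech2Extractor above (sizes are arbitrary; assumes the openspeech package from this patch is importable). The two strided convolutions halve the 80-dim feature axis twice, so each frame ends up as 32 channels * 20 features = 640 values, matching get_output_dim():

import torch
from openspeech.modules.deepspeech2_extractor import DeepSpeech2Extractor

extractor = DeepSpeech2Extractor(input_dim=80)
inputs = torch.randn(2, 300, 80)                  # (batch, time, dim)
input_lengths = torch.tensor([300, 256])
outputs, output_lengths = extractor(inputs, input_lengths)
# Frequency axis: 80 -> (80 + 2*20 - 41)//2 + 1 = 40 -> (40 + 2*10 - 21)//2 + 1 = 20
print(outputs.shape, extractor.get_output_dim())  # torch.Size([2, 150, 640]) 640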
+ +import torch.nn as nn +from torch import Tensor +from typing import Optional + +from openspeech.modules.conv_base import BaseConv1d + + +class DepthwiseConv1d(BaseConv1d): + r""" + When groups == in_channels and out_channels == K * in_channels, where K is a positive integer, + this operation is termed in literature as depthwise convolution. + + Args: + in_channels (int): Number of channels in the input + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 + bias (bool, optional): If True, adds a learnable bias to the output. Default: True + + Inputs: inputs + - **inputs** (batch, in_channels, time): Tensor containing input vector + + Returns: outputs + - **outputs** (batch, out_channels, time): Tensor produces by depthwise 1-D convolution. + """ + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + bias: bool = False, + ) -> None: + super(DepthwiseConv1d, self).__init__() + assert out_channels % in_channels == 0, "out_channels should be constant multiple of in_channels" + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=in_channels, + stride=stride, + padding=padding, + bias=bias, + ) + + def forward(self, inputs: Tensor, input_lengths: Optional[Tensor] = None) -> Tensor: + if input_lengths is None: + return self.conv(inputs) + else: + return self.conv(inputs), self._get_sequence_lengths(input_lengths) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/dot_product_attention.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/dot_product_attention.py new file mode 100644 index 000000000..831d209f0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/dot_product_attention.py @@ -0,0 +1,80 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
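A short sketch of the DepthwiseConv1d above (package assumed importable; sizes arbitrary): with groups == in_channels each channel gets its own kernel, so the weight tensor is (out_channels, 1, kernel_size) rather than (out_channels, in_channels, kernel_size).

import torch
from openspeech.modules.depthwise_conv1d import DepthwiseConv1d

depthwise = DepthwiseConv1d(in_channels=64, out_channels=64, kernel_size=31, padding=15)
x = torch.randn(2, 64, 100)                        # (batch, channels, time)
print(depthwise(x).shape, depthwise.conv.weight.shape)
# torch.Size([2, 64, 100]) torch.Size([64, 1, 31])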
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from torch import Tensor +from typing import Tuple, Optional + + +class DotProductAttention(nn.Module): + r""" + Scaled Dot-Product Attention proposed in "Attention Is All You Need" + Compute the dot products of the query with all keys, divide each by sqrt(dim), + and apply a softmax function to obtain the weights on the values + + Args: dim, mask + dim (int): dimension of attention + mask (torch.Tensor): tensor containing indices to be masked + + Inputs: query, key, value, mask + - **query** (batch, q_len, d_model): tensor containing projection vector for decoders. + - **key** (batch, k_len, d_model): tensor containing projection vector for encoders. + - **value** (batch, v_len, d_model): tensor containing features of the encoded input sequence. + - **mask** (-): tensor containing indices to be masked + + Returns: context, attn + - **context**: tensor containing the context vector from attention mechanism. + - **attn**: tensor containing the attention (alignment) from the encoders outputs. + """ + def __init__(self, dim: int, scale: bool = True) -> None: + super(DotProductAttention, self).__init__() + if scale: + self.sqrt_dim = np.sqrt(dim) + else: + self.sqrt_dim = 1 + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + mask: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor]: + if len(query.size()) == 3: + score = torch.bmm(query, key.transpose(1, 2)) / self.sqrt_dim + else: + score = torch.matmul(query, key.transpose(2, 3)) / self.sqrt_dim + + if mask is not None: + score.masked_fill_(mask, -1e4) + + attn = F.softmax(score, -1) + + if len(query.size()) == 3: + context = torch.bmm(attn, value) + else: + context = torch.matmul(attn, value) + + return context, attn diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/glu.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/glu.py new file mode 100644 index 000000000..3e670d6f2 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/glu.py @@ -0,0 +1,38 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
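A minimal usage sketch for the DotProductAttention above (package assumed importable; sizes arbitrary), exercising the batched 3-D branch:

import torch
from openspeech.modules.dot_product_attention import DotProductAttention

attention = DotProductAttention(dim=64)
query = torch.randn(2, 10, 64)     # (batch, q_len, dim)
key = torch.randn(2, 20, 64)       # (batch, k_len, dim)
value = torch.randn(2, 20, 64)     # (batch, v_len, dim)
context, attn = attention(query, key, value)
print(context.shape, attn.shape)   # torch.Size([2, 10, 64]) torch.Size([2, 10, 20])
print(attn[0, 0].sum())            # each attention row sums to 1 after the softmax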
+ +import torch.nn as nn +from torch import Tensor + + +class GLU(nn.Module): + r""" + The gating mechanism is called Gated Linear Units (GLU), which was first introduced for natural language processing + in the paper “Language Modeling with Gated Convolutional Networks” + """ + def __init__(self, dim: int) -> None: + super(GLU, self).__init__() + self.dim = dim + + def forward(self, inputs: Tensor) -> Tensor: + outputs, gate = inputs.chunk(2, dim=self.dim) + return outputs * gate.sigmoid() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_block.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_block.py new file mode 100644 index 000000000..26fc7fedf --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_block.py @@ -0,0 +1,109 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor +from typing import Tuple + +from openspeech.modules.jasper_subblock import JasperSubBlock + + +class JasperBlock(nn.Module): + r""" + Jasper Block: The Jasper Block consists of R Jasper sub-block. + + Args: + num_sub_blocks (int): number of sub block + in_channels (int): number of channels in the input feature + out_channels (int): number of channels produced by the convolution + kernel_size (int): size of the convolving kernel + stride (int): stride of the convolution. (default: 1) + dilation (int): spacing between kernel elements. (default: 1) + bias (bool): if True, adds a learnable bias to the output. 
(default: True) + dropout_p (float): probability of dropout + activation (str): activation function + + Inputs: inputs, input_lengths, residual + - **inputs**: tensor contains input sequence vector + - **input_lengths**: tensor contains sequence lengths + - **residual**: tensor contains residual vector + + Returns: output, output_lengths + (torch.FloatTensor, torch.LongTensor) + + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + def __init__( + self, + num_sub_blocks: int, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + bias: bool = True, + dropout_p: float = 0.2, + activation: str = 'relu', + ) -> None: + super(JasperBlock, self).__init__() + padding = self._get_same_padding(kernel_size, stride, dilation) + self.layers = nn.ModuleList([ + JasperSubBlock( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding, + bias=bias, + dropout_p=dropout_p, + activation=activation, + ) for i in range(num_sub_blocks) + ]) + + def _get_same_padding(self, kernel_size: int, stride: int, dilation: int): + if stride > 1 and dilation > 1: + raise ValueError("Only stride OR dilation may be greater than 1") + return (kernel_size // 2) * dilation + + def forward(self, inputs: Tensor, input_lengths: Tensor, residual: Tensor) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate of jasper block. + + Inputs: inputs, input_lengths, residual + - **inputs**: tensor contains input sequence vector + - **input_lengths**: tensor contains sequence lengths + - **residual**: tensor contains residual vector + + Returns: output, output_lengths + (torch.FloatTensor, torch.LongTensor) + + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + for layer in self.layers[:-1]: + inputs, input_lengths = layer(inputs, input_lengths) + + outputs, output_lengths = self.layers[-1](inputs, input_lengths, residual) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_subblock.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_subblock.py new file mode 100644 index 000000000..ae551d609 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/jasper_subblock.py @@ -0,0 +1,115 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
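The `_get_same_padding` helper in the JasperBlock above keeps the time axis unchanged for stride-1 convolutions; a quick check in plain PyTorch (the kernel sizes below are arbitrary):

import torch
import torch.nn as nn

# padding = (kernel_size // 2) * dilation, as in _get_same_padding above
for kernel_size, dilation in [(11, 1), (21, 1), (7, 2)]:
    padding = (kernel_size // 2) * dilation
    conv = nn.Conv1d(16, 16, kernel_size, stride=1, dilation=dilation, padding=padding)
    print(conv(torch.randn(1, 16, 200)).shape)   # time stays 200 in all three cases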
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from typing import Tuple, Optional +from torch import Tensor + +from openspeech.modules.mask_conv1d import MaskConv1d + + +class JasperSubBlock(nn.Module): + r""" + Jasper sub-block applies the following operations: a 1D-convolution, batch norm, ReLU, and dropout. + + Args: + in_channels (int): number of channels in the input feature + out_channels (int): number of channels produced by the convolution + kernel_size (int): size of the convolving kernel + stride (int): stride of the convolution. (default: 1) + dilation (int): spacing between kernel elements. (default: 1) + padding (int): zero-padding added to both sides of the input. (default: 0) + bias (bool): if True, adds a learnable bias to the output. (default: False) + dropout_p (float): probability of dropout + activation (str): activation function + + Inputs: inputs, input_lengths, residual + - **inputs**: tensor contains input sequence vector + - **input_lengths**: tensor contains sequence lengths + - **residual**: tensor contains residual vector + + Returns: output, output_lengths + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + supported_activations = { + 'hardtanh': nn.Hardtanh(0, 20, inplace=True), + 'relu': nn.ReLU(inplace=True), + 'elu': nn.ELU(inplace=True), + 'leaky_relu': nn.LeakyReLU(inplace=True), + 'gelu': nn.GELU(), + } + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + padding: int = 0, + bias: bool = False, + dropout_p: float = 0.2, + activation: str = 'relu', + ) -> None: + super(JasperSubBlock, self).__init__() + + self.conv = MaskConv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + dilation=dilation, + ) + self.batch_norm = nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1) + self.activation = self.supported_activations[activation] + self.dropout = nn.Dropout(p=dropout_p) + + def forward( + self, + inputs: Tensor, + input_lengths: Tensor, + residual: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor]: + r""" + Forward propagate of conformer's subblock. 
+ + Inputs: inputs, input_lengths, residual + - **inputs**: tensor contains input sequence vector + - **input_lengths**: tensor contains sequence lengths + - **residual**: tensor contains residual vector + + Returns: output, output_lengths + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + outputs, output_lengths = self.conv(inputs, input_lengths) + outputs = self.batch_norm(outputs) + + if residual is not None: + outputs += residual + + outputs = self.dropout(self.activation(outputs)) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/location_aware_attention.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/location_aware_attention.py new file mode 100644 index 000000000..dffd00cfe --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/location_aware_attention.py @@ -0,0 +1,92 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from typing import Tuple + +from openspeech.modules.wrapper import Linear + + +class LocationAwareAttention(nn.Module): + r""" + Applies a location-aware attention mechanism on the output features from the decoders. + Location-aware attention proposed in "Attention-Based Models for Speech Recognition" paper. + The location-aware attention mechanism is performing well in speech recognition tasks. + We refer to implementation of ClovaCall Attention style. + + Args: + dim (int): dimension of model + attn_dim (int): dimension of attention + smoothing (bool): flag indication whether to use smoothing or not. + + Inputs: query, value, last_attn + - **query** (batch, q_len, hidden_dim): tensor containing the output features from the decoders. + - **value** (batch, v_len, hidden_dim): tensor containing features of the encoded input sequence. + - **last_attn** (batch_size, v_len): tensor containing previous timestep`s attention (alignment) + + Returns: output, attn + - **output** (batch, output_len, dimensions): tensor containing the feature from encoders outputs + - **attn** (batch * num_heads, v_len): tensor containing the attention (alignment) from the encoders outputs. + + Reference: + Jan Chorowski et al.: Attention-Based Models for Speech Recognition. 
+ https://arxiv.org/abs/1506.07503 + """ + def __init__(self, dim: int = 1024, attn_dim: int = 1024, smoothing: bool = False) -> None: + super(LocationAwareAttention, self).__init__() + self.location_conv = nn.Conv1d(in_channels=1, out_channels=attn_dim, kernel_size=3, padding=1) + self.query_proj = Linear(dim, attn_dim, bias=False) + self.value_proj = Linear(dim, attn_dim, bias=False) + self.bias = nn.Parameter(torch.rand(attn_dim).uniform_(-0.1, 0.1)) + self.fc = Linear(attn_dim, 1, bias=True) + self.smoothing = smoothing + + def forward(self, query: Tensor, value: Tensor, last_alignment_energy: Tensor) -> Tuple[Tensor, Tensor]: + batch_size, hidden_dim, seq_length = query.size(0), query.size(2), value.size(1) + + if last_alignment_energy is None: + last_alignment_energy = value.new_zeros(batch_size, seq_length) + + last_alignment_energy = self.location_conv(last_alignment_energy.unsqueeze(dim=1)) + last_alignment_energy = last_alignment_energy.transpose(1, 2) + + alignmment_energy = self.fc(torch.tanh( + self.query_proj(query) + + self.value_proj(value) + + last_alignment_energy + + self.bias + )).squeeze(dim=-1) + + if self.smoothing: + alignmment_energy = torch.sigmoid(alignmment_energy) + alignmment_energy = torch.div(alignmment_energy, alignmment_energy.sum(dim=-1).unsqueeze(dim=-1)) + + else: + alignmment_energy = F.softmax(alignmment_energy, dim=-1) + + context = torch.bmm(alignmment_energy.unsqueeze(dim=1), value) + + return context, alignmment_energy + diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask.py new file mode 100644 index 000000000..00e1268fa --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask.py @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
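A rough usage sketch for the LocationAwareAttention above (package assumed importable; sizes arbitrary). On the first decoder step the previous alignment is None, so the module starts from an all-zero alignment:

import torch
from openspeech.modules.location_aware_attention import LocationAwareAttention

attention = LocationAwareAttention(dim=512, attn_dim=512, smoothing=False)
query = torch.randn(4, 1, 512)           # (batch, 1, hidden_dim): one decoder step
encoder_outputs = torch.randn(4, 120, 512)
context, alignment = attention(query, encoder_outputs, None)
print(context.shape, alignment.shape)    # torch.Size([4, 1, 512]) torch.Size([4, 120])
# `alignment` would be fed back as last_alignment_energy on the next decoding step.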
+ +import torch +from torch import Tensor + + +def get_attn_pad_mask(inputs, input_lengths, expand_length): + """ mask position is set to 1 """ + + def get_transformer_non_pad_mask(inputs: Tensor, input_lengths: Tensor) -> Tensor: + """ Padding position is set to 0, either use input_lengths or pad_id """ + batch_size = inputs.size(0) + + if len(inputs.size()) == 2: + non_pad_mask = inputs.new_ones(inputs.size()) # B x T + elif len(inputs.size()) == 3: + non_pad_mask = inputs.new_ones(inputs.size()[:-1]) # B x T + else: + raise ValueError(f"Unsupported input shape {inputs.size()}") + + for i in range(batch_size): + non_pad_mask[i, input_lengths[i]:] = 0 + + return non_pad_mask + + non_pad_mask = get_transformer_non_pad_mask(inputs, input_lengths) + pad_mask = non_pad_mask.lt(1) + attn_pad_mask = pad_mask.unsqueeze(1).expand(-1, expand_length, -1) + return attn_pad_mask + + +def get_attn_subsequent_mask(seq): + assert seq.dim() == 2 + attn_shape = [seq.size(0), seq.size(1), seq.size(1)] + subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1) + + if seq.is_cuda: + subsequent_mask = subsequent_mask.cuda() + + return subsequent_mask diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv1d.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv1d.py new file mode 100644 index 000000000..dad6c35fa --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv1d.py @@ -0,0 +1,89 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from torch import Tensor +from typing import Tuple + + +class MaskConv1d(nn.Conv1d): + r""" + 1D convolution with masking + + Args: + in_channels (int): Number of channels in the input vector + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int): Stride of the convolution. Default: 1 + padding (int): Zero-padding added to both sides of the input. Default: 0 + dilation (int): Spacing between kernel elements. Default: 1 + groups (int): Number of blocked connections from input channels to output channels. Default: 1 + bias (bool): If True, adds a learnable bias to the output. 
Default: True + + Inputs: inputs, seq_lengths + - **inputs** (torch.FloatTensor): The input of size (batch, dimension, time) + - **seq_lengths** (torch.IntTensor): The actual length of each sequence in the batch + + Returns: output, seq_lengths + - **output**: Masked output from the conv1d + - **seq_lengths**: Sequence length of output from the conv1d + """ + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + groups: int = 1, + bias: bool = False, + ) -> None: + super(MaskConv1d, self).__init__(in_channels=in_channels, out_channels=out_channels, + kernel_size=kernel_size, stride=stride, padding=padding, + dilation=dilation, groups=groups, bias=bias) + + def _get_sequence_lengths(self, seq_lengths): + return ( + (seq_lengths + 2 * self.padding[0] - self.dilation[0] * (self.kernel_size[0] - 1) - 1) // self.stride[0] + 1 + ) + + def forward(self, inputs: Tensor, input_lengths: Tensor) -> Tuple[Tensor, Tensor]: + r""" + inputs: (batch, dimension, time) + input_lengths: (batch) + """ + max_length = inputs.size(2) + + indices = torch.arange(max_length).to(input_lengths.dtype).to(input_lengths.device) + indices = indices.expand(len(input_lengths), max_length) + + mask = indices >= input_lengths.unsqueeze(1) + inputs = inputs.masked_fill(mask.unsqueeze(1).to(device=inputs.device), 0) + + output_lengths = self._get_sequence_lengths(input_lengths) + output = super(MaskConv1d, self).forward(inputs) + + del mask, indices + + return output, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv2d.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv2d.py new file mode 100644 index 000000000..6bbdad6c6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/mask_conv2d.py @@ -0,0 +1,98 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from torch import Tensor +from typing import Tuple + + +class MaskConv2d(nn.Module): + r""" + Masking Convolutional Neural Network + + Adds padding to the output of the module based on the given lengths. + This is to ensure that the results of the model do not change when batch sizes change during inference. 
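A small demonstration of the MaskConv1d above (package assumed importable): frames past each sequence's true length are zeroed before the convolution, and the returned lengths follow the usual Conv1d formula (with kernel 3 and padding 1 they are unchanged):

import torch
from openspeech.modules.mask_conv1d import MaskConv1d

conv = MaskConv1d(in_channels=8, out_channels=8, kernel_size=3, padding=1)
inputs = torch.ones(2, 8, 10)                  # (batch, dimension, time)
input_lengths = torch.tensor([10, 6])
outputs, output_lengths = conv(inputs, input_lengths)
print(output_lengths)                          # tensor([10, 6])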
+ Input needs to be in the shape of (batch_size, channel, hidden_dim, seq_len) + + Refer to https://github.com/SeanNaren/deepspeech.pytorch/blob/master/model.py + Copyright (c) 2017 Sean Naren + MIT License + + Args: + sequential (torch.nn): sequential list of convolution layer + + Inputs: inputs, seq_lengths + - **inputs** (torch.FloatTensor): The input of size BxCxHxT + - **seq_lengths** (torch.IntTensor): The actual length of each sequence in the batch + + Returns: output, seq_lengths + - **output**: Masked output from the sequential + - **seq_lengths**: Sequence length of output from the sequential + """ + def __init__(self, sequential: nn.Sequential) -> None: + super(MaskConv2d, self).__init__() + self.sequential = sequential + + def forward(self, inputs: Tensor, seq_lengths: Tensor) -> Tuple[Tensor, Tensor]: + output = None + + for module in self.sequential: + output = module(inputs) + mask = torch.BoolTensor(output.size()).fill_(0) + + if output.is_cuda: + mask = mask.cuda() + + seq_lengths = self._get_sequence_lengths(module, seq_lengths) + + for idx, length in enumerate(seq_lengths): + length = length.item() + + if (mask[idx].size(2) - length) > 0: + mask[idx].narrow(dim=2, start=length, length=mask[idx].size(2) - length).fill_(1) + + output = output.masked_fill(mask, 0) + inputs = output + + return output, seq_lengths + + def _get_sequence_lengths(self, module: nn.Module, seq_lengths: Tensor) -> Tensor: + r""" + Calculate convolutional neural network receptive formula + + Args: + module (torch.nn.Module): module of CNN + seq_lengths (torch.IntTensor): The actual length of each sequence in the batch + + Returns: seq_lengths + - **seq_lengths**: Sequence length of output from the module + """ + if isinstance(module, nn.Conv2d): + numerator = seq_lengths + 2 * module.padding[1] - module.dilation[1] * (module.kernel_size[1] - 1) - 1 + seq_lengths = numerator.float() / float(module.stride[1]) + seq_lengths = seq_lengths.int() + 1 + + elif isinstance(module, nn.MaxPool2d): + seq_lengths >>= 1 + + return seq_lengths.int() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/multi_head_attention.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/multi_head_attention.py new file mode 100644 index 000000000..ca84ef505 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/multi_head_attention.py @@ -0,0 +1,89 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
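The get_attn_pad_mask helper from mask.py above marks padded positions so attention can ignore them; a small sketch (package assumed importable; sizes arbitrary):

import torch
from openspeech.modules.mask import get_attn_pad_mask

inputs = torch.randn(2, 5, 16)                  # (batch, time, dim)
input_lengths = torch.tensor([5, 3])
pad_mask = get_attn_pad_mask(inputs, input_lengths, expand_length=5)
print(pad_mask.shape)                           # torch.Size([2, 5, 5])
print(pad_mask[1, 0])                           # tensor([False, False, False,  True,  True])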
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor +from typing import Tuple, Optional + +from openspeech.modules.wrapper import Linear +from openspeech.modules.dot_product_attention import DotProductAttention + + +class MultiHeadAttention(nn.Module): + r""" + Multi-Head Attention proposed in "Attention Is All You Need" + Instead of performing a single attention function with d_model-dimensional keys, values, and queries, + project the queries, keys and values h times with different, learned linear projections to d_head dimensions. + These are concatenated and once again projected, resulting in the final values. + Multi-head attention allows the model to jointly attend to information from different representation + subspaces at different positions. + + MultiHead(Q, K, V) = Concat(head_1, ..., head_h) · W_o + where head_i = Attention(Q · W_q, K · W_k, V · W_v) + + Args: + dim (int): The dimension of model (default: 512) + num_heads (int): The number of attention heads. (default: 8) + + Inputs: query, key, value, mask + - **query** (batch, q_len, d_model): tensor containing projection vector for decoders. + - **key** (batch, k_len, d_model): tensor containing projection vector for encoders. + - **value** (batch, v_len, d_model): tensor containing features of the encoded input sequence. + - **mask** (-): tensor containing indices to be masked + + Returns: output, attn + - **output** (batch, output_len, dimensions): tensor containing the attended output features. + - **attn** (batch * num_heads, v_len): tensor containing the attention (alignment) from the encoders outputs. + """ + def __init__(self, dim: int = 512, num_heads: int = 8) -> None: + super(MultiHeadAttention, self).__init__() + + assert dim % num_heads == 0, "hidden_dim % num_heads should be zero." 
+ + self.d_head = int(dim / num_heads) + self.num_heads = num_heads + self.query_proj = Linear(dim, self.d_head * num_heads) + self.key_proj = Linear(dim, self.d_head * num_heads) + self.value_proj = Linear(dim, self.d_head * num_heads) + self.scaled_dot_attn = DotProductAttention(dim, scale=True) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + mask: Optional[Tensor] = None, + ) -> Tuple[Tensor, Tensor]: + batch_size = value.size(0) + + query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2) + key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2) + value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2) + + if mask is not None: + mask = mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1) + + context, attn = self.scaled_dot_attn(query, key, value, mask) + + context = context.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.d_head) + + return context, attn diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/pointwise_conv1d.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/pointwise_conv1d.py new file mode 100644 index 000000000..d272025ac --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/pointwise_conv1d.py @@ -0,0 +1,66 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from openspeech.modules.conv_base import BaseConv1d + + +class PointwiseConv1d(BaseConv1d): + r""" + When kernel size == 1 conv1d, this operation is termed in literature as pointwise convolution. + This operation often used to match dimensions. + + Args: + in_channels (int): Number of channels in the input + out_channels (int): Number of channels produced by the convolution + stride (int, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 + bias (bool, optional): If True, adds a learnable bias to the output. Default: True + + Inputs: inputs + - **inputs** (batch, in_channels, time): Tensor containing input vector + + Returns: outputs + - **outputs** (batch, out_channels, time): Tensor produces by pointwise 1-D convolution. 
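A shape sketch for the MultiHeadAttention above (package assumed importable; sizes arbitrary). The 512-dim projections are split into 8 heads of 64, attended independently, then concatenated back:

import torch
from openspeech.modules.multi_head_attention import MultiHeadAttention

attention = MultiHeadAttention(dim=512, num_heads=8)
encoder_outputs = torch.randn(2, 50, 512)        # (batch, time, dim)
context, attn = attention(encoder_outputs, encoder_outputs, encoder_outputs)  # self-attention
print(context.shape, attn.shape)                 # torch.Size([2, 50, 512]) torch.Size([2, 8, 50, 50])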
+ """ + def __init__( + self, + in_channels: int, + out_channels: int, + stride: int = 1, + padding: int = 0, + bias: bool = True, + ) -> None: + super(PointwiseConv1d, self).__init__() + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding, + bias=bias, + ) + + def forward(self, inputs: Tensor) -> Tensor: + return self.conv(inputs) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/positional_encoding.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/positional_encoding.py new file mode 100644 index 000000000..445759d5d --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/positional_encoding.py @@ -0,0 +1,50 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import math +import torch +import torch.nn as nn +from torch import Tensor + + +class PositionalEncoding(nn.Module): + r""" + Positional Encoding proposed in "Attention Is All You Need". + Since transformer contains no recurrence and no convolution, in order for the model to make + use of the order of the sequence, we must add some positional information. 
+ + "Attention Is All You Need" use sine and cosine functions of different frequencies: + PE_(pos, 2i) = sin(pos / power(10000, 2i / d_model)) + PE_(pos, 2i+1) = cos(pos / power(10000, 2i / d_model)) + """ + def __init__(self, d_model: int = 512, max_len: int = 5000) -> None: + super(PositionalEncoding, self).__init__() + pe = torch.zeros(max_len, d_model, requires_grad=False) + position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + self.register_buffer('pe', pe) + + def forward(self, length: int) -> Tensor: + return self.pe[:, :length] diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/positionwise_feed_forward.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/positionwise_feed_forward.py new file mode 100644 index 000000000..9aaa8c36b --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/positionwise_feed_forward.py @@ -0,0 +1,47 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + +from openspeech.modules.wrapper import Linear + + +class PositionwiseFeedForward(nn.Module): + """ + Position-wise Feedforward Networks proposed in "Attention Is All You Need". + Fully connected feed-forward network, which is applied to each position separately and identically. + This consists of two linear transformations with a ReLU activation in between. + Another way of describing this is as two convolutions with kernel size 1. 
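A quick sketch of the PositionalEncoding above (package assumed importable): the table is a fixed, non-trainable buffer, and forward simply slices out the first `length` positions:

from openspeech.modules.positional_encoding import PositionalEncoding

pos_encoding = PositionalEncoding(d_model=512, max_len=5000)
pe = pos_encoding(length=100)
print(pe.shape, pe.requires_grad)    # torch.Size([1, 100, 512]) False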
+ """ + def __init__(self, d_model: int = 512, d_ff: int = 2048, dropout_p: float = 0.3) -> None: + super(PositionwiseFeedForward, self).__init__() + self.feed_forward = nn.Sequential( + Linear(d_model, d_ff), + nn.Dropout(dropout_p), + nn.ReLU(), + Linear(d_ff, d_model), + nn.Dropout(dropout_p), + ) + + def forward(self, inputs: Tensor) -> Tensor: + return self.feed_forward(inputs) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_block.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_block.py new file mode 100644 index 000000000..81d0b9c42 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_block.py @@ -0,0 +1,104 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.modules.pointwise_conv1d import PointwiseConv1d +from openspeech.modules.quartznet_subblock import QuartzNetSubBlock + + +class QuartzNetBlock(nn.Module): + r""" + QuartzNet’s design is based on the Jasper architecture, which is a convolutional model trained with + Connectionist Temporal Classification (CTC) loss. The main novelty in QuartzNet’s architecture is that QuartzNet + replaced the 1D convolutions with 1D time-channel separable convolutions, an implementation of depthwise separable + convolutions. 
+ + Inputs: inputs, input_lengths + inputs (torch.FloatTensor): tensor contains input sequence vector + input_lengths (torch.LongTensor): tensor contains sequence lengths + + Returns: output, output_lengths + (torch.FloatTensor, torch.LongTensor) + + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + supported_activations = { + 'hardtanh': nn.Hardtanh(0, 20, inplace=True), + 'relu': nn.ReLU(inplace=True), + 'elu': nn.ELU(inplace=True), + 'leaky_relu': nn.LeakyReLU(inplace=True), + 'gelu': nn.GELU(), + } + + def __init__( + self, + num_sub_blocks: int, + in_channels: int, + out_channels: int, + kernel_size: int, + bias: bool = True, + ) -> None: + super(QuartzNetBlock, self).__init__() + padding = self._get_same_padding(kernel_size, stride=1, dilation=1) + self.layers = nn.ModuleList([ + QuartzNetSubBlock( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + bias=bias, + ) for i in range(num_sub_blocks) + ]) + self.conv1x1 = PointwiseConv1d(in_channels, out_channels) + self.batch_norm = nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1) + + def _get_same_padding(self, kernel_size: int, stride: int, dilation: int): + if stride > 1 and dilation > 1: + raise ValueError("Only stride OR dilation may be greater than 1") + return (kernel_size // 2) * dilation + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Forward propagate of QuartzNet block. + + Inputs: inputs, input_lengths + inputs (torch.FloatTensor): tensor contains input sequence vector + input_lengths (torch.LongTensor): tensor contains sequence lengths + + Returns: output, output_lengths + (torch.FloatTensor, torch.LongTensor) + + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + residual = self.batch_norm(self.conv1x1(inputs)) + + for layer in self.layers[:-1]: + inputs, input_lengths = layer(inputs, input_lengths) + + outputs, output_lengths = self.layers[-1](inputs, input_lengths, residual) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_subblock.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_subblock.py new file mode 100644 index 000000000..2e170c86f --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/quartznet_subblock.py @@ -0,0 +1,96 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
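To make the "time-channel separable" point in the QuartzNetBlock docstring concrete, here is a rough parameter-count comparison built from the DepthwiseConv1d and PointwiseConv1d modules above (the channel and kernel sizes are arbitrary):

import torch.nn as nn
from openspeech.modules.depthwise_conv1d import DepthwiseConv1d
from openspeech.modules.pointwise_conv1d import PointwiseConv1d

def n_params(module: nn.Module) -> int:
    return sum(p.numel() for p in module.parameters())

dense = nn.Conv1d(256, 256, kernel_size=33, padding=16)
separable = nn.Sequential(
    DepthwiseConv1d(256, 256, kernel_size=33, padding=16),   # one kernel per channel
    PointwiseConv1d(256, 256),                               # 1x1 conv to mix channels
)
print(n_params(dense), n_params(separable))   # 2162944 vs 74240 parameters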
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Optional, Tuple + +from openspeech.modules.depthwise_conv1d import DepthwiseConv1d +from openspeech.modules.time_channel_separable_conv1d import TimeChannelSeparableConv1d +from openspeech.modules.conv_group_shuffle import ConvGroupShuffle + + +class QuartzNetSubBlock(nn.Module): + r""" + QuartzNet sub-block applies the following operations: a 1D-convolution, batch norm, ReLU, and dropout. + + Args: + in_channels (int): number of channels in the input feature + out_channels (int): number of channels produced by the convolution + kernel_size (int): size of the convolving kernel + padding (int): zero-padding added to both sides of the input. (default: 0) + bias (bool): if True, adds a learnable bias to the output. (default: False) + + Inputs: inputs, input_lengths, residual + - **inputs**: tensor contains input sequence vector + - **input_lengths**: tensor contains sequence lengths + - **residual**: tensor contains residual vector + + Returns: output, output_lengths + * output (torch.FloatTensor): tensor contains output sequence vector + * output_lengths (torch.LongTensor): tensor contains output sequence lengths + """ + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + bias: bool = False, + padding: int = 0, + groups: int = 1, + ) -> None: + super(QuartzNetSubBlock, self).__init__() + self.depthwise_conf1d = DepthwiseConv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + ) + self.tcs_conv = TimeChannelSeparableConv1d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=1, + padding=0, + groups=groups, + bias=bias, + ) + self.group_shuffle = ConvGroupShuffle(groups, out_channels) + self.batch_norm = nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.1) + self.relu = nn.ReLU() + + def forward( + self, + inputs: torch.Tensor, + input_lengths: torch.Tensor, + residual: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor]: + outputs, output_lengths = self.depthwise_conf1d(inputs, input_lengths) + outputs, output_lengths = self.tcs_conv(outputs, output_lengths) + outputs = self.group_shuffle(outputs) + outputs = self.batch_norm(outputs) + + if residual is not None: + outputs += residual + + outputs = self.relu(outputs) + + return outputs, output_lengths diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/relative_multi_head_attention.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/relative_multi_head_attention.py new file mode 100644 index 000000000..6dafad1c3 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/relative_multi_head_attention.py @@ -0,0 +1,121 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following 
conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from typing import Optional + +from openspeech.modules.wrapper import Linear + + +class RelativeMultiHeadAttention(nn.Module): + r""" + Multi-head attention with relative positional encoding. + This concept was proposed in the "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" + + Args: + dim (int): The dimension of model + num_heads (int): The number of attention heads. + dropout_p (float): probability of dropout + + Inputs: query, key, value, pos_embedding, mask + - **query** (batch, time, dim): Tensor containing query vector + - **key** (batch, time, dim): Tensor containing key vector + - **value** (batch, time, dim): Tensor containing value vector + - **pos_embedding** (batch, time, dim): Positional embedding tensor + - **mask** (batch, 1, time2) or (batch, time1, time2): Tensor containing indices to be masked + + Returns: + - **outputs**: Tensor produces by relative multi head attention module. + """ + def __init__( + self, + dim: int = 512, + num_heads: int = 16, + dropout_p: float = 0.1, + ) -> None: + super(RelativeMultiHeadAttention, self).__init__() + assert dim % num_heads == 0, "d_model % num_heads should be zero." 
+ + self.dim = dim + self.d_head = int(dim / num_heads) + self.num_heads = num_heads + self.sqrt_dim = math.sqrt(dim) + + self.query_proj = Linear(dim, dim) + self.key_proj = Linear(dim, dim) + self.value_proj = Linear(dim, dim) + self.pos_proj = Linear(dim, dim, bias=False) + + self.dropout = nn.Dropout(p=dropout_p) + self.u_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head)) + self.v_bias = nn.Parameter(torch.Tensor(self.num_heads, self.d_head)) + torch.nn.init.xavier_uniform_(self.u_bias) + torch.nn.init.xavier_uniform_(self.v_bias) + + self.out_proj = Linear(dim, dim) + + def forward( + self, + query: Tensor, + key: Tensor, + value: Tensor, + pos_embedding: Tensor, + mask: Optional[Tensor] = None, + ) -> Tensor: + batch_size = value.size(0) + + query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.d_head) + key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) + value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.d_head).permute(0, 2, 1, 3) + pos_embedding = self.pos_proj(pos_embedding).view(batch_size, -1, self.num_heads, self.d_head) + + content_score = torch.matmul((query + self.u_bias).transpose(1, 2), key.transpose(2, 3)) + pos_score = torch.matmul((query + self.v_bias).transpose(1, 2), pos_embedding.permute(0, 2, 3, 1)) + pos_score = self._relative_shift(pos_score) + + score = (content_score + pos_score) / self.sqrt_dim + + if mask is not None: + mask = mask.unsqueeze(1) + score.masked_fill_(mask, -1e4) + + attn = F.softmax(score, -1) + attn = self.dropout(attn) + + context = torch.matmul(attn, value).transpose(1, 2) + context = context.contiguous().view(batch_size, -1, self.dim) + + return self.out_proj(context) + + def _relative_shift(self, pos_score: Tensor) -> Tensor: + batch_size, num_heads, seq_length1, seq_length2 = pos_score.size() + zeros = pos_score.new_zeros(batch_size, num_heads, seq_length1, 1) + padded_pos_score = torch.cat([zeros, pos_score], dim=-1) + + padded_pos_score = padded_pos_score.view(batch_size, num_heads, seq_length2 + 1, seq_length1) + pos_score = padded_pos_score[:, :, 1:].view_as(pos_score) + + return pos_score diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/residual_connection_module.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/residual_connection_module.py new file mode 100644 index 000000000..2bb2178b4 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/residual_connection_module.py @@ -0,0 +1,48 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor +from typing import Optional + + +class ResidualConnectionModule(nn.Module): + r""" + Residual Connection Module. + outputs = (module(inputs) x module_factor + inputs x input_factor) + """ + def __init__( + self, + module: nn.Module, + module_factor: float = 1.0, + input_factor: float = 1.0, + ) -> None: + super(ResidualConnectionModule, self).__init__() + self.module = module + self.module_factor = module_factor + self.input_factor = input_factor + + def forward(self, inputs: Tensor, mask: Optional[Tensor] = None) -> Tensor: + if mask is None: + return (self.module(inputs) * self.module_factor) + (inputs * self.input_factor) + else: + return (self.module(inputs, mask) * self.module_factor) + (inputs * self.input_factor) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/swish.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/swish.py new file mode 100644 index 000000000..d4a41a547 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/swish.py @@ -0,0 +1,37 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor + + +class Swish(nn.Module): + r""" + Swish is a smooth, non-monotonic function that consistently matches or outperforms ReLU on deep networks applied + to a variety of challenging domains such as Image classification and Machine translation. 
+ """ + + def __init__(self): + super(Swish, self).__init__() + + def forward(self, inputs: Tensor) -> Tensor: + return inputs * inputs.sigmoid() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/time_channel_separable_conv1d.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/time_channel_separable_conv1d.py new file mode 100644 index 000000000..2c2224f28 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/time_channel_separable_conv1d.py @@ -0,0 +1,61 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from torch import Tensor +from typing import Optional + +from openspeech.modules.conv_base import BaseConv1d + + +class TimeChannelSeparableConv1d(BaseConv1d): + r""" + The total number of weights for a time-channel separable convolution block is K × cin + cin × cout weights. Since K is + generally several times smaller than cout, most weights are + concentrated in the pointwise convolution part. 
+ """ + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int = 1, + padding: int = 0, + groups: int = 1, + bias: bool = True, + ): + super(TimeChannelSeparableConv1d, self).__init__() + self.conv = nn.Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=1, + dilation=1, + padding=padding, + groups=groups, + bias=bias, + ) + + def forward(self, inputs: Tensor, input_lengths: Optional[Tensor] = None) -> Tensor: + if input_lengths is None: + return self.conv(inputs) + else: + return self.conv(inputs), self._get_sequence_lengths(input_lengths) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/transformer_embedding.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/transformer_embedding.py new file mode 100644 index 000000000..79ac64e85 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/transformer_embedding.py @@ -0,0 +1,60 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import math +import torch.nn as nn +from torch import Tensor + + +class TransformerEmbedding(nn.Module): + r""" + Embedding layer. Similarly to other sequence transduction models, transformer use learned embeddings + to convert the input tokens and output tokens to vectors of dimension d_model. + In the embedding layers, transformer multiply those weights by sqrt(d_model) + + Args: + num_embeddings (int): the number of embedding size + pad_id (int): identification of pad token + d_model (int): dimension of model + + Inputs: + inputs (torch.FloatTensor): input of embedding layer + + Returns: + outputs (torch.FloatTensor): output of embedding layer + """ + def __init__(self, num_embeddings: int, pad_id: int, d_model: int = 512) -> None: + super(TransformerEmbedding, self).__init__() + self.sqrt_dim = math.sqrt(d_model) + self.embedding = nn.Embedding(num_embeddings, d_model, padding_idx=pad_id) + + def forward(self, inputs: Tensor) -> Tensor: + r""" + Forward propagate of embedding layer. 
+ + Inputs: + inputs (torch.FloatTensor): input of embedding layer + + Returns: + outputs (torch.FloatTensor): output of embedding layer + """ + return self.embedding(inputs) * self.sqrt_dim diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/vgg_extractor.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/vgg_extractor.py new file mode 100644 index 000000000..4b2af85aa --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/vgg_extractor.py @@ -0,0 +1,81 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Tuple + +from openspeech.modules import Conv2dExtractor + + +class VGGExtractor(Conv2dExtractor): + r""" + VGG extractor for automatic speech recognition described in + "Advances in Joint CTC-Attention based End-to-End Speech Recognition with a Deep CNN Encoder and RNN-LM" paper + - https://arxiv.org/pdf/1706.02737.pdf + + Args: + input_dim (int): Dimension of input vector + in_channels (int): Number of channels in the input image + out_channels (int or tuple): Number of channels produced by the convolution + activation (str): Activation function + + Inputs: inputs, input_lengths + - **inputs** (batch, time, dim): Tensor containing input vectors + - **input_lengths**: Tensor containing containing sequence lengths + + Returns: outputs, output_lengths + - **outputs**: Tensor produced by the convolution + - **output_lengths**: Tensor containing sequence lengths produced by the convolution + """ + def __init__( + self, + input_dim: int, + in_channels: int = 1, + out_channels: int or tuple = (64, 128), + activation: str = 'hardtanh', + ): + super(VGGExtractor, self).__init__(input_dim=input_dim, activation=activation) + self.in_channels = in_channels + self.out_channels = out_channels + from openspeech.modules import MaskConv2d + self.conv = MaskConv2d( + nn.Sequential( + nn.Conv2d(in_channels, out_channels[0], kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(num_features=out_channels[0]), + self.activation, + nn.Conv2d(out_channels[0], out_channels[0], kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(num_features=out_channels[0]), + self.activation, + nn.MaxPool2d(2, stride=2), + nn.Conv2d(out_channels[0], out_channels[1], kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(num_features=out_channels[1]), + self.activation, + 
nn.Conv2d(out_channels[1], out_channels[1], kernel_size=3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(num_features=out_channels[1]), + self.activation, + nn.MaxPool2d(2, stride=2), + ) + ) + + def forward(self, inputs: torch.Tensor, input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + return super().forward(inputs, input_lengths) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/modules/wrapper.py b/audio/speech_recognition/conformer/pytorch/openspeech/modules/wrapper.py new file mode 100644 index 000000000..0aeed9446 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/modules/wrapper.py @@ -0,0 +1,64 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +import torch.nn.init as init +from torch import Tensor + + +class Linear(nn.Module): + r""" + Wrapper class of torch.nn.Linear + Weight initialize by xavier initialization and bias initialize to zeros. + """ + def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None: + super(Linear, self).__init__() + self.linear = nn.Linear(in_features, out_features, bias=bias) + init.xavier_uniform_(self.linear.weight) + if bias: + init.zeros_(self.linear.bias) + + def forward(self, x: Tensor) -> Tensor: + return self.linear(x) + + +class View(nn.Module): + r""" Wrapper class of torch.view() for Sequential module. """ + def __init__(self, shape: tuple, contiguous: bool = False): + super(View, self).__init__() + self.shape = shape + self.contiguous = contiguous + + def forward(self, inputs): + if self.contiguous: + inputs = inputs.contiguous() + return inputs.view(*self.shape) + + +class Transpose(nn.Module): + r""" Wrapper class of torch.transpose() for Sequential module. 
""" + def __init__(self, shape: tuple): + super(Transpose, self).__init__() + self.shape = shape + + def forward(self, inputs: Tensor): + return inputs.transpose(*self.shape) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/__init__.py new file mode 100644 index 000000000..f225e5179 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/__init__.py @@ -0,0 +1,44 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import importlib + +from openspeech.optim.adamp import AdamP +from openspeech.optim.radam import RAdam +from openspeech.optim.novograd import Novograd + +# automatically import any Python files in the models/ directory +scheduler_dir = os.path.dirname(__file__) +for file in os.listdir(scheduler_dir): + if os.path.isdir(os.path.join(scheduler_dir, file)) and file != '__pycache__': + for subfile in os.listdir(os.path.join(scheduler_dir, file)): + path = os.path.join(scheduler_dir, file, subfile) + if subfile.endswith(".py"): + scheduler_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile + module = importlib.import_module(f"openspeech.optim.scheduler.{scheduler_name}") + continue + + path = os.path.join(scheduler_dir, file) + if file.endswith(".py"): + scheduler_name = file[: file.find(".py")] if file.endswith(".py") else file + module = importlib.import_module(f"openspeech.optim.{scheduler_name}") diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/adamp.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/adamp.py new file mode 100644 index 000000000..591e6d9b0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/adamp.py @@ -0,0 +1,109 @@ +# AdamP +# Copyright (c) 2020-present NAVER Corp. +# MIT license + +import torch +from torch.optim.optimizer import Optimizer +import math + + +class AdamP(Optimizer): + """ + Paper: "AdamP: Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights" + + Copied from https://github.com/clovaai/AdamP/ + Copyright (c) 2020 Naver Corp. 
+ MIT License + """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) + super(AdamP, self).__init__(params, defaults) + + def _channel_view(self, x): + return x.view(x.size(0), -1) + + def _layer_view(self, x): + return x.view(1, -1) + + def _cosine_similarity(self, x, y, eps, view_func): + x = view_func(x) + y = view_func(y) + + x_norm = x.norm(dim=1).add_(eps) + y_norm = y.norm(dim=1).add_(eps) + dot = (x * y).sum(dim=1) + + return dot.abs() / x_norm / y_norm + + def _projection(self, p, grad, perturb, delta, wd_ratio, eps): + wd = 1 + expand_size = [-1] + [1] * (len(p.shape) - 1) + for view_func in [self._channel_view, self._layer_view]: + + cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) + + if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): + p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) + perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) + wd = wd_ratio + + return perturb, wd + + return perturb, wd + + def step(self, closure=None): + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + grad = p.grad.data + beta1, beta2 = group['betas'] + nesterov = group['nesterov'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p.data) + state['exp_avg_sq'] = torch.zeros_like(p.data) + + # Adam + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + exp_avg.mul_(beta1).add_(1 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + step_size = group['lr'] / bias_correction1 + + if nesterov: + perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom + else: + perturb = exp_avg / denom + + # Projection + wd_ratio = 1 + if len(p.shape) > 1: + perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], + group['eps']) + + # Weight decay + if group['weight_decay'] > 0: + p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio) + + # Step + p.data.add_(-step_size, perturb) + + return loss diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/novograd.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/novograd.py new file mode 100644 index 000000000..e7b78e42d --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/novograd.py @@ -0,0 +1,127 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
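+
+# Illustrative usage sketch for the Novograd optimizer defined below (the toy
+# model and hyper-parameters here are assumptions, not part of the upstream code):
+#
+#     model = torch.nn.Linear(10, 2)
+#     optimizer = Novograd(model.parameters(), lr=1e-3, betas=(0.95, 0.25), weight_decay=1e-3)
+#     optimizer.zero_grad()
+#     model(torch.randn(4, 10)).sum().backward()
+#     optimizer.step()
+#
+# Unlike Adam, the second moment (state['exp_avg_sq']) is kept as a scalar per
+# parameter tensor: it tracks a running average of the layer-wise squared
+# gradient norm, and each gradient is divided by its square root before the
+# momentum update.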
+ +import torch +from torch.optim.optimizer import Optimizer + + +class Novograd(Optimizer): + """ + Novograd algorithm. + + Copied from https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper/optimizers.py + Copyright (c) 2019 NVIDIA Corp. + Apache-2.0 License + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.95, 0)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging: gradient averaging + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.95, 0), eps=1e-8, + weight_decay=0, grad_averaging=False, amsgrad=False): + if 0.0 > lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if 0.0 > eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + amsgrad=amsgrad) + + super(Novograd, self).__init__(params, defaults) + + def __setstate__(self, state): + super(Novograd, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Sparse gradients are not supported.') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + norm = torch.sum(torch.pow(grad, 2)) + + if exp_avg_sq == 0: + exp_avg_sq.copy_(norm) + else: + exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. 
of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + grad.div_(denom) + if group['weight_decay'] != 0: + grad.add_(p.data, alpha=group['weight_decay']) + if group['grad_averaging']: + grad.mul_(1 - beta1) + exp_avg.mul_(beta1).add_(grad) + + p.data.add_(exp_avg, alpha=-group['lr']) + + return loss diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/optimizer.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/optimizer.py new file mode 100644 index 000000000..39f40f3f3 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/optimizer.py @@ -0,0 +1,83 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch + +from openspeech.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler +from openspeech.optim.scheduler.warmup_reduce_lr_on_plateau_scheduler import WarmupReduceLROnPlateauScheduler + + +class Optimizer(object): + """ + This is wrapper classs of torch.optim.Optimizer. + This class provides functionalities for learning rate scheduling and gradient norm clipping. + + Args: + optim (torch.optim.Optimizer): optimizer object, the parameters to be optimized + should be given when instantiating the object, e.g. 
torch.optim.Adam, torch.optim.SGD + scheduler (openspeech.optim.scheduler, optional): learning rate scheduler + scheduler_period (int, optional): timestep with learning rate scheduler + max_grad_norm (int, optional): value used for gradient norm clipping + """ + def __init__(self, optim, scheduler=None, scheduler_period=None, max_grad_norm=0): + self.optimizer = optim + self.scheduler = scheduler + self.scheduler_period = scheduler_period + self.max_grad_norm = max_grad_norm + self.count = 0 + + def step(self, model): + if self.max_grad_norm > 0: + torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm) + self.optimizer.step() + + if self.scheduler is not None: + self.update() + self.count += 1 + + if self.scheduler_period == self.count: + self.scheduler = None + self.scheduler_period = 0 + self.count = 0 + + def set_scheduler(self, scheduler, scheduler_period): + self.scheduler = scheduler + self.scheduler_period = scheduler_period + self.count = 0 + + def update(self, val_loss=None): + if isinstance(self.scheduler, ReduceLROnPlateauScheduler) \ + or isinstance(self.scheduler, WarmupReduceLROnPlateauScheduler): + self.scheduler.step(val_loss) + else: + self.scheduler.step() + + def zero_grad(self): + self.optimizer.zero_grad() + + def get_lr(self): + for g in self.optimizer.param_groups: + return g['lr'] + + def set_lr(self, lr): + for g in self.optimizer.param_groups: + g['lr'] = lr diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/radam.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/radam.py new file mode 100644 index 000000000..6c973973c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/radam.py @@ -0,0 +1,118 @@ +# Copyright (c) 2020, LiyuanLucasLiu. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
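+
+# Illustrative usage sketch for the RAdam optimizer defined below (the toy model
+# is an assumption, not part of the upstream code):
+#
+#     model = torch.nn.Linear(10, 2)
+#     optimizer = RAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))
+#     optimizer.zero_grad()
+#     model(torch.randn(4, 10)).sum().backward()
+#     optimizer.step()
+#
+# RAdam rectifies the variance of the adaptive learning rate: while the
+# approximated SMA length N_sma is still below 5 it falls back to an SGD-style
+# momentum update (when degenerated_to_sgd=True), and afterwards it applies an
+# Adam-style update scaled by the rectification term computed in step().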
+ +import math +import torch +from torch.optim.optimizer import Optimizer + + +class RAdam(Optimizer): + """ + Paper: "On the Variance of the Adaptive Learning Rate and Beyond" + + Refer to https://github.com/LiyuanLucasLiu/RAdam + Copyright (c) LiyuanLucasLiu + Apache 2.0 License + """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, degenerated_to_sgd=True): + if lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if eps < 0.0: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + self.degenerated_to_sgd = degenerated_to_sgd + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + buffer=[[None, None, None] for _ in range(10)]) + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + def step(self, closure=None): + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_data_fp32 = p.data.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_data_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1 - beta1, grad) + + state['step'] += 1 + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + N_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + N_sma_max = 2 / (1 - beta2) - 1 + N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = N_sma + + # more conservative since it's an approximated value + if N_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( + N_sma_max - 2)) / (1 - beta1 ** state['step']) + elif self.degenerated_to_sgd: + step_size = 1.0 / (1 - beta1 ** state['step']) + else: + step_size = -1 + buffered[2] = step_size + + # more conservative since it's an approximated value + if N_sma >= 5: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_data_fp32.addcdiv_(-step_size * group['lr'], exp_avg, denom) + p.data.copy_(p_data_fp32) + elif step_size > 0: + if group['weight_decay'] != 0: + p_data_fp32.add_(-group['weight_decay'] * group['lr'], p_data_fp32) + p_data_fp32.add_(-step_size * group['lr'], exp_avg) + p.data.copy_(p_data_fp32) + + return loss diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/__init__.py 
b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/__init__.py new file mode 100644 index 000000000..125842f79 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/__init__.py @@ -0,0 +1,59 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import importlib + +SCHEDULER_REGISTRY = {} +SCHEDULER_DATACLASS_REGISTRY = {} + + +def register_scheduler(name: str, dataclass=None): + """ + New scheduler types can be added to OpenSpeech with the :func:`register_scheduler` function decorator. + + For example:: + @register_scheduler('reduce_lr_on_plateau') + class ReduceLROnPlateau: + (...) + + .. note:: All scheduler must implement the :class:`cls.__name__` interface. + + Args: + name (str): the name of the scheduler + """ + + def register_scheduler_cls(cls): + if name in SCHEDULER_REGISTRY: + raise ValueError(f"Cannot register duplicate scheduler ({name})") + + SCHEDULER_REGISTRY[name] = cls + + cls.__dataclass = dataclass + if dataclass is not None: + if name in SCHEDULER_DATACLASS_REGISTRY: + raise ValueError(f"Cannot register duplicate scheduler ({name})") + SCHEDULER_DATACLASS_REGISTRY[name] = dataclass + + return cls + + return register_scheduler_cls diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/lr_scheduler.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/lr_scheduler.py new file mode 100644 index 000000000..2e8755246 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/lr_scheduler.py @@ -0,0 +1,47 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from torch.optim.lr_scheduler import _LRScheduler + + +class LearningRateScheduler(_LRScheduler): + r""" + Provides inteface of learning rate scheduler. + + Note: + Do not use this class directly, use one of the sub classes. + """ + def __init__(self, optimizer, init_lr): + self.optimizer = optimizer + self.init_lr = init_lr + + def step(self, *args, **kwargs): + raise NotImplementedError + + @staticmethod + def set_lr(optimizer, lr): + for g in optimizer.param_groups: + g['lr'] = lr + + def get_lr(self): + for g in self.optimizer.param_groups: + return g['lr'] diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/reduce_lr_on_plateau_scheduler.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/reduce_lr_on_plateau_scheduler.py new file mode 100644 index 000000000..789281be0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/reduce_lr_on_plateau_scheduler.py @@ -0,0 +1,82 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from dataclasses import dataclass, field +from torch.optim import Optimizer +from typing import Optional + +from openspeech.dataclass.configurations import LearningRateSchedulerConfigs +from openspeech.optim.scheduler import register_scheduler +from openspeech.optim.scheduler.lr_scheduler import LearningRateScheduler + + +@dataclass +class ReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): + scheduler_name: str = field( + default="reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} + ) + lr_patience: int = field( + default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} + ) + lr_factor: float = field( + default=0.3, metadata={"help": "Factor by which the learning rate will be reduced. 
new_lr = lr * factor."} + ) + + +@register_scheduler("reduce_lr_on_plateau", dataclass=ReduceLROnPlateauConfigs) +class ReduceLROnPlateauScheduler(LearningRateScheduler): + r""" + Reduce learning rate when a metric has stopped improving. Models often benefit from reducing the learning rate by + a factor of 2-10 once learning stagnates. This scheduler reads a metrics quantity and if no improvement is seen + for a ‘patience’ number of epochs, the learning rate is reduced. + + Args: + optimizer (Optimizer): wrapped optimizer. + configs (DictConfig): configuration set. + """ + def __init__( + self, + optimizer: Optimizer, + configs, + ) -> None: + super(ReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) + self.lr = configs.lr_scheduler.lr + self.lr_patience = configs.lr_scheduler.lr_patience + self.lr_factor = configs.lr_scheduler.lr_factor + self.val_loss = 100.0 + self.count = 0 + + def step(self, val_loss: Optional[float] = None): + if val_loss is not None: + if self.val_loss < val_loss: + self.count += 1 + self.val_loss = val_loss + else: + self.count = 0 + self.val_loss = val_loss + + if self.lr_patience == self.count: + self.count = 0 + self.lr *= self.lr_factor + self.set_lr(self.optimizer, self.lr) + + return self.lr diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/transformer_lr_scheduler.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/transformer_lr_scheduler.py new file mode 100644 index 000000000..bd2fde7d6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/transformer_lr_scheduler.py @@ -0,0 +1,109 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
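+
+# Schedule implemented below, summarised for reference (matches the code in
+# TransformerLRScheduler.step()):
+#
+#     warmup (first warmup_steps updates): lr rises linearly from ~0 to peak_lr
+#     decay  (next decay_steps updates):   lr = peak_lr * exp(-decay_factor * t),
+#                                          decay_factor = -log(final_lr_scale) / decay_steps
+#     afterwards:                          lr = final_lr
+#
+# Minimal construction sketch (assumes a DictConfig-style `configs` object with a
+# populated `lr_scheduler` group, as used throughout openspeech):
+#
+#     scheduler = TransformerLRScheduler(optimizer, configs)
+#     for _ in range(total_steps):
+#         ...  # forward/backward and optimizer.step()
+#         scheduler.step()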
+
+import math
+import torch
+from typing import Optional
+from dataclasses import dataclass, field
+from torch.optim import Optimizer
+
+from openspeech.dataclass.configurations import LearningRateSchedulerConfigs
+from openspeech.optim.scheduler import register_scheduler
+from openspeech.optim.scheduler.lr_scheduler import LearningRateScheduler
+
+
+@dataclass
+class TransformerLRSchedulerConfigs(LearningRateSchedulerConfigs):
+    scheduler_name: str = field(
+        default="transformer", metadata={"help": "Name of learning rate scheduler."}
+    )
+    peak_lr: float = field(
+        default=1e-04, metadata={"help": "Maximum learning rate."}
+    )
+    final_lr: float = field(
+        default=1e-07, metadata={"help": "Final learning rate."}
+    )
+    final_lr_scale: float = field(
+        default=0.05, metadata={"help": "Final learning rate scale"}
+    )
+    warmup_steps: int = field(
+        default=10000, metadata={"help": "Warmup the learning rate linearly for the first N updates"}
+    )
+    decay_steps: int = field(
+        default=150000, metadata={"help": "Steps in decay stages"}
+    )
+
+
+@register_scheduler("transformer", dataclass=TransformerLRSchedulerConfigs)
+class TransformerLRScheduler(LearningRateScheduler):
+    r"""
+    Transformer Learning Rate Scheduler proposed in "Attention Is All You Need"
+
+    Args:
+        optimizer (Optimizer): wrapped optimizer.
+        configs (DictConfig): configuration set.
+    """
+    def __init__(
+        self,
+        optimizer: Optimizer,
+        configs,
+    ) -> None:
+        assert isinstance(configs.lr_scheduler.warmup_steps, int), "warmup_steps should be integer type"
+        assert isinstance(configs.lr_scheduler.decay_steps, int), "decay_steps should be integer type"
+
+        super(TransformerLRScheduler, self).__init__(optimizer, 0.0)
+        self.final_lr = configs.lr_scheduler.final_lr
+        self.peak_lr = configs.lr_scheduler.peak_lr
+        self.warmup_steps = configs.lr_scheduler.warmup_steps
+        self.decay_steps = configs.lr_scheduler.decay_steps
+
+        self.warmup_rate = self.peak_lr / self.warmup_steps
+        self.decay_factor = -math.log(configs.lr_scheduler.final_lr_scale) / self.decay_steps
+
+        self.lr = self.init_lr
+        self.update_step = 0
+
+    def _decide_stage(self):
+        if self.update_step < self.warmup_steps:
+            return 0, self.update_step
+
+        if self.warmup_steps <= self.update_step < self.warmup_steps + self.decay_steps:
+            return 1, self.update_step - self.warmup_steps
+
+        return 2, None
+
+    def step(self, val_loss: Optional[torch.FloatTensor] = None):
+        self.update_step += 1
+        stage, steps_in_stage = self._decide_stage()
+
+        if stage == 0:
+            self.lr = self.update_step * self.warmup_rate
+        elif stage == 1:
+            self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage)
+        elif stage == 2:
+            self.lr = self.final_lr
+        else:
+            raise ValueError("Undefined stage")
+
+        self.set_lr(self.optimizer, self.lr)
+
+        return self.lr
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/tri_stage_lr_scheduler.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/tri_stage_lr_scheduler.py
new file mode 100644
index 000000000..754081ee0
--- /dev/null
+++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/tri_stage_lr_scheduler.py
@@ -0,0 +1,154 @@
+# MIT License
+#
+# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import math
+import torch
+from dataclasses import dataclass, field
+from typing import Optional
+from torch.optim import Optimizer
+
+from openspeech.dataclass.configurations import LearningRateSchedulerConfigs
+from openspeech.optim.scheduler import register_scheduler
+from openspeech.optim.scheduler.lr_scheduler import LearningRateScheduler
+
+
+@dataclass
+class TriStageLRSchedulerConfigs(LearningRateSchedulerConfigs):
+    scheduler_name: str = field(
+        default="tri_stage", metadata={"help": "Name of learning rate scheduler."}
+    )
+    init_lr: float = field(
+        default=1e-7, metadata={"help": "Initial learning rate."}
+    )
+    init_lr_scale: float = field(
+        default=0.01, metadata={"help": "Initial learning rate scale."}
+    )
+    final_lr_scale: float = field(
+        default=0.01, metadata={"help": "Final learning rate scale"}
+    )
+    phase_ratio: str = field(
+        default="(0.1, 0.4, 0.5)", metadata={"help": "Automatically sets warmup/hold/decay steps to the ratio "
+                                                     "specified here from max_updates. The ratios must add up to 1.0"}
+    )
+    total_steps: int = field(
+        default=400000, metadata={"help": "Total training steps."}
+    )
+
+
+@register_scheduler("tri_stage", dataclass=TriStageLRSchedulerConfigs)
+class TriStageLRScheduler(LearningRateScheduler):
+    r"""
+    Tri-Stage Learning Rate Scheduler. Implements the learning rate schedule described in "SpecAugment".
+
+    Similar to the inverse_square_root scheduler, but the tri-stage schedule employs
+    three stages of LR scheduling:
+
+        - warmup stage, starting from `lr` * `init_lr_scale`, linearly
+          increased to `lr` in `warmup_steps` iterations
+        - hold stage, after `warmup_steps`, keep the LR at `lr` for `hold_steps`
+          iterations
+        - decay stage, after the hold stage, decay the LR exponentially to
+          `lr` * `final_lr_scale` in `decay_steps`;
+          after that the LR is kept at `final_lr_scale` * `lr`
+
+    During warmup::
+        init_lr = cfg.init_lr_scale * cfg.lr
+        lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps)
+        lr = lrs[update_num]
+
+    During hold::
+        lr = cfg.lr
+
+    During decay::
+        decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps
+        lr = cfg.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor)
+
+    After that::
+        lr = cfg.lr * cfg.final_lr_scale
+
+    Args:
+        optimizer (Optimizer): wrapped optimizer.
+        configs (DictConfig): configuration set.
+ """ + def __init__( + self, + optimizer: Optimizer, + configs, + ): + super(TriStageLRScheduler, self).__init__(optimizer, configs.lr_scheduler.init_lr) + + self.phase_ratio = eval(configs.lr_scheduler.phase_ratio) + + self.warmup_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[0]) + self.hold_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[1]) + self.decay_steps = int(configs.lr_scheduler.total_steps * self.phase_ratio[2]) + + self.peak_lr = configs.lr_scheduler.lr + self.init_lr = configs.lr_scheduler.init_lr_scale * configs.lr_scheduler.lr + self.final_lr = configs.lr_scheduler.final_lr_scale * configs.lr_scheduler.lr + + self.warmup_rate = ( + (self.peak_lr - self.init_lr) / self.warmup_steps + if self.warmup_steps != 0 + else 0 + ) + self.decay_factor = -math.log(configs.lr_scheduler.final_lr_scale) / self.decay_steps + self.update_step = 0 + self.lr = self.init_lr + + def _decide_stage(self): + if self.update_step < self.warmup_steps: + return 0, self.update_step + + offset = self.warmup_steps + + if self.update_step < offset + self.hold_steps: + return 1, self.update_step - offset + + offset += self.hold_steps + + if self.update_step <= offset + self.decay_steps: + # decay stage + return 2, self.update_step - offset + + offset += self.decay_steps + + return 3, self.update_step - offset + + def step(self, val_loss: Optional[torch.FloatTensor] = None): + stage, steps_in_stage = self._decide_stage() + + if stage == 0: + self.lr = self.init_lr + self.warmup_rate * steps_in_stage + elif stage == 1: + self.lr = self.peak_lr + elif stage == 2: + self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage) + elif stage == 3: + self.lr = self.final_lr + else: + raise ValueError("Undefined stage") + + self.set_lr(self.optimizer, self.lr) + self.update_step += 1 + + return self.lr diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_reduce_lr_on_plateau_scheduler.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_reduce_lr_on_plateau_scheduler.py new file mode 100644 index 000000000..1031d9eef --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_reduce_lr_on_plateau_scheduler.py @@ -0,0 +1,102 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from torch.optim import Optimizer +from dataclasses import dataclass, field +from typing import Optional + +from openspeech.dataclass.configurations import LearningRateSchedulerConfigs +from openspeech.optim.scheduler import register_scheduler +from openspeech.optim.scheduler.lr_scheduler import LearningRateScheduler +from openspeech.optim.scheduler.reduce_lr_on_plateau_scheduler import ReduceLROnPlateauScheduler +from openspeech.optim.scheduler.warmup_scheduler import WarmupLRScheduler + + +@dataclass +class WarmupReduceLROnPlateauConfigs(LearningRateSchedulerConfigs): + scheduler_name: str = field( + default="warmup_reduce_lr_on_plateau", metadata={"help": "Name of learning rate scheduler."} + ) + lr_patience: int = field( + default=1, metadata={"help": "Number of epochs with no improvement after which learning rate will be reduced."} + ) + lr_factor: float = field( + default=0.3, metadata={"help": "Factor by which the learning rate will be reduced. new_lr = lr * factor."} + ) + peak_lr: float = field( + default=1e-04, metadata={"help": "Maximum learning rate."} + ) + init_lr: float = field( + default=1e-10, metadata={"help": "Initial learning rate."} + ) + warmup_steps: int = field( + default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} + ) + + +@register_scheduler("warmup_reduce_lr_on_plateau", dataclass=WarmupReduceLROnPlateauConfigs) +class WarmupReduceLROnPlateauScheduler(LearningRateScheduler): + r""" + Warmup learning rate until `warmup_steps` and reduce learning rate on plateau after. + + Args: + optimizer (Optimizer): wrapped optimizer. + configs (DictConfig): configuration set. + """ + def __init__( + self, + optimizer: Optimizer, + configs, + ) -> None: + super(WarmupReduceLROnPlateauScheduler, self).__init__(optimizer, configs.lr_scheduler.lr) + self.warmup_steps = configs.lr_scheduler.warmup_steps + self.update_steps = 0 + self.warmup_rate = (configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr) / self.warmup_steps \ + if self.warmup_steps != 0 else 0 + self.schedulers = [ + WarmupLRScheduler( + optimizer, + configs, + ), + ReduceLROnPlateauScheduler( + optimizer, + configs, + ), + ] + + def _decide_stage(self): + if self.update_steps < self.warmup_steps: + return 0, self.update_steps + else: + return 1, None + + def step(self, val_loss: Optional[float] = None): + stage, steps_in_stage = self._decide_stage() + + if stage == 0: + self.schedulers[0].step() + elif stage == 1: + self.schedulers[1].step(val_loss) + + self.update_steps += 1 + + return self.get_lr() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_scheduler.py b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_scheduler.py new file mode 100644 index 000000000..9c589b005 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/optim/scheduler/warmup_scheduler.py @@ -0,0 +1,82 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included 
in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +from dataclasses import dataclass, field +from typing import Optional +from torch.optim import Optimizer + +from openspeech.dataclass.configurations import LearningRateSchedulerConfigs +from openspeech.optim.scheduler import register_scheduler +from openspeech.optim.scheduler.lr_scheduler import LearningRateScheduler + + +@dataclass +class WarmupLRSchedulerConfigs(LearningRateSchedulerConfigs): + scheduler_name: str = field( + default="warmup", metadata={"help": "Name of learning rate scheduler."} + ) + peak_lr: float = field( + default=1e-04, metadata={"help": "Maximum learning rate."} + ) + init_lr: float = field( + default=1e-7, metadata={"help": "Initial learning rate."} + ) + warmup_steps: int = field( + default=4000, metadata={"help": "Warmup the learning rate linearly for the first N updates"} + ) + total_steps: int = field( + default=200000, metadata={"help": "Total training steps."} + ) + + +@register_scheduler("warmup", dataclass=WarmupLRSchedulerConfigs) +class WarmupLRScheduler(LearningRateScheduler): + """ + Warmup learning rate until `total_steps` + + Args: + optimizer (Optimizer): wrapped optimizer. + configs (DictConfig): configuration set. + """ + def __init__( + self, + optimizer: Optimizer, + configs, + ) -> None: + super(WarmupLRScheduler, self).__init__(optimizer, configs.lr_scheduler.init_lr) + if configs.lr_scheduler.warmup_steps != 0: + warmup_rate = configs.lr_scheduler.peak_lr - configs.lr_scheduler.init_lr + self.warmup_rate = warmup_rate / configs.lr_scheduler.warmup_steps + else: + self.warmup_rate = 0 + self.update_steps = 1 + self.lr = configs.lr_scheduler.init_lr + self.warmup_steps = configs.lr_scheduler.warmup_steps + + def step(self, val_loss: Optional[torch.FloatTensor] = None): + if self.update_steps < self.warmup_steps: + lr = self.init_lr + self.warmup_rate * self.update_steps + self.set_lr(self.optimizer, lr) + self.lr = lr + self.update_steps += 1 + return self.lr diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/__init__.py new file mode 100644 index 000000000..c40825eb7 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/__init__.py @@ -0,0 +1,28 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from .beam_search_base import OpenspeechBeamSearchBase +from .beam_search_ctc import BeamSearchCTC +from .beam_search_lstm import BeamSearchLSTM +from .beam_search_rnn_transducer import BeamSearchRNNTransducer +from .beam_search_transformer import BeamSearchTransformer +from .beam_search_transformer_transducer import BeamSearchTransformerTransducer diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_base.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_base.py new file mode 100644 index 000000000..13d8f6d56 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_base.py @@ -0,0 +1,134 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn + + +class OpenspeechBeamSearchBase(nn.Module): + """ + Openspeech's beam-search base class. Implement the methods required for beamsearch. + You have to implement `forward` method. + + Note: + Do not use this class directly, use one of the sub classes. 
+ """ + def __init__(self, decoder, beam_size: int): + super(OpenspeechBeamSearchBase, self).__init__() + self.decoder = decoder + self.beam_size = beam_size + self.sos_id = decoder.sos_id + self.pad_id = decoder.pad_id + self.eos_id = decoder.eos_id + self.ongoing_beams = None + self.cumulative_ps = None + self.forward_step = decoder.forward_step + + def _inflate(self, tensor: torch.Tensor, n_repeat: int, dim: int) -> torch.Tensor: + repeat_dims = [1] * len(tensor.size()) + repeat_dims[dim] *= n_repeat + return tensor.repeat(*repeat_dims) + + def _get_successor( + self, + current_ps: torch.Tensor, + current_vs: torch.Tensor, + finished_ids: tuple, + num_successor: int, + eos_count: int, + k: int + ) -> int: + finished_batch_idx, finished_idx = finished_ids + + successor_ids = current_ps.topk(k + num_successor)[1] + successor_idx = successor_ids[finished_batch_idx, -1] + + successor_p = current_ps[finished_batch_idx, successor_idx] + successor_v = current_vs[finished_batch_idx, successor_idx] + + prev_status_idx = (successor_idx // k) + prev_status = self.ongoing_beams[finished_batch_idx, prev_status_idx] + prev_status = prev_status.view(-1)[:-1] + + successor = torch.cat([prev_status, successor_v.view(1)]) + + if int(successor_v) == self.eos_id: + self.finished[finished_batch_idx].append(successor) + self.finished_ps[finished_batch_idx].append(successor_p) + eos_count = self._get_successor( + current_ps=current_ps, + current_vs=current_vs, + finished_ids=finished_ids, + num_successor=num_successor + eos_count, + eos_count=eos_count + 1, + k=k, + ) + + else: + self.ongoing_beams[finished_batch_idx, finished_idx] = successor + self.cumulative_ps[finished_batch_idx, finished_idx] = successor_p + + return eos_count + + def _get_hypothesis(self): + predictions = list() + + for batch_idx, batch in enumerate(self.finished): + # if there is no terminated sentences, bring ongoing sentence which has the highest probability instead + if len(batch) == 0: + prob_batch = self.cumulative_ps[batch_idx] + top_beam_idx = int(prob_batch.topk(1)[1]) + predictions.append(self.ongoing_beams[batch_idx, top_beam_idx]) + + # bring highest probability sentence + else: + top_beam_idx = int(torch.FloatTensor(self.finished_ps[batch_idx]).topk(1)[1]) + predictions.append(self.finished[batch_idx][top_beam_idx]) + + predictions = self._fill_sequence(predictions) + return predictions + + def _is_all_finished(self, k: int) -> bool: + for done in self.finished: + if len(done) < k: + return False + + return True + + def _fill_sequence(self, y_hats: list) -> torch.Tensor: + batch_size = len(y_hats) + max_length = -1 + + for y_hat in y_hats: + if len(y_hat) > max_length: + max_length = len(y_hat) + + matched = torch.zeros((batch_size, max_length), dtype=torch.long) + + for batch_idx, y_hat in enumerate(y_hats): + matched[batch_idx, :len(y_hat)] = y_hat + matched[batch_idx, len(y_hat):] = int(self.pad_id) + + return matched + + def forward(self, *args, **kwargs): + raise NotImplementedError diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_ctc.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_ctc.py new file mode 100644 index 000000000..105bb82bf --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_ctc.py @@ -0,0 +1,84 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated 
documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch.nn as nn +from openspeech.utils import CTCDECODE_IMPORT_ERROR + + +class BeamSearchCTC(nn.Module): + r""" + Decodes probability output using ctcdecode package. + + Args: + labels (list): the tokens you used to train your model + lm_path (str): the path to your external kenlm language model(LM). + alpha (int): weighting associated with the LMs probabilities. + beta (int): weight associated with the number of words within our beam + cutoff_top_n (int): cutoff number in pruning. Only the top cutoff_top_n characters with the highest probability + in the vocab will be used in beam search. + cutoff_prob (float): cutoff probability in pruning. 1.0 means no pruning. + beam_size (int): this controls how broad the beam search is. + num_processes (int): parallelize the batch using num_processes workers. + blank_id (int): this should be the index of the CTC blank token + + Inputs: logits, sizes + - logits: Tensor of character probabilities, where probs[c,t] is the probability of character c at time t + - sizes: Size of each sequence in the mini-batch + + Returns: + - outputs: sequences of the model's best prediction + """ + def __init__( + self, + labels: list, + lm_path: str = None, + alpha: int = 0, + beta: int = 0, + cutoff_top_n: int = 40, + cutoff_prob: float = 1.0, + beam_size: int = 3, + num_processes: int = 4, + blank_id: int = 0, + ) -> None: + super(BeamSearchCTC, self).__init__() + try: + from ctcdecode import CTCBeamDecoder + except ImportError: + raise ImportError(CTCDECODE_IMPORT_ERROR) + assert isinstance(labels, list), "labels must instance of list" + self.decoder = CTCBeamDecoder(labels, lm_path, alpha, beta, cutoff_top_n, + cutoff_prob, beam_size, num_processes, blank_id) + + def forward(self, logits, sizes=None): + r""" + Decodes probability output using ctcdecode package. 
+ + Inputs: logits, sizes + logits: Tensor of character probabilities, where probs[c,t] is the probability of character c at time t + sizes: Size of each sequence in the mini-batch + + Returns: + outputs: sequences of the model's best prediction + """ + logits = logits.cpu() + outputs, scores, offsets, seq_lens = self.decoder.decode(logits, sizes) + return outputs diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_lstm.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_lstm.py new file mode 100644 index 000000000..87f92c2ec --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_lstm.py @@ -0,0 +1,154 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch + +from openspeech.search.beam_search_base import OpenspeechBeamSearchBase +from openspeech.decoders import LSTMAttentionDecoder + + +class BeamSearchLSTM(OpenspeechBeamSearchBase): + r""" + LSTM Beam Search Decoder + + Args: decoder, beam_size, batch_size + decoder (DecoderLSTM): base decoder of lstm model. + beam_size (int): size of beam. + + Inputs: encoder_outputs, targets, encoder_output_lengths, teacher_forcing_ratio + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + targets (torch.LongTensor): A target sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + encoder_output_lengths (torch.LongTensor): A encoder output lengths sequence. `LongTensor` of size ``(batch)`` + teacher_forcing_ratio (float): Ratio of teacher forcing. + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. + """ + def __init__(self, decoder: LSTMAttentionDecoder, beam_size: int): + super(BeamSearchLSTM, self).__init__(decoder, beam_size) + self.hidden_state_dim = decoder.hidden_state_dim + self.num_layers = decoder.num_layers + self.validate_args = decoder.validate_args + + def forward( + self, + encoder_outputs: torch.Tensor, + encoder_output_lengths: torch.Tensor, + ) -> torch.Tensor: + r""" + Beam search decoding. + + Inputs: encoder_outputs + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. 
+ """ + batch_size, hidden_states = encoder_outputs.size(0), None + + self.finished = [[] for _ in range(batch_size)] + self.finished_ps = [[] for _ in range(batch_size)] + + inputs, batch_size, max_length = self.validate_args(None, encoder_outputs, teacher_forcing_ratio=0.0) + + step_outputs, hidden_states, attn = self.forward_step(inputs, hidden_states, encoder_outputs) + self.cumulative_ps, self.ongoing_beams = step_outputs.topk(self.beam_size) + + self.ongoing_beams = self.ongoing_beams.view(batch_size * self.beam_size, 1) + self.cumulative_ps = self.cumulative_ps.view(batch_size * self.beam_size, 1) + + input_var = self.ongoing_beams + + encoder_dim = encoder_outputs.size(2) + encoder_outputs = self._inflate(encoder_outputs, self.beam_size, dim=0) + encoder_outputs = encoder_outputs.view(self.beam_size, batch_size, -1, encoder_dim) + encoder_outputs = encoder_outputs.transpose(0, 1) + encoder_outputs = encoder_outputs.reshape(batch_size * self.beam_size, -1, encoder_dim) + + if attn is not None: + attn = self._inflate(attn, self.beam_size, dim=0) + + if isinstance(hidden_states, tuple): + hidden_states = tuple([self._inflate(h, self.beam_size, 1) for h in hidden_states]) + else: + hidden_states = self._inflate(hidden_states, self.beam_size, 1) + + for di in range(max_length - 1): + if self._is_all_finished(self.beam_size): + break + + if isinstance(hidden_states, tuple): + tuple(h.view(self.num_layers, batch_size * self.beam_size, self.hidden_state_dim) for h in hidden_states) + else: + hidden_states = hidden_states.view(self.num_layers, batch_size * self.beam_size, self.hidden_state_dim) + step_outputs, hidden_states, attn = self.forward_step(input_var, hidden_states, encoder_outputs, attn) + + step_outputs = step_outputs.view(batch_size, self.beam_size, -1) + current_ps, current_vs = step_outputs.topk(self.beam_size) + + self.cumulative_ps = self.cumulative_ps.view(batch_size, self.beam_size) + self.ongoing_beams = self.ongoing_beams.view(batch_size, self.beam_size, -1) + + current_ps = (current_ps.permute(0, 2, 1) + self.cumulative_ps.unsqueeze(1)).permute(0, 2, 1) + current_ps = current_ps.view(batch_size, self.beam_size ** 2) + current_vs = current_vs.view(batch_size, self.beam_size ** 2) + + self.cumulative_ps = self.cumulative_ps.view(batch_size, self.beam_size) + self.ongoing_beams = self.ongoing_beams.view(batch_size, self.beam_size, -1) + + topk_current_ps, topk_status_ids = current_ps.topk(self.beam_size) + prev_status_ids = (topk_status_ids // self.beam_size) + + topk_current_vs = torch.zeros((batch_size, self.beam_size), dtype=torch.long) + prev_status = torch.zeros(self.ongoing_beams.size(), dtype=torch.long) + + for batch_idx, batch in enumerate(topk_status_ids): + for idx, topk_status_idx in enumerate(batch): + topk_current_vs[batch_idx, idx] = current_vs[batch_idx, topk_status_idx] + prev_status[batch_idx, idx] = self.ongoing_beams[batch_idx, prev_status_ids[batch_idx, idx]] + + self.ongoing_beams = torch.cat([prev_status, topk_current_vs.unsqueeze(2)], dim=2) + self.cumulative_ps = topk_current_ps + + if torch.any(topk_current_vs == self.eos_id): + finished_ids = torch.where(topk_current_vs == self.eos_id) + num_successors = [1] * batch_size + + for (batch_idx, idx) in zip(*finished_ids): + self.finished[batch_idx].append(self.ongoing_beams[batch_idx, idx]) + self.finished_ps[batch_idx].append(self.cumulative_ps[batch_idx, idx]) + + if self.beam_size != 1: + eos_count = self._get_successor( + current_ps=current_ps, + current_vs=current_vs, + finished_ids=(batch_idx, 
idx), + num_successor=num_successors[batch_idx], + eos_count=1, + k=self.beam_size, + ) + num_successors[batch_idx] += eos_count + + input_var = self.ongoing_beams[:, :, -1] + input_var = input_var.view(batch_size * self.beam_size, -1) + + return self._get_hypothesis() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_rnn_transducer.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_rnn_transducer.py new file mode 100644 index 000000000..f752ee187 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_rnn_transducer.py @@ -0,0 +1,156 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch + +from openspeech.search.beam_search_base import OpenspeechBeamSearchBase +from openspeech.decoders import RNNTransducerDecoder + + +class BeamSearchRNNTransducer(OpenspeechBeamSearchBase): + r""" + RNN Transducer Beam Search + Reference: RNN-T FOR LATENCY CONTROLLED ASR WITH IMPROVED BEAM SEARCH (https://arxiv.org/pdf/1911.01629.pdf) + + Args: joint, decoder, beam_size, expand_beam, state_beam, blank_id + joint: joint `encoder_outputs` and `decoder_outputs` + decoder (TransformerTransducerDecoder): base decoder of transformer transducer model. + beam_size (int): size of beam. + expand_beam (int): The threshold coefficient to limit the number of expanded hypotheses. + state_beam (int): The threshold coefficient to decide if hyps in A (process_hyps) + is likely to compete with hyps in B (ongoing_beams) + blank_id (int): blank id + + Inputs: encoder_output, max_length + encoder_output (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(seq_length, dimension)`` + max_length (int): max decoding time step + + Returns: + * predictions (torch.LongTensor): model predictions. + """ + def __init__( + self, + joint, + decoder: RNNTransducerDecoder, + beam_size: int = 3, + expand_beam: float = 2.3, + state_beam: float = 4.6, + blank_id: int = 3, + ) -> None: + super(BeamSearchRNNTransducer, self).__init__(decoder, beam_size) + self.joint = joint + self.expand_beam = expand_beam + self.state_beam = state_beam + self.blank_id = blank_id + + def forward(self, encoder_outputs: torch.Tensor, max_length: int): + r""" + Beam search decoding. + + Inputs: encoder_output, max_length + encoder_outputs (torch.FloatTensor): A output sequence of encoders. 
`FloatTensor` of size + ``(batch, seq_length, dimension)`` + max_length (int): max decoding time step + + Returns: + * predictions (torch.LongTensor): model predictions. + """ + hypothesis = list() + hypothesis_score = list() + + for batch_idx in range(encoder_outputs.size(0)): + blank = ( + torch.ones((1, 1), device=encoder_outputs.device, dtype=torch.long) * self.blank_id + ) + step_input = ( + torch.ones((1, 1), device=encoder_outputs.device, dtype=torch.long) * self.sos_id + ) + hyp = { + "prediction": [self.sos_id], + "logp_score": 0.0, + "hidden_states": None, + } + ongoing_beams = [hyp] + + for t_step in range(max_length): + process_hyps = ongoing_beams + ongoing_beams = list() + + while True: + if len(ongoing_beams) >= self.beam_size: + break + + a_best_hyp = max(process_hyps, key=lambda x: x["logp_score"] / len(x["prediction"])) + + if len(ongoing_beams) > 0: + b_best_hyp = max( + ongoing_beams, + key=lambda x: x["logp_score"] / len(x["prediction"]), + ) + + a_best_prob = a_best_hyp["logp_score"] + b_best_prob = b_best_hyp["logp_score"] + + if b_best_prob >= self.state_beam + a_best_prob: + break + + process_hyps.remove(a_best_hyp) + + step_input[0, 0] = a_best_hyp["prediction"][-1] + + step_outputs, hidden_states = self.decoder(step_input, a_best_hyp["hidden_states"]) + log_probs = self.joint(encoder_outputs[batch_idx, t_step, :], step_outputs.view(-1)) + + topk_targets, topk_idx = log_probs.topk(k=self.beam_size) + + if topk_idx[0] != blank: + best_logp = topk_targets[0] + else: + best_logp = topk_targets[1] + + for j in range(topk_targets.size(0)): + topk_hyp = { + "prediction": a_best_hyp["prediction"][:], + "logp_score": a_best_hyp["logp_score"] + topk_targets[j], + "hidden_states": a_best_hyp["hidden_states"], + } + + if topk_idx[j] == self.blank_id: + ongoing_beams.append(topk_hyp) + continue + + if topk_targets[j] >= best_logp - self.expand_beam: + topk_hyp["prediction"].append(topk_idx[j].item()) + topk_hyp["hidden_states"] = hidden_states + process_hyps.append(topk_hyp) + + ongoing_beams = sorted( + ongoing_beams, + key=lambda x: x["logp_score"] / len(x["prediction"]), + reverse=True, + )[0] + + hypothesis.append(torch.LongTensor(ongoing_beams["prediction"][1:])) + hypothesis_score.append(ongoing_beams["logp_score"] / len(ongoing_beams["prediction"])) + + return self._fill_sequence(hypothesis) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer.py new file mode 100644 index 000000000..33e12a784 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer.py @@ -0,0 +1,155 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
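
The state_beam / expand_beam pruning used by the transducer searches can be illustrated on toy numbers; the log-probabilities below are invented purely to show which hypotheses survive each rule.

# Toy illustration of the two transducer pruning rules (invented numbers).
state_beam, expand_beam = 4.6, 2.3

# Rule 1: stop expanding process_hyps once the best ongoing beam is far ahead.
a_best, b_best = -12.0, -6.0          # log-scores of the best hyps in A and B
keep_expanding = not (b_best >= state_beam + a_best)
print("keep expanding A:", keep_expanding)   # False: -6.0 >= 4.6 + (-12.0)

# Rule 2: of the top-k next tokens, keep blank plus anything close to the best non-blank.
blank_id = 3
topk = [(-0.4, 7), (-0.9, 3), (-3.5, 12)]    # (log-prob, token id) pairs
best_logp = next(lp for lp, tok in topk if tok != blank_id)
survivors = [tok for lp, tok in topk if tok == blank_id or lp >= best_logp - expand_beam]
print("surviving tokens:", survivors)        # blank kept; token 12 pruned (gap 3.1 > 2.3)
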
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch + +from openspeech.search.beam_search_base import OpenspeechBeamSearchBase +from openspeech.decoders import TransformerDecoder + + +class BeamSearchTransformer(OpenspeechBeamSearchBase): + r""" + Transformer Beam Search Decoder + + Args: decoder, beam_size, batch_size + decoder (DecoderLSTM): base decoder of lstm model. + beam_size (int): size of beam. + + Inputs: encoder_outputs, targets, encoder_output_lengths, teacher_forcing_ratio + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size ``(batch, seq_length, dimension)`` + targets (torch.LongTensor): A target sequence passed to decoders. `IntTensor` of size ``(batch, seq_length)`` + encoder_output_lengths (torch.LongTensor): A encoder output lengths sequence. `LongTensor` of size ``(batch)`` + teacher_forcing_ratio (float): Ratio of teacher forcing. + + Returns: + * logits (torch.FloatTensor): Log probability of model predictions. + """ + def __init__(self, decoder: TransformerDecoder, beam_size: int = 3) -> None: + super(BeamSearchTransformer, self).__init__(decoder, beam_size) + self.use_cuda = True if torch.cuda.is_available() else False + + def forward( + self, + encoder_outputs: torch.FloatTensor, + encoder_output_lengths: torch.FloatTensor, + ): + batch_size = encoder_outputs.size(0) + + self.finished = [[] for _ in range(batch_size)] + self.finished_ps = [[] for _ in range(batch_size)] + + decoder_inputs = torch.IntTensor(batch_size, self.decoder.max_length).fill_(self.sos_id).long() + decoder_input_lengths = torch.IntTensor(batch_size).fill_(1) + + outputs = self.forward_step( + decoder_inputs=decoder_inputs[:, :1], + decoder_input_lengths=decoder_input_lengths, + encoder_outputs=encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + positional_encoding_length=1, + ) + step_outputs = self.decoder.fc(outputs).log_softmax(dim=-1) + self.cumulative_ps, self.ongoing_beams = step_outputs.topk(self.beam_size) + + self.ongoing_beams = self.ongoing_beams.view(batch_size * self.beam_size, 1) + self.cumulative_ps = self.cumulative_ps.view(batch_size * self.beam_size, 1) + + decoder_inputs = torch.IntTensor(batch_size * self.beam_size, 1).fill_(self.sos_id) + decoder_inputs = torch.cat((decoder_inputs, self.ongoing_beams), dim=-1) # bsz * beam x 2 + + encoder_dim = encoder_outputs.size(2) + encoder_outputs = self._inflate(encoder_outputs, self.beam_size, dim=0) + encoder_outputs = encoder_outputs.view(self.beam_size, batch_size, -1, encoder_dim) + encoder_outputs = encoder_outputs.transpose(0, 1) + encoder_outputs = encoder_outputs.reshape(batch_size * self.beam_size, -1, encoder_dim) + + encoder_output_lengths = encoder_output_lengths.unsqueeze(1).repeat(1, self.beam_size).view(-1) + + for di in range(2, self.decoder.max_length): + if self._is_all_finished(self.beam_size): + break + + decoder_input_lengths = torch.LongTensor(batch_size * self.beam_size).fill_(di) + + step_outputs = self.forward_step( + decoder_inputs=decoder_inputs[:, :di], + decoder_input_lengths=decoder_input_lengths, + 
encoder_outputs=encoder_outputs, + encoder_output_lengths=encoder_output_lengths, + positional_encoding_length=di, + ) + step_outputs = self.decoder.fc(step_outputs).log_softmax(dim=-1) + + step_outputs = step_outputs.view(batch_size, self.beam_size, -1, 10) + current_ps, current_vs = step_outputs.topk(self.beam_size) + + # TODO: Check transformer's beam search + current_ps = current_ps[:, :, -1, :] + current_vs = current_vs[:, :, -1, :] + + self.cumulative_ps = self.cumulative_ps.view(batch_size, self.beam_size) + self.ongoing_beams = self.ongoing_beams.view(batch_size, self.beam_size, -1) + + current_ps = (current_ps.permute(0, 2, 1) + self.cumulative_ps.unsqueeze(1)).permute(0, 2, 1) + current_ps = current_ps.view(batch_size, self.beam_size ** 2) + current_vs = current_vs.contiguous().view(batch_size, self.beam_size ** 2) + + self.cumulative_ps = self.cumulative_ps.view(batch_size, self.beam_size) + self.ongoing_beams = self.ongoing_beams.view(batch_size, self.beam_size, -1) + + topk_current_ps, topk_status_ids = current_ps.topk(self.beam_size) + prev_status_ids = (topk_status_ids // self.beam_size) + + topk_current_vs = torch.zeros((batch_size, self.beam_size), dtype=torch.long) + prev_status = torch.zeros(self.ongoing_beams.size(), dtype=torch.long) + + for batch_idx, batch in enumerate(topk_status_ids): + for idx, topk_status_idx in enumerate(batch): + topk_current_vs[batch_idx, idx] = current_vs[batch_idx, topk_status_idx] + prev_status[batch_idx, idx] = self.ongoing_beams[batch_idx, prev_status_ids[batch_idx, idx]] + + self.ongoing_beams = torch.cat([prev_status, topk_current_vs.unsqueeze(2)], dim=2) + self.cumulative_ps = topk_current_ps + + if torch.any(topk_current_vs == self.eos_id): + finished_ids = torch.where(topk_current_vs == self.eos_id) + num_successors = [1] * batch_size + + for (batch_idx, idx) in zip(*finished_ids): + self.finished[batch_idx].append(self.ongoing_beams[batch_idx, idx]) + self.finished_ps[batch_idx].append(self.cumulative_ps[batch_idx, idx]) + + if self.beam_size != 1: + eos_count = self._get_successor( + current_ps=current_ps, + current_vs=current_vs, + finished_ids=(batch_idx, idx), + num_successor=num_successors[batch_idx], + eos_count=1, + k=self.beam_size, + ) + num_successors[batch_idx] += eos_count + + ongoing_beams = self.ongoing_beams.clone().view(batch_size * self.beam_size, -1) + decoder_inputs = torch.cat((decoder_inputs, ongoing_beams[:, :-1]), dim=-1) + + return self._get_hypothesis() diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer_transducer.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer_transducer.py new file mode 100644 index 000000000..8dfc01048 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/beam_search_transformer_transducer.py @@ -0,0 +1,156 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
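
Both decoder-side searches replicate the encoder memory once per beam with the same _inflate / view / transpose / reshape sequence; a small shape check with assumed toy sizes (batch 2, beam 3) makes the resulting row layout explicit.

# Shape sketch of the beam replication used above (toy sizes only).
import torch

batch_size, beam_size, seq_len, dim = 2, 3, 5, 4
encoder_outputs = torch.arange(batch_size * seq_len * dim, dtype=torch.float32)
encoder_outputs = encoder_outputs.view(batch_size, seq_len, dim)

# _inflate repeats along the batch dimension -> (beam*batch, seq, dim), ordered
# as full copies of the batch (b0, b1, b0, b1, ...), so regroup it batch-major.
inflated = encoder_outputs.repeat(beam_size, 1, 1)
regrouped = (
    inflated.view(beam_size, batch_size, seq_len, dim)
    .transpose(0, 1)
    .reshape(batch_size * beam_size, seq_len, dim)
)

# Rows 0..2 are now the three beams of batch item 0, rows 3..5 of batch item 1.
print(regrouped.shape)                                 # torch.Size([6, 5, 4])
print(torch.equal(regrouped[0], encoder_outputs[0]))   # True
print(torch.equal(regrouped[3], encoder_outputs[1]))   # True
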
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch + +from openspeech.search.beam_search_base import OpenspeechBeamSearchBase +from openspeech.decoders import TransformerTransducerDecoder + + +class BeamSearchTransformerTransducer(OpenspeechBeamSearchBase): + r""" + Transformer Transducer Beam Search + Reference: RNN-T FOR LATENCY CONTROLLED ASR WITH IMPROVED BEAM SEARCH (https://arxiv.org/pdf/1911.01629.pdf) + + Args: joint, decoder, beam_size, expand_beam, state_beam, blank_id + joint: joint `encoder_outputs` and `decoder_outputs` + decoder (TransformerTransducerDecoder): base decoder of transformer transducer model. + beam_size (int): size of beam. + expand_beam (int): The threshold coefficient to limit the number + of expanded hypotheses that are added in A (process_hyp). + state_beam (int): The threshold coefficient in log space to decide if hyps in A (process_hyps) + is likely to compete with hyps in B (ongoing_beams) + blank_id (int): blank id + + Inputs: encoder_outputs, max_length + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + max_length (int): max decoding time step + + Returns: + * predictions (torch.LongTensor): model predictions. + """ + def __init__( + self, + joint, + decoder: TransformerTransducerDecoder, + beam_size: int = 3, + expand_beam: float = 2.3, + state_beam: float = 4.6, + blank_id: int = 3, + ) -> None: + super(BeamSearchTransformerTransducer, self).__init__(decoder, beam_size) + self.joint = joint + self.forward_step = self.decoder.forward_step + self.expand_beam = expand_beam + self.state_beam = state_beam + self.blank_id = blank_id + + def forward(self, encoder_outputs: torch.Tensor, max_length: int): + r""" + Beam search decoding. + + Inputs: encoder_outputs, max_length + encoder_outputs (torch.FloatTensor): A output sequence of encoders. `FloatTensor` of size + ``(batch, seq_length, dimension)`` + max_length (int): max decoding time step + + Returns: + * predictions (torch.LongTensor): model predictions. 
+ """ + hypothesis = list() + hypothesis_score = list() + + for batch_idx in range(encoder_outputs.size(0)): + blank = ( + torch.ones((1, 1), device=encoder_outputs.device, dtype=torch.long) * self.blank_id + ) + step_input = ( + torch.ones((1, 1), device=encoder_outputs.device, dtype=torch.long) * self.sos_id + ) + hyp = { + "prediction": [self.sos_id], + "logp_score": 0.0, + } + ongoing_beams = [hyp] + + for t_step in range(max_length): + process_hyps = ongoing_beams + ongoing_beams = list() + + while True: + if len(ongoing_beams) >= self.beam_size: + break + + a_best_hyp = max(process_hyps, key=lambda x: x["logp_score"] / len(x["prediction"])) + + if len(ongoing_beams) > 0: + b_best_hyp = max( + ongoing_beams, + key=lambda x: x["logp_score"] / len(x["prediction"]), + ) + + a_best_prob = a_best_hyp["logp_score"] + b_best_prob = b_best_hyp["logp_score"] + + if b_best_prob >= self.state_beam + a_best_prob: + break + + process_hyps.remove(a_best_hyp) + + step_input[0, 0] = a_best_hyp["prediction"][-1] + step_lengths = encoder_outputs.new_tensor([0], dtype=torch.long) + + step_outputs = self.forward_step(step_input, step_lengths).squeeze(0).squeeze(0) + log_probs = self.joint(encoder_outputs[batch_idx, t_step, :], step_outputs) + + topk_targets, topk_idx = log_probs.topk(k=self.beam_size) + + if topk_idx[0] != blank: + best_logp = topk_targets[0] + else: + best_logp = topk_targets[1] + + for j in range(topk_targets.size(0)): + topk_hyp = { + "prediction": a_best_hyp["prediction"][:], + "logp_score": a_best_hyp["logp_score"] + topk_targets[j], + } + + if topk_idx[j] == self.blank_id: + ongoing_beams.append(topk_hyp) + continue + + if topk_targets[j] >= best_logp - self.expand_beam: + topk_hyp["prediction"].append(topk_idx[j].item()) + process_hyps.append(topk_hyp) + + ongoing_beams = sorted( + ongoing_beams, + key=lambda x: x["logp_score"] / len(x["prediction"]), + reverse=True, + )[0] + + hypothesis.append(torch.LongTensor(ongoing_beams["prediction"][1:])) + hypothesis_score.append(ongoing_beams["logp_score"] / len(ongoing_beams["prediction"])) + + return self._fill_sequence(hypothesis) \ No newline at end of file diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/search/ensemble_search.py b/audio/speech_recognition/conformer/pytorch/openspeech/search/ensemble_search.py new file mode 100644 index 000000000..5114c7207 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/search/ensemble_search.py @@ -0,0 +1,96 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import torch.nn as nn +from typing import Union + + +class EnsembleSearch(nn.Module): + """ + Class for ensemble search. + + Args: + models (tuple): list of ensemble model + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be + a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + * predictions (torch.LongTensor): prediction of ensemble models + """ + def __init__(self, models: Union[list, tuple]): + super(EnsembleSearch, self).__init__() + assert len(models) > 1, "Ensemble search should be multiple models." + self.models = models + + def forward(self, inputs: torch.FloatTensor, input_lengths: torch.LongTensor): + logits = list() + + for model in self.models: + output = model(inputs, input_lengths) + logits.append(output["logits"]) + + output = logits[0] + + for logit in logits[1:]: + output += logit + + return output.max(-1)[1] + + +class WeightedEnsembleSearch(nn.Module): + """ + Args: + models (tuple): list of ensemble model + weights (tuple: list of ensemble's weight + + Inputs: + - **inputs** (torch.FloatTensor): A input sequence passed to encoders. Typically for inputs this will be + a padded `FloatTensor` of size ``(batch, seq_length, dimension)``. + - **input_lengths** (torch.LongTensor): The length of input tensor. ``(batch)`` + + Returns: + * predictions (torch.LongTensor): prediction of ensemble models + """ + def __init__(self, models: Union[list, tuple], weights: Union[list, tuple]): + super(WeightedEnsembleSearch, self).__init__() + assert len(models) > 1, "Ensemble search should be multiple models." + assert len(models) == len(weights), "len(models), len(weight) should be same." + self.models = models + self.weights = weights + + def forward(self, inputs: torch.FloatTensor, input_lengths: torch.LongTensor): + logits = list() + + for model in self.models: + output = model(inputs, input_lengths) + logits.append(output["logits"]) + + output = logits[0] * self.weights[0] + + for idx, logit in enumerate(logits[1:]): + output += logit * self.weights[1] + + return output.max(-1)[1] diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/__init__.py new file mode 100644 index 000000000..792899417 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/__init__.py @@ -0,0 +1,75 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
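
The ensemble combination above reduces to summing (optionally weighted) per-model logits before the argmax. The tensors below are random stand-ins for model outputs, and each logit tensor is scaled by its own weight in the weighted variant.

# Minimal sketch of (weighted) logit ensembling with random toy logits.
import torch

torch.manual_seed(0)
batch, seq_len, vocab = 2, 4, 10
logits = [torch.randn(batch, seq_len, vocab) for _ in range(3)]   # one per model
weights = [0.5, 0.3, 0.2]

# Unweighted ensemble: plain sum of logits, then argmax over the vocabulary.
unweighted = torch.stack(logits).sum(dim=0).max(-1)[1]

# Weighted ensemble: scale each model's logits by its own weight before summing.
weighted = sum(w * l for w, l in zip(weights, logits)).max(-1)[1]

print(unweighted.shape, weighted.shape)   # torch.Size([2, 4]) twice
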
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +import importlib + +TOKENIZER_REGISTRY = dict() +TOKENIZER_DATACLASS_REGISTRY = dict() + + +def register_tokenizer(name: str, dataclass=None): + """ + New vocab types can be added to OpenSpeech with the :func:`register_tokenizer` function decorator. + + For example:: + @register_tokenizer('kspon_character') + class KsponSpeechCharacterTokenizer: + (...) + + .. note:: All vocabs must implement the :class:`cls.__name__` interface. + + Args: + name (str): the name of the tokenizer + """ + + def register_tokenizer_cls(cls): + if name in TOKENIZER_REGISTRY: + raise ValueError(f"Cannot register duplicate tokenizer ({name})") + + TOKENIZER_REGISTRY[name] = cls + + cls.__dataclass = dataclass + if dataclass is not None: + if name in TOKENIZER_DATACLASS_REGISTRY: + raise ValueError(f"Cannot register duplicate tokenizer ({name})") + TOKENIZER_DATACLASS_REGISTRY[name] = dataclass + + return cls + + return register_tokenizer_cls + + +tokenizer_dir = os.path.dirname(__file__) +for file in os.listdir(tokenizer_dir): + if os.path.isdir(os.path.join(tokenizer_dir, file)) and file != '__pycache__': + for subfile in os.listdir(os.path.join(tokenizer_dir, file)): + path = os.path.join(tokenizer_dir, file, subfile) + if subfile.endswith(".py"): + tokenizer_name = subfile[: subfile.find(".py")] if subfile.endswith(".py") else subfile + module = importlib.import_module(f"openspeech.tokenizers.{file}.{tokenizer_name}") + continue + + path = os.path.join(tokenizer_dir, file) + if file.endswith(".py"): + vocab_name = file[: file.find(".py")] if file.endswith(".py") else file + module = importlib.import_module(f"openspeech.tokenizers.{vocab_name}") diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/__init__.py new file mode 100644 index 000000000..2a79fa44c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/character.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/character.py new file mode 100644 index 000000000..ebcfb9830 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/aishell/character.py @@ -0,0 +1,138 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import csv +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import TokenizerConfigs +from openspeech.tokenizers import register_tokenizer +from openspeech.tokenizers.tokenizer import Tokenizer + + +@dataclass +class AIShellCharacterTokenizerConfigs(TokenizerConfigs): + unit: str = field( + default="aishell_character", metadata={"help": "Unit of vocabulary."} + ) + vocab_path: str = field( + default="../../../data_aishell/aishell_labels.csv", metadata={"help": "Path of vocabulary file."} + ) + + +@register_tokenizer("aishell_character", dataclass=AIShellCharacterTokenizerConfigs) +class AIShellCharacterTokenizer(Tokenizer): + r""" + Tokenizer class in Character-units for AISHELL. + + Args: + configs (DictConfig): configuration set. + """ + + def __init__(self, configs): + super(AIShellCharacterTokenizer, self).__init__() + self.vocab_dict, self.id_dict = self.load_vocab( + vocab_path=configs.tokenizer.vocab_path, + encoding=configs.tokenizer.encoding, + ) + self.labels = self.vocab_dict.keys() + self.sos_token = configs.tokenizer.sos_token + self.eos_token = configs.tokenizer.eos_token + self.pad_token = configs.tokenizer.pad_token + self.sos_id = int(self.vocab_dict[configs.tokenizer.sos_token]) + self.eos_id = int(self.vocab_dict[configs.tokenizer.eos_token]) + self.pad_id = int(self.vocab_dict[configs.tokenizer.pad_token]) + self.blank_id = int(self.vocab_dict[configs.tokenizer.blank_token]) + self.vocab_path = configs.tokenizer.vocab_path + + def __len__(self): + return len(self.labels) + + def decode(self, labels): + r""" + Converts label to string. 
+ + Args: + labels (numpy.ndarray): number label + + Returns: sentence + - **sentence** (str or list): symbol of labels + """ + if len(labels.shape) == 1: + sentence = str() + for label in labels: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + return sentence + + sentences = list() + for batch in labels: + sentence = str() + for label in batch: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + sentences.append(sentence) + return sentences + + def encode(self, sentence): + label = str() + + for ch in sentence: + try: + label += (str(self.vocab_dict[ch]) + ' ') + except KeyError: + continue + + return label[:-1] + + def load_vocab(self, vocab_path, encoding='utf-8'): + r""" + Provides char2id, id2char + + Args: + vocab_path (str): csv file with character labels + encoding (str): encoding method + + Returns: unit2id, id2unit + - **unit2id** (dict): unit2id[unit] = id + - **id2unit** (dict): id2unit[id] = unit + """ + unit2id = dict() + id2unit = dict() + + try: + with open(vocab_path, 'r', encoding=encoding) as f: + labels = csv.reader(f, delimiter=',') + next(labels) + + for row in labels: + unit2id[row[1]] = row[0] + id2unit[int(row[0])] = row[1] + + return unit2id, id2unit + except IOError: + raise IOError("Character label file (csv format) doesn`t exist : {0}".format(vocab_path)) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/__init__.py new file mode 100644 index 000000000..2a79fa44c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
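
Both character tokenizers in this patch expect the same two-column CSV layout (id, unit, with a header row first). A self-contained sketch with an invented six-entry vocabulary shows the resulting encode/decode round trip; the vocabulary and sentence are made up for illustration.

# Toy illustration of the vocab CSV format and encode/decode behaviour (invented vocab).
import csv, io

csv_text = "id,char\n0,<pad>\n1,<sos>\n2,<eos>\n3,<blank>\n4,你\n5,好\n"
unit2id, id2unit = {}, {}
rows = csv.reader(io.StringIO(csv_text), delimiter=',')
next(rows)                                  # skip the header row, as load_vocab does
for row in rows:
    unit2id[row[1]] = row[0]
    id2unit[int(row[0])] = row[1]

eos_id, blank_id = int(unit2id["<eos>"]), int(unit2id["<blank>"])

# encode: characters -> space-separated id string, unknown characters skipped.
sentence = "你好!"
encoded = " ".join(unit2id[ch] for ch in sentence if ch in unit2id)
print(encoded)                              # "4 5"

# decode: ids -> string, stopping at <eos> and skipping <blank>.
labels = [4, 3, 5, 2, 4]
decoded = ""
for label in labels:
    if label == eos_id:
        break
    if label == blank_id:
        continue
    decoded += id2unit[label]
print(decoded)                              # "你好"
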
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/character.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/character.py new file mode 100644 index 000000000..935ce99cf --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/character.py @@ -0,0 +1,134 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import csv +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import TokenizerConfigs +from openspeech.tokenizers import register_tokenizer +from openspeech.tokenizers.tokenizer import Tokenizer + + +@dataclass +class KsponSpeechCharacterTokenizerConfigs(TokenizerConfigs): + unit: str = field( + default="kspon_character", metadata={"help": "Unit of vocabulary."} + ) + vocab_path: str = field( + default="../../../aihub_labels.csv", metadata={"help": "Path of vocabulary file."} + ) + + +@register_tokenizer("kspon_character", dataclass=KsponSpeechCharacterTokenizerConfigs) +class KsponSpeechCharacterTokenizer(Tokenizer): + r""" + Tokenizer class in Character-units for KsponSpeech. + + Args: + configs (DictConfig): configuration set. 
+ """ + def __init__(self, configs): + super(KsponSpeechCharacterTokenizer, self).__init__() + self.vocab_dict, self.id_dict = self.load_vocab( + vocab_path=configs.tokenizer.vocab_path, + encoding=configs.tokenizer.encoding, + ) + self.labels = self.vocab_dict.keys() + self.sos_id = int(self.vocab_dict[configs.tokenizer.sos_token]) + self.eos_id = int(self.vocab_dict[configs.tokenizer.eos_token]) + self.pad_id = int(self.vocab_dict[configs.tokenizer.pad_token]) + self.blank_id = int(self.vocab_dict[configs.tokenizer.blank_token]) + self.vocab_path = configs.tokenizer.vocab_path + + def __len__(self): + return len(self.labels) + + def decode(self, labels): + r""" + Converts label to string (number => Hangeul) + + Args: + labels (numpy.ndarray): number label + + Returns: sentence + - **sentence** (str or list): symbol of labels + """ + if len(labels.shape) == 1: + sentence = str() + for label in labels: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + return sentence + + sentences = list() + for batch in labels: + sentence = str() + for label in batch: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + sentences.append(sentence) + return sentences + + def encode(self, sentence): + label = str() + + for ch in sentence: + try: + label += (str(self.vocab_dict[ch]) + ' ') + except KeyError: + continue + + return label[:-1] + + def load_vocab(self, vocab_path, encoding='utf-8'): + r""" + Provides char2id, id2char + + Args: + vocab_path (str): csv file with character labels + encoding (str): encoding method + + Returns: unit2id, id2unit + - **unit2id** (dict): unit2id[unit] = id + - **id2unit** (dict): id2unit[id] = unit + """ + unit2id = dict() + id2unit = dict() + + try: + with open(vocab_path, 'r', encoding=encoding) as f: + labels = csv.reader(f, delimiter=',') + next(labels) + + for row in labels: + unit2id[row[1]] = row[0] + id2unit[int(row[0])] = row[1] + + return unit2id, id2unit + except IOError: + raise IOError("Character label file (csv format) doesn`t exist : {0}".format(vocab_path)) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/grapheme.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/grapheme.py new file mode 100644 index 000000000..72245e302 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/grapheme.py @@ -0,0 +1,134 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import csv +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import TokenizerConfigs +from openspeech.tokenizers import register_tokenizer +from openspeech.tokenizers.tokenizer import Tokenizer + + +@dataclass +class KsponSpeechGraphemeTokenizerConfigs(TokenizerConfigs): + unit: str = field( + default="kspon_grapheme", metadata={"help": "Unit of vocabulary."} + ) + vocab_path: str = field( + default="../../../aihub_labels.csv", metadata={"help": "Path of vocabulary file."} + ) + + +@register_tokenizer("kspon_grapheme", dataclass=KsponSpeechGraphemeTokenizerConfigs) +class KsponSpeechGraphemeTokenizer(Tokenizer): + """ + Tokenizer class in Grapheme-units for KsponSpeech. + + Args: + configs (DictConfig): configuration set. + """ + def __init__(self, configs): + super(KsponSpeechGraphemeTokenizer, self).__init__() + self.vocab_dict, self.id_dict = self.load_vocab( + vocab_path=configs.tokenizer.vocab_path, + encoding=configs.tokenizer.encoding, + ) + self.labels = self.vocab_dict.keys() + self.sos_id = int(self.vocab_dict[configs.tokenizer.sos_token]) + self.eos_id = int(self.vocab_dict[configs.tokenizer.eos_token]) + self.pad_id = int(self.vocab_dict[configs.tokenizer.pad_token]) + self.blank_id = int(self.vocab_dict[configs.tokenizer.blank_token]) + self.vocab_path = configs.tokenizer.vocab_path + + def __len__(self): + return len(self.vocab_dict) + + def decode(self, labels): + """ + Converts label to string (number => Hangeul) + + Args: + labels (numpy.ndarray): number label + + Returns: sentence + - **sentence** (str or list): symbol of labels + """ + if len(labels.shape) == 1: + sentence = str() + for label in labels: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + return sentence + + sentences = list() + for batch in labels: + sentence = str() + for label in batch: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + sentences.append(sentence) + return sentences + + def encode(self, sentence): + label = str() + + for ch in sentence: + try: + label += (str(self.vocab_dict[ch]) + ' ') + except KeyError: + continue + + return label[:-1] + + def load_vocab(self, vocab_path, encoding='utf-8'): + """ + Provides char2id, id2char + + Args: + vocab_path (str): csv file with character labels + encoding (str): encoding method + + Returns: unit2id, id2unit + - **unit2id** (dict): unit2id[unit] = id + - **id2unit** (dict): id2unit[id] = unit + """ + unit2id = dict() + id2unit = dict() + + try: + with open(vocab_path, 'r', encoding=encoding) as f: + labels = csv.reader(f, delimiter=',') + next(labels) + + for row in labels: + unit2id[row[1]] = row[0] + id2unit[int(row[0])] = row[1] + + return unit2id, id2unit + except IOError: + raise IOError("Character label file (csv format) doesn`t exist : {0}".format(vocab_path)) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/subword.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/subword.py new file mode 100644 index 000000000..8b56dfe6d --- /dev/null +++ 
b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/ksponspeech/subword.py @@ -0,0 +1,100 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sentencepiece as spm +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import TokenizerConfigs +from openspeech.tokenizers import register_tokenizer +from openspeech.tokenizers.tokenizer import Tokenizer + + +@dataclass +class KsponSpeechSubwordTokenizerConfigs(TokenizerConfigs): + unit: str = field( + default="kspon_subword", metadata={"help": "Unit of vocabulary."} + ) + sp_model_path: str = field( + default="sp.model", metadata={"help": "Path of sentencepiece model."} + ) + sos_token: str = field( + default="", metadata={"help": "Start of sentence token"} + ) + eos_token: str = field( + default="", metadata={"help": "End of sentence token"} + ) + vocab_size: int = field( + default=3200, metadata={"help": "Size of vocabulary."} + ) + + +@register_tokenizer("kspon_subword", dataclass=KsponSpeechSubwordTokenizerConfigs) +class KsponSpeechSubwordTokenizer(Tokenizer): + """ + Tokenizer class in Subword-units for KsponSpeech. + + Args: + configs (DictConfig): configuration set. 
+ """ + def __init__(self, configs): + super(KsponSpeechSubwordTokenizer, self).__init__() + self.sp = spm.SentencePieceProcessor() + self.sp.Load(configs.tokenizer.sp_model_path) + + self.vocab_dict = [[self.sp.id_to_piece(id), id] for id in range(self.sp.get_piece_size())] + self.labels = [item[0] for item in self.vocab_dict] + + self.pad_id = self.sp.PieceToId(configs.tokenizer.pad_token) + self.sos_id = self.sp.PieceToId(configs.tokenizer.sos_token) + self.eos_id = self.sp.PieceToId(configs.tokenizer.eos_token) + self.blank_id = self.sp.PieceToId(configs.tokenizer.blank_token) + self.vocab_size = configs.tokenizer.vocab_size + + def __len__(self): + return self.vocab_size + + def decode(self, labels): + """ + Converts label to string (number => Hangeul) + + Args: + labels (numpy.ndarray): number label + + Returns: sentence + - **sentence** (str or list): symbol of labels + """ + if len(labels.shape) == 1: + return self.sp.DecodeIds([int(l) for l in labels]) + + sentences = list() + for batch in labels: + sentence = str() + for label in batch: + sentence = self.sp.DecodeIds([int(l) for l in label]) + sentences.append(sentence) + return sentences + + def encode(self, sentence): + text = " ".join(self.sp.EncodeAsPieces(sentence)) + label = " ".join([str(self.sp.PieceToId(token)) for token in text]) + return label + diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/__init__.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/__init__.py new file mode 100644 index 000000000..2a79fa44c --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/__init__.py @@ -0,0 +1,21 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/character.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/character.py new file mode 100644 index 000000000..1ec98d7b6 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/character.py @@ -0,0 +1,135 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import torch +import csv +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import TokenizerConfigs +from openspeech.tokenizers import register_tokenizer +from openspeech.tokenizers.tokenizer import Tokenizer + + +@dataclass +class LibriSpeechCharacterTokenizerConfigs(TokenizerConfigs): + unit: str = field( + default="libri_character", metadata={"help": "Unit of vocabulary."} + ) + vocab_path: str = field( + default="../../../LibriSpeech/libri_labels.csv", metadata={"help": "Path of vocabulary file."} + ) + + +@register_tokenizer("libri_character", dataclass=LibriSpeechCharacterTokenizerConfigs) +class LibriSpeechCharacterTokenizer(Tokenizer): + r""" + Tokenizer class in Character-units for LibriSpeech. + + Args: + configs (DictConfig): configuration set. 
+ """ + def __init__(self, configs): + super(LibriSpeechCharacterTokenizer, self).__init__() + self.vocab_dict, self.id_dict = self.load_vocab( + vocab_path=configs.tokenizer.vocab_path, + encoding=configs.tokenizer.encoding, + ) + self.labels = self.vocab_dict.keys() + self.sos_id = int(self.vocab_dict[configs.tokenizer.sos_token]) + self.eos_id = int(self.vocab_dict[configs.tokenizer.eos_token]) + self.pad_id = int(self.vocab_dict[configs.tokenizer.pad_token]) + self.blank_id = int(self.vocab_dict[configs.tokenizer.blank_token]) + self.vocab_path = configs.tokenizer.vocab_path + + def __len__(self): + return len(self.labels) + + def decode(self, labels): + r""" + Converts label to string (number => Hangeul) + + Args: + labels (numpy.ndarray): number label + + Returns: sentence + - **sentence** (str or list): symbol of labels + """ + if len(labels.shape) == 1: + sentence = str() + for label in labels: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + return sentence + + sentences = list() + for batch in labels: + sentence = str() + for label in batch: + if label.item() == self.eos_id: + break + elif label.item() == self.blank_id: + continue + sentence += self.id_dict[label.item()] + sentences.append(sentence) + return sentences + + def encode(self, sentence): + label = str() + + for ch in sentence: + try: + label += (str(self.vocab_dict[ch]) + ' ') + except KeyError: + continue + + return label[:-1] + + def load_vocab(self, vocab_path, encoding='utf-8'): + r""" + Provides char2id, id2char + + Args: + vocab_path (str): csv file with character labels + encoding (str): encoding method + + Returns: unit2id, id2unit + - **unit2id** (dict): unit2id[unit] = id + - **id2unit** (dict): id2unit[id] = unit + """ + unit2id = dict() + id2unit = dict() + + try: + with open(vocab_path, 'r', encoding=encoding) as f: + labels = csv.reader(f, delimiter=',') + next(labels) + + for row in labels: + unit2id[row[1]] = row[0] + id2unit[int(row[0])] = row[1] + + return unit2id, id2unit + except IOError: + raise IOError("Character label file (csv format) doesn`t exist : {0}".format(vocab_path)) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/subword.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/subword.py new file mode 100644 index 000000000..be9280886 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/librispeech/subword.py @@ -0,0 +1,95 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import os +from dataclasses import dataclass, field + +from openspeech.dataclass.configurations import TokenizerConfigs +from openspeech.datasets.librispeech.preprocess.subword import SENTENCEPIECE_MODEL_NAME +from openspeech.utils import SENTENCEPIECE_IMPORT_ERROR +from openspeech.tokenizers import register_tokenizer +from openspeech.tokenizers.tokenizer import Tokenizer + + +@dataclass +class LibriSpeechSubwordTokenizerConfigs(TokenizerConfigs): + unit: str = field( + default="libri_subword", metadata={"help": "Unit of vocabulary."} + ) + sos_token: str = field( + default="", metadata={"help": "Start of sentence token"} + ) + eos_token: str = field( + default="", metadata={"help": "End of sentence token"} + ) + vocab_size: int = field( + default=5000, metadata={"help": "Size of vocabulary."} + ) + vocab_path: str = field( + default="../../../LibriSpeech/", metadata={"help": "Path of vocabulary file."} + ) + + +@register_tokenizer("libri_subword", dataclass=LibriSpeechSubwordTokenizerConfigs) +class LibriSpeechSubwordTokenizer(Tokenizer): + """ + Tokenizer class in Subword-units for LibriSpeech. + + Args: + configs (DictConfig): configuration set. + """ + def __init__(self, configs): + super(LibriSpeechSubwordTokenizer, self).__init__() + try: + import sentencepiece as spm + except ImportError: + raise ImportError(SENTENCEPIECE_IMPORT_ERROR) + + self.sp = spm.SentencePieceProcessor() + self.sp.Load(os.path.join(configs.tokenizer.vocab_path, f"{SENTENCEPIECE_MODEL_NAME}.model")) + self.pad_id = self.sp.PieceToId(configs.tokenizer.pad_token) + self.sos_id = self.sp.PieceToId(configs.tokenizer.sos_token) + self.eos_id = self.sp.PieceToId(configs.tokenizer.eos_token) + self.blank_id = self.sp.PieceToId(configs.tokenizer.blank_token) + self.vocab_size = configs.tokenizer.vocab_size + + def __len__(self): + return self.vocab_size + + def decode(self, labels): + if len(labels.shape) == 1: + return self.sp.DecodeIds([l.item() for l in labels]) + + elif len(labels.shape) == 2: + sentences = list() + + for label in labels: + sentence = self.sp.DecodeIds([l.item() for l in label]) + sentences.append(sentence) + return sentences + else: + raise ValueError("Unsupported label's shape") + + def encode(self, sentence): + text = " ".join(self.sp.EncodeAsPieces(sentence)) + label = " ".join([str(self.sp.PieceToId(token)) for token in text]) + return label diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/tokenizer.py b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/tokenizer.py new file mode 100644 index 000000000..76f5e1307 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/tokenizers/tokenizer.py @@ -0,0 +1,44 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright 
notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +class Tokenizer(object): + r""" + A tokenizer is in charge of preparing the inputs for a model. + + Note: + Do not use this class directly, use one of the sub classes. + """ + def __init__(self, *args, **kwargs): + self.sos_id = None + self.eos_id = None + self.pad_id = None + self.blank_id = None + + def decode(self, labels): + raise NotImplementedError + + def encode(self, labels): + raise NotImplementedError + + def __call__(self, sentence): + return self.encode(sentence) diff --git a/audio/speech_recognition/conformer/pytorch/openspeech/utils.py b/audio/speech_recognition/conformer/pytorch/openspeech/utils.py new file mode 100644 index 000000000..7dd7d713d --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/openspeech/utils.py @@ -0,0 +1,211 @@ +# MIT License +# +# Copyright (c) 2021 Soohwan Kim and Sangchun Ha and Soyoung Cho +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import logging +import torch +import platform +import importlib +from collections import OrderedDict +from typing import Tuple, Union, Iterable + + +PYTORCH_IMPORT_ERROR = """ +Openspeech requires the PyTorch library but it was not found in your environment. Checkout the instructions on the +installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. +""" + +TORCHAUDIO_IMPORT_ERROR = """ +Openspeech requires the torchaudio library but it was not found in your environment. You can install it with pip: +`pip install torchaudio` +""" + +LIBROSA_IMPORT_ERROR = """ +Openspeech requires the librosa library but it was not found in your environment. You can install it with pip: +`pip install librosa` +""" + +SENTENCEPIECE_IMPORT_ERROR = """ +Openspeech requires the SentencePiece library but it was not found in your environment. 
Checkout the instructions on the +installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones +that match your environment. +""" + +WARPRNNT_IMPORT_ERROR = """ +Openspeech requires the warp-rnnt library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/1ytic/warp-rnnt and follow the ones that match your environment. +""" + +CTCDECODE_IMPORT_ERROR = """ +Openspeech requires the ctcdecode library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/parlance/ctcdecode and follow the ones that match your environment. +""" + +try: + import librosa +except ImportError: + raise ValueError(LIBROSA_IMPORT_ERROR) + +DUMMY_SIGNALS, _ = librosa.load(librosa.ex('choice')) +DUMMY_FEATURES = librosa.feature.melspectrogram(y=DUMMY_SIGNALS, n_mels=80) +DUMMY_INPUTS = torch.FloatTensor(DUMMY_FEATURES).transpose(0, 1).unsqueeze(0).expand(3, -1, -1) +DUMMY_INPUT_LENGTHS = torch.IntTensor([1070, 900, 800]) +DUMMY_TARGETS = torch.LongTensor([ + [2, 3, 3, 3, 3, 3, 2, 2, 1, 0], + [2, 3, 3, 3, 3, 3, 2, 1, 2, 0], + [2, 3, 3, 3, 3, 3, 2, 2, 0, 1], +]) +DUMMY_TARGET_LENGTHS = torch.IntTensor([9, 8, 7]) +DUMMY_TRANSCRIPTS = "OPENSPEECH IS AWESOME" + +DUMMY_LM_INPUTS = torch.LongTensor([ + [2, 3, 3, 3, 3, 3, 2, 2, 0], + [2, 3, 3, 3, 3, 3, 2, 3, 2], + [2, 3, 3, 3, 3, 3, 2, 2, 0], +]) +DYMMY_LM_INPUT_LENGTHS = torch.IntTensor([9, 8, 7]) +DUMMY_LM_TARGETS = torch.LongTensor([ + [3, 3, 3, 3, 3, 2, 2, 1, 0], + [3, 3, 3, 3, 3, 2, 1, 2, 0], + [3, 3, 3, 3, 3, 2, 2, 0, 1], +]) + + +def is_pytorch_available(): + return importlib.util.find_spec("torch") is not None + + +def is_librosa_available(): + return importlib.util.find_spec("librosa") is not None + + +def is_apex_available(): + return importlib.util.find_spec("apex") is not None + + +def is_sentencepiece_available(): + return importlib.util.find_spec("sentencepiece") is not None + + +def is_torchaudio_available(): + return importlib.util.find_spec("torchaudio") is not None + + +BACKENDS_MAPPING = OrderedDict( + [ + ("torch", (is_pytorch_available, PYTORCH_IMPORT_ERROR)), + ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), + ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), + ("torchaudio", (is_torchaudio_available, TORCHAUDIO_IMPORT_ERROR)), + ] +) + + +def check_backends(): + backends = BACKENDS_MAPPING.keys() + + if not all(BACKENDS_MAPPING[backend][0]() for backend in backends): + raise ImportError("".join([BACKENDS_MAPPING[backend][1] for backend in backends])) + + +def get_class_name(obj): + return obj.__class__.__name__ + + +def _check_environment(use_cuda: bool, logger) -> int: + r""" + Check execution envirionment. + OS, Processor, CUDA version, Pytorch version, ... etc. 
+ """ + check_backends() + + cuda = use_cuda and torch.cuda.is_available() + device = torch.device('cuda' if cuda else 'cpu') + + logger.info(f"Operating System : {platform.system()} {platform.release()}") + logger.info(f"Processor : {platform.processor()}") + + num_devices = torch.cuda.device_count() + + if str(device) == 'cuda': + for idx in range(torch.cuda.device_count()): + logger.info(f"device : {torch.cuda.get_device_name(idx)}") + logger.info(f"CUDA is available : {torch.cuda.is_available()}") + logger.info(f"CUDA version : {torch.version.cuda}") + logger.info(f"PyTorch version : {torch.__version__}") + + else: + logger.info(f"CUDA is available : {torch.cuda.is_available()}") + logger.info(f"PyTorch version : {torch.__version__}") + + return num_devices + + +class DotDict(dict): + """dot.notation access to dictionary attributes""" + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + +def build_dummy_configs( + model_configs=None, + vocab_configs=None, + criterion_configs=None, + scheduler_configs=None, + trainer_configs=None, + audio_configs=None, +): + from openspeech.models import ConformerConfigs + from openspeech.criterion import CrossEntropyLossConfigs + from openspeech.tokenizers.ksponspeech.character import KsponSpeechCharacterTokenizerConfigs + from openspeech.data.audio.melspectrogram.melspectrogram import MelSpectrogramConfigs + from openspeech.dataclass import GPUTrainerConfigs + from openspeech.optim.scheduler.warmup_reduce_lr_on_plateau_scheduler import WarmupReduceLROnPlateauConfigs + + if model_configs is None: + model_configs = ConformerConfigs() + + if vocab_configs is None: + vocab_configs = KsponSpeechCharacterTokenizerConfigs() + vocab_configs.vocab_path = "labels.csv" + + if criterion_configs is None: + criterion_configs = CrossEntropyLossConfigs + + if trainer_configs is None: + trainer_configs = GPUTrainerConfigs() + + if scheduler_configs is None: + scheduler_configs = WarmupReduceLROnPlateauConfigs() + + if audio_configs is None: + audio_configs = MelSpectrogramConfigs() + + return DotDict({ + 'model': model_configs, + 'vocab': vocab_configs, + 'criterion': criterion_configs, + 'trainer': trainer_configs, + 'audio': audio_configs, + 'lr_scheduler': scheduler_configs, + }) + diff --git a/audio/speech_recognition/conformer/pytorch/requirements.txt b/audio/speech_recognition/conformer/pytorch/requirements.txt new file mode 100644 index 000000000..159e7b7fc --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/requirements.txt @@ -0,0 +1,6 @@ +addict +python-Levenshtein +sentencepiece +librosa +tqdm +#torchaudio==0.8.1 diff --git a/audio/speech_recognition/conformer/pytorch/run_training.sh b/audio/speech_recognition/conformer/pytorch/run_training.sh new file mode 100644 index 000000000..b246a032e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/run_training.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} +lscpu_out=$(lscpu) + +n_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}") +n_cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "Number of CPU sockets on a node: ${n_sockets}" +echo "Number of CPU cores per socket: ${n_cores_per_socket}" + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +python3 -u -m bind_pyt \ + --nsockets_per_node ${n_sockets} \ + --ncores_per_socket ${n_cores_per_socket} \ + 
--no_hyperthreads \ + --no_membind "$@" train.py; check_status + +exit ${EXIT_STATUS} diff --git a/audio/speech_recognition/conformer/pytorch/test/test_build_conformer.py b/audio/speech_recognition/conformer/pytorch/test/test_build_conformer.py new file mode 100644 index 000000000..504b02144 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/test/test_build_conformer.py @@ -0,0 +1,32 @@ +from addict import Dict + +class ConfigDict(Dict): + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + return super(ConfigDict, self).__getattr__(name) + + +from openspeech.tokenizers import TOKENIZER_REGISTRY +from openspeech.models import MODEL_REGISTRY + + +# PYTHONPATH=$PYTHONPATH:./ python test/test_build_conformer.py +if __name__ == '__main__': + config_file = 'configs/conformer_lstm.json' + + import json + + with open(config_file) as f: + configs = json.load(f) + + configs = ConfigDict(configs) + print(configs.model.model_name) + + tokenizer = TOKENIZER_REGISTRY[configs.tokenizer.unit](configs) + + model = MODEL_REGISTRY[configs.model.model_name](configs=configs, tokenizer=tokenizer) + model.build_model() + print("model:", model) diff --git a/audio/speech_recognition/conformer/pytorch/test/test_dataloader.py b/audio/speech_recognition/conformer/pytorch/test/test_dataloader.py new file mode 100644 index 000000000..47dbbddc0 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/test/test_dataloader.py @@ -0,0 +1,43 @@ +from addict import Dict + +class ConfigDict(Dict): + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + return super(ConfigDict, self).__getattr__(name) + + +from openspeech.tokenizers import TOKENIZER_REGISTRY +from openspeech.datasets import DATA_MODULE_REGISTRY + + +# PYTHONPATH=$PYTHONPATH:./ python test/test_dataloader.py +if __name__ == '__main__': + config_file = 'configs/conformer_lstm.json' + + import json + + with open(config_file) as f: + configs = json.load(f) + + configs = ConfigDict(configs) + print(configs.model.model_name) + + data_module = DATA_MODULE_REGISTRY[configs.dataset.dataset](configs) + data_module.prepare_data() + tokenizer = TOKENIZER_REGISTRY[configs.tokenizer.unit](configs) + + data_module.setup(tokenizer=tokenizer) + + train_dataloader = data_module.train_dataloader() + print(f'iters_per_epoch: {len(train_dataloader)}') + steps = 0 + for batch_data in train_dataloader: + inputs, targets, input_lengths, target_lengths = batch_data + print(f'inputs: {inputs.size()} input_lengths: {input_lengths.size()} ' + f'targets: {targets.size()} target_lengths: {target_lengths.size()}') + steps += 1 + if steps > 10: + break diff --git a/audio/speech_recognition/conformer/pytorch/train.py b/audio/speech_recognition/conformer/pytorch/train.py new file mode 100644 index 000000000..e1b0f9fb5 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/train.py @@ -0,0 +1,320 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + +from collections import OrderedDict +import numpy as np +import os +import sys +import time +from tqdm import trange + +import torch +from torch.cuda.amp import GradScaler +from torch.cuda.amp import autocast +from torch.nn.parallel import DistributedDataParallel as DDP + +from openspeech.tokenizers import TOKENIZER_REGISTRY +from openspeech.datasets import DATA_MODULE_REGISTRY +from openspeech.models import MODEL_REGISTRY + +from dataloader import create_dataloader, WorkerInitializer +from utils import ConfigDict, dist, manual_seed, TrainingLogger, TrainingState +import math + +def parse_args(): + import argparse + parser = argparse.ArgumentParser( + description="This is an example for using TrainingLogger.") + parser.add_argument("--config_file", type=str, default="configs/conformer_lstm.json") + parser.add_argument("--data_dir", type=str, default="data") + parser.add_argument("--max_steps", type=int, default=10000, + help="Total number of training steps to perform.") + parser.add_argument("--max_epochs", type=int, default=None, + help="Total number of training epochs to perform.") + parser.add_argument("--batch_size", type=int, default=8, + help="Total batch size for training.") + parser.add_argument("--quality_metric", type=str, default="wer") + parser.add_argument("--quality_target", type=float, default=0.99) + parser.add_argument("--quality_judgement", type=str, default='<=') + parser.add_argument("--num_train_samples", type=int, default=None, + help="Number of train samples to run train on.") + parser.add_argument("--num_eval_samples", type=int, default=None, + help="Number of eval samples to run eval on.") + parser.add_argument("--eval_batch_size", type=int, default=None, + help="Total batch size for evaluation.") + parser.add_argument("--eval_freq", type=int, default=1000, + help="Evaluate every eval_freq steps during training.") + parser.add_argument("--log_freq", type=int, default=10, + help="Frequency of logging training state.") + parser.add_argument('--seed', type=int, default=1234, + help="Random seed for initialization") + parser.add_argument("--amp", action="store_true", + help="Use mixed accuracy training.") + parser.add_argument("--ddp", action="store_true", + help="Use distributed training.") + parser.add_argument("--local_rank", "--local-rank", type=int, default=-1, + help="Local rank for distributed training on gpus.") + parser.add_argument("--ddp_type", default='native', type=str) + parser.add_argument("--dist_backend", type=str, default="nccl", + help="Communication backend for distributed training on gpus.") + + args = parser.parse_args() + return args + + +def h2d_tensors(tensors, device): + return [t.to(device) for t in tensors] + + +class StrFormatter(): + fmt_dict = dict( + stage=' [{0: <12}]', + progress=' [{0: <19}]', + metrics=' [{0: <15}]', + perf=' [{0: <12}]', + default=' {0: <10}' + ) + + def __call__(self, key, str_msg): + if key not in self.fmt_dict: + key = 'default' + return self.fmt_dict[key].format(str_msg) + + +def main(args): + import json + + with open(args.config_file) as f: + configs = json.load(f) + + configs = ConfigDict(configs) + configs.dataset.dataset_path = os.path.join( + args.data_dir, configs.dataset.dataset_path) + configs.dataset.train_manifest_file = os.path.join( + args.data_dir, configs.dataset.train_manifest_file) + 
configs.dataset.eval_manifest_file = os.path.join( + args.data_dir, configs.dataset.eval_manifest_file) + configs.tokenizer.vocab_path = os.path.join( + args.data_dir, configs.tokenizer.vocab_path) + + logger = TrainingLogger( + flush_freq=1, + json_flush_freq=10, + filepath='training_log.json', + str_formatter=StrFormatter() + ) + + args.device, args.num_gpus = dist.init_dist_training_env(args) + + worker_seeds, shuffling_seeds = dist.setup_seeds( + args.seed, 1, args.device) + if args.ddp: + worker_seed = worker_seeds[torch.distributed.get_rank()] + else: + worker_seed = worker_seeds[0] + manual_seed(worker_seed) + worker_init = WorkerInitializer.default(worker_seed) + + tokenizer = TOKENIZER_REGISTRY[configs.tokenizer.unit](configs) + + model = MODEL_REGISTRY[configs.model.model_name](configs=configs, tokenizer=tokenizer) + model.build_model() + optimizer, lr_scheduler = model.configure_optimizers() + if args.amp: + scaler = GradScaler() + + data_module = DATA_MODULE_REGISTRY[configs.dataset.dataset](configs) + if dist.is_main_process(): + data_module.prepare_data() + dist.barrier() + data_module_kwargs = dict(tokenizer=tokenizer) + if args.num_train_samples is not None: + data_module_kwargs['num_train_samples'] = args.num_train_samples + if args.num_eval_samples is not None: + data_module_kwargs['num_eval_samples'] = args.num_eval_samples + data_module.setup(**data_module_kwargs) + + dist.barrier() + + if args.num_train_samples is None: + args.num_train_samples = len(data_module.dataset['train']) + if args.num_eval_samples is None: + args.num_eval_samples = len(data_module.dataset['val']) + if args.eval_batch_size is None: + args.eval_batch_size = args.batch_size + + train_dataloader = create_dataloader( + data_module.dataset['train'], + batch_size=args.batch_size, + sampler_type='Distributed' if args.ddp else 'Random', + worker_init_fn=worker_init) + + val_dataloader = create_dataloader( + data_module.dataset['val'], + batch_size=args.eval_batch_size, + sampler_type='Distributed' if args.ddp else 'Sequential', + worker_init_fn=worker_init) + + if args.max_steps is not None and args.max_steps > 0: + max_epochs = args.max_steps // len(train_dataloader) + args.max_epochs = max_epochs + 1 if (args.max_steps % len(train_dataloader)) \ + else max_epochs + else: + assert(args.max_epochs > 0) + args.max_steps = args.max_epochs * len(train_dataloader) + + args.num_eval_steps = len(val_dataloader) + + training_state = TrainingState( + max_steps=args.max_steps, + quality_target=args.quality_target, + quality_judgement=args.quality_judgement) + + model.to(args.device) + + if args.ddp: + model = DDP(model, device_ids=[args.local_rank]) + training_step = model.module.training_step + validation_step = model.module.validation_step + else: + training_step = model.training_step + validation_step = model.validation_step + + model.train() + + dist.barrier() + train_start_time = time.perf_counter() + train_time = 0 + start_time = train_start_time + for epoch in range(1, args.max_epochs+1): + + if args.ddp: + train_dataloader.sampler.set_epoch(epoch) + + train_data_iterator = iter(train_dataloader) + for step in range(1, len(train_dataloader)+1): + if training_state.end_training(): + break + + batch_data = next(train_data_iterator) + training_state.global_step += 1 + + if args.amp: + with autocast(): + batch_outputs = training_step( + batch=h2d_tensors(batch_data, args.device), batch_idx=step) + else: + batch_outputs = training_step( + batch=h2d_tensors(batch_data, args.device), batch_idx=step) + + 
optimizer.zero_grad() + if args.amp: + scaler.scale(batch_outputs['loss']).backward() + scaler.step(optimizer) + scaler.update() + else: + batch_outputs['loss'].backward() + optimizer.step() + loss_value = batch_outputs['loss'] + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + lr_scheduler.step() + + training_state.num_trained_samples += args.batch_size * args.num_gpus + + if training_state.global_step % args.log_freq == 0: + dist.barrier() + elapse = time.perf_counter() - start_time + train_time += elapse + if dist.is_main_process(): + logger.log( + OrderedDict( + stage='train', + progress=OrderedDict( + epoch=epoch, + step=step), + metrics=OrderedDict( + loss=float(batch_outputs['loss'].detach().cpu()), + wer=batch_outputs['wer'], + cer=batch_outputs['cer']), + perf=OrderedDict( + tps=training_state.num_trained_samples / train_time) + ) + ) + start_time = time.perf_counter() + + + if training_state.global_step % args.eval_freq == 0: + with torch.no_grad(): + model.eval() + val_data_iterator = iter(val_dataloader) + metric_list = [] + dist.barrier() + eval_start_time = time.perf_counter() + for eval_step in trange(1, args.num_eval_steps+1): + batch_data = next(val_data_iterator) + batch_outputs = validation_step( + batch=h2d_tensors(batch_data, args.device), batch_idx=eval_step) + metric_list.append(batch_outputs[args.quality_metric]) + + dist.barrier() + eval_duration = time.perf_counter() - eval_start_time + metric_value = np.mean(metric_list) + if args.ddp: + metric_value = np.mean(dist.all_gather(metric_value)) + dist.barrier() + if dist.is_main_process(): + logger.log( + OrderedDict( + stage='val', + progress=OrderedDict( + epoch=epoch, + step=args.num_eval_steps), + metrics={ + args.quality_metric: metric_value}, + perf=OrderedDict( + tps=args.num_eval_samples / eval_duration) + ) + ) + if training_state.meet_quality_target(metric_value): + training_state.status = training_state.Status.success + break + + model.train() + start_time = time.perf_counter() + + if training_state.end_training(): + break + + dist.barrier() + raw_train_time = time.perf_counter() - train_start_time + final_state = OrderedDict( + global_step=training_state.global_step, + raw_train_time=raw_train_time, + tps=training_state.num_trained_samples / raw_train_time, + status={ + training_state.Status.success: 'success', + training_state.Status.aborted: 'aborted' + }[training_state.status]) + final_state[args.quality_metric] = metric_value + + if dist.is_main_process(): + logger.log(final_state) + sys.exit(training_state.status) + + +if __name__ == '__main__': + args = parse_args() + try: + from dltest import show_training_arguments + show_training_arguments(args) + except: + pass + main(args) diff --git a/audio/speech_recognition/conformer/pytorch/utils/__init__.py b/audio/speech_recognition/conformer/pytorch/utils/__init__.py new file mode 100644 index 000000000..b198bd010 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/utils/__init__.py @@ -0,0 +1,24 @@ +import random + +import numpy as np + +from .config import * +from .dist import * +from .logger import * +from .misc import * + + +def manual_seed(seed, deterministic=False): + random.seed(seed) + np.random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + 
torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = True diff --git a/audio/speech_recognition/conformer/pytorch/utils/config.py b/audio/speech_recognition/conformer/pytorch/utils/config.py new file mode 100644 index 000000000..94ecabc74 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/utils/config.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + +from addict import Dict + + +class ConfigDict(Dict): + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + return super(ConfigDict, self).__getattr__(name) diff --git a/audio/speech_recognition/conformer/pytorch/utils/dist.py b/audio/speech_recognition/conformer/pytorch/utils/dist.py new file mode 100644 index 000000000..b7ae91e11 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/utils/dist.py @@ -0,0 +1,203 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from contextlib import contextmanager +import logging.config +import os +import random + +import torch + + +def generate_seeds(rng, size): + """ + Generate list of random seeds + + :param rng: random number generator + :param size: length of the returned list + """ + seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)] + return seeds + + +def broadcast_seeds(seeds, device): + """ + Broadcasts random seeds to all distributed workers. + Returns list of random seeds (broadcasted from workers with rank 0). + + :param seeds: list of seeds (integers) + :param device: torch.device + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + seeds_tensor = torch.LongTensor(seeds).to(device) + torch.distributed.broadcast(seeds_tensor, 0) + seeds = seeds_tensor.tolist() + return seeds + + +def setup_seeds(master_seed, epochs, device): + """ + Generates seeds from one master_seed. + Function returns (worker_seeds, shuffling_seeds), worker_seeds are later + used to initialize per-worker random number generators (mostly for + dropouts), shuffling_seeds are for RNGs resposible for reshuffling the + dataset before each epoch. + Seeds are generated on worker with rank 0 and broadcasted to all other + workers. 
+ + :param master_seed: master RNG seed used to initialize other generators + :param epochs: number of epochs + :param device: torch.device (used for distributed.broadcast) + """ + if master_seed is None: + # random master seed, random.SystemRandom() uses /dev/urandom on Unix + master_seed = random.SystemRandom().randint(0, 2**32 - 1) + if get_rank() == 0: + # master seed is reported only from rank=0 worker, it's to avoid + # confusion, seeds from rank=0 are later broadcasted to other + # workers + logging.info(f'Using random master seed: {master_seed}') + else: + # master seed was specified from command line + logging.info(f'Using master seed from command line: {master_seed}') + + # initialize seeding RNG + seeding_rng = random.Random(master_seed) + + # generate worker seeds, one seed for every distributed worker + worker_seeds = generate_seeds(seeding_rng, get_world_size()) + + # generate seeds for data shuffling, one seed for every epoch + shuffling_seeds = generate_seeds(seeding_rng, epochs) + + # broadcast seeds from rank=0 to other workers + worker_seeds = broadcast_seeds(worker_seeds, device) + shuffling_seeds = broadcast_seeds(shuffling_seeds, device) + return worker_seeds, shuffling_seeds + + +def barrier(): + """ + Works as a temporary distributed barrier, currently pytorch + doesn't implement barrier for NCCL backend. + Calls all_reduce on dummy tensor and synchronizes with GPU. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + + +def get_rank(default=0): + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = default + return rank + + +def get_world_size(): + """ + Gets total number of distributed workers or returns one if distributed is + not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + return world_size + + +def main_proc_print(*args, **kwargs): + if is_main_process(): + print(*args, **kwargs) + + +def set_device(cuda, local_rank): + """ + Sets device based on local_rank and returns instance of torch.device. 
+ + :param cuda: if True: use cuda + :param local_rank: local rank of the worker + """ + if cuda: + torch.cuda.set_device(local_rank) + device = torch.device('cuda') + else: + device = torch.device('cpu') + return device + + +def init_dist_training_env(config): + if config.local_rank == -1: + device = torch.device("cuda") + num_gpus = torch.cuda.device_count() + else: + torch.cuda.set_device(config.local_rank) + device = torch.device("cuda", config.local_rank) + host_addr_full = 'tcp://' + os.environ["MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"] + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + dist_backend = config.dist_backend + DIST_BACKEND_ENV = "PT_DIST_BACKEND" + if DIST_BACKEND_ENV in os.environ: + print("WARN: Use the distributed backend of the environment.") + dist_backend = os.environ[DIST_BACKEND_ENV] + + torch.distributed.init_process_group(backend=dist_backend, init_method=host_addr_full, rank=rank, world_size=world_size) + num_gpus = torch.distributed.get_world_size() + + return device, num_gpus + + +def global_batch_size(config): + return config.train_batch_size * config.n_gpu + + +@contextmanager +def sync_workers(): + """ + Yields distributed rank and synchronizes all workers on exit. + """ + rank = get_rank() + yield rank + barrier() + + +def is_main_process(): + if torch.distributed.is_initialized(): + if "LOCAL_RANK" in os.environ: + return int(os.environ["LOCAL_RANK"]) == 0 + else: + return get_rank() == 0 + + return True + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + data_list = [None] * world_size + torch.distributed.all_gather_object(data_list, data) + return data_list diff --git a/audio/speech_recognition/conformer/pytorch/utils/logger.py b/audio/speech_recognition/conformer/pytorch/utils/logger.py new file mode 100644 index 000000000..70b07aae5 --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/utils/logger.py @@ -0,0 +1 @@ +from ixpylogger import TrainingLogger, TrainingState diff --git a/audio/speech_recognition/conformer/pytorch/utils/misc.py b/audio/speech_recognition/conformer/pytorch/utils/misc.py new file mode 100644 index 000000000..dcfa4894a --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/utils/misc.py @@ -0,0 +1,20 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+ +import os +import sys +import errno + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise diff --git a/tests/executables/conformer/init_torch.sh b/tests/executables/conformer/init_torch.sh new file mode 100644 index 000000000..627d953a3 --- /dev/null +++ b/tests/executables/conformer/init_torch.sh @@ -0,0 +1,50 @@ +#!/bin/bash +ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" +SRC_DIR=$ROOT_DIR/audio/speech_recognition/conformer/pytorch +DATA_DIR=$ROOT_DIR/data + +# determine whether the user is root mode to execute this script +prefix_sudo="" +current_user=$(whoami) +if [ "$current_user" != "root" ]; then + echo "User $current_user need to add sudo permission keywords" + prefix_sudo="sudo" +fi + +echo "prefix_sudo= $prefix_sudo" + +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + $prefix_sudo apt install -y numactl + $prefix_sudo apt install -y libsndfile1 +elif [[ ${ID} == "centos" ]]; then + $prefix_sudo yum install -y numactl + $prefix_sudo yum install -y libsndfile-devel +else + echo "Unable to determine OS, assumed to be similar to CentOS" + $prefix_sudo yum install -y numactl + $prefix_sudo yum install -y libsndfile-devel +fi + +pip3 list | grep "torchaudio" || \ + pip3 install torchaudio==0.8.1 + +pip3 install --no-index --find-links=$DATA_DIR/packages IXPyLogger==1.0.0 +pip3 install -r $SRC_DIR/requirements.txt --cache-dir=$DATA_DIR/packages +pip3 install numpy==1.26.4 +if [ ! -f "${HOME}/.cache/librosa/admiralbob77_-_Choice_-_Drum-bass.ogg" ]; then + wget https://librosa.org/data/audio/admiralbob77_-_Choice_-_Drum-bass.ogg + mkdir -p ~/.cache/librosa/ + mv admiralbob77_-_Choice_-_Drum-bass.ogg ~/.cache/librosa/ +fi + +DATASET_DIR=$DATA_DIR/datasets/LibriSpeech + +cd $DATASET_DIR +GZS=`find . 
+for path in $GZS; do
+    cd ${path%/*}
+    tar zxf ${path##*/}
+done
+
+mv $DATASET_DIR/LibriSpeech/* $DATASET_DIR/
diff --git a/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh b/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh
new file mode 100644
index 000000000..d9500486f
--- /dev/null
+++ b/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh
@@ -0,0 +1,31 @@
+source ../_utils/global_environment_variables.sh
+
+: ${BATCH_SIZE:=8}
+
+ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)"
+SRC_DIR=$ROOT_DIR/audio/speech_recognition/conformer/pytorch
+DATA_DIR=$ROOT_DIR/data
+export DRT_MEMCPYUSEKERNEL=20000000000
+
+EXIT_STATUS=0
+check_status()
+{
+    if ((${PIPESTATUS[0]} != 0)); then
+        EXIT_STATUS=1
+    fi
+}
+
+source ../_utils/fix_import_sklearn_error_libgomp_d22c30c5.sh
+
+cd $SRC_DIR
+bash run_training.sh --data_dir=$DATA_DIR \
+    --max_steps=800 \
+    --quality_target=1.6 \
+    --batch_size=${BATCH_SIZE} \
+    --eval_freq=400 \
+    --ddp; check_status
+
+exit ${EXIT_STATUS}
\ No newline at end of file
diff --git a/tests/executables/ssd/init_torch.sh b/tests/executables/ssd/init_torch.sh
new file mode 100644
index 000000000..5065ae4b7
--- /dev/null
+++ b/tests/executables/ssd/init_torch.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+EXIT_STATUS=0
+check_status()
+{
+    if ((${PIPESTATUS[0]} != 0)); then
+        EXIT_STATUS=1
+    fi
+}
+
+
+: ${CXX:="g++"}
+export CXX
+
+source $(cd `dirname $0`; pwd)/../_utils/which_install_tool.sh
+
+# determine whether this script is being run as root
+prefix_sudo=""
+current_user=$(whoami)
+if [ "$current_user" != "root" ]; then
+    echo "User $current_user is not root, prefixing privileged commands with sudo"
+    prefix_sudo="sudo"
+fi
+
+echo "prefix_sudo= $prefix_sudo"
+
+if command_exists apt; then
+    $prefix_sudo apt install -y git numactl
+elif command_exists dnf; then
+    $prefix_sudo dnf install -y git numactl
+else
+    $prefix_sudo yum install -y git numactl
+fi
+sys_name_str=`uname -a`
+if [[ "${sys_name_str}" =~ "aarch64" ]]; then
+    pip3 install "git+https://github.com/mlperf/logging.git@1.0-branch" pybind11 ujson
+else
+    pip3 install "git+https://github.com/mlperf/logging.git@1.0-branch" pybind11==2.9.2 ujson==1.35
+fi
+
+pip3 list | grep -w "wheel" || pip3 install wheel
+# pip3 list | grep -w "numpy" | grep -w "1.23.5" || pip3 install numpy==1.23.5
+pip3 install "numpy>=1.26.4"
+pip3 install cython
+# pip3 install "git+https://github.com/NVIDIA/cocoapi.git@v0.6.0#subdirectory=PythonAPI"
+pip3 install pycocotools==2.0.8
+
+CUR_PATH=$(cd `dirname $0`; pwd)
+DATA_PATH=$CUR_PATH/../../data/datasets/coco2017/
+
+if [[ "$(uname -m)" == "aarch64" ]]; then
+    source /opt/rh/gcc-toolset-11/enable
+fi
+
+cd ../../research/cv/detection/ssd && bash ./clean_ssd.sh && bash ./build_ssd.sh && bash ./install_ssd.sh "$@"; check_status
+DATA_PATH_BBOX=../../../..
+
+python3 prepare-json.py --keep-keys ${DATA_PATH}/annotations/instances_val2017.json ${DATA_PATH_BBOX}/bbox_only_instances_val2017.json "$@"; check_status
+python3 prepare-json.py ${DATA_PATH}/annotations/instances_train2017.json ${DATA_PATH_BBOX}/bbox_only_instances_train2017.json "$@"; check_status
+
+
+cd -
+#echo "init finished!"
+exit ${EXIT_STATUS}
diff --git a/tests/executables/ssd/train_ssd_amp_torch.sh b/tests/executables/ssd/train_ssd_amp_torch.sh
new file mode 100644
index 000000000..6d348a988
--- /dev/null
+++ b/tests/executables/ssd/train_ssd_amp_torch.sh
@@ -0,0 +1,25 @@
+
+COCO_PATH="`pwd`/../../data/datasets/coco2017"
+
+: ${BATCH_SIZE:=160}
+
+EXIT_STATUS=0
+check_status()
+{
+    if ((${PIPESTATUS[0]} != 0)); then
+        EXIT_STATUS=1
+    fi
+}
+
+cd ../../research/cv/detection/ssd
+
+echo "python3 train.py --dali --dali-cache 0 --data=${COCO_PATH} \
+--batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \
+--wd=1.6e-4 --use-fp16 --jit --nhwc --pad-input --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163"
+
+python3 train.py --dali --dali-cache 0 --data=${COCO_PATH} \
+--batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \
+--wd=1.6e-4 --use-fp16 --jit --nhwc --pad-input --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163 "$@"; check_status
+
+cd -
+exit ${EXIT_STATUS}
-- Gitee
From 13249707f0bd25b6ed4ffd210a4ac9282029ee6d Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Fri, 26 Sep 2025 14:01:31 +0800
Subject: [PATCH 11/20] sync resnet dali

---
 tests/executables/dali/init_torch.sh          |  1 +
 .../dali/train_resnet50_dali_torch.sh         | 20 +++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 tests/executables/dali/init_torch.sh
 create mode 100644 tests/executables/dali/train_resnet50_dali_torch.sh

diff --git a/tests/executables/dali/init_torch.sh b/tests/executables/dali/init_torch.sh
new file mode 100644
index 000000000..01a0f2d5e
--- /dev/null
+++ b/tests/executables/dali/init_torch.sh
@@ -0,0 +1 @@
+bash ../_utils/init_classification_torch.sh ../_utils
\ No newline at end of file
diff --git a/tests/executables/dali/train_resnet50_dali_torch.sh b/tests/executables/dali/train_resnet50_dali_torch.sh
new file mode 100644
index 000000000..6c814901c
--- /dev/null
+++ b/tests/executables/dali/train_resnet50_dali_torch.sh
@@ -0,0 +1,20 @@
+source ../_utils/global_environment_variables.sh
+source ../_utils/get_num_devices.sh
+
+: ${BATCH_SIZE:=256}
+
+OUTPUT_DIR=${PROJECT_DIR}/output/resnet/$0
+if [[ ! -d ${OUTPUT_DIR} ]]; then
+    mkdir -p ${OUTPUT_DIR}
+fi
+
+
+ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \
+python3 ${PROJECT_DIR}/cv/classification/resnet50/pytorch/train.py \
+--data-path ${PROJECT_DIR}/data/datasets/imagenette \
+--batch-size ${BATCH_SIZE} \
+--output-dir ${OUTPUT_DIR} \
+"$@" ;check_status
+
+rm -fr ${OUTPUT_DIR}
+exit ${EXIT_STATUS}
\ No newline at end of file
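The DALI ResNet-50 test above is driven by environment variables with defaults, and it resolves ../_utils relative to the current directory, so it has to be run from its own folder. A minimal invocation sketch, illustrative only; it assumes the dltest package providing ixdltest-check and the imagenette dataset under data/ are already in place:

cd tests/executables/dali
bash init_torch.sh
# BATCH_SIZE defaults to 256; NONSTRICT_EPOCH is forwarded to ixdltest-check
# through --nonstrict_mode_args.
BATCH_SIZE=128 bash train_resnet50_dali_torch.sh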
-- Gitee
From d59d84fa89f7b26583e43793561f60dc956c314e Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Fri, 26 Sep 2025 14:11:03 +0800
Subject: [PATCH 12/20] sync bert tf

---
 tests/executables/bert/init_tf.sh             | 29 +++++++++++
 .../train_bert_pretraining_amp_dist_1x8_tf.sh | 48 +++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tests/executables/bert/init_tf.sh
 create mode 100644 tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh

diff --git a/tests/executables/bert/init_tf.sh b/tests/executables/bert/init_tf.sh
new file mode 100644
index 000000000..14d072411
--- /dev/null
+++ b/tests/executables/bert/init_tf.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+source $(cd `dirname $0`; pwd)/../_utils/which_install_tool.sh
+
+# determine whether this script is being run as root
+prefix_sudo=""
+current_user=$(whoami)
+if [ "$current_user" != "root" ]; then
+    echo "User $current_user is not root, prefixing privileged commands with sudo"
+    prefix_sudo="sudo"
+fi
+
+echo "prefix_sudo= $prefix_sudo"
+
+if command_exists apt; then
+    $prefix_sudo apt install -y git numactl
+elif command_exists dnf; then
+    $prefix_sudo dnf install -y git numactl
+else
+    $prefix_sudo yum install -y git numactl
+fi
+if [ "$(ulimit -n)" -lt "1048576" ]; then
+    ulimit -n 1048576
+fi
+pip3 uninstall -y protobuf
+pip3 install "protobuf<4.0.0"
+pip3 install git+https://github.com/mlperf/logging.git
+pip3 install git+https://github.com/NVIDIA/dllogger.git
+pip3 install pandas==1.3.5
+pip3 install numpy==1.26.4
diff --git a/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh
new file mode 100644
index 000000000..f909c31d7
--- /dev/null
+++ b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh
@@ -0,0 +1,48 @@
+source ../_utils/global_environment_variables.sh
+
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+source ../_utils/get_num_devices.sh
+
+# export LD_PRELOAD=/usr/local/lib/libmpi.so:${LD_PRELOAD}
+
+: ${BATCH_SIZE:=6}
+
+sys_name_str=`uname -a`
+if [[ "${sys_name_str}" =~ "aarch64" ]]; then
+    export TF_FORCE_SINGLE_THREAD=1
+fi
+
+# if [ "${CI}" == "true" ]; then
+#     if [ ! -d "/usr/local/lib/openmpi" ]; then
+#         echo "Not found /usr/local/lib/openmpi, Installing mpi ......"
+#         install-mpi
+#     fi
+#     export HOROVOD_RUN_ARGS=" "
+#     export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib/openmpi:${LD_LIBRARY_PATH}
+# fi
+
+set -euox pipefail
+
+current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)
+ROOT_DIR=${current_path}"/../../"
+SRC_DIR=${ROOT_DIR}nlp/language_model/bert/tensorflow/base
+DATA_DIR=${ROOT_DIR}data/datasets
+MODEL_DIR=${ROOT_DIR}data/model_zoo
+
+nonstrict_mode_args=""
+if [ "${RUN_MODE}" != "strict" ]; then
+    nonstrict_mode_args="--stop_threshold 0.6"
+fi
+
+cd $SRC_DIR
+bash init.sh
+bash run_multi_card_FPS.sh \
+    --input_files_dir=${DATA_DIR}/bert_pretrain_tf_records/train_data \
+    --init_checkpoint=${MODEL_DIR}/bert_pretrain_tf_ckpt/model.ckpt-28252 \
+    --eval_files_dir=${DATA_DIR}/bert_pretrain_tf_records/eval_data \
+    --train_batch_size=${BATCH_SIZE} \
+    --bert_config_file=${MODEL_DIR}/bert_pretrain_tf_ckpt/bert_config.json \
+    --display_loss_steps=10 ${nonstrict_mode_args} \
+    "$@"
+
+exit $?
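The BERT pretraining test reads its knobs from the environment: BATCH_SIZE (per-device train batch size, default 6) and RUN_MODE, where any value other than strict appends --stop_threshold 0.6 to the run. A minimal invocation sketch, illustrative only; it assumes the bert_pretrain_tf_records and bert_pretrain_tf_ckpt layout under data/ referenced above is already in place:

cd tests/executables/bert
bash init_tf.sh
# Relax the convergence target and shrink the batch for a quick functional run.
RUN_MODE=nonstrict BATCH_SIZE=4 bash train_bert_pretraining_amp_dist_1x8_tf.sh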
-- Gitee From 1bc0de80681d7ce1a6d502f0d2e02f44a1062d02 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 14:41:21 +0800 Subject: [PATCH 13/20] sync stable-diffusion all --- .../pytorch/.gitignore | 178 + .../pytorch/LICENSE | 201 + .../pytorch/MANIFEST.in | 3 + .../pytorch/README.md | 45 + .../pytorch/build_diffusers.sh | 22 + .../pytorch/clean_diffusers.sh | 12 + .../pytorch/examples/text_to_image/README.md | 326 ++ .../examples/text_to_image/README_sdxl.md | 286 ++ .../text_to_image/default_config.yaml | 20 + .../examples/text_to_image/requirements.txt | 6 + .../text_to_image/requirements_flax.txt | 9 + .../text_to_image/requirements_sdxl.txt | 8 + .../examples/text_to_image/single_config.yaml | 20 + .../text_to_image/test_text_to_image.py | 365 ++ .../text_to_image/test_text_to_image_lora.py | 300 ++ .../text_to_image/train_text_to_image.py | 1137 +++++ .../text_to_image/train_text_to_image_flax.py | 620 +++ .../text_to_image/train_text_to_image_lora.py | 976 +++++ .../train_text_to_image_lora_sdxl.py | 1317 ++++++ .../text_to_image/train_text_to_image_sdxl.py | 1374 ++++++ .../examples/text_to_image/zero2_config.yaml | 23 + .../pytorch/install_diffusers.sh | 33 + .../pytorch/run_sd_1.5.sh | 32 + .../pytorch/run_sd_1.5_single.sh | 32 + .../pytorch/run_sd_2.1.sh | 32 + .../pytorch/run_sd_2.1_single.sh | 32 + .../pytorch/run_sd_xl.sh | 37 + .../pytorch/setup.py | 304 ++ .../pytorch/src/diffusers/__init__.py | 787 ++++ .../src/diffusers/commands/__init__.py | 27 + .../src/diffusers/commands/diffusers_cli.py | 43 + .../pytorch/src/diffusers/commands/env.py | 84 + .../diffusers/commands/fp16_safetensors.py | 132 + .../src/diffusers/configuration_utils.py | 703 ++++ .../diffusers/dependency_versions_check.py | 34 + .../diffusers/dependency_versions_table.py | 45 + .../src/diffusers/experimental/README.md | 5 + .../src/diffusers/experimental/__init__.py | 1 + .../src/diffusers/experimental/rl/__init__.py | 1 + .../experimental/rl/value_guided_sampling.py | 153 + .../pytorch/src/diffusers/image_processor.py | 990 +++++ .../pytorch/src/diffusers/loaders/__init__.py | 88 + .../src/diffusers/loaders/autoencoder.py | 146 + .../src/diffusers/loaders/controlnet.py | 136 + .../src/diffusers/loaders/ip_adapter.py | 281 ++ .../pytorch/src/diffusers/loaders/lora.py | 1349 ++++++ .../loaders/lora_conversion_utils.py | 284 ++ .../pytorch/src/diffusers/loaders/peft.py | 186 + .../src/diffusers/loaders/single_file.py | 318 ++ .../diffusers/loaders/single_file_utils.py | 1617 +++++++ .../diffusers/loaders/textual_inversion.py | 562 +++ .../pytorch/src/diffusers/loaders/unet.py | 1003 +++++ .../pytorch/src/diffusers/loaders/utils.py | 59 + .../pytorch/src/diffusers/models/README.md | 3 + .../pytorch/src/diffusers/models/__init__.py | 103 + .../src/diffusers/models/activations.py | 145 + .../pytorch/src/diffusers/models/adapter.py | 584 +++ .../pytorch/src/diffusers/models/attention.py | 681 +++ .../src/diffusers/models/attention_flax.py | 494 +++ .../diffusers/models/attention_processor.py | 2507 +++++++++++ .../diffusers/models/autoencoders/__init__.py | 5 + .../autoencoders/autoencoder_asym_kl.py | 186 + .../models/autoencoders/autoencoder_kl.py | 489 +++ .../autoencoder_kl_temporal_decoder.py | 399 ++ .../models/autoencoders/autoencoder_tiny.py | 347 ++ .../autoencoders/consistency_decoder_vae.py | 435 ++ .../src/diffusers/models/autoencoders/vae.py | 992 +++++ .../src/diffusers/models/controlnet.py | 868 ++++ .../src/diffusers/models/controlnet_flax.py | 395 ++ 
.../src/diffusers/models/downsampling.py | 334 ++ .../diffusers/models/dual_transformer_2d.py | 20 + .../src/diffusers/models/embeddings.py | 914 ++++ .../src/diffusers/models/embeddings_flax.py | 97 + .../pytorch/src/diffusers/models/lora.py | 457 ++ .../models/modeling_flax_pytorch_utils.py | 134 + .../diffusers/models/modeling_flax_utils.py | 566 +++ .../src/diffusers/models/modeling_outputs.py | 17 + .../models/modeling_pytorch_flax_utils.py | 161 + .../src/diffusers/models/modeling_utils.py | 1021 +++++ .../diffusers/models/nhwc_groupnorm/Welford.h | 94 + .../models/nhwc_groupnorm/__init__.py | 0 .../models/nhwc_groupnorm/custom_gn.cpp | 90 + .../models/nhwc_groupnorm/custom_gn.py | 367 ++ .../models/nhwc_groupnorm/gn_kernel.cu | 1051 +++++ .../models/nhwc_groupnorm/gn_kernel.h | 39 + .../models/nhwc_groupnorm/nchw_kernel.cu | 994 +++++ .../diffusers/models/nhwc_groupnorm/vecs.h | 35 + .../src/diffusers/models/normalization.py | 254 ++ .../src/diffusers/models/prior_transformer.py | 12 + .../pytorch/src/diffusers/models/resnet.py | 814 ++++ .../src/diffusers/models/resnet_flax.py | 124 + .../diffusers/models/t5_film_transformer.py | 70 + .../src/diffusers/models/transformer_2d.py | 25 + .../diffusers/models/transformer_temporal.py | 34 + .../diffusers/models/transformers/__init__.py | 9 + .../transformers/dual_transformer_2d.py | 155 + .../models/transformers/prior_transformer.py | 380 ++ .../transformers/t5_film_transformer.py | 438 ++ .../models/transformers/transformer_2d.py | 460 ++ .../transformers/transformer_temporal.py | 379 ++ .../pytorch/src/diffusers/models/unet_1d.py | 26 + .../src/diffusers/models/unet_1d_blocks.py | 203 + .../pytorch/src/diffusers/models/unet_2d.py | 27 + .../src/diffusers/models/unet_2d_blocks.py | 375 ++ .../src/diffusers/models/unet_2d_condition.py | 25 + .../src/diffusers/models/unets/__init__.py | 18 + .../src/diffusers/models/unets/unet_1d.py | 255 ++ .../diffusers/models/unets/unet_1d_blocks.py | 702 ++++ .../src/diffusers/models/unets/unet_2d.py | 346 ++ .../diffusers/models/unets/unet_2d_blocks.py | 3731 +++++++++++++++++ .../models/unets/unet_2d_blocks_flax.py | 400 ++ .../models/unets/unet_2d_condition.py | 1319 ++++++ .../models/unets/unet_2d_condition_flax.py | 453 ++ .../diffusers/models/unets/unet_3d_blocks.py | 2405 +++++++++++ .../models/unets/unet_3d_condition.py | 753 ++++ .../diffusers/models/unets/unet_i2vgen_xl.py | 724 ++++ .../diffusers/models/unets/unet_kandinsky3.py | 535 +++ .../models/unets/unet_motion_model.py | 948 +++++ .../unets/unet_spatio_temporal_condition.py | 489 +++ .../models/unets/unet_stable_cascade.py | 610 +++ .../src/diffusers/models/unets/uvit_2d.py | 470 +++ .../src/diffusers/models/upsampling.py | 448 ++ .../pytorch/src/diffusers/models/vae_flax.py | 876 ++++ .../pytorch/src/diffusers/models/vq_model.py | 181 + .../pytorch/src/diffusers/optimization.py | 361 ++ .../pytorch/src/diffusers/pipelines/README.md | 171 + .../src/diffusers/pipelines/__init__.py | 581 +++ .../diffusers/pipelines/amused/__init__.py | 62 + .../pipelines/amused/pipeline_amused.py | 328 ++ .../amused/pipeline_amused_img2img.py | 347 ++ .../amused/pipeline_amused_inpaint.py | 378 ++ .../pipelines/animatediff/__init__.py | 49 + .../animatediff/pipeline_animatediff.py | 847 ++++ .../pipeline_animatediff_video2video.py | 997 +++++ .../pipelines/animatediff/pipeline_output.py | 23 + .../diffusers/pipelines/audioldm/__init__.py | 51 + .../pipelines/audioldm/pipeline_audioldm.py | 546 +++ .../diffusers/pipelines/audioldm2/__init__.py | 50 + 
.../pipelines/audioldm2/modeling_audioldm2.py | 1511 +++++++ .../pipelines/audioldm2/pipeline_audioldm2.py | 980 +++++ .../src/diffusers/pipelines/auto_pipeline.py | 987 +++++ .../pipelines/blip_diffusion/__init__.py | 20 + .../blip_diffusion/blip_image_processing.py | 318 ++ .../blip_diffusion/modeling_blip2.py | 642 +++ .../blip_diffusion/modeling_ctx_clip.py | 223 + .../blip_diffusion/pipeline_blip_diffusion.py | 348 ++ .../pipelines/consistency_models/__init__.py | 24 + .../pipeline_consistency_models.py | 275 ++ .../pipelines/controlnet/__init__.py | 80 + .../pipelines/controlnet/multicontrolnet.py | 187 + .../controlnet/pipeline_controlnet.py | 1318 ++++++ .../pipeline_controlnet_blip_diffusion.py | 413 ++ .../controlnet/pipeline_controlnet_img2img.py | 1310 ++++++ .../controlnet/pipeline_controlnet_inpaint.py | 1620 +++++++ .../pipeline_controlnet_inpaint_sd_xl.py | 1818 ++++++++ .../controlnet/pipeline_controlnet_sd_xl.py | 1499 +++++++ .../pipeline_controlnet_sd_xl_img2img.py | 1626 +++++++ .../controlnet/pipeline_flax_controlnet.py | 532 +++ .../pipelines/dance_diffusion/__init__.py | 18 + .../pipeline_dance_diffusion.py | 156 + .../src/diffusers/pipelines/ddim/__init__.py | 18 + .../diffusers/pipelines/ddim/pipeline_ddim.py | 154 + .../src/diffusers/pipelines/ddpm/__init__.py | 22 + .../diffusers/pipelines/ddpm/pipeline_ddpm.py | 127 + .../pipelines/deepfloyd_if/__init__.py | 85 + .../pipelines/deepfloyd_if/pipeline_if.py | 788 ++++ .../deepfloyd_if/pipeline_if_img2img.py | 910 ++++ .../pipeline_if_img2img_superresolution.py | 1029 +++++ .../deepfloyd_if/pipeline_if_inpainting.py | 1030 +++++ .../pipeline_if_inpainting_superresolution.py | 1137 +++++ .../pipeline_if_superresolution.py | 885 ++++ .../pipelines/deepfloyd_if/pipeline_output.py | 28 + .../pipelines/deepfloyd_if/safety_checker.py | 59 + .../pipelines/deepfloyd_if/timesteps.py | 579 +++ .../pipelines/deepfloyd_if/watermark.py | 46 + .../diffusers/pipelines/deprecated/README.md | 3 + .../pipelines/deprecated/__init__.py | 153 + .../deprecated/alt_diffusion/__init__.py | 53 + .../alt_diffusion/modeling_roberta_series.py | 124 + .../alt_diffusion/pipeline_alt_diffusion.py | 946 +++++ .../pipeline_alt_diffusion_img2img.py | 1018 +++++ .../alt_diffusion/pipeline_output.py | 28 + .../deprecated/audio_diffusion/__init__.py | 23 + .../deprecated/audio_diffusion/mel.py | 179 + .../pipeline_audio_diffusion.py | 329 ++ .../latent_diffusion_uncond/__init__.py | 18 + .../pipeline_latent_diffusion_uncond.py | 130 + .../pipelines/deprecated/pndm/__init__.py | 18 + .../deprecated/pndm/pipeline_pndm.py | 121 + .../pipelines/deprecated/repaint/__init__.py | 19 + .../deprecated/repaint/pipeline_repaint.py | 230 + .../deprecated/score_sde_ve/__init__.py | 19 + .../score_sde_ve/pipeline_score_sde_ve.py | 109 + .../spectrogram_diffusion/__init__.py | 75 + .../continuous_encoder.py | 92 + .../spectrogram_diffusion/midi_utils.py | 667 +++ .../spectrogram_diffusion/notes_encoder.py | 86 + .../pipeline_spectrogram_diffusion.py | 269 ++ .../stable_diffusion_variants/__init__.py | 55 + .../pipeline_cycle_diffusion.py | 948 +++++ ...ne_onnx_stable_diffusion_inpaint_legacy.py | 542 +++ ...ipeline_stable_diffusion_inpaint_legacy.py | 786 ++++ ...pipeline_stable_diffusion_model_editing.py | 824 ++++ .../pipeline_stable_diffusion_paradigms.py | 786 ++++ .../pipeline_stable_diffusion_pix2pix_zero.py | 1304 ++++++ .../stochastic_karras_ve/__init__.py | 19 + .../pipeline_stochastic_karras_ve.py | 128 + .../versatile_diffusion/__init__.py | 71 + 
.../versatile_diffusion/modeling_text_unet.py | 2508 +++++++++++ .../pipeline_versatile_diffusion.py | 421 ++ ...ipeline_versatile_diffusion_dual_guided.py | 556 +++ ...ine_versatile_diffusion_image_variation.py | 397 ++ ...eline_versatile_diffusion_text_to_image.py | 475 +++ .../deprecated/vq_diffusion/__init__.py | 57 + .../vq_diffusion/pipeline_vq_diffusion.py | 325 ++ .../src/diffusers/pipelines/dit/__init__.py | 19 + .../diffusers/pipelines/dit/pipeline_dit.py | 233 + .../diffusers/pipelines/free_init_utils.py | 184 + .../diffusers/pipelines/i2vgen_xl/__init__.py | 46 + .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 798 ++++ .../diffusers/pipelines/kandinsky/__init__.py | 66 + .../pipelines/kandinsky/pipeline_kandinsky.py | 407 ++ .../kandinsky/pipeline_kandinsky_combined.py | 814 ++++ .../kandinsky/pipeline_kandinsky_img2img.py | 500 +++ .../kandinsky/pipeline_kandinsky_inpaint.py | 635 +++ .../kandinsky/pipeline_kandinsky_prior.py | 547 +++ .../pipelines/kandinsky/text_encoder.py | 27 + .../pipelines/kandinsky2_2/__init__.py | 70 + .../kandinsky2_2/pipeline_kandinsky2_2.py | 320 ++ .../pipeline_kandinsky2_2_combined.py | 851 ++++ .../pipeline_kandinsky2_2_controlnet.py | 320 ++ ...ipeline_kandinsky2_2_controlnet_img2img.py | 381 ++ .../pipeline_kandinsky2_2_img2img.py | 399 ++ .../pipeline_kandinsky2_2_inpainting.py | 556 +++ .../pipeline_kandinsky2_2_prior.py | 549 +++ .../pipeline_kandinsky2_2_prior_emb2emb.py | 563 +++ .../pipelines/kandinsky3/__init__.py | 49 + .../kandinsky3/convert_kandinsky3_unet.py | 98 + .../kandinsky3/pipeline_kandinsky3.py | 589 +++ .../kandinsky3/pipeline_kandinsky3_img2img.py | 654 +++ .../latent_consistency_models/__init__.py | 50 + .../pipeline_latent_consistency_img2img.py | 956 +++++ .../pipeline_latent_consistency_text2img.py | 888 ++++ .../pipelines/latent_diffusion/__init__.py | 50 + .../pipeline_latent_diffusion.py | 746 ++++ ...peline_latent_diffusion_superresolution.py | 189 + .../diffusers/pipelines/ledits_pp/__init__.py | 55 + .../pipeline_leditspp_stable_diffusion.py | 1505 +++++++ .../pipeline_leditspp_stable_diffusion_xl.py | 1797 ++++++++ .../pipelines/ledits_pp/pipeline_output.py | 43 + .../diffusers/pipelines/musicldm/__init__.py | 49 + .../pipelines/musicldm/pipeline_musicldm.py | 635 +++ .../src/diffusers/pipelines/onnx_utils.py | 215 + .../pipelines/paint_by_example/__init__.py | 55 + .../paint_by_example/image_encoder.py | 67 + .../pipeline_paint_by_example.py | 621 +++ .../src/diffusers/pipelines/pia/__init__.py | 46 + .../diffusers/pipelines/pia/pipeline_pia.py | 1034 +++++ .../pipelines/pipeline_flax_utils.py | 616 +++ .../pipelines/pipeline_loading_utils.py | 508 +++ .../src/diffusers/pipelines/pipeline_utils.py | 1771 ++++++++ .../pipelines/pixart_alpha/__init__.py | 48 + .../pixart_alpha/pipeline_pixart_alpha.py | 979 +++++ .../semantic_stable_diffusion/__init__.py | 49 + .../pipeline_output.py | 25 + .../pipeline_semantic_stable_diffusion.py | 718 ++++ .../diffusers/pipelines/shap_e/__init__.py | 71 + .../src/diffusers/pipelines/shap_e/camera.py | 147 + .../pipelines/shap_e/pipeline_shap_e.py | 334 ++ .../shap_e/pipeline_shap_e_img2img.py | 321 ++ .../diffusers/pipelines/shap_e/renderer.py | 1050 +++++ .../pipelines/stable_cascade/__init__.py | 50 + .../stable_cascade/pipeline_stable_cascade.py | 482 +++ .../pipeline_stable_cascade_combined.py | 311 ++ .../pipeline_stable_cascade_prior.py | 638 +++ .../pipelines/stable_diffusion/README.md | 176 + .../pipelines/stable_diffusion/__init__.py | 203 + 
.../clip_image_project_model.py | 29 + .../stable_diffusion/convert_from_ckpt.py | 1860 ++++++++ .../pipeline_flax_stable_diffusion.py | 473 +++ .../pipeline_flax_stable_diffusion_img2img.py | 532 +++ .../pipeline_flax_stable_diffusion_inpaint.py | 589 +++ .../pipeline_onnx_stable_diffusion.py | 487 +++ .../pipeline_onnx_stable_diffusion_img2img.py | 549 +++ .../pipeline_onnx_stable_diffusion_inpaint.py | 563 +++ .../pipeline_onnx_stable_diffusion_upscale.py | 586 +++ .../stable_diffusion/pipeline_output.py | 45 + .../pipeline_stable_diffusion.py | 1032 +++++ .../pipeline_stable_diffusion_depth2img.py | 860 ++++ ...peline_stable_diffusion_image_variation.py | 420 ++ .../pipeline_stable_diffusion_img2img.py | 1113 +++++ .../pipeline_stable_diffusion_inpaint.py | 1430 +++++++ ...eline_stable_diffusion_instruct_pix2pix.py | 807 ++++ ...ipeline_stable_diffusion_latent_upscale.py | 495 +++ .../pipeline_stable_diffusion_upscale.py | 808 ++++ .../pipeline_stable_unclip.py | 932 ++++ .../pipeline_stable_unclip_img2img.py | 839 ++++ .../stable_diffusion/safety_checker.py | 125 + .../stable_diffusion/safety_checker_flax.py | 112 + .../stable_unclip_image_normalizer.py | 57 + .../__init__.py | 48 + ...line_stable_diffusion_attend_and_excite.py | 1088 +++++ .../stable_diffusion_diffedit/__init__.py | 48 + .../pipeline_stable_diffusion_diffedit.py | 1530 +++++++ .../stable_diffusion_gligen/__init__.py | 50 + .../pipeline_stable_diffusion_gligen.py | 845 ++++ ...line_stable_diffusion_gligen_text_image.py | 1017 +++++ .../stable_diffusion_k_diffusion/__init__.py | 62 + .../pipeline_stable_diffusion_k_diffusion.py | 664 +++ ...ipeline_stable_diffusion_xl_k_diffusion.py | 891 ++++ .../stable_diffusion_ldm3d/__init__.py | 48 + .../pipeline_stable_diffusion_ldm3d.py | 985 +++++ .../stable_diffusion_panorama/__init__.py | 48 + .../pipeline_stable_diffusion_panorama.py | 933 +++++ .../stable_diffusion_safe/__init__.py | 99 + .../stable_diffusion_safe/pipeline_output.py | 34 + .../pipeline_stable_diffusion_safe.py | 764 ++++ .../stable_diffusion_safe/safety_checker.py | 109 + .../stable_diffusion_sag/__init__.py | 48 + .../pipeline_stable_diffusion_sag.py | 886 ++++ .../pipelines/stable_diffusion_xl/__init__.py | 76 + .../pipeline_flax_stable_diffusion_xl.py | 308 ++ .../stable_diffusion_xl/pipeline_output.py | 37 + .../pipeline_stable_diffusion_xl.py | 1266 ++++++ .../pipeline_stable_diffusion_xl_img2img.py | 1442 +++++++ .../pipeline_stable_diffusion_xl_inpaint.py | 1812 ++++++++ ...ne_stable_diffusion_xl_instruct_pix2pix.py | 976 +++++ .../stable_diffusion_xl/watermark.py | 36 + .../stable_video_diffusion/__init__.py | 58 + .../pipeline_stable_video_diffusion.py | 673 +++ .../pipelines/t2i_adapter/__init__.py | 47 + .../pipeline_stable_diffusion_adapter.py | 912 ++++ .../pipeline_stable_diffusion_xl_adapter.py | 1258 ++++++ .../text_to_video_synthesis/__init__.py | 54 + .../pipeline_output.py | 25 + .../pipeline_text_to_video_synth.py | 663 +++ .../pipeline_text_to_video_synth_img2img.py | 760 ++++ .../pipeline_text_to_video_zero.py | 969 +++++ .../pipeline_text_to_video_zero_sdxl.py | 1315 ++++++ .../diffusers/pipelines/unclip/__init__.py | 52 + .../pipelines/unclip/pipeline_unclip.py | 493 +++ .../unclip/pipeline_unclip_image_variation.py | 420 ++ .../diffusers/pipelines/unclip/text_proj.py | 86 + .../pipelines/unidiffuser/__init__.py | 58 + .../unidiffuser/modeling_text_decoder.py | 296 ++ .../pipelines/unidiffuser/modeling_uvit.py | 1197 ++++++ .../unidiffuser/pipeline_unidiffuser.py | 1419 +++++++ 
.../pipelines/wuerstchen/__init__.py | 56 + .../wuerstchen/modeling_paella_vq_model.py | 172 + .../wuerstchen/modeling_wuerstchen_common.py | 81 + .../modeling_wuerstchen_diffnext.py | 254 ++ .../wuerstchen/modeling_wuerstchen_prior.py | 200 + .../wuerstchen/pipeline_wuerstchen.py | 438 ++ .../pipeline_wuerstchen_combined.py | 306 ++ .../wuerstchen/pipeline_wuerstchen_prior.py | 516 +++ .../pytorch/src/diffusers/py.typed | 0 .../src/diffusers/schedulers/README.md | 3 + .../src/diffusers/schedulers/__init__.py | 211 + .../schedulers/deprecated/__init__.py | 50 + .../deprecated/scheduling_karras_ve.py | 243 ++ .../deprecated/scheduling_sde_vp.py | 109 + .../diffusers/schedulers/scheduling_amused.py | 162 + .../scheduling_consistency_decoder.py | 180 + .../scheduling_consistency_models.py | 448 ++ .../diffusers/schedulers/scheduling_ddim.py | 520 +++ .../schedulers/scheduling_ddim_flax.py | 313 ++ .../schedulers/scheduling_ddim_inverse.py | 374 ++ .../schedulers/scheduling_ddim_parallel.py | 645 +++ .../diffusers/schedulers/scheduling_ddpm.py | 562 +++ .../schedulers/scheduling_ddpm_flax.py | 299 ++ .../schedulers/scheduling_ddpm_parallel.py | 653 +++ .../schedulers/scheduling_ddpm_wuerstchen.py | 230 + .../schedulers/scheduling_deis_multistep.py | 786 ++++ .../scheduling_dpmsolver_multistep.py | 1029 +++++ .../scheduling_dpmsolver_multistep_flax.py | 643 +++ .../scheduling_dpmsolver_multistep_inverse.py | 921 ++++ .../schedulers/scheduling_dpmsolver_sde.py | 557 +++ .../scheduling_dpmsolver_singlestep.py | 979 +++++ .../scheduling_edm_dpmsolver_multistep.py | 683 +++ .../schedulers/scheduling_edm_euler.py | 381 ++ .../scheduling_euler_ancestral_discrete.py | 481 +++ .../schedulers/scheduling_euler_discrete.py | 576 +++ .../scheduling_euler_discrete_flax.py | 265 ++ .../schedulers/scheduling_heun_discrete.py | 482 +++ .../diffusers/schedulers/scheduling_ipndm.py | 224 + .../scheduling_k_dpm_2_ancestral_discrete.py | 508 +++ .../schedulers/scheduling_k_dpm_2_discrete.py | 483 +++ .../schedulers/scheduling_karras_ve_flax.py | 238 ++ .../diffusers/schedulers/scheduling_lcm.py | 660 +++ .../schedulers/scheduling_lms_discrete.py | 475 +++ .../scheduling_lms_discrete_flax.py | 283 ++ .../diffusers/schedulers/scheduling_pndm.py | 476 +++ .../schedulers/scheduling_pndm_flax.py | 509 +++ .../schedulers/scheduling_repaint.py | 361 ++ .../schedulers/scheduling_sasolver.py | 1124 +++++ .../diffusers/schedulers/scheduling_sde_ve.py | 301 ++ .../schedulers/scheduling_sde_ve_flax.py | 280 ++ .../diffusers/schedulers/scheduling_tcd.py | 686 +++ .../diffusers/schedulers/scheduling_unclip.py | 352 ++ .../schedulers/scheduling_unipc_multistep.py | 880 ++++ .../diffusers/schedulers/scheduling_utils.py | 186 + .../schedulers/scheduling_utils_flax.py | 293 ++ .../schedulers/scheduling_vq_diffusion.py | 467 +++ .../pytorch/src/diffusers/training_utils.py | 453 ++ .../pytorch/src/diffusers/utils/__init__.py | 124 + .../src/diffusers/utils/accelerate_utils.py | 48 + .../pytorch/src/diffusers/utils/constants.py | 55 + .../src/diffusers/utils/deprecation_utils.py | 49 + .../pytorch/src/diffusers/utils/doc_utils.py | 38 + .../dummy_flax_and_transformers_objects.py | 77 + .../src/diffusers/utils/dummy_flax_objects.py | 212 + .../diffusers/utils/dummy_note_seq_objects.py | 17 + .../src/diffusers/utils/dummy_onnx_objects.py | 17 + .../src/diffusers/utils/dummy_pt_objects.py | 1170 ++++++ .../utils/dummy_torch_and_librosa_objects.py | 32 + .../utils/dummy_torch_and_scipy_objects.py | 17 + 
.../utils/dummy_torch_and_torchsde_objects.py | 17 + ...nd_transformers_and_k_diffusion_objects.py | 32 + ...torch_and_transformers_and_onnx_objects.py | 92 + .../dummy_torch_and_transformers_objects.py | 1607 +++++++ ...sformers_and_torch_and_note_seq_objects.py | 17 + .../diffusers/utils/dynamic_modules_utils.py | 452 ++ .../src/diffusers/utils/export_utils.py | 140 + .../pytorch/src/diffusers/utils/hub_utils.py | 493 +++ .../src/diffusers/utils/import_utils.py | 726 ++++ .../src/diffusers/utils/loading_utils.py | 49 + .../pytorch/src/diffusers/utils/logging.py | 339 ++ .../diffusers/utils/model_card_template.md | 24 + .../pytorch/src/diffusers/utils/outputs.py | 137 + .../pytorch/src/diffusers/utils/peft_utils.py | 268 ++ .../pytorch/src/diffusers/utils/pil_utils.py | 67 + .../src/diffusers/utils/state_dict_utils.py | 324 ++ .../src/diffusers/utils/testing_utils.py | 967 +++++ .../src/diffusers/utils/torch_utils.py | 147 + .../pytorch/src/diffusers/utils/versions.py | 117 + .../stable-diffusion/init_torch.sh | 26 + .../train_sd2.1_pokemon_dist_1x8_torch.sh | 63 + 437 files changed, 194613 insertions(+) create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/.gitignore create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/LICENSE create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/MANIFEST.in create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/build_diffusers.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/clean_diffusers.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README_sdxl.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/default_config.yaml create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements.txt create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_flax.txt create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_sdxl.txt create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/single_config.yaml create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image_lora.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_sdxl.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/zero2_config.yaml create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/install_diffusers.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5_single.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1_single.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_xl.sh create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/setup.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/diffusers_cli.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/env.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/fp16_safetensors.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/configuration_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_check.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_table.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/image_processor.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/autoencoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/controlnet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/ip_adapter.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora_conversion_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/peft.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file_utils.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/textual_inversion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/unet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/activations.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/adapter.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_processor.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/vae.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/downsampling.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/dual_transformer_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/lora.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_outputs.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/normalization.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/prior_transformer.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/t5_film_transformer.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_temporal.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/prior_transformer.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/t5_film_transformer.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_temporal.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d_blocks.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_blocks.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_condition.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/__init__.py 
create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d_blocks.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_blocks.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_condition.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_kandinsky3.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_motion_model.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_stable_cascade.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/uvit_2d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/upsampling.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vae_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vq_model.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/optimization.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/auto_pipeline.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py create mode 
100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py create mode 
100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/free_init_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py 
create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/onnx_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/camera.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/renderer.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/text_proj.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/py.typed create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/README.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_amused.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py create mode 100644 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ipndm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lcm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_repaint.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sasolver.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py 
create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_tcd.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unclip.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/training_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/__init__.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/accelerate_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/constants.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/deprecation_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/doc_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_note_seq_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_onnx_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_pt_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dynamic_modules_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/export_utils.py create 
mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/hub_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/import_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/loading_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/logging.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/model_card_template.md create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/outputs.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/peft_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/pil_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/state_dict_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/testing_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/torch_utils.py create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/versions.py create mode 100644 tests/executables/stable-diffusion/init_torch.sh create mode 100644 tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/.gitignore b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/.gitignore new file mode 100644 index 000000000..9d74fe840 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/.gitignore @@ -0,0 +1,178 @@ +# Initially taken from GitHub's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# tests and logs +tests/fixtures/cached_*_text.txt +logs/ +lightning_logs/ +lang_code_data/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a Python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# vscode +.vs +.vscode + +# Pycharm +.idea + +# TF code +tensorflow_code + +# Models +proc_data + +# examples +runs +/runs_old +/wandb +/examples/runs +/examples/**/*.args +/examples/rag/sweep + +# data +/data +serialization_dir + +# emacs +*.*~ +debug.env + +# vim +.*.swp + +# ctags +tags + +# pre-commit +.pre-commit* + +# .lock +*.lock + +# DS_Store (MacOS) +.DS_Store + +# RL pipelines may produce mp4 outputs +*.mp4 + +# dependencies +/transformers + +# ruff +.ruff_cache + +# wandb +wandb diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/LICENSE b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/MANIFEST.in b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/MANIFEST.in
new file mode 100644
index 000000000..bc6260161
--- /dev/null
+++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/MANIFEST.in
@@ -0,0 +1,3 @@
+include LICENSE
+include src/diffusers/models/nhwc_groupnorm/*
+include src/diffusers/utils/model_card_template.md
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md
new file mode 100644
index 000000000..3bfe91c9f
--- /dev/null
+++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md
@@ -0,0 +1,45 @@
+## Environment Setup
+### Install transformers
+```bash
+git clone ssh://git@bitbucket.iluvatar.ai:7999/apptp/transformers.git
+cd transformers
+python3 setup.py install
+```
+### Install diffusers
+```bash
+cd diffusers
+pip3 install -r examples/text_to_image/requirements.txt
+bash build_diffusers.sh && bash install_diffusers.sh
+```
+_The following packages are assumed to be installed already: torchvision, ixformer, flash-attn, deepspeed, apex_
+_Prefer a recent daily build of the packages above; older builds may not support all required features_
+
+## Download Data
+```bash
+mkdir -p pokemon-blip-captions
+# download from: http://10.150.9.95/swapp/datasets/multimodal/stable_diffusion/pokemon-blip-captions
+wget http://10.150.9.95/swapp/datasets/multimodal/stable_diffusion/stabilityai.tar # sd2.1 weights
+tar -xvf stabilityai.tar
+```
+
+*sdxl weights: http://sw.iluvatar.ai/download/apps/datasets/aigc/xl/stable-diffusion-xl-base-1.0.tar.gz*
+
+*sd1.5 weights: http://10.150.9.95/swapp/pretrained/multimodal/stable-diffusion/stable-diffusion-v1-5.zip*
+
+
+## Training
+*Adjust the dataset and pretrained-weight paths used in the scripts below to match where they are actually stored*
+### sd2.1 training
+```bash
+$ bash run_sd_2.1.sh # multi-GPU
+$ bash run_sd_2.1_single.sh # single GPU
+```
+### sd1.5 training
+```bash
+$ bash run_sd_1.5.sh # multi-GPU
+$ bash run_sd_1.5_single.sh # single GPU
+```
+### sdxl training
+```bash
+$ bash run_sd_xl.sh # multi-GPU
+```
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/build_diffusers.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/build_diffusers.sh
new file mode 100644
index 000000000..74e8d129e
--- /dev/null
+++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/build_diffusers.sh
@@ -0,0 +1,22 @@
+SCRIPTPATH=$(dirname $(realpath "$0"))
+cd $(dirname $(realpath "$0"))
+COREX_VERSION=${COREX_VERSION:-latest}
+MAX_JOBS=${MAX_JOBS:-$(nproc --all)}
+PYTHON_PATH=$(which python3)
+
+export MAX_JOBS=${MAX_JOBS}
+
+echo "Python cmd1: ${PYTHON_PATH} setup.py build"
+${PYTHON_PATH} setup.py build 2>&1 | tee compile.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit
+
+if [[ "${COREX_VERSION}" == "latest" ]]; then
+ COREX_VERSION=`date --utc +%Y%m%d%H%M%S`
+fi
+
+export LOCAL_VERSION_IDENTIFIER="corex.${COREX_VERSION}"
+
+echo "Python cmd2: ${PYTHON_PATH} setup.py bdist_wheel -d build_pip"
+${PYTHON_PATH} setup.py bdist_wheel -d build_pip || exit
+
+# Return 0 status if all finished
+exit 0
\ No newline at end of file
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/clean_diffusers.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/clean_diffusers.sh
new file mode 100644
index 000000000..a04335f67
--- /dev/null
+++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/clean_diffusers.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+PROJPATH=$(dirname $(realpath "$0"))
+cd $(dirname $(realpath "$0"))
+PYTHON_PATH=$(which python3)
+
+rm -rf build
+${PYTHON_PATH} setup.py clean || true
+rm -rf build_pip
+rm -rf
${PROJPATH}/dist + +# Return 0 status if all finished +exit 0 \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README.md new file mode 100644 index 000000000..f2931d3f3 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README.md @@ -0,0 +1,326 @@ +# Stable Diffusion text-to-image fine-tuning + +The `train_text_to_image.py` script shows how to fine-tune stable diffusion model on your own dataset. + +___Note___: + +___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___ + + +## Running locally with PyTorch +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install . +``` + +Then cd in the example folder and run +```bash +pip install -r requirements.txt +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment. + +### Pokemon example + +You need to accept the model license before downloading or using the weights. In this example we'll use model version `v1-4`, so you'll need to visit [its card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. + +You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work. For more information on access tokens, please refer to [this section of the documentation](https://huggingface.co/docs/hub/security-tokens). + +Run the following command to authenticate your token + +```bash +huggingface-cli login +``` + +If you have already cloned the repo, then you won't need to go through these steps. + +
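As a small, hedged sketch of how to confirm the `peft>=0.6.0` requirement mentioned in the note above (this check is not part of the upstream scripts; it only uses the standard-library `importlib.metadata` and assumes PEFT has already been installed):

```python
# Minimal check that the peft>=0.6.0 requirement for LoRA training is met.
# Raises PackageNotFoundError if peft is not installed at all.
from importlib.metadata import version

print("peft version:", version("peft"))  # expect 0.6.0 or newer
```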
+
+#### Hardware
+With `gradient_checkpointing` and `mixed_precision` it should be possible to fine-tune the model on a single 24GB GPU. For higher `batch_size` and faster training it's better to use GPUs with >30GB memory.
+
+**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export DATASET_NAME="lambdalabs/pokemon-blip-captions"
+
+accelerate launch --mixed_precision="fp16" train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$DATASET_NAME \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-pokemon-model"
+```
+
+
+
+To run on your own training files, prepare the dataset according to the format required by `datasets`; you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata).
+If you wish to use custom loading logic, you should modify the script; we have left pointers for that in the training script.
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export TRAIN_DIR="path_to_your_dataset"
+
+accelerate launch --mixed_precision="fp16" train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-pokemon-model"
+```
+
+
+Once the training is finished the model will be saved in the `output_dir` specified in the command. In this example it's `sd-pokemon-model`. To load the fine-tuned model for inference, just pass that path to `StableDiffusionPipeline`:
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
+
+Checkpoints only save the unet, so to run inference from a checkpoint, just load the unet:
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline, UNet2DConditionModel
+
+model_path = "path_to_saved_model"
+unet = UNet2DConditionModel.from_pretrained(model_path + "/checkpoint-<N>/unet", torch_dtype=torch.float16)
+
+pipe = StableDiffusionPipeline.from_pretrained("<initial model>", unet=unet, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
+
+#### Training with multiple GPUs
+
+`accelerate` allows for seamless multi-GPU training. Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch)
+for running distributed training with `accelerate`.
Here is an example command: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" + +accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --use_ema \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model" +``` + + +#### Training with Min-SNR weighting + +We support training with the Min-SNR weighting strategy proposed in [Efficient Diffusion Training via Min-SNR Weighting Strategy](https://arxiv.org/abs/2303.09556) which helps to achieve faster convergence +by rebalancing the loss. In order to use it, one needs to set the `--snr_gamma` argument. The recommended +value when using it is 5.0. + +You can find [this project on Weights and Biases](https://wandb.ai/sayakpaul/text2image-finetune-minsnr) that compares the loss surfaces of the following setups: + +* Training without the Min-SNR weighting strategy +* Training with the Min-SNR weighting strategy (`snr_gamma` set to 5.0) +* Training with the Min-SNR weighting strategy (`snr_gamma` set to 1.0) + +For our small Pokemons dataset, the effects of Min-SNR weighting strategy might not appear to be pronounced, but for larger datasets, we believe the effects will be more pronounced. + +Also, note that in this example, we either predict `epsilon` (i.e., the noise) or the `v_prediction`. For both of these cases, the formulation of the Min-SNR weighting strategy that we have used holds. + +## Training with LoRA + +Low-Rank Adaption of Large Language Models was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*. + +In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages: + +- Previous pretrained weights are kept frozen so that model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114). +- Rank-decomposition matrices have significantly fewer parameters than original model, which means that trained LoRA weights are easily portable. +- LoRA attention layers allow to control to which extent the model is adapted toward new training images via a `scale` parameter. + +[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository. + +With LoRA, it's possible to fine-tune Stable Diffusion on a custom image-caption pair dataset +on consumer GPUs like Tesla T4, Tesla V100. + +### Training + +First, you need to set up your development environment as is explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables. Here, we will use [Stable Diffusion v1-4](https://hf.co/CompVis/stable-diffusion-v1-4) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). 
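To make the LoRA description above more concrete, here is a minimal, self-contained sketch of the idea: a frozen base layer plus a trainable low-rank update scaled by a `scale` factor. This is only an illustration under stated assumptions; the class `LoRALinear` and its parameters are invented for this sketch and are not the diffusers/PEFT implementation used by `train_text_to_image_lora.py`.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen nn.Linear plus a trainable rank-r update: y = Wx + scale * B(A x)."""
    def __init__(self, base: nn.Linear, rank: int = 4, scale: float = 1.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():      # pretrained weights stay frozen
            p.requires_grad = False
        self.lora_down = nn.Linear(base.in_features, rank, bias=False)   # A: d_in -> r
        self.lora_up = nn.Linear(rank, base.out_features, bias=False)    # B: r -> d_out
        nn.init.zeros_(self.lora_up.weight)   # update starts at zero, so behaviour is initially unchanged
        self.scale = scale

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scale * self.lora_up(self.lora_down(x))

# Only the two small matrices are trainable, which is why LoRA checkpoints stay tiny.
layer = LoRALinear(nn.Linear(320, 320), rank=4)
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # 2 * 320 * 4 = 2560
```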
+ +**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___** + +**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___** + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" +``` + +For this example we want to directly store the trained LoRA embeddings on the Hub, so +we need to be logged in and add the `--push_to_hub` flag. + +```bash +huggingface-cli login +``` + +Now we can start training! + +```bash +accelerate launch --mixed_precision="fp16" train_text_to_image_lora.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME --caption_column="text" \ + --resolution=512 --random_flip \ + --train_batch_size=1 \ + --num_train_epochs=100 --checkpointing_steps=5000 \ + --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \ + --seed=42 \ + --output_dir="sd-pokemon-model-lora" \ + --validation_prompt="cute dragon creature" --report_to="wandb" +``` + +The above command will also run inference as fine-tuning progresses and log the results to Weights and Biases. + +**___Note: When using LoRA we can use a much higher learning rate compared to non-LoRA fine-tuning. Here we use *1e-4* instead of the usual *1e-5*. Also, by using LoRA, it's possible to run `train_text_to_image_lora.py` in consumer GPUs like T4 or V100.___** + +The final LoRA embedding weights have been uploaded to [sayakpaul/sd-model-finetuned-lora-t4](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4). **___Note: [The final weights](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/pytorch_lora_weights.bin) are only 3 MB in size, which is orders of magnitudes smaller than the original model.___** + +You can check some inference samples that were logged during the course of the fine-tuning process [here](https://wandb.ai/sayakpaul/text2image-fine-tune/runs/q4lc0xsw). + +### Inference + +Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline` after loading the trained LoRA weights. You +need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-pokemon-model-lora`. + +```python +from diffusers import StableDiffusionPipeline +import torch + +model_path = "sayakpaul/sd-model-finetuned-lora-t4" +pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16) +pipe.unet.load_attn_procs(model_path) +pipe.to("cuda") + +prompt = "A pokemon with green eyes and red legs." +image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0] +image.save("pokemon.png") +``` + +If you are loading the LoRA parameters from the Hub and if the Hub repository has +a `base_model` tag (such as [this](https://huggingface.co/sayakpaul/sd-model-finetuned-lora-t4/blob/main/README.md?code=true#L4)), then +you can do: + +```py +from huggingface_hub.repocard import RepoCard + +lora_model_id = "sayakpaul/sd-model-finetuned-lora-t4" +card = RepoCard.load(lora_model_id) +base_model_id = card.data.to_dict()["base_model"] + +pipe = StableDiffusionPipeline.from_pretrained(base_model_id, torch_dtype=torch.float16) +... 
+``` + +## Training with Flax/JAX + +For faster training on TPUs and GPUs you can leverage the flax training example. Follow the instructions above to get the model and dataset before running the script. + +**___Note: The flax example doesn't yet support features like gradient checkpoint, gradient accumulation etc, so to use flax for faster training we will need >30GB cards or TPU v3.___** + + +Before running the scripts, make sure to install the library's training dependencies: + +```bash +pip install -U -r requirements_flax.txt +``` + +```bash +export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" + +python train_text_to_image_flax.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --mixed_precision="fp16" \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --output_dir="sd-pokemon-model" +``` + +To run on your own training files prepare the dataset according to the format required by `datasets`, you can find the instructions for how to do that in this [document](https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder-with-metadata). +If you wish to use custom loading logic, you should modify the script, we have left pointers for that in the training script. + +```bash +export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" +export TRAIN_DIR="path_to_your_dataset" + +python train_text_to_image_flax.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --train_data_dir=$TRAIN_DIR \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --mixed_precision="fp16" \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --output_dir="sd-pokemon-model" +``` + +### Training with xFormers: + +You can enable memory efficient attention by [installing xFormers](https://huggingface.co/docs/diffusers/main/en/optimization/xformers) and passing the `--enable_xformers_memory_efficient_attention` argument to the script. + +xFormers training is not available for Flax/JAX. + +**Note**: + +According to [this issue](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212), xFormers `v0.0.16` cannot be used for training in some GPUs. If you observe that problem, please install a development version as indicated in that comment. + +## Stable Diffusion XL + +* We support fine-tuning the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) via the `train_text_to_image_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md). +* We also support fine-tuning of the UNet and Text Encoder shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with LoRA via the `train_text_to_image_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md). diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README_sdxl.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README_sdxl.md new file mode 100644 index 000000000..349feef50 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README_sdxl.md @@ -0,0 +1,286 @@ +# Stable Diffusion XL text-to-image fine-tuning + +The `train_text_to_image_sdxl.py` script shows how to fine-tune Stable Diffusion XL (SDXL) on your own dataset. + +🚨 This script is experimental. 
The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset. 🚨 + +## Running locally with PyTorch + +### Installing the dependencies + +Before running the scripts, make sure to install the library's training dependencies: + +**Important** + +To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment: + +```bash +git clone https://github.com/huggingface/diffusers +cd diffusers +pip install -e . +``` + +Then cd in the `examples/text_to_image` folder and run +```bash +pip install -r requirements_sdxl.txt +``` + +And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with: + +```bash +accelerate config +``` + +Or for a default accelerate configuration without answering questions about your environment + +```bash +accelerate config default +``` + +Or if your environment doesn't support an interactive shell (e.g., a notebook) + +```python +from accelerate.utils import write_basic_config +write_basic_config() +``` + +When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups. +Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.6.0` installed in your environment. + +### Training + +```bash +export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" +export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" + +accelerate launch train_text_to_image_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --pretrained_vae_model_name_or_path=$VAE_NAME \ + --dataset_name=$DATASET_NAME \ + --enable_xformers_memory_efficient_attention \ + --resolution=512 --center_crop --random_flip \ + --proportion_empty_prompts=0.2 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 --gradient_checkpointing \ + --max_train_steps=10000 \ + --use_8bit_adam \ + --learning_rate=1e-06 --lr_scheduler="constant" --lr_warmup_steps=0 \ + --mixed_precision="fp16" \ + --report_to="wandb" \ + --validation_prompt="a cute Sundar Pichai creature" --validation_epochs 5 \ + --checkpointing_steps=5000 \ + --output_dir="sdxl-pokemon-model" \ + --push_to_hub +``` + +**Notes**: + +* The `train_text_to_image_sdxl.py` script pre-computes text embeddings and the VAE encodings and keeps them in memory. While for smaller datasets like [`lambdalabs/pokemon-blip-captions`](https://hf.co/datasets/lambdalabs/pokemon-blip-captions), it might not be a problem, it can definitely lead to memory problems when the script is used on a larger dataset. For those purposes, you would want to serialize these pre-computed representations to disk separately and load them during the fine-tuning process. Refer to [this PR](https://github.com/huggingface/diffusers/pull/4505) for a more in-depth discussion. +* The training script is compute-intensive and may not run on a consumer GPU like Tesla T4. +* The training command shown above performs intermediate quality validation in between the training epochs and logs the results to Weights and Biases. `--report_to`, `--validation_prompt`, and `--validation_epochs` are the relevant CLI arguments here. 
+
+* SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument, `--pretrained_vae_model_name_or_path`, that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
+
+### Inference
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+model_path = "your-model-id-goes-here"  # <-- change this
+pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+prompt = "A pokemon with green eyes and red legs."
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0]
+image.save("pokemon.png")
+```
+
+### Inference in PyTorch XLA
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+import torch_xla.core.xla_model as xm
+from time import time  # needed for the timing calls below
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipe = DiffusionPipeline.from_pretrained(model_id)
+
+device = xm.xla_device()
+pipe.to(device)
+
+prompt = "A pokemon with green eyes and red legs."
+inference_steps = 30  # number of denoising steps; adjust as needed
+
+# The first call triggers XLA compilation, so it is slow.
+start = time()
+image = pipe(prompt, num_inference_steps=inference_steps).images[0]
+print(f'Compilation time is {time()-start} sec')
+image.save("pokemon.png")
+
+# Subsequent calls with the same prompt length reuse the compiled graph.
+start = time()
+image = pipe(prompt, num_inference_steps=inference_steps).images[0]
+print(f'Inference time is {time()-start} sec after compilation')
+```
+
+Note: There is a warmup step in PyTorch XLA. The first call takes longer because of
+compilation and optimization. To see the real benefit of PyTorch XLA, call the
+pipeline again on an input with the same length as the original prompt so that the
+optimized graph is reused and you get the performance boost.
+
+## LoRA training example for Stable Diffusion XL (SDXL)
+
+Low-Rank Adaptation of Large Language Models (LoRA) was first introduced by Microsoft in [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685) by *Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen*.
+
+In a nutshell, LoRA allows adapting pretrained models by adding pairs of rank-decomposition matrices to existing weights and **only** training those newly added weights. This has a couple of advantages:
+
+- Previous pretrained weights are kept frozen so that the model is not prone to [catastrophic forgetting](https://www.pnas.org/doi/10.1073/pnas.1611835114).
+- Rank-decomposition matrices have significantly fewer parameters than the original model, which means that trained LoRA weights are easily portable.
+- LoRA attention layers make it possible to control the extent to which the model is adapted toward new training images via a `scale` parameter.
+
+[cloneofsimo](https://github.com/cloneofsimo) was the first to try out LoRA training for Stable Diffusion in the popular [lora](https://github.com/cloneofsimo/lora) GitHub repository.
+
+With LoRA, it's possible to fine-tune Stable Diffusion on a custom image-caption pair dataset on consumer GPUs like the Tesla T4 or Tesla V100.
+
+### Training
+
+First, you need to set up your development environment as explained in the [installation section](#installing-the-dependencies). Make sure to set the `MODEL_NAME` and `DATASET_NAME` environment variables and, optionally, the `VAE_NAME` variable. Here, we will use [Stable Diffusion XL 1.0-base](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and the [Pokemons dataset](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions).
+ +**___Note: It is quite useful to monitor the training progress by regularly generating sample images during training. [Weights and Biases](https://docs.wandb.ai/quickstart) is a nice solution to easily see generating images during training. All you need to do is to run `pip install wandb` before training to automatically log images.___** + +```bash +export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" +export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" +``` + +For this example we want to directly store the trained LoRA embeddings on the Hub, so +we need to be logged in and add the `--push_to_hub` flag. + +```bash +huggingface-cli login +``` + +Now we can start training! + +```bash +accelerate launch train_text_to_image_lora_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --pretrained_vae_model_name_or_path=$VAE_NAME \ + --dataset_name=$DATASET_NAME --caption_column="text" \ + --resolution=1024 --random_flip \ + --train_batch_size=1 \ + --num_train_epochs=2 --checkpointing_steps=500 \ + --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \ + --mixed_precision="fp16" \ + --seed=42 \ + --output_dir="sd-pokemon-model-lora-sdxl" \ + --validation_prompt="cute dragon creature" --report_to="wandb" \ + --push_to_hub +``` + +The above command will also run inference as fine-tuning progresses and log the results to Weights and Biases. + +**Notes**: + +* SDXL's VAE is known to suffer from numerical instability issues. This is why we also expose a CLI argument namely `--pretrained_vae_model_name_or_path` that lets you specify the location of a better VAE (such as [this one](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). + + +### Using DeepSpeed +Using DeepSpeed one can reduce the consumption of GPU memory, enabling the training of models on GPUs with smaller memory sizes. DeepSpeed is capable of offloading model parameters to the machine's memory, or it can distribute parameters, gradients, and optimizer states across multiple GPUs. This allows for the training of larger models under the same hardware configuration. + +First, you need to use the `accelerate config` command to choose to use DeepSpeed, or manually use the accelerate config file to set up DeepSpeed. + +Here is an example of a config file for using DeepSpeed. For more detailed explanations of the configuration, you can refer to this [link](https://huggingface.co/docs/accelerate/usage_guides/deepspeed). +```yaml +compute_environment: LOCAL_MACHINE +debug: true +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 1.0 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: false + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: fp16 +num_machines: 1 +num_processes: 1 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` +You need to save the mentioned configuration as an `accelerate_config.yaml` file. Then, you need to input the path of your `accelerate_config.yaml` file into the `ACCELERATE_CONFIG_FILE` parameter. This way you can use DeepSpeed to train your SDXL model in LoRA. Additionally, you can use DeepSpeed to train other SD models in this way. 
+ +```shell +export MODEL_NAME="stabilityai/stable-diffusion-xl-base-1.0" +export VAE_NAME="madebyollin/sdxl-vae-fp16-fix" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" +export ACCELERATE_CONFIG_FILE="your accelerate_config.yaml" + +accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lora_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --pretrained_vae_model_name_or_path=$VAE_NAME \ + --dataset_name=$DATASET_NAME --caption_column="text" \ + --resolution=1024 \ + --train_batch_size=1 \ + --num_train_epochs=2 \ + --checkpointing_steps=2 \ + --learning_rate=1e-04 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --mixed_precision="fp16" \ + --max_train_steps=20 \ + --validation_epochs=20 \ + --seed=1234 \ + --output_dir="sd-pokemon-model-lora-sdxl" \ + --validation_prompt="cute dragon creature" + +``` + + +### Finetuning the text encoder and UNet + +The script also allows you to finetune the `text_encoder` along with the `unet`. + +🚨 Training the text encoder requires additional memory. + +Pass the `--train_text_encoder` argument to the training script to enable finetuning the `text_encoder` and `unet`: + +```bash +accelerate launch train_text_to_image_lora_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME --caption_column="text" \ + --resolution=1024 --random_flip \ + --train_batch_size=1 \ + --num_train_epochs=2 --checkpointing_steps=500 \ + --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \ + --seed=42 \ + --output_dir="sd-pokemon-model-lora-sdxl-txt" \ + --train_text_encoder \ + --validation_prompt="cute dragon creature" --report_to="wandb" \ + --push_to_hub +``` + +### Inference + +Once you have trained a model using above command, the inference can be done simply using the `DiffusionPipeline` after loading the trained LoRA weights. You +need to pass the `output_dir` for loading the LoRA weights which, in this case, is `sd-pokemon-model-lora-sdxl`. + +```python +from diffusers import DiffusionPipeline +import torch + +model_path = "takuoko/sd-pokemon-model-lora-sdxl" +pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16) +pipe.to("cuda") +pipe.load_lora_weights(model_path) + +prompt = "A pokemon with green eyes and red legs." 
+image = pipe(prompt, num_inference_steps=30, guidance_scale=7.5).images[0] +image.save("pokemon.png") +``` diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/default_config.yaml b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/default_config.yaml new file mode 100644 index 000000000..829e0b662 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/default_config.yaml @@ -0,0 +1,20 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + gradient_accumulation_steps: 1 + steps_per_print: 1 + zero3_init_flag: true + zero_stage: 0 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: fp16 +num_machines: 1 +num_processes: 16 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements.txt b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements.txt new file mode 100644 index 000000000..4e079f18c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements.txt @@ -0,0 +1,6 @@ +accelerate>=0.16.0 +datasets +ftfy +tensorboard +Jinja2 +peft==0.7.0 diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_flax.txt b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_flax.txt new file mode 100644 index 000000000..b6eb64e25 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_flax.txt @@ -0,0 +1,9 @@ +transformers>=4.25.1 +datasets +flax +optax +torch +torchvision +ftfy +tensorboard +Jinja2 diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_sdxl.txt b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_sdxl.txt new file mode 100644 index 000000000..64cbc9205 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_sdxl.txt @@ -0,0 +1,8 @@ +accelerate>=0.22.0 +torchvision +transformers>=4.25.1 +ftfy +tensorboard +Jinja2 +datasets +peft==0.7.0 \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/single_config.yaml b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/single_config.yaml new file mode 100644 index 000000000..a6d6d2e40 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/single_config.yaml @@ -0,0 +1,20 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + gradient_accumulation_steps: 1 + steps_per_print: 1 + zero3_init_flag: true + zero_stage: 0 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: fp16 +num_machines: 1 +num_processes: 1 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image.py new file mode 100644 index 000000000..6231a89b1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import shutil +import sys +import tempfile + +from diffusers import DiffusionPipeline, UNet2DConditionModel # noqa: E402 + + +sys.path.append("..") +from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402 + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() +stream_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stream_handler) + + +class TextToImage(ExamplesTestsAccelerate): + def test_text_to_image(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json"))) + + def test_text_to_image_checkpointing(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # check can run an intermediate checkpoint + unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet") + pipe = 
DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming + shutil.rmtree(os.path.join(tmpdir, "checkpoint-2")) + + # Run training script for 2 total steps resuming from checkpoint 4 + + resume_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=1 + --resume_from_checkpoint=checkpoint-4 + --seed=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + # check can run new fully trained pipeline + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # no checkpoint-2 -> check old checkpoints do not exist + # check new checkpoints exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-5"}, + ) + + def test_text_to_image_checkpointing_use_ema(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --use_ema + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # check can run an intermediate checkpoint + unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet") + pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming + shutil.rmtree(os.path.join(tmpdir, "checkpoint-2")) + + # Run training script for 2 total steps resuming from checkpoint 4 + + resume_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=1 + --resume_from_checkpoint=checkpoint-4 + --use_ema + --seed=0 + """.split() + + run_command(self._launch_args + 
resume_run_args) + + # check can run new fully trained pipeline + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # no checkpoint-2 -> check old checkpoints do not exist + # check new checkpoints exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-5"}, + ) + + def test_text_to_image_checkpointing_checkpoints_total_limit(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + # checkpoint-2 should have been deleted + self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"}) + + def test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # resume and we should try to checkpoint at 6, where we'll have to remove + # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint + + resume_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 8 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + 
--resume_from_checkpoint=checkpoint-4 + --checkpoints_total_limit=2 + --seed=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8"}, + ) + + +class TextToImageSDXL(ExamplesTestsAccelerate): + def test_text_to_image_sdxl(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/text_to_image/train_text_to_image_sdxl.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json"))) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image_lora.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image_lora.py new file mode 100644 index 000000000..4604b9f52 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image_lora.py @@ -0,0 +1,300 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
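+
+# These tests run the LoRA text-to-image training scripts (SD and SDXL variants)
+# end to end against tiny test pipelines from the hf-internal-testing organization,
+# exercising checkpoint rotation via `checkpoints_total_limit`, resuming from a
+# checkpoint, and the contents of the saved `pytorch_lora_weights.safetensors` file.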
+ +import logging +import os +import sys +import tempfile + +import safetensors + +from diffusers import DiffusionPipeline # noqa: E402 + + +sys.path.append("..") +from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402 + + +logging.basicConfig(level=logging.DEBUG) + +logger = logging.getLogger() +stream_handler = logging.StreamHandler(sys.stdout) +logger.addHandler(stream_handler) + + +class TextToImageLoRA(ExamplesTestsAccelerate): + def test_text_to_image_lora_sdxl_checkpointing_checkpoints_total_limit(self): + prompt = "a prompt" + pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/text_to_image/train_text_to_image_lora_sdxl.py + --pretrained_model_name_or_path {pipeline_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(pipeline_path) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + # checkpoint-2 should have been deleted + self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"}) + + def test_text_to_image_lora_checkpointing_checkpoints_total_limit(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/text_to_image/train_text_to_image_lora.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + --seed=0 + --num_validation_images=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None + ) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + # checkpoint-2 should have been deleted + self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"}) + + def test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 4, checkpointing_steps == 2 + # Should create 
checkpoints at steps 2, 4 + + initial_run_args = f""" + examples/text_to_image/train_text_to_image_lora.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --seed=0 + --num_validation_images=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None + ) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) + + # resume and we should try to checkpoint at 6, where we'll have to remove + # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint + + resume_run_args = f""" + examples/text_to_image/train_text_to_image_lora.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 8 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-4 + --checkpoints_total_limit=2 + --seed=0 + --num_validation_images=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None + ) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8"}, + ) + + +class TextToImageLoRASDXL(ExamplesTestsAccelerate): + def test_text_to_image_lora_sdxl(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/text_to_image/train_text_to_image_lora_sdxl.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))) + + # make sure the state_dict has the correct naming in the parameters. 
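+            # Every key of the saved LoRA state dict is expected to contain the substring "lora".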
+ lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")) + is_lora = all("lora" in k for k in lora_state_dict.keys()) + self.assertTrue(is_lora) + + def test_text_to_image_lora_sdxl_with_text_encoder(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/text_to_image/train_text_to_image_lora_sdxl.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 2 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --train_text_encoder + """.split() + + run_command(self._launch_args + test_args) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))) + + # make sure the state_dict has the correct naming in the parameters. + lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")) + is_lora = all("lora" in k for k in lora_state_dict.keys()) + self.assertTrue(is_lora) + + # when not training the text encoder, all the parameters in the state dict should start + # with `"unet"` or `"text_encoder"` or `"text_encoder_2"` in their names. + keys = lora_state_dict.keys() + starts_with_unet = all( + k.startswith("unet") or k.startswith("text_encoder") or k.startswith("text_encoder_2") for k in keys + ) + self.assertTrue(starts_with_unet) + + def test_text_to_image_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self): + prompt = "a prompt" + pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/text_to_image/train_text_to_image_lora_sdxl.py + --pretrained_model_name_or_path {pipeline_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 6 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --train_text_encoder + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(pipeline_path) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=1) + + # check checkpoint directories exist + # checkpoint-2 should have been deleted + self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"}) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image.py new file mode 100644 index 000000000..2052fb6d6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image.py @@ -0,0 +1,1137 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import math +import os +import random +import shutil +from pathlib import Path + +import accelerate +import datasets +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.state import AcceleratorState +from accelerate.utils import ProjectConfiguration, set_seed +from datasets import load_dataset +from huggingface_hub import create_repo, upload_folder +from packaging import version +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer +from transformers.utils import ContextManagers + +import diffusers +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel +from diffusers.optimization import get_scheduler +from diffusers.training_utils import EMAModel, compute_snr +from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module + + +if is_wandb_available(): + import wandb + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.27.0") + +logger = get_logger(__name__, log_level="INFO") + +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} + + +def save_model_card( + args, + repo_id: str, + images: list = None, + repo_folder: str = None, +): + img_str = "" + if len(images) > 0: + image_grid = make_image_grid(images, 1, len(args.validation_prompts)) + image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png")) + img_str += "![val_imgs_grid](./val_imgs_grid.png)\n" + + model_description = f""" +# Text-to-image finetuning - {repo_id} + +This pipeline was finetuned from **{args.pretrained_model_name_or_path}** on the **{args.dataset_name}** dataset. 
Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n +{img_str} + +## Pipeline usage + +You can use the pipeline like so: + +```python +from diffusers import DiffusionPipeline +import torch + +pipeline = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype=torch.float16) +prompt = "{args.validation_prompts[0]}" +image = pipeline(prompt).images[0] +image.save("my_image.png") +``` + +## Training info + +These are the key hyperparameters used during training: + +* Epochs: {args.num_train_epochs} +* Learning rate: {args.learning_rate} +* Batch size: {args.train_batch_size} +* Gradient accumulation steps: {args.gradient_accumulation_steps} +* Image resolution: {args.resolution} +* Mixed-precision: {args.mixed_precision} + +""" + wandb_info = "" + if is_wandb_available(): + wandb_run_url = None + if wandb.run is not None: + wandb_run_url = wandb.run.url + + if wandb_run_url is not None: + wandb_info = f""" +More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}). +""" + + model_description += wandb_info + + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=args.pretrained_model_name_or_path, + model_description=model_description, + inference=True, + ) + + tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "diffusers-training"] + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) + + +def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight_dtype, epoch): + logger.info("Running validation... ") + + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=accelerator.unwrap_model(vae), + text_encoder=accelerator.unwrap_model(text_encoder), + tokenizer=tokenizer, + unet=accelerator.unwrap_model(unet), + safety_checker=None, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + if args.enable_xformers_memory_efficient_attention: + pipeline.enable_xformers_memory_efficient_attention() + + if args.seed is None: + generator = None + else: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + + images = [] + for i in range(len(args.validation_prompts)): + with torch.autocast("cuda"): + image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0] + + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + elif tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}") + for i, image in enumerate(images) + ] + } + ) + else: + logger.warning(f"image logging not implemented for {tracker.name}") + + del pipeline + torch.cuda.empty_cache() + + return images + + +def parse_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1." 
+ ) + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing an image." + ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ), + ) + parser.add_argument( + "--validation_prompts", + type=str, + default=None, + nargs="+", + help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."), + ) + parser.add_argument( + "--output_dir", + type=str, + default="sd-model-finetuned", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--snr_gamma", + type=float, + default=None, + help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " + "More details here: https://arxiv.org/abs/2303.09556.", + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument( + "--non_ema_revision", + type=str, + default=None, + required=False, + help=( + "Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or" + " remote repository specified with --pretrained_model_name_or_path." + ), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=16, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--prediction_type", + type=str, + default=None, + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. 
If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + ) + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") + parser.add_argument( + "--validation_epochs", + type=int, + default=5, + help="Run validation every X epochs.", + ) + parser.add_argument( + "--tracker_project_name", + type=str, + default="text2image-fine-tune", + help=( + "The `project_name` argument passed to Accelerator.init_trackers for" + " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" + ), + ) + parser.add_argument( + "--NHWC", + action="store_true", + help="Whether or not using NHWC for training", + ) + parser.add_argument( + "--apex_fused_adam", + action="store_true", + help="Whether or not using fused_adam optimizer", + ) + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Need either a dataset name or a training folder.") + + # default to using the same revision for the non-ema model if not specified + if args.non_ema_revision is None: + args.non_ema_revision = args.revision + + return args + + +def main(): + args = parse_args() + + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub." + ) + + if args.non_ema_revision is not None: + deprecate( + "non_ema_revision!=None", + "0.15.0", + message=( + "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to" + " use `--variant=non_ema` instead." + ), + ) + logging_dir = os.path.join(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + # Load scheduler, tokenizer and models. 
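+    # The scheduler and tokenizer come from the `scheduler` and `tokenizer` subfolders of
+    # the pretrained checkpoint; the text encoder, VAE and UNet are loaded further below,
+    # with only the UNet left trainable.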
+ noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + tokenizer = CLIPTokenizer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision + ) + + def deepspeed_zero_init_disabled_context_manager(): + """ + returns either a context list that includes one that will disable zero.Init or an empty context list + """ + deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None + if deepspeed_plugin is None: + return [] + + return [deepspeed_plugin.zero3_init_context_manager(enable=False)] + + # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3. + # For this to work properly all models must be run through `accelerate.prepare`. But accelerate + # will try to assign the same optimizer with the same weights to all models during + # `deepspeed.initialize`, which of course doesn't work. + # + # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2 + # frozen models from being partitioned during `zero.Init` which gets called during + # `from_pretrained` So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding + # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded. + with ContextManagers(deepspeed_zero_init_disabled_context_manager()): + text_encoder = CLIPTextModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant + ) + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant + ) + + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision + ) + # Freeze vae and text_encoder and set unet to trainable + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + unet.train() + + # Create EMA for the unet. + if args.use_ema: + ema_unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant + ) + ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warning( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. 
Make sure it is installed correctly") + + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + def load_model_hook(models, input_dir): + if args.use_ema: + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel) + ema_unet.load_state_dict(load_model.state_dict()) + ema_unet.to(accelerator.device) + del load_model + + for _ in range(len(models)): + # pop models so that they are not loaded again + model = models.pop() + + # load diffusers style into model + load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet") + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Initialize the optimizer + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`" + ) + + optimizer_cls = bnb.optim.AdamW8bit + elif args.apex_fused_adam: + import apex + optimizer_cls = apex.optimizers.FusedAdam + + else: + optimizer_cls = torch.optim.AdamW + + optimizer = optimizer_cls( + unet.parameters(), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + cache_dir=args.cache_dir, + data_dir=args.train_data_dir, + ) + else: + data_files = {} + if args.train_data_dir is not None: + data_files["train"] = os.path.join(args.train_data_dir, "**") + dataset = load_dataset( + "imagefolder", + data_files=data_files, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset["train"].column_names + + # 6. Get the column names for input/target. 
+ dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) + if args.image_column is None: + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" + ) + if args.caption_column is None: + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" + ) + + # Preprocessing the datasets. + # We need to tokenize input captions and transform the images. + def tokenize_captions(examples, is_train=True): + captions = [] + for caption in examples[caption_column]: + if isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + else: + raise ValueError( + f"Caption column `{caption_column}` should contain either strings or lists of strings." + ) + inputs = tokenizer( + captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt" + ) + return inputs.input_ids + + # Preprocessing the datasets. + train_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_column]] + examples["pixel_values"] = [train_transforms(image) for image in images] + examples["input_ids"] = tokenize_captions(examples) + return examples + + with accelerator.main_process_first(): + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + train_dataset = dataset["train"].with_transform(preprocess_train) + + # for testing ips + from datasets import concatenate_datasets + train_dataset = concatenate_datasets([train_dataset for i in range(10)]) + + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + input_ids = torch.stack([example["input_ids"] for example in examples]) + return {"pixel_values": pixel_values, "input_ids": input_ids} + + # DataLoaders creation: + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + collate_fn=collate_fn, + batch_size=args.train_batch_size, + num_workers=args.dataloader_num_workers, + pin_memory=True, + prefetch_factor = 2 + ) + + # Scheduler and math around the number of training steps. 
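+    # If --max_train_steps is not set it is derived as
+    #   num_train_epochs * ceil(len(train_dataloader) / gradient_accumulation_steps)
+    # and recomputed after `accelerator.prepare`, since sharding can change the dataloader length.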
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) + + # Prepare everything with our `accelerator`. + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + if args.use_ema: + ema_unet.to(accelerator.device) + + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + args.mixed_precision = accelerator.mixed_precision + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + args.mixed_precision = accelerator.mixed_precision + + # Move text_encode and vae to gpu and cast to weight_dtype + text_encoder.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + tracker_config = dict(vars(args)) + tracker_config.pop("validation_prompts") + accelerator.init_trackers(args.tracker_project_name, tracker_config) + + # Function for unwrapping if model was compiled with `torch.compile`. + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + initial_global_step = 0 + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + initial_global_step = global_step + first_epoch = global_step // num_update_steps_per_epoch + + else: + initial_global_step = 0 + + progress_bar = tqdm( + range(0, args.max_train_steps), + initial=initial_global_step, + desc="Steps", + # Only show the progress bar once on each machine. + disable=not accelerator.is_local_main_process, + ) + + if args.NHWC: + unet = unet.to(memory_format=torch.channels_last) + vae = vae.to(memory_format=torch.channels_last) + + import time + for epoch in range(first_epoch, args.num_train_epochs): + train_loss = 0.0 + iter_start = time.time() + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(unet): + # Convert images to latent space + if args.NHWC: + batch["pixel_values"] = batch["pixel_values"].to(memory_format=torch.channels_last) + latents = vae.encode(batch["pixel_values"].to(weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (latents.shape[0], latents.shape[1], 1, 1), device=latents.device + ) + if args.input_perturbation: + new_noise = noise + args.input_perturbation * torch.randn_like(noise) + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + if args.input_perturbation: + noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps) + else: + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0] + + # Get the target for loss depending on the prediction type + if args.prediction_type is not None: + # set prediction_type of scheduler if defined + noise_scheduler.register_to_config(prediction_type=args.prediction_type) + + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type 
{noise_scheduler.config.prediction_type}") + + # Predict the noise residual and compute loss + if args.NHWC: + noisy_latents = noisy_latents.to(memory_format=torch.channels_last) + # timesteps = timesteps.to(memory_format=torch.channels_last) + # encoder_hidden_states = encoder_hidden_states.to(memory_format=torch.channels_last) + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0] + + if args.snr_gamma is None: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + else: + # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. + # Since we predict the noise instead of x_0, the original formulation is slightly changed. + # This is discussed in Section 4.2 of the same paper. + snr = compute_snr(noise_scheduler, timesteps) + mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min( + dim=1 + )[0] + if noise_scheduler.config.prediction_type == "epsilon": + mse_loss_weights = mse_loss_weights / snr + elif noise_scheduler.config.prediction_type == "v_prediction": + mse_loss_weights = mse_loss_weights / (snr + 1) + + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = loss.mean() + + # 检查 loss 是否正常,不正常的话退出并返回 -1 + if loss.isnan().any() or loss.isinf().any(): + import sys + sys.exit(1) + + # Gather the losses across all processes for logging (if we use distributed training). + avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + train_loss += avg_loss.item() / args.gradient_accumulation_steps + + # Backpropagate + accelerator.backward(loss) + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + if args.use_ema: + ema_unet.step(unet.parameters()) + progress_bar.update(1) + global_step += 1 + accelerator.log({"train_loss": train_loss}, step=global_step) + train_loss = 0.0 + iter_elapse = time.time() - iter_start + iter_start = time.time() + ips_per_device = total_batch_size / iter_elapse / accelerator.num_processes + ips_per_gpu = ips_per_device * 2 + + if global_step % args.checkpointing_steps == 0: + if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + + if args.NHWC: + origin_model = accelerator._models[0] + 
accelerator._models[0] = origin_model.to(memory_format=torch.contiguous_format) + accelerator.save_state(save_path) + accelerator._models[0] = origin_model.to(memory_format=torch.channels_last) + else: + accelerator.save_state(save_path) + + logger.info(f"Saved state to {save_path}") + + logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], + "ips_per_device": ips_per_device, "ips_per_gpu": ips_per_gpu} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompts is not None and epoch % args.validation_epochs == 0: + if args.use_ema: + # Store the UNet parameters temporarily and load the EMA parameters to perform inference. + ema_unet.store(unet.parameters()) + ema_unet.copy_to(unet.parameters()) + log_validation( + vae, + text_encoder, + tokenizer, + unet, + args, + accelerator, + weight_dtype, + global_step, + ) + if args.use_ema: + # Switch back to the original UNet parameters. + ema_unet.restore(unet.parameters()) + + # Create the pipeline using the trained modules and save it. + accelerator.wait_for_everyone() + if accelerator.is_main_process: + unet = unwrap_model(unet) + if args.use_ema: + ema_unet.copy_to(unet.parameters()) + + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + text_encoder=text_encoder, + vae=vae.to(memory_format=torch.contiguous_format) if args.NHWC else vae, + unet=unet.to(memory_format=torch.contiguous_format) if args.NHWC else unet, + revision=args.revision, + variant=args.variant, + ) + pipeline.save_pretrained(args.output_dir) + + # Run a final round of inference. + images = [] + if args.validation_prompts is not None: + logger.info("Running inference for collecting generated images...") + pipeline = pipeline.to(accelerator.device) + pipeline.torch_dtype = weight_dtype + pipeline.set_progress_bar_config(disable=True) + + if args.enable_xformers_memory_efficient_attention: + pipeline.enable_xformers_memory_efficient_attention() + + if args.seed is None: + generator = None + else: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + + for i in range(len(args.validation_prompts)): + with torch.autocast("cuda"): + image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0] + images.append(image) + + if args.push_to_hub: + save_model_card(args, repo_id, images, repo_folder=args.output_dir) + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + accelerator.end_training() + + +if __name__ == "__main__": + main() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_flax.py new file mode 100644 index 000000000..1386ccb04 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_flax.py @@ -0,0 +1,620 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import math +import os +import random +from pathlib import Path + +import jax +import jax.numpy as jnp +import numpy as np +import optax +import torch +import torch.utils.checkpoint +import transformers +from datasets import load_dataset +from flax import jax_utils +from flax.training import train_state +from flax.training.common_utils import shard +from huggingface_hub import create_repo, upload_folder +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel, set_seed + +from diffusers import ( + FlaxAutoencoderKL, + FlaxDDPMScheduler, + FlaxPNDMScheduler, + FlaxStableDiffusionPipeline, + FlaxUNet2DConditionModel, +) +from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionSafetyChecker +from diffusers.utils import check_min_version + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.27.0") + +logger = logging.getLogger(__name__) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing an image." + ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." 
+ ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="sd-model-finetuned", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=0, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' 
+ ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose" + "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10." + "and an Nvidia Ampere GPU." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--from_pt", + action="store_true", + default=False, + help="Flag to indicate whether to convert models from PyTorch.", + ) + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Need either a dataset name or a training folder.") + + return args + + +dataset_name_mapping = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} + + +def get_params_to_save(params): + return jax.device_get(jax.tree_util.tree_map(lambda x: x[0], params)) + + +def main(): + args = parse_args() + + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub." + ) + + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + # Setup logging, we only want one process per machine to log things on the screen. + logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR) + if jax.process_index() == 0: + transformers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if jax.process_index() == 0: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir + ) + else: + data_files = {} + if args.train_data_dir is not None: + data_files["train"] = os.path.join(args.train_data_dir, "**") + dataset = load_dataset( + "imagefolder", + data_files=data_files, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset["train"].column_names + + # 6. Get the column names for input/target. 
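+    # Same column resolution as the PyTorch script: known datasets are looked up in
+    # dataset_name_mapping, otherwise --image_column / --caption_column are validated
+    # against the dataset's column names.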
+ dataset_columns = dataset_name_mapping.get(args.dataset_name, None) + if args.image_column is None: + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" + ) + if args.caption_column is None: + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" + ) + + # Preprocessing the datasets. + # We need to tokenize input captions and transform the images. + def tokenize_captions(examples, is_train=True): + captions = [] + for caption in examples[caption_column]: + if isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + else: + raise ValueError( + f"Caption column `{caption_column}` should contain either strings or lists of strings." + ) + inputs = tokenizer(captions, max_length=tokenizer.model_max_length, padding="do_not_pad", truncation=True) + input_ids = inputs.input_ids + return input_ids + + train_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_column]] + examples["pixel_values"] = [train_transforms(image) for image in images] + examples["input_ids"] = tokenize_captions(examples) + + return examples + + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + train_dataset = dataset["train"].with_transform(preprocess_train) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + input_ids = [example["input_ids"] for example in examples] + + padded_tokens = tokenizer.pad( + {"input_ids": input_ids}, padding="max_length", max_length=tokenizer.model_max_length, return_tensors="pt" + ) + batch = { + "pixel_values": pixel_values, + "input_ids": padded_tokens.input_ids, + } + batch = {k: v.numpy() for k, v in batch.items()} + + return batch + + total_train_batch_size = args.train_batch_size * jax.local_device_count() + train_dataloader = torch.utils.data.DataLoader( + train_dataset, shuffle=True, collate_fn=collate_fn, batch_size=total_train_batch_size, drop_last=True + ) + + weight_dtype = jnp.float32 + if args.mixed_precision == "fp16": + weight_dtype = jnp.float16 + elif args.mixed_precision == "bf16": + weight_dtype = jnp.bfloat16 + + # Load models and create wrapper for stable diffusion + tokenizer = CLIPTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + from_pt=args.from_pt, + revision=args.revision, + subfolder="tokenizer", + ) + text_encoder = 
FlaxCLIPTextModel.from_pretrained( + args.pretrained_model_name_or_path, + from_pt=args.from_pt, + revision=args.revision, + subfolder="text_encoder", + dtype=weight_dtype, + ) + vae, vae_params = FlaxAutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, + from_pt=args.from_pt, + revision=args.revision, + subfolder="vae", + dtype=weight_dtype, + ) + unet, unet_params = FlaxUNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, + from_pt=args.from_pt, + revision=args.revision, + subfolder="unet", + dtype=weight_dtype, + ) + + # Optimization + if args.scale_lr: + args.learning_rate = args.learning_rate * total_train_batch_size + + constant_scheduler = optax.constant_schedule(args.learning_rate) + + adamw = optax.adamw( + learning_rate=constant_scheduler, + b1=args.adam_beta1, + b2=args.adam_beta2, + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + + optimizer = optax.chain( + optax.clip_by_global_norm(args.max_grad_norm), + adamw, + ) + + state = train_state.TrainState.create(apply_fn=unet.__call__, params=unet_params, tx=optimizer) + + noise_scheduler = FlaxDDPMScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000 + ) + noise_scheduler_state = noise_scheduler.create_state() + + # Initialize our training + rng = jax.random.PRNGKey(args.seed) + train_rngs = jax.random.split(rng, jax.local_device_count()) + + def train_step(state, text_encoder_params, vae_params, batch, train_rng): + dropout_rng, sample_rng, new_train_rng = jax.random.split(train_rng, 3) + + def compute_loss(params): + # Convert images to latent space + vae_outputs = vae.apply( + {"params": vae_params}, batch["pixel_values"], deterministic=True, method=vae.encode + ) + latents = vae_outputs.latent_dist.sample(sample_rng) + # (NHWC) -> (NCHW) + latents = jnp.transpose(latents, (0, 3, 1, 2)) + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise_rng, timestep_rng = jax.random.split(sample_rng) + noise = jax.random.normal(noise_rng, latents.shape) + # Sample a random timestep for each image + bsz = latents.shape[0] + timesteps = jax.random.randint( + timestep_rng, + (bsz,), + 0, + noise_scheduler.config.num_train_timesteps, + ) + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(noise_scheduler_state, latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder( + batch["input_ids"], + params=text_encoder_params, + train=False, + )[0] + + # Predict the noise residual and compute loss + model_pred = unet.apply( + {"params": params}, noisy_latents, timesteps, encoder_hidden_states, train=True + ).sample + + # Get the target for loss depending on the prediction type + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(noise_scheduler_state, latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + loss = (target - model_pred) ** 2 + loss = loss.mean() + + return loss + + grad_fn = jax.value_and_grad(compute_loss) + loss, grad = grad_fn(state.params) + grad = jax.lax.pmean(grad, "batch") + + new_state = state.apply_gradients(grads=grad) + + metrics = {"loss": loss} + metrics = jax.lax.pmean(metrics, axis_name="batch") + + 
return new_state, metrics, new_train_rng + + # Create parallel version of the train step + p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,)) + + # Replicate the train state on each device + state = jax_utils.replicate(state) + text_encoder_params = jax_utils.replicate(text_encoder.params) + vae_params = jax_utils.replicate(vae_params) + + # Train! + num_update_steps_per_epoch = math.ceil(len(train_dataloader)) + + # Scheduler and math around the number of training steps. + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel & distributed) = {total_train_batch_size}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + + global_step = 0 + + epochs = tqdm(range(args.num_train_epochs), desc="Epoch ... ", position=0) + for epoch in epochs: + # ======================== Training ================================ + + train_metrics = [] + + steps_per_epoch = len(train_dataset) // total_train_batch_size + train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False) + # train + for batch in train_dataloader: + batch = shard(batch) + state, train_metric, train_rngs = p_train_step(state, text_encoder_params, vae_params, batch, train_rngs) + train_metrics.append(train_metric) + + train_step_progress_bar.update(1) + + global_step += 1 + if global_step >= args.max_train_steps: + break + + train_metric = jax_utils.unreplicate(train_metric) + + train_step_progress_bar.close() + epochs.write(f"Epoch... ({epoch + 1}/{args.num_train_epochs} | Loss: {train_metric['loss']})") + + # Create the pipeline using using the trained modules and save it. 
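+    # Only JAX process 0 assembles the FlaxStableDiffusionPipeline (with a fresh PNDM
+    # scheduler and the CompVis safety checker) and writes the unreplicated parameters
+    # to --output_dir, optionally pushing them to the Hub.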
+ if jax.process_index() == 0: + scheduler = FlaxPNDMScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True + ) + safety_checker = FlaxStableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker", from_pt=True + ) + pipeline = FlaxStableDiffusionPipeline( + text_encoder=text_encoder, + vae=vae, + unet=unet, + tokenizer=tokenizer, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32"), + ) + + pipeline.save_pretrained( + args.output_dir, + params={ + "text_encoder": get_params_to_save(text_encoder_params), + "vae": get_params_to_save(vae_params), + "unet": get_params_to_save(state.params), + "safety_checker": safety_checker.params, + }, + ) + + if args.push_to_hub: + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + +if __name__ == "__main__": + main() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora.py new file mode 100644 index 000000000..97f258ba7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora.py @@ -0,0 +1,976 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Fine-tuning script for Stable Diffusion for text2image with support for LoRA.""" + +import argparse +import logging +import math +import os +import random +import shutil +from pathlib import Path + +import datasets +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from datasets import load_dataset +from huggingface_hub import create_repo, upload_folder +from packaging import version +from peft import LoraConfig +from peft.utils import get_peft_model_state_dict +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + +import diffusers +from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, StableDiffusionPipeline, UNet2DConditionModel +from diffusers.optimization import get_scheduler +from diffusers.training_utils import cast_training_params, compute_snr +from diffusers.utils import check_min_version, convert_state_dict_to_diffusers, is_wandb_available +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.27.0") + +logger = get_logger(__name__, log_level="INFO") + + +def save_model_card( + repo_id: str, + images: list = None, + base_model: str = None, + dataset_name: str = None, + repo_folder: str = None, +): + img_str = "" + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + model_description = f""" +# LoRA text2image fine-tuning - {repo_id} +These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n +{img_str} +""" + + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=base_model, + model_description=model_description, + inference=True, + ) + + tags = [ + "stable-diffusion", + "stable-diffusion-diffusers", + "text-to-image", + "diffusers", + "diffusers-training", + "lora", + ] + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). 
It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing an image." + ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--validation_prompt", type=str, default=None, help="A prompt that is sampled during training for inference." + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_epochs", + type=int, + default=1, + help=( + "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." + ), + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="sd-model-finetuned-lora", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=512, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. 
If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--snr_gamma", + type=float, + default=None, + help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " + "More details here: https://arxiv.org/abs/2303.09556.", + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--prediction_type", + type=str, + default=None, + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + ) + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." 
+ ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") + parser.add_argument( + "--rank", + type=int, + default=4, + help=("The dimension of the LoRA update matrices."), + ) + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Need either a dataset name or a training folder.") + + return args + + +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} + + +def main(): + args = parse_args() + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub." + ) + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + import wandb + + # Make one log on every process with the configuration for debugging.
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + # Load scheduler, tokenizer and models. + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + tokenizer = CLIPTokenizer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision + ) + text_encoder = CLIPTextModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant + ) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant + ) + # freeze parameters of models to save more memory + unet.requires_grad_(False) + vae.requires_grad_(False) + text_encoder.requires_grad_(False) + + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Freeze the unet parameters before adding adapters + for param in unet.parameters(): + param.requires_grad_(False) + + unet_lora_config = LoraConfig( + r=args.rank, + lora_alpha=args.rank, + init_lora_weights="gaussian", + target_modules=["to_k", "to_q", "to_v", "to_out.0"], + ) + + # Move unet, vae and text_encoder to device and cast to weight_dtype + unet.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + text_encoder.to(accelerator.device, dtype=weight_dtype) + + # Add adapter and make sure the trainable params are in float32. + unet.add_adapter(unet_lora_config) + if args.mixed_precision == "fp16": + # only upcast trainable parameters (LoRA) into fp32 + cast_training_params(unet, dtype=torch.float32) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warning( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." 
+ ) + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + lora_layers = filter(lambda p: p.requires_grad, unet.parameters()) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Initialize the optimizer + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`" + ) + + optimizer_cls = bnb.optim.AdamW8bit + else: + optimizer_cls = torch.optim.AdamW + + optimizer = optimizer_cls( + lora_layers, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + cache_dir=args.cache_dir, + data_dir=args.train_data_dir, + ) + else: + data_files = {} + if args.train_data_dir is not None: + data_files["train"] = os.path.join(args.train_data_dir, "**") + dataset = load_dataset( + "imagefolder", + data_files=data_files, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset["train"].column_names + + # 6. Get the column names for input/target. + dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) + if args.image_column is None: + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" + ) + if args.caption_column is None: + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" + ) + + # Preprocessing the datasets. + # We need to tokenize input captions and transform the images. + def tokenize_captions(examples, is_train=True): + captions = [] + for caption in examples[caption_column]: + if isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + else: + raise ValueError( + f"Caption column `{caption_column}` should contain either strings or lists of strings." 
+ ) + inputs = tokenizer( + captions, max_length=tokenizer.model_max_length, padding="max_length", truncation=True, return_tensors="pt" + ) + return inputs.input_ids + + # Preprocessing the datasets. + train_transforms = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_column]] + examples["pixel_values"] = [train_transforms(image) for image in images] + examples["input_ids"] = tokenize_captions(examples) + return examples + + with accelerator.main_process_first(): + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + train_dataset = dataset["train"].with_transform(preprocess_train) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + input_ids = torch.stack([example["input_ids"] for example in examples]) + return {"pixel_values": pixel_values, "input_ids": input_ids} + + # DataLoaders creation: + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + collate_fn=collate_fn, + batch_size=args.train_batch_size, + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) + + # Prepare everything with our `accelerator`. + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("text2image-fine-tune", config=vars(args)) + + # Train! 
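# A minimal, self-contained sketch (not part of this patch) of the step/batch-size
# bookkeeping the training setup below relies on; the dataloader length and CLI
# values here are hypothetical placeholders.
import math

num_batches_per_epoch = 833        # hypothetical len(train_dataloader)
gradient_accumulation_steps = 4    # --gradient_accumulation_steps
train_batch_size = 16              # --train_batch_size (per device)
num_processes = 2                  # accelerator.num_processes
num_train_epochs = 100             # --num_train_epochs

# One optimizer update happens every `gradient_accumulation_steps` batches.
num_update_steps_per_epoch = math.ceil(num_batches_per_epoch / gradient_accumulation_steps)
# If --max_train_steps is not given, it is derived from the epoch count.
max_train_steps = num_train_epochs * num_update_steps_per_epoch
# Effective batch size seen by each optimizer update.
total_batch_size = train_batch_size * num_processes * gradient_accumulation_steps
print(num_update_steps_per_epoch, max_train_steps, total_batch_size)  # 209 20900 128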
+ total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + initial_global_step = 0 + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + initial_global_step = global_step + first_epoch = global_step // num_update_steps_per_epoch + else: + initial_global_step = 0 + + progress_bar = tqdm( + range(0, args.max_train_steps), + initial=initial_global_step, + desc="Steps", + # Only show the progress bar once on each machine. + disable=not accelerator.is_local_main_process, + ) + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + train_loss = 0.0 + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(unet): + # Convert images to latent space + latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample() + latents = latents * vae.config.scaling_factor + + # Sample noise that we'll add to the latents + noise = torch.randn_like(latents) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (latents.shape[0], latents.shape[1], 1, 1), device=latents.device + ) + + bsz = latents.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) + timesteps = timesteps.long() + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + + # Get the text embedding for conditioning + encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0] + + # Get the target for loss depending on the prediction type + if args.prediction_type is not None: + # set prediction_type of scheduler if defined + noise_scheduler.register_to_config(prediction_type=args.prediction_type) + + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(latents, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + # Predict the noise residual and compute loss + 
model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0] + + if args.snr_gamma is None: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + else: + # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. + # Since we predict the noise instead of x_0, the original formulation is slightly changed. + # This is discussed in Section 4.2 of the same paper. + snr = compute_snr(noise_scheduler, timesteps) + mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min( + dim=1 + )[0] + if noise_scheduler.config.prediction_type == "epsilon": + mse_loss_weights = mse_loss_weights / snr + elif noise_scheduler.config.prediction_type == "v_prediction": + mse_loss_weights = mse_loss_weights / (snr + 1) + + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = loss.mean() + + # Gather the losses across all processes for logging (if we use distributed training). + avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + train_loss += avg_loss.item() / args.gradient_accumulation_steps + + # Backpropagate + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = lora_layers + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + accelerator.log({"train_loss": train_loss}, step=global_step) + train_loss = 0.0 + + if global_step % args.checkpointing_steps == 0: + if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + + unwrapped_unet = unwrap_model(unet) + unet_lora_state_dict = convert_state_dict_to_diffusers( + get_peft_model_state_dict(unwrapped_unet) + ) + + StableDiffusionPipeline.save_lora_weights( + save_directory=save_path, + unet_lora_layers=unet_lora_state_dict, + safe_serialization=True, + ) + + logger.info(f"Saved state to {save_path}") + + logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: + logger.info( + f"Running validation... 
\n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." + ) + # create pipeline + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=unwrap_model(unet), + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device) + if args.seed is not None: + generator = generator.manual_seed(args.seed) + images = [] + with torch.cuda.amp.autocast(): + for _ in range(args.num_validation_images): + images.append( + pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0] + ) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + # Save the lora layers + accelerator.wait_for_everyone() + if accelerator.is_main_process: + unet = unet.to(torch.float32) + + unwrapped_unet = unwrap_model(unet) + unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unwrapped_unet)) + StableDiffusionPipeline.save_lora_weights( + save_directory=args.output_dir, + unet_lora_layers=unet_lora_state_dict, + safe_serialization=True, + ) + + if args.push_to_hub: + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_model_name_or_path, + dataset_name=args.dataset_name, + repo_folder=args.output_dir, + ) + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + # Final inference + # Load previous pipeline + if args.validation_prompt is not None: + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + pipeline = pipeline.to(accelerator.device) + + # load attention processors + pipeline.load_lora_weights(args.output_dir) + + # run inference + generator = torch.Generator(device=accelerator.device) + if args.seed is not None: + generator = generator.manual_seed(args.seed) + images = [] + with torch.cuda.amp.autocast(): + for _ in range(args.num_validation_images): + images.append( + pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0] + ) + + for tracker in accelerator.trackers: + if len(images) != 0: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "test": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + accelerator.end_training() + + +if __name__ == "__main__": + main() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py new file mode 100644 index 000000000..43e090758 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -0,0 +1,1317 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fine-tuning script for Stable Diffusion XL for text2image with support for LoRA.""" + +import argparse +import logging +import math +import os +import random +import shutil +from pathlib import Path + +import datasets +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed +from datasets import load_dataset +from huggingface_hub import create_repo, upload_folder +from packaging import version +from peft import LoraConfig, set_peft_model_state_dict +from peft.utils import get_peft_model_state_dict +from torchvision import transforms +from torchvision.transforms.functional import crop +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + StableDiffusionXLPipeline, + UNet2DConditionModel, +) +from diffusers.loaders import LoraLoaderMixin +from diffusers.optimization import get_scheduler +from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params, compute_snr +from diffusers.utils import ( + check_min_version, + convert_state_dict_to_diffusers, + convert_unet_state_dict_to_peft, + is_wandb_available, +) +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.27.0") + +logger = get_logger(__name__) + + +def save_model_card( + repo_id: str, + images: list = None, + base_model: str = None, + dataset_name: str = None, + train_text_encoder: bool = False, + repo_folder: str = None, + vae_path: str = None, +): + img_str = "" + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + model_description = f""" +# LoRA text2image fine-tuning - {repo_id} + +These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n +{img_str} + +LoRA for the text encoder was enabled: {train_text_encoder}. + +Special VAE used for training: {vae_path}. 
+""" + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=base_model, + model_description=model_description, + inference=True, + ) + + tags = [ + "stable-diffusion-xl", + "stable-diffusion-xl-diffusers", + "text-to-image", + "diffusers", + "diffusers-training", + "lora", + ] + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) + + +def import_model_class_from_model_name_or_path( + pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder" +): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, subfolder=subfolder, revision=revision + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "CLIPTextModelWithProjection": + from transformers import CLIPTextModelWithProjection + + return CLIPTextModelWithProjection + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--pretrained_vae_model_name_or_path", + type=str, + default=None, + help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing an image." 
+ ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_epochs", + type=int, + default=1, + help=( + "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." + ), + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="sd-model-finetuned-lora", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=1024, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_text_encoder", + action="store_true", + help="Whether to train the text encoder. If set, the text encoder should be float32 precision.", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' 
+ ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--snr_gamma", + type=float, + default=None, + help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " + "More details here: https://arxiv.org/abs/2303.09556.", + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--prediction_type", + type=str, + default=None, + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + ) + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. 
Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." + ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") + parser.add_argument( + "--rank", + type=int, + default=4, + help=("The dimension of the LoRA update matrices."), + ) + parser.add_argument( + "--debug_loss", + action="store_true", + help="Debug loss for each image, if filenames are available in the dataset", + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Need either a dataset name or a training folder.") + + return args + + +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} + + +def tokenize_prompt(tokenizer, prompt): + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + return text_input_ids + + +# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt +def encode_prompt(text_encoders, tokenizers, prompt, text_input_ids_list=None): + prompt_embeds_list = [] + + for i, text_encoder in enumerate(text_encoders): + if tokenizers is not None: + tokenizer = tokenizers[i] + text_input_ids = tokenize_prompt(tokenizer, prompt) + else: + assert text_input_ids_list is not None + text_input_ids = text_input_ids_list[i] + + prompt_embeds = text_encoder( + text_input_ids.to(text_encoder.device), output_hidden_states=True, return_dict=False + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds[-1][-2] + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1) + return prompt_embeds, pooled_prompt_embeds + + +def main(args): + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub."
+ ) + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + kwargs_handlers=[kwargs], + ) + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + import wandb + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + # Load the tokenizers + tokenizer_one = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + tokenizer_two = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer_2", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder classes + text_encoder_cls_one = import_model_class_from_model_name_or_path( + args.pretrained_model_name_or_path, args.revision + ) + text_encoder_cls_two = import_model_class_from_model_name_or_path( + args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2" + ) + + # Load scheduler and models + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + text_encoder_one = text_encoder_cls_one.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant + ) + text_encoder_two = text_encoder_cls_two.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant + ) + vae_path = ( + args.pretrained_model_name_or_path + if args.pretrained_vae_model_name_or_path is None + else args.pretrained_vae_model_name_or_path + ) + vae = AutoencoderKL.from_pretrained( + vae_path, + subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, + revision=args.revision, + variant=args.variant, + ) + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant + ) + + # We only train the additional adapter LoRA layers + vae.requires_grad_(False) + text_encoder_one.requires_grad_(False) + text_encoder_two.requires_grad_(False) + unet.requires_grad_(False) + + # For mixed precision 
training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move unet, vae and text_encoder to device and cast to weight_dtype + # The VAE is in float32 to avoid NaN losses. + unet.to(accelerator.device, dtype=weight_dtype) + + if args.pretrained_vae_model_name_or_path is None: + vae.to(accelerator.device, dtype=torch.float32) + else: + vae.to(accelerator.device, dtype=weight_dtype) + text_encoder_one.to(accelerator.device, dtype=weight_dtype) + text_encoder_two.to(accelerator.device, dtype=weight_dtype) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warning( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + # now we will add new LoRA weights to the attention layers + # Set correct lora layers + unet_lora_config = LoraConfig( + r=args.rank, + lora_alpha=args.rank, + init_lora_weights="gaussian", + target_modules=["to_k", "to_q", "to_v", "to_out.0"], + ) + + unet.add_adapter(unet_lora_config) + + # The text encoder comes from 🤗 transformers, we will also attach adapters to it. + if args.train_text_encoder: + # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16 + text_lora_config = LoraConfig( + r=args.rank, + lora_alpha=args.rank, + init_lora_weights="gaussian", + target_modules=["q_proj", "k_proj", "v_proj", "out_proj"], + ) + text_encoder_one.add_adapter(text_lora_config) + text_encoder_two.add_adapter(text_lora_config) + + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + # there are only two options here. 
Either are just the unet attn processor layers + # or there are the unet and text encoder attn layers + unet_lora_layers_to_save = None + text_encoder_one_lora_layers_to_save = None + text_encoder_two_lora_layers_to_save = None + + for model in models: + if isinstance(unwrap_model(model), type(unwrap_model(unet))): + unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model)) + elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))): + text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers( + get_peft_model_state_dict(model) + ) + elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_two))): + text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers( + get_peft_model_state_dict(model) + ) + else: + raise ValueError(f"unexpected save model: {model.__class__}") + + # make sure to pop weight so that corresponding model is not saved again + if weights: + weights.pop() + + StableDiffusionXLPipeline.save_lora_weights( + output_dir, + unet_lora_layers=unet_lora_layers_to_save, + text_encoder_lora_layers=text_encoder_one_lora_layers_to_save, + text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save, + ) + + def load_model_hook(models, input_dir): + unet_ = None + text_encoder_one_ = None + text_encoder_two_ = None + + while len(models) > 0: + model = models.pop() + + if isinstance(model, type(unwrap_model(unet))): + unet_ = model + elif isinstance(model, type(unwrap_model(text_encoder_one))): + text_encoder_one_ = model + elif isinstance(model, type(unwrap_model(text_encoder_two))): + text_encoder_two_ = model + else: + raise ValueError(f"unexpected save model: {model.__class__}") + + lora_state_dict, _ = LoraLoaderMixin.lora_state_dict(input_dir) + unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")} + unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict) + incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default") + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + if args.train_text_encoder: + _set_state_dict_into_text_encoder(lora_state_dict, prefix="text_encoder.", text_encoder=text_encoder_one_) + + _set_state_dict_into_text_encoder( + lora_state_dict, prefix="text_encoder_2.", text_encoder=text_encoder_two_ + ) + + # Make sure the trainable params are in float32. This is again needed since the base models + # are in `weight_dtype`. 
More details: + # https://github.com/huggingface/diffusers/pull/6514#discussion_r1449796804 + if args.mixed_precision == "fp16": + models = [unet_] + if args.train_text_encoder: + models.extend([text_encoder_one_, text_encoder_two_]) + cast_training_params(models, dtype=torch.float32) + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + if args.train_text_encoder: + text_encoder_one.gradient_checkpointing_enable() + text_encoder_two.gradient_checkpointing_enable() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Make sure the trainable params are in float32. + if args.mixed_precision == "fp16": + models = [unet] + if args.train_text_encoder: + models.extend([text_encoder_one, text_encoder_two]) + cast_training_params(models, dtype=torch.float32) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = list(filter(lambda p: p.requires_grad, unet.parameters())) + if args.train_text_encoder: + params_to_optimize = ( + params_to_optimize + + list(filter(lambda p: p.requires_grad, text_encoder_one.parameters())) + + list(filter(lambda p: p.requires_grad, text_encoder_two.parameters())) + ) + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, data_dir=args.train_data_dir + ) + else: + data_files = {} + if args.train_data_dir is not None: + data_files["train"] = os.path.join(args.train_data_dir, "**") + dataset = load_dataset( + "imagefolder", + data_files=data_files, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset["train"].column_names + + # 6. Get the column names for input/target. 
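# A minimal sketch (not part of this patch) of the column-resolution logic used just
# below: explicit --image_column/--caption_column values win, otherwise the
# DATASET_NAME_MAPPING entry is used, otherwise the first two dataset columns.
# The helper name `resolve_columns` is illustrative only.
def resolve_columns(column_names, dataset_columns=None, image_arg=None, caption_arg=None):
    image_column = image_arg or (dataset_columns[0] if dataset_columns else column_names[0])
    caption_column = caption_arg or (dataset_columns[1] if dataset_columns else column_names[1])
    if image_column not in column_names or caption_column not in column_names:
        raise ValueError(f"Columns must be among: {', '.join(column_names)}")
    return image_column, caption_column

# e.g. the pokemon captions dataset exposes "image" and "text" columns:
print(resolve_columns(["image", "text"], dataset_columns=("image", "text")))  # ('image', 'text')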
+ dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) + if args.image_column is None: + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" + ) + if args.caption_column is None: + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" + ) + + # Preprocessing the datasets. + # We need to tokenize input captions and transform the images. + def tokenize_captions(examples, is_train=True): + captions = [] + for caption in examples[caption_column]: + if isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + else: + raise ValueError( + f"Caption column `{caption_column}` should contain either strings or lists of strings." + ) + tokens_one = tokenize_prompt(tokenizer_one, captions) + tokens_two = tokenize_prompt(tokenizer_two, captions) + return tokens_one, tokens_two + + # Preprocessing the datasets. + train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR) + train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution) + train_flip = transforms.RandomHorizontalFlip(p=1.0) + train_transforms = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_column]] + # image aug + original_sizes = [] + all_images = [] + crop_top_lefts = [] + for image in images: + original_sizes.append((image.height, image.width)) + image = train_resize(image) + if args.random_flip and random.random() < 0.5: + # flip + image = train_flip(image) + if args.center_crop: + y1 = max(0, int(round((image.height - args.resolution) / 2.0))) + x1 = max(0, int(round((image.width - args.resolution) / 2.0))) + image = train_crop(image) + else: + y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) + image = crop(image, y1, x1, h, w) + crop_top_left = (y1, x1) + crop_top_lefts.append(crop_top_left) + image = train_transforms(image) + all_images.append(image) + + examples["original_sizes"] = original_sizes + examples["crop_top_lefts"] = crop_top_lefts + examples["pixel_values"] = all_images + tokens_one, tokens_two = tokenize_captions(examples) + examples["input_ids_one"] = tokens_one + examples["input_ids_two"] = tokens_two + if args.debug_loss: + fnames = [os.path.basename(image.filename) for image in examples[image_column] if image.filename] + if fnames: + examples["filenames"] = fnames + return examples + + with accelerator.main_process_first(): + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + train_dataset = dataset["train"].with_transform(preprocess_train, output_all_columns=True) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) + pixel_values = 
pixel_values.to(memory_format=torch.contiguous_format).float() + original_sizes = [example["original_sizes"] for example in examples] + crop_top_lefts = [example["crop_top_lefts"] for example in examples] + input_ids_one = torch.stack([example["input_ids_one"] for example in examples]) + input_ids_two = torch.stack([example["input_ids_two"] for example in examples]) + result = { + "pixel_values": pixel_values, + "input_ids_one": input_ids_one, + "input_ids_two": input_ids_two, + "original_sizes": original_sizes, + "crop_top_lefts": crop_top_lefts, + } + + filenames = [example["filenames"] for example in examples if "filenames" in example] + if filenames: + result["filenames"] = filenames + return result + + # DataLoaders creation: + train_dataloader = torch.utils.data.DataLoader( + train_dataset, + shuffle=True, + collate_fn=collate_fn, + batch_size=args.train_batch_size, + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + if args.train_text_encoder: + unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder_one, text_encoder_two, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("text2image-fine-tune", config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + initial_global_step = 0 + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + initial_global_step = global_step + first_epoch = global_step // num_update_steps_per_epoch + + else: + initial_global_step = 0 + + progress_bar = tqdm( + range(0, args.max_train_steps), + initial=initial_global_step, + desc="Steps", + # Only show the progress bar once on each machine. + disable=not accelerator.is_local_main_process, + ) + + for epoch in range(first_epoch, args.num_train_epochs): + unet.train() + if args.train_text_encoder: + text_encoder_one.train() + text_encoder_two.train() + train_loss = 0.0 + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(unet): + # Convert images to latent space + if args.pretrained_vae_model_name_or_path is not None: + pixel_values = batch["pixel_values"].to(dtype=weight_dtype) + else: + pixel_values = batch["pixel_values"] + + model_input = vae.encode(pixel_values).latent_dist.sample() + model_input = model_input * vae.config.scaling_factor + if args.pretrained_vae_model_name_or_path is None: + model_input = model_input.to(weight_dtype) + + # Sample noise that we'll add to the latents + noise = torch.randn_like(model_input) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device + ) + + bsz = model_input.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device + ) + timesteps = timesteps.long() + + # Add noise to the model input according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) + + # time ids + def compute_time_ids(original_size, crops_coords_top_left): + # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids + target_size = (args.resolution, args.resolution) + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids]) + add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype) + return add_time_ids + + add_time_ids = torch.cat( + [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])] + ) + + # Predict the noise residual + unet_added_conditions = {"time_ids": add_time_ids} + prompt_embeds, pooled_prompt_embeds = encode_prompt( + text_encoders=[text_encoder_one, 
text_encoder_two], + tokenizers=None, + prompt=None, + text_input_ids_list=[batch["input_ids_one"], batch["input_ids_two"]], + ) + unet_added_conditions.update({"text_embeds": pooled_prompt_embeds}) + model_pred = unet( + noisy_model_input, + timesteps, + prompt_embeds, + added_cond_kwargs=unet_added_conditions, + return_dict=False, + )[0] + + # Get the target for loss depending on the prediction type + if args.prediction_type is not None: + # set prediction_type of scheduler if defined + noise_scheduler.register_to_config(prediction_type=args.prediction_type) + + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(model_input, noise, timesteps) + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.snr_gamma is None: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + else: + # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. + # Since we predict the noise instead of x_0, the original formulation is slightly changed. + # This is discussed in Section 4.2 of the same paper. + snr = compute_snr(noise_scheduler, timesteps) + mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min( + dim=1 + )[0] + if noise_scheduler.config.prediction_type == "epsilon": + mse_loss_weights = mse_loss_weights / snr + elif noise_scheduler.config.prediction_type == "v_prediction": + mse_loss_weights = mse_loss_weights / (snr + 1) + + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = loss.mean() + if args.debug_loss and "filenames" in batch: + for fname in batch["filenames"]: + accelerator.log({"loss_for_" + fname: loss}, step=global_step) + # Gather the losses across all processes for logging (if we use distributed training). 
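+                # Note: loss.repeat(args.train_batch_size) expands the scalar step loss to one entry per local sample so that accelerator.gather averages it evenly across processes; dividing by gradient_accumulation_steps makes train_loss the mean loss of the whole accumulated optimizer step when it is logged below.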
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + train_loss += avg_loss.item() / args.gradient_accumulation_steps + + # Backpropagate + accelerator.backward(loss) + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + accelerator.log({"train_loss": train_loss}, step=global_step) + train_loss = 0.0 + + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") + + logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." 
+ ) + # create pipeline + pipeline = StableDiffusionXLPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=vae, + text_encoder=unwrap_model(text_encoder_one), + text_encoder_2=unwrap_model(text_encoder_two), + unet=unwrap_model(unet), + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + pipeline_args = {"prompt": args.validation_prompt} + + with torch.cuda.amp.autocast(): + images = [ + pipeline(**pipeline_args, generator=generator).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + # Save the lora layers + accelerator.wait_for_everyone() + if accelerator.is_main_process: + unet = unwrap_model(unet) + unet_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(unet)) + + if args.train_text_encoder: + text_encoder_one = unwrap_model(text_encoder_one) + text_encoder_two = unwrap_model(text_encoder_two) + + text_encoder_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_one)) + text_encoder_2_lora_layers = convert_state_dict_to_diffusers(get_peft_model_state_dict(text_encoder_two)) + else: + text_encoder_lora_layers = None + text_encoder_2_lora_layers = None + + StableDiffusionXLPipeline.save_lora_weights( + save_directory=args.output_dir, + unet_lora_layers=unet_lora_state_dict, + text_encoder_lora_layers=text_encoder_lora_layers, + text_encoder_2_lora_layers=text_encoder_2_lora_layers, + ) + + del unet + del text_encoder_one + del text_encoder_two + del text_encoder_lora_layers + del text_encoder_2_lora_layers + torch.cuda.empty_cache() + + # Final inference + # Make sure vae.dtype is consistent with the unet.dtype + if args.mixed_precision == "fp16": + vae.to(weight_dtype) + # Load previous pipeline + pipeline = StableDiffusionXLPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=vae, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + pipeline = pipeline.to(accelerator.device) + + # load attention processors + pipeline.load_lora_weights(args.output_dir) + + # run inference + images = [] + if args.validation_prompt and args.num_validation_images > 0: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + images = [ + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "test": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + if args.push_to_hub: + save_model_card( + repo_id, + images=images, + 
base_model=args.pretrained_model_name_or_path, + dataset_name=args.dataset_name, + train_text_encoder=args.train_text_encoder, + repo_folder=args.output_dir, + vae_path=args.pretrained_vae_model_name_or_path, + ) + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_sdxl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_sdxl.py new file mode 100644 index 000000000..a360c5fab --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_sdxl.py @@ -0,0 +1,1374 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fine-tuning script for Stable Diffusion XL for text2image.""" + +import argparse +import functools +import gc +import logging +import math +import os +import random +import shutil +from pathlib import Path +import time + +import accelerate +import datasets +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +import transformers +from accelerate import Accelerator +from accelerate.state import AcceleratorState +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration, set_seed +from datasets import concatenate_datasets, load_dataset +from huggingface_hub import create_repo, upload_folder +from packaging import version +from torchvision import transforms +from torchvision.transforms.functional import crop +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig +from transformers.utils import ContextManagers + +import diffusers +from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionXLPipeline, UNet2DConditionModel +from diffusers.optimization import get_scheduler +from diffusers.training_utils import EMAModel, compute_snr +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import is_compiled_module + + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
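+# check_min_version raises an error at import time if the installed diffusers release is older than 0.27.0, the version this example was written against.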
+check_min_version("0.27.0") + +logger = get_logger(__name__) + + +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} + + +def save_model_card( + repo_id: str, + images: list = None, + validation_prompt: str = None, + base_model: str = None, + dataset_name: str = None, + repo_folder: str = None, + vae_path: str = None, +): + img_str = "" + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + model_description = f""" +# Text-to-image finetuning - {repo_id} + +This pipeline was finetuned from **{base_model}** on the **{dataset_name}** dataset. Below are some example images generated with the finetuned pipeline using the following prompt: {validation_prompt}: \n +{img_str} + +Special VAE used for training: {vae_path}. +""" + + model_card = load_or_create_model_card( + repo_id_or_path=repo_id, + from_training=True, + license="creativeml-openrail-m", + base_model=base_model, + model_description=model_description, + inference=True, + ) + + tags = [ + "stable-diffusion-xl", + "stable-diffusion-xl-diffusers", + "text-to-image", + "diffusers-training", + "diffusers", + ] + model_card = populate_model_card(model_card, tags=tags) + + model_card.save(os.path.join(repo_folder, "README.md")) + + +def import_model_class_from_model_name_or_path( + pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder" +): + text_encoder_config = PretrainedConfig.from_pretrained( + pretrained_model_name_or_path, subfolder=subfolder, revision=revision + ) + model_class = text_encoder_config.architectures[0] + + if model_class == "CLIPTextModel": + from transformers import CLIPTextModel + + return CLIPTextModel + elif model_class == "CLIPTextModelWithProjection": + from transformers import CLIPTextModelWithProjection + + return CLIPTextModelWithProjection + else: + raise ValueError(f"{model_class} is not supported.") + + +def parse_args(input_args=None): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--pretrained_vae_model_name_or_path", + type=str, + default=None, + help="Path to pretrained VAE model with better numerical stability. More details: https://github.com/huggingface/diffusers/pull/4038.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that 🤗 Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. 
Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." + ), + ) + parser.add_argument( + "--image_column", type=str, default="image", help="The column of the dataset containing an image." + ) + parser.add_argument( + "--caption_column", + type=str, + default="text", + help="The column of the dataset containing a caption or a list of captions.", + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning.", + ) + parser.add_argument( + "--num_validation_images", + type=int, + default=4, + help="Number of images that should be generated during validation with `validation_prompt`.", + ) + parser.add_argument( + "--validation_epochs", + type=int, + default=1, + help=( + "Run fine-tuning validation every X epochs. The validation process consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`." + ), + ) + parser.add_argument( + "--max_train_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + ), + ) + parser.add_argument( + "--proportion_empty_prompts", + type=float, + default=0, + help="Proportion of image prompts to be replaced with empty strings. Defaults to 0 (no prompt replacement).", + ) + parser.add_argument( + "--output_dir", + type=str, + default="sdxl-model-finetuned", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--resolution", + type=int, + default=1024, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides num_train_epochs.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." 
+ ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of update steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--timestep_bias_strategy", + type=str, + default="none", + choices=["earlier", "later", "range", "none"], + help=( + "The timestep bias strategy, which may help direct the model toward learning low or high frequency details." + " Choices: ['earlier', 'later', 'range', 'none']." + " The default is 'none', which means no bias is applied, and training proceeds normally." + " The value of 'later' will increase the frequency of the model's final training timesteps." + ), + ) + parser.add_argument( + "--timestep_bias_multiplier", + type=float, + default=1.0, + help=( + "The multiplier for the bias. Defaults to 1.0, which means no bias is applied." + " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it." + ), + ) + parser.add_argument( + "--timestep_bias_begin", + type=int, + default=0, + help=( + "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias." + " Defaults to zero, which equates to having no specific bias." + ), + ) + parser.add_argument( + "--timestep_bias_end", + type=int, + default=1000, + help=( + "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias." + " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on." + ), + ) + parser.add_argument( + "--timestep_bias_portion", + type=float, + default=0.25, + help=( + "The portion of timesteps to bias. Defaults to 0.25, which means 25% of timesteps will be biased." + " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines" + " whether the biased portions are in the earlier or later timesteps." + ), + ) + parser.add_argument( + "--snr_gamma", + type=float, + default=None, + help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. 
" + "More details here: https://arxiv.org/abs/2303.09556.", + ) + parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.") + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + parser.add_argument( + "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." + ) + parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.") + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer") + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--prediction_type", + type=str, + default=None, + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + ) + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument( + "--report_to", + type=str, + default="tensorboard", + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") + + parser.add_argument( + "--NHWC", + action="store_true", + help="Whether or not using NHWC for training", + ) + parser.add_argument( + "--apex_fused_adam", + action="store_true", + help="Whether or not using fused_adam optimizer", + ) + + if input_args is not None: + args = parser.parse_args(input_args) + else: + args = parser.parse_args() + + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + # Sanity checks + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("Need either a dataset name or a training folder.") + + if args.proportion_empty_prompts < 0 or args.proportion_empty_prompts > 1: + raise ValueError("`--proportion_empty_prompts` must be in the range [0, 1].") + + return args + + +# Adapted from pipelines.StableDiffusionXLPipeline.encode_prompt +def encode_prompt(batch, text_encoders, tokenizers, proportion_empty_prompts, caption_column, is_train=True): + prompt_embeds_list = [] + prompt_batch = batch[caption_column] + + captions = [] + for caption in prompt_batch: + if random.random() < proportion_empty_prompts: + captions.append("") + elif isinstance(caption, str): + captions.append(caption) + elif isinstance(caption, (list, np.ndarray)): + # take a random caption if there are multiple + captions.append(random.choice(caption) if is_train else caption[0]) + + with torch.no_grad(): + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + text_inputs = tokenizer( + captions, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + prompt_embeds = text_encoder( + text_input_ids.to(text_encoder.device), + output_hidden_states=True, + return_dict=False, + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds[-1][-2] + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + pooled_prompt_embeds = pooled_prompt_embeds.view(bs_embed, -1) + return {"prompt_embeds": prompt_embeds.cpu(), "pooled_prompt_embeds": pooled_prompt_embeds.cpu()} + + +def compute_vae_encodings(batch, vae): + memory_format = torch.channels_last if int(os.environ["USE_NHWC_GN"]) else torch.contiguous_format + images = batch.pop("pixel_values") + pixel_values = torch.stack(list(images)) + pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float() + pixel_values = pixel_values.to(vae.device, dtype=vae.dtype, memory_format=memory_format) + + with torch.no_grad(): + model_input = vae.encode(pixel_values).latent_dist.sample() + model_input = model_input.to(memory_format=torch.contiguous_format) * vae.config.scaling_factor + return {"model_input": model_input.cpu()} + + +def generate_timestep_weights(args, num_timesteps): + weights = torch.ones(num_timesteps) + + # Determine the indices to bias + num_to_bias = int(args.timestep_bias_portion * num_timesteps) + + if args.timestep_bias_strategy == "later": + bias_indices = slice(-num_to_bias, None) + elif args.timestep_bias_strategy == "earlier": + bias_indices = slice(0, num_to_bias) + elif args.timestep_bias_strategy == "range": + # Out of the possible 1000 
timesteps, we might want to focus on e.g. 200-500. + range_begin = args.timestep_bias_begin + range_end = args.timestep_bias_end + if range_begin < 0: + raise ValueError( + "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero." + ) + if range_end > num_timesteps: + raise ValueError( + "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps." + ) + bias_indices = slice(range_begin, range_end) + else: # 'none' or any other string + return weights + if args.timestep_bias_multiplier <= 0: + raise ValueError( + "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps." + " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead." + " A timestep bias multiplier less than or equal to 0 is not allowed." + ) + + # Apply the bias + weights[bias_indices] *= args.timestep_bias_multiplier + + # Normalize + weights /= weights.sum() + + return weights + + +def main(args): + if int(os.environ.get("USE_NHWC_GN", 0)): + assert args.NHWC, "USE_NHWC_GN requires NHWC to be true" + assert int(os.supports_bytes_environ) + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub." + ) + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + ) + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + import wandb + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. 
+ if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token + ).repo_id + + # Load the tokenizers + tokenizer_one = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer", + revision=args.revision, + use_fast=False, + ) + tokenizer_two = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="tokenizer_2", + revision=args.revision, + use_fast=False, + ) + + # import correct text encoder classes + text_encoder_cls_one = import_model_class_from_model_name_or_path( + args.pretrained_model_name_or_path, args.revision + ) + text_encoder_cls_two = import_model_class_from_model_name_or_path( + args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_2" + ) + + # Load scheduler and models + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + + def deepspeed_zero_init_disabled_context_manager(): + """ + returns either a context list that includes one that will disable zero.Init or an empty context list + """ + deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None + if deepspeed_plugin is None: + return [] + + return [deepspeed_plugin.zero3_init_context_manager(enable=False)] + + vae_path = ( + args.pretrained_model_name_or_path + if args.pretrained_vae_model_name_or_path is None + else args.pretrained_vae_model_name_or_path + ) + + with ContextManagers(deepspeed_zero_init_disabled_context_manager()): + # Check for terminal SNR in combination with SNR Gamma + text_encoder_one = text_encoder_cls_one.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant + ) + text_encoder_two = text_encoder_cls_two.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant + ) + # The VAE has to run in float32, so its attention must use the native implementation: flash-attn does not support float32. + origin_attn = os.environ.get("USE_NATIVE_ATTN", "0") + os.environ["USE_NATIVE_ATTN"] = "1" + vae = AutoencoderKL.from_pretrained( + vae_path, + subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, + revision=args.revision, + variant=args.variant, + ) + os.environ["USE_NATIVE_ATTN"] = origin_attn + unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant + ) + + # Freeze vae and text encoders. + vae.requires_grad_(False) + text_encoder_one.requires_grad_(False) + text_encoder_two.requires_grad_(False) + # Set unet as trainable. + unet.train() + + # For mixed precision training we cast all non-trainable weights to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + # Move unet, vae and text_encoder to device and cast to weight_dtype + # The VAE is in float32 to avoid NaN losses. 
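+ # Keep the VAE in full precision even under fp16/bf16 mixed precision: encoding in half precision is prone to NaNs, while the frozen text encoders can safely follow weight_dtype.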
+ vae.to(accelerator.device, dtype=torch.float32) + text_encoder_one.to(accelerator.device, dtype=weight_dtype) + text_encoder_two.to(accelerator.device, dtype=weight_dtype) + + # Create EMA for the unet. + if args.use_ema: + ema_unet = UNet2DConditionModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant + ) + ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warning( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + unet.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if accelerator.is_main_process: + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + def load_model_hook(models, input_dir): + if args.use_ema: + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel) + ema_unet.load_state_dict(load_model.state_dict()) + ema_unet.to(accelerator.device) + del load_model + + for _ in range(len(models)): + # pop models so that they are not loaded again + model = models.pop() + + # load diffusers style into model + load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet") + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + if args.gradient_checkpointing: + unet.enable_gradient_checkpointing() + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32: + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + optimizer_class = bnb.optim.AdamW8bit + elif args.apex_fused_adam: + import apex + optimizer_class = apex.optimizers.FusedAdam + else: + optimizer_class = torch.optim.AdamW + + # Optimizer creation + params_to_optimize = unet.parameters() + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). + + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + cache_dir=args.cache_dir, + ) + else: + data_files = {} + if args.train_data_dir is not None: + data_files["train"] = os.path.join(args.train_data_dir, "**") + dataset = load_dataset( + "imagefolder", + data_files=data_files, + cache_dir=args.cache_dir, + ) + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder + + # Preprocessing the datasets. + # We need to tokenize inputs and targets. + column_names = dataset["train"].column_names + + # 6. Get the column names for input/target. + dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) + if args.image_column is None: + image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] + else: + image_column = args.image_column + if image_column not in column_names: + raise ValueError( + f"--image_column' value '{args.image_column}' needs to be one of: {', '.join(column_names)}" + ) + if args.caption_column is None: + caption_column = dataset_columns[1] if dataset_columns is not None else column_names[1] + else: + caption_column = args.caption_column + if caption_column not in column_names: + raise ValueError( + f"--caption_column' value '{args.caption_column}' needs to be one of: {', '.join(column_names)}" + ) + + # Preprocessing the datasets. 
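+ # SDXL is conditioned on the original image size and the crop top-left coordinates ("micro-conditioning"), so preprocess_train records original_sizes and crop_top_lefts for every image alongside the normalized pixel tensor.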
+ train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR) + train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution) + train_flip = transforms.RandomHorizontalFlip(p=1.0) + train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]) + + def preprocess_train(examples): + images = [image.convert("RGB") for image in examples[image_column]] + # image aug + original_sizes = [] + all_images = [] + crop_top_lefts = [] + for image in images: + original_sizes.append((image.height, image.width)) + image = train_resize(image) + if args.random_flip and random.random() < 0.5: + # flip + image = train_flip(image) + if args.center_crop: + y1 = max(0, int(round((image.height - args.resolution) / 2.0))) + x1 = max(0, int(round((image.width - args.resolution) / 2.0))) + image = train_crop(image) + else: + y1, x1, h, w = train_crop.get_params(image, (args.resolution, args.resolution)) + image = crop(image, y1, x1, h, w) + crop_top_left = (y1, x1) + crop_top_lefts.append(crop_top_left) + image = train_transforms(image) + all_images.append(image) + + examples["original_sizes"] = original_sizes + examples["crop_top_lefts"] = crop_top_lefts + examples["pixel_values"] = all_images + return examples + + with accelerator.main_process_first(): + if args.max_train_samples is not None: + dataset["train"] = dataset["train"].shuffle(seed=args.seed).select(range(args.max_train_samples)) + # Set the training transforms + train_dataset = dataset["train"].with_transform(preprocess_train) + + # Let's first compute all the embeddings so that we can free up the text encoders + # from memory. We will pre-compute the VAE encodings too. + text_encoders = [text_encoder_one, text_encoder_two] + tokenizers = [tokenizer_one, tokenizer_two] + compute_embeddings_fn = functools.partial( + encode_prompt, + text_encoders=text_encoders, + tokenizers=tokenizers, + proportion_empty_prompts=args.proportion_empty_prompts, + caption_column=args.caption_column, + ) + compute_vae_encodings_fn = functools.partial(compute_vae_encodings, vae=vae) + with accelerator.main_process_first(): + from datasets.fingerprint import Hasher + + # fingerprint used by the cache for the other processes to load the result + # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401 + new_fingerprint = Hasher.hash(args) + new_fingerprint_for_vae = Hasher.hash(vae_path) + train_dataset_with_embeddings = train_dataset.map( + compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint + ) + train_dataset_with_vae = train_dataset.map( + compute_vae_encodings_fn, + batched=True, + # batch_size=args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps,\ + batch_size=args.train_batch_size, + new_fingerprint=new_fingerprint_for_vae, + ) + precomputed_dataset = concatenate_datasets( + [train_dataset_with_embeddings, train_dataset_with_vae.remove_columns(["image", "text"])], axis=1 + ) + precomputed_dataset = precomputed_dataset.with_transform(preprocess_train) + + del text_encoders, tokenizers, vae + gc.collect() + torch.cuda.empty_cache() + + def collate_fn(examples): + model_input = torch.stack([torch.tensor(example["model_input"]) for example in examples]) + original_sizes = [example["original_sizes"] for example in examples] + crop_top_lefts = [example["crop_top_lefts"] for example in examples] + prompt_embeds = torch.stack([torch.tensor(example["prompt_embeds"]) for 
example in examples]) + pooled_prompt_embeds = torch.stack([torch.tensor(example["pooled_prompt_embeds"]) for example in examples]) + + return { + "model_input": model_input, + "prompt_embeds": prompt_embeds, + "pooled_prompt_embeds": pooled_prompt_embeds, + "original_sizes": original_sizes, + "crop_top_lefts": crop_top_lefts, + } + + # DataLoaders creation: + # for testing ips + precomputed_dataset = concatenate_datasets([precomputed_dataset for i in range(10)]) + + train_dataloader = torch.utils.data.DataLoader( + precomputed_dataset, + shuffle=True, + collate_fn=collate_fn, + batch_size=args.train_batch_size, + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. + overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + ) + + # Prepare everything with our `accelerator`. + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, optimizer, train_dataloader, lr_scheduler + ) + + if args.use_ema: + ema_unet.to(accelerator.device) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args)) + + # Function for unwrapping if torch.compile() was used in accelerate. + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(precomputed_dataset)}") + logger.info(f" Num Epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." + ) + args.resume_from_checkpoint = None + initial_global_step = 0 + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + initial_global_step = global_step + first_epoch = global_step // num_update_steps_per_epoch + + else: + initial_global_step = 0 + + progress_bar = tqdm( + range(0, args.max_train_steps), + initial=initial_global_step, + desc="Steps", + # Only show the progress bar once on each machine. + disable=not accelerator.is_local_main_process, + ) + + if args.NHWC: + unet = unet.to(memory_format=torch.channels_last) + + for epoch in range(first_epoch, args.num_train_epochs): + train_loss = 0.0 + iter_start = time.time() + for step, batch in enumerate(train_dataloader): + with accelerator.accumulate(unet): + # Sample noise that we'll add to the latents + model_input = batch["model_input"].to(accelerator.device) + noise = torch.randn_like(model_input) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (model_input.shape[0], model_input.shape[1], 1, 1), device=model_input.device + ) + + bsz = model_input.shape[0] + if args.timestep_bias_strategy == "none": + # Sample a random timestep for each image without bias. + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device + ) + else: + # Sample a random timestep for each image, potentially biased by the timestep weights. + # Biasing the timestep weights allows us to spend less time training irrelevant timesteps. 
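+ # generate_timestep_weights returns a normalized weight per timestep, so torch.multinomial below draws biased timesteps instead of the uniform torch.randint used in the "none" branch.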
+ weights = generate_timestep_weights(args, noise_scheduler.config.num_train_timesteps).to( + model_input.device + ) + timesteps = torch.multinomial(weights, bsz, replacement=True).long() + + # Add noise to the model input according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_model_input = noise_scheduler.add_noise(model_input, noise, timesteps) + + # time ids + def compute_time_ids(original_size, crops_coords_top_left): + # Adapted from pipeline.StableDiffusionXLPipeline._get_add_time_ids + target_size = (args.resolution, args.resolution) + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids]) + add_time_ids = add_time_ids.to(accelerator.device, dtype=weight_dtype) + return add_time_ids + + add_time_ids = torch.cat( + [compute_time_ids(s, c) for s, c in zip(batch["original_sizes"], batch["crop_top_lefts"])] + ) + + # Predict the noise residual + unet_added_conditions = {"time_ids": add_time_ids} + prompt_embeds = batch["prompt_embeds"].to(accelerator.device) + pooled_prompt_embeds = batch["pooled_prompt_embeds"].to(accelerator.device) + unet_added_conditions.update({"text_embeds": pooled_prompt_embeds}) + + if args.NHWC: + noisy_model_input = noisy_model_input.to(memory_format=torch.channels_last) + model_pred = unet( + noisy_model_input, + timesteps, + prompt_embeds, + added_cond_kwargs=unet_added_conditions, + return_dict=False, + )[0] + + # Get the target for loss depending on the prediction type + if args.prediction_type is not None: + # set prediction_type of scheduler if defined + noise_scheduler.register_to_config(prediction_type=args.prediction_type) + + if noise_scheduler.config.prediction_type == "epsilon": + target = noise + elif noise_scheduler.config.prediction_type == "v_prediction": + target = noise_scheduler.get_velocity(model_input, noise, timesteps) + elif noise_scheduler.config.prediction_type == "sample": + # We set the target to latents here, but the model_pred will return the noise sample prediction. + target = model_input + # We will have to subtract the noise residual from the prediction to get the target sample. + model_pred = model_pred - noise + else: + raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") + + if args.snr_gamma is None: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + else: + # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. + # Since we predict the noise instead of x_0, the original formulation is slightly changed. + # This is discussed in Section 4.2 of the same paper. + snr = compute_snr(noise_scheduler, timesteps) + mse_loss_weights = torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min( + dim=1 + )[0] + if noise_scheduler.config.prediction_type == "epsilon": + mse_loss_weights = mse_loss_weights / snr + elif noise_scheduler.config.prediction_type == "v_prediction": + mse_loss_weights = mse_loss_weights / (snr + 1) + + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = loss.mean() + + # Gather the losses across all processes for logging (if we use distributed training). 
+ avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() + train_loss += avg_loss.item() / args.gradient_accumulation_steps + + # Backpropagate + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = unet.parameters() + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + if args.use_ema: + ema_unet.step(unet.parameters()) + progress_bar.update(1) + global_step += 1 + accelerator.log({"train_loss": train_loss}, step=global_step) + train_loss = 0.0 + iter_elapse = time.time() - iter_start + iter_start = time.time() + ips_per_device = total_batch_size / iter_elapse / accelerator.num_processes + ips_per_gpu = ips_per_device * 2 + + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + + if args.NHWC: + origin_model = accelerator._models[0] + accelerator._models[0] = origin_model.to(memory_format=torch.contiguous_format) + accelerator.save_state(save_path) + accelerator._models[0] = origin_model.to(memory_format=torch.channels_last) + else: + accelerator.save_state(save_path) + + logger.info(f"Saved state to {save_path}") + + logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], + "ips_per_device": ips_per_device, "ips_per_gpu": ips_per_gpu} + progress_bar.set_postfix(**logs) + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and epoch % args.validation_epochs == 0: + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." + ) + if args.use_ema: + # Store the UNet parameters temporarily and load the EMA parameters to perform inference. 
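+ # EMAModel.store keeps a temporary copy of the current UNet weights, and copy_to then loads the EMA-averaged weights into the UNet so validation images are generated from the averaged model.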
+ ema_unet.store(unet.parameters()) + ema_unet.copy_to(unet.parameters()) + + # create pipeline + vae = AutoencoderKL.from_pretrained( + vae_path, + subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, + revision=args.revision, + variant=args.variant, + ) + pipeline = StableDiffusionXLPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=vae, + unet=accelerator.unwrap_model(unet), + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + if args.prediction_type is not None: + scheduler_args = {"prediction_type": args.prediction_type} + pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args) + + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + pipeline_args = {"prompt": args.validation_prompt} + + with torch.cuda.amp.autocast(): + images = [ + pipeline(**pipeline_args, generator=generator, num_inference_steps=25).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + + accelerator.wait_for_everyone() + if accelerator.is_main_process: + unet = unwrap_model(unet) + if args.use_ema: + ema_unet.copy_to(unet.parameters()) + + # Serialize pipeline. + vae = AutoencoderKL.from_pretrained( + vae_path, + subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + pipeline = StableDiffusionXLPipeline.from_pretrained( + args.pretrained_model_name_or_path, + unet=unet, + vae=vae, + revision=args.revision, + variant=args.variant, + torch_dtype=weight_dtype, + ) + if args.prediction_type is not None: + scheduler_args = {"prediction_type": args.prediction_type} + pipeline.scheduler = pipeline.scheduler.from_config(pipeline.scheduler.config, **scheduler_args) + pipeline.save_pretrained(args.output_dir) + + # run inference + images = [] + if args.validation_prompt and args.num_validation_images > 0: + pipeline = pipeline.to(accelerator.device) + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + with torch.cuda.amp.autocast(): + images = [ + pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + for _ in range(args.num_validation_images) + ] + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "test": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + if args.push_to_hub: + save_model_card( + repo_id=repo_id, + images=images, + validation_prompt=args.validation_prompt, + base_model=args.pretrained_model_name_or_path, + dataset_name=args.dataset_name, + repo_folder=args.output_dir, + vae_path=args.pretrained_vae_model_name_or_path, + ) + 
upload_folder(
+                repo_id=repo_id,
+                folder_path=args.output_dir,
+                commit_message="End of training",
+                ignore_patterns=["step_*", "epoch_*"],
+            )
+
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/zero2_config.yaml b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/zero2_config.yaml new file mode 100644 index 000000000..8ffdbbd1e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/zero2_config.yaml @@ -0,0 +1,23 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  gradient_accumulation_steps: 1
+  gradient_clipping: 1.0
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+enable_cpu_affinity: false
+machine_rank: 0
+main_training_function: main
+mixed_precision: fp16
+num_machines: 1
+num_processes: 16
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/install_diffusers.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/install_diffusers.sh new file mode 100644 index 000000000..9dab7ddf9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/install_diffusers.sh @@ -0,0 +1,33 @@
+#!/bin/bash
+cd $(dirname $(realpath "$0"))
+TARGET_DIR=${TARGET_DIR:-}
+
+PYTHON_PATH=$(which python3)
+PYTHON_DIST_PATH=${TARGET_DIR}/lib/python3/dist-packages
+
+PKG_DIR="build_pip"
+PKG_NAME="diffusers"
+
+if [[ ! -d ${PKG_DIR} ]]; then
+    echo "ERROR: Package directory ${PKG_DIR} doesn't exist"
+    exit 1
+fi
+
+latest_pkg="$(ls -t ${PKG_DIR} | grep ${PKG_NAME} | head -1)"
+if [[ "${latest_pkg}" == "" ]]; then
+    echo "ERROR: Cannot find latest ${PKG_NAME} package"
+    exit 1
+else
+    echo "INFO: Found latest package ${latest_pkg} in directory ${PKG_DIR}"
+fi
+
+if [[ "${TARGET_DIR}" != "" ]]; then
+    ${PYTHON_PATH} -m pip install --upgrade --no-deps -t ${PYTHON_DIST_PATH} ${PKG_DIR}/${latest_pkg} || exit
+    echo "${PKG_NAME} installed in ${PYTHON_DIST_PATH}; please add it to your PYTHONPATH."
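+    # NOTE: installing with `pip install -t` places the wheel under ${PYTHON_DIST_PATH} without registering it in
+    # the default site-packages, which is why the directory has to be added to PYTHONPATH by the caller.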
+else + ${PYTHON_PATH} -m pip uninstall ${PKG_NAME} -y + ${PYTHON_PATH} -m pip install --no-deps ${PKG_DIR}/${latest_pkg} || exit +fi + +# Return 0 status if all finished +exit 0 \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5.sh new file mode 100644 index 000000000..248f79c63 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5.sh @@ -0,0 +1,32 @@ +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=1 +export USE_APEX_LN=1 +export USE_NATIVE_ATTN=0 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN + +export MODEL_NAME=/data/yili.li/jira_1040/stable-diffusion-v1-5 +export DATASET_NAME=/data/yili.li/jira/pokemon-blip-captions/ + +cd /data/yili.li/jira_1068/diffusers/examples/text_to_image + +accelerate launch --config_file default_config.yaml --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --seed 42 \ + --gradient_checkpointing \ + --center_crop \ + --random_flip \ + --train_batch_size=24 \ + --gradient_accumulation_steps=1 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --NHWC \ + --dataloader_num_workers=32 \ + --apex_fused_adam diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5_single.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5_single.sh new file mode 100644 index 000000000..2c2efcd4e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5_single.sh @@ -0,0 +1,32 @@ +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=1 +export USE_APEX_LN=1 +export USE_NATIVE_ATTN=0 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN + +export MODEL_NAME=/data/yili.li/jira_1040/stable-diffusion-v1-5 +export DATASET_NAME=/data/yili.li/jira/pokemon-blip-captions/ + +cd /data/yili.li/jira_1068/diffusers/examples/text_to_image + +accelerate launch --config_file single_config.yaml --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --seed 42 \ + --gradient_checkpointing \ + --center_crop \ + --random_flip \ + --train_batch_size=32 \ + --gradient_accumulation_steps=1 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --dataloader_num_workers=32 \ + --NHWC \ + --apex_fused_adam \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1.sh new file mode 100644 index 000000000..2ad523bbf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1.sh @@ -0,0 +1,32 @@ +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=1 +export USE_APEX_LN=1 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN + +export MODEL_NAME=/data/yili.li/jira/stabilityai/stable-diffusion-2-1 +export DATASET_NAME=/data/yili.li/jira/pokemon-blip-captions/ + +cd 
/data/yili.li/jira_1068/diffusers/examples/text_to_image + +accelerate launch --config_file default_config.yaml --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --seed 42 \ + --center_crop \ + --random_flip \ + --train_batch_size=32 \ + --gradient_accumulation_steps=1 \ + --gradient_checkpointing \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --NHWC \ + --dataloader_num_workers=32 \ + --apex_fused_adam + # --use_ema diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1_single.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1_single.sh new file mode 100644 index 000000000..b6fda519d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1_single.sh @@ -0,0 +1,32 @@ +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=1 +export USE_APEX_LN=1 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN + +# export MODEL_NAME=/data/yili.li/jira/stabilityai/stable-diffusion-2-1 +# export DATASET_NAME=/data/yili.li/jira/pokemon-blip-captions/ + +cd /data/yili.li/jira_1068/diffusers/examples/text_to_image + +accelerate launch --config_file single_config.yaml --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --seed 42 \ + --center_crop \ + --random_flip \ + --train_batch_size=32 \ + --gradient_accumulation_steps=1 \ + --gradient_checkpointing \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --NHWC \ + --dataloader_num_workers=32 \ + --apex_fused_adam + # --use_ema diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_xl.sh b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_xl.sh new file mode 100644 index 000000000..4ae95dad9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_xl.sh @@ -0,0 +1,37 @@ +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=1 +export USE_APEX_LN=1 +export USE_NATIVE_ATTN=0 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN + +export MODEL_NAME=/data/yili.li/jira_1040/stable-diffusion-xl-base-1.0 +export DATASET_NAME=/data/yili.li/jira/pokemon-blip-captions/ +export VAE_NAME=/data/yili.li/jira_1040/sdxl-vae-fp16-fix + +cd /data/yili.li/jira_1068/diffusers/examples/text_to_image + +accelerate launch --config_file zero2_config.yaml --mixed_precision="fp16" train_text_to_image_sdxl.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --pretrained_vae_model_name_or_path=$VAE_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --seed 42 \ + --gradient_checkpointing \ + --center_crop \ + --random_flip \ + --train_batch_size=32 \ + --gradient_accumulation_steps=1 \ + --gradient_checkpointing \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --dataloader_num_workers=32 \ + --NHWC \ + --apex_fused_adam + # --use_ema + diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/setup.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/setup.py new file mode 100644 index 000000000..5f33982ae --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/setup.py @@ -0,0 +1,304 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Simple check list from AllenNLP repo: https://github.com/allenai/allennlp/blob/main/setup.py
+
+To create the package for pypi.
+
+1. Run `make pre-release` (or `make pre-patch` for a patch release) then run `make fix-copies` to fix the index of the
+   documentation.
+
+   If releasing on a special branch, copy the updated README.md on the main branch for the commit you will make
+   for the post-release and run `make fix-copies` on the main branch as well.
+
+2. Run Tests for Amazon Sagemaker. The documentation is located in `./tests/sagemaker/README.md`, otherwise @philschmid.
+
+3. Unpin specific versions from setup.py that use a git install.
+
+4. Checkout the release branch (v<RELEASE>-release, for example v4.19-release), and commit these changes with the
+   message: "Release: <RELEASE>" and push.
+
+5. Wait for the tests on main to be completed and be green (otherwise revert and fix bugs)
+
+6. Add a tag in git to mark the release: "git tag v<RELEASE> -m 'Adds tag v<RELEASE> for pypi' "
+   Push the tag to git: git push --tags origin v<RELEASE>-release
+
+7. Build both the sources and the wheel. Do not change anything in setup.py between
+   creating the wheel and the source distribution (obviously).
+
+   For the wheel, run: "python setup.py bdist_wheel" in the top level directory.
+   (this will build a wheel for the python version you use to build it).
+
+   For the sources, run: "python setup.py sdist"
+   You should now have a /dist directory with both .whl and .tar.gz source versions.
+
+   Long story cut short, you need to run both before you can upload the distribution to the
+   test pypi and the actual pypi servers:
+
+   python setup.py bdist_wheel && python setup.py sdist
+
+8. Check that everything looks correct by uploading the package to the pypi test server:
+
+   twine upload dist/* -r pypitest
+   (pypi suggests using twine as other methods upload files via plaintext.)
+   You may have to specify the repository url, use the following command then:
+   twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/
+
+   Check that you can install it in a virtualenv by running:
+   pip install -i https://testpypi.python.org/pypi diffusers
+
+   If you are testing from a Colab Notebook, for instance, then do:
+   pip install diffusers && pip uninstall diffusers
+   pip install -i https://testpypi.python.org/pypi diffusers
+
+   Check you can run the following commands:
+   python -c "from diffusers import __version__; print(__version__)"
+   python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
+   python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
+   python -c "from diffusers import *"
+
+9. Upload the final version to actual pypi:
+   twine upload dist/* -r pypi
+
+10. Prepare the release notes and publish them on github once everything is looking hunky-dory.
+
+11. Run `make post-release` (or, for a patch release, `make post-patch`). If you were on a branch for the release,
+    you need to go back to main before executing this.
+"""
+
+import os
+import re
+from distutils.core import Command
+
+from setuptools import find_packages, setup
+
+
+# IMPORTANT:
+# 1. all dependencies should be listed here with their version requirements if any
+# 2. once modified, run: `make deps_table_update` to update src/diffusers/dependency_versions_table.py
+_deps = [
+    "Pillow",  # keep the PIL.Image.Resampling deprecation away
+    "accelerate>=0.11.0",
+    "compel==0.1.8",
+    "black~=23.1",
+    "datasets",
+    "filelock",
+    "flax>=0.4.1",
+    "hf-doc-builder>=0.3.0",
+    "huggingface-hub>=0.13.2",
+    "requests-mock==1.10.0",
+    "importlib_metadata",
+    "invisible-watermark>=0.2.0",
+    "isort>=5.5.4",
+    "jax>=0.4.1",
+    "jaxlib>=0.4.1",
+    "Jinja2",
+    "k-diffusion>=0.0.12",
+    "torchsde",
+    "note_seq",
+    "librosa",
+    "numpy",
+    "omegaconf",
+    "parameterized",
+    "protobuf>=3.20.3,<4",
+    "pytest",
+    "pytest-timeout",
+    "pytest-xdist",
+    "ruff==0.0.280",
+    "safetensors>=0.3.1",
+    "sentencepiece>=0.1.91,!=0.1.92",
+    "scipy",
+    "onnx",
+    "regex!=2019.12.17",
+    "requests",
+    "tensorboard",
+    "torch>=1.4",
+    "torchvision",
+    "transformers>=4.25.1",
+    "urllib3<=2.0.0",
+]
+
+# this is a lookup table with items like:
+#
+# tokenizers: "huggingface-hub==0.8.0"
+# packaging: "packaging"
+#
+# some of the values are versioned whereas others aren't.
+deps = {b: a for a, b in (re.findall(r"^(([^!=<>~]+)(?:[!=<>~].*)?$)", x)[0] for x in _deps)}
+
+# since we save this data in src/diffusers/dependency_versions_table.py it can be easily accessed from
+# anywhere. If you need to quickly access the data from this table in a shell, you can do so easily with:
+#
+# python -c 'import sys; from diffusers.dependency_versions_table import deps; \
+# print(" ".join([ deps[x] for x in sys.argv[1:]]))' tokenizers datasets
+#
+# Just pass the desired package names to that script as it's shown with 2 packages above.
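+#
+# (For reference, the `deps` comprehension above keys each requirement by its bare package name, e.g.
+# deps["accelerate"] == "accelerate>=0.11.0", so pinned and unpinned entries are looked up the same way.)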
+# +# If diffusers is not yet installed and the work is done from the cloned repo remember to add `PYTHONPATH=src` to the script above +# +# You can then feed this for example to `pip`: +# +# pip install -U $(python -c 'import sys; from diffusers.dependency_versions_table import deps; \ +# print(" ".join([ deps[x] for x in sys.argv[1:]]))' tokenizers datasets) +# + + +def deps_list(*pkgs): + return [deps[pkg] for pkg in pkgs] + + +class DepsTableUpdateCommand(Command): + """ + A custom distutils command that updates the dependency table. + usage: python setup.py deps_table_update + """ + + description = "build runtime dependency table" + user_options = [ + # format: (long option, short option, description). + ("dep-table-update", None, "updates src/diffusers/dependency_versions_table.py"), + ] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + entries = "\n".join([f' "{k}": "{v}",' for k, v in deps.items()]) + content = [ + "# THIS FILE HAS BEEN AUTOGENERATED. To update:", + "# 1. modify the `_deps` dict in setup.py", + "# 2. run `make deps_table_update``", + "deps = {", + entries, + "}", + "", + ] + target = "src/diffusers/dependency_versions_table.py" + print(f"updating {target}") + with open(target, "w", encoding="utf-8", newline="\n") as f: + f.write("\n".join(content)) + + +extras = {} + + +extras = {} +extras["quality"] = deps_list("urllib3", "black", "isort", "ruff", "hf-doc-builder") +extras["docs"] = deps_list("hf-doc-builder") +extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2") +extras["test"] = deps_list( + "compel", + "datasets", + "Jinja2", + "invisible-watermark", + "k-diffusion", + "librosa", + "omegaconf", + "parameterized", + "pytest", + "pytest-timeout", + "pytest-xdist", + "requests-mock", + "safetensors", + "sentencepiece", + "scipy", + "torchvision", + "transformers", +) +extras["torch"] = deps_list("torch", "accelerate") + +if os.name == "nt": # windows + extras["flax"] = [] # jax is not supported on windows +else: + extras["flax"] = deps_list("jax", "jaxlib", "flax") + +extras["dev"] = ( + extras["quality"] + extras["test"] + extras["training"] + extras["docs"] + extras["torch"] + extras["flax"] +) + +install_requires = [ + deps["importlib_metadata"], + deps["filelock"], + deps["huggingface-hub"], + deps["numpy"], + deps["regex"], + deps["requests"], + deps["safetensors"], + deps["Pillow"], +] +LOCAL_VERSION_IDENTIFIER = os.getenv("LOCAL_VERSION_IDENTIFIER", "") +if LOCAL_VERSION_IDENTIFIER: + LOCAL_VERSION_IDENTIFIER = "+" + LOCAL_VERSION_IDENTIFIER + +version="0.22.0" + +version = version + LOCAL_VERSION_IDENTIFIER + +setup( + name="diffusers", + version=version, # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + description="State-of-the-art diffusion in PyTorch and JAX.", + long_description=open("README.md", "r", encoding="utf-8").read(), + long_description_content_type="text/markdown", + keywords="deep learning diffusion jax pytorch stable diffusion audioldm", + license="Apache", + author="The HuggingFace team", + author_email="patrick@huggingface.co", + url="https://github.com/huggingface/diffusers", + package_dir={"": "src"}, + packages=find_packages("src"), + package_data={"diffusers": ["py.typed"]}, + include_package_data=True, + python_requires=">=3.8.0", + install_requires=list(install_requires), + extras_require=extras, + entry_points={"console_scripts": ["diffusers-cli=diffusers.commands.diffusers_cli:main"]}, + 
classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Education", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + ], + cmdclass={"deps_table_update": DepsTableUpdateCommand}, +) + +# Release checklist +# 1. Change the version in __init__.py and setup.py. +# 2. Commit these changes with the message: "Release: Release" +# 3. Add a tag in git to mark the release: "git tag RELEASE -m 'Adds tag RELEASE for pypi' " +# Push the tag to git: git push --tags origin main +# 4. Run the following commands in the top-level directory: +# python setup.py bdist_wheel +# python setup.py sdist +# 5. Upload the package to the pypi test server first: +# twine upload dist/* -r pypitest +# twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ +# 6. Check that you can install it in a virtualenv by running: +# pip install -i https://testpypi.python.org/pypi diffusers +# diffusers env +# diffusers test +# 7. Upload the final version to actual pypi: +# twine upload dist/* -r pypi +# 8. Add release notes to the tag in github once everything is looking hunky-dory. +# 9. Update the version in __init__.py, setup.py to the new version "-dev" and push to master \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/__init__.py new file mode 100644 index 000000000..2f258e9fb --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/__init__.py @@ -0,0 +1,787 @@ +__version__ = "0.27.0" + +from typing import TYPE_CHECKING + +from .utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_flax_available, + is_k_diffusion_available, + is_librosa_available, + is_note_seq_available, + is_onnx_available, + is_scipy_available, + is_torch_available, + is_torchsde_available, + is_transformers_available, +) + + +# Lazy Import based on +# https://github.com/huggingface/transformers/blob/main/src/transformers/__init__.py + +# When adding a new object to this init, please add it to `_import_structure`. The `_import_structure` is a dictionary submodule to list of object names, +# and is used to defer the actual importing for when the objects are requested. +# This way `import diffusers` provides the names in the namespace without actually importing anything (and especially none of the backends). 
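+# For example, `from diffusers import UNet2DConditionModel` only records the name under `_import_structure["models"]`
+# below; the class itself is imported the first time the attribute is resolved by the `_LazyModule` instance that
+# replaces this module at the bottom of the file.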
+ +_import_structure = { + "configuration_utils": ["ConfigMixin"], + "models": [], + "pipelines": [], + "schedulers": [], + "utils": [ + "OptionalDependencyNotAvailable", + "is_flax_available", + "is_inflect_available", + "is_invisible_watermark_available", + "is_k_diffusion_available", + "is_k_diffusion_version", + "is_librosa_available", + "is_note_seq_available", + "is_onnx_available", + "is_scipy_available", + "is_torch_available", + "is_torchsde_available", + "is_transformers_available", + "is_transformers_version", + "is_unidecode_available", + "logging", + ], +} + +try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_onnx_objects # noqa F403 + + _import_structure["utils.dummy_onnx_objects"] = [ + name for name in dir(dummy_onnx_objects) if not name.startswith("_") + ] + +else: + _import_structure["pipelines"].extend(["OnnxRuntimeModel"]) + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_pt_objects # noqa F403 + + _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")] + +else: + _import_structure["models"].extend( + [ + "AsymmetricAutoencoderKL", + "AutoencoderKL", + "AutoencoderKLTemporalDecoder", + "AutoencoderTiny", + "ConsistencyDecoderVAE", + "ControlNetModel", + "I2VGenXLUNet", + "Kandinsky3UNet", + "ModelMixin", + "MotionAdapter", + "MultiAdapter", + "PriorTransformer", + "StableCascadeUNet", + "T2IAdapter", + "T5FilmDecoder", + "Transformer2DModel", + "UNet1DModel", + "UNet2DConditionModel", + "UNet2DModel", + "UNet3DConditionModel", + "UNetMotionModel", + "UNetSpatioTemporalConditionModel", + "UVit2DModel", + "VQModel", + ] + ) + + _import_structure["optimization"] = [ + "get_constant_schedule", + "get_constant_schedule_with_warmup", + "get_cosine_schedule_with_warmup", + "get_cosine_with_hard_restarts_schedule_with_warmup", + "get_linear_schedule_with_warmup", + "get_polynomial_decay_schedule_with_warmup", + "get_scheduler", + ] + _import_structure["pipelines"].extend( + [ + "AudioPipelineOutput", + "AutoPipelineForImage2Image", + "AutoPipelineForInpainting", + "AutoPipelineForText2Image", + "ConsistencyModelPipeline", + "DanceDiffusionPipeline", + "DDIMPipeline", + "DDPMPipeline", + "DiffusionPipeline", + "DiTPipeline", + "ImagePipelineOutput", + "KarrasVePipeline", + "LDMPipeline", + "LDMSuperResolutionPipeline", + "PNDMPipeline", + "RePaintPipeline", + "ScoreSdeVePipeline", + "StableDiffusionMixin", + ] + ) + _import_structure["schedulers"].extend( + [ + "AmusedScheduler", + "CMStochasticIterativeScheduler", + "DDIMInverseScheduler", + "DDIMParallelScheduler", + "DDIMScheduler", + "DDPMParallelScheduler", + "DDPMScheduler", + "DDPMWuerstchenScheduler", + "DEISMultistepScheduler", + "DPMSolverMultistepInverseScheduler", + "DPMSolverMultistepScheduler", + "DPMSolverSinglestepScheduler", + "EDMDPMSolverMultistepScheduler", + "EDMEulerScheduler", + "EulerAncestralDiscreteScheduler", + "EulerDiscreteScheduler", + "HeunDiscreteScheduler", + "IPNDMScheduler", + "KarrasVeScheduler", + "KDPM2AncestralDiscreteScheduler", + "KDPM2DiscreteScheduler", + "LCMScheduler", + "PNDMScheduler", + "RePaintScheduler", + "SASolverScheduler", + "SchedulerMixin", + "ScoreSdeVeScheduler", + "TCDScheduler", + "UnCLIPScheduler", + "UniPCMultistepScheduler", + "VQDiffusionScheduler", + ] + ) + _import_structure["training_utils"] = ["EMAModel"] + +try: + if 
not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_scipy_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_scipy_objects"] = [ + name for name in dir(dummy_torch_and_scipy_objects) if not name.startswith("_") + ] + +else: + _import_structure["schedulers"].extend(["LMSDiscreteScheduler"]) + +try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_torchsde_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_torchsde_objects"] = [ + name for name in dir(dummy_torch_and_torchsde_objects) if not name.startswith("_") + ] + +else: + _import_structure["schedulers"].extend(["DPMSolverSDEScheduler"]) + +try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_objects"] = [ + name for name in dir(dummy_torch_and_transformers_objects) if not name.startswith("_") + ] + +else: + _import_structure["pipelines"].extend( + [ + "AltDiffusionImg2ImgPipeline", + "AltDiffusionPipeline", + "AmusedImg2ImgPipeline", + "AmusedInpaintPipeline", + "AmusedPipeline", + "AnimateDiffPipeline", + "AnimateDiffVideoToVideoPipeline", + "AudioLDM2Pipeline", + "AudioLDM2ProjectionModel", + "AudioLDM2UNet2DConditionModel", + "AudioLDMPipeline", + "BlipDiffusionControlNetPipeline", + "BlipDiffusionPipeline", + "CLIPImageProjection", + "CycleDiffusionPipeline", + "I2VGenXLPipeline", + "IFImg2ImgPipeline", + "IFImg2ImgSuperResolutionPipeline", + "IFInpaintingPipeline", + "IFInpaintingSuperResolutionPipeline", + "IFPipeline", + "IFSuperResolutionPipeline", + "ImageTextPipelineOutput", + "Kandinsky3Img2ImgPipeline", + "Kandinsky3Pipeline", + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyImg2ImgPipeline", + "KandinskyInpaintCombinedPipeline", + "KandinskyInpaintPipeline", + "KandinskyPipeline", + "KandinskyPriorPipeline", + "KandinskyV22CombinedPipeline", + "KandinskyV22ControlnetImg2ImgPipeline", + "KandinskyV22ControlnetPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22Img2ImgPipeline", + "KandinskyV22InpaintCombinedPipeline", + "KandinskyV22InpaintPipeline", + "KandinskyV22Pipeline", + "KandinskyV22PriorEmb2EmbPipeline", + "KandinskyV22PriorPipeline", + "LatentConsistencyModelImg2ImgPipeline", + "LatentConsistencyModelPipeline", + "LDMTextToImagePipeline", + "LEditsPPPipelineStableDiffusion", + "LEditsPPPipelineStableDiffusionXL", + "MusicLDMPipeline", + "PaintByExamplePipeline", + "PIAPipeline", + "PixArtAlphaPipeline", + "SemanticStableDiffusionPipeline", + "ShapEImg2ImgPipeline", + "ShapEPipeline", + "StableCascadeCombinedPipeline", + "StableCascadeDecoderPipeline", + "StableCascadePriorPipeline", + "StableDiffusionAdapterPipeline", + "StableDiffusionAttendAndExcitePipeline", + "StableDiffusionControlNetImg2ImgPipeline", + "StableDiffusionControlNetInpaintPipeline", + "StableDiffusionControlNetPipeline", + "StableDiffusionDepth2ImgPipeline", + "StableDiffusionDiffEditPipeline", + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENTextImagePipeline", + "StableDiffusionImageVariationPipeline", + "StableDiffusionImg2ImgPipeline", + "StableDiffusionInpaintPipeline", + 
"StableDiffusionInpaintPipelineLegacy", + "StableDiffusionInstructPix2PixPipeline", + "StableDiffusionLatentUpscalePipeline", + "StableDiffusionLDM3DPipeline", + "StableDiffusionModelEditingPipeline", + "StableDiffusionPanoramaPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionPipeline", + "StableDiffusionPipelineSafe", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionSAGPipeline", + "StableDiffusionUpscalePipeline", + "StableDiffusionXLAdapterPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + "StableDiffusionXLControlNetPipeline", + "StableDiffusionXLImg2ImgPipeline", + "StableDiffusionXLInpaintPipeline", + "StableDiffusionXLInstructPix2PixPipeline", + "StableDiffusionXLPipeline", + "StableUnCLIPImg2ImgPipeline", + "StableUnCLIPPipeline", + "StableVideoDiffusionPipeline", + "TextToVideoSDPipeline", + "TextToVideoZeroPipeline", + "TextToVideoZeroSDXLPipeline", + "UnCLIPImageVariationPipeline", + "UnCLIPPipeline", + "UniDiffuserModel", + "UniDiffuserPipeline", + "UniDiffuserTextDecoder", + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + "VideoToVideoSDPipeline", + "VQDiffusionPipeline", + "WuerstchenCombinedPipeline", + "WuerstchenDecoderPipeline", + "WuerstchenPriorPipeline", + ] + ) + +try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_k_diffusion_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_k_diffusion_objects) if not name.startswith("_") + ] + +else: + _import_structure["pipelines"].extend(["StableDiffusionKDiffusionPipeline", "StableDiffusionXLKDiffusionPipeline"]) + +try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_transformers_and_onnx_objects"] = [ + name for name in dir(dummy_torch_and_transformers_and_onnx_objects) if not name.startswith("_") + ] + +else: + _import_structure["pipelines"].extend( + [ + "OnnxStableDiffusionImg2ImgPipeline", + "OnnxStableDiffusionInpaintPipeline", + "OnnxStableDiffusionInpaintPipelineLegacy", + "OnnxStableDiffusionPipeline", + "OnnxStableDiffusionUpscalePipeline", + "StableDiffusionOnnxPipeline", + ] + ) + +try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torch_and_librosa_objects # noqa F403 + + _import_structure["utils.dummy_torch_and_librosa_objects"] = [ + name for name in dir(dummy_torch_and_librosa_objects) if not name.startswith("_") + ] + +else: + _import_structure["pipelines"].extend(["AudioDiffusionPipeline", "Mel"]) + +try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _import_structure["utils.dummy_transformers_and_torch_and_note_seq_objects"] = [ + name for name in 
dir(dummy_transformers_and_torch_and_note_seq_objects) if not name.startswith("_") + ] + + +else: + _import_structure["pipelines"].extend(["SpectrogramDiffusionPipeline"]) + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_flax_objects # noqa F403 + + _import_structure["utils.dummy_flax_objects"] = [ + name for name in dir(dummy_flax_objects) if not name.startswith("_") + ] + + +else: + _import_structure["models.controlnet_flax"] = ["FlaxControlNetModel"] + _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"] + _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"] + _import_structure["pipelines"].extend(["FlaxDiffusionPipeline"]) + _import_structure["schedulers"].extend( + [ + "FlaxDDIMScheduler", + "FlaxDDPMScheduler", + "FlaxDPMSolverMultistepScheduler", + "FlaxEulerDiscreteScheduler", + "FlaxKarrasVeScheduler", + "FlaxLMSDiscreteScheduler", + "FlaxPNDMScheduler", + "FlaxSchedulerMixin", + "FlaxScoreSdeVeScheduler", + ] + ) + + +try: + if not (is_flax_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_flax_and_transformers_objects # noqa F403 + + _import_structure["utils.dummy_flax_and_transformers_objects"] = [ + name for name in dir(dummy_flax_and_transformers_objects) if not name.startswith("_") + ] + + +else: + _import_structure["pipelines"].extend( + [ + "FlaxStableDiffusionControlNetPipeline", + "FlaxStableDiffusionImg2ImgPipeline", + "FlaxStableDiffusionInpaintPipeline", + "FlaxStableDiffusionPipeline", + "FlaxStableDiffusionXLPipeline", + ] + ) + +try: + if not (is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_note_seq_objects # noqa F403 + + _import_structure["utils.dummy_note_seq_objects"] = [ + name for name in dir(dummy_note_seq_objects) if not name.startswith("_") + ] + + +else: + _import_structure["pipelines"].extend(["MidiProcessor"]) + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .configuration_utils import ConfigMixin + + try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_onnx_objects import * # noqa F403 + else: + from .pipelines import OnnxRuntimeModel + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_pt_objects import * # noqa F403 + else: + from .models import ( + AsymmetricAutoencoderKL, + AutoencoderKL, + AutoencoderKLTemporalDecoder, + AutoencoderTiny, + ConsistencyDecoderVAE, + ControlNetModel, + I2VGenXLUNet, + Kandinsky3UNet, + ModelMixin, + MotionAdapter, + MultiAdapter, + PriorTransformer, + T2IAdapter, + T5FilmDecoder, + Transformer2DModel, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + UNetMotionModel, + UNetSpatioTemporalConditionModel, + UVit2DModel, + VQModel, + ) + from .optimization import ( + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup, + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + ) + from .pipelines import ( + AudioPipelineOutput, + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + 
AutoPipelineForText2Image, + BlipDiffusionControlNetPipeline, + BlipDiffusionPipeline, + CLIPImageProjection, + ConsistencyModelPipeline, + DanceDiffusionPipeline, + DDIMPipeline, + DDPMPipeline, + DiffusionPipeline, + DiTPipeline, + ImagePipelineOutput, + KarrasVePipeline, + LDMPipeline, + LDMSuperResolutionPipeline, + PNDMPipeline, + RePaintPipeline, + ScoreSdeVePipeline, + StableDiffusionMixin, + ) + from .schedulers import ( + AmusedScheduler, + CMStochasticIterativeScheduler, + DDIMInverseScheduler, + DDIMParallelScheduler, + DDIMScheduler, + DDPMParallelScheduler, + DDPMScheduler, + DDPMWuerstchenScheduler, + DEISMultistepScheduler, + DPMSolverMultistepInverseScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EDMDPMSolverMultistepScheduler, + EDMEulerScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + IPNDMScheduler, + KarrasVeScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LCMScheduler, + PNDMScheduler, + RePaintScheduler, + SASolverScheduler, + SchedulerMixin, + ScoreSdeVeScheduler, + TCDScheduler, + UnCLIPScheduler, + UniPCMultistepScheduler, + VQDiffusionScheduler, + ) + from .training_utils import EMAModel + + try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_scipy_objects import * # noqa F403 + else: + from .schedulers import LMSDiscreteScheduler + + try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_torchsde_objects import * # noqa F403 + else: + from .schedulers import DPMSolverSDEScheduler + + try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipelines import ( + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + AmusedImg2ImgPipeline, + AmusedInpaintPipeline, + AmusedPipeline, + AnimateDiffPipeline, + AnimateDiffVideoToVideoPipeline, + AudioLDM2Pipeline, + AudioLDM2ProjectionModel, + AudioLDM2UNet2DConditionModel, + AudioLDMPipeline, + CLIPImageProjection, + CycleDiffusionPipeline, + I2VGenXLPipeline, + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ImageTextPipelineOutput, + Kandinsky3Img2ImgPipeline, + Kandinsky3Pipeline, + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyImg2ImgPipeline, + KandinskyInpaintCombinedPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, + KandinskyV22CombinedPipeline, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgCombinedPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintCombinedPipeline, + KandinskyV22InpaintPipeline, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + LatentConsistencyModelImg2ImgPipeline, + LatentConsistencyModelPipeline, + LDMTextToImagePipeline, + LEditsPPPipelineStableDiffusion, + LEditsPPPipelineStableDiffusionXL, + MusicLDMPipeline, + PaintByExamplePipeline, + PIAPipeline, + PixArtAlphaPipeline, + SemanticStableDiffusionPipeline, + ShapEImg2ImgPipeline, + ShapEPipeline, + StableCascadeCombinedPipeline, + 
StableCascadeDecoderPipeline, + StableCascadePriorPipeline, + StableDiffusionAdapterPipeline, + StableDiffusionAttendAndExcitePipeline, + StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + StableDiffusionDepth2ImgPipeline, + StableDiffusionDiffEditPipeline, + StableDiffusionGLIGENPipeline, + StableDiffusionGLIGENTextImagePipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionInstructPix2PixPipeline, + StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, + StableDiffusionModelEditingPipeline, + StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPipeline, + StableDiffusionPipelineSafe, + StableDiffusionPix2PixZeroPipeline, + StableDiffusionSAGPipeline, + StableDiffusionUpscalePipeline, + StableDiffusionXLAdapterPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPipeline, + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLInstructPix2PixPipeline, + StableDiffusionXLPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + StableVideoDiffusionPipeline, + TextToVideoSDPipeline, + TextToVideoZeroPipeline, + TextToVideoZeroSDXLPipeline, + UnCLIPImageVariationPipeline, + UnCLIPPipeline, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VideoToVideoSDPipeline, + VQDiffusionPipeline, + WuerstchenCombinedPipeline, + WuerstchenDecoderPipeline, + WuerstchenPriorPipeline, + ) + + try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_k_diffusion_objects import * # noqa F403 + else: + from .pipelines import StableDiffusionKDiffusionPipeline, StableDiffusionXLKDiffusionPipeline + + try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_onnx_objects import * # noqa F403 + else: + from .pipelines import ( + OnnxStableDiffusionImg2ImgPipeline, + OnnxStableDiffusionInpaintPipeline, + OnnxStableDiffusionInpaintPipelineLegacy, + OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, + StableDiffusionOnnxPipeline, + ) + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_librosa_objects import * # noqa F403 + else: + from .pipelines import AudioDiffusionPipeline, Mel + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + else: + from .pipelines import SpectrogramDiffusionPipeline + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_flax_objects import * # noqa F403 + else: + from 
.models.controlnet_flax import FlaxControlNetModel + from .models.modeling_flax_utils import FlaxModelMixin + from .models.unets.unet_2d_condition_flax import FlaxUNet2DConditionModel + from .models.vae_flax import FlaxAutoencoderKL + from .pipelines import FlaxDiffusionPipeline + from .schedulers import ( + FlaxDDIMScheduler, + FlaxDDPMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxEulerDiscreteScheduler, + FlaxKarrasVeScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, + FlaxSchedulerMixin, + FlaxScoreSdeVeScheduler, + ) + + try: + if not (is_flax_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_flax_and_transformers_objects import * # noqa F403 + else: + from .pipelines import ( + FlaxStableDiffusionControlNetPipeline, + FlaxStableDiffusionImg2ImgPipeline, + FlaxStableDiffusionInpaintPipeline, + FlaxStableDiffusionPipeline, + FlaxStableDiffusionXLPipeline, + ) + + try: + if not (is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_note_seq_objects import * # noqa F403 + else: + from .pipelines import MidiProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + extra_objects={"__version__": __version__}, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/__init__.py new file mode 100644 index 000000000..8208283f6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from argparse import ArgumentParser + + +class BaseDiffusersCLICommand(ABC): + @staticmethod + @abstractmethod + def register_subcommand(parser: ArgumentParser): + raise NotImplementedError() + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/diffusers_cli.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/diffusers_cli.py new file mode 100644 index 000000000..f582c3bcd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/diffusers_cli.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from argparse import ArgumentParser + +from .env import EnvironmentCommand +from .fp16_safetensors import FP16SafetensorsCommand + + +def main(): + parser = ArgumentParser("Diffusers CLI tool", usage="diffusers-cli []") + commands_parser = parser.add_subparsers(help="diffusers-cli command helpers") + + # Register commands + EnvironmentCommand.register_subcommand(commands_parser) + FP16SafetensorsCommand.register_subcommand(commands_parser) + + # Let's go + args = parser.parse_args() + + if not hasattr(args, "func"): + parser.print_help() + exit(1) + + # Run + service = args.func(args) + service.run() + + +if __name__ == "__main__": + main() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/env.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/env.py new file mode 100644 index 000000000..baa69b361 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/env.py @@ -0,0 +1,84 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import platform +from argparse import ArgumentParser + +import huggingface_hub + +from .. import __version__ as version +from ..utils import is_accelerate_available, is_torch_available, is_transformers_available, is_xformers_available +from . 
import BaseDiffusersCLICommand + + +def info_command_factory(_): + return EnvironmentCommand() + + +class EnvironmentCommand(BaseDiffusersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + download_parser = parser.add_parser("env") + download_parser.set_defaults(func=info_command_factory) + + def run(self): + hub_version = huggingface_hub.__version__ + + pt_version = "not installed" + pt_cuda_available = "NA" + if is_torch_available(): + import torch + + pt_version = torch.__version__ + pt_cuda_available = torch.cuda.is_available() + + transformers_version = "not installed" + if is_transformers_available(): + import transformers + + transformers_version = transformers.__version__ + + accelerate_version = "not installed" + if is_accelerate_available(): + import accelerate + + accelerate_version = accelerate.__version__ + + xformers_version = "not installed" + if is_xformers_available(): + import xformers + + xformers_version = xformers.__version__ + + info = { + "`diffusers` version": version, + "Platform": platform.platform(), + "Python version": platform.python_version(), + "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})", + "Huggingface_hub version": hub_version, + "Transformers version": transformers_version, + "Accelerate version": accelerate_version, + "xFormers version": xformers_version, + "Using GPU in script?": "", + "Using distributed or parallel set-up in script?": "", + } + + print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n") + print(self.format_dict(info)) + + return info + + @staticmethod + def format_dict(d): + return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n" diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/fp16_safetensors.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/fp16_safetensors.py new file mode 100644 index 000000000..b26b8816b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/fp16_safetensors.py @@ -0,0 +1,132 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Usage example: + diffusers-cli fp16_safetensors --ckpt_id=openai/shap-e --fp16 --use_safetensors +""" + +import glob +import json +import warnings +from argparse import ArgumentParser, Namespace +from importlib import import_module + +import huggingface_hub +import torch +from huggingface_hub import hf_hub_download +from packaging import version + +from ..utils import logging +from . import BaseDiffusersCLICommand + + +def conversion_command_factory(args: Namespace): + if args.use_auth_token: + warnings.warn( + "The `--use_auth_token` flag is deprecated and will be removed in a future version. Authentication is now" + " handled automatically if user is logged in." 
+ ) + return FP16SafetensorsCommand(args.ckpt_id, args.fp16, args.use_safetensors) + + +class FP16SafetensorsCommand(BaseDiffusersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + conversion_parser = parser.add_parser("fp16_safetensors") + conversion_parser.add_argument( + "--ckpt_id", + type=str, + help="Repo id of the checkpoints on which to run the conversion. Example: 'openai/shap-e'.", + ) + conversion_parser.add_argument( + "--fp16", action="store_true", help="If serializing the variables in FP16 precision." + ) + conversion_parser.add_argument( + "--use_safetensors", action="store_true", help="If serializing in the safetensors format." + ) + conversion_parser.add_argument( + "--use_auth_token", + action="store_true", + help="When working with checkpoints having private visibility. When used `huggingface-cli login` needs to be run beforehand.", + ) + conversion_parser.set_defaults(func=conversion_command_factory) + + def __init__(self, ckpt_id: str, fp16: bool, use_safetensors: bool): + self.logger = logging.get_logger("diffusers-cli/fp16_safetensors") + self.ckpt_id = ckpt_id + self.local_ckpt_dir = f"/tmp/{ckpt_id}" + self.fp16 = fp16 + + self.use_safetensors = use_safetensors + + if not self.use_safetensors and not self.fp16: + raise NotImplementedError( + "When `use_safetensors` and `fp16` both are False, then this command is of no use." + ) + + def run(self): + if version.parse(huggingface_hub.__version__) < version.parse("0.9.0"): + raise ImportError( + "The huggingface_hub version must be >= 0.9.0 to use this command. Please update your huggingface_hub" + " installation." + ) + else: + from huggingface_hub import create_commit + from huggingface_hub._commit_api import CommitOperationAdd + + model_index = hf_hub_download(repo_id=self.ckpt_id, filename="model_index.json") + with open(model_index, "r") as f: + pipeline_class_name = json.load(f)["_class_name"] + pipeline_class = getattr(import_module("diffusers"), pipeline_class_name) + self.logger.info(f"Pipeline class imported: {pipeline_class_name}.") + + # Load the appropriate pipeline. We could have use `DiffusionPipeline` + # here, but just to avoid any rough edge cases. + pipeline = pipeline_class.from_pretrained( + self.ckpt_id, torch_dtype=torch.float16 if self.fp16 else torch.float32 + ) + pipeline.save_pretrained( + self.local_ckpt_dir, + safe_serialization=True if self.use_safetensors else False, + variant="fp16" if self.fp16 else None, + ) + self.logger.info(f"Pipeline locally saved to {self.local_ckpt_dir}.") + + # Fetch all the paths. + if self.fp16: + modified_paths = glob.glob(f"{self.local_ckpt_dir}/*/*.fp16.*") + elif self.use_safetensors: + modified_paths = glob.glob(f"{self.local_ckpt_dir}/*/*.safetensors") + + # Prepare for the PR. + commit_message = f"Serialize variables with FP16: {self.fp16} and safetensors: {self.use_safetensors}." + operations = [] + for path in modified_paths: + operations.append(CommitOperationAdd(path_in_repo="/".join(path.split("/")[4:]), path_or_fileobj=path)) + + # Open the PR. + commit_description = ( + "Variables converted by the [`diffusers`' `fp16_safetensors`" + " CLI](https://github.com/huggingface/diffusers/blob/main/src/diffusers/commands/fp16_safetensors.py)." 
+ ) + hub_pr_url = create_commit( + repo_id=self.ckpt_id, + operations=operations, + commit_message=commit_message, + commit_description=commit_description, + repo_type="model", + create_pr=True, + ).pr_url + self.logger.info(f"PR created here: {hub_pr_url}.") diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/configuration_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/configuration_utils.py new file mode 100644 index 000000000..189ef4380 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/configuration_utils.py @@ -0,0 +1,703 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ConfigMixin base class and utilities.""" +import dataclasses +import functools +import importlib +import inspect +import json +import os +import re +from collections import OrderedDict +from pathlib import PosixPath +from typing import Any, Dict, Tuple, Union + +import numpy as np +from huggingface_hub import create_repo, hf_hub_download +from huggingface_hub.utils import ( + EntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, + validate_hf_hub_args, +) +from requests import HTTPError + +from . import __version__ +from .utils import ( + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + DummyObject, + deprecate, + extract_commit_hash, + http_user_agent, + logging, +) + + +logger = logging.get_logger(__name__) + +_re_configuration_file = re.compile(r"config\.(.*)\.json") + + +class FrozenDict(OrderedDict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + for key, value in self.items(): + setattr(self, key, value) + + self.__frozen = True + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __setattr__(self, name, value): + if hasattr(self, "__frozen") and self.__frozen: + raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.") + super().__setattr__(name, value) + + def __setitem__(self, name, value): + if hasattr(self, "__frozen") and self.__frozen: + raise Exception(f"You cannot use ``__setattr__`` on a {self.__class__.__name__} instance.") + super().__setitem__(name, value) + + +class ConfigMixin: + r""" + Base class for all configuration classes. All configuration parameters are stored under `self.config`. 
Also + provides the [`~ConfigMixin.from_config`] and [`~ConfigMixin.save_config`] methods for loading, downloading, and + saving classes that inherit from [`ConfigMixin`]. + + Class attributes: + - **config_name** (`str`) -- A filename under which the config should stored when calling + [`~ConfigMixin.save_config`] (should be overridden by parent class). + - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be + overridden by subclass). + - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass). + - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the `init` function + should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by + subclass). + """ + + config_name = None + ignore_for_config = [] + has_compatibles = False + + _deprecated_kwargs = [] + + def register_to_config(self, **kwargs): + if self.config_name is None: + raise NotImplementedError(f"Make sure that {self.__class__} has defined a class name `config_name`") + # Special case for `kwargs` used in deprecation warning added to schedulers + # TODO: remove this when we remove the deprecation warning, and the `kwargs` argument, + # or solve in a more general way. + kwargs.pop("kwargs", None) + + if not hasattr(self, "_internal_dict"): + internal_dict = kwargs + else: + previous_dict = dict(self._internal_dict) + internal_dict = {**self._internal_dict, **kwargs} + logger.debug(f"Updating config from {previous_dict} to {internal_dict}") + + self._internal_dict = FrozenDict(internal_dict) + + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 + + This function is mostly copied from PyTorch's __getattr__ overwrite: + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'scheduler.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False) + return self._internal_dict[name] + + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save a configuration object to the directory specified in `save_directory` so that it can be reloaded using the + [`~ConfigMixin.from_config`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file is saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
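+
+ Example (a minimal sketch of saving a config to disk; the repo id and output directory are placeholders):
+
+ ```python
+ >>> from diffusers import DDPMScheduler
+
+ >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
+ >>> # Writes the class's `config_name` JSON file into the given directory.
+ >>> scheduler.save_config("./my-scheduler-config")
+ ```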
+ """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_config` + output_config_file = os.path.join(save_directory, self.config_name) + + self.to_json_file(output_config_file) + logger.info(f"Configuration saved in {output_config_file}") + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", False) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) + + @classmethod + def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs): + r""" + Instantiate a Python class from a config dictionary. + + Parameters: + config (`Dict[str, Any]`): + A config dictionary from which the Python class is instantiated. Make sure to only load configuration + files of compatible classes. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + Whether kwargs that are not consumed by the Python class should be returned or not. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it is loaded) and initiate the Python class. + `**kwargs` are passed directly to the underlying scheduler/model's `__init__` method and eventually + overwrite the same named arguments in `config`. + + Returns: + [`ModelMixin`] or [`SchedulerMixin`]: + A model or scheduler object instantiated from a config dictionary. + + Examples: + + ```python + >>> from diffusers import DDPMScheduler, DDIMScheduler, PNDMScheduler + + >>> # Download scheduler from huggingface.co and cache. + >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32") + + >>> # Instantiate DDIM scheduler class with same config as DDPM + >>> scheduler = DDIMScheduler.from_config(scheduler.config) + + >>> # Instantiate PNDM scheduler class with same config as DDPM + >>> scheduler = PNDMScheduler.from_config(scheduler.config) + ``` + """ + # <===== TO BE REMOVED WITH DEPRECATION + # TODO(Patrick) - make sure to remove the following lines when config=="model_path" is deprecated + if "pretrained_model_name_or_path" in kwargs: + config = kwargs.pop("pretrained_model_name_or_path") + + if config is None: + raise ValueError("Please make sure to provide a config as the first positional argument.") + # ======> + + if not isinstance(config, dict): + deprecation_message = "It is deprecated to pass a pretrained model name or path to `from_config`." + if "Scheduler" in cls.__name__: + deprecation_message += ( + f"If you were trying to load a scheduler, please use {cls}.from_pretrained(...) instead." + " Otherwise, please make sure to pass a configuration dictionary instead. This functionality will" + " be removed in v1.0.0." + ) + elif "Model" in cls.__name__: + deprecation_message += ( + f"If you were trying to load a model, please use {cls}.load_config(...) followed by" + f" {cls}.from_config(...) instead. Otherwise, please make sure to pass a configuration dictionary" + " instead. This functionality will be removed in v1.0.0." 
+ ) + deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False) + config, kwargs = cls.load_config(pretrained_model_name_or_path=config, return_unused_kwargs=True, **kwargs) + + init_dict, unused_kwargs, hidden_dict = cls.extract_init_dict(config, **kwargs) + + # Allow dtype to be specified on initialization + if "dtype" in unused_kwargs: + init_dict["dtype"] = unused_kwargs.pop("dtype") + + # add possible deprecated kwargs + for deprecated_kwarg in cls._deprecated_kwargs: + if deprecated_kwarg in unused_kwargs: + init_dict[deprecated_kwarg] = unused_kwargs.pop(deprecated_kwarg) + + # Return model and optionally state and/or unused_kwargs + model = cls(**init_dict) + + # make sure to also save config parameters that might be used for compatible classes + # update _class_name + if "_class_name" in hidden_dict: + hidden_dict["_class_name"] = cls.__name__ + + model.register_to_config(**hidden_dict) + + # add hidden kwargs of compatible classes to unused_kwargs + unused_kwargs = {**unused_kwargs, **hidden_dict} + + if return_unused_kwargs: + return (model, unused_kwargs) + else: + return model + + @classmethod + def get_config_dict(cls, *args, **kwargs): + deprecation_message = ( + f" The function get_config_dict is deprecated. Please use {cls}.load_config instead. This function will be" + " removed in version v1.0.0" + ) + deprecate("get_config_dict", "1.0.0", deprecation_message, standard_warn=False) + return cls.load_config(*args, **kwargs) + + @classmethod + @validate_hf_hub_args + def load_config( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + return_unused_kwargs=False, + return_commit_hash=False, + **kwargs, + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + r""" + Load a model or scheduler configuration. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with + [`~ConfigMixin.save_config`]. + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. 
+ revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + return_unused_kwargs (`bool`, *optional*, defaults to `False): + Whether unused keyword arguments of the config are returned. + return_commit_hash (`bool`, *optional*, defaults to `False): + Whether the `commit_hash` of the loaded configuration are returned. + + Returns: + `dict`: + A dictionary of all the parameters stored in a JSON configuration file. + + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + _ = kwargs.pop("mirror", None) + subfolder = kwargs.pop("subfolder", None) + user_agent = kwargs.pop("user_agent", {}) + + user_agent = {**user_agent, "file_type": "config"} + user_agent = http_user_agent(user_agent) + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + + if cls.config_name is None: + raise ValueError( + "`self.config_name` is not defined. Note that one should not load a config from " + "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" + ) + + if os.path.isfile(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): + # Load from a PyTorch checkpoint + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) + elif subfolder is not None and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + ): + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + else: + raise EnvironmentError( + f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." + ) + else: + try: + # Load from URL or cache if already cached + config_file = hf_hub_download( + pretrained_model_name_or_path, + filename=cls.config_name, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + subfolder=subfolder, + revision=revision, + ) + except RepositoryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier" + " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a" + " token having permission to this repo with `token` or log in with `huggingface-cli login`." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for" + " this model name. Check the model page at" + f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." + ) + except EntryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}." 
+ ) + except HTTPError as err: + raise EnvironmentError( + "There was a specific connection error when trying to load" + f" {pretrained_model_name_or_path}:\n{err}" + ) + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" + f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" + f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to" + " run the library in offline mode at" + " 'https://huggingface.co/docs/diffusers/installation#offline-mode'." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing a {cls.config_name} file" + ) + + try: + # Load config dict + config_dict = cls._dict_from_json_file(config_file) + + commit_hash = extract_commit_hash(config_file) + except (json.JSONDecodeError, UnicodeDecodeError): + raise EnvironmentError(f"It looks like the config file at '{config_file}' is not a valid JSON file.") + + if not (return_unused_kwargs or return_commit_hash): + return config_dict + + outputs = (config_dict,) + + if return_unused_kwargs: + outputs += (kwargs,) + + if return_commit_hash: + outputs += (commit_hash,) + + return outputs + + @staticmethod + def _get_init_keys(cls): + return set(dict(inspect.signature(cls.__init__).parameters).keys()) + + @classmethod + def extract_init_dict(cls, config_dict, **kwargs): + # Skip keys that were not present in the original config, so default __init__ values were used + used_defaults = config_dict.get("_use_default_values", []) + config_dict = {k: v for k, v in config_dict.items() if k not in used_defaults and k != "_use_default_values"} + + # 0. Copy origin config dict + original_dict = dict(config_dict.items()) + + # 1. Retrieve expected config attributes from __init__ signature + expected_keys = cls._get_init_keys(cls) + expected_keys.remove("self") + # remove general kwargs if present in dict + if "kwargs" in expected_keys: + expected_keys.remove("kwargs") + # remove flax internal keys + if hasattr(cls, "_flax_internal_args"): + for arg in cls._flax_internal_args: + expected_keys.remove(arg) + + # 2. 
Remove attributes that cannot be expected from expected config attributes + # remove keys to be ignored + if len(cls.ignore_for_config) > 0: + expected_keys = expected_keys - set(cls.ignore_for_config) + + # load diffusers library to import compatible and original scheduler + diffusers_library = importlib.import_module(__name__.split(".")[0]) + + if cls.has_compatibles: + compatible_classes = [c for c in cls._get_compatibles() if not isinstance(c, DummyObject)] + else: + compatible_classes = [] + + expected_keys_comp_cls = set() + for c in compatible_classes: + expected_keys_c = cls._get_init_keys(c) + expected_keys_comp_cls = expected_keys_comp_cls.union(expected_keys_c) + expected_keys_comp_cls = expected_keys_comp_cls - cls._get_init_keys(cls) + config_dict = {k: v for k, v in config_dict.items() if k not in expected_keys_comp_cls} + + # remove attributes from orig class that cannot be expected + orig_cls_name = config_dict.pop("_class_name", cls.__name__) + if ( + isinstance(orig_cls_name, str) + and orig_cls_name != cls.__name__ + and hasattr(diffusers_library, orig_cls_name) + ): + orig_cls = getattr(diffusers_library, orig_cls_name) + unexpected_keys_from_orig = cls._get_init_keys(orig_cls) - expected_keys + config_dict = {k: v for k, v in config_dict.items() if k not in unexpected_keys_from_orig} + elif not isinstance(orig_cls_name, str) and not isinstance(orig_cls_name, (list, tuple)): + raise ValueError( + "Make sure that the `_class_name` is of type string or list of string (for custom pipelines)." + ) + + # remove private attributes + config_dict = {k: v for k, v in config_dict.items() if not k.startswith("_")} + + # 3. Create keyword arguments that will be passed to __init__ from expected keyword arguments + init_dict = {} + for key in expected_keys: + # if config param is passed to kwarg and is present in config dict + # it should overwrite existing config dict key + if key in kwargs and key in config_dict: + config_dict[key] = kwargs.pop(key) + + if key in kwargs: + # overwrite key + init_dict[key] = kwargs.pop(key) + elif key in config_dict: + # use value from config dict + init_dict[key] = config_dict.pop(key) + + # 4. Give nice warning if unexpected values have been passed + if len(config_dict) > 0: + logger.warning( + f"The config attributes {config_dict} were passed to {cls.__name__}, " + "but are not expected and will be ignored. Please verify your " + f"{cls.config_name} configuration file." + ) + + # 5. Give nice info if config attributes are initialized to default because they have not been passed + passed_keys = set(init_dict.keys()) + if len(expected_keys - passed_keys) > 0: + logger.info( + f"{expected_keys - passed_keys} was not found in config. Values will be initialized to default values." + ) + + # 6. Define unused keyword arguments + unused_kwargs = {**config_dict, **kwargs} + + # 7. Define "hidden" config parameters that were saved for compatible classes + hidden_config_dict = {k: v for k, v in original_dict.items() if k not in init_dict} + + return init_dict, unused_kwargs, hidden_config_dict + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + @property + def config(self) -> Dict[str, Any]: + """ + Returns the config of the class as a frozen dictionary + + Returns: + `Dict[str, Any]`: Config of the class. 
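+
+ Example (a minimal sketch of reading a registered parameter; the repo id is a placeholder and the printed value assumes the default DDPM config):
+
+ ```python
+ >>> from diffusers import DDPMScheduler
+
+ >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cifar10-32")
+ >>> # Registered parameters are read back from the frozen config.
+ >>> scheduler.config.num_train_timesteps
+ 1000
+ ```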
+ """ + return self._internal_dict + + def to_json_string(self) -> str: + """ + Serializes the configuration instance to a JSON string. + + Returns: + `str`: + String containing all the attributes that make up the configuration instance in JSON format. + """ + config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {} + config_dict["_class_name"] = self.__class__.__name__ + config_dict["_diffusers_version"] = __version__ + + def to_json_saveable(value): + if isinstance(value, np.ndarray): + value = value.tolist() + elif isinstance(value, PosixPath): + value = str(value) + return value + + config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()} + # Don't save "_ignore_files" or "_use_default_values" + config_dict.pop("_ignore_files", None) + config_dict.pop("_use_default_values", None) + + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save the configuration instance's parameters to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file to save a configuration instance's parameters. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + +def register_to_config(init): + r""" + Decorator to apply on the init of classes inheriting from [`ConfigMixin`] so that all the arguments are + automatically sent to `self.register_for_config`. To ignore a specific argument accepted by the init but that + shouldn't be registered in the config, use the `ignore_for_config` class variable + + Warning: Once decorated, all private arguments (beginning with an underscore) are trashed and not sent to the init! + """ + + @functools.wraps(init) + def inner_init(self, *args, **kwargs): + # Ignore private kwargs in the init. + init_kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_")} + config_init_kwargs = {k: v for k, v in kwargs.items() if k.startswith("_")} + if not isinstance(self, ConfigMixin): + raise RuntimeError( + f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does " + "not inherit from `ConfigMixin`." + ) + + ignore = getattr(self, "ignore_for_config", []) + # Get positional arguments aligned with kwargs + new_kwargs = {} + signature = inspect.signature(init) + parameters = { + name: p.default for i, (name, p) in enumerate(signature.parameters.items()) if i > 0 and name not in ignore + } + for arg, name in zip(args, parameters.keys()): + new_kwargs[name] = arg + + # Then add all kwargs + new_kwargs.update( + { + k: init_kwargs.get(k, default) + for k, default in parameters.items() + if k not in ignore and k not in new_kwargs + } + ) + + # Take note of the parameters that were not present in the loaded config + if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0: + new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs)) + + new_kwargs = {**config_init_kwargs, **new_kwargs} + getattr(self, "register_to_config")(**new_kwargs) + init(self, *args, **init_kwargs) + + return inner_init + + +def flax_register_to_config(cls): + original_init = cls.__init__ + + @functools.wraps(original_init) + def init(self, *args, **kwargs): + if not isinstance(self, ConfigMixin): + raise RuntimeError( + f"`@register_for_config` was applied to {self.__class__.__name__} init method, but this class does " + "not inherit from `ConfigMixin`." + ) + + # Ignore private kwargs in the init. 
Retrieve all passed attributes + init_kwargs = dict(kwargs.items()) + + # Retrieve default values + fields = dataclasses.fields(self) + default_kwargs = {} + for field in fields: + # ignore flax specific attributes + if field.name in self._flax_internal_args: + continue + if type(field.default) == dataclasses._MISSING_TYPE: + default_kwargs[field.name] = None + else: + default_kwargs[field.name] = getattr(self, field.name) + + # Make sure init_kwargs override default kwargs + new_kwargs = {**default_kwargs, **init_kwargs} + # dtype should be part of `init_kwargs`, but not `new_kwargs` + if "dtype" in new_kwargs: + new_kwargs.pop("dtype") + + # Get positional arguments aligned with kwargs + for i, arg in enumerate(args): + name = fields[i].name + new_kwargs[name] = arg + + # Take note of the parameters that were not present in the loaded config + if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0: + new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs)) + + getattr(self, "register_to_config")(**new_kwargs) + original_init(self, *args, **kwargs) + + cls.__init__ = init + return cls diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_check.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_check.py new file mode 100644 index 000000000..0728b3a7c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_check.py @@ -0,0 +1,34 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .dependency_versions_table import deps +from .utils.versions import require_version, require_version_core + + +# define which module versions we always want to check at run time +# (usually the ones defined in `install_requires` in setup.py) +# +# order specific notes: +# - tqdm must be checked before tokenizers + +pkgs_to_check_at_runtime = "python requests filelock numpy".split() +for pkg in pkgs_to_check_at_runtime: + if pkg in deps: + require_version_core(deps[pkg]) + else: + raise ValueError(f"can't find {pkg} in {deps.keys()}, check dependency_versions_table.py") + + +def dep_version_check(pkg, hint=None): + require_version(deps[pkg], hint) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_table.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_table.py new file mode 100644 index 000000000..e92a486bf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_table.py @@ -0,0 +1,45 @@ +# THIS FILE HAS BEEN AUTOGENERATED. To update: +# 1. modify the `_deps` dict in setup.py +# 2. 
run `make deps_table_update` +deps = { + "Pillow": "Pillow", + "accelerate": "accelerate>=0.11.0", + "compel": "compel==0.1.8", + "datasets": "datasets", + "filelock": "filelock", + "flax": "flax>=0.4.1", + "hf-doc-builder": "hf-doc-builder>=0.3.0", + "huggingface-hub": "huggingface-hub>=0.20.2", + "requests-mock": "requests-mock==1.10.0", + "importlib_metadata": "importlib_metadata", + "invisible-watermark": "invisible-watermark>=0.2.0", + "isort": "isort>=5.5.4", + "jax": "jax>=0.4.1", + "jaxlib": "jaxlib>=0.4.1", + "Jinja2": "Jinja2", + "k-diffusion": "k-diffusion>=0.0.12", + "torchsde": "torchsde", + "note_seq": "note_seq", + "librosa": "librosa", + "numpy": "numpy", + "parameterized": "parameterized", + "peft": "peft>=0.6.0", + "protobuf": "protobuf>=3.20.3,<4", + "pytest": "pytest", + "pytest-timeout": "pytest-timeout", + "pytest-xdist": "pytest-xdist", + "python": "python>=3.8.0", + "ruff": "ruff==0.1.5", + "safetensors": "safetensors>=0.3.1", + "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", + "GitPython": "GitPython<3.1.19", + "scipy": "scipy", + "onnx": "onnx", + "regex": "regex!=2019.12.17", + "requests": "requests", + "tensorboard": "tensorboard", + "torch": "torch>=1.4", + "torchvision": "torchvision", + "transformers": "transformers>=4.25.1", + "urllib3": "urllib3<=2.0.0", +} diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/README.md new file mode 100644 index 000000000..81a9de81c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/README.md @@ -0,0 +1,5 @@ +# 🧨 Diffusers Experimental + +We are adding experimental code to support novel applications and usages of the Diffusers library. +Currently, the following experiments are supported: +* Reinforcement learning via an implementation of the [Diffuser](https://arxiv.org/abs/2205.09991) model. \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/__init__.py new file mode 100644 index 000000000..ebc815540 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/__init__.py @@ -0,0 +1 @@ +from .rl import ValueGuidedRLPipeline diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/__init__.py new file mode 100644 index 000000000..7b338d317 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/__init__.py @@ -0,0 +1 @@ +from .value_guided_sampling import ValueGuidedRLPipeline diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py new file mode 100644 index 000000000..2f9de8574 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py @@ -0,0 +1,153 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import tqdm + +from ...models.unets.unet_1d import UNet1DModel +from ...pipelines import DiffusionPipeline +from ...utils.dummy_pt_objects import DDPMScheduler +from ...utils.torch_utils import randn_tensor + + +class ValueGuidedRLPipeline(DiffusionPipeline): + r""" + Pipeline for value-guided sampling from a diffusion model trained to predict sequences of states. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + value_function ([`UNet1DModel`]): + A specialized UNet for fine-tuning trajectories base on reward. + unet ([`UNet1DModel`]): + UNet architecture to denoise the encoded trajectories. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded trajectories. Default for this + application is [`DDPMScheduler`]. + env (): + An environment following the OpenAI gym API to act in. For now only Hopper has pretrained models. + """ + + def __init__( + self, + value_function: UNet1DModel, + unet: UNet1DModel, + scheduler: DDPMScheduler, + env, + ): + super().__init__() + + self.register_modules(value_function=value_function, unet=unet, scheduler=scheduler, env=env) + + self.data = env.get_dataset() + self.means = {} + for key in self.data.keys(): + try: + self.means[key] = self.data[key].mean() + except: # noqa: E722 + pass + self.stds = {} + for key in self.data.keys(): + try: + self.stds[key] = self.data[key].std() + except: # noqa: E722 + pass + self.state_dim = env.observation_space.shape[0] + self.action_dim = env.action_space.shape[0] + + def normalize(self, x_in, key): + return (x_in - self.means[key]) / self.stds[key] + + def de_normalize(self, x_in, key): + return x_in * self.stds[key] + self.means[key] + + def to_torch(self, x_in): + if isinstance(x_in, dict): + return {k: self.to_torch(v) for k, v in x_in.items()} + elif torch.is_tensor(x_in): + return x_in.to(self.unet.device) + return torch.tensor(x_in, device=self.unet.device) + + def reset_x0(self, x_in, cond, act_dim): + for key, val in cond.items(): + x_in[:, key, act_dim:] = val.clone() + return x_in + + def run_diffusion(self, x, conditions, n_guide_steps, scale): + batch_size = x.shape[0] + y = None + for i in tqdm.tqdm(self.scheduler.timesteps): + # create batch of timesteps to pass into model + timesteps = torch.full((batch_size,), i, device=self.unet.device, dtype=torch.long) + for _ in range(n_guide_steps): + with torch.enable_grad(): + x.requires_grad_() + + # permute to match dimension for pre-trained models + y = self.value_function(x.permute(0, 2, 1), timesteps).sample + grad = torch.autograd.grad([y.sum()], [x])[0] + + posterior_variance = self.scheduler._get_variance(i) + model_std = torch.exp(0.5 * posterior_variance) + grad = model_std * grad + + grad[timesteps < 2] = 0 + x = x.detach() + x = x + scale * grad + x 
= self.reset_x0(x, conditions, self.action_dim) + + prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1) + + # TODO: verify deprecation of this kwarg + x = self.scheduler.step(prev_x, i, x)["prev_sample"] + + # apply conditions to the trajectory (set the initial state) + x = self.reset_x0(x, conditions, self.action_dim) + x = self.to_torch(x) + return x, y + + def __call__(self, obs, batch_size=64, planning_horizon=32, n_guide_steps=2, scale=0.1): + # normalize the observations and create batch dimension + obs = self.normalize(obs, "observations") + obs = obs[None].repeat(batch_size, axis=0) + + conditions = {0: self.to_torch(obs)} + shape = (batch_size, planning_horizon, self.state_dim + self.action_dim) + + # generate initial noise and apply our conditions (to make the trajectories start at current state) + x1 = randn_tensor(shape, device=self.unet.device) + x = self.reset_x0(x1, conditions, self.action_dim) + x = self.to_torch(x) + + # run the diffusion process + x, y = self.run_diffusion(x, conditions, n_guide_steps, scale) + + # sort output trajectories by value + sorted_idx = y.argsort(0, descending=True).squeeze() + sorted_values = x[sorted_idx] + actions = sorted_values[:, :, : self.action_dim] + actions = actions.detach().cpu().numpy() + denorm_actions = self.de_normalize(actions, key="actions") + + # select the action with the highest value + if y is not None: + selected_index = 0 + else: + # if we didn't run value guiding, select a random action + selected_index = np.random.randint(0, batch_size) + + denorm_actions = denorm_actions[selected_index, 0] + return denorm_actions diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/image_processor.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/image_processor.py new file mode 100644 index 000000000..daeb8fd6f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/image_processor.py @@ -0,0 +1,990 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from PIL import Image, ImageFilter, ImageOps + +from .configuration_utils import ConfigMixin, register_to_config +from .utils import CONFIG_NAME, PIL_INTERPOLATION, deprecate + + +PipelineImageInput = Union[ + PIL.Image.Image, + np.ndarray, + torch.FloatTensor, + List[PIL.Image.Image], + List[np.ndarray], + List[torch.FloatTensor], +] + +PipelineDepthInput = PipelineImageInput + + +class VaeImageProcessor(ConfigMixin): + """ + Image processor for VAE. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept + `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method. 
+ vae_scale_factor (`int`, *optional*, defaults to `8`): + VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor. + resample (`str`, *optional*, defaults to `lanczos`): + Resampling filter to use when resizing the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image to [-1,1]. + do_binarize (`bool`, *optional*, defaults to `False`): + Whether to binarize the image to 0/1. + do_convert_rgb (`bool`, *optional*, defaults to be `False`): + Whether to convert the images to RGB format. + do_convert_grayscale (`bool`, *optional*, defaults to be `False`): + Whether to convert the images to grayscale format. + """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = True, + do_binarize: bool = False, + do_convert_rgb: bool = False, + do_convert_grayscale: bool = False, + ): + super().__init__() + if do_convert_rgb and do_convert_grayscale: + raise ValueError( + "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`," + " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.", + " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`", + ) + self.config.do_convert_rgb = False + + @staticmethod + def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]: + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + @staticmethod + def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: + """ + Convert a PIL image or a list of PIL images to NumPy arrays. + """ + if not isinstance(images, list): + images = [images] + images = [np.array(image).astype(np.float32) / 255.0 for image in images] + images = np.stack(images, axis=0) + + return images + + @staticmethod + def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: + """ + Convert a NumPy image to a PyTorch tensor. + """ + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + @staticmethod + def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: + """ + Convert a PyTorch tensor to a NumPy image. + """ + images = images.cpu().permute(0, 2, 3, 1).float().numpy() + return images + + @staticmethod + def normalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + """ + Normalize an image array to [-1,1]. + """ + return 2.0 * images - 1.0 + + @staticmethod + def denormalize(images: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + """ + Denormalize an image array to [0,1]. + """ + return (images / 2 + 0.5).clamp(0, 1) + + @staticmethod + def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: + """ + Converts a PIL image to RGB format. + """ + image = image.convert("RGB") + + return image + + @staticmethod + def convert_to_grayscale(image: PIL.Image.Image) -> PIL.Image.Image: + """ + Converts a PIL image to grayscale format. 
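+
+ Example (a minimal sketch using a blank placeholder image):
+
+ ```python
+ >>> import PIL.Image
+ >>> from diffusers.image_processor import VaeImageProcessor
+
+ >>> image = PIL.Image.new("RGB", (64, 64))
+ >>> # `convert_to_grayscale` is a static method, so it can be called on the class.
+ >>> VaeImageProcessor.convert_to_grayscale(image).mode
+ 'L'
+ ```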
+ """ + image = image.convert("L") + + return image + + @staticmethod + def blur(image: PIL.Image.Image, blur_factor: int = 4) -> PIL.Image.Image: + """ + Applies Gaussian blur to an image. + """ + image = image.filter(ImageFilter.GaussianBlur(blur_factor)) + + return image + + @staticmethod + def get_crop_region(mask_image: PIL.Image.Image, width: int, height: int, pad=0): + """ + Finds a rectangular region that contains all masked ares in an image, and expands region to match the aspect ratio of the original image; + for example, if user drew mask in a 128x32 region, and the dimensions for processing are 512x512, the region will be expanded to 128x128. + + Args: + mask_image (PIL.Image.Image): Mask image. + width (int): Width of the image to be processed. + height (int): Height of the image to be processed. + pad (int, optional): Padding to be added to the crop region. Defaults to 0. + + Returns: + tuple: (x1, y1, x2, y2) represent a rectangular region that contains all masked ares in an image and matches the original aspect ratio. + """ + + mask_image = mask_image.convert("L") + mask = np.array(mask_image) + + # 1. find a rectangular region that contains all masked ares in an image + h, w = mask.shape + crop_left = 0 + for i in range(w): + if not (mask[:, i] == 0).all(): + break + crop_left += 1 + + crop_right = 0 + for i in reversed(range(w)): + if not (mask[:, i] == 0).all(): + break + crop_right += 1 + + crop_top = 0 + for i in range(h): + if not (mask[i] == 0).all(): + break + crop_top += 1 + + crop_bottom = 0 + for i in reversed(range(h)): + if not (mask[i] == 0).all(): + break + crop_bottom += 1 + + # 2. add padding to the crop region + x1, y1, x2, y2 = ( + int(max(crop_left - pad, 0)), + int(max(crop_top - pad, 0)), + int(min(w - crop_right + pad, w)), + int(min(h - crop_bottom + pad, h)), + ) + + # 3. expands crop region to match the aspect ratio of the image to be processed + ratio_crop_region = (x2 - x1) / (y2 - y1) + ratio_processing = width / height + + if ratio_crop_region > ratio_processing: + desired_height = (x2 - x1) / ratio_processing + desired_height_diff = int(desired_height - (y2 - y1)) + y1 -= desired_height_diff // 2 + y2 += desired_height_diff - desired_height_diff // 2 + if y2 >= mask_image.height: + diff = y2 - mask_image.height + y2 -= diff + y1 -= diff + if y1 < 0: + y2 -= y1 + y1 -= y1 + if y2 >= mask_image.height: + y2 = mask_image.height + else: + desired_width = (y2 - y1) * ratio_processing + desired_width_diff = int(desired_width - (x2 - x1)) + x1 -= desired_width_diff // 2 + x2 += desired_width_diff - desired_width_diff // 2 + if x2 >= mask_image.width: + diff = x2 - mask_image.width + x2 -= diff + x1 -= diff + if x1 < 0: + x2 -= x1 + x1 -= x1 + if x2 >= mask_image.width: + x2 = mask_image.width + + return x1, y1, x2, y2 + + def _resize_and_fill( + self, + image: PIL.Image.Image, + width: int, + height: int, + ) -> PIL.Image.Image: + """ + Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, filling empty with data from image. + + Args: + image: The image to resize. + width: The width to resize the image to. + height: The height to resize the image to. 
+ """ + + ratio = width / height + src_ratio = image.width / image.height + + src_w = width if ratio < src_ratio else image.width * height // image.height + src_h = height if ratio >= src_ratio else image.height * width // image.width + + resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"]) + res = Image.new("RGB", (width, height)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + + if ratio < src_ratio: + fill_height = height // 2 - src_h // 2 + if fill_height > 0: + res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0)) + res.paste( + resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)), + box=(0, fill_height + src_h), + ) + elif ratio > src_ratio: + fill_width = width // 2 - src_w // 2 + if fill_width > 0: + res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0)) + res.paste( + resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)), + box=(fill_width + src_w, 0), + ) + + return res + + def _resize_and_crop( + self, + image: PIL.Image.Image, + width: int, + height: int, + ) -> PIL.Image.Image: + """ + Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image within the dimensions, cropping the excess. + + Args: + image: The image to resize. + width: The width to resize the image to. + height: The height to resize the image to. + """ + ratio = width / height + src_ratio = image.width / image.height + + src_w = width if ratio > src_ratio else image.width * height // image.height + src_h = height if ratio <= src_ratio else image.height * width // image.width + + resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"]) + res = Image.new("RGB", (width, height)) + res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2)) + return res + + def resize( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], + height: int, + width: int, + resize_mode: str = "default", # "default", "fill", "crop" + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: + """ + Resize image. + + Args: + image (`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`): + The image input, can be a PIL image, numpy array or pytorch tensor. + height (`int`): + The height to resize to. + width (`int`): + The width to resize to. + resize_mode (`str`, *optional*, defaults to `default`): + The resize mode to use, can be one of `default` or `fill`. If `default`, will resize the image to fit + within the specified width and height, and it may not maintaining the original aspect ratio. + If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image + within the dimensions, filling empty with data from image. + If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image + within the dimensions, cropping the excess. + Note that resize_mode `fill` and `crop` are only supported for PIL image input. + + Returns: + `PIL.Image.Image`, `np.ndarray` or `torch.Tensor`: + The resized image. 
+ """ + if resize_mode != "default" and not isinstance(image, PIL.Image.Image): + raise ValueError(f"Only PIL image input is supported for resize_mode {resize_mode}") + if isinstance(image, PIL.Image.Image): + if resize_mode == "default": + image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) + elif resize_mode == "fill": + image = self._resize_and_fill(image, width, height) + elif resize_mode == "crop": + image = self._resize_and_crop(image, width, height) + else: + raise ValueError(f"resize_mode {resize_mode} is not supported") + + elif isinstance(image, torch.Tensor): + image = torch.nn.functional.interpolate( + image, + size=(height, width), + ) + elif isinstance(image, np.ndarray): + image = self.numpy_to_pt(image) + image = torch.nn.functional.interpolate( + image, + size=(height, width), + ) + image = self.pt_to_numpy(image) + return image + + def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image: + """ + Create a mask. + + Args: + image (`PIL.Image.Image`): + The image input, should be a PIL image. + + Returns: + `PIL.Image.Image`: + The binarized image. Values less than 0.5 are set to 0, values greater than 0.5 are set to 1. + """ + image[image < 0.5] = 0 + image[image >= 0.5] = 1 + + return image + + def get_default_height_width( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + ) -> Tuple[int, int]: + """ + This function return the height and width that are downscaled to the next integer multiple of + `vae_scale_factor`. + + Args: + image(`PIL.Image.Image`, `np.ndarray` or `torch.Tensor`): + The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have + shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should + have shape `[batch, channel, height, width]`. + height (`int`, *optional*, defaults to `None`): + The height in preprocessed image. If `None`, will use the height of `image` input. + width (`int`, *optional*`, defaults to `None`): + The width in preprocessed. If `None`, will use the width of the `image` input. + """ + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + elif isinstance(image, torch.Tensor): + height = image.shape[2] + else: + height = image.shape[1] + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + elif isinstance(image, torch.Tensor): + width = image.shape[3] + else: + width = image.shape[2] + + width, height = ( + x - x % self.config.vae_scale_factor for x in (width, height) + ) # resize to integer multiple of vae_scale_factor + + return height, width + + def preprocess( + self, + image: PipelineImageInput, + height: Optional[int] = None, + width: Optional[int] = None, + resize_mode: str = "default", # "default", "fill", "crop" + crops_coords: Optional[Tuple[int, int, int, int]] = None, + ) -> torch.Tensor: + """ + Preprocess the image input. + + Args: + image (`pipeline_image_input`): + The image input, accepted formats are PIL images, NumPy arrays, PyTorch tensors; Also accept list of supported formats. + height (`int`, *optional*, defaults to `None`): + The height in preprocessed image. If `None`, will use the `get_default_height_width()` to get default height. + width (`int`, *optional*`, defaults to `None`): + The width in preprocessed. If `None`, will use get_default_height_width()` to get the default width. 
+ resize_mode (`str`, *optional*, defaults to `default`): + The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit + within the specified width and height, and it may not maintaining the original aspect ratio. + If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image + within the dimensions, filling empty with data from image. + If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image + within the dimensions, cropping the excess. + Note that resize_mode `fill` and `crop` are only supported for PIL image input. + crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`): + The crop coordinates for each image in the batch. If `None`, will not crop the image. + """ + supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) + + # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image + if self.config.do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: + if isinstance(image, torch.Tensor): + # if image is a pytorch tensor could have 2 possible shapes: + # 1. batch x height x width: we should insert the channel dimension at position 1 + # 2. channel x height x width: we should insert batch dimension at position 0, + # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 + # for simplicity, we insert a dimension of size 1 at position 1 for both cases + image = image.unsqueeze(1) + else: + # if it is a numpy array, it could have 2 possible shapes: + # 1. batch x height x width: insert channel dimension on last position + # 2. height x width x channel: insert batch dimension on first position + if image.shape[-1] == 1: + image = np.expand_dims(image, axis=0) + else: + image = np.expand_dims(image, axis=-1) + + if isinstance(image, supported_formats): + image = [image] + elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" + ) + + if isinstance(image[0], PIL.Image.Image): + if crops_coords is not None: + image = [i.crop(crops_coords) for i in image] + if self.config.do_resize: + height, width = self.get_default_height_width(image[0], height, width) + image = [self.resize(i, height, width, resize_mode=resize_mode) for i in image] + if self.config.do_convert_rgb: + image = [self.convert_to_rgb(i) for i in image] + elif self.config.do_convert_grayscale: + image = [self.convert_to_grayscale(i) for i in image] + image = self.pil_to_numpy(image) # to np + image = self.numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + + image = self.numpy_to_pt(image) + + height, width = self.get_default_height_width(image, height, width) + if self.config.do_resize: + image = self.resize(image, height, width) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + if self.config.do_convert_grayscale and image.ndim == 3: + image = image.unsqueeze(1) + + channel = image.shape[1] + # don't need any preprocess if the image is latents + if channel == 4: + return image + + height, width = self.get_default_height_width(image, height, width) + if self.config.do_resize: + image = self.resize(image, height, width) + + # expected range [0,1], normalize to [-1,1] + do_normalize = self.config.do_normalize + if do_normalize and image.min() < 0: + warnings.warn( + "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " + f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{image.min()},{image.max()}]", + FutureWarning, + ) + do_normalize = False + + if do_normalize: + image = self.normalize(image) + + if self.config.do_binarize: + image = self.binarize(image) + + return image + + def postprocess( + self, + image: torch.FloatTensor, + output_type: str = "pil", + do_denormalize: Optional[List[bool]] = None, + ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + """ + Postprocess the image output from tensor to `output_type`. + + Args: + image (`torch.FloatTensor`): + The image input, should be a pytorch tensor with shape `B x C x H x W`. + output_type (`str`, *optional*, defaults to `pil`): + The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. + do_denormalize (`List[bool]`, *optional*, defaults to `None`): + Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the + `VaeImageProcessor` config. + + Returns: + `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + The postprocessed image. + """ + if not isinstance(image, torch.Tensor): + raise ValueError( + f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor" + ) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. 
Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if output_type == "latent": + return image + + if do_denormalize is None: + do_denormalize = [self.config.do_normalize] * image.shape[0] + + image = torch.stack( + [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])] + ) + + if output_type == "pt": + return image + + image = self.pt_to_numpy(image) + + if output_type == "np": + return image + + if output_type == "pil": + return self.numpy_to_pil(image) + + def apply_overlay( + self, + mask: PIL.Image.Image, + init_image: PIL.Image.Image, + image: PIL.Image.Image, + crop_coords: Optional[Tuple[int, int, int, int]] = None, + ) -> PIL.Image.Image: + """ + overlay the inpaint output to the original image + """ + + width, height = image.width, image.height + + init_image = self.resize(init_image, width=width, height=height) + mask = self.resize(mask, width=width, height=height) + + init_image_masked = PIL.Image.new("RGBa", (width, height)) + init_image_masked.paste(init_image.convert("RGBA").convert("RGBa"), mask=ImageOps.invert(mask.convert("L"))) + init_image_masked = init_image_masked.convert("RGBA") + + if crop_coords is not None: + x, y, x2, y2 = crop_coords + w = x2 - x + h = y2 - y + base_image = PIL.Image.new("RGBA", (width, height)) + image = self.resize(image, height=h, width=w, resize_mode="crop") + base_image.paste(image, (x, y)) + image = base_image.convert("RGB") + + image = image.convert("RGBA") + image.alpha_composite(init_image_masked) + image = image.convert("RGB") + + return image + + +class VaeImageProcessorLDM3D(VaeImageProcessor): + """ + Image processor for VAE LDM3D. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. + vae_scale_factor (`int`, *optional*, defaults to `8`): + VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor. + resample (`str`, *optional*, defaults to `lanczos`): + Resampling filter to use when resizing the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image to [-1,1]. + """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = True, + ): + super().__init__() + + @staticmethod + def numpy_to_pil(images: np.ndarray) -> List[PIL.Image.Image]: + """ + Convert a NumPy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image[:, :, :3]) for image in images] + + return pil_images + + @staticmethod + def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: + """ + Convert a PIL image or a list of PIL images to NumPy arrays. 
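+ Depth values are assumed to be stored as 16-bit integers and are scaled to the [0, 1] range by dividing by 2**16 - 1.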
+ """ + if not isinstance(images, list): + images = [images] + + images = [np.array(image).astype(np.float32) / (2**16 - 1) for image in images] + images = np.stack(images, axis=0) + return images + + @staticmethod + def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]: + """ + Args: + image: RGB-like depth image + + Returns: depth map + + """ + return image[:, :, 1] * 2**8 + image[:, :, 2] + + def numpy_to_depth(self, images: np.ndarray) -> List[PIL.Image.Image]: + """ + Convert a NumPy depth image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images_depth = images[:, :, :, 3:] + if images.shape[-1] == 6: + images_depth = (images_depth * 255).round().astype("uint8") + pil_images = [ + Image.fromarray(self.rgblike_to_depthmap(image_depth), mode="I;16") for image_depth in images_depth + ] + elif images.shape[-1] == 4: + images_depth = (images_depth * 65535.0).astype(np.uint16) + pil_images = [Image.fromarray(image_depth, mode="I;16") for image_depth in images_depth] + else: + raise Exception("Not supported") + + return pil_images + + def postprocess( + self, + image: torch.FloatTensor, + output_type: str = "pil", + do_denormalize: Optional[List[bool]] = None, + ) -> Union[PIL.Image.Image, np.ndarray, torch.FloatTensor]: + """ + Postprocess the image output from tensor to `output_type`. + + Args: + image (`torch.FloatTensor`): + The image input, should be a pytorch tensor with shape `B x C x H x W`. + output_type (`str`, *optional*, defaults to `pil`): + The output type of the image, can be one of `pil`, `np`, `pt`, `latent`. + do_denormalize (`List[bool]`, *optional*, defaults to `None`): + Whether to denormalize the image to [0,1]. If `None`, will use the value of `do_normalize` in the + `VaeImageProcessor` config. + + Returns: + `PIL.Image.Image`, `np.ndarray` or `torch.FloatTensor`: + The postprocessed image. + """ + if not isinstance(image, torch.Tensor): + raise ValueError( + f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor" + ) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if do_denormalize is None: + do_denormalize = [self.config.do_normalize] * image.shape[0] + + image = torch.stack( + [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])] + ) + + image = self.pt_to_numpy(image) + + if output_type == "np": + if image.shape[-1] == 6: + image_depth = np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0) + else: + image_depth = image[:, :, :, 3:] + return image[:, :, :, :3], image_depth + + if output_type == "pil": + return self.numpy_to_pil(image), self.numpy_to_depth(image) + else: + raise Exception(f"This type {output_type} is not supported") + + def preprocess( + self, + rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + height: Optional[int] = None, + width: Optional[int] = None, + target_res: Optional[int] = None, + ) -> torch.Tensor: + """ + Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. 
+ """ + supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) + + # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image + if self.config.do_convert_grayscale and isinstance(rgb, (torch.Tensor, np.ndarray)) and rgb.ndim == 3: + raise Exception("This is not yet supported") + + if isinstance(rgb, supported_formats): + rgb = [rgb] + depth = [depth] + elif not (isinstance(rgb, list) and all(isinstance(i, supported_formats) for i in rgb)): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in rgb]}. Currently, we only support {', '.join(supported_formats)}" + ) + + if isinstance(rgb[0], PIL.Image.Image): + if self.config.do_convert_rgb: + raise Exception("This is not yet supported") + # rgb = [self.convert_to_rgb(i) for i in rgb] + # depth = [self.convert_to_depth(i) for i in depth] #TODO define convert_to_depth + if self.config.do_resize or target_res: + height, width = self.get_default_height_width(rgb[0], height, width) if not target_res else target_res + rgb = [self.resize(i, height, width) for i in rgb] + depth = [self.resize(i, height, width) for i in depth] + rgb = self.pil_to_numpy(rgb) # to np + rgb = self.numpy_to_pt(rgb) # to pt + + depth = self.depth_pil_to_numpy(depth) # to np + depth = self.numpy_to_pt(depth) # to pt + + elif isinstance(rgb[0], np.ndarray): + rgb = np.concatenate(rgb, axis=0) if rgb[0].ndim == 4 else np.stack(rgb, axis=0) + rgb = self.numpy_to_pt(rgb) + height, width = self.get_default_height_width(rgb, height, width) + if self.config.do_resize: + rgb = self.resize(rgb, height, width) + + depth = np.concatenate(depth, axis=0) if rgb[0].ndim == 4 else np.stack(depth, axis=0) + depth = self.numpy_to_pt(depth) + height, width = self.get_default_height_width(depth, height, width) + if self.config.do_resize: + depth = self.resize(depth, height, width) + + elif isinstance(rgb[0], torch.Tensor): + raise Exception("This is not yet supported") + # rgb = torch.cat(rgb, axis=0) if rgb[0].ndim == 4 else torch.stack(rgb, axis=0) + + # if self.config.do_convert_grayscale and rgb.ndim == 3: + # rgb = rgb.unsqueeze(1) + + # channel = rgb.shape[1] + + # height, width = self.get_default_height_width(rgb, height, width) + # if self.config.do_resize: + # rgb = self.resize(rgb, height, width) + + # depth = torch.cat(depth, axis=0) if depth[0].ndim == 4 else torch.stack(depth, axis=0) + + # if self.config.do_convert_grayscale and depth.ndim == 3: + # depth = depth.unsqueeze(1) + + # channel = depth.shape[1] + # # don't need any preprocess if the image is latents + # if depth == 4: + # return rgb, depth + + # height, width = self.get_default_height_width(depth, height, width) + # if self.config.do_resize: + # depth = self.resize(depth, height, width) + # expected range [0,1], normalize to [-1,1] + do_normalize = self.config.do_normalize + if rgb.min() < 0 and do_normalize: + warnings.warn( + "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " + f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{rgb.min()},{rgb.max()}]", + FutureWarning, + ) + do_normalize = False + + if do_normalize: + rgb = self.normalize(rgb) + depth = self.normalize(depth) + + if self.config.do_binarize: + rgb = self.binarize(rgb) + depth = self.binarize(depth) + + return rgb, depth + + +class IPAdapterMaskProcessor(VaeImageProcessor): + """ + Image processor for IP Adapter image masks. 
+ + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. + vae_scale_factor (`int`, *optional*, defaults to `8`): + VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor. + resample (`str`, *optional*, defaults to `lanczos`): + Resampling filter to use when resizing the image. + do_normalize (`bool`, *optional*, defaults to `False`): + Whether to normalize the image to [-1,1]. + do_binarize (`bool`, *optional*, defaults to `True`): + Whether to binarize the image to 0/1. + do_convert_grayscale (`bool`, *optional*, defaults to be `True`): + Whether to convert the images to grayscale format. + + """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = False, + do_binarize: bool = True, + do_convert_grayscale: bool = True, + ): + super().__init__( + do_resize=do_resize, + vae_scale_factor=vae_scale_factor, + resample=resample, + do_normalize=do_normalize, + do_binarize=do_binarize, + do_convert_grayscale=do_convert_grayscale, + ) + + @staticmethod + def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int): + """ + Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention. + If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued. + + Args: + mask (`torch.FloatTensor`): + The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`. + batch_size (`int`): + The batch size. + num_queries (`int`): + The number of queries. + value_embed_dim (`int`): + The dimensionality of the value embeddings. + + Returns: + `torch.FloatTensor`: + The downsampled mask tensor. + + """ + o_h = mask.shape[1] + o_w = mask.shape[2] + ratio = o_w / o_h + mask_h = int(math.sqrt(num_queries / ratio)) + mask_h = int(mask_h) + int((num_queries % int(mask_h)) != 0) + mask_w = num_queries // mask_h + + mask_downsample = F.interpolate(mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic").squeeze(0) + + # Repeat batch_size times + if mask_downsample.shape[0] < batch_size: + mask_downsample = mask_downsample.repeat(batch_size, 1, 1) + + mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1) + + downsampled_area = mask_h * mask_w + # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match + # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries + if downsampled_area < num_queries: + warnings.warn( + "The aspect ratio of the mask does not match the aspect ratio of the output image. " + "Please update your masks or adjust the output size for optimal performance.", + UserWarning, + ) + mask_downsample = F.pad(mask_downsample, (0, num_queries - mask_downsample.shape[1]), value=0.0) + # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries + if downsampled_area > num_queries: + warnings.warn( + "The aspect ratio of the mask does not match the aspect ratio of the output image. 
" + "Please update your masks or adjust the output size for optimal performance.", + UserWarning, + ) + mask_downsample = mask_downsample[:, :num_queries] + + # Repeat last dimension to match SDPA output shape + mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).repeat( + 1, 1, value_embed_dim + ) + + return mask_downsample diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/__init__.py new file mode 100644 index 000000000..4da047435 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/__init__.py @@ -0,0 +1,88 @@ +from typing import TYPE_CHECKING + +from ..utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, deprecate +from ..utils.import_utils import is_peft_available, is_torch_available, is_transformers_available + + +def text_encoder_lora_state_dict(text_encoder): + deprecate( + "text_encoder_load_state_dict in `models`", + "0.27.0", + "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.", + ) + state_dict = {} + + for name, module in text_encoder_attn_modules(text_encoder): + for k, v in module.q_proj.lora_linear_layer.state_dict().items(): + state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v + + for k, v in module.k_proj.lora_linear_layer.state_dict().items(): + state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v + + for k, v in module.v_proj.lora_linear_layer.state_dict().items(): + state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v + + for k, v in module.out_proj.lora_linear_layer.state_dict().items(): + state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v + + return state_dict + + +if is_transformers_available(): + + def text_encoder_attn_modules(text_encoder): + deprecate( + "text_encoder_attn_modules in `models`", + "0.27.0", + "`text_encoder_lora_state_dict` is deprecated and will be removed in 0.27.0. Make sure to retrieve the weights using `get_peft_model`. 
See https://huggingface.co/docs/peft/v0.6.2/en/quicktour#peftmodel for more information.", + ) + from transformers import CLIPTextModel, CLIPTextModelWithProjection + + attn_modules = [] + + if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): + for i, layer in enumerate(text_encoder.text_model.encoder.layers): + name = f"text_model.encoder.layers.{i}.self_attn" + mod = layer.self_attn + attn_modules.append((name, mod)) + else: + raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}") + + return attn_modules + + +_import_structure = {} + +if is_torch_available(): + _import_structure["autoencoder"] = ["FromOriginalVAEMixin"] + + _import_structure["controlnet"] = ["FromOriginalControlNetMixin"] + _import_structure["unet"] = ["UNet2DConditionLoadersMixin"] + _import_structure["utils"] = ["AttnProcsLayers"] + if is_transformers_available(): + _import_structure["single_file"] = ["FromSingleFileMixin"] + _import_structure["lora"] = ["LoraLoaderMixin", "StableDiffusionXLLoraLoaderMixin"] + _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"] + _import_structure["ip_adapter"] = ["IPAdapterMixin"] + +_import_structure["peft"] = ["PeftAdapterMixin"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + if is_torch_available(): + from .autoencoder import FromOriginalVAEMixin + from .controlnet import FromOriginalControlNetMixin + from .unet import UNet2DConditionLoadersMixin + from .utils import AttnProcsLayers + + if is_transformers_available(): + from .ip_adapter import IPAdapterMixin + from .lora import LoraLoaderMixin, StableDiffusionXLLoraLoaderMixin + from .single_file import FromSingleFileMixin + from .textual_inversion import TextualInversionLoaderMixin + + from .peft import PeftAdapterMixin +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/autoencoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/autoencoder.py new file mode 100644 index 000000000..b91d27f7d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/autoencoder.py @@ -0,0 +1,146 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from huggingface_hub.utils import validate_hf_hub_args + +from .single_file_utils import ( + create_diffusers_vae_model_from_ldm, + fetch_ldm_config_and_checkpoint, +) + + +class FromOriginalVAEMixin: + """ + Load pretrained AutoencoderKL weights saved in the `.ckpt` or `.safetensors` format into a [`AutoencoderKL`]. + """ + + @classmethod + @validate_hf_hub_args + def from_single_file(cls, pretrained_model_link_or_path, **kwargs): + r""" + Instantiate a [`AutoencoderKL`] from pretrained ControlNet weights saved in the original `.ckpt` or + `.safetensors` format. 
The pipeline is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + - A link to the `.ckpt` file (for example + `"https://huggingface.co//blob/main/.ckpt"`) on the Hub. + - A path to a *file* containing all pipeline weights. + config_file (`str`, *optional*): + Filepath to the configuration YAML file associated with the model. If not provided it will default to: + https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to True, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + image_size (`int`, *optional*, defaults to 512): + The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable + Diffusion v2 base model. Use 768 for Stable Diffusion v2. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z + = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution + Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (for example the pipeline components of the + specific pipeline class). The overwritten components are directly passed to the pipelines `__init__` + method. See example below for more information. + + + + Make sure to pass both `image_size` and `scaling_factor` to `from_single_file()` if you're loading + a VAE from SDXL or a Stable Diffusion v2 model or higher. 
+ + + + Examples: + + ```py + from diffusers import AutoencoderKL + + url = "https://huggingface.co/stabilityai/sd-vae-ft-mse-original/blob/main/vae-ft-mse-840000-ema-pruned.safetensors" # can also be local file + model = AutoencoderKL.from_single_file(url) + ``` + """ + + original_config_file = kwargs.pop("original_config_file", None) + config_file = kwargs.pop("config_file", None) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + cache_dir = kwargs.pop("cache_dir", None) + local_files_only = kwargs.pop("local_files_only", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + + class_name = cls.__name__ + + if (config_file is not None) and (original_config_file is not None): + raise ValueError( + "You cannot pass both `config_file` and `original_config_file` to `from_single_file`. Please use only one of these arguments." + ) + + original_config_file = original_config_file or config_file + original_config, checkpoint = fetch_ldm_config_and_checkpoint( + pretrained_model_link_or_path=pretrained_model_link_or_path, + class_name=class_name, + original_config_file=original_config_file, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + cache_dir=cache_dir, + ) + + image_size = kwargs.pop("image_size", None) + scaling_factor = kwargs.pop("scaling_factor", None) + component = create_diffusers_vae_model_from_ldm( + class_name, + original_config, + checkpoint, + image_size=image_size, + scaling_factor=scaling_factor, + torch_dtype=torch_dtype, + ) + vae = component["vae"] + if torch_dtype is not None: + vae = vae.to(torch_dtype) + + return vae diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/controlnet.py new file mode 100644 index 000000000..d323f60aa --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/controlnet.py @@ -0,0 +1,136 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from huggingface_hub.utils import validate_hf_hub_args + +from .single_file_utils import ( + create_diffusers_controlnet_model_from_ldm, + fetch_ldm_config_and_checkpoint, +) + + +class FromOriginalControlNetMixin: + """ + Load pretrained ControlNet weights saved in the `.ckpt` or `.safetensors` format into a [`ControlNetModel`]. + """ + + @classmethod + @validate_hf_hub_args + def from_single_file(cls, pretrained_model_link_or_path, **kwargs): + r""" + Instantiate a [`ControlNetModel`] from pretrained ControlNet weights saved in the original `.ckpt` or + `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default. 
+ + Parameters: + pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + - A link to the `.ckpt` file (for example + `"https://huggingface.co//blob/main/.ckpt"`) on the Hub. + - A path to a *file* containing all pipeline weights. + config_file (`str`, *optional*): + Filepath to the configuration YAML file associated with the model. If not provided it will default to: + https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to True, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + image_size (`int`, *optional*, defaults to 512): + The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable + Diffusion v2 base model. Use 768 for Stable Diffusion v2. + upcast_attention (`bool`, *optional*, defaults to `None`): + Whether the attention computation should always be upcasted. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (for example the pipeline components of the + specific pipeline class). The overwritten components are directly passed to the pipelines `__init__` + method. See example below for more information. 
+ + Examples: + + ```py + from diffusers import StableDiffusionControlNetPipeline, ControlNetModel + + url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" # can also be a local path + model = ControlNetModel.from_single_file(url) + + url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors" # can also be a local path + pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet) + ``` + """ + original_config_file = kwargs.pop("original_config_file", None) + config_file = kwargs.pop("config_file", None) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + cache_dir = kwargs.pop("cache_dir", None) + local_files_only = kwargs.pop("local_files_only", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + + class_name = cls.__name__ + if (config_file is not None) and (original_config_file is not None): + raise ValueError( + "You cannot pass both `config_file` and `original_config_file` to `from_single_file`. Please use only one of these arguments." + ) + + original_config_file = config_file or original_config_file + original_config, checkpoint = fetch_ldm_config_and_checkpoint( + pretrained_model_link_or_path=pretrained_model_link_or_path, + class_name=class_name, + original_config_file=original_config_file, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + cache_dir=cache_dir, + ) + + upcast_attention = kwargs.pop("upcast_attention", False) + image_size = kwargs.pop("image_size", None) + + component = create_diffusers_controlnet_model_from_ldm( + class_name, + original_config, + checkpoint, + upcast_attention=upcast_attention, + image_size=image_size, + torch_dtype=torch_dtype, + ) + controlnet = component["controlnet"] + if torch_dtype is not None: + controlnet = controlnet.to(torch_dtype) + + return controlnet diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/ip_adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/ip_adapter.py new file mode 100644 index 000000000..93959b9f0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/ip_adapter.py @@ -0,0 +1,281 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
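+# Minimal usage sketch for the `IPAdapterMixin` defined below (illustrative only: the hub
+# repository id, subfolder, and weight file name are examples, not part of this module):
+#
+#     import torch
+#     from diffusers import AutoPipelineForText2Image
+#
+#     pipe = AutoPipelineForText2Image.from_pretrained(
+#         "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+#     ).to("cuda")
+#     pipe.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+#     pipe.set_ip_adapter_scale(0.6)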
+ +from pathlib import Path +from typing import Dict, List, Optional, Union + +import torch +from huggingface_hub.utils import validate_hf_hub_args +from safetensors import safe_open + +from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT +from ..utils import ( + _get_model_file, + is_accelerate_available, + is_torch_version, + is_transformers_available, + logging, +) + + +if is_transformers_available(): + from transformers import ( + CLIPImageProcessor, + CLIPVisionModelWithProjection, + ) + + from ..models.attention_processor import ( + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + +logger = logging.get_logger(__name__) + + +class IPAdapterMixin: + """Mixin for handling IP Adapters.""" + + @validate_hf_hub_args + def load_ip_adapter( + self, + pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]], + subfolder: Union[str, List[str]], + weight_name: Union[str, List[str]], + image_encoder_folder: Optional[str] = "image_encoder", + **kwargs, + ): + """ + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `List[str]` or `os.PathLike` or `List[os.PathLike]` or `dict` or `List[dict]`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + subfolder (`str` or `List[str]`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + If a list is passed, it should have the same length as `weight_name`. + weight_name (`str` or `List[str]`): + The name of the weight file to load. If a list is passed, it should have the same length as + `weight_name`. + image_encoder_folder (`str`, *optional*, defaults to `image_encoder`): + The subfolder location of the image encoder within a larger model repository on the Hub or locally. + Pass `None` to not load the image encoder. If the image encoder is located in a folder inside `subfolder`, + you only need to pass the name of the folder that contains image encoder weights, e.g. `image_encoder_folder="image_encoder"`. + If the image encoder is located in a folder other than `subfolder`, you should pass the path to the folder that contains image encoder weights, + for example, `image_encoder_folder="different_subfolder/image_encoder"`. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. 
+ token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + """ + + # handle the list inputs for multiple IP Adapters + if not isinstance(weight_name, list): + weight_name = [weight_name] + + if not isinstance(pretrained_model_name_or_path_or_dict, list): + pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict] + if len(pretrained_model_name_or_path_or_dict) == 1: + pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name) + + if not isinstance(subfolder, list): + subfolder = [subfolder] + if len(subfolder) == 1: + subfolder = subfolder * len(weight_name) + + if len(weight_name) != len(pretrained_model_name_or_path_or_dict): + raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.") + + if len(weight_name) != len(subfolder): + raise ValueError("`weight_name` and `subfolder` must have the same length.") + + # Load the main state dict first. + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + + if low_cpu_mem_usage and not is_accelerate_available(): + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." 
+ ) + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + state_dicts = [] + for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip( + pretrained_model_name_or_path_or_dict, weight_name, subfolder + ): + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + if weight_name.endswith(".safetensors"): + state_dict = {"image_proj": {}, "ip_adapter": {}} + with safe_open(model_file, framework="pt", device="cpu") as f: + for key in f.keys(): + if key.startswith("image_proj."): + state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key) + elif key.startswith("ip_adapter."): + state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key) + else: + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + + keys = list(state_dict.keys()) + if keys != ["image_proj", "ip_adapter"]: + raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.") + + state_dicts.append(state_dict) + + # load CLIP image encoder here if it has not been registered to the pipeline yet + if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None: + if image_encoder_folder is not None: + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}") + if image_encoder_folder.count("/") == 0: + image_encoder_subfolder = Path(subfolder, image_encoder_folder).as_posix() + else: + image_encoder_subfolder = Path(image_encoder_folder).as_posix() + + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + pretrained_model_name_or_path_or_dict, + subfolder=image_encoder_subfolder, + low_cpu_mem_usage=low_cpu_mem_usage, + ).to(self.device, dtype=self.dtype) + self.register_modules(image_encoder=image_encoder) + else: + raise ValueError( + "`image_encoder` cannot be loaded because `pretrained_model_name_or_path_or_dict` is a state dict." + ) + else: + logger.warning( + "image_encoder is not loaded since `image_encoder_folder=None` passed. You will not be able to use `ip_adapter_image` when calling the pipeline with IP-Adapter." + "Use `ip_adapter_image_embeds` to pass pre-generated image embedding instead." + ) + + # create feature extractor if it has not been registered to the pipeline yet + if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None: + feature_extractor = CLIPImageProcessor() + self.register_modules(feature_extractor=feature_extractor) + + # load ip-adapter into unet + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage) + + def set_ip_adapter_scale(self, scale): + """ + Sets the conditioning scale between text and image. 
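+ If a single value is passed, it is applied to every loaded IP-Adapter; if a list is passed, its length must match the number of loaded IP-Adapters.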
+ + Example: + + ```py + pipeline.set_ip_adapter_scale(0.5) + ``` + """ + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + for attn_processor in unet.attn_processors.values(): + if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)): + if not isinstance(scale, list): + scale = [scale] * len(attn_processor.scale) + if len(attn_processor.scale) != len(scale): + raise ValueError( + f"`scale` should be a list of same length as the number if ip-adapters " + f"Expected {len(attn_processor.scale)} but got {len(scale)}." + ) + attn_processor.scale = scale + + def unload_ip_adapter(self): + """ + Unloads the IP Adapter weights + + Examples: + + ```python + >>> # Assuming `pipeline` is already loaded with the IP Adapter weights. + >>> pipeline.unload_ip_adapter() + >>> ... + ``` + """ + # remove CLIP image encoder + if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is not None: + self.image_encoder = None + self.register_to_config(image_encoder=[None, None]) + + # remove feature extractor only when safety_checker is None as safety_checker uses + # the feature_extractor later + if not hasattr(self, "safety_checker"): + if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is not None: + self.feature_extractor = None + self.register_to_config(feature_extractor=[None, None]) + + # remove hidden encoder + self.unet.encoder_hid_proj = None + self.config.encoder_hid_dim_type = None + + # restore original Unet attention processors layers + self.unet.set_default_attn_processor() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora.py new file mode 100644 index 000000000..c6077f3a8 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora.py @@ -0,0 +1,1349 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import os +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +import safetensors +import torch +from huggingface_hub import model_info +from huggingface_hub.constants import HF_HUB_OFFLINE +from huggingface_hub.utils import validate_hf_hub_args +from packaging import version +from torch import nn + +from .. 
import __version__ +from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT +from ..utils import ( + USE_PEFT_BACKEND, + _get_model_file, + convert_state_dict_to_diffusers, + convert_state_dict_to_peft, + convert_unet_state_dict_to_peft, + delete_adapter_layers, + get_adapter_name, + get_peft_kwargs, + is_accelerate_available, + is_transformers_available, + logging, + recurse_remove_peft_layers, + scale_lora_layers, + set_adapter_layers, + set_weights_and_activate_adapters, +) +from .lora_conversion_utils import _convert_kohya_lora_to_diffusers, _maybe_map_sgm_blocks_to_diffusers + + +if is_transformers_available(): + from transformers import PreTrainedModel + + from ..models.lora import text_encoder_attn_modules, text_encoder_mlp_modules + +if is_accelerate_available(): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + +logger = logging.get_logger(__name__) + +TEXT_ENCODER_NAME = "text_encoder" +UNET_NAME = "unet" +TRANSFORMER_NAME = "transformer" + +LORA_WEIGHT_NAME = "pytorch_lora_weights.bin" +LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors" + +LORA_DEPRECATION_MESSAGE = "You are using an old version of LoRA backend. This will be deprecated in the next releases in favor of PEFT make sure to install the latest PEFT and transformers packages in the future." + + +class LoraLoaderMixin: + r""" + Load LoRA layers into [`UNet2DConditionModel`] and + [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). + """ + + text_encoder_name = TEXT_ENCODER_NAME + unet_name = UNET_NAME + transformer_name = TRANSFORMER_NAME + num_fused_loras = 0 + + def load_lora_weights( + self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs + ): + """ + Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and + `self.text_encoder`. + + All kwargs are forwarded to `self.lora_state_dict`. + + See [`~loaders.LoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. + + See [`~loaders.LoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is loaded into + `self.unet`. + + See [`~loaders.LoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state dict is loaded + into `self.text_encoder`. + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + kwargs (`dict`, *optional*): + See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + # if a dict is passed, copy it instead of modifying it inplace + if isinstance(pretrained_model_name_or_path_or_dict, dict): + pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy() + + # First, ensure that the checkpoint is a compatible one and can be successfully loaded. 
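+ # `lora_state_dict` also converts non-diffusers (e.g. Kohya-style) checkpoints to the diffusers key layout and returns any network alphas alongside the state dict.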
+ state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs) + + is_correct_format = all("lora" in key for key in state_dict.keys()) + if not is_correct_format: + raise ValueError("Invalid LoRA checkpoint.") + + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + + self.load_lora_into_unet( + state_dict, + network_alphas=network_alphas, + unet=getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet, + low_cpu_mem_usage=low_cpu_mem_usage, + adapter_name=adapter_name, + _pipeline=self, + ) + self.load_lora_into_text_encoder( + state_dict, + network_alphas=network_alphas, + text_encoder=getattr(self, self.text_encoder_name) + if not hasattr(self, "text_encoder") + else self.text_encoder, + lora_scale=self.lora_scale, + low_cpu_mem_usage=low_cpu_mem_usage, + adapter_name=adapter_name, + _pipeline=self, + ) + + @classmethod + @validate_hf_hub_args + def lora_state_dict( + cls, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + **kwargs, + ): + r""" + Return state dict for lora weights and the network alphas. + + + + We support loading A1111 formatted LoRA checkpoints in a limited capacity. + + This function is experimental and might change in the future. + + + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. 
This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + + """ + # Load the main state dict first which has the LoRA layers for either of + # UNet and text encoder or both. + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + unet_config = kwargs.pop("unet_config", None) + use_safetensors = kwargs.pop("use_safetensors", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + model_file = None + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + # Let's first try to load .safetensors weights + if (use_safetensors and weight_name is None) or ( + weight_name is not None and weight_name.endswith(".safetensors") + ): + try: + # Here we're relaxing the loading check to enable more Inference API + # friendliness where sometimes, it's not at all possible to automatically + # determine `weight_name`. + if weight_name is None: + weight_name = cls._best_guess_weight_name( + pretrained_model_name_or_path_or_dict, + file_extension=".safetensors", + local_files_only=local_files_only, + ) + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name or LORA_WEIGHT_NAME_SAFE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = safetensors.torch.load_file(model_file, device="cpu") + except (IOError, safetensors.SafetensorError) as e: + if not allow_pickle: + raise e + # try loading non-safetensors weights + model_file = None + pass + + if model_file is None: + if weight_name is None: + weight_name = cls._best_guess_weight_name( + pretrained_model_name_or_path_or_dict, file_extension=".bin", local_files_only=local_files_only + ) + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name or LORA_WEIGHT_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + + network_alphas = None + # TODO: replace it with a method from `state_dict_utils` + if all( + ( + k.startswith("lora_te_") + or k.startswith("lora_unet_") + or k.startswith("lora_te1_") + or k.startswith("lora_te2_") + ) + for k in state_dict.keys() + ): + # Map SDXL blocks correctly. 
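+ # Keys with `lora_unet_` / `lora_te*_` prefixes indicate a Kohya-style checkpoint; it is converted to the diffusers key layout below.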
+ if unet_config is not None: + # use unet config to remap block numbers + state_dict = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config) + state_dict, network_alphas = _convert_kohya_lora_to_diffusers(state_dict) + + return state_dict, network_alphas + + @classmethod + def _best_guess_weight_name( + cls, pretrained_model_name_or_path_or_dict, file_extension=".safetensors", local_files_only=False + ): + if local_files_only or HF_HUB_OFFLINE: + raise ValueError("When using the offline mode, you must specify a `weight_name`.") + + targeted_files = [] + + if os.path.isfile(pretrained_model_name_or_path_or_dict): + return + elif os.path.isdir(pretrained_model_name_or_path_or_dict): + targeted_files = [ + f for f in os.listdir(pretrained_model_name_or_path_or_dict) if f.endswith(file_extension) + ] + else: + files_in_repo = model_info(pretrained_model_name_or_path_or_dict).siblings + targeted_files = [f.rfilename for f in files_in_repo if f.rfilename.endswith(file_extension)] + if len(targeted_files) == 0: + return + + # "scheduler" does not correspond to a LoRA checkpoint. + # "optimizer" does not correspond to a LoRA checkpoint + # only top-level checkpoints are considered and not the other ones, hence "checkpoint". + unallowed_substrings = {"scheduler", "optimizer", "checkpoint"} + targeted_files = list( + filter(lambda x: all(substring not in x for substring in unallowed_substrings), targeted_files) + ) + + if any(f.endswith(LORA_WEIGHT_NAME) for f in targeted_files): + targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME), targeted_files)) + elif any(f.endswith(LORA_WEIGHT_NAME_SAFE) for f in targeted_files): + targeted_files = list(filter(lambda x: x.endswith(LORA_WEIGHT_NAME_SAFE), targeted_files)) + + if len(targeted_files) > 1: + raise ValueError( + f"Provided path contains more than one weights file in the {file_extension} format. Either specify `weight_name` in `load_lora_weights` or make sure there's only one `.safetensors` or `.bin` file in {pretrained_model_name_or_path_or_dict}." + ) + weight_name = targeted_files[0] + return weight_name + + @classmethod + def _optionally_disable_offloading(cls, _pipeline): + """ + Optionally removes offloading in case the pipeline has been already sequentially offloaded to CPU. + + Args: + _pipeline (`DiffusionPipeline`): + The pipeline to disable offloading for. + + Returns: + tuple: + A tuple indicating if `is_model_cpu_offload` or `is_sequential_cpu_offload` is True. + """ + is_model_cpu_offload = False + is_sequential_cpu_offload = False + + if _pipeline is not None: + for _, component in _pipeline.components.items(): + if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"): + if not is_model_cpu_offload: + is_model_cpu_offload = isinstance(component._hf_hook, CpuOffload) + if not is_sequential_cpu_offload: + is_sequential_cpu_offload = isinstance(component._hf_hook, AlignDevicesHook) + + logger.info( + "Accelerate hooks detected. Since you have called `load_lora_weights()`, the previous hooks will be first removed. Then the LoRA parameters will be loaded and the hooks will be applied again." + ) + remove_hook_from_module(component, recurse=is_sequential_cpu_offload) + + return (is_model_cpu_offload, is_sequential_cpu_offload) + + @classmethod + def load_lora_into_unet( + cls, state_dict, network_alphas, unet, low_cpu_mem_usage=None, adapter_name=None, _pipeline=None + ): + """ + This will load the LoRA layers specified in `state_dict` into `unet`. 
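+ Requires the PEFT backend; the layers are injected into `unet` as a PEFT adapter.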
+ + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The keys can either be indexed directly + into the unet or prefixed with an additional `unet` which can be used to distinguish between text + encoder lora layers. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + unet (`UNet2DConditionModel`): + The UNet model to load the LoRA layers into. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict + + low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), + # then the `state_dict` keys should have `cls.unet_name` and/or `cls.text_encoder_name` as + # their prefixes. + keys = list(state_dict.keys()) + + if all(key.startswith(cls.unet_name) or key.startswith(cls.text_encoder_name) for key in keys): + # Load the layers corresponding to UNet. + logger.info(f"Loading {cls.unet_name}.") + + unet_keys = [k for k in keys if k.startswith(cls.unet_name)] + state_dict = {k.replace(f"{cls.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys} + + if network_alphas is not None: + alpha_keys = [k for k in network_alphas.keys() if k.startswith(cls.unet_name)] + network_alphas = { + k.replace(f"{cls.unet_name}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + + else: + # Otherwise, we're dealing with the old format. This means the `state_dict` should only + # contain the module names of the `unet` as its keys WITHOUT any prefix. + if not USE_PEFT_BACKEND: + warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet.{module_name}': params for module_name, params in old_state_dict.items()}`." + logger.warning(warn_message) + + if len(state_dict.keys()) > 0: + if adapter_name in getattr(unet, "peft_config", {}): + raise ValueError( + f"Adapter name {adapter_name} already in use in the Unet - please select a new adapter name." + ) + + state_dict = convert_unet_state_dict_to_peft(state_dict) + + if network_alphas is not None: + # The alphas state dict have the same structure as Unet, thus we convert it to peft format using + # `convert_unet_state_dict_to_peft` method. 
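A toy illustration (made-up keys) of the prefix handling described above: new-format checkpoints carry a `unet.` / `text_encoder.` prefix that is stripped before loading.

```python
import torch

# Made-up keys in the "new" serialization format (prefixes introduced in diffusers#2918).
state_dict = {
    "unet.down_blocks.0.attentions.0.to_q.lora_A.weight": torch.zeros(4, 320),
    "unet.down_blocks.0.attentions.0.to_q.lora_B.weight": torch.zeros(320, 4),
    "text_encoder.text_model.encoder.layers.0.self_attn.q_proj.lora_A.weight": torch.zeros(4, 768),
}

unet_keys = [k for k in state_dict if k.startswith("unet")]
unet_state_dict = {k.replace("unet.", ""): v for k, v in state_dict.items() if k in unet_keys}
print(list(unet_state_dict))  # keys with the "unet." prefix stripped
```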
+ network_alphas = convert_unet_state_dict_to_peft(network_alphas) + + rank = {} + for key, val in state_dict.items(): + if "lora_B" in key: + rank[key] = val.shape[1] + + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=True) + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(unet) + + # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks + # otherwise loading LoRA weights will lead to an error + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + inject_adapter_in_model(lora_config, unet, adapter_name=adapter_name) + incompatible_keys = set_peft_model_state_dict(unet, state_dict, adapter_name) + + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + + unet.load_attn_procs( + state_dict, network_alphas=network_alphas, low_cpu_mem_usage=low_cpu_mem_usage, _pipeline=_pipeline + ) + + @classmethod + def load_lora_into_text_encoder( + cls, + state_dict, + network_alphas, + text_encoder, + prefix=None, + lora_scale=1.0, + low_cpu_mem_usage=None, + adapter_name=None, + _pipeline=None, + ): + """ + This will load the LoRA layers specified in `state_dict` into `text_encoder` + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The key should be prefixed with an + additional `text_encoder` to distinguish between unet lora layers. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + text_encoder (`CLIPTextModel`): + The text encoder model to load the LoRA layers into. + prefix (`str`): + Expected prefix of the `text_encoder` in the `state_dict`. + lora_scale (`float`): + How much to scale the output of the lora linear layer before it is added with the output of the regular + lora layer. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + from peft import LoraConfig + + low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT + + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), + # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as + # their prefixes. + keys = list(state_dict.keys()) + prefix = cls.text_encoder_name if prefix is None else prefix + + # Safe prefix to check with. 
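As a small worked example of the rank extraction above (hypothetical key names), the rank is read off the second dimension of each `lora_B` matrix:

```python
import torch

# Hypothetical PEFT-format entries: lora_A is (r, in_features), lora_B is
# (out_features, r), so the rank is the second dimension of lora_B.
state_dict = {
    "down_blocks.0.attentions.0.to_q.lora_A.weight": torch.zeros(8, 320),
    "down_blocks.0.attentions.0.to_q.lora_B.weight": torch.zeros(320, 8),
}
rank = {k: v.shape[1] for k, v in state_dict.items() if "lora_B" in k}
print(rank)  # {'down_blocks.0.attentions.0.to_q.lora_B.weight': 8}
```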
+ if any(cls.text_encoder_name in key for key in keys): + # Load the layers corresponding to text encoder and make necessary adjustments. + text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix] + text_encoder_lora_state_dict = { + k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys + } + + if len(text_encoder_lora_state_dict) > 0: + logger.info(f"Loading {prefix}.") + rank = {} + text_encoder_lora_state_dict = convert_state_dict_to_diffusers(text_encoder_lora_state_dict) + + # convert state dict + text_encoder_lora_state_dict = convert_state_dict_to_peft(text_encoder_lora_state_dict) + + for name, _ in text_encoder_attn_modules(text_encoder): + rank_key = f"{name}.out_proj.lora_B.weight" + rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1] + + patch_mlp = any(".mlp." in key for key in text_encoder_lora_state_dict.keys()) + if patch_mlp: + for name, _ in text_encoder_mlp_modules(text_encoder): + rank_key_fc1 = f"{name}.fc1.lora_B.weight" + rank_key_fc2 = f"{name}.fc2.lora_B.weight" + + rank[rank_key_fc1] = text_encoder_lora_state_dict[rank_key_fc1].shape[1] + rank[rank_key_fc2] = text_encoder_lora_state_dict[rank_key_fc2].shape[1] + + if network_alphas is not None: + alpha_keys = [ + k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix + ] + network_alphas = { + k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, text_encoder_lora_state_dict, is_unet=False) + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(text_encoder) + + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + # inject LoRA layers and load the state dict + # in transformers we automatically check whether the adapter name is already in use or not + text_encoder.load_adapter( + adapter_name=adapter_name, + adapter_state_dict=text_encoder_lora_state_dict, + peft_config=lora_config, + ) + + # scale LoRA layers with `lora_scale` + scale_lora_layers(text_encoder, weight=lora_scale) + + text_encoder.to(device=text_encoder.device, dtype=text_encoder.dtype) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + + @classmethod + def load_lora_into_transformer( + cls, state_dict, network_alphas, transformer, low_cpu_mem_usage=None, adapter_name=None, _pipeline=None + ): + """ + This will load the LoRA layers specified in `state_dict` into `transformer`. + + Parameters: + state_dict (`dict`): + A standard state dict containing the lora layer parameters. The keys can either be indexed directly + into the unet or prefixed with an additional `unet` which can be used to distinguish between text + encoder lora layers. + network_alphas (`Dict[str, float]`): + See `LoRALinearLayer` for more details. + unet (`UNet2DConditionModel`): + The UNet model to load the LoRA layers into. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. 
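A short sketch of the prefix filtering used above, with made-up keys; the extra `k.split(".")[0] == prefix` check is what keeps `text_encoder_2.*` keys from slipping into the `text_encoder` bucket:

```python
# Made-up keys: both start with "text_encoder", but only the first belongs to it.
keys = [
    "text_encoder.text_model.encoder.layers.0.self_attn.out_proj.lora_B.weight",
    "text_encoder_2.text_model.encoder.layers.0.self_attn.out_proj.lora_B.weight",
]
prefix = "text_encoder"
text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
print(text_encoder_keys)  # only the "text_encoder." key survives
```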
If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + """ + from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict + + low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT + + keys = list(state_dict.keys()) + + transformer_keys = [k for k in keys if k.startswith(cls.transformer_name)] + state_dict = { + k.replace(f"{cls.transformer_name}.", ""): v for k, v in state_dict.items() if k in transformer_keys + } + + if network_alphas is not None: + alpha_keys = [k for k in network_alphas.keys() if k.startswith(cls.transformer_name)] + network_alphas = { + k.replace(f"{cls.transformer_name}.", ""): v for k, v in network_alphas.items() if k in alpha_keys + } + + if len(state_dict.keys()) > 0: + if adapter_name in getattr(transformer, "peft_config", {}): + raise ValueError( + f"Adapter name {adapter_name} already in use in the transformer - please select a new adapter name." + ) + + rank = {} + for key, val in state_dict.items(): + if "lora_B" in key: + rank[key] = val.shape[1] + + lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict) + lora_config = LoraConfig(**lora_config_kwargs) + + # adapter_name + if adapter_name is None: + adapter_name = get_adapter_name(transformer) + + # In case the pipeline has been already offloaded to CPU - temporarily remove the hooks + # otherwise loading LoRA weights will lead to an error + is_model_cpu_offload, is_sequential_cpu_offload = cls._optionally_disable_offloading(_pipeline) + + inject_adapter_in_model(lora_config, transformer, adapter_name=adapter_name) + incompatible_keys = set_peft_model_state_dict(transformer, state_dict, adapter_name) + + if incompatible_keys is not None: + # check only for unexpected keys + unexpected_keys = getattr(incompatible_keys, "unexpected_keys", None) + if unexpected_keys: + logger.warning( + f"Loading adapter weights from state_dict led to unexpected keys not found in the model: " + f" {unexpected_keys}. " + ) + + # Offload back. + if is_model_cpu_offload: + _pipeline.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + _pipeline.enable_sequential_cpu_offload() + # Unsafe code /> + + @property + def lora_scale(self) -> float: + # property function that returns the lora scale which can be set at run time by the pipeline. 
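A minimal sketch of the fallback behaviour of this property; how the scale actually gets set at run time depends on the pipeline (commonly per call, e.g. via `cross_attention_kwargs={"scale": ...}`), so treat that as an assumption here.

```python
class LoraScaleDemo:
    # Same fallback as the property above: 1.0 until a scale has been set.
    @property
    def lora_scale(self) -> float:
        return self._lora_scale if hasattr(self, "_lora_scale") else 1.0


demo = LoraScaleDemo()
print(demo.lora_scale)   # 1.0 (default)
demo._lora_scale = 0.7   # normally set by the pipeline, not by hand
print(demo.lora_scale)   # 0.7
```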
+ # if _lora_scale has not been set, return 1 + return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 + + def _remove_text_encoder_monkey_patch(self): + remove_method = recurse_remove_peft_layers + if hasattr(self, "text_encoder"): + remove_method(self.text_encoder) + # In case text encoder have no Lora attached + if getattr(self.text_encoder, "peft_config", None) is not None: + del self.text_encoder.peft_config + self.text_encoder._hf_peft_config_loaded = None + + if hasattr(self, "text_encoder_2"): + remove_method(self.text_encoder_2) + if getattr(self.text_encoder_2, "peft_config", None) is not None: + del self.text_encoder_2.peft_config + self.text_encoder_2._hf_peft_config_loaded = None + + @classmethod + def save_lora_weights( + cls, + save_directory: Union[str, os.PathLike], + unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_lora_layers: Dict[str, torch.nn.Module] = None, + transformer_lora_layers: Dict[str, torch.nn.Module] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + r""" + Save the LoRA parameters corresponding to the UNet and text encoder. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save LoRA parameters to. Will be created if it doesn't exist. + unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `unet`. + text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + """ + state_dict = {} + + def pack_weights(layers, prefix): + layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers + layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} + return layers_state_dict + + if not (unet_lora_layers or text_encoder_lora_layers or transformer_lora_layers): + raise ValueError( + "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`, or `transformer_lora_layers`." 
+ ) + + if unet_lora_layers: + state_dict.update(pack_weights(unet_lora_layers, cls.unet_name)) + + if text_encoder_lora_layers: + state_dict.update(pack_weights(text_encoder_lora_layers, cls.text_encoder_name)) + + if transformer_lora_layers: + state_dict.update(pack_weights(transformer_lora_layers, "transformer")) + + # Save the model + cls.write_lora_layers( + state_dict=state_dict, + save_directory=save_directory, + is_main_process=is_main_process, + weight_name=weight_name, + save_function=save_function, + safe_serialization=safe_serialization, + ) + + @staticmethod + def write_lora_layers( + state_dict: Dict[str, torch.Tensor], + save_directory: str, + is_main_process: bool, + weight_name: str, + save_function: Callable, + safe_serialization: bool, + ): + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + if save_function is None: + if safe_serialization: + + def save_function(weights, filename): + return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) + + else: + save_function = torch.save + + os.makedirs(save_directory, exist_ok=True) + + if weight_name is None: + if safe_serialization: + weight_name = LORA_WEIGHT_NAME_SAFE + else: + weight_name = LORA_WEIGHT_NAME + + save_path = Path(save_directory, weight_name).as_posix() + save_function(state_dict, save_path) + logger.info(f"Model weights saved in {save_path}") + + def unload_lora_weights(self): + """ + Unloads the LoRA parameters. + + Examples: + + ```python + >>> # Assuming `pipeline` is already loaded with the LoRA parameters. + >>> pipeline.unload_lora_weights() + >>> ... + ``` + """ + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + + if not USE_PEFT_BACKEND: + if version.parse(__version__) > version.parse("0.23"): + logger.warning( + "You are using `unload_lora_weights` to disable and unload lora weights. If you want to iteratively enable and disable adapter weights," + "you can use `pipe.enable_lora()` or `pipe.disable_lora()`. After installing the latest version of PEFT." + ) + + for _, module in unet.named_modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + else: + recurse_remove_peft_layers(unet) + if hasattr(unet, "peft_config"): + del unet.peft_config + + # Safe to call the following regardless of LoRA. + self._remove_text_encoder_monkey_patch() + + def fuse_lora( + self, + fuse_unet: bool = True, + fuse_text_encoder: bool = True, + lora_scale: float = 1.0, + safe_fusing: bool = False, + adapter_names: Optional[List[str]] = None, + ): + r""" + Fuses the LoRA parameters into the original parameters of the corresponding blocks. + + + + This is an experimental API. + + + + Args: + fuse_unet (`bool`, defaults to `True`): Whether to fuse the UNet LoRA parameters. + fuse_text_encoder (`bool`, defaults to `True`): + Whether to fuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the + LoRA parameters then it won't have any effect. + lora_scale (`float`, defaults to 1.0): + Controls how much to influence the outputs with the LoRA parameters. + safe_fusing (`bool`, defaults to `False`): + Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them. + adapter_names (`List[str]`, *optional*): + Adapter names to be used for fusing. If nothing is passed, all active adapters will be fused. 
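A hedged sketch of the save path (the layer dict below is a placeholder; in practice it comes from a LoRA training loop, for example via `peft.get_peft_model_state_dict`):

```python
import torch
from diffusers import StableDiffusionPipeline

# Placeholder LoRA layer dict standing in for trained parameters.
unet_lora_layers = {"down_blocks.0.attentions.0.to_q.lora_A.weight": torch.zeros(4, 320)}

# `save_lora_weights` is a classmethod, so it can be called on the pipeline class.
StableDiffusionPipeline.save_lora_weights(
    save_directory="./my-lora",
    unet_lora_layers=unet_lora_layers,
    safe_serialization=True,  # typically written as pytorch_lora_weights.safetensors
)
```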
+ + Example: + + ```py + from diffusers import DiffusionPipeline + import torch + + pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") + pipeline.fuse_lora(lora_scale=0.7) + ``` + """ + from peft.tuners.tuners_utils import BaseTunerLayer + + if fuse_unet or fuse_text_encoder: + self.num_fused_loras += 1 + if self.num_fused_loras > 1: + logger.warning( + "The current API is supported for operating with a single LoRA file. You are trying to load and fuse more than one LoRA which is not well-supported.", + ) + + if fuse_unet: + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + unet.fuse_lora(lora_scale, safe_fusing=safe_fusing, adapter_names=adapter_names) + + def fuse_text_encoder_lora(text_encoder, lora_scale=1.0, safe_fusing=False, adapter_names=None): + merge_kwargs = {"safe_merge": safe_fusing} + + for module in text_encoder.modules(): + if isinstance(module, BaseTunerLayer): + if lora_scale != 1.0: + module.scale_layer(lora_scale) + + # For BC with previous PEFT versions, we need to check the signature + # of the `merge` method to see if it supports the `adapter_names` argument. + supported_merge_kwargs = list(inspect.signature(module.merge).parameters) + if "adapter_names" in supported_merge_kwargs: + merge_kwargs["adapter_names"] = adapter_names + elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: + raise ValueError( + "The `adapter_names` argument is not supported with your PEFT version. " + "Please upgrade to the latest version of PEFT. `pip install -U peft`" + ) + + module.merge(**merge_kwargs) + + if fuse_text_encoder: + if hasattr(self, "text_encoder"): + fuse_text_encoder_lora(self.text_encoder, lora_scale, safe_fusing, adapter_names=adapter_names) + if hasattr(self, "text_encoder_2"): + fuse_text_encoder_lora(self.text_encoder_2, lora_scale, safe_fusing, adapter_names=adapter_names) + + def unfuse_lora(self, unfuse_unet: bool = True, unfuse_text_encoder: bool = True): + r""" + Reverses the effect of + [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.fuse_lora). + + + + This is an experimental API. + + + + Args: + unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters. + unfuse_text_encoder (`bool`, defaults to `True`): + Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the + LoRA parameters then it won't have any effect. 
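A possible fuse/unfuse round trip, reusing the repo from the example above (a sketch, not a guaranteed recipe):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors")

pipe.fuse_lora(lora_scale=0.7)  # merge the LoRA into the base weights
image = pipe("pixel art, a cute corgi").images[0]
pipe.unfuse_lora()              # restore the original, unfused weights
```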
+ """ + from peft.tuners.tuners_utils import BaseTunerLayer + + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + if unfuse_unet: + for module in unet.modules(): + if isinstance(module, BaseTunerLayer): + module.unmerge() + + def unfuse_text_encoder_lora(text_encoder): + for module in text_encoder.modules(): + if isinstance(module, BaseTunerLayer): + module.unmerge() + + if unfuse_text_encoder: + if hasattr(self, "text_encoder"): + unfuse_text_encoder_lora(self.text_encoder) + if hasattr(self, "text_encoder_2"): + unfuse_text_encoder_lora(self.text_encoder_2) + + self.num_fused_loras -= 1 + + def set_adapters_for_text_encoder( + self, + adapter_names: Union[List[str], str], + text_encoder: Optional["PreTrainedModel"] = None, # noqa: F821 + text_encoder_weights: List[float] = None, + ): + """ + Sets the adapter layers for the text encoder. + + Args: + adapter_names (`List[str]` or `str`): + The names of the adapters to use. + text_encoder (`torch.nn.Module`, *optional*): + The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder` + attribute. + text_encoder_weights (`List[float]`, *optional*): + The weights to use for the text encoder. If `None`, the weights are set to `1.0` for all the adapters. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + def process_weights(adapter_names, weights): + if weights is None: + weights = [1.0] * len(adapter_names) + elif isinstance(weights, float): + weights = [weights] + + if len(adapter_names) != len(weights): + raise ValueError( + f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}" + ) + return weights + + adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names + text_encoder_weights = process_weights(adapter_names, text_encoder_weights) + text_encoder = text_encoder or getattr(self, "text_encoder", None) + if text_encoder is None: + raise ValueError( + "The pipeline does not have a default `pipe.text_encoder` class. Please make sure to pass a `text_encoder` instead." + ) + set_weights_and_activate_adapters(text_encoder, adapter_names, text_encoder_weights) + + def disable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None): + """ + Disables the LoRA layers for the text encoder. + + Args: + text_encoder (`torch.nn.Module`, *optional*): + The text encoder module to disable the LoRA layers for. If `None`, it will try to get the + `text_encoder` attribute. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + text_encoder = text_encoder or getattr(self, "text_encoder", None) + if text_encoder is None: + raise ValueError("Text Encoder not found.") + set_adapter_layers(text_encoder, enabled=False) + + def enable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None): + """ + Enables the LoRA layers for the text encoder. + + Args: + text_encoder (`torch.nn.Module`, *optional*): + The text encoder module to enable the LoRA layers for. If `None`, it will try to get the `text_encoder` + attribute. 
+ """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + text_encoder = text_encoder or getattr(self, "text_encoder", None) + if text_encoder is None: + raise ValueError("Text Encoder not found.") + set_adapter_layers(self.text_encoder, enabled=True) + + def set_adapters( + self, + adapter_names: Union[List[str], str], + adapter_weights: Optional[List[float]] = None, + ): + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + # Handle the UNET + unet.set_adapters(adapter_names, adapter_weights) + + # Handle the Text Encoder + if hasattr(self, "text_encoder"): + self.set_adapters_for_text_encoder(adapter_names, self.text_encoder, adapter_weights) + if hasattr(self, "text_encoder_2"): + self.set_adapters_for_text_encoder(adapter_names, self.text_encoder_2, adapter_weights) + + def disable_lora(self): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + # Disable unet adapters + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + unet.disable_lora() + + # Disable text encoder adapters + if hasattr(self, "text_encoder"): + self.disable_lora_for_text_encoder(self.text_encoder) + if hasattr(self, "text_encoder_2"): + self.disable_lora_for_text_encoder(self.text_encoder_2) + + def enable_lora(self): + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + # Enable unet adapters + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + unet.enable_lora() + + # Enable text encoder adapters + if hasattr(self, "text_encoder"): + self.enable_lora_for_text_encoder(self.text_encoder) + if hasattr(self, "text_encoder_2"): + self.enable_lora_for_text_encoder(self.text_encoder_2) + + def delete_adapters(self, adapter_names: Union[List[str], str]): + """ + Args: + Deletes the LoRA layers of `adapter_name` for the unet and text-encoder(s). + adapter_names (`Union[List[str], str]`): + The names of the adapter to delete. Can be a single string or a list of strings + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + # Delete unet adapters + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + unet.delete_adapters(adapter_names) + + for adapter_name in adapter_names: + # Delete text encoder adapters + if hasattr(self, "text_encoder"): + delete_adapter_layers(self.text_encoder, adapter_name) + if hasattr(self, "text_encoder_2"): + delete_adapter_layers(self.text_encoder_2, adapter_name) + + def get_active_adapters(self) -> List[str]: + """ + Gets the list of the current active adapters. + + Example: + + ```python + from diffusers import DiffusionPipeline + + pipeline = DiffusionPipeline.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", + ).to("cuda") + pipeline.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy") + pipeline.get_active_adapters() + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError( + "PEFT backend is required for this method. 
Please install the latest version of PEFT `pip install -U peft`" + ) + + from peft.tuners.tuners_utils import BaseTunerLayer + + active_adapters = [] + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + for module in unet.modules(): + if isinstance(module, BaseTunerLayer): + active_adapters = module.active_adapters + break + + return active_adapters + + def get_list_adapters(self) -> Dict[str, List[str]]: + """ + Gets the current list of all available adapters in the pipeline. + """ + if not USE_PEFT_BACKEND: + raise ValueError( + "PEFT backend is required for this method. Please install the latest version of PEFT `pip install -U peft`" + ) + + set_adapters = {} + + if hasattr(self, "text_encoder") and hasattr(self.text_encoder, "peft_config"): + set_adapters["text_encoder"] = list(self.text_encoder.peft_config.keys()) + + if hasattr(self, "text_encoder_2") and hasattr(self.text_encoder_2, "peft_config"): + set_adapters["text_encoder_2"] = list(self.text_encoder_2.peft_config.keys()) + + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + if hasattr(self, self.unet_name) and hasattr(unet, "peft_config"): + set_adapters[self.unet_name] = list(self.unet.peft_config.keys()) + + return set_adapters + + def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, str, int]) -> None: + """ + Moves the LoRAs listed in `adapter_names` to a target device. Useful for offloading the LoRA to the CPU in case + you want to load multiple adapters and free some GPU memory. + + Args: + adapter_names (`List[str]`): + List of adapters to send device to. + device (`Union[torch.device, str, int]`): + Device to send the adapters to. Can be either a torch device, a str or an integer. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + from peft.tuners.tuners_utils import BaseTunerLayer + + # Handle the UNET + unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet + for unet_module in unet.modules(): + if isinstance(unet_module, BaseTunerLayer): + for adapter_name in adapter_names: + unet_module.lora_A[adapter_name].to(device) + unet_module.lora_B[adapter_name].to(device) + + # Handle the text encoder + modules_to_process = [] + if hasattr(self, "text_encoder"): + modules_to_process.append(self.text_encoder) + + if hasattr(self, "text_encoder_2"): + modules_to_process.append(self.text_encoder_2) + + for text_encoder in modules_to_process: + # loop over submodules + for text_encoder_module in text_encoder.modules(): + if isinstance(text_encoder_module, BaseTunerLayer): + for adapter_name in adapter_names: + text_encoder_module.lora_A[adapter_name].to(device) + text_encoder_module.lora_B[adapter_name].to(device) + + +class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin): + """This class overrides `LoraLoaderMixin` with LoRA loading/saving code that's specific to SDXL""" + + # Override to properly handle the loading and unloading of the additional text encoder. + def load_lora_weights( + self, + pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], + adapter_name: Optional[str] = None, + **kwargs, + ): + """ + Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and + `self.text_encoder`. + + All kwargs are forwarded to `self.lora_state_dict`. + + See [`~loaders.LoraLoaderMixin.lora_state_dict`] for more details on how the state dict is loaded. 
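Putting the adapter management methods together, a sketch of a multi-adapter workflow (repo names reused from the docstrings above):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")

pipe.set_adapters(["toy", "pixel"], adapter_weights=[0.8, 0.5])
print(pipe.get_active_adapters())  # ["toy", "pixel"]
print(pipe.get_list_adapters())    # {"unet": [...], "text_encoder": [...], ...}

# Move an adapter's LoRA weights to the CPU when it is not needed.
pipe.set_lora_device(["toy"], device="cpu")
```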
+ + See [`~loaders.LoraLoaderMixin.load_lora_into_unet`] for more details on how the state dict is loaded into + `self.unet`. + + See [`~loaders.LoraLoaderMixin.load_lora_into_text_encoder`] for more details on how the state dict is loaded + into `self.text_encoder`. + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + adapter_name (`str`, *optional*): + Adapter name to be used for referencing the loaded adapter model. If not specified, it will use + `default_{i}` where i is the total number of adapters being loaded. + kwargs (`dict`, *optional*): + See [`~loaders.LoraLoaderMixin.lora_state_dict`]. + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + # We could have accessed the unet config from `lora_state_dict()` too. We pass + # it here explicitly to be able to tell that it's coming from an SDXL + # pipeline. + + # if a dict is passed, copy it instead of modifying it inplace + if isinstance(pretrained_model_name_or_path_or_dict, dict): + pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict.copy() + + # First, ensure that the checkpoint is a compatible one and can be successfully loaded. + state_dict, network_alphas = self.lora_state_dict( + pretrained_model_name_or_path_or_dict, + unet_config=self.unet.config, + **kwargs, + ) + is_correct_format = all("lora" in key for key in state_dict.keys()) + if not is_correct_format: + raise ValueError("Invalid LoRA checkpoint.") + + self.load_lora_into_unet( + state_dict, network_alphas=network_alphas, unet=self.unet, adapter_name=adapter_name, _pipeline=self + ) + text_encoder_state_dict = {k: v for k, v in state_dict.items() if "text_encoder." in k} + if len(text_encoder_state_dict) > 0: + self.load_lora_into_text_encoder( + text_encoder_state_dict, + network_alphas=network_alphas, + text_encoder=self.text_encoder, + prefix="text_encoder", + lora_scale=self.lora_scale, + adapter_name=adapter_name, + _pipeline=self, + ) + + text_encoder_2_state_dict = {k: v for k, v in state_dict.items() if "text_encoder_2." in k} + if len(text_encoder_2_state_dict) > 0: + self.load_lora_into_text_encoder( + text_encoder_2_state_dict, + network_alphas=network_alphas, + text_encoder=self.text_encoder_2, + prefix="text_encoder_2", + lora_scale=self.lora_scale, + adapter_name=adapter_name, + _pipeline=self, + ) + + @classmethod + def save_lora_weights( + cls, + save_directory: Union[str, os.PathLike], + unet_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + text_encoder_2_lora_layers: Dict[str, Union[torch.nn.Module, torch.Tensor]] = None, + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + ): + r""" + Save the LoRA parameters corresponding to the UNet and text encoder. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save LoRA parameters to. Will be created if it doesn't exist. + unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `unet`. + text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`): + State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text + encoder LoRA state dict because it comes from 🤗 Transformers. 
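Since the loader also accepts an in-memory dict, a sketch of that path (the mixin copies the dict before the Kohya conversion and prefix routing, so the caller's copy is left untouched):

```python
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# A state dict that is already in memory can be passed instead of a repo id.
state_dict, _ = pipe.lora_state_dict("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors")
pipe.load_lora_weights(state_dict, adapter_name="pixel")
```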
+ is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + """ + state_dict = {} + + def pack_weights(layers, prefix): + layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers + layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()} + return layers_state_dict + + if not (unet_lora_layers or text_encoder_lora_layers or text_encoder_2_lora_layers): + raise ValueError( + "You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers` or `text_encoder_2_lora_layers`." + ) + + if unet_lora_layers: + state_dict.update(pack_weights(unet_lora_layers, "unet")) + + if text_encoder_lora_layers and text_encoder_2_lora_layers: + state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder")) + state_dict.update(pack_weights(text_encoder_2_lora_layers, "text_encoder_2")) + + cls.write_lora_layers( + state_dict=state_dict, + save_directory=save_directory, + is_main_process=is_main_process, + weight_name=weight_name, + save_function=save_function, + safe_serialization=safe_serialization, + ) + + def _remove_text_encoder_monkey_patch(self): + recurse_remove_peft_layers(self.text_encoder) + # TODO: @younesbelkada handle this in transformers side + if getattr(self.text_encoder, "peft_config", None) is not None: + del self.text_encoder.peft_config + self.text_encoder._hf_peft_config_loaded = None + + recurse_remove_peft_layers(self.text_encoder_2) + if getattr(self.text_encoder_2, "peft_config", None) is not None: + del self.text_encoder_2.peft_config + self.text_encoder_2._hf_peft_config_loaded = None diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora_conversion_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora_conversion_utils.py new file mode 100644 index 000000000..e968ef962 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora_conversion_utils.py @@ -0,0 +1,284 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +from ..utils import logging + + +logger = logging.get_logger(__name__) + + +def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", block_slice_pos=5): + # 1. 
get all state_dict_keys + all_keys = list(state_dict.keys()) + sgm_patterns = ["input_blocks", "middle_block", "output_blocks"] + + # 2. check if needs remapping, if not return original dict + is_in_sgm_format = False + for key in all_keys: + if any(p in key for p in sgm_patterns): + is_in_sgm_format = True + break + + if not is_in_sgm_format: + return state_dict + + # 3. Else remap from SGM patterns + new_state_dict = {} + inner_block_map = ["resnets", "attentions", "upsamplers"] + + # Retrieves # of down, mid and up blocks + input_block_ids, middle_block_ids, output_block_ids = set(), set(), set() + + for layer in all_keys: + if "text" in layer: + new_state_dict[layer] = state_dict.pop(layer) + else: + layer_id = int(layer.split(delimiter)[:block_slice_pos][-1]) + if sgm_patterns[0] in layer: + input_block_ids.add(layer_id) + elif sgm_patterns[1] in layer: + middle_block_ids.add(layer_id) + elif sgm_patterns[2] in layer: + output_block_ids.add(layer_id) + else: + raise ValueError(f"Checkpoint not supported because layer {layer} not supported.") + + input_blocks = { + layer_id: [key for key in state_dict if f"input_blocks{delimiter}{layer_id}" in key] + for layer_id in input_block_ids + } + middle_blocks = { + layer_id: [key for key in state_dict if f"middle_block{delimiter}{layer_id}" in key] + for layer_id in middle_block_ids + } + output_blocks = { + layer_id: [key for key in state_dict if f"output_blocks{delimiter}{layer_id}" in key] + for layer_id in output_block_ids + } + + # Rename keys accordingly + for i in input_block_ids: + block_id = (i - 1) // (unet_config.layers_per_block + 1) + layer_in_block_id = (i - 1) % (unet_config.layers_per_block + 1) + + for key in input_blocks[i]: + inner_block_id = int(key.split(delimiter)[block_slice_pos]) + inner_block_key = inner_block_map[inner_block_id] if "op" not in key else "downsamplers" + inner_layers_in_block = str(layer_in_block_id) if "op" not in key else "0" + new_key = delimiter.join( + key.split(delimiter)[: block_slice_pos - 1] + + [str(block_id), inner_block_key, inner_layers_in_block] + + key.split(delimiter)[block_slice_pos + 1 :] + ) + new_state_dict[new_key] = state_dict.pop(key) + + for i in middle_block_ids: + key_part = None + if i == 0: + key_part = [inner_block_map[0], "0"] + elif i == 1: + key_part = [inner_block_map[1], "0"] + elif i == 2: + key_part = [inner_block_map[0], "1"] + else: + raise ValueError(f"Invalid middle block id {i}.") + + for key in middle_blocks[i]: + new_key = delimiter.join( + key.split(delimiter)[: block_slice_pos - 1] + key_part + key.split(delimiter)[block_slice_pos:] + ) + new_state_dict[new_key] = state_dict.pop(key) + + for i in output_block_ids: + block_id = i // (unet_config.layers_per_block + 1) + layer_in_block_id = i % (unet_config.layers_per_block + 1) + + for key in output_blocks[i]: + inner_block_id = int(key.split(delimiter)[block_slice_pos]) + inner_block_key = inner_block_map[inner_block_id] + inner_layers_in_block = str(layer_in_block_id) if inner_block_id < 2 else "0" + new_key = delimiter.join( + key.split(delimiter)[: block_slice_pos - 1] + + [str(block_id), inner_block_key, inner_layers_in_block] + + key.split(delimiter)[block_slice_pos + 1 :] + ) + new_state_dict[new_key] = state_dict.pop(key) + + if len(state_dict) > 0: + raise ValueError("At this point all state dict entries have to be converted.") + + return new_state_dict + + +def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_name="text_encoder"): + unet_state_dict = {} + te_state_dict = {} 
+ te2_state_dict = {} + network_alphas = {} + + # every down weight has a corresponding up weight and potentially an alpha weight + lora_keys = [k for k in state_dict.keys() if k.endswith("lora_down.weight")] + for key in lora_keys: + lora_name = key.split(".")[0] + lora_name_up = lora_name + ".lora_up.weight" + lora_name_alpha = lora_name + ".alpha" + + if lora_name.startswith("lora_unet_"): + diffusers_name = key.replace("lora_unet_", "").replace("_", ".") + + if "input.blocks" in diffusers_name: + diffusers_name = diffusers_name.replace("input.blocks", "down_blocks") + else: + diffusers_name = diffusers_name.replace("down.blocks", "down_blocks") + + if "middle.block" in diffusers_name: + diffusers_name = diffusers_name.replace("middle.block", "mid_block") + else: + diffusers_name = diffusers_name.replace("mid.block", "mid_block") + if "output.blocks" in diffusers_name: + diffusers_name = diffusers_name.replace("output.blocks", "up_blocks") + else: + diffusers_name = diffusers_name.replace("up.blocks", "up_blocks") + + diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks") + diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora") + diffusers_name = diffusers_name.replace("proj.in", "proj_in") + diffusers_name = diffusers_name.replace("proj.out", "proj_out") + diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj") + + # SDXL specificity. + if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name: + pattern = r"\.\d+(?=\D*$)" + diffusers_name = re.sub(pattern, "", diffusers_name, count=1) + if ".in." in diffusers_name: + diffusers_name = diffusers_name.replace("in.layers.2", "conv1") + if ".out." in diffusers_name: + diffusers_name = diffusers_name.replace("out.layers.3", "conv2") + if "downsamplers" in diffusers_name or "upsamplers" in diffusers_name: + diffusers_name = diffusers_name.replace("op", "conv") + if "skip" in diffusers_name: + diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut") + + # LyCORIS specificity. + if "time.emb.proj" in diffusers_name: + diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj") + if "conv.shortcut" in diffusers_name: + diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut") + + # General coverage. 
+ if "transformer_blocks" in diffusers_name: + if "attn1" in diffusers_name or "attn2" in diffusers_name: + diffusers_name = diffusers_name.replace("attn1", "attn1.processor") + diffusers_name = diffusers_name.replace("attn2", "attn2.processor") + unet_state_dict[diffusers_name] = state_dict.pop(key) + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + elif "ff" in diffusers_name: + unet_state_dict[diffusers_name] = state_dict.pop(key) + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + elif any(key in diffusers_name for key in ("proj_in", "proj_out")): + unet_state_dict[diffusers_name] = state_dict.pop(key) + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + else: + unet_state_dict[diffusers_name] = state_dict.pop(key) + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + + elif lora_name.startswith("lora_te_"): + diffusers_name = key.replace("lora_te_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("text.model", "text_model") + diffusers_name = diffusers_name.replace("self.attn", "self_attn") + diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") + if "self_attn" in diffusers_name: + te_state_dict[diffusers_name] = state_dict.pop(key) + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + elif "mlp" in diffusers_name: + # Be aware that this is the new diffusers convention and the rest of the code might + # not utilize it yet. + diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.") + te_state_dict[diffusers_name] = state_dict.pop(key) + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + + # (sayakpaul): Duplicate code. Needs to be cleaned. + elif lora_name.startswith("lora_te1_"): + diffusers_name = key.replace("lora_te1_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("text.model", "text_model") + diffusers_name = diffusers_name.replace("self.attn", "self_attn") + diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") + if "self_attn" in diffusers_name: + te_state_dict[diffusers_name] = state_dict.pop(key) + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + elif "mlp" in diffusers_name: + # Be aware that this is the new diffusers convention and the rest of the code might + # not utilize it yet. + diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.") + te_state_dict[diffusers_name] = state_dict.pop(key) + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + + # (sayakpaul): Duplicate code. Needs to be cleaned. 
+ elif lora_name.startswith("lora_te2_"): + diffusers_name = key.replace("lora_te2_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("text.model", "text_model") + diffusers_name = diffusers_name.replace("self.attn", "self_attn") + diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") + if "self_attn" in diffusers_name: + te2_state_dict[diffusers_name] = state_dict.pop(key) + te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + elif "mlp" in diffusers_name: + # Be aware that this is the new diffusers convention and the rest of the code might + # not utilize it yet. + diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.") + te2_state_dict[diffusers_name] = state_dict.pop(key) + te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up) + + # Rename the alphas so that they can be mapped appropriately. + if lora_name_alpha in state_dict: + alpha = state_dict.pop(lora_name_alpha).item() + if lora_name_alpha.startswith("lora_unet_"): + prefix = "unet." + elif lora_name_alpha.startswith(("lora_te_", "lora_te1_")): + prefix = "text_encoder." + else: + prefix = "text_encoder_2." + new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha" + network_alphas.update({new_name: alpha}) + + if len(state_dict) > 0: + raise ValueError(f"The following keys have not been correctly be renamed: \n\n {', '.join(state_dict.keys())}") + + logger.info("Kohya-style checkpoint detected.") + unet_state_dict = {f"{unet_name}.{module_name}": params for module_name, params in unet_state_dict.items()} + te_state_dict = {f"{text_encoder_name}.{module_name}": params for module_name, params in te_state_dict.items()} + te2_state_dict = ( + {f"text_encoder_2.{module_name}": params for module_name, params in te2_state_dict.items()} + if len(te2_state_dict) > 0 + else None + ) + if te2_state_dict is not None: + te_state_dict.update(te2_state_dict) + + new_state_dict = {**unet_state_dict, **te_state_dict} + return new_state_dict, network_alphas diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/peft.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/peft.py new file mode 100644 index 000000000..01dbd3494 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/peft.py @@ -0,0 +1,186 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +from ..utils import MIN_PEFT_VERSION, check_peft_version, is_peft_available + + +class PeftAdapterMixin: + """ + A class containing all functions for loading and using adapters weights that are supported in PEFT library. 
For + more details about adapters and injecting them in a transformer-based model, check out the PEFT [documentation](https://huggingface.co/docs/peft/index). + + Install the latest version of PEFT, and use this mixin to: + + - Attach new adapters in the model. + - Attach multiple adapters and iteratively activate/deactivate them. + - Activate/deactivate all adapters from the model. + - Get a list of the active adapters. + """ + + _hf_peft_config_loaded = False + + def add_adapter(self, adapter_config, adapter_name: str = "default") -> None: + r""" + Adds a new adapter to the current model for training. If no adapter name is passed, a default name is assigned + to the adapter to follow the convention of the PEFT library. + + If you are not familiar with adapters and PEFT methods, we invite you to read more about them in the PEFT + [documentation](https://huggingface.co/docs/peft). + + Args: + adapter_config (`[~peft.PeftConfig]`): + The configuration of the adapter to add; supported adapters are non-prefix tuning and adaption prompt + methods. + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to add. If no name is passed, a default name is assigned to the adapter. + """ + check_peft_version(min_version=MIN_PEFT_VERSION) + + if not is_peft_available(): + raise ImportError("PEFT is not available. Please install PEFT to use this function: `pip install peft`.") + + from peft import PeftConfig, inject_adapter_in_model + + if not self._hf_peft_config_loaded: + self._hf_peft_config_loaded = True + elif adapter_name in self.peft_config: + raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.") + + if not isinstance(adapter_config, PeftConfig): + raise ValueError( + f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead." + ) + + # Unlike transformers, here we don't need to retrieve the name_or_path of the unet as the loading logic is + # handled by the `load_lora_layers` or `LoraLoaderMixin`. Therefore we set it to `None` here. + adapter_config.base_model_name_or_path = None + inject_adapter_in_model(adapter_config, self, adapter_name) + self.set_adapter(adapter_name) + + def set_adapter(self, adapter_name: Union[str, List[str]]) -> None: + """ + Sets a specific adapter by forcing the model to only use that adapter and disables the other adapters. + + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + [documentation](https://huggingface.co/docs/peft). + + Args: + adapter_name (Union[str, List[str]])): + The list of adapters to set or the adapter name in the case of a single adapter. + """ + check_peft_version(min_version=MIN_PEFT_VERSION) + + if not self._hf_peft_config_loaded: + raise ValueError("No adapter loaded. Please load an adapter first.") + + if isinstance(adapter_name, str): + adapter_name = [adapter_name] + + missing = set(adapter_name) - set(self.peft_config) + if len(missing) > 0: + raise ValueError( + f"Following adapter(s) could not be found: {', '.join(missing)}. Make sure you are passing the correct adapter name(s)." 
+ f" current loaded adapters are: {list(self.peft_config.keys())}" + ) + + from peft.tuners.tuners_utils import BaseTunerLayer + + _adapters_has_been_set = False + + for _, module in self.named_modules(): + if isinstance(module, BaseTunerLayer): + if hasattr(module, "set_adapter"): + module.set_adapter(adapter_name) + # Previous versions of PEFT does not support multi-adapter inference + elif not hasattr(module, "set_adapter") and len(adapter_name) != 1: + raise ValueError( + "You are trying to set multiple adapters and you have a PEFT version that does not support multi-adapter inference. Please upgrade to the latest version of PEFT." + " `pip install -U peft` or `pip install -U git+https://github.com/huggingface/peft.git`" + ) + else: + module.active_adapter = adapter_name + _adapters_has_been_set = True + + if not _adapters_has_been_set: + raise ValueError( + "Did not succeeded in setting the adapter. Please make sure you are using a model that supports adapters." + ) + + def disable_adapters(self) -> None: + r""" + Disable all adapters attached to the model and fallback to inference with the base model only. + + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + [documentation](https://huggingface.co/docs/peft). + """ + check_peft_version(min_version=MIN_PEFT_VERSION) + + if not self._hf_peft_config_loaded: + raise ValueError("No adapter loaded. Please load an adapter first.") + + from peft.tuners.tuners_utils import BaseTunerLayer + + for _, module in self.named_modules(): + if isinstance(module, BaseTunerLayer): + if hasattr(module, "enable_adapters"): + module.enable_adapters(enabled=False) + else: + # support for older PEFT versions + module.disable_adapters = True + + def enable_adapters(self) -> None: + """ + Enable adapters that are attached to the model. The model uses `self.active_adapters()` to retrieve the + list of adapters to enable. + + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + [documentation](https://huggingface.co/docs/peft). + """ + check_peft_version(min_version=MIN_PEFT_VERSION) + + if not self._hf_peft_config_loaded: + raise ValueError("No adapter loaded. Please load an adapter first.") + + from peft.tuners.tuners_utils import BaseTunerLayer + + for _, module in self.named_modules(): + if isinstance(module, BaseTunerLayer): + if hasattr(module, "enable_adapters"): + module.enable_adapters(enabled=True) + else: + # support for older PEFT versions + module.disable_adapters = False + + def active_adapters(self) -> List[str]: + """ + Gets the current list of active adapters of the model. + + If you are not familiar with adapters and PEFT methods, we invite you to read more about them on the PEFT + [documentation](https://huggingface.co/docs/peft). + """ + check_peft_version(min_version=MIN_PEFT_VERSION) + + if not is_peft_available(): + raise ImportError("PEFT is not available. Please install PEFT to use this function: `pip install peft`.") + + if not self._hf_peft_config_loaded: + raise ValueError("No adapter loaded. 
Please load an adapter first.") + + from peft.tuners.tuners_utils import BaseTunerLayer + + for _, module in self.named_modules(): + if isinstance(module, BaseTunerLayer): + return module.active_adapter diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file.py new file mode 100644 index 000000000..0d384b164 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file.py @@ -0,0 +1,318 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from huggingface_hub.utils import validate_hf_hub_args + +from ..utils import is_transformers_available, logging +from .single_file_utils import ( + create_diffusers_unet_model_from_ldm, + create_diffusers_vae_model_from_ldm, + create_scheduler_from_ldm, + create_text_encoders_and_tokenizers_from_ldm, + fetch_ldm_config_and_checkpoint, + infer_model_type, +) + + +logger = logging.get_logger(__name__) + +# Pipelines that support the SDXL Refiner checkpoint +REFINER_PIPELINES = [ + "StableDiffusionXLImg2ImgPipeline", + "StableDiffusionXLInpaintPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", +] + +if is_transformers_available(): + from transformers import AutoFeatureExtractor + + +def build_sub_model_components( + pipeline_components, + pipeline_class_name, + component_name, + original_config, + checkpoint, + local_files_only=False, + load_safety_checker=False, + model_type=None, + image_size=None, + torch_dtype=None, + **kwargs, +): + if component_name in pipeline_components: + return {} + + if component_name == "unet": + num_in_channels = kwargs.pop("num_in_channels", None) + upcast_attention = kwargs.pop("upcast_attention", None) + + unet_components = create_diffusers_unet_model_from_ldm( + pipeline_class_name, + original_config, + checkpoint, + num_in_channels=num_in_channels, + image_size=image_size, + torch_dtype=torch_dtype, + model_type=model_type, + upcast_attention=upcast_attention, + ) + return unet_components + + if component_name == "vae": + scaling_factor = kwargs.get("scaling_factor", None) + vae_components = create_diffusers_vae_model_from_ldm( + pipeline_class_name, + original_config, + checkpoint, + image_size, + scaling_factor, + torch_dtype, + model_type=model_type, + ) + return vae_components + + if component_name == "scheduler": + scheduler_type = kwargs.get("scheduler_type", "ddim") + prediction_type = kwargs.get("prediction_type", None) + + scheduler_components = create_scheduler_from_ldm( + pipeline_class_name, + original_config, + checkpoint, + scheduler_type=scheduler_type, + prediction_type=prediction_type, + model_type=model_type, + ) + + return scheduler_components + + if component_name in ["text_encoder", "text_encoder_2", "tokenizer", "tokenizer_2"]: + text_encoder_components = create_text_encoders_and_tokenizers_from_ldm( + original_config, + 
checkpoint, + model_type=model_type, + local_files_only=local_files_only, + torch_dtype=torch_dtype, + ) + return text_encoder_components + + if component_name == "safety_checker": + if load_safety_checker: + from ..pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + safety_checker = StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only, torch_dtype=torch_dtype + ) + else: + safety_checker = None + return {"safety_checker": safety_checker} + + if component_name == "feature_extractor": + if load_safety_checker: + feature_extractor = AutoFeatureExtractor.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only + ) + else: + feature_extractor = None + return {"feature_extractor": feature_extractor} + + return + + +def set_additional_components( + pipeline_class_name, + original_config, + checkpoint=None, + model_type=None, +): + components = {} + if pipeline_class_name in REFINER_PIPELINES: + model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type) + is_refiner = model_type == "SDXL-Refiner" + components.update( + { + "requires_aesthetics_score": is_refiner, + "force_zeros_for_empty_prompt": False if is_refiner else True, + } + ) + + return components + + +class FromSingleFileMixin: + """ + Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`]. + """ + + @classmethod + @validate_hf_hub_args + def from_single_file(cls, pretrained_model_link_or_path, **kwargs): + r""" + Instantiate a [`DiffusionPipeline`] from pretrained pipeline weights saved in the `.ckpt` or `.safetensors` + format. The pipeline is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + - A link to the `.ckpt` file (for example + `"https://huggingface.co//blob/main/.ckpt"`) on the Hub. + - A path to a *file* containing all pipeline weights. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. 
+ original_config_file (`str`, *optional*): + The path to the original config file that was used to train the model. If not provided, the config file + will be inferred from the checkpoint file. + model_type (`str`, *optional*): + The type of model to load. If not provided, the model type will be inferred from the checkpoint file. + image_size (`int`, *optional*): + The size of the image output. It's used to configure the `sample_size` parameter of the UNet and VAE model. + load_safety_checker (`bool`, *optional*, defaults to `False`): + Whether to load the safety checker model or not. By default, the safety checker is not loaded unless a `safety_checker` component is passed to the `kwargs`. + num_in_channels (`int`, *optional*): + Specify the number of input channels for the UNet model. Read more about how to configure UNet model with this parameter + [here](https://huggingface.co/docs/diffusers/training/adapt_a_model#configure-unet2dconditionmodel-parameters). + scaling_factor (`float`, *optional*): + The scaling factor to use for the VAE model. If not provided, it is inferred from the config file first. + If the scaling factor is not found in the config file, the default value 0.18215 is used. + scheduler_type (`str`, *optional*): + The type of scheduler to load. If not provided, the scheduler type will be inferred from the checkpoint file. + prediction_type (`str`, *optional*): + The type of prediction to load. If not provided, the prediction type will be inferred from the checkpoint file. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline + class). The overwritten components are passed directly to the pipelines `__init__` method. See example + below for more information. + + Examples: + + ```py + >>> from diffusers import StableDiffusionPipeline + + >>> # Download pipeline from huggingface.co and cache. + >>> pipeline = StableDiffusionPipeline.from_single_file( + ... "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" + ... ) + + >>> # Download pipeline from local file + >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt + >>> pipeline = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly") + + >>> # Enable float16 and move to GPU + >>> pipeline = StableDiffusionPipeline.from_single_file( + ... "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", + ... torch_dtype=torch.float16, + ... 
) + >>> pipeline.to("cuda") + ``` + """ + original_config_file = kwargs.pop("original_config_file", None) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + cache_dir = kwargs.pop("cache_dir", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + + class_name = cls.__name__ + + original_config, checkpoint = fetch_ldm_config_and_checkpoint( + pretrained_model_link_or_path=pretrained_model_link_or_path, + class_name=class_name, + original_config_file=original_config_file, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + cache_dir=cache_dir, + ) + + from ..pipelines.pipeline_utils import _get_pipeline_class + + pipeline_class = _get_pipeline_class( + cls, + config=None, + cache_dir=cache_dir, + ) + + expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} + + model_type = kwargs.pop("model_type", None) + image_size = kwargs.pop("image_size", None) + load_safety_checker = (kwargs.pop("load_safety_checker", False)) or ( + passed_class_obj.get("safety_checker", None) is not None + ) + + init_kwargs = {} + for name in expected_modules: + if name in passed_class_obj: + init_kwargs[name] = passed_class_obj[name] + else: + components = build_sub_model_components( + init_kwargs, + class_name, + name, + original_config, + checkpoint, + model_type=model_type, + image_size=image_size, + load_safety_checker=load_safety_checker, + local_files_only=local_files_only, + torch_dtype=torch_dtype, + **kwargs, + ) + if not components: + continue + init_kwargs.update(components) + + additional_components = set_additional_components( + class_name, original_config, checkpoint=checkpoint, model_type=model_type + ) + if additional_components: + init_kwargs.update(additional_components) + + init_kwargs.update(passed_pipe_kwargs) + pipe = pipeline_class(**init_kwargs) + + if torch_dtype is not None: + pipe.to(dtype=torch_dtype) + + return pipe diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file_utils.py new file mode 100644 index 000000000..cdaa0802a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file_utils.py @@ -0,0 +1,1617 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
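The `from_single_file` classmethod above routes any keyword argument it does not consume itself into `build_sub_model_components`, so per-component options travel through the same call. A minimal usage sketch (not part of the patch; the checkpoint URL is taken from the docstring example, and the explicitly passed values simply restate defaults already visible elsewhere in this patch):

```py
import torch

from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_single_file(
    "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt",
    torch_dtype=torch.float16,
    num_in_channels=4,          # forwarded to create_diffusers_unet_model_from_ldm
    scheduler_type="ddim",      # forwarded to create_scheduler_from_ldm
    scaling_factor=0.18215,     # forwarded to create_diffusers_vae_model_from_ldm
    load_safety_checker=False,  # leaves safety_checker / feature_extractor as None
)
pipe.to("cuda")
```

Components passed directly (for example `scheduler=...`) are collected into `passed_class_obj` before conversion starts, so they bypass `build_sub_model_components` entirely.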
+""" Conversion script for the Stable Diffusion checkpoints.""" + +import os +import re +from contextlib import nullcontext +from io import BytesIO +from urllib.parse import urlparse + +import requests +import yaml + +from ..models.modeling_utils import load_state_dict +from ..schedulers import ( + DDIMScheduler, + DDPMScheduler, + DPMSolverMultistepScheduler, + EDMDPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ..utils import is_accelerate_available, is_transformers_available, logging +from ..utils.hub_utils import _get_model_file + + +if is_transformers_available(): + from transformers import ( + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + ) + +if is_accelerate_available(): + from accelerate import init_empty_weights + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +CONFIG_URLS = { + "v1": "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml", + "v2": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml", + "xl": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml", + "xl_refiner": "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml", + "upscale": "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml", + "controlnet": "https://raw.githubusercontent.com/lllyasviel/ControlNet/main/models/cldm_v15.yaml", +} + +CHECKPOINT_KEY_NAMES = { + "v2": "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight", + "xl_base": "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias", + "xl_refiner": "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias", +} + +SCHEDULER_DEFAULT_CONFIG = { + "beta_schedule": "scaled_linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "interpolation_type": "linear", + "num_train_timesteps": 1000, + "prediction_type": "epsilon", + "sample_max_value": 1.0, + "set_alpha_to_one": False, + "skip_prk_steps": True, + "steps_offset": 1, + "timestep_spacing": "leading", +} + + +STABLE_CASCADE_DEFAULT_CONFIGS = { + "stage_c": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "prior"}, + "stage_c_lite": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "prior_lite"}, + "stage_b": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "decoder"}, + "stage_b_lite": {"pretrained_model_name_or_path": "diffusers/stable-cascade-configs", "subfolder": "decoder_lite"}, +} + + +def convert_stable_cascade_unet_single_file_to_diffusers(original_state_dict): + is_stage_c = "clip_txt_mapper.weight" in original_state_dict + + if is_stage_c: + state_dict = {} + for key in original_state_dict.keys(): + if key.endswith("in_proj_weight"): + weights = original_state_dict[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0] + state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1] + state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2] + elif key.endswith("in_proj_bias"): + weights = original_state_dict[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0] + 
state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1] + state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2] + elif key.endswith("out_proj.weight"): + weights = original_state_dict[key] + state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights + elif key.endswith("out_proj.bias"): + weights = original_state_dict[key] + state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights + else: + state_dict[key] = original_state_dict[key] + else: + state_dict = {} + for key in original_state_dict.keys(): + if key.endswith("in_proj_weight"): + weights = original_state_dict[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_weight", "to_q.weight")] = weights[0] + state_dict[key.replace("attn.in_proj_weight", "to_k.weight")] = weights[1] + state_dict[key.replace("attn.in_proj_weight", "to_v.weight")] = weights[2] + elif key.endswith("in_proj_bias"): + weights = original_state_dict[key].chunk(3, 0) + state_dict[key.replace("attn.in_proj_bias", "to_q.bias")] = weights[0] + state_dict[key.replace("attn.in_proj_bias", "to_k.bias")] = weights[1] + state_dict[key.replace("attn.in_proj_bias", "to_v.bias")] = weights[2] + elif key.endswith("out_proj.weight"): + weights = original_state_dict[key] + state_dict[key.replace("attn.out_proj.weight", "to_out.0.weight")] = weights + elif key.endswith("out_proj.bias"): + weights = original_state_dict[key] + state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights + # rename clip_mapper to clip_txt_pooled_mapper + elif key.endswith("clip_mapper.weight"): + weights = original_state_dict[key] + state_dict[key.replace("clip_mapper.weight", "clip_txt_pooled_mapper.weight")] = weights + elif key.endswith("clip_mapper.bias"): + weights = original_state_dict[key] + state_dict[key.replace("clip_mapper.bias", "clip_txt_pooled_mapper.bias")] = weights + else: + state_dict[key] = original_state_dict[key] + + return state_dict + + +def infer_stable_cascade_single_file_config(checkpoint): + is_stage_c = "clip_txt_mapper.weight" in checkpoint + is_stage_b = "down_blocks.1.0.channelwise.0.weight" in checkpoint + + if is_stage_c and (checkpoint["clip_txt_mapper.weight"].shape[0] == 1536): + config_type = "stage_c_lite" + elif is_stage_c and (checkpoint["clip_txt_mapper.weight"].shape[0] == 2048): + config_type = "stage_c" + elif is_stage_b and checkpoint["down_blocks.1.0.channelwise.0.weight"].shape[-1] == 576: + config_type = "stage_b_lite" + elif is_stage_b and checkpoint["down_blocks.1.0.channelwise.0.weight"].shape[-1] == 640: + config_type = "stage_b" + + return STABLE_CASCADE_DEFAULT_CONFIGS[config_type] + + +DIFFUSERS_TO_LDM_MAPPING = { + "unet": { + "layers": { + "time_embedding.linear_1.weight": "time_embed.0.weight", + "time_embedding.linear_1.bias": "time_embed.0.bias", + "time_embedding.linear_2.weight": "time_embed.2.weight", + "time_embedding.linear_2.bias": "time_embed.2.bias", + "conv_in.weight": "input_blocks.0.0.weight", + "conv_in.bias": "input_blocks.0.0.bias", + "conv_norm_out.weight": "out.0.weight", + "conv_norm_out.bias": "out.0.bias", + "conv_out.weight": "out.2.weight", + "conv_out.bias": "out.2.bias", + }, + "class_embed_type": { + "class_embedding.linear_1.weight": "label_emb.0.0.weight", + "class_embedding.linear_1.bias": "label_emb.0.0.bias", + "class_embedding.linear_2.weight": "label_emb.0.2.weight", + "class_embedding.linear_2.bias": "label_emb.0.2.bias", + }, + "addition_embed_type": { + "add_embedding.linear_1.weight": "label_emb.0.0.weight", + 
"add_embedding.linear_1.bias": "label_emb.0.0.bias", + "add_embedding.linear_2.weight": "label_emb.0.2.weight", + "add_embedding.linear_2.bias": "label_emb.0.2.bias", + }, + }, + "controlnet": { + "layers": { + "time_embedding.linear_1.weight": "time_embed.0.weight", + "time_embedding.linear_1.bias": "time_embed.0.bias", + "time_embedding.linear_2.weight": "time_embed.2.weight", + "time_embedding.linear_2.bias": "time_embed.2.bias", + "conv_in.weight": "input_blocks.0.0.weight", + "conv_in.bias": "input_blocks.0.0.bias", + "controlnet_cond_embedding.conv_in.weight": "input_hint_block.0.weight", + "controlnet_cond_embedding.conv_in.bias": "input_hint_block.0.bias", + "controlnet_cond_embedding.conv_out.weight": "input_hint_block.14.weight", + "controlnet_cond_embedding.conv_out.bias": "input_hint_block.14.bias", + }, + "class_embed_type": { + "class_embedding.linear_1.weight": "label_emb.0.0.weight", + "class_embedding.linear_1.bias": "label_emb.0.0.bias", + "class_embedding.linear_2.weight": "label_emb.0.2.weight", + "class_embedding.linear_2.bias": "label_emb.0.2.bias", + }, + "addition_embed_type": { + "add_embedding.linear_1.weight": "label_emb.0.0.weight", + "add_embedding.linear_1.bias": "label_emb.0.0.bias", + "add_embedding.linear_2.weight": "label_emb.0.2.weight", + "add_embedding.linear_2.bias": "label_emb.0.2.bias", + }, + }, + "vae": { + "encoder.conv_in.weight": "encoder.conv_in.weight", + "encoder.conv_in.bias": "encoder.conv_in.bias", + "encoder.conv_out.weight": "encoder.conv_out.weight", + "encoder.conv_out.bias": "encoder.conv_out.bias", + "encoder.conv_norm_out.weight": "encoder.norm_out.weight", + "encoder.conv_norm_out.bias": "encoder.norm_out.bias", + "decoder.conv_in.weight": "decoder.conv_in.weight", + "decoder.conv_in.bias": "decoder.conv_in.bias", + "decoder.conv_out.weight": "decoder.conv_out.weight", + "decoder.conv_out.bias": "decoder.conv_out.bias", + "decoder.conv_norm_out.weight": "decoder.norm_out.weight", + "decoder.conv_norm_out.bias": "decoder.norm_out.bias", + "quant_conv.weight": "quant_conv.weight", + "quant_conv.bias": "quant_conv.bias", + "post_quant_conv.weight": "post_quant_conv.weight", + "post_quant_conv.bias": "post_quant_conv.bias", + }, + "openclip": { + "layers": { + "text_model.embeddings.position_embedding.weight": "positional_embedding", + "text_model.embeddings.token_embedding.weight": "token_embedding.weight", + "text_model.final_layer_norm.weight": "ln_final.weight", + "text_model.final_layer_norm.bias": "ln_final.bias", + "text_projection.weight": "text_projection", + }, + "transformer": { + "text_model.encoder.layers.": "resblocks.", + "layer_norm1": "ln_1", + "layer_norm2": "ln_2", + ".fc1.": ".c_fc.", + ".fc2.": ".c_proj.", + ".self_attn": ".attn", + "transformer.text_model.final_layer_norm.": "ln_final.", + "transformer.text_model.embeddings.token_embedding.weight": "token_embedding.weight", + "transformer.text_model.embeddings.position_embedding.weight": "positional_embedding", + }, + }, +} + +LDM_VAE_KEY = "first_stage_model." +LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215 +PLAYGROUND_VAE_SCALING_FACTOR = 0.5 +LDM_UNET_KEY = "model.diffusion_model." +LDM_CONTROLNET_KEY = "control_model." 
+LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."] +LDM_OPEN_CLIP_TEXT_PROJECTION_DIM = 1024 + +SD_2_TEXT_ENCODER_KEYS_TO_IGNORE = [ + "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_bias", + "cond_stage_model.model.transformer.resblocks.23.attn.in_proj_weight", + "cond_stage_model.model.transformer.resblocks.23.attn.out_proj.bias", + "cond_stage_model.model.transformer.resblocks.23.attn.out_proj.weight", + "cond_stage_model.model.transformer.resblocks.23.ln_1.bias", + "cond_stage_model.model.transformer.resblocks.23.ln_1.weight", + "cond_stage_model.model.transformer.resblocks.23.ln_2.bias", + "cond_stage_model.model.transformer.resblocks.23.ln_2.weight", + "cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.bias", + "cond_stage_model.model.transformer.resblocks.23.mlp.c_fc.weight", + "cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.bias", + "cond_stage_model.model.transformer.resblocks.23.mlp.c_proj.weight", + "cond_stage_model.model.text_projection", +] + + +VALID_URL_PREFIXES = ["https://huggingface.co/", "huggingface.co/", "hf.co/", "https://hf.co/"] + + +def _extract_repo_id_and_weights_name(pretrained_model_name_or_path): + pattern = r"([^/]+)/([^/]+)/(?:blob/main/)?(.+)" + weights_name = None + repo_id = (None,) + for prefix in VALID_URL_PREFIXES: + pretrained_model_name_or_path = pretrained_model_name_or_path.replace(prefix, "") + match = re.match(pattern, pretrained_model_name_or_path) + if not match: + return repo_id, weights_name + + repo_id = f"{match.group(1)}/{match.group(2)}" + weights_name = match.group(3) + + return repo_id, weights_name + + +def fetch_ldm_config_and_checkpoint( + pretrained_model_link_or_path, + class_name, + original_config_file=None, + resume_download=False, + force_download=False, + proxies=None, + token=None, + cache_dir=None, + local_files_only=None, + revision=None, +): + checkpoint = load_single_file_model_checkpoint( + pretrained_model_link_or_path, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + revision=revision, + ) + original_config = fetch_original_config(class_name, checkpoint, original_config_file) + + return original_config, checkpoint + + +def load_single_file_model_checkpoint( + pretrained_model_link_or_path, + resume_download=False, + force_download=False, + proxies=None, + token=None, + cache_dir=None, + local_files_only=None, + revision=None, +): + if os.path.isfile(pretrained_model_link_or_path): + checkpoint = load_state_dict(pretrained_model_link_or_path) + else: + repo_id, weights_name = _extract_repo_id_and_weights_name(pretrained_model_link_or_path) + checkpoint_path = _get_model_file( + repo_id, + weights_name=weights_name, + force_download=force_download, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + ) + checkpoint = load_state_dict(checkpoint_path) + + # some checkpoints contain the model state dict under a "state_dict" key + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + return checkpoint + + +def infer_original_config_file(class_name, checkpoint): + if CHECKPOINT_KEY_NAMES["v2"] in checkpoint and checkpoint[CHECKPOINT_KEY_NAMES["v2"]].shape[-1] == 1024: + config_url = CONFIG_URLS["v2"] + + elif CHECKPOINT_KEY_NAMES["xl_base"] in checkpoint: + config_url = CONFIG_URLS["xl"] + + elif 
CHECKPOINT_KEY_NAMES["xl_refiner"] in checkpoint: + config_url = CONFIG_URLS["xl_refiner"] + + elif class_name == "StableDiffusionUpscalePipeline": + config_url = CONFIG_URLS["upscale"] + + elif class_name == "ControlNetModel": + config_url = CONFIG_URLS["controlnet"] + + else: + config_url = CONFIG_URLS["v1"] + + original_config_file = BytesIO(requests.get(config_url).content) + + return original_config_file + + +def fetch_original_config(pipeline_class_name, checkpoint, original_config_file=None): + def is_valid_url(url): + result = urlparse(url) + if result.scheme and result.netloc: + return True + + return False + + if original_config_file is None: + original_config_file = infer_original_config_file(pipeline_class_name, checkpoint) + + elif os.path.isfile(original_config_file): + with open(original_config_file, "r") as fp: + original_config_file = fp.read() + + elif is_valid_url(original_config_file): + original_config_file = BytesIO(requests.get(original_config_file).content) + + else: + raise ValueError("Invalid `original_config_file` provided. Please set it to a valid file path or URL.") + + original_config = yaml.safe_load(original_config_file) + + return original_config + + +def infer_model_type(original_config, checkpoint, model_type=None): + if model_type is not None: + return model_type + + has_cond_stage_config = ( + "cond_stage_config" in original_config["model"]["params"] + and original_config["model"]["params"]["cond_stage_config"] is not None + ) + has_network_config = ( + "network_config" in original_config["model"]["params"] + and original_config["model"]["params"]["network_config"] is not None + ) + + if has_cond_stage_config: + model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] + + elif has_network_config: + context_dim = original_config["model"]["params"]["network_config"]["params"]["context_dim"] + if "edm_mean" in checkpoint and "edm_std" in checkpoint: + model_type = "Playground" + elif context_dim == 2048: + model_type = "SDXL" + else: + model_type = "SDXL-Refiner" + else: + raise ValueError("Unable to infer model type from config") + + logger.debug(f"No `model_type` given, `model_type` inferred as: {model_type}") + + return model_type + + +def get_default_scheduler_config(): + return SCHEDULER_DEFAULT_CONFIG + + +def set_image_size(pipeline_class_name, original_config, checkpoint, image_size=None, model_type=None): + if image_size: + return image_size + + global_step = checkpoint["global_step"] if "global_step" in checkpoint else None + model_type = infer_model_type(original_config, checkpoint, model_type) + + if pipeline_class_name == "StableDiffusionUpscalePipeline": + image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"] + return image_size + + elif model_type in ["SDXL", "SDXL-Refiner", "Playground"]: + image_size = 1024 + return image_size + + elif ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): + # NOTE: For stable diffusion 2 base one has to pass `image_size==512` + # as it relies on a brittle global step parameter here + image_size = 512 if global_step == 875000 else 768 + return image_size + + else: + image_size = 512 + return image_size + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if 
".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_unet_diffusers_config(original_config, image_size: int): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + if ( + "unet_config" in original_config["model"]["params"] + and original_config["model"]["params"]["unet_config"] is not None + ): + unet_params = original_config["model"]["params"]["unet_config"]["params"] + else: + unet_params = original_config["model"]["params"]["network_config"]["params"] + + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + if unet_params["transformer_depth"] is not None: + transformer_layers_per_block = ( + unet_params["transformer_depth"] + if isinstance(unet_params["transformer_depth"], int) + else list(unet_params["transformer_depth"]) + ) + else: + transformer_layers_per_block = 1 + + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) + + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None + use_linear_projection = ( + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"] + head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])] + + class_embed_type = None + addition_embed_type = None + addition_time_embed_dim = None + projection_class_embeddings_input_dim = None + context_dim = None + + if unet_params["context_dim"] is not None: + context_dim = ( + unet_params["context_dim"] + if isinstance(unet_params["context_dim"], int) + else unet_params["context_dim"][0] + ) + + if "num_classes" in unet_params: + if unet_params["num_classes"] == "sequential": + if context_dim in [2048, 1280]: + # SDXL + addition_embed_type = "text_time" + addition_time_embed_dim = 256 + else: + class_embed_type = "projection" + assert "adm_in_channels" in unet_params + projection_class_embeddings_input_dim = unet_params["adm_in_channels"] + + config = { + "sample_size": image_size // vae_scale_factor, + "in_channels": unet_params["in_channels"], + "down_block_types": down_block_types, + "block_out_channels": block_out_channels, + "layers_per_block": unet_params["num_res_blocks"], + "cross_attention_dim": context_dim, + "attention_head_dim": head_dim, + "use_linear_projection": use_linear_projection, + "class_embed_type": class_embed_type, + "addition_embed_type": addition_embed_type, + "addition_time_embed_dim": addition_time_embed_dim, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "transformer_layers_per_block": 
transformer_layers_per_block, + } + + if "disable_self_attentions" in unet_params: + config["only_cross_attention"] = unet_params["disable_self_attentions"] + + if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int): + config["num_class_embeds"] = unet_params["num_classes"] + + config["out_channels"] = unet_params["out_channels"] + config["up_block_types"] = up_block_types + + return config + + +def create_controlnet_diffusers_config(original_config, image_size: int): + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] + diffusers_unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + + controlnet_config = { + "conditioning_channels": unet_params["hint_channels"], + "in_channels": diffusers_unet_config["in_channels"], + "down_block_types": diffusers_unet_config["down_block_types"], + "block_out_channels": diffusers_unet_config["block_out_channels"], + "layers_per_block": diffusers_unet_config["layers_per_block"], + "cross_attention_dim": diffusers_unet_config["cross_attention_dim"], + "attention_head_dim": diffusers_unet_config["attention_head_dim"], + "use_linear_projection": diffusers_unet_config["use_linear_projection"], + "class_embed_type": diffusers_unet_config["class_embed_type"], + "addition_embed_type": diffusers_unet_config["addition_embed_type"], + "addition_time_embed_dim": diffusers_unet_config["addition_time_embed_dim"], + "projection_class_embeddings_input_dim": diffusers_unet_config["projection_class_embeddings_input_dim"], + "transformer_layers_per_block": diffusers_unet_config["transformer_layers_per_block"], + } + + return controlnet_config + + +def create_vae_diffusers_config(original_config, image_size, scaling_factor=None, latents_mean=None, latents_std=None): + """ + Creates a config for the diffusers based on the config of the LDM model. 
+ """ + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + if (scaling_factor is None) and (latents_mean is not None) and (latents_std is not None): + scaling_factor = PLAYGROUND_VAE_SCALING_FACTOR + elif (scaling_factor is None) and ("scale_factor" in original_config["model"]["params"]): + scaling_factor = original_config["model"]["params"]["scale_factor"] + elif scaling_factor is None: + scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR + + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = { + "sample_size": image_size, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], + "down_block_types": down_block_types, + "up_block_types": up_block_types, + "block_out_channels": block_out_channels, + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], + "scaling_factor": scaling_factor, + } + if latents_mean is not None and latents_std is not None: + config.update({"latents_mean": latents_mean, "latents_std": latents_std}) + + return config + + +def update_unet_resnet_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping=None): + for ldm_key in ldm_keys: + diffusers_key = ( + ldm_key.replace("in_layers.0", "norm1") + .replace("in_layers.2", "conv1") + .replace("out_layers.0", "norm2") + .replace("out_layers.3", "conv2") + .replace("emb_layers.1", "time_emb_proj") + .replace("skip_connection", "conv_shortcut") + ) + if mapping: + diffusers_key = diffusers_key.replace(mapping["old"], mapping["new"]) + new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) + + +def update_unet_attention_ldm_to_diffusers(ldm_keys, new_checkpoint, checkpoint, mapping): + for ldm_key in ldm_keys: + diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]) + new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) + + +def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + # extract state_dict for UNet + unet_state_dict = {} + keys = list(checkpoint.keys()) + unet_key = LDM_UNET_KEY + + # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA + if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: + logger.warning("Checkpoint has both EMA and non-EMA weights.") + logger.warning( + "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" + " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." + ) + for key in keys: + if key.startswith("model.diffusion_model"): + flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + else: + if sum(k.startswith("model_ema") for k in keys) > 100: + logger.warning( + "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" + " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
+ ) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + ldm_unet_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["layers"] + for diffusers_key, ldm_key in ldm_unet_keys.items(): + if ldm_key not in unet_state_dict: + continue + new_checkpoint[diffusers_key] = unet_state_dict[ldm_key] + + if ("class_embed_type" in config) and (config["class_embed_type"] in ["timestep", "projection"]): + class_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["class_embed_type"] + for diffusers_key, ldm_key in class_embed_keys.items(): + new_checkpoint[diffusers_key] = unet_state_dict[ldm_key] + + if ("addition_embed_type" in config) and (config["addition_embed_type"] == "text_time"): + addition_embed_keys = DIFFUSERS_TO_LDM_MAPPING["unet"]["addition_embed_type"] + for diffusers_key, ldm_key in addition_embed_keys.items(): + new_checkpoint[diffusers_key] = unet_state_dict[ldm_key] + + # Relevant to StableDiffusionUpscalePipeline + if "num_class_embeds" in config: + if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict): + new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + for layer_id in range(num_output_blocks) + } + + # Down blocks + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + update_unet_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + unet_state_dict, + {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}, + ) + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + if attentions: + update_unet_attention_ldm_to_diffusers( + attentions, + new_checkpoint, + unet_state_dict, + {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}, + ) + + # Mid blocks + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + update_unet_resnet_ldm_to_diffusers( + resnet_0, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.0", "new": "mid_block.resnets.0"} + ) + 
update_unet_resnet_ldm_to_diffusers( + resnet_1, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.2", "new": "mid_block.resnets.1"} + ) + update_unet_attention_ldm_to_diffusers( + attentions, new_checkpoint, unet_state_dict, mapping={"old": "middle_block.1", "new": "mid_block.attentions.0"} + ) + + # Up Blocks + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + + resnets = [ + key for key in output_blocks[i] if f"output_blocks.{i}.0" in key and f"output_blocks.{i}.0.op" not in key + ] + update_unet_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + unet_state_dict, + {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}, + ) + + attentions = [ + key for key in output_blocks[i] if f"output_blocks.{i}.1" in key and f"output_blocks.{i}.1.conv" not in key + ] + if attentions: + update_unet_attention_ldm_to_diffusers( + attentions, + new_checkpoint, + unet_state_dict, + {"old": f"output_blocks.{i}.1", "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}"}, + ) + + if f"output_blocks.{i}.1.conv.weight" in unet_state_dict: + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.1.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.1.conv.bias" + ] + if f"output_blocks.{i}.2.conv.weight" in unet_state_dict: + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.2.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.2.conv.bias" + ] + + return new_checkpoint + + +def convert_controlnet_checkpoint( + checkpoint, + config, +): + # Some controlnet ckpt files are distributed independently from the rest of the + # model components i.e. 
https://huggingface.co/thibaud/controlnet-sd21/ + if "time_embed.0.weight" in checkpoint: + controlnet_state_dict = checkpoint + + else: + controlnet_state_dict = {} + keys = list(checkpoint.keys()) + controlnet_key = LDM_CONTROLNET_KEY + for key in keys: + if key.startswith(controlnet_key): + controlnet_state_dict[key.replace(controlnet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + ldm_controlnet_keys = DIFFUSERS_TO_LDM_MAPPING["controlnet"]["layers"] + for diffusers_key, ldm_key in ldm_controlnet_keys.items(): + if ldm_key not in controlnet_state_dict: + continue + new_checkpoint[diffusers_key] = controlnet_state_dict[ldm_key] + + # Retrieves the keys for the input blocks only + num_input_blocks = len( + {".".join(layer.split(".")[:2]) for layer in controlnet_state_dict if "input_blocks" in layer} + ) + input_blocks = { + layer_id: [key for key in controlnet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } + + # Down blocks + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + update_unet_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + controlnet_state_dict, + {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}, + ) + + if f"input_blocks.{i}.0.op.weight" in controlnet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = controlnet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = controlnet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key] + if attentions: + update_unet_attention_ldm_to_diffusers( + attentions, + new_checkpoint, + controlnet_state_dict, + {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}, + ) + + # controlnet down blocks + for i in range(num_input_blocks): + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = controlnet_state_dict.pop(f"zero_convs.{i}.0.bias") + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len( + {".".join(layer.split(".")[:2]) for layer in controlnet_state_dict if "middle_block" in layer} + ) + middle_blocks = { + layer_id: [key for key in controlnet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + if middle_blocks: + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + update_unet_resnet_ldm_to_diffusers( + resnet_0, + new_checkpoint, + controlnet_state_dict, + mapping={"old": "middle_block.0", "new": "mid_block.resnets.0"}, + ) + update_unet_resnet_ldm_to_diffusers( + resnet_1, + new_checkpoint, + controlnet_state_dict, + mapping={"old": "middle_block.2", "new": "mid_block.resnets.1"}, + ) + update_unet_attention_ldm_to_diffusers( + attentions, + new_checkpoint, + controlnet_state_dict, + mapping={"old": "middle_block.1", "new": "mid_block.attentions.0"}, + ) + + # mid block + new_checkpoint["controlnet_mid_block.weight"] = controlnet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = 
controlnet_state_dict.pop("middle_block_out.0.bias") + + # controlnet cond embedding blocks + cond_embedding_blocks = { + ".".join(layer.split(".")[:2]) + for layer in controlnet_state_dict + if "input_hint_block" in layer and ("input_hint_block.0" not in layer) and ("input_hint_block.14" not in layer) + } + num_cond_embedding_blocks = len(cond_embedding_blocks) + + for idx in range(1, num_cond_embedding_blocks + 1): + diffusers_idx = idx - 1 + cond_block_id = 2 * idx + + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.weight"] = controlnet_state_dict.pop( + f"input_hint_block.{cond_block_id}.weight" + ) + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_idx}.bias"] = controlnet_state_dict.pop( + f"input_hint_block.{cond_block_id}.bias" + ) + + return new_checkpoint + + +def create_diffusers_controlnet_model_from_ldm( + pipeline_class_name, original_config, checkpoint, upcast_attention=False, image_size=None, torch_dtype=None +): + # import here to avoid circular imports + from ..models import ControlNetModel + + image_size = set_image_size(pipeline_class_name, original_config, checkpoint, image_size=image_size) + + diffusers_config = create_controlnet_diffusers_config(original_config, image_size=image_size) + diffusers_config["upcast_attention"] = upcast_attention + + diffusers_format_controlnet_checkpoint = convert_controlnet_checkpoint(checkpoint, diffusers_config) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + controlnet = ControlNetModel(**diffusers_config) + + if is_accelerate_available(): + from ..models.modeling_utils import load_model_dict_into_meta + + unexpected_keys = load_model_dict_into_meta( + controlnet, diffusers_format_controlnet_checkpoint, dtype=torch_dtype + ) + if controlnet._keys_to_ignore_on_load_unexpected is not None: + for pat in controlnet._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {controlnet.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + else: + controlnet.load_state_dict(diffusers_format_controlnet_checkpoint) + + if torch_dtype is not None: + controlnet = controlnet.to(torch_dtype) + + return {"controlnet": controlnet} + + +def update_vae_resnet_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping): + for ldm_key in keys: + diffusers_key = ldm_key.replace(mapping["old"], mapping["new"]).replace("nin_shortcut", "conv_shortcut") + new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) + + +def update_vae_attentions_ldm_to_diffusers(keys, new_checkpoint, checkpoint, mapping): + for ldm_key in keys: + diffusers_key = ( + ldm_key.replace(mapping["old"], mapping["new"]) + .replace("norm.weight", "group_norm.weight") + .replace("norm.bias", "group_norm.bias") + .replace("q.weight", "to_q.weight") + .replace("q.bias", "to_q.bias") + .replace("k.weight", "to_k.weight") + .replace("k.bias", "to_k.bias") + .replace("v.weight", "to_v.weight") + .replace("v.bias", "to_v.bias") + .replace("proj_out.weight", "to_out.0.weight") + .replace("proj_out.bias", "to_out.0.bias") + ) + new_checkpoint[diffusers_key] = checkpoint.pop(ldm_key) + + # proj_attn.weight has to be converted from conv 1D to linear + shape = new_checkpoint[diffusers_key].shape + + if len(shape) == 3: + new_checkpoint[diffusers_key] = new_checkpoint[diffusers_key][:, :, 0] + elif len(shape) == 4: + new_checkpoint[diffusers_key] = 
new_checkpoint[diffusers_key][:, :, 0, 0] + + +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + # remove the LDM_VAE_KEY prefix from the ldm checkpoint keys so that it is easier to map them to diffusers keys + vae_state_dict = {} + keys = list(checkpoint.keys()) + vae_key = LDM_VAE_KEY if any(k.startswith(LDM_VAE_KEY) for k in keys) else "" + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + vae_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["vae"] + for diffusers_key, ldm_key in vae_diffusers_ldm_map.items(): + if ldm_key not in vae_state_dict: + continue + new_checkpoint[diffusers_key] = vae_state_dict[ldm_key] + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len(config["down_block_types"]) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + update_vae_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + vae_state_dict, + mapping={"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}, + ) + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + update_vae_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + vae_state_dict, + mapping={"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"}, + ) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + update_vae_attentions_ldm_to_diffusers( + mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"} + ) + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len(config["up_block_types"]) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] + update_vae_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + vae_state_dict, + mapping={"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}, + ) + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + update_vae_resnet_ldm_to_diffusers( + resnets, + new_checkpoint, + vae_state_dict, + mapping={"old": f"mid.block_{i}", "new": 
f"mid_block.resnets.{i - 1}"}, + ) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + update_vae_attentions_ldm_to_diffusers( + mid_attentions, new_checkpoint, vae_state_dict, mapping={"old": "mid.attn_1", "new": "mid_block.attentions.0"} + ) + conv_attn_to_linear(new_checkpoint) + + return new_checkpoint + + +def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False, torch_dtype=None): + try: + config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'." + ) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + text_model = CLIPTextModel(config) + + keys = list(checkpoint.keys()) + text_model_dict = {} + + remove_prefixes = LDM_CLIP_PREFIX_TO_REMOVE + + for key in keys: + for prefix in remove_prefixes: + if key.startswith(prefix): + diffusers_key = key.replace(prefix, "") + text_model_dict[diffusers_key] = checkpoint[key] + + if is_accelerate_available(): + from ..models.modeling_utils import load_model_dict_into_meta + + unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype) + if text_model._keys_to_ignore_on_load_unexpected is not None: + for pat in text_model._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {text_model.__class__.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + else: + if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)): + text_model_dict.pop("text_model.embeddings.position_ids", None) + + text_model.load_state_dict(text_model_dict) + + if torch_dtype is not None: + text_model = text_model.to(torch_dtype) + + return text_model + + +def create_text_encoder_from_open_clip_checkpoint( + config_name, + checkpoint, + prefix="cond_stage_model.model.", + has_projection=False, + local_files_only=False, + torch_dtype=None, + **config_kwargs, +): + try: + config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'." 
+ ) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config) + + text_model_dict = {} + text_proj_key = prefix + "text_projection" + text_proj_dim = ( + int(checkpoint[text_proj_key].shape[0]) if text_proj_key in checkpoint else LDM_OPEN_CLIP_TEXT_PROJECTION_DIM + ) + text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") + + keys = list(checkpoint.keys()) + keys_to_ignore = SD_2_TEXT_ENCODER_KEYS_TO_IGNORE + + openclip_diffusers_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["layers"] + for diffusers_key, ldm_key in openclip_diffusers_ldm_map.items(): + ldm_key = prefix + ldm_key + if ldm_key not in checkpoint: + continue + if ldm_key in keys_to_ignore: + continue + if ldm_key.endswith("text_projection"): + text_model_dict[diffusers_key] = checkpoint[ldm_key].T.contiguous() + else: + text_model_dict[diffusers_key] = checkpoint[ldm_key] + + for key in keys: + if key in keys_to_ignore: + continue + + if not key.startswith(prefix + "transformer."): + continue + + diffusers_key = key.replace(prefix + "transformer.", "") + transformer_diffusers_to_ldm_map = DIFFUSERS_TO_LDM_MAPPING["openclip"]["transformer"] + for new_key, old_key in transformer_diffusers_to_ldm_map.items(): + diffusers_key = ( + diffusers_key.replace(old_key, new_key).replace(".in_proj_weight", "").replace(".in_proj_bias", "") + ) + + if key.endswith(".in_proj_weight"): + weight_value = checkpoint[key] + + text_model_dict[diffusers_key + ".q_proj.weight"] = weight_value[:text_proj_dim, :] + text_model_dict[diffusers_key + ".k_proj.weight"] = weight_value[text_proj_dim : text_proj_dim * 2, :] + text_model_dict[diffusers_key + ".v_proj.weight"] = weight_value[text_proj_dim * 2 :, :] + + elif key.endswith(".in_proj_bias"): + weight_value = checkpoint[key] + text_model_dict[diffusers_key + ".q_proj.bias"] = weight_value[:text_proj_dim] + text_model_dict[diffusers_key + ".k_proj.bias"] = weight_value[text_proj_dim : text_proj_dim * 2] + text_model_dict[diffusers_key + ".v_proj.bias"] = weight_value[text_proj_dim * 2 :] + else: + text_model_dict[diffusers_key] = checkpoint[key] + + if is_accelerate_available(): + from ..models.modeling_utils import load_model_dict_into_meta + + unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype) + if text_model._keys_to_ignore_on_load_unexpected is not None: + for pat in text_model._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {text_model.__class__.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + + else: + if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)): + text_model_dict.pop("text_model.embeddings.position_ids", None) + + text_model.load_state_dict(text_model_dict) + + if torch_dtype is not None: + text_model = text_model.to(torch_dtype) + + return text_model + + +def create_diffusers_unet_model_from_ldm( + pipeline_class_name, + original_config, + checkpoint, + num_in_channels=None, + upcast_attention=None, + extract_ema=False, + image_size=None, + torch_dtype=None, + model_type=None, +): + from ..models import UNet2DConditionModel + + if num_in_channels is None: + if pipeline_class_name in [ + "StableDiffusionInpaintPipeline", + 
"StableDiffusionControlNetInpaintPipeline", + "StableDiffusionXLInpaintPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + ]: + num_in_channels = 9 + + elif pipeline_class_name == "StableDiffusionUpscalePipeline": + num_in_channels = 7 + + else: + num_in_channels = 4 + + image_size = set_image_size( + pipeline_class_name, original_config, checkpoint, image_size=image_size, model_type=model_type + ) + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + unet_config["in_channels"] = num_in_channels + if upcast_attention is not None: + unet_config["upcast_attention"] = upcast_attention + + diffusers_format_unet_checkpoint = convert_ldm_unet_checkpoint(checkpoint, unet_config, extract_ema=extract_ema) + ctx = init_empty_weights if is_accelerate_available() else nullcontext + + with ctx(): + unet = UNet2DConditionModel(**unet_config) + + if is_accelerate_available(): + from ..models.modeling_utils import load_model_dict_into_meta + + unexpected_keys = load_model_dict_into_meta(unet, diffusers_format_unet_checkpoint, dtype=torch_dtype) + if unet._keys_to_ignore_on_load_unexpected is not None: + for pat in unet._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {unet.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + else: + unet.load_state_dict(diffusers_format_unet_checkpoint) + + if torch_dtype is not None: + unet = unet.to(torch_dtype) + + return {"unet": unet} + + +def create_diffusers_vae_model_from_ldm( + pipeline_class_name, + original_config, + checkpoint, + image_size=None, + scaling_factor=None, + torch_dtype=None, + model_type=None, +): + # import here to avoid circular imports + from ..models import AutoencoderKL + + image_size = set_image_size( + pipeline_class_name, original_config, checkpoint, image_size=image_size, model_type=model_type + ) + model_type = infer_model_type(original_config, checkpoint, model_type) + + if model_type == "Playground": + edm_mean = ( + checkpoint["edm_mean"].to(dtype=torch_dtype).tolist() if torch_dtype else checkpoint["edm_mean"].tolist() + ) + edm_std = ( + checkpoint["edm_std"].to(dtype=torch_dtype).tolist() if torch_dtype else checkpoint["edm_std"].tolist() + ) + else: + edm_mean = None + edm_std = None + + vae_config = create_vae_diffusers_config( + original_config, + image_size=image_size, + scaling_factor=scaling_factor, + latents_mean=edm_mean, + latents_std=edm_std, + ) + diffusers_format_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + ctx = init_empty_weights if is_accelerate_available() else nullcontext + + with ctx(): + vae = AutoencoderKL(**vae_config) + + if is_accelerate_available(): + from ..models.modeling_utils import load_model_dict_into_meta + + unexpected_keys = load_model_dict_into_meta(vae, diffusers_format_vae_checkpoint, dtype=torch_dtype) + if vae._keys_to_ignore_on_load_unexpected is not None: + for pat in vae._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {vae.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + else: + vae.load_state_dict(diffusers_format_vae_checkpoint) + + if torch_dtype is not None: + vae = vae.to(torch_dtype) + + return {"vae": vae} + + +def 
create_text_encoders_and_tokenizers_from_ldm( + original_config, + checkpoint, + model_type=None, + local_files_only=False, + torch_dtype=None, +): + model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type) + + if model_type == "FrozenOpenCLIPEmbedder": + config_name = "stabilityai/stable-diffusion-2" + config_kwargs = {"subfolder": "text_encoder"} + + try: + text_encoder = create_text_encoder_from_open_clip_checkpoint( + config_name, checkpoint, local_files_only=local_files_only, torch_dtype=torch_dtype, **config_kwargs + ) + tokenizer = CLIPTokenizer.from_pretrained( + config_name, subfolder="tokenizer", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder in the following path: '{config_name}'." + ) + else: + return {"text_encoder": text_encoder, "tokenizer": tokenizer} + + elif model_type == "FrozenCLIPEmbedder": + try: + config_name = "openai/clip-vit-large-patch14" + text_encoder = create_text_encoder_from_ldm_clip_checkpoint( + config_name, + checkpoint, + local_files_only=local_files_only, + torch_dtype=torch_dtype, + ) + tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only) + + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: '{config_name}'." + ) + else: + return {"text_encoder": text_encoder, "tokenizer": tokenizer} + + elif model_type == "SDXL-Refiner": + config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" + config_kwargs = {"projection_dim": 1280} + prefix = "conditioner.embedders.0.model." + + try: + tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only) + text_encoder_2 = create_text_encoder_from_open_clip_checkpoint( + config_name, + checkpoint, + prefix=prefix, + has_projection=True, + local_files_only=local_files_only, + torch_dtype=torch_dtype, + **config_kwargs, + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'." + ) + + else: + return { + "text_encoder": None, + "tokenizer": None, + "tokenizer_2": tokenizer_2, + "text_encoder_2": text_encoder_2, + } + + elif model_type in ["SDXL", "Playground"]: + try: + config_name = "openai/clip-vit-large-patch14" + tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only) + text_encoder = create_text_encoder_from_ldm_clip_checkpoint( + config_name, checkpoint, local_files_only=local_files_only, torch_dtype=torch_dtype + ) + + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'." + ) + + try: + config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" + config_kwargs = {"projection_dim": 1280} + prefix = "conditioner.embedders.1.model." 
+ tokenizer_2 = CLIPTokenizer.from_pretrained(config_name, pad_token="!", local_files_only=local_files_only) + text_encoder_2 = create_text_encoder_from_open_clip_checkpoint( + config_name, + checkpoint, + prefix=prefix, + has_projection=True, + local_files_only=local_files_only, + torch_dtype=torch_dtype, + **config_kwargs, + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the text_encoder_2 and tokenizer_2 in the following path: {config_name} with `pad_token` set to '!'." + ) + + return { + "tokenizer": tokenizer, + "text_encoder": text_encoder, + "tokenizer_2": tokenizer_2, + "text_encoder_2": text_encoder_2, + } + + return + + +def create_scheduler_from_ldm( + pipeline_class_name, + original_config, + checkpoint, + prediction_type=None, + scheduler_type="ddim", + model_type=None, +): + scheduler_config = get_default_scheduler_config() + model_type = infer_model_type(original_config, checkpoint=checkpoint, model_type=model_type) + + global_step = checkpoint["global_step"] if "global_step" in checkpoint else None + + num_train_timesteps = getattr(original_config["model"]["params"], "timesteps", None) or 1000 + scheduler_config["num_train_timesteps"] = num_train_timesteps + + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): + if prediction_type is None: + # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` + # as it relies on a brittle global step parameter here + prediction_type = "epsilon" if global_step == 875000 else "v_prediction" + + else: + prediction_type = prediction_type or "epsilon" + + scheduler_config["prediction_type"] = prediction_type + + if model_type in ["SDXL", "SDXL-Refiner"]: + scheduler_type = "euler" + elif model_type == "Playground": + scheduler_type = "edm_dpm_solver_multistep" + else: + beta_start = original_config["model"]["params"].get("linear_start", 0.02) + beta_end = original_config["model"]["params"].get("linear_end", 0.085) + scheduler_config["beta_start"] = beta_start + scheduler_config["beta_end"] = beta_end + scheduler_config["beta_schedule"] = "scaled_linear" + scheduler_config["clip_sample"] = False + scheduler_config["set_alpha_to_one"] = False + + if scheduler_type == "pndm": + scheduler_config["skip_prk_steps"] = True + scheduler = PNDMScheduler.from_config(scheduler_config) + + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(scheduler_config) + + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(scheduler_config) + + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(scheduler_config) + + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler_config) + + elif scheduler_type == "dpm": + scheduler = DPMSolverMultistepScheduler.from_config(scheduler_config) + + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config(scheduler_config) + + elif scheduler_type == "edm_dpm_solver_multistep": + scheduler_config = { + "algorithm_type": "dpmsolver++", + "dynamic_thresholding_ratio": 0.995, + "euler_at_final": False, + "final_sigmas_type": "zero", + "lower_order_final": True, + "num_train_timesteps": 1000, + "prediction_type": "epsilon", + "rho": 7.0, + "sample_max_value": 1.0, + "sigma_data": 0.5, + "sigma_max": 80.0, + "sigma_min": 0.002, + "solver_order": 2, + "solver_type": "midpoint", + 
"thresholding": False, + } + scheduler = EDMDPMSolverMultistepScheduler(**scheduler_config) + + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + if pipeline_class_name == "StableDiffusionUpscalePipeline": + scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler") + low_res_scheduler = DDPMScheduler.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler" + ) + + return { + "scheduler": scheduler, + "low_res_scheduler": low_res_scheduler, + } + + return {"scheduler": scheduler} diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/textual_inversion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/textual_inversion.py new file mode 100644 index 000000000..aaaf4b68b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/textual_inversion.py @@ -0,0 +1,562 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Union + +import safetensors +import torch +from huggingface_hub.utils import validate_hf_hub_args +from torch import nn + +from ..utils import _get_model_file, is_accelerate_available, is_transformers_available, logging + + +if is_transformers_available(): + from transformers import PreTrainedModel, PreTrainedTokenizer + +if is_accelerate_available(): + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + +logger = logging.get_logger(__name__) + +TEXT_INVERSION_NAME = "learned_embeds.bin" +TEXT_INVERSION_NAME_SAFE = "learned_embeds.safetensors" + + +@validate_hf_hub_args +def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + use_safetensors = kwargs.pop("use_safetensors", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + user_agent = { + "file_type": "text_inversion", + "framework": "pytorch", + } + state_dicts = [] + for pretrained_model_name_or_path in pretrained_model_name_or_paths: + if not isinstance(pretrained_model_name_or_path, (dict, torch.Tensor)): + # 3.1. 
Load textual inversion file + model_file = None + + # Let's first try to load .safetensors weights + if (use_safetensors and weight_name is None) or ( + weight_name is not None and weight_name.endswith(".safetensors") + ): + try: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=weight_name or TEXT_INVERSION_NAME_SAFE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = safetensors.torch.load_file(model_file, device="cpu") + except Exception as e: + if not allow_pickle: + raise e + + model_file = None + + if model_file is None: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=weight_name or TEXT_INVERSION_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path + + state_dicts.append(state_dict) + + return state_dicts + + +class TextualInversionLoaderMixin: + r""" + Load Textual Inversion tokens and embeddings to the tokenizer and text encoder. + """ + + def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"): # noqa: F821 + r""" + Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to + be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual + inversion token or if the textual inversion token is a single vector, the input prompt is returned. + + Parameters: + prompt (`str` or list of `str`): + The prompt or prompts to guide the image generation. + tokenizer (`PreTrainedTokenizer`): + The tokenizer responsible for encoding the prompt into input tokens. + + Returns: + `str` or list of `str`: The converted prompt + """ + if not isinstance(prompt, List): + prompts = [prompt] + else: + prompts = prompt + + prompts = [self._maybe_convert_prompt(p, tokenizer) for p in prompts] + + if not isinstance(prompt, List): + return prompts[0] + + return prompts + + def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"): # noqa: F821 + r""" + Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds + to a multi-vector textual inversion embedding, this function will process the prompt so that the special token + is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual + inversion token or a textual inversion token that is a single vector, the input prompt is simply returned. + + Parameters: + prompt (`str`): + The prompt to guide the image generation. + tokenizer (`PreTrainedTokenizer`): + The tokenizer responsible for encoding the prompt into input tokens. 
+ + Returns: + `str`: The converted prompt + """ + tokens = tokenizer.tokenize(prompt) + unique_tokens = set(tokens) + for token in unique_tokens: + if token in tokenizer.added_tokens_encoder: + replacement = token + i = 1 + while f"{token}_{i}" in tokenizer.added_tokens_encoder: + replacement += f" {token}_{i}" + i += 1 + + prompt = prompt.replace(token, replacement) + + return prompt + + def _check_text_inv_inputs(self, tokenizer, text_encoder, pretrained_model_name_or_paths, tokens): + if tokenizer is None: + raise ValueError( + f"{self.__class__.__name__} requires `self.tokenizer` or passing a `tokenizer` of type `PreTrainedTokenizer` for calling" + f" `{self.load_textual_inversion.__name__}`" + ) + + if text_encoder is None: + raise ValueError( + f"{self.__class__.__name__} requires `self.text_encoder` or passing a `text_encoder` of type `PreTrainedModel` for calling" + f" `{self.load_textual_inversion.__name__}`" + ) + + if len(pretrained_model_name_or_paths) > 1 and len(pretrained_model_name_or_paths) != len(tokens): + raise ValueError( + f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)} " + f"Make sure both lists have the same length." + ) + + valid_tokens = [t for t in tokens if t is not None] + if len(set(valid_tokens)) < len(valid_tokens): + raise ValueError(f"You have passed a list of tokens that contains duplicates: {tokens}") + + @staticmethod + def _retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer): + all_tokens = [] + all_embeddings = [] + for state_dict, token in zip(state_dicts, tokens): + if isinstance(state_dict, torch.Tensor): + if token is None: + raise ValueError( + "You are trying to load a textual inversion embedding that has been saved as a PyTorch tensor. Make sure to pass the name of the corresponding token in this case: `token=...`." + ) + loaded_token = token + embedding = state_dict + elif len(state_dict) == 1: + # diffusers + loaded_token, embedding = next(iter(state_dict.items())) + elif "string_to_param" in state_dict: + # A1111 + loaded_token = state_dict["name"] + embedding = state_dict["string_to_param"]["*"] + else: + raise ValueError( + f"Loaded state dictionary is incorrect: {state_dict}. \n\n" + "Please verify that the loaded state dictionary of the textual embedding either only has a single key or includes the `string_to_param`" + " input key." + ) + + if token is not None and loaded_token != token: + logger.info(f"The loaded token: {loaded_token} is overwritten by the passed token {token}.") + else: + token = loaded_token + + if token in tokenizer.get_vocab(): + raise ValueError( + f"Token {token} already in tokenizer vocabulary. Please choose a different token name or remove {token} and embedding from the tokenizer and text encoder." + ) + + all_tokens.append(token) + all_embeddings.append(embedding) + + return all_tokens, all_embeddings + + @staticmethod + def _extend_tokens_and_embeddings(tokens, embeddings, tokenizer): + all_tokens = [] + all_embeddings = [] + + for embedding, token in zip(embeddings, tokens): + if f"{token}_1" in tokenizer.get_vocab(): + multi_vector_tokens = [token] + i = 1 + while f"{token}_{i}" in tokenizer.added_tokens_encoder: + multi_vector_tokens.append(f"{token}_{i}") + i += 1 + + raise ValueError( + f"Multi-vector Token {multi_vector_tokens} already in tokenizer vocabulary. Please choose a different token name or remove the {multi_vector_tokens} and embedding from the tokenizer and text encoder." 
+ ) + + is_multi_vector = len(embedding.shape) > 1 and embedding.shape[0] > 1 + if is_multi_vector: + all_tokens += [token] + [f"{token}_{i}" for i in range(1, embedding.shape[0])] + all_embeddings += [e for e in embedding] # noqa: C416 + else: + all_tokens += [token] + all_embeddings += [embedding[0]] if len(embedding.shape) > 1 else [embedding] + + return all_tokens, all_embeddings + + @validate_hf_hub_args + def load_textual_inversion( + self, + pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], + token: Optional[Union[str, List[str]]] = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821 + text_encoder: Optional["PreTrainedModel"] = None, # noqa: F821 + **kwargs, + ): + r""" + Load Textual Inversion embeddings into the text encoder of [`StableDiffusionPipeline`] (both 🤗 Diffusers and + Automatic1111 formats are supported). + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`): + Can be either one of the following or a list of them: + + - A string, the *model id* (for example `sd-concepts-library/low-poly-hd-logos-icons`) of a + pretrained model hosted on the Hub. + - A path to a *directory* (for example `./my_text_inversion_directory/`) containing the textual + inversion weights. + - A path to a *file* (for example `./my_text_inversions.pt`) containing textual inversion weights. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + token (`str` or `List[str]`, *optional*): + Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a + list, then `token` must also be a list of equal length. + text_encoder ([`~transformers.CLIPTextModel`], *optional*): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + If not specified, function will take self.tokenizer. + tokenizer ([`~transformers.CLIPTokenizer`], *optional*): + A `CLIPTokenizer` to tokenize text. If not specified, function will take self.tokenizer. + weight_name (`str`, *optional*): + Name of a custom weight file. This should be used when: + + - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight + name such as `text_inv.bin`. + - The saved textual inversion file is in the Automatic1111 format. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. 
+ token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + + Example: + + To load a Textual Inversion embedding vector in 🤗 Diffusers format: + + ```py + from diffusers import StableDiffusionPipeline + import torch + + model_id = "runwayml/stable-diffusion-v1-5" + pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") + + pipe.load_textual_inversion("sd-concepts-library/cat-toy") + + prompt = "A backpack" + + image = pipe(prompt, num_inference_steps=50).images[0] + image.save("cat-backpack.png") + ``` + + To load a Textual Inversion embedding vector in Automatic1111 format, make sure to download the vector first + (for example from [civitAI](https://civitai.com/models/3036?modelVersionId=9857)) and then load the vector + locally: + + ```py + from diffusers import StableDiffusionPipeline + import torch + + model_id = "runwayml/stable-diffusion-v1-5" + pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") + + pipe.load_textual_inversion("./charturnerv2.pt", token="charturnerv2") + + prompt = "charturnerv2, multiple views of the same character in the same outfit, a character turnaround of a woman wearing a black jacket and red shirt, best quality, intricate details." + + image = pipe(prompt, num_inference_steps=50).images[0] + image.save("character.png") + ``` + + """ + # 1. Set correct tokenizer and text encoder + tokenizer = tokenizer or getattr(self, "tokenizer", None) + text_encoder = text_encoder or getattr(self, "text_encoder", None) + + # 2. Normalize inputs + pretrained_model_name_or_paths = ( + [pretrained_model_name_or_path] + if not isinstance(pretrained_model_name_or_path, list) + else pretrained_model_name_or_path + ) + tokens = [token] if not isinstance(token, list) else token + if tokens[0] is None: + tokens = tokens * len(pretrained_model_name_or_paths) + + # 3. Check inputs + self._check_text_inv_inputs(tokenizer, text_encoder, pretrained_model_name_or_paths, tokens) + + # 4. Load state dicts of textual embeddings + state_dicts = load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs) + + # 4.1 Handle the special case when state_dict is a tensor that contains n embeddings for n tokens + if len(tokens) > 1 and len(state_dicts) == 1: + if isinstance(state_dicts[0], torch.Tensor): + state_dicts = list(state_dicts[0]) + if len(tokens) != len(state_dicts): + raise ValueError( + f"You have passed a state_dict contains {len(state_dicts)} embeddings, and list of tokens of length {len(tokens)} " + f"Make sure both have the same length." + ) + + # 4. Retrieve tokens and embeddings + tokens, embeddings = self._retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer) + + # 5. 
Extend tokens and embeddings for multi vector + tokens, embeddings = self._extend_tokens_and_embeddings(tokens, embeddings, tokenizer) + + # 6. Make sure all embeddings have the correct size + expected_emb_dim = text_encoder.get_input_embeddings().weight.shape[-1] + if any(expected_emb_dim != emb.shape[-1] for emb in embeddings): + raise ValueError( + "Loaded embeddings are of incorrect shape. Expected each textual inversion embedding " + f"to be of shape {expected_emb_dim}, but found shapes {[emb.shape[-1] for emb in embeddings]}." + ) + + # 7. Now we can be sure that loading the embedding matrix works + # < Unsafe code: + + # 7.1 Offload all hooks in case the pipeline was cpu offloaded; make sure we offload and onload again + is_model_cpu_offload = False + is_sequential_cpu_offload = False + for _, component in self.components.items(): + if isinstance(component, nn.Module): + if hasattr(component, "_hf_hook"): + is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload) + is_sequential_cpu_offload = isinstance(getattr(component, "_hf_hook"), AlignDevicesHook) + logger.info( + "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again." + ) + remove_hook_from_module(component, recurse=is_sequential_cpu_offload) + + # 7.2 save expected device and dtype + device = text_encoder.device + dtype = text_encoder.dtype + + # 7.3 Increase token embedding matrix + text_encoder.resize_token_embeddings(len(tokenizer) + len(tokens)) + input_embeddings = text_encoder.get_input_embeddings().weight + + # 7.4 Load token and embedding + for token, embedding in zip(tokens, embeddings): + # add tokens and get ids + tokenizer.add_tokens(token) + token_id = tokenizer.convert_tokens_to_ids(token) + input_embeddings.data[token_id] = embedding + logger.info(f"Loaded textual inversion embedding for {token}.") + + input_embeddings.to(dtype=dtype, device=device) + + # 7.5 Offload the model again + if is_model_cpu_offload: + self.enable_model_cpu_offload() + elif is_sequential_cpu_offload: + self.enable_sequential_cpu_offload() + + # / Unsafe Code > + + def unload_textual_inversion( + self, + tokens: Optional[Union[str, List[str]]] = None, + tokenizer: Optional["PreTrainedTokenizer"] = None, + text_encoder: Optional["PreTrainedModel"] = None, + ): + r""" + Unload Textual Inversion embeddings from the text encoder of [`StableDiffusionPipeline`] + + Example: + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5") + + # Example 1 + pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork") + pipeline.load_textual_inversion("sd-concepts-library/moeb-style") + + # Remove all token embeddings + pipeline.unload_textual_inversion() + + # Example 2 + pipeline.load_textual_inversion("sd-concepts-library/moeb-style") + pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork") + + # Remove just one token + pipeline.unload_textual_inversion("") + + # Example 3: unload from SDXL + pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") + embedding_path = hf_hub_download(repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model") + + # load embeddings to the text encoders + state_dict = load_file(embedding_path) + + # load embeddings of text_encoder 1 (CLIP ViT-L/14) + 
pipeline.load_textual_inversion(state_dict["clip_l"], token=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) + # load embeddings of text_encoder 2 (CLIP ViT-G/14) + pipeline.load_textual_inversion(state_dict["clip_g"], token=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) + + # Unload explicitly from both text encoders abd tokenizers + pipeline.unload_textual_inversion(tokens=["", ""], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer) + pipeline.unload_textual_inversion(tokens=["", ""], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2) + + ``` + """ + + tokenizer = tokenizer or getattr(self, "tokenizer", None) + text_encoder = text_encoder or getattr(self, "text_encoder", None) + + # Get textual inversion tokens and ids + token_ids = [] + last_special_token_id = None + + if tokens: + if isinstance(tokens, str): + tokens = [tokens] + for added_token_id, added_token in tokenizer.added_tokens_decoder.items(): + if not added_token.special: + if added_token.content in tokens: + token_ids.append(added_token_id) + else: + last_special_token_id = added_token_id + if len(token_ids) == 0: + raise ValueError("No tokens to remove found") + else: + tokens = [] + for added_token_id, added_token in tokenizer.added_tokens_decoder.items(): + if not added_token.special: + token_ids.append(added_token_id) + tokens.append(added_token.content) + else: + last_special_token_id = added_token_id + + # Delete from tokenizer + for token_id, token_to_remove in zip(token_ids, tokens): + del tokenizer._added_tokens_decoder[token_id] + del tokenizer._added_tokens_encoder[token_to_remove] + + # Make all token ids sequential in tokenizer + key_id = 1 + for token_id in tokenizer.added_tokens_decoder: + if token_id > last_special_token_id and token_id > last_special_token_id + key_id: + token = tokenizer._added_tokens_decoder[token_id] + tokenizer._added_tokens_decoder[last_special_token_id + key_id] = token + del tokenizer._added_tokens_decoder[token_id] + tokenizer._added_tokens_encoder[token.content] = last_special_token_id + key_id + key_id += 1 + tokenizer._update_trie() + + # Delete from text encoder + text_embedding_dim = text_encoder.get_input_embeddings().embedding_dim + temp_text_embedding_weights = text_encoder.get_input_embeddings().weight + text_embedding_weights = temp_text_embedding_weights[: last_special_token_id + 1] + to_append = [] + for i in range(last_special_token_id + 1, temp_text_embedding_weights.shape[0]): + if i not in token_ids: + to_append.append(temp_text_embedding_weights[i].unsqueeze(0)) + if len(to_append) > 0: + to_append = torch.cat(to_append, dim=0) + text_embedding_weights = torch.cat([text_embedding_weights, to_append], dim=0) + text_embeddings_filtered = nn.Embedding(text_embedding_weights.shape[0], text_embedding_dim) + text_embeddings_filtered.weight.data = text_embedding_weights + text_encoder.set_input_embeddings(text_embeddings_filtered) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/unet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/unet.py new file mode 100644 index 000000000..0a9544d0d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/unet.py @@ -0,0 +1,1003 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import os +from collections import defaultdict +from contextlib import nullcontext +from functools import partial +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +import safetensors +import torch +import torch.nn.functional as F +from huggingface_hub.utils import validate_hf_hub_args +from torch import nn + +from ..models.embeddings import ( + ImageProjection, + IPAdapterFullImageProjection, + IPAdapterPlusImageProjection, + MultiIPAdapterImageProjection, +) +from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta +from ..utils import ( + USE_PEFT_BACKEND, + _get_model_file, + delete_adapter_layers, + is_accelerate_available, + is_torch_version, + logging, + set_adapter_layers, + set_weights_and_activate_adapters, +) +from .single_file_utils import ( + convert_stable_cascade_unet_single_file_to_diffusers, + infer_stable_cascade_single_file_config, + load_single_file_model_checkpoint, +) +from .utils import AttnProcsLayers + + +if is_accelerate_available(): + from accelerate import init_empty_weights + from accelerate.hooks import AlignDevicesHook, CpuOffload, remove_hook_from_module + +logger = logging.get_logger(__name__) + + +TEXT_ENCODER_NAME = "text_encoder" +UNET_NAME = "unet" + +LORA_WEIGHT_NAME = "pytorch_lora_weights.bin" +LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors" + +CUSTOM_DIFFUSION_WEIGHT_NAME = "pytorch_custom_diffusion_weights.bin" +CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE = "pytorch_custom_diffusion_weights.safetensors" + + +class UNet2DConditionLoadersMixin: + """ + Load LoRA layers into a [`UNet2DCondtionModel`]. + """ + + text_encoder_name = TEXT_ENCODER_NAME + unet_name = UNET_NAME + + @validate_hf_hub_args + def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): + r""" + Load pretrained attention processor layers into [`UNet2DConditionModel`]. Attention processor layers have to be + defined in + [`attention_processor.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py) + and be a `torch.nn.Module` class. + + Parameters: + pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): + Can be either: + + - A string, the model id (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a directory (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. 
+ force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.unet.load_attn_procs( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + ``` + """ + from ..models.attention_processor import CustomDiffusionAttnProcessor + from ..models.lora import LoRACompatibleConv, LoRACompatibleLinear, LoRAConv2dLayer, LoRALinearLayer + + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + weight_name = kwargs.pop("weight_name", None) + use_safetensors = kwargs.pop("use_safetensors", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. 
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + network_alphas = kwargs.pop("network_alphas", None) + + _pipeline = kwargs.pop("_pipeline", None) + + is_network_alphas_none = network_alphas is None + + allow_pickle = False + + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + user_agent = { + "file_type": "attn_procs_weights", + "framework": "pytorch", + } + + model_file = None + if not isinstance(pretrained_model_name_or_path_or_dict, dict): + # Let's first try to load .safetensors weights + if (use_safetensors and weight_name is None) or ( + weight_name is not None and weight_name.endswith(".safetensors") + ): + try: + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name or LORA_WEIGHT_NAME_SAFE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = safetensors.torch.load_file(model_file, device="cpu") + except IOError as e: + if not allow_pickle: + raise e + # try loading non-safetensors weights + pass + if model_file is None: + model_file = _get_model_file( + pretrained_model_name_or_path_or_dict, + weights_name=weight_name or LORA_WEIGHT_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path_or_dict + + # fill attn processors + lora_layers_list = [] + + is_lora = all(("lora" in k or k.endswith(".alpha")) for k in state_dict.keys()) and not USE_PEFT_BACKEND + is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys()) + + if is_lora: + # correct keys + state_dict, network_alphas = self.convert_state_dict_legacy_attn_format(state_dict, network_alphas) + + if network_alphas is not None: + network_alphas_keys = list(network_alphas.keys()) + used_network_alphas_keys = set() + + lora_grouped_dict = defaultdict(dict) + mapped_network_alphas = {} + + all_keys = list(state_dict.keys()) + for key in all_keys: + value = state_dict.pop(key) + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + lora_grouped_dict[attn_processor_key][sub_key] = value + + # Create another `mapped_network_alphas` dictionary so that we can properly map them. 
+ if network_alphas is not None: + for k in network_alphas_keys: + if k.replace(".alpha", "") in key: + mapped_network_alphas.update({attn_processor_key: network_alphas.get(k)}) + used_network_alphas_keys.add(k) + + if not is_network_alphas_none: + if len(set(network_alphas_keys) - used_network_alphas_keys) > 0: + raise ValueError( + f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}" + ) + + if len(state_dict) > 0: + raise ValueError( + f"The `state_dict` has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}" + ) + + for key, value_dict in lora_grouped_dict.items(): + attn_processor = self + for sub_key in key.split("."): + attn_processor = getattr(attn_processor, sub_key) + + # Process non-attention layers, which don't have to_{k,v,q,out_proj}_lora layers + # or add_{k,v,q,out_proj}_proj_lora layers. + rank = value_dict["lora.down.weight"].shape[0] + + if isinstance(attn_processor, LoRACompatibleConv): + in_features = attn_processor.in_channels + out_features = attn_processor.out_channels + kernel_size = attn_processor.kernel_size + + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + lora = LoRAConv2dLayer( + in_features=in_features, + out_features=out_features, + rank=rank, + kernel_size=kernel_size, + stride=attn_processor.stride, + padding=attn_processor.padding, + network_alpha=mapped_network_alphas.get(key), + ) + elif isinstance(attn_processor, LoRACompatibleLinear): + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + lora = LoRALinearLayer( + attn_processor.in_features, + attn_processor.out_features, + rank, + mapped_network_alphas.get(key), + ) + else: + raise ValueError(f"Module {key} is not a LoRACompatibleConv or LoRACompatibleLinear module.") + + value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()} + lora_layers_list.append((attn_processor, lora)) + + if low_cpu_mem_usage: + device = next(iter(value_dict.values())).device + dtype = next(iter(value_dict.values())).dtype + load_model_dict_into_meta(lora, value_dict, device=device, dtype=dtype) + else: + lora.load_state_dict(value_dict) + + elif is_custom_diffusion: + attn_processors = {} + custom_diffusion_grouped_dict = defaultdict(dict) + for key, value in state_dict.items(): + if len(value) == 0: + custom_diffusion_grouped_dict[key] = {} + else: + if "to_out" in key: + attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:]) + else: + attn_processor_key, sub_key = ".".join(key.split(".")[:-2]), ".".join(key.split(".")[-2:]) + custom_diffusion_grouped_dict[attn_processor_key][sub_key] = value + + for key, value_dict in custom_diffusion_grouped_dict.items(): + if len(value_dict) == 0: + attn_processors[key] = CustomDiffusionAttnProcessor( + train_kv=False, train_q_out=False, hidden_size=None, cross_attention_dim=None + ) + else: + cross_attention_dim = value_dict["to_k_custom_diffusion.weight"].shape[1] + hidden_size = value_dict["to_k_custom_diffusion.weight"].shape[0] + train_q_out = True if "to_q_custom_diffusion.weight" in value_dict else False + attn_processors[key] = CustomDiffusionAttnProcessor( + train_kv=True, + train_q_out=train_q_out, + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + ) + attn_processors[key].load_state_dict(value_dict) + elif USE_PEFT_BACKEND: + # In that case we have nothing to do as loading the adapter weights is already handled above by 
`set_peft_model_state_dict` + # on the Unet + pass + else: + raise ValueError( + f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training." + ) + + # + + def convert_state_dict_legacy_attn_format(self, state_dict, network_alphas): + is_new_lora_format = all( + key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys() + ) + if is_new_lora_format: + # Strip the `"unet"` prefix. + is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys()) + if is_text_encoder_present: + warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)." + logger.warning(warn_message) + unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)] + state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys} + + # change processor format to 'pure' LoRACompatibleLinear format + if any("processor" in k.split(".") for k in state_dict.keys()): + + def format_to_lora_compatible(key): + if "processor" not in key.split("."): + return key + return key.replace(".processor", "").replace("to_out_lora", "to_out.0.lora").replace("_lora", ".lora") + + state_dict = {format_to_lora_compatible(k): v for k, v in state_dict.items()} + + if network_alphas is not None: + network_alphas = {format_to_lora_compatible(k): v for k, v in network_alphas.items()} + return state_dict, network_alphas + + def save_attn_procs( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + weight_name: str = None, + save_function: Callable = None, + safe_serialization: bool = True, + **kwargs, + ): + r""" + Save attention processor layers to a directory so that it can be reloaded with the + [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save an attention processor to (will be created if it doesn't exist). + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or with `pickle`. 
+ + Example: + + ```py + import torch + from diffusers import DiffusionPipeline + + pipeline = DiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + torch_dtype=torch.float16, + ).to("cuda") + pipeline.unet.load_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin") + pipeline.unet.save_attn_procs("path-to-save-model", weight_name="pytorch_custom_diffusion_weights.bin") + ``` + """ + from ..models.attention_processor import ( + CustomDiffusionAttnProcessor, + CustomDiffusionAttnProcessor2_0, + CustomDiffusionXFormersAttnProcessor, + ) + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + if save_function is None: + if safe_serialization: + + def save_function(weights, filename): + return safetensors.torch.save_file(weights, filename, metadata={"format": "pt"}) + + else: + save_function = torch.save + + os.makedirs(save_directory, exist_ok=True) + + is_custom_diffusion = any( + isinstance( + x, + (CustomDiffusionAttnProcessor, CustomDiffusionAttnProcessor2_0, CustomDiffusionXFormersAttnProcessor), + ) + for (_, x) in self.attn_processors.items() + ) + if is_custom_diffusion: + model_to_save = AttnProcsLayers( + { + y: x + for (y, x) in self.attn_processors.items() + if isinstance( + x, + ( + CustomDiffusionAttnProcessor, + CustomDiffusionAttnProcessor2_0, + CustomDiffusionXFormersAttnProcessor, + ), + ) + } + ) + state_dict = model_to_save.state_dict() + for name, attn in self.attn_processors.items(): + if len(attn.state_dict()) == 0: + state_dict[name] = {} + else: + model_to_save = AttnProcsLayers(self.attn_processors) + state_dict = model_to_save.state_dict() + + if weight_name is None: + if safe_serialization: + weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME_SAFE if is_custom_diffusion else LORA_WEIGHT_NAME_SAFE + else: + weight_name = CUSTOM_DIFFUSION_WEIGHT_NAME if is_custom_diffusion else LORA_WEIGHT_NAME + + # Save the model + save_path = Path(save_directory, weight_name).as_posix() + save_function(state_dict, save_path) + logger.info(f"Model weights saved in {save_path}") + + def fuse_lora(self, lora_scale=1.0, safe_fusing=False, adapter_names=None): + self.lora_scale = lora_scale + self._safe_fusing = safe_fusing + self.apply(partial(self._fuse_lora_apply, adapter_names=adapter_names)) + + def _fuse_lora_apply(self, module, adapter_names=None): + if not USE_PEFT_BACKEND: + if hasattr(module, "_fuse_lora"): + module._fuse_lora(self.lora_scale, self._safe_fusing) + + if adapter_names is not None: + raise ValueError( + "The `adapter_names` argument is not supported in your environment. Please switch" + " to PEFT backend to use this argument by installing latest PEFT and transformers." + " `pip install -U peft transformers`" + ) + else: + from peft.tuners.tuners_utils import BaseTunerLayer + + merge_kwargs = {"safe_merge": self._safe_fusing} + + if isinstance(module, BaseTunerLayer): + if self.lora_scale != 1.0: + module.scale_layer(self.lora_scale) + + # For BC with prevous PEFT versions, we need to check the signature + # of the `merge` method to see if it supports the `adapter_names` argument. + supported_merge_kwargs = list(inspect.signature(module.merge).parameters) + if "adapter_names" in supported_merge_kwargs: + merge_kwargs["adapter_names"] = adapter_names + elif "adapter_names" not in supported_merge_kwargs and adapter_names is not None: + raise ValueError( + "The `adapter_names` argument is not supported with your PEFT version. 
Please upgrade" + " to the latest version of PEFT. `pip install -U peft`" + ) + + module.merge(**merge_kwargs) + + def unfuse_lora(self): + self.apply(self._unfuse_lora_apply) + + def _unfuse_lora_apply(self, module): + if not USE_PEFT_BACKEND: + if hasattr(module, "_unfuse_lora"): + module._unfuse_lora() + else: + from peft.tuners.tuners_utils import BaseTunerLayer + + if isinstance(module, BaseTunerLayer): + module.unmerge() + + def set_adapters( + self, + adapter_names: Union[List[str], str], + weights: Optional[Union[List[float], float]] = None, + ): + """ + Set the currently active adapters for use in the UNet. + + Args: + adapter_names (`List[str]` or `str`): + The names of the adapters to use. + adapter_weights (`Union[List[float], float]`, *optional*): + The adapter(s) weights to use with the UNet. If `None`, the weights are set to `1.0` for all the + adapters. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel") + pipeline.set_adapters(["cinematic", "pixel"], adapter_weights=[0.5, 0.5]) + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for `set_adapters()`.") + + adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names + + if weights is None: + weights = [1.0] * len(adapter_names) + elif isinstance(weights, float): + weights = [weights] * len(adapter_names) + + if len(adapter_names) != len(weights): + raise ValueError( + f"Length of adapter names {len(adapter_names)} is not equal to the length of their weights {len(weights)}." + ) + + set_weights_and_activate_adapters(self, adapter_names, weights) + + def disable_lora(self): + """ + Disable the UNet's active LoRA layers. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + pipeline.disable_lora() + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + set_adapter_layers(self, enabled=False) + + def enable_lora(self): + """ + Enable the UNet's active LoRA layers. + + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic" + ) + pipeline.enable_lora() + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + set_adapter_layers(self, enabled=True) + + def delete_adapters(self, adapter_names: Union[List[str], str]): + """ + Delete an adapter's LoRA layers from the UNet. + + Args: + adapter_names (`Union[List[str], str]`): + The names (single string or list of strings) of the adapter to delete. 
+ + Example: + + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipeline = AutoPipelineForText2Image.from_pretrained( + "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ).to("cuda") + pipeline.load_lora_weights( + "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_names="cinematic" + ) + pipeline.delete_adapters("cinematic") + ``` + """ + if not USE_PEFT_BACKEND: + raise ValueError("PEFT backend is required for this method.") + + if isinstance(adapter_names, str): + adapter_names = [adapter_names] + + for adapter_name in adapter_names: + delete_adapter_layers(self, adapter_name) + + # Pop also the corresponding adapter from the config + if hasattr(self, "peft_config"): + self.peft_config.pop(adapter_name, None) + + def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict, low_cpu_mem_usage=False): + if low_cpu_mem_usage: + if is_accelerate_available(): + from accelerate import init_empty_weights + + else: + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." + ) + + updated_state_dict = {} + image_projection = None + init_context = init_empty_weights if low_cpu_mem_usage else nullcontext + + if "proj.weight" in state_dict: + # IP-Adapter + num_image_text_embeds = 4 + clip_embeddings_dim = state_dict["proj.weight"].shape[-1] + cross_attention_dim = state_dict["proj.weight"].shape[0] // 4 + + with init_context(): + image_projection = ImageProjection( + cross_attention_dim=cross_attention_dim, + image_embed_dim=clip_embeddings_dim, + num_image_text_embeds=num_image_text_embeds, + ) + + for key, value in state_dict.items(): + diffusers_name = key.replace("proj", "image_embeds") + updated_state_dict[diffusers_name] = value + + elif "proj.3.weight" in state_dict: + # IP-Adapter Full + clip_embeddings_dim = state_dict["proj.0.weight"].shape[0] + cross_attention_dim = state_dict["proj.3.weight"].shape[0] + + with init_context(): + image_projection = IPAdapterFullImageProjection( + cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim + ) + + for key, value in state_dict.items(): + diffusers_name = key.replace("proj.0", "ff.net.0.proj") + diffusers_name = diffusers_name.replace("proj.2", "ff.net.2") + diffusers_name = diffusers_name.replace("proj.3", "norm") + updated_state_dict[diffusers_name] = value + + else: + # IP-Adapter Plus + num_image_text_embeds = state_dict["latents"].shape[1] + embed_dims = state_dict["proj_in.weight"].shape[1] + output_dims = state_dict["proj_out.weight"].shape[0] + hidden_dims = state_dict["latents"].shape[2] + heads = state_dict["layers.0.0.to_q.weight"].shape[0] // 64 + + with init_context(): + image_projection = IPAdapterPlusImageProjection( + embed_dims=embed_dims, + output_dims=output_dims, + hidden_dims=hidden_dims, + heads=heads, + num_queries=num_image_text_embeds, + ) + + for key, value in state_dict.items(): + diffusers_name = key.replace("0.to", "2.to") + diffusers_name = 
diffusers_name.replace("1.0.weight", "3.0.weight") + diffusers_name = diffusers_name.replace("1.0.bias", "3.0.bias") + diffusers_name = diffusers_name.replace("1.1.weight", "3.1.net.0.proj.weight") + diffusers_name = diffusers_name.replace("1.3.weight", "3.1.net.2.weight") + + if "norm1" in diffusers_name: + updated_state_dict[diffusers_name.replace("0.norm1", "0")] = value + elif "norm2" in diffusers_name: + updated_state_dict[diffusers_name.replace("0.norm2", "1")] = value + elif "to_kv" in diffusers_name: + v_chunk = value.chunk(2, dim=0) + updated_state_dict[diffusers_name.replace("to_kv", "to_k")] = v_chunk[0] + updated_state_dict[diffusers_name.replace("to_kv", "to_v")] = v_chunk[1] + elif "to_out" in diffusers_name: + updated_state_dict[diffusers_name.replace("to_out", "to_out.0")] = value + else: + updated_state_dict[diffusers_name] = value + + if not low_cpu_mem_usage: + image_projection.load_state_dict(updated_state_dict) + else: + load_model_dict_into_meta(image_projection, updated_state_dict, device=self.device, dtype=self.dtype) + + return image_projection + + def _convert_ip_adapter_attn_to_diffusers(self, state_dicts, low_cpu_mem_usage=False): + from ..models.attention_processor import ( + AttnProcessor, + AttnProcessor2_0, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, + ) + + if low_cpu_mem_usage: + if is_accelerate_available(): + from accelerate import init_empty_weights + + else: + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." 
+ ) + + # set ip-adapter cross-attention processors & load state_dict + attn_procs = {} + key_id = 1 + init_context = init_empty_weights if low_cpu_mem_usage else nullcontext + for name in self.attn_processors.keys(): + cross_attention_dim = None if name.endswith("attn1.processor") else self.config.cross_attention_dim + if name.startswith("mid_block"): + hidden_size = self.config.block_out_channels[-1] + elif name.startswith("up_blocks"): + block_id = int(name[len("up_blocks.")]) + hidden_size = list(reversed(self.config.block_out_channels))[block_id] + elif name.startswith("down_blocks"): + block_id = int(name[len("down_blocks.")]) + hidden_size = self.config.block_out_channels[block_id] + + if cross_attention_dim is None or "motion_modules" in name: + attn_processor_class = ( + AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor + ) + attn_procs[name] = attn_processor_class() + else: + attn_processor_class = ( + IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor + ) + num_image_text_embeds = [] + for state_dict in state_dicts: + if "proj.weight" in state_dict["image_proj"]: + # IP-Adapter + num_image_text_embeds += [4] + elif "proj.3.weight" in state_dict["image_proj"]: + # IP-Adapter Full Face + num_image_text_embeds += [257] # 256 CLIP tokens + 1 CLS token + else: + # IP-Adapter Plus + num_image_text_embeds += [state_dict["image_proj"]["latents"].shape[1]] + + with init_context(): + attn_procs[name] = attn_processor_class( + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + scale=1.0, + num_tokens=num_image_text_embeds, + ) + + value_dict = {} + for i, state_dict in enumerate(state_dicts): + value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]}) + value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]}) + + if not low_cpu_mem_usage: + attn_procs[name].load_state_dict(value_dict) + else: + device = next(iter(value_dict.values())).device + dtype = next(iter(value_dict.values())).dtype + load_model_dict_into_meta(attn_procs[name], value_dict, device=device, dtype=dtype) + + key_id += 2 + + return attn_procs + + def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False): + if not isinstance(state_dicts, list): + state_dicts = [state_dicts] + # Set encoder_hid_proj after loading ip_adapter weights, + # because `IPAdapterPlusImageProjection` also has `attn_processors`. + self.encoder_hid_proj = None + + attn_procs = self._convert_ip_adapter_attn_to_diffusers(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage) + self.set_attn_processor(attn_procs) + + # convert IP-Adapter Image Projection layers to diffusers + image_projection_layers = [] + for state_dict in state_dicts: + image_projection_layer = self._convert_ip_adapter_image_proj_to_diffusers( + state_dict["image_proj"], low_cpu_mem_usage=low_cpu_mem_usage + ) + image_projection_layers.append(image_projection_layer) + + self.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers) + self.config.encoder_hid_dim_type = "ip_image_proj" + + self.to(dtype=self.dtype, device=self.device) + + +class FromOriginalUNetMixin: + """ + Load pretrained UNet model weights saved in the `.ckpt` or `.safetensors` format into a [`StableCascadeUNet`]. 
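Stepping back to the IP-Adapter conversion above: the channel width each attention processor receives depends only on its block prefix. A small standalone sketch of that lookup follows; the helper name is illustrative and the example `block_out_channels` values are the usual Stable Diffusion UNet widths, used here only as an assumption:

```py
from typing import Sequence


def resolve_hidden_size(name: str, block_out_channels: Sequence[int] = (320, 640, 1280, 1280)) -> int:
    # Mirrors the lookup in `_convert_ip_adapter_attn_to_diffusers`:
    # mid_block -> last width, up_blocks -> reversed widths, down_blocks -> widths as listed.
    if name.startswith("mid_block"):
        return block_out_channels[-1]
    if name.startswith("up_blocks"):
        block_id = int(name[len("up_blocks.")])
        return list(reversed(block_out_channels))[block_id]
    if name.startswith("down_blocks"):
        block_id = int(name[len("down_blocks.")])
        return block_out_channels[block_id]
    raise ValueError(f"Unexpected attention processor name: {name}")


print(resolve_hidden_size("down_blocks.1.attentions.0.transformer_blocks.0.attn2.processor"))  # 640
print(resolve_hidden_size("up_blocks.3.attentions.0.transformer_blocks.0.attn2.processor"))    # 320
print(resolve_hidden_size("mid_block.attentions.0.transformer_blocks.0.attn2.processor"))      # 1280
```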
+ """ + + @classmethod + @validate_hf_hub_args + def from_single_file(cls, pretrained_model_link_or_path, **kwargs): + r""" + Instantiate a [`StableCascadeUNet`] from pretrained StableCascadeUNet weights saved in the original `.ckpt` or + `.safetensors` format. The pipeline is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + - A link to the `.ckpt` file (for example + `"https://huggingface.co//blob/main/.ckpt"`) on the Hub. + - A path to a *file* containing all pipeline weights. + config: (`dict`, *optional*): + Dictionary containing the configuration of the model: + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to True, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables of the model. 
+ + """ + class_name = cls.__name__ + if class_name != "StableCascadeUNet": + raise ValueError("FromOriginalUNetMixin is currently only compatible with StableCascadeUNet") + + config = kwargs.pop("config", None) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + cache_dir = kwargs.pop("cache_dir", None) + local_files_only = kwargs.pop("local_files_only", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + + checkpoint = load_single_file_model_checkpoint( + pretrained_model_link_or_path, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + revision=revision, + ) + + if config is None: + config = infer_stable_cascade_single_file_config(checkpoint) + model_config = cls.load_config(**config, **kwargs) + else: + model_config = config + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + model = cls.from_config(model_config, **kwargs) + + diffusers_format_checkpoint = convert_stable_cascade_unet_single_file_to_diffusers(checkpoint) + if is_accelerate_available(): + unexpected_keys = load_model_dict_into_meta(model, diffusers_format_checkpoint, dtype=torch_dtype) + if len(unexpected_keys) > 0: + logger.warn( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + + else: + model.load_state_dict(diffusers_format_checkpoint) + + if torch_dtype is not None: + model.to(torch_dtype) + + return model diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/utils.py new file mode 100644 index 000000000..142d72bf6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/utils.py @@ -0,0 +1,59 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict + +import torch + + +class AttnProcsLayers(torch.nn.Module): + def __init__(self, state_dict: Dict[str, torch.Tensor]): + super().__init__() + self.layers = torch.nn.ModuleList(state_dict.values()) + self.mapping = dict(enumerate(state_dict.keys())) + self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())} + + # .processor for unet, .self_attn for text encoder + self.split_keys = [".processor", ".self_attn"] + + # we add a hook to state_dict() and load_state_dict() so that the + # naming fits with `unet.attn_processors` + def map_to(module, state_dict, *args, **kwargs): + new_state_dict = {} + for key, value in state_dict.items(): + num = int(key.split(".")[1]) # 0 is always "layers" + new_key = key.replace(f"layers.{num}", module.mapping[num]) + new_state_dict[new_key] = value + + return new_state_dict + + def remap_key(key, state_dict): + for k in self.split_keys: + if k in key: + return key.split(k)[0] + k + + raise ValueError( + f"There seems to be a problem with the state_dict: {set(state_dict.keys())}. {key} has to have one of {self.split_keys}." + ) + + def map_from(module, state_dict, *args, **kwargs): + all_keys = list(state_dict.keys()) + for key in all_keys: + replace_key = remap_key(key, state_dict) + new_key = key.replace(replace_key, f"layers.{module.rev_mapping[replace_key]}") + state_dict[new_key] = state_dict[key] + del state_dict[key] + + self._register_state_dict_hook(map_to) + self._register_load_state_dict_pre_hook(map_from, with_module=True) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/README.md new file mode 100644 index 000000000..fb91f5941 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/README.md @@ -0,0 +1,3 @@ +# Models + +For more detail on the models, please refer to the [docs](https://huggingface.co/docs/diffusers/api/models/overview). \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/__init__.py new file mode 100644 index 000000000..da77e4450 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/__init__.py @@ -0,0 +1,103 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
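The `AttnProcsLayers` wrapper defined above exists so that attention-processor weights serialize under their original module paths rather than under `layers.{i}`. A quick sanity check, assuming the `diffusers.loaders.utils` module path added by this patch and using plain `nn.Linear` modules as stand-ins for real processors:

```py
import torch
from diffusers.loaders.utils import AttnProcsLayers  # module path added by this patch

attn_processors = {
    "down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor": torch.nn.Linear(4, 4),
    "mid_block.attentions.0.transformer_blocks.0.attn1.processor": torch.nn.Linear(4, 4),
}
layers = AttnProcsLayers(attn_processors)

# The state_dict hook rewrites "layers.{i}.*" back to the original processor names,
# so the saved keys line up with `unet.attn_processors`.
print(sorted(layers.state_dict().keys()))
# ['down_blocks.0.attentions.0.transformer_blocks.0.attn1.processor.bias', ...]
```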
+ +from typing import TYPE_CHECKING + +from ..utils import ( + DIFFUSERS_SLOW_IMPORT, + _LazyModule, + is_flax_available, + is_torch_available, +) + + +_import_structure = {} + +if is_torch_available(): + _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"] + _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"] + _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"] + _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"] + _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"] + _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"] + _import_structure["controlnet"] = ["ControlNetModel"] + _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"] + _import_structure["embeddings"] = ["ImageProjection"] + _import_structure["modeling_utils"] = ["ModelMixin"] + _import_structure["transformers.prior_transformer"] = ["PriorTransformer"] + _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"] + _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"] + _import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"] + _import_structure["unets.unet_1d"] = ["UNet1DModel"] + _import_structure["unets.unet_2d"] = ["UNet2DModel"] + _import_structure["unets.unet_2d_condition"] = ["UNet2DConditionModel"] + _import_structure["unets.unet_3d_condition"] = ["UNet3DConditionModel"] + _import_structure["unets.unet_i2vgen_xl"] = ["I2VGenXLUNet"] + _import_structure["unets.unet_kandinsky3"] = ["Kandinsky3UNet"] + _import_structure["unets.unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"] + _import_structure["unets.unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"] + _import_structure["unets.unet_stable_cascade"] = ["StableCascadeUNet"] + _import_structure["unets.uvit_2d"] = ["UVit2DModel"] + _import_structure["vq_model"] = ["VQModel"] + +if is_flax_available(): + _import_structure["controlnet_flax"] = ["FlaxControlNetModel"] + _import_structure["unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"] + _import_structure["vae_flax"] = ["FlaxAutoencoderKL"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + if is_torch_available(): + from .adapter import MultiAdapter, T2IAdapter + from .autoencoders import ( + AsymmetricAutoencoderKL, + AutoencoderKL, + AutoencoderKLTemporalDecoder, + AutoencoderTiny, + ConsistencyDecoderVAE, + ) + from .controlnet import ControlNetModel + from .embeddings import ImageProjection + from .modeling_utils import ModelMixin + from .transformers import ( + DualTransformer2DModel, + PriorTransformer, + T5FilmDecoder, + Transformer2DModel, + TransformerTemporalModel, + ) + from .unets import ( + I2VGenXLUNet, + Kandinsky3UNet, + MotionAdapter, + StableCascadeUNet, + UNet1DModel, + UNet2DConditionModel, + UNet2DModel, + UNet3DConditionModel, + UNetMotionModel, + UNetSpatioTemporalConditionModel, + UVit2DModel, + ) + from .vq_model import VQModel + + if is_flax_available(): + from .controlnet_flax import FlaxControlNetModel + from .unets import FlaxUNet2DConditionModel + from .vae_flax import FlaxAutoencoderKL + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/activations.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/activations.py new file mode 100644 index 000000000..4b6914307 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/activations.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F +from torch import nn +import ixformer.functions as IXF + +from ..utils import deprecate + + +ACTIVATION_FUNCTIONS = { + "swish": nn.SiLU(), + "silu": nn.SiLU(), + "mish": nn.Mish(), + "gelu": nn.GELU(), + "relu": nn.ReLU(), +} + + +def get_activation(act_fn: str) -> nn.Module: + """Helper function to get activation function from string. + + Args: + act_fn (str): Name of activation function. + + Returns: + nn.Module: Activation function. + """ + + act_fn = act_fn.lower() + if act_fn in ACTIVATION_FUNCTIONS: + return ACTIVATION_FUNCTIONS[act_fn] + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + + +class GELU(nn.Module): + r""" + GELU activation function with tanh approximation support with `approximate="tanh"`. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + self.approximate = approximate + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + if gate.device.type != "mps": + return F.gelu(gate, approximate=self.approximate) + # mps: gelu is not implemented for float16 + return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype) + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states = self.gelu(hidden_states) + return hidden_states + + +class GEGLU(nn.Module): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + if gate.device.type != "mps": + return F.gelu(gate) + # mps: gelu is not implemented for float16 + return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype) + + def forward(self, hidden_states, *args, **kwargs): + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. 
Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + hidden_states, gate = self.proj(hidden_states).chunk(2, dim=-1) + return hidden_states * self.gelu(gate) + + +class IXF_GEGLU(GEGLU): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def forward(self, hidden_states, *args, **kwargs): + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + lout = self.proj(hidden_states) + res = IXF.geglu(lout) + + return res + + +class ApproximateGELU(nn.Module): + r""" + The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this + [paper](https://arxiv.org/abs/1606.08415). + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + return x * torch.sigmoid(1.702 * x) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/adapter.py new file mode 100644 index 000000000..0f4b2ec03 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/adapter.py @@ -0,0 +1,584 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Callable, List, Optional, Union + +import torch +import torch.nn as nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import logging +from .modeling_utils import ModelMixin + + +logger = logging.get_logger(__name__) + + +class MultiAdapter(ModelMixin): + r""" + MultiAdapter is a wrapper model that contains multiple adapter models and merges their outputs according to + user-assigned weighting. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) 
+ + Parameters: + adapters (`List[T2IAdapter]`, *optional*, defaults to None): + A list of `T2IAdapter` model instances. + """ + + def __init__(self, adapters: List["T2IAdapter"]): + super(MultiAdapter, self).__init__() + + self.num_adapter = len(adapters) + self.adapters = nn.ModuleList(adapters) + + if len(adapters) == 0: + raise ValueError("Expecting at least one adapter") + + if len(adapters) == 1: + raise ValueError("For a single adapter, please use the `T2IAdapter` class instead of `MultiAdapter`") + + # The outputs from each adapter are added together with a weight. + # This means that the change in dimensions from downsampling must + # be the same for all adapters. Inductively, it also means the + # downscale_factor and total_downscale_factor must be the same for all + # adapters. + first_adapter_total_downscale_factor = adapters[0].total_downscale_factor + first_adapter_downscale_factor = adapters[0].downscale_factor + for idx in range(1, len(adapters)): + if ( + adapters[idx].total_downscale_factor != first_adapter_total_downscale_factor + or adapters[idx].downscale_factor != first_adapter_downscale_factor + ): + raise ValueError( + f"Expecting all adapters to have the same downscaling behavior, but got:\n" + f"adapters[0].total_downscale_factor={first_adapter_total_downscale_factor}\n" + f"adapters[0].downscale_factor={first_adapter_downscale_factor}\n" + f"adapter[`{idx}`].total_downscale_factor={adapters[idx].total_downscale_factor}\n" + f"adapter[`{idx}`].downscale_factor={adapters[idx].downscale_factor}" + ) + + self.total_downscale_factor = first_adapter_total_downscale_factor + self.downscale_factor = first_adapter_downscale_factor + + def forward(self, xs: torch.Tensor, adapter_weights: Optional[List[float]] = None) -> List[torch.Tensor]: + r""" + Args: + xs (`torch.Tensor`): + (batch, channel, height, width) input images for multiple adapter models concated along dimension 1, + `channel` should equal to `num_adapter` * "number of channel of image". + adapter_weights (`List[float]`, *optional*, defaults to None): + List of floats representing the weight which will be multiply to each adapter's output before adding + them together. + """ + if adapter_weights is None: + adapter_weights = torch.tensor([1 / self.num_adapter] * self.num_adapter) + else: + adapter_weights = torch.tensor(adapter_weights) + + accume_state = None + for x, w, adapter in zip(xs, adapter_weights, self.adapters): + features = adapter(x) + if accume_state is None: + accume_state = features + for i in range(len(accume_state)): + accume_state[i] = w * accume_state[i] + else: + for i in range(len(features)): + accume_state[i] += w * features[i] + return accume_state + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = True, + variant: Optional[str] = None, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + `[`~models.adapter.MultiAdapter.from_pretrained`]` class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful when in distributed training like + TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on + the main process to avoid race conditions. 
+ save_function (`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace `torch.save` by another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + variant (`str`, *optional*): + If specified, weights are saved in the format pytorch_model..bin. + """ + idx = 0 + model_path_to_save = save_directory + for adapter in self.adapters: + adapter.save_pretrained( + model_path_to_save, + is_main_process=is_main_process, + save_function=save_function, + safe_serialization=safe_serialization, + variant=variant, + ) + + idx += 1 + model_path_to_save = model_path_to_save + f"_{idx}" + + @classmethod + def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a pretrained MultiAdapter model from multiple pre-trained adapter models. + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train + the model, you should first set it back in training mode with `model.train()`. + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_path (`os.PathLike`): + A path to a *directory* containing model weights saved using + [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype + will be automatically derived from the model's weights. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be refined to each + parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the + same device. + + To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier to maximum memory. Will default to the maximum memory available for each + GPU and the available CPU RAM if unset. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading by not initializing the weights and only loading the pre-trained weights. This + also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the + model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch, + setting this argument to `True` will raise an error. + variant (`str`, *optional*): + If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. 
`variant` is + ignored when using `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from + `safetensors` weights. If set to `False`, loading will *not* use `safetensors`. + """ + idx = 0 + adapters = [] + + # load adapter and append to list until no adapter directory exists anymore + # first adapter has to be saved under `./mydirectory/adapter` to be compliant with `DiffusionPipeline.from_pretrained` + # second, third, ... adapters have to be saved under `./mydirectory/adapter_1`, `./mydirectory/adapter_2`, ... + model_path_to_load = pretrained_model_path + while os.path.isdir(model_path_to_load): + adapter = T2IAdapter.from_pretrained(model_path_to_load, **kwargs) + adapters.append(adapter) + + idx += 1 + model_path_to_load = pretrained_model_path + f"_{idx}" + + logger.info(f"{len(adapters)} adapters loaded from {pretrained_model_path}.") + + if len(adapters) == 0: + raise ValueError( + f"No T2IAdapters found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}." + ) + + return cls(adapters) + + +class T2IAdapter(ModelMixin, ConfigMixin): + r""" + A simple ResNet-like model that accepts images containing control signals such as keyposes and depth. The model + generates multiple feature maps that are used as additional conditioning in [`UNet2DConditionModel`]. The model's + architecture follows the original implementation of + [Adapter](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L97) + and + [AdapterLight](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L235). + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) + + Parameters: + in_channels (`int`, *optional*, defaults to 3): + Number of channels of Aapter's input(*control image*). Set this parameter to 1 if you're using gray scale + image as *control image*. + channels (`List[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The number of channel of each downsample block's output hidden state. The `len(block_out_channels)` will + also determine the number of downsample blocks in the Adapter. + num_res_blocks (`int`, *optional*, defaults to 2): + Number of ResNet blocks in each downsample block. + downscale_factor (`int`, *optional*, defaults to 8): + A factor that determines the total downscale factor of the Adapter. + adapter_type (`str`, *optional*, defaults to `full_adapter`): + The type of Adapter to use. Choose either `full_adapter` or `full_adapter_xl` or `light_adapter`. 
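As a concrete illustration of the documented defaults, the sketch below builds a standalone `T2IAdapter` and inspects the feature pyramid it produces; the 512x512 input is an arbitrary dummy control signal, and the printed shapes follow from the implementation that comes next.

```py
import torch
from diffusers import T2IAdapter

adapter = T2IAdapter(
    in_channels=3,
    channels=[320, 640, 1280, 1280],
    num_res_blocks=2,
    downscale_factor=8,
    adapter_type="full_adapter",
)

control_image = torch.randn(1, 3, 512, 512)  # dummy control input (e.g. an edge or depth map)
features = adapter(control_image)

print(adapter.total_downscale_factor)        # 8 * 2**3 = 64
print([tuple(f.shape) for f in features])
# [(1, 320, 64, 64), (1, 640, 32, 32), (1, 1280, 16, 16), (1, 1280, 8, 8)]
```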
+ """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + channels: List[int] = [320, 640, 1280, 1280], + num_res_blocks: int = 2, + downscale_factor: int = 8, + adapter_type: str = "full_adapter", + ): + super().__init__() + + if adapter_type == "full_adapter": + self.adapter = FullAdapter(in_channels, channels, num_res_blocks, downscale_factor) + elif adapter_type == "full_adapter_xl": + self.adapter = FullAdapterXL(in_channels, channels, num_res_blocks, downscale_factor) + elif adapter_type == "light_adapter": + self.adapter = LightAdapter(in_channels, channels, num_res_blocks, downscale_factor) + else: + raise ValueError( + f"Unsupported adapter_type: '{adapter_type}'. Choose either 'full_adapter' or " + "'full_adapter_xl' or 'light_adapter'." + ) + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + r""" + This function processes the input tensor `x` through the adapter model and returns a list of feature tensors, + each representing information extracted at a different scale from the input. The length of the list is + determined by the number of downsample blocks in the Adapter, as specified by the `channels` and + `num_res_blocks` parameters during initialization. + """ + return self.adapter(x) + + @property + def total_downscale_factor(self): + return self.adapter.total_downscale_factor + + @property + def downscale_factor(self): + """The downscale factor applied in the T2I-Adapter's initial pixel unshuffle operation. If an input image's dimensions are + not evenly divisible by the downscale_factor then an exception will be raised. + """ + return self.adapter.unshuffle.downscale_factor + + +# full adapter + + +class FullAdapter(nn.Module): + r""" + See [`T2IAdapter`] for more information. + """ + + def __init__( + self, + in_channels: int = 3, + channels: List[int] = [320, 640, 1280, 1280], + num_res_blocks: int = 2, + downscale_factor: int = 8, + ): + super().__init__() + + in_channels = in_channels * downscale_factor**2 + + self.unshuffle = nn.PixelUnshuffle(downscale_factor) + self.conv_in = nn.Conv2d(in_channels, channels[0], kernel_size=3, padding=1) + + self.body = nn.ModuleList( + [ + AdapterBlock(channels[0], channels[0], num_res_blocks), + *[ + AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True) + for i in range(1, len(channels)) + ], + ] + ) + + self.total_downscale_factor = downscale_factor * 2 ** (len(channels) - 1) + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + r""" + This method processes the input tensor `x` through the FullAdapter model and performs operations including + pixel unshuffling, convolution, and a stack of AdapterBlocks. It returns a list of feature tensors, each + capturing information at a different stage of processing within the FullAdapter model. The number of feature + tensors in the list is determined by the number of downsample blocks specified during initialization. + """ + x = self.unshuffle(x) + x = self.conv_in(x) + + features = [] + + for block in self.body: + x = block(x) + features.append(x) + + return features + + +class FullAdapterXL(nn.Module): + r""" + See [`T2IAdapter`] for more information. 
+ """ + + def __init__( + self, + in_channels: int = 3, + channels: List[int] = [320, 640, 1280, 1280], + num_res_blocks: int = 2, + downscale_factor: int = 16, + ): + super().__init__() + + in_channels = in_channels * downscale_factor**2 + + self.unshuffle = nn.PixelUnshuffle(downscale_factor) + self.conv_in = nn.Conv2d(in_channels, channels[0], kernel_size=3, padding=1) + + self.body = [] + # blocks to extract XL features with dimensions of [320, 64, 64], [640, 64, 64], [1280, 32, 32], [1280, 32, 32] + for i in range(len(channels)): + if i == 1: + self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks)) + elif i == 2: + self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True)) + else: + self.body.append(AdapterBlock(channels[i], channels[i], num_res_blocks)) + + self.body = nn.ModuleList(self.body) + # XL has only one downsampling AdapterBlock. + self.total_downscale_factor = downscale_factor * 2 + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + r""" + This method takes the tensor x as input and processes it through FullAdapterXL model. It consists of operations + including unshuffling pixels, applying convolution layer and appending each block into list of feature tensors. + """ + x = self.unshuffle(x) + x = self.conv_in(x) + + features = [] + + for block in self.body: + x = block(x) + features.append(x) + + return features + + +class AdapterBlock(nn.Module): + r""" + An AdapterBlock is a helper model that contains multiple ResNet-like blocks. It is used in the `FullAdapter` and + `FullAdapterXL` models. + + Parameters: + in_channels (`int`): + Number of channels of AdapterBlock's input. + out_channels (`int`): + Number of channels of AdapterBlock's output. + num_res_blocks (`int`): + Number of ResNet blocks in the AdapterBlock. + down (`bool`, *optional*, defaults to `False`): + Whether to perform downsampling on AdapterBlock's input. + """ + + def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False): + super().__init__() + + self.downsample = None + if down: + self.downsample = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True) + + self.in_conv = None + if in_channels != out_channels: + self.in_conv = nn.Conv2d(in_channels, out_channels, kernel_size=1) + + self.resnets = nn.Sequential( + *[AdapterResnetBlock(out_channels) for _ in range(num_res_blocks)], + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r""" + This method takes tensor x as input and performs operations downsampling and convolutional layers if the + self.downsample and self.in_conv properties of AdapterBlock model are specified. Then it applies a series of + residual blocks to the input tensor. + """ + if self.downsample is not None: + x = self.downsample(x) + + if self.in_conv is not None: + x = self.in_conv(x) + + x = self.resnets(x) + + return x + + +class AdapterResnetBlock(nn.Module): + r""" + An `AdapterResnetBlock` is a helper model that implements a ResNet-like block. + + Parameters: + channels (`int`): + Number of channels of AdapterResnetBlock's input and output. + """ + + def __init__(self, channels: int): + super().__init__() + self.block1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.act = nn.ReLU() + self.block2 = nn.Conv2d(channels, channels, kernel_size=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r""" + This method takes input tensor x and applies a convolutional layer, ReLU activation, and another convolutional + layer on the input tensor. 
It returns addition with the input tensor. + """ + + h = self.act(self.block1(x)) + h = self.block2(h) + + return h + x + + +# light adapter + + +class LightAdapter(nn.Module): + r""" + See [`T2IAdapter`] for more information. + """ + + def __init__( + self, + in_channels: int = 3, + channels: List[int] = [320, 640, 1280], + num_res_blocks: int = 4, + downscale_factor: int = 8, + ): + super().__init__() + + in_channels = in_channels * downscale_factor**2 + + self.unshuffle = nn.PixelUnshuffle(downscale_factor) + + self.body = nn.ModuleList( + [ + LightAdapterBlock(in_channels, channels[0], num_res_blocks), + *[ + LightAdapterBlock(channels[i], channels[i + 1], num_res_blocks, down=True) + for i in range(len(channels) - 1) + ], + LightAdapterBlock(channels[-1], channels[-1], num_res_blocks, down=True), + ] + ) + + self.total_downscale_factor = downscale_factor * (2 ** len(channels)) + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + r""" + This method takes the input tensor x and performs downscaling and appends it in list of feature tensors. Each + feature tensor corresponds to a different level of processing within the LightAdapter. + """ + x = self.unshuffle(x) + + features = [] + + for block in self.body: + x = block(x) + features.append(x) + + return features + + +class LightAdapterBlock(nn.Module): + r""" + A `LightAdapterBlock` is a helper model that contains multiple `LightAdapterResnetBlocks`. It is used in the + `LightAdapter` model. + + Parameters: + in_channels (`int`): + Number of channels of LightAdapterBlock's input. + out_channels (`int`): + Number of channels of LightAdapterBlock's output. + num_res_blocks (`int`): + Number of LightAdapterResnetBlocks in the LightAdapterBlock. + down (`bool`, *optional*, defaults to `False`): + Whether to perform downsampling on LightAdapterBlock's input. + """ + + def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False): + super().__init__() + mid_channels = out_channels // 4 + + self.downsample = None + if down: + self.downsample = nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True) + + self.in_conv = nn.Conv2d(in_channels, mid_channels, kernel_size=1) + self.resnets = nn.Sequential(*[LightAdapterResnetBlock(mid_channels) for _ in range(num_res_blocks)]) + self.out_conv = nn.Conv2d(mid_channels, out_channels, kernel_size=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r""" + This method takes tensor x as input and performs downsampling if required. Then it applies in convolution + layer, a sequence of residual blocks, and out convolutional layer. + """ + if self.downsample is not None: + x = self.downsample(x) + + x = self.in_conv(x) + x = self.resnets(x) + x = self.out_conv(x) + + return x + + +class LightAdapterResnetBlock(nn.Module): + """ + A `LightAdapterResnetBlock` is a helper model that implements a ResNet-like block with a slightly different + architecture than `AdapterResnetBlock`. + + Parameters: + channels (`int`): + Number of channels of LightAdapterResnetBlock's input and output. + """ + + def __init__(self, channels: int): + super().__init__() + self.block1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + self.act = nn.ReLU() + self.block2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r""" + This function takes input tensor x and processes it through one convolutional layer, ReLU activation, and + another convolutional layer and adds it to input tensor. 
+ """ + + h = self.act(self.block1(x)) + h = self.block2(h) + + return h + x diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention.py new file mode 100644 index 000000000..7cdd4ce6b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention.py @@ -0,0 +1,681 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Optional +import os + +import torch +import torch.nn.functional as F +from torch import nn + +from ..utils import deprecate, logging +from ..utils.torch_utils import maybe_allow_in_graph +from .activations import GEGLU, GELU, ApproximateGELU, IXF_GEGLU +from .attention_processor import Attention +from .embeddings import SinusoidalPositionalEmbedding +from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm + + +logger = logging.get_logger(__name__) + + +def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int): + # "feed_forward_chunk_size" can be used to save memory + if hidden_states.shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = hidden_states.shape[chunk_dim] // chunk_size + ff_output = torch.cat( + [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)], + dim=chunk_dim, + ) + return ff_output + + +@maybe_allow_in_graph +class GatedSelfAttentionDense(nn.Module): + r""" + A gated self-attention dense layer that combines visual features and object features. + + Parameters: + query_dim (`int`): The number of channels in the query. + context_dim (`int`): The number of channels in the context. + n_heads (`int`): The number of heads to use for attention. + d_head (`int`): The number of channels in each head. 
+ """ + + def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int): + super().__init__() + + # we need a linear projection since we need cat visual feature and obj feature + self.linear = nn.Linear(context_dim, query_dim) + + self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head) + self.ff = FeedForward(query_dim, activation_fn="geglu") + + self.norm1 = nn.LayerNorm(query_dim) + self.norm2 = nn.LayerNorm(query_dim) + + self.register_parameter("alpha_attn", nn.Parameter(torch.tensor(0.0))) + self.register_parameter("alpha_dense", nn.Parameter(torch.tensor(0.0))) + + self.enabled = True + + def forward(self, x: torch.Tensor, objs: torch.Tensor) -> torch.Tensor: + if not self.enabled: + return x + + n_visual = x.shape[1] + objs = self.linear(objs) + + x = x + self.alpha_attn.tanh() * self.attn(self.norm1(torch.cat([x, objs], dim=1)))[:, :n_visual, :] + x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x)) + + return x + + +@maybe_allow_in_graph +class BasicTransformerBlock(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm (: + obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (: + obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the attention computation to float32. This is useful for mixed precision training. + norm_elementwise_affine (`bool`, *optional*, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`. + final_dropout (`bool` *optional*, defaults to False): + Whether to apply a final dropout after the last feed-forward layer. + attention_type (`str`, *optional*, defaults to `"default"`): + The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`. + positional_embeddings (`str`, *optional*, defaults to `None`): + The type of positional embeddings to apply to. + num_positional_embeddings (`int`, *optional*, defaults to `None`): + The maximum number of positional embeddings to apply. 
+ """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen' + norm_eps: float = 1e-5, + final_dropout: bool = False, + attention_type: str = "default", + positional_embeddings: Optional[str] = None, + num_positional_embeddings: Optional[int] = None, + ada_norm_continous_conditioning_embedding_dim: Optional[int] = None, + ada_norm_bias: Optional[int] = None, + ff_inner_dim: Optional[int] = None, + ff_bias: bool = True, + attention_out_bias: bool = True, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + # We keep these boolean flags for backward-compatibility. + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_single = norm_type == "ada_norm_single" + self.use_layer_norm = norm_type == "layer_norm" + self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous" + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + self.norm_type = norm_type + self.num_embeds_ada_norm = num_embeds_ada_norm + + if positional_embeddings and (num_positional_embeddings is None): + raise ValueError( + "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined." + ) + + if positional_embeddings == "sinusoidal": + self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings) + else: + self.pos_embed = None + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if norm_type == "ada_norm": + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif norm_type == "ada_norm_zero": + self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) + elif norm_type == "ada_norm_continuous": + self.norm1 = AdaLayerNormContinuous( + dim, + ada_norm_continous_conditioning_embedding_dim, + norm_elementwise_affine, + norm_eps, + ada_norm_bias, + "rms_norm", + ) + else: + if int(os.environ.get("USE_APEX_LN", 0)): + from apex.normalization import FusedLayerNorm + self.norm1 = FusedLayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + out_bias=attention_out_bias, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. 
the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + if norm_type == "ada_norm": + self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif norm_type == "ada_norm_continuous": + self.norm2 = AdaLayerNormContinuous( + dim, + ada_norm_continous_conditioning_embedding_dim, + norm_elementwise_affine, + norm_eps, + ada_norm_bias, + "rms_norm", + ) + else: + if int(os.environ.get("USE_APEX_LN", 0)): + from apex.normalization import FusedLayerNorm + self.norm2 = FusedLayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + else: + self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine) + + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + out_bias=attention_out_bias, + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + if norm_type == "ada_norm_continuous": + self.norm3 = AdaLayerNormContinuous( + dim, + ada_norm_continous_conditioning_embedding_dim, + norm_elementwise_affine, + norm_eps, + ada_norm_bias, + "layer_norm", + ) + + elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]: + if int(os.environ.get("USE_APEX_LN", 0)): + from apex.normalization import FusedLayerNorm + self.norm3 = FusedLayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + else: + self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine) + elif norm_type == "layer_norm_i2vgen": + self.norm3 = None + + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=final_dropout, + inner_dim=ff_inner_dim, + bias=ff_bias, + ) + + # 4. Fuser + if attention_type == "gated" or attention_type == "gated-text-image": + self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim) + + # 5. Scale-shift for PixArt-Alpha. + if norm_type == "ada_norm_single": + self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. 
Self-Attention + batch_size = hidden_states.shape[0] + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.norm_type == "ada_norm_zero": + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm1(hidden_states) + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif self.norm_type == "ada_norm_single": + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + norm_hidden_states = norm_hidden_states.squeeze(1) + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.norm_type == "ada_norm_zero": + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.norm_type == "ada_norm_single": + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 1.2 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. Cross-Attention + if self.attn2 is not None: + if self.norm_type == "ada_norm": + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]: + norm_hidden_states = self.norm2(hidden_states) + elif self.norm_type == "ada_norm_single": + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + elif self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"]) + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.norm_type != "ada_norm_single": + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 4. 
Feed-forward + # i2vgen doesn't have this norm 🤷‍♂️ + if self.norm_type == "ada_norm_continuous": + norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"]) + elif not self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm3(hidden_states) + + if self.norm_type == "ada_norm_zero": + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.norm_type == "ada_norm_single": + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + + if self.norm_type == "ada_norm_zero": + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.norm_type == "ada_norm_single": + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + +@maybe_allow_in_graph +class TemporalBasicTransformerBlock(nn.Module): + r""" + A basic Transformer block for video like data. + + Parameters: + dim (`int`): The number of channels in the input and output. + time_mix_inner_dim (`int`): The number of channels for temporal attention. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + """ + + def __init__( + self, + dim: int, + time_mix_inner_dim: int, + num_attention_heads: int, + attention_head_dim: int, + cross_attention_dim: Optional[int] = None, + ): + super().__init__() + self.is_res = dim == time_mix_inner_dim + + self.norm_in = nn.LayerNorm(dim) + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + self.ff_in = FeedForward( + dim, + dim_out=time_mix_inner_dim, + activation_fn="geglu", + ) + + self.norm1 = nn.LayerNorm(time_mix_inner_dim) + self.attn1 = Attention( + query_dim=time_mix_inner_dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + cross_attention_dim=None, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = nn.LayerNorm(time_mix_inner_dim) + self.attn2 = Attention( + query_dim=time_mix_inner_dim, + cross_attention_dim=cross_attention_dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + self.norm3 = nn.LayerNorm(time_mix_inner_dim) + self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu") + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = None + + def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs): + # Sets chunk feed-forward + self._chunk_size = chunk_size + # chunk dim should be hardcoded to 1 to have better speed vs. 
memory trade-off + self._chunk_dim = 1 + + def forward( + self, + hidden_states: torch.FloatTensor, + num_frames: int, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + batch_frames, seq_length, channels = hidden_states.shape + batch_size = batch_frames // num_frames + + hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, seq_length, channels) + hidden_states = hidden_states.permute(0, 2, 1, 3) + hidden_states = hidden_states.reshape(batch_size * seq_length, num_frames, channels) + + residual = hidden_states + hidden_states = self.norm_in(hidden_states) + + if self._chunk_size is not None: + hidden_states = _chunked_feed_forward(self.ff_in, hidden_states, self._chunk_dim, self._chunk_size) + else: + hidden_states = self.ff_in(hidden_states) + + if self.is_res: + hidden_states = hidden_states + residual + + norm_hidden_states = self.norm1(hidden_states) + attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None) + hidden_states = attn_output + hidden_states + + # 3. Cross-Attention + if self.attn2 is not None: + norm_hidden_states = self.norm2(hidden_states) + attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states) + hidden_states = attn_output + hidden_states + + # 4. Feed-forward + norm_hidden_states = self.norm3(hidden_states) + + if self._chunk_size is not None: + ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size) + else: + ff_output = self.ff(norm_hidden_states) + + if self.is_res: + hidden_states = ff_output + hidden_states + else: + hidden_states = ff_output + + hidden_states = hidden_states[None, :].reshape(batch_size, seq_length, num_frames, channels) + hidden_states = hidden_states.permute(0, 2, 1, 3) + hidden_states = hidden_states.reshape(batch_size * num_frames, seq_length, channels) + + return hidden_states + + +class SkipFFTransformerBlock(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + kv_input_dim: int, + kv_input_dim_proj_use_bias: bool, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + attention_out_bias: bool = True, + ): + super().__init__() + if kv_input_dim != dim: + self.kv_mapper = nn.Linear(kv_input_dim, dim, kv_input_dim_proj_use_bias) + else: + self.kv_mapper = None + + self.norm1 = RMSNorm(dim, 1e-06) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim, + out_bias=attention_out_bias, + ) + + self.norm2 = RMSNorm(dim, 1e-06) + + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + out_bias=attention_out_bias, + ) + + def forward(self, hidden_states, encoder_hidden_states, cross_attention_kwargs): + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + + if self.kv_mapper is not None: + encoder_hidden_states = self.kv_mapper(F.silu(encoder_hidden_states)) + + norm_hidden_states = self.norm1(hidden_states) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + **cross_attention_kwargs, + ) + + 
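+        # Note: this block has no feed-forward sub-layer (hence "SkipFF"); the two
+        # attention outputs are simply added back onto `hidden_states` residually.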
hidden_states = attn_output + hidden_states + + norm_hidden_states = self.norm2(hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + return hidden_states + + +class FeedForward(nn.Module): + r""" + A feed-forward layer. + + Parameters: + dim (`int`): The number of channels in the input. + dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. + mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + inner_dim=None, + bias: bool = True, + ): + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + linear_cls = nn.Linear + + if activation_fn == "gelu": + act_fn = GELU(dim, inner_dim, bias=bias) + if activation_fn == "gelu-approximate": + act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias) + elif activation_fn == "geglu": + import os + if int(os.environ.get("USE_IXFORMER_GEGLU", 0)): + print("==> use IXF_GEGLU") + act_fn = IXF_GEGLU(dim, inner_dim, bias=bias) + else: + act_fn = GEGLU(dim, inner_dim, bias=bias) + elif activation_fn == "geglu-approximate": + act_fn = ApproximateGELU(dim, inner_dim, bias=bias) + + self.net = nn.ModuleList([]) + # project in + self.net.append(act_fn) + # project dropout + self.net.append(nn.Dropout(dropout)) + # project out + self.net.append(linear_cls(inner_dim, dim_out, bias=bias)) + # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout + if final_dropout: + self.net.append(nn.Dropout(dropout)) + + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + for module in self.net: + hidden_states = module(hidden_states) + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_flax.py new file mode 100644 index 000000000..25ae5d0a5 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_flax.py @@ -0,0 +1,494 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import math + +import flax.linen as nn +import jax +import jax.numpy as jnp + + +def _query_chunk_attention(query, key, value, precision, key_chunk_size: int = 4096): + """Multi-head dot product attention with a limited number of queries.""" + num_kv, num_heads, k_features = key.shape[-3:] + v_features = value.shape[-1] + key_chunk_size = min(key_chunk_size, num_kv) + query = query / jnp.sqrt(k_features) + + @functools.partial(jax.checkpoint, prevent_cse=False) + def summarize_chunk(query, key, value): + attn_weights = jnp.einsum("...qhd,...khd->...qhk", query, key, precision=precision) + + max_score = jnp.max(attn_weights, axis=-1, keepdims=True) + max_score = jax.lax.stop_gradient(max_score) + exp_weights = jnp.exp(attn_weights - max_score) + + exp_values = jnp.einsum("...vhf,...qhv->...qhf", value, exp_weights, precision=precision) + max_score = jnp.einsum("...qhk->...qh", max_score) + + return (exp_values, exp_weights.sum(axis=-1), max_score) + + def chunk_scanner(chunk_idx): + # julienne key array + key_chunk = jax.lax.dynamic_slice( + operand=key, + start_indices=[0] * (key.ndim - 3) + [chunk_idx, 0, 0], # [...,k,h,d] + slice_sizes=list(key.shape[:-3]) + [key_chunk_size, num_heads, k_features], # [...,k,h,d] + ) + + # julienne value array + value_chunk = jax.lax.dynamic_slice( + operand=value, + start_indices=[0] * (value.ndim - 3) + [chunk_idx, 0, 0], # [...,v,h,d] + slice_sizes=list(value.shape[:-3]) + [key_chunk_size, num_heads, v_features], # [...,v,h,d] + ) + + return summarize_chunk(query, key_chunk, value_chunk) + + chunk_values, chunk_weights, chunk_max = jax.lax.map(f=chunk_scanner, xs=jnp.arange(0, num_kv, key_chunk_size)) + + global_max = jnp.max(chunk_max, axis=0, keepdims=True) + max_diffs = jnp.exp(chunk_max - global_max) + + chunk_values *= jnp.expand_dims(max_diffs, axis=-1) + chunk_weights *= max_diffs + + all_values = chunk_values.sum(axis=0) + all_weights = jnp.expand_dims(chunk_weights, -1).sum(axis=0) + + return all_values / all_weights + + +def jax_memory_efficient_attention( + query, key, value, precision=jax.lax.Precision.HIGHEST, query_chunk_size: int = 1024, key_chunk_size: int = 4096 +): + r""" + Flax Memory-efficient multi-head dot product attention. 
https://arxiv.org/abs/2112.05682v2 + https://github.com/AminRezaei0x443/memory-efficient-attention + + Args: + query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head) + key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head) + value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head) + precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`): + numerical precision for computation + query_chunk_size (`int`, *optional*, defaults to 1024): + chunk size to divide query array value must divide query_length equally without remainder + key_chunk_size (`int`, *optional*, defaults to 4096): + chunk size to divide key and value array value must divide key_value_length equally without remainder + + Returns: + (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head) + """ + num_q, num_heads, q_features = query.shape[-3:] + + def chunk_scanner(chunk_idx, _): + # julienne query array + query_chunk = jax.lax.dynamic_slice( + operand=query, + start_indices=([0] * (query.ndim - 3)) + [chunk_idx, 0, 0], # [...,q,h,d] + slice_sizes=list(query.shape[:-3]) + [min(query_chunk_size, num_q), num_heads, q_features], # [...,q,h,d] + ) + + return ( + chunk_idx + query_chunk_size, # unused ignore it + _query_chunk_attention( + query=query_chunk, key=key, value=value, precision=precision, key_chunk_size=key_chunk_size + ), + ) + + _, res = jax.lax.scan( + f=chunk_scanner, + init=0, + xs=None, + length=math.ceil(num_q / query_chunk_size), # start counter # stop counter + ) + + return jnp.concatenate(res, axis=-3) # fuse the chunked result back + + +class FlaxAttention(nn.Module): + r""" + A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762 + + Parameters: + query_dim (:obj:`int`): + Input hidden states dimension + heads (:obj:`int`, *optional*, defaults to 8): + Number of heads + dim_head (:obj:`int`, *optional*, defaults to 64): + Hidden states dimension inside each head + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + enable memory efficient attention https://arxiv.org/abs/2112.05682 + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. 
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + + """ + + query_dim: int + heads: int = 8 + dim_head: int = 64 + dropout: float = 0.0 + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + inner_dim = self.dim_head * self.heads + self.scale = self.dim_head**-0.5 + + # Weights were exported with old names {to_q, to_k, to_v, to_out} + self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q") + self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k") + self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v") + + self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0") + self.dropout_layer = nn.Dropout(rate=self.dropout) + + def reshape_heads_to_batch_dim(self, tensor): + batch_size, seq_len, dim = tensor.shape + head_size = self.heads + tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size) + tensor = jnp.transpose(tensor, (0, 2, 1, 3)) + tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size) + return tensor + + def reshape_batch_dim_to_heads(self, tensor): + batch_size, seq_len, dim = tensor.shape + head_size = self.heads + tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim) + tensor = jnp.transpose(tensor, (0, 2, 1, 3)) + tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size) + return tensor + + def __call__(self, hidden_states, context=None, deterministic=True): + context = hidden_states if context is None else context + + query_proj = self.query(hidden_states) + key_proj = self.key(context) + value_proj = self.value(context) + + if self.split_head_dim: + b = hidden_states.shape[0] + query_states = jnp.reshape(query_proj, (b, -1, self.heads, self.dim_head)) + key_states = jnp.reshape(key_proj, (b, -1, self.heads, self.dim_head)) + value_states = jnp.reshape(value_proj, (b, -1, self.heads, self.dim_head)) + else: + query_states = self.reshape_heads_to_batch_dim(query_proj) + key_states = self.reshape_heads_to_batch_dim(key_proj) + value_states = self.reshape_heads_to_batch_dim(value_proj) + + if self.use_memory_efficient_attention: + query_states = query_states.transpose(1, 0, 2) + key_states = key_states.transpose(1, 0, 2) + value_states = value_states.transpose(1, 0, 2) + + # this if statement create a chunk size for each layer of the unet + # the chunk size is equal to the query_length dimension of the deepest layer of the unet + + flatten_latent_dim = query_states.shape[-3] + if flatten_latent_dim % 64 == 0: + query_chunk_size = int(flatten_latent_dim / 64) + elif flatten_latent_dim % 16 == 0: + query_chunk_size = int(flatten_latent_dim / 16) + elif flatten_latent_dim % 4 == 0: + query_chunk_size = int(flatten_latent_dim / 4) + else: + query_chunk_size = int(flatten_latent_dim) + + hidden_states = jax_memory_efficient_attention( + query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4 + ) + + hidden_states = hidden_states.transpose(1, 0, 2) + else: + # compute attentions + if self.split_head_dim: + attention_scores = jnp.einsum("b t n h, b f n h -> b n f t", key_states, query_states) + else: + attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states) + + attention_scores = attention_scores * self.scale + attention_probs = nn.softmax(attention_scores, axis=-1 if self.split_head_dim else 2) + + # attend to values + if self.split_head_dim: + 
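+                # Illustrative shape note: `attention_probs` is (batch, heads, q_len, kv_len)
+                # and `value_states` is (batch, kv_len, heads, head_dim), so the einsum below
+                # yields (batch, q_len, heads, head_dim) before the heads are merged again.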
hidden_states = jnp.einsum("b n f t, b t n h -> b f n h", attention_probs, value_states) + b = hidden_states.shape[0] + hidden_states = jnp.reshape(hidden_states, (b, -1, self.heads * self.dim_head)) + else: + hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states) + hidden_states = self.reshape_batch_dim_to_heads(hidden_states) + + hidden_states = self.proj_attn(hidden_states) + return self.dropout_layer(hidden_states, deterministic=deterministic) + + +class FlaxBasicTransformerBlock(nn.Module): + r""" + A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in: + https://arxiv.org/abs/1706.03762 + + + Parameters: + dim (:obj:`int`): + Inner hidden states dimension + n_heads (:obj:`int`): + Number of heads + d_head (:obj:`int`): + Hidden states dimension inside each head + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + only_cross_attention (`bool`, defaults to `False`): + Whether to only apply cross attention. + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + enable memory efficient attention https://arxiv.org/abs/2112.05682 + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. + """ + + dim: int + n_heads: int + d_head: int + dropout: float = 0.0 + only_cross_attention: bool = False + dtype: jnp.dtype = jnp.float32 + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + + def setup(self): + # self attention (or cross_attention if only_cross_attention is True) + self.attn1 = FlaxAttention( + self.dim, + self.n_heads, + self.d_head, + self.dropout, + self.use_memory_efficient_attention, + self.split_head_dim, + dtype=self.dtype, + ) + # cross attention + self.attn2 = FlaxAttention( + self.dim, + self.n_heads, + self.d_head, + self.dropout, + self.use_memory_efficient_attention, + self.split_head_dim, + dtype=self.dtype, + ) + self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype) + self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype) + self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype) + self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype) + self.dropout_layer = nn.Dropout(rate=self.dropout) + + def __call__(self, hidden_states, context, deterministic=True): + # self attention + residual = hidden_states + if self.only_cross_attention: + hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic) + else: + hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic) + hidden_states = hidden_states + residual + + # cross attention + residual = hidden_states + hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic) + hidden_states = hidden_states + residual + + # feed forward + residual = hidden_states + hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic) + hidden_states = hidden_states + residual + + return self.dropout_layer(hidden_states, deterministic=deterministic) + + +class FlaxTransformer2DModel(nn.Module): + r""" + A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in: + https://arxiv.org/pdf/1506.02025.pdf + + + Parameters: + in_channels (:obj:`int`): + 
Input number of channels + n_heads (:obj:`int`): + Number of heads + d_head (:obj:`int`): + Hidden states dimension inside each head + depth (:obj:`int`, *optional*, defaults to 1): + Number of transformers block + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + use_linear_projection (`bool`, defaults to `False`): tbd + only_cross_attention (`bool`, defaults to `False`): tbd + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + enable memory efficient attention https://arxiv.org/abs/2112.05682 + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. + """ + + in_channels: int + n_heads: int + d_head: int + depth: int = 1 + dropout: float = 0.0 + use_linear_projection: bool = False + only_cross_attention: bool = False + dtype: jnp.dtype = jnp.float32 + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + + def setup(self): + self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5) + + inner_dim = self.n_heads * self.d_head + if self.use_linear_projection: + self.proj_in = nn.Dense(inner_dim, dtype=self.dtype) + else: + self.proj_in = nn.Conv( + inner_dim, + kernel_size=(1, 1), + strides=(1, 1), + padding="VALID", + dtype=self.dtype, + ) + + self.transformer_blocks = [ + FlaxBasicTransformerBlock( + inner_dim, + self.n_heads, + self.d_head, + dropout=self.dropout, + only_cross_attention=self.only_cross_attention, + dtype=self.dtype, + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + ) + for _ in range(self.depth) + ] + + if self.use_linear_projection: + self.proj_out = nn.Dense(inner_dim, dtype=self.dtype) + else: + self.proj_out = nn.Conv( + inner_dim, + kernel_size=(1, 1), + strides=(1, 1), + padding="VALID", + dtype=self.dtype, + ) + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + def __call__(self, hidden_states, context, deterministic=True): + batch, height, width, channels = hidden_states.shape + residual = hidden_states + hidden_states = self.norm(hidden_states) + if self.use_linear_projection: + hidden_states = hidden_states.reshape(batch, height * width, channels) + hidden_states = self.proj_in(hidden_states) + else: + hidden_states = self.proj_in(hidden_states) + hidden_states = hidden_states.reshape(batch, height * width, channels) + + for transformer_block in self.transformer_blocks: + hidden_states = transformer_block(hidden_states, context, deterministic=deterministic) + + if self.use_linear_projection: + hidden_states = self.proj_out(hidden_states) + hidden_states = hidden_states.reshape(batch, height, width, channels) + else: + hidden_states = hidden_states.reshape(batch, height, width, channels) + hidden_states = self.proj_out(hidden_states) + + hidden_states = hidden_states + residual + return self.dropout_layer(hidden_states, deterministic=deterministic) + + +class FlaxFeedForward(nn.Module): + r""" + Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's + [`FeedForward`] class, with the following simplifications: + - The activation function is currently hardcoded to a gated linear unit from: + https://arxiv.org/abs/2002.05202 + - `dim_out` is equal to `dim`. 
+ - The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`]. + + Parameters: + dim (:obj:`int`): + Inner hidden states dimension + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + dim: int + dropout: float = 0.0 + dtype: jnp.dtype = jnp.float32 + + def setup(self): + # The second linear layer needs to be called + # net_2 for now to match the index of the Sequential layer + self.net_0 = FlaxGEGLU(self.dim, self.dropout, self.dtype) + self.net_2 = nn.Dense(self.dim, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic=True): + hidden_states = self.net_0(hidden_states, deterministic=deterministic) + hidden_states = self.net_2(hidden_states) + return hidden_states + + +class FlaxGEGLU(nn.Module): + r""" + Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from + https://arxiv.org/abs/2002.05202. + + Parameters: + dim (:obj:`int`): + Input hidden states dimension + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + dim: int + dropout: float = 0.0 + dtype: jnp.dtype = jnp.float32 + + def setup(self): + inner_dim = self.dim * 4 + self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype) + self.dropout_layer = nn.Dropout(rate=self.dropout) + + def __call__(self, hidden_states, deterministic=True): + hidden_states = self.proj(hidden_states) + hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2) + return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_processor.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_processor.py new file mode 100644 index 000000000..30baef416 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_processor.py @@ -0,0 +1,2507 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from importlib import import_module +from typing import Callable, Optional, Union +import os + +import torch +import torch.nn.functional as F +from torch import nn + +from ..utils import USE_PEFT_BACKEND, deprecate, logging +from ..utils.import_utils import is_xformers_available +from ..utils.torch_utils import maybe_allow_in_graph +from .lora import LoRACompatibleLinear, LoRALinearLayer + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +if is_xformers_available(): + import xformers + import xformers.ops +else: + xformers = None + + +@maybe_allow_in_graph +class Attention(nn.Module): + r""" + A cross attention layer. + + Parameters: + query_dim (`int`): + The number of channels in the query. 
+ cross_attention_dim (`int`, *optional*): + The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. + heads (`int`, *optional*, defaults to 8): + The number of heads to use for multi-head attention. + dim_head (`int`, *optional*, defaults to 64): + The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + bias (`bool`, *optional*, defaults to False): + Set to `True` for the query, key, and value linear layers to contain a bias parameter. + upcast_attention (`bool`, *optional*, defaults to False): + Set to `True` to upcast the attention computation to `float32`. + upcast_softmax (`bool`, *optional*, defaults to False): + Set to `True` to upcast the softmax computation to `float32`. + cross_attention_norm (`str`, *optional*, defaults to `None`): + The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`. + cross_attention_norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the group norm in the cross attention. + added_kv_proj_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the added key and value projections. If `None`, no projection is used. + norm_num_groups (`int`, *optional*, defaults to `None`): + The number of groups to use for the group norm in the attention. + spatial_norm_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the spatial normalization. + out_bias (`bool`, *optional*, defaults to `True`): + Set to `True` to use a bias in the output linear layer. + scale_qk (`bool`, *optional*, defaults to `True`): + Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. + only_cross_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if + `added_kv_proj_dim` is not `None`. + eps (`float`, *optional*, defaults to 1e-5): + An additional value added to the denominator in group normalization that is used for numerical stability. + rescale_output_factor (`float`, *optional*, defaults to 1.0): + A factor to rescale the output by dividing it with this value. + residual_connection (`bool`, *optional*, defaults to `False`): + Set to `True` to add the residual connection to the output. + _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): + Set to `True` if the attention block is loaded from a deprecated state dict. + processor (`AttnProcessor`, *optional*, defaults to `None`): + The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and + `AttnProcessor` otherwise. 
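+
+    Example (an illustrative sketch only; the sizes are assumptions, not values taken
+    from this repository):
+
+        attn = Attention(query_dim=320, heads=8, dim_head=40)
+        out = attn(torch.randn(2, 64, 320))  # self-attention, returns (2, 64, 320)
+
+        cross = Attention(query_dim=320, cross_attention_dim=768)
+        out = cross(torch.randn(2, 64, 320), encoder_hidden_states=torch.randn(2, 77, 768))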
+ """ + + def __init__( + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + added_kv_proj_dim: Optional[int] = None, + norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + eps: float = 1e-5, + rescale_output_factor: float = 1.0, + residual_connection: bool = False, + _from_deprecated_attn_block: bool = False, + processor: Optional["AttnProcessor"] = None, + out_dim: int = None, + ): + super().__init__() + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.query_dim = query_dim + self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + self.upcast_attention = upcast_attention + self.upcast_softmax = upcast_softmax + self.rescale_output_factor = rescale_output_factor + self.residual_connection = residual_connection + self.dropout = dropout + self.fused_projections = False + self.out_dim = out_dim if out_dim is not None else query_dim + + # we make use of this private variable to know whether this class is loaded + # with an deprecated state dict so that we can convert it on the fly + self._from_deprecated_attn_block = _from_deprecated_attn_block + + self.scale_qk = scale_qk + self.scale = dim_head**-0.5 if self.scale_qk else 1.0 + + self.heads = out_dim // dim_head if out_dim is not None else heads + # for slice_size > 0 the attention score computation + # is split across the batch axis to save memory + # You can set slice_size with `set_attention_slice` + self.sliceable_head_dim = heads + + self.added_kv_proj_dim = added_kv_proj_dim + self.only_cross_attention = only_cross_attention + + if self.added_kv_proj_dim is None and self.only_cross_attention: + raise ValueError( + "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." + ) + + if norm_num_groups is not None: + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True) + else: + self.group_norm = None + + if spatial_norm_dim is not None: + self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim) + else: + self.spatial_norm = None + + if cross_attention_norm is None: + self.norm_cross = None + elif cross_attention_norm == "layer_norm": + self.norm_cross = nn.LayerNorm(self.cross_attention_dim) + elif cross_attention_norm == "group_norm": + if self.added_kv_proj_dim is not None: + # The given `encoder_hidden_states` are initially of shape + # (batch_size, seq_len, added_kv_proj_dim) before being projected + # to (batch_size, seq_len, cross_attention_dim). The norm is applied + # before the projection, so we need to use `added_kv_proj_dim` as + # the number of channels for the group norm. + norm_cross_num_channels = added_kv_proj_dim + else: + norm_cross_num_channels = self.cross_attention_dim + + self.norm_cross = nn.GroupNorm( + num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True + ) + else: + raise ValueError( + f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" + ) + + if USE_PEFT_BACKEND: + linear_cls = nn.Linear + else: + linear_cls = LoRACompatibleLinear + + self.linear_cls = linear_cls + self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias) + self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias) + else: + self.to_k = None + self.to_v = None + + if self.added_kv_proj_dim is not None: + self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim) + self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim) + + self.to_out = nn.ModuleList([]) + self.to_out.append(linear_cls(self.inner_dim, self.out_dim, bias=out_bias)) + self.to_out.append(nn.Dropout(dropout)) + + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 + if processor is None: + # processor = (AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()) + # processor = (AttnProcessor()) + processor = (AttnProcessor()) if int(os.environ.get("USE_NATIVE_ATTN", 0)) else (AttnProcessor3_0()) + + self.set_processor(processor) + + def set_use_memory_efficient_attention_xformers( + self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None + ) -> None: + r""" + Set whether to use memory efficient attention from `xformers` or not. + + Args: + use_memory_efficient_attention_xformers (`bool`): + Whether to use memory efficient attention from `xformers` or not. + attention_op (`Callable`, *optional*): + The attention operation to use. Defaults to `None` which uses the default attention operation from + `xformers`. + """ + is_lora = hasattr(self, "processor") and isinstance( + self.processor, + LORA_ATTENTION_PROCESSORS, + ) + is_custom_diffusion = hasattr(self, "processor") and isinstance( + self.processor, + (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, CustomDiffusionAttnProcessor2_0), + ) + is_added_kv_processor = hasattr(self, "processor") and isinstance( + self.processor, + ( + AttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + SlicedAttnAddedKVProcessor, + XFormersAttnAddedKVProcessor, + LoRAAttnAddedKVProcessor, + ), + ) + + if use_memory_efficient_attention_xformers: + if is_added_kv_processor and (is_lora or is_custom_diffusion): + raise NotImplementedError( + f"Memory efficient attention is currently not supported for LoRA or custom diffusion for attention processor type {self.processor}" + ) + if not is_xformers_available(): + raise ModuleNotFoundError( + ( + "Refer to https://github.com/facebookresearch/xformers for more information on how to install" + " xformers" + ), + name="xformers", + ) + elif not torch.cuda.is_available(): + raise ValueError( + "torch.cuda.is_available() should be True but is False. 
xformers' memory efficient attention is" + " only available for GPU " + ) + else: + try: + # Make sure we can run the memory efficient attention + _ = xformers.ops.memory_efficient_attention( + torch.randn((1, 2, 40), device="cuda"), + torch.randn((1, 2, 40), device="cuda"), + torch.randn((1, 2, 40), device="cuda"), + ) + except Exception as e: + raise e + + if is_lora: + # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers + # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0? + processor = LoRAXFormersAttnProcessor( + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + rank=self.processor.rank, + attention_op=attention_op, + ) + processor.load_state_dict(self.processor.state_dict()) + processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + processor = CustomDiffusionXFormersAttnProcessor( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + attention_op=attention_op, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) + elif is_added_kv_processor: + # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP + # which uses this type of cross attention ONLY because the attention mask of format + # [0, ..., -10.000, ..., 0, ...,] is not supported + # throw warning + logger.info( + "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation." + ) + processor = XFormersAttnAddedKVProcessor(attention_op=attention_op) + else: + processor = XFormersAttnProcessor(attention_op=attention_op) + else: + if is_lora: + attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + processor = attn_processor_class( + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + rank=self.processor.rank, + ) + processor.load_state_dict(self.processor.state_dict()) + processor.to(self.processor.to_q_lora.up.weight.device) + elif is_custom_diffusion: + attn_processor_class = ( + CustomDiffusionAttnProcessor2_0 + if hasattr(F, "scaled_dot_product_attention") + else CustomDiffusionAttnProcessor + ) + processor = attn_processor_class( + train_kv=self.processor.train_kv, + train_q_out=self.processor.train_q_out, + hidden_size=self.processor.hidden_size, + cross_attention_dim=self.processor.cross_attention_dim, + ) + processor.load_state_dict(self.processor.state_dict()) + if hasattr(self.processor, "to_k_custom_diffusion"): + processor.to(self.processor.to_k_custom_diffusion.weight.device) + else: + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 + processor = ( + AttnProcessor2_0() + if hasattr(F, "scaled_dot_product_attention") and self.scale_qk + else AttnProcessor() + ) + + self.set_processor(processor) + + def set_attention_slice(self, slice_size: int) -> None: + r""" + Set the slice size for attention computation. 
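+        Slicing splits the attention score computation over the batch-times-heads axis,
+        trading some speed for a lower peak memory footprint.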
+ + Args: + slice_size (`int`): + The slice size for attention computation. + """ + if slice_size is not None and slice_size > self.sliceable_head_dim: + raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.") + + if slice_size is not None and self.added_kv_proj_dim is not None: + processor = SlicedAttnAddedKVProcessor(slice_size) + elif slice_size is not None: + processor = SlicedAttnProcessor(slice_size) + elif self.added_kv_proj_dim is not None: + processor = AttnAddedKVProcessor() + else: + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 + processor = (AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()) + + self.set_processor(processor) + + def set_processor(self, processor: "AttnProcessor") -> None: + r""" + Set the attention processor to use. + + Args: + processor (`AttnProcessor`): + The attention processor to use. + """ + # if current processor is in `self._modules` and if passed `processor` is not, we need to + # pop `processor` from `self._modules` + if ( + hasattr(self, "processor") + and isinstance(self.processor, torch.nn.Module) + and not isinstance(processor, torch.nn.Module) + ): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") + self._modules.pop("processor") + + self.processor = processor + + def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor": + r""" + Get the attention processor in use. + + Args: + return_deprecated_lora (`bool`, *optional*, defaults to `False`): + Set to `True` to return the deprecated LoRA attention processor. + + Returns: + "AttentionProcessor": The attention processor in use. + """ + if not return_deprecated_lora: + return self.processor + + # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible + # serialization format for LoRA Attention Processors. It should be deleted once the integration + # with PEFT is completed. + is_lora_activated = { + name: module.lora_layer is not None + for name, module in self.named_modules() + if hasattr(module, "lora_layer") + } + + # 1. if no layer has a LoRA activated we can return the processor as usual + if not any(is_lora_activated.values()): + return self.processor + + # If doesn't apply LoRA do `add_k_proj` or `add_v_proj` + is_lora_activated.pop("add_k_proj", None) + is_lora_activated.pop("add_v_proj", None) + # 2. else it is not posssible that only some layers have LoRA activated + if not all(is_lora_activated.values()): + raise ValueError( + f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}" + ) + + # 3. 
And we need to merge the current LoRA layers into the corresponding LoRA attention processor + non_lora_processor_cls_name = self.processor.__class__.__name__ + lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name) + + hidden_size = self.inner_dim + + # now create a LoRA attention processor from the LoRA layers + if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor]: + kwargs = { + "cross_attention_dim": self.cross_attention_dim, + "rank": self.to_q.lora_layer.rank, + "network_alpha": self.to_q.lora_layer.network_alpha, + "q_rank": self.to_q.lora_layer.rank, + "q_hidden_size": self.to_q.lora_layer.out_features, + "k_rank": self.to_k.lora_layer.rank, + "k_hidden_size": self.to_k.lora_layer.out_features, + "v_rank": self.to_v.lora_layer.rank, + "v_hidden_size": self.to_v.lora_layer.out_features, + "out_rank": self.to_out[0].lora_layer.rank, + "out_hidden_size": self.to_out[0].lora_layer.out_features, + } + + if hasattr(self.processor, "attention_op"): + kwargs["attention_op"] = self.processor.attention_op + + lora_processor = lora_processor_cls(hidden_size, **kwargs) + lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict()) + lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict()) + lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict()) + lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict()) + elif lora_processor_cls == LoRAAttnAddedKVProcessor: + lora_processor = lora_processor_cls( + hidden_size, + cross_attention_dim=self.add_k_proj.weight.shape[0], + rank=self.to_q.lora_layer.rank, + network_alpha=self.to_q.lora_layer.network_alpha, + ) + lora_processor.to_q_lora.load_state_dict(self.to_q.lora_layer.state_dict()) + lora_processor.to_k_lora.load_state_dict(self.to_k.lora_layer.state_dict()) + lora_processor.to_v_lora.load_state_dict(self.to_v.lora_layer.state_dict()) + lora_processor.to_out_lora.load_state_dict(self.to_out[0].lora_layer.state_dict()) + + # only save if used + if self.add_k_proj.lora_layer is not None: + lora_processor.add_k_proj_lora.load_state_dict(self.add_k_proj.lora_layer.state_dict()) + lora_processor.add_v_proj_lora.load_state_dict(self.add_v_proj.lora_layer.state_dict()) + else: + lora_processor.add_k_proj_lora = None + lora_processor.add_v_proj_lora = None + else: + raise ValueError(f"{lora_processor_cls} does not exist.") + + return lora_processor + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + **cross_attention_kwargs, + ) -> torch.Tensor: + r""" + The forward method of the `Attention` class. + + Args: + hidden_states (`torch.Tensor`): + The hidden states of the query. + encoder_hidden_states (`torch.Tensor`, *optional*): + The hidden states of the encoder. + attention_mask (`torch.Tensor`, *optional*): + The attention mask to use. If `None`, no mask is applied. + **cross_attention_kwargs: + Additional keyword arguments to pass along to the cross attention. + + Returns: + `torch.Tensor`: The output of the attention layer. 
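+
+        Any additional keyword arguments are forwarded unchanged to the configured
+        attention processor (see `set_processor`).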
+ """ + # The `Attention` class can call different attention processors / attention functions + # here we simply pass along all tensors to the selected processor class + # For standard processors that are defined here, `**cross_attention_kwargs` is empty + + out = self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + return out + + + + def batch_to_head_dim(self, tensor: torch.Tensor) -> torch.Tensor: + r""" + Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size // heads, seq_len, dim * heads]`. `heads` + is the number of heads initialized while constructing the `Attention` class. + + Args: + tensor (`torch.Tensor`): The tensor to reshape. + + Returns: + `torch.Tensor`: The reshaped tensor. + """ + head_size = self.heads + batch_size, seq_len, dim = tensor.shape + tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim) + tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size) + return tensor + + def head_to_batch_dim(self, tensor: torch.Tensor, out_dim: int = 3) -> torch.Tensor: + r""" + Reshape the tensor from `[batch_size, seq_len, dim]` to `[batch_size, seq_len, heads, dim // heads]` `heads` is + the number of heads initialized while constructing the `Attention` class. + + Args: + tensor (`torch.Tensor`): The tensor to reshape. + out_dim (`int`, *optional*, defaults to `3`): The output dimension of the tensor. If `3`, the tensor is + reshaped to `[batch_size * heads, seq_len, dim // heads]`. + + Returns: + `torch.Tensor`: The reshaped tensor. + """ + head_size = self.heads + batch_size, seq_len, dim = tensor.shape + tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size) + tensor = tensor.permute(0, 2, 1, 3) + + if out_dim == 3: + tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size) + + return tensor + + def get_attention_scores( + self, query: torch.Tensor, key: torch.Tensor, attention_mask: torch.Tensor = None + ) -> torch.Tensor: + r""" + Compute the attention scores. + + Args: + query (`torch.Tensor`): The query tensor. + key (`torch.Tensor`): The key tensor. + attention_mask (`torch.Tensor`, *optional*): The attention mask to use. If `None`, no mask is applied. + + Returns: + `torch.Tensor`: The attention probabilities/scores. + """ + dtype = query.dtype + if self.upcast_attention: + query = query.float() + key = key.float() + + if attention_mask is None: + baddbmm_input = torch.empty( + query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device + ) + beta = 0 + else: + baddbmm_input = attention_mask + beta = 1 + + attention_scores = torch.baddbmm( + baddbmm_input, + query, + key.transpose(-1, -2), + beta=beta, + alpha=self.scale, + ) + del baddbmm_input + + if self.upcast_softmax: + attention_scores = attention_scores.float() + + attention_probs = attention_scores.softmax(dim=-1) + del attention_scores + + attention_probs = attention_probs.to(dtype) + + return attention_probs + + def prepare_attention_mask( + self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3 + ) -> torch.Tensor: + r""" + Prepare the attention mask for the attention computation. + + Args: + attention_mask (`torch.Tensor`): + The attention mask to prepare. + target_length (`int`): + The target length of the attention mask. This is the length of the attention mask after padding. 
+ batch_size (`int`): + The batch size, which is used to repeat the attention mask. + out_dim (`int`, *optional*, defaults to `3`): + The output dimension of the attention mask. Can be either `3` or `4`. + + Returns: + `torch.Tensor`: The prepared attention mask. + """ + head_size = self.heads + if attention_mask is None: + return attention_mask + + current_length: int = attention_mask.shape[-1] + if current_length != target_length: + if attention_mask.device.type == "mps": + # HACK: MPS: Does not support padding by greater than dimension of input tensor. + # Instead, we can manually construct the padding tensor. + padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) + padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) + attention_mask = torch.cat([attention_mask, padding], dim=2) + else: + # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: + # we want to instead pad by (0, remaining_length), where remaining_length is: + # remaining_length: int = target_length - current_length + # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding + attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) + + if out_dim == 3: + if attention_mask.shape[0] < batch_size * head_size: + attention_mask = attention_mask.repeat_interleave(head_size, dim=0) + elif out_dim == 4: + attention_mask = attention_mask.unsqueeze(1) + attention_mask = attention_mask.repeat_interleave(head_size, dim=1) + + return attention_mask + + def norm_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor: + r""" + Normalize the encoder hidden states. Requires `self.norm_cross` to be specified when constructing the + `Attention` class. + + Args: + encoder_hidden_states (`torch.Tensor`): Hidden states of the encoder. + + Returns: + `torch.Tensor`: The normalized encoder hidden states. + """ + assert self.norm_cross is not None, "self.norm_cross must be defined to call self.norm_encoder_hidden_states" + + if isinstance(self.norm_cross, nn.LayerNorm): + encoder_hidden_states = self.norm_cross(encoder_hidden_states) + elif isinstance(self.norm_cross, nn.GroupNorm): + # Group norm norms along the channels dimension and expects + # input to be in the shape of (N, C, *). In this case, we want + # to norm along the hidden dimension, so we need to move + # (batch_size, sequence_length, hidden_size) -> + # (batch_size, hidden_size, sequence_length) + encoder_hidden_states = encoder_hidden_states.transpose(1, 2) + encoder_hidden_states = self.norm_cross(encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states.transpose(1, 2) + else: + assert False + + return encoder_hidden_states + + @torch.no_grad() + def fuse_projections(self, fuse=True): + is_cross_attention = self.cross_attention_dim != self.query_dim + device = self.to_q.weight.data.device + dtype = self.to_q.weight.data.dtype + + if not is_cross_attention: + # fetch weight matrices. + concatenated_weights = torch.cat([self.to_q.weight.data, self.to_k.weight.data, self.to_v.weight.data]) + in_features = concatenated_weights.shape[1] + out_features = concatenated_weights.shape[0] + + # create a new single projection layer and copy over the weights. 
+            self.to_qkv = self.linear_cls(in_features, out_features, bias=False, device=device, dtype=dtype)
+            self.to_qkv.weight.copy_(concatenated_weights)
+
+        else:
+            concatenated_weights = torch.cat([self.to_k.weight.data, self.to_v.weight.data])
+            in_features = concatenated_weights.shape[1]
+            out_features = concatenated_weights.shape[0]
+
+            self.to_kv = self.linear_cls(in_features, out_features, bias=False, device=device, dtype=dtype)
+            self.to_kv.weight.copy_(concatenated_weights)
+
+        self.fused_projections = fuse
+
+
+class AttnProcessor:
+    r"""
+    Default processor for performing attention-related computations.
+    """
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        temb: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+    ) -> torch.Tensor:
+        residual = hidden_states
+
+        args = () if USE_PEFT_BACKEND else (scale,)
+
+        if attn.spatial_norm is not None:
+            hidden_states = attn.spatial_norm(hidden_states, temb)
+
+        input_ndim = hidden_states.ndim
+
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if attn.group_norm is not None:
+            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states, *args)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        key = attn.to_k(encoder_hidden_states, *args)
+        value = attn.to_v(encoder_hidden_states, *args)
+
+        query = attn.head_to_batch_dim(query)
+        key = attn.head_to_batch_dim(key)
+        value = attn.head_to_batch_dim(value)
+
+        attention_probs = attn.get_attention_scores(query, key, attention_mask)
+        hidden_states = torch.bmm(attention_probs, value)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states, *args)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+
+class CustomDiffusionAttnProcessor(nn.Module):
+    r"""
+    
Processor for implementing attention for the Custom Diffusion method. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + """ + + def __init__( + self, + train_kv: bool = True, + train_q_out: bool = True, + hidden_size: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + out_bias: bool = True, + dropout: float = 0.0, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype) + else: + query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype)) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype)) + value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype)) + key = key.to(attn.to_q.weight.dtype) + value = value.to(attn.to_q.weight.dtype) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = 
self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class AttnAddedKVProcessor: + r""" + Processor for performing attention-related computations with extra learnable key and value matrices for the text + encoder. + """ + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.Tensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states, *args) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states, *args) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states, *args) + value = attn.to_v(hidden_states, *args) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class AttnAddedKVProcessor2_0: + r""" + Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra + learnable key and value matrices for the text encoder. + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." 
+ ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.Tensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + query = attn.head_to_batch_dim(query, out_dim=4) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states, *args) + value = attn.to_v(hidden_states, *args) + key = attn.head_to_batch_dim(key, out_dim=4) + value = attn.head_to_batch_dim(value, out_dim=4) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=2) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1]) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class XFormersAttnAddedKVProcessor: + r""" + Processor for implementing memory efficient attention using xFormers. + + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. 
+ """ + + def __init__(self, attention_op: Optional[Callable] = None): + self.attention_op = attention_op + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class XFormersAttnProcessor: + r""" + Processor for implementing memory efficient attention using xFormers. + + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. 
+ """ + + def __init__(self, attention_op: Optional[Callable] = None): + self.attention_op = attention_op + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, key_tokens, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size) + if attention_mask is not None: + # expand our mask's singleton query_tokens dimension: + # [batch*heads, 1, key_tokens] -> + # [batch*heads, query_tokens, key_tokens] + # so that it can be added as a bias onto the attention scores that xformers computes: + # [batch*heads, query_tokens, key_tokens] + # we do this explicitly because xformers doesn't broadcast the singleton dimension for us. + _, query_tokens, _ = hidden_states.shape + attention_mask = attention_mask.expand(-1, query_tokens, -1) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states, *args) + value = attn.to_v(encoder_hidden_states, *args) + + query = attn.head_to_batch_dim(query).contiguous() + key = attn.head_to_batch_dim(key).contiguous() + value = attn.head_to_batch_dim(value).contiguous() + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class AttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 
+ """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + args = () if USE_PEFT_BACKEND else (scale,) + query = attn.to_q(hidden_states, *args) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states, *args) + value = attn.to_v(encoder_hidden_states, *args) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + # print(query.shape, key.shape, value.shape,) + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + +class AttnProcessor3_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). 
+ """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + args = () if USE_PEFT_BACKEND else (scale,) + query = attn.to_q(hidden_states, *args) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states, *args) + value = attn.to_v(encoder_hidden_states, *args) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + # query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + # key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + # value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # # the output of sdp = (batch, num_heads, seq_len, head_dim) + # # TODO: add support for attn.scale when we move to Torch 2.1 + # # print(query.shape, key.shape, value.shape,) + # hidden_states = F.scaled_dot_product_attention( + # query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + # ) + # hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + + + query = query.view(batch_size, -1, attn.heads, head_dim).contiguous() + key = key.view(batch_size, -1, attn.heads, head_dim).contiguous() + value = value.view(batch_size, -1, attn.heads, head_dim).contiguous() + + #print("query.shape, key.shape, value.shape: ", query.shape, key.shape, value.shape) + #print("query.stride, key.stride, value.stride: ", query.stride(), key.stride(), value.stride()) + #print("attention_mask is: ", attention_mask) + from flash_attn import flash_attn_func + + # torch.cuda.synchronize() + #expand_q = torch.zeros(1, *query.shape[1:]).to(dtype=query.dtype, device=query.device) + #expand_kv = torch.zeros(1, *key.shape[1:]).to(dtype=key.dtype, device=key.device) + #query = torch.cat((query, expand_q), dim=0) + #key = torch.cat((key, expand_kv), dim=0) + #value = torch.cat((value, expand_kv), dim=0) + + hidden_states = flash_attn_func(query, key, value, dropout_p=0.0, causal=False, imp_mode=1) + + #hidden_states = hidden_states[:-1] + # torch.cuda.synchronize() + hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * 
head_dim) + + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + + return hidden_states + + +class FusedAttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). + It uses fused projection layers. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is currently 🧪 experimental in nature and can change in future. + + + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "FusedAttnProcessor2_0 requires at least PyTorch 2.0, to use it. Please upgrade PyTorch to > 2.0." + ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + args = () if USE_PEFT_BACKEND else (scale,) + if encoder_hidden_states is None: + qkv = attn.to_qkv(hidden_states, *args) + split_size = qkv.shape[-1] // 3 + query, key, value = torch.split(qkv, split_size, dim=-1) + else: + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + query = attn.to_q(hidden_states, *args) + + kv = attn.to_kv(encoder_hidden_states, *args) + split_size = kv.shape[-1] // 2 + key, value = torch.split(kv, split_size, dim=-1) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + 
hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class CustomDiffusionXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use + as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator. + """ + + def __init__( + self, + train_kv: bool = True, + train_q_out: bool = False, + hidden_size: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + out_bias: bool = True, + dropout: float = 0.0, + attention_op: Optional[Callable] = None, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.attention_op = attention_op + + # `_custom_diffusion` id for easy serialization and loading. 
+ if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype) + else: + query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype)) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype)) + value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype)) + key = key.to(attn.to_q.weight.dtype) + value = value.to(attn.to_q.weight.dtype) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query).contiguous() + key = attn.head_to_batch_dim(key).contiguous() + value = attn.head_to_batch_dim(value).contiguous() + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class CustomDiffusionAttnProcessor2_0(nn.Module): + r""" + Processor for implementing attention for the Custom Diffusion method using PyTorch 2.0’s memory-efficient scaled + dot-product attention. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. 
+ dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + """ + + def __init__( + self, + train_kv: bool = True, + train_q_out: bool = True, + hidden_size: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + out_bias: bool = True, + dropout: float = 0.0, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype)) + value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype)) + key = key.to(attn.to_q.weight.dtype) + value = value.to(attn.to_q.weight.dtype) + + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + inner_dim = hidden_states.shape[-1] + + head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class SlicedAttnProcessor: + r""" + Processor for implementing sliced attention. 
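+    Slicing trades speed for memory: attention is computed for `slice_size` entries of the
+    batch-times-heads dimension at a time instead of all at once. A sketch of how it is usually enabled
+    (assuming a diffusers UNet-style model exposing `set_attention_slice`): `unet.set_attention_slice(4)`.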
+ + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + + def __init__(self, slice_size: int): + self.slice_size = slice_size + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + residual = hidden_states + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + dim = query.shape[-1] + query = attn.head_to_batch_dim(query) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + batch_size_attention, query_tokens, _ = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class SlicedAttnAddedKVProcessor: + r""" + Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. 
+ """ + + def __init__(self, slice_size): + self.slice_size = slice_size + + def __call__( + self, + attn: "Attention", + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + dim = query.shape[-1] + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + batch_size_attention, query_tokens, _ = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class SpatialNorm(nn.Module): + """ + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. + + Args: + f_channels (`int`): + The number of channels for input to group normalization layer, and output of the spatial norm layer. + zq_channels (`int`): + The number of channels for the quantized vector as described in the paper. 
+ """ + + def __init__( + self, + f_channels: int, + zq_channels: int, + ): + super().__init__() + self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True) + self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, f: torch.FloatTensor, zq: torch.FloatTensor) -> torch.FloatTensor: + f_size = f.shape[-2:] + zq = F.interpolate(zq, size=f_size, mode="nearest") + norm_f = self.norm_layer(f) + new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) + return new_f + + +## Deprecated +class LoRAAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: Optional[int] = None, + rank: int = 4, + network_alpha: Optional[int] = None, + **kwargs, + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + q_rank = kwargs.pop("q_rank", None) + q_hidden_size = kwargs.pop("q_hidden_size", None) + q_rank = q_rank if q_rank is not None else rank + q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size + + v_rank = kwargs.pop("v_rank", None) + v_hidden_size = kwargs.pop("v_hidden_size", None) + v_rank = v_rank if v_rank is not None else rank + v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size + + out_rank = kwargs.pop("out_rank", None) + out_hidden_size = kwargs.pop("out_hidden_size", None) + out_rank = out_rank if out_rank is not None else rank + out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size + + self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) + self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. 
This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = AttnProcessor() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class LoRAAttnProcessor2_0(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product + attention. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: Optional[int] = None, + rank: int = 4, + network_alpha: Optional[int] = None, + **kwargs, + ): + super().__init__() + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + q_rank = kwargs.pop("q_rank", None) + q_hidden_size = kwargs.pop("q_hidden_size", None) + q_rank = q_rank if q_rank is not None else rank + q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size + + v_rank = kwargs.pop("v_rank", None) + v_hidden_size = kwargs.pop("v_hidden_size", None) + v_rank = v_rank if v_rank is not None else rank + v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size + + out_rank = kwargs.pop("out_rank", None) + out_hidden_size = kwargs.pop("out_hidden_size", None) + out_rank = out_rank if out_rank is not None else rank + out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size + + self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) + self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. 
This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = AttnProcessor2_0() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class LoRAXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: int, + rank: int = 4, + attention_op: Optional[Callable] = None, + network_alpha: Optional[int] = None, + **kwargs, + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + self.attention_op = attention_op + + q_rank = kwargs.pop("q_rank", None) + q_hidden_size = kwargs.pop("q_hidden_size", None) + q_rank = q_rank if q_rank is not None else rank + q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size + + v_rank = kwargs.pop("v_rank", None) + v_hidden_size = kwargs.pop("v_hidden_size", None) + v_rank = v_rank if v_rank is not None else rank + v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size + + out_rank = kwargs.pop("out_rank", None) + out_hidden_size = kwargs.pop("out_hidden_size", None) + out_rank = out_rank if out_rank is not None else rank + out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size + + self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) + self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,add_k_proj,add_v_proj,to_out[0]}.lora_layer` respectively. 
This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = XFormersAttnProcessor() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class LoRAAttnAddedKVProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text + encoder. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: Optional[int] = None, + rank: int = 4, + network_alpha: Optional[int] = None, + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,add_k_proj,add_v_proj,to_out[0]}.lora_layer` respectively. This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = AttnAddedKVProcessor() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class IPAdapterAttnProcessor(nn.Module): + r""" + Attention processor for IP-Adapater. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + num_tokens (`int`, defaults to 4): + The context length of the image features. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. 
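+
+    As the warning in `__call__` below indicates, `scale` is normally adjusted through the pipeline helper
+    rather than per call (a sketch, assuming an IP-Adapter has been loaded on `pipe`):
+
+    ```py
+    pipe.set_ip_adapter_scale(0.6)
+    ```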
+ """ + + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=4, scale=1.0): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.num_tokens = num_tokens + self.scale = scale + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + logger.warning("`scale` of IPAttnProcessor should be set with `set_ip_adapter_scale`.") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # split hidden states + end_pos = encoder_hidden_states.shape[1] - self.num_tokens + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, :end_pos, :], + encoder_hidden_states[:, end_pos:, :], + ) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = attn.head_to_batch_dim(ip_key) + ip_value = attn.head_to_batch_dim(ip_value) + + ip_attention_probs = attn.get_attention_scores(query, ip_key, None) + ip_hidden_states = torch.bmm(ip_attention_probs, ip_value) + ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class IPAdapterAttnProcessor2_0(torch.nn.Module): + r""" + Attention processor for IP-Adapater for PyTorch 2.0. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`): + The number of channels in the `encoder_hidden_states`. + num_tokens (`int`, defaults to 4): + The context length of the image features. + scale (`float`, defaults to 1.0): + the weight scale of image prompt. 
+ """ + + def __init__(self, hidden_size, cross_attention_dim=None, num_tokens=4, scale=1.0): + super().__init__() + + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + f"{self.__class__.__name__} requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.num_tokens = num_tokens + self.scale = scale + + self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + scale=1.0, + ): + if scale != 1.0: + logger.warning("`scale` of IPAttnProcessor should be set by `set_ip_adapter_scale`.") + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + # split hidden states + end_pos = encoder_hidden_states.shape[1] - self.num_tokens + encoder_hidden_states, ip_hidden_states = ( + encoder_hidden_states[:, :end_pos, :], + encoder_hidden_states[:, end_pos:, :], + ) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # for ip-adapter + ip_key = self.to_k_ip(ip_hidden_states) + ip_value = self.to_v_ip(ip_hidden_states) + + ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + ip_hidden_states = F.scaled_dot_product_attention( + query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False + ) + + ip_hidden_states = 
ip_hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + ip_hidden_states = ip_hidden_states.to(query.dtype) + + hidden_states = hidden_states + self.scale * ip_hidden_states + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +LORA_ATTENTION_PROCESSORS = ( + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + LoRAAttnAddedKVProcessor, +) + +ADDED_KV_ATTENTION_PROCESSORS = ( + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + LoRAAttnAddedKVProcessor, +) + +CROSS_ATTENTION_PROCESSORS = ( + AttnProcessor, + AttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + IPAdapterAttnProcessor, + IPAdapterAttnProcessor2_0, +) + +AttentionProcessor = Union[ + AttnProcessor, + AttnProcessor2_0, + FusedAttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + CustomDiffusionAttnProcessor2_0, + # deprecated + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + LoRAAttnAddedKVProcessor, +] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/__init__.py new file mode 100644 index 000000000..201a40ff1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/__init__.py @@ -0,0 +1,5 @@ +from .autoencoder_asym_kl import AsymmetricAutoencoderKL +from .autoencoder_kl import AutoencoderKL +from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder +from .autoencoder_tiny import AutoencoderTiny +from .consistency_decoder_vae import ConsistencyDecoderVAE diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py new file mode 100644 index 000000000..fc2041d2e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py @@ -0,0 +1,186 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
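To make the processor registries that close `attention_processor.py` above concrete, here is a minimal, hedged sketch of how a model can fall back to a default processor based on them; it mirrors the `set_default_attn_processor` logic of the autoencoders added below. The `model` argument and the installed `diffusers` import path are assumptions for illustration only, not part of this patch.

# Hedged sketch: picking a default attention processor from the registries above.
from diffusers.models.attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
    CROSS_ATTENTION_PROCESSORS,
    AttnAddedKVProcessor,
    AttnProcessor,
)

def pick_default_processor(model):
    """Return a default processor for a hypothetical `model` exposing `attn_processors`."""
    classes = [proc.__class__ for proc in model.attn_processors.values()]
    if all(cls in ADDED_KV_ATTENTION_PROCESSORS for cls in classes):
        return AttnAddedKVProcessor()
    if all(cls in CROSS_ATTENTION_PROCESSORS for cls in classes):
        return AttnProcessor()
    raise ValueError("Mixed processor types; set processors explicitly per layer.")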
+from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils.accelerate_utils import apply_forward_hook +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin +from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder + + +class AsymmetricAutoencoderKL(ModelMixin, ConfigMixin): + r""" + Designing a Better Asymmetric VQGAN for StableDiffusion https://arxiv.org/abs/2306.04632 . A VAE model with KL loss + for encoding images into latents and decoding latent representations into images. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + down_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of down block output channels. + layers_per_down_block (`int`, *optional*, defaults to `1`): + Number layers for down block. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + up_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of up block output channels. + layers_per_up_block (`int`, *optional*, defaults to `1`): + Number layers for up block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + norm_num_groups (`int`, *optional*, defaults to `32`): + Number of groups to use for the first normalization layer in ResNet blocks. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), + down_block_out_channels: Tuple[int, ...] = (64,), + layers_per_down_block: int = 1, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + up_block_out_channels: Tuple[int, ...] 
= (64,), + layers_per_up_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + scaling_factor: float = 0.18215, + ) -> None: + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=down_block_out_channels, + layers_per_block=layers_per_down_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=True, + ) + + # pass init params to Decoder + self.decoder = MaskConditionDecoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=up_block_out_channels, + layers_per_block=layers_per_up_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) + self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) + + self.use_slicing = False + self.use_tiling = False + + self.register_to_config(block_out_channels=up_block_out_channels) + self.register_to_config(force_upcast=False) + + @apply_forward_hook + def encode( + self, x: torch.FloatTensor, return_dict: bool = True + ) -> Union[AutoencoderKLOutput, Tuple[torch.FloatTensor]]: + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode( + self, + z: torch.FloatTensor, + image: Optional[torch.FloatTensor] = None, + mask: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + z = self.post_quant_conv(z) + dec = self.decoder(z, image, mask) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + @apply_forward_hook + def decode( + self, + z: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + image: Optional[torch.FloatTensor] = None, + mask: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + decoded = self._decode(z, image, mask).sample + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def forward( + self, + sample: torch.FloatTensor, + mask: Optional[torch.FloatTensor] = None, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + mask (`torch.FloatTensor`, *optional*, defaults to `None`): Optional inpainting mask. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
+ """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z, sample, mask).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py new file mode 100644 index 000000000..9bbf2023e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py @@ -0,0 +1,489 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import FromOriginalVAEMixin +from ...utils.accelerate_utils import apply_forward_hook +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + Attention, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin +from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder + + +class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin): + r""" + A VAE model with KL loss for encoding images into latents and decoding latent representations into images. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. 
When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + force_upcast (`bool`, *optional*, default to `True`): + If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE + can be fine-tuned / trained to a lower range without loosing too much precision in which case + `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + scaling_factor: float = 0.18215, + latents_mean: Optional[Tuple[float]] = None, + latents_std: Optional[Tuple[float]] = None, + force_upcast: float = True, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=True, + ) + + # pass init params to Decoder + self.decoder = Decoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + norm_num_groups=norm_num_groups, + act_fn=act_fn, + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) + self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) + + self.use_slicing = False + self.use_tiling = False + + # only relevant if vae tiling is enabled + self.tile_sample_min_size = self.config.sample_size + sample_size = ( + self.config.sample_size[0] + if isinstance(self.config.sample_size, (list, tuple)) + else self.config.sample_size + ) + self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) + self.tile_overlap_factor = 0.25 + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Encoder, Decoder)): + module.gradient_checkpointing = value + + def enable_tiling(self, use_tiling: bool = True): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.use_tiling = use_tiling + + def disable_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.enable_tiling(False) + + def enable_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self): + r""" + Disable sliced VAE decoding. 
If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + @apply_forward_hook + def encode( + self, x: torch.FloatTensor, return_dict: bool = True + ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: + """ + Encode a batch of images into latents. + + Args: + x (`torch.FloatTensor`): Input batch of images. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + The latent representations of the encoded images. If `return_dict` is True, a + [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. + """ + if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): + return self.tiled_encode(x, return_dict=return_dict) + + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self.encoder(x) + + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): + return self.tiled_decode(z, return_dict=return_dict) + + z = self.post_quant_conv(z) + dec = self.decoder(z) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + @apply_forward_hook + def decode( + self, z: torch.FloatTensor, return_dict: bool = True, generator=None + ) -> Union[DecoderOutput, torch.FloatTensor]: + """ + Decode a batch of images. + + Args: + z (`torch.FloatTensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + + """ + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z).sample + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[2], b.shape[2], blend_extent) + for y in range(blend_extent): + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for x in range(blend_extent): + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) + return b + + def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.FloatTensor`): Input batch of images. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`: + If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain + `tuple` is returned. + """ + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + # Split the image into 512x512 tiles and encode them separately. + rows = [] + for i in range(0, x.shape[2], overlap_size): + row = [] + for j in range(0, x.shape[3], overlap_size): + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + tile = self.quant_conv(tile) + row.append(tile) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + moments = torch.cat(result_rows, dim=2) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Decode a batch of images using a tiled decoder. + + Args: + z (`torch.FloatTensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + """ + overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) + row_limit = self.tile_sample_min_size - blend_extent + + # Split z into overlapping 64x64 tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
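+ # Editorial worked example (assumption: an SD-style VAE config with sample_size=512
+ # and four entries in block_out_channels, matching the 512x512 / 64x64 comments above):
+ #   tile_sample_min_size = 512
+ #   tile_latent_min_size = 512 // 2**3 = 64
+ #   overlap_size  = int(64 * (1 - 0.25)) = 48    # latent-space stride between tiles
+ #   blend_extent  = int(512 * 0.25)      = 128   # decoded pixels blended at each seam
+ #   row_limit     = 512 - 128            = 384   # decoded pixels kept per tile
+ # Each 64x64 latent tile decodes to 512x512 pixels, of which a 384x384 crop is kept;
+ # that matches the 48-latent (48 * 8 = 384 pixel) stride, so the kept crops abut exactly.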
+ rows = [] + for i in range(0, z.shape[2], overlap_size): + row = [] + for j in range(0, z.shape[3], overlap_size): + tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] + tile = self.post_quant_conv(tile) + decoded = self.decoder(tile) + row.append(decoded) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + dec = torch.cat(result_rows, dim=2) + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward( + self, + sample: torch.FloatTensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py new file mode 100644 index 000000000..b12226fa4 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py @@ -0,0 +1,399 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import is_torch_version +from ...utils.accelerate_utils import apply_forward_hook +from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor +from ..modeling_outputs import AutoencoderKLOutput +from ..modeling_utils import ModelMixin +from ..unets.unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder +from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder + + +class TemporalDecoder(nn.Module): + def __init__( + self, + in_channels: int = 4, + out_channels: int = 3, + block_out_channels: Tuple[int] = (128, 256, 512, 512), + layers_per_block: int = 2, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1) + self.mid_block = MidBlockTemporalDecoder( + num_layers=self.layers_per_block, + in_channels=block_out_channels[-1], + out_channels=block_out_channels[-1], + attention_head_dim=block_out_channels[-1], + ) + + # up + self.up_blocks = nn.ModuleList([]) + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i in range(len(block_out_channels)): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + up_block = UpBlockTemporalDecoder( + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + add_upsample=not is_final_block, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-6) + + self.conv_act = nn.SiLU() + self.conv_out = torch.nn.Conv2d( + in_channels=block_out_channels[0], + out_channels=out_channels, + kernel_size=3, + padding=1, + ) + + conv_out_kernel_size = (3, 1, 1) + padding = [int(k // 2) for k in conv_out_kernel_size] + self.time_conv_out = torch.nn.Conv3d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=conv_out_kernel_size, + padding=padding, + ) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.FloatTensor, + image_only_indicator: torch.FloatTensor, + num_frames: int = 1, + ) -> torch.FloatTensor: + r"""The forward method of the `Decoder` class.""" + + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + image_only_indicator, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint( + 
create_custom_forward(up_block), + sample, + image_only_indicator, + use_reentrant=False, + ) + else: + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + image_only_indicator, + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + image_only_indicator, + ) + else: + # middle + sample = self.mid_block(sample, image_only_indicator=image_only_indicator) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = up_block(sample, image_only_indicator=image_only_indicator) + + # post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + batch_frames, channels, height, width = sample.shape + batch_size = batch_frames // num_frames + sample = sample[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4) + sample = self.time_conv_out(sample) + + sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width) + + return sample + + +class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin): + r""" + A VAE model with KL loss for encoding images into latents and decoding latent representations into images. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + layers_per_block: (`int`, *optional*, defaults to 1): Number of layers per block. + latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + force_upcast (`bool`, *optional*, default to `True`): + If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. 
VAE + can be fine-tuned / trained to a lower range without loosing too much precision in which case + `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + latent_channels: int = 4, + sample_size: int = 32, + scaling_factor: float = 0.18215, + force_upcast: float = True, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + double_z=True, + ) + + # pass init params to Decoder + self.decoder = TemporalDecoder( + in_channels=latent_channels, + out_channels=out_channels, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) + + sample_size = ( + self.config.sample_size[0] + if isinstance(self.config.sample_size, (list, tuple)) + else self.config.sample_size + ) + self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) + self.tile_overlap_factor = 0.25 + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Encoder, TemporalDecoder)): + module.gradient_checkpointing = value + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + @apply_forward_hook + def encode( + self, x: torch.FloatTensor, return_dict: bool = True + ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: + """ + Encode a batch of images into latents. + + Args: + x (`torch.FloatTensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + The latent representations of the encoded images. If `return_dict` is True, a + [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. + """ + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + @apply_forward_hook + def decode( + self, + z: torch.FloatTensor, + num_frames: int, + return_dict: bool = True, + ) -> Union[DecoderOutput, torch.FloatTensor]: + """ + Decode a batch of images. + + Args: + z (`torch.FloatTensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + + """ + batch_size = z.shape[0] // num_frames + image_only_indicator = torch.zeros(batch_size, num_frames, dtype=z.dtype, device=z.device) + decoded = self.decoder(z, num_frames=num_frames, image_only_indicator=image_only_indicator) + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def forward( + self, + sample: torch.FloatTensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + num_frames: int = 1, + ) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
+ """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + + dec = self.decode(z, num_frames=num_frames).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py new file mode 100644 index 000000000..ef43526cf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py @@ -0,0 +1,347 @@ +# Copyright 2024 Ollin Boer Bohan and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ...utils.accelerate_utils import apply_forward_hook +from ..modeling_utils import ModelMixin +from .vae import DecoderOutput, DecoderTiny, EncoderTiny + + +@dataclass +class AutoencoderTinyOutput(BaseOutput): + """ + Output of AutoencoderTiny encoding method. + + Args: + latents (`torch.Tensor`): Encoded outputs of the `Encoder`. + + """ + + latents: torch.Tensor + + +class AutoencoderTiny(ModelMixin, ConfigMixin): + r""" + A tiny distilled VAE model for encoding images into latents and decoding latent representations into images. + + [`AutoencoderTiny`] is a wrapper around the original implementation of `TAESD`. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented for + all models (such as downloading or saving). + + Parameters: + in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image. + out_channels (`int`, *optional*, defaults to 3): Number of channels in the output. + encoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`): + Tuple of integers representing the number of output channels for each encoder block. The length of the + tuple should be equal to the number of encoder blocks. + decoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`): + Tuple of integers representing the number of output channels for each decoder block. The length of the + tuple should be equal to the number of decoder blocks. + act_fn (`str`, *optional*, defaults to `"relu"`): + Activation function to be used throughout the model. + latent_channels (`int`, *optional*, defaults to 4): + Number of channels in the latent representation. The latent space acts as a compressed representation of + the input image. + upsampling_scaling_factor (`int`, *optional*, defaults to 2): + Scaling factor for upsampling in the decoder. It determines the size of the output image during the + upsampling process. 
+ num_encoder_blocks (`Tuple[int]`, *optional*, defaults to `(1, 3, 3, 3)`): + Tuple of integers representing the number of encoder blocks at each stage of the encoding process. The + length of the tuple should be equal to the number of stages in the encoder. Each stage has a different + number of encoder blocks. + num_decoder_blocks (`Tuple[int]`, *optional*, defaults to `(3, 3, 3, 1)`): + Tuple of integers representing the number of decoder blocks at each stage of the decoding process. The + length of the tuple should be equal to the number of stages in the decoder. Each stage has a different + number of decoder blocks. + latent_magnitude (`float`, *optional*, defaults to 3.0): + Magnitude of the latent representation. This parameter scales the latent representation values to control + the extent of information preservation. + latent_shift (float, *optional*, defaults to 0.5): + Shift applied to the latent representation. This parameter controls the center of the latent space. + scaling_factor (`float`, *optional*, defaults to 1.0): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. For this Autoencoder, + however, no such scaling factor was used, hence the value of 1.0 as the default. + force_upcast (`bool`, *optional*, default to `False`): + If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE + can be fine-tuned / trained to a lower range without losing too much precision, in which case + `force_upcast` can be set to `False` (see this fp16-friendly + [AutoEncoder](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)). + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + encoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64), + decoder_block_out_channels: Tuple[int, ...] = (64, 64, 64, 64), + act_fn: str = "relu", + latent_channels: int = 4, + upsampling_scaling_factor: int = 2, + num_encoder_blocks: Tuple[int, ...] = (1, 3, 3, 3), + num_decoder_blocks: Tuple[int, ...] 
= (3, 3, 3, 1), + latent_magnitude: int = 3, + latent_shift: float = 0.5, + force_upcast: bool = False, + scaling_factor: float = 1.0, + ): + super().__init__() + + if len(encoder_block_out_channels) != len(num_encoder_blocks): + raise ValueError("`encoder_block_out_channels` should have the same length as `num_encoder_blocks`.") + if len(decoder_block_out_channels) != len(num_decoder_blocks): + raise ValueError("`decoder_block_out_channels` should have the same length as `num_decoder_blocks`.") + + self.encoder = EncoderTiny( + in_channels=in_channels, + out_channels=latent_channels, + num_blocks=num_encoder_blocks, + block_out_channels=encoder_block_out_channels, + act_fn=act_fn, + ) + + self.decoder = DecoderTiny( + in_channels=latent_channels, + out_channels=out_channels, + num_blocks=num_decoder_blocks, + block_out_channels=decoder_block_out_channels, + upsampling_scaling_factor=upsampling_scaling_factor, + act_fn=act_fn, + ) + + self.latent_magnitude = latent_magnitude + self.latent_shift = latent_shift + self.scaling_factor = scaling_factor + + self.use_slicing = False + self.use_tiling = False + + # only relevant if vae tiling is enabled + self.spatial_scale_factor = 2**out_channels + self.tile_overlap_factor = 0.125 + self.tile_sample_min_size = 512 + self.tile_latent_min_size = self.tile_sample_min_size // self.spatial_scale_factor + + self.register_to_config(block_out_channels=decoder_block_out_channels) + self.register_to_config(force_upcast=False) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (EncoderTiny, DecoderTiny)): + module.gradient_checkpointing = value + + def scale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + """raw latents -> [0, 1]""" + return x.div(2 * self.latent_magnitude).add(self.latent_shift).clamp(0, 1) + + def unscale_latents(self, x: torch.FloatTensor) -> torch.FloatTensor: + """[0, 1] -> raw latents""" + return x.sub(self.latent_shift).mul(2 * self.latent_magnitude) + + def enable_slicing(self) -> None: + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self) -> None: + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + def enable_tiling(self, use_tiling: bool = True) -> None: + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.use_tiling = use_tiling + + def disable_tiling(self) -> None: + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.enable_tiling(False) + + def _tiled_encode(self, x: torch.FloatTensor) -> torch.FloatTensor: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. + + Args: + x (`torch.FloatTensor`): Input batch of images. 
+ + Returns: + `torch.FloatTensor`: Encoded batch of images. + """ + # scale of encoder output relative to input + sf = self.spatial_scale_factor + tile_size = self.tile_sample_min_size + + # number of pixels to blend and to traverse between tile + blend_size = int(tile_size * self.tile_overlap_factor) + traverse_size = tile_size - blend_size + + # tiles index (up/left) + ti = range(0, x.shape[-2], traverse_size) + tj = range(0, x.shape[-1], traverse_size) + + # mask for blending + blend_masks = torch.stack( + torch.meshgrid([torch.arange(tile_size / sf) / (blend_size / sf - 1)] * 2, indexing="ij") + ) + blend_masks = blend_masks.clamp(0, 1).to(x.device) + + # output array + out = torch.zeros(x.shape[0], 4, x.shape[-2] // sf, x.shape[-1] // sf, device=x.device) + for i in ti: + for j in tj: + tile_in = x[..., i : i + tile_size, j : j + tile_size] + # tile result + tile_out = out[..., i // sf : (i + tile_size) // sf, j // sf : (j + tile_size) // sf] + tile = self.encoder(tile_in) + h, w = tile.shape[-2], tile.shape[-1] + # blend tile result into output + blend_mask_i = torch.ones_like(blend_masks[0]) if i == 0 else blend_masks[0] + blend_mask_j = torch.ones_like(blend_masks[1]) if j == 0 else blend_masks[1] + blend_mask = blend_mask_i * blend_mask_j + tile, blend_mask = tile[..., :h, :w], blend_mask[..., :h, :w] + tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out) + return out + + def _tiled_decode(self, x: torch.FloatTensor) -> torch.FloatTensor: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. + + Args: + x (`torch.FloatTensor`): Input batch of images. + + Returns: + `torch.FloatTensor`: Encoded batch of images. 
+ """ + # scale of decoder output relative to input + sf = self.spatial_scale_factor + tile_size = self.tile_latent_min_size + + # number of pixels to blend and to traverse between tiles + blend_size = int(tile_size * self.tile_overlap_factor) + traverse_size = tile_size - blend_size + + # tiles index (up/left) + ti = range(0, x.shape[-2], traverse_size) + tj = range(0, x.shape[-1], traverse_size) + + # mask for blending + blend_masks = torch.stack( + torch.meshgrid([torch.arange(tile_size * sf) / (blend_size * sf - 1)] * 2, indexing="ij") + ) + blend_masks = blend_masks.clamp(0, 1).to(x.device) + + # output array + out = torch.zeros(x.shape[0], 3, x.shape[-2] * sf, x.shape[-1] * sf, device=x.device) + for i in ti: + for j in tj: + tile_in = x[..., i : i + tile_size, j : j + tile_size] + # tile result + tile_out = out[..., i * sf : (i + tile_size) * sf, j * sf : (j + tile_size) * sf] + tile = self.decoder(tile_in) + h, w = tile.shape[-2], tile.shape[-1] + # blend tile result into output + blend_mask_i = torch.ones_like(blend_masks[0]) if i == 0 else blend_masks[0] + blend_mask_j = torch.ones_like(blend_masks[1]) if j == 0 else blend_masks[1] + blend_mask = (blend_mask_i * blend_mask_j)[..., :h, :w] + tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out) + return out + + @apply_forward_hook + def encode( + self, x: torch.FloatTensor, return_dict: bool = True + ) -> Union[AutoencoderTinyOutput, Tuple[torch.FloatTensor]]: + if self.use_slicing and x.shape[0] > 1: + output = [ + self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1) + ] + output = torch.cat(output) + else: + output = self._tiled_encode(x) if self.use_tiling else self.encoder(x) + + if not return_dict: + return (output,) + + return AutoencoderTinyOutput(latents=output) + + @apply_forward_hook + def decode( + self, x: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + if self.use_slicing and x.shape[0] > 1: + output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x) for x_slice in x.split(1)] + output = torch.cat(output) + else: + output = self._tiled_decode(x) if self.use_tiling else self.decoder(x) + + if not return_dict: + return (output,) + + return DecoderOutput(sample=output) + + def forward( + self, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + enc = self.encode(sample).latents + + # scale latents to be in [0, 1], then quantize latents to a byte tensor, + # as if we were storing the latents in an RGBA uint8 image. + scaled_enc = self.scale_latents(enc).mul_(255).round_().byte() + + # unquantize latents back into [0, 1], then unscale latents back to their original range, + # as if we were loading the latents from an RGBA uint8 image. 
+ unscaled_enc = self.unscale_latents(scaled_enc / 255.0) + + dec = self.decode(unscaled_enc) + + if not return_dict: + return (dec,) + return DecoderOutput(sample=dec) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py new file mode 100644 index 000000000..72c512da9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py @@ -0,0 +1,435 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...schedulers import ConsistencyDecoderScheduler +from ...utils import BaseOutput +from ...utils.accelerate_utils import apply_forward_hook +from ...utils.torch_utils import randn_tensor +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..modeling_utils import ModelMixin +from ..unets.unet_2d import UNet2DModel +from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder + + +@dataclass +class ConsistencyDecoderVAEOutput(BaseOutput): + """ + Output of encoding method. + + Args: + latent_dist (`DiagonalGaussianDistribution`): + Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`. + `DiagonalGaussianDistribution` allows for sampling latents from the distribution. + """ + + latent_dist: "DiagonalGaussianDistribution" + + +class ConsistencyDecoderVAE(ModelMixin, ConfigMixin): + r""" + The consistency decoder used with DALL-E 3. + + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline, ConsistencyDecoderVAE + + >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16) + >>> pipe = StableDiffusionPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16 + ... ).to("cuda") + + >>> pipe("horse", generator=torch.manual_seed(0)).images + ``` + """ + + @register_to_config + def __init__( + self, + scaling_factor: float = 0.18215, + latent_channels: int = 4, + encoder_act_fn: str = "silu", + encoder_block_out_channels: Tuple[int, ...] = (128, 256, 512, 512), + encoder_double_z: bool = True, + encoder_down_block_types: Tuple[str, ...] 
= ( + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + ), + encoder_in_channels: int = 3, + encoder_layers_per_block: int = 2, + encoder_norm_num_groups: int = 32, + encoder_out_channels: int = 4, + decoder_add_attention: bool = False, + decoder_block_out_channels: Tuple[int, ...] = (320, 640, 1024, 1024), + decoder_down_block_types: Tuple[str, ...] = ( + "ResnetDownsampleBlock2D", + "ResnetDownsampleBlock2D", + "ResnetDownsampleBlock2D", + "ResnetDownsampleBlock2D", + ), + decoder_downsample_padding: int = 1, + decoder_in_channels: int = 7, + decoder_layers_per_block: int = 3, + decoder_norm_eps: float = 1e-05, + decoder_norm_num_groups: int = 32, + decoder_num_train_timesteps: int = 1024, + decoder_out_channels: int = 6, + decoder_resnet_time_scale_shift: str = "scale_shift", + decoder_time_embedding_type: str = "learned", + decoder_up_block_types: Tuple[str, ...] = ( + "ResnetUpsampleBlock2D", + "ResnetUpsampleBlock2D", + "ResnetUpsampleBlock2D", + "ResnetUpsampleBlock2D", + ), + ): + super().__init__() + self.encoder = Encoder( + act_fn=encoder_act_fn, + block_out_channels=encoder_block_out_channels, + double_z=encoder_double_z, + down_block_types=encoder_down_block_types, + in_channels=encoder_in_channels, + layers_per_block=encoder_layers_per_block, + norm_num_groups=encoder_norm_num_groups, + out_channels=encoder_out_channels, + ) + + self.decoder_unet = UNet2DModel( + add_attention=decoder_add_attention, + block_out_channels=decoder_block_out_channels, + down_block_types=decoder_down_block_types, + downsample_padding=decoder_downsample_padding, + in_channels=decoder_in_channels, + layers_per_block=decoder_layers_per_block, + norm_eps=decoder_norm_eps, + norm_num_groups=decoder_norm_num_groups, + num_train_timesteps=decoder_num_train_timesteps, + out_channels=decoder_out_channels, + resnet_time_scale_shift=decoder_resnet_time_scale_shift, + time_embedding_type=decoder_time_embedding_type, + up_block_types=decoder_up_block_types, + ) + self.decoder_scheduler = ConsistencyDecoderScheduler() + self.register_to_config(block_out_channels=encoder_block_out_channels) + self.register_to_config(force_upcast=False) + self.register_buffer( + "means", + torch.tensor([0.38862467, 0.02253063, 0.07381133, -0.0171294])[None, :, None, None], + persistent=False, + ) + self.register_buffer( + "stds", torch.tensor([0.9654121, 1.0440036, 0.76147926, 0.77022034])[None, :, None, None], persistent=False + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) + + self.use_slicing = False + self.use_tiling = False + + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_tiling + def enable_tiling(self, use_tiling: bool = True): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.use_tiling = use_tiling + + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_tiling + def disable_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.enable_tiling(False) + + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.enable_slicing + def enable_slicing(self): + r""" + Enable sliced VAE decoding. 
When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.disable_slicing + def disable_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + @apply_forward_hook + def encode( + self, x: torch.FloatTensor, return_dict: bool = True + ) -> Union[ConsistencyDecoderVAEOutput, Tuple[DiagonalGaussianDistribution]]: + """ + Encode a batch of images into latents. + + Args: + x (`torch.FloatTensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.consistecy_decoder_vae.ConsistencyDecoderOoutput`] instead of a plain + tuple. + + Returns: + The latent representations of the encoded images. If `return_dict` is True, a + [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a plain `tuple` + is returned. + """ + if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): + return self.tiled_encode(x, return_dict=return_dict) + + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self.encoder(x) + + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return ConsistencyDecoderVAEOutput(latent_dist=posterior) + + @apply_forward_hook + def decode( + self, + z: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + num_inference_steps: int = 2, + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + z = (z * self.config.scaling_factor - self.means) / self.stds + + scale_factor = 2 ** (len(self.config.block_out_channels) - 1) + z = F.interpolate(z, mode="nearest", scale_factor=scale_factor) + + batch_size, _, height, width = z.shape + + self.decoder_scheduler.set_timesteps(num_inference_steps, device=self.device) + + x_t = self.decoder_scheduler.init_noise_sigma * randn_tensor( + (batch_size, 3, height, width), generator=generator, dtype=z.dtype, device=z.device + ) + + for t in self.decoder_scheduler.timesteps: + model_input = torch.concat([self.decoder_scheduler.scale_model_input(x_t, t), z], dim=1) + model_output = self.decoder_unet(model_input, t).sample[:, :3, :, :] + prev_sample = self.decoder_scheduler.step(model_output, t, x_t, generator).prev_sample + x_t = prev_sample + + x_0 = x_t + + if not return_dict: + return (x_0,) + + return DecoderOutput(sample=x_0) + + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_v + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[2], b.shape[2], blend_extent) + for y in range(blend_extent): + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) + return b + + # Copied from diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL.blend_h + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for x in range(blend_extent): + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) 
+ return b + + def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> ConsistencyDecoderVAEOutput: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.FloatTensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] instead of a + plain tuple. + + Returns: + [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`: + If return_dict is True, a [`~models.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, + otherwise a plain `tuple` is returned. + """ + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + # Split the image into 512x512 tiles and encode them separately. + rows = [] + for i in range(0, x.shape[2], overlap_size): + row = [] + for j in range(0, x.shape[3], overlap_size): + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + tile = self.quant_conv(tile) + row.append(tile) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + moments = torch.cat(result_rows, dim=2) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return ConsistencyDecoderVAEOutput(latent_dist=posterior) + + def forward( + self, + sample: torch.FloatTensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor]]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + generator (`torch.Generator`, *optional*, defaults to `None`): + Generator to use for sampling. + + Returns: + [`DecoderOutput`] or `tuple`: + If return_dict is True, a [`DecoderOutput`] is returned, otherwise a plain `tuple` is returned. 
+ """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z, generator=generator).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/vae.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/vae.py new file mode 100644 index 000000000..885a0a10c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/vae.py @@ -0,0 +1,992 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple +import os + +import numpy as np +import torch +import torch.nn as nn + +from ...utils import BaseOutput, is_torch_version +from ...utils.torch_utils import randn_tensor +from ..activations import get_activation +from ..attention_processor import SpatialNorm +from ..unets.unet_2d_blocks import ( + AutoencoderTinyBlock, + UNetMidBlock2D, + get_down_block, + get_up_block, +) +from ..nhwc_groupnorm.custom_gn import GN_NHWC + + +@dataclass +class DecoderOutput(BaseOutput): + r""" + Output of decoding method. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The decoded output sample from the last layer of the model. + """ + + sample: torch.FloatTensor + + +class Encoder(nn.Module): + r""" + The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available + options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + double_z (`bool`, *optional*, defaults to `True`): + Whether to double the number of output channels for the last block. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), + block_out_channels: Tuple[int, ...] 
= (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + double_z: bool = True, + mid_block_add_attention=True, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[0], + kernel_size=3, + stride=1, + padding=1, + ) + + self.mid_block = None + self.down_blocks = nn.ModuleList([]) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=self.layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + add_downsample=not is_final_block, + resnet_eps=1e-6, + downsample_padding=0, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=None, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default", + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=None, + add_attention=mid_block_add_attention, + ) + # out + self.fuse_gn_silu = True if int(os.environ.get("USE_NHWC_GN", 0)) else False + if self.fuse_gn_silu: + self.conv_norm_out = GN_NHWC(norm_num_groups, block_out_channels[-1], activation="silu") + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + + conv_out_channels = 2 * out_channels if double_z else out_channels + self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward(self, sample: torch.FloatTensor) -> torch.FloatTensor: + r"""The forward method of the `Encoder` class.""" + + sample = self.conv_in(sample) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # down + if is_torch_version(">=", "1.11.0"): + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(down_block), sample, use_reentrant=False + ) + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, use_reentrant=False + ) + else: + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample) + # middle + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + + else: + # down + for down_block in self.down_blocks: + sample = down_block(sample) + + # middle + sample = self.mid_block(sample) + + # post-process + sample = self.conv_norm_out(sample) + if not self.fuse_gn_silu: + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class Decoder(nn.Module): + r""" + The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. 
See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + norm_type (`str`, *optional*, defaults to `"group"`): + The normalization type to use. Can be either `"group"` or `"spatial"`. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + norm_type: str = "group", # group, spatial + mid_block_add_attention=True, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) + + self.mid_block = None + self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=temb_channels, + add_attention=mid_block_add_attention, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + if int(os.environ.get("USE_NHWC_GN", 0)): + self.conv_norm_out = GN_NHWC(norm_num_groups, block_out_channels[0], activation="identity") + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.FloatTensor, + latent_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + r"""The forward method of the `Decoder` class.""" + + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + # middle + sample 
= torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + latent_embeds, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + latent_embeds, + use_reentrant=False, + ) + else: + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) + else: + # middle + sample = self.mid_block(sample, latent_embeds) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = up_block(sample, latent_embeds) + + # post-process + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class UpSample(nn.Module): + r""" + The `UpSample` layer of a variational autoencoder that upsamples its input. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + ) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + r"""The forward method of the `UpSample` class.""" + x = torch.relu(x) + x = self.deconv(x) + return x + + +class MaskConditionEncoder(nn.Module): + """ + used in AsymmetricAutoencoderKL + """ + + def __init__( + self, + in_ch: int, + out_ch: int = 192, + res_ch: int = 768, + stride: int = 16, + ) -> None: + super().__init__() + + channels = [] + while stride > 1: + stride = stride // 2 + in_ch_ = out_ch * 2 + if out_ch > res_ch: + out_ch = res_ch + if stride == 1: + in_ch_ = res_ch + channels.append((in_ch_, out_ch)) + out_ch *= 2 + + out_channels = [] + for _in_ch, _out_ch in channels: + out_channels.append(_out_ch) + out_channels.append(channels[-1][0]) + + layers = [] + in_ch_ = in_ch + for l in range(len(out_channels)): + out_ch_ = out_channels[l] + if l == 0 or l == 1: + layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) + else: + layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) + in_ch_ = out_ch_ + + self.layers = nn.Sequential(*layers) + + def forward(self, x: torch.FloatTensor, mask=None) -> torch.FloatTensor: + r"""The forward method of the `MaskConditionEncoder` class.""" + out = {} + for l in range(len(self.layers)): + layer = self.layers[l] + x = layer(x) + out[str(tuple(x.shape))] = x + x = torch.relu(x) + return out + + +class MaskConditionDecoder(nn.Module): + r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's + decoder with a conditioner on the mask and masked image. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. 
See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + norm_type (`str`, *optional*, defaults to `"group"`): + The normalization type to use. Can be either `"group"` or `"spatial"`. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + norm_type: str = "group", # group, spatial + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) + + self.mid_block = None + self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=temb_channels, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # condition encoder + self.condition_encoder = MaskConditionEncoder( + in_ch=out_channels, + out_ch=block_out_channels[0], + res_ch=block_out_channels[-1], + ) + + # out + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward( + self, + z: torch.FloatTensor, + image: Optional[torch.FloatTensor] = None, + mask: Optional[torch.FloatTensor] = None, + latent_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + r"""The forward method of the `MaskConditionDecoder` class.""" + sample = z + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if 
is_torch_version(">=", "1.11.0"): + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + latent_embeds, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.condition_encoder), + masked_image, + mask, + use_reentrant=False, + ) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + latent_embeds, + use_reentrant=False, + ) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + else: + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.condition_encoder), + masked_image, + mask, + ) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + else: + # middle + sample = self.mid_block(sample, latent_embeds) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = self.condition_encoder(masked_image, mask) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = up_block(sample, latent_embeds) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + + # post-process + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class VectorQuantizer(nn.Module): + """ + Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix + multiplications and allows for post-hoc remapping of indices. + """ + + # NOTE: due to a bug the beta term was applied to the wrong term. for + # backwards compatibility we use the buggy version by default, but you can + # specify legacy=False to fix it. 
+ def __init__( + self, + n_e: int, + vq_embed_dim: int, + beta: float, + remap=None, + unknown_index: str = "random", + sane_index_shape: bool = False, + legacy: bool = True, + ): + super().__init__() + self.n_e = n_e + self.vq_embed_dim = vq_embed_dim + self.beta = beta + self.legacy = legacy + + self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + + self.remap = remap + if self.remap is not None: + self.register_buffer("used", torch.tensor(np.load(self.remap))) + self.used: torch.Tensor + self.re_embed = self.used.shape[0] + self.unknown_index = unknown_index # "random" or "extra" or integer + if self.unknown_index == "extra": + self.unknown_index = self.re_embed + self.re_embed = self.re_embed + 1 + print( + f"Remapping {self.n_e} indices to {self.re_embed} indices. " + f"Using {self.unknown_index} for unknown indices." + ) + else: + self.re_embed = n_e + + self.sane_index_shape = sane_index_shape + + def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: + ishape = inds.shape + assert len(ishape) > 1 + inds = inds.reshape(ishape[0], -1) + used = self.used.to(inds) + match = (inds[:, :, None] == used[None, None, ...]).long() + new = match.argmax(-1) + unknown = match.sum(2) < 1 + if self.unknown_index == "random": + new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device) + else: + new[unknown] = self.unknown_index + return new.reshape(ishape) + + def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: + ishape = inds.shape + assert len(ishape) > 1 + inds = inds.reshape(ishape[0], -1) + used = self.used.to(inds) + if self.re_embed > self.used.shape[0]: # extra token + inds[inds >= self.used.shape[0]] = 0 # simply set to zero + back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) + return back.reshape(ishape) + + def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]: + # reshape z -> (batch, height, width, channel) and flatten + z = z.permute(0, 2, 3, 1).contiguous() + z_flattened = z.view(-1, self.vq_embed_dim) + + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1) + + z_q = self.embedding(min_encoding_indices).view(z.shape) + perplexity = None + min_encodings = None + + # compute loss for embedding + if not self.legacy: + loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2) + else: + loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) + + # preserve gradients + z_q: torch.FloatTensor = z + (z_q - z).detach() + + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + if self.remap is not None: + min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis + min_encoding_indices = self.remap_to_used(min_encoding_indices) + min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten + + if self.sane_index_shape: + min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3]) + + return z_q, loss, (perplexity, min_encodings, min_encoding_indices) + + def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor: + # shape specifying (batch, height, width, channel) + if self.remap is not None: + indices = indices.reshape(shape[0], -1) # add batch axis + indices = 
self.unmap_to_all(indices) + indices = indices.reshape(-1) # flatten again + + # get quantized latent vectors + z_q: torch.FloatTensor = self.embedding(indices) + + if shape is not None: + z_q = z_q.view(shape) + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + return z_q + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters: torch.Tensor, deterministic: bool = False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean, device=self.parameters.device, dtype=self.parameters.dtype + ) + + def sample(self, generator: Optional[torch.Generator] = None) -> torch.FloatTensor: + # make sure sample is on the same device as the parameters and has same dtype + sample = randn_tensor( + self.mean.shape, + generator=generator, + device=self.parameters.device, + dtype=self.parameters.dtype, + ) + x = self.mean + self.std * sample + return x + + def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self) -> torch.Tensor: + return self.mean + + +class EncoderTiny(nn.Module): + r""" + The `EncoderTiny` layer is a simpler version of the `Encoder` layer. + + Args: + in_channels (`int`): + The number of input channels. + out_channels (`int`): + The number of output channels. + num_blocks (`Tuple[int, ...]`): + Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to + use. + block_out_channels (`Tuple[int, ...]`): + The number of output channels for each block. + act_fn (`str`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_blocks: Tuple[int, ...], + block_out_channels: Tuple[int, ...], + act_fn: str, + ): + super().__init__() + + layers = [] + for i, num_block in enumerate(num_blocks): + num_channels = block_out_channels[i] + + if i == 0: + layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)) + else: + layers.append( + nn.Conv2d( + num_channels, + num_channels, + kernel_size=3, + padding=1, + stride=2, + bias=False, + ) + ) + + for _ in range(num_block): + layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) + + layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1)) + + self.layers = nn.Sequential(*layers) + self.gradient_checkpointing = False + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + r"""The forward method of the `EncoderTiny` class.""" + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) + + else: + # scale image from [-1, 1] to [0, 1] to match TAESD convention + x = self.layers(x.add(1).div(2)) + + return x + + +class DecoderTiny(nn.Module): + r""" + The `DecoderTiny` layer is a simpler version of the `Decoder` layer. + + Args: + in_channels (`int`): + The number of input channels. + out_channels (`int`): + The number of output channels. + num_blocks (`Tuple[int, ...]`): + Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to + use. + block_out_channels (`Tuple[int, ...]`): + The number of output channels for each block. + upsampling_scaling_factor (`int`): + The scaling factor to use for upsampling. + act_fn (`str`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_blocks: Tuple[int, ...], + block_out_channels: Tuple[int, ...], + upsampling_scaling_factor: int, + act_fn: str, + ): + super().__init__() + + layers = [ + nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=1), + get_activation(act_fn), + ] + + for i, num_block in enumerate(num_blocks): + is_final_block = i == (len(num_blocks) - 1) + num_channels = block_out_channels[i] + + for _ in range(num_block): + layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) + + if not is_final_block: + layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor)) + + conv_out_channel = num_channels if not is_final_block else out_channels + layers.append( + nn.Conv2d( + num_channels, + conv_out_channel, + kernel_size=3, + padding=1, + bias=is_final_block, + ) + ) + + self.layers = nn.Sequential(*layers) + self.gradient_checkpointing = False + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + r"""The forward method of the `DecoderTiny` class.""" + # Clamp. 
+ x = torch.tanh(x / 3) * 3 + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) + + else: + x = self.layers(x) + + # scale image from [0, 1] to [-1, 1] to match diffusers convention + return x.mul(2).sub(1) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet.py new file mode 100644 index 000000000..130e6430d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet.py @@ -0,0 +1,868 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import functional as F + +from ..configuration_utils import ConfigMixin, register_to_config +from ..loaders import FromOriginalControlNetMixin +from ..utils import BaseOutput, logging +from .attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from .embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin +from .unets.unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, + UNetMidBlock2D, + UNetMidBlock2DCrossAttn, + get_down_block, +) +from .unets.unet_2d_condition import UNet2DConditionModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class ControlNetOutput(BaseOutput): + """ + The output of [`ControlNetModel`]. + + Args: + down_block_res_samples (`tuple[torch.Tensor]`): + A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should + be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be + used to condition the original UNet's downsampling activations. + mid_down_block_re_sample (`torch.Tensor`): + The activation of the midde block (the lowest sample resolution). Each tensor should be of shape + `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`. + Output can be used to condition the original UNet's middle block activation. 
+ """ + + down_block_res_samples: Tuple[torch.Tensor] + mid_block_res_sample: torch.Tensor + + +class ControlNetConditioningEmbedding(nn.Module): + """ + Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN + [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized + training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the + convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides + (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full + model) to encode image-space conditions ... into feature maps ..." + """ + + def __init__( + self, + conditioning_embedding_channels: int, + conditioning_channels: int = 3, + block_out_channels: Tuple[int, ...] = (16, 32, 96, 256), + ): + super().__init__() + + self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.blocks = nn.ModuleList([]) + + for i in range(len(block_out_channels) - 1): + channel_in = block_out_channels[i] + channel_out = block_out_channels[i + 1] + self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1)) + self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2)) + + self.conv_out = zero_module( + nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1) + ) + + def forward(self, conditioning): + embedding = self.conv_in(conditioning) + embedding = F.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = F.silu(embedding) + + embedding = self.conv_out(embedding) + + return embedding + + +class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin): + """ + A ControlNet model. + + Args: + in_channels (`int`, defaults to 4): + The number of channels in the input sample. + flip_sin_to_cos (`bool`, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, defaults to 0): + The frequency shift to apply to the time embedding. + down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`): + block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, defaults to 2): + The number of layers per block. + downsample_padding (`int`, defaults to 1): + The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, defaults to 1): + The scale factor to use for the mid block. + act_fn (`str`, defaults to "silu"): + The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the normalization. If None, normalization and activation layers is skipped + in post-processing. + norm_eps (`float`, defaults to 1e-5): + The epsilon to use for the normalization. + cross_attention_dim (`int`, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. 
Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8): + The dimension of the attention heads. + use_linear_projection (`bool`, defaults to `False`): + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + num_class_embeds (`int`, *optional*, defaults to 0): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + upcast_attention (`bool`, defaults to `False`): + resnet_time_scale_shift (`str`, defaults to `"default"`): + Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`. + projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`): + The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when + `class_embed_type="projection"`. + controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `conditioning_embedding` layer. + global_pool_conditions (`bool`, defaults to `False`): + TODO(Patrick) - unused parameter. + addition_embed_type_num_heads (`int`, defaults to 64): + The number of heads to use for the `TextTimeEmbedding` layer. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 4, + conditioning_channels: int = 3, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int, ...] 
= (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int, ...]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + projection_class_embeddings_input_dim: Optional[int] = None, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, + addition_embed_type_num_heads: int = 64, + ): + super().__init__() + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
+ ) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + # input + conv_in_kernel = 3 + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") + + # control net conditioning embedding + self.controlnet_cond_embedding = ControlNetConditioningEmbedding( + conditioning_embedding_channels=block_out_channels[0], + block_out_channels=conditioning_embedding_out_channels, + conditioning_channels=conditioning_channels, + ) + + self.down_blocks = nn.ModuleList([]) + self.controlnet_down_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[i], + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + downsample_padding=downsample_padding, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + self.down_blocks.append(down_block) + + for _ in range(layers_per_block): + controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + if not is_final_block: + controlnet_block = 
nn.Conv2d(output_channel, output_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_down_blocks.append(controlnet_block) + + # mid + mid_block_channel = block_out_channels[-1] + + controlnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1) + controlnet_block = zero_module(controlnet_block) + self.controlnet_mid_block = controlnet_block + + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=mid_block_channel, + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2D": + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + num_layers=0, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + add_attention=False, + ) + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + @classmethod + def from_unet( + cls, + unet: UNet2DConditionModel, + controlnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256), + load_weights_from_unet: bool = True, + conditioning_channels: int = 3, + ): + r""" + Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`]. + + Parameters: + unet (`UNet2DConditionModel`): + The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied + where applicable. 
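+
+ Example (illustrative sketch; the checkpoint id is only an example and any loaded `UNet2DConditionModel` works):
+
+ ```py
+ >>> from diffusers import ControlNetModel, UNet2DConditionModel
+
+ >>> unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="unet")
+ >>> controlnet = ControlNetModel.from_unet(unet)
+ ```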
+ """ + transformer_layers_per_block = ( + unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1 + ) + encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None + encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None + addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None + addition_time_embed_dim = ( + unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None + ) + + controlnet = cls( + encoder_hid_dim=encoder_hid_dim, + encoder_hid_dim_type=encoder_hid_dim_type, + addition_embed_type=addition_embed_type, + addition_time_embed_dim=addition_time_embed_dim, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=unet.config.in_channels, + flip_sin_to_cos=unet.config.flip_sin_to_cos, + freq_shift=unet.config.freq_shift, + down_block_types=unet.config.down_block_types, + only_cross_attention=unet.config.only_cross_attention, + block_out_channels=unet.config.block_out_channels, + layers_per_block=unet.config.layers_per_block, + downsample_padding=unet.config.downsample_padding, + mid_block_scale_factor=unet.config.mid_block_scale_factor, + act_fn=unet.config.act_fn, + norm_num_groups=unet.config.norm_num_groups, + norm_eps=unet.config.norm_eps, + cross_attention_dim=unet.config.cross_attention_dim, + attention_head_dim=unet.config.attention_head_dim, + num_attention_heads=unet.config.num_attention_heads, + use_linear_projection=unet.config.use_linear_projection, + class_embed_type=unet.config.class_embed_type, + num_class_embeds=unet.config.num_class_embeds, + upcast_attention=unet.config.upcast_attention, + resnet_time_scale_shift=unet.config.resnet_time_scale_shift, + projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, + mid_block_type=unet.config.mid_block_type, + controlnet_conditioning_channel_order=controlnet_conditioning_channel_order, + conditioning_embedding_out_channels=conditioning_embedding_out_channels, + conditioning_channels=conditioning_channels, + ) + + if load_weights_from_unet: + controlnet.conv_in.load_state_dict(unet.conv_in.state_dict()) + controlnet.time_proj.load_state_dict(unet.time_proj.state_dict()) + controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict()) + + if controlnet.class_embedding: + controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict()) + + controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict()) + controlnet.mid_block.load_state_dict(unet.mid_block.state_dict()) + + return controlnet + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None: + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. 
If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + controlnet_cond: torch.FloatTensor, + conditioning_scale: float = 1.0, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + """ + The [`ControlNetModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor. + timestep (`Union[torch.Tensor, float, int]`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + controlnet_cond (`torch.FloatTensor`): + The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. + conditioning_scale (`float`, defaults to `1.0`): + The scale factor for ControlNet outputs. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. 
+ timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): + Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the + timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep + embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + added_cond_kwargs (`dict`): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + guess_mode (`bool`, defaults to `False`): + In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if + you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple. + + Returns: + [`~models.controlnet.ControlNetOutput`] **or** `tuple`: + If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is + returned where the first element is the sample tensor. + """ + # check channel order + channel_order = self.config.controlnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... + elif channel_order == "bgr": + controlnet_cond = torch.flip(controlnet_cond, dims=[1]) + else: + raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}") + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. 
+ t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + if self.config.addition_embed_type is not None: + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + + elif self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + + emb = emb + aug_emb if aug_emb is not None else emb + + # 2. pre-process + sample = self.conv_in(sample) + + controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) + sample = sample + controlnet_cond + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. mid + if self.mid_block is not None: + if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = self.mid_block(sample, emb) + + # 5. Control net blocks + + controlnet_down_block_res_samples = () + + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): + down_block_res_sample = controlnet_block(down_block_res_sample) + controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,) + + down_block_res_samples = controlnet_down_block_res_samples + + mid_block_res_sample = self.controlnet_mid_block(sample) + + # 6. 
scaling + if guess_mode and not self.config.global_pool_conditions: + scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device) # 0.1 to 1.0 + scales = scales * conditioning_scale + down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)] + mid_block_res_sample = mid_block_res_sample * scales[-1] # last one + else: + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] + mid_block_res_sample = mid_block_res_sample * conditioning_scale + + if self.config.global_pool_conditions: + down_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples + ] + mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True) + + if not return_dict: + return (down_block_res_samples, mid_block_res_sample) + + return ControlNetOutput( + down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample + ) + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet_flax.py new file mode 100644 index 000000000..6f9b201aa --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet_flax.py @@ -0,0 +1,395 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Tuple, Union + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict + +from ..configuration_utils import ConfigMixin, flax_register_to_config +from ..utils import BaseOutput +from .embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps +from .modeling_flax_utils import FlaxModelMixin +from .unets.unet_2d_blocks_flax import ( + FlaxCrossAttnDownBlock2D, + FlaxDownBlock2D, + FlaxUNetMidBlock2DCrossAttn, +) + + +@flax.struct.dataclass +class FlaxControlNetOutput(BaseOutput): + """ + The output of [`FlaxControlNetModel`]. + + Args: + down_block_res_samples (`jnp.ndarray`): + mid_block_res_sample (`jnp.ndarray`): + """ + + down_block_res_samples: jnp.ndarray + mid_block_res_sample: jnp.ndarray + + +class FlaxControlNetConditioningEmbedding(nn.Module): + conditioning_embedding_channels: int + block_out_channels: Tuple[int, ...] 
= (16, 32, 96, 256) + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.conv_in = nn.Conv( + self.block_out_channels[0], + kernel_size=(3, 3), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + blocks = [] + for i in range(len(self.block_out_channels) - 1): + channel_in = self.block_out_channels[i] + channel_out = self.block_out_channels[i + 1] + conv1 = nn.Conv( + channel_in, + kernel_size=(3, 3), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + blocks.append(conv1) + conv2 = nn.Conv( + channel_out, + kernel_size=(3, 3), + strides=(2, 2), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + blocks.append(conv2) + self.blocks = blocks + + self.conv_out = nn.Conv( + self.conditioning_embedding_channels, + kernel_size=(3, 3), + padding=((1, 1), (1, 1)), + kernel_init=nn.initializers.zeros_init(), + bias_init=nn.initializers.zeros_init(), + dtype=self.dtype, + ) + + def __call__(self, conditioning: jnp.ndarray) -> jnp.ndarray: + embedding = self.conv_in(conditioning) + embedding = nn.silu(embedding) + + for block in self.blocks: + embedding = block(embedding) + embedding = nn.silu(embedding) + + embedding = self.conv_out(embedding) + + return embedding + + +@flax_register_to_config +class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin): + r""" + A ControlNet model. + + This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it’s generic methods + implemented for all models (such as downloading or saving). + + This model is also a Flax Linen [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its + general usage and behavior. + + Inherent JAX features such as the following are supported: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + sample_size (`int`, *optional*): + The size of the input sample. + in_channels (`int`, *optional*, defaults to 4): + The number of channels in the input sample. + down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`): + The tuple of downsample blocks to use. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8): + The dimension of the attention heads. + num_attention_heads (`int` or `Tuple[int]`, *optional*): + The number of attention heads. + cross_attention_dim (`int`, *optional*, defaults to 768): + The dimension of the cross attention features. + dropout (`float`, *optional*, defaults to 0): + Dropout probability for down, up and bottleneck blocks. + flip_sin_to_cos (`bool`, *optional*, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. 
+ controlnet_conditioning_channel_order (`str`, *optional*, defaults to `rgb`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + conditioning_embedding_out_channels (`tuple`, *optional*, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `conditioning_embedding` layer. + """ + + sample_size: int = 32 + in_channels: int = 4 + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ) + only_cross_attention: Union[bool, Tuple[bool, ...]] = False + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280) + layers_per_block: int = 2 + attention_head_dim: Union[int, Tuple[int, ...]] = 8 + num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None + cross_attention_dim: int = 1280 + dropout: float = 0.0 + use_linear_projection: bool = False + dtype: jnp.dtype = jnp.float32 + flip_sin_to_cos: bool = True + freq_shift: int = 0 + controlnet_conditioning_channel_order: str = "rgb" + conditioning_embedding_out_channels: Tuple[int, ...] = (16, 32, 96, 256) + + def init_weights(self, rng: jax.Array) -> FrozenDict: + # init input tensors + sample_shape = (1, self.in_channels, self.sample_size, self.sample_size) + sample = jnp.zeros(sample_shape, dtype=jnp.float32) + timesteps = jnp.ones((1,), dtype=jnp.int32) + encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32) + controlnet_cond_shape = (1, 3, self.sample_size * 8, self.sample_size * 8) + controlnet_cond = jnp.zeros(controlnet_cond_shape, dtype=jnp.float32) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + return self.init(rngs, sample, timesteps, encoder_hidden_states, controlnet_cond)["params"] + + def setup(self) -> None: + block_out_channels = self.block_out_channels + time_embed_dim = block_out_channels[0] * 4 + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. 
+ num_attention_heads = self.num_attention_heads or self.attention_head_dim + + # input + self.conv_in = nn.Conv( + block_out_channels[0], + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + # time + self.time_proj = FlaxTimesteps( + block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift + ) + self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype) + + self.controlnet_cond_embedding = FlaxControlNetConditioningEmbedding( + conditioning_embedding_channels=block_out_channels[0], + block_out_channels=self.conditioning_embedding_out_channels, + ) + + only_cross_attention = self.only_cross_attention + if isinstance(only_cross_attention, bool): + only_cross_attention = (only_cross_attention,) * len(self.down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(self.down_block_types) + + # down + down_blocks = [] + controlnet_down_blocks = [] + + output_channel = block_out_channels[0] + + controlnet_block = nn.Conv( + output_channel, + kernel_size=(1, 1), + padding="VALID", + kernel_init=nn.initializers.zeros_init(), + bias_init=nn.initializers.zeros_init(), + dtype=self.dtype, + ) + controlnet_down_blocks.append(controlnet_block) + + for i, down_block_type in enumerate(self.down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + if down_block_type == "CrossAttnDownBlock2D": + down_block = FlaxCrossAttnDownBlock2D( + in_channels=input_channel, + out_channels=output_channel, + dropout=self.dropout, + num_layers=self.layers_per_block, + num_attention_heads=num_attention_heads[i], + add_downsample=not is_final_block, + use_linear_projection=self.use_linear_projection, + only_cross_attention=only_cross_attention[i], + dtype=self.dtype, + ) + else: + down_block = FlaxDownBlock2D( + in_channels=input_channel, + out_channels=output_channel, + dropout=self.dropout, + num_layers=self.layers_per_block, + add_downsample=not is_final_block, + dtype=self.dtype, + ) + + down_blocks.append(down_block) + + for _ in range(self.layers_per_block): + controlnet_block = nn.Conv( + output_channel, + kernel_size=(1, 1), + padding="VALID", + kernel_init=nn.initializers.zeros_init(), + bias_init=nn.initializers.zeros_init(), + dtype=self.dtype, + ) + controlnet_down_blocks.append(controlnet_block) + + if not is_final_block: + controlnet_block = nn.Conv( + output_channel, + kernel_size=(1, 1), + padding="VALID", + kernel_init=nn.initializers.zeros_init(), + bias_init=nn.initializers.zeros_init(), + dtype=self.dtype, + ) + controlnet_down_blocks.append(controlnet_block) + + self.down_blocks = down_blocks + self.controlnet_down_blocks = controlnet_down_blocks + + # mid + mid_block_channel = block_out_channels[-1] + self.mid_block = FlaxUNetMidBlock2DCrossAttn( + in_channels=mid_block_channel, + dropout=self.dropout, + num_attention_heads=num_attention_heads[-1], + use_linear_projection=self.use_linear_projection, + dtype=self.dtype, + ) + + self.controlnet_mid_block = nn.Conv( + mid_block_channel, + kernel_size=(1, 1), + padding="VALID", + kernel_init=nn.initializers.zeros_init(), + bias_init=nn.initializers.zeros_init(), + dtype=self.dtype, + ) + + def __call__( + self, + sample: jnp.ndarray, + timesteps: Union[jnp.ndarray, float, int], + encoder_hidden_states: jnp.ndarray, + controlnet_cond: jnp.ndarray, + conditioning_scale: float = 1.0, + return_dict: bool = True, + 
train: bool = False, + ) -> Union[FlaxControlNetOutput, Tuple[Tuple[jnp.ndarray, ...], jnp.ndarray]]: + r""" + Args: + sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor + timestep (`jnp.ndarray` or `float` or `int`): timesteps + encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states + controlnet_cond (`jnp.ndarray`): (batch, channel, height, width) the conditional input tensor + conditioning_scale (`float`, *optional*, defaults to `1.0`): the scale factor for controlnet outputs + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a + plain tuple. + train (`bool`, *optional*, defaults to `False`): + Use deterministic functions and disable dropout when not training. + + Returns: + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`: + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. + """ + channel_order = self.controlnet_conditioning_channel_order + if channel_order == "bgr": + controlnet_cond = jnp.flip(controlnet_cond, axis=1) + + # 1. time + if not isinstance(timesteps, jnp.ndarray): + timesteps = jnp.array([timesteps], dtype=jnp.int32) + elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0: + timesteps = timesteps.astype(dtype=jnp.float32) + timesteps = jnp.expand_dims(timesteps, 0) + + t_emb = self.time_proj(timesteps) + t_emb = self.time_embedding(t_emb) + + # 2. pre-process + sample = jnp.transpose(sample, (0, 2, 3, 1)) + sample = self.conv_in(sample) + + controlnet_cond = jnp.transpose(controlnet_cond, (0, 2, 3, 1)) + controlnet_cond = self.controlnet_cond_embedding(controlnet_cond) + sample += controlnet_cond + + # 3. down + down_block_res_samples = (sample,) + for down_block in self.down_blocks: + if isinstance(down_block, FlaxCrossAttnDownBlock2D): + sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train) + else: + sample, res_samples = down_block(sample, t_emb, deterministic=not train) + down_block_res_samples += res_samples + + # 4. mid + sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train) + + # 5. contronet blocks + controlnet_down_block_res_samples = () + for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks): + down_block_res_sample = controlnet_block(down_block_res_sample) + controlnet_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = controlnet_down_block_res_samples + + mid_block_res_sample = self.controlnet_mid_block(sample) + + # 6. 
scaling + down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples] + mid_block_res_sample *= conditioning_scale + + if not return_dict: + return (down_block_res_samples, mid_block_res_sample) + + return FlaxControlNetOutput( + down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/downsampling.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/downsampling.py new file mode 100644 index 000000000..9ae28e950 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/downsampling.py @@ -0,0 +1,334 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import deprecate +from .normalization import RMSNorm +from .upsampling import upfirdn2d_native + + +class Downsample1D(nn.Module): + """A 1D downsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + padding (`int`, default `1`): + padding for the convolution. + name (`str`, default `conv`): + name of the downsampling 1D layer. + """ + + def __init__( + self, + channels: int, + use_conv: bool = False, + out_channels: Optional[int] = None, + padding: int = 1, + name: str = "conv", + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.padding = padding + stride = 2 + self.name = name + + if use_conv: + self.conv = nn.Conv1d(self.channels, self.out_channels, 3, stride=stride, padding=padding) + else: + assert self.channels == self.out_channels + self.conv = nn.AvgPool1d(kernel_size=stride, stride=stride) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + assert inputs.shape[1] == self.channels + return self.conv(inputs) + + +class Downsample2D(nn.Module): + """A 2D downsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + padding (`int`, default `1`): + padding for the convolution. + name (`str`, default `conv`): + name of the downsampling 2D layer. 
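+
+ A minimal shape sketch (illustrative; assumes the default `kernel_size=3`, `padding=1` convolution path shown below):
+
+ ```py
+ >>> import torch
+ >>> down = Downsample2D(channels=64, use_conv=True)
+ >>> down(torch.randn(1, 64, 32, 32)).shape
+ torch.Size([1, 64, 16, 16])
+ ```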
+ """ + + def __init__( + self, + channels: int, + use_conv: bool = False, + out_channels: Optional[int] = None, + padding: int = 1, + name: str = "conv", + kernel_size=3, + norm_type=None, + eps=None, + elementwise_affine=None, + bias=True, + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.padding = padding + stride = 2 + self.name = name + conv_cls = nn.Conv2d + + if norm_type == "ln_norm": + self.norm = nn.LayerNorm(channels, eps, elementwise_affine) + elif norm_type == "rms_norm": + self.norm = RMSNorm(channels, eps, elementwise_affine) + elif norm_type is None: + self.norm = None + else: + raise ValueError(f"unknown norm_type: {norm_type}") + + if use_conv: + conv = conv_cls( + self.channels, self.out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias + ) + else: + assert self.channels == self.out_channels + conv = nn.AvgPool2d(kernel_size=stride, stride=stride) + + # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed + if name == "conv": + self.Conv2d_0 = conv + self.conv = conv + elif name == "Conv2d_0": + self.conv = conv + else: + self.conv = conv + + def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + assert hidden_states.shape[1] == self.channels + + if self.norm is not None: + hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + + if self.use_conv and self.padding == 0: + pad = (0, 1, 0, 1) + hidden_states = F.pad(hidden_states, pad, mode="constant", value=0) + + assert hidden_states.shape[1] == self.channels + + hidden_states = self.conv(hidden_states) + + return hidden_states + + +class FirDownsample2D(nn.Module): + """A 2D FIR downsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + fir_kernel (`tuple`, default `(1, 3, 3, 1)`): + kernel for the FIR filter. + """ + + def __init__( + self, + channels: Optional[int] = None, + out_channels: Optional[int] = None, + use_conv: bool = False, + fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1), + ): + super().__init__() + out_channels = out_channels if out_channels else channels + if use_conv: + self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1) + self.fir_kernel = fir_kernel + self.use_conv = use_conv + self.out_channels = out_channels + + def _downsample_2d( + self, + hidden_states: torch.FloatTensor, + weight: Optional[torch.FloatTensor] = None, + kernel: Optional[torch.FloatTensor] = None, + factor: int = 2, + gain: float = 1, + ) -> torch.FloatTensor: + """Fused `Conv2d()` followed by `downsample_2d()`. + Padding is performed only once at the beginning, not between the operations. The fused op is considerably more + efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of + arbitrary order. 
+ + Args: + hidden_states (`torch.FloatTensor`): + Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + weight (`torch.FloatTensor`, *optional*): + Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be + performed by `inChannels = x.shape[0] // numGroups`. + kernel (`torch.FloatTensor`, *optional*): + FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which + corresponds to average pooling. + factor (`int`, *optional*, default to `2`): + Integer downsampling factor. + gain (`float`, *optional*, default to `1.0`): + Scaling factor for signal magnitude. + + Returns: + output (`torch.FloatTensor`): + Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same + datatype as `x`. + """ + + assert isinstance(factor, int) and factor >= 1 + if kernel is None: + kernel = [1] * factor + + # setup kernel + kernel = torch.tensor(kernel, dtype=torch.float32) + if kernel.ndim == 1: + kernel = torch.outer(kernel, kernel) + kernel /= torch.sum(kernel) + + kernel = kernel * gain + + if self.use_conv: + _, _, convH, convW = weight.shape + pad_value = (kernel.shape[0] - factor) + (convW - 1) + stride_value = [factor, factor] + upfirdn_input = upfirdn2d_native( + hidden_states, + torch.tensor(kernel, device=hidden_states.device), + pad=((pad_value + 1) // 2, pad_value // 2), + ) + output = F.conv2d(upfirdn_input, weight, stride=stride_value, padding=0) + else: + pad_value = kernel.shape[0] - factor + output = upfirdn2d_native( + hidden_states, + torch.tensor(kernel, device=hidden_states.device), + down=factor, + pad=((pad_value + 1) // 2, pad_value // 2), + ) + + return output + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + if self.use_conv: + downsample_input = self._downsample_2d(hidden_states, weight=self.Conv2d_0.weight, kernel=self.fir_kernel) + hidden_states = downsample_input + self.Conv2d_0.bias.reshape(1, -1, 1, 1) + else: + hidden_states = self._downsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) + + return hidden_states + + +# downsample/upsample layer used in k-upscaler, might be able to use FirDownsample2D/DirUpsample2D instead +class KDownsample2D(nn.Module): + r"""A 2D K-downsampling layer. + + Parameters: + pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use. + """ + + def __init__(self, pad_mode: str = "reflect"): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]]) + self.pad = kernel_1d.shape[1] // 2 - 1 + self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + inputs = F.pad(inputs, (self.pad,) * 4, self.pad_mode) + weight = inputs.new_zeros( + [ + inputs.shape[1], + inputs.shape[1], + self.kernel.shape[0], + self.kernel.shape[1], + ] + ) + indices = torch.arange(inputs.shape[1], device=inputs.device) + kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1) + weight[indices, indices] = kernel + return F.conv2d(inputs, weight, stride=2) + + +def downsample_2d( + hidden_states: torch.FloatTensor, + kernel: Optional[torch.FloatTensor] = None, + factor: int = 2, + gain: float = 1, +) -> torch.FloatTensor: + r"""Downsample2D a batch of 2D images with the given filter. + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the + given filter. 
The filter is normalized so that if the input pixels are constant, they will be scaled by the + specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its + shape is a multiple of the downsampling factor. + + Args: + hidden_states (`torch.FloatTensor`) + Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + kernel (`torch.FloatTensor`, *optional*): + FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which + corresponds to average pooling. + factor (`int`, *optional*, default to `2`): + Integer downsampling factor. + gain (`float`, *optional*, default to `1.0`): + Scaling factor for signal magnitude. + + Returns: + output (`torch.FloatTensor`): + Tensor of the shape `[N, C, H // factor, W // factor]` + """ + + assert isinstance(factor, int) and factor >= 1 + if kernel is None: + kernel = [1] * factor + + kernel = torch.tensor(kernel, dtype=torch.float32) + if kernel.ndim == 1: + kernel = torch.outer(kernel, kernel) + kernel /= torch.sum(kernel) + + kernel = kernel * gain + pad_value = kernel.shape[0] - factor + output = upfirdn2d_native( + hidden_states, + kernel.to(device=hidden_states.device), + down=factor, + pad=((pad_value + 1) // 2, pad_value // 2), + ) + return output diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/dual_transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/dual_transformer_2d.py new file mode 100644 index 000000000..b8e40f14d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/dual_transformer_2d.py @@ -0,0 +1,20 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ..utils import deprecate +from .transformers.dual_transformer_2d import DualTransformer2DModel + + +class DualTransformer2DModel(DualTransformer2DModel): + deprecation_message = "Importing `DualTransformer2DModel` from `diffusers.models.dual_transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.dual_transformer_2d import DualTransformer2DModel`, instead." + deprecate("DualTransformer2DModel", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings.py new file mode 100644 index 000000000..c15ff24cb --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings.py @@ -0,0 +1,914 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn + +from ..utils import deprecate +from .activations import get_activation +from .attention_processor import Attention + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the + embeddings. :return: an [N x dim] Tensor of positional embeddings. + """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def get_2d_sincos_pos_embed( + embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16 +): + """ + grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_size = (grid_size, grid_size) + + grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale + grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D) + """ 
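(Editor's aside, not part of the patch.) A brief usage sketch for `get_timestep_embedding` defined above; the embedding width and the `flip_sin_to_cos=True, downscale_freq_shift=0` settings are only illustrative, chosen to mirror a common UNet time-projection configuration:

import torch

timesteps = torch.tensor([0, 1, 10, 100])        # one (possibly fractional) timestep per sample
emb = get_timestep_embedding(
    timesteps,
    embedding_dim=320,
    flip_sin_to_cos=True,
    downscale_freq_shift=0,
)
# emb.shape == (4, 320); with flip_sin_to_cos=True the cosine half comes first.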
+ if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__( + self, + height=224, + width=224, + patch_size=16, + in_channels=3, + embed_dim=768, + layer_norm=False, + flatten=True, + bias=True, + interpolation_scale=1, + ): + super().__init__() + + num_patches = (height // patch_size) * (width // patch_size) + self.flatten = flatten + self.layer_norm = layer_norm + + self.proj = nn.Conv2d( + in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias + ) + if layer_norm: + self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6) + else: + self.norm = None + + self.patch_size = patch_size + # See: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161 + self.height, self.width = height // patch_size, width // patch_size + self.base_size = height // patch_size + self.interpolation_scale = interpolation_scale + pos_embed = get_2d_sincos_pos_embed( + embed_dim, int(num_patches**0.5), base_size=self.base_size, interpolation_scale=self.interpolation_scale + ) + self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False) + + def forward(self, latent): + height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size + + latent = self.proj(latent) + if self.flatten: + latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC + if self.layer_norm: + latent = self.norm(latent) + + # Interpolate positional embeddings if needed. 
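+        # (No learned weights are involved here: when the latent grid differs from the size the
+        # cached table was built for, a fresh sinusoidal table is generated below for the current
+        # height/width instead of reusing the registered buffer.)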
+ # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160) + if self.height != height or self.width != width: + pos_embed = get_2d_sincos_pos_embed( + embed_dim=self.pos_embed.shape[-1], + grid_size=(height, width), + base_size=self.base_size, + interpolation_scale=self.interpolation_scale, + ) + pos_embed = torch.from_numpy(pos_embed) + pos_embed = pos_embed.float().unsqueeze(0).to(latent.device) + else: + pos_embed = self.pos_embed + + return (latent + pos_embed).to(latent.dtype) + + +class TimestepEmbedding(nn.Module): + def __init__( + self, + in_channels: int, + time_embed_dim: int, + act_fn: str = "silu", + out_dim: int = None, + post_act_fn: Optional[str] = None, + cond_proj_dim=None, + sample_proj_bias=True, + ): + super().__init__() + linear_cls = nn.Linear + + self.linear_1 = linear_cls(in_channels, time_embed_dim, sample_proj_bias) + + if cond_proj_dim is not None: + self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) + else: + self.cond_proj = None + + self.act = get_activation(act_fn) + + if out_dim is not None: + time_embed_dim_out = out_dim + else: + time_embed_dim_out = time_embed_dim + self.linear_2 = linear_cls(time_embed_dim, time_embed_dim_out, sample_proj_bias) + + if post_act_fn is None: + self.post_act = None + else: + self.post_act = get_activation(post_act_fn) + + def forward(self, sample, condition=None): + if condition is not None: + sample = sample + self.cond_proj(condition) + sample = self.linear_1(sample) + + if self.act is not None: + sample = self.act(sample) + + sample = self.linear_2(sample) + + if self.post_act is not None: + sample = self.post_act(sample) + return sample + + +class Timesteps(nn.Module): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): + super().__init__() + self.num_channels = num_channels + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + + def forward(self, timesteps): + t_emb = get_timestep_embedding( + timesteps, + self.num_channels, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + ) + return t_emb + + +class GaussianFourierProjection(nn.Module): + """Gaussian Fourier embeddings for noise levels.""" + + def __init__( + self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False + ): + super().__init__() + self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False) + self.log = log + self.flip_sin_to_cos = flip_sin_to_cos + + if set_W_to_weight: + # to delete later + self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False) + + self.weight = self.W + + def forward(self, x): + if self.log: + x = torch.log(x) + + x_proj = x[:, None] * self.weight[None, :] * 2 * np.pi + + if self.flip_sin_to_cos: + out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1) + else: + out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1) + return out + + +class SinusoidalPositionalEmbedding(nn.Module): + """Apply positional information to a sequence of embeddings. + + Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to + them + + Args: + embed_dim: (int): Dimension of the positional embedding. 
+ max_seq_length: Maximum sequence length to apply positional embeddings + + """ + + def __init__(self, embed_dim: int, max_seq_length: int = 32): + super().__init__() + position = torch.arange(max_seq_length).unsqueeze(1) + div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim)) + pe = torch.zeros(1, max_seq_length, embed_dim) + pe[0, :, 0::2] = torch.sin(position * div_term) + pe[0, :, 1::2] = torch.cos(position * div_term) + self.register_buffer("pe", pe) + + def forward(self, x): + _, seq_length, _ = x.shape + x = x + self.pe[:, :seq_length] + return x + + +class ImagePositionalEmbeddings(nn.Module): + """ + Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the + height and width of the latent space. + + For more details, see figure 10 of the dall-e paper: https://arxiv.org/abs/2102.12092 + + For VQ-diffusion: + + Output vector embeddings are used as input for the transformer. + + Note that the vector embeddings for the transformer are different than the vector embeddings from the VQVAE. + + Args: + num_embed (`int`): + Number of embeddings for the latent pixels embeddings. + height (`int`): + Height of the latent image i.e. the number of height embeddings. + width (`int`): + Width of the latent image i.e. the number of width embeddings. + embed_dim (`int`): + Dimension of the produced vector embeddings. Used for the latent pixel, height, and width embeddings. + """ + + def __init__( + self, + num_embed: int, + height: int, + width: int, + embed_dim: int, + ): + super().__init__() + + self.height = height + self.width = width + self.num_embed = num_embed + self.embed_dim = embed_dim + + self.emb = nn.Embedding(self.num_embed, embed_dim) + self.height_emb = nn.Embedding(self.height, embed_dim) + self.width_emb = nn.Embedding(self.width, embed_dim) + + def forward(self, index): + emb = self.emb(index) + + height_emb = self.height_emb(torch.arange(self.height, device=index.device).view(1, self.height)) + + # 1 x H x D -> 1 x H x 1 x D + height_emb = height_emb.unsqueeze(2) + + width_emb = self.width_emb(torch.arange(self.width, device=index.device).view(1, self.width)) + + # 1 x W x D -> 1 x 1 x W x D + width_emb = width_emb.unsqueeze(1) + + pos_emb = height_emb + width_emb + + # 1 x H x W x D -> 1 x L xD + pos_emb = pos_emb.view(1, self.height * self.width, -1) + + emb = emb + pos_emb[:, : emb.shape[1], :] + + return emb + + +class LabelEmbedding(nn.Module): + """ + Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. + + Args: + num_classes (`int`): The number of classes. + hidden_size (`int`): The size of the vector embeddings. + dropout_prob (`float`): The probability of dropping a label. + """ + + def __init__(self, num_classes, hidden_size, dropout_prob): + super().__init__() + use_cfg_embedding = dropout_prob > 0 + self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size) + self.num_classes = num_classes + self.dropout_prob = dropout_prob + + def token_drop(self, labels, force_drop_ids=None): + """ + Drops labels to enable classifier-free guidance. 
+ """ + if force_drop_ids is None: + drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob + else: + drop_ids = torch.tensor(force_drop_ids == 1) + labels = torch.where(drop_ids, self.num_classes, labels) + return labels + + def forward(self, labels: torch.LongTensor, force_drop_ids=None): + use_dropout = self.dropout_prob > 0 + if (self.training and use_dropout) or (force_drop_ids is not None): + labels = self.token_drop(labels, force_drop_ids) + embeddings = self.embedding_table(labels) + return embeddings + + +class TextImageProjection(nn.Module): + def __init__( + self, + text_embed_dim: int = 1024, + image_embed_dim: int = 768, + cross_attention_dim: int = 768, + num_image_text_embeds: int = 10, + ): + super().__init__() + + self.num_image_text_embeds = num_image_text_embeds + self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) + self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim) + + def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + batch_size = text_embeds.shape[0] + + # image + image_text_embeds = self.image_embeds(image_embeds) + image_text_embeds = image_text_embeds.reshape(batch_size, self.num_image_text_embeds, -1) + + # text + text_embeds = self.text_proj(text_embeds) + + return torch.cat([image_text_embeds, text_embeds], dim=1) + + +class ImageProjection(nn.Module): + def __init__( + self, + image_embed_dim: int = 768, + cross_attention_dim: int = 768, + num_image_text_embeds: int = 32, + ): + super().__init__() + + self.num_image_text_embeds = num_image_text_embeds + self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds: torch.FloatTensor): + batch_size = image_embeds.shape[0] + + # image + image_embeds = self.image_embeds(image_embeds) + image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1) + image_embeds = self.norm(image_embeds) + return image_embeds + + +class IPAdapterFullImageProjection(nn.Module): + def __init__(self, image_embed_dim=1024, cross_attention_dim=1024): + super().__init__() + from .attention import FeedForward + + self.ff = FeedForward(image_embed_dim, cross_attention_dim, mult=1, activation_fn="gelu") + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds: torch.FloatTensor): + return self.norm(self.ff(image_embeds)) + + +class CombinedTimestepLabelEmbeddings(nn.Module): + def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): + super().__init__() + + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=1) + self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + self.class_embedder = LabelEmbedding(num_classes, embedding_dim, class_dropout_prob) + + def forward(self, timestep, class_labels, hidden_dtype=None): + timesteps_proj = self.time_proj(timestep) + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D) + + class_labels = self.class_embedder(class_labels) # (N, D) + + conditioning = timesteps_emb + class_labels # (N, D) + + return conditioning + + +class TextTimeEmbedding(nn.Module): + def __init__(self, encoder_dim: int, time_embed_dim: int, num_heads: int = 64): + super().__init__() + self.norm1 = nn.LayerNorm(encoder_dim) + self.pool = AttentionPooling(num_heads, encoder_dim) + self.proj = nn.Linear(encoder_dim, time_embed_dim) + 
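+        # forward() applies these in order: LayerNorm -> attention pooling over the token axis ->
+        # projection to the time-embedding width -> the final LayerNorm defined below.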
self.norm2 = nn.LayerNorm(time_embed_dim) + + def forward(self, hidden_states): + hidden_states = self.norm1(hidden_states) + hidden_states = self.pool(hidden_states) + hidden_states = self.proj(hidden_states) + hidden_states = self.norm2(hidden_states) + return hidden_states + + +class TextImageTimeEmbedding(nn.Module): + def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.text_proj = nn.Linear(text_embed_dim, time_embed_dim) + self.text_norm = nn.LayerNorm(time_embed_dim) + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + + def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + # text + time_text_embeds = self.text_proj(text_embeds) + time_text_embeds = self.text_norm(time_text_embeds) + + # image + time_image_embeds = self.image_proj(image_embeds) + + return time_image_embeds + time_text_embeds + + +class ImageTimeEmbedding(nn.Module): + def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + self.image_norm = nn.LayerNorm(time_embed_dim) + + def forward(self, image_embeds: torch.FloatTensor): + # image + time_image_embeds = self.image_proj(image_embeds) + time_image_embeds = self.image_norm(time_image_embeds) + return time_image_embeds + + +class ImageHintTimeEmbedding(nn.Module): + def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + self.image_norm = nn.LayerNorm(time_embed_dim) + self.input_hint_block = nn.Sequential( + nn.Conv2d(3, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(256, 4, 3, padding=1), + ) + + def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor): + # image + time_image_embeds = self.image_proj(image_embeds) + time_image_embeds = self.image_norm(time_image_embeds) + hint = self.input_hint_block(hint) + return time_image_embeds, hint + + +class AttentionPooling(nn.Module): + # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 + + def __init__(self, num_heads, embed_dim, dtype=None): + super().__init__() + self.dtype = dtype + self.positional_embedding = nn.Parameter(torch.randn(1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.q_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.v_proj = nn.Linear(embed_dim, embed_dim, dtype=self.dtype) + self.num_heads = num_heads + self.dim_per_head = embed_dim // self.num_heads + + def forward(self, x): + bs, length, width = x.size() + + def shape(x): + # (bs, length, width) --> (bs, length, n_heads, dim_per_head) + x = x.view(bs, -1, self.num_heads, self.dim_per_head) + # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) + x = x.transpose(1, 2) + # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head) + x = x.reshape(bs * self.num_heads, -1, self.dim_per_head) + # (bs*n_heads, length, dim_per_head) --> (bs*n_heads, dim_per_head, length) + x = x.transpose(1, 2) + return x + + 
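+        # Mean-pool the sequence into a single query token, add the learned positional embedding,
+        # then attend from that token to the token-prepended sequence; the slice a[:, 0, :]
+        # returned below is one pooled vector of size `width` per sample.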
class_token = x.mean(dim=1, keepdim=True) + self.positional_embedding.to(x.dtype) + x = torch.cat([class_token, x], dim=1) # (bs, length+1, width) + + # (bs*n_heads, class_token_length, dim_per_head) + q = shape(self.q_proj(class_token)) + # (bs*n_heads, length+class_token_length, dim_per_head) + k = shape(self.k_proj(x)) + v = shape(self.v_proj(x)) + + # (bs*n_heads, class_token_length, length+class_token_length): + scale = 1 / math.sqrt(math.sqrt(self.dim_per_head)) + weight = torch.einsum("bct,bcs->bts", q * scale, k * scale) # More stable with f16 than dividing afterwards + weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype) + + # (bs*n_heads, dim_per_head, class_token_length) + a = torch.einsum("bts,bcs->bct", weight, v) + + # (bs, length+1, width) + a = a.reshape(bs, -1, 1).transpose(1, 2) + + return a[:, 0, :] # cls_token + + +def get_fourier_embeds_from_boundingbox(embed_dim, box): + """ + Args: + embed_dim: int + box: a 3-D tensor [B x N x 4] representing the bounding boxes for GLIGEN pipeline + Returns: + [B x N x embed_dim] tensor of positional embeddings + """ + + batch_size, num_boxes = box.shape[:2] + + emb = 100 ** (torch.arange(embed_dim) / embed_dim) + emb = emb[None, None, None].to(device=box.device, dtype=box.dtype) + emb = emb * box.unsqueeze(-1) + + emb = torch.stack((emb.sin(), emb.cos()), dim=-1) + emb = emb.permute(0, 1, 3, 4, 2).reshape(batch_size, num_boxes, embed_dim * 2 * 4) + + return emb + + +class GLIGENTextBoundingboxProjection(nn.Module): + def __init__(self, positive_len, out_dim, feature_type="text-only", fourier_freqs=8): + super().__init__() + self.positive_len = positive_len + self.out_dim = out_dim + + self.fourier_embedder_dim = fourier_freqs + self.position_dim = fourier_freqs * 2 * 4 # 2: sin/cos, 4: xyxy + + if isinstance(out_dim, tuple): + out_dim = out_dim[0] + + if feature_type == "text-only": + self.linears = nn.Sequential( + nn.Linear(self.positive_len + self.position_dim, 512), + nn.SiLU(), + nn.Linear(512, 512), + nn.SiLU(), + nn.Linear(512, out_dim), + ) + self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) + + elif feature_type == "text-image": + self.linears_text = nn.Sequential( + nn.Linear(self.positive_len + self.position_dim, 512), + nn.SiLU(), + nn.Linear(512, 512), + nn.SiLU(), + nn.Linear(512, out_dim), + ) + self.linears_image = nn.Sequential( + nn.Linear(self.positive_len + self.position_dim, 512), + nn.SiLU(), + nn.Linear(512, 512), + nn.SiLU(), + nn.Linear(512, out_dim), + ) + self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) + self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) + + self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim])) + + def forward( + self, + boxes, + masks, + positive_embeddings=None, + phrases_masks=None, + image_masks=None, + phrases_embeddings=None, + image_embeddings=None, + ): + masks = masks.unsqueeze(-1) + + # embedding position (it may includes padding as placeholder) + xyxy_embedding = get_fourier_embeds_from_boundingbox(self.fourier_embedder_dim, boxes) # B*N*4 -> B*N*C + + # learnable null embedding + xyxy_null = self.null_position_feature.view(1, 1, -1) + + # replace padding with learnable null embedding + xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null + + # positionet with text only information + if positive_embeddings is not None: + # learnable null embedding + positive_null = self.null_positive_feature.view(1, 1, -1) + + # replace padding with 
learnable null embedding + positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null + + objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1)) + + # positionet with text and image infomation + else: + phrases_masks = phrases_masks.unsqueeze(-1) + image_masks = image_masks.unsqueeze(-1) + + # learnable null embedding + text_null = self.null_text_feature.view(1, 1, -1) + image_null = self.null_image_feature.view(1, 1, -1) + + # replace padding with learnable null embedding + phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null + image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null + + objs_text = self.linears_text(torch.cat([phrases_embeddings, xyxy_embedding], dim=-1)) + objs_image = self.linears_image(torch.cat([image_embeddings, xyxy_embedding], dim=-1)) + objs = torch.cat([objs_text, objs_image], dim=1) + + return objs + + +class PixArtAlphaCombinedTimestepSizeEmbeddings(nn.Module): + """ + For PixArt-Alpha. + + Reference: + https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29 + """ + + def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False): + super().__init__() + + self.outdim = size_emb_dim + self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0) + self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim) + + self.use_additional_conditions = use_additional_conditions + if use_additional_conditions: + self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0) + self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim) + self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim) + + def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype): + timesteps_proj = self.time_proj(timestep) + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D) + + if self.use_additional_conditions: + resolution_emb = self.additional_condition_proj(resolution.flatten()).to(hidden_dtype) + resolution_emb = self.resolution_embedder(resolution_emb).reshape(batch_size, -1) + aspect_ratio_emb = self.additional_condition_proj(aspect_ratio.flatten()).to(hidden_dtype) + aspect_ratio_emb = self.aspect_ratio_embedder(aspect_ratio_emb).reshape(batch_size, -1) + conditioning = timesteps_emb + torch.cat([resolution_emb, aspect_ratio_emb], dim=1) + else: + conditioning = timesteps_emb + + return conditioning + + +class PixArtAlphaTextProjection(nn.Module): + """ + Projects caption embeddings. Also handles dropout for classifier-free guidance. + + Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py + """ + + def __init__(self, in_features, hidden_size, num_tokens=120): + super().__init__() + self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True) + self.act_1 = nn.GELU(approximate="tanh") + self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True) + + def forward(self, caption): + hidden_states = self.linear_1(caption) + hidden_states = self.act_1(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class IPAdapterPlusImageProjection(nn.Module): + """Resampler of IP-Adapter Plus. 
+ + Args: + ---- + embed_dims (int): The feature dimension. Defaults to 768. + output_dims (int): The number of output channels, that is the same + number of the channels in the + `unet.config.cross_attention_dim`. Defaults to 1024. + hidden_dims (int): The number of hidden channels. Defaults to 1280. + depth (int): The number of blocks. Defaults to 8. + dim_head (int): The number of head channels. Defaults to 64. + heads (int): Parallel attention heads. Defaults to 16. + num_queries (int): The number of queries. Defaults to 8. + ffn_ratio (float): The expansion ratio of feedforward network hidden + layer channels. Defaults to 4. + """ + + def __init__( + self, + embed_dims: int = 768, + output_dims: int = 1024, + hidden_dims: int = 1280, + depth: int = 4, + dim_head: int = 64, + heads: int = 16, + num_queries: int = 8, + ffn_ratio: float = 4, + ) -> None: + super().__init__() + from .attention import FeedForward # Lazy import to avoid circular import + + self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dims) / hidden_dims**0.5) + + self.proj_in = nn.Linear(embed_dims, hidden_dims) + + self.proj_out = nn.Linear(hidden_dims, output_dims) + self.norm_out = nn.LayerNorm(output_dims) + + self.layers = nn.ModuleList([]) + for _ in range(depth): + self.layers.append( + nn.ModuleList( + [ + nn.LayerNorm(hidden_dims), + nn.LayerNorm(hidden_dims), + Attention( + query_dim=hidden_dims, + dim_head=dim_head, + heads=heads, + out_bias=False, + ), + nn.Sequential( + nn.LayerNorm(hidden_dims), + FeedForward(hidden_dims, hidden_dims, activation_fn="gelu", mult=ffn_ratio, bias=False), + ), + ] + ) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass. + + Args: + ---- + x (torch.Tensor): Input Tensor. + + Returns: + ------- + torch.Tensor: Output Tensor. + """ + latents = self.latents.repeat(x.size(0), 1, 1) + + x = self.proj_in(x) + + for ln0, ln1, attn, ff in self.layers: + residual = latents + + encoder_hidden_states = ln0(x) + latents = ln1(latents) + encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2) + latents = attn(latents, encoder_hidden_states) + residual + latents = ff(latents) + latents + + latents = self.proj_out(latents) + return self.norm_out(latents) + + +class MultiIPAdapterImageProjection(nn.Module): + def __init__(self, IPAdapterImageProjectionLayers: Union[List[nn.Module], Tuple[nn.Module]]): + super().__init__() + self.image_projection_layers = nn.ModuleList(IPAdapterImageProjectionLayers) + + def forward(self, image_embeds: List[torch.FloatTensor]): + projected_image_embeds = [] + + # currently, we accept `image_embeds` as + # 1. a tensor (deprecated) with shape [batch_size, embed_dim] or [batch_size, sequence_length, embed_dim] + # 2. list of `n` tensors where `n` is number of ip-adapters, each tensor can hae shape [batch_size, num_images, embed_dim] or [batch_size, num_images, sequence_length, embed_dim] + if not isinstance(image_embeds, list): + deprecation_message = ( + "You have passed a tensor as `image_embeds`.This is deprecated and will be removed in a future release." + " Please make sure to update your script to pass `image_embeds` as a list of tensors to supress this warning." 
+ ) + deprecate("image_embeds not a list", "1.0.0", deprecation_message, standard_warn=False) + image_embeds = [image_embeds.unsqueeze(1)] + + if len(image_embeds) != len(self.image_projection_layers): + raise ValueError( + f"image_embeds must have the same length as image_projection_layers, got {len(image_embeds)} and {len(self.image_projection_layers)}" + ) + + for image_embed, image_projection_layer in zip(image_embeds, self.image_projection_layers): + batch_size, num_images = image_embed.shape[0], image_embed.shape[1] + image_embed = image_embed.reshape((batch_size * num_images,) + image_embed.shape[2:]) + image_embed = image_projection_layer(image_embed) + image_embed = image_embed.reshape((batch_size, num_images) + image_embed.shape[1:]) + + projected_image_embeds.append(image_embed) + + return projected_image_embeds diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings_flax.py new file mode 100644 index 000000000..8e343be0d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings_flax.py @@ -0,0 +1,97 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import flax.linen as nn +import jax.numpy as jnp + + +def get_sinusoidal_embeddings( + timesteps: jnp.ndarray, + embedding_dim: int, + freq_shift: float = 1, + min_timescale: float = 1, + max_timescale: float = 1.0e4, + flip_sin_to_cos: bool = False, + scale: float = 1.0, +) -> jnp.ndarray: + """Returns the positional encoding (same as Tensor2Tensor). + + Args: + timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + embedding_dim: The number of output channels. + min_timescale: The smallest time unit (should probably be 0.0). + max_timescale: The largest time unit. + Returns: + a Tensor of timing signals [N, num_channels] + """ + assert timesteps.ndim == 1, "Timesteps should be a 1d-array" + assert embedding_dim % 2 == 0, f"Embedding dimension {embedding_dim} should be even" + num_timescales = float(embedding_dim // 2) + log_timescale_increment = math.log(max_timescale / min_timescale) / (num_timescales - freq_shift) + inv_timescales = min_timescale * jnp.exp(jnp.arange(num_timescales, dtype=jnp.float32) * -log_timescale_increment) + emb = jnp.expand_dims(timesteps, 1) * jnp.expand_dims(inv_timescales, 0) + + # scale embeddings + scaled_time = scale * emb + + if flip_sin_to_cos: + signal = jnp.concatenate([jnp.cos(scaled_time), jnp.sin(scaled_time)], axis=1) + else: + signal = jnp.concatenate([jnp.sin(scaled_time), jnp.cos(scaled_time)], axis=1) + signal = jnp.reshape(signal, [jnp.shape(timesteps)[0], embedding_dim]) + return signal + + +class FlaxTimestepEmbedding(nn.Module): + r""" + Time step Embedding Module. Learns embeddings for input time steps. 
+ + Args: + time_embed_dim (`int`, *optional*, defaults to `32`): + Time step embedding dimension + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + time_embed_dim: int = 32 + dtype: jnp.dtype = jnp.float32 + + @nn.compact + def __call__(self, temb): + temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_1")(temb) + temb = nn.silu(temb) + temb = nn.Dense(self.time_embed_dim, dtype=self.dtype, name="linear_2")(temb) + return temb + + +class FlaxTimesteps(nn.Module): + r""" + Wrapper Module for sinusoidal Time step Embeddings as described in https://arxiv.org/abs/2006.11239 + + Args: + dim (`int`, *optional*, defaults to `32`): + Time step embedding dimension + """ + + dim: int = 32 + flip_sin_to_cos: bool = False + freq_shift: float = 1 + + @nn.compact + def __call__(self, timesteps): + return get_sinusoidal_embeddings( + timesteps, embedding_dim=self.dim, flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.freq_shift + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/lora.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/lora.py new file mode 100644 index 000000000..4e9e0c07c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/lora.py @@ -0,0 +1,457 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
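(Editor's aside, not part of the patch.) Every layer in this file implements the same low-rank update, y = W x + scale * up(down(x)), with the `up` projection zero-initialized so that training starts from the unmodified base layer. A minimal standalone sketch of that idea, using generic `nn.Linear` modules rather than the deprecated classes defined below:

import torch
from torch import nn

base = nn.Linear(64, 64)
rank = 4
down = nn.Linear(64, rank, bias=False)
up = nn.Linear(rank, 64, bias=False)
nn.init.normal_(down.weight, std=1 / rank)
nn.init.zeros_(up.weight)                   # zero init: the LoRA branch starts as a no-op

x = torch.randn(2, 64)
scale = 1.0
y = base(x) + scale * up(down(x))
assert torch.allclose(y, base(x))           # holds only while up.weight is still all zeros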
+ + +# IMPORTANT: # +################################################################### +# ----------------------------------------------------------------# +# This file is deprecated and will be removed soon # +# (as soon as PEFT will become a required dependency for LoRA) # +# ----------------------------------------------------------------# +################################################################### + +from typing import Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ..utils import deprecate, logging +from ..utils.import_utils import is_transformers_available + + +if is_transformers_available(): + from transformers import CLIPTextModel, CLIPTextModelWithProjection + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def text_encoder_attn_modules(text_encoder): + attn_modules = [] + + if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): + for i, layer in enumerate(text_encoder.text_model.encoder.layers): + name = f"text_model.encoder.layers.{i}.self_attn" + mod = layer.self_attn + attn_modules.append((name, mod)) + else: + raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}") + + return attn_modules + + +def text_encoder_mlp_modules(text_encoder): + mlp_modules = [] + + if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)): + for i, layer in enumerate(text_encoder.text_model.encoder.layers): + mlp_mod = layer.mlp + name = f"text_model.encoder.layers.{i}.mlp" + mlp_modules.append((name, mlp_mod)) + else: + raise ValueError(f"do not know how to get mlp modules for: {text_encoder.__class__.__name__}") + + return mlp_modules + + +def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0): + for _, attn_module in text_encoder_attn_modules(text_encoder): + if isinstance(attn_module.q_proj, PatchedLoraProjection): + attn_module.q_proj.lora_scale = lora_scale + attn_module.k_proj.lora_scale = lora_scale + attn_module.v_proj.lora_scale = lora_scale + attn_module.out_proj.lora_scale = lora_scale + + for _, mlp_module in text_encoder_mlp_modules(text_encoder): + if isinstance(mlp_module.fc1, PatchedLoraProjection): + mlp_module.fc1.lora_scale = lora_scale + mlp_module.fc2.lora_scale = lora_scale + + +class PatchedLoraProjection(torch.nn.Module): + def __init__(self, regular_linear_layer, lora_scale=1, network_alpha=None, rank=4, dtype=None): + deprecation_message = "Use of `PatchedLoraProjection` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." 
+ deprecate("PatchedLoraProjection", "1.0.0", deprecation_message) + + super().__init__() + from ..models.lora import LoRALinearLayer + + self.regular_linear_layer = regular_linear_layer + + device = self.regular_linear_layer.weight.device + + if dtype is None: + dtype = self.regular_linear_layer.weight.dtype + + self.lora_linear_layer = LoRALinearLayer( + self.regular_linear_layer.in_features, + self.regular_linear_layer.out_features, + network_alpha=network_alpha, + device=device, + dtype=dtype, + rank=rank, + ) + + self.lora_scale = lora_scale + + # overwrite PyTorch's `state_dict` to be sure that only the 'regular_linear_layer' weights are saved + # when saving the whole text encoder model and when LoRA is unloaded or fused + def state_dict(self, *args, destination=None, prefix="", keep_vars=False): + if self.lora_linear_layer is None: + return self.regular_linear_layer.state_dict( + *args, destination=destination, prefix=prefix, keep_vars=keep_vars + ) + + return super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars) + + def _fuse_lora(self, lora_scale=1.0, safe_fusing=False): + if self.lora_linear_layer is None: + return + + dtype, device = self.regular_linear_layer.weight.data.dtype, self.regular_linear_layer.weight.data.device + + w_orig = self.regular_linear_layer.weight.data.float() + w_up = self.lora_linear_layer.up.weight.data.float() + w_down = self.lora_linear_layer.down.weight.data.float() + + if self.lora_linear_layer.network_alpha is not None: + w_up = w_up * self.lora_linear_layer.network_alpha / self.lora_linear_layer.rank + + fused_weight = w_orig + (lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0]) + + if safe_fusing and torch.isnan(fused_weight).any().item(): + raise ValueError( + "This LoRA weight seems to be broken. " + f"Encountered NaN values when trying to fuse LoRA weights for {self}." + "LoRA weights will not be fused." + ) + + self.regular_linear_layer.weight.data = fused_weight.to(device=device, dtype=dtype) + + # we can drop the lora layer now + self.lora_linear_layer = None + + # offload the up and down matrices to CPU to not blow the memory + self.w_up = w_up.cpu() + self.w_down = w_down.cpu() + self.lora_scale = lora_scale + + def _unfuse_lora(self): + if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None): + return + + fused_weight = self.regular_linear_layer.weight.data + dtype, device = fused_weight.dtype, fused_weight.device + + w_up = self.w_up.to(device=device).float() + w_down = self.w_down.to(device).float() + + unfused_weight = fused_weight.float() - (self.lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0]) + self.regular_linear_layer.weight.data = unfused_weight.to(device=device, dtype=dtype) + + self.w_up = None + self.w_down = None + + def forward(self, input): + if self.lora_scale is None: + self.lora_scale = 1.0 + if self.lora_linear_layer is None: + return self.regular_linear_layer(input) + return self.regular_linear_layer(input) + (self.lora_scale * self.lora_linear_layer(input)) + + +class LoRALinearLayer(nn.Module): + r""" + A linear layer that is used with LoRA. + + Parameters: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + rank (`int`, `optional`, defaults to 4): + The rank of the LoRA layer. + network_alpha (`float`, `optional`, defaults to `None`): + The value of the network alpha used for stable learning and preventing underflow. 
This value has the same + meaning as the `--network_alpha` option in the kohya-ss trainer script. See + https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + device (`torch.device`, `optional`, defaults to `None`): + The device to use for the layer's weights. + dtype (`torch.dtype`, `optional`, defaults to `None`): + The dtype to use for the layer's weights. + """ + + def __init__( + self, + in_features: int, + out_features: int, + rank: int = 4, + network_alpha: Optional[float] = None, + device: Optional[Union[torch.device, str]] = None, + dtype: Optional[torch.dtype] = None, + ): + super().__init__() + + deprecation_message = "Use of `LoRALinearLayer` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." + deprecate("LoRALinearLayer", "1.0.0", deprecation_message) + + self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype) + self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype) + # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. + # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + self.network_alpha = network_alpha + self.rank = rank + self.out_features = out_features + self.in_features = in_features + + nn.init.normal_(self.down.weight, std=1 / rank) + nn.init.zeros_(self.up.weight) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + orig_dtype = hidden_states.dtype + dtype = self.down.weight.dtype + + down_hidden_states = self.down(hidden_states.to(dtype)) + up_hidden_states = self.up(down_hidden_states) + + if self.network_alpha is not None: + up_hidden_states *= self.network_alpha / self.rank + + return up_hidden_states.to(orig_dtype) + + +class LoRAConv2dLayer(nn.Module): + r""" + A convolutional layer that is used with LoRA. + + Parameters: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + rank (`int`, `optional`, defaults to 4): + The rank of the LoRA layer. + kernel_size (`int` or `tuple` of two `int`, `optional`, defaults to 1): + The kernel size of the convolution. + stride (`int` or `tuple` of two `int`, `optional`, defaults to 1): + The stride of the convolution. + padding (`int` or `tuple` of two `int` or `str`, `optional`, defaults to 0): + The padding of the convolution. + network_alpha (`float`, `optional`, defaults to `None`): + The value of the network alpha used for stable learning and preventing underflow. This value has the same + meaning as the `--network_alpha` option in the kohya-ss trainer script. See + https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + """ + + def __init__( + self, + in_features: int, + out_features: int, + rank: int = 4, + kernel_size: Union[int, Tuple[int, int]] = (1, 1), + stride: Union[int, Tuple[int, int]] = (1, 1), + padding: Union[int, Tuple[int, int], str] = 0, + network_alpha: Optional[float] = None, + ): + super().__init__() + + deprecation_message = "Use of `LoRAConv2dLayer` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." 
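(Editor's aside, not part of the patch.) The `_fuse_lora` helpers in this file fold the low-rank product into the base weight, W_fused = W + lora_scale * (w_up @ w_down), so the extra matmul disappears at inference time. A small check of that equivalence for the linear case, using plain `nn.Linear` modules as stand-ins:

import torch
from torch import nn

linear = nn.Linear(32, 32, bias=False)
down = nn.Linear(32, 4, bias=False)          # rank-4 LoRA branch
up = nn.Linear(4, 32, bias=False)            # default init is non-zero, so the check is non-trivial

x = torch.randn(5, 32)
lora_scale = 0.5
separate = linear(x) + lora_scale * up(down(x))

# Folding the low-rank product into the base weight gives the same output.
fused = nn.Linear(32, 32, bias=False)
fused.weight.data = linear.weight.data + lora_scale * (up.weight.data @ down.weight.data)
torch.testing.assert_close(separate, fused(x))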
+ deprecate("LoRAConv2dLayer", "1.0.0", deprecation_message) + + self.down = nn.Conv2d(in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias=False) + # according to the official kohya_ss trainer kernel_size are always fixed for the up layer + # # see: https://github.com/bmaltais/kohya_ss/blob/2accb1305979ba62f5077a23aabac23b4c37e935/networks/lora_diffusers.py#L129 + self.up = nn.Conv2d(rank, out_features, kernel_size=(1, 1), stride=(1, 1), bias=False) + + # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. + # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + self.network_alpha = network_alpha + self.rank = rank + + nn.init.normal_(self.down.weight, std=1 / rank) + nn.init.zeros_(self.up.weight) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + orig_dtype = hidden_states.dtype + dtype = self.down.weight.dtype + + down_hidden_states = self.down(hidden_states.to(dtype)) + up_hidden_states = self.up(down_hidden_states) + + if self.network_alpha is not None: + up_hidden_states *= self.network_alpha / self.rank + + return up_hidden_states.to(orig_dtype) + + +class LoRACompatibleConv(nn.Conv2d): + """ + A convolutional layer that can be used with LoRA. + """ + + def __init__(self, *args, lora_layer: Optional[LoRAConv2dLayer] = None, **kwargs): + deprecation_message = "Use of `LoRACompatibleConv` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." + deprecate("LoRACompatibleConv", "1.0.0", deprecation_message) + + super().__init__(*args, **kwargs) + self.lora_layer = lora_layer + + def set_lora_layer(self, lora_layer: Optional[LoRAConv2dLayer]): + deprecation_message = "Use of `set_lora_layer()` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." + deprecate("set_lora_layer", "1.0.0", deprecation_message) + + self.lora_layer = lora_layer + + def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False): + if self.lora_layer is None: + return + + dtype, device = self.weight.data.dtype, self.weight.data.device + + w_orig = self.weight.data.float() + w_up = self.lora_layer.up.weight.data.float() + w_down = self.lora_layer.down.weight.data.float() + + if self.lora_layer.network_alpha is not None: + w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank + + fusion = torch.mm(w_up.flatten(start_dim=1), w_down.flatten(start_dim=1)) + fusion = fusion.reshape((w_orig.shape)) + fused_weight = w_orig + (lora_scale * fusion) + + if safe_fusing and torch.isnan(fused_weight).any().item(): + raise ValueError( + "This LoRA weight seems to be broken. " + f"Encountered NaN values when trying to fuse LoRA weights for {self}." + "LoRA weights will not be fused." 
+ ) + + self.weight.data = fused_weight.to(device=device, dtype=dtype) + + # we can drop the lora layer now + self.lora_layer = None + + # offload the up and down matrices to CPU to not blow the memory + self.w_up = w_up.cpu() + self.w_down = w_down.cpu() + self._lora_scale = lora_scale + + def _unfuse_lora(self): + if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None): + return + + fused_weight = self.weight.data + dtype, device = fused_weight.data.dtype, fused_weight.data.device + + self.w_up = self.w_up.to(device=device).float() + self.w_down = self.w_down.to(device).float() + + fusion = torch.mm(self.w_up.flatten(start_dim=1), self.w_down.flatten(start_dim=1)) + fusion = fusion.reshape((fused_weight.shape)) + unfused_weight = fused_weight.float() - (self._lora_scale * fusion) + self.weight.data = unfused_weight.to(device=device, dtype=dtype) + + self.w_up = None + self.w_down = None + + def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor: + if self.padding_mode != "zeros": + hidden_states = F.pad(hidden_states, self._reversed_padding_repeated_twice, mode=self.padding_mode) + padding = (0, 0) + else: + padding = self.padding + + original_outputs = F.conv2d( + hidden_states, self.weight, self.bias, self.stride, padding, self.dilation, self.groups + ) + + if self.lora_layer is None: + return original_outputs + else: + return original_outputs + (scale * self.lora_layer(hidden_states)) + + +class LoRACompatibleLinear(nn.Linear): + """ + A Linear layer that can be used with LoRA. + """ + + def __init__(self, *args, lora_layer: Optional[LoRALinearLayer] = None, **kwargs): + deprecation_message = "Use of `LoRACompatibleLinear` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." + deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message) + + super().__init__(*args, **kwargs) + self.lora_layer = lora_layer + + def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]): + deprecation_message = "Use of `set_lora_layer()` is deprecated. Please switch to PEFT backend by installing PEFT: `pip install peft`." + deprecate("set_lora_layer", "1.0.0", deprecation_message) + self.lora_layer = lora_layer + + def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False): + if self.lora_layer is None: + return + + dtype, device = self.weight.data.dtype, self.weight.data.device + + w_orig = self.weight.data.float() + w_up = self.lora_layer.up.weight.data.float() + w_down = self.lora_layer.down.weight.data.float() + + if self.lora_layer.network_alpha is not None: + w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank + + fused_weight = w_orig + (lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0]) + + if safe_fusing and torch.isnan(fused_weight).any().item(): + raise ValueError( + "This LoRA weight seems to be broken. " + f"Encountered NaN values when trying to fuse LoRA weights for {self}." + "LoRA weights will not be fused." 
+ ) + + self.weight.data = fused_weight.to(device=device, dtype=dtype) + + # we can drop the lora layer now + self.lora_layer = None + + # offload the up and down matrices to CPU to not blow the memory + self.w_up = w_up.cpu() + self.w_down = w_down.cpu() + self._lora_scale = lora_scale + + def _unfuse_lora(self): + if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None): + return + + fused_weight = self.weight.data + dtype, device = fused_weight.dtype, fused_weight.device + + w_up = self.w_up.to(device=device).float() + w_down = self.w_down.to(device).float() + + unfused_weight = fused_weight.float() - (self._lora_scale * torch.bmm(w_up[None, :], w_down[None, :])[0]) + self.weight.data = unfused_weight.to(device=device, dtype=dtype) + + self.w_up = None + self.w_down = None + + def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor: + if self.lora_layer is None: + out = super().forward(hidden_states) + return out + else: + out = super().forward(hidden_states) + (scale * self.lora_layer(hidden_states)) + return out diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py new file mode 100644 index 000000000..4a487133e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
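(Editor's aside, not part of the patch.) The conversion helpers defined just below rename PyTorch parameter keys to their Flax counterparts and transpose weights whose memory layout differs between the two frameworks. The core convention, sketched with NumPy only: PyTorch `nn.Linear` stores `weight` as `(out_features, in_features)`, while Flax `Dense` stores `kernel` as `(in_features, out_features)`, hence the transpose in `rename_key_and_reshape_tensor`; Conv2d weights likewise go from `(out, in, H, W)` to `(H, W, in, out)` via `transpose(2, 3, 1, 0)`.

import numpy as np

# PyTorch Linear computes y = x @ W.T + b, with W of shape (out_features, in_features).
W_pt = np.random.randn(8, 4).astype(np.float32)
x = np.random.randn(2, 4).astype(np.float32)

# Flax Dense computes y = x @ kernel + b, with kernel of shape (in_features, out_features),
# so the converted kernel is simply the transpose of the PyTorch weight.
kernel_flax = W_pt.T
np.testing.assert_allclose(x @ W_pt.T, x @ kernel_flax, rtol=1e-6)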
+""" PyTorch - Flax general utilities.""" +import re + +import jax.numpy as jnp +from flax.traverse_util import flatten_dict, unflatten_dict +from jax.random import PRNGKey + +from ..utils import logging + + +logger = logging.get_logger(__name__) + + +def rename_key(key): + regex = r"\w+[.]\d+" + pats = re.findall(regex, key) + for pat in pats: + key = key.replace(pat, "_".join(pat.split("."))) + return key + + +##################### +# PyTorch => Flax # +##################### + + +# Adapted from https://github.com/huggingface/transformers/blob/c603c80f46881ae18b2ca50770ef65fa4033eacd/src/transformers/modeling_flax_pytorch_utils.py#L69 +# and https://github.com/patil-suraj/stable-diffusion-jax/blob/main/stable_diffusion_jax/convert_diffusers_to_jax.py +def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict): + """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary""" + # conv norm or layer norm + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",) + + # rename attention layers + if len(pt_tuple_key) > 1: + for rename_from, rename_to in ( + ("to_out_0", "proj_attn"), + ("to_k", "key"), + ("to_v", "value"), + ("to_q", "query"), + ): + if pt_tuple_key[-2] == rename_from: + weight_name = pt_tuple_key[-1] + weight_name = "kernel" if weight_name == "weight" else weight_name + renamed_pt_tuple_key = pt_tuple_key[:-2] + (rename_to, weight_name) + if renamed_pt_tuple_key in random_flax_state_dict: + assert random_flax_state_dict[renamed_pt_tuple_key].shape == pt_tensor.T.shape + return renamed_pt_tuple_key, pt_tensor.T + + if ( + any("norm" in str_ for str_ in pt_tuple_key) + and (pt_tuple_key[-1] == "bias") + and (pt_tuple_key[:-1] + ("bias",) not in random_flax_state_dict) + and (pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict) + ): + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",) + return renamed_pt_tuple_key, pt_tensor + elif pt_tuple_key[-1] in ["weight", "gamma"] and pt_tuple_key[:-1] + ("scale",) in random_flax_state_dict: + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",) + return renamed_pt_tuple_key, pt_tensor + + # embedding + if pt_tuple_key[-1] == "weight" and pt_tuple_key[:-1] + ("embedding",) in random_flax_state_dict: + pt_tuple_key = pt_tuple_key[:-1] + ("embedding",) + return renamed_pt_tuple_key, pt_tensor + + # conv layer + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",) + if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4: + pt_tensor = pt_tensor.transpose(2, 3, 1, 0) + return renamed_pt_tuple_key, pt_tensor + + # linear layer + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",) + if pt_tuple_key[-1] == "weight": + pt_tensor = pt_tensor.T + return renamed_pt_tuple_key, pt_tensor + + # old PyTorch layer norm weight + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",) + if pt_tuple_key[-1] == "gamma": + return renamed_pt_tuple_key, pt_tensor + + # old PyTorch layer norm bias + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",) + if pt_tuple_key[-1] == "beta": + return renamed_pt_tuple_key, pt_tensor + + return pt_tuple_key, pt_tensor + + +def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model, init_key=42): + # Step 1: Convert pytorch tensor to numpy + pt_state_dict = {k: v.numpy() for k, v in pt_state_dict.items()} + + # Step 2: Since the model is stateless, get random Flax params + random_flax_params = flax_model.init_weights(PRNGKey(init_key)) + + random_flax_state_dict = flatten_dict(random_flax_params) + flax_state_dict = {} + + # Need to change some 
parameters name to match Flax names + for pt_key, pt_tensor in pt_state_dict.items(): + renamed_pt_key = rename_key(pt_key) + pt_tuple_key = tuple(renamed_pt_key.split(".")) + + # Correctly rename weight parameters + flax_key, flax_tensor = rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict) + + if flax_key in random_flax_state_dict: + if flax_tensor.shape != random_flax_state_dict[flax_key].shape: + raise ValueError( + f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape " + f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}." + ) + + # also add unexpected weight so that warning is thrown + flax_state_dict[flax_key] = jnp.asarray(flax_tensor) + + return unflatten_dict(flax_state_dict) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_utils.py new file mode 100644 index 000000000..1ddcda900 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_utils.py @@ -0,0 +1,566 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pickle import UnpicklingError +from typing import Any, Dict, Union + +import jax +import jax.numpy as jnp +import msgpack.exceptions +from flax.core.frozen_dict import FrozenDict, unfreeze +from flax.serialization import from_bytes, to_bytes +from flax.traverse_util import flatten_dict, unflatten_dict +from huggingface_hub import create_repo, hf_hub_download +from huggingface_hub.utils import ( + EntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, + validate_hf_hub_args, +) +from requests import HTTPError + +from .. import __version__, is_torch_available +from ..utils import ( + CONFIG_NAME, + FLAX_WEIGHTS_NAME, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + WEIGHTS_NAME, + PushToHubMixin, + logging, +) +from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax + + +logger = logging.get_logger(__name__) + + +class FlaxModelMixin(PushToHubMixin): + r""" + Base class for all Flax models. + + [`FlaxModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and + saving models. + + - **config_name** ([`str`]) -- Filename to save a model to when calling [`~FlaxModelMixin.save_pretrained`]. + """ + + config_name = CONFIG_NAME + _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] + _flax_internal_args = ["name", "parent", "dtype"] + + @classmethod + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. 
+ """ + return cls(config, **kwargs) + + def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any: + """ + Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`. + """ + + # taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27 + def conditional_cast(param): + if isinstance(param, jnp.ndarray) and jnp.issubdtype(param.dtype, jnp.floating): + param = param.astype(dtype) + return param + + if mask is None: + return jax.tree_map(conditional_cast, params) + + flat_params = flatten_dict(params) + flat_mask, _ = jax.tree_flatten(mask) + + for masked, key in zip(flat_mask, flat_params.keys()): + if masked: + param = flat_params[key] + flat_params[key] = conditional_cast(param) + + return unflatten_dict(flat_params) + + def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None): + r""" + Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast + the `params` in place. + + This method can be used on a TPU to explicitly convert the model parameters to bfloat16 precision to do full + half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed. + + Arguments: + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + mask (`Union[Dict, FrozenDict]`): + A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True` + for params you want to cast, and `False` for those you want to skip. + + Examples: + + ```python + >>> from diffusers import FlaxUNet2DConditionModel + + >>> # load model + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision + >>> params = model.to_bf16(params) + >>> # If you don't want to cast certain parameters (for example layer norm bias and scale) + >>> # then pass the mask as follows + >>> from flax import traverse_util + + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> flat_params = traverse_util.flatten_dict(params) + >>> mask = { + ... path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) + ... for path in flat_params + ... } + >>> mask = traverse_util.unflatten_dict(mask) + >>> params = model.to_bf16(params, mask) + ```""" + return self._cast_floating_to(params, jnp.bfloat16, mask) + + def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None): + r""" + Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the + model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place. + + Arguments: + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + mask (`Union[Dict, FrozenDict]`): + A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True` + for params you want to cast, and `False` for those you want to skip. 
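The `mask` argument described above selects which leaves of the parameter PyTree get cast. A minimal sketch of that idea, using `flatten_dict` directly on a hand-made params dict rather than the mixin (parameter names here are invented for illustration):

```python
# Sketch: cast everything except the "norm" parameters to bfloat16.
import jax.numpy as jnp
from flax.traverse_util import flatten_dict, unflatten_dict

params = {
    "dense": {"kernel": jnp.ones((4, 4)), "bias": jnp.zeros((4,))},
    "norm": {"scale": jnp.ones((4,)), "bias": jnp.zeros((4,))},
}

flat = flatten_dict(params)  # keys become tuples like ("dense", "kernel")
mask = {path: path[0] != "norm" for path in flat}
casted = unflatten_dict({
    path: leaf.astype(jnp.bfloat16) if mask[path] else leaf
    for path, leaf in flat.items()
})
```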
+ + Examples: + + ```python + >>> from diffusers import FlaxUNet2DConditionModel + + >>> # Download model and configuration from huggingface.co + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> # By default, the model params will be in fp32, to illustrate the use of this method, + >>> # we'll first cast to fp16 and back to fp32 + >>> params = model.to_f16(params) + >>> # now cast back to fp32 + >>> params = model.to_fp32(params) + ```""" + return self._cast_floating_to(params, jnp.float32, mask) + + def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None): + r""" + Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the + `params` in place. + + This method can be used on a GPU to explicitly convert the model parameters to float16 precision to do full + half-precision training or to save weights in float16 for inference in order to save memory and improve speed. + + Arguments: + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + mask (`Union[Dict, FrozenDict]`): + A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True` + for params you want to cast, and `False` for those you want to skip. + + Examples: + + ```python + >>> from diffusers import FlaxUNet2DConditionModel + + >>> # load model + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> # By default, the model params will be in fp32, to cast these to float16 + >>> params = model.to_fp16(params) + >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale) + >>> # then pass the mask as follows + >>> from flax import traverse_util + + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> flat_params = traverse_util.flatten_dict(params) + >>> mask = { + ... path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) + ... for path in flat_params + ... } + >>> mask = traverse_util.unflatten_dict(mask) + >>> params = model.to_fp16(params, mask) + ```""" + return self._cast_floating_to(params, jnp.float16, mask) + + def init_weights(self, rng: jax.Array) -> Dict: + raise NotImplementedError(f"init_weights method has to be implemented for {self}") + + @classmethod + @validate_hf_hub_args + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + dtype: jnp.dtype = jnp.float32, + *model_args, + **kwargs, + ): + r""" + Instantiate a pretrained Flax model from a pretrained model configuration. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained model + hosted on the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + using [`~FlaxModelMixin.save_pretrained`]. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified, all the computation will be performed with the given `dtype`. + + + + This only specifies the dtype of the *computation* and does not influence the dtype of model + parameters. 
+ + If you wish to change the dtype of the model parameters, see [`~FlaxModelMixin.to_fp16`] and + [`~FlaxModelMixin.to_bf16`]. + + + + model_args (sequence of positional arguments, *optional*): + All remaining positional arguments are passed to the underlying model's `__init__` method. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it is loaded) and initiate the model (for + example, `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `kwargs` are directly passed to the underlying + model's `__init__` method (we assume all relevant updates to the configuration have already been + done). + - If a configuration is not provided, `kwargs` are first passed to the configuration class + initialization function [`~ConfigMixin.from_config`]. Each key of the `kwargs` that corresponds + to a configuration attribute is used to override said attribute with the supplied `kwargs` value. + Remaining keys that do not correspond to any configuration attribute are passed to the underlying + model's `__init__` function. + + Examples: + + ```python + >>> from diffusers import FlaxUNet2DConditionModel + + >>> # Download model and configuration from huggingface.co and cache. + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable). + >>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/") + ``` + + If you get the error message below, you need to finetune the weights for your downstream task: + + ```bash + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 
+ ``` + """ + config = kwargs.pop("config", None) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + from_pt = kwargs.pop("from_pt", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + + user_agent = { + "diffusers": __version__, + "file_type": "model", + "framework": "flax", + } + + # Load config if we don't provide one + if config is None: + config, unused_kwargs = cls.load_config( + pretrained_model_name_or_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + **kwargs, + ) + + model, model_kwargs = cls.from_config(config, dtype=dtype, return_unused_kwargs=True, **unused_kwargs) + + # Load model + pretrained_path_with_subfolder = ( + pretrained_model_name_or_path + if subfolder is None + else os.path.join(pretrained_model_name_or_path, subfolder) + ) + if os.path.isdir(pretrained_path_with_subfolder): + if from_pt: + if not os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)): + raise EnvironmentError( + f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_path_with_subfolder} " + ) + model_file = os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME) + elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME)): + # Load from a Flax checkpoint + model_file = os.path.join(pretrained_path_with_subfolder, FLAX_WEIGHTS_NAME) + # Check if pytorch weights exist instead + elif os.path.isfile(os.path.join(pretrained_path_with_subfolder, WEIGHTS_NAME)): + raise EnvironmentError( + f"{WEIGHTS_NAME} file found in directory {pretrained_path_with_subfolder}. Please load the model" + " using `from_pt=True`." + ) + else: + raise EnvironmentError( + f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory " + f"{pretrained_path_with_subfolder}." + ) + else: + try: + model_file = hf_hub_download( + pretrained_model_name_or_path, + filename=FLAX_WEIGHTS_NAME if not from_pt else WEIGHTS_NAME, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + subfolder=subfolder, + revision=revision, + ) + + except RepositoryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " + "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " + "token having permission to this repo with `token` or log in with `huggingface-cli " + "login`." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " + "this model name. Check the model page at " + f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." + ) + except EntryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}." 
+ ) + except HTTPError as err: + raise EnvironmentError( + f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n" + f"{err}" + ) + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" + f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" + f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}.\nCheckout your" + " internet connection or see how to run the library in offline mode at" + " 'https://huggingface.co/docs/transformers/installation#offline-mode'." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}." + ) + + if from_pt: + if is_torch_available(): + from .modeling_utils import load_state_dict + else: + raise EnvironmentError( + "Can't load the model in PyTorch format because PyTorch is not installed. " + "Please, install PyTorch or use native Flax weights." + ) + + # Step 1: Get the pytorch file + pytorch_model_file = load_state_dict(model_file) + + # Step 2: Convert the weights + state = convert_pytorch_state_dict_to_flax(pytorch_model_file, model) + else: + try: + with open(model_file, "rb") as state_f: + state = from_bytes(cls, state_f.read()) + except (UnpicklingError, msgpack.exceptions.ExtraData) as e: + try: + with open(model_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please" + " install git-lfs and run `git lfs install` followed by `git lfs pull` in the" + " folder you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ") + # make sure all arrays are stored as jnp.ndarray + # NOTE: This is to prevent a bug this will be fixed in Flax >= v0.3.4: + # https://github.com/google/flax/issues/1261 + state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.local_devices(backend="cpu")[0]), state) + + # flatten dicts + state = flatten_dict(state) + + params_shape_tree = jax.eval_shape(model.init_weights, rng=jax.random.PRNGKey(0)) + required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys()) + + shape_state = flatten_dict(unfreeze(params_shape_tree)) + + missing_keys = required_params - set(state.keys()) + unexpected_keys = set(state.keys()) - required_params + + if missing_keys: + logger.warning( + f"The checkpoint {pretrained_model_name_or_path} is missing required keys: {missing_keys}. " + "Make sure to call model.init_weights to initialize the missing weights." + ) + cls._missing_keys = missing_keys + + for key in state.keys(): + if key in shape_state and state[key].shape != shape_state[key].shape: + raise ValueError( + f"Trying to load the pretrained weight for {key} failed: checkpoint has shape " + f"{state[key].shape} which is incompatible with the model shape {shape_state[key].shape}. 
" + ) + + # remove unexpected keys to not be saved again + for unexpected_key in unexpected_keys: + del state[unexpected_key] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" + " with another architecture." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + else: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" + f" was trained on, you can already use {model.__class__.__name__} for predictions without further" + " training." + ) + + return model, unflatten_dict(state) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + params: Union[Dict, FrozenDict], + is_main_process: bool = True, + push_to_hub: bool = False, + **kwargs, + ): + """ + Save a model and its configuration file to a directory so that it can be reloaded using the + [`~FlaxModelMixin.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save a model and its configuration file to. Will be created if it doesn't exist. + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
+ """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", False) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + model_to_save = self + + # Attach architecture to the config + # Save the config + if is_main_process: + model_to_save.save_config(save_directory) + + # save model + output_model_file = os.path.join(save_directory, FLAX_WEIGHTS_NAME) + with open(output_model_file, "wb") as f: + model_bytes = to_bytes(params) + f.write(model_bytes) + + logger.info(f"Model weights saved in {output_model_file}") + + if push_to_hub: + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_outputs.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_outputs.py new file mode 100644 index 000000000..8dfee5fec --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_outputs.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass + +from ..utils import BaseOutput + + +@dataclass +class AutoencoderKLOutput(BaseOutput): + """ + Output of AutoencoderKL encoding method. + + Args: + latent_dist (`DiagonalGaussianDistribution`): + Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`. + `DiagonalGaussianDistribution` allows for sampling latents from the distribution. + """ + + latent_dist: "DiagonalGaussianDistribution" # noqa: F821 diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py new file mode 100644 index 000000000..7099daca7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py @@ -0,0 +1,161 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch - Flax general utilities.""" + +from pickle import UnpicklingError + +import jax +import jax.numpy as jnp +import numpy as np +from flax.serialization import from_bytes +from flax.traverse_util import flatten_dict + +from ..utils import logging + + +logger = logging.get_logger(__name__) + + +##################### +# Flax => PyTorch # +##################### + + +# from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_flax_pytorch_utils.py#L224-L352 +def load_flax_checkpoint_in_pytorch_model(pt_model, model_file): + try: + with open(model_file, "rb") as flax_state_f: + flax_state = from_bytes(None, flax_state_f.read()) + except UnpicklingError as e: + try: + with open(model_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please" + " install git-lfs and run `git lfs install` followed by `git lfs pull` in the" + " folder you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise EnvironmentError(f"Unable to convert {model_file} to Flax deserializable object. ") + + return load_flax_weights_in_pytorch_model(pt_model, flax_state) + + +def load_flax_weights_in_pytorch_model(pt_model, flax_state): + """Load flax checkpoints in a PyTorch model""" + + try: + import torch # noqa: F401 + except ImportError: + logger.error( + "Loading Flax weights in PyTorch requires both PyTorch and Flax to be installed. Please see" + " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation" + " instructions." + ) + raise + + # check if we have bf16 weights + is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values() + if any(is_type_bf16): + # convert all weights to fp32 if they are bf16 since torch.from_numpy can-not handle bf16 + + # and bf16 is not fully supported in PT yet. + logger.warning( + "Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` " + "before loading those in PyTorch model." 
+ ) + flax_state = jax.tree_util.tree_map( + lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state + ) + + pt_model.base_model_prefix = "" + + flax_state_dict = flatten_dict(flax_state, sep=".") + pt_model_dict = pt_model.state_dict() + + # keep track of unexpected & missing keys + unexpected_keys = [] + missing_keys = set(pt_model_dict.keys()) + + for flax_key_tuple, flax_tensor in flax_state_dict.items(): + flax_key_tuple_array = flax_key_tuple.split(".") + + if flax_key_tuple_array[-1] == "kernel" and flax_tensor.ndim == 4: + flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"] + flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1)) + elif flax_key_tuple_array[-1] == "kernel": + flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"] + flax_tensor = flax_tensor.T + elif flax_key_tuple_array[-1] == "scale": + flax_key_tuple_array = flax_key_tuple_array[:-1] + ["weight"] + + if "time_embedding" not in flax_key_tuple_array: + for i, flax_key_tuple_string in enumerate(flax_key_tuple_array): + flax_key_tuple_array[i] = ( + flax_key_tuple_string.replace("_0", ".0") + .replace("_1", ".1") + .replace("_2", ".2") + .replace("_3", ".3") + .replace("_4", ".4") + .replace("_5", ".5") + .replace("_6", ".6") + .replace("_7", ".7") + .replace("_8", ".8") + .replace("_9", ".9") + ) + + flax_key = ".".join(flax_key_tuple_array) + + if flax_key in pt_model_dict: + if flax_tensor.shape != pt_model_dict[flax_key].shape: + raise ValueError( + f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected " + f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}." + ) + else: + # add weight to pytorch dict + flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor + pt_model_dict[flax_key] = torch.from_numpy(flax_tensor) + # remove from missing keys + missing_keys.remove(flax_key) + else: + # weight is not expected by PyTorch model + unexpected_keys.append(flax_key) + + pt_model.load_state_dict(pt_model_dict) + + # re-transform missing_keys to list + missing_keys = list(missing_keys) + + if len(unexpected_keys) > 0: + logger.warning( + "Some weights of the Flax model were not used when initializing the PyTorch model" + f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing" + f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture" + " (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This" + f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect" + " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a" + " FlaxBertForSequenceClassification model)." + ) + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly" + f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to" + " use it for predictions and inference." 
+ ) + + return pt_model diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_utils.py new file mode 100644 index 000000000..73ea5fb07 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_utils.py @@ -0,0 +1,1021 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import itertools +import os +import re +from collections import OrderedDict +from functools import partial +from typing import Any, Callable, List, Optional, Tuple, Union + +import safetensors +import torch +from huggingface_hub import create_repo +from huggingface_hub.utils import validate_hf_hub_args +from torch import Tensor, nn + +from .. import __version__ +from ..utils import ( + CONFIG_NAME, + FLAX_WEIGHTS_NAME, + SAFETENSORS_FILE_EXTENSION, + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, + _add_variant, + _get_model_file, + deprecate, + is_accelerate_available, + is_torch_version, + logging, +) +from ..utils.hub_utils import PushToHubMixin, load_or_create_model_card, populate_model_card + + +logger = logging.get_logger(__name__) + + +if is_torch_version(">=", "1.9.0"): + _LOW_CPU_MEM_USAGE_DEFAULT = True +else: + _LOW_CPU_MEM_USAGE_DEFAULT = False + + +if is_accelerate_available(): + import accelerate + from accelerate.utils import set_module_tensor_to_device + from accelerate.utils.versions import is_torch_version + + +def get_parameter_device(parameter: torch.nn.Module) -> torch.device: + try: + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).device + except StopIteration: + # For torch.nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].device + + +def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: + try: + params = tuple(parameter.parameters()) + if len(params) > 0: + return params[0].dtype + + buffers = tuple(parameter.buffers()) + if len(buffers) > 0: + return buffers[0].dtype + + except StopIteration: + # For torch.nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].dtype + + +def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None): + """ + Reads a checkpoint 
file, returning properly formatted errors if they arise. + """ + try: + file_extension = os.path.basename(checkpoint_file).split(".")[-1] + if file_extension == SAFETENSORS_FILE_EXTENSION: + return safetensors.torch.load_file(checkpoint_file, device="cpu") + else: + return torch.load(checkpoint_file, map_location="cpu") + except Exception as e: + try: + with open(checkpoint_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError( + f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained " + "model. Make sure you have saved the model properly." + ) from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. " + ) + + +def load_model_dict_into_meta( + model, + state_dict: OrderedDict, + device: Optional[Union[str, torch.device]] = None, + dtype: Optional[Union[str, torch.dtype]] = None, + model_name_or_path: Optional[str] = None, +) -> List[str]: + device = device or torch.device("cpu") + dtype = dtype or torch.float32 + + accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys()) + + unexpected_keys = [] + empty_state_dict = model.state_dict() + for param_name, param in state_dict.items(): + if param_name not in empty_state_dict: + unexpected_keys.append(param_name) + continue + + if empty_state_dict[param_name].shape != param.shape: + model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else "" + raise ValueError( + f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." + ) + + if accepts_dtype: + set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype) + else: + set_module_tensor_to_device(model, param_name, device, value=param) + return unexpected_keys + + +def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]: + # Convert old format to new format if needed from a PyTorch state_dict + # copy state_dict so _load_from_state_dict can modify it + state_dict = state_dict.copy() + error_msgs = [] + + # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants + # so we need to apply the function recursively. + def load(module: torch.nn.Module, prefix: str = ""): + args = (state_dict, prefix, {}, True, [], [], error_msgs) + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + load(model_to_load) + + return error_msgs + + +class ModelMixin(torch.nn.Module, PushToHubMixin): + r""" + Base class for all models. + + [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and + saving models. + + - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`]. 
+ """ + + config_name = CONFIG_NAME + _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] + _supports_gradient_checkpointing = False + _keys_to_ignore_on_load_unexpected = None + + def __init__(self): + super().__init__() + + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite + __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) + return self._internal_dict[name] + + # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + return super().__getattr__(name) + + @property + def is_gradient_checkpointing(self) -> bool: + """ + Whether gradient checkpointing is activated for this model or not. + """ + return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) + + def enable_gradient_checkpointing(self) -> None: + """ + Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). + """ + if not self._supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def disable_gradient_checkpointing(self) -> None: + """ + Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). + """ + if self._supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def set_use_memory_efficient_attention_xformers( + self, valid: bool, attention_op: Optional[Callable] = None + ) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid, attention_op) + + for child in module.children(): + fn_recursive_set_mem_eff(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_mem_eff(module) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None: + r""" + Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + + When this option is enabled, you should observe lower GPU memory usage and a potential speed up during + inference. Speed up during training is not guaranteed. + + + + ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes + precedent. 
+ + + + Parameters: + attention_op (`Callable`, *optional*): + Override the default `None` operator for use as `op` argument to the + [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) + function of xFormers. + + Examples: + + ```py + >>> import torch + >>> from diffusers import UNet2DConditionModel + >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp + + >>> model = UNet2DConditionModel.from_pretrained( + ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16 + ... ) + >>> model = model.to("cuda") + >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp) + ``` + """ + self.set_use_memory_efficient_attention_xformers(True, attention_op) + + def disable_xformers_memory_efficient_attention(self) -> None: + r""" + Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + """ + self.set_use_memory_efficient_attention_xformers(False) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Optional[Callable] = None, + safe_serialization: bool = True, + variant: Optional[str] = None, + push_to_hub: bool = False, + **kwargs, + ): + """ + Save a model and its configuration file to a directory so that it can be reloaded using the + [`~models.ModelMixin.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save a model and its configuration file to. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + variant (`str`, *optional*): + If specified, weights are saved in the format `pytorch_model..bin`. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
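When `save_pretrained` runs (body follows below), the state dict is written either as safetensors (the default) or with `torch.save`. A toy sketch of those two paths, using a throwaway `nn.Linear` and hypothetical file names:

```python
# Sketch: the two serialization formats used for model weights.
import safetensors.torch
import torch

state_dict = torch.nn.Linear(4, 4).state_dict()

# Default path: safetensors with the "pt" format tag.
safetensors.torch.save_file(state_dict, "model.safetensors", metadata={"format": "pt"})

# Fallback path when safe_serialization=False: pickle-based torch.save.
torch.save(state_dict, "model.bin")
```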
+ """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", False) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + # Only save the model itself if we are using distributed training + model_to_save = self + + # Attach architecture to the config + # Save the config + if is_main_process: + model_to_save.save_config(save_directory) + + # Save the model + state_dict = model_to_save.state_dict() + + weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME + weights_name = _add_variant(weights_name, variant) + + # Save the model + if safe_serialization: + safetensors.torch.save_file( + state_dict, os.path.join(save_directory, weights_name), metadata={"format": "pt"} + ) + else: + torch.save(state_dict, os.path.join(save_directory, weights_name)) + + logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}") + + if push_to_hub: + # Create a new empty model card and eventually tag it + model_card = load_or_create_model_card(repo_id, token=token) + model_card = populate_model_card(model_card) + model_card.save(os.path.join(save_directory, "README.md")) + + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a pretrained PyTorch model from a pretrained model configuration. + + The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To + train the model, set it back in training mode with `model.train()`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`~ModelMixin.save_pretrained`]. + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
+ output_loading_info (`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + from_flax (`bool`, *optional*, defaults to `False`): + Load the model weights from a Flax checkpoint save file. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if `device_map` contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + variant (`str`, *optional*): + Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors` + weights. If set to `False`, `safetensors` weights are not loaded. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. 
You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. + + + + Example: + + ```py + from diffusers import UNet2DConditionModel + + unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + ``` + + If you get the error message below, you need to finetune the weights for your downstream task: + + ```bash + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + force_download = kwargs.pop("force_download", False) + from_flax = kwargs.pop("from_flax", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + subfolder = kwargs.pop("subfolder", None) + device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + if low_cpu_mem_usage and not is_accelerate_available(): + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if device_map is not None and not is_accelerate_available(): + raise NotImplementedError( + "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set" + " `device_map=None`. You can install accelerate with `pip install accelerate`." + ) + + # Check if we can handle device_map and dispatching the weights + if device_map is not None and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `device_map=None`." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." + ) + + if low_cpu_mem_usage is False and device_map is not None: + raise ValueError( + f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and" + " dispatching. Please make sure to set `low_cpu_mem_usage=True`." 
+ ) + + # Load config if we don't provide a configuration + config_path = pretrained_model_name_or_path + + user_agent = { + "diffusers": __version__, + "file_type": "model", + "framework": "pytorch", + } + + # load config + config, unused_kwargs, commit_hash = cls.load_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + return_commit_hash=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + user_agent=user_agent, + **kwargs, + ) + + # load model + model_file = None + if from_flax: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=FLAX_WEIGHTS_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + model = cls.from_config(config, **unused_kwargs) + + # Convert the weights + from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model + + model = load_flax_checkpoint_in_pytorch_model(model, model_file) + else: + if use_safetensors: + try: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant), + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + except IOError as e: + if not allow_pickle: + raise e + pass + if model_file is None: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=_add_variant(WEIGHTS_NAME, variant), + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + + if low_cpu_mem_usage: + # Instantiate model with empty weights + with accelerate.init_empty_weights(): + model = cls.from_config(config, **unused_kwargs) + + # if device_map is None, load the state dict and move the params from meta device to the cpu + if device_map is None: + param_device = "cpu" + state_dict = load_state_dict(model_file, variant=variant) + model._convert_deprecated_attention_blocks(state_dict) + # move the params from meta device to cpu + missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) + if len(missing_keys) > 0: + raise ValueError( + f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are" + f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass" + " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" + " those weights or else make sure your checkpoint file is correct." 
+ ) + + unexpected_keys = load_model_dict_into_meta( + model, + state_dict, + device=param_device, + dtype=torch_dtype, + model_name_or_path=pretrained_model_name_or_path, + ) + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + + else: # else let accelerate handle loading and dispatching. + # Load weights and dispatch according to the device_map + # by default the device_map is None and the weights are loaded on the CPU + try: + accelerate.load_checkpoint_and_dispatch( + model, + model_file, + device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + ) + except AttributeError as e: + # When using accelerate loading, we do not have the ability to load the state + # dict and rename the weight names manually. Additionally, accelerate skips + # torch loading conventions and directly writes into `module.{_buffers, _parameters}` + # (which look like they should be private variables?), so we can't use the standard hooks + # to rename parameters on load. We need to mimic the original weight names so the correct + # attributes are available. After we have loaded the weights, we convert the deprecated + # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert + # the weights so we don't have to do this again. + + if "'Attention' object has no attribute" in str(e): + logger.warning( + f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}" + " was saved with deprecated attention block weight names. We will load it with the deprecated attention block" + " names and convert them on the fly to the new attention block format. Please re-save the model after this conversion," + " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint," + " please also re-upload it or open a PR on the original repository." + ) + model._temp_convert_self_to_deprecated_attention_blocks() + accelerate.load_checkpoint_and_dispatch( + model, + model_file, + device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + ) + model._undo_temp_convert_self_to_deprecated_attention_blocks() + else: + raise e + + loading_info = { + "missing_keys": [], + "unexpected_keys": [], + "mismatched_keys": [], + "error_msgs": [], + } + else: + model = cls.from_config(config, **unused_kwargs) + + state_dict = load_state_dict(model_file, variant=variant) + model._convert_deprecated_attention_blocks(state_dict) + + model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( + model, + state_dict, + model_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + ) + + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + "error_msgs": error_msgs, + } + + if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." 
+ ) + elif torch_dtype is not None: + model = model.to(torch_dtype) + + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + if output_loading_info: + return model, loading_info + + return model + + @classmethod + def _load_pretrained_model( + cls, + model, + state_dict: OrderedDict, + resolved_archive_file, + pretrained_model_name_or_path: Union[str, os.PathLike], + ignore_mismatched_sizes: bool = False, + ): + # Retrieve missing & unexpected_keys + model_state_dict = model.state_dict() + loaded_keys = list(state_dict.keys()) + + expected_keys = list(model_state_dict.keys()) + + original_loaded_keys = loaded_keys + + missing_keys = list(set(expected_keys) - set(loaded_keys)) + unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + + # Make sure we are able to load base models as well as derived models (with heads) + model_to_load = model + + def _find_mismatched_keys( + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, + ): + mismatched_keys = [] + if ignore_mismatched_sizes: + for checkpoint_key in loaded_keys: + model_key = checkpoint_key + + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) + ) + del state_dict[checkpoint_key] + return mismatched_keys + + if state_dict is not None: + # Whole checkpoint + mismatched_keys = _find_mismatched_keys( + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, + ) + error_msgs = _load_state_dict_into_model(model_to_load, state_dict) + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + if "size mismatch" in error_msg: + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." + ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" + " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" + " identical (initializing a BertForSequenceClassification model from a" + " BertForSequenceClassification model)." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." 
+ ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" + f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" + " without further training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" + " able to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return get_parameter_device(self) + + @property + def dtype(self) -> torch.dtype: + """ + `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). + """ + return get_parameter_dtype(self) + + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: + """ + Get number of (trainable or non-embedding) parameters in the module. + + Args: + only_trainable (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of trainable parameters. + exclude_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of non-embedding parameters. + + Returns: + `int`: The number of parameters. 
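+
+        With `exclude_embeddings=True` the count skips the `weight` parameter of every
+        `torch.nn.Embedding` submodule (matched by its qualified name), as implemented below.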
+ + Example: + + ```py + from diffusers import UNet2DConditionModel + + model_id = "runwayml/stable-diffusion-v1-5" + unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet") + unet.num_parameters(only_trainable=True) + 859520964 + ``` + """ + + if exclude_embeddings: + embedding_param_names = [ + f"{name}.weight" + for name, module_type in self.named_modules() + if isinstance(module_type, torch.nn.Embedding) + ] + non_embedding_parameters = [ + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names + ] + return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable) + else: + return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable) + + def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None: + deprecated_attention_block_paths = [] + + def recursive_find_attn_block(name, module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_paths.append(name) + + for sub_name, sub_module in module.named_children(): + sub_name = sub_name if name == "" else f"{name}.{sub_name}" + recursive_find_attn_block(sub_name, sub_module) + + recursive_find_attn_block("", self) + + # NOTE: we have to check if the deprecated parameters are in the state dict + # because it is possible we are loading from a state dict that was already + # converted + + for path in deprecated_attention_block_paths: + # group_norm path stays the same + + # query -> to_q + if f"{path}.query.weight" in state_dict: + state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight") + if f"{path}.query.bias" in state_dict: + state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias") + + # key -> to_k + if f"{path}.key.weight" in state_dict: + state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight") + if f"{path}.key.bias" in state_dict: + state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias") + + # value -> to_v + if f"{path}.value.weight" in state_dict: + state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight") + if f"{path}.value.bias" in state_dict: + state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias") + + # proj_attn -> to_out.0 + if f"{path}.proj_attn.weight" in state_dict: + state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight") + if f"{path}.proj_attn.bias" in state_dict: + state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias") + + def _temp_convert_self_to_deprecated_attention_blocks(self) -> None: + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.query = module.to_q + module.key = module.to_k + module.value = module.to_v + module.proj_attn = module.to_out[0] + + # We don't _have_ to delete the old attributes, but it's helpful to ensure + # that _all_ the weights are loaded into the new attributes and we're not + # making an incorrect assumption that this model should be converted when + # it really shouldn't be. 
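+            # At this point the deprecated names are plain aliases of the new projections
+            # (module.query is module.to_q, ..., module.proj_attn is module.to_out[0]), so the
+            # checkpoint keys saved under the old attention-block naming can be matched during
+            # accelerate loading. Deleting to_q/to_k/to_v/to_out below makes the deprecated names
+            # the only handles on these weights while loading runs;
+            # _undo_temp_convert_self_to_deprecated_attention_blocks() restores the new names afterwards.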
+            del module.to_q
+            del module.to_k
+            del module.to_v
+            del module.to_out
+
+    def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None:
+        deprecated_attention_block_modules = []
+
+        def recursive_find_attn_block(module) -> None:
+            if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+                deprecated_attention_block_modules.append(module)
+
+            for sub_module in module.children():
+                recursive_find_attn_block(sub_module)
+
+        recursive_find_attn_block(self)
+
+        for module in deprecated_attention_block_modules:
+            module.to_q = module.query
+            module.to_k = module.key
+            module.to_v = module.value
+            module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)])
+
+            del module.query
+            del module.key
+            del module.value
+            del module.proj_attn
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h
new file mode 100644
index 000000000..f7b955371
--- /dev/null
+++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h
@@ -0,0 +1,94 @@
+#pragma once
+#ifndef WELFORD_H
+#define WELFORD_H
+
+#include <c10/macros/Macros.h> // C10_HOST_DEVICE
+
+// copied from https://github.com/pytorch/pytorch/blob/b8307513e57f8beaf99daff342a23d705a417e11/aten/src/ATen/native/SharedReduceOps.h
+template <typename scalar_t, typename index_t>
+struct WelfordData {
+  scalar_t mean;
+  scalar_t m2;
+  index_t n;
+  scalar_t nf;
+
+  C10_HOST_DEVICE WelfordData() : mean(0), m2(0), n(0), nf(0) {}
+
+  C10_HOST_DEVICE WelfordData(
+      scalar_t mean,
+      scalar_t m2,
+      index_t n,
+      scalar_t nf)
+      : mean(mean), m2(m2), n(n), nf(nf) {}
+};
+
+// copied from https://github.com/pytorch/pytorch/blob/b8307513e57f8beaf99daff342a23d705a417e11/aten/src/ATen/native/SharedReduceOps.h
+template <typename scalar_t, typename acc_scalar_t, typename index_t, typename res_t>
+struct WelfordOps {
+  acc_scalar_t correction;
+  bool take_sqrt;
+ public:
+  using acc_t = WelfordData<acc_scalar_t, index_t>;
+  inline C10_DEVICE acc_t reduce(acc_t acc, scalar_t data) const {
+    // We accumulate n in index_t to avoid cumulative rounding error, but still
+    // need nf for use in combine where int32 may overflow.
+    index_t new_n = acc.n + 1;
+    acc_scalar_t new_nf = static_cast<acc_scalar_t>(new_n);
+
+    acc_scalar_t delta = data - acc.mean;
+
+    acc_scalar_t new_mean = acc.mean + delta / new_nf;
+    acc_scalar_t new_delta = data - new_mean;
+    return {
+        new_mean,
+        acc.m2 + delta * new_delta,
+        new_n,
+        new_nf,
+    };
+  }
+  inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const {
+    if (a.nf == 0) {
+      return b;
+    }
+    if (b.nf == 0) {
+      return a;
+    }
+    acc_scalar_t delta = b.mean - a.mean;
+    acc_scalar_t new_count = a.nf + b.nf;
+    acc_scalar_t nb_over_n = b.nf / new_count;
+    return {
+        a.mean + delta * nb_over_n,
+        a.m2 + b.m2 + delta * delta * a.nf * nb_over_n,
+        // setting acc.n as -1 since acc.n might not be able to represent the count
+        // correctly within its range, setting it to -1 to avoid confusion
+        -1,
+        new_count
+    };
+  }
+  inline C10_DEVICE res_t project(acc_t acc) const __ubsan_ignore_float_divide_by_zero__ {
+    const auto mean = static_cast<scalar_t>(acc.mean);
+    const auto divisor = acc.nf > correction ? acc.nf - correction : 0;
+    const auto var = acc.m2 / divisor;
+    res_t results(take_sqrt ?
std::sqrt(var) : var, mean); + return results; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline __device__ acc_t warp_shfl_down(acc_t acc, int offset) const { + return { + WARP_SHFL_DOWN(acc.mean, offset) + , WARP_SHFL_DOWN(acc.m2, offset) + , WARP_SHFL_DOWN(acc.n, offset) + , WARP_SHFL_DOWN(acc.nf, offset) + }; + } +#endif + C10_HOST_DEVICE WelfordOps(acc_scalar_t correction, bool take_sqrt) + : correction(correction), take_sqrt(take_sqrt) {} +}; + +#endif diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp new file mode 100644 index 000000000..7aeec0187 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include +#include "gn_kernel.h" + +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") + +std::tuple gn_nhwc_fwd( + const at::Tensor X, + const at::Tensor weight, + const at::Tensor bias, + const int64_t G, + double eps, + const int64_t act_fn_option) { + CHECK_CUDA(X); + CHECK_CUDA(weight); + CHECK_CUDA(bias); + const int N = X.size(0); + const int C = X.size(1); + const int H = X.size(2); + const int W = X.size(3); + + at::Tensor X_nhwc = X.permute({0, 2, 3, 1}); + at::Tensor X_out = at::empty_like(X_nhwc); + at::Tensor means = at::empty({N, G}, weight.options()); + at::Tensor rstds = at::empty({N, G}, weight.options()); + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + X.scalar_type(), + "group_norm_nhwc_forward", [&]() { + run_gn_fwd_kernels( + X_nhwc.const_data_ptr(), + weight.const_data_ptr(), bias.const_data_ptr(), + N, H, W, C, G, static_cast(eps), act_fn_option, + X_out.mutable_data_ptr(), means.mutable_data_ptr(), rstds.mutable_data_ptr() + ); + }); + return {X_out.permute({0, 3, 1, 2}), means, rstds}; +} + +std::tuple gn_nhwc_bwd( + const at::Tensor dy, + const at::Tensor X, + const at::Tensor weight, + const at::Tensor bias, + const at::Tensor means, + const at::Tensor rstds, + const int64_t G, + const int64_t act_fn_option) { + CHECK_CUDA(dy); + CHECK_CUDA(X); + CHECK_CUDA(weight); + CHECK_CUDA(bias); + CHECK_CUDA(means); + CHECK_CUDA(rstds); + + const int N = X.size(0); + const int C = X.size(1); + const int H = X.size(2); + const int W = X.size(3); + at::Tensor dy_nhwc = dy.permute({0, 2, 3, 1}); + at::Tensor X_nhwc = X.permute({0, 2, 3, 1}); + at::Tensor dX = at::empty_like(X_nhwc); + at::Tensor dweight = at::empty({C}, X.options()); + at::Tensor dbias = at::empty({C}, X.options()); + + AT_DISPATCH_FLOATING_TYPES_AND2( + c10::ScalarType::Half, + c10::ScalarType::BFloat16, + X.scalar_type(), + "group_norm_nhwc_backward", [&]() { + run_gn_bwd_kernels( + dy_nhwc.const_data_ptr(), X_nhwc.const_data_ptr(), + weight.const_data_ptr(), bias.const_data_ptr(), + means.const_data_ptr(), rstds.const_data_ptr(), + N, H, W, C, G, act_fn_option, + dX.mutable_data_ptr(), dweight.mutable_data_ptr(), 
dbias.mutable_data_ptr() + ); + }); + return {dX.permute({0, 3, 1, 2}), dweight, dbias}; +} + +TORCH_LIBRARY(gnop, m) { + m.def("fwd", &gn_nhwc_fwd); + m.def("bwd", &gn_nhwc_bwd); +} diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py new file mode 100644 index 000000000..bf50c37e9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py @@ -0,0 +1,367 @@ +from tqdm import tqdm +import torch.nn.functional as F +import torch.nn as nn +import numpy as np +import torch, datetime, time, os, itertools +torch.set_printoptions(sci_mode=False) +module_dir = os.path.dirname(os.path.abspath(__file__)) + +from torch.utils.cpp_extension import load +gn_op = load( + name="gn_op", + sources=[ + os.path.join(module_dir, "custom_gn.cpp"), + os.path.join(module_dir, "gn_kernel.cu"), + #os.path.join(module_dir, "nchw_kernel.cu") + ], + extra_cuda_cflags=[ + '-use_fast_math', + '-lineinfo', # useful for profiling + ], + extra_cflags=[ + '-O3', # needed or else GN NCHW from source is slower than nn.GroupNorm + '-funroll-all-loops', + '-march=native', + ], + is_python_module=False, + verbose=True, + ) + +class GN_NHWC_Func(torch.autograd.Function): + @staticmethod + def forward(ctx, X: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, G: int, eps: float, activation: str): + X_out, means, rstds = torch.ops.gnop.fwd(X, weight, bias, G, eps, activation) + ctx.save_for_backward(X, weight, bias, means, rstds) + ctx.G = G + ctx.activation = activation + return X_out + + @staticmethod + def backward(ctx, dy: torch.Tensor): + dy = dy.contiguous(memory_format=torch.channels_last) + X, weight, bias, means, rstds = ctx.saved_tensors + dx, dgamma, dbeta = torch.ops.gnop.bwd(dy, X, weight, bias, means, rstds, ctx.G, ctx.activation) + return dx, dgamma, dbeta, None, None, None + +class GN_NHWC(nn.GroupNorm): + def __init__(self, num_groups: int, nc: int, activation='identity', **kwargs): + super().__init__(num_groups, nc, **kwargs) + assert activation in {'identity', 'silu', 'relu', 'gelu', 'gelu_tanh'} + if activation == 'identity': + self.activation = 0 + if activation == 'relu': + self.activation = 1 + if activation == 'silu': + self.activation = 2 + if activation == 'gelu': + self.activation = 3 + if activation == 'gelu_tanh': + self.activation = 4 + + @torch._dynamo.disable + def forward(self, x): + #print(x.shape, self.num_channels) + if len(x.size()) == 3: + N, C, L = x.shape + elif len(x.size()) == 4: + N, C, H, W = x.shape + else: + raise ValueError + G = self.num_groups + + #if C // G > 512: + # raise ValueError(f'Error in fwd for X.shape={x.shape}, G={G}: C // G = {C // G} which is greater than 512. This input is not supported.') + + #if H * W % 8 != 0: + # raise ValueError(f'Error in fwd for X.shape={x.shape}, G={G}: H * W is not a multiple of 8. 
This input is not supported.') + + if self.affine: + return GN_NHWC_Func.apply(x, self.weight, self.bias, self.num_groups, self.eps, self.activation) + else: + w = torch.ones((self.num_channels,), device=x.device, dtype=x.dtype) + b = torch.zeros((self.num_channels,), device=x.device, dtype=x.dtype) + return GN_NHWC_Func.apply(x, w, b, self.num_groups, self.eps, self.activation) + +class GN_NCHW_Func(torch.autograd.Function): + @staticmethod + def forward(ctx, X: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, G: int, eps: float): + X_out, means, rstds = gn_op.nchwforward(X, weight, bias, G, eps) + ctx.save_for_backward(X, weight, means, rstds) + ctx.G = G + return X_out + + @staticmethod + def backward(ctx, dy): + dy = dy.contiguous() + X, weight, means, rstds = ctx.saved_tensors + dx, dgamma, dbeta = gn_op.nchwbackward(dy, X, weight, means, rstds, ctx.G) + return dx, dgamma, dbeta, None, None + +class GN_NCHW(nn.GroupNorm): + def __init__(self, num_groups: int, nc: int, **kwargs): + super().__init__(num_groups, nc, **kwargs) + + def forward(self, x): + if self.affine: + return GN_NCHW_Func.apply(x.contiguous(), self.weight, self.bias, self.num_groups, self.eps) + else: + w = torch.ones((self.num_channels,), device=x.device, dtype=x.dtype) + b = torch.zeros((self.num_channels,), device=x.device, dtype=x.dtype) + return GN_NCHW_Func.apply(x.contiguous(), w, b, self.num_groups, self.eps) + +def red(text): + return '\033[91m' + str(text) + '\033[0m' +def green(text): + return '\033[92m' + str(text) + '\033[0m' +def yellow(text): + return '\033[93m' + str(text) + '\033[0m' +def blue(text): + return '\033[94m' + str(text) + '\033[0m' + +def config_filter(x): # returns true if config is valid + DTYPE, B, C, R, G = x + if C % G != 0: + return False + + if R == 1: # this causes an autograd problem where it gets confused since the tensor is both contiguous in channels first/last format + return False + + if C / G > 512: # this isn't supported since it is assumed that at least one full group is processed per block in the fwd and the max threads per block is set to 512 + return False + + dtype_size = 2 if DTYPE in (torch.half, torch.bfloat16) else 4 # only care about 16/32-bit dtypes for now + estimated_mem_usage_gib = (25 * dtype_size * B * C * R * R) / 2**30 # this is just a rough estimate, likely wrong + if estimated_mem_usage_gib > 4: # vram filter + return False + return True + +if __name__ == '__main__': + ACT_FN = 'silu' + if ACT_FN == 'silu': + act_fn = F.silu + if ACT_FN == 'identity': + act_fn = lambda x: x + if ACT_FN == 'relu': + act_fn = F.relu + if ACT_FN == 'gelu': + act_fn = F.gelu + if ACT_FN == 'gelu_tanh': + act_fn = lambda x: F.gelu(x, approximate='tanh') + MODE = 'check' # can be 'check', 'bench', other modes do both + CHECK_PROF = False + + if MODE != 'bench': + #DTYPEs = (torch.bfloat16, torch.float, torch.double) + DTYPEs = (torch.float16,) + Bs = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 16) + Cs = ( + 32, 64, 128, 256, 512, + 13, 140, 125, 961, + 160, 320, 640, 960, 1280, 1600, 1920, 2240, 2560 + ) + Rs = ( + 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, + 8, 16, 64, 128, 256, 512, + 1024, + ) + Gs = (1, 2, 4, 8, 16, 32,) + all_params = itertools.product(DTYPEs, Bs, Cs, Rs, Gs) + + err_inputs = [] + for params in tqdm(sorted( + #filter(config_filter, all_params), + [ + (torch.float16, 2, 640, 16, 32), + (torch.float16, 2, 1280, 8, 32), + (torch.float16, 2, 2560, 8, 32), + (torch.float16, 2, 1280, 16, 32), + (torch.float16, 2, 320, 32, 32), + (torch.float16, 2, 1920, 16, 32), + 
(torch.float16, 2, 2560, 16, 32), + (torch.float16, 2, 640, 32, 32), + (torch.float16, 2, 960, 32, 32), + (torch.float16, 2, 1280, 32, 32), + (torch.float16, 2, 320, 64, 32), + (torch.float16, 2, 1920, 32, 32), + (torch.float16, 2, 640, 64, 32), + (torch.float16, 2, 960, 64, 32), + ], + key = lambda x: x[1]*x[2]*x[3]*x[4] + )): + DTYPE, B, C, R, G = params + #torch.cuda.empty_cache() + print(f'B: {B:<2} | C: {C:<4} | R: {R:<4} | G: {G:<3} | DTYPE: {DTYPE}') + x = torch.randn(B * C * R * R).reshape((B, C, R, R)).to(DTYPE, memory_format=torch.channels_last).cuda().requires_grad_(True) #* 1000 + #x = torch.arange(B * C * R * R).reshape((B, C, R, R)).to(DTYPE, memory_format=torch.channels_last).cuda().requires_grad_(True)/R/R/C/B-0.5 #* 1000 + torch.random.manual_seed(0) + + gn2 = GN_NHWC(G, C, activation=ACT_FN).cuda().to(DTYPE) + + if CHECK_PROF: + #g1 = gn1(x.contiguous()) + #g1sum = g1.sum() + #g1_grad_wrt_w = torch.autograd.grad(g1sum, gn1.weight, retain_graph=True)[0] + g2 = gn2(x) + #g2sum = g2.sum() + #g2_grad_wrt_w = torch.autograd.grad(g2sum, gn2.weight, retain_graph=True)[0] + else: + gn1 = nn.GroupNorm(G, C).float().cuda() + gn3 = nn.GroupNorm(G, C).cuda().to(DTYPE) + with torch.no_grad(): + w = torch.randn((C,), dtype=DTYPE) + b = torch.randn((C,), dtype=DTYPE) + gn1.weight.copy_(w.detach().float()) + gn1.bias.copy_(b.detach().float()) + gn2.weight.copy_(w.detach()) + gn2.bias.copy_(b.detach()) + gn3.weight.copy_(w.detach()) + gn3.bias.copy_(b.detach()) + + g1 = act_fn(gn1(x.float())) + g2 = gn2(x) + g3 = act_fn(gn3(x)) + rand_dy = torch.rand_like(g3) + rand_dy /= rand_dy.numel() ** 0.5 # to prevent false positive errors from ocurring because of really large magnitude losses + g1sum = (g1 * rand_dy).sum() + g2sum = (g2 * rand_dy).sum() + g3sum = (g3 * rand_dy).sum() + def print_err(act_float, act_testing, act_ref, left_pad=0): + with torch.no_grad(): + lpad = ' ' * left_pad + red_error = red('ERROR: ') + testing_err = F.mse_loss(act_float, act_testing) + expected_err = F.mse_loss(act_float, act_ref) + if testing_err.isnan() or testing_err / expected_err > 2 and testing_err > 1e-6: + print(red(f'{lpad}Your error: {testing_err}, expected error: {expected_err}')) + err_inputs.append((params, testing_err, expected_err)) + else: + print(f'{lpad}Negligible difference (testing err: {testing_err:.2e}, ref err: {expected_err:.2e}) found') + + print(' FORWARD') + print_err(g1, g2, g3, 4) + print(' BACKWARD') + print(' wrt weight') + g1_grad_wrt_w = torch.autograd.grad( + g1sum, gn1.weight, retain_graph=True)[0] + g2_grad_wrt_w = torch.autograd.grad( + g2sum, gn2.weight, retain_graph=True)[0] + g3_grad_wrt_w = torch.autograd.grad( + g3sum, gn3.weight, retain_graph=True)[0] + print_err(g1_grad_wrt_w, g2_grad_wrt_w, g3_grad_wrt_w, 6) + + + print(' wrt bias') + g1_grad_wrt_b = torch.autograd.grad( + g1sum, gn1.bias, retain_graph=True)[0] + g2_grad_wrt_b = torch.autograd.grad( + g2sum, gn2.bias, retain_graph=True)[0] + g3_grad_wrt_b = torch.autograd.grad( + g3sum, gn3.bias, retain_graph=True)[0] + print_err(g1_grad_wrt_b, g2_grad_wrt_b, g3_grad_wrt_b, 6) + + print(' wrt X') + g1_grad_wrt_x = torch.autograd.grad(g1sum, x, retain_graph=True)[0] + g2_grad_wrt_x = torch.autograd.grad(g2sum, x, retain_graph=True)[0] + g3_grad_wrt_x = torch.autograd.grad(g3sum, x, retain_graph=True)[0] + print_err(g1_grad_wrt_x, g2_grad_wrt_x, g3_grad_wrt_x, 6) + if len(err_inputs) > 0: + print(red('Error inputs found:')) + print('\n'.join(map(lambda x: f'{x[0]}, testing error: {x[1]:.2e}, expected error: {x[2]:.2e}', 
err_inputs))) + elif not CHECK_PROF: + print(green('No errors found :)')) + + if MODE != 'check': + NSEC = 1 # number of seconds that each kernel runs for on a certain input + DTYPES = [torch.bfloat16] + BATCHES = [1, 2, 4, 8, 16, 32] + #CHANNELS = [32, 64, 128, 256, 512] + CHANNELS = [320, 640, 960, 1920, 2560] + RESOLUTIONS = [4, 8, 16, 32, 64, 128, 256, 512] + #NUM_GROUPS = [4, 8, 16, 32, 64, 128] + NUM_GROUPS = [32] + BENCH = 'fwd' # can be 'fwd', 'bwd', anything else is fwd + bwd + GN_KERNELS = [ + #(GN_NHWC, 'GN NHWC fused (custom op)', gn_op.fwd_fused), + #(GN_NHWC, 'GN NHWC NH grid (custom op)', gn_op.fwd_NH_grid), + #(GN_NHWC, 'GN NHWC N grid (custom op)', gn_op.fwd_N_grid), + #(GN_NHWC, 'GN NHWC NG grid NG grid (custom op)', gn_op.fwd_NG_grid), + #(GN_NCHW, 'torch.nn GN NCHW (compiled from src)', gn_op.nchwforward), + (nn.GroupNorm, 'torch.nn GN NCHW', None), + #(GN_NCHW, 'torch.nn GN NCHW (compiled from src)', None), + (GN_NHWC, 'GN NHWC', None), + ] + + os.makedirs('csvs', exist_ok=True) + fname = datetime.datetime.now().strftime("csvs/%H-%M-%S-%d-%m-%Y.csv") + print(f'Writing to {fname}') + outfile = open(fname, 'w') + outfile.write('Kernel,B (batch),C (num channels),R (resolution),G (num groups), D (C/G),Speed (it/s; 25th percentile),Speed (it/s; 50th percentile),Speed (it/s; 75th percentile)\n') + + configs = list(filter(config_filter, itertools.product(DTYPES, BATCHES, CHANNELS, RESOLUTIONS, NUM_GROUPS))) + print('Estimated time (seconds) to complete:', NSEC * len(configs) * len(GN_KERNELS)) + + for DTYPE, B, C, R, G in configs: + x_nchw = torch.randn((B, C, R, R), dtype=DTYPE, device='cuda').requires_grad_(True) + x_nhwc = x_nchw.contiguous(memory_format=torch.channels_last).cuda().requires_grad_(True) + + gn_args = (G, C) + print(BENCH, 'X shape:', x_nchw.shape, 'G (num groups):', G) + for gn_class, desc, fwd_fn in GN_KERNELS: + gn_input = x_nchw if 'NCHW' in desc else x_nhwc + print(f'\t{desc}') + + try: + gn_layer = gn_class(*gn_args).cuda().to(DTYPE) + g = gn_layer(gn_input) + if not isinstance(gn_layer, GN_NHWC): + g = act_fn(g) + + torch.cuda.synchronize() + + tic = time.time() + tic_sec = time.time() + ntrials = 0 + ntrials_minor = 0 + minor_speeds = [] # used to track speed percentiles since they can often vary by a lot + + while time.time() - tic < NSEC: + if BENCH == 'fwd': + if fwd_fn is None: + g = gn_layer(gn_input) + else: + g = fwd_fn(gn_input, gn_layer.weight, gn_layer.bias, gn_layer.num_groups, gn_layer.eps) # Not calling gn_layer(gn_input) since I found this added a lot of overhead + elif BENCH == 'both': + g = gn_layer(gn_input) + if not isinstance(gn_layer, GN_NHWC): + g = act_fn(g) + if BENCH != 'fwd': + torch.autograd.grad(g.sum(), gn_input, retain_graph=True) + torch.cuda.synchronize() + + ntrials += 1 + ntrials_minor += 1 + + if time.time() - tic_sec > 0.1: + speed = round(ntrials_minor / (time.time() - tic_sec), 2) + minor_speeds.append(speed) + print(f'\t\t{round(time.time() - tic, 1)}/{NSEC} seconds completed, speed: {blue(speed)} it/s\r', end='') + ntrials_minor = 0 + tic_sec = time.time() + + minor_speeds = np.array(minor_speeds) + median_speed = round(np.percentile(minor_speeds, 50), 2) + slow_speed = round(np.percentile(minor_speeds, 25), 2) + fast_speed = round(np.percentile(minor_speeds, 75), 2) + print(f'\n\t\tSpeed (25th/50th/75th percentile): {red(slow_speed)}/{yellow(median_speed)}/{green(fast_speed)} it/s') + except KeyboardInterrupt: + print(f'Keyboard interrupt, closing {fname}.') + outfile.close() + raise + except Exception as e: + 
print('\t\tFAILED; Error:', str(e).strip()) + median_speed = slow_speed = fast_speed = '-1 (failed)' + + outfile.write(f'{desc},{B},{C},{R},{G},{C//G},{slow_speed},{median_speed},{fast_speed}\n') + print() + print(f'All tests done, closing {fname}.') + outfile.close() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu new file mode 100644 index 000000000..b421daccf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu @@ -0,0 +1,1051 @@ +#include +#include +#include +#include +#include "gn_kernel.h" +#include "Welford.h" +#include "vecs.h" +#define MAX_THREADS_PER_BLOCK 512 // 512 slightly faster (~3%) than 1024 because of higher theoretical occupancy -> higher mem throughput +#define MAX(a, b) (a > b) ? a : b +#define MIN(a, b) (a < b) ? a : b + +#define DEBUG_ENABLED 0 +#if DEBUG_ENABLED +#define DEBUG(format, args...) fprintf(stderr, format, args) +#else +#define DEBUG(format, args...) ((void)0) +#endif +#define ELEM_DEBUG 0 +#define INT int // torch uses int64_t but this came at a pretty big hit to performance and the input sizes that I frequently use (resolutions no bigger than 1024x1024) have a number of pixels smaller than the int max value + +template +struct acc_type { using type = float; }; +template <> +struct acc_type { using type = double; }; + +typedef struct block_params { + int t; // threads per block + int d; // dimensionality (number of rows of data that each threadblock proceesses in parallel) + int f; // factor (number of different threadblocks needed to represent one row of data) +} block_params_t; + +inline block_params_t calc_block_params(const int ideal_num_threads, const int threads_per_row, int f_divides = -1, const int tpb_divides = -1) { + /* + ideal_num_threads: absolute upper limit of threads that a block should have (e.g. a kernel that operates on only 30 elements should have a max TPB of 30 (ideal_num_threads=30)) + threads_per_row: determines the user-specified upper limit on the size of blockDim.x + - meant to be set to the size of the last dimension, e.g. a kernel operating on tensor sized (N, R, C) would have threads_per_row=C + f_divides: optional parameter if user needs to explicitly specify a stricter requirement on the divisibility of the number of threads per block + - e.g. fwd with C = 2560, G = 32, TPB = 480 wouldn't work since that means 32 groups are split over f=5 blocks (5.333 groups per block) + - e.g. fwd with C = 2560, G = 32, TPB = 320 would work since that means 32 groups are split over f=8 blocks (4 groups per block), you could say that f divides 32 (f_divides=32) + tpb_divides: optional parameter if user needs to explicitly specify that the returned threads per block needs to divide another value (e.g. a kernel where bounds checking isn't implemented) + - e.g. fwd with H, W, C = 5, 5, 32; TPB = 512 wouldn't work since that means you use 1.5625 blocks to represent H*W*C (800) elements + - e.g. fwd with H, W, C = 5, 5, 32; TPB = 160 would work since that means you use 5 blocks to represent H*W*C (800) elements, you could say that TPB (160) divides 800 (tpb_divides=800) + */ + int TPB, d = 1, f = 1; + f_divides = f_divides == -1 ? 
threads_per_row : f_divides; + TPB = MIN(MAX_THREADS_PER_BLOCK, ideal_num_threads); + if (threads_per_row < TPB) { + d = TPB / threads_per_row; + if (tpb_divides != -1) // could be put as another condition in the while loop but it hurts readability + while (tpb_divides % (threads_per_row * d) != 0) // d = 1 guaranteed to break this condition + --d; + } + else + while (f_divides % f != 0 || threads_per_row / f > MAX_THREADS_PER_BLOCK) + ++f; + TPB = threads_per_row * d / f; + return {TPB, d, f}; +} + +template __device__ T inline identity(T x) { + return x; +} +template __device__ T inline identity_d(T /*x*/) { + return 1; +} + +template __device__ T inline relu(T x) { + return x > 0 ? x : static_cast(0); +} +template __device__ T inline relu_d(T x) { + return x > 0 ? static_cast(1) : static_cast(0); +} + +template __device__ T inline silu(T x) { + return x / (1 + exp(-x)); +} +template __device__ T inline silu_d(T x) { + const T s = 1 / (1 + exp(-x)); + return s * (1 + x * (1 - s)); +} + +template __device__ T inline gelu(T x) { + constexpr float kAlpha = M_SQRT1_2; + return x * T(0.5) * (T(1) + erf(x * kAlpha)); +} +template __device__ T inline gelu_d(T x) { + constexpr float kBeta = M_2_SQRTPI * M_SQRT1_2 * 0.5; + constexpr float kAlpha = M_SQRT1_2; + const T cdf = T(0.5) * (T(1) + erf(x * kAlpha)); + const T pdf = exp(T(-0.5) * x * x) * kBeta; + return cdf + x * pdf; +} + +template __device__ T inline gelu_tanh(T x) { + constexpr float kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + constexpr float kKappa = 0.044715; + auto inner = kBeta * (x + kKappa * x * x * x); + return T(0.5) * x * (T(1) + tanh(inner)); +} +template __device__ T inline gelu_tanh_d(T x) { + constexpr float kBeta = M_SQRT2 * M_2_SQRTPI * 0.5; + constexpr float kKappa = 0.044715; + auto x_sq = x * x; + auto x_cube = x_sq * x; + auto inner = kBeta * (x + kKappa * x_cube); + auto tanh_inner = tanh(inner); + + auto left = T(0.5) * x; + auto right = T(1) + tanh_inner; + + auto left_derivative = T(0.5) * right; + + auto tanh_derivative = T(1) - tanh_inner * tanh_inner; + auto inner_derivative = kBeta * (T(1) + T(3) * kKappa * x_sq); + auto right_derivative = left * tanh_derivative * inner_derivative; + + return left_derivative + right_derivative; +} + +////////////////////////////////////////////////// +// forward kernels +////////////////////////////////////////////////// + +template +__global__ void +compute_stats_pt1( + const T* X, + const int H, + const int W, + const int C, + const int G, + WelfordData::type, INT> *welford_data + ) { + /* + Computes means and rstds of X on the W (width) dimension. 
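+    Reference semantics (a rough NumPy-style sketch, ignoring the blocking/vectorization details):
+        x = X.reshape(N, H, W, G, D)                       # NHWC input, D = C // G channels per group
+        welford_data[n, g, h] ~= Welford(mean, m2, count) of x[n, h, :, g, :]   # partial stats over (W, D)
+    compute_stats_pt2 later combines the H partial results per (n, g) into the final mean/rstd.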
+ grid: (x=N, y=H, z=f); block: (x=TPB/d, y=d) + - TPB = Cd/f + if TPB < C (f > 1, d=1) + C = f*TPB + X shape: (N, H, W, C) -view-> (N, H, W, 1, f, TPB); X stride: (HWC, WC, C, C, TPB, 1) + dram reduction (per block): (W, 1, TPB) -reduce-> (1, TPB) + else (block.x=C, block.y=d) + TPB = Cd + X shape: (N, H, W, C) -view-> (N, H, W/d, d, 1, C); X stride: (HWC, WC, dC, C, C, 1) + dram reduction (per block): (W/d, d, C) -reduce-> (d, C) + shmem reduction (per block): (TPB,) -view-> (d, G/f, D) -permute-> (d, D, G/f) -reduce-> G/f + output buffer: (N, f, G/f, H) + */ + using T_ACC = typename acc_type::type; + using WelfordType = WelfordData; + using WelfordOp = WelfordOps>; + const int TPB = blockDim.y * blockDim.x; + const int d = blockDim.y; + + WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false}; + WelfordType val(0, 0, 0, 0); + + const int w = ceil((float)W / d); + int i; +#pragma unroll + for (i = 0; i < w - 1; ++i) { + int reduce_idx = 0; + reduce_idx += blockIdx.x * H * W * C; + reduce_idx += blockIdx.y * W * C; + reduce_idx += i * d * C; + reduce_idx += threadIdx.y * C; + reduce_idx += blockIdx.z * TPB; + reduce_idx += threadIdx.x; + T x = X[reduce_idx]; + val = welford_op.reduce(val, static_cast(x)); + } + if ((int)(i * d + threadIdx.y) < W) // last iteration to deal with inputs with weird width sizes + val = welford_op.reduce(val, static_cast(X[blockIdx.x * H * W * C + blockIdx.y * W * C + i * d * C + threadIdx.y * C + blockIdx.z * TPB + threadIdx.x])); + + // shmem reduction + const int D = C / G; + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + const int f = gridDim.z; + const int gf = G / f; + const int d_idx = threadIdx.y; + const int gf_idx = threadIdx.x / D; + const int D_idx = threadIdx.x % D; + + __shared__ typename std::aligned_storage::type vals_reduced_arr[MAX_THREADS_PER_BLOCK]; + WelfordType *vals_reduced = reinterpret_cast(vals_reduced_arr); + + int idx = 0; + idx += d_idx * D * gf; + idx += D_idx * gf; + idx += gf_idx; + vals_reduced[idx] = val; + __syncthreads(); + + int reduce_n = TPB / gf; // number of inputs that gets reduced to a single output +#pragma unroll + for (int stride = TPB / 2; stride >= gf && reduce_n % 2 == 0 && stride % gf == 0; stride >>= 1, reduce_n >>= 1) { + if (tid < stride) + vals_reduced[tid] = welford_op.combine(vals_reduced[tid], vals_reduced[tid + stride]); + __syncthreads(); + } + + // put reduced outputs into return buffers + if (tid < gf) { +#pragma unroll + for (int i = 1; i < reduce_n; ++i) + vals_reduced[tid] = welford_op.combine(vals_reduced[tid], vals_reduced[tid + i * gf]); + + int out_idx = 0; + out_idx += blockIdx.x * G * H; + out_idx += blockIdx.z * gf * H; + out_idx += tid * H; + out_idx += blockIdx.y; + welford_data[out_idx] = vals_reduced[tid]; + } +} + +template +__global__ void +compute_stats_pt2( + WelfordData::type, INT> *welford_data, + const int H, + const int G, + const T eps, + T* means, + T* rstds + ) { + using T_ACC = typename acc_type::type; + using WelfordType = WelfordData; + using WelfordOp = WelfordOps>; + /* + Computes means and rstds of X on the H (height) dimension. 
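+    In short: for each (n, g) this folds the H partial Welford results produced by
+    compute_stats_pt1 into a single aggregate and then writes
+        means[n, g] = mean
+        rstds[n, g] = rsqrt(var + eps)
+    which is what the tid == 0 branch below does.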
+ grid: (x=N, y=G); block: (x=H/f) + - TPB = Gd/f + welford_data shape: (N, G, H) -view-> (N, G, f, H/f); X stride: (GH, H, H/f, 1) + dram reduction (per block): (f, H/f) -reduce-> (H/f,) + shmem reduction (per block): (H/f) -reduce-> (1,) + output buffer: (N, G) + */ + + WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false}; + WelfordType val(0, 0, 0, 0); + const int TPB = blockDim.y * blockDim.x; + + const int f = H / TPB; + for (int i = 0 ; i < f; ++i) { + int idx = 0; + idx += blockIdx.x * G * H; + idx += blockIdx.y * H; + idx += i * H / f; + idx += threadIdx.x; + val = welford_op.combine(val, welford_data[idx]); + } + + // shmem reduction + __shared__ typename std::aligned_storage::type vals_reduced_arr[MAX_THREADS_PER_BLOCK]; + WelfordType *vals_reduced = reinterpret_cast(vals_reduced_arr); + + const int tid = threadIdx.x; + vals_reduced[tid] = val; + __syncthreads(); + + int reduce_n = TPB; // number of inputs that gets reduced to a single output + +#pragma unroll + for (int stride = TPB / 2; stride >= 1 && reduce_n % 2 == 0; stride >>= 1, reduce_n >>= 1) { + if (tid < stride) + vals_reduced[tid] = welford_op.combine(vals_reduced[tid], vals_reduced[tid + stride]); + __syncthreads(); + } + + // put reduced outputs into return buffers + if (tid == 0) { +#pragma unroll + for (int i = 1; i < reduce_n; ++i) + vals_reduced[tid] = welford_op.combine(vals_reduced[tid], vals_reduced[tid + i]); + + T_ACC mean, var; + thrust::tie(var, mean) = welford_op.project(vals_reduced[tid]); + int out_idx = 0; + out_idx += blockIdx.x * G; + out_idx += blockIdx.y; + means[out_idx] = mean; + rstds[out_idx] = rsqrt(var + static_cast(eps)); + } +} + +template +__global__ void +scale_shift( + const T* X_data, + const T* mean_data, + const T* rstd_data, + const T* weight_data, + const T* bias_data, + const int N, + const int C, + const int G, + T* y + ) { + /* + Performs elementwise op (X - mean) * rstd * weight + bias. Vectorized for speed. + LOOP_I: number of elements that each thread processes. + vec_elems: number of elements stored for each vector. 
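+    Per thread the normalization is folded into a single affine transform: since
+        (x - mean) * rstd * weight + bias == x * (rstd * weight) + (bias - mean * rstd * weight),
+    fused_weight/fused_bias are computed once per vector lane and each element then becomes
+        y = act_fn(x * fused_weight + fused_bias).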
+ grid: (x=NHWC / (TPB * LOOP_I * f), y=f), block: (x=TPB) + - HWC % (TPB * LOOP_I * f) = 0 + - TPB * f % C = 0 + X shape: (N, H, W, C) -view-> (NHWC / (TPB * LOOP_I * f), LOOP_I, f, TPB); X.stride: (LOOP_I * f * TPB, f * TPB, TPB, 1) + */ + using T_ACC = typename acc_type::type; + using V = float_vec; + const int f = gridDim.y; + const int TPB = blockDim.x; + + const int n = (N * blockIdx.x) / gridDim.x; + const int c = (blockIdx.y * blockDim.x + threadIdx.x) % (C / vec_elems); + const int g = (G * c) / (C / vec_elems); + const int ng = n * G + g; + const V *X_vecs = reinterpret_cast(X_data); + const V *weight_vecs = reinterpret_cast(weight_data); + const V *bias_vecs = reinterpret_cast(bias_data); + V *y_vecs = reinterpret_cast(y); + T mean = mean_data[ng]; + T rstd = rstd_data[ng]; + V weight_vec = weight_vecs[c]; + V bias_vec = bias_vecs[c]; + + // compute fused weight/bias a,b such that (x - mean) * rstd * weight + bias = x * a + b + V fused_weight, fused_bias; + if constexpr (vec_elems == 1) { + fused_weight = {rstd * weight_vec.x}; + fused_bias = {-mean * fused_weight.x + bias_vec.x}; + } + else if constexpr (vec_elems == 2) { + fused_weight = { + rstd * weight_vec.x, + rstd * weight_vec.y + }; + fused_bias = { + -mean * fused_weight.x + bias_vec.x, + -mean * fused_weight.y + bias_vec.y + }; + } + else if constexpr (vec_elems == 4) { + fused_weight = { + rstd * weight_vec.x, + rstd * weight_vec.y, + rstd * weight_vec.z, + rstd * weight_vec.w + }; + fused_bias = { + -mean * fused_weight.x + bias_vec.x, + -mean * fused_weight.y + bias_vec.y, + -mean * fused_weight.z + bias_vec.z, + -mean * fused_weight.w + bias_vec.w + }; + } + + T (*act_fn)(T); + if constexpr (act_fn_option == 0) + act_fn = identity; + else if constexpr (act_fn_option == 1) + act_fn = relu; + else if constexpr (act_fn_option == 2) + act_fn = silu; + else if constexpr (act_fn_option == 3) + act_fn = gelu; + else if constexpr (act_fn_option == 4) + act_fn = gelu_tanh; + +#pragma unroll + for (int i = 0; i < LOOP_I; ++i) { + int idx = 0; + idx += blockIdx.x * LOOP_I * f * TPB; + idx += i * f * TPB; + idx += blockIdx.y * TPB; + idx += threadIdx.x; + V X_vec = X_vecs[idx]; + + if constexpr (vec_elems == 1) + y_vecs[idx] = {act_fn(static_cast(X_vec.x) * fused_weight.x + fused_bias.x)}; + else if constexpr (vec_elems == 2) { + y_vecs[idx] = { + act_fn(static_cast(X_vec.x) * fused_weight.x + fused_bias.x), + act_fn(static_cast(X_vec.y) * fused_weight.y + fused_bias.y), + }; + } + else if constexpr (vec_elems == 4) { + y_vecs[idx] = { + act_fn(static_cast(X_vec.x) * fused_weight.x + fused_bias.x), + act_fn(static_cast(X_vec.y) * fused_weight.y + fused_bias.y), + act_fn(static_cast(X_vec.z) * fused_weight.z + fused_bias.z), + act_fn(static_cast(X_vec.w) * fused_weight.w + fused_bias.w), + }; + } + } +} + +template +void run_gn_fwd_kernels( + const T *X_data, + const T *weight_data, + const T *bias_data, + const int N, + const int H, + const int W, + const int C, + const int G, + T eps, + const int64_t act_fn_option, + T *Y_data, + T *mean_data, + T *rstd_data) { + using T_ACC = typename acc_type::type; + using WelfordType = WelfordData; + WelfordType *welford_data = (WelfordType*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(WelfordType) * N * G * H); + cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); + + // compute means/rstds over width dimension + { + auto [TPB, d, f] = calc_block_params(W * C, C, G); + DEBUG("starting compute_stats 1, N: %d, H: %d, W: %d, C: %d, G: %d, D: %d, TPB: %d, d: %d, f: %d, G/f: 
%d\n", N, H, W, C, G, (C / G), TPB, d, f, (G / f)); + compute_stats_pt1<<>>( + X_data, + H, W, C, G, + welford_data + ); + } + + // compute means/rstds over height dimension + { + auto [TPB, d, f] = calc_block_params(H, H); + DEBUG("starting compute_stats 2, N: %d, H: %d, W: %d, C: %d, G: %d, D: %d, TPB: %d, d: %d, f: %d, G/f: %d\n", N, H, W, C, G, (C / G), TPB, d, f, (G / f)); + compute_stats_pt2<<>>( + welford_data, + H, G, eps, + mean_data, rstd_data + ); + } + + // scale/shift X + { + const int D = C / G; + int vec_elems; + if (D % 4 == 0) vec_elems = 4; + else if (D % 2 == 0) vec_elems = 2; + else vec_elems = 1; + auto [TPB, d, f] = calc_block_params(H * W * C / 8 / vec_elems, C); + + if (!ELEM_DEBUG && ((H * W * C) % (TPB * 8 * f * vec_elems) == 0)) { + const int LOOP_I = 8; + const int num_blocks = N * H * W * C / TPB / LOOP_I / f; + DEBUG("scale shift starting (LOOP_I = 8), N: %d, H: %d, W: %d, C: %d, G: %d, D: %d, TPB: %d, f: %d, num blocks (before vectors): %d, vec_elems: %d\n", N, H, W, C, G, D, TPB, f, num_blocks, vec_elems); + if (vec_elems == 4 && act_fn_option == 0) // i'm sorry + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 2 && act_fn_option == 0) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 1 && act_fn_option == 0) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 4 && act_fn_option == 1) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 2 && act_fn_option == 1) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 1 && act_fn_option == 1) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 4 && act_fn_option == 2) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 2 && act_fn_option == 2) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 1 && act_fn_option == 2) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 4 && act_fn_option == 3) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 2 && act_fn_option == 3) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 1 && act_fn_option == 3) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 4 && act_fn_option == 4) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 2 && act_fn_option == 4) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + else if (vec_elems == 1 && act_fn_option == 4) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + } + else {// relatively slow fallback + const int num_blocks = N * H * W; + DEBUG("SLOW FALLBACK, scale shift kernel starting, N: %d, H: %d, W: %d, C: %d, G: %d, D: %d, TPB: %d, f: %d, num blocks (before vectors): %d, vec_elems: %d\n", N, H, W, C, G, D, C/f, f, num_blocks, vec_elems); + if (act_fn_option == 0) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + 
if (act_fn_option == 1) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + if (act_fn_option == 2) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + if (act_fn_option == 3) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + if (act_fn_option == 4) + scale_shift<<>>(X_data, mean_data, rstd_data, weight_data, bias_data, N, C, G, Y_data); + } + } + + c10::cuda::CUDACachingAllocator::raw_delete(welford_data); +} + +template void run_gn_fwd_kernels(const float *X_data, const float *weight_data, const float *bias_data, const int N, const int h, const int W, const int C, const int G, float eps, const int64_t act_fn_option, float *Y_data, float *mean_data, float *rstd_data); +template void run_gn_fwd_kernels(const double *X_data, const double *weight_data, const double *bias_data, const int N, const int h, const int W, const int C, const int G, double eps, const int64_t act_fn_option, double *Y_data, double *mean_data, double *rstd_data); +template void run_gn_fwd_kernels(const c10::Half *X_data, const c10::Half *weight_data, const c10::Half *bias_data, const int N, const int h, const int W, const int C, const int G, c10::Half eps, const int64_t act_fn_option, c10::Half *Y_data, c10::Half *mean_data, c10::Half *rstd_data); +template void run_gn_fwd_kernels(const c10::BFloat16 *X_data, const c10::BFloat16 *weight_data, const c10::BFloat16 *bias_data, const int N, const int h, const int W, const int C, const int G, c10::BFloat16 eps, const int64_t act_fn_option, c10::BFloat16 *Y_data, c10::BFloat16 *mean_data, c10::BFloat16 *rstd_data); + +////////////////////////////////////////////////// +// backward kernels +////////////////////////////////////////////////// + +template +__device__ void +sum_reduce( + T vals_reduced, + const int start_stride, + const int end_stride + ) { + // Sums a shared buffer (vals_reduced) with shape (2 * start_stride / end_stride, end_stride) into (end_stride,). + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + int reduce_n = 2 * start_stride / end_stride; + +#pragma unroll + for (int stride = start_stride; stride >= end_stride && reduce_n % 2 == 0 && stride % end_stride == 0; stride >>= 1, reduce_n >>= 1) { + if (tid < stride) + vals_reduced[tid] += vals_reduced[tid + stride]; + __syncthreads(); + } + + if (tid < end_stride) +#pragma unroll + for (int i = 1; i < reduce_n; ++i) + vals_reduced[tid] += vals_reduced[tid + i * end_stride]; + __syncthreads(); +} + +template +__global__ void +width_reduce( + const T* dy_data, + const T* X_data, + const T* mean_data, + const T* rstd_data, + const T* weight_data, + const T* bias_data, + const int H, + const int W, + const int C, + const int G, + typename acc_type::type *xdy_dy_sum_data) { + /* + Loops over W (width) dimension, loading and summing dy, X, and the activation derivative of Y. Outputs stored in xdy_dy_sum_data. Spatial dimension H is processed in a separate kernel. 
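+    The activation derivative shows up here because the forward pass fused act_fn into the
+    normalization: with X_norm = x * fused_scale + fused_bias (the same per-channel affine used in
+    scale_shift), every incoming gradient is first scaled by act_d_fn(X_norm) before the usual
+    GroupNorm reductions, i.e. xdy_sum += dy * x * act_d_fn(X_norm) and dy_sum += dy * act_d_fn(X_norm).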
+ grid: (x=N, y=H, z=f); blockdim: (x=TPB/d, y=d) + TPB = Cd/f + if TPB < C (f > 1, d=1) + C = f*TPB + X shape: (N, H, W, C) -view-> (N, H, W, 1, f, TPB); X stride: (HWC, WC, C, C, TPB, 1) + dram reduction (per block): (W, 1, TPB) -reduce-> (TPB,) + else (block.x=C, block.y=d) + TPB = Cd + X shape: (N, H, W, C) -view-> (N, H, W/d, d, 1, C); X stride: (HWC, WC, dC, C, C, 1) + dram reduction (per block): (W/d, d, C) -reduce-> (d, C) + shmem reduction (per block): (TPB, 2) -> (d, C/f, 2) -reduce-> (C/f, 2) (the 2 comes from storing both xdy_sum and dy_sum in the same buffer) + output buffer: (N, f, C/f, H, 2) -view-> (N, C, H, 2) + xdy_dy_sum_data[:, :, :, 0] = x * dy * activation_derivative((x-mean)*rstd*weight+bias) + xdy_dy_sum_data[:, :, :, 1] = dy * activation_derivative((x-mean)*rstd*weight+bias) + */ + using T_ACC = typename acc_type::type; + + const int TPB = blockDim.y * blockDim.x; + const int d = blockDim.y; + T_ACC xdy_sum = 0; + T_ACC dy_sum = 0; + + const int n = blockIdx.x; + int c = blockIdx.z * blockDim.x + threadIdx.x; + int g = G * c / C; + const int ng = n * G + g; + T_ACC fused_scale = rstd_data[ng] * weight_data[c]; + T_ACC fused_bias = -mean_data[ng] * fused_scale + bias_data[c]; + + T (*act_d_fn)(T x); + if constexpr (act_fn_option == 0) + act_d_fn = identity_d; + else if constexpr (act_fn_option == 1) + act_d_fn = relu_d; + else if constexpr (act_fn_option == 2) + act_d_fn = silu_d; + else if constexpr (act_fn_option == 3) + act_d_fn = gelu_d; + else if constexpr (act_fn_option == 4) + act_d_fn = gelu_tanh_d; + + const int w = ceil((float)W / d); + int i; +#pragma unroll + for (i = 0; i < w - 1; ++i) { + int reduce_idx = 0; + reduce_idx += blockIdx.x * H * W * C; + reduce_idx += blockIdx.y * W * C; + reduce_idx += i * d * C; + reduce_idx += threadIdx.y * C; + reduce_idx += blockIdx.z * TPB; + reduce_idx += threadIdx.x; + T_ACC dy_elem = static_cast(dy_data[reduce_idx]); + T_ACC X_elem = static_cast(X_data[reduce_idx]); + T_ACC X_norm = X_elem * fused_scale + fused_bias; + T_ACC d_act = act_d_fn(X_norm); + xdy_sum += dy_elem * X_elem * d_act; + dy_sum += dy_elem * d_act; + } + if ((int)(i * d + threadIdx.y) < W) { // last iteration to deal with inputs with weird width sizes + int reduce_idx = blockIdx.x * H * W * C + blockIdx.y * W * C + i * d * C + threadIdx.y * C + blockIdx.z * TPB + threadIdx.x; + T_ACC dy_elem = static_cast(dy_data[reduce_idx]); + T_ACC X_elem = static_cast(X_data[reduce_idx]); + T_ACC X_norm = X_elem * fused_scale + fused_bias; + T_ACC d_act = act_d_fn(X_norm); + xdy_sum += dy_elem * X_elem * d_act; + dy_sum += dy_elem * d_act; + } + + // shmem reduction + extern __shared__ char vals_reduced_uncasted[]; // size 2*TPB, TPB for sum1, TPB for sum2 + T_ACC *vals_reduced = reinterpret_cast(vals_reduced_uncasted); + + const int tid = threadIdx.y * blockDim.x + threadIdx.x; + vals_reduced[2 * tid] = xdy_sum; + vals_reduced[2 * tid + 1] = dy_sum; + __syncthreads(); + sum_reduce(vals_reduced, TPB, 2 * C); + + // put reduced outputs into return buffers + if (tid < C) { + int out_idx = 0; + out_idx += blockIdx.x * C * H; + out_idx += (blockIdx.z * TPB + tid) * H; + out_idx += blockIdx.y; + + xdy_dy_sum_data[2 * out_idx] = vals_reduced[2 * tid]; + xdy_dy_sum_data[2 * out_idx + 1] = vals_reduced[2 * tid + 1]; + } +} + +template +__global__ void +height_reduce( + T *xdy_dy_sum_data, // no need to specify T_ACC as T is already an accumulation type + const int H, + const int C, + T *xdy_sum_data, + T *dy_sum_data + ) { + /* + Same thing as width_reduce but over 
the H (height) instead of the width dimension. + grid: (x=N, y=C); block: (x=2H/f) + X shape: (N, C, H, 2) -view-> (N, C, f, H/f, 2); X stride: (2CH, 2H, 2H/f, H/f, 1) + dram reduction (per block): (f, H/f, 2) -reduce-> (H/f, 2) + shmem reduction (per block): (H/f, 2) -reduce-> (2,) + output buffer: (N, C, 2) + */ + const int TPB = blockDim.x; + const int tid = threadIdx.x; + + // shmem reduction + extern __shared__ char vals_reduced_uncasted[]; + T *vals_reduced = reinterpret_cast(vals_reduced_uncasted); + T sum = 0; + int i; +#pragma unroll + for (i = 0; i < ceil((float)2 * H / TPB) - 1; ++i) { + int idx = 0; + idx += blockIdx.x * C * H * 2; + idx += blockIdx.y * H * 2; + idx += i * TPB; + idx += tid; + sum += xdy_dy_sum_data[idx]; + } + if (i * TPB + tid < 2 * H) + sum += xdy_dy_sum_data[blockIdx.x * C * H * 2 + blockIdx.y * H * 2 + i * TPB + tid]; + + vals_reduced[tid] = sum; + __syncthreads(); + sum_reduce(vals_reduced, TPB / 2, 2); + + // put reduced outputs into return buffers + if (tid == 0) { + int out_idx = blockIdx.x * C + blockIdx.y; + xdy_sum_data[out_idx] = vals_reduced[0]; + dy_sum_data[out_idx] = vals_reduced[1]; + } +} + +template +__global__ void +compute_bwd_scale_biases( + const T* mean_data, + const T* rstd_data, + const T* weight_data, + const T* bias_data, + typename acc_type::type* xdy_sum_data, + typename acc_type::type* dy_sum_data, + const int H, + const int W, + const int C, + const int G, + typename acc_type::type* coef1_data, + typename acc_type::type* coef2_data, + typename acc_type::type* coef3_data, + typename acc_type::type* coef4_data + ) { + /* + Calculates coefficients to reduce computation on the elementwise kernel. + - coef1: fused scale (rstd * weight) + - coef2: fused bias (-mean * rstd * weight + bias) + - coef3/4: some derivative terms + griddim: (x=N, y=f); blockdim: (x=C/f) + - d = num. 
spatial elements (from HW dimension) each thread-block processes in parallel + - Cd = TPB (threads per block) + X shape: (N, C) -view-> (N, G, D) -permute-> (N, D, G) -reduce-> (N, G) + shmem reduction: (D, G) -reduce-> G + output buffer: (N, G) + */ + using T_ACC = typename acc_type::type; + const int D = C / G; + const int f = gridDim.y; + const int Gf = G / f; + const int n = blockIdx.x; + const int c = blockIdx.y * blockDim.x + threadIdx.x; + const int g = c / D; + const int d = c % D; + const int nc = n * C + c; + const T_ACC gamma_v = static_cast(weight_data[c]); + + extern __shared__ char vals_reduced_uncasted[]; // size 2*C, C for sum1, C for sum2 + T_ACC *vals_reduced = reinterpret_cast(vals_reduced_uncasted); + + int idx = 0; + idx += d * G / f; + idx += g % Gf; + vals_reduced[2 * idx] = xdy_sum_data[nc] * gamma_v; + vals_reduced[2 * idx + 1] = dy_sum_data[nc] * gamma_v; + __syncthreads(); + sum_reduce(vals_reduced, C / f, 2 * G / f); + + const int ng = n * G + g; + const T_ACC mean_elem = static_cast(mean_data[ng]); + const T_ACC rstd_elem = static_cast(rstd_data[ng]); + coef1_data[nc] = rstd_elem * weight_data[c]; + coef2_data[nc] = -mean_elem * rstd_elem * weight_data[c] + bias_data[c]; + + if (d == 0) { + const T_ACC sum1 = vals_reduced[2 * (g % Gf)]; + const T_ACC sum2 = vals_reduced[2 * (g % Gf) + 1]; + const T_ACC s = T_ACC(1) / static_cast(D * H * W); + const T_ACC x = (sum2 * mean_elem - sum1) * rstd_elem * rstd_elem * rstd_elem * s; + coef3_data[ng] = x; + coef4_data[ng] = (-x * mean_elem) - (sum2 * s * rstd_elem); + } +} + +template +__global__ void +compute_dweight_dbias( + const T* mean_data, + const T* rstd_data, + typename acc_type::type *xdy_sum_data, + typename acc_type::type *dy_sum_data, + const int N, + const int C, + const int G, + T* dweight_data, + T* dbias_data) { + /* + Computes derivatives wrt the weight and bias. + grid: (x=f), block: (x=C/f) + */ + using T_ACC = typename acc_type::type; + const int c = blockIdx.x * blockDim.x + threadIdx.x; + const int D = C / G; + const int g = c / D; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + +#pragma unroll + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + g; + sum1 += (xdy_sum_data[nc] - dy_sum_data[nc] * mean_data[ng]) * rstd_data[ng]; + sum2 += dy_sum_data[nc]; + } + dweight_data[c] = sum1; + dbias_data[c] = sum2; +} + +template +__global__ void +dx_elem_kernel( + const T* dy_data, + const T* X_data, + typename acc_type::type* coef1_data, + typename acc_type::type* coef2_data, + typename acc_type::type* coef3_data, + typename acc_type::type* coef4_data, + const int N, + const int C, + const int G, + T* dx_data + ) { + /* + Performs elementwise kernel to calculate gradients wrt X. Vectorized for speed. + LOOP_I: number of elements that each thread processes. + vec_elems: number of elements stored for each vector. 
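+    With coef1 = rstd * weight and coef2 = -mean * rstd * weight + bias (per (n, c)), and
+    coef3/coef4 the per-(n, g) terms produced by compute_bwd_scale_biases, each element computes
+        X_norm = x * coef1 + coef2
+        dx     = coef1 * act_d_fn(X_norm) * dy + coef3 * x + coef4
+    vectorized over vec_elems channels at a time.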
+ grid: (x=NHWC / (TPB * LOOP_I * f), y=f), block: (x=TPB) + - HWC % (TPB * LOOP_I * f) = 0 + - TPB * f % C = 0 + X shape: (N, H, W, C) -view-> (NHWC / (TPB * LOOP_I * f), LOOP_I, f, TPB); X.stride: (LOOP_I * f * TPB, f * TPB, TPB, 1) + */ + using T_ACC = typename acc_type::type; + using V = float_vec; + using V_ACC = float_vec; + const int f = gridDim.y; + const int n = (N * blockIdx.x) / gridDim.x; + const int c = (blockIdx.y * blockDim.x + threadIdx.x) % (C / vec_elems); + const int g = (G * c) / (C / vec_elems); + const int nc = n * (C / vec_elems) + c; + const int ng = n * G + g; + T_ACC coef3 = coef3_data[ng]; + T_ACC coef4 = coef4_data[ng]; + const V *dy_vecs = reinterpret_cast(dy_data); + const V *X_vecs = reinterpret_cast(X_data); + V *dx_vecs = reinterpret_cast(dx_data); + V_ACC coef1_vec = reinterpret_cast(coef1_data)[nc]; + V_ACC coef2_vec = reinterpret_cast(coef2_data)[nc]; + + T (*act_d_fn)(T); + if constexpr (act_fn_option == 0) + act_d_fn = identity_d; + else if constexpr (act_fn_option == 1) + act_d_fn = relu_d; + else if constexpr (act_fn_option == 2) + act_d_fn = silu_d; + else if constexpr (act_fn_option == 3) + act_d_fn = gelu_d; + else if constexpr (act_fn_option == 4) + act_d_fn = gelu_tanh_d; + +#pragma unroll + for (int i = 0; i < LOOP_I; ++i) { + int idx = 0; + idx += blockIdx.x * LOOP_I * f * blockDim.x; + idx += i * f * blockDim.x; + idx += blockIdx.y * blockDim.x; + idx += threadIdx.x; + + V dy_vec = dy_vecs[idx]; + V X_vec = X_vecs[idx]; + + if constexpr (vec_elems == 1) { + V X_norm = {X_vec.x * coef1_vec.x + coef2_vec.x}; + dx_vecs[idx] = { + (coef1_vec.x * act_d_fn(X_norm.x) * dy_vec.x) + + ((coef3 * X_vec.x) + coef4) + }; + } + else if constexpr (vec_elems == 2) { + V X_norm = { + X_vec.x * coef1_vec.x + coef2_vec.x, + X_vec.y * coef1_vec.y + coef2_vec.y, + }; + dx_vecs[idx] = { + (coef1_vec.x * act_d_fn(X_norm.x) * dy_vec.x) + + ((coef3 * X_vec.x) + coef4), + (coef1_vec.y * act_d_fn(X_norm.y) * dy_vec.y) + + ((coef3 * X_vec.y) + coef4), + }; + } + else if constexpr (vec_elems == 4) { + V X_norm = { + X_vec.x * coef1_vec.x + coef2_vec.x, + X_vec.y * coef1_vec.y + coef2_vec.y, + X_vec.z * coef1_vec.z + coef2_vec.z, + X_vec.w * coef1_vec.w + coef2_vec.w, + }; + dx_vecs[idx] = { + (coef1_vec.x * act_d_fn(X_norm.x) * dy_vec.x) + + ((coef3 * X_vec.x) + coef4), + (coef1_vec.y * act_d_fn(X_norm.y) * dy_vec.y) + + ((coef3 * X_vec.y) + coef4), + (coef1_vec.z * act_d_fn(X_norm.z) * dy_vec.z) + + ((coef3 * X_vec.z) + coef4), + (coef1_vec.w * act_d_fn(X_norm.w) * dy_vec.w) + + ((coef3 * X_vec.w) + coef4), + }; + } + } +} + +template +void run_gn_bwd_kernels( + const T *dy_data, + const T *X_data, + const T *weight_data, + const T *bias_data, + const T *mean_data, + const T *rstd_data, + const int N, + const int H, + const int W, + const int C, + const int G, + const int64_t act_fn_option, + T *dx_data, + T *dweight_data, + T *dbias_data + ) { + using T_ACC = typename acc_type::type; + cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); + const int D = C / G; + + T_ACC* xdy_dy_sum_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * C * H * 2); + + // sum over W dim + { + auto [TPB, d, f] = calc_block_params(W * C, C, G); + DEBUG("starting width reduce, N: %d, H: %d, W: %d, C: %d, G: %d, TPB: %d, d: %d, f: %d\n", N, H, W, C, G, TPB, d, f); + if (act_fn_option == 0) + width_reduce<<>>( + dy_data, X_data, + mean_data, rstd_data, + weight_data, bias_data, + H, W, C, G, + xdy_dy_sum_data); + else if (act_fn_option == 1) + 
width_reduce<<>>(dy_data, X_data, mean_data, rstd_data, weight_data, bias_data, H, W, C, G, xdy_dy_sum_data); + else if (act_fn_option == 2) + width_reduce<<>>(dy_data, X_data, mean_data, rstd_data, weight_data, bias_data, H, W, C, G, xdy_dy_sum_data); + else if (act_fn_option == 3) + width_reduce<<>>(dy_data, X_data, mean_data, rstd_data, weight_data, bias_data, H, W, C, G, xdy_dy_sum_data); + else if (act_fn_option == 4) + width_reduce<<>>(dy_data, X_data, mean_data, rstd_data, weight_data, bias_data, H, W, C, G, xdy_dy_sum_data); + } + + T_ACC* xdy_sum_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * C); + T_ACC* dy_sum_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * C); + // sum over H dim + { + auto [TPB, d, f] = calc_block_params(2 * H, 2); + DEBUG("starting height reduce, N: %d, H: %d, W: %d, C: %d, G: %d, TPB: %d, d: %d, f: %d\n", N, H, W, C, G, TPB, d, f); + height_reduce<<>>( + xdy_dy_sum_data, + H, C, + xdy_sum_data, dy_sum_data); + } + c10::cuda::CUDACachingAllocator::raw_delete(xdy_dy_sum_data); + + // compute weight/bias grads + { + auto [TPB, d, f] = calc_block_params(C, C, G); + DEBUG("starting compute dweight dbias, N: %d, H: %d, W: %d, C: %d, G: %d, TPB: %d, d: %d, f: %d\n", N, H, W, C, G, TPB, d, f); + compute_dweight_dbias<<>>( + mean_data, rstd_data, + xdy_sum_data, dy_sum_data, + N, C, G, + dweight_data, dbias_data); + } + + T_ACC *coef1_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * C); + T_ACC *coef2_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * C); + T_ACC *coef3_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * G); + T_ACC *coef4_data = (T_ACC*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(T_ACC) * N * G); + // compute fused scales/biases for dx elementwise kernel + { + auto [TPB, d, f] = calc_block_params(C, C, G); + DEBUG("starting bwd scale biases, N: %d, H: %d, W: %d, C: %d, G: %d, TPB: %d, d: %d, f: %d\n", N, H, W, C, G, TPB, d, f); + compute_bwd_scale_biases<<>>( + mean_data, rstd_data, weight_data, bias_data, + xdy_sum_data, dy_sum_data, + H, W, C, G, + coef1_data, coef2_data, coef3_data, coef4_data); + } + + { + int vec_elems; + if (D % 4 == 0) vec_elems = 4; + else if (D % 2 == 0) vec_elems = 2; + else vec_elems = 1; + auto [TPB, d, f] = calc_block_params(H * W * C, C, G); + + if (!ELEM_DEBUG && ((H * W * C) % (TPB * 8 * f * vec_elems) == 0)) { + const int LOOP_I = 8; + const int num_blocks = ceil((float)N * H * W * C / TPB / LOOP_I / f); + DEBUG("dx elem kernel starting, N: %d, H: %d, W: %d, C: %d, G: %d, D: %d, TPB: %d, f: %d, num blocks (before vectors): %d, vec_elems: %d\n", N, H, W, C, G, D, TPB, f, num_blocks, vec_elems); + if (D % 4 == 0 && act_fn_option == 0) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 2 == 0 && act_fn_option == 0) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 1 == 0 && act_fn_option == 0) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 4 == 0 && act_fn_option == 1) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 2 == 0 && act_fn_option == 1) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 1 == 0 && act_fn_option == 1) + 
dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 4 == 0 && act_fn_option == 2) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 2 == 0 && act_fn_option == 2) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 1 == 0 && act_fn_option == 2) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 4 == 0 && act_fn_option == 3) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 2 == 0 && act_fn_option == 3) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 1 == 0 && act_fn_option == 3) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 4 == 0 && act_fn_option == 4) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 2 == 0 && act_fn_option == 4) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (D % 1 == 0 && act_fn_option == 4) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + } + else { // relatively slow fallback + const int num_blocks = N * H * W; + DEBUG("SLOW FALLBACK, dx elem kernel starting, N: %d, H: %d, W: %d, C: %d, G: %d, D: %d, TPB: %d, f: %d, num blocks (before vectors): %d, vec_elems: %d\n", N, H, W, C, G, D, C/f, f, num_blocks, vec_elems); + if (act_fn_option == 0) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (act_fn_option == 1) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (act_fn_option == 2) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (act_fn_option == 3) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + else if (act_fn_option == 4) + dx_elem_kernel<<>>(dy_data, X_data, coef1_data, coef2_data, coef3_data, coef4_data, N, C, G, dx_data); + } + } + + c10::cuda::CUDACachingAllocator::raw_delete(xdy_sum_data); + c10::cuda::CUDACachingAllocator::raw_delete(dy_sum_data); + c10::cuda::CUDACachingAllocator::raw_delete(coef1_data); + c10::cuda::CUDACachingAllocator::raw_delete(coef2_data); + c10::cuda::CUDACachingAllocator::raw_delete(coef3_data); + c10::cuda::CUDACachingAllocator::raw_delete(coef4_data); +} + +template void run_gn_bwd_kernels(const double *dy_data, const double *X_data, const double *weight_data, const double *bias_data, const double *mean_data, const double *rstd_data, const int N, const int H, const int W, const int C, const int G, const int64_t act_fn_option, double *dx_data, double *dweight_data, double *dbias_data); +template void run_gn_bwd_kernels(const float *dy_data, const float *X_data, const float *weight_data, const float *bias_data, const float *mean_data, const float *rstd_data, const int N, const int H, const int W, const int C, const int G, const int64_t act_fn_option, float *dx_data, float *dweight_data, float *dbias_data); +template void run_gn_bwd_kernels(const c10::Half *dy_data, const c10::Half *X_data, const c10::Half 
*weight_data, const c10::Half *bias_data, const c10::Half *mean_data, const c10::Half *rstd_data, const int N, const int H, const int W, const int C, const int G, const int64_t act_fn_option, c10::Half *dx_data, c10::Half *dweight_data, c10::Half *dbias_data); +template void run_gn_bwd_kernels(const c10::BFloat16 *dy_data, const c10::BFloat16 *X_data, const c10::BFloat16 *weight_data, const c10::BFloat16 *bias_data, const c10::BFloat16 *mean_data, const c10::BFloat16 *rstd_data, const int N, const int H, const int W, const int C, const int G, const int64_t act_fn_option, c10::BFloat16 *dx_data, c10::BFloat16 *dweight_data, c10::BFloat16 *dbias_data); diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h new file mode 100644 index 000000000..83003d25b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h @@ -0,0 +1,39 @@ +#pragma once +#ifndef FWD_GN_KERNEL_H +#define FWD_GN_KERNEL_H + +template +void run_gn_fwd_kernels( + const T *X_data, + const T *weight_data, + const T *bias_data, + const int N, + const int H, + const int W, + const int C, + const int G, + T eps, + const int64_t act_fn_option, + T *Y_data, + T *mean_data, + T *rstd_data); + +template +void run_gn_bwd_kernels( + const T *dy_data, + const T *X_data, + const T *weight_data, + const T *bias_data, + const T *mean_data, + const T *rstd_data, + const int N, + const int H, + const int W, + const int C, + const int G, + const int64_t act_fn_option, + T *dx_data, + T *dweight_data, + T *dbias_data); + +#endif diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu new file mode 100644 index 000000000..5dc5605e0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu @@ -0,0 +1,994 @@ +// Copied from https://github.com/pytorch/pytorch/blob/8852bb561cbc821ffebf395990ee12a7ea376612/aten/src/ATen/native/cuda/group_norm_kernel.cu with slight style modifications +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include + +#include + +//#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +constexpr int kCUDANumThreads = 256; +constexpr int kReduceTileSize = 32; + +template +__global__ void RowwiseMomentsCUDAKernelF( + int64_t N, + T eps, + const T* X, + T* mean, + T* rstd) { + using T_ACC = at::acc_type; + using WelfordType = at::native::WelfordData; + using WelfordOp = + at::native::WelfordOps>; + + const int64_t i = blockIdx.x; + WelfordOp welford_op = {/*correction=*/0, /*take_sqrt=*/false}; + WelfordType val(0, 0, 0, 0); + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + val = welford_op.reduce(val, static_cast(X[index]), index); + } + if (blockDim.x <= C10_WARP_SIZE) { + val = at::native::cuda_utils::WarpReduce(val, welford_op); + } else { + // There will be a warning if we declare a __shared__ WelfordType array. 
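+    // (WelfordType has a non-trivial constructor, and __shared__ variables with
+    //  non-trivial construction trigger a dynamic-initialization diagnostic, so raw
+    //  aligned storage is declared here and cast back to a WelfordType pointer below; see also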
+ // https://github.com/pytorch/pytorch/pull/13967 + __shared__ typename std::aligned_storage< + sizeof(WelfordType), + alignof(WelfordType)>::type val_shared[C10_WARP_SIZE]; + WelfordType* val_shared_ptr = reinterpret_cast(val_shared); + val = at::native::cuda_utils::BlockReduce( + val, + welford_op, + /*identity_element=*/WelfordType(0, 0, 0, 0), + val_shared_ptr); + } + if (threadIdx.x == 0) { + T_ACC m1; + T_ACC m2; + thrust::tie(m2, m1) = welford_op.project(val); + mean[i] = m1; + rstd[i] = c10::cuda::compat::rsqrt(m2 + static_cast(eps)); + } +} + +template +__global__ void ComputeFusedParamsCUDAKernelF( + int64_t N, + int64_t C, + int64_t group, + const T* mean, + const T* rstd, + const T* gamma, + const T* beta, + at::acc_type* a, + at::acc_type* b) { + using T_ACC = at::acc_type; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < N * C) { + const int64_t ng = index / (C / group); + const int64_t c = index % C; + const T_ACC scale = (gamma == nullptr) + ? static_cast(rstd[ng]) + : static_cast(rstd[ng]) * static_cast(gamma[c]); + a[index] = scale; + b[index] = -scale * static_cast(mean[ng]) + + ((beta == nullptr) ? 0 : static_cast(beta[c])); + } +} + +template +__global__ void Compute1dBackwardFusedParamsCUDAKernelF( + int64_t C, + int64_t group, + const T* dY, + const T* X, + const T* mean, + const T* rstd, + const T* gamma, + at::acc_type* c2, + at::acc_type* c3) { + using T_ACC = at::acc_type; + const int64_t G = group; + const int64_t D = C / G; + const int64_t n = blockIdx.x; + const int64_t g = blockIdx.y; + const int64_t ng = n * G + g; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = threadIdx.x; i < D; i += blockDim.x) { + const int64_t index = ng * D + i; + const int64_t c = g * D + i; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[c]); + sum1 += dY[index] * X[index] * gamma_v; + sum2 += dY[index] * gamma_v; + } + if (blockDim.x <= C10_WARP_SIZE) { + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + } else { + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + sum1 = at::native::cuda_utils::BlockReduceSum(sum1, ds_shared); + sum2 = at::native::cuda_utils::BlockReduceSum(sum2, db_shared); + } + if (threadIdx.x == 0) { + const T_ACC s = T_ACC(1) / static_cast(D); + const T_ACC x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(rstd[ng]) * static_cast(rstd[ng]) * + static_cast(rstd[ng]) * s; + //printf("\n\nng: %d, x: %f\n\n", (int)ng, x); + c2[ng] = x; + c3[ng] = -x * static_cast(mean[ng]) - + sum2 * static_cast(rstd[ng]) * s; + } +} + +template +__global__ void GammaBeta1dBackwardCUDAKernelF1( + int64_t N, + int64_t C, + int64_t group, + const T* dY, + const T* X, + const T* mean, + const T* rstd, + T* dgamma, + T* dbeta) { + using T_ACC = at::acc_type; + const int64_t c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int64_t G = group; + const int64_t D = C / G; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t n = 0; n < N; ++n) { + const int64_t nc = n * C + c; + const int64_t ng = n * G + c / D; + const T_ACC dy_acc = static_cast(dY[nc]); + const T_ACC x_acc = static_cast(X[nc]); + sum1 += (dgamma == nullptr) + ? T_ACC(0) + : ((dy_acc * x_acc - dy_acc * static_cast(mean[ng])) * + static_cast(rstd[ng])); + sum2 += (dbeta == nullptr) ? 
T_ACC(0) : dy_acc; + } + if (dgamma != nullptr) { + dgamma[c] = sum1; + } + if (dbeta != nullptr) { + dbeta[c] = sum2; + } + } +} + +template +__global__ void GammaBeta1dBackwardCUDAKernelF2( + int64_t N, + int64_t C, + int64_t group, + const T* dY, + const T* X, + const T* mean, + const T* rstd, + T* dgamma, + T* dbeta) { + using T_ACC = at::acc_type; + __shared__ T_ACC g_shared[kReduceTileSize][kReduceTileSize + 1]; + __shared__ T_ACC b_shared[kReduceTileSize][kReduceTileSize + 1]; + const int64_t c = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (c < C) { + const int64_t G = group; + const int64_t D = C / G; + // Accumulate each 32 cols into a 32 * 32 tile. + // Since the blockDim is (32, 16), accumulate twice for 1st and 2nd 16 rows + // of a 32 contiguous elements. + for (int64_t n = threadIdx.y; n < N; n += blockDim.y * 2) { + const int64_t n1 = n; + const int64_t n2 = n + blockDim.y; + const int64_t nc1 = n1 * C + c; + const int64_t nc2 = n2 * C + c; + const int64_t ng1 = n1 * G + c / D; + const int64_t ng2 = n2 * G + c / D; + const T_ACC dy1_acc = static_cast(dY[nc1]); + const T_ACC x1_acc = static_cast(X[nc1]); + dg_sum1 += dgamma == nullptr + ? T_ACC(0) + : ((dy1_acc * x1_acc - dy1_acc * static_cast(mean[ng1])) * + static_cast(rstd[ng1])); + db_sum1 += dbeta == nullptr ? T_ACC(0) : dy1_acc; + if (n2 < N) { + const T_ACC dy2_acc = static_cast(dY[nc2]); + const T_ACC x2_acc = static_cast(X[nc2]); + dg_sum2 += dgamma == nullptr + ? T_ACC(0) + : ((dy2_acc * x2_acc - dy2_acc * static_cast(mean[ng2])) * + static_cast(rstd[ng2])); + db_sum2 += dbeta == nullptr ? T_ACC(0) : dy2_acc; + } + } + } + + // Write accumulated tile to shared memory. + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + + // Do warp reduce for the 1st 16 cols in the tile. + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t c = blockIdx.x * blockDim.x + threadIdx.y; + if (c < C) { + if (dgamma != nullptr) { + dgamma[c] = sum1; + } + if (dbeta != nullptr) { + dbeta[c] = sum2; + } + } + } + + // Do warp reduce for the 2nd 16 cols in the tile. 
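+  // (note the transposed indexing relative to the writes above: the threads sharing one
+  //  threadIdx.y form a single warp and sum the 32 partial accumulations of one channel
+  //  column of the tile)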
+ sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t c = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (c < C) { + if (dgamma != nullptr) { + dgamma[c] = sum1; + } + if (dbeta != nullptr) { + dbeta[c] = sum2; + } + } + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernelF( + int64_t HxW, + const T* dY, + const T* X, + at::acc_type* ds, + at::acc_type* db) { + using T_ACC = at::acc_type; + const int64_t nc = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t hw = threadIdx.x; hw < HxW; hw += blockDim.x) { + const int64_t index = nc * HxW + hw; + sum1 += static_cast(dY[index]) * static_cast(X[index]); + sum2 += static_cast(dY[index]); + } + if (blockDim.x <= C10_WARP_SIZE) { + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + } else { + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + sum1 = at::native::cuda_utils::BlockReduceSum(sum1, ds_shared); + sum2 = at::native::cuda_utils::BlockReduceSum(sum2, db_shared); + } + if (threadIdx.x == 0) { + ds[nc] = sum1; + db[nc] = sum2; + } +} + +template +__global__ void ComputeBackwardFusedParamsCUDAKernelF( + int64_t C, + int64_t HxW, + int64_t group, + const T* mean, + const T* rstd, + const T* gamma, + const at::acc_type* ds, + const at::acc_type* db, + at::acc_type* c2, + at::acc_type* c3) { + using T_ACC = at::acc_type; + const int64_t G = group; + const int64_t D = C / G; + const int64_t n = blockIdx.x; + const int64_t g = blockIdx.y; + const int64_t ng = n * G + g; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = threadIdx.x; i < D; i += blockDim.x) { + const int64_t index = ng * D + i; + const int64_t c = g * D + i; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[c]); + sum1 += ds[index] * gamma_v; + sum2 += db[index] * gamma_v; + } + if (blockDim.x <= C10_WARP_SIZE) { + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + } else { + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + sum1 = at::native::cuda_utils::BlockReduceSum(sum1, ds_shared); + sum2 = at::native::cuda_utils::BlockReduceSum(sum2, db_shared); + } + if (threadIdx.x == 0) { + const T_ACC s = T_ACC(1) / static_cast(D * HxW); + const T_ACC x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(rstd[ng]) * static_cast(rstd[ng]) * + static_cast(rstd[ng]) * s; + c2[ng] = x; + c3[ng] = -x * static_cast(mean[ng]) - + sum2 * static_cast(rstd[ng]) * s; + } +} + +template +__global__ void GammaBetaBackwardCUDAKernelF1( + int64_t N, + int64_t C, + int64_t group, + const T* mean, + const T* rstd, + const at::acc_type* ds, + const at::acc_type* db, + T* dgamma, + T* dbeta) { + using T_ACC = at::acc_type; + const int64_t c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int64_t G = group; + const int64_t D = C / G; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t n = 0; n < N; ++n) { + const int64_t nc = n * C + c; + const int64_t ng = n * G + c / D; + sum1 += (dgamma == nullptr) + ? T_ACC(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rstd[ng])); + sum2 += (dbeta == nullptr) ? 
T_ACC(0) : db[nc]; + } + if (dgamma != nullptr) { + dgamma[c] = sum1; + } + if (dbeta != nullptr) { + dbeta[c] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernelF2( + int64_t N, + int64_t C, + int64_t group, + const T* mean, + const T* rstd, + const at::acc_type* ds, + const at::acc_type* db, + T* dgamma, + T* dbeta) { + using T_ACC = at::acc_type; + __shared__ T_ACC g_shared[kReduceTileSize][kReduceTileSize + 1]; + __shared__ T_ACC b_shared[kReduceTileSize][kReduceTileSize + 1]; + const int64_t c = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (c < C) { + const int64_t G = group; + const int64_t D = C / G; + // Accumulate each 32 cols into a 32 * 32 tile. + // Since the blockDim is (32, 16), accumulate twice for 1st and 2nd 16 rows + // of a 32 contiguous elements. + for (int64_t n = threadIdx.y; n < N; n += blockDim.y * 2) { + const int64_t n1 = n; + const int64_t n2 = n + blockDim.y; + const int64_t nc1 = n1 * C + c; + const int64_t nc2 = n2 * C + c; + const int64_t ng1 = n1 * G + c / D; + const int64_t ng2 = n2 * G + c / D; + dg_sum1 += dgamma == nullptr + ? T_ACC(0) + : ((ds[nc1] - db[nc1] * static_cast(mean[ng1])) * + static_cast(rstd[ng1])); + db_sum1 += dbeta == nullptr ? T_ACC(0) : db[nc1]; + if (n2 < N) { + dg_sum2 += dgamma == nullptr + ? T_ACC(0) + : ((ds[nc2] - db[nc2] * static_cast(mean[ng2])) * + static_cast(rstd[ng2])); + db_sum2 += dbeta == nullptr ? T_ACC(0) : db[nc2]; + } + } + } + + // Write accumulated tile to shared memory. + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + + // Do warp reduce for the 1st 16 cols in the tile. + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t c = blockIdx.x * blockDim.x + threadIdx.y; + if (c < C) { + if (dgamma != nullptr) { + dgamma[c] = sum1; + } + if (dbeta != nullptr) { + dbeta[c] = sum2; + } + } + } + + // Do warp reduce for the 2st 16 cols in the tile. 
+ sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = at::native::cuda_utils::WarpReduceSum(sum1); + sum2 = at::native::cuda_utils::WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t c = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (c < C) { + if (dgamma != nullptr) { + dgamma[c] = sum1; + } + if (dbeta != nullptr) { + dbeta[c] = sum2; + } + } + } +} + +template +void GroupNorm1dForward( + const at::Tensor& X, + const at::Tensor& mean, + const at::Tensor& rstd, + const at::Tensor& gamma, + const at::Tensor& beta, + int64_t N, + int64_t C, + int64_t group, + at::Tensor& Y) { + using T_ACC = at::acc_type; + const int64_t G = group; + const int64_t D = C / G; + if (gamma.defined() && beta.defined()) { + auto iter = at::TensorIteratorConfig() + .resize_outputs(false) + .add_owned_output(Y.view({N, G, D})) + .add_owned_input(X.view({N, G, D})) + .add_owned_input(mean.view({N, G, 1})) + .add_owned_input(rstd.view({N, G, 1})) + .add_owned_input(gamma.view({1, G, D})) + .add_owned_input(beta.view({1, G, D})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T gamma, T beta) -> T { + return (static_cast(x) - static_cast(mean)) * + static_cast(rstd) * static_cast(gamma) + + static_cast(beta); + }); + } else if (gamma.defined()) { + auto iter = at::TensorIteratorConfig() + .resize_outputs(false) + .add_owned_output(Y.view({N, G, D})) + .add_owned_input(X.view({N, G, D})) + .add_owned_input(mean.view({N, G, 1})) + .add_owned_input(rstd.view({N, G, 1})) + .add_owned_input(gamma.view({1, G, D})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T gamma) -> T { + return (static_cast(x) - static_cast(mean)) * + static_cast(rstd) * static_cast(gamma); + }); + } else if (beta.defined()) { + auto iter = at::TensorIteratorConfig() + .resize_outputs(false) + .add_owned_output(Y.view({N, G, D})) + .add_owned_input(X.view({N, G, D})) + .add_owned_input(mean.view({N, G, 1})) + .add_owned_input(rstd.view({N, G, 1})) + .add_owned_input(beta.view({1, G, D})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T beta) -> T { + return (static_cast(x) - static_cast(mean)) * + static_cast(rstd) + + static_cast(beta); + }); + } else { + auto iter = at::TensorIteratorConfig() + .resize_outputs(false) + .add_owned_output(Y.view({N * G, D})) + .add_owned_input(X.view({N * G, D})) + .add_owned_input(mean.view({N * G, 1})) + .add_owned_input(rstd.view({N * G, 1})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd) -> T { + return (static_cast(x) - static_cast(mean)) * + static_cast(rstd); + }); + } + AT_CUDA_CHECK(cudaGetLastError()); +} + +template +void GroupNormKernelImplInternal( + const at::Tensor& X, + const at::Tensor& gamma, + const at::Tensor& beta, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + T eps, + at::Tensor& Y, + at::Tensor& mean, + at::Tensor& rstd) { + using T_ACC = at::acc_type; + TORCH_CHECK(X.numel() == N * C * HxW); + TORCH_CHECK(!gamma.defined() || gamma.numel() == C); + TORCH_CHECK(!beta.defined() || beta.numel() == C); + if (N == 0) { + return; + } + const int64_t G = group; + const int64_t D = C / G; + const T* X_data = X.const_data_ptr(); + T* mean_data = mean.mutable_data_ptr(); + T* rstd_data = rstd.mutable_data_ptr(); + + cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); + const int64_t num_threads = D * HxW < at::native::cuda_utils::kCUDABlockReduceNumThreads + ? 
at::cuda::warp_size() + : at::native::cuda_utils::kCUDABlockReduceNumThreads; + RowwiseMomentsCUDAKernelF<<>>( + D * HxW, eps, X_data, mean_data, rstd_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + if (HxW == 1) { + GroupNorm1dForward(X, mean, rstd, gamma, beta, N, C, G, Y); + } else if (!gamma.defined() && !beta.defined()) { + auto iter = at::TensorIteratorConfig() + .resize_outputs(false) + .add_owned_output(Y.view({N * G, D * HxW})) + .add_owned_input(X.view({N * G, D * HxW})) + .add_owned_input(mean.view({N * G, 1})) + .add_owned_input(rstd.view({N * G, 1})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd) -> T { + return (static_cast(x) - static_cast(mean)) * + static_cast(rstd); + }); + } else { + const auto kAccType = + (X.scalar_type() == at::kHalf || X.scalar_type() == at::kBFloat16) + ? at::kFloat + : X.scalar_type(); + at::Tensor a = at::empty({N, C}, X.options().dtype(kAccType)); + at::Tensor b = at::empty({N, C}, X.options().dtype(kAccType)); + const T* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const T* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; + T_ACC* a_data = a.mutable_data_ptr(); + T_ACC* b_data = b.mutable_data_ptr(); + + // TODO: Since there is some issues in at::native::gpu_kernel_multiple_outputs, we are + // using maunal kernel here. Make it using at::native::gpu_kernel_multiple_outputs once + // the issue fixed. + const int64_t B = (N * C + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeFusedParamsCUDAKernelF<<>>( + N, C, G, mean_data, rstd_data, gamma_data, beta_data, a_data, b_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .resize_outputs(false) + .add_owned_output(Y.view({N * C, HxW})) + .add_owned_input(X.view({N * C, HxW})) + .add_owned_input(a.view({N * C, 1})) + .add_owned_input(b.view({N * C, 1})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T x, T_ACC a, T_ACC b) -> T { + return a * static_cast(x) + b; + }); + } + AT_CUDA_CHECK(cudaGetLastError()); +} + +void GroupNormKernelImpl( + const at::Tensor& X, + const at::Tensor& gamma, + const at::Tensor& beta, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + double eps, + at::Tensor& Y, + at::Tensor& mean, + at::Tensor& rstd) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + X.scalar_type(), + "GroupNormKernelImpl", + [&]() { + GroupNormKernelImplInternal( + X, + gamma, + beta, + N, + C, + HxW, + group, + static_cast(eps), + Y, + mean, + rstd); + }); +} + +template +void GroupNorm1dBackward( + const at::Tensor dY, + const at::Tensor X, + const at::Tensor mean, + const at::Tensor rstd, + const at::Tensor gamma, + int64_t N, + int64_t C, + int64_t group, + at::Tensor& dX, + at::Tensor& dgamma, + at::Tensor& dbeta) { + using T_ACC = at::acc_type; + const int64_t G = group; + const int64_t D = C / G; + const T* dY_data = dY.const_data_ptr(); + const T* X_data = X.const_data_ptr(); + const T* mean_data = mean.const_data_ptr(); + const T* rstd_data = rstd.const_data_ptr(); + + cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); + if (dX.defined()) { + const T* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const auto kAccType = + (X.scalar_type() == at::kHalf || X.scalar_type() == at::kBFloat16) + ? 
at::kFloat + : X.scalar_type(); + at::Tensor c2 = at::empty({N, G}, X.options().dtype(kAccType)); + at::Tensor c3 = at::empty({N, G}, X.options().dtype(kAccType)); + T_ACC* c2_data = c2.mutable_data_ptr(); + T_ACC* c3_data = c3.mutable_data_ptr(); + const int64_t num_threads = (C / G) < at::native::cuda_utils::kCUDABlockReduceNumThreads + ? at::cuda::warp_size() + : at::native::cuda_utils::kCUDABlockReduceNumThreads; + Compute1dBackwardFusedParamsCUDAKernelF + <<>>( + C, + G, + dY_data, + X_data, + mean_data, + rstd_data, + gamma_data, + c2_data, + c3_data); + //std::cout << "mean: " << mean << '\n'; + //std::cout << "rstd: " << rstd << '\n'; + //std::cout << "g: " << gamma << '\n'; + //std::cout << "c2: " << c2 << '\n'; + //std::cout << "c3: " << c3 << '\n'; + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + if (gamma.defined()) { + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .resize_outputs(false) + .add_owned_output(dX.view({N, G, D})) + .add_owned_input(dY.view({N, G, D})) + .add_owned_input(X.view({N, G, D})) + .add_owned_input(rstd.view({N, G, 1})) + .add_owned_input(gamma.view({1, G, D})) + .add_owned_input(c2.view({N, G, 1})) + .add_owned_input(c3.view({N, G, 1})) + .build(); + at::native::gpu_kernel( + iter, + [] GPU_LAMBDA(T dy, T x, T rstd, T gamma, T_ACC c2, T_ACC c3) -> T { + const T_ACC c1 = + static_cast(rstd) * static_cast(gamma); + return c1 * static_cast(dy) + c2 * static_cast(x) + + c3; + }); + } else { + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .resize_outputs(false) + .add_owned_output(dX.view({N * G, D})) + .add_owned_input(dY.view({N * G, D})) + .add_owned_input(X.view({N * G, D})) + .add_owned_input(rstd.view({N * G, 1})) + .add_owned_input(c2.view({N * G, 1})) + .add_owned_input(c3.view({N * G, 1})) + .build(); + at::native::gpu_kernel( + iter, [] GPU_LAMBDA(T dy, T x, T rstd, T_ACC c2, T_ACC c3) -> T { + const T_ACC c1 = static_cast(rstd); + return c1 * static_cast(dy) + c2 * static_cast(x) + + c3; + }); + } + } + if (dgamma.defined() || dbeta.defined()) { + T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; + T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; + if (N <= 128) { + const int64_t B = (C + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBeta1dBackwardCUDAKernelF1<<>>( + N, + C, + G, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; + // The algorithm for colwise reduction here is to accumulate each 32 cols + // to a 32 * 32 tile and write the tile to shared memmory. Then do warp + // reduce for each col in the tile. So here the blockDim must be (32, 16). 
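+      // kReduceTileSize == 32, so kThreadX/kThreadY below give exactly the required (32, 16) block shape.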
+ constexpr int kThreadX = kReduceTileSize; + constexpr int kThreadY = kReduceTileSize / 2; + GammaBeta1dBackwardCUDAKernelF2 + <<>>( + N, + C, + G, + dY_data, + X_data, + mean_data, + rstd_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } +} + +template +void GroupNormBackwardKernelImplInternal( + const at::Tensor& dY, + const at::Tensor& X, + const at::Tensor& mean, + const at::Tensor& rstd, + const at::Tensor& gamma, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + at::Tensor& dX, + at::Tensor& dgamma, + at::Tensor& dbeta) { + using T_ACC = at::acc_type; + const int64_t G = group; + const int64_t D = C / G; + TORCH_CHECK(dY.numel() == N * C * HxW); + TORCH_CHECK(X.numel() == N * C * HxW); + TORCH_CHECK(mean.numel() == N * G); + TORCH_CHECK(rstd.numel() == N * G); + TORCH_CHECK(!gamma.defined() || gamma.numel() == C); + cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); + + if (N == 0) { + if (dgamma.defined()) { + dgamma.fill_(T(0)); + } + if (dbeta.defined()) { + dbeta.fill_(T(0)); + } + return; + } + + const T* dY_data = dY.const_data_ptr(); + const T* X_data = X.const_data_ptr(); + const T* mean_data = mean.const_data_ptr(); + const T* rstd_data = rstd.const_data_ptr(); + const T* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const auto kAccType = + (X.scalar_type() == at::kHalf || X.scalar_type() == at::kBFloat16) + ? at::kFloat + : X.scalar_type(); + at::Tensor ds = at::empty({N, C}, X.options().dtype(kAccType)); + at::Tensor db = at::empty({N, C}, X.options().dtype(kAccType)); + T_ACC* ds_data = ds.mutable_data_ptr(); + T_ACC* db_data = db.mutable_data_ptr(); + + if (HxW == 1) { + GroupNorm1dBackward( + dY, X, mean, rstd, gamma, N, C, G, dX, dgamma, dbeta); + return; + } + + int warp_size = at::cuda::warp_size(); + int64_t num_threads = HxW < at::native::cuda_utils::kCUDABlockReduceNumThreads + ? warp_size + : at::native::cuda_utils::kCUDABlockReduceNumThreads; + ComputeInternalGradientsCUDAKernelF<<>>( + HxW, dY_data, X_data, ds_data, db_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + //std::cout << "ds: " << ds << '\n'; + //std::cout << "db: " << db << '\n'; + + if (dX.defined()) { + at::Tensor c1 = at::empty({0}, X.options().dtype(kAccType)); + at::Tensor c2 = at::empty({N, G}, X.options().dtype(kAccType)); + at::Tensor c3 = at::empty({N, G}, X.options().dtype(kAccType)); + T_ACC* c2_data = c2.mutable_data_ptr(); + T_ACC* c3_data = c3.mutable_data_ptr(); + + if (gamma.defined()) { + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .add_output(c1) + .add_owned_input(rstd.view({N, G, 1})) + .add_owned_input(gamma.view({1, G, D})) + .build(); + at::native::gpu_kernel(iter, [] GPU_LAMBDA(T rstd, T gamma) -> T_ACC { + return static_cast(rstd) * static_cast(gamma); + }); + } + + num_threads = (C / G) < at::native::cuda_utils::kCUDABlockReduceNumThreads + ? 
warp_size + : at::native::cuda_utils::kCUDABlockReduceNumThreads; + ComputeBackwardFusedParamsCUDAKernelF + <<>>( + C, + HxW, + G, + mean_data, + rstd_data, + gamma_data, + ds_data, + db_data, + c2_data, + c3_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + if (gamma.defined()) { + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .resize_outputs(false) + .add_owned_output(dX.view({N * G, D, HxW})) + .add_owned_input(dY.view({N * G, D, HxW})) + .add_owned_input(X.view({N * G, D, HxW})) + .add_owned_input(c1.view({N * G, D, 1})) + .add_owned_input(c2.view({N * G, 1, 1})) + .add_owned_input(c3.view({N * G, 1, 1})) + .build(); + at::native::gpu_kernel( + iter, [] GPU_LAMBDA(T dy, T x, T_ACC c1, T_ACC c2, T_ACC c3) -> T { + return c1 * static_cast(dy) + c2 * static_cast(x) + + c3; + }); + } else { + auto iter = at::TensorIteratorConfig() + .check_all_same_dtype(std::is_same::value) + .resize_outputs(false) + .add_owned_output(dX.view({N * G, D * HxW})) + .add_owned_input(dY.view({N * G, D * HxW})) + .add_owned_input(X.view({N * G, D * HxW})) + .add_owned_input(rstd.view({N * G, 1})) + .add_owned_input(c2.view({N * G, 1})) + .add_owned_input(c3.view({N * G, 1})) + .build(); + at::native::gpu_kernel( + iter, [] GPU_LAMBDA(T dy, T x, T_ACC c1, T_ACC c2, T_ACC c3) -> T { + return c1 * static_cast(dy) + c2 * static_cast(x) + + c3; + }); + } + } + if (dgamma.defined() || dbeta.defined()) { + T* dgamma_data = dgamma.defined() ? dgamma.mutable_data_ptr() : nullptr; + T* dbeta_data = dbeta.defined() ? dbeta.mutable_data_ptr() : nullptr; + if (N <= 128) { + // For small batch size, do colwise reduce directly. + const int64_t B = (C + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardCUDAKernelF1<<>>( + N, + C, + G, + mean_data, + rstd_data, + ds_data, + db_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; + // The algorithm for colwise reduction here is to accumulate each 32 cols + // to a 32 * 32 tile and write the tile to shared memmory. Then do warp + // reduce for each col in the tile. So here the blockDim must be (32, 16). 
+ constexpr int kThreadX = kReduceTileSize; + constexpr int kThreadY = kReduceTileSize / 2; + GammaBetaBackwardCUDAKernelF2 + <<>>( + N, + C, + G, + mean_data, + rstd_data, + ds_data, + db_data, + dgamma_data, + dbeta_data); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + } +} + +void GroupNormBackwardKernelImpl( + const at::Tensor& dY, + const at::Tensor& X, + const at::Tensor& mean, + const at::Tensor& rstd, + const at::Tensor& gamma, + int64_t N, + int64_t C, + int64_t HxW, + int64_t group, + at::Tensor& dX, + at::Tensor& dgamma, + at::Tensor& dbeta) { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + X.scalar_type(), + "GroupNormBackwardKernelImpl", + [&]() { + GroupNormBackwardKernelImplInternal( + dY, X, mean, rstd, gamma, N, C, HxW, group, dX, dgamma, dbeta); + }); +} diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h new file mode 100644 index 000000000..0b8cb7a65 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h @@ -0,0 +1,35 @@ +#pragma once +#ifndef VECS_H +#define VECS_H + +template +struct float_vec; + +template +struct alignas(1 * sizeof(T)) float_vec { + T x; + template + __host__ __device__ operator float_vec() const { + return { static_cast(x), }; + } +}; + +template +struct alignas(2 * sizeof(T)) float_vec { + T x, y; + template + __host__ __device__ operator float_vec() const { + return { static_cast(x), static_cast(y), }; + } +}; + +template +struct alignas(4 * sizeof(T)) float_vec { + T x, y, z, w; + template + __host__ __device__ operator float_vec() const { + return { static_cast(x), static_cast(y), static_cast(z), static_cast(w), }; + } +}; + +#endif diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/normalization.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/normalization.py new file mode 100644 index 000000000..036a66890 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/normalization.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numbers +from typing import Dict, Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import is_torch_version +from .activations import get_activation +from .embeddings import CombinedTimestepLabelEmbeddings, PixArtAlphaCombinedTimestepSizeEmbeddings + + +class AdaLayerNorm(nn.Module): + r""" + Norm layer modified to incorporate timestep embeddings. + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. 
+ """ + + def __init__(self, embedding_dim: int, num_embeddings: int): + super().__init__() + self.emb = nn.Embedding(num_embeddings, embedding_dim) + self.silu = nn.SiLU() + self.linear = nn.Linear(embedding_dim, embedding_dim * 2) + self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False) + + def forward(self, x: torch.Tensor, timestep: torch.Tensor) -> torch.Tensor: + emb = self.linear(self.silu(self.emb(timestep))) + scale, shift = torch.chunk(emb, 2) + x = self.norm(x) * (1 + scale) + shift + return x + + +class AdaLayerNormZero(nn.Module): + r""" + Norm layer adaptive layer norm zero (adaLN-Zero). + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. + """ + + def __init__(self, embedding_dim: int, num_embeddings: int): + super().__init__() + + self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim) + + self.silu = nn.SiLU() + self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True) + self.norm = nn.LayerNorm(embedding_dim, elementwise_affine=False, eps=1e-6) + + def forward( + self, + x: torch.Tensor, + timestep: torch.Tensor, + class_labels: torch.LongTensor, + hidden_dtype: Optional[torch.dtype] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype))) + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, dim=1) + x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None] + return x, gate_msa, shift_mlp, scale_mlp, gate_mlp + + +class AdaLayerNormSingle(nn.Module): + r""" + Norm layer adaptive layer norm single (adaLN-single). + + As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3). + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + use_additional_conditions (`bool`): To use additional conditions for normalization or not. + """ + + def __init__(self, embedding_dim: int, use_additional_conditions: bool = False): + super().__init__() + + self.emb = PixArtAlphaCombinedTimestepSizeEmbeddings( + embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions + ) + + self.silu = nn.SiLU() + self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True) + + def forward( + self, + timestep: torch.Tensor, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + batch_size: Optional[int] = None, + hidden_dtype: Optional[torch.dtype] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + # No modulation happening here. + embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype) + return self.linear(self.silu(embedded_timestep)), embedded_timestep + + +class AdaGroupNorm(nn.Module): + r""" + GroupNorm layer modified to incorporate timestep embeddings. + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. + num_groups (`int`): The number of groups to separate the channels into. + act_fn (`str`, *optional*, defaults to `None`): The activation function to use. + eps (`float`, *optional*, defaults to `1e-5`): The epsilon value to use for numerical stability. 
+ """ + + def __init__( + self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5 + ): + super().__init__() + self.num_groups = num_groups + self.eps = eps + + if act_fn is None: + self.act = None + else: + self.act = get_activation(act_fn) + + self.linear = nn.Linear(embedding_dim, out_dim * 2) + + def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor: + if self.act: + emb = self.act(emb) + emb = self.linear(emb) + emb = emb[:, :, None, None] + scale, shift = emb.chunk(2, dim=1) + + x = F.group_norm(x, self.num_groups, eps=self.eps) + x = x * (1 + scale) + shift + return x + + +class AdaLayerNormContinuous(nn.Module): + def __init__( + self, + embedding_dim: int, + conditioning_embedding_dim: int, + # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters + # because the output is immediately scaled and shifted by the projected conditioning embeddings. + # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters. + # However, this is how it was implemented in the original code, and it's rather likely you should + # set `elementwise_affine` to False. + elementwise_affine=True, + eps=1e-5, + bias=True, + norm_type="layer_norm", + ): + super().__init__() + self.silu = nn.SiLU() + self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias) + if norm_type == "layer_norm": + self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias) + elif norm_type == "rms_norm": + self.norm = RMSNorm(embedding_dim, eps, elementwise_affine) + else: + raise ValueError(f"unknown norm_type {norm_type}") + + def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor: + emb = self.linear(self.silu(conditioning_embedding)) + scale, shift = torch.chunk(emb, 2, dim=1) + x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :] + return x + + +if is_torch_version(">=", "2.1.0"): + LayerNorm = nn.LayerNorm +else: + # Has optional bias parameter compared to torch layer norm + # TODO: replace with torch layernorm once min required torch version >= 2.1 + class LayerNorm(nn.Module): + def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): + super().__init__() + + self.eps = eps + + if isinstance(dim, numbers.Integral): + dim = (dim,) + + self.dim = torch.Size(dim) + + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + self.bias = nn.Parameter(torch.zeros(dim)) if bias else None + else: + self.weight = None + self.bias = None + + def forward(self, input): + return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps) + + +class RMSNorm(nn.Module): + def __init__(self, dim, eps: float, elementwise_affine: bool = True): + super().__init__() + + self.eps = eps + + if isinstance(dim, numbers.Integral): + dim = (dim,) + + self.dim = torch.Size(dim) + + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + else: + self.weight = None + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + if self.weight is not None: + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + hidden_states = hidden_states * self.weight + else: + hidden_states = hidden_states.to(input_dtype) + + return 
hidden_states + + +class GlobalResponseNorm(nn.Module): + # Taken from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105 + def __init__(self, dim): + super().__init__() + self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim)) + + def forward(self, x): + gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True) + nx = gx / (gx.mean(dim=-1, keepdim=True) + 1e-6) + return self.gamma * (x * nx) + self.beta + x diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/prior_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/prior_transformer.py new file mode 100644 index 000000000..328835a95 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/prior_transformer.py @@ -0,0 +1,12 @@ +from ..utils import deprecate +from .transformers.prior_transformer import PriorTransformer, PriorTransformerOutput + + +class PriorTransformerOutput(PriorTransformerOutput): + deprecation_message = "Importing `PriorTransformerOutput` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformerOutput`, instead." + deprecate("PriorTransformerOutput", "0.29", deprecation_message) + + +class PriorTransformer(PriorTransformer): + deprecation_message = "Importing `PriorTransformer` from `diffusers.models.prior_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.prior_transformer import PriorTransformer`, instead." + deprecate("PriorTransformer", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet.py new file mode 100644 index 000000000..34b8eba0a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet.py @@ -0,0 +1,814 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# `TemporalConvLayer` Copyright 2024 Alibaba DAMO-VILAB, The ModelScope Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
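+
+# NOTE: relative to upstream diffusers, this copy of resnet.py can swap the
+# torch.nn.GroupNorm layers inside ResnetBlock2D for the fused NHWC GroupNorm
+# kernel GN_NHWC (imported below from .nhwc_groupnorm.custom_gn and constructed
+# with activation="silu") whenever the USE_NHWC_GN environment variable is set
+# to a non-zero value. A minimal sketch of how a caller might enable it (the
+# surrounding script is assumed, not part of this file):
+#
+#     import os
+#     os.environ["USE_NHWC_GN"] = "1"   # must be set before the blocks are constructed
+#     # ... build the UNet / pipeline as usual; each ResnetBlock2D reads the flag in __init__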
+ +from functools import partial +from typing import Optional, Tuple, Union +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import deprecate +from .activations import get_activation +from .attention_processor import SpatialNorm +from .downsampling import ( # noqa + Downsample1D, + Downsample2D, + FirDownsample2D, + KDownsample2D, + downsample_2d, +) +from .normalization import AdaGroupNorm +from .upsampling import ( # noqa + FirUpsample2D, + KUpsample2D, + Upsample1D, + Upsample2D, + upfirdn2d_native, + upsample_2d, +) + +from .nhwc_groupnorm.custom_gn import GN_NHWC + + +class ResnetBlockCondNorm2D(nn.Module): + r""" + A Resnet block that use normalization layer that incorporate conditioning information. + + Parameters: + in_channels (`int`): The number of channels in the input. + out_channels (`int`, *optional*, default to be `None`): + The number of output channels for the first conv2d layer. If None, same as `in_channels`. + dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use. + temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding. + groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer. + groups_out (`int`, *optional*, default to None): + The number of groups to use for the second normalization layer. if set to None, same as `groups`. + eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization. + non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. + time_embedding_norm (`str`, *optional*, default to `"ada_group"` ): + The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial". + kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see + [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. + output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. + use_in_shortcut (`bool`, *optional*, default to `True`): + If `True`, add a 1x1 nn.conv2d layer for skip-connection. + up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer. + down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer. + conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the + `conv_shortcut` output. + conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output. + If None, same as `out_channels`. 
+ """ + + def __init__( + self, + *, + in_channels: int, + out_channels: Optional[int] = None, + conv_shortcut: bool = False, + dropout: float = 0.0, + temb_channels: int = 512, + groups: int = 32, + groups_out: Optional[int] = None, + eps: float = 1e-6, + non_linearity: str = "swish", + time_embedding_norm: str = "ada_group", # ada_group, spatial + output_scale_factor: float = 1.0, + use_in_shortcut: Optional[bool] = None, + up: bool = False, + down: bool = False, + conv_shortcut_bias: bool = True, + conv_2d_out_channels: Optional[int] = None, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.up = up + self.down = down + self.output_scale_factor = output_scale_factor + self.time_embedding_norm = time_embedding_norm + + conv_cls = nn.Conv2d + + if groups_out is None: + groups_out = groups + + if self.time_embedding_norm == "ada_group": # ada_group + self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm1 = SpatialNorm(in_channels, temb_channels) + else: + raise ValueError(f" unsupported time_embedding_norm: {self.time_embedding_norm}") + + self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + if self.time_embedding_norm == "ada_group": # ada_group + self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) + elif self.time_embedding_norm == "spatial": # spatial + self.norm2 = SpatialNorm(out_channels, temb_channels) + else: + raise ValueError(f" unsupported time_embedding_norm: {self.time_embedding_norm}") + + self.dropout = torch.nn.Dropout(dropout) + + conv_2d_out_channels = conv_2d_out_channels or out_channels + self.conv2 = conv_cls(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1) + + self.nonlinearity = get_activation(non_linearity) + + self.upsample = self.downsample = None + if self.up: + self.upsample = Upsample2D(in_channels, use_conv=False) + elif self.down: + self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op") + + self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = conv_cls( + in_channels, + conv_2d_out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=conv_shortcut_bias, + ) + + def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states, temb) + + hidden_states = self.nonlinearity(hidden_states) + + if self.upsample is not None: + # upsample_nearest_nhwc fails with large batch sizes. 
see https://github.com/huggingface/diffusers/issues/984 + if hidden_states.shape[0] >= 64: + input_tensor = input_tensor.contiguous() + hidden_states = hidden_states.contiguous() + input_tensor = self.upsample(input_tensor) + hidden_states = self.upsample(hidden_states) + + elif self.downsample is not None: + input_tensor = self.downsample(input_tensor) + hidden_states = self.downsample(hidden_states) + + hidden_states = self.conv1(hidden_states) + + hidden_states = self.norm2(hidden_states, temb) + + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + + return output_tensor + + +class ResnetBlock2D(nn.Module): + r""" + A Resnet block. + + Parameters: + in_channels (`int`): The number of channels in the input. + out_channels (`int`, *optional*, default to be `None`): + The number of output channels for the first conv2d layer. If None, same as `in_channels`. + dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use. + temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding. + groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer. + groups_out (`int`, *optional*, default to None): + The number of groups to use for the second normalization layer. if set to None, same as `groups`. + eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization. + non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use. + time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config. + By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" + for a stronger conditioning with scale and shift. + kernel (`torch.FloatTensor`, optional, default to None): FIR filter, see + [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`]. + output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output. + use_in_shortcut (`bool`, *optional*, default to `True`): + If `True`, add a 1x1 nn.conv2d layer for skip-connection. + up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer. + down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer. + conv_shortcut_bias (`bool`, *optional*, default to `True`): If `True`, adds a learnable bias to the + `conv_shortcut` output. + conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output. + If None, same as `out_channels`. 
+ """ + + def __init__( + self, + *, + in_channels: int, + out_channels: Optional[int] = None, + conv_shortcut: bool = False, + dropout: float = 0.0, + temb_channels: int = 512, + groups: int = 32, + groups_out: Optional[int] = None, + pre_norm: bool = True, + eps: float = 1e-6, + non_linearity: str = "swish", + skip_time_act: bool = False, + time_embedding_norm: str = "default", # default, scale_shift, + kernel: Optional[torch.FloatTensor] = None, + output_scale_factor: float = 1.0, + use_in_shortcut: Optional[bool] = None, + up: bool = False, + down: bool = False, + conv_shortcut_bias: bool = True, + conv_2d_out_channels: Optional[int] = None, + ): + super().__init__() + if time_embedding_norm == "ada_group": + raise ValueError( + "This class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` instead", + ) + if time_embedding_norm == "spatial": + raise ValueError( + "This class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` instead", + ) + + self.pre_norm = True + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.up = up + self.down = down + self.output_scale_factor = output_scale_factor + self.time_embedding_norm = time_embedding_norm + self.skip_time_act = skip_time_act + + linear_cls = nn.Linear + conv_cls = nn.Conv2d + + if groups_out is None: + groups_out = groups + + self.fuse_gn_silu = True if int(os.environ.get("USE_NHWC_GN", 0)) else False + if self.fuse_gn_silu: + self.norm1 = GN_NHWC(groups, in_channels, activation="silu") + else: + self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) + + self.conv1 = conv_cls(in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + if temb_channels is not None: + if self.time_embedding_norm == "default": + self.time_emb_proj = linear_cls(temb_channels, out_channels) + elif self.time_embedding_norm == "scale_shift": + self.time_emb_proj = linear_cls(temb_channels, 2 * out_channels) + else: + raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") + else: + self.time_emb_proj = None + + if self.fuse_gn_silu: + self.norm2 = GN_NHWC(groups_out, out_channels, activation="silu") + else: + self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) + + self.dropout = torch.nn.Dropout(dropout) + conv_2d_out_channels = conv_2d_out_channels or out_channels + self.conv2 = conv_cls(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1) + + self.nonlinearity = get_activation(non_linearity) + + self.upsample = self.downsample = None + if self.up: + if kernel == "fir": + fir_kernel = (1, 3, 3, 1) + self.upsample = lambda x: upsample_2d(x, kernel=fir_kernel) + elif kernel == "sde_vp": + self.upsample = partial(F.interpolate, scale_factor=2.0, mode="nearest") + else: + self.upsample = Upsample2D(in_channels, use_conv=False) + elif self.down: + if kernel == "fir": + fir_kernel = (1, 3, 3, 1) + self.downsample = lambda x: downsample_2d(x, kernel=fir_kernel) + elif kernel == "sde_vp": + self.downsample = partial(F.avg_pool2d, kernel_size=2, stride=2) + else: + self.downsample = Downsample2D(in_channels, use_conv=False, padding=1, name="op") + + self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut + + self.conv_shortcut = None + if self.use_in_shortcut: + 
self.conv_shortcut = conv_cls( + in_channels, + conv_2d_out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=conv_shortcut_bias, + ) + + def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states) + if not self.fuse_gn_silu: + hidden_states = self.nonlinearity(hidden_states) + + if self.upsample is not None: + # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 + if hidden_states.shape[0] >= 64: + input_tensor = input_tensor.contiguous() + hidden_states = hidden_states.contiguous() + input_tensor = self.upsample(input_tensor) + hidden_states = self.upsample(hidden_states) + elif self.downsample is not None: + input_tensor = self.downsample(input_tensor) + hidden_states = self.downsample(hidden_states) + + hidden_states = self.conv1(hidden_states) + + if self.time_emb_proj is not None: + if not self.skip_time_act: + temb = self.nonlinearity(temb) + temb = self.time_emb_proj(temb)[:, :, None, None] + + if self.time_embedding_norm == "default": + if temb is not None: + hidden_states = hidden_states + temb + hidden_states = self.norm2(hidden_states) + elif self.time_embedding_norm == "scale_shift": + if temb is None: + raise ValueError( + f" `temb` should not be None when `time_embedding_norm` is {self.time_embedding_norm}" + ) + time_scale, time_shift = torch.chunk(temb, 2, dim=1) + hidden_states = self.norm2(hidden_states) + hidden_states = hidden_states * (1 + time_scale) + time_shift + else: + hidden_states = self.norm2(hidden_states) + + if not self.fuse_gn_silu: + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = (input_tensor + hidden_states) / self.output_scale_factor + + return output_tensor + + +# unet_rl.py +def rearrange_dims(tensor: torch.Tensor) -> torch.Tensor: + if len(tensor.shape) == 2: + return tensor[:, :, None] + if len(tensor.shape) == 3: + return tensor[:, :, None, :] + elif len(tensor.shape) == 4: + return tensor[:, :, 0, :] + else: + raise ValueError(f"`len(tensor)`: {len(tensor)} has to be 2, 3 or 4.") + + +class Conv1dBlock(nn.Module): + """ + Conv1d --> GroupNorm --> Mish + + Parameters: + inp_channels (`int`): Number of input channels. + out_channels (`int`): Number of output channels. + kernel_size (`int` or `tuple`): Size of the convolving kernel. + n_groups (`int`, default `8`): Number of groups to separate the channels into. + activation (`str`, defaults to `mish`): Name of the activation function. 
+ """ + + def __init__( + self, + inp_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + n_groups: int = 8, + activation: str = "mish", + ): + super().__init__() + + self.conv1d = nn.Conv1d(inp_channels, out_channels, kernel_size, padding=kernel_size // 2) + self.group_norm = nn.GroupNorm(n_groups, out_channels) + self.mish = get_activation(activation) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + intermediate_repr = self.conv1d(inputs) + intermediate_repr = rearrange_dims(intermediate_repr) + intermediate_repr = self.group_norm(intermediate_repr) + intermediate_repr = rearrange_dims(intermediate_repr) + output = self.mish(intermediate_repr) + return output + + +# unet_rl.py +class ResidualTemporalBlock1D(nn.Module): + """ + Residual 1D block with temporal convolutions. + + Parameters: + inp_channels (`int`): Number of input channels. + out_channels (`int`): Number of output channels. + embed_dim (`int`): Embedding dimension. + kernel_size (`int` or `tuple`): Size of the convolving kernel. + activation (`str`, defaults `mish`): It is possible to choose the right activation function. + """ + + def __init__( + self, + inp_channels: int, + out_channels: int, + embed_dim: int, + kernel_size: Union[int, Tuple[int, int]] = 5, + activation: str = "mish", + ): + super().__init__() + self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size) + self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size) + + self.time_emb_act = get_activation(activation) + self.time_emb = nn.Linear(embed_dim, out_channels) + + self.residual_conv = ( + nn.Conv1d(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity() + ) + + def forward(self, inputs: torch.Tensor, t: torch.Tensor) -> torch.Tensor: + """ + Args: + inputs : [ batch_size x inp_channels x horizon ] + t : [ batch_size x embed_dim ] + + returns: + out : [ batch_size x out_channels x horizon ] + """ + t = self.time_emb_act(t) + t = self.time_emb(t) + out = self.conv_in(inputs) + rearrange_dims(t) + out = self.conv_out(out) + return out + self.residual_conv(inputs) + + +class TemporalConvLayer(nn.Module): + """ + Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from: + https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016 + + Parameters: + in_dim (`int`): Number of input channels. + out_dim (`int`): Number of output channels. + dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use. 
+ """ + + def __init__( + self, + in_dim: int, + out_dim: Optional[int] = None, + dropout: float = 0.0, + norm_num_groups: int = 32, + ): + super().__init__() + out_dim = out_dim or in_dim + self.in_dim = in_dim + self.out_dim = out_dim + + # conv layers + self.conv1 = nn.Sequential( + nn.GroupNorm(norm_num_groups, in_dim), + nn.SiLU(), + nn.Conv3d(in_dim, out_dim, (3, 1, 1), padding=(1, 0, 0)), + ) + self.conv2 = nn.Sequential( + nn.GroupNorm(norm_num_groups, out_dim), + nn.SiLU(), + nn.Dropout(dropout), + nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)), + ) + self.conv3 = nn.Sequential( + nn.GroupNorm(norm_num_groups, out_dim), + nn.SiLU(), + nn.Dropout(dropout), + nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)), + ) + self.conv4 = nn.Sequential( + nn.GroupNorm(norm_num_groups, out_dim), + nn.SiLU(), + nn.Dropout(dropout), + nn.Conv3d(out_dim, in_dim, (3, 1, 1), padding=(1, 0, 0)), + ) + + # zero out the last layer params,so the conv block is identity + nn.init.zeros_(self.conv4[-1].weight) + nn.init.zeros_(self.conv4[-1].bias) + + def forward(self, hidden_states: torch.Tensor, num_frames: int = 1) -> torch.Tensor: + hidden_states = ( + hidden_states[None, :].reshape((-1, num_frames) + hidden_states.shape[1:]).permute(0, 2, 1, 3, 4) + ) + + identity = hidden_states + hidden_states = self.conv1(hidden_states) + hidden_states = self.conv2(hidden_states) + hidden_states = self.conv3(hidden_states) + hidden_states = self.conv4(hidden_states) + + hidden_states = identity + hidden_states + + hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape( + (hidden_states.shape[0] * hidden_states.shape[2], -1) + hidden_states.shape[3:] + ) + return hidden_states + + +class TemporalResnetBlock(nn.Module): + r""" + A Resnet block. + + Parameters: + in_channels (`int`): The number of channels in the input. + out_channels (`int`, *optional*, default to be `None`): + The number of output channels for the first conv2d layer. If None, same as `in_channels`. + temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding. + eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + temb_channels: int = 512, + eps: float = 1e-6, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + + kernel_size = (3, 1, 1) + padding = [k // 2 for k in kernel_size] + + self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=eps, affine=True) + self.conv1 = nn.Conv3d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding, + ) + + if temb_channels is not None: + self.time_emb_proj = nn.Linear(temb_channels, out_channels) + else: + self.time_emb_proj = None + + self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=eps, affine=True) + + self.dropout = torch.nn.Dropout(0.0) + self.conv2 = nn.Conv3d( + out_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + padding=padding, + ) + + self.nonlinearity = get_activation("silu") + + self.use_in_shortcut = self.in_channels != out_channels + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = nn.Conv3d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.conv1(hidden_states) + + if self.time_emb_proj is not None: + temb = self.nonlinearity(temb) + temb = self.time_emb_proj(temb)[:, :, :, None, None] + temb = temb.permute(0, 2, 1, 3, 4) + hidden_states = hidden_states + temb + + hidden_states = self.norm2(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = input_tensor + hidden_states + + return output_tensor + + +# VideoResBlock +class SpatioTemporalResBlock(nn.Module): + r""" + A SpatioTemporal Resnet block. + + Parameters: + in_channels (`int`): The number of channels in the input. + out_channels (`int`, *optional*, default to be `None`): + The number of output channels for the first conv2d layer. If None, same as `in_channels`. + temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding. + eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet. + temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet. + merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing. + merge_strategy (`str`, *optional*, defaults to `learned_with_images`): + The merge strategy to use for the temporal mixing. + switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`): + If `True`, switch the spatial and temporal mixing. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + temb_channels: int = 512, + eps: float = 1e-6, + temporal_eps: Optional[float] = None, + merge_factor: float = 0.5, + merge_strategy="learned_with_images", + switch_spatial_to_temporal_mix: bool = False, + ): + super().__init__() + + self.spatial_res_block = ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=eps, + ) + + self.temporal_res_block = TemporalResnetBlock( + in_channels=out_channels if out_channels is not None else in_channels, + out_channels=out_channels if out_channels is not None else in_channels, + temb_channels=temb_channels, + eps=temporal_eps if temporal_eps is not None else eps, + ) + + self.time_mixer = AlphaBlender( + alpha=merge_factor, + merge_strategy=merge_strategy, + switch_spatial_to_temporal_mix=switch_spatial_to_temporal_mix, + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + ): + num_frames = image_only_indicator.shape[-1] + hidden_states = self.spatial_res_block(hidden_states, temb) + + batch_frames, channels, height, width = hidden_states.shape + batch_size = batch_frames // num_frames + + hidden_states_mix = ( + hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4) + ) + hidden_states = ( + hidden_states[None, :].reshape(batch_size, num_frames, channels, height, width).permute(0, 2, 1, 3, 4) + ) + + if temb is not None: + temb = temb.reshape(batch_size, num_frames, -1) + + hidden_states = self.temporal_res_block(hidden_states, temb) + hidden_states = self.time_mixer( + x_spatial=hidden_states_mix, + x_temporal=hidden_states, + image_only_indicator=image_only_indicator, + ) + + hidden_states = hidden_states.permute(0, 2, 1, 3, 4).reshape(batch_frames, channels, height, width) + return hidden_states + + +class AlphaBlender(nn.Module): + r""" + A module to blend spatial and temporal features. + + Parameters: + alpha (`float`): The initial value of the blending factor. + merge_strategy (`str`, *optional*, defaults to `learned_with_images`): + The merge strategy to use for the temporal mixing. + switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`): + If `True`, switch the spatial and temporal mixing. 
+ """ + + strategies = ["learned", "fixed", "learned_with_images"] + + def __init__( + self, + alpha: float, + merge_strategy: str = "learned_with_images", + switch_spatial_to_temporal_mix: bool = False, + ): + super().__init__() + self.merge_strategy = merge_strategy + self.switch_spatial_to_temporal_mix = switch_spatial_to_temporal_mix # For TemporalVAE + + if merge_strategy not in self.strategies: + raise ValueError(f"merge_strategy needs to be in {self.strategies}") + + if self.merge_strategy == "fixed": + self.register_buffer("mix_factor", torch.Tensor([alpha])) + elif self.merge_strategy == "learned" or self.merge_strategy == "learned_with_images": + self.register_parameter("mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))) + else: + raise ValueError(f"Unknown merge strategy {self.merge_strategy}") + + def get_alpha(self, image_only_indicator: torch.Tensor, ndims: int) -> torch.Tensor: + if self.merge_strategy == "fixed": + alpha = self.mix_factor + + elif self.merge_strategy == "learned": + alpha = torch.sigmoid(self.mix_factor) + + elif self.merge_strategy == "learned_with_images": + if image_only_indicator is None: + raise ValueError("Please provide image_only_indicator to use learned_with_images merge strategy") + + alpha = torch.where( + image_only_indicator.bool(), + torch.ones(1, 1, device=image_only_indicator.device), + torch.sigmoid(self.mix_factor)[..., None], + ) + + # (batch, channel, frames, height, width) + if ndims == 5: + alpha = alpha[:, None, :, None, None] + # (batch*frames, height*width, channels) + elif ndims == 3: + alpha = alpha.reshape(-1)[:, None, None] + else: + raise ValueError(f"Unexpected ndims {ndims}. Dimensions should be 3 or 5") + + else: + raise NotImplementedError + + return alpha + + def forward( + self, + x_spatial: torch.Tensor, + x_temporal: torch.Tensor, + image_only_indicator: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + alpha = self.get_alpha(image_only_indicator, x_spatial.ndim) + alpha = alpha.to(x_spatial.dtype) + + if self.switch_spatial_to_temporal_mix: + alpha = 1.0 - alpha + + x = alpha * x_spatial + (1.0 - alpha) * x_temporal + return x diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet_flax.py new file mode 100644 index 000000000..f8bb4788d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet_flax.py @@ -0,0 +1,124 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
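The vendored resnet.py above differs from stock diffusers mainly in the USE_NHWC_GN switch, which lets ResnetBlock2D swap its separate GroupNorm + SiLU for the fused GN_NHWC layer from the bundled nhwc_groupnorm package. The sketch below shows how that switch is expected to be exercised; it is a sketch only, and the import path, the CUDA/channels-last requirements, and the tensor shapes are assumptions rather than part of the patch.

import os
import torch

# The flag is read in ResnetBlock2D.__init__, so it must be set before the
# UNet (or an individual block) is constructed.
os.environ["USE_NHWC_GN"] = "1"

from diffusers.models.resnet import ResnetBlock2D  # assumed vendored import path

block = ResnetBlock2D(in_channels=64, out_channels=64, temb_channels=512).cuda()
# GN_NHWC targets channels-last tensors; this layout requirement is an assumption here.
sample = torch.randn(2, 64, 32, 32, device="cuda").to(memory_format=torch.channels_last)
temb = torch.randn(2, 512, device="cuda")
out = block(sample, temb)  # norm1/norm2 take the fused GroupNorm+SiLU path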
+import flax.linen as nn +import jax +import jax.numpy as jnp + + +class FlaxUpsample2D(nn.Module): + out_channels: int + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = nn.Conv( + self.out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + batch, height, width, channels = hidden_states.shape + hidden_states = jax.image.resize( + hidden_states, + shape=(batch, height * 2, width * 2, channels), + method="nearest", + ) + hidden_states = self.conv(hidden_states) + return hidden_states + + +class FlaxDownsample2D(nn.Module): + out_channels: int + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = nn.Conv( + self.out_channels, + kernel_size=(3, 3), + strides=(2, 2), + padding=((1, 1), (1, 1)), # padding="VALID", + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + # pad = ((0, 0), (0, 1), (0, 1), (0, 0)) # pad height and width dim + # hidden_states = jnp.pad(hidden_states, pad_width=pad) + hidden_states = self.conv(hidden_states) + return hidden_states + + +class FlaxResnetBlock2D(nn.Module): + in_channels: int + out_channels: int = None + dropout_prob: float = 0.0 + use_nin_shortcut: bool = None + dtype: jnp.dtype = jnp.float32 + + def setup(self): + out_channels = self.in_channels if self.out_channels is None else self.out_channels + + self.norm1 = nn.GroupNorm(num_groups=32, epsilon=1e-5) + self.conv1 = nn.Conv( + out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + self.time_emb_proj = nn.Dense(out_channels, dtype=self.dtype) + + self.norm2 = nn.GroupNorm(num_groups=32, epsilon=1e-5) + self.dropout = nn.Dropout(self.dropout_prob) + self.conv2 = nn.Conv( + out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + use_nin_shortcut = self.in_channels != out_channels if self.use_nin_shortcut is None else self.use_nin_shortcut + + self.conv_shortcut = None + if use_nin_shortcut: + self.conv_shortcut = nn.Conv( + out_channels, + kernel_size=(1, 1), + strides=(1, 1), + padding="VALID", + dtype=self.dtype, + ) + + def __call__(self, hidden_states, temb, deterministic=True): + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = nn.swish(hidden_states) + hidden_states = self.conv1(hidden_states) + + temb = self.time_emb_proj(nn.swish(temb)) + temb = jnp.expand_dims(jnp.expand_dims(temb, 1), 1) + hidden_states = hidden_states + temb + + hidden_states = self.norm2(hidden_states) + hidden_states = nn.swish(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + residual = self.conv_shortcut(residual) + + return hidden_states + residual diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/t5_film_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/t5_film_transformer.py new file mode 100644 index 000000000..6aa5ff744 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/t5_film_transformer.py @@ -0,0 +1,70 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ..utils import deprecate +from .transformers.t5_film_transformer import ( + DecoderLayer, + NewGELUActivation, + T5DenseGatedActDense, + T5FilmDecoder, + T5FiLMLayer, + T5LayerCrossAttention, + T5LayerFFCond, + T5LayerNorm, + T5LayerSelfAttentionCond, +) + + +class T5FilmDecoder(T5FilmDecoder): + deprecation_message = "Importing `T5FilmDecoder` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5FilmDecoder`, instead." + deprecate("T5FilmDecoder", "0.29", deprecation_message) + + +class DecoderLayer(DecoderLayer): + deprecation_message = "Importing `DecoderLayer` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import DecoderLayer`, instead." + deprecate("DecoderLayer", "0.29", deprecation_message) + + +class T5LayerSelfAttentionCond(T5LayerSelfAttentionCond): + deprecation_message = "Importing `T5LayerSelfAttentionCond` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerSelfAttentionCond`, instead." + deprecate("T5LayerSelfAttentionCond", "0.29", deprecation_message) + + +class T5LayerCrossAttention(T5LayerCrossAttention): + deprecation_message = "Importing `T5LayerCrossAttention` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerCrossAttention`, instead." + deprecate("T5LayerCrossAttention", "0.29", deprecation_message) + + +class T5LayerFFCond(T5LayerFFCond): + deprecation_message = "Importing `T5LayerFFCond` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerFFCond`, instead." + deprecate("T5LayerFFCond", "0.29", deprecation_message) + + +class T5DenseGatedActDense(T5DenseGatedActDense): + deprecation_message = "Importing `T5DenseGatedActDense` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5DenseGatedActDense`, instead." + deprecate("T5DenseGatedActDense", "0.29", deprecation_message) + + +class T5LayerNorm(T5LayerNorm): + deprecation_message = "Importing `T5LayerNorm` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5LayerNorm`, instead." + deprecate("T5LayerNorm", "0.29", deprecation_message) + + +class NewGELUActivation(NewGELUActivation): + deprecation_message = "Importing `T5LayerNorm` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. 
Please use `from diffusers.models.transformers.t5_film_transformer import NewGELUActivation`, instead." + deprecate("NewGELUActivation", "0.29", deprecation_message) + + +class T5FiLMLayer(T5FiLMLayer): + deprecation_message = "Importing `T5FiLMLayer` from `diffusers.models.t5_film_transformer` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.t5_film_transformer import T5FiLMLayer`, instead." + deprecate("T5FiLMLayer", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_2d.py new file mode 100644 index 000000000..5d8ef1347 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_2d.py @@ -0,0 +1,25 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ..utils import deprecate +from .transformers.transformer_2d import Transformer2DModel, Transformer2DModelOutput + + +class Transformer2DModelOutput(Transformer2DModelOutput): + deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModelOutput`, instead." + deprecate("Transformer2DModelOutput", "0.29", deprecation_message) + + +class Transformer2DModel(Transformer2DModel): + deprecation_message = "Importing `Transformer2DModel` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_2d import Transformer2DModel`, instead." + deprecate("Transformer2DModel", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_temporal.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_temporal.py new file mode 100644 index 000000000..83c7a8e67 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_temporal.py @@ -0,0 +1,34 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
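The transformer_2d.py module above (like t5_film_transformer.py before it) is a pure compatibility layer: each class subclasses its relocated counterpart and calls deprecate() in the class body, so the warning is emitted as soon as the legacy module is imported. Downstream code silences it by switching to the new location, exactly as the deprecation messages state; a minimal sketch:

# Deprecated path - still resolves through the shim above, but warns at import time:
#     from diffusers.models.transformer_2d import Transformer2DModel
# Preferred path, as given in the deprecation message:
from diffusers.models.transformers.transformer_2d import Transformer2DModel  # noqa: F401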
+from ..utils import deprecate +from .transformers.transformer_temporal import ( + TransformerSpatioTemporalModel, + TransformerTemporalModel, + TransformerTemporalModelOutput, +) + + +class TransformerTemporalModelOutput(TransformerTemporalModelOutput): + deprecation_message = "Importing `TransformerTemporalModelOutput` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.tranformer_temporal import TransformerTemporalModelOutput`, instead." + deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message) + + +class TransformerTemporalModel(TransformerTemporalModel): + deprecation_message = "Importing `TransformerTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.tranformer_temporal import TransformerTemporalModel`, instead." + deprecate("TransformerTemporalModel", "0.29", deprecation_message) + + +class TransformerSpatioTemporalModel(TransformerSpatioTemporalModel): + deprecation_message = "Importing `TransformerSpatioTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.tranformer_temporal import TransformerSpatioTemporalModel`, instead." + deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/__init__.py new file mode 100644 index 000000000..dc78a72b2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/__init__.py @@ -0,0 +1,9 @@ +from ...utils import is_torch_available + + +if is_torch_available(): + from .dual_transformer_2d import DualTransformer2DModel + from .prior_transformer import PriorTransformer + from .t5_film_transformer import T5FilmDecoder + from .transformer_2d import Transformer2DModel + from .transformer_temporal import TransformerTemporalModel diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py new file mode 100644 index 000000000..96849bd28 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py @@ -0,0 +1,155 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
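The transformers/__init__.py above gates every PyTorch class behind is_torch_available(), so importing the subpackage in a torch-free environment simply registers nothing. Callers that want to degrade gracefully can mirror the same guard; the sketch below assumes the usual diffusers.utils re-export of is_torch_available.

from diffusers.utils import is_torch_available

if is_torch_available():
    from diffusers.models.transformers import Transformer2DModel
else:
    # Torch is missing: fall back to a sentinel instead of failing at import time.
    Transformer2DModel = None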
+from typing import Optional + +from torch import nn + +from .transformer_2d import Transformer2DModel, Transformer2DModelOutput + + +class DualTransformer2DModel(nn.Module): + """ + Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input and output. + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. + The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more than steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + """ + + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + ): + super().__init__() + self.transformers = nn.ModuleList( + [ + Transformer2DModel( + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + in_channels=in_channels, + num_layers=num_layers, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + sample_size=sample_size, + num_vector_embeds=num_vector_embeds, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + ) + for _ in range(2) + ] + ) + + # Variables that can be set by a pipeline: + + # The ratio of transformer1 to transformer2's output states to be combined during inference + self.mix_ratio = 0.5 + + # The shape of `encoder_hidden_states` is expected to be + # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)` + self.condition_lengths = [77, 257] + + # Which transformer to use to encode which condition. + # E.g. 
`(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])` + self.transformer_index_for_condition = [1, 0] + + def forward( + self, + hidden_states, + encoder_hidden_states, + timestep=None, + attention_mask=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): + """ + Args: + hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. + When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input + hidden_states. + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.long`, *optional*): + Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + attention_mask (`torch.FloatTensor`, *optional*): + Optional attention mask to be applied in Attention. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + + Returns: + [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: + [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + input_states = hidden_states + + encoded_states = [] + tokens_start = 0 + # attention_mask is not used yet + for i in range(2): + # for each of the two transformers, pass the corresponding condition tokens + condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]] + transformer_index = self.transformer_index_for_condition[i] + encoded_state = self.transformers[transformer_index]( + input_states, + encoder_hidden_states=condition_state, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + encoded_states.append(encoded_state - input_states) + tokens_start += self.condition_lengths[i] + + output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio) + output_states = output_states + input_states + + if not return_dict: + return (output_states,) + + return Transformer2DModelOutput(sample=output_states) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/prior_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/prior_transformer.py new file mode 100644 index 000000000..990eabe2c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/prior_transformer.py @@ -0,0 +1,380 @@ +from dataclasses import dataclass +from typing import Dict, Optional, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin +from ...utils import BaseOutput +from ..attention import BasicTransformerBlock +from ..attention_processor import ( + 
ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin + + +@dataclass +class PriorTransformerOutput(BaseOutput): + """ + The output of [`PriorTransformer`]. + + Args: + predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + The predicted CLIP image embedding conditioned on the CLIP text embedding input. + """ + + predicted_image_embedding: torch.FloatTensor + + +class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin): + """ + A Prior Transformer model. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. + num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. + embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states` + num_embeddings (`int`, *optional*, defaults to 77): + The number of embeddings of the model input `hidden_states` + additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the + projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings + + additional_embeddings`. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + time_embed_act_fn (`str`, *optional*, defaults to 'silu'): + The activation function to use to create timestep embeddings. + norm_in_type (`str`, *optional*, defaults to None): The normalization layer to apply on hidden states before + passing to Transformer blocks. Set it to `None` if normalization is not needed. + embedding_proj_norm_type (`str`, *optional*, defaults to None): + The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not + needed. + encoder_hid_proj_type (`str`, *optional*, defaults to `linear`): + The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if + `encoder_hidden_states` is `None`. + added_emb_type (`str`, *optional*, defaults to `prd`): Additional embeddings to condition the model. + Choose from `prd` or `None`. if choose `prd`, it will prepend a token indicating the (quantized) dot + product between the text embedding and image embedding as proposed in the unclip paper + https://arxiv.org/abs/2204.06125 If it is `None`, no additional embeddings will be prepended. + time_embed_dim (`int, *optional*, defaults to None): The dimension of timestep embeddings. + If None, will be set to `num_attention_heads * attention_head_dim` + embedding_proj_dim (`int`, *optional*, default to None): + The dimension of `proj_embedding`. If None, will be set to `embedding_dim`. + clip_embed_dim (`int`, *optional*, default to None): + The dimension of the output. If None, will be set to `embedding_dim`. 
+ """ + + @register_to_config + def __init__( + self, + num_attention_heads: int = 32, + attention_head_dim: int = 64, + num_layers: int = 20, + embedding_dim: int = 768, + num_embeddings=77, + additional_embeddings=4, + dropout: float = 0.0, + time_embed_act_fn: str = "silu", + norm_in_type: Optional[str] = None, # layer + embedding_proj_norm_type: Optional[str] = None, # layer + encoder_hid_proj_type: Optional[str] = "linear", # linear + added_emb_type: Optional[str] = "prd", # prd + time_embed_dim: Optional[int] = None, + embedding_proj_dim: Optional[int] = None, + clip_embed_dim: Optional[int] = None, + ): + super().__init__() + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + self.additional_embeddings = additional_embeddings + + time_embed_dim = time_embed_dim or inner_dim + embedding_proj_dim = embedding_proj_dim or embedding_dim + clip_embed_dim = clip_embed_dim or embedding_dim + + self.time_proj = Timesteps(inner_dim, True, 0) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn) + + self.proj_in = nn.Linear(embedding_dim, inner_dim) + + if embedding_proj_norm_type is None: + self.embedding_proj_norm = None + elif embedding_proj_norm_type == "layer": + self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim) + else: + raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}") + + self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim) + + if encoder_hid_proj_type is None: + self.encoder_hidden_states_proj = None + elif encoder_hid_proj_type == "linear": + self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) + else: + raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}") + + self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim)) + + if added_emb_type == "prd": + self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) + elif added_emb_type is None: + self.prd_embedding = None + else: + raise ValueError( + f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`." + ) + + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + activation_fn="gelu", + attention_bias=True, + ) + for d in range(num_layers) + ] + ) + + if norm_in_type == "layer": + self.norm_in = nn.LayerNorm(inner_dim) + elif norm_in_type is None: + self.norm_in = None + else: + raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.") + + self.norm_out = nn.LayerNorm(inner_dim) + + self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim) + + causal_attention_mask = torch.full( + [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0 + ) + causal_attention_mask.triu_(1) + causal_attention_mask = causal_attention_mask[None, ...] 
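+        # Additive causal mask: after triu_(1) only the entries strictly above the
+        # diagonal keep the -10000.0 fill, so once the mask is added to the attention
+        # scores each token can attend to itself and to earlier positions only. The
+        # extra leading dimension lets it broadcast over the batch, and it is stored
+        # as a non-persistent buffer below so it never lands in checkpoints.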
+ self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False) + + self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim)) + self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim)) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def forward( + self, + hidden_states, + timestep: Union[torch.Tensor, float, int], + proj_embedding: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.BoolTensor] = None, + return_dict: bool = True, + ): + """ + The [`PriorTransformer`] forward method. + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + The currently predicted image embeddings. + timestep (`torch.LongTensor`): + Current denoising step. + proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): + Projected embedding vector the denoising process is conditioned on. + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_embeddings, embedding_dim)`): + Hidden states of the text embeddings the denoising process is conditioned on. + attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`): + Text mask for the text embeddings. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain + tuple. + + Returns: + [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`: + If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + batch_size = hidden_states.shape[0] + + timesteps = timestep + if not torch.is_tensor(timesteps): + timesteps = torch.tensor([timesteps], dtype=torch.long, device=hidden_states.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(hidden_states.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps * torch.ones(batch_size, dtype=timesteps.dtype, device=timesteps.device) + + timesteps_projected = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might be fp16, so we need to cast here. 
+ timesteps_projected = timesteps_projected.to(dtype=self.dtype) + time_embeddings = self.time_embedding(timesteps_projected) + + if self.embedding_proj_norm is not None: + proj_embedding = self.embedding_proj_norm(proj_embedding) + + proj_embeddings = self.embedding_proj(proj_embedding) + if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: + encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) + elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None: + raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set") + + hidden_states = self.proj_in(hidden_states) + + positional_embeddings = self.positional_embedding.to(hidden_states.dtype) + + additional_embeds = [] + additional_embeddings_len = 0 + + if encoder_hidden_states is not None: + additional_embeds.append(encoder_hidden_states) + additional_embeddings_len += encoder_hidden_states.shape[1] + + if len(proj_embeddings.shape) == 2: + proj_embeddings = proj_embeddings[:, None, :] + + if len(hidden_states.shape) == 2: + hidden_states = hidden_states[:, None, :] + + additional_embeds = additional_embeds + [ + proj_embeddings, + time_embeddings[:, None, :], + hidden_states, + ] + + if self.prd_embedding is not None: + prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1) + additional_embeds.append(prd_embedding) + + hidden_states = torch.cat( + additional_embeds, + dim=1, + ) + + # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens + additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1 + if positional_embeddings.shape[1] < hidden_states.shape[1]: + positional_embeddings = F.pad( + positional_embeddings, + ( + 0, + 0, + additional_embeddings_len, + self.prd_embedding.shape[1] if self.prd_embedding is not None else 0, + ), + value=0.0, + ) + + hidden_states = hidden_states + positional_embeddings + + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0 + attention_mask = F.pad(attention_mask, (0, self.additional_embeddings), value=0.0) + attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype) + attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0) + + if self.norm_in is not None: + hidden_states = self.norm_in(hidden_states) + + for block in self.transformer_blocks: + hidden_states = block(hidden_states, attention_mask=attention_mask) + + hidden_states = self.norm_out(hidden_states) + + if self.prd_embedding is not None: + hidden_states = hidden_states[:, -1] + else: + hidden_states = hidden_states[:, additional_embeddings_len:] + + predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) + + if not return_dict: + return (predicted_image_embedding,) + + return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding) + + def post_process_latents(self, prior_latents): + prior_latents = (prior_latents * self.clip_std) + self.clip_mean + return prior_latents diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/t5_film_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/t5_film_transformer.py new file mode 100644 index 000000000..bff98db02 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/t5_film_transformer.py @@ -0,0 +1,438 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional, Tuple + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ..attention_processor import Attention +from ..embeddings import get_timestep_embedding +from ..modeling_utils import ModelMixin + + +class T5FilmDecoder(ModelMixin, ConfigMixin): + r""" + T5 style decoder with FiLM conditioning. + + Args: + input_dims (`int`, *optional*, defaults to `128`): + The number of input dimensions. + targets_length (`int`, *optional*, defaults to `256`): + The length of the targets. + d_model (`int`, *optional*, defaults to `768`): + Size of the input hidden states. + num_layers (`int`, *optional*, defaults to `12`): + The number of `DecoderLayer`'s to use. + num_heads (`int`, *optional*, defaults to `12`): + The number of attention heads to use. + d_kv (`int`, *optional*, defaults to `64`): + Size of the key-value projection vectors. + d_ff (`int`, *optional*, defaults to `2048`): + The number of dimensions in the intermediate feed-forward layer of `DecoderLayer`'s. + dropout_rate (`float`, *optional*, defaults to `0.1`): + Dropout probability. + """ + + @register_to_config + def __init__( + self, + input_dims: int = 128, + targets_length: int = 256, + max_decoder_noise_time: float = 2000.0, + d_model: int = 768, + num_layers: int = 12, + num_heads: int = 12, + d_kv: int = 64, + d_ff: int = 2048, + dropout_rate: float = 0.1, + ): + super().__init__() + + self.conditioning_emb = nn.Sequential( + nn.Linear(d_model, d_model * 4, bias=False), + nn.SiLU(), + nn.Linear(d_model * 4, d_model * 4, bias=False), + nn.SiLU(), + ) + + self.position_encoding = nn.Embedding(targets_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.continuous_inputs_projection = nn.Linear(input_dims, d_model, bias=False) + + self.dropout = nn.Dropout(p=dropout_rate) + + self.decoders = nn.ModuleList() + for lyr_num in range(num_layers): + # FiLM conditional T5 decoder + lyr = DecoderLayer(d_model=d_model, d_kv=d_kv, num_heads=num_heads, d_ff=d_ff, dropout_rate=dropout_rate) + self.decoders.append(lyr) + + self.decoder_norm = T5LayerNorm(d_model) + + self.post_dropout = nn.Dropout(p=dropout_rate) + self.spec_out = nn.Linear(d_model, input_dims, bias=False) + + def encoder_decoder_mask(self, query_input: torch.FloatTensor, key_input: torch.FloatTensor) -> torch.FloatTensor: + mask = torch.mul(query_input.unsqueeze(-1), key_input.unsqueeze(-2)) + return mask.unsqueeze(-3) + + def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time): + batch, _, _ = decoder_input_tokens.shape + assert decoder_noise_time.shape == (batch,) + + # decoder_noise_time is in [0, 1), so rescale to expected timing range. 
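For reference, a toy check of the rescaling described above (a sketch, not part of the patched file), using the config defaults from this class (`max_decoder_noise_time=2000.0`, `d_model=768`): a noise time in [0, 1) is stretched to the [0, 2000) range before being turned into a sinusoidal embedding, exactly as the call just below does.

    import torch
    from diffusers.models.embeddings import get_timestep_embedding

    noise_time = torch.tensor([0.25, 0.5])        # one value per batch item, in [0, 1)
    emb = get_timestep_embedding(
        noise_time * 2000.0,                      # rescale to the expected timing range
        embedding_dim=768,
        max_period=2000.0,
    )
    print(emb.shape)                              # torch.Size([2, 768])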
+ time_steps = get_timestep_embedding( + decoder_noise_time * self.config.max_decoder_noise_time, + embedding_dim=self.config.d_model, + max_period=self.config.max_decoder_noise_time, + ).to(dtype=self.dtype) + + conditioning_emb = self.conditioning_emb(time_steps).unsqueeze(1) + + assert conditioning_emb.shape == (batch, 1, self.config.d_model * 4) + + seq_length = decoder_input_tokens.shape[1] + + # If we want to use relative positions for audio context, we can just offset + # this sequence by the length of encodings_and_masks. + decoder_positions = torch.broadcast_to( + torch.arange(seq_length, device=decoder_input_tokens.device), + (batch, seq_length), + ) + + position_encodings = self.position_encoding(decoder_positions) + + inputs = self.continuous_inputs_projection(decoder_input_tokens) + inputs += position_encodings + y = self.dropout(inputs) + + # decoder: No padding present. + decoder_mask = torch.ones( + decoder_input_tokens.shape[:2], device=decoder_input_tokens.device, dtype=inputs.dtype + ) + + # Translate encoding masks to encoder-decoder masks. + encodings_and_encdec_masks = [(x, self.encoder_decoder_mask(decoder_mask, y)) for x, y in encodings_and_masks] + + # cross attend style: concat encodings + encoded = torch.cat([x[0] for x in encodings_and_encdec_masks], dim=1) + encoder_decoder_mask = torch.cat([x[1] for x in encodings_and_encdec_masks], dim=-1) + + for lyr in self.decoders: + y = lyr( + y, + conditioning_emb=conditioning_emb, + encoder_hidden_states=encoded, + encoder_attention_mask=encoder_decoder_mask, + )[0] + + y = self.decoder_norm(y) + y = self.post_dropout(y) + + spec_out = self.spec_out(y) + return spec_out + + +class DecoderLayer(nn.Module): + r""" + T5 decoder layer. + + Args: + d_model (`int`): + Size of the input hidden states. + d_kv (`int`): + Size of the key-value projection vectors. + num_heads (`int`): + Number of attention heads. + d_ff (`int`): + Size of the intermediate feed-forward layer. + dropout_rate (`float`): + Dropout probability. + layer_norm_epsilon (`float`, *optional*, defaults to `1e-6`): + A small value used for numerical stability to avoid dividing by zero. 
+ """ + + def __init__( + self, d_model: int, d_kv: int, num_heads: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float = 1e-6 + ): + super().__init__() + self.layer = nn.ModuleList() + + # cond self attention: layer 0 + self.layer.append( + T5LayerSelfAttentionCond(d_model=d_model, d_kv=d_kv, num_heads=num_heads, dropout_rate=dropout_rate) + ) + + # cross attention: layer 1 + self.layer.append( + T5LayerCrossAttention( + d_model=d_model, + d_kv=d_kv, + num_heads=num_heads, + dropout_rate=dropout_rate, + layer_norm_epsilon=layer_norm_epsilon, + ) + ) + + # Film Cond MLP + dropout: last layer + self.layer.append( + T5LayerFFCond(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate, layer_norm_epsilon=layer_norm_epsilon) + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + conditioning_emb: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + encoder_decoder_position_bias=None, + ) -> Tuple[torch.FloatTensor]: + hidden_states = self.layer[0]( + hidden_states, + conditioning_emb=conditioning_emb, + attention_mask=attention_mask, + ) + + if encoder_hidden_states is not None: + encoder_extended_attention_mask = torch.where(encoder_attention_mask > 0, 0, -1e10).to( + encoder_hidden_states.dtype + ) + + hidden_states = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_extended_attention_mask, + ) + + # Apply Film Conditional Feed Forward layer + hidden_states = self.layer[-1](hidden_states, conditioning_emb) + + return (hidden_states,) + + +class T5LayerSelfAttentionCond(nn.Module): + r""" + T5 style self-attention layer with conditioning. + + Args: + d_model (`int`): + Size of the input hidden states. + d_kv (`int`): + Size of the key-value projection vectors. + num_heads (`int`): + Number of attention heads. + dropout_rate (`float`): + Dropout probability. + """ + + def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float): + super().__init__() + self.layer_norm = T5LayerNorm(d_model) + self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) + self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False) + self.dropout = nn.Dropout(dropout_rate) + + def forward( + self, + hidden_states: torch.FloatTensor, + conditioning_emb: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + # pre_self_attention_layer_norm + normed_hidden_states = self.layer_norm(hidden_states) + + if conditioning_emb is not None: + normed_hidden_states = self.FiLMLayer(normed_hidden_states, conditioning_emb) + + # Self-attention block + attention_output = self.attention(normed_hidden_states) + + hidden_states = hidden_states + self.dropout(attention_output) + + return hidden_states + + +class T5LayerCrossAttention(nn.Module): + r""" + T5 style cross-attention layer. + + Args: + d_model (`int`): + Size of the input hidden states. + d_kv (`int`): + Size of the key-value projection vectors. + num_heads (`int`): + Number of attention heads. + dropout_rate (`float`): + Dropout probability. + layer_norm_epsilon (`float`): + A small value used for numerical stability to avoid dividing by zero. 
+ """ + + def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float, layer_norm_epsilon: float): + super().__init__() + self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False) + self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) + self.dropout = nn.Dropout(dropout_rate) + + def forward( + self, + hidden_states: torch.FloatTensor, + key_value_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.attention( + normed_hidden_states, + encoder_hidden_states=key_value_states, + attention_mask=attention_mask.squeeze(1), + ) + layer_output = hidden_states + self.dropout(attention_output) + return layer_output + + +class T5LayerFFCond(nn.Module): + r""" + T5 style feed-forward conditional layer. + + Args: + d_model (`int`): + Size of the input hidden states. + d_ff (`int`): + Size of the intermediate feed-forward layer. + dropout_rate (`float`): + Dropout probability. + layer_norm_epsilon (`float`): + A small value used for numerical stability to avoid dividing by zero. + """ + + def __init__(self, d_model: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float): + super().__init__() + self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate) + self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model) + self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon) + self.dropout = nn.Dropout(dropout_rate) + + def forward( + self, hidden_states: torch.FloatTensor, conditioning_emb: Optional[torch.FloatTensor] = None + ) -> torch.FloatTensor: + forwarded_states = self.layer_norm(hidden_states) + if conditioning_emb is not None: + forwarded_states = self.film(forwarded_states, conditioning_emb) + + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class T5DenseGatedActDense(nn.Module): + r""" + T5 style feed-forward layer with gated activations and dropout. + + Args: + d_model (`int`): + Size of the input hidden states. + d_ff (`int`): + Size of the intermediate feed-forward layer. + dropout_rate (`float`): + Dropout probability. + """ + + def __init__(self, d_model: int, d_ff: int, dropout_rate: float): + super().__init__() + self.wi_0 = nn.Linear(d_model, d_ff, bias=False) + self.wi_1 = nn.Linear(d_model, d_ff, bias=False) + self.wo = nn.Linear(d_ff, d_model, bias=False) + self.dropout = nn.Dropout(dropout_rate) + self.act = NewGELUActivation() + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + hidden_gelu = self.act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + + hidden_states = self.wo(hidden_states) + return hidden_states + + +class T5LayerNorm(nn.Module): + r""" + T5 style layer normalization module. + + Args: + hidden_size (`int`): + Size of the input hidden states. + eps (`float`, `optional`, defaults to `1e-6`): + A small value used for numerical stability to avoid dividing by zero. + """ + + def __init__(self, hidden_size: int, eps: float = 1e-6): + """ + Construct a layernorm module in the T5 style. No bias and no subtraction of mean. 
+ """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + # T5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean + # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus variance is calculated + # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for + # half-precision inputs is done in fp32 + + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + + return self.weight * hidden_states + + +class NewGELUActivation(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + + +class T5FiLMLayer(nn.Module): + """ + T5 style FiLM Layer. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + """ + + def __init__(self, in_features: int, out_features: int): + super().__init__() + self.scale_bias = nn.Linear(in_features, out_features * 2, bias=False) + + def forward(self, x: torch.FloatTensor, conditioning_emb: torch.FloatTensor) -> torch.FloatTensor: + emb = self.scale_bias(conditioning_emb) + scale, shift = torch.chunk(emb, 2, -1) + x = x * (1 + scale) + shift + return x diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_2d.py new file mode 100644 index 000000000..63df0dbbc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_2d.py @@ -0,0 +1,460 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from dataclasses import dataclass +from typing import Any, Dict, Optional +import os + +import torch +import torch.nn.functional as F +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput, deprecate, is_torch_version, logging +from ..attention import BasicTransformerBlock +from ..embeddings import ImagePositionalEmbeddings, PatchEmbed, PixArtAlphaTextProjection +from ..modeling_utils import ModelMixin +from ..normalization import AdaLayerNormSingle +from ..nhwc_groupnorm.custom_gn import GN_NHWC + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class Transformer2DModelOutput(BaseOutput): + """ + The output of [`Transformer2DModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): + The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability + distributions for the unnoised latent pixels. + """ + + sample: torch.FloatTensor + + +class Transformer2DModel(ModelMixin, ConfigMixin): + """ + A 2D Transformer model for image-like data. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + The number of channels in the input and output (specify if the input is **continuous**). + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). + This is fixed during training since it is used to learn a number of position embeddings. + num_vector_embeds (`int`, *optional*): + The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**). + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): + The number of diffusion steps used during training. Pass if at least one of the norm_layers is + `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are + added to the hidden states. + + During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the `TransformerBlocks` attention should contain a bias parameter. 
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen' + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + attention_type: str = "default", + caption_channels: int = None, + interpolation_scale: float = None, + ): + super().__init__() + if patch_size is not None: + if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single"]: + raise NotImplementedError( + f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'." + ) + elif norm_type in ["ada_norm", "ada_norm_zero"] and num_embeds_ada_norm is None: + raise ValueError( + f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None." + ) + + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + conv_cls = nn.Conv2d + linear_cls = nn.Linear + + # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` + # Define whether input is continuous or discrete depending on configuration + self.is_input_continuous = (in_channels is not None) and (patch_size is None) + self.is_input_vectorized = num_vector_embeds is not None + self.is_input_patches = in_channels is not None and patch_size is not None + + if norm_type == "layer_norm" and num_embeds_ada_norm is not None: + deprecation_message = ( + f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or" + " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config." + " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect" + " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it" + " would be very nice if you could open a Pull request for the `transformer/config.json` file" + ) + deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False) + norm_type = "ada_norm" + + if self.is_input_continuous and self.is_input_vectorized: + raise ValueError( + f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make" + " sure that either `in_channels` or `num_vector_embeds` is None." + ) + elif self.is_input_vectorized and self.is_input_patches: + raise ValueError( + f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make" + " sure that either `num_vector_embeds` or `num_patches` is None." 
+ ) + elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches: + raise ValueError( + f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:" + f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None." + ) + + # 2. Define input layers + if self.is_input_continuous: + self.in_channels = in_channels + if int(os.environ.get("USE_NHWC_GN", 0)): + self.norm = GN_NHWC(norm_num_groups, in_channels, activation="identity") + else: + self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) + if use_linear_projection: + self.proj_in = linear_cls(in_channels, inner_dim) + else: + self.proj_in = conv_cls(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + elif self.is_input_vectorized: + assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" + assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" + + self.height = sample_size + self.width = sample_size + self.num_vector_embeds = num_vector_embeds + self.num_latent_pixels = self.height * self.width + + self.latent_image_embedding = ImagePositionalEmbeddings( + num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width + ) + elif self.is_input_patches: + assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size" + + self.height = sample_size + self.width = sample_size + + self.patch_size = patch_size + interpolation_scale = ( + interpolation_scale if interpolation_scale is not None else max(self.config.sample_size // 64, 1) + ) + self.pos_embed = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + interpolation_scale=interpolation_scale, + ) + + # 3. Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + double_self_attention=double_self_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + attention_type=attention_type, + ) + for d in range(num_layers) + ] + ) + + # 4. 
Define output layers + self.out_channels = in_channels if out_channels is None else out_channels + if self.is_input_continuous: + # TODO: should use out_channels for continuous projections + if use_linear_projection: + self.proj_out = linear_cls(inner_dim, in_channels) + else: + self.proj_out = conv_cls(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) + elif self.is_input_vectorized: + self.norm_out = nn.LayerNorm(inner_dim) + self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1) + elif self.is_input_patches and norm_type != "ada_norm_single": + self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6) + self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim) + self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) + elif self.is_input_patches and norm_type == "ada_norm_single": + self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6) + self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5) + self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels) + + # 5. PixArt-Alpha blocks. + self.adaln_single = None + self.use_additional_conditions = False + if norm_type == "ada_norm_single": + self.use_additional_conditions = self.config.sample_size == 128 + # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use + # additional conditions until we find better name + self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions) + + self.caption_projection = None + if caption_channels is not None: + self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + timestep: Optional[torch.LongTensor] = None, + added_cond_kwargs: Dict[str, torch.Tensor] = None, + class_labels: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ): + """ + The [`Transformer2DModel`] forward method. + + Args: + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + Input `hidden_states`. + encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.LongTensor`, *optional*): + Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. + class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): + Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in + `AdaLayerZeroNorm`. + cross_attention_kwargs ( `Dict[str, Any]`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
+ attention_mask ( `torch.Tensor`, *optional*): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + encoder_attention_mask ( `torch.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. Two formats supported: + + * Mask `(batch, sequence_length)` True = keep, False = discard. + * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard. + + If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format + above. This bias will be added to the cross-attention scores. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + + Returns: + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. + """ + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. + # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. + # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None and attention_mask.ndim == 2: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: + encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 1. 
Input + if self.is_input_continuous: + batch, _, height, width = hidden_states.shape + residual = hidden_states + + hidden_states = self.norm(hidden_states) + if not self.use_linear_projection: + hidden_states = self.proj_in(hidden_states) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + else: + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + hidden_states = self.proj_in(hidden_states) + + elif self.is_input_vectorized: + hidden_states = self.latent_image_embedding(hidden_states) + elif self.is_input_patches: + height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size + hidden_states = self.pos_embed(hidden_states) + + if self.adaln_single is not None: + if self.use_additional_conditions and added_cond_kwargs is None: + raise ValueError( + "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`." + ) + batch_size = hidden_states.shape[0] + timestep, embedded_timestep = self.adaln_single( + timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype + ) + + # 2. Blocks + if self.caption_projection is not None: + batch_size = hidden_states.shape[0] + encoder_hidden_states = self.caption_projection(encoder_hidden_states) + encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1]) + + for block in self.transformer_blocks: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + timestep, + cross_attention_kwargs, + class_labels, + **ckpt_kwargs, + ) + else: + hidden_states = block( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + + # 3. 
Output + if self.is_input_continuous: + if not self.use_linear_projection: + hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() + hidden_states = self.proj_out(hidden_states) + else: + hidden_states = self.proj_out(hidden_states) + hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2) + + output = hidden_states + residual + elif self.is_input_vectorized: + hidden_states = self.norm_out(hidden_states) + logits = self.out(hidden_states) + # (batch, self.num_vector_embeds - 1, self.num_latent_pixels) + logits = logits.permute(0, 2, 1) + + # log(p(x_0)) + output = F.log_softmax(logits.double(), dim=1).float() + + if self.is_input_patches: + if self.config.norm_type != "ada_norm_single": + conditioning = self.transformer_blocks[0].norm1.emb( + timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1) + hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None] + hidden_states = self.proj_out_2(hidden_states) + elif self.config.norm_type == "ada_norm_single": + shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1) + hidden_states = self.norm_out(hidden_states) + # Modulation + hidden_states = hidden_states * (1 + scale) + shift + hidden_states = self.proj_out(hidden_states) + hidden_states = hidden_states.squeeze(1) + + # unpatchify + if self.adaln_single is None: + height = width = int(hidden_states.shape[1] ** 0.5) + hidden_states = hidden_states.reshape( + shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels) + ) + hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states) + output = hidden_states.reshape( + shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) + ) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_temporal.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_temporal.py new file mode 100644 index 000000000..9c61eaee2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_temporal.py @@ -0,0 +1,379 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
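For reference, the "unpatchify" step at the end of `Transformer2DModel.forward` above, written out standalone with toy sizes (a sketch, not part of the patched sources): each token carries a `patch_size x patch_size x out_channels` block of pixels, and the reshape/einsum folds those blocks back into an image grid.

    import torch

    batch, out_channels, patch_size = 2, 4, 2
    height = width = 3                      # number of patches per side
    tokens = height * width

    # (batch, tokens, patch*patch*channels), as produced by the output projection
    hidden_states = torch.randn(batch, tokens, patch_size * patch_size * out_channels)

    hidden_states = hidden_states.reshape(batch, height, width, patch_size, patch_size, out_channels)
    hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
    output = hidden_states.reshape(batch, out_channels, height * patch_size, width * patch_size)
    print(output.shape)                     # torch.Size([2, 4, 6, 6])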
+from dataclasses import dataclass +from typing import Any, Dict, Optional + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ..attention import BasicTransformerBlock, TemporalBasicTransformerBlock +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from ..resnet import AlphaBlender + + +@dataclass +class TransformerTemporalModelOutput(BaseOutput): + """ + The output of [`TransformerTemporalModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. + """ + + sample: torch.FloatTensor + + +class TransformerTemporalModel(ModelMixin, ConfigMixin): + """ + A Transformer model for video-like data. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + The number of channels in the input and output (specify if the input is **continuous**). + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + attention_bias (`bool`, *optional*): + Configure if the `TransformerBlock` attention should contain a bias parameter. + sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). + This is fixed during training since it is used to learn a number of position embeddings. + activation_fn (`str`, *optional*, defaults to `"geglu"`): + Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported + activation functions. + norm_elementwise_affine (`bool`, *optional*): + Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization. + double_self_attention (`bool`, *optional*): + Configure if each `TransformerBlock` should contain two self-attention layers. + positional_embeddings: (`str`, *optional*): + The type of positional embeddings to apply to the sequence input before passing use. + num_positional_embeddings: (`int`, *optional*): + The maximum length of the sequence over which to apply positional embeddings. + """ + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + activation_fn: str = "geglu", + norm_elementwise_affine: bool = True, + double_self_attention: bool = True, + positional_embeddings: Optional[str] = None, + num_positional_embeddings: Optional[int] = None, + ): + super().__init__() + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) + self.proj_in = nn.Linear(in_channels, inner_dim) + + # 3. 
Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + attention_bias=attention_bias, + double_self_attention=double_self_attention, + norm_elementwise_affine=norm_elementwise_affine, + positional_embeddings=positional_embeddings, + num_positional_embeddings=num_positional_embeddings, + ) + for d in range(num_layers) + ] + ) + + self.proj_out = nn.Linear(inner_dim, in_channels) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.LongTensor] = None, + timestep: Optional[torch.LongTensor] = None, + class_labels: torch.LongTensor = None, + num_frames: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> TransformerTemporalModelOutput: + """ + The [`TransformerTemporal`] forward method. + + Args: + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + Input hidden_states. + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.LongTensor`, *optional*): + Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. + class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): + Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in + `AdaLayerZeroNorm`. + num_frames (`int`, *optional*, defaults to 1): + The number of frames to be processed per batch. This is used to reshape the hidden states. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + + Returns: + [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: + If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is + returned, otherwise a `tuple` where the first element is the sample tensor. + """ + # 1. Input + batch_frames, channel, height, width = hidden_states.shape + batch_size = batch_frames // num_frames + + residual = hidden_states + + hidden_states = hidden_states[None, :].reshape(batch_size, num_frames, channel, height, width) + hidden_states = hidden_states.permute(0, 2, 1, 3, 4) + + hidden_states = self.norm(hidden_states) + hidden_states = hidden_states.permute(0, 3, 4, 2, 1).reshape(batch_size * height * width, num_frames, channel) + + hidden_states = self.proj_in(hidden_states) + + # 2. Blocks + for block in self.transformer_blocks: + hidden_states = block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + + # 3. 
Output + hidden_states = self.proj_out(hidden_states) + hidden_states = ( + hidden_states[None, None, :] + .reshape(batch_size, height, width, num_frames, channel) + .permute(0, 3, 4, 1, 2) + .contiguous() + ) + hidden_states = hidden_states.reshape(batch_frames, channel, height, width) + + output = hidden_states + residual + + if not return_dict: + return (output,) + + return TransformerTemporalModelOutput(sample=output) + + +class TransformerSpatioTemporalModel(nn.Module): + """ + A Transformer model for video-like data. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + The number of channels in the input and output (specify if the input is **continuous**). + out_channels (`int`, *optional*): + The number of channels in the output (specify if the input is **continuous**). + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + """ + + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: int = 320, + out_channels: Optional[int] = None, + num_layers: int = 1, + cross_attention_dim: Optional[int] = None, + ): + super().__init__() + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + + inner_dim = num_attention_heads * attention_head_dim + self.inner_dim = inner_dim + + # 2. Define input layers + self.in_channels = in_channels + self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6) + self.proj_in = nn.Linear(in_channels, inner_dim) + + # 3. Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + cross_attention_dim=cross_attention_dim, + ) + for d in range(num_layers) + ] + ) + + time_mix_inner_dim = inner_dim + self.temporal_transformer_blocks = nn.ModuleList( + [ + TemporalBasicTransformerBlock( + inner_dim, + time_mix_inner_dim, + num_attention_heads, + attention_head_dim, + cross_attention_dim=cross_attention_dim, + ) + for _ in range(num_layers) + ] + ) + + time_embed_dim = in_channels * 4 + self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels) + self.time_proj = Timesteps(in_channels, True, 0) + self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images") + + # 4. Define output layers + self.out_channels = in_channels if out_channels is None else out_channels + # TODO: should use out_channels for continuous projections + self.proj_out = nn.Linear(inner_dim, in_channels) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + return_dict: bool = True, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`): + Input hidden_states. + num_frames (`int`): + The number of frames to be processed per batch. This is used to reshape the hidden states. + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. 
+ image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`, *optional*): + A tensor indicating whether the input contains only images. 1 indicates that the input contains only + images, 0 indicates that the input contains video frames. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a plain + tuple. + + Returns: + [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: + If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is + returned, otherwise a `tuple` where the first element is the sample tensor. + """ + # 1. Input + batch_frames, _, height, width = hidden_states.shape + num_frames = image_only_indicator.shape[-1] + batch_size = batch_frames // num_frames + + time_context = encoder_hidden_states + time_context_first_timestep = time_context[None, :].reshape( + batch_size, num_frames, -1, time_context.shape[-1] + )[:, 0] + time_context = time_context_first_timestep[None, :].broadcast_to( + height * width, batch_size, 1, time_context.shape[-1] + ) + time_context = time_context.reshape(height * width * batch_size, 1, time_context.shape[-1]) + + residual = hidden_states + + hidden_states = self.norm(hidden_states) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_frames, height * width, inner_dim) + hidden_states = self.proj_in(hidden_states) + + num_frames_emb = torch.arange(num_frames, device=hidden_states.device) + num_frames_emb = num_frames_emb.repeat(batch_size, 1) + num_frames_emb = num_frames_emb.reshape(-1) + t_emb = self.time_proj(num_frames_emb) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=hidden_states.dtype) + + emb = self.time_pos_embed(t_emb) + emb = emb[:, None, :] + + # 2. Blocks + for block, temporal_block in zip(self.transformer_blocks, self.temporal_transformer_blocks): + if self.training and self.gradient_checkpointing: + hidden_states = torch.utils.checkpoint.checkpoint( + block, + hidden_states, + None, + encoder_hidden_states, + None, + use_reentrant=False, + ) + else: + hidden_states = block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states_mix = hidden_states + hidden_states_mix = hidden_states_mix + emb + + hidden_states_mix = temporal_block( + hidden_states_mix, + num_frames=num_frames, + encoder_hidden_states=time_context, + ) + hidden_states = self.time_mixer( + x_spatial=hidden_states, + x_temporal=hidden_states_mix, + image_only_indicator=image_only_indicator, + ) + + # 3. 
Output + hidden_states = self.proj_out(hidden_states) + hidden_states = hidden_states.reshape(batch_frames, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() + + output = hidden_states + residual + + if not return_dict: + return (output,) + + return TransformerTemporalModelOutput(sample=output) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d.py new file mode 100644 index 000000000..e857c90ca --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import deprecate +from .unets.unet_1d import UNet1DModel, UNet1DOutput + + +class UNet1DOutput(UNet1DOutput): + deprecation_message = "Importing `UNet1DOutput` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DOutput`, instead." + deprecate("UNet1DOutput", "0.29", deprecation_message) + + +class UNet1DModel(UNet1DModel): + deprecation_message = "Importing `UNet1DModel` from `diffusers.models.unet_1d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d import UNet1DModel`, instead." + deprecate("UNet1DModel", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d_blocks.py new file mode 100644 index 000000000..6b0f09457 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d_blocks.py @@ -0,0 +1,203 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
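For reference, the frame-mixing reshape used by `TransformerTemporalModel` earlier in this patch, condensed into a standalone sketch with toy sizes (not part of the patched sources): the `(batch * frames, C, H, W)` activations are regrouped so that every spatial location becomes its own length-`num_frames` sequence, which is what lets the transformer blocks attend across time.

    import torch

    batch_size, num_frames, channel, height, width = 2, 4, 8, 3, 3
    hidden_states = torch.randn(batch_size * num_frames, channel, height, width)

    x = hidden_states.reshape(batch_size, num_frames, channel, height, width)
    x = x.permute(0, 3, 4, 1, 2)                                    # (batch, H, W, frames, C)
    x = x.reshape(batch_size * height * width, num_frames, channel)
    print(x.shape)                                                  # torch.Size([18, 4, 8])

The inverse reshape at the end of the forward pass restores the `(batch * frames, C, H, W)` layout before the residual connection.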
+ +from ..utils import deprecate +from .unets.unet_1d_blocks import ( + AttnDownBlock1D, + AttnUpBlock1D, + DownBlock1D, + DownBlock1DNoSkip, + DownResnetBlock1D, + Downsample1d, + MidResTemporalBlock1D, + OutConv1DBlock, + OutValueFunctionBlock, + ResConvBlock, + SelfAttention1d, + UNetMidBlock1D, + UpBlock1D, + UpBlock1DNoSkip, + UpResnetBlock1D, + Upsample1d, + ValueFunctionMidBlock1D, +) + + +class DownResnetBlock1D(DownResnetBlock1D): + deprecation_message = "Importing `DownResnetBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownResnetBlock1D`, instead." + deprecate("DownResnetBlock1D", "0.29", deprecation_message) + + +class UpResnetBlock1D(UpResnetBlock1D): + deprecation_message = "Importing `UpResnetBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpResnetBlock1D`, instead." + deprecate("UpResnetBlock1D", "0.29", deprecation_message) + + +class ValueFunctionMidBlock1D(ValueFunctionMidBlock1D): + deprecation_message = "Importing `ValueFunctionMidBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import ValueFunctionMidBlock1D`, instead." + deprecate("ValueFunctionMidBlock1D", "0.29", deprecation_message) + + +class OutConv1DBlock(OutConv1DBlock): + deprecation_message = "Importing `OutConv1DBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import OutConv1DBlock`, instead." + deprecate("OutConv1DBlock", "0.29", deprecation_message) + + +class OutValueFunctionBlock(OutValueFunctionBlock): + deprecation_message = "Importing `OutValueFunctionBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import OutValueFunctionBlock`, instead." + deprecate("OutValueFunctionBlock", "0.29", deprecation_message) + + +class Downsample1d(Downsample1d): + deprecation_message = "Importing `Downsample1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import Downsample1d`, instead." + deprecate("Downsample1d", "0.29", deprecation_message) + + +class Upsample1d(Upsample1d): + deprecation_message = "Importing `Upsample1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import Upsample1d`, instead." + deprecate("Upsample1d", "0.29", deprecation_message) + + +class SelfAttention1d(SelfAttention1d): + deprecation_message = "Importing `SelfAttention1d` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import SelfAttention1d`, instead." + deprecate("SelfAttention1d", "0.29", deprecation_message) + + +class ResConvBlock(ResConvBlock): + deprecation_message = "Importing `ResConvBlock` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import ResConvBlock`, instead." 
+ deprecate("ResConvBlock", "0.29", deprecation_message) + + +class UNetMidBlock1D(UNetMidBlock1D): + deprecation_message = "Importing `UNetMidBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UNetMidBlock1D`, instead." + deprecate("UNetMidBlock1D", "0.29", deprecation_message) + + +class AttnDownBlock1D(AttnDownBlock1D): + deprecation_message = "Importing `AttnDownBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import AttnDownBlock1D`, instead." + deprecate("AttnDownBlock1D", "0.29", deprecation_message) + + +class DownBlock1D(DownBlock1D): + deprecation_message = "Importing `DownBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownBlock1D`, instead." + deprecate("DownBlock1D", "0.29", deprecation_message) + + +class DownBlock1DNoSkip(DownBlock1DNoSkip): + deprecation_message = "Importing `DownBlock1DNoSkip` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import DownBlock1DNoSkip`, instead." + deprecate("DownBlock1DNoSkip", "0.29", deprecation_message) + + +class AttnUpBlock1D(AttnUpBlock1D): + deprecation_message = "Importing `AttnUpBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import AttnUpBlock1D`, instead." + deprecate("AttnUpBlock1D", "0.29", deprecation_message) + + +class UpBlock1D(UpBlock1D): + deprecation_message = "Importing `UpBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpBlock1D`, instead." + deprecate("UpBlock1D", "0.29", deprecation_message) + + +class UpBlock1DNoSkip(UpBlock1DNoSkip): + deprecation_message = "Importing `UpBlock1DNoSkip` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import UpBlock1DNoSkip`, instead." + deprecate("UpBlock1DNoSkip", "0.29", deprecation_message) + + +class MidResTemporalBlock1D(MidResTemporalBlock1D): + deprecation_message = "Importing `MidResTemporalBlock1D` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import MidResTemporalBlock1D`, instead." + deprecate("MidResTemporalBlock1D", "0.29", deprecation_message) + + +def get_down_block( + down_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + temb_channels: int, + add_downsample: bool, +): + deprecation_message = "Importing `get_down_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_down_block`, instead." 
+ deprecate("get_down_block", "0.29", deprecation_message) + + from .unets.unet_1d_blocks import get_down_block + + return get_down_block( + down_block_type=down_block_type, + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + ) + + +def get_up_block( + up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool +): + deprecation_message = "Importing `get_up_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_up_block`, instead." + deprecate("get_up_block", "0.29", deprecation_message) + + from .unets.unet_1d_blocks import get_up_block + + return get_up_block( + up_block_type=up_block_type, + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_upsample=add_upsample, + ) + + +def get_mid_block( + mid_block_type: str, + num_layers: int, + in_channels: int, + mid_channels: int, + out_channels: int, + embed_dim: int, + add_downsample: bool, +): + deprecation_message = "Importing `get_mid_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_mid_block`, instead." + deprecate("get_mid_block", "0.29", deprecation_message) + + from .unets.unet_1d_blocks import get_mid_block + + return get_mid_block( + mid_block_type=mid_block_type, + num_layers=num_layers, + in_channels=in_channels, + mid_channels=mid_channels, + out_channels=out_channels, + embed_dim=embed_dim, + add_downsample=add_downsample, + ) + + +def get_out_block( + *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int +): + deprecation_message = "Importing `get_out_block` from `diffusers.models.unet_1d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_1d_blocks import get_out_block`, instead." + deprecate("get_out_block", "0.29", deprecation_message) + + from .unets.unet_1d_blocks import get_out_block + + return get_out_block( + out_block_type=out_block_type, + num_groups_out=num_groups_out, + embed_dim=embed_dim, + out_channels=out_channels, + act_fn=act_fn, + fc_dim=fc_dim, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d.py new file mode 100644 index 000000000..21f1fea68 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from ..utils import deprecate +from .unets.unet_2d import UNet2DModel, UNet2DOutput + + +class UNet2DOutput(UNet2DOutput): + deprecation_message = "Importing `UNet2DOutput` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DOutput`, instead." + deprecate("UNet2DOutput", "0.29", deprecation_message) + + +class UNet2DModel(UNet2DModel): + deprecation_message = "Importing `UNet2DModel` from `diffusers.models.unet_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d import UNet2DModel`, instead." + deprecate("UNet2DModel", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_blocks.py new file mode 100644 index 000000000..931fa89a7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_blocks.py @@ -0,0 +1,375 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from ..utils import deprecate +from .unets.unet_2d_blocks import ( + AttnDownBlock2D, + AttnDownEncoderBlock2D, + AttnSkipDownBlock2D, + AttnSkipUpBlock2D, + AttnUpBlock2D, + AttnUpDecoderBlock2D, + AutoencoderTinyBlock, + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + KAttentionBlock, + KCrossAttnDownBlock2D, + KCrossAttnUpBlock2D, + KDownBlock2D, + KUpBlock2D, + ResnetDownsampleBlock2D, + ResnetUpsampleBlock2D, + SimpleCrossAttnDownBlock2D, + SimpleCrossAttnUpBlock2D, + SkipDownBlock2D, + SkipUpBlock2D, + UNetMidBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + UpDecoderBlock2D, +) + + +def get_down_block( + down_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + temb_channels: int, + add_downsample: bool, + resnet_eps: float, + resnet_act_fn: str, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + downsample_padding: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = None, + downsample_type: Optional[str] = None, + dropout: float = 0.0, +): + deprecation_message = "Importing `get_down_block` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import get_down_block`, instead." 
+ deprecate("get_down_block", "0.29", deprecation_message) + + from .unets.unet_2d_blocks import get_down_block + + return get_down_block( + down_block_type=down_block_type, + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + transformer_layers_per_block=transformer_layers_per_block, + num_attention_heads=num_attention_heads, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim, + downsample_type=downsample_type, + dropout=dropout, + ) + + +def get_mid_block( + mid_block_type: str, + temb_channels: int, + in_channels: int, + resnet_eps: float, + resnet_act_fn: str, + resnet_groups: int, + output_scale_factor: float = 1.0, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + mid_block_only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = 1, + dropout: float = 0.0, +): + if mid_block_type == "UNetMidBlock2DCrossAttn": + return UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + temb_channels=temb_channels, + dropout=dropout, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + output_scale_factor=output_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + resnet_groups=resnet_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + return UNetMidBlock2DSimpleCrossAttn( + in_channels=in_channels, + temb_channels=temb_channels, + dropout=dropout, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + output_scale_factor=output_scale_factor, + cross_attention_dim=cross_attention_dim, + attention_head_dim=attention_head_dim, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif mid_block_type == "UNetMidBlock2D": + return UNetMidBlock2D( + in_channels=in_channels, + temb_channels=temb_channels, + dropout=dropout, + num_layers=0, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + output_scale_factor=output_scale_factor, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + add_attention=False, + ) + elif mid_block_type is None: + return None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + +def get_up_block( + 
up_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + add_upsample: bool, + resnet_eps: float, + resnet_act_fn: str, + resolution_idx: Optional[int] = None, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = None, + upsample_type: Optional[str] = None, + dropout: float = 0.0, +): + deprecation_message = "Importing `get_up_block` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import get_up_block`, instead." + deprecate("get_up_block", "0.29", deprecation_message) + + from .unets.unet_2d_blocks import get_up_block + + return get_up_block( + up_block_type=up_block_type, + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resolution_idx=resolution_idx, + transformer_layers_per_block=transformer_layers_per_block, + num_attention_heads=num_attention_heads, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim, + upsample_type=upsample_type, + dropout=dropout, + ) + + +class AutoencoderTinyBlock(AutoencoderTinyBlock): + deprecation_message = "Importing `AutoencoderTinyBlock` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AutoencoderTinyBlock`, instead." + deprecate("AutoencoderTinyBlock", "0.29", deprecation_message) + + +class UNetMidBlock2D(UNetMidBlock2D): + deprecation_message = "Importing `UNetMidBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2D`, instead." + deprecate("UNetMidBlock2D", "0.29", deprecation_message) + + +class UNetMidBlock2DCrossAttn(UNetMidBlock2DCrossAttn): + deprecation_message = "Importing `UNetMidBlock2DCrossAttn` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2DCrossAttn`, instead." + deprecate("UNetMidBlock2DCrossAttn", "0.29", deprecation_message) + + +class UNetMidBlock2DSimpleCrossAttn(UNetMidBlock2DSimpleCrossAttn): + deprecation_message = "Importing `UNetMidBlock2DSimpleCrossAttn` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. 
Please use `from diffusers.models.unets.unet_2d_blocks import UNetMidBlock2DSimpleCrossAttn`, instead."
+    deprecate("UNetMidBlock2DSimpleCrossAttn", "0.29", deprecation_message)
+
+
+class AttnDownBlock2D(AttnDownBlock2D):
+    deprecation_message = "Importing `AttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnDownBlock2D`, instead."
+    deprecate("AttnDownBlock2D", "0.29", deprecation_message)
+
+
+class CrossAttnDownBlock2D(CrossAttnDownBlock2D):
+    deprecation_message = "Importing `CrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import CrossAttnDownBlock2D`, instead."
+    deprecate("CrossAttnDownBlock2D", "0.29", deprecation_message)
+
+
+class DownBlock2D(DownBlock2D):
+    deprecation_message = "Importing `DownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import DownBlock2D`, instead."
+    deprecate("DownBlock2D", "0.29", deprecation_message)
+
+
+class AttnDownEncoderBlock2D(AttnDownEncoderBlock2D):
+    deprecation_message = "Importing `AttnDownEncoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnDownEncoderBlock2D`, instead."
+    deprecate("AttnDownEncoderBlock2D", "0.29", deprecation_message)
+
+
+class AttnSkipDownBlock2D(AttnSkipDownBlock2D):
+    deprecation_message = "Importing `AttnSkipDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnSkipDownBlock2D`, instead."
+    deprecate("AttnSkipDownBlock2D", "0.29", deprecation_message)
+
+
+class SkipDownBlock2D(SkipDownBlock2D):
+    deprecation_message = "Importing `SkipDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SkipDownBlock2D`, instead."
+    deprecate("SkipDownBlock2D", "0.29", deprecation_message)
+
+
+class ResnetDownsampleBlock2D(ResnetDownsampleBlock2D):
+    deprecation_message = "Importing `ResnetDownsampleBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import ResnetDownsampleBlock2D`, instead."
+    deprecate("ResnetDownsampleBlock2D", "0.29", deprecation_message)
+
+
+class SimpleCrossAttnDownBlock2D(SimpleCrossAttnDownBlock2D):
+    deprecation_message = "Importing `SimpleCrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SimpleCrossAttnDownBlock2D`, instead."
+    deprecate("SimpleCrossAttnDownBlock2D", "0.29", deprecation_message)
+
+
+class KDownBlock2D(KDownBlock2D):
+    deprecation_message = "Importing `KDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KDownBlock2D`, instead."
+ deprecate("KDownBlock2D", "0.29", deprecation_message) + + +class KCrossAttnDownBlock2D(KCrossAttnDownBlock2D): + deprecation_message = "Importing `KCrossAttnDownBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KCrossAttnDownBlock2D`, instead." + deprecate("KCrossAttnDownBlock2D", "0.29", deprecation_message) + + +class AttnUpBlock2D(AttnUpBlock2D): + deprecation_message = "Importing `AttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnUpBlock2D`, instead." + deprecate("AttnUpBlock2D", "0.29", deprecation_message) + + +class CrossAttnUpBlock2D(CrossAttnUpBlock2D): + deprecation_message = "Importing `CrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import CrossAttnUpBlock2D`, instead." + deprecate("CrossAttnUpBlock2D", "0.29", deprecation_message) + + +class UpBlock2D(UpBlock2D): + deprecation_message = "Importing `UpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UpBlock2D`, instead." + deprecate("UpBlock2D", "0.29", deprecation_message) + + +class UpDecoderBlock2D(UpDecoderBlock2D): + deprecation_message = "Importing `UpDecoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import UpDecoderBlock2D`, instead." + deprecate("UpDecoderBlock2D", "0.29", deprecation_message) + + +class AttnUpDecoderBlock2D(AttnUpDecoderBlock2D): + deprecation_message = "Importing `AttnUpDecoderBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnUpDecoderBlock2D`, instead." + deprecate("AttnUpDecoderBlock2D", "0.29", deprecation_message) + + +class AttnSkipUpBlock2D(AttnSkipUpBlock2D): + deprecation_message = "Importing `AttnSkipUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import AttnSkipUpBlock2D`, instead." + deprecate("AttnSkipUpBlock2D", "0.29", deprecation_message) + + +class SkipUpBlock2D(SkipUpBlock2D): + deprecation_message = "Importing `SkipUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import SkipUpBlock2D`, instead." + deprecate("SkipUpBlock2D", "0.29", deprecation_message) + + +class ResnetUpsampleBlock2D(ResnetUpsampleBlock2D): + deprecation_message = "Importing `ResnetUpsampleBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import ResnetUpsampleBlock2D`, instead." + deprecate("ResnetUpsampleBlock2D", "0.29", deprecation_message) + + +class SimpleCrossAttnUpBlock2D(SimpleCrossAttnUpBlock2D): + deprecation_message = "Importing `SimpleCrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. 
Please use `from diffusers.models.unets.unet_2d_blocks import SimpleCrossAttnUpBlock2D`, instead." + deprecate("SimpleCrossAttnUpBlock2D", "0.29", deprecation_message) + + +class KUpBlock2D(KUpBlock2D): + deprecation_message = "Importing `KUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KUpBlock2D`, instead." + deprecate("KUpBlock2D", "0.29", deprecation_message) + + +class KCrossAttnUpBlock2D(KCrossAttnUpBlock2D): + deprecation_message = "Importing `KCrossAttnUpBlock2D` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KCrossAttnUpBlock2D`, instead." + deprecate("KCrossAttnUpBlock2D", "0.29", deprecation_message) + + +# can potentially later be renamed to `No-feed-forward` attention +class KAttentionBlock(KAttentionBlock): + deprecation_message = "Importing `KAttentionBlock` from `diffusers.models.unet_2d_blocks` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_blocks import KAttentionBlock`, instead." + deprecate("KAttentionBlock", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_condition.py new file mode 100644 index 000000000..85a3e7b09 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_condition.py @@ -0,0 +1,25 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ..utils import deprecate +from .unets.unet_2d_condition import UNet2DConditionModel, UNet2DConditionOutput + + +class UNet2DConditionOutput(UNet2DConditionOutput): + deprecation_message = "Importing `UNet2DConditionOutput` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionOutput`, instead." + deprecate("UNet2DConditionOutput", "0.29", deprecation_message) + + +class UNet2DConditionModel(UNet2DConditionModel): + deprecation_message = "Importing `UNet2DConditionModel` from `diffusers.models.unet_2d_condition` is deprecated and this will be removed in a future version. Please use `from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel`, instead." 
+ deprecate("UNet2DConditionModel", "0.29", deprecation_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/__init__.py new file mode 100644 index 000000000..9ef04fb62 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/__init__.py @@ -0,0 +1,18 @@ +from ...utils import is_flax_available, is_torch_available + + +if is_torch_available(): + from .unet_1d import UNet1DModel + from .unet_2d import UNet2DModel + from .unet_2d_condition import UNet2DConditionModel + from .unet_3d_condition import UNet3DConditionModel + from .unet_i2vgen_xl import I2VGenXLUNet + from .unet_kandinsky3 import Kandinsky3UNet + from .unet_motion_model import MotionAdapter, UNetMotionModel + from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel + from .unet_stable_cascade import StableCascadeUNet + from .uvit_2d import UVit2DModel + + +if is_flax_available(): + from .unet_2d_condition_flax import FlaxUNet2DConditionModel diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d.py new file mode 100644 index 000000000..59d70f67c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d.py @@ -0,0 +1,255 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ..embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block + + +@dataclass +class UNet1DOutput(BaseOutput): + """ + The output of [`UNet1DModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`): + The hidden states output from the last layer of the model. + """ + + sample: torch.FloatTensor + + +class UNet1DModel(ModelMixin, ConfigMixin): + r""" + A 1D UNet model that takes a noisy sample and a timestep and returns a sample shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int`, *optional*): Default length of sample. Should be adaptable at runtime. + in_channels (`int`, *optional*, defaults to 2): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 2): Number of channels in the output. 
+ extra_in_channels (`int`, *optional*, defaults to 0): + Number of additional channels to be added to the input of the first down block. Useful for cases where the + input data has more channels than what the model was initially designed for. + time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use. + freq_shift (`float`, *optional*, defaults to 0.0): Frequency shift for Fourier time embedding. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip sin to cos for Fourier time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D")`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip")`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(32, 32, 64)`): + Tuple of block output channels. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock1D"`): Block type for middle of UNet. + out_block_type (`str`, *optional*, defaults to `None`): Optional output processing block of UNet. + act_fn (`str`, *optional*, defaults to `None`): Optional activation function in UNet blocks. + norm_num_groups (`int`, *optional*, defaults to 8): The number of groups for normalization. + layers_per_block (`int`, *optional*, defaults to 1): The number of layers per block. + downsample_each_block (`int`, *optional*, defaults to `False`): + Experimental feature for using a UNet without upsampling. + """ + + @register_to_config + def __init__( + self, + sample_size: int = 65536, + sample_rate: Optional[int] = None, + in_channels: int = 2, + out_channels: int = 2, + extra_in_channels: int = 0, + time_embedding_type: str = "fourier", + flip_sin_to_cos: bool = True, + use_timestep_embedding: bool = False, + freq_shift: float = 0.0, + down_block_types: Tuple[str] = ("DownBlock1DNoSkip", "DownBlock1D", "AttnDownBlock1D"), + up_block_types: Tuple[str] = ("AttnUpBlock1D", "UpBlock1D", "UpBlock1DNoSkip"), + mid_block_type: Tuple[str] = "UNetMidBlock1D", + out_block_type: str = None, + block_out_channels: Tuple[int] = (32, 32, 64), + act_fn: str = None, + norm_num_groups: int = 8, + layers_per_block: int = 1, + downsample_each_block: bool = False, + ): + super().__init__() + self.sample_size = sample_size + + # time + if time_embedding_type == "fourier": + self.time_proj = GaussianFourierProjection( + embedding_size=8, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = 2 * block_out_channels[0] + elif time_embedding_type == "positional": + self.time_proj = Timesteps( + block_out_channels[0], flip_sin_to_cos=flip_sin_to_cos, downscale_freq_shift=freq_shift + ) + timestep_input_dim = block_out_channels[0] + + if use_timestep_embedding: + time_embed_dim = block_out_channels[0] * 4 + self.time_mlp = TimestepEmbedding( + in_channels=timestep_input_dim, + time_embed_dim=time_embed_dim, + act_fn=act_fn, + out_dim=block_out_channels[0], + ) + + self.down_blocks = nn.ModuleList([]) + self.mid_block = None + self.up_blocks = nn.ModuleList([]) + self.out_block = None + + # down + output_channel = in_channels + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + + if i == 0: + input_channel += extra_in_channels + + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + 
in_channels=input_channel, + out_channels=output_channel, + temb_channels=block_out_channels[0], + add_downsample=not is_final_block or downsample_each_block, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = get_mid_block( + mid_block_type, + in_channels=block_out_channels[-1], + mid_channels=block_out_channels[-1], + out_channels=block_out_channels[-1], + embed_dim=block_out_channels[0], + num_layers=layers_per_block, + add_downsample=downsample_each_block, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + if out_block_type is None: + final_upsample_channels = out_channels + else: + final_upsample_channels = block_out_channels[0] + + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = ( + reversed_block_out_channels[i + 1] if i < len(up_block_types) - 1 else final_upsample_channels + ) + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block, + in_channels=prev_output_channel, + out_channels=output_channel, + temb_channels=block_out_channels[0], + add_upsample=not is_final_block, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) + self.out_block = get_out_block( + out_block_type=out_block_type, + num_groups_out=num_groups_out, + embed_dim=block_out_channels[0], + out_channels=out_channels, + act_fn=act_fn, + fc_dim=block_out_channels[-1] // 4, + ) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + return_dict: bool = True, + ) -> Union[UNet1DOutput, Tuple]: + r""" + The [`UNet1DModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple. + + Returns: + [`~models.unet_1d.UNet1DOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is + returned where the first element is the sample tensor. + """ + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + timestep_embed = self.time_proj(timesteps) + if self.config.use_timestep_embedding: + timestep_embed = self.time_mlp(timestep_embed) + else: + timestep_embed = timestep_embed[..., None] + timestep_embed = timestep_embed.repeat([1, 1, sample.shape[2]]).to(sample.dtype) + timestep_embed = timestep_embed.broadcast_to((sample.shape[:1] + timestep_embed.shape[1:])) + + # 2. down + down_block_res_samples = () + for downsample_block in self.down_blocks: + sample, res_samples = downsample_block(hidden_states=sample, temb=timestep_embed) + down_block_res_samples += res_samples + + # 3. mid + if self.mid_block: + sample = self.mid_block(sample, timestep_embed) + + # 4. 
up + for i, upsample_block in enumerate(self.up_blocks): + res_samples = down_block_res_samples[-1:] + down_block_res_samples = down_block_res_samples[:-1] + sample = upsample_block(sample, res_hidden_states_tuple=res_samples, temb=timestep_embed) + + # 5. post-process + if self.out_block: + sample = self.out_block(sample, timestep_embed) + + if not return_dict: + return (sample,) + + return UNet1DOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d_blocks.py new file mode 100644 index 000000000..e3163cd1d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d_blocks.py @@ -0,0 +1,702 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + +from ..activations import get_activation +from ..resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims + + +class DownResnetBlock1D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + num_layers: int = 1, + conv_shortcut: bool = False, + temb_channels: int = 32, + groups: int = 32, + groups_out: Optional[int] = None, + non_linearity: Optional[str] = None, + time_embedding_norm: str = "default", + output_scale_factor: float = 1.0, + add_downsample: bool = True, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + self.time_embedding_norm = time_embedding_norm + self.add_downsample = add_downsample + self.output_scale_factor = output_scale_factor + + if groups_out is None: + groups_out = groups + + # there will always be at least one resnet + resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=temb_channels)] + + for _ in range(num_layers): + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) + + self.resnets = nn.ModuleList(resnets) + + if non_linearity is None: + self.nonlinearity = None + else: + self.nonlinearity = get_activation(non_linearity) + + self.downsample = None + if add_downsample: + self.downsample = Downsample1D(out_channels, use_conv=True, padding=1) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + output_states = () + + hidden_states = self.resnets[0](hidden_states, temb) + for resnet in self.resnets[1:]: + hidden_states = resnet(hidden_states, temb) + + output_states += (hidden_states,) + + if self.nonlinearity is not None: + hidden_states = self.nonlinearity(hidden_states) + + if self.downsample is not None: + hidden_states = 
self.downsample(hidden_states) + + return hidden_states, output_states + + +class UpResnetBlock1D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: Optional[int] = None, + num_layers: int = 1, + temb_channels: int = 32, + groups: int = 32, + groups_out: Optional[int] = None, + non_linearity: Optional[str] = None, + time_embedding_norm: str = "default", + output_scale_factor: float = 1.0, + add_upsample: bool = True, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.time_embedding_norm = time_embedding_norm + self.add_upsample = add_upsample + self.output_scale_factor = output_scale_factor + + if groups_out is None: + groups_out = groups + + # there will always be at least one resnet + resnets = [ResidualTemporalBlock1D(2 * in_channels, out_channels, embed_dim=temb_channels)] + + for _ in range(num_layers): + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=temb_channels)) + + self.resnets = nn.ModuleList(resnets) + + if non_linearity is None: + self.nonlinearity = None + else: + self.nonlinearity = get_activation(non_linearity) + + self.upsample = None + if add_upsample: + self.upsample = Upsample1D(out_channels, use_conv_transpose=True) + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Optional[Tuple[torch.FloatTensor, ...]] = None, + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if res_hidden_states_tuple is not None: + res_hidden_states = res_hidden_states_tuple[-1] + hidden_states = torch.cat((hidden_states, res_hidden_states), dim=1) + + hidden_states = self.resnets[0](hidden_states, temb) + for resnet in self.resnets[1:]: + hidden_states = resnet(hidden_states, temb) + + if self.nonlinearity is not None: + hidden_states = self.nonlinearity(hidden_states) + + if self.upsample is not None: + hidden_states = self.upsample(hidden_states) + + return hidden_states + + +class ValueFunctionMidBlock1D(nn.Module): + def __init__(self, in_channels: int, out_channels: int, embed_dim: int): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.embed_dim = embed_dim + + self.res1 = ResidualTemporalBlock1D(in_channels, in_channels // 2, embed_dim=embed_dim) + self.down1 = Downsample1D(out_channels // 2, use_conv=True) + self.res2 = ResidualTemporalBlock1D(in_channels // 2, in_channels // 4, embed_dim=embed_dim) + self.down2 = Downsample1D(out_channels // 4, use_conv=True) + + def forward(self, x: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + x = self.res1(x, temb) + x = self.down1(x) + x = self.res2(x, temb) + x = self.down2(x) + return x + + +class MidResTemporalBlock1D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + embed_dim: int, + num_layers: int = 1, + add_downsample: bool = False, + add_upsample: bool = False, + non_linearity: Optional[str] = None, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.add_downsample = add_downsample + + # there will always be at least one resnet + resnets = [ResidualTemporalBlock1D(in_channels, out_channels, embed_dim=embed_dim)] + + for _ in range(num_layers): + resnets.append(ResidualTemporalBlock1D(out_channels, out_channels, embed_dim=embed_dim)) + + self.resnets = nn.ModuleList(resnets) + + if non_linearity is None: + self.nonlinearity = None + else: + 
self.nonlinearity = get_activation(non_linearity) + + self.upsample = None + if add_upsample: + self.upsample = Downsample1D(out_channels, use_conv=True) + + self.downsample = None + if add_downsample: + self.downsample = Downsample1D(out_channels, use_conv=True) + + if self.upsample and self.downsample: + raise ValueError("Block cannot downsample and upsample") + + def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = self.resnets[0](hidden_states, temb) + for resnet in self.resnets[1:]: + hidden_states = resnet(hidden_states, temb) + + if self.upsample: + hidden_states = self.upsample(hidden_states) + if self.downsample: + self.downsample = self.downsample(hidden_states) + + return hidden_states + + +class OutConv1DBlock(nn.Module): + def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_fn: str): + super().__init__() + self.final_conv1d_1 = nn.Conv1d(embed_dim, embed_dim, 5, padding=2) + self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim) + self.final_conv1d_act = get_activation(act_fn) + self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = self.final_conv1d_1(hidden_states) + hidden_states = rearrange_dims(hidden_states) + hidden_states = self.final_conv1d_gn(hidden_states) + hidden_states = rearrange_dims(hidden_states) + hidden_states = self.final_conv1d_act(hidden_states) + hidden_states = self.final_conv1d_2(hidden_states) + return hidden_states + + +class OutValueFunctionBlock(nn.Module): + def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"): + super().__init__() + self.final_block = nn.ModuleList( + [ + nn.Linear(fc_dim + embed_dim, fc_dim // 2), + get_activation(act_fn), + nn.Linear(fc_dim // 2, 1), + ] + ) + + def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = hidden_states.view(hidden_states.shape[0], -1) + hidden_states = torch.cat((hidden_states, temb), dim=-1) + for layer in self.final_block: + hidden_states = layer(hidden_states) + + return hidden_states + + +_kernels = { + "linear": [1 / 8, 3 / 8, 3 / 8, 1 / 8], + "cubic": [-0.01171875, -0.03515625, 0.11328125, 0.43359375, 0.43359375, 0.11328125, -0.03515625, -0.01171875], + "lanczos3": [ + 0.003689131001010537, + 0.015056144446134567, + -0.03399861603975296, + -0.066637322306633, + 0.13550527393817902, + 0.44638532400131226, + 0.44638532400131226, + 0.13550527393817902, + -0.066637322306633, + -0.03399861603975296, + 0.015056144446134567, + 0.003689131001010537, + ], +} + + +class Downsample1d(nn.Module): + def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor(_kernels[kernel]) + self.pad = kernel_1d.shape[0] // 2 - 1 + self.register_buffer("kernel", kernel_1d) + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = F.pad(hidden_states, (self.pad,) * 2, self.pad_mode) + weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]]) + indices = torch.arange(hidden_states.shape[1], device=hidden_states.device) + kernel = self.kernel.to(weight)[None, :].expand(hidden_states.shape[1], -1) + weight[indices, indices] = kernel + return F.conv1d(hidden_states, weight, stride=2) + + +class Upsample1d(nn.Module): + def __init__(self, 
kernel: str = "linear", pad_mode: str = "reflect"): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor(_kernels[kernel]) * 2 + self.pad = kernel_1d.shape[0] // 2 - 1 + self.register_buffer("kernel", kernel_1d) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = F.pad(hidden_states, ((self.pad + 1) // 2,) * 2, self.pad_mode) + weight = hidden_states.new_zeros([hidden_states.shape[1], hidden_states.shape[1], self.kernel.shape[0]]) + indices = torch.arange(hidden_states.shape[1], device=hidden_states.device) + kernel = self.kernel.to(weight)[None, :].expand(hidden_states.shape[1], -1) + weight[indices, indices] = kernel + return F.conv_transpose1d(hidden_states, weight, stride=2, padding=self.pad * 2 + 1) + + +class SelfAttention1d(nn.Module): + def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0): + super().__init__() + self.channels = in_channels + self.group_norm = nn.GroupNorm(1, num_channels=in_channels) + self.num_heads = n_head + + self.query = nn.Linear(self.channels, self.channels) + self.key = nn.Linear(self.channels, self.channels) + self.value = nn.Linear(self.channels, self.channels) + + self.proj_attn = nn.Linear(self.channels, self.channels, bias=True) + + self.dropout = nn.Dropout(dropout_rate, inplace=True) + + def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor: + new_projection_shape = projection.size()[:-1] + (self.num_heads, -1) + # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D) + new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3) + return new_projection + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + residual = hidden_states + batch, channel_dim, seq = hidden_states.shape + + hidden_states = self.group_norm(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + + query_proj = self.query(hidden_states) + key_proj = self.key(hidden_states) + value_proj = self.value(hidden_states) + + query_states = self.transpose_for_scores(query_proj) + key_states = self.transpose_for_scores(key_proj) + value_states = self.transpose_for_scores(value_proj) + + scale = 1 / math.sqrt(math.sqrt(key_states.shape[-1])) + + attention_scores = torch.matmul(query_states * scale, key_states.transpose(-1, -2) * scale) + attention_probs = torch.softmax(attention_scores, dim=-1) + + # compute attention output + hidden_states = torch.matmul(attention_probs, value_states) + + hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous() + new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,) + hidden_states = hidden_states.view(new_hidden_states_shape) + + # compute next hidden_states + hidden_states = self.proj_attn(hidden_states) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = self.dropout(hidden_states) + + output = hidden_states + residual + + return output + + +class ResConvBlock(nn.Module): + def __init__(self, in_channels: int, mid_channels: int, out_channels: int, is_last: bool = False): + super().__init__() + self.is_last = is_last + self.has_conv_skip = in_channels != out_channels + + if self.has_conv_skip: + self.conv_skip = nn.Conv1d(in_channels, out_channels, 1, bias=False) + + self.conv_1 = nn.Conv1d(in_channels, mid_channels, 5, padding=2) + self.group_norm_1 = nn.GroupNorm(1, mid_channels) + self.gelu_1 = nn.GELU() + self.conv_2 = nn.Conv1d(mid_channels, out_channels, 5, padding=2) + + if not self.is_last: 
+ self.group_norm_2 = nn.GroupNorm(1, out_channels) + self.gelu_2 = nn.GELU() + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + residual = self.conv_skip(hidden_states) if self.has_conv_skip else hidden_states + + hidden_states = self.conv_1(hidden_states) + hidden_states = self.group_norm_1(hidden_states) + hidden_states = self.gelu_1(hidden_states) + hidden_states = self.conv_2(hidden_states) + + if not self.is_last: + hidden_states = self.group_norm_2(hidden_states) + hidden_states = self.gelu_2(hidden_states) + + output = hidden_states + residual + return output + + +class UNetMidBlock1D(nn.Module): + def __init__(self, mid_channels: int, in_channels: int, out_channels: Optional[int] = None): + super().__init__() + + out_channels = in_channels if out_channels is None else out_channels + + # there is always at least one resnet + self.down = Downsample1d("cubic") + resnets = [ + ResConvBlock(in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels), + ] + attentions = [ + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(out_channels, out_channels // 32), + ] + self.up = Upsample1d(kernel="cubic") + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = self.down(hidden_states) + for attn, resnet in zip(self.attentions, self.resnets): + hidden_states = resnet(hidden_states) + hidden_states = attn(hidden_states) + + hidden_states = self.up(hidden_states) + + return hidden_states + + +class AttnDownBlock1D(nn.Module): + def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None): + super().__init__() + mid_channels = out_channels if mid_channels is None else mid_channels + + self.down = Downsample1d("cubic") + resnets = [ + ResConvBlock(in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels), + ] + attentions = [ + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(out_channels, out_channels // 32), + ] + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = self.down(hidden_states) + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states) + hidden_states = attn(hidden_states) + + return hidden_states, (hidden_states,) + + +class DownBlock1D(nn.Module): + def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None): + super().__init__() + mid_channels = out_channels if mid_channels is None else mid_channels + + self.down = Downsample1d("cubic") + resnets = [ + ResConvBlock(in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, 
mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels), + ] + + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = self.down(hidden_states) + + for resnet in self.resnets: + hidden_states = resnet(hidden_states) + + return hidden_states, (hidden_states,) + + +class DownBlock1DNoSkip(nn.Module): + def __init__(self, out_channels: int, in_channels: int, mid_channels: Optional[int] = None): + super().__init__() + mid_channels = out_channels if mid_channels is None else mid_channels + + resnets = [ + ResConvBlock(in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels), + ] + + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = torch.cat([hidden_states, temb], dim=1) + for resnet in self.resnets: + hidden_states = resnet(hidden_states) + + return hidden_states, (hidden_states,) + + +class AttnUpBlock1D(nn.Module): + def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None): + super().__init__() + mid_channels = out_channels if mid_channels is None else mid_channels + + resnets = [ + ResConvBlock(2 * in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels), + ] + attentions = [ + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(mid_channels, mid_channels // 32), + SelfAttention1d(out_channels, out_channels // 32), + ] + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.up = Upsample1d(kernel="cubic") + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + res_hidden_states = res_hidden_states_tuple[-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states) + hidden_states = attn(hidden_states) + + hidden_states = self.up(hidden_states) + + return hidden_states + + +class UpBlock1D(nn.Module): + def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None): + super().__init__() + mid_channels = in_channels if mid_channels is None else mid_channels + + resnets = [ + ResConvBlock(2 * in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels), + ] + + self.resnets = nn.ModuleList(resnets) + self.up = Upsample1d(kernel="cubic") + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + res_hidden_states = res_hidden_states_tuple[-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + for resnet in self.resnets: + hidden_states = resnet(hidden_states) + + hidden_states = self.up(hidden_states) + + return hidden_states + + +class UpBlock1DNoSkip(nn.Module): + def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None): + super().__init__() + mid_channels = in_channels if mid_channels is None 
else mid_channels + + resnets = [ + ResConvBlock(2 * in_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, mid_channels), + ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True), + ] + + self.resnets = nn.ModuleList(resnets) + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + res_hidden_states = res_hidden_states_tuple[-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + for resnet in self.resnets: + hidden_states = resnet(hidden_states) + + return hidden_states + + +DownBlockType = Union[DownResnetBlock1D, DownBlock1D, AttnDownBlock1D, DownBlock1DNoSkip] +MidBlockType = Union[MidResTemporalBlock1D, ValueFunctionMidBlock1D, UNetMidBlock1D] +OutBlockType = Union[OutConv1DBlock, OutValueFunctionBlock] +UpBlockType = Union[UpResnetBlock1D, UpBlock1D, AttnUpBlock1D, UpBlock1DNoSkip] + + +def get_down_block( + down_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + temb_channels: int, + add_downsample: bool, +) -> DownBlockType: + if down_block_type == "DownResnetBlock1D": + return DownResnetBlock1D( + in_channels=in_channels, + num_layers=num_layers, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + ) + elif down_block_type == "DownBlock1D": + return DownBlock1D(out_channels=out_channels, in_channels=in_channels) + elif down_block_type == "AttnDownBlock1D": + return AttnDownBlock1D(out_channels=out_channels, in_channels=in_channels) + elif down_block_type == "DownBlock1DNoSkip": + return DownBlock1DNoSkip(out_channels=out_channels, in_channels=in_channels) + raise ValueError(f"{down_block_type} does not exist.") + + +def get_up_block( + up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool +) -> UpBlockType: + if up_block_type == "UpResnetBlock1D": + return UpResnetBlock1D( + in_channels=in_channels, + num_layers=num_layers, + out_channels=out_channels, + temb_channels=temb_channels, + add_upsample=add_upsample, + ) + elif up_block_type == "UpBlock1D": + return UpBlock1D(in_channels=in_channels, out_channels=out_channels) + elif up_block_type == "AttnUpBlock1D": + return AttnUpBlock1D(in_channels=in_channels, out_channels=out_channels) + elif up_block_type == "UpBlock1DNoSkip": + return UpBlock1DNoSkip(in_channels=in_channels, out_channels=out_channels) + raise ValueError(f"{up_block_type} does not exist.") + + +def get_mid_block( + mid_block_type: str, + num_layers: int, + in_channels: int, + mid_channels: int, + out_channels: int, + embed_dim: int, + add_downsample: bool, +) -> MidBlockType: + if mid_block_type == "MidResTemporalBlock1D": + return MidResTemporalBlock1D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + embed_dim=embed_dim, + add_downsample=add_downsample, + ) + elif mid_block_type == "ValueFunctionMidBlock1D": + return ValueFunctionMidBlock1D(in_channels=in_channels, out_channels=out_channels, embed_dim=embed_dim) + elif mid_block_type == "UNetMidBlock1D": + return UNetMidBlock1D(in_channels=in_channels, mid_channels=mid_channels, out_channels=out_channels) + raise ValueError(f"{mid_block_type} does not exist.") + + +def get_out_block( + *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int +) -> Optional[OutBlockType]: + if out_block_type == 
"OutConv1DBlock": + return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn) + elif out_block_type == "ValueFunction": + return OutValueFunctionBlock(fc_dim, embed_dim, act_fn) + return None diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d.py new file mode 100644 index 000000000..5efb63822 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d.py @@ -0,0 +1,346 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ..embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block + + +@dataclass +class UNet2DOutput(BaseOutput): + """ + The output of [`UNet2DModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output from the last layer of the model. + """ + + sample: torch.FloatTensor + + +class UNet2DModel(ModelMixin, ConfigMixin): + r""" + A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) - + 1)`. + in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 3): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use. + freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding. + flip_sin_to_cos (`bool`, *optional*, defaults to `True`): + Whether to flip sin to cos for Fourier time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`): + Tuple of downsample block types. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`): + Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`. + up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`): + Tuple of upsample block types. 
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`): + Tuple of block output channels. + layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block. + mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block. + downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution. + downsample_type (`str`, *optional*, defaults to `conv`): + The downsample type for downsampling layers. Choose between "conv" and "resnet" + upsample_type (`str`, *optional*, defaults to `conv`): + The upsample type for upsampling layers. Choose between "conv" and "resnet" + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension. + norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization. + attn_norm_num_groups (`int`, *optional*, defaults to `None`): + If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the + given number of groups. If left as `None`, the group norm layer will only be created if + `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups. + norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization. + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, or `"identity"`. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class + conditioning with `class_embed_type` equal to `None`. + """ + + @register_to_config + def __init__( + self, + sample_size: Optional[Union[int, Tuple[int, int]]] = None, + in_channels: int = 3, + out_channels: int = 3, + center_input_sample: bool = False, + time_embedding_type: str = "positional", + freq_shift: int = 0, + flip_sin_to_cos: bool = True, + down_block_types: Tuple[str, ...] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"), + up_block_types: Tuple[str, ...] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), + block_out_channels: Tuple[int, ...] = (224, 448, 672, 896), + layers_per_block: int = 2, + mid_block_scale_factor: float = 1, + downsample_padding: int = 1, + downsample_type: str = "conv", + upsample_type: str = "conv", + dropout: float = 0.0, + act_fn: str = "silu", + attention_head_dim: Optional[int] = 8, + norm_num_groups: int = 32, + attn_norm_num_groups: Optional[int] = None, + norm_eps: float = 1e-5, + resnet_time_scale_shift: str = "default", + add_attention: bool = True, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + num_train_timesteps: Optional[int] = None, + ): + super().__init__() + + self.sample_size = sample_size + time_embed_dim = block_out_channels[0] * 4 + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. 
`up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + # input + self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) + + # time + if time_embedding_type == "fourier": + self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) + timestep_input_dim = 2 * block_out_channels[0] + elif time_embedding_type == "positional": + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + elif time_embedding_type == "learned": + self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0]) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + else: + self.class_embedding = None + + self.down_blocks = nn.ModuleList([]) + self.mid_block = None + self.up_blocks = nn.ModuleList([]) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + downsample_type=downsample_type, + dropout=dropout, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + dropout=dropout, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1], + resnet_groups=norm_num_groups, + attn_groups=attn_norm_num_groups, + add_attention=add_attention, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + 
resnet_groups=norm_num_groups, + attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel, + resnet_time_scale_shift=resnet_time_scale_shift, + upsample_type=upsample_type, + dropout=dropout, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + class_labels: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DOutput, Tuple]: + r""" + The [`UNet2DModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + class_labels (`torch.FloatTensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple. + + Returns: + [`~models.unet_2d.UNet2DOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is + returned where the first element is the sample tensor. + """ + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + emb = self.time_embedding(t_emb) + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when doing class conditioning") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + elif self.class_embedding is None and class_labels is not None: + raise ValueError("class_embedding needs to be initialized in order to use class conditioning") + + # 2. pre-process + skip_sample = sample + sample = self.conv_in(sample) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "skip_conv"): + sample, res_samples, skip_sample = downsample_block( + hidden_states=sample, temb=emb, skip_sample=skip_sample + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. 
mid + sample = self.mid_block(sample, emb) + + # 5. up + skip_sample = None + for upsample_block in self.up_blocks: + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + if hasattr(upsample_block, "skip_conv"): + sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample) + else: + sample = upsample_block(sample, res_samples, emb) + + # 6. post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if skip_sample is not None: + sample += skip_sample + + if self.config.time_embedding_type == "fourier": + timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:])))) + sample = sample / timesteps + + if not return_dict: + return (sample,) + + return UNet2DOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks.py new file mode 100644 index 000000000..b9e9e63bb --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks.py @@ -0,0 +1,3731 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
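For orientation, a minimal usage sketch of the `UNet2DModel` vendored in `unet_2d.py` above. The import path assumes the `src/diffusers` tree added by this patch is importable as the `diffusers` package, and the sample size, channel widths, block types, and timestep value are illustrative choices rather than values taken from this patch:

    import torch
    from diffusers.models.unets.unet_2d import UNet2DModel

    # A deliberately small configuration: two resolutions, attention only at the lower one.
    model = UNet2DModel(
        sample_size=32,
        in_channels=3,
        out_channels=3,
        down_block_types=("DownBlock2D", "AttnDownBlock2D"),
        up_block_types=("AttnUpBlock2D", "UpBlock2D"),
        block_out_channels=(32, 64),
        layers_per_block=1,
    )

    noisy_sample = torch.randn(1, 3, 32, 32)           # (batch, channels, height, width)
    timestep = torch.tensor([10])                       # broadcast to the batch inside forward()
    prediction = model(noisy_sample, timestep).sample   # UNet2DOutput.sample, shape (1, 3, 32, 32)

The `.sample` field comes from the `UNet2DOutput` dataclass defined in the same file; passing `return_dict=False` instead returns a plain one-element tuple.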
+from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from ...utils import deprecate, is_torch_version, logging +from ...utils.torch_utils import apply_freeu +from ..activations import get_activation +from ..attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0 +from ..normalization import AdaGroupNorm +from ..resnet import ( + Downsample2D, + FirDownsample2D, + FirUpsample2D, + KDownsample2D, + KUpsample2D, + ResnetBlock2D, + ResnetBlockCondNorm2D, + Upsample2D, +) +from ..transformers.dual_transformer_2d import DualTransformer2DModel +from ..transformers.transformer_2d import Transformer2DModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_down_block( + down_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + temb_channels: int, + add_downsample: bool, + resnet_eps: float, + resnet_act_fn: str, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + downsample_padding: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = None, + downsample_type: Optional[str] = None, + dropout: float = 0.0, +): + # If attn head dim is not defined, we default it to the number of heads + if attention_head_dim is None: + logger.warning( + f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}." 
+ ) + attention_head_dim = num_attention_heads + + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type + if down_block_type == "DownBlock2D": + return DownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "ResnetDownsampleBlock2D": + return ResnetDownsampleBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + output_scale_factor=resnet_out_scale_factor, + ) + elif down_block_type == "AttnDownBlock2D": + if add_downsample is False: + downsample_type = None + else: + downsample_type = downsample_type or "conv" # default to 'conv' + return AttnDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + downsample_type=downsample_type, + ) + elif down_block_type == "CrossAttnDownBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") + return CrossAttnDownBlock2D( + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + ) + elif down_block_type == "SimpleCrossAttnDownBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnDownBlock2D") + return SimpleCrossAttnDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + output_scale_factor=resnet_out_scale_factor, + only_cross_attention=only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif down_block_type == "SkipDownBlock2D": + return SkipDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + 
resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "AttnSkipDownBlock2D": + return AttnSkipDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "DownEncoderBlock2D": + return DownEncoderBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "AttnDownEncoderBlock2D": + return AttnDownEncoderBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "KDownBlock2D": + return KDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + ) + elif down_block_type == "KCrossAttnDownBlock2D": + return KCrossAttnDownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + cross_attention_dim=cross_attention_dim, + attention_head_dim=attention_head_dim, + add_self_attention=True if not add_downsample else False, + ) + raise ValueError(f"{down_block_type} does not exist.") + + +def get_mid_block( + mid_block_type: str, + temb_channels: int, + in_channels: int, + resnet_eps: float, + resnet_act_fn: str, + resnet_groups: int, + output_scale_factor: float = 1.0, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + mid_block_only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = 1, + dropout: float = 0.0, +): + if mid_block_type == "UNetMidBlock2DCrossAttn": + return UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + temb_channels=temb_channels, + dropout=dropout, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + output_scale_factor=output_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + resnet_groups=resnet_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + elif 
mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + return UNetMidBlock2DSimpleCrossAttn( + in_channels=in_channels, + temb_channels=temb_channels, + dropout=dropout, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + output_scale_factor=output_scale_factor, + cross_attention_dim=cross_attention_dim, + attention_head_dim=attention_head_dim, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif mid_block_type == "UNetMidBlock2D": + return UNetMidBlock2D( + in_channels=in_channels, + temb_channels=temb_channels, + dropout=dropout, + num_layers=0, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + output_scale_factor=output_scale_factor, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + add_attention=False, + ) + elif mid_block_type is None: + return None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + +def get_up_block( + up_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + add_upsample: bool, + resnet_eps: float, + resnet_act_fn: str, + resolution_idx: Optional[int] = None, + transformer_layers_per_block: int = 1, + num_attention_heads: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + attention_type: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + cross_attention_norm: Optional[str] = None, + attention_head_dim: Optional[int] = None, + upsample_type: Optional[str] = None, + dropout: float = 0.0, +) -> nn.Module: + # If attn head dim is not defined, we default it to the number of heads + if attention_head_dim is None: + logger.warning( + f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}." 
+ ) + attention_head_dim = num_attention_heads + + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + if up_block_type == "UpBlock2D": + return UpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "ResnetUpsampleBlock2D": + return ResnetUpsampleBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + output_scale_factor=resnet_out_scale_factor, + ) + elif up_block_type == "CrossAttnUpBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") + return CrossAttnUpBlock2D( + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + ) + elif up_block_type == "SimpleCrossAttnUpBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for SimpleCrossAttnUpBlock2D") + return SimpleCrossAttnUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + output_scale_factor=resnet_out_scale_factor, + only_cross_attention=only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif up_block_type == "AttnUpBlock2D": + if add_upsample is False: + upsample_type = None + else: + upsample_type = upsample_type or "conv" # default to 'conv' + + return AttnUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + upsample_type=upsample_type, + ) + elif up_block_type == "SkipUpBlock2D": + return 
SkipUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "AttnSkipUpBlock2D": + return AttnSkipUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "UpDecoderBlock2D": + return UpDecoderBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels, + ) + elif up_block_type == "AttnUpDecoderBlock2D": + return AttnUpDecoderBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + attention_head_dim=attention_head_dim, + resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels, + ) + elif up_block_type == "KUpBlock2D": + return KUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + ) + elif up_block_type == "KCrossAttnUpBlock2D": + return KCrossAttnUpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + cross_attention_dim=cross_attention_dim, + attention_head_dim=attention_head_dim, + ) + + raise ValueError(f"{up_block_type} does not exist.") + + +class AutoencoderTinyBlock(nn.Module): + """ + Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU + blocks. + + Args: + in_channels (`int`): The number of input channels. + out_channels (`int`): The number of output channels. + act_fn (`str`): + ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`. + + Returns: + `torch.FloatTensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to + `out_channels`. 
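Example (an illustrative sketch: the channel counts, the spatial size, and the import path, which assumes the vendored `diffusers` layout added by this patch, are assumptions rather than requirements):

    import torch
    from diffusers.models.unets.unet_2d_blocks import AutoencoderTinyBlock

    block = AutoencoderTinyBlock(in_channels=64, out_channels=64, act_fn="relu")
    x = torch.randn(1, 64, 32, 32)
    y = block(x)  # same shape as x: the 3x3 convs use padding=1 and in_channels == out_channels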
+ """ + + def __init__(self, in_channels: int, out_channels: int, act_fn: str): + super().__init__() + act_fn = get_activation(act_fn) + self.conv = nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1), + act_fn, + nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), + act_fn, + nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1), + ) + self.skip = ( + nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False) + if in_channels != out_channels + else nn.Identity() + ) + self.fuse = nn.ReLU() + + def forward(self, x: torch.FloatTensor) -> torch.FloatTensor: + return self.fuse(self.conv(x) + self.skip(x)) + + +class UNetMidBlock2D(nn.Module): + """ + A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks. + + Args: + in_channels (`int`): The number of input channels. + temb_channels (`int`): The number of temporal embedding channels. + dropout (`float`, *optional*, defaults to 0.0): The dropout rate. + num_layers (`int`, *optional*, defaults to 1): The number of residual blocks. + resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks. + resnet_time_scale_shift (`str`, *optional*, defaults to `default`): + The type of normalization to apply to the time embeddings. This can help to improve the performance of the + model on tasks with long-range temporal dependencies. + resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks. + resnet_groups (`int`, *optional*, defaults to 32): + The number of groups to use in the group normalization layers of the resnet blocks. + attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks. + resnet_pre_norm (`bool`, *optional*, defaults to `True`): + Whether to use pre-normalization for the resnet blocks. + add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks. + attention_head_dim (`int`, *optional*, defaults to 1): + Dimension of a single attention head. The number of attention heads is determined based on this value and + the number of input channels. + output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. + + Returns: + `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, + in_channels, height, width)`. 
+ + """ + + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", # default, spatial + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + attn_groups: Optional[int] = None, + resnet_pre_norm: bool = True, + add_attention: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + ): + super().__init__() + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + self.add_attention = add_attention + + if attn_groups is None: + attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None + + # there is always at least one resnet + if resnet_time_scale_shift == "spatial": + resnets = [ + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ] + else: + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}." + ) + attention_head_dim = in_channels + + for _ in range(num_layers): + if self.add_attention: + attentions.append( + Attention( + in_channels, + heads=in_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=attn_groups, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + else: + attentions.append(None) + + if resnet_time_scale_shift == "spatial": + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ) + else: + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if attn is not None: + hidden_states = attn(hidden_states, temb=temb) + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class UNetMidBlock2DCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + 
resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + # support for variable transformer layers per block + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + for i in range(num_layers): + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. 
`scale` will be ignored.") + + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + else: + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class UNetMidBlock2DSimpleCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + skip_time_act: bool = False, + only_cross_attention: bool = False, + cross_attention_norm: Optional[str] = None, + ): + super().__init__() + + self.has_cross_attention = True + + self.attention_head_dim = attention_head_dim + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + self.num_heads = in_channels // self.attention_head_dim + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ] + attentions = [] + + for _ in range(num_layers): + processor = ( + AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor() + ) + + attentions.append( + Attention( + query_dim=in_channels, + cross_attention_dim=in_channels, + heads=self.num_heads, + dim_head=self.attention_head_dim, + added_kv_proj_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + bias=True, + upcast_softmax=True, + only_cross_attention=only_cross_attention, + cross_attention_norm=cross_attention_norm, + processor=processor, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: 
Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. + mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. + # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + # attn + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=mask, + **cross_attention_kwargs, + ) + + # resnet + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class AttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + downsample_type: str = "conv", + ): + super().__init__() + resnets = [] + attentions = [] + self.downsample_type = downsample_type + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." 
+ ) + attention_head_dim = out_channels + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + attentions.append( + Attention( + out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=resnet_groups, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if downsample_type == "conv": + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + elif downsample_type == "resnet": + self.downsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + down=True, + ) + ] + ) + else: + self.downsamplers = None + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. 
`scale` will be ignored.") + + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states, temb) + hidden_states = attn(hidden_states, **cross_attention_kwargs) + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + if self.downsample_type == "resnet": + hidden_states = downsampler(hidden_states, temb=temb) + else: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + if 
cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + output_states = () + + blocks = list(zip(self.resnets, self.attentions)) + + for i, (resnet, attn) in enumerate(blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + # apply additional residuals to the output of the last pair of resnet and attention blocks + if i == len(blocks) - 1 and additional_residuals is not None: + hidden_states = hidden_states + additional_residuals + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class DownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. 
`scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class DownEncoderBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + if resnet_time_scale_shift == "spatial": + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=None, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ) + else: + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=None, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
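+ # The deprecated `scale` kwarg is simply ignored here: DownEncoderBlock2D runs its resnets without a time embedding (temb=None) and then the optional downsamplers.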
+ deprecate("scale", "1.0.0", deprecation_message) + + for resnet in self.resnets: + hidden_states = resnet(hidden_states, temb=None) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + return hidden_states + + +class AttnDownEncoderBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + resnets = [] + attentions = [] + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." + ) + attention_head_dim = out_channels + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + if resnet_time_scale_shift == "spatial": + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=None, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ) + else: + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=None, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + attentions.append( + Attention( + out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=resnet_groups, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + def forward(self, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
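+ # Same deprecation handling as DownEncoderBlock2D; below, each resnet is followed by a plain self-attention block before downsampling.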
+ deprecate("scale", "1.0.0", deprecation_message) + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states, temb=None) + hidden_states = attn(hidden_states) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + return hidden_states + + +class AttnSkipDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = np.sqrt(2.0), + add_downsample: bool = True, + ): + super().__init__() + self.attentions = nn.ModuleList([]) + self.resnets = nn.ModuleList([]) + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." + ) + attention_head_dim = out_channels + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + self.resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(in_channels // 4, 32), + groups_out=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + self.attentions.append( + Attention( + out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=32, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + if add_downsample: + self.resnet_down = ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_in_shortcut=True, + down=True, + kernel="fir", + ) + self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)]) + self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) + else: + self.resnet_down = None + self.downsamplers = None + self.skip_conv = None + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + skip_sample: Optional[torch.FloatTensor] = None, + *args, + **kwargs, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
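+ # After warning about `scale`, the skip path runs: resnet/attention pairs, then `skip_sample` is FIR-downsampled and merged back into `hidden_states` via `skip_conv`.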
+ deprecate("scale", "1.0.0", deprecation_message) + + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states, temb) + hidden_states = attn(hidden_states) + output_states += (hidden_states,) + + if self.downsamplers is not None: + hidden_states = self.resnet_down(hidden_states, temb) + for downsampler in self.downsamplers: + skip_sample = downsampler(skip_sample) + + hidden_states = self.skip_conv(skip_sample) + hidden_states + + output_states += (hidden_states,) + + return hidden_states, output_states, skip_sample + + +class SkipDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + output_scale_factor: float = np.sqrt(2.0), + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + self.resnets = nn.ModuleList([]) + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + self.resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(in_channels // 4, 32), + groups_out=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + if add_downsample: + self.resnet_down = ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_in_shortcut=True, + down=True, + kernel="fir", + ) + self.downsamplers = nn.ModuleList([FirDownsample2D(out_channels, out_channels=out_channels)]) + self.skip_conv = nn.Conv2d(3, out_channels, kernel_size=(1, 1), stride=(1, 1)) + else: + self.resnet_down = None + self.downsamplers = None + self.skip_conv = None + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + skip_sample: Optional[torch.FloatTensor] = None, + *args, + **kwargs, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...], torch.FloatTensor]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
+ deprecate("scale", "1.0.0", deprecation_message) + + output_states = () + + for resnet in self.resnets: + hidden_states = resnet(hidden_states, temb) + output_states += (hidden_states,) + + if self.downsamplers is not None: + hidden_states = self.resnet_down(hidden_states, temb) + for downsampler in self.downsamplers: + skip_sample = downsampler(skip_sample) + + hidden_states = self.skip_conv(skip_sample) + hidden_states + + output_states += (hidden_states,) + + return hidden_states, output_states, skip_sample + + +class ResnetDownsampleBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + skip_time_act: bool = False, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + down=True, + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
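+ # `scale` is ignored here as well; note the downsamplers are resnet blocks (down=True), so they also receive the time embedding `temb`.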
+ deprecate("scale", "1.0.0", deprecation_message) + + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, temb) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class SimpleCrossAttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + skip_time_act: bool = False, + only_cross_attention: bool = False, + cross_attention_norm: Optional[str] = None, + ): + super().__init__() + + self.has_cross_attention = True + + resnets = [] + attentions = [] + + self.attention_head_dim = attention_head_dim + self.num_heads = out_channels // self.attention_head_dim + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ) + + processor = ( + AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor() + ) + + attentions.append( + Attention( + query_dim=out_channels, + cross_attention_dim=out_channels, + heads=self.num_heads, + dim_head=attention_head_dim, + added_kv_proj_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + bias=True, + upcast_softmax=True, + only_cross_attention=only_cross_attention, + cross_attention_norm=cross_attention_norm, + processor=processor, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + down=True, + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + 
encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + output_states = () + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. + mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. + # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + + for resnet, attn in zip(self.resnets, self.attentions): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=mask, + **cross_attention_kwargs, + ) + else: + hidden_states = resnet(hidden_states, temb) + + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=mask, + **cross_attention_kwargs, + ) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, temb) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class KDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: int = 32, + add_downsample: bool = False, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + groups = in_channels // resnet_group_size + groups_out = out_channels // resnet_group_size + + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=out_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=groups, + groups_out=groups_out, + eps=resnet_eps, + non_linearity=resnet_act_fn, + time_embedding_norm="ada_group", + conv_shortcut_bias=False, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + # YiYi's comments- might be able to use FirDownsample2D, look into details later + self.downsamplers = nn.ModuleList([KDownsample2D()]) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, *args, **kwargs + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is 
deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + return hidden_states, output_states + + +class KCrossAttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + cross_attention_dim: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_group_size: int = 32, + add_downsample: bool = True, + attention_head_dim: int = 64, + add_self_attention: bool = False, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + groups = in_channels // resnet_group_size + groups_out = out_channels // resnet_group_size + + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=out_channels, + dropout=dropout, + temb_channels=temb_channels, + groups=groups, + groups_out=groups_out, + eps=resnet_eps, + non_linearity=resnet_act_fn, + time_embedding_norm="ada_group", + conv_shortcut_bias=False, + ) + ) + attentions.append( + KAttentionBlock( + out_channels, + out_channels // attention_head_dim, + attention_head_dim, + cross_attention_dim=cross_attention_dim, + temb_channels=temb_channels, + attention_bias=True, + add_self_attention=add_self_attention, + cross_attention_norm="layer_norm", + group_size=resnet_group_size, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.attentions = nn.ModuleList(attentions) + + if add_downsample: + self.downsamplers = nn.ModuleList([KDownsample2D()]) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. 
`scale` will be ignored.") + + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=temb, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=temb, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + + if self.downsamplers is None: + output_states += (None,) + else: + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + return hidden_states, output_states + + +class AttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: int = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + upsample_type: str = "conv", + ): + super().__init__() + resnets = [] + attentions = [] + + self.upsample_type = upsample_type + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." 
+ ) + attention_head_dim = out_channels + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + attentions.append( + Attention( + out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=resnet_groups, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if upsample_type == "conv": + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + elif upsample_type == "resnet": + self.upsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + up=True, + ) + ] + ) + else: + self.upsamplers = None + + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
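+ # After the deprecation warning, skip connections are popped from the end of `res_hidden_states_tuple` and concatenated on the channel dim before each resnet/attention pair.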
+ deprecate("scale", "1.0.0", deprecation_message) + + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + hidden_states = resnet(hidden_states, temb) + hidden_states = attn(hidden_states) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + if self.upsample_type == "resnet": + hidden_states = upsampler(hidden_states, temb=temb) + else: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class CrossAttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, 
+ upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class UpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing 
= False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class UpDecoderBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", # default, spatial + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + temb_channels: Optional[int] = None, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + input_channels = in_channels if i == 0 else out_channels + + if resnet_time_scale_shift == "spatial": + resnets.append( + ResnetBlockCondNorm2D( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ) + else: + resnets.append( + ResnetBlock2D( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.resolution_idx = resolution_idx + + def 
forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + for resnet in self.resnets: + hidden_states = resnet(hidden_states, temb=temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class AttnUpDecoderBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + temb_channels: Optional[int] = None, + ): + super().__init__() + resnets = [] + attentions = [] + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}." + ) + attention_head_dim = out_channels + + for i in range(num_layers): + input_channels = in_channels if i == 0 else out_channels + + if resnet_time_scale_shift == "spatial": + resnets.append( + ResnetBlockCondNorm2D( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ) + else: + resnets.append( + ResnetBlock2D( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + attentions.append( + Attention( + out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=resnet_groups if resnet_time_scale_shift != "spatial" else None, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.resolution_idx = resolution_idx + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states, temb=temb) + hidden_states = attn(hidden_states, temb=temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class AttnSkipUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = np.sqrt(2.0), + add_upsample: bool = 
True, + ): + super().__init__() + self.attentions = nn.ModuleList([]) + self.resnets = nn.ModuleList([]) + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + self.resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(resnet_in_channels + res_skip_channels // 4, 32), + groups_out=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}." + ) + attention_head_dim = out_channels + + self.attentions.append( + Attention( + out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=32, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + + self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) + if add_upsample: + self.resnet_up = ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(out_channels // 4, 32), + groups_out=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_in_shortcut=True, + up=True, + kernel="fir", + ) + self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + self.skip_norm = torch.nn.GroupNorm( + num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True + ) + self.act = nn.SiLU() + else: + self.resnet_up = None + self.skip_conv = None + self.skip_norm = None + self.act = None + + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + skip_sample=None, + *args, + **kwargs, + ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
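+ # Besides ignoring `scale`, note that only a single attention block (self.attentions[0]) is applied, after all resnets have consumed their skip connections.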
+ deprecate("scale", "1.0.0", deprecation_message) + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + hidden_states = resnet(hidden_states, temb) + + hidden_states = self.attentions[0](hidden_states) + + if skip_sample is not None: + skip_sample = self.upsampler(skip_sample) + else: + skip_sample = 0 + + if self.resnet_up is not None: + skip_sample_states = self.skip_norm(hidden_states) + skip_sample_states = self.act(skip_sample_states) + skip_sample_states = self.skip_conv(skip_sample_states) + + skip_sample = skip_sample + skip_sample_states + + hidden_states = self.resnet_up(hidden_states, temb) + + return hidden_states, skip_sample + + +class SkipUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_pre_norm: bool = True, + output_scale_factor: float = np.sqrt(2.0), + add_upsample: bool = True, + upsample_padding: int = 1, + ): + super().__init__() + self.resnets = nn.ModuleList([]) + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + self.resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min((resnet_in_channels + res_skip_channels) // 4, 32), + groups_out=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.upsampler = FirUpsample2D(in_channels, out_channels=out_channels) + if add_upsample: + self.resnet_up = ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=min(out_channels // 4, 32), + groups_out=min(out_channels // 4, 32), + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + use_in_shortcut=True, + up=True, + kernel="fir", + ) + self.skip_conv = nn.Conv2d(out_channels, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + self.skip_norm = torch.nn.GroupNorm( + num_groups=min(out_channels // 4, 32), num_channels=out_channels, eps=resnet_eps, affine=True + ) + self.act = nn.SiLU() + else: + self.resnet_up = None + self.skip_conv = None + self.skip_norm = None + self.act = None + + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + skip_sample=None, + *args, + **kwargs, + ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
+ deprecate("scale", "1.0.0", deprecation_message) + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + hidden_states = resnet(hidden_states, temb) + + if skip_sample is not None: + skip_sample = self.upsampler(skip_sample) + else: + skip_sample = 0 + + if self.resnet_up is not None: + skip_sample_states = self.skip_norm(hidden_states) + skip_sample_states = self.act(skip_sample_states) + skip_sample_states = self.skip_conv(skip_sample_states) + + skip_sample = skip_sample + skip_sample_states + + hidden_states = self.resnet_up(hidden_states, temb) + + return hidden_states, skip_sample + + +class ResnetUpsampleBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + skip_time_act: bool = False, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + up=True, + ) + ] + ) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
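+ # As in ResnetDownsampleBlock2D, the upsamplers here are resnet blocks (up=True) and are therefore called with `temb`.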
+ deprecate("scale", "1.0.0", deprecation_message) + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, temb) + + return hidden_states + + +class SimpleCrossAttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + skip_time_act: bool = False, + only_cross_attention: bool = False, + cross_attention_norm: Optional[str] = None, + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.attention_head_dim = attention_head_dim + + self.num_heads = out_channels // self.attention_head_dim + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ) + + processor = ( + AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor() + ) + + attentions.append( + Attention( + query_dim=out_channels, + cross_attention_dim=out_channels, + heads=self.num_heads, + dim_head=self.attention_head_dim, + added_kv_proj_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + bias=True, + upcast_softmax=True, + only_cross_attention=only_cross_attention, + cross_attention_norm=cross_attention_norm, + processor=processor, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + up=True, + ) + ] + ) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + 
self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. + mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. + # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + + for resnet, attn in zip(self.resnets, self.attentions): + # resnet + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=mask, + **cross_attention_kwargs, + ) + else: + hidden_states = resnet(hidden_states, temb) + + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=mask, + **cross_attention_kwargs, + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, temb) + + return hidden_states + + +class KUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + resolution_idx: int, + dropout: float = 0.0, + num_layers: int = 5, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: Optional[int] = 32, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + k_in_channels = 2 * out_channels + k_out_channels = in_channels + num_layers = num_layers - 1 + + for i in range(num_layers): + in_channels = k_in_channels if i == 0 else out_channels + groups = in_channels // resnet_group_size + groups_out = out_channels // resnet_group_size + + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=k_out_channels if (i == num_layers - 1) else out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=groups, + groups_out=groups_out, + dropout=dropout, + non_linearity=resnet_act_fn, + time_embedding_norm="ada_group", + conv_shortcut_bias=False, + ) + ) + + self.resnets = 
nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([KUpsample2D()]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + res_hidden_states_tuple = res_hidden_states_tuple[-1] + if res_hidden_states_tuple is not None: + hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1) + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class KCrossAttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + resolution_idx: int, + dropout: float = 0.0, + num_layers: int = 4, + resnet_eps: float = 1e-5, + resnet_act_fn: str = "gelu", + resnet_group_size: int = 32, + attention_head_dim: int = 1, # attention dim_head + cross_attention_dim: int = 768, + add_upsample: bool = True, + upcast_attention: bool = False, + ): + super().__init__() + resnets = [] + attentions = [] + + is_first_block = in_channels == out_channels == temb_channels + is_middle_block = in_channels != out_channels + add_self_attention = True if is_first_block else False + + self.has_cross_attention = True + self.attention_head_dim = attention_head_dim + + # in_channels, and out_channels for the block (k-unet) + k_in_channels = out_channels if is_first_block else 2 * out_channels + k_out_channels = in_channels + + num_layers = num_layers - 1 + + for i in range(num_layers): + in_channels = k_in_channels if i == 0 else out_channels + groups = in_channels // resnet_group_size + groups_out = out_channels // resnet_group_size + + if is_middle_block and (i == num_layers - 1): + conv_2d_out_channels = k_out_channels + else: + conv_2d_out_channels = None + + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=out_channels, + conv_2d_out_channels=conv_2d_out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=groups, + groups_out=groups_out, + dropout=dropout, + non_linearity=resnet_act_fn, + time_embedding_norm="ada_group", + conv_shortcut_bias=False, + ) + ) + attentions.append( + KAttentionBlock( + k_out_channels if (i == num_layers - 1) else out_channels, + k_out_channels // attention_head_dim + if (i == num_layers - 1) + else out_channels // attention_head_dim, + attention_head_dim, + 
cross_attention_dim=cross_attention_dim, + temb_channels=temb_channels, + attention_bias=True, + add_self_attention=add_self_attention, + cross_attention_norm="layer_norm", + upcast_attention=upcast_attention, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.attentions = nn.ModuleList(attentions) + + if add_upsample: + self.upsamplers = nn.ModuleList([KUpsample2D()]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + res_hidden_states_tuple = res_hidden_states_tuple[-1] + if res_hidden_states_tuple is not None: + hidden_states = torch.cat([hidden_states, res_hidden_states_tuple], dim=1) + + for resnet, attn in zip(self.resnets, self.attentions): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=temb, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=temb, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +# can potentially later be renamed to `No-feed-forward` attention +class KAttentionBlock(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + attention_bias (`bool`, *optional*, defaults to `False`): + Configure if the attention layers should contain a bias parameter. + upcast_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to upcast the attention computation to `float32`. + temb_channels (`int`, *optional*, defaults to 768): + The number of channels in the token embedding. + add_self_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to add self-attention to the block. + cross_attention_norm (`str`, *optional*, defaults to `None`): + The type of normalization to use for the cross attention. 
Can be `None`, `layer_norm`, or `group_norm`. + group_size (`int`, *optional*, defaults to 32): + The number of groups to separate the channels into for group normalization. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout: float = 0.0, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + upcast_attention: bool = False, + temb_channels: int = 768, # for ada_group_norm + add_self_attention: bool = False, + cross_attention_norm: Optional[str] = None, + group_size: int = 32, + ): + super().__init__() + self.add_self_attention = add_self_attention + + # 1. Self-Attn + if add_self_attention: + self.norm1 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=None, + cross_attention_norm=None, + ) + + # 2. Cross-Attn + self.norm2 = AdaGroupNorm(temb_channels, dim, max(1, dim // group_size)) + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + cross_attention_norm=cross_attention_norm, + ) + + def _to_3d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor: + return hidden_states.permute(0, 2, 3, 1).reshape(hidden_states.shape[0], height * weight, -1) + + def _to_4d(self, hidden_states: torch.FloatTensor, height: int, weight: int) -> torch.FloatTensor: + return hidden_states.permute(0, 2, 1).reshape(hidden_states.shape[0], -1, height, weight) + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + # TODO: mark emb as non-optional (self.norm2 requires it). + # requires assessing impact of change to positional param interface. + emb: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + # 1. Self-Attention + if self.add_self_attention: + norm_hidden_states = self.norm1(hidden_states, emb) + + height, weight = norm_hidden_states.shape[2:] + norm_hidden_states = self._to_3d(norm_hidden_states, height, weight) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + attn_output = self._to_4d(attn_output, height, weight) + + hidden_states = attn_output + hidden_states + + # 2. 
Cross-Attention/None + norm_hidden_states = self.norm2(hidden_states, emb) + + height, weight = norm_hidden_states.shape[2:] + norm_hidden_states = self._to_3d(norm_hidden_states, height, weight) + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask if encoder_hidden_states is None else encoder_attention_mask, + **cross_attention_kwargs, + ) + attn_output = self._to_4d(attn_output, height, weight) + + hidden_states = attn_output + hidden_states + + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py new file mode 100644 index 000000000..a4585dbc8 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py @@ -0,0 +1,400 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flax.linen as nn +import jax.numpy as jnp + +from ..attention_flax import FlaxTransformer2DModel +from ..resnet_flax import FlaxDownsample2D, FlaxResnetBlock2D, FlaxUpsample2D + + +class FlaxCrossAttnDownBlock2D(nn.Module): + r""" + Cross Attention 2D Downsizing block - original architecture from Unet transformers: + https://arxiv.org/abs/2103.06104 + + Parameters: + in_channels (:obj:`int`): + Input channels + out_channels (:obj:`int`): + Output channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of attention blocks layers + num_attention_heads (:obj:`int`, *optional*, defaults to 1): + Number of attention heads of each spatial transformer block + add_downsample (:obj:`bool`, *optional*, defaults to `True`): + Whether to add downsampling layer before each final output + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + enable memory efficient attention https://arxiv.org/abs/2112.05682 + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. 
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int + dropout: float = 0.0 + num_layers: int = 1 + num_attention_heads: int = 1 + add_downsample: bool = True + use_linear_projection: bool = False + only_cross_attention: bool = False + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + dtype: jnp.dtype = jnp.float32 + transformer_layers_per_block: int = 1 + + def setup(self): + resnets = [] + attentions = [] + + for i in range(self.num_layers): + in_channels = self.in_channels if i == 0 else self.out_channels + + res_block = FlaxResnetBlock2D( + in_channels=in_channels, + out_channels=self.out_channels, + dropout_prob=self.dropout, + dtype=self.dtype, + ) + resnets.append(res_block) + + attn_block = FlaxTransformer2DModel( + in_channels=self.out_channels, + n_heads=self.num_attention_heads, + d_head=self.out_channels // self.num_attention_heads, + depth=self.transformer_layers_per_block, + use_linear_projection=self.use_linear_projection, + only_cross_attention=self.only_cross_attention, + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + dtype=self.dtype, + ) + attentions.append(attn_block) + + self.resnets = resnets + self.attentions = attentions + + if self.add_downsample: + self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype) + + def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True): + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + hidden_states = resnet(hidden_states, temb, deterministic=deterministic) + hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic) + output_states += (hidden_states,) + + if self.add_downsample: + hidden_states = self.downsamplers_0(hidden_states) + output_states += (hidden_states,) + + return hidden_states, output_states + + +class FlaxDownBlock2D(nn.Module): + r""" + Flax 2D downsizing block + + Parameters: + in_channels (:obj:`int`): + Input channels + out_channels (:obj:`int`): + Output channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of attention blocks layers + add_downsample (:obj:`bool`, *optional*, defaults to `True`): + Whether to add downsampling layer before each final output + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int + dropout: float = 0.0 + num_layers: int = 1 + add_downsample: bool = True + dtype: jnp.dtype = jnp.float32 + + def setup(self): + resnets = [] + + for i in range(self.num_layers): + in_channels = self.in_channels if i == 0 else self.out_channels + + res_block = FlaxResnetBlock2D( + in_channels=in_channels, + out_channels=self.out_channels, + dropout_prob=self.dropout, + dtype=self.dtype, + ) + resnets.append(res_block) + self.resnets = resnets + + if self.add_downsample: + self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype) + + def __call__(self, hidden_states, temb, deterministic=True): + output_states = () + + for resnet in self.resnets: + hidden_states = resnet(hidden_states, temb, deterministic=deterministic) + output_states += (hidden_states,) + + if self.add_downsample: + hidden_states = self.downsamplers_0(hidden_states) + output_states += (hidden_states,) + + return hidden_states, output_states + + +class FlaxCrossAttnUpBlock2D(nn.Module): + 
r""" + Cross Attention 2D Upsampling block - original architecture from Unet transformers: + https://arxiv.org/abs/2103.06104 + + Parameters: + in_channels (:obj:`int`): + Input channels + out_channels (:obj:`int`): + Output channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of attention blocks layers + num_attention_heads (:obj:`int`, *optional*, defaults to 1): + Number of attention heads of each spatial transformer block + add_upsample (:obj:`bool`, *optional*, defaults to `True`): + Whether to add upsampling layer before each final output + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + enable memory efficient attention https://arxiv.org/abs/2112.05682 + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int + prev_output_channel: int + dropout: float = 0.0 + num_layers: int = 1 + num_attention_heads: int = 1 + add_upsample: bool = True + use_linear_projection: bool = False + only_cross_attention: bool = False + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + dtype: jnp.dtype = jnp.float32 + transformer_layers_per_block: int = 1 + + def setup(self): + resnets = [] + attentions = [] + + for i in range(self.num_layers): + res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels + resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels + + res_block = FlaxResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=self.out_channels, + dropout_prob=self.dropout, + dtype=self.dtype, + ) + resnets.append(res_block) + + attn_block = FlaxTransformer2DModel( + in_channels=self.out_channels, + n_heads=self.num_attention_heads, + d_head=self.out_channels // self.num_attention_heads, + depth=self.transformer_layers_per_block, + use_linear_projection=self.use_linear_projection, + only_cross_attention=self.only_cross_attention, + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + dtype=self.dtype, + ) + attentions.append(attn_block) + + self.resnets = resnets + self.attentions = attentions + + if self.add_upsample: + self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype) + + def __call__(self, hidden_states, res_hidden_states_tuple, temb, encoder_hidden_states, deterministic=True): + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1) + + hidden_states = resnet(hidden_states, temb, deterministic=deterministic) + hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic) + + if self.add_upsample: + hidden_states = self.upsamplers_0(hidden_states) + + return hidden_states + + +class FlaxUpBlock2D(nn.Module): + r""" + Flax 2D upsampling block + + Parameters: + in_channels (:obj:`int`): + Input channels + out_channels (:obj:`int`): + Output channels + prev_output_channel (:obj:`int`): + Output channels from the previous block + 
dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of attention blocks layers + add_downsample (:obj:`bool`, *optional*, defaults to `True`): + Whether to add downsampling layer before each final output + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int + prev_output_channel: int + dropout: float = 0.0 + num_layers: int = 1 + add_upsample: bool = True + dtype: jnp.dtype = jnp.float32 + + def setup(self): + resnets = [] + + for i in range(self.num_layers): + res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels + resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels + + res_block = FlaxResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=self.out_channels, + dropout_prob=self.dropout, + dtype=self.dtype, + ) + resnets.append(res_block) + + self.resnets = resnets + + if self.add_upsample: + self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype) + + def __call__(self, hidden_states, res_hidden_states_tuple, temb, deterministic=True): + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1) + + hidden_states = resnet(hidden_states, temb, deterministic=deterministic) + + if self.add_upsample: + hidden_states = self.upsamplers_0(hidden_states) + + return hidden_states + + +class FlaxUNetMidBlock2DCrossAttn(nn.Module): + r""" + Cross Attention 2D Mid-level block - original architecture from Unet transformers: https://arxiv.org/abs/2103.06104 + + Parameters: + in_channels (:obj:`int`): + Input channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of attention blocks layers + num_attention_heads (:obj:`int`, *optional*, defaults to 1): + Number of attention heads of each spatial transformer block + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + enable memory efficient attention https://arxiv.org/abs/2112.05682 + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. 
+ dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + dropout: float = 0.0 + num_layers: int = 1 + num_attention_heads: int = 1 + use_linear_projection: bool = False + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + dtype: jnp.dtype = jnp.float32 + transformer_layers_per_block: int = 1 + + def setup(self): + # there is always at least one resnet + resnets = [ + FlaxResnetBlock2D( + in_channels=self.in_channels, + out_channels=self.in_channels, + dropout_prob=self.dropout, + dtype=self.dtype, + ) + ] + + attentions = [] + + for _ in range(self.num_layers): + attn_block = FlaxTransformer2DModel( + in_channels=self.in_channels, + n_heads=self.num_attention_heads, + d_head=self.in_channels // self.num_attention_heads, + depth=self.transformer_layers_per_block, + use_linear_projection=self.use_linear_projection, + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + dtype=self.dtype, + ) + attentions.append(attn_block) + + res_block = FlaxResnetBlock2D( + in_channels=self.in_channels, + out_channels=self.in_channels, + dropout_prob=self.dropout, + dtype=self.dtype, + ) + resnets.append(res_block) + + self.resnets = resnets + self.attentions = attentions + + def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True): + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic) + hidden_states = resnet(hidden_states, temb, deterministic=deterministic) + + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition.py new file mode 100644 index 000000000..cd5b48469 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition.py @@ -0,0 +1,1319 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
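Before the PyTorch `UNet2DConditionModel` that follows, a minimal usage sketch may help clarify the skip-connection contract of the Flax blocks above: a down block returns `(hidden_states, output_states)`, and the matching up block pops entries off `res_hidden_states_tuple` from the end. This is an editorial illustration, not part of the patch; the channel counts, the 1280-wide time embedding, and the `add_downsample=False` / `add_upsample=False` settings are assumptions chosen to keep the shapes simple, and it presumes the vendored package is importable as `diffusers`.

import jax
import jax.numpy as jnp
from diffusers.models.unets.unet_2d_blocks_flax import FlaxDownBlock2D, FlaxUpBlock2D

down = FlaxDownBlock2D(in_channels=320, out_channels=320, add_downsample=False)
up = FlaxUpBlock2D(in_channels=320, out_channels=320, prev_output_channel=320, add_upsample=False)

sample = jnp.zeros((1, 32, 32, 320))   # NHWC layout, as used by the Flax blocks
temb = jnp.zeros((1, 1280))            # time embedding; the width is an assumption

down_params = down.init(jax.random.PRNGKey(0), sample, temb)
hidden, skips = down.apply(down_params, sample, temb)   # one skip state per resnet layer

up_params = up.init(jax.random.PRNGKey(1), hidden, skips, temb)
out = up.apply(up_params, hidden, skips, temb)          # pops the last skip and concatenates it on the channel axis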
+from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +import os +from ..nhwc_groupnorm.custom_gn import GN_NHWC +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin +from ...utils import USE_PEFT_BACKEND, BaseOutput, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ..activations import get_activation +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + Attention, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..embeddings import ( + GaussianFourierProjection, + GLIGENTextBoundingboxProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from ..modeling_utils import ModelMixin +from .unet_2d_blocks import ( + get_down_block, + get_mid_block, + get_up_block, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet2DConditionOutput(BaseOutput): + """ + The output of [`UNet2DConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor = None + + +class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. 
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling + blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. 
+ time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. + conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. + projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): + Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + dropout: float = 0.0, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, + reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: float = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + 
conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + attention_type: str = "default", + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads: int = 64, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + self._check_config( + down_block_types=down_block_types, + up_block_types=up_block_types, + only_cross_attention=only_cross_attention, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + cross_attention_dim=cross_attention_dim, + transformer_layers_per_block=transformer_layers_per_block, + reverse_transformer_layers_per_block=reverse_transformer_layers_per_block, + attention_head_dim=attention_head_dim, + num_attention_heads=num_attention_heads, + ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + time_embed_dim, timestep_input_dim = self._set_time_proj( + time_embedding_type, + block_out_channels=block_out_channels, + flip_sin_to_cos=flip_sin_to_cos, + freq_shift=freq_shift, + time_embedding_dim=time_embedding_dim, + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + self._set_encoder_hid_proj( + encoder_hid_dim_type, + cross_attention_dim=cross_attention_dim, + encoder_hid_dim=encoder_hid_dim, + ) + + # class embedding + self._set_class_embedding( + class_embed_type, + act_fn=act_fn, + num_class_embeds=num_class_embeds, + projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, + time_embed_dim=time_embed_dim, + timestep_input_dim=timestep_input_dim, + ) + + self._set_add_embedding( + addition_embed_type, + addition_embed_type_num_heads=addition_embed_type_num_heads, + addition_time_embed_dim=addition_time_embed_dim, + cross_attention_dim=cross_attention_dim, + encoder_hid_dim=encoder_hid_dim, + flip_sin_to_cos=flip_sin_to_cos, + freq_shift=freq_shift, + projection_class_embeddings_input_dim=projection_class_embeddings_input_dim, + time_embed_dim=time_embed_dim, + ) + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + 
self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + dropout=dropout, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = get_mid_block( + mid_block_type, + temb_channels=blocks_time_embed_dim, + in_channels=block_out_channels[-1], + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + output_scale_factor=mid_block_scale_factor, + transformer_layers_per_block=transformer_layers_per_block[-1], + num_attention_heads=num_attention_heads[-1], + cross_attention_dim=cross_attention_dim[-1], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + mid_block_only_cross_attention=mid_block_only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[-1], + dropout=dropout, + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = 
list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = ( + list(reversed(transformer_layers_per_block)) + if reverse_transformer_layers_per_block is None + else reverse_transformer_layers_per_block + ) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resolution_idx=i, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + dropout=dropout, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + self.fuse_gn_silu = True if int(os.environ.get("USE_NHWC_GN", 0)) else False + # out + if norm_num_groups is not None: + if self.fuse_gn_silu: + self.conv_norm_out=GN_NHWC(norm_num_groups, block_out_channels[0], activation="silu") + else: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps) + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d( + block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + ) + + self._set_pos_net_if_use_gligen(attention_type=attention_type, cross_attention_dim=cross_attention_dim) + + def _check_config( + self, + down_block_types: Tuple[str], + up_block_types: Tuple[str], + only_cross_attention: Union[bool, Tuple[bool]], + block_out_channels: Tuple[int], + layers_per_block: Union[int, Tuple[int]], + cross_attention_dim: Union[int, Tuple[int]], + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple[int]]], + reverse_transformer_layers_per_block: bool, + attention_head_dim: int, + num_attention_heads: Optional[Union[int, Tuple[int]]], + ): + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. 
`up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." + ) + if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None: + for layer_number_per_block in transformer_layers_per_block: + if isinstance(layer_number_per_block, list): + raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.") + + def _set_time_proj( + self, + time_embedding_type: str, + block_out_channels: int, + flip_sin_to_cos: bool, + freq_shift: float, + time_embedding_dim: int, + ) -> Tuple[int, int]: + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." 
+ ) + + return time_embed_dim, timestep_input_dim + + def _set_encoder_hid_proj( + self, + encoder_hid_dim_type: Optional[str], + cross_attention_dim: Union[int, Tuple[int]], + encoder_hid_dim: Optional[int], + ): + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + def _set_class_embedding( + self, + class_embed_type: Optional[str], + act_fn: str, + num_class_embeds: Optional[int], + projection_class_embeddings_input_dim: Optional[int], + time_embed_dim: int, + timestep_input_dim: int, + ): + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
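As a concrete, hypothetical illustration of the comment above (editorial, not part of the patch): with `class_embed_type="projection"`, the `class_labels` argument of the forward pass is an arbitrary float vector of width `projection_class_embeddings_input_dim` rather than a tensor of integer class indices, and the assignment that follows builds exactly this `TimestepEmbedding`. The tiny configuration below is an assumption chosen only so the model is cheap to instantiate, and it presumes the vendored package (including its custom NHWC GroupNorm extension) imports cleanly as `diffusers`.

import torch
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel

unet = UNet2DConditionModel(
    block_out_channels=(32, 64),
    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=32,
    class_embed_type="projection",
    projection_class_embeddings_input_dim=16,
)

sample = torch.randn(1, 4, 32, 32)
timestep = torch.tensor([10])
encoder_hidden_states = torch.randn(1, 77, 32)
class_labels = torch.randn(1, 16)   # arbitrary float vector; no sinusoidal projection is applied
out = unet(sample, timestep, encoder_hidden_states, class_labels=class_labels).sample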
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + def _set_add_embedding( + self, + addition_embed_type: str, + addition_embed_type_num_heads: int, + addition_time_embed_dim: Optional[int], + flip_sin_to_cos: bool, + freq_shift: float, + cross_attention_dim: Optional[int], + encoder_hid_dim: Optional[int], + projection_class_embeddings_input_dim: Optional[int], + time_embed_dim: int, + ): + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") + + def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int): + if attention_type in ["gated", "gated-text-image"]: + positive_len = 768 + if isinstance(cross_attention_dim, int): + positive_len = cross_attention_dim + elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list): + positive_len = cross_attention_dim[0] + + feature_type = "text-only" if attention_type == "gated" else "text-image" + self.position_net = GLIGENTextBoundingboxProjection( + positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type + ) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stage blocks where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that + are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + for i, upsample_block in enumerate(self.up_blocks): + setattr(upsample_block, "s1", s1) + setattr(upsample_block, "s2", s2) + setattr(upsample_block, "b1", b1) + setattr(upsample_block, "b2", b2) + + def disable_freeu(self): + """Disables the FreeU mechanism.""" + freeu_keys = {"s1", "s2", "b1", "b2"} + for i, upsample_block in enumerate(self.up_blocks): + for k in freeu_keys: + if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: + setattr(upsample_block, k, None) + + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def unload_lora(self): + """Unloads LoRA weights.""" + deprecate( + "unload_lora", + "0.28.0", + "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().", + ) + for module in self.modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + + def get_time_embed( + self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int] + ) -> Optional[torch.Tensor]: + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + return t_emb + + def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Tensor]) -> Optional[torch.Tensor]: + class_emb = None + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. 
+ class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) + return class_emb + + def get_aug_embed( + self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any] + ) -> Optional[torch.Tensor]: + aug_emb = None + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb = self.add_embedding(image_embs, hint) + return aug_emb + + def process_encoder_hidden_states( + self, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict[str, Any] + ) -> torch.Tensor: + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = 
self.encoder_hid_proj(encoder_hidden_states, image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds) + encoder_hidden_states = (encoder_hidden_states, image_embeds) + return encoder_hidden_states + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. 
+ down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of down unet blocks. + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle unet block. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + additional residuals to be added to UNet long skip connections from down blocks to up blocks for + example from ControlNet side model(s) + mid_block_additional_residual (`torch.Tensor`, *optional*): + additional residual to be added to UNet mid block output, for example from ControlNet side model + down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s) + + Returns: + [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + for dim in sample.shape[-2:]: + if dim % default_overall_up_factor != 0: + # Forward upsample size to force interpolation output size. + forward_upsample_size = True + break + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. 
xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + t_emb = self.get_time_embed(sample=sample, timestep=timestep) + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + class_emb = self.get_class_embed(sample=sample, class_labels=class_labels) + if class_emb is not None: + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + aug_emb = self.get_aug_embed( + emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs + ) + if self.config.addition_embed_type == "image_hint": + aug_emb, hint = aug_emb + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + encoder_hidden_states = self.process_encoder_hidden_states( + encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs + ) + + # 2. pre-process + sample = self.conv_in(sample) + + # 2.5 GLIGEN position net + if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None: + cross_attention_kwargs = cross_attention_kwargs.copy() + gligen_args = cross_attention_kwargs.pop("gligen") + cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)} + + # 3. down + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets + is_adapter = down_intrablock_additional_residuals is not None + # maintain backward compatibility for legacy usage, where + # T2I-Adapter and ControlNet both use down_block_additional_residuals arg + # but can only use one or the other + if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None: + deprecate( + "T2I should not use down_block_additional_residuals", + "1.3.0", + "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ + and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ + for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. 
", + standard_warn=False, + ) + down_intrablock_additional_residuals = down_block_additional_residuals + is_adapter = True + + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_intrablock_additional_residuals) > 0: + additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + if is_adapter and len(down_intrablock_additional_residuals) > 0: + sample += down_intrablock_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = self.mid_block(sample, emb) + + # To support T2I-Adapter-XL + if ( + is_adapter + and len(down_intrablock_additional_residuals) > 0 + and sample.shape == down_intrablock_additional_residuals[0].shape + ): + sample += down_intrablock_additional_residuals.pop(0) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + ) + + # 6. 
post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + if not self.fuse_gn_silu: + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py new file mode 100644 index 000000000..a5ec2875c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py @@ -0,0 +1,453 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional, Tuple, Union + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict + +from ...configuration_utils import ConfigMixin, flax_register_to_config +from ...utils import BaseOutput +from ..embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps +from ..modeling_flax_utils import FlaxModelMixin +from .unet_2d_blocks_flax import ( + FlaxCrossAttnDownBlock2D, + FlaxCrossAttnUpBlock2D, + FlaxDownBlock2D, + FlaxUNetMidBlock2DCrossAttn, + FlaxUpBlock2D, +) + + +@flax.struct.dataclass +class FlaxUNet2DConditionOutput(BaseOutput): + """ + The output of [`FlaxUNet2DConditionModel`]. + + Args: + sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: jnp.ndarray + + +@flax_register_to_config +class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods + implemented for all models (such as downloading or saving). + + This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its + general usage and behavior. 
+ + Inherent JAX features such as the following are supported: + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + sample_size (`int`, *optional*): + The size of the input sample. + in_channels (`int`, *optional*, defaults to 4): + The number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): + The number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`): + The tuple of downsample blocks to use. + up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`. If `None`, the mid block layer is skipped. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8): + The dimension of the attention heads. + num_attention_heads (`int` or `Tuple[int]`, *optional*): + The number of attention heads. + cross_attention_dim (`int`, *optional*, defaults to 768): + The dimension of the cross attention features. + dropout (`float`, *optional*, defaults to 0): + Dropout probability for down, up and bottleneck blocks. + flip_sin_to_cos (`bool`, *optional*, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): + Enable memory efficient attention as described [here](https://arxiv.org/abs/2112.05682). + split_head_dim (`bool`, *optional*, defaults to `False`): + Whether to split the head dimension into a new axis for the self-attention computation. In most cases, + enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL. + """ + + sample_size: int = 32 + in_channels: int = 4 + out_channels: int = 4 + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ) + up_block_types: Tuple[str, ...] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D") + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn" + only_cross_attention: Union[bool, Tuple[bool]] = False + block_out_channels: Tuple[int, ...] 
= (320, 640, 1280, 1280) + layers_per_block: int = 2 + attention_head_dim: Union[int, Tuple[int, ...]] = 8 + num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None + cross_attention_dim: int = 1280 + dropout: float = 0.0 + use_linear_projection: bool = False + dtype: jnp.dtype = jnp.float32 + flip_sin_to_cos: bool = True + freq_shift: int = 0 + use_memory_efficient_attention: bool = False + split_head_dim: bool = False + transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1 + addition_embed_type: Optional[str] = None + addition_time_embed_dim: Optional[int] = None + addition_embed_type_num_heads: int = 64 + projection_class_embeddings_input_dim: Optional[int] = None + + def init_weights(self, rng: jax.Array) -> FrozenDict: + # init input tensors + sample_shape = (1, self.in_channels, self.sample_size, self.sample_size) + sample = jnp.zeros(sample_shape, dtype=jnp.float32) + timesteps = jnp.ones((1,), dtype=jnp.int32) + encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + added_cond_kwargs = None + if self.addition_embed_type == "text_time": + # we retrieve the expected `text_embeds_dim` by first checking if the architecture is a refiner + # or non-refiner architecture and then by "reverse-computing" from `projection_class_embeddings_input_dim` + is_refiner = ( + 5 * self.config.addition_time_embed_dim + self.config.cross_attention_dim + == self.config.projection_class_embeddings_input_dim + ) + num_micro_conditions = 5 if is_refiner else 6 + + text_embeds_dim = self.config.projection_class_embeddings_input_dim - ( + num_micro_conditions * self.config.addition_time_embed_dim + ) + + time_ids_channels = self.projection_class_embeddings_input_dim - text_embeds_dim + time_ids_dims = time_ids_channels // self.addition_time_embed_dim + added_cond_kwargs = { + "text_embeds": jnp.zeros((1, text_embeds_dim), dtype=jnp.float32), + "time_ids": jnp.zeros((1, time_ids_dims), dtype=jnp.float32), + } + return self.init(rngs, sample, timesteps, encoder_hidden_states, added_cond_kwargs)["params"] + + def setup(self) -> None: + block_out_channels = self.block_out_channels + time_embed_dim = block_out_channels[0] * 4 + + if self.num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. 
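+ # With the class defaults above (`attention_head_dim=8`, `num_attention_heads=None`, four down block
+ # types) this resolves to 8 and is broadcast to (8, 8, 8, 8) a few lines further down.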
+ num_attention_heads = self.num_attention_heads or self.attention_head_dim + + # input + self.conv_in = nn.Conv( + block_out_channels[0], + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + # time + self.time_proj = FlaxTimesteps( + block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift + ) + self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype) + + only_cross_attention = self.only_cross_attention + if isinstance(only_cross_attention, bool): + only_cross_attention = (only_cross_attention,) * len(self.down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(self.down_block_types) + + # transformer layers per block + transformer_layers_per_block = self.transformer_layers_per_block + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(self.down_block_types) + + # addition embed types + if self.addition_embed_type is None: + self.add_embedding = None + elif self.addition_embed_type == "text_time": + if self.addition_time_embed_dim is None: + raise ValueError( + f"addition_embed_type {self.addition_embed_type} requires `addition_time_embed_dim` to not be None" + ) + self.add_time_proj = FlaxTimesteps(self.addition_time_embed_dim, self.flip_sin_to_cos, self.freq_shift) + self.add_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype) + else: + raise ValueError(f"addition_embed_type: {self.addition_embed_type} must be None or `text_time`.") + + # down + down_blocks = [] + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(self.down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + if down_block_type == "CrossAttnDownBlock2D": + down_block = FlaxCrossAttnDownBlock2D( + in_channels=input_channel, + out_channels=output_channel, + dropout=self.dropout, + num_layers=self.layers_per_block, + transformer_layers_per_block=transformer_layers_per_block[i], + num_attention_heads=num_attention_heads[i], + add_downsample=not is_final_block, + use_linear_projection=self.use_linear_projection, + only_cross_attention=only_cross_attention[i], + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + dtype=self.dtype, + ) + else: + down_block = FlaxDownBlock2D( + in_channels=input_channel, + out_channels=output_channel, + dropout=self.dropout, + num_layers=self.layers_per_block, + add_downsample=not is_final_block, + dtype=self.dtype, + ) + + down_blocks.append(down_block) + self.down_blocks = down_blocks + + # mid + if self.config.mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = FlaxUNetMidBlock2DCrossAttn( + in_channels=block_out_channels[-1], + dropout=self.dropout, + num_attention_heads=num_attention_heads[-1], + transformer_layers_per_block=transformer_layers_per_block[-1], + use_linear_projection=self.use_linear_projection, + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + dtype=self.dtype, + ) + elif self.config.mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"Unexpected mid_block_type {self.config.mid_block_type}") + + # up + up_blocks = [] + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + 
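+ # The per-block settings were declared for the down path (highest to lowest resolution); reversing them
+ # lets index `i` of the up path reuse the configuration of the matching down block.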
only_cross_attention = list(reversed(only_cross_attention)) + output_channel = reversed_block_out_channels[0] + reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) + for i, up_block_type in enumerate(self.up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + is_final_block = i == len(block_out_channels) - 1 + + if up_block_type == "CrossAttnUpBlock2D": + up_block = FlaxCrossAttnUpBlock2D( + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + num_layers=self.layers_per_block + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + num_attention_heads=reversed_num_attention_heads[i], + add_upsample=not is_final_block, + dropout=self.dropout, + use_linear_projection=self.use_linear_projection, + only_cross_attention=only_cross_attention[i], + use_memory_efficient_attention=self.use_memory_efficient_attention, + split_head_dim=self.split_head_dim, + dtype=self.dtype, + ) + else: + up_block = FlaxUpBlock2D( + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + num_layers=self.layers_per_block + 1, + add_upsample=not is_final_block, + dropout=self.dropout, + dtype=self.dtype, + ) + + up_blocks.append(up_block) + prev_output_channel = output_channel + self.up_blocks = up_blocks + + # out + self.conv_norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-5) + self.conv_out = nn.Conv( + self.out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + def __call__( + self, + sample: jnp.ndarray, + timesteps: Union[jnp.ndarray, float, int], + encoder_hidden_states: jnp.ndarray, + added_cond_kwargs: Optional[Union[Dict, FrozenDict]] = None, + down_block_additional_residuals: Optional[Tuple[jnp.ndarray, ...]] = None, + mid_block_additional_residual: Optional[jnp.ndarray] = None, + return_dict: bool = True, + train: bool = False, + ) -> Union[FlaxUNet2DConditionOutput, Tuple[jnp.ndarray]]: + r""" + Args: + sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor + timestep (`jnp.ndarray` or `float` or `int`): timesteps + encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of down unet blocks. + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle unet block. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a + plain tuple. + train (`bool`, *optional*, defaults to `False`): + Use deterministic functions and disable dropout when not training. + + Returns: + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`: + [`~models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is the sample tensor. + """ + # 1. 
time + if not isinstance(timesteps, jnp.ndarray): + timesteps = jnp.array([timesteps], dtype=jnp.int32) + elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0: + timesteps = timesteps.astype(dtype=jnp.float32) + timesteps = jnp.expand_dims(timesteps, 0) + + t_emb = self.time_proj(timesteps) + t_emb = self.time_embedding(t_emb) + + # additional embeddings + aug_emb = None + if self.addition_embed_type == "text_time": + if added_cond_kwargs is None: + raise ValueError( + f"Need to provide argument `added_cond_kwargs` for {self.__class__} when using `addition_embed_type={self.addition_embed_type}`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if text_embeds is None: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + if time_ids is None: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + # compute time embeds + time_embeds = self.add_time_proj(jnp.ravel(time_ids)) # (1, 6) => (6,) => (6, 256) + time_embeds = jnp.reshape(time_embeds, (text_embeds.shape[0], -1)) + add_embeds = jnp.concatenate([text_embeds, time_embeds], axis=-1) + aug_emb = self.add_embedding(add_embeds) + + t_emb = t_emb + aug_emb if aug_emb is not None else t_emb + + # 2. pre-process + sample = jnp.transpose(sample, (0, 2, 3, 1)) + sample = self.conv_in(sample) + + # 3. down + down_block_res_samples = (sample,) + for down_block in self.down_blocks: + if isinstance(down_block, FlaxCrossAttnDownBlock2D): + sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train) + else: + sample, res_samples = down_block(sample, t_emb, deterministic=not train) + down_block_res_samples += res_samples + + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample += down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train) + + if mid_block_additional_residual is not None: + sample += mid_block_additional_residual + + # 5. up + for up_block in self.up_blocks: + res_samples = down_block_res_samples[-(self.layers_per_block + 1) :] + down_block_res_samples = down_block_res_samples[: -(self.layers_per_block + 1)] + if isinstance(up_block, FlaxCrossAttnUpBlock2D): + sample = up_block( + sample, + temb=t_emb, + encoder_hidden_states=encoder_hidden_states, + res_hidden_states_tuple=res_samples, + deterministic=not train, + ) + else: + sample = up_block(sample, temb=t_emb, res_hidden_states_tuple=res_samples, deterministic=not train) + + # 6. 
post-process + sample = self.conv_norm_out(sample) + sample = nn.silu(sample) + sample = self.conv_out(sample) + sample = jnp.transpose(sample, (0, 3, 1, 2)) + + if not return_dict: + return (sample,) + + return FlaxUNet2DConditionOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_blocks.py new file mode 100644 index 000000000..a48f1841c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_blocks.py @@ -0,0 +1,2405 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from torch import nn + +from ...utils import deprecate, is_torch_version, logging +from ...utils.torch_utils import apply_freeu +from ..attention import Attention +from ..resnet import ( + Downsample2D, + ResnetBlock2D, + SpatioTemporalResBlock, + TemporalConvLayer, + Upsample2D, +) +from ..transformers.dual_transformer_2d import DualTransformer2DModel +from ..transformers.transformer_2d import Transformer2DModel +from ..transformers.transformer_temporal import ( + TransformerSpatioTemporalModel, + TransformerTemporalModel, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_down_block( + down_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + temb_channels: int, + add_downsample: bool, + resnet_eps: float, + resnet_act_fn: str, + num_attention_heads: int, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + downsample_padding: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = True, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + temporal_num_attention_heads: int = 8, + temporal_max_seq_length: int = 32, + transformer_layers_per_block: int = 1, +) -> Union[ + "DownBlock3D", + "CrossAttnDownBlock3D", + "DownBlockMotion", + "CrossAttnDownBlockMotion", + "DownBlockSpatioTemporal", + "CrossAttnDownBlockSpatioTemporal", +]: + if down_block_type == "DownBlock3D": + return DownBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "CrossAttnDownBlock3D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D") + return CrossAttnDownBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, 
+ resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + if down_block_type == "DownBlockMotion": + return DownBlockMotion( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + temporal_num_attention_heads=temporal_num_attention_heads, + temporal_max_seq_length=temporal_max_seq_length, + ) + elif down_block_type == "CrossAttnDownBlockMotion": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion") + return CrossAttnDownBlockMotion( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + temporal_num_attention_heads=temporal_num_attention_heads, + temporal_max_seq_length=temporal_max_seq_length, + ) + elif down_block_type == "DownBlockSpatioTemporal": + # added for SDV + return DownBlockSpatioTemporal( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + ) + elif down_block_type == "CrossAttnDownBlockSpatioTemporal": + # added for SDV + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockSpatioTemporal") + return CrossAttnDownBlockSpatioTemporal( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + add_downsample=add_downsample, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + ) + + raise ValueError(f"{down_block_type} does not exist.") + + +def get_up_block( + up_block_type: str, + num_layers: int, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + add_upsample: bool, + resnet_eps: float, + resnet_act_fn: str, + num_attention_heads: int, + resolution_idx: Optional[int] = None, + resnet_groups: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = True, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + temporal_num_attention_heads: int = 8, + temporal_cross_attention_dim: Optional[int] = None, + temporal_max_seq_length: int = 32, + transformer_layers_per_block: int = 1, + dropout: float = 0.0, +) -> Union[ + "UpBlock3D", + "CrossAttnUpBlock3D", + 
"UpBlockMotion", + "CrossAttnUpBlockMotion", + "UpBlockSpatioTemporal", + "CrossAttnUpBlockSpatioTemporal", +]: + if up_block_type == "UpBlock3D": + return UpBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + resolution_idx=resolution_idx, + ) + elif up_block_type == "CrossAttnUpBlock3D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D") + return CrossAttnUpBlock3D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resolution_idx=resolution_idx, + ) + if up_block_type == "UpBlockMotion": + return UpBlockMotion( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + resolution_idx=resolution_idx, + temporal_num_attention_heads=temporal_num_attention_heads, + temporal_max_seq_length=temporal_max_seq_length, + ) + elif up_block_type == "CrossAttnUpBlockMotion": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion") + return CrossAttnUpBlockMotion( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resolution_idx=resolution_idx, + temporal_num_attention_heads=temporal_num_attention_heads, + temporal_max_seq_length=temporal_max_seq_length, + ) + elif up_block_type == "UpBlockSpatioTemporal": + # added for SDV + return UpBlockSpatioTemporal( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + resolution_idx=resolution_idx, + add_upsample=add_upsample, + ) + elif up_block_type == "CrossAttnUpBlockSpatioTemporal": + # added for SDV + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockSpatioTemporal") + return CrossAttnUpBlockSpatioTemporal( + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + num_layers=num_layers, + 
transformer_layers_per_block=transformer_layers_per_block, + add_upsample=add_upsample, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + resolution_idx=resolution_idx, + ) + + raise ValueError(f"{up_block_type} does not exist.") + + +class UNetMidBlock3DCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = True, + upcast_attention: bool = False, + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + temp_convs = [ + TemporalConvLayer( + in_channels, + in_channels, + dropout=0.1, + norm_num_groups=resnet_groups, + ) + ] + attentions = [] + temp_attentions = [] + + for _ in range(num_layers): + attentions.append( + Transformer2DModel( + in_channels // num_attention_heads, + num_attention_heads, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + ) + temp_attentions.append( + TransformerTemporalModel( + in_channels // num_attention_heads, + num_attention_heads, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temp_convs.append( + TemporalConvLayer( + in_channels, + in_channels, + dropout=0.1, + norm_num_groups=resnet_groups, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.temp_convs = nn.ModuleList(temp_convs) + self.attentions = nn.ModuleList(attentions) + self.temp_attentions = nn.ModuleList(temp_attentions) + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ) -> torch.FloatTensor: + hidden_states = self.resnets[0](hidden_states, temb) + hidden_states = self.temp_convs[0](hidden_states, num_frames=num_frames) + for attn, temp_attn, resnet, temp_conv in zip( + self.attentions, self.temp_attentions, self.resnets[1:], self.temp_convs[1:] + ): + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + 
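+ # the spatial transformer above attends within each frame; the temporal transformer below attends
+ # across the `num_frames` axis before the resnet / temporal-conv pair.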
hidden_states = temp_attn( + hidden_states, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + hidden_states = resnet(hidden_states, temb) + hidden_states = temp_conv(hidden_states, num_frames=num_frames) + + return hidden_states + + +class CrossAttnDownBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + ): + super().__init__() + resnets = [] + attentions = [] + temp_attentions = [] + temp_convs = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temp_convs.append( + TemporalConvLayer( + out_channels, + out_channels, + dropout=0.1, + norm_num_groups=resnet_groups, + ) + ) + attentions.append( + Transformer2DModel( + out_channels // num_attention_heads, + num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + ) + temp_attentions.append( + TransformerTemporalModel( + out_channels // num_attention_heads, + num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.resnets = nn.ModuleList(resnets) + self.temp_convs = nn.ModuleList(temp_convs) + self.attentions = nn.ModuleList(attentions) + self.temp_attentions = nn.ModuleList(temp_attentions) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + cross_attention_kwargs: Dict[str, Any] = None, + ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + # TODO(Patrick, William) - attention mask is not used + output_states = () + + for resnet, temp_conv, attn, temp_attn in zip( + self.resnets, self.temp_convs, self.attentions, self.temp_attentions + ): + hidden_states = resnet(hidden_states, temb) + hidden_states = temp_conv(hidden_states, num_frames=num_frames) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + 
cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + hidden_states = temp_attn( + hidden_states, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class DownBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + resnets = [] + temp_convs = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temp_convs.append( + TemporalConvLayer( + out_channels, + out_channels, + dropout=0.1, + norm_num_groups=resnet_groups, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.temp_convs = nn.ModuleList(temp_convs) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + for resnet, temp_conv in zip(self.resnets, self.temp_convs): + hidden_states = resnet(hidden_states, temb) + hidden_states = temp_conv(hidden_states, num_frames=num_frames) + + output_states += (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states += (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnUpBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + resolution_idx: Optional[int] = None, + ): + super().__init__() + resnets = [] + temp_convs = [] + attentions = [] + temp_attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + 
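+ # Each up-block ResNet consumes the upsampled features concatenated with the matching skip connection, so its input width is resnet_in_channels + res_skip_channels.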
resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temp_convs.append( + TemporalConvLayer( + out_channels, + out_channels, + dropout=0.1, + norm_num_groups=resnet_groups, + ) + ) + attentions.append( + Transformer2DModel( + out_channels // num_attention_heads, + num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + ) + temp_attentions.append( + TransformerTemporalModel( + out_channels // num_attention_heads, + num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.resnets = nn.ModuleList(resnets) + self.temp_convs = nn.ModuleList(temp_convs) + self.attentions = nn.ModuleList(attentions) + self.temp_attentions = nn.ModuleList(temp_attentions) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + cross_attention_kwargs: Dict[str, Any] = None, + ) -> torch.FloatTensor: + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + # TODO(Patrick, William) - attention mask is not used + for resnet, temp_conv, attn, temp_attn in zip( + self.resnets, self.temp_convs, self.attentions, self.temp_attentions + ): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + hidden_states = resnet(hidden_states, temb) + hidden_states = temp_conv(hidden_states, num_frames=num_frames) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + hidden_states = temp_attn( + hidden_states, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class UpBlock3D(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + 
resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + resolution_idx: Optional[int] = None, + ): + super().__init__() + resnets = [] + temp_convs = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + temp_convs.append( + TemporalConvLayer( + out_channels, + out_channels, + dropout=0.1, + norm_num_groups=resnet_groups, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.temp_convs = nn.ModuleList(temp_convs) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + num_frames: int = 1, + ) -> torch.FloatTensor: + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + for resnet, temp_conv in zip(self.resnets, self.temp_convs): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + hidden_states = resnet(hidden_states, temb) + hidden_states = temp_conv(hidden_states, num_frames=num_frames) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class DownBlockMotion(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + temporal_num_attention_heads: int = 1, + temporal_cross_attention_dim: Optional[int] = None, + temporal_max_seq_length: int = 32, + ): + super().__init__() + resnets = [] + motion_modules = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + 
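+ # Pair each spatial ResNet block with a temporal transformer (motion module) that attends across frames.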
motion_modules.append( + TransformerTemporalModel( + num_attention_heads=temporal_num_attention_heads, + in_channels=out_channels, + norm_num_groups=resnet_groups, + cross_attention_dim=temporal_cross_attention_dim, + attention_bias=False, + activation_fn="geglu", + positional_embeddings="sinusoidal", + num_positional_embeddings=temporal_max_seq_length, + attention_head_dim=out_channels // temporal_num_attention_heads, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + *args, + **kwargs, + ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + output_states = () + + blocks = zip(self.resnets, self.motion_modules) + for resnet, motion_module in blocks: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = motion_module(hidden_states, num_frames=num_frames)[0] + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnDownBlockMotion(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + temporal_cross_attention_dim: Optional[int] = None, + temporal_num_attention_heads: int = 8, + temporal_max_seq_length: int = 32, + ): + super().__init__() + resnets = [] + attentions = [] + motion_modules = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + 
in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + + motion_modules.append( + TransformerTemporalModel( + num_attention_heads=temporal_num_attention_heads, + in_channels=out_channels, + norm_num_groups=resnet_groups, + cross_attention_dim=temporal_cross_attention_dim, + attention_bias=False, + activation_fn="geglu", + positional_embeddings="sinusoidal", + num_positional_embeddings=temporal_max_seq_length, + attention_head_dim=out_channels // temporal_num_attention_heads, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=downsample_padding, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + additional_residuals: Optional[torch.FloatTensor] = None, + ): + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. 
`scale` will be ignored.") + + output_states = () + + blocks = list(zip(self.resnets, self.attentions, self.motion_modules)) + for i, (resnet, attn, motion_module) in enumerate(blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = motion_module( + hidden_states, + num_frames=num_frames, + )[0] + + # apply additional residuals to the output of the last pair of resnet and attention blocks + if i == len(blocks) - 1 and additional_residuals is not None: + hidden_states = hidden_states + additional_residuals + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnUpBlockMotion(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + temporal_cross_attention_dim: Optional[int] = None, + temporal_num_attention_heads: int = 8, + temporal_max_seq_length: int = 32, + ): + super().__init__() + resnets = [] + attentions = [] + motion_modules = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // 
num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + motion_modules.append( + TransformerTemporalModel( + num_attention_heads=temporal_num_attention_heads, + in_channels=out_channels, + norm_num_groups=resnet_groups, + cross_attention_dim=temporal_cross_attention_dim, + attention_bias=False, + activation_fn="geglu", + positional_embeddings="sinusoidal", + num_positional_embeddings=temporal_max_seq_length, + attention_head_dim=out_channels // temporal_num_attention_heads, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. 
`scale` will be ignored.") + + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + blocks = zip(self.resnets, self.attentions, self.motion_modules) + for resnet, attn, motion_module in blocks: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = motion_module( + hidden_states, + num_frames=num_frames, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class UpBlockMotion(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + temporal_norm_num_groups: int = 32, + temporal_cross_attention_dim: Optional[int] = None, + temporal_num_attention_heads: int = 8, + temporal_max_seq_length: int = 32, + ): + super().__init__() + resnets = [] + motion_modules = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + motion_modules.append( + TransformerTemporalModel( + num_attention_heads=temporal_num_attention_heads, + in_channels=out_channels, + norm_num_groups=temporal_norm_num_groups, + cross_attention_dim=temporal_cross_attention_dim, + attention_bias=False, + 
activation_fn="geglu", + positional_embeddings="sinusoidal", + num_positional_embeddings=temporal_max_seq_length, + attention_head_dim=out_channels // temporal_num_attention_heads, + ) + ) + + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size=None, + num_frames: int = 1, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + blocks = zip(self.resnets, self.motion_modules) + + for resnet, motion_module in blocks: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = motion_module(hidden_states, num_frames=num_frames)[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +class UNetMidBlockCrossAttnMotion(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: float = False, + use_linear_projection: float = False, + upcast_attention: float = False, + attention_type: str = "default", + temporal_num_attention_heads: int = 1, + temporal_cross_attention_dim: Optional[int] = None, + temporal_max_seq_length: int = 32, + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels 
// 4, 32) + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + motion_modules = [] + + for _ in range(num_layers): + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + motion_modules.append( + TransformerTemporalModel( + num_attention_heads=temporal_num_attention_heads, + attention_head_dim=in_channels // temporal_num_attention_heads, + in_channels=in_channels, + norm_num_groups=resnet_groups, + cross_attention_dim=temporal_cross_attention_dim, + attention_bias=False, + positional_embeddings="sinusoidal", + num_positional_embeddings=temporal_max_seq_length, + activation_fn="geglu", + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + self.motion_modules = nn.ModuleList(motion_modules) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + num_frames: int = 1, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. 
`scale` will be ignored.") + + hidden_states = self.resnets[0](hidden_states, temb) + + blocks = zip(self.attentions, self.resnets[1:], self.motion_modules) + for attn, resnet, motion_module in blocks: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(motion_module), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + else: + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = motion_module( + hidden_states, + num_frames=num_frames, + )[0] + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +class MidBlockTemporalDecoder(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + attention_head_dim: int = 512, + num_layers: int = 1, + upcast_attention: bool = False, + ): + super().__init__() + + resnets = [] + attentions = [] + for i in range(num_layers): + input_channels = in_channels if i == 0 else out_channels + resnets.append( + SpatioTemporalResBlock( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=None, + eps=1e-6, + temporal_eps=1e-5, + merge_factor=0.0, + merge_strategy="learned", + switch_spatial_to_temporal_mix=True, + ) + ) + + attentions.append( + Attention( + query_dim=in_channels, + heads=in_channels // attention_head_dim, + dim_head=attention_head_dim, + eps=1e-6, + upcast_attention=upcast_attention, + norm_num_groups=32, + bias=True, + residual_connection=True, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward( + self, + hidden_states: torch.FloatTensor, + image_only_indicator: torch.FloatTensor, + ): + hidden_states = self.resnets[0]( + hidden_states, + image_only_indicator=image_only_indicator, + ) + for resnet, attn in zip(self.resnets[1:], self.attentions): + hidden_states = attn(hidden_states) + hidden_states = resnet( + hidden_states, + image_only_indicator=image_only_indicator, + ) + + return hidden_states + + +class UpBlockTemporalDecoder(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + num_layers: int = 1, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + for i in range(num_layers): + input_channels = in_channels if i == 0 else out_channels + + resnets.append( + SpatioTemporalResBlock( + in_channels=input_channels, + out_channels=out_channels, + temb_channels=None, + eps=1e-6, + temporal_eps=1e-5, + merge_factor=0.0, + merge_strategy="learned", + switch_spatial_to_temporal_mix=True, + ) + ) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, 
use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + def forward( + self, + hidden_states: torch.FloatTensor, + image_only_indicator: torch.FloatTensor, + ) -> torch.FloatTensor: + for resnet in self.resnets: + hidden_states = resnet( + hidden_states, + image_only_indicator=image_only_indicator, + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class UNetMidBlockSpatioTemporal(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + # support for variable transformer layers per block + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + # there is always at least one resnet + resnets = [ + SpatioTemporalResBlock( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=1e-5, + ) + ] + attentions = [] + + for i in range(num_layers): + attentions.append( + TransformerSpatioTemporalModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + ) + ) + + resnets.append( + SpatioTemporalResBlock( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=1e-5, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + hidden_states = self.resnets[0]( + hidden_states, + temb, + image_only_indicator=image_only_indicator, + ) + + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if self.training and self.gradient_checkpointing: # TODO + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + return_dict=False, + )[0] + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + **ckpt_kwargs, + ) + else: + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + return_dict=False, + )[0] + hidden_states = resnet( + hidden_states, + temb, + image_only_indicator=image_only_indicator, + ) + + return hidden_states + + +class DownBlockSpatioTemporal(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + num_layers: int = 1, + add_downsample: bool = True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + 
SpatioTemporalResBlock( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=1e-5, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + ) + else: + hidden_states = resnet( + hidden_states, + temb, + image_only_indicator=image_only_indicator, + ) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnDownBlockSpatioTemporal(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + add_downsample: bool = True, + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + SpatioTemporalResBlock( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=1e-6, + ) + ) + attentions.append( + TransformerSpatioTemporalModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, + use_conv=True, + out_channels=out_channels, + padding=1, + name="op", + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + blocks = list(zip(self.resnets, self.attentions)) + for resnet, attn in blocks: + if self.training and self.gradient_checkpointing: # TODO + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if 
return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + **ckpt_kwargs, + ) + + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + return_dict=False, + )[0] + else: + hidden_states = resnet( + hidden_states, + temb, + image_only_indicator=image_only_indicator, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + return_dict=False, + )[0] + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class UpBlockSpatioTemporal(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + num_layers: int = 1, + resnet_eps: float = 1e-6, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + SpatioTemporalResBlock( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + ) + else: + hidden_states = resnet( + hidden_states, + temb, + image_only_indicator=image_only_indicator, + ) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states + + +class CrossAttnUpBlockSpatioTemporal(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + num_layers: int = 1, + 
transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + SpatioTemporalResBlock( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + ) + ) + attentions.append( + TransformerSpatioTemporalModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + image_only_indicator: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: # TODO + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + image_only_indicator, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + return_dict=False, + )[0] + else: + hidden_states = resnet( + hidden_states, + temb, + image_only_indicator=image_only_indicator, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states) + + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_condition.py new file mode 100644 index 000000000..b7641a96a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_condition.py @@ -0,0 +1,753 @@ +# Copyright 2024 Alibaba DAMO-VILAB and 
The HuggingFace Team. All rights reserved. +# Copyright 2024 The ModelScope Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import UNet2DConditionLoadersMixin +from ...utils import BaseOutput, deprecate, logging +from ..activations import get_activation +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + Attention, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from ..transformers.transformer_temporal import TransformerTemporalModel +from .unet_3d_blocks import ( + CrossAttnDownBlock3D, + CrossAttnUpBlock3D, + DownBlock3D, + UNetMidBlock3DCrossAttn, + UpBlock3D, + get_down_block, + get_up_block, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet3DConditionOutput(BaseOutput): + """ + The output of [`UNet3DConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor + + +class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + r""" + A conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`): + The tuple of downsample blocks to use. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D")`): + The tuple of upsample blocks to use. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. 
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features. + attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): The number of attention heads. + """ + + _supports_gradient_checkpointing = False + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ), + up_block_types: Tuple[str, ...] = ( + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ), + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1024, + attention_head_dim: Union[int, Tuple[int]] = 64, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise NotImplementedError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
+ ) + + # input + conv_in_kernel = 3 + conv_out_kernel = 3 + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], True, 0) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + ) + + self.transformer_in = TransformerTemporalModel( + num_attention_heads=8, + attention_head_dim=attention_head_dim, + in_channels=block_out_channels[0], + num_layers=1, + norm_num_groups=norm_num_groups, + ) + + # class embedding + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=False, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock3DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=False, + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=False, + resolution_idx=i, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps + ) + self.conv_act = get_activation("silu") + else: + self.conv_norm_out = None + self.conv_act = None + + 
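+ # Output projection: map the first block's channel width back to the requested number of output channels.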
conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d( + block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + ) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None: + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None: + """ + Sets the attention processor to use [feed forward + chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers). + + Parameters: + chunk_size (`int`, *optional*): + The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually + over each tensor of dim=`dim`. + dim (`int`, *optional*, defaults to `0`): + The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch) + or dim=1 (sequence length). 
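+
+ Example (an illustrative sketch; `unet` below stands for an instance of this model):
+
+ >>> # run the feed-forward layers in chunks of size 1 along dim=1 to trade speed for memory
+ >>> unet.enable_forward_chunking(chunk_size=1, dim=1)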
+ """ + if dim not in [0, 1]: + raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}") + + # By default chunk size is 1 + chunk_size = chunk_size or 1 + + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, chunk_size, dim) + + def disable_forward_chunking(self): + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, None, 0) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): + module.gradient_checkpointing = value + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu + def enable_freeu(self, s1, s2, b1, b2): + r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stage blocks where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that + are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + for i, upsample_block in enumerate(self.up_blocks): + setattr(upsample_block, "s1", s1) + setattr(upsample_block, "s2", s2) + setattr(upsample_block, "b1", b1) + setattr(upsample_block, "b2", b2) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism.""" + freeu_keys = {"s1", "s2", "b1", "b2"} + for i, upsample_block in enumerate(self.up_blocks): + for k in freeu_keys: + if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: + setattr(upsample_block, k, None) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unload_lora + def unload_lora(self): + """Unloads LoRA weights.""" + deprecate( + "unload_lora", + "0.28.0", + "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().", + ) + for module in self.modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + r""" + The [`UNet3DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. 
If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of down unet blocks. + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle unet block. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + + Returns: + [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + num_frames = sample.shape[2] + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. 
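+ # (for example, when the model is loaded with `torch_dtype=torch.float16`, `self.dtype` is fp16 here)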
+ t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + emb = emb.repeat_interleave(repeats=num_frames, dim=0) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) + + # 2. pre-process + sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:]) + sample = self.conv_in(sample) + + sample = self.transformer_in( + sample, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) + + down_block_res_samples += res_samples + + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + + if mid_block_additional_residual is not None: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + upsample_size=upsample_size, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + num_frames=num_frames, + ) + + # 6. 
post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + + sample = self.conv_out(sample) + + # reshape to (batch, channel, framerate, width, height) + sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4) + + if not return_dict: + return (sample,) + + return UNet3DConditionOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py new file mode 100644 index 000000000..5c5c6a2cc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py @@ -0,0 +1,724 @@ +# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import UNet2DConditionLoadersMixin +from ...utils import logging +from ..activations import get_activation +from ..attention import Attention, FeedForward +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from ..transformers.transformer_temporal import TransformerTemporalModel +from .unet_3d_blocks import ( + CrossAttnDownBlock3D, + CrossAttnUpBlock3D, + DownBlock3D, + UNetMidBlock3DCrossAttn, + UpBlock3D, + get_down_block, + get_up_block, +) +from .unet_3d_condition import UNet3DConditionOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class I2VGenXLTransformerTemporalEncoder(nn.Module): + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + activation_fn: str = "geglu", + upcast_attention: bool = False, + ff_inner_dim: Optional[int] = None, + dropout: int = 0.0, + ): + super().__init__() + self.norm1 = nn.LayerNorm(dim, elementwise_affine=True, eps=1e-5) + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=False, + upcast_attention=upcast_attention, + out_bias=True, + ) + self.ff = FeedForward( + dim, + dropout=dropout, + activation_fn=activation_fn, + final_dropout=False, + inner_dim=ff_inner_dim, + bias=True, + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + ) -> torch.FloatTensor: + norm_hidden_states = self.norm1(hidden_states) + attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None) + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + ff_output = self.ff(hidden_states) + hidden_states = 
ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + +class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + r""" + I2VGenXL UNet. It is a conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep + and returns a sample-shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. + attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim. + num_attention_heads (`int`, *optional*): The number of attention heads. + """ + + _supports_gradient_checkpointing = False + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "CrossAttnDownBlock3D", + "DownBlock3D", + ), + up_block_types: Tuple[str, ...] = ( + "UpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + "CrossAttnUpBlock3D", + ), + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + norm_num_groups: Optional[int] = 32, + cross_attention_dim: int = 1024, + attention_head_dim: Union[int, Tuple[int]] = 64, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + ): + super().__init__() + + # When we first integrated the UNet into the library, we didn't have `attention_head_dim`. As a consequence + # of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This + # is why we ignore `num_attention_heads` and calculate it from `attention_head_dims` below. + # This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it + # without running proper depcrecation cycles for the {down,mid,up} blocks which are a + # part of the public API. + num_attention_heads = attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." 
+ ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + # input + self.conv_in = nn.Conv2d(in_channels + in_channels, block_out_channels[0], kernel_size=3, padding=1) + + self.transformer_in = TransformerTemporalModel( + num_attention_heads=8, + attention_head_dim=num_attention_heads, + in_channels=block_out_channels[0], + num_layers=1, + norm_num_groups=norm_num_groups, + ) + + # image embedding + self.image_latents_proj_in = nn.Sequential( + nn.Conv2d(4, in_channels * 4, 3, padding=1), + nn.SiLU(), + nn.Conv2d(in_channels * 4, in_channels * 4, 3, stride=1, padding=1), + nn.SiLU(), + nn.Conv2d(in_channels * 4, in_channels, 3, stride=1, padding=1), + ) + self.image_latents_temporal_encoder = I2VGenXLTransformerTemporalEncoder( + dim=in_channels, + num_attention_heads=2, + ff_inner_dim=in_channels * 4, + attention_head_dim=in_channels, + activation_fn="gelu", + ) + self.image_latents_context_embedding = nn.Sequential( + nn.Conv2d(4, in_channels * 8, 3, padding=1), + nn.SiLU(), + nn.AdaptiveAvgPool2d((32, 32)), + nn.Conv2d(in_channels * 8, in_channels * 16, 3, stride=2, padding=1), + nn.SiLU(), + nn.Conv2d(in_channels * 16, cross_attention_dim, 3, stride=2, padding=1), + ) + + # other embeddings -- time, context, fps, etc. + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], True, 0) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn="silu") + self.context_embedding = nn.Sequential( + nn.Linear(cross_attention_dim, time_embed_dim), + nn.SiLU(), + nn.Linear(time_embed_dim, cross_attention_dim * in_channels), + ) + self.fps_embedding = nn.Sequential( + nn.Linear(timestep_input_dim, time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim) + ) + + # blocks + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=1e-05, + resnet_act_fn="silu", + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[i], + downsample_padding=1, + dual_cross_attention=False, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock3DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=1e-05, + resnet_act_fn="silu", + output_scale_factor=1, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + 
dual_cross_attention=False, + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=1e-05, + resnet_act_fn="silu", + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=False, + resolution_idx=i, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-05) + self.conv_act = get_activation("silu") + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking + def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None: + """ + Sets the attention processor to use [feed forward + chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers). + + Parameters: + chunk_size (`int`, *optional*): + The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually + over each tensor of dim=`dim`. + dim (`int`, *optional*, defaults to `0`): + The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch) + or dim=1 (sequence length). + """ + if dim not in [0, 1]: + raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}") + + # By default chunk size is 1 + chunk_size = chunk_size or 1 + + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, chunk_size, dim) + + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking + def disable_forward_chunking(self): + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, None, 0) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel._set_gradient_checkpointing + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)): + module.gradient_checkpointing = value + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu + def enable_freeu(self, s1, s2, b1, b2): + r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. 
+ + The suffixes after the scaling factors represent the stage blocks where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that + are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + for i, upsample_block in enumerate(self.up_blocks): + setattr(upsample_block, "s1", s1) + setattr(upsample_block, "s2", s2) + setattr(upsample_block, "b1", b1) + setattr(upsample_block, "b2", b2) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu + def disable_freeu(self): + """Disables the FreeU mechanism.""" + freeu_keys = {"s1", "s2", "b1", "b2"} + for i, upsample_block in enumerate(self.up_blocks): + for k in freeu_keys: + if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: + setattr(upsample_block, k, None) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + fps: torch.Tensor, + image_latents: torch.Tensor, + image_embeddings: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple[torch.FloatTensor]]: + r""" + The [`I2VGenXLUNet`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + fps (`torch.Tensor`): Frames per second for the video being generated. Used as a "micro-condition". + image_latents (`torch.FloatTensor`): Image encodings from the VAE. 
+ image_embeddings (`torch.FloatTensor`): Projection embeddings of the conditioning image computed with a vision encoder. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain + tuple. + + Returns: + [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + batch_size, channels, num_frames, height, width = sample.shape + + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass `timesteps` as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timesteps, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + t_emb = self.time_embedding(t_emb, timestep_cond) + + # 2. FPS + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + fps = fps.expand(fps.shape[0]) + fps_emb = self.fps_embedding(self.time_proj(fps).to(dtype=self.dtype)) + + # 3. time + FPS embeddings. + emb = t_emb + fps_emb + emb = emb.repeat_interleave(repeats=num_frames, dim=0) + + # 4. context embeddings. + # The context embeddings consist of both text embeddings from the input prompt + # AND the image embeddings from the input image. For images, both VAE encodings + # and the CLIP image embeddings are incorporated. + # So the final `context_embeddings` becomes the query for cross-attention. 
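+ # With the default config, the context sequence assembled below is
+ # [text tokens | 8*8=64 pooled VAE-latent tokens | `in_channels`=4 projected image tokens],
+ # each part projected to `cross_attention_dim` before concatenation.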
+ context_emb = sample.new_zeros(batch_size, 0, self.config.cross_attention_dim) + context_emb = torch.cat([context_emb, encoder_hidden_states], dim=1) + + image_latents_for_context_embds = image_latents[:, :, :1, :] + image_latents_context_embs = image_latents_for_context_embds.permute(0, 2, 1, 3, 4).reshape( + image_latents_for_context_embds.shape[0] * image_latents_for_context_embds.shape[2], + image_latents_for_context_embds.shape[1], + image_latents_for_context_embds.shape[3], + image_latents_for_context_embds.shape[4], + ) + image_latents_context_embs = self.image_latents_context_embedding(image_latents_context_embs) + + _batch_size, _channels, _height, _width = image_latents_context_embs.shape + image_latents_context_embs = image_latents_context_embs.permute(0, 2, 3, 1).reshape( + _batch_size, _height * _width, _channels + ) + context_emb = torch.cat([context_emb, image_latents_context_embs], dim=1) + + image_emb = self.context_embedding(image_embeddings) + image_emb = image_emb.view(-1, self.config.in_channels, self.config.cross_attention_dim) + context_emb = torch.cat([context_emb, image_emb], dim=1) + context_emb = context_emb.repeat_interleave(repeats=num_frames, dim=0) + + image_latents = image_latents.permute(0, 2, 1, 3, 4).reshape( + image_latents.shape[0] * image_latents.shape[2], + image_latents.shape[1], + image_latents.shape[3], + image_latents.shape[4], + ) + image_latents = self.image_latents_proj_in(image_latents) + image_latents = ( + image_latents[None, :] + .reshape(batch_size, num_frames, channels, height, width) + .permute(0, 3, 4, 1, 2) + .reshape(batch_size * height * width, num_frames, channels) + ) + image_latents = self.image_latents_temporal_encoder(image_latents) + image_latents = image_latents.reshape(batch_size, height, width, num_frames, channels).permute(0, 4, 3, 1, 2) + + # 5. pre-process + sample = torch.cat([sample, image_latents], dim=1) + sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:]) + sample = self.conv_in(sample) + sample = self.transformer_in( + sample, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # 6. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=context_emb, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) + + down_block_res_samples += res_samples + + # 7. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=context_emb, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + # 8. 
up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=context_emb, + upsample_size=upsample_size, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + num_frames=num_frames, + ) + + # 9. post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + + sample = self.conv_out(sample) + + # reshape to (batch, channel, framerate, width, height) + sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4) + + if not return_dict: + return (sample,) + + return UNet3DConditionOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_kandinsky3.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_kandinsky3.py new file mode 100644 index 000000000..b981c8e17 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_kandinsky3.py @@ -0,0 +1,535 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
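+
+# This module defines the Kandinsky 3.0 UNet together with its building blocks:
+# a time-conditioned group norm, attention pooling over the text embeddings, and
+# down-/up-sample blocks with optional self- and cross-attention.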
+ +from dataclasses import dataclass +from typing import Dict, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput, logging +from ..attention_processor import Attention, AttentionProcessor, AttnProcessor +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class Kandinsky3UNetOutput(BaseOutput): + sample: torch.FloatTensor = None + + +class Kandinsky3EncoderProj(nn.Module): + def __init__(self, encoder_hid_dim, cross_attention_dim): + super().__init__() + self.projection_linear = nn.Linear(encoder_hid_dim, cross_attention_dim, bias=False) + self.projection_norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, x): + x = self.projection_linear(x) + x = self.projection_norm(x) + return x + + +class Kandinsky3UNet(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + in_channels: int = 4, + time_embedding_dim: int = 1536, + groups: int = 32, + attention_head_dim: int = 64, + layers_per_block: Union[int, Tuple[int]] = 3, + block_out_channels: Tuple[int] = (384, 768, 1536, 3072), + cross_attention_dim: Union[int, Tuple[int]] = 4096, + encoder_hid_dim: int = 4096, + ): + super().__init__() + + # TOOD(Yiyi): Give better name and put into config for the following 4 parameters + expansion_ratio = 4 + compression_ratio = 2 + add_cross_attention = (False, True, True, True) + add_self_attention = (False, True, True, True) + + out_channels = in_channels + init_channels = block_out_channels[0] // 2 + self.time_proj = Timesteps(init_channels, flip_sin_to_cos=False, downscale_freq_shift=1) + + self.time_embedding = TimestepEmbedding( + init_channels, + time_embedding_dim, + ) + + self.add_time_condition = Kandinsky3AttentionPooling( + time_embedding_dim, cross_attention_dim, attention_head_dim + ) + + self.conv_in = nn.Conv2d(in_channels, init_channels, kernel_size=3, padding=1) + + self.encoder_hid_proj = Kandinsky3EncoderProj(encoder_hid_dim, cross_attention_dim) + + hidden_dims = [init_channels] + list(block_out_channels) + in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:])) + text_dims = [cross_attention_dim if is_exist else None for is_exist in add_cross_attention] + num_blocks = len(block_out_channels) * [layers_per_block] + layer_params = [num_blocks, text_dims, add_self_attention] + rev_layer_params = map(reversed, layer_params) + + cat_dims = [] + self.num_levels = len(in_out_dims) + self.down_blocks = nn.ModuleList([]) + for level, ((in_dim, out_dim), res_block_num, text_dim, self_attention) in enumerate( + zip(in_out_dims, *layer_params) + ): + down_sample = level != (self.num_levels - 1) + cat_dims.append(out_dim if level != (self.num_levels - 1) else 0) + self.down_blocks.append( + Kandinsky3DownSampleBlock( + in_dim, + out_dim, + time_embedding_dim, + text_dim, + res_block_num, + groups, + attention_head_dim, + expansion_ratio, + compression_ratio, + down_sample, + self_attention, + ) + ) + + self.up_blocks = nn.ModuleList([]) + for level, ((out_dim, in_dim), res_block_num, text_dim, self_attention) in enumerate( + zip(reversed(in_out_dims), *rev_layer_params) + ): + up_sample = level != 0 + self.up_blocks.append( + Kandinsky3UpSampleBlock( + in_dim, + cat_dims.pop(), + out_dim, + time_embedding_dim, + text_dim, + res_block_num, + groups, + attention_head_dim, + expansion_ratio, + 
compression_ratio, + up_sample, + self_attention, + ) + ) + + self.conv_norm_out = nn.GroupNorm(groups, init_channels) + self.conv_act_out = nn.SiLU() + self.conv_out = nn.Conv2d(init_channels, out_channels, kernel_size=3, padding=1) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
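+
+ Example (illustrative; `unet` stands for a `Kandinsky3UNet` instance):
+
+ >>> unet.set_default_attn_processor()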
+ """ + self.set_attn_processor(AttnProcessor()) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward(self, sample, timestep, encoder_hidden_states=None, encoder_attention_mask=None, return_dict=True): + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + if not torch.is_tensor(timestep): + dtype = torch.float32 if isinstance(timestep, float) else torch.int32 + timestep = torch.tensor([timestep], dtype=dtype, device=sample.device) + elif len(timestep.shape) == 0: + timestep = timestep[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = timestep.expand(sample.shape[0]) + time_embed_input = self.time_proj(timestep).to(sample.dtype) + time_embed = self.time_embedding(time_embed_input) + + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + + if encoder_hidden_states is not None: + time_embed = self.add_time_condition(time_embed, encoder_hidden_states, encoder_attention_mask) + + hidden_states = [] + sample = self.conv_in(sample) + for level, down_sample in enumerate(self.down_blocks): + sample = down_sample(sample, time_embed, encoder_hidden_states, encoder_attention_mask) + if level != self.num_levels - 1: + hidden_states.append(sample) + + for level, up_sample in enumerate(self.up_blocks): + if level != 0: + sample = torch.cat([sample, hidden_states.pop()], dim=1) + sample = up_sample(sample, time_embed, encoder_hidden_states, encoder_attention_mask) + + sample = self.conv_norm_out(sample) + sample = self.conv_act_out(sample) + sample = self.conv_out(sample) + + if not return_dict: + return (sample,) + return Kandinsky3UNetOutput(sample=sample) + + +class Kandinsky3UpSampleBlock(nn.Module): + def __init__( + self, + in_channels, + cat_dim, + out_channels, + time_embed_dim, + context_dim=None, + num_blocks=3, + groups=32, + head_dim=64, + expansion_ratio=4, + compression_ratio=2, + up_sample=True, + self_attention=True, + ): + super().__init__() + up_resolutions = [[None, True if up_sample else None, None, None]] + [[None] * 4] * (num_blocks - 1) + hidden_channels = ( + [(in_channels + cat_dim, in_channels)] + + [(in_channels, in_channels)] * (num_blocks - 2) + + [(in_channels, out_channels)] + ) + attentions = [] + resnets_in = [] + resnets_out = [] + + self.self_attention = self_attention + self.context_dim = context_dim + + if self_attention: + attentions.append( + Kandinsky3AttentionBlock(out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio) + ) + else: + attentions.append(nn.Identity()) + + for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions): + resnets_in.append( + Kandinsky3ResNetBlock(in_channel, in_channel, time_embed_dim, groups, compression_ratio, up_resolution) + ) + + if context_dim is not None: + attentions.append( + Kandinsky3AttentionBlock( + in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio + ) + ) + else: + attentions.append(nn.Identity()) + + resnets_out.append( + Kandinsky3ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets_in = nn.ModuleList(resnets_in) + self.resnets_out = nn.ModuleList(resnets_out) + + def forward(self, x, time_embed, context=None, context_mask=None, 
image_mask=None): + for attention, resnet_in, resnet_out in zip(self.attentions[1:], self.resnets_in, self.resnets_out): + x = resnet_in(x, time_embed) + if self.context_dim is not None: + x = attention(x, time_embed, context, context_mask, image_mask) + x = resnet_out(x, time_embed) + + if self.self_attention: + x = self.attentions[0](x, time_embed, image_mask=image_mask) + return x + + +class Kandinsky3DownSampleBlock(nn.Module): + def __init__( + self, + in_channels, + out_channels, + time_embed_dim, + context_dim=None, + num_blocks=3, + groups=32, + head_dim=64, + expansion_ratio=4, + compression_ratio=2, + down_sample=True, + self_attention=True, + ): + super().__init__() + attentions = [] + resnets_in = [] + resnets_out = [] + + self.self_attention = self_attention + self.context_dim = context_dim + + if self_attention: + attentions.append( + Kandinsky3AttentionBlock(in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio) + ) + else: + attentions.append(nn.Identity()) + + up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, False if down_sample else None, None]] + hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_blocks - 1) + for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions): + resnets_in.append( + Kandinsky3ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio) + ) + + if context_dim is not None: + attentions.append( + Kandinsky3AttentionBlock( + out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio + ) + ) + else: + attentions.append(nn.Identity()) + + resnets_out.append( + Kandinsky3ResNetBlock( + out_channel, out_channel, time_embed_dim, groups, compression_ratio, up_resolution + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets_in = nn.ModuleList(resnets_in) + self.resnets_out = nn.ModuleList(resnets_out) + + def forward(self, x, time_embed, context=None, context_mask=None, image_mask=None): + if self.self_attention: + x = self.attentions[0](x, time_embed, image_mask=image_mask) + + for attention, resnet_in, resnet_out in zip(self.attentions[1:], self.resnets_in, self.resnets_out): + x = resnet_in(x, time_embed) + if self.context_dim is not None: + x = attention(x, time_embed, context, context_mask, image_mask) + x = resnet_out(x, time_embed) + return x + + +class Kandinsky3ConditionalGroupNorm(nn.Module): + def __init__(self, groups, normalized_shape, context_dim): + super().__init__() + self.norm = nn.GroupNorm(groups, normalized_shape, affine=False) + self.context_mlp = nn.Sequential(nn.SiLU(), nn.Linear(context_dim, 2 * normalized_shape)) + self.context_mlp[1].weight.data.zero_() + self.context_mlp[1].bias.data.zero_() + + def forward(self, x, context): + context = self.context_mlp(context) + + for _ in range(len(x.shape[2:])): + context = context.unsqueeze(-1) + + scale, shift = context.chunk(2, dim=1) + x = self.norm(x) * (scale + 1.0) + shift + return x + + +class Kandinsky3Block(nn.Module): + def __init__(self, in_channels, out_channels, time_embed_dim, kernel_size=3, norm_groups=32, up_resolution=None): + super().__init__() + self.group_norm = Kandinsky3ConditionalGroupNorm(norm_groups, in_channels, time_embed_dim) + self.activation = nn.SiLU() + if up_resolution is not None and up_resolution: + self.up_sample = nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2) + else: + self.up_sample = nn.Identity() + + padding = int(kernel_size > 1) + self.projection = nn.Conv2d(in_channels, 
out_channels, kernel_size=kernel_size, padding=padding) + + if up_resolution is not None and not up_resolution: + self.down_sample = nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2) + else: + self.down_sample = nn.Identity() + + def forward(self, x, time_embed): + x = self.group_norm(x, time_embed) + x = self.activation(x) + x = self.up_sample(x) + x = self.projection(x) + x = self.down_sample(x) + return x + + +class Kandinsky3ResNetBlock(nn.Module): + def __init__( + self, in_channels, out_channels, time_embed_dim, norm_groups=32, compression_ratio=2, up_resolutions=4 * [None] + ): + super().__init__() + kernel_sizes = [1, 3, 3, 1] + hidden_channel = max(in_channels, out_channels) // compression_ratio + hidden_channels = ( + [(in_channels, hidden_channel)] + [(hidden_channel, hidden_channel)] * 2 + [(hidden_channel, out_channels)] + ) + self.resnet_blocks = nn.ModuleList( + [ + Kandinsky3Block(in_channel, out_channel, time_embed_dim, kernel_size, norm_groups, up_resolution) + for (in_channel, out_channel), kernel_size, up_resolution in zip( + hidden_channels, kernel_sizes, up_resolutions + ) + ] + ) + self.shortcut_up_sample = ( + nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2) + if True in up_resolutions + else nn.Identity() + ) + self.shortcut_projection = ( + nn.Conv2d(in_channels, out_channels, kernel_size=1) if in_channels != out_channels else nn.Identity() + ) + self.shortcut_down_sample = ( + nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2) + if False in up_resolutions + else nn.Identity() + ) + + def forward(self, x, time_embed): + out = x + for resnet_block in self.resnet_blocks: + out = resnet_block(out, time_embed) + + x = self.shortcut_up_sample(x) + x = self.shortcut_projection(x) + x = self.shortcut_down_sample(x) + x = x + out + return x + + +class Kandinsky3AttentionPooling(nn.Module): + def __init__(self, num_channels, context_dim, head_dim=64): + super().__init__() + self.attention = Attention( + context_dim, + context_dim, + dim_head=head_dim, + out_dim=num_channels, + out_bias=False, + ) + + def forward(self, x, context, context_mask=None): + context_mask = context_mask.to(dtype=context.dtype) + context = self.attention(context.mean(dim=1, keepdim=True), context, context_mask) + return x + context.squeeze(1) + + +class Kandinsky3AttentionBlock(nn.Module): + def __init__(self, num_channels, time_embed_dim, context_dim=None, norm_groups=32, head_dim=64, expansion_ratio=4): + super().__init__() + self.in_norm = Kandinsky3ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim) + self.attention = Attention( + num_channels, + context_dim or num_channels, + dim_head=head_dim, + out_dim=num_channels, + out_bias=False, + ) + + hidden_channels = expansion_ratio * num_channels + self.out_norm = Kandinsky3ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim) + self.feed_forward = nn.Sequential( + nn.Conv2d(num_channels, hidden_channels, kernel_size=1, bias=False), + nn.SiLU(), + nn.Conv2d(hidden_channels, num_channels, kernel_size=1, bias=False), + ) + + def forward(self, x, time_embed, context=None, context_mask=None, image_mask=None): + height, width = x.shape[-2:] + out = self.in_norm(x, time_embed) + out = out.reshape(x.shape[0], -1, height * width).permute(0, 2, 1) + context = context if context is not None else out + if context_mask is not None: + context_mask = context_mask.to(dtype=context.dtype) + + out = self.attention(out, context, context_mask) + out = out.permute(0, 2, 
1).unsqueeze(-1).reshape(out.shape[0], -1, height, width) + x = x + out + + out = self.out_norm(x, time_embed) + out = self.feed_forward(out) + x = x + out + return x diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_motion_model.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_motion_model.py new file mode 100644 index 000000000..ab2eac4c9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_motion_model.py @@ -0,0 +1,948 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import UNet2DConditionLoadersMixin +from ...utils import logging +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + Attention, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from ..transformers.transformer_temporal import TransformerTemporalModel +from .unet_2d_blocks import UNetMidBlock2DCrossAttn +from .unet_2d_condition import UNet2DConditionModel +from .unet_3d_blocks import ( + CrossAttnDownBlockMotion, + CrossAttnUpBlockMotion, + DownBlockMotion, + UNetMidBlockCrossAttnMotion, + UpBlockMotion, + get_down_block, + get_up_block, +) +from .unet_3d_condition import UNet3DConditionOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class MotionModules(nn.Module): + def __init__( + self, + in_channels: int, + layers_per_block: int = 2, + num_attention_heads: int = 8, + attention_bias: bool = False, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + norm_num_groups: int = 32, + max_seq_length: int = 32, + ): + super().__init__() + self.motion_modules = nn.ModuleList([]) + + for i in range(layers_per_block): + self.motion_modules.append( + TransformerTemporalModel( + in_channels=in_channels, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + attention_bias=attention_bias, + num_attention_heads=num_attention_heads, + attention_head_dim=in_channels // num_attention_heads, + positional_embeddings="sinusoidal", + num_positional_embeddings=max_seq_length, + ) + ) + + +class MotionAdapter(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + block_out_channels: Tuple[int, ...] 
= (320, 640, 1280, 1280), + motion_layers_per_block: int = 2, + motion_mid_block_layers_per_block: int = 1, + motion_num_attention_heads: int = 8, + motion_norm_num_groups: int = 32, + motion_max_seq_length: int = 32, + use_motion_mid_block: bool = True, + conv_in_channels: Optional[int] = None, + ): + """Container to store AnimateDiff Motion Modules + + Args: + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each UNet block. + motion_layers_per_block (`int`, *optional*, defaults to 2): + The number of motion layers per UNet block. + motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1): + The number of motion layers in the middle UNet block. + motion_num_attention_heads (`int`, *optional*, defaults to 8): + The number of heads to use in each attention layer of the motion module. + motion_norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use in each group normalization layer of the motion module. + motion_max_seq_length (`int`, *optional*, defaults to 32): + The maximum sequence length to use in the motion module. + use_motion_mid_block (`bool`, *optional*, defaults to True): + Whether to use a motion module in the middle of the UNet. + """ + + super().__init__() + down_blocks = [] + up_blocks = [] + + if conv_in_channels: + # input + self.conv_in = nn.Conv2d(conv_in_channels, block_out_channels[0], kernel_size=3, padding=1) + else: + self.conv_in = None + + for i, channel in enumerate(block_out_channels): + output_channel = block_out_channels[i] + down_blocks.append( + MotionModules( + in_channels=output_channel, + norm_num_groups=motion_norm_num_groups, + cross_attention_dim=None, + activation_fn="geglu", + attention_bias=False, + num_attention_heads=motion_num_attention_heads, + max_seq_length=motion_max_seq_length, + layers_per_block=motion_layers_per_block, + ) + ) + + if use_motion_mid_block: + self.mid_block = MotionModules( + in_channels=block_out_channels[-1], + norm_num_groups=motion_norm_num_groups, + cross_attention_dim=None, + activation_fn="geglu", + attention_bias=False, + num_attention_heads=motion_num_attention_heads, + layers_per_block=motion_mid_block_layers_per_block, + max_seq_length=motion_max_seq_length, + ) + else: + self.mid_block = None + + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, channel in enumerate(reversed_block_out_channels): + output_channel = reversed_block_out_channels[i] + up_blocks.append( + MotionModules( + in_channels=output_channel, + norm_num_groups=motion_norm_num_groups, + cross_attention_dim=None, + activation_fn="geglu", + attention_bias=False, + num_attention_heads=motion_num_attention_heads, + max_seq_length=motion_max_seq_length, + layers_per_block=motion_layers_per_block + 1, + ) + ) + + self.down_blocks = nn.ModuleList(down_blocks) + self.up_blocks = nn.ModuleList(up_blocks) + + def forward(self, sample): + pass + + +class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + r""" + A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a + sample shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). 
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlockMotion", + "CrossAttnDownBlockMotion", + "CrossAttnDownBlockMotion", + "DownBlockMotion", + ), + up_block_types: Tuple[str, ...] = ( + "UpBlockMotion", + "CrossAttnUpBlockMotion", + "CrossAttnUpBlockMotion", + "CrossAttnUpBlockMotion", + ), + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: int = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + use_linear_projection: bool = False, + num_attention_heads: Union[int, Tuple[int, ...]] = 8, + motion_max_seq_length: int = 32, + motion_num_attention_heads: int = 8, + use_motion_mid_block: int = True, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + ): + super().__init__() + + self.sample_size = sample_size + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
+ ) + + # input + conv_in_kernel = 3 + conv_out_kernel = 3 + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], True, 0) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, time_embed_dim, act_fn=act_fn, cond_proj_dim=time_cond_proj_dim + ) + + if encoder_hid_dim_type is None: + self.encoder_hid_proj = None + + # class embedding + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + use_linear_projection=use_linear_projection, + dual_cross_attention=False, + temporal_num_attention_heads=motion_num_attention_heads, + temporal_max_seq_length=motion_max_seq_length, + ) + self.down_blocks.append(down_block) + + # mid + if use_motion_mid_block: + self.mid_block = UNetMidBlockCrossAttnMotion( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=False, + use_linear_projection=use_linear_projection, + temporal_num_attention_heads=motion_num_attention_heads, + temporal_max_seq_length=motion_max_seq_length, + ) + + else: + self.mid_block = UNetMidBlock2DCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=False, + use_linear_projection=use_linear_projection, + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + in_channels=input_channel, + out_channels=output_channel, + 
prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=False, + resolution_idx=i, + use_linear_projection=use_linear_projection, + temporal_num_attention_heads=motion_num_attention_heads, + temporal_max_seq_length=motion_max_seq_length, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps + ) + self.conv_act = nn.SiLU() + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d( + block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + ) + + @classmethod + def from_unet2d( + cls, + unet: UNet2DConditionModel, + motion_adapter: Optional[MotionAdapter] = None, + load_weights: bool = True, + ): + has_motion_adapter = motion_adapter is not None + + # based on https://github.com/guoyww/AnimateDiff/blob/895f3220c06318ea0760131ec70408b466c49333/animatediff/models/unet.py#L459 + config = unet.config + config["_class_name"] = cls.__name__ + + down_blocks = [] + for down_blocks_type in config["down_block_types"]: + if "CrossAttn" in down_blocks_type: + down_blocks.append("CrossAttnDownBlockMotion") + else: + down_blocks.append("DownBlockMotion") + config["down_block_types"] = down_blocks + + up_blocks = [] + for down_blocks_type in config["up_block_types"]: + if "CrossAttn" in down_blocks_type: + up_blocks.append("CrossAttnUpBlockMotion") + else: + up_blocks.append("UpBlockMotion") + + config["up_block_types"] = up_blocks + + if has_motion_adapter: + config["motion_num_attention_heads"] = motion_adapter.config["motion_num_attention_heads"] + config["motion_max_seq_length"] = motion_adapter.config["motion_max_seq_length"] + config["use_motion_mid_block"] = motion_adapter.config["use_motion_mid_block"] + + # For PIA UNets we need to set the number input channels to 9 + if motion_adapter.config["conv_in_channels"]: + config["in_channels"] = motion_adapter.config["conv_in_channels"] + + # Need this for backwards compatibility with UNet2DConditionModel checkpoints + if not config.get("num_attention_heads"): + config["num_attention_heads"] = config["attention_head_dim"] + + model = cls.from_config(config) + + if not load_weights: + return model + + # Logic for loading PIA UNets which allow the first 4 channels to be any UNet2DConditionModel conv_in weight + # while the last 5 channels must be PIA conv_in weights. 
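+ # Illustrative shape sketch (assuming the usual 4-channel SD latent layout and
+ # block_out_channels[0] == 320): unet.conv_in.weight has shape (320, 4, 3, 3) and a
+ # PIA-style 9-channel conv_in has shape (320, 9, 3, 3); the torch.cat below keeps the
+ # base weights for the first 4 input channels and the adapter's weights for the
+ # remaining 5, yielding an updated (320, 9, 3, 3) kernel.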
+ if has_motion_adapter and motion_adapter.config["conv_in_channels"]: + model.conv_in = motion_adapter.conv_in + updated_conv_in_weight = torch.cat( + [unet.conv_in.weight, motion_adapter.conv_in.weight[:, 4:, :, :]], dim=1 + ) + model.conv_in.load_state_dict({"weight": updated_conv_in_weight, "bias": unet.conv_in.bias}) + else: + model.conv_in.load_state_dict(unet.conv_in.state_dict()) + + model.time_proj.load_state_dict(unet.time_proj.state_dict()) + model.time_embedding.load_state_dict(unet.time_embedding.state_dict()) + + for i, down_block in enumerate(unet.down_blocks): + model.down_blocks[i].resnets.load_state_dict(down_block.resnets.state_dict()) + if hasattr(model.down_blocks[i], "attentions"): + model.down_blocks[i].attentions.load_state_dict(down_block.attentions.state_dict()) + if model.down_blocks[i].downsamplers: + model.down_blocks[i].downsamplers.load_state_dict(down_block.downsamplers.state_dict()) + + for i, up_block in enumerate(unet.up_blocks): + model.up_blocks[i].resnets.load_state_dict(up_block.resnets.state_dict()) + if hasattr(model.up_blocks[i], "attentions"): + model.up_blocks[i].attentions.load_state_dict(up_block.attentions.state_dict()) + if model.up_blocks[i].upsamplers: + model.up_blocks[i].upsamplers.load_state_dict(up_block.upsamplers.state_dict()) + + model.mid_block.resnets.load_state_dict(unet.mid_block.resnets.state_dict()) + model.mid_block.attentions.load_state_dict(unet.mid_block.attentions.state_dict()) + + if unet.conv_norm_out is not None: + model.conv_norm_out.load_state_dict(unet.conv_norm_out.state_dict()) + if unet.conv_act is not None: + model.conv_act.load_state_dict(unet.conv_act.state_dict()) + model.conv_out.load_state_dict(unet.conv_out.state_dict()) + + if has_motion_adapter: + model.load_motion_modules(motion_adapter) + + # ensure that the Motion UNet is the same dtype as the UNet2DConditionModel + model.to(unet.dtype) + + return model + + def freeze_unet2d_params(self) -> None: + """Freeze the weights of just the UNet2DConditionModel, and leave the motion modules + unfrozen for fine tuning. 
+ """ + # Freeze everything + for param in self.parameters(): + param.requires_grad = False + + # Unfreeze Motion Modules + for down_block in self.down_blocks: + motion_modules = down_block.motion_modules + for param in motion_modules.parameters(): + param.requires_grad = True + + for up_block in self.up_blocks: + motion_modules = up_block.motion_modules + for param in motion_modules.parameters(): + param.requires_grad = True + + if hasattr(self.mid_block, "motion_modules"): + motion_modules = self.mid_block.motion_modules + for param in motion_modules.parameters(): + param.requires_grad = True + + def load_motion_modules(self, motion_adapter: Optional[MotionAdapter]) -> None: + for i, down_block in enumerate(motion_adapter.down_blocks): + self.down_blocks[i].motion_modules.load_state_dict(down_block.motion_modules.state_dict()) + for i, up_block in enumerate(motion_adapter.up_blocks): + self.up_blocks[i].motion_modules.load_state_dict(up_block.motion_modules.state_dict()) + + # to support older motion modules that don't have a mid_block + if hasattr(self.mid_block, "motion_modules"): + self.mid_block.motion_modules.load_state_dict(motion_adapter.mid_block.motion_modules.state_dict()) + + def save_motion_modules( + self, + save_directory: str, + is_main_process: bool = True, + safe_serialization: bool = True, + variant: Optional[str] = None, + push_to_hub: bool = False, + **kwargs, + ) -> None: + state_dict = self.state_dict() + + # Extract all motion modules + motion_state_dict = {} + for k, v in state_dict.items(): + if "motion_modules" in k: + motion_state_dict[k] = v + + adapter = MotionAdapter( + block_out_channels=self.config["block_out_channels"], + motion_layers_per_block=self.config["layers_per_block"], + motion_norm_num_groups=self.config["norm_num_groups"], + motion_num_attention_heads=self.config["motion_num_attention_heads"], + motion_max_seq_length=self.config["motion_max_seq_length"], + use_motion_mid_block=self.config["use_motion_mid_block"], + ) + adapter.load_state_dict(motion_state_dict) + adapter.save_pretrained( + save_directory=save_directory, + is_main_process=is_main_process, + safe_serialization=safe_serialization, + variant=variant, + push_to_hub=push_to_hub, + **kwargs, + ) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. 
+ + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking + def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None: + """ + Sets the attention processor to use [feed forward + chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers). + + Parameters: + chunk_size (`int`, *optional*): + The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually + over each tensor of dim=`dim`. + dim (`int`, *optional*, defaults to `0`): + The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch) + or dim=1 (sequence length). + """ + if dim not in [0, 1]: + raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}") + + # By default chunk size is 1 + chunk_size = chunk_size or 1 + + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, chunk_size, dim) + + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking + def disable_forward_chunking(self) -> None: + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, None, 0) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self) -> None: + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, CrossAttnUpBlockMotion, UpBlockMotion)): + module.gradient_checkpointing = value + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.enable_freeu + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float) -> None: + r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stage blocks where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that + are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + for i, upsample_block in enumerate(self.up_blocks): + setattr(upsample_block, "s1", s1) + setattr(upsample_block, "s2", s2) + setattr(upsample_block, "b1", b1) + setattr(upsample_block, "b2", b2) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.disable_freeu + def disable_freeu(self) -> None: + """Disables the FreeU mechanism.""" + freeu_keys = {"s1", "s2", "b1", "b2"} + for i, upsample_block in enumerate(self.up_blocks): + for k in freeu_keys: + if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: + setattr(upsample_block, k, None) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. 
+ + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet3DConditionOutput, Tuple[torch.Tensor]]: + r""" + The [`UNetMotionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of down unet blocks. + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle unet block. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain + tuple. + + Returns: + [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. 
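+ # For example, with the default four up blocks, three of them upsample, so
+ # num_upsamplers == 3 and default_overall_up_factor == 2**3 == 8; inputs whose
+ # height or width is not a multiple of 8 fall back to the explicit upsample_size
+ # forwarding handled below.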
+ default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + num_frames = sample.shape[2] + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + emb = emb.repeat_interleave(repeats=num_frames, dim=0) + encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, dim=0) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds) + image_embeds = [image_embed.repeat_interleave(repeats=num_frames, dim=0) for image_embed in image_embeds] + encoder_hidden_states = (encoder_hidden_states, image_embeds) + + # 2. pre-process + sample = sample.permute(0, 2, 1, 3, 4).reshape((sample.shape[0] * num_frames, -1) + sample.shape[3:]) + sample = self.conv_in(sample) + + # 3. 
down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames) + + down_block_res_samples += res_samples + + if down_block_additional_residuals is not None: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples += (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + # To support older versions of motion modules that don't have a mid_block + if hasattr(self.mid_block, "motion_modules"): + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + + if mid_block_additional_residual is not None: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + upsample_size=upsample_size, + attention_mask=attention_mask, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + num_frames=num_frames, + ) + + # 6. 
post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + + sample = self.conv_out(sample) + + # reshape to (batch, channel, framerate, width, height) + sample = sample[None, :].reshape((-1, num_frames) + sample.shape[1:]).permute(0, 2, 1, 3, 4) + + if not return_dict: + return (sample,) + + return UNet3DConditionOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py new file mode 100644 index 000000000..5fe265e63 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py @@ -0,0 +1,489 @@ +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import UNet2DConditionLoadersMixin +from ...utils import BaseOutput, logging +from ..attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor +from ..embeddings import TimestepEmbedding, Timesteps +from ..modeling_utils import ModelMixin +from .unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block, get_up_block + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNetSpatioTemporalConditionOutput(BaseOutput): + """ + The output of [`UNetSpatioTemporalConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor = None + + +class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + r""" + A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`): + The tuple of downsample blocks to use. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`): + The tuple of upsample blocks to use. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + addition_time_embed_dim: (`int`, defaults to 256): + Dimension to to encode the additional time ids. + projection_class_embeddings_input_dim (`int`, defaults to 768): + The dimension of the projection of encoded `added_time_ids`. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. 
+ cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`], + [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`]. + num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`): + The number of attention heads. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 8, + out_channels: int = 4, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlockSpatioTemporal", + "CrossAttnDownBlockSpatioTemporal", + "CrossAttnDownBlockSpatioTemporal", + "DownBlockSpatioTemporal", + ), + up_block_types: Tuple[str] = ( + "UpBlockSpatioTemporal", + "CrossAttnUpBlockSpatioTemporal", + "CrossAttnUpBlockSpatioTemporal", + "CrossAttnUpBlockSpatioTemporal", + ), + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + addition_time_embed_dim: int = 256, + projection_class_embeddings_input_dim: int = 768, + layers_per_block: Union[int, Tuple[int]] = 2, + cross_attention_dim: Union[int, Tuple[int]] = 1024, + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, + num_attention_heads: Union[int, Tuple[int]] = (5, 10, 20, 20), + num_frames: int = 25, + ): + super().__init__() + + self.sample_size = sample_size + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." 
+ ) + + # input + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[0], + kernel_size=3, + padding=1, + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0) + timestep_input_dim = block_out_channels[0] + + self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + + self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=1e-5, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + resnet_act_fn="silu", + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlockSpatioTemporal( + block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + transformer_layers_per_block=transformer_layers_per_block[-1], + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=1e-5, + resolution_idx=i, + cross_attention_dim=reversed_cross_attention_dim[i], + 
num_attention_heads=reversed_num_attention_heads[i], + resnet_act_fn="silu", + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5) + self.conv_act = nn.SiLU() + + self.conv_out = nn.Conv2d( + block_out_channels[0], + out_channels, + kernel_size=3, + padding=1, + ) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors( + name: str, + module: torch.nn.Module, + processors: Dict[str, AttentionProcessor], + ): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + # Copied from diffusers.models.unets.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking + def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None: + """ + Sets the attention processor to use [feed forward + chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers). + + Parameters: + chunk_size (`int`, *optional*): + The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually + over each tensor of dim=`dim`. + dim (`int`, *optional*, defaults to `0`): + The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch) + or dim=1 (sequence length). + """ + if dim not in [0, 1]: + raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}") + + # By default chunk size is 1 + chunk_size = chunk_size or 1 + + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, chunk_size, dim) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + added_time_ids: torch.Tensor, + return_dict: bool = True, + ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]: + r""" + The [`UNetSpatioTemporalConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`. + added_time_ids: (`torch.FloatTensor`): + The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal + embeddings and added to the time embeddings. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain + tuple. + Returns: + [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + batch_size, num_frames = sample.shape[:2] + timesteps = timesteps.expand(batch_size) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb) + + time_embeds = self.add_time_proj(added_time_ids.flatten()) + time_embeds = time_embeds.reshape((batch_size, -1)) + time_embeds = time_embeds.to(emb.dtype) + aug_emb = self.add_embedding(time_embeds) + emb = emb + aug_emb + + # Flatten the batch and frames dimensions + # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width] + sample = sample.flatten(0, 1) + # Repeat the embeddings num_video_frames times + # emb: [batch, channels] -> [batch * frames, channels] + emb = emb.repeat_interleave(num_frames, dim=0) + # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels] + encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0) + + # 2. pre-process + sample = self.conv_in(sample) + + image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device) + + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + ) + else: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + image_only_indicator=image_only_indicator, + ) + + down_block_res_samples += res_samples + + # 4. mid + sample = self.mid_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + ) + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + image_only_indicator=image_only_indicator, + ) + + # 6. post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + # 7. 
Reshape back to original shape + sample = sample.reshape(batch_size, num_frames, *sample.shape[1:]) + + if not return_dict: + return (sample,) + + return UNetSpatioTemporalConditionOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_stable_cascade.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_stable_cascade.py new file mode 100644 index 000000000..9f81e5024 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_stable_cascade.py @@ -0,0 +1,610 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders.unet import FromOriginalUNetMixin +from ...utils import BaseOutput +from ..attention_processor import Attention +from ..modeling_utils import ModelMixin + + +# Copied from diffusers.pipelines.wuerstchen.modeling_wuerstchen_common.WuerstchenLayerNorm with WuerstchenLayerNorm -> SDCascadeLayerNorm +class SDCascadeLayerNorm(nn.LayerNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x): + x = x.permute(0, 2, 3, 1) + x = super().forward(x) + return x.permute(0, 3, 1, 2) + + +class SDCascadeTimestepBlock(nn.Module): + def __init__(self, c, c_timestep, conds=[]): + super().__init__() + linear_cls = nn.Linear + self.mapper = linear_cls(c_timestep, c * 2) + self.conds = conds + for cname in conds: + setattr(self, f"mapper_{cname}", linear_cls(c_timestep, c * 2)) + + def forward(self, x, t): + t = t.chunk(len(self.conds) + 1, dim=1) + a, b = self.mapper(t[0])[:, :, None, None].chunk(2, dim=1) + for i, c in enumerate(self.conds): + ac, bc = getattr(self, f"mapper_{c}")(t[i + 1])[:, :, None, None].chunk(2, dim=1) + a, b = a + ac, b + bc + return x * (1 + a) + b + + +class SDCascadeResBlock(nn.Module): + def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0): + super().__init__() + self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c) + self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6) + self.channelwise = nn.Sequential( + nn.Linear(c + c_skip, c * 4), + nn.GELU(), + GlobalResponseNorm(c * 4), + nn.Dropout(dropout), + nn.Linear(c * 4, c), + ) + + def forward(self, x, x_skip=None): + x_res = x + x = self.norm(self.depthwise(x)) + if x_skip is not None: + x = torch.cat([x, x_skip], dim=1) + x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + return x + x_res + + +# from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105 +class GlobalResponseNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.gamma = 
nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim)) + + def forward(self, x): + agg_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True) + stand_div_norm = agg_norm / (agg_norm.mean(dim=-1, keepdim=True) + 1e-6) + return self.gamma * (x * stand_div_norm) + self.beta + x + + +class SDCascadeAttnBlock(nn.Module): + def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0): + super().__init__() + linear_cls = nn.Linear + + self.self_attn = self_attn + self.norm = SDCascadeLayerNorm(c, elementwise_affine=False, eps=1e-6) + self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True) + self.kv_mapper = nn.Sequential(nn.SiLU(), linear_cls(c_cond, c)) + + def forward(self, x, kv): + kv = self.kv_mapper(kv) + norm_x = self.norm(x) + if self.self_attn: + batch_size, channel, _, _ = x.shape + kv = torch.cat([norm_x.view(batch_size, channel, -1).transpose(1, 2), kv], dim=1) + x = x + self.attention(norm_x, encoder_hidden_states=kv) + return x + + +class UpDownBlock2d(nn.Module): + def __init__(self, in_channels, out_channels, mode, enabled=True): + super().__init__() + if mode not in ["up", "down"]: + raise ValueError(f"{mode} not supported") + interpolation = ( + nn.Upsample(scale_factor=2 if mode == "up" else 0.5, mode="bilinear", align_corners=True) + if enabled + else nn.Identity() + ) + mapping = nn.Conv2d(in_channels, out_channels, kernel_size=1) + self.blocks = nn.ModuleList([interpolation, mapping] if mode == "up" else [mapping, interpolation]) + + def forward(self, x): + for block in self.blocks: + x = block(x) + return x + + +@dataclass +class StableCascadeUNetOutput(BaseOutput): + sample: torch.FloatTensor = None + + +class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 16, + out_channels: int = 16, + timestep_ratio_embedding_dim: int = 64, + patch_size: int = 1, + conditioning_dim: int = 2048, + block_out_channels: Tuple[int] = (2048, 2048), + num_attention_heads: Tuple[int] = (32, 32), + down_num_layers_per_block: Tuple[int] = (8, 24), + up_num_layers_per_block: Tuple[int] = (24, 8), + down_blocks_repeat_mappers: Optional[Tuple[int]] = ( + 1, + 1, + ), + up_blocks_repeat_mappers: Optional[Tuple[int]] = (1, 1), + block_types_per_layer: Tuple[Tuple[str]] = ( + ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), + ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), + ), + clip_text_in_channels: Optional[int] = None, + clip_text_pooled_in_channels=1280, + clip_image_in_channels: Optional[int] = None, + clip_seq=4, + effnet_in_channels: Optional[int] = None, + pixel_mapper_in_channels: Optional[int] = None, + kernel_size=3, + dropout: Union[float, Tuple[float]] = (0.1, 0.1), + self_attn: Union[bool, Tuple[bool]] = True, + timestep_conditioning_type: Tuple[str] = ("sca", "crp"), + switch_level: Optional[Tuple[bool]] = None, + ): + """ + + Parameters: + in_channels (`int`, defaults to 16): + Number of channels in the input sample. + out_channels (`int`, defaults to 16): + Number of channels in the output sample. + timestep_ratio_embedding_dim (`int`, defaults to 64): + Dimension of the projected time embedding. + patch_size (`int`, defaults to 1): + Patch size to use for pixel unshuffling layer + conditioning_dim (`int`, defaults to 2048): + Dimension of the image and text conditional embedding. 
+ block_out_channels (Tuple[int], defaults to (2048, 2048)): + Tuple of output channels for each block. + num_attention_heads (Tuple[int], defaults to (32, 32)): + Number of attention heads in each attention block. Set to -1 to if block types in a layer do not have attention. + down_num_layers_per_block (Tuple[int], defaults to [8, 24]): + Number of layers in each down block. + up_num_layers_per_block (Tuple[int], defaults to [24, 8]): + Number of layers in each up block. + down_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]): + Number of 1x1 Convolutional layers to repeat in each down block. + up_blocks_repeat_mappers (Tuple[int], optional, defaults to [1, 1]): + Number of 1x1 Convolutional layers to repeat in each up block. + block_types_per_layer (Tuple[Tuple[str]], optional, + defaults to ( + ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock"), + ("SDCascadeResBlock", "SDCascadeTimestepBlock", "SDCascadeAttnBlock") + ): + Block types used in each layer of the up/down blocks. + clip_text_in_channels (`int`, *optional*, defaults to `None`): + Number of input channels for CLIP based text conditioning. + clip_text_pooled_in_channels (`int`, *optional*, defaults to 1280): + Number of input channels for pooled CLIP text embeddings. + clip_image_in_channels (`int`, *optional*): + Number of input channels for CLIP based image conditioning. + clip_seq (`int`, *optional*, defaults to 4): + effnet_in_channels (`int`, *optional*, defaults to `None`): + Number of input channels for effnet conditioning. + pixel_mapper_in_channels (`int`, defaults to `None`): + Number of input channels for pixel mapper conditioning. + kernel_size (`int`, *optional*, defaults to 3): + Kernel size to use in the block convolutional layers. + dropout (Tuple[float], *optional*, defaults to (0.1, 0.1)): + Dropout to use per block. + self_attn (Union[bool, Tuple[bool]]): + Tuple of booleans that determine whether to use self attention in a block or not. + timestep_conditioning_type (Tuple[str], defaults to ("sca", "crp")): + Timestep conditioning type. 
+ switch_level (Optional[Tuple[bool]], *optional*, defaults to `None`): + Tuple that indicates whether upsampling or downsampling should be applied in a block + """ + + super().__init__() + + if len(block_out_channels) != len(down_num_layers_per_block): + raise ValueError( + f"Number of elements in `down_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}" + ) + + elif len(block_out_channels) != len(up_num_layers_per_block): + raise ValueError( + f"Number of elements in `up_num_layers_per_block` must match the length of `block_out_channels`: {len(block_out_channels)}" + ) + + elif len(block_out_channels) != len(down_blocks_repeat_mappers): + raise ValueError( + f"Number of elements in `down_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}" + ) + + elif len(block_out_channels) != len(up_blocks_repeat_mappers): + raise ValueError( + f"Number of elements in `up_blocks_repeat_mappers` must match the length of `block_out_channels`: {len(block_out_channels)}" + ) + + elif len(block_out_channels) != len(block_types_per_layer): + raise ValueError( + f"Number of elements in `block_types_per_layer` must match the length of `block_out_channels`: {len(block_out_channels)}" + ) + + if isinstance(dropout, float): + dropout = (dropout,) * len(block_out_channels) + if isinstance(self_attn, bool): + self_attn = (self_attn,) * len(block_out_channels) + + # CONDITIONING + if effnet_in_channels is not None: + self.effnet_mapper = nn.Sequential( + nn.Conv2d(effnet_in_channels, block_out_channels[0] * 4, kernel_size=1), + nn.GELU(), + nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1), + SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6), + ) + if pixel_mapper_in_channels is not None: + self.pixels_mapper = nn.Sequential( + nn.Conv2d(pixel_mapper_in_channels, block_out_channels[0] * 4, kernel_size=1), + nn.GELU(), + nn.Conv2d(block_out_channels[0] * 4, block_out_channels[0], kernel_size=1), + SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6), + ) + + self.clip_txt_pooled_mapper = nn.Linear(clip_text_pooled_in_channels, conditioning_dim * clip_seq) + if clip_text_in_channels is not None: + self.clip_txt_mapper = nn.Linear(clip_text_in_channels, conditioning_dim) + if clip_image_in_channels is not None: + self.clip_img_mapper = nn.Linear(clip_image_in_channels, conditioning_dim * clip_seq) + self.clip_norm = nn.LayerNorm(conditioning_dim, elementwise_affine=False, eps=1e-6) + + self.embedding = nn.Sequential( + nn.PixelUnshuffle(patch_size), + nn.Conv2d(in_channels * (patch_size**2), block_out_channels[0], kernel_size=1), + SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6), + ) + + def get_block(block_type, in_channels, nhead, c_skip=0, dropout=0, self_attn=True): + if block_type == "SDCascadeResBlock": + return SDCascadeResBlock(in_channels, c_skip, kernel_size=kernel_size, dropout=dropout) + elif block_type == "SDCascadeAttnBlock": + return SDCascadeAttnBlock(in_channels, conditioning_dim, nhead, self_attn=self_attn, dropout=dropout) + elif block_type == "SDCascadeTimestepBlock": + return SDCascadeTimestepBlock( + in_channels, timestep_ratio_embedding_dim, conds=timestep_conditioning_type + ) + else: + raise ValueError(f"Block type {block_type} not supported") + + # BLOCKS + # -- down blocks + self.down_blocks = nn.ModuleList() + self.down_downscalers = nn.ModuleList() + self.down_repeat_mappers = nn.ModuleList() + 
for i in range(len(block_out_channels)): + if i > 0: + self.down_downscalers.append( + nn.Sequential( + SDCascadeLayerNorm(block_out_channels[i - 1], elementwise_affine=False, eps=1e-6), + UpDownBlock2d( + block_out_channels[i - 1], block_out_channels[i], mode="down", enabled=switch_level[i - 1] + ) + if switch_level is not None + else nn.Conv2d(block_out_channels[i - 1], block_out_channels[i], kernel_size=2, stride=2), + ) + ) + else: + self.down_downscalers.append(nn.Identity()) + + down_block = nn.ModuleList() + for _ in range(down_num_layers_per_block[i]): + for block_type in block_types_per_layer[i]: + block = get_block( + block_type, + block_out_channels[i], + num_attention_heads[i], + dropout=dropout[i], + self_attn=self_attn[i], + ) + down_block.append(block) + self.down_blocks.append(down_block) + + if down_blocks_repeat_mappers is not None: + block_repeat_mappers = nn.ModuleList() + for _ in range(down_blocks_repeat_mappers[i] - 1): + block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1)) + self.down_repeat_mappers.append(block_repeat_mappers) + + # -- up blocks + self.up_blocks = nn.ModuleList() + self.up_upscalers = nn.ModuleList() + self.up_repeat_mappers = nn.ModuleList() + for i in reversed(range(len(block_out_channels))): + if i > 0: + self.up_upscalers.append( + nn.Sequential( + SDCascadeLayerNorm(block_out_channels[i], elementwise_affine=False, eps=1e-6), + UpDownBlock2d( + block_out_channels[i], block_out_channels[i - 1], mode="up", enabled=switch_level[i - 1] + ) + if switch_level is not None + else nn.ConvTranspose2d( + block_out_channels[i], block_out_channels[i - 1], kernel_size=2, stride=2 + ), + ) + ) + else: + self.up_upscalers.append(nn.Identity()) + + up_block = nn.ModuleList() + for j in range(up_num_layers_per_block[::-1][i]): + for k, block_type in enumerate(block_types_per_layer[i]): + c_skip = block_out_channels[i] if i < len(block_out_channels) - 1 and j == k == 0 else 0 + block = get_block( + block_type, + block_out_channels[i], + num_attention_heads[i], + c_skip=c_skip, + dropout=dropout[i], + self_attn=self_attn[i], + ) + up_block.append(block) + self.up_blocks.append(up_block) + + if up_blocks_repeat_mappers is not None: + block_repeat_mappers = nn.ModuleList() + for _ in range(up_blocks_repeat_mappers[::-1][i] - 1): + block_repeat_mappers.append(nn.Conv2d(block_out_channels[i], block_out_channels[i], kernel_size=1)) + self.up_repeat_mappers.append(block_repeat_mappers) + + # OUTPUT + self.clf = nn.Sequential( + SDCascadeLayerNorm(block_out_channels[0], elementwise_affine=False, eps=1e-6), + nn.Conv2d(block_out_channels[0], out_channels * (patch_size**2), kernel_size=1), + nn.PixelShuffle(patch_size), + ) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, value=False): + self.gradient_checkpointing = value + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2d, nn.Linear)): + torch.nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + nn.init.normal_(self.clip_txt_pooled_mapper.weight, std=0.02) + nn.init.normal_(self.clip_txt_mapper.weight, std=0.02) if hasattr(self, "clip_txt_mapper") else None + nn.init.normal_(self.clip_img_mapper.weight, std=0.02) if hasattr(self, "clip_img_mapper") else None + + if hasattr(self, "effnet_mapper"): + nn.init.normal_(self.effnet_mapper[0].weight, std=0.02) # conditionings + nn.init.normal_(self.effnet_mapper[2].weight, std=0.02) # conditionings + + if hasattr(self, "pixels_mapper"): + 
nn.init.normal_(self.pixels_mapper[0].weight, std=0.02) # conditionings + nn.init.normal_(self.pixels_mapper[2].weight, std=0.02) # conditionings + + torch.nn.init.xavier_uniform_(self.embedding[1].weight, 0.02) # inputs + nn.init.constant_(self.clf[1].weight, 0) # outputs + + # blocks + for level_block in self.down_blocks + self.up_blocks: + for block in level_block: + if isinstance(block, SDCascadeResBlock): + block.channelwise[-1].weight.data *= np.sqrt(1 / sum(self.config.blocks[0])) + elif isinstance(block, SDCascadeTimestepBlock): + nn.init.constant_(block.mapper.weight, 0) + + def get_timestep_ratio_embedding(self, timestep_ratio, max_positions=10000): + r = timestep_ratio * max_positions + half_dim = self.config.timestep_ratio_embedding_dim // 2 + + emb = math.log(max_positions) / (half_dim - 1) + emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp() + emb = r[:, None] * emb[None, :] + emb = torch.cat([emb.sin(), emb.cos()], dim=1) + + if self.config.timestep_ratio_embedding_dim % 2 == 1: # zero pad + emb = nn.functional.pad(emb, (0, 1), mode="constant") + + return emb.to(dtype=r.dtype) + + def get_clip_embeddings(self, clip_txt_pooled, clip_txt=None, clip_img=None): + if len(clip_txt_pooled.shape) == 2: + clip_txt_pool = clip_txt_pooled.unsqueeze(1) + clip_txt_pool = self.clip_txt_pooled_mapper(clip_txt_pooled).view( + clip_txt_pooled.size(0), clip_txt_pooled.size(1) * self.config.clip_seq, -1 + ) + if clip_txt is not None and clip_img is not None: + clip_txt = self.clip_txt_mapper(clip_txt) + if len(clip_img.shape) == 2: + clip_img = clip_img.unsqueeze(1) + clip_img = self.clip_img_mapper(clip_img).view( + clip_img.size(0), clip_img.size(1) * self.config.clip_seq, -1 + ) + clip = torch.cat([clip_txt, clip_txt_pool, clip_img], dim=1) + else: + clip = clip_txt_pool + return self.clip_norm(clip) + + def _down_encode(self, x, r_embed, clip): + level_outputs = [] + block_group = zip(self.down_blocks, self.down_downscalers, self.down_repeat_mappers) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + for down_block, downscaler, repmap in block_group: + x = downscaler(x) + for i in range(len(repmap) + 1): + for block in down_block: + if isinstance(block, SDCascadeResBlock): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False) + elif isinstance(block, SDCascadeAttnBlock): + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, clip, use_reentrant=False + ) + elif isinstance(block, SDCascadeTimestepBlock): + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, r_embed, use_reentrant=False + ) + else: + x = x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), use_reentrant=False + ) + if i < len(repmap): + x = repmap[i](x) + level_outputs.insert(0, x) + else: + for down_block, downscaler, repmap in block_group: + x = downscaler(x) + for i in range(len(repmap) + 1): + for block in down_block: + if isinstance(block, SDCascadeResBlock): + x = block(x) + elif isinstance(block, SDCascadeAttnBlock): + x = block(x, clip) + elif isinstance(block, SDCascadeTimestepBlock): + x = block(x, r_embed) + else: + x = block(x) + if i < len(repmap): + x = repmap[i](x) + level_outputs.insert(0, x) + return level_outputs + + def _up_decode(self, level_outputs, r_embed, clip): + x = level_outputs[0] + block_group = zip(self.up_blocks, self.up_upscalers, 
self.up_repeat_mappers) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + for i, (up_block, upscaler, repmap) in enumerate(block_group): + for j in range(len(repmap) + 1): + for k, block in enumerate(up_block): + if isinstance(block, SDCascadeResBlock): + skip = level_outputs[i] if k == 0 and i > 0 else None + if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)): + x = torch.nn.functional.interpolate( + x.float(), skip.shape[-2:], mode="bilinear", align_corners=True + ) + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, skip, use_reentrant=False + ) + elif isinstance(block, SDCascadeAttnBlock): + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, clip, use_reentrant=False + ) + elif isinstance(block, SDCascadeTimestepBlock): + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, r_embed, use_reentrant=False + ) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False) + if j < len(repmap): + x = repmap[j](x) + x = upscaler(x) + else: + for i, (up_block, upscaler, repmap) in enumerate(block_group): + for j in range(len(repmap) + 1): + for k, block in enumerate(up_block): + if isinstance(block, SDCascadeResBlock): + skip = level_outputs[i] if k == 0 and i > 0 else None + if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)): + x = torch.nn.functional.interpolate( + x.float(), skip.shape[-2:], mode="bilinear", align_corners=True + ) + x = block(x, skip) + elif isinstance(block, SDCascadeAttnBlock): + x = block(x, clip) + elif isinstance(block, SDCascadeTimestepBlock): + x = block(x, r_embed) + else: + x = block(x) + if j < len(repmap): + x = repmap[j](x) + x = upscaler(x) + return x + + def forward( + self, + sample, + timestep_ratio, + clip_text_pooled, + clip_text=None, + clip_img=None, + effnet=None, + pixels=None, + sca=None, + crp=None, + return_dict=True, + ): + if pixels is None: + pixels = sample.new_zeros(sample.size(0), 3, 8, 8) + + # Process the conditioning embeddings + timestep_ratio_embed = self.get_timestep_ratio_embedding(timestep_ratio) + for c in self.config.timestep_conditioning_type: + if c == "sca": + cond = sca + elif c == "crp": + cond = crp + else: + cond = None + t_cond = cond or torch.zeros_like(timestep_ratio) + timestep_ratio_embed = torch.cat([timestep_ratio_embed, self.get_timestep_ratio_embedding(t_cond)], dim=1) + clip = self.get_clip_embeddings(clip_txt_pooled=clip_text_pooled, clip_txt=clip_text, clip_img=clip_img) + + # Model Blocks + x = self.embedding(sample) + if hasattr(self, "effnet_mapper") and effnet is not None: + x = x + self.effnet_mapper( + nn.functional.interpolate(effnet, size=x.shape[-2:], mode="bilinear", align_corners=True) + ) + if hasattr(self, "pixels_mapper"): + x = x + nn.functional.interpolate( + self.pixels_mapper(pixels), size=x.shape[-2:], mode="bilinear", align_corners=True + ) + level_outputs = self._down_encode(x, timestep_ratio_embed, clip) + x = self._up_decode(level_outputs, timestep_ratio_embed, clip) + sample = self.clf(x) + + if not return_dict: + return (sample,) + return StableCascadeUNetOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/uvit_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/uvit_2d.py 
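For reference, a standalone sketch of the sinusoidal embedding computed by StableCascadeUNet.get_timestep_ratio_embedding in the file added above, with the embedding dimension passed explicitly instead of being read from the model config:

import math
import torch

def timestep_ratio_embedding(timestep_ratio: torch.Tensor, dim: int = 64, max_positions: int = 10000) -> torch.Tensor:
    r = timestep_ratio * max_positions                    # scale the [0, 1] ratio
    half_dim = dim // 2
    step = math.log(max_positions) / (half_dim - 1)
    freqs = torch.arange(half_dim, device=r.device).float().mul(-step).exp()
    emb = r[:, None] * freqs[None, :]                     # (batch, half_dim)
    emb = torch.cat([emb.sin(), emb.cos()], dim=1)        # (batch, dim)
    if dim % 2 == 1:                                      # zero-pad odd dimensions
        emb = torch.nn.functional.pad(emb, (0, 1), mode="constant")
    return emb.to(dtype=timestep_ratio.dtype)

# timestep_ratio_embedding(torch.tensor([0.25, 0.75])).shape -> torch.Size([2, 64])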
new file mode 100644 index 000000000..bfd865d12 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/uvit_2d.py @@ -0,0 +1,470 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, Union + +import torch +import torch.nn.functional as F +from torch import nn +from torch.utils.checkpoint import checkpoint + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin +from ..attention import BasicTransformerBlock, SkipFFTransformerBlock +from ..attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ..embeddings import TimestepEmbedding, get_timestep_embedding +from ..modeling_utils import ModelMixin +from ..normalization import GlobalResponseNorm, RMSNorm +from ..resnet import Downsample2D, Upsample2D + + +class UVit2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + # global config + hidden_size: int = 1024, + use_bias: bool = False, + hidden_dropout: float = 0.0, + # conditioning dimensions + cond_embed_dim: int = 768, + micro_cond_encode_dim: int = 256, + micro_cond_embed_dim: int = 1280, + encoder_hidden_size: int = 768, + # num tokens + vocab_size: int = 8256, # codebook_size + 1 (for the mask token) rounded + codebook_size: int = 8192, + # `UVit2DConvEmbed` + in_channels: int = 768, + block_out_channels: int = 768, + num_res_blocks: int = 3, + downsample: bool = False, + upsample: bool = False, + block_num_heads: int = 12, + # `TransformerLayer` + num_hidden_layers: int = 22, + num_attention_heads: int = 16, + # `Attention` + attention_dropout: float = 0.0, + # `FeedForward` + intermediate_size: int = 2816, + # `Norm` + layer_norm_eps: float = 1e-6, + ln_elementwise_affine: bool = True, + sample_size: int = 64, + ): + super().__init__() + + self.encoder_proj = nn.Linear(encoder_hidden_size, hidden_size, bias=use_bias) + self.encoder_proj_layer_norm = RMSNorm(hidden_size, layer_norm_eps, ln_elementwise_affine) + + self.embed = UVit2DConvEmbed( + in_channels, block_out_channels, vocab_size, ln_elementwise_affine, layer_norm_eps, use_bias + ) + + self.cond_embed = TimestepEmbedding( + micro_cond_embed_dim + cond_embed_dim, hidden_size, sample_proj_bias=use_bias + ) + + self.down_block = UVitBlock( + block_out_channels, + num_res_blocks, + hidden_size, + hidden_dropout, + ln_elementwise_affine, + layer_norm_eps, + use_bias, + block_num_heads, + attention_dropout, + downsample, + False, + ) + + self.project_to_hidden_norm = RMSNorm(block_out_channels, layer_norm_eps, ln_elementwise_affine) + self.project_to_hidden = nn.Linear(block_out_channels, hidden_size, bias=use_bias) + + self.transformer_layers = nn.ModuleList( + [ + BasicTransformerBlock( + dim=hidden_size, + num_attention_heads=num_attention_heads, + 
attention_head_dim=hidden_size // num_attention_heads, + dropout=hidden_dropout, + cross_attention_dim=hidden_size, + attention_bias=use_bias, + norm_type="ada_norm_continuous", + ada_norm_continous_conditioning_embedding_dim=hidden_size, + norm_elementwise_affine=ln_elementwise_affine, + norm_eps=layer_norm_eps, + ada_norm_bias=use_bias, + ff_inner_dim=intermediate_size, + ff_bias=use_bias, + attention_out_bias=use_bias, + ) + for _ in range(num_hidden_layers) + ] + ) + + self.project_from_hidden_norm = RMSNorm(hidden_size, layer_norm_eps, ln_elementwise_affine) + self.project_from_hidden = nn.Linear(hidden_size, block_out_channels, bias=use_bias) + + self.up_block = UVitBlock( + block_out_channels, + num_res_blocks, + hidden_size, + hidden_dropout, + ln_elementwise_affine, + layer_norm_eps, + use_bias, + block_num_heads, + attention_dropout, + downsample=False, + upsample=upsample, + ) + + self.mlm_layer = ConvMlmLayer( + block_out_channels, in_channels, use_bias, ln_elementwise_affine, layer_norm_eps, codebook_size + ) + + self.gradient_checkpointing = False + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + pass + + def forward(self, input_ids, encoder_hidden_states, pooled_text_emb, micro_conds, cross_attention_kwargs=None): + encoder_hidden_states = self.encoder_proj(encoder_hidden_states) + encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states) + + micro_cond_embeds = get_timestep_embedding( + micro_conds.flatten(), self.config.micro_cond_encode_dim, flip_sin_to_cos=True, downscale_freq_shift=0 + ) + + micro_cond_embeds = micro_cond_embeds.reshape((input_ids.shape[0], -1)) + + pooled_text_emb = torch.cat([pooled_text_emb, micro_cond_embeds], dim=1) + pooled_text_emb = pooled_text_emb.to(dtype=self.dtype) + pooled_text_emb = self.cond_embed(pooled_text_emb).to(encoder_hidden_states.dtype) + + hidden_states = self.embed(input_ids) + + hidden_states = self.down_block( + hidden_states, + pooled_text_emb=pooled_text_emb, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + ) + + batch_size, channels, height, width = hidden_states.shape + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels) + + hidden_states = self.project_to_hidden_norm(hidden_states) + hidden_states = self.project_to_hidden(hidden_states) + + for layer in self.transformer_layers: + if self.training and self.gradient_checkpointing: + + def layer_(*args): + return checkpoint(layer, *args) + + else: + layer_ = layer + + hidden_states = layer_( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs={"pooled_text_emb": pooled_text_emb}, + ) + + hidden_states = self.project_from_hidden_norm(hidden_states) + hidden_states = self.project_from_hidden(hidden_states) + + hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2) + + hidden_states = self.up_block( + hidden_states, + pooled_text_emb=pooled_text_emb, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + ) + + logits = self.mlm_layer(hidden_states) + + return logits + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its 
weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
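A hypothetical shape sketch of the UVit2DModel.forward method defined earlier in this class, using the default config; the shapes are inferred from the code (micro_conds must flatten to micro_cond_embed_dim / micro_cond_encode_dim = 5 values per sample) and the tensor names are illustrative only:

import torch

uvit = UVit2DModel()  # default config
input_ids = torch.randint(0, uvit.config.codebook_size, (1, 16, 16))         # latent token grid
encoder_hidden_states = torch.randn(1, 77, uvit.config.encoder_hidden_size)  # text encoder states
pooled_text_emb = torch.randn(1, uvit.config.cond_embed_dim)
micro_conds = torch.randn(1, 5)   # 5 * micro_cond_encode_dim == micro_cond_embed_dim

logits = uvit(input_ids, encoder_hidden_states, pooled_text_emb, micro_conds)
# logits has shape (1, codebook_size, 16, 16): per-token logits over the VQ codebook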
+ """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + +class UVit2DConvEmbed(nn.Module): + def __init__(self, in_channels, block_out_channels, vocab_size, elementwise_affine, eps, bias): + super().__init__() + self.embeddings = nn.Embedding(vocab_size, in_channels) + self.layer_norm = RMSNorm(in_channels, eps, elementwise_affine) + self.conv = nn.Conv2d(in_channels, block_out_channels, kernel_size=1, bias=bias) + + def forward(self, input_ids): + embeddings = self.embeddings(input_ids) + embeddings = self.layer_norm(embeddings) + embeddings = embeddings.permute(0, 3, 1, 2) + embeddings = self.conv(embeddings) + return embeddings + + +class UVitBlock(nn.Module): + def __init__( + self, + channels, + num_res_blocks: int, + hidden_size, + hidden_dropout, + ln_elementwise_affine, + layer_norm_eps, + use_bias, + block_num_heads, + attention_dropout, + downsample: bool, + upsample: bool, + ): + super().__init__() + + if downsample: + self.downsample = Downsample2D( + channels, + use_conv=True, + padding=0, + name="Conv2d_0", + kernel_size=2, + norm_type="rms_norm", + eps=layer_norm_eps, + elementwise_affine=ln_elementwise_affine, + bias=use_bias, + ) + else: + self.downsample = None + + self.res_blocks = nn.ModuleList( + [ + ConvNextBlock( + channels, + layer_norm_eps, + ln_elementwise_affine, + use_bias, + hidden_dropout, + hidden_size, + ) + for i in range(num_res_blocks) + ] + ) + + self.attention_blocks = nn.ModuleList( + [ + SkipFFTransformerBlock( + channels, + block_num_heads, + channels // block_num_heads, + hidden_size, + use_bias, + attention_dropout, + channels, + attention_bias=use_bias, + attention_out_bias=use_bias, + ) + for _ in range(num_res_blocks) + ] + ) + + if upsample: + self.upsample = Upsample2D( + channels, + use_conv_transpose=True, + kernel_size=2, + padding=0, + name="conv", + norm_type="rms_norm", + eps=layer_norm_eps, + elementwise_affine=ln_elementwise_affine, + bias=use_bias, + interpolate=False, + ) + else: + self.upsample = None + + def forward(self, x, pooled_text_emb, encoder_hidden_states, cross_attention_kwargs): + if self.downsample is not None: + x = self.downsample(x) + + for res_block, attention_block in zip(self.res_blocks, self.attention_blocks): + x = res_block(x, pooled_text_emb) + + batch_size, channels, height, width = x.shape + x = x.view(batch_size, channels, height * width).permute(0, 2, 1) + x = attention_block( + x, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs + ) + x = x.permute(0, 2, 1).view(batch_size, channels, height, width) + + if self.upsample is not None: + x = self.upsample(x) + + return x + + +class ConvNextBlock(nn.Module): + def __init__( + self, channels, layer_norm_eps, ln_elementwise_affine, use_bias, hidden_dropout, hidden_size, res_ffn_factor=4 + ): + super().__init__() + self.depthwise = nn.Conv2d( + channels, + channels, + kernel_size=3, + padding=1, + groups=channels, + bias=use_bias, + ) + self.norm = RMSNorm(channels, layer_norm_eps, ln_elementwise_affine) + self.channelwise_linear_1 = nn.Linear(channels, int(channels * res_ffn_factor), bias=use_bias) + 
self.channelwise_act = nn.GELU() + self.channelwise_norm = GlobalResponseNorm(int(channels * res_ffn_factor)) + self.channelwise_linear_2 = nn.Linear(int(channels * res_ffn_factor), channels, bias=use_bias) + self.channelwise_dropout = nn.Dropout(hidden_dropout) + self.cond_embeds_mapper = nn.Linear(hidden_size, channels * 2, use_bias) + + def forward(self, x, cond_embeds): + x_res = x + + x = self.depthwise(x) + + x = x.permute(0, 2, 3, 1) + x = self.norm(x) + + x = self.channelwise_linear_1(x) + x = self.channelwise_act(x) + x = self.channelwise_norm(x) + x = self.channelwise_linear_2(x) + x = self.channelwise_dropout(x) + + x = x.permute(0, 3, 1, 2) + + x = x + x_res + + scale, shift = self.cond_embeds_mapper(F.silu(cond_embeds)).chunk(2, dim=1) + x = x * (1 + scale[:, :, None, None]) + shift[:, :, None, None] + + return x + + +class ConvMlmLayer(nn.Module): + def __init__( + self, + block_out_channels: int, + in_channels: int, + use_bias: bool, + ln_elementwise_affine: bool, + layer_norm_eps: float, + codebook_size: int, + ): + super().__init__() + self.conv1 = nn.Conv2d(block_out_channels, in_channels, kernel_size=1, bias=use_bias) + self.layer_norm = RMSNorm(in_channels, layer_norm_eps, ln_elementwise_affine) + self.conv2 = nn.Conv2d(in_channels, codebook_size, kernel_size=1, bias=use_bias) + + def forward(self, hidden_states): + hidden_states = self.conv1(hidden_states) + hidden_states = self.layer_norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + logits = self.conv2(hidden_states) + return logits diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/upsampling.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/upsampling.py new file mode 100644 index 000000000..4ecf6ebc2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/upsampling.py @@ -0,0 +1,448 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import deprecate +from .normalization import RMSNorm + + +class Upsample1D(nn.Module): + """A 1D upsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + use_conv_transpose (`bool`, default `False`): + option to use a convolution transpose. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + name (`str`, default `conv`): + name of the upsampling 1D layer. 
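A minimal shape check for the Upsample1D layer documented here (illustrative sketch):

import torch

up = Upsample1D(channels=8, use_conv=True)   # nearest-neighbor 2x, then a 3-tap conv
x = torch.randn(4, 8, 32)                    # (batch, channels, length)
y = up(x)
# y.shape == (4, 8, 64): length doubled, channel count unchanged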
+ """ + + def __init__( + self, + channels: int, + use_conv: bool = False, + use_conv_transpose: bool = False, + out_channels: Optional[int] = None, + name: str = "conv", + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_conv_transpose = use_conv_transpose + self.name = name + + self.conv = None + if use_conv_transpose: + self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1) + elif use_conv: + self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + assert inputs.shape[1] == self.channels + if self.use_conv_transpose: + return self.conv(inputs) + + outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest") + + if self.use_conv: + outputs = self.conv(outputs) + + return outputs + + +class Upsample2D(nn.Module): + """A 2D upsampling layer with an optional convolution. + + Parameters: + channels (`int`): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + use_conv_transpose (`bool`, default `False`): + option to use a convolution transpose. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + name (`str`, default `conv`): + name of the upsampling 2D layer. + """ + + def __init__( + self, + channels: int, + use_conv: bool = False, + use_conv_transpose: bool = False, + out_channels: Optional[int] = None, + name: str = "conv", + kernel_size: Optional[int] = None, + padding=1, + norm_type=None, + eps=None, + elementwise_affine=None, + bias=True, + interpolate=True, + ): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_conv_transpose = use_conv_transpose + self.name = name + self.interpolate = interpolate + conv_cls = nn.Conv2d + + if norm_type == "ln_norm": + self.norm = nn.LayerNorm(channels, eps, elementwise_affine) + elif norm_type == "rms_norm": + self.norm = RMSNorm(channels, eps, elementwise_affine) + elif norm_type is None: + self.norm = None + else: + raise ValueError(f"unknown norm_type: {norm_type}") + + conv = None + if use_conv_transpose: + if kernel_size is None: + kernel_size = 4 + conv = nn.ConvTranspose2d( + channels, self.out_channels, kernel_size=kernel_size, stride=2, padding=padding, bias=bias + ) + elif use_conv: + if kernel_size is None: + kernel_size = 3 + conv = conv_cls(self.channels, self.out_channels, kernel_size=kernel_size, padding=padding, bias=bias) + + # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed + if name == "conv": + self.conv = conv + else: + self.Conv2d_0 = conv + + def forward( + self, hidden_states: torch.FloatTensor, output_size: Optional[int] = None, *args, **kwargs + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
+ deprecate("scale", "1.0.0", deprecation_message) + + assert hidden_states.shape[1] == self.channels + + if self.norm is not None: + hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + + if self.use_conv_transpose: + return self.conv(hidden_states) + + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch + # https://github.com/pytorch/pytorch/issues/86679 + dtype = hidden_states.dtype + if dtype == torch.bfloat16: + hidden_states = hidden_states.to(torch.float32) + + # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984 + if hidden_states.shape[0] >= 64: + hidden_states = hidden_states.contiguous() + + # if `output_size` is passed we force the interpolation output + # size and do not make use of `scale_factor=2` + if self.interpolate: + if output_size is None: + hidden_states = F.interpolate(hidden_states, scale_factor=2.0, mode="nearest") + else: + hidden_states = F.interpolate(hidden_states, size=output_size, mode="nearest") + + # If the input is bfloat16, we cast back to bfloat16 + if dtype == torch.bfloat16: + hidden_states = hidden_states.to(dtype) + + # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed + if self.use_conv: + if self.name == "conv": + hidden_states = self.conv(hidden_states) + else: + hidden_states = self.Conv2d_0(hidden_states) + + return hidden_states + + +class FirUpsample2D(nn.Module): + """A 2D FIR upsampling layer with an optional convolution. + + Parameters: + channels (`int`, optional): + number of channels in the inputs and outputs. + use_conv (`bool`, default `False`): + option to use a convolution. + out_channels (`int`, optional): + number of output channels. Defaults to `channels`. + fir_kernel (`tuple`, default `(1, 3, 3, 1)`): + kernel for the FIR filter. + """ + + def __init__( + self, + channels: Optional[int] = None, + out_channels: Optional[int] = None, + use_conv: bool = False, + fir_kernel: Tuple[int, int, int, int] = (1, 3, 3, 1), + ): + super().__init__() + out_channels = out_channels if out_channels else channels + if use_conv: + self.Conv2d_0 = nn.Conv2d(channels, out_channels, kernel_size=3, stride=1, padding=1) + self.use_conv = use_conv + self.fir_kernel = fir_kernel + self.out_channels = out_channels + + def _upsample_2d( + self, + hidden_states: torch.FloatTensor, + weight: Optional[torch.FloatTensor] = None, + kernel: Optional[torch.FloatTensor] = None, + factor: int = 2, + gain: float = 1, + ) -> torch.FloatTensor: + """Fused `upsample_2d()` followed by `Conv2d()`. + + Padding is performed only once at the beginning, not between the operations. The fused op is considerably more + efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of + arbitrary order. + + Args: + hidden_states (`torch.FloatTensor`): + Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + weight (`torch.FloatTensor`, *optional*): + Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`. Grouped convolution can be + performed by `inChannels = x.shape[0] // numGroups`. + kernel (`torch.FloatTensor`, *optional*): + FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which + corresponds to nearest-neighbor upsampling. + factor (`int`, *optional*): Integer upsampling factor (default: 2). 
+ gain (`float`, *optional*): Scaling factor for signal magnitude (default: 1.0). + + Returns: + output (`torch.FloatTensor`): + Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same + datatype as `hidden_states`. + """ + + assert isinstance(factor, int) and factor >= 1 + + # Setup filter kernel. + if kernel is None: + kernel = [1] * factor + + # setup kernel + kernel = torch.tensor(kernel, dtype=torch.float32) + if kernel.ndim == 1: + kernel = torch.outer(kernel, kernel) + kernel /= torch.sum(kernel) + + kernel = kernel * (gain * (factor**2)) + + if self.use_conv: + convH = weight.shape[2] + convW = weight.shape[3] + inC = weight.shape[1] + + pad_value = (kernel.shape[0] - factor) - (convW - 1) + + stride = (factor, factor) + # Determine data dimensions. + output_shape = ( + (hidden_states.shape[2] - 1) * factor + convH, + (hidden_states.shape[3] - 1) * factor + convW, + ) + output_padding = ( + output_shape[0] - (hidden_states.shape[2] - 1) * stride[0] - convH, + output_shape[1] - (hidden_states.shape[3] - 1) * stride[1] - convW, + ) + assert output_padding[0] >= 0 and output_padding[1] >= 0 + num_groups = hidden_states.shape[1] // inC + + # Transpose weights. + weight = torch.reshape(weight, (num_groups, -1, inC, convH, convW)) + weight = torch.flip(weight, dims=[3, 4]).permute(0, 2, 1, 3, 4) + weight = torch.reshape(weight, (num_groups * inC, -1, convH, convW)) + + inverse_conv = F.conv_transpose2d( + hidden_states, + weight, + stride=stride, + output_padding=output_padding, + padding=0, + ) + + output = upfirdn2d_native( + inverse_conv, + torch.tensor(kernel, device=inverse_conv.device), + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2 + 1), + ) + else: + pad_value = kernel.shape[0] - factor + output = upfirdn2d_native( + hidden_states, + torch.tensor(kernel, device=hidden_states.device), + up=factor, + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), + ) + + return output + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + if self.use_conv: + height = self._upsample_2d(hidden_states, self.Conv2d_0.weight, kernel=self.fir_kernel) + height = height + self.Conv2d_0.bias.reshape(1, -1, 1, 1) + else: + height = self._upsample_2d(hidden_states, kernel=self.fir_kernel, factor=2) + + return height + + +class KUpsample2D(nn.Module): + r"""A 2D K-upsampling layer. + + Parameters: + pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use. 
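A minimal shape check for the KUpsample2D layer defined just below; its fixed separable [1/8, 3/8, 3/8, 1/8] kernel performs a smooth 2x upsampling via a transposed convolution (illustrative sketch):

import torch

up = KUpsample2D()
x = torch.randn(2, 4, 16, 16)   # (batch, channels, height, width)
y = up(x)
# y.shape == (2, 4, 32, 32): height and width are doubled, channels untouched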
+ """ + + def __init__(self, pad_mode: str = "reflect"): + super().__init__() + self.pad_mode = pad_mode + kernel_1d = torch.tensor([[1 / 8, 3 / 8, 3 / 8, 1 / 8]]) * 2 + self.pad = kernel_1d.shape[1] // 2 - 1 + self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + inputs = F.pad(inputs, ((self.pad + 1) // 2,) * 4, self.pad_mode) + weight = inputs.new_zeros( + [ + inputs.shape[1], + inputs.shape[1], + self.kernel.shape[0], + self.kernel.shape[1], + ] + ) + indices = torch.arange(inputs.shape[1], device=inputs.device) + kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1) + weight[indices, indices] = kernel + return F.conv_transpose2d(inputs, weight, stride=2, padding=self.pad * 2 + 1) + + +def upfirdn2d_native( + tensor: torch.Tensor, + kernel: torch.Tensor, + up: int = 1, + down: int = 1, + pad: Tuple[int, int] = (0, 0), +) -> torch.Tensor: + up_x = up_y = up + down_x = down_y = down + pad_x0 = pad_y0 = pad[0] + pad_x1 = pad_y1 = pad[1] + + _, channel, in_h, in_w = tensor.shape + tensor = tensor.reshape(-1, in_h, in_w, 1) + + _, in_h, in_w, minor = tensor.shape + kernel_h, kernel_w = kernel.shape + + out = tensor.view(-1, in_h, 1, in_w, 1, minor) + out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) + out = out.view(-1, in_h * up_y, in_w * up_x, minor) + + out = F.pad(out, [0, 0, max(pad_x0, 0), max(pad_x1, 0), max(pad_y0, 0), max(pad_y1, 0)]) + out = out.to(tensor.device) # Move back to mps if necessary + out = out[ + :, + max(-pad_y0, 0) : out.shape[1] - max(-pad_y1, 0), + max(-pad_x0, 0) : out.shape[2] - max(-pad_x1, 0), + :, + ] + + out = out.permute(0, 3, 1, 2) + out = out.reshape([-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape( + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + ) + out = out.permute(0, 2, 3, 1) + out = out[:, ::down_y, ::down_x, :] + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 + + return out.view(-1, channel, out_h, out_w) + + +def upsample_2d( + hidden_states: torch.FloatTensor, + kernel: Optional[torch.FloatTensor] = None, + factor: int = 2, + gain: float = 1, +) -> torch.FloatTensor: + r"""Upsample2D a batch of 2D images with the given filter. + Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given + filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified + `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is + a: multiple of the upsampling factor. + + Args: + hidden_states (`torch.FloatTensor`): + Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`. + kernel (`torch.FloatTensor`, *optional*): + FIR filter of the shape `[firH, firW]` or `[firN]` (separable). The default is `[1] * factor`, which + corresponds to nearest-neighbor upsampling. + factor (`int`, *optional*, default to `2`): + Integer upsampling factor. + gain (`float`, *optional*, default to `1.0`): + Scaling factor for signal magnitude (default: 1.0). 
+ + Returns: + output (`torch.FloatTensor`): + Tensor of the shape `[N, C, H * factor, W * factor]` + """ + assert isinstance(factor, int) and factor >= 1 + if kernel is None: + kernel = [1] * factor + + kernel = torch.tensor(kernel, dtype=torch.float32) + if kernel.ndim == 1: + kernel = torch.outer(kernel, kernel) + kernel /= torch.sum(kernel) + + kernel = kernel * (gain * (factor**2)) + pad_value = kernel.shape[0] - factor + output = upfirdn2d_native( + hidden_states, + kernel.to(device=hidden_states.device), + up=factor, + pad=((pad_value + 1) // 2 + factor - 1, pad_value // 2), + ) + return output diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vae_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vae_flax.py new file mode 100644 index 000000000..5027f4230 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vae_flax.py @@ -0,0 +1,876 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# JAX implementation of VQGAN from taming-transformers https://github.com/CompVis/taming-transformers + +import math +from functools import partial +from typing import Tuple + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict + +from ..configuration_utils import ConfigMixin, flax_register_to_config +from ..utils import BaseOutput +from .modeling_flax_utils import FlaxModelMixin + + +@flax.struct.dataclass +class FlaxDecoderOutput(BaseOutput): + """ + Output of decoding method. + + Args: + sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): + The decoded output sample from the last layer of the model. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + The `dtype` of the parameters. + """ + + sample: jnp.ndarray + + +@flax.struct.dataclass +class FlaxAutoencoderKLOutput(BaseOutput): + """ + Output of AutoencoderKL encoding method. + + Args: + latent_dist (`FlaxDiagonalGaussianDistribution`): + Encoded outputs of `Encoder` represented as the mean and logvar of `FlaxDiagonalGaussianDistribution`. + `FlaxDiagonalGaussianDistribution` allows for sampling latents from the distribution. 
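An illustrative sketch of how this encoding output is typically consumed; it assumes a FlaxAutoencoderKL instance `vae` (defined later in this file) with initialized parameters `params`, a batch of images `images`, and that FlaxDiagonalGaussianDistribution exposes a `sample(key)` method as the docstring above suggests — all names here are placeholders:

import jax

rng = jax.random.PRNGKey(0)
# Encode a batch and draw latents from the returned diagonal Gaussian posterior.
posterior = vae.apply({"params": params}, images, method=vae.encode)  # FlaxAutoencoderKLOutput
latents = posterior.latent_dist.sample(rng)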
+ """ + + latent_dist: "FlaxDiagonalGaussianDistribution" + + +class FlaxUpsample2D(nn.Module): + """ + Flax implementation of 2D Upsample layer + + Args: + in_channels (`int`): + Input channels + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = nn.Conv( + self.in_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + batch, height, width, channels = hidden_states.shape + hidden_states = jax.image.resize( + hidden_states, + shape=(batch, height * 2, width * 2, channels), + method="nearest", + ) + hidden_states = self.conv(hidden_states) + return hidden_states + + +class FlaxDownsample2D(nn.Module): + """ + Flax implementation of 2D Downsample layer + + Args: + in_channels (`int`): + Input channels + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.conv = nn.Conv( + self.in_channels, + kernel_size=(3, 3), + strides=(2, 2), + padding="VALID", + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + pad = ((0, 0), (0, 1), (0, 1), (0, 0)) # pad height and width dim + hidden_states = jnp.pad(hidden_states, pad_width=pad) + hidden_states = self.conv(hidden_states) + return hidden_states + + +class FlaxResnetBlock2D(nn.Module): + """ + Flax implementation of 2D Resnet Block. + + Args: + in_channels (`int`): + Input channels + out_channels (`int`): + Output channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + groups (:obj:`int`, *optional*, defaults to `32`): + The number of groups to use for group norm. + use_nin_shortcut (:obj:`bool`, *optional*, defaults to `None`): + Whether to use `nin_shortcut`. 
This activates a new layer inside ResNet block + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int = None + dropout: float = 0.0 + groups: int = 32 + use_nin_shortcut: bool = None + dtype: jnp.dtype = jnp.float32 + + def setup(self): + out_channels = self.in_channels if self.out_channels is None else self.out_channels + + self.norm1 = nn.GroupNorm(num_groups=self.groups, epsilon=1e-6) + self.conv1 = nn.Conv( + out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + self.norm2 = nn.GroupNorm(num_groups=self.groups, epsilon=1e-6) + self.dropout_layer = nn.Dropout(self.dropout) + self.conv2 = nn.Conv( + out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + use_nin_shortcut = self.in_channels != out_channels if self.use_nin_shortcut is None else self.use_nin_shortcut + + self.conv_shortcut = None + if use_nin_shortcut: + self.conv_shortcut = nn.Conv( + out_channels, + kernel_size=(1, 1), + strides=(1, 1), + padding="VALID", + dtype=self.dtype, + ) + + def __call__(self, hidden_states, deterministic=True): + residual = hidden_states + hidden_states = self.norm1(hidden_states) + hidden_states = nn.swish(hidden_states) + hidden_states = self.conv1(hidden_states) + + hidden_states = self.norm2(hidden_states) + hidden_states = nn.swish(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + residual = self.conv_shortcut(residual) + + return hidden_states + residual + + +class FlaxAttentionBlock(nn.Module): + r""" + Flax Convolutional based multi-head attention block for diffusion-based VAE. 
+ + Parameters: + channels (:obj:`int`): + Input channels + num_head_channels (:obj:`int`, *optional*, defaults to `None`): + Number of attention heads + num_groups (:obj:`int`, *optional*, defaults to `32`): + The number of groups to use for group norm + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + + """ + + channels: int + num_head_channels: int = None + num_groups: int = 32 + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.num_heads = self.channels // self.num_head_channels if self.num_head_channels is not None else 1 + + dense = partial(nn.Dense, self.channels, dtype=self.dtype) + + self.group_norm = nn.GroupNorm(num_groups=self.num_groups, epsilon=1e-6) + self.query, self.key, self.value = dense(), dense(), dense() + self.proj_attn = dense() + + def transpose_for_scores(self, projection): + new_projection_shape = projection.shape[:-1] + (self.num_heads, -1) + # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) + new_projection = projection.reshape(new_projection_shape) + # (B, T, H, D) -> (B, H, T, D) + new_projection = jnp.transpose(new_projection, (0, 2, 1, 3)) + return new_projection + + def __call__(self, hidden_states): + residual = hidden_states + batch, height, width, channels = hidden_states.shape + + hidden_states = self.group_norm(hidden_states) + + hidden_states = hidden_states.reshape((batch, height * width, channels)) + + query = self.query(hidden_states) + key = self.key(hidden_states) + value = self.value(hidden_states) + + # transpose + query = self.transpose_for_scores(query) + key = self.transpose_for_scores(key) + value = self.transpose_for_scores(value) + + # compute attentions + scale = 1 / math.sqrt(math.sqrt(self.channels / self.num_heads)) + attn_weights = jnp.einsum("...qc,...kc->...qk", query * scale, key * scale) + attn_weights = nn.softmax(attn_weights, axis=-1) + + # attend to values + hidden_states = jnp.einsum("...kc,...qk->...qc", value, attn_weights) + + hidden_states = jnp.transpose(hidden_states, (0, 2, 1, 3)) + new_hidden_states_shape = hidden_states.shape[:-2] + (self.channels,) + hidden_states = hidden_states.reshape(new_hidden_states_shape) + + hidden_states = self.proj_attn(hidden_states) + hidden_states = hidden_states.reshape((batch, height, width, channels)) + hidden_states = hidden_states + residual + return hidden_states + + +class FlaxDownEncoderBlock2D(nn.Module): + r""" + Flax Resnet blocks-based Encoder block for diffusion-based VAE. 
+ + Parameters: + in_channels (:obj:`int`): + Input channels + out_channels (:obj:`int`): + Output channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of Resnet layer block + resnet_groups (:obj:`int`, *optional*, defaults to `32`): + The number of groups to use for the Resnet block group norm + add_downsample (:obj:`bool`, *optional*, defaults to `True`): + Whether to add downsample layer + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int + dropout: float = 0.0 + num_layers: int = 1 + resnet_groups: int = 32 + add_downsample: bool = True + dtype: jnp.dtype = jnp.float32 + + def setup(self): + resnets = [] + for i in range(self.num_layers): + in_channels = self.in_channels if i == 0 else self.out_channels + + res_block = FlaxResnetBlock2D( + in_channels=in_channels, + out_channels=self.out_channels, + dropout=self.dropout, + groups=self.resnet_groups, + dtype=self.dtype, + ) + resnets.append(res_block) + self.resnets = resnets + + if self.add_downsample: + self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic=True): + for resnet in self.resnets: + hidden_states = resnet(hidden_states, deterministic=deterministic) + + if self.add_downsample: + hidden_states = self.downsamplers_0(hidden_states) + + return hidden_states + + +class FlaxUpDecoderBlock2D(nn.Module): + r""" + Flax Resnet blocks-based Decoder block for diffusion-based VAE. + + Parameters: + in_channels (:obj:`int`): + Input channels + out_channels (:obj:`int`): + Output channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of Resnet layer block + resnet_groups (:obj:`int`, *optional*, defaults to `32`): + The number of groups to use for the Resnet block group norm + add_upsample (:obj:`bool`, *optional*, defaults to `True`): + Whether to add upsample layer + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + out_channels: int + dropout: float = 0.0 + num_layers: int = 1 + resnet_groups: int = 32 + add_upsample: bool = True + dtype: jnp.dtype = jnp.float32 + + def setup(self): + resnets = [] + for i in range(self.num_layers): + in_channels = self.in_channels if i == 0 else self.out_channels + res_block = FlaxResnetBlock2D( + in_channels=in_channels, + out_channels=self.out_channels, + dropout=self.dropout, + groups=self.resnet_groups, + dtype=self.dtype, + ) + resnets.append(res_block) + + self.resnets = resnets + + if self.add_upsample: + self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype) + + def __call__(self, hidden_states, deterministic=True): + for resnet in self.resnets: + hidden_states = resnet(hidden_states, deterministic=deterministic) + + if self.add_upsample: + hidden_states = self.upsamplers_0(hidden_states) + + return hidden_states + + +class FlaxUNetMidBlock2D(nn.Module): + r""" + Flax Unet Mid-Block module. 
+ + Parameters: + in_channels (:obj:`int`): + Input channels + dropout (:obj:`float`, *optional*, defaults to 0.0): + Dropout rate + num_layers (:obj:`int`, *optional*, defaults to 1): + Number of Resnet layer block + resnet_groups (:obj:`int`, *optional*, defaults to `32`): + The number of groups to use for the Resnet and Attention block group norm + num_attention_heads (:obj:`int`, *optional*, defaults to `1`): + Number of attention heads for each attention block + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int + dropout: float = 0.0 + num_layers: int = 1 + resnet_groups: int = 32 + num_attention_heads: int = 1 + dtype: jnp.dtype = jnp.float32 + + def setup(self): + resnet_groups = self.resnet_groups if self.resnet_groups is not None else min(self.in_channels // 4, 32) + + # there is always at least one resnet + resnets = [ + FlaxResnetBlock2D( + in_channels=self.in_channels, + out_channels=self.in_channels, + dropout=self.dropout, + groups=resnet_groups, + dtype=self.dtype, + ) + ] + + attentions = [] + + for _ in range(self.num_layers): + attn_block = FlaxAttentionBlock( + channels=self.in_channels, + num_head_channels=self.num_attention_heads, + num_groups=resnet_groups, + dtype=self.dtype, + ) + attentions.append(attn_block) + + res_block = FlaxResnetBlock2D( + in_channels=self.in_channels, + out_channels=self.in_channels, + dropout=self.dropout, + groups=resnet_groups, + dtype=self.dtype, + ) + resnets.append(res_block) + + self.resnets = resnets + self.attentions = attentions + + def __call__(self, hidden_states, deterministic=True): + hidden_states = self.resnets[0](hidden_states, deterministic=deterministic) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + hidden_states = attn(hidden_states) + hidden_states = resnet(hidden_states, deterministic=deterministic) + + return hidden_states + + +class FlaxEncoder(nn.Module): + r""" + Flax Implementation of VAE Encoder. + + This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to + general usage and behavior. 
+ + Finally, this model supports inherent JAX features such as: + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + in_channels (:obj:`int`, *optional*, defaults to 3): + Input channels + out_channels (:obj:`int`, *optional*, defaults to 3): + Output channels + down_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`): + DownEncoder block type + block_out_channels (:obj:`Tuple[str]`, *optional*, defaults to `(64,)`): + Tuple containing the number of output channels for each block + layers_per_block (:obj:`int`, *optional*, defaults to `2`): + Number of Resnet layer for each block + norm_num_groups (:obj:`int`, *optional*, defaults to `32`): + norm num group + act_fn (:obj:`str`, *optional*, defaults to `silu`): + Activation function + double_z (:obj:`bool`, *optional*, defaults to `False`): + Whether to double the last output channels + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + Parameters `dtype` + """ + + in_channels: int = 3 + out_channels: int = 3 + down_block_types: Tuple[str] = ("DownEncoderBlock2D",) + block_out_channels: Tuple[int] = (64,) + layers_per_block: int = 2 + norm_num_groups: int = 32 + act_fn: str = "silu" + double_z: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + block_out_channels = self.block_out_channels + # in + self.conv_in = nn.Conv( + block_out_channels[0], + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + # downsampling + down_blocks = [] + output_channel = block_out_channels[0] + for i, _ in enumerate(self.down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = FlaxDownEncoderBlock2D( + in_channels=input_channel, + out_channels=output_channel, + num_layers=self.layers_per_block, + resnet_groups=self.norm_num_groups, + add_downsample=not is_final_block, + dtype=self.dtype, + ) + down_blocks.append(down_block) + self.down_blocks = down_blocks + + # middle + self.mid_block = FlaxUNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_groups=self.norm_num_groups, + num_attention_heads=None, + dtype=self.dtype, + ) + + # end + conv_out_channels = 2 * self.out_channels if self.double_z else self.out_channels + self.conv_norm_out = nn.GroupNorm(num_groups=self.norm_num_groups, epsilon=1e-6) + self.conv_out = nn.Conv( + conv_out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + def __call__(self, sample, deterministic: bool = True): + # in + sample = self.conv_in(sample) + + # downsampling + for block in self.down_blocks: + sample = block(sample, deterministic=deterministic) + + # middle + sample = self.mid_block(sample, deterministic=deterministic) + + # end + sample = self.conv_norm_out(sample) + sample = nn.swish(sample) + sample = self.conv_out(sample) + + return sample + + +class FlaxDecoder(nn.Module): + r""" + Flax Implementation of VAE Decoder. + + This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. 
Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to + general usage and behavior. + + Finally, this model supports inherent JAX features such as: + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + in_channels (:obj:`int`, *optional*, defaults to 3): + Input channels + out_channels (:obj:`int`, *optional*, defaults to 3): + Output channels + up_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`): + UpDecoder block type + block_out_channels (:obj:`Tuple[str]`, *optional*, defaults to `(64,)`): + Tuple containing the number of output channels for each block + layers_per_block (:obj:`int`, *optional*, defaults to `2`): + Number of Resnet layer for each block + norm_num_groups (:obj:`int`, *optional*, defaults to `32`): + norm num group + act_fn (:obj:`str`, *optional*, defaults to `silu`): + Activation function + double_z (:obj:`bool`, *optional*, defaults to `False`): + Whether to double the last output channels + dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): + parameters `dtype` + """ + + in_channels: int = 3 + out_channels: int = 3 + up_block_types: Tuple[str] = ("UpDecoderBlock2D",) + block_out_channels: int = (64,) + layers_per_block: int = 2 + norm_num_groups: int = 32 + act_fn: str = "silu" + dtype: jnp.dtype = jnp.float32 + + def setup(self): + block_out_channels = self.block_out_channels + + # z to block_in + self.conv_in = nn.Conv( + block_out_channels[-1], + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + # middle + self.mid_block = FlaxUNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_groups=self.norm_num_groups, + num_attention_heads=None, + dtype=self.dtype, + ) + + # upsampling + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + up_blocks = [] + for i, _ in enumerate(self.up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = FlaxUpDecoderBlock2D( + in_channels=prev_output_channel, + out_channels=output_channel, + num_layers=self.layers_per_block + 1, + resnet_groups=self.norm_num_groups, + add_upsample=not is_final_block, + dtype=self.dtype, + ) + up_blocks.append(up_block) + prev_output_channel = output_channel + + self.up_blocks = up_blocks + + # end + self.conv_norm_out = nn.GroupNorm(num_groups=self.norm_num_groups, epsilon=1e-6) + self.conv_out = nn.Conv( + self.out_channels, + kernel_size=(3, 3), + strides=(1, 1), + padding=((1, 1), (1, 1)), + dtype=self.dtype, + ) + + def __call__(self, sample, deterministic: bool = True): + # z to block_in + sample = self.conv_in(sample) + + # middle + sample = self.mid_block(sample, deterministic=deterministic) + + # upsampling + for block in self.up_blocks: + sample = block(sample, deterministic=deterministic) + + sample = self.conv_norm_out(sample) + sample = nn.swish(sample) + sample = self.conv_out(sample) + + return sample + + +class FlaxDiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + # Last axis to 
account for channels-last + self.mean, self.logvar = jnp.split(parameters, 2, axis=-1) + self.logvar = jnp.clip(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = jnp.exp(0.5 * self.logvar) + self.var = jnp.exp(self.logvar) + if self.deterministic: + self.var = self.std = jnp.zeros_like(self.mean) + + def sample(self, key): + return self.mean + self.std * jax.random.normal(key, self.mean.shape) + + def kl(self, other=None): + if self.deterministic: + return jnp.array([0.0]) + + if other is None: + return 0.5 * jnp.sum(self.mean**2 + self.var - 1.0 - self.logvar, axis=[1, 2, 3]) + + return 0.5 * jnp.sum( + jnp.square(self.mean - other.mean) / other.var + self.var / other.var - 1.0 - self.logvar + other.logvar, + axis=[1, 2, 3], + ) + + def nll(self, sample, axis=[1, 2, 3]): + if self.deterministic: + return jnp.array([0.0]) + + logtwopi = jnp.log(2.0 * jnp.pi) + return 0.5 * jnp.sum(logtwopi + self.logvar + jnp.square(sample - self.mean) / self.var, axis=axis) + + def mode(self): + return self.mean + + +@flax_register_to_config +class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin): + r""" + Flax implementation of a VAE model with KL loss for decoding latent representations. + + This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods + implemented for all models (such as downloading or saving). + + This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matter related to its + general usage and behavior. + + Inherent JAX features such as the following are supported: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + in_channels (`int`, *optional*, defaults to 3): + Number of channels in the input image. + out_channels (`int`, *optional*, defaults to 3): + Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`): + Tuple of upsample block types. + block_out_channels (`Tuple[str]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + layers_per_block (`int`, *optional*, defaults to `2`): + Number of ResNet layer for each block. + act_fn (`str`, *optional*, defaults to `silu`): + The activation function to use. + latent_channels (`int`, *optional*, defaults to `4`): + Number of channels in the latent space. + norm_num_groups (`int`, *optional*, defaults to `32`): + The number of groups for normalization. + sample_size (`int`, *optional*, defaults to 32): + Sample input size. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. 
When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + The `dtype` of the parameters. + """ + + in_channels: int = 3 + out_channels: int = 3 + down_block_types: Tuple[str] = ("DownEncoderBlock2D",) + up_block_types: Tuple[str] = ("UpDecoderBlock2D",) + block_out_channels: Tuple[int] = (64,) + layers_per_block: int = 1 + act_fn: str = "silu" + latent_channels: int = 4 + norm_num_groups: int = 32 + sample_size: int = 32 + scaling_factor: float = 0.18215 + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.encoder = FlaxEncoder( + in_channels=self.config.in_channels, + out_channels=self.config.latent_channels, + down_block_types=self.config.down_block_types, + block_out_channels=self.config.block_out_channels, + layers_per_block=self.config.layers_per_block, + act_fn=self.config.act_fn, + norm_num_groups=self.config.norm_num_groups, + double_z=True, + dtype=self.dtype, + ) + self.decoder = FlaxDecoder( + in_channels=self.config.latent_channels, + out_channels=self.config.out_channels, + up_block_types=self.config.up_block_types, + block_out_channels=self.config.block_out_channels, + layers_per_block=self.config.layers_per_block, + norm_num_groups=self.config.norm_num_groups, + act_fn=self.config.act_fn, + dtype=self.dtype, + ) + self.quant_conv = nn.Conv( + 2 * self.config.latent_channels, + kernel_size=(1, 1), + strides=(1, 1), + padding="VALID", + dtype=self.dtype, + ) + self.post_quant_conv = nn.Conv( + self.config.latent_channels, + kernel_size=(1, 1), + strides=(1, 1), + padding="VALID", + dtype=self.dtype, + ) + + def init_weights(self, rng: jax.Array) -> FrozenDict: + # init input tensors + sample_shape = (1, self.in_channels, self.sample_size, self.sample_size) + sample = jnp.zeros(sample_shape, dtype=jnp.float32) + + params_rng, dropout_rng, gaussian_rng = jax.random.split(rng, 3) + rngs = {"params": params_rng, "dropout": dropout_rng, "gaussian": gaussian_rng} + + return self.init(rngs, sample)["params"] + + def encode(self, sample, deterministic: bool = True, return_dict: bool = True): + sample = jnp.transpose(sample, (0, 2, 3, 1)) + + hidden_states = self.encoder(sample, deterministic=deterministic) + moments = self.quant_conv(hidden_states) + posterior = FlaxDiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return FlaxAutoencoderKLOutput(latent_dist=posterior) + + def decode(self, latents, deterministic: bool = True, return_dict: bool = True): + if latents.shape[-1] != self.config.latent_channels: + latents = jnp.transpose(latents, (0, 2, 3, 1)) + + hidden_states = self.post_quant_conv(latents) + hidden_states = self.decoder(hidden_states, deterministic=deterministic) + + hidden_states = jnp.transpose(hidden_states, (0, 3, 1, 2)) + + if not return_dict: + return (hidden_states,) + + return FlaxDecoderOutput(sample=hidden_states) + + def __call__(self, sample, sample_posterior=False, deterministic: bool = True, return_dict: bool = True): + posterior = self.encode(sample, deterministic=deterministic, return_dict=return_dict) + if sample_posterior: + rng = self.make_rng("gaussian") + hidden_states = posterior.latent_dist.sample(rng) + else: + hidden_states = posterior.latent_dist.mode() + + sample = self.decode(hidden_states, 
return_dict=return_dict).sample + + if not return_dict: + return (sample,) + + return FlaxDecoderOutput(sample=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vq_model.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vq_model.py new file mode 100644 index 000000000..e5184446c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vq_model.py @@ -0,0 +1,181 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.accelerate_utils import apply_forward_hook +from .autoencoders.vae import Decoder, DecoderOutput, Encoder, VectorQuantizer +from .modeling_utils import ModelMixin + + +@dataclass +class VQEncoderOutput(BaseOutput): + """ + Output of VQModel encoding method. + + Args: + latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The encoded output sample from the last layer of the model. + """ + + latents: torch.FloatTensor + + +class VQModel(ModelMixin, ConfigMixin): + r""" + A VQ-VAE model for decoding latent representations. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE. + norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers. + vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE. + scaling_factor (`float`, *optional*, defaults to `0.18215`): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. 
The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + norm_type (`str`, *optional*, defaults to `"group"`): + Type of normalization layer to use. Can be one of `"group"` or `"spatial"`. + """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 3, + sample_size: int = 32, + num_vq_embeddings: int = 256, + norm_num_groups: int = 32, + vq_embed_dim: Optional[int] = None, + scaling_factor: float = 0.18215, + norm_type: str = "group", # group, spatial + mid_block_add_attention=True, + lookup_from_codebook=False, + force_upcast=False, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=False, + mid_block_add_attention=mid_block_add_attention, + ) + + vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels + + self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1) + self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False) + self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1) + + # pass init params to Decoder + self.decoder = Decoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + norm_type=norm_type, + mid_block_add_attention=mid_block_add_attention, + ) + + @apply_forward_hook + def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + h = self.encoder(x) + h = self.quant_conv(h) + + if not return_dict: + return (h,) + + return VQEncoderOutput(latents=h) + + @apply_forward_hook + def decode( + self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True, shape=None + ) -> Union[DecoderOutput, torch.FloatTensor]: + # also go through quantization layer + if not force_not_quantize: + quant, _, _ = self.quantize(h) + elif self.config.lookup_from_codebook: + quant = self.quantize.get_codebook_entry(h, shape) + else: + quant = h + quant2 = self.post_quant_conv(quant) + dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward( + self, sample: torch.FloatTensor, return_dict: bool = True + ) -> Union[DecoderOutput, Tuple[torch.FloatTensor, ...]]: + r""" + The [`VQModel`] forward method. + + Args: + sample (`torch.FloatTensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple. 
+ + Returns: + [`~models.vq_model.VQEncoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple` + is returned. + """ + + h = self.encode(sample).latents + dec = self.decode(h).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/optimization.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/optimization.py new file mode 100644 index 000000000..fbaa14365 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/optimization.py @@ -0,0 +1,361 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for diffusion models.""" + +import math +from enum import Enum +from typing import Optional, Union + +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LambdaLR + +from .utils import logging + + +logger = logging.get_logger(__name__) + + +class SchedulerType(Enum): + LINEAR = "linear" + COSINE = "cosine" + COSINE_WITH_RESTARTS = "cosine_with_restarts" + POLYNOMIAL = "polynomial" + CONSTANT = "constant" + CONSTANT_WITH_WARMUP = "constant_with_warmup" + PIECEWISE_CONSTANT = "piecewise_constant" + + +def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1) -> LambdaLR: + """ + Create a schedule with a constant learning rate, using the learning rate set in optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) + + +def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1) -> LambdaLR: + """ + Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate + increases linearly between 0 and the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1.0, num_warmup_steps)) + return 1.0 + + return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) + + +def get_piecewise_constant_schedule(optimizer: Optimizer, step_rules: str, last_epoch: int = -1) -> LambdaLR: + """ + Create a schedule with a constant learning rate, using the learning rate set in optimizer. 
+ + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + step_rules (`string`): + The rules for the learning rate. ex: rule_steps="1:10,0.1:20,0.01:30,0.005" it means that the learning rate + if multiple 1 for the first 10 steps, mutiple 0.1 for the next 20 steps, multiple 0.01 for the next 30 + steps and multiple 0.005 for the other steps. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + rules_dict = {} + rule_list = step_rules.split(",") + for rule_str in rule_list[:-1]: + value_str, steps_str = rule_str.split(":") + steps = int(steps_str) + value = float(value_str) + rules_dict[steps] = value + last_lr_multiple = float(rule_list[-1]) + + def create_rules_function(rules_dict, last_lr_multiple): + def rule_func(steps: int) -> float: + sorted_steps = sorted(rules_dict.keys()) + for i, sorted_step in enumerate(sorted_steps): + if steps < sorted_step: + return rules_dict[sorted_steps[i]] + return last_lr_multiple + + return rule_func + + rules_func = create_rules_function(rules_dict, last_lr_multiple) + + return LambdaLR(optimizer, rules_func, last_epoch=last_epoch) + + +def get_linear_schedule_with_warmup( + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, last_epoch: int = -1 +) -> LambdaLR: + """ + Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after + a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def get_cosine_schedule_with_warmup( + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1 +) -> LambdaLR: + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the + initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_periods (`float`, *optional*, defaults to 0.5): + The number of periods of the cosine function in a schedule (the default is to just decrease from the max + value to 0 following a half-cosine). + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 
+ """ + + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def get_cosine_with_hard_restarts_schedule_with_warmup( + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1 +) -> LambdaLR: + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases + linearly between 0 and the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`int`, *optional*, defaults to 1): + The number of hard restarts to use. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + if progress >= 1.0: + return 0.0 + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def get_polynomial_decay_schedule_with_warmup( + optimizer: Optimizer, + num_warmup_steps: int, + num_training_steps: int, + lr_end: float = 1e-7, + power: float = 1.0, + last_epoch: int = -1, +) -> LambdaLR: + """ + Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the + optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the + initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + lr_end (`float`, *optional*, defaults to 1e-7): + The end LR. + power (`float`, *optional*, defaults to 1.0): + Power factor. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT + implementation at + https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. 
+ + """ + + lr_init = optimizer.defaults["lr"] + if not (lr_init > lr_end): + raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + elif current_step > num_training_steps: + return lr_end / lr_init # as LambdaLR multiplies by lr_init + else: + lr_range = lr_init - lr_end + decay_steps = num_training_steps - num_warmup_steps + pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps + decay = lr_range * pct_remaining**power + lr_end + return decay / lr_init # as LambdaLR multiplies by lr_init + + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +TYPE_TO_SCHEDULER_FUNCTION = { + SchedulerType.LINEAR: get_linear_schedule_with_warmup, + SchedulerType.COSINE: get_cosine_schedule_with_warmup, + SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup, + SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, + SchedulerType.CONSTANT: get_constant_schedule, + SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, + SchedulerType.PIECEWISE_CONSTANT: get_piecewise_constant_schedule, +} + + +def get_scheduler( + name: Union[str, SchedulerType], + optimizer: Optimizer, + step_rules: Optional[str] = None, + num_warmup_steps: Optional[int] = None, + num_training_steps: Optional[int] = None, + num_cycles: int = 1, + power: float = 1.0, + last_epoch: int = -1, +) -> LambdaLR: + """ + Unified API to get any scheduler from its name. + + Args: + name (`str` or `SchedulerType`): + The name of the scheduler to use. + optimizer (`torch.optim.Optimizer`): + The optimizer that will be used during training. + step_rules (`str`, *optional*): + A string representing the step rules to use. This is only used by the `PIECEWISE_CONSTANT` scheduler. + num_warmup_steps (`int`, *optional*): + The number of warmup steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + num_training_steps (`int``, *optional*): + The number of training steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + num_cycles (`int`, *optional*): + The number of hard restarts used in `COSINE_WITH_RESTARTS` scheduler. + power (`float`, *optional*, defaults to 1.0): + Power factor. See `POLYNOMIAL` scheduler + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. 
+ """ + name = SchedulerType(name) + schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] + if name == SchedulerType.CONSTANT: + return schedule_func(optimizer, last_epoch=last_epoch) + + if name == SchedulerType.PIECEWISE_CONSTANT: + return schedule_func(optimizer, step_rules=step_rules, last_epoch=last_epoch) + + # All other schedulers require `num_warmup_steps` + if num_warmup_steps is None: + raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") + + if name == SchedulerType.CONSTANT_WITH_WARMUP: + return schedule_func(optimizer, num_warmup_steps=num_warmup_steps, last_epoch=last_epoch) + + # All other schedulers require `num_training_steps` + if num_training_steps is None: + raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") + + if name == SchedulerType.COSINE_WITH_RESTARTS: + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + last_epoch=last_epoch, + ) + + if name == SchedulerType.POLYNOMIAL: + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + power=power, + last_epoch=last_epoch, + ) + + return schedule_func( + optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, last_epoch=last_epoch + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/README.md new file mode 100644 index 000000000..d5125ae5c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/README.md @@ -0,0 +1,171 @@ +# 🧨 Diffusers Pipelines + +Pipelines provide a simple way to run state-of-the-art diffusion models in inference. +Most diffusion systems consist of multiple independently-trained models and highly adaptable scheduler +components - all of which are needed to have a functioning end-to-end diffusion system. + +As an example, [Stable Diffusion](https://huggingface.co/blog/stable_diffusion) has three independently trained models: +- [Autoencoder](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/vae.py#L392) +- [Conditional Unet](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/models/unet_2d_condition.py#L12) +- [CLIP text encoder](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPTextModel) +- a scheduler component, [scheduler](https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_pndm.py), +- a [CLIPImageProcessor](https://huggingface.co/docs/transformers/main/en/model_doc/clip#transformers.CLIPImageProcessor), +- as well as a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py). +All of these components are necessary to run stable diffusion in inference even though they were trained +or created independently from each other. + +To that end, we strive to offer all open-sourced, state-of-the-art diffusion system under a unified API. +More specifically, we strive to provide pipelines that +- 1. 
can load the officially published weights and yield 1-to-1 the same outputs as the original implementation according to the corresponding paper (*e.g.* [LDMTextToImagePipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/latent_diffusion), uses the officially released weights of [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752)), +- 2. have a simple user interface to run the model in inference (see the [Pipelines API](#pipelines-api) section), +- 3. are easy to understand with code that is self-explanatory and can be read along-side the official paper (see [Pipelines summary](#pipelines-summary)), +- 4. can easily be contributed by the community (see the [Contribution](#contribution) section). + +**Note** that pipelines do not (and should not) offer any training functionality. +If you are looking for *official* training examples, please have a look at [examples](https://github.com/huggingface/diffusers/tree/main/examples). + + +## Pipelines Summary + +The following table summarizes all officially supported pipelines, their corresponding paper, and if +available a colab notebook to directly try them out. + +| Pipeline | Source | Tasks | Colab +|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|:---:|:---:| +| [dance diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/Harmonai-org/sample-generator) | *Unconditional Audio Generation* | +| [ddpm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | *Unconditional Image Generation* | +| [ddim](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | *Unconditional Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) +| [latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Text-to-Image Generation* | +| [latent_diffusion_uncond](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | *Unconditional Image Generation* | +| [pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | *Unconditional Image Generation* | +| [score_sde_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* | +| [score_sde_vp](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential 
Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | *Unconditional Image Generation* |
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+| [stable_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb)
+| [stochastic_karras_ve](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | *Unconditional Image Generation* |
+
+**Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers.
+However, most of them can be adapted to use different scheduler components or even different model components. Some pipeline examples are shown in the [Examples](#examples) below.
+
+## Pipelines API
+
+Diffusion models often consist of multiple independently-trained models or other previously existing components.
+
+Each model has been trained independently on a different task, and the scheduler can easily be swapped out and replaced with a different one.
+During inference, however, we want to be able to easily load all components and use them in inference - even if one component, *e.g.* CLIP's text encoder, originates from a different library, such as [Transformers](https://github.com/huggingface/transformers). To that end, all pipelines provide the following functionality:
+
+- [`from_pretrained` method](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L139) that accepts a Hugging Face Hub repository id, *e.g.* [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) or a path to a local directory, *e.g.*
+"./stable-diffusion". To correctly retrieve which models and components should be loaded, one has to provide a `model_index.json` file, *e.g.* [runwayml/stable-diffusion-v1-5/model_index.json](https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/model_index.json), which defines all components that should be
+loaded into the pipelines. More specifically, for each model/component one needs to define the format `<name>: ["<library>", "<class name>"]`. `<name>` is the attribute name given to the loaded instance of `<class name>`, which can be found in the library or pipeline folder called `"<library>"`.
+- [`save_pretrained`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L90) that accepts a local path, *e.g.* `./stable-diffusion` under which all models/components of the pipeline will be saved. For each component/model a folder is created inside the local path that is named after the given attribute name, *e.g.* `./stable_diffusion/unet`. +In addition, a `model_index.json` file is created at the root of the local path, *e.g.* `./stable_diffusion/model_index.json` so that the complete pipeline can again be instantiated +from the local path. +- [`to`](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L118) which accepts a `string` or `torch.device` to move all models that are of type `torch.nn.Module` to the passed device. The behavior is fully analogous to [PyTorch's `to` method](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to). +- [`__call__`] method to use the pipeline in inference. `__call__` defines inference logic of the pipeline and should ideally encompass all aspects of it, from pre-processing to forwarding tensors to the different models and schedulers, as well as post-processing. The API of the `__call__` method can strongly vary from pipeline to pipeline. *E.g.* a text-to-image pipeline, such as [`StableDiffusionPipeline`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) should accept among other things the text prompt to generate the image. A pure image generation pipeline, such as [DDPMPipeline](https://github.com/huggingface/diffusers/tree/main/src/diffusers/pipelines/ddpm) on the other hand can be run without providing any inputs. To better understand what inputs can be adapted for +each pipeline, one should look directly into the respective pipeline. + +**Note**: All pipelines have PyTorch's autograd disabled by decorating the `__call__` method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should +not be used for training. If you want to store the gradients during the forward pass, we recommend writing your own pipeline, see also our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community) + +## Contribution + +We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire +all of our pipelines to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**. + +- **Self-contained**: A pipeline shall be as self-contained as possible. More specifically, this means that all functionality should be either directly defined in the pipeline file itself, should be inherited from (and only from) the [`DiffusionPipeline` class](https://github.com/huggingface/diffusers/blob/5cbed8e0d157f65d3ddc2420dfd09f2df630e978/src/diffusers/pipeline_utils.py#L56) or be directly attached to the model and scheduler components of the pipeline. +- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and +use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most +logic including pre-processing, an unrolled diffusion loop, and post-processing should all happen inside the `__call__` method. 
+- **Easy-to-tweak**: Certain pipelines will not be able to handle all the use cases and tasks that you might like them to handle. If you want to use a certain pipeline for a specific use case that is not yet supported, you might have to copy the pipeline file and tweak the code to your needs. We try to make the pipeline code as readable as possible so that each part (from pre-processing to diffusing to post-processing) can easily be adapted. If you would like the community to benefit from your customized pipeline, we would love to see a contribution to our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). If you feel that an important pipeline should be part of the official pipelines but isn't, a contribution to the [official pipelines](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines) would be even better.
+- **One-purpose-only**: Pipelines should be used for one task and one task only. Even if two tasks are very similar from a modeling point of view, *e.g.* image2image translation and in-painting, pipelines shall be used for one task only to keep them *easy-to-tweak* and *readable*.
+
+## Examples
+
+### Text-to-Image generation with Stable Diffusion
+
+```python
+# make sure you're logged in with `huggingface-cli login`
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+image = pipe(prompt).images[0]
+
+image.save("astronaut_rides_horse.png")
+```
+
+### Image-to-Image text-guided generation with Stable Diffusion
+
+The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images.
+
+```python
+import requests
+import torch
+from PIL import Image
+from io import BytesIO
+
+from diffusers import StableDiffusionImg2ImgPipeline
+
+# load the pipeline
+device = "cuda"
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+).to(device)
+
+# let's download an initial image
+url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((768, 512))
+
+prompt = "A fantasy landscape, trending on artstation"
+
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
+
+images[0].save("fantasy_landscape.png")
+```
+You can also run this example on Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb)
+
+### Tweak prompts reusing seeds and latents
+
+You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb).
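+
+The idea can also be sketched in a few lines. The snippet below is a minimal illustration, not taken from the notebook above; the seed value, latent shape, and prompts are only placeholders: fix a seed with a `torch.Generator`, pre-generate the latents once, and then reuse them while tweaking the prompt.
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+pipe = StableDiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+).to("cuda")
+
+# Pre-generate latents from a fixed seed so the result is reproducible.
+# The shape assumes the default 512x512 output of this checkpoint (latents are downsampled by 8).
+generator = torch.Generator(device="cuda").manual_seed(1024)
+latents = torch.randn(
+    (1, pipe.unet.config.in_channels, 512 // 8, 512 // 8),
+    generator=generator,
+    device="cuda",
+    dtype=torch.float16,
+)
+
+# Reuse the same latents with a tweaked prompt to get a variation of a result you liked.
+image = pipe("a photo of an astronaut riding a horse on mars", latents=latents).images[0]
+variant = pipe("a photo of an astronaut riding a horse on the moon", latents=latents).images[0]
+```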
+ + +### In-painting using Stable Diffusion + +The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt. + +```python +import PIL +import requests +import torch +from io import BytesIO + +from diffusers import StableDiffusionInpaintPipeline + +def download_image(url): + response = requests.get(url) + return PIL.Image.open(BytesIO(response.content)).convert("RGB") + +img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" +mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + +init_image = download_image(img_url).resize((512, 512)) +mask_image = download_image(mask_url).resize((512, 512)) + +pipe = StableDiffusionInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-inpainting", + torch_dtype=torch.float16, +) +pipe = pipe.to("cuda") + +prompt = "Face of a yellow cat, high resolution, sitting on a park bench" +image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] +``` + +You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/__init__.py new file mode 100644 index 000000000..2b2277809 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/__init__.py @@ -0,0 +1,581 @@ +from typing import TYPE_CHECKING + +from ..utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_flax_available, + is_k_diffusion_available, + is_librosa_available, + is_note_seq_available, + is_onnx_available, + is_torch_available, + is_torch_npu_available, + is_transformers_available, +) + + +# These modules contain pipelines from multiple libraries/frameworks +_dummy_objects = {} +_import_structure = { + "controlnet": [], + "controlnet_xs": [], + "deprecated": [], + "latent_diffusion": [], + "ledits_pp": [], + "stable_diffusion": [], + "stable_diffusion_xl": [], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_pt_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) +else: + _import_structure["auto_pipeline"] = [ + "AutoPipelineForImage2Image", + "AutoPipelineForInpainting", + "AutoPipelineForText2Image", + ] + _import_structure["consistency_models"] = ["ConsistencyModelPipeline"] + _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"] + _import_structure["ddim"] = ["DDIMPipeline"] + _import_structure["ddpm"] = ["DDPMPipeline"] + _import_structure["dit"] = ["DiTPipeline"] + _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"]) + _import_structure["pipeline_utils"] = [ + "AudioPipelineOutput", + "DiffusionPipeline", + "StableDiffusionMixin", + "ImagePipelineOutput", + ] + _import_structure["deprecated"].extend( + [ + "PNDMPipeline", + "LDMPipeline", + "RePaintPipeline", + "ScoreSdeVePipeline", + "KarrasVePipeline", + ] + ) +try: + if not (is_torch_available() and is_librosa_available()): 
+ raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_torch_and_librosa_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) +else: + _import_structure["deprecated"].extend(["AudioDiffusionPipeline", "Mel"]) + +try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) +else: + _import_structure["deprecated"].extend( + [ + "MidiProcessor", + "SpectrogramDiffusionPipeline", + ] + ) + +try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["deprecated"].extend( + [ + "VQDiffusionPipeline", + "AltDiffusionPipeline", + "AltDiffusionImg2ImgPipeline", + "CycleDiffusionPipeline", + "StableDiffusionInpaintPipelineLegacy", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionModelEditingPipeline", + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + ] + ) + _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"] + _import_structure["animatediff"] = [ + "AnimateDiffPipeline", + "AnimateDiffVideoToVideoPipeline", + ] + _import_structure["audioldm"] = ["AudioLDMPipeline"] + _import_structure["audioldm2"] = [ + "AudioLDM2Pipeline", + "AudioLDM2ProjectionModel", + "AudioLDM2UNet2DConditionModel", + ] + _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"] + _import_structure["controlnet"].extend( + [ + "BlipDiffusionControlNetPipeline", + "StableDiffusionControlNetImg2ImgPipeline", + "StableDiffusionControlNetInpaintPipeline", + "StableDiffusionControlNetPipeline", + "StableDiffusionXLControlNetImg2ImgPipeline", + "StableDiffusionXLControlNetInpaintPipeline", + "StableDiffusionXLControlNetPipeline", + ] + ) + _import_structure["deepfloyd_if"] = [ + "IFImg2ImgPipeline", + "IFImg2ImgSuperResolutionPipeline", + "IFInpaintingPipeline", + "IFInpaintingSuperResolutionPipeline", + "IFPipeline", + "IFSuperResolutionPipeline", + ] + _import_structure["kandinsky"] = [ + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyImg2ImgPipeline", + "KandinskyInpaintCombinedPipeline", + "KandinskyInpaintPipeline", + "KandinskyPipeline", + "KandinskyPriorPipeline", + ] + _import_structure["kandinsky2_2"] = [ + "KandinskyV22CombinedPipeline", + "KandinskyV22ControlnetImg2ImgPipeline", + "KandinskyV22ControlnetPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22Img2ImgPipeline", + "KandinskyV22InpaintCombinedPipeline", + "KandinskyV22InpaintPipeline", + "KandinskyV22Pipeline", + "KandinskyV22PriorEmb2EmbPipeline", + "KandinskyV22PriorPipeline", + ] + _import_structure["kandinsky3"] = [ + "Kandinsky3Img2ImgPipeline", + "Kandinsky3Pipeline", + ] + _import_structure["latent_consistency_models"] = [ + "LatentConsistencyModelImg2ImgPipeline", + "LatentConsistencyModelPipeline", + ] + 
_import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"]) + _import_structure["ledits_pp"].extend( + [ + "LEditsPPPipelineStableDiffusion", + "LEditsPPPipelineStableDiffusionXL", + ] + ) + _import_structure["musicldm"] = ["MusicLDMPipeline"] + _import_structure["paint_by_example"] = ["PaintByExamplePipeline"] + _import_structure["pia"] = ["PIAPipeline"] + _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline"] + _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] + _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"] + _import_structure["stable_cascade"] = [ + "StableCascadeCombinedPipeline", + "StableCascadeDecoderPipeline", + "StableCascadePriorPipeline", + ] + _import_structure["stable_diffusion"].extend( + [ + "CLIPImageProjection", + "StableDiffusionDepth2ImgPipeline", + "StableDiffusionImageVariationPipeline", + "StableDiffusionImg2ImgPipeline", + "StableDiffusionInpaintPipeline", + "StableDiffusionInstructPix2PixPipeline", + "StableDiffusionLatentUpscalePipeline", + "StableDiffusionPipeline", + "StableDiffusionUpscalePipeline", + "StableUnCLIPImg2ImgPipeline", + "StableUnCLIPPipeline", + "StableDiffusionLDM3DPipeline", + ] + ) + _import_structure["stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] + _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"] + _import_structure["stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] + _import_structure["stable_diffusion_gligen"] = [ + "StableDiffusionGLIGENPipeline", + "StableDiffusionGLIGENTextImagePipeline", + ] + _import_structure["stable_video_diffusion"] = ["StableVideoDiffusionPipeline"] + _import_structure["stable_diffusion_xl"].extend( + [ + "StableDiffusionXLImg2ImgPipeline", + "StableDiffusionXLInpaintPipeline", + "StableDiffusionXLInstructPix2PixPipeline", + "StableDiffusionXLPipeline", + ] + ) + _import_structure["stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"] + _import_structure["stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"] + _import_structure["stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"] + _import_structure["t2i_adapter"] = [ + "StableDiffusionAdapterPipeline", + "StableDiffusionXLAdapterPipeline", + ] + _import_structure["text_to_video_synthesis"] = [ + "TextToVideoSDPipeline", + "TextToVideoZeroPipeline", + "TextToVideoZeroSDXLPipeline", + "VideoToVideoSDPipeline", + ] + _import_structure["i2vgen_xl"] = ["I2VGenXLPipeline"] + _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"] + _import_structure["unidiffuser"] = [ + "ImageTextPipelineOutput", + "UniDiffuserModel", + "UniDiffuserPipeline", + "UniDiffuserTextDecoder", + ] + _import_structure["wuerstchen"] = [ + "WuerstchenCombinedPipeline", + "WuerstchenDecoderPipeline", + "WuerstchenPriorPipeline", + ] +try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_onnx_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) +else: + _import_structure["onnx_utils"] = ["OnnxRuntimeModel"] +try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_torch_and_transformers_and_onnx_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects)) +else: + 
_import_structure["stable_diffusion"].extend( + [ + "OnnxStableDiffusionImg2ImgPipeline", + "OnnxStableDiffusionInpaintPipeline", + "OnnxStableDiffusionPipeline", + "OnnxStableDiffusionUpscalePipeline", + "StableDiffusionOnnxPipeline", + ] + ) + +try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import ( + dummy_torch_and_transformers_and_k_diffusion_objects, + ) + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) +else: + _import_structure["stable_diffusion_k_diffusion"] = [ + "StableDiffusionKDiffusionPipeline", + "StableDiffusionXLKDiffusionPipeline", + ] +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_flax_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_objects)) +else: + _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"] +try: + if not (is_flax_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_flax_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) +else: + _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"]) + _import_structure["stable_diffusion"].extend( + [ + "FlaxStableDiffusionImg2ImgPipeline", + "FlaxStableDiffusionInpaintPipeline", + "FlaxStableDiffusionPipeline", + ] + ) + _import_structure["stable_diffusion_xl"].extend( + [ + "FlaxStableDiffusionXLPipeline", + ] + ) + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_pt_objects import * # noqa F403 + + else: + from .auto_pipeline import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ) + from .consistency_models import ConsistencyModelPipeline + from .dance_diffusion import DanceDiffusionPipeline + from .ddim import DDIMPipeline + from .ddpm import DDPMPipeline + from .deprecated import KarrasVePipeline, LDMPipeline, PNDMPipeline, RePaintPipeline, ScoreSdeVePipeline + from .dit import DiTPipeline + from .latent_diffusion import LDMSuperResolutionPipeline + from .pipeline_utils import ( + AudioPipelineOutput, + DiffusionPipeline, + ImagePipelineOutput, + StableDiffusionMixin, + ) + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_librosa_objects import * + else: + from .deprecated import AudioDiffusionPipeline, Mel + + try: + if not (is_torch_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_objects import * + else: + from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline + from .animatediff import AnimateDiffPipeline, AnimateDiffVideoToVideoPipeline + from .audioldm import AudioLDMPipeline + from .audioldm2 import ( + AudioLDM2Pipeline, + AudioLDM2ProjectionModel, + AudioLDM2UNet2DConditionModel, + ) + from .blip_diffusion import BlipDiffusionPipeline + from .controlnet import ( + BlipDiffusionControlNetPipeline, + 
StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPipeline, + ) + from .deepfloyd_if import ( + IFImg2ImgPipeline, + IFImg2ImgSuperResolutionPipeline, + IFInpaintingPipeline, + IFInpaintingSuperResolutionPipeline, + IFPipeline, + IFSuperResolutionPipeline, + ) + from .deprecated import ( + AltDiffusionImg2ImgPipeline, + AltDiffusionPipeline, + CycleDiffusionPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionModelEditingPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPix2PixZeroPipeline, + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + VQDiffusionPipeline, + ) + from .i2vgen_xl import I2VGenXLPipeline + from .kandinsky import ( + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyImg2ImgPipeline, + KandinskyInpaintCombinedPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, + ) + from .kandinsky2_2 import ( + KandinskyV22CombinedPipeline, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgCombinedPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintCombinedPipeline, + KandinskyV22InpaintPipeline, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + ) + from .kandinsky3 import ( + Kandinsky3Img2ImgPipeline, + Kandinsky3Pipeline, + ) + from .latent_consistency_models import ( + LatentConsistencyModelImg2ImgPipeline, + LatentConsistencyModelPipeline, + ) + from .latent_diffusion import LDMTextToImagePipeline + from .ledits_pp import ( + LEditsPPDiffusionPipelineOutput, + LEditsPPInversionPipelineOutput, + LEditsPPPipelineStableDiffusion, + LEditsPPPipelineStableDiffusionXL, + ) + from .musicldm import MusicLDMPipeline + from .paint_by_example import PaintByExamplePipeline + from .pia import PIAPipeline + from .pixart_alpha import PixArtAlphaPipeline + from .semantic_stable_diffusion import SemanticStableDiffusionPipeline + from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline + from .stable_cascade import ( + StableCascadeCombinedPipeline, + StableCascadeDecoderPipeline, + StableCascadePriorPipeline, + ) + from .stable_diffusion import ( + CLIPImageProjection, + StableDiffusionDepth2ImgPipeline, + StableDiffusionImageVariationPipeline, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionInstructPix2PixPipeline, + StableDiffusionLatentUpscalePipeline, + StableDiffusionPipeline, + StableDiffusionUpscalePipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) + from .stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline + from .stable_diffusion_diffedit import StableDiffusionDiffEditPipeline + from .stable_diffusion_gligen import StableDiffusionGLIGENPipeline, StableDiffusionGLIGENTextImagePipeline + from .stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline + from .stable_diffusion_panorama import StableDiffusionPanoramaPipeline + from .stable_diffusion_safe import StableDiffusionPipelineSafe + from .stable_diffusion_sag import StableDiffusionSAGPipeline + from .stable_diffusion_xl import ( + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLInstructPix2PixPipeline, + StableDiffusionXLPipeline, + ) + from 
.stable_video_diffusion import StableVideoDiffusionPipeline + from .t2i_adapter import ( + StableDiffusionAdapterPipeline, + StableDiffusionXLAdapterPipeline, + ) + from .text_to_video_synthesis import ( + TextToVideoSDPipeline, + TextToVideoZeroPipeline, + TextToVideoZeroSDXLPipeline, + VideoToVideoSDPipeline, + ) + from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline + from .unidiffuser import ( + ImageTextPipelineOutput, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, + ) + from .wuerstchen import ( + WuerstchenCombinedPipeline, + WuerstchenDecoderPipeline, + WuerstchenPriorPipeline, + ) + + try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_onnx_objects import * # noqa F403 + + else: + from .onnx_utils import OnnxRuntimeModel + + try: + if not (is_torch_available() and is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_and_onnx_objects import * + else: + from .stable_diffusion import ( + OnnxStableDiffusionImg2ImgPipeline, + OnnxStableDiffusionInpaintPipeline, + OnnxStableDiffusionPipeline, + OnnxStableDiffusionUpscalePipeline, + StableDiffusionOnnxPipeline, + ) + + try: + if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import * + else: + from .stable_diffusion_k_diffusion import ( + StableDiffusionKDiffusionPipeline, + StableDiffusionXLKDiffusionPipeline, + ) + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_objects import * # noqa F403 + else: + from .pipeline_flax_utils import FlaxDiffusionPipeline + + try: + if not (is_flax_available() and is_transformers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_and_transformers_objects import * + else: + from .controlnet import FlaxStableDiffusionControlNetPipeline + from .stable_diffusion import ( + FlaxStableDiffusionImg2ImgPipeline, + FlaxStableDiffusionInpaintPipeline, + FlaxStableDiffusionPipeline, + ) + from .stable_diffusion_xl import ( + FlaxStableDiffusionXLPipeline, + ) + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + + else: + from .deprecated import ( + MidiProcessor, + SpectrogramDiffusionPipeline, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/__init__.py new file mode 100644 index 000000000..3c4d07a42 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/__init__.py @@ -0,0 +1,62 @@ +from typing import TYPE_CHECKING + +from ...utils 
import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + AmusedImg2ImgPipeline, + AmusedInpaintPipeline, + AmusedPipeline, + ) + + _dummy_objects.update( + { + "AmusedPipeline": AmusedPipeline, + "AmusedImg2ImgPipeline": AmusedImg2ImgPipeline, + "AmusedInpaintPipeline": AmusedInpaintPipeline, + } + ) +else: + _import_structure["pipeline_amused"] = ["AmusedPipeline"] + _import_structure["pipeline_amused_img2img"] = ["AmusedImg2ImgPipeline"] + _import_structure["pipeline_amused_inpaint"] = ["AmusedInpaintPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + AmusedPipeline, + ) + else: + from .pipeline_amused import AmusedPipeline + from .pipeline_amused_img2img import AmusedImg2ImgPipeline + from .pipeline_amused_inpaint import AmusedInpaintPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py new file mode 100644 index 000000000..aa682b46f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py @@ -0,0 +1,328 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...models import UVit2DModel, VQModel +from ...schedulers import AmusedScheduler +from ...utils import replace_example_docstring +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import AmusedPipeline + + >>> pipe = AmusedPipeline.from_pretrained( + ... "amused/amused-512", variant="fp16", torch_dtype=torch.float16 + ... 
) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class AmusedPipeline(DiffusionPipeline): + image_processor: VaeImageProcessor + vqvae: VQModel + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModelWithProjection + transformer: UVit2DModel + scheduler: AmusedScheduler + + model_cpu_offload_seq = "text_encoder->transformer->vqvae" + + def __init__( + self, + vqvae: VQModel, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + transformer: UVit2DModel, + scheduler: AmusedScheduler, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[List[str], str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 12, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.IntTensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.transformer.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 16): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. 
+ latents (`torch.IntTensor`, *optional*): + Pre-generated tokens representing latent vectors in `self.vqvae`, to be used as inputs for image + gneration. If not provided, the starting latents will be completely masked. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. A single vector from the + pooled and projected final hidden states. + encoder_hidden_states (`torch.FloatTensor`, *optional*): + Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + Analogous to `encoder_hidden_states` for the positive prompt. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ + and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. + + Examples: + + Returns: + [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images. 
+ """ + if (prompt_embeds is not None and encoder_hidden_states is None) or ( + prompt_embeds is None and encoder_hidden_states is not None + ): + raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither") + + if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or ( + negative_prompt_embeds is None and negative_encoder_hidden_states is not None + ): + raise ValueError( + "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither" + ) + + if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None): + raise ValueError("pass only one of `prompt` or `prompt_embeds`") + + if isinstance(prompt, str): + prompt = [prompt] + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + if height is None: + height = self.transformer.config.sample_size * self.vae_scale_factor + + if width is None: + width = self.transformer.config.sample_size * self.vae_scale_factor + + if prompt_embeds is None: + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + + prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + if guidance_scale > 1.0: + if negative_prompt_embeds is None: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + input_ids = self.tokenizer( + negative_prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + negative_prompt_embeds = outputs.text_embeds + negative_encoder_hidden_states = outputs.hidden_states[-2] + + negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1) + negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds]) + encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states]) + + # Note that the micro conditionings _do_ flip the order of width, height for the original size + # and the crop coordinates. 
This is how it was done in the original code base + micro_conds = torch.tensor( + [ + width, + height, + micro_conditioning_crop_coord[0], + micro_conditioning_crop_coord[1], + micro_conditioning_aesthetic_score, + ], + device=self._execution_device, + dtype=encoder_hidden_states.dtype, + ) + micro_conds = micro_conds.unsqueeze(0) + micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1) + + shape = (batch_size, height // self.vae_scale_factor, width // self.vae_scale_factor) + + if latents is None: + latents = torch.full( + shape, self.scheduler.config.mask_token_id, dtype=torch.long, device=self._execution_device + ) + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + + num_warmup_steps = len(self.scheduler.timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, timestep in enumerate(self.scheduler.timesteps): + if guidance_scale > 1.0: + model_input = torch.cat([latents] * 2) + else: + model_input = latents + + model_output = self.transformer( + model_input, + micro_conds=micro_conds, + pooled_text_emb=prompt_embeds, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + ) + + if guidance_scale > 1.0: + uncond_logits, cond_logits = model_output.chunk(2) + model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + + latents = self.scheduler.step( + model_output=model_output, + timestep=timestep, + sample=latents, + generator=generator, + ).prev_sample + + if i == len(self.scheduler.timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents) + + if output_type == "latent": + output = latents + else: + needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast + + if needs_upcasting: + self.vqvae.float() + + output = self.vqvae.decode( + latents, + force_not_quantize=True, + shape=( + batch_size, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + self.vqvae.config.latent_channels, + ), + ).sample.clip(0, 1) + output = self.image_processor.postprocess(output, output_type) + + if needs_upcasting: + self.vqvae.half() + + self.maybe_free_model_hooks() + + if not return_dict: + return (output,) + + return ImagePipelineOutput(output) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py new file mode 100644 index 000000000..444d6354b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py @@ -0,0 +1,347 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...models import UVit2DModel, VQModel +from ...schedulers import AmusedScheduler +from ...utils import replace_example_docstring +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import AmusedImg2ImgPipeline + >>> from diffusers.utils import load_image + + >>> pipe = AmusedImg2ImgPipeline.from_pretrained( + ... "amused/amused-512", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "winter mountains" + >>> input_image = ( + ... load_image( + ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains.jpg" + ... ) + ... .resize((512, 512)) + ... .convert("RGB") + ... ) + >>> image = pipe(prompt, input_image).images[0] + ``` +""" + + +class AmusedImg2ImgPipeline(DiffusionPipeline): + image_processor: VaeImageProcessor + vqvae: VQModel + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModelWithProjection + transformer: UVit2DModel + scheduler: AmusedScheduler + + model_cpu_offload_seq = "text_encoder->transformer->vqvae" + + # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before + # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter + # off the meta device. There should be a way to fix this instead of just not offloading it + _exclude_from_cpu_offload = ["vqvae"] + + def __init__( + self, + vqvae: VQModel, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + transformer: UVit2DModel, + scheduler: AmusedScheduler, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[List[str], str]] = None, + image: PipelineImageInput = None, + strength: float = 0.5, + num_inference_steps: int = 12, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + prompt_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. 
If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.5): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 16): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. A single vector from the + pooled and projected final hidden states. + encoder_hidden_states (`torch.FloatTensor`, *optional*): + Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + Analogous to `encoder_hidden_states` for the positive prompt. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 
+ callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the laion aesthetic classifier. See https://laion.ai/blog/laion-aesthetics/ + and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler` see `AmusedScheduler#set_timesteps`. + + Examples: + + Returns: + [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images. + """ + + if (prompt_embeds is not None and encoder_hidden_states is None) or ( + prompt_embeds is None and encoder_hidden_states is not None + ): + raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither") + + if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or ( + negative_prompt_embeds is None and negative_encoder_hidden_states is not None + ): + raise ValueError( + "pass either both `negatve_prompt_embeds` and `negative_encoder_hidden_states` or neither" + ) + + if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None): + raise ValueError("pass only one of `prompt` or `prompt_embeds`") + + if isinstance(prompt, str): + prompt = [prompt] + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + if prompt_embeds is None: + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + + prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + if guidance_scale > 1.0: + if negative_prompt_embeds is None: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + input_ids = self.tokenizer( + negative_prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + negative_prompt_embeds = outputs.text_embeds + negative_encoder_hidden_states = outputs.hidden_states[-2] + + negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1) + 
negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds]) + encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states]) + + image = self.image_processor.preprocess(image) + + height, width = image.shape[-2:] + + # Note that the micro conditionings _do_ flip the order of width, height for the original size + # and the crop coordinates. This is how it was done in the original code base + micro_conds = torch.tensor( + [ + width, + height, + micro_conditioning_crop_coord[0], + micro_conditioning_crop_coord[1], + micro_conditioning_aesthetic_score, + ], + device=self._execution_device, + dtype=encoder_hidden_states.dtype, + ) + + micro_conds = micro_conds.unsqueeze(0) + micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1) + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + num_inference_steps = int(len(self.scheduler.timesteps) * strength) + start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps + + needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast + + if needs_upcasting: + self.vqvae.float() + + latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents + latents_bsz, channels, latents_height, latents_width = latents.shape + latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width) + latents = self.scheduler.add_noise( + latents, self.scheduler.timesteps[start_timestep_idx - 1], generator=generator + ) + latents = latents.repeat(num_images_per_prompt, 1, 1) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i in range(start_timestep_idx, len(self.scheduler.timesteps)): + timestep = self.scheduler.timesteps[i] + + if guidance_scale > 1.0: + model_input = torch.cat([latents] * 2) + else: + model_input = latents + + model_output = self.transformer( + model_input, + micro_conds=micro_conds, + pooled_text_emb=prompt_embeds, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + ) + + if guidance_scale > 1.0: + uncond_logits, cond_logits = model_output.chunk(2) + model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + + latents = self.scheduler.step( + model_output=model_output, + timestep=timestep, + sample=latents, + generator=generator, + ).prev_sample + + if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents) + + if output_type == "latent": + output = latents + else: + output = self.vqvae.decode( + latents, + force_not_quantize=True, + shape=( + batch_size, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + self.vqvae.config.latent_channels, + ), + ).sample.clip(0, 1) + output = self.image_processor.postprocess(output, output_type) + + if needs_upcasting: + self.vqvae.half() + + self.maybe_free_model_hooks() + + if not return_dict: + return (output,) + + return ImagePipelineOutput(output) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py new file mode 100644 index 000000000..423f5734b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py @@ -0,0 +1,378 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...models import UVit2DModel, VQModel +from ...schedulers import AmusedScheduler +from ...utils import replace_example_docstring +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import AmusedInpaintPipeline + >>> from diffusers.utils import load_image + + >>> pipe = AmusedInpaintPipeline.from_pretrained( + ... "amused/amused-512", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "fall mountains" + >>> input_image = ( + ... load_image( + ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1.jpg" + ... ) + ... .resize((512, 512)) + ... .convert("RGB") + ... ) + >>> mask = ( + ... load_image( + ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/open_muse/mountains_1_mask.png" + ... ) + ... .resize((512, 512)) + ... .convert("L") + ... ) + >>> pipe(prompt, input_image, mask).images[0].save("out.png") + ``` +""" + + +class AmusedInpaintPipeline(DiffusionPipeline): + image_processor: VaeImageProcessor + vqvae: VQModel + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModelWithProjection + transformer: UVit2DModel + scheduler: AmusedScheduler + + model_cpu_offload_seq = "text_encoder->transformer->vqvae" + + # TODO - when calling self.vqvae.quantize, it uses self.vqvae.quantize.embedding.weight before + # the forward method of self.vqvae.quantize, so the hook doesn't get called to move the parameter + # off the meta device. 
There should be a way to fix this instead of just not offloading it + _exclude_from_cpu_offload = ["vqvae"] + + def __init__( + self, + vqvae: VQModel, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + transformer: UVit2DModel, + scheduler: AmusedScheduler, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_normalize=False) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, + do_normalize=False, + do_binarize=True, + do_convert_grayscale=True, + do_resize=True, + ) + self.scheduler.register_to_config(masking_schedule="linear") + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[List[str], str]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + strength: float = 1.0, + num_inference_steps: int = 12, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[torch.Generator] = None, + prompt_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + negative_encoder_hidden_states: Optional[torch.Tensor] = None, + output_type="pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + micro_conditioning_aesthetic_score: int = 6, + micro_conditioning_crop_coord: Tuple[int, int] = (0, 0), + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. 
+ strength (`float`, *optional*, defaults to 1.0): + Indicates the extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 12): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. A single vector from the + pooled and projected final hidden states. + encoder_hidden_states (`torch.FloatTensor`, *optional*): + Pre-generated penultimate hidden states from the text encoder providing additional text conditioning. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + negative_encoder_hidden_states (`torch.FloatTensor`, *optional*): + Analogous to `encoder_hidden_states` for the positive prompt. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return an [`~pipelines.pipeline_utils.ImagePipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that is called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + micro_conditioning_aesthetic_score (`int`, *optional*, defaults to 6): + The targeted aesthetic score according to the LAION aesthetic classifier.
See https://laion.ai/blog/laion-aesthetics/ + and the micro-conditioning section of https://arxiv.org/abs/2307.01952. + micro_conditioning_crop_coord (`Tuple[int]`, *optional*, defaults to (0, 0)): + The targeted height, width crop coordinates. See the micro-conditioning section of https://arxiv.org/abs/2307.01952. + temperature (`Union[int, Tuple[int, int], List[int]]`, *optional*, defaults to (2, 0)): + Configures the temperature scheduler on `self.scheduler`; see `AmusedScheduler#set_timesteps`. + + Examples: + + Returns: + [`~pipelines.pipeline_utils.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pipeline_utils.ImagePipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images. + """ + + if (prompt_embeds is not None and encoder_hidden_states is None) or ( + prompt_embeds is None and encoder_hidden_states is not None + ): + raise ValueError("pass either both `prompt_embeds` and `encoder_hidden_states` or neither") + + if (negative_prompt_embeds is not None and negative_encoder_hidden_states is None) or ( + negative_prompt_embeds is None and negative_encoder_hidden_states is not None + ): + raise ValueError( + "pass either both `negative_prompt_embeds` and `negative_encoder_hidden_states` or neither" + ) + + if (prompt is None and prompt_embeds is None) or (prompt is not None and prompt_embeds is not None): + raise ValueError("pass only one of `prompt` or `prompt_embeds`") + + if isinstance(prompt, str): + prompt = [prompt] + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + if prompt_embeds is None: + input_ids = self.tokenizer( + prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + prompt_embeds = outputs.text_embeds + encoder_hidden_states = outputs.hidden_states[-2] + + prompt_embeds = prompt_embeds.repeat(num_images_per_prompt, 1) + encoder_hidden_states = encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + if guidance_scale > 1.0: + if negative_prompt_embeds is None: + if negative_prompt is None: + negative_prompt = [""] * len(prompt) + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + input_ids = self.tokenizer( + negative_prompt, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=self.tokenizer.model_max_length, + ).input_ids.to(self._execution_device) + + outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True) + negative_prompt_embeds = outputs.text_embeds + negative_encoder_hidden_states = outputs.hidden_states[-2] + + negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1) + negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1) + + prompt_embeds = torch.concat([negative_prompt_embeds, prompt_embeds]) + encoder_hidden_states = torch.concat([negative_encoder_hidden_states, encoder_hidden_states]) + + image = self.image_processor.preprocess(image) + + height, width = image.shape[-2:] + + # Note that the micro conditionings _do_ flip the order of width, height for the original size + # and the crop coordinates.
This is how it was done in the original code base + micro_conds = torch.tensor( + [ + width, + height, + micro_conditioning_crop_coord[0], + micro_conditioning_crop_coord[1], + micro_conditioning_aesthetic_score, + ], + device=self._execution_device, + dtype=encoder_hidden_states.dtype, + ) + + micro_conds = micro_conds.unsqueeze(0) + micro_conds = micro_conds.expand(2 * batch_size if guidance_scale > 1.0 else batch_size, -1) + + self.scheduler.set_timesteps(num_inference_steps, temperature, self._execution_device) + num_inference_steps = int(len(self.scheduler.timesteps) * strength) + start_timestep_idx = len(self.scheduler.timesteps) - num_inference_steps + + needs_upcasting = self.vqvae.dtype == torch.float16 and self.vqvae.config.force_upcast + + if needs_upcasting: + self.vqvae.float() + + latents = self.vqvae.encode(image.to(dtype=self.vqvae.dtype, device=self._execution_device)).latents + latents_bsz, channels, latents_height, latents_width = latents.shape + latents = self.vqvae.quantize(latents)[2][2].reshape(latents_bsz, latents_height, latents_width) + + mask = self.mask_processor.preprocess( + mask_image, height // self.vae_scale_factor, width // self.vae_scale_factor + ) + mask = mask.reshape(mask.shape[0], latents_height, latents_width).bool().to(latents.device) + latents[mask] = self.scheduler.config.mask_token_id + + starting_mask_ratio = mask.sum() / latents.numel() + + latents = latents.repeat(num_images_per_prompt, 1, 1) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i in range(start_timestep_idx, len(self.scheduler.timesteps)): + timestep = self.scheduler.timesteps[i] + + if guidance_scale > 1.0: + model_input = torch.cat([latents] * 2) + else: + model_input = latents + + model_output = self.transformer( + model_input, + micro_conds=micro_conds, + pooled_text_emb=prompt_embeds, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + ) + + if guidance_scale > 1.0: + uncond_logits, cond_logits = model_output.chunk(2) + model_output = uncond_logits + guidance_scale * (cond_logits - uncond_logits) + + latents = self.scheduler.step( + model_output=model_output, + timestep=timestep, + sample=latents, + generator=generator, + starting_mask_ratio=starting_mask_ratio, + ).prev_sample + + if i == len(self.scheduler.timesteps) - 1 or ((i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, timestep, latents) + + if output_type == "latent": + output = latents + else: + output = self.vqvae.decode( + latents, + force_not_quantize=True, + shape=( + batch_size, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + self.vqvae.config.latent_channels, + ), + ).sample.clip(0, 1) + output = self.image_processor.postprocess(output, output_type) + + if needs_upcasting: + self.vqvae.half() + + self.maybe_free_model_hooks() + + if not return_dict: + return (output,) + + return ImagePipelineOutput(output) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/__init__.py new file mode 100644 index 000000000..35b99a76f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/__init__.py @@ -0,0 +1,49 @@ +from typing import TYPE_CHECKING + +from 
...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {"pipeline_output": ["AnimateDiffPipelineOutput"]} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"] + _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .pipeline_animatediff import AnimateDiffPipeline + from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline + from .pipeline_output import AnimateDiffPipelineOutput + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py new file mode 100644 index 000000000..cd7f0a283 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py @@ -0,0 +1,847 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
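+ + # Rough shape sketch (illustrative, based on the helpers defined in this file): `prepare_latents` samples noise of + # shape (batch, channels, num_frames, height // vae_scale_factor, width // vae_scale_factor); `decode_latents` maps + # denoised latents back to a (batch, 3, num_frames, height, width) video tensor; `tensor2vid` then converts that + # tensor into per-frame `np`, `pt`, or `pil` outputs. `vae_scale_factor` is typically 8 for Stable Diffusion VAEs.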
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...models.unets.unet_motion_model import MotionAdapter +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..free_init_utils import FreeInitMixin +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import AnimateDiffPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import MotionAdapter, AnimateDiffPipeline, DDIMScheduler + >>> from diffusers.utils import export_to_gif + + >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2") + >>> pipe = AnimateDiffPipeline.from_pretrained("frankjoshua/toonyou_beta6", motion_adapter=adapter) + >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False) + >>> output = pipe(prompt="A corgi walking in the park") + >>> frames = output.frames[0] + >>> export_to_gif(frames, "animation.gif") + ``` +""" + + +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +class AnimateDiffPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FreeInitMixin, +): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
+ text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + motion_adapter: MotionAdapter, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + if isinstance(unet, UNet2DConditionModel): + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
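+ # For example, with `clip_skip=1` the indexing above selects `hidden_states[-2]` (the penultimate encoder layer); + # the text model's final LayerNorm is then applied below so the skipped-layer output stays in the expected space.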
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead 
and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_frames: Optional[int] = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. 
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of the same length as the number of IP-Adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return an [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+ clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1 + for free_init_iter in range(num_free_init_iters): + if self.free_init_enabled: + latents, timesteps = self._apply_free_init( + latents, free_init_iter, num_inference_steps, device, latents.dtype, generator + ) + + self._num_timesteps = len(timesteps) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 8. Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # 9. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # 10. 
Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffPipelineOutput(frames=video) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py new file mode 100644 index 000000000..cb6b71351 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py @@ -0,0 +1,997 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...models.unets.unet_motion_model import MotionAdapter +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..free_init_utils import FreeInitMixin +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import AnimateDiffPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import imageio + >>> import requests + >>> import torch + >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter + >>> from diffusers.utils import export_to_gif + >>> from io import BytesIO + >>> from PIL import Image + + >>> adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16) + >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained("SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter).to("cuda") + >>> pipe.scheduler = DDIMScheduler(beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace") + + >>> def load_video(file_path: str): + ... images = [] + ... + ... if file_path.startswith(('http://', 'https://')): + ... # If the file_path is a URL + ... response = requests.get(file_path) + ... response.raise_for_status() + ... content = BytesIO(response.content) + ... vid = imageio.get_reader(content) + ... else: + ... # Assuming it's a local file path + ... vid = imageio.get_reader(file_path) + ... + ... for frame in vid: + ... 
pil_image = Image.fromarray(frame) + ... images.append(pil_image) + ... + ... return images + + >>> video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif") + >>> output = pipe(video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5) + >>> frames = output.frames[0] + >>> export_to_gif(frames, "animation.gif") + ``` +""" + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor, output_type="np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class AnimateDiffVideoToVideoPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FreeInitMixin, +): + r""" + Pipeline for video-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
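As a quick illustration of the `retrieve_timesteps` helper defined above: it can be driven either by a step count or by an explicit schedule. The snippet below is a minimal illustrative sketch, assuming `retrieve_timesteps` from this file is in scope and using a plain `DDIMScheduler`; the explicit-schedule path only works for schedulers whose `set_timesteps` accepts a `timesteps` argument, otherwise the `ValueError` shown above is raised.

```py
import inspect

from diffusers import DDIMScheduler

scheduler = DDIMScheduler()

# Mode 1: derive the schedule from a step count.
timesteps, num_steps = retrieve_timesteps(scheduler, num_inference_steps=25, device="cpu")
print(num_steps, timesteps[:3])

# Mode 2: pass an explicit schedule instead of a step count. Guarded, because only
# schedulers whose `set_timesteps` accepts `timesteps` support this (DDIMScheduler does not).
if "timesteps" in inspect.signature(scheduler.set_timesteps).parameters:
    timesteps, num_steps = retrieve_timesteps(scheduler, device="cpu", timesteps=[800, 600, 400, 200, 0])
```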
+ """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + motion_adapter: MotionAdapter, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + if isinstance(unet, UNet2DConditionModel): + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
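Because `encode_prompt` is a public method, embeddings can be pre-computed once and reused across calls. The following is an illustrative sketch of that pattern, assuming `pipe` is an already-loaded `AnimateDiffVideoToVideoPipeline`; the prompt strings are arbitrary.

```py
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    prompt="panda playing a guitar, high quality",
    device=device,
    num_images_per_prompt=1,           # one video per prompt
    do_classifier_free_guidance=True,  # also returns the unconditional embeddings
    negative_prompt="low quality, blurry",
)

# The cached embeddings can then be fed back in through `prompt_embeds=` /
# `negative_prompt_embeds=` on later pipeline calls.
```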
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
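The `clip_skip` indexing used above is easy to misread, so here is a small self-contained sketch of what it selects; the checkpoint name is only an example, and the snippet mirrors (rather than reproduces) the logic above.

```py
from transformers import CLIPTextModel, CLIPTokenizer

tok = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
enc = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")

ids = tok(["a photo of a cat"], return_tensors="pt").input_ids
out = enc(ids, output_hidden_states=True)

clip_skip = 1                            # 1 == use the pre-final transformer layer
h = out.hidden_states[-(clip_skip + 1)]  # hidden_states holds the embedding output plus every layer output
h = enc.text_model.final_layer_norm(h)   # re-apply the final LayerNorm, as in the code above
print(h.shape)                           # (1, sequence_length, hidden_size)
```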
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead 
and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + strength, + height, + width, + video=None, + latents=None, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if video is not None and latents is not None: + raise ValueError("Only one of `video` or `latents` should be provided") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def get_timesteps(self, num_inference_steps, timesteps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + def prepare_latents( + self, + video, + height, + width, + num_channels_latents, + batch_size, + timestep, + dtype, + device, + generator, + latents=None, + ): + # video must be a list of list of images + # the outer list denotes having multiple videos as input, whereas inner list means the frames of the video + # as a list of images + if not isinstance(video[0], list): + video = [video] + if latents is None: + video = torch.cat( + [self.image_processor.preprocess(vid, height=height, width=width).unsqueeze(0) for vid in video], dim=0 + ) + video = video.to(device=device, dtype=dtype) + num_frames = video.shape[1] + else: + num_frames = latents.shape[2] + + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + video = video.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + if len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + init_latents = [ + retrieve_latents(self.vae.encode(video[i]), generator=generator[i]).unsqueeze(0) + for i in range(batch_size) + ] + else: + init_latents = [ + retrieve_latents(self.vae.encode(vid), generator=generator).unsqueeze(0) for vid in video + ] + + init_latents = torch.cat(init_latents, dim=0) + + # restore vae to original dtype + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + error_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Please make sure to update your script to pass as many initial images as text prompts" + ) + raise ValueError(error_message) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) + latents = self.scheduler.add_noise(init_latents, noise, timestep).permute(0, 2, 1, 3, 4) + else: + if shape != latents.shape: + # [B, C, F, H, W] + raise ValueError(f"`latents` expected to have {shape=}, but found {latents.shape=}") + latents = latents.to(device, dtype=dtype) + + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + video: List[List[PipelineImageInput]] = None, + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 7.5, + strength: float = 0.8, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + r""" + The call function to the pipeline for generation. + + Args: + video (`List[PipelineImageInput]`): + The input video to condition the generation on. Must be a list of images/frames of the video. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + strength (`float`, *optional*, defaults to 0.8): + Higher strength leads to more differences between original video and generated video. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. 
If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`AnimateDiffPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. 
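Since the callback contract is hard to infer from the signature alone, here is an illustrative sketch of a `callback_on_step_end` that inspects latents during denoising; `pipe` and `video` are assumed to exist as in the example docstring at the top of this file.

```py
def log_latents(pipeline, step, timestep, callback_kwargs):
    # Only tensors listed in `callback_on_step_end_tensor_inputs` appear here.
    latents = callback_kwargs["latents"]
    if step % 10 == 0:
        print(f"step {step:03d} (t={int(timestep)}): latents std = {latents.std().item():.4f}")
    # Whatever is returned is written back into the denoising loop.
    return callback_kwargs

output = pipe(
    video=video,
    prompt="panda playing a guitar, watercolor style",
    strength=0.6,
    callback_on_step_end=log_latents,
    callback_on_step_end_tensor_inputs=["latents"],
)
```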
+ + Examples: + + Returns: + [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + strength=strength, + height=height, + width=width, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + video=video, + latents=latents, + ip_adapter_image=ip_adapter_image, + ip_adapter_image_embeds=ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + video=video, + height=height, + width=width, + num_channels_latents=num_channels_latents, + batch_size=batch_size * num_videos_per_prompt, + timestep=latent_timestep, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1 + for free_init_iter in range(num_free_init_iters): + if self.free_init_enabled: + latents, timesteps = self._apply_free_init( + latents, free_init_iter, num_inference_steps, device, latents.dtype, generator + ) + num_inference_steps = len(timesteps) + # make sure to readjust timesteps based on strength + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device) + + self._num_timesteps = len(timesteps) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 8. Denoising loop + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 9. Post-processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # 10. Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return AnimateDiffPipelineOutput(frames=video) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py new file mode 100644 index 000000000..184a45848 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image +import torch + +from ...utils import BaseOutput + + +@dataclass +class AnimateDiffPipelineOutput(BaseOutput): + r""" + Output class for AnimateDiff pipelines. 
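Stepping back to the guidance update inside the denoising loop above: the combined prediction is the unconditional prediction pushed toward the text-conditioned one by `guidance_scale`. A toy numeric sketch with made-up values:

```py
import torch

noise_pred_uncond = torch.tensor([0.10, 0.20])  # made-up values
noise_pred_text = torch.tensor([0.30, 0.10])
guidance_scale = 7.5

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(noise_pred)  # tensor([ 1.6000, -0.5500])
```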
+ + Args: + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)` + """ + + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/__init__.py new file mode 100644 index 000000000..a002b4aa7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/__init__.py @@ -0,0 +1,51 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + AudioLDMPipeline, + ) + + _dummy_objects.update({"AudioLDMPipeline": AudioLDMPipeline}) +else: + _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + AudioLDMPipeline, + ) + + else: + from .pipeline_audioldm import AudioLDMPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py new file mode 100644 index 000000000..69bebdd0d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py @@ -0,0 +1,546 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
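One note on the output class defined above before moving on to AudioLDM: `frames` is nested per video, so a single-prompt run is consumed via `output.frames[0]`. An illustrative sketch, assuming `output` came from an AnimateDiff pipeline run with `output_type="pil"`:

```py
from diffusers.utils import export_to_gif

frames = output.frames[0]           # PIL frames for the first video in the batch
print(len(frames), frames[0].size)  # num_frames, (width, height)
export_to_gif(frames, "animation.gif")
```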
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +import torch.nn.functional as F +from transformers import ClapTextModelWithProjection, RobertaTokenizer, RobertaTokenizerFast, SpeechT5HifiGan + +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import AudioLDMPipeline + >>> import torch + >>> import scipy + + >>> repo_id = "cvssp/audioldm-s-full-v2" + >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" + >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] + + >>> # save the audio sample as a .wav file + >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) + ``` +""" + + +class AudioLDMPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for text-to-audio generation using AudioLDM. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.ClapTextModelWithProjection`]): + Frozen text-encoder (`ClapTextModelWithProjection`, specifically the + [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. + tokenizer ([`PreTrainedTokenizer`]): + A [`~transformers.RobertaTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded audio latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + vocoder ([`~transformers.SpeechT5HifiGan`]): + Vocoder of class `SpeechT5HifiGan`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: ClapTextModelWithProjection, + tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + vocoder: SpeechT5HifiGan, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + vocoder=vocoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def _encode_prompt( + self, + prompt, + device, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
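One difference from the CLIP-based pipelines earlier in this patch is worth calling out before the body of `_encode_prompt`: AudioLDM conditions on CLAP's pooled, L2-normalised text embedding (one vector per prompt) rather than a token-level sequence. A minimal sketch, assuming `pipe` is the `AudioLDMPipeline` from the example docstring above:

```py
import torch.nn.functional as F

inputs = pipe.tokenizer(
    ["techno music with a strong, upbeat tempo"],
    padding="max_length",
    max_length=pipe.tokenizer.model_max_length,
    truncation=True,
    return_tensors="pt",
)
emb = pipe.text_encoder(
    inputs.input_ids.to(pipe.device),
    attention_mask=inputs.attention_mask.to(pipe.device),
).text_embeds
emb = F.normalize(emb, dim=-1)  # the same L2 normalisation applied in _encode_prompt below
print(emb.shape)                # (batch_size, projection_dim) -- no sequence dimension
```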
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device (`torch.device`): + torch device + num_waveforms_per_prompt (`int`): + number of waveforms that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the audio generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLAP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask.to(device), + ) + prompt_embeds = prompt_embeds.text_embeds + # additional L_2 normalization over each hidden-state + prompt_embeds = F.normalize(prompt_embeds, dim=-1) + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + ( + bs_embed, + seq_len, + ) = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt) + prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + uncond_input_ids = uncond_input.input_ids.to(device) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input_ids, + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds.text_embeds + # additional L_2 normalization over each hidden-state + negative_prompt_embeds = F.normalize(negative_prompt_embeds, dim=-1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + mel_spectrogram = self.vae.decode(latents).sample + return mel_spectrogram + + def mel_spectrogram_to_waveform(self, mel_spectrogram): + if mel_spectrogram.dim() == 4: + mel_spectrogram = mel_spectrogram.squeeze(1) + + waveform = self.vocoder(mel_spectrogram) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + waveform = waveform.cpu().float() + return waveform + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor + if audio_length_in_s < min_audio_length_in_s: + raise ValueError( + f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but " + f"is {audio_length_in_s}." 
+ ) + + if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0: + raise ValueError( + f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the " + f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of " + f"{self.vae_scale_factor}." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim + def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + self.vocoder.config.model_in_dim // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + audio_length_in_s: Optional[float] = None, + num_inference_steps: int = 10, + guidance_scale: float = 2.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_waveforms_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + output_type: Optional[str] = "np", + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`. + audio_length_in_s (`int`, *optional*, defaults to 5.12): + The length of the generated audio sample in seconds. + num_inference_steps (`int`, *optional*, defaults to 10): + The number of denoising steps. More denoising steps usually lead to a higher quality audio at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 2.5): + A higher guidance scale value encourages the model to generate audio that is closely linked to the text + `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in audio generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_waveforms_per_prompt (`int`, *optional*, defaults to 1): + The number of waveforms to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
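The `__call__` body that follows converts `audio_length_in_s` into a spectrogram height before any denoising happens. The numeric sketch below walks through that conversion with assumed config values; the upsample rates, sampling rate, and VAE scale factor are illustrative, not taken from a specific checkpoint.

```py
import numpy as np

upsample_rates = [5, 4, 2, 2, 2]   # assumed vocoder.config.upsample_rates (product = hop length 160)
sampling_rate = 16000              # assumed vocoder.config.sampling_rate
vae_scale_factor = 4               # assumed 2 ** (len(vae.config.block_out_channels) - 1)

# Seconds of audio represented by one mel-spectrogram frame.
vocoder_upsample_factor = np.prod(upsample_rates) / sampling_rate   # 0.01 s

audio_length_in_s = 5.12
height = int(audio_length_in_s / vocoder_upsample_factor)           # 512 frames
if height % vae_scale_factor != 0:
    # Round up so the VAE can downsample the spectrogram cleanly; the decoded waveform
    # is trimmed back to the requested length afterwards.
    height = int(np.ceil(height / vae_scale_factor)) * vae_scale_factor

original_waveform_length = int(audio_length_in_s * sampling_rate)
print(height, original_waveform_length)  # 512 81920
```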
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated image. Choose between `"np"` to return a NumPy `np.ndarray` or + `"pt"` to return a PyTorch `torch.Tensor` object. + + Examples: + + Returns: + [`~pipelines.AudioPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated audio. + """ + # 0. Convert audio input length from seconds to spectrogram height + vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate + + if audio_length_in_s is None: + audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor + + height = int(audio_length_in_s / vocoder_upsample_factor) + + original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) + if height % self.vae_scale_factor != 0: + height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor + logger.info( + f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} " + f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the " + f"denoising process." + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_waveforms_per_prompt, + num_channels_latents, + height, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. 
Prepare extra step kwargs + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=None, + class_labels=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Post-processing + mel_spectrogram = self.decode_latents(latents) + + audio = self.mel_spectrogram_to_waveform(mel_spectrogram) + + audio = audio[:, :original_waveform_length] + + if output_type == "np": + audio = audio.numpy() + + if not return_dict: + return (audio,) + + return AudioPipelineOutput(audios=audio) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/__init__.py new file mode 100644 index 000000000..23cd0e44f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modeling_audioldm2"] = ["AudioLDM2ProjectionModel", "AudioLDM2UNet2DConditionModel"] + _import_structure["pipeline_audioldm2"] = ["AudioLDM2Pipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel + from .pipeline_audioldm2 import AudioLDM2Pipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + 
module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py new file mode 100644 index 000000000..c0b85e4db --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py @@ -0,0 +1,1511 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import UNet2DConditionLoadersMixin +from ...models.activations import get_activation +from ...models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ...models.embeddings import ( + TimestepEmbedding, + Timesteps, +) +from ...models.modeling_utils import ModelMixin +from ...models.resnet import Downsample2D, ResnetBlock2D, Upsample2D +from ...models.transformers.transformer_2d import Transformer2DModel +from ...models.unets.unet_2d_blocks import DownBlock2D, UpBlock2D +from ...models.unets.unet_2d_condition import UNet2DConditionOutput +from ...utils import BaseOutput, is_torch_version, logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def add_special_tokens(hidden_states, attention_mask, sos_token, eos_token): + batch_size = hidden_states.shape[0] + + if attention_mask is not None: + # Add two more steps to attn mask + new_attn_mask_step = attention_mask.new_ones((batch_size, 1)) + attention_mask = torch.concat([new_attn_mask_step, attention_mask, new_attn_mask_step], dim=-1) + + # Add the SOS / EOS tokens at the start / end of the sequence respectively + sos_token = sos_token.expand(batch_size, 1, -1) + eos_token = eos_token.expand(batch_size, 1, -1) + hidden_states = torch.concat([sos_token, hidden_states, eos_token], dim=1) + return hidden_states, attention_mask + + +@dataclass +class AudioLDM2ProjectionModelOutput(BaseOutput): + """ + Args: + Class for AudioLDM2 projection layer's outputs. + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states obtained by linearly projecting the hidden-states for each of the text + encoders and subsequently concatenating them together. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices, formed by concatenating the attention masks + for the two text encoders together. 
Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + """ + + hidden_states: torch.FloatTensor + attention_mask: Optional[torch.LongTensor] = None + + +class AudioLDM2ProjectionModel(ModelMixin, ConfigMixin): + """ + A simple linear projection model to map two text embeddings to a shared latent space. It also inserts learned + embedding vectors at the start and end of each text embedding sequence respectively. Each variable appended with + `_1` refers to that corresponding to the second text encoder. Otherwise, it is from the first. + + Args: + text_encoder_dim (`int`): + Dimensionality of the text embeddings from the first text encoder (CLAP). + text_encoder_1_dim (`int`): + Dimensionality of the text embeddings from the second text encoder (T5 or VITS). + langauge_model_dim (`int`): + Dimensionality of the text embeddings from the language model (GPT2). + """ + + @register_to_config + def __init__(self, text_encoder_dim, text_encoder_1_dim, langauge_model_dim): + super().__init__() + # additional projection layers for each text encoder + self.projection = nn.Linear(text_encoder_dim, langauge_model_dim) + self.projection_1 = nn.Linear(text_encoder_1_dim, langauge_model_dim) + + # learnable SOS / EOS token embeddings for each text encoder + self.sos_embed = nn.Parameter(torch.ones(langauge_model_dim)) + self.eos_embed = nn.Parameter(torch.ones(langauge_model_dim)) + + self.sos_embed_1 = nn.Parameter(torch.ones(langauge_model_dim)) + self.eos_embed_1 = nn.Parameter(torch.ones(langauge_model_dim)) + + def forward( + self, + hidden_states: Optional[torch.FloatTensor] = None, + hidden_states_1: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + attention_mask_1: Optional[torch.LongTensor] = None, + ): + hidden_states = self.projection(hidden_states) + hidden_states, attention_mask = add_special_tokens( + hidden_states, attention_mask, sos_token=self.sos_embed, eos_token=self.eos_embed + ) + + hidden_states_1 = self.projection_1(hidden_states_1) + hidden_states_1, attention_mask_1 = add_special_tokens( + hidden_states_1, attention_mask_1, sos_token=self.sos_embed_1, eos_token=self.eos_embed_1 + ) + + # concatenate clap and t5 text encoding + hidden_states = torch.cat([hidden_states, hidden_states_1], dim=1) + + # concatenate attention masks + if attention_mask is None and attention_mask_1 is not None: + attention_mask = attention_mask_1.new_ones((hidden_states[:2])) + elif attention_mask is not None and attention_mask_1 is None: + attention_mask_1 = attention_mask.new_ones((hidden_states_1[:2])) + + if attention_mask is not None and attention_mask_1 is not None: + attention_mask = torch.cat([attention_mask, attention_mask_1], dim=-1) + else: + attention_mask = None + + return AudioLDM2ProjectionModelOutput( + hidden_states=hidden_states, + attention_mask=attention_mask, + ) + + +class AudioLDM2UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. Compared to the vanilla [`UNet2DConditionModel`], this variant optionally includes an additional + self-attention layer in each Transformer block, as well as multiple cross-attention layers. It also allows for up + to two cross-attention embeddings, `encoder_hidden_states` and `encoder_hidden_states_1`. + + This model inherits from [`ModelMixin`]. 
Check the superclass documentation for its generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for the middle of the UNet; it can only be `UNetMidBlock2DCrossAttn` for AudioLDM2. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + only_cross_attention (`bool` or `Tuple[bool]`, *optional*, defaults to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers are skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim`. + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use, which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`.
+ time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. + conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. + projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." 
+ ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." + ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + if time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError(f"{time_embedding_type} does not exist. 
Please make sure to use `positional`.") + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. + self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. 
The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + else: + raise ValueError( + f"unknown mid_block_type : {mid_block_type}. Should be `UNetMidBlock2DCrossAttn` for AudioLDM2." 
+ ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps + ) + + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d( + block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + ) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. 
+ + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
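+
+        Example (an illustrative sketch; assumes `unet` is an already-instantiated
+        [`AudioLDM2UNet2DConditionModel`]):
+
+        ```py
+        >>> unet.set_attention_slice("auto")  # halve the attention head size so attention runs in two steps
+        >>> unet.set_attention_slice("max")  # run one slice at a time for the largest memory saving
+        ```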
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel._set_gradient_checkpointing + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + encoder_hidden_states_1: Optional[torch.Tensor] = None, + encoder_attention_mask_1: Optional[torch.Tensor] = None, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`AudioLDM2UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + encoder_hidden_states_1 (`torch.FloatTensor`, *optional*): + A second set of encoder hidden states with shape `(batch, sequence_length_2, feature_dim_2)`. Can be + used to condition the model on a different set of embeddings to `encoder_hidden_states`. + encoder_attention_mask_1 (`torch.Tensor`, *optional*): + A cross-attention mask of shape `(batch, sequence_length_2)` is applied to `encoder_hidden_states_1`. + If `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + + Returns: + [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info("Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + if encoder_attention_mask_1 is not None: + encoder_attention_mask_1 = (1 - encoder_attention_mask_1.to(sample.dtype)) * -10000.0 + encoder_attention_mask_1 = encoder_attention_mask_1.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. + class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + encoder_hidden_states_1=encoder_hidden_states_1, + encoder_attention_mask_1=encoder_attention_mask_1, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + encoder_hidden_states_1=encoder_hidden_states_1, + encoder_attention_mask_1=encoder_attention_mask_1, + ) + + # 5. 
up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + encoder_hidden_states_1=encoder_hidden_states_1, + encoder_attention_mask_1=encoder_attention_mask_1, + ) + else: + sample = upsample_block( + hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size + ) + + # 6. post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) + + +def get_down_block( + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + transformer_layers_per_block=1, + num_attention_heads=None, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type + if down_block_type == "DownBlock2D": + return DownBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "CrossAttnDownBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") + return CrossAttnDownBlock2D( + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise ValueError(f"{down_block_type} does not exist.") + + +def get_up_block( + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + transformer_layers_per_block=1, + num_attention_heads=None, + resnet_groups=None, + cross_attention_dim=None, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", +): + 
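+    # Resolve the configured up-block type: an optional "UNetRes" prefix is stripped, then either a
+    # plain UpBlock2D or a CrossAttnUpBlock2D is constructed; any other type raises a ValueError.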
up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + if up_block_type == "UpBlock2D": + return UpBlock2D( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "CrossAttnUpBlock2D": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") + return CrossAttnUpBlock2D( + num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise ValueError(f"{up_block_type} does not exist.") + + +class CrossAttnDownBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + downsample_padding=1, + add_downsample=True, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) + if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4: + raise ValueError( + "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention " + f"dims is less than or equal to 4. 
Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}" + ) + self.cross_attention_dim = cross_attention_dim + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + for j in range(len(cross_attention_dim)): + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim[j], + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + double_self_attention=True if cross_attention_dim[j] is None else False, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + Downsample2D( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states_1: Optional[torch.FloatTensor] = None, + encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + ): + output_states = () + num_layers = len(self.resnets) + num_attention_per_layer = len(self.attentions) // num_layers + + encoder_hidden_states_1 = ( + encoder_hidden_states_1 if encoder_hidden_states_1 is not None else encoder_hidden_states + ) + encoder_attention_mask_1 = ( + encoder_attention_mask_1 if encoder_hidden_states_1 is not None else encoder_attention_mask + ) + + for i in range(num_layers): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.resnets[i]), + hidden_states, + temb, + **ckpt_kwargs, + ) + for idx, cross_attention_dim in enumerate(self.cross_attention_dim): + if cross_attention_dim is not None and idx <= 1: + forward_encoder_hidden_states = encoder_hidden_states + forward_encoder_attention_mask = encoder_attention_mask + elif cross_attention_dim is not None and idx > 1: + forward_encoder_hidden_states = encoder_hidden_states_1 + forward_encoder_attention_mask = encoder_attention_mask_1 + else: + forward_encoder_hidden_states = None + forward_encoder_attention_mask = None + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.attentions[i * num_attention_per_layer + idx], return_dict=False), + hidden_states, + forward_encoder_hidden_states, + None, # timestep + None, # 
class_labels + cross_attention_kwargs, + attention_mask, + forward_encoder_attention_mask, + **ckpt_kwargs, + )[0] + else: + hidden_states = self.resnets[i](hidden_states, temb) + for idx, cross_attention_dim in enumerate(self.cross_attention_dim): + if cross_attention_dim is not None and idx <= 1: + forward_encoder_hidden_states = encoder_hidden_states + forward_encoder_attention_mask = encoder_attention_mask + elif cross_attention_dim is not None and idx > 1: + forward_encoder_hidden_states = encoder_hidden_states_1 + forward_encoder_attention_mask = encoder_attention_mask_1 + else: + forward_encoder_hidden_states = None + forward_encoder_attention_mask = None + hidden_states = self.attentions[i * num_attention_per_layer + idx]( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=forward_encoder_hidden_states, + encoder_attention_mask=forward_encoder_attention_mask, + return_dict=False, + )[0] + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class UNetMidBlock2DCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads=1, + output_scale_factor=1.0, + cross_attention_dim=1280, + use_linear_projection=False, + upcast_attention=False, + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) + if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4: + raise ValueError( + "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention " + f"dims is less than or equal to 4. 
Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}" + ) + self.cross_attention_dim = cross_attention_dim + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + for i in range(num_layers): + for j in range(len(cross_attention_dim)): + attentions.append( + Transformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim[j], + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + double_self_attention=True if cross_attention_dim[j] is None else False, + ) + ) + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states_1: Optional[torch.FloatTensor] = None, + encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + hidden_states = self.resnets[0](hidden_states, temb) + num_attention_per_layer = len(self.attentions) // (len(self.resnets) - 1) + + encoder_hidden_states_1 = ( + encoder_hidden_states_1 if encoder_hidden_states_1 is not None else encoder_hidden_states + ) + encoder_attention_mask_1 = ( + encoder_attention_mask_1 if encoder_hidden_states_1 is not None else encoder_attention_mask + ) + + for i in range(len(self.resnets[1:])): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + for idx, cross_attention_dim in enumerate(self.cross_attention_dim): + if cross_attention_dim is not None and idx <= 1: + forward_encoder_hidden_states = encoder_hidden_states + forward_encoder_attention_mask = encoder_attention_mask + elif cross_attention_dim is not None and idx > 1: + forward_encoder_hidden_states = encoder_hidden_states_1 + forward_encoder_attention_mask = encoder_attention_mask_1 + else: + forward_encoder_hidden_states = None + forward_encoder_attention_mask = None + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.attentions[i * num_attention_per_layer + idx], return_dict=False), + hidden_states, + forward_encoder_hidden_states, + None, # timestep + None, # class_labels + 
cross_attention_kwargs, + attention_mask, + forward_encoder_attention_mask, + **ckpt_kwargs, + )[0] + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.resnets[i + 1]), + hidden_states, + temb, + **ckpt_kwargs, + ) + else: + for idx, cross_attention_dim in enumerate(self.cross_attention_dim): + if cross_attention_dim is not None and idx <= 1: + forward_encoder_hidden_states = encoder_hidden_states + forward_encoder_attention_mask = encoder_attention_mask + elif cross_attention_dim is not None and idx > 1: + forward_encoder_hidden_states = encoder_hidden_states_1 + forward_encoder_attention_mask = encoder_attention_mask_1 + else: + forward_encoder_hidden_states = None + forward_encoder_attention_mask = None + hidden_states = self.attentions[i * num_attention_per_layer + idx]( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=forward_encoder_hidden_states, + encoder_attention_mask=forward_encoder_attention_mask, + return_dict=False, + )[0] + + hidden_states = self.resnets[i + 1](hidden_states, temb) + + return hidden_states + + +class CrossAttnUpBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads=1, + cross_attention_dim=1280, + output_scale_factor=1.0, + add_upsample=True, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) + if isinstance(cross_attention_dim, (list, tuple)) and len(cross_attention_dim) > 4: + raise ValueError( + "Only up to 4 cross-attention layers are supported. Ensure that the length of cross-attention " + f"dims is less than or equal to 4. 
Got cross-attention dims {cross_attention_dim} of length {len(cross_attention_dim)}" + ) + self.cross_attention_dim = cross_attention_dim + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlock2D( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + for j in range(len(cross_attention_dim)): + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block, + cross_attention_dim=cross_attention_dim[j], + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + double_self_attention=True if cross_attention_dim[j] is None else False, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states_1: Optional[torch.FloatTensor] = None, + encoder_attention_mask_1: Optional[torch.FloatTensor] = None, + ): + num_layers = len(self.resnets) + num_attention_per_layer = len(self.attentions) // num_layers + + encoder_hidden_states_1 = ( + encoder_hidden_states_1 if encoder_hidden_states_1 is not None else encoder_hidden_states + ) + encoder_attention_mask_1 = ( + encoder_attention_mask_1 if encoder_hidden_states_1 is not None else encoder_attention_mask + ) + + for i in range(num_layers): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.resnets[i]), + hidden_states, + temb, + **ckpt_kwargs, + ) + for idx, cross_attention_dim in enumerate(self.cross_attention_dim): + if cross_attention_dim is not None and idx <= 1: + forward_encoder_hidden_states = encoder_hidden_states + forward_encoder_attention_mask = encoder_attention_mask + elif cross_attention_dim is not None and idx > 1: + forward_encoder_hidden_states = encoder_hidden_states_1 + forward_encoder_attention_mask = 
encoder_attention_mask_1 + else: + forward_encoder_hidden_states = None + forward_encoder_attention_mask = None + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.attentions[i * num_attention_per_layer + idx], return_dict=False), + hidden_states, + forward_encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + forward_encoder_attention_mask, + **ckpt_kwargs, + )[0] + else: + hidden_states = self.resnets[i](hidden_states, temb) + for idx, cross_attention_dim in enumerate(self.cross_attention_dim): + if cross_attention_dim is not None and idx <= 1: + forward_encoder_hidden_states = encoder_hidden_states + forward_encoder_attention_mask = encoder_attention_mask + elif cross_attention_dim is not None and idx > 1: + forward_encoder_hidden_states = encoder_hidden_states_1 + forward_encoder_attention_mask = encoder_attention_mask_1 + else: + forward_encoder_hidden_states = None + forward_encoder_attention_mask = None + hidden_states = self.attentions[i * num_attention_per_layer + idx]( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=forward_encoder_hidden_states, + encoder_attention_mask=forward_encoder_attention_mask, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py new file mode 100644 index 000000000..e01aa9929 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py @@ -0,0 +1,980 @@ +# Copyright 2024 CVSSP, ByteDance and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
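The UNet blocks above interleave one `Transformer2DModel` per entry of `cross_attention_dim` after every ResNet, and their `forward` methods route the first two cross-attention entries to `encoder_hidden_states` and any later entries to `encoder_hidden_states_1`, falling back to pure self-attention when an entry is `None` (`double_self_attention=True`). The following standalone sketch only mirrors that routing rule for reference; the helper name and the example tuple are hypothetical and are not part of this patch.

```py
# Illustrative only: mirrors the conditioning routing in the forward() methods above.
from typing import Optional, Sequence

def route_conditioning(cross_attention_dim: Sequence[Optional[int]], idx: int) -> str:
    """Return which conditioning stream the idx-th attention of a layer consumes."""
    if cross_attention_dim[idx] is None:
        return "self-attention only"        # built with double_self_attention=True
    if idx <= 1:
        return "encoder_hidden_states"      # first conditioning stream
    return "encoder_hidden_states_1"        # second conditioning stream

# Hypothetical 3-entry tuple (the blocks accept at most 4 entries):
dims = (768, None, 1024)
for i in range(len(dims)):
    print(i, route_conditioning(dims, i))
# 0 encoder_hidden_states
# 1 self-attention only
# 2 encoder_hidden_states_1
```

At runtime the real blocks recover this interleaving with `num_attention_per_layer = len(self.attentions) // num_layers` and index the attentions as `self.attentions[i * num_attention_per_layer + idx]`, exactly as shown in the code above.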
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import ( + ClapFeatureExtractor, + ClapModel, + GPT2Model, + RobertaTokenizer, + RobertaTokenizerFast, + SpeechT5HifiGan, + T5EncoderModel, + T5Tokenizer, + T5TokenizerFast, +) + +from ...models import AutoencoderKL +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + is_librosa_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline +from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel + + +if is_librosa_available(): + import librosa + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import scipy + >>> import torch + >>> from diffusers import AudioLDM2Pipeline + + >>> repo_id = "cvssp/audioldm2" + >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> # define the prompts + >>> prompt = "The sound of a hammer hitting a wooden surface." + >>> negative_prompt = "Low quality." + + >>> # set the seed for generator + >>> generator = torch.Generator("cuda").manual_seed(0) + + >>> # run the generation + >>> audio = pipe( + ... prompt, + ... negative_prompt=negative_prompt, + ... num_inference_steps=200, + ... audio_length_in_s=10.0, + ... num_waveforms_per_prompt=3, + ... generator=generator, + ... ).audios + + >>> # save the best audio sample (index 0) as a .wav file + >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio[0]) + ``` +""" + + +def prepare_inputs_for_generation( + inputs_embeds, + attention_mask=None, + past_key_values=None, + **kwargs, +): + if past_key_values is not None: + # only last token for inputs_embeds if past is defined in kwargs + inputs_embeds = inputs_embeds[:, -1:] + + return { + "inputs_embeds": inputs_embeds, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + } + + +class AudioLDM2Pipeline(DiffusionPipeline): + r""" + Pipeline for text-to-audio generation using AudioLDM2. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.ClapModel`]): + First frozen text-encoder. AudioLDM2 uses the joint audio-text embedding model + [CLAP](https://huggingface.co/docs/transformers/model_doc/clap#transformers.CLAPTextModelWithProjection), + specifically the [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. The + text branch is used to encode the text prompt to a prompt embedding. The full audio-text model is used to + rank generated waveforms against the text prompt by computing similarity scores. + text_encoder_2 ([`~transformers.T5EncoderModel`]): + Second frozen text-encoder. AudioLDM2 uses the encoder of + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the + [google/flan-t5-large](https://huggingface.co/google/flan-t5-large) variant. 
+ projection_model ([`AudioLDM2ProjectionModel`]): + A trained model used to linearly project the hidden-states from the first and second text encoder models + and insert learned SOS and EOS token embeddings. The projected hidden-states from the two text encoders are + concatenated to give the input to the language model. + language_model ([`~transformers.GPT2Model`]): + An auto-regressive language model used to generate a sequence of hidden-states conditioned on the projected + outputs from the two text encoders. + tokenizer ([`~transformers.RobertaTokenizer`]): + Tokenizer to tokenize text for the first frozen text-encoder. + tokenizer_2 ([`~transformers.T5Tokenizer`]): + Tokenizer to tokenize text for the second frozen text-encoder. + feature_extractor ([`~transformers.ClapFeatureExtractor`]): + Feature extractor to pre-process generated audio waveforms to log-mel spectrograms for automatic scoring. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded audio latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + vocoder ([`~transformers.SpeechT5HifiGan`]): + Vocoder of class `SpeechT5HifiGan` to convert the mel-spectrogram latents to the final audio waveform. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: ClapModel, + text_encoder_2: T5EncoderModel, + projection_model: AudioLDM2ProjectionModel, + language_model: GPT2Model, + tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], + tokenizer_2: Union[T5Tokenizer, T5TokenizerFast], + feature_extractor: ClapFeatureExtractor, + unet: AudioLDM2UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + vocoder: SpeechT5HifiGan, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + projection_model=projection_model, + language_model=language_model, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + feature_extractor=feature_extractor, + unet=unet, + scheduler=scheduler, + vocoder=vocoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + model_sequence = [ + self.text_encoder.text_model, + self.text_encoder.text_projection, + self.text_encoder_2, + self.projection_model, + self.language_model, + self.unet, + self.vae, + self.vocoder, + self.text_encoder, + ] + + hook = None + for cpu_offloaded_model in model_sequence: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + def generate_language_model( + self, + inputs_embeds: torch.Tensor = None, + max_new_tokens: int = 8, + **model_kwargs, + ): + """ + + Generates a sequence of hidden-states from the language model, conditioned on the embedding inputs. + + Parameters: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The sequence used as a prompt for the generation. + max_new_tokens (`int`): + Number of new tokens to generate. + model_kwargs (`Dict[str, Any]`, *optional*): + Ad hoc parametrization of additional model-specific kwargs that will be forwarded to the `forward` + function of the model. + + Return: + `inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + The sequence of generated hidden-states. + """ + max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens + for _ in range(max_new_tokens): + # prepare model inputs + model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs) + + # forward pass to get next hidden states + output = self.language_model(**model_inputs, return_dict=True) + + next_hidden_states = output.last_hidden_state + + # Update the model input + inputs_embeds = torch.cat([inputs_embeds, next_hidden_states[:, -1:, :]], dim=1) + + # Update generated hidden states, model inputs, and length for next step + model_kwargs = self.language_model._update_model_kwargs_for_generation(output, model_kwargs) + + return inputs_embeds[:, -max_new_tokens:, :] + + def encode_prompt( + self, + prompt, + device, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + generated_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + negative_attention_mask: Optional[torch.LongTensor] = None, + max_new_tokens: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device (`torch.device`): + torch device + num_waveforms_per_prompt (`int`): + number of waveforms that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the audio generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-computed text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, *e.g.* + prompt weighting. If not provided, text embeddings will be computed from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-computed negative text embeddings from the Flan T5 model. Can be used to easily tweak text inputs, + *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from + `negative_prompt` input argument. + generated_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, + *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input + argument. + negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text + inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from + `negative_prompt` input argument. + attention_mask (`torch.LongTensor`, *optional*): + Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will + be computed from `prompt` input argument. + negative_attention_mask (`torch.LongTensor`, *optional*): + Pre-computed attention mask to be applied to the `negative_prompt_embeds`. If not provided, attention + mask will be computed from `negative_prompt` input argument. + max_new_tokens (`int`, *optional*, defaults to None): + The number of new tokens to generate with the GPT2 language model. + Returns: + prompt_embeds (`torch.FloatTensor`): + Text embeddings from the Flan T5 model. + attention_mask (`torch.LongTensor`): + Attention mask to be applied to the `prompt_embeds`. + generated_prompt_embeds (`torch.FloatTensor`): + Text embeddings generated from the GPT2 langauge model. + + Example: + + ```python + >>> import scipy + >>> import torch + >>> from diffusers import AudioLDM2Pipeline + + >>> repo_id = "cvssp/audioldm2" + >>> pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> # Get text embedding vectors + >>> prompt_embeds, attention_mask, generated_prompt_embeds = pipe.encode_prompt( + ... prompt="Techno music with a strong, upbeat tempo and high melodic riffs", + ... device="cuda", + ... do_classifier_free_guidance=True, + ... ) + + >>> # Pass text embeddings to pipeline for text-conditional audio generation + >>> audio = pipe( + ... prompt_embeds=prompt_embeds, + ... attention_mask=attention_mask, + ... generated_prompt_embeds=generated_prompt_embeds, + ... num_inference_steps=200, + ... audio_length_in_s=10.0, + ... 
).audios[0] + + >>> # save generated audio sample + >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) + ```""" + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] + text_encoders = [self.text_encoder, self.text_encoder_2] + + if prompt_embeds is None: + prompt_embeds_list = [] + attention_mask_list = [] + + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + text_inputs = tokenizer( + prompt, + padding="max_length" if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast)) else True, + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + f"The following part of your input was truncated because {text_encoder.config.model_type} can " + f"only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + attention_mask = attention_mask.to(device) + + if text_encoder.config.model_type == "clap": + prompt_embeds = text_encoder.get_text_features( + text_input_ids, + attention_mask=attention_mask, + ) + # append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size) + prompt_embeds = prompt_embeds[:, None, :] + # make sure that we attend to this single hidden-state + attention_mask = attention_mask.new_ones((batch_size, 1)) + else: + prompt_embeds = text_encoder( + text_input_ids, + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds_list.append(prompt_embeds) + attention_mask_list.append(attention_mask) + + projection_output = self.projection_model( + hidden_states=prompt_embeds_list[0], + hidden_states_1=prompt_embeds_list[1], + attention_mask=attention_mask_list[0], + attention_mask_1=attention_mask_list[1], + ) + projected_prompt_embeds = projection_output.hidden_states + projected_attention_mask = projection_output.attention_mask + + generated_prompt_embeds = self.generate_language_model( + projected_prompt_embeds, + attention_mask=projected_attention_mask, + max_new_tokens=max_new_tokens, + ) + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + attention_mask = ( + attention_mask.to(device=device) + if attention_mask is not None + else torch.ones(prompt_embeds.shape[:2], dtype=torch.long, device=device) + ) + generated_prompt_embeds = generated_prompt_embeds.to(dtype=self.language_model.dtype, device=device) + + bs_embed, seq_len, hidden_size = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len, hidden_size) + + # duplicate attention mask for each generation per prompt + attention_mask = attention_mask.repeat(1, num_waveforms_per_prompt) + attention_mask = attention_mask.view(bs_embed * num_waveforms_per_prompt, seq_len) + + bs_embed, 
seq_len, hidden_size = generated_prompt_embeds.shape + # duplicate generated embeddings for each generation per prompt, using mps friendly method + generated_prompt_embeds = generated_prompt_embeds.repeat(1, num_waveforms_per_prompt, 1) + generated_prompt_embeds = generated_prompt_embeds.view( + bs_embed * num_waveforms_per_prompt, seq_len, hidden_size + ) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + negative_prompt_embeds_list = [] + negative_attention_mask_list = [] + max_length = prompt_embeds.shape[1] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + uncond_input = tokenizer( + uncond_tokens, + padding="max_length", + max_length=tokenizer.model_max_length + if isinstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast)) + else max_length, + truncation=True, + return_tensors="pt", + ) + + uncond_input_ids = uncond_input.input_ids.to(device) + negative_attention_mask = uncond_input.attention_mask.to(device) + + if text_encoder.config.model_type == "clap": + negative_prompt_embeds = text_encoder.get_text_features( + uncond_input_ids, + attention_mask=negative_attention_mask, + ) + # append the seq-len dim: (bs, hidden_size) -> (bs, seq_len, hidden_size) + negative_prompt_embeds = negative_prompt_embeds[:, None, :] + # make sure that we attend to this single hidden-state + negative_attention_mask = negative_attention_mask.new_ones((batch_size, 1)) + else: + negative_prompt_embeds = text_encoder( + uncond_input_ids, + attention_mask=negative_attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + negative_attention_mask_list.append(negative_attention_mask) + + projection_output = self.projection_model( + hidden_states=negative_prompt_embeds_list[0], + hidden_states_1=negative_prompt_embeds_list[1], + attention_mask=negative_attention_mask_list[0], + attention_mask_1=negative_attention_mask_list[1], + ) + negative_projected_prompt_embeds = projection_output.hidden_states + negative_projected_attention_mask = projection_output.attention_mask + + negative_generated_prompt_embeds = self.generate_language_model( + negative_projected_prompt_embeds, + attention_mask=negative_projected_attention_mask, + max_new_tokens=max_new_tokens, + ) + + if do_classifier_free_guidance: + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + negative_attention_mask = ( + negative_attention_mask.to(device=device) + if negative_attention_mask is not None + else torch.ones(negative_prompt_embeds.shape[:2], dtype=torch.long, device=device) + ) + negative_generated_prompt_embeds = negative_generated_prompt_embeds.to( + dtype=self.language_model.dtype, 
device=device + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len, -1) + + # duplicate unconditional attention mask for each generation per prompt + negative_attention_mask = negative_attention_mask.repeat(1, num_waveforms_per_prompt) + negative_attention_mask = negative_attention_mask.view(batch_size * num_waveforms_per_prompt, seq_len) + + # duplicate unconditional generated embeddings for each generation per prompt + seq_len = negative_generated_prompt_embeds.shape[1] + negative_generated_prompt_embeds = negative_generated_prompt_embeds.repeat(1, num_waveforms_per_prompt, 1) + negative_generated_prompt_embeds = negative_generated_prompt_embeds.view( + batch_size * num_waveforms_per_prompt, seq_len, -1 + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + attention_mask = torch.cat([negative_attention_mask, attention_mask]) + generated_prompt_embeds = torch.cat([negative_generated_prompt_embeds, generated_prompt_embeds]) + + return prompt_embeds, attention_mask, generated_prompt_embeds + + # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.mel_spectrogram_to_waveform + def mel_spectrogram_to_waveform(self, mel_spectrogram): + if mel_spectrogram.dim() == 4: + mel_spectrogram = mel_spectrogram.squeeze(1) + + waveform = self.vocoder(mel_spectrogram) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + waveform = waveform.cpu().float() + return waveform + + def score_waveforms(self, text, audio, num_waveforms_per_prompt, device, dtype): + if not is_librosa_available(): + logger.info( + "Automatic scoring of the generated audio waveforms against the input prompt text requires the " + "`librosa` package to resample the generated waveforms. Returning the audios in the order they were " + "generated. To enable automatic scoring, install `librosa` with: `pip install librosa`." + ) + return audio + inputs = self.tokenizer(text, return_tensors="pt", padding=True) + resampled_audio = librosa.resample( + audio.numpy(), orig_sr=self.vocoder.config.sampling_rate, target_sr=self.feature_extractor.sampling_rate + ) + inputs["input_features"] = self.feature_extractor( + list(resampled_audio), return_tensors="pt", sampling_rate=self.feature_extractor.sampling_rate + ).input_features.type(dtype) + inputs = inputs.to(device) + + # compute the audio-text similarity score using the CLAP model + logits_per_text = self.text_encoder(**inputs).logits_per_text + # sort by the highest matching generations per prompt + indices = torch.argsort(logits_per_text, dim=1, descending=True)[:, :num_waveforms_per_prompt] + audio = torch.index_select(audio, 0, indices.reshape(-1).cpu()) + return audio + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + generated_prompt_embeds=None, + negative_generated_prompt_embeds=None, + attention_mask=None, + negative_attention_mask=None, + ): + min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor + if audio_length_in_s < min_audio_length_in_s: + raise ValueError( + f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but " + f"is {audio_length_in_s}." + ) + + if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0: + raise ValueError( + f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the " + f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of " + f"{self.vae_scale_factor}." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and (prompt_embeds is None or generated_prompt_embeds is None): + raise ValueError( + "Provide either `prompt`, or `prompt_embeds` and `generated_prompt_embeds`. Cannot leave " + "`prompt` undefined without specifying both `prompt_embeds` and `generated_prompt_embeds`." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_embeds is not None and negative_generated_prompt_embeds is None: + raise ValueError( + "Cannot forward `negative_prompt_embeds` without `negative_generated_prompt_embeds`. Ensure that" + "both arguments are specified" + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + if attention_mask is not None and attention_mask.shape != prompt_embeds.shape[:2]: + raise ValueError( + "`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:" + f"`attention_mask: {attention_mask.shape} != `prompt_embeds` {prompt_embeds.shape}" + ) + + if generated_prompt_embeds is not None and negative_generated_prompt_embeds is not None: + if generated_prompt_embeds.shape != negative_generated_prompt_embeds.shape: + raise ValueError( + "`generated_prompt_embeds` and `negative_generated_prompt_embeds` must have the same shape when " + f"passed directly, but got: `generated_prompt_embeds` {generated_prompt_embeds.shape} != " + f"`negative_generated_prompt_embeds` {negative_generated_prompt_embeds.shape}." + ) + if ( + negative_attention_mask is not None + and negative_attention_mask.shape != negative_prompt_embeds.shape[:2] + ): + raise ValueError( + "`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:" + f"`attention_mask: {negative_attention_mask.shape} != `prompt_embeds` {negative_prompt_embeds.shape}" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents with width->self.vocoder.config.model_in_dim + def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + self.vocoder.config.model_in_dim // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + audio_length_in_s: Optional[float] = None, + num_inference_steps: int = 200, + guidance_scale: float = 3.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_waveforms_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + generated_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_generated_prompt_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + negative_attention_mask: Optional[torch.LongTensor] = None, + max_new_tokens: Optional[int] = None, + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + output_type: Optional[str] = "np", + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`. 
+ audio_length_in_s (`int`, *optional*, defaults to 10.24): + The length of the generated audio sample in seconds. + num_inference_steps (`int`, *optional*, defaults to 200): + The number of denoising steps. More denoising steps usually lead to a higher quality audio at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 3.5): + A higher guidance scale value encourages the model to generate audio that is closely linked to the text + `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in audio generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_waveforms_per_prompt (`int`, *optional*, defaults to 1): + The number of waveforms to generate per prompt. If `num_waveforms_per_prompt > 1`, then automatic + scoring is performed between the generated outputs and the text prompt. This scoring ranks the + generated waveforms based on their cosine similarity with the text input in the joint text-audio + embedding space. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for spectrogram + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + generated_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings from the GPT2 langauge model. Can be used to easily tweak text inputs, + *e.g.* prompt weighting. If not provided, text embeddings will be generated from `prompt` input + argument. + negative_generated_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings from the GPT2 language model. Can be used to easily tweak text + inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from + `negative_prompt` input argument. + attention_mask (`torch.LongTensor`, *optional*): + Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will + be computed from `prompt` input argument. + negative_attention_mask (`torch.LongTensor`, *optional*): + Pre-computed attention mask to be applied to the `negative_prompt_embeds`. If not provided, attention + mask will be computed from `negative_prompt` input argument. 
+ max_new_tokens (`int`, *optional*, defaults to None): + Number of new tokens to generate with the GPT2 language model. If not provided, number of tokens will + be taken from the config of the model. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or + `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion + model (LDM) output. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated audio. + """ + # 0. Convert audio input length from seconds to spectrogram height + vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate + + if audio_length_in_s is None: + audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor + + height = int(audio_length_in_s / vocoder_upsample_factor) + + original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) + if height % self.vae_scale_factor != 0: + height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor + logger.info( + f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} " + f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the " + f"denoising process." + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + generated_prompt_embeds, + negative_generated_prompt_embeds, + attention_mask, + negative_attention_mask, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, attention_mask, generated_prompt_embeds = self.encode_prompt( + prompt, + device, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + generated_prompt_embeds=generated_prompt_embeds, + negative_generated_prompt_embeds=negative_generated_prompt_embeds, + attention_mask=attention_mask, + negative_attention_mask=negative_attention_mask, + max_new_tokens=max_new_tokens, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_waveforms_per_prompt, + num_channels_latents, + height, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=generated_prompt_embeds, + encoder_hidden_states_1=prompt_embeds, + encoder_attention_mask_1=attention_mask, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + self.maybe_free_model_hooks() + + # 8. Post-processing + if not output_type == "latent": + latents = 1 / self.vae.config.scaling_factor * latents + mel_spectrogram = self.vae.decode(latents).sample + else: + return AudioPipelineOutput(audios=latents) + + audio = self.mel_spectrogram_to_waveform(mel_spectrogram) + + audio = audio[:, :original_waveform_length] + + # 9. 
Automatic scoring + if num_waveforms_per_prompt > 1 and prompt is not None: + audio = self.score_waveforms( + text=prompt, + audio=audio, + num_waveforms_per_prompt=num_waveforms_per_prompt, + device=device, + dtype=prompt_embeds.dtype, + ) + + if output_type == "np": + audio = audio.numpy() + + if not return_dict: + return (audio,) + + return AudioPipelineOutput(audios=audio) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/auto_pipeline.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/auto_pipeline.py new file mode 100644 index 000000000..fc30fc4d2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/auto_pipeline.py @@ -0,0 +1,987 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +from huggingface_hub.utils import validate_hf_hub_args + +from ..configuration_utils import ConfigMixin +from .controlnet import ( + StableDiffusionControlNetImg2ImgPipeline, + StableDiffusionControlNetInpaintPipeline, + StableDiffusionControlNetPipeline, + StableDiffusionXLControlNetImg2ImgPipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLControlNetPipeline, +) +from .deepfloyd_if import IFImg2ImgPipeline, IFInpaintingPipeline, IFPipeline +from .kandinsky import ( + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyImg2ImgPipeline, + KandinskyInpaintCombinedPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, +) +from .kandinsky2_2 import ( + KandinskyV22CombinedPipeline, + KandinskyV22Img2ImgCombinedPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintCombinedPipeline, + KandinskyV22InpaintPipeline, + KandinskyV22Pipeline, +) +from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline +from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline +from .pixart_alpha import PixArtAlphaPipeline +from .stable_diffusion import ( + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionPipeline, +) +from .stable_diffusion_xl import ( + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLPipeline, +) +from .wuerstchen import WuerstchenCombinedPipeline, WuerstchenDecoderPipeline + + +AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", StableDiffusionPipeline), + ("stable-diffusion-xl", StableDiffusionXLPipeline), + ("if", IFPipeline), + ("kandinsky", KandinskyCombinedPipeline), + ("kandinsky22", KandinskyV22CombinedPipeline), + ("kandinsky3", Kandinsky3Pipeline), + ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline), + ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline), + ("wuerstchen", WuerstchenCombinedPipeline), + ("lcm", LatentConsistencyModelPipeline), + ("pixart", PixArtAlphaPipeline), + ] +) + 
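The task mappings in this file share their keys (e.g. `"stable-diffusion"`, `"kandinsky22"`), which is what lets the Auto classes swap the task-specific pipeline while keeping the same model family. The sketch below is illustrative only: it uses the mapping above together with the image-to-image mapping defined next, and the hypothetical helper roughly mirrors the reverse-then-forward lookup that `_get_task_class` performs later in this file.

```py
# Illustrative only, not part of the patch: resolve the class registered under the
# same model-family key in another task mapping.
from typing import Mapping, Optional

def sibling_task_class(
    source_mapping: Mapping[str, type],
    target_mapping: Mapping[str, type],
    pipeline_cls: type,
) -> Optional[type]:
    """Find pipeline_cls's key in source_mapping, then look that key up in target_mapping."""
    for key, cls in source_mapping.items():
        if cls is pipeline_cls:
            return target_mapping.get(key)
    return None

# e.g. sibling_task_class(AUTO_TEXT2IMAGE_PIPELINES_MAPPING,
#                         AUTO_IMAGE2IMAGE_PIPELINES_MAPPING,
#                         StableDiffusionPipeline)
# would return StableDiffusionImg2ImgPipeline, because both mappings register the
# class under the "stable-diffusion" key.
```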
+AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", StableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", StableDiffusionXLImg2ImgPipeline), + ("if", IFImg2ImgPipeline), + ("kandinsky", KandinskyImg2ImgCombinedPipeline), + ("kandinsky22", KandinskyV22Img2ImgCombinedPipeline), + ("kandinsky3", Kandinsky3Img2ImgPipeline), + ("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline), + ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline), + ("lcm", LatentConsistencyModelImg2ImgPipeline), + ] +) + +AUTO_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", StableDiffusionInpaintPipeline), + ("stable-diffusion-xl", StableDiffusionXLInpaintPipeline), + ("if", IFInpaintingPipeline), + ("kandinsky", KandinskyInpaintCombinedPipeline), + ("kandinsky22", KandinskyV22InpaintCombinedPipeline), + ("stable-diffusion-controlnet", StableDiffusionControlNetInpaintPipeline), + ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetInpaintPipeline), + ] +) + +_AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict( + [ + ("kandinsky", KandinskyPipeline), + ("kandinsky22", KandinskyV22Pipeline), + ("wuerstchen", WuerstchenDecoderPipeline), + ] +) +_AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict( + [ + ("kandinsky", KandinskyImg2ImgPipeline), + ("kandinsky22", KandinskyV22Img2ImgPipeline), + ] +) +_AUTO_INPAINT_DECODER_PIPELINES_MAPPING = OrderedDict( + [ + ("kandinsky", KandinskyInpaintPipeline), + ("kandinsky22", KandinskyV22InpaintPipeline), + ] +) + +SUPPORTED_TASKS_MAPPINGS = [ + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + AUTO_INPAINT_PIPELINES_MAPPING, + _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING, + _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING, + _AUTO_INPAINT_DECODER_PIPELINES_MAPPING, +] + + +def _get_connected_pipeline(pipeline_cls): + # for now connected pipelines can only be loaded from decoder pipelines, such as kandinsky-community/kandinsky-2-2-decoder + if pipeline_cls in _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING.values(): + return _get_task_class( + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False + ) + if pipeline_cls in _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING.values(): + return _get_task_class( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False + ) + if pipeline_cls in _AUTO_INPAINT_DECODER_PIPELINES_MAPPING.values(): + return _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, pipeline_cls.__name__, throw_error_if_not_exist=False) + + +def _get_task_class(mapping, pipeline_class_name, throw_error_if_not_exist: bool = True): + def get_model(pipeline_class_name): + for task_mapping in SUPPORTED_TASKS_MAPPINGS: + for model_name, pipeline in task_mapping.items(): + if pipeline.__name__ == pipeline_class_name: + return model_name + + model_name = get_model(pipeline_class_name) + + if model_name is not None: + task_class = mapping.get(model_name, None) + if task_class is not None: + return task_class + + if throw_error_if_not_exist: + raise ValueError(f"AutoPipeline can't find a pipeline linked to {pipeline_class_name} for {model_name}") + + +class AutoPipelineForText2Image(ConfigMixin): + r""" + + [`AutoPipelineForText2Image`] is a generic pipeline class that instantiates a text-to-image pipeline class. The + specific underlying pipeline class is automatically selected from either the + [`~AutoPipelineForText2Image.from_pretrained`] or [`~AutoPipelineForText2Image.from_pipe`] methods. 
+ + This class cannot be instantiated using `__init__()` (throws an error). + + Class attributes: + + - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the + diffusion pipeline's components. + + """ + + config_name = "model_index.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_pipe(pipeline)` methods." + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + r""" + Instantiates a text-to-image Pytorch diffusion pipeline from pretrained pipeline weight. + + The from_pretrained() method takes care of returning the correct pipeline class instance by: + 1. Detect the pipeline class of the pretrained_model_or_path based on the _class_name property of its + config object + 2. Find the text-to-image pipeline linked to the pipeline class using pattern matching on pipeline class + name. + + If a `controlnet` argument is passed, it will instantiate a [`StableDiffusionControlNetPipeline`] object. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + If you get the error message below, you need to finetune the weights for your downstream task: + + ``` + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights + saved using + [`~DiffusionPipeline.save_pretrained`]. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. 
If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + custom_revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id similar to + `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a + custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn’t need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if device_map contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the safetensors weights are downloaded if they're available **and** if the + safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors + weights. If set to `False`, safetensors weights are not loaded. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline + class). The overwritten components are passed directly to the pipelines `__init__` method. See example + below for more information. + variant (`str`, *optional*): + Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. 
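Most of the loading options documented above are simply forwarded to the resolved pipeline class. Below is a minimal sketch of the ones that are commonly combined; it assumes the checkpoint ships an `fp16` safetensors variant (which `runwayml/stable-diffusion-v1-5` does at the time of writing).

```py
import torch
from diffusers import AutoPipelineForText2Image

# Illustrative only: the kwargs below are forwarded unchanged to the resolved pipeline class.
pipeline = AutoPipelineForText2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,   # load the weights in half precision
    variant="fp16",              # prefer the fp16 weight files if the repo provides them
    use_safetensors=True,        # force loading from safetensors weights
)
```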
+ + + + To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + `huggingface-cli login`. + + + + Examples: + + ```py + >>> from diffusers import AutoPipelineForText2Image + + >>> pipeline = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> image = pipeline(prompt).images[0] + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + load_config_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "resume_download": resume_download, + "proxies": proxies, + "token": token, + "local_files_only": local_files_only, + "revision": revision, + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + orig_class_name = config["_class_name"] + + if "controlnet" in kwargs: + orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline") + + text_2_image_cls = _get_task_class(AUTO_TEXT2IMAGE_PIPELINES_MAPPING, orig_class_name) + + kwargs = {**load_config_kwargs, **kwargs} + return text_2_image_cls.from_pretrained(pretrained_model_or_path, **kwargs) + + @classmethod + def from_pipe(cls, pipeline, **kwargs): + r""" + Instantiates a text-to-image Pytorch diffusion pipeline from another instantiated diffusion pipeline class. + + The from_pipe() method takes care of returning the correct pipeline class instance by finding the text-to-image + pipeline linked to the pipeline class using pattern matching on pipeline class name. + + All the modules the pipeline contains will be used to initialize the new pipeline without reallocating + additional memory. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pipeline (`DiffusionPipeline`): + an instantiated `DiffusionPipeline` object + + ```py + >>> from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image + + >>> pipe_i2i = AutoPipelineForImage2Image.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", requires_safety_checker=False + ... 
) + + >>> pipe_t2i = AutoPipelineForText2Image.from_pipe(pipe_i2i) + >>> image = pipe_t2i(prompt).images[0] + ``` + """ + + original_config = dict(pipeline.config) + original_cls_name = pipeline.__class__.__name__ + + # derive the pipeline class to instantiate + text_2_image_cls = _get_task_class(AUTO_TEXT2IMAGE_PIPELINES_MAPPING, original_cls_name) + + if "controlnet" in kwargs: + if kwargs["controlnet"] is not None: + text_2_image_cls = _get_task_class( + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, + text_2_image_cls.__name__.replace("ControlNet", "").replace("Pipeline", "ControlNetPipeline"), + ) + else: + text_2_image_cls = _get_task_class( + AUTO_TEXT2IMAGE_PIPELINES_MAPPING, + text_2_image_cls.__name__.replace("ControlNetPipeline", "Pipeline"), + ) + + # define expected module and optional kwargs given the pipeline signature + expected_modules, optional_kwargs = text_2_image_cls._get_signature_keys(text_2_image_cls) + + pretrained_model_name_or_path = original_config.pop("_name_or_path", None) + + # allow users pass modules in `kwargs` to override the original pipeline's components + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + original_class_obj = { + k: pipeline.components[k] + for k, v in pipeline.components.items() + if k in expected_modules and k not in passed_class_obj + } + + # allow users pass optional kwargs to override the original pipelines config attribute + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} + original_pipe_kwargs = { + k: original_config[k] + for k, v in original_config.items() + if k in optional_kwargs and k not in passed_pipe_kwargs + } + + # config that were not expected by original pipeline is stored as private attribute + # we will pass them as optional arguments if they can be accepted by the pipeline + additional_pipe_kwargs = [ + k[1:] + for k in original_config.keys() + if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs + ] + for k in additional_pipe_kwargs: + original_pipe_kwargs[k] = original_config.pop(f"_{k}") + + text_2_image_kwargs = {**passed_class_obj, **original_class_obj, **passed_pipe_kwargs, **original_pipe_kwargs} + + # store unused config as private attribute + unused_original_config = { + f"{'' if k.startswith('_') else '_'}{k}": original_config[k] + for k, v in original_config.items() + if k not in text_2_image_kwargs + } + + missing_modules = set(expected_modules) - set(pipeline._optional_components) - set(text_2_image_kwargs.keys()) + + if len(missing_modules) > 0: + raise ValueError( + f"Pipeline {text_2_image_cls} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed" + ) + + model = text_2_image_cls(**text_2_image_kwargs) + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + model.register_to_config(**unused_original_config) + + return model + + +class AutoPipelineForImage2Image(ConfigMixin): + r""" + + [`AutoPipelineForImage2Image`] is a generic pipeline class that instantiates an image-to-image pipeline class. The + specific underlying pipeline class is automatically selected from either the + [`~AutoPipelineForImage2Image.from_pretrained`] or [`~AutoPipelineForImage2Image.from_pipe`] methods. + + This class cannot be instantiated using `__init__()` (throws an error). + + Class attributes: + + - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the + diffusion pipeline's components. 
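The `from_pipe` implementation just shown reuses every component of the source pipeline and only swaps the task wrapper; when a `controlnet` argument is supplied, the resolved class is additionally switched to its ControlNet variant. A hedged usage sketch (the ControlNet checkpoint id is an assumed example, not something referenced by this patch):

```py
import torch
from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image, ControlNetModel

pipe_i2i = AutoPipelineForImage2Image.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)

# Same components, different task wrapper: no weights are re-allocated.
pipe_t2i = AutoPipelineForText2Image.from_pipe(pipe_i2i)
assert pipe_t2i.unet is pipe_i2i.unet

# Passing `controlnet` switches the resolved class to its ControlNet variant.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
)
pipe_cn = AutoPipelineForText2Image.from_pipe(pipe_i2i, controlnet=controlnet)
```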
+ + """ + + config_name = "model_index.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_pipe(pipeline)` methods." + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + r""" + Instantiates a image-to-image Pytorch diffusion pipeline from pretrained pipeline weight. + + The from_pretrained() method takes care of returning the correct pipeline class instance by: + 1. Detect the pipeline class of the pretrained_model_or_path based on the _class_name property of its + config object + 2. Find the image-to-image pipeline linked to the pipeline class using pattern matching on pipeline class + name. + + If a `controlnet` argument is passed, it will instantiate a [`StableDiffusionControlNetImg2ImgPipeline`] + object. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + If you get the error message below, you need to finetune the weights for your downstream task: + + ``` + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights + saved using + [`~DiffusionPipeline.save_pretrained`]. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. 
If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + custom_revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id similar to + `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a + custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn’t need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if device_map contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the safetensors weights are downloaded if they're available **and** if the + safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors + weights. If set to `False`, safetensors weights are not loaded. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline + class). The overwritten components are passed directly to the pipelines `__init__` method. See example + below for more information. + variant (`str`, *optional*): + Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + + + + To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + `huggingface-cli login`. 
+ + + + Examples: + + ```py + >>> from diffusers import AutoPipelineForImage2Image + + >>> pipeline = AutoPipelineForImage2Image.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> image = pipeline(prompt, image).images[0] + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + load_config_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "resume_download": resume_download, + "proxies": proxies, + "token": token, + "local_files_only": local_files_only, + "revision": revision, + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + orig_class_name = config["_class_name"] + + if "controlnet" in kwargs: + orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline") + + image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, orig_class_name) + + kwargs = {**load_config_kwargs, **kwargs} + return image_2_image_cls.from_pretrained(pretrained_model_or_path, **kwargs) + + @classmethod + def from_pipe(cls, pipeline, **kwargs): + r""" + Instantiates a image-to-image Pytorch diffusion pipeline from another instantiated diffusion pipeline class. + + The from_pipe() method takes care of returning the correct pipeline class instance by finding the + image-to-image pipeline linked to the pipeline class using pattern matching on pipeline class name. + + All the modules the pipeline contains will be used to initialize the new pipeline without reallocating + additional memory. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pipeline (`DiffusionPipeline`): + an instantiated `DiffusionPipeline` object + + Examples: + + ```py + >>> from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image + + >>> pipe_t2i = AutoPipelineForText2Image.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", requires_safety_checker=False + ... 
) + + >>> pipe_i2i = AutoPipelineForImage2Image.from_pipe(pipe_t2i) + >>> image = pipe_i2i(prompt, image).images[0] + ``` + """ + + original_config = dict(pipeline.config) + original_cls_name = pipeline.__class__.__name__ + + # derive the pipeline class to instantiate + image_2_image_cls = _get_task_class(AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, original_cls_name) + + if "controlnet" in kwargs: + if kwargs["controlnet"] is not None: + image_2_image_cls = _get_task_class( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + image_2_image_cls.__name__.replace("ControlNet", "").replace( + "Img2ImgPipeline", "ControlNetImg2ImgPipeline" + ), + ) + else: + image_2_image_cls = _get_task_class( + AUTO_IMAGE2IMAGE_PIPELINES_MAPPING, + image_2_image_cls.__name__.replace("ControlNetImg2ImgPipeline", "Img2ImgPipeline"), + ) + + # define expected module and optional kwargs given the pipeline signature + expected_modules, optional_kwargs = image_2_image_cls._get_signature_keys(image_2_image_cls) + + pretrained_model_name_or_path = original_config.pop("_name_or_path", None) + + # allow users pass modules in `kwargs` to override the original pipeline's components + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + original_class_obj = { + k: pipeline.components[k] + for k, v in pipeline.components.items() + if k in expected_modules and k not in passed_class_obj + } + + # allow users pass optional kwargs to override the original pipelines config attribute + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} + original_pipe_kwargs = { + k: original_config[k] + for k, v in original_config.items() + if k in optional_kwargs and k not in passed_pipe_kwargs + } + + # config attribute that were not expected by original pipeline is stored as its private attribute + # we will pass them as optional arguments if they can be accepted by the pipeline + additional_pipe_kwargs = [ + k[1:] + for k in original_config.keys() + if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs + ] + for k in additional_pipe_kwargs: + original_pipe_kwargs[k] = original_config.pop(f"_{k}") + + image_2_image_kwargs = {**passed_class_obj, **original_class_obj, **passed_pipe_kwargs, **original_pipe_kwargs} + + # store unused config as private attribute + unused_original_config = { + f"{'' if k.startswith('_') else '_'}{k}": original_config[k] + for k, v in original_config.items() + if k not in image_2_image_kwargs + } + + missing_modules = set(expected_modules) - set(pipeline._optional_components) - set(image_2_image_kwargs.keys()) + + if len(missing_modules) > 0: + raise ValueError( + f"Pipeline {image_2_image_cls} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed" + ) + + model = image_2_image_cls(**image_2_image_kwargs) + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + model.register_to_config(**unused_original_config) + + return model + + +class AutoPipelineForInpainting(ConfigMixin): + r""" + + [`AutoPipelineForInpainting`] is a generic pipeline class that instantiates an inpainting pipeline class. The + specific underlying pipeline class is automatically selected from either the + [`~AutoPipelineForInpainting.from_pretrained`] or [`~AutoPipelineForInpainting.from_pipe`] methods. + + This class cannot be instantiated using `__init__()` (throws an error). 
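As the filtering above shows, keyword arguments passed to `from_pipe` that match the target pipeline's expected modules (or optional config entries) take precedence over the values copied from the source pipeline. A small sketch of that override path, reusing the checkpoint id from the surrounding docstrings:

```py
from diffusers import (
    AutoPipelineForImage2Image,
    AutoPipelineForText2Image,
    DPMSolverMultistepScheduler,
)

pipe_t2i = AutoPipelineForText2Image.from_pretrained("runwayml/stable-diffusion-v1-5")

# `scheduler` is one of the expected modules, so it lands in `passed_class_obj`
# and replaces the scheduler that would otherwise be copied from the source pipeline.
scheduler = DPMSolverMultistepScheduler.from_config(pipe_t2i.scheduler.config)
pipe_i2i = AutoPipelineForImage2Image.from_pipe(pipe_t2i, scheduler=scheduler)
assert pipe_i2i.scheduler is scheduler
```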
+ + Class attributes: + + - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the + diffusion pipeline's components. + + """ + + config_name = "model_index.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_pipe(pipeline)` methods." + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + r""" + Instantiates a inpainting Pytorch diffusion pipeline from pretrained pipeline weight. + + The from_pretrained() method takes care of returning the correct pipeline class instance by: + 1. Detect the pipeline class of the pretrained_model_or_path based on the _class_name property of its + config object + 2. Find the inpainting pipeline linked to the pipeline class using pattern matching on pipeline class name. + + If a `controlnet` argument is passed, it will instantiate a [`StableDiffusionControlNetInpaintPipeline`] + object. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + If you get the error message below, you need to finetune the weights for your downstream task: + + ``` + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights + saved using + [`~DiffusionPipeline.save_pretrained`]. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. 
+ token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + custom_revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id similar to + `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a + custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn’t need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if device_map contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the safetensors weights are downloaded if they're available **and** if the + safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors + weights. If set to `False`, safetensors weights are not loaded. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline + class). The overwritten components are passed directly to the pipelines `__init__` method. See example + below for more information. + variant (`str`, *optional*): + Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + + + + To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + `huggingface-cli login`. 
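The dispatch performed by `from_pretrained` keys off the `_class_name` entry of the checkpoint's `model_index.json`, read through `load_config`, before any weights are downloaded. A minimal sketch of that first step; the printed value is what the Stable Diffusion 1.5 repo currently ships, so treat it as an assumption if the repo changes:

```py
from diffusers import DiffusionPipeline

# Step 1 of the dispatch: read the pipeline class name from model_index.json.
config = DiffusionPipeline.load_config("runwayml/stable-diffusion-v1-5")
print(config["_class_name"])  # e.g. "StableDiffusionPipeline"

# Step 2 then maps that name to the inpainting member of the same family, i.e.
# AUTO_INPAINT_PIPELINES_MAPPING["stable-diffusion"] -> StableDiffusionInpaintPipeline.
```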
+ + + + Examples: + + ```py + >>> from diffusers import AutoPipelineForInpainting + + >>> pipeline = AutoPipelineForInpainting.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> image = pipeline(prompt, image=init_image, mask_image=mask_image).images[0] + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + + load_config_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "resume_download": resume_download, + "proxies": proxies, + "token": token, + "local_files_only": local_files_only, + "revision": revision, + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + orig_class_name = config["_class_name"] + + if "controlnet" in kwargs: + orig_class_name = config["_class_name"].replace("Pipeline", "ControlNetPipeline") + + inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, orig_class_name) + + kwargs = {**load_config_kwargs, **kwargs} + return inpainting_cls.from_pretrained(pretrained_model_or_path, **kwargs) + + @classmethod + def from_pipe(cls, pipeline, **kwargs): + r""" + Instantiates a inpainting Pytorch diffusion pipeline from another instantiated diffusion pipeline class. + + The from_pipe() method takes care of returning the correct pipeline class instance by finding the inpainting + pipeline linked to the pipeline class using pattern matching on pipeline class name. + + All the modules the pipeline class contain will be used to initialize the new pipeline without reallocating + additional memory. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + Parameters: + pipeline (`DiffusionPipeline`): + an instantiated `DiffusionPipeline` object + + Examples: + + ```py + >>> from diffusers import AutoPipelineForText2Image, AutoPipelineForInpainting + + >>> pipe_t2i = AutoPipelineForText2Image.from_pretrained( + ... "DeepFloyd/IF-I-XL-v1.0", requires_safety_checker=False + ... 
) + + >>> pipe_inpaint = AutoPipelineForInpainting.from_pipe(pipe_t2i) + >>> image = pipe_inpaint(prompt, image=init_image, mask_image=mask_image).images[0] + ``` + """ + original_config = dict(pipeline.config) + original_cls_name = pipeline.__class__.__name__ + + # derive the pipeline class to instantiate + inpainting_cls = _get_task_class(AUTO_INPAINT_PIPELINES_MAPPING, original_cls_name) + + if "controlnet" in kwargs: + if kwargs["controlnet"] is not None: + inpainting_cls = _get_task_class( + AUTO_INPAINT_PIPELINES_MAPPING, + inpainting_cls.__name__.replace("ControlNet", "").replace( + "InpaintPipeline", "ControlNetInpaintPipeline" + ), + ) + else: + inpainting_cls = _get_task_class( + AUTO_INPAINT_PIPELINES_MAPPING, + inpainting_cls.__name__.replace("ControlNetInpaintPipeline", "InpaintPipeline"), + ) + + # define expected module and optional kwargs given the pipeline signature + expected_modules, optional_kwargs = inpainting_cls._get_signature_keys(inpainting_cls) + + pretrained_model_name_or_path = original_config.pop("_name_or_path", None) + + # allow users pass modules in `kwargs` to override the original pipeline's components + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + original_class_obj = { + k: pipeline.components[k] + for k, v in pipeline.components.items() + if k in expected_modules and k not in passed_class_obj + } + + # allow users pass optional kwargs to override the original pipelines config attribute + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} + original_pipe_kwargs = { + k: original_config[k] + for k, v in original_config.items() + if k in optional_kwargs and k not in passed_pipe_kwargs + } + + # config that were not expected by original pipeline is stored as private attribute + # we will pass them as optional arguments if they can be accepted by the pipeline + additional_pipe_kwargs = [ + k[1:] + for k in original_config.keys() + if k.startswith("_") and k[1:] in optional_kwargs and k[1:] not in passed_pipe_kwargs + ] + for k in additional_pipe_kwargs: + original_pipe_kwargs[k] = original_config.pop(f"_{k}") + + inpainting_kwargs = {**passed_class_obj, **original_class_obj, **passed_pipe_kwargs, **original_pipe_kwargs} + + # store unused config as private attribute + unused_original_config = { + f"{'' if k.startswith('_') else '_'}{k}": original_config[k] + for k, v in original_config.items() + if k not in inpainting_kwargs + } + + missing_modules = set(expected_modules) - set(pipeline._optional_components) - set(inpainting_kwargs.keys()) + + if len(missing_modules) > 0: + raise ValueError( + f"Pipeline {inpainting_cls} expected {expected_modules}, but only {set(list(passed_class_obj.keys()) + list(original_class_obj.keys()))} were passed" + ) + + model = inpainting_cls(**inpainting_kwargs) + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + model.register_to_config(**unused_original_config) + + return model diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py new file mode 100644 index 000000000..af6c879d5 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py @@ -0,0 +1,20 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL +from PIL import 
Image + +from ...utils import OptionalDependencyNotAvailable, is_torch_available, is_transformers_available + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline +else: + from .blip_image_processing import BlipImageProcessor + from .modeling_blip2 import Blip2QFormerModel + from .modeling_ctx_clip import ContextCLIPTextModel + from .pipeline_blip_diffusion import BlipDiffusionPipeline diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py new file mode 100644 index 000000000..d71a14810 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BLIP.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from transformers.image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from transformers.image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, +) +from transformers.utils import TensorType, is_vision_available, logging + +from diffusers.utils import numpy_to_pil + + +if is_vision_available(): + import PIL.Image + + +logger = logging.get_logger(__name__) + + +# We needed some extra functions on top of the ones in transformers.image_processing_utils.BaseImageProcessor, namely center crop +# Copy-pasted from transformers.models.blip.image_processing_blip.BlipImageProcessor +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. 
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_convert_rgb: bool = True,
+        do_center_crop: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+        self.do_center_crop = do_center_crop
+
+    # Copy-pasted from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+ data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + do_center_crop: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
+ images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + if do_center_crop: + images = [self.center_crop(image, size, input_data_format=input_data_format) for image in images] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + return encoded_outputs + + # Follows diffusers.VaeImageProcessor.postprocess + def postprocess(self, sample: torch.FloatTensor, output_type: str = "pil"): + if output_type not in ["pt", "np", "pil"]: + raise ValueError( + f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']" + ) + + # Equivalent to diffusers.VaeImageProcessor.denormalize + sample = (sample / 2 + 0.5).clamp(0, 1) + if output_type == "pt": + return sample + + # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy + sample = sample.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "np": + return sample + # Output_type must be 'pil' + sample = numpy_to_pil(sample) + return sample diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py new file mode 100644 index 000000000..c8869ad9d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py @@ -0,0 +1,642 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
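The `BlipImageProcessor` added above follows the `transformers` image-processor API: `preprocess` turns PIL images into normalized `pixel_values`, and `postprocess` maps model output in `[-1, 1]` back to images. A minimal round-trip sketch, assuming this vendored source tree is installed as the `diffusers` package; the input image is a synthetic placeholder rather than data from the patch:

```py
import numpy as np
import torch
from PIL import Image

from diffusers.pipelines.blip_diffusion import BlipImageProcessor

processor = BlipImageProcessor()  # defaults: resize/center-crop to 224x224, CLIP mean/std

# Synthetic RGB image standing in for a real conditioning image.
image = Image.fromarray(np.random.randint(0, 255, (300, 400, 3), dtype=np.uint8))
pixel_values = processor.preprocess(image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])

# postprocess expects values in [-1, 1] and returns PIL images for output_type="pil".
fake_sample = torch.rand(1, 3, 224, 224) * 2 - 1
images_out = processor.postprocess(fake_sample, output_type="pil")
print(type(images_out[0]))  # <class 'PIL.Image.Image'>
```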
+from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from transformers import BertTokenizer +from transformers.activations import QuickGELUActivation as QuickGELU +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from transformers.models.blip_2.configuration_blip_2 import Blip2Config, Blip2VisionConfig +from transformers.models.blip_2.modeling_blip_2 import ( + Blip2Encoder, + Blip2PreTrainedModel, + Blip2QFormerAttention, + Blip2QFormerIntermediate, + Blip2QFormerOutput, +) +from transformers.pytorch_utils import apply_chunking_to_forward +from transformers.utils import ( + logging, + replace_return_docstrings, +) + + +logger = logging.get_logger(__name__) + + +# There is an implementation of Blip2 in `transformers` : https://github.com/huggingface/transformers/blob/main/src/transformers/models/blip_2/modeling_blip_2.py. +# But it doesn't support getting multimodal embeddings. So, this module can be +# replaced with a future `transformers` version supports that. +class Blip2TextEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone() + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + batch_size = embeddings.shape[0] + # repeat the query embeddings for batch size + query_embeds = query_embeds.repeat(batch_size, 1, 1) + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + embeddings = embeddings.to(query_embeds.dtype) + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Blip2 +class Blip2VisionEmbeddings(nn.Module): + def __init__(self, config: Blip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = 
nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size, bias=False + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +# The Qformer encoder, which takes the visual embeddings, and the text input, to get multimodal embeddings +class Blip2QFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# The layers making up the Qformer encoder +class Blip2QFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = Blip2QFormerIntermediate(config) + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + self.output = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + 
self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +# ProjLayer used to project the multimodal Blip2 embeddings to be used in the text encoder +class ProjLayer(nn.Module): + def __init__(self, in_dim, out_dim, hidden_dim, drop_p=0.1, eps=1e-12): + super().__init__() + + # Dense1 -> Act -> Dense2 -> Drop -> Res -> Norm + self.dense1 = nn.Linear(in_dim, hidden_dim) + self.act_fn = QuickGELU() + self.dense2 = nn.Linear(hidden_dim, out_dim) + self.dropout = nn.Dropout(drop_p) + + self.LayerNorm = nn.LayerNorm(out_dim, eps=eps) + + def forward(self, x): + x_in = x + + x = self.LayerNorm(x) + x = self.dropout(self.dense2(self.act_fn(self.dense1(x)))) + x_in + + return x + + +# Copy-pasted from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2 +class Blip2VisionModel(Blip2PreTrainedModel): + main_input_name = "pixel_values" + config_class = Blip2VisionConfig + + def __init__(self, config: Blip2VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + self.embeddings = Blip2VisionEmbeddings(config) + self.pre_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = Blip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layernorm(hidden_states) + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = 
self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +# Qformer model, used to get multimodal embeddings from the text and image inputs +class Blip2QFormerModel(Blip2PreTrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: Blip2Config): + super().__init__(config) + self.config = config + self.embeddings = Blip2TextEmbeddings(config.qformer_config) + self.visual_encoder = Blip2VisionModel(config.vision_config) + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + if not hasattr(config, "tokenizer") or config.tokenizer is None: + self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") + else: + self.tokenizer = BertTokenizer.from_pretrained(config.tokenizer, truncation_side="right") + self.tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + self.proj_layer = ProjLayer( + in_dim=config.qformer_config.hidden_size, + out_dim=config.qformer_config.hidden_size, + hidden_dim=config.qformer_config.hidden_size * 4, + drop_p=0.1, + eps=1e-12, + ) + + self.encoder = Blip2QFormerEncoder(config.qformer_config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. 
+ # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + text_input=None, + image_input=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + + text = self.tokenizer(text_input, return_tensors="pt", padding=True) + text = text.to(self.device) + input_ids = text.input_ids + batch_size = input_ids.shape[0] + query_atts = torch.ones((batch_size, self.query_tokens.size()[1]), dtype=torch.long).to(self.device) + attention_mask = torch.cat([query_atts, text.attention_mask], dim=1) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = self.query_tokens.shape[1] + + embedding_output = self.embeddings( + input_ids=input_ids, + query_embeds=self.query_tokens, + past_key_values_length=past_key_values_length, + ) + + # embedding_output = self.layernorm(query_embeds) + # embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + image_embeds_frozen = self.visual_encoder(image_input).last_hidden_state + # image_embeds_frozen = torch.ones_like(image_embeds_frozen) + encoder_hidden_states = image_embeds_frozen + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.qformer_config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + 
output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return self.proj_layer(sequence_output[:, :query_length, :]) + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py new file mode 100644 index 000000000..c6772fc88 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py @@ -0,0 +1,223 @@ +# Copyright 2024 Salesforce.com, inc. +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Tuple, Union + +import torch +from torch import nn +from transformers import CLIPPreTrainedModel +from transformers.modeling_outputs import BaseModelOutputWithPooling +from transformers.models.clip.configuration_clip import CLIPTextConfig +from transformers.models.clip.modeling_clip import CLIPEncoder + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# This is a modified version of the CLIPTextModel from transformers.models.clip.modeling_clip +# Which allows for an extra input of "context embeddings", which are the query embeddings used in Qformer +# They pass through the clip model, along with the text embeddings, and interact with them using self attention +class ContextCLIPTextModel(CLIPPreTrainedModel): + config_class = CLIPTextConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = ContextCLIPTextTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + ctx_embeddings: torch.Tensor = None, + ctx_begin_pos: list = None, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return self.text_model( + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=ctx_begin_pos, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class ContextCLIPTextTransformer(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = ContextCLIPTextEmbeddings(config) + self.encoder = CLIPEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim) + + def forward( + self, + ctx_embeddings: torch.Tensor, + ctx_begin_pos: list, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify either input_ids") + + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + + hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + ctx_embeddings=ctx_embeddings, + ctx_begin_pos=ctx_begin_pos, + ) + + bsz, seq_len = input_shape + if ctx_embeddings is not None: + seq_len += ctx_embeddings.size(1) + # CLIP's text model uses causal mask, prepare it here. 
+ # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to( + hidden_states.device + ) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=input_ids.device), + input_ids.to(torch.int).argmax(dim=-1), + ] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len, dtype): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) + mask.fill_(torch.tensor(torch.finfo(dtype).min)) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class ContextCLIPTextEmbeddings(nn.Module): + def __init__(self, config: CLIPTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + ctx_embeddings: torch.Tensor, + ctx_begin_pos: list, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + if ctx_embeddings is None: + ctx_len = 0 + else: + ctx_len = ctx_embeddings.shape[1] + + seq_length = (input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]) + ctx_len + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + # for each input embeddings, add the ctx embeddings at the correct position + input_embeds_ctx = [] + bsz = inputs_embeds.shape[0] + + if ctx_embeddings is not None: + for i in range(bsz): + cbp = ctx_begin_pos[i] + + prefix = inputs_embeds[i, :cbp] + # remove the special token embedding + suffix = inputs_embeds[i, cbp:] + + input_embeds_ctx.append(torch.cat([prefix, ctx_embeddings[i], suffix], dim=0)) + + inputs_embeds = torch.stack(input_embeds_ctx, dim=0) + + position_embeddings = self.position_embedding(position_ids) + embeddings = 
inputs_embeds + position_embeddings + + return embeddings diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py new file mode 100644 index 000000000..ba43b2e53 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py @@ -0,0 +1,348 @@ +# Copyright 2024 Salesforce.com, inc. +# Copyright 2024 The HuggingFace Team. All rights reserved.# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPTokenizer + +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import PNDMScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .blip_image_processing import BlipImageProcessor +from .modeling_blip2 import Blip2QFormerModel +from .modeling_ctx_clip import ContextCLIPTextModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers.pipelines import BlipDiffusionPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> blip_diffusion_pipe = BlipDiffusionPipeline.from_pretrained( + ... "Salesforce/blipdiffusion", torch_dtype=torch.float16 + ... ).to("cuda") + + + >>> cond_subject = "dog" + >>> tgt_subject = "dog" + >>> text_prompt_input = "swimming underwater" + + >>> cond_image = load_image( + ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg" + ... ) + >>> guidance_scale = 7.5 + >>> num_inference_steps = 25 + >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate" + + + >>> output = blip_diffusion_pipe( + ... text_prompt_input, + ... cond_image, + ... cond_subject, + ... tgt_subject, + ... guidance_scale=guidance_scale, + ... num_inference_steps=num_inference_steps, + ... neg_prompt=negative_prompt, + ... height=512, + ... width=512, + ... ).images + >>> output[0].save("image.png") + ``` +""" + + +class BlipDiffusionPipeline(DiffusionPipeline): + """ + Pipeline for Zero-Shot Subject Driven Generation using Blip Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+
+    Args:
+        tokenizer ([`CLIPTokenizer`]):
+            Tokenizer for the text encoder
+        text_encoder ([`ContextCLIPTextModel`]):
+            Text encoder to encode the text prompt
+        vae ([`AutoencoderKL`]):
+            VAE model to map the latents to the image
+        unet ([`UNet2DConditionModel`]):
+            Conditional U-Net architecture to denoise the image embedding.
+        scheduler ([`PNDMScheduler`]):
+            A scheduler to be used in combination with `unet` to generate image latents.
+        qformer ([`Blip2QFormerModel`]):
+            QFormer model to get multi-modal embeddings from the text and image.
+        image_processor ([`BlipImageProcessor`]):
+            Image Processor to preprocess and postprocess the image.
+        ctx_begin_pos (int, `optional`, defaults to 2):
+            Position of the context token in the text encoder.
+    """
+
+    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"
+
+    def __init__(
+        self,
+        tokenizer: CLIPTokenizer,
+        text_encoder: ContextCLIPTextModel,
+        vae: AutoencoderKL,
+        unet: UNet2DConditionModel,
+        scheduler: PNDMScheduler,
+        qformer: Blip2QFormerModel,
+        image_processor: BlipImageProcessor,
+        ctx_begin_pos: int = 2,
+        mean: List[float] = None,
+        std: List[float] = None,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            vae=vae,
+            unet=unet,
+            scheduler=scheduler,
+            qformer=qformer,
+            image_processor=image_processor,
+        )
+        self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std)
+
+    def get_query_embeddings(self, input_image, src_subject):
+        return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False)
+
+    # from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it
+    def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20):
+        rv = []
+        for prompt, tgt_subject in zip(prompts, tgt_subjects):
+            prompt = f"a {tgt_subject} {prompt.strip()}"
+            # a trick to amplify the prompt
+            rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps)))
+
+        return rv
+
+    # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels, height, width)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            latents = latents.to(device=device, dtype=dtype)
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+
+    def encode_prompt(self, query_embeds, prompt, device=None):
+        device = device or self._execution_device
+
+        # embeddings for prompt, with query_embeds as context
+        max_len = self.text_encoder.text_model.config.max_position_embeddings
+        max_len -= self.qformer.config.num_query_tokens
+
+        tokenized_prompt = self.tokenizer(
+            prompt,
+            padding="max_length",
+            truncation=True,
+            max_length=max_len,
+            return_tensors="pt",
+        ).to(device)
+
+        batch_size = query_embeds.shape[0]
+        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size
+
+        text_embeddings = self.text_encoder(
+            input_ids=tokenized_prompt.input_ids,
+            ctx_embeddings=query_embeds,
+            ctx_begin_pos=ctx_begin_pos,
+        )[0]
+
+        return text_embeddings
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: List[str],
+        reference_image: PIL.Image.Image,
+        source_subject_category: List[str],
+        target_subject_category: List[str],
+        latents: Optional[torch.FloatTensor] = None,
+        guidance_scale: float = 7.5,
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        neg_prompt: Optional[str] = "",
+        prompt_strength: float = 1.0,
+        prompt_reps: int = 20,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`List[str]`):
+                The prompt or prompts to guide the image generation.
+            reference_image (`PIL.Image.Image`):
+                The reference image to condition the generation on.
+            source_subject_category (`List[str]`):
+                The source subject category.
+            target_subject_category (`List[str]`):
+                The target subject category.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by random sampling.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            height (`int`, *optional*, defaults to 512):
+                The height of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            neg_prompt (`str`, *optional*, defaults to ""):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            prompt_strength (`float`, *optional*, defaults to 1.0):
+                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
+                to amplify the prompt.
+            prompt_reps (`int`, *optional*, defaults to 20):
+                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"pt"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+        Examples:
+
+        Returns:
+            [`~pipelines.ImagePipelineOutput`] or `tuple`
+        """
+        device = self._execution_device
+
+        reference_image = self.image_processor.preprocess(
+            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt"
+        )["pixel_values"]
+        reference_image = reference_image.to(device)
+
+        if isinstance(prompt, str):
+            prompt = [prompt]
+        if isinstance(source_subject_category, str):
+            source_subject_category = [source_subject_category]
+        if isinstance(target_subject_category, str):
+            target_subject_category = [target_subject_category]
+
+        batch_size = len(prompt)
+
+        prompt = self._build_prompt(
+            prompts=prompt,
+            tgt_subjects=target_subject_category,
+            prompt_strength=prompt_strength,
+            prompt_reps=prompt_reps,
+        )
+        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
+        text_embeddings = self.encode_prompt(query_embeds, prompt, device)
+        do_classifier_free_guidance = guidance_scale > 1.0
+        if do_classifier_free_guidance:
+            max_length = self.text_encoder.text_model.config.max_position_embeddings
+
+            uncond_input = self.tokenizer(
+                [neg_prompt] * batch_size,
+                padding="max_length",
+                max_length=max_length,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(
+                input_ids=uncond_input.input_ids.to(device),
+                ctx_embeddings=None,
+            )[0]
+            # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1) + latents = self.prepare_latents( + batch_size=batch_size, + num_channels=self.unet.config.in_channels, + height=height // scale_down_factor, + width=width // scale_down_factor, + generator=generator, + latents=latents, + dtype=self.unet.dtype, + device=device, + ) + # set timesteps + extra_set_kwargs = {} + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + # expand the latents if we are doing classifier free guidance + do_classifier_free_guidance = guidance_scale > 1.0 + + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + noise_pred = self.unet( + latent_model_input, + timestep=t, + encoder_hidden_states=text_embeddings, + down_block_additional_residuals=None, + mid_block_additional_residual=None, + )["sample"] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + latents = self.scheduler.step( + noise_pred, + t, + latents, + )["prev_sample"] + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/__init__.py new file mode 100644 index 000000000..162d91c01 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/__init__.py @@ -0,0 +1,24 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + _LazyModule, +) + + +_import_structure = { + "pipeline_consistency_models": ["ConsistencyModelPipeline"], +} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_consistency_models import ConsistencyModelPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py new file mode 100644 index 000000000..befac79c6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -0,0 +1,275 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional, Union + +import torch + +from ...models import UNet2DModel +from ...schedulers import CMStochasticIterativeScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + + >>> from diffusers import ConsistencyModelPipeline + + >>> device = "cuda" + >>> # Load the cd_imagenet64_l2 checkpoint. + >>> model_id_or_path = "openai/diffusers-cd_imagenet64_l2" + >>> pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) + >>> pipe.to(device) + + >>> # Onestep Sampling + >>> image = pipe(num_inference_steps=1).images[0] + >>> image.save("cd_imagenet64_l2_onestep_sample.png") + + >>> # Onestep sampling, class-conditional image generation + >>> # ImageNet-64 class label 145 corresponds to king penguins + >>> image = pipe(num_inference_steps=1, class_labels=145).images[0] + >>> image.save("cd_imagenet64_l2_onestep_sample_penguin.png") + + >>> # Multistep sampling, class-conditional image generation + >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original Github repo: + >>> # https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77 + >>> image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=145).images[0] + >>> image.save("cd_imagenet64_l2_multistep_sample_penguin.png") + ``` +""" + + +class ConsistencyModelPipeline(DiffusionPipeline): + r""" + Pipeline for unconditional or class-conditional image generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only + compatible with [`CMStochasticIterativeScheduler`]. + """ + + model_cpu_offload_seq = "unet" + + def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) -> None: + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + ) + + self.safety_checker = None + + def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Follows diffusers.VaeImageProcessor.postprocess + def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"): + if output_type not in ["pt", "np", "pil"]: + raise ValueError( + f"output_type={output_type} is not supported. Make sure to choose one of ['pt', 'np', or 'pil']" + ) + + # Equivalent to diffusers.VaeImageProcessor.denormalize + sample = (sample / 2 + 0.5).clamp(0, 1) + if output_type == "pt": + return sample + + # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy + sample = sample.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "np": + return sample + + # Output_type must be 'pil' + sample = self.numpy_to_pil(sample) + return sample + + def prepare_class_labels(self, batch_size, device, class_labels=None): + if self.unet.config.num_class_embeds is not None: + if isinstance(class_labels, list): + class_labels = torch.tensor(class_labels, dtype=torch.int) + elif isinstance(class_labels, int): + assert batch_size == 1, "Batch size must be 1 if classes is an int" + class_labels = torch.tensor([class_labels], dtype=torch.int) + elif class_labels is None: + # Randomly generate batch_size class labels + # TODO: should use generator here? int analogue of randn_tensor is not exposed in ...utils + class_labels = torch.randint(0, self.unet.config.num_class_embeds, size=(batch_size,)) + class_labels = class_labels.to(device) + else: + class_labels = None + return class_labels + + def check_inputs(self, num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps): + if num_inference_steps is None and timesteps is None: + raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.") + + if num_inference_steps is not None and timesteps is not None: + logger.warning( + f"Both `num_inference_steps`: {num_inference_steps} and `timesteps`: {timesteps} are supplied;" + " `timesteps` will be used over `num_inference_steps`." + ) + + if latents is not None: + expected_shape = (batch_size, 3, img_size, img_size) + if latents.shape != expected_shape: + raise ValueError(f"The shape of latents is {latents.shape} but is expected to be {expected_shape}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + batch_size: int = 1, + class_labels: Optional[Union[torch.Tensor, List[int], int]] = None, + num_inference_steps: int = 1, + timesteps: List[int] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of images to generate. + class_labels (`torch.Tensor` or `List[int]` or `int`, *optional*): + Optional class labels for conditioning class-conditional consistency models. 
Not used if the model is + not class-conditional. + num_inference_steps (`int`, *optional*, defaults to 1): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + # 0. Prepare call parameters + img_size = self.unet.config.sample_size + device = self._execution_device + + # 1. Check inputs + self.check_inputs(num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps) + + # 2. Prepare image latents + # Sample image latents x_0 ~ N(0, sigma_0^2 * I) + sample = self.prepare_latents( + batch_size=batch_size, + num_channels=self.unet.config.in_channels, + height=img_size, + width=img_size, + dtype=self.unet.dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 3. Handle class_labels for class-conditional models + class_labels = self.prepare_class_labels(batch_size, device, class_labels=class_labels) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + + # 5. Denoising loop + # Multistep sampling: implements Algorithm 1 in the paper + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + scaled_sample = self.scheduler.scale_model_input(sample, t) + model_output = self.unet(scaled_sample, t, class_labels=class_labels, return_dict=False)[0] + + sample = self.scheduler.step(model_output, t, sample, generator=generator)[0] + + # call the callback, if provided + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, sample) + + # 6. 
Post-process image sample + image = self.postprocess_image(sample, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/__init__.py new file mode 100644 index 000000000..3b832c017 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/__init__.py @@ -0,0 +1,80 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_flax_available, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["multicontrolnet"] = ["MultiControlNetModel"] + _import_structure["pipeline_controlnet"] = ["StableDiffusionControlNetPipeline"] + _import_structure["pipeline_controlnet_blip_diffusion"] = ["BlipDiffusionControlNetPipeline"] + _import_structure["pipeline_controlnet_img2img"] = ["StableDiffusionControlNetImg2ImgPipeline"] + _import_structure["pipeline_controlnet_inpaint"] = ["StableDiffusionControlNetInpaintPipeline"] + _import_structure["pipeline_controlnet_inpaint_sd_xl"] = ["StableDiffusionXLControlNetInpaintPipeline"] + _import_structure["pipeline_controlnet_sd_xl"] = ["StableDiffusionXLControlNetPipeline"] + _import_structure["pipeline_controlnet_sd_xl_img2img"] = ["StableDiffusionXLControlNetImg2ImgPipeline"] +try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_flax_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects)) +else: + _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .multicontrolnet import MultiControlNetModel + from .pipeline_controlnet import StableDiffusionControlNetPipeline + from .pipeline_controlnet_blip_diffusion import BlipDiffusionControlNetPipeline + from .pipeline_controlnet_img2img import StableDiffusionControlNetImg2ImgPipeline + from .pipeline_controlnet_inpaint import StableDiffusionControlNetInpaintPipeline + from .pipeline_controlnet_inpaint_sd_xl import StableDiffusionXLControlNetInpaintPipeline + from .pipeline_controlnet_sd_xl import StableDiffusionXLControlNetPipeline + from .pipeline_controlnet_sd_xl_img2img import StableDiffusionXLControlNetImg2ImgPipeline + + try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_flax_and_transformers_objects import * # 
noqa F403 + else: + from .pipeline_flax_controlnet import FlaxStableDiffusionControlNetPipeline + + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py new file mode 100644 index 000000000..7d284f2d2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py @@ -0,0 +1,187 @@ +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn + +from ...models.controlnet import ControlNetModel, ControlNetOutput +from ...models.modeling_utils import ModelMixin +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class MultiControlNetModel(ModelMixin): + r""" + Multiple `ControlNetModel` wrapper class for Multi-ControlNet + + This module is a wrapper for multiple instances of the `ControlNetModel`. The `forward()` API is designed to be + compatible with `ControlNetModel`. + + Args: + controlnets (`List[ControlNetModel]`): + Provides additional conditioning to the unet during the denoising process. You must set multiple + `ControlNetModel` as a list. + """ + + def __init__(self, controlnets: Union[List[ControlNetModel], Tuple[ControlNetModel]]): + super().__init__() + self.nets = nn.ModuleList(controlnets) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + controlnet_cond: List[torch.tensor], + conditioning_scale: List[float], + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[ControlNetOutput, Tuple]: + for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)): + down_samples, mid_sample = controlnet( + sample=sample, + timestep=timestep, + encoder_hidden_states=encoder_hidden_states, + controlnet_cond=image, + conditioning_scale=scale, + class_labels=class_labels, + timestep_cond=timestep_cond, + attention_mask=attention_mask, + added_cond_kwargs=added_cond_kwargs, + cross_attention_kwargs=cross_attention_kwargs, + guess_mode=guess_mode, + return_dict=return_dict, + ) + + # merge samples + if i == 0: + down_block_res_samples, mid_block_res_sample = down_samples, mid_sample + else: + down_block_res_samples = [ + samples_prev + samples_curr + for samples_prev, samples_curr in zip(down_block_res_samples, down_samples) + ] + mid_block_res_sample += mid_sample + + return down_block_res_samples, mid_block_res_sample + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = True, + variant: Optional[str] = None, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + `[`~pipelines.controlnet.MultiControlNetModel.from_pretrained`]` class method. 
+ + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful when in distributed training like + TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on + the main process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace `torch.save` by another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + variant (`str`, *optional*): + If specified, weights are saved in the format pytorch_model..bin. + """ + idx = 0 + model_path_to_save = save_directory + for controlnet in self.nets: + controlnet.save_pretrained( + model_path_to_save, + is_main_process=is_main_process, + save_function=save_function, + safe_serialization=safe_serialization, + variant=variant, + ) + + idx += 1 + model_path_to_save = model_path_to_save + f"_{idx}" + + @classmethod + def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a pretrained MultiControlNet model from multiple pre-trained controlnet models. + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train + the model, you should first set it back in training mode with `model.train()`. + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_path (`os.PathLike`): + A path to a *directory* containing model weights saved using + [`~diffusers.pipelines.controlnet.MultiControlNetModel.save_pretrained`], e.g., + `./my_model_directory/controlnet`. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype + will be automatically derived from the model's weights. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be refined to each + parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the + same device. + + To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier to maximum memory. Will default to the maximum memory available for each + GPU and the available CPU RAM if unset. 
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading by not initializing the weights and only loading the pre-trained weights. This + also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the + model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch, + setting this argument to `True` will raise an error. + variant (`str`, *optional*): + If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is + ignored when using `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from + `safetensors` weights. If set to `False`, loading will *not* use `safetensors`. + """ + idx = 0 + controlnets = [] + + # load controlnet and append to list until no controlnet directory exists anymore + # first controlnet has to be saved under `./mydirectory/controlnet` to be compliant with `DiffusionPipeline.from_prertained` + # second, third, ... controlnets have to be saved under `./mydirectory/controlnet_1`, `./mydirectory/controlnet_2`, ... + model_path_to_load = pretrained_model_path + while os.path.isdir(model_path_to_load): + controlnet = ControlNetModel.from_pretrained(model_path_to_load, **kwargs) + controlnets.append(controlnet) + + idx += 1 + model_path_to_load = pretrained_model_path + f"_{idx}" + + logger.info(f"{len(controlnets)} controlnets loaded from {pretrained_model_path}.") + + if len(controlnets) == 0: + raise ValueError( + f"No ControlNets found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}." + ) + + return cls(controlnets) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py new file mode 100644 index 000000000..8f31dfc26 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -0,0 +1,1318 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from .multicontrolnet import MultiControlNetModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install opencv-python transformers accelerate + >>> from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> import cv2 + >>> from PIL import Image + + >>> # download an image + >>> image = load_image( + ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" + ... ) + >>> image = np.array(image) + + >>> # get canny image + >>> image = cv2.Canny(image, 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> # load control net and stable diffusion v1-5 + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + >>> pipe = StableDiffusionControlNetPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 + ... ) + + >>> # speed up diffusion process with faster scheduler and memory optimization + >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + >>> # remove following line if xformers is not installed + >>> pipe.enable_xformers_memory_efficient_attention() + + >>> pipe.enable_model_cpu_offload() + + >>> # generate image + >>> generator = torch.manual_seed(0) + >>> image = pipe( + ... "futuristic-looking woman", num_inference_steps=20, generator=generator, image=canny_image + ... ).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. 
If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionControlNetPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the `unet` during the denoising process. If you set multiple + ControlNets as a list, the outputs from each ControlNet are added together to create one combined + additional conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. 
+ feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + transposed_image = [list(t) for t in zip(*image)] + if len(transposed_image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: if you pass`image` as a list of list, each sublist must have the same length as the number of controlnets, but the sublists in `image` got {len(transposed_image)} images and {len(self.controlnet.nets)} ControlNets." + ) + for image_ in transposed_image: + self.check_image(image_, prompt, prompt_embeds) + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError( + "A single batch of varying conditioning scale settings (e.g. [[1.0, 0.5], [0.2, 0.8]]) is not supported at the moment. " + "The conditioning scale must be fixed across the batch." + ) + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. 
Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
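+    # Concretely, the denoising loop below combines the two UNet predictions as
+    #   noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond),
+    # and the extra unconditional pass is skipped when `guidance_scale <= 1` or when the UNet has
+    # `time_cond_proj_dim` set (guidance is then injected through a guidance-scale embedding instead).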
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single ControlNet. When `prompt` is a list, and if a list of images is passed for a single ControlNet, + each will be paired with each prompt in the `prompt` list. This also applies to multiple ControlNets, + where a list of image lists can be passed to batch for each prompt and each ControlNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. 
If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
+ controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. 
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + image, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare image + if isinstance(controlnet, ControlNetModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = image.shape[-2:] + elif isinstance(controlnet, MultiControlNetModel): + images = [] + + # Nested lists as ControlNet condition + if isinstance(image[0], list): + # Transpose the nested image list + image = [list(t) for t in zip(*image)] + + for image_ in image: + image_ = self.prepare_image( + image=image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + images.append(image_) + + image = images + height, width = image[0].shape[-2:] + else: + assert False + + # 5. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + is_unet_compiled = is_compiled_module(self.unet) + is_controlnet_compiled = is_compiled_module(self.controlnet) + is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1") + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1: + torch._inductor.cudagraph_mark_step_begin() + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # ControlNet was inferred only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged.
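+ # The concatenated batch is ordered [unconditional, conditional] (prompt_embeds was built above as
+ # torch.cat([negative_prompt_embeds, prompt_embeds])), so prepending zeros leaves the unconditional
+ # half free of ControlNet residuals while the conditional half receives them.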
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py new file mode 100644 index 000000000..b983a3f8d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py @@ -0,0 +1,413 @@ +# Copyright 2024 Salesforce.com, inc. +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPTokenizer + +from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel +from ...schedulers import PNDMScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..blip_diffusion.blip_image_processing import BlipImageProcessor +from ..blip_diffusion.modeling_blip2 import Blip2QFormerModel +from ..blip_diffusion.modeling_ctx_clip import ContextCLIPTextModel +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers.pipelines import BlipDiffusionControlNetPipeline + >>> from diffusers.utils import load_image + >>> from controlnet_aux import CannyDetector + >>> import torch + + >>> blip_diffusion_pipe = BlipDiffusionControlNetPipeline.from_pretrained( + ... "Salesforce/blipdiffusion-controlnet", torch_dtype=torch.float16 + ... ).to("cuda") + + >>> style_subject = "flower" + >>> tgt_subject = "teapot" + >>> text_prompt = "on a marble table" + + >>> cldm_cond_image = load_image( + ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/kettle.jpg" + ... ).resize((512, 512)) + >>> canny = CannyDetector() + >>> cldm_cond_image = canny(cldm_cond_image, 30, 70, output_type="pil") + >>> style_image = load_image( + ... "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/flower.jpg" + ... ) + >>> guidance_scale = 7.5 + >>> num_inference_steps = 50 + >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate" + + + >>> output = blip_diffusion_pipe( + ... text_prompt, + ... style_image, + ... cldm_cond_image, + ... style_subject, + ... tgt_subject, + ... guidance_scale=guidance_scale, + ... num_inference_steps=num_inference_steps, + ... neg_prompt=negative_prompt, + ... height=512, + ... width=512, + ... ).images + >>> output[0].save("image.png") + ``` +""" + + +class BlipDiffusionControlNetPipeline(DiffusionPipeline): + """ + Pipeline for Canny Edge based Controlled subject-driven generation using Blip Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + tokenizer ([`CLIPTokenizer`]): + Tokenizer for the text encoder + text_encoder ([`ContextCLIPTextModel`]): + Text encoder to encode the text prompt + vae ([`AutoencoderKL`]): + VAE model to map the latents to the image + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + scheduler ([`PNDMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. 
+ qformer ([`Blip2QFormerModel`]): + QFormer model to get multi-modal embeddings from the text and image. + controlnet ([`ControlNetModel`]): + ControlNet model to get the conditioning image embedding. + image_processor ([`BlipImageProcessor`]): + Image Processor to preprocess and postprocess the image. + ctx_begin_pos (int, `optional`, defaults to 2): + Position of the context token in the text encoder. + """ + + model_cpu_offload_seq = "qformer->text_encoder->unet->vae" + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: ContextCLIPTextModel, + vae: AutoencoderKL, + unet: UNet2DConditionModel, + scheduler: PNDMScheduler, + qformer: Blip2QFormerModel, + controlnet: ControlNetModel, + image_processor: BlipImageProcessor, + ctx_begin_pos: int = 2, + mean: List[float] = None, + std: List[float] = None, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + vae=vae, + unet=unet, + scheduler=scheduler, + qformer=qformer, + controlnet=controlnet, + image_processor=image_processor, + ) + self.register_to_config(ctx_begin_pos=ctx_begin_pos, mean=mean, std=std) + + def get_query_embeddings(self, input_image, src_subject): + return self.qformer(image_input=input_image, text_input=src_subject, return_dict=False) + + # from the original Blip Diffusion code, specifies the target subject and augments the prompt by repeating it + def _build_prompt(self, prompts, tgt_subjects, prompt_strength=1.0, prompt_reps=20): + rv = [] + for prompt, tgt_subject in zip(prompts, tgt_subjects): + prompt = f"a {tgt_subject} {prompt.strip()}" + # a trick to amplify the prompt + rv.append(", ".join([prompt] * int(prompt_strength * prompt_reps))) + + return rv + + # Copied from diffusers.pipelines.consistency_models.pipeline_consistency_models.ConsistencyModelPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def encode_prompt(self, query_embeds, prompt, device=None): + device = device or self._execution_device + + # embeddings for prompt, with query_embeds as context + max_len = self.text_encoder.text_model.config.max_position_embeddings + max_len -= self.qformer.config.num_query_tokens + + tokenized_prompt = self.tokenizer( + prompt, + padding="max_length", + truncation=True, + max_length=max_len, + return_tensors="pt", + ).to(device) + + batch_size = query_embeds.shape[0] + ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size + + text_embeddings = self.text_encoder( + input_ids=tokenized_prompt.input_ids, + ctx_embeddings=query_embeds, + ctx_begin_pos=ctx_begin_pos, + )[0] + + return text_embeddings + + # Adapted from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + ): + image = self.image_processor.preprocess( + image, + size={"width": width, "height": height}, + do_rescale=True, + do_center_crop=False, + do_normalize=False, + return_tensors="pt", + )["pixel_values"].to(device) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance: + image = torch.cat([image] * 2) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: List[str], + reference_image: PIL.Image.Image, + condtioning_image: PIL.Image.Image, + source_subject_category: List[str], + target_subject_category: List[str], + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 7.5, + height: int = 512, + width: int = 512, + num_inference_steps: int = 50, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + neg_prompt: Optional[str] = "", + prompt_strength: float = 1.0, + prompt_reps: int = 20, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`List[str]`): + The prompt or prompts to guide the image generation. + reference_image (`PIL.Image.Image`): + The reference image to condition the generation on. + condtioning_image (`PIL.Image.Image`): + The conditioning canny edge image to condition the generation on. + source_subject_category (`List[str]`): + The source subject category. + target_subject_category (`List[str]`): + The target subject category. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by random sampling. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+ `guidance_scale` is defined as `w` of equation 2 of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + height (`int`, *optional*, defaults to 512): + The height of the generated image. + width (`int`, *optional*, defaults to 512): + The width of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + neg_prompt (`str`, *optional*, defaults to ""): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_strength (`float`, *optional*, defaults to 1.0): + The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps + to amplify the prompt. + prompt_reps (`int`, *optional*, defaults to 20): + The number of times the prompt is repeated along with prompt_strength to amplify the prompt. + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + reference_image = self.image_processor.preprocess( + reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt" + )["pixel_values"] + reference_image = reference_image.to(device) + + if isinstance(prompt, str): + prompt = [prompt] + if isinstance(source_subject_category, str): + source_subject_category = [source_subject_category] + if isinstance(target_subject_category, str): + target_subject_category = [target_subject_category] + + batch_size = len(prompt) + + prompt = self._build_prompt( + prompts=prompt, + tgt_subjects=target_subject_category, + prompt_strength=prompt_strength, + prompt_reps=prompt_reps, + ) + query_embeds = self.get_query_embeddings(reference_image, source_subject_category) + text_embeddings = self.encode_prompt(query_embeds, prompt, device) + # 3. unconditional embedding + do_classifier_free_guidance = guidance_scale > 1.0 + if do_classifier_free_guidance: + max_length = self.text_encoder.text_model.config.max_position_embeddings + + uncond_input = self.tokenizer( + [neg_prompt] * batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt", + ) + uncond_embeddings = self.text_encoder( + input_ids=uncond_input.input_ids.to(device), + ctx_embeddings=None, + )[0] + # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + scale_down_factor = 2 ** (len(self.unet.config.block_out_channels) - 1) + latents = self.prepare_latents( + batch_size=batch_size, + num_channels=self.unet.config.in_channels, + height=height // scale_down_factor, + width=width // scale_down_factor, + generator=generator, + latents=latents, + dtype=self.unet.dtype, + device=device, + ) + # set timesteps + extra_set_kwargs = {} + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + + cond_image = self.prepare_control_image( + image=condtioning_image, + width=width, + height=height, + batch_size=batch_size, + num_images_per_prompt=1, + device=device, + dtype=self.controlnet.dtype, + do_classifier_free_guidance=do_classifier_free_guidance, + ) + + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + # expand the latents if we are doing classifier free guidance + do_classifier_free_guidance = guidance_scale > 1.0 + + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + down_block_res_samples, mid_block_res_sample = self.controlnet( + latent_model_input, + t, + encoder_hidden_states=text_embeddings, + controlnet_cond=cond_image, + return_dict=False, + ) + + noise_pred = self.unet( + latent_model_input, + timestep=t, + encoder_hidden_states=text_embeddings, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + )["sample"] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + latents = self.scheduler.step( + noise_pred, + t, + latents, + )["prev_sample"] + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py new file mode 100644 index 000000000..9d2c76fd7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -0,0 +1,1310 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
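+# Implements StableDiffusionControlNetImg2ImgPipeline: image-to-image generation with Stable Diffusion,
+# conditioned on one or more ControlNets (see the class docstring below for details).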
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from .multicontrolnet import MultiControlNetModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install opencv-python transformers accelerate + >>> from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> import cv2 + >>> from PIL import Image + + >>> # download an image + >>> image = load_image( + ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png" + ... ) + >>> np_image = np.array(image) + + >>> # get canny image + >>> np_image = cv2.Canny(np_image, 100, 200) + >>> np_image = np_image[:, :, None] + >>> np_image = np.concatenate([np_image, np_image, np_image], axis=2) + >>> canny_image = Image.fromarray(np_image) + + >>> # load control net and stable diffusion v1-5 + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + >>> pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 + ... ) + + >>> # speed up diffusion process with faster scheduler and memory optimization + >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe.enable_model_cpu_offload() + + >>> # generate image + >>> generator = torch.manual_seed(0) + >>> image = pipe( + ... "futuristic-looking woman", + ... num_inference_steps=20, + ... generator=generator, + ... image=image, + ... control_image=canny_image, + ... 
).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def prepare_image(image): + if isinstance(image, torch.Tensor): + # Batch single image + if image.ndim == 3: + image = image.unsqueeze(0) + + image = image.to(dtype=torch.float32) + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + return image + + +class StableDiffusionControlNetImg2ImgPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for image-to-image generation using Stable Diffusion with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the `unet` during the denoising process. If you set multiple + ControlNets as a list, the outputs from each ControlNet are added together to create one combined + additional conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. 
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
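+ Returns: + A tuple `(prompt_embeds, negative_prompt_embeds)`; `negative_prompt_embeds` may be `None` when + classifier-free guidance is not used.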
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. 
Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + control_image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.8, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The initial image to be used as the starting point for the image generation process. Can also accept + image latents as `image`, and if passing latents directly they are not encoded again. 
+ control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single ControlNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-Adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the ControlNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the ControlNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. 
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + control_image, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare image + image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + + # 5. Prepare controlnet_conditioning_image + if isinstance(controlnet, ControlNetModel): + control_image = self.prepare_control_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + control_images = [] + + for control_image_ in control_image: + control_image_ = self.prepare_control_image( + image=control_image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + control_images.append(control_image_) + + control_image = control_images + else: + assert False + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. 
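+ # In guess mode with classifier-free guidance, the ControlNet sees only the conditional half of the batch:
+ # `control_model_input` stays un-doubled and only the text chunk of `prompt_embeds` is passed below, while
+ # zero residuals are concatenated for the unconditional half right after the ControlNet call.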
+ control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=control_image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Inferred ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = 
self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py new file mode 100644 index 000000000..c4f1bff5e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -0,0 +1,1620 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/ + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from .multicontrolnet import MultiControlNetModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install transformers accelerate + >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png" + ... ) + >>> init_image = init_image.resize((512, 512)) + + >>> generator = torch.Generator(device="cpu").manual_seed(1) + + >>> mask_image = load_image( + ... "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png" + ... ) + >>> mask_image = mask_image.resize((512, 512)) + + + >>> def make_canny_condition(image): + ... 
image = np.array(image) + ... image = cv2.Canny(image, 100, 200) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... image = Image.fromarray(image) + ... return image + + + >>> control_image = make_canny_condition(init_image) + + >>> controlnet = ControlNetModel.from_pretrained( + ... "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16 + ... ) + >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 + ... ) + + >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + >>> pipe.enable_model_cpu_offload() + + >>> # generate image + >>> image = pipe( + ... "a handsome man with ray-ban sunglasses", + ... num_inference_steps=20, + ... generator=generator, + ... eta=1.0, + ... image=init_image, + ... mask_image=mask_image, + ... control_image=control_image, + ... ).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.prepare_mask_and_masked_image +def prepare_mask_and_masked_image(image, mask, height, width, return_image=False): + """ + Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be + converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the + ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (Union[np.array, PIL.Image, torch.Tensor]): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (or the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. + """ + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please use VaeImageProcessor.preprocess instead" + deprecate( + "prepare_mask_and_masked_image", + "0.30.0", + deprecation_message, + ) + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + masked_image = image * (mask < 0.5) + + # n.b. ensure backwards compatibility as old function does not return image + if return_image: + return mask, masked_image, image + + return mask, masked_image + + +class StableDiffusionControlNetInpaintPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for image inpainting using Stable Diffusion with ControlNet guidance. 
+ + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + + + This pipeline can be used with checkpoints that have been specifically fine-tuned for inpainting + ([runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting)) as well as + default text-to-image Stable Diffusion checkpoints + ([runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)). Default text-to-image + Stable Diffusion checkpoints might be preferable for ControlNets that have been fine-tuned on those, such as + [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint). + + + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the `unet` during the denoising process. If you set multiple + ControlNets as a list, the outputs from each ControlNet are added together to create one combined + additional conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def check_inputs( + self, + prompt, + image, + mask_image, + height, + width, + callback_steps, + output_type, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + padding_mask_crop=None, + ): + if height is not None and height % 8 != 0 or width is not None and width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if padding_mask_crop is not None: + if not isinstance(image, PIL.Image.Image): + raise ValueError( + f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}." + ) + if not isinstance(mask_image, PIL.Image.Image): + raise ValueError( + f"The mask image should be a PIL image when inpainting mask crop, but is of type" + f" {type(mask_image)}." + ) + if output_type != "pil": + raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." 
+ ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + crops_coords, + resize_mode, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_latents + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + image_latents = image + else: + image_latents = self._encode_vae_image(image=image, generator=generator) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + else: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + masked_image = masked_image.to(device=device, dtype=dtype) + + if masked_image.shape[1] == 4: + masked_image_latents = masked_image + else: + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + return mask, masked_image_latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + control_image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + padding_mask_crop: Optional[int] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.5, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
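As an aside on the `guidance_scale` weight and the `do_classifier_free_guidance` property defined just above this docstring: the denoising loop later in this file combines the unconditional and text-conditioned noise predictions with exactly this weight. The snippet below is only a minimal standalone sketch of that update, not a helper of the pipeline itself.

```py
import torch

def cfg_combine(noise_pred_uncond: torch.Tensor,
                noise_pred_text: torch.Tensor,
                guidance_scale: float) -> torch.Tensor:
    # Classifier-free guidance: push the prediction away from the unconditional
    # branch and toward the text-conditioned branch by the guidance weight `w`.
    return noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```

With `guidance_scale = 1.0` the expression collapses to `noise_pred_text`, which is why the property above only reports guidance as enabled for values strictly greater than 1.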
+ image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`,
+ `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ `Image`, NumPy array or tensor representing an image batch to be used as the starting point. For both
+ NumPy array and PyTorch tensor, the expected value range is between `[0, 1]`. If it's a tensor or a
+ list of tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a NumPy array or
+ a list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)`. It can also accept image
+ latents as `image`, but if passing latents directly it is not encoded again.
+ mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`,
+ `List[PIL.Image.Image]`, or `List[np.ndarray]`):
+ `Image`, NumPy array or tensor representing an image batch to mask `image`. White pixels in the mask
+ are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a
+ single channel (luminance) before use. If it's a NumPy array or PyTorch tensor, it should contain one
+ color channel (L) instead of 3, so the expected shape for PyTorch tensor would be `(B, 1, H, W)`, `(B,
+ H, W)`, `(1, H, W)`, `(H, W)`. And for NumPy array, it would be `(B, H, W, 1)`, `(B, H, W)`, `(H,
+ W, 1)`, or `(H, W)`.
+ control_image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`,
+ `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`):
+ The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+ specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be
+ accepted as an image. The dimensions of the output image default to `image`'s dimensions. If height
+ and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in
+ `init`, images must be passed as a list such that each element of the list can be correctly batched for
+ input to a single ControlNet.
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+ The height in pixels of the generated image.
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
+ The width in pixels of the generated image.
+ padding_mask_crop (`int`, *optional*, defaults to `None`):
+ The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If
+ `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that
+ contains all of the masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
+ the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
+ and contains information irrelevant for inpainting, such as background.
+ strength (`float`, *optional*, defaults to 1.0):
+ Indicates the extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+ essentially ignores `image`.
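The `get_timesteps` helper that implements this `strength` behaviour is not part of this hunk, so the snippet below is only an illustrative sketch of the usual diffusers img2img/inpaint mapping described above, not the pipeline's own code.

```py
from typing import Sequence, Tuple

def effective_timesteps(
    scheduler_timesteps: Sequence[int], num_inference_steps: int, strength: float
) -> Tuple[Sequence[int], int]:
    # Keep only the last `num_inference_steps * strength` timesteps of the schedule:
    # strength=1.0 denoises from pure noise over the full schedule, while smaller
    # values start part-way through and preserve more of `image`.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return scheduler_timesteps[t_start:], num_inference_steps - t_start

# Example: 50 inference steps with strength=0.6 -> 30 denoising steps.
```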
+ num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 0.5): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. 
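As a conceptual sketch of what this scale does: the actual multiplication happens inside the ControlNet forward pass, which receives it as `conditioning_scale` further down in the denoising loop, and with a `MultiControlNetModel` a list assigns one scale per ControlNet. The helper below is an illustration of that weighting, not part of the pipeline.

```py
from typing import List
import torch

def scale_controlnet_residuals(
    down_block_res_samples: List[torch.Tensor],
    mid_block_res_sample: torch.Tensor,
    conditioning_scale: float,
):
    # Each residual produced by a ControlNet is weighted by its conditioning scale
    # before being added to the UNet's own down/mid block residuals.
    scaled_down = [sample * conditioning_scale for sample in down_block_res_samples]
    scaled_mid = mid_block_res_sample * conditioning_scale
    return scaled_down, scaled_mid

# e.g. controlnet_conditioning_scale=[1.0, 0.5] with two ControlNets applies
# full-strength residuals from the first and half-strength from the second.
```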
+ guess_mode (`bool`, *optional*, defaults to `False`):
+ The ControlNet encoder tries to recognize the content of the input image even if you remove all
+ prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+ The percentage of total steps at which the ControlNet starts applying.
+ control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+ The percentage of total steps at which the ControlNet stops applying.
+ clip_skip (`int`, *optional*):
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+ the output of the pre-final layer will be used for computing the prompt embeddings.
+ callback_on_step_end (`Callable`, *optional*):
+ A function that is called at the end of each denoising step during inference. The function is called
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+ `callback_on_step_end_tensor_inputs`.
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+ will be passed as the `callback_kwargs` argument. You will only be able to include variables listed in the
+ `._callback_tensor_inputs` attribute of your pipeline class.
+
+ Examples:
+
+ Returns:
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
+ "not-safe-for-work" (nsfw) content.
+ """
+
+ callback = kwargs.pop("callback", None)
+ callback_steps = kwargs.pop("callback_steps", None)
+
+ if callback is not None:
+ deprecate(
+ "callback",
+ "1.0.0",
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+ )
+ if callback_steps is not None:
+ deprecate(
+ "callback_steps",
+ "1.0.0",
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
+ )
+
+ controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet
+
+ # align format for control guidance
+ if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list):
+ control_guidance_start = len(control_guidance_end) * [control_guidance_start]
+ elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list):
+ control_guidance_end = len(control_guidance_start) * [control_guidance_end]
+ elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list):
+ mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1
+ control_guidance_start, control_guidance_end = (
+ mult * [control_guidance_start],
+ mult * [control_guidance_end],
+ )
+
+ # 1. Check inputs.
Raise error if not correct + self.check_inputs( + prompt, + control_image, + mask_image, + height, + width, + callback_steps, + output_type, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + padding_mask_crop, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if padding_mask_crop is not None: + height, width = self.image_processor.get_default_height_width(image, height, width) + crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare image + if isinstance(controlnet, ControlNetModel): + control_image = self.prepare_control_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + crops_coords=crops_coords, + resize_mode=resize_mode, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + control_images = [] + + for control_image_ in control_image: + control_image_ = self.prepare_control_image( + image=control_image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + crops_coords=crops_coords, + resize_mode=resize_mode, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + control_images.append(control_image_) + + control_image = control_images + else: + assert False + + # 4.1 Preprocess mask and image - resizes image and mask w.r.t height and width + original_image = image + init_image = self.image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess( + mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + + masked_image = init_image * (mask < 0.5) + _, _, height, width = init_image.shape + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps=num_inference_steps, strength=strength, device=device + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + latents_outputs = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, + ) + + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + + # 7. Prepare mask latent variables + mask, masked_image_latents = self.prepare_mask_latents( + mask, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + prompt_embeds.dtype, + device, + generator, + self.do_classifier_free_guidance, + ) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=control_image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. 
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + # predict the noise residual + if num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if num_channels_unet == 4: + init_latents_proper = image_latents + if self.do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + if padding_mask_crop is not None: + image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image] + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py new file mode 100644 index 000000000..52ffe5a3f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py @@ -0,0 +1,1818 @@ +# Copyright 2024 Harutatsu Akiyama, Jinbin Bai, and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + is_invisible_watermark_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput +from .multicontrolnet import MultiControlNetModel + + +if is_invisible_watermark_available(): + from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install transformers accelerate + >>> from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, DDIMScheduler + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> init_image = load_image( + ... 
"https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png" + ... ) + >>> init_image = init_image.resize((1024, 1024)) + + >>> generator = torch.Generator(device="cpu").manual_seed(1) + + >>> mask_image = load_image( + ... "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png" + ... ) + >>> mask_image = mask_image.resize((1024, 1024)) + + + >>> def make_canny_condition(image): + ... image = np.array(image) + ... image = cv2.Canny(image, 100, 200) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... image = Image.fromarray(image) + ... return image + + + >>> control_image = make_canny_condition(init_image) + + >>> controlnet = ControlNetModel.from_pretrained( + ... "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16 + ... ) + >>> pipe = StableDiffusionXLControlNetInpaintPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16 + ... ) + + >>> pipe.enable_model_cpu_offload() + + >>> # generate image + >>> image = pipe( + ... "a handsome man with ray-ban sunglasses", + ... num_inference_steps=20, + ... generator=generator, + ... eta=1.0, + ... image=init_image, + ... mask_image=mask_image, + ... control_image=control_image, + ... ).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class StableDiffusionXLControlNetInpaintPipeline( + DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. 
Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: ControlNetModel, + scheduler: KarrasDiffusionSchedulers, + requires_aesthetics_score: bool = False, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + ): + super().__init__() + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + 
num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
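As a hedged usage sketch of this method's four return values (assuming `pipe` is an already-constructed instance of the SDXL ControlNet inpaint pipeline shown in the example docstring above; the keyword names follow the signature documented here):

```py
(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = pipe.encode_prompt(
    prompt="a castle on a hill, detailed oil painting",
    prompt_2=None,                     # falls back to `prompt` for tokenizer_2 / text_encoder_2
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,  # also produces the negative embeddings
    clip_skip=None,                    # None -> the penultimate hidden state is used
)
```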
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def check_inputs( + self, + prompt, + prompt_2, + image, + mask_image, + strength, + num_inference_steps, + callback_steps, + output_type, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + padding_mask_crop=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if num_inference_steps is None: + raise ValueError("`num_inference_steps` cannot be None.") + elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" {type(num_inference_steps)}." + ) + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if padding_mask_crop is not None: + if not isinstance(image, PIL.Image.Image): + raise ValueError( + f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}." + ) + if not isinstance(mask_image, PIL.Image.Image): + raise ValueError( + f"The mask image should be a PIL image when inpainting mask crop, but is of type" + f" {type(mask_image)}." + ) + if output_type != "pil": + raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. 
+ if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." 
+ ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + crops_coords, + resize_mode, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + add_noise=True, + return_noise=False, + return_image_latents=False, + ): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + image_latents = image + else: + image_latents = self._encode_vae_image(image=image, generator=generator) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + + if latents is None and add_noise: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + elif add_noise: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + else: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = image_latents.to(device) + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + dtype = image.dtype + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + image_latents = image_latents.to(dtype) + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + + masked_image_latents = None + if masked_image is not None: + masked_image = masked_image.to(device=device, dtype=dtype) + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.repeat( + batch_size // masked_image_latents.shape[0], 1, 1, 1 + ) + + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + + return mask, masked_image_latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): + # get the original timestep using init_timestep + if denoising_start is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + else: + t_start = 0 + + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + # Strength is irrelevant if we directly request a timestep to start at; + # that is, strength is determined by the denoising_start instead. + if denoising_start is not None: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_start * self.scheduler.config.num_train_timesteps) + ) + ) + + num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item() + if self.scheduler.order == 2 and num_inference_steps % 2 == 0: + # if the scheduler is a 2nd order scheduler we might have to do +1 + # because `num_inference_steps` might be even given that every timestep + # (except the highest one) is duplicated. If `num_inference_steps` is even it would + # mean that we cut the timesteps in the middle of the denoising step + # (between 1st and 2nd devirative) which leads to incorrect results. By adding 1 + # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler + num_inference_steps = num_inference_steps + 1 + + # because t_n+1 >= t_n, we slice the timesteps starting from the end + timesteps = timesteps[-num_inference_steps:] + return timesteps, num_inference_steps + + return timesteps, num_inference_steps - t_start + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,)) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if ( + expected_add_embed_dim > passed_add_embed_dim + and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. 
Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + ) + elif ( + expected_add_embed_dim < passed_add_embed_dim + and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." + ) + elif expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
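+ # Illustrative note (added for clarity, not upstream code): in the denoising loop below the two halves
+ # of the batched prediction are combined as
+ #   noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ # and at guidance_scale = 1 this reduces to the conditional prediction alone, so this property only
+ # reports "guidance enabled" for values strictly greater than 1.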
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + control_image: Union[ + PipelineImageInput, + List[PipelineImageInput], + ] = None, + height: Optional[int] = None, + width: Optional[int] = None, + padding_mask_crop: Optional[int] = None, + strength: float = 0.9999, + num_inference_steps: int = 50, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + guidance_rescale: float = 0.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + aesthetic_score: float = 6.0, + negative_aesthetic_score: float = 2.5, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. 
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + padding_mask_crop (`int`, *optional*, defaults to `None`): + The size of margin in the crop to be applied to the image and masking. If `None`, no crop is applied to image and mask_image. If + `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that + contains all of the masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on + the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large + and contains information irrelevant for inpainting, such as the background. + strength (`float`, *optional*, defaults to 0.9999): + Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be + between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the + `strength`. The number of denoising steps depends on the amount of noise initially added. When + `strength` is 1, added noise will be maximum and the denoising process will run for the full number of + iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked + portion of the reference `image`. Note that in the case of `denoising_start` being declared as an + integer, the value of `strength` will be ignored. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + denoising_start (`float`, *optional*): + When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be + bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and + it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, + strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline + is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be + denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the + final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2 of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-Adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. 
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # # 0.0 Default height and width to unet + # height = height or self.unet.config.sample_size * self.vae_scale_factor + # width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 0.1 align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs + self.check_inputs( + prompt, + prompt_2, + control_image, + mask_image, + strength, + num_inference_steps, + callback_steps, + output_type, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + padding_mask_crop, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + # 3. 
Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # 3.1 Encode ip_adapter_image + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. set timesteps + def denoising_value_valid(dnv): + return isinstance(dnv, float) and 0 < dnv < 1 + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, + strength, + device, + denoising_start=denoising_start if denoising_value_valid(denoising_start) else None, + ) + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 + self._num_timesteps = len(timesteps) + + # 5. 
Preprocess mask and image - resizes image and mask w.r.t height and width + # 5.1 Prepare init image + if padding_mask_crop is not None: + height, width = self.image_processor.get_default_height_width(image, height, width) + crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" + + original_image = image + init_image = self.image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ) + init_image = init_image.to(dtype=torch.float32) + + # 5.2 Prepare control images + if isinstance(controlnet, ControlNetModel): + control_image = self.prepare_control_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + crops_coords=crops_coords, + resize_mode=resize_mode, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + control_images = [] + + for control_image_ in control_image: + control_image_ = self.prepare_control_image( + image=control_image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + crops_coords=crops_coords, + resize_mode=resize_mode, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + control_images.append(control_image_) + + control_image = control_images + else: + raise ValueError(f"{controlnet.__class__} is not supported.") + + # 5.3 Prepare mask + mask = self.mask_processor.preprocess( + mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + + masked_image = init_image * (mask < 0.5) + _, _, height, width = init_image.shape + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + + add_noise = True if denoising_start is None else False + latents_outputs = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + add_noise=add_noise, + return_noise=True, + return_image_latents=return_image_latents, + ) + + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + + # 7. Prepare mask latent variables + mask, masked_image_latents = self.prepare_mask_latents( + mask, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + prompt_embeds.dtype, + device, + generator, + self.do_classifier_free_guidance, + ) + + # 8. Check that sizes of mask, masked image and latents match + if num_channels_unet == 9: + # default case for runwayml/stable-diffusion-inpainting + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! 
The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + elif num_channels_unet != 4: + raise ValueError( + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." + ) + # 8.1 Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8.2 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + if isinstance(self.controlnet, MultiControlNetModel): + controlnet_keep.append(keeps) + else: + controlnet_keep.append(keeps[0]) + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 10. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) + add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device) + + # 11. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + if ( + denoising_end is not None + and denoising_start is not None + and denoising_value_valid(denoising_end) + and denoising_value_valid(denoising_start) + and denoising_start >= denoising_end + ): + raise ValueError( + f"`denoising_start`: {denoising_start} cannot be larger than or equal to `denoising_end`: " + + f" {denoising_end} when using type float." 
+ ) + elif denoising_end is not None and denoising_value_valid(denoising_end): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + controlnet_added_cond_kwargs = { + "text_embeds": add_text_embeds.chunk(2)[1], + "time_ids": add_time_ids.chunk(2)[1], + } + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_added_cond_kwargs = added_cond_kwargs + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + # # Resize control_image to match the size of the input to the controlnet + # if control_image.shape[-2:] != control_model_input.shape[-2:]: + # control_image = F.interpolate(control_image, size=control_model_input.shape[-2:], mode="bilinear", align_corners=False) + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=control_image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + added_cond_kwargs=controlnet_added_cond_kwargs, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. 
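+ # Shape sketch (illustrative assumption): in guess mode each residual `d` was computed from the
+ # conditional batch only, e.g. shape (batch, C, H, W); torch.cat([torch.zeros_like(d), d]) below
+ # doubles that to (2 * batch, C, H, W), with zeros feeding the unconditional half of the UNet input.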
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds + + if num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if num_channels_unet == 4: + init_latents_proper = image_latents + if self.do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.dtype == torch.float16 and self.vae.config.force_upcast: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + return StableDiffusionXLPipelineOutput(images=latents) + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + if padding_mask_crop is not None: + image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in 
image] + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py new file mode 100644 index 000000000..eca81083b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py @@ -0,0 +1,1499 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from diffusers.utils.import_utils import is_invisible_watermark_available + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + +from .multicontrolnet import MultiControlNetModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # !pip install opencv-python transformers accelerate + >>> from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL + >>> from diffusers.utils import load_image + >>> import numpy as np + >>> import torch + + >>> import cv2 + >>> from PIL import Image + + >>> prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" + >>> negative_prompt = "low quality, bad quality, sketches" + + >>> # download an image + >>> image = load_image( + ... "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" + ... 
) + + >>> # initialize the models and pipeline + >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization + >>> controlnet = ControlNetModel.from_pretrained( + ... "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16 + ... ) + >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16) + >>> pipe = StableDiffusionXLControlNetPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> # get canny image + >>> image = np.array(image) + >>> image = cv2.Canny(image, 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> # generate image + >>> image = pipe( + ... prompt, controlnet_conditioning_scale=controlnet_conditioning_scale, image=canny_image + ... ).images[0] + ``` +""" + + +class StableDiffusionXLControlNetPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]): + Second frozen text-encoder + ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + tokenizer_2 ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the `unet` during the denoising process. If you set multiple + ControlNets as a list, the outputs from each ControlNet are added together to create one combined + additional conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings should always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. 
+ add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark](https://github.com/ShieldMnt/invisible-watermark/) library to + watermark output images. If not defined, it defaults to `True` if the package is installed; otherwise no + watermarker is used. + """ + + # leave controlnet out on purpose because it iterates with unet + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "feature_extractor", + "image_encoder", + ] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
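+
+        A minimal sketch of reusing the returned embeddings in `__call__` (it assumes `pipe` is an already-loaded
+        `StableDiffusionXLControlNetPipeline` and `canny_image` is a prepared conditioning image):
+
+        ```py
+        (
+            prompt_embeds,
+            negative_prompt_embeds,
+            pooled_prompt_embeds,
+            negative_pooled_prompt_embeds,
+        ) = pipe.encode_prompt(prompt="a futuristic city at dusk", negative_prompt="low quality")
+        image = pipe(
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            pooled_prompt_embeds=pooled_prompt_embeds,
+            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+            image=canny_image,
+        ).images[0]
+        ```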
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
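+                    # For example, `clip_skip=1` selects `hidden_states[-3]`, i.e. one layer earlier
+                    # than the default penultimate output (`hidden_states[-2]`) used when `clip_skip` is None.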
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + image, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + negative_pooled_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." 
+ ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." 
+ ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.prepare_image + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+            )
+
+        add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
+        return add_time_ids
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae
+    def upcast_vae(self):
+        dtype = self.vae.dtype
+        self.vae.to(dtype=torch.float32)
+        use_torch_2_0_or_xformers = isinstance(
+            self.vae.decoder.mid_block.attentions[0].processor,
+            (
+                AttnProcessor2_0,
+                XFormersAttnProcessor,
+                LoRAXFormersAttnProcessor,
+                LoRAAttnProcessor2_0,
+            ),
+        )
+        # If xformers or torch 2.0 is used, the attention block does not need
+        # to be in float32, which can save a lot of memory.
+        if use_torch_2_0_or_xformers:
+            self.vae.post_quant_conv.to(dtype)
+            self.vae.decoder.conv_in.to(dtype)
+            self.vae.decoder.mid_block.to(dtype)
+
+    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
+    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
+        """
+        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+        Args:
+            w (`torch.Tensor`):
+                guidance scale values used to generate the embedding vectors
+            embedding_dim (`int`, *optional*, defaults to 512):
+                dimension of the embeddings to generate
+            dtype:
+                data type of the generated embeddings
+
+        Returns:
+            `torch.FloatTensor`: Embedding vectors with shape `(len(w), embedding_dim)`
+        """
+        assert len(w.shape) == 1
+        w = w * 1000.0
+
+        half_dim = embedding_dim // 2
+        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
+        emb = w.to(dtype)[:, None] * emb[None, :]
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+        if embedding_dim % 2 == 1:  # zero pad
+            emb = torch.nn.functional.pad(emb, (0, 1))
+        assert emb.shape == (w.shape[0], embedding_dim)
+        return emb
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def clip_skip(self):
+        return self._clip_skip
+
+    # here `guidance_scale` is defined analogously to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
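+    # For example, with `guidance_scale = 7.5` the denoising loop below combines the two noise
+    # predictions as `noise_pred_uncond + 7.5 * (noise_pred_text - noise_pred_uncond)`; with
+    # `guidance_scale <= 1`, or when `unet.config.time_cond_proj_dim` is set, the unconditional
+    # pass is skipped entirely.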
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple ControlNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single ControlNet. 
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 5.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2` + and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, pooled text embeddings are generated from the `prompt` input argument.
+            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt
+                weighting). If not provided, pooled `negative_prompt_embeds` are generated from the `negative_prompt`
+                input argument.
+            ip_adapter_image (`PipelineImageInput`, *optional*):
+                Optional image input to work with IP Adapters.
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number
+                of IP-Adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
+                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
+                provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that, if specified, is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The ControlNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be the
+                same as `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
+                otherwise a `tuple` is returned containing the output images.
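+
+        A minimal sketch of the multi-ControlNet call pattern described above (it assumes `pipe` was created with a
+        list of two ControlNets and that `canny_image` and `depth_image` are prepared conditioning images):
+
+        ```py
+        image = pipe(
+            "an astronaut riding a horse",
+            image=[canny_image, depth_image],  # one conditioning image per ControlNet
+            controlnet_conditioning_scale=[0.5, 0.8],  # one scale per ControlNet
+            control_guidance_start=[0.0, 0.0],
+            control_guidance_end=[1.0, 0.5],
+            num_inference_steps=30,
+        ).images[0]
+        ```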
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + image, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + negative_pooled_prompt_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3.1 Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt, + prompt_2, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # 3.2 Encode ip_adapter_image + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare image + if isinstance(controlnet, ControlNetModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = image.shape[-2:] + elif isinstance(controlnet, MultiControlNetModel): + images = [] + + for image_ in image: + image_ = self.prepare_image( + image=image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + images.append(image_) + + image = images + height, width = image[0].shape[-2:] + else: + assert False + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 7.2 Prepare added time ids & embeddings + if isinstance(image, list): + original_size = original_size or image[0].shape[-2:] + else: + original_size = original_size or image.shape[-2:] + target_size = target_size or (height, width) + + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + is_unet_compiled = is_compiled_module(self.unet) + is_controlnet_compiled = is_compiled_module(self.controlnet) + is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1") + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + if (is_unet_compiled and is_controlnet_compiled) and is_torch_higher_equal_2_1: + torch._inductor.cudagraph_mark_step_begin() + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. 
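+                    # The ControlNet runs on the un-doubled `latents` with only the conditional half of the
+                    # text embeddings; zeros are concatenated back for the unconditional half of its residuals
+                    # further below, so the unconditional UNet pass is left unchanged.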
+ control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + controlnet_added_cond_kwargs = { + "text_embeds": add_text_embeds.chunk(2)[1], + "time_ids": add_time_ids.chunk(2)[1], + } + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_added_cond_kwargs = added_cond_kwargs + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + added_cond_kwargs=controlnet_added_cond_kwargs, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # unscale/denormalize the latents + # denormalize with the 
mean and std if available and not None + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py new file mode 100644 index 000000000..86a0e2c57 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py @@ -0,0 +1,1626 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from diffusers.utils.import_utils import is_invisible_watermark_available + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + +from .multicontrolnet import MultiControlNetModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> # pip install accelerate transformers safetensors diffusers + + >>> import torch + >>> import numpy as np + >>> from PIL import Image + + >>> from transformers import DPTFeatureExtractor, DPTForDepthEstimation + >>> from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL + >>> from diffusers.utils import load_image + + + >>> depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda") + >>> feature_extractor = DPTFeatureExtractor.from_pretrained("Intel/dpt-hybrid-midas") + >>> controlnet = ControlNetModel.from_pretrained( + ... "diffusers/controlnet-depth-sdxl-1.0-small", + ... variant="fp16", + ... use_safetensors=True, + ... torch_dtype=torch.float16, + ... ).to("cuda") + >>> vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16).to("cuda") + >>> pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", + ... controlnet=controlnet, + ... vae=vae, + ... variant="fp16", + ... use_safetensors=True, + ... torch_dtype=torch.float16, + ... ).to("cuda") + >>> pipe.enable_model_cpu_offload() + + + >>> def get_depth_map(image): + ... image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda") + ... with torch.no_grad(), torch.autocast("cuda"): + ... depth_map = depth_estimator(image).predicted_depth + + ... depth_map = torch.nn.functional.interpolate( + ... depth_map.unsqueeze(1), + ... size=(1024, 1024), + ... mode="bicubic", + ... align_corners=False, + ... ) + ... depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True) + ... depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True) + ... depth_map = (depth_map - depth_min) / (depth_max - depth_min) + ... image = torch.cat([depth_map] * 3, dim=1) + ... image = image.permute(0, 2, 3, 1).cpu().numpy()[0] + ... 
image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8)) + ... return image + + + >>> prompt = "A robot, 4k photo" + >>> image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ).resize((1024, 1024)) + >>> controlnet_conditioning_scale = 0.5 # recommended for good generalization + >>> depth_image = get_depth_map(image) + + >>> images = pipe( + ... prompt, + ... image=image, + ... control_image=depth_image, + ... strength=0.99, + ... num_inference_steps=50, + ... controlnet_conditioning_scale=controlnet_conditioning_scale, + ... ).images + >>> images[0].save(f"robot_cat.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class StableDiffusionXLControlNetImg2ImgPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, +): + r""" + Pipeline for image-to-image generation using Stable Diffusion XL with ControlNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + controlnet ([`ControlNetModel`] or `List[ControlNetModel]`): + Provides additional conditioning to the unet during the denoising process. If you set multiple ControlNets + as a list, the outputs from each ControlNet are added together to create one combined additional + conditioning. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`): + Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the + config of `stabilityai/stable-diffusion-xl-refiner-1-0`. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to + watermark output images. If not defined, it will default to True if the package is installed, otherwise no + watermarker will be used. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "feature_extractor", + "image_encoder", + ] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel], + scheduler: KarrasDiffusionSchedulers, + requires_aesthetics_score: bool = False, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + + if isinstance(controlnet, (list, tuple)): + controlnet = MultiControlNetModel(controlnet) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + + # Copied from 
diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
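To make the dual text-encoder behaviour documented above easier to picture, here is a heavily simplified sketch of what the implementation below does: each tokenizer/text-encoder pair encodes the prompt, the penultimate-layer hidden states are concatenated along the feature dimension, and the pooled embedding is taken from the last text encoder only. The dimensions are illustrative placeholders, not values read from this diff:

```py
import torch

batch, seq_len = 1, 77
hidden_1 = torch.randn(batch, seq_len, 768)   # e.g. CLIP ViT-L penultimate hidden states
hidden_2 = torch.randn(batch, seq_len, 1280)  # e.g. OpenCLIP ViT-bigG penultimate hidden states
pooled_2 = torch.randn(batch, 1280)           # pooled output of the *last* encoder only

prompt_embeds = torch.concat([hidden_1, hidden_2], dim=-1)  # (1, 77, 2048), fed to the UNet
pooled_prompt_embeds = pooled_2                             # later used for the added text embeds
```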
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
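+                    # For example, clip_skip=1 selects hidden_states[-3], one layer earlier
+                    # than the default hidden_states[-2] used when clip_skip is None.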
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + image, + strength, + num_inference_steps, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if num_inference_steps is None: + raise ValueError("`num_inference_steps` cannot be None.") + elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" {type(num_inference_steps)}." 
+ ) + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # `prompt` needs more sophisticated handling when there are multiple + # conditionings. + if isinstance(self.controlnet, MultiControlNetModel): + if isinstance(prompt, list): + logger.warning( + f"You have {len(self.controlnet.nets)} ControlNets and you have passed {len(prompt)}" + " prompts. The conditionings will be fixed across the prompts." 
+ ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.controlnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + self.check_image(image, prompt, prompt_embeds) + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if not isinstance(image, list): + raise TypeError("For multiple controlnets: `image` must be type `list`") + + # When `image` is a nested list: + # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]]) + elif any(isinstance(i, list) for i in image): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif len(image) != len(self.controlnet.nets): + raise ValueError( + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." + ) + + for image_ in image: + self.check_image(image_, prompt, prompt_embeds) + else: + assert False + + # Check `controlnet_conditioning_scale` + if ( + isinstance(self.controlnet, ControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, ControlNetModel) + ): + if not isinstance(controlnet_conditioning_scale, float): + raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.") + elif ( + isinstance(self.controlnet, MultiControlNetModel) + or is_compiled + and isinstance(self.controlnet._orig_mod, MultiControlNetModel) + ): + if isinstance(controlnet_conditioning_scale, list): + if any(isinstance(i, list) for i in controlnet_conditioning_scale): + raise ValueError("A single batch of multiple conditionings are supported at the moment.") + elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len( + self.controlnet.nets + ): + raise ValueError( + "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have" + " the same length as the number of controlnets" + ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." 
+ ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.check_image + def check_image(self, image, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet_sd_xl.StableDiffusionXLControlNetPipeline.prepare_image + def prepare_control_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.prepare_latents + def prepare_latents( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + # Offload text encoder if `enable_model_cpu_offload` was enabled + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.text_encoder_2.to("cpu") + torch.cuda.empty_cache() + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + if add_noise: + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if ( + expected_add_embed_dim > passed_add_embed_dim + and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + ) + elif ( + expected_add_embed_dim < passed_add_embed_dim + and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." 
+ ) + elif expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + control_image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 0.8, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + 
aesthetic_score: float = 6.0, + negative_aesthetic_score: float = 2.5, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The initial image will be used as the starting point for the image generation process. Can also accept + image latents as `image`, if passing latents directly, it will not be encoded again. + control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If + the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If + height and/or width are passed, `image` is resized according to them. If multiple ControlNets are + specified in init, images must be passed as a list such that each element of the list can be correctly + batched for input to a single controlnet. + height (`int`, *optional*, defaults to the size of control_image): + The height in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to the size of control_image): + The width in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). 
Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
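As a quick illustration of how `strength` (described earlier in this docstring) interacts with `num_inference_steps`: the pipeline's `get_timesteps` helper above skips the earliest scheduled steps and only runs the remainder. The sketch below mirrors that arithmetic with illustrative values, ignoring the scheduler's `order` factor:

```py
num_inference_steps, strength = 50, 0.8
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 40
t_start = max(num_inference_steps - init_timestep, 0)                          # 10
print(f"denoising runs for {num_inference_steps - t_start} of {num_inference_steps} scheduled steps")
# -> denoising runs for 40 of 50 scheduled steps
```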
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original unet. If multiple ControlNets are specified in init, you can set the + corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + In this mode, the ControlNet encoder will try best to recognize the content of the input image even if + you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the controlnet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the controlnet stops applying. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. 
Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple` + containing the output images. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + control_image, + strength, + num_inference_steps, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3.1. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt, + prompt_2, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # 3.2 Encode ip_adapter_image + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare image and controlnet_conditioning_image + image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + + if isinstance(controlnet, ControlNetModel): + control_image = self.prepare_control_image( + image=control_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = control_image.shape[-2:] + elif isinstance(controlnet, MultiControlNetModel): + control_images = [] + + for control_image_ in control_image: + control_image_ = self.prepare_control_image( + image=control_image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + control_images.append(control_image_) + + control_image = control_images + height, width = control_image[0].shape[-2:] + else: + assert False + + # 5. 
Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + True, + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if isinstance(controlnet, ControlNetModel) else keeps) + + # 7.2 Prepare added time ids & embeddings + if isinstance(control_image, list): + original_size = original_size or control_image[0].shape[-2:] + else: + original_size = original_size or control_image.shape[-2:] + target_size = target_size or (height, width) + + if negative_original_size is None: + negative_original_size = original_size + if negative_target_size is None: + negative_target_size = target_size + add_text_embeds = pooled_prompt_embeds + + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) + add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # controlnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. 
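+                    # Editorial note, a hedged sketch rather than upstream code: in guess mode with
+                    # classifier-free guidance, the ControlNet sees only the conditional half of the
+                    # CFG-doubled batch, so the inputs below are re-derived from the second chunk of
+                    # each doubled tensor; zeros are concatenated back in further down so that the
+                    # unconditional half reaches the UNet unchanged. Roughly:
+                    #   cond_embeds = prompt_embeds.chunk(2)[1]                      # text-conditioned half
+                    #   cond_added  = {k: v.chunk(2)[1] for k, v in added_cond_kwargs.items()}
+                    # (names above are illustrative only, not part of this pipeline)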
+ control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + controlnet_added_cond_kwargs = { + "text_embeds": add_text_embeds.chunk(2)[1], + "time_ids": add_time_ids.chunk(2)[1], + } + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_added_cond_kwargs = added_cond_kwargs + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=control_image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + added_cond_kwargs=controlnet_added_cond_kwargs, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = 
self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + return StableDiffusionXLPipelineOutput(images=image) + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py new file mode 100644 index 000000000..5b6fc2b39 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py @@ -0,0 +1,532 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import warnings +from functools import partial +from typing import Dict, List, Optional, Union + +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict +from flax.jax_utils import unreplicate +from flax.training.common_utils import shard +from PIL import Image +from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel + +from ...models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel +from ...schedulers import ( + FlaxDDIMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, +) +from ...utils import PIL_INTERPOLATION, logging, replace_example_docstring +from ..pipeline_flax_utils import FlaxDiffusionPipeline +from ..stable_diffusion import FlaxStableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker_flax import FlaxStableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Set to True to use python for loop instead of jax.fori_loop for easier debugging +DEBUG = False + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import jax + >>> import numpy as np + >>> import jax.numpy as jnp + >>> from flax.jax_utils import replicate + >>> from flax.training.common_utils import shard + >>> from diffusers.utils import load_image, make_image_grid + >>> from PIL import Image + >>> from diffusers import FlaxStableDiffusionControlNetPipeline, FlaxControlNetModel + + + >>> def create_key(seed=0): + ... return jax.random.PRNGKey(seed) + + + >>> rng = create_key(0) + + >>> # get canny image + >>> canny_image = load_image( + ... "https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/blog_post_cell_10_output_0.jpeg" + ... ) + + >>> prompts = "best quality, extremely detailed" + >>> negative_prompts = "monochrome, lowres, bad anatomy, worst quality, low quality" + + >>> # load control net and stable diffusion v1-5 + >>> controlnet, controlnet_params = FlaxControlNetModel.from_pretrained( + ... "lllyasviel/sd-controlnet-canny", from_pt=True, dtype=jnp.float32 + ... ) + >>> pipe, params = FlaxStableDiffusionControlNetPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, revision="flax", dtype=jnp.float32 + ... ) + >>> params["controlnet"] = controlnet_params + + >>> num_samples = jax.device_count() + >>> rng = jax.random.split(rng, jax.device_count()) + + >>> prompt_ids = pipe.prepare_text_inputs([prompts] * num_samples) + >>> negative_prompt_ids = pipe.prepare_text_inputs([negative_prompts] * num_samples) + >>> processed_image = pipe.prepare_image_inputs([canny_image] * num_samples) + + >>> p_params = replicate(params) + >>> prompt_ids = shard(prompt_ids) + >>> negative_prompt_ids = shard(negative_prompt_ids) + >>> processed_image = shard(processed_image) + + >>> output = pipe( + ... prompt_ids=prompt_ids, + ... image=processed_image, + ... params=p_params, + ... prng_seed=rng, + ... num_inference_steps=50, + ... neg_prompt_ids=negative_prompt_ids, + ... jit=True, + ... ).images + + >>> output_images = pipe.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:]))) + >>> output_images = make_image_grid(output_images, num_samples // 4, 4) + >>> output_images.save("generated_image.png") + ``` +""" + + +class FlaxStableDiffusionControlNetPipeline(FlaxDiffusionPipeline): + r""" + Flax-based pipeline for text-to-image generation using Stable Diffusion with ControlNet Guidance. + + This model inherits from [`FlaxDiffusionPipeline`]. 
Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`FlaxAutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.FlaxCLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`FlaxUNet2DConditionModel`]): + A `FlaxUNet2DConditionModel` to denoise the encoded image latents. + controlnet ([`FlaxControlNetModel`]: + Provides additional conditioning to the `unet` during the denoising process. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or + [`FlaxDPMSolverMultistepScheduler`]. + safety_checker ([`FlaxStableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + def __init__( + self, + vae: FlaxAutoencoderKL, + text_encoder: FlaxCLIPTextModel, + tokenizer: CLIPTokenizer, + unet: FlaxUNet2DConditionModel, + controlnet: FlaxControlNetModel, + scheduler: Union[ + FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler + ], + safety_checker: FlaxStableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + dtype: jnp.dtype = jnp.float32, + ): + super().__init__() + self.dtype = dtype + + if safety_checker is None: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_text_inputs(self, prompt: Union[str, List[str]]): + if not isinstance(prompt, (str, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + + return text_input.input_ids + + def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]): + if not isinstance(image, (Image.Image, list)): + raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}") + + if isinstance(image, Image.Image): + image = [image] + + processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image]) + + return processed_images + + def _get_has_nsfw_concepts(self, features, params): + has_nsfw_concepts = self.safety_checker(features, params) + return has_nsfw_concepts + + def _run_safety_checker(self, images, safety_model_params, jit=False): + # safety_model_params should already be replicated when jit is True + pil_images = [Image.fromarray(image) for image in images] + features = self.feature_extractor(pil_images, return_tensors="np").pixel_values + + if jit: + features = shard(features) + has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params) + has_nsfw_concepts = unshard(has_nsfw_concepts) + safety_model_params = unreplicate(safety_model_params) + else: + has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params) + + images_was_copied = False + for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): + if has_nsfw_concept: + if not images_was_copied: + images_was_copied = True + images = images.copy() + + images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image + + if any(has_nsfw_concepts): + warnings.warn( + "Potential NSFW content was detected in one or more images. A black image will be returned" + " instead. Try again with a different prompt and/or seed." 
+ ) + + return images, has_nsfw_concepts + + def _generate( + self, + prompt_ids: jnp.ndarray, + image: jnp.ndarray, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int, + guidance_scale: float, + latents: Optional[jnp.ndarray] = None, + neg_prompt_ids: Optional[jnp.ndarray] = None, + controlnet_conditioning_scale: float = 1.0, + ): + height, width = image.shape[-2:] + if height % 64 != 0 or width % 64 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.") + + # get prompt text embeddings + prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0] + + # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0` + # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0` + batch_size = prompt_ids.shape[0] + + max_length = prompt_ids.shape[-1] + + if neg_prompt_ids is None: + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + ).input_ids + else: + uncond_input = neg_prompt_ids + negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0] + context = jnp.concatenate([negative_prompt_embeds, prompt_embeds]) + + image = jnp.concatenate([image] * 2) + + latents_shape = ( + batch_size, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if latents is None: + latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + + def loop_body(step, args): + latents, scheduler_state = args + # For classifier free guidance, we need to do two forward passes. 
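+            # (Editorial aside, not upstream code) the doubled batch is split again after the UNet call
+            # and recombined with the usual classifier-free guidance update, roughly:
+            #   eps = eps_uncond + guidance_scale * (eps_text - eps_uncond)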
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + latents_input = jnp.concatenate([latents] * 2) + + t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step] + timestep = jnp.broadcast_to(t, latents_input.shape[0]) + + latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t) + + down_block_res_samples, mid_block_res_sample = self.controlnet.apply( + {"params": params["controlnet"]}, + jnp.array(latents_input), + jnp.array(timestep, dtype=jnp.int32), + encoder_hidden_states=context, + controlnet_cond=image, + conditioning_scale=controlnet_conditioning_scale, + return_dict=False, + ) + + # predict the noise residual + noise_pred = self.unet.apply( + {"params": params["unet"]}, + jnp.array(latents_input), + jnp.array(timestep, dtype=jnp.int32), + encoder_hidden_states=context, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + ).sample + + # perform guidance + noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple() + return latents, scheduler_state + + scheduler_state = self.scheduler.set_timesteps( + params["scheduler"], num_inference_steps=num_inference_steps, shape=latents_shape + ) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * params["scheduler"].init_noise_sigma + + if DEBUG: + # run with python for loop + for i in range(num_inference_steps): + latents, scheduler_state = loop_body(i, (latents, scheduler_state)) + else: + latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state)) + + # scale and decode the image latents with vae + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample + + image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1) + return image + + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt_ids: jnp.ndarray, + image: jnp.ndarray, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int = 50, + guidance_scale: Union[float, jnp.ndarray] = 7.5, + latents: jnp.ndarray = None, + neg_prompt_ids: jnp.ndarray = None, + controlnet_conditioning_scale: Union[float, jnp.ndarray] = 1.0, + return_dict: bool = True, + jit: bool = False, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt_ids (`jnp.ndarray`): + The prompt or prompts to guide the image generation. + image (`jnp.ndarray`): + Array representing the ControlNet input condition to provide guidance to the `unet` for generation. + params (`Dict` or `FrozenDict`): + Dictionary containing the model parameters/weights. + prng_seed (`jax.Array`): + Array containing random number generator key. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. 
Guidance scale is enabled when `guidance_scale > 1`. + latents (`jnp.ndarray`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + array is generated by sampling using the supplied random `generator`. + controlnet_conditioning_scale (`float` or `jnp.ndarray`, *optional*, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of + a plain tuple. + jit (`bool`, defaults to `False`): + Whether to run `pmap` versions of the generation and safety scoring functions. + + + + This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a + future release. + + + + Examples: + + Returns: + [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated images + and the second element is a list of `bool`s indicating whether the corresponding generated image + contains "not-safe-for-work" (nsfw) content. + """ + + height, width = image.shape[-2:] + + if isinstance(guidance_scale, float): + # Convert to a tensor so each device gets a copy. Follow the prompt_ids for + # shape information, as they may be sharded (when `jit` is `True`), or not. + guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0]) + if len(prompt_ids.shape) > 2: + # Assume sharded + guidance_scale = guidance_scale[:, None] + + if isinstance(controlnet_conditioning_scale, float): + # Convert to a tensor so each device gets a copy. Follow the prompt_ids for + # shape information, as they may be sharded (when `jit` is `True`), or not. 
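+            # Editorial sketch (assumed shapes, not asserted by this pipeline): with jit=True the inputs
+            # are sharded, so prompt_ids carries a leading device axis, e.g. (num_devices, batch_per_device, 77);
+            # the scalar scales are then expanded to shape (num_devices, 1) so jax.pmap can map them over
+            # devices together with the other per-device inputs.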
+ controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0]) + if len(prompt_ids.shape) > 2: + # Assume sharded + controlnet_conditioning_scale = controlnet_conditioning_scale[:, None] + + if jit: + images = _p_generate( + self, + prompt_ids, + image, + params, + prng_seed, + num_inference_steps, + guidance_scale, + latents, + neg_prompt_ids, + controlnet_conditioning_scale, + ) + else: + images = self._generate( + prompt_ids, + image, + params, + prng_seed, + num_inference_steps, + guidance_scale, + latents, + neg_prompt_ids, + controlnet_conditioning_scale, + ) + + if self.safety_checker is not None: + safety_params = params["safety_checker"] + images_uint8_casted = (images * 255).round().astype("uint8") + num_devices, batch_size = images.shape[:2] + + images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3) + images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit) + images = np.array(images) + + # block images + if any(has_nsfw_concept): + for i, is_nsfw in enumerate(has_nsfw_concept): + if is_nsfw: + images[i] = np.asarray(images_uint8_casted[i]) + + images = images.reshape(num_devices, batch_size, height, width, 3) + else: + images = np.asarray(images) + has_nsfw_concept = False + + if not return_dict: + return (images, has_nsfw_concept) + + return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) + + +# Static argnums are pipe, num_inference_steps. A change would trigger recompilation. +# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`). +@partial( + jax.pmap, + in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0), + static_broadcasted_argnums=(0, 5), +) +def _p_generate( + pipe, + prompt_ids, + image, + params, + prng_seed, + num_inference_steps, + guidance_scale, + latents, + neg_prompt_ids, + controlnet_conditioning_scale, +): + return pipe._generate( + prompt_ids, + image, + params, + prng_seed, + num_inference_steps, + guidance_scale, + latents, + neg_prompt_ids, + controlnet_conditioning_scale, + ) + + +@partial(jax.pmap, static_broadcasted_argnums=(0,)) +def _p_get_has_nsfw_concepts(pipe, features, params): + return pipe._get_has_nsfw_concepts(features, params) + + +def unshard(x: jnp.ndarray): + # einops.rearrange(x, 'd b ... 
-> (d b) ...') + num_devices, batch_size = x.shape[:2] + rest = x.shape[2:] + return x.reshape(num_devices * batch_size, *rest) + + +def preprocess(image, dtype): + image = image.convert("RGB") + w, h = image.size + w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) + image = jnp.array(image).astype(dtype) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + return image diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py new file mode 100644 index 000000000..0d3e466df --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + +from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_dance_diffusion": ["DanceDiffusionPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_dance_diffusion import DanceDiffusionPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py new file mode 100644 index 000000000..bcd36c412 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py @@ -0,0 +1,156 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch + +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class DanceDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for audio generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + unet ([`UNet1DModel`]): + A `UNet1DModel` to denoise the encoded audio. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of + [`IPNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "unet" + + def __init__(self, unet, scheduler): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + num_inference_steps: int = 100, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + audio_length_in_s: Optional[float] = None, + return_dict: bool = True, + ) -> Union[AudioPipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of audio samples to generate. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher-quality audio sample at + the expense of slower inference. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + audio_length_in_s (`float`, *optional*, defaults to `self.unet.config.sample_size/self.unet.config.sample_rate`): + The length of the generated audio sample in seconds. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. + + Example: + + ```py + from diffusers import DiffusionPipeline + from scipy.io.wavfile import write + + model_id = "harmonai/maestro-150k" + pipe = DiffusionPipeline.from_pretrained(model_id) + pipe = pipe.to("cuda") + + audios = pipe(audio_length_in_s=4.0).audios + + # To save locally + for i, audio in enumerate(audios): + write(f"maestro_test_{i}.wav", pipe.unet.sample_rate, audio.transpose()) + + # To dislay in google colab + import IPython.display as ipd + + for audio in audios: + display(ipd.Audio(audio, rate=pipe.unet.sample_rate)) + ``` + + Returns: + [`~pipelines.AudioPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated audio. + """ + + if audio_length_in_s is None: + audio_length_in_s = self.unet.config.sample_size / self.unet.config.sample_rate + + sample_size = audio_length_in_s * self.unet.config.sample_rate + + down_scale_factor = 2 ** len(self.unet.up_blocks) + if sample_size < 3 * down_scale_factor: + raise ValueError( + f"{audio_length_in_s} is too small. Make sure it's bigger or equal to" + f" {3 * down_scale_factor / self.unet.config.sample_rate}." + ) + + original_sample_size = int(sample_size) + if sample_size % down_scale_factor != 0: + sample_size = ( + (audio_length_in_s * self.unet.config.sample_rate) // down_scale_factor + 1 + ) * down_scale_factor + logger.info( + f"{audio_length_in_s} is increased to {sample_size / self.unet.config.sample_rate} so that it can be handled" + f" by the model. It will be cut to {original_sample_size / self.unet.config.sample_rate} after the denoising" + " process." + ) + sample_size = int(sample_size) + + dtype = next(self.unet.parameters()).dtype + shape = (batch_size, self.unet.config.in_channels, sample_size) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + audio = randn_tensor(shape, generator=generator, device=self._execution_device, dtype=dtype) + + # set step values + self.scheduler.set_timesteps(num_inference_steps, device=audio.device) + self.scheduler.timesteps = self.scheduler.timesteps.to(dtype) + + for t in self.progress_bar(self.scheduler.timesteps): + # 1. predict noise model_output + model_output = self.unet(audio, t).sample + + # 2. compute previous audio sample: x_t -> t_t-1 + audio = self.scheduler.step(model_output, t, audio).prev_sample + + audio = audio.clamp(-1, 1).float().cpu().numpy() + + audio = audio[:, :, :original_sample_size] + + if not return_dict: + return (audio,) + + return AudioPipelineOutput(audios=audio) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/__init__.py new file mode 100644 index 000000000..d9eede47c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/__init__.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + +from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_ddim": ["DDIMPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_ddim import DDIMPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py new file mode 100644 index 000000000..a3b967ed3 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py @@ -0,0 +1,154 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Union + +import torch + +from ...schedulers import DDIMScheduler +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class DDIMPipeline(DiffusionPipeline): + r""" + Pipeline for image generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of + [`DDPMScheduler`], or [`DDIMScheduler`]. 
+ """ + + model_cpu_offload_seq = "unet" + + def __init__(self, unet, scheduler): + super().__init__() + + # make sure scheduler can always be converted to DDIM + scheduler = DDIMScheduler.from_config(scheduler.config) + + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + eta: float = 0.0, + num_inference_steps: int = 50, + use_clipped_model_output: Optional[bool] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of images to generate. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. A value of `0` corresponds to + DDIM and `1` corresponds to DDPM. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + use_clipped_model_output (`bool`, *optional*, defaults to `None`): + If `True` or `False`, see documentation for [`DDIMScheduler.step`]. If `None`, nothing is passed + downstream to the scheduler (use `None` for schedulers which don't support this argument). + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Example: + + ```py + >>> from diffusers import DDIMPipeline + >>> import PIL.Image + >>> import numpy as np + + >>> # load model and scheduler + >>> pipe = DDIMPipeline.from_pretrained("fusing/ddim-lsun-bedroom") + + >>> # run pipeline in inference (sample random noise and denoise) + >>> image = pipe(eta=0.0, num_inference_steps=50) + + >>> # process image to PIL + >>> image_processed = image.cpu().permute(0, 2, 3, 1) + >>> image_processed = (image_processed + 1.0) * 127.5 + >>> image_processed = image_processed.numpy().astype(np.uint8) + >>> image_pil = PIL.Image.fromarray(image_processed[0]) + + >>> # save image + >>> image_pil.save("test.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + + # Sample gaussian noise to begin loop + if isinstance(self.unet.config.sample_size, int): + image_shape = ( + batch_size, + self.unet.config.in_channels, + self.unet.config.sample_size, + self.unet.config.sample_size, + ) + else: + image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + image = randn_tensor(image_shape, generator=generator, device=self._execution_device, dtype=self.unet.dtype) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + for t in self.progress_bar(self.scheduler.timesteps): + # 1. predict noise model_output + model_output = self.unet(image, t).sample + + # 2. predict previous mean of image x_t-1 and add variance depending on eta + # eta corresponds to η in paper and should be between [0, 1] + # do x_t -> x_t-1 + image = self.scheduler.step( + model_output, t, image, eta=eta, use_clipped_model_output=use_clipped_model_output, generator=generator + ).prev_sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/__init__.py new file mode 100644 index 000000000..eb41dd1dc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/__init__.py @@ -0,0 +1,22 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + _LazyModule, +) + + +_import_structure = {"pipeline_ddpm": ["DDPMPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_ddpm import DDPMPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py new file mode 100644 index 000000000..093a3cdfe --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py @@ -0,0 +1,127 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch + +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class DDPMPipeline(DiffusionPipeline): + r""" + Pipeline for image generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image. Can be one of + [`DDPMScheduler`], or [`DDIMScheduler`]. 
+ """ + + model_cpu_offload_seq = "unet" + + def __init__(self, unet, scheduler): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + num_inference_steps: int = 1000, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of images to generate. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + num_inference_steps (`int`, *optional*, defaults to 1000): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Example: + + ```py + >>> from diffusers import DDPMPipeline + + >>> # load model and scheduler + >>> pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256") + + >>> # run pipeline in inference (sample random noise and denoise) + >>> image = pipe().images[0] + + >>> # save image + >>> image.save("ddpm_generated_image.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + # Sample gaussian noise to begin loop + if isinstance(self.unet.config.sample_size, int): + image_shape = ( + batch_size, + self.unet.config.in_channels, + self.unet.config.sample_size, + self.unet.config.sample_size, + ) + else: + image_shape = (batch_size, self.unet.config.in_channels, *self.unet.config.sample_size) + + if self.device.type == "mps": + # randn does not work reproducibly on mps + image = randn_tensor(image_shape, generator=generator) + image = image.to(self.device) + else: + image = randn_tensor(image_shape, generator=generator, device=self.device) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + for t in self.progress_bar(self.scheduler.timesteps): + # 1. predict noise model_output + model_output = self.unet(image, t).sample + + # 2. 
compute previous image: x_t -> x_t-1 + image = self.scheduler.step(model_output, t, image, generator=generator).prev_sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py new file mode 100644 index 000000000..79aab1fb1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py @@ -0,0 +1,85 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = { + "timesteps": [ + "fast27_timesteps", + "smart100_timesteps", + "smart185_timesteps", + "smart27_timesteps", + "smart50_timesteps", + "super100_timesteps", + "super27_timesteps", + "super40_timesteps", + ] +} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_if"] = ["IFPipeline"] + _import_structure["pipeline_if_img2img"] = ["IFImg2ImgPipeline"] + _import_structure["pipeline_if_img2img_superresolution"] = ["IFImg2ImgSuperResolutionPipeline"] + _import_structure["pipeline_if_inpainting"] = ["IFInpaintingPipeline"] + _import_structure["pipeline_if_inpainting_superresolution"] = ["IFInpaintingSuperResolutionPipeline"] + _import_structure["pipeline_if_superresolution"] = ["IFSuperResolutionPipeline"] + _import_structure["pipeline_output"] = ["IFPipelineOutput"] + _import_structure["safety_checker"] = ["IFSafetyChecker"] + _import_structure["watermark"] = ["IFWatermarker"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_if import IFPipeline + from .pipeline_if_img2img import IFImg2ImgPipeline + from .pipeline_if_img2img_superresolution import IFImg2ImgSuperResolutionPipeline + from .pipeline_if_inpainting import IFInpaintingPipeline + from .pipeline_if_inpainting_superresolution import IFInpaintingSuperResolutionPipeline + from .pipeline_if_superresolution import IFSuperResolutionPipeline + from .pipeline_output import IFPipelineOutput + from .safety_checker import IFSafetyChecker + from .timesteps import ( + fast27_timesteps, + smart27_timesteps, + smart50_timesteps, + smart100_timesteps, + smart185_timesteps, + super27_timesteps, + super40_timesteps, + super100_timesteps, + ) + from .watermark import IFWatermarker + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py new file mode 100644 index 000000000..7adf9e9c4 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py @@ -0,0 +1,788 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + is_accelerate_available, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt" + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> safety_modules = { + ... "feature_extractor": pipe.feature_extractor, + ... "safety_checker": pipe.safety_checker, + ... "watermarker": pipe.watermarker, + ... } + >>> super_res_2_pipe = DiffusionPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-x4-upscaler", **safety_modules, torch_dtype=torch.float16 + ... ) + >>> super_res_2_pipe.enable_model_cpu_offload() + + >>> image = super_res_2_pipe( + ... prompt=prompt, + ... image=image, + ... 
).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFPipeline(DiffusionPipeline, LoraLoaderMixin): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @torch.no_grad() + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free 
guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
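+ # note: the signature checks below only forward `eta` / `generator` when the scheduler's `step()` actually accepts them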
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + intermediate_images = intermediate_images * self.scheduler.init_noise_sigma + return intermediate_images + + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("<person>", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # &quot; + caption = re.sub(r"&quot;?", "", caption) + # &amp + caption = re.sub(r"&amp", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.."
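+ # (i.e. any standalone run of six or more digits is stripped by the substitution below)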
+ caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. 
of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. 
When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + height = height or self.unet.config.sample_size + width = width or self.unet.config.sample_size + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare intermediate images + intermediate_images = self.prepare_intermediate_images( + batch_size * num_images_per_prompt, + self.unet.config.in_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False + )[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + image = self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py new file mode 100644 index 000000000..ccc7b1d15 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py @@ -0,0 +1,910 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image.resize((768, 512)) + + >>> pipe = IFImg2ImgPipeline.from_pretrained( + ... "DeepFloyd/IF-I-XL-v1.0", + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A fantasy landscape in style minecraft" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", + ... text_encoder=None, + ... variant="fp16", + ... torch_dtype=torch.float16, + ... 
) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @torch.no_grad() + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("<person>", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions
+ # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # &quot; + caption = re.sub(r"&quot;?", "", caption) + # &amp + caption = re.sub(r"&amp", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a...
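+ # resolution-like tokens such as "1920x1080" (matching Latin "x", Cyrillic "х" and "×") are dropped next, then spacing around ":" and ",./" punctuation is normalized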
+ + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None + ): + _, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + image = self.scheduler.add_noise(image, noise, timestep) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.7, + num_inference_steps: int = 80, + timesteps: List[int] = None, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.7): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 80): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 10.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. 
Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, image, batch_size, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. Prepare intermediate images + image = self.preprocess_image(image) + image = image.to(device=device, dtype=dtype) + + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, generator + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False + )[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. 
Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py new file mode 100644 index 000000000..b4ce5831a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -0,0 +1,1029 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFImg2ImgPipeline, IFImg2ImgSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image.resize((768, 512)) + + >>> pipe = IFImg2ImgPipeline.from_pretrained( + ... "DeepFloyd/IF-I-XL-v1.0", + ... variant="fp16", + ... 
torch_dtype=torch.float16, + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A fantasy landscape in style minecraft" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFImg2ImgSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", + ... text_encoder=None, + ... variant="fp16", + ... torch_dtype=torch.float16, + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor"] + model_cpu_offload_seq = "text_encoder->unet" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warning( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
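The `in_channels != 6` warning above reflects how the stage-II UNet is fed: the noisy intermediate image and the upscaled conditioning image are concatenated channel-wise (3 + 3 channels), which is what the denoising loop later in this file does. A tiny dummy-tensor sketch of that concatenation:

```python
import torch

batch, height, width = 1, 256, 256
intermediate_images = torch.randn(batch, 3, height, width)  # noisy stage-II sample (pixel space)
upscaled = torch.randn(batch, 3, height, width)             # noised, bilinearly upscaled stage-I output

# Channel-wise concatenation is why the super-resolution UNet expects 6 input channels.
model_input = torch.cat([intermediate_images, upscaled], dim=1)
print(model_input.shape)  # torch.Size([1, 6, 256, 256])
```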
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + 
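`_clean_caption` above normalizes quotes and dashes and then strips URLs, handles, numeric ids, and filenames before tokenization. A rough standalone illustration of the kind of rewriting it performs, using a small subset of the same regexes on a toy caption (this is a simplified excerpt, not the full cleaning chain):

```python
import re
import urllib.parse as ul

caption = 'Check%20this out!!! https://example.com/pic.jpg @someuser #123   "AUSVERKAUFT"'

caption = ul.unquote_plus(caption).strip().lower()
# Drop URLs, @handles, and short "#123"-style ids (a subset of the pipeline's rules).
caption = re.sub(r"\bhttps?:\S+", "", caption)
caption = re.sub(r"@[\w\d]+\b", "", caption)
caption = re.sub(r"#\d{1,3}\b", "", caption)
caption = re.sub(r"\s+", " ", caption).strip()

print(caption)  # e.g. 'check this out!!! "ausverkauft"'
```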
+ # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
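`encode_prompt` pads/truncates to 77 tokens and runs the T5 encoder with the attention mask, as shown above. A standalone sketch of the same tokenize-then-encode pattern with `transformers` directly; the `t5-small` checkpoint is purely illustrative (the pipeline uses the text encoder shipped with the IF weights):

```python
import torch
from transformers import T5EncoderModel, T5Tokenizer

# Illustrative checkpoint only; not the encoder the pipeline actually loads.
name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(name)
text_encoder = T5EncoderModel.from_pretrained(name)

prompt = ["a photo of an astronaut riding a horse"]
inputs = tokenizer(
    prompt,
    padding="max_length",
    max_length=77,          # IF's text encoder was trained with at most 77 tokens
    truncation=True,
    add_special_tokens=True,
    return_tensors="pt",
)

with torch.no_grad():
    prompt_embeds = text_encoder(inputs.input_ids, attention_mask=inputs.attention_mask)[0]

print(prompt_embeds.shape)  # (1, 77, hidden_size)
```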
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
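`prepare_extra_step_kwargs` above inspects the scheduler's `step()` signature so that `eta` and `generator` are only forwarded to schedulers that accept them. The same pattern in isolation, with a stand-in step function rather than a real diffusers scheduler:

```python
import inspect

import torch


def filter_step_kwargs(step_fn, **candidate_kwargs):
    """Keep only the kwargs that step_fn actually accepts."""
    accepted = set(inspect.signature(step_fn).parameters)
    return {k: v for k, v in candidate_kwargs.items() if k in accepted}


# Stand-in for a scheduler step that takes a generator but no eta.
def fake_step(model_output, timestep, sample, generator=None, return_dict=True):
    return sample


extra = filter_step_kwargs(fake_step, eta=0.0, generator=torch.Generator().manual_seed(0))
print(extra.keys())  # dict_keys(['generator'])
```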
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # original_image + + if isinstance(original_image, list): + check_image_type = original_image[0] + else: + check_image_type = original_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(original_image, list): + image_batch_size = len(original_image) + elif isinstance(original_image, torch.Tensor): + image_batch_size = original_image.shape[0] + elif isinstance(original_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(original_image, np.ndarray): + image_batch_size = original_image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError( + f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image + def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor: + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image] + + image = np.stack(image, axis=0) # to np + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.prepare_intermediate_images + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, 
generator=None + ): + _, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + image = self.scheduler.add_noise(image, noise, timestep) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + original_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + original_image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image that `image` was varied from. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
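The `strength` parameter documented below works through `get_timesteps` shown above: only the last `int(num_inference_steps * strength)` timesteps are kept, so a higher strength adds more noise and runs more denoising steps. A worked example of that arithmetic with made-up numbers:

```python
# get_timesteps arithmetic from above, on toy numbers (no scheduler needed).
num_inference_steps = 50
strength = 0.8

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 40
t_start = max(num_inference_steps - init_timestep, 0)                          # 10

# The pipeline then slices scheduler.timesteps[t_start:], i.e. runs 40 of the 50 steps;
# strength=1.0 keeps all 50 steps and effectively ignores the input image.
print(init_timestep, t_start, num_inference_steps - t_start)  # 40 10 40
```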
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + noise_level (`int`, *optional*, defaults to 250): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + original_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + device = self._execution_device + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. prepare original image + original_image = self.preprocess_original_image(original_image) + original_image = original_image.to(device=device, dtype=dtype) + + # 6. Prepare intermediate images + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + original_image, + noise_timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + generator, + ) + + # 7. Prepare upscaled image and noise level + _, _, height, width = original_image.shape + + image = self.preprocess_image(image, num_images_per_prompt, device) + + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 9. 
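Step 7 above upscales the stage-I image bilinearly, perturbs it with `image_noising_scheduler` at the requested `noise_level`, and doubles the level tensor for the classifier-free-guidance batch. A rough sketch with dummy tensors; the default-configured `DDPMScheduler` here is only a stand-in for the pipeline's `image_noising_scheduler`, and the shapes are illustrative:

```python
import torch
import torch.nn.functional as F
from diffusers import DDPMScheduler

image_noising_scheduler = DDPMScheduler(num_train_timesteps=1000)

stage1_image = torch.randn(1, 3, 64, 64)   # pretend stage-I output in pixel space, [-1, 1]
height = width = 256                        # stage-II resolution

upscaled = F.interpolate(stage1_image, (height, width), mode="bilinear", align_corners=True)

noise_level = torch.tensor([250] * upscaled.shape[0])
noise = torch.randn_like(upscaled)
upscaled = image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level)

# With classifier-free guidance the conditioning is batched twice, so the level is too.
noise_level = torch.cat([noise_level] * 2)
print(upscaled.shape, noise_level.shape)  # torch.Size([1, 3, 256, 256]) torch.Size([2])
```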
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False + )[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 12. Convert to PIL + image = self.numpy_to_pil(image) + + # 13. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. 
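The post-processing above maps the model output from `[-1, 1]` back to `[0, 1]`, moves channels last, and finally converts to `uint8` PIL images via `numpy_to_pil`. A minimal equivalent on a random tensor (the last two lines approximate what `numpy_to_pil` does; they are not the pipeline's own helper):

```python
import numpy as np
import torch
from PIL import Image

image = torch.rand(1, 3, 256, 256) * 2 - 1           # pretend model output in [-1, 1]

image = (image / 2 + 0.5).clamp(0, 1)                 # back to [0, 1]
image = image.cpu().permute(0, 2, 3, 1).float().numpy()

# Scale to uint8 and wrap each batch element in a PIL image.
pil_images = [Image.fromarray((img * 255).round().astype(np.uint8)) for img in image]
print(pil_images[0].size)  # (256, 256)
```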
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py new file mode 100644 index 000000000..180e5309c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py @@ -0,0 +1,1030 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" + >>> response = requests.get(url) + >>> mask_image = Image.open(BytesIO(response.content)) + >>> mask_image = mask_image + + >>> pipe = IFInpaintingPipeline.from_pretrained( + ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "blue sunglasses" + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... output_type="pt", + ... 
).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... mask_image=mask_image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
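The `repeat(...).view(...)` idiom above duplicates each prompt embedding `num_images_per_prompt` times in an mps-friendly way. A shape-only sketch with illustrative sizes:

```python
import torch

batch_size, seq_len, hidden = 2, 77, 4096   # illustrative sizes
num_images_per_prompt = 3

prompt_embeds = torch.randn(batch_size, seq_len, hidden)

# Repeat along the sequence axis, then reshape so copies of the same prompt end up adjacent.
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

print(prompt_embeds.shape)  # torch.Size([6, 77, 4096])
```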
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # mask_image + + if isinstance(mask_image, list): + check_image_type = mask_image[0] + else: + check_image_type = mask_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(mask_image, list): + image_batch_size = len(mask_image) + elif isinstance(mask_image, torch.Tensor): + image_batch_size = mask_image.shape[0] + elif isinstance(mask_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(mask_image, np.ndarray): + image_batch_size = mask_image.shape[0] + else: + assert False + + if image_batch_size != 1 and batch_size != image_batch_size: + raise ValueError( + f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + + # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + 
caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + def preprocess_mask_image(self, mask_image) -> torch.Tensor: + if not isinstance(mask_image, list): + mask_image = [mask_image] + + if isinstance(mask_image[0], torch.Tensor): + mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0) + + if mask_image.ndim == 2: + # Batch and add channel dim for single mask + mask_image = mask_image.unsqueeze(0).unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] == 1: + # Single mask, the 0'th dimension is 
considered to be + # the existing batch size of 1 + mask_image = mask_image.unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] != 1: + # Batch of mask, the 0'th dimension is considered to be + # the batching dimension + mask_image = mask_image.unsqueeze(1) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + + elif isinstance(mask_image[0], PIL.Image.Image): + new_mask_image = [] + + for mask_image_ in mask_image: + mask_image_ = mask_image_.convert("L") + mask_image_ = resize(mask_image_, self.unet.sample_size) + mask_image_ = np.array(mask_image_) + mask_image_ = mask_image_[None, None, :] + new_mask_image.append(mask_image_) + + mask_image = new_mask_image + + mask_image = np.concatenate(mask_image, axis=0) + mask_image = mask_image.astype(np.float32) / 255.0 + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + elif isinstance(mask_image[0], np.ndarray): + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + return mask_image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None + ): + image_batch_size, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + noised_image = self.scheduler.add_noise(image, noise, timestep) + + image = (1 - mask_image) * image + mask_image * noised_image + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + mask_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 1.0): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + mask_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. Prepare intermediate images + image = self.preprocess_image(image) + image = image.to(device=device, dtype=dtype) + + mask_image = self.preprocess_mask_image(mask_image) + mask_image = mask_image.to(device=device, dtype=dtype) + + if mask_image.shape[0] == 1: + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0) + else: + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + image, noise_timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = ( + torch.cat([intermediate_images] * 2) if do_classifier_free_guidance else intermediate_images + ) + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(model_input.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + prev_intermediate_images = intermediate_images + + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False + )[0] + + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + + # 11. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 8. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 9. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py new file mode 100644 index 000000000..b67907c1c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -0,0 +1,1137 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + PIL_INTERPOLATION, + is_accelerate_available, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.resize +def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: + w, h = images.size + + coef = w / h + + w, h = img_size, img_size + + if coef >= 1: + w = int(round(img_size / 8 * coef) * 8) + else: + h = int(round(img_size / 8 / coef) * 8) + + images = images.resize((w, h), resample=PIL_INTERPOLATION["bicubic"], reducing_gap=None) + + return images + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFInpaintingPipeline, IFInpaintingSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from io import BytesIO + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/person.png" + >>> response = requests.get(url) + >>> original_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> original_image = original_image + + >>> url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/if/glasses_mask.png" + >>> response = requests.get(url) + >>> mask_image = Image.open(BytesIO(response.content)) + >>> mask_image = mask_image + + >>> pipe = IFInpaintingPipeline.from_pretrained( + ... "DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "blue sunglasses" + + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + >>> image = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... 
output_type="pt", + ... ).images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFInpaintingSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, + ... mask_image=mask_image, + ... original_image=original_image, + ... prompt_embeds=prompt_embeds, + ... negative_prompt_embeds=negative_embeds, + ... ).images + >>> image[0].save("./if_stage_II.png") + ``` + """ + + +class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + model_cpu_offload_seq = "text_encoder->unet" + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warning( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
+ ) + + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + image_noising_scheduler=image_noising_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + watermarker=watermarker, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.safety_checker]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing + def _text_preprocessing(self, text, clean_caption=False): + if clean_caption and not is_bs4_available(): + logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if clean_caption and not is_ftfy_available(): + logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`")) + logger.warning("Setting `clean_caption` to False...") + clean_caption = False + + if not isinstance(text, (tuple, list)): + text = [text] + + def process(text: str): + if clean_caption: + text = self._clean_caption(text) + text = self._clean_caption(text) + else: + text = text.lower().strip() + return text + + return [process(t) for t in text] + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption + def _clean_caption(self, caption): + caption = str(caption) + caption = ul.unquote_plus(caption) + caption = caption.strip().lower() + caption = re.sub("<person>", "person", caption) + # urls: + caption = re.sub( + r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + caption = re.sub( + r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa + "", + caption, + ) # regex for urls + # html: + caption = BeautifulSoup(caption, features="html.parser").text + + # @ + caption = re.sub(r"@[\w\d]+\b", "", caption) + + # 31C0—31EF CJK Strokes + # 31F0—31FF Katakana Phonetic Extensions + # 3200—32FF Enclosed CJK Letters and Months + # 3300—33FF CJK Compatibility + # 3400—4DBF CJK Unified Ideographs Extension A + # 4DC0—4DFF Yijing Hexagram Symbols + # 4E00—9FFF CJK Unified Ideographs + caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) + caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) + caption = re.sub(r"[\u3200-\u32ff]+", "", caption) + caption = re.sub(r"[\u3300-\u33ff]+", "", caption) + caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) + caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) + caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) + ####################################################### + + # все виды тире / all types of dash --> "-" + caption = re.sub( + r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa + "-", + caption, + ) + + # кавычки к одному стандарту + caption = re.sub(r"[`´«»“”¨]", '"', caption) + caption = re.sub(r"[‘’]", "'", caption) + 
+ # " + caption = re.sub(r""?", "", caption) + # & + caption = re.sub(r"&", "", caption) + + # ip adresses: + caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) + + # article ids: + caption = re.sub(r"\d:\d\d\s+$", "", caption) + + # \n + caption = re.sub(r"\\n", " ", caption) + + # "#123" + caption = re.sub(r"#\d{1,3}\b", "", caption) + # "#12345.." + caption = re.sub(r"#\d{5,}\b", "", caption) + # "123456.." + caption = re.sub(r"\b\d{6,}\b", "", caption) + # filenames: + caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption) + + # + caption = re.sub(r"[\"\']{2,}", r'"', caption) # """AUSVERKAUFT""" + caption = re.sub(r"[\.]{2,}", r" ", caption) # """AUSVERKAUFT""" + + caption = re.sub(self.bad_punct_regex, r" ", caption) # ***AUSVERKAUFT***, #AUSVERKAUFT + caption = re.sub(r"\s+\.\s+", r" ", caption) # " . " + + # this-is-my-cute-cat / this_is_my_cute_cat + regex2 = re.compile(r"(?:\-|\_)") + if len(re.findall(regex2, caption)) > 3: + caption = re.sub(regex2, " ", caption) + + caption = ftfy.fix_text(caption) + caption = html.unescape(html.unescape(caption)) + + caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption) # jc6640 + caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption) # jc6640vc + caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption) # 6640vc231 + + caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption) + caption = re.sub(r"(free\s)?download(\sfree)?", "", caption) + caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption) + caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption) + caption = re.sub(r"\bpage\s+\d+\b", "", caption) + + caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption) # j2d1a2a... + + caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption) + + caption = re.sub(r"\b\s+\:\s+", r": ", caption) + caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption) + caption = re.sub(r"\s+", " ", caption) + + caption.strip() + + caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption) + caption = re.sub(r"^[\'\_,\-\:;]", r"", caption) + caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption) + caption = re.sub(r"^\.\S+$", "", caption) + + return caption.strip() + + @torch.no_grad() + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # image + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # original_image + + if isinstance(original_image, list): + check_image_type = original_image[0] + else: + check_image_type = original_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`original_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(original_image, list): + image_batch_size = len(original_image) + elif isinstance(original_image, torch.Tensor): + image_batch_size = original_image.shape[0] + elif isinstance(original_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(original_image, np.ndarray): + image_batch_size = original_image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError( + f"original_image batch size: {image_batch_size} must be same as prompt batch size {batch_size}" + ) + + # mask_image + + if isinstance(mask_image, list): + check_image_type = mask_image[0] + else: + check_image_type = mask_image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`mask_image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] 
but is" + f" {type(check_image_type)}" + ) + + if isinstance(mask_image, list): + image_batch_size = len(mask_image) + elif isinstance(mask_image, torch.Tensor): + image_batch_size = mask_image.shape[0] + elif isinstance(mask_image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(mask_image, np.ndarray): + image_batch_size = mask_image.shape[0] + else: + assert False + + if image_batch_size != 1 and batch_size != image_batch_size: + raise ValueError( + f"mask_image batch size: {image_batch_size} must be `1` or the same as prompt batch size {batch_size}" + ) + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.preprocess_image with preprocess_image -> preprocess_original_image + def preprocess_original_image(self, image: PIL.Image.Image) -> torch.Tensor: + if not isinstance(image, list): + image = [image] + + def numpy_to_pt(images): + if images.ndim == 3: + images = images[..., None] + + images = torch.from_numpy(images.transpose(0, 3, 1, 2)) + return images + + if isinstance(image[0], PIL.Image.Image): + new_image = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = resize(image_, self.unet.sample_size) + image_ = np.array(image_) + image_ = image_.astype(np.float32) + image_ = image_ / 127.5 - 1 + new_image.append(image_) + + image = new_image + + image = np.stack(image, axis=0) # to np + image = numpy_to_pt(image) # to pt + + elif isinstance(image[0], np.ndarray): + image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) + image = numpy_to_pt(image) + + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_superresolution.IFSuperResolutionPipeline.preprocess_image + def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device) -> torch.Tensor: + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image] + + image = np.stack(image, axis=0) # to np + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.preprocess_mask_image + def preprocess_mask_image(self, mask_image) -> torch.Tensor: + if not isinstance(mask_image, list): + mask_image = [mask_image] + + if isinstance(mask_image[0], torch.Tensor): + mask_image = torch.cat(mask_image, axis=0) if mask_image[0].ndim == 4 else torch.stack(mask_image, axis=0) + + if mask_image.ndim == 2: + # Batch and add channel dim for single mask + mask_image = mask_image.unsqueeze(0).unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] == 1: + # Single mask, the 0'th dimension is considered to be + # the existing batch size of 1 + 
mask_image = mask_image.unsqueeze(0) + elif mask_image.ndim == 3 and mask_image.shape[0] != 1: + # Batch of mask, the 0'th dimension is considered to be + # the batching dimension + mask_image = mask_image.unsqueeze(1) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + + elif isinstance(mask_image[0], PIL.Image.Image): + new_mask_image = [] + + for mask_image_ in mask_image: + mask_image_ = mask_image_.convert("L") + mask_image_ = resize(mask_image_, self.unet.sample_size) + mask_image_ = np.array(mask_image_) + mask_image_ = mask_image_[None, None, :] + new_mask_image.append(mask_image_) + + mask_image = new_mask_image + + mask_image = np.concatenate(mask_image, axis=0) + mask_image = mask_image.astype(np.float32) / 255.0 + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + elif isinstance(mask_image[0], np.ndarray): + mask_image = np.concatenate([m[None, None, :] for m in mask_image], axis=0) + + mask_image[mask_image < 0.5] = 0 + mask_image[mask_image >= 0.5] = 1 + mask_image = torch.from_numpy(mask_image) + + return mask_image + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_img2img.IFImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if_inpainting.IFInpaintingPipeline.prepare_intermediate_images + def prepare_intermediate_images( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, mask_image, generator=None + ): + image_batch_size, channels, height, width = image.shape + + batch_size = batch_size * num_images_per_prompt + + shape = (batch_size, channels, height, width) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + noised_image = self.scheduler.add_noise(image, noise, timestep) + + image = (1 - mask_image) * image + mask_image * noised_image + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor], + original_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + mask_image: Union[ + PIL.Image.Image, torch.Tensor, np.ndarray, List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray] + ] = None, + strength: float = 0.8, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 100, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + original_image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image that `image` was varied from. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. 
of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + noise_level (`int`, *optional*, defaults to 0): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. 
Check inputs. Raise error if not correct + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + original_image, + mask_image, + batch_size, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + device = self._execution_device + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + dtype = prompt_embeds.dtype + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) + + # 5. prepare original image + original_image = self.preprocess_original_image(original_image) + original_image = original_image.to(device=device, dtype=dtype) + + # 6. prepare mask image + mask_image = self.preprocess_mask_image(mask_image) + mask_image = mask_image.to(device=device, dtype=dtype) + + if mask_image.shape[0] == 1: + mask_image = mask_image.repeat_interleave(batch_size * num_images_per_prompt, dim=0) + else: + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + + # 6. Prepare intermediate images + noise_timestep = timesteps[0:1] + noise_timestep = noise_timestep.repeat(batch_size * num_images_per_prompt) + + intermediate_images = self.prepare_intermediate_images( + original_image, + noise_timestep, + batch_size, + num_images_per_prompt, + dtype, + device, + mask_image, + generator, + ) + + # 7. Prepare upscaled image and noise level + _, _, height, width = original_image.shape + + image = self.preprocess_image(image, num_images_per_prompt, device) + + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + prev_intermediate_images = intermediate_images + + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False + )[0] + + intermediate_images = (1 - mask_image) * prev_intermediate_images + mask_image * intermediate_images + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 12. Convert to PIL + image = self.numpy_to_pil(image) + + # 13. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + else: + # 10. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 11. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + self.maybe_free_model_hooks() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py new file mode 100644 index 000000000..a293343eb --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -0,0 +1,885 @@ +import html +import inspect +import re +import urllib.parse as ul +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import UNet2DConditionModel +from ...schedulers import DDPMScheduler +from ...utils import ( + BACKENDS_MAPPING, + is_accelerate_available, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import IFPipelineOutput +from .safety_checker import IFSafetyChecker +from .watermark import IFWatermarker + + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import IFPipeline, IFSuperResolutionPipeline, DiffusionPipeline + >>> from diffusers.utils import pt_to_pil + >>> import torch + + >>> pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = 'a photo of a kangaroo wearing an orange hoodie and blue sunglasses standing in front of the eiffel tower holding a sign that says "very deep learning"' + >>> prompt_embeds, negative_embeds = pipe.encode_prompt(prompt) + + >>> image = pipe(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images + + >>> # save intermediate image + >>> pil_image = pt_to_pil(image) + >>> pil_image[0].save("./if_stage_I.png") + + >>> super_res_1_pipe = IFSuperResolutionPipeline.from_pretrained( + ... "DeepFloyd/IF-II-L-v1.0", text_encoder=None, variant="fp16", torch_dtype=torch.float16 + ... ) + >>> super_res_1_pipe.enable_model_cpu_offload() + + >>> image = super_res_1_pipe( + ... image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds + ... 
).images + >>> image[0].save("./if_stage_II.png") + ``` +""" + + +class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): + tokenizer: T5Tokenizer + text_encoder: T5EncoderModel + + unet: UNet2DConditionModel + scheduler: DDPMScheduler + image_noising_scheduler: DDPMScheduler + + feature_extractor: Optional[CLIPImageProcessor] + safety_checker: Optional[IFSafetyChecker] + + watermarker: Optional[IFWatermarker] + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder", "safety_checker", "feature_extractor", "watermarker"] + model_cpu_offload_seq = "text_encoder->unet" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + image_noising_scheduler: DDPMScheduler, + safety_checker: Optional[IFSafetyChecker], + feature_extractor: Optional[CLIPImageProcessor], + watermarker: Optional[IFWatermarker], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the IF license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if unet.config.in_channels != 6: + logger.warning( + "It seems like you have loaded a checkpoint that shall not be used for super resolution from {unet.config._name_or_path} as it accepts {unet.config.in_channels} input channels instead of 6. Please make sure to pass a super resolution checkpoint as the `'unet'`: IFSuperResolutionPipeline.from_pretrained(unet=super_resolution_unet, ...)`." 
+            )
+
+        self.register_modules(
+            tokenizer=tokenizer,
+            text_encoder=text_encoder,
+            unet=unet,
+            scheduler=scheduler,
+            image_noising_scheduler=image_noising_scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+            watermarker=watermarker,
+        )
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.remove_all_hooks
+    def remove_all_hooks(self):
+        if is_accelerate_available():
+            from accelerate.hooks import remove_hook_from_module
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        for model in [self.text_encoder, self.unet, self.safety_checker]:
+            if model is not None:
+                remove_hook_from_module(model, recurse=True)
+
+        self.unet_offload_hook = None
+        self.text_encoder_offload_hook = None
+        self.final_offload_hook = None
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
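+        # Illustrative (approximate) end-to-end effect of this cleaning pass:
+        #     "A  photo   — of a cat!!!"  ->  "a photo - of a cat!!!"
+        # i.e. lowercasing above, dash/quote normalization here, whitespace collapsing further below.
+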
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip addresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    @torch.no_grad()
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: Union[str, List[str]],
+        do_classifier_free_guidance: bool = True,
+        num_images_per_prompt: int = 1,
+        device: Optional[torch.device] = None,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        clean_caption: bool = False,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                prompt to be encoded
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                whether to use classifier free guidance or not
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                number of images that should be generated per prompt
+            device: (`torch.device`, *optional*):
+                torch device to place the resulting embeddings on
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if
+                `guidance_scale` is less than `1`).
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clean_caption (bool, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # while T5 can handle much longer input sequences than 77, the text encoder was trained with a max length of 77 for IF + max_length = 77 + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + attention_mask = text_inputs.attention_mask.to(device) + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.unet is not None: + dtype = self.unet.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." 
+ ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: + raise ValueError( + f"`noise_level`: {noise_level} must be a valid timestep in `self.noising_scheduler`, [0, {self.image_noising_scheduler.config.num_train_timesteps})" + ) + + if isinstance(image, list): + check_image_type = image[0] + else: + check_image_type = image + + if ( + not isinstance(check_image_type, torch.Tensor) + and not isinstance(check_image_type, PIL.Image.Image) + and not isinstance(check_image_type, np.ndarray) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, or List[...] but is" + f" {type(check_image_type)}" + ) + + if isinstance(image, list): + image_batch_size = len(image) + elif isinstance(image, torch.Tensor): + image_batch_size = image.shape[0] + elif isinstance(image, PIL.Image.Image): + image_batch_size = 1 + elif isinstance(image, np.ndarray): + image_batch_size = image.shape[0] + else: + assert False + + if batch_size != image_batch_size: + raise ValueError(f"image batch size: {image_batch_size} must be same as prompt batch size {batch_size}") + + # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline.prepare_intermediate_images + def prepare_intermediate_images(self, batch_size, num_channels, height, width, dtype, device, generator): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
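+                # Note (illustrative): `batch_size` here is already the effective batch
+                # (prompts * num_images_per_prompt), so e.g. 2 prompts with num_images_per_prompt=2
+                # require either a single shared generator or a list of exactly 4 generators.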
+ ) + + intermediate_images = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + intermediate_images = intermediate_images * self.scheduler.init_noise_sigma + return intermediate_images + + def preprocess_image(self, image, num_images_per_prompt, device): + if not isinstance(image, torch.Tensor) and not isinstance(image, list): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image] + + image = np.stack(image, axis=0) # to np + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image[0], np.ndarray): + image = np.stack(image, axis=0) # to np + if image.ndim == 5: + image = image[0] + + image = torch.from_numpy(image.transpose(0, 3, 1, 2)) + elif isinstance(image, list) and isinstance(image[0], torch.Tensor): + dims = image[0].ndim + + if dims == 3: + image = torch.stack(image, dim=0) + elif dims == 4: + image = torch.concat(image, dim=0) + else: + raise ValueError(f"Image must have 3 or 4 dimensions, instead got {dims}") + + image = image.to(device=device, dtype=self.unet.dtype) + + image = image.repeat_interleave(num_images_per_prompt, dim=0) + + return image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: int = None, + width: int = None, + image: Union[PIL.Image.Image, np.ndarray, torch.FloatTensor] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 250, + clean_caption: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to None): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to None): + The width in pixels of the generated image. + image (`PIL.Image.Image`, `np.ndarray`, `torch.FloatTensor`): + The image to be upscaled. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*, defaults to None): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + noise_level (`int`, *optional*, defaults to 250): + The amount of noise to add to the upscaled image. Must be in the range `[0, 1000)` + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.IFPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.IFPipelineOutput`] if `return_dict` is True, otherwise a `tuple. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) + or watermarked content, according to the `safety_checker`. + """ + # 1. Check inputs. 
Raise error if not correct + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + self.check_inputs( + prompt, + image, + batch_size, + noise_level, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + + height = height or self.unet.config.sample_size + width = width or self.unet.config.sample_size + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clean_caption=clean_caption, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare intermediate images + num_channels = self.unet.config.in_channels // 2 + intermediate_images = self.prepare_intermediate_images( + batch_size * num_images_per_prompt, + num_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare upscaled image and noise level + image = self.preprocess_image(image, num_images_per_prompt, device) + upscaled = F.interpolate(image, (height, width), mode="bilinear", align_corners=True) + + noise_level = torch.tensor([noise_level] * upscaled.shape[0], device=upscaled.device) + noise = randn_tensor(upscaled.shape, generator=generator, device=upscaled.device, dtype=upscaled.dtype) + upscaled = self.image_noising_scheduler.add_noise(upscaled, noise, timesteps=noise_level) + + if do_classifier_free_guidance: + noise_level = torch.cat([noise_level] * 2) + + # HACK: see comment in `enable_model_cpu_offload` + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + model_input = torch.cat([intermediate_images, upscaled], dim=1) + + model_input = torch.cat([model_input] * 2) if do_classifier_free_guidance else model_input + model_input = self.scheduler.scale_model_input(model_input, t) + + # predict the noise residual + noise_pred = self.unet( + model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(model_input.shape[1] // 2, dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(model_input.shape[1] // 2, dim=1) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + intermediate_images = self.scheduler.step( + noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False + )[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, intermediate_images) + + image = intermediate_images + + if output_type == "pil": + # 9. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 10. Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 11. Convert to PIL + image = self.numpy_to_pil(image) + + # 12. Apply watermark + if self.watermarker is not None: + self.watermarker.apply_watermark(image, self.unet.config.sample_size) + elif output_type == "pt": + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + else: + # 9. Post-processing + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + # 10. 
Run safety checker + image, nsfw_detected, watermark_detected = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, nsfw_detected, watermark_detected) + + return IFPipelineOutput(images=image, nsfw_detected=nsfw_detected, watermark_detected=watermark_detected) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py new file mode 100644 index 000000000..7f39ab5ba --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput + + +@dataclass +class IFPipelineOutput(BaseOutput): + """ + Args: + Output class for Stable Diffusion pipelines. + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content or a watermark. `None` if safety checking could not be performed. + watermark_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely has a watermark. `None` if safety + checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_detected: Optional[List[bool]] + watermark_detected: Optional[List[bool]] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py new file mode 100644 index 000000000..8ffeed580 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py @@ -0,0 +1,59 @@ +import numpy as np +import torch +import torch.nn as nn +from transformers import CLIPConfig, CLIPVisionModelWithProjection, PreTrainedModel + +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class IFSafetyChecker(PreTrainedModel): + config_class = CLIPConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + self.vision_model = CLIPVisionModelWithProjection(config.vision_config) + + self.p_head = nn.Linear(config.vision_config.projection_dim, 1) + self.w_head = nn.Linear(config.vision_config.projection_dim, 1) + + @torch.no_grad() + def forward(self, clip_input, images, p_threshold=0.5, w_threshold=0.5): + image_embeds = self.vision_model(clip_input)[0] + + nsfw_detected = self.p_head(image_embeds) + nsfw_detected = nsfw_detected.flatten() + nsfw_detected = nsfw_detected > p_threshold + nsfw_detected = nsfw_detected.tolist() + + if any(nsfw_detected): + logger.warning( + "Potential NSFW content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." 
+ ) + + for idx, nsfw_detected_ in enumerate(nsfw_detected): + if nsfw_detected_: + images[idx] = np.zeros(images[idx].shape) + + watermark_detected = self.w_head(image_embeds) + watermark_detected = watermark_detected.flatten() + watermark_detected = watermark_detected > w_threshold + watermark_detected = watermark_detected.tolist() + + if any(watermark_detected): + logger.warning( + "Potential watermarked content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." + ) + + for idx, watermark_detected_ in enumerate(watermark_detected): + if watermark_detected_: + images[idx] = np.zeros(images[idx].shape) + + return images, nsfw_detected, watermark_detected diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py new file mode 100644 index 000000000..d44285c01 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py @@ -0,0 +1,579 @@ +fast27_timesteps = [ + 999, + 800, + 799, + 600, + 599, + 500, + 400, + 399, + 377, + 355, + 333, + 311, + 288, + 266, + 244, + 222, + 200, + 199, + 177, + 155, + 133, + 111, + 88, + 66, + 44, + 22, + 0, +] + +smart27_timesteps = [ + 999, + 976, + 952, + 928, + 905, + 882, + 858, + 857, + 810, + 762, + 715, + 714, + 572, + 429, + 428, + 286, + 285, + 238, + 190, + 143, + 142, + 118, + 95, + 71, + 47, + 24, + 0, +] + +smart50_timesteps = [ + 999, + 988, + 977, + 966, + 955, + 944, + 933, + 922, + 911, + 900, + 899, + 879, + 859, + 840, + 820, + 800, + 799, + 766, + 733, + 700, + 699, + 650, + 600, + 599, + 500, + 499, + 400, + 399, + 350, + 300, + 299, + 266, + 233, + 200, + 199, + 179, + 159, + 140, + 120, + 100, + 99, + 88, + 77, + 66, + 55, + 44, + 33, + 22, + 11, + 0, +] + +smart100_timesteps = [ + 999, + 995, + 992, + 989, + 985, + 981, + 978, + 975, + 971, + 967, + 964, + 961, + 957, + 956, + 951, + 947, + 942, + 937, + 933, + 928, + 923, + 919, + 914, + 913, + 908, + 903, + 897, + 892, + 887, + 881, + 876, + 871, + 870, + 864, + 858, + 852, + 846, + 840, + 834, + 828, + 827, + 820, + 813, + 806, + 799, + 792, + 785, + 784, + 777, + 770, + 763, + 756, + 749, + 742, + 741, + 733, + 724, + 716, + 707, + 699, + 698, + 688, + 677, + 666, + 656, + 655, + 645, + 634, + 623, + 613, + 612, + 598, + 584, + 570, + 569, + 555, + 541, + 527, + 526, + 505, + 484, + 483, + 462, + 440, + 439, + 396, + 395, + 352, + 351, + 308, + 307, + 264, + 263, + 220, + 219, + 176, + 132, + 88, + 44, + 0, +] + +smart185_timesteps = [ + 999, + 997, + 995, + 992, + 990, + 988, + 986, + 984, + 981, + 979, + 977, + 975, + 972, + 970, + 968, + 966, + 964, + 961, + 959, + 957, + 956, + 954, + 951, + 949, + 946, + 944, + 941, + 939, + 936, + 934, + 931, + 929, + 926, + 924, + 921, + 919, + 916, + 914, + 913, + 910, + 907, + 905, + 902, + 899, + 896, + 893, + 891, + 888, + 885, + 882, + 879, + 877, + 874, + 871, + 870, + 867, + 864, + 861, + 858, + 855, + 852, + 849, + 846, + 843, + 840, + 837, + 834, + 831, + 828, + 827, + 824, + 821, + 817, + 814, + 811, + 808, + 804, + 801, + 798, + 795, + 791, + 788, + 785, + 784, + 780, + 777, + 774, + 770, + 766, + 763, + 760, + 756, + 752, + 749, + 746, + 742, + 741, + 737, + 733, + 730, + 726, + 722, + 718, + 714, + 710, + 707, + 703, + 699, + 698, + 694, + 690, + 685, + 681, + 677, + 673, + 669, + 664, + 660, + 656, 
+ 655, + 650, + 646, + 641, + 636, + 632, + 627, + 622, + 618, + 613, + 612, + 607, + 602, + 596, + 591, + 586, + 580, + 575, + 570, + 569, + 563, + 557, + 551, + 545, + 539, + 533, + 527, + 526, + 519, + 512, + 505, + 498, + 491, + 484, + 483, + 474, + 466, + 457, + 449, + 440, + 439, + 428, + 418, + 407, + 396, + 395, + 381, + 366, + 352, + 351, + 330, + 308, + 307, + 286, + 264, + 263, + 242, + 220, + 219, + 176, + 175, + 132, + 131, + 88, + 44, + 0, +] + +super27_timesteps = [ + 999, + 991, + 982, + 974, + 966, + 958, + 950, + 941, + 933, + 925, + 916, + 908, + 900, + 899, + 874, + 850, + 825, + 800, + 799, + 700, + 600, + 500, + 400, + 300, + 200, + 100, + 0, +] + +super40_timesteps = [ + 999, + 992, + 985, + 978, + 971, + 964, + 957, + 949, + 942, + 935, + 928, + 921, + 914, + 907, + 900, + 899, + 879, + 859, + 840, + 820, + 800, + 799, + 766, + 733, + 700, + 699, + 650, + 600, + 599, + 500, + 499, + 400, + 399, + 300, + 299, + 200, + 199, + 100, + 99, + 0, +] + +super100_timesteps = [ + 999, + 996, + 992, + 989, + 985, + 982, + 979, + 975, + 972, + 968, + 965, + 961, + 958, + 955, + 951, + 948, + 944, + 941, + 938, + 934, + 931, + 927, + 924, + 920, + 917, + 914, + 910, + 907, + 903, + 900, + 899, + 891, + 884, + 876, + 869, + 861, + 853, + 846, + 838, + 830, + 823, + 815, + 808, + 800, + 799, + 788, + 777, + 766, + 755, + 744, + 733, + 722, + 711, + 700, + 699, + 688, + 677, + 666, + 655, + 644, + 633, + 622, + 611, + 600, + 599, + 585, + 571, + 557, + 542, + 528, + 514, + 500, + 499, + 485, + 471, + 457, + 442, + 428, + 414, + 400, + 399, + 379, + 359, + 340, + 320, + 300, + 299, + 279, + 259, + 240, + 220, + 200, + 199, + 166, + 133, + 100, + 99, + 66, + 33, + 0, +] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py new file mode 100644 index 000000000..ca10413de --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py @@ -0,0 +1,46 @@ +from typing import List + +import PIL.Image +import torch +from PIL import Image + +from ...configuration_utils import ConfigMixin +from ...models.modeling_utils import ModelMixin +from ...utils import PIL_INTERPOLATION + + +class IFWatermarker(ModelMixin, ConfigMixin): + def __init__(self): + super().__init__() + + self.register_buffer("watermark_image", torch.zeros((62, 62, 4))) + self.watermark_image_as_pil = None + + def apply_watermark(self, images: List[PIL.Image.Image], sample_size=None): + # copied from https://github.com/deep-floyd/IF/blob/b77482e36ca2031cb94dbca1001fc1e6400bf4ab/deepfloyd_if/modules/base.py#L287 + + h = images[0].height + w = images[0].width + + sample_size = sample_size or h + + coef = min(h / sample_size, w / sample_size) + img_h, img_w = (int(h / coef), int(w / coef)) if coef < 1 else (h, w) + + S1, S2 = 1024**2, img_w * img_h + K = (S2 / S1) ** 0.5 + wm_size, wm_x, wm_y = int(K * 62), img_w - int(14 * K), img_h - int(14 * K) + + if self.watermark_image_as_pil is None: + watermark_image = self.watermark_image.to(torch.uint8).cpu().numpy() + watermark_image = Image.fromarray(watermark_image, mode="RGBA") + self.watermark_image_as_pil = watermark_image + + wm_img = self.watermark_image_as_pil.resize( + (wm_size, wm_size), PIL_INTERPOLATION["bicubic"], reducing_gap=None + ) + + for pil_img in images: + pil_img.paste(wm_img, box=(wm_x - wm_size, wm_y - 
wm_size, wm_x, wm_y), mask=wm_img.split()[-1]) + + return images diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/README.md new file mode 100644 index 000000000..1e21dbbbd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/README.md @@ -0,0 +1,3 @@ +# Deprecated Pipelines + +This folder contains pipelines that have very low usage as measured by model downloads, issues and PRs. While you can still use the pipelines just as before, we will stop testing the pipelines and will not accept any changes to existing files. \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/__init__.py new file mode 100644 index 000000000..993632317 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/__init__.py @@ -0,0 +1,153 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_librosa_available, + is_note_seq_available, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_pt_objects + + _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) +else: + _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"] + _import_structure["pndm"] = ["PNDMPipeline"] + _import_structure["repaint"] = ["RePaintPipeline"] + _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"] + _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"] + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["alt_diffusion"] = [ + "AltDiffusionImg2ImgPipeline", + "AltDiffusionPipeline", + "AltDiffusionPipelineOutput", + ] + _import_structure["versatile_diffusion"] = [ + "VersatileDiffusionDualGuidedPipeline", + "VersatileDiffusionImageVariationPipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionTextToImagePipeline", + ] + _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"] + _import_structure["stable_diffusion_variants"] = [ + "CycleDiffusionPipeline", + "StableDiffusionInpaintPipelineLegacy", + "StableDiffusionPix2PixZeroPipeline", + "StableDiffusionParadigmsPipeline", + "StableDiffusionModelEditingPipeline", + ] + +try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_librosa_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects)) + +else: + _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"] + +try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise 
OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_transformers_and_torch_and_note_seq_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) + +else: + _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_pt_objects import * + + else: + from .latent_diffusion_uncond import LDMPipeline + from .pndm import PNDMPipeline + from .repaint import RePaintPipeline + from .score_sde_ve import ScoreSdeVePipeline + from .stochastic_karras_ve import KarrasVePipeline + + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline, AltDiffusionPipelineOutput + from .audio_diffusion import AudioDiffusionPipeline, Mel + from .spectrogram_diffusion import SpectrogramDiffusionPipeline + from .stable_diffusion_variants import ( + CycleDiffusionPipeline, + StableDiffusionInpaintPipelineLegacy, + StableDiffusionModelEditingPipeline, + StableDiffusionParadigmsPipeline, + StableDiffusionPix2PixZeroPipeline, + ) + from .stochastic_karras_ve import KarrasVePipeline + from .versatile_diffusion import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) + from .vq_diffusion import VQDiffusionPipeline + + try: + if not (is_torch_available() and is_librosa_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_librosa_objects import * + else: + from .audio_diffusion import AudioDiffusionPipeline, Mel + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_transformers_and_torch_and_note_seq_objects import * # noqa F403 + else: + from .spectrogram_diffusion import ( + MidiProcessor, + SpectrogramDiffusionPipeline, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py new file mode 100644 index 000000000..71fa15b3f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py @@ -0,0 +1,53 @@ +from typing import TYPE_CHECKING + +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except 
OptionalDependencyNotAvailable: + from ....utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modeling_roberta_series"] = ["RobertaSeriesModelWithTransformation"] + _import_structure["pipeline_alt_diffusion"] = ["AltDiffusionPipeline"] + _import_structure["pipeline_alt_diffusion_img2img"] = ["AltDiffusionImg2ImgPipeline"] + + _import_structure["pipeline_output"] = ["AltDiffusionPipelineOutput"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import * + + else: + from .modeling_roberta_series import RobertaSeriesModelWithTransformation + from .pipeline_alt_diffusion import AltDiffusionPipeline + from .pipeline_alt_diffusion_img2img import AltDiffusionImg2ImgPipeline + from .pipeline_output import AltDiffusionPipelineOutput + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py new file mode 100644 index 000000000..f73ef15d7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py @@ -0,0 +1,124 @@ +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import nn +from transformers import RobertaPreTrainedModel, XLMRobertaConfig, XLMRobertaModel +from transformers.utils import ModelOutput + + +@dataclass +class TransformationModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
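+
+    Example (illustrative sketch; "BAAI/AltDiffusion" below is only a placeholder for a checkpoint that
+    ships this text encoder class together with an XLM-RoBERTa tokenizer):
+
+    ```py
+    >>> from transformers import XLMRobertaTokenizer
+
+    >>> tokenizer = XLMRobertaTokenizer.from_pretrained("BAAI/AltDiffusion", subfolder="tokenizer")
+    >>> text_encoder = RobertaSeriesModelWithTransformation.from_pretrained(
+    ...     "BAAI/AltDiffusion", subfolder="text_encoder"
+    ... )
+    >>> inputs = tokenizer("a photo of an astronaut", padding="max_length", max_length=77, return_tensors="pt")
+    >>> outputs = text_encoder(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)
+    >>> outputs.projection_state.shape  # (batch_size, sequence_length, config.project_dim)
+    ```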
+ """ + + projection_state: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class RobertaSeriesConfig(XLMRobertaConfig): + def __init__( + self, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + project_dim=512, + pooler_fn="cls", + learn_encoder=False, + use_attention_mask=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.project_dim = project_dim + self.pooler_fn = pooler_fn + self.learn_encoder = learn_encoder + self.use_attention_mask = use_attention_mask + + +class RobertaSeriesModelWithTransformation(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler", r"logit_scale"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + base_model_prefix = "roberta" + config_class = RobertaSeriesConfig + + def __init__(self, config): + super().__init__(config) + self.roberta = XLMRobertaModel(config) + self.transformation = nn.Linear(config.hidden_size, config.project_dim) + self.has_pre_transformation = getattr(config, "has_pre_transformation", False) + if self.has_pre_transformation: + self.transformation_pre = nn.Linear(config.hidden_size, config.project_dim) + self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ): + r""" """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=True if self.has_pre_transformation else output_hidden_states, + return_dict=return_dict, + ) + + if self.has_pre_transformation: + sequence_output2 = outputs["hidden_states"][-2] + sequence_output2 = self.pre_LN(sequence_output2) + projection_state2 = self.transformation_pre(sequence_output2) + + return TransformationModelOutput( + projection_state=projection_state2, + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + else: + projection_state = self.transformation(outputs.last_hidden_state) + return TransformationModelOutput( + projection_state=projection_state, + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py new file mode 
100644 index 000000000..e4583699e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py @@ -0,0 +1,946 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer + +from ....configuration_utils import FrozenDict +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from .modeling_roberta_series import RobertaSeriesModelWithTransformation +from .pipeline_output import AltDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import AltDiffusionPipeline + + >>> pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> # "dark elf princess, highly detailed, d & d, fantasy, highly detailed, digital painting, trending on artstation, concept art, sharp focus, illustration, art by artgerm and greg rutkowski and fuji choko and viktoria gavrilenko and hoang lap" + >>> prompt = "黑暗精灵公主,非常详细,幻想,非常详细,数字绘画,概念艺术,敏锐的焦点,插图" + >>> image = pipe(prompt).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class AltDiffusionPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Alt Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
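The two module-level helpers above cover the guidance-rescale trick (Sec. 3.4 of the cited paper) and timestep retrieval. A minimal, self-contained sketch of how the guidance path fits together; shapes and guidance values are illustrative only and are not taken from the patch:

```py
import torch

# illustrative classifier-free guidance followed by the Sec. 3.4 rescale,
# mirroring rescale_noise_cfg above
noise_pred_uncond = torch.randn(1, 4, 64, 64)
noise_pred_text = torch.randn(1, 4, 64, 64)
guidance_scale, guidance_rescale = 7.5, 0.7

# standard CFG combination
noise_cfg = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# rescale: match the per-sample std of the text branch, then blend back
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
noise_rescaled = noise_cfg * (std_text / std_cfg)
noise_cfg = guidance_rescale * noise_rescaled + (1 - guidance_rescale) * noise_cfg
```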
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.RobertaSeriesModelWithTransformation`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.XLMRobertaTokenizer`]): + A `XLMRobertaTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: RobertaSeriesModelWithTransformation, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. 
Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
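+                # e.g. clip_skip=1 selects hidden_states[-2], the output of the penultimate encoder layer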
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + 
torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
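The `get_guidance_scale_embedding` method above builds a sinusoidal embedding of the guidance weight w, used when the UNet has `time_cond_proj_dim` set (guidance-distilled models). A standalone sketch of the same computation; the embedding size of 256 is an assumption for illustration:

```py
import torch

# sinusoidal embedding of (guidance_scale - 1), as in get_guidance_scale_embedding
w = torch.tensor([7.5 - 1.0])
embedding_dim = 256  # assumed value; the pipeline uses unet.config.time_cond_proj_dim

w = w * 1000.0
half_dim = embedding_dim // 2
emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
emb = w[:, None] * emb[None, :]
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
assert emb.shape == (1, embedding_dim)
```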
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # 0. 
Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py new file mode 100644 index 000000000..156e52c24 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -0,0 +1,1018 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
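The `__call__` docstring above describes the `callback_on_step_end` hook that runs inside this denoising loop. A hedged usage sketch for the pipeline added in this file; the model id comes from the example docstring, a CUDA device and downloaded weights are assumed:

```py
import torch
from diffusers import AltDiffusionPipeline

pipe = AltDiffusionPipeline.from_pretrained("BAAI/AltDiffusion-m9", torch_dtype=torch.float16)
pipe = pipe.to("cuda")

def on_step_end(pipeline, step, timestep, callback_kwargs):
    # tensors listed in callback_on_step_end_tensor_inputs arrive here;
    # whatever is returned is written back into the loop
    latents = callback_kwargs["latents"]
    print(step, int(timestep), tuple(latents.shape))
    return callback_kwargs

image = pipe(
    "a fantasy landscape, trending on artstation",
    callback_on_step_end=on_step_end,
    callback_on_step_end_tensor_inputs=["latents"],
).images[0]
```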
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMRobertaTokenizer + +from ....configuration_utils import FrozenDict +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import ( + PIL_INTERPOLATION, + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from .modeling_roberta_series import RobertaSeriesModelWithTransformation +from .pipeline_output import AltDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from diffusers import AltDiffusionImg2ImgPipeline + + >>> device = "cuda" + >>> model_id_or_path = "BAAI/AltDiffusion-m9" + >>> pipe = AltDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) + >>> pipe = pipe.to(device) + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + + >>> response = requests.get(url) + >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> init_image = init_image.resize((768, 512)) + + >>> # "A fantasy landscape, trending on artstation" + >>> prompt = "幻想风景, artstation" + + >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images + >>> images[0].save("幻想风景.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + deprecation_message = "The preprocess method 
is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class AltDiffusionImg2ImgPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-guided image-to-image generation using Alt Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
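The deprecated `preprocess` helper above (and, roughly, its replacement `VaeImageProcessor.preprocess`) snaps the input image to a multiple of 8 and rescales pixels to [-1, 1] before VAE encoding. A minimal sketch of that normalization with an arbitrary input size:

```py
import numpy as np
import PIL.Image

img = PIL.Image.new("RGB", (515, 389))      # arbitrary size, illustrative only
w, h = (x - x % 8 for x in img.size)        # snap to a multiple of 8
arr = np.array(img.resize((w, h)), dtype=np.float32)[None] / 255.0
arr = arr.transpose(0, 3, 1, 2)             # NHWC -> NCHW
arr = 2.0 * arr - 1.0                       # scale to [-1, 1]
```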
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.RobertaSeriesModelWithTransformation`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.XLMRobertaTokenizer`]): + A `XLMRobertaTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: RobertaSeriesModelWithTransformation, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. 
Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Alt Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + 
torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. 
Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + timesteps: List[int] = None, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. 
For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] instead of a + plain tuple. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + Examples: + + Returns: + [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.AltDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + + # 5. set timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 7.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return AltDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py new file mode 100644 index 000000000..dd174ae3c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ....utils import ( + BaseOutput, +) + + +@dataclass +# Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt +class AltDiffusionPipelineOutput(BaseOutput): + """ + Output class for Alt Diffusion pipelines. 
+ + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or + `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py new file mode 100644 index 000000000..312795186 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py @@ -0,0 +1,23 @@ +from typing import TYPE_CHECKING + +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = { + "mel": ["Mel"], + "pipeline_audio_diffusion": ["AudioDiffusionPipeline"], +} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .mel import Mel + from .pipeline_audio_diffusion import AudioDiffusionPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py new file mode 100644 index 000000000..3426c3ad0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py @@ -0,0 +1,179 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np # noqa: E402 + +from ....configuration_utils import ConfigMixin, register_to_config +from ....schedulers.scheduling_utils import SchedulerMixin + + +try: + import librosa # noqa: E402 + + _librosa_can_be_imported = True + _import_error = "" +except Exception as e: + _librosa_can_be_imported = False + _import_error = ( + f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it." + ) + + +from PIL import Image # noqa: E402 + + +class Mel(ConfigMixin, SchedulerMixin): + """ + Parameters: + x_res (`int`): + x resolution of spectrogram (time). + y_res (`int`): + y resolution of spectrogram (frequency bins). + sample_rate (`int`): + Sample rate of audio. + n_fft (`int`): + Number of Fast Fourier Transforms. + hop_length (`int`): + Hop length (a higher number is recommended if `y_res` < 256). + top_db (`int`): + Loudest decibel value. + n_iter (`int`): + Number of iterations for Griffin-Lim Mel inversion. 
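+
+    Examples:
+
+    A minimal usage sketch (assumptions: `librosa` is installed, the import path follows the package layout added
+    in this diff, and random noise stands in for a real waveform):
+
+    ```py
+    import numpy as np
+
+    from diffusers.pipelines.deprecated.audio_diffusion import Mel
+
+    mel = Mel(x_res=256, y_res=256, sample_rate=22050, n_fft=2048, hop_length=512)
+    # Any mono waveform can be passed via `raw_audio` or `audio_file`; noise keeps the sketch self-contained.
+    mel.load_audio(raw_audio=np.random.uniform(-1.0, 1.0, mel.x_res * mel.hop_length))
+    spectrogram = mel.audio_slice_to_image(0)  # grayscale PIL image of size x_res x y_res
+    recovered = mel.image_to_audio(spectrogram)  # approximate inversion via Griffin-Lim
+    ```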
+ """ + + config_name = "mel_config.json" + + @register_to_config + def __init__( + self, + x_res: int = 256, + y_res: int = 256, + sample_rate: int = 22050, + n_fft: int = 2048, + hop_length: int = 512, + top_db: int = 80, + n_iter: int = 32, + ): + self.hop_length = hop_length + self.sr = sample_rate + self.n_fft = n_fft + self.top_db = top_db + self.n_iter = n_iter + self.set_resolution(x_res, y_res) + self.audio = None + + if not _librosa_can_be_imported: + raise ValueError(_import_error) + + def set_resolution(self, x_res: int, y_res: int): + """Set resolution. + + Args: + x_res (`int`): + x resolution of spectrogram (time). + y_res (`int`): + y resolution of spectrogram (frequency bins). + """ + self.x_res = x_res + self.y_res = y_res + self.n_mels = self.y_res + self.slice_size = self.x_res * self.hop_length - 1 + + def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None): + """Load audio. + + Args: + audio_file (`str`): + An audio file that must be on disk due to [Librosa](https://librosa.org/) limitation. + raw_audio (`np.ndarray`): + The raw audio file as a NumPy array. + """ + if audio_file is not None: + self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr) + else: + self.audio = raw_audio + + # Pad with silence if necessary. + if len(self.audio) < self.x_res * self.hop_length: + self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))]) + + def get_number_of_slices(self) -> int: + """Get number of slices in audio. + + Returns: + `int`: + Number of spectograms audio can be sliced into. + """ + return len(self.audio) // self.slice_size + + def get_audio_slice(self, slice: int = 0) -> np.ndarray: + """Get slice of audio. + + Args: + slice (`int`): + Slice number of audio (out of `get_number_of_slices()`). + + Returns: + `np.ndarray`: + The audio slice as a NumPy array. + """ + return self.audio[self.slice_size * slice : self.slice_size * (slice + 1)] + + def get_sample_rate(self) -> int: + """Get sample rate. + + Returns: + `int`: + Sample rate of audio. + """ + return self.sr + + def audio_slice_to_image(self, slice: int) -> Image.Image: + """Convert slice of audio to spectrogram. + + Args: + slice (`int`): + Slice number of audio to convert (out of `get_number_of_slices()`). + + Returns: + `PIL Image`: + A grayscale image of `x_res x y_res`. + """ + S = librosa.feature.melspectrogram( + y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels + ) + log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db) + bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8) + image = Image.fromarray(bytedata) + return image + + def image_to_audio(self, image: Image.Image) -> np.ndarray: + """Converts spectrogram to audio. + + Args: + image (`PIL Image`): + An grayscale image of `x_res x y_res`. + + Returns: + audio (`np.ndarray`): + The audio as a NumPy array. 
+ """ + bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width)) + log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db + S = librosa.db_to_power(log_S) + audio = librosa.feature.inverse.mel_to_audio( + S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter + ) + return audio diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py new file mode 100644 index 000000000..47044e050 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py @@ -0,0 +1,329 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from math import acos, sin +from typing import List, Tuple, Union + +import numpy as np +import torch +from PIL import Image + +from ....models import AutoencoderKL, UNet2DConditionModel +from ....schedulers import DDIMScheduler, DDPMScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput +from .mel import Mel + + +class AudioDiffusionPipeline(DiffusionPipeline): + """ + Pipeline for audio diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + vqae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + mel ([`Mel`]): + Transform audio into a spectrogram. + scheduler ([`DDIMScheduler`] or [`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`] or [`DDPMScheduler`]. + """ + + _optional_components = ["vqvae"] + + def __init__( + self, + vqvae: AutoencoderKL, + unet: UNet2DConditionModel, + mel: Mel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + ): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler, mel=mel, vqvae=vqvae) + + def get_default_steps(self) -> int: + """Returns default number of steps recommended for inference. + + Returns: + `int`: + The number of steps. 
+ """ + return 50 if isinstance(self.scheduler, DDIMScheduler) else 1000 + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + audio_file: str = None, + raw_audio: np.ndarray = None, + slice: int = 0, + start_step: int = 0, + steps: int = None, + generator: torch.Generator = None, + mask_start_secs: float = 0, + mask_end_secs: float = 0, + step_generator: torch.Generator = None, + eta: float = 0, + noise: torch.Tensor = None, + encoding: torch.Tensor = None, + return_dict=True, + ) -> Union[ + Union[AudioPipelineOutput, ImagePipelineOutput], + Tuple[List[Image.Image], Tuple[int, List[np.ndarray]]], + ]: + """ + The call function to the pipeline for generation. + + Args: + batch_size (`int`): + Number of samples to generate. + audio_file (`str`): + An audio file that must be on disk due to [Librosa](https://librosa.org/) limitation. + raw_audio (`np.ndarray`): + The raw audio file as a NumPy array. + slice (`int`): + Slice number of audio to convert. + start_step (int): + Step to start diffusion from. + steps (`int`): + Number of denoising steps (defaults to `50` for DDIM and `1000` for DDPM). + generator (`torch.Generator`): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + mask_start_secs (`float`): + Number of seconds of audio to mask (not generate) at start. + mask_end_secs (`float`): + Number of seconds of audio to mask (not generate) at end. + step_generator (`torch.Generator`): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) used to denoise. + None + eta (`float`): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + noise (`torch.Tensor`): + A noise tensor of shape `(batch_size, 1, height, width)` or `None`. + encoding (`torch.Tensor`): + A tensor for [`UNet2DConditionModel`] of shape `(batch_size, seq_length, cross_attention_dim)`. + return_dict (`bool`): + Whether or not to return a [`AudioPipelineOutput`], [`ImagePipelineOutput`] or a plain tuple. + + Examples: + + For audio diffusion: + + ```py + import torch + from IPython.display import Audio + from diffusers import DiffusionPipeline + + device = "cuda" if torch.cuda.is_available() else "cpu" + pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(device) + + output = pipe() + display(output.images[0]) + display(Audio(output.audios[0], rate=mel.get_sample_rate())) + ``` + + For latent audio diffusion: + + ```py + import torch + from IPython.display import Audio + from diffusers import DiffusionPipeline + + device = "cuda" if torch.cuda.is_available() else "cpu" + pipe = DiffusionPipeline.from_pretrained("teticio/latent-audio-diffusion-256").to(device) + + output = pipe() + display(output.images[0]) + display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) + ``` + + For other tasks like variation, inpainting, outpainting, etc: + + ```py + output = pipe( + raw_audio=output.audios[0, 0], + start_step=int(pipe.get_default_steps() / 2), + mask_start_secs=1, + mask_end_secs=1, + ) + display(output.images[0]) + display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) + ``` + + Returns: + `List[PIL Image]`: + A list of Mel spectrograms (`float`, `List[np.ndarray]`) with the sample rate and raw audio. 
+ """ + + steps = steps or self.get_default_steps() + self.scheduler.set_timesteps(steps) + step_generator = step_generator or generator + # For backwards compatibility + if isinstance(self.unet.config.sample_size, int): + self.unet.config.sample_size = (self.unet.config.sample_size, self.unet.config.sample_size) + if noise is None: + noise = randn_tensor( + ( + batch_size, + self.unet.config.in_channels, + self.unet.config.sample_size[0], + self.unet.config.sample_size[1], + ), + generator=generator, + device=self.device, + ) + images = noise + mask = None + + if audio_file is not None or raw_audio is not None: + self.mel.load_audio(audio_file, raw_audio) + input_image = self.mel.audio_slice_to_image(slice) + input_image = np.frombuffer(input_image.tobytes(), dtype="uint8").reshape( + (input_image.height, input_image.width) + ) + input_image = (input_image / 255) * 2 - 1 + input_images = torch.tensor(input_image[np.newaxis, :, :], dtype=torch.float).to(self.device) + + if self.vqvae is not None: + input_images = self.vqvae.encode(torch.unsqueeze(input_images, 0)).latent_dist.sample( + generator=generator + )[0] + input_images = self.vqvae.config.scaling_factor * input_images + + if start_step > 0: + images[0, 0] = self.scheduler.add_noise(input_images, noise, self.scheduler.timesteps[start_step - 1]) + + pixels_per_second = ( + self.unet.config.sample_size[1] * self.mel.get_sample_rate() / self.mel.x_res / self.mel.hop_length + ) + mask_start = int(mask_start_secs * pixels_per_second) + mask_end = int(mask_end_secs * pixels_per_second) + mask = self.scheduler.add_noise(input_images, noise, torch.tensor(self.scheduler.timesteps[start_step:])) + + for step, t in enumerate(self.progress_bar(self.scheduler.timesteps[start_step:])): + if isinstance(self.unet, UNet2DConditionModel): + model_output = self.unet(images, t, encoding)["sample"] + else: + model_output = self.unet(images, t)["sample"] + + if isinstance(self.scheduler, DDIMScheduler): + images = self.scheduler.step( + model_output=model_output, + timestep=t, + sample=images, + eta=eta, + generator=step_generator, + )["prev_sample"] + else: + images = self.scheduler.step( + model_output=model_output, + timestep=t, + sample=images, + generator=step_generator, + )["prev_sample"] + + if mask is not None: + if mask_start > 0: + images[:, :, :, :mask_start] = mask[:, step, :, :mask_start] + if mask_end > 0: + images[:, :, :, -mask_end:] = mask[:, step, :, -mask_end:] + + if self.vqvae is not None: + # 0.18215 was scaling factor used in training to ensure unit variance + images = 1 / self.vqvae.config.scaling_factor * images + images = self.vqvae.decode(images)["sample"] + + images = (images / 2 + 0.5).clamp(0, 1) + images = images.cpu().permute(0, 2, 3, 1).numpy() + images = (images * 255).round().astype("uint8") + images = list( + (Image.fromarray(_[:, :, 0]) for _ in images) + if images.shape[3] == 1 + else (Image.fromarray(_, mode="RGB").convert("L") for _ in images) + ) + + audios = [self.mel.image_to_audio(_) for _ in images] + if not return_dict: + return images, (self.mel.get_sample_rate(), audios) + + return BaseOutput(**AudioPipelineOutput(np.array(audios)[:, np.newaxis, :]), **ImagePipelineOutput(images)) + + @torch.no_grad() + def encode(self, images: List[Image.Image], steps: int = 50) -> np.ndarray: + """ + Reverse the denoising step process to recover a noisy image from the generated image. + + Args: + images (`List[PIL Image]`): + List of images to encode. 
+ steps (`int`): + Number of encoding steps to perform (defaults to `50`). + + Returns: + `np.ndarray`: + A noise tensor of shape `(batch_size, 1, height, width)`. + """ + + # Only works with DDIM as this method is deterministic + assert isinstance(self.scheduler, DDIMScheduler) + self.scheduler.set_timesteps(steps) + sample = np.array( + [np.frombuffer(image.tobytes(), dtype="uint8").reshape((1, image.height, image.width)) for image in images] + ) + sample = (sample / 255) * 2 - 1 + sample = torch.Tensor(sample).to(self.device) + + for t in self.progress_bar(torch.flip(self.scheduler.timesteps, (0,))): + prev_timestep = t - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps + alpha_prod_t = self.scheduler.alphas_cumprod[t] + alpha_prod_t_prev = ( + self.scheduler.alphas_cumprod[prev_timestep] + if prev_timestep >= 0 + else self.scheduler.final_alpha_cumprod + ) + beta_prod_t = 1 - alpha_prod_t + model_output = self.unet(sample, t)["sample"] + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * model_output + sample = (sample - pred_sample_direction) * alpha_prod_t_prev ** (-0.5) + sample = sample * alpha_prod_t ** (0.5) + beta_prod_t ** (0.5) * model_output + + return sample + + @staticmethod + def slerp(x0: torch.Tensor, x1: torch.Tensor, alpha: float) -> torch.Tensor: + """Spherical Linear intERPolation. + + Args: + x0 (`torch.Tensor`): + The first tensor to interpolate between. + x1 (`torch.Tensor`): + Second tensor to interpolate between. + alpha (`float`): + Interpolation between 0 and 1 + + Returns: + `torch.Tensor`: + The interpolated tensor. + """ + + theta = acos(torch.dot(torch.flatten(x0), torch.flatten(x1)) / torch.norm(x0) / torch.norm(x1)) + return sin((1 - alpha) * theta) * x0 / sin(theta) + sin(alpha * theta) * x1 / sin(theta) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py new file mode 100644 index 000000000..214f5bbca --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_latent_diffusion_uncond import LDMPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py new file mode 100644 index 000000000..7fe5d59f7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -0,0 +1,130 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import List, Optional, Tuple, Union + +import torch + +from ....models import UNet2DModel, VQModel +from ....schedulers import DDIMScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class LDMPipeline(DiffusionPipeline): + r""" + Pipeline for unconditional image generation using latent diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) model to encode and decode images to and from latent representations. + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + [`DDIMScheduler`] is used in combination with `unet` to denoise the encoded image latents. + """ + + def __init__(self, vqvae: VQModel, unet: UNet2DModel, scheduler: DDIMScheduler): + super().__init__() + self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + eta: float = 0.0, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, *optional*, defaults to 1): + Number of images to generate. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
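+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper and only
+                applies to the [`~schedulers.DDIMScheduler`]; it is ignored by other schedulers.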
+ + Example: + + ```py + >>> from diffusers import LDMPipeline + + >>> # load model and scheduler + >>> pipe = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256") + + >>> # run pipeline in inference (sample random noise and denoise) + >>> image = pipe().images[0] + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + + latents = randn_tensor( + (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), + generator=generator, + ) + latents = latents.to(self.device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + self.scheduler.set_timesteps(num_inference_steps) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + + extra_kwargs = {} + if accepts_eta: + extra_kwargs["eta"] = eta + + for t in self.progress_bar(self.scheduler.timesteps): + latent_model_input = self.scheduler.scale_model_input(latents, t) + # predict the noise residual + noise_prediction = self.unet(latent_model_input, t).sample + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_prediction, t, latents, **extra_kwargs).prev_sample + + # adjust latents with inverse of vae scale + latents = latents / self.vqvae.config.scaling_factor + # decode the image latents with the VAE + image = self.vqvae.decode(latents).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py new file mode 100644 index 000000000..5e3bdba74 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py @@ -0,0 +1,18 @@ +from typing import TYPE_CHECKING + +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_pndm": ["PNDMPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_pndm import PNDMPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py new file mode 100644 index 000000000..ef78af194 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py @@ -0,0 +1,121 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import torch + +from ....models import UNet2DModel +from ....schedulers import PNDMScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class PNDMPipeline(DiffusionPipeline): + r""" + Pipeline for unconditional image generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image latents. + scheduler ([`PNDMScheduler`]): + A `PNDMScheduler` to be used in combination with `unet` to denoise the encoded image. + """ + + unet: UNet2DModel + scheduler: PNDMScheduler + + def __init__(self, unet: UNet2DModel, scheduler: PNDMScheduler): + super().__init__() + + scheduler = PNDMScheduler.from_config(scheduler.config) + + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + num_inference_steps: int = 50, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, `optional`, defaults to 1): + The number of images to generate. + num_inference_steps (`int`, `optional`, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator`, `optional`): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + output_type (`str`, `optional`, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. + + Example: + + ```py + >>> from diffusers import PNDMPipeline + + >>> # load model and scheduler + >>> pndm = PNDMPipeline.from_pretrained("google/ddpm-cifar10-32") + + >>> # run pipeline in inference (sample random noise and denoise) + >>> image = pndm().images[0] + + >>> # save image + >>> image.save("pndm_generated_image.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. 
+ """ + # For more information on the sampling method you can take a look at Algorithm 2 of + # the official paper: https://arxiv.org/pdf/2202.09778.pdf + + # Sample gaussian noise to begin loop + image = randn_tensor( + (batch_size, self.unet.config.in_channels, self.unet.config.sample_size, self.unet.config.sample_size), + generator=generator, + device=self.device, + ) + + self.scheduler.set_timesteps(num_inference_steps) + for t in self.progress_bar(self.scheduler.timesteps): + model_output = self.unet(image, t).sample + + image = self.scheduler.step(model_output, t, image).prev_sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py new file mode 100644 index 000000000..2c6b04af5 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_repaint": ["RePaintPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_repaint import RePaintPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py new file mode 100644 index 000000000..c03a3d8fc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py @@ -0,0 +1,230 @@ +# Copyright 2024 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch + +from ....models import UNet2DModel +from ....schedulers import RePaintScheduler +from ....utils import PIL_INTERPOLATION, deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. 
Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +def _preprocess_mask(mask: Union[List, PIL.Image.Image, torch.Tensor]): + if isinstance(mask, torch.Tensor): + return mask + elif isinstance(mask, PIL.Image.Image): + mask = [mask] + + if isinstance(mask[0], PIL.Image.Image): + w, h = mask[0].size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + mask = [np.array(m.convert("L").resize((w, h), resample=PIL_INTERPOLATION["nearest"]))[None, :] for m in mask] + mask = np.concatenate(mask, axis=0) + mask = mask.astype(np.float32) / 255.0 + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + elif isinstance(mask[0], torch.Tensor): + mask = torch.cat(mask, dim=0) + return mask + + +class RePaintPipeline(DiffusionPipeline): + r""" + Pipeline for image inpainting using RePaint. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image latents. + scheduler ([`RePaintScheduler`]): + A `RePaintScheduler` to be used in combination with `unet` to denoise the encoded image. + """ + + unet: UNet2DModel + scheduler: RePaintScheduler + model_cpu_offload_seq = "unet" + + def __init__(self, unet, scheduler): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + image: Union[torch.Tensor, PIL.Image.Image], + mask_image: Union[torch.Tensor, PIL.Image.Image], + num_inference_steps: int = 250, + eta: float = 0.0, + jump_length: int = 10, + jump_n_sample: int = 10, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + image (`torch.FloatTensor` or `PIL.Image.Image`): + The original image to inpaint on. + mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + The mask_image where 0.0 define which part of the original image to inpaint. + num_inference_steps (`int`, *optional*, defaults to 1000): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + eta (`float`): + The weight of the added noise in a diffusion step. Its value is between 0.0 and 1.0; 0.0 corresponds to + DDIM and 1.0 is the DDPM scheduler. + jump_length (`int`, *optional*, defaults to 10): + The number of steps taken forward in time before going backward in time for a single jump ("j" in + RePaint paper). Take a look at Figure 9 and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf). 
+ jump_n_sample (`int`, *optional*, defaults to 10): + The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9 + and 10 in the [paper](https://arxiv.org/pdf/2201.09865.pdf). + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + output_type (`str`, `optional`, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. + + Example: + + ```py + >>> from io import BytesIO + >>> import torch + >>> import PIL + >>> import requests + >>> from diffusers import RePaintPipeline, RePaintScheduler + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/celeba_hq_256.png" + >>> mask_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/repaint/mask_256.png" + + >>> # Load the original image and the mask as PIL images + >>> original_image = download_image(img_url).resize((256, 256)) + >>> mask_image = download_image(mask_url).resize((256, 256)) + + >>> # Load the RePaint scheduler and pipeline based on a pretrained DDPM model + >>> scheduler = RePaintScheduler.from_pretrained("google/ddpm-ema-celebahq-256") + >>> pipe = RePaintPipeline.from_pretrained("google/ddpm-ema-celebahq-256", scheduler=scheduler) + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> output = pipe( + ... image=original_image, + ... mask_image=mask_image, + ... num_inference_steps=250, + ... eta=0.0, + ... jump_length=10, + ... jump_n_sample=10, + ... generator=generator, + ... ) + >>> inpainted_image = output.images[0] + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + + original_image = image + + original_image = _preprocess_image(original_image) + original_image = original_image.to(device=self._execution_device, dtype=self.unet.dtype) + mask_image = _preprocess_mask(mask_image) + mask_image = mask_image.to(device=self._execution_device, dtype=self.unet.dtype) + + batch_size = original_image.shape[0] + + # sample gaussian noise to begin the loop + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + image_shape = original_image.shape + image = randn_tensor(image_shape, generator=generator, device=self._execution_device, dtype=self.unet.dtype) + + # set step values + self.scheduler.set_timesteps(num_inference_steps, jump_length, jump_n_sample, self._execution_device) + self.scheduler.eta = eta + + t_last = self.scheduler.timesteps[0] + 1 + generator = generator[0] if isinstance(generator, list) else generator + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + if t < t_last: + # predict the noise residual + model_output = self.unet(image, t).sample + # compute previous image: x_t -> x_t-1 + image = self.scheduler.step(model_output, t, image, original_image, mask_image, generator).prev_sample + + else: + # compute the reverse: x_t-1 -> x_t + image = self.scheduler.undo_step(image, t_last, generator) + t_last = t + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py new file mode 100644 index 000000000..87c167c3d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_score_sde_ve": ["ScoreSdeVePipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_score_sde_ve import ScoreSdeVePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py new file mode 100644 index 000000000..b0bb114a8 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py @@ -0,0 +1,109 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Union + +import torch + +from ....models import UNet2DModel +from ....schedulers import ScoreSdeVeScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class ScoreSdeVePipeline(DiffusionPipeline): + r""" + Pipeline for unconditional image generation. + + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image. + scheduler ([`ScoreSdeVeScheduler`]): + A `ScoreSdeVeScheduler` to be used in combination with `unet` to denoise the encoded image. + """ + + unet: UNet2DModel + scheduler: ScoreSdeVeScheduler + + def __init__(self, unet: UNet2DModel, scheduler: ScoreSdeVeScheduler): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + num_inference_steps: int = 2000, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of images to generate. + generator (`torch.Generator`, `optional`): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + output_type (`str`, `optional`, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + + img_size = self.unet.config.sample_size + shape = (batch_size, 3, img_size, img_size) + + model = self.unet + + sample = randn_tensor(shape, generator=generator) * self.scheduler.init_noise_sigma + sample = sample.to(self.device) + + self.scheduler.set_timesteps(num_inference_steps) + self.scheduler.set_sigmas(num_inference_steps) + + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + sigma_t = self.scheduler.sigmas[i] * torch.ones(shape[0], device=self.device) + + # correction step + for _ in range(self.scheduler.config.correct_steps): + model_output = self.unet(sample, sigma_t).sample + sample = self.scheduler.step_correct(model_output, sample, generator=generator).prev_sample + + # prediction step + model_output = model(sample, sigma_t).sample + output = self.scheduler.step_pred(model_output, t, sample, generator=generator) + + sample, sample_mean = output.prev_sample, output.prev_sample_mean + + sample = sample_mean.clamp(0, 1) + sample = sample.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + sample = self.numpy_to_pil(sample) + + if not return_dict: + return (sample,) + + return ImagePipelineOutput(images=sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py new file mode 100644 index 000000000..150954baa --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py @@ -0,0 +1,75 @@ +# flake8: noqa +from typing import TYPE_CHECKING +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, + _LazyModule, + is_note_seq_available, + 
OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + get_objects_from_module, +) + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ....utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["continous_encoder"] = ["SpectrogramContEncoder"] + _import_structure["notes_encoder"] = ["SpectrogramNotesEncoder"] + _import_structure["pipeline_spectrogram_diffusion"] = [ + "SpectrogramContEncoder", + "SpectrogramDiffusionPipeline", + "T5FilmDecoder", + ] +try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ....utils import dummy_transformers_and_torch_and_note_seq_objects + + _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects)) +else: + _import_structure["midi_utils"] = ["MidiProcessor"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_spectrogram_diffusion import SpectrogramDiffusionPipeline + from .pipeline_spectrogram_diffusion import SpectrogramContEncoder + from .pipeline_spectrogram_diffusion import SpectrogramNotesEncoder + from .pipeline_spectrogram_diffusion import T5FilmDecoder + + try: + if not (is_transformers_available() and is_torch_available() and is_note_seq_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ....utils.dummy_transformers_and_torch_and_note_seq_objects import * + + else: + from .midi_utils import MidiProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py new file mode 100644 index 000000000..8664c2fb6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py @@ -0,0 +1,92 @@ +# Copyright 2022 The Music Spectrogram Diffusion Authors. +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
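+#
+# Usage sketch: the encoder below embeds already-extracted spectrogram frames, so
+# `forward` expects a float tensor of shape (batch, targets_context_length, input_dims)
+# together with a boolean mask of shape (batch, targets_context_length), and returns the
+# encoded sequence plus the unchanged mask. The hyperparameter values here are
+# illustrative only, not taken from a released checkpoint:
+#
+#   enc = SpectrogramContEncoder(
+#       input_dims=128, targets_context_length=256, d_model=768, dropout_rate=0.1,
+#       num_layers=12, num_heads=12, d_kv=64, d_ff=2048, feed_forward_proj="gated-gelu",
+#   )
+#   hidden, mask = enc(frames, frames_mask)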
+ +import torch +import torch.nn as nn +from transformers.modeling_utils import ModuleUtilsMixin +from transformers.models.t5.modeling_t5 import ( + T5Block, + T5Config, + T5LayerNorm, +) + +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin + + +class SpectrogramContEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): + @register_to_config + def __init__( + self, + input_dims: int, + targets_context_length: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + is_decoder: bool = False, + ): + super().__init__() + + self.input_proj = nn.Linear(input_dims, d_model, bias=False) + + self.position_encoding = nn.Embedding(targets_context_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + feed_forward_proj=feed_forward_proj, + dropout_rate=dropout_rate, + is_decoder=is_decoder, + is_encoder_decoder=False, + ) + self.encoders = nn.ModuleList() + for lyr_num in range(num_layers): + lyr = T5Block(t5config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(d_model) + self.dropout_post = nn.Dropout(p=dropout_rate) + + def forward(self, encoder_inputs, encoder_inputs_mask): + x = self.input_proj(encoder_inputs) + + # terminal relative positional encodings + max_positions = encoder_inputs.shape[1] + input_positions = torch.arange(max_positions, device=encoder_inputs.device) + + seq_lens = encoder_inputs_mask.sum(-1) + input_positions = torch.roll(input_positions.unsqueeze(0), tuple(seq_lens.tolist()), dims=0) + x += self.position_encoding(input_positions) + + x = self.dropout_pre(x) + + # inverted the attention mask + input_shape = encoder_inputs.size() + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) + + for lyr in self.encoders: + x = lyr(x, extended_attention_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py new file mode 100644 index 000000000..e777e8449 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py @@ -0,0 +1,667 @@ +# Copyright 2022 The Music Spectrogram Diffusion Authors. +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
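+#
+# Vocabulary layout sketch (derived from the Codec and Tokenizer definitions below): the
+# Codec places "shift" events first, starting at index 0, followed by the other event
+# ranges in the order they are registered (pitch, velocity, tie, program, drum). With the
+# defaults below, DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND = 1000 shift steps
+# occupy indices 0..1000, and each later range is offset by the combined size of the
+# ranges before it. The Tokenizer then shifts every codec index up by 3 to make room for
+# the special tokens PAD=0, EOS=1 and UNK=2, appends an EOS, and pads the sequence out to
+# INPUT_FEATURE_LENGTH.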
+ +import dataclasses +import math +import os +from typing import Any, Callable, List, Mapping, MutableMapping, Optional, Sequence, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from ....utils import is_note_seq_available +from .pipeline_spectrogram_diffusion import TARGET_FEATURE_LENGTH + + +if is_note_seq_available(): + import note_seq +else: + raise ImportError("Please install note-seq via `pip install note-seq`") + + +INPUT_FEATURE_LENGTH = 2048 + +SAMPLE_RATE = 16000 +HOP_SIZE = 320 +FRAME_RATE = int(SAMPLE_RATE // HOP_SIZE) + +DEFAULT_STEPS_PER_SECOND = 100 +DEFAULT_MAX_SHIFT_SECONDS = 10 +DEFAULT_NUM_VELOCITY_BINS = 1 + +SLAKH_CLASS_PROGRAMS = { + "Acoustic Piano": 0, + "Electric Piano": 4, + "Chromatic Percussion": 8, + "Organ": 16, + "Acoustic Guitar": 24, + "Clean Electric Guitar": 26, + "Distorted Electric Guitar": 29, + "Acoustic Bass": 32, + "Electric Bass": 33, + "Violin": 40, + "Viola": 41, + "Cello": 42, + "Contrabass": 43, + "Orchestral Harp": 46, + "Timpani": 47, + "String Ensemble": 48, + "Synth Strings": 50, + "Choir and Voice": 52, + "Orchestral Hit": 55, + "Trumpet": 56, + "Trombone": 57, + "Tuba": 58, + "French Horn": 60, + "Brass Section": 61, + "Soprano/Alto Sax": 64, + "Tenor Sax": 66, + "Baritone Sax": 67, + "Oboe": 68, + "English Horn": 69, + "Bassoon": 70, + "Clarinet": 71, + "Pipe": 73, + "Synth Lead": 80, + "Synth Pad": 88, +} + + +@dataclasses.dataclass +class NoteRepresentationConfig: + """Configuration note representations.""" + + onsets_only: bool + include_ties: bool + + +@dataclasses.dataclass +class NoteEventData: + pitch: int + velocity: Optional[int] = None + program: Optional[int] = None + is_drum: Optional[bool] = None + instrument: Optional[int] = None + + +@dataclasses.dataclass +class NoteEncodingState: + """Encoding state for note transcription, keeping track of active pitches.""" + + # velocity bin for active pitches and programs + active_pitches: MutableMapping[Tuple[int, int], int] = dataclasses.field(default_factory=dict) + + +@dataclasses.dataclass +class EventRange: + type: str + min_value: int + max_value: int + + +@dataclasses.dataclass +class Event: + type: str + value: int + + +class Tokenizer: + def __init__(self, regular_ids: int): + # The special tokens: 0=PAD, 1=EOS, and 2=UNK + self._num_special_tokens = 3 + self._num_regular_tokens = regular_ids + + def encode(self, token_ids): + encoded = [] + for token_id in token_ids: + if not 0 <= token_id < self._num_regular_tokens: + raise ValueError( + f"token_id {token_id} does not fall within valid range of [0, {self._num_regular_tokens})" + ) + encoded.append(token_id + self._num_special_tokens) + + # Add EOS token + encoded.append(1) + + # Pad to till INPUT_FEATURE_LENGTH + encoded = encoded + [0] * (INPUT_FEATURE_LENGTH - len(encoded)) + + return encoded + + +class Codec: + """Encode and decode events. + + Useful for declaring what certain ranges of a vocabulary should be used for. This is intended to be used from + Python before encoding or after decoding with GenericTokenVocabulary. This class is more lightweight and does not + include things like EOS or UNK token handling. + + To ensure that 'shift' events are always the first block of the vocab and start at 0, that event type is required + and specified separately. + """ + + def __init__(self, max_shift_steps: int, steps_per_second: float, event_ranges: List[EventRange]): + """Define Codec. + + Args: + max_shift_steps: Maximum number of shift steps that can be encoded. 
+ steps_per_second: Shift steps will be interpreted as having a duration of + 1 / steps_per_second. + event_ranges: Other supported event types and their ranges. + """ + self.steps_per_second = steps_per_second + self._shift_range = EventRange(type="shift", min_value=0, max_value=max_shift_steps) + self._event_ranges = [self._shift_range] + event_ranges + # Ensure all event types have unique names. + assert len(self._event_ranges) == len({er.type for er in self._event_ranges}) + + @property + def num_classes(self) -> int: + return sum(er.max_value - er.min_value + 1 for er in self._event_ranges) + + # The next couple methods are simplified special case methods just for shift + # events that are intended to be used from within autograph functions. + + def is_shift_event_index(self, index: int) -> bool: + return (self._shift_range.min_value <= index) and (index <= self._shift_range.max_value) + + @property + def max_shift_steps(self) -> int: + return self._shift_range.max_value + + def encode_event(self, event: Event) -> int: + """Encode an event to an index.""" + offset = 0 + for er in self._event_ranges: + if event.type == er.type: + if not er.min_value <= event.value <= er.max_value: + raise ValueError( + f"Event value {event.value} is not within valid range " + f"[{er.min_value}, {er.max_value}] for type {event.type}" + ) + return offset + event.value - er.min_value + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event type: {event.type}") + + def event_type_range(self, event_type: str) -> Tuple[int, int]: + """Return [min_id, max_id] for an event type.""" + offset = 0 + for er in self._event_ranges: + if event_type == er.type: + return offset, offset + (er.max_value - er.min_value) + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event type: {event_type}") + + def decode_event_index(self, index: int) -> Event: + """Decode an event index to an Event.""" + offset = 0 + for er in self._event_ranges: + if offset <= index <= offset + er.max_value - er.min_value: + return Event(type=er.type, value=er.min_value + index - offset) + offset += er.max_value - er.min_value + 1 + + raise ValueError(f"Unknown event index: {index}") + + +@dataclasses.dataclass +class ProgramGranularity: + # both tokens_map_fn and program_map_fn should be idempotent + tokens_map_fn: Callable[[Sequence[int], Codec], Sequence[int]] + program_map_fn: Callable[[int], int] + + +def drop_programs(tokens, codec: Codec): + """Drops program change events from a token sequence.""" + min_program_id, max_program_id = codec.event_type_range("program") + return tokens[(tokens < min_program_id) | (tokens > max_program_id)] + + +def programs_to_midi_classes(tokens, codec): + """Modifies program events to be the first program in the MIDI class.""" + min_program_id, max_program_id = codec.event_type_range("program") + is_program = (tokens >= min_program_id) & (tokens <= max_program_id) + return np.where(is_program, min_program_id + 8 * ((tokens - min_program_id) // 8), tokens) + + +PROGRAM_GRANULARITIES = { + # "flat" granularity; drop program change tokens and set NoteSequence + # programs to zero + "flat": ProgramGranularity(tokens_map_fn=drop_programs, program_map_fn=lambda program: 0), + # map each program to the first program in its MIDI class + "midi_class": ProgramGranularity( + tokens_map_fn=programs_to_midi_classes, program_map_fn=lambda program: 8 * (program // 8) + ), + # leave programs as is + "full": ProgramGranularity(tokens_map_fn=lambda tokens, codec: tokens, 
program_map_fn=lambda program: program), +} + + +def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1): + """ + equivalent of tf.signal.frame + """ + signal_length = signal.shape[axis] + if pad_end: + frames_overlap = frame_length - frame_step + rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap) + pad_size = int(frame_length - rest_samples) + + if pad_size != 0: + pad_axis = [0] * signal.ndim + pad_axis[axis] = pad_size + signal = F.pad(signal, pad_axis, "constant", pad_value) + frames = signal.unfold(axis, frame_length, frame_step) + return frames + + +def program_to_slakh_program(program): + # this is done very hackily, probably should use a custom mapping + for slakh_program in sorted(SLAKH_CLASS_PROGRAMS.values(), reverse=True): + if program >= slakh_program: + return slakh_program + + +def audio_to_frames( + samples, + hop_size: int, + frame_rate: int, +) -> Tuple[Sequence[Sequence[int]], torch.Tensor]: + """Convert audio samples to non-overlapping frames and frame times.""" + frame_size = hop_size + samples = np.pad(samples, [0, frame_size - len(samples) % frame_size], mode="constant") + + # Split audio into frames. + frames = frame( + torch.Tensor(samples).unsqueeze(0), + frame_length=frame_size, + frame_step=frame_size, + pad_end=False, # TODO check why its off by 1 here when True + ) + + num_frames = len(samples) // frame_size + + times = np.arange(num_frames) / frame_rate + return frames, times + + +def note_sequence_to_onsets_and_offsets_and_programs( + ns: note_seq.NoteSequence, +) -> Tuple[Sequence[float], Sequence[NoteEventData]]: + """Extract onset & offset times and pitches & programs from a NoteSequence. + + The onset & offset times will not necessarily be in sorted order. + + Args: + ns: NoteSequence from which to extract onsets and offsets. + + Returns: + times: A list of note onset and offset times. values: A list of NoteEventData objects where velocity is zero for + note + offsets. + """ + # Sort by program and pitch and put offsets before onsets as a tiebreaker for + # subsequent stable sort. 
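+    # (Offsets are the entries built with velocity=0 below; emitting them before the
+    # onsets means the stable sort by time in `encode_and_index_events` keeps a
+    # same-time offset ahead of its onset, so a re-struck note is released before it
+    # is re-attacked.)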
+ notes = sorted(ns.notes, key=lambda note: (note.is_drum, note.program, note.pitch)) + times = [note.end_time for note in notes if not note.is_drum] + [note.start_time for note in notes] + values = [ + NoteEventData(pitch=note.pitch, velocity=0, program=note.program, is_drum=False) + for note in notes + if not note.is_drum + ] + [ + NoteEventData(pitch=note.pitch, velocity=note.velocity, program=note.program, is_drum=note.is_drum) + for note in notes + ] + return times, values + + +def num_velocity_bins_from_codec(codec: Codec): + """Get number of velocity bins from event codec.""" + lo, hi = codec.event_type_range("velocity") + return hi - lo + + +# segment an array into segments of length n +def segment(a, n): + return [a[i : i + n] for i in range(0, len(a), n)] + + +def velocity_to_bin(velocity, num_velocity_bins): + if velocity == 0: + return 0 + else: + return math.ceil(num_velocity_bins * velocity / note_seq.MAX_MIDI_VELOCITY) + + +def note_event_data_to_events( + state: Optional[NoteEncodingState], + value: NoteEventData, + codec: Codec, +) -> Sequence[Event]: + """Convert note event data to a sequence of events.""" + if value.velocity is None: + # onsets only, no program or velocity + return [Event("pitch", value.pitch)] + else: + num_velocity_bins = num_velocity_bins_from_codec(codec) + velocity_bin = velocity_to_bin(value.velocity, num_velocity_bins) + if value.program is None: + # onsets + offsets + velocities only, no programs + if state is not None: + state.active_pitches[(value.pitch, 0)] = velocity_bin + return [Event("velocity", velocity_bin), Event("pitch", value.pitch)] + else: + if value.is_drum: + # drum events use a separate vocabulary + return [Event("velocity", velocity_bin), Event("drum", value.pitch)] + else: + # program + velocity + pitch + if state is not None: + state.active_pitches[(value.pitch, value.program)] = velocity_bin + return [ + Event("program", value.program), + Event("velocity", velocity_bin), + Event("pitch", value.pitch), + ] + + +def note_encoding_state_to_events(state: NoteEncodingState) -> Sequence[Event]: + """Output program and pitch events for active notes plus a final tie event.""" + events = [] + for pitch, program in sorted(state.active_pitches.keys(), key=lambda k: k[::-1]): + if state.active_pitches[(pitch, program)]: + events += [Event("program", program), Event("pitch", pitch)] + events.append(Event("tie", 0)) + return events + + +def encode_and_index_events( + state, event_times, event_values, codec, frame_times, encode_event_fn, encoding_state_to_events_fn=None +): + """Encode a sequence of timed events and index to audio frame times. + + Encodes time shifts as repeated single step shifts for later run length encoding. + + Optionally, also encodes a sequence of "state events", keeping track of the current encoding state at each audio + frame. This can be used e.g. to prepend events representing the current state to a targets segment. + + Args: + state: Initial event encoding state. + event_times: Sequence of event times. + event_values: Sequence of event values. + encode_event_fn: Function that transforms event value into a sequence of one + or more Event objects. + codec: An Codec object that maps Event objects to indices. + frame_times: Time for every audio frame. + encoding_state_to_events_fn: Function that transforms encoding state into a + sequence of one or more Event objects. + + Returns: + events: Encoded events and shifts. event_start_indices: Corresponding start event index for every audio frame. 
+ Note: one event can correspond to multiple audio indices due to sampling rate differences. This makes + splitting sequences tricky because the same event can appear at the end of one sequence and the beginning of + another. + event_end_indices: Corresponding end event index for every audio frame. Used + to ensure when slicing that one chunk ends where the next begins. Should always be true that + event_end_indices[i] = event_start_indices[i + 1]. + state_events: Encoded "state" events representing the encoding state before + each event. + state_event_indices: Corresponding state event index for every audio frame. + """ + indices = np.argsort(event_times, kind="stable") + event_steps = [round(event_times[i] * codec.steps_per_second) for i in indices] + event_values = [event_values[i] for i in indices] + + events = [] + state_events = [] + event_start_indices = [] + state_event_indices = [] + + cur_step = 0 + cur_event_idx = 0 + cur_state_event_idx = 0 + + def fill_event_start_indices_to_cur_step(): + while ( + len(event_start_indices) < len(frame_times) + and frame_times[len(event_start_indices)] < cur_step / codec.steps_per_second + ): + event_start_indices.append(cur_event_idx) + state_event_indices.append(cur_state_event_idx) + + for event_step, event_value in zip(event_steps, event_values): + while event_step > cur_step: + events.append(codec.encode_event(Event(type="shift", value=1))) + cur_step += 1 + fill_event_start_indices_to_cur_step() + cur_event_idx = len(events) + cur_state_event_idx = len(state_events) + if encoding_state_to_events_fn: + # Dump state to state events *before* processing the next event, because + # we want to capture the state prior to the occurrence of the event. + for e in encoding_state_to_events_fn(state): + state_events.append(codec.encode_event(e)) + + for e in encode_event_fn(state, event_value, codec): + events.append(codec.encode_event(e)) + + # After the last event, continue filling out the event_start_indices array. + # The inequality is not strict because if our current step lines up exactly + # with (the start of) an audio frame, we need to add an additional shift event + # to "cover" that frame. + while cur_step / codec.steps_per_second <= frame_times[-1]: + events.append(codec.encode_event(Event(type="shift", value=1))) + cur_step += 1 + fill_event_start_indices_to_cur_step() + cur_event_idx = len(events) + + # Now fill in event_end_indices. We need this extra array to make sure that + # when we slice events, each slice ends exactly where the subsequent slice + # begins. 
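+    # e.g. event_start_indices == [0, 0, 3, 7] -> event_end_indices == [0, 3, 7, len(events)],
+    # so event_end_indices[i] == event_start_indices[i + 1] for every frame but the last.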
+ event_end_indices = event_start_indices[1:] + [len(events)] + + events = np.array(events).astype(np.int32) + state_events = np.array(state_events).astype(np.int32) + event_start_indices = segment(np.array(event_start_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + event_end_indices = segment(np.array(event_end_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + state_event_indices = segment(np.array(state_event_indices).astype(np.int32), TARGET_FEATURE_LENGTH) + + outputs = [] + for start_indices, end_indices, event_indices in zip(event_start_indices, event_end_indices, state_event_indices): + outputs.append( + { + "inputs": events, + "event_start_indices": start_indices, + "event_end_indices": end_indices, + "state_events": state_events, + "state_event_indices": event_indices, + } + ) + + return outputs + + +def extract_sequence_with_indices(features, state_events_end_token=None, feature_key="inputs"): + """Extract target sequence corresponding to audio token segment.""" + features = features.copy() + start_idx = features["event_start_indices"][0] + end_idx = features["event_end_indices"][-1] + + features[feature_key] = features[feature_key][start_idx:end_idx] + + if state_events_end_token is not None: + # Extract the state events corresponding to the audio start token, and + # prepend them to the targets array. + state_event_start_idx = features["state_event_indices"][0] + state_event_end_idx = state_event_start_idx + 1 + while features["state_events"][state_event_end_idx - 1] != state_events_end_token: + state_event_end_idx += 1 + features[feature_key] = np.concatenate( + [ + features["state_events"][state_event_start_idx:state_event_end_idx], + features[feature_key], + ], + axis=0, + ) + + return features + + +def map_midi_programs( + feature, codec: Codec, granularity_type: str = "full", feature_key: str = "inputs" +) -> Mapping[str, Any]: + """Apply MIDI program map to token sequences.""" + granularity = PROGRAM_GRANULARITIES[granularity_type] + + feature[feature_key] = granularity.tokens_map_fn(feature[feature_key], codec) + return feature + + +def run_length_encode_shifts_fn( + features, + codec: Codec, + feature_key: str = "inputs", + state_change_event_types: Sequence[str] = (), +) -> Callable[[Mapping[str, Any]], Mapping[str, Any]]: + """Return a function that run-length encodes shifts for a given codec. + + Args: + codec: The Codec to use for shift events. + feature_key: The feature key for which to run-length encode shifts. + state_change_event_types: A list of event types that represent state + changes; tokens corresponding to these event types will be interpreted as state changes and redundant ones + will be removed. + + Returns: + A preprocessing function that run-length encodes single-step shifts. + """ + state_change_event_ranges = [codec.event_type_range(event_type) for event_type in state_change_event_types] + + def run_length_encode_shifts(features: MutableMapping[str, Any]) -> Mapping[str, Any]: + """Combine leading/interior shifts, trim trailing shifts. + + Args: + features: Dict of features to process. + + Returns: + A dict of features. + """ + events = features[feature_key] + + shift_steps = 0 + total_shift_steps = 0 + output = np.array([], dtype=np.int32) + + current_state = np.zeros(len(state_change_event_ranges), dtype=np.int32) + + for event in events: + if codec.is_shift_event_index(event): + shift_steps += 1 + total_shift_steps += 1 + + else: + # If this event is a state change and has the same value as the current + # state, we can skip it entirely. 
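+                # e.g. with state_change_event_types=("velocity", "program"), two notes in
+                # a row at the same velocity produce the velocity token only once; the
+                # second, identical velocity event is detected as redundant and dropped.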
+ is_redundant = False + for i, (min_index, max_index) in enumerate(state_change_event_ranges): + if (min_index <= event) and (event <= max_index): + if current_state[i] == event: + is_redundant = True + current_state[i] = event + if is_redundant: + continue + + # Once we've reached a non-shift event, RLE all previous shift events + # before outputting the non-shift event. + if shift_steps > 0: + shift_steps = total_shift_steps + while shift_steps > 0: + output_steps = np.minimum(codec.max_shift_steps, shift_steps) + output = np.concatenate([output, [output_steps]], axis=0) + shift_steps -= output_steps + output = np.concatenate([output, [event]], axis=0) + + features[feature_key] = output + return features + + return run_length_encode_shifts(features) + + +def note_representation_processor_chain(features, codec: Codec, note_representation_config: NoteRepresentationConfig): + tie_token = codec.encode_event(Event("tie", 0)) + state_events_end_token = tie_token if note_representation_config.include_ties else None + + features = extract_sequence_with_indices( + features, state_events_end_token=state_events_end_token, feature_key="inputs" + ) + + features = map_midi_programs(features, codec) + + features = run_length_encode_shifts_fn(features, codec, state_change_event_types=["velocity", "program"]) + + return features + + +class MidiProcessor: + def __init__(self): + self.codec = Codec( + max_shift_steps=DEFAULT_MAX_SHIFT_SECONDS * DEFAULT_STEPS_PER_SECOND, + steps_per_second=DEFAULT_STEPS_PER_SECOND, + event_ranges=[ + EventRange("pitch", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + EventRange("velocity", 0, DEFAULT_NUM_VELOCITY_BINS), + EventRange("tie", 0, 0), + EventRange("program", note_seq.MIN_MIDI_PROGRAM, note_seq.MAX_MIDI_PROGRAM), + EventRange("drum", note_seq.MIN_MIDI_PITCH, note_seq.MAX_MIDI_PITCH), + ], + ) + self.tokenizer = Tokenizer(self.codec.num_classes) + self.note_representation_config = NoteRepresentationConfig(onsets_only=False, include_ties=True) + + def __call__(self, midi: Union[bytes, os.PathLike, str]): + if not isinstance(midi, bytes): + with open(midi, "rb") as f: + midi = f.read() + + ns = note_seq.midi_to_note_sequence(midi) + ns_sus = note_seq.apply_sustain_control_changes(ns) + + for note in ns_sus.notes: + if not note.is_drum: + note.program = program_to_slakh_program(note.program) + + samples = np.zeros(int(ns_sus.total_time * SAMPLE_RATE)) + + _, frame_times = audio_to_frames(samples, HOP_SIZE, FRAME_RATE) + times, values = note_sequence_to_onsets_and_offsets_and_programs(ns_sus) + + events = encode_and_index_events( + state=NoteEncodingState(), + event_times=times, + event_values=values, + frame_times=frame_times, + codec=self.codec, + encode_event_fn=note_event_data_to_events, + encoding_state_to_events_fn=note_encoding_state_to_events, + ) + + events = [ + note_representation_processor_chain(event, self.codec, self.note_representation_config) for event in events + ] + input_tokens = [self.tokenizer.encode(event["inputs"]) for event in events] + + return input_tokens diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py new file mode 100644 index 000000000..1259f0bf0 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py @@ -0,0 +1,86 @@ +# Copyright 2022 The Music Spectrogram Diffusion Authors. +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn as nn +from transformers.modeling_utils import ModuleUtilsMixin +from transformers.models.t5.modeling_t5 import T5Block, T5Config, T5LayerNorm + +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin + + +class SpectrogramNotesEncoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): + @register_to_config + def __init__( + self, + max_length: int, + vocab_size: int, + d_model: int, + dropout_rate: float, + num_layers: int, + num_heads: int, + d_kv: int, + d_ff: int, + feed_forward_proj: str, + is_decoder: bool = False, + ): + super().__init__() + + self.token_embedder = nn.Embedding(vocab_size, d_model) + + self.position_encoding = nn.Embedding(max_length, d_model) + self.position_encoding.weight.requires_grad = False + + self.dropout_pre = nn.Dropout(p=dropout_rate) + + t5config = T5Config( + vocab_size=vocab_size, + d_model=d_model, + num_heads=num_heads, + d_kv=d_kv, + d_ff=d_ff, + dropout_rate=dropout_rate, + feed_forward_proj=feed_forward_proj, + is_decoder=is_decoder, + is_encoder_decoder=False, + ) + + self.encoders = nn.ModuleList() + for lyr_num in range(num_layers): + lyr = T5Block(t5config) + self.encoders.append(lyr) + + self.layer_norm = T5LayerNorm(d_model) + self.dropout_post = nn.Dropout(p=dropout_rate) + + def forward(self, encoder_input_tokens, encoder_inputs_mask): + x = self.token_embedder(encoder_input_tokens) + + seq_length = encoder_input_tokens.shape[1] + inputs_positions = torch.arange(seq_length, device=encoder_input_tokens.device) + x += self.position_encoding(inputs_positions) + + x = self.dropout_pre(x) + + # inverted the attention mask + input_shape = encoder_input_tokens.size() + extended_attention_mask = self.get_extended_attention_mask(encoder_inputs_mask, input_shape) + + for lyr in self.encoders: + x = lyr(x, extended_attention_mask)[0] + x = self.layer_norm(x) + + return self.dropout_post(x), encoder_inputs_mask diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py new file mode 100644 index 000000000..496a1f765 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py @@ -0,0 +1,269 @@ +# Copyright 2022 The Music Spectrogram Diffusion Authors. +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Any, Callable, List, Optional, Tuple, Union + +import numpy as np +import torch + +from ....models import T5FilmDecoder +from ....schedulers import DDPMScheduler +from ....utils import is_onnx_available, logging +from ....utils.torch_utils import randn_tensor + + +if is_onnx_available(): + from ...onnx_utils import OnnxRuntimeModel + +from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline +from .continuous_encoder import SpectrogramContEncoder +from .notes_encoder import SpectrogramNotesEncoder + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +TARGET_FEATURE_LENGTH = 256 + + +class SpectrogramDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for unconditional audio generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + notes_encoder ([`SpectrogramNotesEncoder`]): + continuous_encoder ([`SpectrogramContEncoder`]): + decoder ([`T5FilmDecoder`]): + A [`T5FilmDecoder`] to denoise the encoded audio latents. + scheduler ([`DDPMScheduler`]): + A scheduler to be used in combination with `decoder` to denoise the encoded audio latents. + melgan ([`OnnxRuntimeModel`]): + """ + + _optional_components = ["melgan"] + + def __init__( + self, + notes_encoder: SpectrogramNotesEncoder, + continuous_encoder: SpectrogramContEncoder, + decoder: T5FilmDecoder, + scheduler: DDPMScheduler, + melgan: OnnxRuntimeModel if is_onnx_available() else Any, + ) -> None: + super().__init__() + + # From MELGAN + self.min_value = math.log(1e-5) # Matches MelGAN training. + self.max_value = 4.0 # Largest value for most examples + self.n_dims = 128 + + self.register_modules( + notes_encoder=notes_encoder, + continuous_encoder=continuous_encoder, + decoder=decoder, + scheduler=scheduler, + melgan=melgan, + ) + + def scale_features(self, features, output_range=(-1.0, 1.0), clip=False): + """Linearly scale features to network outputs range.""" + min_out, max_out = output_range + if clip: + features = torch.clip(features, self.min_value, self.max_value) + # Scale to [0, 1]. + zero_one = (features - self.min_value) / (self.max_value - self.min_value) + # Scale to [min_out, max_out]. + return zero_one * (max_out - min_out) + min_out + + def scale_to_features(self, outputs, input_range=(-1.0, 1.0), clip=False): + """Invert by linearly scaling network outputs to features range.""" + min_out, max_out = input_range + outputs = torch.clip(outputs, min_out, max_out) if clip else outputs + # Scale to [0, 1]. + zero_one = (outputs - min_out) / (max_out - min_out) + # Scale to [self.min_value, self.max_value]. 
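+        # e.g. with the default input_range=(-1.0, 1.0), an output of -1.0 maps back to
+        # self.min_value (log(1e-5), the MelGAN floor) and 1.0 maps back to
+        # self.max_value (4.0), inverting `scale_features` above.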
+ return zero_one * (self.max_value - self.min_value) + self.min_value + + def encode(self, input_tokens, continuous_inputs, continuous_mask): + tokens_mask = input_tokens > 0 + tokens_encoded, tokens_mask = self.notes_encoder( + encoder_input_tokens=input_tokens, encoder_inputs_mask=tokens_mask + ) + + continuous_encoded, continuous_mask = self.continuous_encoder( + encoder_inputs=continuous_inputs, encoder_inputs_mask=continuous_mask + ) + + return [(tokens_encoded, tokens_mask), (continuous_encoded, continuous_mask)] + + def decode(self, encodings_and_masks, input_tokens, noise_time): + timesteps = noise_time + if not torch.is_tensor(timesteps): + timesteps = torch.tensor([timesteps], dtype=torch.long, device=input_tokens.device) + elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: + timesteps = timesteps[None].to(input_tokens.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps * torch.ones(input_tokens.shape[0], dtype=timesteps.dtype, device=timesteps.device) + + logits = self.decoder( + encodings_and_masks=encodings_and_masks, decoder_input_tokens=input_tokens, decoder_noise_time=timesteps + ) + return logits + + @torch.no_grad() + def __call__( + self, + input_tokens: List[List[int]], + generator: Optional[torch.Generator] = None, + num_inference_steps: int = 100, + return_dict: bool = True, + output_type: str = "numpy", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ) -> Union[AudioPipelineOutput, Tuple]: + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + r""" + The call function to the pipeline for generation. + + Args: + input_tokens (`List[List[int]]`): + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality audio at the + expense of slower inference. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. + output_type (`str`, *optional*, defaults to `"numpy"`): + The output format of the generated audio. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. 
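+
+        When `output_type` is `"numpy"`, the optional `melgan` component (an ONNX-run vocoder) converts the
+        predicted mel spectrogram into waveform audio; pass `output_type="mel"` to get the raw mel spectrogram
+        instead.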
+ + Example: + + ```py + >>> from diffusers import SpectrogramDiffusionPipeline, MidiProcessor + + >>> pipe = SpectrogramDiffusionPipeline.from_pretrained("google/music-spectrogram-diffusion") + >>> pipe = pipe.to("cuda") + >>> processor = MidiProcessor() + + >>> # Download MIDI from: wget http://www.piano-midi.de/midis/beethoven/beethoven_hammerklavier_2.mid + >>> output = pipe(processor("beethoven_hammerklavier_2.mid")) + + >>> audio = output.audios[0] + ``` + + Returns: + [`pipelines.AudioPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated audio. + """ + + pred_mel = np.zeros([1, TARGET_FEATURE_LENGTH, self.n_dims], dtype=np.float32) + full_pred_mel = np.zeros([1, 0, self.n_dims], np.float32) + ones = torch.ones((1, TARGET_FEATURE_LENGTH), dtype=bool, device=self.device) + + for i, encoder_input_tokens in enumerate(input_tokens): + if i == 0: + encoder_continuous_inputs = torch.from_numpy(pred_mel[:1].copy()).to( + device=self.device, dtype=self.decoder.dtype + ) + # The first chunk has no previous context. + encoder_continuous_mask = torch.zeros((1, TARGET_FEATURE_LENGTH), dtype=bool, device=self.device) + else: + # The full song pipeline does not feed in a context feature, so the mask + # will be all 0s after the feature converter. Because we know we're + # feeding in a full context chunk from the previous prediction, set it + # to all 1s. + encoder_continuous_mask = ones + + encoder_continuous_inputs = self.scale_features( + encoder_continuous_inputs, output_range=[-1.0, 1.0], clip=True + ) + + encodings_and_masks = self.encode( + input_tokens=torch.IntTensor([encoder_input_tokens]).to(device=self.device), + continuous_inputs=encoder_continuous_inputs, + continuous_mask=encoder_continuous_mask, + ) + + # Sample encoder_continuous_inputs shaped gaussian noise to begin loop + x = randn_tensor( + shape=encoder_continuous_inputs.shape, + generator=generator, + device=self.device, + dtype=self.decoder.dtype, + ) + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + + # Denoising diffusion loop + for j, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + output = self.decode( + encodings_and_masks=encodings_and_masks, + input_tokens=x, + noise_time=t / self.scheduler.config.num_train_timesteps, # rescale to [0, 1) + ) + + # Compute previous output: x_t -> x_t-1 + x = self.scheduler.step(output, t, x, generator=generator).prev_sample + + mel = self.scale_to_features(x, input_range=[-1.0, 1.0]) + encoder_continuous_inputs = mel[:1] + pred_mel = mel.cpu().float().numpy() + + full_pred_mel = np.concatenate([full_pred_mel, pred_mel[:1]], axis=1) + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + callback(i, full_pred_mel) + + logger.info("Generated segment", i) + + if output_type == "numpy" and not is_onnx_available(): + raise ValueError( + "Cannot return output in 'np' format if ONNX is not available. Make sure to have ONNX installed or set 'output_type' to 'mel'." + ) + elif output_type == "numpy" and self.melgan is None: + raise ValueError( + "Cannot return output in 'np' format if melgan component is not defined. Make sure to define `self.melgan` or set 'output_type' to 'mel'." 
+ ) + + if output_type == "numpy": + output = self.melgan(input_features=full_pred_mel.astype(np.float32)) + else: + output = full_pred_mel + + if not return_dict: + return (output,) + + return AudioPipelineOutput(audios=output) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py new file mode 100644 index 000000000..36cf1a33c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING + +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ....utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"] + _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"] + + _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"] + _import_structure["pipeline_stable_diffusion_pix2pix_zero"] = ["StableDiffusionPix2PixZeroPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import * + + else: + from .pipeline_cycle_diffusion import CycleDiffusionPipeline + from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy + from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline + from .pipeline_stable_diffusion_paradigms import StableDiffusionParadigmsPipeline + from .pipeline_stable_diffusion_pix2pix_zero import StableDiffusionPix2PixZeroPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py new file mode 100644 index 000000000..0581effef --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py @@ -0,0 +1,948 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ....configuration_utils import FrozenDict +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import DDIMScheduler +from ....utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def posterior_sample(scheduler, latents, timestep, clean_latents, generator, eta): + # 1. get previous step value (=t-1) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps + + if prev_timestep <= 0: + return clean_latents + + # 2. 
compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[timestep] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) + + variance = scheduler._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # direction pointing to x_t + e_t = (latents - alpha_prod_t ** (0.5) * clean_latents) / (1 - alpha_prod_t) ** (0.5) + dir_xt = (1.0 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * e_t + noise = std_dev_t * randn_tensor( + clean_latents.shape, dtype=clean_latents.dtype, device=clean_latents.device, generator=generator + ) + prev_latents = alpha_prod_t_prev ** (0.5) * clean_latents + dir_xt + noise + + return prev_latents + + +def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): + # 1. get previous step value (=t-1) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[timestep] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + + # 4. Clip "predicted x_0" + if scheduler.config.clip_sample: + pred_original_sample = torch.clamp(pred_original_sample, -1, 1) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = scheduler._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred + + noise = (prev_latents - (alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction)) / ( + variance ** (0.5) * eta + ) + return noise + + +class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): + r""" + Pipeline for text-guided image to image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can only be an + instance of [`DDIMScheduler`]. 
+ safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
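+        Returns a `(prompt_embeds, negative_prompt_embeds)` tuple; `negative_prompt_embeds` is `None` when
+        classifier-free guidance is not used and no negative embeddings are supplied.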
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. 
Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. + prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + image = image.to(device=device, dtype=dtype) + + batch_size = image.shape[0] + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) + + # add noise to latents using the timestep + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + clean_latents = init_latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents, clean_latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + source_prompt: Union[str, List[str]], + image: PipelineImageInput = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + source_guidance_scale: Optional[float] = 1, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be used as the starting point. Can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + source_guidance_scale (`float`, *optional*, defaults to 1): + Guidance scale for the source prompt. This is useful to control the amount of influence the source + prompt has for encoding. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Example: + + ```py + import requests + import torch + from PIL import Image + from io import BytesIO + + from diffusers import CycleDiffusionPipeline, DDIMScheduler + + # load the pipeline + # make sure you're logged in with `huggingface-cli login` + model_id_or_path = "CompVis/stable-diffusion-v1-4" + scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler") + pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda") + + # let's download an initial image + url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png" + response = requests.get(url) + init_image = Image.open(BytesIO(response.content)).convert("RGB") + init_image = init_image.resize((512, 512)) + init_image.save("horse.png") + + # let's specify a prompt + source_prompt = "An astronaut riding a horse" + prompt = "An astronaut riding an elephant" + + # call the pipeline + image = pipe( + prompt=prompt, + source_prompt=source_prompt, + image=init_image, + num_inference_steps=100, + eta=0.1, + strength=0.8, + guidance_scale=2, + source_guidance_scale=1, + ).images[0] + + image.save("horse_to_elephant.png") + + # let's try another example + # See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion + url = ( + "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png" + ) + response = requests.get(url) + init_image = Image.open(BytesIO(response.content)).convert("RGB") + init_image = init_image.resize((512, 512)) + init_image.save("black.png") + + source_prompt = "A black colored car" + prompt = "A blue colored car" + + # call the pipeline + torch.manual_seed(0) + image = pipe( + prompt=prompt, + source_prompt=source_prompt, + image=init_image, + num_inference_steps=100, + eta=0.1, + strength=0.85, + guidance_scale=3, + source_guidance_scale=1, + ).images[0] + + image.save("black_to_blue.png") + ``` + + 
Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 1. Check inputs + self.check_inputs(prompt, strength, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds_tuple = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + prompt_embeds=prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + source_prompt_embeds_tuple = self.encode_prompt( + source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None, clip_skip=clip_skip + ) + if prompt_embeds_tuple[1] is not None: + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + else: + prompt_embeds = prompt_embeds_tuple[0] + if source_prompt_embeds_tuple[1] is not None: + source_prompt_embeds = torch.cat([source_prompt_embeds_tuple[1], source_prompt_embeds_tuple[0]]) + else: + source_prompt_embeds = source_prompt_embeds_tuple[0] + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + latents, clean_latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + source_latents = latents + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + generator = extra_step_kwargs.pop("generator", None) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + source_latent_model_input = ( + torch.cat([source_latents] * 2) if do_classifier_free_guidance else source_latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + source_latent_model_input = self.scheduler.scale_model_input(source_latent_model_input, t) + + # predict the noise residual + if do_classifier_free_guidance: + concat_latent_model_input = torch.stack( + [ + source_latent_model_input[0], + latent_model_input[0], + source_latent_model_input[1], + latent_model_input[1], + ], + dim=0, + ) + concat_prompt_embeds = torch.stack( + [ + source_prompt_embeds[0], + prompt_embeds[0], + source_prompt_embeds[1], + prompt_embeds[1], + ], + dim=0, + ) + else: + concat_latent_model_input = torch.cat( + [ + source_latent_model_input, + latent_model_input, + ], + dim=0, + ) + concat_prompt_embeds = torch.cat( + [ + source_prompt_embeds, + prompt_embeds, + ], + dim=0, + ) + + concat_noise_pred = self.unet( + concat_latent_model_input, + t, + cross_attention_kwargs=cross_attention_kwargs, + encoder_hidden_states=concat_prompt_embeds, + ).sample + + # perform guidance + if do_classifier_free_guidance: + ( + source_noise_pred_uncond, + noise_pred_uncond, + source_noise_pred_text, + noise_pred_text, + ) = concat_noise_pred.chunk(4, dim=0) + + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + source_noise_pred = source_noise_pred_uncond + source_guidance_scale * ( + source_noise_pred_text - source_noise_pred_uncond + ) + + else: + (source_noise_pred, noise_pred) = concat_noise_pred.chunk(2, dim=0) + + # Sample source_latents from the posterior distribution. + prev_source_latents = posterior_sample( + self.scheduler, source_latents, t, clean_latents, generator=generator, **extra_step_kwargs + ) + # Compute noise. + noise = compute_noise( + self.scheduler, prev_source_latents, source_latents, t, source_noise_pred, **extra_step_kwargs + ) + source_latents = prev_source_latents + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, variance_noise=noise, **extra_step_kwargs + ).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 9. 
Post-processing + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py new file mode 100644 index 000000000..0aa5e68bf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py @@ -0,0 +1,542 @@ +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTokenizer + +from ....configuration_utils import FrozenDict +from ....schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ....utils import deprecate, logging +from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def preprocess(image): + w, h = image.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + return 2.0 * image - 1.0 + + +def preprocess_mask(mask, scale_factor=8): + mask = mask.convert("L") + w, h = mask.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL.Image.NEAREST) + mask = np.array(mask).astype(np.float32) / 255.0 + mask = np.tile(mask, (4, 1, 1)) + mask = mask[None].transpose(0, 1, 2, 3) # what does this step do? + mask = 1 - mask # repaint white, keep black + return mask + + +class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline): + r""" + Pipeline for text-guided image inpainting using Stable Diffusion. This is a *legacy feature* for Onnx pipelines to + provide compatibility with StableDiffusionInpaintPipelineLegacy and may be removed in the future. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. 
Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + _optional_components = ["safety_checker", "feature_extractor"] + _is_onnx = True + + vae_encoder: OnnxRuntimeModel + vae_decoder: OnnxRuntimeModel + text_encoder: OnnxRuntimeModel + tokenizer: CLIPTokenizer + unet: OnnxRuntimeModel + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler] + safety_checker: OnnxRuntimeModel + feature_extractor: CLIPImageProcessor + + def __init__( + self, + vae_encoder: OnnxRuntimeModel, + vae_decoder: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: CLIPTokenizer, + unet: OnnxRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: OnnxRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: Optional[int], + do_classifier_free_guidance: bool, + negative_prompt: Optional[str], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids + + if not np.array_equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] * batch_size + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + if do_classifier_free_guidance: + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[np.ndarray, PIL.Image.Image] = None, + mask_image: Union[np.ndarray, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[np.random.RandomState] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`np.ndarray` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. This is the image whose masked region will be inpainted. + mask_image (`np.ndarray` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a + PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should + contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter will be modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`.
Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`np.random.RandomState`, *optional*): + A np.random.RandomState to make generation deterministic. + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + # check inputs. Raise error if not correct + self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") + + if generator is None: + generator = np.random + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + if isinstance(image, PIL.Image.Image): + image = preprocess(image) + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance.
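+ # [Editor's note: illustrative sketch only, not part of the upstream diffusers file; names follow the code below.]
+ # Classifier-free guidance blends the unconditional and the text-conditioned noise predictions with the guidance weight w (`guidance_scale`):
+ #     noise_pred = noise_pred_uncond + w * (noise_pred_text - noise_pred_uncond)
+ # e.g. with w = 7.5, noise_pred_uncond = 0.0 and noise_pred_text = 1.0 the result is 0.0 + 7.5 * (1.0 - 0.0) = 7.5,
+ # which is exactly the combination the np.split branch inside the denoising loop below computes.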
+ do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + latents_dtype = prompt_embeds.dtype + image = image.astype(latents_dtype) + + # encode the init image into latents and scale the latents + init_latents = self.vae_encoder(sample=image)[0] + init_latents = 0.18215 * init_latents + + # Expand init_latents for batch_size and num_images_per_prompt + init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) + init_latents_orig = init_latents + + # preprocess mask + if not isinstance(mask_image, np.ndarray): + mask_image = preprocess_mask(mask_image, 8) + mask_image = mask_image.astype(latents_dtype) + mask = np.concatenate([mask_image] * num_images_per_prompt, axis=0) + + # check sizes + if not mask.shape == init_latents.shape: + raise ValueError("The mask and image should be the same size!") + + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + + timesteps = self.scheduler.timesteps.numpy()[-init_timestep] + timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) + + # add noise to latents using the timesteps + noise = generator.randn(*init_latents.shape).astype(latents_dtype) + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) + ) + init_latents = init_latents.numpy() + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η
in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + latents = init_latents + + t_start = max(num_inference_steps - init_timestep + offset, 0) + timesteps = self.scheduler.timesteps[t_start:].numpy() + timestep_dtype = next( + (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" + ) + timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ + 0 + ] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ).prev_sample + + latents = latents.numpy() + + init_latents_proper = self.scheduler.add_noise( + torch.from_numpy(init_latents_orig), torch.from_numpy(noise), torch.from_numpy(np.array([t])) + ) + + init_latents_proper = init_latents_proper.numpy() + + latents = (init_latents_proper * mask) + (latents * (1 - mask)) + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + latents = 1 / 0.18215 * latents + # image = self.vae_decoder(latent_sample=latents)[0] + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor( + self.numpy_to_pil(image), return_tensors="np" + ).pixel_values.astype(image.dtype) + # There will throw an error if use safety_checker batchsize>1 + images, has_nsfw_concept = [], [] + for i in range(image.shape[0]): + image_i, has_nsfw_concept_i = self.safety_checker( + clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] + ) + images.append(image_i) + has_nsfw_concept.append(has_nsfw_concept_i[0]) + image = np.concatenate(images) + else: + has_nsfw_concept = None + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py new file mode 100644 index 000000000..980adf273 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py @@ -0,0 +1,786 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ....configuration_utils import FrozenDict +from ....image_processor import VaeImageProcessor +from ....loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline +from ...stable_diffusion import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) + + +def preprocess_image(image, batch_size): + w, h = image.size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) + image = np.array(image).astype(np.float32) / 255.0 + image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +def preprocess_mask(mask, batch_size, scale_factor=8): + if not isinstance(mask, torch.FloatTensor): + mask = mask.convert("L") + w, h = mask.size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + mask = mask.resize((w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]) + mask = np.array(mask).astype(np.float32) / 255.0 + mask = np.tile(mask, (4, 1, 1)) + mask = np.vstack([mask[None]] * batch_size) + mask = 1 - mask # repaint white, keep black + mask = torch.from_numpy(mask) + return mask + + else: + valid_mask_channel_sizes = [1, 3] + # if mask channel is fourth tensor dimension, permute dimensions to pytorch standard (B, C, H, W) + if mask.shape[3] in valid_mask_channel_sizes: + mask = mask.permute(0, 3, 1, 2) + elif mask.shape[1] not in valid_mask_channel_sizes: + raise ValueError( + f"Mask channel dimension of size in {valid_mask_channel_sizes} should be second or fourth dimension," + f" but received mask of shape {tuple(mask.shape)}" + ) + # (potentially) reduce mask channel dimension from 3 to 1 for broadcasting to latent shape + mask = mask.mean(dim=1, keepdim=True) + h, w = mask.shape[-2:] + h, w = (x - x % 8 for x in (h, w)) # resize to integer multiple of 8 + mask = torch.nn.functional.interpolate(mask, (h // scale_factor, w // scale_factor)) + return mask + + +class 
StableDiffusionInpaintPipelineLegacy( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): + r""" + Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + deprecation_message = ( + f"The class {self.__class__} is deprecated and will be removed in v1.0.0. You can achieve exactly the same functionality" + " by loading your model into `StableDiffusionInpaintPipeline` instead. See https://github.com/huggingface/diffusers/pull/3533" + " for more information." + ) + deprecate("legacy is outdated", "1.0.0", deprecation_message, standard_warn=False) + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might lead to incorrect results" + " in future versions.
If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
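+ # [Editor's note: illustrative sketch only, not part of the upstream diffusers file.]
+ # With `output_hidden_states=True` the text encoder also returns the tuple of per-layer hidden states as the
+ # last element of its output, so
+ #     prompt_embeds[-1][-(clip_skip + 1)]
+ # picks, for clip_skip=1, the pre-final layer's hidden state (hidden_states[-2]); the final LayerNorm is then
+ # applied on the next line because intermediate hidden states, unlike `last_hidden_state`, have not passed through it.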
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, generator): + image = image.to(device=device, dtype=dtype) + init_latent_dist = self.vae.encode(image).latent_dist + init_latents = init_latent_dist.sample(generator=generator) + init_latents = self.vae.config.scaling_factor * init_latents + + # Expand init_latents for batch_size and num_images_per_prompt + init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0) + init_latents_orig = init_latents + + # add noise to latents using the timesteps + noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + return latents, init_latents_orig, noise + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + add_predicted_noise: Optional[bool] = False, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. This is the image whose masked region will be inpainted. + mask_image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a + PIL image, it will be converted to a single channel (luminance) before use. 
If mask is a tensor, the + expected shape should be either `(B, H, W, C)` or `(B, C, H, W)`, where C is 1 or 3. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength` + is 1, the denoising process will be run on the masked area for the full number of iterations specified + in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to + that region the larger the `strength`. If `strength` is 0, no inpainting will occur. + num_inference_steps (`int`, *optional*, defaults to 50): + The reference number of denoising steps. More denoising steps usually lead to a higher quality image at + the expense of slower inference. This parameter will be modulated by `strength`, as explained above. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` + is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + add_predicted_noise (`bool`, *optional*, defaults to False): + Use predicted noise instead of random noise when constructing noisy versions of the original image in + the reverse diffusion process. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called.
If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 1. Check inputs + self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Preprocess image and mask + if not isinstance(image, torch.FloatTensor): + image = preprocess_image(image, batch_size) + + mask_image = preprocess_mask(mask_image, batch_size, self.vae_scale_factor) + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + # encode the init image into latents and scale the latents + latents, init_latents_orig, noise = self.prepare_latents( + image, latent_timestep, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + + # 7. Prepare mask latent + mask = mask_image.to(device=device, dtype=latents.dtype) + mask = torch.cat([mask] * num_images_per_prompt) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + # masking + if add_predicted_noise: + init_latents_proper = self.scheduler.add_noise( + init_latents_orig, noise_pred_uncond, torch.tensor([t]) + ) + else: + init_latents_proper = self.scheduler.add_noise(init_latents_orig, noise, torch.tensor([t])) + + latents = (init_latents_proper * mask) + (latents * (1 - mask)) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # use original latents corresponding to unmasked portions of the image + latents = (init_latents_orig * mask) + (latents * (1 - mask)) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py new file mode 100644 index 000000000..dee93fc2e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py @@ -0,0 +1,824 @@ +# Copyright 2024 TIME Authors and The HuggingFace Team. All rights reserved." +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from ....image_processor import VaeImageProcessor +from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import PNDMScheduler +from ....schedulers.scheduling_utils import SchedulerMixin +from ....utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +AUGS_CONST = ["A photo of ", "An image of ", "A picture of "] + + +class StableDiffusionModelEditingPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): + r""" + Pipeline for text-to-image model editing. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPFeatureExtractor`]): + A `CLIPFeatureExtractor` to extract features from generated images; used as inputs to the `safety_checker`. + with_to_k ([`bool`]): + Whether to edit the key projection matrices along with the value projection matrices. + with_augs ([`list`]): + Textual augmentations to apply while editing the text-to-image model. 
Set to `[]` for no augmentations. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: SchedulerMixin, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + with_to_k: bool = True, + with_augs: list = AUGS_CONST, + ): + super().__init__() + + if isinstance(scheduler, PNDMScheduler): + logger.error("PNDMScheduler for this pipeline is currently not supported.") + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + self.with_to_k = with_to_k + self.with_augs = with_augs + + # get cross-attention layers + ca_layers = [] + + def append_ca(net_): + if net_.__class__.__name__ == "CrossAttention": + ca_layers.append(net_) + elif hasattr(net_, "children"): + for net__ in net_.children(): + append_ca(net__) + + # recursively find all cross-attention layers in unet + for net in self.unet.named_children(): + if "down" in net[0]: + append_ca(net[1]) + elif "up" in net[0]: + append_ca(net[1]) + elif "mid" in net[0]: + append_ca(net[1]) + + # get projection matrices + self.ca_clip_layers = [l for l in ca_layers if l.to_v.in_features == 768] + self.projection_matrices = [l.to_v for l in self.ca_clip_layers] + self.og_matrices = [copy.deepcopy(l.to_v) for l in self.ca_clip_layers] + if self.with_to_k: + self.projection_matrices = self.projection_matrices + [l.to_k for l in self.ca_clip_layers] + self.og_matrices = self.og_matrices + [copy.deepcopy(l.to_k) for l in self.ca_clip_layers] + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = 
"`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def edit_model( + self, + source_prompt: str, + destination_prompt: str, + lamb: float = 0.1, + restart_params: bool = True, + ): + r""" + Apply model editing via closed-form solution (see Eq. 5 in the TIME [paper](https://arxiv.org/abs/2303.08084)). + + Args: + source_prompt (`str`): + The source prompt containing the concept to be edited. + destination_prompt (`str`): + The destination prompt. Must contain all words from `source_prompt` with additional ones to specify the + target edit. + lamb (`float`, *optional*, defaults to 0.1): + The lambda parameter specifying the regularization intensity. Smaller values increase the editing power. + restart_params (`bool`, *optional*, defaults to True): + Restart the model parameters to their pre-trained version before editing. This is done to avoid edit + compounding. When it is `False`, edits accumulate.
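+
+ A sketch of the closed-form update this method implements (the symbols `W`, `k_i`, and `v_i` are shorthand
+ introduced here, mirroring the code below rather than the paper's exact notation): every projection matrix in
+ `self.projection_matrices` is replaced by
+
+ W <- (lamb * W + sum_i v_i k_i^T) @ inverse(lamb * I + sum_i k_i k_i^T)
+
+ where the `k_i` are the text-encoder embeddings of the (augmented) source prompts and the `v_i` are the
+ corresponding projections of the token-aligned destination embeddings.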
+ """ + + # restart LDM parameters + if restart_params: + num_ca_clip_layers = len(self.ca_clip_layers) + for idx_, l in enumerate(self.ca_clip_layers): + l.to_v = copy.deepcopy(self.og_matrices[idx_]) + self.projection_matrices[idx_] = l.to_v + if self.with_to_k: + l.to_k = copy.deepcopy(self.og_matrices[num_ca_clip_layers + idx_]) + self.projection_matrices[num_ca_clip_layers + idx_] = l.to_k + + # set up sentences + old_texts = [source_prompt] + new_texts = [destination_prompt] + # add augmentations + base = old_texts[0] if old_texts[0][0:1] != "A" else "a" + old_texts[0][1:] + for aug in self.with_augs: + old_texts.append(aug + base) + base = new_texts[0] if new_texts[0][0:1] != "A" else "a" + new_texts[0][1:] + for aug in self.with_augs: + new_texts.append(aug + base) + + # prepare input k* and v* + old_embs, new_embs = [], [] + for old_text, new_text in zip(old_texts, new_texts): + text_input = self.tokenizer( + [old_text, new_text], + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0] + old_emb, new_emb = text_embeddings + old_embs.append(old_emb) + new_embs.append(new_emb) + + # identify corresponding destinations for each token in old_emb + idxs_replaces = [] + for old_text, new_text in zip(old_texts, new_texts): + tokens_a = self.tokenizer(old_text).input_ids + tokens_b = self.tokenizer(new_text).input_ids + tokens_a = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == "an" else t for t in tokens_a] + tokens_b = [self.tokenizer.encode("a ")[1] if self.tokenizer.decode(t) == "an" else t for t in tokens_b] + num_orig_tokens = len(tokens_a) + idxs_replace = [] + j = 0 + for i in range(num_orig_tokens): + curr_token = tokens_a[i] + while tokens_b[j] != curr_token: + j += 1 + idxs_replace.append(j) + j += 1 + while j < 77: + idxs_replace.append(j) + j += 1 + while len(idxs_replace) < 77: + idxs_replace.append(76) + idxs_replaces.append(idxs_replace) + + # prepare batch: for each pair of setences, old context and new values + contexts, valuess = [], [] + for old_emb, new_emb, idxs_replace in zip(old_embs, new_embs, idxs_replaces): + context = old_emb.detach() + values = [] + with torch.no_grad(): + for layer in self.projection_matrices: + values.append(layer(new_emb[idxs_replace]).detach()) + contexts.append(context) + valuess.append(values) + + # edit the model + for layer_num in range(len(self.projection_matrices)): + # mat1 = \lambda W + \sum{v k^T} + mat1 = lamb * self.projection_matrices[layer_num].weight + + # mat2 = \lambda I + \sum{k k^T} + mat2 = lamb * torch.eye( + self.projection_matrices[layer_num].weight.shape[1], + device=self.projection_matrices[layer_num].weight.device, + ) + + # aggregate sums for mat1, mat2 + for context, values in zip(contexts, valuess): + context_vector = context.reshape(context.shape[0], context.shape[1], 1) + context_vector_T = context.reshape(context.shape[0], 1, context.shape[1]) + value_vector = values[layer_num].reshape(values[layer_num].shape[0], values[layer_num].shape[1], 1) + for_mat1 = (value_vector @ context_vector_T).sum(dim=0) + for_mat2 = (context_vector @ context_vector_T).sum(dim=0) + mat1 += for_mat1 + mat2 += for_mat2 + + # update projection matrix + self.projection_matrices[layer_num].weight = torch.nn.Parameter(mat1 @ torch.inverse(mat2)) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = 
None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + + Examples: + + ```py + >>> import torch + >>> from diffusers import StableDiffusionModelEditingPipeline + + >>> model_ckpt = "CompVis/stable-diffusion-v1-4" + >>> pipe = StableDiffusionModelEditingPipeline.from_pretrained(model_ckpt) + + >>> pipe = pipe.to("cuda") + + >>> source_prompt = "A pack of roses" + >>> destination_prompt = "A pack of blue roses" + >>> pipe.edit_model(source_prompt, destination_prompt) + + >>> prompt = "A field of roses" + >>> image = pipe(prompt).images[0] + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py new file mode 100644 index 000000000..ddc866ef9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py @@ -0,0 +1,786 @@ +# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ....image_processor import VaeImageProcessor +from ....loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import DDPMParallelScheduler + >>> from diffusers import StableDiffusionParadigmsPipeline + + >>> scheduler = DDPMParallelScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler") + + >>> pipe = StableDiffusionParadigmsPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", scheduler=scheduler, torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> ngpu, batch_per_device = torch.cuda.device_count(), 5 + >>> pipe.wrapped_unet = torch.nn.DataParallel(pipe.unet, device_ids=[d for d in range(ngpu)]) + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt, parallel=ngpu * batch_per_device, num_inference_steps=1000).images[0] + ``` +""" + + +class StableDiffusionParadigmsPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): + r""" + Pipeline for text-to-image generation using a parallelized version of Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). 
+ tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # attribute to wrap the unet with torch.nn.DataParallel when running multiple denoising steps on multiple GPUs + self.wrapped_unet = self.unet + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. 
Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _cumsum(self, input, dim, debug=False): + if debug: + # cumsum_cuda_kernel does not have a deterministic implementation + # so perform cumsum on cpu for debugging purposes + return torch.cumsum(input.cpu().float(), dim=dim).to(input.device) + else: + return torch.cumsum(input, dim=dim) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + parallel: int = 10, + tolerance: float = 0.1, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + debug: bool = False, + clip_skip: int = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + parallel (`int`, *optional*, defaults to 10): + The batch size to use when doing parallel sampling. More parallelism may lead to faster inference but + requires higher memory usage and can also require more total FLOPs. + tolerance (`float`, *optional*, defaults to 0.1): + The error tolerance for determining when to slide the batch window forward for parallel sampling. Lower + tolerance usually leads to less or no degradation. Higher tolerance is faster but can risk degradation + of sample quality. The tolerance is specified as a ratio of the scheduler's noise magnitude. 
+ guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + debug (`bool`, *optional*, defaults to `False`): + Whether or not to run in debug mode. In debug mode, `torch.cumsum` is evaluated using the CPU. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
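+
+ As a rough guide to how `parallel` and `tolerance` interact (a summary of the scheduling logic implemented
+ below): `parallel` is clamped to the number of scheduler timesteps, and the sliding batch window only advances
+ past a timestep once its accumulated drift, normalized by the scheduler's per-step noise variance, stays within
+ `tolerance**2`. Larger tolerances therefore trade potential sample quality for fewer sequential denoising
+ rounds.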
+ Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + extra_step_kwargs.pop("generator", None) + + # # 7. Denoising loop + scheduler = self.scheduler + parallel = min(parallel, len(scheduler.timesteps)) + + begin_idx = 0 + end_idx = parallel + latents_time_evolution_buffer = torch.stack([latents] * (len(scheduler.timesteps) + 1)) + + # We must make sure the noise of stochastic schedulers such as DDPM is sampled only once per timestep. + # Sampling inside the parallel denoising loop will mess this up, so we pre-sample the noise vectors outside the denoising loop. + noise_array = torch.zeros_like(latents_time_evolution_buffer) + for j in range(len(scheduler.timesteps)): + base_noise = randn_tensor( + shape=latents.shape, generator=generator, device=latents.device, dtype=prompt_embeds.dtype + ) + noise = (self.scheduler._get_variance(scheduler.timesteps[j]) ** 0.5) * base_noise + noise_array[j] = noise.clone() + + # We specify the error tolerance as a ratio of the scheduler's noise magnitude. We similarly compute the error tolerance + # outside of the denoising loop to avoid recomputing it at every step. 
+ # We will be dividing the norm of the noise, so we store its inverse here to avoid a division at every step. + inverse_variance_norm = 1.0 / torch.tensor( + [scheduler._get_variance(scheduler.timesteps[j]) for j in range(len(scheduler.timesteps))] + [0] + ).to(noise_array.device) + latent_dim = noise_array[0, 0].numel() + inverse_variance_norm = inverse_variance_norm[:, None] / latent_dim + + scaled_tolerance = tolerance**2 + + with self.progress_bar(total=num_inference_steps) as progress_bar: + steps = 0 + while begin_idx < len(scheduler.timesteps): + # these have shape (parallel_dim, 2*batch_size, ...) + # parallel_len is at most parallel, but could be less if we are at the end of the timesteps + # we are processing batch window of timesteps spanning [begin_idx, end_idx) + parallel_len = end_idx - begin_idx + + block_prompt_embeds = torch.stack([prompt_embeds] * parallel_len) + block_latents = latents_time_evolution_buffer[begin_idx:end_idx] + block_t = scheduler.timesteps[begin_idx:end_idx, None].repeat(1, batch_size * num_images_per_prompt) + t_vec = block_t + if do_classifier_free_guidance: + t_vec = t_vec.repeat(1, 2) + + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([block_latents] * 2, dim=1) if do_classifier_free_guidance else block_latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t_vec) + + # if parallel_len is small, no need to use multiple GPUs + net = self.wrapped_unet if parallel_len > 3 else self.unet + # predict the noise residual, shape is now [parallel_len * 2 * batch_size * num_images_per_prompt, ...] + model_output = net( + latent_model_input.flatten(0, 1), + t_vec.flatten(0, 1), + encoder_hidden_states=block_prompt_embeds.flatten(0, 1), + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + per_latent_shape = model_output.shape[1:] + if do_classifier_free_guidance: + model_output = model_output.reshape( + parallel_len, 2, batch_size * num_images_per_prompt, *per_latent_shape + ) + noise_pred_uncond, noise_pred_text = model_output[:, 0], model_output[:, 1] + model_output = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + model_output = model_output.reshape( + parallel_len * batch_size * num_images_per_prompt, *per_latent_shape + ) + + block_latents_denoise = scheduler.batch_step_no_noise( + model_output=model_output, + timesteps=block_t.flatten(0, 1), + sample=block_latents.flatten(0, 1), + **extra_step_kwargs, + ).reshape(block_latents.shape) + + # back to shape (parallel_dim, batch_size, ...) + # now we want to add the pre-sampled noise + # parallel sampling algorithm requires computing the cumulative drift from the beginning + # of the window, so we need to compute cumulative sum of the deltas and the pre-sampled noises. 
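The `flatten(0, 1)` / `reshape` pair in the loop above is the mechanism that turns time-parallelism into ordinary batching: a window of `parallel_len` timesteps, each carrying its own latent batch, is collapsed into one large batch for a single model call and unpacked afterwards. A shape-only sketch of that round trip, with a multiplication standing in for the UNet:

```py
import torch

parallel_len, batch, ch, h, w = 3, 2, 4, 8, 8
block_latents = torch.randn(parallel_len, batch, ch, h, w)
block_t = torch.arange(parallel_len)[:, None].repeat(1, batch)   # (parallel_len, batch)

flat_latents = block_latents.flatten(0, 1)   # (parallel_len * batch, ch, h, w)
flat_t = block_t.flatten(0, 1)               # (parallel_len * batch,)

flat_out = flat_latents * 0.5                # stand-in for the single model call
out = flat_out.reshape(parallel_len, batch, ch, h, w)
assert out.shape == block_latents.shape
```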
+ delta = block_latents_denoise - block_latents + cumulative_delta = self._cumsum(delta, dim=0, debug=debug) + cumulative_noise = self._cumsum(noise_array[begin_idx:end_idx], dim=0, debug=debug) + + # if we are using an ODE-like scheduler (like DDIM), we don't want to add noise + if scheduler._is_ode_scheduler: + cumulative_noise = 0 + + block_latents_new = ( + latents_time_evolution_buffer[begin_idx][None,] + cumulative_delta + cumulative_noise + ) + cur_error = torch.linalg.norm( + (block_latents_new - latents_time_evolution_buffer[begin_idx + 1 : end_idx + 1]).reshape( + parallel_len, batch_size * num_images_per_prompt, -1 + ), + dim=-1, + ).pow(2) + error_ratio = cur_error * inverse_variance_norm[begin_idx + 1 : end_idx + 1] + + # find the first index of the vector error_ratio that is greater than error tolerance + # we can shift the window for the next iteration up to this index + error_ratio = torch.nn.functional.pad( + error_ratio, (0, 0, 0, 1), value=1e9 + ) # handle the case when everything is below ratio, by padding the end of parallel_len dimension + any_error_at_time = torch.max(error_ratio > scaled_tolerance, dim=1).values.int() + ind = torch.argmax(any_error_at_time).item() + + # compute the new begin and end idxs for the window + new_begin_idx = begin_idx + min(1 + ind, parallel) + new_end_idx = min(new_begin_idx + parallel, len(scheduler.timesteps)) + + # store the computed latents for the current window in the global buffer + latents_time_evolution_buffer[begin_idx + 1 : end_idx + 1] = block_latents_new + # initialize the new sliding window latents with the end of the current window, + # should be better than random initialization + latents_time_evolution_buffer[end_idx : new_end_idx + 1] = latents_time_evolution_buffer[end_idx][ + None, + ] + + steps += 1 + + progress_bar.update(new_begin_idx - begin_idx) + if callback is not None and steps % callback_steps == 0: + callback(begin_idx, block_t[begin_idx], latents_time_evolution_buffer[begin_idx]) + + begin_idx = new_begin_idx + end_idx = new_end_idx + + latents = latents_time_evolution_buffer[-1] + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py new file mode 100644 index 000000000..c819e5728 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py @@ -0,0 +1,1304 @@ +# Copyright 2024 Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved. 
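Before moving on to the pix2pix-zero pipeline that starts here, the window-advance rule from the parallel denoising loop above is worth isolating: per-step error ratios are compared against the squared tolerance, and the window start jumps past the longest accepted prefix. A self-contained sketch of just that bookkeeping, using made-up error values rather than real model outputs:

```py
import torch

def advance_window(error_ratio, begin_idx, parallel, num_timesteps, tolerance):
    # error_ratio: (window_len, batch), error relative to the scheduler noise magnitude
    scaled_tolerance = tolerance**2
    # pad with a huge value so argmax always finds an index, even when every step passes
    padded = torch.nn.functional.pad(error_ratio, (0, 0, 0, 1), value=1e9)
    any_error_at_time = torch.max(padded > scaled_tolerance, dim=1).values.int()
    first_failing = torch.argmax(any_error_at_time).item()
    new_begin_idx = begin_idx + min(1 + first_failing, parallel)
    new_end_idx = min(new_begin_idx + parallel, num_timesteps)
    return new_begin_idx, new_end_idx

# every step in the window is within tolerance -> the window advances by the full `parallel`
ratios = torch.tensor([[0.001], [0.002], [0.003]])
print(advance_window(ratios, begin_idx=0, parallel=3, num_timesteps=50, tolerance=0.1))  # (3, 6)
```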
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import ( + BlipForConditionalGeneration, + BlipProcessor, + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, +) + +from ....image_processor import PipelineImageInput, VaeImageProcessor +from ....loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ....models import AutoencoderKL, UNet2DConditionModel +from ....models.attention_processor import Attention +from ....models.lora import adjust_lora_scale_text_encoder +from ....schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler +from ....schedulers.scheduling_ddim_inverse import DDIMInverseScheduler +from ....utils import ( + PIL_INTERPOLATION, + USE_PEFT_BACKEND, + BaseOutput, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ...stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from ...stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin): + """ + Output class for Stable Diffusion pipelines. + + Args: + latents (`torch.FloatTensor`) + inverted latents tensor + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + """ + + latents: torch.FloatTensor + images: Union[List[PIL.Image.Image], np.ndarray] + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import requests + >>> import torch + + >>> from diffusers import DDIMScheduler, StableDiffusionPix2PixZeroPipeline + + + >>> def download(embedding_url, local_filepath): + ... r = requests.get(embedding_url) + ... with open(local_filepath, "wb") as f: + ... f.write(r.content) + + + >>> model_ckpt = "CompVis/stable-diffusion-v1-4" + >>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained(model_ckpt, torch_dtype=torch.float16) + >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.to("cuda") + + >>> prompt = "a high resolution painting of a cat in the style of van gough" + >>> source_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/cat.pt" + >>> target_emb_url = "https://hf.co/datasets/sayakpaul/sample-datasets/resolve/main/dog.pt" + + >>> for url in [source_emb_url, target_emb_url]: + ... 
download(url, url.split("/")[-1]) + + >>> src_embeds = torch.load(source_emb_url.split("/")[-1]) + >>> target_embeds = torch.load(target_emb_url.split("/")[-1]) + >>> images = pipeline( + ... prompt, + ... source_embeds=src_embeds, + ... target_embeds=target_embeds, + ... num_inference_steps=50, + ... cross_attention_guidance_amount=0.15, + ... ).images + + >>> images[0].save("edited_image_dog.png") + ``` +""" + +EXAMPLE_INVERT_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from transformers import BlipForConditionalGeneration, BlipProcessor + >>> from diffusers import DDIMScheduler, DDIMInverseScheduler, StableDiffusionPix2PixZeroPipeline + + >>> import requests + >>> from PIL import Image + + >>> captioner_id = "Salesforce/blip-image-captioning-base" + >>> processor = BlipProcessor.from_pretrained(captioner_id) + >>> model = BlipForConditionalGeneration.from_pretrained( + ... captioner_id, torch_dtype=torch.float16, low_cpu_mem_usage=True + ... ) + + >>> sd_model_ckpt = "CompVis/stable-diffusion-v1-4" + >>> pipeline = StableDiffusionPix2PixZeroPipeline.from_pretrained( + ... sd_model_ckpt, + ... caption_generator=model, + ... caption_processor=processor, + ... torch_dtype=torch.float16, + ... safety_checker=None, + ... ) + + >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.enable_model_cpu_offload() + + >>> img_url = "https://github.com/pix2pixzero/pix2pix-zero/raw/main/assets/test_images/cats/cat_6.png" + + >>> raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB").resize((512, 512)) + >>> # generate caption + >>> caption = pipeline.generate_caption(raw_image) + + >>> # "a photography of a cat with flowers and dai dai daie - daie - daie kasaii" + >>> inv_latents = pipeline.invert(caption, image=raw_image).latents + >>> # we need to generate source and target embeds + + >>> source_prompts = ["a cat sitting on the street", "a cat playing in the field", "a face of a cat"] + + >>> target_prompts = ["a dog sitting on the street", "a dog playing in the field", "a face of a dog"] + + >>> source_embeds = pipeline.get_embeds(source_prompts) + >>> target_embeds = pipeline.get_embeds(target_prompts) + >>> # the latents can then be used to edit a real image + >>> # when using Stable Diffusion 2 or other models that use v-prediction + >>> # set `cross_attention_guidance_amount` to 0.01 or less to avoid input latent gradient explosion + + >>> image = pipeline( + ... caption, + ... source_embeds=source_embeds, + ... target_embeds=target_embeds, + ... num_inference_steps=50, + ... cross_attention_guidance_amount=0.15, + ... generator=generator, + ... latents=inv_latents, + ... negative_prompt=caption, + ... ).images[0] + >>> image.save("edited_image.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) 
instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +def prepare_unet(unet: UNet2DConditionModel): + """Modifies the UNet (`unet`) to perform Pix2Pix Zero optimizations.""" + pix2pix_zero_attn_procs = {} + for name in unet.attn_processors.keys(): + module_name = name.replace(".processor", "") + module = unet.get_submodule(module_name) + if "attn2" in name: + pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=True) + module.requires_grad_(True) + else: + pix2pix_zero_attn_procs[name] = Pix2PixZeroAttnProcessor(is_pix2pix_zero=False) + module.requires_grad_(False) + + unet.set_attn_processor(pix2pix_zero_attn_procs) + return unet + + +class Pix2PixZeroL2Loss: + def __init__(self): + self.loss = 0.0 + + def compute_loss(self, predictions, targets): + self.loss += ((predictions - targets) ** 2).sum((1, 2)).mean(0) + + +class Pix2PixZeroAttnProcessor: + """An attention processor class to store the attention weights. + In Pix2Pix Zero, it happens during computations in the cross-attention blocks.""" + + def __init__(self, is_pix2pix_zero=False): + self.is_pix2pix_zero = is_pix2pix_zero + if self.is_pix2pix_zero: + self.reference_cross_attn_map = {} + + def __call__( + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + timestep=None, + loss=None, + ): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + if self.is_pix2pix_zero and timestep is not None: + # new bookkeeping to save the attention weights. + if loss is None: + self.reference_cross_attn_map[timestep.item()] = attention_probs.detach().cpu() + # compute loss + elif loss is not None: + prev_attn_probs = self.reference_cross_attn_map.pop(timestep.item()) + loss.compute_loss(attention_probs, prev_attn_probs.to(attention_probs.device)) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class StableDiffusionPix2PixZeroPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for pixel-level image editing using Pix2Pix Zero. Based on Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], or [`DDPMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + requires_safety_checker (bool): + Whether the pipeline requires a safety checker. We recommend setting it to True if you're using the + pipeline publicly. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = [ + "safety_checker", + "feature_extractor", + "caption_generator", + "caption_processor", + "inverse_scheduler", + ] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDPMScheduler, DDIMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler], + feature_extractor: CLIPImageProcessor, + safety_checker: StableDiffusionSafetyChecker, + inverse_scheduler: DDIMInverseScheduler, + caption_generator: BlipForConditionalGeneration, + caption_processor: BlipProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + caption_processor=caption_processor, + caption_generator=caption_generator, + inverse_scheduler=inverse_scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + source_embeds, + target_embeds, + callback_steps, + prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if source_embeds is None and target_embeds is None: + raise ValueError("`source_embeds` and `target_embeds` cannot be undefined.") + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def generate_caption(self, images): + """Generates caption for a given image.""" + text = "a photography of" + + prev_device = self.caption_generator.device + + device = self._execution_device + inputs = self.caption_processor(images, text, return_tensors="pt").to( + device=device, dtype=self.caption_generator.dtype + ) + self.caption_generator.to(device) + outputs = self.caption_generator.generate(**inputs, max_new_tokens=128) + + # offload caption generator + self.caption_generator.to(prev_device) + + caption = self.caption_processor.batch_decode(outputs, skip_special_tokens=True)[0] + return caption + + def construct_direction(self, embs_source: torch.Tensor, embs_target: torch.Tensor): + """Constructs the edit direction to steer the image generation process semantically.""" + return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0) + + @torch.no_grad() + def get_embeds(self, prompt: List[str], batch_size: int = 16) -> torch.FloatTensor: + num_prompts = len(prompt) + embeds = [] + for i in range(0, num_prompts, batch_size): + prompt_slice = prompt[i : i + batch_size] + + input_ids = self.tokenizer( + prompt_slice, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ).input_ids + + input_ids = input_ids.to(self.text_encoder.device) + embeds.append(self.text_encoder(input_ids)[0]) + + return torch.cat(embeds, dim=0).mean(0)[None] + + def prepare_image_latents(self, image, batch_size, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if isinstance(generator, list): + latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0) + else: + latents = self.vae.encode(image).latent_dist.sample(generator) + + latents = self.vae.config.scaling_factor * latents + + if batch_size != latents.shape[0]: + if batch_size % latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_latents_per_image = batch_size // latents.shape[0] + latents = torch.cat([latents] * additional_latents_per_image, dim=0) + else: + raise ValueError( + f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." + ) + else: + latents = torch.cat([latents], dim=0) + + return latents + + def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int): + pred_type = self.inverse_scheduler.config.prediction_type + alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep] + + beta_prod_t = 1 - alpha_prod_t + + if pred_type == "epsilon": + return model_output + elif pred_type == "sample": + return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5) + elif pred_type == "v_prediction": + return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`" + ) + + def auto_corr_loss(self, hidden_states, generator=None): + reg_loss = 0.0 + for i in range(hidden_states.shape[0]): + for j in range(hidden_states.shape[1]): + noise = hidden_states[i : i + 1, j : j + 1, :, :] + while True: + roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item() + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2 + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2 + + if noise.shape[2] <= 8: + break + noise = F.avg_pool2d(noise, kernel_size=2) + return reg_loss + + def kl_divergence(self, hidden_states): + mean = hidden_states.mean() + var = hidden_states.var() + return var + mean**2 - 1 - torch.log(var + 1e-7) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + source_embeds: torch.Tensor = None, + target_embeds: torch.Tensor = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + cross_attention_guidance_amount: float = 0.1, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + source_embeds (`torch.Tensor`): + Source concept embeddings. Generation of the embeddings as per the [original + paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction. + target_embeds (`torch.Tensor`): + Target concept embeddings. Generation of the embeddings as per the [original + paper](https://arxiv.org/abs/2302.03027). Used in discovering the edit direction. 
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + cross_attention_guidance_amount (`float`, defaults to 0.1): + Amount of guidance needed from the reference cross-attention maps. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. 
+ clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Define the spatial resolutions. + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + source_embeds, + target_embeds, + callback_steps, + prompt_embeds, + ) + + # 3. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Generate the inverted noise from the input image or any other image + # generated from the input prompt. + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + latents_init = latents.clone() + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. Rejig the UNet so that we can obtain the cross-attenion maps and + # use them for guiding the subsequent image generation. + self.unet = prepare_unet(self.unet) + + # 7. Denoising loop where we obtain the cross-attention maps. 
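For orientation before the two denoising loops that follow: the edit direction applied in the second loop comes from `construct_direction` defined earlier, which is just the difference of the mean source and target embeddings with a broadcast dimension added, and it is applied only to the conditional half of the classifier-free-guidance prompt embeddings. A toy sketch with assumed CLIP-like shapes (77 tokens, 768 dims are illustrative, not taken from this patch):

```py
import torch

def construct_direction(embs_source, embs_target):
    # same logic as the pipeline method: mean over prompts, keep a broadcast dim
    return (embs_target.mean(0) - embs_source.mean(0)).unsqueeze(0)

source_embeds = torch.randn(3, 77, 768)   # e.g. embeddings of three "cat" prompts
target_embeds = torch.randn(3, 77, 768)   # e.g. embeddings of three "dog" prompts
direction = construct_direction(source_embeds, target_embeds)
print(direction.shape)                    # torch.Size([1, 77, 768])

prompt_embeds = torch.randn(2, 77, 768)   # [uncond, cond] after the CFG concat
prompt_embeds_edit = prompt_embeds.clone()
prompt_embeds_edit[1:2] += direction      # steer only the conditional embeddings
```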
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs={"timestep": t}, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Compute the edit directions. + edit_direction = self.construct_direction(source_embeds, target_embeds).to(prompt_embeds.device) + + # 9. Edit the prompt embeddings as per the edit directions discovered. + prompt_embeds_edit = prompt_embeds.clone() + prompt_embeds_edit[1:2] += edit_direction + + # 10. Second denoising loop to generate the edited image. + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + latents = latents_init + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # we want to learn the latent such that it steers the generation + # process towards the edited direction, so make the make initial + # noise learnable + x_in = latent_model_input.detach().clone() + x_in.requires_grad = True + + # optimizer + opt = torch.optim.SGD([x_in], lr=cross_attention_guidance_amount) + + with torch.enable_grad(): + # initialize loss + loss = Pix2PixZeroL2Loss() + + # predict the noise residual + noise_pred = self.unet( + x_in, + t, + encoder_hidden_states=prompt_embeds_edit.detach(), + cross_attention_kwargs={"timestep": t, "loss": loss}, + ).sample + + loss.loss.backward(retain_graph=False) + opt.step() + + # recompute the noise + noise_pred = self.unet( + x_in.detach(), + t, + encoder_hidden_states=prompt_embeds_edit, + cross_attention_kwargs={"timestep": None}, + ).sample + + latents = x_in.detach().chunk(2)[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if not 
output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING) + def invert( + self, + prompt: Optional[str] = None, + image: PipelineImageInput = None, + num_inference_steps: int = 50, + guidance_scale: float = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + cross_attention_guidance_amount: float = 0.1, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + lambda_auto_corr: float = 20.0, + lambda_kl: float = 20.0, + num_reg_steps: int = 5, + num_auto_corr_rolls: int = 5, + ): + r""" + Function used to generate inverted latents given a prompt and image. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch which will be used for conditioning. Can also accept + image latents as `image`, if passing latents directly, it will not be encoded again. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 1): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not + provided, text embeddings will be generated from `prompt` input argument. + cross_attention_guidance_amount (`float`, defaults to 0.1): + Amount of guidance needed from the reference cross-attention maps. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + lambda_auto_corr (`float`, *optional*, defaults to 20.0): + Lambda parameter to control auto correction + lambda_kl (`float`, *optional*, defaults to 20.0): + Lambda parameter to control Kullback–Leibler divergence output + num_reg_steps (`int`, *optional*, defaults to 5): + Number of regularization loss steps + num_auto_corr_rolls (`int`, *optional*, defaults to 5): + Number of auto correction roll steps + + Examples: + + Returns: + [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] or + `tuple`: + [`~pipelines.stable_diffusion.pipeline_stable_diffusion_pix2pix_zero.Pix2PixInversionPipelineOutput`] if + `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is the inverted + latents tensor and then second is the corresponding decoded image. + """ + # 1. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Preprocess image + image = self.image_processor.preprocess(image) + + # 4. Prepare latent variables + latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, device, generator) + + # 5. Encode input prompt + num_images_per_prompt = 1 + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + prompt_embeds=prompt_embeds, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.inverse_scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.inverse_scheduler.timesteps + + # 6. Rejig the UNet so that we can obtain the cross-attenion maps and + # use them for guiding the subsequent image generation. + self.unet = prepare_unet(self.unet) + + # 7. 
Denoising loop where we obtain the cross-attention maps. + num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs={"timestep": t}, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # regularization of the noise prediction + with torch.enable_grad(): + for _ in range(num_reg_steps): + if lambda_auto_corr > 0: + for _ in range(num_auto_corr_rolls): + var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True) + + # Derive epsilon from model output before regularizing to IID standard normal + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) + + l_ac = self.auto_corr_loss(var_epsilon, generator=generator) + l_ac.backward() + + grad = var.grad.detach() / num_auto_corr_rolls + noise_pred = noise_pred - lambda_auto_corr * grad + + if lambda_kl > 0: + var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True) + + # Derive epsilon from model output before regularizing to IID standard normal + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) + + l_kld = self.kl_divergence(var_epsilon) + l_kld.backward() + + grad = var.grad.detach() + noise_pred = noise_pred - lambda_kl * grad + + noise_pred = noise_pred.detach() + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + inverted_latents = latents.detach().clone() + + # 8. 
Post-processing + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (inverted_latents, image) + + return Pix2PixInversionPipelineOutput(latents=inverted_latents, images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py new file mode 100644 index 000000000..15c9a8c27 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from ....utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_stochastic_karras_ve": ["KarrasVePipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_stochastic_karras_ve import KarrasVePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py new file mode 100644 index 000000000..023edb4ce --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py @@ -0,0 +1,128 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Union + +import torch + +from ....models import UNet2DModel +from ....schedulers import KarrasVeScheduler +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class KarrasVePipeline(DiffusionPipeline): + r""" + Pipeline for unconditional image generation. + + Parameters: + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image. + scheduler ([`KarrasVeScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image. 
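+
+    Example (an illustrative usage sketch, not part of the upstream docstring; the checkpoint id
+    below is a placeholder, not a reference to a real repository):
+
+        ```py
+        >>> from diffusers import KarrasVePipeline
+
+        >>> pipe = KarrasVePipeline.from_pretrained("<path-or-hub-id-of-a-karras-ve-checkpoint>")
+        >>> image = pipe(batch_size=1, num_inference_steps=50).images[0]
+        >>> image.save("karras_ve_sample.png")
+        ```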
+ """ + + # add type hints for linting + unet: UNet2DModel + scheduler: KarrasVeScheduler + + def __init__(self, unet: UNet2DModel, scheduler: KarrasVeScheduler): + super().__init__() + self.register_modules(unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + batch_size: int = 1, + num_inference_steps: int = 50, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: + r""" + The call function to the pipeline for generation. + + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of images to generate. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. + + Example: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + + img_size = self.unet.config.sample_size + shape = (batch_size, 3, img_size, img_size) + + model = self.unet + + # sample x_0 ~ N(0, sigma_0^2 * I) + sample = randn_tensor(shape, generator=generator, device=self.device) * self.scheduler.init_noise_sigma + + self.scheduler.set_timesteps(num_inference_steps) + + for t in self.progress_bar(self.scheduler.timesteps): + # here sigma_t == t_i from the paper + sigma = self.scheduler.schedule[t] + sigma_prev = self.scheduler.schedule[t - 1] if t > 0 else 0 + + # 1. Select temporarily increased noise level sigma_hat + # 2. Add new noise to move from sample_i to sample_hat + sample_hat, sigma_hat = self.scheduler.add_noise_to_input(sample, sigma, generator=generator) + + # 3. Predict the noise residual given the noise magnitude `sigma_hat` + # The model inputs and output are adjusted by following eq. (213) in [1]. + model_output = (sigma_hat / 2) * model((sample_hat + 1) / 2, sigma_hat / 2).sample + + # 4. Evaluate dx/dt at sigma_hat + # 5. Take Euler step from sigma to sigma_prev + step_output = self.scheduler.step(model_output, sigma_hat, sigma_prev, sample_hat) + + if sigma_prev != 0: + # 6. Apply 2nd order correction + # The model inputs and output are adjusted by following eq. (213) in [1]. 
+ model_output = (sigma_prev / 2) * model((step_output.prev_sample + 1) / 2, sigma_prev / 2).sample + step_output = self.scheduler.step_correct( + model_output, + sigma_hat, + sigma_prev, + sample_hat, + step_output.prev_sample, + step_output["derivative"], + ) + sample = step_output.prev_sample + + sample = (sample / 2 + 0.5).clamp(0, 1) + image = sample.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py new file mode 100644 index 000000000..8ea6ef6e2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py @@ -0,0 +1,71 @@ +from typing import TYPE_CHECKING + +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) + + _dummy_objects.update( + { + "VersatileDiffusionDualGuidedPipeline": VersatileDiffusionDualGuidedPipeline, + "VersatileDiffusionImageVariationPipeline": VersatileDiffusionImageVariationPipeline, + "VersatileDiffusionPipeline": VersatileDiffusionPipeline, + "VersatileDiffusionTextToImagePipeline": VersatileDiffusionTextToImagePipeline, + } + ) +else: + _import_structure["modeling_text_unet"] = ["UNetFlatConditionModel"] + _import_structure["pipeline_versatile_diffusion"] = ["VersatileDiffusionPipeline"] + _import_structure["pipeline_versatile_diffusion_dual_guided"] = ["VersatileDiffusionDualGuidedPipeline"] + _import_structure["pipeline_versatile_diffusion_image_variation"] = ["VersatileDiffusionImageVariationPipeline"] + _import_structure["pipeline_versatile_diffusion_text_to_image"] = ["VersatileDiffusionTextToImagePipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import ( + VersatileDiffusionDualGuidedPipeline, + VersatileDiffusionImageVariationPipeline, + VersatileDiffusionPipeline, + VersatileDiffusionTextToImagePipeline, + ) + else: + from .pipeline_versatile_diffusion import VersatileDiffusionPipeline + from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline + from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline + from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + 
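+    # If the optional `torch`/`transformers` requirements were not met, the placeholder
+    # classes collected in `_dummy_objects` above are attached to the lazy module below,
+    # so these imports still resolve and only raise an informative error when the
+    # corresponding pipelines are actually used.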
for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py new file mode 100644 index 000000000..62a3a8728 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py @@ -0,0 +1,2508 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from diffusers.utils import deprecate + +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin +from ....models.activations import get_activation +from ....models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + Attention, + AttentionProcessor, + AttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + AttnProcessor, +) +from ....models.embeddings import ( + GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from ....models.resnet import ResnetBlockCondNorm2D +from ....models.transformers.dual_transformer_2d import DualTransformer2DModel +from ....models.transformers.transformer_2d import Transformer2DModel +from ....models.unets.unet_2d_condition import UNet2DConditionOutput +from ....utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers +from ....utils.torch_utils import apply_freeu + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_down_block( + down_block_type, + num_layers, + in_channels, + out_channels, + temb_channels, + add_downsample, + resnet_eps, + resnet_act_fn, + num_attention_heads, + transformer_layers_per_block, + attention_type, + attention_head_dim, + resnet_groups=None, + cross_attention_dim=None, + downsample_padding=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + resnet_out_scale_factor=1.0, + cross_attention_norm=None, + dropout=0.0, +): + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type + if down_block_type == "DownBlockFlat": + return DownBlockFlat( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif down_block_type == "CrossAttnDownBlockFlat": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockFlat") + return CrossAttnDownBlockFlat( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + dropout=dropout, + add_downsample=add_downsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + downsample_padding=downsample_padding, + 
cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise ValueError(f"{down_block_type} is not supported.") + + +def get_up_block( + up_block_type, + num_layers, + in_channels, + out_channels, + prev_output_channel, + temb_channels, + add_upsample, + resnet_eps, + resnet_act_fn, + num_attention_heads, + transformer_layers_per_block, + resolution_idx, + attention_type, + attention_head_dim, + resnet_groups=None, + cross_attention_dim=None, + dual_cross_attention=False, + use_linear_projection=False, + only_cross_attention=False, + upcast_attention=False, + resnet_time_scale_shift="default", + resnet_skip_time_act=False, + resnet_out_scale_factor=1.0, + cross_attention_norm=None, + dropout=0.0, +): + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + if up_block_type == "UpBlockFlat": + return UpBlockFlat( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + elif up_block_type == "CrossAttnUpBlockFlat": + if cross_attention_dim is None: + raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockFlat") + return CrossAttnUpBlockFlat( + num_layers=num_layers, + in_channels=in_channels, + out_channels=out_channels, + prev_output_channel=prev_output_channel, + temb_channels=temb_channels, + dropout=dropout, + add_upsample=add_upsample, + resnet_eps=resnet_eps, + resnet_act_fn=resnet_act_fn, + resnet_groups=resnet_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + raise ValueError(f"{up_block_type} is not supported.") + + +class FourierEmbedder(nn.Module): + def __init__(self, num_freqs=64, temperature=100): + super().__init__() + + self.num_freqs = num_freqs + self.temperature = temperature + + freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs) + freq_bands = freq_bands[None, None, None] + self.register_buffer("freq_bands", freq_bands, persistent=False) + + def __call__(self, x): + x = self.freq_bands * x.unsqueeze(-1) + return torch.stack((x.sin(), x.cos()), dim=-1).permute(0, 1, 3, 4, 2).reshape(*x.shape[:2], -1) + + +class GLIGENTextBoundingboxProjection(nn.Module): + def __init__(self, positive_len, out_dim, feature_type, fourier_freqs=8): + super().__init__() + self.positive_len = positive_len + self.out_dim = out_dim + + self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs) + self.position_dim = fourier_freqs * 2 * 4 # 2: sin/cos, 4: xyxy + + if isinstance(out_dim, tuple): + out_dim = out_dim[0] + + if feature_type == "text-only": + self.linears = nn.Sequential( + nn.Linear(self.positive_len + self.position_dim, 512), + nn.SiLU(), + nn.Linear(512, 512), + nn.SiLU(), + nn.Linear(512, out_dim), + ) + self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) + + elif feature_type == "text-image": + self.linears_text = nn.Sequential( + 
nn.Linear(self.positive_len + self.position_dim, 512), + nn.SiLU(), + nn.Linear(512, 512), + nn.SiLU(), + nn.Linear(512, out_dim), + ) + self.linears_image = nn.Sequential( + nn.Linear(self.positive_len + self.position_dim, 512), + nn.SiLU(), + nn.Linear(512, 512), + nn.SiLU(), + nn.Linear(512, out_dim), + ) + self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) + self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len])) + + self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim])) + + def forward( + self, + boxes, + masks, + positive_embeddings=None, + phrases_masks=None, + image_masks=None, + phrases_embeddings=None, + image_embeddings=None, + ): + masks = masks.unsqueeze(-1) + + xyxy_embedding = self.fourier_embedder(boxes) + xyxy_null = self.null_position_feature.view(1, 1, -1) + xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null + + if positive_embeddings: + positive_null = self.null_positive_feature.view(1, 1, -1) + positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null + + objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1)) + else: + phrases_masks = phrases_masks.unsqueeze(-1) + image_masks = image_masks.unsqueeze(-1) + + text_null = self.null_text_feature.view(1, 1, -1) + image_null = self.null_image_feature.view(1, 1, -1) + + phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null + image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null + + objs_text = self.linears_text(torch.cat([phrases_embeddings, xyxy_embedding], dim=-1)) + objs_image = self.linears_image(torch.cat([image_embeddings, xyxy_embedding], dim=-1)) + objs = torch.cat([objs_text, objs_image], dim=1) + + return objs + + +class UNetFlatConditionModel(ModelMixin, ConfigMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "DownBlockFlat")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlockFlatCrossAttn"`): + Block type for middle of UNet, it can be one of `UNetMidBlockFlatCrossAttn`, `UNetMidBlockFlat`, or + `UNetMidBlockFlatSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat")`): + The tuple of upsample blocks to use. 
+ only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlockFlat`], [`~models.unet_2d_blocks.CrossAttnUpBlockFlat`], + [`~models.unet_2d_blocks.UNetMidBlockFlatCrossAttn`]. + reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling + blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for + [`~models.unet_2d_blocks.CrossAttnDownBlockFlat`], [`~models.unet_2d_blocks.CrossAttnUpBlockFlat`], + [`~models.unet_2d_blocks.UNetMidBlockFlatCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlockFlat`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. 
+ num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, + *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, + *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): + Whether to use cross attention with the mid block when using the `UNetMidBlockFlatSimpleCrossAttn`. If + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. 
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlockFlat", + "CrossAttnDownBlockFlat", + "CrossAttnDownBlockFlat", + "DownBlockFlat", + ), + mid_block_type: Optional[str] = "UNetMidBlockFlatCrossAttn", + up_block_types: Tuple[str] = ( + "UpBlockFlat", + "CrossAttnUpBlockFlat", + "CrossAttnUpBlockFlat", + "CrossAttnUpBlockFlat", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + dropout: float = 0.0, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1, + reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + attention_type: str = "default", + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. 
+ num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." + ) + if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None: + for layer_number_per_block in transformer_layers_per_block: + if isinstance(layer_number_per_block, list): + raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.") + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = LinearMultiDim( + in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding + ) + + # time + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.") + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos + ) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." 
+ ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = (cross_attention_dim,) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. 
The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + dropout=dropout, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlockFlatCrossAttn": + self.mid_block = UNetMidBlockFlatCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + dropout=dropout, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + elif mid_block_type == "UNetMidBlockFlatSimpleCrossAttn": + self.mid_block = UNetMidBlockFlatSimpleCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + dropout=dropout, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim[-1], + attention_head_dim=attention_head_dim[-1], + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif mid_block_type == "UNetMidBlockFlat": + self.mid_block = UNetMidBlockFlat( + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + dropout=dropout, + num_layers=0, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + add_attention=False, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + 
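+        # The decoder mirrors the encoder, so the remaining per-level configuration
+        # lists are reversed as well before the up blocks are built.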
reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = ( + list(reversed(transformer_layers_per_block)) + if reverse_transformer_layers_per_block is None + else reverse_transformer_layers_per_block + ) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resolution_idx=i, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_type=attention_type, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + dropout=dropout, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps + ) + + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = LinearMultiDim( + block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding + ) + + if attention_type in ["gated", "gated-text-image"]: + positive_len = 768 + if isinstance(cross_attention_dim, int): + positive_len = cross_attention_dim + elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list): + positive_len = cross_attention_dim[0] + + feature_type = "text-only" if attention_type == "gated" else "text-image" + self.position_net = GLIGENTextBoundingboxProjection( + positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type + ) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. 
+ """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def enable_freeu(self, s1, s2, b1, b2): + r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stage blocks where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that + are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate the "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. 
+ """ + for i, upsample_block in enumerate(self.up_blocks): + setattr(upsample_block, "s1", s1) + setattr(upsample_block, "s2", s2) + setattr(upsample_block, "b1", b1) + setattr(upsample_block, "b2", b2) + + def disable_freeu(self): + """Disables the FreeU mechanism.""" + freeu_keys = {"s1", "s2", "b1", "b2"} + for i, upsample_block in enumerate(self.up_blocks): + for k in freeu_keys: + if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None: + setattr(upsample_block, k, None) + + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) + + def unload_lora(self): + """Unloads LoRA weights.""" + deprecate( + "unload_lora", + "0.28.0", + "Calling `unload_lora()` is deprecated and will be removed in a future version. Please install `peft` and then call `disable_adapters().", + ) + for module in self.modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNetFlatConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. 
If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of the down UNet blocks (the long skip + connections from down blocks to up blocks), for example from ControlNet side model(s). + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle UNet block, for example from a + ControlNet side model. + down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + Additional residuals to be added within the UNet down blocks, for example from T2I-Adapter side model(s). + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + + Returns: + [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be at least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + for dim in sample.shape[-2:]: + if dim % default_overall_up_factor != 0: + # Forward upsample size to force interpolation output size.
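+ # For example, with three upsampling layers (a common configuration) this factor is 2**3 == 8, + # so a 250x250 sample is not evenly divisible and explicit interpolation output sizes are forwarded below.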
+ forward_upsample_size = True + break + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. 
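+ # With `class_embed_type == "timestep"` the label has been mapped through the same sinusoidal + # projection as the timestep; the casts below keep the embeddings in the sample dtype for fp16 runs.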
+ class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) + elif 
self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj": + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + image_embeds = self.encoder_hid_proj(image_embeds) + encoder_hidden_states = (encoder_hidden_states, image_embeds) + + # 2. pre-process + sample = self.conv_in(sample) + + # 2.5 GLIGEN position net + if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None: + cross_attention_kwargs = cross_attention_kwargs.copy() + gligen_args = cross_attention_kwargs.pop("gligen") + cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)} + + # 3. down + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets + is_adapter = down_intrablock_additional_residuals is not None + # maintain backward compatibility for legacy usage, where + # T2I-Adapter and ControlNet both use down_block_additional_residuals arg + # but can only use one or the other + if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None: + deprecate( + "T2I should not use down_block_additional_residuals", + "1.3.0", + "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ + and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ + for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. 
", + standard_warn=False, + ) + down_intrablock_additional_residuals = down_block_additional_residuals + is_adapter = True + + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlockFlat + additional_residuals = {} + if is_adapter and len(down_intrablock_additional_residuals) > 0: + additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + if is_adapter and len(down_intrablock_additional_residuals) > 0: + sample += down_intrablock_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = self.mid_block(sample, emb) + + # To support T2I-Adapter-XL + if ( + is_adapter + and len(down_intrablock_additional_residuals) > 0 + and sample.shape == down_intrablock_additional_residuals[0].shape + ): + sample += down_intrablock_additional_residuals.pop(0) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + scale=lora_scale, + ) + + # 6. 
post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) + + +class LinearMultiDim(nn.Linear): + def __init__(self, in_features, out_features=None, second_dim=4, *args, **kwargs): + in_features = [in_features, second_dim, 1] if isinstance(in_features, int) else list(in_features) + if out_features is None: + out_features = in_features + out_features = [out_features, second_dim, 1] if isinstance(out_features, int) else list(out_features) + self.in_features_multidim = in_features + self.out_features_multidim = out_features + super().__init__(np.array(in_features).prod(), np.array(out_features).prod()) + + def forward(self, input_tensor, *args, **kwargs): + shape = input_tensor.shape + n_dim = len(self.in_features_multidim) + input_tensor = input_tensor.reshape(*shape[0:-n_dim], self.in_features) + output_tensor = super().forward(input_tensor) + output_tensor = output_tensor.view(*shape[0:-n_dim], *self.out_features_multidim) + return output_tensor + + +class ResnetBlockFlat(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + dropout=0.0, + temb_channels=512, + groups=32, + groups_out=None, + pre_norm=True, + eps=1e-6, + time_embedding_norm="default", + use_in_shortcut=None, + second_dim=4, + **kwargs, + ): + super().__init__() + self.pre_norm = pre_norm + self.pre_norm = True + + in_channels = [in_channels, second_dim, 1] if isinstance(in_channels, int) else list(in_channels) + self.in_channels_prod = np.array(in_channels).prod() + self.channels_multidim = in_channels + + if out_channels is not None: + out_channels = [out_channels, second_dim, 1] if isinstance(out_channels, int) else list(out_channels) + out_channels_prod = np.array(out_channels).prod() + self.out_channels_multidim = out_channels + else: + out_channels_prod = self.in_channels_prod + self.out_channels_multidim = self.channels_multidim + self.time_embedding_norm = time_embedding_norm + + if groups_out is None: + groups_out = groups + + self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=self.in_channels_prod, eps=eps, affine=True) + self.conv1 = torch.nn.Conv2d(self.in_channels_prod, out_channels_prod, kernel_size=1, padding=0) + + if temb_channels is not None: + self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels_prod) + else: + self.time_emb_proj = None + + self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels_prod, eps=eps, affine=True) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d(out_channels_prod, out_channels_prod, kernel_size=1, padding=0) + + self.nonlinearity = nn.SiLU() + + self.use_in_shortcut = ( + self.in_channels_prod != out_channels_prod if use_in_shortcut is None else use_in_shortcut + ) + + self.conv_shortcut = None + if self.use_in_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + self.in_channels_prod, out_channels_prod, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, input_tensor, temb): + shape = input_tensor.shape + n_dim = len(self.channels_multidim) + input_tensor = input_tensor.reshape(*shape[0:-n_dim], self.in_channels_prod, 1, 1) + input_tensor = input_tensor.view(-1, self.in_channels_prod, 1, 1) + + hidden_states = input_tensor + + hidden_states = self.norm1(hidden_states) + hidden_states = 
self.nonlinearity(hidden_states) + hidden_states = self.conv1(hidden_states) + + if temb is not None: + temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None] + hidden_states = hidden_states + temb + + hidden_states = self.norm2(hidden_states) + hidden_states = self.nonlinearity(hidden_states) + + hidden_states = self.dropout(hidden_states) + hidden_states = self.conv2(hidden_states) + + if self.conv_shortcut is not None: + input_tensor = self.conv_shortcut(input_tensor) + + output_tensor = input_tensor + hidden_states + + output_tensor = output_tensor.view(*shape[0:-n_dim], -1) + output_tensor = output_tensor.view(*shape[0:-n_dim], *self.out_channels_multidim) + + return output_tensor + + +class DownBlockFlat(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_downsample: bool = True, + downsample_padding: int = 1, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlockFlat( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + LinearMultiDim( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +class CrossAttnDownBlockFlat(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + downsample_padding: int = 1, + add_downsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: 
bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + in_channels = in_channels if i == 0 else out_channels + resnets.append( + ResnetBlockFlat( + in_channels=in_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_downsample: + self.downsamplers = nn.ModuleList( + [ + LinearMultiDim( + out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op" + ) + ] + ) + else: + self.downsamplers = None + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + blocks = list(zip(self.resnets, self.attentions)) + + for i, (resnet, attn) in enumerate(blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + # apply additional 
residuals to the output of the last pair of resnet and attention blocks + if i == len(blocks) - 1 and additional_residuals is not None: + hidden_states = hidden_states + additional_residuals + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +# Copied from diffusers.models.unets.unet_2d_blocks.UpBlock2D with UpBlock2D->UpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim +class UpBlockFlat(nn.Module): + def __init__( + self, + in_channels: int, + prev_output_channel: int, + out_channels: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + ): + super().__init__() + resnets = [] + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlockFlat( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + *args, + **kwargs, + ) -> torch.FloatTensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
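+ # The deprecated `scale` argument was the LoRA scale; with the PEFT backend it is instead read from + # `cross_attention_kwargs={"scale": ...}` in the UNet forward and applied via `scale_lora_layers`.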
+ deprecate("scale", "1.0.0", deprecation_message) + + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +# Copied from diffusers.models.unets.unet_2d_blocks.CrossAttnUpBlock2D with CrossAttnUpBlock2D->CrossAttnUpBlockFlat, ResnetBlock2D->ResnetBlockFlat, Upsample2D->LinearMultiDim +class CrossAttnUpBlockFlat(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + prev_output_channel: int, + temb_channels: int, + resolution_idx: Optional[int] = None, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + cross_attention_dim: int = 1280, + output_scale_factor: float = 1.0, + add_upsample: bool = True, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + resnets = [] + attentions = [] + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + for i in range(num_layers): + res_skip_channels = in_channels if (i == num_layers - 1) else out_channels + resnet_in_channels = prev_output_channel if i == 0 else out_channels + + resnets.append( + ResnetBlockFlat( + in_channels=resnet_in_channels + res_skip_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=transformer_layers_per_block[i], + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + 
upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + out_channels // num_attention_heads, + in_channels=out_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + if add_upsample: + self.upsamplers = nn.ModuleList([LinearMultiDim(out_channels, use_conv=True, out_channels=out_channels)]) + else: + self.upsamplers = None + + self.gradient_checkpointing = False + self.resolution_idx = resolution_idx + + def forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + +# Copied from diffusers.models.unets.unet_2d_blocks.UNetMidBlock2D with UNetMidBlock2D->UNetMidBlockFlat, ResnetBlock2D->ResnetBlockFlat +class UNetMidBlockFlat(nn.Module): + """ + A 2D UNet mid-block [`UNetMidBlockFlat`] with multiple residual blocks and optional attention blocks. + + Args: + in_channels (`int`): The number of input channels. 
+ temb_channels (`int`): The number of temporal embedding channels. + dropout (`float`, *optional*, defaults to 0.0): The dropout rate. + num_layers (`int`, *optional*, defaults to 1): The number of residual blocks. + resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks. + resnet_time_scale_shift (`str`, *optional*, defaults to `default`): + The type of normalization to apply to the time embeddings. This can help to improve the performance of the + model on tasks with long-range temporal dependencies. + resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks. + resnet_groups (`int`, *optional*, defaults to 32): + The number of groups to use in the group normalization layers of the resnet blocks. + attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks. + resnet_pre_norm (`bool`, *optional*, defaults to `True`): + Whether to use pre-normalization for the resnet blocks. + add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks. + attention_head_dim (`int`, *optional*, defaults to 1): + Dimension of a single attention head. The number of attention heads is determined based on this value and + the number of input channels. + output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor. + + Returns: + `torch.FloatTensor`: The output of the last residual block, which is a tensor of shape `(batch_size, + in_channels, height, width)`. + + """ + + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", # default, spatial + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + attn_groups: Optional[int] = None, + resnet_pre_norm: bool = True, + add_attention: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + ): + super().__init__() + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + self.add_attention = add_attention + + if attn_groups is None: + attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None + + # there is always at least one resnet + if resnet_time_scale_shift == "spatial": + resnets = [ + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ] + else: + resnets = [ + ResnetBlockFlat( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + if attention_head_dim is None: + logger.warning( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}." 
+ ) + attention_head_dim = in_channels + + for _ in range(num_layers): + if self.add_attention: + attentions.append( + Attention( + in_channels, + heads=in_channels // attention_head_dim, + dim_head=attention_head_dim, + rescale_output_factor=output_scale_factor, + eps=resnet_eps, + norm_num_groups=attn_groups, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, + residual_connection=True, + bias=True, + upcast_softmax=True, + _from_deprecated_attn_block=True, + ) + ) + else: + attentions.append(None) + + if resnet_time_scale_shift == "spatial": + resnets.append( + ResnetBlockCondNorm2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm="spatial", + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + ) + ) + else: + resnets.append( + ResnetBlockFlat( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward(self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None) -> torch.FloatTensor: + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if attn is not None: + hidden_states = attn(hidden_states, temb=temb) + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +# Copied from diffusers.models.unets.unet_2d_blocks.UNetMidBlock2DCrossAttn with UNetMidBlock2DCrossAttn->UNetMidBlockFlatCrossAttn, ResnetBlock2D->ResnetBlockFlat +class UNetMidBlockFlatCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + num_attention_heads: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + upcast_attention: bool = False, + attention_type: str = "default", + ): + super().__init__() + + self.has_cross_attention = True + self.num_attention_heads = num_attention_heads + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + # support for variable transformer layers per block + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * num_layers + + # there is always at least one resnet + resnets = [ + ResnetBlockFlat( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + attentions = [] + + for i in range(num_layers): + if not dual_cross_attention: + attentions.append( + Transformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=transformer_layers_per_block[i], + 
cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + attention_type=attention_type, + ) + ) + else: + attentions.append( + DualTransformer2DModel( + num_attention_heads, + in_channels // num_attention_heads, + in_channels=in_channels, + num_layers=1, + cross_attention_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + ) + ) + resnets.append( + ResnetBlockFlat( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if cross_attention_kwargs is not None: + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + else: + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + hidden_states = resnet(hidden_states, temb) + + return hidden_states + + +# Copied from diffusers.models.unets.unet_2d_blocks.UNetMidBlock2DSimpleCrossAttn with UNetMidBlock2DSimpleCrossAttn->UNetMidBlockFlatSimpleCrossAttn, ResnetBlock2D->ResnetBlockFlat +class UNetMidBlockFlatSimpleCrossAttn(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = "swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + attention_head_dim: int = 1, + output_scale_factor: float = 1.0, + cross_attention_dim: int = 1280, + skip_time_act: bool = False, + only_cross_attention: bool = False, + cross_attention_norm: Optional[str] = None, + ): + super().__init__() + + self.has_cross_attention = True + + self.attention_head_dim = attention_head_dim + resnet_groups = 
resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + self.num_heads = in_channels // self.attention_head_dim + + # there is always at least one resnet + resnets = [ + ResnetBlockFlat( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ] + attentions = [] + + for _ in range(num_layers): + processor = ( + AttnAddedKVProcessor2_0() if hasattr(F, "scaled_dot_product_attention") else AttnAddedKVProcessor() + ) + + attentions.append( + Attention( + query_dim=in_channels, + cross_attention_dim=in_channels, + heads=self.num_heads, + dim_head=self.attention_head_dim, + added_kv_proj_dim=cross_attention_dim, + norm_num_groups=resnet_groups, + bias=True, + upcast_softmax=True, + only_cross_attention=only_cross_attention, + cross_attention_norm=cross_attention_norm, + processor=processor, + ) + ) + resnets.append( + ResnetBlockFlat( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + skip_time_act=skip_time_act, + ) + ) + + self.attentions = nn.ModuleList(attentions) + self.resnets = nn.ModuleList(resnets) + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if cross_attention_kwargs.get("scale", None) is not None: + logger.warning("Passing `scale` to `cross_attention_kwargs` is depcrecated. `scale` will be ignored.") + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. + mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. 
+ # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + + hidden_states = self.resnets[0](hidden_states, temb) + for attn, resnet in zip(self.attentions, self.resnets[1:]): + # attn + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=mask, + **cross_attention_kwargs, + ) + + # resnet + hidden_states = resnet(hidden_states, temb) + + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py new file mode 100644 index 000000000..4455d20df --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py @@ -0,0 +1,421 @@ +import inspect +from typing import Callable, List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel + +from ....models import AutoencoderKL, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import logging +from ...pipeline_utils import DiffusionPipeline +from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline +from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline +from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class VersatileDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ """ + + tokenizer: CLIPTokenizer + image_feature_extractor: CLIPImageProcessor + text_encoder: CLIPTextModel + image_encoder: CLIPVisionModel + image_unet: UNet2DConditionModel + text_unet: UNet2DConditionModel + vae: AutoencoderKL + scheduler: KarrasDiffusionSchedulers + + def __init__( + self, + tokenizer: CLIPTokenizer, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModel, + image_encoder: CLIPVisionModel, + image_unet: UNet2DConditionModel, + text_unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, + image_feature_extractor=image_feature_extractor, + text_encoder=text_encoder, + image_encoder=image_encoder, + image_unet=image_unet, + text_unet=text_unet, + vae=vae, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + @torch.no_grad() + def image_variation( + self, + image: Union[torch.FloatTensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`): + The image prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + ```py + >>> from diffusers import VersatileDiffusionPipeline + >>> import torch + >>> import requests + >>> from io import BytesIO + >>> from PIL import Image + + >>> # let's download an initial image + >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" + + >>> response = requests.get(url) + >>> image = Image.open(BytesIO(response.content)).convert("RGB") + + >>> pipe = VersatileDiffusionPipeline.from_pretrained( + ... "shi-labs/versatile-diffusion", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> image = pipe.image_variation(image, generator=generator).images[0] + >>> image.save("./car_variation.png") + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + expected_components = inspect.signature(VersatileDiffusionImageVariationPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} + return VersatileDiffusionImageVariationPipeline(**components)( + image=image, + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + eta=eta, + generator=generator, + latents=latents, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + ) + + @torch.no_grad() + def text_to_image( + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. 
+ height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + ```py + >>> from diffusers import VersatileDiffusionPipeline + >>> import torch + + >>> pipe = VersatileDiffusionPipeline.from_pretrained( + ... "shi-labs/versatile-diffusion", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> image = pipe.text_to_image("an astronaut riding on a horse on mars", generator=generator).images[0] + >>> image.save("./astronaut.png") + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. 
+ """ + expected_components = inspect.signature(VersatileDiffusionTextToImagePipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} + temp_pipeline = VersatileDiffusionTextToImagePipeline(**components) + output = temp_pipeline( + prompt=prompt, + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + eta=eta, + generator=generator, + latents=latents, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + ) + # swap the attention blocks back to the original state + temp_pipeline._swap_unet_attention_blocks() + + return output + + @torch.no_grad() + def dual_guided( + self, + prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], + image: Union[str, List[str]], + text_to_image_strength: float = 0.5, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. + height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + ```py + >>> from diffusers import VersatileDiffusionPipeline + >>> import torch + >>> import requests + >>> from io import BytesIO + >>> from PIL import Image + + >>> # let's download an initial image + >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" + + >>> response = requests.get(url) + >>> image = Image.open(BytesIO(response.content)).convert("RGB") + >>> text = "a red car in the sun" + + >>> pipe = VersatileDiffusionPipeline.from_pretrained( + ... "shi-labs/versatile-diffusion", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> text_to_image_strength = 0.75 + + >>> image = pipe.dual_guided( + ... prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator + ... ).images[0] + >>> image.save("./car_variation.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + + expected_components = inspect.signature(VersatileDiffusionDualGuidedPipeline.__init__).parameters.keys() + components = {name: component for name, component in self.components.items() if name in expected_components} + temp_pipeline = VersatileDiffusionDualGuidedPipeline(**components) + output = temp_pipeline( + prompt=prompt, + image=image, + text_to_image_strength=text_to_image_strength, + height=height, + width=width, + num_inference_steps=num_inference_steps, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + eta=eta, + generator=generator, + latents=latents, + output_type=output_type, + return_dict=return_dict, + callback=callback, + callback_steps=callback_steps, + ) + temp_pipeline._revert_dual_attention() + + return output diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py new file mode 100644 index 000000000..8af739bbe --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -0,0 +1,556 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.utils.checkpoint +from transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ....image_processor import VaeImageProcessor +from ....models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .modeling_text_unet import UNetFlatConditionModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class VersatileDiffusionDualGuidedPipeline(DiffusionPipeline): + r""" + Pipeline for image-text dual-guided generation using Versatile Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) model to encode and decode images to and from latent representations. + bert ([`LDMBertModel`]): + Text-encoder model based on [`~transformers.BERT`]. + tokenizer ([`~transformers.BertTokenizer`]): + A `BertTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "bert->unet->vqvae" + + tokenizer: CLIPTokenizer + image_feature_extractor: CLIPImageProcessor + text_encoder: CLIPTextModelWithProjection + image_encoder: CLIPVisionModelWithProjection + image_unet: UNet2DConditionModel + text_unet: UNetFlatConditionModel + vae: AutoencoderKL + scheduler: KarrasDiffusionSchedulers + + _optional_components = ["text_unet"] + + def __init__( + self, + tokenizer: CLIPTokenizer, + image_feature_extractor: CLIPImageProcessor, + text_encoder: CLIPTextModelWithProjection, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNetFlatConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + self.register_modules( + tokenizer=tokenizer, + image_feature_extractor=image_feature_extractor, + text_encoder=text_encoder, + image_encoder=image_encoder, + image_unet=image_unet, + text_unet=text_unet, + vae=vae, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + if self.text_unet is not None and ( + "dual_cross_attention" not in self.image_unet.config or not self.image_unet.config.dual_cross_attention + ): + # if loading from a universal checkpoint rather than a saved dual-guided pipeline + self._convert_to_dual_attention() + + def remove_unused_weights(self): + self.register_modules(text_unet=None) + + def _convert_to_dual_attention(self): + """ + Replace image_unet's `Transformer2DModel` blocks with `DualTransformer2DModel` that contains transformer blocks + from both `image_unet` and `text_unet` + """ + for name, module in self.image_unet.named_modules(): + if isinstance(module, Transformer2DModel): + parent_name, index = name.rsplit(".", 1) + index = int(index) + + image_transformer = self.image_unet.get_submodule(parent_name)[index] + text_transformer = self.text_unet.get_submodule(parent_name)[index] + + config = image_transformer.config + dual_transformer = DualTransformer2DModel( + num_attention_heads=config.num_attention_heads, + attention_head_dim=config.attention_head_dim, + in_channels=config.in_channels, + num_layers=config.num_layers, + dropout=config.dropout, + norm_num_groups=config.norm_num_groups, + cross_attention_dim=config.cross_attention_dim, + attention_bias=config.attention_bias, + sample_size=config.sample_size, + num_vector_embeds=config.num_vector_embeds, + activation_fn=config.activation_fn, + num_embeds_ada_norm=config.num_embeds_ada_norm, + ) + dual_transformer.transformers[0] = image_transformer + dual_transformer.transformers[1] = text_transformer + + self.image_unet.get_submodule(parent_name)[index] = dual_transformer + self.image_unet.register_to_config(dual_cross_attention=True) + + def _revert_dual_attention(self): + """ + Revert the image_unet `DualTransformer2DModel` blocks back to `Transformer2DModel` with image_unet weights Call + this function if you reuse `image_unet` in another pipeline, e.g. 
`VersatileDiffusionPipeline` + """ + for name, module in self.image_unet.named_modules(): + if isinstance(module, DualTransformer2DModel): + parent_name, index = name.rsplit(".", 1) + index = int(index) + self.image_unet.get_submodule(parent_name)[index] = module.transformers[0] + + self.image_unet.register_to_config(dual_cross_attention=False) + + def _encode_text_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + """ + + def normalize_embeddings(encoder_output): + embeds = self.text_encoder.text_projection(encoder_output.last_hidden_state) + embeds_pooled = encoder_output.text_embeds + embeds = embeds / torch.norm(embeds_pooled.unsqueeze(1), dim=-1, keepdim=True) + return embeds + + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids + + if not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = normalize_embeddings(prompt_embeds) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens = [""] * batch_size + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
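+ # (one UNet forward for the unconditional embeddings and one for the prompt embeddings at every
+ # denoising step). Illustrative shapes, assuming a single prompt and num_images_per_prompt=1:
+ # prompt_embeds and negative_prompt_embeds are each [1, seq_len, dim], so the batch built below
+ # is [2, seq_len, dim] with the unconditional half first.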
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def _encode_image_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + """ + + def normalize_embeddings(encoder_output): + embeds = self.image_encoder.vision_model.post_layernorm(encoder_output.last_hidden_state) + embeds = self.image_encoder.visual_projection(embeds) + embeds_pooled = embeds[:, 0:1] + embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True) + return embeds + + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + # get prompt text embeddings + image_input = self.image_feature_extractor(images=prompt, return_tensors="pt") + pixel_values = image_input.pixel_values.to(device).to(self.image_encoder.dtype) + image_embeddings = self.image_encoder(pixel_values) + image_embeddings = normalize_embeddings(image_embeddings) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size + uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pt") + pixel_values = uncond_images.pixel_values.to(device).to(self.image_encoder.dtype) + negative_prompt_embeds = self.image_encoder(pixel_values) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and conditional embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings]) + + return image_embeddings + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs(self, prompt, image, height, width, callback_steps): + if not isinstance(prompt, str) and not isinstance(prompt, PIL.Image.Image) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` `PIL.Image` or `list` but is {type(prompt)}") + if not isinstance(image, str) and not isinstance(image, PIL.Image.Image) and not isinstance(image, list): + raise ValueError(f"`image` has to be of type `str` `PIL.Image` or `list` but is {type(image)}") + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def set_transformer_params(self, mix_ratio: float = 0.5, condition_types: Tuple = ("text", "image")): + for name, module in self.image_unet.named_modules(): + if isinstance(module, DualTransformer2DModel): + module.mix_ratio = mix_ratio + + for i, type in enumerate(condition_types): + if type == "text": + module.condition_lengths[i] = self.text_encoder.config.max_position_embeddings + module.transformer_index_for_condition[i] = 1 # use the second (text) transformer + else: + module.condition_lengths[i] = 257 + module.transformer_index_for_condition[i] = 0 # use the first (image) transformer + + @torch.no_grad() + def __call__( + self, + prompt: Union[PIL.Image.Image, List[PIL.Image.Image]], + image: Union[str, List[str]], + text_to_image_strength: float = 0.5, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. + height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + ```py + >>> from diffusers import VersatileDiffusionDualGuidedPipeline + >>> import torch + >>> import requests + >>> from io import BytesIO + >>> from PIL import Image + + >>> # let's download an initial image + >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" + + >>> response = requests.get(url) + >>> image = Image.open(BytesIO(response.content)).convert("RGB") + >>> text = "a red car in the sun" + + >>> pipe = VersatileDiffusionDualGuidedPipeline.from_pretrained( + ... "shi-labs/versatile-diffusion", torch_dtype=torch.float16 + ... ) + >>> pipe.remove_unused_weights() + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> text_to_image_strength = 0.75 + + >>> image = pipe( + ... prompt=text, image=image, text_to_image_strength=text_to_image_strength, generator=generator + ... ).images[0] + >>> image.save("./car_variation.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.image_unet.config.sample_size * self.vae_scale_factor + width = width or self.image_unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, image, height, width, callback_steps) + + # 2. Define call parameters + prompt = [prompt] if not isinstance(prompt, list) else prompt + image = [image] if not isinstance(image, list) else image + batch_size = len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompts + prompt_embeds = self._encode_text_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + image_embeddings = self._encode_image_prompt(image, device, num_images_per_prompt, do_classifier_free_guidance) + dual_prompt_embeddings = torch.cat([prompt_embeds, image_embeddings], dim=1) + prompt_types = ("text", "image") + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare latent variables + num_channels_latents = self.image_unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + dual_prompt_embeddings.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Combine the attention blocks of the image and text UNets + self.set_transformer_params(text_to_image_strength, prompt_types) + + # 8. Denoising loop + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=dual_prompt_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py new file mode 100644 index 000000000..345c15f18 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py @@ -0,0 +1,397 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
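+#
+# A minimal usage sketch for the pipeline defined in this module; it mirrors the example in the
+# __call__ docstring below, and `init_image` (any RGB PIL.Image supplied by the caller) is a
+# placeholder:
+#
+#   import torch
+#   from diffusers import VersatileDiffusionImageVariationPipeline
+#
+#   pipe = VersatileDiffusionImageVariationPipeline.from_pretrained(
+#       "shi-labs/versatile-diffusion", torch_dtype=torch.float16
+#   ).to("cuda")
+#   generator = torch.Generator(device="cuda").manual_seed(0)
+#   variation = pipe(init_image, generator=generator).images[0]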
+ +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.utils.checkpoint +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from ....image_processor import VaeImageProcessor +from ....models import AutoencoderKL, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class VersatileDiffusionImageVariationPipeline(DiffusionPipeline): + r""" + Pipeline for image variation using Versatile Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) model to encode and decode images to and from latent representations. + bert ([`LDMBertModel`]): + Text-encoder model based on [`~transformers.BERT`]. + tokenizer ([`~transformers.BertTokenizer`]): + A `BertTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "bert->unet->vqvae" + + image_feature_extractor: CLIPImageProcessor + image_encoder: CLIPVisionModelWithProjection + image_unet: UNet2DConditionModel + vae: AutoencoderKL + scheduler: KarrasDiffusionSchedulers + + def __init__( + self, + image_feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + image_unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + self.register_modules( + image_feature_extractor=image_feature_extractor, + image_encoder=image_encoder, + image_unet=image_unet, + vae=vae, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). 
+ """ + + def normalize_embeddings(encoder_output): + embeds = self.image_encoder.vision_model.post_layernorm(encoder_output.last_hidden_state) + embeds = self.image_encoder.visual_projection(embeds) + embeds_pooled = embeds[:, 0:1] + embeds = embeds / torch.norm(embeds_pooled, dim=-1, keepdim=True) + return embeds + + if isinstance(prompt, torch.Tensor) and len(prompt.shape) == 4: + prompt = list(prompt) + + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + # get prompt text embeddings + image_input = self.image_feature_extractor(images=prompt, return_tensors="pt") + pixel_values = image_input.pixel_values.to(device).to(self.image_encoder.dtype) + image_embeddings = self.image_encoder(pixel_values) + image_embeddings = normalize_embeddings(image_embeddings) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_images: List[str] + if negative_prompt is None: + uncond_images = [np.zeros((512, 512, 3)) + 0.5] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, PIL.Image.Image): + uncond_images = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_images = negative_prompt + + uncond_images = self.image_feature_extractor(images=uncond_images, return_tensors="pt") + pixel_values = uncond_images.pixel_values.to(device).to(self.image_encoder.dtype) + negative_prompt_embeds = self.image_encoder(pixel_values) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and conditional embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings]) + + return image_embeddings + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs + def check_inputs(self, image, height, width, callback_steps): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.Tensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PIL.Image.Image`, `List[PIL.Image.Image]` or `torch.Tensor`): + The image prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 
+ callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + ```py + >>> from diffusers import VersatileDiffusionImageVariationPipeline + >>> import torch + >>> import requests + >>> from io import BytesIO + >>> from PIL import Image + + >>> # let's download an initial image + >>> url = "https://huggingface.co/datasets/diffusers/images/resolve/main/benz.jpg" + + >>> response = requests.get(url) + >>> image = Image.open(BytesIO(response.content)).convert("RGB") + + >>> pipe = VersatileDiffusionImageVariationPipeline.from_pretrained( + ... "shi-labs/versatile-diffusion", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> image = pipe(image, generator=generator).images[0] + >>> image.save("./car_variation.png") + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.image_unet.config.sample_size * self.vae_scale_factor + width = width or self.image_unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(image, height, width, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(image, PIL.Image.Image) else len(image) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + image_embeddings = self._encode_prompt( + image, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.image_unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + image_embeddings.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Denoising loop + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py new file mode 100644 index 000000000..0b2518f7e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py @@ -0,0 +1,475 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import torch +import torch.utils.checkpoint +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer + +from ....image_processor import VaeImageProcessor +from ....models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel +from ....schedulers import KarrasDiffusionSchedulers +from ....utils import deprecate, logging +from ....utils.torch_utils import randn_tensor +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .modeling_text_unet import UNetFlatConditionModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class VersatileDiffusionTextToImagePipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Versatile Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) model to encode and decode images to and from latent representations. + bert ([`LDMBertModel`]): + Text-encoder model based on [`~transformers.BERT`]. + tokenizer ([`~transformers.BertTokenizer`]): + A `BertTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "bert->unet->vqvae" + + tokenizer: CLIPTokenizer + image_feature_extractor: CLIPImageProcessor + text_encoder: CLIPTextModelWithProjection + image_unet: UNet2DConditionModel + text_unet: UNetFlatConditionModel + vae: AutoencoderKL + scheduler: KarrasDiffusionSchedulers + + _optional_components = ["text_unet"] + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + image_unet: UNet2DConditionModel, + text_unet: UNetFlatConditionModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + image_unet=image_unet, + text_unet=text_unet, + vae=vae, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + if self.text_unet is not None: + self._swap_unet_attention_blocks() + + def _swap_unet_attention_blocks(self): + """ + Swap the `Transformer2DModel` blocks between the image and text UNets + """ + for name, module in self.image_unet.named_modules(): + if isinstance(module, Transformer2DModel): + parent_name, index = name.rsplit(".", 1) + index = int(index) + self.image_unet.get_submodule(parent_name)[index], self.text_unet.get_submodule(parent_name)[index] = ( + self.text_unet.get_submodule(parent_name)[index], + self.image_unet.get_submodule(parent_name)[index], + ) + + def remove_unused_weights(self): + self.register_modules(text_unet=None) + + def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). 
+ """ + + def normalize_embeddings(encoder_output): + embeds = self.text_encoder.text_projection(encoder_output.last_hidden_state) + embeds_pooled = encoder_output.text_embeds + embeds = embeds / torch.norm(embeds_pooled.unsqueeze(1), dim=-1, keepdim=True) + return embeds + + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids + + if not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = normalize_embeddings(prompt_embeds) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = normalize_embeddings(negative_prompt_embeds) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. + height (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.image_unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). 
+ num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + + ```py + >>> from diffusers import VersatileDiffusionTextToImagePipeline + >>> import torch + + >>> pipe = VersatileDiffusionTextToImagePipeline.from_pretrained( + ... "shi-labs/versatile-diffusion", torch_dtype=torch.float16 + ... ) + >>> pipe.remove_unused_weights() + >>> pipe = pipe.to("cuda") + + >>> generator = torch.Generator(device="cuda").manual_seed(0) + >>> image = pipe("an astronaut riding on a horse on mars", generator=generator).images[0] + >>> image.save("./astronaut.png") + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.image_unet.config.sample_size * self.vae_scale_factor + width = width or self.image_unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, height, width, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare latent variables + num_channels_latents = self.image_unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.image_unet(latent_model_input, t, encoder_hidden_states=prompt_embeds).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py new file mode 100644 index 000000000..070903377 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py @@ -0,0 +1,57 @@ +from typing import TYPE_CHECKING + +from ....utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import ( + LearnedClassifierFreeSamplingEmbeddings, + VQDiffusionPipeline, + ) + + _dummy_objects.update( + { + "LearnedClassifierFreeSamplingEmbeddings": LearnedClassifierFreeSamplingEmbeddings, + "VQDiffusionPipeline": VQDiffusionPipeline, + } + ) +else: + _import_structure["pipeline_vq_diffusion"] = ["LearnedClassifierFreeSamplingEmbeddings", "VQDiffusionPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ....utils.dummy_torch_and_transformers_objects import ( + LearnedClassifierFreeSamplingEmbeddings, + VQDiffusionPipeline, + ) + else: + from .pipeline_vq_diffusion import LearnedClassifierFreeSamplingEmbeddings, VQDiffusionPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value 
in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py new file mode 100644 index 000000000..0c55d04e6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py @@ -0,0 +1,325 @@ +# Copyright 2024 Microsoft and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ....configuration_utils import ConfigMixin, register_to_config +from ....models import ModelMixin, Transformer2DModel, VQModel +from ....schedulers import VQDiffusionScheduler +from ....utils import logging +from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class LearnedClassifierFreeSamplingEmbeddings(ModelMixin, ConfigMixin): + """ + Utility class for storing learned text embeddings for classifier free sampling + """ + + @register_to_config + def __init__(self, learnable: bool, hidden_size: Optional[int] = None, length: Optional[int] = None): + super().__init__() + + self.learnable = learnable + + if self.learnable: + assert hidden_size is not None, "learnable=True requires `hidden_size` to be set" + assert length is not None, "learnable=True requires `length` to be set" + + embeddings = torch.zeros(length, hidden_size) + else: + embeddings = None + + self.embeddings = torch.nn.Parameter(embeddings) + + +class VQDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using VQ Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vqvae ([`VQModel`]): + Vector Quantized Variational Auto-Encoder (VAE) model to encode and decode images to and from latent + representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + transformer ([`Transformer2DModel`]): + A conditional `Transformer2DModel` to denoise the encoded image latents. + scheduler ([`VQDiffusionScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. 
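A short usage sketch for this pipeline; the checkpoint id is the one commonly used in the upstream documentation and is an assumption here, so substitute whatever VQ-Diffusion checkpoint is actually available in your environment.

```py
from diffusers import VQDiffusionPipeline

# assumed checkpoint id; replace with a local or mirrored VQ-Diffusion checkpoint if needed
pipe = VQDiffusionPipeline.from_pretrained("microsoft/vq-diffusion-ithq")
pipe = pipe.to("cuda")

image = pipe("teddy bear playing in the pool", num_inference_steps=100).images[0]
image.save("vq_diffusion_sample.png")
```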
+ """ + + vqvae: VQModel + text_encoder: CLIPTextModel + tokenizer: CLIPTokenizer + transformer: Transformer2DModel + learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings + scheduler: VQDiffusionScheduler + + def __init__( + self, + vqvae: VQModel, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + transformer: Transformer2DModel, + scheduler: VQDiffusionScheduler, + learned_classifier_free_sampling_embeddings: LearnedClassifierFreeSamplingEmbeddings, + ): + super().__init__() + + self.register_modules( + vqvae=vqvae, + transformer=transformer, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + learned_classifier_free_sampling_embeddings=learned_classifier_free_sampling_embeddings, + ) + + def _encode_prompt(self, prompt, num_images_per_prompt, do_classifier_free_guidance): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + + if text_input_ids.shape[-1] > self.tokenizer.model_max_length: + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + prompt_embeds = self.text_encoder(text_input_ids.to(self.device))[0] + + # NOTE: This additional step of normalizing the text embeddings is from VQ-Diffusion. + # While CLIP does normalize the pooled output of the text transformer when combining + # the image and text embeddings, CLIP does not directly normalize the last hidden state. + # + # CLIP normalizing the pooled output. + # https://github.com/huggingface/transformers/blob/d92e22d1f28324f513f3080e5c47c071a3916721/src/transformers/models/clip/modeling_clip.py#L1052-L1053 + prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True) + + # duplicate text embeddings for each generation per prompt + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + if self.learned_classifier_free_sampling_embeddings.learnable: + negative_prompt_embeds = self.learned_classifier_free_sampling_embeddings.embeddings + negative_prompt_embeds = negative_prompt_embeds.unsqueeze(0).repeat(batch_size, 1, 1) + else: + uncond_tokens = [""] * batch_size + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + negative_prompt_embeds = self.text_encoder(uncond_input.input_ids.to(self.device))[0] + # See comment for normalizing text embeddings + negative_prompt_embeds = negative_prompt_embeds / negative_prompt_embeds.norm(dim=-1, keepdim=True) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. 
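A toy-shape illustration (random tensors, assumed dimensions) of the normalize-then-duplicate pattern used in `_encode_prompt` above: the last hidden states are normalized per token and then repeated once per requested image.

```py
import torch

prompt_embeds = torch.randn(2, 77, 512)  # assumed (batch, seq_len, hidden_dim)
num_images_per_prompt = 3

prompt_embeds = prompt_embeds / prompt_embeds.norm(dim=-1, keepdim=True)      # unit-norm token embeddings
prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)  # one copy per generated image

print(prompt_embeds.shape)                # torch.Size([6, 77, 512])
print(prompt_embeds.norm(dim=-1)[0, :3])  # ~tensor([1., 1., 1.])
```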
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + num_inference_steps: int = 100, + guidance_scale: float = 5.0, + truncation_rate: float = 1.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ) -> Union[ImagePipelineOutput, Tuple]: + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + truncation_rate (`float`, *optional*, defaults to 1.0 (equivalent to no truncation)): + Used to "truncate" the predicted classes for x_0 such that the cumulative probability for a pixel is at + most `truncation_rate`. The lowest probabilities that would increase the cumulative probability above + `truncation_rate` are set to zero. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor` of shape (batch), *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Must be valid embedding indices.If not provided, a latents tensor will be generated of + completely masked latent pixels. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. 
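To make the `truncation_rate` behaviour concrete, here is a toy numeric sketch (assumed probabilities) mirroring the `truncate` method defined later in this file: classes are kept, highest probability first, until the running total reaches the rate; the class that crosses the threshold is still kept, and everything after it is set to `log(0)`.

```py
import torch

p = torch.tensor([[0.50], [0.30], [0.15], [0.05]]).unsqueeze(0)  # (batch=1, classes=4, pixels=1)
log_p = p.log()
truncation_rate = 0.9

sorted_log_p, indices = torch.sort(log_p, 1, descending=True)
keep = sorted_log_p.exp().cumsum(dim=1) < truncation_rate               # [True, True, False, False]
keep = torch.cat((torch.ones_like(keep[:, :1]), keep), dim=1)[:, :-1]   # shift: keep the crossing class too
keep = keep.gather(1, indices.argsort(1))                               # back to original class order

truncated = log_p.clone()
truncated[~keep] = -torch.inf  # -inf = log(0)
print(truncated.exp().flatten())  # tensor([0.5000, 0.3000, 0.1500, 0.0000])
```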
+ """ + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds = self._encode_prompt(prompt, num_images_per_prompt, do_classifier_free_guidance) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + # get the initial completely masked latents unless the user supplied it + + latents_shape = (batch_size, self.transformer.num_latent_pixels) + if latents is None: + mask_class = self.transformer.num_vector_embeds - 1 + latents = torch.full(latents_shape, mask_class).to(self.device) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if (latents < 0).any() or (latents >= self.transformer.num_vector_embeds).any(): + raise ValueError( + "Unexpected latents value(s). All latents be valid embedding indices i.e. in the range 0," + f" {self.transformer.num_vector_embeds - 1} (inclusive)." + ) + latents = latents.to(self.device) + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=self.device) + + timesteps_tensor = self.scheduler.timesteps.to(self.device) + + sample = latents + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the sample if we are doing classifier free guidance + latent_model_input = torch.cat([sample] * 2) if do_classifier_free_guidance else sample + + # predict the un-noised image + # model_output == `log_p_x_0` + model_output = self.transformer(latent_model_input, encoder_hidden_states=prompt_embeds, timestep=t).sample + + if do_classifier_free_guidance: + model_output_uncond, model_output_text = model_output.chunk(2) + model_output = model_output_uncond + guidance_scale * (model_output_text - model_output_uncond) + model_output -= torch.logsumexp(model_output, dim=1, keepdim=True) + + model_output = self.truncate(model_output, truncation_rate) + + # remove `log(0)`'s (`-inf`s) + model_output = model_output.clamp(-70) + + # compute the previous noisy sample x_t -> x_t-1 + sample = self.scheduler.step(model_output, timestep=t, sample=sample, generator=generator).prev_sample + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + callback(i, t, sample) + + embedding_channels = self.vqvae.config.vq_embed_dim + embeddings_shape = (batch_size, self.transformer.height, self.transformer.width, embedding_channels) + embeddings = self.vqvae.quantize.get_codebook_entry(sample, shape=embeddings_shape) + image = self.vqvae.decode(embeddings, force_not_quantize=True).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + def truncate(self, log_p_x_0: torch.FloatTensor, truncation_rate: float) -> torch.FloatTensor: + """ + Truncates `log_p_x_0` such that for each column vector, the total cumulative probability is `truncation_rate` + The lowest probabilities that would increase the cumulative probability above `truncation_rate` are set to + 
zero. + """ + sorted_log_p_x_0, indices = torch.sort(log_p_x_0, 1, descending=True) + sorted_p_x_0 = torch.exp(sorted_log_p_x_0) + keep_mask = sorted_p_x_0.cumsum(dim=1) < truncation_rate + + # Ensure that at least the largest probability is not zeroed out + all_true = torch.full_like(keep_mask[:, 0:1, :], True) + keep_mask = torch.cat((all_true, keep_mask), dim=1) + keep_mask = keep_mask[:, :-1, :] + + keep_mask = keep_mask.gather(1, indices.argsort(1)) + + rv = log_p_x_0.clone() + + rv[~keep_mask] = -torch.inf # -inf = log(0) + + return rv diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/__init__.py new file mode 100644 index 000000000..fe2a94f3c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/__init__.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule + + +_import_structure = {"pipeline_dit": ["DiTPipeline"]} + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from .pipeline_dit import DiTPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py new file mode 100644 index 000000000..289ea4960 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py @@ -0,0 +1,233 @@ +# Attribution-NonCommercial 4.0 International (CC BY-NC 4.0) +# William Peebles and Saining Xie +# +# Copyright (c) 2021 OpenAI +# MIT License +# +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List, Optional, Tuple, Union + +import torch + +from ...models import AutoencoderKL, Transformer2DModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class DiTPipeline(DiffusionPipeline): + r""" + Pipeline for image generation based on a Transformer backbone instead of a UNet. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + transformer ([`Transformer2DModel`]): + A class conditioned `Transformer2DModel` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + """ + + model_cpu_offload_seq = "transformer->vae" + + def __init__( + self, + transformer: Transformer2DModel, + vae: AutoencoderKL, + scheduler: KarrasDiffusionSchedulers, + id2label: Optional[Dict[int, str]] = None, + ): + super().__init__() + self.register_modules(transformer=transformer, vae=vae, scheduler=scheduler) + + # create a imagenet -> id dictionary for easier use + self.labels = {} + if id2label is not None: + for key, value in id2label.items(): + for label in value.split(","): + self.labels[label.lstrip().rstrip()] = int(key) + self.labels = dict(sorted(self.labels.items())) + + def get_label_ids(self, label: Union[str, List[str]]) -> List[int]: + r""" + + Map label strings from ImageNet to corresponding class ids. + + Parameters: + label (`str` or `dict` of `str`): + Label strings to be mapped to class ids. + + Returns: + `list` of `int`: + Class ids to be processed by pipeline. + """ + + if not isinstance(label, list): + label = list(label) + + for l in label: + if l not in self.labels: + raise ValueError( + f"{l} does not exist. Please make sure to select one of the following labels: \n {self.labels}." + ) + + return [self.labels[l] for l in label] + + @torch.no_grad() + def __call__( + self, + class_labels: List[int], + guidance_scale: float = 4.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + num_inference_steps: int = 50, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[ImagePipelineOutput, Tuple]: + r""" + The call function to the pipeline for generation. + + Args: + class_labels (List[int]): + List of ImageNet class labels for the images to be generated. + guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + num_inference_steps (`int`, *optional*, defaults to 250): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. 
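A small sketch of the label bookkeeping built in `__init__` above and consulted by `get_label_ids`: an `id2label` mapping with comma-separated synonyms is flattened into a name-to-class-id lookup. The two entries below are illustrative and assume the standard ImageNet-1k indexing.

```py
id2label = {2: "great white shark, white shark", 879: "umbrella"}  # assumed ImageNet-1k ids

labels = {}
for key, value in id2label.items():
    for label in value.split(","):
        labels[label.strip()] = int(key)
labels = dict(sorted(labels.items()))

print(labels)
# {'great white shark': 2, 'umbrella': 879, 'white shark': 2}
print([labels[w] for w in ["white shark", "umbrella"]])  # [2, 879] -> valid `class_labels` input
```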
+ + Examples: + + ```py + >>> from diffusers import DiTPipeline, DPMSolverMultistepScheduler + >>> import torch + + >>> pipe = DiTPipeline.from_pretrained("facebook/DiT-XL-2-256", torch_dtype=torch.float16) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe = pipe.to("cuda") + + >>> # pick words from Imagenet class labels + >>> pipe.labels # to print all available words + + >>> # pick words that exist in ImageNet + >>> words = ["white shark", "umbrella"] + + >>> class_ids = pipe.get_label_ids(words) + + >>> generator = torch.manual_seed(33) + >>> output = pipe(class_labels=class_ids, num_inference_steps=25, generator=generator) + + >>> image = output.images[0] # label 'white shark' + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + + batch_size = len(class_labels) + latent_size = self.transformer.config.sample_size + latent_channels = self.transformer.config.in_channels + + latents = randn_tensor( + shape=(batch_size, latent_channels, latent_size, latent_size), + generator=generator, + device=self._execution_device, + dtype=self.transformer.dtype, + ) + latent_model_input = torch.cat([latents] * 2) if guidance_scale > 1 else latents + + class_labels = torch.tensor(class_labels, device=self._execution_device).reshape(-1) + class_null = torch.tensor([1000] * batch_size, device=self._execution_device) + class_labels_input = torch.cat([class_labels, class_null], 0) if guidance_scale > 1 else class_labels + + # set step values + self.scheduler.set_timesteps(num_inference_steps) + for t in self.progress_bar(self.scheduler.timesteps): + if guidance_scale > 1: + half = latent_model_input[: len(latent_model_input) // 2] + latent_model_input = torch.cat([half, half], dim=0) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + timesteps = t + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. 
So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = latent_model_input.device.type == "mps" + if isinstance(timesteps, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=latent_model_input.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(latent_model_input.device) + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(latent_model_input.shape[0]) + # predict noise model_output + noise_pred = self.transformer( + latent_model_input, timestep=timesteps, class_labels=class_labels_input + ).sample + + # perform guidance + if guidance_scale > 1: + eps, rest = noise_pred[:, :latent_channels], noise_pred[:, latent_channels:] + cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0) + + half_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps) + eps = torch.cat([half_eps, half_eps], dim=0) + + noise_pred = torch.cat([eps, rest], dim=1) + + # learned sigma + if self.transformer.config.out_channels // 2 == latent_channels: + model_output, _ = torch.split(noise_pred, latent_channels, dim=1) + else: + model_output = noise_pred + + # compute previous image: x_t -> x_t-1 + latent_model_input = self.scheduler.step(model_output, t, latent_model_input).prev_sample + + if guidance_scale > 1: + latents, _ = latent_model_input.chunk(2, dim=0) + else: + latents = latent_model_input + + latents = 1 / self.vae.config.scaling_factor * latents + samples = self.vae.decode(latents).sample + + samples = (samples / 2 + 0.5).clamp(0, 1) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + samples = samples.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + samples = self.numpy_to_pil(samples) + + if not return_dict: + return (samples,) + + return ImagePipelineOutput(images=samples) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/free_init_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/free_init_utils.py new file mode 100644 index 000000000..50c28cc69 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/free_init_utils.py @@ -0,0 +1,184 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
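Before moving on to the FreeInit utilities, a toy-shaped sketch (random tensors; the transformer call is faked) of the class-conditional guidance performed inside `DiTPipeline.__call__` above. The learned-sigma channel split is omitted for brevity, and the class ids are illustrative assumptions.

```py
import torch

batch, channels, size = 2, 4, 32
guidance_scale = 4.0
null_class_id = 1000  # DiT reserves one extra class embedding for the unconditional branch

latents = torch.randn(batch, channels, size, size)
class_labels = torch.tensor([207, 360])  # assumed ImageNet-1k ids

# double the batch: conditional half first, unconditional ("null" class) half second
latent_model_input = torch.cat([latents, latents], dim=0)
class_labels_input = torch.cat([class_labels, torch.full((batch,), null_class_id)], dim=0)

# stand-in for `self.transformer(latent_model_input, timestep=..., class_labels=class_labels_input).sample`
noise_pred = torch.randn_like(latent_model_input)

cond_eps, uncond_eps = noise_pred.chunk(2, dim=0)
guided_eps = uncond_eps + guidance_scale * (cond_eps - uncond_eps)
noise_pred = torch.cat([guided_eps, guided_eps], dim=0)  # keep the doubled batch for the scheduler step
print(noise_pred.shape)  # torch.Size([4, 4, 32, 32])
```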
+ +import math +from typing import Tuple, Union + +import torch +import torch.fft as fft + +from ..utils.torch_utils import randn_tensor + + +class FreeInitMixin: + r"""Mixin class for FreeInit.""" + + def enable_free_init( + self, + num_iters: int = 3, + use_fast_sampling: bool = False, + method: str = "butterworth", + order: int = 4, + spatial_stop_frequency: float = 0.25, + temporal_stop_frequency: float = 0.25, + ): + """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537. + + This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit). + + Args: + num_iters (`int`, *optional*, defaults to `3`): + Number of FreeInit noise re-initialization iterations. + use_fast_sampling (`bool`, *optional*, defaults to `False`): + Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables + the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`. + method (`str`, *optional*, defaults to `butterworth`): + Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the + FreeInit low pass filter. + order (`int`, *optional*, defaults to `4`): + Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour + whereas lower values lead to `gaussian` method behaviour. + spatial_stop_frequency (`float`, *optional*, defaults to `0.25`): + Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in + the original implementation. + temporal_stop_frequency (`float`, *optional*, defaults to `0.25`): + Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in + the original implementation. + """ + self._free_init_num_iters = num_iters + self._free_init_use_fast_sampling = use_fast_sampling + self._free_init_method = method + self._free_init_order = order + self._free_init_spatial_stop_frequency = spatial_stop_frequency + self._free_init_temporal_stop_frequency = temporal_stop_frequency + + def disable_free_init(self): + """Disables the FreeInit mechanism if enabled.""" + self._free_init_num_iters = None + + @property + def free_init_enabled(self): + return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None + + def _get_free_init_freq_filter( + self, + shape: Tuple[int, ...], + device: Union[str, torch.dtype], + filter_type: str, + order: float, + spatial_stop_frequency: float, + temporal_stop_frequency: float, + ) -> torch.Tensor: + r"""Returns the FreeInit filter based on filter type and other input conditions.""" + + time, height, width = shape[-3], shape[-2], shape[-1] + mask = torch.zeros(shape) + + if spatial_stop_frequency == 0 or temporal_stop_frequency == 0: + return mask + + if filter_type == "butterworth": + + def retrieve_mask(x): + return 1 / (1 + (x / spatial_stop_frequency**2) ** order) + elif filter_type == "gaussian": + + def retrieve_mask(x): + return math.exp(-1 / (2 * spatial_stop_frequency**2) * x) + elif filter_type == "ideal": + + def retrieve_mask(x): + return 1 if x <= spatial_stop_frequency * 2 else 0 + else: + raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal") + + for t in range(time): + for h in range(height): + for w in range(width): + d_square = ( + ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / time - 1)) ** 2 + + (2 * h / height - 1) ** 2 + + (2 * w / width - 1) ** 2 + ) + mask[..., t, h, w] = retrieve_mask(d_square) + + return 
mask.to(device) + + def _apply_freq_filter(self, x: torch.Tensor, noise: torch.Tensor, low_pass_filter: torch.Tensor) -> torch.Tensor: + r"""Noise reinitialization.""" + # FFT + x_freq = fft.fftn(x, dim=(-3, -2, -1)) + x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1)) + noise_freq = fft.fftn(noise, dim=(-3, -2, -1)) + noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1)) + + # frequency mix + high_pass_filter = 1 - low_pass_filter + x_freq_low = x_freq * low_pass_filter + noise_freq_high = noise_freq * high_pass_filter + x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain + + # IFFT + x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1)) + x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real + + return x_mixed + + def _apply_free_init( + self, + latents: torch.Tensor, + free_init_iteration: int, + num_inference_steps: int, + device: torch.device, + dtype: torch.dtype, + generator: torch.Generator, + ): + if free_init_iteration == 0: + self._free_init_initial_noise = latents.detach().clone() + return latents, self.scheduler.timesteps + + latent_shape = latents.shape + + free_init_filter_shape = (1, *latent_shape[1:]) + free_init_freq_filter = self._get_free_init_freq_filter( + shape=free_init_filter_shape, + device=device, + filter_type=self._free_init_method, + order=self._free_init_order, + spatial_stop_frequency=self._free_init_spatial_stop_frequency, + temporal_stop_frequency=self._free_init_temporal_stop_frequency, + ) + + current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1 + diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long() + + z_t = self.scheduler.add_noise( + original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device) + ).to(dtype=torch.float32) + + z_rand = randn_tensor( + shape=latent_shape, + generator=generator, + device=device, + dtype=torch.float32, + ) + latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter) + latents = latents.to(dtype) + + # Coarse-to-Fine Sampling for faster inference (can lead to lower quality) + if self._free_init_use_fast_sampling: + num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1)) + self.scheduler.set_timesteps(num_inference_steps, device=device) + + return latents, self.scheduler.timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py new file mode 100644 index 000000000..b24a7e4ce --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py @@ -0,0 +1,46 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_i2vgen_xl"] = ["I2VGenXLPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and 
is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_i2vgen_xl import I2VGenXLPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py new file mode 100644 index 000000000..cb6f3e300 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py @@ -0,0 +1,798 @@ +# Copyright 2024 Alibaba DAMO-VILAB and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...models import AutoencoderKL +from ...models.unets.unet_i2vgen_xl import I2VGenXLUNet +from ...schedulers import DDIMScheduler +from ...utils import ( + BaseOutput, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import I2VGenXLPipeline + >>> from diffusers.utils import export_to_gif, load_image + + >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16") + >>> pipeline.enable_model_cpu_offload() + + >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png" + >>> image = load_image(image_url).convert("RGB") + + >>> prompt = "Papers were floating in the air on a table in the library" + >>> negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms" + >>> generator = torch.manual_seed(8888) + + >>> frames = pipeline( + ... prompt=prompt, + ... image=image, + ... num_inference_steps=50, + ... negative_prompt=negative_prompt, + ... guidance_scale=9.0, + ... generator=generator + ... 
).frames[0] + >>> video_path = export_to_gif(frames, "i2v.gif") + ``` +""" + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +@dataclass +class I2VGenXLPipelineOutput(BaseOutput): + r""" + Output class for image-to-video pipeline. + + Args: + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)` + """ + + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] + + +class I2VGenXLPipeline( + DiffusionPipeline, + StableDiffusionMixin, +): + r""" + Pipeline for image-to-video generation as proposed in [I2VGenXL](https://i2vgen-xl.github.io/). + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`I2VGenXLUNet`]): + A [`I2VGenXLUNet`] to denoise the encoded video latents. + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + image_encoder: CLIPVisionModelWithProjection, + feature_extractor: CLIPImageProcessor, + unet: I2VGenXLUNet, + scheduler: DDIMScheduler, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + # `do_resize=False` as we do custom resizing. + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_resize=False) + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
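A toy walk-through (random data, assumed shapes) of the `tensor2vid` helper defined above: a decoded video tensor laid out as `(batch, channels, frames, height, width)` is split per sample, permuted to frame-major order, and post-processed frame by frame; the uint8 conversion below stands in for `processor.postprocess(...)`.

```py
import numpy as np
import torch

video = torch.rand(2, 3, 8, 64, 64)  # batch=2, RGB, 8 frames of 64x64, values in [0, 1]

outputs = []
for batch_idx in range(video.shape[0]):
    batch_vid = video[batch_idx].permute(1, 0, 2, 3)    # (frames, channels, h, w)
    frames = (batch_vid * 255).round().to(torch.uint8)  # stand-in for processor.postprocess(batch_vid, "np")
    outputs.append(frames.permute(0, 2, 3, 1).numpy())  # (frames, h, w, channels)

print(np.stack(outputs).shape)  # (2, 8, 64, 64, 3), i.e. output_type="np"
```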
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + def encode_prompt( + self, + prompt, + device, + num_videos_per_prompt, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_videos_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. 
The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. + prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_videos_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if self.do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + # Apply clip_skip to negative prompt embeds + if clip_skip is None: + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + else: + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + negative_prompt_embeds = negative_prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
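An illustrative sketch of the `clip_skip` indexing described in the comments above (the checkpoint id is an assumption): `hidden_states[-1]` is the last encoder layer, so `hidden_states[-(clip_skip + 1)]` skips `clip_skip` layers, after which the text model's `final_layer_norm` is applied manually.

```py
import torch
from transformers import CLIPTextModel, CLIPTokenizer

model_id = "openai/clip-vit-large-patch14"  # assumption: any CLIP text encoder works here
tokenizer = CLIPTokenizer.from_pretrained(model_id)
text_encoder = CLIPTextModel.from_pretrained(model_id)

clip_skip = 2
inputs = tokenizer(["a photo of a cat"], padding="max_length", return_tensors="pt")
with torch.no_grad():
    out = text_encoder(inputs.input_ids, output_hidden_states=True)

skipped = out.hidden_states[-(clip_skip + 1)]                    # skip the last `clip_skip` layers
skipped = text_encoder.text_model.final_layer_norm(skipped)      # re-apply the final LayerNorm by hand
print(skipped.shape)  # (1, 77, 768) for this checkpoint
```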
+ negative_prompt_embeds = self.text_encoder.text_model.final_layer_norm(negative_prompt_embeds) + + if self.do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds, negative_prompt_embeds + + def _encode_image(self, image, device, num_videos_per_prompt): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.image_processor.pil_to_numpy(image) + image = self.image_processor.numpy_to_pt(image) + + # Normalize the image with CLIP training stats. + image = self.feature_extractor( + images=image, + do_normalize=True, + do_center_crop=False, + do_resize=False, + do_rescale=False, + return_tensors="pt", + ).pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeddings = self.image_encoder(image).image_embeds + image_embeddings = image_embeddings.unsqueeze(1) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) + + if self.do_classifier_free_guidance: + negative_image_embeddings = torch.zeros_like(image_embeddings) + image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) + + return image_embeddings + + def decode_latents(self, latents, decode_chunk_size=None): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + if decode_chunk_size is not None: + frames = [] + for i in range(0, latents.shape[0], decode_chunk_size): + frame = self.vae.decode(latents[i : i + decode_chunk_size]).sample + frames.append(frame) + image = torch.cat(frames, dim=0) + else: + image = self.vae.decode(latents).sample + + decode_shape = (batch_size, num_frames, -1) + image.shape[2:] + video = image[None, :].reshape(decode_shape).permute(0, 2, 1, 3, 4) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
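To make the chunked decoding in `decode_latents` above concrete, here is a small sketch that pushes a batch of latent frames through a stand-in decoder a few frames at a time; the dummy `decode` function and the shapes are placeholders for `self.vae.decode(...).sample`.

```py
import torch

def decode(latents):
    # Stand-in for the VAE decoder: pretend it upsamples each latent by 8x.
    b, c, h, w = latents.shape
    return torch.zeros(b, 3, h * 8, w * 8)

def decode_in_chunks(latents, decode_chunk_size=None):
    # `latents` is already flattened to (batch * num_frames, channels, h, w).
    if decode_chunk_size is None:
        return decode(latents)
    frames = [
        decode(latents[i : i + decode_chunk_size])
        for i in range(0, latents.shape[0], decode_chunk_size)
    ]
    # Concatenating along dim=0 restores the full (batch * num_frames) axis.
    return torch.cat(frames, dim=0)

latents = torch.randn(2 * 16, 4, 88, 160)  # batch=2, num_frames=16
video = decode_in_chunks(latents, decode_chunk_size=4)
print(video.shape)  # torch.Size([32, 3, 704, 1280])
```

Smaller `decode_chunk_size` values keep peak memory low at the cost of more decoder calls; the concatenated result is identical either way.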
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + def prepare_image_latents( + self, + image, + device, + num_frames, + num_videos_per_prompt, + ): + image = image.to(device=device) + image_latents = self.vae.encode(image).latent_dist.sample() + image_latents = image_latents * self.vae.config.scaling_factor + + # Add frames dimension to image latents + image_latents = image_latents.unsqueeze(2) + + # Append a position mask for each subsequent frame + # after the intial image latent frame + frame_position_mask = [] + for frame_idx in range(num_frames - 1): + scale = (frame_idx + 1) / (num_frames - 1) + frame_position_mask.append(torch.ones_like(image_latents[:, :, :1]) * scale) + if frame_position_mask: + frame_position_mask = torch.cat(frame_position_mask, dim=2) + image_latents = torch.cat([image_latents, frame_position_mask], dim=2) + + # duplicate image_latents for each generation per prompt, using mps friendly method + image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1, 1) + + if self.do_classifier_free_guidance: + image_latents = torch.cat([image_latents] * 2) + + return image_latents + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + height: Optional[int] = 704, + width: Optional[int] = 1280, + target_fps: Optional[int] = 16, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + num_videos_per_prompt: Optional[int] = 1, + decode_chunk_size: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = 1, + ): + r""" + The call function to the pipeline for image-to-video generation with [`I2VGenXLPipeline`]. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + Image or images to guide image generation. If you provide a tensor, it needs to be compatible with + [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + target_fps (`int`, *optional*): + Frames per second. The rate at which the generated images shall be exported to a video after generation. This is also used as a "micro-condition" while generation. + num_frames (`int`, *optional*): + The number of video frames to generate. + num_inference_steps (`int`, *optional*): + The number of denoising steps. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + num_videos_per_prompt (`int`, *optional*): + The number of images to generate per prompt. + decode_chunk_size (`int`, *optional*): + The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency + between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once + for maximal quality. Reduce `decode_chunk_size` to reduce memory usage. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + + Examples: + + Returns: + [`pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`pipelines.i2vgen_xl.pipeline_i2vgen_xl.I2VGenXLPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, image, height, width, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + self._guidance_scale = guidance_scale + + # 3.1 Encode input text prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 3.2 Encode image prompt + # 3.2.1 Image encodings. + # https://github.com/ali-vilab/i2vgen-xl/blob/2539c9262ff8a2a22fa9daecbfd13f0a2dbc32d0/tools/inferences/inference_i2vgen_entrance.py#L114 + cropped_image = _center_crop_wide(image, (width, width)) + cropped_image = _resize_bilinear( + cropped_image, (self.feature_extractor.crop_size["width"], self.feature_extractor.crop_size["height"]) + ) + image_embeddings = self._encode_image(cropped_image, device, num_videos_per_prompt) + + # 3.2.2 Image latents. + resized_image = _center_crop_wide(image, (width, height)) + image = self.image_processor.preprocess(resized_image).to(device=device, dtype=image_embeddings.dtype) + image_latents = self.prepare_image_latents( + image, + device=device, + num_frames=num_frames, + num_videos_per_prompt=num_videos_per_prompt, + ) + + # 3.3 Prepare additional conditions for the UNet. + if self.do_classifier_free_guidance: + fps_tensor = torch.tensor([target_fps, target_fps]).to(device) + else: + fps_tensor = torch.tensor([target_fps]).to(device) + fps_tensor = fps_tensor.repeat(batch_size * num_videos_per_prompt, 1).ravel() + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + fps=fps_tensor, + image_latents=image_latents, + image_embeddings=image_embeddings, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + batch_size, channel, frames, width, height = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(batch_size * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # reshape latents back + latents = latents[None, :].reshape(batch_size, frames, channel, width, height).permute(0, 2, 1, 3, 4) + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 8. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents, decode_chunk_size=decode_chunk_size) + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # 9. Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return I2VGenXLPipelineOutput(frames=video) + + +# The following utilities are taken and adapted from +# https://github.com/ali-vilab/i2vgen-xl/blob/main/utils/transforms.py. + + +def _convert_pt_to_pil(image: Union[torch.Tensor, List[torch.Tensor]]): + if isinstance(image, list) and isinstance(image[0], torch.Tensor): + image = torch.cat(image, 0) + + if isinstance(image, torch.Tensor): + if image.ndim == 3: + image = image.unsqueeze(0) + + image_numpy = VaeImageProcessor.pt_to_numpy(image) + image_pil = VaeImageProcessor.numpy_to_pil(image_numpy) + image = image_pil + + return image + + +def _resize_bilinear( + image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], resolution: Tuple[int, int] +): + # First convert the images to PIL in case they are float tensors (only relevant for tests now). 
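The denoising loop above flattens the frame axis so that a 2D-image scheduler can step on video latents, then restores the video layout. The toy round trip below, with made-up shapes and an identity stand-in for `scheduler.step`, shows that the reshape is lossless.

```py
import torch

batch, channels, frames, height, width = 1, 4, 16, 88, 160
latents = torch.randn(batch, channels, frames, height, width)

# (b, c, f, h, w) -> (b * f, c, h, w): schedulers expect 4D image latents.
flat = latents.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)

stepped = flat  # stand-in for `self.scheduler.step(...).prev_sample`

# (b * f, c, h, w) -> (b, c, f, h, w): restore the video layout.
restored = stepped[None, :].reshape(batch, frames, channels, height, width).permute(0, 2, 1, 3, 4)

assert torch.equal(restored, latents)  # the round trip preserves every value
```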
+ image = _convert_pt_to_pil(image) + + if isinstance(image, list): + image = [u.resize(resolution, PIL.Image.BILINEAR) for u in image] + else: + image = image.resize(resolution, PIL.Image.BILINEAR) + return image + + +def _center_crop_wide( + image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], resolution: Tuple[int, int] +): + # First convert the images to PIL in case they are float tensors (only relevant for tests now). + image = _convert_pt_to_pil(image) + + if isinstance(image, list): + scale = min(image[0].size[0] / resolution[0], image[0].size[1] / resolution[1]) + image = [u.resize((round(u.width // scale), round(u.height // scale)), resample=PIL.Image.BOX) for u in image] + + # center crop + x1 = (image[0].width - resolution[0]) // 2 + y1 = (image[0].height - resolution[1]) // 2 + image = [u.crop((x1, y1, x1 + resolution[0], y1 + resolution[1])) for u in image] + return image + else: + scale = min(image.size[0] / resolution[0], image.size[1] / resolution[1]) + image = image.resize((round(image.width // scale), round(image.height // scale)), resample=PIL.Image.BOX) + x1 = (image.width - resolution[0]) // 2 + y1 = (image.height - resolution[1]) // 2 + image = image.crop((x1, y1, x1 + resolution[0], y1 + resolution[1])) + return image diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/__init__.py new file mode 100644 index 000000000..606f7b378 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/__init__.py @@ -0,0 +1,66 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_kandinsky"] = ["KandinskyPipeline"] + _import_structure["pipeline_kandinsky_combined"] = [ + "KandinskyCombinedPipeline", + "KandinskyImg2ImgCombinedPipeline", + "KandinskyInpaintCombinedPipeline", + ] + _import_structure["pipeline_kandinsky_img2img"] = ["KandinskyImg2ImgPipeline"] + _import_structure["pipeline_kandinsky_inpaint"] = ["KandinskyInpaintPipeline"] + _import_structure["pipeline_kandinsky_prior"] = ["KandinskyPriorPipeline", "KandinskyPriorPipelineOutput"] + _import_structure["text_encoder"] = ["MultilingualCLIP"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .pipeline_kandinsky import KandinskyPipeline + from .pipeline_kandinsky_combined import ( + KandinskyCombinedPipeline, + KandinskyImg2ImgCombinedPipeline, + KandinskyInpaintCombinedPipeline, + ) + from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline + from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline, 
KandinskyPriorPipelineOutput + from .text_encoder import MultilingualCLIP + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py new file mode 100644 index 000000000..34b5a47c2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -0,0 +1,407 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional, Union + +import torch +from transformers import ( + XLMRobertaTokenizer, +) + +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDIMScheduler, DDPMScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .text_encoder import MultilingualCLIP + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior") + >>> pipe_prior.to("cuda") + + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.image_embeds + >>> negative_image_emb = out.negative_image_embeds + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") + >>> pipe.to("cuda") + + >>> image = pipe( + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds=negative_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +class KandinskyPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. 
+ unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + model_cpu_offload_seq = "text_encoder->unet->movq" + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + movq: VQModel, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + truncation=True, + max_length=77, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
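As a standalone illustration of the classifier-free guidance weight described above, the snippet below combines an unconditional and a conditional noise prediction; the random tensors are placeholders for real UNet outputs on a doubled batch.

```py
import torch

guidance_scale = 4.0

# The UNet is run on a doubled batch ordered [unconditional, conditional].
noise_pred = torch.randn(2, 4, 96, 96)

noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# guidance_scale == 1 reduces this to the conditional prediction alone;
# larger values push the sample further toward the text condition.
print(guided.shape)  # torch.Size([1, 4, 96, 96])
```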
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.unet.config.in_channels + + height, width = get_new_h_w(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py new file mode 100644 index 000000000..da5ff52ed --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py @@ -0,0 +1,814 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, List, Optional, Union + +import PIL.Image +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, + XLMRobertaTokenizer, +) + +from ...models import PriorTransformer, UNet2DConditionModel, VQModel +from ...schedulers import DDIMScheduler, DDPMScheduler, UnCLIPScheduler +from ...utils import ( + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from .pipeline_kandinsky import KandinskyPipeline +from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline +from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline +from .pipeline_kandinsky_prior import KandinskyPriorPipeline +from .text_encoder import MultilingualCLIP + + +TEXT2IMAGE_EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipe = AutoPipelineForText2Image.from_pretrained( + "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ) + pipe.enable_model_cpu_offload() + + prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" + + image = pipe(prompt=prompt, num_inference_steps=25).images[0] + ``` +""" + +IMAGE2IMAGE_EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import AutoPipelineForImage2Image + import torch + import requests + from io import BytesIO + from PIL import Image + import os + + pipe = AutoPipelineForImage2Image.from_pretrained( + "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ) + pipe.enable_model_cpu_offload() + + prompt = "A fantasy landscape, Cinematic lighting" + negative_prompt = "low quality, bad quality" + + url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + + response = requests.get(url) + image = Image.open(BytesIO(response.content)).convert("RGB") + image.thumbnail((768, 768)) + + image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0] + ``` +""" + +INPAINT_EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import AutoPipelineForInpainting + from diffusers.utils import load_image + import torch + import numpy as np + + pipe = AutoPipelineForInpainting.from_pretrained( + "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16 + ) + pipe.enable_model_cpu_offload() + + prompt = "A fantasy landscape, 
Cinematic lighting" + negative_prompt = "low quality, bad quality" + + original_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + + mask = np.zeros((768, 768), dtype=np.float32) + # Let's mask out an area above the cat's head + mask[:250, 250:-250] = 1 + + image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0] + ``` +""" + + +class KandinskyCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + prior_prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. 
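Building on the inpainting example above, here is a small sketch of how that mask array is constructed; following that example, pixels set to 1 mark the region the pipeline should repaint and 0 keeps the original image.

```py
import numpy as np

height, width = 768, 768
mask = np.zeros((height, width), dtype=np.float32)

# Repaint a horizontal band near the top, leaving a 250-pixel margin on each
# side (the same region as in the example above, i.e. above the cat's head).
mask[:250, 250:-250] = 1

print(mask.sum() / mask.size)  # fraction of the image that will be regenerated
```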
+ """ + + _load_connected_pipes = True + model_cpu_offload_seq = "text_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoder" + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + movq: VQModel, + prior_prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + prior_image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + prior_prior=prior_prior, + prior_image_encoder=prior_image_encoder, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_processor=prior_image_processor, + ) + self.prior_pipe = KandinskyPriorPipeline( + prior=prior_prior, + image_encoder=prior_image_encoder, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_processor=prior_image_processor, + ) + self.decoder_pipe = KandinskyPipeline( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗 + Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a + GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis. + Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower. + """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.enable_model_cpu_offload() + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + height: int = 512, + width: int = 512, + prior_guidance_scale: float = 4.0, + prior_num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. 
Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + prior_num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
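The combined pipeline's `__call__` below simply chains the two stages that the standalone example earlier in this file runs by hand: the prior maps text to CLIP image embeddings, and the decoder denoises in MoVQ latent space conditioned on them. A rough sketch of that manual flow, reusing the checkpoint ids from that example, looks like this.

```py
import torch
from diffusers import KandinskyPriorPipeline, KandinskyPipeline

# Stage 1: text -> CLIP image embeddings.
prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/Kandinsky-2-1-prior", torch_dtype=torch.float16
).to("cuda")
out = prior("red cat, 4k photo")
image_emb, negative_image_emb = out.image_embeds, out.negative_image_embeds

# Stage 2: embeddings -> image via the MoVQ decoder pipeline.
decoder = KandinskyPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
).to("cuda")
image = decoder(
    "red cat, 4k photo",
    image_embeds=image_emb,
    negative_image_embeds=negative_image_emb,
    height=768,
    width=768,
    num_inference_steps=100,
).images[0]
image.save("cat.png")
```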
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + prior_outputs = self.prior_pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=prior_num_inference_steps, + generator=generator, + latents=latents, + guidance_scale=prior_guidance_scale, + output_type="pt", + return_dict=False, + ) + image_embeds = prior_outputs[0] + negative_image_embeds = prior_outputs[1] + + prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt + + if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0: + prompt = (image_embeds.shape[0] // len(prompt)) * prompt + + outputs = self.decoder_pipe( + prompt=prompt, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, + width=width, + height=height, + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=guidance_scale, + output_type=output_type, + callback=callback, + callback_steps=callback_steps, + return_dict=return_dict, + ) + + self.maybe_free_model_hooks() + + return outputs + + +class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + prior_prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. 
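The batch-alignment step above, which repeats `prompt` until it matches the prior's embedding batch, can be tried in isolation; the numbers below are arbitrary.

```py
prompt = ["a lion in galaxies"]   # a single prompt
image_embeds_batch = 4            # e.g. the prior returned num_images_per_prompt=4 embeddings

if len(prompt) < image_embeds_batch and image_embeds_batch % len(prompt) == 0:
    prompt = (image_embeds_batch // len(prompt)) * prompt

print(len(prompt))  # 4 identical prompts, one per image embedding
```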
+ """ + + _load_connected_pipes = True + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->" "text_encoder->unet->movq" + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + movq: VQModel, + prior_prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + prior_image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + prior_prior=prior_prior, + prior_image_encoder=prior_image_encoder, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_processor=prior_image_processor, + ) + self.prior_pipe = KandinskyPriorPipeline( + prior=prior_prior, + image_encoder=prior_image_encoder, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_processor=prior_image_processor, + ) + self.decoder_pipe = KandinskyImg2ImgPipeline( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.enable_model_cpu_offload() + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(IMAGE2IMAGE_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + strength: float = 0.3, + height: int = 512, + width: int = 512, + prior_guidance_scale: float = 4.0, + prior_num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. 
+ + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded + again. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 0.3): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + prior_num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. 
Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + prior_outputs = self.prior_pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=prior_num_inference_steps, + generator=generator, + latents=latents, + guidance_scale=prior_guidance_scale, + output_type="pt", + return_dict=False, + ) + image_embeds = prior_outputs[0] + negative_image_embeds = prior_outputs[1] + + prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt + image = [image] if isinstance(prompt, PIL.Image.Image) else image + + if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0: + prompt = (image_embeds.shape[0] // len(prompt)) * prompt + + if ( + isinstance(image, (list, tuple)) + and len(image) < image_embeds.shape[0] + and image_embeds.shape[0] % len(image) == 0 + ): + image = (image_embeds.shape[0] // len(image)) * image + + outputs = self.decoder_pipe( + prompt=prompt, + image=image, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, + strength=strength, + width=width, + height=height, + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=guidance_scale, + output_type=output_type, + callback=callback, + callback_steps=callback_steps, + return_dict=return_dict, + ) + + self.maybe_free_model_hooks() + + return outputs + + +class KandinskyInpaintCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + prior_prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. 
+ """ + + _load_connected_pipes = True + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movq" + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, DDPMScheduler], + movq: VQModel, + prior_prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + prior_image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + prior_prior=prior_prior, + prior_image_encoder=prior_image_encoder, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_processor=prior_image_processor, + ) + self.prior_pipe = KandinskyPriorPipeline( + prior=prior_prior, + image_encoder=prior_image_encoder, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_processor=prior_image_processor, + ) + self.decoder_pipe = KandinskyInpaintPipeline( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. 
+ """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.enable_model_cpu_offload() + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(INPAINT_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + height: int = 512, + width: int = 512, + prior_guidance_scale: float = 4.0, + prior_num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded + again. + mask_image (`np.array`): + Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while + black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single + channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, + so the expected shape would be `(B, H, W, 1)`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+ prior_num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + prior_outputs = self.prior_pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=prior_num_inference_steps, + generator=generator, + latents=latents, + guidance_scale=prior_guidance_scale, + output_type="pt", + return_dict=False, + ) + image_embeds = prior_outputs[0] + negative_image_embeds = prior_outputs[1] + + prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt + image = [image] if isinstance(prompt, PIL.Image.Image) else image + mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image + + if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0: + prompt = (image_embeds.shape[0] // len(prompt)) * prompt + + if ( + isinstance(image, (list, tuple)) + and len(image) < image_embeds.shape[0] + and image_embeds.shape[0] % len(image) == 0 + ): + image = (image_embeds.shape[0] // len(image)) * image + + if ( + isinstance(mask_image, (list, tuple)) + and len(mask_image) < image_embeds.shape[0] + and image_embeds.shape[0] % len(mask_image) == 0 + ): + mask_image = (image_embeds.shape[0] // len(mask_image)) * mask_image + + outputs = self.decoder_pipe( + prompt=prompt, + image=image, + mask_image=mask_image, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, + width=width, + height=height, + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=guidance_scale, + output_type=output_type, + callback=callback, + callback_steps=callback_steps, + return_dict=return_dict, + ) + + self.maybe_free_model_hooks() + + return outputs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py new file mode 100644 index 000000000..4d091e7d7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -0,0 +1,500 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
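Before moving on to the standalone img2img pipeline below, note that both combined `__call__` implementations above rely on the same batch-broadcasting rule: the prior pipeline returns one image embedding per sample, and the `prompt`, `image`, and `mask_image` lists are tiled until their length matches `image_embeds.shape[0]`, but only when the embedding count is an exact multiple of the list length. The sketch below restates that rule with a hypothetical `broadcast_to_embeds` helper; it is not part of this diff, it only mirrors the logic shown in the combined pipelines.

```py
# Hypothetical helper (not part of the diff) illustrating the tiling rule used in the
# combined pipelines' __call__ methods above.
def broadcast_to_embeds(items, num_embeds):
    # Wrap a single prompt/image/mask into a one-element list, as the pipeline code does.
    items = list(items) if isinstance(items, (list, tuple)) else [items]
    # Tile the list only when the embedding count is an exact multiple of its length;
    # otherwise the list is left unchanged.
    if len(items) < num_embeds and num_embeds % len(items) == 0:
        items = (num_embeds // len(items)) * items
    return items


# Two prompts with num_images_per_prompt=2 give four image embeddings from the prior.
assert broadcast_to_embeds(["a cat", "a dog"], 4) == ["a cat", "a dog", "a cat", "a dog"]
# A single prompt is repeated once per embedding.
assert broadcast_to_embeds("a red cartoon frog, 4k", 3) == ["a red cartoon frog, 4k"] * 3
```

The tiled lists then line up one-to-one with `image_embeds` and `negative_image_embeds` when they are forwarded to the decoder pipeline.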
+ +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from PIL import Image +from transformers import ( + XLMRobertaTokenizer, +) + +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDIMScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .text_encoder import MultilingualCLIP + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "A red cartoon frog, 4k" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyImg2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/frog.png" + ... ) + + >>> image = pipe( + ... prompt, + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, + ... ).images + + >>> image[0].save("red_frog.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. 
+ movq ([`VQModel`]): + MoVQ image encoder and decoder + """ + + model_cpu_offload_seq = "text_encoder->unet->movq" + + def __init__( + self, + text_encoder: MultilingualCLIP, + movq: VQModel, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, latents, latent_timestep, shape, dtype, device, generator, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + + shape = latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + latents = self.add_noise(latents, noise, latent_timestep) + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_cumprod = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + + return noisy_samples + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + strength: float = 0.3, + guidance_scale: float = 7.0, + num_images_per_prompt: int = 1, + generator: 
Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + strength (`float`, *optional*, defaults to 0.3): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + # 1. Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + # 2. get text and image embeddings + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) + + # 3. pre-processing initial image + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=prompt_embeds.dtype, device=device) + + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + + # 4. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + + timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + + # the formular to calculate timestep for add_noise is taken from the original kandinsky repo + latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2 + + latent_timestep = torch.tensor([latent_timestep] * batch_size, dtype=timesteps_tensor.dtype, device=device) + + num_channels_latents = self.unet.config.in_channels + + height, width = get_new_h_w(height, width, self.movq_scale_factor) + + # 5. Create initial latent + latents = self.prepare_latents( + latents, + latent_timestep, + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + self.scheduler, + ) + + # 6. 
Denoising loop + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 7. post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py new file mode 100644 index 000000000..d8d9e96e6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -0,0 +1,635 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from packaging import version +from PIL import Image +from transformers import ( + XLMRobertaTokenizer, +) + +from ... 
import __version__ +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDIMScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .text_encoder import MultilingualCLIP + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + >>> import numpy as np + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "a hat" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyInpaintPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> mask = np.zeros((768, 768), dtype=np.float32) + >>> mask[:250, 250:-250] = 1 + + >>> out = pipe( + ... prompt, + ... image=init_image, + ... mask_image=mask, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ) + + >>> image = out.images[0] + >>> image.save("cat_with_hat.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +def prepare_mask(masks): + prepared_masks = [] + for mask in masks: + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + prepared_masks.append(mask) + return torch.stack(prepared_masks, dim=0) + + +def prepare_mask_and_masked_image(image, mask, height, width): + r""" + Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for + the ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. 
+ height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. + """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = 
torch.from_numpy(mask) + + mask = 1 - mask + + return mask, image + + +class KandinskyInpaintPipeline(DiffusionPipeline): + """ + Pipeline for text-guided image inpainting using Kandinsky2.1 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ image encoder and decoder + """ + + model_cpu_offload_seq = "text_encoder->unet->movq" + + def __init__( + self, + text_encoder: MultilingualCLIP, + movq: VQModel, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + movq=movq, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + self._warn_has_been_called = False + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + 
f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): + `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. 
You can pass a pytorch tensor as mask only if the + image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the + expected shape would be either `(B, 1, H, W)`, `(B, H, W)`, `(1, H, W)` or `(H, W)`. If `image` is a PIL + image or numpy array, `mask` should also be either a PIL image or a numpy array. If it is a PIL image, it + will be converted to a single channel (luminance) before use. If it is a numpy array, the expected + shape is `(H, W)`. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for the text prompt, which will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for the negative text prompt, which will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + if not self._warn_has_been_called and version.parse(version.parse(__version__).base_version) < version.parse( + "0.23.0.dev0" + ): + logger.warning( + "Please note that the expected format of `mask_image` has recently been changed. " + "Before diffusers == 0.19.0, Kandinsky Inpainting pipelines repainted black pixels and preserved black pixels. " + "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. " + "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. " + "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. " + "This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0" + ) + self._warn_has_been_called = True + + # Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) + + # preprocess image and mask + mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) + + image = image.to(dtype=prompt_embeds.dtype, device=device) + image = self.movq.encode(image)["latents"] + + mask_image = mask_image.to(dtype=prompt_embeds.dtype, device=device) + + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image) + masked_image = image * mask_image + + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + mask_image = mask_image.repeat(2, 1, 1, 1) + masked_image = masked_image.repeat(2, 1, 1, 1) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.movq.config.latent_channels + + # get h, w for latents + sample_height, sample_width = get_new_h_w(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, sample_height, sample_width), + text_encoder_hidden_states.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # Check that sizes of mask, masked image and latents match with expected + num_channels_mask = mask_image.shape[1] + num_channels_masked_image = masked_image.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != 
self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py new file mode 100644 index 000000000..0d9f54349 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -0,0 +1,547 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...schedulers import UnCLIPScheduler +from ...utils import ( + BaseOutput, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior") + >>> pipe_prior.to("cuda") + + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.image_embeds + >>> negative_image_emb = out.negative_image_embeds + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") + >>> pipe.to("cuda") + + >>> image = pipe( + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds=negative_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline + >>> from diffusers.utils import load_image + >>> import PIL + + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) + >>> pipe.to("cuda") + + >>> image = pipe( + ... "", + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, + ... ).images[0] + + >>> image.save("starry_cat.png") + ``` +""" + + +@dataclass +class KandinskyPriorPipelineOutput(BaseOutput): + """ + Output class for KandinskyPriorPipeline. + + Args: + image_embeds (`torch.FloatTensor`) + clip image embeddings for text prompt + negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) + clip image embeddings for unconditional tokens + """ + + image_embeds: Union[torch.FloatTensor, np.ndarray] + negative_image_embeds: Union[torch.FloatTensor, np.ndarray] + + +class KandinskyPriorPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
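For orientation, a minimal sketch of the weighted mix of CLIP image embeddings that `interpolate` builds from `images_and_prompts` and `weights`; the embedding width and the random tensors are stand-ins for real encoder outputs, not values from this pipeline.

```py
import torch

embed_dim = 768  # illustrative; the real size comes from the image encoder config
conds = [torch.randn(1, embed_dim) for _ in range(3)]  # stand-ins for encoded text/image conditions
weights = [0.3, 0.3, 0.4]

# each condition is scaled by its weight and the scaled embeddings are summed
# into a single one-row conditioning vector
weighted = [emb * w for emb, w in zip(conds, weights)]
image_emb = torch.cat(weighted).sum(dim=0, keepdim=True)
print(image_emb.shape)  # torch.Size([1, 768])
```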
+ + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + _exclude_from_cpu_offload = ["prior"] + model_cpu_offload_seq = "text_encoder->prior" + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: str = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + if isinstance(cond, PIL.Image.Image): + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(cond)["image_embeds"] + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True) + + out_zero = self( + negative_prompt, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ) + zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, 
self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
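As a hedged illustration of the classifier-free guidance arithmetic that `guidance_scale` controls (the prior applies it to the predicted image embeddings), the sketch below uses random tensors as stand-ins for the unconditional and text-conditioned predictions; shapes are illustrative.

```py
import torch

guidance_scale = 4.0
batch, dim = 2, 768  # illustrative shapes

# stand-ins for the two halves of a single batched forward pass
pred_uncond = torch.randn(batch, dim)
pred_text = torch.randn(batch, dim)

# stacking and splitting mirrors how the pipeline batches the unconditional and
# conditional inputs, runs one forward pass, then recombines the two halves
stacked = torch.cat([pred_uncond, pred_text], dim=0)
pred_uncond, pred_text = stacked.chunk(2)

guided = pred_uncond + guidance_scale * (pred_text - pred_uncond)
print(guided.shape)  # torch.Size([2, 768])
```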
+ + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + prompt = [prompt] + elif not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + + device = self._execution_device + + batch_size = len(prompt) + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + prior_timesteps_tensor = self.scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + + # if negative prompt has been defined, we retrieve split the image embedding into two + if negative_prompt is None: + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + + self.maybe_free_model_hooks() + else: + image_embeddings, zero_embeds = image_embeddings.chunk(2) + + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.prior_hook.offload() + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + + if not return_dict: + return (image_embeddings, zero_embeds) + + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py new file mode 100644 index 000000000..caa0029f0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -0,0 +1,27 @@ +import torch +from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel + + +class MCLIPConfig(XLMRobertaConfig): + model_type = "M-CLIP" + + def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs): + self.transformerDimensions = transformerDimSize + self.numDims = imageDimSize + super().__init__(**kwargs) + + +class MultilingualCLIP(PreTrainedModel): + config_class = MCLIPConfig + + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.transformer = XLMRobertaModel(config) + self.LinearTransformation = torch.nn.Linear( + in_features=config.transformerDimensions, out_features=config.numDims + ) + + def forward(self, input_ids, attention_mask): + embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] + embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] + return self.LinearTransformation(embs2), embs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py new file mode 100644 index 000000000..67e97f161 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -0,0 +1,70 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_kandinsky2_2"] = ["KandinskyV22Pipeline"] + _import_structure["pipeline_kandinsky2_2_combined"] = [ + "KandinskyV22CombinedPipeline", + "KandinskyV22Img2ImgCombinedPipeline", + "KandinskyV22InpaintCombinedPipeline", + ] + _import_structure["pipeline_kandinsky2_2_controlnet"] = ["KandinskyV22ControlnetPipeline"] + _import_structure["pipeline_kandinsky2_2_controlnet_img2img"] = ["KandinskyV22ControlnetImg2ImgPipeline"] + _import_structure["pipeline_kandinsky2_2_img2img"] = ["KandinskyV22Img2ImgPipeline"] + _import_structure["pipeline_kandinsky2_2_inpainting"] = ["KandinskyV22InpaintPipeline"] + _import_structure["pipeline_kandinsky2_2_prior"] = ["KandinskyV22PriorPipeline"] + _import_structure["pipeline_kandinsky2_2_prior_emb2emb"] = ["KandinskyV22PriorEmb2EmbPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_kandinsky2_2 import KandinskyV22Pipeline + from .pipeline_kandinsky2_2_combined import ( + KandinskyV22CombinedPipeline, + KandinskyV22Img2ImgCombinedPipeline, + 
KandinskyV22InpaintCombinedPipeline, + ) + from .pipeline_kandinsky2_2_controlnet import KandinskyV22ControlnetPipeline + from .pipeline_kandinsky2_2_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline + from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline + from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline + from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline + from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py new file mode 100644 index 000000000..4b977af0d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -0,0 +1,320 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDPMScheduler +from ...utils import deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior.to("cuda") + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.image_embeds + >>> zero_image_emb = out.negative_image_embeds + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images + >>> image[0].save("cat.png") + ``` +""" + + +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +class KandinskyV22Pipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + model_cpu_offload_seq = "unet->movq" + _callback_tensor_inputs = ["latents", "image_embeds", "negative_image_embeds"] + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
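A sketch of a `callback_on_step_end` hook matching the signature documented above; the function name `log_latents` and the commented usage are illustrative, and `pipe` is assumed to be a loaded `KandinskyV22Pipeline` with embeddings from a prior pipeline.

```py
def log_latents(pipe, step: int, timestep: int, callback_kwargs: dict) -> dict:
    # receives the tensors requested via `callback_on_step_end_tensor_inputs`
    latents = callback_kwargs["latents"]
    print(f"step {step:4d}  t={int(timestep)}  latents mean={latents.mean().item():+.4f}")
    # returned entries overwrite the corresponding tensors in the denoising loop
    return {"latents": latents}

# hypothetical usage, assuming `pipe`, `image_emb` and `zero_image_emb` exist:
# images = pipe(
#     image_embeds=image_emb,
#     negative_image_embeds=zero_image_emb,
#     callback_on_step_end=log_latents,
#     callback_on_step_end_tensor_inputs=["latents"],
# ).images
```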
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + device = self._execution_device + + self._guidance_scale = guidance_scale + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] * num_images_per_prompt + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if self.do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + num_channels_latents = self.unet.config.in_channels + + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + self._num_timesteps = len(timesteps) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if self.do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + image_embeds = 
callback_outputs.pop("image_embeds", image_embeds) + negative_image_embeds = callback_outputs.pop("negative_image_embeds", negative_image_embeds) + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if output_type not in ["pt", "np", "pil", "latent"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if not output_type == "latent": + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + else: + image = latents + + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py new file mode 100644 index 000000000..65ba22cd6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py @@ -0,0 +1,851 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
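Before the combined classes defined below, a short sketch of the manual two-stage flow they wrap (prior, then decoder), mirroring the usage examples that appear elsewhere in this patch; the model ids and call pattern follow those examples rather than anything new.

```py
import torch
from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline

pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")

pipe = KandinskyV22Pipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)
pipe.to("cuda")

# stage 1: the prior maps the text prompt to CLIP image embeddings
out = pipe_prior("red cat, 4k photo")
image_emb = out.image_embeds
zero_image_emb = out.negative_image_embeds

# stage 2: the decoder (UNet + MoVQ) turns the embeddings into pixels
image = pipe(
    image_embeds=image_emb,
    negative_image_embeds=zero_image_emb,
    height=768,
    width=768,
    num_inference_steps=50,
).images[0]
image.save("cat.png")
```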
+ +from typing import Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer, UNet2DConditionModel, VQModel +from ...schedulers import DDPMScheduler, UnCLIPScheduler +from ...utils import deprecate, logging, replace_example_docstring +from ..pipeline_utils import DiffusionPipeline +from .pipeline_kandinsky2_2 import KandinskyV22Pipeline +from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline +from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline +from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +TEXT2IMAGE_EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import AutoPipelineForText2Image + import torch + + pipe = AutoPipelineForText2Image.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) + pipe.enable_model_cpu_offload() + + prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" + + image = pipe(prompt=prompt, num_inference_steps=25).images[0] + ``` +""" + +IMAGE2IMAGE_EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import AutoPipelineForImage2Image + import torch + import requests + from io import BytesIO + from PIL import Image + import os + + pipe = AutoPipelineForImage2Image.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) + pipe.enable_model_cpu_offload() + + prompt = "A fantasy landscape, Cinematic lighting" + negative_prompt = "low quality, bad quality" + + url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + + response = requests.get(url) + image = Image.open(BytesIO(response.content)).convert("RGB") + image.thumbnail((768, 768)) + + image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0] + ``` +""" + +INPAINT_EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import AutoPipelineForInpainting + from diffusers.utils import load_image + import torch + import numpy as np + + pipe = AutoPipelineForInpainting.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + ) + pipe.enable_model_cpu_offload() + + prompt = "A fantasy landscape, Cinematic lighting" + negative_prompt = "low quality, bad quality" + + original_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + + mask = np.zeros((768, 768), dtype=np.float32) + # Let's mask out an area above the cat's head + mask[:250, 250:-250] = 1 + + image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0] + ``` +""" + + +class KandinskyV22CombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. 
+ unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + prior_prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + prior_image_processor ([`CLIPImageProcessor`]): + A image_processor to be used to preprocess image from clip. + """ + + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq" + _load_connected_pipes = True + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + prior_prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + prior_image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + prior_prior=prior_prior, + prior_image_encoder=prior_image_encoder, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_processor=prior_image_processor, + ) + self.prior_pipe = KandinskyV22PriorPipeline( + prior=prior_prior, + image_encoder=prior_image_encoder, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_processor=prior_image_processor, + ) + self.decoder_pipe = KandinskyV22Pipeline( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. 
+ """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.enable_model_cpu_offload() + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + height: int = 512, + width: int = 512, + prior_guidance_scale: float = 4.0, + prior_num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"], + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + prior_num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + prior_callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference of the prior pipeline. + The function is called with the following arguments: `prior_callback_on_step_end(self: + DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. + prior_callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the + list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in + the `._callback_tensor_inputs` attribute of your prior pipeline class. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference of the decoder pipeline. + The function is called with the following arguments: `callback_on_step_end(self: DiffusionPipeline, + step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors + as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
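A hedged usage sketch showing how the prior-stage arguments (`prior_guidance_scale`, `prior_num_inference_steps`) and the decoder-stage arguments (`guidance_scale`, `num_inference_steps`) are passed side by side; the model id and prompt follow the text-to-image example at the top of this file, the remaining values are illustrative.

```py
import torch
from diffusers import AutoPipelineForText2Image

pipe = AutoPipelineForText2Image.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

image = pipe(
    prompt="A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k",
    negative_prompt="low quality, bad quality",
    prior_num_inference_steps=25,  # denoising steps of the prior stage
    prior_guidance_scale=4.0,      # classifier-free guidance of the prior stage
    num_inference_steps=50,        # denoising steps of the decoder stage
    guidance_scale=4.0,            # classifier-free guidance of the decoder stage
    height=768,
    width=768,
).images[0]
image.save("lion.png")
```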
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + prior_outputs = self.prior_pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=prior_num_inference_steps, + generator=generator, + latents=latents, + guidance_scale=prior_guidance_scale, + output_type="pt", + return_dict=False, + callback_on_step_end=prior_callback_on_step_end, + callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs, + ) + image_embeds = prior_outputs[0] + negative_image_embeds = prior_outputs[1] + + prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt + + if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0: + prompt = (image_embeds.shape[0] // len(prompt)) * prompt + + outputs = self.decoder_pipe( + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, + width=width, + height=height, + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=guidance_scale, + output_type=output_type, + callback=callback, + callback_steps=callback_steps, + return_dict=return_dict, + callback_on_step_end=callback_on_step_end, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + self.maybe_free_model_hooks() + + return outputs + + +class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + prior_prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + prior_image_processor ([`CLIPImageProcessor`]): + A image_processor to be used to preprocess image from clip. 
+ """ + + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq" + _load_connected_pipes = True + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + prior_prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + prior_image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + prior_prior=prior_prior, + prior_image_encoder=prior_image_encoder, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_processor=prior_image_processor, + ) + self.prior_pipe = KandinskyV22PriorPipeline( + prior=prior_prior, + image_encoder=prior_image_encoder, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_processor=prior_image_processor, + ) + self.decoder_pipe = KandinskyV22Img2ImgPipeline( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + self.prior_pipe.enable_model_cpu_offload() + self.decoder_pipe.enable_model_cpu_offload() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. 
+ """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.enable_model_cpu_offload() + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(IMAGE2IMAGE_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + height: int = 512, + width: int = 512, + prior_guidance_scale: float = 4.0, + prior_num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"], + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded + again. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + strength (`float`, *optional*, defaults to 0.3): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. 
+ num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + prior_num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
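A minimal end-to-end sketch of this combined call, reusing the checkpoint id, prompt, and image URL from the decoder img2img example later in this patch; loading that checkpoint through the combined class is an assumption.

```py
# Hedged sketch: one-call text-guided image-to-image with the combined pipeline.
import torch
from diffusers import KandinskyV22Img2ImgCombinedPipeline
from diffusers.utils import load_image

pipe = KandinskyV22Img2ImgCombinedPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
).to("cuda")

init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/kandinsky/frog.png"
)

image = pipe(
    prompt="A red cartoon frog, 4k",
    image=init_image,
    strength=0.3,              # how far to move away from `init_image`
    num_inference_steps=100,   # decoder steps before `strength` trimming
    prior_num_inference_steps=25,
    guidance_scale=4.0,
    prior_guidance_scale=4.0,
    height=768,
    width=768,
).images[0]
image.save("red_frog_img2img.png")
```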
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + prior_outputs = self.prior_pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=prior_num_inference_steps, + generator=generator, + latents=latents, + guidance_scale=prior_guidance_scale, + output_type="pt", + return_dict=False, + callback_on_step_end=prior_callback_on_step_end, + callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs, + ) + image_embeds = prior_outputs[0] + negative_image_embeds = prior_outputs[1] + + prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt + image = [image] if isinstance(prompt, PIL.Image.Image) else image + + if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0: + prompt = (image_embeds.shape[0] // len(prompt)) * prompt + + if ( + isinstance(image, (list, tuple)) + and len(image) < image_embeds.shape[0] + and image_embeds.shape[0] % len(image) == 0 + ): + image = (image_embeds.shape[0] // len(image)) * image + + outputs = self.decoder_pipe( + image=image, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, + width=width, + height=height, + strength=strength, + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=guidance_scale, + output_type=output_type, + callback=callback, + callback_steps=callback_steps, + return_dict=return_dict, + callback_on_step_end=callback_on_step_end, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + self.maybe_free_model_hooks() + return outputs + + +class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for inpainting generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + prior_prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + prior_tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + prior_scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + prior_image_processor ([`CLIPImageProcessor`]): + A image_processor to be used to preprocess image from clip. 
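As a small aside, the prompt/image broadcasting used by the combined `__call__` implementations above can be illustrated in isolation; the values below are toy stand-ins, not pipeline outputs.

```py
# Hedged illustration of the broadcasting rule in the combined __call__ above:
# prompts (and images) are tiled when the prior returns a larger, divisible batch.
prompt = ["A red cartoon frog, 4k"]
embeds_batch = 2  # e.g. num_images_per_prompt=2 gives image_embeds.shape[0] == 2

if len(prompt) < embeds_batch and embeds_batch % len(prompt) == 0:
    prompt = (embeds_batch // len(prompt)) * prompt

print(prompt)  # ['A red cartoon frog, 4k', 'A red cartoon frog, 4k']
```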
+ """ + + model_cpu_offload_seq = "prior_text_encoder->prior_image_encoder->unet->movq" + _load_connected_pipes = True + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + prior_prior: PriorTransformer, + prior_image_encoder: CLIPVisionModelWithProjection, + prior_text_encoder: CLIPTextModelWithProjection, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: UnCLIPScheduler, + prior_image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + prior_prior=prior_prior, + prior_image_encoder=prior_image_encoder, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + prior_image_processor=prior_image_processor, + ) + self.prior_pipe = KandinskyV22PriorPipeline( + prior=prior_prior, + image_encoder=prior_image_encoder, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_processor=prior_image_processor, + ) + self.decoder_pipe = KandinskyV22InpaintPipeline( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.enable_model_cpu_offload() + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(INPAINT_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + height: int = 512, + width: int = 512, + prior_guidance_scale: float = 4.0, + prior_num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"], + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. 
+ + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded + again. + mask_image (`np.array`): + Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while + black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single + channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, + so the expected shape would be `(B, H, W, 1)`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + prior_num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + prior_callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep: + int, callback_kwargs: Dict)`. + prior_callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the + list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in + the `._callback_tensor_inputs` attribute of your pipeline class. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + prior_kwargs = {} + if kwargs.get("prior_callback", None) is not None: + prior_kwargs["callback"] = kwargs.pop("prior_callback") + deprecate( + "prior_callback", + "1.0.0", + "Passing `prior_callback` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`", + ) + if kwargs.get("prior_callback_steps", None) is not None: + deprecate( + "prior_callback_steps", + "1.0.0", + "Passing `prior_callback_steps` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`", + ) + prior_kwargs["callback_steps"] = kwargs.pop("prior_callback_steps") + + prior_outputs = self.prior_pipe( + prompt=prompt, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=prior_num_inference_steps, + generator=generator, + latents=latents, + guidance_scale=prior_guidance_scale, + output_type="pt", + return_dict=False, + callback_on_step_end=prior_callback_on_step_end, + callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs, + **prior_kwargs, + ) + image_embeds = prior_outputs[0] + negative_image_embeds = prior_outputs[1] + + prompt = [prompt] if not isinstance(prompt, (list, tuple)) else prompt + image = [image] if isinstance(prompt, PIL.Image.Image) else image + mask_image = [mask_image] if isinstance(mask_image, PIL.Image.Image) else mask_image + + if len(prompt) < image_embeds.shape[0] and image_embeds.shape[0] % len(prompt) == 0: + prompt = (image_embeds.shape[0] // len(prompt)) * prompt + + if ( + isinstance(image, (list, tuple)) + and len(image) < image_embeds.shape[0] + and image_embeds.shape[0] % len(image) == 0 + ): + image = (image_embeds.shape[0] // len(image)) * image + + if ( + isinstance(mask_image, (list, tuple)) + and len(mask_image) < image_embeds.shape[0] + and image_embeds.shape[0] % len(mask_image) == 0 + ): + mask_image = (image_embeds.shape[0] // len(mask_image)) * mask_image + + outputs = 
self.decoder_pipe( + image=image, + mask_image=mask_image, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, + width=width, + height=height, + num_inference_steps=num_inference_steps, + generator=generator, + guidance_scale=guidance_scale, + output_type=output_type, + return_dict=return_dict, + callback_on_step_end=callback_on_step_end, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + **kwargs, + ) + self.maybe_free_model_hooks() + + return outputs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py new file mode 100644 index 000000000..de87dd3c3 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -0,0 +1,320 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDPMScheduler +from ...utils import ( + logging, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import numpy as np + + >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline + >>> from transformers import pipeline + >>> from diffusers.utils import load_image + + + >>> def make_hint(image, depth_estimator): + ... image = depth_estimator(image)["depth"] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... return hint + + + >>> depth_estimator = pipeline("depth-estimation") + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior = pipe_prior.to("cuda") + + >>> pipe = KandinskyV22ControlnetPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... 
).resize((768, 768)) + + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") + + >>> prompt = "A robot, 4k photo" + >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + + >>> generator = torch.Generator(device="cuda").manual_seed(43) + + >>> image_emb, zero_image_emb = pipe_prior( + ... prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator + ... ).to_tuple() + + >>> images = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... hint=hint, + ... num_inference_steps=50, + ... generator=generator, + ... height=768, + ... width=768, + ... ).images + + >>> images[0].save("robot_cat.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +class KandinskyV22ControlnetPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. 
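A quick numeric check of the `downscale_height_and_width` helper defined above (restated verbatim so the snippet runs standalone): it rounds the requested pixel size up to a multiple of `scale_factor**2` and returns the matching latent spatial size.

```py
# Worked example of the helper above; with the default scale_factor of 8,
# a 768x768 request maps to 96x96 latents, and 520x520 rounds up to 576x576.
def downscale_height_and_width(height, width, scale_factor=8):
    new_height = height // scale_factor**2
    if height % scale_factor**2 != 0:
        new_height += 1
    new_width = width // scale_factor**2
    if width % scale_factor**2 != 0:
        new_width += 1
    return new_height * scale_factor, new_width * scale_factor

print(downscale_height_and_width(768, 768))  # (96, 96)
print(downscale_height_and_width(520, 520))  # (72, 72) -> decoded output is 576x576
```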
+ """ + + model_cpu_offload_seq = "unet->movq" + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + hint: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + hint (`torch.FloatTensor`): + The controlnet condition. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + if isinstance(hint, list): + hint = torch.cat(hint, dim=0) + + batch_size = image_embeds.shape[0] * num_images_per_prompt + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + hint = hint.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.movq.config.latent_channels + + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy 
sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + # Offload all models + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py new file mode 100644 index 000000000..c3ac7bcf6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -0,0 +1,381 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from PIL import Image + +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDPMScheduler +from ...utils import ( + logging, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import numpy as np + + >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline + >>> from transformers import pipeline + >>> from diffusers.utils import load_image + + + >>> def make_hint(image, depth_estimator): + ... image = depth_estimator(image)["depth"] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... return hint + + + >>> depth_estimator = pipeline("depth-estimation") + + >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior = pipe_prior.to("cuda") + + >>> pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ... 
) + >>> pipe = pipe.to("cuda") + + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ).resize((768, 768)) + + + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") + + >>> prompt = "A robot, 4k photo" + >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + + >>> generator = torch.Generator(device="cuda").manual_seed(43) + + >>> img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator) + >>> negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) + + >>> images = pipe( + ... image=img, + ... strength=0.5, + ... image_embeds=img_emb.image_embeds, + ... negative_image_embeds=negative_emb.image_embeds, + ... hint=hint, + ... num_inference_steps=50, + ... generator=generator, + ... height=768, + ... width=768, + ... ).images + + >>> images[0].save("robot_cat.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. 
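A small, hedged sanity check of the `prepare_image` helper above, assuming the vendored package is importable under the standard `diffusers.pipelines.kandinsky2_2` module path: a PIL image is resized, rescaled from [0, 255] to [-1, 1], and returned as a `(1, 3, h, w)` float tensor.

```py
# Hedged check of `prepare_image`; the import path is an assumption about how
# the vendored sources are installed.
import numpy as np
from PIL import Image
from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_controlnet_img2img import prepare_image

pil_image = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))  # all-black test image
tensor = prepare_image(pil_image, w=512, h=512)
print(tuple(tensor.shape))                       # (1, 3, 512, 512)
print(tensor.min().item(), tensor.max().item())  # -1.0 -1.0 for an all-black input
```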
+ """ + + model_cpu_offload_seq = "unet->movq" + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_img2img.KandinskyV22Img2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.movq.encode(image).latent_dist.sample(generator) + + init_latents = self.movq.config.scaling_factor * init_latents + + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + hint: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. 
Can also accept image latents as `image`, if passing latents directly, it will not be encoded + again. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + hint (`torch.FloatTensor`): + The controlnet condition. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
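The `guidance_scale` argument above enables classifier-free guidance; here is a minimal sketch of the blend the denoising loop below applies (the tensor shapes are illustrative stand-ins, not real UNet outputs).

```py
# Hedged sketch of the classifier-free-guidance blend used in the loop below.
# The latent batch is duplicated, the UNet is run with negative and positive
# image embeddings, and the two predictions are combined with w = guidance_scale.
import torch

guidance_scale = 4.0
noise_pred_uncond = torch.randn(1, 4, 96, 96)  # stand-in: prediction with negative embeds
noise_pred_text = torch.randn(1, 4, 96, 96)    # stand-in: prediction with positive embeds

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
# For guidance_scale <= 1 the pipeline skips the duplicated batch and this blend.
```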
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + if isinstance(hint, list): + hint = torch.cat(hint, dim=0) + + batch_size = image_embeds.shape[0] + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + hint = hint.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device) + + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=image_embeds.dtype, device=device) + + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator + ) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + # Offload all models + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + 
+ if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py new file mode 100644 index 000000000..3fdae934a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -0,0 +1,399 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from PIL import Image + +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDPMScheduler +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "A red cartoon frog, 4k" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyV22Img2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/frog.png" + ... ) + + >>> image = pipe( + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, + ... 
).images + + >>> image[0].save("red_frog.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyV22Img2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + model_cpu_offload_seq = "unet->movq" + _callback_tensor_inputs = ["latents", "image_embeds", "negative_image_embeds"] + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.movq.encode(image).latent_dist.sample(generator) + + init_latents = self.movq.config.scaling_factor * init_latents + + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded + again. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. 
of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + device = self._execution_device + + self._guidance_scale = guidance_scale + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if self.do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=image_embeds.dtype, device=device) + + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator + ) + self._num_timesteps = len(timesteps) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if self.do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + image_embeds = callback_outputs.pop("image_embeds", image_embeds) + negative_image_embeds = callback_outputs.pop("negative_image_embeds", negative_image_embeds) + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if output_type not in ["pt", "np", "pil", "latent"]: + raise ValueError( + f"Only the output types `pt`, `pil` ,`np` and `latent` are supported not output_type={output_type}" + ) + + if not output_type == "latent": + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + else: + image = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py new file 
mode 100644 index 000000000..2fb8731f8 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -0,0 +1,556 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from packaging import version +from PIL import Image + +from ... import __version__ +from ...models import UNet2DConditionModel, VQModel +from ...schedulers import DDPMScheduler +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + >>> import numpy as np + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "a hat" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyV22InpaintPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> mask = np.zeros((768, 768), dtype=np.float32) + >>> mask[:250, 250:-250] = 1 + + >>> out = pipe( + ... image=init_image, + ... mask_image=mask, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... 
) + + >>> image = out.images[0] + >>> image.save("cat_with_hat.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask +def prepare_mask(masks): + prepared_masks = [] + for mask in masks: + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + prepared_masks.append(mask) + return torch.stack(prepared_masks, dim=0) + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask_and_masked_image +def prepare_mask_and_masked_image(image, mask, height, width): + r""" + Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for + the ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. 
+ """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + mask = 1 - mask + + return mask, image + + +class KandinskyV22InpaintPipeline(DiffusionPipeline): + """ + Pipeline for text-guided image inpainting using Kandinsky2.1 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. 
+ movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + model_cpu_offload_seq = "unet->movq" + _callback_tensor_inputs = ["latents", "image_embeds", "negative_image_embeds", "masked_image", "mask_image"] + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + self._warn_has_been_called = False + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`np.array`): + Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while + black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single + channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, + so the expected shape would be `(B, H, W, 1)`. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + if not self._warn_has_been_called and version.parse(version.parse(__version__).base_version) < version.parse( + "0.23.0.dev0" + ): + logger.warning( + "Please note that the expected format of `mask_image` has recently been changed. " + "Before diffusers == 0.19.0, Kandinsky Inpainting pipelines repainted black pixels and preserved black pixels. " + "As of diffusers==0.19.0 this behavior has been inverted. Now white pixels are repainted and black pixels are preserved. " + "This way, Kandinsky's masking behavior is aligned with Stable Diffusion. " + "THIS means that you HAVE to invert the input mask to have the same behavior as before as explained in https://github.com/huggingface/diffusers/pull/4207. 
" + "This warning will be surpressed after the first inference call and will be removed in diffusers>0.23.0" + ) + self._warn_has_been_called = True + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + self._guidance_scale = guidance_scale + + device = self._execution_device + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] * num_images_per_prompt + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if self.do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=self.unet.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # preprocess image and mask + mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) + + image = image.to(dtype=image_embeds.dtype, device=device) + image = self.movq.encode(image)["latents"] + + mask_image = mask_image.to(dtype=image_embeds.dtype, device=device) + + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image) + masked_image = image * mask_image + + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) + if self.do_classifier_free_guidance: + mask_image = mask_image.repeat(2, 1, 1, 1) + masked_image = masked_image.repeat(2, 1, 1, 1) + + num_channels_latents = self.movq.config.latent_channels + + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + noise = torch.clone(latents) + + self._num_timesteps = len(timesteps) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if self.do_classifier_free_guidance: + noise_pred, 
variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + init_latents_proper = image[:1] + init_mask = mask_image[:1] + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = init_mask * init_latents_proper + (1 - init_mask) * latents + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + image_embeds = callback_outputs.pop("image_embeds", image_embeds) + negative_image_embeds = callback_outputs.pop("negative_image_embeds", negative_image_embeds) + masked_image = callback_outputs.pop("masked_image", masked_image) + mask_image = callback_outputs.pop("mask_image", mask_image) + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # post-processing + latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents + + if output_type not in ["pt", "np", "pil", "latent"]: + raise ValueError( + f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}" + ) + + if not output_type == "latent": + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + else: + image = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py new file mode 100644 index 000000000..83427c68f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -0,0 +1,549 @@ +from typing import Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...schedulers import UnCLIPScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..kandinsky import KandinskyPriorPipelineOutput +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: 
disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior.to("cuda") + >>> prompt = "red cat, 4k photo" + >>> image_emb, negative_image_emb = pipe_prior(prompt).to_tuple() + + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=negative_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline + >>> from diffusers.utils import load_image + >>> import PIL + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> out = pipe_prior.interpolate(images_texts, weights) + >>> pipe = KandinskyV22Pipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=out.image_embeds, + ... negative_image_embeds=out.negative_image_embeds, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images[0] + >>> image.save("starry_cat.png") + ``` +""" + + +class KandinskyV22PriorPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + image_processor ([`CLIPImageProcessor`]): + A image_processor to be used to preprocess image from clip. 
+ """ + + model_cpu_offload_seq = "text_encoder->image_encoder->prior" + _exclude_from_cpu_offload = ["prior"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "text_encoder_hidden_states", "text_mask"] + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: str = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+ + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds.unsqueeze(0) + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + if isinstance(cond, PIL.Image.Image): + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0) + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0) + + out_zero = self( + negative_prompt, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ) + zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not 
torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. 
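+            # (the unconditional half is placed first in the concatenated batch, so downstream code can
+            #  split the two halves again with `tensor.chunk(2)`)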
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. 
`callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if isinstance(prompt, str): + prompt = [prompt] + elif not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + + device = self._execution_device + + batch_size = len(prompt) + batch_size = batch_size * num_images_per_prompt + + self._guidance_scale = guidance_scale + + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, self.do_classifier_free_guidance, negative_prompt + ) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + self._num_timesteps = len(timesteps) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if self.do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + self.guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == timesteps.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = 
callback_outputs.pop("prompt_embeds", prompt_embeds) + text_encoder_hidden_states = callback_outputs.pop( + "text_encoder_hidden_states", text_encoder_hidden_states + ) + text_mask = callback_outputs.pop("text_mask", text_mask) + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + + # if negative prompt has been defined, we retrieve split the image embedding into two + if negative_prompt is None: + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + else: + image_embeddings, zero_embeds = image_embeddings.chunk(2) + + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + + if not return_dict: + return (image_embeddings, zero_embeds) + + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py new file mode 100644 index 000000000..bef70821c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py @@ -0,0 +1,563 @@ +from typing import List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...schedulers import UnCLIPScheduler +from ...utils import ( + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..kandinsky import KandinskyPriorPipelineOutput +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "red cat, 4k photo" + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + >>> image_emb, nagative_image_emb = pipe_prior(prompt, image=img, strength=0.2).to_tuple() + + >>> pipe = KandinskyPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder, torch_dtype=torch.float16" + ... ) + >>> pipe.to("cuda") + + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=negative_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22Pipeline + >>> from diffusers.utils import load_image + >>> import PIL + + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... 
) + >>> pipe_prior.to("cuda") + + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + + >>> pipe = KandinskyV22Pipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, + ... ).images[0] + + >>> image.save("starry_cat.png") + ``` +""" + + +class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->prior" + _exclude_from_cpu_offload = ["prior"] + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: str = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. 
+ weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
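+            device (`torch.device`, *optional*):
+                The device on which to run the prior. If not provided, the pipeline's own `self.device` is
+                used.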
+ + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds.unsqueeze(0) + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + image_emb = self._encode_image( + cond, device=device, num_images_per_prompt=num_images_per_prompt + ).unsqueeze(0) + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0) + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb)) + + def _encode_image( + self, + image: Union[torch.Tensor, List[PIL.Image.Image]], + device, + num_images_per_prompt, + ): + if not isinstance(image, torch.Tensor): + image = self.image_processor(image, return_tensors="pt").pixel_values.to( + dtype=self.image_encoder.dtype, device=device + ) + + image_emb = self.image_encoder(image)["image_embeds"] # B, D + image_emb = image_emb.repeat_interleave(num_images_per_prompt, dim=0) + image_emb.to(device=device) + + return image_emb + + def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + emb = emb.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + init_latents = emb + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], + strength: float = 0.3, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. + emb (`torch.FloatTensor`): + The image embedding. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + prompt = [prompt] + elif not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + + device = self._execution_device + + batch_size = len(prompt) + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if not isinstance(image, List): + image = [image] + + if isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + if isinstance(image, torch.Tensor) and image.ndim == 2: + # allow user to pass image_embeds directly + image_embeds = image.repeat_interleave(num_images_per_prompt, dim=0) + elif isinstance(image, torch.Tensor) and image.ndim != 4: + raise ValueError( + f" if pass `image` as pytorch tensor, or a list of pytorch tensor, please make sure each tensor has shape [batch_size, channels, height, width], currently {image[0].unsqueeze(0).shape}" + ) + else: + image_embeds = self._encode_image(image, device, num_images_per_prompt) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + + latents = image_embeds + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size) + latents = self.prepare_latents( + latents, + latent_timestep, + batch_size // num_images_per_prompt, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + 
encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == timesteps.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + + # if negative prompt has been defined, we retrieve split the image embedding into two + if negative_prompt is None: + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + else: + image_embeddings, zero_embeds = image_embeddings.chunk(2) + + self.maybe_free_model_hooks() + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + + if not return_dict: + return (image_embeddings, zero_embeds) + + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py new file mode 100644 index 000000000..e8a306314 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py @@ -0,0 +1,49 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_kandinsky3"] = ["Kandinsky3Pipeline"] + _import_structure["pipeline_kandinsky3_img2img"] = ["Kandinsky3Img2ImgPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_kandinsky3 import Kandinsky3Pipeline + from .pipeline_kandinsky3_img2img import Kandinsky3Img2ImgPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py new file mode 100644 index 000000000..4fe8c54eb --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +import argparse +import fnmatch + +from safetensors.torch import load_file + +from diffusers import Kandinsky3UNet + + +MAPPING = { + "to_time_embed.1": "time_embedding.linear_1", + "to_time_embed.3": "time_embedding.linear_2", + "in_layer": "conv_in", + "out_layer.0": "conv_norm_out", + "out_layer.2": "conv_out", + "down_samples": "down_blocks", + "up_samples": "up_blocks", + "projection_lin": "encoder_hid_proj.projection_linear", + "projection_ln": "encoder_hid_proj.projection_norm", + "feature_pooling": "add_time_condition", + "to_query": "to_q", + "to_key": "to_k", + "to_value": "to_v", + "output_layer": "to_out.0", + "self_attention_block": "attentions.0", +} + +DYNAMIC_MAP = { + "resnet_attn_blocks.*.0": "resnets_in.*", + "resnet_attn_blocks.*.1": ("attentions.*", 1), + "resnet_attn_blocks.*.2": "resnets_out.*", +} +# MAPPING = {} + + +def convert_state_dict(unet_state_dict): + """ + Convert the state dict of a U-Net model to match the key format expected by Kandinsky3UNet model. + Args: + unet_model (torch.nn.Module): The original U-Net model. + unet_kandi3_model (torch.nn.Module): The Kandinsky3UNet model to match keys with. + + Returns: + OrderedDict: The converted state dictionary. + """ + # Example of renaming logic (this will vary based on your model's architecture) + converted_state_dict = {} + for key in unet_state_dict: + new_key = key + for pattern, new_pattern in MAPPING.items(): + new_key = new_key.replace(pattern, new_pattern) + + for dyn_pattern, dyn_new_pattern in DYNAMIC_MAP.items(): + has_matched = False + if fnmatch.fnmatch(new_key, f"*.{dyn_pattern}.*") and not has_matched: + star = int(new_key.split(dyn_pattern.split(".")[0])[-1].split(".")[1]) + + if isinstance(dyn_new_pattern, tuple): + new_star = star + dyn_new_pattern[-1] + dyn_new_pattern = dyn_new_pattern[0] + else: + new_star = star + + pattern = dyn_pattern.replace("*", str(star)) + new_pattern = dyn_new_pattern.replace("*", str(new_star)) + + new_key = new_key.replace(pattern, new_pattern) + has_matched = True + + converted_state_dict[new_key] = unet_state_dict[key] + + return converted_state_dict + + +def main(model_path, output_path): + # Load your original U-Net model + unet_state_dict = load_file(model_path) + + # Initialize your Kandinsky3UNet model + config = {} + + # Convert the state dict + converted_state_dict = convert_state_dict(unet_state_dict) + + unet = Kandinsky3UNet(config) + unet.load_state_dict(converted_state_dict) + + unet.save_pretrained(output_path) + print(f"Converted model saved to {output_path}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert U-Net PyTorch model to Kandinsky3UNet format") + parser.add_argument("--model_path", type=str, required=True, help="Path to the original U-Net PyTorch model") + parser.add_argument("--output_path", type=str, required=True, help="Path to save the converted model") + + args = parser.parse_args() + main(args.model_path, args.output_path) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py new file mode 100644 index 000000000..fcf7ddcb9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py @@ -0,0 +1,589 @@ +from typing import Callable, Dict, List, Optional, Union + +import torch +from transformers import T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import Kandinsky3UNet, VQModel +from ...schedulers import DDPMScheduler +from ...utils import ( + deprecate, + is_accelerate_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import AutoPipelineForText2Image + >>> import torch + + >>> pipe = AutoPipelineForText2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A photograph of the inside of a subway train. There are raccoons sitting on the seats. One of them is reading a newspaper. The window shows the city in the background." + + >>> generator = torch.Generator(device="cpu").manual_seed(0) + >>> image = pipe(prompt, num_inference_steps=25, generator=generator).images[0] + ``` + +""" + + +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +class Kandinsky3Pipeline(DiffusionPipeline, LoraLoaderMixin): + model_cpu_offload_seq = "text_encoder->unet->movq" + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "negative_attention_mask", + "attention_mask", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: Kandinsky3UNet, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, movq=movq + ) + + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet, self.movq]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + def process_embeds(self, embeddings, attention_mask, cut_context): + if cut_context: + embeddings[attention_mask == 0] = torch.zeros_like(embeddings[attention_mask == 0]) + max_seq_length = attention_mask.sum(-1).max() + 1 + embeddings = embeddings[:, :max_seq_length] + attention_mask = attention_mask[:, :max_seq_length] + return embeddings, attention_mask + + @torch.no_grad() + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + _cut_context=False, + attention_mask: 
Optional[torch.FloatTensor] = None, + negative_attention_mask: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. + negative_attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + max_length = 128 + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids.to(device) + attention_mask = text_inputs.attention_mask.to(device) + prompt_embeds = self.text_encoder( + text_input_ids, + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + prompt_embeds, attention_mask = self.process_embeds(prompt_embeds, attention_mask, _cut_context) + prompt_embeds = prompt_embeds * attention_mask.unsqueeze(2) + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + attention_mask = attention_mask.repeat(num_images_per_prompt, 1) + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + if negative_prompt is not None: + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=128, + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + text_input_ids = uncond_input.input_ids.to(device) + negative_attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + text_input_ids, + attention_mask=negative_attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds[:, : prompt_embeds.shape[1]] + negative_attention_mask = negative_attention_mask[:, : prompt_embeds.shape[1]] + negative_prompt_embeds = negative_prompt_embeds * negative_attention_mask.unsqueeze(2) + + else: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_attention_mask = torch.zeros_like(attention_mask) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + if negative_prompt_embeds.shape != prompt_embeds.shape: + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + negative_attention_mask = negative_attention_mask.repeat(num_images_per_prompt, 1) + + # For classifier free guidance, we need to do two forward passes. 
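+            # Illustration (assumed shapes: B = batch_size * num_images_per_prompt, L = seq_len, D = embed dim)
+            # of the doubled batch that `__call__` later builds from these outputs:
+            #     prompt_embeds  = torch.cat([negative_prompt_embeds, prompt_embeds])    # (2B, L, D)
+            #     attention_mask = torch.cat([negative_attention_mask, attention_mask])  # (2B, L)
+            # A single UNet forward then covers [uncond | cond]; the halves are separated again
+            # with `noise_pred.chunk(2)` before the guidance weight is applied.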
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + negative_attention_mask = None + return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + attention_mask=None, + negative_attention_mask=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + if negative_prompt_embeds is not None and negative_attention_mask is None: + raise ValueError("Please provide `negative_attention_mask` along with `negative_prompt_embeds`") + + if negative_prompt_embeds is not None and negative_attention_mask is not None: + if negative_prompt_embeds.shape[:2] != negative_attention_mask.shape: + raise ValueError( + "`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but" + f" got: `negative_prompt_embeds` {negative_prompt_embeds.shape[:2]} != `negative_attention_mask`" + f" {negative_attention_mask.shape}." 
+ ) + + if prompt_embeds is not None and attention_mask is None: + raise ValueError("Please provide `attention_mask` along with `prompt_embeds`") + + if prompt_embeds is not None and attention_mask is not None: + if prompt_embeds.shape[:2] != attention_mask.shape: + raise ValueError( + "`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape[:2]} != `attention_mask`" + f" {attention_mask.shape}." + ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 25, + guidance_scale: float = 3.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = 1024, + width: Optional[int] = 1024, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + negative_attention_mask: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + latents=None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 3.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. 
Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. + negative_attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + cut_context = True + device = self._execution_device + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + attention_mask, + negative_attention_mask, + ) + + self._guidance_scale = guidance_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask = self.encode_prompt( + prompt, + self.do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + _cut_context=cut_context, + attention_mask=attention_mask, + negative_attention_mask=negative_attention_mask, + ) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + attention_mask = torch.cat([negative_attention_mask, attention_mask]).bool() + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latents + height, width = downscale_height_and_width(height, width, 8) + + latents = self.prepare_latents( + (batch_size * num_images_per_prompt, 4, height, width), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + encoder_attention_mask=attention_mask, + return_dict=False, + )[0] + + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + + noise_pred = (guidance_scale + 1.0) * noise_pred_text - guidance_scale * noise_pred_uncond + # noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + attention_mask = callback_outputs.pop("attention_mask", attention_mask) + negative_attention_mask = callback_outputs.pop("negative_attention_mask", negative_attention_mask) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # post-processing + 
if output_type not in ["pt", "np", "pil", "latent"]: + raise ValueError( + f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}" + ) + + if not output_type == "latent": + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + else: + image = latents + + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py new file mode 100644 index 000000000..7f4164a04 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py @@ -0,0 +1,654 @@ +import inspect +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import PIL.Image +import torch +from transformers import T5EncoderModel, T5Tokenizer + +from ...loaders import LoraLoaderMixin +from ...models import Kandinsky3UNet, VQModel +from ...schedulers import DDPMScheduler +from ...utils import ( + deprecate, + is_accelerate_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import AutoPipelineForImage2Image + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe = AutoPipelineForImage2Image.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A painting of the inside of a subway train with tiny raccoons." 
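+        >>> # `strength` must be between 0 and 1: values close to 0 keep the input image largely
+        >>> # intact, while a value of 1 essentially ignores it (maximum added noise).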
+ >>> image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png") + + >>> generator = torch.Generator(device="cpu").manual_seed(0) + >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0] + ``` +""" + + +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +def prepare_image(pil_image): + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class Kandinsky3Img2ImgPipeline(DiffusionPipeline, LoraLoaderMixin): + model_cpu_offload_seq = "text_encoder->movq->unet->movq" + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "negative_attention_mask", + "attention_mask", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + unet: Kandinsky3UNet, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, movq=movq + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def remove_all_hooks(self): + if is_accelerate_available(): + from accelerate.hooks import remove_hook_from_module + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + for model in [self.text_encoder, self.unet]: + if model is not None: + remove_hook_from_module(model, recurse=True) + + self.unet_offload_hook = None + self.text_encoder_offload_hook = None + self.final_offload_hook = None + + def _process_embeds(self, embeddings, attention_mask, cut_context): + # return embeddings, attention_mask + if cut_context: + embeddings[attention_mask == 0] = torch.zeros_like(embeddings[attention_mask == 0]) + max_seq_length = attention_mask.sum(-1).max() + 1 + embeddings = embeddings[:, :max_seq_length] + attention_mask = attention_mask[:, :max_seq_length] + return embeddings, attention_mask + + @torch.no_grad() + def encode_prompt( + self, + prompt, + do_classifier_free_guidance=True, + num_images_per_prompt=1, + device=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + _cut_context=False, + attention_mask: Optional[torch.FloatTensor] = None, + negative_attention_mask: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device (`torch.device`, *optional*): + torch device to place the resulting embeddings on + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. + negative_attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. + """ + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + max_length = 128 + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids.to(device) + attention_mask = text_inputs.attention_mask.to(device) + prompt_embeds = self.text_encoder( + text_input_ids, + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + prompt_embeds, attention_mask = self._process_embeds(prompt_embeds, attention_mask, _cut_context) + prompt_embeds = prompt_embeds * attention_mask.unsqueeze(2) + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + attention_mask = attention_mask.repeat(num_images_per_prompt, 1) + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + if negative_prompt is not None: + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=128, + truncation=True, + return_attention_mask=True, + return_tensors="pt", + ) + text_input_ids = uncond_input.input_ids.to(device) + negative_attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + text_input_ids, + attention_mask=negative_attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds[:, : prompt_embeds.shape[1]] + negative_attention_mask = negative_attention_mask[:, : prompt_embeds.shape[1]] + negative_prompt_embeds = negative_prompt_embeds * negative_attention_mask.unsqueeze(2) + + else: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_attention_mask = torch.zeros_like(attention_mask) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + if negative_prompt_embeds.shape != prompt_embeds.shape: + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + negative_attention_mask = negative_attention_mask.repeat(num_images_per_prompt, 1) + + # For classifier free guidance, we need to do two forward passes. 
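+            # In this img2img pipeline, `__call__` pairs the doubled text batch with latents derived
+            # from the input image rather than pure noise; roughly (a sketch of the steps further
+            # down in this file, argument list abbreviated):
+            #     latents = self.movq.encode(image)["latents"].repeat_interleave(num_images_per_prompt, dim=0)
+            #     latents = self.prepare_latents(latents, timesteps[:1].repeat(batch_size * num_images_per_prompt), ...)
+            # where `prepare_latents` adds scheduler noise only over the truncated timestep range
+            # selected by `strength` via `get_timesteps`.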
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + else: + negative_prompt_embeds = None + negative_attention_mask = None + return prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.movq.encode(image).latent_dist.sample(generator) + + init_latents = self.movq.config.scaling_factor * init_latents + + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + attention_mask=None, + negative_attention_mask=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if negative_prompt_embeds is not None and negative_attention_mask is None: + raise ValueError("Please provide `negative_attention_mask` along with `negative_prompt_embeds`") + + if negative_prompt_embeds is not None and negative_attention_mask is not None: + if negative_prompt_embeds.shape[:2] != negative_attention_mask.shape: + raise ValueError( + "`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but" + f" got: `negative_prompt_embeds` {negative_prompt_embeds.shape[:2]} != `negative_attention_mask`" + f" {negative_attention_mask.shape}." + ) + + if prompt_embeds is not None and attention_mask is None: + raise ValueError("Please provide `attention_mask` along with `prompt_embeds`") + + if prompt_embeds is not None and attention_mask is not None: + if prompt_embeds.shape[:2] != attention_mask.shape: + raise ValueError( + "`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape[:2]} != `attention_mask`" + f" {attention_mask.shape}." 
+ ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + strength: float = 0.3, + num_inference_steps: int = 25, + guidance_scale: float = 3.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + negative_attention_mask: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 3.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated attention mask. Must provide if passing `prompt_embeds` directly. + negative_attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + + """ + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + cut_context = True + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + attention_mask, + negative_attention_mask, + ) + + self._guidance_scale = guidance_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds, attention_mask, negative_attention_mask = self.encode_prompt( + prompt, + self.do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + device=device, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + _cut_context=cut_context, + attention_mask=attention_mask, + negative_attention_mask=negative_attention_mask, + ) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + attention_mask = torch.cat([negative_attention_mask, attention_mask]).bool() + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i) for i in image], dim=0) + image = image.to(dtype=prompt_embeds.dtype, device=device) + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + # 5. Prepare latents + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + if hasattr(self, "text_encoder_offload_hook") and self.text_encoder_offload_hook is not None: + self.text_encoder_offload_hook.offload() + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + encoder_attention_mask=attention_mask, + )[0] + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + + noise_pred = (guidance_scale + 1.0) * noise_pred_text - guidance_scale * noise_pred_uncond + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + attention_mask = callback_outputs.pop("attention_mask", attention_mask) + negative_attention_mask = callback_outputs.pop("negative_attention_mask", negative_attention_mask) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # post-processing + if output_type not in ["pt", "np", "pil", 
"latent"]: + raise ValueError( + f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}" + ) + if not output_type == "latent": + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + else: + image = latents + + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py new file mode 100644 index 000000000..8f79d3c47 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_latent_consistency_img2img"] = ["LatentConsistencyModelImg2ImgPipeline"] + _import_structure["pipeline_latent_consistency_text2img"] = ["LatentConsistencyModelPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_latent_consistency_img2img import LatentConsistencyModelImg2ImgPipeline + from .pipeline_latent_consistency_text2img import LatentConsistencyModelPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py new file mode 100644 index 000000000..f64854ea9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py @@ -0,0 +1,956 @@ +# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import LCMScheduler +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
+ """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import AutoPipelineForImage2Image + >>> import torch + >>> import PIL + + >>> pipe = AutoPipelineForImage2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7") + >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality. + >>> pipe.to(torch_device="cuda", torch_dtype=torch.float32) + + >>> prompt = "High altitude snowy mountains" + >>> image = PIL.Image.open("./snowy_mountains.png") + + >>> # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps. + >>> num_inference_steps = 4 + >>> images = pipe( + ... prompt=prompt, image=image, num_inference_steps=num_inference_steps, guidance_scale=8.0 + ... ).images + + >>> images[0].save("image.png") + ``` + +""" + + +class LatentConsistencyModelImg2ImgPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, +): + r""" + Pipeline for image-to-image generation using a latent consistency model. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only + supports [`LCMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ requires_safety_checker (`bool`, *optional*, defaults to `True`): + Whether the pipeline requires a safety checker component. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: LCMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
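As a standalone illustration of the `get_guidance_scale_embedding` method above, the following sketch builds the same sinusoidal embedding with plain `torch`; the batch size and embedding dimension are arbitrary example values (the pipeline itself uses `unet.config.time_cond_proj_dim`).

```py
import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512, dtype=torch.float32):
    # Same construction as the pipeline method: scale w, then build sin/cos features.
    assert w.ndim == 1
    w = w * 1000.0
    half_dim = embedding_dim // 2
    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
    emb = w.to(dtype)[:, None] * emb[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad when the dimension is odd
        emb = torch.nn.functional.pad(emb, (0, 1))
    return emb

# In the LCM pipelines, w is (guidance_scale - 1) repeated over the batch, and the
# resulting embedding is fed to the UNet through `timestep_cond`.
w = guidance_scale_embedding(torch.tensor(8.0 - 1.0).repeat(4), embedding_dim=256)
print(w.shape)  # torch.Size([4, 256])
```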
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def check_inputs( + self, + prompt: Union[str, List[str]], + strength: float, + callback_steps: int, + prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def clip_skip(self): + return self._clip_skip + + @property + def do_classifier_free_guidance(self): + return False + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + num_inference_steps: int = 4, + strength: float = 0.8, + original_inference_steps: int = None, + timesteps: List[int] = None, + guidance_scale: float = 8.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + original_inference_steps (`int`, *optional*): + The original number of inference steps use to generate a linearly-spaced timestep schedule, from which + we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule, + following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the + scheduler's `original_inference_steps` attribute. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps on the original LCM training/distillation timestep schedule are used. Must be in descending + order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
+ Note that the original latent consistency models paper uses a different CFG formulation where the + guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale > + 0`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
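The `ip_adapter_image_embeds` contract described in the docstring above is easy to get wrong, so here is a small sketch of the expected structure; the sizes are hypothetical, and the tiling line only mirrors what `prepare_ip_adapter_image_embeds` does in the 3D, no-CFG case that applies to these LCM pipelines (their `do_classifier_free_guidance` property returns `False`).

```py
import torch

# Hypothetical sizes; the real emb_dim depends on the image encoder behind the IP-Adapter.
batch_size, num_images, emb_dim = 1, 1, 1280
num_images_per_prompt = 2

# One entry per loaded IP-Adapter; each entry is a 3D (or 4D) tensor.
ip_adapter_image_embeds = [torch.randn(batch_size, num_images, emb_dim)]

# Mirror of the pipeline's validation in check_inputs().
assert isinstance(ip_adapter_image_embeds, list)
assert ip_adapter_image_embeds[0].ndim in (3, 4)

# Without classifier-free guidance (the LCM case), prepare_ip_adapter_image_embeds()
# simply tiles each 3D entry across num_images_per_prompt along the batch dimension.
tiled = [e.repeat(num_images_per_prompt, 1, 1) for e in ip_adapter_image_embeds]
print(tiled[0].shape)  # torch.Size([2, 1, 1280])
```

With CFG-style pipelines the same list would instead hold the negative and positive halves concatenated along the batch dimension, which the pipeline splits with `chunk(2)`.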
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + strength, + callback_steps, + prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided + # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the + # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts. + prompt_embeds, _ = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=None, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Encode image + image = self.image_processor.preprocess(image) + + # 5. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, + num_inference_steps, + device, + timesteps, + original_inference_steps=original_inference_steps, + strength=strength, + ) + + # 6. Prepare latent variables + original_inference_steps = ( + original_inference_steps + if original_inference_steps is not None + else self.scheduler.config.original_inference_steps + ) + latent_timestep = timesteps[:1] + latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + bs = batch_size * num_images_per_prompt + + # 6. Get Guidance Scale Embedding + # NOTE: We use the Imagen CFG formulation that StableDiffusionPipeline uses rather than the original LCM paper + # CFG formulation, so we need to subtract 1 from the input guidance_scale. 
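To make the note above concrete, a quick numerical check (toy tensors, no model involved) that the Imagen-style CFG update with scale `g` coincides with the LCM-paper formulation evaluated at `g - 1`, which is why `guidance_scale - 1` is what gets embedded below:

```py
import torch

g = 8.0                                   # Imagen/StableDiffusionPipeline-style guidance scale
noise_uncond = torch.randn(1, 4, 8, 8)    # toy "unconditional" prediction
noise_cond = torch.randn(1, 4, 8, 8)      # toy "conditional" prediction

imagen_cfg = noise_uncond + g * (noise_cond - noise_uncond)
lcm_cfg = noise_cond + (g - 1.0) * (noise_cond - noise_uncond)

# Both expand to g * noise_cond - (g - 1) * noise_uncond, so they agree.
print(torch.allclose(imagen_cfg, lcm_cfg, atol=1e-6))  # True
```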
+ # LCM CFG formulation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond), (cfg_scale > 0.0 using CFG) + w = torch.tensor(self.guidance_scale - 1).repeat(bs) + w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.time_cond_proj_dim).to( + device=device, dtype=latents.dtype + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 8. LCM Multistep Sampling Loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latents = latents.to(prompt_embeds.dtype) + + # model prediction (v-prediction, eps, x) + model_pred = self.unet( + latents, + t, + timestep_cond=w_embedding, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False) + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + w_embedding = callback_outputs.pop("w_embedding", w_embedding) + denoised = callback_outputs.pop("denoised", denoised) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + denoised = denoised.to(prompt_embeds.dtype) + if not output_type == "latent": + image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = denoised + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py new file mode 100644 index 000000000..e9bacaa89 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py @@ -0,0 +1,888 @@ +# Copyright 2024 
Stanford University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import LCMScheduler +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import DiffusionPipeline + >>> import torch + + >>> pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7") + >>> # To save GPU memory, torch.float16 can be used, but it may compromise image quality. + >>> pipe.to(torch_device="cuda", torch_dtype=torch.float32) + + >>> prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k" + + >>> # Can be set to 1~50 steps. LCM support fast inference even <= 4 steps. Recommend: 1~8 steps. + >>> num_inference_steps = 4 + >>> images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images + >>> images[0].save("image.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. 
If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class LatentConsistencyModelPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using a latent consistency model. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Currently only + supports [`LCMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + requires_safety_checker (`bool`, *optional*, defaults to `True`): + Whether the pipeline requires a safety checker component. 
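Both pipeline files above carry the same module-level `retrieve_timesteps` helper. The sketch below reproduces its dispatch logic against a toy stand-in scheduler (not a real diffusers class) to show the two mutually exclusive call styles:

```py
import inspect
from typing import List, Optional

class ToyScheduler:
    """Stand-in scheduler whose set_timesteps accepts an explicit `timesteps` list."""
    def __init__(self):
        self.timesteps = []

    def set_timesteps(self, num_inference_steps: Optional[int] = None, device=None,
                      timesteps: Optional[List[int]] = None):
        if timesteps is not None:
            self.timesteps = list(timesteps)
        else:
            # Evenly spaced toy schedule counting down from 999.
            step = 1000 // num_inference_steps
            self.timesteps = list(range(999, -1, -step))[:num_inference_steps]

def retrieve_timesteps(scheduler, num_inference_steps=None, device=None, timesteps=None, **kwargs):
    # Same dispatch as the helper above: custom `timesteps` win if the scheduler supports them.
    if timesteps is not None:
        if "timesteps" not in inspect.signature(scheduler.set_timesteps).parameters:
            raise ValueError("scheduler does not support custom timestep schedules")
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        return scheduler.timesteps, len(scheduler.timesteps)
    scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
    return scheduler.timesteps, num_inference_steps

sched = ToyScheduler()
print(retrieve_timesteps(sched, num_inference_steps=4))          # evenly spaced schedule, 4 steps
print(retrieve_timesteps(sched, timesteps=[759, 499, 259, 19]))  # custom schedule passes through
```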
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "denoised", "prompt_embeds", "w_embedding"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: LCMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Currently StableDiffusionPipeline.check_inputs with negative prompt stuff removed + def check_inputs( + self, + prompt: Union[str, List[str]], + height: int, + width: int, + callback_steps: int, + prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def clip_skip(self): + return self._clip_skip + + @property + def do_classifier_free_guidance(self): + return False + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 4, + original_inference_steps: int = None, + timesteps: List[int] = None, + guidance_scale: float = 8.5, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + original_inference_steps (`int`, *optional*): + The original number of inference steps use to generate a linearly-spaced timestep schedule, from which + we will draw `num_inference_steps` evenly spaced timesteps from as our final timestep schedule, + following the Skipping-Step method in the paper (see Section 4.3). If not set this will default to the + scheduler's `original_inference_steps` attribute. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps on the original LCM training/distillation timestep schedule are used. Must be in descending + order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
+ Note that the original latent consistency models paper uses a different CFG formulation where the + guidance scales are decreased by 1 (so in the paper formulation CFG is enabled when `guidance_scale > + 0`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + # NOTE: when a LCM is distilled from an LDM via latent consistency distillation (Algorithm 1) with guided + # distillation, the forward pass of the LCM learns to approximate sampling from the LDM using CFG with the + # unconditional prompt "" (the empty string). Due to this, LCMs currently do not support negative prompts. + prompt_embeds, _ = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=None, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps, original_inference_steps=original_inference_steps + ) + + # 5. Prepare latent variable + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + bs = batch_size * num_images_per_prompt + + # 6. Get Guidance Scale Embedding + # NOTE: We use the Imagen CFG formulation that StableDiffusionPipeline uses rather than the original LCM paper + # CFG formulation, so we need to subtract 1 from the input guidance_scale. 
+ # LCM CFG formulation: cfg_noise = noise_cond + cfg_scale * (noise_cond - noise_uncond), (cfg_scale > 0.0 using CFG) + w = torch.tensor(self.guidance_scale - 1).repeat(bs) + w_embedding = self.get_guidance_scale_embedding(w, embedding_dim=self.unet.config.time_cond_proj_dim).to( + device=device, dtype=latents.dtype + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 8. LCM MultiStep Sampling Loop: + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latents = latents.to(prompt_embeds.dtype) + + # model prediction (v-prediction, eps, x) + model_pred = self.unet( + latents, + t, + timestep_cond=w_embedding, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # compute the previous noisy sample x_t -> x_t-1 + latents, denoised = self.scheduler.step(model_pred, t, latents, **extra_step_kwargs, return_dict=False) + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + w_embedding = callback_outputs.pop("w_embedding", w_embedding) + denoised = callback_outputs.pop("denoised", denoised) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + denoised = denoised.to(prompt_embeds.dtype) + if not output_type == "latent": + image = self.vae.decode(denoised / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = denoised + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py new file mode 100644 index 000000000..561f96fc7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + 
_LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_latent_diffusion"] = ["LDMBertModel", "LDMTextToImagePipeline"] + _import_structure["pipeline_latent_diffusion_superresolution"] = ["LDMSuperResolutionPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_latent_diffusion import LDMBertModel, LDMTextToImagePipeline + from .pipeline_latent_diffusion_superresolution import LDMSuperResolutionPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py new file mode 100644 index 000000000..f39cbc839 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -0,0 +1,746 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutput +from transformers.utils import logging + +from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel +from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +class LDMTextToImagePipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using latent diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) model to encode and decode images to and from latent representations. 
+ bert ([`LDMBertModel`]): + Text-encoder model based on [`~transformers.BERT`]. + tokenizer ([`~transformers.BertTokenizer`]): + A `BertTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "bert->unet->vqvae" + + def __init__( + self, + vqvae: Union[VQModel, AutoencoderKL], + bert: PreTrainedModel, + tokenizer: PreTrainedTokenizer, + unet: Union[UNet2DModel, UNet2DConditionModel], + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + ): + super().__init__() + self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + self.vae_scale_factor = 2 ** (len(self.vqvae.config.block_out_channels) - 1) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 1.0, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + **kwargs, + ) -> Union[Tuple, ImagePipelineOutput]: + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 1.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. 
+ + Example: + + ```py + >>> from diffusers import DiffusionPipeline + + >>> # load model and scheduler + >>> ldm = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") + + >>> # run pipeline in inference (sample random noise and denoise) + >>> prompt = "A painting of a squirrel eating a burger" + >>> images = ldm([prompt], num_inference_steps=50, eta=0.3, guidance_scale=6).images + + >>> # save images + >>> for idx, image in enumerate(images): + ... image.save(f"squirrel-{idx}.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + # get unconditional embeddings for classifier free guidance + if guidance_scale != 1.0: + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=77, truncation=True, return_tensors="pt" + ) + negative_prompt_embeds = self.bert(uncond_input.input_ids.to(self._execution_device))[0] + + # get prompt text embeddings + text_input = self.tokenizer(prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt") + prompt_embeds = self.bert(text_input.input_ids.to(self._execution_device))[0] + + # get the initial random noise unless the user supplied it + latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor( + latents_shape, generator=generator, device=self._execution_device, dtype=prompt_embeds.dtype + ) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + latents = latents.to(self._execution_device) + + self.scheduler.set_timesteps(num_inference_steps) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + + extra_kwargs = {} + if accepts_eta: + extra_kwargs["eta"] = eta + + for t in self.progress_bar(self.scheduler.timesteps): + if guidance_scale == 1.0: + # guidance_scale of 1 means no guidance + latents_input = latents + context = prompt_embeds + else: + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + latents_input = torch.cat([latents] * 2) + context = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # predict the noise residual + noise_pred = self.unet(latents_input, t, encoder_hidden_states=context).sample + # perform guidance + if guidance_scale != 1.0: + noise_pred_uncond, noise_prediction_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample + + # scale and decode the image latents with vae + latents = 1 / self.vqvae.config.scaling_factor * latents + image = self.vqvae.decode(latents).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + +################################################################################ +# Code for the text transformer model +################################################################################ +""" PyTorch LDMBERT model.""" + + +logger = logging.get_logger(__name__) + +LDMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "ldm-bert", + # See all LDMBert models at https://huggingface.co/models?filter=ldmbert +] + + +LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "ldm-bert": "https://huggingface.co/valhalla/ldm-bert/blob/main/config.json", +} + + +""" LDMBERT model configuration""" + + +class LDMBertConfig(PretrainedConfig): + model_type = "ldmbert" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=30522, + max_position_embeddings=77, + encoder_layers=32, + encoder_ffn_dim=5120, + encoder_attention_heads=8, + head_dim=64, + encoder_layerdrop=0.0, + activation_function="gelu", + d_model=1280, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + pad_token_id=0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.head_dim = head_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__(pad_token_id=pad_token_id, **kwargs) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->LDMBert +class LDMBertAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + head_dim: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = False, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = head_dim + self.inner_dim = head_dim * num_heads + + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, self.inner_dim, bias=bias) + self.out_proj = nn.Linear(self.inner_dim, embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. 
+ attn_output = attn_output.reshape(bsz, tgt_len, self.inner_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class LDMBertEncoderLayer(nn.Module): + def __init__(self, config: LDMBertConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = LDMBertAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + head_dim=config.head_dim, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.bart.modeling_bart.BartPretrainedModel with Bart->LDMBert +class LDMBertPreTrainedModel(PreTrainedModel): + config_class = LDMBertConfig + base_model_prefix = "model" + _supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if 
module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (LDMBertEncoder,)): + module.gradient_checkpointing = value + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +class LDMBertEncoder(LDMBertPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`LDMBertEncoderLayer`]. + + Args: + config: LDMBertConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: LDMBertConfig): + super().__init__(config) + + self.dropout = config.dropout + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim) + self.embed_positions = nn.Embedding(config.max_position_embeddings, embed_dim) + self.layers = nn.ModuleList([LDMBertEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layer_norm = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`BartTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.BaseModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + seq_len = input_shape[1] + if position_ids is None: + position_ids = torch.arange(seq_len, dtype=torch.long, device=inputs_embeds.device).expand((1, -1)) + embed_pos = self.embed_positions(position_ids) + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." 
+ ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class LDMBertModel(LDMBertPreTrainedModel): + _no_split_modules = [] + + def __init__(self, config: LDMBertConfig): + super().__init__(config) + self.model = LDMBertEncoder(config) + self.to_logits = nn.Linear(config.hidden_size, config.vocab_size) + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py new file mode 100644 index 000000000..bb72b4d4e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py @@ -0,0 +1,189 @@ +import inspect +from typing import List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torch.utils.checkpoint + +from ...models import UNet2DModel, VQModel +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import PIL_INTERPOLATION +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +def preprocess(image): + w, h = image.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +class LDMSuperResolutionPipeline(DiffusionPipeline): + r""" + A pipeline for image super-resolution using latent diffusion. 
+ + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Parameters: + vqvae ([`VQModel`]): + Vector-quantized (VQ) model to encode and decode images to and from latent representations. + unet ([`UNet2DModel`]): + A `UNet2DModel` to denoise the encoded image. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], + [`EulerAncestralDiscreteScheduler`], [`DPMSolverMultistepScheduler`], or [`PNDMScheduler`]. + """ + + def __init__( + self, + vqvae: VQModel, + unet: UNet2DModel, + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + ): + super().__init__() + self.register_modules(vqvae=vqvae, unet=unet, scheduler=scheduler) + + @torch.no_grad() + def __call__( + self, + image: Union[torch.Tensor, PIL.Image.Image] = None, + batch_size: Optional[int] = 1, + num_inference_steps: Optional[int] = 100, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ) -> Union[Tuple, ImagePipelineOutput]: + r""" + The call function to the pipeline for generation. + + Args: + image (`torch.Tensor` or `PIL.Image.Image`): + `Image` or tensor representing an image batch to be used as the starting point for the process. + batch_size (`int`, *optional*, defaults to 1): + Number of images to generate. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`ImagePipelineOutput`] instead of a plain tuple. + + Example: + + ```py + >>> import requests + >>> from PIL import Image + >>> from io import BytesIO + >>> from diffusers import LDMSuperResolutionPipeline + >>> import torch + + >>> # load model and scheduler + >>> pipeline = LDMSuperResolutionPipeline.from_pretrained("CompVis/ldm-super-resolution-4x-openimages") + >>> pipeline = pipeline.to("cuda") + + >>> # let's download an image + >>> url = ( + ... "https://user-images.githubusercontent.com/38061659/199705896-b48e17b8-b231-47cd-a270-4ffa5a93fa3e.png" + ... 
) + >>> response = requests.get(url) + >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB") + >>> low_res_img = low_res_img.resize((128, 128)) + + >>> # run pipeline in inference (sample random noise and denoise) + >>> upscaled_image = pipeline(low_res_img, num_inference_steps=100, eta=1).images[0] + >>> # save image + >>> upscaled_image.save("ldm_generated_image.png") + ``` + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, torch.Tensor): + batch_size = image.shape[0] + else: + raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}") + + if isinstance(image, PIL.Image.Image): + image = preprocess(image) + + height, width = image.shape[-2:] + + # in_channels should be 6: 3 for latents, 3 for low resolution image + latents_shape = (batch_size, self.unet.config.in_channels // 2, height, width) + latents_dtype = next(self.unet.parameters()).dtype + + latents = randn_tensor(latents_shape, generator=generator, device=self.device, dtype=latents_dtype) + + image = image.to(device=self.device, dtype=latents_dtype) + + # set timesteps and move to the correct device + self.scheduler.set_timesteps(num_inference_steps, device=self.device) + timesteps_tensor = self.scheduler.timesteps + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature. + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_kwargs = {} + if accepts_eta: + extra_kwargs["eta"] = eta + + for t in self.progress_bar(timesteps_tensor): + # concat latents and low resolution image in the channel dimension. 
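+ # Note (derived from the shape comments earlier in this pipeline): the UNet consumes the
+ # concatenated 6-channel input (3 noisy latent channels plus the 3 low-resolution image
+ # channels), but its noise prediction matches only the latent channels, so scheduler.step()
+ # below updates `latents` while `image` stays fixed as the conditioning signal at every step.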
+ latents_input = torch.cat([latents, image], dim=1) + latents_input = self.scheduler.scale_model_input(latents_input, t) + # predict the noise residual + noise_pred = self.unet(latents_input, t).sample + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_kwargs).prev_sample + + # decode the image latents with the VQVAE + image = self.vqvae.decode(latents).sample + image = torch.clamp(image, -1.0, 1.0) + image = image / 2 + 0.5 + image = image.cpu().permute(0, 2, 3, 1).numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py new file mode 100644 index 000000000..aae3b1cb1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py @@ -0,0 +1,55 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_leditspp_stable_diffusion"] = ["LEditsPPPipelineStableDiffusion"] + _import_structure["pipeline_leditspp_stable_diffusion_xl"] = ["LEditsPPPipelineStableDiffusionXL"] + + _import_structure["pipeline_output"] = ["LEditsPPDiffusionPipelineOutput", "LEditsPPDiffusionPipelineOutput"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_leditspp_stable_diffusion import ( + LEditsPPDiffusionPipelineOutput, + LEditsPPInversionPipelineOutput, + LEditsPPPipelineStableDiffusion, + ) + from .pipeline_leditspp_stable_diffusion_xl import LEditsPPPipelineStableDiffusionXL + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py new file mode 100644 index 000000000..a6357c4cd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py @@ -0,0 +1,1505 @@ +import inspect +import math +from itertools import repeat +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from packaging import version +from transformers import CLIPImageProcessor, 
CLIPTextModel, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import Attention, AttnProcessor +from ...models.lora import adjust_lora_scale_text_encoder +from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import LEditsPPDiffusionPipelineOutput, LEditsPPInversionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import LEditsPPPipelineStableDiffusion + + >>> pipe = LEditsPPPipelineStableDiffusion.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/cherry_blossom.png" + >>> image = download_image(img_url) + + >>> _ = pipe.invert( + ... image = image, + ... num_inversion_steps=50, + ... skip=0.1 + ... ) + + >>> edited_image = pipe( + ... editing_prompt=["cherry blossom"], + ... edit_guidance_scale=10.0, + ... 
edit_threshold=0.75, + ).images[0] + ``` +""" + + +# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionAttendAndExcitePipeline.AttentionStore +class LeditsAttentionStore: + @staticmethod + def get_empty_store(): + return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []} + + def __call__(self, attn, is_cross: bool, place_in_unet: str, editing_prompts, PnP=False): + # attn.shape = batch_size * head_size, seq_len query, seq_len_key + if attn.shape[1] <= self.max_size: + bs = 1 + int(PnP) + editing_prompts + skip = 2 if PnP else 1 # skip PnP & unconditional + attn = torch.stack(attn.split(self.batch_size)).permute(1, 0, 2, 3) + source_batch_size = int(attn.shape[1] // bs) + self.forward(attn[:, skip * source_batch_size :], is_cross, place_in_unet) + + def forward(self, attn, is_cross: bool, place_in_unet: str): + key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" + + self.step_store[key].append(attn) + + def between_steps(self, store_step=True): + if store_step: + if self.average: + if len(self.attention_store) == 0: + self.attention_store = self.step_store + else: + for key in self.attention_store: + for i in range(len(self.attention_store[key])): + self.attention_store[key][i] += self.step_store[key][i] + else: + if len(self.attention_store) == 0: + self.attention_store = [self.step_store] + else: + self.attention_store.append(self.step_store) + + self.cur_step += 1 + self.step_store = self.get_empty_store() + + def get_attention(self, step: int): + if self.average: + attention = { + key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store + } + else: + assert step is not None + attention = self.attention_store[step] + return attention + + def aggregate_attention( + self, attention_maps, prompts, res: Union[int, Tuple[int]], from_where: List[str], is_cross: bool, select: int + ): + out = [[] for x in range(self.batch_size)] + if isinstance(res, int): + num_pixels = res**2 + resolution = (res, res) + else: + num_pixels = res[0] * res[1] + resolution = res[:2] + + for location in from_where: + for bs_item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]: + for batch, item in enumerate(bs_item): + if item.shape[1] == num_pixels: + cross_maps = item.reshape(len(prompts), -1, *resolution, item.shape[-1])[select] + out[batch].append(cross_maps) + + out = torch.stack([torch.cat(x, dim=0) for x in out]) + # average over heads + out = out.sum(1) / out.shape[1] + return out + + def __init__(self, average: bool, batch_size=1, max_resolution=16, max_size: int = None): + self.step_store = self.get_empty_store() + self.attention_store = [] + self.cur_step = 0 + self.average = average + self.batch_size = batch_size + if max_size is None: + self.max_size = max_resolution**2 + elif max_size is not None and max_resolution is None: + self.max_size = max_size + else: + raise ValueError("Only allowed to set one of max_resolution or max_size") + + +# Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionAttendAndExcitePipeline.GaussianSmoothing +class LeditsGaussianSmoothing: + def __init__(self, device): + kernel_size = [3, 3] + sigma = [0.5, 0.5] + + # The gaussian kernel is the product of the gaussian function of each dimension. 
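+ # A fixed 3x3 kernel with sigma 0.5 is built below by multiplying the 1-D Gaussian of each
+ # axis over the meshgrid and normalizing it to sum to 1; it is then reshaped to (1, 1, 3, 3)
+ # so __call__ can apply it with F.conv2d as a single-channel smoothing filter (callers in
+ # this file reflect-pad the attention map by one pixel first, so the smoothed map keeps its
+ # spatial size).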
+ kernel = 1 + meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size]) + for size, std, mgrid in zip(kernel_size, sigma, meshgrids): + mean = (size - 1) / 2 + kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2)) + + # Make sure sum of values in gaussian kernel equals 1. + kernel = kernel / torch.sum(kernel) + + # Reshape to depthwise convolutional weight + kernel = kernel.view(1, 1, *kernel.size()) + kernel = kernel.repeat(1, *[1] * (kernel.dim() - 1)) + + self.weight = kernel.to(device) + + def __call__(self, input): + """ + Arguments: + Apply gaussian filter to input. + input (torch.Tensor): Input to apply gaussian filter on. + Returns: + filtered (torch.Tensor): Filtered output. + """ + return F.conv2d(input, weight=self.weight.to(input.dtype)) + + +class LEDITSCrossAttnProcessor: + def __init__(self, attention_store, place_in_unet, pnp, editing_prompts): + self.attnstore = attention_store + self.place_in_unet = place_in_unet + self.editing_prompts = editing_prompts + self.pnp = pnp + + def __call__( + self, + attn: Attention, + hidden_states, + encoder_hidden_states, + attention_mask=None, + temb=None, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + self.attnstore( + attention_probs, + is_cross=True, + place_in_unet=self.place_in_unet, + editing_prompts=self.editing_prompts, + PnP=self.pnp, + ) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states / attn.rescale_output_factor + return hidden_states + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class LEditsPPPipelineStableDiffusion( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin +): + """ + Pipeline for textual image editing using LEDits++ with Stable Diffusion. 
+ + This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass + documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular + device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer ([`~transformers.CLIPTokenizer`]): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of + [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically + be set to [`DPMSolverMultistepScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, DPMSolverMultistepScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if not isinstance(scheduler, DDIMScheduler) and not isinstance(scheduler, DPMSolverMultistepScheduler): + scheduler = DPMSolverMultistepScheduler.from_config( + scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2 + ) + logger.warning( + "This pipeline only supports DDIMScheduler and DPMSolverMultistepScheduler. " + "The scheduler has been changed to DPMSolverMultistepScheduler." + ) + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + self.inversion_steps = None + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, eta, generator=None): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + negative_prompt=None, + editing_prompt_embeddings=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if editing_prompt_embeddings is not None and negative_prompt_embeds is not None: + if editing_prompt_embeddings.shape != negative_prompt_embeds.shape: + raise ValueError( + "`editing_prompt_embeddings` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `editing_prompt_embeddings` {editing_prompt_embeddings.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, latents): + # shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + + # if latents.shape != shape: + # raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_unet(self, attention_store, PnP: bool = False): + attn_procs = {} + for name in self.unet.attn_processors.keys(): + if name.startswith("mid_block"): + place_in_unet = "mid" + elif name.startswith("up_blocks"): + place_in_unet = "up" + elif name.startswith("down_blocks"): + place_in_unet = "down" + else: + continue + + if "attn2" in name and place_in_unet != "mid": + attn_procs[name] = LEDITSCrossAttnProcessor( + attention_store=attention_store, + place_in_unet=place_in_unet, + pnp=PnP, + editing_prompts=self.enabled_editing_prompts, + ) + else: + attn_procs[name] = AttnProcessor() + + self.unet.set_attn_processor(attn_procs) + + def encode_prompt( + self, + device, + num_images_per_prompt, + enable_edit_guidance, + negative_prompt=None, + editing_prompt=None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + editing_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + enable_edit_guidance (`bool`): + whether to perform any editing or reconstruct the input image instead + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + editing_prompt (`str` or `List[str]`, *optional*): + Editing prompt(s) to be encoded. If not defined, one has to pass + `editing_prompt_embeds` instead. + editing_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + batch_size = self.batch_size + num_edit_tokens = None + + if negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but exoected" + f"{batch_size} based on the input images. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = negative_prompt_embeds.dtype + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + if enable_edit_guidance: + if editing_prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + # if isinstance(self, TextualInversionLoaderMixin): + # prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + if isinstance(editing_prompt, str): + editing_prompt = [editing_prompt] + + max_length = negative_prompt_embeds.shape[1] + text_inputs = self.tokenizer( + [x for item in editing_prompt for x in repeat(item, batch_size)], + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + return_length=True, + ) + + num_edit_tokens = text_inputs.length - 2 # not counting startoftext and endoftext + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer( + [x for item in editing_prompt for x in repeat(item, batch_size)], + padding="longest", + return_tensors="pt", + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + editing_prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + editing_prompt_embeds = editing_prompt_embeds[0] + else: + editing_prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + editing_prompt_embeds = editing_prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ editing_prompt_embeds = self.text_encoder.text_model.final_layer_norm(editing_prompt_embeds) + + editing_prompt_embeds = editing_prompt_embeds.to(dtype=negative_prompt_embeds.dtype, device=device) + + bs_embed_edit, seq_len, _ = editing_prompt_embeds.shape + editing_prompt_embeds = editing_prompt_embeds.to(dtype=negative_prompt_embeds.dtype, device=device) + editing_prompt_embeds = editing_prompt_embeds.repeat(1, num_images_per_prompt, 1) + editing_prompt_embeds = editing_prompt_embeds.view(bs_embed_edit * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return editing_prompt_embeds, negative_prompt_embeds, num_edit_tokens + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + editing_prompt: Optional[Union[str, List[str]]] = None, + editing_prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, + edit_guidance_scale: Optional[Union[float, List[float]]] = 5, + edit_warmup_steps: Optional[Union[int, List[int]]] = 0, + edit_cooldown_steps: Optional[Union[int, List[int]]] = None, + edit_threshold: Optional[Union[float, List[float]]] = 0.9, + user_mask: Optional[torch.FloatTensor] = None, + sem_guidance: Optional[List[torch.Tensor]] = None, + use_cross_attn_mask: bool = False, + use_intersect_mask: bool = True, + attn_store_steps: Optional[List[int]] = [], + store_averaged_over_steps: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusion.invert`] + method has to be called beforehand. Edits will always be performed for the last inverted image(s). + + Args: + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + generator (`torch.Generator`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] instead of a + plain tuple. + editing_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. The image is reconstructed by setting + `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`. + editing_prompt_embeds (`torch.Tensor>`, *optional*): + Pre-computed embeddings to use for guiding the image generation. Guidance direction of embedding should be + specified via `reverse_editing_direction`. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): + Whether the corresponding prompt in `editing_prompt` should be increased or decreased. + edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): + Guidance scale for guiding the image generation. If provided as list values should correspond to `editing_prompt`. + `edit_guidance_scale` is defined as `s_e` of equation 12 of + [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): + Number of diffusion steps (for each prompt) for which guidance will not be applied. + edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): + Number of diffusion steps (for each prompt) after which guidance will no longer be applied. + edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): + Masking threshold of guidance. Threshold should be proportional to the image region that is modified. + 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + user_mask (`torch.FloatTensor`, *optional*): + User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit + masks do not meet user preferences. + sem_guidance (`List[torch.Tensor]`, *optional*): + List of pre-generated guidance vectors to be applied at generation. Length of the list has to + correspond to `num_inference_steps`. + use_cross_attn_mask (`bool`, defaults to `False`): + Whether cross-attention masks are used. Cross-attention masks are always used when use_intersect_mask + is set to true. Cross-attention masks are defined as 'M^1' of equation 12 of + [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + use_intersect_mask (`bool`, defaults to `True`): + Whether the masking term is calculated as intersection of cross-attention masks and masks derived + from the noise estimate. Cross-attention mask are defined as 'M^1' and masks derived from the noise + estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + attn_store_steps (`List[int]`, *optional*): + Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes. + store_averaged_over_steps (`bool`, defaults to `True`): + Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. 
+ If False, attention maps for each step are stores separately. Just for visualization purposes. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, + otherwise a `tuple. When returning a tuple, the first element is a list with the generated images, and the + second element is a list of `bool`s denoting whether the corresponding generated image likely represents + "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + """ + + if self.inversion_steps is None: + raise ValueError( + "You need to invert an input image first before calling the pipeline. The `invert` method has to be called beforehand. Edits will always be performed for the last inverted image(s)." + ) + + eta = self.eta + num_images_per_prompt = 1 + latents = self.init_latents + + zs = self.zs + self.scheduler.set_timesteps(len(self.scheduler.timesteps)) + + if use_intersect_mask: + use_cross_attn_mask = True + + if use_cross_attn_mask: + self.smoothing = LeditsGaussianSmoothing(self.device) + + if user_mask is not None: + user_mask = user_mask.to(self.device) + + org_prompt = "" + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + negative_prompt, + editing_prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + batch_size = self.batch_size + + if editing_prompt: + enable_edit_guidance = True + if isinstance(editing_prompt, str): + editing_prompt = [editing_prompt] + self.enabled_editing_prompts = len(editing_prompt) + elif editing_prompt_embeds is not None: + enable_edit_guidance = True + self.enabled_editing_prompts = editing_prompt_embeds.shape[0] + else: + self.enabled_editing_prompts = 0 + enable_edit_guidance = False + + # 3. 
Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + edit_concepts, uncond_embeddings, num_edit_tokens = self.encode_prompt( + editing_prompt=editing_prompt, + device=self.device, + num_images_per_prompt=num_images_per_prompt, + enable_edit_guidance=enable_edit_guidance, + negative_prompt=negative_prompt, + editing_prompt_embeds=editing_prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if enable_edit_guidance: + text_embeddings = torch.cat([uncond_embeddings, edit_concepts]) + self.text_cross_attention_maps = [editing_prompt] if isinstance(editing_prompt, str) else editing_prompt + else: + text_embeddings = torch.cat([uncond_embeddings]) + + # 4. Prepare timesteps + # self.scheduler.set_timesteps(num_inference_steps, device=self.device) + timesteps = self.inversion_steps + t_to_idx = {int(v): k for k, v in enumerate(timesteps[-zs.shape[0] :])} + + if use_cross_attn_mask: + self.attention_store = LeditsAttentionStore( + average=store_averaged_over_steps, + batch_size=batch_size, + max_size=(latents.shape[-2] / 4.0) * (latents.shape[-1] / 4.0), + max_resolution=None, + ) + self.prepare_unet(self.attention_store, PnP=False) + resolution = latents.shape[-2:] + att_res = (int(resolution[0] / 4), int(resolution[1] / 4)) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + None, + None, + text_embeddings.dtype, + self.device, + latents, + ) + + # 6. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(eta) + + self.sem_guidance = None + self.activation_mask = None + + # 7. 
Denoising loop + num_warmup_steps = 0 + with self.progress_bar(total=len(timesteps)) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + + if enable_edit_guidance: + latent_model_input = torch.cat([latents] * (1 + self.enabled_editing_prompts)) + else: + latent_model_input = latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + text_embed_input = text_embeddings + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embed_input).sample + + noise_pred_out = noise_pred.chunk(1 + self.enabled_editing_prompts) # [b,4, 64, 64] + noise_pred_uncond = noise_pred_out[0] + noise_pred_edit_concepts = noise_pred_out[1:] + + noise_guidance_edit = torch.zeros( + noise_pred_uncond.shape, + device=self.device, + dtype=noise_pred_uncond.dtype, + ) + + if sem_guidance is not None and len(sem_guidance) > i: + noise_guidance_edit += sem_guidance[i].to(self.device) + + elif enable_edit_guidance: + if self.activation_mask is None: + self.activation_mask = torch.zeros( + (len(timesteps), len(noise_pred_edit_concepts), *noise_pred_edit_concepts[0].shape) + ) + + if self.sem_guidance is None: + self.sem_guidance = torch.zeros((len(timesteps), *noise_pred_uncond.shape)) + + for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts): + if isinstance(edit_warmup_steps, list): + edit_warmup_steps_c = edit_warmup_steps[c] + else: + edit_warmup_steps_c = edit_warmup_steps + if i < edit_warmup_steps_c: + continue + + if isinstance(edit_guidance_scale, list): + edit_guidance_scale_c = edit_guidance_scale[c] + else: + edit_guidance_scale_c = edit_guidance_scale + + if isinstance(edit_threshold, list): + edit_threshold_c = edit_threshold[c] + else: + edit_threshold_c = edit_threshold + if isinstance(reverse_editing_direction, list): + reverse_editing_direction_c = reverse_editing_direction[c] + else: + reverse_editing_direction_c = reverse_editing_direction + + if isinstance(edit_cooldown_steps, list): + edit_cooldown_steps_c = edit_cooldown_steps[c] + elif edit_cooldown_steps is None: + edit_cooldown_steps_c = i + 1 + else: + edit_cooldown_steps_c = edit_cooldown_steps + + if i >= edit_cooldown_steps_c: + continue + + noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond + + if reverse_editing_direction_c: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1 + + noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c + + if user_mask is not None: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * user_mask + + if use_cross_attn_mask: + out = self.attention_store.aggregate_attention( + attention_maps=self.attention_store.step_store, + prompts=self.text_cross_attention_maps, + res=att_res, + from_where=["up", "down"], + is_cross=True, + select=self.text_cross_attention_maps.index(editing_prompt[c]), + ) + attn_map = out[:, :, :, 1 : 1 + num_edit_tokens[c]] # 0 -> startoftext + + # average over all tokens + if attn_map.shape[3] != num_edit_tokens[c]: + raise ValueError( + f"Incorrect shape of attention_map. Expected size {num_edit_tokens[c]}, but found {attn_map.shape[3]}!" 
+ ) + + attn_map = torch.sum(attn_map, dim=3) + + # gaussian_smoothing + attn_map = F.pad(attn_map.unsqueeze(1), (1, 1, 1, 1), mode="reflect") + attn_map = self.smoothing(attn_map).squeeze(1) + + # torch.quantile function expects float32 + if attn_map.dtype == torch.float32: + tmp = torch.quantile(attn_map.flatten(start_dim=1), edit_threshold_c, dim=1) + else: + tmp = torch.quantile( + attn_map.flatten(start_dim=1).to(torch.float32), edit_threshold_c, dim=1 + ).to(attn_map.dtype) + attn_mask = torch.where( + attn_map >= tmp.unsqueeze(1).unsqueeze(1).repeat(1, *att_res), 1.0, 0.0 + ) + + # resolution must match latent space dimension + attn_mask = F.interpolate( + attn_mask.unsqueeze(1), + noise_guidance_edit_tmp.shape[-2:], # 64,64 + ).repeat(1, 4, 1, 1) + self.activation_mask[i, c] = attn_mask.detach().cpu() + if not use_intersect_mask: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask + + if use_intersect_mask: + if t <= 800: + noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp) + noise_guidance_edit_tmp_quantile = torch.sum( + noise_guidance_edit_tmp_quantile, dim=1, keepdim=True + ) + noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat( + 1, self.unet.config.in_channels, 1, 1 + ) + + # torch.quantile function expects float32 + if noise_guidance_edit_tmp_quantile.dtype == torch.float32: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2), + edit_threshold_c, + dim=2, + keepdim=False, + ) + else: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32), + edit_threshold_c, + dim=2, + keepdim=False, + ).to(noise_guidance_edit_tmp_quantile.dtype) + + intersect_mask = ( + torch.where( + noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None], + torch.ones_like(noise_guidance_edit_tmp), + torch.zeros_like(noise_guidance_edit_tmp), + ) + * attn_mask + ) + + self.activation_mask[i, c] = intersect_mask.detach().cpu() + + noise_guidance_edit_tmp = noise_guidance_edit_tmp * intersect_mask + + else: + # print(f"only attention mask for step {i}") + noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask + + elif not use_cross_attn_mask: + # calculate quantile + noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp) + noise_guidance_edit_tmp_quantile = torch.sum( + noise_guidance_edit_tmp_quantile, dim=1, keepdim=True + ) + noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(1, 4, 1, 1) + + # torch.quantile function expects float32 + if noise_guidance_edit_tmp_quantile.dtype == torch.float32: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2), + edit_threshold_c, + dim=2, + keepdim=False, + ) + else: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32), + edit_threshold_c, + dim=2, + keepdim=False, + ).to(noise_guidance_edit_tmp_quantile.dtype) + + self.activation_mask[i, c] = ( + torch.where( + noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None], + torch.ones_like(noise_guidance_edit_tmp), + torch.zeros_like(noise_guidance_edit_tmp), + ) + .detach() + .cpu() + ) + + noise_guidance_edit_tmp = torch.where( + noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None], + noise_guidance_edit_tmp, + torch.zeros_like(noise_guidance_edit_tmp), + ) + + noise_guidance_edit += noise_guidance_edit_tmp + + self.sem_guidance[i] = noise_guidance_edit.detach().cpu() + + noise_pred = noise_pred_uncond + noise_guidance_edit + + if enable_edit_guidance and 
self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg( + noise_pred, + noise_pred_edit_concepts.mean(dim=0, keepdim=False), + guidance_rescale=self.guidance_rescale, + ) + + idx = t_to_idx[int(t)] + latents = self.scheduler.step( + noise_pred, t, latents, variance_noise=zs[idx], **extra_step_kwargs + ).prev_sample + + # step callback + if use_cross_attn_mask: + store_step = i in attn_store_steps + self.attention_store.between_steps(store_step) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + # prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 8. Post-processing + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return LEditsPPDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + @torch.no_grad() + def invert( + self, + image: PipelineImageInput, + source_prompt: str = "", + source_guidance_scale: float = 3.5, + num_inversion_steps: int = 30, + skip: float = 0.15, + generator: Optional[torch.Generator] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + height: Optional[int] = None, + width: Optional[int] = None, + resize_mode: Optional[str] = "default", + crops_coords: Optional[Tuple[int, int, int, int]] = None, + ): + r""" + The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) + will be performed instead. + + Args: + image (`PipelineImageInput`): + Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect + ratio. + source_prompt (`str`, defaults to `""`): + Prompt describing the input image that will be used for guidance during inversion. Guidance is disabled + if the `source_prompt` is `""`. + source_guidance_scale (`float`, defaults to `3.5`): + Strength of guidance during inversion. + num_inversion_steps (`int`, defaults to `30`): + Number of total performed inversion steps after discarding the initial `skip` steps. + skip (`float`, defaults to `0.15`): + Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values + will lead to stronger changes to the input image. `skip` has to be between `0` and `1`. 
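+                As an illustrative example (not part of the original docstring): with `num_inversion_steps=50` and `skip=0.2`, the scheduler is run with `int(50 * (1 + 0.2)) = 60` timesteps and only the 50 lowest-noise ones are kept for inversion and subsequent editing.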
+ generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + inversion deterministic. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + height (`int`, *optional*, defaults to `None`): + The height of the preprocessed image. If `None`, `get_default_height_width()` is used to get the default height. + width (`int`, *optional*, defaults to `None`): + The width of the preprocessed image. If `None`, `get_default_height_width()` is used to get the default width. + resize_mode (`str`, *optional*, defaults to `default`): + The resize mode, can be one of `default` or `fill`. If `default`, will resize the image to fit + within the specified width and height, and it may not maintain the original aspect ratio. + If `fill`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image + within the dimensions, filling empty space with data from the image. + If `crop`, will resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center the image + within the dimensions, cropping the excess. + Note that resize_mode `fill` and `crop` are only supported for PIL image input. + crops_coords (`List[Tuple[int, int, int, int]]`, *optional*, defaults to `None`): + The crop coordinates for each image in the batch. If `None`, will not crop the image. + + Returns: + [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: + Output will contain the resized input image(s) and respective VAE reconstruction(s). + """ + # Reset attn processor, we do not want to store attn maps during inversion + self.unet.set_attn_processor(AttnProcessor()) + + self.eta = 1.0 + + self.scheduler.config.timestep_spacing = "leading" + self.scheduler.set_timesteps(int(num_inversion_steps * (1 + skip))) + self.inversion_steps = self.scheduler.timesteps[-num_inversion_steps:] + timesteps = self.inversion_steps + + # 1. encode image + x0, resized = self.encode_image( + image, + dtype=self.text_encoder.dtype, + height=height, + width=width, + resize_mode=resize_mode, + crops_coords=crops_coords, + ) + self.batch_size = x0.shape[0] + + # autoencoder reconstruction + image_rec = self.vae.decode(x0 / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0] + image_rec = self.image_processor.postprocess(image_rec, output_type="pil") + + # 2. get embeddings + do_classifier_free_guidance = source_guidance_scale > 1.0 + + lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + + uncond_embedding, text_embeddings, _ = self.encode_prompt( + num_images_per_prompt=1, + device=self.device, + negative_prompt=None, + enable_edit_guidance=do_classifier_free_guidance, + editing_prompt=source_prompt, + lora_scale=lora_scale, + clip_skip=clip_skip, + ) + + # 3.
find zs and xts + variance_noise_shape = (num_inversion_steps, *x0.shape) + + # intermediate latents + t_to_idx = {int(v): k for k, v in enumerate(timesteps)} + xts = torch.zeros(size=variance_noise_shape, device=self.device, dtype=uncond_embedding.dtype) + + for t in reversed(timesteps): + idx = num_inversion_steps - t_to_idx[int(t)] - 1 + noise = randn_tensor(shape=x0.shape, generator=generator, device=self.device, dtype=x0.dtype) + xts[idx] = self.scheduler.add_noise(x0, noise, torch.Tensor([t])) + xts = torch.cat([x0.unsqueeze(0), xts], dim=0) + + self.scheduler.set_timesteps(len(self.scheduler.timesteps)) + # noise maps + zs = torch.zeros(size=variance_noise_shape, device=self.device, dtype=uncond_embedding.dtype) + + with self.progress_bar(total=len(timesteps)) as progress_bar: + for t in timesteps: + idx = num_inversion_steps - t_to_idx[int(t)] - 1 + # 1. predict noise residual + xt = xts[idx + 1] + + noise_pred = self.unet(xt, timestep=t, encoder_hidden_states=uncond_embedding).sample + + if not source_prompt == "": + noise_pred_cond = self.unet(xt, timestep=t, encoder_hidden_states=text_embeddings).sample + noise_pred = noise_pred + source_guidance_scale * (noise_pred_cond - noise_pred) + + xtm1 = xts[idx] + z, xtm1_corrected = compute_noise(self.scheduler, xtm1, xt, t, noise_pred, self.eta) + zs[idx] = z + + # correction to avoid error accumulation + xts[idx] = xtm1_corrected + + progress_bar.update() + + self.init_latents = xts[-1].expand(self.batch_size, -1, -1, -1) + zs = zs.flip(0) + self.zs = zs + + return LEditsPPInversionPipelineOutput(images=resized, vae_reconstruction_images=image_rec) + + @torch.no_grad() + def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="default", crops_coords=None): + image = self.image_processor.preprocess( + image=image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + resized = self.image_processor.postprocess(image=image, output_type="pil") + + if max(image.shape[-2:]) > self.vae.config["sample_size"] * 1.5: + logger.warning( + "Your input images far exceed the default resolution of the underlying diffusion model. " + "The output images may contain severe artifacts! " + "Consider down-sampling the input using the `height` and `width` parameters" + ) + image = image.to(dtype) + + x0 = self.vae.encode(image.to(self.device)).latent_dist.mode() + x0 = x0.to(dtype) + x0 = self.vae.config.scaling_factor * x0 + return x0, resized + + +def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, eta): + # 1. get previous step value (=t-1) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[timestep] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + + # 4. Clip "predicted x_0" + if scheduler.config.clip_sample: + pred_original_sample = torch.clamp(pred_original_sample, -1, 1) + + # 5. 
compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = scheduler._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred + + # modifed so that updated xtm1 is returned as well (to avoid error accumulation) + mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + if variance > 0.0: + noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta) + else: + noise = torch.tensor([0.0]).to(latents.device) + + return noise, mu_xt + (eta * variance**0.5) * noise + + +def compute_noise_sde_dpm_pp_2nd(scheduler, prev_latents, latents, timestep, noise_pred, eta): + def first_order_update(model_output, sample): # timestep, prev_timestep, sample): + sigma_t, sigma_s = scheduler.sigmas[scheduler.step_index + 1], scheduler.sigmas[scheduler.step_index] + alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = scheduler._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + + mu_xt = (sigma_t / sigma_s * torch.exp(-h)) * sample + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + mu_xt = scheduler.dpm_solver_first_order_update( + model_output=model_output, sample=sample, noise=torch.zeros_like(sample) + ) + + sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) + if sigma > 0.0: + noise = (prev_latents - mu_xt) / sigma + else: + noise = torch.tensor([0.0]).to(sample.device) + + prev_sample = mu_xt + sigma * noise + return noise, prev_sample + + def second_order_update(model_output_list, sample): # timestep_list, prev_timestep, sample): + sigma_t, sigma_s0, sigma_s1 = ( + scheduler.sigmas[scheduler.step_index + 1], + scheduler.sigmas[scheduler.step_index], + scheduler.sigmas[scheduler.step_index - 1], + ) + + alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = scheduler._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = scheduler._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + + mu_xt = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + ) + + sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) + if sigma > 0.0: + noise = (prev_latents - mu_xt) / sigma + else: + noise = torch.tensor([0.0]).to(sample.device) + + prev_sample = mu_xt + sigma * noise + + return noise, prev_sample + + if scheduler.step_index is None: + scheduler._init_step_index(timestep) + + model_output = scheduler.convert_model_output(model_output=noise_pred, sample=latents) + for i in range(scheduler.config.solver_order - 1): + scheduler.model_outputs[i] = scheduler.model_outputs[i + 1] + scheduler.model_outputs[-1] = model_output + + if scheduler.lower_order_nums < 1: + noise, prev_sample = first_order_update(model_output, latents) + else: + noise, prev_sample = second_order_update(scheduler.model_outputs, latents) + + if 
scheduler.lower_order_nums < scheduler.config.solver_order: + scheduler.lower_order_nums += 1 + + # upon completion increase step index by one + scheduler._step_index += 1 + + return noise, prev_sample + + +def compute_noise(scheduler, *args): + if isinstance(scheduler, DDIMScheduler): + return compute_noise_ddim(scheduler, *args) + elif ( + isinstance(scheduler, DPMSolverMultistepScheduler) + and scheduler.config.algorithm_type == "sde-dpmsolver++" + and scheduler.config.solver_order == 2 + ): + return compute_noise_sde_dpm_pp_2nd(scheduler, *args) + else: + raise NotImplementedError diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py new file mode 100644 index 000000000..874a10a7c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py @@ -0,0 +1,1797 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + Attention, + AttnProcessor, + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import DDIMScheduler, DPMSolverMultistepScheduler +from ...utils import ( + USE_PEFT_BACKEND, + is_invisible_watermark_available, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .pipeline_output import LEditsPPDiffusionPipelineOutput, LEditsPPInversionPipelineOutput + + +if is_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import PIL + >>> import requests + >>> from io import BytesIO + + >>> from diffusers import LEditsPPPipelineStableDiffusionXL + + >>> pipe = 
LEditsPPPipelineStableDiffusionXL.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + >>> img_url = "https://www.aiml.informatik.tu-darmstadt.de/people/mbrack/tennis.jpg" + >>> image = download_image(img_url) + + >>> _ = pipe.invert( + ... image = image, + ... num_inversion_steps=50, + ... skip=0.2 + ... ) + + >>> edited_image = pipe( + ... editing_prompt=["tennis ball","tomato"], + ... reverse_editing_direction=[True,False], + ... edit_guidance_scale=[5.0,10.0], + ... edit_threshold=[0.9,0.85], + ).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LeditsAttentionStore +class LeditsAttentionStore: + @staticmethod + def get_empty_store(): + return {"down_cross": [], "mid_cross": [], "up_cross": [], "down_self": [], "mid_self": [], "up_self": []} + + def __call__(self, attn, is_cross: bool, place_in_unet: str, editing_prompts, PnP=False): + # attn.shape = batch_size * head_size, seq_len query, seq_len_key + if attn.shape[1] <= self.max_size: + bs = 1 + int(PnP) + editing_prompts + skip = 2 if PnP else 1 # skip PnP & unconditional + attn = torch.stack(attn.split(self.batch_size)).permute(1, 0, 2, 3) + source_batch_size = int(attn.shape[1] // bs) + self.forward(attn[:, skip * source_batch_size :], is_cross, place_in_unet) + + def forward(self, attn, is_cross: bool, place_in_unet: str): + key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" + + self.step_store[key].append(attn) + + def between_steps(self, store_step=True): + if store_step: + if self.average: + if len(self.attention_store) == 0: + self.attention_store = self.step_store + else: + for key in self.attention_store: + for i in range(len(self.attention_store[key])): + self.attention_store[key][i] += self.step_store[key][i] + else: + if len(self.attention_store) == 0: + self.attention_store = [self.step_store] + else: + self.attention_store.append(self.step_store) + + self.cur_step += 1 + self.step_store = self.get_empty_store() + + def get_attention(self, step: int): + if self.average: + attention = { + key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store + } + else: + assert step is not None + attention = self.attention_store[step] + return attention + + def aggregate_attention( + self, attention_maps, prompts, res: Union[int, Tuple[int]], from_where: List[str], is_cross: bool, select: int + ): + out = [[] for x in range(self.batch_size)] + if isinstance(res, int): + num_pixels = res**2 + resolution = (res, res) + else: + num_pixels = res[0] * res[1] + resolution = res[:2] + + for location in from_where: + for bs_item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]: + for batch, item in enumerate(bs_item): + if item.shape[1] == num_pixels: + cross_maps = item.reshape(len(prompts), -1, *resolution, item.shape[-1])[select] + out[batch].append(cross_maps) + + out = torch.stack([torch.cat(x, dim=0) for x in out]) + # average over heads + out = out.sum(1) / out.shape[1] + return out + + def __init__(self, average: bool, batch_size=1, max_resolution=16, max_size: int = None): + self.step_store = self.get_empty_store() + self.attention_store = [] + self.cur_step = 0 + self.average = average + self.batch_size = batch_size + if max_size is None: + self.max_size = max_resolution**2 + elif max_size is not 
None and max_resolution is None: + self.max_size = max_size + else: + raise ValueError("Only allowed to set one of max_resolution or max_size") + + +# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LeditsGaussianSmoothing +class LeditsGaussianSmoothing: + def __init__(self, device): + kernel_size = [3, 3] + sigma = [0.5, 0.5] + + # The gaussian kernel is the product of the gaussian function of each dimension. + kernel = 1 + meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size]) + for size, std, mgrid in zip(kernel_size, sigma, meshgrids): + mean = (size - 1) / 2 + kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2)) + + # Make sure sum of values in gaussian kernel equals 1. + kernel = kernel / torch.sum(kernel) + + # Reshape to depthwise convolutional weight + kernel = kernel.view(1, 1, *kernel.size()) + kernel = kernel.repeat(1, *[1] * (kernel.dim() - 1)) + + self.weight = kernel.to(device) + + def __call__(self, input): + """ + Arguments: + Apply gaussian filter to input. + input (torch.Tensor): Input to apply gaussian filter on. + Returns: + filtered (torch.Tensor): Filtered output. + """ + return F.conv2d(input, weight=self.weight.to(input.dtype)) + + +# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEDITSCrossAttnProcessor +class LEDITSCrossAttnProcessor: + def __init__(self, attention_store, place_in_unet, pnp, editing_prompts): + self.attnstore = attention_store + self.place_in_unet = place_in_unet + self.editing_prompts = editing_prompts + self.pnp = pnp + + def __call__( + self, + attn: Attention, + hidden_states, + encoder_hidden_states, + attention_mask=None, + temb=None, + ): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + self.attnstore( + attention_probs, + is_cross=True, + place_in_unet=self.place_in_unet, + editing_prompts=self.editing_prompts, + PnP=self.pnp, + ) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states / attn.rescale_output_factor + return hidden_states + + +class LEditsPPPipelineStableDiffusionXL( + DiffusionPipeline, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, +): + """ + Pipeline for textual image editing using LEDits++ with Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionXLPipeline`]. Check the superclass + documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular + device, etc.). 
+ + In addition, the pipeline inherits the following loading methods: + - *LoRA*: [`LEditsPPPipelineStableDiffusionXL.load_lora_weights`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] + + as well as the following saving methods: + - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([`~transformers.CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer ([`~transformers.CLIPTokenizer`]): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 ([`~transformers.CLIPTokenizer`]): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DPMSolverMultistepScheduler`] or [`DDIMScheduler`]. If any other scheduler is passed it will automatically + be set to [`DPMSolverMultistepScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `True`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to + watermark output images. If not defined, it will default to `True` if the package is installed, otherwise no + watermarker will be used.
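+ + A minimal, illustrative sketch (an addition, not part of the upstream docstring) of explicitly configuring a supported scheduler; it mirrors the conversion that `__init__` performs when any other scheduler is passed: + + ```py + >>> from diffusers import DPMSolverMultistepScheduler, LEditsPPPipelineStableDiffusionXL + + >>> pipe = LEditsPPPipelineStableDiffusionXL.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0") + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config( + ...     pipe.scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2 + ... ) + ```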
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "negative_add_time_ids", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DPMSolverMultistepScheduler, DDIMScheduler], + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + if not isinstance(scheduler, DDIMScheduler) and not isinstance(scheduler, DPMSolverMultistepScheduler): + self.scheduler = DPMSolverMultistepScheduler.from_config( + scheduler.config, algorithm_type="sde-dpmsolver++", solver_order=2 + ) + logger.warning( + "This pipeline only supports DDIMScheduler and DPMSolverMultistepScheduler. " + "The scheduler has been changed to DPMSolverMultistepScheduler." + ) + + self.default_sample_size = self.unet.config.sample_size + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + self.inversion_steps = None + + def encode_prompt( + self, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + enable_edit_guidance: bool = True, + editing_prompt: Optional[str] = None, + editing_prompt_embeds: Optional[torch.FloatTensor] = None, + editing_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ) -> object: + r""" + Encodes the prompt into text encoder hidden states. + + Args: + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + enable_edit_guidance (`bool`): + Whether to guide towards an editing prompt or not. + editing_prompt (`str` or `List[str]`, *optional*): + Editing prompt(s) to be encoded. If not defined and 'enable_edit_guidance' is True, one has to pass + `editing_prompt_embeds` instead. + editing_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided and 'enable_edit_guidance' is True, editing_prompt_embeds will be generated from `editing_prompt` input + argument. + editing_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated edit pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled editing_pooled_prompt_embeds will be generated from `editing_prompt` + input argument. + """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + batch_size = self.batch_size + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + num_edit_tokens = 0 + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + + if negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + + if batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but image inversion " + f" has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of the input images." 
+ ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(negative_prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(negative_pooled_prompt_embeds) + + if enable_edit_guidance and editing_prompt_embeds is None: + editing_prompt_2 = editing_prompt + + editing_prompts = [editing_prompt, editing_prompt_2] + edit_prompt_embeds_list = [] + + for editing_prompt, tokenizer, text_encoder in zip(editing_prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + editing_prompt = self.maybe_convert_prompt(editing_prompt, tokenizer) + + max_length = negative_prompt_embeds.shape[1] + edit_concepts_input = tokenizer( + # [x for item in editing_prompt for x in repeat(item, batch_size)], + editing_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + return_length=True, + ) + num_edit_tokens = edit_concepts_input.length - 2 + + edit_concepts_embeds = text_encoder( + edit_concepts_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + editing_pooled_prompt_embeds = edit_concepts_embeds[0] + if clip_skip is None: + edit_concepts_embeds = edit_concepts_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
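+                    # (Illustrative note, not from the original code: with this offset, clip_skip=1 selects hidden_states[-3], clip_skip=2 selects hidden_states[-4], and so on.)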
+ edit_concepts_embeds = edit_concepts_embeds.hidden_states[-(clip_skip + 2)] + + edit_prompt_embeds_list.append(edit_concepts_embeds) + + edit_concepts_embeds = torch.concat(edit_prompt_embeds_list, dim=-1) + elif not enable_edit_guidance: + edit_concepts_embeds = None + editing_pooled_prompt_embeds = None + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + bs_embed, seq_len, _ = negative_prompt_embeds.shape + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if enable_edit_guidance: + bs_embed_edit, seq_len, _ = edit_concepts_embeds.shape + edit_concepts_embeds = edit_concepts_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + edit_concepts_embeds = edit_concepts_embeds.repeat(1, num_images_per_prompt, 1) + edit_concepts_embeds = edit_concepts_embeds.view(bs_embed_edit * num_images_per_prompt, seq_len, -1) + + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if enable_edit_guidance: + editing_pooled_prompt_embeds = editing_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed_edit * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return ( + negative_prompt_embeds, + edit_concepts_embeds, + negative_pooled_prompt_embeds, + editing_pooled_prompt_embeds, + num_edit_tokens, + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, eta, generator=None): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + negative_prompt=None, + negative_prompt_2=None, + negative_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ): + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, device, latents): + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = 
torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def num_timesteps(self): + return self._num_timesteps + + # Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEditsPPPipelineStableDiffusion.prepare_unet + def prepare_unet(self, attention_store, PnP: bool = False): + attn_procs = {} + for name in self.unet.attn_processors.keys(): + if name.startswith("mid_block"): + place_in_unet = "mid" + elif name.startswith("up_blocks"): + place_in_unet = "up" + elif name.startswith("down_blocks"): + place_in_unet = "down" + else: + continue + + if "attn2" in name and place_in_unet != "mid": + attn_procs[name] = LEDITSCrossAttnProcessor( + attention_store=attention_store, + place_in_unet=place_in_unet, + pnp=PnP, + editing_prompts=self.enabled_editing_prompts, + ) + else: + attn_procs[name] = AttnProcessor() + + self.unet.set_attn_processor(attn_procs) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + denoising_end: Optional[float] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + editing_prompt: Optional[Union[str, List[str]]] = None, + editing_prompt_embeddings: Optional[torch.Tensor] = None, + editing_pooled_prompt_embeds: Optional[torch.Tensor] = None, + reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, + edit_guidance_scale: Optional[Union[float, List[float]]] = 5, + edit_warmup_steps: Optional[Union[int, List[int]]] = 0, + edit_cooldown_steps: Optional[Union[int, List[int]]] = None, + edit_threshold: Optional[Union[float, List[float]]] = 0.9, + sem_guidance: Optional[List[torch.Tensor]] = None, + use_cross_attn_mask: bool = False, + use_intersect_mask: bool = False, + user_mask: Optional[torch.FloatTensor] = None, + attn_store_steps: Optional[List[int]] = [], + store_averaged_over_steps: bool = True, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for editing. The [`~pipelines.ledits_pp.LEditsPPPipelineStableDiffusionXL.invert`] + method has to be called beforehand. 
Edits will always be performed for the last inverted image(s). + + Args: + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_scale` is defined as `φ` in equation 16 of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards.
Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + editing_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. The image is reconstructed by setting + `editing_prompt = None`. Guidance direction of prompt should be specified via `reverse_editing_direction`. + editing_prompt_embeddings (`torch.Tensor`, *optional*): + Pre-generated edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, editing_prompt_embeddings will be generated from `editing_prompt` input + argument. + editing_pooled_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated pooled edit text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, editing_pooled_prompt_embeds will be generated from `editing_prompt` input + argument. + reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): + Whether the corresponding prompt in `editing_prompt` should be increased or decreased. + edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): + Guidance scale for guiding the image generation. If provided as a list, values should correspond to `editing_prompt`. + `edit_guidance_scale` is defined as `s_e` of equation 12 of + [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + edit_warmup_steps (`int` or `List[int]`, *optional*, defaults to 0): + Number of diffusion steps (for each prompt) for which guidance is not applied. + edit_cooldown_steps (`int` or `List[int]`, *optional*, defaults to `None`): + Number of diffusion steps (for each prompt) after which guidance is no longer applied. + edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): + Masking threshold of guidance. Threshold should be proportional to the image region that is modified. + 'edit_threshold' is defined as 'λ' of equation 12 of [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + sem_guidance (`List[torch.Tensor]`, *optional*): + List of pre-generated guidance vectors to be applied at generation. Length of the list has to + correspond to `num_inference_steps`. + use_cross_attn_mask: + Whether cross-attention masks are used. Cross-attention masks are always used when `use_intersect_mask` + is set to `True`. Cross-attention masks are defined as 'M^1' of equation 12 of + [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + use_intersect_mask: + Whether the masking term is calculated as intersection of cross-attention masks and masks derived + from the noise estimate. Cross-attention masks are defined as 'M^1' and masks derived from the noise + estimate are defined as 'M^2' of equation 12 of [LEDITS++ paper](https://arxiv.org/pdf/2311.16711.pdf). + user_mask: + User-provided mask for even better control over the editing process. This is helpful when LEDITS++'s implicit + masks do not meet user preferences.
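+                As an illustrative assumption (the original docstring does not specify a shape), the mask is expected to be broadcastable against the latent-resolution noise estimate, e.g. a binary `torch.Tensor` of shape `(1, 1, height // 8, width // 8)` for an inverted image of size `height` x `width`, with `1.0` marking the region that may be edited.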
+ attn_store_steps: + Steps for which the attention maps are stored in the AttentionStore. Just for visualization purposes. + store_averaged_over_steps: + Whether the attention maps for the 'attn_store_steps' are stored averaged over the diffusion steps. + If False, attention maps for each step are stores separately. Just for visualization purposes. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.ledits_pp.LEditsPPDiffusionPipelineOutput`] if `return_dict` is True, + otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. + """ + if self.inversion_steps is None: + raise ValueError( + "You need to invert an input image first before calling the pipeline. The `invert` method has to be called beforehand. Edits will always be performed for the last inverted image(s)." + ) + + eta = self.eta + num_images_per_prompt = 1 + latents = self.init_latents + + zs = self.zs + self.scheduler.set_timesteps(len(self.scheduler.timesteps)) + + if use_intersect_mask: + use_cross_attn_mask = True + + if use_cross_attn_mask: + self.smoothing = LeditsGaussianSmoothing(self.device) + + if user_mask is not None: + user_mask = user_mask.to(self.device) + + # TODO: Check inputs + # 1. Check inputs. Raise error if not correct + # self.check_inputs( + # callback_steps, + # negative_prompt, + # negative_prompt_2, + # prompt_embeds, + # negative_prompt_embeds, + # pooled_prompt_embeds, + # negative_pooled_prompt_embeds, + # ) + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + + # 2. Define call parameters + batch_size = self.batch_size + + device = self._execution_device + + if editing_prompt: + enable_edit_guidance = True + if isinstance(editing_prompt, str): + editing_prompt = [editing_prompt] + self.enabled_editing_prompts = len(editing_prompt) + elif editing_prompt_embeddings is not None: + enable_edit_guidance = True + self.enabled_editing_prompts = editing_prompt_embeddings.shape[0] + else: + self.enabled_editing_prompts = 0 + enable_edit_guidance = False + + # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + edit_prompt_embeds, + negative_pooled_prompt_embeds, + pooled_edit_embeds, + num_edit_tokens, + ) = self.encode_prompt( + device=device, + num_images_per_prompt=num_images_per_prompt, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + negative_prompt_embeds=negative_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + enable_edit_guidance=enable_edit_guidance, + editing_prompt=editing_prompt, + editing_prompt_embeds=editing_prompt_embeddings, + editing_pooled_prompt_embeds=editing_pooled_prompt_embeds, + ) + + # 4. Prepare timesteps + # self.scheduler.set_timesteps(num_inference_steps, device=device) + + timesteps = self.inversion_steps + t_to_idx = {int(v): k for k, v in enumerate(timesteps)} + + if use_cross_attn_mask: + self.attention_store = LeditsAttentionStore( + average=store_averaged_over_steps, + batch_size=batch_size, + max_size=(latents.shape[-2] / 4.0) * (latents.shape[-1] / 4.0), + max_resolution=None, + ) + self.prepare_unet(self.attention_store) + resolution = latents.shape[-2:] + att_res = (int(resolution[0] / 4), int(resolution[1] / 4)) + + # 5. Prepare latent variables + latents = self.prepare_latents(device=device, latents=latents) + + # 6. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(eta) + + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(negative_pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + # 7. Prepare added time ids & embeddings + add_text_embeds = negative_pooled_prompt_embeds + add_time_ids = self._get_add_time_ids( + self.size, + crops_coords_top_left, + self.size, + dtype=negative_pooled_prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + + if enable_edit_guidance: + prompt_embeds = torch.cat([prompt_embeds, edit_prompt_embeds], dim=0) + add_text_embeds = torch.cat([add_text_embeds, pooled_edit_embeds], dim=0) + edit_concepts_time_ids = add_time_ids.repeat(edit_prompt_embeds.shape[0], 1) + add_time_ids = torch.cat([add_time_ids, edit_concepts_time_ids], dim=0) + self.text_cross_attention_maps = [editing_prompt] if isinstance(editing_prompt, str) else editing_prompt + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + if ip_adapter_image is not None: + # TODO: fix image encoding + image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + image_embeds = image_embeds.to(device) + + # 8. Denoising loop + self.sem_guidance = None + self.activation_mask = None + + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9. 
Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=self._num_timesteps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * (1 + self.enabled_editing_prompts)) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + noise_pred_out = noise_pred.chunk(1 + self.enabled_editing_prompts) # [b,4, 64, 64] + noise_pred_uncond = noise_pred_out[0] + noise_pred_edit_concepts = noise_pred_out[1:] + + noise_guidance_edit = torch.zeros( + noise_pred_uncond.shape, + device=self.device, + dtype=noise_pred_uncond.dtype, + ) + + if sem_guidance is not None and len(sem_guidance) > i: + noise_guidance_edit += sem_guidance[i].to(self.device) + + elif enable_edit_guidance: + if self.activation_mask is None: + self.activation_mask = torch.zeros( + (len(timesteps), self.enabled_editing_prompts, *noise_pred_edit_concepts[0].shape) + ) + if self.sem_guidance is None: + self.sem_guidance = torch.zeros((len(timesteps), *noise_pred_uncond.shape)) + + # noise_guidance_edit = torch.zeros_like(noise_guidance) + for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts): + if isinstance(edit_warmup_steps, list): + edit_warmup_steps_c = edit_warmup_steps[c] + else: + edit_warmup_steps_c = edit_warmup_steps + if i < edit_warmup_steps_c: + continue + + if isinstance(edit_guidance_scale, list): + edit_guidance_scale_c = edit_guidance_scale[c] + else: + edit_guidance_scale_c = edit_guidance_scale + + if isinstance(edit_threshold, list): + edit_threshold_c = edit_threshold[c] + else: + edit_threshold_c = edit_threshold + if isinstance(reverse_editing_direction, list): + reverse_editing_direction_c = reverse_editing_direction[c] + else: + reverse_editing_direction_c = reverse_editing_direction + + if isinstance(edit_cooldown_steps, list): + edit_cooldown_steps_c = edit_cooldown_steps[c] + elif edit_cooldown_steps is None: + edit_cooldown_steps_c = i + 1 + else: + edit_cooldown_steps_c = edit_cooldown_steps + + if i >= edit_cooldown_steps_c: + continue + + noise_guidance_edit_tmp = noise_pred_edit_concept - noise_pred_uncond + + if reverse_editing_direction_c: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1 + + noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c + + if user_mask is not None: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * user_mask + + if use_cross_attn_mask: + out = self.attention_store.aggregate_attention( + attention_maps=self.attention_store.step_store, + prompts=self.text_cross_attention_maps, + res=att_res, + from_where=["up", "down"], + is_cross=True, + select=self.text_cross_attention_maps.index(editing_prompt[c]), + 
) + attn_map = out[:, :, :, 1 : 1 + num_edit_tokens[c]] # 0 -> startoftext + + # average over all tokens + if attn_map.shape[3] != num_edit_tokens[c]: + raise ValueError( + f"Incorrect shape of attention_map. Expected size {num_edit_tokens[c]}, but found {attn_map.shape[3]}!" + ) + attn_map = torch.sum(attn_map, dim=3) + + # gaussian_smoothing + attn_map = F.pad(attn_map.unsqueeze(1), (1, 1, 1, 1), mode="reflect") + attn_map = self.smoothing(attn_map).squeeze(1) + + # torch.quantile function expects float32 + if attn_map.dtype == torch.float32: + tmp = torch.quantile(attn_map.flatten(start_dim=1), edit_threshold_c, dim=1) + else: + tmp = torch.quantile( + attn_map.flatten(start_dim=1).to(torch.float32), edit_threshold_c, dim=1 + ).to(attn_map.dtype) + attn_mask = torch.where( + attn_map >= tmp.unsqueeze(1).unsqueeze(1).repeat(1, *att_res), 1.0, 0.0 + ) + + # resolution must match latent space dimension + attn_mask = F.interpolate( + attn_mask.unsqueeze(1), + noise_guidance_edit_tmp.shape[-2:], # 64,64 + ).repeat(1, 4, 1, 1) + self.activation_mask[i, c] = attn_mask.detach().cpu() + if not use_intersect_mask: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * attn_mask + + if use_intersect_mask: + noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp) + noise_guidance_edit_tmp_quantile = torch.sum( + noise_guidance_edit_tmp_quantile, dim=1, keepdim=True + ) + noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat( + 1, self.unet.config.in_channels, 1, 1 + ) + + # torch.quantile function expects float32 + if noise_guidance_edit_tmp_quantile.dtype == torch.float32: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2), + edit_threshold_c, + dim=2, + keepdim=False, + ) + else: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32), + edit_threshold_c, + dim=2, + keepdim=False, + ).to(noise_guidance_edit_tmp_quantile.dtype) + + intersect_mask = ( + torch.where( + noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None], + torch.ones_like(noise_guidance_edit_tmp), + torch.zeros_like(noise_guidance_edit_tmp), + ) + * attn_mask + ) + + self.activation_mask[i, c] = intersect_mask.detach().cpu() + + noise_guidance_edit_tmp = noise_guidance_edit_tmp * intersect_mask + + elif not use_cross_attn_mask: + # calculate quantile + noise_guidance_edit_tmp_quantile = torch.abs(noise_guidance_edit_tmp) + noise_guidance_edit_tmp_quantile = torch.sum( + noise_guidance_edit_tmp_quantile, dim=1, keepdim=True + ) + noise_guidance_edit_tmp_quantile = noise_guidance_edit_tmp_quantile.repeat(1, 4, 1, 1) + + # torch.quantile function expects float32 + if noise_guidance_edit_tmp_quantile.dtype == torch.float32: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2), + edit_threshold_c, + dim=2, + keepdim=False, + ) + else: + tmp = torch.quantile( + noise_guidance_edit_tmp_quantile.flatten(start_dim=2).to(torch.float32), + edit_threshold_c, + dim=2, + keepdim=False, + ).to(noise_guidance_edit_tmp_quantile.dtype) + + self.activation_mask[i, c] = ( + torch.where( + noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None], + torch.ones_like(noise_guidance_edit_tmp), + torch.zeros_like(noise_guidance_edit_tmp), + ) + .detach() + .cpu() + ) + + noise_guidance_edit_tmp = torch.where( + noise_guidance_edit_tmp_quantile >= tmp[:, :, None, None], + noise_guidance_edit_tmp, + torch.zeros_like(noise_guidance_edit_tmp), + ) + + noise_guidance_edit += noise_guidance_edit_tmp + + self.sem_guidance[i] = 
noise_guidance_edit.detach().cpu() + + noise_pred = noise_pred_uncond + noise_guidance_edit + + # compute the previous noisy sample x_t -> x_t-1 + if enable_edit_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg( + noise_pred, + noise_pred_edit_concepts.mean(dim=0, keepdim=False), + guidance_rescale=self.guidance_rescale, + ) + + idx = t_to_idx[int(t)] + latents = self.scheduler.step( + noise_pred, t, latents, variance_noise=zs[idx], **extra_step_kwargs, return_dict=False + )[0] + + # step callback + if use_cross_attn_mask: + store_step = i in attn_store_steps + self.attention_store.between_steps(store_step) + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + # negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > 0 and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if XLA_AVAILABLE: + xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return LEditsPPDiffusionPipelineOutput(images=image, nsfw_content_detected=None) + + @torch.no_grad() + # Modified from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.LEditsPPPipelineStableDiffusion.encode_image + def encode_image(self, image, dtype=None, height=None, width=None, resize_mode="default", crops_coords=None): + image = self.image_processor.preprocess( + image=image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + resized = self.image_processor.postprocess(image=image, output_type="pil") + + if max(image.shape[-2:]) > self.vae.config["sample_size"] * 1.5: + logger.warning( + "Your input images far exceed the default resolution of the underlying diffusion model. " + "The output images may contain severe artifacts! 
" + "Consider down-sampling the input using the `height` and `width` parameters" + ) + image = image.to(self.device, dtype=dtype) + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + image = image.float() + self.upcast_vae() + image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + x0 = self.vae.encode(image).latent_dist.mode() + x0 = x0.to(dtype) + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + x0 = self.vae.config.scaling_factor * x0 + return x0, resized + + @torch.no_grad() + def invert( + self, + image: PipelineImageInput, + source_prompt: str = "", + source_guidance_scale=3.5, + negative_prompt: str = None, + negative_prompt_2: str = None, + num_inversion_steps: int = 50, + skip: float = 0.15, + generator: Optional[torch.Generator] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + num_zero_noise_steps: int = 3, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + The function to the pipeline for image inversion as described by the [LEDITS++ Paper](https://arxiv.org/abs/2301.12247). + If the scheduler is set to [`~schedulers.DDIMScheduler`] the inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) + will be performed instead. + + Args: + image (`PipelineImageInput`): + Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect + ratio. + source_prompt (`str`, defaults to `""`): + Prompt describing the input image that will be used for guidance during inversion. Guidance is disabled + if the `source_prompt` is `""`. + source_guidance_scale (`float`, defaults to `3.5`): + Strength of guidance during inversion. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_inversion_steps (`int`, defaults to `50`): + Number of total performed inversion steps after discarding the initial `skip` steps. + skip (`float`, defaults to `0.15`): + Portion of initial steps that will be ignored for inversion and subsequent generation. Lower values + will lead to stronger changes to the input image. `skip` has to be between `0` and `1`. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + inversion deterministic. + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + num_zero_noise_steps (`int`, defaults to `3`): + Number of final diffusion steps that will not renoise the current image. If no steps are set to zero + SD-XL in combination with [`DPMSolverMultistepScheduler`] will produce noise artifacts. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Returns: + [`~pipelines.ledits_pp.LEditsPPInversionPipelineOutput`]: + Output will contain the resized input image(s) and respective VAE reconstruction(s). + """ + + # Reset attn processor, we do not want to store attn maps during inversion + self.unet.set_attn_processor(AttnProcessor()) + + self.eta = 1.0 + + self.scheduler.config.timestep_spacing = "leading" + self.scheduler.set_timesteps(int(num_inversion_steps * (1 + skip))) + self.inversion_steps = self.scheduler.timesteps[-num_inversion_steps:] + timesteps = self.inversion_steps + + num_images_per_prompt = 1 + + device = self._execution_device + + # 0. Ensure that only uncond embedding is used if prompt = "" + if source_prompt == "": + # noise pred should only be noise_pred_uncond + source_guidance_scale = 0.0 + do_classifier_free_guidance = False + else: + do_classifier_free_guidance = source_guidance_scale > 1.0 + + # 1. prepare image + x0, resized = self.encode_image(image, dtype=self.text_encoder_2.dtype) + width = x0.shape[2] * self.vae_scale_factor + height = x0.shape[3] * self.vae_scale_factor + self.size = (height, width) + + self.batch_size = x0.shape[0] + + # 2. get embeddings + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + + if isinstance(source_prompt, str): + source_prompt = [source_prompt] * self.batch_size + + ( + negative_prompt_embeds, + prompt_embeds, + negative_pooled_prompt_embeds, + edit_pooled_prompt_embeds, + _, + ) = self.encode_prompt( + device=device, + num_images_per_prompt=num_images_per_prompt, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + editing_prompt=source_prompt, + lora_scale=text_encoder_lora_scale, + enable_edit_guidance=do_classifier_free_guidance, + ) + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(negative_pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + # 3. 
Prepare added time ids & embeddings + add_text_embeds = negative_pooled_prompt_embeds + add_time_ids = self._get_add_time_ids( + self.size, + crops_coords_top_left, + self.size, + dtype=negative_prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + + if do_classifier_free_guidance: + negative_prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([add_text_embeds, edit_pooled_prompt_embeds], dim=0) + add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0) + + negative_prompt_embeds = negative_prompt_embeds.to(device) + + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(self.batch_size * num_images_per_prompt, 1) + + # autoencoder reconstruction + if self.vae.dtype == torch.float16 and self.vae.config.force_upcast: + self.upcast_vae() + x0_tmp = x0.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + image_rec = self.vae.decode( + x0_tmp / self.vae.config.scaling_factor, return_dict=False, generator=generator + )[0] + elif self.vae.config.force_upcast: + x0_tmp = x0.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + image_rec = self.vae.decode( + x0_tmp / self.vae.config.scaling_factor, return_dict=False, generator=generator + )[0] + else: + image_rec = self.vae.decode(x0 / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0] + + image_rec = self.image_processor.postprocess(image_rec, output_type="pil") + + # 5. find zs and xts + variance_noise_shape = (num_inversion_steps, *x0.shape) + + # intermediate latents + t_to_idx = {int(v): k for k, v in enumerate(timesteps)} + xts = torch.zeros(size=variance_noise_shape, device=self.device, dtype=negative_prompt_embeds.dtype) + + for t in reversed(timesteps): + idx = num_inversion_steps - t_to_idx[int(t)] - 1 + noise = randn_tensor(shape=x0.shape, generator=generator, device=self.device, dtype=x0.dtype) + xts[idx] = self.scheduler.add_noise(x0, noise, t.unsqueeze(0)) + xts = torch.cat([x0.unsqueeze(0), xts], dim=0) + + # noise maps + zs = torch.zeros(size=variance_noise_shape, device=self.device, dtype=negative_prompt_embeds.dtype) + + self.scheduler.set_timesteps(len(self.scheduler.timesteps)) + + for t in self.progress_bar(timesteps): + idx = num_inversion_steps - t_to_idx[int(t)] - 1 + # 1. predict noise residual + xt = xts[idx + 1] + + latent_model_input = torch.cat([xt] * 2) if do_classifier_free_guidance else xt + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=negative_prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # 2. 
perform guidance + if do_classifier_free_guidance: + noise_pred_out = noise_pred.chunk(2) + noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1] + noise_pred = noise_pred_uncond + source_guidance_scale * (noise_pred_text - noise_pred_uncond) + + xtm1 = xts[idx] + z, xtm1_corrected = compute_noise(self.scheduler, xtm1, xt, t, noise_pred, self.eta) + zs[idx] = z + + # correction to avoid error accumulation + xts[idx] = xtm1_corrected + + self.init_latents = xts[-1] + zs = zs.flip(0) + + if num_zero_noise_steps > 0: + zs[-num_zero_noise_steps:] = torch.zeros_like(zs[-num_zero_noise_steps:]) + self.zs = zs + return LEditsPPInversionPipelineOutput(images=resized, vae_reconstruction_images=image_rec) + + +# Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.compute_noise_ddim +def compute_noise_ddim(scheduler, prev_latents, latents, timestep, noise_pred, eta): + # 1. get previous step value (=t-1) + prev_timestep = timestep - scheduler.config.num_train_timesteps // scheduler.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = scheduler.alphas_cumprod[timestep] + alpha_prod_t_prev = ( + scheduler.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else scheduler.final_alpha_cumprod + ) + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + + # 4. Clip "predicted x_0" + if scheduler.config.clip_sample: + pred_original_sample = torch.clamp(pred_original_sample, -1, 1) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = scheduler._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # 6. 
compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * noise_pred + + # modifed so that updated xtm1 is returned as well (to avoid error accumulation) + mu_xt = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + if variance > 0.0: + noise = (prev_latents - mu_xt) / (variance ** (0.5) * eta) + else: + noise = torch.tensor([0.0]).to(latents.device) + + return noise, mu_xt + (eta * variance**0.5) * noise + + +# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.compute_noise_sde_dpm_pp_2nd +def compute_noise_sde_dpm_pp_2nd(scheduler, prev_latents, latents, timestep, noise_pred, eta): + def first_order_update(model_output, sample): # timestep, prev_timestep, sample): + sigma_t, sigma_s = scheduler.sigmas[scheduler.step_index + 1], scheduler.sigmas[scheduler.step_index] + alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = scheduler._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + + mu_xt = (sigma_t / sigma_s * torch.exp(-h)) * sample + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + mu_xt = scheduler.dpm_solver_first_order_update( + model_output=model_output, sample=sample, noise=torch.zeros_like(sample) + ) + + sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) + if sigma > 0.0: + noise = (prev_latents - mu_xt) / sigma + else: + noise = torch.tensor([0.0]).to(sample.device) + + prev_sample = mu_xt + sigma * noise + return noise, prev_sample + + def second_order_update(model_output_list, sample): # timestep_list, prev_timestep, sample): + sigma_t, sigma_s0, sigma_s1 = ( + scheduler.sigmas[scheduler.step_index + 1], + scheduler.sigmas[scheduler.step_index], + scheduler.sigmas[scheduler.step_index - 1], + ) + + alpha_t, sigma_t = scheduler._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = scheduler._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = scheduler._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + + mu_xt = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + ) + + sigma = sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) + if sigma > 0.0: + noise = (prev_latents - mu_xt) / sigma + else: + noise = torch.tensor([0.0]).to(sample.device) + + prev_sample = mu_xt + sigma * noise + + return noise, prev_sample + + if scheduler.step_index is None: + scheduler._init_step_index(timestep) + + model_output = scheduler.convert_model_output(model_output=noise_pred, sample=latents) + for i in range(scheduler.config.solver_order - 1): + scheduler.model_outputs[i] = scheduler.model_outputs[i + 1] + scheduler.model_outputs[-1] = model_output + + if scheduler.lower_order_nums < 1: + noise, prev_sample = first_order_update(model_output, latents) + else: + noise, prev_sample = second_order_update(scheduler.model_outputs, latents) + + if scheduler.lower_order_nums < scheduler.config.solver_order: + scheduler.lower_order_nums += 1 + + # upon completion increase step 
index by one + scheduler._step_index += 1 + + return noise, prev_sample + + +# Copied from diffusers.pipelines.ledits_pp.pipeline_leditspp_stable_diffusion.compute_noise +def compute_noise(scheduler, *args): + if isinstance(scheduler, DDIMScheduler): + return compute_noise_ddim(scheduler, *args) + elif ( + isinstance(scheduler, DPMSolverMultistepScheduler) + and scheduler.config.algorithm_type == "sde-dpmsolver++" + and scheduler.config.solver_order == 2 + ): + return compute_noise_sde_dpm_pp_2nd(scheduler, *args) + else: + raise NotImplementedError diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py new file mode 100644 index 000000000..b90005c97 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py @@ -0,0 +1,43 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput + + +@dataclass +class LEditsPPDiffusionPipelineOutput(BaseOutput): + """ + Output class for LEdits++ Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or + `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +@dataclass +class LEditsPPInversionPipelineOutput(BaseOutput): + """ + Output class for LEdits++ Diffusion pipelines. + + Args: + input_images (`List[PIL.Image.Image]` or `np.ndarray`) + List of the cropped and resized input images as PIL images of length `batch_size` or NumPy array of shape ` + (batch_size, height, width, num_channels)`. + vae_reconstruction_images (`List[PIL.Image.Image]` or `np.ndarray`) + List of VAE reconstruction of all input images as PIL images of length `batch_size` or NumPy array of shape ` + (batch_size, height, width, num_channels)`. 
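# A small sketch of a fuller inversion call and of consuming the two output classes defined
# here (continuing the LEDITS++ SDXL example above); the source prompt is an illustrative
# assumption and the attribute names follow the dataclass fields below.
inv_out = pipe.invert(                         # LEditsPPInversionPipelineOutput
    image=image,
    source_prompt="a photo of a tennis ball",  # "" disables guidance during inversion
    source_guidance_scale=3.5,
    num_inversion_steps=50,
    skip=0.15,                                 # larger values keep more of the input image
)
resized_inputs = inv_out.images                # the cropped/resized inputs are exposed as `.images`
reconstructions = inv_out.vae_reconstruction_images

edit_out = pipe(editing_prompt=["tomato"])     # LEditsPPDiffusionPipelineOutput
pil_images = edit_out.images
print(edit_out.nsfw_content_detected)          # None: this SDXL variant performs no safety checking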
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] + vae_reconstruction_images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/__init__.py new file mode 100644 index 000000000..ed71eeb1d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/__init__.py @@ -0,0 +1,49 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_musicldm import MusicLDMPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py new file mode 100644 index 000000000..5fde3450b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py @@ -0,0 +1,635 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import ( + ClapFeatureExtractor, + ClapModel, + ClapTextModelWithProjection, + RobertaTokenizer, + RobertaTokenizerFast, + SpeechT5HifiGan, +) + +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + is_librosa_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline, StableDiffusionMixin + + +if is_librosa_available(): + import librosa + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import MusicLDMPipeline + >>> import torch + >>> import scipy + + >>> repo_id = "ucsd-reach/musicldm" + >>> pipe = MusicLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs" + >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0] + + >>> # save the audio sample as a .wav file + >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio) + ``` +""" + + +class MusicLDMPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for text-to-audio generation using MusicLDM. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.ClapModel`]): + Frozen text-audio embedding model (`ClapTextModel`), specifically the + [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant. + tokenizer ([`PreTrainedTokenizer`]): + A [`~transformers.RobertaTokenizer`] to tokenize text. + feature_extractor ([`~transformers.ClapFeatureExtractor`]): + Feature extractor to compute mel-spectrograms from audio waveforms. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded audio latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + vocoder ([`~transformers.SpeechT5HifiGan`]): + Vocoder of class `SpeechT5HifiGan`. 
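# A small sketch of swapping the scheduler component listed above; DDIMScheduler is one of
# the documented options, the checkpoint id comes from the example docstring, and
# `from_config` reuses the configuration of the scheduler shipped with the checkpoint.
from diffusers import DDIMScheduler, MusicLDMPipeline

pipe = MusicLDMPipeline.from_pretrained("ucsd-reach/musicldm")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)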
+ """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: Union[ClapTextModelWithProjection, ClapModel], + tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], + feature_extractor: Optional[ClapFeatureExtractor], + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + vocoder: SpeechT5HifiGan, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + feature_extractor=feature_extractor, + unet=unet, + scheduler=scheduler, + vocoder=vocoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def _encode_prompt( + self, + prompt, + device, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device (`torch.device`): + torch device + num_waveforms_per_prompt (`int`): + number of waveforms that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the audio generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLAP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder.get_text_features( + text_input_ids.to(device), + attention_mask=attention_mask.to(device), + ) + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.text_model.dtype, device=device) + + ( + bs_embed, + seq_len, + ) = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_waveforms_per_prompt) + prompt_embeds = prompt_embeds.view(bs_embed * num_waveforms_per_prompt, seq_len) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + uncond_input_ids = uncond_input.input_ids.to(device) + attention_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder.get_text_features( + uncond_input_ids, + attention_mask=attention_mask, + ) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.text_model.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_waveforms_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_waveforms_per_prompt, seq_len) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.mel_spectrogram_to_waveform + def mel_spectrogram_to_waveform(self, mel_spectrogram): + if mel_spectrogram.dim() == 4: + mel_spectrogram = mel_spectrogram.squeeze(1) + + waveform = self.vocoder(mel_spectrogram) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + waveform = waveform.cpu().float() + return waveform + + # Copied from diffusers.pipelines.audioldm2.pipeline_audioldm2.AudioLDM2Pipeline.score_waveforms + def score_waveforms(self, text, audio, num_waveforms_per_prompt, device, dtype): + if not is_librosa_available(): + logger.info( + "Automatic scoring of the generated audio waveforms against the input prompt text requires the " + "`librosa` package to resample the generated waveforms. Returning the audios in the order they were " + "generated. To enable automatic scoring, install `librosa` with: `pip install librosa`." + ) + return audio + inputs = self.tokenizer(text, return_tensors="pt", padding=True) + resampled_audio = librosa.resample( + audio.numpy(), orig_sr=self.vocoder.config.sampling_rate, target_sr=self.feature_extractor.sampling_rate + ) + inputs["input_features"] = self.feature_extractor( + list(resampled_audio), return_tensors="pt", sampling_rate=self.feature_extractor.sampling_rate + ).input_features.type(dtype) + inputs = inputs.to(device) + + # compute the audio-text similarity score using the CLAP model + logits_per_text = self.text_encoder(**inputs).logits_per_text + # sort by the highest matching generations per prompt + indices = torch.argsort(logits_per_text, dim=1, descending=True)[:, :num_waveforms_per_prompt] + audio = torch.index_select(audio, 0, indices.reshape(-1).cpu()) + return audio + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.check_inputs + def check_inputs( + self, + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + min_audio_length_in_s = vocoder_upsample_factor * self.vae_scale_factor + if audio_length_in_s < min_audio_length_in_s: + raise ValueError( + f"`audio_length_in_s` has to be a positive value greater than or equal to {min_audio_length_in_s}, but " + f"is {audio_length_in_s}." 
+ ) + + if self.vocoder.config.model_in_dim % self.vae_scale_factor != 0: + raise ValueError( + f"The number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the " + f"VAE scale factor, but got {self.vocoder.config.model_in_dim} bins and a scale factor of " + f"{self.vae_scale_factor}." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.audioldm.pipeline_audioldm.AudioLDMPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, dtype, device, generator, latents=None): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + self.vocoder.config.model_in_dim // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
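# A usage sketch of model offloading combined with the main __call__ arguments documented
# further below. The checkpoint id and prompt come from the example docstring above; the
# negative prompt and seed are illustrative assumptions, and a CUDA device plus
# `accelerate>=0.17.0` are assumed.
import torch
from scipy.io import wavfile
from diffusers import MusicLDMPipeline

pipe = MusicLDMPipeline.from_pretrained("ucsd-reach/musicldm", torch_dtype=torch.float16)
pipe.enable_model_cpu_offload()  # instead of pipe.to("cuda"); sub-models move to the GPU on demand

generator = torch.Generator(device="cuda").manual_seed(0)
audios = pipe(
    "Techno music with a strong, upbeat tempo and high melodic riffs",
    negative_prompt="low quality, average quality",
    num_inference_steps=200,
    audio_length_in_s=10.0,
    num_waveforms_per_prompt=3,  # CLAP scoring ranks the best-matching waveform first
    generator=generator,
).audios

wavfile.write("techno.wav", rate=16000, data=audios[0])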
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + model_sequence = [ + self.text_encoder.text_model, + self.text_encoder.text_projection, + self.unet, + self.vae, + self.vocoder, + self.text_encoder, + ] + + hook = None + for cpu_offloaded_model in model_sequence: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + audio_length_in_s: Optional[float] = None, + num_inference_steps: int = 200, + guidance_scale: float = 2.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_waveforms_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + output_type: Optional[str] = "np", + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`. + audio_length_in_s (`int`, *optional*, defaults to 10.24): + The length of the generated audio sample in seconds. + num_inference_steps (`int`, *optional*, defaults to 200): + The number of denoising steps. More denoising steps usually lead to a higher quality audio at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 2.0): + A higher guidance scale value encourages the model to generate audio that is closely linked to the text + `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in audio generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_waveforms_per_prompt (`int`, *optional*, defaults to 1): + The number of waveforms to generate per prompt. If `num_waveforms_per_prompt > 1`, the text encoding + model is a joint text-audio model ([`~transformers.ClapModel`]), and the tokenizer is a + `[~transformers.ClapProcessor]`, then automatic scoring will be performed between the generated outputs + and the input text. This scoring ranks the generated waveforms based on their cosine similarity to text + input in the joint text-audio embedding space. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or + `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion + model (LDM) output. + + Examples: + + Returns: + [`~pipelines.AudioPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated audio. + """ + # 0. Convert audio input length from seconds to spectrogram height + vocoder_upsample_factor = np.prod(self.vocoder.config.upsample_rates) / self.vocoder.config.sampling_rate + + if audio_length_in_s is None: + audio_length_in_s = self.unet.config.sample_size * self.vae_scale_factor * vocoder_upsample_factor + + height = int(audio_length_in_s / vocoder_upsample_factor) + + original_waveform_length = int(audio_length_in_s * self.vocoder.config.sampling_rate) + if height % self.vae_scale_factor != 0: + height = int(np.ceil(height / self.vae_scale_factor)) * self.vae_scale_factor + logger.info( + f"Audio length in seconds {audio_length_in_s} is increased to {height * vocoder_upsample_factor} " + f"so that it can be handled by the model. It will be cut to {audio_length_in_s} after the " + f"denoising process." + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + audio_length_in_s, + vocoder_upsample_factor, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_waveforms_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_waveforms_per_prompt, + num_channels_latents, + height, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=None, + class_labels=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + self.maybe_free_model_hooks() + + # 8. Post-processing + if not output_type == "latent": + latents = 1 / self.vae.config.scaling_factor * latents + mel_spectrogram = self.vae.decode(latents).sample + else: + return AudioPipelineOutput(audios=latents) + + audio = self.mel_spectrogram_to_waveform(mel_spectrogram) + + audio = audio[:, :original_waveform_length] + + # 9. 
Automatic scoring + if num_waveforms_per_prompt > 1 and prompt is not None: + audio = self.score_waveforms( + text=prompt, + audio=audio, + num_waveforms_per_prompt=num_waveforms_per_prompt, + device=device, + dtype=prompt_embeds.dtype, + ) + + if output_type == "np": + audio = audio.numpy() + + if not return_dict: + return (audio,) + + return AudioPipelineOutput(audios=audio) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/onnx_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/onnx_utils.py new file mode 100644 index 000000000..11f2241c6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/onnx_utils.py @@ -0,0 +1,215 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import shutil +from pathlib import Path +from typing import Optional, Union + +import numpy as np +from huggingface_hub import hf_hub_download +from huggingface_hub.utils import validate_hf_hub_args + +from ..utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, is_onnx_available, logging + + +if is_onnx_available(): + import onnxruntime as ort + + +logger = logging.get_logger(__name__) + +ORT_TO_NP_TYPE = { + "tensor(bool)": np.bool_, + "tensor(int8)": np.int8, + "tensor(uint8)": np.uint8, + "tensor(int16)": np.int16, + "tensor(uint16)": np.uint16, + "tensor(int32)": np.int32, + "tensor(uint32)": np.uint32, + "tensor(int64)": np.int64, + "tensor(uint64)": np.uint64, + "tensor(float16)": np.float16, + "tensor(float)": np.float32, + "tensor(double)": np.float64, +} + + +class OnnxRuntimeModel: + def __init__(self, model=None, **kwargs): + logger.info("`diffusers.OnnxRuntimeModel` is experimental and might change in the future.") + self.model = model + self.model_save_dir = kwargs.get("model_save_dir", None) + self.latest_model_name = kwargs.get("latest_model_name", ONNX_WEIGHTS_NAME) + + def __call__(self, **kwargs): + inputs = {k: np.array(v) for k, v in kwargs.items()} + return self.model.run(None, inputs) + + @staticmethod + def load_model(path: Union[str, Path], provider=None, sess_options=None): + """ + Loads an ONNX Inference session with an ExecutionProvider. 
Default provider is `CPUExecutionProvider` + + Arguments: + path (`str` or `Path`): + Directory from which to load + provider(`str`, *optional*): + Onnxruntime execution provider to use for loading the model, defaults to `CPUExecutionProvider` + """ + if provider is None: + logger.info("No onnxruntime provider specified, using CPUExecutionProvider") + provider = "CPUExecutionProvider" + + return ort.InferenceSession(path, providers=[provider], sess_options=sess_options) + + def _save_pretrained(self, save_directory: Union[str, Path], file_name: Optional[str] = None, **kwargs): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + [`~optimum.onnxruntime.modeling_ort.ORTModel.from_pretrained`] class method. It will always save the + latest_model_name. + + Arguments: + save_directory (`str` or `Path`): + Directory where to save the model file. + file_name(`str`, *optional*): + Overwrites the default model file name from `"model.onnx"` to `file_name`. This allows you to save the + model with a different name. + """ + model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME + + src_path = self.model_save_dir.joinpath(self.latest_model_name) + dst_path = Path(save_directory).joinpath(model_file_name) + try: + shutil.copyfile(src_path, dst_path) + except shutil.SameFileError: + pass + + # copy external weights (for models >2GB) + src_path = self.model_save_dir.joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) + if src_path.exists(): + dst_path = Path(save_directory).joinpath(ONNX_EXTERNAL_WEIGHTS_NAME) + try: + shutil.copyfile(src_path, dst_path) + except shutil.SameFileError: + pass + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + **kwargs, + ): + """ + Save a model to a directory, so that it can be re-loaded using the [`~OnnxModel.from_pretrained`] class + method.: + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + # saving model weights/files + self._save_pretrained(save_directory, **kwargs) + + @classmethod + @validate_hf_hub_args + def _from_pretrained( + cls, + model_id: Union[str, Path], + token: Optional[Union[bool, str, None]] = None, + revision: Optional[Union[str, None]] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + file_name: Optional[str] = None, + provider: Optional[str] = None, + sess_options: Optional["ort.SessionOptions"] = None, + **kwargs, + ): + """ + Load a model from a directory or the HF Hub. + + Arguments: + model_id (`str` or `Path`): + Directory from which to load + token (`str` or `bool`): + Is needed to load models from a private or gated repository + revision (`str`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id + cache_dir (`Union[str, Path]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + file_name(`str`): + Overwrites the default model file name from `"model.onnx"` to `file_name`. 
This allows you to load + different model files from the same repository or directory. + provider(`str`): + The ONNX runtime provider, e.g. `CPUExecutionProvider` or `CUDAExecutionProvider`. + kwargs (`Dict`, *optional*): + kwargs will be passed to the model during initialization + """ + model_file_name = file_name if file_name is not None else ONNX_WEIGHTS_NAME + # load model from local directory + if os.path.isdir(model_id): + model = OnnxRuntimeModel.load_model( + Path(model_id, model_file_name).as_posix(), provider=provider, sess_options=sess_options + ) + kwargs["model_save_dir"] = Path(model_id) + # load model from hub + else: + # download model + model_cache_path = hf_hub_download( + repo_id=model_id, + filename=model_file_name, + token=token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + ) + kwargs["model_save_dir"] = Path(model_cache_path).parent + kwargs["latest_model_name"] = Path(model_cache_path).name + model = OnnxRuntimeModel.load_model(model_cache_path, provider=provider, sess_options=sess_options) + return cls(model=model, **kwargs) + + @classmethod + @validate_hf_hub_args + def from_pretrained( + cls, + model_id: Union[str, Path], + force_download: bool = True, + token: Optional[str] = None, + cache_dir: Optional[str] = None, + **model_kwargs, + ): + revision = None + if len(str(model_id).split("@")) == 2: + model_id, revision = model_id.split("@") + + return cls._from_pretrained( + model_id=model_id, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + token=token, + **model_kwargs, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py new file mode 100644 index 000000000..aaa775f69 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py @@ -0,0 +1,55 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, List, Optional, Union + +import numpy as np +import PIL +from PIL import Image + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["image_encoder"] = ["PaintByExampleImageEncoder"] + _import_structure["pipeline_paint_by_example"] = ["PaintByExamplePipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .image_encoder import PaintByExampleImageEncoder + from .pipeline_paint_by_example import PaintByExamplePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py new file mode 100644 index 000000000..2fd0338b1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py @@ -0,0 +1,67 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch +from torch import nn +from transformers import CLIPPreTrainedModel, CLIPVisionModel + +from ...models.attention import BasicTransformerBlock +from ...utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class PaintByExampleImageEncoder(CLIPPreTrainedModel): + def __init__(self, config, proj_size=None): + super().__init__(config) + self.proj_size = proj_size or getattr(config, "projection_dim", 768) + + self.model = CLIPVisionModel(config) + self.mapper = PaintByExampleMapper(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size) + self.proj_out = nn.Linear(config.hidden_size, self.proj_size) + + # uncondition for scaling + self.uncond_vector = nn.Parameter(torch.randn((1, 1, self.proj_size))) + + def forward(self, pixel_values, return_uncond_vector=False): + clip_output = self.model(pixel_values=pixel_values) + latent_states = clip_output.pooler_output + latent_states = self.mapper(latent_states[:, None]) + latent_states = self.final_layer_norm(latent_states) + latent_states = self.proj_out(latent_states) + if return_uncond_vector: + return latent_states, self.uncond_vector + + return latent_states + + +class PaintByExampleMapper(nn.Module): + def __init__(self, config): + super().__init__() + num_layers = (config.num_hidden_layers + 1) // 5 + hid_size = config.hidden_size + num_heads = 1 + self.blocks = nn.ModuleList( + [ + BasicTransformerBlock(hid_size, num_heads, hid_size, activation_fn="gelu", attention_bias=True) + for _ in range(num_layers) + ] + ) + + def forward(self, hidden_states): + for block in self.blocks: + hidden_states = block(hidden_states) + + return hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py new file mode 100644 index 000000000..8a24f134e --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -0,0 +1,621 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor + +from ...image_processor import VaeImageProcessor +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from .image_encoder import PaintByExampleImageEncoder + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def prepare_mask_and_masked_image(image, mask): + """ + Prepares a pair (image, mask) to be consumed by the Paint by Example pipeline. This means that those inputs will be + converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the + ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (or the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``.
+ """ + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Batched mask + if mask.shape[0] == image.shape[0]: + mask = mask.unsqueeze(1) + else: + mask = mask.unsqueeze(0) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + assert mask.shape[1] == 1, "Mask image must have a single channel" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # paint-by-example inverses the mask + mask = 1 - mask + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + if isinstance(image, PIL.Image.Image): + image = [image] + + image = np.concatenate([np.array(i.convert("RGB"))[None, :] for i in image], axis=0) + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, PIL.Image.Image): + mask = [mask] + + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + + # paint-by-example inverses the mask + mask = 1 - mask + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + masked_image = image * mask + + return mask, masked_image + + +class PaintByExamplePipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + + + 🧪 This is an experimental feature! + + + + Pipeline for image-guided image inpainting using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + image_encoder ([`PaintByExampleImageEncoder`]): + Encodes the example input image. The `unet` is conditioned on the example image instead of a text prompt. + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. 
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + + """ + + # TODO: feature_extractor is required to encode initial images (if they are in PIL format), + # we should give a descriptive message if the pipeline doesn't have one. + + model_cpu_offload_seq = "unet->vae" + _exclude_from_cpu_offload = ["image_encoder"] + _optional_components = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + image_encoder: PaintByExampleImageEncoder, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = False, + ): + super().__init__() + + self.register_modules( + vae=vae, + image_encoder=image_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation.StableDiffusionImageVariationPipeline.check_inputs + def check_inputs(self, image, height, width, callback_steps): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline.prepare_mask_latents + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + masked_image = masked_image.to(device=device, dtype=dtype) + + if masked_image.shape[1] == 4: + masked_image_latents = masked_image + else: + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. 
Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." + ) + masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + return mask, masked_image_latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(images=image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeddings, negative_prompt_embeds = self.image_encoder(image, return_uncond_vector=True) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + negative_prompt_embeds = negative_prompt_embeds.repeat(1, image_embeddings.shape[0], 1) + negative_prompt_embeds = negative_prompt_embeds.view(bs_embed * num_images_per_prompt, 1, -1) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings]) + + return image_embeddings + + @torch.no_grad() + def __call__( + self, + example_image: Union[torch.FloatTensor, PIL.Image.Image], + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + example_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + An example image to guide image generation. + image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + `Image` or tensor representing an image batch to be inpainted (parts of the image are masked out with + `mask_image` and repainted according to `prompt`). + mask_image (`torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]`): + `Image` or tensor representing an image batch to mask `image`. White pixels in the mask are repainted, + while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel + (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the + expected shape would be `(B, H, W, 1)`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 5.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation.
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Example: + + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + >>> from diffusers import PaintByExamplePipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = ( + ... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/image/example_1.png" + ... ) + >>> mask_url = ( + ... "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/mask/example_1.png" + ... ) + >>> example_url = "https://raw.githubusercontent.com/Fantasy-Studio/Paint-by-Example/main/examples/reference/example_1.jpg" + + >>> init_image = download_image(img_url).resize((512, 512)) + >>> mask_image = download_image(mask_url).resize((512, 512)) + >>> example_image = download_image(example_url).resize((512, 512)) + + >>> pipe = PaintByExamplePipeline.from_pretrained( + ... "Fantasy-Studio/Paint-by-Example", + ... torch_dtype=torch.float16, + ... ) + >>> pipe = pipe.to("cuda") + + >>> image = pipe(image=init_image, mask_image=mask_image, example_image=example_image).images[0] + >>> image + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 1. Define call parameters + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, list): + batch_size = len(image) + else: + batch_size = image.shape[0] + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 2. Preprocess mask and image + mask, masked_image = prepare_mask_and_masked_image(image, mask_image) + height, width = masked_image.shape[-2:] + + # 3. Check inputs + self.check_inputs(example_image, height, width, callback_steps) + + # 4. Encode input image + image_embeddings = self._encode_image( + example_image, device, num_images_per_prompt, do_classifier_free_guidance + ) + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. 
Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + image_embeddings.dtype, + device, + generator, + latents, + ) + + # 7. Prepare mask latent variables + mask, masked_image_latents = self.prepare_mask_latents( + mask, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + image_embeddings.dtype, + device, + generator, + do_classifier_free_guidance, + ) + + # 8. Check that sizes of mask, masked image and latents match + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 10. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = torch.cat([latent_model_input, masked_image_latents, mask], dim=1) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + self.maybe_free_model_hooks() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) 
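# --- Editor's note: illustrative sketch only; this block is not part of the vendored diffusers sources or of the patch hunks above. ---
# The Paint-by-Example preprocessing above (prepare_mask_and_masked_image) inverts the mask, binarizes it at 0.5,
# and multiplies it into the [-1, 1] image so that only the region to keep survives; downstream, the 1-channel mask
# and the 4-channel masked-image latents are concatenated with the 4-channel noise latents so the channel count
# matches self.unet.config.in_channels (typically 4 + 1 + 4 = 9 for the reference Paint-by-Example checkpoint).
# The toy snippet below mirrors that mask/masked-image contract on random tensors; the shapes, sizes and seed are
# illustrative assumptions, not values taken from the patch.
import torch


def toy_mask_and_masked_image(image: torch.Tensor, mask: torch.Tensor):
    # image: (B, 3, H, W) in [-1, 1]; mask: (B, 1, H, W) in [0, 1], where 1 marks the region to repaint
    mask = 1 - mask                               # paint-by-example inverts the mask
    mask = (mask >= 0.5).to(dtype=torch.float32)  # binarize at 0.5, as in the pipeline code above
    return mask, image * mask                     # the masked image keeps only the preserved region


if __name__ == "__main__":
    torch.manual_seed(0)
    img = torch.rand(1, 3, 64, 64) * 2 - 1          # stand-in for a normalized input image
    msk = (torch.rand(1, 1, 64, 64) > 0.5).float()  # stand-in for a binary inpainting mask
    inv_mask, masked = toy_mask_and_masked_image(img, msk)
    assert inv_mask.shape == (1, 1, 64, 64) and masked.shape == (1, 3, 64, 64)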
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/__init__.py new file mode 100644 index 000000000..16e800496 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/__init__.py @@ -0,0 +1,46 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_pia"] = ["PIAPipeline", "PIAPipelineOutput"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .pipeline_pia import PIAPipeline, PIAPipelineOutput + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py new file mode 100644 index 000000000..507088991 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py @@ -0,0 +1,1034 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +import math +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +import torch.fft as fft +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel, UNetMotionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...models.unets.unet_motion_model import MotionAdapter +from ...schedulers import ( + DDIMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, +) +from ...utils import ( + USE_PEFT_BACKEND, + BaseOutput, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..free_init_utils import FreeInitMixin +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import ( + ... EulerDiscreteScheduler, + ... MotionAdapter, + ... PIAPipeline, + ... ) + >>> from diffusers.utils import export_to_gif, load_image + >>> adapter = MotionAdapter.from_pretrained("../checkpoints/pia-diffusers") + >>> pipe = PIAPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", motion_adapter=adapter) + >>> pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) + >>> image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/pix2pix/cat_6.png?download=true" + ... 
) + >>> image = image.resize((512, 512)) + >>> prompt = "cat in a hat" + >>> negative_prompt = "wrong white balance, dark, sketches,worst quality,low quality, deformed, distorted, disfigured, bad eyes, wrong lips,weird mouth, bad teeth, mutated hands and fingers, bad anatomy,wrong anatomy, amputation, extra limb, missing limb, floating,limbs, disconnected limbs, mutation, ugly, disgusting, bad_pictures, negative_hand-neg" + >>> generator = torch.Generator("cpu").manual_seed(0) + >>> output = pipe(image=image, prompt=prompt, negative_prompt=negative_prompt, generator=generator) + >>> frames = output.frames[0] + >>> export_to_gif(frames, "pia-animation.gif") + ``` +""" + +RANGE_LIST = [ + [1.0, 0.9, 0.85, 0.85, 0.85, 0.8], # 0 Small Motion + [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75], # Moderate Motion + [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5], # Large Motion + [1.0, 0.9, 0.85, 0.85, 0.85, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.85, 0.85, 0.9, 1.0], # Loop + [1.0, 0.8, 0.8, 0.8, 0.79, 0.78, 0.75, 0.75, 0.75, 0.75, 0.75, 0.78, 0.79, 0.8, 0.8, 1.0], # Loop + [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.6, 0.5, 0.5, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 1.0], # Loop + [0.5, 0.4, 0.4, 0.4, 0.35, 0.3], # Style Transfer Candidate Small Motion + [0.5, 0.4, 0.4, 0.4, 0.35, 0.35, 0.3, 0.25, 0.2], # Style Transfer Moderate Motion + [0.5, 0.2], # Style Transfer Large Motion +] + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. 
Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +def prepare_mask_coef_by_statistics(num_frames: int, cond_frame: int, motion_scale: int): + assert num_frames > 0, "video_length should be greater than 0" + + assert num_frames > cond_frame, "video_length should be greater than cond_frame" + + range_list = RANGE_LIST + + assert motion_scale < len(range_list), f"motion_scale type{motion_scale} not implemented" + + coef = range_list[motion_scale] + coef = coef + ([coef[-1]] * (num_frames - len(coef))) + + order = [abs(i - cond_frame) for i in range(num_frames)] + coef = [coef[order[i]] for i in range(num_frames)] + + return coef + + +def _get_freeinit_freq_filter( + shape: Tuple[int, ...], + device: Union[str, torch.dtype], + filter_type: str, + order: float, + spatial_stop_frequency: float, + temporal_stop_frequency: float, +) -> torch.Tensor: + r"""Returns the FreeInit filter based on filter type and other input conditions.""" + + time, height, width = shape[-3], shape[-2], shape[-1] + mask = torch.zeros(shape) + + if spatial_stop_frequency == 0 or temporal_stop_frequency == 0: + return mask + + if filter_type == "butterworth": + + def retrieve_mask(x): + return 1 / (1 + (x / spatial_stop_frequency**2) ** order) + elif filter_type == "gaussian": + + def retrieve_mask(x): + return math.exp(-1 / (2 * spatial_stop_frequency**2) * x) + elif filter_type == "ideal": + + def retrieve_mask(x): + return 1 if x <= spatial_stop_frequency * 2 else 0 + else: + raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal") + + for t in range(time): + for h in range(height): + for w in range(width): + d_square = ( + ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / time - 1)) ** 2 + + (2 * h / height - 1) ** 2 + + (2 * w / width - 1) ** 2 + ) + mask[..., t, h, w] = retrieve_mask(d_square) + + return mask.to(device) + + +def _freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, LPF: torch.Tensor) -> torch.Tensor: + r"""Noise reinitialization.""" + # FFT + x_freq = fft.fftn(x, dim=(-3, -2, -1)) + x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1)) + noise_freq = fft.fftn(noise, dim=(-3, -2, -1)) + noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1)) + + # frequency mix + HPF = 1 - LPF + x_freq_low = x_freq * LPF + noise_freq_high = noise_freq * HPF + x_freq_mixed = x_freq_low + noise_freq_high # mix in freq domain + + # IFFT + x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1)) + x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real + + return x_mixed + + +@dataclass +class PIAPipelineOutput(BaseOutput): + r""" + Output class for PIAPipeline. + + Args: + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + Nested list of length `batch_size` with denoised PIL image sequences of length `num_frames`, + NumPy array of shape `(batch_size, num_frames, channels, height, width, + Torch tensor of shape `(batch_size, num_frames, channels, height, width)`. + """ + + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] + + +class PIAPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, + FreeInitMixin, +): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents. + motion_adapter ([`MotionAdapter`]): + A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: Union[UNet2DConditionModel, UNetMotionModel], + scheduler: Union[ + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + ], + motion_adapter: Optional[MotionAdapter] = None, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + if isinstance(unet, UNet2DConditionModel): + unet = UNetMotionModel.from_unet2d(unet, motion_adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + motion_adapter=motion_adapter, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with num_images_per_prompt -> num_videos_per_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
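+                # For example, with `clip_skip=1` the indexing above selects `hidden_states[-2]`, i.e. the output of the pre-final encoder layer, before the final LayerNorm is applied below.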
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.prepare_latents + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_masked_condition( + self, + image, + batch_size, + num_channels_latents, + num_frames, + height, + width, + dtype, + device, + generator, + motion_scale=0, + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + _, _, _, scaled_height, scaled_width = shape + + image = self.image_processor.preprocess(image) + image = image.to(device, dtype) + + if isinstance(generator, list): + image_latent = [ + self.vae.encode(image[k : k + 1]).latent_dist.sample(generator[k]) for k in range(batch_size) + ] + image_latent = torch.cat(image_latent, dim=0) + else: + image_latent = self.vae.encode(image).latent_dist.sample(generator) + + image_latent = image_latent.to(device=device, dtype=dtype) + image_latent = torch.nn.functional.interpolate(image_latent, size=[scaled_height, scaled_width]) + image_latent_padding = image_latent.clone() * self.vae.config.scaling_factor + + mask = torch.zeros((batch_size, 1, num_frames, scaled_height, scaled_width)).to(device=device, dtype=dtype) + mask_coef = prepare_mask_coef_by_statistics(num_frames, 0, motion_scale) + masked_image = torch.zeros(batch_size, 4, num_frames, scaled_height, scaled_width).to( + device=device, dtype=self.unet.dtype + ) + for f in range(num_frames): + mask[:, :, f, :, :] = mask_coef[f] + masked_image[:, :, f, :, :] = image_latent_padding.clone() + + mask = torch.cat([mask] * 2) if self.do_classifier_free_guidance else mask + masked_image = torch.cat([masked_image] * 2) if self.do_classifier_free_guidance else masked_image + + return mask, masked_image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
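+    # Concretely, the denoising loop below combines the two noise predictions as
+    # `noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)`, which reduces to the
+    # plain text-conditional prediction when `guidance_scale == 1`.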
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: PipelineImageInput, + prompt: Union[str, List[str]] = None, + strength: float = 1.0, + num_frames: Optional[int] = 16, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + motion_scale: int = 0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PipelineImageInput`): + The input image to be used for video generation. + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + strength (`float`, *optional*, defaults to 1.0): Indicates extent to transform the reference `image`. Must be between 0 and 1. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. 
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of IP-Adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + motion_scale: (`int`, *optional*, defaults to 0): + Parameter that controls the amount and type of motion that is added to the image. Increasing the value increases the amount of motion, while specific + ranges of values control the type of motion that is added. Must be between 0 and 8. + Set between 0-2 to only increase the amount of motion. + Set between 3-5 to create looping motion. + Set between 6-8 to perform motion with image style transfer. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or + `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
+ + Examples: + + Returns: + [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.pia.pipeline_pia.PIAPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt) + self._num_timesteps = len(timesteps) + + # 5. Prepare latent variables + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + 4, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents=latents, + ) + mask, masked_image = self.prepare_masked_condition( + image, + batch_size * num_videos_per_prompt, + 4, + num_frames=num_frames, + height=height, + width=width, + dtype=self.unet.dtype, + device=device, + generator=generator, + motion_scale=motion_scale, + ) + if strength < 1.0: + noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype) + latents = self.scheduler.add_noise(masked_image[0], noise, latent_timestep) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 8. 
Denoising loop + num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1 + for free_init_iter in range(num_free_init_iters): + if self.free_init_enabled: + latents, timesteps = self._apply_free_init( + latents, free_init_iter, num_inference_steps, device, latents.dtype, generator + ) + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + # 9. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type=output_type) + + # 10. Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return PIAPipelineOutput(frames=video) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py new file mode 100644 index 000000000..b1035c1f2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py @@ -0,0 +1,616 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import inspect +import os +from typing import Any, Dict, List, Optional, Union + +import flax +import numpy as np +import PIL.Image +from flax.core.frozen_dict import FrozenDict +from huggingface_hub import create_repo, snapshot_download +from huggingface_hub.utils import validate_hf_hub_args +from PIL import Image +from tqdm.auto import tqdm + +from ..configuration_utils import ConfigMixin +from ..models.modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin +from ..schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin +from ..utils import ( + CONFIG_NAME, + BaseOutput, + PushToHubMixin, + http_user_agent, + is_transformers_available, + logging, +) + + +if is_transformers_available(): + from transformers import FlaxPreTrainedModel + +INDEX_FILE = "diffusion_flax_model.bin" + + +logger = logging.get_logger(__name__) + + +LOADABLE_CLASSES = { + "diffusers": { + "FlaxModelMixin": ["save_pretrained", "from_pretrained"], + "FlaxSchedulerMixin": ["save_pretrained", "from_pretrained"], + "FlaxDiffusionPipeline": ["save_pretrained", "from_pretrained"], + }, + "transformers": { + "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"], + "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"], + "FlaxPreTrainedModel": ["save_pretrained", "from_pretrained"], + "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"], + "ProcessorMixin": ["save_pretrained", "from_pretrained"], + "ImageProcessingMixin": ["save_pretrained", "from_pretrained"], + }, +} + +ALL_IMPORTABLE_CLASSES = {} +for library in LOADABLE_CLASSES: + ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library]) + + +def import_flax_or_no_model(module, class_name): + try: + # 1. First make sure that if a Flax object is present, import this one + class_obj = getattr(module, "Flax" + class_name) + except AttributeError: + # 2. If this doesn't work, it's not a model and we don't append "Flax" + class_obj = getattr(module, class_name) + except AttributeError: + raise ValueError(f"Neither Flax{class_name} nor {class_name} exist in {module}") + + return class_obj + + +@flax.struct.dataclass +class FlaxImagePipelineOutput(BaseOutput): + """ + Output class for image pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + + +class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin): + r""" + Base class for Flax-based pipelines. + + [`FlaxDiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and + provides methods for loading, downloading and saving models. It also includes methods to: + + - enable/disable the progress bar for the denoising iteration + + Class attributes: + + - **config_name** ([`str`]) -- The configuration filename that stores the class and module names of all the + diffusion pipeline's components. 
+ """ + + config_name = "model_index.json" + + def register_modules(self, **kwargs): + # import it here to avoid circular import + from diffusers import pipelines + + for name, module in kwargs.items(): + if module is None: + register_dict = {name: (None, None)} + else: + # retrieve library + library = module.__module__.split(".")[0] + + # check if the module is a pipeline module + pipeline_dir = module.__module__.split(".")[-2] + path = module.__module__.split(".") + is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) + + # if library is not in LOADABLE_CLASSES, then it is a custom module. + # Or if it's a pipeline module, then the module is inside the pipeline + # folder so we set the library to module name. + if library not in LOADABLE_CLASSES or is_pipeline_module: + library = pipeline_dir + + # retrieve class_name + class_name = module.__class__.__name__ + + register_dict = {name: (library, class_name)} + + # save model index config + self.register_to_config(**register_dict) + + # set models + setattr(self, name, module) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + params: Union[Dict, FrozenDict], + push_to_hub: bool = False, + **kwargs, + ): + # TODO: handle inference_state + """ + Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its + class implements both a save and loading method. The pipeline is easily reloaded using the + [`~FlaxDiffusionPipeline.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
+ """ + self.save_config(save_directory) + + model_index_dict = dict(self.config) + model_index_dict.pop("_class_name") + model_index_dict.pop("_diffusers_version") + model_index_dict.pop("_module", None) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", False) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + for pipeline_component_name in model_index_dict.keys(): + sub_model = getattr(self, pipeline_component_name) + if sub_model is None: + # edge case for saving a pipeline with safety_checker=None + continue + + model_cls = sub_model.__class__ + + save_method_name = None + # search for the model's base class in LOADABLE_CLASSES + for library_name, library_classes in LOADABLE_CLASSES.items(): + library = importlib.import_module(library_name) + for base_class, save_load_methods in library_classes.items(): + class_candidate = getattr(library, base_class, None) + if class_candidate is not None and issubclass(model_cls, class_candidate): + # if we found a suitable base class in LOADABLE_CLASSES then grab its save method + save_method_name = save_load_methods[0] + break + if save_method_name is not None: + break + + save_method = getattr(sub_model, save_method_name) + expects_params = "params" in set(inspect.signature(save_method).parameters.keys()) + + if expects_params: + save_method( + os.path.join(save_directory, pipeline_component_name), params=params[pipeline_component_name] + ) + else: + save_method(os.path.join(save_directory, pipeline_component_name)) + + if push_to_hub: + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a Flax-based diffusion pipeline from pretrained pipeline weights. + + The pipeline is set in evaluation mode (`model.eval()) by default and dropout modules are deactivated. + + If you get the error message below, you need to finetune the weights for your downstream task: + + ``` + Some weights of FlaxUNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + ``` + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *repo id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + using [`~FlaxDiffusionPipeline.save_pretrained`]. + dtype (`str` or `jnp.dtype`, *optional*): + Override the default `jnp.dtype` and load the model under this dtype. If `"auto"`, the dtype is + automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. 
+ proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (the pipeline components) of the specific pipeline + class. The overwritten components are passed directly to the pipelines `__init__` method. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. + + + + Examples: + + ```py + >>> from diffusers import FlaxDiffusionPipeline + + >>> # Download pipeline from huggingface.co and cache. + >>> # Requires to be logged in to Hugging Face hub, + >>> # see more in [the documentation](https://huggingface.co/docs/hub/security-tokens) + >>> pipeline, params = FlaxDiffusionPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", + ... revision="bf16", + ... dtype=jnp.bfloat16, + ... ) + + >>> # Download pipeline, but use a different scheduler + >>> from diffusers import FlaxDPMSolverMultistepScheduler + + >>> model_id = "runwayml/stable-diffusion-v1-5" + >>> dpmpp, dpmpp_state = FlaxDPMSolverMultistepScheduler.from_pretrained( + ... model_id, + ... subfolder="scheduler", + ... ) + + >>> dpm_pipe, dpm_params = FlaxStableDiffusionPipeline.from_pretrained( + ... model_id, revision="bf16", dtype=jnp.bfloat16, scheduler=dpmpp + ... ) + >>> dpm_params["scheduler"] = dpmpp_state + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + from_pt = kwargs.pop("from_pt", False) + use_memory_efficient_attention = kwargs.pop("use_memory_efficient_attention", False) + split_head_dim = kwargs.pop("split_head_dim", False) + dtype = kwargs.pop("dtype", None) + + # 1. 
Download the checkpoints and configs + # use snapshot download here to get it working from from_pretrained + if not os.path.isdir(pretrained_model_name_or_path): + config_dict = cls.load_config( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + ) + # make sure we only download sub-folders and `diffusers` filenames + folder_names = [k for k in config_dict.keys() if not k.startswith("_")] + allow_patterns = [os.path.join(k, "*") for k in folder_names] + allow_patterns += [FLAX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, cls.config_name] + + ignore_patterns = ["*.bin", "*.safetensors"] if not from_pt else [] + ignore_patterns += ["*.onnx", "*.onnx_data", "*.xml", "*.pb"] + + if cls != FlaxDiffusionPipeline: + requested_pipeline_class = cls.__name__ + else: + requested_pipeline_class = config_dict.get("_class_name", cls.__name__) + requested_pipeline_class = ( + requested_pipeline_class + if requested_pipeline_class.startswith("Flax") + else "Flax" + requested_pipeline_class + ) + + user_agent = {"pipeline_class": requested_pipeline_class} + user_agent = http_user_agent(user_agent) + + # download all allow_patterns + cached_folder = snapshot_download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + user_agent=user_agent, + ) + else: + cached_folder = pretrained_model_name_or_path + + config_dict = cls.load_config(cached_folder) + + # 2. Load the pipeline class, if using custom module then load it from the hub + # if we load from explicit class, let's use it + if cls != FlaxDiffusionPipeline: + pipeline_class = cls + else: + diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) + class_name = ( + config_dict["_class_name"] + if config_dict["_class_name"].startswith("Flax") + else "Flax" + config_dict["_class_name"] + ) + pipeline_class = getattr(diffusers_module, class_name) + + # some modules can be passed directly to the init + # in this case they are already instantiated in `kwargs` + # extract them here + expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} + + init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) + + # define init kwargs + init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} + init_kwargs = {**init_kwargs, **passed_pipe_kwargs} + + # remove `null` components + def load_module(name, value): + if value[0] is None: + return False + if name in passed_class_obj and passed_class_obj[name] is None: + return False + return True + + init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)} + + # Throw nice warnings / errors for fast accelerate loading + if len(unused_kwargs) > 0: + logger.warning( + f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored." + ) + + # inference_params + params = {} + + # import it here to avoid circular import + from diffusers import pipelines + + # 3. 
Load each module in the pipeline + for name, (library_name, class_name) in init_dict.items(): + if class_name is None: + # edge case for when the pipeline was saved with safety_checker=None + init_kwargs[name] = None + continue + + is_pipeline_module = hasattr(pipelines, library_name) + loaded_sub_model = None + sub_model_should_be_defined = True + + # if the model is in a pipeline module, then we load it from the pipeline + if name in passed_class_obj: + # 1. check that passed_class_obj has correct parent class + if not is_pipeline_module: + library = importlib.import_module(library_name) + class_obj = getattr(library, class_name) + importable_classes = LOADABLE_CLASSES[library_name] + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} + + expected_class_obj = None + for class_name, class_candidate in class_candidates.items(): + if class_candidate is not None and issubclass(class_obj, class_candidate): + expected_class_obj = class_candidate + + if not issubclass(passed_class_obj[name].__class__, expected_class_obj): + raise ValueError( + f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be" + f" {expected_class_obj}" + ) + elif passed_class_obj[name] is None: + logger.warning( + f"You have passed `None` for {name} to disable its functionality in {pipeline_class}. Note" + f" that this might lead to problems when using {pipeline_class} and is not recommended." + ) + sub_model_should_be_defined = False + else: + logger.warning( + f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it" + " has the correct type" + ) + + # set passed class object + loaded_sub_model = passed_class_obj[name] + elif is_pipeline_module: + pipeline_module = getattr(pipelines, library_name) + class_obj = import_flax_or_no_model(pipeline_module, class_name) + + importable_classes = ALL_IMPORTABLE_CLASSES + class_candidates = {c: class_obj for c in importable_classes.keys()} + else: + # else we just import it from the library. + library = importlib.import_module(library_name) + class_obj = import_flax_or_no_model(library, class_name) + + importable_classes = LOADABLE_CLASSES[library_name] + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} + + if loaded_sub_model is None and sub_model_should_be_defined: + load_method_name = None + for class_name, class_candidate in class_candidates.items(): + if class_candidate is not None and issubclass(class_obj, class_candidate): + load_method_name = importable_classes[class_name][1] + + load_method = getattr(class_obj, load_method_name) + + # check if the module is in a subdirectory + if os.path.isdir(os.path.join(cached_folder, name)): + loadable_folder = os.path.join(cached_folder, name) + else: + loaded_sub_model = cached_folder + + if issubclass(class_obj, FlaxModelMixin): + loaded_sub_model, loaded_params = load_method( + loadable_folder, + from_pt=from_pt, + use_memory_efficient_attention=use_memory_efficient_attention, + split_head_dim=split_head_dim, + dtype=dtype, + ) + params[name] = loaded_params + elif is_transformers_available() and issubclass(class_obj, FlaxPreTrainedModel): + if from_pt: + # TODO(Suraj): Fix this in Transformers. 
We should be able to use `_do_init=False` here + loaded_sub_model = load_method(loadable_folder, from_pt=from_pt) + loaded_params = loaded_sub_model.params + del loaded_sub_model._params + else: + loaded_sub_model, loaded_params = load_method(loadable_folder, _do_init=False) + params[name] = loaded_params + elif issubclass(class_obj, FlaxSchedulerMixin): + loaded_sub_model, scheduler_state = load_method(loadable_folder) + params[name] = scheduler_state + else: + loaded_sub_model = load_method(loadable_folder) + + init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) + + # 4. Potentially add passed objects if expected + missing_modules = set(expected_modules) - set(init_kwargs.keys()) + passed_modules = list(passed_class_obj.keys()) + + if len(missing_modules) > 0 and missing_modules <= set(passed_modules): + for module in missing_modules: + init_kwargs[module] = passed_class_obj.get(module, None) + elif len(missing_modules) > 0: + passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs + raise ValueError( + f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." + ) + + model = pipeline_class(**init_kwargs, dtype=dtype) + return model, params + + @classmethod + def _get_signature_keys(cls, obj): + parameters = inspect.signature(obj.__init__).parameters + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) + expected_modules = set(required_parameters.keys()) - {"self"} + + return expected_modules, optional_parameters + + @property + def components(self) -> Dict[str, Any]: + r""" + + The `self.components` property can be useful to run different pipelines with the same weights and + configurations to not have to re-allocate memory. + + Examples: + + ```py + >>> from diffusers import ( + ... FlaxStableDiffusionPipeline, + ... FlaxStableDiffusionImg2ImgPipeline, + ... ) + + >>> text2img = FlaxStableDiffusionPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", revision="bf16", dtype=jnp.bfloat16 + ... ) + >>> img2img = FlaxStableDiffusionImg2ImgPipeline(**text2img.components) + ``` + + Returns: + A dictionary containing all the modules needed to initialize the pipeline. + """ + expected_modules, optional_parameters = self._get_signature_keys(self) + components = { + k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters + } + + if set(components.keys()) != expected_modules: + raise ValueError( + f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected" + f" {expected_modules} to be defined, but {components} are defined." + ) + + return components + + @staticmethod + def numpy_to_pil(images): + """ + Convert a NumPy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
+ images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + # TODO: make it compatible with jax.lax + def progress_bar(self, iterable): + if not hasattr(self, "_progress_bar_config"): + self._progress_bar_config = {} + elif not isinstance(self._progress_bar_config, dict): + raise ValueError( + f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." + ) + + return tqdm(iterable, **self._progress_bar_config) + + def set_progress_bar_config(self, **kwargs): + self._progress_bar_config = kwargs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py new file mode 100644 index 000000000..30c17eec1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py @@ -0,0 +1,508 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import importlib +import os +import re +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import torch +from huggingface_hub import ( + model_info, +) +from packaging import version + +from ..utils import ( + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, + get_class_from_dynamic_module, + is_peft_available, + is_transformers_available, + logging, +) +from ..utils.torch_utils import is_compiled_module + + +if is_transformers_available(): + import transformers + from transformers import PreTrainedModel + from transformers.utils import FLAX_WEIGHTS_NAME as TRANSFORMERS_FLAX_WEIGHTS_NAME + from transformers.utils import SAFE_WEIGHTS_NAME as TRANSFORMERS_SAFE_WEIGHTS_NAME + from transformers.utils import WEIGHTS_NAME as TRANSFORMERS_WEIGHTS_NAME +from huggingface_hub.utils import validate_hf_hub_args + +from ..utils import FLAX_WEIGHTS_NAME, ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME + + +INDEX_FILE = "diffusion_pytorch_model.bin" +CUSTOM_PIPELINE_FILE_NAME = "pipeline.py" +DUMMY_MODULES_FOLDER = "diffusers.utils" +TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils" +CONNECTED_PIPES_KEYS = ["prior"] + +logger = logging.get_logger(__name__) + +LOADABLE_CLASSES = { + "diffusers": { + "ModelMixin": ["save_pretrained", "from_pretrained"], + "SchedulerMixin": ["save_pretrained", "from_pretrained"], + "DiffusionPipeline": ["save_pretrained", "from_pretrained"], + "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"], + }, + "transformers": { + "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"], + "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"], + "PreTrainedModel": ["save_pretrained", "from_pretrained"], + "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"], + "ProcessorMixin": ["save_pretrained", "from_pretrained"], + "ImageProcessingMixin": ["save_pretrained", "from_pretrained"], + }, + "onnxruntime.training": { + "ORTModule": ["save_pretrained", "from_pretrained"], + }, +} + +ALL_IMPORTABLE_CLASSES = {} +for library in LOADABLE_CLASSES: + ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library]) + + +def is_safetensors_compatible(filenames, variant=None, passed_components=None) -> bool: + """ + Checking for safetensors compatibility: + - By default, all models are saved with the default pytorch serialization, so we use the list of default pytorch + files to know which safetensors files are needed. + - The model is safetensors compatible only if there is a matching safetensors file for every default pytorch file. 
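+      (For example, a checkpoint that ships `text_encoder/pytorch_model.bin` is treated as compatible only if `text_encoder/model.safetensors` is also present; the filename mapping used for this check is described below.)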
+ + Converting default pytorch serialized filenames to safetensors serialized filenames: + - For models from the diffusers library, just replace the ".bin" extension with ".safetensors" + - For models from the transformers library, the filename changes from "pytorch_model" to "model", and the ".bin" + extension is replaced with ".safetensors" + """ + pt_filenames = [] + + sf_filenames = set() + + passed_components = passed_components or [] + + for filename in filenames: + _, extension = os.path.splitext(filename) + + if len(filename.split("/")) == 2 and filename.split("/")[0] in passed_components: + continue + + if extension == ".bin": + pt_filenames.append(os.path.normpath(filename)) + elif extension == ".safetensors": + sf_filenames.add(os.path.normpath(filename)) + + for filename in pt_filenames: + # filename = 'foo/bar/baz.bam' -> path = 'foo/bar', filename = 'baz', extension = '.bam' + path, filename = os.path.split(filename) + filename, extension = os.path.splitext(filename) + + if filename.startswith("pytorch_model"): + filename = filename.replace("pytorch_model", "model") + else: + filename = filename + + expected_sf_filename = os.path.normpath(os.path.join(path, filename)) + expected_sf_filename = f"{expected_sf_filename}.safetensors" + if expected_sf_filename not in sf_filenames: + logger.warning(f"{expected_sf_filename} not found") + return False + + return True + + +def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLike], str]: + weight_names = [ + WEIGHTS_NAME, + SAFETENSORS_WEIGHTS_NAME, + FLAX_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + ONNX_EXTERNAL_WEIGHTS_NAME, + ] + + if is_transformers_available(): + weight_names += [TRANSFORMERS_WEIGHTS_NAME, TRANSFORMERS_SAFE_WEIGHTS_NAME, TRANSFORMERS_FLAX_WEIGHTS_NAME] + + # model_pytorch, diffusion_model_pytorch, ... + weight_prefixes = [w.split(".")[0] for w in weight_names] + # .bin, .safetensors, ... 
+ weight_suffixs = [w.split(".")[-1] for w in weight_names] + # -00001-of-00002 + transformers_index_format = r"\d{5}-of-\d{5}" + + if variant is not None: + # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors` + variant_file_re = re.compile( + rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" + ) + # `text_encoder/pytorch_model.bin.index.fp16.json` + variant_index_re = re.compile( + rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" + ) + + # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors` + non_variant_file_re = re.compile( + rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" + ) + # `text_encoder/pytorch_model.bin.index.json` + non_variant_index_re = re.compile(rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.json") + + if variant is not None: + variant_weights = {f for f in filenames if variant_file_re.match(f.split("/")[-1]) is not None} + variant_indexes = {f for f in filenames if variant_index_re.match(f.split("/")[-1]) is not None} + variant_filenames = variant_weights | variant_indexes + else: + variant_filenames = set() + + non_variant_weights = {f for f in filenames if non_variant_file_re.match(f.split("/")[-1]) is not None} + non_variant_indexes = {f for f in filenames if non_variant_index_re.match(f.split("/")[-1]) is not None} + non_variant_filenames = non_variant_weights | non_variant_indexes + + # all variant filenames will be used by default + usable_filenames = set(variant_filenames) + + def convert_to_variant(filename): + if "index" in filename: + variant_filename = filename.replace("index", f"index.{variant}") + elif re.compile(f"^(.*?){transformers_index_format}").match(filename) is not None: + variant_filename = f"{filename.split('-')[0]}.{variant}-{'-'.join(filename.split('-')[1:])}" + else: + variant_filename = f"{filename.split('.')[0]}.{variant}.{filename.split('.')[1]}" + return variant_filename + + for f in non_variant_filenames: + variant_filename = convert_to_variant(f) + if variant_filename not in usable_filenames: + usable_filenames.add(f) + + return usable_filenames, variant_filenames + + +@validate_hf_hub_args +def warn_deprecated_model_variant(pretrained_model_name_or_path, token, variant, revision, model_filenames): + info = model_info( + pretrained_model_name_or_path, + token=token, + revision=None, + ) + filenames = {sibling.rfilename for sibling in info.siblings} + comp_model_filenames, _ = variant_compatible_siblings(filenames, variant=revision) + comp_model_filenames = [".".join(f.split(".")[:1] + f.split(".")[2:]) for f in comp_model_filenames] + + if set(model_filenames).issubset(set(comp_model_filenames)): + warnings.warn( + f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` even though you can load it via `variant=`{revision}`. Loading model variants via `revision='{revision}'` is deprecated and will be removed in diffusers v1. Please use `variant='{revision}'` instead.", + FutureWarning, + ) + else: + warnings.warn( + f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. 
However, it appears that {pretrained_model_name_or_path} currently does not have the required variant filenames in the 'main' branch. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {revision} files' so that the correct variant file can be added.", + FutureWarning, + ) + + +def _unwrap_model(model): + """Unwraps a model.""" + if is_compiled_module(model): + model = model._orig_mod + + if is_peft_available(): + from peft import PeftModel + + if isinstance(model, PeftModel): + model = model.base_model.model + + return model + + +def maybe_raise_or_warn( + library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module +): + """Simple helper method to raise or warn in case incorrect module has been passed""" + if not is_pipeline_module: + library = importlib.import_module(library_name) + class_obj = getattr(library, class_name) + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} + + expected_class_obj = None + for class_name, class_candidate in class_candidates.items(): + if class_candidate is not None and issubclass(class_obj, class_candidate): + expected_class_obj = class_candidate + + # Dynamo wraps the original model in a private class. + # I didn't find a public API to get the original class. + sub_model = passed_class_obj[name] + unwrapped_sub_model = _unwrap_model(sub_model) + model_cls = unwrapped_sub_model.__class__ + + if not issubclass(model_cls, expected_class_obj): + raise ValueError( + f"{passed_class_obj[name]} is of type: {model_cls}, but should be" f" {expected_class_obj}" + ) + else: + logger.warning( + f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it" + " has the correct type" + ) + + +def get_class_obj_and_candidates( + library_name, class_name, importable_classes, pipelines, is_pipeline_module, component_name=None, cache_dir=None +): + """Simple helper method to retrieve class object of module as well as potential parent class objects""" + component_folder = os.path.join(cache_dir, component_name) + + if is_pipeline_module: + pipeline_module = getattr(pipelines, library_name) + + class_obj = getattr(pipeline_module, class_name) + class_candidates = {c: class_obj for c in importable_classes.keys()} + elif os.path.isfile(os.path.join(component_folder, library_name + ".py")): + # load custom component + class_obj = get_class_from_dynamic_module( + component_folder, module_file=library_name + ".py", class_name=class_name + ) + class_candidates = {c: class_obj for c in importable_classes.keys()} + else: + # else we just import it from the library. 
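+ # (illustrative) e.g. library_name="transformers", class_name="CLIPTextModel" resolves to
+ # getattr(importlib.import_module("transformers"), "CLIPTextModel"); the names come from
+ # the pipeline's model_index.json.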
+ library = importlib.import_module(library_name) + + class_obj = getattr(library, class_name) + class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} + + return class_obj, class_candidates + + +def _get_pipeline_class( + class_obj, + config=None, + load_connected_pipeline=False, + custom_pipeline=None, + repo_id=None, + hub_revision=None, + class_name=None, + cache_dir=None, + revision=None, +): + if custom_pipeline is not None: + if custom_pipeline.endswith(".py"): + path = Path(custom_pipeline) + # decompose into folder & file + file_name = path.name + custom_pipeline = path.parent.absolute() + elif repo_id is not None: + file_name = f"{custom_pipeline}.py" + custom_pipeline = repo_id + else: + file_name = CUSTOM_PIPELINE_FILE_NAME + + if repo_id is not None and hub_revision is not None: + # if we load the pipeline code from the Hub + # make sure to overwrite the `revision` + revision = hub_revision + + return get_class_from_dynamic_module( + custom_pipeline, + module_file=file_name, + class_name=class_name, + cache_dir=cache_dir, + revision=revision, + ) + + if class_obj.__name__ != "DiffusionPipeline": + return class_obj + + diffusers_module = importlib.import_module(class_obj.__module__.split(".")[0]) + class_name = class_name or config["_class_name"] + if not class_name: + raise ValueError( + "The class name could not be found in the configuration file. Please make sure to pass the correct `class_name`." + ) + + class_name = class_name[4:] if class_name.startswith("Flax") else class_name + + pipeline_cls = getattr(diffusers_module, class_name) + + if load_connected_pipeline: + from .auto_pipeline import _get_connected_pipeline + + connected_pipeline_cls = _get_connected_pipeline(pipeline_cls) + if connected_pipeline_cls is not None: + logger.info( + f"Loading connected pipeline {connected_pipeline_cls.__name__} instead of {pipeline_cls.__name__} as specified via `load_connected_pipeline=True`" + ) + else: + logger.info(f"{pipeline_cls.__name__} has no connected pipeline class. 
Loading {pipeline_cls.__name__}.") + + pipeline_cls = connected_pipeline_cls or pipeline_cls + + return pipeline_cls + + +def load_sub_model( + library_name: str, + class_name: str, + importable_classes: List[Any], + pipelines: Any, + is_pipeline_module: bool, + pipeline_class: Any, + torch_dtype: torch.dtype, + provider: Any, + sess_options: Any, + device_map: Optional[Union[Dict[str, torch.device], str]], + max_memory: Optional[Dict[Union[int, str], Union[int, str]]], + offload_folder: Optional[Union[str, os.PathLike]], + offload_state_dict: bool, + model_variants: Dict[str, str], + name: str, + from_flax: bool, + variant: str, + low_cpu_mem_usage: bool, + cached_folder: Union[str, os.PathLike], +): + """Helper method to load the module `name` from `library_name` and `class_name`""" + # retrieve class candidates + class_obj, class_candidates = get_class_obj_and_candidates( + library_name, + class_name, + importable_classes, + pipelines, + is_pipeline_module, + component_name=name, + cache_dir=cached_folder, + ) + + load_method_name = None + # retrieve load method name + for class_name, class_candidate in class_candidates.items(): + if class_candidate is not None and issubclass(class_obj, class_candidate): + load_method_name = importable_classes[class_name][1] + + # if load method name is None, then we have a dummy module -> raise Error + if load_method_name is None: + none_module = class_obj.__module__ + is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith( + TRANSFORMERS_DUMMY_MODULES_FOLDER + ) + if is_dummy_path and "dummy" in none_module: + # call class_obj for nice error message of missing requirements + class_obj() + + raise ValueError( + f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have" + f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}." + ) + + load_method = getattr(class_obj, load_method_name) + + # add kwargs to loading method + diffusers_module = importlib.import_module(__name__.split(".")[0]) + loading_kwargs = {} + if issubclass(class_obj, torch.nn.Module): + loading_kwargs["torch_dtype"] = torch_dtype + if issubclass(class_obj, diffusers_module.OnnxRuntimeModel): + loading_kwargs["provider"] = provider + loading_kwargs["sess_options"] = sess_options + + is_diffusers_model = issubclass(class_obj, diffusers_module.ModelMixin) + + if is_transformers_available(): + transformers_version = version.parse(version.parse(transformers.__version__).base_version) + else: + transformers_version = "N/A" + + is_transformers_model = ( + is_transformers_available() + and issubclass(class_obj, PreTrainedModel) + and transformers_version >= version.parse("4.20.0") + ) + + # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers. + # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default. + # This makes sure that the weights won't be initialized which significantly speeds up loading. 
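+ # The kwargs assembled below (device_map, max_memory, offload_folder, offload_state_dict, variant)
+ # are only accepted by diffusers `ModelMixin.from_pretrained` and transformers
+ # `PreTrainedModel.from_pretrained`, hence the guard on is_diffusers_model / is_transformers_model.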
+ if is_diffusers_model or is_transformers_model: + loading_kwargs["device_map"] = device_map + loading_kwargs["max_memory"] = max_memory + loading_kwargs["offload_folder"] = offload_folder + loading_kwargs["offload_state_dict"] = offload_state_dict + loading_kwargs["variant"] = model_variants.pop(name, None) + + if from_flax: + loading_kwargs["from_flax"] = True + + # the following can be deleted once the minimum required `transformers` version + # is higher than 4.27 + if ( + is_transformers_model + and loading_kwargs["variant"] is not None + and transformers_version < version.parse("4.27.0") + ): + raise ImportError( + f"When passing `variant='{variant}'`, please make sure to upgrade your `transformers` version to at least 4.27.0.dev0" + ) + elif is_transformers_model and loading_kwargs["variant"] is None: + loading_kwargs.pop("variant") + + # if `from_flax` and model is transformer model, can currently not load with `low_cpu_mem_usage` + if not (from_flax and is_transformers_model): + loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage + else: + loading_kwargs["low_cpu_mem_usage"] = False + + # check if the module is in a subdirectory + if os.path.isdir(os.path.join(cached_folder, name)): + loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) + else: + # else load from the root directory + loaded_sub_model = load_method(cached_folder, **loading_kwargs) + + return loaded_sub_model + + +def _fetch_class_library_tuple(module): + # import it here to avoid circular import + diffusers_module = importlib.import_module(__name__.split(".")[0]) + pipelines = getattr(diffusers_module, "pipelines") + + # register the config from the original module, not the dynamo compiled one + not_compiled_module = _unwrap_model(module) + library = not_compiled_module.__module__.split(".")[0] + + # check if the module is a pipeline module + module_path_items = not_compiled_module.__module__.split(".") + pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None + + path = not_compiled_module.__module__.split(".") + is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) + + # if library is not in LOADABLE_CLASSES, then it is a custom module. + # Or if it's a pipeline module, then the module is inside the pipeline + # folder so we set the library to module name. + if is_pipeline_module: + library = pipeline_dir + elif library not in LOADABLE_CLASSES: + library = not_compiled_module.__module__ + + # retrieve class_name + class_name = not_compiled_module.__class__.__name__ + + return (library, class_name) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_utils.py new file mode 100644 index 000000000..341360d4f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_utils.py @@ -0,0 +1,1771 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import fnmatch +import importlib +import inspect +import os +import re +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import requests +import torch +from huggingface_hub import ( + ModelCard, + create_repo, + hf_hub_download, + model_info, + snapshot_download, +) +from huggingface_hub.utils import OfflineModeIsEnabled, validate_hf_hub_args +from packaging import version +from requests.exceptions import HTTPError +from tqdm.auto import tqdm + +from .. import __version__ +from ..configuration_utils import ConfigMixin +from ..models import AutoencoderKL +from ..models.attention_processor import FusedAttnProcessor2_0 +from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT +from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME +from ..utils import ( + CONFIG_NAME, + DEPRECATED_REVISION_ARGS, + BaseOutput, + PushToHubMixin, + deprecate, + is_accelerate_available, + is_accelerate_version, + is_torch_npu_available, + is_torch_version, + logging, + numpy_to_pil, +) +from ..utils.hub_utils import load_or_create_model_card, populate_model_card +from ..utils.torch_utils import is_compiled_module + + +if is_torch_npu_available(): + import torch_npu # noqa: F401 + + +from .pipeline_loading_utils import ( + ALL_IMPORTABLE_CLASSES, + CONNECTED_PIPES_KEYS, + CUSTOM_PIPELINE_FILE_NAME, + LOADABLE_CLASSES, + _fetch_class_library_tuple, + _get_pipeline_class, + _unwrap_model, + is_safetensors_compatible, + load_sub_model, + maybe_raise_or_warn, + variant_compatible_siblings, + warn_deprecated_model_variant, +) + + +if is_accelerate_available(): + import accelerate + + +LIBRARIES = [] +for library in LOADABLE_CLASSES: + LIBRARIES.append(library) + +logger = logging.get_logger(__name__) + + +@dataclass +class ImagePipelineOutput(BaseOutput): + """ + Output class for image pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + + +@dataclass +class AudioPipelineOutput(BaseOutput): + """ + Output class for audio pipelines. + + Args: + audios (`np.ndarray`) + List of denoised audio samples of a NumPy array of shape `(batch_size, num_channels, sample_rate)`. + """ + + audios: np.ndarray + + +class DiffusionPipeline(ConfigMixin, PushToHubMixin): + r""" + Base class for all pipelines. + + [`DiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and + provides methods for loading, downloading and saving models. It also includes methods to: + + - move all PyTorch modules to the device of your choice + - enable/disable the progress bar for the denoising iteration + + Class attributes: + + - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the + diffusion pipeline's components. 
+ - **_optional_components** (`List[str]`) -- List of all optional components that don't have to be passed to the + pipeline to function (should be overridden by subclasses). + """ + + config_name = "model_index.json" + model_cpu_offload_seq = None + _optional_components = [] + _exclude_from_cpu_offload = [] + _load_connected_pipes = False + _is_onnx = False + + def register_modules(self, **kwargs): + for name, module in kwargs.items(): + # retrieve library + if module is None or isinstance(module, (tuple, list)) and module[0] is None: + register_dict = {name: (None, None)} + else: + library, class_name = _fetch_class_library_tuple(module) + register_dict = {name: (library, class_name)} + + # save model index config + self.register_to_config(**register_dict) + + # set models + setattr(self, name, module) + + def __setattr__(self, name: str, value: Any): + if name in self.__dict__ and hasattr(self.config, name): + # We need to overwrite the config if name exists in config + if isinstance(getattr(self.config, name), (tuple, list)): + if value is not None and self.config[name][0] is not None: + class_library_tuple = _fetch_class_library_tuple(value) + else: + class_library_tuple = (None, None) + + self.register_to_config(**{name: class_library_tuple}) + else: + self.register_to_config(**{name: value}) + + super().__setattr__(name, value) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + safe_serialization: bool = True, + variant: Optional[str] = None, + push_to_hub: bool = False, + **kwargs, + ): + """ + Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its + class implements both a save and loading method. The pipeline is easily reloaded using the + [`~DiffusionPipeline.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save a pipeline to. Will be created if it doesn't exist. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + variant (`str`, *optional*): + If specified, weights are saved in the format `pytorch_model..bin`. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
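+
+ Example (an illustrative sketch added to this vendored copy; the repo id and output directory
+ below are placeholders):
+
+ ```py
+ >>> from diffusers import DiffusionPipeline
+
+ >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+ >>> pipeline.save_pretrained("./my_pipeline_directory", safe_serialization=True)
+ ```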
+ """ + model_index_dict = dict(self.config) + model_index_dict.pop("_class_name", None) + model_index_dict.pop("_diffusers_version", None) + model_index_dict.pop("_module", None) + model_index_dict.pop("_name_or_path", None) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", False) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + expected_modules, optional_kwargs = self._get_signature_keys(self) + + def is_saveable_module(name, value): + if name not in expected_modules: + return False + if name in self._optional_components and value[0] is None: + return False + return True + + model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} + for pipeline_component_name in model_index_dict.keys(): + sub_model = getattr(self, pipeline_component_name) + model_cls = sub_model.__class__ + + # Dynamo wraps the original model in a private class. + # I didn't find a public API to get the original class. + if is_compiled_module(sub_model): + sub_model = _unwrap_model(sub_model) + model_cls = sub_model.__class__ + + save_method_name = None + # search for the model's base class in LOADABLE_CLASSES + for library_name, library_classes in LOADABLE_CLASSES.items(): + if library_name in sys.modules: + library = importlib.import_module(library_name) + else: + logger.info( + f"{library_name} is not installed. Cannot save {pipeline_component_name} as {library_classes} from {library_name}" + ) + + for base_class, save_load_methods in library_classes.items(): + class_candidate = getattr(library, base_class, None) + if class_candidate is not None and issubclass(model_cls, class_candidate): + # if we found a suitable base class in LOADABLE_CLASSES then grab its save method + save_method_name = save_load_methods[0] + break + if save_method_name is not None: + break + + if save_method_name is None: + logger.warning( + f"self.{pipeline_component_name}={sub_model} of type {type(sub_model)} cannot be saved." + ) + # make sure that unsaveable components are not tried to be loaded afterward + self.register_to_config(**{pipeline_component_name: (None, None)}) + continue + + save_method = getattr(sub_model, save_method_name) + + # Call the save method with the argument safe_serialization only if it's supported + save_method_signature = inspect.signature(save_method) + save_method_accept_safe = "safe_serialization" in save_method_signature.parameters + save_method_accept_variant = "variant" in save_method_signature.parameters + + save_kwargs = {} + if save_method_accept_safe: + save_kwargs["safe_serialization"] = safe_serialization + if save_method_accept_variant: + save_kwargs["variant"] = variant + + save_method(os.path.join(save_directory, pipeline_component_name), **save_kwargs) + + # finally save the config + self.save_config(save_directory) + + if push_to_hub: + # Create a new empty model card and eventually tag it + model_card = load_or_create_model_card(repo_id, token=token, is_pipeline=True) + model_card = populate_model_card(model_card) + model_card.save(os.path.join(save_directory, "README.md")) + + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) + + def to(self, *args, **kwargs): + r""" + Performs Pipeline dtype and/or device conversion. 
A torch.dtype and torch.device are inferred from the + arguments of `self.to(*args, **kwargs).` + + + + If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. Otherwise, + the returned pipeline is a copy of self with the desired torch.dtype and torch.device. + + + + + Here are the ways to call `to`: + + - `to(dtype, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the specified + [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype) + - `to(device, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the specified + [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) + - `to(device=None, dtype=None, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the + specified [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) and + [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype) + + Arguments: + dtype (`torch.dtype`, *optional*): + Returns a pipeline with the specified + [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype) + device (`torch.Device`, *optional*): + Returns a pipeline with the specified + [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) + silence_dtype_warnings (`str`, *optional*, defaults to `False`): + Whether to omit warnings if the target `dtype` is not compatible with the target `device`. + + Returns: + [`DiffusionPipeline`]: The pipeline converted to specified `dtype` and/or `dtype`. + """ + dtype = kwargs.pop("dtype", None) + device = kwargs.pop("device", None) + silence_dtype_warnings = kwargs.pop("silence_dtype_warnings", False) + + dtype_arg = None + device_arg = None + if len(args) == 1: + if isinstance(args[0], torch.dtype): + dtype_arg = args[0] + else: + device_arg = torch.device(args[0]) if args[0] is not None else None + elif len(args) == 2: + if isinstance(args[0], torch.dtype): + raise ValueError( + "When passing two arguments, make sure the first corresponds to `device` and the second to `dtype`." + ) + device_arg = torch.device(args[0]) if args[0] is not None else None + dtype_arg = args[1] + elif len(args) > 2: + raise ValueError("Please make sure to pass at most two arguments (`device` and `dtype`) `.to(...)`") + + if dtype is not None and dtype_arg is not None: + raise ValueError( + "You have passed `dtype` both as an argument and as a keyword argument. Please only pass one of the two." + ) + + dtype = dtype or dtype_arg + + if device is not None and device_arg is not None: + raise ValueError( + "You have passed `device` both as an argument and as a keyword argument. Please only pass one of the two." + ) + + device = device or device_arg + + # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU. 
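+ # Note: sequential offload (enable_sequential_cpu_offload) leaves weights on CPU/meta and is
+ # incompatible with a manual move to CUDA, so the check below raises; model offload
+ # (enable_model_cpu_offload) still works after a manual move but loses its memory savings,
+ # so that case only warns.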
+ def module_is_sequentially_offloaded(module): + if not is_accelerate_available() or is_accelerate_version("<", "0.14.0"): + return False + + return hasattr(module, "_hf_hook") and not isinstance( + module._hf_hook, (accelerate.hooks.CpuOffload, accelerate.hooks.AlignDevicesHook) + ) + + def module_is_offloaded(module): + if not is_accelerate_available() or is_accelerate_version("<", "0.17.0.dev0"): + return False + + return hasattr(module, "_hf_hook") and isinstance(module._hf_hook, accelerate.hooks.CpuOffload) + + # .to("cuda") would raise an error if the pipeline is sequentially offloaded, so we raise our own to make it clearer + pipeline_is_sequentially_offloaded = any( + module_is_sequentially_offloaded(module) for _, module in self.components.items() + ) + if pipeline_is_sequentially_offloaded and device and torch.device(device).type == "cuda": + raise ValueError( + "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading." + ) + + # Display a warning in this case (the operation succeeds but the benefits are lost) + pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items()) + if pipeline_is_offloaded and device and torch.device(device).type == "cuda": + logger.warning( + f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading." + ) + + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] + + is_offloaded = pipeline_is_offloaded or pipeline_is_sequentially_offloaded + for module in modules: + is_loaded_in_8bit = hasattr(module, "is_loaded_in_8bit") and module.is_loaded_in_8bit + + if is_loaded_in_8bit and dtype is not None: + logger.warning( + f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {dtype} is not yet supported. Module is still in 8bit precision." + ) + + if is_loaded_in_8bit and device is not None: + logger.warning( + f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {dtype} via `.to()` is not yet supported. Module is still on {module.device}." + ) + else: + module.to(device, dtype) + + if ( + module.dtype == torch.float16 + and str(device) in ["cpu"] + and not silence_dtype_warnings + and not is_offloaded + ): + logger.warning( + "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It" + " is not recommended to move them to `cpu` as running them will fail. Please make" + " sure to use an accelerator to run the pipeline in inference, due to the lack of" + " support for`float16` operations on this device in PyTorch. Please, remove the" + " `torch_dtype=torch.float16` argument, or use another device for inference." 
+ ) + return self + + @property + def device(self) -> torch.device: + r""" + Returns: + `torch.device`: The torch device on which the pipeline is located. + """ + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] + + for module in modules: + return module.device + + return torch.device("cpu") + + @property + def dtype(self) -> torch.dtype: + r""" + Returns: + `torch.dtype`: The torch dtype on which the pipeline is located. + """ + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] + + for module in modules: + return module.dtype + + return torch.float32 + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a PyTorch diffusion pipeline from pretrained pipeline weights. + + The pipeline is set in evaluation mode (`model.eval()`) by default. + + If you get the error message below, you need to finetune the weights for your downstream task: + + ``` + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights + saved using + [`~DiffusionPipeline.save_pretrained`]. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the + dtype is automatically derived from the model's weights. + custom_pipeline (`str`, *optional*): + + + + 🧪 This is an experimental feature and may change in the future. + + + + Can be either: + + - A string, the *repo id* (for example `hf-internal-testing/diffusers-dummy-pipeline`) of a custom + pipeline hosted on the Hub. The repository must contain a file called pipeline.py that defines + the custom pipeline. + - A string, the *file name* of a community pipeline hosted on GitHub under + [Community](https://github.com/huggingface/diffusers/tree/main/examples/community). Valid file + names must match the file name and not the pipeline script (`clip_guided_stable_diffusion` + instead of `clip_guided_stable_diffusion.py`). Community pipelines are always loaded from the + current main branch of GitHub. + - A path to a directory (`./my_pipeline_directory/`) containing a custom pipeline. The directory + must contain a file called `pipeline.py` that defines the custom pipeline. 
+ + For more information on how to load and create custom pipelines, please have a look at [Loading and + Adding Custom + Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + custom_revision (`str`, *optional*): + The specific model version to use. It can be a branch name, a tag name, or a commit id similar to + `revision` when loading a custom pipeline from the Hub. Defaults to the latest stable 🤗 Diffusers version. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn’t need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if device_map contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. 
+ low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the safetensors weights are downloaded if they're available **and** if the + safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors + weights. If set to `False`, safetensors weights are not loaded. + use_onnx (`bool`, *optional*, defaults to `None`): + If set to `True`, ONNX weights will always be downloaded if present. If set to `False`, ONNX weights + will never be downloaded. By default `use_onnx` defaults to the `_is_onnx` class attribute which is + `False` for non-ONNX pipelines and `True` for ONNX pipelines. ONNX weights include both files ending + with `.onnx` and `.pb`. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline + class). The overwritten components are passed directly to the pipelines `__init__` method. See example + below for more information. + variant (`str`, *optional*): + Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + + + + To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with + `huggingface-cli login`. + + + + Examples: + + ```py + >>> from diffusers import DiffusionPipeline + + >>> # Download pipeline from huggingface.co and cache. 
+ >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") + + >>> # Download pipeline that requires an authorization token + >>> # For more information on access tokens, please refer to this section + >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens) + >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + + >>> # Use a different scheduler + >>> from diffusers import LMSDiscreteScheduler + + >>> scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.scheduler = scheduler + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + from_flax = kwargs.pop("from_flax", False) + torch_dtype = kwargs.pop("torch_dtype", None) + custom_pipeline = kwargs.pop("custom_pipeline", None) + custom_revision = kwargs.pop("custom_revision", None) + provider = kwargs.pop("provider", None) + sess_options = kwargs.pop("sess_options", None) + device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None) + use_onnx = kwargs.pop("use_onnx", None) + load_connected_pipeline = kwargs.pop("load_connected_pipeline", False) + + if low_cpu_mem_usage and not is_accelerate_available(): + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if device_map is not None and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `device_map=None`." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." + ) + + if low_cpu_mem_usage is False and device_map is not None: + raise ValueError( + f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and" + " dispatching. Please make sure to set `low_cpu_mem_usage=True`." + ) + + # 1. Download the checkpoints and configs + # use snapshot download here to get it working from from_pretrained + if not os.path.isdir(pretrained_model_name_or_path): + if pretrained_model_name_or_path.count("/") > 1: + raise ValueError( + f'The provided pretrained_model_name_or_path "{pretrained_model_name_or_path}"' + " is neither a valid local path nor a valid repo id. Please check the parameter." 
+ ) + cached_folder = cls.download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + from_flax=from_flax, + use_safetensors=use_safetensors, + use_onnx=use_onnx, + custom_pipeline=custom_pipeline, + custom_revision=custom_revision, + variant=variant, + load_connected_pipeline=load_connected_pipeline, + **kwargs, + ) + else: + cached_folder = pretrained_model_name_or_path + + config_dict = cls.load_config(cached_folder) + + # pop out "_ignore_files" as it is only needed for download + config_dict.pop("_ignore_files", None) + + # 2. Define which model components should load variants + # We retrieve the information by matching whether variant + # model checkpoints exist in the subfolders + model_variants = {} + if variant is not None: + for folder in os.listdir(cached_folder): + folder_path = os.path.join(cached_folder, folder) + is_folder = os.path.isdir(folder_path) and folder in config_dict + variant_exists = is_folder and any( + p.split(".")[1].startswith(variant) for p in os.listdir(folder_path) + ) + if variant_exists: + model_variants[folder] = variant + + # 3. Load the pipeline class, if using custom module then load it from the hub + # if we load from explicit class, let's use it + custom_class_name = None + if os.path.isfile(os.path.join(cached_folder, f"{custom_pipeline}.py")): + custom_pipeline = os.path.join(cached_folder, f"{custom_pipeline}.py") + elif isinstance(config_dict["_class_name"], (list, tuple)) and os.path.isfile( + os.path.join(cached_folder, f"{config_dict['_class_name'][0]}.py") + ): + custom_pipeline = os.path.join(cached_folder, f"{config_dict['_class_name'][0]}.py") + custom_class_name = config_dict["_class_name"][1] + + pipeline_class = _get_pipeline_class( + cls, + config_dict, + load_connected_pipeline=load_connected_pipeline, + custom_pipeline=custom_pipeline, + class_name=custom_class_name, + cache_dir=cache_dir, + revision=custom_revision, + ) + + # DEPRECATED: To be removed in 1.0.0 + if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse( + version.parse(config_dict["_diffusers_version"]).base_version + ) <= version.parse("0.5.1"): + from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy + + pipeline_class = StableDiffusionInpaintPipelineLegacy + + deprecation_message = ( + "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the" + f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For" + " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting" + " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your" + f" checkpoint {pretrained_model_name_or_path} to the format of" + " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain" + " the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0." + ) + deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False) + + # 4. 
Define expected modules given pipeline signature + # and define non-None initialized modules (=`init_kwargs`) + + # some modules can be passed directly to the init + # in this case they are already instantiated in `kwargs` + # extract them here + expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) + passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} + passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} + + init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) + + # define init kwargs and make sure that optional component modules are filtered out + init_kwargs = { + k: init_dict.pop(k) + for k in optional_kwargs + if k in init_dict and k not in pipeline_class._optional_components + } + init_kwargs = {**init_kwargs, **passed_pipe_kwargs} + + # remove `null` components + def load_module(name, value): + if value[0] is None: + return False + if name in passed_class_obj and passed_class_obj[name] is None: + return False + return True + + init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)} + + # Special case: safety_checker must be loaded separately when using `from_flax` + if from_flax and "safety_checker" in init_dict and "safety_checker" not in passed_class_obj: + raise NotImplementedError( + "The safety checker cannot be automatically loaded when loading weights `from_flax`." + " Please, pass `safety_checker=None` to `from_pretrained`, and load the safety checker" + " separately if you need it." + ) + + # 5. Throw nice warnings / errors for fast accelerate loading + if len(unused_kwargs) > 0: + logger.warning( + f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored." + ) + + # import it here to avoid circular import + from diffusers import pipelines + + # 6. Load each module in the pipeline + for name, (library_name, class_name) in logging.tqdm(init_dict.items(), desc="Loading pipeline components..."): + # 6.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names + class_name = class_name[4:] if class_name.startswith("Flax") else class_name + + # 6.2 Define all importable classes + is_pipeline_module = hasattr(pipelines, library_name) + importable_classes = ALL_IMPORTABLE_CLASSES + loaded_sub_model = None + + # 6.3 Use passed sub model or load class_name from library_name + if name in passed_class_obj: + # if the model is in a pipeline module, then we load it from the pipeline + # check that passed_class_obj has correct parent class + maybe_raise_or_warn( + library_name, library, class_name, importable_classes, passed_class_obj, name, is_pipeline_module + ) + + loaded_sub_model = passed_class_obj[name] + else: + # load sub model + loaded_sub_model = load_sub_model( + library_name=library_name, + class_name=class_name, + importable_classes=importable_classes, + pipelines=pipelines, + is_pipeline_module=is_pipeline_module, + pipeline_class=pipeline_class, + torch_dtype=torch_dtype, + provider=provider, + sess_options=sess_options, + device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + model_variants=model_variants, + name=name, + from_flax=from_flax, + variant=variant, + low_cpu_mem_usage=low_cpu_mem_usage, + cached_folder=cached_folder, + ) + logger.info( + f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}." 
+ ) + + init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) + + if pipeline_class._load_connected_pipes and os.path.isfile(os.path.join(cached_folder, "README.md")): + modelcard = ModelCard.load(os.path.join(cached_folder, "README.md")) + connected_pipes = {prefix: getattr(modelcard.data, prefix, [None])[0] for prefix in CONNECTED_PIPES_KEYS} + load_kwargs = { + "cache_dir": cache_dir, + "resume_download": resume_download, + "force_download": force_download, + "proxies": proxies, + "local_files_only": local_files_only, + "token": token, + "revision": revision, + "torch_dtype": torch_dtype, + "custom_pipeline": custom_pipeline, + "custom_revision": custom_revision, + "provider": provider, + "sess_options": sess_options, + "device_map": device_map, + "max_memory": max_memory, + "offload_folder": offload_folder, + "offload_state_dict": offload_state_dict, + "low_cpu_mem_usage": low_cpu_mem_usage, + "variant": variant, + "use_safetensors": use_safetensors, + } + + def get_connected_passed_kwargs(prefix): + connected_passed_class_obj = { + k.replace(f"{prefix}_", ""): w for k, w in passed_class_obj.items() if k.split("_")[0] == prefix + } + connected_passed_pipe_kwargs = { + k.replace(f"{prefix}_", ""): w for k, w in passed_pipe_kwargs.items() if k.split("_")[0] == prefix + } + + connected_passed_kwargs = {**connected_passed_class_obj, **connected_passed_pipe_kwargs} + return connected_passed_kwargs + + connected_pipes = { + prefix: DiffusionPipeline.from_pretrained( + repo_id, **load_kwargs.copy(), **get_connected_passed_kwargs(prefix) + ) + for prefix, repo_id in connected_pipes.items() + if repo_id is not None + } + + for prefix, connected_pipe in connected_pipes.items(): + # add connected pipes to `init_kwargs` with _, e.g. "prior_text_encoder" + init_kwargs.update( + {"_".join([prefix, name]): component for name, component in connected_pipe.components.items()} + ) + + # 7. Potentially add passed objects if expected + missing_modules = set(expected_modules) - set(init_kwargs.keys()) + passed_modules = list(passed_class_obj.keys()) + optional_modules = pipeline_class._optional_components + if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): + for module in missing_modules: + init_kwargs[module] = passed_class_obj.get(module, None) + elif len(missing_modules) > 0: + passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs + raise ValueError( + f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." + ) + + # 8. Instantiate the pipeline + model = pipeline_class(**init_kwargs) + + # 9. Save where the model was instantiated from + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + return model + + @property + def name_or_path(self) -> str: + return getattr(self.config, "_name_or_path", None) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from + Accelerate's module hooks. 
+ """ + for name, model in self.components.items(): + if not isinstance(model, torch.nn.Module) or name in self._exclude_from_cpu_offload: + continue + + if not hasattr(model, "_hf_hook"): + return self.device + for module in model.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + + Arguments: + gpu_id (`int`, *optional*): + The ID of the accelerator that shall be used in inference. If not specified, it will default to 0. + device (`torch.Device` or `str`, *optional*, defaults to "cuda"): + The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will + default to "cuda". + """ + if self.model_cpu_offload_seq is None: + raise ValueError( + "Model CPU offload cannot be enabled because no `model_cpu_offload_seq` class attribute is set." + ) + + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + torch_device = torch.device(device) + device_index = torch_device.index + + if gpu_id is not None and device_index is not None: + raise ValueError( + f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}" + f"Cannot pass both. 
Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}" + ) + + # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0 + self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0) + + device_type = torch_device.type + device = torch.device(f"{device_type}:{self._offload_gpu_id}") + self._offload_device = device + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + device_mod = getattr(torch, self.device.type, None) + if hasattr(device_mod, "empty_cache") and device_mod.is_available(): + device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + all_model_components = {k: v for k, v in self.components.items() if isinstance(v, torch.nn.Module)} + + self._all_hooks = [] + hook = None + for model_str in self.model_cpu_offload_seq.split("->"): + model = all_model_components.pop(model_str, None) + if not isinstance(model, torch.nn.Module): + continue + + _, hook = cpu_offload_with_hook(model, device, prev_module_hook=hook) + self._all_hooks.append(hook) + + # CPU offload models that are not in the seq chain unless they are explicitly excluded + # these models will stay on CPU until maybe_free_model_hooks is called + # some models cannot be in the seq chain because they are iteratively called, such as controlnet + for name, model in all_model_components.items(): + if not isinstance(model, torch.nn.Module): + continue + + if name in self._exclude_from_cpu_offload: + model.to(device) + else: + _, hook = cpu_offload_with_hook(model, device) + self._all_hooks.append(hook) + + def maybe_free_model_hooks(self): + r""" + Function that offloads all components, removes all model hooks that were added when using + `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function + is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it + functions correctly when applying enable_model_cpu_offload. + """ + if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0: + # `enable_model_cpu_offload` has not be called, so silently do nothing + return + + for hook in self._all_hooks: + # offload model and remove hook from model + hook.offload() + hook.remove() + + # make sure the model is in the same state as before calling it + self.enable_model_cpu_offload(device=getattr(self, "_offload_device", "cuda")) + + def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"): + r""" + Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state + dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU + and then moved to `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` + method called. Offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + + Arguments: + gpu_id (`int`, *optional*): + The ID of the accelerator that shall be used in inference. If not specified, it will default to 0. + device (`torch.Device` or `str`, *optional*, defaults to "cuda"): + The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will + default to "cuda". 
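+
+ Example (an illustrative sketch added to this vendored copy; the repo id is a placeholder and
+ `accelerate` must be installed):
+
+ ```py
+ >>> import torch
+ >>> from diffusers import DiffusionPipeline
+
+ >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16)
+ >>> pipeline.enable_sequential_cpu_offload()
+ >>> # sub-modules are moved to the accelerator only while their forward pass runs
+ ```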
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + torch_device = torch.device(device) + device_index = torch_device.index + + if gpu_id is not None and device_index is not None: + raise ValueError( + f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}" + f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}" + ) + + # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0 + self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0) + + device_type = torch_device.type + device = torch.device(f"{device_type}:{self._offload_gpu_id}") + self._offload_device = device + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + device_mod = getattr(torch, self.device.type, None) + if hasattr(device_mod, "empty_cache") and device_mod.is_available(): + device_mod.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for name, model in self.components.items(): + if not isinstance(model, torch.nn.Module): + continue + + if name in self._exclude_from_cpu_offload: + model.to(device) + else: + # make sure to offload buffers if not all high level weights + # are of type nn.Module + offload_buffers = len(model._parameters) > 0 + cpu_offload(model, device, offload_buffers=offload_buffers) + + @classmethod + @validate_hf_hub_args + def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: + r""" + Download and cache a PyTorch diffusion pipeline from pretrained pipeline weights. + + Parameters: + pretrained_model_name (`str` or `os.PathLike`, *optional*): + A string, the *repository id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + custom_pipeline (`str`, *optional*): + Can be either: + + - A string, the *repository id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained + pipeline hosted on the Hub. The repository must contain a file called `pipeline.py` that defines + the custom pipeline. + + - A string, the *file name* of a community pipeline hosted on GitHub under + [Community](https://github.com/huggingface/diffusers/tree/main/examples/community). Valid file + names must match the file name and not the pipeline script (`clip_guided_stable_diffusion` + instead of `clip_guided_stable_diffusion.py`). Community pipelines are always loaded from the + current `main` branch of GitHub. + + - A path to a *directory* (`./my_pipeline_directory/`) containing a custom pipeline. The directory + must contain a file called `pipeline.py` that defines the custom pipeline. + + + + 🧪 This is an experimental feature and may change in the future. + + + + For more information on how to load and create custom pipelines, take a look at [How to contribute a + community pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/contribute_pipeline). + + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. 
+ resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + custom_revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id similar to + `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a + custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + variant (`str`, *optional*): + Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the safetensors weights are downloaded if they're available **and** if the + safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors + weights. If set to `False`, safetensors weights are not loaded. + use_onnx (`bool`, *optional*, defaults to `False`): + If set to `True`, ONNX weights will always be downloaded if present. If set to `False`, ONNX weights + will never be downloaded. By default `use_onnx` defaults to the `_is_onnx` class attribute which is + `False` for non-ONNX pipelines and `True` for ONNX pipelines. ONNX weights include both files ending + with `.onnx` and `.pb`. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom pipelines and components defined on the Hub in their own files. This + option should only be set to `True` for repositories you trust and in which you have read the code, as + it will execute code present on the Hub on your local machine. + + Returns: + `os.PathLike`: + A path to the downloaded pipeline. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. 
+ + + + """ + cache_dir = kwargs.pop("cache_dir", None) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + from_flax = kwargs.pop("from_flax", False) + custom_pipeline = kwargs.pop("custom_pipeline", None) + custom_revision = kwargs.pop("custom_revision", None) + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None) + use_onnx = kwargs.pop("use_onnx", None) + load_connected_pipeline = kwargs.pop("load_connected_pipeline", False) + trust_remote_code = kwargs.pop("trust_remote_code", False) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + allow_patterns = None + ignore_patterns = None + + model_info_call_error: Optional[Exception] = None + if not local_files_only: + try: + info = model_info(pretrained_model_name, token=token, revision=revision) + except (HTTPError, OfflineModeIsEnabled, requests.ConnectionError) as e: + logger.warning(f"Couldn't connect to the Hub: {e}.\nWill try to load from local cache.") + local_files_only = True + model_info_call_error = e # save error to reraise it if model is not cached locally + + if not local_files_only: + config_file = hf_hub_download( + pretrained_model_name, + cls.config_name, + cache_dir=cache_dir, + revision=revision, + proxies=proxies, + force_download=force_download, + resume_download=resume_download, + token=token, + ) + + config_dict = cls._dict_from_json_file(config_file) + ignore_filenames = config_dict.pop("_ignore_files", []) + + # retrieve all folder_names that contain relevant files + folder_names = [k for k, v in config_dict.items() if isinstance(v, list) and k != "_class_name"] + + filenames = {sibling.rfilename for sibling in info.siblings} + model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant) + + diffusers_module = importlib.import_module(__name__.split(".")[0]) + pipelines = getattr(diffusers_module, "pipelines") + + # optionally create a custom component <> custom file mapping + custom_components = {} + for component in folder_names: + module_candidate = config_dict[component][0] + + if module_candidate is None or not isinstance(module_candidate, str): + continue + + # We compute candidate file path on the Hub. Do not use `os.path.join`. + candidate_file = f"{component}/{module_candidate}.py" + + if candidate_file in filenames: + custom_components[component] = module_candidate + elif module_candidate not in LOADABLE_CLASSES and not hasattr(pipelines, module_candidate): + raise ValueError( + f"{candidate_file} as defined in `model_index.json` does not exist in {pretrained_model_name} and is not a module in 'diffusers/pipelines'." + ) + + if len(variant_filenames) == 0 and variant is not None: + deprecation_message = ( + f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available." + f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`" + "if such variant modeling files are not available. Doing so will lead to an error in v0.24.0 as defaulting to non-variant" + "modeling files is deprecated." 
+ ) + deprecate("no variant default", "0.24.0", deprecation_message, standard_warn=False) + + # remove ignored filenames + model_filenames = set(model_filenames) - set(ignore_filenames) + variant_filenames = set(variant_filenames) - set(ignore_filenames) + + # if the whole pipeline is cached we don't have to ping the Hub + if revision in DEPRECATED_REVISION_ARGS and version.parse( + version.parse(__version__).base_version + ) >= version.parse("0.22.0"): + warn_deprecated_model_variant(pretrained_model_name, token, variant, revision, model_filenames) + + model_folder_names = {os.path.split(f)[0] for f in model_filenames if os.path.split(f)[0] in folder_names} + + custom_class_name = None + if custom_pipeline is None and isinstance(config_dict["_class_name"], (list, tuple)): + custom_pipeline = config_dict["_class_name"][0] + custom_class_name = config_dict["_class_name"][1] + + # all filenames compatible with variant will be added + allow_patterns = list(model_filenames) + + # allow all patterns from non-model folders + # this enables downloading schedulers, tokenizers, ... + allow_patterns += [f"{k}/*" for k in folder_names if k not in model_folder_names] + # add custom component files + allow_patterns += [f"{k}/{f}.py" for k, f in custom_components.items()] + # add custom pipeline file + allow_patterns += [f"{custom_pipeline}.py"] if f"{custom_pipeline}.py" in filenames else [] + # also allow downloading config.json files with the model + allow_patterns += [os.path.join(k, "config.json") for k in model_folder_names] + + allow_patterns += [ + SCHEDULER_CONFIG_NAME, + CONFIG_NAME, + cls.config_name, + CUSTOM_PIPELINE_FILE_NAME, + ] + + load_pipe_from_hub = custom_pipeline is not None and f"{custom_pipeline}.py" in filenames + load_components_from_hub = len(custom_components) > 0 + + if load_pipe_from_hub and not trust_remote_code: + raise ValueError( + f"The repository for {pretrained_model_name} contains custom code in {custom_pipeline}.py which must be executed to correctly " + f"load the model. You can inspect the repository content at https://hf.co/{pretrained_model_name}/blob/main/{custom_pipeline}.py.\n" + f"Please pass the argument `trust_remote_code=True` to allow custom code to be run." + ) + + if load_components_from_hub and not trust_remote_code: + raise ValueError( + f"The repository for {pretrained_model_name} contains custom code in {'.py, '.join([os.path.join(k, v) for k,v in custom_components.items()])} which must be executed to correctly " + f"load the model. You can inspect the repository content at {', '.join([f'https://hf.co/{pretrained_model_name}/{k}/{v}.py' for k,v in custom_components.items()])}.\n" + f"Please pass the argument `trust_remote_code=True` to allow custom code to be run." 
+ ) + + # retrieve passed components that should not be downloaded + pipeline_class = _get_pipeline_class( + cls, + config_dict, + load_connected_pipeline=load_connected_pipeline, + custom_pipeline=custom_pipeline, + repo_id=pretrained_model_name if load_pipe_from_hub else None, + hub_revision=revision, + class_name=custom_class_name, + cache_dir=cache_dir, + revision=custom_revision, + ) + expected_components, _ = cls._get_signature_keys(pipeline_class) + passed_components = [k for k in expected_components if k in kwargs] + + if ( + use_safetensors + and not allow_pickle + and not is_safetensors_compatible( + model_filenames, variant=variant, passed_components=passed_components + ) + ): + raise EnvironmentError( + f"Could not find the necessary `safetensors` weights in {model_filenames} (variant={variant})" + ) + if from_flax: + ignore_patterns = ["*.bin", "*.safetensors", "*.onnx", "*.pb"] + elif use_safetensors and is_safetensors_compatible( + model_filenames, variant=variant, passed_components=passed_components + ): + ignore_patterns = ["*.bin", "*.msgpack"] + + use_onnx = use_onnx if use_onnx is not None else pipeline_class._is_onnx + if not use_onnx: + ignore_patterns += ["*.onnx", "*.pb"] + + safetensors_variant_filenames = {f for f in variant_filenames if f.endswith(".safetensors")} + safetensors_model_filenames = {f for f in model_filenames if f.endswith(".safetensors")} + if ( + len(safetensors_variant_filenames) > 0 + and safetensors_model_filenames != safetensors_variant_filenames + ): + logger.warning( + f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(safetensors_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(safetensors_model_filenames - safetensors_variant_filenames)}\nIf this behavior is not expected, please check your folder structure." + ) + else: + ignore_patterns = ["*.safetensors", "*.msgpack"] + + use_onnx = use_onnx if use_onnx is not None else pipeline_class._is_onnx + if not use_onnx: + ignore_patterns += ["*.onnx", "*.pb"] + + bin_variant_filenames = {f for f in variant_filenames if f.endswith(".bin")} + bin_model_filenames = {f for f in model_filenames if f.endswith(".bin")} + if len(bin_variant_filenames) > 0 and bin_model_filenames != bin_variant_filenames: + logger.warning( + f"\nA mixture of {variant} and non-{variant} filenames will be loaded.\nLoaded {variant} filenames:\n[{', '.join(bin_variant_filenames)}]\nLoaded non-{variant} filenames:\n[{', '.join(bin_model_filenames - bin_variant_filenames)}\nIf this behavior is not expected, please check your folder structure." 
+ ) + + # Don't download any objects that are passed + allow_patterns = [ + p for p in allow_patterns if not (len(p.split("/")) == 2 and p.split("/")[0] in passed_components) + ] + + if pipeline_class._load_connected_pipes: + allow_patterns.append("README.md") + + # Don't download index files of forbidden patterns either + ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns] + + re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns] + re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns] + + expected_files = [f for f in filenames if not any(p.match(f) for p in re_ignore_pattern)] + expected_files = [f for f in expected_files if any(p.match(f) for p in re_allow_pattern)] + + snapshot_folder = Path(config_file).parent + pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files) + + if pipeline_is_cached and not force_download: + # if the pipeline is cached, we can directly return it + # else call snapshot_download + return snapshot_folder + + user_agent = {"pipeline_class": cls.__name__} + if custom_pipeline is not None and not custom_pipeline.endswith(".py"): + user_agent["custom_pipeline"] = custom_pipeline + + # download all allow_patterns - ignore_patterns + try: + cached_folder = snapshot_download( + pretrained_model_name, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + user_agent=user_agent, + ) + + # retrieve pipeline class from local file + cls_name = cls.load_config(os.path.join(cached_folder, "model_index.json")).get("_class_name", None) + cls_name = cls_name[4:] if isinstance(cls_name, str) and cls_name.startswith("Flax") else cls_name + + diffusers_module = importlib.import_module(__name__.split(".")[0]) + pipeline_class = getattr(diffusers_module, cls_name, None) if isinstance(cls_name, str) else None + + if pipeline_class is not None and pipeline_class._load_connected_pipes: + modelcard = ModelCard.load(os.path.join(cached_folder, "README.md")) + connected_pipes = sum([getattr(modelcard.data, k, []) for k in CONNECTED_PIPES_KEYS], []) + for connected_pipe_repo_id in connected_pipes: + download_kwargs = { + "cache_dir": cache_dir, + "resume_download": resume_download, + "force_download": force_download, + "proxies": proxies, + "local_files_only": local_files_only, + "token": token, + "variant": variant, + "use_safetensors": use_safetensors, + } + DiffusionPipeline.download(connected_pipe_repo_id, **download_kwargs) + + return cached_folder + + except FileNotFoundError: + # Means we tried to load pipeline with `local_files_only=True` but the files have not been found in local cache. + # This can happen in two cases: + # 1. If the user passed `local_files_only=True` => we raise the error directly + # 2. If we forced `local_files_only=True` when `model_info` failed => we raise the initial error + if model_info_call_error is None: + # 1. user passed `local_files_only=True` + raise + else: + # 2. we forced `local_files_only=True` when `model_info` failed + raise EnvironmentError( + f"Cannot load model {pretrained_model_name}: model is not cached locally and an error occurred" + " while trying to fetch metadata from the Hub. Please check out the root cause in the stacktrace" + " above." 
+ ) from model_info_call_error + + @classmethod + def _get_signature_keys(cls, obj): + parameters = inspect.signature(obj.__init__).parameters + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) + expected_modules = set(required_parameters.keys()) - {"self"} + + optional_names = list(optional_parameters) + for name in optional_names: + if name in cls._optional_components: + expected_modules.add(name) + optional_parameters.remove(name) + + return expected_modules, optional_parameters + + @property + def components(self) -> Dict[str, Any]: + r""" + The `self.components` property can be useful to run different pipelines with the same weights and + configurations without reallocating additional memory. + + Returns (`dict`): + A dictionary containing all the modules needed to initialize the pipeline. + + Examples: + + ```py + >>> from diffusers import ( + ... StableDiffusionPipeline, + ... StableDiffusionImg2ImgPipeline, + ... StableDiffusionInpaintPipeline, + ... ) + + >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components) + >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components) + ``` + """ + expected_modules, optional_parameters = self._get_signature_keys(self) + components = { + k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters + } + + if set(components.keys()) != expected_modules: + raise ValueError( + f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected" + f" {expected_modules} to be defined, but {components.keys()} are defined." + ) + + return components + + @staticmethod + def numpy_to_pil(images): + """ + Convert a NumPy image or a batch of images to a PIL image. + """ + return numpy_to_pil(images) + + def progress_bar(self, iterable=None, total=None): + if not hasattr(self, "_progress_bar_config"): + self._progress_bar_config = {} + elif not isinstance(self._progress_bar_config, dict): + raise ValueError( + f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." + ) + + if iterable is not None: + return tqdm(iterable, **self._progress_bar_config) + elif total is not None: + return tqdm(total=total, **self._progress_bar_config) + else: + raise ValueError("Either `total` or `iterable` has to be defined.") + + def set_progress_bar_config(self, **kwargs): + self._progress_bar_config = kwargs + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + r""" + Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). When this + option is enabled, you should observe lower GPU memory usage and a potential speed up during inference. Speed + up during training is not guaranteed. + + + + ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes + precedent. + + + + Parameters: + attention_op (`Callable`, *optional*): + Override the default `None` operator for use as `op` argument to the + [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) + function of xFormers. 
+ + Examples: + + ```py + >>> import torch + >>> from diffusers import DiffusionPipeline + >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp + + >>> pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + >>> pipe.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp) + >>> # Workaround for not accepting attention shape using VAE for Flash Attention + >>> pipe.vae.enable_xformers_memory_efficient_attention(attention_op=None) + ``` + """ + self.set_use_memory_efficient_attention_xformers(True, attention_op) + + def disable_xformers_memory_efficient_attention(self): + r""" + Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + """ + self.set_use_memory_efficient_attention_xformers(False) + + def set_use_memory_efficient_attention_xformers( + self, valid: bool, attention_op: Optional[Callable] = None + ) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid, attention_op) + + for child in module.children(): + fn_recursive_set_mem_eff(child) + + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module)] + + for module in modules: + fn_recursive_set_mem_eff(module) + + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + r""" + Enable sliced attention computation. When this option is enabled, the attention module splits the input tensor + in slices to compute attention in several steps. For more than one attention head, the computation is performed + sequentially over each head. This is useful to save some memory in exchange for a small speed decrease. + + + + ⚠️ Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) from PyTorch + 2.0 or xFormers. These attention computations are already very memory efficient so you won't need to enable + this function. If you enable attention slicing with SDPA or xFormers, it can lead to serious slow downs! + + + + Args: + slice_size (`str` or `int`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If + `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + + Examples: + + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", + ... torch_dtype=torch.float16, + ... use_safetensors=True, + ... ) + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> pipe.enable_attention_slicing() + >>> image = pipe(prompt).images[0] + ``` + """ + self.set_attention_slice(slice_size) + + def disable_attention_slicing(self): + r""" + Disable sliced attention computation. If `enable_attention_slicing` was previously called, attention is + computed in one step. 
+ """ + # set slice_size = `None` to disable `attention slicing` + self.enable_attention_slicing(None) + + def set_attention_slice(self, slice_size: Optional[int]): + module_names, _ = self._get_signature_keys(self) + modules = [getattr(self, n, None) for n in module_names] + modules = [m for m in modules if isinstance(m, torch.nn.Module) and hasattr(m, "set_attention_slice")] + + for module in modules: + module.set_attention_slice(slice_size) + + +class StableDiffusionMixin: + r""" + Helper for DiffusionPipeline with vae and unet.(mainly for LDM such as stable diffusion) + """ + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_freeu(self, s1: float, s2: float, b1: float, b2: float): + r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497. + + The suffixes after the scaling factors represent the stages where they are being applied. + + Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values + that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL. + + Args: + s1 (`float`): + Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + s2 (`float`): + Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to + mitigate "oversmoothing effect" in the enhanced denoising process. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if not hasattr(self, "unet"): + raise ValueError("The pipeline must have `unet` for using FreeU.") + self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2) + + def disable_freeu(self): + """Disables the FreeU mechanism if enabled.""" + self.unet.disable_freeu() + + def fuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, + key, value) are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. 
+ """ + self.fusing_unet = False + self.fusing_vae = False + + if unet: + self.fusing_unet = True + self.unet.fuse_qkv_projections() + self.unet.set_attn_processor(FusedAttnProcessor2_0()) + + if vae: + if not isinstance(self.vae, AutoencoderKL): + raise ValueError("`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`.") + + self.fusing_vae = True + self.vae.fuse_qkv_projections() + self.vae.set_attn_processor(FusedAttnProcessor2_0()) + + def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True): + """Disable QKV projection fusion if enabled. + + + + This API is 🧪 experimental. + + + + Args: + unet (`bool`, defaults to `True`): To apply fusion on the UNet. + vae (`bool`, defaults to `True`): To apply fusion on the VAE. + + """ + if unet: + if not self.fusing_unet: + logger.warning("The UNet was not initially fused for QKV projections. Doing nothing.") + else: + self.unet.unfuse_qkv_projections() + self.fusing_unet = False + + if vae: + if not self.fusing_vae: + logger.warning("The VAE was not initially fused for QKV projections. Doing nothing.") + else: + self.vae.unfuse_qkv_projections() + self.fusing_vae = False diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py new file mode 100644 index 000000000..0bfa28fcd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py @@ -0,0 +1,48 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_pixart_alpha"] = ["PixArtAlphaPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_pixart_alpha import PixArtAlphaPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py new file mode 100644 index 000000000..e7213a38b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py @@ -0,0 +1,979 @@ +# Copyright 2024 PixArt-Alpha Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import html +import inspect +import re +import urllib.parse as ul +from typing import Callable, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from transformers import T5EncoderModel, T5Tokenizer + +from ...image_processor import VaeImageProcessor +from ...models import AutoencoderKL, Transformer2DModel +from ...schedulers import DPMSolverMultistepScheduler +from ...utils import ( + BACKENDS_MAPPING, + deprecate, + is_bs4_available, + is_ftfy_available, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +if is_bs4_available(): + from bs4 import BeautifulSoup + +if is_ftfy_available(): + import ftfy + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import PixArtAlphaPipeline + + >>> # You can replace the checkpoint id with "PixArt-alpha/PixArt-XL-2-512x512" too. + >>> pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16) + >>> # Enable memory optimizations. + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "A small cactus with a happy face in the Sahara desert." + >>> image = pipe(prompt).images[0] + ``` +""" + +ASPECT_RATIO_1024_BIN = { + "0.25": [512.0, 2048.0], + "0.28": [512.0, 1856.0], + "0.32": [576.0, 1792.0], + "0.33": [576.0, 1728.0], + "0.35": [576.0, 1664.0], + "0.4": [640.0, 1600.0], + "0.42": [640.0, 1536.0], + "0.48": [704.0, 1472.0], + "0.5": [704.0, 1408.0], + "0.52": [704.0, 1344.0], + "0.57": [768.0, 1344.0], + "0.6": [768.0, 1280.0], + "0.68": [832.0, 1216.0], + "0.72": [832.0, 1152.0], + "0.78": [896.0, 1152.0], + "0.82": [896.0, 1088.0], + "0.88": [960.0, 1088.0], + "0.94": [960.0, 1024.0], + "1.0": [1024.0, 1024.0], + "1.07": [1024.0, 960.0], + "1.13": [1088.0, 960.0], + "1.21": [1088.0, 896.0], + "1.29": [1152.0, 896.0], + "1.38": [1152.0, 832.0], + "1.46": [1216.0, 832.0], + "1.67": [1280.0, 768.0], + "1.75": [1344.0, 768.0], + "2.0": [1408.0, 704.0], + "2.09": [1472.0, 704.0], + "2.4": [1536.0, 640.0], + "2.5": [1600.0, 640.0], + "3.0": [1728.0, 576.0], + "4.0": [2048.0, 512.0], +} + +ASPECT_RATIO_512_BIN = { + "0.25": [256.0, 1024.0], + "0.28": [256.0, 928.0], + "0.32": [288.0, 896.0], + "0.33": [288.0, 864.0], + "0.35": [288.0, 832.0], + "0.4": [320.0, 800.0], + "0.42": [320.0, 768.0], + "0.48": [352.0, 736.0], + "0.5": [352.0, 704.0], + "0.52": [352.0, 672.0], + "0.57": [384.0, 672.0], + "0.6": [384.0, 640.0], + "0.68": [416.0, 608.0], + "0.72": [416.0, 576.0], + "0.78": [448.0, 576.0], + "0.82": [448.0, 544.0], + "0.88": [480.0, 544.0], + "0.94": [480.0, 512.0], + "1.0": [512.0, 512.0], + "1.07": [512.0, 480.0], + "1.13": [544.0, 480.0], + "1.21": [544.0, 448.0], + "1.29": [576.0, 448.0], + "1.38": [576.0, 416.0], + "1.46": [608.0, 416.0], + "1.67": [640.0, 384.0], + "1.75": [672.0, 384.0], + "2.0": [704.0, 352.0], + "2.09": [736.0, 352.0], + "2.4": [768.0, 320.0], + "2.5": [800.0, 320.0], + "3.0": [864.0, 288.0], + "4.0": [1024.0, 256.0], +} + 
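The bin tables above (and the 256-pixel table that follows) are consulted by `classify_height_width_bin`, defined later in this file, which snaps a requested height/width to the nearest supported latent resolution by aspect ratio. A minimal standalone sketch of that lookup, using an abridged copy of the 512 table; the names and the subset of entries below are illustrative only, not part of the patch:

```python
# Sketch of the nearest-aspect-ratio lookup behind classify_height_width_bin.
# ABRIDGED_512_BIN is a hand-picked subset of ASPECT_RATIO_512_BIN for illustration.
ABRIDGED_512_BIN = {"0.5": [352.0, 704.0], "1.0": [512.0, 512.0], "2.0": [704.0, 352.0]}


def closest_bin(height: int, width: int, ratios: dict) -> tuple:
    ar = float(height) / float(width)                            # requested aspect ratio
    key = min(ratios.keys(), key=lambda r: abs(float(r) - ar))   # nearest binned ratio
    h, w = ratios[key]
    return int(h), int(w)


print(closest_bin(500, 480, ABRIDGED_512_BIN))  # (512, 512): ratio ~1.04 snaps to the "1.0" bin
```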
+ASPECT_RATIO_256_BIN = { + "0.25": [128.0, 512.0], + "0.28": [128.0, 464.0], + "0.32": [144.0, 448.0], + "0.33": [144.0, 432.0], + "0.35": [144.0, 416.0], + "0.4": [160.0, 400.0], + "0.42": [160.0, 384.0], + "0.48": [176.0, 368.0], + "0.5": [176.0, 352.0], + "0.52": [176.0, 336.0], + "0.57": [192.0, 336.0], + "0.6": [192.0, 320.0], + "0.68": [208.0, 304.0], + "0.72": [208.0, 288.0], + "0.78": [224.0, 288.0], + "0.82": [224.0, 272.0], + "0.88": [240.0, 272.0], + "0.94": [240.0, 256.0], + "1.0": [256.0, 256.0], + "1.07": [256.0, 240.0], + "1.13": [272.0, 240.0], + "1.21": [272.0, 224.0], + "1.29": [288.0, 224.0], + "1.38": [288.0, 208.0], + "1.46": [304.0, 208.0], + "1.67": [320.0, 192.0], + "1.75": [336.0, 192.0], + "2.0": [352.0, 176.0], + "2.09": [368.0, 176.0], + "2.4": [384.0, 160.0], + "2.5": [400.0, 160.0], + "3.0": [432.0, 144.0], + "4.0": [512.0, 128.0], +} + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class PixArtAlphaPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using PixArt-Alpha. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. 
PixArt-Alpha uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`Transformer2DModel`]): + A text conditioned `Transformer2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. + """ + + bad_punct_regex = re.compile( + r"[" + + "#®•©™&@·º½¾¿¡§~" + + r"\)" + + r"\(" + + r"\]" + + r"\[" + + r"\}" + + r"\{" + + r"\|" + + "\\" + + r"\/" + + r"\*" + + r"]{1,}" + ) # noqa + + _optional_components = ["tokenizer", "text_encoder"] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKL, + transformer: Transformer2DModel, + scheduler: DPMSolverMultistepScheduler, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py + def mask_text_embeddings(self, emb, mask): + if emb.shape[0] == 1: + keep_index = mask.sum().item() + return emb[:, :, :keep_index, :], keep_index + else: + masked_feature = emb * mask[:, None, :, None] + return masked_feature, emb.shape[2] + + # Adapted from diffusers.pipelines.deepfloyd_if.pipeline_if.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + do_classifier_free_guidance: bool = True, + negative_prompt: str = "", + num_images_per_prompt: int = 1, + device: Optional[torch.device] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_attention_mask: Optional[torch.FloatTensor] = None, + negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + clean_caption: bool = False, + max_sequence_length: int = 120, + **kwargs, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` + instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). For + PixArt-Alpha, this should be "". + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + whether to use classifier free guidance or not + num_images_per_prompt (`int`, *optional*, defaults to 1): + number of images that should be generated per prompt + device: (`torch.device`, *optional*): + torch device to place the resulting embeddings on + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. 
For PixArt-Alpha, it's should be the embeddings of the "" + string. + clean_caption (`bool`, defaults to `False`): + If `True`, the function will preprocess and clean the provided caption before encoding. + max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt. + """ + + if "mask_feature" in kwargs: + deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version." + deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False) + + if device is None: + device = self._execution_device + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # See Section 3.1. of the paper. + max_length = max_sequence_length + + if prompt_embeds is None: + prompt = self._text_preprocessing(prompt, clean_caption=clean_caption) + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {max_length} tokens: {removed_text}" + ) + + prompt_attention_mask = text_inputs.attention_mask + prompt_attention_mask = prompt_attention_mask.to(device) + + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=prompt_attention_mask) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + dtype = self.text_encoder.dtype + elif self.transformer is not None: + dtype = self.transformer.dtype + else: + dtype = None + + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + prompt_attention_mask = prompt_attention_mask.view(bs_embed, -1) + prompt_attention_mask = prompt_attention_mask.repeat(num_images_per_prompt, 1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens = [negative_prompt] * batch_size + uncond_tokens = self._text_preprocessing(uncond_tokens, clean_caption=clean_caption) + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + negative_prompt_attention_mask = uncond_input.attention_mask + negative_prompt_attention_mask = negative_prompt_attention_mask.to(device) + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), attention_mask=negative_prompt_attention_mask + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings 
for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + negative_prompt_attention_mask = negative_prompt_attention_mask.view(bs_embed, -1) + negative_prompt_attention_mask = negative_prompt_attention_mask.repeat(num_images_per_prompt, 1) + else: + negative_prompt_embeds = None + negative_prompt_attention_mask = None + + return prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_steps, + prompt_embeds=None, + negative_prompt_embeds=None, + prompt_attention_mask=None, + negative_prompt_attention_mask=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+            )
+
+        if prompt_embeds is not None and prompt_attention_mask is None:
+            raise ValueError("Must provide `prompt_attention_mask` when specifying `prompt_embeds`.")
+
+        if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
+            raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+            if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
+                raise ValueError(
+                    "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
+                    f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
+                    f" {negative_prompt_attention_mask.shape}."
+                )
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._text_preprocessing
+    def _text_preprocessing(self, text, clean_caption=False):
+        if clean_caption and not is_bs4_available():
+            logger.warning(BACKENDS_MAPPING["bs4"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if clean_caption and not is_ftfy_available():
+            logger.warning(BACKENDS_MAPPING["ftfy"][-1].format("Setting `clean_caption=True`"))
+            logger.warning("Setting `clean_caption` to False...")
+            clean_caption = False
+
+        if not isinstance(text, (tuple, list)):
+            text = [text]
+
+        def process(text: str):
+            if clean_caption:
+                text = self._clean_caption(text)
+                text = self._clean_caption(text)
+            else:
+                text = text.lower().strip()
+            return text
+
+        return [process(t) for t in text]
+
+    # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
+    def _clean_caption(self, caption):
+        caption = str(caption)
+        caption = ul.unquote_plus(caption)
+        caption = caption.strip().lower()
+        caption = re.sub("<person>", "person", caption)
+        # urls:
+        caption = re.sub(
+            r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        caption = re.sub(
+            r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))",  # noqa
+            "",
+            caption,
+        )  # regex for urls
+        # html:
+        caption = BeautifulSoup(caption, features="html.parser").text
+
+        # @<nickname>
+        caption = re.sub(r"@[\w\d]+\b", "", caption)
+
+        # 31C0—31EF CJK Strokes
+        # 31F0—31FF Katakana Phonetic Extensions
+        # 3200—32FF Enclosed CJK Letters and Months
+        # 3300—33FF CJK Compatibility
+        # 3400—4DBF CJK Unified Ideographs Extension A
+        # 4DC0—4DFF Yijing Hexagram Symbols
+        # 4E00—9FFF CJK Unified Ideographs
+        caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
+        caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
+        caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
+        caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
+        caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
+        caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
+        caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
+        #######################################################
+
+        # все виды тире / all types of dash --> "-"
+        caption = re.sub(
+            r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+",  # noqa
+            "-",
+            caption,
+        )
+
+        # кавычки к одному стандарту
+        caption = re.sub(r"[`´«»“”¨]", '"', caption)
+        caption = re.sub(r"[‘’]", "'", caption)
+
+        # &quot;
+        caption = re.sub(r"&quot;?", "", caption)
+        # &amp
+        caption = re.sub(r"&amp", "", caption)
+
+        # ip adresses:
+        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
+
+        # article ids:
+        caption = re.sub(r"\d:\d\d\s+$", "", caption)
+
+        # \n
+        caption = re.sub(r"\\n", " ", caption)
+
+        # "#123"
+        caption = re.sub(r"#\d{1,3}\b", "", caption)
+        # "#12345.."
+        caption = re.sub(r"#\d{5,}\b", "", caption)
+        # "123456.."
+        caption = re.sub(r"\b\d{6,}\b", "", caption)
+        # filenames:
+        caption = re.sub(r"[\S]+\.(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)", "", caption)
+
+        #
+        caption = re.sub(r"[\"\']{2,}", r'"', caption)  # """AUSVERKAUFT"""
+        caption = re.sub(r"[\.]{2,}", r" ", caption)  # """AUSVERKAUFT"""
+
+        caption = re.sub(self.bad_punct_regex, r" ", caption)  # ***AUSVERKAUFT***, #AUSVERKAUFT
+        caption = re.sub(r"\s+\.\s+", r" ", caption)  # " . "
+
+        # this-is-my-cute-cat / this_is_my_cute_cat
+        regex2 = re.compile(r"(?:\-|\_)")
+        if len(re.findall(regex2, caption)) > 3:
+            caption = re.sub(regex2, " ", caption)
+
+        caption = ftfy.fix_text(caption)
+        caption = html.unescape(html.unescape(caption))
+
+        caption = re.sub(r"\b[a-zA-Z]{1,3}\d{3,15}\b", "", caption)  # jc6640
+        caption = re.sub(r"\b[a-zA-Z]+\d+[a-zA-Z]+\b", "", caption)  # jc6640vc
+        caption = re.sub(r"\b\d+[a-zA-Z]+\d+\b", "", caption)  # 6640vc231
+
+        caption = re.sub(r"(worldwide\s+)?(free\s+)?shipping", "", caption)
+        caption = re.sub(r"(free\s)?download(\sfree)?", "", caption)
+        caption = re.sub(r"\bclick\b\s(?:for|on)\s\w+", "", caption)
+        caption = re.sub(r"\b(?:png|jpg|jpeg|bmp|webp|eps|pdf|apk|mp4)(\simage[s]?)?", "", caption)
+        caption = re.sub(r"\bpage\s+\d+\b", "", caption)
+
+        caption = re.sub(r"\b\d*[a-zA-Z]+\d+[a-zA-Z]+\d+[a-zA-Z\d]*\b", r" ", caption)  # j2d1a2a...
+
+        caption = re.sub(r"\b\d+\.?\d*[xх×]\d+\.?\d*\b", "", caption)
+
+        caption = re.sub(r"\b\s+\:\s+", r": ", caption)
+        caption = re.sub(r"(\D[,\./])\b", r"\1 ", caption)
+        caption = re.sub(r"\s+", " ", caption)
+
+        caption.strip()
+
+        caption = re.sub(r"^[\"\']([\w\W]+)[\"\']$", r"\1", caption)
+        caption = re.sub(r"^[\'\_,\-\:;]", r"", caption)
+        caption = re.sub(r"[\'\_,\-\:\-\+]$", r"", caption)
+        caption = re.sub(r"^\.\S+$", "", caption)
+
+        return caption.strip()
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
+        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @staticmethod + def classify_height_width_bin(height: int, width: int, ratios: dict) -> Tuple[int, int]: + """Returns binned height and width.""" + ar = float(height / width) + closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar)) + default_hw = ratios[closest_ratio] + return int(default_hw[0]), int(default_hw[1]) + + @staticmethod + def resize_and_crop_tensor(samples: torch.Tensor, new_width: int, new_height: int) -> torch.Tensor: + orig_height, orig_width = samples.shape[2], samples.shape[3] + + # Check if resizing is needed + if orig_height != new_height or orig_width != new_width: + ratio = max(new_height / orig_height, new_width / orig_width) + resized_width = int(orig_width * ratio) + resized_height = int(orig_height * ratio) + + # Resize + samples = F.interpolate( + samples, size=(resized_height, resized_width), mode="bilinear", align_corners=False + ) + + # Center Crop + start_x = (resized_width - new_width) // 2 + end_x = start_x + new_width + start_y = (resized_height - new_height) // 2 + end_y = start_y + new_height + samples = samples[:, :, start_y:end_y, start_x:end_x] + + return samples + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + negative_prompt: str = "", + num_inference_steps: int = 20, + timesteps: List[int] = None, + guidance_scale: float = 4.5, + num_images_per_prompt: Optional[int] = 1, + height: Optional[int] = None, + width: Optional[int] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_attention_mask: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_attention_mask: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + clean_caption: bool = True, + use_resolution_binning: bool = True, + max_sequence_length: int = 120, + **kwargs, + ) -> Union[ImagePipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. 
+ guidance_scale (`float`, *optional*, defaults to 4.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to self.unet.config.sample_size): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size): + The width in pixels of the generated image. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + prompt_attention_mask (`torch.FloatTensor`, *optional*): Pre-generated attention mask for text embeddings. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. For PixArt-Alpha this negative prompt should be "". If not + provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + negative_prompt_attention_mask (`torch.FloatTensor`, *optional*): + Pre-generated attention mask for negative text embeddings. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + clean_caption (`bool`, *optional*, defaults to `True`): + Whether or not to clean the caption before creating embeddings. Requires `beautifulsoup4` and `ftfy` to + be installed. If the dependencies are not installed, the embeddings will be created from the raw + prompt. + use_resolution_binning (`bool` defaults to `True`): + If set to `True`, the requested height and width are first mapped to the closest resolutions using + `ASPECT_RATIO_1024_BIN`. 
After the produced latents are decoded into images, they are resized back to + the requested resolution. Useful for generating non-square images. + max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images + """ + if "mask_feature" in kwargs: + deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version." + deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False) + # 1. Check inputs. Raise error if not correct + height = height or self.transformer.config.sample_size * self.vae_scale_factor + width = width or self.transformer.config.sample_size * self.vae_scale_factor + if use_resolution_binning: + if self.transformer.config.sample_size == 128: + aspect_ratio_bin = ASPECT_RATIO_1024_BIN + elif self.transformer.config.sample_size == 64: + aspect_ratio_bin = ASPECT_RATIO_512_BIN + elif self.transformer.config.sample_size == 32: + aspect_ratio_bin = ASPECT_RATIO_256_BIN + else: + raise ValueError("Invalid sample size") + orig_height, orig_width = height, width + height, width = self.classify_height_width_bin(height, width, ratios=aspect_ratio_bin) + + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_steps, + prompt_embeds, + negative_prompt_embeds, + prompt_attention_mask, + negative_prompt_attention_mask, + ) + + # 2. Default height and width to transformer + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + ( + prompt_embeds, + prompt_attention_mask, + negative_prompt_embeds, + negative_prompt_attention_mask, + ) = self.encode_prompt( + prompt, + do_classifier_free_guidance, + negative_prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + prompt_attention_mask=prompt_attention_mask, + negative_prompt_attention_mask=negative_prompt_attention_mask, + clean_caption=clean_caption, + max_sequence_length=max_sequence_length, + ) + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latents. + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + latent_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Prepare micro-conditions. + added_cond_kwargs = {"resolution": None, "aspect_ratio": None} + if self.transformer.config.sample_size == 128: + resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1) + aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1) + resolution = resolution.to(dtype=prompt_embeds.dtype, device=device) + aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device) + + if do_classifier_free_guidance: + resolution = torch.cat([resolution, resolution], dim=0) + aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], dim=0) + + added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio} + + # 7. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + current_timestep = t + if not torch.is_tensor(current_timestep): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = latent_model_input.device.type == "mps" + if isinstance(current_timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device) + elif len(current_timestep.shape) == 0: + current_timestep = current_timestep[None].to(latent_model_input.device) + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + current_timestep = current_timestep.expand(latent_model_input.shape[0]) + + # predict noise model_output + noise_pred = self.transformer( + latent_model_input, + encoder_hidden_states=prompt_embeds, + encoder_attention_mask=prompt_attention_mask, + timestep=current_timestep, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # learned sigma + if self.transformer.config.out_channels // 2 == latent_channels: + noise_pred = noise_pred.chunk(2, dim=1)[0] + else: + noise_pred = noise_pred + + # compute previous image: x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + if use_resolution_binning: + image = self.resize_and_crop_tensor(image, orig_width, orig_height) + else: + image = latents + + if not output_type == "latent": + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + 
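The guidance arithmetic inside the loop above is easy to check in isolation. The sketch below uses random tensors to cover both the classifier-free-guidance mix and the learned-sigma channel split; shapes and channel counts are illustrative, not taken from a real checkpoint.

```py
# Toy illustration of the classifier-free-guidance arithmetic used in the loop above.
# Shapes and channel counts are illustrative; a real CFG batch is [2 * batch, C, H, W].
import torch

guidance_scale = 4.5
latent_channels = 4
noise_pred = torch.randn(2, 8, 32, 32)        # stand-in for the transformer output on [uncond, text]
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# With a learned-sigma head the output carries 2x channels; only the first half
# (the epsilon prediction) is passed on to the scheduler step.
if guided.shape[1] // 2 == latent_channels:
    guided = guided.chunk(2, dim=1)[0]
print(guided.shape)                           # torch.Size([1, 4, 32, 32])
```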
self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py new file mode 100644 index 000000000..70f5b1a54 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py @@ -0,0 +1,49 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_output"] = ["SemanticStableDiffusionPipelineOutput"] + _import_structure["pipeline_semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_semantic_stable_diffusion import SemanticStableDiffusionPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py new file mode 100644 index 000000000..349912993 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput + + +@dataclass +class SemanticStableDiffusionPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains “not-safe-for-work” (nsfw) content or + `None` if safety checking could not be performed. 
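Because `SemanticStableDiffusionPipelineOutput` is a plain dataclass on top of `BaseOutput`, it can also be instantiated directly, which is occasionally handy in tests. A small sketch with placeholder data:

```py
# Hedged sketch: build the output class above by hand with placeholder data.
# Real pipelines fill `images` with PIL images or a (batch, H, W, C) array.
import numpy as np
from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipelineOutput

out = SemanticStableDiffusionPipelineOutput(
    images=np.zeros((1, 64, 64, 3), dtype=np.float32),
    nsfw_content_detected=[False],
)
print(out.images.shape, out.nsfw_content_detected)  # (1, 64, 64, 3) [False]
print(out["images"].shape)                          # dict-style access works as well
```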
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py new file mode 100644 index 000000000..96873423f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py @@ -0,0 +1,718 @@ +import inspect +from itertools import repeat +from typing import Callable, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...models import AutoencoderKL, UNet2DConditionModel +from ...pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import SemanticStableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class SemanticStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion with latent editing. + + This model inherits from [`DiffusionPipeline`] and builds on the [`StableDiffusionPipeline`]. Check the superclass + documentation for the generic methods implemented for all pipelines (downloading, saving, running on a particular + device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`Q16SafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + editing_prompt: Optional[Union[str, List[str]]] = None, + editing_prompt_embeddings: Optional[torch.Tensor] = None, + reverse_editing_direction: Optional[Union[bool, List[bool]]] = False, + edit_guidance_scale: Optional[Union[float, List[float]]] = 5, + edit_warmup_steps: Optional[Union[int, List[int]]] = 10, + edit_cooldown_steps: Optional[Union[int, List[int]]] = None, + edit_threshold: Optional[Union[float, List[float]]] = 0.9, + edit_momentum_scale: Optional[float] = 0.1, + edit_mom_beta: Optional[float] = 0.4, + edit_weights: Optional[List[float]] = None, + sem_guidance: Optional[List[torch.Tensor]] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
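The `prepare_latents` method above is only shape arithmetic plus an initial-noise scale. A standalone sketch of that computation follows; the VAE scale factor of 8 and `init_noise_sigma` of 1.0 are assumed, illustrative values rather than ones read from a checkpoint.

```py
# Standalone sketch of the latent preparation above. The scale factor 8 and
# init_noise_sigma 1.0 are assumed values typical of SD-style VAEs/schedulers.
import torch

batch_size, num_channels_latents = 2, 4
height, width = 512, 768
vae_scale_factor = 8        # 2 ** (len(block_out_channels) - 1) for a 4-level VAE
init_noise_sigma = 1.0      # 1.0 for DDIM/PNDM-style schedulers; sigma-based schedulers differ

generator = torch.Generator("cpu").manual_seed(0)
shape = (batch_size, num_channels_latents, height // vae_scale_factor, width // vae_scale_factor)
latents = torch.randn(shape, generator=generator) * init_noise_sigma
print(latents.shape)        # torch.Size([2, 4, 64, 96])
```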
+ negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + editing_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to use for semantic guidance. Semantic guidance is disabled by setting + `editing_prompt = None`. Guidance direction of prompt should be specified via + `reverse_editing_direction`. + editing_prompt_embeddings (`torch.Tensor`, *optional*): + Pre-computed embeddings to use for semantic guidance. Guidance direction of embedding should be + specified via `reverse_editing_direction`. + reverse_editing_direction (`bool` or `List[bool]`, *optional*, defaults to `False`): + Whether the corresponding prompt in `editing_prompt` should be increased or decreased. + edit_guidance_scale (`float` or `List[float]`, *optional*, defaults to 5): + Guidance scale for semantic guidance. If provided as a list, values should correspond to + `editing_prompt`. + edit_warmup_steps (`float` or `List[float]`, *optional*, defaults to 10): + Number of diffusion steps (for each prompt) for which semantic guidance is not applied. Momentum is + calculated for those steps and applied once all warmup periods are over. + edit_cooldown_steps (`float` or `List[float]`, *optional*, defaults to `None`): + Number of diffusion steps (for each prompt) after which semantic guidance is longer applied. + edit_threshold (`float` or `List[float]`, *optional*, defaults to 0.9): + Threshold of semantic guidance. + edit_momentum_scale (`float`, *optional*, defaults to 0.1): + Scale of the momentum to be added to the semantic guidance at each diffusion step. If set to 0.0, + momentum is disabled. Momentum is already built up during warmup (for diffusion steps smaller than + `sld_warmup_steps`). Momentum is only added to latent guidance once all warmup periods are finished. 
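The momentum mechanics described for `edit_momentum_scale` (and `edit_mom_beta` just below) reduce to a two-line update inside the denoising loop. A toy sketch of that accumulation, matching the order used later in this file (momentum is added to the current guidance first, then updated from the result):

```py
# Toy sketch of the semantic-guidance momentum update described above; the random
# tensors stand in for per-step edit guidance, and the shapes are illustrative.
import torch

edit_momentum_scale, edit_mom_beta = 0.1, 0.4
edit_momentum = torch.zeros(1, 4, 8, 8)
for step in range(3):
    noise_guidance_edit = torch.randn(1, 4, 8, 8)                            # guidance for this step
    noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum
    edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit
```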
+ edit_mom_beta (`float`, *optional*, defaults to 0.4): + Defines how semantic guidance momentum builds up. `edit_mom_beta` indicates how much of the previous + momentum is kept. Momentum is already built up during warmup (for diffusion steps smaller than + `edit_warmup_steps`). + edit_weights (`List[float]`, *optional*, defaults to `None`): + Indicates how much each individual concept should influence the overall guidance. If no weights are + provided all concepts are applied equally. + sem_guidance (`List[torch.Tensor]`, *optional*): + List of pre-generated guidance vectors to be applied at generation. Length of the list has to + correspond to `num_inference_steps`. + + Examples: + + ```py + >>> import torch + >>> from diffusers import SemanticStableDiffusionPipeline + + >>> pipe = SemanticStableDiffusionPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> out = pipe( + ... prompt="a photo of the face of a woman", + ... num_images_per_prompt=1, + ... guidance_scale=7, + ... editing_prompt=[ + ... "smiling, smile", # Concepts to apply + ... "glasses, wearing glasses", + ... "curls, wavy hair, curly hair", + ... "beard, full beard, mustache", + ... ], + ... reverse_editing_direction=[ + ... False, + ... False, + ... False, + ... False, + ... ], # Direction of guidance i.e. increase all concepts + ... edit_warmup_steps=[10, 10, 10, 10], # Warmup period for each concept + ... edit_guidance_scale=[4, 5, 5, 5.4], # Guidance scale for each concept + ... edit_threshold=[ + ... 0.99, + ... 0.975, + ... 0.925, + ... 0.96, + ... ], # Threshold for each concept. Threshold equals the percentile of the latent space that will be discarded. I.e. threshold=0.99 uses 1% of the latent dimensions + ... edit_momentum_scale=0.3, # Momentum scale that will be added to the latent guidance + ... edit_mom_beta=0.6, # Momentum beta + ... edit_weights=[1, 1, 1, 1, 1], # Weights of the individual concepts against each other + ... ) + >>> image = out.images[0] + ``` + + Returns: + [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, + [`~pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images and the second element + is a list of `bool`s indicating whether the corresponding generated image contains "not-safe-for-work" + (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, height, width, callback_steps) + + # 2. 
Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + + if editing_prompt: + enable_edit_guidance = True + if isinstance(editing_prompt, str): + editing_prompt = [editing_prompt] + enabled_editing_prompts = len(editing_prompt) + elif editing_prompt_embeddings is not None: + enable_edit_guidance = True + enabled_editing_prompts = editing_prompt_embeddings.shape[0] + else: + enabled_editing_prompts = 0 + enable_edit_guidance = False + + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + + if text_input_ids.shape[-1] > self.tokenizer.model_max_length: + removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0] + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = text_embeddings.shape + text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1) + text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if enable_edit_guidance: + # get safety text embeddings + if editing_prompt_embeddings is None: + edit_concepts_input = self.tokenizer( + [x for item in editing_prompt for x in repeat(item, batch_size)], + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + + edit_concepts_input_ids = edit_concepts_input.input_ids + + if edit_concepts_input_ids.shape[-1] > self.tokenizer.model_max_length: + removed_text = self.tokenizer.batch_decode( + edit_concepts_input_ids[:, self.tokenizer.model_max_length :] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + edit_concepts_input_ids = edit_concepts_input_ids[:, : self.tokenizer.model_max_length] + edit_concepts = self.text_encoder(edit_concepts_input_ids.to(self.device))[0] + else: + edit_concepts = editing_prompt_embeddings.to(self.device).repeat(batch_size, 1, 1) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed_edit, seq_len_edit, _ = edit_concepts.shape + edit_concepts = edit_concepts.repeat(1, num_images_per_prompt, 1) + edit_concepts = edit_concepts.view(bs_embed_edit * num_images_per_prompt, seq_len_edit, -1) + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + # get unconditional embeddings for classifier free guidance + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = uncond_embeddings.shape[1] + uncond_embeddings = uncond_embeddings.repeat(1, num_images_per_prompt, 1) + uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if enable_edit_guidance: + text_embeddings = torch.cat([uncond_embeddings, text_embeddings, edit_concepts]) + else: + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + # get the initial random noise unless the user supplied it + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=self.device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + text_embeddings.dtype, + self.device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. 
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # Initialize edit_momentum to None + edit_momentum = None + + self.uncond_estimates = None + self.text_estimates = None + self.edit_estimates = None + self.sem_guidance = None + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * (2 + enabled_editing_prompts)) if do_classifier_free_guidance else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_out = noise_pred.chunk(2 + enabled_editing_prompts) # [b,4, 64, 64] + noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1] + noise_pred_edit_concepts = noise_pred_out[2:] + + # default text guidance + noise_guidance = guidance_scale * (noise_pred_text - noise_pred_uncond) + # noise_guidance = (noise_pred_text - noise_pred_edit_concepts[0]) + + if self.uncond_estimates is None: + self.uncond_estimates = torch.zeros((num_inference_steps + 1, *noise_pred_uncond.shape)) + self.uncond_estimates[i] = noise_pred_uncond.detach().cpu() + + if self.text_estimates is None: + self.text_estimates = torch.zeros((num_inference_steps + 1, *noise_pred_text.shape)) + self.text_estimates[i] = noise_pred_text.detach().cpu() + + if self.edit_estimates is None and enable_edit_guidance: + self.edit_estimates = torch.zeros( + (num_inference_steps + 1, len(noise_pred_edit_concepts), *noise_pred_edit_concepts[0].shape) + ) + + if self.sem_guidance is None: + self.sem_guidance = torch.zeros((num_inference_steps + 1, *noise_pred_text.shape)) + + if edit_momentum is None: + edit_momentum = torch.zeros_like(noise_guidance) + + if enable_edit_guidance: + concept_weights = torch.zeros( + (len(noise_pred_edit_concepts), noise_guidance.shape[0]), + device=self.device, + dtype=noise_guidance.dtype, + ) + noise_guidance_edit = torch.zeros( + (len(noise_pred_edit_concepts), *noise_guidance.shape), + device=self.device, + dtype=noise_guidance.dtype, + ) + # noise_guidance_edit = torch.zeros_like(noise_guidance) + warmup_inds = [] + for c, noise_pred_edit_concept in enumerate(noise_pred_edit_concepts): + self.edit_estimates[i, c] = noise_pred_edit_concept + if isinstance(edit_guidance_scale, list): + edit_guidance_scale_c = edit_guidance_scale[c] + else: + edit_guidance_scale_c = edit_guidance_scale + + if isinstance(edit_threshold, list): + edit_threshold_c = edit_threshold[c] + else: + edit_threshold_c = edit_threshold + if isinstance(reverse_editing_direction, list): + reverse_editing_direction_c = reverse_editing_direction[c] + else: + reverse_editing_direction_c = reverse_editing_direction + if edit_weights: + edit_weight_c = edit_weights[c] + else: + edit_weight_c = 1.0 + if isinstance(edit_warmup_steps, list): + edit_warmup_steps_c = edit_warmup_steps[c] + else: + edit_warmup_steps_c = edit_warmup_steps + + if isinstance(edit_cooldown_steps, list): + edit_cooldown_steps_c = edit_cooldown_steps[c] + elif edit_cooldown_steps is None: + edit_cooldown_steps_c = i + 1 + else: + edit_cooldown_steps_c = edit_cooldown_steps + if i >= edit_warmup_steps_c: + warmup_inds.append(c) + if i >= edit_cooldown_steps_c: + noise_guidance_edit[c, :, :, :, :] = torch.zeros_like(noise_pred_edit_concept) + continue + + noise_guidance_edit_tmp = 
noise_pred_edit_concept - noise_pred_uncond + # tmp_weights = (noise_pred_text - noise_pred_edit_concept).sum(dim=(1, 2, 3)) + tmp_weights = (noise_guidance - noise_pred_edit_concept).sum(dim=(1, 2, 3)) + + tmp_weights = torch.full_like(tmp_weights, edit_weight_c) # * (1 / enabled_editing_prompts) + if reverse_editing_direction_c: + noise_guidance_edit_tmp = noise_guidance_edit_tmp * -1 + concept_weights[c, :] = tmp_weights + + noise_guidance_edit_tmp = noise_guidance_edit_tmp * edit_guidance_scale_c + + # torch.quantile function expects float32 + if noise_guidance_edit_tmp.dtype == torch.float32: + tmp = torch.quantile( + torch.abs(noise_guidance_edit_tmp).flatten(start_dim=2), + edit_threshold_c, + dim=2, + keepdim=False, + ) + else: + tmp = torch.quantile( + torch.abs(noise_guidance_edit_tmp).flatten(start_dim=2).to(torch.float32), + edit_threshold_c, + dim=2, + keepdim=False, + ).to(noise_guidance_edit_tmp.dtype) + + noise_guidance_edit_tmp = torch.where( + torch.abs(noise_guidance_edit_tmp) >= tmp[:, :, None, None], + noise_guidance_edit_tmp, + torch.zeros_like(noise_guidance_edit_tmp), + ) + noise_guidance_edit[c, :, :, :, :] = noise_guidance_edit_tmp + + # noise_guidance_edit = noise_guidance_edit + noise_guidance_edit_tmp + + warmup_inds = torch.tensor(warmup_inds).to(self.device) + if len(noise_pred_edit_concepts) > warmup_inds.shape[0] > 0: + concept_weights = concept_weights.to("cpu") # Offload to cpu + noise_guidance_edit = noise_guidance_edit.to("cpu") + + concept_weights_tmp = torch.index_select(concept_weights.to(self.device), 0, warmup_inds) + concept_weights_tmp = torch.where( + concept_weights_tmp < 0, torch.zeros_like(concept_weights_tmp), concept_weights_tmp + ) + concept_weights_tmp = concept_weights_tmp / concept_weights_tmp.sum(dim=0) + # concept_weights_tmp = torch.nan_to_num(concept_weights_tmp) + + noise_guidance_edit_tmp = torch.index_select( + noise_guidance_edit.to(self.device), 0, warmup_inds + ) + noise_guidance_edit_tmp = torch.einsum( + "cb,cbijk->bijk", concept_weights_tmp, noise_guidance_edit_tmp + ) + noise_guidance_edit_tmp = noise_guidance_edit_tmp + noise_guidance = noise_guidance + noise_guidance_edit_tmp + + self.sem_guidance[i] = noise_guidance_edit_tmp.detach().cpu() + + del noise_guidance_edit_tmp + del concept_weights_tmp + concept_weights = concept_weights.to(self.device) + noise_guidance_edit = noise_guidance_edit.to(self.device) + + concept_weights = torch.where( + concept_weights < 0, torch.zeros_like(concept_weights), concept_weights + ) + + concept_weights = torch.nan_to_num(concept_weights) + + noise_guidance_edit = torch.einsum("cb,cbijk->bijk", concept_weights, noise_guidance_edit) + + noise_guidance_edit = noise_guidance_edit + edit_momentum_scale * edit_momentum + + edit_momentum = edit_mom_beta * edit_momentum + (1 - edit_mom_beta) * noise_guidance_edit + + if warmup_inds.shape[0] == len(noise_pred_edit_concepts): + noise_guidance = noise_guidance + noise_guidance_edit + self.sem_guidance[i] = noise_guidance_edit.detach().cpu() + + if sem_guidance is not None: + edit_guidance = sem_guidance[i].to(self.device) + noise_guidance = noise_guidance + edit_guidance + + noise_pred = noise_pred_uncond + noise_guidance + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. 
Post-processing + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, self.device, text_embeddings.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + if not return_dict: + return (image, has_nsfw_concept) + + return SemanticStableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/__init__.py new file mode 100644 index 000000000..4ed563c4a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/__init__.py @@ -0,0 +1,71 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["camera"] = ["create_pan_cameras"] + _import_structure["pipeline_shap_e"] = ["ShapEPipeline"] + _import_structure["pipeline_shap_e_img2img"] = ["ShapEImg2ImgPipeline"] + _import_structure["renderer"] = [ + "BoundingBoxVolume", + "ImportanceRaySampler", + "MLPNeRFModelOutput", + "MLPNeRSTFModel", + "ShapEParamsProjModel", + "ShapERenderer", + "StratifiedRaySampler", + "VoidNeRFModel", + ] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .camera import create_pan_cameras + from .pipeline_shap_e import ShapEPipeline + from .pipeline_shap_e_img2img import ShapEImg2ImgPipeline + from .renderer import ( + BoundingBoxVolume, + ImportanceRaySampler, + MLPNeRFModelOutput, + MLPNeRSTFModel, + ShapEParamsProjModel, + ShapERenderer, + StratifiedRaySampler, + VoidNeRFModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/camera.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/camera.py new file mode 100644 index 000000000..d4b94c300 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/camera.py @@ -0,0 +1,147 @@ +# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Tuple + +import numpy as np +import torch + + +@dataclass +class DifferentiableProjectiveCamera: + """ + Implements a batch, differentiable, standard pinhole camera + """ + + origin: torch.Tensor # [batch_size x 3] + x: torch.Tensor # [batch_size x 3] + y: torch.Tensor # [batch_size x 3] + z: torch.Tensor # [batch_size x 3] + width: int + height: int + x_fov: float + y_fov: float + shape: Tuple[int] + + def __post_init__(self): + assert self.x.shape[0] == self.y.shape[0] == self.z.shape[0] == self.origin.shape[0] + assert self.x.shape[1] == self.y.shape[1] == self.z.shape[1] == self.origin.shape[1] == 3 + assert len(self.x.shape) == len(self.y.shape) == len(self.z.shape) == len(self.origin.shape) == 2 + + def resolution(self): + return torch.from_numpy(np.array([self.width, self.height], dtype=np.float32)) + + def fov(self): + return torch.from_numpy(np.array([self.x_fov, self.y_fov], dtype=np.float32)) + + def get_image_coords(self) -> torch.Tensor: + """ + :return: coords of shape (width * height, 2) + """ + pixel_indices = torch.arange(self.height * self.width) + coords = torch.stack( + [ + pixel_indices % self.width, + torch.div(pixel_indices, self.width, rounding_mode="trunc"), + ], + axis=1, + ) + return coords + + @property + def camera_rays(self): + batch_size, *inner_shape = self.shape + inner_batch_size = int(np.prod(inner_shape)) + + coords = self.get_image_coords() + coords = torch.broadcast_to(coords.unsqueeze(0), [batch_size * inner_batch_size, *coords.shape]) + rays = self.get_camera_rays(coords) + + rays = rays.view(batch_size, inner_batch_size * self.height * self.width, 2, 3) + + return rays + + def get_camera_rays(self, coords: torch.Tensor) -> torch.Tensor: + batch_size, *shape, n_coords = coords.shape + assert n_coords == 2 + assert batch_size == self.origin.shape[0] + + flat = coords.view(batch_size, -1, 2) + + res = self.resolution() + fov = self.fov() + + fracs = (flat.float() / (res - 1)) * 2 - 1 + fracs = fracs * torch.tan(fov / 2) + + fracs = fracs.view(batch_size, -1, 2) + directions = ( + self.z.view(batch_size, 1, 3) + + self.x.view(batch_size, 1, 3) * fracs[:, :, :1] + + self.y.view(batch_size, 1, 3) * fracs[:, :, 1:] + ) + directions = directions / directions.norm(dim=-1, keepdim=True) + rays = torch.stack( + [ + torch.broadcast_to(self.origin.view(batch_size, 1, 3), [batch_size, directions.shape[1], 3]), + directions, + ], + dim=2, + ) + return rays.view(batch_size, *shape, 2, 3) + + def resize_image(self, width: int, height: int) -> "DifferentiableProjectiveCamera": + """ + Creates a new camera for the resized view assuming the aspect ratio does not change. + """ + assert width * self.height == height * self.width, "The aspect ratio should not change." 
+ return DifferentiableProjectiveCamera( + origin=self.origin, + x=self.x, + y=self.y, + z=self.z, + width=width, + height=height, + x_fov=self.x_fov, + y_fov=self.y_fov, + ) + + +def create_pan_cameras(size: int) -> DifferentiableProjectiveCamera: + origins = [] + xs = [] + ys = [] + zs = [] + for theta in np.linspace(0, 2 * np.pi, num=20): + z = np.array([np.sin(theta), np.cos(theta), -0.5]) + z /= np.sqrt(np.sum(z**2)) + origin = -z * 4 + x = np.array([np.cos(theta), -np.sin(theta), 0.0]) + y = np.cross(z, x) + origins.append(origin) + xs.append(x) + ys.append(y) + zs.append(z) + return DifferentiableProjectiveCamera( + origin=torch.from_numpy(np.stack(origins, axis=0)).float(), + x=torch.from_numpy(np.stack(xs, axis=0)).float(), + y=torch.from_numpy(np.stack(ys, axis=0)).float(), + z=torch.from_numpy(np.stack(zs, axis=0)).float(), + width=size, + height=size, + x_fov=0.7, + y_fov=0.7, + shape=(1, len(xs)), + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py new file mode 100644 index 000000000..1ef10e17c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py @@ -0,0 +1,334 @@ +# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPTextModelWithProjection, CLIPTokenizer + +from ...models import PriorTransformer +from ...schedulers import HeunDiscreteScheduler +from ...utils import ( + BaseOutput, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .renderer import ShapERenderer + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import DiffusionPipeline + >>> from diffusers.utils import export_to_gif + + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + >>> repo = "openai/shap-e" + >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) + >>> pipe = pipe.to(device) + + >>> guidance_scale = 15.0 + >>> prompt = "a shark" + + >>> images = pipe( + ... prompt, + ... guidance_scale=guidance_scale, + ... num_inference_steps=64, + ... frame_size=256, + ... ).images + + >>> gif_path = export_to_gif(images[0], "shark_3d.gif") + ``` +""" + + +@dataclass +class ShapEPipelineOutput(BaseOutput): + """ + Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. + + Args: + images (`torch.FloatTensor`) + A list of images for 3D rendering. 
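The `create_pan_cameras` factory defined above builds a 20-view orbit whose ray bundle can be inspected directly. A small sketch, assuming the `diffusers` package from this source tree is installed and importable:

```py
# Sketch: inspect the 20-view pan-camera orbit built by `create_pan_cameras` above.
# Assumes the diffusers package from this source tree is installed/importable.
import torch
from diffusers.pipelines.shap_e.camera import create_pan_cameras

cam = create_pan_cameras(size=64)
rays = cam.camera_rays                       # (1, num_views * size * size, 2, 3): (origin, direction) per pixel
print(rays.shape)                            # torch.Size([1, 81920, 2, 3]), i.e. 20 * 64 * 64 pixels
directions = rays[..., 1, :]
print(torch.allclose(directions.norm(dim=-1), torch.ones(1), atol=1e-5))  # ray directions are unit length
```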
+ """ + + images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]] + + +class ShapEPipeline(DiffusionPipeline): + """ + Pipeline for generating latent representation of a 3D asset and rendering with the NeRF method. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + prior ([`PriorTransformer`]): + The canonical unCLIP prior to approximate the image embedding from the text embedding. + text_encoder ([`~transformers.CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + scheduler ([`HeunDiscreteScheduler`]): + A scheduler to be used in combination with the `prior` model to generate image embedding. + shap_e_renderer ([`ShapERenderer`]): + Shap-E renderer projects the generated latents into parameters of a MLP to create 3D objects with the NeRF + rendering method. + """ + + model_cpu_offload_seq = "text_encoder->prior" + _exclude_from_cpu_offload = ["shap_e_renderer"] + + def __init__( + self, + prior: PriorTransformer, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: HeunDiscreteScheduler, + shap_e_renderer: ShapERenderer, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + shap_e_renderer=shap_e_renderer, + ) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + ): + len(prompt) if isinstance(prompt, list) else 1 + + # YiYi Notes: set pad_token_id to be 0, not sure why I can't set in the config file + self.tokenizer.pad_token_id = 0 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + prompt_embeds = text_encoder_output.text_embeds + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + # in Shap-E it normalize the prompt_embeds and then later rescale it + prompt_embeds = prompt_embeds / torch.linalg.norm(prompt_embeds, dim=-1, keepdim=True) + + if do_classifier_free_guidance: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + + # For classifier free guidance, we need to do two forward 
passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # Rescale the features to have unit variance + prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds + + return prompt_embeds + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: str, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + frame_size: int = 64, + output_type: Optional[str] = "pil", # pil, np, latent, mesh + return_dict: bool = True, + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + frame_size (`int`, *optional*, default to 64): + The width and height of each image frame of the generated 3D output. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain + tuple. + + Examples: + + Returns: + [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images. 
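The `guidance_scale` argument documented above is applied with the usual classifier-free guidance combination inside the denoising loop that follows. A minimal numeric sketch (made-up tensors, not part of the patch) of that update:

```py
# Illustrative sketch of the guidance step used in the loop below:
# guided = uncond + guidance_scale * (cond - uncond).
# With guidance_scale == 1 the conditional prediction is returned unchanged;
# larger values push the result further from the unconditional branch.
import torch

guidance_scale = 4.0
noise_pred_uncond = torch.zeros(1, 4)        # stand-in unconditional prediction
noise_pred_text = torch.full((1, 4), 0.5)    # stand-in conditional prediction

guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
print(guided)  # tensor([[2., 2., 2., 2.]])
```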
+ """ + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance) + + # prior + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + num_embeddings = self.prior.config.num_embeddings + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, num_embeddings * embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim + latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + noise_pred = self.prior( + scaled_model_input, + timestep=t, + proj_embedding=prompt_embeds, + ).predicted_image_embedding + + # remove the variance + noise_pred, _ = noise_pred.split( + scaled_model_input.shape[2], dim=2 + ) # batch_size, num_embeddings, embedding_dim + + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) + + latents = self.scheduler.step( + noise_pred, + timestep=t, + sample=latents, + ).prev_sample + + # Offload all models + self.maybe_free_model_hooks() + + if output_type not in ["np", "pil", "latent", "mesh"]: + raise ValueError( + f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}" + ) + + if output_type == "latent": + return ShapEPipelineOutput(images=latents) + + images = [] + if output_type == "mesh": + for i, latent in enumerate(latents): + mesh = self.shap_e_renderer.decode_to_mesh( + latent[None, :], + device, + ) + images.append(mesh) + + else: + # np, pil + for i, latent in enumerate(latents): + image = self.shap_e_renderer.decode_to_image( + latent[None, :], + device, + size=frame_size, + ) + images.append(image) + + images = torch.stack(images) + + images = images.cpu().numpy() + + if output_type == "pil": + images = [self.numpy_to_pil(image) for image in images] + + if not return_dict: + return (images,) + + return ShapEPipelineOutput(images=images) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py new file mode 100644 index 000000000..641ec56a1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py @@ -0,0 +1,321 @@ +# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPVisionModel + +from ...models import PriorTransformer +from ...schedulers import HeunDiscreteScheduler +from ...utils import ( + BaseOutput, + logging, + replace_example_docstring, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .renderer import ShapERenderer + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from PIL import Image + >>> import torch + >>> from diffusers import DiffusionPipeline + >>> from diffusers.utils import export_to_gif, load_image + + >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + >>> repo = "openai/shap-e-img2img" + >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16) + >>> pipe = pipe.to(device) + + >>> guidance_scale = 3.0 + >>> image_url = "https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png" + >>> image = load_image(image_url).convert("RGB") + + >>> images = pipe( + ... image, + ... guidance_scale=guidance_scale, + ... num_inference_steps=64, + ... frame_size=256, + ... ).images + + >>> gif_path = export_to_gif(images[0], "corgi_3d.gif") + ``` +""" + + +@dataclass +class ShapEPipelineOutput(BaseOutput): + """ + Output class for [`ShapEPipeline`] and [`ShapEImg2ImgPipeline`]. + + Args: + images (`torch.FloatTensor`) + A list of images for 3D rendering. + """ + + images: Union[PIL.Image.Image, np.ndarray] + + +class ShapEImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for generating latent representation of a 3D asset and rendering with the NeRF method from an image. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`~transformers.CLIPVisionModel`]): + Frozen image-encoder. + image_processor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to process images. + scheduler ([`HeunDiscreteScheduler`]): + A scheduler to be used in combination with the `prior` model to generate image embedding. + shap_e_renderer ([`ShapERenderer`]): + Shap-E renderer projects the generated latents into parameters of a MLP to create 3D objects with the NeRF + rendering method. 
+ """ + + model_cpu_offload_seq = "image_encoder->prior" + _exclude_from_cpu_offload = ["shap_e_renderer"] + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModel, + image_processor: CLIPImageProcessor, + scheduler: HeunDiscreteScheduler, + shap_e_renderer: ShapERenderer, + ): + super().__init__() + + self.register_modules( + prior=prior, + image_encoder=image_encoder, + image_processor=image_processor, + scheduler=scheduler, + shap_e_renderer=shap_e_renderer, + ) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_image( + self, + image, + device, + num_images_per_prompt, + do_classifier_free_guidance, + ): + if isinstance(image, List) and isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + if not isinstance(image, torch.Tensor): + image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) + + image = image.to(dtype=self.image_encoder.dtype, device=device) + + image_embeds = self.image_encoder(image)["last_hidden_state"] + image_embeds = image_embeds[:, 1:, :].contiguous() # batch_size, dim, 256 + + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + negative_image_embeds = torch.zeros_like(image_embeds) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + return image_embeds + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image]], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + frame_size: int = 64, + output_type: Optional[str] = "pil", # pil, np, latent, mesh + return_dict: bool = True, + ): + """ + The call function to the pipeline for generation. + + Args: + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be used as the starting point. Can also accept image + latents as image, but if passing latents directly it is not encoded again. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. 
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + frame_size (`int`, *optional*, default to 64): + The width and height of each image frame of the generated 3D output. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`), `"latent"` (`torch.Tensor`), or mesh ([`MeshDecoderOutput`]). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] instead of a plain + tuple. + + Examples: + + Returns: + [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.shap_e.pipeline_shap_e.ShapEPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images. + """ + + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, torch.Tensor): + batch_size = image.shape[0] + elif isinstance(image, list) and isinstance(image[0], (torch.Tensor, PIL.Image.Image)): + batch_size = len(image) + else: + raise ValueError( + f"`image` has to be of type `PIL.Image.Image`, `torch.Tensor`, `List[PIL.Image.Image]` or `List[torch.Tensor]` but is {type(image)}" + ) + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + image_embeds = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance) + + # prior + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + num_embeddings = self.prior.config.num_embeddings + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, num_embeddings * embedding_dim), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim + latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim) + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + noise_pred = self.prior( + scaled_model_input, + timestep=t, + proj_embedding=image_embeds, + ).predicted_image_embedding + + # remove the variance + noise_pred, _ = noise_pred.split( + scaled_model_input.shape[2], dim=2 + ) # batch_size, num_embeddings, embedding_dim + + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond) + + latents = self.scheduler.step( + noise_pred, + timestep=t, + sample=latents, + ).prev_sample + + if output_type not in ["np", "pil", 
"latent", "mesh"]: + raise ValueError( + f"Only the output types `pil`, `np`, `latent` and `mesh` are supported not output_type={output_type}" + ) + + # Offload all models + self.maybe_free_model_hooks() + + if output_type == "latent": + return ShapEPipelineOutput(images=latents) + + images = [] + if output_type == "mesh": + for i, latent in enumerate(latents): + mesh = self.shap_e_renderer.decode_to_mesh( + latent[None, :], + device, + ) + images.append(mesh) + + else: + # np, pil + for i, latent in enumerate(latents): + image = self.shap_e_renderer.decode_to_image( + latent[None, :], + device, + size=frame_size, + ) + images.append(image) + + images = torch.stack(images) + + images = images.cpu().numpy() + + if output_type == "pil": + images = [self.numpy_to_pil(image) for image in images] + + if not return_dict: + return (images,) + + return ShapEPipelineOutput(images=images) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/renderer.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/renderer.py new file mode 100644 index 000000000..047c6f7dd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/renderer.py @@ -0,0 +1,1050 @@ +# Copyright 2024 Open AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Dict, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin +from ...utils import BaseOutput +from .camera import create_pan_cameras + + +def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: + r""" + Sample from the given discrete probability distribution with replacement. + + The i-th bin is assumed to have mass pmf[i]. + + Args: + pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all() + n_samples: number of samples + + Return: + indices sampled with replacement + """ + + *shape, support_size, last_dim = pmf.shape + assert last_dim == 1 + + cdf = torch.cumsum(pmf.view(-1, support_size), dim=1) + inds = torch.searchsorted(cdf, torch.rand(cdf.shape[0], n_samples, device=cdf.device)) + + return inds.view(*shape, n_samples, 1).clamp(0, support_size - 1) + + +def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor: + """ + Concatenate x and its positional encodings, following NeRF. 
+ + Reference: https://arxiv.org/pdf/2210.04628.pdf + """ + if min_deg == max_deg: + return x + + scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype, device=x.device) + *shape, dim = x.shape + xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1) + assert xb.shape[-1] == dim * (max_deg - min_deg) + emb = torch.cat([xb, xb + math.pi / 2.0], axis=-1).sin() + return torch.cat([x, emb], dim=-1) + + +def encode_position(position): + return posenc_nerf(position, min_deg=0, max_deg=15) + + +def encode_direction(position, direction=None): + if direction is None: + return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8)) + else: + return posenc_nerf(direction, min_deg=0, max_deg=8) + + +def _sanitize_name(x: str) -> str: + return x.replace(".", "__") + + +def integrate_samples(volume_range, ts, density, channels): + r""" + Function integrating the model output. + + Args: + volume_range: Specifies the integral range [t0, t1] + ts: timesteps + density: torch.Tensor [batch_size, *shape, n_samples, 1] + channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] + returns: + channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density + *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume + ) + """ + + # 1. Calculate the weights + _, _, dt = volume_range.partition(ts) + ddensity = density * dt + + mass = torch.cumsum(ddensity, dim=-2) + transmittance = torch.exp(-mass[..., -1, :]) + + alphas = 1.0 - torch.exp(-ddensity) + Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) + # This is the probability of light hitting and reflecting off of + # something at depth [..., i, :]. + weights = alphas * Ts + + # 2. Integrate channels + channels = torch.sum(channels * weights, dim=-2) + + return channels, weights, transmittance + + +def volume_query_points(volume, grid_size): + indices = torch.arange(grid_size**3, device=volume.bbox_min.device) + zs = indices % grid_size + ys = torch.div(indices, grid_size, rounding_mode="trunc") % grid_size + xs = torch.div(indices, grid_size**2, rounding_mode="trunc") % grid_size + combined = torch.stack([xs, ys, zs], dim=1) + return (combined.float() / (grid_size - 1)) * (volume.bbox_max - volume.bbox_min) + volume.bbox_min + + +def _convert_srgb_to_linear(u: torch.Tensor): + return torch.where(u <= 0.04045, u / 12.92, ((u + 0.055) / 1.055) ** 2.4) + + +def _create_flat_edge_indices( + flat_cube_indices: torch.Tensor, + grid_size: Tuple[int, int, int], +): + num_xs = (grid_size[0] - 1) * grid_size[1] * grid_size[2] + y_offset = num_xs + num_ys = grid_size[0] * (grid_size[1] - 1) * grid_size[2] + z_offset = num_xs + num_ys + return torch.stack( + [ + # Edges spanning x-axis. + flat_cube_indices[:, 0] * grid_size[1] * grid_size[2] + + flat_cube_indices[:, 1] * grid_size[2] + + flat_cube_indices[:, 2], + flat_cube_indices[:, 0] * grid_size[1] * grid_size[2] + + (flat_cube_indices[:, 1] + 1) * grid_size[2] + + flat_cube_indices[:, 2], + flat_cube_indices[:, 0] * grid_size[1] * grid_size[2] + + flat_cube_indices[:, 1] * grid_size[2] + + flat_cube_indices[:, 2] + + 1, + flat_cube_indices[:, 0] * grid_size[1] * grid_size[2] + + (flat_cube_indices[:, 1] + 1) * grid_size[2] + + flat_cube_indices[:, 2] + + 1, + # Edges spanning y-axis. 
+ ( + y_offset + + flat_cube_indices[:, 0] * (grid_size[1] - 1) * grid_size[2] + + flat_cube_indices[:, 1] * grid_size[2] + + flat_cube_indices[:, 2] + ), + ( + y_offset + + (flat_cube_indices[:, 0] + 1) * (grid_size[1] - 1) * grid_size[2] + + flat_cube_indices[:, 1] * grid_size[2] + + flat_cube_indices[:, 2] + ), + ( + y_offset + + flat_cube_indices[:, 0] * (grid_size[1] - 1) * grid_size[2] + + flat_cube_indices[:, 1] * grid_size[2] + + flat_cube_indices[:, 2] + + 1 + ), + ( + y_offset + + (flat_cube_indices[:, 0] + 1) * (grid_size[1] - 1) * grid_size[2] + + flat_cube_indices[:, 1] * grid_size[2] + + flat_cube_indices[:, 2] + + 1 + ), + # Edges spanning z-axis. + ( + z_offset + + flat_cube_indices[:, 0] * grid_size[1] * (grid_size[2] - 1) + + flat_cube_indices[:, 1] * (grid_size[2] - 1) + + flat_cube_indices[:, 2] + ), + ( + z_offset + + (flat_cube_indices[:, 0] + 1) * grid_size[1] * (grid_size[2] - 1) + + flat_cube_indices[:, 1] * (grid_size[2] - 1) + + flat_cube_indices[:, 2] + ), + ( + z_offset + + flat_cube_indices[:, 0] * grid_size[1] * (grid_size[2] - 1) + + (flat_cube_indices[:, 1] + 1) * (grid_size[2] - 1) + + flat_cube_indices[:, 2] + ), + ( + z_offset + + (flat_cube_indices[:, 0] + 1) * grid_size[1] * (grid_size[2] - 1) + + (flat_cube_indices[:, 1] + 1) * (grid_size[2] - 1) + + flat_cube_indices[:, 2] + ), + ], + dim=-1, + ) + + +class VoidNeRFModel(nn.Module): + """ + Implements the default empty space model where all queries are rendered as background. + """ + + def __init__(self, background, channel_scale=255.0): + super().__init__() + background = nn.Parameter(torch.from_numpy(np.array(background)).to(dtype=torch.float32) / channel_scale) + + self.register_buffer("background", background) + + def forward(self, position): + background = self.background[None].to(position.device) + + shape = position.shape[:-1] + ones = [1] * (len(shape) - 1) + n_channels = background.shape[-1] + background = torch.broadcast_to(background.view(background.shape[0], *ones, n_channels), [*shape, n_channels]) + + return background + + +@dataclass +class VolumeRange: + t0: torch.Tensor + t1: torch.Tensor + intersected: torch.Tensor + + def __post_init__(self): + assert self.t0.shape == self.t1.shape == self.intersected.shape + + def partition(self, ts): + """ + Partitions t0 and t1 into n_samples intervals. + + Args: + ts: [batch_size, *shape, n_samples, 1] + + Return: + + lower: [batch_size, *shape, n_samples, 1] upper: [batch_size, *shape, n_samples, 1] delta: [batch_size, + *shape, n_samples, 1] + + where + ts \\in [lower, upper] deltas = upper - lower + """ + + mids = (ts[..., 1:, :] + ts[..., :-1, :]) * 0.5 + lower = torch.cat([self.t0[..., None, :], mids], dim=-2) + upper = torch.cat([mids, self.t1[..., None, :]], dim=-2) + delta = upper - lower + assert lower.shape == upper.shape == delta.shape == ts.shape + return lower, upper, delta + + +class BoundingBoxVolume(nn.Module): + """ + Axis-aligned bounding box defined by the two opposite corners. + """ + + def __init__( + self, + *, + bbox_min, + bbox_max, + min_dist: float = 0.0, + min_t_range: float = 1e-3, + ): + """ + Args: + bbox_min: the left/bottommost corner of the bounding box + bbox_max: the other corner of the bounding box + min_dist: all rays should start at least this distance away from the origin. 
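The `intersect` method below is the standard slab test for an axis-aligned box. A single-ray sketch (numpy, not part of the patch) of the same geometry, using the [-1, 1]^3 box that `ShapERenderer` constructs:

```py
# Illustrative single-ray slab test mirroring BoundingBoxVolume.intersect:
# t0 is the latest per-axis entry, t1 the earliest per-axis exit, and the ray
# hits the box iff t0 < t1.
import numpy as np

bbox_min = np.array([-1.0, -1.0, -1.0])
bbox_max = np.array([1.0, 1.0, 1.0])
origin = np.array([0.0, 0.0, -4.0])      # ray starts outside the box
direction = np.array([0.0, 0.0, 1.0])    # and points straight through it

def safe_divide(a, b, eps=1e-6):
    # same epsilon guard as in intersect(), avoids division by zero
    return a / np.where(b < 0, b - eps, b + eps)

ts = np.stack([safe_divide(bbox_min - origin, direction),
               safe_divide(bbox_max - origin, direction)])
t0 = ts.min(axis=0).max()   # latest slab entry
t1 = ts.max(axis=0).min()   # earliest slab exit
print(round(t0, 3), round(t1, 3), t0 < t1)   # 3.0 5.0 True
```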
+ """ + super().__init__() + + self.min_dist = min_dist + self.min_t_range = min_t_range + + self.bbox_min = torch.tensor(bbox_min) + self.bbox_max = torch.tensor(bbox_max) + self.bbox = torch.stack([self.bbox_min, self.bbox_max]) + assert self.bbox.shape == (2, 3) + assert min_dist >= 0.0 + assert min_t_range > 0.0 + + def intersect( + self, + origin: torch.Tensor, + direction: torch.Tensor, + t0_lower: Optional[torch.Tensor] = None, + epsilon=1e-6, + ): + """ + Args: + origin: [batch_size, *shape, 3] + direction: [batch_size, *shape, 3] + t0_lower: Optional [batch_size, *shape, 1] lower bound of t0 when intersecting this volume. + params: Optional meta parameters in case Volume is parametric + epsilon: to stabilize calculations + + Return: + A tuple of (t0, t1, intersected) where each has a shape [batch_size, *shape, 1]. If a ray intersects with + the volume, `o + td` is in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed to + be on the boundary of the volume. + """ + + batch_size, *shape, _ = origin.shape + ones = [1] * len(shape) + bbox = self.bbox.view(1, *ones, 2, 3).to(origin.device) + + def _safe_divide(a, b, epsilon=1e-6): + return a / torch.where(b < 0, b - epsilon, b + epsilon) + + ts = _safe_divide(bbox - origin[..., None, :], direction[..., None, :], epsilon=epsilon) + + # Cases to think about: + # + # 1. t1 <= t0: the ray does not pass through the AABB. + # 2. t0 < t1 <= 0: the ray intersects but the BB is behind the origin. + # 3. t0 <= 0 <= t1: the ray starts from inside the BB + # 4. 0 <= t0 < t1: the ray is not inside and intersects with the BB twice. + # + # 1 and 4 are clearly handled from t0 < t1 below. + # Making t0 at least min_dist (>= 0) takes care of 2 and 3. + t0 = ts.min(dim=-2).values.max(dim=-1, keepdim=True).values.clamp(self.min_dist) + t1 = ts.max(dim=-2).values.min(dim=-1, keepdim=True).values + assert t0.shape == t1.shape == (batch_size, *shape, 1) + if t0_lower is not None: + assert t0.shape == t0_lower.shape + t0 = torch.maximum(t0, t0_lower) + + intersected = t0 + self.min_t_range < t1 + t0 = torch.where(intersected, t0, torch.zeros_like(t0)) + t1 = torch.where(intersected, t1, torch.ones_like(t1)) + + return VolumeRange(t0=t0, t1=t1, intersected=intersected) + + +class StratifiedRaySampler(nn.Module): + """ + Instead of fixed intervals, a sample is drawn uniformly at random from each interval. + """ + + def __init__(self, depth_mode: str = "linear"): + """ + :param depth_mode: linear samples ts linearly in depth. harmonic ensures + closer points are sampled more densely. 
+ """ + self.depth_mode = depth_mode + assert self.depth_mode in ("linear", "geometric", "harmonic") + + def sample( + self, + t0: torch.Tensor, + t1: torch.Tensor, + n_samples: int, + epsilon: float = 1e-3, + ) -> torch.Tensor: + """ + Args: + t0: start time has shape [batch_size, *shape, 1] + t1: finish time has shape [batch_size, *shape, 1] + n_samples: number of ts to sample + Return: + sampled ts of shape [batch_size, *shape, n_samples, 1] + """ + ones = [1] * (len(t0.shape) - 1) + ts = torch.linspace(0, 1, n_samples).view(*ones, n_samples).to(t0.dtype).to(t0.device) + + if self.depth_mode == "linear": + ts = t0 * (1.0 - ts) + t1 * ts + elif self.depth_mode == "geometric": + ts = (t0.clamp(epsilon).log() * (1.0 - ts) + t1.clamp(epsilon).log() * ts).exp() + elif self.depth_mode == "harmonic": + # The original NeRF recommends this interpolation scheme for + # spherical scenes, but there could be some weird edge cases when + # the observer crosses from the inner to outer volume. + ts = 1.0 / (1.0 / t0.clamp(epsilon) * (1.0 - ts) + 1.0 / t1.clamp(epsilon) * ts) + + mids = 0.5 * (ts[..., 1:] + ts[..., :-1]) + upper = torch.cat([mids, t1], dim=-1) + lower = torch.cat([t0, mids], dim=-1) + # yiyi notes: add a random seed here for testing, don't forget to remove + torch.manual_seed(0) + t_rand = torch.rand_like(ts) + + ts = lower + (upper - lower) * t_rand + return ts.unsqueeze(-1) + + +class ImportanceRaySampler(nn.Module): + """ + Given the initial estimate of densities, this samples more from regions/bins expected to have objects. + """ + + def __init__( + self, + volume_range: VolumeRange, + ts: torch.Tensor, + weights: torch.Tensor, + blur_pool: bool = False, + alpha: float = 1e-5, + ): + """ + Args: + volume_range: the range in which a ray intersects the given volume. + ts: earlier samples from the coarse rendering step + weights: discretized version of density * transmittance + blur_pool: if true, use 2-tap max + 2-tap blur filter from mip-NeRF. + alpha: small value to add to weights. + """ + self.volume_range = volume_range + self.ts = ts.clone().detach() + self.weights = weights.clone().detach() + self.blur_pool = blur_pool + self.alpha = alpha + + @torch.no_grad() + def sample(self, t0: torch.Tensor, t1: torch.Tensor, n_samples: int) -> torch.Tensor: + """ + Args: + t0: start time has shape [batch_size, *shape, 1] + t1: finish time has shape [batch_size, *shape, 1] + n_samples: number of ts to sample + Return: + sampled ts of shape [batch_size, *shape, n_samples, 1] + """ + lower, upper, _ = self.volume_range.partition(self.ts) + + batch_size, *shape, n_coarse_samples, _ = self.ts.shape + + weights = self.weights + if self.blur_pool: + padded = torch.cat([weights[..., :1, :], weights, weights[..., -1:, :]], dim=-2) + maxes = torch.maximum(padded[..., :-1, :], padded[..., 1:, :]) + weights = 0.5 * (maxes[..., :-1, :] + maxes[..., 1:, :]) + weights = weights + self.alpha + pmf = weights / weights.sum(dim=-2, keepdim=True) + inds = sample_pmf(pmf, n_samples) + assert inds.shape == (batch_size, *shape, n_samples, 1) + assert (inds >= 0).all() and (inds < n_coarse_samples).all() + + t_rand = torch.rand(inds.shape, device=inds.device) + lower_ = torch.gather(lower, -2, inds) + upper_ = torch.gather(upper, -2, inds) + + ts = lower_ + (upper_ - lower_) * t_rand + ts = torch.sort(ts, dim=-2).values + return ts + + +@dataclass +class MeshDecoderOutput(BaseOutput): + """ + A 3D triangle mesh with optional data at the vertices and faces. 
+ + Args: + verts (`torch.Tensor` of shape `(N, 3)`): + array of vertext coordinates + faces (`torch.Tensor` of shape `(N, 3)`): + array of triangles, pointing to indices in verts. + vertext_channels (Dict): + vertext coordinates for each color channel + """ + + verts: torch.Tensor + faces: torch.Tensor + vertex_channels: Dict[str, torch.Tensor] + + +class MeshDecoder(nn.Module): + """ + Construct meshes from Signed distance functions (SDFs) using marching cubes method + """ + + def __init__(self): + super().__init__() + cases = torch.zeros(256, 5, 3, dtype=torch.long) + masks = torch.zeros(256, 5, dtype=torch.bool) + + self.register_buffer("cases", cases) + self.register_buffer("masks", masks) + + def forward(self, field: torch.Tensor, min_point: torch.Tensor, size: torch.Tensor): + """ + For a signed distance field, produce a mesh using marching cubes. + + :param field: a 3D tensor of field values, where negative values correspond + to the outside of the shape. The dimensions correspond to the x, y, and z directions, respectively. + :param min_point: a tensor of shape [3] containing the point corresponding + to (0, 0, 0) in the field. + :param size: a tensor of shape [3] containing the per-axis distance from the + (0, 0, 0) field corner and the (-1, -1, -1) field corner. + """ + assert len(field.shape) == 3, "input must be a 3D scalar field" + dev = field.device + + cases = self.cases.to(dev) + masks = self.masks.to(dev) + + min_point = min_point.to(dev) + size = size.to(dev) + + grid_size = field.shape + grid_size_tensor = torch.tensor(grid_size).to(size) + + # Create bitmasks between 0 and 255 (inclusive) indicating the state + # of the eight corners of each cube. + bitmasks = (field > 0).to(torch.uint8) + bitmasks = bitmasks[:-1, :, :] | (bitmasks[1:, :, :] << 1) + bitmasks = bitmasks[:, :-1, :] | (bitmasks[:, 1:, :] << 2) + bitmasks = bitmasks[:, :, :-1] | (bitmasks[:, :, 1:] << 4) + + # Compute corner coordinates across the entire grid. + corner_coords = torch.empty(*grid_size, 3, device=dev, dtype=field.dtype) + corner_coords[range(grid_size[0]), :, :, 0] = torch.arange(grid_size[0], device=dev, dtype=field.dtype)[ + :, None, None + ] + corner_coords[:, range(grid_size[1]), :, 1] = torch.arange(grid_size[1], device=dev, dtype=field.dtype)[ + :, None + ] + corner_coords[:, :, range(grid_size[2]), 2] = torch.arange(grid_size[2], device=dev, dtype=field.dtype) + + # Compute all vertices across all edges in the grid, even though we will + # throw some out later. We have (X-1)*Y*Z + X*(Y-1)*Z + X*Y*(Z-1) vertices. + # These are all midpoints, and don't account for interpolation (which is + # done later based on the used edge midpoints). + edge_midpoints = torch.cat( + [ + ((corner_coords[:-1] + corner_coords[1:]) / 2).reshape(-1, 3), + ((corner_coords[:, :-1] + corner_coords[:, 1:]) / 2).reshape(-1, 3), + ((corner_coords[:, :, :-1] + corner_coords[:, :, 1:]) / 2).reshape(-1, 3), + ], + dim=0, + ) + + # Create a flat array of [X, Y, Z] indices for each cube. 
+ cube_indices = torch.zeros( + grid_size[0] - 1, grid_size[1] - 1, grid_size[2] - 1, 3, device=dev, dtype=torch.long + ) + cube_indices[range(grid_size[0] - 1), :, :, 0] = torch.arange(grid_size[0] - 1, device=dev)[:, None, None] + cube_indices[:, range(grid_size[1] - 1), :, 1] = torch.arange(grid_size[1] - 1, device=dev)[:, None] + cube_indices[:, :, range(grid_size[2] - 1), 2] = torch.arange(grid_size[2] - 1, device=dev) + flat_cube_indices = cube_indices.reshape(-1, 3) + + # Create a flat array mapping each cube to 12 global edge indices. + edge_indices = _create_flat_edge_indices(flat_cube_indices, grid_size) + + # Apply the LUT to figure out the triangles. + flat_bitmasks = bitmasks.reshape(-1).long() # must cast to long for indexing to believe this not a mask + local_tris = cases[flat_bitmasks] + local_masks = masks[flat_bitmasks] + # Compute the global edge indices for the triangles. + global_tris = torch.gather(edge_indices, 1, local_tris.reshape(local_tris.shape[0], -1)).reshape( + local_tris.shape + ) + # Select the used triangles for each cube. + selected_tris = global_tris.reshape(-1, 3)[local_masks.reshape(-1)] + + # Now we have a bunch of indices into the full list of possible vertices, + # but we want to reduce this list to only the used vertices. + used_vertex_indices = torch.unique(selected_tris.view(-1)) + used_edge_midpoints = edge_midpoints[used_vertex_indices] + old_index_to_new_index = torch.zeros(len(edge_midpoints), device=dev, dtype=torch.long) + old_index_to_new_index[used_vertex_indices] = torch.arange( + len(used_vertex_indices), device=dev, dtype=torch.long + ) + + # Rewrite the triangles to use the new indices + faces = torch.gather(old_index_to_new_index, 0, selected_tris.view(-1)).reshape(selected_tris.shape) + + # Compute the actual interpolated coordinates corresponding to edge midpoints. + v1 = torch.floor(used_edge_midpoints).to(torch.long) + v2 = torch.ceil(used_edge_midpoints).to(torch.long) + s1 = field[v1[:, 0], v1[:, 1], v1[:, 2]] + s2 = field[v2[:, 0], v2[:, 1], v2[:, 2]] + p1 = (v1.float() / (grid_size_tensor - 1)) * size + min_point + p2 = (v2.float() / (grid_size_tensor - 1)) * size + min_point + # The signs of s1 and s2 should be different. We want to find + # t such that t*s2 + (1-t)*s1 = 0. 
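The comment above states the zero-crossing condition solved for each cut edge; a tiny numeric check (standalone, not part of the patch) makes the interpolation concrete:

```py
# Illustrative check: linear zero-crossing between two grid corners whose SDF
# values have opposite sign.  t = s1 / (s1 - s2) satisfies t*s2 + (1-t)*s1 = 0,
# and the mesh vertex is placed at t*p2 + (1-t)*p1.
import torch

s1, s2 = torch.tensor(0.3), torch.tensor(-0.6)    # field values at the corners
p1 = torch.tensor([0.0, 0.0, 0.0])
p2 = torch.tensor([1.0, 0.0, 0.0])

t = s1 / (s1 - s2)                                 # 0.3 / 0.9 = 1/3
vert = t * p2 + (1 - t) * p1
print(t, t * s2 + (1 - t) * s1, vert)              # 0.3333, ~0.0, [0.3333, 0, 0]
```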
+ t = (s1 / (s1 - s2))[:, None] + verts = t * p2 + (1 - t) * p1 + + return MeshDecoderOutput(verts=verts, faces=faces, vertex_channels=None) + + +@dataclass +class MLPNeRFModelOutput(BaseOutput): + density: torch.Tensor + signed_distance: torch.Tensor + channels: torch.Tensor + ts: torch.Tensor + + +class MLPNeRSTFModel(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + d_hidden: int = 256, + n_output: int = 12, + n_hidden_layers: int = 6, + act_fn: str = "swish", + insert_direction_at: int = 4, + ): + super().__init__() + + # Instantiate the MLP + + # Find out the dimension of encoded position and direction + dummy = torch.eye(1, 3) + d_posenc_pos = encode_position(position=dummy).shape[-1] + d_posenc_dir = encode_direction(position=dummy).shape[-1] + + mlp_widths = [d_hidden] * n_hidden_layers + input_widths = [d_posenc_pos] + mlp_widths + output_widths = mlp_widths + [n_output] + + if insert_direction_at is not None: + input_widths[insert_direction_at] += d_posenc_dir + + self.mlp = nn.ModuleList([nn.Linear(d_in, d_out) for d_in, d_out in zip(input_widths, output_widths)]) + + if act_fn == "swish": + # self.activation = swish + # yiyi testing: + self.activation = lambda x: F.silu(x) + else: + raise ValueError(f"Unsupported activation function {act_fn}") + + self.sdf_activation = torch.tanh + self.density_activation = torch.nn.functional.relu + self.channel_activation = torch.sigmoid + + def map_indices_to_keys(self, output): + h_map = { + "sdf": (0, 1), + "density_coarse": (1, 2), + "density_fine": (2, 3), + "stf": (3, 6), + "nerf_coarse": (6, 9), + "nerf_fine": (9, 12), + } + + mapped_output = {k: output[..., start:end] for k, (start, end) in h_map.items()} + + return mapped_output + + def forward(self, *, position, direction, ts, nerf_level="coarse", rendering_mode="nerf"): + h = encode_position(position) + + h_preact = h + h_directionless = None + for i, layer in enumerate(self.mlp): + if i == self.config.insert_direction_at: # 4 in the config + h_directionless = h_preact + h_direction = encode_direction(position, direction=direction) + h = torch.cat([h, h_direction], dim=-1) + + h = layer(h) + + h_preact = h + + if i < len(self.mlp) - 1: + h = self.activation(h) + + h_final = h + if h_directionless is None: + h_directionless = h_preact + + activation = self.map_indices_to_keys(h_final) + + if nerf_level == "coarse": + h_density = activation["density_coarse"] + else: + h_density = activation["density_fine"] + + if rendering_mode == "nerf": + if nerf_level == "coarse": + h_channels = activation["nerf_coarse"] + else: + h_channels = activation["nerf_fine"] + + elif rendering_mode == "stf": + h_channels = activation["stf"] + + density = self.density_activation(h_density) + signed_distance = self.sdf_activation(activation["sdf"]) + channels = self.channel_activation(h_channels) + + # yiyi notes: I think signed_distance is not used + return MLPNeRFModelOutput(density=density, signed_distance=signed_distance, channels=channels, ts=ts) + + +class ChannelsProj(nn.Module): + def __init__( + self, + *, + vectors: int, + channels: int, + d_latent: int, + ): + super().__init__() + self.proj = nn.Linear(d_latent, vectors * channels) + self.norm = nn.LayerNorm(channels) + self.d_latent = d_latent + self.vectors = vectors + self.channels = channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_bvd = x + w_vcd = self.proj.weight.view(self.vectors, self.channels, self.d_latent) + b_vc = self.proj.bias.view(1, self.vectors, self.channels) + h = 
torch.einsum("bvd,vcd->bvc", x_bvd, w_vcd) + h = self.norm(h) + + h = h + b_vc + return h + + +class ShapEParamsProjModel(ModelMixin, ConfigMixin): + """ + project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP). + + For more details, see the original paper: + """ + + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), + d_latent: int = 1024, + ): + super().__init__() + + # check inputs + if len(param_names) != len(param_shapes): + raise ValueError("Must provide same number of `param_names` as `param_shapes`") + self.projections = nn.ModuleDict({}) + for k, (vectors, channels) in zip(param_names, param_shapes): + self.projections[_sanitize_name(k)] = ChannelsProj( + vectors=vectors, + channels=channels, + d_latent=d_latent, + ) + + def forward(self, x: torch.Tensor): + out = {} + start = 0 + for k, shape in zip(self.config.param_names, self.config.param_shapes): + vectors, _ = shape + end = start + vectors + x_bvd = x[:, start:end] + out[k] = self.projections[_sanitize_name(k)](x_bvd).reshape(len(x), *shape) + start = end + return out + + +class ShapERenderer(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), + d_latent: int = 1024, + d_hidden: int = 256, + n_output: int = 12, + n_hidden_layers: int = 6, + act_fn: str = "swish", + insert_direction_at: int = 4, + background: Tuple[float] = ( + 255.0, + 255.0, + 255.0, + ), + ): + super().__init__() + + self.params_proj = ShapEParamsProjModel( + param_names=param_names, + param_shapes=param_shapes, + d_latent=d_latent, + ) + self.mlp = MLPNeRSTFModel(d_hidden, n_output, n_hidden_layers, act_fn, insert_direction_at) + self.void = VoidNeRFModel(background=background, channel_scale=255.0) + self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) + self.mesh_decoder = MeshDecoder() + + @torch.no_grad() + def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): + """ + Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written below + with some abuse of notations) + + C(r) := sum( + transmittance(t[i]) * integrate( + lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], + ) for i in range(len(parts)) + ) + transmittance(t[-1]) * void_model(t[-1]).channels + + where + + 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through + the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are + obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t + where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the + shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and + transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], + math.inf]) that is evaluated by the void_model (i.e. 
we consider this space to be empty). + + args: + rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: + number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including + + :return: A tuple of + - `channels` + - A importance samplers for additional fine-grained rendering + - raw model output + """ + origin, direction = rays[..., 0, :], rays[..., 1, :] + + # Integrate over [t[i], t[i + 1]] + + # 1 Intersect the rays with the current volume and sample ts to integrate along. + vrange = self.volume.intersect(origin, direction, t0_lower=None) + ts = sampler.sample(vrange.t0, vrange.t1, n_samples) + ts = ts.to(rays.dtype) + + if prev_model_out is not None: + # Append the previous ts now before fprop because previous + # rendering used a different model and we can't reuse the output. + ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values + + batch_size, *_shape, _t0_dim = vrange.t0.shape + _, *ts_shape, _ts_dim = ts.shape + + # 2. Get the points along the ray and query the model + directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) + positions = origin.unsqueeze(-2) + ts * directions + + directions = directions.to(self.mlp.dtype) + positions = positions.to(self.mlp.dtype) + + optional_directions = directions if render_with_direction else None + + model_out = self.mlp( + position=positions, + direction=optional_directions, + ts=ts, + nerf_level="coarse" if prev_model_out is None else "fine", + ) + + # 3. Integrate the model results + channels, weights, transmittance = integrate_samples( + vrange, model_out.ts, model_out.density, model_out.channels + ) + + # 4. Clean up results that do not intersect with the volume. + transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) + channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) + # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + channels = channels + transmittance * self.void(origin) + + weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) + + return channels, weighted_sampler, model_out + + @torch.no_grad() + def decode_to_image( + self, + latents, + device, + size: int = 64, + ray_batch_size: int = 4096, + n_coarse_samples=64, + n_fine_samples=128, + ): + # project the parameters from the generated latents + projected_params = self.params_proj(latents) + + # update the mlp layers of the renderer + for name, param in self.mlp.state_dict().items(): + if f"nerstf.{name}" in projected_params.keys(): + param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) + + # create cameras object + camera = create_pan_cameras(size) + rays = camera.camera_rays + rays = rays.to(device) + n_batches = rays.shape[1] // ray_batch_size + + coarse_sampler = StratifiedRaySampler() + + images = [] + + for idx in range(n_batches): + rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] + + # render rays with coarse, stratified samples. + _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) + # Then, render with additional importance-weighted ray samples. 
+ channels, _, _ = self.render_rays( + rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out + ) + + images.append(channels) + + images = torch.cat(images, dim=1) + images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + + return images + + @torch.no_grad() + def decode_to_mesh( + self, + latents, + device, + grid_size: int = 128, + query_batch_size: int = 4096, + texture_channels: Tuple = ("R", "G", "B"), + ): + # 1. project the parameters from the generated latents + projected_params = self.params_proj(latents) + + # 2. update the mlp layers of the renderer + for name, param in self.mlp.state_dict().items(): + if f"nerstf.{name}" in projected_params.keys(): + param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) + + # 3. decoding with STF rendering + # 3.1 query the SDF values at vertices along a regular 128**3 grid + + query_points = volume_query_points(self.volume, grid_size) + query_positions = query_points[None].repeat(1, 1, 1).to(device=device, dtype=self.mlp.dtype) + + fields = [] + + for idx in range(0, query_positions.shape[1], query_batch_size): + query_batch = query_positions[:, idx : idx + query_batch_size] + + model_out = self.mlp( + position=query_batch, direction=None, ts=None, nerf_level="fine", rendering_mode="stf" + ) + fields.append(model_out.signed_distance) + + # predicted SDF values + fields = torch.cat(fields, dim=1) + fields = fields.float() + + assert ( + len(fields.shape) == 3 and fields.shape[-1] == 1 + ), f"expected [meta_batch x inner_batch] SDF results, but got {fields.shape}" + + fields = fields.reshape(1, *([grid_size] * 3)) + + # create grid 128 x 128 x 128 + # - force a negative border around the SDFs to close off all the models. + full_grid = torch.zeros( + 1, + grid_size + 2, + grid_size + 2, + grid_size + 2, + device=fields.device, + dtype=fields.dtype, + ) + full_grid.fill_(-1.0) + full_grid[:, 1:-1, 1:-1, 1:-1] = fields + fields = full_grid + + # apply a differentiable implementation of Marching Cubes to construct meshs + raw_meshes = [] + mesh_mask = [] + + for field in fields: + raw_mesh = self.mesh_decoder(field, self.volume.bbox_min, self.volume.bbox_max - self.volume.bbox_min) + mesh_mask.append(True) + raw_meshes.append(raw_mesh) + + mesh_mask = torch.tensor(mesh_mask, device=fields.device) + max_vertices = max(len(m.verts) for m in raw_meshes) + + # 3.2. query the texture color head at each vertex of the resulting mesh. 
+ texture_query_positions = torch.stack( + [m.verts[torch.arange(0, max_vertices) % len(m.verts)] for m in raw_meshes], + dim=0, + ) + texture_query_positions = texture_query_positions.to(device=device, dtype=self.mlp.dtype) + + textures = [] + + for idx in range(0, texture_query_positions.shape[1], query_batch_size): + query_batch = texture_query_positions[:, idx : idx + query_batch_size] + + texture_model_out = self.mlp( + position=query_batch, direction=None, ts=None, nerf_level="fine", rendering_mode="stf" + ) + textures.append(texture_model_out.channels) + + # predict texture color + textures = torch.cat(textures, dim=1) + + textures = _convert_srgb_to_linear(textures) + textures = textures.float() + + # 3.3 augument the mesh with texture data + assert len(textures.shape) == 3 and textures.shape[-1] == len( + texture_channels + ), f"expected [meta_batch x inner_batch x texture_channels] field results, but got {textures.shape}" + + for m, texture in zip(raw_meshes, textures): + texture = texture[: len(m.verts)] + m.vertex_channels = dict(zip(texture_channels, texture.unbind(-1))) + + return raw_meshes[0] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py new file mode 100644 index 000000000..5270cb94a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_cascade"] = ["StableCascadeDecoderPipeline"] + _import_structure["pipeline_stable_cascade_combined"] = ["StableCascadeCombinedPipeline"] + _import_structure["pipeline_stable_cascade_prior"] = ["StableCascadePriorPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_stable_cascade import StableCascadeDecoderPipeline + from .pipeline_stable_cascade_combined import StableCascadeCombinedPipeline + from .pipeline_stable_cascade_prior import StableCascadePriorPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py new file mode 100644 index 000000000..a05fb9001 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py @@ -0,0 +1,482 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...models import StableCascadeUNet +from ...schedulers import DDPMWuerstchenScheduler +from ...utils import is_torch_version, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline + + >>> prior_pipe = StableCascadePriorPipeline.from_pretrained( + ... "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16 + ... ).to("cuda") + >>> gen_pipe = StableCascadeDecoderPipeline.from_pretrain( + ... "stabilityai/stable-cascade", torch_dtype=torch.float16 + ... ).to("cuda") + + >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" + >>> prior_output = pipe(prompt) + >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt) + ``` +""" + + +class StableCascadeDecoderPipeline(DiffusionPipeline): + """ + Pipeline for generating images from the Stable Cascade model. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + tokenizer (`CLIPTokenizer`): + The CLIP tokenizer. + text_encoder (`CLIPTextModel`): + The CLIP text encoder. + decoder ([`StableCascadeUNet`]): + The Stable Cascade decoder unet. + vqgan ([`PaellaVQModel`]): + The VQGAN model. + scheduler ([`DDPMWuerstchenScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + latent_dim_scale (float, `optional`, defaults to 10.67): + Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are + height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and + width=int(24*10.67)=256 in order to match the training conditions. 
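A minimal end-to-end sketch of the prior-to-decoder flow described above (not part of the patch, and hedged: it assumes the hosted `stabilityai/stable-cascade*` checkpoints, a CUDA device, and the standard `ImagePipelineOutput.images` field on the decoder output). It also illustrates how `latent_dim_scale` maps the 24x24 prior embeddings to the VQGAN latent resolution:

```py
# Hedged usage sketch: run the prior, then feed its image embeddings to the
# decoder, mirroring the intent of the example docstring above.
import torch
from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline

prior_pipe = StableCascadePriorPipeline.from_pretrained(
    "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
).to("cuda")
decoder_pipe = StableCascadeDecoderPipeline.from_pretrained(
    "stabilityai/stable-cascade", torch_dtype=torch.float16
).to("cuda")

prompt = "an image of a shiba inu, donning a spacesuit and helmet"
prior_output = prior_pipe(prompt)                    # 24x24 image embeddings
images = decoder_pipe(prior_output.image_embeddings, prompt=prompt).images

# latent_dim_scale turns the 24x24 embeddings into 256x256 VQ latents:
print(int(24 * 10.67))                               # 256
```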
+ """ + + unet_name = "decoder" + text_encoder_name = "text_encoder" + model_cpu_offload_seq = "text_encoder->decoder->vqgan" + _callback_tensor_inputs = [ + "latents", + "prompt_embeds_pooled", + "negative_prompt_embeds", + "image_embeddings", + ] + + def __init__( + self, + decoder: StableCascadeUNet, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + scheduler: DDPMWuerstchenScheduler, + vqgan: PaellaVQModel, + latent_dim_scale: float = 10.67, + ) -> None: + super().__init__() + self.register_modules( + decoder=decoder, + tokenizer=tokenizer, + text_encoder=text_encoder, + scheduler=scheduler, + vqgan=vqgan, + ) + self.register_to_config(latent_dim_scale=latent_dim_scale) + + def prepare_latents(self, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler): + batch_size, channels, height, width = image_embeddings.shape + latents_shape = ( + batch_size * num_images_per_prompt, + 4, + int(height * self.config.latent_dim_scale), + int(width * self.config.latent_dim_scale), + ) + + if latents is None: + latents = randn_tensor(latents_shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def encode_prompt( + self, + device, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance, + prompt=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + ): + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + attention_mask = attention_mask[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask.to(device), output_hidden_states=True + ) + prompt_embeds = text_encoder_output.hidden_states[-1] + if prompt_embeds_pooled is None: + prompt_embeds_pooled = text_encoder_output.text_embeds.unsqueeze(1) + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + prompt_embeds_pooled = prompt_embeds_pooled.to(dtype=self.text_encoder.dtype, device=device) + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + prompt_embeds_pooled = prompt_embeds_pooled.repeat_interleave(num_images_per_prompt, dim=0) + + if negative_prompt_embeds is None and do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * 
batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + negative_prompt_embeds_text_encoder_output = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=uncond_input.attention_mask.to(device), + output_hidden_states=True, + ) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.hidden_states[-1] + negative_prompt_embeds_pooled = negative_prompt_embeds_text_encoder_output.text_embeds.unsqueeze(1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + seq_len = negative_prompt_embeds_pooled.shape[1] + negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.to( + dtype=self.text_encoder.dtype, device=device + ) + negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + # done duplicates + + return prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled + + def check_inputs( + self, + prompt, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 10, + guidance_scale: float = 0.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + Image Embeddings either extracted from an image or generated by a Prior Model. + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_inference_steps (`int`, *optional*, defaults to 12): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 0.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting + `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely + linked to the text `prompt`, usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `decoder_guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input + argument. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, + otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image + embeddings. + """ + + # 0. Define commonly used variables + device = self._execution_device + dtype = self.decoder.dtype + self._guidance_scale = guidance_scale + if is_torch_version("<", "2.2.0") and dtype == torch.bfloat16: + raise ValueError("`StableCascadeDecoderPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype.") + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + if isinstance(image_embeddings, list): + image_embeddings = torch.cat(image_embeddings, dim=0) + batch_size = image_embeddings.shape[0] + + # 2. 
Encode caption + if prompt_embeds is None and negative_prompt_embeds is None: + _, prompt_embeds_pooled, _, negative_prompt_embeds_pooled = self.encode_prompt( + prompt=prompt, + device=device, + batch_size=batch_size, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_pooled=prompt_embeds_pooled, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_pooled=negative_prompt_embeds_pooled, + ) + + # The pooled embeds from the prior are pooled again before being passed to the decoder + prompt_embeds_pooled = ( + torch.cat([prompt_embeds_pooled, negative_prompt_embeds_pooled]) + if self.do_classifier_free_guidance + else prompt_embeds_pooled + ) + effnet = ( + torch.cat([image_embeddings, torch.zeros_like(image_embeddings)]) + if self.do_classifier_free_guidance + else image_embeddings + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latents + latents = self.prepare_latents( + image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler + ) + + # 6. Run denoising loop + self._num_timesteps = len(timesteps[:-1]) + for i, t in enumerate(self.progress_bar(timesteps[:-1])): + timestep_ratio = t.expand(latents.size(0)).to(dtype) + + # 7. Denoise latents + predicted_latents = self.decoder( + sample=torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents, + timestep_ratio=torch.cat([timestep_ratio] * 2) if self.do_classifier_free_guidance else timestep_ratio, + clip_text_pooled=prompt_embeds_pooled, + effnet=effnet, + return_dict=False, + )[0] + + # 8. Check for classifier free guidance and apply it + if self.do_classifier_free_guidance: + predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2) + predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale) + + # 9. Renoise latents to next timestep + latents = self.scheduler.step( + model_output=predicted_latents, + timestep=timestep_ratio, + sample=latents, + generator=generator, + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if output_type not in ["pt", "np", "pil", "latent"]: + raise ValueError( + f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}" + ) + + if not output_type == "latent": + # 10. 
Scale and decode the image latents with vq-vae + latents = self.vqgan.config.scale_factor * latents + images = self.vqgan.decode(latents).sample.clamp(0, 1) + if output_type == "np": + images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesnt work + elif output_type == "pil": + images = images.permute(0, 2, 3, 1).cpu().float().numpy() # float() as bfloat16-> numpy doesnt work + images = self.numpy_to_pil(images) + else: + images = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return images + return ImagePipelineOutput(images) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py new file mode 100644 index 000000000..07afdedac --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py @@ -0,0 +1,311 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, Dict, List, Optional, Union + +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import StableCascadeUNet +from ...schedulers import DDPMWuerstchenScheduler +from ...utils import is_torch_version, replace_example_docstring +from ..pipeline_utils import DiffusionPipeline +from ..wuerstchen.modeling_paella_vq_model import PaellaVQModel +from .pipeline_stable_cascade import StableCascadeDecoderPipeline +from .pipeline_stable_cascade_prior import StableCascadePriorPipeline + + +TEXT2IMAGE_EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableCascadeCombinedPipeline + >>> pipe = StableCascadeCombinedPipeline.from_pretrained("stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16) + >>> pipe.enable_model_cpu_offload() + >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" + >>> images = pipe(prompt=prompt) + ``` +""" + + +class StableCascadeCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for text-to-image generation using Stable Cascade. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + tokenizer (`CLIPTokenizer`): + The decoder tokenizer to be used for text inputs. + text_encoder (`CLIPTextModel`): + The decoder text encoder to be used for text inputs. + decoder (`StableCascadeUNet`): + The decoder model to be used for decoder image generation pipeline. + scheduler (`DDPMWuerstchenScheduler`): + The scheduler to be used for decoder image generation pipeline. 
+ vqgan (`PaellaVQModel`): + The VQGAN model to be used for decoder image generation pipeline. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `image_encoder`. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + prior_prior (`StableCascadeUNet`): + The prior model to be used for prior pipeline. + prior_scheduler (`DDPMWuerstchenScheduler`): + The scheduler to be used for prior pipeline. + """ + + _load_connected_pipes = True + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + decoder: StableCascadeUNet, + scheduler: DDPMWuerstchenScheduler, + vqgan: PaellaVQModel, + prior_prior: StableCascadeUNet, + prior_text_encoder: CLIPTextModel, + prior_tokenizer: CLIPTokenizer, + prior_scheduler: DDPMWuerstchenScheduler, + prior_feature_extractor: Optional[CLIPImageProcessor] = None, + prior_image_encoder: Optional[CLIPVisionModelWithProjection] = None, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + decoder=decoder, + scheduler=scheduler, + vqgan=vqgan, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_prior=prior_prior, + prior_scheduler=prior_scheduler, + prior_feature_extractor=prior_feature_extractor, + prior_image_encoder=prior_image_encoder, + ) + self.prior_pipe = StableCascadePriorPipeline( + prior=prior_prior, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + image_encoder=prior_image_encoder, + feature_extractor=prior_feature_extractor, + ) + self.decoder_pipe = StableCascadeDecoderPipeline( + text_encoder=text_encoder, + tokenizer=tokenizer, + decoder=decoder, + scheduler=scheduler, + vqgan=vqgan, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗 + Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a + GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis. + Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower. 
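        A short usage sketch (the checkpoint name and dtype are taken from the example docstring above; choose either
        this method or `enable_model_cpu_offload`, not both):

        ```py
        >>> import torch
        >>> from diffusers import StableCascadeCombinedPipeline

        >>> pipe = StableCascadeCombinedPipeline.from_pretrained(
        ...     "stabilityai/stable-cascade", variant="bf16", torch_dtype=torch.bfloat16
        ... )
        >>> pipe.enable_sequential_cpu_offload()  # lowest memory footprint, slowest inference
        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> image = pipe(prompt=prompt).images[0]
        ```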
+ """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + images: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None, + height: int = 512, + width: int = 512, + prior_num_inference_steps: int = 60, + prior_guidance_scale: float = 4.0, + num_inference_steps: int = 12, + decoder_guidance_scale: float = 0.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"], + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation for the prior and decoder. + images (`torch.Tensor`, `PIL.Image.Image`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, *optional*): + The images to guide the image generation for the prior. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* + prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` + input argument. + negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* + prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` + input argument. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `prior_guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting + `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked + to the text `prompt`, usually at the expense of lower image quality. + prior_num_inference_steps (`Union[int, Dict[float, int]]`, *optional*, defaults to 60): + The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. For more specific timestep spacing, you can pass customized + `prior_timesteps` + num_inference_steps (`int`, *optional*, defaults to 12): + The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at + the expense of slower inference. For more specific timestep spacing, you can pass customized + `timesteps` + decoder_guidance_scale (`float`, *optional*, defaults to 0.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + prior_callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep: + int, callback_kwargs: Dict)`. + prior_callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the + list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in + the `._callback_tensor_inputs` attribute of your pipeine class. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. 
`callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, + otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + dtype = self.decoder_pipe.decoder.dtype + if is_torch_version("<", "2.2.0") and dtype == torch.bfloat16: + raise ValueError( + "`StableCascadeCombinedPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype." + ) + + prior_outputs = self.prior_pipe( + prompt=prompt if prompt_embeds is None else None, + images=images, + height=height, + width=width, + num_inference_steps=prior_num_inference_steps, + guidance_scale=prior_guidance_scale, + negative_prompt=negative_prompt if negative_prompt_embeds is None else None, + prompt_embeds=prompt_embeds, + prompt_embeds_pooled=prompt_embeds_pooled, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_pooled=negative_prompt_embeds_pooled, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + output_type="pt", + return_dict=True, + callback_on_step_end=prior_callback_on_step_end, + callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs, + ) + image_embeddings = prior_outputs.image_embeddings + prompt_embeds = prior_outputs.get("prompt_embeds", None) + prompt_embeds_pooled = prior_outputs.get("prompt_embeds_pooled", None) + negative_prompt_embeds = prior_outputs.get("negative_prompt_embeds", None) + negative_prompt_embeds_pooled = prior_outputs.get("negative_prompt_embeds_pooled", None) + + outputs = self.decoder_pipe( + image_embeddings=image_embeddings, + prompt=prompt if prompt_embeds is None else None, + num_inference_steps=num_inference_steps, + guidance_scale=decoder_guidance_scale, + negative_prompt=negative_prompt if negative_prompt_embeds is None else None, + prompt_embeds=prompt_embeds, + prompt_embeds_pooled=prompt_embeds_pooled, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_pooled=negative_prompt_embeds_pooled, + generator=generator, + output_type=output_type, + return_dict=return_dict, + callback_on_step_end=callback_on_step_end, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + return outputs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py new file mode 100644 index 000000000..24ccc4b88 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py @@ -0,0 +1,638 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from math import ceil +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import StableCascadeUNet +from ...schedulers import DDPMWuerstchenScheduler +from ...utils import BaseOutput, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:] + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableCascadePriorPipeline + + >>> prior_pipe = StableCascadePriorPipeline.from_pretrained( + ... "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16 + ... ).to("cuda") + + >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" + >>> prior_output = pipe(prompt) + ``` +""" + + +@dataclass +class StableCascadePriorPipelineOutput(BaseOutput): + """ + Output class for WuerstchenPriorPipeline. + + Args: + image_embeddings (`torch.FloatTensor` or `np.ndarray`) + Prior image embeddings for text prompt + prompt_embeds (`torch.FloatTensor`): + Text embeddings for the prompt. + negative_prompt_embeds (`torch.FloatTensor`): + Text embeddings for the negative prompt. + """ + + image_embeddings: Union[torch.FloatTensor, np.ndarray] + prompt_embeds: Union[torch.FloatTensor, np.ndarray] + prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] + negative_prompt_embeds: Union[torch.FloatTensor, np.ndarray] + negative_prompt_embeds_pooled: Union[torch.FloatTensor, np.ndarray] + + +class StableCascadePriorPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Stable Cascade. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`StableCascadeUNet`]): + The Stable Cascade prior to approximate the image embedding from the text and/or image embedding. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder ([laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)). + feature_extractor ([`~transformers.CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `image_encoder`. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`DDPMWuerstchenScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. 
+ resolution_multiple ('float', *optional*, defaults to 42.67): + Default resolution for multiple images generated. + """ + + unet_name = "prior" + text_encoder_name = "text_encoder" + model_cpu_offload_seq = "image_encoder->text_encoder->prior" + _optional_components = ["image_encoder", "feature_extractor"] + _callback_tensor_inputs = ["latents", "text_encoder_hidden_states", "negative_prompt_embeds"] + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + prior: StableCascadeUNet, + scheduler: DDPMWuerstchenScheduler, + resolution_multiple: float = 42.67, + feature_extractor: Optional[CLIPImageProcessor] = None, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + ) -> None: + super().__init__() + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + prior=prior, + scheduler=scheduler, + ) + self.register_to_config(resolution_multiple=resolution_multiple) + + def prepare_latents( + self, batch_size, height, width, num_images_per_prompt, dtype, device, generator, latents, scheduler + ): + latent_shape = ( + num_images_per_prompt * batch_size, + self.prior.config.in_channels, + ceil(height / self.config.resolution_multiple), + ceil(width / self.config.resolution_multiple), + ) + + if latents is None: + latents = randn_tensor(latent_shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != latent_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latent_shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def encode_prompt( + self, + device, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance, + prompt=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + ): + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + attention_mask = attention_mask[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask.to(device), output_hidden_states=True + ) + prompt_embeds = text_encoder_output.hidden_states[-1] + if prompt_embeds_pooled is None: + prompt_embeds_pooled = text_encoder_output.text_embeds.unsqueeze(1) + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + prompt_embeds_pooled = prompt_embeds_pooled.to(dtype=self.text_encoder.dtype, device=device) + prompt_embeds 
= prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + prompt_embeds_pooled = prompt_embeds_pooled.repeat_interleave(num_images_per_prompt, dim=0) + + if negative_prompt_embeds is None and do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + negative_prompt_embeds_text_encoder_output = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=uncond_input.attention_mask.to(device), + output_hidden_states=True, + ) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.hidden_states[-1] + negative_prompt_embeds_pooled = negative_prompt_embeds_text_encoder_output.text_embeds.unsqueeze(1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + seq_len = negative_prompt_embeds_pooled.shape[1] + negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.to( + dtype=self.text_encoder.dtype, device=device + ) + negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds_pooled = negative_prompt_embeds_pooled.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + # done duplicates + + return prompt_embeds, prompt_embeds_pooled, negative_prompt_embeds, negative_prompt_embeds_pooled + + def encode_image(self, images, device, dtype, batch_size, num_images_per_prompt): + image_embeds = [] + for image in images: + image = self.feature_extractor(image, return_tensors="pt").pixel_values + image = image.to(device=device, dtype=dtype) + image_embed = self.image_encoder(image).image_embeds.unsqueeze(1) + image_embeds.append(image_embed) + image_embeds = torch.cat(image_embeds, dim=1) + + image_embeds = image_embeds.repeat(batch_size * num_images_per_prompt, 1, 1) + negative_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, negative_image_embeds + + def check_inputs( + self, + prompt, + images=None, + image_embeds=None, + negative_prompt=None, + prompt_embeds=None, + prompt_embeds_pooled=None, + negative_prompt_embeds=None, + negative_prompt_embeds_pooled=None, + callback_on_step_end_tensor_inputs=None, + ): + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in 
{self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and prompt_embeds_pooled is None: + raise ValueError( + "If `prompt_embeds` are provided, `prompt_embeds_pooled` must also be provided. Make sure to generate `prompt_embeds_pooled` from the same text encoder that was used to generate `prompt_embeds`" + ) + + if negative_prompt_embeds is not None and negative_prompt_embeds_pooled is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_pooled` must also be provided. Make sure to generate `prompt_embeds_pooled` from the same text encoder that was used to generate `prompt_embeds`" + ) + + if prompt_embeds_pooled is not None and negative_prompt_embeds_pooled is not None: + if prompt_embeds_pooled.shape != negative_prompt_embeds_pooled.shape: + raise ValueError( + "`prompt_embeds_pooled` and `negative_prompt_embeds_pooled` must have the same shape when passed" + f"directly, but got: `prompt_embeds_pooled` {prompt_embeds_pooled.shape} !=" + f"`negative_prompt_embeds_pooled` {negative_prompt_embeds_pooled.shape}." + ) + + if image_embeds is not None and images is not None: + raise ValueError( + f"Cannot forward both `images`: {images} and `image_embeds`: {image_embeds}. Please make sure to" + " only forward one of the two." + ) + + if images: + for i, image in enumerate(images): + if not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image): + raise TypeError( + f"'images' must contain images of type 'torch.Tensor' or 'PIL.Image.Image, but got" + f"{type(image)} for image number {i}." 
+ ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + def get_timestep_ratio_conditioning(self, t, alphas_cumprod): + s = torch.tensor([0.003]) + clamp_range = [0, 1] + min_var = torch.cos(s / (1 + s) * torch.pi * 0.5) ** 2 + var = alphas_cumprod[t] + var = var.clamp(*clamp_range) + s, min_var = s.to(var.device), min_var.to(var.device) + ratio = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s + return ratio + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + images: Union[torch.Tensor, PIL.Image.Image, List[torch.Tensor], List[PIL.Image.Image]] = None, + height: int = 1024, + width: int = 1024, + num_inference_steps: int = 20, + timesteps: List[float] = None, + guidance_scale: float = 4.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds_pooled: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pt", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to 1024): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 1024): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 60): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 8.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting + `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely + linked to the text `prompt`, usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `decoder_guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. 
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + negative_prompt_embeds_pooled (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt` input + argument. + image_embeds (`torch.FloatTensor`, *optional*): + Pre-generated image embeddings. Can be used to easily tweak image inputs, *e.g.* prompt weighting. + If not provided, image embeddings will be generated from `image` input argument if existing. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`StableCascadePriorPipelineOutput`] or `tuple` [`StableCascadePriorPipelineOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated image embeddings. + """ + + # 0. Define commonly used variables + device = self._execution_device + dtype = next(self.prior.parameters()).dtype + self._guidance_scale = guidance_scale + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # 1. Check inputs. 
Raise error if not correct + self.check_inputs( + prompt, + images=images, + image_embeds=image_embeds, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_pooled=prompt_embeds_pooled, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_pooled=negative_prompt_embeds_pooled, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + # 2. Encode caption + images + ( + prompt_embeds, + prompt_embeds_pooled, + negative_prompt_embeds, + negative_prompt_embeds_pooled, + ) = self.encode_prompt( + prompt=prompt, + device=device, + batch_size=batch_size, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + prompt_embeds_pooled=prompt_embeds_pooled, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_pooled=negative_prompt_embeds_pooled, + ) + + if images is not None: + image_embeds_pooled, uncond_image_embeds_pooled = self.encode_image( + images=images, + device=device, + dtype=dtype, + batch_size=batch_size, + num_images_per_prompt=num_images_per_prompt, + ) + elif image_embeds is not None: + image_embeds_pooled = image_embeds.repeat(batch_size * num_images_per_prompt, 1, 1) + uncond_image_embeds_pooled = torch.zeros_like(image_embeds_pooled) + else: + image_embeds_pooled = torch.zeros( + batch_size * num_images_per_prompt, + 1, + self.prior.config.clip_image_in_channels, + device=device, + dtype=dtype, + ) + uncond_image_embeds_pooled = torch.zeros( + batch_size * num_images_per_prompt, + 1, + self.prior.config.clip_image_in_channels, + device=device, + dtype=dtype, + ) + + if self.do_classifier_free_guidance: + image_embeds = torch.cat([image_embeds_pooled, uncond_image_embeds_pooled], dim=0) + else: + image_embeds = image_embeds_pooled + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_encoder_hidden_states = ( + torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds + ) + text_encoder_pooled = ( + torch.cat([prompt_embeds_pooled, negative_prompt_embeds_pooled]) + if negative_prompt_embeds is not None + else prompt_embeds_pooled + ) + + # 4. Prepare and set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latents + latents = self.prepare_latents( + batch_size, height, width, num_images_per_prompt, dtype, device, generator, latents, self.scheduler + ) + + if isinstance(self.scheduler, DDPMWuerstchenScheduler): + timesteps = timesteps[:-1] + else: + if self.scheduler.config.clip_sample: + self.scheduler.config.clip_sample = False # disample sample clipping + logger.warning(" set `clip_sample` to be False") + # 6. 
Run denoising loop + if hasattr(self.scheduler, "betas"): + alphas = 1.0 - self.scheduler.betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + else: + alphas_cumprod = [] + + self._num_timesteps = len(timesteps) + for i, t in enumerate(self.progress_bar(timesteps)): + if not isinstance(self.scheduler, DDPMWuerstchenScheduler): + if len(alphas_cumprod) > 0: + timestep_ratio = self.get_timestep_ratio_conditioning(t.long().cpu(), alphas_cumprod) + timestep_ratio = timestep_ratio.expand(latents.size(0)).to(dtype).to(device) + else: + timestep_ratio = t.float().div(self.scheduler.timesteps[-1]).expand(latents.size(0)).to(dtype) + else: + timestep_ratio = t.expand(latents.size(0)).to(dtype) + # 7. Denoise image embeddings + predicted_image_embedding = self.prior( + sample=torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents, + timestep_ratio=torch.cat([timestep_ratio] * 2) if self.do_classifier_free_guidance else timestep_ratio, + clip_text_pooled=text_encoder_pooled, + clip_text=text_encoder_hidden_states, + clip_img=image_embeds, + return_dict=False, + )[0] + + # 8. Check for classifier free guidance and apply it + if self.do_classifier_free_guidance: + predicted_image_embedding_text, predicted_image_embedding_uncond = predicted_image_embedding.chunk(2) + predicted_image_embedding = torch.lerp( + predicted_image_embedding_uncond, predicted_image_embedding_text, self.guidance_scale + ) + + # 9. Renoise latents to next timestep + if not isinstance(self.scheduler, DDPMWuerstchenScheduler): + timestep_ratio = t + latents = self.scheduler.step( + model_output=predicted_image_embedding, timestep=timestep_ratio, sample=latents, generator=generator + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # Offload all models + self.maybe_free_model_hooks() + + if output_type == "np": + latents = latents.cpu().float().numpy() # float() as bfloat16-> numpy doesnt work + prompt_embeds = prompt_embeds.cpu().float().numpy() # float() as bfloat16-> numpy doesnt work + negative_prompt_embeds = ( + negative_prompt_embeds.cpu().float().numpy() if negative_prompt_embeds is not None else None + ) # float() as bfloat16-> numpy doesnt work + + if not return_dict: + return ( + latents, + prompt_embeds, + prompt_embeds_pooled, + negative_prompt_embeds, + negative_prompt_embeds_pooled, + ) + + return StableCascadePriorPipelineOutput( + image_embeddings=latents, + prompt_embeds=prompt_embeds, + prompt_embeds_pooled=prompt_embeds_pooled, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_pooled=negative_prompt_embeds_pooled, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/README.md new file mode 100644 index 000000000..5b6424308 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/README.md @@ -0,0 +1,176 @@ +# Stable Diffusion + +## Overview + +Stable Diffusion was proposed in [Stable Diffusion 
Announcement](https://stability.ai/blog/stable-diffusion-announcement) by Patrick Esser and Robin Rombach and the Stability AI team. + +The summary of the model is the following: + +*Stable Diffusion is a text-to-image model that will empower billions of people to create stunning art within seconds. It is a breakthrough in speed and quality meaning that it can run on consumer GPUs. You can see some of the amazing output that has been created by this model without pre or post-processing on this page. The model itself builds upon the work of the team at CompVis and Runway in their widely used latent diffusion model combined with insights from the conditional diffusion models by our lead generative AI developer Katherine Crowson, Dall-E 2 by Open AI, Imagen by Google Brain and many others. We are delighted that AI media generation is a cooperative field and hope it can continue this way to bring the gift of creativity to all.* + +## Tips: + +- Stable Diffusion has the same architecture as [Latent Diffusion](https://arxiv.org/abs/2112.10752) but uses a frozen CLIP Text Encoder instead of training the text encoder jointly with the diffusion model. +- A detailed explanation of the Stable Diffusion model can be found in [Stable Diffusion with 🧨 Diffusers](https://huggingface.co/blog/stable_diffusion). +- If you don't want to rely on the Hugging Face Hub or pass an authentication token, you can +download the weights with `git lfs install; git clone https://huggingface.co/runwayml/stable-diffusion-v1-5` and instead pass the local path to the cloned folder to `from_pretrained` as shown below. +- Stable Diffusion can work with a variety of different samplers as is shown below. + +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_stable_diffusion.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) +| [pipeline_stable_diffusion_img2img](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) +| [pipeline_stable_diffusion_inpaint](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py) | *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) + +## Examples: + +### Using Stable Diffusion without being logged into the Hub + +If you want to download the model weights using a single Python line, you need to be logged in via `huggingface-cli login`. + +```python +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +``` + +This, however, can make it difficult to build applications on top of `diffusers`, as you will always have to pass the token around.
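+For example, with the Hub-only workflow the credentials have to travel with every place that constructs the pipeline (a minimal sketch, assuming the `use_auth_token` argument accepted by `from_pretrained`):
+
+```python
+from diffusers import DiffusionPipeline
+
+# `use_auth_token=True` reads the token saved by `huggingface-cli login`;
+# an explicit token string would otherwise need to be threaded through the application.
+pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_auth_token=True)
+```
+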
A potential way to solve this issue is by downloading the weights to a local path `"./stable-diffusion-v1-5"`: + +``` +git lfs install +git clone https://huggingface.co/runwayml/stable-diffusion-v1-5 +``` + +and simply passing the local path to `from_pretrained`: + +```python +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("./stable-diffusion-v1-5") +``` + +### Text-to-Image with default PLMS scheduler + +```python +# make sure you're logged in with `huggingface-cli login` +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +pipe = pipe.to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] + +image.save("astronaut_rides_horse.png") +``` + +### Text-to-Image with DDIM scheduler + +```python +# make sure you're logged in with `huggingface-cli login` +from diffusers import StableDiffusionPipeline, DDIMScheduler + +scheduler = DDIMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + scheduler=scheduler, +).to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] + +image.save("astronaut_rides_horse.png") +``` + +### Text-to-Image with K-LMS scheduler + +```python +# make sure you're logged in with `huggingface-cli login` +from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler + +lms = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + scheduler=lms, +).to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] + +image.save("astronaut_rides_horse.png") +``` + +### CycleDiffusion using Stable Diffusion and DDIM scheduler + +```python +import requests +import torch +from PIL import Image +from io import BytesIO + +from diffusers import CycleDiffusionPipeline, DDIMScheduler + + +# load the scheduler. CycleDiffusion only supports stochastic schedulers. 
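+# (a DDIM scheduler works here because the non-zero `eta` passed to the pipeline call below
+# makes its updates stochastic, which is what CycleDiffusion relies on)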
+ +# load the pipeline +# make sure you're logged in with `huggingface-cli login` +model_id_or_path = "CompVis/stable-diffusion-v1-4" +scheduler = DDIMScheduler.from_pretrained(model_id_or_path, subfolder="scheduler") +pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path, scheduler=scheduler).to("cuda") + +# let's download an initial image +url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/An%20astronaut%20riding%20a%20horse.png" +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((512, 512)) +init_image.save("horse.png") + +# let's specify a prompt +source_prompt = "An astronaut riding a horse" +prompt = "An astronaut riding an elephant" + +# call the pipeline +image = pipe( + prompt=prompt, + source_prompt=source_prompt, + image=init_image, + num_inference_steps=100, + eta=0.1, + strength=0.8, + guidance_scale=2, + source_guidance_scale=1, +).images[0] + +image.save("horse_to_elephant.png") + +# let's try another example +# See more samples at the original repo: https://github.com/ChenWu98/cycle-diffusion +url = "https://raw.githubusercontent.com/ChenWu98/cycle-diffusion/main/data/dalle2/A%20black%20colored%20car.png" +response = requests.get(url) +init_image = Image.open(BytesIO(response.content)).convert("RGB") +init_image = init_image.resize((512, 512)) +init_image.save("black.png") + +source_prompt = "A black colored car" +prompt = "A blue colored car" + +# call the pipeline +torch.manual_seed(0) +image = pipe( + prompt=prompt, + source_prompt=source_prompt, + image=init_image, + num_inference_steps=100, + eta=0.1, + strength=0.85, + guidance_scale=3, + source_guidance_scale=1, +).images[0] + +image.save("black_to_blue.png") +``` diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py new file mode 100644 index 000000000..0eda32d33 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -0,0 +1,203 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_flax_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_onnx_available, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_additional_imports = {} +_import_structure = {"pipeline_output": ["StableDiffusionPipelineOutput"]} + +if is_transformers_available() and is_flax_available(): + _import_structure["pipeline_output"].extend(["FlaxStableDiffusionPipelineOutput"]) +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["clip_image_project_model"] = ["CLIPImageProjection"] + _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion"] = ["StableDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] + 
_import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"] + _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"] + _import_structure["pipeline_stable_diffusion_img2img"] = ["StableDiffusionImg2ImgPipeline"] + _import_structure["pipeline_stable_diffusion_inpaint"] = ["StableDiffusionInpaintPipeline"] + _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"] + _import_structure["pipeline_stable_diffusion_instruct_pix2pix"] = ["StableDiffusionInstructPix2PixPipeline"] + _import_structure["pipeline_stable_diffusion_latent_upscale"] = ["StableDiffusionLatentUpscalePipeline"] + _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"] + _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"] + _import_structure["pipeline_stable_diffusion_upscale"] = ["StableDiffusionUpscalePipeline"] + _import_structure["pipeline_stable_unclip"] = ["StableUnCLIPPipeline"] + _import_structure["pipeline_stable_unclip_img2img"] = ["StableUnCLIPImg2ImgPipeline"] + _import_structure["safety_checker"] = ["StableDiffusionSafetyChecker"] + _import_structure["stable_unclip_image_normalizer"] = ["StableUnCLIPImageNormalizer"] +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + StableDiffusionImageVariationPipeline, + ) + + _dummy_objects.update({"StableDiffusionImageVariationPipeline": StableDiffusionImageVariationPipeline}) +else: + _import_structure["pipeline_stable_diffusion_image_variation"] = ["StableDiffusionImageVariationPipeline"] +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + StableDiffusionDepth2ImgPipeline, + ) + + _dummy_objects.update( + { + "StableDiffusionDepth2ImgPipeline": StableDiffusionDepth2ImgPipeline, + } + ) +else: + _import_structure["pipeline_stable_diffusion_depth2img"] = ["StableDiffusionDepth2ImgPipeline"] + +try: + if not (is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_onnx_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_onnx_objects)) +else: + _import_structure["pipeline_onnx_stable_diffusion"] = [ + "OnnxStableDiffusionPipeline", + "StableDiffusionOnnxPipeline", + ] + _import_structure["pipeline_onnx_stable_diffusion_img2img"] = ["OnnxStableDiffusionImg2ImgPipeline"] + _import_structure["pipeline_onnx_stable_diffusion_inpaint"] = ["OnnxStableDiffusionInpaintPipeline"] + _import_structure["pipeline_onnx_stable_diffusion_inpaint_legacy"] = ["OnnxStableDiffusionInpaintPipelineLegacy"] + _import_structure["pipeline_onnx_stable_diffusion_upscale"] = ["OnnxStableDiffusionUpscalePipeline"] + +if is_transformers_available() and is_flax_available(): + from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState + + _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState}) + _import_structure["pipeline_flax_stable_diffusion"] = ["FlaxStableDiffusionPipeline"] + 
_import_structure["pipeline_flax_stable_diffusion_img2img"] = ["FlaxStableDiffusionImg2ImgPipeline"] + _import_structure["pipeline_flax_stable_diffusion_inpaint"] = ["FlaxStableDiffusionInpaintPipeline"] + _import_structure["safety_checker_flax"] = ["FlaxStableDiffusionSafetyChecker"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + + else: + from .clip_image_project_model import CLIPImageProjection + from .pipeline_stable_diffusion import ( + StableDiffusionPipeline, + StableDiffusionPipelineOutput, + StableDiffusionSafetyChecker, + ) + from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline + from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline + from .pipeline_stable_diffusion_instruct_pix2pix import ( + StableDiffusionInstructPix2PixPipeline, + ) + from .pipeline_stable_diffusion_latent_upscale import ( + StableDiffusionLatentUpscalePipeline, + ) + from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline + from .pipeline_stable_unclip import StableUnCLIPPipeline + from .pipeline_stable_unclip_img2img import StableUnCLIPImg2ImgPipeline + from .safety_checker import StableDiffusionSafetyChecker + from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer + + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + StableDiffusionImageVariationPipeline, + ) + else: + from .pipeline_stable_diffusion_image_variation import ( + StableDiffusionImageVariationPipeline, + ) + + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.26.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import StableDiffusionDepth2ImgPipeline + else: + from .pipeline_stable_diffusion_depth2img import ( + StableDiffusionDepth2ImgPipeline, + ) + + try: + if not (is_transformers_available() and is_onnx_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_onnx_objects import * + else: + from .pipeline_onnx_stable_diffusion import ( + OnnxStableDiffusionPipeline, + StableDiffusionOnnxPipeline, + ) + from .pipeline_onnx_stable_diffusion_img2img import ( + OnnxStableDiffusionImg2ImgPipeline, + ) + from .pipeline_onnx_stable_diffusion_inpaint import ( + OnnxStableDiffusionInpaintPipeline, + ) + from .pipeline_onnx_stable_diffusion_upscale import ( + OnnxStableDiffusionUpscalePipeline, + ) + + try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_flax_objects import * + else: + from .pipeline_flax_stable_diffusion import FlaxStableDiffusionPipeline + from .pipeline_flax_stable_diffusion_img2img import ( + FlaxStableDiffusionImg2ImgPipeline, + ) + from .pipeline_flax_stable_diffusion_inpaint import ( + FlaxStableDiffusionInpaintPipeline, + ) + from .pipeline_output import FlaxStableDiffusionPipelineOutput + from .safety_checker_flax import FlaxStableDiffusionSafetyChecker + +else: + import sys + + 
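+    # At import time only `_import_structure` is registered with `_LazyModule` below; the
+    # pipeline submodules listed above are imported lazily, on first attribute access
+    # (e.g. `from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline`).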
sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) + for name, value in _additional_imports.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py new file mode 100644 index 000000000..71f9d9714 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py @@ -0,0 +1,29 @@ +# Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models.modeling_utils import ModelMixin + + +class CLIPImageProjection(ModelMixin, ConfigMixin): + @register_to_config + def __init__(self, hidden_size: int = 768): + super().__init__() + self.hidden_size = hidden_size + self.project = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + def forward(self, x): + return self.project(x) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py new file mode 100644 index 000000000..30c3c5b51 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -0,0 +1,1860 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Conversion script for the Stable Diffusion checkpoints.""" + +import re +from contextlib import nullcontext +from io import BytesIO +from typing import Dict, Optional, Union + +import requests +import torch +import yaml +from transformers import ( + AutoFeatureExtractor, + BertTokenizerFast, + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from ...models import ( + AutoencoderKL, + ControlNetModel, + PriorTransformer, + UNet2DConditionModel, +) +from ...schedulers import ( + DDIMScheduler, + DDPMScheduler, + DPMSolverMultistepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UnCLIPScheduler, +) +from ...utils import is_accelerate_available, logging +from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel +from ..paint_by_example import PaintByExampleImageEncoder +from ..pipeline_utils import DiffusionPipeline +from .safety_checker import StableDiffusionSafetyChecker +from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer + + +if is_accelerate_available(): + from accelerate import init_empty_weights + from accelerate.utils import set_module_tensor_to_device + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. + """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +def renew_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item.replace("in_layers.0", "norm1") + new_item = new_item.replace("in_layers.2", "conv1") + + new_item = new_item.replace("out_layers.0", "norm2") + new_item = new_item.replace("out_layers.3", "conv2") + + new_item = new_item.replace("emb_layers.1", "time_emb_proj") + new_item = new_item.replace("skip_connection", "conv_shortcut") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("nin_shortcut", "conv_shortcut") + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def renew_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + # new_item = new_item.replace('norm.weight', 'group_norm.weight') + # new_item = new_item.replace('norm.bias', 'group_norm.bias') + + # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight') + # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias') + + # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + 
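+# Illustrative example of how the path helpers above compose with `assign_to_checkpoint`
+# (defined further below); with the default `n_shave_prefix_segments=0`:
+#
+#     renew_resnet_paths(["input_blocks.1.0.in_layers.2.weight"])
+#     # -> [{"old": "input_blocks.1.0.in_layers.2.weight",
+#     #      "new": "input_blocks.1.0.conv1.weight"}]
+#
+# A block-level replacement such as {"old": "input_blocks.1.0", "new": "down_blocks.0.resnets.0"},
+# passed via `additional_replacements`, then turns that entry into
+# "down_blocks.0.resnets.0.conv1.weight" in the converted state dict.
+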
+def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("q.weight", "to_q.weight") + new_item = new_item.replace("q.bias", "to_q.bias") + + new_item = new_item.replace("k.weight", "to_k.weight") + new_item = new_item.replace("k.bias", "to_k.bias") + + new_item = new_item.replace("v.weight", "to_v.weight") + new_item = new_item.replace("v.bias", "to_v.bias") + + new_item = new_item.replace("proj_out.weight", "to_out.0.weight") + new_item = new_item.replace("proj_out.bias", "to_out.0.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +def assign_to_checkpoint( + paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None +): + """ + This does the final conversion step: take locally converted weights and apply a global renaming to them. It splits + attention layers, and takes into account additional replacements that may arise. + + Assigns the weights to the new checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. + if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // config["num_head_channels"] // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) + + for path in paths: + new_path = path["new"] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + is_attn_weight = "proj_attn.weight" in new_path or ("attentions" in new_path and "to_" in new_path) + shape = old_checkpoint[path["old"]].shape + if is_attn_weight and len(shape) == 3: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + elif is_attn_weight and len(shape) == 4: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = 
checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_unet_diffusers_config(original_config, image_size: int, controlnet=False): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + if controlnet: + unet_params = original_config["model"]["params"]["control_stage_config"]["params"] + else: + if ( + "unet_config" in original_config["model"]["params"] + and original_config["model"]["params"]["unet_config"] is not None + ): + unet_params = original_config["model"]["params"]["unet_config"]["params"] + else: + unet_params = original_config["model"]["params"]["network_config"]["params"] + + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + + block_out_channels = [unet_params["model_channels"] * mult for mult in unet_params["channel_mult"]] + + down_block_types = [] + resolution = 1 + for i in range(len(block_out_channels)): + block_type = "CrossAttnDownBlock2D" if resolution in unet_params["attention_resolutions"] else "DownBlock2D" + down_block_types.append(block_type) + if i != len(block_out_channels) - 1: + resolution *= 2 + + up_block_types = [] + for i in range(len(block_out_channels)): + block_type = "CrossAttnUpBlock2D" if resolution in unet_params["attention_resolutions"] else "UpBlock2D" + up_block_types.append(block_type) + resolution //= 2 + + if unet_params["transformer_depth"] is not None: + transformer_layers_per_block = ( + unet_params["transformer_depth"] + if isinstance(unet_params["transformer_depth"], int) + else list(unet_params["transformer_depth"]) + ) + else: + transformer_layers_per_block = 1 + + vae_scale_factor = 2 ** (len(vae_params["ch_mult"]) - 1) + + head_dim = unet_params["num_heads"] if "num_heads" in unet_params else None + use_linear_projection = ( + unet_params["use_linear_in_transformer"] if "use_linear_in_transformer" in unet_params else False + ) + if use_linear_projection: + # stable diffusion 2-base-512 and 2-768 + if head_dim is None: + head_dim_mult = unet_params["model_channels"] // unet_params["num_head_channels"] + head_dim = [head_dim_mult * c for c in list(unet_params["channel_mult"])] + + class_embed_type = None + addition_embed_type = None + addition_time_embed_dim = None + projection_class_embeddings_input_dim = None + context_dim = None + + if unet_params["context_dim"] is not None: + context_dim = ( + unet_params["context_dim"] + if isinstance(unet_params["context_dim"], int) + else unet_params["context_dim"][0] + ) + + if "num_classes" in unet_params: + if unet_params["num_classes"] == "sequential": + if context_dim in [2048, 1280]: + # SDXL + addition_embed_type = "text_time" + addition_time_embed_dim = 256 + else: + class_embed_type = "projection" + assert "adm_in_channels" in unet_params + projection_class_embeddings_input_dim = unet_params["adm_in_channels"] + + config = { + "sample_size": image_size // vae_scale_factor, + "in_channels": unet_params["in_channels"], + "down_block_types": tuple(down_block_types), + "block_out_channels": tuple(block_out_channels), + "layers_per_block": unet_params["num_res_blocks"], + "cross_attention_dim": context_dim, + "attention_head_dim": head_dim, + "use_linear_projection": use_linear_projection, + "class_embed_type": class_embed_type, + "addition_embed_type": addition_embed_type, + "addition_time_embed_dim": addition_time_embed_dim, + "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + 
"transformer_layers_per_block": transformer_layers_per_block, + } + + if "disable_self_attentions" in unet_params: + config["only_cross_attention"] = unet_params["disable_self_attentions"] + + if "num_classes" in unet_params and isinstance(unet_params["num_classes"], int): + config["num_class_embeds"] = unet_params["num_classes"] + + if controlnet: + config["conditioning_channels"] = unet_params["hint_channels"] + else: + config["out_channels"] = unet_params["out_channels"] + config["up_block_types"] = tuple(up_block_types) + + return config + + +def create_vae_diffusers_config(original_config, image_size: int): + """ + Creates a config for the diffusers based on the config of the LDM model. + """ + vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"] + _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"] + + block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]] + down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels) + up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels) + + config = { + "sample_size": image_size, + "in_channels": vae_params["in_channels"], + "out_channels": vae_params["out_ch"], + "down_block_types": tuple(down_block_types), + "up_block_types": tuple(up_block_types), + "block_out_channels": tuple(block_out_channels), + "latent_channels": vae_params["z_channels"], + "layers_per_block": vae_params["num_res_blocks"], + } + return config + + +def create_diffusers_schedular(original_config): + schedular = DDIMScheduler( + num_train_timesteps=original_config["model"]["params"]["timesteps"], + beta_start=original_config["model"]["params"]["linear_start"], + beta_end=original_config["model"]["params"]["linear_end"], + beta_schedule="scaled_linear", + ) + return schedular + + +def create_ldm_bert_config(original_config): + bert_params = original_config["model"]["params"]["cond_stage_config"]["params"] + config = LDMBertConfig( + d_model=bert_params.n_embed, + encoder_layers=bert_params.n_layer, + encoder_ffn_dim=bert_params.n_embed * 4, + ) + return config + + +def convert_ldm_unet_checkpoint( + checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False +): + """ + Takes a state dict and a config, and returns a converted checkpoint. + """ + + if skip_extract_state_dict: + unet_state_dict = checkpoint + else: + # extract state_dict for UNet + unet_state_dict = {} + keys = list(checkpoint.keys()) + + if controlnet: + unet_key = "control_model." + else: + unet_key = "model.diffusion_model." + + # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA + if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: + logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.") + logger.warning( + "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" + " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." + ) + for key in keys: + if key.startswith("model.diffusion_model"): + flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + else: + if sum(k.startswith("model_ema") for k in keys) > 100: + logger.warning( + "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" + " weights (usually better for inference), please make sure to add the `--extract_ema` flag." 
+ ) + + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"] + + if config["class_embed_type"] is None: + # No parameters to port + ... + elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection": + new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + else: + raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") + + if config["addition_embed_type"] == "text_time": + new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + + # Relevant to StableDiffusionUpscalePipeline + if "num_class_embeds" in config: + if (config["num_class_embeds"] is not None) and ("label_emb.weight" in unet_state_dict): + new_checkpoint["class_embedding.weight"] = unet_state_dict["label_emb.weight"] + + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] + + if not controlnet: + new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"] + new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"] + new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"] + + # Retrieves the keys for the input blocks only + num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer}) + input_blocks = { + layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key] + for layer_id in range(num_input_blocks) + } + + # Retrieves the keys for the middle blocks only + num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer}) + middle_blocks = { + layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key] + for layer_id in range(num_middle_blocks) + } + + # Retrieves the keys for the output blocks only + num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer}) + output_blocks = { + layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key] + for layer_id in range(num_output_blocks) + } + + for i in range(1, num_input_blocks): + block_id = (i - 1) // (config["layers_per_block"] + 1) + layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1) + + resnets = [ + key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key + ] + attentions = [key for key in 
input_blocks[i] if f"input_blocks.{i}.1" in key] + + if f"input_blocks.{i}.0.op.weight" in unet_state_dict: + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.weight" + ) + new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop( + f"input_blocks.{i}.0.op.bias" + ) + + paths = renew_resnet_paths(resnets) + meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + if len(attentions): + paths = renew_attention_paths(attentions) + + meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + resnet_0 = middle_blocks[0] + attentions = middle_blocks[1] + resnet_1 = middle_blocks[2] + + resnet_0_paths = renew_resnet_paths(resnet_0) + assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config) + + resnet_1_paths = renew_resnet_paths(resnet_1) + assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config) + + attentions_paths = renew_attention_paths(attentions) + meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + for i in range(num_output_blocks): + block_id = i // (config["layers_per_block"] + 1) + layer_in_block_id = i % (config["layers_per_block"] + 1) + output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]] + output_block_list = {} + + for layer in output_block_layers: + layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1) + if layer_id in output_block_list: + output_block_list[layer_id].append(layer_name) + else: + output_block_list[layer_id] = [layer_name] + + if len(output_block_list) > 1: + resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key] + attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key] + + resnet_0_paths = renew_resnet_paths(resnets) + paths = renew_resnet_paths(resnets) + + meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"} + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + + output_block_list = {k: sorted(v) for k, v in output_block_list.items()} + if ["conv.bias", "conv.weight"] in output_block_list.values(): + index = list(output_block_list.values()).index(["conv.bias", "conv.weight"]) + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.weight" + ] + new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[ + f"output_blocks.{i}.{index}.conv.bias" + ] + + # Clear attentions as they have been attributed above. 
+ if len(attentions) == 2: + attentions = [] + + if len(attentions): + paths = renew_attention_paths(attentions) + meta_path = { + "old": f"output_blocks.{i}.1", + "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}", + } + assign_to_checkpoint( + paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config + ) + else: + resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) + for path in resnet_0_paths: + old_path = ".".join(["output_blocks", str(i), path["old"]]) + new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]]) + + new_checkpoint[new_path] = unet_state_dict[old_path] + + if controlnet: + # conditioning embedding + + orig_index = 0 + + new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) + + orig_index += 2 + + diffusers_index = 0 + + while diffusers_index < 6: + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) + diffusers_index += 1 + orig_index += 2 + + new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.weight" + ) + new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop( + f"input_hint_block.{orig_index}.bias" + ) + + # down blocks + for i in range(num_input_blocks): + new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight") + new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias") + + # mid block + new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight") + new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias") + + return new_checkpoint + + +def convert_ldm_vae_checkpoint(checkpoint, config): + # extract state dict for VAE + vae_state_dict = {} + keys = list(checkpoint.keys()) + vae_key = "first_stage_model." 
if any(k.startswith("first_stage_model.") for k in keys) else "" + for key in keys: + if key.startswith(vae_key): + vae_state_dict[key.replace(vae_key, "")] = checkpoint.get(key) + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": 
"mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config) + conv_attn_to_linear(new_checkpoint) + return new_checkpoint + + +def convert_ldm_bert_checkpoint(checkpoint, config): + def _copy_attn_layer(hf_attn_layer, pt_attn_layer): + hf_attn_layer.q_proj.weight.data = pt_attn_layer.to_q.weight + hf_attn_layer.k_proj.weight.data = pt_attn_layer.to_k.weight + hf_attn_layer.v_proj.weight.data = pt_attn_layer.to_v.weight + + hf_attn_layer.out_proj.weight = pt_attn_layer.to_out.weight + hf_attn_layer.out_proj.bias = pt_attn_layer.to_out.bias + + def _copy_linear(hf_linear, pt_linear): + hf_linear.weight = pt_linear.weight + hf_linear.bias = pt_linear.bias + + def _copy_layer(hf_layer, pt_layer): + # copy layer norms + _copy_linear(hf_layer.self_attn_layer_norm, pt_layer[0][0]) + _copy_linear(hf_layer.final_layer_norm, pt_layer[1][0]) + + # copy attn + _copy_attn_layer(hf_layer.self_attn, pt_layer[0][1]) + + # copy MLP + pt_mlp = pt_layer[1][1] + _copy_linear(hf_layer.fc1, pt_mlp.net[0][0]) + _copy_linear(hf_layer.fc2, pt_mlp.net[2]) + + def _copy_layers(hf_layers, pt_layers): + for i, hf_layer in enumerate(hf_layers): + if i != 0: + i += i + pt_layer = pt_layers[i : i + 2] + _copy_layer(hf_layer, pt_layer) + + hf_model = LDMBertModel(config).eval() + + # copy embeds + hf_model.model.embed_tokens.weight = checkpoint.transformer.token_emb.weight + hf_model.model.embed_positions.weight.data = checkpoint.transformer.pos_emb.emb.weight + + # copy layer norm + _copy_linear(hf_model.model.layer_norm, checkpoint.transformer.norm) + + # copy hidden layers + _copy_layers(hf_model.model.layers, checkpoint.transformer.attn_layers.layers) + + _copy_linear(hf_model.to_logits, checkpoint.transformer.to_logits) + + return hf_model + + +def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None): + if text_encoder is None: + config_name = 
"openai/clip-vit-large-patch14" + try: + config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: 'openai/clip-vit-large-patch14'." + ) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + text_model = CLIPTextModel(config) + else: + text_model = text_encoder + + keys = list(checkpoint.keys()) + + text_model_dict = {} + + remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"] + + for key in keys: + for prefix in remove_prefixes: + if key.startswith(prefix): + text_model_dict[key[len(prefix + ".") :]] = checkpoint[key] + + if is_accelerate_available(): + for param_name, param in text_model_dict.items(): + set_module_tensor_to_device(text_model, param_name, "cpu", value=param) + else: + if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings.position_ids)): + text_model_dict.pop("text_model.embeddings.position_ids", None) + + text_model.load_state_dict(text_model_dict) + + return text_model + + +textenc_conversion_lst = [ + ("positional_embedding", "text_model.embeddings.position_embedding.weight"), + ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"), + ("ln_final.weight", "text_model.final_layer_norm.weight"), + ("ln_final.bias", "text_model.final_layer_norm.bias"), + ("text_projection", "text_projection.weight"), +] +textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} + +textenc_transformer_conversion_lst = [ + # (stable-diffusion, HF Diffusers) + ("resblocks.", "text_model.encoder.layers."), + ("ln_1", "layer_norm1"), + ("ln_2", "layer_norm2"), + (".c_fc.", ".fc1."), + (".c_proj.", ".fc2."), + (".attn", ".self_attn"), + ("ln_final.", "transformer.text_model.final_layer_norm."), + ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"), + ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"), +] +protected = {re.escape(x[0]): x[1] for x in textenc_transformer_conversion_lst} +textenc_pattern = re.compile("|".join(protected.keys())) + + +def convert_paint_by_example_checkpoint(checkpoint, local_files_only=False): + config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + model = PaintByExampleImageEncoder(config) + + keys = list(checkpoint.keys()) + + text_model_dict = {} + + for key in keys: + if key.startswith("cond_stage_model.transformer"): + text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] + + # load clip vision + model.model.load_state_dict(text_model_dict) + + # load mapper + keys_mapper = { + k[len("cond_stage_model.mapper.res") :]: v + for k, v in checkpoint.items() + if k.startswith("cond_stage_model.mapper") + } + + MAPPING = { + "attn.c_qkv": ["attn1.to_q", "attn1.to_k", "attn1.to_v"], + "attn.c_proj": ["attn1.to_out.0"], + "ln_1": ["norm1"], + "ln_2": ["norm3"], + "mlp.c_fc": ["ff.net.0.proj"], + "mlp.c_proj": ["ff.net.2"], + } + + mapped_weights = {} + for key, value in keys_mapper.items(): + prefix = key[: len("blocks.i")] + suffix = key.split(prefix)[-1].split(".")[-1] + name = key.split(prefix)[-1].split(suffix)[0][1:-1] + mapped_names = MAPPING[name] + + num_splits = len(mapped_names) + for i, mapped_name in enumerate(mapped_names): + new_name = ".".join([prefix, mapped_name, suffix]) + shape = 
value.shape[0] // num_splits + mapped_weights[new_name] = value[i * shape : (i + 1) * shape] + + model.mapper.load_state_dict(mapped_weights) + + # load final layer norm + model.final_layer_norm.load_state_dict( + { + "bias": checkpoint["cond_stage_model.final_ln.bias"], + "weight": checkpoint["cond_stage_model.final_ln.weight"], + } + ) + + # load final proj + model.proj_out.load_state_dict( + { + "bias": checkpoint["proj_out.bias"], + "weight": checkpoint["proj_out.weight"], + } + ) + + # load uncond vector + model.uncond_vector.data = torch.nn.Parameter(checkpoint["learnable_vector"]) + return model + + +def convert_open_clip_checkpoint( + checkpoint, + config_name, + prefix="cond_stage_model.model.", + has_projection=False, + local_files_only=False, + **config_kwargs, +): + # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") + # text_model = CLIPTextModelWithProjection.from_pretrained( + # "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280 + # ) + try: + config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs, local_files_only=local_files_only) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the configuration in the following path: '{config_name}'." + ) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config) + + keys = list(checkpoint.keys()) + + keys_to_ignore = [] + if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23: + # make sure to remove all keys > 22 + keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")] + keys_to_ignore += ["cond_stage_model.model.text_projection"] + + text_model_dict = {} + + if prefix + "text_projection" in checkpoint: + d_model = int(checkpoint[prefix + "text_projection"].shape[0]) + else: + d_model = 1024 + + text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") + + for key in keys: + if key in keys_to_ignore: + continue + if key[len(prefix) :] in textenc_conversion_map: + if key.endswith("text_projection"): + value = checkpoint[key].T.contiguous() + else: + value = checkpoint[key] + + text_model_dict[textenc_conversion_map[key[len(prefix) :]]] = value + + if key.startswith(prefix + "transformer."): + new_key = key[len(prefix + "transformer.") :] + if new_key.endswith(".in_proj_weight"): + new_key = new_key[: -len(".in_proj_weight")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :] + text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :] + text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :] + elif new_key.endswith(".in_proj_bias"): + new_key = new_key[: -len(".in_proj_bias")] + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model] + text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2] + text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :] + else: + new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) + + text_model_dict[new_key] = checkpoint[key] + + if is_accelerate_available(): + for 
param_name, param in text_model_dict.items(): + set_module_tensor_to_device(text_model, param_name, "cpu", value=param) + else: + if not (hasattr(text_model, "embeddings") and hasattr(text_model.embeddings, "position_ids")): + text_model_dict.pop("text_model.embeddings.position_ids", None) + + text_model.load_state_dict(text_model_dict) + + return text_model + + +def stable_unclip_image_encoder(original_config, local_files_only=False): + """ + Returns the image processor and clip image encoder for the img2img unclip pipeline. + + We currently know of two types of stable unclip models which separately use the clip and the openclip image + encoders. + """ + + image_embedder_config = original_config["model"]["params"]["embedder_config"] + + sd_clip_image_embedder_class = image_embedder_config["target"] + sd_clip_image_embedder_class = sd_clip_image_embedder_class.split(".")[-1] + + if sd_clip_image_embedder_class == "ClipImageEmbedder": + clip_model_name = image_embedder_config.params.model + + if clip_model_name == "ViT-L/14": + feature_extractor = CLIPImageProcessor() + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) + else: + raise NotImplementedError(f"Unknown CLIP checkpoint name in stable diffusion checkpoint {clip_model_name}") + + elif sd_clip_image_embedder_class == "FrozenOpenCLIPImageEmbedder": + feature_extractor = CLIPImageProcessor() + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", local_files_only=local_files_only + ) + else: + raise NotImplementedError( + f"Unknown CLIP image embedder class in stable diffusion checkpoint {sd_clip_image_embedder_class}" + ) + + return feature_extractor, image_encoder + + +def stable_unclip_image_noising_components( + original_config, clip_stats_path: Optional[str] = None, device: Optional[str] = None +): + """ + Returns the noising components for the img2img and txt2img unclip pipelines. + + Converts the stability noise augmentor into + 1. a `StableUnCLIPImageNormalizer` for holding the CLIP stats + 2. a `DDPMScheduler` for holding the noise schedule + + If the noise augmentor config specifies a clip stats path, the `clip_stats_path` must be provided.
+ """ + noise_aug_config = original_config["model"]["params"]["noise_aug_config"] + noise_aug_class = noise_aug_config["target"] + noise_aug_class = noise_aug_class.split(".")[-1] + + if noise_aug_class == "CLIPEmbeddingNoiseAugmentation": + noise_aug_config = noise_aug_config.params + embedding_dim = noise_aug_config.timestep_dim + max_noise_level = noise_aug_config.noise_schedule_config.timesteps + beta_schedule = noise_aug_config.noise_schedule_config.beta_schedule + + image_normalizer = StableUnCLIPImageNormalizer(embedding_dim=embedding_dim) + image_noising_scheduler = DDPMScheduler(num_train_timesteps=max_noise_level, beta_schedule=beta_schedule) + + if "clip_stats_path" in noise_aug_config: + if clip_stats_path is None: + raise ValueError("This stable unclip config requires a `clip_stats_path`") + + clip_mean, clip_std = torch.load(clip_stats_path, map_location=device) + clip_mean = clip_mean[None, :] + clip_std = clip_std[None, :] + + clip_stats_state_dict = { + "mean": clip_mean, + "std": clip_std, + } + + image_normalizer.load_state_dict(clip_stats_state_dict) + else: + raise NotImplementedError(f"Unknown noise augmentor class: {noise_aug_class}") + + return image_normalizer, image_noising_scheduler + + +def convert_controlnet_checkpoint( + checkpoint, + original_config, + checkpoint_path, + image_size, + upcast_attention, + extract_ema, + use_linear_projection=None, + cross_attention_dim=None, +): + ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) + ctrlnet_config["upcast_attention"] = upcast_attention + + ctrlnet_config.pop("sample_size") + + if use_linear_projection is not None: + ctrlnet_config["use_linear_projection"] = use_linear_projection + + if cross_attention_dim is not None: + ctrlnet_config["cross_attention_dim"] = cross_attention_dim + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + controlnet = ControlNetModel(**ctrlnet_config) + + # Some controlnet ckpt files are distributed independently from the rest of the + # model components i.e. 
https://huggingface.co/thibaud/controlnet-sd21/ + if "time_embed.0.weight" in checkpoint: + skip_extract_state_dict = True + else: + skip_extract_state_dict = False + + converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( + checkpoint, + ctrlnet_config, + path=checkpoint_path, + extract_ema=extract_ema, + controlnet=True, + skip_extract_state_dict=skip_extract_state_dict, + ) + + if is_accelerate_available(): + for param_name, param in converted_ctrl_checkpoint.items(): + set_module_tensor_to_device(controlnet, param_name, "cpu", value=param) + else: + controlnet.load_state_dict(converted_ctrl_checkpoint) + + return controlnet + + +def download_from_original_stable_diffusion_ckpt( + checkpoint_path_or_dict: Union[str, Dict[str, torch.Tensor]], + original_config_file: str = None, + image_size: Optional[int] = None, + prediction_type: str = None, + model_type: str = None, + extract_ema: bool = False, + scheduler_type: str = "pndm", + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + device: str = None, + from_safetensors: bool = False, + stable_unclip: Optional[str] = None, + stable_unclip_prior: Optional[str] = None, + clip_stats_path: Optional[str] = None, + controlnet: Optional[bool] = None, + adapter: Optional[bool] = None, + load_safety_checker: bool = True, + pipeline_class: DiffusionPipeline = None, + local_files_only=False, + vae_path=None, + vae=None, + text_encoder=None, + text_encoder_2=None, + tokenizer=None, + tokenizer_2=None, + config_files=None, +) -> DiffusionPipeline: + """ + Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` + config file. + + Although many of the arguments can be automatically inferred, some of these rely on brittle checks against the + global step count, which will likely fail for models that have undergone further fine-tuning. Therefore, it is + recommended that you override the default values and/or supply an `original_config_file` wherever possible. + + Args: + checkpoint_path_or_dict (`str` or `dict`): Path to `.ckpt` file, or the state dict. + original_config_file (`str`): + Path to `.yaml` config file corresponding to the original architecture. If `None`, will be automatically + inferred by looking for a key that only exists in SD2.0 models. + image_size (`int`, *optional*, defaults to 512): + The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 + Base. Use 768 for Stable Diffusion v2. + prediction_type (`str`, *optional*): + The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable + Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. + num_in_channels (`int`, *optional*, defaults to None): + The number of input channels. If `None`, it will be automatically inferred. + scheduler_type (`str`, *optional*, defaults to 'pndm'): + Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", + "ddim"]`. + model_type (`str`, *optional*, defaults to `None`): + The pipeline type. `None` to automatically infer, or one of `["FrozenOpenCLIPEmbedder", + "FrozenCLIPEmbedder", "PaintByExample"]`. + is_img2img (`bool`, *optional*, defaults to `False`): + Whether the model should be loaded as an img2img pipeline. + extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for + checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults to + `False`. 
Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for + inference. Non-EMA weights are usually better to continue fine-tuning. + upcast_attention (`bool`, *optional*, defaults to `None`): + Whether the attention computation should always be upcasted. This is necessary when running stable + diffusion 2.1. + device (`str`, *optional*, defaults to `None`): + The device to use. Pass `None` to determine automatically. + from_safetensors (`str`, *optional*, defaults to `False`): + If `checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch. + load_safety_checker (`bool`, *optional*, defaults to `True`): + Whether to load the safety checker or not. Defaults to `True`. + pipeline_class (`str`, *optional*, defaults to `None`): + The pipeline class to use. Pass `None` to determine automatically. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + vae (`AutoencoderKL`, *optional*, defaults to `None`): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. If + this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed. + text_encoder (`CLIPTextModel`, *optional*, defaults to `None`): + An instance of [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel) + to use, specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) + variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed. + tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`): + An instance of + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if + needed. + config_files (`Dict[str, str]`, *optional*, defaults to `None`): + A dictionary mapping from config file names to their contents. If this parameter is `None`, the function + will load the config files by itself, if needed. Valid keys are: + - `v1`: Config file for Stable Diffusion v1 + - `v2`: Config file for Stable Diffusion v2 + - `xl`: Config file for Stable Diffusion XL + - `xl_refiner`: Config file for Stable Diffusion XL Refiner + return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. 
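For orientation, a typical call looks like the following. This is an illustrative sketch only: the local checkpoint path, the output directory, and the chosen options are assumptions, and it presumes the vendored `src/diffusers` tree is importable as `diffusers` (the import path matches where this function lives upstream).

```py
# Illustrative sketch: convert a single-file SD checkpoint into a diffusers-format pipeline.
# "./v1-5-pruned-emaonly.safetensors" and "./sd15-diffusers" are assumed example paths.
from diffusers.pipelines.stable_diffusion.convert_from_ckpt import (
    download_from_original_stable_diffusion_ckpt,
)

pipe = download_from_original_stable_diffusion_ckpt(
    "./v1-5-pruned-emaonly.safetensors",
    from_safetensors=True,
    extract_ema=True,           # prefer EMA weights for inference
    load_safety_checker=False,  # skip downloading the safety checker
)
pipe.save_pretrained("./sd15-diffusers")
```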
+ """ + + # import pipelines here to avoid circular import error when using from_single_file method + from diffusers import ( + LDMTextToImagePipeline, + PaintByExamplePipeline, + StableDiffusionControlNetPipeline, + StableDiffusionInpaintPipeline, + StableDiffusionPipeline, + StableDiffusionUpscalePipeline, + StableDiffusionXLControlNetInpaintPipeline, + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLPipeline, + StableUnCLIPImg2ImgPipeline, + StableUnCLIPPipeline, + ) + + if prediction_type == "v-prediction": + prediction_type = "v_prediction" + + if isinstance(checkpoint_path_or_dict, str): + if from_safetensors: + from safetensors.torch import load_file as safe_load + + checkpoint = safe_load(checkpoint_path_or_dict, device="cpu") + else: + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(checkpoint_path_or_dict, map_location=device) + else: + checkpoint = torch.load(checkpoint_path_or_dict, map_location=device) + elif isinstance(checkpoint_path_or_dict, dict): + checkpoint = checkpoint_path_or_dict + + # Sometimes models don't have the global_step item + if "global_step" in checkpoint: + global_step = checkpoint["global_step"] + else: + logger.debug("global_step key not found in model") + global_step = None + + # NOTE: this while loop isn't great but this controlnet checkpoint has one additional + # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21 + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + if original_config_file is None: + key_name_v2_1 = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight" + key_name_sd_xl_base = "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias" + key_name_sd_xl_refiner = "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias" + is_upscale = pipeline_class == StableDiffusionUpscalePipeline + + config_url = None + + # model_type = "v1" + if config_files is not None and "v1" in config_files: + original_config_file = config_files["v1"] + else: + config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml" + + if key_name_v2_1 in checkpoint and checkpoint[key_name_v2_1].shape[-1] == 1024: + # model_type = "v2" + if config_files is not None and "v2" in config_files: + original_config_file = config_files["v2"] + else: + config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml" + if global_step == 110000: + # v2.1 needs to upcast attention + upcast_attention = True + elif key_name_sd_xl_base in checkpoint: + # only base xl has two text embedders + if config_files is not None and "xl" in config_files: + original_config_file = config_files["xl"] + else: + config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml" + elif key_name_sd_xl_refiner in checkpoint: + # only refiner xl has embedder and one text embedders + if config_files is not None and "xl_refiner" in config_files: + original_config_file = config_files["xl_refiner"] + else: + config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml" + + if is_upscale: + config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/x4-upscaling.yaml" + + if config_url is not None: + original_config_file = 
BytesIO(requests.get(config_url).content) + else: + with open(original_config_file, "r") as f: + original_config_file = f.read() + else: + with open(original_config_file, "r") as f: + original_config_file = f.read() + + original_config = yaml.safe_load(original_config_file) + + # Convert the text model. + if ( + model_type is None + and "cond_stage_config" in original_config["model"]["params"] + and original_config["model"]["params"]["cond_stage_config"] is not None + ): + model_type = original_config["model"]["params"]["cond_stage_config"]["target"].split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") + elif model_type is None and original_config["model"]["params"]["network_config"] is not None: + if original_config["model"]["params"]["network_config"]["params"]["context_dim"] == 2048: + model_type = "SDXL" + else: + model_type = "SDXL-Refiner" + if image_size is None: + image_size = 1024 + + if pipeline_class is None: + # Check if we have a SDXL or SD model and initialize default pipeline + if model_type not in ["SDXL", "SDXL-Refiner"]: + pipeline_class = StableDiffusionPipeline if not controlnet else StableDiffusionControlNetPipeline + else: + pipeline_class = StableDiffusionXLPipeline if model_type == "SDXL" else StableDiffusionXLImg2ImgPipeline + + if num_in_channels is None and pipeline_class in [ + StableDiffusionInpaintPipeline, + StableDiffusionXLInpaintPipeline, + StableDiffusionXLControlNetInpaintPipeline, + ]: + num_in_channels = 9 + if num_in_channels is None and pipeline_class == StableDiffusionUpscalePipeline: + num_in_channels = 7 + elif num_in_channels is None: + num_in_channels = 4 + + if "unet_config" in original_config["model"]["params"]: + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels + + if ( + "parameterization" in original_config["model"]["params"] + and original_config["model"]["params"]["parameterization"] == "v" + ): + if prediction_type is None: + # NOTE: For stable diffusion 2 base it is recommended to pass `prediction_type=="epsilon"` + # as it relies on a brittle global step parameter here + prediction_type = "epsilon" if global_step == 875000 else "v_prediction" + if image_size is None: + # NOTE: For stable diffusion 2 base one has to pass `image_size==512` + # as it relies on a brittle global step parameter here + image_size = 512 if global_step == 875000 else 768 + else: + if prediction_type is None: + prediction_type = "epsilon" + if image_size is None: + image_size = 512 + + if controlnet is None and "control_stage_config" in original_config["model"]["params"]: + path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else "" + controlnet = convert_controlnet_checkpoint( + checkpoint, original_config, path, image_size, upcast_attention, extract_ema + ) + + if "timesteps" in original_config["model"]["params"]: + num_train_timesteps = original_config["model"]["params"]["timesteps"] + else: + num_train_timesteps = 1000 + + if model_type in ["SDXL", "SDXL-Refiner"]: + scheduler_dict = { + "beta_schedule": "scaled_linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "interpolation_type": "linear", + "num_train_timesteps": num_train_timesteps, + "prediction_type": "epsilon", + "sample_max_value": 1.0, + "set_alpha_to_one": False, + "skip_prk_steps": True, + "steps_offset": 1, + "timestep_spacing": "leading", + } + scheduler = EulerDiscreteScheduler.from_config(scheduler_dict) + scheduler_type = "euler" + else: + if "linear_start" in 
original_config["model"]["params"]: + beta_start = original_config["model"]["params"]["linear_start"] + else: + beta_start = 0.02 + + if "linear_end" in original_config["model"]["params"]: + beta_end = original_config["model"]["params"]["linear_end"] + else: + beta_end = 0.085 + scheduler = DDIMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + prediction_type=prediction_type, + ) + # make sure scheduler works correctly with DDIM + scheduler.register_to_config(clip_sample=False) + + if scheduler_type == "pndm": + config = dict(scheduler.config) + config["skip_prk_steps"] = True + scheduler = PNDMScheduler.from_config(config) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(scheduler.config) + elif scheduler_type == "dpm": + scheduler = DPMSolverMultistepScheduler.from_config(scheduler.config) + elif scheduler_type == "ddim": + scheduler = scheduler + else: + raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!") + + if pipeline_class == StableDiffusionUpscalePipeline: + image_size = original_config["model"]["params"]["unet_config"]["params"]["image_size"] + + # Convert the UNet2DConditionModel model. + unet_config = create_unet_diffusers_config(original_config, image_size=image_size) + unet_config["upcast_attention"] = upcast_attention + + path = checkpoint_path_or_dict if isinstance(checkpoint_path_or_dict, str) else "" + converted_unet_checkpoint = convert_ldm_unet_checkpoint( + checkpoint, unet_config, path=path, extract_ema=extract_ema + ) + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + unet = UNet2DConditionModel(**unet_config) + + if is_accelerate_available(): + if model_type not in ["SDXL", "SDXL-Refiner"]: # SBM Delay this. + for param_name, param in converted_unet_checkpoint.items(): + set_module_tensor_to_device(unet, param_name, "cpu", value=param) + else: + unet.load_state_dict(converted_unet_checkpoint) + + # Convert the VAE model. 
+ if vae_path is None and vae is None: + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + + if ( + "model" in original_config + and "params" in original_config["model"] + and "scale_factor" in original_config["model"]["params"] + ): + vae_scaling_factor = original_config["model"]["params"]["scale_factor"] + else: + vae_scaling_factor = 0.18215 # default SD scaling factor + + vae_config["scaling_factor"] = vae_scaling_factor + + ctx = init_empty_weights if is_accelerate_available() else nullcontext + with ctx(): + vae = AutoencoderKL(**vae_config) + + if is_accelerate_available(): + for param_name, param in converted_vae_checkpoint.items(): + set_module_tensor_to_device(vae, param_name, "cpu", value=param) + else: + vae.load_state_dict(converted_vae_checkpoint) + elif vae is None: + vae = AutoencoderKL.from_pretrained(vae_path, local_files_only=local_files_only) + + if model_type == "FrozenOpenCLIPEmbedder": + config_name = "stabilityai/stable-diffusion-2" + config_kwargs = {"subfolder": "text_encoder"} + + if text_encoder is None: + text_model = convert_open_clip_checkpoint( + checkpoint, config_name, local_files_only=local_files_only, **config_kwargs + ) + else: + text_model = text_encoder + + try: + tokenizer = CLIPTokenizer.from_pretrained( + "stabilityai/stable-diffusion-2", subfolder="tokenizer", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'stabilityai/stable-diffusion-2'." + ) + + if stable_unclip is None: + if controlnet: + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + controlnet=controlnet, + safety_checker=None, + feature_extractor=None, + ) + if hasattr(pipe, "requires_safety_checker"): + pipe.requires_safety_checker = False + + elif pipeline_class == StableDiffusionUpscalePipeline: + scheduler = DDIMScheduler.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", subfolder="scheduler" + ) + low_res_scheduler = DDPMScheduler.from_pretrained( + "stabilityai/stable-diffusion-x4-upscaler", subfolder="low_res_scheduler" + ) + + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + low_res_scheduler=low_res_scheduler, + safety_checker=None, + feature_extractor=None, + ) + + else: + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=None, + ) + if hasattr(pipe, "requires_safety_checker"): + pipe.requires_safety_checker = False + + else: + image_normalizer, image_noising_scheduler = stable_unclip_image_noising_components( + original_config, clip_stats_path=clip_stats_path, device=device + ) + + if stable_unclip == "img2img": + feature_extractor, image_encoder = stable_unclip_image_encoder(original_config) + + pipe = StableUnCLIPImg2ImgPipeline( + # image encoding components + feature_extractor=feature_extractor, + image_encoder=image_encoder, + # image noising components + image_normalizer=image_normalizer, + image_noising_scheduler=image_noising_scheduler, + # regular denoising components + tokenizer=tokenizer, + text_encoder=text_model, + unet=unet, + scheduler=scheduler, + # vae + vae=vae, + ) + elif stable_unclip == "txt2img": + if stable_unclip_prior is None or 
stable_unclip_prior == "karlo": + karlo_model = "kakaobrain/karlo-v1-alpha" + prior = PriorTransformer.from_pretrained( + karlo_model, subfolder="prior", local_files_only=local_files_only + ) + + try: + prior_tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." + ) + prior_text_model = CLIPTextModelWithProjection.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) + + prior_scheduler = UnCLIPScheduler.from_pretrained( + karlo_model, subfolder="prior_scheduler", local_files_only=local_files_only + ) + prior_scheduler = DDPMScheduler.from_config(prior_scheduler.config) + else: + raise NotImplementedError(f"unknown prior for stable unclip model: {stable_unclip_prior}") + + pipe = StableUnCLIPPipeline( + # prior components + prior_tokenizer=prior_tokenizer, + prior_text_encoder=prior_text_model, + prior=prior, + prior_scheduler=prior_scheduler, + # image noising components + image_normalizer=image_normalizer, + image_noising_scheduler=image_noising_scheduler, + # regular denoising components + tokenizer=tokenizer, + text_encoder=text_model, + unet=unet, + scheduler=scheduler, + # vae + vae=vae, + ) + else: + raise NotImplementedError(f"unknown `stable_unclip` type: {stable_unclip}") + elif model_type == "PaintByExample": + vision_model = convert_paint_by_example_checkpoint(checkpoint) + try: + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." + ) + try: + feature_extractor = AutoFeatureExtractor.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the feature_extractor in the following path: 'CompVis/stable-diffusion-safety-checker'." + ) + pipe = PaintByExamplePipeline( + vae=vae, + image_encoder=vision_model, + unet=unet, + scheduler=scheduler, + safety_checker=None, + feature_extractor=feature_extractor, + ) + elif model_type == "FrozenCLIPEmbedder": + text_model = convert_ldm_clip_checkpoint( + checkpoint, local_files_only=local_files_only, text_encoder=text_encoder + ) + try: + tokenizer = ( + CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + if tokenizer is None + else tokenizer + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." 
+ ) + + if load_safety_checker: + safety_checker = StableDiffusionSafetyChecker.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only + ) + feature_extractor = AutoFeatureExtractor.from_pretrained( + "CompVis/stable-diffusion-safety-checker", local_files_only=local_files_only + ) + else: + safety_checker = None + feature_extractor = None + + if controlnet: + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + else: + pipe = pipeline_class( + vae=vae, + text_encoder=text_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + elif model_type in ["SDXL", "SDXL-Refiner"]: + is_refiner = model_type == "SDXL-Refiner" + + if (is_refiner is False) and (tokenizer is None): + try: + tokenizer = CLIPTokenizer.from_pretrained( + "openai/clip-vit-large-patch14", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'openai/clip-vit-large-patch14'." + ) + + if (is_refiner is False) and (text_encoder is None): + text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) + + if tokenizer_2 is None: + try: + tokenizer_2 = CLIPTokenizer.from_pretrained( + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!", local_files_only=local_files_only + ) + except Exception: + raise ValueError( + f"With local_files_only set to {local_files_only}, you must first locally save the tokenizer in the following path: 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k' with `pad_token` set to '!'." + ) + + if text_encoder_2 is None: + config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k" + config_kwargs = {"projection_dim": 1280} + prefix = "conditioner.embedders.0.model." if is_refiner else "conditioner.embedders.1.model." + + text_encoder_2 = convert_open_clip_checkpoint( + checkpoint, + config_name, + prefix=prefix, + has_projection=True, + local_files_only=local_files_only, + **config_kwargs, + ) + + if is_accelerate_available(): # SBM Now move model to cpu. 
+ for param_name, param in converted_unet_checkpoint.items(): + set_module_tensor_to_device(unet, param_name, "cpu", value=param) + + if controlnet: + pipe = pipeline_class( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + controlnet=controlnet, + scheduler=scheduler, + force_zeros_for_empty_prompt=True, + ) + elif adapter: + pipe = pipeline_class( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + adapter=adapter, + scheduler=scheduler, + force_zeros_for_empty_prompt=True, + ) + + else: + pipeline_kwargs = { + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + "unet": unet, + "scheduler": scheduler, + } + + if (pipeline_class == StableDiffusionXLImg2ImgPipeline) or ( + pipeline_class == StableDiffusionXLInpaintPipeline + ): + pipeline_kwargs.update({"requires_aesthetics_score": is_refiner}) + + if is_refiner: + pipeline_kwargs.update({"force_zeros_for_empty_prompt": False}) + + pipe = pipeline_class(**pipeline_kwargs) + else: + text_config = create_ldm_bert_config(original_config) + text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) + tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", local_files_only=local_files_only) + pipe = LDMTextToImagePipeline(vqvae=vae, bert=text_model, tokenizer=tokenizer, unet=unet, scheduler=scheduler) + + return pipe + + +def download_controlnet_from_original_ckpt( + checkpoint_path: str, + original_config_file: str, + image_size: int = 512, + extract_ema: bool = False, + num_in_channels: Optional[int] = None, + upcast_attention: Optional[bool] = None, + device: str = None, + from_safetensors: bool = False, + use_linear_projection: Optional[bool] = None, + cross_attention_dim: Optional[bool] = None, +) -> DiffusionPipeline: + if from_safetensors: + from safetensors import safe_open + + checkpoint = {} + with safe_open(checkpoint_path, framework="pt", device="cpu") as f: + for key in f.keys(): + checkpoint[key] = f.get_tensor(key) + else: + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + checkpoint = torch.load(checkpoint_path, map_location=device) + else: + checkpoint = torch.load(checkpoint_path, map_location=device) + + # NOTE: this while loop isn't great but this controlnet checkpoint has one additional + # "state_dict" key https://huggingface.co/thibaud/controlnet-canny-sd21 + while "state_dict" in checkpoint: + checkpoint = checkpoint["state_dict"] + + original_config = yaml.safe_load(original_config_file) + + if num_in_channels is not None: + original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels + + if "control_stage_config" not in original_config["model"]["params"]: + raise ValueError("`control_stage_config` not present in original config") + + controlnet = convert_controlnet_checkpoint( + checkpoint, + original_config, + checkpoint_path, + image_size, + upcast_attention, + extract_ema, + use_linear_projection=use_linear_projection, + cross_attention_dim=cross_attention_dim, + ) + + return controlnet diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py new file 
mode 100644 index 000000000..55ff51c62 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py @@ -0,0 +1,473 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from functools import partial +from typing import Dict, List, Optional, Union + +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict +from flax.jax_utils import unreplicate +from flax.training.common_utils import shard +from packaging import version +from PIL import Image +from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel + +from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel +from ...schedulers import ( + FlaxDDIMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, +) +from ...utils import deprecate, logging, replace_example_docstring +from ..pipeline_flax_utils import FlaxDiffusionPipeline +from .pipeline_output import FlaxStableDiffusionPipelineOutput +from .safety_checker_flax import FlaxStableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Set to True to use python for loop instead of jax.fori_loop for easier debugging +DEBUG = False + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import jax + >>> import numpy as np + >>> from flax.jax_utils import replicate + >>> from flax.training.common_utils import shard + + >>> from diffusers import FlaxStableDiffusionPipeline + + >>> pipeline, params = FlaxStableDiffusionPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", revision="bf16", dtype=jax.numpy.bfloat16 + ... ) + + >>> prompt = "a photo of an astronaut riding a horse on mars" + + >>> prng_seed = jax.random.PRNGKey(0) + >>> num_inference_steps = 50 + + >>> num_samples = jax.device_count() + >>> prompt = num_samples * [prompt] + >>> prompt_ids = pipeline.prepare_inputs(prompt) + # shard inputs and rng + + >>> params = replicate(params) + >>> prng_seed = jax.random.split(prng_seed, jax.device_count()) + >>> prompt_ids = shard(prompt_ids) + + >>> images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images + >>> images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) + ``` +""" + + +class FlaxStableDiffusionPipeline(FlaxDiffusionPipeline): + r""" + Flax-based pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`FlaxAutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ text_encoder ([`~transformers.FlaxCLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`FlaxUNet2DConditionModel`]): + A `FlaxUNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or + [`FlaxDPMSolverMultistepScheduler`]. + safety_checker ([`FlaxStableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + def __init__( + self, + vae: FlaxAutoencoderKL, + text_encoder: FlaxCLIPTextModel, + tokenizer: CLIPTokenizer, + unet: FlaxUNet2DConditionModel, + scheduler: Union[ + FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler + ], + safety_checker: FlaxStableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + dtype: jnp.dtype = jnp.float32, + ): + super().__init__() + self.dtype = dtype + + if safety_checker is None: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_inputs(self, prompt: Union[str, List[str]]): + if not isinstance(prompt, (str, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + return text_input.input_ids + + def _get_has_nsfw_concepts(self, features, params): + has_nsfw_concepts = self.safety_checker(features, params) + return has_nsfw_concepts + + def _run_safety_checker(self, images, safety_model_params, jit=False): + # safety_model_params should already be replicated when jit is True + pil_images = [Image.fromarray(image) for image in images] + features = self.feature_extractor(pil_images, return_tensors="np").pixel_values + + if jit: + features = shard(features) + has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params) + has_nsfw_concepts = unshard(has_nsfw_concepts) + safety_model_params = unreplicate(safety_model_params) + else: + has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params) + + images_was_copied = False + for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): + if has_nsfw_concept: + if not images_was_copied: + images_was_copied = True + images = images.copy() + + images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image + + if any(has_nsfw_concepts): + warnings.warn( + "Potential NSFW content was detected in one or more images. A black image will be returned" + " instead. Try again with a different prompt and/or seed." 
+ ) + + return images, has_nsfw_concepts + + def _generate( + self, + prompt_ids: jnp.array, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int, + height: int, + width: int, + guidance_scale: float, + latents: Optional[jnp.ndarray] = None, + neg_prompt_ids: Optional[jnp.ndarray] = None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + # get prompt text embeddings + prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0] + + # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0` + # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0` + batch_size = prompt_ids.shape[0] + + max_length = prompt_ids.shape[-1] + + if neg_prompt_ids is None: + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + ).input_ids + else: + uncond_input = neg_prompt_ids + negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0] + context = jnp.concatenate([negative_prompt_embeds, prompt_embeds]) + + # Ensure model output will be `float32` before going into the scheduler + guidance_scale = jnp.array([guidance_scale], dtype=jnp.float32) + + latents_shape = ( + batch_size, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if latents is None: + latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + + def loop_body(step, args): + latents, scheduler_state = args + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + latents_input = jnp.concatenate([latents] * 2) + + t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step] + timestep = jnp.broadcast_to(t, latents_input.shape[0]) + + latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t) + + # predict the noise residual + noise_pred = self.unet.apply( + {"params": params["unet"]}, + jnp.array(latents_input), + jnp.array(timestep, dtype=jnp.int32), + encoder_hidden_states=context, + ).sample + # perform guidance + noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple() + return latents, scheduler_state + + scheduler_state = self.scheduler.set_timesteps( + params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape + ) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * params["scheduler"].init_noise_sigma + + if DEBUG: + # run with python for loop + for i in range(num_inference_steps): + latents, scheduler_state = loop_body(i, (latents, scheduler_state)) + else: + latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state)) + + # scale and decode the image latents with vae + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample + + image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1) + return image + + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt_ids: jnp.array, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int = 50, + height: Optional[int] = None, + width: Optional[int] = None, + guidance_scale: Union[float, jnp.ndarray] = 7.5, + latents: jnp.ndarray = None, + neg_prompt_ids: jnp.ndarray = None, + return_dict: bool = True, + jit: bool = False, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + latents (`jnp.ndarray`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + array is generated by sampling using the supplied random `generator`. + jit (`bool`, defaults to `False`): + Whether to run `pmap` versions of the generation and safety scoring functions. 
+ + + + This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a + future release. + + + + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of + a plain tuple. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated images + and the second element is a list of `bool`s indicating whether the corresponding generated image + contains "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + if isinstance(guidance_scale, float): + # Convert to a tensor so each device gets a copy. Follow the prompt_ids for + # shape information, as they may be sharded (when `jit` is `True`), or not. + guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0]) + if len(prompt_ids.shape) > 2: + # Assume sharded + guidance_scale = guidance_scale[:, None] + + if jit: + images = _p_generate( + self, + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + ) + else: + images = self._generate( + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + ) + + if self.safety_checker is not None: + safety_params = params["safety_checker"] + images_uint8_casted = (images * 255).round().astype("uint8") + num_devices, batch_size = images.shape[:2] + + images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3) + images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit) + images = np.asarray(images).copy() + + # block images + if any(has_nsfw_concept): + for i, is_nsfw in enumerate(has_nsfw_concept): + if is_nsfw: + images[i, 0] = np.asarray(images_uint8_casted[i]) + + images = images.reshape(num_devices, batch_size, height, width, 3) + else: + images = np.asarray(images) + has_nsfw_concept = False + + if not return_dict: + return (images, has_nsfw_concept) + + return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) + + +# Static argnums are pipe, num_inference_steps, height, width. A change would trigger recompilation. +# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`). +@partial( + jax.pmap, + in_axes=(None, 0, 0, 0, None, None, None, 0, 0, 0), + static_broadcasted_argnums=(0, 4, 5, 6), +) +def _p_generate( + pipe, + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, +): + return pipe._generate( + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + ) + + +@partial(jax.pmap, static_broadcasted_argnums=(0,)) +def _p_get_has_nsfw_concepts(pipe, features, params): + return pipe._get_has_nsfw_concepts(features, params) + + +def unshard(x: jnp.ndarray): + # einops.rearrange(x, 'd b ... 
-> (d b) ...') + num_devices, batch_size = x.shape[:2] + rest = x.shape[2:] + return x.reshape(num_devices * batch_size, *rest) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py new file mode 100644 index 000000000..7792bc097 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py @@ -0,0 +1,532 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from functools import partial +from typing import Dict, List, Optional, Union + +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict +from flax.jax_utils import unreplicate +from flax.training.common_utils import shard +from PIL import Image +from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel + +from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel +from ...schedulers import ( + FlaxDDIMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, +) +from ...utils import PIL_INTERPOLATION, logging, replace_example_docstring +from ..pipeline_flax_utils import FlaxDiffusionPipeline +from .pipeline_output import FlaxStableDiffusionPipelineOutput +from .safety_checker_flax import FlaxStableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Set to True to use python for loop instead of jax.fori_loop for easier debugging +DEBUG = False + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import jax + >>> import numpy as np + >>> import jax.numpy as jnp + >>> from flax.jax_utils import replicate + >>> from flax.training.common_utils import shard + >>> import requests + >>> from io import BytesIO + >>> from PIL import Image + >>> from diffusers import FlaxStableDiffusionImg2ImgPipeline + + + >>> def create_key(seed=0): + ... return jax.random.PRNGKey(seed) + + + >>> rng = create_key(0) + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + >>> response = requests.get(url) + >>> init_img = Image.open(BytesIO(response.content)).convert("RGB") + >>> init_img = init_img.resize((768, 512)) + + >>> prompts = "A fantasy landscape, trending on artstation" + + >>> pipeline, params = FlaxStableDiffusionImg2ImgPipeline.from_pretrained( + ... "CompVis/stable-diffusion-v1-4", + ... revision="flax", + ... dtype=jnp.bfloat16, + ... ) + + >>> num_samples = jax.device_count() + >>> rng = jax.random.split(rng, jax.device_count()) + >>> prompt_ids, processed_image = pipeline.prepare_inputs( + ... prompt=[prompts] * num_samples, image=[init_img] * num_samples + ... 
) + >>> p_params = replicate(params) + >>> prompt_ids = shard(prompt_ids) + >>> processed_image = shard(processed_image) + + >>> output = pipeline( + ... prompt_ids=prompt_ids, + ... image=processed_image, + ... params=p_params, + ... prng_seed=rng, + ... strength=0.75, + ... num_inference_steps=50, + ... jit=True, + ... height=512, + ... width=768, + ... ).images + + >>> output_images = pipeline.numpy_to_pil(np.asarray(output.reshape((num_samples,) + output.shape[-3:]))) + ``` +""" + + +class FlaxStableDiffusionImg2ImgPipeline(FlaxDiffusionPipeline): + r""" + Flax-based pipeline for text-guided image-to-image generation using Stable Diffusion. + + This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`FlaxAutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.FlaxCLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`FlaxUNet2DConditionModel`]): + A `FlaxUNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or + [`FlaxDPMSolverMultistepScheduler`]. + safety_checker ([`FlaxStableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + def __init__( + self, + vae: FlaxAutoencoderKL, + text_encoder: FlaxCLIPTextModel, + tokenizer: CLIPTokenizer, + unet: FlaxUNet2DConditionModel, + scheduler: Union[ + FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler + ], + safety_checker: FlaxStableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + dtype: jnp.dtype = jnp.float32, + ): + super().__init__() + self.dtype = dtype + + if safety_checker is None: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_inputs(self, prompt: Union[str, List[str]], image: Union[Image.Image, List[Image.Image]]): + if not isinstance(prompt, (str, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if not isinstance(image, (Image.Image, list)): + raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}") + + if isinstance(image, Image.Image): + image = [image] + + processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image]) + + text_input = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + return text_input.input_ids, processed_images + + def _get_has_nsfw_concepts(self, features, params): + has_nsfw_concepts = self.safety_checker(features, params) + return has_nsfw_concepts + + def _run_safety_checker(self, images, safety_model_params, jit=False): + # safety_model_params should already be replicated when jit is True + pil_images = [Image.fromarray(image) for image in images] + features = self.feature_extractor(pil_images, return_tensors="np").pixel_values + + if jit: + features = shard(features) + has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params) + has_nsfw_concepts = unshard(has_nsfw_concepts) + safety_model_params = unreplicate(safety_model_params) + else: + has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params) + + images_was_copied = False + for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): + if has_nsfw_concept: + if not images_was_copied: + images_was_copied = True + images = images.copy() + + images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image + + if any(has_nsfw_concepts): + warnings.warn( + "Potential NSFW content was detected in one or more images. A black image will be returned" + " instead. Try again with a different prompt and/or seed." 
+ ) + + return images, has_nsfw_concepts + + def get_timestep_start(self, num_inference_steps, strength): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + + return t_start + + def _generate( + self, + prompt_ids: jnp.ndarray, + image: jnp.ndarray, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + start_timestep: int, + num_inference_steps: int, + height: int, + width: int, + guidance_scale: float, + noise: Optional[jnp.ndarray] = None, + neg_prompt_ids: Optional[jnp.ndarray] = None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + # get prompt text embeddings + prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0] + + # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0` + # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0` + batch_size = prompt_ids.shape[0] + + max_length = prompt_ids.shape[-1] + + if neg_prompt_ids is None: + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + ).input_ids + else: + uncond_input = neg_prompt_ids + negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0] + context = jnp.concatenate([negative_prompt_embeds, prompt_embeds]) + + latents_shape = ( + batch_size, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if noise is None: + noise = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32) + else: + if noise.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {noise.shape}, expected {latents_shape}") + + # Create init_latents + init_latent_dist = self.vae.apply({"params": params["vae"]}, image, method=self.vae.encode).latent_dist + init_latents = init_latent_dist.sample(key=prng_seed).transpose((0, 3, 1, 2)) + init_latents = self.vae.config.scaling_factor * init_latents + + def loop_body(step, args): + latents, scheduler_state = args + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + latents_input = jnp.concatenate([latents] * 2) + + t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step] + timestep = jnp.broadcast_to(t, latents_input.shape[0]) + + latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t) + + # predict the noise residual + noise_pred = self.unet.apply( + {"params": params["unet"]}, + jnp.array(latents_input), + jnp.array(timestep, dtype=jnp.int32), + encoder_hidden_states=context, + ).sample + # perform guidance + noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple() + return latents, scheduler_state + + scheduler_state = self.scheduler.set_timesteps( + params["scheduler"], num_inference_steps=num_inference_steps, shape=latents_shape + ) + + latent_timestep = scheduler_state.timesteps[start_timestep : start_timestep + 1].repeat(batch_size) + + latents = self.scheduler.add_noise(params["scheduler"], init_latents, noise, latent_timestep) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * params["scheduler"].init_noise_sigma + + if DEBUG: + # run with python for loop + for i in range(start_timestep, num_inference_steps): + latents, scheduler_state = loop_body(i, (latents, scheduler_state)) + else: + latents, _ = jax.lax.fori_loop(start_timestep, num_inference_steps, loop_body, (latents, scheduler_state)) + + # scale and decode the image latents with vae + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample + + image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1) + return image + + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt_ids: jnp.ndarray, + image: jnp.ndarray, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + strength: float = 0.8, + num_inference_steps: int = 50, + height: Optional[int] = None, + width: Optional[int] = None, + guidance_scale: Union[float, jnp.ndarray] = 7.5, + noise: jnp.ndarray = None, + neg_prompt_ids: jnp.ndarray = None, + return_dict: bool = True, + jit: bool = False, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt_ids (`jnp.ndarray`): + The prompt or prompts to guide image generation. + image (`jnp.ndarray`): + Array representing an image batch to be used as the starting point. + params (`Dict` or `FrozenDict`): + Dictionary containing the model parameters/weights. + prng_seed (`jax.Array` or `jax.Array`): + Array containing random number generator key. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + noise (`jnp.ndarray`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. The array is generated by + sampling using the supplied random `generator`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of + a plain tuple. + jit (`bool`, defaults to `False`): + Whether to run `pmap` versions of the generation and safety scoring functions. + + + + This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a + future release. + + + + Examples: + + Returns: + [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated images + and the second element is a list of `bool`s indicating whether the corresponding generated image + contains "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + if isinstance(guidance_scale, float): + # Convert to a tensor so each device gets a copy. Follow the prompt_ids for + # shape information, as they may be sharded (when `jit` is `True`), or not. 
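+        # When `jit=True` the leading axis of `prompt_ids` is the device axis, so this yields one
+        # guidance value per device; the `[:, None]` below keeps it broadcastable against the
+        # per-device batch handled inside `_generate`.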
+ guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0]) + if len(prompt_ids.shape) > 2: + # Assume sharded + guidance_scale = guidance_scale[:, None] + + start_timestep = self.get_timestep_start(num_inference_steps, strength) + + if jit: + images = _p_generate( + self, + prompt_ids, + image, + params, + prng_seed, + start_timestep, + num_inference_steps, + height, + width, + guidance_scale, + noise, + neg_prompt_ids, + ) + else: + images = self._generate( + prompt_ids, + image, + params, + prng_seed, + start_timestep, + num_inference_steps, + height, + width, + guidance_scale, + noise, + neg_prompt_ids, + ) + + if self.safety_checker is not None: + safety_params = params["safety_checker"] + images_uint8_casted = (images * 255).round().astype("uint8") + num_devices, batch_size = images.shape[:2] + + images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3) + images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit) + images = np.asarray(images) + + # block images + if any(has_nsfw_concept): + for i, is_nsfw in enumerate(has_nsfw_concept): + if is_nsfw: + images[i] = np.asarray(images_uint8_casted[i]) + + images = images.reshape(num_devices, batch_size, height, width, 3) + else: + images = np.asarray(images) + has_nsfw_concept = False + + if not return_dict: + return (images, has_nsfw_concept) + + return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) + + +# Static argnums are pipe, start_timestep, num_inference_steps, height, width. A change would trigger recompilation. +# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`). +@partial( + jax.pmap, + in_axes=(None, 0, 0, 0, 0, None, None, None, None, 0, 0, 0), + static_broadcasted_argnums=(0, 5, 6, 7, 8), +) +def _p_generate( + pipe, + prompt_ids, + image, + params, + prng_seed, + start_timestep, + num_inference_steps, + height, + width, + guidance_scale, + noise, + neg_prompt_ids, +): + return pipe._generate( + prompt_ids, + image, + params, + prng_seed, + start_timestep, + num_inference_steps, + height, + width, + guidance_scale, + noise, + neg_prompt_ids, + ) + + +@partial(jax.pmap, static_broadcasted_argnums=(0,)) +def _p_get_has_nsfw_concepts(pipe, features, params): + return pipe._get_has_nsfw_concepts(features, params) + + +def unshard(x: jnp.ndarray): + # einops.rearrange(x, 'd b ... -> (d b) ...') + num_devices, batch_size = x.shape[:2] + rest = x.shape[2:] + return x.reshape(num_devices * batch_size, *rest) + + +def preprocess(image, dtype): + w, h = image.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) + image = jnp.array(image).astype(dtype) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + return 2.0 * image - 1.0 diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py new file mode 100644 index 000000000..f6bb0ac29 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py @@ -0,0 +1,589 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from functools import partial +from typing import Dict, List, Optional, Union + +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict +from flax.jax_utils import unreplicate +from flax.training.common_utils import shard +from packaging import version +from PIL import Image +from transformers import CLIPImageProcessor, CLIPTokenizer, FlaxCLIPTextModel + +from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel +from ...schedulers import ( + FlaxDDIMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, +) +from ...utils import PIL_INTERPOLATION, deprecate, logging, replace_example_docstring +from ..pipeline_flax_utils import FlaxDiffusionPipeline +from .pipeline_output import FlaxStableDiffusionPipelineOutput +from .safety_checker_flax import FlaxStableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Set to True to use python for loop instead of jax.fori_loop for easier debugging +DEBUG = False + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import jax + >>> import numpy as np + >>> from flax.jax_utils import replicate + >>> from flax.training.common_utils import shard + >>> import PIL + >>> import requests + >>> from io import BytesIO + >>> from diffusers import FlaxStableDiffusionInpaintPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" + >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + + >>> init_image = download_image(img_url).resize((512, 512)) + >>> mask_image = download_image(mask_url).resize((512, 512)) + + >>> pipeline, params = FlaxStableDiffusionInpaintPipeline.from_pretrained( + ... "xvjiarui/stable-diffusion-2-inpainting" + ... ) + + >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + >>> prng_seed = jax.random.PRNGKey(0) + >>> num_inference_steps = 50 + + >>> num_samples = jax.device_count() + >>> prompt = num_samples * [prompt] + >>> init_image = num_samples * [init_image] + >>> mask_image = num_samples * [mask_image] + >>> prompt_ids, processed_masked_images, processed_masks = pipeline.prepare_inputs( + ... prompt, init_image, mask_image + ... ) + # shard inputs and rng + + >>> params = replicate(params) + >>> prng_seed = jax.random.split(prng_seed, jax.device_count()) + >>> prompt_ids = shard(prompt_ids) + >>> processed_masked_images = shard(processed_masked_images) + >>> processed_masks = shard(processed_masks) + + >>> images = pipeline( + ... prompt_ids, processed_masks, processed_masked_images, params, prng_seed, num_inference_steps, jit=True + ... 
).images + >>> images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:]))) + ``` +""" + + +class FlaxStableDiffusionInpaintPipeline(FlaxDiffusionPipeline): + r""" + Flax-based pipeline for text-guided image inpainting using Stable Diffusion. + + + + 🧪 This is an experimental feature! + + + + This model inherits from [`FlaxDiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`FlaxAutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.FlaxCLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`FlaxUNet2DConditionModel`]): + A `FlaxUNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`FlaxDDIMScheduler`], [`FlaxLMSDiscreteScheduler`], [`FlaxPNDMScheduler`], or + [`FlaxDPMSolverMultistepScheduler`]. + safety_checker ([`FlaxStableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + def __init__( + self, + vae: FlaxAutoencoderKL, + text_encoder: FlaxCLIPTextModel, + tokenizer: CLIPTokenizer, + unet: FlaxUNet2DConditionModel, + scheduler: Union[ + FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler + ], + safety_checker: FlaxStableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + dtype: jnp.dtype = jnp.float32, + ): + super().__init__() + self.dtype = dtype + + if safety_checker is None: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." 
+            )
+
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64, which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+
+    def prepare_inputs(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[Image.Image, List[Image.Image]],
+        mask: Union[Image.Image, List[Image.Image]],
+    ):
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if not isinstance(image, (Image.Image, list)):
+            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
+
+        if isinstance(image, Image.Image):
+            image = [image]
+
+        if not isinstance(mask, (Image.Image, list)):
+            raise ValueError(f"mask has to be of type `PIL.Image.Image` or list but is {type(mask)}")
+
+        if isinstance(mask, Image.Image):
+            mask = [mask]
+
+        processed_images = jnp.concatenate([preprocess_image(img, jnp.float32) for img in image])
+        processed_masks = jnp.concatenate([preprocess_mask(m, jnp.float32) for m in mask])
+        # processed_masks[processed_masks < 0.5] = 0
+        processed_masks = processed_masks.at[processed_masks < 0.5].set(0)
+        # processed_masks[processed_masks >= 0.5] = 1
+        processed_masks = processed_masks.at[processed_masks >= 0.5].set(1)
+
+        processed_masked_images = processed_images * (processed_masks < 0.5)
+
+        text_input = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="np",
+        )
+        return text_input.input_ids, processed_masked_images, processed_masks
+
+    def _get_has_nsfw_concepts(self, features, params):
+        has_nsfw_concepts = self.safety_checker(features, params)
+        return has_nsfw_concepts
+
+    def _run_safety_checker(self, images, safety_model_params, jit=False):
+        # safety_model_params should already be replicated when jit is True
+        pil_images = [Image.fromarray(image) for image in images]
+        features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
+
+        if jit:
+            features = shard(features)
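+            # shard() reshapes the batch to (num_devices, batch_per_device, ...) so the pmapped
+            # checker runs once per device; unshard() below flattens the result back.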
+ has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params) + has_nsfw_concepts = unshard(has_nsfw_concepts) + safety_model_params = unreplicate(safety_model_params) + else: + has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params) + + images_was_copied = False + for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): + if has_nsfw_concept: + if not images_was_copied: + images_was_copied = True + images = images.copy() + + images[idx] = np.zeros(images[idx].shape, dtype=np.uint8) # black image + + if any(has_nsfw_concepts): + warnings.warn( + "Potential NSFW content was detected in one or more images. A black image will be returned" + " instead. Try again with a different prompt and/or seed." + ) + + return images, has_nsfw_concepts + + def _generate( + self, + prompt_ids: jnp.ndarray, + mask: jnp.ndarray, + masked_image: jnp.ndarray, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int, + height: int, + width: int, + guidance_scale: float, + latents: Optional[jnp.ndarray] = None, + neg_prompt_ids: Optional[jnp.ndarray] = None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + # get prompt text embeddings + prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0] + + # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0` + # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0` + batch_size = prompt_ids.shape[0] + + max_length = prompt_ids.shape[-1] + + if neg_prompt_ids is None: + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np" + ).input_ids + else: + uncond_input = neg_prompt_ids + negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0] + context = jnp.concatenate([negative_prompt_embeds, prompt_embeds]) + + latents_shape = ( + batch_size, + self.vae.config.latent_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if latents is None: + latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=self.dtype) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + + prng_seed, mask_prng_seed = jax.random.split(prng_seed) + + masked_image_latent_dist = self.vae.apply( + {"params": params["vae"]}, masked_image, method=self.vae.encode + ).latent_dist + masked_image_latents = masked_image_latent_dist.sample(key=mask_prng_seed).transpose((0, 3, 1, 2)) + masked_image_latents = self.vae.config.scaling_factor * masked_image_latents + del mask_prng_seed + + mask = jax.image.resize(mask, (*mask.shape[:-2], *masked_image_latents.shape[-2:]), method="nearest") + + # 8. Check that sizes of mask, masked image and latents match + num_channels_latents = self.vae.config.latent_channels + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! 
The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + + def loop_body(step, args): + latents, mask, masked_image_latents, scheduler_state = args + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + latents_input = jnp.concatenate([latents] * 2) + mask_input = jnp.concatenate([mask] * 2) + masked_image_latents_input = jnp.concatenate([masked_image_latents] * 2) + + t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step] + timestep = jnp.broadcast_to(t, latents_input.shape[0]) + + latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t) + # concat latents, mask, masked_image_latents in the channel dimension + latents_input = jnp.concatenate([latents_input, mask_input, masked_image_latents_input], axis=1) + + # predict the noise residual + noise_pred = self.unet.apply( + {"params": params["unet"]}, + jnp.array(latents_input), + jnp.array(timestep, dtype=jnp.int32), + encoder_hidden_states=context, + ).sample + # perform guidance + noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple() + return latents, mask, masked_image_latents, scheduler_state + + scheduler_state = self.scheduler.set_timesteps( + params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape + ) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * params["scheduler"].init_noise_sigma + + if DEBUG: + # run with python for loop + for i in range(num_inference_steps): + latents, mask, masked_image_latents, scheduler_state = loop_body( + i, (latents, mask, masked_image_latents, scheduler_state) + ) + else: + latents, _, _, _ = jax.lax.fori_loop( + 0, num_inference_steps, loop_body, (latents, mask, masked_image_latents, scheduler_state) + ) + + # scale and decode the image latents with vae + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample + + image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1) + return image + + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt_ids: jnp.ndarray, + mask: jnp.ndarray, + masked_image: jnp.ndarray, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int = 50, + height: Optional[int] = None, + width: Optional[int] = None, + guidance_scale: Union[float, jnp.ndarray] = 7.5, + latents: jnp.ndarray = None, + neg_prompt_ids: jnp.ndarray = None, + return_dict: bool = True, + jit: bool = False, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. 
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + latents (`jnp.ndarray`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + array is generated by sampling using the supplied random `generator`. + jit (`bool`, defaults to `False`): + Whether to run `pmap` versions of the generation and safety scoring functions. + + + + This argument exists because `__call__` is not yet end-to-end pmap-able. It will be removed in a + future release. + + + + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] instead of + a plain tuple. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated images + and the second element is a list of `bool`s indicating whether the corresponding generated image + contains "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + masked_image = jax.image.resize(masked_image, (*masked_image.shape[:-2], height, width), method="bicubic") + mask = jax.image.resize(mask, (*mask.shape[:-2], height, width), method="nearest") + + if isinstance(guidance_scale, float): + # Convert to a tensor so each device gets a copy. Follow the prompt_ids for + # shape information, as they may be sharded (when `jit` is `True`), or not. 
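+        # A scalar guidance_scale is expanded to one entry per device so `pmap` can map it over
+        # the device axis together with the other sharded inputs.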
+ guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0]) + if len(prompt_ids.shape) > 2: + # Assume sharded + guidance_scale = guidance_scale[:, None] + + if jit: + images = _p_generate( + self, + prompt_ids, + mask, + masked_image, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + ) + else: + images = self._generate( + prompt_ids, + mask, + masked_image, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + ) + + if self.safety_checker is not None: + safety_params = params["safety_checker"] + images_uint8_casted = (images * 255).round().astype("uint8") + num_devices, batch_size = images.shape[:2] + + images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3) + images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit) + images = np.asarray(images) + + # block images + if any(has_nsfw_concept): + for i, is_nsfw in enumerate(has_nsfw_concept): + if is_nsfw: + images[i] = np.asarray(images_uint8_casted[i]) + + images = images.reshape(num_devices, batch_size, height, width, 3) + else: + images = np.asarray(images) + has_nsfw_concept = False + + if not return_dict: + return (images, has_nsfw_concept) + + return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept) + + +# Static argnums are pipe, num_inference_steps, height, width. A change would trigger recompilation. +# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`). +@partial( + jax.pmap, + in_axes=(None, 0, 0, 0, 0, 0, None, None, None, 0, 0, 0), + static_broadcasted_argnums=(0, 6, 7, 8), +) +def _p_generate( + pipe, + prompt_ids, + mask, + masked_image, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, +): + return pipe._generate( + prompt_ids, + mask, + masked_image, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + ) + + +@partial(jax.pmap, static_broadcasted_argnums=(0,)) +def _p_get_has_nsfw_concepts(pipe, features, params): + return pipe._get_has_nsfw_concepts(features, params) + + +def unshard(x: jnp.ndarray): + # einops.rearrange(x, 'd b ... 
-> (d b) ...') + num_devices, batch_size = x.shape[:2] + rest = x.shape[2:] + return x.reshape(num_devices * batch_size, *rest) + + +def preprocess_image(image, dtype): + w, h = image.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]) + image = jnp.array(image).astype(dtype) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + return 2.0 * image - 1.0 + + +def preprocess_mask(mask, dtype): + w, h = mask.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + mask = mask.resize((w, h)) + mask = jnp.array(mask.convert("L")).astype(dtype) / 255.0 + mask = jnp.expand_dims(mask, axis=(0, 1)) + + return mask diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py new file mode 100644 index 000000000..311347dcc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py @@ -0,0 +1,487 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...utils import deprecate, logging +from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) + + +class OnnxStableDiffusionPipeline(DiffusionPipeline): + vae_encoder: OnnxRuntimeModel + vae_decoder: OnnxRuntimeModel + text_encoder: OnnxRuntimeModel + tokenizer: CLIPTokenizer + unet: OnnxRuntimeModel + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler] + safety_checker: OnnxRuntimeModel + feature_extractor: CLIPImageProcessor + + _optional_components = ["safety_checker", "feature_extractor"] + _is_onnx = True + + def __init__( + self, + vae_encoder: OnnxRuntimeModel, + vae_decoder: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: CLIPTokenizer, + unet: OnnxRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: OnnxRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. 
Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: Optional[int], + do_classifier_free_guidance: bool, + negative_prompt: Optional[str], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids + + if not np.array_equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] * batch_size + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + if do_classifier_free_guidance: + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + + # For classifier free guidance, we need to do two forward passes. 
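+            # The [negative, positive] ordering used below must match the np.split() in __call__,
+            # which treats the first half of the UNet output as the unconditional prediction.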
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def check_inputs( + self, + prompt: Union[str, List[str]], + height: Optional[int], + width: Optional[int], + callback_steps: int, + negative_prompt: Optional[str] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[np.random.RandomState] = None, + latents: Optional[np.ndarray] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + `Image`, or tensor representing an image batch which will be upscaled. * + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2 of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate images that are closely linked to the text
+                `prompt`, usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale`
+                is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                One or a list of [numpy generator(s)](TODO) to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images, and the second
+            element is a list of `bool`s denoting whether the corresponding generated image likely represents
+            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
+        """
+
+        # check inputs.
Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # get the initial random noise unless the user supplied it + latents_dtype = prompt_embeds.dtype + latents_shape = (batch_size * num_images_per_prompt, 4, height // 8, width // 8) + if latents is None: + latents = generator.randn(*latents_shape).astype(latents_dtype) + elif latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + latents = latents * np.float64(self.scheduler.init_noise_sigma) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + timestep_dtype = next( + (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" + ) + timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] + + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds) + noise_pred = noise_pred[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler_output = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ) + latents = scheduler_output.prev_sample.numpy() + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + latents = 1 / 0.18215 * latents + # image = self.vae_decoder(latent_sample=latents)[0] + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + 
[self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor( + self.numpy_to_pil(image), return_tensors="np" + ).pixel_values.astype(image.dtype) + + images, has_nsfw_concept = [], [] + for i in range(image.shape[0]): + image_i, has_nsfw_concept_i = self.safety_checker( + clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] + ) + images.append(image_i) + has_nsfw_concept.append(has_nsfw_concept_i[0]) + image = np.concatenate(images) + else: + has_nsfw_concept = None + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + +class StableDiffusionOnnxPipeline(OnnxStableDiffusionPipeline): + def __init__( + self, + vae_encoder: OnnxRuntimeModel, + vae_decoder: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: CLIPTokenizer, + unet: OnnxRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: OnnxRuntimeModel, + feature_extractor: CLIPImageProcessor, + ): + deprecation_message = "Please use `OnnxStableDiffusionPipeline` instead of `StableDiffusionOnnxPipeline`." + deprecate("StableDiffusionOnnxPipeline", "1.0.0", deprecation_message) + super().__init__( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py new file mode 100644 index 000000000..c39409886 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -0,0 +1,549 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...utils import PIL_INTERPOLATION, deprecate, logging +from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ..pipeline_utils import DiffusionPipeline +from . 
import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline): + r""" + Pipeline for text-guided image to image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + + vae_encoder: OnnxRuntimeModel + vae_decoder: OnnxRuntimeModel + text_encoder: OnnxRuntimeModel + tokenizer: CLIPTokenizer + unet: OnnxRuntimeModel + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler] + safety_checker: OnnxRuntimeModel + feature_extractor: CLIPImageProcessor + + _optional_components = ["safety_checker", "feature_extractor"] + _is_onnx = True + + def __init__( + self, + vae_encoder: OnnxRuntimeModel, + vae_decoder: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: CLIPTokenizer, + unet: OnnxRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: OnnxRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: Optional[int], + do_classifier_free_guidance: bool, + negative_prompt: Optional[str], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids + + if not np.array_equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] * batch_size + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + if do_classifier_free_guidance: + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def check_inputs( + self, + prompt: Union[str, List[str]], + callback_steps: int, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[np.ndarray, PIL.Image.Image] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[np.random.RandomState] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. 
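The `strength` argument in the signature above is converted further down in this method into a starting timestep, which decides how many of the scheduler's steps actually run. A worked example of that arithmetic, assuming the defaults shown here (num_inference_steps=50, strength=0.8) and a scheduler whose config has steps_offset=1:

    num_inference_steps, strength, offset = 50, 0.8, 1
    init_timestep = min(int(num_inference_steps * strength) + offset, num_inference_steps)  # 41
    t_start = max(num_inference_steps - init_timestep + offset, 0)                          # 10
    # The initial latents are noised at scheduler.timesteps[-41], and the denoising
    # loop then iterates over scheduler.timesteps[10:], i.e. 40 of the 50 steps.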
+ image (`np.ndarray` or `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter will be modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`np.random.RandomState`, *optional*): + A np.random.RandomState to make generation deterministic. + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. 
+ When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + # check inputs. Raise error if not correct + self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if generator is None: + generator = np.random + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + image = preprocess(image).cpu().numpy() + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + latents_dtype = prompt_embeds.dtype + image = image.astype(latents_dtype) + # encode the init image into latents and scale the latents + init_latents = self.vae_encoder(sample=image)[0] + init_latents = 0.18215 * init_latents + + if isinstance(prompt, str): + prompt = [prompt] + if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = len(prompt) // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) + elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." 
+ ) + else: + init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) + + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * strength) + offset + init_timestep = min(init_timestep, num_inference_steps) + + timesteps = self.scheduler.timesteps.numpy()[-init_timestep] + timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) + + # add noise to latents using the timesteps + noise = generator.randn(*init_latents.shape).astype(latents_dtype) + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) + ) + init_latents = init_latents.numpy() + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + latents = init_latents + + t_start = max(num_inference_steps - init_timestep + offset, 0) + timesteps = self.scheduler.timesteps[t_start:].numpy() + + timestep_dtype = next( + (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" + ) + timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] + + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ + 0 + ] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler_output = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ) + latents = scheduler_output.prev_sample.numpy() + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + latents = 1 / 0.18215 * latents + # image = self.vae_decoder(latent_sample=latents)[0] + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor( + self.numpy_to_pil(image), return_tensors="np" + ).pixel_values.astype(image.dtype) + # safety_checker does not support batched inputs yet + images, has_nsfw_concept = [], [] + for i in range(image.shape[0]): + image_i, has_nsfw_concept_i = self.safety_checker( + clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] + ) + 
images.append(image_i) + has_nsfw_concept.append(has_nsfw_concept_i[0]) + image = np.concatenate(images) + else: + has_nsfw_concept = None + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py new file mode 100644 index 000000000..18d805082 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py @@ -0,0 +1,563 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from ...utils import PIL_INTERPOLATION, deprecate, logging +from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +NUM_UNET_INPUT_CHANNELS = 9 +NUM_LATENT_CHANNELS = 4 + + +def prepare_mask_and_masked_image(image, mask, latents_shape): + image = np.array(image.convert("RGB").resize((latents_shape[1] * 8, latents_shape[0] * 8))) + image = image[None].transpose(0, 3, 1, 2) + image = image.astype(np.float32) / 127.5 - 1.0 + + image_mask = np.array(mask.convert("L").resize((latents_shape[1] * 8, latents_shape[0] * 8))) + masked_image = image * (image_mask < 127.5) + + mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"]) + mask = np.array(mask.convert("L")) + mask = mask.astype(np.float32) / 255.0 + mask = mask[None, None] + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + return mask, masked_image + + +class OnnxStableDiffusionInpaintPipeline(DiffusionPipeline): + r""" + Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. 
Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + vae_encoder: OnnxRuntimeModel + vae_decoder: OnnxRuntimeModel + text_encoder: OnnxRuntimeModel + tokenizer: CLIPTokenizer + unet: OnnxRuntimeModel + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler] + safety_checker: OnnxRuntimeModel + feature_extractor: CLIPImageProcessor + + _optional_components = ["safety_checker", "feature_extractor"] + _is_onnx = True + + def __init__( + self, + vae_encoder: OnnxRuntimeModel, + vae_decoder: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: CLIPTokenizer, + unet: OnnxRuntimeModel, + scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], + safety_checker: OnnxRuntimeModel, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + logger.info("`OnnxStableDiffusionInpaintPipeline` is experimental and will very likely change in the future.") + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: Optional[int], + do_classifier_free_guidance: bool, + negative_prompt: Optional[str], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
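For the inpainting inputs themselves, the `prepare_mask_and_masked_image` helper earlier in this file returns a binarized mask at latent resolution (1.0 where the region should be repainted) together with the pixel-space image with the masked region zeroed out. A small sketch of its behaviour, assuming that helper is in scope and using a made-up 512x512 white image with a white square as the mask:

    import PIL.Image

    image = PIL.Image.new("RGB", (512, 512), "white")
    mask = PIL.Image.new("L", (512, 512), 0)
    mask.paste(255, (128, 128, 384, 384))   # white square marks the area to repaint

    mask_arr, masked_image = prepare_mask_and_masked_image(image, mask, latents_shape=(64, 64))
    # mask_arr.shape     == (1, 1, 64, 64), values in {0.0, 1.0}
    # masked_image.shape == (1, 3, 512, 512), zeroed inside the square, 1.0 elsewhere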
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids + + if not np.array_equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] * batch_size + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + if do_classifier_free_guidance: + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt: Union[str, List[str]], + height: Optional[int], + width: Optional[int], + callback_steps: int, + negative_prompt: Optional[str] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + image: PIL.Image.Image, + mask_image: PIL.Image.Image, + height: Optional[int] = 512, + width: Optional[int] = 512, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[np.random.RandomState] = None, + latents: Optional[np.ndarray] = None, + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, np.ndarray], None]] = None, + callback_steps: int = 1, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`np.random.RandomState`, *optional*): + A np.random.RandomState to make generation deterministic. + latents (`np.ndarray`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + # check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + num_channels_latents = NUM_LATENT_CHANNELS + latents_shape = (batch_size * num_images_per_prompt, num_channels_latents, height // 8, width // 8) + latents_dtype = prompt_embeds.dtype + if latents is None: + latents = generator.randn(*latents_shape).astype(latents_dtype) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + + # prepare mask and masked_image + mask, masked_image = prepare_mask_and_masked_image(image, mask_image, latents_shape[-2:]) + mask = mask.astype(latents.dtype) + masked_image = masked_image.astype(latents.dtype) + + masked_image_latents = self.vae_encoder(sample=masked_image)[0] + masked_image_latents = 0.18215 * masked_image_latents + + # duplicate mask and masked_image_latents for each generation per prompt + mask = mask.repeat(batch_size * num_images_per_prompt, 0) + masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0) + + mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + + unet_input_channels = NUM_UNET_INPUT_CHANNELS + if num_channels_latents + num_channels_mask + num_channels_masked_image != unet_input_channels: + raise ValueError( + "Incorrect configuration settings! The config of `pipeline.unet` expects" + f" {unet_input_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + + # set timesteps + self.scheduler.set_timesteps(num_inference_steps) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * np.float64(self.scheduler.init_noise_sigma) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
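The channel check above encodes the inpainting UNet's expected input layout: NUM_LATENT_CHANNELS noisy latent channels, one mask channel, and NUM_LATENT_CHANNELS masked-image latent channels (4 + 1 + 4 = 9 = NUM_UNET_INPUT_CHANNELS). A runnable shape sketch of how the loop below assembles that input, using dummy arrays for batch_size=1, num_images_per_prompt=2, 64x64 latents, and classifier-free guidance enabled:

    import numpy as np

    latents = np.zeros((2, 4, 64, 64), dtype=np.float32)                        # batch * images_per_prompt
    mask = np.zeros((1, 1, 64, 64), dtype=np.float32).repeat(2, 0)              # (2, 1, 64, 64)
    masked_image_latents = np.zeros((1, 4, 64, 64), dtype=np.float32).repeat(2, 0)
    mask = np.concatenate([mask] * 2)                                            # doubled for CFG
    masked_image_latents = np.concatenate([masked_image_latents] * 2)
    latent_model_input = np.concatenate([latents] * 2)                           # (4, 4, 64, 64)
    unet_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1)
    print(unet_input.shape)  # (4, 9, 64, 64)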
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + timestep_dtype = next( + (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" + ) + timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] + + for i, t in enumerate(self.progress_bar(self.scheduler.timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + # concat latents, mask, masked_image_latnets in the channel dimension + latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) + latent_model_input = latent_model_input.cpu().numpy() + latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1) + + # predict the noise residual + timestep = np.array([t], dtype=timestep_dtype) + noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ + 0 + ] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + scheduler_output = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ) + latents = scheduler_output.prev_sample.numpy() + + # call the callback, if provided + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + latents = 1 / 0.18215 * latents + # image = self.vae_decoder(latent_sample=latents)[0] + # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 + image = np.concatenate( + [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] + ) + + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor( + self.numpy_to_pil(image), return_tensors="np" + ).pixel_values.astype(image.dtype) + # safety_checker does not support batched inputs yet + images, has_nsfw_concept = [], [] + for i in range(image.shape[0]): + image_i, has_nsfw_concept_i = self.safety_checker( + clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] + ) + images.append(image_i) + has_nsfw_concept.append(has_nsfw_concept_i[0]) + image = np.concatenate(images) + else: + has_nsfw_concept = None + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py new file mode 100644 index 000000000..58d83de0d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py @@ -0,0 +1,586 @@ +# 
Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers +from ...utils import deprecate, logging +from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) + + +def preprocess(image): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 32 + + image = [np.array(i.resize((w, h)))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + return image + + +class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline): + vae: OnnxRuntimeModel + text_encoder: OnnxRuntimeModel + tokenizer: CLIPTokenizer + unet: OnnxRuntimeModel + low_res_scheduler: DDPMScheduler + scheduler: KarrasDiffusionSchedulers + safety_checker: OnnxRuntimeModel + feature_extractor: CLIPImageProcessor + + _optional_components = ["safety_checker", "feature_extractor"] + _is_onnx = True + + def __init__( + self, + vae: OnnxRuntimeModel, + text_encoder: OnnxRuntimeModel, + tokenizer: Any, + unet: OnnxRuntimeModel, + low_res_scheduler: DDPMScheduler, + scheduler: KarrasDiffusionSchedulers, + safety_checker: Optional[OnnxRuntimeModel] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, + max_noise_level: int = 350, + num_latent_channels=4, + num_unet_input_channels=7, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + low_res_scheduler=low_res_scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.register_to_config( + max_noise_level=max_noise_level, + num_latent_channels=num_latent_channels, + num_unet_input_channels=num_unet_input_channels, + ) + + def check_inputs( + self, + prompt: Union[str, List[str]], + image, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, np.ndarray) + and not isinstance(image, list) + ): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}" + ) + + # verify batch size of prompt and image are same if image is a list or tensor or numpy array + if isinstance(image, list) or isinstance(image, np.ndarray): + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if isinstance(image, list): + image_batch_size = len(image) + else: + image_batch_size = image.shape[0] + if batch_size != image_batch_size: + raise ValueError( + f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}." + " Please make sure that passed `prompt` matches the batch size of `image`." + ) + + # check noise level + if noise_level > self.config.max_noise_level: + raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): + shape = (batch_size, num_channels_latents, height, width) + if latents is None: + latents = generator.randn(*shape).astype(dtype) + elif latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + + return latents + + def decode_latents(self, latents): + latents = 1 / 0.08333 * latents + image = self.vae(latent_sample=latents)[0] + image = np.clip(image / 2 + 0.5, 0, 1) + image = image.transpose((0, 2, 3, 1)) + return image + + def _encode_prompt( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: Optional[int], + do_classifier_free_guidance: bool, + negative_prompt: Optional[str], + prompt_embeds: Optional[np.ndarray] = None, + negative_prompt_embeds: Optional[np.ndarray] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
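`check_inputs` above also caps `noise_level` at `config.max_noise_level` (350 by default in this file). Later in `__call__`, that value is used as a timestep for `low_res_scheduler.add_noise`, so it controls how strongly the low-resolution conditioning image is degraded before upscaling. A hedged sketch of the standard DDPM noising rule such a scheduler applies (the actual alpha-cumprod buffers live inside `DDPMScheduler`):

    import numpy as np

    def ddpm_add_noise(x0: np.ndarray, noise: np.ndarray, alpha_bar_t: float) -> np.ndarray:
        # x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
        return np.sqrt(alpha_bar_t) * x0 + np.sqrt(1.0 - alpha_bar_t) * noise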
+ + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`np.ndarray`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`np.ndarray`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids + + if not np.array_equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] + + prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] * batch_size + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="np", + ) + negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] + + if do_classifier_free_guidance: + negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) + + # For classifier free guidance, we need to do two forward passes. 
+        # Here we concatenate the unconditional and text embeddings into a single batch
+        # to avoid doing two forward passes
+        prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        image: Union[np.ndarray, PIL.Image.Image, List[PIL.Image.Image]],
+        num_inference_steps: int = 75,
+        guidance_scale: float = 9.0,
+        noise_level: int = 20,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[np.random.RandomState, List[np.random.RandomState]]] = None,
+        latents: Optional[np.ndarray] = None,
+        prompt_embeds: Optional[np.ndarray] = None,
+        negative_prompt_embeds: Optional[np.ndarray] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
+        callback_steps: Optional[int] = 1,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            image (`np.ndarray` or `PIL.Image.Image`):
+                `Image`, or tensor representing an image batch, that will be used as the starting point for the
+                process.
+            num_inference_steps (`int`, *optional*, defaults to 75):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 9.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages the model to generate images that are closely linked to the
+                text `prompt`, usually at the expense of lower image quality.
+            noise_level (`int`, *optional*, defaults to 20):
+                Determines the amount of noise to add to the initial image before performing upscaling.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`np.random.RandomState`, *optional*):
+                A `np.random.RandomState` to make generation deterministic.
+            latents (`np.ndarray`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`np.ndarray`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting.
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: np.ndarray)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + + # 1. Check inputs + self.check_inputs( + prompt, + image, + noise_level, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if generator is None: + generator = np.random + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds = self._encode_prompt( + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + latents_dtype = prompt_embeds.dtype + image = preprocess(image).cpu().numpy() + height, width = image.shape[2:] + + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + self.num_latent_channels, + height, + width, + latents_dtype, + generator, + ) + image = image.astype(latents_dtype) + + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + + # Scale the initial noise by the standard deviation required by the scheduler + latents = latents * np.float64(self.scheduler.init_noise_sigma) + + # 5. Add noise to image + noise_level = np.array([noise_level]).astype(np.int64) + noise = generator.randn(*image.shape).astype(latents_dtype) + + image = self.low_res_scheduler.add_noise( + torch.from_numpy(image), torch.from_numpy(noise), torch.from_numpy(noise_level) + ) + image = image.numpy() + + batch_multiplier = 2 if do_classifier_free_guidance else 1 + image = np.concatenate([image] * batch_multiplier * num_images_per_prompt) + noise_level = np.concatenate([noise_level] * image.shape[0]) + + # 7. 
Check that sizes of image and latents match + num_channels_image = image.shape[1] + if self.num_latent_channels + num_channels_image != self.num_unet_input_channels: + raise ValueError( + "Incorrect configuration settings! The config of `pipeline.unet` expects" + f" {self.num_unet_input_channels} but received `num_channels_latents`: {self.num_latent_channels} +" + f" `num_channels_image`: {num_channels_image} " + f" = {self.num_latent_channels + num_channels_image}. Please verify the config of" + " `pipeline.unet` or your `image` input." + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + timestep_dtype = next( + (input.type for input in self.unet.model.get_inputs() if input.name == "timestep"), "tensor(float)" + ) + timestep_dtype = ORT_TO_NP_TYPE[timestep_dtype] + + # 9. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = np.concatenate([latent_model_input, image], axis=1) + + # timestep to tensor + timestep = np.array([t], dtype=timestep_dtype) + + # predict the noise residual + noise_pred = self.unet( + sample=latent_model_input, + timestep=timestep, + encoder_hidden_states=prompt_embeds, + class_labels=noise_level, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs + ).prev_sample + latents = latents.numpy() + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 10. 
Post-processing + image = self.decode_latents(latents) + + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor( + self.numpy_to_pil(image), return_tensors="np" + ).pixel_values.astype(image.dtype) + + images, has_nsfw_concept = [], [] + for i in range(image.shape[0]): + image_i, has_nsfw_concept_i = self.safety_checker( + clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] + ) + images.append(image_i) + has_nsfw_concept.append(has_nsfw_concept_i[0]) + image = np.concatenate(images) + else: + has_nsfw_concept = None + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py new file mode 100644 index 000000000..5fb9b1a14 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py @@ -0,0 +1,45 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput, is_flax_available + + +@dataclass +class StableDiffusionPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or + `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +if is_flax_available(): + import flax + + @flax.struct.dataclass + class FlaxStableDiffusionPipelineOutput(BaseOutput): + """ + Output class for Flax-based Stable Diffusion pipelines. + + Args: + images (`np.ndarray`): + Denoised images of array shape of `(batch_size, height, width, num_channels)`. + nsfw_content_detected (`List[bool]`): + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content + or `None` if safety checking could not be performed. + """ + + images: np.ndarray + nsfw_content_detected: List[bool] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py new file mode 100644 index 000000000..9e4e6c186 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -0,0 +1,1032 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...configuration_utils import FrozenDict +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
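+        image_encoder ([`~transformers.CLIPVisionModelWithProjection`], *optional*):
+            A CLIP vision model used to compute image embeddings for IP-Adapter inputs.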
+    """
+
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
+    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
+    _exclude_from_cpu_offload = ["safety_checker"]
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: KarrasDiffusionSchedulers,
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPImageProcessor,
+        image_encoder: CLIPVisionModelWithProjection = None,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend keeping the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
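+        Returns a tuple `(prompt_embeds, negative_prompt_embeds)`; `negative_prompt_embeds` is `None` when
+        classifier free guidance is disabled and no negative embeddings are passed in.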
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. 
Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. + prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
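+    # A UNet that takes a guidance embedding (`time_cond_proj_dim` set, e.g. an LCM-distilled UNet)
+    # receives the guidance weight via `timestep_cond` instead, so the property below also reports
+    # False in that case.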
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. 
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + # to deal with lora scaling and other possible forward hooks + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) + else None + ) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py new file mode 100644 index 000000000..c410acbed --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -0,0 +1,860 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
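+#
+# Illustrative usage sketch (assumes the `stabilityai/stable-diffusion-2-depth` checkpoint and a
+# PIL `init_image` supplied by the caller):
+#
+#     import torch
+#     from diffusers import StableDiffusionDepth2ImgPipeline
+#
+#     pipe = StableDiffusionDepth2ImgPipeline.from_pretrained(
+#         "stabilityai/stable-diffusion-2-depth", torch_dtype=torch.float16
+#     ).to("cuda")
+#     image = pipe(prompt="two tigers", image=init_image, strength=0.7).images[0]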
+ +import contextlib +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation + +from ...configuration_utils import FrozenDict +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import PIL_INTERPOLATION, USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): + r""" + Pipeline for text-guided depth-based image-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "depth_mask"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + depth_estimator: DPTForDepthEstimation, + feature_extractor: DPTFeatureExtractor, + ): + super().__init__() + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64, which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + depth_estimator=depth_estimator, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( +
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now being duplicated to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype, device): + if isinstance(image, PIL.Image.Image): + image = [image] + else: + image = list(image) + + if isinstance(image[0], PIL.Image.Image): + width, height = image[0].size + elif isinstance(image[0], np.ndarray): + width, height = image[0].shape[:-1] + else: + height, width = image[0].shape[-2:] + + if depth_map is None: + pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values + pixel_values = pixel_values.to(device=device) + # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16. + # So we use `torch.autocast` here for half precision inference. + context_manager = torch.autocast("cuda", dtype=dtype) if device.type == "cuda" else contextlib.nullcontext() + with context_manager: + depth_map = self.depth_estimator(pixel_values).predicted_depth + else: + depth_map = depth_map.to(device=device, dtype=dtype) + + depth_map = torch.nn.functional.interpolate( + depth_map.unsqueeze(1), + size=(height // self.vae_scale_factor, width // self.vae_scale_factor), + mode="bicubic", + align_corners=False, + ) + + # normalize the depth map to [-1, 1] per sample via min-max scaling + depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True) + depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True) + depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 + depth_map = depth_map.to(dtype) + + # duplicate depth map for each generation per prompt, using mps friendly method + if depth_map.shape[0] < batch_size: + repeat_by = batch_size // depth_map.shape[0] + depth_map = depth_map.repeat(repeat_by, 1, 1, 1) + + depth_map = torch.cat([depth_map] * 2) if do_classifier_free_guidance else depth_map + return depth_map + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance.
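+ # In the denoising loop below this amounts to
+ #     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+ # so any `guidance_scale` <= 1 disables classifier free guidance (see `do_classifier_free_guidance`).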
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + depth_map: Optional[torch.FloatTensor] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be used as the starting point. Can accept image + latents as `image` only if `depth_map` is not `None`. + depth_map (`torch.FloatTensor`, *optional*): + Depth prediction to be used as additional conditioning for the image generation process. If not + defined, it automatically predicts the depth with `self.depth_estimator`. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + Examples: + + ```py + >>> import torch + >>> import requests + >>> from PIL import Image + + >>> from diffusers import StableDiffusionDepth2ImgPipeline + + >>> pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-2-depth", + ... torch_dtype=torch.float16, + ... ) + >>> pipe.to("cuda") + + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> init_image = Image.open(requests.get(url, stream=True).raw) + >>> prompt = "two tigers" + >>> n_prompt = "bad, deformed, ugly, bad anatomy" + >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0] + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images.
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 1. Check inputs + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare depth mask + depth_mask = self.prepare_depth_map( + image, + depth_map, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + prompt_embeds.dtype, + device, + ) + + # 5. Preprocess image + image = self.image_processor.preprocess(image) + + # 6. Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 7. Prepare latent variables + latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + depth_mask = callback_outputs.pop("depth_mask", depth_mask) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py new file mode 100644 index 000000000..afd872904 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -0,0 +1,420 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
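+# Overview (descriptive note): this pipeline swaps text conditioning for image conditioning. `_encode_image`
+# embeds the input image with the CLIP vision encoder and feeds the result to the UNet as
+# `encoder_hidden_states`; for classifier free guidance the unconditional branch is an all-zeros embedding
+# rather than an empty-prompt text embedding.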
+ +import inspect +from typing import Callable, List, Optional, Union + +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class StableDiffusionImageVariationPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline to generate image variations from an input image using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): + Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + # TODO: feature_extractor is required to encode images (if they are in PIL format), + # we should give a descriptive message if the pipeline doesn't have one. + _optional_components = ["safety_checker"] + model_cpu_offload_seq = "image_encoder->unet->vae" + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + image_encoder: CLIPVisionModelWithProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. 
Both the diffusers team and Hugging Face" + " strongly recommend keeping the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64, which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + image_encoder=image_encoder, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(images=image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeddings = self.image_encoder(image).image_embeds + image_embeddings = image_embeddings.unsqueeze(1) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_images_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + negative_prompt_embeds = torch.zeros_like(image_embeddings) + + # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_prompt_embeds, image_embeddings]) + + return image_embeddings + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs(self, image, height, width, callback_steps): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + Image or images to guide image generation. If you provide a tensor, it needs to be compatible with + [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + + Examples: + + ```py + from diffusers import StableDiffusionImageVariationPipeline + from PIL import Image + from io import BytesIO + import requests + + pipe = StableDiffusionImageVariationPipeline.from_pretrained( + "lambdalabs/sd-image-variations-diffusers", revision="v2.0" + ) + pipe = pipe.to("cuda") + + url = "https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200" + + response = requests.get(url) + image = Image.open(BytesIO(response.content)).convert("RGB") + + out = pipe(image, num_images_per_prompt=3, guidance_scale=15) + out["images"][0].save("result.jpg") + ``` + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(image, height, width, callback_steps) + + # 2. Define call parameters + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, list): + batch_size = len(image) + else: + batch_size = image.shape[0] + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input image + image_embeddings = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + image_embeddings.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=image_embeddings).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + self.maybe_free_model_hooks() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, image_embeddings.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py new file mode 100644 index 000000000..b43e0eb2a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -0,0 +1,1113 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
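+# Overview (descriptive note): image-to-image works by encoding the init image into VAE latents, adding
+# scheduler noise for a start timestep derived from `strength` (higher strength -> more noise), and then
+# running only the remaining ~strength * num_inference_steps denoising steps; see `get_timesteps` and
+# `prepare_latents` later in this file.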
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...configuration_utils import FrozenDict +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + PIL_INTERPOLATION, + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionImg2ImgPipeline + + >>> device = "cuda" + >>> model_id_or_path = "runwayml/stable-diffusion-v1-5" + >>> pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) + >>> pipe = pipe.to(device) + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + + >>> response = requests.get(url) + >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> init_image = init_image.resize((768, 512)) + + >>> prompt = "A fantasy landscape, trending on artstation" + + >>> images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images + >>> images[0].save("fantasy_landscape.png") + ``` +""" + + +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) 
instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionImg2ImgPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-guided image-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
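For reference, a standalone re-implementation (illustrative only, plain PyTorch) of the sinusoidal mapping computed by `get_guidance_scale_embedding` above. When the UNet exposes `time_cond_proj_dim`, the pipeline conditions on `guidance_scale - 1` embedded this way instead of running classifier-free guidance.

```py
import torch

def guidance_scale_embedding(w: torch.Tensor, embedding_dim: int = 512) -> torch.Tensor:
    # Same math as the method above: scale w by 1000, then sin/cos at log-spaced frequencies.
    w = w * 1000.0
    half_dim = embedding_dim // 2
    freqs = torch.exp(-torch.log(torch.tensor(10000.0)) * torch.arange(half_dim) / (half_dim - 1))
    emb = w[:, None].float() * freqs[None, :]
    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:  # zero pad odd embedding dimensions
        emb = torch.nn.functional.pad(emb, (0, 1))
    return emb

# e.g. guidance_scale = 7.5 for a batch of two prompts -> shape (2, 256)
emb = guidance_scale_embedding(torch.tensor([7.5 - 1.0, 7.5 - 1.0]), embedding_dim=256)
assert emb.shape == (2, 256)
```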
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + timesteps: List[int] = None, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. 
+ negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. 
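As a small worked example (numbers chosen for illustration) of the `strength` arithmetic described above: it mirrors the `get_timesteps` method earlier in this file, which trims the schedule so that only the tail of the requested steps is actually run.

```py
num_inference_steps, strength, order = 50, 0.6, 1  # assumed values; order is scheduler.order

init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 30
t_start = max(num_inference_steps - init_timestep, 0)                          # 20
# timesteps = scheduler.timesteps[t_start * order:]  -> 30 denoising steps are executed,
# so lower strength keeps the result closer to the input image.
print(init_timestep, t_start)  # 30 20
```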
+ Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + + # 5. set timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py new file mode 100644 index 000000000..221d5c2cf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -0,0 +1,1430 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...configuration_utils import FrozenDict +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AsymmetricAutoencoderKL, AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False): + """ + Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be + converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the + ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). 
+ + Returns: + tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. + """ + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. Please use VaeImageProcessor.preprocess instead" + deprecate( + "prepare_mask_and_masked_image", + "0.30.0", + deprecation_message, + ) + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + masked_image = image * (mask < 0.5) + + # n.b. 
ensure backwards compatibility as old function does not return image + if return_image: + return mask, masked_image, image + + return mask, masked_image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionInpaintPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-guided image inpainting using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
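Before the rest of the inpainting pipeline details, a minimal numeric sketch (plain PyTorch, arbitrary values) of the mask convention used by the deprecated `prepare_mask_and_masked_image` helper above: the mask is binarized at 0.5 and the masked image zeroes out exactly the region to be inpainted.

```py
import torch

image = torch.rand(1, 3, 4, 4) * 2 - 1         # image already normalized to [-1, 1]
mask = (torch.rand(1, 1, 4, 4) > 0.5).float()  # binarized mask: 1 = inpaint, 0 = keep

masked_image = image * (mask < 0.5)            # keep only the pixels outside the mask
assert masked_image[(mask >= 0.5).expand_as(image)].abs().sum() == 0
```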
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds", "mask", "masked_image_latents"] + + def __init__( + self, + vae: Union[AutoencoderKL, AsymmetricAutoencoderKL], + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration" + " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. 
Please make" + " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to" + " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face" + " Hub, it would be very nice if you could open a Pull request for the" + " `scheduler/scheduler_config.json` file" + ) + deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["skip_prk_steps"] = True + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4 + if unet.config.in_channels != 9: + logger.info(f"You have loaded a UNet with {unet.config.in_channels} input channels which.") + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
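+ # For example, `clip_skip=1` selects `hidden_states[-2]`, the output of the
+ # penultimate encoder layer; it is then normalized by `final_layer_norm` below.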
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + mask_image, + height, + width, + strength, + callback_steps, + output_type, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + padding_mask_crop=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + if padding_mask_crop is not None: + if not isinstance(image, PIL.Image.Image): + raise ValueError( + f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}." + ) + if not isinstance(mask_image, PIL.Image.Image): + raise ValueError( + f"The mask image should be a PIL image when inpainting mask crop, but is of type" + f" {type(mask_image)}." + ) + if output_type != "pil": + raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + image_latents = image + else: + image_latents = self._encode_vae_image(image=image, generator=generator) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + else: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + masked_image = masked_image.to(device=device, dtype=dtype) + + if masked_image.shape[1] == 4: + masked_image_latents = masked_image + else: + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.repeat(batch_size // masked_image_latents.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + return mask, masked_image_latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
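+ # Concretely, the denoising loop below combines the two predictions as
+ # `noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)`,
+ # which for `guidance_scale = 1` is just the conditional prediction, i.e. no guidance.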
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + masked_image_latents: torch.FloatTensor = None, + height: Optional[int] = None, + width: Optional[int] = None, + padding_mask_crop: Optional[int] = None, + strength: float = 1.0, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be inpainted (which parts of the image to + be masked out with `mask_image` and repainted according to `prompt`). For both numpy array and pytorch + tensor, the expected value range is between `[0, 1]` If it's a tensor or a list or tensors, the + expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a list of arrays, the + expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image latents as `image`, but + if passing latents directly it is not encoded again. + mask_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to mask `image`. White pixels in the mask + are repainted while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a numpy array or pytorch tensor, it should contain one + color channel (L) instead of 3, so the expected shape for pytorch tensor would be `(B, 1, H, W)`, `(B, + H, W)`, `(1, H, W)`, `(H, W)`. And for numpy array would be for `(B, H, W, 1)`, `(B, H, W)`, `(H, W, + 1)`, or `(H, W)`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. 
+ padding_mask_crop (`int`, *optional*, defaults to `None`):
+ The size of the margin in the crop to be applied to the image and mask_image. If `None`, no crop is applied to the image and mask_image. If
+ `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that
+ contains all of the masked area, and then expand that region based on `padding_mask_crop`. The image and mask_image will then be cropped based on
+ the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
+ and contains information irrelevant for inpainting, such as the background.
+ strength (`float`, *optional*, defaults to 1.0):
+ Indicates the extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
+ starting point and more noise is added the higher the `strength`. The number of denoising steps depends
+ on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
+ process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
+ essentially ignores `image`.
+ num_inference_steps (`int`, *optional*, defaults to 50):
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+ expense of slower inference. This parameter is modulated by `strength`.
+ timesteps (`List[int]`, *optional*):
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+ passed will be used. Must be in descending order.
+ guidance_scale (`float`, *optional*, defaults to 7.5):
+ A higher guidance scale value encourages the model to generate images closely linked to the text
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+ negative_prompt (`str` or `List[str]`, *optional*):
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
+ The number of images to generate per prompt.
+ eta (`float`, *optional*, defaults to 0.0):
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+ generation deterministic.
+ latents (`torch.FloatTensor`, *optional*):
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+ tensor is generated by sampling using the supplied random `generator`.
+ prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+ provided, text embeddings are generated from the `prompt` input argument.
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. 
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + Examples: + + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionInpaintPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" + >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + + >>> init_image = download_image(img_url).resize((512, 512)) + >>> mask_image = download_image(mask_url).resize((512, 512)) + + >>> pipe = StableDiffusionInpaintPipeline.from_pretrained( + ... "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16 + ... 
) + >>> pipe = pipe.to("cuda") + + >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs + self.check_inputs( + prompt, + image, + mask_image, + height, + width, + strength, + callback_steps, + output_type, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + padding_mask_crop, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
set timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps=num_inference_steps, strength=strength, device=device + ) + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 + + # 5. Preprocess mask and image + + if padding_mask_crop is not None: + crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" + + original_image = image + init_image = self.image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ) + init_image = init_image.to(dtype=torch.float32) + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + + latents_outputs = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, + ) + + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + + # 7. Prepare mask latent variables + mask_condition = self.mask_processor.preprocess( + mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + + if masked_image_latents is None: + masked_image = init_image * (mask_condition < 0.5) + else: + masked_image = masked_image_latents + + mask, masked_image_latents = self.prepare_mask_latents( + mask_condition, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + prompt_embeds.dtype, + device, + generator, + self.do_classifier_free_guidance, + ) + + # 8. Check that sizes of mask, masked image and latents match + if num_channels_unet == 9: + # default case for runwayml/stable-diffusion-inpainting + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." 
+ ) + elif num_channels_unet != 4: + raise ValueError( + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." + ) + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 9.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 10. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if num_channels_unet == 4: + init_latents_proper = image_latents + if self.do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + mask = callback_outputs.pop("mask", mask) + masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, 
"order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + condition_kwargs = {} + if isinstance(self.vae, AsymmetricAutoencoderKL): + init_image = init_image.to(device=device, dtype=masked_image_latents.dtype) + init_image_condition = init_image.clone() + init_image = self._encode_vae_image(init_image, generator=generator) + mask_condition = mask_condition.to(device=device, dtype=masked_image_latents.dtype) + condition_kwargs = {"image": init_image_condition, "mask": mask_condition} + image = self.vae.decode( + latents / self.vae.config.scaling_factor, return_dict=False, generator=generator, **condition_kwargs + )[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + if padding_mask_crop is not None: + image = [self.image_processor.apply_overlay(mask_image, original_image, i, crops_coords) for i in image] + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py new file mode 100644 index 000000000..cbb6ed4fa --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -0,0 +1,807 @@ +# Copyright 2024 The InstructPix2Pix Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import PIL_INTERPOLATION, deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . 
import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +class StableDiffusionInstructPix2PixPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin +): + r""" + Pipeline for pixel-level image editing by following text instructions (based on Stable Diffusion). + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. 
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "image_latents"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + num_inference_steps: int = 100, + guidance_scale: float = 7.5, + image_guidance_scale: float = 1.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. 
+ image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be repainted according to `prompt`. Can also accept + image latents as `image`, but if passing latents directly it is not encoded again. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + image_guidance_scale (`float`, *optional*, defaults to 1.5): + Push the generated image towards the inital `image`. Image guidance scale is enabled by setting + `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely + linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a + value of at least `1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. 
+ callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionInstructPix2PixPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png" + + >>> image = download_image(img_url).resize((512, 512)) + + >>> pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained( + ... "timbrooks/instruct-pix2pix", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "make the mountains snowy" + >>> image = pipe(prompt=prompt, image=image).images[0] + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Check inputs + self.check_inputs( + prompt, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + self._guidance_scale = guidance_scale + self._image_guidance_scale = image_guidance_scale + + device = self._execution_device + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if self.do_classifier_free_guidance: + image_embeds = torch.cat([image_embeds, negative_image_embeds, negative_image_embeds]) + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + # 1. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 2. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 3. Preprocess image + image = self.image_processor.preprocess(image) + + # 4. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare Image latents + image_latents = self.prepare_image_latents( + image, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + self.do_classifier_free_guidance, + ) + + height, width = image_latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 7. Check that shapes of latents and image match the UNet channels + num_channels_image = image_latents.shape[1] + if num_channels_latents + num_channels_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_image`: {num_channels_image} " + f" = {num_channels_latents+num_channels_image}. Please verify the config of" + " `pipeline.unet` or your `image` input." + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 9. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Expand the latents if we are doing classifier free guidance. + # The latents are expanded 3 times because for pix2pix the guidance\ + # is applied for both the text and the input image. 
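+ # A rough sketch of the three-way classifier-free guidance used here: with the chunk
+ # order [text+image, image-only, unconditional] produced by `_encode_prompt` and
+ # `prepare_image_latents`, the combination computed further down in this loop is
+ #   noise_pred = e_uncond
+ #                + guidance_scale       * (e_text_image - e_image_only)
+ #                + image_guidance_scale * (e_image_only - e_uncond)
+ # so `guidance_scale` pulls the sample towards the edit instruction while
+ # `image_guidance_scale` pulls it towards the input image.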
+ latent_model_input = torch.cat([latents] * 3) if self.do_classifier_free_guidance else latents + + # concat latents, image_latents in the channel dimension + scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1) + + # predict the noise residual + noise_pred = self.unet( + scaled_latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + self.guidance_scale * (noise_pred_text - noise_pred_image) + + self.image_guidance_scale * (noise_pred_image - noise_pred_uncond) + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + image_latents = callback_outputs.pop("image_latents", image_latents) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_ prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + else: + prompt_embeds_dtype = self.unet.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + # pix2pix has two negative embeddings, and unlike in other pipelines latents are ordered [prompt_embeds, negative_prompt_embeds, negative_prompt_embeds] + prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_image_latents( + self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + image_latents = image + else: + image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax") + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + if do_classifier_free_guidance: + uncond_image_latents = torch.zeros_like(image_latents) + image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0) + + return image_latents + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def image_guidance_scale(self): + return self._image_guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self.guidance_scale > 1.0 and self.image_guidance_scale >= 1.0 diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py new file mode 100644 index 000000000..918dffe51 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -0,0 +1,495 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPTextModel, CLIPTokenizer + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import EulerDiscreteScheduler +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.preprocess +def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 + + image = [np.array(i.resize((w, h)))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMixin, FromSingleFileMixin): + r""" + Pipeline for upscaling Stable Diffusion output image resolution by a factor of 2. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A [`EulerDiscreteScheduler`] to be used in combination with `unet` to denoise the encoded image latents. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: EulerDiscreteScheduler, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic") + + def _encode_prompt(self, prompt, device, do_classifier_free_guidance, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `list(int)`): + prompt to be encoded + device: (`torch.device`): + torch device + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + """ + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_length=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_encoder_out = self.text_encoder( + text_input_ids.to(device), + output_hidden_states=True, + ) + text_embeddings = text_encoder_out.hidden_states[-1] + text_pooler_out = text_encoder_out.pooler_output + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. 
Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_length=True, + return_tensors="pt", + ) + + uncond_encoder_out = self.text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + + uncond_embeddings = uncond_encoder_out.hidden_states[-1] + uncond_pooler_out = uncond_encoder_out.pooler_output + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + text_pooler_out = torch.cat([uncond_pooler_out, text_pooler_out]) + + return text_embeddings, text_pooler_out + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def check_inputs(self, prompt, image, callback_steps): + if not isinstance(prompt, str) and not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" + ) + + # verify batch size of prompt and image are same if image is a list or tensor + if isinstance(image, list) or isinstance(image, torch.Tensor): + if isinstance(prompt, str): + batch_size = 1 + else: + batch_size = len(prompt) + if isinstance(image, list): + image_batch_size = len(image) + else: + image_batch_size = image.shape[0] if image.ndim == 4 else 1 + if batch_size != image_batch_size: + raise ValueError( + f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}." + " Please make sure that passed `prompt` matches the batch size of `image`." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height, width) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + image: PipelineImageInput = None, + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image upscaling. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be upscaled. If it's a tensor, it can be either a + latent output from a Stable Diffusion model or an image tensor in the range `[-1, 1]`. It is considered + a `latent` if `image.shape[1]` is `4`; otherwise, it is considered to be an image representation and + encoded using this pipeline's `vae` encoder. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + + Examples: + ```py + >>> from diffusers import StableDiffusionLatentUpscalePipeline, StableDiffusionPipeline + >>> import torch + + + >>> pipeline = StableDiffusionPipeline.from_pretrained( + ... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ... ) + >>> pipeline.to("cuda") + + >>> model_id = "stabilityai/sd-x2-latent-upscaler" + >>> upscaler = StableDiffusionLatentUpscalePipeline.from_pretrained(model_id, torch_dtype=torch.float16) + >>> upscaler.to("cuda") + + >>> prompt = "a photo of an astronaut high resolution, unreal engine, ultra realistic" + >>> generator = torch.manual_seed(33) + + >>> low_res_latents = pipeline(prompt, generator=generator, output_type="latent").images + + >>> with torch.no_grad(): + ... image = pipeline.decode_latents(low_res_latents) + >>> image = pipeline.numpy_to_pil(image)[0] + + >>> image.save("../images/a1.png") + + >>> upscaled_image = upscaler( + ... prompt=prompt, + ... image=low_res_latents, + ... num_inference_steps=20, + ... guidance_scale=0, + ... generator=generator, + ... ).images[0] + + >>> upscaled_image.save("../images/a2.png") + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images. + """ + + # 1. Check inputs + self.check_inputs(prompt, image, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + if guidance_scale == 0: + prompt = [""] * batch_size + + # 3. Encode input prompt + text_embeddings, text_pooler_out = self._encode_prompt( + prompt, device, do_classifier_free_guidance, negative_prompt + ) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + image = image.to(dtype=text_embeddings.dtype, device=device) + if image.shape[1] == 3: + # encode image if not in latent-space yet + image = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + batch_multiplier = 2 if do_classifier_free_guidance else 1 + image = image[None, :] if image.ndim == 3 else image + image = torch.cat([image] * batch_multiplier) + + # 5. Add noise to image (set to be 0): + # (see below notes from the author): + # "the This step theoretically can make the model work better on out-of-distribution inputs, but mostly just seems to make it match the input less, so it's turned off by default." 
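+ # With `noise_level` fixed at 0.0 below, `inv_noise_level = (0**2 + 1) ** -0.5` is 1,
+ # so `image_cond` is effectively the input latents upsampled 2x with nearest-neighbor
+ # interpolation. The 128-dim `noise_level_embed` (64 ones followed by 64 zeros) is then
+ # concatenated with the pooled CLIP text embedding to form `timestep_condition`,
+ # which is passed to the UNet via `timestep_cond`.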
+ noise_level = torch.tensor([0.0], dtype=torch.float32, device=device) + noise_level = torch.cat([noise_level] * image.shape[0]) + inv_noise_level = (noise_level**2 + 1) ** (-0.5) + + image_cond = F.interpolate(image, scale_factor=2, mode="nearest") * inv_noise_level[:, None, None, None] + image_cond = image_cond.to(text_embeddings.dtype) + + noise_level_embed = torch.cat( + [ + torch.ones(text_pooler_out.shape[0], 64, dtype=text_pooler_out.dtype, device=device), + torch.zeros(text_pooler_out.shape[0], 64, dtype=text_pooler_out.dtype, device=device), + ], + dim=1, + ) + + timestep_condition = torch.cat([noise_level_embed, text_pooler_out], dim=1) + + # 6. Prepare latent variables + height, width = image.shape[2:] + num_channels_latents = self.vae.config.latent_channels + latents = self.prepare_latents( + batch_size, + num_channels_latents, + height * 2, # 2x upscale + width * 2, + text_embeddings.dtype, + device, + generator, + latents, + ) + + # 7. Check that sizes of image and latents match + num_channels_image = image.shape[1] + if num_channels_latents + num_channels_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_image`: {num_channels_image} " + f" = {num_channels_latents+num_channels_image}. Please verify the config of" + " `pipeline.unet` or your `image` input." + ) + + # 9. Denoising loop + num_warmup_steps = 0 + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + sigma = self.scheduler.sigmas[i] + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + scaled_model_input = torch.cat([scaled_model_input, image_cond], dim=1) + # preconditioning parameter based on Karras et al. (2022) (table 1) + timestep = torch.log(sigma) * 0.25 + + noise_pred = self.unet( + scaled_model_input, + timestep, + encoder_hidden_states=text_embeddings, + timestep_cond=timestep_condition, + ).sample + + # in original repo, the output contains a variance channel that's not used + noise_pred = noise_pred[:, :-1] + + # apply preconditioning, based on table 1 in Karras et al. 
(2022) + inv_sigma = 1 / (sigma**2 + 1) + noise_pred = inv_sigma * latent_model_input + self.scheduler.scale_model_input(sigma, t) * noise_pred + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py new file mode 100644 index 000000000..2d04cf41d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -0,0 +1,808 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import warnings +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers +from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 64 for x in (w, h)) # resize to integer multiple of 64 + + image = [np.array(i.resize((w, h)))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +class StableDiffusionUpscalePipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): + r""" + Pipeline for text-guided image super-resolution using Stable Diffusion 2. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + low_res_scheduler ([`SchedulerMixin`]): + A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of + [`DDPMScheduler`]. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["watermarker", "safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + low_res_scheduler: DDPMScheduler, + scheduler: KarrasDiffusionSchedulers, + safety_checker: Optional[Any] = None, + feature_extractor: Optional[CLIPImageProcessor] = None, + watermarker: Optional[Any] = None, + max_noise_level: int = 350, + ): + super().__init__() + + if hasattr( + vae, "config" + ): # check if vae has a config attribute `scaling_factor` and if it is set to 0.08333, else set it to 0.08333 and deprecate + is_vae_scaling_factor_set_to_0_08333 = ( + hasattr(vae.config, "scaling_factor") and vae.config.scaling_factor == 0.08333 + ) + if not is_vae_scaling_factor_set_to_0_08333: + deprecation_message = ( + "The configuration file of the vae does not contain `scaling_factor` or it is set to" + f" {vae.config.scaling_factor}, which seems highly unlikely. 
If your checkpoint is a fine-tuned" + " version of `stabilityai/stable-diffusion-x4-upscaler` you should change 'scaling_factor' to" + " 0.08333 Please make sure to update the config accordingly, as not doing so might lead to" + " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging" + " Face Hub, it would be very nice if you could open a Pull Request for the `vae/config.json` file" + ) + deprecate("wrong scaling_factor", "1.0.0", deprecation_message, standard_warn=False) + vae.register_to_config(scaling_factor=0.08333) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + low_res_scheduler=low_res_scheduler, + scheduler=scheduler, + safety_checker=safety_checker, + watermarker=watermarker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic") + self.register_to_config(max_noise_level=max_noise_level) + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, nsfw_detected, watermark_detected = self.safety_checker( + images=image, + clip_input=safety_checker_input.pixel_values.to(dtype=dtype), + ) + else: + nsfw_detected = None + watermark_detected = None + + if hasattr(self, "unet_offload_hook") and self.unet_offload_hook is not None: + self.unet_offload_hook.offload() + + return image, nsfw_detected, watermark_detected + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. 
Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. + prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def check_inputs( + self, + prompt, + image, + noise_level, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, np.ndarray) + and not isinstance(image, list) + ): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}" + ) + + # verify batch size of prompt and image are same if image is a list or tensor or numpy array + if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray): + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if isinstance(image, list): + image_batch_size = len(image) + else: + image_batch_size = image.shape[0] + if batch_size != image_batch_size: + raise ValueError( + f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}." + " Please make sure that passed `prompt` matches the batch size of `image`." + ) + + # check noise level + if noise_level > self.config.max_noise_level: + raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height, width) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + num_inference_steps: int = 75, + guidance_scale: float = 9.0, + noise_level: int = 20, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be upscaled. + num_inference_steps (`int`, *optional*, defaults to 75): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 9.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + ```py + >>> import requests + >>> from PIL import Image + >>> from io import BytesIO + >>> from diffusers import StableDiffusionUpscalePipeline + >>> import torch + + >>> # load model and scheduler + >>> model_id = "stabilityai/stable-diffusion-x4-upscaler" + >>> pipeline = StableDiffusionUpscalePipeline.from_pretrained( + ... model_id, revision="fp16", torch_dtype=torch.float16 + ... 
) + >>> pipeline = pipeline.to("cuda") + + >>> # let's download an image + >>> url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png" + >>> response = requests.get(url) + >>> low_res_img = Image.open(BytesIO(response.content)).convert("RGB") + >>> low_res_img = low_res_img.resize((128, 128)) + >>> prompt = "a white cat" + + >>> upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0] + >>> upscaled_image.save("upsampled_cat.png") + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + # 1. Check inputs + self.check_inputs( + prompt, + image, + noise_level, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + image = image.to(dtype=prompt_embeds.dtype, device=device) + + # 5. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Add noise to image + noise_level = torch.tensor([noise_level], dtype=torch.long, device=device) + noise = randn_tensor(image.shape, generator=generator, device=device, dtype=prompt_embeds.dtype) + image = self.low_res_scheduler.add_noise(image, noise, noise_level) + + batch_multiplier = 2 if do_classifier_free_guidance else 1 + image = torch.cat([image] * batch_multiplier * num_images_per_prompt) + noise_level = torch.cat([noise_level] * image.shape[0]) + + # 6. Prepare latent variables + height, width = image.shape[2:] + num_channels_latents = self.vae.config.latent_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 7. 
Check that sizes of image and latents match + num_channels_image = image.shape[1] + if num_channels_latents + num_channels_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_image`: {num_channels_image} " + f" = {num_channels_latents+num_channels_image}. Please verify the config of" + " `pipeline.unet` or your `image` input." + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = torch.cat([latent_model_input, image], dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=noise_level, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + + # Ensure latents are always the same type as the VAE + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # 11. 
Apply watermark + if output_type == "pil" and self.watermarker is not None: + image = self.watermarker.apply_watermark(image) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py new file mode 100644 index 000000000..c62e0f4ec --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -0,0 +1,932 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer +from transformers.models.clip.modeling_clip import CLIPTextModelOutput + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel +from ...models.embeddings import get_timestep_embedding +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin +from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableUnCLIPPipeline + + >>> pipe = StableUnCLIPPipeline.from_pretrained( + ... "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16 + ... ) # TODO update model path + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> images = pipe(prompt).images + >>> images[0].save("astronaut_horse.png") + ``` +""" + + +class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): + """ + Pipeline for text-to-image generation using stable unCLIP. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + prior_tokenizer ([`CLIPTokenizer`]): + A [`CLIPTokenizer`]. + prior_text_encoder ([`CLIPTextModelWithProjection`]): + Frozen [`CLIPTextModelWithProjection`] text-encoder. + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + prior_scheduler ([`KarrasDiffusionSchedulers`]): + Scheduler used in the prior denoising process. + image_normalizer ([`StableUnCLIPImageNormalizer`]): + Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image + embeddings after the noise has been applied. + image_noising_scheduler ([`KarrasDiffusionSchedulers`]): + Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined + by the `noise_level`. + tokenizer ([`CLIPTokenizer`]): + A [`CLIPTokenizer`]. + text_encoder ([`CLIPTextModel`]): + Frozen [`CLIPTextModel`] text-encoder. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] to denoise the encoded image latents. + scheduler ([`KarrasDiffusionSchedulers`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + """ + + _exclude_from_cpu_offload = ["prior", "image_normalizer"] + model_cpu_offload_seq = "text_encoder->prior_text_encoder->unet->vae" + + # prior components + prior_tokenizer: CLIPTokenizer + prior_text_encoder: CLIPTextModelWithProjection + prior: PriorTransformer + prior_scheduler: KarrasDiffusionSchedulers + + # image noising components + image_normalizer: StableUnCLIPImageNormalizer + image_noising_scheduler: KarrasDiffusionSchedulers + + # regular denoising components + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModel + unet: UNet2DConditionModel + scheduler: KarrasDiffusionSchedulers + + vae: AutoencoderKL + + def __init__( + self, + # prior components + prior_tokenizer: CLIPTokenizer, + prior_text_encoder: CLIPTextModelWithProjection, + prior: PriorTransformer, + prior_scheduler: KarrasDiffusionSchedulers, + # image noising components + image_normalizer: StableUnCLIPImageNormalizer, + image_noising_scheduler: KarrasDiffusionSchedulers, + # regular denoising components + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModelWithProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + # vae + vae: AutoencoderKL, + ): + super().__init__() + + self.register_modules( + prior_tokenizer=prior_tokenizer, + prior_text_encoder=prior_text_encoder, + prior=prior, + prior_scheduler=prior_scheduler, + image_normalizer=image_normalizer, + image_noising_scheduler=image_noising_scheduler, + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + vae=vae, + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline._encode_prompt with _encode_prompt->_encode_prior_prompt, tokenizer->prior_tokenizer, text_encoder->prior_text_encoder + def 
_encode_prior_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[torch.Tensor] = None, + ): + if text_model_output is None: + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.prior_tokenizer( + prompt, + padding="max_length", + max_length=self.prior_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.prior_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.prior_tokenizer.batch_decode( + untruncated_ids[:, self.prior_tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.prior_tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.prior_tokenizer.model_max_length] + + prior_text_encoder_output = self.prior_text_encoder(text_input_ids.to(device)) + + prompt_embeds = prior_text_encoder_output.text_embeds + text_enc_hid_states = prior_text_encoder_output.last_hidden_state + + else: + batch_size = text_model_output[0].shape[0] + prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1] + text_mask = text_attention_mask + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens = [""] * batch_size + + uncond_input = self.prior_tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.prior_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_prior_text_encoder_output = self.prior_text_encoder( + uncond_input.input_ids.to(device) + ) + + negative_prompt_embeds = negative_prompt_embeds_prior_text_encoder_output.text_embeds + uncond_text_enc_hid_states = negative_prompt_embeds_prior_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_enc_hid_states.shape[1] + uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1) + uncond_text_enc_hid_states = uncond_text_enc_hid_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_enc_hid_states, text_mask + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs with prepare_extra_step_kwargs->prepare_prior_extra_step_kwargs, scheduler->prior_scheduler + def prepare_prior_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the prior_scheduler step, since not all prior_schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other prior_schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the prior_scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + noise_level, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two." + ) + + if prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined." + ) + + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: + raise ValueError( + f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." + ) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def noise_image_embeddings( + self, + image_embeds: torch.Tensor, + noise_level: int, + noise: Optional[torch.FloatTensor] = None, + generator: Optional[torch.Generator] = None, + ): + """ + Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher + `noise_level` increases the variance in the final un-noised images. + + The noise is applied in two ways: + 1. A noise schedule is applied directly to the embeddings. + 2. A vector of sinusoidal time embeddings are appended to the output. + + In both cases, the amount of noise is controlled by the same `noise_level`. + + The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. + """ + if noise is None: + noise = randn_tensor( + image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype + ) + + noise_level = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device) + + self.image_normalizer.to(image_embeds.device) + image_embeds = self.image_normalizer.scale(image_embeds) + + image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) + + image_embeds = self.image_normalizer.unscale(image_embeds) + + noise_level = get_timestep_embedding( + timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0 + ) + + # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, + # but we might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. 
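+ # `noise_level` is now a sinusoidal embedding whose width equals `image_embeds.shape[-1]`, so the + # concatenation below doubles the feature dimension of the returned image embeddings.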
+ noise_level = noise_level.to(image_embeds.dtype) + + image_embeds = torch.cat((image_embeds, noise_level), 1) + + return image_embeds + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + # regular denoising process args + prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 20, + guidance_scale: float = 10.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + # prior args + prior_num_inference_steps: int = 25, + prior_guidance_scale: float = 4.0, + prior_latents: Optional[torch.FloatTensor] = None, + clip_skip: Optional[int] = None, + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. 
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + noise_level (`int`, *optional*, defaults to `0`): + The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in + the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details. + prior_num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps in the prior denoising process. More denoising steps usually lead to a + higher quality image at the expense of slower inference. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + prior_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + embedding generation in the prior denoising process. Can be used to tweak the same generation with + different prompts. If not provided, a latents tensor is generated by sampling using the supplied random + `generator`. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning + a tuple, the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + height=height, + width=width, + callback_steps=callback_steps, + noise_level=noise_level, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + prior_do_classifier_free_guidance = prior_guidance_scale > 1.0 + + # 3. Encode input prompt + prior_prompt_embeds, prior_text_encoder_hidden_states, prior_text_mask = self._encode_prior_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=prior_do_classifier_free_guidance, + ) + + # 4. Prepare prior timesteps + self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) + prior_timesteps_tensor = self.prior_scheduler.timesteps + + # 5. Prepare prior latent variables + embedding_dim = self.prior.config.embedding_dim + prior_latents = self.prepare_latents( + (batch_size, embedding_dim), + prior_prompt_embeds.dtype, + device, + generator, + prior_latents, + self.prior_scheduler, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + prior_extra_step_kwargs = self.prepare_prior_extra_step_kwargs(generator, eta) + + # 7. Prior denoising loop + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([prior_latents] * 2) if prior_do_classifier_free_guidance else prior_latents + latent_model_input = self.prior_scheduler.scale_model_input(latent_model_input, t) + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prior_prompt_embeds, + encoder_hidden_states=prior_text_encoder_hidden_states, + attention_mask=prior_text_mask, + ).predicted_image_embedding + + if prior_do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + prior_latents = self.prior_scheduler.step( + predicted_image_embedding, + timestep=t, + sample=prior_latents, + **prior_extra_step_kwargs, + return_dict=False, + )[0] + + if callback is not None and i % callback_steps == 0: + callback(i, t, prior_latents) + + prior_latents = self.prior.post_process_latents(prior_latents) + + image_embeds = prior_latents + + # done prior + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 8. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 9. Prepare image embeddings + image_embeds = self.noise_image_embeddings( + image_embeds=image_embeds, + noise_level=noise_level, + generator=generator, + ) + + if do_classifier_free_guidance: + negative_prompt_embeds = torch.zeros_like(image_embeds) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeds = torch.cat([negative_prompt_embeds, image_embeds]) + + # 10. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 11. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + latents = self.prepare_latents( + shape=shape, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + scheduler=self.scheduler, + ) + + # 12. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 13. 
Denoising loop + for i, t in enumerate(self.progress_bar(timesteps)): + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=image_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py new file mode 100644 index 000000000..9b85d9e6b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -0,0 +1,839 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.embeddings import get_timestep_embedding +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin +from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from diffusers import StableUnCLIPImg2ImgPipeline + + >>> pipe = StableUnCLIPImg2ImgPipeline.from_pretrained( + ... "fusing/stable-unclip-2-1-l-img2img", torch_dtype=torch.float16 + ... ) # TODO update model path + >>> pipe = pipe.to("cuda") + + >>> url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" + + >>> response = requests.get(url) + >>> init_image = Image.open(BytesIO(response.content)).convert("RGB") + >>> init_image = init_image.resize((768, 512)) + + >>> prompt = "A fantasy landscape, trending on artstation" + + >>> images = pipe(prompt, init_image).images + >>> images[0].save("fantasy_landscape.png") + ``` +""" + + +class StableUnCLIPImg2ImgPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): + """ + Pipeline for text-guided image-to-image generation using stable unCLIP. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + feature_extractor ([`CLIPImageProcessor`]): + Feature extractor for image pre-processing before being encoded. + image_encoder ([`CLIPVisionModelWithProjection`]): + CLIP vision model for encoding images. + image_normalizer ([`StableUnCLIPImageNormalizer`]): + Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image + embeddings after the noise has been applied. + image_noising_scheduler ([`KarrasDiffusionSchedulers`]): + Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined + by the `noise_level`. + tokenizer (`~transformers.CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`)]. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen [`~transformers.CLIPTextModel`] text-encoder. + unet ([`UNet2DConditionModel`]): + A [`UNet2DConditionModel`] to denoise the encoded image latents. 
+ scheduler ([`KarrasDiffusionSchedulers`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _exclude_from_cpu_offload = ["image_normalizer"] + + # image encoding components + feature_extractor: CLIPImageProcessor + image_encoder: CLIPVisionModelWithProjection + + # image noising components + image_normalizer: StableUnCLIPImageNormalizer + image_noising_scheduler: KarrasDiffusionSchedulers + + # regular denoising components + tokenizer: CLIPTokenizer + text_encoder: CLIPTextModel + unet: UNet2DConditionModel + scheduler: KarrasDiffusionSchedulers + + vae: AutoencoderKL + + def __init__( + self, + # image encoding components + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + # image noising components + image_normalizer: StableUnCLIPImageNormalizer, + image_noising_scheduler: KarrasDiffusionSchedulers, + # regular denoising components + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + # vae + vae: AutoencoderKL, + ): + super().__init__() + + self.register_modules( + feature_extractor=feature_extractor, + image_encoder=image_encoder, + image_normalizer=image_normalizer, + image_noising_scheduler=image_noising_scheduler, + tokenizer=tokenizer, + text_encoder=text_encoder, + unet=unet, + scheduler=scheduler, + vae=vae, + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + def _encode_image( + self, + image, + device, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance, + noise_level, + generator, + image_embeds, + ): + dtype = next(self.image_encoder.parameters()).dtype + + if isinstance(image, PIL.Image.Image): + # the image embedding should repeated so it matches the total batch size of the prompt + repeat_by = batch_size + else: + # assume the image input is already properly batched and just needs to be repeated so + # it matches the num_images_per_prompt. 
+ # + # NOTE(will) this is probably missing a few number of side cases. I.e. batched/non-batched + # `image_embeds`. If those happen to be common use cases, let's think harder about + # what the expected dimensions of inputs should be and how we handle the encoding. + repeat_by = num_images_per_prompt + + if image_embeds is None: + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(images=image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeds = self.image_encoder(image).image_embeds + + image_embeds = self.noise_image_embeddings( + image_embeds=image_embeds, + noise_level=noise_level, + generator=generator, + ) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + image_embeds = image_embeds.unsqueeze(1) + bs_embed, seq_len, _ = image_embeds.shape + image_embeds = image_embeds.repeat(1, repeat_by, 1) + image_embeds = image_embeds.view(bs_embed * repeat_by, seq_len, -1) + image_embeds = image_embeds.squeeze(1) + + if do_classifier_free_guidance: + negative_prompt_embeds = torch.zeros_like(image_embeds) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeds = torch.cat([negative_prompt_embeds, image_embeds]) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + height, + width, + callback_steps, + noise_level, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + image_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two." + ) + + if prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt` and `negative_prompt_embeds` undefined." + ) + + if prompt is not None and negative_prompt is not None: + if type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps: + raise ValueError( + f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1}, inclusive." + ) + + if image is not None and image_embeds is not None: + raise ValueError( + "Provide either `image` or `image_embeds`. Please make sure to define only one of the two." + ) + + if image is None and image_embeds is None: + raise ValueError( + "Provide either `image` or `image_embeds`. Cannot leave both `image` and `image_embeds` undefined." + ) + + if image is not None: + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_unclip.StableUnCLIPPipeline.noise_image_embeddings + def noise_image_embeddings( + self, + image_embeds: torch.Tensor, + noise_level: int, + noise: Optional[torch.FloatTensor] = None, + generator: Optional[torch.Generator] = None, + ): + """ + Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher + `noise_level` increases the variance in the final un-noised images. + + The noise is applied in two ways: + 1. A noise schedule is applied directly to the embeddings. + 2. A vector of sinusoidal time embeddings are appended to the output. + + In both cases, the amount of noise is controlled by the same `noise_level`. + + The embeddings are normalized before the noise is applied and un-normalized after the noise is applied. + """ + if noise is None: + noise = randn_tensor( + image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype + ) + + noise_level = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device) + + self.image_normalizer.to(image_embeds.device) + image_embeds = self.image_normalizer.scale(image_embeds) + + image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise) + + image_embeds = self.image_normalizer.unscale(image_embeds) + + noise_level = get_timestep_embedding( + timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0 + ) + + # `get_timestep_embeddings` does not contain any weights and will always return f32 tensors, + # but we might actually be running in fp16. so we need to cast here. 
+ # there might be better ways to encapsulate this. + noise_level = noise_level.to(image_embeds.dtype) + + image_embeds = torch.cat((image_embeds, noise_level), 1) + + return image_embeds + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 20, + guidance_scale: float = 10, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + noise_level: int = 0, + image_embeds: Optional[torch.FloatTensor] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, either `prompt_embeds` will be + used or prompt is initialized to `""`. + image (`torch.FloatTensor` or `PIL.Image.Image`): + `Image` or tensor representing an image batch. The image is encoded to its CLIP embedding which the + `unet` is conditioned on. The image is _not_ encoded by the `vae` and then used as the latents in the + denoising process like it is in the standard Stable Diffusion text-guided image variation process. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 20): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 10.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. 
If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + noise_level (`int`, *optional*, defaults to `0`): + The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in + the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details. + image_embeds (`torch.FloatTensor`, *optional*): + Pre-generated CLIP embeddings to condition the `unet` on. These latents are not used in the denoising + process. If you want to provide pre-generated latents, pass them to `__call__` as `latents`. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning + a tuple, the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + if prompt is None and prompt_embeds is None: + prompt = len(image) * [""] if isinstance(image, list) else "" + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + image=image, + height=height, + width=width, + callback_steps=callback_steps, + noise_level=noise_level, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + image_embeds=image_embeds, + ) + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + batch_size = batch_size * num_images_per_prompt + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Encoder input image + noise_level = torch.tensor([noise_level], device=device) + image_embeds = self._encode_image( + image=image, + device=device, + batch_size=batch_size, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + noise_level=noise_level, + generator=generator, + image_embeds=image_embeds, + ) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size=batch_size, + num_channels_latents=num_channels_latents, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. Denoising loop + for i, t in enumerate(self.progress_bar(timesteps)): + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + class_labels=image_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 9. 
Post-processing + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py new file mode 100644 index 000000000..6cc4d26f2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py @@ -0,0 +1,125 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import torch +import torch.nn as nn +from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel + +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +def cosine_distance(image_embeds, text_embeds): + normalized_image_embeds = nn.functional.normalize(image_embeds) + normalized_text_embeds = nn.functional.normalize(text_embeds) + return torch.mm(normalized_image_embeds, normalized_text_embeds.t()) + + +class StableDiffusionSafetyChecker(PreTrainedModel): + config_class = CLIPConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + self.vision_model = CLIPVisionModel(config.vision_config) + self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False) + + self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False) + self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False) + + self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False) + self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False) + + @torch.no_grad() + def forward(self, clip_input, images): + pooled_output = self.vision_model(clip_input)[1] # pooled_output + image_embeds = self.visual_projection(pooled_output) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy() + cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy() + + result = [] + batch_size = image_embeds.shape[0] + for i in range(batch_size): + result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []} + + # increase this value to create a stronger `nfsw` filter + # at the cost of increasing the possibility of filtering benign images + adjustment = 0.0 + + for concept_idx in range(len(special_cos_dist[0])): + concept_cos 
= special_cos_dist[i][concept_idx] + concept_threshold = self.special_care_embeds_weights[concept_idx].item() + result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) + if result_img["special_scores"][concept_idx] > 0: + result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) + adjustment = 0.01 + + for concept_idx in range(len(cos_dist[0])): + concept_cos = cos_dist[i][concept_idx] + concept_threshold = self.concept_embeds_weights[concept_idx].item() + result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) + if result_img["concept_scores"][concept_idx] > 0: + result_img["bad_concepts"].append(concept_idx) + + result.append(result_img) + + has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result] + + for idx, has_nsfw_concept in enumerate(has_nsfw_concepts): + if has_nsfw_concept: + if torch.is_tensor(images) or torch.is_tensor(images[0]): + images[idx] = torch.zeros_like(images[idx]) # black image + else: + images[idx] = np.zeros(images[idx].shape) # black image + + if any(has_nsfw_concepts): + logger.warning( + "Potential NSFW content was detected in one or more images. A black image will be returned instead." + " Try again with a different prompt and/or seed." + ) + + return images, has_nsfw_concepts + + @torch.no_grad() + def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + pooled_output = self.vision_model(clip_input)[1] # pooled_output + image_embeds = self.visual_projection(pooled_output) + + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) + cos_dist = cosine_distance(image_embeds, self.concept_embeds) + + # increase this value to create a stronger `nsfw` filter + # at the cost of increasing the possibility of filtering benign images + adjustment = 0.0 + + special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment + # special_scores = special_scores.round(decimals=3) + special_care = torch.any(special_scores > 0, dim=1) + special_adjustment = special_care * 0.01 + special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1]) + + concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment + # concept_scores = concept_scores.round(decimals=3) + has_nsfw_concepts = torch.any(concept_scores > 0, dim=1) + + images[has_nsfw_concepts] = 0.0 # black image + + return images, has_nsfw_concepts diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py new file mode 100644 index 000000000..571a4f2d7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py @@ -0,0 +1,112 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import jax +import jax.numpy as jnp +from flax import linen as nn +from flax.core.frozen_dict import FrozenDict +from transformers import CLIPConfig, FlaxPreTrainedModel +from transformers.models.clip.modeling_flax_clip import FlaxCLIPVisionModule + + +def jax_cosine_distance(emb_1, emb_2, eps=1e-12): + norm_emb_1 = jnp.divide(emb_1.T, jnp.clip(jnp.linalg.norm(emb_1, axis=1), a_min=eps)).T + norm_emb_2 = jnp.divide(emb_2.T, jnp.clip(jnp.linalg.norm(emb_2, axis=1), a_min=eps)).T + return jnp.matmul(norm_emb_1, norm_emb_2.T) + + +class FlaxStableDiffusionSafetyCheckerModule(nn.Module): + config: CLIPConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.vision_model = FlaxCLIPVisionModule(self.config.vision_config) + self.visual_projection = nn.Dense(self.config.projection_dim, use_bias=False, dtype=self.dtype) + + self.concept_embeds = self.param("concept_embeds", jax.nn.initializers.ones, (17, self.config.projection_dim)) + self.special_care_embeds = self.param( + "special_care_embeds", jax.nn.initializers.ones, (3, self.config.projection_dim) + ) + + self.concept_embeds_weights = self.param("concept_embeds_weights", jax.nn.initializers.ones, (17,)) + self.special_care_embeds_weights = self.param("special_care_embeds_weights", jax.nn.initializers.ones, (3,)) + + def __call__(self, clip_input): + pooled_output = self.vision_model(clip_input)[1] + image_embeds = self.visual_projection(pooled_output) + + special_cos_dist = jax_cosine_distance(image_embeds, self.special_care_embeds) + cos_dist = jax_cosine_distance(image_embeds, self.concept_embeds) + + # increase this value to create a stronger `nfsw` filter + # at the cost of increasing the possibility of filtering benign image inputs + adjustment = 0.0 + + special_scores = special_cos_dist - self.special_care_embeds_weights[None, :] + adjustment + special_scores = jnp.round(special_scores, 3) + is_special_care = jnp.any(special_scores > 0, axis=1, keepdims=True) + # Use a lower threshold if an image has any special care concept + special_adjustment = is_special_care * 0.01 + + concept_scores = cos_dist - self.concept_embeds_weights[None, :] + special_adjustment + concept_scores = jnp.round(concept_scores, 3) + has_nsfw_concepts = jnp.any(concept_scores > 0, axis=1) + + return has_nsfw_concepts + + +class FlaxStableDiffusionSafetyChecker(FlaxPreTrainedModel): + config_class = CLIPConfig + main_input_name = "clip_input" + module_class = FlaxStableDiffusionSafetyCheckerModule + + def __init__( + self, + config: CLIPConfig, + input_shape: Optional[Tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if input_shape is None: + input_shape = (1, 224, 224, 3) + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.Array, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensor + clip_input = jax.random.normal(rng, input_shape) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, clip_input)["params"] + + return random_params + + def __call__( + self, + clip_input, + params: dict = None, + ): + clip_input = jnp.transpose(clip_input, (0, 2, 3, 1)) + + return 
self.module.apply( + {"params": params or self.params}, + jnp.array(clip_input, dtype=jnp.float32), + rngs={}, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py new file mode 100644 index 000000000..3fc6b3a3f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py @@ -0,0 +1,57 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models.modeling_utils import ModelMixin + + +class StableUnCLIPImageNormalizer(ModelMixin, ConfigMixin): + """ + This class is used to hold the mean and standard deviation of the CLIP embedder used in stable unCLIP. + + It is used to normalize the image embeddings before the noise is applied and un-normalize the noised image + embeddings. + """ + + @register_to_config + def __init__( + self, + embedding_dim: int = 768, + ): + super().__init__() + + self.mean = nn.Parameter(torch.zeros(1, embedding_dim)) + self.std = nn.Parameter(torch.ones(1, embedding_dim)) + + def to( + self, + torch_device: Optional[Union[str, torch.device]] = None, + torch_dtype: Optional[torch.dtype] = None, + ): + self.mean = nn.Parameter(self.mean.to(torch_device).to(torch_dtype)) + self.std = nn.Parameter(self.std.to(torch_device).to(torch_dtype)) + return self + + def scale(self, embeds): + embeds = (embeds - self.mean) * 1.0 / self.std + return embeds + + def unscale(self, embeds): + embeds = (embeds * self.std) + self.mean + return embeds diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py new file mode 100644 index 000000000..cce556fce --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py @@ -0,0 +1,48 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + 
_import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py new file mode 100644 index 000000000..03c80b46b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py @@ -0,0 +1,1088 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import torch +from torch.nn import functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import Attention +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionAttendAndExcitePipeline + + >>> pipe = StableDiffusionAttendAndExcitePipeline.from_pretrained( + ... "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16 + ... 
).to("cuda") + + + >>> prompt = "a cat and a frog" + + >>> # use get_indices function to find out indices of the tokens you want to alter + >>> pipe.get_indices(prompt) + {0: '<|startoftext|>', 1: 'a', 2: 'cat', 3: 'and', 4: 'a', 5: 'frog', 6: '<|endoftext|>'} + + >>> token_indices = [2, 5] + >>> seed = 6141 + >>> generator = torch.Generator("cuda").manual_seed(seed) + + >>> images = pipe( + ... prompt=prompt, + ... token_indices=token_indices, + ... guidance_scale=7.5, + ... generator=generator, + ... num_inference_steps=50, + ... max_iter_to_alter=25, + ... ).images + + >>> image = images[0] + >>> image.save(f"../images/{prompt}_{seed}.png") + ``` +""" + + +class AttentionStore: + @staticmethod + def get_empty_store(): + return {"down": [], "mid": [], "up": []} + + def __call__(self, attn, is_cross: bool, place_in_unet: str): + if self.cur_att_layer >= 0 and is_cross: + if attn.shape[1] == np.prod(self.attn_res): + self.step_store[place_in_unet].append(attn) + + self.cur_att_layer += 1 + if self.cur_att_layer == self.num_att_layers: + self.cur_att_layer = 0 + self.between_steps() + + def between_steps(self): + self.attention_store = self.step_store + self.step_store = self.get_empty_store() + + def get_average_attention(self): + average_attention = self.attention_store + return average_attention + + def aggregate_attention(self, from_where: List[str]) -> torch.Tensor: + """Aggregates the attention across the different layers and heads at the specified resolution.""" + out = [] + attention_maps = self.get_average_attention() + for location in from_where: + for item in attention_maps[location]: + cross_maps = item.reshape(-1, self.attn_res[0], self.attn_res[1], item.shape[-1]) + out.append(cross_maps) + out = torch.cat(out, dim=0) + out = out.sum(0) / out.shape[0] + return out + + def reset(self): + self.cur_att_layer = 0 + self.step_store = self.get_empty_store() + self.attention_store = {} + + def __init__(self, attn_res): + """ + Initialize an empty AttentionStore :param step_index: used to visualize only a specific step in the diffusion + process + """ + self.num_att_layers = -1 + self.cur_att_layer = 0 + self.step_store = self.get_empty_store() + self.attention_store = {} + self.curr_step_index = 0 + self.attn_res = attn_res + + +class AttendExciteAttnProcessor: + def __init__(self, attnstore, place_in_unet): + super().__init__() + self.attnstore = attnstore + self.place_in_unet = place_in_unet + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + query = attn.to_q(hidden_states) + + is_cross = encoder_hidden_states is not None + encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + + # only need to store attention maps during the Attend and Excite process + if attention_probs.requires_grad: + self.attnstore(attention_probs, is_cross, self.place_in_unet) + + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = 
attn.to_out[1](hidden_states) + + return hidden_states + + +class StableDiffusionAttendAndExcitePipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion and Attend-and-Excite. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. 
A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
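+                # (Illustrative note: with `clip_skip=1` the indexing above picks
+                # `hidden_states[-2]`, i.e. the pre-final encoder layer, and the
+                # call below then re-applies `final_layer_norm` to that hidden
+                # state by hand, mirroring what `last_hidden_state` normally gets.)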
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + indices, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + indices_is_list_ints = isinstance(indices, list) and isinstance(indices[0], int) + indices_is_list_list_ints = ( + isinstance(indices, list) and isinstance(indices[0], list) and isinstance(indices[0][0], int) + ) + + if not indices_is_list_ints and not indices_is_list_list_ints: + raise TypeError("`indices` must be a list of ints or a list of a list of ints") + + if indices_is_list_ints: + indices_batch_size = 1 + elif indices_is_list_list_ints: + indices_batch_size = len(indices) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if indices_batch_size != prompt_batch_size: + raise ValueError( + f"indices batch size must be same as prompt batch size. indices batch size: {indices_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @staticmethod + def _compute_max_attention_per_index( + attention_maps: torch.Tensor, + indices: List[int], + ) -> List[torch.Tensor]: + """Computes the maximum attention value for each of the tokens we wish to alter.""" + attention_for_text = attention_maps[:, :, 1:-1] + attention_for_text *= 100 + attention_for_text = torch.nn.functional.softmax(attention_for_text, dim=-1) + + # Shift indices since we removed the first token + indices = [index - 1 for index in indices] + + # Extract the maximum values + max_indices_list = [] + for i in indices: + image = attention_for_text[:, :, i] + smoothing = GaussianSmoothing().to(attention_maps.device) + input = F.pad(image.unsqueeze(0).unsqueeze(0), (1, 1, 1, 1), mode="reflect") + image = smoothing(input).squeeze(0).squeeze(0) + max_indices_list.append(image.max()) + return max_indices_list + + def _aggregate_and_get_max_attention_per_token( + self, + indices: List[int], + ): + """Aggregates the attention for each token and computes the max activation value for each token to alter.""" + attention_maps = self.attention_store.aggregate_attention( + from_where=("up", "down", "mid"), + ) + max_attention_per_index = self._compute_max_attention_per_index( + attention_maps=attention_maps, + indices=indices, + ) + return max_attention_per_index + + @staticmethod + def _compute_loss(max_attention_per_index: List[torch.Tensor]) -> torch.Tensor: + """Computes the attend-and-excite loss using the maximum attention value for each token.""" + losses = [max(0, 1.0 - curr_max) for curr_max in max_attention_per_index] + loss = max(losses) + return loss + + @staticmethod + def _update_latent(latents: torch.Tensor, 
loss: torch.Tensor, step_size: float) -> torch.Tensor: + """Update the latent according to the computed loss.""" + grad_cond = torch.autograd.grad(loss.requires_grad_(True), [latents], retain_graph=True)[0] + latents = latents - step_size * grad_cond + return latents + + def _perform_iterative_refinement_step( + self, + latents: torch.Tensor, + indices: List[int], + loss: torch.Tensor, + threshold: float, + text_embeddings: torch.Tensor, + step_size: float, + t: int, + max_refinement_steps: int = 20, + ): + """ + Performs the iterative latent refinement introduced in the paper. Here, we continuously update the latent code + according to our loss objective until the given threshold is reached for all tokens. + """ + iteration = 0 + target_loss = max(0, 1.0 - threshold) + while loss > target_loss: + iteration += 1 + + latents = latents.clone().detach().requires_grad_(True) + self.unet(latents, t, encoder_hidden_states=text_embeddings).sample + self.unet.zero_grad() + + # Get max activation value for each subject token + max_attention_per_index = self._aggregate_and_get_max_attention_per_token( + indices=indices, + ) + + loss = self._compute_loss(max_attention_per_index) + + if loss != 0: + latents = self._update_latent(latents, loss, step_size) + + logger.info(f"\t Try {iteration}. loss: {loss}") + + if iteration >= max_refinement_steps: + logger.info(f"\t Exceeded max number of iterations ({max_refinement_steps})! ") + break + + # Run one more time but don't compute gradients and update the latents. + # We just need to compute the new loss - the grad update will occur below + latents = latents.clone().detach().requires_grad_(True) + _ = self.unet(latents, t, encoder_hidden_states=text_embeddings).sample + self.unet.zero_grad() + + # Get max activation value for each subject token + max_attention_per_index = self._aggregate_and_get_max_attention_per_token( + indices=indices, + ) + loss = self._compute_loss(max_attention_per_index) + logger.info(f"\t Finished with loss of: {loss}") + return loss, latents, max_attention_per_index + + def register_attention_control(self): + attn_procs = {} + cross_att_count = 0 + for name in self.unet.attn_processors.keys(): + if name.startswith("mid_block"): + place_in_unet = "mid" + elif name.startswith("up_blocks"): + place_in_unet = "up" + elif name.startswith("down_blocks"): + place_in_unet = "down" + else: + continue + + cross_att_count += 1 + attn_procs[name] = AttendExciteAttnProcessor(attnstore=self.attention_store, place_in_unet=place_in_unet) + + self.unet.set_attn_processor(attn_procs) + self.attention_store.num_att_layers = cross_att_count + + def get_indices(self, prompt: str) -> Dict[str, int]: + """Utility function to list the indices of the tokens you wish to alte""" + ids = self.tokenizer(prompt).input_ids + indices = {i: tok for tok, i in zip(self.tokenizer.convert_ids_to_tokens(ids), range(len(ids)))} + return indices + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + token_indices: Union[List[int], List[List[int]]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: 
Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + max_iter_to_alter: int = 25, + thresholds: dict = {0: 0.05, 10: 0.5, 20: 0.8}, + scale_factor: int = 20, + attn_res: Optional[Tuple[int]] = (16, 16), + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + token_indices (`List[int]`): + The token indices to alter with attend-and-excite. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. 
The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + max_iter_to_alter (`int`, *optional*, defaults to `25`): + Number of denoising steps to apply attend-and-excite. The `max_iter_to_alter` denoising steps are when + attend-and-excite is applied. For example, if `max_iter_to_alter` is `25` and there are a total of `30` + denoising steps, the first `25` denoising steps applies attend-and-excite and the last `5` will not. + thresholds (`dict`, *optional*, defaults to `{0: 0.05, 10: 0.5, 20: 0.8}`): + Dictionary defining the iterations and desired thresholds to apply iterative latent refinement in. + scale_factor (`int`, *optional*, default to 20): + Scale factor to control the step size of each attend-and-excite update. + attn_res (`tuple`, *optional*, default computed from width and height): + The 2D resolution of the semantic attention map. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + token_indices, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + if attn_res is None: + attn_res = int(np.ceil(width / 32)), int(np.ceil(height / 32)) + self.attention_store = AttentionStore(attn_res) + self.register_attention_control() + + # default config for step size from original repo + scale_range = np.linspace(1.0, 0.5, len(self.scheduler.timesteps)) + step_size = scale_factor * np.sqrt(scale_range) + + text_embeddings = ( + prompt_embeds[batch_size * num_images_per_prompt :] if do_classifier_free_guidance else prompt_embeds + ) + + if isinstance(token_indices[0], int): + token_indices = [token_indices] + + indices = [] + + for ind in token_indices: + indices = indices + [ind] * num_images_per_prompt + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Attend and excite process + with torch.enable_grad(): + latents = latents.clone().detach().requires_grad_(True) + updated_latents = [] + for latent, index, text_embedding in zip(latents, indices, text_embeddings): + # Forward pass of denoising with text conditioning + latent = latent.unsqueeze(0) + text_embedding = text_embedding.unsqueeze(0) + + self.unet( + latent, + t, + encoder_hidden_states=text_embedding, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + self.unet.zero_grad() + + # Get max activation value for each subject token + max_attention_per_index = self._aggregate_and_get_max_attention_per_token( + indices=index, + ) + + loss = self._compute_loss(max_attention_per_index=max_attention_per_index) + + # If this is an iterative refinement step, verify we have reached the desired threshold for all + if i in thresholds.keys() and loss > 1.0 - thresholds[i]: + loss, latent, max_attention_per_index = self._perform_iterative_refinement_step( + latents=latent, + indices=index, + loss=loss, + threshold=thresholds[i], + text_embeddings=text_embedding, + step_size=step_size[i], + t=t, + ) + + # Perform gradient update + if i < max_iter_to_alter: + if loss != 0: + latent = self._update_latent( + latents=latent, + loss=loss, + step_size=step_size[i], + ) + logger.info(f"Iteration {i} | Loss: {loss:0.4f}") + + updated_latents.append(latent) + + latents = torch.cat(updated_latents, dim=0) + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if 
do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Post-processing + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + +class GaussianSmoothing(torch.nn.Module): + """ + Arguments: + Apply gaussian smoothing on a 1d, 2d or 3d tensor. Filtering is performed seperately for each channel in the input + using a depthwise convolution. + channels (int, sequence): Number of channels of the input tensors. Output will + have this number of channels as well. + kernel_size (int, sequence): Size of the gaussian kernel. sigma (float, sequence): Standard deviation of the + gaussian kernel. dim (int, optional): The number of dimensions of the data. + Default value is 2 (spatial). + """ + + # channels=1, kernel_size=kernel_size, sigma=sigma, dim=2 + def __init__( + self, + channels: int = 1, + kernel_size: int = 3, + sigma: float = 0.5, + dim: int = 2, + ): + super().__init__() + + if isinstance(kernel_size, int): + kernel_size = [kernel_size] * dim + if isinstance(sigma, float): + sigma = [sigma] * dim + + # The gaussian kernel is the product of the + # gaussian function of each dimension. + kernel = 1 + meshgrids = torch.meshgrid([torch.arange(size, dtype=torch.float32) for size in kernel_size]) + for size, std, mgrid in zip(kernel_size, sigma, meshgrids): + mean = (size - 1) / 2 + kernel *= 1 / (std * math.sqrt(2 * math.pi)) * torch.exp(-(((mgrid - mean) / (2 * std)) ** 2)) + + # Make sure sum of values in gaussian kernel equals 1. + kernel = kernel / torch.sum(kernel) + + # Reshape to depthwise convolutional weight + kernel = kernel.view(1, 1, *kernel.size()) + kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1)) + + self.register_buffer("weight", kernel) + self.groups = channels + + if dim == 1: + self.conv = F.conv1d + elif dim == 2: + self.conv = F.conv2d + elif dim == 3: + self.conv = F.conv3d + else: + raise RuntimeError("Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim)) + + def forward(self, input): + """ + Arguments: + Apply gaussian filter to input. + input (torch.Tensor): Input to apply gaussian filter on. + Returns: + filtered (torch.Tensor): Filtered output. 
+ """ + return self.conv(input, weight=self.weight.to(input.dtype), groups=self.groups) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py new file mode 100644 index 000000000..e2145edb9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py @@ -0,0 +1,48 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_diffedit"] = ["StableDiffusionDiffEditPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_diffusion_diffedit import StableDiffusionDiffEditPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py new file mode 100644 index 000000000..4c90ce064 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py @@ -0,0 +1,1530 @@ +# Copyright 2024 DiffEdit Authors and Pix2Pix Zero Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...configuration_utils import FrozenDict +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import DDIMInverseScheduler, KarrasDiffusionSchedulers +from ...utils import ( + PIL_INTERPOLATION, + USE_PEFT_BACKEND, + BaseOutput, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class DiffEditInversionPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + latents (`torch.FloatTensor`) + inverted latents tensor + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `num_timesteps * batch_size` or numpy array of shape `(num_timesteps, + batch_size, height, width, num_channels)`. PIL images or numpy array present the denoised images of the + diffusion pipeline. + """ + + latents: torch.FloatTensor + images: Union[List[PIL.Image.Image], np.ndarray] + + +EXAMPLE_DOC_STRING = """ + + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionDiffEditPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png" + + >>> init_image = download_image(img_url).resize((768, 768)) + + >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.enable_model_cpu_offload() + + >>> mask_prompt = "A bowl of fruits" + >>> prompt = "A bowl of pears" + + >>> mask_image = pipe.generate_mask(image=init_image, source_prompt=prompt, target_prompt=mask_prompt) + >>> image_latents = pipe.invert(image=init_image, prompt=mask_prompt).latents + >>> image = pipe(prompt=prompt, mask_image=mask_image, image_latents=image_latents).images[0] + ``` +""" + +EXAMPLE_INVERT_DOC_STRING = """ + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionDiffEditPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://github.com/Xiang-cd/DiffEdit-stable-diffusion/raw/main/assets/origin.png" + + >>> init_image = download_image(img_url).resize((768, 768)) + + >>> pipe = StableDiffusionDiffEditPipeline.from_pretrained( + ... 
"stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.enable_model_cpu_offload() + + >>> prompt = "A bowl of fruits" + + >>> inverted_latents = pipe.invert(image=init_image, prompt=prompt).latents + ``` +""" + + +def auto_corr_loss(hidden_states, generator=None): + reg_loss = 0.0 + for i in range(hidden_states.shape[0]): + for j in range(hidden_states.shape[1]): + noise = hidden_states[i : i + 1, j : j + 1, :, :] + while True: + roll_amount = torch.randint(noise.shape[2] // 2, (1,), generator=generator).item() + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=2)).mean() ** 2 + reg_loss += (noise * torch.roll(noise, shifts=roll_amount, dims=3)).mean() ** 2 + + if noise.shape[2] <= 8: + break + noise = torch.nn.functional.avg_pool2d(noise, kernel_size=2) + return reg_loss + + +def kl_divergence(hidden_states): + return hidden_states.var() + hidden_states.mean() ** 2 - 1 - torch.log(hidden_states.var() + 1e-7) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + deprecation_message = "The preprocess method is deprecated and will be removed in diffusers 1.0.0. Please use VaeImageProcessor.preprocess(...) instead" + deprecate("preprocess", "1.0.0", deprecation_message, standard_warn=False) + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +def preprocess_mask(mask, batch_size: int = 1): + if not isinstance(mask, torch.Tensor): + # preprocess mask + if isinstance(mask, PIL.Image.Image) or isinstance(mask, np.ndarray): + mask = [mask] + + if isinstance(mask, list): + if isinstance(mask[0], PIL.Image.Image): + mask = [np.array(m.convert("L")).astype(np.float32) / 255.0 for m in mask] + if isinstance(mask[0], np.ndarray): + mask = np.stack(mask, axis=0) if mask[0].ndim < 3 else np.concatenate(mask, axis=0) + mask = torch.from_numpy(mask) + elif isinstance(mask[0], torch.Tensor): + mask = torch.stack(mask, dim=0) if mask[0].ndim < 3 else torch.cat(mask, dim=0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + # Check mask shape + if batch_size > 1: + if mask.shape[0] == 1: + mask = torch.cat([mask] * batch_size) + elif mask.shape[0] > 1 and mask.shape[0] != batch_size: + raise ValueError( + f"`mask_image` with batch size {mask.shape[0]} cannot be broadcasted to batch size {batch_size} " + f"inferred by prompt inputs" + ) + + if mask.shape[1] != 1: + raise 
ValueError(f"`mask_image` must have 1 channel, but has {mask.shape[1]} channels") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("`mask_image` should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + return mask + + +class StableDiffusionDiffEditPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): + r""" + + + This is an experimental feature! + + + + Pipeline for text-guided image inpainting using Stable Diffusion and DiffEdit. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading and saving methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + inverse_scheduler ([`DDIMInverseScheduler`]): + A scheduler to be used in combination with `unet` to fill in the unmasked part of the input latents. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "inverse_scheduler"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + inverse_scheduler: DDIMInverseScheduler, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "skip_prk_steps") and scheduler.config.skip_prk_steps is False: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration" + " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make" + " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to" + " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face" + " Hub, it would be very nice if you could open a Pull request for the" + " `scheduler/scheduler_config.json` file" + ) + deprecate("skip_prk_steps not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["skip_prk_steps"] = True + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + inverse_scheduler=inverse_scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
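+                # e.g. `clip_skip=1` -> `hidden_states[-2]`, the penultimate encoder layer;
+                # in general `clip_skip=N` picks the hidden state N layers before the last
+                # encoder layer (`hidden_states[-1]` being the final layer's output).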
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if (strength is None) or (strength is not None and (strength < 0 or strength > 1)): + raise ValueError( + f"The value of `strength` should in [0.0, 1.0] but is, but is {strength} of type {type(strength)}." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def check_source_inputs( + self, + source_prompt=None, + source_negative_prompt=None, + source_prompt_embeds=None, + source_negative_prompt_embeds=None, + ): + if source_prompt is not None and source_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `source_prompt`: {source_prompt} and `source_prompt_embeds`: {source_prompt_embeds}." + " Please make sure to only forward one of the two." + ) + elif source_prompt is None and source_prompt_embeds is None: + raise ValueError( + "Provide either `source_image` or `source_prompt_embeds`. Cannot leave all both of the arguments undefined." + ) + elif source_prompt is not None and ( + not isinstance(source_prompt, str) and not isinstance(source_prompt, list) + ): + raise ValueError(f"`source_prompt` has to be of type `str` or `list` but is {type(source_prompt)}") + + if source_negative_prompt is not None and source_negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `source_negative_prompt`: {source_negative_prompt} and `source_negative_prompt_embeds`:" + f" {source_negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if source_prompt_embeds is not None and source_negative_prompt_embeds is not None: + if source_prompt_embeds.shape != source_negative_prompt_embeds.shape: + raise ValueError( + "`source_prompt_embeds` and `source_negative_prompt_embeds` must have the same shape when passed" + f" directly, but got: `source_prompt_embeds` {source_prompt_embeds.shape} !=" + f" `source_negative_prompt_embeds` {source_negative_prompt_embeds.shape}." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def get_inverse_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + + # safety for t_start overflow to prevent empty timsteps slice + if t_start == 0: + return self.inverse_scheduler.timesteps, num_inference_steps + timesteps = self.inverse_scheduler.timesteps[:-t_start] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_image_latents(self, image, batch_size, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + if image.shape[1] == 4: + latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if isinstance(generator, list): + latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0) + else: + latents = self.vae.encode(image).latent_dist.sample(generator) + + latents = self.vae.config.scaling_factor * latents + + if batch_size != latents.shape[0]: + if batch_size % latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. 
Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_latents_per_image = batch_size // latents.shape[0] + latents = torch.cat([latents] * additional_latents_per_image, dim=0) + else: + raise ValueError( + f"Cannot duplicate `image` of batch size {latents.shape[0]} to {batch_size} text prompts." + ) + else: + latents = torch.cat([latents], dim=0) + + return latents + + def get_epsilon(self, model_output: torch.Tensor, sample: torch.Tensor, timestep: int): + pred_type = self.inverse_scheduler.config.prediction_type + alpha_prod_t = self.inverse_scheduler.alphas_cumprod[timestep] + + beta_prod_t = 1 - alpha_prod_t + + if pred_type == "epsilon": + return model_output + elif pred_type == "sample": + return (sample - alpha_prod_t ** (0.5) * model_output) / beta_prod_t ** (0.5) + elif pred_type == "v_prediction": + return (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {pred_type} must be one of `epsilon`, `sample`, or `v_prediction`" + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def generate_mask( + self, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + target_prompt: Optional[Union[str, List[str]]] = None, + target_negative_prompt: Optional[Union[str, List[str]]] = None, + target_prompt_embeds: Optional[torch.FloatTensor] = None, + target_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + source_prompt: Optional[Union[str, List[str]]] = None, + source_negative_prompt: Optional[Union[str, List[str]]] = None, + source_prompt_embeds: Optional[torch.FloatTensor] = None, + source_negative_prompt_embeds: Optional[torch.FloatTensor] = None, + num_maps_per_mask: Optional[int] = 10, + mask_encode_strength: Optional[float] = 0.5, + mask_thresholding_ratio: Optional[float] = 3.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "np", + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Generate a latent mask given a mask prompt, a target prompt, and an image. + + Args: + image (`PIL.Image.Image`): + `Image` or tensor representing an image batch to be used for computing the mask. + target_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide semantic mask generation. If not defined, you need to pass + `prompt_embeds`. + target_negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + target_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + target_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + source_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide semantic mask generation using DiffEdit. 
If not defined, you need to + pass `source_prompt_embeds` or `source_image` instead. + source_negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide semantic mask generation away from using DiffEdit. If not defined, you + need to pass `source_negative_prompt_embeds` or `source_image` instead. + source_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings to guide the semantic mask generation. Can be used to easily tweak text + inputs (prompt weighting). If not provided, text embeddings are generated from `source_prompt` input + argument. + source_negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings to negatively guide the semantic mask generation. Can be used to easily + tweak text inputs (prompt weighting). If not provided, text embeddings are generated from + `source_negative_prompt` input argument. + num_maps_per_mask (`int`, *optional*, defaults to 10): + The number of noise maps sampled to generate the semantic mask using DiffEdit. + mask_encode_strength (`float`, *optional*, defaults to 0.5): + The strength of the noise maps sampled to generate the semantic mask using DiffEdit. Must be between 0 + and 1. + mask_thresholding_ratio (`float`, *optional*, defaults to 3.0): + The maximum multiple of the mean absolute difference used to clamp the semantic guidance map before + mask binarization. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the + [`~models.attention_processor.AttnProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Examples: + + Returns: + `List[PIL.Image.Image]` or `np.array`: + When returning a `List[PIL.Image.Image]`, the list consists of a batch of single-channel binary images + with dimensions `(height // self.vae_scale_factor, width // self.vae_scale_factor)`. If it's + `np.array`, the shape is `(batch_size, height // self.vae_scale_factor, width // + self.vae_scale_factor)`. + """ + + # 1. Check inputs (Provide dummy argument for callback_steps) + self.check_inputs( + target_prompt, + mask_encode_strength, + 1, + target_negative_prompt, + target_prompt_embeds, + target_negative_prompt_embeds, + ) + + self.check_source_inputs( + source_prompt, + source_negative_prompt, + source_prompt_embeds, + source_negative_prompt_embeds, + ) + + if (num_maps_per_mask is None) or ( + num_maps_per_mask is not None and (not isinstance(num_maps_per_mask, int) or num_maps_per_mask <= 0) + ): + raise ValueError( + f"`num_maps_per_mask` has to be a positive integer but is {num_maps_per_mask} of type" + f" {type(num_maps_per_mask)}." 
+ ) + + if mask_thresholding_ratio is None or mask_thresholding_ratio <= 0: + raise ValueError( + f"`mask_thresholding_ratio` has to be positive but is {mask_thresholding_ratio} of type" + f" {type(mask_thresholding_ratio)}." + ) + + # 2. Define call parameters + if target_prompt is not None and isinstance(target_prompt, str): + batch_size = 1 + elif target_prompt is not None and isinstance(target_prompt, list): + batch_size = len(target_prompt) + else: + batch_size = target_prompt_embeds.shape[0] + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompts + (cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None) + target_negative_prompt_embeds, target_prompt_embeds = self.encode_prompt( + target_prompt, + device, + num_maps_per_mask, + do_classifier_free_guidance, + target_negative_prompt, + prompt_embeds=target_prompt_embeds, + negative_prompt_embeds=target_negative_prompt_embeds, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + target_prompt_embeds = torch.cat([target_negative_prompt_embeds, target_prompt_embeds]) + + source_negative_prompt_embeds, source_prompt_embeds = self.encode_prompt( + source_prompt, + device, + num_maps_per_mask, + do_classifier_free_guidance, + source_negative_prompt, + prompt_embeds=source_prompt_embeds, + negative_prompt_embeds=source_negative_prompt_embeds, + ) + if do_classifier_free_guidance: + source_prompt_embeds = torch.cat([source_negative_prompt_embeds, source_prompt_embeds]) + + # 4. Preprocess image + image = self.image_processor.preprocess(image).repeat_interleave(num_maps_per_mask, dim=0) + + # 5. Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, _ = self.get_timesteps(num_inference_steps, mask_encode_strength, device) + encode_timestep = timesteps[0] + + # 6. Prepare image latents and add noise with specified strength + image_latents = self.prepare_image_latents( + image, batch_size * num_maps_per_mask, self.vae.dtype, device, generator + ) + noise = randn_tensor(image_latents.shape, generator=generator, device=device, dtype=self.vae.dtype) + image_latents = self.scheduler.add_noise(image_latents, noise, encode_timestep) + + latent_model_input = torch.cat([image_latents] * (4 if do_classifier_free_guidance else 2)) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, encode_timestep) + + # 7. 
Predict the noise residual + prompt_embeds = torch.cat([source_prompt_embeds, target_prompt_embeds]) + noise_pred = self.unet( + latent_model_input, + encode_timestep, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + if do_classifier_free_guidance: + noise_pred_neg_src, noise_pred_source, noise_pred_uncond, noise_pred_target = noise_pred.chunk(4) + noise_pred_source = noise_pred_neg_src + guidance_scale * (noise_pred_source - noise_pred_neg_src) + noise_pred_target = noise_pred_uncond + guidance_scale * (noise_pred_target - noise_pred_uncond) + else: + noise_pred_source, noise_pred_target = noise_pred.chunk(2) + + # 8. Compute the mask from the absolute difference of predicted noise residuals + # TODO: Consider smoothing mask guidance map + mask_guidance_map = ( + torch.abs(noise_pred_target - noise_pred_source) + .reshape(batch_size, num_maps_per_mask, *noise_pred_target.shape[-3:]) + .mean([1, 2]) + ) + clamp_magnitude = mask_guidance_map.mean() * mask_thresholding_ratio + semantic_mask_image = mask_guidance_map.clamp(0, clamp_magnitude) / clamp_magnitude + semantic_mask_image = torch.where(semantic_mask_image <= 0.5, 0, 1) + mask_image = semantic_mask_image.cpu().numpy() + + # 9. Convert to Numpy array or PIL. + if output_type == "pil": + mask_image = self.image_processor.numpy_to_pil(mask_image) + + # Offload all models + self.maybe_free_model_hooks() + + return mask_image + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INVERT_DOC_STRING) + def invert( + self, + prompt: Optional[Union[str, List[str]]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + num_inference_steps: int = 50, + inpaint_strength: float = 0.8, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + decode_latents: bool = False, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + lambda_auto_corr: float = 20.0, + lambda_kl: float = 20.0, + num_reg_steps: int = 0, + num_auto_corr_rolls: int = 5, + ): + r""" + Generate inverted latents given a prompt and image. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`PIL.Image.Image`): + `Image` or tensor representing an image batch to produce the inverted latents guided by `prompt`. + inpaint_strength (`float`, *optional*, defaults to 0.8): + Indicates extent of the noising process to run latent inversion. Must be between 0 and 1. When + `inpaint_strength` is 1, the inversion process is run for the full number of iterations specified in + `num_inference_steps`. `image` is used as a reference for the inversion process, and adding more noise + increases `inpaint_strength`. If `inpaint_strength` is 0, no inpainting occurs. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + decode_latents (`bool`, *optional*, defaults to `False`): + Whether or not to decode the inverted latents into a generated image. Setting this argument to `True` + decodes all inverted latents for each timestep into a list of generated images. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.DiffEditInversionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the + [`~models.attention_processor.AttnProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + lambda_auto_corr (`float`, *optional*, defaults to 20.0): + Lambda parameter to control auto correction. + lambda_kl (`float`, *optional*, defaults to 20.0): + Lambda parameter to control Kullback-Leibler divergence output. + num_reg_steps (`int`, *optional*, defaults to 0): + Number of regularization loss steps. + num_auto_corr_rolls (`int`, *optional*, defaults to 5): + Number of auto correction roll steps. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.pipeline_stable_diffusion_diffedit.DiffEditInversionPipelineOutput`] or + `tuple`: + If `return_dict` is `True`, + [`~pipelines.stable_diffusion.pipeline_stable_diffusion_diffedit.DiffEditInversionPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is the inverted latents tensors + ordered by increasing noise, and the second is the corresponding decoded images if `decode_latents` is + `True`, otherwise `None`. + """ + + # 1. 
Check inputs + self.check_inputs( + prompt, + inpaint_strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Preprocess image + image = self.image_processor.preprocess(image) + + # 4. Prepare latent variables + num_images_per_prompt = 1 + latents = self.prepare_image_latents( + image, batch_size * num_images_per_prompt, self.vae.dtype, device, generator + ) + + # 5. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 6. Prepare timesteps + self.inverse_scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_inverse_timesteps(num_inference_steps, inpaint_strength, device) + + # 7. Noising loop where we obtain the intermediate noised latent image for each timestep. 
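+        # Each iteration below runs one inverse-scheduler step (optionally regularized toward
+        # IID Gaussian noise via the auto-correlation and KL terms) and records the intermediate
+        # latents, so that `__call__` can later look up the latent matching each denoising timestep.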
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.inverse_scheduler.order + inverted_latents = [] + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.inverse_scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # regularization of the noise prediction (not in original code or paper but borrowed from Pix2PixZero) + if num_reg_steps > 0: + with torch.enable_grad(): + for _ in range(num_reg_steps): + if lambda_auto_corr > 0: + for _ in range(num_auto_corr_rolls): + var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True) + + # Derive epsilon from model output before regularizing to IID standard normal + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) + + l_ac = auto_corr_loss(var_epsilon, generator=generator) + l_ac.backward() + + grad = var.grad.detach() / num_auto_corr_rolls + noise_pred = noise_pred - lambda_auto_corr * grad + + if lambda_kl > 0: + var = torch.autograd.Variable(noise_pred.detach().clone(), requires_grad=True) + + # Derive epsilon from model output before regularizing to IID standard normal + var_epsilon = self.get_epsilon(var, latent_model_input.detach(), t) + + l_kld = kl_divergence(var_epsilon) + l_kld.backward() + + grad = var.grad.detach() + noise_pred = noise_pred - lambda_kl * grad + + noise_pred = noise_pred.detach() + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.inverse_scheduler.step(noise_pred, t, latents).prev_sample + inverted_latents.append(latents.detach().clone()) + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.inverse_scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + assert len(inverted_latents) == len(timesteps) + latents = torch.stack(list(reversed(inverted_latents)), 1) + + # 8. Post-processing + image = None + if decode_latents: + image = self.decode_latents(latents.flatten(0, 1)) + + # 9. Convert to PIL. 
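+        # At this point `latents` has shape (batch, len(timesteps), C, h, w), where C is the number
+        # of VAE latent channels and index 0 along the timestep axis holds the most-noised latents;
+        # `image` is only populated when `decode_latents=True`.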
+ if decode_latents and output_type == "pil": + image = self.image_processor.numpy_to_pil(image) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (latents, image) + + return DiffEditInversionPipelineOutput(latents=latents, images=image) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image_latents: Union[torch.FloatTensor, PIL.Image.Image] = None, + inpaint_strength: Optional[float] = 0.8, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_ckip: int = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + mask_image (`PIL.Image.Image`): + `Image` or tensor representing an image batch to mask the generated image. White pixels in the mask are + repainted, while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a + single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, 1, H, W)`. + image_latents (`PIL.Image.Image` or `torch.FloatTensor`): + Partially noised image latents from the inversion process to be used as inputs for image generation. + inpaint_strength (`float`, *optional*, defaults to 0.8): + Indicates extent to inpaint the masked area. Must be between 0 and 1. When `inpaint_strength` is 1, the + denoising process is run on the masked area for the full number of iterations specified in + `num_inference_steps`. `image_latents` is used as a reference for the masked area, and adding more + noise to a region increases `inpaint_strength`. If `inpaint_strength` is 0, no inpainting occurs. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
+ generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + # 1. Check inputs + self.check_inputs( + prompt, + inpaint_strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + if mask_image is None: + raise ValueError( + "`mask_image` input cannot be undefined. Use `generate_mask()` to compute `mask_image` from text prompts." + ) + if image_latents is None: + raise ValueError( + "`image_latents` input cannot be undefined. Use `invert()` to compute `image_latents` from input images." + ) + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_ckip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Preprocess mask + mask_image = preprocess_mask(mask_image, batch_size) + latent_height, latent_width = mask_image.shape[-2:] + mask_image = torch.cat([mask_image] * num_images_per_prompt) + mask_image = mask_image.to(device=device, dtype=prompt_embeds.dtype) + + # 5. Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, inpaint_strength, device) + + # 6. Preprocess image latents + if isinstance(image_latents, list) and any(isinstance(l, torch.Tensor) and l.ndim == 5 for l in image_latents): + image_latents = torch.cat(image_latents).detach() + elif isinstance(image_latents, torch.Tensor) and image_latents.ndim == 5: + image_latents = image_latents.detach() + else: + image_latents = self.image_processor.preprocess(image_latents).detach() + + latent_shape = (self.vae.config.latent_channels, latent_height, latent_width) + if image_latents.shape[-3:] != latent_shape: + raise ValueError( + f"Each latent image in `image_latents` must have shape {latent_shape}, " + f"but has shape {image_latents.shape[-3:]}" + ) + if image_latents.ndim == 4: + image_latents = image_latents.reshape(batch_size, len(timesteps), *latent_shape) + if image_latents.shape[:2] != (batch_size, len(timesteps)): + raise ValueError( + f"`image_latents` must have batch size {batch_size} with latent images from {len(timesteps)}" + f" timesteps, but has batch size {image_latents.shape[0]} with latent images from" + f" {image_latents.shape[1]} timesteps." + ) + image_latents = image_latents.transpose(0, 1).repeat_interleave(num_images_per_prompt, dim=1) + image_latents = image_latents.to(device=device, dtype=prompt_embeds.dtype) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. 
Denoising loop + latents = image_latents[0].clone() + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # mask with inverted latents from appropriate timestep - use original image latent for last step + latents = latents * mask_image + image_latents[i] * (1 - mask_image) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py new file mode 100644 index 000000000..147980cbf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"] + _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not 
(is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline + from .pipeline_stable_diffusion_gligen_text_image import StableDiffusionGLIGENTextImagePipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py new file mode 100644 index 000000000..9f0d1190f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py @@ -0,0 +1,845 @@ +# Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import warnings +from typing import Any, Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention import GatedSelfAttentionDense +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionGLIGENPipeline + >>> from diffusers.utils import load_image + + >>> # Insert objects described by text at the region defined by bounding boxes + >>> pipe = StableDiffusionGLIGENPipeline.from_pretrained( + ... "masterful/gligen-1-4-inpainting-text-box", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> input_image = load_image( + ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png" + ... 
) + >>> prompt = "a birthday cake" + >>> boxes = [[0.2676, 0.6088, 0.4773, 0.7183]] + >>> phrases = ["a birthday cake"] + + >>> images = pipe( + ... prompt=prompt, + ... gligen_phrases=phrases, + ... gligen_inpaint_image=input_image, + ... gligen_boxes=boxes, + ... gligen_scheduled_sampling_beta=1, + ... output_type="pil", + ... num_inference_steps=50, + ... ).images + + >>> images[0].save("./gligen-1-4-inpainting-text-box.jpg") + + >>> # Generate an image described by the prompt and + >>> # insert objects described by text at the region defined by bounding boxes + >>> pipe = StableDiffusionGLIGENPipeline.from_pretrained( + ... "masterful/gligen-1-4-generation-text-box", variant="fp16", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a waterfall and a modern high speed train running through the tunnel in a beautiful forest with fall foliage" + >>> boxes = [[0.1387, 0.2051, 0.4277, 0.7090], [0.4980, 0.4355, 0.8516, 0.7266]] + >>> phrases = ["a waterfall", "a modern high speed train running through the tunnel"] + + >>> images = pipe( + ... prompt=prompt, + ... gligen_phrases=phrases, + ... gligen_boxes=boxes, + ... gligen_scheduled_sampling_beta=1, + ... output_type="pil", + ... num_inference_steps=50, + ... ).images + + >>> images[0].save("./gligen-1-4-generation-text-box.jpg") + ``` +""" + + +class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN). + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ """ + + _optional_components = ["safety_checker", "feature_extractor"] + model_cpu_offload_seq = "text_encoder->unet->vae" + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + gligen_phrases, + gligen_boxes, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if len(gligen_phrases) != len(gligen_boxes): + ValueError( + "length of `gligen_phrases` and `gligen_boxes` has to be same, but" + f" got: `gligen_phrases` {len(gligen_phrases)} != `gligen_boxes` {len(gligen_boxes)}" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def enable_fuser(self, enabled=True): + for module in self.unet.modules(): + if type(module) is GatedSelfAttentionDense: + module.enabled = enabled + + def draw_inpaint_mask_from_boxes(self, boxes, size): + inpaint_mask = torch.ones(size[0], size[1]) + for box in boxes: + x0, x1 = box[0] * size[0], box[2] * size[0] + y0, y1 = box[1] * size[1], box[3] * size[1] + inpaint_mask[int(y0) : int(y1), int(x0) : int(x1)] = 0 + return inpaint_mask + + def crop(self, im, new_width, new_height): + width, height = im.size + left = (width - new_width) / 2 + top = (height - new_height) / 2 + right = (width + new_width) / 2 + bottom = (height + new_height) / 2 + return im.crop((left, top, right, bottom)) + + def target_size_center_crop(self, im, new_hw): + width, height = im.size + if width != height: + im = self.crop(im, min(height, width), min(height, width)) + return im.resize((new_hw, new_hw), PIL.Image.LANCZOS) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + gligen_scheduled_sampling_beta: float = 0.3, + gligen_phrases: List[str] = None, + gligen_boxes: List[List[float]] = None, + gligen_inpaint_image: Optional[PIL.Image.Image] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + gligen_phrases (`List[str]`): + The phrases to guide what to include in each of the regions defined by the corresponding + `gligen_boxes`. There should only be one phrase per bounding box. 
+ gligen_boxes (`List[List[float]]`): + The bounding boxes that identify rectangular regions of the image that are going to be filled with the + content described by the corresponding `gligen_phrases`. Each rectangular box is defined as a + `List[float]` of 4 elements `[xmin, ymin, xmax, ymax]` where each value is between [0,1]. + gligen_inpaint_image (`PIL.Image.Image`, *optional*): + The input image, if provided, is inpainted with objects described by the `gligen_boxes` and + `gligen_phrases`. Otherwise, it is treated as a generation task on a blank input image. + gligen_scheduled_sampling_beta (`float`, defaults to 0.3): + Scheduled Sampling factor from [GLIGEN: Open-Set Grounded Text-to-Image + Generation](https://arxiv.org/pdf/2301.07093.pdf). Scheduled Sampling factor is only varied for + scheduled sampling during inference for improved quality and controllability. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
+ guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when + using zero terminal SNR. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + gligen_phrases, + gligen_boxes, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 5.1 Prepare GLIGEN variables + max_objs = 30 + if len(gligen_boxes) > max_objs: + warnings.warn( + f"More that {max_objs} objects found. 
Only the first {max_objs} objects will be processed.", + FutureWarning, + ) + gligen_phrases = gligen_phrases[:max_objs] + gligen_boxes = gligen_boxes[:max_objs] + # prepare batched input to the GLIGENTextBoundingboxProjection (boxes, phrases, mask) + # Get tokens for phrases from pre-trained CLIPTokenizer + tokenizer_inputs = self.tokenizer(gligen_phrases, padding=True, return_tensors="pt").to(device) + # For these tokens, we use the same pre-trained text encoder + # to obtain their text features + _text_embeddings = self.text_encoder(**tokenizer_inputs).pooler_output + n_objs = len(gligen_boxes) + # Each entity described in the phrases is denoted by a bounding box, + # and we represent the location information as (xmin, ymin, xmax, ymax) + boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype) + boxes[:n_objs] = torch.tensor(gligen_boxes) + text_embeddings = torch.zeros( + max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype + ) + text_embeddings[:n_objs] = _text_embeddings + # Generate a mask for each object (entity) described by the phrases + masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + masks[:n_objs] = 1 + + repeat_batch = batch_size * num_images_per_prompt + boxes = boxes.unsqueeze(0).expand(repeat_batch, -1, -1).clone() + text_embeddings = text_embeddings.unsqueeze(0).expand(repeat_batch, -1, -1).clone() + masks = masks.unsqueeze(0).expand(repeat_batch, -1).clone() + if do_classifier_free_guidance: + repeat_batch = repeat_batch * 2 + boxes = torch.cat([boxes] * 2) + text_embeddings = torch.cat([text_embeddings] * 2) + masks = torch.cat([masks] * 2) + masks[: repeat_batch // 2] = 0 + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + cross_attention_kwargs["gligen"] = {"boxes": boxes, "positive_embeddings": text_embeddings, "masks": masks} + + # Prepare latent variables for GLIGEN inpainting + if gligen_inpaint_image is not None: + # If the given input image is not of the size expected by the VAE, + # center crop and resize it to the expected shape + if gligen_inpaint_image.size != (self.vae.sample_size, self.vae.sample_size): + gligen_inpaint_image = self.target_size_center_crop(gligen_inpaint_image, self.vae.sample_size) + # Convert a single image into a batch of images with a batch size of 1 + # The resulting shape becomes (1, C, H, W), where C is the number of channels, + # and H and W are the height and width of the image.
+ # scales the pixel values to a range [-1, 1] + gligen_inpaint_image = self.image_processor.preprocess(gligen_inpaint_image) + gligen_inpaint_image = gligen_inpaint_image.to(dtype=self.vae.dtype, device=self.vae.device) + # Run AutoEncoder to get corresponding latents + gligen_inpaint_latent = self.vae.encode(gligen_inpaint_image).latent_dist.sample() + gligen_inpaint_latent = self.vae.config.scaling_factor * gligen_inpaint_latent + # Generate an inpainting mask + # pixel value = 0, where the object is present (defined by bounding boxes above) + # 1, everywhere else + gligen_inpaint_mask = self.draw_inpaint_mask_from_boxes(gligen_boxes, gligen_inpaint_latent.shape[2:]) + gligen_inpaint_mask = gligen_inpaint_mask.to( + dtype=gligen_inpaint_latent.dtype, device=gligen_inpaint_latent.device + ) + gligen_inpaint_mask = gligen_inpaint_mask[None, None] + gligen_inpaint_mask_addition = torch.cat( + (gligen_inpaint_latent * gligen_inpaint_mask, gligen_inpaint_mask), dim=1 + ) + # Convert a single mask into a batch of masks with a batch size of 1 + gligen_inpaint_mask_addition = gligen_inpaint_mask_addition.expand(repeat_batch, -1, -1, -1).clone() + + num_grounding_steps = int(gligen_scheduled_sampling_beta * len(timesteps)) + self.enable_fuser(True) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Scheduled sampling + if i == num_grounding_steps: + self.enable_fuser(False) + + if latents.shape[1] != 4: + latents = torch.randn_like(latents[:, :4]) + + if gligen_inpaint_image is not None: + gligen_inpaint_latent_with_noise = ( + self.scheduler.add_noise( + gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), torch.tensor([t]) + ) + .expand(latents.shape[0], -1, -1, -1) + .clone() + ) + latents = gligen_inpaint_latent_with_noise * gligen_inpaint_mask + latents * ( + 1 - gligen_inpaint_mask + ) + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if gligen_inpaint_image is not None: + latent_model_input = torch.cat((latent_model_input, gligen_inpaint_mask_addition), dim=1) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, 
device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py new file mode 100644 index 000000000..296ecae65 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py @@ -0,0 +1,1017 @@ +# Copyright 2024 The GLIGEN Authors and HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import warnings +from typing import Any, Callable, Dict, List, Optional, Union + +import PIL.Image +import torch +from transformers import ( + CLIPFeatureExtractor, + CLIPProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention import GatedSelfAttentionDense +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import USE_PEFT_BACKEND, logging, replace_example_docstring, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.clip_image_project_model import CLIPImageProjection +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionGLIGENTextImagePipeline + >>> from diffusers.utils import load_image + + >>> # Insert objects described by image at the region defined by bounding boxes + >>> pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained( + ... "anhnct/Gligen_Inpainting_Text_Image", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> input_image = load_image( + ... "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/livingroom_modern.png" + ... 
) + >>> prompt = "a backpack" + >>> boxes = [[0.2676, 0.4088, 0.4773, 0.7183]] + >>> phrases = None + >>> gligen_image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/backpack.jpeg" + ... ) + + >>> images = pipe( + ... prompt=prompt, + ... gligen_phrases=phrases, + ... gligen_inpaint_image=input_image, + ... gligen_boxes=boxes, + ... gligen_images=[gligen_image], + ... gligen_scheduled_sampling_beta=1, + ... output_type="pil", + ... num_inference_steps=50, + ... ).images + + >>> images[0].save("./gligen-inpainting-text-image-box.jpg") + + >>> # Generate an image described by the prompt and + >>> # insert objects described by text and image at the region defined by bounding boxes + >>> pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained( + ... "anhnct/Gligen_Text_Image", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a flower sitting on the beach" + >>> boxes = [[0.0, 0.09, 0.53, 0.76]] + >>> phrases = ["flower"] + >>> gligen_image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/gligen/pexels-pixabay-60597.jpg" + ... ) + + >>> images = pipe( + ... prompt=prompt, + ... gligen_phrases=phrases, + ... gligen_images=[gligen_image], + ... gligen_boxes=boxes, + ... gligen_scheduled_sampling_beta=1, + ... output_type="pil", + ... num_inference_steps=50, + ... ).images + + >>> images[0].save("./gligen-generation-text-image-box.jpg") + + >>> # Generate an image described by the prompt and + >>> # transfer style described by image at the region defined by bounding boxes + >>> pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained( + ... "anhnct/Gligen_Text_Image", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a dragon flying on the sky" + >>> boxes = [[0.4, 0.2, 1.0, 0.8], [0.0, 1.0, 0.0, 1.0]] # Set `[0.0, 1.0, 0.0, 1.0]` for the style + + >>> gligen_image = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png" + ... ) + + >>> gligen_placeholder = load_image( + ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png" + ... ) + + >>> images = pipe( + ... prompt=prompt, + ... gligen_phrases=[ + ... "dragon", + ... "placeholder", + ... ], # Can use any text instead of `placeholder` token, because we will use mask here + ... gligen_images=[ + ... gligen_placeholder, + ... gligen_image, + ... ], # Can use any image in gligen_placeholder, because we will use mask here + ... input_phrases_mask=[1, 0], # Set 0 for the placeholder token + ... input_images_mask=[0, 1], # Set 0 for the placeholder image + ... gligen_boxes=boxes, + ... gligen_scheduled_sampling_beta=1, + ... output_type="pil", + ... num_inference_steps=50, + ... ).images + + >>> images[0].save("./gligen-generation-text-image-box-style-transfer.jpg") + ``` +""" + + +class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion with Grounded-Language-to-Image Generation (GLIGEN). + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.). 
+ + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + processor ([`~transformers.CLIPProcessor`]): + A `CLIPProcessor` to procces reference image. + image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): + Frozen image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + image_project ([`CLIPImageProjection`]): + A `CLIPImageProjection` to project image embedding into phrases embedding space. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + processor: CLIPProcessor, + image_encoder: CLIPVisionModelWithProjection, + image_project: CLIPImageProjection, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + image_encoder=image_encoder, + processor=processor, + image_project=image_project, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def enable_fuser(self, enabled=True): + for module in self.unet.modules(): + if type(module) is GatedSelfAttentionDense: + module.enabled = enabled + + def draw_inpaint_mask_from_boxes(self, boxes, size): + """ + Create an inpainting mask based on given boxes. This function generates an inpainting mask using the provided + boxes to mark regions that need to be inpainted. + """ + inpaint_mask = torch.ones(size[0], size[1]) + for box in boxes: + x0, x1 = box[0] * size[0], box[2] * size[0] + y0, y1 = box[1] * size[1], box[3] * size[1] + inpaint_mask[int(y0) : int(y1), int(x0) : int(x1)] = 0 + return inpaint_mask + + def crop(self, im, new_width, new_height): + """ + Crop the input image to the specified dimensions. + """ + width, height = im.size + left = (width - new_width) / 2 + top = (height - new_height) / 2 + right = (width + new_width) / 2 + bottom = (height + new_height) / 2 + return im.crop((left, top, right, bottom)) + + def target_size_center_crop(self, im, new_hw): + """ + Crop and resize the image to the target size while keeping the center. + """ + width, height = im.size + if width != height: + im = self.crop(im, min(height, width), min(height, width)) + return im.resize((new_hw, new_hw), PIL.Image.LANCZOS) + + def complete_mask(self, has_mask, max_objs, device): + """ + Based on the input mask corresponding value `0 or 1` for each phrases and image, mask the features + corresponding to phrases and images. + """ + mask = torch.ones(1, max_objs).type(self.text_encoder.dtype).to(device) + if has_mask is None: + return mask + + if isinstance(has_mask, int): + return mask * has_mask + else: + for idx, value in enumerate(has_mask): + mask[0, idx] = value + return mask + + def get_clip_feature(self, input, normalize_constant, device, is_image=False): + """ + Get image and phrases embedding by using CLIP pretrain model. The image embedding is transformed into the + phrases embedding space through a projection. 
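A minimal sketch, with a stand-in 768-dimensional embedding, of the renormalization applied to projected image features so that image grounding tokens end up on a scale comparable to the pooled phrase features:
```py
import torch

normalize_constant = 28.7     # matches the default `gligen_normalize_constant` below
feature = torch.randn(768)    # stand-in for a projected CLIP image embedding
feature = feature / feature.norm() * normalize_constant
print(float(feature.norm()))  # ~28.7
```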
+ """ + if is_image: + if input is None: + return None + inputs = self.processor(images=[input], return_tensors="pt").to(device) + inputs["pixel_values"] = inputs["pixel_values"].to(self.image_encoder.dtype) + + outputs = self.image_encoder(**inputs) + feature = outputs.image_embeds + feature = self.image_project(feature).squeeze(0) + feature = (feature / feature.norm()) * normalize_constant + feature = feature.unsqueeze(0) + else: + if input is None: + return None + inputs = self.tokenizer(input, return_tensors="pt", padding=True).to(device) + outputs = self.text_encoder(**inputs) + feature = outputs.pooler_output + return feature + + def get_cross_attention_kwargs_with_grounded( + self, + hidden_size, + gligen_phrases, + gligen_images, + gligen_boxes, + input_phrases_mask, + input_images_mask, + repeat_batch, + normalize_constant, + max_objs, + device, + ): + """ + Prepare the cross-attention kwargs containing information about the grounded input (boxes, mask, image + embedding, phrases embedding). + """ + phrases, images = gligen_phrases, gligen_images + images = [None] * len(phrases) if images is None else images + phrases = [None] * len(images) if phrases is None else phrases + + boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype) + masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + phrases_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + image_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + phrases_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype) + image_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype) + + text_features = [] + image_features = [] + for phrase, image in zip(phrases, images): + text_features.append(self.get_clip_feature(phrase, normalize_constant, device, is_image=False)) + image_features.append(self.get_clip_feature(image, normalize_constant, device, is_image=True)) + + for idx, (box, text_feature, image_feature) in enumerate(zip(gligen_boxes, text_features, image_features)): + boxes[idx] = torch.tensor(box) + masks[idx] = 1 + if text_feature is not None: + phrases_embeddings[idx] = text_feature + phrases_masks[idx] = 1 + if image_feature is not None: + image_embeddings[idx] = image_feature + image_masks[idx] = 1 + + input_phrases_mask = self.complete_mask(input_phrases_mask, max_objs, device) + phrases_masks = phrases_masks.unsqueeze(0).repeat(repeat_batch, 1) * input_phrases_mask + input_images_mask = self.complete_mask(input_images_mask, max_objs, device) + image_masks = image_masks.unsqueeze(0).repeat(repeat_batch, 1) * input_images_mask + boxes = boxes.unsqueeze(0).repeat(repeat_batch, 1, 1) + masks = masks.unsqueeze(0).repeat(repeat_batch, 1) + phrases_embeddings = phrases_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1) + image_embeddings = image_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1) + + out = { + "boxes": boxes, + "masks": masks, + "phrases_masks": phrases_masks, + "image_masks": image_masks, + "phrases_embeddings": phrases_embeddings, + "image_embeddings": image_embeddings, + } + + return out + + def get_cross_attention_kwargs_without_grounded(self, hidden_size, repeat_batch, max_objs, device): + """ + Prepare the cross-attention kwargs without information about the grounded input (boxes, mask, image embedding, + phrases embedding) (All are zero tensor). 
+ """ + boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype) + masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + phrases_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + image_masks = torch.zeros(max_objs, device=device, dtype=self.text_encoder.dtype) + phrases_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype) + image_embeddings = torch.zeros(max_objs, hidden_size, device=device, dtype=self.text_encoder.dtype) + + out = { + "boxes": boxes.unsqueeze(0).repeat(repeat_batch, 1, 1), + "masks": masks.unsqueeze(0).repeat(repeat_batch, 1), + "phrases_masks": phrases_masks.unsqueeze(0).repeat(repeat_batch, 1), + "image_masks": image_masks.unsqueeze(0).repeat(repeat_batch, 1), + "phrases_embeddings": phrases_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1), + "image_embeddings": image_embeddings.unsqueeze(0).repeat(repeat_batch, 1, 1), + } + + return out + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + gligen_scheduled_sampling_beta: float = 0.3, + gligen_phrases: List[str] = None, + gligen_images: List[PIL.Image.Image] = None, + input_phrases_mask: Union[int, List[int]] = None, + input_images_mask: Union[int, List[int]] = None, + gligen_boxes: List[List[float]] = None, + gligen_inpaint_image: Optional[PIL.Image.Image] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + gligen_normalize_constant: float = 28.7, + clip_skip: int = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + gligen_phrases (`List[str]`): + The phrases to guide what to include in each of the regions defined by the corresponding + `gligen_boxes`. There should only be one phrase per bounding box. + gligen_images (`List[PIL.Image.Image]`): + The images to guide what to include in each of the regions defined by the corresponding `gligen_boxes`. 
+ There should only be one image per bounding box + input_phrases_mask (`int` or `List[int]`): + pre phrases mask input defined by the correspongding `input_phrases_mask` + input_images_mask (`int` or `List[int]`): + pre images mask input defined by the correspongding `input_images_mask` + gligen_boxes (`List[List[float]]`): + The bounding boxes that identify rectangular regions of the image that are going to be filled with the + content described by the corresponding `gligen_phrases`. Each rectangular box is defined as a + `List[float]` of 4 elements `[xmin, ymin, xmax, ymax]` where each value is between [0,1]. + gligen_inpaint_image (`PIL.Image.Image`, *optional*): + The input image, if provided, is inpainted with objects described by the `gligen_boxes` and + `gligen_phrases`. Otherwise, it is treated as a generation task on a blank input image. + gligen_scheduled_sampling_beta (`float`, defaults to 0.3): + Scheduled Sampling factor from [GLIGEN: Open-Set Grounded Text-to-Image + Generation](https://arxiv.org/pdf/2301.07093.pdf). Scheduled Sampling factor is only varied for + scheduled sampling during inference for improved quality and controllability. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + gligen_normalize_constant (`float`, *optional*, defaults to 28.7): + The normalize value of the image embedding. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 5.1 Prepare GLIGEN variables + max_objs = 30 + if len(gligen_boxes) > max_objs: + warnings.warn( + f"More that {max_objs} objects found. 
Only first {max_objs} objects will be processed.", + FutureWarning, + ) + gligen_phrases = gligen_phrases[:max_objs] + gligen_boxes = gligen_boxes[:max_objs] + gligen_images = gligen_images[:max_objs] + + repeat_batch = batch_size * num_images_per_prompt + + if do_classifier_free_guidance: + repeat_batch = repeat_batch * 2 + + if cross_attention_kwargs is None: + cross_attention_kwargs = {} + + hidden_size = prompt_embeds.shape[2] + + cross_attention_kwargs["gligen"] = self.get_cross_attention_kwargs_with_grounded( + hidden_size=hidden_size, + gligen_phrases=gligen_phrases, + gligen_images=gligen_images, + gligen_boxes=gligen_boxes, + input_phrases_mask=input_phrases_mask, + input_images_mask=input_images_mask, + repeat_batch=repeat_batch, + normalize_constant=gligen_normalize_constant, + max_objs=max_objs, + device=device, + ) + + cross_attention_kwargs_without_grounded = {} + cross_attention_kwargs_without_grounded["gligen"] = self.get_cross_attention_kwargs_without_grounded( + hidden_size=hidden_size, repeat_batch=repeat_batch, max_objs=max_objs, device=device + ) + + # Prepare latent variables for GLIGEN inpainting + if gligen_inpaint_image is not None: + # if the given input image is not of the same size as expected by VAE + # center crop and resize the input image to expected shape + if gligen_inpaint_image.size != (self.vae.sample_size, self.vae.sample_size): + gligen_inpaint_image = self.target_size_center_crop(gligen_inpaint_image, self.vae.sample_size) + # Convert a single image into a batch of images with a batch size of 1 + # The resulting shape becomes (1, C, H, W), where C is the number of channels, + # and H and W are the height and width of the image. + # scales the pixel values to a range [-1, 1] + gligen_inpaint_image = self.image_processor.preprocess(gligen_inpaint_image) + gligen_inpaint_image = gligen_inpaint_image.to(dtype=self.vae.dtype, device=self.vae.device) + # Run AutoEncoder to get corresponding latents + gligen_inpaint_latent = self.vae.encode(gligen_inpaint_image).latent_dist.sample() + gligen_inpaint_latent = self.vae.config.scaling_factor * gligen_inpaint_latent + # Generate an inpainting mask + # pixel value = 0, where the object is present (defined by bounding boxes above) + # 1, everywhere else + gligen_inpaint_mask = self.draw_inpaint_mask_from_boxes(gligen_boxes, gligen_inpaint_latent.shape[2:]) + gligen_inpaint_mask = gligen_inpaint_mask.to( + dtype=gligen_inpaint_latent.dtype, device=gligen_inpaint_latent.device + ) + gligen_inpaint_mask = gligen_inpaint_mask[None, None] + gligen_inpaint_mask_addition = torch.cat( + (gligen_inpaint_latent * gligen_inpaint_mask, gligen_inpaint_mask), dim=1 + ) + # Convert a single mask into a batch of masks with a batch size of 1 + gligen_inpaint_mask_addition = gligen_inpaint_mask_addition.expand(repeat_batch, -1, -1, -1).clone() + + int(gligen_scheduled_sampling_beta * len(timesteps)) + self.enable_fuser(True) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if latents.shape[1] != 4: + latents = torch.randn_like(latents[:, :4]) + + if gligen_inpaint_image is not None: + gligen_inpaint_latent_with_noise = ( + self.scheduler.add_noise( + gligen_inpaint_latent, torch.randn_like(gligen_inpaint_latent), torch.tensor([t]) + ) + .expand(latents.shape[0], -1, -1, -1) + .clone() + ) + latents = gligen_inpaint_latent_with_noise * gligen_inpaint_mask + latents * ( + 1 - gligen_inpaint_mask + ) + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if gligen_inpaint_image is not None: + latent_model_input = torch.cat((latent_model_input, gligen_inpaint_mask_addition), dim=1) + + # predict the noise residual with grounded information + noise_pred_with_grounding = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # predict the noise residual without grounded information + noise_pred_without_grounding = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs_without_grounded, + ).sample + + # perform guidance + if do_classifier_free_guidance: + # Using noise_pred_text from noise residual with grounded information and noise_pred_uncond from noise residual without grounded information + _, noise_pred_text = noise_pred_with_grounding.chunk(2) + noise_pred_uncond, _ = noise_pred_without_grounding.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + else: + noise_pred = noise_pred_with_grounding + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py new file mode 100644 index 000000000..7eb5bf8c2 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py @@ -0,0 +1,62 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_k_diffusion_available, + is_k_diffusion_version, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not ( + is_transformers_available() + and is_torch_available() + and is_k_diffusion_available() + and is_k_diffusion_version(">=", "0.0.12") + ): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_and_k_diffusion_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects)) +else: + _import_structure["pipeline_stable_diffusion_k_diffusion"] = ["StableDiffusionKDiffusionPipeline"] + _import_structure["pipeline_stable_diffusion_xl_k_diffusion"] = ["StableDiffusionXLKDiffusionPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not ( + is_transformers_available() + and is_torch_available() + and is_k_diffusion_available() + and is_k_diffusion_version(">=", "0.0.12") + ): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_and_k_diffusion_objects import * + else: + from .pipeline_stable_diffusion_k_diffusion import StableDiffusionKDiffusionPipeline + from .pipeline_stable_diffusion_xl_k_diffusion import StableDiffusionXLKDiffusionPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py new file mode 100644 index 000000000..bc565c938 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -0,0 +1,664 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
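The file added below defines `StableDiffusionKDiffusionPipeline`, which replaces the usual diffusers scheduler loop with k-diffusion samplers selected by name through `set_scheduler`. A minimal usage sketch, assuming the `k-diffusion` package is installed; the checkpoint id, sampler name, and step count are illustrative and not part of this patch:

```py
import torch
from diffusers import StableDiffusionKDiffusionPipeline

# Illustrative checkpoint; any Stable Diffusion v1/v2 style checkpoint should work here.
pipe = StableDiffusionKDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# Select any k-diffusion sampler exposed as `sample_*`; an unknown name raises a
# ValueError that lists the valid sampler names (see set_scheduler below).
pipe.set_scheduler("sample_dpmpp_2m")

image = pipe(
    "a photo of an astronaut riding a horse on mars",
    num_inference_steps=25,
    use_karras_sigmas=True,  # Karras sigma schedule, i.e. "DPM++ 2M Karras" in webui terms
).images[0]
```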
+ +import importlib +import inspect +from typing import Callable, List, Optional, Union + +import torch +from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser +from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import LMSDiscreteScheduler +from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class ModelWrapper: + def __init__(self, model, alphas_cumprod): + self.model = model + self.alphas_cumprod = alphas_cumprod + + def apply_model(self, *args, **kwargs): + if len(args) == 3: + encoder_hidden_states = args[-1] + args = args[:2] + if kwargs.get("cond", None) is not None: + encoder_hidden_states = kwargs.pop("cond") + return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample + + +class StableDiffusionKDiffusionPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin +): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + + + This is an experimental pipeline and is likely to change in the future. + + + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae, + text_encoder, + tokenizer, + unet, + scheduler, + safety_checker, + feature_extractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + logger.info( + f"{self.__class__} is an experimntal pipeline and is likely to change in the future. We recommend to use" + " this pipeline for fast experimentation / iteration if needed, but advice to rely on existing pipelines" + " as defined in https://huggingface.co/docs/diffusers/api/schedulers#implemented-schedulers for" + " production settings." + ) + + # get correct sigmas from LMS + scheduler = LMSDiscreteScheduler.from_config(scheduler.config) + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + model = ModelWrapper(unet, scheduler.alphas_cumprod) + if scheduler.config.prediction_type == "v_prediction": + self.k_diffusion_model = CompVisVDenoiser(model) + else: + self.k_diffusion_model = CompVisDenoiser(model) + + def set_scheduler(self, scheduler_type: str): + library = importlib.import_module("k_diffusion") + sampling = getattr(library, "sampling") + try: + self.sampler = getattr(sampling, scheduler_type) + except Exception: + valid_samplers = [] + for s in dir(sampling): + if "sample_" in s: + valid_samplers.append(s) + + raise ValueError(f"Invalid scheduler type {scheduler_type}. Please choose one of {valid_samplers}.") + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + return latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + use_karras_sigmas: Optional[bool] = False, + noise_sampler_seed: Optional[int] = None, + clip_skip: int = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` + is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Use karras sigmas. For example, specifying `sample_dpmpp_2m` to `set_scheduler` will be equivalent to + `DPM++2M` in stable-diffusion-webui. On top of that, setting this option to True will make it `DPM++2M + Karras`. + noise_sampler_seed (`int`, *optional*, defaults to `None`): + The random seed to use for the noise sampler. If `None`, a random seed will be generated. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = True + if guidance_scale <= 1.0: + raise ValueError("has to use guidance_scale") + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device) + + # 5. Prepare sigmas + if use_karras_sigmas: + sigma_min: float = self.k_diffusion_model.sigmas[0].item() + sigma_max: float = self.k_diffusion_model.sigmas[-1].item() + sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max) + sigmas = sigmas.to(device) + else: + sigmas = self.scheduler.sigmas + sigmas = sigmas.to(prompt_embeds.dtype) + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + latents = latents * sigmas[0] + self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device) + self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device) + + # 7. Define model function + def model_fn(x, t): + latent_model_input = torch.cat([x] * 2) + t = torch.cat([t] * 2) + + noise_pred = self.k_diffusion_model(latent_model_input, t, cond=prompt_embeds) + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + return noise_pred + + # 8. 
Run k-diffusion solver + sampler_kwargs = {} + + if "noise_sampler" in inspect.signature(self.sampler).parameters: + min_sigma, max_sigma = sigmas[sigmas > 0].min(), sigmas.max() + noise_sampler = BrownianTreeNoiseSampler(latents, min_sigma, max_sigma, noise_sampler_seed) + sampler_kwargs["noise_sampler"] = noise_sampler + + if "generator" in inspect.signature(self.sampler).parameters: + sampler_kwargs["generator"] = generator + + latents = self.sampler(model_fn, latents, sigmas, **sampler_kwargs) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py new file mode 100644 index 000000000..ed46a1e36 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py @@ -0,0 +1,891 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
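Before the XL variant below, a note on the sampler contract used in the solver step above: a `k_diffusion.sampling.sample_*` function receives a denoiser callable `model(x, sigma)` plus a descending sigma schedule and returns the final latents, which is how `model_fn`, `latents`, and `sigmas` are passed in step 8. A self-contained sketch with a toy denoiser (the denoiser formula and sigma range are made up for illustration; assumes `k-diffusion` is installed):

```py
import torch
from k_diffusion import sampling

def toy_denoiser(x, sigma):
    # Stand-in for `model_fn` above: return a "denoised" estimate for noise level `sigma`.
    return x / (1.0 + sigma[:, None, None, None] ** 2) ** 0.5

sigmas = sampling.get_sigmas_karras(n=20, sigma_min=0.03, sigma_max=14.6)  # descending, ends at 0
x = torch.randn(1, 4, 64, 64) * sigmas[0]       # start from noise scaled to the largest sigma
sampler = getattr(sampling, "sample_dpmpp_2m")  # the same lookup that set_scheduler() performs
latents = sampler(toy_denoiser, x, sigmas)      # latents at sigma == 0
```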
+ +import importlib +import inspect +from typing import List, Optional, Tuple, Union + +import torch +from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser +from k_diffusion.sampling import BrownianTreeNoiseSampler, get_sigmas_karras +from transformers import ( + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, +) + +from ...image_processor import VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + FusedAttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers, LMSDiscreteScheduler +from ...utils import ( + USE_PEFT_BACKEND, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLKDiffusionPipeline + + >>> pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + >>> pipe.set_scheduler("sample_dpmpp_2m_sde") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.ModelWrapper +class ModelWrapper: + def __init__(self, model, alphas_cumprod): + self.model = model + self.alphas_cumprod = alphas_cumprod + + def apply_model(self, *args, **kwargs): + if len(args) == 3: + encoder_hidden_states = args[-1] + args = args[:2] + if kwargs.get("cond", None) is not None: + encoder_hidden_states = kwargs.pop("cond") + return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample + + +class StableDiffusionXLKDiffusionPipeline( + DiffusionPipeline, + StableDiffusionMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL and k-diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. 
Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "feature_extractor", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + ): + super().__init__() + + # get correct sigmas from LMS + scheduler = LMSDiscreteScheduler.from_config(scheduler.config) + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = self.unet.config.sample_size + + model = ModelWrapper(unet, scheduler.alphas_cumprod) + if scheduler.config.prediction_type == "v_prediction": + self.k_diffusion_model = CompVisVDenoiser(model) + else: + self.k_diffusion_model = CompVisDenoiser(model) + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.set_scheduler + def set_scheduler(self, scheduler_type: str): + library = importlib.import_module("k_diffusion") + sampling = getattr(library, "sampling") + try: + self.sampler = getattr(sampling, scheduler_type) + except Exception: + valid_samplers = [] + for s in dir(sampling): + if "sample_" in s: + valid_samplers.append(s) + + raise ValueError(f"Invalid scheduler type {scheduler_type}. 
Please choose one of {valid_samplers}.") + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
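For orientation, `encode_prompt` returns a four-tuple of `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)`. A hedged usage sketch, assuming the SDXL base checkpoint from the example docstring and a CUDA device; the shape comments reflect the 768 + 1280 concatenation of the two text encoders:

```py
import torch
from diffusers import StableDiffusionXLKDiffusionPipeline

pipe = StableDiffusionXLKDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

(prompt_embeds, negative_prompt_embeds,
 pooled_prompt_embeds, negative_pooled_prompt_embeds) = pipe.encode_prompt(
    prompt="a photo of an astronaut riding a horse on mars",
    device=torch.device("cuda"),
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
)

print(prompt_embeds.shape)         # expected (1, 77, 2048): 768 + 1280 hidden sizes concatenated
print(pooled_prompt_embeds.shape)  # expected (1, 1280): pooled output of text_encoder_2
```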
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
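A small, self-contained illustration of the indexing used in the line that follows: the list of labels stands in for the tuple of hidden states returned with `output_hidden_states=True`, the default takes the penultimate entry, and `clip_skip` shifts the index `clip_skip` layers further back via the `+ 2` offset.

```py
# toy stand-in for text_encoder(..., output_hidden_states=True).hidden_states
hidden_states = ["embeddings"] + [f"layer_{i}" for i in range(1, 13)]

default = hidden_states[-2]                # 'layer_11', the penultimate layer
clip_skip = 1
skipped = hidden_states[-(clip_skip + 2)]  # 'layer_10', one layer further back
print(default, skipped)
```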
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." 
+ ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + return latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + FusedAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
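As the comment above says, `guidance_scale` corresponds to the guidance weight `w` of eq. (2) in the Imagen paper, and a value of 1 (or below) disables classifier-free guidance. A minimal sketch with toy tensors of the combination that the k-diffusion `model_fn` later applies at every step:

```py
import torch

guidance_scale = 5.0  # w in eq. (2) of the Imagen paper; <= 1 means no guidance

# toy stand-ins for the unconditional and text-conditioned UNet predictions
noise_pred_uncond = torch.randn(1, 4, 128, 128)
noise_pred_text = torch.randn(1, 4, 128, 128)

# push the estimate away from the unconditional prediction, toward the conditioned one
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```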
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + use_karras_sigmas: Optional[bool] = False, + noise_sampler_seed: Optional[int] = None, + clip_skip: Optional[int] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
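Among the arguments above, `use_karras_sigmas` switches the noise levels used later in `__call__` from the scheduler's own sigmas to the schedule of Karras et al. (2022), obtained through k-diffusion's `get_sigmas_karras`. A pure-PyTorch sketch of that schedule; the `sigma_min`, `sigma_max`, and `rho` values here are illustrative defaults rather than values read from a real model:

```py
import torch

def karras_sigmas(n, sigma_min=0.0292, sigma_max=14.6146, rho=7.0):
    # interpolate in sigma**(1/rho) space, which concentrates steps near sigma_min
    ramp = torch.linspace(0, 1, n)
    min_inv_rho = sigma_min ** (1 / rho)
    max_inv_rho = sigma_max ** (1 / rho)
    sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
    return torch.cat([sigmas, sigmas.new_zeros(1)])  # trailing zero ends the sampling loop

print(karras_sigmas(10))
```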
+ negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
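The size and crop arguments described here are not used to resize anything inside the pipeline; `_get_add_time_ids` above packs them into a flat six-number micro-conditioning vector that is passed to the UNet next to the pooled text embedding. A toy illustration with assumed 1024x1024 defaults:

```py
import torch

# illustrative values; the pipeline defaults original_size and target_size to (height, width)
original_size, crops_coords_top_left, target_size = (1024, 1024), (0, 0), (1024, 1024)

# same packing as _get_add_time_ids: tuples concatenated into one 6-element row per image
add_time_ids = torch.tensor([list(original_size + crops_coords_top_left + target_size)], dtype=torch.float32)
print(add_time_ids)  # tensor([[1024., 1024., 0., 0., 1024., 1024.]])
```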
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + + Examples: + + Returns: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) + + if guidance_scale <= 1.0: + raise ValueError("has to use guidance_scale") + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. 
Encode input prompt + lora_scale = None + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=prompt_embeds.device) + + # 5. Prepare sigmas + if use_karras_sigmas: + sigma_min: float = self.k_diffusion_model.sigmas[0].item() + sigma_max: float = self.k_diffusion_model.sigmas[-1].item() + sigmas = get_sigmas_karras(n=num_inference_steps, sigma_min=sigma_min, sigma_max=sigma_max) + else: + sigmas = self.scheduler.sigmas + sigmas = sigmas.to(dtype=prompt_embeds.dtype, device=device) + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + latents = latents * sigmas[0] + + self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device) + self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(latents.device) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # 8. Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 9. 
Define model function + def model_fn(x, t): + latent_model_input = torch.cat([x] * 2) + t = torch.cat([t] * 2) + + noise_pred = self.k_diffusion_model( + latent_model_input, + t, + cond=prompt_embeds, + timestep_cond=timestep_cond, + added_cond_kwargs=added_cond_kwargs, + ) + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + return noise_pred + + # 10. Run k-diffusion solver + sampler_kwargs = {} + + if "noise_sampler" in inspect.signature(self.sampler).parameters: + min_sigma, max_sigma = sigmas[sigmas > 0].min(), sigmas.max() + noise_sampler = BrownianTreeNoiseSampler(latents, min_sigma, max_sigma, noise_sampler_seed) + sampler_kwargs["noise_sampler"] = noise_sampler + + if "generator" in inspect.signature(self.sampler).parameters: + sampler_kwargs["generator"] = generator + + latents = self.sampler(model_fn, latents, sigmas, **sampler_kwargs) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py new file mode 100644 index 000000000..dae2affdd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py @@ -0,0 +1,48 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_ldm3d"] = ["StableDiffusionLDM3DPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py new file mode 100644 index 000000000..c7c05feaf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py @@ -0,0 +1,985 @@ +# Copyright 2024 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessorLDM3D +from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + BaseOutput, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + >>> from diffusers import StableDiffusionLDM3DPipeline + + >>> pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c") + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> output = pipe(prompt) + >>> rgb_image, depth_image = output.rgb, output.depth + >>> rgb_image[0].save("astronaut_ldm3d_rgb.jpg") + >>> depth_image[0].save("astronaut_ldm3d_depth.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +@dataclass +class LDM3DPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + rgb (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + depth (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`List[bool]`) + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or + `None` if safety checking could not be performed. 
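`rescale_noise_cfg` above applies the standard-deviation fix from the paper it cites, and `guidance_rescale` controls how far the raw classifier-free-guidance result is blended toward the rescaled one. A self-contained sketch with toy tensors that inlines the same formula rather than importing the helper; the specific scale values are only illustrative:

```py
import torch

guidance_scale, guidance_rescale = 5.0, 0.7  # illustrative values

noise_pred_uncond = torch.randn(2, 4, 64, 64)
noise_pred_text = torch.randn(2, 4, 64, 64)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# match the std of the text-conditioned prediction (fixes overexposure), then blend back
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
std_cfg = noise_pred.std(dim=list(range(1, noise_pred.ndim)), keepdim=True)
rescaled = noise_pred * (std_text / std_cfg)
noise_pred = guidance_rescale * rescaled + (1 - guidance_rescale) * noise_pred
```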
+ """ + + rgb: Union[List[PIL.Image.Image], np.ndarray] + depth: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +class StableDiffusionLDM3DPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, + LoraLoaderMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image and 3D generation using LDM3D. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection], + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. 
For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + rgb_feature_extractor_input = feature_extractor_input[0] + safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + 
return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
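A brief reviewer note on the block above: the comment defines `guidance_scale` as the Imagen guidance weight `w`, and when the UNet is guidance-distilled (`unet.config.time_cond_proj_dim` is set) that weight is not applied at sampling time but is instead fed to the model through `get_guidance_scale_embedding`, which is a plain sinusoidal encoding of `w * 1000`. A minimal standalone sketch of that computation, with illustrative names that are not part of this diff:

```py
import torch

def sinusoidal_guidance_embedding(w: torch.Tensor, embedding_dim: int = 512) -> torch.Tensor:
    # w: 1-D tensor of guidance weights, one per sample in the batch
    w = w * 1000.0
    half_dim = embedding_dim // 2
    # geometric frequency ladder, as in the classic sinusoidal position/timestep embedding
    freqs = torch.exp(-torch.log(torch.tensor(10000.0)) / (half_dim - 1) * torch.arange(half_dim))
    angles = w[:, None] * freqs[None, :]
    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)
    if embedding_dim % 2 == 1:  # zero-pad odd embedding sizes
        emb = torch.nn.functional.pad(emb, (0, 1))
    return emb  # shape: (len(w), embedding_dim)

# Mirrors how the pipeline passes `guidance_scale - 1` for a single prompt,
# assuming a hypothetical distilled UNet with time_cond_proj_dim == 256.
emb = sinusoidal_guidance_embedding(torch.tensor([5.0 - 1.0]), embedding_dim=256)
assert emb.shape == (1, 256)
```

The pipeline only takes this path for such distilled (LCM-style) UNets; otherwise `timestep_cond` stays `None` and guidance is applied by batching the unconditional and text-conditional passes, as the property below encodes.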
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 49, + timesteps: List[int] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 5.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # 0. 
Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return ((rgb, depth), has_nsfw_concept) + + return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py new file mode 100644 index 000000000..f7572db72 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py @@ -0,0 +1,48 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + 
is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_panorama"] = ["StableDiffusionPanoramaPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py new file mode 100644 index 000000000..feda710e0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py @@ -0,0 +1,933 @@ +# Copyright 2024 MultiDiffusion Authors and The HuggingFace Team. All rights reserved." +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
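Before the pipeline implementation itself, a short note on the `__init__.py` added above: `StableDiffusionPanoramaPipeline` is only wired into diffusers' lazy-import machinery when both torch and transformers are available; otherwise the placeholder collected from `dummy_torch_and_transformers_objects` is exported instead, which (by diffusers' usual convention) raises a descriptive error only when the class is actually used. A small sketch of what that means for callers, assuming a standard diffusers install:

```py
from diffusers.utils import is_torch_available, is_transformers_available

if is_transformers_available() and is_torch_available():
    # Resolved on first access through the _LazyModule mapping registered above.
    from diffusers import StableDiffusionPanoramaPipeline
    print(StableDiffusionPanoramaPipeline.__name__)
else:
    print("Install torch and transformers to use the panorama pipeline.")
```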
+ +import copy +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import DDIMScheduler +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPanoramaPipeline, DDIMScheduler + + >>> model_ckpt = "stabilityai/stable-diffusion-2-base" + >>> scheduler = DDIMScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + >>> pipe = StableDiffusionPanoramaPipeline.from_pretrained( + ... model_ckpt, scheduler=scheduler, torch_dtype=torch.float16 + ... ) + + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of the dolomites" + >>> image = pipe(prompt).images[0] + ``` +""" + + +class StableDiffusionPanoramaPipeline( + DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin +): + r""" + Pipeline for text-to-image generation using MultiDiffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def decode_latents_with_padding(self, latents, padding=8): + # Add padding to latents for circular inference + # padding is the number of latents to add on each side + # it would slightly increase the memory usage, but remove the boundary artifacts + latents = 1 / self.vae.config.scaling_factor * latents + latents_left = latents[..., :padding] + latents_right = latents[..., -padding:] + latents = torch.cat((latents_right, latents, latents_left), axis=-1) + image = self.vae.decode(latents, return_dict=False)[0] + padding_pix = self.vae_scale_factor * padding + image = image[..., padding_pix:-padding_pix] + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def get_views(self, panorama_height, panorama_width, window_size=64, stride=8, circular_padding=False): + # Here, we define the mappings F_i (see Eq. 
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) + # if panorama's height/width < window_size, num_blocks of height/width should return 1 + panorama_height /= 8 + panorama_width /= 8 + num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1 + if circular_padding: + num_blocks_width = panorama_width // stride if panorama_width > window_size else 1 + else: + num_blocks_width = (panorama_width - window_size) // stride + 1 if panorama_width > window_size else 1 + total_num_blocks = int(num_blocks_height * num_blocks_width) + views = [] + for i in range(total_num_blocks): + h_start = int((i // num_blocks_width) * stride) + h_end = h_start + window_size + w_start = int((i % num_blocks_width) * stride) + w_end = w_start + window_size + views.append((h_start, h_end, w_start, w_end)) + return views + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = 512, + width: Optional[int] = 2048, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + view_batch_size: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + circular_padding: bool = False, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 2048): + The width in pixels of the generated image. The width is kept high because the pipeline is supposed + generate panorama-like images. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + view_batch_size (`int`, *optional*, defaults to 1): + The batch size to denoise split views. For some GPUs with high performance, higher view batch size can + speedup the generation and increase the VRAM usage. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + circular_padding (`bool`, *optional*, defaults to `False`): + If set to `True`, circular padding is applied to ensure there are no stitching artifacts. Circular + padding allows the model to seamlessly generate a transition from the rightmost part of the image to + the leftmost part, maintaining consistency in a 360-degree sense. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Define panorama grid and initialize views for synthesis. + # prepare batch grid + views = self.get_views(height, width, circular_padding=circular_padding) + views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)] + views_scheduler_status = [copy.deepcopy(self.scheduler.__dict__)] * len(views_batch) + count = torch.zeros_like(latents) + value = torch.zeros_like(latents) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 8. Denoising loop + # Each denoising step also includes refinement of the latents with respect to the + # views. + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + count.zero_() + value.zero_() + + # generate views + # Here, we iterate through different spatial crops of the latents and denoise them. These + # denoised (latent) crops are then averaged to produce the final latent + # for the current timestep via MultiDiffusion. Please see Sec. 4.1 in the + # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113 + # Batch views denoise + for j, batch_view in enumerate(views_batch): + vb_size = len(batch_view) + # get the latents corresponding to the current view coordinates + if circular_padding: + latents_for_view = [] + for h_start, h_end, w_start, w_end in batch_view: + if w_end > latents.shape[3]: + # Add circular horizontal padding + latent_view = torch.cat( + ( + latents[:, :, h_start:h_end, w_start:], + latents[:, :, h_start:h_end, : w_end - latents.shape[3]], + ), + axis=-1, + ) + else: + latent_view = latents[:, :, h_start:h_end, w_start:w_end] + latents_for_view.append(latent_view) + latents_for_view = torch.cat(latents_for_view) + else: + latents_for_view = torch.cat( + [ + latents[:, :, h_start:h_end, w_start:w_end] + for h_start, h_end, w_start, w_end in batch_view + ] + ) + + # rematch block's scheduler status + self.scheduler.__dict__.update(views_scheduler_status[j]) + + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + latents_for_view.repeat_interleave(2, dim=0) + if do_classifier_free_guidance + else latents_for_view + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # repeat prompt_embeds for batch + prompt_embeds_input = torch.cat([prompt_embeds] * vb_size) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds_input, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2] + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents_denoised_batch = self.scheduler.step( + noise_pred, t, latents_for_view, **extra_step_kwargs + ).prev_sample + + # save views scheduler status after sample + views_scheduler_status[j] = copy.deepcopy(self.scheduler.__dict__) + + # extract value from batch + for latents_view_denoised, (h_start, h_end, w_start, w_end) in zip( + latents_denoised_batch.chunk(vb_size), batch_view + ): + if circular_padding and w_end > latents.shape[3]: + # Case for circular padding + value[:, :, h_start:h_end, w_start:] += latents_view_denoised[ + :, :, h_start:h_end, : latents.shape[3] - w_start + ] + value[:, :, h_start:h_end, : w_end - latents.shape[3]] += latents_view_denoised[ + :, :, h_start:h_end, latents.shape[3] - w_start : + ] + count[:, :, h_start:h_end, w_start:] += 1 + count[:, :, 
h_start:h_end, : w_end - latents.shape[3]] += 1 + else: + value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised + count[:, :, h_start:h_end, w_start:w_end] += 1 + + # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113 + latents = torch.where(count > 0, value / count, value) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + if circular_padding: + image = self.decode_latents_with_padding(latents) + else: + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py new file mode 100644 index 000000000..b432b9418 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py @@ -0,0 +1,99 @@ +from dataclasses import dataclass +from enum import Enum +from typing import TYPE_CHECKING, List, Optional, Union + +import numpy as np +import PIL +from PIL import Image + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + BaseOutput, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +@dataclass +class SafetyConfig(object): + WEAK = { + "sld_warmup_steps": 15, + "sld_guidance_scale": 20, + "sld_threshold": 0.0, + "sld_momentum_scale": 0.0, + "sld_mom_beta": 0.0, + } + MEDIUM = { + "sld_warmup_steps": 10, + "sld_guidance_scale": 1000, + "sld_threshold": 0.01, + "sld_momentum_scale": 0.3, + "sld_mom_beta": 0.4, + } + STRONG = { + "sld_warmup_steps": 7, + "sld_guidance_scale": 2000, + "sld_threshold": 0.025, + "sld_momentum_scale": 0.5, + "sld_mom_beta": 0.7, + } + MAX = { + "sld_warmup_steps": 0, + "sld_guidance_scale": 5000, + "sld_threshold": 1.0, + "sld_momentum_scale": 0.5, + "sld_mom_beta": 0.7, + } + + +_dummy_objects = {} +_additional_imports = {} +_import_structure = {} + +_additional_imports.update({"SafetyConfig": SafetyConfig}) + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure.update( + { + "pipeline_output": ["StableDiffusionSafePipelineOutput"], + "pipeline_stable_diffusion_safe": ["StableDiffusionPipelineSafe"], + "safety_checker": 
["StableDiffusionSafetyChecker"], + } + ) + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_output import StableDiffusionSafePipelineOutput + from .pipeline_stable_diffusion_safe import StableDiffusionPipelineSafe + from .safety_checker import SafeStableDiffusionSafetyChecker + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) + for name, value in _additional_imports.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py new file mode 100644 index 000000000..69a064d66 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL.Image + +from ...utils import ( + BaseOutput, +) + + +@dataclass +class StableDiffusionSafePipelineOutput(BaseOutput): + """ + Output class for Safe Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_content_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images that were flagged by the safety checker any may contain "not-safe-for-work" + (nsfw) content, or `None` if no safety check was performed or no images were flagged. 
+ applied_safety_concept (`str`) + The safety concept that was applied for safety guidance, or `None` if safety guidance was disabled + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + unsafe_images: Optional[Union[List[PIL.Image.Image], np.ndarray]] + applied_safety_concept: Optional[str] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py new file mode 100644 index 000000000..ae74e0967 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py @@ -0,0 +1,764 @@ +import inspect +import warnings +from typing import Callable, List, Optional, Union + +import numpy as np +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...configuration_utils import FrozenDict +from ...image_processor import PipelineImageInput +from ...loaders import IPAdapterMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import deprecate, logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . import StableDiffusionSafePipelineOutput +from .safety_checker import SafeStableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class StableDiffusionPipelineSafe(DiffusionPipeline, StableDiffusionMixin, IPAdapterMixin): + r""" + Pipeline based on the [`StableDiffusionPipeline`] for text-to-image generation using Safe Latent Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
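Editor's note (illustrative, not part of the patch): the `SafetyConfig` presets defined in `stable_diffusion_safe/__init__.py` above are plain dicts of `sld_*` keyword arguments, so a caller can splat a preset into the safe pipeline and override individual fields. A minimal sketch, assuming the checkpoint named in the pipeline docstring and a hypothetical override of the warmup steps:

import torch
from diffusers import StableDiffusionPipelineSafe
from diffusers.pipelines.stable_diffusion_safe import SafetyConfig

pipe = StableDiffusionPipelineSafe.from_pretrained(
    "AIML-TUDA/stable-diffusion-safe", torch_dtype=torch.float16
).to("cuda")

# Start from the STRONG preset but relax the warmup (hypothetical tweak for illustration).
sld_kwargs = dict(SafetyConfig.STRONG, sld_warmup_steps=10)
image = pipe(prompt="a peaceful meadow at sunrise", **sld_kwargs).images[0]
image.save("meadow.png")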
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: SafeStableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + requires_safety_checker: bool = True, + ): + super().__init__() + safety_concept: Optional[str] = ( + "an image showing hate, harassment, violence, suffering, humiliation, harm, suicide, sexual, nudity," + " bodily fluids, blood, obscene gestures, illegal activity, drug use, theft, vandalism, weapons, child" + " abuse, brutality, cruelty" + ) + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self._safety_text_concept = safety_concept + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + @property + def safety_concept(self): + r""" + Getter method for the safety concept used with SLD + + Returns: + `str`: The text describing the safety concept + """ + return self._safety_text_concept + + @safety_concept.setter + def safety_concept(self, concept): + r""" + Setter method for the safety concept used with SLD + + Args: + concept (`str`): + The text of the new safety concept + """ + self._safety_text_concept = concept + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + enable_safety_guidance, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). 
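Editor's note (schematic sketch, not part of the patch, with assumed shapes): when safety guidance is enabled, `_encode_prompt` concatenates the embeddings as [negative, prompt, safety concept], so a single UNet forward pass yields all three noise predictions, which `__call__` later splits back with `chunk(3)` in the same order:

import torch

bs, seq_len, dim = 2, 77, 768  # assumed shapes for illustration
negative_prompt_embeds = torch.randn(bs, seq_len, dim)
prompt_embeds = torch.randn(bs, seq_len, dim)
safety_embeddings = torch.randn(bs, seq_len, dim)

# Batch layout used by the safe pipeline: (unconditional, prompt, safety concept).
stacked = torch.cat([negative_prompt_embeds, prompt_embeds, safety_embeddings])  # (3*bs, seq, dim)

# Stand-in for unet(latent_model_input, t, encoder_hidden_states=stacked).sample:
noise_pred = torch.randn(3 * bs, seq_len, dim)
noise_pred_uncond, noise_pred_text, noise_pred_safety = noise_pred.chunk(3)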
+ """ + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="pt").input_ids + + if not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # Encode the safety concept text + if enable_safety_guidance: + safety_concept_input = self.tokenizer( + [self._safety_text_concept], + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + safety_embeddings = self.text_encoder(safety_concept_input.input_ids.to(self.device))[0] + + # duplicate safety embeddings for each generation per prompt, using mps friendly method + seq_len = safety_embeddings.shape[1] + safety_embeddings = safety_embeddings.repeat(batch_size, num_images_per_prompt, 1) + safety_embeddings = safety_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance + sld, we need to do three forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing three forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds, safety_embeddings]) + + else: + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype, enable_safety_guidance): + if self.safety_checker is not None: + images = image.copy() + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + flagged_images = np.zeros((2, *image.shape[1:])) + if any(has_nsfw_concept): + logger.warning( + "Potential NSFW content was detected in one or more images. A black image will be returned" + " instead." + f"{'You may look at this images in the `unsafe_images` variable of the output at your own discretion.' if enable_safety_guidance else 'Try again with a different prompt and/or seed.'}" + ) + for idx, has_nsfw_concept in enumerate(has_nsfw_concept): + if has_nsfw_concept: + flagged_images[idx] = images[idx] + image[idx] = np.zeros(image[idx].shape) # black image + else: + has_nsfw_concept = None + flagged_images = None + return image, has_nsfw_concept, flagged_images + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) 
instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def perform_safety_guidance( + self, + enable_safety_guidance, + safety_momentum, + noise_guidance, + noise_pred_out, + i, + sld_guidance_scale, + sld_warmup_steps, + sld_threshold, + sld_momentum_scale, + sld_mom_beta, + ): + # Perform SLD guidance + if enable_safety_guidance: + if safety_momentum is None: + safety_momentum = torch.zeros_like(noise_guidance) + noise_pred_text, noise_pred_uncond = noise_pred_out[0], noise_pred_out[1] + noise_pred_safety_concept = noise_pred_out[2] + + # Equation 6 + scale = torch.clamp(torch.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0) + + # Equation 6 + safety_concept_scale = torch.where( + (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, torch.zeros_like(scale), scale + ) + + # Equation 4 + noise_guidance_safety = torch.mul((noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale) + + # Equation 7 + noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum + + # Equation 8 + safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety + + if i >= sld_warmup_steps: # Warmup + # Equation 3 + noise_guidance = noise_guidance - noise_guidance_safety + return noise_guidance, safety_momentum + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + 
image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + sld_guidance_scale: Optional[float] = 1000, + sld_warmup_steps: Optional[int] = 10, + sld_threshold: Optional[float] = 0.01, + sld_momentum_scale: Optional[float] = 0.3, + sld_mom_beta: Optional[float] = 0.4, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + sld_guidance_scale (`float`, *optional*, defaults to 1000): + If `sld_guidance_scale < 1`, safety guidance is disabled. + sld_warmup_steps (`int`, *optional*, defaults to 10): + Number of warmup steps for safety guidance. SLD is only be applied for diffusion steps greater than + `sld_warmup_steps`. + sld_threshold (`float`, *optional*, defaults to 0.01): + Threshold that separates the hyperplane between appropriate and inappropriate images. + sld_momentum_scale (`float`, *optional*, defaults to 0.3): + Scale of the SLD momentum to be added to the safety guidance at each diffusion step. If set to 0.0, + momentum is disabled. Momentum is built up during warmup for diffusion steps smaller than + `sld_warmup_steps`. + sld_mom_beta (`float`, *optional*, defaults to 0.4): + Defines how safety guidance momentum builds up. `sld_mom_beta` indicates how much of the previous + momentum is kept. Momentum is built up during warmup for diffusion steps smaller than + `sld_warmup_steps`. + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + + Examples: + + ```py + import torch + from diffusers import StableDiffusionPipelineSafe + from diffusers.pipelines.stable_diffusion_safe import SafetyConfig + + pipeline = StableDiffusionPipelineSafe.from_pretrained( + "AIML-TUDA/stable-diffusion-safe", torch_dtype=torch.float16 + ).to("cuda") + prompt = "the four horsewomen of the apocalypse, painting by tom of finland, gaston bussiere, craig mullins, j. c. leyendecker" + image = pipeline(prompt=prompt, **SafetyConfig.MEDIUM).images[0] + ``` + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, height, width, callback_steps) + + # 2. Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
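+        # When SLD is active (`enable_safety_guidance` below), each denoising step batches three
+        # predictions in one UNet pass, ordered (unconditional, prompt, safety concept); otherwise
+        # the standard two-way classifier-free-guidance batch is used.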
+ do_classifier_free_guidance = guidance_scale > 1.0 + + enable_safety_guidance = sld_guidance_scale > 1.0 and do_classifier_free_guidance + if not enable_safety_guidance: + warnings.warn("Safety checker disabled!") + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + if enable_safety_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds, image_embeds]) + else: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, enable_safety_guidance + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + safety_momentum = None + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * (3 if enable_safety_guidance else 2)) + if do_classifier_free_guidance + else latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=prompt_embeds, added_cond_kwargs=added_cond_kwargs + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_out = noise_pred.chunk((3 if enable_safety_guidance else 2)) + noise_pred_uncond, noise_pred_text = noise_pred_out[0], noise_pred_out[1] + + # default classifier free guidance + noise_guidance = noise_pred_text - noise_pred_uncond + + # Perform SLD guidance + if enable_safety_guidance: + if safety_momentum is None: + safety_momentum = torch.zeros_like(noise_guidance) + noise_pred_safety_concept = noise_pred_out[2] + + # Equation 6 + scale = torch.clamp( + torch.abs((noise_pred_text - noise_pred_safety_concept)) * sld_guidance_scale, max=1.0 + ) + + # Equation 6 + safety_concept_scale = torch.where( + (noise_pred_text - noise_pred_safety_concept) >= sld_threshold, + torch.zeros_like(scale), + scale, + ) + + # Equation 4 + noise_guidance_safety = torch.mul( + (noise_pred_safety_concept - noise_pred_uncond), safety_concept_scale + ) + + # Equation 7 + noise_guidance_safety = noise_guidance_safety + sld_momentum_scale * safety_momentum + + # Equation 8 + safety_momentum = sld_mom_beta * safety_momentum + (1 - sld_mom_beta) * noise_guidance_safety + + if i >= sld_warmup_steps: # Warmup + # Equation 3 + noise_guidance = noise_guidance - noise_guidance_safety + + noise_pred = noise_pred_uncond + guidance_scale * noise_guidance + + # compute the previous noisy sample x_t -> x_t-1 + 
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept, flagged_images = self.run_safety_checker( + image, device, prompt_embeds.dtype, enable_safety_guidance + ) + + # 10. Convert to PIL + if output_type == "pil": + image = self.numpy_to_pil(image) + if flagged_images is not None: + flagged_images = self.numpy_to_pil(flagged_images) + + if not return_dict: + return ( + image, + has_nsfw_concept, + self._safety_text_concept if enable_safety_guidance else None, + flagged_images, + ) + + return StableDiffusionSafePipelineOutput( + images=image, + nsfw_content_detected=has_nsfw_concept, + applied_safety_concept=self._safety_text_concept if enable_safety_guidance else None, + unsafe_images=flagged_images, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py new file mode 100644 index 000000000..549747e97 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py @@ -0,0 +1,109 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
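Editor's note (illustrative sketch, not part of the patch): the `StableDiffusionSafePipelineOutput` returned above exposes `nsfw_content_detected`, `unsafe_images`, and `applied_safety_concept` alongside `images`. A self-contained example of consuming those fields, assuming the checkpoint from the docstring example:

import torch
from diffusers import StableDiffusionPipelineSafe
from diffusers.pipelines.stable_diffusion_safe import SafetyConfig

pipe = StableDiffusionPipelineSafe.from_pretrained(
    "AIML-TUDA/stable-diffusion-safe", torch_dtype=torch.float16
).to("cuda")

out = pipe("a cozy cabin in the woods", **SafetyConfig.MEDIUM)
print("applied safety concept:", out.applied_safety_concept)  # None when SLD is disabled

flags = out.nsfw_content_detected or [False] * len(out.images)
for idx, (img, flagged) in enumerate(zip(out.images, flags)):
    if flagged:
        # out.images[idx] has been blacked out; the flagged original (if kept)
        # is available in out.unsafe_images.
        print(f"image {idx} was flagged by the safety checker")
    else:
        img.save(f"sample_{idx}.png")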
+ +import torch +import torch.nn as nn +from transformers import CLIPConfig, CLIPVisionModel, PreTrainedModel + +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +def cosine_distance(image_embeds, text_embeds): + normalized_image_embeds = nn.functional.normalize(image_embeds) + normalized_text_embeds = nn.functional.normalize(text_embeds) + return torch.mm(normalized_image_embeds, normalized_text_embeds.t()) + + +class SafeStableDiffusionSafetyChecker(PreTrainedModel): + config_class = CLIPConfig + + _no_split_modules = ["CLIPEncoderLayer"] + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + self.vision_model = CLIPVisionModel(config.vision_config) + self.visual_projection = nn.Linear(config.vision_config.hidden_size, config.projection_dim, bias=False) + + self.concept_embeds = nn.Parameter(torch.ones(17, config.projection_dim), requires_grad=False) + self.special_care_embeds = nn.Parameter(torch.ones(3, config.projection_dim), requires_grad=False) + + self.concept_embeds_weights = nn.Parameter(torch.ones(17), requires_grad=False) + self.special_care_embeds_weights = nn.Parameter(torch.ones(3), requires_grad=False) + + @torch.no_grad() + def forward(self, clip_input, images): + pooled_output = self.vision_model(clip_input)[1] # pooled_output + image_embeds = self.visual_projection(pooled_output) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds).cpu().float().numpy() + cos_dist = cosine_distance(image_embeds, self.concept_embeds).cpu().float().numpy() + + result = [] + batch_size = image_embeds.shape[0] + for i in range(batch_size): + result_img = {"special_scores": {}, "special_care": [], "concept_scores": {}, "bad_concepts": []} + + # increase this value to create a stronger `nfsw` filter + # at the cost of increasing the possibility of filtering benign images + adjustment = 0.0 + + for concept_idx in range(len(special_cos_dist[0])): + concept_cos = special_cos_dist[i][concept_idx] + concept_threshold = self.special_care_embeds_weights[concept_idx].item() + result_img["special_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) + if result_img["special_scores"][concept_idx] > 0: + result_img["special_care"].append({concept_idx, result_img["special_scores"][concept_idx]}) + adjustment = 0.01 + + for concept_idx in range(len(cos_dist[0])): + concept_cos = cos_dist[i][concept_idx] + concept_threshold = self.concept_embeds_weights[concept_idx].item() + result_img["concept_scores"][concept_idx] = round(concept_cos - concept_threshold + adjustment, 3) + if result_img["concept_scores"][concept_idx] > 0: + result_img["bad_concepts"].append(concept_idx) + + result.append(result_img) + + has_nsfw_concepts = [len(res["bad_concepts"]) > 0 for res in result] + + return images, has_nsfw_concepts + + @torch.no_grad() + def forward_onnx(self, clip_input: torch.FloatTensor, images: torch.FloatTensor): + pooled_output = self.vision_model(clip_input)[1] # pooled_output + image_embeds = self.visual_projection(pooled_output) + + special_cos_dist = cosine_distance(image_embeds, self.special_care_embeds) + cos_dist = cosine_distance(image_embeds, self.concept_embeds) + + # increase this value to create a stronger `nsfw` filter + # at the cost of increasing the possibility of filtering benign images + adjustment = 0.0 + + special_scores = special_cos_dist - self.special_care_embeds_weights + adjustment + 
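+        # Images with any special-care score above 0 get a 0.01 adjustment added to all
+        # concept scores below, tightening the NSFW filter for those images.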
# special_scores = special_scores.round(decimals=3) + special_care = torch.any(special_scores > 0, dim=1) + special_adjustment = special_care * 0.01 + special_adjustment = special_adjustment.unsqueeze(1).expand(-1, cos_dist.shape[1]) + + concept_scores = (cos_dist - self.concept_embeds_weights) + special_adjustment + # concept_scores = concept_scores.round(decimals=3) + has_nsfw_concepts = torch.any(concept_scores > 0, dim=1) + + return images, has_nsfw_concepts diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py new file mode 100644 index 000000000..378e0e578 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py @@ -0,0 +1,48 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_sag"] = ["StableDiffusionSAGPipeline"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py new file mode 100644 index 000000000..96aa006d2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py @@ -0,0 +1,886 @@ +# Copyright 2024 Susung Hong and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
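Editor's note (compact sketch, not part of the patch): the thresholding scheme applied by the safety checker above, restated as a standalone function with random stand-in embeddings and thresholds rather than the real checkpoint weights:

import torch

def nsfw_flags(image_embeds, concept_embeds, concept_thresholds,
               special_embeds, special_thresholds):
    # Cosine similarity between L2-normalized image and concept embeddings.
    def cos(a, b):
        return torch.nn.functional.normalize(a) @ torch.nn.functional.normalize(b).t()

    special_scores = cos(image_embeds, special_embeds) - special_thresholds
    # Any "special care" hit tightens all concept thresholds for that image by 0.01.
    special_adjustment = torch.any(special_scores > 0, dim=1, keepdim=True) * 0.01
    concept_scores = cos(image_embeds, concept_embeds) - concept_thresholds + special_adjustment
    return torch.any(concept_scores > 0, dim=1)

# Random stand-ins: 2 images, 17 concepts, 3 special-care concepts, 768-dim projection.
flags = nsfw_flags(
    torch.randn(2, 768),
    torch.randn(17, 768), torch.rand(17),
    torch.randn(3, 768), torch.rand(3),
)
print(flags)  # per-image boolean flags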
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionPipelineOutput +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionSAGPipeline + + >>> pipe = StableDiffusionSAGPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt, sag_scale=0.75).images[0] + ``` +""" + + +# processes and stores attention probabilities +class CrossAttnStoreProcessor: + def __init__(self): + self.attention_probs = None + + def __call__( + self, + attn, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + ): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + self.attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(self.attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +# Modified to get self-attention guidance scale in this paper (https://arxiv.org/pdf/2210.00939.pdf) as an input +class StableDiffusionSAGPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, IPAdapterMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: Optional[CLIPVisionModelWithProjection] = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + sag_scale: float = 0.75, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + sag_scale (`float`, *optional*, defaults to 0.75): + Chosen between [0, 1.0] for better quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. 
Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): + Optional image input to work with IP Adapters. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
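# ---- Editor's note: illustrative sketch, not part of the patch ----
# Classifier-free guidance as applied later in the denoising loop: the UNet is
# run on a doubled batch (unconditional + text-conditioned), its prediction is
# split in two, and the halves are blended with `guidance_scale`. The tensor
# shapes are placeholders; only the arithmetic is demonstrated.
import torch

guidance_scale = 7.5
noise_pred = torch.randn(2, 4, 64, 64)                    # doubled batch, as from torch.cat([latents] * 2)
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)  # unconditional / conditional halves
guided = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
assert guided.shape == (1, 4, 64, 64)
# With guidance_scale == 1.0 the expression reduces to noise_pred_text, i.e. no guidance.
# --------------------------------------------------------------------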
+ do_classifier_free_guidance = guidance_scale > 1.0 + # and `sag_scale` is` `s` of equation (16) + # of the self-attentnion guidance paper: https://arxiv.org/pdf/2210.00939.pdf + # `sag_scale = 0` means no self-attention guidance + do_self_attention_guidance = sag_scale > 0.0 + + if ip_adapter_image is not None: + output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True + image_embeds, negative_image_embeds = self.encode_image( + ip_adapter_image, device, num_images_per_prompt, output_hidden_state + ) + if do_classifier_free_guidance: + image_embeds = torch.cat([negative_image_embeds, image_embeds]) + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + if timesteps.dtype not in [torch.int16, torch.int32, torch.int64]: + raise ValueError( + f"{self.__class__.__name__} does not support using a scheduler of type {self.scheduler.__class__.__name__}. Please make sure to use one of 'DDIMScheduler, PNDMScheduler, DDPMScheduler, DEISMultistepScheduler, UniPCMultistepScheduler, DPMSolverMultistepScheduler, DPMSolverSinlgestepScheduler'." + ) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + added_uncond_kwargs = {"image_embeds": negative_image_embeds} if ip_adapter_image is not None else None + + # 7. 
Denoising loop + store_processor = CrossAttnStoreProcessor() + self.unet.mid_block.attentions[0].transformer_blocks[0].attn1.processor = store_processor + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + map_size = None + + def get_map_size(module, input, output): + nonlocal map_size + map_size = output[0].shape[-2:] + + with self.unet.mid_block.attentions[0].register_forward_hook(get_map_size): + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # perform self-attention guidance with the stored self-attentnion map + if do_self_attention_guidance: + # classifier-free guidance produces two chunks of attention map + # and we only use unconditional one according to equation (25) + # in https://arxiv.org/pdf/2210.00939.pdf + if do_classifier_free_guidance: + # DDIM-like prediction of x0 + pred_x0 = self.pred_x0(latents, noise_pred_uncond, t) + # get the stored attention maps + uncond_attn, cond_attn = store_processor.attention_probs.chunk(2) + # self-attention-based degrading of latents + degraded_latents = self.sag_masking( + pred_x0, uncond_attn, map_size, t, self.pred_epsilon(latents, noise_pred_uncond, t) + ) + uncond_emb, _ = prompt_embeds.chunk(2) + # forward and give guidance + degraded_pred = self.unet( + degraded_latents, + t, + encoder_hidden_states=uncond_emb, + added_cond_kwargs=added_uncond_kwargs, + ).sample + noise_pred += sag_scale * (noise_pred_uncond - degraded_pred) + else: + # DDIM-like prediction of x0 + pred_x0 = self.pred_x0(latents, noise_pred, t) + # get the stored attention maps + cond_attn = store_processor.attention_probs + # self-attention-based degrading of latents + degraded_latents = self.sag_masking( + pred_x0, cond_attn, map_size, t, self.pred_epsilon(latents, noise_pred, t) + ) + # forward and give guidance + degraded_pred = self.unet( + degraded_latents, + t, + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + ).sample + noise_pred += sag_scale * (noise_pred - degraded_pred) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not 
has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + def sag_masking(self, original_latents, attn_map, map_size, t, eps): + # Same masking process as in SAG paper: https://arxiv.org/pdf/2210.00939.pdf + bh, hw1, hw2 = attn_map.shape + b, latent_channel, latent_h, latent_w = original_latents.shape + h = self.unet.config.attention_head_dim + if isinstance(h, list): + h = h[-1] + + # Produce attention mask + attn_map = attn_map.reshape(b, h, hw1, hw2) + attn_mask = attn_map.mean(1, keepdim=False).sum(1, keepdim=False) > 1.0 + attn_mask = ( + attn_mask.reshape(b, map_size[0], map_size[1]) + .unsqueeze(1) + .repeat(1, latent_channel, 1, 1) + .type(attn_map.dtype) + ) + attn_mask = F.interpolate(attn_mask, (latent_h, latent_w)) + + # Blur according to the self-attention mask + degraded_latents = gaussian_blur_2d(original_latents, kernel_size=9, sigma=1.0) + degraded_latents = degraded_latents * attn_mask + original_latents * (1 - attn_mask) + + # Noise it again to match the noise level + degraded_latents = self.scheduler.add_noise(degraded_latents, noise=eps, timesteps=t[None]) + + return degraded_latents + + # Modified from diffusers.schedulers.scheduling_ddim.DDIMScheduler.step + # Note: there are some schedulers that clip or do not return x_0 (PNDMScheduler, DDIMScheduler, etc.) + def pred_x0(self, sample, model_output, timestep): + alpha_prod_t = self.scheduler.alphas_cumprod[timestep].to(sample.device) + + beta_prod_t = 1 - alpha_prod_t + if self.scheduler.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.scheduler.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.scheduler.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + # predict V + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," + " or `v_prediction`" + ) + + return pred_original_sample + + def pred_epsilon(self, sample, model_output, timestep): + alpha_prod_t = self.scheduler.alphas_cumprod[timestep] + + beta_prod_t = 1 - alpha_prod_t + if self.scheduler.config.prediction_type == "epsilon": + pred_eps = model_output + elif self.scheduler.config.prediction_type == "sample": + pred_eps = (sample - (alpha_prod_t**0.5) * model_output) / (beta_prod_t**0.5) + elif self.scheduler.config.prediction_type == "v_prediction": + pred_eps = (beta_prod_t**0.5) * sample + (alpha_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.scheduler.config.prediction_type} must be one of `epsilon`, `sample`," + " or `v_prediction`" + ) + + return pred_eps + + +# Gaussian blur +def gaussian_blur_2d(img, kernel_size, sigma): + ksize_half = (kernel_size - 1) * 0.5 + + x = torch.linspace(-ksize_half, ksize_half, steps=kernel_size) + + pdf = torch.exp(-0.5 * (x / sigma).pow(2)) + + x_kernel = pdf / pdf.sum() + x_kernel = x_kernel.to(device=img.device, dtype=img.dtype) + + kernel2d = torch.mm(x_kernel[:, None], x_kernel[None, :]) + kernel2d = 
kernel2d.expand(img.shape[-3], 1, kernel2d.shape[0], kernel2d.shape[1]) + + padding = [kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2] + + img = F.pad(img, padding, mode="reflect") + img = F.conv2d(img, kernel2d, groups=img.shape[-3]) + + return img diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py new file mode 100644 index 000000000..8088fbcfc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py @@ -0,0 +1,76 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_flax_available, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_additional_imports = {} +_import_structure = {"pipeline_output": ["StableDiffusionXLPipelineOutput"]} + +if is_transformers_available() and is_flax_available(): + _import_structure["pipeline_output"].extend(["FlaxStableDiffusionXLPipelineOutput"]) +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_xl"] = ["StableDiffusionXLPipeline"] + _import_structure["pipeline_stable_diffusion_xl_img2img"] = ["StableDiffusionXLImg2ImgPipeline"] + _import_structure["pipeline_stable_diffusion_xl_inpaint"] = ["StableDiffusionXLInpaintPipeline"] + _import_structure["pipeline_stable_diffusion_xl_instruct_pix2pix"] = ["StableDiffusionXLInstructPix2PixPipeline"] + +if is_transformers_available() and is_flax_available(): + from ...schedulers.scheduling_pndm_flax import PNDMSchedulerState + + _additional_imports.update({"PNDMSchedulerState": PNDMSchedulerState}) + _import_structure["pipeline_flax_stable_diffusion_xl"] = ["FlaxStableDiffusionXLPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_stable_diffusion_xl import StableDiffusionXLPipeline + from .pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipeline + from .pipeline_stable_diffusion_xl_inpaint import StableDiffusionXLInpaintPipeline + from .pipeline_stable_diffusion_xl_instruct_pix2pix import StableDiffusionXLInstructPix2PixPipeline + + try: + if not (is_transformers_available() and is_flax_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_flax_objects import * + else: + from .pipeline_flax_stable_diffusion_xl import ( + FlaxStableDiffusionXLPipeline, + ) + from .pipeline_output import FlaxStableDiffusionXLPipelineOutput + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) + for name, value in _additional_imports.items(): + 
setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py new file mode 100644 index 000000000..77363b254 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py @@ -0,0 +1,308 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +from typing import Dict, List, Optional, Union + +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict +from transformers import CLIPTokenizer, FlaxCLIPTextModel + +from diffusers.utils import logging + +from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel +from ...schedulers import ( + FlaxDDIMScheduler, + FlaxDPMSolverMultistepScheduler, + FlaxLMSDiscreteScheduler, + FlaxPNDMScheduler, +) +from ..pipeline_flax_utils import FlaxDiffusionPipeline +from .pipeline_output import FlaxStableDiffusionXLPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Set to True to use python for loop instead of jax.fori_loop for easier debugging +DEBUG = False + + +class FlaxStableDiffusionXLPipeline(FlaxDiffusionPipeline): + def __init__( + self, + text_encoder: FlaxCLIPTextModel, + text_encoder_2: FlaxCLIPTextModel, + vae: FlaxAutoencoderKL, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: FlaxUNet2DConditionModel, + scheduler: Union[ + FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler + ], + dtype: jnp.dtype = jnp.float32, + ): + super().__init__() + self.dtype = dtype + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + def prepare_inputs(self, prompt: Union[str, List[str]]): + if not isinstance(prompt, (str, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + # Assume we have the two encoders + inputs = [] + for tokenizer in [self.tokenizer, self.tokenizer_2]: + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="np", + ) + inputs.append(text_inputs.input_ids) + inputs = jnp.stack(inputs, axis=1) + return inputs + + def __call__( + self, + prompt_ids: jax.Array, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int = 50, + guidance_scale: Union[float, jax.Array] = 7.5, + height: Optional[int] = None, + width: Optional[int] = None, + latents: jnp.array = None, + 
neg_prompt_ids: jnp.array = None, + return_dict: bool = True, + output_type: str = None, + jit: bool = False, + ): + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + if isinstance(guidance_scale, float) and jit: + # Convert to a tensor so each device gets a copy. + guidance_scale = jnp.array([guidance_scale] * prompt_ids.shape[0]) + guidance_scale = guidance_scale[:, None] + + return_latents = output_type == "latent" + + if jit: + images = _p_generate( + self, + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + return_latents, + ) + else: + images = self._generate( + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + return_latents, + ) + + if not return_dict: + return (images,) + + return FlaxStableDiffusionXLPipelineOutput(images=images) + + def get_embeddings(self, prompt_ids: jnp.array, params): + # We assume we have the two encoders + + # bs, encoder_input, seq_length + te_1_inputs = prompt_ids[:, 0, :] + te_2_inputs = prompt_ids[:, 1, :] + + prompt_embeds = self.text_encoder(te_1_inputs, params=params["text_encoder"], output_hidden_states=True) + prompt_embeds = prompt_embeds["hidden_states"][-2] + prompt_embeds_2_out = self.text_encoder_2( + te_2_inputs, params=params["text_encoder_2"], output_hidden_states=True + ) + prompt_embeds_2 = prompt_embeds_2_out["hidden_states"][-2] + text_embeds = prompt_embeds_2_out["text_embeds"] + prompt_embeds = jnp.concatenate([prompt_embeds, prompt_embeds_2], axis=-1) + return prompt_embeds, text_embeds + + def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, bs, dtype): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = jnp.array([add_time_ids] * bs, dtype=dtype) + return add_time_ids + + def _generate( + self, + prompt_ids: jnp.array, + params: Union[Dict, FrozenDict], + prng_seed: jax.Array, + num_inference_steps: int, + height: int, + width: int, + guidance_scale: float, + latents: Optional[jnp.array] = None, + neg_prompt_ids: Optional[jnp.array] = None, + return_latents=False, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + # Encode input prompt + prompt_embeds, pooled_embeds = self.get_embeddings(prompt_ids, params) + + # Get unconditional embeddings + batch_size = prompt_embeds.shape[0] + if neg_prompt_ids is None: + neg_prompt_embeds = jnp.zeros_like(prompt_embeds) + negative_pooled_embeds = jnp.zeros_like(pooled_embeds) + else: + neg_prompt_embeds, negative_pooled_embeds = self.get_embeddings(neg_prompt_ids, params) + + add_time_ids = self._get_add_time_ids( + (height, width), (0, 0), (height, width), prompt_embeds.shape[0], dtype=prompt_embeds.dtype + ) + + prompt_embeds = jnp.concatenate([neg_prompt_embeds, prompt_embeds], axis=0) # (2, 77, 2048) + add_text_embeds = jnp.concatenate([negative_pooled_embeds, pooled_embeds], axis=0) + add_time_ids = jnp.concatenate([add_time_ids, add_time_ids], axis=0) + + # Ensure model output will be `float32` before going into the scheduler + guidance_scale = jnp.array([guidance_scale], dtype=jnp.float32) + + # Create random latents + latents_shape = ( + batch_size, + self.unet.config.in_channels, + height // self.vae_scale_factor, + width // 
self.vae_scale_factor, + ) + if latents is None: + latents = jax.random.normal(prng_seed, shape=latents_shape, dtype=jnp.float32) + else: + if latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + + # Prepare scheduler state + scheduler_state = self.scheduler.set_timesteps( + params["scheduler"], num_inference_steps=num_inference_steps, shape=latents.shape + ) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * scheduler_state.init_noise_sigma + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # Denoising loop + def loop_body(step, args): + latents, scheduler_state = args + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + latents_input = jnp.concatenate([latents] * 2) + + t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step] + timestep = jnp.broadcast_to(t, latents_input.shape[0]) + + latents_input = self.scheduler.scale_model_input(scheduler_state, latents_input, t) + + # predict the noise residual + noise_pred = self.unet.apply( + {"params": params["unet"]}, + jnp.array(latents_input), + jnp.array(timestep, dtype=jnp.int32), + encoder_hidden_states=prompt_embeds, + added_cond_kwargs=added_cond_kwargs, + ).sample + # perform guidance + noise_pred_uncond, noise_prediction_text = jnp.split(noise_pred, 2, axis=0) + noise_pred = noise_pred_uncond + guidance_scale * (noise_prediction_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple() + return latents, scheduler_state + + if DEBUG: + # run with python for loop + for i in range(num_inference_steps): + latents, scheduler_state = loop_body(i, (latents, scheduler_state)) + else: + latents, _ = jax.lax.fori_loop(0, num_inference_steps, loop_body, (latents, scheduler_state)) + + if return_latents: + return latents + + # Decode latents + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample + + image = (image / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1) + return image + + +# Static argnums are pipe, num_inference_steps, height, width, return_latents. A change would trigger recompilation. +# Non-static args are (sharded) input tensors mapped over their first dimension (hence, `0`). 
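# ---- Editor's note: illustrative sketch, not part of the patch ----
# The decorator below combines `in_axes` and `static_broadcasted_argnums`:
# static arguments are plain Python values (changing them retriggers
# compilation), while arguments mapped with axis 0 are sharded across devices
# along their leading dimension. A toy function makes the behaviour visible;
# the shapes are arbitrary.
from functools import partial

import jax
import jax.numpy as jnp

n_dev = jax.local_device_count()

@partial(jax.pmap, in_axes=(0, None), static_broadcasted_argnums=(1,))
def scaled_sum(x, num_steps):  # x is sharded per device, num_steps is static
    return x.sum() * num_steps

x = jnp.ones((n_dev, 4))       # leading axis must equal the device count
out = scaled_sum(x, 5)         # one scalar result per device
assert out.shape == (n_dev,)
# --------------------------------------------------------------------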
+@partial( + jax.pmap, + in_axes=(None, 0, 0, 0, None, None, None, 0, 0, 0, None), + static_broadcasted_argnums=(0, 4, 5, 6, 10), +) +def _p_generate( + pipe, + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + return_latents, +): + return pipe._generate( + prompt_ids, + params, + prng_seed, + num_inference_steps, + height, + width, + guidance_scale, + latents, + neg_prompt_ids, + return_latents, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py new file mode 100644 index 000000000..0783f4448 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py @@ -0,0 +1,37 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image + +from ...utils import BaseOutput, is_flax_available + + +@dataclass +class StableDiffusionXLPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + + +if is_flax_available(): + import flax + + @flax.struct.dataclass + class FlaxStableDiffusionXLPipelineOutput(BaseOutput): + """ + Output class for Flax Stable Diffusion XL pipelines. + + Args: + images (`np.ndarray`) + Array of shape `(batch_size, height, width, num_channels)` with images from the diffusion pipeline. + """ + + images: np.ndarray diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py new file mode 100644 index 000000000..776696e9d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -0,0 +1,1266 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
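# ---- Editor's note: illustrative usage sketch, not part of the patch ----
# The output classes defined in pipeline_output.py above follow the usual
# diffusers convention: with `return_dict=True` a pipeline returns the
# dataclass (attribute access), with `return_dict=False` a plain tuple. The
# snippet assumes the patched package is importable as `diffusers` and uses a
# dummy image batch instead of a real generation.
import numpy as np

from diffusers.pipelines.stable_diffusion_xl.pipeline_output import (
    StableDiffusionXLPipelineOutput,
)

dummy_images = np.zeros((1, 64, 64, 3), dtype=np.float32)  # placeholder batch
out = StableDiffusionXLPipelineOutput(images=dummy_images)

assert out.images.shape == (1, 64, 64, 3)     # attribute access on the dataclass
assert out["images"].shape == (1, 64, 64, 3)  # BaseOutput also allows key access
# --------------------------------------------------------------------------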
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + FusedAttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + is_invisible_watermark_available, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from .watermark import StableDiffusionXLWatermarker + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLPipeline + + >>> pipe = StableDiffusionXLPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. 
+ device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionXLPipeline( + DiffusionPipeline, + StableDiffusionMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, + IPAdapterMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. 
+ scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to + watermark output images. If not defined, it will default to True if the package is installed, otherwise no + watermarker will be used. + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "negative_add_time_ids", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = self.unet.config.sample_size + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + FusedAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
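+ # Editorial illustration (comment only, not part of the upstream diffusers code): in the denoising + # loop below, classifier-free guidance combines the two UNet predictions as + # noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond), + # so `guidance_scale == 1` reduces to the purely text-conditioned prediction (no extra guidance).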
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings.
Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of the same length as the number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_rescale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution.
Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be the same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs.
Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. 
Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 8. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9. 
Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = hasattr(self.vae.config, "latents_mean") and 
self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py new file mode 100644 index 000000000..fd4c412f4 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -0,0 +1,1442 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import PIL.Image +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + is_invisible_watermark_available, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from .watermark import StableDiffusionXLWatermarker + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLImg2ImgPipeline + >>> from diffusers.utils import load_image + + >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png" + + >>> init_image = load_image(url).convert("RGB") + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt, image=init_image).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionXLImg2ImgPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. 
Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`): + Whether the `unet` requires an `aesthetic_score` condition to be passed during inference. Also see the + config of `stabilityai/stable-diffusion-xl-refiner-1-0`. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to + watermark output images. If not defined, it will default to True if the package is installed, otherwise no + watermarker will be used. 
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "add_neg_time_ids", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + requires_aesthetics_score: bool = False, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. 
If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( 
+ "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + strength, + num_inference_steps, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if num_inference_steps is None: + raise ValueError("`num_inference_steps` cannot be None.") + elif not isinstance(num_inference_steps, int) or num_inference_steps <= 0: + raise ValueError( + f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type" + f" {type(num_inference_steps)}." + ) + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): + # get the original timestep using init_timestep + if denoising_start is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + else: + t_start = 0 + + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + # Strength is irrelevant if we directly request a timestep to start at; + # that is, strength is determined by the denoising_start instead. + if denoising_start is not None: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_start * self.scheduler.config.num_train_timesteps) + ) + ) + + num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item() + if self.scheduler.order == 2 and num_inference_steps % 2 == 0: + # if the scheduler is a 2nd order scheduler we might have to do +1 + # because `num_inference_steps` might be even given that every timestep + # (except the highest one) is duplicated. If `num_inference_steps` is even it would + # mean that we cut the timesteps in the middle of the denoising step + # (between 1st and 2nd devirative) which leads to incorrect results. 
By adding 1 + # we ensure that the denoising process always ends after the 2nd derivate step of the scheduler + num_inference_steps = num_inference_steps + 1 + + # because t_n+1 >= t_n, we slice the timesteps starting from the end + timesteps = timesteps[-num_inference_steps:] + return timesteps, num_inference_steps + + return timesteps, num_inference_steps - t_start + + def prepare_latents( + self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None, add_noise=True + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + # Offload text encoder if `enable_model_cpu_offload` was enabled + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.text_encoder_2.to("cpu") + torch.cuda.empty_cache() + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + init_latents = init_latents.to(dtype) + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + if add_noise: + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if ( + expected_add_embed_dim > passed_add_embed_dim + and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + ) + elif ( + expected_add_embed_dim < passed_add_embed_dim + and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. 
Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." + ) + elif expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
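For reference, this is how the guidance weight described in the comment above is applied in the denoising loop further down. The sketch below is standalone, uses illustrative tensor shapes, and mirrors the `rescale_noise_cfg` helper together with the `noise_pred` combination performed in `__call__`; it is not part of the patched file itself.

```py
# Standalone sketch of the classifier-free guidance step used in the denoising loop.
# Shapes are illustrative; `noise_pred` stands for the UNet output on the doubled batch
# (unconditional half first, text-conditioned half second).
import torch

guidance_scale = 5.0
guidance_rescale = 0.7

noise_pred = torch.randn(2 * 4, 4, 128, 128)  # 2 * batch_size latents of shape (4, 128, 128)

noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

# Optional rescaling (rescale_noise_cfg): match the std of the guided prediction
# to the std of the text-conditioned prediction to counter overexposure.
std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
std_cfg = noise_pred.std(dim=list(range(1, noise_pred.ndim)), keepdim=True)
rescaled = noise_pred * (std_text / std_cfg)
noise_pred = guidance_rescale * rescaled + (1 - guidance_rescale) * noise_pred
```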
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def denoising_start(self): + return self._denoising_start + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + strength: float = 0.3, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + aesthetic_score: float = 6.0, + negative_aesthetic_score: float = 2.5, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + The image(s) to modify with the pipeline. + strength (`float`, *optional*, defaults to 0.3): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. 
Note that in the case of + `denoising_start` being declared as an integer, the value of `strength` will be ignored. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + denoising_start (`float`, *optional*): + When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be + bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and + it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, + strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline + is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refine Image + Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be + denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the + final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refine Image + Quality**](https://huggingface.co/docs/diffusers/using-diffusers/sdxl#refine-image-quality). + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
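As a usage sketch of the `denoising_start` / `denoising_end` hand-off described above (the "Mixture of Denoisers" setup): the checkpoint names, step counts and cut-off fraction below are illustrative assumptions, not values fixed by this patch.

```py
# Hedged sketch: hand off the final 20% of denoising from the SDXL base pipeline
# to this img2img (refiner) pipeline via denoising_end / denoising_start.
import torch
from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline

base = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-1.0",
    text_encoder_2=base.text_encoder_2,
    vae=base.vae,
    torch_dtype=torch.float16,
    variant="fp16",
).to("cuda")

prompt = "A majestic lion jumping from a big stone at night"

# The base pipeline stops at 80% of the schedule and returns latents instead of an image.
latents = base(prompt=prompt, num_inference_steps=40, denoising_end=0.8, output_type="latent").images
# The refiner picks up at the same fraction; `strength` is ignored once denoising_start is set.
image = refiner(prompt=prompt, image=latents, num_inference_steps=40, denoising_start=0.8).images[0]
```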
+ latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
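The expected layout of `ip_adapter_image_embeds` described above can be made concrete with a small sketch; the batch size, image count and embedding width are assumptions. With classifier-free guidance enabled, the negative half is concatenated first along the batch dimension, which is what `prepare_ip_adapter_image_embeds` later splits back apart with `chunk(2)`.

```py
import torch

batch = 1          # illustrative
num_images = 1     # images encoded per IP-Adapter
emb_dim = 1280     # illustrative embedding width; depends on the image encoder used

positive = torch.randn(batch, num_images, emb_dim)
negative = torch.zeros_like(positive)

# With classifier-free guidance, the negative embedding comes first so the pipeline
# can recover both halves with `.chunk(2)` along dim 0. One list entry per IP-Adapter.
ip_adapter_image_embeds = [torch.cat([negative, positive], dim=0)]
```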
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be as same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. 
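The micro-conditioning inputs documented above are packed into a short vector of "added time ids" before being embedded alongside the pooled text embedding. Below is a simplified sketch of what `_get_add_time_ids` assembles; the sizes and scores are illustrative.

```py
import torch

original_size = (1024, 1024)
crops_coords_top_left = (0, 0)
target_size = (1024, 1024)
aesthetic_score = 6.0
requires_aesthetics_score = True  # True for the refiner configuration, False for the base model

if requires_aesthetics_score:
    # (orig_h, orig_w, crop_top, crop_left, aesthetic_score)
    add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,))
else:
    # (orig_h, orig_w, crop_top, crop_left, target_h, target_w)
    add_time_ids = list(original_size + crops_coords_top_left + target_size)

add_time_ids = torch.tensor([add_time_ids], dtype=torch.float16)
# Each entry is later embedded with the UNet's addition_time_embed_dim and concatenated
# with the pooled text embedding, which is where the expected/passed dimension check comes from.
print(add_time_ids.shape)  # torch.Size([1, 5]) or torch.Size([1, 6])
```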
+ callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple. When returning a tuple, the first element is a list with the generated images. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + strength, + num_inference_steps, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._denoising_start = denoising_start + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + + # 5. Prepare timesteps + def denoising_value_valid(dnv): + return isinstance(dnv, float) and 0 < dnv < 1 + + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, + strength, + device, + denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None, + ) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + add_noise = True if self.denoising_start is None else False + # 6. 
Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + add_noise, + ) + # 7. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 8. Prepare added time ids & embeddings + if negative_original_size is None: + negative_original_size = original_size + if negative_target_size is None: + negative_target_size = target_size + + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) + add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 9. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + # 9.1 Apply denoising_end + if ( + self.denoising_end is not None + and self.denoising_start is not None + and denoising_value_valid(self.denoising_end) + and denoising_value_valid(self.denoising_start) + and self.denoising_start >= self.denoising_end + ): + raise ValueError( + f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: " + + f" {self.denoising_end} when using type float." 
+ ) + elif self.denoising_end is not None and denoising_value_valid(self.denoising_end): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 9.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py new file mode 100644 index 000000000..c25628c22 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -0,0 +1,1812 @@ +# 
Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + is_invisible_watermark_available, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from .watermark import StableDiffusionXLWatermarker + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLInpaintPipeline + >>> from diffusers.utils import load_image + + >>> pipe = StableDiffusionXLInpaintPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", + ... torch_dtype=torch.float16, + ... variant="fp16", + ... use_safetensors=True, + ... ) + >>> pipe.to("cuda") + + >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" + >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + + >>> init_image = load_image(img_url).convert("RGB") + >>> mask_image = load_image(mask_url).convert("RGB") + + >>> prompt = "A majestic tiger sitting on a bench" + >>> image = pipe( + ... prompt=prompt, image=init_image, mask_image=mask_image, num_inference_steps=50, strength=0.80 + ... ).images[0] + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +def mask_pil_to_torch(mask, height, width): + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask = torch.from_numpy(mask) + return mask + + +def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool = False): + """ + Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be + converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the + ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. + """ + + # checkpoint. TOD(Yiyi) - need to clean this up later + deprecation_message = "The prepare_mask_and_masked_image method is deprecated and will be removed in a future version. 
Please use VaeImageProcessor.preprocess instead" + deprecate( + "prepare_mask_and_masked_image", + "0.30.0", + deprecation_message, + ) + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + mask = mask_pil_to_torch(mask, height, width) + + if image.ndim == 3: + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + # assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + # if image.min() < -1 or image.max() > 1: + # raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + mask = mask_pil_to_torch(mask, height, width) + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + if image.shape[1] == 4: + # images are in latent space and thus can't + # be masked set masked_image to None + # we assume that the checkpoint is not an inpainting + # checkpoint. TOD(Yiyi) - need to clean this up later + masked_image = None + else: + masked_image = image * (mask < 0.5) + + # n.b. 
ensure backwards compatibility as old function does not return image + if return_image: + return mask, masked_image, image + + return mask, masked_image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionXLInpaintPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + FromSingleFileMixin, + IPAdapterMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`): + Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config + of `stabilityai/stable-diffusion-xl-refiner-1-0`. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to + watermark output images. If not defined, it will default to True if the package is installed, otherwise no + watermarker will be used. 
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + "add_text_embeds", + "add_time_ids", + "negative_pooled_prompt_embeds", + "add_neg_time_ids", + "mask", + "masked_image_latents", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + requires_aesthetics_score: bool = False, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.mask_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_normalize=False, do_binarize=True, do_convert_grayscale=True + ) + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] 
+ + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). 
+ negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not 
torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
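+ # (As a reference point: with `DDIMScheduler`, `eta=0.0` corresponds to deterministic DDIM sampling, + # while `eta=1.0` recovers DDPM-like stochastic sampling.)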
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + image, + mask_image, + height, + width, + strength, + callback_steps, + output_type, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + padding_mask_crop=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + if padding_mask_crop is not None: + if not isinstance(image, PIL.Image.Image): + raise ValueError( + f"The image should be a PIL image when inpainting mask crop, but is of type" f" {type(image)}." + ) + if not isinstance(mask_image, PIL.Image.Image): + raise ValueError( + f"The mask image should be a PIL image when inpainting mask crop, but is of type" + f" {type(mask_image)}." + ) + if output_type != "pil": + raise ValueError(f"The output type should be PIL when inpainting mask crop, but is" f" {output_type}.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + add_noise=True, + return_noise=False, + return_image_latents=False, + ): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if image.shape[1] == 4: + image_latents = image.to(device=device, dtype=dtype) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + elif return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=image, generator=generator) + image_latents = image_latents.repeat(batch_size // image_latents.shape[0], 1, 1, 1) + + if latents is None and add_noise: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + elif add_noise: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + else: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + latents = image_latents.to(device) + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + dtype = image.dtype + if self.vae.config.force_upcast: + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + if self.vae.config.force_upcast: + self.vae.to(dtype) + + image_latents = image_latents.to(dtype) + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + def prepare_mask_latents( + self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + + if masked_image is not None and masked_image.shape[1] == 4: + masked_image_latents = masked_image + else: + masked_image_latents = None + + if masked_image is not None: + if masked_image_latents is None: + masked_image = masked_image.to(device=device, dtype=dtype) + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.repeat( + batch_size // masked_image_latents.shape[0], 1, 1, 1 + ) + + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concatenating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + + return mask, masked_image_latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device, denoising_start=None): + # get the original timestep using init_timestep + if denoising_start is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + else: + t_start = 0 + + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + # Strength is irrelevant if we directly request a timestep to start at; + # that is, strength is determined by the denoising_start instead. + if denoising_start is not None: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_start * self.scheduler.config.num_train_timesteps) + ) + ) + + num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item() + if self.scheduler.order == 2 and num_inference_steps % 2 == 0: + # if the scheduler is a 2nd order scheduler we might have to do +1 + # because `num_inference_steps` might be even given that every timestep + # (except the highest one) is duplicated. If `num_inference_steps` is even it would + # mean that we cut the timesteps in the middle of the denoising step + # (between 1st and 2nd derivative) which leads to incorrect results.
By adding 1 + # we ensure that the denoising process always ends after the 2nd derivative step of the scheduler + num_inference_steps = num_inference_steps + 1 + + # because t_n+1 >= t_n, we slice the timesteps starting from the end + timesteps = timesteps[-num_inference_steps:] + return timesteps, num_inference_steps + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline._get_add_time_ids + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if ( + expected_add_embed_dim > passed_add_embed_dim + and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} are correctly used by the model." + ) + elif ( + expected_add_embed_dim < passed_add_embed_dim + and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." + ) + elif expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
+ ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def guidance_rescale(self): + return self._guidance_rescale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
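+ # In the denoising loop below, the two predictions are combined as + # `noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)`, + # so guidance is effectively disabled when `guidance_scale <= 1` (see the property below).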
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def denoising_end(self): + return self._denoising_end + + @property + def denoising_start(self): + return self._denoising_start + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + mask_image: PipelineImageInput = None, + masked_image_latents: torch.FloatTensor = None, + height: Optional[int] = None, + width: Optional[int] = None, + padding_mask_crop: Optional[int] = None, + strength: float = 0.9999, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_start: Optional[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + aesthetic_score: float = 6.0, + negative_aesthetic_score: float = 2.5, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. 
If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. This is set to 1024 by default for the best results. + Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + padding_mask_crop (`int`, *optional*, defaults to `None`): + The size of the margin in the crop to be applied to the image and mask. If `None`, no crop is applied to the image and mask_image. If + `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ratio as the image that + contains all masked areas, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on + the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large + and contains information irrelevant for inpainting, such as the background. + strength (`float`, *optional*, defaults to 0.9999): + Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be + between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the + `strength`. The number of denoising steps depends on the amount of noise initially added. When + `strength` is 1, added noise will be maximum and the denoising process will run for the full number of + iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked + portion of the reference `image`. Note that in the case of `denoising_start` being declared as an + integer, the value of `strength` will be ignored. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + denoising_start (`float`, *optional*): + When specified, indicates the fraction (between 0.0 and 1.0) of the total denoising process to be + bypassed before it is initiated. Consequently, the initial part of the denoising process is skipped and + it is assumed that the passed `image` is a partly denoised image. Note that when this is specified, + strength will be ignored. The `denoising_start` parameter is particularly beneficial when this pipeline + is integrated into a "Mixture of Denoisers" multi-pipeline setup, as detailed in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output).
+ denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise (ca. final 20% of timesteps still needed) and should be + denoised by a successor pipeline that has `denoising_start` set to 0.8 so that it only denoises the + final 20% of the scheduler. The denoising_end parameter should ideally be utilized when this pipeline + forms a part of a "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output). + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. 
+ eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a specific image resolution. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + To negatively condition the generation process based on specific crop coordinates. Part of SDXL's + micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + To negatively condition the generation process based on a target image resolution. It should be the same + as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more + information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208. + aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1.
Check inputs + self.check_inputs( + prompt, + prompt_2, + image, + mask_image, + height, + width, + strength, + callback_steps, + output_type, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + callback_on_step_end_tensor_inputs, + padding_mask_crop, + ) + + self._guidance_scale = guidance_scale + self._guidance_rescale = guidance_rescale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + self._denoising_start = denoising_start + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + # 4. set timesteps + def denoising_value_valid(dnv): + return isinstance(dnv, float) and 0 < dnv < 1 + + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, + strength, + device, + denoising_start=self.denoising_start if denoising_value_valid(self.denoising_start) else None, + ) + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 + + # 5. 
Preprocess mask and image + if padding_mask_crop is not None: + crops_coords = self.mask_processor.get_crop_region(mask_image, width, height, pad=padding_mask_crop) + resize_mode = "fill" + else: + crops_coords = None + resize_mode = "default" + + original_image = image + init_image = self.image_processor.preprocess( + image, height=height, width=width, crops_coords=crops_coords, resize_mode=resize_mode + ) + init_image = init_image.to(dtype=torch.float32) + + mask = self.mask_processor.preprocess( + mask_image, height=height, width=width, resize_mode=resize_mode, crops_coords=crops_coords + ) + + if masked_image_latents is not None: + masked_image = masked_image_latents + elif init_image.shape[1] == 4: + # if images are in latent space, we can't mask it + masked_image = None + else: + masked_image = init_image * (mask < 0.5) + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + + add_noise = True if self.denoising_start is None else False + latents_outputs = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + add_noise=add_noise, + return_noise=True, + return_image_latents=return_image_latents, + ) + + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + + # 7. Prepare mask latent variables + mask, masked_image_latents = self.prepare_mask_latents( + mask, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + prompt_embeds.dtype, + device, + generator, + self.do_classifier_free_guidance, + ) + + # 8. Check that sizes of mask, masked image and latents match + if num_channels_unet == 9: + # default case for runwayml/stable-diffusion-inpainting + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + elif num_channels_unet != 4: + raise ValueError( + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." + ) + # 8.1 Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 10. 
Prepare added time ids & embeddings + if negative_original_size is None: + negative_original_size = original_size + if negative_target_size is None: + negative_target_size = target_size + + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_neg_time_ids = add_neg_time_ids.repeat(batch_size * num_images_per_prompt, 1) + add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 11. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + if ( + self.denoising_end is not None + and self.denoising_start is not None + and denoising_value_valid(self.denoising_end) + and denoising_value_valid(self.denoising_start) + and self.denoising_start >= self.denoising_end + ): + raise ValueError( + f"`denoising_start`: {self.denoising_start} cannot be larger than or equal to `denoising_end`: " + + f" {self.denoising_end} when using type float." 
+ ) + elif self.denoising_end is not None and denoising_value_valid(self.denoising_end): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + # 11.1 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if self.interrupt: + continue + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + if num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and self.guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if num_channels_unet == 4: + init_latents_proper = image_latents + if self.do_classifier_free_guidance: + init_mask, _ = mask.chunk(2) + else: + init_mask = mask + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids) + mask = callback_outputs.pop("mask", mask) + masked_image_latents = callback_outputs.pop("masked_image_latents", masked_image_latents) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + return StableDiffusionXLPipelineOutput(images=latents) + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + if padding_mask_crop is not None: + image = [self.image_processor.apply_overlay(mask_image, 
original_image, i, crops_coords) for i in image] + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py new file mode 100644 index 000000000..51e413d4b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py @@ -0,0 +1,976 @@ +# Copyright 2024 Harutatsu Akiyama and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import PIL.Image +import torch +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + FusedAttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + is_invisible_watermark_available, + is_torch_xla_available, + logging, + replace_example_docstring, + scale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from .pipeline_output import StableDiffusionXLPipelineOutput + + +if is_invisible_watermark_available(): + from .watermark import StableDiffusionXLWatermarker + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + XLA_AVAILABLE = True +else: + XLA_AVAILABLE = False + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLInstructPix2PixPipeline + >>> from diffusers.utils import load_image + + >>> resolution = 768 + >>> image = load_image( + ... "https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png" + ... ).resize((resolution, resolution)) + >>> edit_instruction = "Turn sky into a cloudy one" + + >>> pipe = StableDiffusionXLInstructPix2PixPipeline.from_pretrained( + ... "diffusers/sdxl-instructpix2pix-768", torch_dtype=torch.float16 + ... ).to("cuda") + + >>> edited_image = pipe( + ... prompt=edit_instruction, + ... image=image, + ... height=resolution, + ... width=resolution, + ... guidance_scale=3.0, + ... 
image_guidance_scale=1.5, + ... num_inference_steps=30, + ... ).images[0] + >>> edited_image + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class StableDiffusionXLInstructPix2PixPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + FromSingleFileMixin, + StableDiffusionXLLoraLoaderMixin, +): + r""" + Pipeline for pixel-level image editing by following text instructions. Based on Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
+ tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + requires_aesthetics_score (`bool`, *optional*, defaults to `"False"`): + Whether the `unet` requires a aesthetic_score condition to be passed during inference. Also see the config + of `stabilityai/stable-diffusion-xl-refiner-1-0`. + force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`): + Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of + `stabilityai/stable-diffusion-xl-base-1-0`. + add_watermarker (`bool`, *optional*): + Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to + watermark output images. If not defined, it will default to True if the package is installed, otherwise no + watermarker will be used. + """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.default_sample_size = self.unet.config.sample_size + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
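            Example (editorial sketch — setup borrowed from the example docstring at the top of this file;
            the shapes shown are the usual SDXL values and are illustrative only):

            ```py
            >>> import torch
            >>> from diffusers import StableDiffusionXLInstructPix2PixPipeline

            >>> pipe = StableDiffusionXLInstructPix2PixPipeline.from_pretrained(
            ...     "diffusers/sdxl-instructpix2pix-768", torch_dtype=torch.float16
            ... ).to("cuda")
            >>> (
            ...     prompt_embeds,
            ...     negative_prompt_embeds,
            ...     pooled_prompt_embeds,
            ...     negative_pooled_prompt_embeds,
            ... ) = pipe.encode_prompt(
            ...     prompt="Turn sky into a cloudy one",
            ...     num_images_per_prompt=1,
            ...     do_classifier_free_guidance=True,
            ... )
            >>> prompt_embeds.shape  # typically torch.Size([1, 77, 2048]): 77 tokens, 768 + 1280 concatenated hidden dims
            >>> pooled_prompt_embeds.shape  # typically torch.Size([1, 1280]): pooled output of text_encoder_2
            ```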
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder( + text_input_ids.to(device), + output_hidden_states=True, + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds.hidden_states[-2] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt, negative_prompt_2] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + prompt_embeds_dtype = self.text_encoder_2.dtype if self.text_encoder_2 is not None else self.unet.dtype + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
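        # (Editorial note: the kwargs assembled below are forwarded verbatim to `self.scheduler.step()`
        # inside the denoising loop; a scheduler whose `step()` accepts neither `eta` nor `generator`
        # simply receives an empty dict.)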
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.check_inputs + def check_inputs( + self, + prompt, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_image_latents( + self, image, batch_size, num_images_per_prompt, dtype, device, do_classifier_free_guidance, generator=None + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + image_latents = image + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + if needs_upcasting: + self.upcast_vae() + image = image.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image_latents = retrieve_latents(self.vae.encode(image), sample_mode="argmax") + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + if do_classifier_free_guidance: + uncond_image_latents = torch.zeros_like(image_latents) + image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0) + + if image_latents.dtype != self.vae.dtype: + image_latents = image_latents.to(dtype=self.vae.dtype) + + return image_latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + FusedAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 100, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + image_guidance_scale: float = 1.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Tuple[int, int] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + The image(s) to modify with the pipeline. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. 
As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + image_guidance_scale (`float`, *optional*, defaults to 1.5): + Image guidance scale is to push the generated image towards the inital image `image`. Image guidance + scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to + generate images that are closely linked to the source image `image`, usually at the expense of lower + image quality. This pipeline requires a value of at least `1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. 
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + aesthetic_score (`float`, *optional*, defaults to 6.0): + Used to simulate an aesthetic score of the generated image by influencing the positive text condition. + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
+ negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). Can be used to + simulate an aesthetic score of the generated image by influencing the negative text condition. + + Examples: + + Returns: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 and image_guidance_scale >= 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Preprocess image + image = self.image_processor.preprocess(image, height=height, width=width).to(device) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. Prepare Image latents + image_latents = self.prepare_image_latents( + image, + batch_size, + num_images_per_prompt, + prompt_embeds.dtype, + device, + do_classifier_free_guidance, + ) + + # 7. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 8. Check that shapes of latents and image match the UNet channels + num_channels_image = image_latents.shape[1] + if num_channels_latents + num_channels_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! 
The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_image`: {num_channels_image} " + f" = {num_channels_latents + num_channels_image}. Please verify the config of" + " `pipeline.unet` or your `image` input." + ) + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 10. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + + if do_classifier_free_guidance: + # The extra concat similar to how it's done in SD InstructPix2Pix. + prompt_embeds = torch.cat([prompt_embeds, negative_prompt_embeds, negative_prompt_embeds], dim=0) + add_text_embeds = torch.cat( + [add_text_embeds, negative_pooled_prompt_embeds, negative_pooled_prompt_embeds], dim=0 + ) + add_time_ids = torch.cat([add_time_ids, add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 11. Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Expand the latents if we are doing classifier free guidance. + # The latents are expanded 3 times because for pix2pix the guidance + # is applied for both the text and the input image. + latent_model_input = torch.cat([latents] * 3) if do_classifier_free_guidance else latents + + # concat latents, image_latents in the channel dimension + scaled_latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + scaled_latent_model_input = torch.cat([scaled_latent_model_input, image_latents], dim=1) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + noise_pred = self.unet( + scaled_latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_text, noise_pred_image, noise_pred_uncond = noise_pred.chunk(3) + noise_pred = ( + noise_pred_uncond + + guidance_scale * (noise_pred_text - noise_pred_image) + + image_guidance_scale * (noise_pred_image - noise_pred_uncond) + ) + + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if XLA_AVAILABLE: + xm.mark_step() + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None + has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean + else: + latents = latents / self.vae.config.scaling_factor + + image = self.vae.decode(latents, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + return StableDiffusionXLPipelineOutput(images=latents) + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py new file mode 100644 index 000000000..5b6e36d9f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py @@ -0,0 +1,36 @@ +import numpy as np +import torch + +from ...utils import is_invisible_watermark_available + + +if is_invisible_watermark_available(): + from imwatermark import WatermarkEncoder + + +# Copied from https://github.com/Stability-AI/generative-models/blob/613af104c6b85184091d42d374fef420eddb356d/scripts/demo/streamlit_helpers.py#L66 +WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 +# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1 +WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] + + +class StableDiffusionXLWatermarker: + def __init__(self): + self.watermark = WATERMARK_BITS + self.encoder = WatermarkEncoder() + + self.encoder.set_watermark("bits", self.watermark) + + def apply_watermark(self, images: torch.FloatTensor): + # can't encode images that are smaller than 256 + if 
images.shape[-1] < 256: + return images + + images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy() + + images = [self.encoder.encode(image, "dwtDct") for image in images] + + images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2) + + images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0) + return images diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py new file mode 100644 index 000000000..3bd4dc789 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py @@ -0,0 +1,58 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + BaseOutput, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure.update( + { + "pipeline_stable_video_diffusion": [ + "StableVideoDiffusionPipeline", + "StableVideoDiffusionPipelineOutput", + ], + } + ) + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * + else: + from .pipeline_stable_video_diffusion import ( + StableVideoDiffusionPipeline, + StableVideoDiffusionPipelineOutput, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py new file mode 100644 index 000000000..1342fe429 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -0,0 +1,673 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
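# Editorial sketch of how this pipeline conditions generation on the input image (see the helper
# methods defined below; the denoising loop follows roughly the same pattern as the other pipelines
# in this patch):
#   * `_encode_image`     -> CLIP image embeddings, fed to the UNet as `encoder_hidden_states`
#   * `_encode_vae_image` -> VAE image latents, concatenated to the noisy latents along the channel axis
#   * `_get_add_time_ids` -> micro-conditioning ids (fps, motion_bucket_id, noise_aug_strength)
# For classifier-free guidance each of these gets a zeroed (or simply duplicated) unconditional counterpart.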
+ +import inspect +from dataclasses import dataclass +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...models import AutoencoderKLTemporalDecoder, UNetSpatioTemporalConditionModel +from ...schedulers import EulerDiscreteScheduler +from ...utils import BaseOutput, logging, replace_example_docstring +from ...utils.torch_utils import is_compiled_module, randn_tensor +from ..pipeline_utils import DiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import StableVideoDiffusionPipeline + >>> from diffusers.utils import load_image, export_to_video + + >>> pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16") + >>> pipe.to("cuda") + + >>> image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd-docstring-example.jpeg") + >>> image = image.resize((1024, 576)) + + >>> frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0] + >>> export_to_video(frames, "generated.mp4", fps=7) + ``` +""" + + +def _append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") + return x[(...,) + (None,) * dims_to_append] + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: VaeImageProcessor, output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +@dataclass +class StableVideoDiffusionPipelineOutput(BaseOutput): + r""" + Output class for Stable Video Diffusion pipeline. + + Args: + frames (`[List[List[PIL.Image.Image]]`, `np.ndarray`, `torch.FloatTensor`]): + List of denoised PIL images of length `batch_size` or numpy array or torch tensor + of shape `(batch_size, num_frames, height, width, num_channels)`. + """ + + frames: Union[List[List[PIL.Image.Image]], np.ndarray, torch.FloatTensor] + + +class StableVideoDiffusionPipeline(DiffusionPipeline): + r""" + Pipeline to generate video from an input image using Stable Video Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKLTemporalDecoder`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. 
+ image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): + Frozen CLIP image-encoder ([laion/CLIP-ViT-H-14-laion2B-s32B-b79K](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K)). + unet ([`UNetSpatioTemporalConditionModel`]): + A `UNetSpatioTemporalConditionModel` to denoise the encoded image latents. + scheduler ([`EulerDiscreteScheduler`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images. + """ + + model_cpu_offload_seq = "image_encoder->unet->vae" + _callback_tensor_inputs = ["latents"] + + def __init__( + self, + vae: AutoencoderKLTemporalDecoder, + image_encoder: CLIPVisionModelWithProjection, + unet: UNetSpatioTemporalConditionModel, + scheduler: EulerDiscreteScheduler, + feature_extractor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + vae=vae, + image_encoder=image_encoder, + unet=unet, + scheduler=scheduler, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def _encode_image( + self, + image: PipelineImageInput, + device: Union[str, torch.device], + num_videos_per_prompt: int, + do_classifier_free_guidance: bool, + ) -> torch.FloatTensor: + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.image_processor.pil_to_numpy(image) + image = self.image_processor.numpy_to_pt(image) + + # We normalize the image before resizing to match with the original implementation. + # Then we unnormalize it after resizing. + image = image * 2.0 - 1.0 + image = _resize_with_antialiasing(image, (224, 224)) + image = (image + 1.0) / 2.0 + + # Normalize the image with for CLIP input + image = self.feature_extractor( + images=image, + do_normalize=True, + do_center_crop=False, + do_resize=False, + do_rescale=False, + return_tensors="pt", + ).pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeddings = self.image_encoder(image).image_embeds + image_embeddings = image_embeddings.unsqueeze(1) + + # duplicate image embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = image_embeddings.shape + image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1) + image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + negative_image_embeddings = torch.zeros_like(image_embeddings) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_embeddings = torch.cat([negative_image_embeddings, image_embeddings]) + + return image_embeddings + + def _encode_vae_image( + self, + image: torch.Tensor, + device: Union[str, torch.device], + num_videos_per_prompt: int, + do_classifier_free_guidance: bool, + ): + image = image.to(device=device) + image_latents = self.vae.encode(image).latent_dist.mode() + + if do_classifier_free_guidance: + negative_image_latents = torch.zeros_like(image_latents) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + image_latents = torch.cat([negative_image_latents, image_latents]) + + # duplicate image_latents for each generation per prompt, using mps friendly method + image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1) + + return image_latents + + def _get_add_time_ids( + self, + fps: int, + motion_bucket_id: int, + noise_aug_strength: float, + dtype: torch.dtype, + batch_size: int, + num_videos_per_prompt: int, + do_classifier_free_guidance: bool, + ): + add_time_ids = [fps, motion_bucket_id, noise_aug_strength] + + passed_add_embed_dim = self.unet.config.addition_time_embed_dim * len(add_time_ids) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_time_ids = add_time_ids.repeat(batch_size * num_videos_per_prompt, 1) + + if do_classifier_free_guidance: + add_time_ids = torch.cat([add_time_ids, add_time_ids]) + + return add_time_ids + + def decode_latents(self, latents: torch.FloatTensor, num_frames: int, decode_chunk_size: int = 14): + # [batch, frames, channels, height, width] -> [batch*frames, channels, height, width] + latents = latents.flatten(0, 1) + + latents = 1 / self.vae.config.scaling_factor * latents + + forward_vae_fn = self.vae._orig_mod.forward if is_compiled_module(self.vae) else self.vae.forward + accepts_num_frames = "num_frames" in set(inspect.signature(forward_vae_fn).parameters.keys()) + + # decode decode_chunk_size frames at a time to avoid OOM + frames = [] + for i in range(0, latents.shape[0], decode_chunk_size): + num_frames_in = latents[i : i + decode_chunk_size].shape[0] + decode_kwargs = {} + if accepts_num_frames: + # we only pass num_frames_in if it's expected + decode_kwargs["num_frames"] = num_frames_in + + frame = self.vae.decode(latents[i : i + decode_chunk_size], **decode_kwargs).sample + frames.append(frame) + frames = torch.cat(frames, dim=0) + + # [batch*frames, channels, height, width] -> [batch, channels, frames, height, width] + frames = frames.reshape(-1, num_frames, *frames.shape[1:]).permute(0, 2, 1, 3, 4) + + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + frames = frames.float() + return frames + + def check_inputs(self, image, height, width): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.FloatTensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + def prepare_latents( + self, + batch_size: int, + num_frames: int, + num_channels_latents: int, + height: int, + width: int, + dtype: torch.dtype, + device: Union[str, torch.device], + generator: torch.Generator, + latents: Optional[torch.FloatTensor] = None, + ): + shape = ( + batch_size, + num_frames, + num_channels_latents // 2, + height // self.vae_scale_factor, + width // 
self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + if isinstance(self.guidance_scale, (int, float)): + return self.guidance_scale > 1 + return self.guidance_scale.max() > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor], + height: int = 576, + width: int = 1024, + num_frames: Optional[int] = None, + num_inference_steps: int = 25, + min_guidance_scale: float = 1.0, + max_guidance_scale: float = 3.0, + fps: int = 7, + motion_bucket_id: int = 127, + noise_aug_strength: float = 0.02, + decode_chunk_size: Optional[int] = None, + num_videos_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + return_dict: bool = True, + ): + r""" + The call function to the pipeline for generation. + + Args: + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + Image(s) to guide image generation. If you provide a tensor, the expected value range is between `[0, 1]`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_frames (`int`, *optional*): + The number of video frames to generate. Defaults to `self.unet.config.num_frames` + (14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`). + num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps. More denoising steps usually lead to a higher quality video at the + expense of slower inference. This parameter is modulated by `strength`. + min_guidance_scale (`float`, *optional*, defaults to 1.0): + The minimum guidance scale. Used for the classifier free guidance with first frame. + max_guidance_scale (`float`, *optional*, defaults to 3.0): + The maximum guidance scale. Used for the classifier free guidance with last frame. + fps (`int`, *optional*, defaults to 7): + Frames per second. The rate at which the generated images shall be exported to a video after generation. + Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training. 
+ motion_bucket_id (`int`, *optional*, defaults to 127): + Used for conditioning the amount of motion for the generation. The higher the number the more motion + will be in the video. + noise_aug_strength (`float`, *optional*, defaults to 0.02): + The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion. + decode_chunk_size (`int`, *optional*): + The number of frames to decode at a time. Higher chunk size leads to better temporal consistency at the expense of more memory usage. By default, the decoder decodes all frames at once for maximal + quality. For lower memory usage, reduce `decode_chunk_size`. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `pil`, `np` or `pt`. + callback_on_step_end (`Callable`, *optional*): + A function that is called at the end of each denoising step during inference. The function is called + with the following arguments: + `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. + `callback_kwargs` will include a list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned, + otherwise a `tuple` of (`List[List[PIL.Image.Image]]` or `np.ndarray` or `torch.FloatTensor`) is returned. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_frames = num_frames if num_frames is not None else self.unet.config.num_frames + decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames + + # 1. Check inputs. Raise error if not correct + self.check_inputs(image, height, width) + + # 2. Define call parameters + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, list): + batch_size = len(image) + else: + batch_size = image.shape[0] + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` + # corresponds to doing no classifier free guidance. + self._guidance_scale = max_guidance_scale + + # 3. Encode input image + image_embeddings = self._encode_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance) + + # NOTE: Stable Video Diffusion was conditioned on fps - 1, which is why it is reduced here. + # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188 + fps = fps - 1 + + # 4. Encode input image using VAE + image = self.image_processor.preprocess(image, height=height, width=width).to(device) + noise = randn_tensor(image.shape, generator=generator, device=device, dtype=image.dtype) + image = image + noise_aug_strength * noise + + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + if needs_upcasting: + self.vae.to(dtype=torch.float32) + + image_latents = self._encode_vae_image( + image, + device=device, + num_videos_per_prompt=num_videos_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + ) + image_latents = image_latents.to(image_embeddings.dtype) + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + # Repeat the image latents for each frame so we can concatenate them with the noise + # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width] + image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1) + + # 5. Get Added Time IDs + added_time_ids = self._get_add_time_ids( + fps, + motion_bucket_id, + noise_aug_strength, + image_embeddings.dtype, + batch_size, + num_videos_per_prompt, + self.do_classifier_free_guidance, + ) + added_time_ids = added_time_ids.to(device) + + # 6. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 7. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_frames, + num_channels_latents, + height, + width, + image_embeddings.dtype, + device, + generator, + latents, + ) + + # 8. Prepare guidance scale + guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0) + guidance_scale = guidance_scale.to(device, latents.dtype) + guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1) + guidance_scale = _append_dims(guidance_scale, latents.ndim) + + self._guidance_scale = guidance_scale + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # Concatenate image_latents over channels dimension + latent_model_input = torch.cat([latent_model_input, image_latents], dim=2) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=image_embeddings, + added_time_ids=added_time_ids, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if not output_type == "latent": + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + frames = self.decode_latents(latents, num_frames, decode_chunk_size) + frames = tensor2vid(frames, self.image_processor, output_type=output_type) + else: + frames = latents + + self.maybe_free_model_hooks() + + if not return_dict: + return frames + + return StableVideoDiffusionPipelineOutput(frames=frames) + + +# resizing utils +# TODO: clean up later +def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True): + h, w = input.shape[-2:] + factors = (h / size[0], w / size[1]) + + # First, we have to determine sigma + # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171 + sigmas = ( + max((factors[0] - 1.0) / 2.0, 0.001), + max((factors[1] - 1.0) / 2.0, 0.001), + ) + + # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma + # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206 + # But they do it in the 2 passes, which gives better results. 
Let's try 2 sigmas for now + ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3)) + + # Make sure it is odd + if (ks[0] % 2) == 0: + ks = ks[0] + 1, ks[1] + + if (ks[1] % 2) == 0: + ks = ks[0], ks[1] + 1 + + input = _gaussian_blur2d(input, ks, sigmas) + + output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners) + return output + + +def _compute_padding(kernel_size): + """Compute padding tuple.""" + # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom) + # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad + if len(kernel_size) < 2: + raise AssertionError(kernel_size) + computed = [k - 1 for k in kernel_size] + + # for even kernels we need to do asymmetric padding :( + out_padding = 2 * len(kernel_size) * [0] + + for i in range(len(kernel_size)): + computed_tmp = computed[-(i + 1)] + + pad_front = computed_tmp // 2 + pad_rear = computed_tmp - pad_front + + out_padding[2 * i + 0] = pad_front + out_padding[2 * i + 1] = pad_rear + + return out_padding + + +def _filter2d(input, kernel): + # prepare kernel + b, c, h, w = input.shape + tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype) + + tmp_kernel = tmp_kernel.expand(-1, c, -1, -1) + + height, width = tmp_kernel.shape[-2:] + + padding_shape: list[int] = _compute_padding([height, width]) + input = torch.nn.functional.pad(input, padding_shape, mode="reflect") + + # kernel and input tensor reshape to align element-wise or batch-wise params + tmp_kernel = tmp_kernel.reshape(-1, 1, height, width) + input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1)) + + # convolve the tensor with the kernel. + output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1) + + out = output.view(b, c, h, w) + return out + + +def _gaussian(window_size: int, sigma): + if isinstance(sigma, float): + sigma = torch.tensor([[sigma]]) + + batch_size = sigma.shape[0] + + x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1) + + if window_size % 2 == 0: + x = x + 0.5 + + gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0))) + + return gauss / gauss.sum(-1, keepdim=True) + + +def _gaussian_blur2d(input, kernel_size, sigma): + if isinstance(sigma, tuple): + sigma = torch.tensor([sigma], dtype=input.dtype) + else: + sigma = sigma.to(dtype=input.dtype) + + ky, kx = int(kernel_size[0]), int(kernel_size[1]) + bs = sigma.shape[0] + kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1)) + kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1)) + out_x = _filter2d(input, kernel_x[..., None, :]) + out = _filter2d(out_x, kernel_y[..., None]) + + return out diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py new file mode 100644 index 000000000..08c22a270 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py @@ -0,0 +1,47 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise 
OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_stable_diffusion_adapter"] = ["StableDiffusionAdapterPipeline"] + _import_structure["pipeline_stable_diffusion_xl_adapter"] = ["StableDiffusionXLAdapterPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_stable_diffusion_adapter import StableDiffusionAdapterPipeline + from .pipeline_stable_diffusion_xl_adapter import StableDiffusionXLAdapterPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py new file mode 100644 index 000000000..0b55bb38b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py @@ -0,0 +1,912 @@ +# Copyright 2024 TencentARC and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + PIL_INTERPOLATION, + USE_PEFT_BACKEND, + BaseOutput, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker + + +@dataclass +class StableDiffusionAdapterPipelineOutput(BaseOutput): + """ + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
+ nsfw_content_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from PIL import Image + >>> from diffusers.utils import load_image + >>> import torch + >>> from diffusers import StableDiffusionAdapterPipeline, T2IAdapter + + >>> image = load_image( + ... "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/color_ref.png" + ... ) + + >>> color_palette = image.resize((8, 8)) + >>> color_palette = color_palette.resize((512, 512), resample=Image.Resampling.NEAREST) + + >>> adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16) + >>> pipe = StableDiffusionAdapterPipeline.from_pretrained( + ... "CompVis/stable-diffusion-v1-4", + ... adapter=adapter, + ... torch_dtype=torch.float16, + ... ) + + >>> pipe.to("cuda") + + >>> out_image = pipe( + ... "At night, glowing cubes in front of the beach", + ... image=color_palette, + ... ).images[0] + ``` +""" + + +def _preprocess_adapter_image(image, height, width): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image] + image = [ + i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image + ] # expand [h, w] or [h, w, c] to [b, h, w, c] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + if image[0].ndim == 3: + image = torch.stack(image, dim=0) + elif image[0].ndim == 4: + image = torch.cat(image, dim=0) + else: + raise ValueError( + f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}" + ) + return image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
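Editorial note: a hedged sketch of how the `retrieve_timesteps` helper documented here is typically called. It assumes this vendored `diffusers` package is importable and uses `EulerDiscreteScheduler` with its default config as a stand-in for a real pipeline's scheduler.

```py
import torch

from diffusers import EulerDiscreteScheduler
# `retrieve_timesteps` is the module-level helper defined in this file.
from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter import retrieve_timesteps

scheduler = EulerDiscreteScheduler()  # default config; no pretrained weights needed for a sketch

# Ask the scheduler to lay out 30 steps; the helper returns the resulting schedule
# together with the effective number of inference steps.
timesteps, num_inference_steps = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")

# Passing `timesteps=[...]` instead forwards a custom schedule to `scheduler.set_timesteps`,
# but only for schedulers whose `set_timesteps` accepts a `timesteps` argument;
# otherwise the helper raises a ValueError, as the implementation below shows.
```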
+ """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionAdapterPipeline(DiffusionPipeline, StableDiffusionMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter + https://arxiv.org/abs/2302.08453 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`): + Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a + list, the outputs from each Adapter are added together to create one combined additional conditioning. + adapter_weights (`List[float]`, *optional*, defaults to None): + List of floats representing the weight which will be multiply to each adapter's output before adding them + together. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + + model_cpu_offload_seq = "text_encoder->adapter->unet->vae" + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + if isinstance(adapter, (list, tuple)): + adapter = MultiAdapter(adapter) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + adapter=adapter, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." 
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
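Editorial note: for orientation, a hedged sketch of how `encode_prompt` is typically driven together with classifier-free guidance. The model ids and prompt are taken from the example docstring earlier in this file, the negative prompt is a placeholder, and the final concatenation (unconditional embeddings first) mirrors what this pipeline's own `__call__` does internally.

```py
import torch

from diffusers import StableDiffusionAdapterPipeline, T2IAdapter

adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_color_sd14v1", torch_dtype=torch.float16)
pipe = StableDiffusionAdapterPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", adapter=adapter, torch_dtype=torch.float16
).to("cuda")

# Encode the prompt and its negative counterpart in one call.
prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
    "At night, glowing cubes in front of the beach",
    device="cuda",
    num_images_per_prompt=1,
    do_classifier_free_guidance=True,
    negative_prompt="low quality, blurry",
)

# For classifier-free guidance the unconditional and text embeddings are concatenated
# into a single batch so the UNet only needs one forward pass per denoising step.
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
```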
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + 
images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + image, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + if isinstance(self.adapter, MultiAdapter): + if not isinstance(image, list): + raise ValueError( + "MultiAdapter is enabled, but `image` is not a list. Please pass a list of images to `image`." + ) + + if len(image) != len(self.adapter.adapters): + raise ValueError( + f"MultiAdapter requires passing the same number of images as adapters. Given {len(image)} images and {len(self.adapter.adapters)} adapters." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _default_height_width(self, height, width, image): + # NOTE: It is possible that a list of images have different + # dimensions for each image, so just checking the first image + # is not _exactly_ correct, but it is simple. + while isinstance(image, list): + image = image[0] + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + elif isinstance(image, torch.Tensor): + height = image.shape[-2] + + # round down to nearest multiple of `self.adapter.downscale_factor` + height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + elif isinstance(image, torch.Tensor): + width = image.shape[-1] + + # round down to nearest multiple of `self.adapter.downscale_factor` + width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor + + return height, width + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the 
guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[torch.Tensor, PIL.Image.Image, List[PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + adapter_conditioning_scale: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the + type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + accepted as an image. The control image is automatically resized to fit the output image. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. 
+ Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the + residual in the original unet. If multiple adapters are specified in init, you can set the + corresponding scale as a list. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a + `tuple. 
When returning a tuple, the first element is a list with the generated images, and the second + element is a list of `bool`s denoting whether the corresponding generated image likely represents + "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height, width = self._default_height_width(height, width, image) + device = self._execution_device + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, image, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + self._guidance_scale = guidance_scale + + if isinstance(self.adapter, MultiAdapter): + adapter_input = [] + + for one_image in image: + one_image = _preprocess_adapter_image(one_image, height, width) + one_image = one_image.to(device=device, dtype=self.adapter.dtype) + adapter_input.append(one_image) + else: + adapter_input = _preprocess_adapter_image(image, height, width) + adapter_input = adapter_input.to(device=device, dtype=self.adapter.dtype) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. 
Denoising loop + if isinstance(self.adapter, MultiAdapter): + adapter_state = self.adapter(adapter_input, adapter_conditioning_scale) + for k, v in enumerate(adapter_state): + adapter_state[k] = v + else: + adapter_state = self.adapter(adapter_input) + for k, v in enumerate(adapter_state): + adapter_state[k] = v * adapter_conditioning_scale + if num_images_per_prompt > 1: + for k, v in enumerate(adapter_state): + adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1) + if self.do_classifier_free_guidance: + for k, v in enumerate(adapter_state): + adapter_state[k] = torch.cat([v] * 2, dim=0) + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=cross_attention_kwargs, + down_intrablock_additional_residuals=[state.clone() for state in adapter_state], + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionAdapterPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py new file mode 100644 index 000000000..4e0cc61f5 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py @@ -0,0 +1,1258 @@ +# Copyright 2024 TencentARC and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import PipelineImageInput, VaeImageProcessor +from ...loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +) +from ...models import AutoencoderKL, ImageProjection, MultiAdapter, T2IAdapter, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + PIL_INTERPOLATION, + USE_PEFT_BACKEND, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import T2IAdapter, StableDiffusionXLAdapterPipeline, DDPMScheduler + >>> from diffusers.utils import load_image + + >>> sketch_image = load_image("https://huggingface.co/Adapter/t2iadapter/resolve/main/sketch.png").convert("L") + + >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0" + + >>> adapter = T2IAdapter.from_pretrained( + ... "Adapter/t2iadapter", + ... subfolder="sketch_sdxl_1.0", + ... torch_dtype=torch.float16, + ... adapter_type="full_adapter_xl", + ... ) + >>> scheduler = DDPMScheduler.from_pretrained(model_id, subfolder="scheduler") + + >>> pipe = StableDiffusionXLAdapterPipeline.from_pretrained( + ... model_id, adapter=adapter, torch_dtype=torch.float16, variant="fp16", scheduler=scheduler + ... ).to("cuda") + + >>> generator = torch.manual_seed(42) + >>> sketch_image_out = pipe( + ... prompt="a photo of a dog in real world, high quality", + ... negative_prompt="extra digit, fewer digits, cropped, worst quality, low quality", + ... image=sketch_image, + ... generator=generator, + ... guidance_scale=7.5, + ... ).images[0] + ``` +""" + + +def _preprocess_adapter_image(image, height, width): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image] + image = [ + i[None, ..., None] if i.ndim == 2 else i[None, ...] 
for i in image + ] # expand [h, w] or [h, w, c] to [b, h, w, c] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + if image[0].ndim == 3: + image = torch.stack(image, dim=0) + elif image[0].ndim == 4: + image = torch.cat(image, dim=0) + else: + raise ValueError( + f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}" + ) + return image + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionXLAdapterPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + StableDiffusionXLLoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion augmented with T2I-Adapter + https://arxiv.org/abs/2302.08453 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + adapter ([`T2IAdapter`] or [`MultiAdapter`] or `List[T2IAdapter]`): + Provides additional conditioning to the unet during the denoising process. If you set multiple Adapter as a + list, the outputs from each Adapter are added together to create one combined additional conditioning. + adapter_weights (`List[float]`, *optional*, defaults to None): + List of floats representing the weight which will be multiply to each adapter's output before adding them + together. + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. 
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "feature_extractor", + "image_encoder", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + adapter: Union[T2IAdapter, MultiAdapter, List[T2IAdapter]], + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + feature_extractor: CLIPImageProcessor = None, + image_encoder: CLIPVisionModelWithProjection = None, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + adapter=adapter, + scheduler=scheduler, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.default_sample_size = self.unet.config.sample_size + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because 
SDXL always indexes from the penultimate layer. + prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + 
negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.StableDiffusionUpscalePipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.t2i_adapter.pipeline_stable_diffusion_adapter.StableDiffusionAdapterPipeline._default_height_width + def _default_height_width(self, height, width, image): + # NOTE: It is possible that a list of images have different + # dimensions for each image, so just checking the first image + # is not _exactly_ correct, but it is simple. 
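+        # Illustrative example (assuming `self.adapter.downscale_factor` were 8): a first image of
+        # height 513 and width 770 would be floored to height=512, width=768 by the rounding below,
+        # keeping both output dimensions divisible by the adapter's downscale factor.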
+ while isinstance(image, list): + image = image[0] + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + elif isinstance(image, torch.Tensor): + height = image.shape[-2] + + # round down to nearest multiple of `self.adapter.downscale_factor` + height = (height // self.adapter.downscale_factor) * self.adapter.downscale_factor + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + elif isinstance(image, torch.Tensor): + width = image.shape[-1] + + # round down to nearest multiple of `self.adapter.downscale_factor` + width = (width // self.adapter.downscale_factor) * self.adapter.downscale_factor + + return height, width + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
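+    # For reference: when classifier-free guidance is active, the denoising loop below combines the two
+    # UNet predictions as
+    #     noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+    # so `guidance_scale = 1` reduces to the text-conditioned prediction alone.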
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + adapter_conditioning_scale: Union[float, List[float]] = 1.0, + adapter_conditioning_factor: float = 1.0, + clip_skip: Optional[int] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[List[PIL.Image.Image]]`): + The Adapter input condition. Adapter uses this input condition to generate guidance to Unet. If the + type is specified as `Torch.FloatTensor`, it is passed to Adapter as is. PIL.Image.Image` can also be + accepted as an image. The control image is automatically resized to fit the output image. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. 
Anything below 512 pixels won't work well for + [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) + and checkpoints that are not specifically fine-tuned on low resolutions. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. + If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionAdapterPipelineOutput`] + instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). 
Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                To negatively condition the generation process based on a target image resolution. It should be the
+                same as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section
+                2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            adapter_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
+                residual in the original unet. If multiple adapters are specified in init, you can set the
+                corresponding scale as a list.
+            adapter_conditioning_factor (`float`, *optional*, defaults to 1.0):
+                The fraction of timesteps for which adapter should be applied. If `adapter_conditioning_factor` is
+                `0.0`, adapter is not applied at all. If `adapter_conditioning_factor` is `1.0`, adapter is applied for
+                all timesteps. If `adapter_conditioning_factor` is `0.5`, adapter is applied for half of the timesteps.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionAdapterPipelineOutput`] if `return_dict` is True, otherwise a
+            `tuple`. When returning a tuple, the first element is a list with the generated images.
+        """
+        # 0.
Default height and width to unet + + height, width = self._default_height_width(height, width, image) + device = self._execution_device + + if isinstance(self.adapter, MultiAdapter): + adapter_input = [] + + for one_image in image: + one_image = _preprocess_adapter_image(one_image, height, width) + one_image = one_image.to(device=device, dtype=self.adapter.dtype) + adapter_input.append(one_image) + else: + adapter_input = _preprocess_adapter_image(image, height, width) + adapter_input = adapter_input.to(device=device, dtype=self.adapter.dtype) + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + ) + + self._guidance_scale = guidance_scale + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3.1 Encode input prompt + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + clip_skip=clip_skip, + ) + + # 3.2 Encode ip_adapter_image + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6.1 Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 6.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. 
Prepare added time ids & embeddings & adapter features + if isinstance(self.adapter, MultiAdapter): + adapter_state = self.adapter(adapter_input, adapter_conditioning_scale) + for k, v in enumerate(adapter_state): + adapter_state[k] = v + else: + adapter_state = self.adapter(adapter_input) + for k, v in enumerate(adapter_state): + adapter_state[k] = v * adapter_conditioning_scale + if num_images_per_prompt > 1: + for k, v in enumerate(adapter_state): + adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1) + if self.do_classifier_free_guidance: + for k, v in enumerate(adapter_state): + adapter_state[k] = torch.cat([v] * 2, dim=0) + + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 8. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + # Apply denoising_end + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + added_cond_kwargs["image_embeds"] = image_embeds + + # predict the noise residual + if i < int(num_inference_steps * adapter_conditioning_factor): + down_intrablock_additional_residuals = [state.clone() for state in adapter_state] + else: + down_intrablock_additional_residuals = None + + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=cross_attention_kwargs, + down_intrablock_additional_residuals=down_intrablock_additional_residuals, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if self.do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + return StableDiffusionXLPipelineOutput(images=image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py new file mode 100644 index 000000000..8d8fdb927 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py @@ -0,0 +1,54 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["pipeline_output"] = ["TextToVideoSDPipelineOutput"] + _import_structure["pipeline_text_to_video_synth"] = ["TextToVideoSDPipeline"] + _import_structure["pipeline_text_to_video_synth_img2img"] = ["VideoToVideoSDPipeline"] + _import_structure["pipeline_text_to_video_zero"] = ["TextToVideoZeroPipeline"] + _import_structure["pipeline_text_to_video_zero_sdxl"] = ["TextToVideoZeroSDXLPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_output import TextToVideoSDPipelineOutput + from .pipeline_text_to_video_synth import TextToVideoSDPipeline + from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline + from .pipeline_text_to_video_zero import TextToVideoZeroPipeline + from .pipeline_text_to_video_zero_sdxl import TextToVideoZeroSDXLPipeline + +else: + import sys + + sys.modules[__name__] = 
_LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py new file mode 100644 index 000000000..c155386cf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL +import torch + +from ...utils import ( + BaseOutput, +) + + +@dataclass +class TextToVideoSDPipelineOutput(BaseOutput): + """ + Output class for text-to-video pipelines. + + Args: + frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]): + List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing denoised + PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape + `(batch_size, num_frames, channels, height, width)` + """ + + frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py new file mode 100644 index 000000000..6c33836e6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -0,0 +1,663 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet3DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . 
import TextToVideoSDPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import TextToVideoSDPipeline + >>> from diffusers.utils import export_to_video + + >>> pipe = TextToVideoSDPipeline.from_pretrained( + ... "damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16" + ... ) + >>> pipe.enable_model_cpu_offload() + + >>> prompt = "Spiderman is surfing" + >>> video_frames = pipe(prompt).frames[0] + >>> video_path = export_to_video(video_frames) + >>> video_path + ``` +""" + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +class TextToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet3DConditionModel`]): + A [`UNet3DConditionModel`] to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
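+ # (For example, with `clip_skip=1` the indexing above selects `hidden_states[-2]`, the output of
+ # the pre-final encoder layer; the `final_layer_norm` call below then normalizes that
+ # intermediate representation before it is used as the prompt embedding.)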
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from 
diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + num_images_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
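+ # Concretely, the guidance branch in the denoising loop below combines the two predictions as
+ # noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond),
+ # so `guidance_scale = 1` reduces to the plain text-conditioned prediction.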
+ do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # reshape latents back + latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 8. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type) + + # 9. 
Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return TextToVideoSDPipelineOutput(frames=video) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py new file mode 100644 index 000000000..3901946af --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -0,0 +1,760 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet3DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from . import TextToVideoSDPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler + >>> from diffusers.utils import export_to_video + + >>> pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe.to("cuda") + + >>> prompt = "spiderman running in the desert" + >>> video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0] + >>> # safe low-res video + >>> video_path = export_to_video(video_frames, output_video_path="./video_576_spiderman.mp4") + + >>> # let's offload the text-to-image model + >>> pipe.to("cpu") + + >>> # and load the image-to-image model + >>> pipe = DiffusionPipeline.from_pretrained( + ... "cerspense/zeroscope_v2_XL", torch_dtype=torch.float16, revision="refs/pr/15" + ... 
) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe.enable_model_cpu_offload() + + >>> # The VAE consumes A LOT of memory, let's make sure we run it in sliced mode + >>> pipe.vae.enable_slicing() + + >>> # now let's upscale it + >>> video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames] + + >>> # and denoise it + >>> video_frames = pipe(prompt, video=video, strength=0.6).frames[0] + >>> video_path = export_to_video(video_frames, output_video_path="./video_1024_spiderman.mp4") + >>> video_path + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + + +# Copied from diffusers.pipelines.animatediff.pipeline_animatediff.tensor2vid +def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type: str = "np"): + batch_size, channels, num_frames, height, width = video.shape + outputs = [] + for batch_idx in range(batch_size): + batch_vid = video[batch_idx].permute(1, 0, 2, 3) + batch_output = processor.postprocess(batch_vid, output_type) + + outputs.append(batch_output) + + if output_type == "np": + outputs = np.stack(outputs) + + elif output_type == "pt": + outputs = torch.stack(outputs) + + elif not output_type == "pil": + raise ValueError(f"{output_type} does not exist. Please choose one of ['np', 'pt', 'pil']") + + return outputs + + +def preprocess_video(video): + supported_formats = (np.ndarray, torch.Tensor, PIL.Image.Image) + + if isinstance(video, supported_formats): + video = [video] + elif not (isinstance(video, list) and all(isinstance(i, supported_formats) for i in video)): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in video]}. Currently, we only support {', '.join(supported_formats)}" + ) + + if isinstance(video[0], PIL.Image.Image): + video = [np.array(frame) for frame in video] + + if isinstance(video[0], np.ndarray): + video = np.concatenate(video, axis=0) if video[0].ndim == 5 else np.stack(video, axis=0) + + if video.dtype == np.uint8: + video = np.array(video).astype(np.float32) / 255.0 + + if video.ndim == 4: + video = video[None, ...] + + video = torch.from_numpy(video.transpose(0, 4, 1, 2, 3)) + + elif isinstance(video[0], torch.Tensor): + video = torch.cat(video, axis=0) if video[0].ndim == 5 else torch.stack(video, axis=0) + + # don't need any preprocess if the video is latents + channel = video.shape[1] + if channel == 4: + return video + + # move channels before num_frames + video = video.permute(0, 2, 1, 3, 4) + + # normalize video + video = 2.0 * video - 1.0 + + return video + + +class VideoToVideoSDPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): + r""" + Pipeline for text-guided video-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet3DConditionModel`]): + A [`UNet3DConditionModel`] to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + model_cpu_offload_seq = "text_encoder->unet->vae" + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. 
Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. + prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = image[None, :].reshape((batch_size, num_frames, -1) + image.shape[2:]).permute(0, 2, 1, 3, 4) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index(t_start * self.scheduler.order) + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, video, timestep, batch_size, dtype, device, generator=None): + video = video.to(device=device, dtype=dtype) + + # change from (b, c, f, h, w) -> (b * f, c, w, h) + bsz, channel, frames, width, height = video.shape + video = video.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + if video.shape[1] == 4: + init_latents = video + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(video[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(video), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `video` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + latents = latents[None, :].reshape((bsz, frames, latents.shape[1]) + latents.shape[2:]).permute(0, 2, 1, 3, 4) + + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + video: Union[List[np.ndarray], torch.FloatTensor] = None, + strength: float = 0.6, + num_inference_steps: int = 50, + guidance_scale: float = 15.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + video (`List[np.ndarray]` or `torch.FloatTensor`): + `video` frames or tensor representing a video batch to be used as the starting point for the process. 
+ Can also accept video latents as `image`, if passing latents directly, it will not be encoded again. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `video`. Must be between 0 and 1. `video` is used as a + starting point, adding more noise to it the larger the `strength`. The number of denoising steps + depends on the amount of noise initially added. When `strength` is 1, added noise is maximum and the + denoising process runs for the full number of iterations specified in `num_inference_steps`. A value of + 1 essentially ignores `video`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in video generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. 
+ cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + # 0. Default height and width to unet + num_images_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Preprocess video + video = preprocess_video(video) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + latents = self.prepare_latents(video, latent_timestep, batch_size, prompt_embeds.dtype, device, generator) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # reshape latents back + latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + + # 9. Post processing + if output_type == "latent": + video = latents + else: + video_tensor = self.decode_latents(latents) + video = tensor2vid(video_tensor, self.image_processor, output_type) + + # 10. 
Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return TextToVideoSDPipelineOutput(frames=video) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py new file mode 100644 index 000000000..d3ff3728c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -0,0 +1,969 @@ +import copy +import inspect +from dataclasses import dataclass +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from torch.nn.functional import grid_sample +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import USE_PEFT_BACKEND, BaseOutput, logging, scale_lora_layers, unscale_lora_layers +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from ..stable_diffusion import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def rearrange_0(tensor, f): + F, C, H, W = tensor.size() + tensor = torch.permute(torch.reshape(tensor, (F // f, f, C, H, W)), (0, 2, 1, 3, 4)) + return tensor + + +def rearrange_1(tensor): + B, C, F, H, W = tensor.size() + return torch.reshape(torch.permute(tensor, (0, 2, 1, 3, 4)), (B * F, C, H, W)) + + +def rearrange_3(tensor, f): + F, D, C = tensor.size() + return torch.reshape(tensor, (F // f, f, D, C)) + + +def rearrange_4(tensor): + B, F, D, C = tensor.size() + return torch.reshape(tensor, (B * F, D, C)) + + +class CrossFrameAttnProcessor: + """ + Cross frame attention processor. Each frame attends the first frame. + + Args: + batch_size: The number that represents actual batch size, other than the frames. + For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to + 2, due to classifier-free guidance. 
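+
+ As a concrete shape sketch (illustrative numbers, not part of the pipeline contract): with
+ classifier-free guidance (batch_size=2) and 8 frames, `key`/`value` have 16 rows; `rearrange_3`
+ reshapes them to (2, 8, D, C), indexing with `[0] * 8` substitutes frame 0's keys/values for
+ every frame, and `rearrange_4` flattens back to (16, D, C) before attention scores are computed.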
+ """ + + def __init__(self, batch_size=2): + self.batch_size = batch_size + + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + query = attn.to_q(hidden_states) + + is_cross_attention = encoder_hidden_states is not None + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + # Cross Frame Attention + if not is_cross_attention: + video_length = key.size()[0] // self.batch_size + first_frame_index = [0] * video_length + + # rearrange keys to have batch and frames in the 1st and 2nd dims respectively + key = rearrange_3(key, video_length) + key = key[:, first_frame_index] + # rearrange values to have batch and frames in the 1st and 2nd dims respectively + value = rearrange_3(value, video_length) + value = value[:, first_frame_index] + + # rearrange back to original shape + key = rearrange_4(key) + value = rearrange_4(value) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class CrossFrameAttnProcessor2_0: + """ + Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0. + + Args: + batch_size: The number that represents actual batch size, other than the frames. + For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to + 2, due to classifier-free guidance. 
+ """ + + def __init__(self, batch_size=2): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + self.batch_size = batch_size + + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + inner_dim = hidden_states.shape[-1] + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + query = attn.to_q(hidden_states) + + is_cross_attention = encoder_hidden_states is not None + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + # Cross Frame Attention + if not is_cross_attention: + video_length = max(1, key.size()[0] // self.batch_size) + first_frame_index = [0] * video_length + + # rearrange keys to have batch and frames in the 1st and 2nd dims respectively + key = rearrange_3(key, video_length) + key = key[:, first_frame_index] + # rearrange values to have batch and frames in the 1st and 2nd dims respectively + value = rearrange_3(value, video_length) + value = value[:, first_frame_index] + + # rearrange back to original shape + key = rearrange_4(key) + value = rearrange_4(value) + + head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + return hidden_states + + +@dataclass +class TextToVideoPipelineOutput(BaseOutput): + r""" + Output class for zero-shot text-to-video pipeline. + + Args: + images (`[List[PIL.Image.Image]`, `np.ndarray`]): + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + nsfw_content_detected (`[List[bool]]`): + List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or + `None` if safety checking could not be performed. 
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +def coords_grid(batch, ht, wd, device): + # Adapted from https://github.com/princeton-vl/RAFT/blob/master/core/utils/utils.py + coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def warp_single_latent(latent, reference_flow): + """ + Warp latent of a single frame with given flow + + Args: + latent: latent code of a single frame + reference_flow: flow which to warp the latent with + + Returns: + warped: warped latent + """ + _, _, H, W = reference_flow.size() + _, _, h, w = latent.size() + coords0 = coords_grid(1, H, W, device=latent.device).to(latent.dtype) + + coords_t0 = coords0 + reference_flow + coords_t0[:, 0] /= W + coords_t0[:, 1] /= H + + coords_t0 = coords_t0 * 2.0 - 1.0 + coords_t0 = F.interpolate(coords_t0, size=(h, w), mode="bilinear") + coords_t0 = torch.permute(coords_t0, (0, 2, 3, 1)) + + warped = grid_sample(latent, coords_t0, mode="nearest", padding_mode="reflection") + return warped + + +def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, device, dtype): + """ + Create translation motion field + + Args: + motion_field_strength_x: motion strength along x-axis + motion_field_strength_y: motion strength along y-axis + frame_ids: indexes of the frames the latents of which are being processed. + This is needed when we perform chunk-by-chunk inference + device: device + dtype: dtype + + Returns: + + """ + seq_length = len(frame_ids) + reference_flow = torch.zeros((seq_length, 2, 512, 512), device=device, dtype=dtype) + for fr_idx in range(seq_length): + reference_flow[fr_idx, 0, :, :] = motion_field_strength_x * (frame_ids[fr_idx]) + reference_flow[fr_idx, 1, :, :] = motion_field_strength_y * (frame_ids[fr_idx]) + return reference_flow + + +def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_strength_y, frame_ids, latents): + """ + Creates translation motion and warps the latents accordingly + + Args: + motion_field_strength_x: motion strength along x-axis + motion_field_strength_y: motion strength along y-axis + frame_ids: indexes of the frames the latents of which are being processed. + This is needed when we perform chunk-by-chunk inference + latents: latent codes of frames + + Returns: + warped_latents: warped latents + """ + motion_field = create_motion_field( + motion_field_strength_x=motion_field_strength_x, + motion_field_strength_y=motion_field_strength_y, + frame_ids=frame_ids, + device=latents.device, + dtype=latents.dtype, + ) + warped_latents = latents.clone().detach() + for i in range(len(warped_latents)): + warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None]) + return warped_latents + + +class TextToVideoZeroPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, LoraLoaderMixin): + r""" + Pipeline for zero-shot text-to-video generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
+ text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer (`CLIPTokenizer`): + A [`~transformers.CLIPTokenizer`] to tokenize text. + unet ([`UNet2DConditionModel`]): + A [`UNet3DConditionModel`] to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`CLIPImageProcessor`]): + A [`CLIPImageProcessor`] to extract features from generated images; used as inputs to the `safety_checker`. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + processor = ( + CrossFrameAttnProcessor2_0(batch_size=2) + if hasattr(F, "scaled_dot_product_attention") + else CrossFrameAttnProcessor(batch_size=2) + ) + self.unet.set_attn_processor(processor) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + def forward_loop(self, x_t0, t0, t1, generator): + """ + Perform DDPM forward process from time t0 to t1. This is the same as adding noise with corresponding variance. + + Args: + x_t0: + Latent code at time t0. + t0: + Timestep at t0. + t1: + Timestamp at t1. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + + Returns: + x_t1: + Forward process applied to x_t0 from time t0 to t1. 
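+
+ Concretely, matching the implementation: x_t1 = sqrt(prod(alphas[t0:t1])) * x_t0 + sqrt(1 - prod(alphas[t0:t1])) * eps,
+ where eps is standard Gaussian noise drawn via `randn_tensor`.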
+ """ + eps = randn_tensor(x_t0.size(), generator=generator, dtype=x_t0.dtype, device=x_t0.device) + alpha_vec = torch.prod(self.scheduler.alphas[t0:t1]) + x_t1 = torch.sqrt(alpha_vec) * x_t0 + torch.sqrt(1 - alpha_vec) * eps + return x_t1 + + def backward_loop( + self, + latents, + timesteps, + prompt_embeds, + guidance_scale, + callback, + callback_steps, + num_warmup_steps, + extra_step_kwargs, + cross_attention_kwargs=None, + ): + """ + Perform backward process given list of time steps. + + Args: + latents: + Latents at time timesteps[0]. + timesteps: + Time steps along which to perform backward process. + prompt_embeds: + Pre-generated text embeddings. + guidance_scale: + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + extra_step_kwargs: + Extra_step_kwargs. + cross_attention_kwargs: + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + num_warmup_steps: + number of warmup steps. + + Returns: + latents: + Latents of backward process output at time timesteps[-1]. + """ + do_classifier_free_guidance = guidance_scale > 1.0 + num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order + with self.progress_bar(total=num_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + return latents.clone().detach() + + # Copied from diffusers.pipelines.stable_diffusion_k_diffusion.pipeline_stable_diffusion_k_diffusion.StableDiffusionKDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is 
not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + video_length: Optional[int] = 8, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + motion_field_strength_x: float = 12, + motion_field_strength_y: float = 12, + output_type: Optional[str] = "tensor", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: Optional[int] = 1, + t0: int = 44, + t1: int = 47, + frame_ids: Optional[List[int]] = None, + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + video_length (`int`, *optional*, defaults to 8): + The number of generated video frames. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in video generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"numpy"`): + The output format of the generated video. Choose between `"latent"` and `"numpy"`. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a + [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput`] instead of + a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + motion_field_strength_x (`float`, *optional*, defaults to 12): + Strength of motion in generated video along x-axis. See the [paper](https://arxiv.org/abs/2303.13439), + Sect. 3.3.1. + motion_field_strength_y (`float`, *optional*, defaults to 12): + Strength of motion in generated video along y-axis. See the [paper](https://arxiv.org/abs/2303.13439), + Sect. 3.3.1. + t0 (`int`, *optional*, defaults to 44): + Timestep t0. Should be in the range [0, num_inference_steps - 1]. See the + [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. + t1 (`int`, *optional*, defaults to 47): + Timestep t0. Should be in the range [t0 + 1, num_inference_steps - 1]. See the + [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. + frame_ids (`List[int]`, *optional*): + Indexes of the frames that are being generated. This is used when generating longer videos + chunk-by-chunk. + + Returns: + [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput`]: + The output contains a `ndarray` of the generated video, when `output_type` != `"latent"`, otherwise a + latent code of generated videos and a list of `bool`s indicating whether the corresponding generated + video contains "not-safe-for-work" (nsfw) content.. + """ + assert video_length > 0 + if frame_ids is None: + frame_ids = list(range(video_length)) + assert len(frame_ids) == video_length + + assert num_videos_per_prompt == 1 + + if isinstance(prompt, str): + prompt = [prompt] + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + # Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # Check inputs. Raise error if not correct + self.check_inputs(prompt, height, width, callback_steps) + + # Define call parameters + batch_size = 1 if isinstance(prompt, str) else len(prompt) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # Encode input prompt + prompt_embeds_tuple = self.encode_prompt( + prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt + ) + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + # Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + # Prepare extra step kwargs. 
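+ # Overview of the Text2Video-Zero sampling performed below (a summary of this code, see the
+ # paper linked in the docstring): denoise the first frame's latent from T down to time T_1 and
+ # then to T_0; replicate that latent for the remaining frames and warp the copies with the
+ # translation motion field; re-noise the warped copies from T_0 back to T_1 with the DDPM
+ # forward process; finally denoise all frames jointly from T_1 to 0 using cross-frame attention.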
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # Perform the first backward process up to time T_1 + x_1_t1 = self.backward_loop( + timesteps=timesteps[: -t1 - 1], + prompt_embeds=prompt_embeds, + latents=latents, + guidance_scale=guidance_scale, + callback=callback, + callback_steps=callback_steps, + extra_step_kwargs=extra_step_kwargs, + num_warmup_steps=num_warmup_steps, + ) + scheduler_copy = copy.deepcopy(self.scheduler) + + # Perform the second backward process up to time T_0 + x_1_t0 = self.backward_loop( + timesteps=timesteps[-t1 - 1 : -t0 - 1], + prompt_embeds=prompt_embeds, + latents=x_1_t1, + guidance_scale=guidance_scale, + callback=callback, + callback_steps=callback_steps, + extra_step_kwargs=extra_step_kwargs, + num_warmup_steps=0, + ) + + # Propagate first frame latents at time T_0 to remaining frames + x_2k_t0 = x_1_t0.repeat(video_length - 1, 1, 1, 1) + + # Add motion in latents at time T_0 + x_2k_t0 = create_motion_field_and_warp_latents( + motion_field_strength_x=motion_field_strength_x, + motion_field_strength_y=motion_field_strength_y, + latents=x_2k_t0, + frame_ids=frame_ids[1:], + ) + + # Perform forward process up to time T_1 + x_2k_t1 = self.forward_loop( + x_t0=x_2k_t0, + t0=timesteps[-t0 - 1].item(), + t1=timesteps[-t1 - 1].item(), + generator=generator, + ) + + # Perform backward process from time T_1 to 0 + x_1k_t1 = torch.cat([x_1_t1, x_2k_t1]) + b, l, d = prompt_embeds.size() + prompt_embeds = prompt_embeds[:, None].repeat(1, video_length, 1, 1).reshape(b * video_length, l, d) + + self.scheduler = scheduler_copy + x_1k_0 = self.backward_loop( + timesteps=timesteps[-t1 - 1 :], + prompt_embeds=prompt_embeds, + latents=x_1k_t1, + guidance_scale=guidance_scale, + callback=callback, + callback_steps=callback_steps, + extra_step_kwargs=extra_step_kwargs, + num_warmup_steps=0, + ) + latents = x_1k_0 + + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + torch.cuda.empty_cache() + + if output_type == "latent": + image = latents + has_nsfw_concept = None + else: + image = self.decode_latents(latents) + # Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return TextToVideoPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same 
signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
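+
+ Returns:
+ A `(prompt_embeds, negative_prompt_embeds)` tuple. `negative_prompt_embeds` is `None` when
+ classifier-free guidance is disabled and no negative embeddings were passed in.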
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py new file mode 100644 index 000000000..eaa276036 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py @@ -0,0 +1,1315 @@ +import copy +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from torch.nn.functional import grid_sample +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...image_processor import VaeImageProcessor +from ...loaders import StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + FusedAttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + USE_PEFT_BACKEND, + BaseOutput, + is_invisible_watermark_available, + logging, + scale_lora_layers, + unscale_lora_layers, +) +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, StableDiffusionMixin + + +if is_invisible_watermark_available(): + from ..stable_diffusion_xl.watermark import StableDiffusionXLWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_0 +def rearrange_0(tensor, f): + F, C, H, W = tensor.size() + tensor = torch.permute(torch.reshape(tensor, (F // f, f, C, H, W)), (0, 2, 1, 3, 4)) + return tensor + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_1 +def rearrange_1(tensor): + B, C, F, H, W = tensor.size() + return torch.reshape(torch.permute(tensor, (0, 2, 1, 3, 4)), (B * F, C, H, W)) + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_3 +def rearrange_3(tensor, f): + F, D, C = tensor.size() + return torch.reshape(tensor, (F // f, f, D, C)) + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.rearrange_4 +def rearrange_4(tensor): + B, F, D, C = tensor.size() + return torch.reshape(tensor, (B * F, D, C)) + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor +class CrossFrameAttnProcessor: + """ + Cross frame attention processor. Each frame attends the first frame. + + Args: + batch_size: The number that represents actual batch size, other than the frames. + For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to + 2, due to classifier-free guidance. 
+ """ + + def __init__(self, batch_size=2): + self.batch_size = batch_size + + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + query = attn.to_q(hidden_states) + + is_cross_attention = encoder_hidden_states is not None + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + # Cross Frame Attention + if not is_cross_attention: + video_length = key.size()[0] // self.batch_size + first_frame_index = [0] * video_length + + # rearrange keys to have batch and frames in the 1st and 2nd dims respectively + key = rearrange_3(key, video_length) + key = key[:, first_frame_index] + # rearrange values to have batch and frames in the 1st and 2nd dims respectively + value = rearrange_3(value, video_length) + value = value[:, first_frame_index] + + # rearrange back to original shape + key = rearrange_4(key) + value = rearrange_4(value) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor2_0 +class CrossFrameAttnProcessor2_0: + """ + Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0. + + Args: + batch_size: The number that represents actual batch size, other than the frames. + For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to + 2, due to classifier-free guidance. 
+ """ + + def __init__(self, batch_size=2): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + self.batch_size = batch_size + + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + inner_dim = hidden_states.shape[-1] + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + query = attn.to_q(hidden_states) + + is_cross_attention = encoder_hidden_states is not None + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + # Cross Frame Attention + if not is_cross_attention: + video_length = max(1, key.size()[0] // self.batch_size) + first_frame_index = [0] * video_length + + # rearrange keys to have batch and frames in the 1st and 2nd dims respectively + key = rearrange_3(key, video_length) + key = key[:, first_frame_index] + # rearrange values to have batch and frames in the 1st and 2nd dims respectively + value = rearrange_3(value, video_length) + value = value[:, first_frame_index] + + # rearrange back to original shape + key = rearrange_4(key) + value = rearrange_4(value) + + head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + return hidden_states + + +@dataclass +class TextToVideoSDXLPipelineOutput(BaseOutput): + """ + Output class for zero-shot text-to-video pipeline. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.coords_grid +def coords_grid(batch, ht, wd, device): + # Adapted from https://github.com/princeton-vl/RAFT/blob/master/core/utils/utils.py + coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.warp_single_latent +def warp_single_latent(latent, reference_flow): + """ + Warp latent of a single frame with given flow + + Args: + latent: latent code of a single frame + reference_flow: flow which to warp the latent with + + Returns: + warped: warped latent + """ + _, _, H, W = reference_flow.size() + _, _, h, w = latent.size() + coords0 = coords_grid(1, H, W, device=latent.device).to(latent.dtype) + + coords_t0 = coords0 + reference_flow + coords_t0[:, 0] /= W + coords_t0[:, 1] /= H + + coords_t0 = coords_t0 * 2.0 - 1.0 + coords_t0 = F.interpolate(coords_t0, size=(h, w), mode="bilinear") + coords_t0 = torch.permute(coords_t0, (0, 2, 3, 1)) + + warped = grid_sample(latent, coords_t0, mode="nearest", padding_mode="reflection") + return warped + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.create_motion_field +def create_motion_field(motion_field_strength_x, motion_field_strength_y, frame_ids, device, dtype): + """ + Create translation motion field + + Args: + motion_field_strength_x: motion strength along x-axis + motion_field_strength_y: motion strength along y-axis + frame_ids: indexes of the frames the latents of which are being processed. + This is needed when we perform chunk-by-chunk inference + device: device + dtype: dtype + + Returns: + + """ + seq_length = len(frame_ids) + reference_flow = torch.zeros((seq_length, 2, 512, 512), device=device, dtype=dtype) + for fr_idx in range(seq_length): + reference_flow[fr_idx, 0, :, :] = motion_field_strength_x * (frame_ids[fr_idx]) + reference_flow[fr_idx, 1, :, :] = motion_field_strength_y * (frame_ids[fr_idx]) + return reference_flow + + +# Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.create_motion_field_and_warp_latents +def create_motion_field_and_warp_latents(motion_field_strength_x, motion_field_strength_y, frame_ids, latents): + """ + Creates translation motion and warps the latents accordingly + + Args: + motion_field_strength_x: motion strength along x-axis + motion_field_strength_y: motion strength along y-axis + frame_ids: indexes of the frames the latents of which are being processed. + This is needed when we perform chunk-by-chunk inference + latents: latent codes of frames + + Returns: + warped_latents: warped latents + """ + motion_field = create_motion_field( + motion_field_strength_x=motion_field_strength_x, + motion_field_strength_y=motion_field_strength_y, + frame_ids=frame_ids, + device=latents.device, + dtype=latents.dtype, + ) + warped_latents = latents.clone().detach() + for i in range(len(warped_latents)): + warped_latents[i] = warp_single_latent(latents[i][None], motion_field[i][None]) + return warped_latents + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. 
Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class TextToVideoZeroSDXLPipeline( + DiffusionPipeline, + StableDiffusionMixin, + StableDiffusionXLLoraLoaderMixin, + TextualInversionLoaderMixin, +): + r""" + Pipeline for zero-shot text-to-video generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
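+
+    Example (illustrative sketch; the checkpoint id and call arguments below are assumptions, any
+    SDXL-compatible weights can be used):
+
+        ```py
+        >>> import torch
+        >>> from diffusers import TextToVideoZeroSDXLPipeline
+
+        >>> # load SDXL weights; the pipeline installs the cross-frame attention processors itself
+        >>> pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
+        ...     "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ... ).to("cuda")
+
+        >>> result = pipe(prompt="A panda dancing in Antarctica", video_length=8)
+        >>> frames = result.images  # generated video frames
+        ```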
+ """ + + model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae" + _optional_components = [ + "tokenizer", + "tokenizer_2", + "text_encoder", + "text_encoder_2", + "image_encoder", + "feature_extractor", + ] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + image_encoder: CLIPVisionModelWithProjection = None, + feature_extractor: CLIPImageProcessor = None, + force_zeros_for_empty_prompt: bool = True, + add_watermarker: Optional[bool] = None, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + image_encoder=image_encoder, + feature_extractor=feature_extractor, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.default_sample_size = self.unet.config.sample_size + + add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + + if add_watermarker: + self.watermark = StableDiffusionXLWatermarker() + else: + self.watermark = None + + processor = ( + CrossFrameAttnProcessor2_0(batch_size=2) + if hasattr(F, "scaled_dot_product_attention") + else CrossFrameAttnProcessor(batch_size=2) + ) + + self.unet.set_attn_processor(processor) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.upcast_vae + def upcast_vae(self): + dtype = self.vae.dtype + self.vae.to(dtype=torch.float32) + use_torch_2_0_or_xformers = isinstance( + self.vae.decoder.mid_block.attentions[0].processor, + ( + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + FusedAttnProcessor2_0, + ), + ) + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(dtype) + self.vae.decoder.conv_in.to(dtype) + self.vae.decoder.mid_block.to(dtype) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline._get_add_time_ids + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + callback_on_step_end_tensor_inputs=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." 
+ ) + + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + device: Optional[torch.device] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. 
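+
+        Returns:
+            A tuple `(prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds,
+            negative_pooled_prompt_embeds)`. The negative embeddings are only populated when
+            classifier-free guidance is used or when they are passed in directly.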
+ """ + device = device or self._execution_device + + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if self.text_encoder is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale) + else: + scale_lora_layers(self.text_encoder_2, lora_scale) + + prompt = [prompt] if isinstance(prompt, str) else prompt + + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2 + + # textual inversion: process multi-vector tokens if necessary + prompt_embeds_list = [] + prompts = [prompt, prompt_2] + for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + if clip_skip is None: + prompt_embeds = prompt_embeds.hidden_states[-2] + else: + # "2" because SDXL always indexes from the penultimate layer. 
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)] + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + + # normalize str to list + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + negative_prompt_2 = ( + batch_size * [negative_prompt_2] if isinstance(negative_prompt_2, str) else negative_prompt_2 + ) + + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = [negative_prompt, negative_prompt_2] + + negative_prompt_embeds_list = [] + for negative_prompt, tokenizer, text_encoder in zip(uncond_tokens, tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + if self.text_encoder_2 is not None: + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + if self.text_encoder_2 is not None: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device) + else: + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = 
negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + if self.text_encoder is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + if self.text_encoder_2 is not None: + if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder_2, lora_scale) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoZeroPipeline.forward_loop + def forward_loop(self, x_t0, t0, t1, generator): + """ + Perform DDPM forward process from time t0 to t1. This is the same as adding noise with corresponding variance. + + Args: + x_t0: + Latent code at time t0. + t0: + Timestep at t0. + t1: + Timestamp at t1. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + + Returns: + x_t1: + Forward process applied to x_t0 from time t0 to t1. + """ + eps = randn_tensor(x_t0.size(), generator=generator, dtype=x_t0.dtype, device=x_t0.device) + alpha_vec = torch.prod(self.scheduler.alphas[t0:t1]) + x_t1 = torch.sqrt(alpha_vec) * x_t0 + torch.sqrt(1 - alpha_vec) * eps + return x_t1 + + def backward_loop( + self, + latents, + timesteps, + prompt_embeds, + guidance_scale, + callback, + callback_steps, + num_warmup_steps, + extra_step_kwargs, + add_text_embeds, + add_time_ids, + cross_attention_kwargs=None, + guidance_rescale: float = 0.0, + ): + """ + Perform backward process given list of time steps + + Args: + latents: + Latents at time timesteps[0]. + timesteps: + Time steps along which to perform backward process. + prompt_embeds: + Pre-generated text embeddings. + guidance_scale: + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + extra_step_kwargs: + Extra_step_kwargs. + cross_attention_kwargs: + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + num_warmup_steps: + number of warmup steps. 
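+            add_text_embeds:
+                Pooled text embeddings passed to the SDXL UNet through `added_cond_kwargs`.
+            add_time_ids:
+                SDXL micro-conditioning time ids (original size, crop coordinates, target size) passed to
+                the UNet through `added_cond_kwargs`.
+            guidance_rescale:
+                Guidance rescale factor applied via `rescale_noise_cfg` when greater than 0.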
+ + Returns: + latents: latents of backward process output at time timesteps[-1] + """ + + do_classifier_free_guidance = guidance_scale > 1.0 + num_steps = (len(timesteps) - num_warmup_steps) // self.scheduler.order + + with self.progress_bar(total=num_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + return latents.clone().detach() + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]], + prompt_2: Optional[Union[str, List[str]]] = None, + video_length: Optional[int] = 8, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + denoising_end: Optional[float] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + frame_ids: Optional[List[int]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None, + latents: Optional[torch.FloatTensor] = None, + motion_field_strength_x: float = 12, + motion_field_strength_y: float = 12, + output_type: Optional[str] = "tensor", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + t0: int = 44, + t1: int = 47, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is + used in both text-encoders + video_length (`int`, *optional*, defaults to 8): + The number of generated video frames. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + denoising_end (`float`, *optional*): + When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be + completed before it is intentionally prematurely terminated. As a result, the returned sample will + still retain a substantial amount of noise as determined by the discrete timesteps selected by the + scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a + "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image + Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output) + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + num_videos_per_prompt (`int`, *optional*, defaults to 1): + The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + frame_ids (`List[int]`, *optional*): + Indexes of the frames that are being generated. This is used when generating longer videos + chunk-by-chunk. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + motion_field_strength_x (`float`, *optional*, defaults to 12): + Strength of motion in generated video along x-axis. See the [paper](https://arxiv.org/abs/2303.13439), + Sect. 3.3.1. + motion_field_strength_y (`float`, *optional*, defaults to 12): + Strength of motion in generated video along y-axis. See the [paper](https://arxiv.org/abs/2303.13439), + Sect. 3.3.1. + output_type (`str`, *optional*, defaults to `"tensor"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoSDXLPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + t0 (`int`, *optional*, defaults to 44): + Timestep t0. Should be in the range [0, num_inference_steps - 1]. See the + [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. + t1 (`int`, *optional*, defaults to 47): + Timestep t1. Should be in the range [t0 + 1, num_inference_steps - 1]. See the + [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. + + Returns: + [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoSDXLPipelineOutput`] or + `tuple`: [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoSDXLPipelineOutput`] + if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated images. + """ + assert video_length > 0 + if frame_ids is None: + frame_ids = list(range(video_length)) + assert len(frame_ids) == video_length + + assert num_videos_per_prompt == 1 + + if isinstance(prompt, str): + prompt = [prompt] + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + # 0. Default height and width to unet + height = height or self.default_sample_size * self.vae_scale_factor + width = width or self.default_sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) + + # 2. Define call parameters + batch_size = ( + 1 if isinstance(prompt, str) else len(prompt) if isinstance(prompt, list) else prompt_embeds.shape[0] + ) + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + device=device, + num_images_per_prompt=num_videos_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5.
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + + latents = self.prepare_latents( + batch_size * num_videos_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_videos_per_prompt, 1) + + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # Perform the first backward process up to time T_1 + x_1_t1 = self.backward_loop( + timesteps=timesteps[: -t1 - 1], + prompt_embeds=prompt_embeds, + latents=latents, + guidance_scale=guidance_scale, + callback=callback, + callback_steps=callback_steps, + extra_step_kwargs=extra_step_kwargs, + num_warmup_steps=num_warmup_steps, + add_text_embeds=add_text_embeds, + add_time_ids=add_time_ids, + ) + + scheduler_copy = copy.deepcopy(self.scheduler) + + # Perform the second backward process up to time T_0 + x_1_t0 = self.backward_loop( + timesteps=timesteps[-t1 - 1 : -t0 - 1], + prompt_embeds=prompt_embeds, + latents=x_1_t1, + guidance_scale=guidance_scale, + callback=callback, + callback_steps=callback_steps, + extra_step_kwargs=extra_step_kwargs, + num_warmup_steps=0, + add_text_embeds=add_text_embeds, + add_time_ids=add_time_ids, + ) + + # Propagate first frame latents at time T_0 to remaining frames + x_2k_t0 = x_1_t0.repeat(video_length - 1, 1, 1, 1) + + # Add motion in latents at time T_0 + x_2k_t0 = create_motion_field_and_warp_latents( + motion_field_strength_x=motion_field_strength_x, + motion_field_strength_y=motion_field_strength_y, + latents=x_2k_t0, + frame_ids=frame_ids[1:], + ) + + # Perform forward process up to time T_1 + x_2k_t1 = self.forward_loop( + x_t0=x_2k_t0, + t0=timesteps[-t0 - 1].to(torch.long), + t1=timesteps[-t1 - 1].to(torch.long), + generator=generator, + ) + + # Perform backward process from time T_1 to 0 + latents = torch.cat([x_1_t1, x_2k_t1]) + + self.scheduler = scheduler_copy + timesteps = timesteps[-t1 - 1 :] + + b, l, d = prompt_embeds.size() + prompt_embeds = prompt_embeds[:, None].repeat(1, video_length, 1, 1).reshape(b * video_length, l, d) + + b, k = add_text_embeds.size() + add_text_embeds = add_text_embeds[:, None].repeat(1, video_length, 1).reshape(b * video_length, k) + + b, k = add_time_ids.size() + add_time_ids = add_time_ids[:, None].repeat(1, video_length, 1).reshape(b * video_length, k) + + # 7.1 Apply denoising_end + if denoising_end is not None and isinstance(denoising_end, float) and denoising_end > 0 and denoising_end < 1: + discrete_timestep_cutoff = 
int( + round( + self.scheduler.config.num_train_timesteps + - (denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + x_1k_0 = self.backward_loop( + timesteps=timesteps, + prompt_embeds=prompt_embeds, + latents=latents, + guidance_scale=guidance_scale, + callback=callback, + callback_steps=callback_steps, + extra_step_kwargs=extra_step_kwargs, + num_warmup_steps=0, + add_text_embeds=add_text_embeds, + add_time_ids=add_time_ids, + ) + + latents = x_1k_0 + + if not output_type == "latent": + # make sure the VAE is in float32 mode, as it overflows in float16 + needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast + + if needs_upcasting: + self.upcast_vae() + latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype) + + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + else: + image = latents + return TextToVideoSDXLPipelineOutput(images=image) + + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload last model to CPU manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image,) + + return TextToVideoSDXLPipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/__init__.py new file mode 100644 index 000000000..c89e89946 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/__init__.py @@ -0,0 +1,52 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import UnCLIPImageVariationPipeline, UnCLIPPipeline + + _dummy_objects.update( + {"UnCLIPImageVariationPipeline": UnCLIPImageVariationPipeline, "UnCLIPPipeline": UnCLIPPipeline} + ) +else: + _import_structure["pipeline_unclip"] = ["UnCLIPPipeline"] + _import_structure["pipeline_unclip_image_variation"] = ["UnCLIPImageVariationPipeline"] + _import_structure["text_proj"] = ["UnCLIPTextProjModel"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.25.0")): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .pipeline_unclip import UnCLIPPipeline + from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline + from .text_proj import UnCLIPTextProjModel + +else: + import sys + + sys.modules[__name__] = _LazyModule( 
+ __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py new file mode 100644 index 000000000..72e5b3113 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py @@ -0,0 +1,493 @@ +# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import List, Optional, Tuple, Union + +import torch +from torch.nn import functional as F +from transformers import CLIPTextModelWithProjection, CLIPTokenizer +from transformers.models.clip.modeling_clip import CLIPTextModelOutput + +from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel +from ...schedulers import UnCLIPScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .text_proj import UnCLIPTextProjModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class UnCLIPPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using unCLIP. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + text_encoder ([`~transformers.CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + prior ([`PriorTransformer`]): + The canonical unCLIP prior to approximate the image embedding from the text embedding. + text_proj ([`UnCLIPTextProjModel`]): + Utility class to prepare and combine the embeddings before they are passed to the decoder. + decoder ([`UNet2DConditionModel`]): + The decoder to invert the image embedding into an image. + super_res_first ([`UNet2DModel`]): + Super resolution UNet. Used in all but the last step of the super resolution diffusion process. + super_res_last ([`UNet2DModel`]): + Super resolution UNet. Used in the last step of the super resolution diffusion process. + prior_scheduler ([`UnCLIPScheduler`]): + Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]). + decoder_scheduler ([`UnCLIPScheduler`]): + Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]). + super_res_scheduler ([`UnCLIPScheduler`]): + Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]). 
+ + """ + + _exclude_from_cpu_offload = ["prior"] + + prior: PriorTransformer + decoder: UNet2DConditionModel + text_proj: UnCLIPTextProjModel + text_encoder: CLIPTextModelWithProjection + tokenizer: CLIPTokenizer + super_res_first: UNet2DModel + super_res_last: UNet2DModel + + prior_scheduler: UnCLIPScheduler + decoder_scheduler: UnCLIPScheduler + super_res_scheduler: UnCLIPScheduler + + model_cpu_offload_seq = "text_encoder->text_proj->decoder->super_res_first->super_res_last" + + def __init__( + self, + prior: PriorTransformer, + decoder: UNet2DConditionModel, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_proj: UnCLIPTextProjModel, + super_res_first: UNet2DModel, + super_res_last: UNet2DModel, + prior_scheduler: UnCLIPScheduler, + decoder_scheduler: UnCLIPScheduler, + super_res_scheduler: UnCLIPScheduler, + ): + super().__init__() + + self.register_modules( + prior=prior, + decoder=decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_proj=text_proj, + super_res_first=super_res_first, + super_res_last=super_res_last, + prior_scheduler=prior_scheduler, + decoder_scheduler=decoder_scheduler, + super_res_scheduler=super_res_scheduler, + ) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[torch.Tensor] = None, + ): + if text_model_output is None: + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_enc_hid_states = text_encoder_output.last_hidden_state + + else: + batch_size = text_model_output[0].shape[0] + prompt_embeds, text_enc_hid_states = text_model_output[0], text_model_output[1] + text_mask = text_attention_mask + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_enc_hid_states = text_enc_hid_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens = [""] * batch_size + + uncond_input = self.tokenizer( + uncond_tokens, + 
padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_enc_hid_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_enc_hid_states.shape[1] + uncond_text_enc_hid_states = uncond_text_enc_hid_states.repeat(1, num_images_per_prompt, 1) + uncond_text_enc_hid_states = uncond_text_enc_hid_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_enc_hid_states = torch.cat([uncond_text_enc_hid_states, text_enc_hid_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_enc_hid_states, text_mask + + @torch.no_grad() + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + prior_num_inference_steps: int = 25, + decoder_num_inference_steps: int = 25, + super_res_num_inference_steps: int = 7, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prior_latents: Optional[torch.FloatTensor] = None, + decoder_latents: Optional[torch.FloatTensor] = None, + super_res_latents: Optional[torch.FloatTensor] = None, + text_model_output: Optional[Union[CLIPTextModelOutput, Tuple]] = None, + text_attention_mask: Optional[torch.Tensor] = None, + prior_guidance_scale: float = 4.0, + decoder_guidance_scale: float = 8.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output` + and `text_attention_mask` is passed. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + prior_num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps for the prior. More denoising steps usually lead to a higher quality + image at the expense of slower inference. + decoder_num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality + image at the expense of slower inference. + super_res_num_inference_steps (`int`, *optional*, defaults to 7): + The number of denoising steps for super resolution. More denoising steps usually lead to a higher + quality image at the expense of slower inference. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prior_latents (`torch.FloatTensor` of shape (batch size, embeddings dimension), *optional*): + Pre-generated noisy latents to be used as inputs for the prior. + decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + Pre-generated noisy latents to be used as inputs for the decoder. + super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + Pre-generated noisy latents to be used as inputs for the decoder. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + decoder_guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + text_model_output (`CLIPTextModelOutput`, *optional*): + Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text + outputs can be passed for tasks like text embedding interpolations. Make sure to also pass + `text_attention_mask` in this case. `prompt` can the be left `None`. + text_attention_mask (`torch.Tensor`, *optional*): + Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention + masks are necessary when passing `text_model_output`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. 
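A rough end-to-end usage sketch for this text-to-image pipeline (not part of this patch). It assumes a CUDA device and that the `kakaobrain/karlo-v1-alpha` checkpoint id is reachable; substitute a local path or another UnCLIP checkpoint otherwise:

import torch
from diffusers import UnCLIPPipeline

pipe = UnCLIPPipeline.from_pretrained("kakaobrain/karlo-v1-alpha", torch_dtype=torch.float16).to("cuda")

image = pipe(
    "a photo of a red panda wearing a top hat",
    prior_num_inference_steps=25,
    decoder_num_inference_steps=25,
    super_res_num_inference_steps=7,
    prior_guidance_scale=4.0,
    decoder_guidance_scale=8.0,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("unclip_sample.png")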
+ """ + if prompt is not None: + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + else: + batch_size = text_model_output[0].shape[0] + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = prior_guidance_scale > 1.0 or decoder_guidance_scale > 1.0 + + prompt_embeds, text_enc_hid_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, text_model_output, text_attention_mask + ) + + # prior + + self.prior_scheduler.set_timesteps(prior_num_inference_steps, device=device) + prior_timesteps_tensor = self.prior_scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + prior_latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + prior_latents, + self.prior_scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([prior_latents] * 2) if do_classifier_free_guidance else prior_latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_enc_hid_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + prior_guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + prior_latents = self.prior_scheduler.step( + predicted_image_embedding, + timestep=t, + sample=prior_latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + prior_latents = self.prior.post_process_latents(prior_latents) + + image_embeddings = prior_latents + + # done prior + + # decoder + + text_enc_hid_states, additive_clip_time_embeddings = self.text_proj( + image_embeddings=image_embeddings, + prompt_embeds=prompt_embeds, + text_encoder_hidden_states=text_enc_hid_states, + do_classifier_free_guidance=do_classifier_free_guidance, + ) + + if device.type == "mps": + # HACK: MPS: There is a panic when padding bool tensors, + # so cast to int tensor for the pad and back to bool afterwards + text_mask = text_mask.type(torch.int) + decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1) + decoder_text_mask = decoder_text_mask.type(torch.bool) + else: + decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True) + + self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) + decoder_timesteps_tensor = self.decoder_scheduler.timesteps + + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size + + decoder_latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + text_enc_hid_states.dtype, + device, + generator, + decoder_latents, + self.decoder_scheduler, + ) + + for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance 
+ latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents + + noise_pred = self.decoder( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_enc_hid_states, + class_labels=additive_clip_time_embeddings, + attention_mask=decoder_text_mask, + ).sample + + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if i + 1 == decoder_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = decoder_timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + decoder_latents = self.decoder_scheduler.step( + noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator + ).prev_sample + + decoder_latents = decoder_latents.clamp(-1, 1) + + image_small = decoder_latents + + # done decoder + + # super res + + self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) + super_res_timesteps_tensor = self.super_res_scheduler.timesteps + + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size + + super_res_latents = self.prepare_latents( + (batch_size, channels, height, width), + image_small.dtype, + device, + generator, + super_res_latents, + self.super_res_scheduler, + ) + + if device.type == "mps": + # MPS does not support many interpolations + image_upscaled = F.interpolate(image_small, size=[height, width]) + else: + interpolate_antialias = {} + if "antialias" in inspect.signature(F.interpolate).parameters: + interpolate_antialias["antialias"] = True + + image_upscaled = F.interpolate( + image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias + ) + + for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): + # no classifier free guidance + + if i == super_res_timesteps_tensor.shape[0] - 1: + unet = self.super_res_last + else: + unet = self.super_res_first + + latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1) + + noise_pred = unet( + sample=latent_model_input, + timestep=t, + ).sample + + if i + 1 == super_res_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = super_res_timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + super_res_latents = self.super_res_scheduler.step( + noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator + ).prev_sample + + image = super_res_latents + # done super res + + self.maybe_free_model_hooks() + + # post processing + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py new file mode 100644 index 
000000000..6c646a7df --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py @@ -0,0 +1,420 @@ +# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import List, Optional, Union + +import PIL.Image +import torch +from torch.nn import functional as F +from transformers import ( + CLIPImageProcessor, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from ...models import UNet2DConditionModel, UNet2DModel +from ...schedulers import UnCLIPScheduler +from ...utils import logging +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .text_proj import UnCLIPTextProjModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class UnCLIPImageVariationPipeline(DiffusionPipeline): + """ + Pipeline to generate image variations from an input image using UnCLIP. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + Args: + text_encoder ([`~transformers.CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `image_encoder`. + image_encoder ([`~transformers.CLIPVisionModelWithProjection`]): + Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + text_proj ([`UnCLIPTextProjModel`]): + Utility class to prepare and combine the embeddings before they are passed to the decoder. + decoder ([`UNet2DConditionModel`]): + The decoder to invert the image embedding into an image. + super_res_first ([`UNet2DModel`]): + Super resolution UNet. Used in all but the last step of the super resolution diffusion process. + super_res_last ([`UNet2DModel`]): + Super resolution UNet. Used in the last step of the super resolution diffusion process. + decoder_scheduler ([`UnCLIPScheduler`]): + Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]). + super_res_scheduler ([`UnCLIPScheduler`]): + Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]). 
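A rough usage sketch for the image-variation pipeline defined below (not part of this patch). The checkpoint id `kakaobrain/karlo-v1-alpha-image-variations` and the input URL are assumptions; any UnCLIP image-variation weights and any RGB image work the same way:

import torch
from diffusers import UnCLIPImageVariationPipeline
from diffusers.utils import load_image

pipe = UnCLIPImageVariationPipeline.from_pretrained(
    "kakaobrain/karlo-v1-alpha-image-variations", torch_dtype=torch.float16
).to("cuda")

init_image = load_image("https://example.com/cat.png")  # placeholder URL, swap in a real image
out = pipe(init_image, num_images_per_prompt=2, decoder_guidance_scale=8.0)
out.images[0].save("variation_0.png")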
+ """ + + decoder: UNet2DConditionModel + text_proj: UnCLIPTextProjModel + text_encoder: CLIPTextModelWithProjection + tokenizer: CLIPTokenizer + feature_extractor: CLIPImageProcessor + image_encoder: CLIPVisionModelWithProjection + super_res_first: UNet2DModel + super_res_last: UNet2DModel + + decoder_scheduler: UnCLIPScheduler + super_res_scheduler: UnCLIPScheduler + model_cpu_offload_seq = "text_encoder->image_encoder->text_proj->decoder->super_res_first->super_res_last" + + def __init__( + self, + decoder: UNet2DConditionModel, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + text_proj: UnCLIPTextProjModel, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection, + super_res_first: UNet2DModel, + super_res_last: UNet2DModel, + decoder_scheduler: UnCLIPScheduler, + super_res_scheduler: UnCLIPScheduler, + ): + super().__init__() + + self.register_modules( + decoder=decoder, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_proj=text_proj, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + super_res_first=super_res_first, + super_res_last=super_res_last, + decoder_scheduler=decoder_scheduler, + super_res_scheduler=super_res_scheduler, + ) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens = [""] * batch_size + + max_length = text_input_ids.shape[-1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * 
num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + def _encode_image(self, image, device, num_images_per_prompt, image_embeddings: Optional[torch.Tensor] = None): + dtype = next(self.image_encoder.parameters()).dtype + + if image_embeddings is None: + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(images=image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + image_embeddings = self.image_encoder(image).image_embeds + + image_embeddings = image_embeddings.repeat_interleave(num_images_per_prompt, dim=0) + + return image_embeddings + + @torch.no_grad() + def __call__( + self, + image: Optional[Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor]] = None, + num_images_per_prompt: int = 1, + decoder_num_inference_steps: int = 25, + super_res_num_inference_steps: int = 7, + generator: Optional[torch.Generator] = None, + decoder_latents: Optional[torch.FloatTensor] = None, + super_res_latents: Optional[torch.FloatTensor] = None, + image_embeddings: Optional[torch.Tensor] = None, + decoder_guidance_scale: float = 8.0, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + The call function to the pipeline for generation. + + Args: + image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`): + `Image` or tensor representing an image batch to be used as the starting point. If you provide a + tensor, it needs to be compatible with the [`CLIPImageProcessor`] + [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json). + Can be left as `None` only when `image_embeddings` are passed. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + decoder_num_inference_steps (`int`, *optional*, defaults to 25): + The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality + image at the expense of slower inference. + super_res_num_inference_steps (`int`, *optional*, defaults to 7): + The number of denoising steps for super resolution. More denoising steps usually lead to a higher + quality image at the expense of slower inference. + generator (`torch.Generator`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + decoder_latents (`torch.FloatTensor` of shape (batch size, channels, height, width), *optional*): + Pre-generated noisy latents to be used as inputs for the decoder. 
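What `_encode_image` above boils down to can be reproduced with stock transformers components: preprocess the image, run the CLIP vision tower with projection, and duplicate the embedding once per requested variation. A self-contained sketch (not part of this patch; the `openai/clip-vit-large-patch14` weights and the blank test image are stand-ins):

import torch
from PIL import Image
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")

image = Image.new("RGB", (256, 256), "white")                  # stand-in input image
pixel_values = processor(images=image, return_tensors="pt").pixel_values
with torch.no_grad():
    image_embeds = encoder(pixel_values).image_embeds          # (1, 768) pooled + projected embedding
image_embeds = image_embeds.repeat_interleave(2, dim=0)        # two variations per input image
print(image_embeds.shape)                                      # torch.Size([2, 768])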
+ super_res_latents (`torch.FloatTensor` of shape (batch size, channels, super res height, super res width), *optional*): + Pre-generated noisy latents to be used as inputs for the decoder. + decoder_guidance_scale (`float`, *optional*, defaults to 4.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + image_embeddings (`torch.Tensor`, *optional*): + Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings + can be passed for tasks like image interpolations. `image` can be left as `None`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is + returned where the first element is a list with the generated images. + """ + if image is not None: + if isinstance(image, PIL.Image.Image): + batch_size = 1 + elif isinstance(image, list): + batch_size = len(image) + else: + batch_size = image.shape[0] + else: + batch_size = image_embeddings.shape[0] + + prompt = [""] * batch_size + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = decoder_guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance + ) + + image_embeddings = self._encode_image(image, device, num_images_per_prompt, image_embeddings) + + # decoder + text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj( + image_embeddings=image_embeddings, + prompt_embeds=prompt_embeds, + text_encoder_hidden_states=text_encoder_hidden_states, + do_classifier_free_guidance=do_classifier_free_guidance, + ) + + if device.type == "mps": + # HACK: MPS: There is a panic when padding bool tensors, + # so cast to int tensor for the pad and back to bool afterwards + text_mask = text_mask.type(torch.int) + decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1) + decoder_text_mask = decoder_text_mask.type(torch.bool) + else: + decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True) + + self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device) + decoder_timesteps_tensor = self.decoder_scheduler.timesteps + + num_channels_latents = self.decoder.config.in_channels + height = self.decoder.config.sample_size + width = self.decoder.config.sample_size + + if decoder_latents is None: + decoder_latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + decoder_latents, + self.decoder_scheduler, + ) + + for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents + + noise_pred = self.decoder( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + 
class_labels=additive_clip_time_embeddings, + attention_mask=decoder_text_mask, + ).sample + + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1) + noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1) + noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + + if i + 1 == decoder_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = decoder_timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + decoder_latents = self.decoder_scheduler.step( + noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator + ).prev_sample + + decoder_latents = decoder_latents.clamp(-1, 1) + + image_small = decoder_latents + + # done decoder + + # super res + + self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device) + super_res_timesteps_tensor = self.super_res_scheduler.timesteps + + channels = self.super_res_first.config.in_channels // 2 + height = self.super_res_first.config.sample_size + width = self.super_res_first.config.sample_size + + if super_res_latents is None: + super_res_latents = self.prepare_latents( + (batch_size, channels, height, width), + image_small.dtype, + device, + generator, + super_res_latents, + self.super_res_scheduler, + ) + + if device.type == "mps": + # MPS does not support many interpolations + image_upscaled = F.interpolate(image_small, size=[height, width]) + else: + interpolate_antialias = {} + if "antialias" in inspect.signature(F.interpolate).parameters: + interpolate_antialias["antialias"] = True + + image_upscaled = F.interpolate( + image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias + ) + + for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)): + # no classifier free guidance + + if i == super_res_timesteps_tensor.shape[0] - 1: + unet = self.super_res_last + else: + unet = self.super_res_first + + latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1) + + noise_pred = unet( + sample=latent_model_input, + timestep=t, + ).sample + + if i + 1 == super_res_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = super_res_timesteps_tensor[i + 1] + + # compute the previous noisy sample x_t -> x_t-1 + super_res_latents = self.super_res_scheduler.step( + noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator + ).prev_sample + + image = super_res_latents + + # done super res + self.maybe_free_model_hooks() + + # post processing + + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/text_proj.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/text_proj.py new file mode 100644 index 000000000..5a86d0c08 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/text_proj.py @@ -0,0 +1,86 @@ +# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved. 
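The decoder loop above guides only the predicted noise mean and carries the learned variance through untouched. A standalone sketch of that split/recombine step (not part of this patch; channel counts are illustrative):

import torch

def guide_with_learned_variance(noise_pred: torch.Tensor, latent_channels: int, scale: float) -> torch.Tensor:
    # The UNet predicts [mean | variance] channels for the stacked [uncond; cond] batch.
    uncond, text = noise_pred.chunk(2, dim=0)
    uncond_mean, _ = uncond.split(latent_channels, dim=1)
    text_mean, predicted_variance = text.split(latent_channels, dim=1)
    guided = uncond_mean + scale * (text_mean - uncond_mean)
    # Re-attach the conditional variance so the scheduler still sees 2 * latent_channels.
    return torch.cat([guided, predicted_variance], dim=1)

pred = torch.randn(2, 6, 8, 8)  # (uncond + cond) x (3 mean + 3 variance channels) x 8 x 8
print(guide_with_learned_variance(pred, latent_channels=3, scale=8.0).shape)  # torch.Size([1, 6, 8, 8])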
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin + + +class UnCLIPTextProjModel(ModelMixin, ConfigMixin): + """ + Utility class for CLIP embeddings. Used to combine the image and text embeddings into a format usable by the + decoder. + + For more details, see the original paper: https://arxiv.org/abs/2204.06125 section 2.1 + """ + + @register_to_config + def __init__( + self, + *, + clip_extra_context_tokens: int = 4, + clip_embeddings_dim: int = 768, + time_embed_dim: int, + cross_attention_dim, + ): + super().__init__() + + self.learned_classifier_free_guidance_embeddings = nn.Parameter(torch.zeros(clip_embeddings_dim)) + + # parameters for additional clip time embeddings + self.embedding_proj = nn.Linear(clip_embeddings_dim, time_embed_dim) + self.clip_image_embeddings_project_to_time_embeddings = nn.Linear(clip_embeddings_dim, time_embed_dim) + + # parameters for encoder hidden states + self.clip_extra_context_tokens = clip_extra_context_tokens + self.clip_extra_context_tokens_proj = nn.Linear( + clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim + ) + self.encoder_hidden_states_proj = nn.Linear(clip_embeddings_dim, cross_attention_dim) + self.text_encoder_hidden_states_norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, *, image_embeddings, prompt_embeds, text_encoder_hidden_states, do_classifier_free_guidance): + if do_classifier_free_guidance: + # Add the classifier free guidance embeddings to the image embeddings + image_embeddings_batch_size = image_embeddings.shape[0] + classifier_free_guidance_embeddings = self.learned_classifier_free_guidance_embeddings.unsqueeze(0) + classifier_free_guidance_embeddings = classifier_free_guidance_embeddings.expand( + image_embeddings_batch_size, -1 + ) + image_embeddings = torch.cat([classifier_free_guidance_embeddings, image_embeddings], dim=0) + + # The image embeddings batch size and the text embeddings batch size are equal + assert image_embeddings.shape[0] == prompt_embeds.shape[0] + + batch_size = prompt_embeds.shape[0] + + # "Specifically, we modify the architecture described in Nichol et al. (2021) by projecting and + # adding CLIP embeddings to the existing timestep embedding, ... + time_projected_prompt_embeds = self.embedding_proj(prompt_embeds) + time_projected_image_embeddings = self.clip_image_embeddings_project_to_time_embeddings(image_embeddings) + additive_clip_time_embeddings = time_projected_image_embeddings + time_projected_prompt_embeds + + # ... 
and by projecting CLIP embeddings into four + # extra tokens of context that are concatenated to the sequence of outputs from the GLIDE text encoder" + clip_extra_context_tokens = self.clip_extra_context_tokens_proj(image_embeddings) + clip_extra_context_tokens = clip_extra_context_tokens.reshape(batch_size, -1, self.clip_extra_context_tokens) + clip_extra_context_tokens = clip_extra_context_tokens.permute(0, 2, 1) + + text_encoder_hidden_states = self.encoder_hidden_states_proj(text_encoder_hidden_states) + text_encoder_hidden_states = self.text_encoder_hidden_states_norm(text_encoder_hidden_states) + text_encoder_hidden_states = torch.cat([clip_extra_context_tokens, text_encoder_hidden_states], dim=1) + + return text_encoder_hidden_states, additive_clip_time_embeddings diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py new file mode 100644 index 000000000..1ac2b09a6 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py @@ -0,0 +1,58 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + ImageTextPipelineOutput, + UniDiffuserPipeline, + ) + + _dummy_objects.update( + {"ImageTextPipelineOutput": ImageTextPipelineOutput, "UniDiffuserPipeline": UniDiffuserPipeline} + ) +else: + _import_structure["modeling_text_decoder"] = ["UniDiffuserTextDecoder"] + _import_structure["modeling_uvit"] = ["UniDiffuserModel", "UTransformer2DModel"] + _import_structure["pipeline_unidiffuser"] = ["ImageTextPipelineOutput", "UniDiffuserPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + ImageTextPipelineOutput, + UniDiffuserPipeline, + ) + else: + from .modeling_text_decoder import UniDiffuserTextDecoder + from .modeling_uvit import UniDiffuserModel, UTransformer2DModel + from .pipeline_unidiffuser import ImageTextPipelineOutput, UniDiffuserPipeline + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py new file mode 100644 index 000000000..bf0a4eb47 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py @@ -0,0 +1,296 @@ +from typing import Optional + +import numpy as np +import torch +from torch import nn +from transformers import GPT2Config, GPT2LMHeadModel +from transformers.modeling_utils import 
ModuleUtilsMixin + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin + + +# Modified from ClipCaptionModel in https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py +class UniDiffuserTextDecoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): + """ + Text decoder model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is used to + generate text from the UniDiffuser image-text embedding. + + Parameters: + prefix_length (`int`): + Max number of prefix tokens that will be supplied to the model. + prefix_inner_dim (`int`): + The hidden size of the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the + CLIP text encoder. + prefix_hidden_dim (`int`, *optional*): + Hidden dim of the MLP if we encode the prefix. + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. 
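The `prefix_*` arguments above set up a small bottleneck around GPT-2: CLIP-sized prefix embeddings are optionally projected down to `prefix_hidden_dim` and back up to `n_embd` before being prepended to the token embeddings. A sketch of just that projection path (not part of this patch; dimensions are illustrative):

import torch
from torch import nn

prefix_inner_dim, prefix_hidden_dim, n_embd = 768, 64, 768
encode_prefix = nn.Linear(prefix_inner_dim, prefix_hidden_dim)
decode_prefix = nn.Linear(prefix_hidden_dim, n_embd)

prefix = torch.randn(2, 77, prefix_inner_dim)   # e.g. CLIP text features
hidden = encode_prefix(prefix)                  # (2, 77, 64) compressed prefix
gpt_prefix = decode_prefix(hidden)              # (2, 77, 768) prepended to GPT-2 token embeddings
print(hidden.shape, gpt_prefix.shape)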
+ """ + + _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] + + @register_to_config + def __init__( + self, + prefix_length: int, + prefix_inner_dim: int, + prefix_hidden_dim: Optional[int] = None, + vocab_size: int = 50257, # Start of GPT2 config args + n_positions: int = 1024, + n_embd: int = 768, + n_layer: int = 12, + n_head: int = 12, + n_inner: Optional[int] = None, + activation_function: str = "gelu_new", + resid_pdrop: float = 0.1, + embd_pdrop: float = 0.1, + attn_pdrop: float = 0.1, + layer_norm_epsilon: float = 1e-5, + initializer_range: float = 0.02, + scale_attn_weights: bool = True, + use_cache: bool = True, + scale_attn_by_inverse_layer_idx: bool = False, + reorder_and_upcast_attn: bool = False, + ): + super().__init__() + + self.prefix_length = prefix_length + + if prefix_inner_dim != n_embd and prefix_hidden_dim is None: + raise ValueError( + f"`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: {prefix_hidden_dim} and" + f" `n_embd`: {n_embd} are not equal." + ) + + self.prefix_inner_dim = prefix_inner_dim + self.prefix_hidden_dim = prefix_hidden_dim + + self.encode_prefix = ( + nn.Linear(self.prefix_inner_dim, self.prefix_hidden_dim) + if self.prefix_hidden_dim is not None + else nn.Identity() + ) + self.decode_prefix = ( + nn.Linear(self.prefix_hidden_dim, n_embd) if self.prefix_hidden_dim is not None else nn.Identity() + ) + + gpt_config = GPT2Config( + vocab_size=vocab_size, + n_positions=n_positions, + n_embd=n_embd, + n_layer=n_layer, + n_head=n_head, + n_inner=n_inner, + activation_function=activation_function, + resid_pdrop=resid_pdrop, + embd_pdrop=embd_pdrop, + attn_pdrop=attn_pdrop, + layer_norm_epsilon=layer_norm_epsilon, + initializer_range=initializer_range, + scale_attn_weights=scale_attn_weights, + use_cache=use_cache, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + self.transformer = GPT2LMHeadModel(gpt_config) + + def forward( + self, + input_ids: torch.Tensor, + prefix_embeds: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + ): + """ + Args: + input_ids (`torch.Tensor` of shape `(N, max_seq_len)`): + Text tokens to use for inference. + prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`): + Prefix embedding to preprend to the embedded tokens. + attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len, 768)`, *optional*): + Attention mask for the prefix embedding. + labels (`torch.Tensor`, *optional*): + Labels to use for language modeling. 
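The forward pass above just concatenates the (decoded) prefix in front of the token embeddings and feeds the whole sequence to GPT-2 via `inputs_embeds`. A tiny runnable sketch with a deliberately small, randomly initialized GPT-2 (not part of this patch):

import torch
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config(vocab_size=100, n_positions=64, n_embd=32, n_layer=2, n_head=2)
gpt2 = GPT2LMHeadModel(config)

input_ids = torch.randint(0, 100, (1, 10))
token_embeds = gpt2.transformer.wte(input_ids)        # (1, 10, 32)
prefix_embeds = torch.randn(1, 4, 32)                 # prefix_length = 4
embedding_cat = torch.cat([prefix_embeds, token_embeds], dim=1)

out = gpt2(inputs_embeds=embedding_cat)
print(out.logits.shape)                               # torch.Size([1, 14, 100])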
+ """ + embedding_text = self.transformer.transformer.wte(input_ids) + hidden = self.encode_prefix(prefix_embeds) + prefix_embeds = self.decode_prefix(hidden) + embedding_cat = torch.cat((prefix_embeds, embedding_text), dim=1) + + if labels is not None: + dummy_token = self.get_dummy_token(input_ids.shape[0], input_ids.device) + labels = torch.cat((dummy_token, input_ids), dim=1) + out = self.transformer(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask) + if self.prefix_hidden_dim is not None: + return out, hidden + else: + return out + + def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor: + return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device) + + def encode(self, prefix): + return self.encode_prefix(prefix) + + @torch.no_grad() + def generate_captions(self, features, eos_token_id, device): + """ + Generate captions given text embedding features. Returns list[L]. + + Args: + features (`torch.Tensor` of shape `(B, L, D)`): + Text embedding features to generate captions from. + eos_token_id (`int`): + The token ID of the EOS token for the text decoder model. + device: + Device to perform text generation on. + + Returns: + `List[str]`: A list of strings generated from the decoder model. + """ + + features = torch.split(features, 1, dim=0) + generated_tokens = [] + generated_seq_lengths = [] + for feature in features: + feature = self.decode_prefix(feature.to(device)) # back to the clip feature + # Only support beam search for now + output_tokens, seq_lengths = self.generate_beam( + input_embeds=feature, device=device, eos_token_id=eos_token_id + ) + generated_tokens.append(output_tokens[0]) + generated_seq_lengths.append(seq_lengths[0]) + generated_tokens = torch.stack(generated_tokens) + generated_seq_lengths = torch.stack(generated_seq_lengths) + return generated_tokens, generated_seq_lengths + + @torch.no_grad() + def generate_beam( + self, + input_ids=None, + input_embeds=None, + device=None, + beam_size: int = 5, + entry_length: int = 67, + temperature: float = 1.0, + eos_token_id: Optional[int] = None, + ): + """ + Generates text using the given tokenizer and text prompt or token embedding via beam search. This + implementation is based on the beam search implementation from the [original UniDiffuser + code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89). + + Args: + eos_token_id (`int`, *optional*): + The token ID of the EOS token for the text decoder model. + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds` + must be supplied. + input_embeds (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + An embedded representation to directly pass to the transformer as a prefix for beam search. One of + `input_ids` and `input_embeds` must be supplied. + device: + The device to perform beam search on. + beam_size (`int`, *optional*, defaults to `5`): + The number of best states to store during beam search. + entry_length (`int`, *optional*, defaults to `67`): + The number of iterations to run beam search. + temperature (`float`, *optional*, defaults to 1.0): + The temperature to use when performing the softmax over logits from the decoding model. 
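How the caption path is typically driven end to end: encoded text features go into `generate_captions`, token ids and sequence lengths come out, and a GPT-2 tokenizer turns the ids back into strings. A sketch (not part of this patch) using a deliberately tiny, randomly initialized decoder, so the output is gibberish but the API is exercised:

import torch
from transformers import GPT2Tokenizer
from diffusers.pipelines.unidiffuser.modeling_text_decoder import UniDiffuserTextDecoder

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
decoder = UniDiffuserTextDecoder(
    prefix_length=77, prefix_inner_dim=64, prefix_hidden_dim=64, n_embd=128, n_layer=2, n_head=2
)

features = torch.randn(1, 77, 64)   # stand-in for prefix features in the hidden (encoded) space
tokens, lengths = decoder.generate_captions(features, tokenizer.eos_token_id, device="cpu")
caption = tokenizer.decode(tokens[0][: int(lengths[0])], skip_special_tokens=True)
print(caption)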
+ + Returns: + `Tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated + token sequences sorted by score in descending order, and the second element is the sequence lengths + corresponding to those sequences. + """ + # Generates text until stop_token is reached using beam search with the desired beam size. + stop_token_index = eos_token_id + tokens = None + scores = None + seq_lengths = torch.ones(beam_size, device=device, dtype=torch.int) + is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool) + + if input_embeds is not None: + generated = input_embeds + else: + generated = self.transformer.transformer.wte(input_ids) + + for i in range(entry_length): + outputs = self.transformer(inputs_embeds=generated) + logits = outputs.logits + logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0) + logits = logits.softmax(-1).log() + + if scores is None: + scores, next_tokens = logits.topk(beam_size, -1) + generated = generated.expand(beam_size, *generated.shape[1:]) + next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0) + if tokens is None: + tokens = next_tokens + else: + tokens = tokens.expand(beam_size, *tokens.shape[1:]) + tokens = torch.cat((tokens, next_tokens), dim=1) + else: + logits[is_stopped] = -float(np.inf) + logits[is_stopped, 0] = 0 + scores_sum = scores[:, None] + logits + seq_lengths[~is_stopped] += 1 + scores_sum_average = scores_sum / seq_lengths[:, None] + scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1) + next_tokens_source = next_tokens // scores_sum.shape[1] + seq_lengths = seq_lengths[next_tokens_source] + next_tokens = next_tokens % scores_sum.shape[1] + next_tokens = next_tokens.unsqueeze(1) + tokens = tokens[next_tokens_source] + tokens = torch.cat((tokens, next_tokens), dim=1) + generated = generated[next_tokens_source] + scores = scores_sum_average * seq_lengths + is_stopped = is_stopped[next_tokens_source] + + next_token_embed = self.transformer.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1) + generated = torch.cat((generated, next_token_embed), dim=1) + is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze() + if is_stopped.all(): + break + + scores = scores / seq_lengths + order = scores.argsort(descending=True) + # tokens tensors are already padded to max_seq_length + output_texts = [tokens[i] for i in order] + output_texts = torch.stack(output_texts, dim=0) + seq_lengths = torch.tensor([seq_lengths[i] for i in order], dtype=seq_lengths.dtype) + return output_texts, seq_lengths diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py new file mode 100644 index 000000000..c074b9916 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -0,0 +1,1197 @@ +import math +from typing import Optional, Union + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin +from ...models.attention import FeedForward +from ...models.attention_processor import Attention +from ...models.embeddings import TimestepEmbedding, Timesteps, get_2d_sincos_pos_embed +from ...models.normalization import AdaLayerNorm +from ...models.transformers.transformer_2d import 
Transformer2DModelOutput +from ...utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + logger.warning( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect." + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (torch.Tensor, float, float, float, float) -> torch.Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, + \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for + generating the random values works best when :math:`a \leq \text{mean} \leq b`. 
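The helper above mirrors the initializer that ships with PyTorch, so its effect can be spot-checked directly against `torch.nn.init.trunc_normal_` (a quick illustration, not part of this patch):

import torch

w = torch.empty(3, 5)
torch.nn.init.trunc_normal_(w, mean=0.0, std=1.0, a=-2.0, b=2.0)
assert float(w.min()) >= -2.0 and float(w.max()) <= 2.0   # values never leave the [a, b] cutoffs
print(w.shape, float(w.mean()))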
+ + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__( + self, + height=224, + width=224, + patch_size=16, + in_channels=3, + embed_dim=768, + layer_norm=False, + flatten=True, + bias=True, + use_pos_embed=True, + ): + super().__init__() + + num_patches = (height // patch_size) * (width // patch_size) + self.flatten = flatten + self.layer_norm = layer_norm + + self.proj = nn.Conv2d( + in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias + ) + if layer_norm: + self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6) + else: + self.norm = None + + self.use_pos_embed = use_pos_embed + if self.use_pos_embed: + pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5)) + self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False) + + def forward(self, latent): + latent = self.proj(latent) + if self.flatten: + latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC + if self.layer_norm: + latent = self.norm(latent) + if self.use_pos_embed: + return latent + self.pos_embed + else: + return latent + + +class SkipBlock(nn.Module): + def __init__(self, dim: int): + super().__init__() + + self.skip_linear = nn.Linear(2 * dim, dim) + + # Use torch.nn.LayerNorm for now, following the original code + self.norm = nn.LayerNorm(dim) + + def forward(self, x, skip): + x = self.skip_linear(torch.cat([x, skip], dim=-1)) + x = self.norm(x) + + return x + + +# Modified to support both pre-LayerNorm and post-LayerNorm configurations +# Don't support AdaLayerNormZero for now +# Modified from diffusers.models.attention.BasicTransformerBlock +class UTransformerBlock(nn.Module): + r""" + A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): + Activation function to be used in feed-forward. + num_embeds_ada_norm (:obj: `int`, *optional*): + The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (:obj: `bool`, *optional*, defaults to `False`): + Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float32 when performing the attention calculation. + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. 
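The `SkipBlock` defined above is the glue for U-ViT's long skip connections: concatenate the current hidden states with the saved ones along the feature dimension, project back to `dim`, then LayerNorm. A minimal shape-level sketch (not part of this patch):

import torch
from torch import nn

dim = 16
skip_linear = nn.Linear(2 * dim, dim)
norm = nn.LayerNorm(dim)

x = torch.randn(1, 8, dim)      # current hidden states (batch, tokens, dim)
skip = torch.randn(1, 8, dim)   # hidden states saved by the matching "in" block
fused = norm(skip_linear(torch.cat([x, skip], dim=-1)))
print(fused.shape)              # torch.Size([1, 8, 16])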
+ norm_type (`str`, defaults to `"layer_norm"`): + The layer norm implementation to use. + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). Note that `BasicTransformerBlock` uses pre-LayerNorm, e.g. + `pre_layer_norm = True`. + final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + pre_layer_norm: bool = True, + final_dropout: bool = False, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + + self.pre_layer_norm = pre_layer_norm + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + # 1. Self-Attn + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.attn2 = None + + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + ) + else: + self.norm2 = None + + # 3. Feed-forward + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ): + # Pre-LayerNorm + if self.pre_layer_norm: + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + else: + norm_hidden_states = self.norm1(hidden_states) + else: + norm_hidden_states = hidden_states + + # 1. 
Self-Attention + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + # Post-LayerNorm + if not self.pre_layer_norm: + if self.use_ada_layer_norm: + attn_output = self.norm1(attn_output, timestep) + else: + attn_output = self.norm1(attn_output) + + hidden_states = attn_output + hidden_states + + if self.attn2 is not None: + # Pre-LayerNorm + if self.pre_layer_norm: + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + else: + norm_hidden_states = hidden_states + # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly + # prepare attention mask here + + # 2. Cross-Attention + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + # Post-LayerNorm + if not self.pre_layer_norm: + attn_output = self.norm2(attn_output, timestep) if self.use_ada_layer_norm else self.norm2(attn_output) + + hidden_states = attn_output + hidden_states + + # 3. Feed-forward + # Pre-LayerNorm + if self.pre_layer_norm: + norm_hidden_states = self.norm3(hidden_states) + else: + norm_hidden_states = hidden_states + + ff_output = self.ff(norm_hidden_states) + + # Post-LayerNorm + if not self.pre_layer_norm: + ff_output = self.norm3(ff_output) + + hidden_states = ff_output + hidden_states + + return hidden_states + + +# Like UTransformerBlock except with LayerNorms on the residual backbone of the block +# Modified from diffusers.models.attention.BasicTransformerBlock +class UniDiffuserBlock(nn.Module): + r""" + A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations and puts the + LayerNorms on the residual backbone of the block. This matches the transformer block in the [original UniDiffuser + implementation](https://github.com/thu-ml/unidiffuser/blob/main/libs/uvit_multi_post_ln_v1.py#L104). + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): + Activation function to be used in feed-forward. + num_embeds_ada_norm (:obj: `int`, *optional*): + The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (:obj: `bool`, *optional*, defaults to `False`): + Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float() when performing the attention calculation. + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. 
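The `pre_layer_norm` flag discussed above selects between the two classic placements: pre-LN normalizes the input of each residual branch, while the post-LN ordering used by the original UniDiffuser normalizes the summed residual stream itself. A side-by-side sketch with a stand-in sub-layer (not part of this patch):

import torch
from torch import nn

dim = 16
norm = nn.LayerNorm(dim)
sublayer = nn.Linear(dim, dim)          # stand-in for attention or the feed-forward

x = torch.randn(1, 8, dim)
pre_ln_out = x + sublayer(norm(x))      # pre-LayerNorm: norm inside the residual branch
post_ln_out = norm(x + sublayer(x))     # post-LayerNorm: norm on the residual backbone
print(pre_ln_out.shape, post_ln_out.shape)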
+ norm_type (`str`, defaults to `"layer_norm"`): + The layer norm implementation to use. + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm + (`pre_layer_norm = False`). + final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + pre_layer_norm: bool = False, + final_dropout: bool = True, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + + self.pre_layer_norm = pre_layer_norm + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + # 1. Self-Attn + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.attn2 = None + + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + ) + else: + self.norm2 = None + + # 3. 
Feed-forward + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ): + # Following the diffusers transformer block implementation, put the LayerNorm on the + # residual backbone + # Pre-LayerNorm + if self.pre_layer_norm: + if self.use_ada_layer_norm: + hidden_states = self.norm1(hidden_states, timestep) + else: + hidden_states = self.norm1(hidden_states) + + # 1. Self-Attention + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + attn_output = self.attn1( + hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # Following the diffusers transformer block implementation, put the LayerNorm on the + # residual backbone + # Post-LayerNorm + if not self.pre_layer_norm: + if self.use_ada_layer_norm: + hidden_states = self.norm1(hidden_states, timestep) + else: + hidden_states = self.norm1(hidden_states) + + if self.attn2 is not None: + # Pre-LayerNorm + if self.pre_layer_norm: + hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly + # prepare attention mask here + + # 2. Cross-Attention + attn_output = self.attn2( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # Post-LayerNorm + if not self.pre_layer_norm: + hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + + # 3. Feed-forward + # Pre-LayerNorm + if self.pre_layer_norm: + hidden_states = self.norm3(hidden_states) + + ff_output = self.ff(hidden_states) + + hidden_states = ff_output + hidden_states + + # Post-LayerNorm + if not self.pre_layer_norm: + hidden_states = self.norm3(hidden_states) + + return hidden_states + + +# Modified from diffusers.models.transformer_2d.Transformer2DModel +# Modify the transformer block structure to be U-Net like following U-ViT +# Only supports patch-style input and torch.nn.LayerNorm currently +# https://github.com/baofff/U-ViT +class UTransformer2DModel(ModelMixin, ConfigMixin): + """ + Transformer model based on the [U-ViT](https://github.com/baofff/U-ViT) architecture for image-like data. Compared + to [`Transformer2DModel`], this model has skip connections between transformer blocks in a "U"-shaped fashion, + similar to a U-Net. Supports only continuous (actual embeddings) inputs, which are embedded via a [`PatchEmbed`] + layer and then reshaped to (b, t, d). + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input. + out_channels (`int`, *optional*): + The number of output channels; if `None`, defaults to `in_channels`. 
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + norm_num_groups (`int`, *optional*, defaults to `32`): + The number of groups to use when performing Group Normalization. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + patch_size (`int`, *optional*, defaults to 2): + The patch size to use in the patch embedding. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. + The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more than steps than `num_embeds_ada_norm`. + use_linear_projection (int, *optional*): TODO: Not used + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used in each + transformer block. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float() when performing the attention calculation. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`. + block_type (`str`, *optional*, defaults to `"unidiffuser"`): + The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual + backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard + behavior in `diffusers`.) + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm + (`pre_layer_norm = False`). + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. + use_patch_pos_embed (`bool`, *optional*): + Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`). + final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. 
+ """ + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = 2, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + block_type: str = "unidiffuser", + pre_layer_norm: bool = False, + norm_elementwise_affine: bool = True, + use_patch_pos_embed=False, + ff_final_dropout: bool = False, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + # 1. Input + # Only support patch input of shape (batch_size, num_channels, height, width) for now + assert in_channels is not None and patch_size is not None, "Patch input requires in_channels and patch_size." + + assert sample_size is not None, "UTransformer2DModel over patched input must provide sample_size" + + # 2. Define input layers + self.height = sample_size + self.width = sample_size + + self.patch_size = patch_size + self.pos_embed = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + use_pos_embed=use_patch_pos_embed, + ) + + # 3. Define transformers blocks + # Modify this to have in_blocks ("downsample" blocks, even though we don't actually downsample), a mid_block, + # and out_blocks ("upsample" blocks). Like a U-Net, there are skip connections from in_blocks to out_blocks in + # a "U"-shaped fashion (e.g. first in_block to last out_block, etc.). + # Quick hack to make the transformer block type configurable + if block_type == "unidiffuser": + block_cls = UniDiffuserBlock + else: + block_cls = UTransformerBlock + self.transformer_in_blocks = nn.ModuleList( + [ + block_cls( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + final_dropout=ff_final_dropout, + ) + for d in range(num_layers // 2) + ] + ) + + self.transformer_mid_block = block_cls( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + final_dropout=ff_final_dropout, + ) + + # For each skip connection, we use a SkipBlock (concatenation + Linear + LayerNorm) to process the inputs + # before each transformer out_block. 
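# Sketch of the "U"-shaped skip wiring described in the comment above, using a stack and a plain
# Linear as stand-ins for the real in/mid/out transformer blocks and SkipBlock modules
# (illustrative only; every name below is hypothetical, not an attribute of UTransformer2DModel):
import torch
import torch.nn as nn

def u_shaped_skip_sketch(num_blocks: int = 2, inner_dim: int = 8) -> torch.Tensor:
    hidden = torch.randn(2, 16, inner_dim)  # (batch, tokens, inner_dim)
    skip_proj = nn.Linear(2 * inner_dim, inner_dim)  # SkipBlock concatenates, then projects back to inner_dim
    skips = []
    for _ in range(num_blocks):  # "down" path: run each in_block and remember its output
        hidden = hidden + 1.0  # placeholder for an in_block
        skips.append(hidden)
    hidden = hidden * 2.0  # placeholder for the mid block
    for _ in range(num_blocks):  # "up" path: first in_block output pairs with last out_block
        hidden = skip_proj(torch.cat([hidden, skips.pop()], dim=-1))
        hidden = hidden - 1.0  # placeholder for the out_block itself
    return hidden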
+ self.transformer_out_blocks = nn.ModuleList( + [ + nn.ModuleDict( + { + "skip": SkipBlock( + inner_dim, + ), + "block": block_cls( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + final_dropout=ff_final_dropout, + ), + } + ) + for d in range(num_layers // 2) + ] + ) + + # 4. Define output layers + self.out_channels = in_channels if out_channels is None else out_channels + + # Following the UniDiffuser U-ViT implementation, we process the transformer output with + # a LayerNorm layer with per-element affine params + self.norm_out = nn.LayerNorm(inner_dim) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + class_labels=None, + cross_attention_kwargs=None, + return_dict: bool = True, + hidden_states_is_embedding: bool = False, + unpatchify: bool = True, + ): + """ + Args: + hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. + When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input + hidden_states + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.long`, *optional*): + Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): + Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels + conditioning. + cross_attention_kwargs (*optional*): + Keyword arguments to supply to the cross attention layers, if used. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + hidden_states_is_embedding (`bool`, *optional*, defaults to `False`): + Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will + ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the + transformer blocks. + unpatchify (`bool`, *optional*, defaults to `True`): + Whether to unpatchify the transformer output. + + Returns: + [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: + [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + # 0. Check inputs + + if not unpatchify and return_dict: + raise ValueError( + f"Cannot both define `unpatchify`: {unpatchify} and `return_dict`: {return_dict} since when" + f" `unpatchify` is {unpatchify} the returned output is of shape (batch_size, seq_len, hidden_dim)" + " rather than (batch_size, num_channels, height, width)." + ) + + # 1. Input + if not hidden_states_is_embedding: + hidden_states = self.pos_embed(hidden_states) + + # 2. 
Blocks + + # In ("downsample") blocks + skips = [] + for in_block in self.transformer_in_blocks: + hidden_states = in_block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + skips.append(hidden_states) + + # Mid block + hidden_states = self.transformer_mid_block(hidden_states) + + # Out ("upsample") blocks + for out_block in self.transformer_out_blocks: + hidden_states = out_block["skip"](hidden_states, skips.pop()) + hidden_states = out_block["block"]( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + + # 3. Output + # Don't support AdaLayerNorm for now, so no conditioning/scale/shift logic + hidden_states = self.norm_out(hidden_states) + # hidden_states = self.proj_out(hidden_states) + + if unpatchify: + # unpatchify + height = width = int(hidden_states.shape[1] ** 0.5) + hidden_states = hidden_states.reshape( + shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels) + ) + hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states) + output = hidden_states.reshape( + shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) + ) + else: + output = hidden_states + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) + + +class UniDiffuserModel(ModelMixin, ConfigMixin): + """ + Transformer model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is a + modification of [`UTransformer2DModel`] with input and output heads for the VAE-embedded latent image, the + CLIP-embedded image, and the CLIP-embedded prompt (see paper for more details). + + Parameters: + text_dim (`int`): The hidden dimension of the CLIP text model used to embed images. + clip_img_dim (`int`): The hidden dimension of the CLIP vision model used to embed prompts. + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input. + out_channels (`int`, *optional*): + The number of output channels; if `None`, defaults to `in_channels`. + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + norm_num_groups (`int`, *optional*, defaults to `32`): + The number of groups to use when performing Group Normalization. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + patch_size (`int`, *optional*, defaults to 2): + The patch size to use in the patch embedding. 
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. + The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more than steps than `num_embeds_ada_norm`. + use_linear_projection (int, *optional*): TODO: Not used + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used in each + transformer block. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float32 when performing the attention calculation. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`. + block_type (`str`, *optional*, defaults to `"unidiffuser"`): + The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual + backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard + behavior in `diffusers`.) + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm + (`pre_layer_norm = False`). + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. + use_patch_pos_embed (`bool`, *optional*): + Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`). + ff_final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. + use_data_type_embedding (`bool`, *optional*): + Whether to use a data type embedding. This is only relevant for UniDiffuser-v1 style models; UniDiffuser-v1 + is continue-trained from UniDiffuser-v0 on non-publically-available data and accepts a `data_type` + argument, which can either be `1` to use the weights trained on non-publically-available data or `0` + otherwise. This argument is subsequently embedded by the data type embedding, if used. + """ + + @register_to_config + def __init__( + self, + text_dim: int = 768, + clip_img_dim: int = 512, + num_text_tokens: int = 77, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + block_type: str = "unidiffuser", + pre_layer_norm: bool = False, + use_timestep_embedding=False, + norm_elementwise_affine: bool = True, + use_patch_pos_embed=False, + ff_final_dropout: bool = True, + use_data_type_embedding: bool = False, + ): + super().__init__() + + # 0. 
Handle dimensions + self.inner_dim = num_attention_heads * attention_head_dim + + assert sample_size is not None, "UniDiffuserModel over patched input must provide sample_size" + self.sample_size = sample_size + self.in_channels = in_channels + self.out_channels = in_channels if out_channels is None else out_channels + + self.patch_size = patch_size + # Assume image is square... + self.num_patches = (self.sample_size // patch_size) * (self.sample_size // patch_size) + + # 1. Define input layers + # 1.1 Input layers for text and image input + # For now, only support patch input for VAE latent image input + self.vae_img_in = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=self.inner_dim, + use_pos_embed=use_patch_pos_embed, + ) + self.clip_img_in = nn.Linear(clip_img_dim, self.inner_dim) + self.text_in = nn.Linear(text_dim, self.inner_dim) + + # 1.2. Timestep embeddings for t_img, t_text + self.timestep_img_proj = Timesteps( + self.inner_dim, + flip_sin_to_cos=True, + downscale_freq_shift=0, + ) + self.timestep_img_embed = ( + TimestepEmbedding( + self.inner_dim, + 4 * self.inner_dim, + out_dim=self.inner_dim, + ) + if use_timestep_embedding + else nn.Identity() + ) + + self.timestep_text_proj = Timesteps( + self.inner_dim, + flip_sin_to_cos=True, + downscale_freq_shift=0, + ) + self.timestep_text_embed = ( + TimestepEmbedding( + self.inner_dim, + 4 * self.inner_dim, + out_dim=self.inner_dim, + ) + if use_timestep_embedding + else nn.Identity() + ) + + # 1.3. Positional embedding + self.num_text_tokens = num_text_tokens + self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches + self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, self.inner_dim)) + self.pos_embed_drop = nn.Dropout(p=dropout) + trunc_normal_(self.pos_embed, std=0.02) + + # 1.4. Handle data type token embeddings for UniDiffuser-V1, if necessary + self.use_data_type_embedding = use_data_type_embedding + if self.use_data_type_embedding: + self.data_type_token_embedding = nn.Embedding(2, self.inner_dim) + self.data_type_pos_embed_token = nn.Parameter(torch.zeros(1, 1, self.inner_dim)) + + # 2. Define transformer blocks + self.transformer = UTransformer2DModel( + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + in_channels=in_channels, + out_channels=out_channels, + num_layers=num_layers, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + sample_size=sample_size, + num_vector_embeds=num_vector_embeds, + patch_size=patch_size, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + block_type=block_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + use_patch_pos_embed=use_patch_pos_embed, + ff_final_dropout=ff_final_dropout, + ) + + # 3. 
Define output layers
+        patch_dim = (patch_size**2) * out_channels
+        self.vae_img_out = nn.Linear(self.inner_dim, patch_dim)
+        self.clip_img_out = nn.Linear(self.inner_dim, clip_img_dim)
+        self.text_out = nn.Linear(self.inner_dim, text_dim)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {"pos_embed"}
+
+    def forward(
+        self,
+        latent_image_embeds: torch.FloatTensor,
+        image_embeds: torch.FloatTensor,
+        prompt_embeds: torch.FloatTensor,
+        timestep_img: Union[torch.Tensor, float, int],
+        timestep_text: Union[torch.Tensor, float, int],
+        data_type: Optional[Union[torch.Tensor, float, int]] = 1,
+        encoder_hidden_states=None,
+        cross_attention_kwargs=None,
+    ):
+        """
+        Args:
+            latent_image_embeds (`torch.FloatTensor` of shape `(batch size, latent channels, height, width)`):
+                Latent image representation from the VAE encoder.
+            image_embeds (`torch.FloatTensor` of shape `(batch size, 1, clip_img_dim)`):
+                CLIP-embedded image representation (unsqueezed in the first dimension).
+            prompt_embeds (`torch.FloatTensor` of shape `(batch size, seq_len, text_dim)`):
+                CLIP-embedded text representation.
+            timestep_img (`torch.long` or `float` or `int`):
+                Current denoising step for the image.
+            timestep_text (`torch.long` or `float` or `int`):
+                Current denoising step for the text.
+            data_type (`torch.int` or `float` or `int`, *optional*, defaults to `1`):
+                Only used in UniDiffuser-v1-style models. Can be either `1`, to use weights trained on nonpublic data,
+                or `0` otherwise.
+            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
+                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
+                self-attention.
+            cross_attention_kwargs (*optional*):
+                Keyword arguments to supply to the cross attention layers, if used.
+
+
+        Returns:
+            `tuple`: Returns relevant parts of the model's noise prediction: the first element of the tuple is the VAE
+            image embedding, the second element is the CLIP image embedding, and the third element is the CLIP text
+            embedding.
+        """
+        batch_size = latent_image_embeds.shape[0]
+
+        # 1. Input
+        # 1.1. Map inputs to shape (B, N, inner_dim)
+        vae_hidden_states = self.vae_img_in(latent_image_embeds)
+        clip_hidden_states = self.clip_img_in(image_embeds)
+        text_hidden_states = self.text_in(prompt_embeds)
+
+        num_text_tokens, num_img_tokens = text_hidden_states.size(1), vae_hidden_states.size(1)
+
+        # 1.2. Encode image timesteps to single token (B, 1, inner_dim)
+        if not torch.is_tensor(timestep_img):
+            timestep_img = torch.tensor([timestep_img], dtype=torch.long, device=vae_hidden_states.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timestep_img = timestep_img * torch.ones(batch_size, dtype=timestep_img.dtype, device=timestep_img.device)
+
+        timestep_img_token = self.timestep_img_proj(timestep_img)
+        # t_img_token does not contain any weights and will always return f32 tensors
+        # but time_embedding might be fp16, so we need to cast here.
+        timestep_img_token = timestep_img_token.to(dtype=self.dtype)
+        timestep_img_token = self.timestep_img_embed(timestep_img_token)
+        timestep_img_token = timestep_img_token.unsqueeze(dim=1)
+
+        # 1.3.
Encode text timesteps to single token (B, 1, inner_dim) + if not torch.is_tensor(timestep_text): + timestep_text = torch.tensor([timestep_text], dtype=torch.long, device=vae_hidden_states.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep_text = timestep_text * torch.ones(batch_size, dtype=timestep_text.dtype, device=timestep_text.device) + + timestep_text_token = self.timestep_text_proj(timestep_text) + # t_text_token does not contain any weights and will always return f32 tensors + # but time_embedding might be fp16, so we need to cast here. + timestep_text_token = timestep_text_token.to(dtype=self.dtype) + timestep_text_token = self.timestep_text_embed(timestep_text_token) + timestep_text_token = timestep_text_token.unsqueeze(dim=1) + + # 1.4. Concatenate all of the embeddings together. + if self.use_data_type_embedding: + assert data_type is not None, "data_type must be supplied if the model uses a data type embedding" + if not torch.is_tensor(data_type): + data_type = torch.tensor([data_type], dtype=torch.int, device=vae_hidden_states.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + data_type = data_type * torch.ones(batch_size, dtype=data_type.dtype, device=data_type.device) + + data_type_token = self.data_type_token_embedding(data_type).unsqueeze(dim=1) + hidden_states = torch.cat( + [ + timestep_img_token, + timestep_text_token, + data_type_token, + text_hidden_states, + clip_hidden_states, + vae_hidden_states, + ], + dim=1, + ) + else: + hidden_states = torch.cat( + [timestep_img_token, timestep_text_token, text_hidden_states, clip_hidden_states, vae_hidden_states], + dim=1, + ) + + # 1.5. Prepare the positional embeddings and add to hidden states + # Note: I think img_vae should always have the proper shape, so there's no need to interpolate + # the position embeddings. + if self.use_data_type_embedding: + pos_embed = torch.cat( + [self.pos_embed[:, : 1 + 1, :], self.data_type_pos_embed_token, self.pos_embed[:, 1 + 1 :, :]], dim=1 + ) + else: + pos_embed = self.pos_embed + hidden_states = hidden_states + pos_embed + hidden_states = self.pos_embed_drop(hidden_states) + + # 2. Blocks + hidden_states = self.transformer( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=None, + class_labels=None, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + hidden_states_is_embedding=True, + unpatchify=False, + )[0] + + # 3. Output + # Split out the predicted noise representation. 
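# Quick sanity sketch of the token sequence assembled above, for a hypothetical UniDiffuser-like
# configuration (sample_size=64, patch_size=2, 77 text tokens); the layout follows the concatenation
# order above and the split performed below: [t_img | t_text | (data_type) | text | clip | vae].
def token_layout_sketch(sample_size=64, patch_size=2, num_text_tokens=77, use_data_type_embedding=False):
    num_patches = (sample_size // patch_size) ** 2
    layout = [1, 1] + ([1] if use_data_type_embedding else []) + [num_text_tokens, 1, num_patches]
    return layout, sum(layout)  # e.g. ([1, 1, 77, 1, 1024], 1104) without the data type token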
+ if self.use_data_type_embedding: + ( + t_img_token_out, + t_text_token_out, + data_type_token_out, + text_out, + img_clip_out, + img_vae_out, + ) = hidden_states.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) + else: + t_img_token_out, t_text_token_out, text_out, img_clip_out, img_vae_out = hidden_states.split( + (1, 1, num_text_tokens, 1, num_img_tokens), dim=1 + ) + + img_vae_out = self.vae_img_out(img_vae_out) + + # unpatchify + height = width = int(img_vae_out.shape[1] ** 0.5) + img_vae_out = img_vae_out.reshape( + shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels) + ) + img_vae_out = torch.einsum("nhwpqc->nchpwq", img_vae_out) + img_vae_out = img_vae_out.reshape( + shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) + ) + + img_clip_out = self.clip_img_out(img_clip_out) + + text_out = self.text_out(text_out) + + return img_vae_out, img_clip_out, text_out diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py new file mode 100644 index 000000000..5d61b1054 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -0,0 +1,1419 @@ +import inspect +from dataclasses import dataclass +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + GPT2Tokenizer, +) + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL +from ...models.lora import adjust_lora_scale_text_encoder +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import USE_PEFT_BACKEND, deprecate, logging, scale_lora_layers, unscale_lora_layers +from ...utils.outputs import BaseOutput +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .modeling_text_decoder import UniDiffuserTextDecoder +from .modeling_uvit import UniDiffuserModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# New BaseOutput child class for joint image-text output +@dataclass +class ImageTextPipelineOutput(BaseOutput): + """ + Output class for joint image-text pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. + text (`List[str]` or `List[List[str]]`) + List of generated text strings of length `batch_size` or a list of list of strings whose outer list has + length `batch_size`. + """ + + images: Optional[Union[List[PIL.Image.Image], np.ndarray]] + text: Optional[Union[List[str], List[List[str]]]] + + +class UniDiffuserPipeline(DiffusionPipeline): + r""" + Pipeline for a bimodal image-text model which supports unconditional text and image generation, text-conditioned + image generation, image-conditioned text generation, and joint image-text generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. This
+            is part of the UniDiffuser image representation along with the CLIP vision encoding.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
+        image_encoder ([`CLIPVisionModelWithProjection`]):
+            A [`~transformers.CLIPVisionModelWithProjection`] to encode images as part of its image representation
+            along with the VAE latent representation.
+        clip_image_processor ([`CLIPImageProcessor`]):
+            A [`~transformers.CLIPImageProcessor`] to preprocess an image before CLIP encoding it with `image_encoder`.
+        clip_tokenizer ([`CLIPTokenizer`]):
+            A [`~transformers.CLIPTokenizer`] to tokenize the prompt before encoding it with `text_encoder`.
+        text_decoder ([`UniDiffuserTextDecoder`]):
+            Frozen text decoder. This is a GPT-style model which is used to generate text from the UniDiffuser
+            embedding.
+        text_tokenizer ([`GPT2Tokenizer`]):
+            A [`~transformers.GPT2Tokenizer`] to decode text for text generation; used along with the `text_decoder`.
+        unet ([`UniDiffuserModel`]):
+            A [U-ViT](https://github.com/baofff/U-ViT) model with U-Net-style skip connections between transformer
+            layers to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image and/or text latents. The
+            original UniDiffuser paper uses the [`DPMSolverMultistepScheduler`] scheduler.
+    """
+
+    # TODO: support for moving submodules for components with enable_model_cpu_offload
+    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae->text_decoder"
+
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        image_encoder: CLIPVisionModelWithProjection,
+        clip_image_processor: CLIPImageProcessor,
+        clip_tokenizer: CLIPTokenizer,
+        text_decoder: UniDiffuserTextDecoder,
+        text_tokenizer: GPT2Tokenizer,
+        unet: UniDiffuserModel,
+        scheduler: KarrasDiffusionSchedulers,
+    ):
+        super().__init__()
+
+        if text_encoder.config.hidden_size != text_decoder.prefix_inner_dim:
+            raise ValueError(
+                f"The text encoder hidden size and text decoder prefix inner dim must be the same, but"
+                f" `text_encoder.config.hidden_size`: {text_encoder.config.hidden_size} and `text_decoder.prefix_inner_dim`: {text_decoder.prefix_inner_dim}"
+            )
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            image_encoder=image_encoder,
+            clip_image_processor=clip_image_processor,
+            clip_tokenizer=clip_tokenizer,
+            text_decoder=text_decoder,
+            text_tokenizer=text_tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
+
+        self.num_channels_latents = vae.config.latent_channels
+        self.text_encoder_seq_len = text_encoder.config.max_position_embeddings
+        self.text_encoder_hidden_size = text_encoder.config.hidden_size
+        self.image_encoder_projection_dim = image_encoder.config.projection_dim
+        self.unet_resolution = unet.config.sample_size
+
+        self.text_intermediate_dim = self.text_encoder_hidden_size
+        if self.text_decoder.prefix_hidden_dim is not None:
+            self.text_intermediate_dim = self.text_decoder.prefix_hidden_dim
+
+        self.mode = None
+
+        # TODO: handle safety checking?
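# End-to-end usage sketch for this pipeline; the checkpoint id "thu-ml/unidiffuser-v1" is assumed
# to be available on the Hugging Face Hub, and a CUDA device is assumed to be present:
def unidiffuser_usage_sketch():
    import torch
    from diffusers import UniDiffuserPipeline

    pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16)
    pipe = pipe.to("cuda")

    # Text-to-image: supplying only a prompt makes `_infer_mode` pick "text2img".
    sample = pipe(prompt="an elephant under the sea", num_inference_steps=20, guidance_scale=8.0)
    image = sample.images[0]

    # Image-to-text: supplying only an image switches the inferred mode to "img2text".
    sample = pipe(image=image, num_inference_steps=20)
    caption = sample.text[0]
    return image, caption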
+ self.safety_checker = None + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def _infer_mode(self, prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents): + r""" + Infer the generation task ('mode') from the inputs to `__call__`. If the mode has been manually set, the set + mode will be used. + """ + prompt_available = (prompt is not None) or (prompt_embeds is not None) + image_available = image is not None + input_available = prompt_available or image_available + + prompt_latents_available = prompt_latents is not None + vae_latents_available = vae_latents is not None + clip_latents_available = clip_latents is not None + full_latents_available = latents is not None + image_latents_available = vae_latents_available and clip_latents_available + all_indv_latents_available = prompt_latents_available and image_latents_available + + if self.mode is not None: + # Preferentially use the mode set by the user + mode = self.mode + elif prompt_available: + mode = "text2img" + elif image_available: + mode = "img2text" + else: + # Neither prompt nor image supplied, infer based on availability of latents + if full_latents_available or all_indv_latents_available: + mode = "joint" + elif prompt_latents_available: + mode = "text" + elif image_latents_available: + mode = "img" + else: + # No inputs or latents available + mode = "joint" + + # Give warnings for ambiguous cases + if self.mode is None and prompt_available and image_available: + logger.warning( + f"You have supplied both a text prompt and image to the pipeline and mode has not been set manually," + f" defaulting to mode '{mode}'." + ) + + if self.mode is None and not input_available: + if vae_latents_available != clip_latents_available: + # Exactly one of vae_latents and clip_latents is supplied + logger.warning( + f"You have supplied exactly one of `vae_latents` and `clip_latents`, whereas either both or none" + f" are expected to be supplied. Defaulting to mode '{mode}'." + ) + elif not prompt_latents_available and not vae_latents_available and not clip_latents_available: + # No inputs or latents supplied + logger.warning( + f"No inputs or latents have been supplied, and mode has not been manually set," + f" defaulting to mode '{mode}'." + ) + + return mode + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. 
+ """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Functions to manually set the mode + def set_text_mode(self): + r"""Manually set the generation mode to unconditional ("marginal") text generation.""" + self.mode = "text" + + def set_image_mode(self): + r"""Manually set the generation mode to unconditional ("marginal") image generation.""" + self.mode = "img" + + def set_text_to_image_mode(self): + r"""Manually set the generation mode to text-conditioned image generation.""" + self.mode = "text2img" + + def set_image_to_text_mode(self): + r"""Manually set the generation mode to image-conditioned text generation.""" + self.mode = "img2text" + + def set_joint_mode(self): + r"""Manually set the generation mode to unconditional joint image-text generation.""" + self.mode = "joint" + + def reset_mode(self): + r"""Removes a manually set mode; after calling this, the pipeline will infer the mode from inputs.""" + self.mode = None + + def _infer_batch_size( + self, + mode, + prompt, + prompt_embeds, + image, + num_images_per_prompt, + num_prompts_per_image, + latents, + prompt_latents, + vae_latents, + clip_latents, + ): + r"""Infers the batch size and multiplier depending on mode and supplied arguments to `__call__`.""" + if num_images_per_prompt is None: + num_images_per_prompt = 1 + if num_prompts_per_image is None: + num_prompts_per_image = 1 + + assert num_images_per_prompt > 0, "num_images_per_prompt must be a positive integer" + assert num_prompts_per_image > 0, "num_prompts_per_image must be a positive integer" + + if mode in ["text2img"]: + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + # Either prompt or prompt_embeds must be present for text2img. + batch_size = prompt_embeds.shape[0] + multiplier = num_images_per_prompt + elif mode in ["img2text"]: + if isinstance(image, PIL.Image.Image): + batch_size = 1 + else: + # Image must be available and type either PIL.Image.Image or torch.FloatTensor. + # Not currently supporting something like image_embeds. 
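# Sketch of how the manual mode setters defined above interact with mode inference
# (`pipe` is a hypothetical, already-loaded UniDiffuserPipeline instance):
def mode_selection_sketch(pipe):
    pipe.set_joint_mode()  # force unconditional joint image+text generation
    sample = pipe(num_inference_steps=20)  # no prompt or image is needed in joint mode
    image, text = sample.images[0], sample.text[0]

    pipe.reset_mode()  # later calls infer the mode from the supplied inputs again
    return image, text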
+ batch_size = image.shape[0] + multiplier = num_prompts_per_image + elif mode in ["img"]: + if vae_latents is not None: + batch_size = vae_latents.shape[0] + elif clip_latents is not None: + batch_size = clip_latents.shape[0] + else: + batch_size = 1 + multiplier = num_images_per_prompt + elif mode in ["text"]: + if prompt_latents is not None: + batch_size = prompt_latents.shape[0] + else: + batch_size = 1 + multiplier = num_prompts_per_image + elif mode in ["joint"]: + if latents is not None: + batch_size = latents.shape[0] + elif prompt_latents is not None: + batch_size = prompt_latents.shape[0] + elif vae_latents is not None: + batch_size = vae_latents.shape[0] + elif clip_latents is not None: + batch_size = clip_latents.shape[0] + else: + batch_size = 1 + + if num_images_per_prompt == num_prompts_per_image: + multiplier = num_images_per_prompt + else: + multiplier = min(num_images_per_prompt, num_prompts_per_image) + logger.warning( + f"You are using mode `{mode}` and `num_images_per_prompt`: {num_images_per_prompt} and" + f" num_prompts_per_image: {num_prompts_per_image} are not equal. Using batch size equal to" + f" `min(num_images_per_prompt, num_prompts_per_image) = {batch_size}." + ) + return batch_size, multiplier + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt with self.tokenizer->self.clip_tokenizer + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.clip_tokenizer) + + text_inputs = self.clip_tokenizer( + prompt, + padding="max_length", + max_length=self.clip_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.clip_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.clip_tokenizer.batch_decode( + untruncated_ids[:, self.clip_tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.clip_tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
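# Toy illustration of the `-(clip_skip + 1)` indexing applied just below, using a hand-built tuple
# of per-layer activations (shapes and names here are stand-ins, not pipeline attributes):
def clip_skip_index_sketch(clip_skip: int = 1):
    import torch

    batch, seq_len, hidden = 1, 77, 768
    # output_hidden_states=True yields the embedding output plus one entry per encoder layer
    hidden_states = tuple(torch.randn(batch, seq_len, hidden) for _ in range(13))
    selected = hidden_states[-(clip_skip + 1)]  # clip_skip=1 -> the pre-final layer
    return selected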
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.clip_tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.clip_tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents + # Add num_prompts_per_image argument, sample from autoencoder moment distribution + def encode_image_vae_latents( + self, + image, + batch_size, + num_prompts_per_image, + dtype, + device, + do_classifier_free_guidance, + generator=None, + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + 
) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_prompts_per_image + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) + * self.vae.config.scaling_factor + for i in range(batch_size) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) + # Scale image_latents by the VAE's scaling factor + image_latents = image_latents * self.vae.config.scaling_factor + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + if do_classifier_free_guidance: + uncond_image_latents = torch.zeros_like(image_latents) + image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0) + + return image_latents + + def encode_image_clip_latents( + self, + image, + batch_size, + num_prompts_per_image, + dtype, + device, + generator=None, + ): + # Map image to CLIP embedding. + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + preprocessed_image = self.clip_image_processor.preprocess( + image, + return_tensors="pt", + ) + preprocessed_image = preprocessed_image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_prompts_per_image + if isinstance(generator, list): + image_latents = [ + self.image_encoder(**preprocessed_image[i : i + 1]).image_embeds for i in range(batch_size) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.image_encoder(**preprocessed_image).image_embeds + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + return image_latents + + def prepare_text_latents( + self, batch_size, num_images_per_prompt, seq_len, hidden_size, dtype, device, generator, latents=None + ): + # Prepare latents for the CLIP embedded prompt. + shape = (batch_size * num_images_per_prompt, seq_len, hidden_size) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + # latents is assumed to have shape (B, L, D) + latents = latents.repeat(num_images_per_prompt, 1, 1) + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + # Rename prepare_latents -> prepare_image_vae_latents and add num_prompts_per_image argument. + def prepare_image_vae_latents( + self, + batch_size, + num_prompts_per_image, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + shape = ( + batch_size * num_prompts_per_image, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + # latents is assumed to have shape (B, C, H, W) + latents = latents.repeat(num_prompts_per_image, 1, 1, 1) + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_image_clip_latents( + self, batch_size, num_prompts_per_image, clip_img_dim, dtype, device, generator, latents=None + ): + # Prepare latents for the CLIP embedded image. + shape = (batch_size * num_prompts_per_image, 1, clip_img_dim) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. 
Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + # latents is assumed to have shape (B, L, D) + latents = latents.repeat(num_prompts_per_image, 1, 1) + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def decode_text_latents(self, text_latents, device): + output_token_list, seq_lengths = self.text_decoder.generate_captions( + text_latents, self.text_tokenizer.eos_token_id, device=device + ) + output_list = output_token_list.cpu().numpy() + generated_text = [ + self.text_tokenizer.decode(output[: int(length)], skip_special_tokens=True) + for output, length in zip(output_list, seq_lengths) + ] + return generated_text + + def _split(self, x, height, width): + r""" + Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim) into two tensors of shape (B, C, H, W) + and (B, 1, clip_img_dim) + """ + batch_size = x.shape[0] + latent_height = height // self.vae_scale_factor + latent_width = width // self.vae_scale_factor + img_vae_dim = self.num_channels_latents * latent_height * latent_width + + img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_projection_dim], dim=1) + + img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width)) + img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim)) + return img_vae, img_clip + + def _combine(self, img_vae, img_clip): + r""" + Combines a latent image img_vae of shape (B, C, H, W) and a CLIP-embedded image img_clip of shape (B, 1, + clip_img_dim) into a single tensor of shape (B, C * H * W + clip_img_dim). + """ + img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1)) + img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1)) + return torch.concat([img_vae, img_clip], dim=-1) + + def _split_joint(self, x, height, width): + r""" + Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim + text_seq_len * text_dim) into (img_vae, + img_clip, text) where img_vae is of shape (B, C, H, W), img_clip is of shape (B, 1, clip_img_dim), and text is + of shape (B, text_seq_len, text_dim). + """ + batch_size = x.shape[0] + latent_height = height // self.vae_scale_factor + latent_width = width // self.vae_scale_factor + img_vae_dim = self.num_channels_latents * latent_height * latent_width + text_dim = self.text_encoder_seq_len * self.text_intermediate_dim + + img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_projection_dim, text_dim], dim=1) + + img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width)) + img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim)) + text = torch.reshape(text, (batch_size, self.text_encoder_seq_len, self.text_intermediate_dim)) + return img_vae, img_clip, text + + def _combine_joint(self, img_vae, img_clip, text): + r""" + Combines a latent image img_vae of shape (B, C, H, W), a CLIP-embedded image img_clip of shape (B, L_img, + clip_img_dim), and a text embedding text of shape (B, L_text, text_dim) into a single embedding x of shape (B, + C * H * W + L_img * clip_img_dim + L_text * text_dim). 
+ """ + img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1)) + img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1)) + text = torch.reshape(text, (text.shape[0], -1)) + return torch.concat([img_vae, img_clip, text], dim=-1) + + def _get_noise_pred( + self, + mode, + latents, + t, + prompt_embeds, + img_vae, + img_clip, + max_timestep, + data_type, + guidance_scale, + generator, + device, + height, + width, + ): + r""" + Gets the noise prediction using the `unet` and performs classifier-free guidance, if necessary. + """ + if mode == "joint": + # Joint text-image generation + img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width) + + img_vae_out, img_clip_out, text_out = self.unet( + img_vae_latents, img_clip_latents, text_latents, timestep_img=t, timestep_text=t, data_type=data_type + ) + + x_out = self._combine_joint(img_vae_out, img_clip_out, text_out) + + if guidance_scale <= 1.0: + return x_out + + # Classifier-free guidance + img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype) + text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype) + + _, _, text_out_uncond = self.unet( + img_vae_T, img_clip_T, text_latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type + ) + + img_vae_out_uncond, img_clip_out_uncond, _ = self.unet( + img_vae_latents, + img_clip_latents, + text_T, + timestep_img=t, + timestep_text=max_timestep, + data_type=data_type, + ) + + x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) + + return guidance_scale * x_out + (1.0 - guidance_scale) * x_out_uncond + elif mode == "text2img": + # Text-conditioned image generation + img_vae_latents, img_clip_latents = self._split(latents, height, width) + + img_vae_out, img_clip_out, text_out = self.unet( + img_vae_latents, img_clip_latents, prompt_embeds, timestep_img=t, timestep_text=0, data_type=data_type + ) + + img_out = self._combine(img_vae_out, img_clip_out) + + if guidance_scale <= 1.0: + return img_out + + # Classifier-free guidance + text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype) + + img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( + img_vae_latents, + img_clip_latents, + text_T, + timestep_img=t, + timestep_text=max_timestep, + data_type=data_type, + ) + + img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond) + + return guidance_scale * img_out + (1.0 - guidance_scale) * img_out_uncond + elif mode == "img2text": + # Image-conditioned text generation + img_vae_out, img_clip_out, text_out = self.unet( + img_vae, img_clip, latents, timestep_img=0, timestep_text=t, data_type=data_type + ) + + if guidance_scale <= 1.0: + return text_out + + # Classifier-free guidance + img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype) + + img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( + img_vae_T, img_clip_T, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type + ) + + return guidance_scale * text_out + (1.0 - guidance_scale) * text_out_uncond + elif mode == "text": + # Unconditional ("marginal") text generation (no CFG) + img_vae_out, img_clip_out, text_out = 
self.unet( + img_vae, img_clip, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type + ) + + return text_out + elif mode == "img": + # Unconditional ("marginal") image generation (no CFG) + img_vae_latents, img_clip_latents = self._split(latents, height, width) + + img_vae_out, img_clip_out, text_out = self.unet( + img_vae_latents, + img_clip_latents, + prompt_embeds, + timestep_img=t, + timestep_text=max_timestep, + data_type=data_type, + ) + + img_out = self._combine(img_vae_out, img_clip_out) + return img_out + + def check_latents_shape(self, latents_name, latents, expected_shape): + latents_shape = latents.shape + expected_num_dims = len(expected_shape) + 1 # expected dimensions plus the batch dimension + expected_shape_str = ", ".join(str(dim) for dim in expected_shape) + if len(latents_shape) != expected_num_dims: + raise ValueError( + f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape" + f" {latents_shape} has {len(latents_shape)} dimensions." + ) + for i in range(1, expected_num_dims): + if latents_shape[i] != expected_shape[i - 1]: + raise ValueError( + f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape" + f" {latents_shape} has {latents_shape[i]} != {expected_shape[i - 1]} at dimension {i}." + ) + + def check_inputs( + self, + mode, + prompt, + image, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + latents=None, + prompt_latents=None, + vae_latents=None, + clip_latents=None, + ): + # Check inputs before running the generative process. + if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: + raise ValueError( + f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if mode == "text2img": + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + if mode == "img2text": + if image is None: + raise ValueError("`img2text` mode requires an image to be provided.") + + # Check provided latents + latent_height = height // self.vae_scale_factor + latent_width = width // self.vae_scale_factor + full_latents_available = latents is not None + prompt_latents_available = prompt_latents is not None + vae_latents_available = vae_latents is not None + clip_latents_available = clip_latents is not None + + if full_latents_available: + individual_latents_available = ( + prompt_latents is not None or vae_latents is not None or clip_latents is not None + ) + if individual_latents_available: + logger.warning( + "You have supplied both `latents` and at least one of `prompt_latents`, `vae_latents`, and" + " `clip_latents`. The value of `latents` will override the value of any individually supplied latents." + ) + # Check shape of full latents + img_vae_dim = self.num_channels_latents * latent_height * latent_width + text_dim = self.text_encoder_seq_len * self.text_encoder_hidden_size + latents_dim = img_vae_dim + self.image_encoder_projection_dim + text_dim + latents_expected_shape = (latents_dim,) + self.check_latents_shape("latents", latents, latents_expected_shape) + + # Check individual latent shapes, if present + if prompt_latents_available: + prompt_latents_expected_shape = (self.text_encoder_seq_len, self.text_encoder_hidden_size) + self.check_latents_shape("prompt_latents", prompt_latents, prompt_latents_expected_shape) + + if vae_latents_available: + vae_latents_expected_shape = (self.num_channels_latents, latent_height, latent_width) + self.check_latents_shape("vae_latents", vae_latents, vae_latents_expected_shape) + + if clip_latents_available: + clip_latents_expected_shape = (1, self.image_encoder_projection_dim) + self.check_latents_shape("clip_latents", clip_latents, clip_latents_expected_shape) + + if mode in ["text2img", "img"] and vae_latents_available and clip_latents_available: + if vae_latents.shape[0] != clip_latents.shape[0]: + raise ValueError( + f"Both `vae_latents` and `clip_latents` are supplied, but their batch dimensions are not equal:" + f" {vae_latents.shape[0]} != {clip_latents.shape[0]}." + ) + + if mode == "joint" and prompt_latents_available and vae_latents_available and clip_latents_available: + if prompt_latents.shape[0] != vae_latents.shape[0] or prompt_latents.shape[0] != clip_latents.shape[0]: + raise ValueError( + f"All of `prompt_latents`, `vae_latents`, and `clip_latents` are supplied, but their batch" + f" dimensions are not equal: {prompt_latents.shape[0]} != {vae_latents.shape[0]}" + f" != {clip_latents.shape[0]}." 
+ ) + + @torch.no_grad() + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + data_type: Optional[int] = 1, + num_inference_steps: int = 50, + guidance_scale: float = 8.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + num_prompts_per_image: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_latents: Optional[torch.FloatTensor] = None, + vae_latents: Optional[torch.FloatTensor] = None, + clip_latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + Required for text-conditioned image generation (`text2img`) mode. + image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*): + `Image` or tensor representing an image batch. Required for image-conditioned text generation + (`img2text`) mode. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + data_type (`int`, *optional*, defaults to 1): + The data type (either 0 or 1). Only used if you are loading a checkpoint which supports a data type + embedding; this is added for compatibility with the + [UniDiffuser-v1](https://huggingface.co/thu-ml/unidiffuser-v1) checkpoint. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 8.0): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). Used in + text-conditioned image generation (`text2img`) mode. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. Used in `text2img` (text-conditioned image generation) and + `img` mode. If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are + supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples are generated. + num_prompts_per_image (`int`, *optional*, defaults to 1): + The number of prompts to generate per image. Used in `img2text` (image-conditioned text generation) and + `text` mode. 
If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are + supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples are generated. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for joint + image-text generation. Can be used to tweak the same generation with different prompts. If not + provided, a latents tensor is generated by sampling using the supplied random `generator`. This assumes + a full set of VAE, CLIP, and text latents which, if supplied, overrides the values of `prompt_latents`, + `vae_latents`, and `clip_latents`. + prompt_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for text + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + vae_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + clip_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. Used in text-conditioned + image generation (`text2img`) mode. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. Used + in text-conditioned image generation (`text2img`) mode. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImageTextPipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that is called every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. 
+ + Returns: + [`~pipelines.unidiffuser.ImageTextPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.unidiffuser.ImageTextPipelineOutput`] is returned, otherwise a + `tuple` is returned where the first element is a list with the generated images and the second element + is a list of generated texts. + """ + + # 0. Default height and width to unet + height = height or self.unet_resolution * self.vae_scale_factor + width = width or self.unet_resolution * self.vae_scale_factor + + # 1. Check inputs + # Recalculate mode for each call to the pipeline. + mode = self._infer_mode(prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents) + self.check_inputs( + mode, + prompt, + image, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + latents, + prompt_latents, + vae_latents, + clip_latents, + ) + + # 2. Define call parameters + batch_size, multiplier = self._infer_batch_size( + mode, + prompt, + prompt_embeds, + image, + num_images_per_prompt, + num_prompts_per_image, + latents, + prompt_latents, + vae_latents, + clip_latents, + ) + device = self._execution_device + reduce_text_emb_dim = self.text_intermediate_dim < self.text_encoder_hidden_size or self.mode != "text2img" + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + # Note that this differs from the formulation in the unidiffusers paper! + do_classifier_free_guidance = guidance_scale > 1.0 + + # check if scheduler is in sigmas space + # scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas") + + # 3. Encode input prompt, if available; otherwise prepare text latents + if latents is not None: + # Overwrite individual latents + vae_latents, clip_latents, prompt_latents = self._split_joint(latents, height, width) + + if mode in ["text2img"]: + # 3.1. Encode input prompt, if available + assert prompt is not None or prompt_embeds is not None + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=multiplier, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # if do_classifier_free_guidance: + # prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + else: + # 3.2. Prepare text latent variables, if input not available + prompt_embeds = self.prepare_text_latents( + batch_size=batch_size, + num_images_per_prompt=multiplier, + seq_len=self.text_encoder_seq_len, + hidden_size=self.text_encoder_hidden_size, + dtype=self.text_encoder.dtype, # Should work with both full precision and mixed precision + device=device, + generator=generator, + latents=prompt_latents, + ) + + if reduce_text_emb_dim: + prompt_embeds = self.text_decoder.encode(prompt_embeds) + + # 4. Encode image, if available; otherwise prepare image latents + if mode in ["img2text"]: + # 4.1. 
Encode images, if available + assert image is not None, "`img2text` requires a conditioning image" + # Encode image using VAE + image_vae = self.image_processor.preprocess(image) + height, width = image_vae.shape[-2:] + image_vae_latents = self.encode_image_vae_latents( + image=image_vae, + batch_size=batch_size, + num_prompts_per_image=multiplier, + dtype=prompt_embeds.dtype, + device=device, + do_classifier_free_guidance=False, # Copied from InstructPix2Pix, don't use their version of CFG + generator=generator, + ) + + # Encode image using CLIP + image_clip_latents = self.encode_image_clip_latents( + image=image, + batch_size=batch_size, + num_prompts_per_image=multiplier, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + ) + # (batch_size, clip_hidden_size) => (batch_size, 1, clip_hidden_size) + image_clip_latents = image_clip_latents.unsqueeze(1) + else: + # 4.2. Prepare image latent variables, if input not available + # Prepare image VAE latents in latent space + image_vae_latents = self.prepare_image_vae_latents( + batch_size=batch_size, + num_prompts_per_image=multiplier, + num_channels_latents=self.num_channels_latents, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=vae_latents, + ) + + # Prepare image CLIP latents + image_clip_latents = self.prepare_image_clip_latents( + batch_size=batch_size, + num_prompts_per_image=multiplier, + clip_img_dim=self.image_encoder_projection_dim, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=clip_latents, + ) + + # 5. Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + # max_timestep = timesteps[0] + max_timestep = self.scheduler.config.num_train_timesteps + + # 6. Prepare latent variables + if mode == "joint": + latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds) + elif mode in ["text2img", "img"]: + latents = self._combine(image_vae_latents, image_clip_latents) + elif mode in ["img2text", "text"]: + latents = prompt_embeds + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + logger.debug(f"Scheduler extra step kwargs: {extra_step_kwargs}") + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # predict the noise residual + # Also applies classifier-free guidance as described in the UniDiffuser paper + noise_pred = self._get_noise_pred( + mode, + latents, + t, + prompt_embeds, + image_vae_latents, + image_clip_latents, + max_timestep, + data_type, + guidance_scale, + generator, + device, + height, + width, + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 9. 
Post-processing + image = None + text = None + if mode == "joint": + image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width) + + if not output_type == "latent": + # Map latent VAE image back to pixel space + image = self.vae.decode(image_vae_latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = image_vae_latents + + text = self.decode_text_latents(text_latents, device) + elif mode in ["text2img", "img"]: + image_vae_latents, image_clip_latents = self._split(latents, height, width) + + if not output_type == "latent": + # Map latent VAE image back to pixel space + image = self.vae.decode(image_vae_latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = image_vae_latents + elif mode in ["img2text", "text"]: + text_latents = latents + text = self.decode_text_latents(text_latents, device) + + self.maybe_free_model_hooks() + + # 10. Postprocess the image, if necessary + if image is not None: + do_denormalize = [True] * image.shape[0] + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, text) + + return ImageTextPipelineOutput(images=image, text=text) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py new file mode 100644 index 000000000..ddb852d19 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py @@ -0,0 +1,56 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_torch_and_transformers_objects + + _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects)) +else: + _import_structure["modeling_paella_vq_model"] = ["PaellaVQModel"] + _import_structure["modeling_wuerstchen_diffnext"] = ["WuerstchenDiffNeXt"] + _import_structure["modeling_wuerstchen_prior"] = ["WuerstchenPrior"] + _import_structure["pipeline_wuerstchen"] = ["WuerstchenDecoderPipeline"] + _import_structure["pipeline_wuerstchen_combined"] = ["WuerstchenCombinedPipeline"] + _import_structure["pipeline_wuerstchen_prior"] = ["DEFAULT_STAGE_C_TIMESTEPS", "WuerstchenPriorPipeline"] + + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 + else: + from .modeling_paella_vq_model import PaellaVQModel + from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt + from .modeling_wuerstchen_prior import WuerstchenPrior + from .pipeline_wuerstchen import WuerstchenDecoderPipeline + from .pipeline_wuerstchen_combined import WuerstchenCombinedPipeline + from .pipeline_wuerstchen_prior import DEFAULT_STAGE_C_TIMESTEPS, 
WuerstchenPriorPipeline +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py new file mode 100644 index 000000000..3b21dfb5f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py @@ -0,0 +1,172 @@ +# Copyright (c) 2022 Dominic Rampas MIT License +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models.autoencoders.vae import DecoderOutput, VectorQuantizer +from ...models.modeling_utils import ModelMixin +from ...models.vq_model import VQEncoderOutput +from ...utils.accelerate_utils import apply_forward_hook + + +class MixingResidualBlock(nn.Module): + """ + Residual block with mixing used by Paella's VQ-VAE. + """ + + def __init__(self, inp_channels, embed_dim): + super().__init__() + # depthwise + self.norm1 = nn.LayerNorm(inp_channels, elementwise_affine=False, eps=1e-6) + self.depthwise = nn.Sequential( + nn.ReplicationPad2d(1), nn.Conv2d(inp_channels, inp_channels, kernel_size=3, groups=inp_channels) + ) + + # channelwise + self.norm2 = nn.LayerNorm(inp_channels, elementwise_affine=False, eps=1e-6) + self.channelwise = nn.Sequential( + nn.Linear(inp_channels, embed_dim), nn.GELU(), nn.Linear(embed_dim, inp_channels) + ) + + self.gammas = nn.Parameter(torch.zeros(6), requires_grad=True) + + def forward(self, x): + mods = self.gammas + x_temp = self.norm1(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * (1 + mods[0]) + mods[1] + x = x + self.depthwise(x_temp) * mods[2] + x_temp = self.norm2(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * (1 + mods[3]) + mods[4] + x = x + self.channelwise(x_temp.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) * mods[5] + return x + + +class PaellaVQModel(ModelMixin, ConfigMixin): + r"""VQ-VAE model from Paella model. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + up_down_scale_factor (int, *optional*, defaults to 2): Up and Downscale factor of the input image. + levels (int, *optional*, defaults to 2): Number of levels in the model. 
+ bottleneck_blocks (int, *optional*, defaults to 12): Number of bottleneck blocks in the model. + embed_dim (int, *optional*, defaults to 384): Number of hidden channels in the model. + latent_channels (int, *optional*, defaults to 4): Number of latent channels in the VQ-VAE model. + num_vq_embeddings (int, *optional*, defaults to 8192): Number of codebook vectors in the VQ-VAE. + scale_factor (float, *optional*, defaults to 0.3764): Scaling factor of the latent space. + """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_down_scale_factor: int = 2, + levels: int = 2, + bottleneck_blocks: int = 12, + embed_dim: int = 384, + latent_channels: int = 4, + num_vq_embeddings: int = 8192, + scale_factor: float = 0.3764, + ): + super().__init__() + + c_levels = [embed_dim // (2**i) for i in reversed(range(levels))] + # Encoder blocks + self.in_block = nn.Sequential( + nn.PixelUnshuffle(up_down_scale_factor), + nn.Conv2d(in_channels * up_down_scale_factor**2, c_levels[0], kernel_size=1), + ) + down_blocks = [] + for i in range(levels): + if i > 0: + down_blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1)) + block = MixingResidualBlock(c_levels[i], c_levels[i] * 4) + down_blocks.append(block) + down_blocks.append( + nn.Sequential( + nn.Conv2d(c_levels[-1], latent_channels, kernel_size=1, bias=False), + nn.BatchNorm2d(latent_channels), # then normalize them to have mean 0 and std 1 + ) + ) + self.down_blocks = nn.Sequential(*down_blocks) + + # Vector Quantizer + self.vquantizer = VectorQuantizer(num_vq_embeddings, vq_embed_dim=latent_channels, legacy=False, beta=0.25) + + # Decoder blocks + up_blocks = [nn.Sequential(nn.Conv2d(latent_channels, c_levels[-1], kernel_size=1))] + for i in range(levels): + for j in range(bottleneck_blocks if i == 0 else 1): + block = MixingResidualBlock(c_levels[levels - 1 - i], c_levels[levels - 1 - i] * 4) + up_blocks.append(block) + if i < levels - 1: + up_blocks.append( + nn.ConvTranspose2d( + c_levels[levels - 1 - i], c_levels[levels - 2 - i], kernel_size=4, stride=2, padding=1 + ) + ) + self.up_blocks = nn.Sequential(*up_blocks) + self.out_block = nn.Sequential( + nn.Conv2d(c_levels[0], out_channels * up_down_scale_factor**2, kernel_size=1), + nn.PixelShuffle(up_down_scale_factor), + ) + + @apply_forward_hook + def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + h = self.in_block(x) + h = self.down_blocks(h) + + if not return_dict: + return (h,) + + return VQEncoderOutput(latents=h) + + @apply_forward_hook + def decode( + self, h: torch.FloatTensor, force_not_quantize: bool = True, return_dict: bool = True + ) -> Union[DecoderOutput, torch.FloatTensor]: + if not force_not_quantize: + quant, _, _ = self.vquantizer(h) + else: + quant = h + + x = self.up_blocks(quant) + dec = self.out_block(x) + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
+ """ + x = sample + h = self.encode(x).latents + dec = self.decode(h).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py new file mode 100644 index 000000000..101acafcf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn + +from ...models.attention_processor import Attention + + +class WuerstchenLayerNorm(nn.LayerNorm): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def forward(self, x): + x = x.permute(0, 2, 3, 1) + x = super().forward(x) + return x.permute(0, 3, 1, 2) + + +class TimestepBlock(nn.Module): + def __init__(self, c, c_timestep): + super().__init__() + linear_cls = nn.Linear + self.mapper = linear_cls(c_timestep, c * 2) + + def forward(self, x, t): + a, b = self.mapper(t)[:, :, None, None].chunk(2, dim=1) + return x * (1 + a) + b + + +class ResBlock(nn.Module): + def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0): + super().__init__() + + conv_cls = nn.Conv2d + linear_cls = nn.Linear + + self.depthwise = conv_cls(c + c_skip, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c) + self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6) + self.channelwise = nn.Sequential( + linear_cls(c, c * 4), nn.GELU(), GlobalResponseNorm(c * 4), nn.Dropout(dropout), linear_cls(c * 4, c) + ) + + def forward(self, x, x_skip=None): + x_res = x + if x_skip is not None: + x = torch.cat([x, x_skip], dim=1) + x = self.norm(self.depthwise(x)).permute(0, 2, 3, 1) + x = self.channelwise(x).permute(0, 3, 1, 2) + return x + x_res + + +# from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105 +class GlobalResponseNorm(nn.Module): + def __init__(self, dim): + super().__init__() + self.gamma = nn.Parameter(torch.zeros(1, 1, 1, dim)) + self.beta = nn.Parameter(torch.zeros(1, 1, 1, dim)) + + def forward(self, x): + agg_norm = torch.norm(x, p=2, dim=(1, 2), keepdim=True) + stand_div_norm = agg_norm / (agg_norm.mean(dim=-1, keepdim=True) + 1e-6) + return self.gamma * (x * stand_div_norm) + self.beta + x + + +class AttnBlock(nn.Module): + def __init__(self, c, c_cond, nhead, self_attn=True, dropout=0.0): + super().__init__() + + linear_cls = nn.Linear + + self.self_attn = self_attn + self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6) + self.attention = Attention(query_dim=c, heads=nhead, dim_head=c // nhead, dropout=dropout, bias=True) + self.kv_mapper = nn.Sequential(nn.SiLU(), linear_cls(c_cond, c)) + + def forward(self, x, kv): + kv = self.kv_mapper(kv) + norm_x = self.norm(x) + if self.self_attn: + batch_size, channel, _, _ = x.shape + kv = torch.cat([norm_x.view(batch_size, channel, -1).transpose(1, 2), kv], dim=1) + x = x + self.attention(norm_x, encoder_hidden_states=kv) + return x diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py new file mode 100644 index 000000000..6c06cc0e7 --- 
/dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py @@ -0,0 +1,254 @@ +# Copyright (c) 2023 Dominic Rampas MIT License +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models.modeling_utils import ModelMixin +from .modeling_wuerstchen_common import AttnBlock, GlobalResponseNorm, TimestepBlock, WuerstchenLayerNorm + + +class WuerstchenDiffNeXt(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + c_in=4, + c_out=4, + c_r=64, + patch_size=2, + c_cond=1024, + c_hidden=[320, 640, 1280, 1280], + nhead=[-1, 10, 20, 20], + blocks=[4, 4, 14, 4], + level_config=["CT", "CTA", "CTA", "CTA"], + inject_effnet=[False, True, True, True], + effnet_embd=16, + clip_embd=1024, + kernel_size=3, + dropout=0.1, + ): + super().__init__() + self.c_r = c_r + self.c_cond = c_cond + if not isinstance(dropout, list): + dropout = [dropout] * len(c_hidden) + + # CONDITIONING + self.clip_mapper = nn.Linear(clip_embd, c_cond) + self.effnet_mappers = nn.ModuleList( + [ + nn.Conv2d(effnet_embd, c_cond, kernel_size=1) if inject else None + for inject in inject_effnet + list(reversed(inject_effnet)) + ] + ) + self.seq_norm = nn.LayerNorm(c_cond, elementwise_affine=False, eps=1e-6) + + self.embedding = nn.Sequential( + nn.PixelUnshuffle(patch_size), + nn.Conv2d(c_in * (patch_size**2), c_hidden[0], kernel_size=1), + WuerstchenLayerNorm(c_hidden[0], elementwise_affine=False, eps=1e-6), + ) + + def get_block(block_type, c_hidden, nhead, c_skip=0, dropout=0): + if block_type == "C": + return ResBlockStageB(c_hidden, c_skip, kernel_size=kernel_size, dropout=dropout) + elif block_type == "A": + return AttnBlock(c_hidden, c_cond, nhead, self_attn=True, dropout=dropout) + elif block_type == "T": + return TimestepBlock(c_hidden, c_r) + else: + raise ValueError(f"Block type {block_type} not supported") + + # BLOCKS + # -- down blocks + self.down_blocks = nn.ModuleList() + for i in range(len(c_hidden)): + down_block = nn.ModuleList() + if i > 0: + down_block.append( + nn.Sequential( + WuerstchenLayerNorm(c_hidden[i - 1], elementwise_affine=False, eps=1e-6), + nn.Conv2d(c_hidden[i - 1], c_hidden[i], kernel_size=2, stride=2), + ) + ) + for _ in range(blocks[i]): + for block_type in level_config[i]: + c_skip = c_cond if inject_effnet[i] else 0 + down_block.append(get_block(block_type, c_hidden[i], nhead[i], c_skip=c_skip, dropout=dropout[i])) + self.down_blocks.append(down_block) + + # -- up blocks + self.up_blocks = nn.ModuleList() + for i in reversed(range(len(c_hidden))): + up_block = nn.ModuleList() + for j in range(blocks[i]): + for k, block_type in enumerate(level_config[i]): + c_skip = c_hidden[i] if i < len(c_hidden) - 1 and j == k == 0 else 0 + c_skip += c_cond if inject_effnet[i] else 0 + 
up_block.append(get_block(block_type, c_hidden[i], nhead[i], c_skip=c_skip, dropout=dropout[i])) + if i > 0: + up_block.append( + nn.Sequential( + WuerstchenLayerNorm(c_hidden[i], elementwise_affine=False, eps=1e-6), + nn.ConvTranspose2d(c_hidden[i], c_hidden[i - 1], kernel_size=2, stride=2), + ) + ) + self.up_blocks.append(up_block) + + # OUTPUT + self.clf = nn.Sequential( + WuerstchenLayerNorm(c_hidden[0], elementwise_affine=False, eps=1e-6), + nn.Conv2d(c_hidden[0], 2 * c_out * (patch_size**2), kernel_size=1), + nn.PixelShuffle(patch_size), + ) + + # --- WEIGHT INIT --- + self.apply(self._init_weights) + + def _init_weights(self, m): + # General init + if isinstance(m, (nn.Conv2d, nn.Linear)): + nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + for mapper in self.effnet_mappers: + if mapper is not None: + nn.init.normal_(mapper.weight, std=0.02) # conditionings + nn.init.normal_(self.clip_mapper.weight, std=0.02) # conditionings + nn.init.xavier_uniform_(self.embedding[1].weight, 0.02) # inputs + nn.init.constant_(self.clf[1].weight, 0) # outputs + + # blocks + for level_block in self.down_blocks + self.up_blocks: + for block in level_block: + if isinstance(block, ResBlockStageB): + block.channelwise[-1].weight.data *= np.sqrt(1 / sum(self.config.blocks)) + elif isinstance(block, TimestepBlock): + nn.init.constant_(block.mapper.weight, 0) + + def gen_r_embedding(self, r, max_positions=10000): + r = r * max_positions + half_dim = self.c_r // 2 + emb = math.log(max_positions) / (half_dim - 1) + emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp() + emb = r[:, None] * emb[None, :] + emb = torch.cat([emb.sin(), emb.cos()], dim=1) + if self.c_r % 2 == 1: # zero pad + emb = nn.functional.pad(emb, (0, 1), mode="constant") + return emb.to(dtype=r.dtype) + + def gen_c_embeddings(self, clip): + clip = self.clip_mapper(clip) + clip = self.seq_norm(clip) + return clip + + def _down_encode(self, x, r_embed, effnet, clip=None): + level_outputs = [] + for i, down_block in enumerate(self.down_blocks): + effnet_c = None + for block in down_block: + if isinstance(block, ResBlockStageB): + if effnet_c is None and self.effnet_mappers[i] is not None: + dtype = effnet.dtype + effnet_c = self.effnet_mappers[i]( + nn.functional.interpolate( + effnet.float(), size=x.shape[-2:], mode="bicubic", antialias=True, align_corners=True + ).to(dtype) + ) + skip = effnet_c if self.effnet_mappers[i] is not None else None + x = block(x, skip) + elif isinstance(block, AttnBlock): + x = block(x, clip) + elif isinstance(block, TimestepBlock): + x = block(x, r_embed) + else: + x = block(x) + level_outputs.insert(0, x) + return level_outputs + + def _up_decode(self, level_outputs, r_embed, effnet, clip=None): + x = level_outputs[0] + for i, up_block in enumerate(self.up_blocks): + effnet_c = None + for j, block in enumerate(up_block): + if isinstance(block, ResBlockStageB): + if effnet_c is None and self.effnet_mappers[len(self.down_blocks) + i] is not None: + dtype = effnet.dtype + effnet_c = self.effnet_mappers[len(self.down_blocks) + i]( + nn.functional.interpolate( + effnet.float(), size=x.shape[-2:], mode="bicubic", antialias=True, align_corners=True + ).to(dtype) + ) + skip = level_outputs[i] if j == 0 and i > 0 else None + if effnet_c is not None: + if skip is not None: + skip = torch.cat([skip, effnet_c], dim=1) + else: + skip = effnet_c + x = block(x, skip) + elif isinstance(block, AttnBlock): + x = block(x, clip) + elif isinstance(block, TimestepBlock): + x 
= block(x, r_embed) + else: + x = block(x) + return x + + def forward(self, x, r, effnet, clip=None, x_cat=None, eps=1e-3, return_noise=True): + if x_cat is not None: + x = torch.cat([x, x_cat], dim=1) + # Process the conditioning embeddings + r_embed = self.gen_r_embedding(r) + if clip is not None: + clip = self.gen_c_embeddings(clip) + + # Model Blocks + x_in = x + x = self.embedding(x) + level_outputs = self._down_encode(x, r_embed, effnet, clip) + x = self._up_decode(level_outputs, r_embed, effnet, clip) + a, b = self.clf(x).chunk(2, dim=1) + b = b.sigmoid() * (1 - eps * 2) + eps + if return_noise: + return (x_in - a) / b + else: + return a, b + + +class ResBlockStageB(nn.Module): + def __init__(self, c, c_skip=0, kernel_size=3, dropout=0.0): + super().__init__() + self.depthwise = nn.Conv2d(c, c, kernel_size=kernel_size, padding=kernel_size // 2, groups=c) + self.norm = WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6) + self.channelwise = nn.Sequential( + nn.Linear(c + c_skip, c * 4), + nn.GELU(), + GlobalResponseNorm(c * 4), + nn.Dropout(dropout), + nn.Linear(c * 4, c), + ) + + def forward(self, x, x_skip=None): + x_res = x + x = self.norm(self.depthwise(x)) + if x_skip is not None: + x = torch.cat([x, x_skip], dim=1) + x = self.channelwise(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + return x + x_res diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py new file mode 100644 index 000000000..8cc294eaf --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023 Dominic Rampas MIT License +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
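+# A minimal standalone sketch (not part of the upstream file) of the sinusoidal
+# timestep embedding that `WuerstchenPrior.gen_r_embedding` below computes; the
+# helper name and the assumption of an even `c_r` are illustrative only:
+#
+#     import math
+#     import torch
+#
+#     def sinusoidal_r_embedding(r: torch.Tensor, c_r: int = 64, max_positions: int = 10000) -> torch.Tensor:
+#         # Scale the normalized timestep, then build log-spaced frequencies.
+#         r = r * max_positions
+#         half_dim = c_r // 2
+#         freqs = torch.arange(half_dim, device=r.device).float().mul(-math.log(max_positions) / (half_dim - 1)).exp()
+#         emb = r[:, None] * freqs[None, :]
+#         # Concatenate the sin/cos halves -> shape (batch, c_r) for even c_r.
+#         return torch.cat([emb.sin(), emb.cos()], dim=1)
+#
+# The method defined below additionally zero-pads the embedding when `c_r` is
+# odd and casts the result back to `r.dtype`.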
+ +import math +from typing import Dict, Union + +import torch +import torch.nn as nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...loaders import PeftAdapterMixin, UNet2DConditionLoadersMixin +from ...models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from ...models.modeling_utils import ModelMixin +from ...utils import is_torch_version +from .modeling_wuerstchen_common import AttnBlock, ResBlock, TimestepBlock, WuerstchenLayerNorm + + +class WuerstchenPrior(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin): + unet_name = "prior" + _supports_gradient_checkpointing = True + + @register_to_config + def __init__(self, c_in=16, c=1280, c_cond=1024, c_r=64, depth=16, nhead=16, dropout=0.1): + super().__init__() + conv_cls = nn.Conv2d + linear_cls = nn.Linear + + self.c_r = c_r + self.projection = conv_cls(c_in, c, kernel_size=1) + self.cond_mapper = nn.Sequential( + linear_cls(c_cond, c), + nn.LeakyReLU(0.2), + linear_cls(c, c), + ) + + self.blocks = nn.ModuleList() + for _ in range(depth): + self.blocks.append(ResBlock(c, dropout=dropout)) + self.blocks.append(TimestepBlock(c, c_r)) + self.blocks.append(AttnBlock(c, c, nhead, self_attn=True, dropout=dropout)) + self.out = nn.Sequential( + WuerstchenLayerNorm(c, elementwise_affine=False, eps=1e-6), + conv_cls(c, c_in * 2, kernel_size=1), + ) + + self.gradient_checkpointing = False + self.set_default_attn_processor() + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def _set_gradient_checkpointing(self, module, value=False): + self.gradient_checkpointing = value + + def gen_r_embedding(self, r, max_positions=10000): + r = r * max_positions + half_dim = self.c_r // 2 + emb = math.log(max_positions) / (half_dim - 1) + emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp() + emb = r[:, None] * emb[None, :] + emb = torch.cat([emb.sin(), emb.cos()], dim=1) + if self.c_r % 2 == 1: # zero pad + emb = nn.functional.pad(emb, (0, 1), mode="constant") + return emb.to(dtype=r.dtype) + + def forward(self, x, r, c): + x_in = x + x = self.projection(x) + c_embed = self.cond_mapper(c) + r_embed = self.gen_r_embedding(r) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + for block in self.blocks: + if isinstance(block, AttnBlock): + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, c_embed, use_reentrant=False + ) + elif isinstance(block, TimestepBlock): + x = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), x, r_embed, use_reentrant=False + ) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, use_reentrant=False) + else: + for block in self.blocks: + if isinstance(block, AttnBlock): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, c_embed) + elif isinstance(block, TimestepBlock): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x, r_embed) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(block), x) + else: + for block in self.blocks: + if isinstance(block, AttnBlock): + x = block(x, c_embed) + elif isinstance(block, TimestepBlock): + x = block(x, r_embed) + else: + x = block(x) + a, b = self.out(x).chunk(2, dim=1) + return (x_in - a) / ((1 - b).abs() + 1e-5) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py new file mode 100644 index 000000000..e4277d58a --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py @@ -0,0 +1,438 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...schedulers import DDPMWuerstchenScheduler +from ...utils import deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from .modeling_paella_vq_model import PaellaVQModel +from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline + + >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained( + ... "warp-ai/wuerstchen-prior", torch_dtype=torch.float16 + ... ).to("cuda") + >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrain("warp-ai/wuerstchen", torch_dtype=torch.float16).to( + ... "cuda" + ... ) + + >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" + >>> prior_output = pipe(prompt) + >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt) + ``` +""" + + +class WuerstchenDecoderPipeline(DiffusionPipeline): + """ + Pipeline for generating images from the Wuerstchen model. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + tokenizer (`CLIPTokenizer`): + The CLIP tokenizer. + text_encoder (`CLIPTextModel`): + The CLIP text encoder. + decoder ([`WuerstchenDiffNeXt`]): + The WuerstchenDiffNeXt unet decoder. + vqgan ([`PaellaVQModel`]): + The VQGAN model. + scheduler ([`DDPMWuerstchenScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + latent_dim_scale (float, `optional`, defaults to 10.67): + Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are + height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and + width=int(24*10.67)=256 in order to match the training conditions. 
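+            (These 24x24 image embeddings correspond to what the accompanying `WuerstchenPriorPipeline` in this
+            patch produces for its default 1024x1024 request; see `resolution_multiple` in that pipeline.)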
+ """ + + model_cpu_offload_seq = "text_encoder->decoder->vqgan" + _callback_tensor_inputs = [ + "latents", + "text_encoder_hidden_states", + "negative_prompt_embeds", + "image_embeddings", + ] + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + decoder: WuerstchenDiffNeXt, + scheduler: DDPMWuerstchenScheduler, + vqgan: PaellaVQModel, + latent_dim_scale: float = 10.67, + ) -> None: + super().__init__() + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + decoder=decoder, + scheduler=scheduler, + vqgan=vqgan, + ) + self.register_to_config(latent_dim_scale=latent_dim_scale) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + attention_mask = attention_mask[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask.to(device)) + text_encoder_hidden_states = text_encoder_output.last_hidden_state + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + + uncond_text_encoder_hidden_states = None + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + negative_prompt_embeds_text_encoder_output = self.text_encoder( + uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device) + ) + + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + return text_encoder_hidden_states, uncond_text_encoder_hidden_states + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeddings: Union[torch.FloatTensor, List[torch.FloatTensor]], + prompt: Union[str, List[str]] = None, + num_inference_steps: int = 12, + timesteps: Optional[List[float]] = None, + guidance_scale: float = 0.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embedding (`torch.FloatTensor` or `List[torch.FloatTensor]`): + Image Embeddings either extracted from an image or generated by a Prior Model. + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_inference_steps (`int`, *optional*, defaults to 12): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 0.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting + `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely + linked to the text `prompt`, usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `decoder_guidance_scale` is less than `1`). 
+ num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, + otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image + embeddings. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + # 0. Define commonly used variables + device = self._execution_device + dtype = self.decoder.dtype + self._guidance_scale = guidance_scale + + # 1. Check inputs. Raise error if not correct + if not isinstance(prompt, list): + if isinstance(prompt, str): + prompt = [prompt] + else: + raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.") + + if self.do_classifier_free_guidance: + if negative_prompt is not None and not isinstance(negative_prompt, list): + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + else: + raise TypeError( + f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}." 
+ ) + + if isinstance(image_embeddings, list): + image_embeddings = torch.cat(image_embeddings, dim=0) + if isinstance(image_embeddings, np.ndarray): + image_embeddings = torch.Tensor(image_embeddings, device=device).to(dtype=dtype) + if not isinstance(image_embeddings, torch.Tensor): + raise TypeError( + f"'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got {type(image_embeddings)}." + ) + + if not isinstance(num_inference_steps, int): + raise TypeError( + f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}\ + In Case you want to provide explicit timesteps, please use the 'timesteps' argument." + ) + + # 2. Encode caption + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + image_embeddings.size(0) * num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + ) + text_encoder_hidden_states = ( + torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds + ) + effnet = ( + torch.cat([image_embeddings, torch.zeros_like(image_embeddings)]) + if self.do_classifier_free_guidance + else image_embeddings + ) + + # 3. Determine latent shape of latents + latent_height = int(image_embeddings.size(2) * self.config.latent_dim_scale) + latent_width = int(image_embeddings.size(3) * self.config.latent_dim_scale) + latent_features_shape = (image_embeddings.size(0) * num_images_per_prompt, 4, latent_height, latent_width) + + # 4. Prepare and set timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latents + latents = self.prepare_latents(latent_features_shape, dtype, device, generator, latents, self.scheduler) + + # 6. Run denoising loop + self._num_timesteps = len(timesteps[:-1]) + for i, t in enumerate(self.progress_bar(timesteps[:-1])): + ratio = t.expand(latents.size(0)).to(dtype) + # 7. Denoise latents + predicted_latents = self.decoder( + torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents, + r=torch.cat([ratio] * 2) if self.do_classifier_free_guidance else ratio, + effnet=effnet, + clip=text_encoder_hidden_states, + ) + + # 8. Check for classifier free guidance and apply it + if self.do_classifier_free_guidance: + predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2) + predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale) + + # 9. 
Renoise latents to next timestep + latents = self.scheduler.step( + model_output=predicted_latents, + timestep=ratio, + sample=latents, + generator=generator, + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + image_embeddings = callback_outputs.pop("image_embeddings", image_embeddings) + text_encoder_hidden_states = callback_outputs.pop( + "text_encoder_hidden_states", text_encoder_hidden_states + ) + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if output_type not in ["pt", "np", "pil", "latent"]: + raise ValueError( + f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}" + ) + + if not output_type == "latent": + # 10. Scale and decode the image latents with vq-vae + latents = self.vqgan.config.scale_factor * latents + images = self.vqgan.decode(latents).sample.clamp(0, 1) + if output_type == "np": + images = images.permute(0, 2, 3, 1).cpu().float().numpy() + elif output_type == "pil": + images = images.permute(0, 2, 3, 1).cpu().float().numpy() + images = self.numpy_to_pil(images) + else: + images = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return images + return ImagePipelineOutput(images) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py new file mode 100644 index 000000000..3a43ad5b9 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py @@ -0,0 +1,306 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...schedulers import DDPMWuerstchenScheduler +from ...utils import deprecate, replace_example_docstring +from ..pipeline_utils import DiffusionPipeline +from .modeling_paella_vq_model import PaellaVQModel +from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt +from .modeling_wuerstchen_prior import WuerstchenPrior +from .pipeline_wuerstchen import WuerstchenDecoderPipeline +from .pipeline_wuerstchen_prior import WuerstchenPriorPipeline + + +TEXT2IMAGE_EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusions import WuerstchenCombinedPipeline + + >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16).to( + ... "cuda" + ... 
) + >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" + >>> images = pipe(prompt=prompt) + ``` +""" + + +class WuerstchenCombinedPipeline(DiffusionPipeline): + """ + Combined Pipeline for text-to-image generation using Wuerstchen + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + tokenizer (`CLIPTokenizer`): + The decoder tokenizer to be used for text inputs. + text_encoder (`CLIPTextModel`): + The decoder text encoder to be used for text inputs. + decoder (`WuerstchenDiffNeXt`): + The decoder model to be used for decoder image generation pipeline. + scheduler (`DDPMWuerstchenScheduler`): + The scheduler to be used for decoder image generation pipeline. + vqgan (`PaellaVQModel`): + The VQGAN model to be used for decoder image generation pipeline. + prior_tokenizer (`CLIPTokenizer`): + The prior tokenizer to be used for text inputs. + prior_text_encoder (`CLIPTextModel`): + The prior text encoder to be used for text inputs. + prior_prior (`WuerstchenPrior`): + The prior model to be used for prior pipeline. + prior_scheduler (`DDPMWuerstchenScheduler`): + The scheduler to be used for prior pipeline. + """ + + _load_connected_pipes = True + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + decoder: WuerstchenDiffNeXt, + scheduler: DDPMWuerstchenScheduler, + vqgan: PaellaVQModel, + prior_tokenizer: CLIPTokenizer, + prior_text_encoder: CLIPTextModel, + prior_prior: WuerstchenPrior, + prior_scheduler: DDPMWuerstchenScheduler, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + decoder=decoder, + scheduler=scheduler, + vqgan=vqgan, + prior_prior=prior_prior, + prior_text_encoder=prior_text_encoder, + prior_tokenizer=prior_tokenizer, + prior_scheduler=prior_scheduler, + ) + self.prior_pipe = WuerstchenPriorPipeline( + prior=prior_prior, + text_encoder=prior_text_encoder, + tokenizer=prior_tokenizer, + scheduler=prior_scheduler, + ) + self.decoder_pipe = WuerstchenDecoderPipeline( + text_encoder=text_encoder, + tokenizer=tokenizer, + decoder=decoder, + scheduler=scheduler, + vqgan=vqgan, + ) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): + self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id) + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗 + Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a + GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis. 
+ Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower. + """ + self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id) + + def progress_bar(self, iterable=None, total=None): + self.prior_pipe.progress_bar(iterable=iterable, total=total) + self.decoder_pipe.progress_bar(iterable=iterable, total=total) + + def set_progress_bar_config(self, **kwargs): + self.prior_pipe.set_progress_bar_config(**kwargs) + self.decoder_pipe.set_progress_bar_config(**kwargs) + + @torch.no_grad() + @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: int = 512, + width: int = 512, + prior_num_inference_steps: int = 60, + prior_timesteps: Optional[List[float]] = None, + prior_guidance_scale: float = 4.0, + num_inference_steps: int = 12, + decoder_timesteps: Optional[List[float]] = None, + decoder_guidance_scale: float = 0.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"], + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation for the prior and decoder. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* + prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` + input argument. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + prior_guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `prior_guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting + `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked + to the text `prompt`, usually at the expense of lower image quality. + prior_num_inference_steps (`Union[int, Dict[float, int]]`, *optional*, defaults to 60): + The number of prior denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. For more specific timestep spacing, you can pass customized + `prior_timesteps` + num_inference_steps (`int`, *optional*, defaults to 12): + The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at + the expense of slower inference. For more specific timestep spacing, you can pass customized + `timesteps` + prior_timesteps (`List[float]`, *optional*): + Custom timesteps to use for the denoising process for the prior. If not defined, equal spaced + `prior_num_inference_steps` timesteps are used. Must be in descending order. + decoder_timesteps (`List[float]`, *optional*): + Custom timesteps to use for the denoising process for the decoder. If not defined, equal spaced + `num_inference_steps` timesteps are used. Must be in descending order. + decoder_guidance_scale (`float`, *optional*, defaults to 0.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + prior_callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep: + int, callback_kwargs: Dict)`. + prior_callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the + list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in + the `._callback_tensor_inputs` attribute of your pipeline class. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. 
You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, + otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + prior_kwargs = {} + if kwargs.get("prior_callback", None) is not None: + prior_kwargs["callback"] = kwargs.pop("prior_callback") + deprecate( + "prior_callback", + "1.0.0", + "Passing `prior_callback` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`", + ) + if kwargs.get("prior_callback_steps", None) is not None: + deprecate( + "prior_callback_steps", + "1.0.0", + "Passing `prior_callback_steps` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`", + ) + prior_kwargs["callback_steps"] = kwargs.pop("prior_callback_steps") + + prior_outputs = self.prior_pipe( + prompt=prompt if prompt_embeds is None else None, + height=height, + width=width, + num_inference_steps=prior_num_inference_steps, + timesteps=prior_timesteps, + guidance_scale=prior_guidance_scale, + negative_prompt=negative_prompt if negative_prompt_embeds is None else None, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + output_type="pt", + return_dict=False, + callback_on_step_end=prior_callback_on_step_end, + callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs, + **prior_kwargs, + ) + image_embeddings = prior_outputs[0] + + outputs = self.decoder_pipe( + image_embeddings=image_embeddings, + prompt=prompt if prompt is not None else "", + num_inference_steps=num_inference_steps, + timesteps=decoder_timesteps, + guidance_scale=decoder_guidance_scale, + negative_prompt=negative_prompt, + generator=generator, + output_type=output_type, + return_dict=return_dict, + callback_on_step_end=callback_on_step_end, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + **kwargs, + ) + + return outputs diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py new file mode 100644 index 000000000..4640f7696 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py @@ -0,0 +1,516 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
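+
+# Editor's note: the sketch below is an illustrative, non-executed summary of how this prior
+# pipeline is meant to be chained with the `WuerstchenDecoderPipeline` added earlier in this
+# patch. It assumes a CUDA device and the "warp-ai/wuerstchen-prior" / "warp-ai/wuerstchen"
+# checkpoints that the example docstrings in these files already reference.
+#
+#   >>> import torch
+#   >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline
+#
+#   >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
+#   ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
+#   ... ).to("cuda")
+#   >>> decoder_pipe = WuerstchenDecoderPipeline.from_pretrained(
+#   ...     "warp-ai/wuerstchen", torch_dtype=torch.float16
+#   ... ).to("cuda")
+#
+#   >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
+#   >>> prior_output = prior_pipe(prompt=prompt)
+#   >>> images = decoder_pipe(prior_output.image_embeddings, prompt=prompt).images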
+ +from dataclasses import dataclass +from math import ceil +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...loaders import LoraLoaderMixin +from ...schedulers import DDPMWuerstchenScheduler +from ...utils import BaseOutput, deprecate, logging, replace_example_docstring +from ...utils.torch_utils import randn_tensor +from ..pipeline_utils import DiffusionPipeline +from .modeling_wuerstchen_prior import WuerstchenPrior + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +DEFAULT_STAGE_C_TIMESTEPS = list(np.linspace(1.0, 2 / 3, 20)) + list(np.linspace(2 / 3, 0.0, 11))[1:] + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import WuerstchenPriorPipeline + + >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained( + ... "warp-ai/wuerstchen-prior", torch_dtype=torch.float16 + ... ).to("cuda") + + >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet" + >>> prior_output = pipe(prompt) + ``` +""" + + +@dataclass +class WuerstchenPriorPipelineOutput(BaseOutput): + """ + Output class for WuerstchenPriorPipeline. + + Args: + image_embeddings (`torch.FloatTensor` or `np.ndarray`) + Prior image embeddings for text prompt + + """ + + image_embeddings: Union[torch.FloatTensor, np.ndarray] + + +class WuerstchenPriorPipeline(DiffusionPipeline, LoraLoaderMixin): + """ + Pipeline for generating image prior for Wuerstchen. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + The pipeline also inherits the following loading methods: + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + prior ([`Prior`]): + The canonical unCLIP prior to approximate the image embedding from the text embedding. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`DDPMWuerstchenScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + latent_mean ('float', *optional*, defaults to 42.0): + Mean value for latent diffusers. + latent_std ('float', *optional*, defaults to 1.0): + Standard value for latent diffusers. + resolution_multiple ('float', *optional*, defaults to 42.67): + Default resolution for multiple images generated. 
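+            With the default value of 42.67, a requested `height`/`width` of 1024 pixels yields image
+            embeddings with a spatial size of `ceil(1024 / 42.67) = 24`.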
+ """ + + unet_name = "prior" + text_encoder_name = "text_encoder" + model_cpu_offload_seq = "text_encoder->prior" + _callback_tensor_inputs = ["latents", "text_encoder_hidden_states", "negative_prompt_embeds"] + + def __init__( + self, + tokenizer: CLIPTokenizer, + text_encoder: CLIPTextModel, + prior: WuerstchenPrior, + scheduler: DDPMWuerstchenScheduler, + latent_mean: float = 42.0, + latent_std: float = 1.0, + resolution_multiple: float = 42.67, + ) -> None: + super().__init__() + self.register_modules( + tokenizer=tokenizer, + text_encoder=text_encoder, + prior=prior, + scheduler=scheduler, + ) + self.register_to_config( + latent_mean=latent_mean, latent_std=latent_std, resolution_multiple=resolution_multiple + ) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def encode_prompt( + self, + device, + num_images_per_prompt, + do_classifier_free_guidance, + prompt=None, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + attention_mask = text_inputs.attention_mask + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + attention_mask = attention_mask[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask.to(device) + ) + prompt_embeds = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + if negative_prompt_embeds is None and do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + negative_prompt_embeds_text_encoder_output = self.text_encoder( + uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device) + ) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.last_hidden_state + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + # done duplicates + + return prompt_embeds, negative_prompt_embeds + + def check_inputs( + self, + prompt, + negative_prompt, + num_inference_steps, + do_classifier_free_guidance, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if not isinstance(num_inference_steps, int): + raise TypeError( + f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}\ + In Case you want to provide explicit timesteps, please use the 'timesteps' argument." 
+ ) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + height: int = 1024, + width: int = 1024, + num_inference_steps: int = 60, + timesteps: List[float] = None, + guidance_scale: float = 8.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + num_images_per_prompt: Optional[int] = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pt", + return_dict: bool = True, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + height (`int`, *optional*, defaults to 1024): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 1024): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 60): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps` + timesteps are used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 8.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `decoder_guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting + `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely + linked to the text `prompt`, usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `decoder_guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`~pipelines.WuerstchenPriorPipelineOutput`] or `tuple` [`~pipelines.WuerstchenPriorPipelineOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the + generated image embeddings. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + # 0. Define commonly used variables + device = self._execution_device + self._guidance_scale = guidance_scale + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # 1. Check inputs. Raise error if not correct + if prompt is not None and not isinstance(prompt, list): + if isinstance(prompt, str): + prompt = [prompt] + else: + raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.") + + if self.do_classifier_free_guidance: + if negative_prompt is not None and not isinstance(negative_prompt, list): + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + else: + raise TypeError( + f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}." + ) + + self.check_inputs( + prompt, + negative_prompt, + num_inference_steps, + self.do_classifier_free_guidance, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 2. 
Encode caption + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=self.do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_encoder_hidden_states = ( + torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds + ) + + # 3. Determine latent shape of image embeddings + dtype = text_encoder_hidden_states.dtype + latent_height = ceil(height / self.config.resolution_multiple) + latent_width = ceil(width / self.config.resolution_multiple) + num_channels = self.prior.config.c_in + effnet_features_shape = (num_images_per_prompt * batch_size, num_channels, latent_height, latent_width) + + # 4. Prepare and set timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latents + latents = self.prepare_latents(effnet_features_shape, dtype, device, generator, latents, self.scheduler) + + # 6. Run denoising loop + self._num_timesteps = len(timesteps[:-1]) + for i, t in enumerate(self.progress_bar(timesteps[:-1])): + ratio = t.expand(latents.size(0)).to(dtype) + + # 7. Denoise image embeddings + predicted_image_embedding = self.prior( + torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents, + r=torch.cat([ratio] * 2) if self.do_classifier_free_guidance else ratio, + c=text_encoder_hidden_states, + ) + + # 8. Check for classifier free guidance and apply it + if self.do_classifier_free_guidance: + predicted_image_embedding_text, predicted_image_embedding_uncond = predicted_image_embedding.chunk(2) + predicted_image_embedding = torch.lerp( + predicted_image_embedding_uncond, predicted_image_embedding_text, self.guidance_scale + ) + + # 9. Renoise latents to next timestep + latents = self.scheduler.step( + model_output=predicted_image_embedding, + timestep=ratio, + sample=latents, + generator=generator, + ).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + text_encoder_hidden_states = callback_outputs.pop( + "text_encoder_hidden_states", text_encoder_hidden_states + ) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # 10. 
Denormalize the latents + latents = latents * self.config.latent_mean - self.config.latent_std + + # Offload all models + self.maybe_free_model_hooks() + + if output_type == "np": + latents = latents.cpu().float().numpy() + + if not return_dict: + return (latents,) + + return WuerstchenPriorPipelineOutput(latents) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/py.typed b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/README.md new file mode 100644 index 000000000..31ad27793 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/README.md @@ -0,0 +1,3 @@ +# Schedulers + +For more information on the schedulers, please refer to the [docs](https://huggingface.co/docs/diffusers/api/schedulers/overview). \ No newline at end of file diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/__init__.py new file mode 100644 index 000000000..720d8ea25 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/__init__.py @@ -0,0 +1,211 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
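+
+# Editor's note: this __init__ follows the lazy-import pattern used throughout the vendored
+# diffusers sources. `_import_structure` only records which public names live in which
+# scheduler submodule; unless type checking or `DIFFUSERS_SLOW_IMPORT` forces eager imports,
+# the module is replaced at the bottom of this file with a `_LazyModule`, so a scheduler
+# submodule is only imported the first time one of its names is accessed. Illustrative,
+# non-executed sketch:
+#
+#   >>> from diffusers import schedulers
+#   >>> scheduler_cls = schedulers.DDPMWuerstchenScheduler  # imports scheduling_ddpm_wuerstchen on first access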
+ +from typing import TYPE_CHECKING + +from ..utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_flax_available, + is_scipy_available, + is_torch_available, + is_torchsde_available, +) + + +_dummy_modules = {} +_import_structure = {} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_pt_objects # noqa F403 + + _dummy_modules.update(get_objects_from_module(dummy_pt_objects)) + +else: + _import_structure["deprecated"] = ["KarrasVeScheduler", "ScoreSdeVpScheduler"] + _import_structure["scheduling_amused"] = ["AmusedScheduler"] + _import_structure["scheduling_consistency_decoder"] = ["ConsistencyDecoderScheduler"] + _import_structure["scheduling_consistency_models"] = ["CMStochasticIterativeScheduler"] + _import_structure["scheduling_ddim"] = ["DDIMScheduler"] + _import_structure["scheduling_ddim_inverse"] = ["DDIMInverseScheduler"] + _import_structure["scheduling_ddim_parallel"] = ["DDIMParallelScheduler"] + _import_structure["scheduling_ddpm"] = ["DDPMScheduler"] + _import_structure["scheduling_ddpm_parallel"] = ["DDPMParallelScheduler"] + _import_structure["scheduling_ddpm_wuerstchen"] = ["DDPMWuerstchenScheduler"] + _import_structure["scheduling_deis_multistep"] = ["DEISMultistepScheduler"] + _import_structure["scheduling_dpmsolver_multistep"] = ["DPMSolverMultistepScheduler"] + _import_structure["scheduling_dpmsolver_multistep_inverse"] = ["DPMSolverMultistepInverseScheduler"] + _import_structure["scheduling_dpmsolver_singlestep"] = ["DPMSolverSinglestepScheduler"] + _import_structure["scheduling_edm_dpmsolver_multistep"] = ["EDMDPMSolverMultistepScheduler"] + _import_structure["scheduling_edm_euler"] = ["EDMEulerScheduler"] + _import_structure["scheduling_euler_ancestral_discrete"] = ["EulerAncestralDiscreteScheduler"] + _import_structure["scheduling_euler_discrete"] = ["EulerDiscreteScheduler"] + _import_structure["scheduling_heun_discrete"] = ["HeunDiscreteScheduler"] + _import_structure["scheduling_ipndm"] = ["IPNDMScheduler"] + _import_structure["scheduling_k_dpm_2_ancestral_discrete"] = ["KDPM2AncestralDiscreteScheduler"] + _import_structure["scheduling_k_dpm_2_discrete"] = ["KDPM2DiscreteScheduler"] + _import_structure["scheduling_lcm"] = ["LCMScheduler"] + _import_structure["scheduling_pndm"] = ["PNDMScheduler"] + _import_structure["scheduling_repaint"] = ["RePaintScheduler"] + _import_structure["scheduling_sasolver"] = ["SASolverScheduler"] + _import_structure["scheduling_sde_ve"] = ["ScoreSdeVeScheduler"] + _import_structure["scheduling_tcd"] = ["TCDScheduler"] + _import_structure["scheduling_unclip"] = ["UnCLIPScheduler"] + _import_structure["scheduling_unipc_multistep"] = ["UniPCMultistepScheduler"] + _import_structure["scheduling_utils"] = ["KarrasDiffusionSchedulers", "SchedulerMixin"] + _import_structure["scheduling_vq_diffusion"] = ["VQDiffusionScheduler"] + +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_flax_objects # noqa F403 + + _dummy_modules.update(get_objects_from_module(dummy_flax_objects)) + +else: + _import_structure["scheduling_ddim_flax"] = ["FlaxDDIMScheduler"] + _import_structure["scheduling_ddpm_flax"] = ["FlaxDDPMScheduler"] + _import_structure["scheduling_dpmsolver_multistep_flax"] = ["FlaxDPMSolverMultistepScheduler"] + _import_structure["scheduling_euler_discrete_flax"] = 
["FlaxEulerDiscreteScheduler"] + _import_structure["scheduling_karras_ve_flax"] = ["FlaxKarrasVeScheduler"] + _import_structure["scheduling_lms_discrete_flax"] = ["FlaxLMSDiscreteScheduler"] + _import_structure["scheduling_pndm_flax"] = ["FlaxPNDMScheduler"] + _import_structure["scheduling_sde_ve_flax"] = ["FlaxScoreSdeVeScheduler"] + _import_structure["scheduling_utils_flax"] = [ + "FlaxKarrasDiffusionSchedulers", + "FlaxSchedulerMixin", + "FlaxSchedulerOutput", + "broadcast_to_shape_from_left", + ] + + +try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_torch_and_scipy_objects # noqa F403 + + _dummy_modules.update(get_objects_from_module(dummy_torch_and_scipy_objects)) + +else: + _import_structure["scheduling_lms_discrete"] = ["LMSDiscreteScheduler"] + +try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils import dummy_torch_and_torchsde_objects # noqa F403 + + _dummy_modules.update(get_objects_from_module(dummy_torch_and_torchsde_objects)) + +else: + _import_structure["scheduling_dpmsolver_sde"] = ["DPMSolverSDEScheduler"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + from ..utils import ( + OptionalDependencyNotAvailable, + is_flax_available, + is_scipy_available, + is_torch_available, + is_torchsde_available, + ) + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_pt_objects import * # noqa F403 + else: + from .deprecated import KarrasVeScheduler, ScoreSdeVpScheduler + from .scheduling_amused import AmusedScheduler + from .scheduling_consistency_decoder import ConsistencyDecoderScheduler + from .scheduling_consistency_models import CMStochasticIterativeScheduler + from .scheduling_ddim import DDIMScheduler + from .scheduling_ddim_inverse import DDIMInverseScheduler + from .scheduling_ddim_parallel import DDIMParallelScheduler + from .scheduling_ddpm import DDPMScheduler + from .scheduling_ddpm_parallel import DDPMParallelScheduler + from .scheduling_ddpm_wuerstchen import DDPMWuerstchenScheduler + from .scheduling_deis_multistep import DEISMultistepScheduler + from .scheduling_dpmsolver_multistep import DPMSolverMultistepScheduler + from .scheduling_dpmsolver_multistep_inverse import DPMSolverMultistepInverseScheduler + from .scheduling_dpmsolver_singlestep import DPMSolverSinglestepScheduler + from .scheduling_edm_dpmsolver_multistep import EDMDPMSolverMultistepScheduler + from .scheduling_edm_euler import EDMEulerScheduler + from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler + from .scheduling_euler_discrete import EulerDiscreteScheduler + from .scheduling_heun_discrete import HeunDiscreteScheduler + from .scheduling_ipndm import IPNDMScheduler + from .scheduling_k_dpm_2_ancestral_discrete import KDPM2AncestralDiscreteScheduler + from .scheduling_k_dpm_2_discrete import KDPM2DiscreteScheduler + from .scheduling_lcm import LCMScheduler + from .scheduling_pndm import PNDMScheduler + from .scheduling_repaint import RePaintScheduler + from .scheduling_sasolver import SASolverScheduler + from .scheduling_sde_ve import ScoreSdeVeScheduler + from .scheduling_tcd import TCDScheduler + from .scheduling_unclip import UnCLIPScheduler + from .scheduling_unipc_multistep import UniPCMultistepScheduler + from .scheduling_utils import 
KarrasDiffusionSchedulers, SchedulerMixin + from .scheduling_vq_diffusion import VQDiffusionScheduler + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_objects import * # noqa F403 + else: + from .scheduling_ddim_flax import FlaxDDIMScheduler + from .scheduling_ddpm_flax import FlaxDDPMScheduler + from .scheduling_dpmsolver_multistep_flax import FlaxDPMSolverMultistepScheduler + from .scheduling_euler_discrete_flax import FlaxEulerDiscreteScheduler + from .scheduling_karras_ve_flax import FlaxKarrasVeScheduler + from .scheduling_lms_discrete_flax import FlaxLMSDiscreteScheduler + from .scheduling_pndm_flax import FlaxPNDMScheduler + from .scheduling_sde_ve_flax import FlaxScoreSdeVeScheduler + from .scheduling_utils_flax import ( + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + broadcast_to_shape_from_left, + ) + + try: + if not (is_torch_available() and is_scipy_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_scipy_objects import * # noqa F403 + else: + from .scheduling_lms_discrete import LMSDiscreteScheduler + + try: + if not (is_torch_available() and is_torchsde_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_torchsde_objects import * # noqa F403 + else: + from .scheduling_dpmsolver_sde import DPMSolverSDEScheduler + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + for name, value in _dummy_modules.items(): + setattr(sys.modules[__name__], name, value) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/__init__.py new file mode 100644 index 000000000..786707f45 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/__init__.py @@ -0,0 +1,50 @@ +from typing import TYPE_CHECKING + +from ...utils import ( + DIFFUSERS_SLOW_IMPORT, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_torch_available, + is_transformers_available, +) + + +_dummy_objects = {} +_import_structure = {} + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils import dummy_pt_objects # noqa F403 + + _dummy_objects.update(get_objects_from_module(dummy_pt_objects)) +else: + _import_structure["scheduling_karras_ve"] = ["KarrasVeScheduler"] + _import_structure["scheduling_sde_vp"] = ["ScoreSdeVpScheduler"] + +if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + + except OptionalDependencyNotAvailable: + from ..utils.dummy_pt_objects import * # noqa F403 + else: + from .scheduling_karras_ve import KarrasVeScheduler + from .scheduling_sde_vp import ScoreSdeVpScheduler + + +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) + + for name, value in _dummy_objects.items(): + setattr(sys.modules[__name__], name, value) diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py new file mode 100644 index 000000000..d776d989a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py @@ -0,0 +1,243 @@ +# Copyright 2024 NVIDIA and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils import BaseOutput +from ...utils.torch_utils import randn_tensor +from ..scheduling_utils import SchedulerMixin + + +@dataclass +class KarrasVeOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + derivative (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Derivative of predicted original image sample (x_0). + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + derivative: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +class KarrasVeScheduler(SchedulerMixin, ConfigMixin): + """ + A stochastic scheduler tailored to variance-expanding models. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + + + For more details on the parameters, see [Appendix E](https://arxiv.org/abs/2206.00364). The grid search values used + to find the optimal `{s_noise, s_churn, s_min, s_max}` for a specific model are described in Table 5 of the paper. + + + + Args: + sigma_min (`float`, defaults to 0.02): + The minimum noise magnitude. + sigma_max (`float`, defaults to 100): + The maximum noise magnitude. + s_noise (`float`, defaults to 1.007): + The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000, + 1.011]. + s_churn (`float`, defaults to 80): + The parameter controlling the overall amount of stochasticity. A reasonable range is [0, 100]. + s_min (`float`, defaults to 0.05): + The start value of the sigma range to add noise (enable stochasticity). A reasonable range is [0, 10]. 
+ s_max (`float`, defaults to 50): + The end value of the sigma range to add noise. A reasonable range is [0.2, 80]. + """ + + order = 2 + + @register_to_config + def __init__( + self, + sigma_min: float = 0.02, + sigma_max: float = 100, + s_noise: float = 1.007, + s_churn: float = 80, + s_min: float = 0.05, + s_max: float = 50, + ): + # standard deviation of the initial noise distribution + self.init_noise_sigma = sigma_max + + # setable values + self.num_inference_steps: int = None + self.timesteps: np.IntTensor = None + self.schedule: torch.FloatTensor = None # sigma(t_i) + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps).to(device) + schedule = [ + ( + self.config.sigma_max**2 + * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1)) + ) + for i in self.timesteps + ] + self.schedule = torch.tensor(schedule, dtype=torch.float32, device=device) + + def add_noise_to_input( + self, sample: torch.FloatTensor, sigma: float, generator: Optional[torch.Generator] = None + ) -> Tuple[torch.FloatTensor, float]: + """ + Explicit Langevin-like "churn" step of adding noise to the sample according to a `gamma_i ≥ 0` to reach a + higher noise level `sigma_hat = sigma_i + gamma_i*sigma_i`. + + Args: + sample (`torch.FloatTensor`): + The input sample. + sigma (`float`): + generator (`torch.Generator`, *optional*): + A random number generator. + """ + if self.config.s_min <= sigma <= self.config.s_max: + gamma = min(self.config.s_churn / self.num_inference_steps, 2**0.5 - 1) + else: + gamma = 0 + + # sample eps ~ N(0, S_noise^2 * I) + eps = self.config.s_noise * randn_tensor(sample.shape, generator=generator).to(sample.device) + sigma_hat = sigma + gamma * sigma + sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps) + + return sample_hat, sigma_hat + + def step( + self, + model_output: torch.FloatTensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. 
+ sigma_hat (`float`): + sigma_prev (`float`): + sample_hat (`torch.FloatTensor`): + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_karras_ve.KarrasVESchedulerOutput`] is returned, + otherwise a tuple is returned where the first element is the sample tensor. + + """ + + pred_original_sample = sample_hat + sigma_hat * model_output + derivative = (sample_hat - pred_original_sample) / sigma_hat + sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative + + if not return_dict: + return (sample_prev, derivative) + + return KarrasVeOutput( + prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample + ) + + def step_correct( + self, + model_output: torch.FloatTensor, + sigma_hat: float, + sigma_prev: float, + sample_hat: torch.FloatTensor, + sample_prev: torch.FloatTensor, + derivative: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[KarrasVeOutput, Tuple]: + """ + Corrects the predicted sample based on the `model_output` of the network. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + sigma_hat (`float`): TODO + sigma_prev (`float`): TODO + sample_hat (`torch.FloatTensor`): TODO + sample_prev (`torch.FloatTensor`): TODO + derivative (`torch.FloatTensor`): TODO + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. + + Returns: + prev_sample (TODO): updated sample in the diffusion chain. derivative (TODO): TODO + + """ + pred_original_sample = sample_prev + sigma_prev * model_output + derivative_corr = (sample_prev - pred_original_sample) / sigma_prev + sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr) + + if not return_dict: + return (sample_prev, derivative) + + return KarrasVeOutput( + prev_sample=sample_prev, derivative=derivative, pred_original_sample=pred_original_sample + ) + + def add_noise(self, original_samples, noise, timesteps): + raise NotImplementedError() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py new file mode 100644 index 000000000..09b02cadc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py @@ -0,0 +1,109 @@ +# Copyright 2024 Google Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
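+# In brief, `step_pred` below performs one Euler-Maruyama step of the reverse-time
+# variance-preserving SDE. With beta(t) = beta_min + t * (beta_max - beta_min), the model
+# output is first rescaled into a score, s = -model_output / std(t), where
+# std(t) = sqrt(1 - exp(-0.5 * t**2 * (beta_max - beta_min) - t * beta_min)), and the
+# sample is then updated as
+#     x <- x + (-0.5 * beta(t) * x - beta(t) * s) * dt + sqrt(beta(t)) * sqrt(-dt) * z,
+# with dt = -1 / num_inference_steps and z ~ N(0, I).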
+ +# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch + +import math +from typing import Union + +import torch + +from ...configuration_utils import ConfigMixin, register_to_config +from ...utils.torch_utils import randn_tensor +from ..scheduling_utils import SchedulerMixin + + +class ScoreSdeVpScheduler(SchedulerMixin, ConfigMixin): + """ + `ScoreSdeVpScheduler` is a variance preserving stochastic differential equation (SDE) scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 2000): + The number of diffusion steps to train the model. + beta_min (`int`, defaults to 0.1): + beta_max (`int`, defaults to 20): + sampling_eps (`int`, defaults to 1e-3): + The end value of sampling where timesteps decrease progressively from 1 to epsilon. + """ + + order = 1 + + @register_to_config + def __init__(self, num_train_timesteps=2000, beta_min=0.1, beta_max=20, sampling_eps=1e-3): + self.sigmas = None + self.discrete_sigmas = None + self.timesteps = None + + def set_timesteps(self, num_inference_steps, device: Union[str, torch.device] = None): + """ + Sets the continuous timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.timesteps = torch.linspace(1, self.config.sampling_eps, num_inference_steps, device=device) + + def step_pred(self, score, x, t, generator=None): + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + score (): + x (): + t (): + generator (`torch.Generator`, *optional*): + A random number generator. 
+ """ + if self.timesteps is None: + raise ValueError( + "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" + ) + + # TODO(Patrick) better comments + non-PyTorch + # postprocess model score + log_mean_coeff = -0.25 * t**2 * (self.config.beta_max - self.config.beta_min) - 0.5 * t * self.config.beta_min + std = torch.sqrt(1.0 - torch.exp(2.0 * log_mean_coeff)) + std = std.flatten() + while len(std.shape) < len(score.shape): + std = std.unsqueeze(-1) + score = -score / std + + # compute + dt = -1.0 / len(self.timesteps) + + beta_t = self.config.beta_min + t * (self.config.beta_max - self.config.beta_min) + beta_t = beta_t.flatten() + while len(beta_t.shape) < len(x.shape): + beta_t = beta_t.unsqueeze(-1) + drift = -0.5 * beta_t * x + + diffusion = torch.sqrt(beta_t) + drift = drift - diffusion**2 * score + x_mean = x + drift * dt + + # add noise + noise = randn_tensor(x.shape, layout=x.layout, generator=generator, device=x.device, dtype=x.dtype) + x = x_mean + diffusion * math.sqrt(-dt) * noise + + return x, x_mean + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_amused.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_amused.py new file mode 100644 index 000000000..51fbe6a4d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_amused.py @@ -0,0 +1,162 @@ +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from .scheduling_utils import SchedulerMixin + + +def gumbel_noise(t, generator=None): + device = generator.device if generator is not None else t.device + noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device) + return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20)) + + +def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None): + confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator) + sorted_confidence = torch.sort(confidence, dim=-1).values + cut_off = torch.gather(sorted_confidence, 1, mask_len.long()) + masking = confidence < cut_off + return masking + + +@dataclass +class AmusedSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. 
+ """ + + prev_sample: torch.FloatTensor + pred_original_sample: torch.FloatTensor = None + + +class AmusedScheduler(SchedulerMixin, ConfigMixin): + order = 1 + + temperatures: torch.Tensor + + @register_to_config + def __init__( + self, + mask_token_id: int, + masking_schedule: str = "cosine", + ): + self.temperatures = None + self.timesteps = None + + def set_timesteps( + self, + num_inference_steps: int, + temperature: Union[int, Tuple[int, int], List[int]] = (2, 0), + device: Union[str, torch.device] = None, + ): + self.timesteps = torch.arange(num_inference_steps, device=device).flip(0) + + if isinstance(temperature, (tuple, list)): + self.temperatures = torch.linspace(temperature[0], temperature[1], num_inference_steps, device=device) + else: + self.temperatures = torch.linspace(temperature, 0.01, num_inference_steps, device=device) + + def step( + self, + model_output: torch.FloatTensor, + timestep: torch.long, + sample: torch.LongTensor, + starting_mask_ratio: int = 1, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[AmusedSchedulerOutput, Tuple]: + two_dim_input = sample.ndim == 3 and model_output.ndim == 4 + + if two_dim_input: + batch_size, codebook_size, height, width = model_output.shape + sample = sample.reshape(batch_size, height * width) + model_output = model_output.reshape(batch_size, codebook_size, height * width).permute(0, 2, 1) + + unknown_map = sample == self.config.mask_token_id + + probs = model_output.softmax(dim=-1) + + device = probs.device + probs_ = probs.to(generator.device) if generator is not None else probs # handles when generator is on CPU + if probs_.device.type == "cpu" and probs_.dtype != torch.float32: + probs_ = probs_.float() # multinomial is not implemented for cpu half precision + probs_ = probs_.reshape(-1, probs.size(-1)) + pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device) + pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1]) + pred_original_sample = torch.where(unknown_map, pred_original_sample, sample) + + if timestep == 0: + prev_sample = pred_original_sample + else: + seq_len = sample.shape[1] + step_idx = (self.timesteps == timestep).nonzero() + ratio = (step_idx + 1) / len(self.timesteps) + + if self.config.masking_schedule == "cosine": + mask_ratio = torch.cos(ratio * math.pi / 2) + elif self.config.masking_schedule == "linear": + mask_ratio = 1 - ratio + else: + raise ValueError(f"unknown masking schedule {self.config.masking_schedule}") + + mask_ratio = starting_mask_ratio * mask_ratio + + mask_len = (seq_len * mask_ratio).floor() + # do not mask more than amount previously masked + mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len) + # mask at least one + mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len) + + selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0] + # Ignores the tokens given in the input by overwriting their confidence. + selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max) + + masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator) + + # Masks tokens with lower confidence. 
+ prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample) + + if two_dim_input: + prev_sample = prev_sample.reshape(batch_size, height, width) + pred_original_sample = pred_original_sample.reshape(batch_size, height, width) + + if not return_dict: + return (prev_sample, pred_original_sample) + + return AmusedSchedulerOutput(prev_sample, pred_original_sample) + + def add_noise(self, sample, timesteps, generator=None): + step_idx = (self.timesteps == timesteps).nonzero() + ratio = (step_idx + 1) / len(self.timesteps) + + if self.config.masking_schedule == "cosine": + mask_ratio = torch.cos(ratio * math.pi / 2) + elif self.config.masking_schedule == "linear": + mask_ratio = 1 - ratio + else: + raise ValueError(f"unknown masking schedule {self.config.masking_schedule}") + + mask_indices = ( + torch.rand( + sample.shape, device=generator.device if generator is not None else sample.device, generator=generator + ).to(sample.device) + < mask_ratio + ) + + masked_sample = sample.clone() + + masked_sample[mask_indices] = self.config.mask_token_id + + return masked_sample diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py new file mode 100644 index 000000000..69ca8a173 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py @@ -0,0 +1,180 @@ +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +@dataclass +class ConsistencyDecoderSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function. 
+ + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class ConsistencyDecoderScheduler(SchedulerMixin, ConfigMixin): + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1024, + sigma_data: float = 0.5, + ): + betas = betas_for_alpha_bar(num_train_timesteps) + + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + + self.sqrt_alphas_cumprod = torch.sqrt(alphas_cumprod) + self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - alphas_cumprod) + + sigmas = torch.sqrt(1.0 / alphas_cumprod - 1) + + sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / alphas_cumprod) + + self.c_skip = sqrt_recip_alphas_cumprod * sigma_data**2 / (sigmas**2 + sigma_data**2) + self.c_out = sigmas * sigma_data / (sigmas**2 + sigma_data**2) ** 0.5 + self.c_in = sqrt_recip_alphas_cumprod / (sigmas**2 + sigma_data**2) ** 0.5 + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + ): + if num_inference_steps != 2: + raise ValueError("Currently more than 2 inference steps are not supported.") + + self.timesteps = torch.tensor([1008, 512], dtype=torch.long, device=device) + self.sqrt_alphas_cumprod = self.sqrt_alphas_cumprod.to(device) + self.sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod.to(device) + self.c_skip = self.c_skip.to(device) + self.c_out = self.c_out.to(device) + self.c_in = self.c_in.to(device) + + @property + def init_noise_sigma(self): + return self.sqrt_one_minus_alphas_cumprod[self.timesteps[0]] + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample * self.c_in[timestep] + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[ConsistencyDecoderSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`float`): + The current timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a + [`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] or `tuple`: + If return_dict is `True`, + [`~schedulers.scheduling_consistency_models.ConsistencyDecoderSchedulerOutput`] is returned, otherwise + a tuple is returned where the first element is the sample tensor. 
+ """ + x_0 = self.c_out[timestep] * model_output + self.c_skip[timestep] * sample + + timestep_idx = torch.where(self.timesteps == timestep)[0] + + if timestep_idx == len(self.timesteps) - 1: + prev_sample = x_0 + else: + noise = randn_tensor(x_0.shape, generator=generator, dtype=x_0.dtype, device=x_0.device) + prev_sample = ( + self.sqrt_alphas_cumprod[self.timesteps[timestep_idx + 1]].to(x_0.dtype) * x_0 + + self.sqrt_one_minus_alphas_cumprod[self.timesteps[timestep_idx + 1]].to(x_0.dtype) * noise + ) + + if not return_dict: + return (prev_sample,) + + return ConsistencyDecoderSchedulerOutput(prev_sample=prev_sample) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py new file mode 100644 index 000000000..14d37a390 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py @@ -0,0 +1,448 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class CMStochasticIterativeSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): + """ + Multistep and onestep sampling for consistency models. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 40): + The number of diffusion steps to train the model. + sigma_min (`float`, defaults to 0.002): + Minimum noise magnitude in the sigma schedule. Defaults to 0.002 from the original implementation. + sigma_max (`float`, defaults to 80.0): + Maximum noise magnitude in the sigma schedule. Defaults to 80.0 from the original implementation. + sigma_data (`float`, defaults to 0.5): + The standard deviation of the data distribution from the EDM + [paper](https://huggingface.co/papers/2206.00364). Defaults to 0.5 from the original implementation. 
+ s_noise (`float`, defaults to 1.0): + The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000, + 1.011]. Defaults to 1.0 from the original implementation. + rho (`float`, defaults to 7.0): + The parameter for calculating the Karras sigma schedule from the EDM + [paper](https://huggingface.co/papers/2206.00364). Defaults to 7.0 from the original implementation. + clip_denoised (`bool`, defaults to `True`): + Whether to clip the denoised outputs to `(-1, 1)`. + timesteps (`List` or `np.ndarray` or `torch.Tensor`, *optional*): + An explicit timestep schedule that can be optionally specified. The timesteps are expected to be in + increasing order. + """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 40, + sigma_min: float = 0.002, + sigma_max: float = 80.0, + sigma_data: float = 0.5, + s_noise: float = 1.0, + rho: float = 7.0, + clip_denoised: bool = True, + ): + # standard deviation of the initial noise distribution + self.init_noise_sigma = sigma_max + + ramp = np.linspace(0, 1, num_train_timesteps) + sigmas = self._convert_to_karras(ramp) + timesteps = self.sigma_to_t(sigmas) + + # setable values + self.num_inference_steps = None + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps) + self.custom_timesteps = False + self.is_scale_input_called = False + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`float` or `torch.FloatTensor`): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + # Get sigma corresponding to timestep + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + + sample = sample / ((sigma**2 + self.config.sigma_data**2) ** 0.5) + + self.is_scale_input_called = True + return sample + + def sigma_to_t(self, sigmas: Union[float, np.ndarray]): + """ + Gets scaled timesteps from the Karras sigmas for input to the consistency model. + + Args: + sigmas (`float` or `np.ndarray`): + A single Karras sigma or an array of Karras sigmas. + + Returns: + `float` or `np.ndarray`: + A scaled input timestep or scaled input timestep array. 
+ """ + if not isinstance(sigmas, np.ndarray): + sigmas = np.array(sigmas, dtype=np.float64) + + timesteps = 1000 * 0.25 * np.log(sigmas + 1e-44) + + return timesteps + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): + """ + Sets the timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed, + `num_inference_steps` must be `None`. + """ + if num_inference_steps is None and timesteps is None: + raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.") + + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `timesteps`.") + + # Follow DDPMScheduler custom timesteps logic + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.custom_timesteps = False + + # Map timesteps to Karras sigmas directly for multistep sampling + # See https://github.com/openai/consistency_models/blob/main/cm/karras_diffusion.py#L675 + num_train_timesteps = self.config.num_train_timesteps + ramp = timesteps[::-1].copy() + ramp = ramp / (num_train_timesteps - 1) + sigmas = self._convert_to_karras(ramp) + timesteps = self.sigma_to_t(sigmas) + + sigmas = np.concatenate([sigmas, [self.sigma_min]]).astype(np.float32) + self.sigmas = torch.from_numpy(sigmas).to(device=device) + + if str(device).startswith("mps"): + # mps does not support float64 + self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32) + else: + self.timesteps = torch.from_numpy(timesteps).to(device=device) + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Modified _convert_to_karras implementation that takes in ramp as argument + def _convert_to_karras(self, ramp): + """Constructs the noise schedule of Karras et al. 
(2022).""" + + sigma_min: float = self.config.sigma_min + sigma_max: float = self.config.sigma_max + + rho = self.config.rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def get_scalings(self, sigma): + sigma_data = self.config.sigma_data + + c_skip = sigma_data**2 / (sigma**2 + sigma_data**2) + c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + return c_skip, c_out + + def get_scalings_for_boundary_condition(self, sigma): + """ + Gets the scalings used in the consistency model parameterization (from Appendix C of the + [paper](https://huggingface.co/papers/2303.01469)) to enforce boundary condition. + + + + `epsilon` in the equations for `c_skip` and `c_out` is set to `sigma_min`. + + + + Args: + sigma (`torch.FloatTensor`): + The current sigma in the Karras sigma schedule. + + Returns: + `tuple`: + A two-element tuple where `c_skip` (which weights the current sample) is the first element and `c_out` + (which weights the consistency model output) is the second element. + """ + sigma_min = self.config.sigma_min + sigma_data = self.config.sigma_data + + c_skip = sigma_data**2 / ((sigma - sigma_min) ** 2 + sigma_data**2) + c_out = (sigma - sigma_min) * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + return c_skip, c_out + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`float`): + The current timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a + [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] or `tuple`. 
+ + Returns: + [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] or `tuple`: + If return_dict is `True`, + [`~schedulers.scheduling_consistency_models.CMStochasticIterativeSchedulerOutput`] is returned, + otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + f" `{self.__class__}.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + + sigma_min = self.config.sigma_min + sigma_max = self.config.sigma_max + + if self.step_index is None: + self._init_step_index(timestep) + + # sigma_next corresponds to next_t in original implementation + sigma = self.sigmas[self.step_index] + if self.step_index + 1 < self.config.num_train_timesteps: + sigma_next = self.sigmas[self.step_index + 1] + else: + # Set sigma_next to sigma_min + sigma_next = self.sigmas[-1] + + # Get scalings for boundary conditions + c_skip, c_out = self.get_scalings_for_boundary_condition(sigma) + + # 1. Denoise model output using boundary conditions + denoised = c_out * model_output + c_skip * sample + if self.config.clip_denoised: + denoised = denoised.clamp(-1, 1) + + # 2. Sample z ~ N(0, s_noise^2 * I) + # Noise is not used for onestep sampling. + if len(self.timesteps) > 1: + noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator + ) + else: + noise = torch.zeros_like(model_output) + z = noise * self.config.s_noise + + sigma_hat = sigma_next.clamp(min=sigma_min, max=sigma_max) + + # 3. 
Return noisy sample + # tau = sigma_hat, eps = sigma_min + prev_sample = denoised + z * (sigma_hat**2 - sigma_min**2) ** 0.5 + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return CMStochasticIterativeSchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim.py new file mode 100644 index 000000000..33d3892a0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim.py @@ -0,0 +1,520 @@ +# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. 
+ + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DDIMScheduler(SchedulerMixin, ConfigMixin): + """ + `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with + non-Markovian guidance. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. 
Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. 
Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." 
+ ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + generator=None, + variance_noise: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + eta (`float`): + The weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`, defaults to `False`): + If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary + because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no + clipping has happened, "corrected" `model_output` would coincide with the one provided as input and + `use_clipped_model_output` has no effect. + generator (`torch.Generator`, *optional*): + A random number generator. + variance_noise (`torch.FloatTensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`CycleDiffusion`]. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.DDIMSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # 3. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + if use_clipped_model_output: + # the pred_epsilon is always re-derived from the clipped x_0 in Glide + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon + + # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + if eta > 0: + if variance_noise is not None and generator is not None: + raise ValueError( + "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" + " `variance_noise` stays `None`." 
+ ) + + if variance_noise is None: + variance_noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype + ) + variance = std_dev_t * variance_noise + + prev_sample = prev_sample + variance + + if not return_dict: + return (prev_sample,) + + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py new file mode 100644 index 000000000..dc3d8455b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py @@ -0,0 +1,313 @@ +# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax.numpy as jnp + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import ( + CommonSchedulerState, + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + add_noise_common, + get_velocity_common, +) + + +@flax.struct.dataclass +class DDIMSchedulerState: + common: CommonSchedulerState + final_alpha_cumprod: jnp.ndarray + + # setable values + init_noise_sigma: jnp.ndarray + timesteps: jnp.ndarray + num_inference_steps: Optional[int] = None + + @classmethod + def create( + cls, + common: CommonSchedulerState, + final_alpha_cumprod: jnp.ndarray, + init_noise_sigma: jnp.ndarray, + timesteps: jnp.ndarray, + ): + return cls( + common=common, + final_alpha_cumprod=final_alpha_cumprod, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + +@dataclass +class FlaxDDIMSchedulerOutput(FlaxSchedulerOutput): + state: DDIMSchedulerState + + +class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising + diffusion probabilistic models (DDPMs) with non-Markovian guidance. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2010.02502 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`jnp.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + clip_sample (`bool`, default `True`): + option to clip predicted sample between for numerical stability. The clip range is determined by `clip_sample_range`. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, default `True`): + each diffusion step uses the value of alphas product at that step and at the previous one. For the final + step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the value of alpha at step 0. + steps_offset (`int`, default `0`): + An offset added to the inference steps, as required by some model families. 
+ prediction_type (`str`, default `epsilon`): + indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`. + `v-prediction` is not supported for this scheduler. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + the `dtype` used for params and computation. + """ + + _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers] + + dtype: jnp.dtype + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[jnp.ndarray] = None, + clip_sample: bool = True, + clip_sample_range: float = 1.0, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + dtype: jnp.dtype = jnp.float32, + ): + self.dtype = dtype + + def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDIMSchedulerState: + if common is None: + common = CommonSchedulerState.create(self) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + final_alpha_cumprod = ( + jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0] + ) + + # standard deviation of the initial noise distribution + init_noise_sigma = jnp.array(1.0, dtype=self.dtype) + + timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1] + + return DDIMSchedulerState.create( + common=common, + final_alpha_cumprod=final_alpha_cumprod, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + def scale_model_input( + self, state: DDIMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None + ) -> jnp.ndarray: + """ + Args: + state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance. + sample (`jnp.ndarray`): input sample + timestep (`int`, optional): current timestep + + Returns: + `jnp.ndarray`: scaled input sample + """ + return sample + + def set_timesteps( + self, state: DDIMSchedulerState, num_inference_steps: int, shape: Tuple = () + ) -> DDIMSchedulerState: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`DDIMSchedulerState`): + the `FlaxDDIMScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
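+
+        A minimal sketch of the functional state handling (assuming the JAX/Flax extras of `diffusers`
+        are installed):
+
+        ```py
+        >>> from diffusers import FlaxDDIMScheduler
+
+        >>> scheduler = FlaxDDIMScheduler(num_train_timesteps=1000)
+        >>> state = scheduler.create_state()
+        >>> state = scheduler.set_timesteps(state, num_inference_steps=50)
+        >>> state.timesteps.shape
+        (50,)
+        ```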
+ """ + step_ratio = self.config.num_train_timesteps // num_inference_steps + # creates integer timesteps by multiplying by ratio + # rounding to avoid issues when num_inference_step is power of 3 + timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1] + self.config.steps_offset + + return state.replace( + num_inference_steps=num_inference_steps, + timesteps=timesteps, + ) + + def _get_variance(self, state: DDIMSchedulerState, timestep, prev_timestep): + alpha_prod_t = state.common.alphas_cumprod[timestep] + alpha_prod_t_prev = jnp.where( + prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod + ) + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def step( + self, + state: DDIMSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + eta: float = 0.0, + return_dict: bool = True, + ) -> Union[FlaxDDIMSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + state (`DDIMSchedulerState`): the `FlaxDDIMScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than FlaxDDIMSchedulerOutput class + + Returns: + [`FlaxDDIMSchedulerOutput`] or `tuple`: [`FlaxDDIMSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. + + """ + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps + + alphas_cumprod = state.common.alphas_cumprod + final_alpha_cumprod = state.final_alpha_cumprod + + # 2. compute alphas, betas + alpha_prod_t = alphas_cumprod[timestep] + alpha_prod_t_prev = jnp.where(prev_timestep >= 0, alphas_cumprod[prev_timestep], final_alpha_cumprod) + + beta_prod_t = 1 - alpha_prod_t + + # 3. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.clip_sample: + pred_original_sample = pred_original_sample.clip( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._get_variance(state, timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon + + # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + if not return_dict: + return (prev_sample, state) + + return FlaxDDIMSchedulerOutput(prev_sample=prev_sample, state=state) + + def add_noise( + self, + state: DDIMSchedulerState, + original_samples: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + return add_noise_common(state.common, original_samples, noise, timesteps) + + def get_velocity( + self, + state: DDIMSchedulerState, + sample: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + return get_velocity_common(state.common, sample, noise, timesteps) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py new file mode 100644 index 000000000..b4c19e455 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -0,0 +1,374 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.schedulers.scheduling_utils import SchedulerMixin +from diffusers.utils import BaseOutput, deprecate + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. 
+ alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DDIMInverseScheduler(SchedulerMixin, ConfigMixin): + """ + `DDIMInverseScheduler` is the reverse scheduler of [`DDIMScheduler`]. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to 0, otherwise + it uses the alpha value at step `num_train_timesteps - 1`. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
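+
+    A minimal inversion sketch (the `unet(latents, t)` call below is a hypothetical stand-in for a trained
+    denoising model; the loop walks the timesteps in ascending order to map a clean latent back to noise):
+
+    ```py
+    >>> import torch
+    >>> from diffusers import DDIMInverseScheduler
+
+    >>> inverse_scheduler = DDIMInverseScheduler(num_train_timesteps=1000)
+    >>> inverse_scheduler.set_timesteps(50)
+    >>> latents = torch.randn(1, 4, 64, 64)  # a clean latent to invert
+    >>> for t in inverse_scheduler.timesteps:
+    ...     noise_pred = unet(latents, t)  # hypothetical model call
+    ...     latents = inverse_scheduler.step(noise_pred, t, latents).prev_sample
+    ```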
+ """ + + order = 1 + ignore_for_config = ["kwargs"] + _deprecated_kwargs = ["set_alpha_to_zero"] + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + **kwargs, + ): + if kwargs.get("set_alpha_to_zero", None) is not None: + deprecation_message = ( + "The `set_alpha_to_zero` argument is deprecated. Please use `set_alpha_to_one` instead." + ) + deprecate("set_alpha_to_zero", "1.0.0", deprecation_message, standard_warn=False) + set_alpha_to_one = kwargs["set_alpha_to_zero"] + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # At every step in inverted ddim, we are looking into the next alphas_cumprod + # For the initial step, there is no current alphas_cumprod, and the index is out of bounds + # `set_alpha_to_one` decides whether we set this parameter simply to one + # in this case, self.step() just output the predicted noise + # or whether we use the initial alpha used in training the diffusion model. + self.initial_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps).copy().astype(np.int64)) + + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. 
+ """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "leading" and "trailing" corresponds to annotation of Table 1. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round().copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)[::-1]).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." + ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + eta (`float`): + The weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`, defaults to `False`): + If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary + because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no + clipping has happened, "corrected" `model_output` would coincide with the one provided as input and + `use_clipped_model_output` has no effect. + variance_noise (`torch.FloatTensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`CycleDiffusion`]. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddim_inverse.DDIMInverseSchedulerOutput`] or + `tuple`. + + Returns: + [`~schedulers.scheduling_ddim_inverse.DDIMInverseSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_ddim_inverse.DDIMInverseSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + + """ + # 1. get previous step value (=t+1) + prev_timestep = timestep + timestep = min( + timestep - self.config.num_train_timesteps // self.num_inference_steps, self.config.num_train_timesteps - 1 + ) + + # 2. 
compute alphas, betas + # change original implementation to exactly match noise levels for analogous forward process + alpha_prod_t = self.alphas_cumprod[timestep] if timestep >= 0 else self.initial_alpha_cumprod + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 5. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev) ** (0.5) * pred_epsilon + + # 6. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + if not return_dict: + return (prev_sample, pred_original_sample) + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py new file mode 100644 index 000000000..225ed736c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -0,0 +1,645 @@ +# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput +class DDIMParallelSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. 
+ alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): + """ + Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising + diffusion probabilistic models (DDPMs) with non-Markovian guidance. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2010.02502 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + clip_sample (`bool`, default `True`): + option to clip predicted sample for numerical stability. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, default `True`): + each diffusion step uses the value of alphas product at that step and at the previous one. For the final + step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the value of alpha at step 0. + steps_offset (`int`, default `0`): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + thresholding (`bool`, default `False`): + whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). + Note that the thresholding method is unsuitable for latent-space diffusion models (such as + stable-diffusion). + dynamic_thresholding_ratio (`float`, default `0.995`): + the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen + (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. + sample_max_value (`float`, default `1.0`): + the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. 
+ rescale_betas_zero_snr (`bool`, default `False`): + whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf). + This can enable the model to generate very bright and dark samples instead of limiting it to samples with + medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + _is_ode_scheduler = True + + @register_to_config + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.__init__ + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + return sample + + def _get_variance(self, timestep, prev_timestep=None): + if prev_timestep is None: + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def _batch_get_variance(self, t, prev_t): + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)] + alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0) + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.set_timesteps + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891
+        if self.config.timestep_spacing == "linspace":
+            timesteps = (
+                np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps)
+                .round()[::-1]
+                .copy()
+                .astype(np.int64)
+            )
+        elif self.config.timestep_spacing == "leading":
+            step_ratio = self.config.num_train_timesteps // self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64)
+            timesteps += self.config.steps_offset
+        elif self.config.timestep_spacing == "trailing":
+            step_ratio = self.config.num_train_timesteps / self.num_inference_steps
+            # creates integer timesteps by multiplying by ratio
+            # casting to int to avoid issues when num_inference_step is power of 3
+            timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64)
+            timesteps -= 1
+        else:
+            raise ValueError(
+                f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'."
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: int,
+        sample: torch.FloatTensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.FloatTensor] = None,
+        return_dict: bool = True,
+    ) -> Union[DDIMParallelSchedulerOutput, Tuple]:
+        """
+        Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.FloatTensor`): direct output from learned diffusion model.
+            timestep (`int`): current discrete timestep in the diffusion chain.
+            sample (`torch.FloatTensor`):
+                current instance of sample being created by diffusion process.
+            eta (`float`): weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped
+                predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when
+                `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would
+                coincide with the one provided as input and `use_clipped_model_output` will have no effect.
+            generator: random number generator.
+            variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we
+                can directly provide the noise for the variance itself. This is useful for methods such as
+                CycleDiffusion. (https://arxiv.org/abs/2210.05559)
+            return_dict (`bool`): option for returning tuple rather than DDIMParallelSchedulerOutput class
+
+        Returns:
+            [`~schedulers.scheduling_utils.DDIMParallelSchedulerOutput`] or `tuple`:
+            [`~schedulers.scheduling_utils.DDIMParallelSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is the sample tensor.
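+
+        A minimal usage sketch (illustrative only; `unet` is assumed to be a callable returning the
+        predicted noise and `latents` the current noisy sample):
+
+        ```py
+        scheduler.set_timesteps(50)
+        for t in scheduler.timesteps:
+            noise_pred = unet(latents, t)
+            latents = scheduler.step(noise_pred, t, latents).prev_sample
+        ```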
+ + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + if use_clipped_model_output: + # the pred_epsilon is always re-derived from the clipped x_0 in Glide + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon + + # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + if eta > 0: + if variance_noise is not None and generator is not None: + raise ValueError( + "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" + " `variance_noise` stays `None`." 
+ ) + + if variance_noise is None: + variance_noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype + ) + variance = std_dev_t * variance_noise + + prev_sample = prev_sample + variance + + if not return_dict: + return (prev_sample,) + + return DDIMParallelSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def batch_step_no_noise( + self, + model_output: torch.FloatTensor, + timesteps: List[int], + sample: torch.FloatTensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + ) -> torch.FloatTensor: + """ + Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. + Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise + is pre-sampled by the pipeline. + + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timesteps (`List[int]`): + current discrete timesteps in the diffusion chain. This is now a list of integers. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + eta (`float`): weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped + predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when + `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would + coincide with the one provided as input and `use_clipped_model_output` will have not effect. + + Returns: + `torch.FloatTensor`: sample tensor at previous timestep. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + assert eta == 0.0 + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + t = timesteps + prev_t = t - self.config.num_train_timesteps // self.num_inference_steps + + t = t.view(-1, *([1] * (model_output.ndim - 1))) + prev_t = prev_t.view(-1, *([1] * (model_output.ndim - 1))) + + # 1. compute alphas, betas + self.alphas_cumprod = self.alphas_cumprod.to(model_output.device) + self.final_alpha_cumprod = self.final_alpha_cumprod.to(model_output.device) + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)] + alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0) + + beta_prod_t = 1 - alpha_prod_t + + # 3. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._batch_get_variance(t, prev_t).to(model_output.device).view(*alpha_prod_t_prev.shape) + std_dev_t = eta * variance ** (0.5) + + if use_clipped_model_output: + # the pred_epsilon is always re-derived from the clipped x_0 in Glide + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon + + # 7. 
compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + return prev_sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm.py new file mode 100644 index 000000000..e1f55a202 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm.py @@ -0,0 +1,562 @@ +# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +class DDPMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. 
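+    # After the shift above, the terminal value of alphas_bar_sqrt is exactly 0 (zero terminal SNR);
+    # the rescale below restores the first value to alphas_bar_sqrt_0 while keeping the last at 0.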
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DDPMScheduler(SchedulerMixin, ConfigMixin): + """ + `DDPMScheduler` explores the connections between denoising score matching and Langevin dynamics sampling. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`. + variance_type (`str`, defaults to `"fixed_small"`): + Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`, + `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
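+
+    A minimal construction sketch (illustrative only; `clean_images`, `noise` and `timesteps` are
+    assumed to be provided by the training loop):
+
+    ```py
+    scheduler = DDPMScheduler(num_train_timesteps=1000, beta_schedule="squaredcos_cap_v2")
+    noisy_images = scheduler.add_noise(clean_images, noise, timesteps)
+    ```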
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + variance_type: str = "fixed_small", + clip_sample: bool = True, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + steps_offset: int = 0, + rescale_betas_zero_snr: int = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + elif beta_schedule == "sigmoid": + # GeoDiff sigmoid schedule + betas = torch.linspace(-6, 6, num_train_timesteps) + self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + self.one = torch.tensor(1.0) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.custom_timesteps = False + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + + self.variance_type = variance_type + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed, + `num_inference_steps` must be `None`. 
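+
+        Illustrative calls (a minimal sketch, assuming an already constructed `scheduler`):
+
+        ```py
+        scheduler.set_timesteps(num_inference_steps=50)             # evenly spaced timesteps
+        scheduler.set_timesteps(timesteps=[999, 750, 500, 250, 0])  # custom, strictly descending
+        ```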
+ + """ + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + self.custom_timesteps = False + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + def _get_variance(self, t, predicted_variance=None, variance_type=None): + prev_t = self.previous_timestep(t) + + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one + current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + # and sample from it to get previous sample + # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample + variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t + + # we always take the log of variance, so clamp it to ensure it's not 0 + variance = torch.clamp(variance, min=1e-20) + + if variance_type is None: + variance_type = self.config.variance_type + + # hacks - were probably added for training stability + if variance_type == "fixed_small": + variance = variance + # for rl-diffuser https://arxiv.org/abs/2205.09991 + elif variance_type == "fixed_small_log": + variance = torch.log(variance) + variance = torch.exp(0.5 * variance) + elif variance_type == "fixed_large": + variance = current_beta_t + elif variance_type == "fixed_large_log": + # Glide max_log + variance = torch.log(current_beta_t) + elif variance_type == "learned": + return predicted_variance + elif variance_type == "learned_range": + min_log = torch.log(variance) + max_log = torch.log(current_beta_t) + frac = (predicted_variance + 1) / 2 + variance = frac * max_log + (1 - frac) * min_log + + return variance + + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + return_dict: bool = True, + ) -> Union[DDPMSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. 
This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + t = timestep + + prev_t = self.previous_timestep(t) + + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + current_alpha_t = alpha_prod_t / alpha_prod_t_prev + current_beta_t = 1 - current_alpha_t + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for the DDPMScheduler." + ) + + # 3. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + # 6. 
Add noise + variance = 0 + if t > 0: + device = model_output.device + variance_noise = randn_tensor( + model_output.shape, generator=generator, device=device, dtype=model_output.dtype + ) + if self.variance_type == "fixed_small_log": + variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise + elif self.variance_type == "learned_range": + variance = self._get_variance(t, predicted_variance=predicted_variance) + variance = torch.exp(0.5 * variance) * variance_noise + else: + variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise + + pred_prev_sample = pred_prev_sample + variance + + if not return_dict: + return (pred_prev_sample,) + + return DDPMSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps + + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py new file mode 100644 index 000000000..6bdfa5eb5 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py @@ -0,0 +1,299 @@ +# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax +import jax.numpy as jnp + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import ( + CommonSchedulerState, + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + add_noise_common, + get_velocity_common, +) + + +@flax.struct.dataclass +class DDPMSchedulerState: + common: CommonSchedulerState + + # setable values + init_noise_sigma: jnp.ndarray + timesteps: jnp.ndarray + num_inference_steps: Optional[int] = None + + @classmethod + def create(cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray): + return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps) + + +@dataclass +class FlaxDDPMSchedulerOutput(FlaxSchedulerOutput): + state: DDPMSchedulerState + + +class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and + Langevin dynamics sampling. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2006.11239 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + variance_type (`str`): + options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, + `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. 
+ clip_sample (`bool`, default `True`): + option to clip predicted sample between -1 and 1 for numerical stability. + prediction_type (`str`, default `epsilon`): + indicates whether the model predicts the noise (epsilon), or the samples. One of `epsilon`, `sample`. + `v-prediction` is not supported for this scheduler. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + the `dtype` used for params and computation. + """ + + _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers] + + dtype: jnp.dtype + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[jnp.ndarray] = None, + variance_type: str = "fixed_small", + clip_sample: bool = True, + prediction_type: str = "epsilon", + dtype: jnp.dtype = jnp.float32, + ): + self.dtype = dtype + + def create_state(self, common: Optional[CommonSchedulerState] = None) -> DDPMSchedulerState: + if common is None: + common = CommonSchedulerState.create(self) + + # standard deviation of the initial noise distribution + init_noise_sigma = jnp.array(1.0, dtype=self.dtype) + + timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1] + + return DDPMSchedulerState.create( + common=common, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + def scale_model_input( + self, state: DDPMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None + ) -> jnp.ndarray: + """ + Args: + state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance. + sample (`jnp.ndarray`): input sample + timestep (`int`, optional): current timestep + + Returns: + `jnp.ndarray`: scaled input sample + """ + return sample + + def set_timesteps( + self, state: DDPMSchedulerState, num_inference_steps: int, shape: Tuple = () + ) -> DDPMSchedulerState: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`DDIMSchedulerState`): + the `FlaxDDPMScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
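+            shape (`Tuple`, *optional*, defaults to `()`):
+                expected sample shape; not used by this implementation.
+
+        A minimal call sketch (illustrative only, assuming `scheduler = FlaxDDPMScheduler()`):
+
+        ```py
+        state = scheduler.create_state()
+        state = scheduler.set_timesteps(state, num_inference_steps=50)
+        ```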
+ """ + + step_ratio = self.config.num_train_timesteps // num_inference_steps + # creates integer timesteps by multiplying by ratio + # rounding to avoid issues when num_inference_step is power of 3 + timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1] + + return state.replace( + num_inference_steps=num_inference_steps, + timesteps=timesteps, + ) + + def _get_variance(self, state: DDPMSchedulerState, t, predicted_variance=None, variance_type=None): + alpha_prod_t = state.common.alphas_cumprod[t] + alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype)) + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + # and sample from it to get previous sample + # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample + variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * state.common.betas[t] + + if variance_type is None: + variance_type = self.config.variance_type + + # hacks - were probably added for training stability + if variance_type == "fixed_small": + variance = jnp.clip(variance, a_min=1e-20) + # for rl-diffuser https://arxiv.org/abs/2205.09991 + elif variance_type == "fixed_small_log": + variance = jnp.log(jnp.clip(variance, a_min=1e-20)) + elif variance_type == "fixed_large": + variance = state.common.betas[t] + elif variance_type == "fixed_large_log": + # Glide max_log + variance = jnp.log(state.common.betas[t]) + elif variance_type == "learned": + return predicted_variance + elif variance_type == "learned_range": + min_log = variance + max_log = state.common.betas[t] + frac = (predicted_variance + 1) / 2 + variance = frac * max_log + (1 - frac) * min_log + + return variance + + def step( + self, + state: DDPMSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + key: Optional[jax.Array] = None, + return_dict: bool = True, + ) -> Union[FlaxDDPMSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + state (`DDPMSchedulerState`): the `FlaxDDPMScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + key (`jax.Array`): a PRNG key. + return_dict (`bool`): option for returning tuple rather than FlaxDDPMSchedulerOutput class + + Returns: + [`FlaxDDPMSchedulerOutput`] or `tuple`: [`FlaxDDPMSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. + + """ + t = timestep + + if key is None: + key = jax.random.PRNGKey(0) + + if model_output.shape[1] == sample.shape[1] * 2 and self.config.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = jnp.split(model_output, sample.shape[1], axis=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + alpha_prod_t = state.common.alphas_cumprod[t] + alpha_prod_t_prev = jnp.where(t > 0, state.common.alphas_cumprod[t - 1], jnp.array(1.0, dtype=self.dtype)) + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + # 2. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` " + " for the FlaxDDPMScheduler." + ) + + # 3. Clip "predicted x_0" + if self.config.clip_sample: + pred_original_sample = jnp.clip(pred_original_sample, -1, 1) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * state.common.betas[t]) / beta_prod_t + current_sample_coeff = state.common.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + # 6. Add noise + def random_variance(): + split_key = jax.random.split(key, num=1) + noise = jax.random.normal(split_key, shape=model_output.shape, dtype=self.dtype) + return (self._get_variance(state, t, predicted_variance=predicted_variance) ** 0.5) * noise + + variance = jnp.where(t > 0, random_variance(), jnp.zeros(model_output.shape, dtype=self.dtype)) + + pred_prev_sample = pred_prev_sample + variance + + if not return_dict: + return (pred_prev_sample, state) + + return FlaxDDPMSchedulerOutput(prev_sample=pred_prev_sample, state=state) + + def add_noise( + self, + state: DDPMSchedulerState, + original_samples: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + return add_noise_common(state.common, original_samples, noise, timesteps) + + def get_velocity( + self, + state: DDPMSchedulerState, + sample: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + return get_velocity_common(state.common, sample, noise, timesteps) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py new file mode 100644 index 000000000..ec4fbd4eb --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -0,0 +1,653 @@ +# Copyright 2024 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
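+
+# This module defines `DDPMParallelScheduler`, the DDPM counterpart of the parallel DDIM scheduler
+# above, intended for the same parallel (ParaDiGMS-style) sampling pipelines.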
+ +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput +class DDPMParallelSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. 
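+    # Subtracting the terminal value alphas_bar_sqrt_T drives the last entry to exactly 0, which is
+    # what makes the terminal SNR zero.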
+ alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): + """ + Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and + Langevin dynamics sampling. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2006.11239 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, `squaredcos_cap_v2` or `sigmoid`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + variance_type (`str`): + options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, + `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. + clip_sample (`bool`, default `True`): + option to clip predicted sample for numerical stability. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + thresholding (`bool`, default `False`): + whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). + Note that the thresholding method is unsuitable for latent-space diffusion models (such as + stable-diffusion). + dynamic_thresholding_ratio (`float`, default `0.995`): + the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen + (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. + sample_max_value (`float`, default `1.0`): + the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + An offset added to the inference steps, as required by some model families. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. 
This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + _is_ode_scheduler = False + + @register_to_config + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.__init__ + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + variance_type: str = "fixed_small", + clip_sample: bool = True, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + steps_offset: int = 0, + rescale_betas_zero_snr: int = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + elif beta_schedule == "sigmoid": + # GeoDiff sigmoid schedule + betas = torch.linspace(-6, 6, num_train_timesteps) + self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + self.one = torch.tensor(1.0) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.custom_timesteps = False + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + + self.variance_type = variance_type + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.scale_model_input + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.set_timesteps + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. 
+ device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If `timesteps` is passed, + `num_inference_steps` must be `None`. + + """ + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + self.custom_timesteps = False + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance + def _get_variance(self, t, predicted_variance=None, variance_type=None): + prev_t = self.previous_timestep(t) + + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one + current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + # and sample from it to get previous sample + # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample + variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t + + # we always take the log of variance, so clamp it to ensure it's not 0 + variance = torch.clamp(variance, min=1e-20) + + if variance_type is None: + variance_type = self.config.variance_type + + # hacks - were probably added for training stability + if variance_type == "fixed_small": + variance = variance + # for rl-diffuser https://arxiv.org/abs/2205.09991 + elif variance_type == "fixed_small_log": + variance = torch.log(variance) + variance = torch.exp(0.5 * variance) + elif variance_type == "fixed_large": + variance = current_beta_t + elif variance_type == "fixed_large_log": + # Glide max_log + variance = torch.log(current_beta_t) + elif variance_type == "learned": + return predicted_variance + elif variance_type == "learned_range": + min_log = torch.log(variance) + max_log = torch.log(current_beta_t) + frac = (predicted_variance + 1) / 2 + variance = frac * max_log + (1 - frac) * min_log + + return variance + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
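To make the `variance_type` branches of `_get_variance` above concrete, here is a minimal recomputation of the "fixed_small" posterior variance and the "fixed_large" alternative for a single timestep of a linear schedule. This is an illustrative sketch, not code from the patch.

import torch

betas = torch.linspace(1e-4, 0.02, 1000)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

t, prev_t = 500, 499
alpha_prod_t, alpha_prod_t_prev = alphas_cumprod[t], alphas_cumprod[prev_t]
current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev

# "fixed_small": posterior variance from Eq. (7) of the DDPM paper
var_small = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t
# "fixed_large": the forward-process beta_t itself
var_large = current_beta_t

print(var_small.item(), var_large.item())   # fixed_small is never larger than fixed_large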
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + return_dict: bool = True, + ) -> Union[DDPMParallelSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than DDPMParallelSchedulerOutput class + + Returns: + [`~schedulers.scheduling_utils.DDPMParallelSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.DDPMParallelSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is the sample tensor. + + """ + t = timestep + + prev_t = self.previous_timestep(t) + + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + current_alpha_t = alpha_prod_t / alpha_prod_t_prev + current_beta_t = 1 - current_alpha_t + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for the DDPMScheduler." + ) + + # 3. 
Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + # 6. Add noise + variance = 0 + if t > 0: + device = model_output.device + variance_noise = randn_tensor( + model_output.shape, generator=generator, device=device, dtype=model_output.dtype + ) + if self.variance_type == "fixed_small_log": + variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise + elif self.variance_type == "learned_range": + variance = self._get_variance(t, predicted_variance=predicted_variance) + variance = torch.exp(0.5 * variance) * variance_noise + else: + variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise + + pred_prev_sample = pred_prev_sample + variance + + if not return_dict: + return (pred_prev_sample,) + + return DDPMParallelSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + def batch_step_no_noise( + self, + model_output: torch.FloatTensor, + timesteps: List[int], + sample: torch.FloatTensor, + ) -> torch.FloatTensor: + """ + Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. + Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise + is pre-sampled by the pipeline. + + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timesteps (`List[int]`): + current discrete timesteps in the diffusion chain. This is now a list of integers. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + + Returns: + `torch.FloatTensor`: sample tensor at previous timestep. + """ + t = timesteps + num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + prev_t = t - self.config.num_train_timesteps // num_inference_steps + + t = t.view(-1, *([1] * (model_output.ndim - 1))) + prev_t = prev_t.view(-1, *([1] * (model_output.ndim - 1))) + + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + pass + + # 1. 
compute alphas, betas + self.alphas_cumprod = self.alphas_cumprod.to(model_output.device) + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)] + alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0) + + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + current_alpha_t = alpha_prod_t / alpha_prod_t_prev + current_beta_t = 1 - current_alpha_t + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for the DDPMParallelScheduler." + ) + + # 3. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + return pred_prev_sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + 
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py new file mode 100644 index 000000000..ad4e4f414 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py @@ -0,0 +1,230 @@ +# Copyright (c) 2022 Pablo Pernías MIT License +# Copyright 2024 UC Berkeley Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +@dataclass +class DDPMWuerstchenSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. 
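For context, this is a self-contained restatement of what the `add_noise` / `get_velocity` pair shown a little earlier computes for a toy batch; the per-dimension unsqueeze loops in the scheduler reduce to a single broadcast view here. Illustrative sketch with local names, not part of the patch.

import torch

betas = torch.linspace(1e-4, 0.02, 1000)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x0 = torch.randn(4, 3, 8, 8)                     # toy "clean" batch
noise = torch.randn_like(x0)
t = torch.randint(0, 1000, (4,))

a = alphas_cumprod[t].sqrt().view(-1, 1, 1, 1)   # sqrt(alpha_bar_t), broadcast over C, H, W
s = (1 - alphas_cumprod[t]).sqrt().view(-1, 1, 1, 1)

noisy = a * x0 + s * noise                       # what add_noise() returns
v_target = a * noise - s * x0                    # what get_velocity() returns (v-prediction target)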
+ """ + + prev_sample: torch.FloatTensor + + +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class DDPMWuerstchenScheduler(SchedulerMixin, ConfigMixin): + """ + Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and + Langevin dynamics sampling. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2006.11239 + + Args: + scaler (`float`): .... + s (`float`): .... + """ + + @register_to_config + def __init__( + self, + scaler: float = 1.0, + s: float = 0.008, + ): + self.scaler = scaler + self.s = torch.tensor([s]) + self._init_alpha_cumprod = torch.cos(self.s / (1 + self.s) * torch.pi * 0.5) ** 2 + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + def _alpha_cumprod(self, t, device): + if self.scaler > 1: + t = 1 - (1 - t) ** self.scaler + elif self.scaler < 1: + t = t**self.scaler + alpha_cumprod = torch.cos( + (t + self.s.to(device)) / (1 + self.s.to(device)) * torch.pi * 0.5 + ) ** 2 / self._init_alpha_cumprod.to(device) + return alpha_cumprod.clamp(0.0001, 0.9999) + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): input sample + timestep (`int`, optional): current timestep + + Returns: + `torch.FloatTensor`: scaled input sample + """ + return sample + + def set_timesteps( + self, + num_inference_steps: int = None, + timesteps: Optional[List[int]] = None, + device: Union[str, torch.device] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. 
+ + Args: + num_inference_steps (`Dict[float, int]`): + the number of diffusion steps used when generating samples with a pre-trained model. If passed, then + `timesteps` must be `None`. + device (`str` or `torch.device`, optional): + the device to which the timesteps are moved to. {2 / 3: 20, 0.0: 10} + """ + if timesteps is None: + timesteps = torch.linspace(1.0, 0.0, num_inference_steps + 1, device=device) + if not isinstance(timesteps, torch.Tensor): + timesteps = torch.Tensor(timesteps).to(device) + self.timesteps = timesteps + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + return_dict: bool = True, + ) -> Union[DDPMWuerstchenSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than DDPMWuerstchenSchedulerOutput class + + Returns: + [`DDPMWuerstchenSchedulerOutput`] or `tuple`: [`DDPMWuerstchenSchedulerOutput`] if `return_dict` is True, + otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. + + """ + dtype = model_output.dtype + device = model_output.device + t = timestep + + prev_t = self.previous_timestep(t) + + alpha_cumprod = self._alpha_cumprod(t, device).view(t.size(0), *[1 for _ in sample.shape[1:]]) + alpha_cumprod_prev = self._alpha_cumprod(prev_t, device).view(prev_t.size(0), *[1 for _ in sample.shape[1:]]) + alpha = alpha_cumprod / alpha_cumprod_prev + + mu = (1.0 / alpha).sqrt() * (sample - (1 - alpha) * model_output / (1 - alpha_cumprod).sqrt()) + + std_noise = randn_tensor(mu.shape, generator=generator, device=model_output.device, dtype=model_output.dtype) + std = ((1 - alpha) * (1.0 - alpha_cumprod_prev) / (1.0 - alpha_cumprod)).sqrt() * std_noise + pred = mu + std * (prev_t != 0).float().view(prev_t.size(0), *[1 for _ in sample.shape[1:]]) + + if not return_dict: + return (pred.to(dtype),) + + return DDPMWuerstchenSchedulerOutput(prev_sample=pred.to(dtype)) + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + device = original_samples.device + dtype = original_samples.dtype + alpha_cumprod = self._alpha_cumprod(timesteps, device=device).view( + timesteps.size(0), *[1 for _ in original_samples.shape[1:]] + ) + noisy_samples = alpha_cumprod.sqrt() * original_samples + (1 - alpha_cumprod).sqrt() * noise + return noisy_samples.to(dtype=dtype) + + def __len__(self): + return self.config.num_train_timesteps + + def previous_timestep(self, timestep): + index = (self.timesteps - timestep[0]).abs().argmin().item() + prev_t = self.timesteps[index + 1][None].expand(timestep.shape[0]) + return prev_t diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py new file mode 100644 index 000000000..a4af0c272 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -0,0 +1,786 @@ +# Copyright 2024 FLAIR Lab and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: check https://arxiv.org/abs/2204.13902 and https://github.com/qsh-zh/deis for more info +# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import deprecate +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): + """ + `DEISMultistepScheduler` is a fast high order solver for diffusion ordinary differential equations (ODEs). + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. 
+ beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + solver_order (`int`, defaults to 2): + The DEIS order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. + prediction_type (`str`, defaults to `epsilon`): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + algorithm_type (`str`, defaults to `deis`): + The algorithm type for the solver. + lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[np.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "deis", + solver_type: str = "logrho", + lower_order_final: bool = True, + use_karras_sigmas: Optional[bool] = False, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + # Currently we only support VP-type noise schedule + self.alpha_t = torch.sqrt(self.alphas_cumprod) + self.sigma_t = torch.sqrt(1 - self.alphas_cumprod) + self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t) + self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5 + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # settings for DEIS + if algorithm_type not in ["deis"]: + if algorithm_type in ["dpmsolver", "dpmsolver++"]: + self.register_to_config(algorithm_type="deis") + else: + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + + if solver_type not in ["logrho"]: + if solver_type in ["midpoint", "heun", "bh1", "bh2"]: + self.register_to_config(solver_type="logrho") + else: + raise NotImplementedError(f"solver type {solver_type} does is not implemented for {self.__class__}") + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps) + self.model_outputs = [None] * solver_order + self.lower_order_nums = 0 + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(self.config.num_train_timesteps, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: + log_sigmas = np.log(sigmas) + sigmas = np.flip(sigmas).copy() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32) + else: + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
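To visualize the `timestep_spacing` conventions used by `set_timesteps` above, the following standalone sketch (not taken from the patch) evaluates the three strategies for 1000 training steps and 10 inference steps; note the `+ 1` and the dropped final entry in this scheduler's "linspace" and "leading" branches.

import numpy as np

num_train_timesteps, num_inference_steps, steps_offset = 1000, 10, 0

# "linspace": evenly spaced, then the final (t = 0) entry is dropped
linspace = np.linspace(0, num_train_timesteps - 1, num_inference_steps + 1).round()[::-1][:-1].astype(np.int64)

# "leading": integer stride from 0, shifted by steps_offset
step_ratio = num_train_timesteps // (num_inference_steps + 1)
leading = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].astype(np.int64) + steps_offset

# "trailing": walk back from num_train_timesteps in fractional strides
step_ratio = num_train_timesteps / num_inference_steps
trailing = np.arange(num_train_timesteps, 0, -step_ratio).round().astype(np.int64) - 1

print("linspace:", linspace)
print("leading: ", leading)
print("trailing:", trailing)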
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = 1 / ((sigma**2 + 1) ** 0.5) + sigma_t = sigma * alpha_t + + return alpha_t, sigma_t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def convert_model_output( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + Convert the model output to the corresponding type the DEIS algorithm needs. + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. 
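As an aside, the Karras et al. (2022) sigma schedule produced by `_convert_to_karras` above is a rho-power interpolation between `sigma_max` and `sigma_min`. A minimal sketch with made-up boundary values (not taken from the patch):

import numpy as np

sigma_min, sigma_max, num_inference_steps, rho = 0.03, 14.6, 10, 7.0

ramp = np.linspace(0, 1, num_inference_steps)
min_inv_rho = sigma_min ** (1 / rho)
max_inv_rho = sigma_max ** (1 / rho)
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho   # descending, denser near sigma_min

# The VP-style (alpha_t, sigma_t) split used throughout this file:
alpha_t = 1 / np.sqrt(sigmas**2 + 1)
sigma_t = sigmas * alpha_t
print(np.round(sigmas, 3))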
+ + Returns: + `torch.FloatTensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + if self.config.prediction_type == "epsilon": + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DEISMultistepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + if self.config.algorithm_type == "deis": + return (sample - alpha_t * x0_pred) / sigma_t + else: + raise NotImplementedError("only support log-rho multistep deis now") + + def deis_first_order_update( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the first-order DEIS (equivalent to DDIM). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + if self.config.algorithm_type == "deis": + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output + else: + raise NotImplementedError("only support log-rho multistep deis now") + return x_t + + def multistep_deis_second_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the second-order multistep DEIS. 
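For reference, the first-order update just defined (`deis_first_order_update`, equivalent to DDIM) can be written down standalone in the sigma parametrization used by this file. An illustrative sketch with local names and toy sigmas; `eps` stands for the converted model output.

import torch

def deis_first_order(eps, sample, sigma_s, sigma_t):
    # alpha/sigma split and log-SNR difference h, as in the scheduler
    alpha_s = 1 / (sigma_s**2 + 1) ** 0.5
    alpha_t = 1 / (sigma_t**2 + 1) ** 0.5
    sig_s, sig_t = sigma_s * alpha_s, sigma_t * alpha_t
    h = (torch.log(alpha_t) - torch.log(sig_t)) - (torch.log(alpha_s) - torch.log(sig_s))
    return (alpha_t / alpha_s) * sample - sig_t * (torch.exp(h) - 1.0) * eps

x_s = torch.randn(1, 3, 8, 8)
eps = torch.randn_like(x_s)
x_t = deis_first_order(eps, x_s, sigma_s=torch.tensor(2.0), sigma_t=torch.tensor(1.0))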
+ + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + rho_t, rho_s0, rho_s1 = sigma_t / alpha_t, sigma_s0 / alpha_s0, sigma_s1 / alpha_s1 + + if self.config.algorithm_type == "deis": + + def ind_fn(t, b, c): + # Integrate[(log(t) - log(c)) / (log(b) - log(c)), {t}] + return t * (-np.log(c) + np.log(t) - 1) / (np.log(b) - np.log(c)) + + coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1) + coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0) + + x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1) + return x_t + else: + raise NotImplementedError("only support log-rho multistep deis now") + + def multistep_deis_third_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the third-order multistep DEIS. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
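The second-order branch above weights the last two converted model outputs with coefficients obtained from a closed-form integral in log-rho space; the following toy sketch (not from the patch) evaluates those coefficients for made-up noise-to-signal ratios.

import numpy as np

def ind_fn(t, b, c):
    # Antiderivative of (log(t) - log(c)) / (log(b) - log(c)), as used by the
    # log-rho second-order DEIS update.
    return t * (-np.log(c) + np.log(t) - 1) / (np.log(b) - np.log(c))

rho_t, rho_s0, rho_s1 = 0.5, 1.0, 2.0   # toy sigma/alpha ratios, decreasing over time
coef1 = ind_fn(rho_t, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s0, rho_s1)
coef2 = ind_fn(rho_t, rho_s1, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s0)
print(coef1, coef2)   # weights for the most recent and the previous model output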
+ """ + + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing`sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + self.sigmas[self.step_index - 2], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2) + + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] + + rho_t, rho_s0, rho_s1, rho_s2 = ( + sigma_t / alpha_t, + sigma_s0 / alpha_s0, + sigma_s1 / alpha_s1, + sigma_s2 / alpha_s2, + ) + + if self.config.algorithm_type == "deis": + + def ind_fn(t, b, c, d): + # Integrate[(log(t) - log(c))(log(t) - log(d)) / (log(b) - log(c))(log(b) - log(d)), {t}] + numerator = t * ( + np.log(c) * (np.log(d) - np.log(t) + 1) + - np.log(d) * np.log(t) + + np.log(d) + + np.log(t) ** 2 + - 2 * np.log(t) + + 2 + ) + denominator = (np.log(b) - np.log(c)) * (np.log(b) - np.log(d)) + return numerator / denominator + + coef1 = ind_fn(rho_t, rho_s0, rho_s1, rho_s2) - ind_fn(rho_s0, rho_s0, rho_s1, rho_s2) + coef2 = ind_fn(rho_t, rho_s1, rho_s2, rho_s0) - ind_fn(rho_s0, rho_s1, rho_s2, rho_s0) + coef3 = ind_fn(rho_t, rho_s2, rho_s0, rho_s1) - ind_fn(rho_s0, rho_s2, rho_s0, rho_s1) + + x_t = alpha_t * (sample / alpha_s0 + coef1 * m0 + coef2 * m1 + coef3 * m2) + + return x_t + else: + raise NotImplementedError("only support log-rho multistep deis now") + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + index_candidates = (schedule_timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. 
+ """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the multistep DEIS. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + lower_order_final = ( + (self.step_index == len(self.timesteps) - 1) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + lower_order_second = ( + (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + + model_output = self.convert_model_output(model_output, sample=sample) + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.model_outputs[-1] = model_output + + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.deis_first_order_update(model_output, sample=sample) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: + prev_sample = self.multistep_deis_second_order_update(self.model_outputs, sample=sample) + else: + prev_sample = self.multistep_deis_third_order_update(self.model_outputs, sample=sample) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + return sample + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py new file mode 100644 index 000000000..3bbfc65e2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -0,0 +1,1029 @@ +# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import deprecate +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. 
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): + """ + `DPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + solver_order (`int`, defaults to 2): + The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). 
+ thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++"`. + algorithm_type (`str`, defaults to `dpmsolver++`): + Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The + `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) + paper, and the `dpmsolver++` type implements the algorithms in the + [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or + `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + solver_type (`str`, defaults to `midpoint`): + Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the + sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. + lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + euler_at_final (`bool`, defaults to `False`): + Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail + richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference + steps, but sometimes may result in blurring. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + use_lu_lambdas (`bool`, *optional*, defaults to `False`): + Whether to use the uniform-logSNR for step sizes proposed by Lu's DPM-Solver in the noise schedule during + the sampling process. If `True`, the sigmas and time steps are determined according to a sequence of + `lambda(t)`. + final_sigmas_type (`str`, defaults to `"zero"`): + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma + is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + lambda_min_clipped (`float`, defaults to `-inf`): + Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the + cosine (`squaredcos_cap_v2`) noise schedule. + variance_type (`str`, *optional*): + Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output + contains the predicted Gaussian variance. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. 
This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + euler_at_final: bool = False, + use_karras_sigmas: Optional[bool] = False, + use_lu_lambdas: Optional[bool] = False, + final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min" + lambda_min_clipped: float = -float("inf"), + variance_type: Optional[str] = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + rescale_betas_zero_snr: bool = False, + ): + if algorithm_type in ["dpmsolver", "sde-dpmsolver"]: + deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead" + deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message) + + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
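+            # Spacing the betas linearly in sqrt space and then squaring keeps the early betas
+            # small, so less noise is added per step at the start of the forward process.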
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + if rescale_betas_zero_snr: + # Close to 0 without being 0 so first sigma is not inf + # FP16 smallest positive subnormal works well here + self.alphas_cumprod[-1] = 2**-24 + + # Currently we only support VP-type noise schedule + self.alpha_t = torch.sqrt(self.alphas_cumprod) + self.sigma_t = torch.sqrt(1 - self.alphas_cumprod) + self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t) + self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5 + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # settings for DPM-Solver + if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]: + if algorithm_type == "deis": + self.register_to_config(algorithm_type="dpmsolver++") + else: + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + + if solver_type not in ["midpoint", "heun"]: + if solver_type in ["logrho", "bh1", "bh2"]: + self.register_to_config(solver_type="midpoint") + else: + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + + if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero": + raise ValueError( + f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead." + ) + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps) + self.model_outputs = [None] * solver_order + self.lower_order_nums = 0 + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + # Clipping the minimum of all lambda(t) for numerical stability. + # This is critical for cosine (squaredcos_cap_v2) noise schedule. 
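+        # lambda(t) decreases with t, so searching the flipped (ascending) array counts how many of
+        # the noisiest timesteps fall below `lambda_min_clipped`; those timesteps are dropped below.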
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) + last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item() + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = last_timestep // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + + if self.config.use_karras_sigmas: + sigmas = np.flip(sigmas).copy() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + elif self.config.use_lu_lambdas: + lambdas = np.flip(log_sigmas.copy()) + lambdas = self._convert_to_lu(in_lambdas=lambdas, num_inference_steps=num_inference_steps) + sigmas = np.exp(lambdas) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + else: + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}" + ) + + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. 
We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = 1 / ((sigma**2 + 1) ** 0.5) + sigma_t = sigma * alpha_t + + return alpha_t, sigma_t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def _convert_to_lu(self, in_lambdas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Lu et al. 
(2022).""" + + lambda_min: float = in_lambdas[-1].item() + lambda_max: float = in_lambdas[0].item() + + rho = 1.0 # 1.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = lambda_min ** (1 / rho) + max_inv_rho = lambda_max ** (1 / rho) + lambdas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return lambdas + + def convert_model_output( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is + designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an + integral of the data prediction model. + + + + The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise + prediction and data prediction models. + + + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + # DPM-Solver++ needs to solve an integral of the data prediction model. + if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]: + if self.config.prediction_type == "epsilon": + # DPM-Solver and DPM-Solver++ only need the "mean" output. + if self.config.variance_type in ["learned", "learned_range"]: + model_output = model_output[:, :3] + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DPMSolverMultistepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + + # DPM-Solver needs to solve an integral of the noise prediction model. + elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]: + if self.config.prediction_type == "epsilon": + # DPM-Solver and DPM-Solver++ only need the "mean" output. 
+ if self.config.variance_type in ["learned", "learned_range"]: + epsilon = model_output[:, :3] + else: + epsilon = model_output + elif self.config.prediction_type == "sample": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + epsilon = (sample - alpha_t * model_output) / sigma_t + elif self.config.prediction_type == "v_prediction": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + epsilon = alpha_t * model_output + sigma_t * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DPMSolverMultistepScheduler." + ) + + if self.config.thresholding: + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = (sample - sigma_t * epsilon) / alpha_t + x0_pred = self._threshold_sample(x0_pred) + epsilon = (sample - alpha_t * x0_pred) / sigma_t + + return epsilon + + def dpm_solver_first_order_update( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + noise: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the first-order DPMSolver (equivalent to DDIM). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + if self.config.algorithm_type == "dpmsolver++": + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output + elif self.config.algorithm_type == "dpmsolver": + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + x_t = ( + (sigma_t / sigma_s * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.algorithm_type == "sde-dpmsolver": + assert noise is not None + x_t = ( + (alpha_t / alpha_s) * sample + - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise + ) + return x_t + + def multistep_dpm_solver_second_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + 
sample: torch.FloatTensor = None, + noise: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the second-order multistep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2211.01095 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + ) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.algorithm_type == 
"sde-dpmsolver": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s0) * sample + - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * (torch.exp(h) - 1.0)) * D1 + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s0) * sample + - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0 + - 2.0 * (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise + ) + return x_t + + def multistep_dpm_solver_third_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the third-order multistep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing`sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + self.sigmas[self.step_index - 2], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2) + + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] + + h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 + r0, r1 = h_0 / h, h_1 / h + D0 = m0 + D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) + D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) + D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) + return x_t + + def 
index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + index_candidates = (schedule_timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + return step_index + + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + variance_noise: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the multistep DPMSolver. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + variance_noise (`torch.FloatTensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`LEdits++`]. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. 
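+
+        Example (an illustrative sketch rather than upstream documentation; `unet` is a placeholder
+        for any noise-prediction model and is not defined in this file):
+
+            scheduler = DPMSolverMultistepScheduler(algorithm_type="dpmsolver++", solver_order=2)
+            scheduler.set_timesteps(num_inference_steps=20)
+            sample = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
+            for t in scheduler.timesteps:
+                noise_pred = unet(sample, t)  # hypothetical epsilon predictor
+                sample = scheduler.step(noise_pred, t, sample).prev_sample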
+ + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Improve numerical stability for small number of steps + lower_order_final = (self.step_index == len(self.timesteps) - 1) and ( + self.config.euler_at_final + or (self.config.lower_order_final and len(self.timesteps) < 15) + or self.config.final_sigmas_type == "zero" + ) + lower_order_second = ( + (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + + model_output = self.convert_model_output(model_output, sample=sample) + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.model_outputs[-1] = model_output + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None: + noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=torch.float32 + ) + elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]: + noise = variance_noise.to(device=model_output.device, dtype=torch.float32) + else: + noise = None + + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: + prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise) + else: + prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + # Cast sample back to expected dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + return sample + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py new file mode 100644 index 000000000..0b48b499d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py @@ -0,0 +1,643 @@ +# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import flax +import jax +import jax.numpy as jnp + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import ( + CommonSchedulerState, + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + add_noise_common, +) + + +@flax.struct.dataclass +class DPMSolverMultistepSchedulerState: + common: CommonSchedulerState + alpha_t: jnp.ndarray + sigma_t: jnp.ndarray + lambda_t: jnp.ndarray + + # setable values + init_noise_sigma: jnp.ndarray + timesteps: jnp.ndarray + num_inference_steps: Optional[int] = None + + # running values + model_outputs: Optional[jnp.ndarray] = None + lower_order_nums: Optional[jnp.int32] = None + prev_timestep: Optional[jnp.int32] = None + cur_sample: Optional[jnp.ndarray] = None + + @classmethod + def create( + cls, + common: CommonSchedulerState, + alpha_t: jnp.ndarray, + sigma_t: jnp.ndarray, + lambda_t: jnp.ndarray, + init_noise_sigma: jnp.ndarray, + timesteps: jnp.ndarray, + ): + return cls( + common=common, + alpha_t=alpha_t, + sigma_t=sigma_t, + lambda_t=lambda_t, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + +@dataclass +class FlaxDPMSolverMultistepSchedulerOutput(FlaxSchedulerOutput): + state: DPMSolverMultistepSchedulerState + + +class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + DPM-Solver (and the improved version DPM-Solver++) is a fast dedicated high-order solver for diffusion ODEs with + the convergence order guarantee. Empirically, sampling by DPM-Solver with only 20 steps can generate high-quality + samples, and it can generate quite good samples even in only 10 steps. + + For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095 + + Currently, we support the multistep DPM-Solver for both noise prediction models and data prediction models. We + recommend to use `solver_order=2` for guided sampling, and `solver_order=3` for unconditional sampling. + + We also support the "dynamic thresholding" method in Imagen (https://arxiv.org/abs/2205.11487). For pixel-space + diffusion models, you can set both `algorithm_type="dpmsolver++"` and `thresholding=True` to use the dynamic + thresholding. Note that the thresholding method is unsuitable for latent-space diffusion models (such as + stable-diffusion). + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2206.00927 and https://arxiv.org/abs/2211.01095 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. 
+ trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + solver_order (`int`, default `2`): + the order of DPM-Solver; can be `1` or `2` or `3`. We recommend to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. + prediction_type (`str`, default `epsilon`): + indicates whether the model predicts the noise (epsilon), or the data / `x0`. One of `epsilon`, `sample`, + or `v-prediction`. + thresholding (`bool`, default `False`): + whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). + For pixel-space diffusion models, you can set both `algorithm_type=dpmsolver++` and `thresholding=True` to + use the dynamic thresholding. Note that the thresholding method is unsuitable for latent-space diffusion + models (such as stable-diffusion). + dynamic_thresholding_ratio (`float`, default `0.995`): + the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen + (https://arxiv.org/abs/2205.11487). + sample_max_value (`float`, default `1.0`): + the threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++`. + algorithm_type (`str`, default `dpmsolver++`): + the algorithm type for the solver. Either `dpmsolver` or `dpmsolver++`. The `dpmsolver` type implements the + algorithms in https://arxiv.org/abs/2206.00927, and the `dpmsolver++` type implements the algorithms in + https://arxiv.org/abs/2211.01095. We recommend to use `dpmsolver++` with `solver_order=2` for guided + sampling (e.g. stable-diffusion). + solver_type (`str`, default `midpoint`): + the solver type for the second-order solver. Either `midpoint` or `heun`. The solver type slightly affects + the sample quality, especially for small number of steps. We empirically find that `midpoint` solvers are + slightly better, so we recommend to use the `midpoint` type. + lower_order_final (`bool`, default `True`): + whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically + find this trick can stabilize the sampling of DPM-Solver for steps < 15, especially for steps <= 10. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + the `dtype` used for params and computation. 
+ """ + + _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers] + + dtype: jnp.dtype + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[jnp.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + timestep_spacing: str = "linspace", + dtype: jnp.dtype = jnp.float32, + ): + self.dtype = dtype + + def create_state(self, common: Optional[CommonSchedulerState] = None) -> DPMSolverMultistepSchedulerState: + if common is None: + common = CommonSchedulerState.create(self) + + # Currently we only support VP-type noise schedule + alpha_t = jnp.sqrt(common.alphas_cumprod) + sigma_t = jnp.sqrt(1 - common.alphas_cumprod) + lambda_t = jnp.log(alpha_t) - jnp.log(sigma_t) + + # settings for DPM-Solver + if self.config.algorithm_type not in ["dpmsolver", "dpmsolver++"]: + raise NotImplementedError(f"{self.config.algorithm_type} does is not implemented for {self.__class__}") + if self.config.solver_type not in ["midpoint", "heun"]: + raise NotImplementedError(f"{self.config.solver_type} does is not implemented for {self.__class__}") + + # standard deviation of the initial noise distribution + init_noise_sigma = jnp.array(1.0, dtype=self.dtype) + + timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1] + + return DPMSolverMultistepSchedulerState.create( + common=common, + alpha_t=alpha_t, + sigma_t=sigma_t, + lambda_t=lambda_t, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + def set_timesteps( + self, state: DPMSolverMultistepSchedulerState, num_inference_steps: int, shape: Tuple + ) -> DPMSolverMultistepSchedulerState: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`DPMSolverMultistepSchedulerState`): + the `FlaxDPMSolverMultistepScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + shape (`Tuple`): + the shape of the samples to be generated. + """ + last_timestep = self.config.num_train_timesteps + if self.config.timestep_spacing == "linspace": + timesteps = ( + jnp.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].astype(jnp.int32) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = last_timestep // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = ( + (jnp.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(jnp.int32) + ) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = jnp.arange(last_timestep, 0, -step_ratio).round().copy().astype(jnp.int32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + + # initial running values + + model_outputs = jnp.zeros((self.config.solver_order,) + shape, dtype=self.dtype) + lower_order_nums = jnp.int32(0) + prev_timestep = jnp.int32(-1) + cur_sample = jnp.zeros(shape, dtype=self.dtype) + + return state.replace( + num_inference_steps=num_inference_steps, + timesteps=timesteps, + model_outputs=model_outputs, + lower_order_nums=lower_order_nums, + prev_timestep=prev_timestep, + cur_sample=cur_sample, + ) + + def convert_model_output( + self, + state: DPMSolverMultistepSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + ) -> jnp.ndarray: + """ + Convert the model output to the corresponding type that the algorithm (DPM-Solver / DPM-Solver++) needs. + + DPM-Solver is designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to + discretize an integral of the data prediction model. So we need to first convert the model output to the + corresponding type to match the algorithm. + + Note that the algorithm type and the model type is decoupled. That is to say, we can use either DPM-Solver or + DPM-Solver++ for both noise prediction model and data prediction model. + + Args: + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + + Returns: + `jnp.ndarray`: the converted model output. + """ + # DPM-Solver++ needs to solve an integral of the data prediction model. + if self.config.algorithm_type == "dpmsolver++": + if self.config.prediction_type == "epsilon": + alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep] + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep] + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, " + " or `v_prediction` for the FlaxDPMSolverMultistepScheduler." + ) + + if self.config.thresholding: + # Dynamic thresholding in https://arxiv.org/abs/2205.11487 + dynamic_max_val = jnp.percentile( + jnp.abs(x0_pred), self.config.dynamic_thresholding_ratio, axis=tuple(range(1, x0_pred.ndim)) + ) + dynamic_max_val = jnp.maximum( + dynamic_max_val, self.config.sample_max_value * jnp.ones_like(dynamic_max_val) + ) + x0_pred = jnp.clip(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val + return x0_pred + # DPM-Solver needs to solve an integral of the noise prediction model. + elif self.config.algorithm_type == "dpmsolver": + if self.config.prediction_type == "epsilon": + return model_output + elif self.config.prediction_type == "sample": + alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep] + epsilon = (sample - alpha_t * model_output) / sigma_t + return epsilon + elif self.config.prediction_type == "v_prediction": + alpha_t, sigma_t = state.alpha_t[timestep], state.sigma_t[timestep] + epsilon = alpha_t * model_output + sigma_t * sample + return epsilon + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, " + " or `v_prediction` for the FlaxDPMSolverMultistepScheduler." 
+ ) + + def dpm_solver_first_order_update( + self, + state: DPMSolverMultistepSchedulerState, + model_output: jnp.ndarray, + timestep: int, + prev_timestep: int, + sample: jnp.ndarray, + ) -> jnp.ndarray: + """ + One step for the first-order DPM-Solver (equivalent to DDIM). + + See https://arxiv.org/abs/2206.00927 for the detailed derivation. + + Args: + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + prev_timestep (`int`): previous discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + + Returns: + `jnp.ndarray`: the sample tensor at the previous timestep. + """ + t, s0 = prev_timestep, timestep + m0 = model_output + lambda_t, lambda_s = state.lambda_t[t], state.lambda_t[s0] + alpha_t, alpha_s = state.alpha_t[t], state.alpha_t[s0] + sigma_t, sigma_s = state.sigma_t[t], state.sigma_t[s0] + h = lambda_t - lambda_s + if self.config.algorithm_type == "dpmsolver++": + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (jnp.exp(-h) - 1.0)) * m0 + elif self.config.algorithm_type == "dpmsolver": + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (jnp.exp(h) - 1.0)) * m0 + return x_t + + def multistep_dpm_solver_second_order_update( + self, + state: DPMSolverMultistepSchedulerState, + model_output_list: jnp.ndarray, + timestep_list: List[int], + prev_timestep: int, + sample: jnp.ndarray, + ) -> jnp.ndarray: + """ + One step for the second-order multistep DPM-Solver. + + Args: + model_output_list (`List[jnp.ndarray]`): + direct outputs from learned diffusion model at current and latter timesteps. + timestep (`int`): current and latter discrete timestep in the diffusion chain. + prev_timestep (`int`): previous discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + + Returns: + `jnp.ndarray`: the sample tensor at the previous timestep. 
+ """ + t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2] + m0, m1 = model_output_list[-1], model_output_list[-2] + lambda_t, lambda_s0, lambda_s1 = state.lambda_t[t], state.lambda_t[s0], state.lambda_t[s1] + alpha_t, alpha_s0 = state.alpha_t[t], state.alpha_t[s0] + sigma_t, sigma_s0 = state.sigma_t[t], state.sigma_t[s0] + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2211.01095 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (jnp.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (jnp.exp(-h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (jnp.exp(-h) - 1.0)) * D0 + + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (jnp.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (jnp.exp(h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (jnp.exp(h) - 1.0)) * D0 + - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1 + ) + return x_t + + def multistep_dpm_solver_third_order_update( + self, + state: DPMSolverMultistepSchedulerState, + model_output_list: jnp.ndarray, + timestep_list: List[int], + prev_timestep: int, + sample: jnp.ndarray, + ) -> jnp.ndarray: + """ + One step for the third-order multistep DPM-Solver. + + Args: + model_output_list (`List[jnp.ndarray]`): + direct outputs from learned diffusion model at current and latter timesteps. + timestep (`int`): current and latter discrete timestep in the diffusion chain. + prev_timestep (`int`): previous discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + + Returns: + `jnp.ndarray`: the sample tensor at the previous timestep. 
+ """ + t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3] + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] + lambda_t, lambda_s0, lambda_s1, lambda_s2 = ( + state.lambda_t[t], + state.lambda_t[s0], + state.lambda_t[s1], + state.lambda_t[s2], + ) + alpha_t, alpha_s0 = state.alpha_t[t], state.alpha_t[s0] + sigma_t, sigma_s0 = state.sigma_t[t], state.sigma_t[s0] + h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 + r0, r1 = h_0 / h, h_1 / h + D0 = m0 + D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) + D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) + D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (jnp.exp(-h) - 1.0)) * D0 + + (alpha_t * ((jnp.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((jnp.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (jnp.exp(h) - 1.0)) * D0 + - (sigma_t * ((jnp.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((jnp.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) + return x_t + + def step( + self, + state: DPMSolverMultistepSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + return_dict: bool = True, + ) -> Union[FlaxDPMSolverMultistepSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by DPM-Solver. Core function to propagate the diffusion process + from the learned model outputs (most often the predicted noise). + + Args: + state (`DPMSolverMultistepSchedulerState`): + the `FlaxDPMSolverMultistepScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than FlaxDPMSolverMultistepSchedulerOutput class + + Returns: + [`FlaxDPMSolverMultistepSchedulerOutput`] or `tuple`: [`FlaxDPMSolverMultistepSchedulerOutput`] if + `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. 
+ + """ + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + (step_index,) = jnp.where(state.timesteps == timestep, size=1) + step_index = step_index[0] + + prev_timestep = jax.lax.select(step_index == len(state.timesteps) - 1, 0, state.timesteps[step_index + 1]) + + model_output = self.convert_model_output(state, model_output, timestep, sample) + + model_outputs_new = jnp.roll(state.model_outputs, -1, axis=0) + model_outputs_new = model_outputs_new.at[-1].set(model_output) + state = state.replace( + model_outputs=model_outputs_new, + prev_timestep=prev_timestep, + cur_sample=sample, + ) + + def step_1(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: + return self.dpm_solver_first_order_update( + state, + state.model_outputs[-1], + state.timesteps[step_index], + state.prev_timestep, + state.cur_sample, + ) + + def step_23(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: + def step_2(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: + timestep_list = jnp.array([state.timesteps[step_index - 1], state.timesteps[step_index]]) + return self.multistep_dpm_solver_second_order_update( + state, + state.model_outputs, + timestep_list, + state.prev_timestep, + state.cur_sample, + ) + + def step_3(state: DPMSolverMultistepSchedulerState) -> jnp.ndarray: + timestep_list = jnp.array( + [ + state.timesteps[step_index - 2], + state.timesteps[step_index - 1], + state.timesteps[step_index], + ] + ) + return self.multistep_dpm_solver_third_order_update( + state, + state.model_outputs, + timestep_list, + state.prev_timestep, + state.cur_sample, + ) + + step_2_output = step_2(state) + step_3_output = step_3(state) + + if self.config.solver_order == 2: + return step_2_output + elif self.config.lower_order_final and len(state.timesteps) < 15: + return jax.lax.select( + state.lower_order_nums < 2, + step_2_output, + jax.lax.select( + step_index == len(state.timesteps) - 2, + step_2_output, + step_3_output, + ), + ) + else: + return jax.lax.select( + state.lower_order_nums < 2, + step_2_output, + step_3_output, + ) + + step_1_output = step_1(state) + step_23_output = step_23(state) + + if self.config.solver_order == 1: + prev_sample = step_1_output + + elif self.config.lower_order_final and len(state.timesteps) < 15: + prev_sample = jax.lax.select( + state.lower_order_nums < 1, + step_1_output, + jax.lax.select( + step_index == len(state.timesteps) - 1, + step_1_output, + step_23_output, + ), + ) + + else: + prev_sample = jax.lax.select( + state.lower_order_nums < 1, + step_1_output, + step_23_output, + ) + + state = state.replace( + lower_order_nums=jnp.minimum(state.lower_order_nums + 1, self.config.solver_order), + ) + + if not return_dict: + return (prev_sample, state) + + return FlaxDPMSolverMultistepSchedulerOutput(prev_sample=prev_sample, state=state) + + def scale_model_input( + self, state: DPMSolverMultistepSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None + ) -> jnp.ndarray: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + state (`DPMSolverMultistepSchedulerState`): + the `FlaxDPMSolverMultistepScheduler` state data class instance. 
+ sample (`jnp.ndarray`): input sample + timestep (`int`, optional): current timestep + + Returns: + `jnp.ndarray`: scaled input sample + """ + return sample + + def add_noise( + self, + state: DPMSolverMultistepSchedulerState, + original_samples: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + return add_noise_common(state.common, original_samples, noise, timesteps) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py new file mode 100644 index 000000000..318b9ed54 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -0,0 +1,921 @@ +# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import deprecate +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin): + """ + `DPMSolverMultistepInverseScheduler` is the reverse scheduler of [`DPMSolverMultistepScheduler`]. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + solver_order (`int`, defaults to 2): + The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++"`. + algorithm_type (`str`, defaults to `dpmsolver++`): + Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The + `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) + paper, and the `dpmsolver++` type implements the algorithms in the + [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or + `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + solver_type (`str`, defaults to `midpoint`): + Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the + sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. 
+ lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + euler_at_final (`bool`, defaults to `False`): + Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail + richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference + steps, but sometimes may result in blurring. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + lambda_min_clipped (`float`, defaults to `-inf`): + Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the + cosine (`squaredcos_cap_v2`) noise schedule. + variance_type (`str`, *optional*): + Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output + contains the predicted Gaussian variance. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + euler_at_final: bool = False, + use_karras_sigmas: Optional[bool] = False, + lambda_min_clipped: float = -float("inf"), + variance_type: Optional[str] = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if algorithm_type in ["dpmsolver", "sde-dpmsolver"]: + deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead" + deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0", deprecation_message) + + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + # Currently we only support VP-type noise schedule + self.alpha_t = torch.sqrt(self.alphas_cumprod) + self.sigma_t = torch.sqrt(1 - self.alphas_cumprod) + self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t) + self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5 + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # settings for DPM-Solver + if algorithm_type not in ["dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"]: + if algorithm_type == "deis": + self.register_to_config(algorithm_type="dpmsolver++") + else: + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + + if solver_type not in ["midpoint", "heun"]: + if solver_type in ["logrho", "bh1", "bh2"]: + self.register_to_config(solver_type="midpoint") + else: + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32).copy() + self.timesteps = torch.from_numpy(timesteps) + self.model_outputs = [None] * solver_order + self.lower_order_nums = 0 + self._step_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.use_karras_sigmas = use_karras_sigmas + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + # Clipping the minimum of all lambda(t) for numerical stability. + # This is critical for cosine (squaredcos_cap_v2) noise schedule. + clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped).item() + self.noisiest_timestep = self.config.num_train_timesteps - 1 - clipped_idx + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.noisiest_timestep, num_inference_steps + 1).round()[:-1].copy().astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = (self.noisiest_timestep + 1) // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(self.noisiest_timestep + 1, 0, -step_ratio).round()[::-1].copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', " + "'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + + if self.config.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + timesteps = timesteps.copy().astype(np.int64) + sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32) + else: + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigma_max = ( + (1 - self.alphas_cumprod[self.noisiest_timestep]) / self.alphas_cumprod[self.noisiest_timestep] + ) ** 0.5 + sigmas = np.concatenate([sigmas, [sigma_max]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + + # when num_inference_steps == num_train_timesteps, we can end up with + # duplicates in timesteps. + _, unique_indices = np.unique(timesteps, return_index=True) + timesteps = timesteps[np.sort(unique_indices)] + + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = 1 / ((sigma**2 + 1) ** 0.5) + sigma_t = sigma * alpha_t + + return alpha_t, sigma_t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output + def convert_model_output( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is + designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an + integral of the data prediction model. 
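+
+        For intuition, a sketch of the `epsilon` branch below under `algorithm_type="dpmsolver++"` (a
+        restatement of the code path, not an additional one):
+
+            sigma = self.sigmas[self.step_index]
+            alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
+            x0_pred = (sample - sigma_t * model_output) / alpha_t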
+ + + + The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise + prediction and data prediction models. + + + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + # DPM-Solver++ needs to solve an integral of the data prediction model. + if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]: + if self.config.prediction_type == "epsilon": + # DPM-Solver and DPM-Solver++ only need the "mean" output. + if self.config.variance_type in ["learned", "learned_range"]: + model_output = model_output[:, :3] + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DPMSolverMultistepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + + # DPM-Solver needs to solve an integral of the noise prediction model. + elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]: + if self.config.prediction_type == "epsilon": + # DPM-Solver and DPM-Solver++ only need the "mean" output. + if self.config.variance_type in ["learned", "learned_range"]: + epsilon = model_output[:, :3] + else: + epsilon = model_output + elif self.config.prediction_type == "sample": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + epsilon = (sample - alpha_t * model_output) / sigma_t + elif self.config.prediction_type == "v_prediction": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + epsilon = alpha_t * model_output + sigma_t * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DPMSolverMultistepScheduler." 
+ ) + + if self.config.thresholding: + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = (sample - sigma_t * epsilon) / alpha_t + x0_pred = self._threshold_sample(x0_pred) + epsilon = (sample - alpha_t * x0_pred) / sigma_t + + return epsilon + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update + def dpm_solver_first_order_update( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + noise: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the first-order DPMSolver (equivalent to DDIM). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + if self.config.algorithm_type == "dpmsolver++": + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output + elif self.config.algorithm_type == "dpmsolver": + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + x_t = ( + (sigma_t / sigma_s * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.algorithm_type == "sde-dpmsolver": + assert noise is not None + x_t = ( + (alpha_t / alpha_s) * sample + - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * model_output + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise + ) + return x_t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update + def multistep_dpm_solver_second_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + noise: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the second-order multistep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. 
+ + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2211.01095 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + ) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.algorithm_type == "sde-dpmsolver": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s0) * sample + - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * (torch.exp(h) - 1.0)) * D1 + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s0) * sample + - 2.0 * (sigma_t * (torch.exp(h) - 1.0)) * D0 + - 2.0 * 
(sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + + sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise + ) + return x_t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update + def multistep_dpm_solver_third_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the third-order multistep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing`sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + self.sigmas[self.step_index - 2], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2) + + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] + + h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 + r0, r1 = h_0 / h, h_1 / h + D0 = m0 + D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) + D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) + D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (alpha_t / alpha_s0) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) + return x_t + + def _init_step_index(self, timestep): + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + + index_candidates = (self.timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is 
taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + self._step_index = step_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + variance_noise: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the multistep DPMSolver. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + variance_noise (`torch.FloatTensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`CycleDiffusion`]. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Improve numerical stability for small number of steps + lower_order_final = (self.step_index == len(self.timesteps) - 1) and ( + self.config.euler_at_final or (self.config.lower_order_final and len(self.timesteps) < 15) + ) + lower_order_second = ( + (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + + model_output = self.convert_model_output(model_output, sample=sample) + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.model_outputs[-1] = model_output + + if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"] and variance_noise is None: + noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype + ) + elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]: + noise = variance_noise + else: + noise = None + + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: + prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise) + else: + prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return 
(prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + step_indices = [] + for timestep in timesteps: + index_candidates = (schedule_timesteps == timestep).nonzero() + if len(index_candidates) == 0: + step_index = len(schedule_timesteps) - 1 + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + step_indices.append(step_index) + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py new file mode 100644 index 000000000..4721933ae --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -0,0 +1,557 @@ +# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
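+
+# Note (illustrative annotation): the noise samplers below draw correlated noise from a torchsde.BrownianTree,
+# so the noise injected between two sigma levels is a Brownian increment rescaled to unit variance. A minimal
+# sketch of that idea, using only calls that also appear in this module (the tensor values are hypothetical):
+#
+#   t0, t1 = torch.as_tensor(0.5), torch.as_tensor(1.0)
+#   tree = torchsde.BrownianTree(t0, torch.zeros(4), t1, entropy=0)
+#   unit_noise = tree(t0, t1) / (t1 - t0).abs().sqrt()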
+ +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torchsde + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +class BatchedBrownianTree: + """A wrapper around torchsde.BrownianTree that enables batches of entropy.""" + + def __init__(self, x, t0, t1, seed=None, **kwargs): + t0, t1, self.sign = self.sort(t0, t1) + w0 = kwargs.get("w0", torch.zeros_like(x)) + if seed is None: + seed = torch.randint(0, 2**63 - 1, []).item() + self.batched = True + try: + assert len(seed) == x.shape[0] + w0 = w0[0] + except TypeError: + seed = [seed] + self.batched = False + self.trees = [torchsde.BrownianTree(t0, w0, t1, entropy=s, **kwargs) for s in seed] + + @staticmethod + def sort(a, b): + return (a, b, 1) if a < b else (b, a, -1) + + def __call__(self, t0, t1): + t0, t1, sign = self.sort(t0, t1) + w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign) + return w if self.batched else w[0] + + +class BrownianTreeNoiseSampler: + """A noise sampler backed by a torchsde.BrownianTree. + + Args: + x (Tensor): The tensor whose shape, device and dtype to use to generate + random samples. + sigma_min (float): The low end of the valid interval. + sigma_max (float): The high end of the valid interval. + seed (int or List[int]): The random seed. If a list of seeds is + supplied instead of a single integer, then the noise sampler will use one BrownianTree per batch item, each + with its own seed. + transform (callable): A function that maps sigma to the sampler's + internal timestep. + """ + + def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x): + self.transform = transform + t0, t1 = self.transform(torch.as_tensor(sigma_min)), self.transform(torch.as_tensor(sigma_max)) + self.tree = BatchedBrownianTree(x, t0, t1, seed) + + def __call__(self, sigma, sigma_next): + t0, t1 = self.transform(torch.as_tensor(sigma)), self.transform(torch.as_tensor(sigma_next)) + return self.tree(t0, t1) / (t1 - t0).abs().sqrt() + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): + """ + DPMSolverSDEScheduler implements the stochastic sampler from the [Elucidating the Design Space of Diffusion-Based + Generative Models](https://huggingface.co/papers/2206.00364) paper. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.00085): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.012): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + noise_sampler_seed (`int`, *optional*, defaults to `None`): + The random seed to use for the noise sampler. If `None`, a random seed is generated. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
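+
+    A minimal usage sketch (illustrative only; `unet`, `latents` and the step count are hypothetical
+    placeholders, not part of this class):
+
+        scheduler = DPMSolverSDEScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
+        scheduler.set_timesteps(25)
+        latents = latents * scheduler.init_noise_sigma
+        for t in scheduler.timesteps:
+            model_input = scheduler.scale_model_input(latents, t)
+            noise_pred = unet(model_input, t)  # hypothetical denoising model call
+            latents = scheduler.step(noise_pred, t, latents).prev_sample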
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 2 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + use_karras_sigmas: Optional[bool] = False, + noise_sampler_seed: Optional[int] = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # set all values + self.set_timesteps(num_train_timesteps, None, num_train_timesteps) + self.use_karras_sigmas = use_karras_sigmas + self.noise_sampler = None + self.noise_sampler_seed = noise_sampler_seed + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. 
+ + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sigma_input = sigma if self.state_in_first_order else self.mid_point_sigma + sample = sample / ((sigma_input**2 + 1) ** 0.5) + return sample + + def set_timesteps( + self, + num_inference_steps: int, + device: Union[str, torch.device] = None, + num_train_timesteps: Optional[int] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + + second_order_timesteps = self._second_order_timesteps(sigmas, log_sigmas) + + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + sigmas = torch.from_numpy(sigmas).to(device=device) + self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) + + timesteps = torch.from_numpy(timesteps) + second_order_timesteps = torch.from_numpy(second_order_timesteps) + timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) + timesteps[1::2] = second_order_timesteps + + if str(device).startswith("mps"): + # mps does not support float64 + self.timesteps = timesteps.to(device, dtype=torch.float32) + else: + self.timesteps = timesteps.to(device=device) + + # empty first order variables + self.sample = None + self.mid_point_sigma = None + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + self.noise_sampler = None + + def _second_order_timesteps(self, sigmas, log_sigmas): + def sigma_fn(_t): + return np.exp(-_t) + + def t_fn(_sigma): + return -np.log(_sigma) + + midpoint_ratio = 0.5 + t = t_fn(sigmas) + delta_time = np.diff(t) + t_proposed = t[:-1] + delta_time * midpoint_ratio + sig_proposed = sigma_fn(t_proposed) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sig_proposed]) + return timesteps + + # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + sigma_min: float = in_sigmas[-1].item() + sigma_max: float = in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, self.num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + @property + def state_in_first_order(self): + return self.sample is None + + def step( + self, + model_output: Union[torch.FloatTensor, np.ndarray], + timestep: Union[float, torch.FloatTensor], + sample: Union[torch.FloatTensor, np.ndarray], + return_dict: bool = True, + s_noise: float = 1.0, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). 
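+
+        Note (descriptive annotation of the logic below): each second-order step runs in two phases; the first
+        phase advances to a midpoint sigma and caches the input in `self.sample`, and the second phase
+        completes the step from that cached sample. This is why `set_timesteps` repeats the interior sigmas and
+        inserts midpoint timesteps between the original ones.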
+ + Args: + model_output (`torch.FloatTensor` or `np.ndarray`): + The direct output from learned diffusion model. + timestep (`float` or `torch.FloatTensor`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor` or `np.ndarray`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + s_noise (`float`, *optional*, defaults to 1.0): + Scaling factor for noise added to the sample. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + if self.step_index is None: + self._init_step_index(timestep) + + # Create a noise sampler if it hasn't been created yet + if self.noise_sampler is None: + min_sigma, max_sigma = self.sigmas[self.sigmas > 0].min(), self.sigmas.max() + self.noise_sampler = BrownianTreeNoiseSampler(sample, min_sigma, max_sigma, self.noise_sampler_seed) + + # Define functions to compute sigma and t from each other + def sigma_fn(_t: torch.FloatTensor) -> torch.FloatTensor: + return _t.neg().exp() + + def t_fn(_sigma: torch.FloatTensor) -> torch.FloatTensor: + return _sigma.log().neg() + + if self.state_in_first_order: + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + else: + # 2nd order + sigma = self.sigmas[self.step_index - 1] + sigma_next = self.sigmas[self.step_index] + + # Set the midpoint and step size for the current step + midpoint_ratio = 0.5 + t, t_next = t_fn(sigma), t_fn(sigma_next) + delta_time = t_next - t + t_proposed = t + delta_time * midpoint_ratio + + # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + sigma_input = sigma if self.state_in_first_order else sigma_fn(t_proposed) + pred_original_sample = sample - sigma_input * model_output + elif self.config.prediction_type == "v_prediction": + sigma_input = sigma if self.state_in_first_order else sigma_fn(t_proposed) + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) + elif self.config.prediction_type == "sample": + raise NotImplementedError("prediction_type not implemented yet: sample") + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + if sigma_next == 0: + derivative = (sample - pred_original_sample) / sigma + dt = sigma_next - sigma + prev_sample = sample + derivative * dt + else: + if self.state_in_first_order: + t_next = t_proposed + else: + sample = self.sample + + sigma_from = sigma_fn(t) + sigma_to = sigma_fn(t_next) + sigma_up = min(sigma_to, (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5) + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 + ancestral_t = t_fn(sigma_down) + prev_sample = (sigma_fn(ancestral_t) / sigma_fn(t)) * sample - ( + t - ancestral_t + ).expm1() * pred_original_sample + prev_sample = prev_sample + self.noise_sampler(sigma_fn(t), sigma_fn(t_next)) * s_noise * sigma_up + + if self.state_in_first_order: + # store for 2nd order step + self.sample = sample + self.mid_point_sigma = sigma_fn(t_next) + else: + # free for "first order mode" + self.sample = None + self.mid_point_sigma = None + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py new file mode 100644 index 
000000000..7bb201de4 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -0,0 +1,979 @@ +# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import deprecate, logging +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin): + """ + `DPMSolverSinglestepScheduler` is a fast dedicated high-order solver for diffusion ODEs. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. 
Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + solver_order (`int`, defaults to 2): + The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++"`. + algorithm_type (`str`, defaults to `dpmsolver++`): + Algorithm type for the solver; can be `dpmsolver` or `dpmsolver++`. The + `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927) + paper, and the `dpmsolver++` type implements the algorithms in the + [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or + `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + solver_type (`str`, defaults to `midpoint`): + Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the + sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. + lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + final_sigmas_type (`str`, *optional*, defaults to `"zero"`): + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma + is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + lambda_min_clipped (`float`, defaults to `-inf`): + Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the + cosine (`squaredcos_cap_v2`) noise schedule. + variance_type (`str`, *optional*): + Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output + contains the predicted Gaussian variance. 
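+
+        A minimal usage sketch (editor's addition, not part of the upstream file; it assumes the standard
+        `diffusers` pipeline API and the public `stabilityai/stable-diffusion-2-1` checkpoint):
+
+        from diffusers import DiffusionPipeline, DPMSolverSinglestepScheduler
+
+        pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
+        # Swap in the singlestep DPM-Solver while keeping the pipeline's scheduler config.
+        pipe.scheduler = DPMSolverSinglestepScheduler.from_config(pipe.scheduler.config)
+        image = pipe("a pokemon with blue eyes", num_inference_steps=25).images[0]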
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[np.ndarray] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = False, + use_karras_sigmas: Optional[bool] = False, + final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min" + lambda_min_clipped: float = -float("inf"), + variance_type: Optional[str] = None, + ): + if algorithm_type == "dpmsolver": + deprecation_message = "algorithm_type `dpmsolver` is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead" + deprecate("algorithm_types=dpmsolver", "1.0.0", deprecation_message) + + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + # Currently we only support VP-type noise schedule + self.alpha_t = torch.sqrt(self.alphas_cumprod) + self.sigma_t = torch.sqrt(1 - self.alphas_cumprod) + self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t) + self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5 + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # settings for DPM-Solver + if algorithm_type not in ["dpmsolver", "dpmsolver++"]: + if algorithm_type == "deis": + self.register_to_config(algorithm_type="dpmsolver++") + else: + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + if solver_type not in ["midpoint", "heun"]: + if solver_type in ["logrho", "bh1", "bh2"]: + self.register_to_config(solver_type="midpoint") + else: + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + + if algorithm_type != "dpmsolver++" and final_sigmas_type == "zero": + raise ValueError( + f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please chooose `sigma_min` instead." + ) + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps) + self.model_outputs = [None] * solver_order + self.sample = None + self.order_list = self.get_order_list(num_train_timesteps) + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def get_order_list(self, num_inference_steps: int) -> List[int]: + """ + Computes the solver order at each time step. 
+ + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + steps = num_inference_steps + order = self.config.solver_order + if order > 3: + raise ValueError("Order > 3 is not supported by this scheduler") + if self.config.lower_order_final: + if order == 3: + if steps % 3 == 0: + orders = [1, 2, 3] * (steps // 3 - 1) + [1, 2] + [1] + elif steps % 3 == 1: + orders = [1, 2, 3] * (steps // 3) + [1] + else: + orders = [1, 2, 3] * (steps // 3) + [1, 2] + elif order == 2: + if steps % 2 == 0: + orders = [1, 2] * (steps // 2 - 1) + [1, 1] + else: + orders = [1, 2] * (steps // 2) + [1] + elif order == 1: + orders = [1] * steps + else: + if order == 3: + orders = [1, 2, 3] * (steps // 3) + elif order == 2: + orders = [1, 2] * (steps // 2) + elif order == 1: + orders = [1] * steps + return orders + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + # Clipping the minimum of all lambda(t) for numerical stability. + # This is critical for cosine (squaredcos_cap_v2) noise schedule. 
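+        # Editor's note (explanatory sketch, not in the upstream file): `self.lambda_t` decreases with t, so
+        # `torch.flip` makes it ascending and `searchsorted` counts how many of the largest-t timesteps have
+        # lambda below `lambda_min_clipped`; exactly those timesteps are excluded from the linspace below.
+        # With the default `lambda_min_clipped=-inf`, `clipped_idx` is 0 and the full training range is kept.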
+ clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: + log_sigmas = np.log(sigmas) + sigmas = np.flip(sigmas).copy() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + else: + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f" `final_sigmas_type` must be one of `sigma_min` or `zero`, but got {self.config.final_sigmas_type}" + ) + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas).to(device=device) + + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + self.model_outputs = [None] * self.config.solver_order + self.sample = None + + if not self.config.lower_order_final and num_inference_steps % self.config.solver_order != 0: + logger.warning( + "Changing scheduler {self.config} to have `lower_order_final` set to True to handle uneven amount of inference steps. Please make sure to always use an even number of `num_inference steps when using `lower_order_final=False`." + ) + self.register_to_config(lower_order_final=True) + + if not self.config.lower_order_final and self.config.final_sigmas_type == "zero": + logger.warning( + " `last_sigmas_type='zero'` is not supported for `lower_order_final=False`. Changing scheduler {self.config} to have `lower_order_final` set to True." + ) + self.register_to_config(lower_order_final=True) + + self.order_list = self.get_order_list(num_inference_steps) + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = 1 / ((sigma**2 + 1) ** 0.5) + sigma_t = sigma * alpha_t + + return alpha_t, sigma_t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def convert_model_output( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is + designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an + integral of the data prediction model. + + + + The algorithm and model type are decoupled. 
You can use either DPMSolver or DPMSolver++ for both noise + prediction and data prediction models. + + + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + # DPM-Solver++ needs to solve an integral of the data prediction model. + if self.config.algorithm_type == "dpmsolver++": + if self.config.prediction_type == "epsilon": + # DPM-Solver and DPM-Solver++ only need the "mean" output. + if self.config.variance_type in ["learned_range"]: + model_output = model_output[:, :3] + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DPMSolverSinglestepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + # DPM-Solver needs to solve an integral of the noise prediction model. + elif self.config.algorithm_type == "dpmsolver": + if self.config.prediction_type == "epsilon": + # DPM-Solver and DPM-Solver++ only need the "mean" output. + if self.config.variance_type in ["learned_range"]: + model_output = model_output[:, :3] + return model_output + elif self.config.prediction_type == "sample": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + epsilon = (sample - alpha_t * model_output) / sigma_t + return epsilon + elif self.config.prediction_type == "v_prediction": + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + epsilon = alpha_t * model_output + sigma_t * sample + return epsilon + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the DPMSolverSinglestepScheduler." + ) + + def dpm_solver_first_order_update( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the first-order DPMSolver (equivalent to DDIM). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
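+
+            Editor's note (sketch, not in the upstream file): with `h = lambda_t - lambda_s`, the `dpmsolver++`
+            branch below evaluates the data-prediction update
+            `x_t = (sigma_t / sigma_s) * x_s - alpha_t * (exp(-h) - 1) * x0_pred`, while the `dpmsolver` branch
+            uses the noise-prediction form `x_t = (alpha_t / alpha_s) * x_s - sigma_t * (exp(h) - 1) * eps_pred`.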
+ """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + h = lambda_t - lambda_s + if self.config.algorithm_type == "dpmsolver++": + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output + elif self.config.algorithm_type == "dpmsolver": + x_t = (alpha_t / alpha_s) * sample - (sigma_t * (torch.exp(h) - 1.0)) * model_output + return x_t + + def singlestep_dpm_solver_second_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the second-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the + time `timestep_list[-2]`. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + timestep (`int`): + The current and latter discrete timestep in the diffusion chain. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
+ """ + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + sigma_t, sigma_s0, sigma_s1 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s1, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m1, (1.0 / r0) * (m0 - m1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2211.01095 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s1) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s1) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s1) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - 0.5 * (sigma_t * (torch.exp(h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s1) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + ) + return x_t + + def singlestep_dpm_solver_third_order_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the third-order singlestep DPMSolver that computes the solution at time `prev_timestep` from the + time `timestep_list[-3]`. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + timestep (`int`): + The current and latter discrete timestep in the diffusion chain. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
+ """ + + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing`sample` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + self.sigmas[self.step_index - 2], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2) + + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] + + h, h_0, h_1 = lambda_t - lambda_s2, lambda_s0 - lambda_s2, lambda_s1 - lambda_s2 + r0, r1 = h_0 / h, h_1 / h + D0 = m2 + D1_0, D1_1 = (1.0 / r1) * (m1 - m2), (1.0 / r0) * (m0 - m2) + D1 = (r0 * D1_0 - r1 * D1_1) / (r0 - r1) + D2 = 2.0 * (D1_1 - D1_0) / (r0 - r1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s2) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1_1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s2) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) + elif self.config.algorithm_type == "dpmsolver": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (alpha_t / alpha_s2) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1_1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (alpha_t / alpha_s2) * sample + - (sigma_t * (torch.exp(h) - 1.0)) * D0 + - (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 + - (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2 + ) + return x_t + + def singlestep_dpm_solver_update( + self, + model_output_list: List[torch.FloatTensor], + *args, + sample: torch.FloatTensor = None, + order: int = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the singlestep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + timestep (`int`): + The current and latter discrete timestep in the diffusion chain. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by diffusion process. 
+ order (`int`): + The solver order at this step. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + timestep_list = args[0] if len(args) > 0 else kwargs.pop("timestep_list", None) + prev_timestep = args[1] if len(args) > 1 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 2: + sample = args[2] + else: + raise ValueError(" missing`sample` as a required keyward argument") + if order is None: + if len(args) > 3: + order = args[3] + else: + raise ValueError(" missing `order` as a required keyward argument") + if timestep_list is not None: + deprecate( + "timestep_list", + "1.0.0", + "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + if order == 1: + return self.dpm_solver_first_order_update(model_output_list[-1], sample=sample) + elif order == 2: + return self.singlestep_dpm_solver_second_order_update(model_output_list, sample=sample) + elif order == 3: + return self.singlestep_dpm_solver_third_order_update(model_output_list, sample=sample) + else: + raise ValueError(f"Order must be 1, 2, 3, got {order}") + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + index_candidates = (schedule_timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the singlestep DPMSolver. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. 
+ + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + model_output = self.convert_model_output(model_output, sample=sample) + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.model_outputs[-1] = model_output + + order = self.order_list[self.step_index] + + # For img2img denoising might start with order>1 which is not possible + # In this case make sure that the first two steps are both order=1 + while self.model_outputs[-order] is None: + order -= 1 + + # For single-step solvers, we use the initial value at each time with order = 1. + if order == 1: + self.sample = sample + + prev_sample = self.singlestep_dpm_solver_update(self.model_outputs, sample=self.sample, order=order) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py new file mode 100644 index 000000000..5fea89bb8 --- /dev/null +++ 
b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py @@ -0,0 +1,683 @@ +# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm + +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin, SchedulerOutput + + +class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): + """ + Implements DPMSolverMultistepScheduler in EDM formulation as presented in Karras et al. 2022 [1]. + `EDMDPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs. + + [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models." + https://arxiv.org/abs/2206.00364 + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + sigma_min (`float`, *optional*, defaults to 0.002): + Minimum noise magnitude in the sigma schedule. This was set to 0.002 in the EDM paper [1]; a reasonable + range is [0, 10]. + sigma_max (`float`, *optional*, defaults to 80.0): + Maximum noise magnitude in the sigma schedule. This was set to 80.0 in the EDM paper [1]; a reasonable + range is [0.2, 80.0]. + sigma_data (`float`, *optional*, defaults to 0.5): + The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1]. + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + solver_order (`int`, defaults to 2): + The DPMSolver order which can be `1` or `2` or `3`. It is recommended to use `solver_order=2` for guided + sampling, and `solver_order=3` for unconditional sampling. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++"`. + algorithm_type (`str`, defaults to `dpmsolver++`): + Algorithm type for the solver; can be `dpmsolver++` or `sde-dpmsolver++`. 
The + `dpmsolver++` type implements the algorithms in the + [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or + `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion. + solver_type (`str`, defaults to `midpoint`): + Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the + sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers. + lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + euler_at_final (`bool`, defaults to `False`): + Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail + richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference + steps, but sometimes may result in blurring. + final_sigmas_type (`str`, defaults to `"zero"`): + The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final sigma + is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0. + """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + sigma_min: float = 0.002, + sigma_max: float = 80.0, + sigma_data: float = 0.5, + num_train_timesteps: int = 1000, + prediction_type: str = "epsilon", + rho: float = 7.0, + solver_order: int = 2, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "dpmsolver++", + solver_type: str = "midpoint", + lower_order_final: bool = True, + euler_at_final: bool = False, + final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min" + ): + # settings for DPM-Solver + if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"]: + if algorithm_type == "deis": + self.register_to_config(algorithm_type="dpmsolver++") + else: + raise NotImplementedError(f"{algorithm_type} is not implemented for {self.__class__}") + + if solver_type not in ["midpoint", "heun"]: + if solver_type in ["logrho", "bh1", "bh2"]: + self.register_to_config(solver_type="midpoint") + else: + raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") + + if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"] and final_sigmas_type == "zero": + raise ValueError( + f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead." + ) + + ramp = torch.linspace(0, 1, num_train_timesteps) + sigmas = self._compute_sigmas(ramp) + self.timesteps = self.precondition_noise(sigmas) + + self.sigmas = self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + # setable values + self.num_inference_steps = None + self.model_outputs = [None] * solver_order + self.lower_order_nums = 0 + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + return (self.config.sigma_max**2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. 
+ """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_inputs + def precondition_inputs(self, sample, sigma): + c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5) + scaled_sample = sample * c_in + return scaled_sample + + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_noise + def precondition_noise(self, sigma): + if not isinstance(sigma, torch.Tensor): + sigma = torch.tensor([sigma]) + + c_noise = 0.25 * torch.log(sigma) + + return c_noise + + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_outputs + def precondition_outputs(self, sample, model_output, sigma): + sigma_data = self.config.sigma_data + c_skip = sigma_data**2 / (sigma**2 + sigma_data**2) + + if self.config.prediction_type == "epsilon": + c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + elif self.config.prediction_type == "v_prediction": + c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + else: + raise ValueError(f"Prediction type {self.config.prediction_type} is not supported.") + + denoised = c_skip * sample + c_out * model_output + + return denoised + + # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = self.precondition_inputs(sample, sigma) + + self.is_scale_input_called = True + return sample + + def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ """ + + self.num_inference_steps = num_inference_steps + + ramp = np.linspace(0, 1, self.num_inference_steps) + sigmas = self._compute_sigmas(ramp) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + self.timesteps = self.precondition_noise(sigmas) + + if self.config.final_sigmas_type == "sigma_min": + sigma_last = self.config.sigma_min + elif self.config.final_sigmas_type == "zero": + sigma_last = 0 + else: + raise ValueError( + f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}" + ) + + self.sigmas = torch.cat([sigmas, torch.tensor([sigma_last], dtype=torch.float32, device=device)]) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 + def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + sigma_min = sigma_min or self.config.sigma_min + sigma_max = sigma_max or self.config.sigma_max + + rho = self.config.rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = torch.tensor(1) # Inputs are pre-scaled before going into unet, so alpha_t = 1 + sigma_t = sigma + + return alpha_t, sigma_t + + def convert_model_output( + self, + model_output: torch.FloatTensor, + sample: torch.FloatTensor = None, + ) -> torch.FloatTensor: + """ + Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is + designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an + integral of the data prediction model. + + + + The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise + prediction and data prediction models. + + + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The converted model output. + """ + sigma = self.sigmas[self.step_index] + x0_pred = self.precondition_outputs(sample, model_output, sigma) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + + def dpm_solver_first_order_update( + self, + model_output: torch.FloatTensor, + sample: torch.FloatTensor = None, + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + One step for the first-order DPMSolver (equivalent to DDIM). + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
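+
+            Editor's note (sketch, not in the upstream file): because inputs are preconditioned,
+            `_sigma_to_alpha_sigma_t` returns `alpha_t = 1`, so `lambda = -log(sigma)` and the `dpmsolver++` branch
+            below reduces to `x_t = (sigma_t / sigma_s) * x_s - (exp(-h) - 1) * x0_pred`; the `sde-dpmsolver++`
+            branch additionally injects noise scaled by `sigma_t * sqrt(1 - exp(-2h))`.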
+ """ + sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s = torch.log(alpha_s) - torch.log(sigma_s) + + h = lambda_t - lambda_s + if self.config.algorithm_type == "dpmsolver++": + x_t = (sigma_t / sigma_s) * sample - (alpha_t * (torch.exp(-h) - 1.0)) * model_output + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + x_t = ( + (sigma_t / sigma_s * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + + return x_t + + def multistep_dpm_solver_second_order_update( + self, + model_output_list: List[torch.FloatTensor], + sample: torch.FloatTensor = None, + noise: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + One step for the second-order multistep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + sigma_t, sigma_s0, sigma_s1 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + + m0, m1 = model_output_list[-1], model_output_list[-2] + + h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1 + r0 = h_0 / h + D0, D1 = m0, (1.0 / r0) * (m0 - m1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2211.01095 for detailed derivations + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + - 0.5 * (alpha_t * (torch.exp(-h) - 1.0)) * D1 + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + ) + elif self.config.algorithm_type == "sde-dpmsolver++": + assert noise is not None + if self.config.solver_type == "midpoint": + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + elif self.config.solver_type == "heun": + x_t = ( + (sigma_t / sigma_s0 * torch.exp(-h)) * sample + + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1 + + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise + ) + + return x_t + + def multistep_dpm_solver_third_order_update( + self, + model_output_list: List[torch.FloatTensor], + sample: torch.FloatTensor = None, + ) -> torch.FloatTensor: + """ + One step for the third-order multistep DPMSolver. + + Args: + model_output_list (`List[torch.FloatTensor]`): + The direct outputs from learned diffusion model at current and latter timesteps. 
+ sample (`torch.FloatTensor`): + A current instance of a sample created by diffusion process. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. + """ + sigma_t, sigma_s0, sigma_s1, sigma_s2 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + self.sigmas[self.step_index - 2], + ) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1) + alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1) + lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2) + + m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3] + + h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2 + r0, r1 = h_0 / h, h_1 / h + D0 = m0 + D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2) + D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1) + D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1) + if self.config.algorithm_type == "dpmsolver++": + # See https://arxiv.org/abs/2206.00927 for detailed derivations + x_t = ( + (sigma_t / sigma_s0) * sample + - (alpha_t * (torch.exp(-h) - 1.0)) * D0 + + (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 + - (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2 + ) + + return x_t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + index_candidates = (schedule_timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the multistep DPMSolver. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. 
+ return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Improve numerical stability for small number of steps + lower_order_final = (self.step_index == len(self.timesteps) - 1) and ( + self.config.euler_at_final + or (self.config.lower_order_final and len(self.timesteps) < 15) + or self.config.final_sigmas_type == "zero" + ) + lower_order_second = ( + (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15 + ) + + model_output = self.convert_model_output(model_output, sample=sample) + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.model_outputs[-1] = model_output + + if self.config.algorithm_type == "sde-dpmsolver++": + noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype + ) + else: + noise = None + + if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final: + prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise) + elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second: + prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise) + else: + prev_sample = self.multistep_dpm_solver_third_order_update(self.model_outputs, sample=sample) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return 
self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py new file mode 100644 index 000000000..e62a486cc --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py @@ -0,0 +1,381 @@ +# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete +class EDMEulerSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +class EDMEulerScheduler(SchedulerMixin, ConfigMixin): + """ + Implements the Euler scheduler in EDM formulation as presented in Karras et al. 2022 [1]. + + [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models." + https://arxiv.org/abs/2206.00364 + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + sigma_min (`float`, *optional*, defaults to 0.002): + Minimum noise magnitude in the sigma schedule. This was set to 0.002 in the EDM paper [1]; a reasonable + range is [0, 10]. + sigma_max (`float`, *optional*, defaults to 80.0): + Maximum noise magnitude in the sigma schedule. This was set to 80.0 in the EDM paper [1]; a reasonable + range is [0.2, 80.0]. + sigma_data (`float`, *optional*, defaults to 0.5): + The standard deviation of the data distribution. This is set to 0.5 in the EDM paper [1]. + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. 
+ prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + rho (`float`, *optional*, defaults to 7.0): + The rho parameter used for calculating the Karras sigma schedule, which is set to 7.0 in the EDM paper [1]. + """ + + _compatibles = [] + order = 1 + + @register_to_config + def __init__( + self, + sigma_min: float = 0.002, + sigma_max: float = 80.0, + sigma_data: float = 0.5, + num_train_timesteps: int = 1000, + prediction_type: str = "epsilon", + rho: float = 7.0, + ): + # setable values + self.num_inference_steps = None + + ramp = torch.linspace(0, 1, num_train_timesteps) + sigmas = self._compute_sigmas(ramp) + self.timesteps = self.precondition_noise(sigmas) + + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self.is_scale_input_called = False + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + return (self.config.sigma_max**2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def precondition_inputs(self, sample, sigma): + c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5) + scaled_sample = sample * c_in + return scaled_sample + + def precondition_noise(self, sigma): + if not isinstance(sigma, torch.Tensor): + sigma = torch.tensor([sigma]) + + c_noise = 0.25 * torch.log(sigma) + + return c_noise + + def precondition_outputs(self, sample, model_output, sigma): + sigma_data = self.config.sigma_data + c_skip = sigma_data**2 / (sigma**2 + sigma_data**2) + + if self.config.prediction_type == "epsilon": + c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + elif self.config.prediction_type == "v_prediction": + c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + else: + raise ValueError(f"Prediction type {self.config.prediction_type} is not supported.") + + denoised = c_skip * sample + c_out * model_output + + return denoised + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
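+
+        Note: for this EDM formulation the scaling actually applied is the `c_in` preconditioning
+        `1 / ((sigma**2 + sigma_data**2) ** 0.5)` computed in `precondition_inputs`.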
+ """ + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = self.precondition_inputs(sample, sigma) + + self.is_scale_input_called = True + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + ramp = np.linspace(0, 1, self.num_inference_steps) + sigmas = self._compute_sigmas(ramp) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + self.timesteps = self.precondition_noise(sigmas) + + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Taken from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 + def _compute_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + sigma_min = sigma_min or self.config.sigma_min + sigma_max = sigma_max or self.config.sigma_max + + rho = self.config.rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[EDMEulerSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. 
+ sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EDMEulerSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EDMEulerScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + + gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 + + noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator + ) + + eps = noise * s_noise + sigma_hat = sigma * (gamma + 1) + + if gamma > 0: + sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + pred_original_sample = self.precondition_outputs(sample, model_output, sigma_hat) + + # 2. 
Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma_hat + + dt = self.sigmas[self.step_index + 1] - sigma_hat + + prev_sample = sample + derivative * dt + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return EDMEulerSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py new file mode 100644 index 000000000..dfab59272 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -0,0 +1,481 @@ +# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
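+
+# NOTE (editorial, illustrative only): the scheduler defined in this module follows the usual
+# diffusers protocol of `set_timesteps` -> `scale_model_input` -> `step`. A minimal denoising
+# loop looks roughly like the sketch below; `unet`, `latents`, and `prompt_embeds` are assumed
+# to come from the surrounding pipeline.
+#
+#     scheduler = EulerAncestralDiscreteScheduler(num_train_timesteps=1000)
+#     scheduler.set_timesteps(50, device=latents.device)
+#     latents = latents * scheduler.init_noise_sigma
+#     for t in scheduler.timesteps:
+#         model_input = scheduler.scale_model_input(latents, t)
+#         noise_pred = unet(model_input, t, encoder_hidden_states=prompt_embeds).sample
+#         latents = scheduler.step(noise_pred, t, latents).prev_sample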
+ +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerAncestralDiscrete +class EulerAncestralDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. 
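+    # After this subtraction alphas_bar_sqrt[-1] == 0, so the terminal signal-to-noise ratio
+    # alphas_bar[-1] / (1 - alphas_bar[-1]) is exactly zero, which is what "zero terminal SNR" means.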
+ alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Ancestral sampling with Euler method steps. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
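+            # "scaled_linear" is linear in sqrt(beta) space and then squared, the schedule used by
+            # the original latent-diffusion / Stable Diffusion checkpoints.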
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + if rescale_betas_zero_snr: + # Close to 0 without being 0 so first sigma is not inf + # FP16 smallest positive subnormal works well here + self.alphas_cumprod[-1] = 2**-24 + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) + self.sigmas = torch.from_numpy(sigmas) + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps) + self.is_scale_input_called = False + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + self.is_scale_input_called = True + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
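+
+        After this call, `self.timesteps` holds `num_inference_steps` decreasing float timesteps and
+        `self.sigmas` holds the matching noise levels followed by a trailing zero.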
+ """ + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[ + ::-1 + ].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + self.sigmas = torch.from_numpy(sigmas).to(device=device) + + self.timesteps = torch.from_numpy(timesteps).to(device=device) + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[EulerAncestralDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. 
+ generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a + [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, + [`~schedulers.scheduling_euler_ancestral_discrete.EulerAncestralDiscreteSchedulerOutput`] is returned, + otherwise a tuple is returned where the first element is the sample tensor. + + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma * model_output + elif self.config.prediction_type == "v_prediction": + # * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + elif self.config.prediction_type == "sample": + raise NotImplementedError("prediction_type not implemented yet: sample") + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + sigma_from = self.sigmas[self.step_index] + sigma_to = self.sigmas[self.step_index + 1] + sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5 + sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5 + + # 2. 
Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + + dt = sigma_down - sigma + + prev_sample = sample + derivative * dt + + device = model_output.device + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator) + + prev_sample = prev_sample + noise * sigma_up + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return EulerAncestralDiscreteSchedulerOutput( + prev_sample=prev_sample, pred_original_sample=pred_original_sample + ) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py new file mode 100644 index 000000000..22258abc8 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -0,0 +1,576 @@ +# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
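+
+# NOTE (editorial, illustrative only): EulerDiscreteScheduler below exposes the same
+# `set_timesteps` / `scale_model_input` / `step` protocol as the ancestral variant in this
+# package; the main extra knobs are the sigma interpolation mode and the optional Karras
+# spacing, e.g.:
+#
+#     scheduler = EulerDiscreteScheduler(beta_schedule="scaled_linear", use_karras_sigmas=True)
+#     scheduler.set_timesteps(30)
+#     # scheduler.sigmas now runs from the largest noise level down to a trailing 0.0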
+ +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->EulerDiscrete +class EulerDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. 
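+    # Together with the rescale below, this is an affine remap of alphas_bar_sqrt that keeps the
+    # first value unchanged and drives the last value to exactly zero.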
+ alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Euler scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + interpolation_type(`str`, defaults to `"linear"`, *optional*): + The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be on of + `"linear"` or `"log_linear"`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + interpolation_type: str = "linear", + use_karras_sigmas: Optional[bool] = False, + sigma_min: Optional[float] = None, + sigma_max: Optional[float] = None, + timestep_spacing: str = "linspace", + timestep_type: str = "discrete", # can be "discrete" or "continuous" + steps_offset: int = 0, + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + if rescale_betas_zero_snr: + # Close to 0 without being 0 so first sigma is not inf + # FP16 smallest positive subnormal works well here + self.alphas_cumprod[-1] = 2**-24 + + sigmas = (((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5).flip(0) + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() + timesteps = torch.from_numpy(timesteps).to(dtype=torch.float32) + + # setable values + self.num_inference_steps = None + + # TODO: Support the full EDM scalings for all prediction types and timestep types + if timestep_type == "continuous" and prediction_type == "v_prediction": + self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas]) + else: + self.timesteps = timesteps + + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + + self.is_scale_input_called = False + self.use_karras_sigmas = use_karras_sigmas + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max() + if self.config.timestep_spacing in ["linspace", "trailing"]: + return max_sigma + + return (max_sigma**2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. 
+ + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + + self.is_scale_input_called = True + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[ + ::-1 + ].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + + if self.config.interpolation_type == "linear": + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + elif self.config.interpolation_type == "log_linear": + sigmas = torch.linspace(np.log(sigmas[-1]), np.log(sigmas[0]), num_inference_steps + 1).exp().numpy() + else: + raise ValueError( + f"{self.config.interpolation_type} is not implemented. 
Please specify interpolation_type to either" + " 'linear' or 'log_linear'" + ) + + if self.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + + sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32, device=device) + + # TODO: Support the full EDM scalings for all prediction types and timestep types + if self.config.timestep_type == "continuous" and self.config.prediction_type == "v_prediction": + self.timesteps = torch.Tensor([0.25 * sigma.log() for sigma in sigmas]).to(device=device) + else: + self.timesteps = torch.from_numpy(timesteps.astype(np.float32)).to(device=device) + + self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)]) + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17 + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + s_churn: float = 0.0, + s_tmin: float = 0.0, + s_tmax: float = float("inf"), + s_noise: float = 1.0, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[EulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + s_churn (`float`): + s_tmin (`float`): + s_tmax (`float`): + s_noise (`float`, defaults to 1.0): + Scaling factor for noise added to the sample. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or + tuple. + + Returns: + [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # Upcast to avoid precision issues when computing prev_sample + sample = sample.to(torch.float32) + + sigma = self.sigmas[self.step_index] + + gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0 + + noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator + ) + + eps = noise * s_noise + sigma_hat = sigma * (gamma + 1) + + if gamma > 0: + sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5 + + # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + # NOTE: "original_sample" should not be an expected prediction_type but is left in for + # backwards compatibility + if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma_hat * model_output + elif self.config.prediction_type == "v_prediction": + # denoised = model_output * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + # 2. Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma_hat + + dt = self.sigmas[self.step_index + 1] - sigma_hat + + prev_sample = sample + derivative * dt + + # Cast sample back to model compatible dtype + prev_sample = prev_sample.to(model_output.dtype) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py new file mode 100644 index 000000000..55b0c2460 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py @@ -0,0 +1,265 @@ +# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax.numpy as jnp + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import ( + CommonSchedulerState, + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + broadcast_to_shape_from_left, +) + + +@flax.struct.dataclass +class EulerDiscreteSchedulerState: + common: CommonSchedulerState + + # setable values + init_noise_sigma: jnp.ndarray + timesteps: jnp.ndarray + sigmas: jnp.ndarray + num_inference_steps: Optional[int] = None + + @classmethod + def create( + cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray, sigmas: jnp.ndarray + ): + return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps, sigmas=sigmas) + + +@dataclass +class FlaxEulerDiscreteSchedulerOutput(FlaxSchedulerOutput): + state: EulerDiscreteSchedulerState + + +class FlaxEulerDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + Euler scheduler (Algorithm 2) from Karras et al. (2022) https://arxiv.org/abs/2206.00364. . Based on the original + k-diffusion implementation by Katherine Crowson: + https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L51 + + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`jnp.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + the `dtype` used for params and computation. 
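# Illustrative sketch of stateless usage of the Flax scheduler documented above. Assumptions:
# `FlaxEulerDiscreteScheduler` is importable from the vendored `diffusers` package (with flax
# installed) and the model is replaced by a zero-valued stand-in; only the
# create_state / set_timesteps / scale_model_input / step API defined in this file is used.
import jax
import jax.numpy as jnp
from diffusers import FlaxEulerDiscreteScheduler

scheduler = FlaxEulerDiscreteScheduler(num_train_timesteps=1000)
state = scheduler.create_state()
state = scheduler.set_timesteps(state, num_inference_steps=25, shape=(1, 4, 64, 64))
sample = jax.random.normal(jax.random.PRNGKey(0), (1, 4, 64, 64)) * state.init_noise_sigma
for t in state.timesteps:
    model_input = scheduler.scale_model_input(state, sample, t)
    model_output = jnp.zeros_like(model_input)   # stand-in for a real UNet prediction
    out = scheduler.step(state, model_output, t, sample)
    sample, state = out.prev_sample, out.state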
+ """ + + _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers] + + dtype: jnp.dtype + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[jnp.ndarray] = None, + prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + dtype: jnp.dtype = jnp.float32, + ): + self.dtype = dtype + + def create_state(self, common: Optional[CommonSchedulerState] = None) -> EulerDiscreteSchedulerState: + if common is None: + common = CommonSchedulerState.create(self) + + timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1] + sigmas = ((1 - common.alphas_cumprod) / common.alphas_cumprod) ** 0.5 + sigmas = jnp.interp(timesteps, jnp.arange(0, len(sigmas)), sigmas) + sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)]) + + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + init_noise_sigma = sigmas.max() + else: + init_noise_sigma = (sigmas.max() ** 2 + 1) ** 0.5 + + return EulerDiscreteSchedulerState.create( + common=common, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + sigmas=sigmas, + ) + + def scale_model_input(self, state: EulerDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray: + """ + Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm. + + Args: + state (`EulerDiscreteSchedulerState`): + the `FlaxEulerDiscreteScheduler` state data class instance. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + timestep (`int`): + current discrete timestep in the diffusion chain. + + Returns: + `jnp.ndarray`: scaled input sample + """ + (step_index,) = jnp.where(state.timesteps == timestep, size=1) + step_index = step_index[0] + + sigma = state.sigmas[step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + + def set_timesteps( + self, state: EulerDiscreteSchedulerState, num_inference_steps: int, shape: Tuple = () + ) -> EulerDiscreteSchedulerState: + """ + Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`EulerDiscreteSchedulerState`): + the `FlaxEulerDiscreteScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
+ """ + + if self.config.timestep_spacing == "linspace": + timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=self.dtype) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // num_inference_steps + timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += 1 + else: + raise ValueError( + f"timestep_spacing must be one of ['linspace', 'leading'], got {self.config.timestep_spacing}" + ) + + sigmas = ((1 - state.common.alphas_cumprod) / state.common.alphas_cumprod) ** 0.5 + sigmas = jnp.interp(timesteps, jnp.arange(0, len(sigmas)), sigmas) + sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)]) + + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + init_noise_sigma = sigmas.max() + else: + init_noise_sigma = (sigmas.max() ** 2 + 1) ** 0.5 + + return state.replace( + timesteps=timesteps, + sigmas=sigmas, + num_inference_steps=num_inference_steps, + init_noise_sigma=init_noise_sigma, + ) + + def step( + self, + state: EulerDiscreteSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + return_dict: bool = True, + ) -> Union[FlaxEulerDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + state (`EulerDiscreteSchedulerState`): + the `FlaxEulerDiscreteScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + order: coefficient for multi-step inference. + return_dict (`bool`): option for returning tuple rather than FlaxEulerDiscreteScheduler class + + Returns: + [`FlaxEulerDiscreteScheduler`] or `tuple`: [`FlaxEulerDiscreteScheduler`] if `return_dict` is True, + otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. + + """ + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + (step_index,) = jnp.where(state.timesteps == timestep, size=1) + step_index = step_index[0] + + sigma = state.sigmas[step_index] + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma * model_output + elif self.config.prediction_type == "v_prediction": + # * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + # 2. 
Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + + # dt = sigma_down - sigma + dt = state.sigmas[step_index + 1] - sigma + + prev_sample = sample + derivative * dt + + if not return_dict: + return (prev_sample, state) + + return FlaxEulerDiscreteSchedulerOutput(prev_sample=prev_sample, state=state) + + def add_noise( + self, + state: EulerDiscreteSchedulerState, + original_samples: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + sigma = state.sigmas[timesteps].flatten() + sigma = broadcast_to_shape_from_left(sigma, noise.shape) + + noisy_samples = original_samples + noise * sigma + + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py new file mode 100644 index 000000000..fc955ac49 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -0,0 +1,482 @@ +# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + Scheduler with Heun steps for discrete beta schedules. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
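# Illustrative sketch (not a definitive recipe): because this is a 2nd-order method,
# `set_timesteps(N)` produces 2*N - 1 schedule entries and `step` must be called once per entry;
# first- and second-order passes alternate internally. Assumes the vendored `diffusers` package
# is importable and uses a zero-valued stand-in for the model output.
import torch
from diffusers import HeunDiscreteScheduler

scheduler = HeunDiscreteScheduler()
scheduler.set_timesteps(num_inference_steps=20)
assert len(scheduler.timesteps) == 2 * 20 - 1
sample = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma
for t in scheduler.timesteps:
    model_input = scheduler.scale_model_input(sample, t)
    model_output = torch.zeros_like(model_input)   # stand-in for a real UNet prediction
    sample = scheduler.step(model_output, t, sample).prev_sample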
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 2 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + prediction_type: str = "epsilon", + use_karras_sigmas: Optional[bool] = False, + clip_sample: Optional[bool] = False, + clip_sample_range: float = 1.0, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine") + elif beta_schedule == "exp": + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="exp") + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # set all values + self.set_timesteps(num_train_timesteps, None, num_train_timesteps) + self.use_karras_sigmas = use_karras_sigmas + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. 
+ """ + self._begin_index = begin_index + + def scale_model_input( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + + def set_timesteps( + self, + num_inference_steps: int, + device: Union[str, torch.device] = None, + num_train_timesteps: Optional[int] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.config.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + sigmas = torch.from_numpy(sigmas).to(device=device) + self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) + + timesteps = torch.from_numpy(timesteps) + timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) + + self.timesteps = timesteps.to(device=device) + + # empty dt and derivative + self.prev_derivative = None + self.dt = None + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + @property + def state_in_first_order(self): + return self.dt is None + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: Union[torch.FloatTensor, np.ndarray], + timestep: Union[float, torch.FloatTensor], + sample: Union[torch.FloatTensor, np.ndarray], + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. 
This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + if self.step_index is None: + self._init_step_index(timestep) + + if self.state_in_first_order: + sigma = self.sigmas[self.step_index] + sigma_next = self.sigmas[self.step_index + 1] + else: + # 2nd order / Heun's method + sigma = self.sigmas[self.step_index - 1] + sigma_next = self.sigmas[self.step_index] + + # currently only gamma=0 is supported. This usually works best anyways. + # We can support gamma in the future but then need to scale the timestep before + # passing it to the model which requires a change in API + gamma = 0 + sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + sigma_input = sigma_hat if self.state_in_first_order else sigma_next + pred_original_sample = sample - sigma_input * model_output + elif self.config.prediction_type == "v_prediction": + sigma_input = sigma_hat if self.state_in_first_order else sigma_next + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + if self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + if self.state_in_first_order: + # 2. Convert to an ODE derivative for 1st order + derivative = (sample - pred_original_sample) / sigma_hat + # 3. delta timestep + dt = sigma_next - sigma_hat + + # store for 2nd order step + self.prev_derivative = derivative + self.dt = dt + self.sample = sample + else: + # 2. 2nd order / Heun's method + derivative = (sample - pred_original_sample) / sigma_next + derivative = (self.prev_derivative + derivative) / 2 + + # 3. 
take prev timestep & sample + dt = self.dt + sample = self.sample + + # free dt and derivative + # Note, this puts the scheduler in "first order mode" + self.prev_derivative = None + self.dt = None + self.sample = None + + prev_sample = sample + derivative * dt + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ipndm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ipndm.py new file mode 100644 index 000000000..583afa4d2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ipndm.py @@ -0,0 +1,224 @@ +# Copyright 2024 Zhejiang University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils import SchedulerMixin, SchedulerOutput + + +class IPNDMScheduler(SchedulerMixin, ConfigMixin): + """ + A fourth-order Improved Pseudo Linear Multistep scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. 
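# Illustrative sketch of driving this scheduler (assuming the import below and a zero-valued
# stand-in for the model): the scheduler keeps a running history `self.ets` of previous
# predictions and blends up to four of them with linear multistep weights, so the sampling loop
# looks the same as for the other schedulers in this directory.
import torch
from diffusers import IPNDMScheduler

scheduler = IPNDMScheduler()
scheduler.set_timesteps(num_inference_steps=50)
sample = torch.randn(1, 3, 32, 32) * scheduler.init_noise_sigma   # init_noise_sigma is 1.0 here
for t in scheduler.timesteps:
    model_output = torch.zeros_like(sample)    # stand-in for a real model prediction
    sample = scheduler.step(model_output, t, sample).prev_sample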
+ + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + """ + + order = 1 + + @register_to_config + def __init__( + self, num_train_timesteps: int = 1000, trained_betas: Optional[Union[np.ndarray, List[float]]] = None + ): + # set `betas`, `alphas`, `timesteps` + self.set_timesteps(num_train_timesteps) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # For now we only support F-PNDM, i.e. the runge-kutta method + # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf + # mainly at formula (9), (12), (13) and the Algorithm 2. + self.pndm_order = 4 + + # running values + self.ets = [] + self._step_index = None + self._begin_index = None + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + steps = torch.linspace(1, 0, num_inference_steps + 1)[:-1] + steps = torch.cat([steps, torch.tensor([0.0])]) + + if self.config.trained_betas is not None: + self.betas = torch.tensor(self.config.trained_betas, dtype=torch.float32) + else: + self.betas = torch.sin(steps * math.pi / 2) ** 2 + + self.alphas = (1.0 - self.betas**2) ** 0.5 + + timesteps = (torch.atan2(self.betas, self.alphas) / math.pi * 2)[:-1] + self.timesteps = timesteps.to(device) + + self.ets = [] + self._step_index = None + self._begin_index = None + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the linear multistep method. It performs one forward pass multiple times to approximate the solution. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + if self.step_index is None: + self._init_step_index(timestep) + + timestep_index = self.step_index + prev_timestep_index = self.step_index + 1 + + ets = sample * self.betas[timestep_index] + model_output * self.alphas[timestep_index] + self.ets.append(ets) + + if len(self.ets) == 1: + ets = self.ets[-1] + elif len(self.ets) == 2: + ets = (3 * self.ets[-1] - self.ets[-2]) / 2 + elif len(self.ets) == 3: + ets = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 + else: + ets = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) + + prev_sample = self._get_prev_sample(sample, timestep_index, prev_timestep_index, ets) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
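# Quick illustrative check (assuming the import below): unlike the sigma-based schedulers in this
# directory, this scheduler needs no input scaling, so `scale_model_input` returns its input
# unchanged and `init_noise_sigma` is 1.0.
import torch
from diffusers import IPNDMScheduler

scheduler = IPNDMScheduler()
x = torch.randn(2, 3, 8, 8)
assert torch.equal(scheduler.scale_model_input(x), x)
assert scheduler.init_noise_sigma == 1.0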
+ """ + return sample + + def _get_prev_sample(self, sample, timestep_index, prev_timestep_index, ets): + alpha = self.alphas[timestep_index] + sigma = self.betas[timestep_index] + + next_alpha = self.alphas[prev_timestep_index] + next_sigma = self.betas[prev_timestep_index] + + pred = (sample - sigma * ets) / max(alpha, 1e-8) + prev_sample = next_alpha * pred + ets * next_sigma + + return prev_sample + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py new file mode 100644 index 000000000..9521c9c95 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -0,0 +1,508 @@ +# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + KDPM2DiscreteScheduler with ancestral sampling is inspired by the DPMSolver2 and Algorithm 2 from the [Elucidating + the Design Space of Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.00085): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.012): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 2 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + use_karras_sigmas: Optional[bool] = False, + prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # set all values + self.set_timesteps(num_train_timesteps, None, num_train_timesteps) + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + if self.state_in_first_order: + sigma = self.sigmas[self.step_index] + else: + sigma = self.sigmas_interpol[self.step_index - 1] + + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + + def set_timesteps( + self, + num_inference_steps: int, + device: Union[str, torch.device] = None, + num_train_timesteps: Optional[int] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.config.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + + self.log_sigmas = torch.from_numpy(log_sigmas).to(device) + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + sigmas = torch.from_numpy(sigmas).to(device=device) + + # compute up and down sigmas + sigmas_next = sigmas.roll(-1) + sigmas_next[-1] = 0.0 + sigmas_up = (sigmas_next**2 * (sigmas**2 - sigmas_next**2) / sigmas**2) ** 0.5 + sigmas_down = (sigmas_next**2 - sigmas_up**2) ** 0.5 + sigmas_down[-1] = 0.0 + + # compute interpolated sigmas + sigmas_interpol = sigmas.log().lerp(sigmas_down.log(), 0.5).exp() + sigmas_interpol[-2:] = 0.0 + + # set sigmas + self.sigmas = torch.cat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) + self.sigmas_interpol = torch.cat( + [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]] + ) + self.sigmas_up = torch.cat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]]) + self.sigmas_down = torch.cat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]]) + + if str(device).startswith("mps"): + timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32) + else: + timesteps = torch.from_numpy(timesteps).to(device) + + sigmas_interpol = sigmas_interpol.cpu() + log_sigmas = self.log_sigmas.cpu() + timesteps_interpol = np.array( + [self._sigma_to_t(sigma_interpol, log_sigmas) for sigma_interpol in sigmas_interpol] + ) + + timesteps_interpol = torch.from_numpy(timesteps_interpol).to(device, dtype=timesteps.dtype) + interleaved_timesteps = torch.stack((timesteps_interpol[:-2, None], timesteps[1:, None]), dim=-1).flatten() + + self.timesteps = torch.cat([timesteps[:1], interleaved_timesteps]) + + self.sample = None + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - 
log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + @property + def state_in_first_order(self): + return self.sample is None + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: Union[torch.FloatTensor, np.ndarray], + timestep: Union[float, torch.FloatTensor], + sample: Union[torch.FloatTensor, np.ndarray], + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. 
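A concrete driving loop for this ancestral variant may help here (an editor's sketch, not part of the upstream file): `set_timesteps`, `scale_model_input` and `step` are called in the usual sampling order, and the `generator` argument documented above makes the `sigma_up`-scaled noise injection reproducible. The zero tensor stands in for a real UNet epsilon prediction, so the snippet runs without any model weights.

    import torch
    from diffusers import KDPM2AncestralDiscreteScheduler

    scheduler = KDPM2AncestralDiscreteScheduler()                  # upstream defaults
    scheduler.set_timesteps(num_inference_steps=20)
    generator = torch.Generator().manual_seed(0)

    sample = torch.randn(1, 4, 64, 64, generator=generator) * scheduler.init_noise_sigma
    for t in scheduler.timesteps:
        model_input = scheduler.scale_model_input(sample, t)
        noise_pred = torch.zeros_like(model_input)                 # stand-in for a UNet prediction
        sample = scheduler.step(noise_pred, t, sample, generator=generator).prev_sample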
+ + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_ddim.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + if self.step_index is None: + self._init_step_index(timestep) + + if self.state_in_first_order: + sigma = self.sigmas[self.step_index] + sigma_interpol = self.sigmas_interpol[self.step_index] + sigma_up = self.sigmas_up[self.step_index] + sigma_down = self.sigmas_down[self.step_index - 1] + else: + # 2nd order / KPDM2's method + sigma = self.sigmas[self.step_index - 1] + sigma_interpol = self.sigmas_interpol[self.step_index - 1] + sigma_up = self.sigmas_up[self.step_index - 1] + sigma_down = self.sigmas_down[self.step_index - 1] + + # currently only gamma=0 is supported. This usually works best anyways. + # We can support gamma in the future but then need to scale the timestep before + # passing it to the model which requires a change in API + gamma = 0 + sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now + + device = model_output.device + noise = randn_tensor(model_output.shape, dtype=model_output.dtype, device=device, generator=generator) + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol + pred_original_sample = sample - sigma_input * model_output + elif self.config.prediction_type == "v_prediction": + sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) + elif self.config.prediction_type == "sample": + raise NotImplementedError("prediction_type not implemented yet: sample") + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + if self.state_in_first_order: + # 2. Convert to an ODE derivative for 1st order + derivative = (sample - pred_original_sample) / sigma_hat + # 3. delta timestep + dt = sigma_interpol - sigma_hat + + # store for 2nd order step + self.sample = sample + self.dt = dt + prev_sample = sample + derivative * dt + else: + # DPM-Solver-2 + # 2. Convert to an ODE derivative for 2nd order + derivative = (sample - pred_original_sample) / sigma_interpol + # 3. 
delta timestep + dt = sigma_down - sigma_hat + + sample = self.sample + self.sample = None + + prev_sample = sample + derivative * dt + prev_sample = prev_sample + noise * sigma_up + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py new file mode 100644 index 000000000..5be07b6da --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -0,0 +1,483 @@ +# Copyright 2024 Katherine Crowson, The HuggingFace Team and hlky. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. 
+ + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + KDPM2DiscreteScheduler is inspired by the DPMSolver2 and Algorithm 2 from the [Elucidating the Design Space of + Diffusion-Based Generative Models](https://huggingface.co/papers/2206.00364) paper. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.00085): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.012): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
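A minimal usage sketch (editorial, not part of the patch): like the other Karras-style schedulers, this class is normally swapped into an existing pipeline by re-using that pipeline's scheduler config, so the beta and timestep settings described above stay consistent. The checkpoint id below is a placeholder for whatever Stable Diffusion weights are actually available.

    import torch
    from diffusers import StableDiffusionPipeline, KDPM2DiscreteScheduler

    pipe = StableDiffusionPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2-1",                        # placeholder checkpoint
        torch_dtype=torch.float16,
    ).to("cuda")
    pipe.scheduler = KDPM2DiscreteScheduler.from_config(pipe.scheduler.config)
    image = pipe("a pokemon with blue eyes", num_inference_steps=30).images[0]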
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 2 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, # sensible defaults + beta_end: float = 0.012, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + use_karras_sigmas: Optional[bool] = False, + prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # set all values + self.set_timesteps(num_train_timesteps, None, num_train_timesteps) + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, + sample: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + if self.step_index is None: + self._init_step_index(timestep) + + if self.state_in_first_order: + sigma = self.sigmas[self.step_index] + else: + sigma = self.sigmas_interpol[self.step_index] + + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + + def set_timesteps( + self, + num_inference_steps: int, + device: Union[str, torch.device] = None, + num_train_timesteps: Optional[int] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. 
+ device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.config.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + + self.log_sigmas = torch.from_numpy(log_sigmas).to(device=device) + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + sigmas = torch.from_numpy(sigmas).to(device=device) + + # interpolate sigmas + sigmas_interpol = sigmas.log().lerp(sigmas.roll(1).log(), 0.5).exp() + + self.sigmas = torch.cat([sigmas[:1], sigmas[1:].repeat_interleave(2), sigmas[-1:]]) + self.sigmas_interpol = torch.cat( + [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]] + ) + + timesteps = torch.from_numpy(timesteps).to(device) + + # interpolate timesteps + sigmas_interpol = sigmas_interpol.cpu() + log_sigmas = self.log_sigmas.cpu() + timesteps_interpol = np.array( + [self._sigma_to_t(sigma_interpol, log_sigmas) for sigma_interpol in sigmas_interpol] + ) + timesteps_interpol = torch.from_numpy(timesteps_interpol).to(device, dtype=timesteps.dtype) + interleaved_timesteps = torch.stack((timesteps_interpol[1:-1, None], timesteps[1:, None]), dim=-1).flatten() + + self.timesteps = torch.cat([timesteps[:1], interleaved_timesteps]) + + self.sample = None + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def state_in_first_order(self): + return self.sample is None + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index 
if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def step( + self, + model_output: Union[torch.FloatTensor, np.ndarray], + timestep: Union[float, torch.FloatTensor], + sample: Union[torch.FloatTensor, np.ndarray], + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. 
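One practical consequence of `order = 2` is worth spelling out (editor's note): the schedule built in `set_timesteps` interleaves interpolated midpoints, so the denoising model is evaluated roughly twice per requested step. A quick check, assuming the upstream defaults:

    from diffusers import KDPM2DiscreteScheduler

    scheduler = KDPM2DiscreteScheduler()
    scheduler.set_timesteps(num_inference_steps=25)
    # The first timestep is kept once; every later one is paired with a midpoint.
    print(len(scheduler.timesteps))                                # 49 == 2 * 25 - 1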
+ """ + if self.step_index is None: + self._init_step_index(timestep) + + if self.state_in_first_order: + sigma = self.sigmas[self.step_index] + sigma_interpol = self.sigmas_interpol[self.step_index + 1] + sigma_next = self.sigmas[self.step_index + 1] + else: + # 2nd order / KDPM2's method + sigma = self.sigmas[self.step_index - 1] + sigma_interpol = self.sigmas_interpol[self.step_index] + sigma_next = self.sigmas[self.step_index] + + # currently only gamma=0 is supported. This usually works best anyways. + # We can support gamma in the future but then need to scale the timestep before + # passing it to the model which requires a change in API + gamma = 0 + sigma_hat = sigma * (gamma + 1) # Note: sigma_hat == sigma for now + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol + pred_original_sample = sample - sigma_input * model_output + elif self.config.prediction_type == "v_prediction": + sigma_input = sigma_hat if self.state_in_first_order else sigma_interpol + pred_original_sample = model_output * (-sigma_input / (sigma_input**2 + 1) ** 0.5) + ( + sample / (sigma_input**2 + 1) + ) + elif self.config.prediction_type == "sample": + raise NotImplementedError("prediction_type not implemented yet: sample") + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + if self.state_in_first_order: + # 2. Convert to an ODE derivative for 1st order + derivative = (sample - pred_original_sample) / sigma_hat + # 3. delta timestep + dt = sigma_interpol - sigma_hat + + # store for 2nd order step + self.sample = sample + else: + # DPM-Solver-2 + # 2. Convert to an ODE derivative for 2nd order + derivative = (sample - pred_original_sample) / sigma_interpol + + # 3. 
delta timestep + dt = sigma_next - sigma_hat + + sample = self.sample + self.sample = None + + # upon completion increase step index by one + self._step_index += 1 + + prev_sample = sample + derivative * dt + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py new file mode 100644 index 000000000..4d099604a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py @@ -0,0 +1,238 @@ +# Copyright 2024 NVIDIA and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax +import jax.numpy as jnp +from jax import random + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from .scheduling_utils_flax import FlaxSchedulerMixin + + +@flax.struct.dataclass +class KarrasVeSchedulerState: + # setable values + num_inference_steps: Optional[int] = None + timesteps: Optional[jnp.ndarray] = None + schedule: Optional[jnp.ndarray] = None # sigma(t_i) + + @classmethod + def create(cls): + return cls() + + +@dataclass +class FlaxKarrasVeOutput(BaseOutput): + """ + Output class for the scheduler's step function output. 
+ + Args: + prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + derivative (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Derivative of predicted original image sample (x_0). + state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. + """ + + prev_sample: jnp.ndarray + derivative: jnp.ndarray + state: KarrasVeSchedulerState + + +class FlaxKarrasVeScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + Stochastic sampling from Karras et al. [1] tailored to the Variance-Expanding (VE) models [2]. Use Algorithm 2 and + the VE column of Table 1 from [1] for reference. + + [1] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based Generative Models." + https://arxiv.org/abs/2206.00364 [2] Song, Yang, et al. "Score-based generative modeling through stochastic + differential equations." https://arxiv.org/abs/2011.13456 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details on the parameters, see the original paper's Appendix E.: "Elucidating the Design Space of + Diffusion-Based Generative Models." https://arxiv.org/abs/2206.00364. The grid search values used to find the + optimal {s_noise, s_churn, s_min, s_max} for a specific model are described in Table 5 of the paper. + + Args: + sigma_min (`float`): minimum noise magnitude + sigma_max (`float`): maximum noise magnitude + s_noise (`float`): the amount of additional noise to counteract loss of detail during sampling. + A reasonable range is [1.000, 1.011]. + s_churn (`float`): the parameter controlling the overall amount of stochasticity. + A reasonable range is [0, 100]. + s_min (`float`): the start value of the sigma range where we add noise (enable stochasticity). + A reasonable range is [0, 10]. + s_max (`float`): the end value of the sigma range where we add noise. + A reasonable range is [0.2, 80]. + """ + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + sigma_min: float = 0.02, + sigma_max: float = 100, + s_noise: float = 1.007, + s_churn: float = 80, + s_min: float = 0.05, + s_max: float = 50, + ): + pass + + def create_state(self): + return KarrasVeSchedulerState.create() + + def set_timesteps( + self, state: KarrasVeSchedulerState, num_inference_steps: int, shape: Tuple = () + ) -> KarrasVeSchedulerState: + """ + Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`KarrasVeSchedulerState`): + the `FlaxKarrasVeScheduler` state data class. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
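Unlike the PyTorch schedulers above, the Flax variant keeps every mutable value in an explicit state object and returns a new state from each call. A brief sketch of that pattern (editorial, assuming jax/flax are installed and the class is exported as in upstream diffusers):

    from diffusers import FlaxKarrasVeScheduler

    scheduler = FlaxKarrasVeScheduler(sigma_min=0.02, sigma_max=100.0)
    state = scheduler.create_state()
    # set_timesteps does not mutate the scheduler; it returns an updated state.
    state = scheduler.set_timesteps(state, num_inference_steps=50)
    print(state.timesteps.shape, state.schedule.shape)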
+ + """ + timesteps = jnp.arange(0, num_inference_steps)[::-1].copy() + schedule = [ + ( + self.config.sigma_max**2 + * (self.config.sigma_min**2 / self.config.sigma_max**2) ** (i / (num_inference_steps - 1)) + ) + for i in timesteps + ] + + return state.replace( + num_inference_steps=num_inference_steps, + schedule=jnp.array(schedule, dtype=jnp.float32), + timesteps=timesteps, + ) + + def add_noise_to_input( + self, + state: KarrasVeSchedulerState, + sample: jnp.ndarray, + sigma: float, + key: jax.Array, + ) -> Tuple[jnp.ndarray, float]: + """ + Explicit Langevin-like "churn" step of adding noise to the sample according to a factor gamma_i ≥ 0 to reach a + higher noise level sigma_hat = sigma_i + gamma_i*sigma_i. + + TODO Args: + """ + if self.config.s_min <= sigma <= self.config.s_max: + gamma = min(self.config.s_churn / state.num_inference_steps, 2**0.5 - 1) + else: + gamma = 0 + + # sample eps ~ N(0, S_noise^2 * I) + key = random.split(key, num=1) + eps = self.config.s_noise * random.normal(key=key, shape=sample.shape) + sigma_hat = sigma + gamma * sigma + sample_hat = sample + ((sigma_hat**2 - sigma**2) ** 0.5 * eps) + + return sample_hat, sigma_hat + + def step( + self, + state: KarrasVeSchedulerState, + model_output: jnp.ndarray, + sigma_hat: float, + sigma_prev: float, + sample_hat: jnp.ndarray, + return_dict: bool = True, + ) -> Union[FlaxKarrasVeOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. + model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. + sigma_hat (`float`): TODO + sigma_prev (`float`): TODO + sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO + return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class + + Returns: + [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] or `tuple`: Updated sample in the diffusion + chain and derivative. [`~schedulers.scheduling_karras_ve_flax.FlaxKarrasVeOutput`] if `return_dict` is + True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. + """ + + pred_original_sample = sample_hat + sigma_hat * model_output + derivative = (sample_hat - pred_original_sample) / sigma_hat + sample_prev = sample_hat + (sigma_prev - sigma_hat) * derivative + + if not return_dict: + return (sample_prev, derivative, state) + + return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state) + + def step_correct( + self, + state: KarrasVeSchedulerState, + model_output: jnp.ndarray, + sigma_hat: float, + sigma_prev: float, + sample_hat: jnp.ndarray, + sample_prev: jnp.ndarray, + derivative: jnp.ndarray, + return_dict: bool = True, + ) -> Union[FlaxKarrasVeOutput, Tuple]: + """ + Correct the predicted sample based on the output model_output of the network. TODO complete description + + Args: + state (`KarrasVeSchedulerState`): the `FlaxKarrasVeScheduler` state data class. + model_output (`torch.FloatTensor` or `np.ndarray`): direct output from learned diffusion model. 
+ sigma_hat (`float`): TODO + sigma_prev (`float`): TODO + sample_hat (`torch.FloatTensor` or `np.ndarray`): TODO + sample_prev (`torch.FloatTensor` or `np.ndarray`): TODO + derivative (`torch.FloatTensor` or `np.ndarray`): TODO + return_dict (`bool`): option for returning tuple rather than FlaxKarrasVeOutput class + + Returns: + prev_sample (TODO): updated sample in the diffusion chain. derivative (TODO): TODO + + """ + pred_original_sample = sample_prev + sigma_prev * model_output + derivative_corr = (sample_prev - pred_original_sample) / sigma_prev + sample_prev = sample_hat + (sigma_prev - sigma_hat) * (0.5 * derivative + 0.5 * derivative_corr) + + if not return_dict: + return (sample_prev, derivative, state) + + return FlaxKarrasVeOutput(prev_sample=sample_prev, derivative=derivative, state=state) + + def add_noise(self, state: KarrasVeSchedulerState, original_samples, noise, timesteps): + raise NotImplementedError() diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lcm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lcm.py new file mode 100644 index 000000000..846558b38 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lcm.py @@ -0,0 +1,660 @@ +# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class LCMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. 
+ """ + + prev_sample: torch.FloatTensor + denoised: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class LCMScheduler(SchedulerMixin, ConfigMixin): + """ + `LCMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with + non-Markovian guidance. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. [`~ConfigMixin`] takes care of storing all config + attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be + accessed via `scheduler.config.num_train_timesteps`. [`SchedulerMixin`] provides general loading and saving + functionality via the [`SchedulerMixin.save_pretrained`] and [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. 
+ beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + original_inference_steps (`int`, *optional*, defaults to 50): + The default number of inference steps used to generate a linearly-spaced timestep schedule, from which we + will ultimately take `num_inference_steps` evenly spaced timesteps to form the final timestep schedule. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + timestep_scaling (`float`, defaults to 10.0): + The factor the timesteps will be multiplied by when calculating the consistency model boundary conditions + `c_skip` and `c_out`. Increasing this will decrease the approximation error (although the approximation + error at the default of `10.0` is already pretty small). + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
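A short usage sketch (editorial, not part of the patch): this scheduler is meant to be paired with a consistency-distilled (LCM) UNet and very few inference steps; a vanilla Stable Diffusion checkpoint will not produce good 4-step samples. The checkpoint path is a placeholder. With the defaults above (1000 training timesteps, `original_inference_steps=50`), a 4-step schedule resolves to the timesteps [999, 759, 499, 259].

    import torch
    from diffusers import StableDiffusionPipeline, LCMScheduler

    pipe = StableDiffusionPipeline.from_pretrained(
        "path/to/lcm-distilled-checkpoint",                        # placeholder
        torch_dtype=torch.float16,
    ).to("cuda")
    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
    image = pipe("a pokemon with blue eyes", num_inference_steps=4, guidance_scale=1.0).images[0]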
+ """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.012, + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + original_inference_steps: int = 50, + clip_sample: bool = False, + clip_sample_range: float = 1.0, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + timestep_scaling: float = 10.0, + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + self.custom_timesteps = False + + self._step_index = None + self._begin_index = None + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + @property + def step_index(self): + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 
+ """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + original_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + strength: int = 1.0, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`, *optional*): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + original_inference_steps (`int`, *optional*): + The original number of inference steps, which will be used to generate a linearly-spaced timestep + schedule (which is different from the standard `diffusers` implementation). 
We will then take + `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as + our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps on the training/distillation timestep + schedule is used. If `timesteps` is passed, `num_inference_steps` must be `None`. + """ + # 0. Check inputs + if num_inference_steps is None and timesteps is None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.") + + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + # 1. Calculate the LCM original training/distillation timestep schedule. + original_steps = ( + original_inference_steps if original_inference_steps is not None else self.config.original_inference_steps + ) + + if original_steps > self.config.num_train_timesteps: + raise ValueError( + f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + # LCM Timesteps Setting + # The skipping step parameter k from the paper. + k = self.config.num_train_timesteps // original_steps + # LCM Training/Distillation Steps Schedule + # Currently, only a linearly-spaced schedule is supported (same as in the LCM distillation scripts). + lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * k - 1 + + # 2. Calculate the LCM inference timestep schedule. + if timesteps is not None: + # 2.1 Handle custom timestep schedules. + train_timesteps = set(lcm_origin_timesteps) + non_train_timesteps = [] + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[i] not in train_timesteps: + non_train_timesteps.append(timesteps[i]) + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + # Raise warning if timestep schedule does not start with self.config.num_train_timesteps - 1 + if strength == 1.0 and timesteps[0] != self.config.num_train_timesteps - 1: + logger.warning( + f"The first timestep on the custom timestep schedule is {timesteps[0]}, not" + f" `self.config.num_train_timesteps - 1`: {self.config.num_train_timesteps - 1}. You may get" + f" unexpected results when using this timestep schedule." + ) + + # Raise warning if custom timestep schedule contains timesteps not on original timestep schedule + if non_train_timesteps: + logger.warning( + f"The custom timestep schedule contains the following timesteps which are not on the original" + f" training/distillation timestep schedule: {non_train_timesteps}. You may get unexpected results" + f" when using this timestep schedule." 
+ ) + + # Raise warning if custom timestep schedule is longer than original_steps + if len(timesteps) > original_steps: + logger.warning( + f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the" + f" the length of the timestep schedule used for training: {original_steps}. You may get some" + f" unexpected results when using this timestep schedule." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.num_inference_steps = len(timesteps) + self.custom_timesteps = True + + # Apply strength (e.g. for img2img pipelines) (see StableDiffusionImg2ImgPipeline.get_timesteps) + init_timestep = min(int(self.num_inference_steps * strength), self.num_inference_steps) + t_start = max(self.num_inference_steps - init_timestep, 0) + timesteps = timesteps[t_start * self.order :] + # TODO: also reset self.num_inference_steps? + else: + # 2.2 Create the "standard" LCM inference timestep schedule. + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + skipping_step = len(lcm_origin_timesteps) // num_inference_steps + + if skipping_step < 1: + raise ValueError( + f"The combination of `original_steps x strength`: {original_steps} x {strength} is smaller than `num_inference_steps`: {num_inference_steps}. Make sure to either reduce `num_inference_steps` to a value smaller than {int(original_steps * strength)} or increase `strength` to a value higher than {float(num_inference_steps / original_steps)}." + ) + + self.num_inference_steps = num_inference_steps + + if num_inference_steps > original_steps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:" + f" {original_steps} because the final timestep schedule will be a subset of the" + f" `original_inference_steps`-sized initial timestep schedule." + ) + + # LCM Inference Steps Schedule + lcm_origin_timesteps = lcm_origin_timesteps[::-1].copy() + # Select (approximately) evenly spaced indices from lcm_origin_timesteps. + inference_indices = np.linspace(0, len(lcm_origin_timesteps), num=num_inference_steps, endpoint=False) + inference_indices = np.floor(inference_indices).astype(np.int64) + timesteps = lcm_origin_timesteps[inference_indices] + + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.long) + + self._step_index = None + self._begin_index = None + + def get_scalings_for_boundary_condition_discrete(self, timestep): + self.sigma_data = 0.5 # Default: 0.5 + scaled_timestep = timestep * self.config.timestep_scaling + + c_skip = self.sigma_data**2 / (scaled_timestep**2 + self.sigma_data**2) + c_out = scaled_timestep / (scaled_timestep**2 + self.sigma_data**2) ** 0.5 + return c_skip, c_out + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[LCMSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. 
+ timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] or `tuple`. + Returns: + [`~schedulers.scheduling_utils.LCMSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_lcm.LCMSchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + # 1. get previous step value + prev_step_index = self.step_index + 1 + if prev_step_index < len(self.timesteps): + prev_timestep = self.timesteps[prev_step_index] + else: + prev_timestep = timestep + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + # 3. Get scalings for boundary conditions + c_skip, c_out = self.get_scalings_for_boundary_condition_discrete(timestep) + + # 4. Compute the predicted original sample x_0 based on the model parameterization + if self.config.prediction_type == "epsilon": # noise-prediction + predicted_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt() + elif self.config.prediction_type == "sample": # x-prediction + predicted_original_sample = model_output + elif self.config.prediction_type == "v_prediction": # v-prediction + predicted_original_sample = alpha_prod_t.sqrt() * sample - beta_prod_t.sqrt() * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for `LCMScheduler`." + ) + + # 5. Clip or threshold "predicted x_0" + if self.config.thresholding: + predicted_original_sample = self._threshold_sample(predicted_original_sample) + elif self.config.clip_sample: + predicted_original_sample = predicted_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 6. Denoise model output using boundary conditions + denoised = c_out * predicted_original_sample + c_skip * sample + + # 7. Sample and inject noise z ~ N(0, I) for MultiStep Inference + # Noise is not used on the final timestep of the timestep schedule. + # This also means that noise is not used for one-step sampling. 
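+        # Concretely (a restatement of the update performed below): at every step except the last,
+        # the predicted clean sample is re-noised to the previous timestep's noise level,
+        #     x_prev = sqrt(alpha_prod_t_prev) * denoised + sqrt(1 - alpha_prod_t_prev) * z,   z ~ N(0, I),
+        # while the final step returns the denoised prediction as-is.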
+ if self.step_index != self.num_inference_steps - 1: + noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=denoised.dtype + ) + prev_sample = alpha_prod_t_prev.sqrt() * denoised + beta_prod_t_prev.sqrt() * noise + else: + prev_sample = denoised + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample, denoised) + + return LCMSchedulerOutput(prev_sample=prev_sample, denoised=denoised) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py new file mode 100644 index 000000000..43a0ba4a2 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -0,0 +1,475 @@ +# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from scipy import integrate + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->LMSDiscrete +class LMSDiscreteSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): + """ + A linear multistep scheduler for discrete beta schedules. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + use_karras_sigmas: Optional[bool] = False, + prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
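+            # i.e. interpolate linearly in sqrt(beta) space between sqrt(beta_start) and
+            # sqrt(beta_end), then square; this is the beta schedule used to train the
+            # latent diffusion (Stable Diffusion) checkpoints.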
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) + self.sigmas = torch.from_numpy(sigmas) + + # setable values + self.num_inference_steps = None + self.use_karras_sigmas = use_karras_sigmas + self.set_timesteps(num_train_timesteps, None) + self.derivatives = [] + self.is_scale_input_called = False + + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`float` or `torch.FloatTensor`): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + self.is_scale_input_called = True + return sample + + def get_lms_coefficient(self, order, t, current_order): + """ + Compute the linear multistep coefficient. + + Args: + order (): + t (): + current_order (): + """ + + def lms_derivative(tau): + prod = 1.0 + for k in range(order): + if current_order == k: + continue + prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k]) + return prod + + integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0] + + return integrated_coeff + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. 
If `None`, the timesteps are not moved. + """ + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=np.float32)[ + ::-1 + ].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.float32) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(np.float32) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + + if self.use_karras_sigmas: + sigmas = self._convert_to_karras(in_sigmas=sigmas) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) + + sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas).to(device=device) + self.timesteps = torch.from_numpy(timesteps).to(device=device) + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + self.derivatives = [] + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + # copied from diffusers.schedulers.scheduling_euler_discrete._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # copied from diffusers.schedulers.scheduling_euler_discrete._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + sigma_min: float = in_sigmas[-1].item() + sigma_max: float = in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, self.num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + order: int = 4, + return_dict: bool = True, + ) -> Union[LMSDiscreteSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`float` or `torch.FloatTensor`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + order (`int`, defaults to 4): + The order of the linear multistep method. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if not self.is_scale_input_called: + warnings.warn( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." + ) + + if self.step_index is None: + self._init_step_index(timestep) + + sigma = self.sigmas[self.step_index] + + # 1. 
compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma * model_output + elif self.config.prediction_type == "v_prediction": + # * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + # 2. Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + self.derivatives.append(derivative) + if len(self.derivatives) > order: + self.derivatives.pop(0) + + # 3. Compute linear multistep coefficients + order = min(self.step_index + 1, order) + lms_coeffs = [self.get_lms_coefficient(order, self.step_index, curr_order) for curr_order in range(order)] + + # 4. Compute previous sample based on the derivatives path + prev_sample = sample + sum( + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(self.derivatives)) + ) + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return LMSDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py new file mode 100644 index 000000000..f1169cc90 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py @@ -0,0 +1,283 @@ +# Copyright 2024 Katherine Crowson and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax.numpy as jnp +from scipy import integrate + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import ( + CommonSchedulerState, + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + broadcast_to_shape_from_left, +) + + +@flax.struct.dataclass +class LMSDiscreteSchedulerState: + common: CommonSchedulerState + + # setable values + init_noise_sigma: jnp.ndarray + timesteps: jnp.ndarray + sigmas: jnp.ndarray + num_inference_steps: Optional[int] = None + + # running values + derivatives: Optional[jnp.ndarray] = None + + @classmethod + def create( + cls, common: CommonSchedulerState, init_noise_sigma: jnp.ndarray, timesteps: jnp.ndarray, sigmas: jnp.ndarray + ): + return cls(common=common, init_noise_sigma=init_noise_sigma, timesteps=timesteps, sigmas=sigmas) + + +@dataclass +class FlaxLMSSchedulerOutput(FlaxSchedulerOutput): + state: LMSDiscreteSchedulerState + + +class FlaxLMSDiscreteScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + Linear Multistep Scheduler for discrete beta schedules. Based on the original k-diffusion implementation by + Katherine Crowson: + https://github.com/crowsonkb/k-diffusion/blob/481677d114f6ea445aa009cf5bd7a9cdee909e47/k_diffusion/sampling.py#L181 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear` or `scaled_linear`. + trained_betas (`jnp.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + the `dtype` used for params and computation. 
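+
+    Example (a minimal sketch; the zero latent and its shape are only illustrative):
+
+        >>> import jax.numpy as jnp
+        >>> scheduler = FlaxLMSDiscreteScheduler(beta_schedule="scaled_linear")
+        >>> state = scheduler.create_state()
+        >>> state = scheduler.set_timesteps(state, num_inference_steps=50, shape=(1, 4, 64, 64))
+        >>> sample = jnp.zeros((1, 4, 64, 64))
+        >>> sample = scheduler.scale_model_input(state, sample, timestep=state.timesteps[0])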
+ """ + + _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers] + + dtype: jnp.dtype + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[jnp.ndarray] = None, + prediction_type: str = "epsilon", + dtype: jnp.dtype = jnp.float32, + ): + self.dtype = dtype + + def create_state(self, common: Optional[CommonSchedulerState] = None) -> LMSDiscreteSchedulerState: + if common is None: + common = CommonSchedulerState.create(self) + + timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1] + sigmas = ((1 - common.alphas_cumprod) / common.alphas_cumprod) ** 0.5 + + # standard deviation of the initial noise distribution + init_noise_sigma = sigmas.max() + + return LMSDiscreteSchedulerState.create( + common=common, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + sigmas=sigmas, + ) + + def scale_model_input(self, state: LMSDiscreteSchedulerState, sample: jnp.ndarray, timestep: int) -> jnp.ndarray: + """ + Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the K-LMS algorithm. + + Args: + state (`LMSDiscreteSchedulerState`): + the `FlaxLMSDiscreteScheduler` state data class instance. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + timestep (`int`): + current discrete timestep in the diffusion chain. + + Returns: + `jnp.ndarray`: scaled input sample + """ + (step_index,) = jnp.where(state.timesteps == timestep, size=1) + step_index = step_index[0] + + sigma = state.sigmas[step_index] + sample = sample / ((sigma**2 + 1) ** 0.5) + return sample + + def get_lms_coefficient(self, state: LMSDiscreteSchedulerState, order, t, current_order): + """ + Compute a linear multistep coefficient. + + Args: + order (TODO): + t (TODO): + current_order (TODO): + """ + + def lms_derivative(tau): + prod = 1.0 + for k in range(order): + if current_order == k: + continue + prod *= (tau - state.sigmas[t - k]) / (state.sigmas[t - current_order] - state.sigmas[t - k]) + return prod + + integrated_coeff = integrate.quad(lms_derivative, state.sigmas[t], state.sigmas[t + 1], epsrel=1e-4)[0] + + return integrated_coeff + + def set_timesteps( + self, state: LMSDiscreteSchedulerState, num_inference_steps: int, shape: Tuple = () + ) -> LMSDiscreteSchedulerState: + """ + Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`LMSDiscreteSchedulerState`): + the `FlaxLMSDiscreteScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
+ """ + + timesteps = jnp.linspace(self.config.num_train_timesteps - 1, 0, num_inference_steps, dtype=self.dtype) + + low_idx = jnp.floor(timesteps).astype(jnp.int32) + high_idx = jnp.ceil(timesteps).astype(jnp.int32) + + frac = jnp.mod(timesteps, 1.0) + + sigmas = ((1 - state.common.alphas_cumprod) / state.common.alphas_cumprod) ** 0.5 + sigmas = (1 - frac) * sigmas[low_idx] + frac * sigmas[high_idx] + sigmas = jnp.concatenate([sigmas, jnp.array([0.0], dtype=self.dtype)]) + + timesteps = timesteps.astype(jnp.int32) + + # initial running values + derivatives = jnp.zeros((0,) + shape, dtype=self.dtype) + + return state.replace( + timesteps=timesteps, + sigmas=sigmas, + num_inference_steps=num_inference_steps, + derivatives=derivatives, + ) + + def step( + self, + state: LMSDiscreteSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + order: int = 4, + return_dict: bool = True, + ) -> Union[FlaxLMSSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + state (`LMSDiscreteSchedulerState`): the `FlaxLMSDiscreteScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + order: coefficient for multi-step inference. + return_dict (`bool`): option for returning tuple rather than FlaxLMSSchedulerOutput class + + Returns: + [`FlaxLMSSchedulerOutput`] or `tuple`: [`FlaxLMSSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. + + """ + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + sigma = state.sigmas[timestep] + + # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise + if self.config.prediction_type == "epsilon": + pred_original_sample = sample - sigma * model_output + elif self.config.prediction_type == "v_prediction": + # * c_out + input * c_skip + pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1)) + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" + ) + + # 2. Convert to an ODE derivative + derivative = (sample - pred_original_sample) / sigma + state = state.replace(derivatives=jnp.append(state.derivatives, derivative)) + if len(state.derivatives) > order: + state = state.replace(derivatives=jnp.delete(state.derivatives, 0)) + + # 3. Compute linear multistep coefficients + order = min(timestep + 1, order) + lms_coeffs = [self.get_lms_coefficient(state, order, timestep, curr_order) for curr_order in range(order)] + + # 4. 
Compute previous sample based on the derivatives path + prev_sample = sample + sum( + coeff * derivative for coeff, derivative in zip(lms_coeffs, reversed(state.derivatives)) + ) + + if not return_dict: + return (prev_sample, state) + + return FlaxLMSSchedulerOutput(prev_sample=prev_sample, state=state) + + def add_noise( + self, + state: LMSDiscreteSchedulerState, + original_samples: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + sigma = state.sigmas[timesteps].flatten() + sigma = broadcast_to_shape_from_left(sigma, noise.shape) + + noisy_samples = original_samples + noise * sigma + + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm.py new file mode 100644 index 000000000..a8f8b0971 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm.py @@ -0,0 +1,476 @@ +# Copyright 2024 Zhejiang University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class PNDMScheduler(SchedulerMixin, ConfigMixin): + """ + `PNDMScheduler` uses pseudo numerical methods for diffusion models such as the Runge-Kutta and linear multi-step + method. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + skip_prk_steps (`bool`, defaults to `False`): + Allows the scheduler to skip the Runge-Kutta steps defined in the original paper as being required before + PLMS steps. + set_alpha_to_one (`bool`, defaults to `False`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process) + or `v_prediction` (see section 2.4 of [Imagen Video](https://imagen.research.google/video/paper.pdf) + paper). + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
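+
+    Example (a minimal sketch; the zeroed model output stands in for a real diffusion-model prediction):
+
+        >>> import torch
+        >>> scheduler = PNDMScheduler(beta_schedule="scaled_linear", skip_prk_steps=True)
+        >>> scheduler.set_timesteps(50)
+        >>> sample = torch.randn(1, 4, 64, 64)
+        >>> for t in scheduler.timesteps:
+        ...     model_output = torch.zeros_like(sample)  # stand-in for the model's noise prediction
+        ...     sample = scheduler.step(model_output, t, sample).prev_sample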
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + skip_prk_steps: bool = False, + set_alpha_to_one: bool = False, + prediction_type: str = "epsilon", + timestep_spacing: str = "leading", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # For now we only support F-PNDM, i.e. the runge-kutta method + # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf + # mainly at formula (9), (12), (13) and the Algorithm 2. + self.pndm_order = 4 + + # running values + self.cur_model_output = 0 + self.counter = 0 + self.cur_sample = None + self.ets = [] + + # setable values + self.num_inference_steps = None + self._timesteps = np.arange(0, num_train_timesteps)[::-1].copy() + self.prk_timesteps = None + self.plms_timesteps = None + self.timesteps = None + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + + self.num_inference_steps = num_inference_steps + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + self._timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps).round().astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() + self._timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + self._timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio))[::-1].astype( + np.int64 + ) + self._timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + if self.config.skip_prk_steps: + # for some models like stable diffusion the prk steps can/should be skipped to + # produce better results. When using PNDM with `self.config.skip_prk_steps` the implementation + # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 + self.prk_timesteps = np.array([]) + self.plms_timesteps = np.concatenate([self._timesteps[:-1], self._timesteps[-2:-1], self._timesteps[-1:]])[ + ::-1 + ].copy() + else: + prk_timesteps = np.array(self._timesteps[-self.pndm_order :]).repeat(2) + np.tile( + np.array([0, self.config.num_train_timesteps // num_inference_steps // 2]), self.pndm_order + ) + self.prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1].copy() + self.plms_timesteps = self._timesteps[:-3][ + ::-1 + ].copy() # we copy to avoid having negative strides which are not supported by torch.from_numpy + + timesteps = np.concatenate([self.prk_timesteps, self.plms_timesteps]).astype(np.int64) + self.timesteps = torch.from_numpy(timesteps).to(device) + + self.ets = [] + self.counter = 0 + self.cur_model_output = 0 + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise), and calls [`~PNDMScheduler.step_prk`] + or [`~PNDMScheduler.step_plms`] depending on the internal variable `counter`. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. 
+ + """ + if self.counter < len(self.prk_timesteps) and not self.config.skip_prk_steps: + return self.step_prk(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict) + else: + return self.step_plms(model_output=model_output, timestep=timestep, sample=sample, return_dict=return_dict) + + def step_prk( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the Runge-Kutta method. It performs four forward passes to approximate the solution to the differential + equation. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + diff_to_prev = 0 if self.counter % 2 else self.config.num_train_timesteps // self.num_inference_steps // 2 + prev_timestep = timestep - diff_to_prev + timestep = self.prk_timesteps[self.counter // 4 * 4] + + if self.counter % 4 == 0: + self.cur_model_output += 1 / 6 * model_output + self.ets.append(model_output) + self.cur_sample = sample + elif (self.counter - 1) % 4 == 0: + self.cur_model_output += 1 / 3 * model_output + elif (self.counter - 2) % 4 == 0: + self.cur_model_output += 1 / 3 * model_output + elif (self.counter - 3) % 4 == 0: + model_output = self.cur_model_output + 1 / 6 * model_output + self.cur_model_output = 0 + + # cur_sample should not be `None` + cur_sample = self.cur_sample if self.cur_sample is not None else sample + + prev_sample = self._get_prev_sample(cur_sample, timestep, prev_timestep, model_output) + self.counter += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def step_plms( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the linear multistep method. It performs one forward pass multiple times to approximate the solution. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or tuple. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. 
+ + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if not self.config.skip_prk_steps and len(self.ets) < 3: + raise ValueError( + f"{self.__class__} can only be run AFTER scheduler has been run " + "in 'prk' mode for at least 12 iterations " + "See: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py " + "for more information." + ) + + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + if self.counter != 1: + self.ets = self.ets[-3:] + self.ets.append(model_output) + else: + prev_timestep = timestep + timestep = timestep + self.config.num_train_timesteps // self.num_inference_steps + + if len(self.ets) == 1 and self.counter == 0: + model_output = model_output + self.cur_sample = sample + elif len(self.ets) == 1 and self.counter == 1: + model_output = (model_output + self.ets[-1]) / 2 + sample = self.cur_sample + self.cur_sample = None + elif len(self.ets) == 2: + model_output = (3 * self.ets[-1] - self.ets[-2]) / 2 + elif len(self.ets) == 3: + model_output = (23 * self.ets[-1] - 16 * self.ets[-2] + 5 * self.ets[-3]) / 12 + else: + model_output = (1 / 24) * (55 * self.ets[-1] - 59 * self.ets[-2] + 37 * self.ets[-3] - 9 * self.ets[-4]) + + prev_sample = self._get_prev_sample(sample, timestep, prev_timestep, model_output) + self.counter += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
+ """ + return sample + + def _get_prev_sample(self, sample, timestep, prev_timestep, model_output): + # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf + # this function computes x_(t−δ) using the formula of (9) + # Note that x_t needs to be added to both sides of the equation + + # Notation ( -> + # alpha_prod_t -> α_t + # alpha_prod_t_prev -> α_(t−δ) + # beta_prod_t -> (1 - α_t) + # beta_prod_t_prev -> (1 - α_(t−δ)) + # sample -> x_t + # model_output -> e_θ(x_t, t) + # prev_sample -> x_(t−δ) + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + if self.config.prediction_type == "v_prediction": + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + elif self.config.prediction_type != "epsilon": + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`" + ) + + # corresponds to (α_(t−δ) - α_t) divided by + # denominator of x_t in formula (9) and plus 1 + # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = + # sqrt(α_(t−δ)) / sqrt(α_t)) + sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) + + # corresponds to denominator of e_θ(x_t, t) in formula (9) + model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + ( + alpha_prod_t * beta_prod_t * alpha_prod_t_prev + ) ** (0.5) + + # full formula (9) + prev_sample = ( + sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff + ) + + return prev_sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py new file mode 100644 index 000000000..3ac3ba5ca --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py @@ -0,0 +1,509 @@ +# Copyright 2024 Zhejiang University Team and The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax +import jax.numpy as jnp + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import ( + CommonSchedulerState, + FlaxKarrasDiffusionSchedulers, + FlaxSchedulerMixin, + FlaxSchedulerOutput, + add_noise_common, +) + + +@flax.struct.dataclass +class PNDMSchedulerState: + common: CommonSchedulerState + final_alpha_cumprod: jnp.ndarray + + # setable values + init_noise_sigma: jnp.ndarray + timesteps: jnp.ndarray + num_inference_steps: Optional[int] = None + prk_timesteps: Optional[jnp.ndarray] = None + plms_timesteps: Optional[jnp.ndarray] = None + + # running values + cur_model_output: Optional[jnp.ndarray] = None + counter: Optional[jnp.int32] = None + cur_sample: Optional[jnp.ndarray] = None + ets: Optional[jnp.ndarray] = None + + @classmethod + def create( + cls, + common: CommonSchedulerState, + final_alpha_cumprod: jnp.ndarray, + init_noise_sigma: jnp.ndarray, + timesteps: jnp.ndarray, + ): + return cls( + common=common, + final_alpha_cumprod=final_alpha_cumprod, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + +@dataclass +class FlaxPNDMSchedulerOutput(FlaxSchedulerOutput): + state: PNDMSchedulerState + + +class FlaxPNDMScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + Pseudo numerical methods for diffusion models (PNDM) proposes using more advanced ODE integration techniques, + namely Runge-Kutta method and a linear multi-step method. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2202.09778 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`jnp.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + skip_prk_steps (`bool`): + allows the scheduler to skip the Runge-Kutta steps that are defined in the original paper as being required + before plms steps; defaults to `False`. + set_alpha_to_one (`bool`, default `False`): + each diffusion step uses the value of alphas product at that step and at the previous one. For the final + step there is no previous alpha. 
When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the value of alpha at step 0. + steps_offset (`int`, default `0`): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + the `dtype` used for params and computation. + """ + + _compatibles = [e.name for e in FlaxKarrasDiffusionSchedulers] + + dtype: jnp.dtype + pndm_order: int + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[jnp.ndarray] = None, + skip_prk_steps: bool = False, + set_alpha_to_one: bool = False, + steps_offset: int = 0, + prediction_type: str = "epsilon", + dtype: jnp.dtype = jnp.float32, + ): + self.dtype = dtype + + # For now we only support F-PNDM, i.e. the runge-kutta method + # For more information on the algorithm please take a look at the paper: https://arxiv.org/pdf/2202.09778.pdf + # mainly at formula (9), (12), (13) and the Algorithm 2. + self.pndm_order = 4 + + def create_state(self, common: Optional[CommonSchedulerState] = None) -> PNDMSchedulerState: + if common is None: + common = CommonSchedulerState.create(self) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + final_alpha_cumprod = ( + jnp.array(1.0, dtype=self.dtype) if self.config.set_alpha_to_one else common.alphas_cumprod[0] + ) + + # standard deviation of the initial noise distribution + init_noise_sigma = jnp.array(1.0, dtype=self.dtype) + + timesteps = jnp.arange(0, self.config.num_train_timesteps).round()[::-1] + + return PNDMSchedulerState.create( + common=common, + final_alpha_cumprod=final_alpha_cumprod, + init_noise_sigma=init_noise_sigma, + timesteps=timesteps, + ) + + def set_timesteps(self, state: PNDMSchedulerState, num_inference_steps: int, shape: Tuple) -> PNDMSchedulerState: + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`PNDMSchedulerState`): + the `FlaxPNDMScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + shape (`Tuple`): + the shape of the samples to be generated. + """ + + step_ratio = self.config.num_train_timesteps // num_inference_steps + # creates integer timesteps by multiplying by ratio + # rounding to avoid issues when num_inference_step is power of 3 + _timesteps = (jnp.arange(0, num_inference_steps) * step_ratio).round() + self.config.steps_offset + + if self.config.skip_prk_steps: + # for some models like stable diffusion the prk steps can/should be skipped to + # produce better results. 
When using PNDM with `self.config.skip_prk_steps` the implementation + # is based on crowsonkb's PLMS sampler implementation: https://github.com/CompVis/latent-diffusion/pull/51 + + prk_timesteps = jnp.array([], dtype=jnp.int32) + plms_timesteps = jnp.concatenate([_timesteps[:-1], _timesteps[-2:-1], _timesteps[-1:]])[::-1] + + else: + prk_timesteps = _timesteps[-self.pndm_order :].repeat(2) + jnp.tile( + jnp.array([0, self.config.num_train_timesteps // num_inference_steps // 2], dtype=jnp.int32), + self.pndm_order, + ) + + prk_timesteps = (prk_timesteps[:-1].repeat(2)[1:-1])[::-1] + plms_timesteps = _timesteps[:-3][::-1] + + timesteps = jnp.concatenate([prk_timesteps, plms_timesteps]) + + # initial running values + + cur_model_output = jnp.zeros(shape, dtype=self.dtype) + counter = jnp.int32(0) + cur_sample = jnp.zeros(shape, dtype=self.dtype) + ets = jnp.zeros((4,) + shape, dtype=self.dtype) + + return state.replace( + timesteps=timesteps, + num_inference_steps=num_inference_steps, + prk_timesteps=prk_timesteps, + plms_timesteps=plms_timesteps, + cur_model_output=cur_model_output, + counter=counter, + cur_sample=cur_sample, + ets=ets, + ) + + def scale_model_input( + self, state: PNDMSchedulerState, sample: jnp.ndarray, timestep: Optional[int] = None + ) -> jnp.ndarray: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance. + sample (`jnp.ndarray`): input sample + timestep (`int`, optional): current timestep + + Returns: + `jnp.ndarray`: scaled input sample + """ + return sample + + def step( + self, + state: PNDMSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + return_dict: bool = True, + ) -> Union[FlaxPNDMSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + This function calls `step_prk()` or `step_plms()` depending on the internal variable `counter`. + + Args: + state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class + + Returns: + [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. 
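+
+        Example (an illustrative sketch added for clarity, not part of the upstream docstring;
+        `unet_apply` and `latents` are placeholders for a user-supplied denoising model call and
+        latent array):
+
+            scheduler = FlaxPNDMScheduler(skip_prk_steps=True)
+            state = scheduler.create_state()
+            state = scheduler.set_timesteps(state, num_inference_steps=50, shape=latents.shape)
+            for t in state.timesteps:
+                noise_pred = unet_apply(latents, t)  # placeholder model call
+                latents, state = scheduler.step(state, noise_pred, t, latents, return_dict=False)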
+ + """ + + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.config.skip_prk_steps: + prev_sample, state = self.step_plms(state, model_output, timestep, sample) + else: + prk_prev_sample, prk_state = self.step_prk(state, model_output, timestep, sample) + plms_prev_sample, plms_state = self.step_plms(state, model_output, timestep, sample) + + cond = state.counter < len(state.prk_timesteps) + + prev_sample = jax.lax.select(cond, prk_prev_sample, plms_prev_sample) + + state = state.replace( + cur_model_output=jax.lax.select(cond, prk_state.cur_model_output, plms_state.cur_model_output), + ets=jax.lax.select(cond, prk_state.ets, plms_state.ets), + cur_sample=jax.lax.select(cond, prk_state.cur_sample, plms_state.cur_sample), + counter=jax.lax.select(cond, prk_state.counter, plms_state.counter), + ) + + if not return_dict: + return (prev_sample, state) + + return FlaxPNDMSchedulerOutput(prev_sample=prev_sample, state=state) + + def step_prk( + self, + state: PNDMSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + ) -> Union[FlaxPNDMSchedulerOutput, Tuple]: + """ + Step function propagating the sample with the Runge-Kutta method. RK takes 4 forward passes to approximate the + solution to the differential equation. + + Args: + state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class + + Returns: + [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. 
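+
+        Note (added for clarity): the four consecutive calls sharing the same `state.counter // 4`
+        form one Runge-Kutta step. `state.counter % 4` selects the stage, the running combination of
+        model outputs is accumulated in `state.cur_model_output`, and `state.cur_sample` keeps the
+        sample seen at the first stage.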
+ + """ + + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + diff_to_prev = jnp.where( + state.counter % 2, 0, self.config.num_train_timesteps // state.num_inference_steps // 2 + ) + prev_timestep = timestep - diff_to_prev + timestep = state.prk_timesteps[state.counter // 4 * 4] + + model_output = jax.lax.select( + (state.counter % 4) != 3, + model_output, # remainder 0, 1, 2 + state.cur_model_output + 1 / 6 * model_output, # remainder 3 + ) + + state = state.replace( + cur_model_output=jax.lax.select_n( + state.counter % 4, + state.cur_model_output + 1 / 6 * model_output, # remainder 0 + state.cur_model_output + 1 / 3 * model_output, # remainder 1 + state.cur_model_output + 1 / 3 * model_output, # remainder 2 + jnp.zeros_like(state.cur_model_output), # remainder 3 + ), + ets=jax.lax.select( + (state.counter % 4) == 0, + state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output), # remainder 0 + state.ets, # remainder 1, 2, 3 + ), + cur_sample=jax.lax.select( + (state.counter % 4) == 0, + sample, # remainder 0 + state.cur_sample, # remainder 1, 2, 3 + ), + ) + + cur_sample = state.cur_sample + prev_sample = self._get_prev_sample(state, cur_sample, timestep, prev_timestep, model_output) + state = state.replace(counter=state.counter + 1) + + return (prev_sample, state) + + def step_plms( + self, + state: PNDMSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + ) -> Union[FlaxPNDMSchedulerOutput, Tuple]: + """ + Step function propagating the sample with the linear multi-step method. This has one forward pass with multiple + times to approximate the solution. + + Args: + state (`PNDMSchedulerState`): the `FlaxPNDMScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + return_dict (`bool`): option for returning tuple rather than FlaxPNDMSchedulerOutput class + + Returns: + [`FlaxPNDMSchedulerOutput`] or `tuple`: [`FlaxPNDMSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. 
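+
+        Note (added for clarity): when the scheduler is configured with `skip_prk_steps=True` (as the
+        comment in `set_timesteps` suggests for models like Stable Diffusion), `step()` dispatches
+        every call directly to this method, so no Runge-Kutta warm-up is performed.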
+ + """ + + if state.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # NOTE: There is no way to check in the jitted runtime if the prk mode was ran before + + prev_timestep = timestep - self.config.num_train_timesteps // state.num_inference_steps + prev_timestep = jnp.where(prev_timestep > 0, prev_timestep, 0) + + # Reference: + # if state.counter != 1: + # state.ets.append(model_output) + # else: + # prev_timestep = timestep + # timestep = timestep + self.config.num_train_timesteps // state.num_inference_steps + + prev_timestep = jnp.where(state.counter == 1, timestep, prev_timestep) + timestep = jnp.where( + state.counter == 1, timestep + self.config.num_train_timesteps // state.num_inference_steps, timestep + ) + + # Reference: + # if len(state.ets) == 1 and state.counter == 0: + # model_output = model_output + # state.cur_sample = sample + # elif len(state.ets) == 1 and state.counter == 1: + # model_output = (model_output + state.ets[-1]) / 2 + # sample = state.cur_sample + # state.cur_sample = None + # elif len(state.ets) == 2: + # model_output = (3 * state.ets[-1] - state.ets[-2]) / 2 + # elif len(state.ets) == 3: + # model_output = (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12 + # else: + # model_output = (1 / 24) * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4]) + + state = state.replace( + ets=jax.lax.select( + state.counter != 1, + state.ets.at[0:3].set(state.ets[1:4]).at[3].set(model_output), # counter != 1 + state.ets, # counter 1 + ), + cur_sample=jax.lax.select( + state.counter != 1, + sample, # counter != 1 + state.cur_sample, # counter 1 + ), + ) + + state = state.replace( + cur_model_output=jax.lax.select_n( + jnp.clip(state.counter, 0, 4), + model_output, # counter 0 + (model_output + state.ets[-1]) / 2, # counter 1 + (3 * state.ets[-1] - state.ets[-2]) / 2, # counter 2 + (23 * state.ets[-1] - 16 * state.ets[-2] + 5 * state.ets[-3]) / 12, # counter 3 + (1 / 24) + * (55 * state.ets[-1] - 59 * state.ets[-2] + 37 * state.ets[-3] - 9 * state.ets[-4]), # counter >= 4 + ), + ) + + sample = state.cur_sample + model_output = state.cur_model_output + prev_sample = self._get_prev_sample(state, sample, timestep, prev_timestep, model_output) + state = state.replace(counter=state.counter + 1) + + return (prev_sample, state) + + def _get_prev_sample(self, state: PNDMSchedulerState, sample, timestep, prev_timestep, model_output): + # See formula (9) of PNDM paper https://arxiv.org/pdf/2202.09778.pdf + # this function computes x_(t−δ) using the formula of (9) + # Note that x_t needs to be added to both sides of the equation + + # Notation ( -> + # alpha_prod_t -> α_t + # alpha_prod_t_prev -> α_(t−δ) + # beta_prod_t -> (1 - α_t) + # beta_prod_t_prev -> (1 - α_(t−δ)) + # sample -> x_t + # model_output -> e_θ(x_t, t) + # prev_sample -> x_(t−δ) + alpha_prod_t = state.common.alphas_cumprod[timestep] + alpha_prod_t_prev = jnp.where( + prev_timestep >= 0, state.common.alphas_cumprod[prev_timestep], state.final_alpha_cumprod + ) + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + if self.config.prediction_type == "v_prediction": + model_output = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + elif self.config.prediction_type != "epsilon": + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `v_prediction`" + ) + + # corresponds to 
(α_(t−δ) - α_t) divided by + # denominator of x_t in formula (9) and plus 1 + # Note: (α_(t−δ) - α_t) / (sqrt(α_t) * (sqrt(α_(t−δ)) + sqr(α_t))) = + # sqrt(α_(t−δ)) / sqrt(α_t)) + sample_coeff = (alpha_prod_t_prev / alpha_prod_t) ** (0.5) + + # corresponds to denominator of e_θ(x_t, t) in formula (9) + model_output_denom_coeff = alpha_prod_t * beta_prod_t_prev ** (0.5) + ( + alpha_prod_t * beta_prod_t * alpha_prod_t_prev + ) ** (0.5) + + # full formula (9) + prev_sample = ( + sample_coeff * sample - (alpha_prod_t_prev - alpha_prod_t) * model_output / model_output_denom_coeff + ) + + return prev_sample + + def add_noise( + self, + state: PNDMSchedulerState, + original_samples: jnp.ndarray, + noise: jnp.ndarray, + timesteps: jnp.ndarray, + ) -> jnp.ndarray: + return add_noise_common(state.common, original_samples, noise, timesteps) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_repaint.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_repaint.py new file mode 100644 index 000000000..0bff07c4f --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_repaint.py @@ -0,0 +1,361 @@ +# Copyright 2024 ETH Zurich Computer Vision Lab and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +@dataclass +class RePaintSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from + the current timestep. `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: torch.FloatTensor + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. 
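+
+    For example (an illustrative note, not part of the upstream docstring): `betas_for_alpha_bar(1000)`
+    returns the 1000-entry cosine ("squaredcos_cap_v2" / Glide) beta schedule that `RePaintScheduler`
+    below uses when `beta_schedule="squaredcos_cap_v2"`.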
+ + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class RePaintScheduler(SchedulerMixin, ConfigMixin): + """ + `RePaintScheduler` is a scheduler for DDPM inpainting inside a given mask. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`. + eta (`float`): + The weight of noise for added noise in diffusion step. If its value is between 0.0 and 1.0 it corresponds + to the DDIM scheduler, and if its value is between -0.0 and 1.0 it corresponds to the DDPM scheduler. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample between -1 and 1 for numerical stability. + + """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + eta: float = 0.0, + trained_betas: Optional[np.ndarray] = None, + clip_sample: bool = True, + ): + if trained_betas is not None: + self.betas = torch.from_numpy(trained_betas) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
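+            # (a "sqrt-linear" schedule: sqrt(beta) is spaced linearly between sqrt(beta_start)
+            # and sqrt(beta_end) and then squared)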
+ self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + elif beta_schedule == "sigmoid": + # GeoDiff sigmoid schedule + betas = torch.linspace(-6, 6, num_train_timesteps) + self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + self.one = torch.tensor(1.0) + + self.final_alpha_cumprod = torch.tensor(1.0) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + + self.eta = eta + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def set_timesteps( + self, + num_inference_steps: int, + jump_length: int = 10, + jump_n_sample: int = 10, + device: Union[str, torch.device] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + jump_length (`int`, defaults to 10): + The number of steps taken forward in time before going backward in time for a single jump (“j” in + RePaint paper). Take a look at Figure 9 and 10 in the paper. + jump_n_sample (`int`, defaults to 10): + The number of times to make a forward time jump for a given chosen time sample. Take a look at Figure 9 + and 10 in the paper. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
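+
+        Example (an illustrative sketch added for clarity):
+
+            scheduler = RePaintScheduler()
+            scheduler.set_timesteps(num_inference_steps=250, jump_length=10, jump_n_sample=10)
+            # scheduler.timesteps now walks backwards in time with periodic forward jumps,
+            # i.e. the resampling schedule from the RePaint paper.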
+ + """ + num_inference_steps = min(self.config.num_train_timesteps, num_inference_steps) + self.num_inference_steps = num_inference_steps + + timesteps = [] + + jumps = {} + for j in range(0, num_inference_steps - jump_length, jump_length): + jumps[j] = jump_n_sample - 1 + + t = num_inference_steps + while t >= 1: + t = t - 1 + timesteps.append(t) + + if jumps.get(t, 0) > 0: + jumps[t] = jumps[t] - 1 + for _ in range(jump_length): + t = t + 1 + timesteps.append(t) + + timesteps = np.array(timesteps) * (self.config.num_train_timesteps // self.num_inference_steps) + self.timesteps = torch.from_numpy(timesteps).to(device) + + def _get_variance(self, t): + prev_timestep = t - self.config.num_train_timesteps // self.num_inference_steps + + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from + # https://arxiv.org/pdf/2006.11239.pdf) and sample from it to get + # previous sample x_{t-1} ~ N(pred_prev_sample, variance) == add + # variance to pred_sample + # Is equivalent to formula (16) in https://arxiv.org/pdf/2010.02502.pdf + # without eta. + # variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t] + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + original_image: torch.FloatTensor, + mask: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[RePaintSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + original_image (`torch.FloatTensor`): + The original image to inpaint on. + mask (`torch.FloatTensor`): + The mask where a value of 0.0 indicates which part of the original image to inpaint. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_repaint.RePaintSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_repaint.RePaintSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_repaint.RePaintSchedulerOutput`] is returned, + otherwise a tuple is returned where the first element is the sample tensor. + + """ + t = timestep + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 1. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5 + + # 3. 
Clip "predicted x_0" + if self.config.clip_sample: + pred_original_sample = torch.clamp(pred_original_sample, -1, 1) + + # We choose to follow RePaint Algorithm 1 to get x_{t-1}, however we + # substitute formula (7) in the algorithm coming from DDPM paper + # (formula (4) Algorithm 2 - Sampling) with formula (12) from DDIM paper. + # DDIM schedule gives the same results as DDPM with eta = 1.0 + # Noise is being reused in 7. and 8., but no impact on quality has + # been observed. + + # 5. Add noise + device = model_output.device + noise = randn_tensor(model_output.shape, generator=generator, device=device, dtype=model_output.dtype) + std_dev_t = self.eta * self._get_variance(timestep) ** 0.5 + + variance = 0 + if t > 0 and self.eta > 0: + variance = std_dev_t * noise + + # 6. compute "direction pointing to x_t" of formula (12) + # from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** 0.5 * model_output + + # 7. compute x_{t-1} of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_unknown_part = alpha_prod_t_prev**0.5 * pred_original_sample + pred_sample_direction + variance + + # 8. Algorithm 1 Line 5 https://arxiv.org/pdf/2201.09865.pdf + prev_known_part = (alpha_prod_t_prev**0.5) * original_image + ((1 - alpha_prod_t_prev) ** 0.5) * noise + + # 9. Algorithm 1 Line 8 https://arxiv.org/pdf/2201.09865.pdf + pred_prev_sample = mask * prev_known_part + (1.0 - mask) * prev_unknown_part + + if not return_dict: + return ( + pred_prev_sample, + pred_original_sample, + ) + + return RePaintSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + def undo_step(self, sample, timestep, generator=None): + n = self.config.num_train_timesteps // self.num_inference_steps + + for i in range(n): + beta = self.betas[timestep + i] + if sample.device.type == "mps": + # randn does not work reproducibly on mps + noise = randn_tensor(sample.shape, dtype=sample.dtype, generator=generator) + noise = noise.to(sample.device) + else: + noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) + + # 10. Algorithm 1 Line 10 https://arxiv.org/pdf/2201.09865.pdf + sample = (1 - beta) ** 0.5 * sample + beta**0.5 * noise + + return sample + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + raise NotImplementedError("Use `DDPMScheduler.add_noise()` to train for sampling with RePaint.") + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sasolver.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sasolver.py new file mode 100644 index 000000000..b46f6de8a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sasolver.py @@ -0,0 +1,1124 @@ +# Copyright 2024 Shuchen Xue, etc. in University of Chinese Academy of Sciences Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: check https://arxiv.org/abs/2309.05019 +# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py + +import math +from typing import Callable, List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import deprecate +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class SASolverScheduler(SchedulerMixin, ConfigMixin): + """ + `SASolverScheduler` is a fast dedicated high-order solver for diffusion SDEs. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + predictor_order (`int`, defaults to 2): + The predictor order which can be `1` or `2` or `3` or '4'. 
It is recommended to use `predictor_order=2` for guided + sampling, and `predictor_order=3` for unconditional sampling. + corrector_order (`int`, defaults to 2): + The corrector order which can be `1` or `2` or `3` or '4'. It is recommended to use `corrector_order=2` for guided + sampling, and `corrector_order=3` for unconditional sampling. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + tau_func (`Callable`, *optional*): + Stochasticity during the sampling. Default in init is `lambda t: 1 if t >= 200 and t <= 800 else 0`. SA-Solver + will sample from vanilla diffusion ODE if tau_func is set to `lambda t: 0`. SA-Solver will sample from vanilla + diffusion SDE if tau_func is set to `lambda t: 1`. For more details, please check https://arxiv.org/abs/2309.05019 + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and + `algorithm_type="dpmsolver++"`. + algorithm_type (`str`, defaults to `data_prediction`): + Algorithm type for the solver; can be `data_prediction` or `noise_prediction`. It is recommended to use `data_prediction` + with `solver_order=2` for guided sampling like in Stable Diffusion. + lower_order_final (`bool`, defaults to `True`): + Whether to use lower-order solvers in the final steps. Default = True. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + lambda_min_clipped (`float`, defaults to `-inf`): + Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the + cosine (`squaredcos_cap_v2`) noise schedule. + variance_type (`str`, *optional*): + Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output + contains the predicted Gaussian variance. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
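+
+    Example (an illustrative sketch added for clarity, not part of the upstream docstring):
+
+        # sample from the underlying diffusion ODE: no noise is injected at any step
+        scheduler = SASolverScheduler(predictor_order=2, corrector_order=2, tau_func=lambda t: 0)
+
+        # leaving tau_func=None keeps the default lambda t: 1 if 200 <= t <= 800 else 0,
+        # which injects noise only in the middle of the trajectory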
+ """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + predictor_order: int = 2, + corrector_order: int = 2, + prediction_type: str = "epsilon", + tau_func: Optional[Callable] = None, + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + algorithm_type: str = "data_prediction", + lower_order_final: bool = True, + use_karras_sigmas: Optional[bool] = False, + lambda_min_clipped: float = -float("inf"), + variance_type: Optional[str] = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = ( + torch.linspace( + beta_start**0.5, + beta_end**0.5, + num_train_timesteps, + dtype=torch.float32, + ) + ** 2 + ) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + # Currently we only support VP-type noise schedule + self.alpha_t = torch.sqrt(self.alphas_cumprod) + self.sigma_t = torch.sqrt(1 - self.alphas_cumprod) + self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t) + self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5 + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + if algorithm_type not in ["data_prediction", "noise_prediction"]: + raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}") + + # setable values + self.num_inference_steps = None + timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps) + self.timestep_list = [None] * max(predictor_order, corrector_order - 1) + self.model_outputs = [None] * max(predictor_order, corrector_order - 1) + + if tau_func is None: + self.tau_func = lambda t: 1 if t >= 200 and t <= 800 else 0 + else: + self.tau_func = tau_func + self.predict_x0 = algorithm_type == "data_prediction" + self.lower_order_nums = 0 + self.last_sample = None + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + @property + def step_index(self): + """ + The index counter for current timestep. It will increae 1 after each scheduler step. + """ + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. + """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. 
+ + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + """ + # Clipping the minimum of all lambda(t) for numerical stability. + # This is critical for cosine (squaredcos_cap_v2) noise schedule. + clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) + last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item() + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64) + ) + + elif self.config.timestep_spacing == "leading": + step_ratio = last_timestep // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: + log_sigmas = np.log(sigmas) + sigmas = np.flip(sigmas).copy() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32) + else: + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + self.model_outputs = [ + None, + ] * max(self.config.predictor_order, self.config.corrector_order - 1) + self.lower_order_nums = 0 + self.last_sample = None + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = 1 / ((sigma**2 + 1) ** 0.5) + sigma_t = sigma * alpha_t + + return alpha_t, sigma_t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def convert_model_output( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + """ + Convert the model output to the corresponding type the data_prediction/noise_prediction algorithm needs. Noise_prediction is + designed to discretize an integral of the noise prediction model, and data_prediction is designed to discretize an + integral of the data prediction model. + + + + The algorithm and model type are decoupled. 
You can use either data_prediction or noise_prediction for both noise + prediction and data prediction models. + + + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + + Returns: + `torch.FloatTensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + # SA-Solver_data_prediction needs to solve an integral of the data prediction model. + if self.config.algorithm_type in ["data_prediction"]: + if self.config.prediction_type == "epsilon": + # SA-Solver only needs the "mean" output. + if self.config.variance_type in ["learned", "learned_range"]: + model_output = model_output[:, :3] + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the SASolverScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + + # SA-Solver_noise_prediction needs to solve an integral of the noise prediction model. + elif self.config.algorithm_type in ["noise_prediction"]: + if self.config.prediction_type == "epsilon": + # SA-Solver only needs the "mean" output. + if self.config.variance_type in ["learned", "learned_range"]: + epsilon = model_output[:, :3] + else: + epsilon = model_output + elif self.config.prediction_type == "sample": + epsilon = (sample - alpha_t * model_output) / sigma_t + elif self.config.prediction_type == "v_prediction": + epsilon = alpha_t * model_output + sigma_t * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the SASolverScheduler." 
+ ) + + if self.config.thresholding: + alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep] + x0_pred = (sample - sigma_t * epsilon) / alpha_t + x0_pred = self._threshold_sample(x0_pred) + epsilon = (sample - alpha_t * x0_pred) / sigma_t + + return epsilon + + def get_coefficients_exponential_negative(self, order, interval_start, interval_end): + """ + Calculate the integral of exp(-x) * x^order dx from interval_start to interval_end + """ + assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3" + + if order == 0: + return torch.exp(-interval_end) * (torch.exp(interval_end - interval_start) - 1) + elif order == 1: + return torch.exp(-interval_end) * ( + (interval_start + 1) * torch.exp(interval_end - interval_start) - (interval_end + 1) + ) + elif order == 2: + return torch.exp(-interval_end) * ( + (interval_start**2 + 2 * interval_start + 2) * torch.exp(interval_end - interval_start) + - (interval_end**2 + 2 * interval_end + 2) + ) + elif order == 3: + return torch.exp(-interval_end) * ( + (interval_start**3 + 3 * interval_start**2 + 6 * interval_start + 6) + * torch.exp(interval_end - interval_start) + - (interval_end**3 + 3 * interval_end**2 + 6 * interval_end + 6) + ) + + def get_coefficients_exponential_positive(self, order, interval_start, interval_end, tau): + """ + Calculate the integral of exp(x(1+tau^2)) * x^order dx from interval_start to interval_end + """ + assert order in [0, 1, 2, 3], "order is only supported for 0, 1, 2 and 3" + + # after change of variable(cov) + interval_end_cov = (1 + tau**2) * interval_end + interval_start_cov = (1 + tau**2) * interval_start + + if order == 0: + return ( + torch.exp(interval_end_cov) * (1 - torch.exp(-(interval_end_cov - interval_start_cov))) / (1 + tau**2) + ) + elif order == 1: + return ( + torch.exp(interval_end_cov) + * ( + (interval_end_cov - 1) + - (interval_start_cov - 1) * torch.exp(-(interval_end_cov - interval_start_cov)) + ) + / ((1 + tau**2) ** 2) + ) + elif order == 2: + return ( + torch.exp(interval_end_cov) + * ( + (interval_end_cov**2 - 2 * interval_end_cov + 2) + - (interval_start_cov**2 - 2 * interval_start_cov + 2) + * torch.exp(-(interval_end_cov - interval_start_cov)) + ) + / ((1 + tau**2) ** 3) + ) + elif order == 3: + return ( + torch.exp(interval_end_cov) + * ( + (interval_end_cov**3 - 3 * interval_end_cov**2 + 6 * interval_end_cov - 6) + - (interval_start_cov**3 - 3 * interval_start_cov**2 + 6 * interval_start_cov - 6) + * torch.exp(-(interval_end_cov - interval_start_cov)) + ) + / ((1 + tau**2) ** 4) + ) + + def lagrange_polynomial_coefficient(self, order, lambda_list): + """ + Calculate the coefficient of lagrange polynomial + """ + + assert order in [0, 1, 2, 3] + assert order == len(lambda_list) - 1 + if order == 0: + return [[1]] + elif order == 1: + return [ + [ + 1 / (lambda_list[0] - lambda_list[1]), + -lambda_list[1] / (lambda_list[0] - lambda_list[1]), + ], + [ + 1 / (lambda_list[1] - lambda_list[0]), + -lambda_list[0] / (lambda_list[1] - lambda_list[0]), + ], + ] + elif order == 2: + denominator1 = (lambda_list[0] - lambda_list[1]) * (lambda_list[0] - lambda_list[2]) + denominator2 = (lambda_list[1] - lambda_list[0]) * (lambda_list[1] - lambda_list[2]) + denominator3 = (lambda_list[2] - lambda_list[0]) * (lambda_list[2] - lambda_list[1]) + return [ + [ + 1 / denominator1, + (-lambda_list[1] - lambda_list[2]) / denominator1, + lambda_list[1] * lambda_list[2] / denominator1, + ], + [ + 1 / denominator2, + (-lambda_list[0] - lambda_list[2]) / denominator2, + lambda_list[0] 
* lambda_list[2] / denominator2, + ], + [ + 1 / denominator3, + (-lambda_list[0] - lambda_list[1]) / denominator3, + lambda_list[0] * lambda_list[1] / denominator3, + ], + ] + elif order == 3: + denominator1 = ( + (lambda_list[0] - lambda_list[1]) + * (lambda_list[0] - lambda_list[2]) + * (lambda_list[0] - lambda_list[3]) + ) + denominator2 = ( + (lambda_list[1] - lambda_list[0]) + * (lambda_list[1] - lambda_list[2]) + * (lambda_list[1] - lambda_list[3]) + ) + denominator3 = ( + (lambda_list[2] - lambda_list[0]) + * (lambda_list[2] - lambda_list[1]) + * (lambda_list[2] - lambda_list[3]) + ) + denominator4 = ( + (lambda_list[3] - lambda_list[0]) + * (lambda_list[3] - lambda_list[1]) + * (lambda_list[3] - lambda_list[2]) + ) + return [ + [ + 1 / denominator1, + (-lambda_list[1] - lambda_list[2] - lambda_list[3]) / denominator1, + ( + lambda_list[1] * lambda_list[2] + + lambda_list[1] * lambda_list[3] + + lambda_list[2] * lambda_list[3] + ) + / denominator1, + (-lambda_list[1] * lambda_list[2] * lambda_list[3]) / denominator1, + ], + [ + 1 / denominator2, + (-lambda_list[0] - lambda_list[2] - lambda_list[3]) / denominator2, + ( + lambda_list[0] * lambda_list[2] + + lambda_list[0] * lambda_list[3] + + lambda_list[2] * lambda_list[3] + ) + / denominator2, + (-lambda_list[0] * lambda_list[2] * lambda_list[3]) / denominator2, + ], + [ + 1 / denominator3, + (-lambda_list[0] - lambda_list[1] - lambda_list[3]) / denominator3, + ( + lambda_list[0] * lambda_list[1] + + lambda_list[0] * lambda_list[3] + + lambda_list[1] * lambda_list[3] + ) + / denominator3, + (-lambda_list[0] * lambda_list[1] * lambda_list[3]) / denominator3, + ], + [ + 1 / denominator4, + (-lambda_list[0] - lambda_list[1] - lambda_list[2]) / denominator4, + ( + lambda_list[0] * lambda_list[1] + + lambda_list[0] * lambda_list[2] + + lambda_list[1] * lambda_list[2] + ) + / denominator4, + (-lambda_list[0] * lambda_list[1] * lambda_list[2]) / denominator4, + ], + ] + + def get_coefficients_fn(self, order, interval_start, interval_end, lambda_list, tau): + assert order in [1, 2, 3, 4] + assert order == len(lambda_list), "the length of lambda list must be equal to the order" + coefficients = [] + lagrange_coefficient = self.lagrange_polynomial_coefficient(order - 1, lambda_list) + for i in range(order): + coefficient = 0 + for j in range(order): + if self.predict_x0: + coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_positive( + order - 1 - j, interval_start, interval_end, tau + ) + else: + coefficient += lagrange_coefficient[i][j] * self.get_coefficients_exponential_negative( + order - 1 - j, interval_start, interval_end + ) + coefficients.append(coefficient) + assert len(coefficients) == order, "the length of coefficients does not match the order" + return coefficients + + def stochastic_adams_bashforth_update( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor, + noise: torch.FloatTensor, + order: int, + tau: torch.FloatTensor, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the SA-Predictor. + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model at the current timestep. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + order (`int`): + The order of SA-Predictor at this timestep. + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
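+
+        Note (added for clarity): the update reads the last `order` stored model outputs
+        (`self.model_outputs`, most recent last) and the sigmas around `self.step_index`; `tau`
+        scales the stochastic part of the update, so `tau = 0` reduces it to a deterministic
+        multistep (Adams-Bashforth style) update.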
+ """ + prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if noise is None: + if len(args) > 2: + noise = args[2] + else: + raise ValueError(" missing `noise` as a required keyward argument") + if order is None: + if len(args) > 3: + order = args[3] + else: + raise ValueError(" missing `order` as a required keyward argument") + if tau is None: + if len(args) > 4: + tau = args[4] + else: + raise ValueError(" missing `tau` as a required keyward argument") + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + model_output_list = self.model_outputs + sigma_t, sigma_s0 = ( + self.sigmas[self.step_index + 1], + self.sigmas[self.step_index], + ) + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + + gradient_part = torch.zeros_like(sample) + h = lambda_t - lambda_s0 + lambda_list = [] + + for i in range(order): + si = self.step_index - i + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si]) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + lambda_list.append(lambda_si) + + gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau) + + x = sample + + if self.predict_x0: + if ( + order == 2 + ): ## if order = 2 we do a modification that does not influence the convergence order similar to unipc. Note: This is used only for few steps sampling. + # The added term is O(h^3). Empirically we find it will slightly improve the image quality. 
+ # ODE case + # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2])) + # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h ** 2 / 2 - (h - 1 + torch.exp(-h))) / (ns.marginal_lambda(t_prev_list[-1]) - ns.marginal_lambda(t_prev_list[-2])) + temp_sigma = self.sigmas[self.step_index - 1] + temp_alpha_s, temp_sigma_s = self._sigma_to_alpha_sigma_t(temp_sigma) + temp_lambda_s = torch.log(temp_alpha_s) - torch.log(temp_sigma_s) + gradient_coefficients[0] += ( + 1.0 + * torch.exp((1 + tau**2) * lambda_t) + * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2)) + / (lambda_s0 - temp_lambda_s) + ) + gradient_coefficients[1] -= ( + 1.0 + * torch.exp((1 + tau**2) * lambda_t) + * (h**2 / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2)) + / (lambda_s0 - temp_lambda_s) + ) + + for i in range(order): + if self.predict_x0: + gradient_part += ( + (1 + tau**2) + * sigma_t + * torch.exp(-(tau**2) * lambda_t) + * gradient_coefficients[i] + * model_output_list[-(i + 1)] + ) + else: + gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_output_list[-(i + 1)] + + if self.predict_x0: + noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * noise + else: + noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * noise + + if self.predict_x0: + x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part + else: + x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part + + x_t = x_t.to(x.dtype) + return x_t + + def stochastic_adams_moulton_update( + self, + this_model_output: torch.FloatTensor, + *args, + last_sample: torch.FloatTensor, + last_noise: torch.FloatTensor, + this_sample: torch.FloatTensor, + order: int, + tau: torch.FloatTensor, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the SA-Corrector. + + Args: + this_model_output (`torch.FloatTensor`): + The model outputs at `x_t`. + this_timestep (`int`): + The current timestep `t`. + last_sample (`torch.FloatTensor`): + The generated sample before the last predictor `x_{t-1}`. + this_sample (`torch.FloatTensor`): + The generated sample after the last predictor `x_{t}`. + order (`int`): + The order of SA-Corrector at this step. + + Returns: + `torch.FloatTensor`: + The corrected sample tensor at the current timestep. 
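+
+        Example:
+            A minimal sketch of a single corrector update. It assumes `scheduler` already holds
+            the state written by the preceding predictor step (`last_sample`, `last_noise`,
+            `model_outputs`), and that `converted_output` and `current_sample` are illustrative
+            tensors produced by the surrounding sampling loop.
+
+            >>> import torch
+            >>> corrected = scheduler.stochastic_adams_moulton_update(
+            ...     this_model_output=converted_output,
+            ...     last_sample=scheduler.last_sample,
+            ...     last_noise=scheduler.last_noise,
+            ...     this_sample=current_sample,
+            ...     order=1,
+            ...     tau=torch.tensor(0.0),
+            ... )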
+ """ + + this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) + if last_sample is None: + if len(args) > 1: + last_sample = args[1] + else: + raise ValueError(" missing`last_sample` as a required keyward argument") + if last_noise is None: + if len(args) > 2: + last_noise = args[2] + else: + raise ValueError(" missing`last_noise` as a required keyward argument") + if this_sample is None: + if len(args) > 3: + this_sample = args[3] + else: + raise ValueError(" missing`this_sample` as a required keyward argument") + if order is None: + if len(args) > 4: + order = args[4] + else: + raise ValueError(" missing`order` as a required keyward argument") + if tau is None: + if len(args) > 5: + tau = args[5] + else: + raise ValueError(" missing`tau` as a required keyward argument") + if this_timestep is not None: + deprecate( + "this_timestep", + "1.0.0", + "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + model_output_list = self.model_outputs + sigma_t, sigma_s0 = ( + self.sigmas[self.step_index], + self.sigmas[self.step_index - 1], + ) + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + gradient_part = torch.zeros_like(this_sample) + h = lambda_t - lambda_s0 + lambda_list = [] + for i in range(order): + si = self.step_index - i + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si]) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + lambda_list.append(lambda_si) + + model_prev_list = model_output_list + [this_model_output] + + gradient_coefficients = self.get_coefficients_fn(order, lambda_s0, lambda_t, lambda_list, tau) + + x = last_sample + + if self.predict_x0: + if ( + order == 2 + ): ## if order = 2 we do a modification that does not influence the convergence order similar to UniPC. Note: This is used only for few steps sampling. + # The added term is O(h^3). Empirically we find it will slightly improve the image quality. 
+ # ODE case + # gradient_coefficients[0] += 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h) + # gradient_coefficients[1] -= 1.0 * torch.exp(lambda_t) * (h / 2 - (h - 1 + torch.exp(-h)) / h) + gradient_coefficients[0] += ( + 1.0 + * torch.exp((1 + tau**2) * lambda_t) + * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h)) + ) + gradient_coefficients[1] -= ( + 1.0 + * torch.exp((1 + tau**2) * lambda_t) + * (h / 2 - (h * (1 + tau**2) - 1 + torch.exp((1 + tau**2) * (-h))) / ((1 + tau**2) ** 2 * h)) + ) + + for i in range(order): + if self.predict_x0: + gradient_part += ( + (1 + tau**2) + * sigma_t + * torch.exp(-(tau**2) * lambda_t) + * gradient_coefficients[i] + * model_prev_list[-(i + 1)] + ) + else: + gradient_part += -(1 + tau**2) * alpha_t * gradient_coefficients[i] * model_prev_list[-(i + 1)] + + if self.predict_x0: + noise_part = sigma_t * torch.sqrt(1 - torch.exp(-2 * tau**2 * h)) * last_noise + else: + noise_part = tau * sigma_t * torch.sqrt(torch.exp(2 * h) - 1) * last_noise + + if self.predict_x0: + x_t = torch.exp(-(tau**2) * h) * (sigma_t / sigma_s0) * x + gradient_part + noise_part + else: + x_t = (alpha_t / alpha_s0) * x + gradient_part + noise_part + + x_t = x_t.to(x.dtype) + return x_t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + index_candidates = (schedule_timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the SA-Solver. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. 
+ + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + use_corrector = self.step_index > 0 and self.last_sample is not None + + model_output_convert = self.convert_model_output(model_output, sample=sample) + + if use_corrector: + current_tau = self.tau_func(self.timestep_list[-1]) + sample = self.stochastic_adams_moulton_update( + this_model_output=model_output_convert, + last_sample=self.last_sample, + last_noise=self.last_noise, + this_sample=sample, + order=self.this_corrector_order, + tau=current_tau, + ) + + for i in range(max(self.config.predictor_order, self.config.corrector_order - 1) - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.timestep_list[i] = self.timestep_list[i + 1] + + self.model_outputs[-1] = model_output_convert + self.timestep_list[-1] = timestep + + noise = randn_tensor( + model_output.shape, + generator=generator, + device=model_output.device, + dtype=model_output.dtype, + ) + + if self.config.lower_order_final: + this_predictor_order = min(self.config.predictor_order, len(self.timesteps) - self.step_index) + this_corrector_order = min(self.config.corrector_order, len(self.timesteps) - self.step_index + 1) + else: + this_predictor_order = self.config.predictor_order + this_corrector_order = self.config.corrector_order + + self.this_predictor_order = min(this_predictor_order, self.lower_order_nums + 1) # warmup for multistep + self.this_corrector_order = min(this_corrector_order, self.lower_order_nums + 2) # warmup for multistep + assert self.this_predictor_order > 0 + assert self.this_corrector_order > 0 + + self.last_sample = sample + self.last_noise = noise + + current_tau = self.tau_func(self.timestep_list[-1]) + prev_sample = self.stochastic_adams_bashforth_update( + model_output=model_output_convert, + sample=sample, + noise=noise, + order=self.this_predictor_order, + tau=current_tau, + ) + + if self.lower_order_nums < max(self.config.predictor_order, self.config.corrector_order - 1): + self.lower_order_nums += 1 + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. 
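+
+        Example:
+            This scheduler performs no input scaling, so the sample is returned unchanged
+            (a small illustrative check; `scheduler` is assumed to be an initialized instance):
+
+            >>> import torch
+            >>> latents = torch.randn(1, 4, 64, 64)
+            >>> torch.equal(scheduler.scale_model_input(latents), latents)
+            True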
+ """ + return sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py new file mode 100644 index 000000000..8f8dd1877 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py @@ -0,0 +1,301 @@ +# Copyright 2024 Google Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin, SchedulerOutput + + +@dataclass +class SdeVeOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + prev_sample_mean (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Mean averaged `prev_sample` over previous timesteps. 
+ """ + + prev_sample: torch.FloatTensor + prev_sample_mean: torch.FloatTensor + + +class ScoreSdeVeScheduler(SchedulerMixin, ConfigMixin): + """ + `ScoreSdeVeScheduler` is a variance exploding stochastic differential equation (SDE) scheduler. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + snr (`float`, defaults to 0.15): + A coefficient weighting the step from the `model_output` sample (from the network) to the random noise. + sigma_min (`float`, defaults to 0.01): + The initial noise scale for the sigma sequence in the sampling procedure. The minimum sigma should mirror + the distribution of the data. + sigma_max (`float`, defaults to 1348.0): + The maximum value used for the range of continuous timesteps passed into the model. + sampling_eps (`float`, defaults to 1e-5): + The end value of sampling where timesteps decrease progressively from 1 to epsilon. + correct_steps (`int`, defaults to 1): + The number of correction steps performed on a produced sample. + """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 2000, + snr: float = 0.15, + sigma_min: float = 0.01, + sigma_max: float = 1348.0, + sampling_eps: float = 1e-5, + correct_steps: int = 1, + ): + # standard deviation of the initial noise distribution + self.init_noise_sigma = sigma_max + + # setable values + self.timesteps = None + + self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + def set_timesteps( + self, num_inference_steps: int, sampling_eps: float = None, device: Union[str, torch.device] = None + ): + """ + Sets the continuous timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + sampling_eps (`float`, *optional*): + The final timestep value (overrides value given during scheduler instantiation). + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + + """ + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps + + self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps, device=device) + + def set_sigmas( + self, num_inference_steps: int, sigma_min: float = None, sigma_max: float = None, sampling_eps: float = None + ): + """ + Sets the noise scales used for the diffusion chain (to be run before inference). The sigmas control the weight + of the `drift` and `diffusion` components of the sample update. + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + sigma_min (`float`, optional): + The initial noise scale value (overrides value given during scheduler instantiation). 
+ sigma_max (`float`, optional): + The final noise scale value (overrides value given during scheduler instantiation). + sampling_eps (`float`, optional): + The final timestep value (overrides value given during scheduler instantiation). + + """ + sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min + sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps + if self.timesteps is None: + self.set_timesteps(num_inference_steps, sampling_eps) + + self.sigmas = sigma_min * (sigma_max / sigma_min) ** (self.timesteps / sampling_eps) + self.discrete_sigmas = torch.exp(torch.linspace(math.log(sigma_min), math.log(sigma_max), num_inference_steps)) + self.sigmas = torch.tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps]) + + def get_adjacent_sigma(self, timesteps, t): + return torch.where( + timesteps == 0, + torch.zeros_like(t.to(timesteps.device)), + self.discrete_sigmas[timesteps - 1].to(timesteps.device), + ) + + def step_pred( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[SdeVeOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_sde_ve.SdeVeOutput`] is returned, otherwise a tuple + is returned where the first element is the sample tensor. 
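+
+        Example:
+            A minimal sketch of one predictor step (a corrector pass via `step_correct` would
+            normally run alongside it). `score_model` is an illustrative stand-in for a trained
+            score network mapping `(sample, timestep)` to a score estimate.
+
+            >>> import torch
+            >>> scheduler = ScoreSdeVeScheduler()
+            >>> scheduler.set_timesteps(num_inference_steps=10)
+            >>> scheduler.set_sigmas(num_inference_steps=10)
+            >>> sample = torch.randn(1, 3, 64, 64) * scheduler.init_noise_sigma
+            >>> t = scheduler.timesteps[0]
+            >>> model_output = score_model(sample, t)
+            >>> sample = scheduler.step_pred(model_output, t, sample).prev_sample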
+ + """ + if self.timesteps is None: + raise ValueError( + "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" + ) + + timestep = timestep * torch.ones( + sample.shape[0], device=sample.device + ) # torch.repeat_interleave(timestep, sample.shape[0]) + timesteps = (timestep * (len(self.timesteps) - 1)).long() + + # mps requires indices to be in the same device, so we use cpu as is the default with cuda + timesteps = timesteps.to(self.discrete_sigmas.device) + + sigma = self.discrete_sigmas[timesteps].to(sample.device) + adjacent_sigma = self.get_adjacent_sigma(timesteps, timestep).to(sample.device) + drift = torch.zeros_like(sample) + diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5 + + # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x) + # also equation 47 shows the analog from SDE models to ancestral sampling methods + diffusion = diffusion.flatten() + while len(diffusion.shape) < len(sample.shape): + diffusion = diffusion.unsqueeze(-1) + drift = drift - diffusion**2 * model_output + + # equation 6: sample noise for the diffusion term of + noise = randn_tensor( + sample.shape, layout=sample.layout, generator=generator, device=sample.device, dtype=sample.dtype + ) + prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep + # TODO is the variable diffusion the correct scaling term for the noise? + prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g + + if not return_dict: + return (prev_sample, prev_sample_mean) + + return SdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean) + + def step_correct( + self, + model_output: torch.FloatTensor, + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Correct the predicted sample based on the `model_output` of the network. This is often run repeatedly after + making the prediction for the previous timestep. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_sde_ve.SdeVeOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_sde_ve.SdeVeOutput`] is returned, otherwise a tuple + is returned where the first element is the sample tensor. + + """ + if self.timesteps is None: + raise ValueError( + "`self.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" + ) + + # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. 
of z" + # sample noise for correction + noise = randn_tensor(sample.shape, layout=sample.layout, generator=generator).to(sample.device) + + # compute step size from the model_output, the noise, and the snr + grad_norm = torch.norm(model_output.reshape(model_output.shape[0], -1), dim=-1).mean() + noise_norm = torch.norm(noise.reshape(noise.shape[0], -1), dim=-1).mean() + step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2 + step_size = step_size * torch.ones(sample.shape[0]).to(sample.device) + # self.repeat_scalar(step_size, sample.shape[0]) + + # compute corrected sample: model_output term and noise term + step_size = step_size.flatten() + while len(step_size.shape) < len(sample.shape): + step_size = step_size.unsqueeze(-1) + prev_sample_mean = sample + step_size * model_output + prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + timesteps = timesteps.to(original_samples.device) + sigmas = self.discrete_sigmas.to(original_samples.device)[timesteps] + noise = ( + noise * sigmas[:, None, None, None] + if noise is not None + else torch.randn_like(original_samples) * sigmas[:, None, None, None] + ) + noisy_samples = noise + original_samples + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py new file mode 100644 index 000000000..0a8d45d4a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py @@ -0,0 +1,280 @@ +# Copyright 2024 Google Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import flax +import jax +import jax.numpy as jnp +from jax import random + +from ..configuration_utils import ConfigMixin, register_to_config +from .scheduling_utils_flax import FlaxSchedulerMixin, FlaxSchedulerOutput, broadcast_to_shape_from_left + + +@flax.struct.dataclass +class ScoreSdeVeSchedulerState: + # setable values + timesteps: Optional[jnp.ndarray] = None + discrete_sigmas: Optional[jnp.ndarray] = None + sigmas: Optional[jnp.ndarray] = None + + @classmethod + def create(cls): + return cls() + + +@dataclass +class FlaxSdeVeOutput(FlaxSchedulerOutput): + """ + Output class for the ScoreSdeVeScheduler's step function output. 
+ + Args: + state (`ScoreSdeVeSchedulerState`): + prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + prev_sample_mean (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Mean averaged `prev_sample`. Same as `prev_sample`, only mean-averaged over previous timesteps. + """ + + state: ScoreSdeVeSchedulerState + prev_sample: jnp.ndarray + prev_sample_mean: Optional[jnp.ndarray] = None + + +class FlaxScoreSdeVeScheduler(FlaxSchedulerMixin, ConfigMixin): + """ + The variance exploding stochastic differential equation (SDE) scheduler. + + For more information, see the original paper: https://arxiv.org/abs/2011.13456 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + snr (`float`): + coefficient weighting the step from the model_output sample (from the network) to the random noise. + sigma_min (`float`): + initial noise scale for sigma sequence in sampling procedure. The minimum sigma should mirror the + distribution of the data. + sigma_max (`float`): maximum value used for the range of continuous timesteps passed into the model. + sampling_eps (`float`): the end value of sampling, where timesteps decrease progressively from 1 to + epsilon. + correct_steps (`int`): number of correction steps performed on a produced sample. + """ + + @property + def has_state(self): + return True + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 2000, + snr: float = 0.15, + sigma_min: float = 0.01, + sigma_max: float = 1348.0, + sampling_eps: float = 1e-5, + correct_steps: int = 1, + ): + pass + + def create_state(self): + state = ScoreSdeVeSchedulerState.create() + return self.set_sigmas( + state, + self.config.num_train_timesteps, + self.config.sigma_min, + self.config.sigma_max, + self.config.sampling_eps, + ) + + def set_timesteps( + self, state: ScoreSdeVeSchedulerState, num_inference_steps: int, shape: Tuple = (), sampling_eps: float = None + ) -> ScoreSdeVeSchedulerState: + """ + Sets the continuous timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + sampling_eps (`float`, optional): + final timestep value (overrides value given at Scheduler instantiation). + + """ + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps + + timesteps = jnp.linspace(1, sampling_eps, num_inference_steps) + return state.replace(timesteps=timesteps) + + def set_sigmas( + self, + state: ScoreSdeVeSchedulerState, + num_inference_steps: int, + sigma_min: float = None, + sigma_max: float = None, + sampling_eps: float = None, + ) -> ScoreSdeVeSchedulerState: + """ + Sets the noise scales used for the diffusion chain. Supporting function to be run before inference. 
+ + The sigmas control the weight of the `drift` and `diffusion` components of sample update. + + Args: + state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance. + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + sigma_min (`float`, optional): + initial noise scale value (overrides value given at Scheduler instantiation). + sigma_max (`float`, optional): + final noise scale value (overrides value given at Scheduler instantiation). + sampling_eps (`float`, optional): + final timestep value (overrides value given at Scheduler instantiation). + """ + sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min + sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps + if state.timesteps is None: + state = self.set_timesteps(state, num_inference_steps, sampling_eps) + + discrete_sigmas = jnp.exp(jnp.linspace(jnp.log(sigma_min), jnp.log(sigma_max), num_inference_steps)) + sigmas = jnp.array([sigma_min * (sigma_max / sigma_min) ** t for t in state.timesteps]) + + return state.replace(discrete_sigmas=discrete_sigmas, sigmas=sigmas) + + def get_adjacent_sigma(self, state, timesteps, t): + return jnp.where(timesteps == 0, jnp.zeros_like(t), state.discrete_sigmas[timesteps - 1]) + + def step_pred( + self, + state: ScoreSdeVeSchedulerState, + model_output: jnp.ndarray, + timestep: int, + sample: jnp.ndarray, + key: jax.Array, + return_dict: bool = True, + ) -> Union[FlaxSdeVeOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than FlaxSdeVeOutput class + + Returns: + [`FlaxSdeVeOutput`] or `tuple`: [`FlaxSdeVeOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. 
+ + """ + if state.timesteps is None: + raise ValueError( + "`state.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" + ) + + timestep = timestep * jnp.ones( + sample.shape[0], + ) + timesteps = (timestep * (len(state.timesteps) - 1)).long() + + sigma = state.discrete_sigmas[timesteps] + adjacent_sigma = self.get_adjacent_sigma(state, timesteps, timestep) + drift = jnp.zeros_like(sample) + diffusion = (sigma**2 - adjacent_sigma**2) ** 0.5 + + # equation 6 in the paper: the model_output modeled by the network is grad_x log pt(x) + # also equation 47 shows the analog from SDE models to ancestral sampling methods + diffusion = diffusion.flatten() + diffusion = broadcast_to_shape_from_left(diffusion, sample.shape) + drift = drift - diffusion**2 * model_output + + # equation 6: sample noise for the diffusion term of + key = random.split(key, num=1) + noise = random.normal(key=key, shape=sample.shape) + prev_sample_mean = sample - drift # subtract because `dt` is a small negative timestep + # TODO is the variable diffusion the correct scaling term for the noise? + prev_sample = prev_sample_mean + diffusion * noise # add impact of diffusion field g + + if not return_dict: + return (prev_sample, prev_sample_mean, state) + + return FlaxSdeVeOutput(prev_sample=prev_sample, prev_sample_mean=prev_sample_mean, state=state) + + def step_correct( + self, + state: ScoreSdeVeSchedulerState, + model_output: jnp.ndarray, + sample: jnp.ndarray, + key: jax.Array, + return_dict: bool = True, + ) -> Union[FlaxSdeVeOutput, Tuple]: + """ + Correct the predicted sample based on the output model_output of the network. This is often run repeatedly + after making the prediction for the previous timestep. + + Args: + state (`ScoreSdeVeSchedulerState`): the `FlaxScoreSdeVeScheduler` state data class instance. + model_output (`jnp.ndarray`): direct output from learned diffusion model. + sample (`jnp.ndarray`): + current instance of sample being created by diffusion process. + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than FlaxSdeVeOutput class + + Returns: + [`FlaxSdeVeOutput`] or `tuple`: [`FlaxSdeVeOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + + """ + if state.timesteps is None: + raise ValueError( + "`state.timesteps` is not set, you need to run 'set_timesteps' after creating the scheduler" + ) + + # For small batch sizes, the paper "suggest replacing norm(z) with sqrt(d), where d is the dim. 
of z" + # sample noise for correction + key = random.split(key, num=1) + noise = random.normal(key=key, shape=sample.shape) + + # compute step size from the model_output, the noise, and the snr + grad_norm = jnp.linalg.norm(model_output) + noise_norm = jnp.linalg.norm(noise) + step_size = (self.config.snr * noise_norm / grad_norm) ** 2 * 2 + step_size = step_size * jnp.ones(sample.shape[0]) + + # compute corrected sample: model_output term and noise term + step_size = step_size.flatten() + step_size = broadcast_to_shape_from_left(step_size, sample.shape) + prev_sample_mean = sample + step_size * model_output + prev_sample = prev_sample_mean + ((step_size * 2) ** 0.5) * noise + + if not return_dict: + return (prev_sample, state) + + return FlaxSdeVeOutput(prev_sample=prev_sample, state=state) + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_tcd.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_tcd.py new file mode 100644 index 000000000..7eb01b382 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_tcd.py @@ -0,0 +1,686 @@ +# Copyright 2024 Stanford University Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..schedulers.scheduling_utils import SchedulerMixin +from ..utils import BaseOutput, logging +from ..utils.torch_utils import randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class TCDSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_noised_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted noised sample `(x_{s})` based on the model output from the current timestep. + """ + + prev_sample: torch.FloatTensor + pred_noised_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. 
+ + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas: torch.FloatTensor) -> torch.FloatTensor: + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class TCDScheduler(SchedulerMixin, ConfigMixin): + """ + `TCDScheduler` incorporates the `Strategic Stochastic Sampling` introduced by the paper `Trajectory Consistency Distillation`, + extending the original Multistep Consistency Sampling to enable unrestricted trajectory traversal. + + This code is based on the official repo of TCD(https://github.com/jabir-zheng/TCD). + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. [`~ConfigMixin`] takes care of storing all config + attributes that are passed in the scheduler's `__init__` function, such as `num_train_timesteps`. They can be + accessed via `scheduler.config.num_train_timesteps`. [`SchedulerMixin`] provides general loading and saving + functionality via the [`SchedulerMixin.save_pretrained`] and [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. 
Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + original_inference_steps (`int`, *optional*, defaults to 50): + The default number of inference steps used to generate a linearly-spaced timestep schedule, from which we + will ultimately take `num_inference_steps` evenly spaced timesteps to form the final timestep schedule. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + timestep_scaling (`float`, defaults to 10.0): + The factor the timesteps will be multiplied by when calculating the consistency model boundary conditions + `c_skip` and `c_out`. Increasing this will decrease the approximation error (although the approximation + error at the default of `10.0` is already pretty small). + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
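+
+    Example:
+        A minimal standalone sketch of the multistep sampling loop. The `toy_unet` callable below
+        is an illustrative placeholder for a real TCD-distilled epsilon-prediction model.
+
+        >>> import torch
+        >>> scheduler = TCDScheduler()
+        >>> scheduler.set_timesteps(num_inference_steps=4)
+        >>> sample = torch.randn(1, 4, 64, 64)
+        >>> toy_unet = lambda x, t: torch.zeros_like(x)  # placeholder noise prediction
+        >>> for t in scheduler.timesteps:
+        ...     model_output = toy_unet(sample, t)
+        ...     sample = scheduler.step(model_output, t, sample, eta=0.3).prev_sample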
+ """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.012, + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + original_inference_steps: int = 50, + clip_sample: bool = False, + clip_sample_range: float = 1.0, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + timestep_scaling: float = 10.0, + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + self.custom_timesteps = False + + self._step_index = None + self._begin_index = None + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + pos = 1 if len(indices) > 1 else 0 + + return indices[pos].item() + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index + def _init_step_index(self, timestep): + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + @property + def step_index(self): + return self._step_index + + @property + def begin_index(self): + """ + The index for the first timestep. It should be set from pipeline with `set_begin_index` method. 
+ """ + return self._begin_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index + def set_begin_index(self, begin_index: int = 0): + """ + Sets the begin index for the scheduler. This function should be run from pipeline before the inference. + + Args: + begin_index (`int`): + The begin index for the scheduler. + """ + self._begin_index = begin_index + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler._get_variance + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." + + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + original_inference_steps: Optional[int] = None, + timesteps: Optional[List[int]] = None, + strength: int = 1.0, + ): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). 
+ + Args: + num_inference_steps (`int`, *optional*): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + original_inference_steps (`int`, *optional*): + The original number of inference steps, which will be used to generate a linearly-spaced timestep + schedule (which is different from the standard `diffusers` implementation). We will then take + `num_inference_steps` timesteps from this schedule, evenly spaced in terms of indices, and use that as + our final timestep schedule. If not set, this will default to the `original_inference_steps` attribute. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps on the training/distillation timestep + schedule is used. If `timesteps` is passed, `num_inference_steps` must be `None`. + """ + # 0. Check inputs + if num_inference_steps is None and timesteps is None: + raise ValueError("Must pass exactly one of `num_inference_steps` or `custom_timesteps`.") + + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + # 1. Calculate the TCD original training/distillation timestep schedule. + original_steps = ( + original_inference_steps if original_inference_steps is not None else self.config.original_inference_steps + ) + + if original_inference_steps is None: + # default option, timesteps align with discrete inference steps + if original_steps > self.config.num_train_timesteps: + raise ValueError( + f"`original_steps`: {original_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + # TCD Timesteps Setting + # The skipping step parameter k from the paper. + k = self.config.num_train_timesteps // original_steps + # TCD Training/Distillation Steps Schedule + tcd_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * k - 1 + else: + # customised option, sampled timesteps can be any arbitrary value + tcd_origin_timesteps = np.asarray(list(range(0, int(self.config.num_train_timesteps * strength)))) + + # 2. Calculate the TCD inference timestep schedule. + if timesteps is not None: + # 2.1 Handle custom timestep schedules. + train_timesteps = set(tcd_origin_timesteps) + non_train_timesteps = [] + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[i] not in train_timesteps: + non_train_timesteps.append(timesteps[i]) + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + # Raise warning if timestep schedule does not start with self.config.num_train_timesteps - 1 + if strength == 1.0 and timesteps[0] != self.config.num_train_timesteps - 1: + logger.warning( + f"The first timestep on the custom timestep schedule is {timesteps[0]}, not" + f" `self.config.num_train_timesteps - 1`: {self.config.num_train_timesteps - 1}. 
You may get" + f" unexpected results when using this timestep schedule." + ) + + # Raise warning if custom timestep schedule contains timesteps not on original timestep schedule + if non_train_timesteps: + logger.warning( + f"The custom timestep schedule contains the following timesteps which are not on the original" + f" training/distillation timestep schedule: {non_train_timesteps}. You may get unexpected results" + f" when using this timestep schedule." + ) + + # Raise warning if custom timestep schedule is longer than original_steps + if original_steps is not None: + if len(timesteps) > original_steps: + logger.warning( + f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the" + f" the length of the timestep schedule used for training: {original_steps}. You may get some" + f" unexpected results when using this timestep schedule." + ) + else: + if len(timesteps) > self.config.num_train_timesteps: + logger.warning( + f"The number of timesteps in the custom timestep schedule is {len(timesteps)}, which exceeds the" + f" the length of the timestep schedule used for training: {self.config.num_train_timesteps}. You may get some" + f" unexpected results when using this timestep schedule." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.num_inference_steps = len(timesteps) + self.custom_timesteps = True + + # Apply strength (e.g. for img2img pipelines) (see StableDiffusionImg2ImgPipeline.get_timesteps) + init_timestep = min(int(self.num_inference_steps * strength), self.num_inference_steps) + t_start = max(self.num_inference_steps - init_timestep, 0) + timesteps = timesteps[t_start * self.order :] + # TODO: also reset self.num_inference_steps? + else: + # 2.2 Create the "standard" TCD inference timestep schedule. + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + if original_steps is not None: + skipping_step = len(tcd_origin_timesteps) // num_inference_steps + + if skipping_step < 1: + raise ValueError( + f"The combination of `original_steps x strength`: {original_steps} x {strength} is smaller than `num_inference_steps`: {num_inference_steps}. Make sure to either reduce `num_inference_steps` to a value smaller than {int(original_steps * strength)} or increase `strength` to a value higher than {float(num_inference_steps / original_steps)}." + ) + + self.num_inference_steps = num_inference_steps + + if original_steps is not None: + if num_inference_steps > original_steps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `original_inference_steps`:" + f" {original_steps} because the final timestep schedule will be a subset of the" + f" `original_inference_steps`-sized initial timestep schedule." + ) + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `num_train_timesteps`:" + f" {self.config.num_train_timesteps} because the final timestep schedule will be a subset of the" + f" `num_train_timesteps`-sized initial timestep schedule." 
+ ) + + # TCD Inference Steps Schedule + tcd_origin_timesteps = tcd_origin_timesteps[::-1].copy() + # Select (approximately) evenly spaced indices from tcd_origin_timesteps. + inference_indices = np.linspace(0, len(tcd_origin_timesteps), num=num_inference_steps, endpoint=False) + inference_indices = np.floor(inference_indices).astype(np.int64) + timesteps = tcd_origin_timesteps[inference_indices] + + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.long) + + self._step_index = None + self._begin_index = None + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + eta: float = 0.3, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[TCDSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + eta (`float`): + A stochastic parameter (referred to as `gamma` in the paper) used to control the stochasticity in every step. + When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling. + generator (`torch.Generator`, *optional*): + A random number generator. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_tcd.TCDSchedulerOutput`] or `tuple`. + Returns: + [`~schedulers.scheduling_utils.TCDSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_tcd.TCDSchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + assert 0 <= eta <= 1.0, "gamma must be less than or equal to 1.0" + + # 1. get previous step value + prev_step_index = self.step_index + 1 + if prev_step_index < len(self.timesteps): + prev_timestep = self.timesteps[prev_step_index] + else: + prev_timestep = torch.tensor(0) + + timestep_s = torch.floor((1 - eta) * prev_timestep).to(dtype=torch.long) + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + beta_prod_t = 1 - alpha_prod_t + + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + alpha_prod_s = self.alphas_cumprod[timestep_s] + beta_prod_s = 1 - alpha_prod_s + + # 3. 
Compute the predicted noised sample x_s based on the model parameterization + if self.config.prediction_type == "epsilon": # noise-prediction + pred_original_sample = (sample - beta_prod_t.sqrt() * model_output) / alpha_prod_t.sqrt() + pred_epsilon = model_output + pred_noised_sample = alpha_prod_s.sqrt() * pred_original_sample + beta_prod_s.sqrt() * pred_epsilon + elif self.config.prediction_type == "sample": # x-prediction + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + pred_noised_sample = alpha_prod_s.sqrt() * pred_original_sample + beta_prod_s.sqrt() * pred_epsilon + elif self.config.prediction_type == "v_prediction": # v-prediction + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + pred_noised_sample = alpha_prod_s.sqrt() * pred_original_sample + beta_prod_s.sqrt() * pred_epsilon + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for `TCDScheduler`." + ) + + # 4. Sample and inject noise z ~ N(0, I) for MultiStep Inference + # Noise is not used on the final timestep of the timestep schedule. + # This also means that noise is not used for one-step sampling. + # Eta (referred to as "gamma" in the paper) was introduced to control the stochasticity in every step. + # When eta = 0, it represents deterministic sampling, whereas eta = 1 indicates full stochastic sampling. + if eta > 0: + if self.step_index != self.num_inference_steps - 1: + noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=pred_noised_sample.dtype + ) + prev_sample = (alpha_prod_t_prev / alpha_prod_s).sqrt() * pred_noised_sample + ( + 1 - alpha_prod_t_prev / alpha_prod_s + ).sqrt() * noise + else: + prev_sample = pred_noised_sample + else: + prev_sample = pred_noised_sample + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample, pred_noised_sample) + + return TCDSchedulerOutput(prev_sample=prev_sample, pred_noised_sample=pred_noised_sample) + + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + alphas_cumprod = self.alphas_cumprod.to(device=sample.device, 
dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps + + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unclip.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unclip.py new file mode 100644 index 000000000..7bc0a0f9b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unclip.py @@ -0,0 +1,352 @@ +# Copyright 2024 Kakao Brain and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from ..utils.torch_utils import randn_tensor +from .scheduling_utils import SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->UnCLIP +class UnCLIPSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. 
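Both `add_noise` helpers in this patch implement the same closed-form forward process, and `get_velocity` is its v-prediction counterpart; a standalone sketch with a toy schedule and made-up shapes (not the scheduler's own configuration):

import torch

# Toy linear beta schedule purely for illustration; real values come from the scheduler config.
betas = torch.linspace(1e-4, 0.02, 1000)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)

x0 = torch.randn(2, 3, 8, 8)           # clean samples
noise = torch.randn_like(x0)
t = torch.tensor([10, 500])            # one timestep per sample

a = alphas_cumprod[t].sqrt().view(-1, 1, 1, 1)
s = (1 - alphas_cumprod[t]).sqrt().view(-1, 1, 1, 1)

noisy = a * x0 + s * noise             # what add_noise returns
velocity = a * noise - s * x0          # what get_velocity returns (v-prediction target)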
+ """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class UnCLIPScheduler(SchedulerMixin, ConfigMixin): + """ + NOTE: do not use this scheduler. The DDPM scheduler has been updated to support the changes made here. This + scheduler will be removed and replaced with DDPM. + + This is a modified DDPM Scheduler specifically for the karlo unCLIP model. + + This scheduler has some minor variations in how it calculates the learned range variance and dynamically + re-calculates betas based off the timesteps it is skipping. + + The scheduler also uses a slightly different step ratio when computing timesteps to use for inference. + + See [`~DDPMScheduler`] for more information on DDPM scheduling + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + variance_type (`str`): + options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small_log` + or `learned_range`. + clip_sample (`bool`, default `True`): + option to clip predicted sample between `-clip_sample_range` and `clip_sample_range` for numerical + stability. + clip_sample_range (`float`, default `1.0`): + The range to clip the sample between. See `clip_sample`. 
+ prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process) + or `sample` (directly predicting the noisy sample`) + """ + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + variance_type: str = "fixed_small_log", + clip_sample: bool = True, + clip_sample_range: Optional[float] = 1.0, + prediction_type: str = "epsilon", + beta_schedule: str = "squaredcos_cap_v2", + ): + if beta_schedule != "squaredcos_cap_v2": + raise ValueError("UnCLIPScheduler only supports `beta_schedule`: 'squaredcos_cap_v2'") + + self.betas = betas_for_alpha_bar(num_train_timesteps) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + self.one = torch.tensor(1.0) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + + self.variance_type = variance_type + + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): input sample + timestep (`int`, optional): current timestep + + Returns: + `torch.FloatTensor`: scaled input sample + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Note that this scheduler uses a slightly different step ratio than the other diffusers schedulers. The + different step ratio is to mimic the original karlo implementation and does not affect the quality or accuracy + of the results. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. 
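To make the non-standard step ratio described above concrete, here is what the schedule looks like for hypothetical settings (the real values come from the scheduler config and the caller):

import numpy as np

num_train_timesteps = 1000   # assumed config value
num_inference_steps = 25     # assumed caller value

step_ratio = (num_train_timesteps - 1) / (num_inference_steps - 1)
timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].astype(np.int64)
# array([999, 957, 916, ..., 83, 42, 0]); unlike the usual ratio, both ends of the
# training range are included, mirroring the original karlo implementation.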
+ """ + self.num_inference_steps = num_inference_steps + step_ratio = (self.config.num_train_timesteps - 1) / (self.num_inference_steps - 1) + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.timesteps = torch.from_numpy(timesteps).to(device) + + def _get_variance(self, t, prev_timestep=None, predicted_variance=None, variance_type=None): + if prev_timestep is None: + prev_timestep = t - 1 + + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + if prev_timestep == t - 1: + beta = self.betas[t] + else: + beta = 1 - alpha_prod_t / alpha_prod_t_prev + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + # and sample from it to get previous sample + # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample + variance = beta_prod_t_prev / beta_prod_t * beta + + if variance_type is None: + variance_type = self.config.variance_type + + # hacks - were probably added for training stability + if variance_type == "fixed_small_log": + variance = torch.log(torch.clamp(variance, min=1e-20)) + variance = torch.exp(0.5 * variance) + elif variance_type == "learned_range": + # NOTE difference with DDPM scheduler + min_log = variance.log() + max_log = beta.log() + + frac = (predicted_variance + 1) / 2 + variance = frac * max_log + (1 - frac) * min_log + + return variance + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + prev_timestep: Optional[int] = None, + generator=None, + return_dict: bool = True, + ) -> Union[UnCLIPSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + prev_timestep (`int`, *optional*): The previous timestep to predict the previous sample at. + Used to dynamically compute beta. If not given, `t-1` is used and the pre-computed beta is used. + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than UnCLIPSchedulerOutput class + + Returns: + [`~schedulers.scheduling_utils.UnCLIPSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.UnCLIPSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + + """ + t = timestep + + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type == "learned_range": + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + if prev_timestep is None: + prev_timestep = t - 1 + + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + if prev_timestep == t - 1: + beta = self.betas[t] + alpha = self.alphas[t] + else: + beta = 1 - alpha_prod_t / alpha_prod_t_prev + alpha = 1 - beta + + # 2. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon` or `sample`" + " for the UnCLIPScheduler." + ) + + # 3. Clip "predicted x_0" + if self.config.clip_sample: + pred_original_sample = torch.clamp( + pred_original_sample, -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * beta) / beta_prod_t + current_sample_coeff = alpha ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + # 6. Add noise + variance = 0 + if t > 0: + variance_noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, generator=generator, device=model_output.device + ) + + variance = self._get_variance( + t, + predicted_variance=predicted_variance, + prev_timestep=prev_timestep, + ) + + if self.variance_type == "fixed_small_log": + variance = variance + elif self.variance_type == "learned_range": + variance = (0.5 * variance).exp() + else: + raise ValueError( + f"variance_type given as {self.variance_type} must be one of `fixed_small_log` or `learned_range`" + " for the UnCLIPScheduler." 
+ ) + + variance = variance * variance_noise + + pred_prev_sample = pred_prev_sample + variance + + if not return_dict: + return (pred_prev_sample,) + + return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py new file mode 100644 index 000000000..8ba9a9c7d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -0,0 +1,880 @@ +# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: check https://arxiv.org/abs/2302.04867 and https://github.com/wl-zhao/UniPC for more info +# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py + +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import deprecate +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. 
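A quick numeric check of the cosine transform used here, showing the scale of the resulting betas (standalone sketch with the default `max_beta`):

import math

def alpha_bar(t):  # the "cosine" option described above
    return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

N, max_beta = 1000, 0.999
betas = [min(1 - alpha_bar((i + 1) / N) / alpha_bar(i / N), max_beta) for i in range(N)]
# betas[0] is on the order of 1e-5, the values grow toward the end of the schedule,
# and the final entry is clipped to max_beta because alpha_bar(1.0) is ~0.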
+ + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): + """ + `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + solver_order (`int`, default `2`): + The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1` + due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for + unconditional sampling. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`. + predict_x0 (`bool`, defaults to `True`): + Whether to use the updating algorithm on the predicted x0. + solver_type (`str`, default `bh2`): + Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2` + otherwise. 
+ lower_order_final (`bool`, default `True`): + Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can + stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10. + disable_corrector (`list`, default `[]`): + Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)` + and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is + usually disabled during the first few steps. + solver_p (`SchedulerMixin`, default `None`): + Any other scheduler that if specified, the algorithm becomes `solver_p + UniC`. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`, + the sigmas are determined according to a sequence of noise levels {σi}. + timestep_spacing (`str`, defaults to `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + solver_order: int = 2, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + sample_max_value: float = 1.0, + predict_x0: bool = True, + solver_type: str = "bh2", + lower_order_final: bool = True, + disable_corrector: List[int] = [], + solver_p: SchedulerMixin = None, + use_karras_sigmas: Optional[bool] = False, + timestep_spacing: str = "linspace", + steps_offset: int = 0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+            self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        elif beta_schedule == "squaredcos_cap_v2":
+            # Glide cosine schedule
+            self.betas = betas_for_alpha_bar(num_train_timesteps)
+        else:
+            raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}")
+
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        self.sigmas = ((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5
+
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+
+        if solver_type not in ["bh1", "bh2"]:
+            if solver_type in ["midpoint", "heun", "logrho"]:
+                self.register_to_config(solver_type="bh2")
+            else:
+                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
+
+        self.predict_x0 = predict_x0
+        # setable values
+        self.num_inference_steps = None
+        timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=np.float32)[::-1].copy()
+        self.timesteps = torch.from_numpy(timesteps)
+        self.model_outputs = [None] * solver_order
+        self.timestep_list = [None] * solver_order
+        self.lower_order_nums = 0
+        self.disable_corrector = disable_corrector
+        self.solver_p = solver_p
+        self.last_sample = None
+        self._step_index = None
+        self._begin_index = None
+        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
+
+    @property
+    def step_index(self):
+        """
+        The index counter for the current timestep. It will increase by 1 after each scheduler step.
+        """
+        return self._step_index
+
+    @property
+    def begin_index(self):
+        """
+        The index for the first timestep. It should be set from the pipeline with the `set_begin_index` method.
+        """
+        return self._begin_index
+
+    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
+    def set_begin_index(self, begin_index: int = 0):
+        """
+        Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
+
+        Args:
+            begin_index (`int`):
+                The begin index for the scheduler.
+        """
+        self._begin_index = begin_index
+
+    def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
+        """
+        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
+
+        Args:
+            num_inference_steps (`int`):
+                The number of diffusion steps used when generating samples with a pre-trained model.
+            device (`str` or `torch.device`, *optional*):
+                The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
+        """
+        # "linspace", "leading", "trailing" corresponds to annotation of Table 2.
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(self.config.num_train_timesteps, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: + log_sigmas = np.log(sigmas) + sigmas = np.flip(sigmas).copy() + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + sigmas = np.concatenate([sigmas, sigmas[-1:]]).astype(np.float32) + else: + sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) + sigma_last = ((1 - self.alphas_cumprod[0]) / self.alphas_cumprod[0]) ** 0.5 + sigmas = np.concatenate([sigmas, [sigma_last]]).astype(np.float32) + + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device=device, dtype=torch.int64) + + self.num_inference_steps = len(timesteps) + + self.model_outputs = [ + None, + ] * self.config.solver_order + self.lower_order_nums = 0 + self.last_sample = None + if self.solver_p: + self.solver_p.set_timesteps(self.num_inference_steps, device=device) + + # add an index counter for schedulers that allow duplicated timesteps + self._step_index = None + self._begin_index = None + self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
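Stripped of the reshaping details, the quoted procedure is a per-sample quantile clamp; a standalone sketch with made-up tensors and a hypothetical threshold of 1.5 (this scheduler's `sample_max_value` default is 1.0):

import torch

x0 = torch.randn(4, 3, 64, 64) * 2.0          # pretend x0 prediction with saturated pixels
ratio, max_value = 0.995, 1.5                 # hypothetical values

flat = x0.reshape(x0.shape[0], -1).abs()
s = torch.quantile(flat, ratio, dim=1).clamp(min=1.0, max=max_value)
s = s.view(-1, 1, 1, 1)                       # broadcast one threshold per sample
x0_thresholded = x0.clamp(-s, s) / s          # now bounded to [-1, 1] per sample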
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, *remaining_dims = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * np.prod(remaining_dims)) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, *remaining_dims) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(np.maximum(sigma, 1e-10)) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._sigma_to_alpha_sigma_t + def _sigma_to_alpha_sigma_t(self, sigma): + alpha_t = 1 / ((sigma**2 + 1) ** 0.5) + sigma_t = sigma * alpha_t + + return alpha_t, sigma_t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. (2022).""" + + # Hack to make sure that other schedulers which copy this function don't break + # TODO: Add this logic to the other schedulers + if hasattr(self.config, "sigma_min"): + sigma_min = self.config.sigma_min + else: + sigma_min = None + + if hasattr(self.config, "sigma_max"): + sigma_max = self.config.sigma_max + else: + sigma_max = None + + sigma_min = sigma_min if sigma_min is not None else in_sigmas[-1].item() + sigma_max = sigma_max if sigma_max is not None else in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def convert_model_output( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + **kwargs, + ) -> torch.FloatTensor: + r""" + Convert the model output to the corresponding type the UniPC algorithm needs. + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. 
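The conversions implemented below are rearrangements of `sample = alpha_t * x0 + sigma_t * eps`; a standalone sanity check with a made-up sigma (the scheduler reads its own sigma from `self.sigmas`):

import torch

sigma = torch.tensor(0.5)                     # hypothetical value
alpha_t = 1 / (sigma**2 + 1).sqrt()
sigma_t = sigma * alpha_t

sample = torch.randn(1, 4, 8, 8)
eps = torch.randn_like(sample)
x0 = (sample - sigma_t * eps) / alpha_t       # epsilon-prediction -> x0
eps_back = (sample - alpha_t * x0) / sigma_t  # x0 -> epsilon (the inverse relation)
assert torch.allclose(eps, eps_back, atol=1e-5)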
+ + Returns: + `torch.FloatTensor`: + The converted model output. + """ + timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError("missing `sample` as a required keyward argument") + if timestep is not None: + deprecate( + "timesteps", + "1.0.0", + "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + sigma = self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + + if self.predict_x0: + if self.config.prediction_type == "epsilon": + x0_pred = (sample - sigma_t * model_output) / alpha_t + elif self.config.prediction_type == "sample": + x0_pred = model_output + elif self.config.prediction_type == "v_prediction": + x0_pred = alpha_t * sample - sigma_t * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the UniPCMultistepScheduler." + ) + + if self.config.thresholding: + x0_pred = self._threshold_sample(x0_pred) + + return x0_pred + else: + if self.config.prediction_type == "epsilon": + return model_output + elif self.config.prediction_type == "sample": + epsilon = (sample - alpha_t * model_output) / sigma_t + return epsilon + elif self.config.prediction_type == "v_prediction": + epsilon = alpha_t * model_output + sigma_t * sample + return epsilon + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction` for the UniPCMultistepScheduler." + ) + + def multistep_uni_p_bh_update( + self, + model_output: torch.FloatTensor, + *args, + sample: torch.FloatTensor = None, + order: int = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if is specified. + + Args: + model_output (`torch.FloatTensor`): + The direct output from the learned diffusion model at the current timestep. + prev_timestep (`int`): + The previous discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + order (`int`): + The order of UniP at this timestep (corresponds to the *p* in UniPC-p). + + Returns: + `torch.FloatTensor`: + The sample tensor at the previous timestep. 
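The predictor body that follows works in half-log-SNR space; a standalone sketch of the quantities it derives from two neighbouring sigmas, using hypothetical values:

import torch

def to_alpha_sigma(sigma):                    # same relation as _sigma_to_alpha_sigma_t
    alpha = 1 / (sigma**2 + 1).sqrt()
    return alpha, sigma * alpha

sigma_s0, sigma_t = torch.tensor(2.0), torch.tensor(1.0)   # noise decreases from s0 to t
alpha_t, sig_t = to_alpha_sigma(sigma_t)
alpha_s0, sig_s0 = to_alpha_sigma(sigma_s0)

lambda_t = torch.log(alpha_t) - torch.log(sig_t)    # = -log(sigma) in this VP parameterization
lambda_s0 = torch.log(alpha_s0) - torch.log(sig_s0)
h = lambda_t - lambda_s0                            # positive step size in log-SNR space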
+ """ + prev_timestep = args[0] if len(args) > 0 else kwargs.pop("prev_timestep", None) + if sample is None: + if len(args) > 1: + sample = args[1] + else: + raise ValueError(" missing `sample` as a required keyward argument") + if order is None: + if len(args) > 2: + order = args[2] + else: + raise ValueError(" missing `order` as a required keyward argument") + if prev_timestep is not None: + deprecate( + "prev_timestep", + "1.0.0", + "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + model_output_list = self.model_outputs + + s0 = self.timestep_list[-1] + m0 = model_output_list[-1] + x = sample + + if self.solver_p: + x_t = self.solver_p.step(model_output, s0, x).prev_sample + return x_t + + sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[self.step_index] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + + h = lambda_t - lambda_s0 + device = sample.device + + rks = [] + D1s = [] + for i in range(1, order): + si = self.step_index - i + mi = model_output_list[-(i + 1)] + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si]) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + rk = (lambda_si - lambda_s0) / h + rks.append(rk) + D1s.append((mi - m0) / rk) + + rks.append(1.0) + rks = torch.tensor(rks, device=device) + + R = [] + b = [] + + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1 + h_phi_k = h_phi_1 / hh - 1 + + factorial_i = 1 + + if self.config.solver_type == "bh1": + B_h = hh + elif self.config.solver_type == "bh2": + B_h = torch.expm1(hh) + else: + raise NotImplementedError() + + for i in range(1, order + 1): + R.append(torch.pow(rks, i - 1)) + b.append(h_phi_k * factorial_i / B_h) + factorial_i *= i + 1 + h_phi_k = h_phi_k / hh - 1 / factorial_i + + R = torch.stack(R) + b = torch.tensor(b, device=device) + + if len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) # (B, K) + # for order 2, we use a simplified version + if order == 2: + rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device) + else: + rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1]) + else: + D1s = None + + if self.predict_x0: + x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 + if D1s is not None: + pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) + else: + pred_res = 0 + x_t = x_t_ - alpha_t * B_h * pred_res + else: + x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 + if D1s is not None: + pred_res = torch.einsum("k,bkc...->bc...", rhos_p, D1s) + else: + pred_res = 0 + x_t = x_t_ - sigma_t * B_h * pred_res + + x_t = x_t.to(x.dtype) + return x_t + + def multistep_uni_c_bh_update( + self, + this_model_output: torch.FloatTensor, + *args, + last_sample: torch.FloatTensor = None, + this_sample: torch.FloatTensor = None, + order: int = None, + **kwargs, + ) -> torch.FloatTensor: + """ + One step for the UniC (B(h) version). + + Args: + this_model_output (`torch.FloatTensor`): + The model outputs at `x_t`. + this_timestep (`int`): + The current timestep `t`. + last_sample (`torch.FloatTensor`): + The generated sample before the last predictor `x_{t-1}`. + this_sample (`torch.FloatTensor`): + The generated sample after the last predictor `x_{t}`. + order (`int`): + The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`. 
+ + Returns: + `torch.FloatTensor`: + The corrected sample tensor at the current timestep. + """ + this_timestep = args[0] if len(args) > 0 else kwargs.pop("this_timestep", None) + if last_sample is None: + if len(args) > 1: + last_sample = args[1] + else: + raise ValueError(" missing`last_sample` as a required keyward argument") + if this_sample is None: + if len(args) > 2: + this_sample = args[2] + else: + raise ValueError(" missing`this_sample` as a required keyward argument") + if order is None: + if len(args) > 3: + order = args[3] + else: + raise ValueError(" missing`order` as a required keyward argument") + if this_timestep is not None: + deprecate( + "this_timestep", + "1.0.0", + "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`", + ) + + model_output_list = self.model_outputs + + m0 = model_output_list[-1] + x = last_sample + x_t = this_sample + model_t = this_model_output + + sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[self.step_index - 1] + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t) + alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0) + + lambda_t = torch.log(alpha_t) - torch.log(sigma_t) + lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0) + + h = lambda_t - lambda_s0 + device = this_sample.device + + rks = [] + D1s = [] + for i in range(1, order): + si = self.step_index - (i + 1) + mi = model_output_list[-(i + 1)] + alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si]) + lambda_si = torch.log(alpha_si) - torch.log(sigma_si) + rk = (lambda_si - lambda_s0) / h + rks.append(rk) + D1s.append((mi - m0) / rk) + + rks.append(1.0) + rks = torch.tensor(rks, device=device) + + R = [] + b = [] + + hh = -h if self.predict_x0 else h + h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1 + h_phi_k = h_phi_1 / hh - 1 + + factorial_i = 1 + + if self.config.solver_type == "bh1": + B_h = hh + elif self.config.solver_type == "bh2": + B_h = torch.expm1(hh) + else: + raise NotImplementedError() + + for i in range(1, order + 1): + R.append(torch.pow(rks, i - 1)) + b.append(h_phi_k * factorial_i / B_h) + factorial_i *= i + 1 + h_phi_k = h_phi_k / hh - 1 / factorial_i + + R = torch.stack(R) + b = torch.tensor(b, device=device) + + if len(D1s) > 0: + D1s = torch.stack(D1s, dim=1) + else: + D1s = None + + # for order 1, we use a simplified version + if order == 1: + rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device) + else: + rhos_c = torch.linalg.solve(R, b) + + if self.predict_x0: + x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0 + if D1s is not None: + corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s) + else: + corr_res = 0 + D1_t = model_t - m0 + x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t) + else: + x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0 + if D1s is not None: + corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s) + else: + corr_res = 0 + D1_t = model_t - m0 + x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t) + x_t = x_t.to(x.dtype) + return x_t + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + index_candidates = (schedule_timesteps == timestep).nonzero() + + if len(index_candidates) == 0: + step_index = len(self.timesteps) - 1 + # The sigma index that is taken for the **very** 
first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + elif len(index_candidates) > 1: + step_index = index_candidates[1].item() + else: + step_index = index_candidates[0].item() + + return step_index + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index + def _init_step_index(self, timestep): + """ + Initialize the step_index counter for the scheduler. + """ + + if self.begin_index is None: + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + self._step_index = self.index_for_timestep(timestep) + else: + self._step_index = self._begin_index + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + return_dict: bool = True, + ) -> Union[SchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with + the multistep UniPC. + + Args: + model_output (`torch.FloatTensor`): + The direct output from learned diffusion model. + timestep (`int`): + The current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + A current instance of a sample created by the diffusion process. + return_dict (`bool`): + Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + if self.step_index is None: + self._init_step_index(timestep) + + use_corrector = ( + self.step_index > 0 and self.step_index - 1 not in self.disable_corrector and self.last_sample is not None + ) + + model_output_convert = self.convert_model_output(model_output, sample=sample) + if use_corrector: + sample = self.multistep_uni_c_bh_update( + this_model_output=model_output_convert, + last_sample=self.last_sample, + this_sample=sample, + order=self.this_order, + ) + + for i in range(self.config.solver_order - 1): + self.model_outputs[i] = self.model_outputs[i + 1] + self.timestep_list[i] = self.timestep_list[i + 1] + + self.model_outputs[-1] = model_output_convert + self.timestep_list[-1] = timestep + + if self.config.lower_order_final: + this_order = min(self.config.solver_order, len(self.timesteps) - self.step_index) + else: + this_order = self.config.solver_order + + self.this_order = min(this_order, self.lower_order_nums + 1) # warmup for multistep + assert self.this_order > 0 + + self.last_sample = sample + prev_sample = self.multistep_uni_p_bh_update( + model_output=model_output, # pass the original non-converted model output, in case solver-p is used + sample=sample, + order=self.this_order, + ) + + if self.lower_order_nums < self.config.solver_order: + self.lower_order_nums += 1 + + # upon completion increase step index by one + self._step_index += 1 + + if not return_dict: + return (prev_sample,) + + return SchedulerOutput(prev_sample=prev_sample) + + def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + """ + Ensures interchangeability 
with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): + The input sample. + + Returns: + `torch.FloatTensor`: + A scaled input sample. + """ + return sample + + # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + # begin_index is None when the scheduler is used for training + if self.begin_index is None: + step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps] + else: + step_indices = [self.begin_index] * timesteps.shape[0] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma) + noisy_samples = alpha_t * original_samples + sigma_t * noise + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils.py new file mode 100644 index 000000000..5dbdb8288 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils.py @@ -0,0 +1,186 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +import os +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Union + +import torch +from huggingface_hub.utils import validate_hf_hub_args + +from ..utils import BaseOutput, PushToHubMixin + + +SCHEDULER_CONFIG_NAME = "scheduler_config.json" + + +# NOTE: We make this type an enum because it simplifies usage in docs and prevents +# circular imports when used for `_compatibles` within the schedulers module. +# When it's used as a type in pipelines, it really is a Union because the actual +# scheduler instance is passed in. 
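For context, the compatibility plumbing defined in this module is what lets a pipeline swap schedulers that share a config; a short usage sketch, assuming the `diffusers` package built from this source tree is importable:

from diffusers import DDPMScheduler, UniPCMultistepScheduler

ddpm = DDPMScheduler(num_train_timesteps=1000)
unipc = UniPCMultistepScheduler.from_config(ddpm.config)   # swap via the shared config
print([cls.__name__ for cls in unipc.compatibles])         # other schedulers this one can replace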
+class KarrasDiffusionSchedulers(Enum): + DDIMScheduler = 1 + DDPMScheduler = 2 + PNDMScheduler = 3 + LMSDiscreteScheduler = 4 + EulerDiscreteScheduler = 5 + HeunDiscreteScheduler = 6 + EulerAncestralDiscreteScheduler = 7 + DPMSolverMultistepScheduler = 8 + DPMSolverSinglestepScheduler = 9 + KDPM2DiscreteScheduler = 10 + KDPM2AncestralDiscreteScheduler = 11 + DEISMultistepScheduler = 12 + UniPCMultistepScheduler = 13 + DPMSolverSDEScheduler = 14 + EDMEulerScheduler = 15 + + +@dataclass +class SchedulerOutput(BaseOutput): + """ + Base class for the output of a scheduler's `step` function. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class SchedulerMixin(PushToHubMixin): + """ + Base class for all schedulers. + + [`SchedulerMixin`] contains common functions shared by all schedulers such as general loading and saving + functionalities. + + [`ConfigMixin`] takes care of storing the configuration attributes (like `num_train_timesteps`) that are passed to + the scheduler's `__init__` function, and the attributes can be accessed by `scheduler.config.num_train_timesteps`. + + Class attributes: + - **_compatibles** (`List[str]`) -- A list of scheduler classes that are compatible with the parent scheduler + class. Use [`~ConfigMixin.from_config`] to load a different compatible scheduler class (should be overridden + by parent class). + """ + + config_name = SCHEDULER_CONFIG_NAME + _compatibles = [] + has_compatibles = True + + @classmethod + @validate_hf_hub_args + def from_pretrained( + cls, + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, + subfolder: Optional[str] = None, + return_unused_kwargs=False, + **kwargs, + ): + r""" + Instantiate a scheduler from a pre-defined JSON configuration file in a local directory or Hub repository. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the scheduler + configuration saved with [`~SchedulerMixin.save_pretrained`]. + subfolder (`str`, *optional*): + The subfolder location of a model file within a larger model repository on the Hub or locally. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + Whether kwargs that are not consumed by the Python class should be returned or not. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. 
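A typical call to the loader documented here might look like the following (assumes network access and that the example repo id is available on the Hub):

from diffusers import UniPCMultistepScheduler

scheduler = UniPCMultistepScheduler.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="scheduler"
)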
+ output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. + + + + """ + config, kwargs, commit_hash = cls.load_config( + pretrained_model_name_or_path=pretrained_model_name_or_path, + subfolder=subfolder, + return_unused_kwargs=True, + return_commit_hash=True, + **kwargs, + ) + return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save a scheduler configuration object to a directory so that it can be reloaded using the + [`~SchedulerMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) + + @property + def compatibles(self): + """ + Returns all schedulers that are compatible with this scheduler + + Returns: + `List[SchedulerMixin]`: List of compatible schedulers + """ + return self._get_compatibles() + + @classmethod + def _get_compatibles(cls): + compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) + diffusers_library = importlib.import_module(__name__.split(".")[0]) + compatible_classes = [ + getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) + ] + return compatible_classes diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py new file mode 100644 index 000000000..a1d471f91 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py @@ -0,0 +1,293 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +import math +import os +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Tuple, Union + +import flax +import jax.numpy as jnp +from huggingface_hub.utils import validate_hf_hub_args + +from ..utils import BaseOutput, PushToHubMixin + + +SCHEDULER_CONFIG_NAME = "scheduler_config.json" + + +# NOTE: We make this type an enum because it simplifies usage in docs and prevents +# circular imports when used for `_compatibles` within the schedulers module. +# When it's used as a type in pipelines, it really is a Union because the actual +# scheduler instance is passed in. +class FlaxKarrasDiffusionSchedulers(Enum): + FlaxDDIMScheduler = 1 + FlaxDDPMScheduler = 2 + FlaxPNDMScheduler = 3 + FlaxLMSDiscreteScheduler = 4 + FlaxDPMSolverMultistepScheduler = 5 + FlaxEulerDiscreteScheduler = 6 + + +@dataclass +class FlaxSchedulerOutput(BaseOutput): + """ + Base class for the scheduler's step function output. + + Args: + prev_sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: jnp.ndarray + + +class FlaxSchedulerMixin(PushToHubMixin): + """ + Mixin containing common functions for the schedulers. + + Class attributes: + - **_compatibles** (`List[str]`) -- A list of classes that are compatible with the parent class, so that + `from_config` can be used from a class different than the one used to save the config (should be overridden + by parent class). + """ + + config_name = SCHEDULER_CONFIG_NAME + ignore_for_config = ["dtype"] + _compatibles = [] + has_compatibles = True + + @classmethod + @validate_hf_hub_args + def from_pretrained( + cls, + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, + subfolder: Optional[str] = None, + return_unused_kwargs=False, + **kwargs, + ): + r""" + Instantiate a Scheduler class from a pre-defined JSON-file. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an + organization name, like `google/ddpm-celebahq-256`. + - A path to a *directory* containing model weights saved using [`~SchedulerMixin.save_pretrained`], + e.g., `./my_model_directory/`. + subfolder (`str`, *optional*): + In case the relevant files are located inside a subfolder of the model repo (either remote in + huggingface.co or downloaded locally), you can specify the folder name here. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + Whether kwargs that are not consumed by the Python class should be returned or not. + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. 
+ force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated + models](https://huggingface.co/docs/hub/models-gated#gated-models). + + + + + + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to + use this method in a firewalled environment. + + + + """ + config, kwargs = cls.load_config( + pretrained_model_name_or_path=pretrained_model_name_or_path, + subfolder=subfolder, + return_unused_kwargs=True, + **kwargs, + ) + scheduler, unused_kwargs = cls.from_config(config, return_unused_kwargs=True, **kwargs) + + if hasattr(scheduler, "create_state") and getattr(scheduler, "has_state", False): + state = scheduler.create_state() + + if return_unused_kwargs: + return scheduler, state, unused_kwargs + + return scheduler, state + + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save a scheduler configuration object to the directory `save_directory`, so that it can be re-loaded using the + [`~FlaxSchedulerMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
+ """ + self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) + + @property + def compatibles(self): + """ + Returns all schedulers that are compatible with this scheduler + + Returns: + `List[SchedulerMixin]`: List of compatible schedulers + """ + return self._get_compatibles() + + @classmethod + def _get_compatibles(cls): + compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) + diffusers_library = importlib.import_module(__name__.split(".")[0]) + compatible_classes = [ + getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) + ] + return compatible_classes + + +def broadcast_to_shape_from_left(x: jnp.ndarray, shape: Tuple[int]) -> jnp.ndarray: + assert len(shape) >= x.ndim + return jnp.broadcast_to(x.reshape(x.shape + (1,) * (len(shape) - x.ndim)), shape) + + +def betas_for_alpha_bar(num_diffusion_timesteps: int, max_beta=0.999, dtype=jnp.float32) -> jnp.ndarray: + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + + Returns: + betas (`jnp.ndarray`): the betas used by the scheduler to step the model outputs + """ + + def alpha_bar(time_step): + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return jnp.array(betas, dtype=dtype) + + +@flax.struct.dataclass +class CommonSchedulerState: + alphas: jnp.ndarray + betas: jnp.ndarray + alphas_cumprod: jnp.ndarray + + @classmethod + def create(cls, scheduler): + config = scheduler.config + + if config.trained_betas is not None: + betas = jnp.asarray(config.trained_betas, dtype=scheduler.dtype) + elif config.beta_schedule == "linear": + betas = jnp.linspace(config.beta_start, config.beta_end, config.num_train_timesteps, dtype=scheduler.dtype) + elif config.beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
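+            # "scaled_linear" squares an evenly spaced grid between sqrt(beta_start) and
+            # sqrt(beta_end), so the early betas are smaller than with a plain linear ramp.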
+ betas = ( + jnp.linspace( + config.beta_start**0.5, config.beta_end**0.5, config.num_train_timesteps, dtype=scheduler.dtype + ) + ** 2 + ) + elif config.beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + betas = betas_for_alpha_bar(config.num_train_timesteps, dtype=scheduler.dtype) + else: + raise NotImplementedError( + f"beta_schedule {config.beta_schedule} is not implemented for scheduler {scheduler.__class__.__name__}" + ) + + alphas = 1.0 - betas + + alphas_cumprod = jnp.cumprod(alphas, axis=0) + + return cls( + alphas=alphas, + betas=betas, + alphas_cumprod=alphas_cumprod, + ) + + +def get_sqrt_alpha_prod( + state: CommonSchedulerState, original_samples: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray +): + alphas_cumprod = state.alphas_cumprod + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + sqrt_alpha_prod = broadcast_to_shape_from_left(sqrt_alpha_prod, original_samples.shape) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + sqrt_one_minus_alpha_prod = broadcast_to_shape_from_left(sqrt_one_minus_alpha_prod, original_samples.shape) + + return sqrt_alpha_prod, sqrt_one_minus_alpha_prod + + +def add_noise_common( + state: CommonSchedulerState, original_samples: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray +): + sqrt_alpha_prod, sqrt_one_minus_alpha_prod = get_sqrt_alpha_prod(state, original_samples, noise, timesteps) + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + +def get_velocity_common(state: CommonSchedulerState, sample: jnp.ndarray, noise: jnp.ndarray, timesteps: jnp.ndarray): + sqrt_alpha_prod, sqrt_one_minus_alpha_prod = get_sqrt_alpha_prod(state, sample, noise, timesteps) + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py new file mode 100644 index 000000000..03ba95cad --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py @@ -0,0 +1,467 @@ +# Copyright 2024 Microsoft and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from .scheduling_utils import SchedulerMixin + + +@dataclass +class VQDiffusionSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. 
+ + Args: + prev_sample (`torch.LongTensor` of shape `(batch size, num latent pixels)`): + Computed sample x_{t-1} of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.LongTensor + + +def index_to_log_onehot(x: torch.LongTensor, num_classes: int) -> torch.FloatTensor: + """ + Convert batch of vector of class indices into batch of log onehot vectors + + Args: + x (`torch.LongTensor` of shape `(batch size, vector length)`): + Batch of class indices + + num_classes (`int`): + number of classes to be used for the onehot vectors + + Returns: + `torch.FloatTensor` of shape `(batch size, num classes, vector length)`: + Log onehot vectors + """ + x_onehot = F.one_hot(x, num_classes) + x_onehot = x_onehot.permute(0, 2, 1) + log_x = torch.log(x_onehot.float().clamp(min=1e-30)) + return log_x + + +def gumbel_noised(logits: torch.FloatTensor, generator: Optional[torch.Generator]) -> torch.FloatTensor: + """ + Apply gumbel noise to `logits` + """ + uniform = torch.rand(logits.shape, device=logits.device, generator=generator) + gumbel_noise = -torch.log(-torch.log(uniform + 1e-30) + 1e-30) + noised = gumbel_noise + logits + return noised + + +def alpha_schedules(num_diffusion_timesteps: int, alpha_cum_start=0.99999, alpha_cum_end=0.000009): + """ + Cumulative and non-cumulative alpha schedules. + + See section 4.1. + """ + att = ( + np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (alpha_cum_end - alpha_cum_start) + + alpha_cum_start + ) + att = np.concatenate(([1], att)) + at = att[1:] / att[:-1] + att = np.concatenate((att[1:], [1])) + return at, att + + +def gamma_schedules(num_diffusion_timesteps: int, gamma_cum_start=0.000009, gamma_cum_end=0.99999): + """ + Cumulative and non-cumulative gamma schedules. + + See section 4.1. + """ + ctt = ( + np.arange(0, num_diffusion_timesteps) / (num_diffusion_timesteps - 1) * (gamma_cum_end - gamma_cum_start) + + gamma_cum_start + ) + ctt = np.concatenate(([0], ctt)) + one_minus_ctt = 1 - ctt + one_minus_ct = one_minus_ctt[1:] / one_minus_ctt[:-1] + ct = 1 - one_minus_ct + ctt = np.concatenate((ctt[1:], [0])) + return ct, ctt + + +class VQDiffusionScheduler(SchedulerMixin, ConfigMixin): + """ + A scheduler for vector quantized diffusion. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_vec_classes (`int`): + The number of classes of the vector embeddings of the latent pixels. Includes the class for the masked + latent pixel. + num_train_timesteps (`int`, defaults to 100): + The number of diffusion steps to train the model. + alpha_cum_start (`float`, defaults to 0.99999): + The starting cumulative alpha value. + alpha_cum_end (`float`, defaults to 0.00009): + The ending cumulative alpha value. + gamma_cum_start (`float`, defaults to 0.00009): + The starting cumulative gamma value. + gamma_cum_end (`float`, defaults to 0.99999): + The ending cumulative gamma value. 
+ """ + + order = 1 + + @register_to_config + def __init__( + self, + num_vec_classes: int, + num_train_timesteps: int = 100, + alpha_cum_start: float = 0.99999, + alpha_cum_end: float = 0.000009, + gamma_cum_start: float = 0.000009, + gamma_cum_end: float = 0.99999, + ): + self.num_embed = num_vec_classes + + # By convention, the index for the mask class is the last class index + self.mask_class = self.num_embed - 1 + + at, att = alpha_schedules(num_train_timesteps, alpha_cum_start=alpha_cum_start, alpha_cum_end=alpha_cum_end) + ct, ctt = gamma_schedules(num_train_timesteps, gamma_cum_start=gamma_cum_start, gamma_cum_end=gamma_cum_end) + + num_non_mask_classes = self.num_embed - 1 + bt = (1 - at - ct) / num_non_mask_classes + btt = (1 - att - ctt) / num_non_mask_classes + + at = torch.tensor(at.astype("float64")) + bt = torch.tensor(bt.astype("float64")) + ct = torch.tensor(ct.astype("float64")) + log_at = torch.log(at) + log_bt = torch.log(bt) + log_ct = torch.log(ct) + + att = torch.tensor(att.astype("float64")) + btt = torch.tensor(btt.astype("float64")) + ctt = torch.tensor(ctt.astype("float64")) + log_cumprod_at = torch.log(att) + log_cumprod_bt = torch.log(btt) + log_cumprod_ct = torch.log(ctt) + + self.log_at = log_at.float() + self.log_bt = log_bt.float() + self.log_ct = log_ct.float() + self.log_cumprod_at = log_cumprod_at.float() + self.log_cumprod_bt = log_cumprod_bt.float() + self.log_cumprod_ct = log_cumprod_ct.float() + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps and diffusion process parameters (alpha, beta, gamma) should be moved + to. + """ + self.num_inference_steps = num_inference_steps + timesteps = np.arange(0, self.num_inference_steps)[::-1].copy() + self.timesteps = torch.from_numpy(timesteps).to(device) + + self.log_at = self.log_at.to(device) + self.log_bt = self.log_bt.to(device) + self.log_ct = self.log_ct.to(device) + self.log_cumprod_at = self.log_cumprod_at.to(device) + self.log_cumprod_bt = self.log_cumprod_bt.to(device) + self.log_cumprod_ct = self.log_cumprod_ct.to(device) + + def step( + self, + model_output: torch.FloatTensor, + timestep: torch.long, + sample: torch.LongTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[VQDiffusionSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by the reverse transition distribution. See + [`~VQDiffusionScheduler.q_posterior`] for more details about how the distribution is computer. + + Args: + log_p_x_0: (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + The log probabilities for the predicted classes of the initial latent pixels. Does not include a + prediction for the masked class as the initial unnoised image cannot be masked. + t (`torch.long`): + The timestep that determines which transition matrices are used. + x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): + The classes of each latent pixel at time `t`. 
+ generator (`torch.Generator`, or `None`): + A random number generator for the noise applied to `p(x_{t-1} | x_t)` before it is sampled from. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] or + `tuple`. + + Returns: + [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_vq_diffusion.VQDiffusionSchedulerOutput`] is + returned, otherwise a tuple is returned where the first element is the sample tensor. + """ + if timestep == 0: + log_p_x_t_min_1 = model_output + else: + log_p_x_t_min_1 = self.q_posterior(model_output, sample, timestep) + + log_p_x_t_min_1 = gumbel_noised(log_p_x_t_min_1, generator) + + x_t_min_1 = log_p_x_t_min_1.argmax(dim=1) + + if not return_dict: + return (x_t_min_1,) + + return VQDiffusionSchedulerOutput(prev_sample=x_t_min_1) + + def q_posterior(self, log_p_x_0, x_t, t): + """ + Calculates the log probabilities for the predicted classes of the image at timestep `t-1`: + + ``` + p(x_{t-1} | x_t) = sum( q(x_t | x_{t-1}) * q(x_{t-1} | x_0) * p(x_0) / q(x_t | x_0) ) + ``` + + Args: + log_p_x_0 (`torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`): + The log probabilities for the predicted classes of the initial latent pixels. Does not include a + prediction for the masked class as the initial unnoised image cannot be masked. + x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): + The classes of each latent pixel at time `t`. + t (`torch.Long`): + The timestep that determines which transition matrix is used. + + Returns: + `torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`: + The log probabilities for the predicted classes of the image at timestep `t-1`. + """ + log_onehot_x_t = index_to_log_onehot(x_t, self.num_embed) + + log_q_x_t_given_x_0 = self.log_Q_t_transitioning_to_known_class( + t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=True + ) + + log_q_t_given_x_t_min_1 = self.log_Q_t_transitioning_to_known_class( + t=t, x_t=x_t, log_onehot_x_t=log_onehot_x_t, cumulative=False + ) + + # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + # . . . + # . . . + # . . . + # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) ... p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) + q = log_p_x_0 - log_q_x_t_given_x_0 + + # sum_0 = p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}), ... , + # sum_n = p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) + ... + p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) + q_log_sum_exp = torch.logsumexp(q, dim=1, keepdim=True) + + # p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0 ... p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n + # . . . + # . . . + # . . . + # p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0 ... p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n + q = q - q_log_sum_exp + + # (p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1} ... (p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1} + # . . . + # . . . + # . . . + # (p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1} ... (p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1} + # c_cumulative_{t-1} ... 
c_cumulative_{t-1} + q = self.apply_cumulative_transitions(q, t - 1) + + # ((p_0(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_0 ... ((p_n(x_0=C_0 | x_t) / q(x_t | x_0=C_0) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_0) * sum_n + # . . . + # . . . + # . . . + # ((p_0(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_0) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_0 ... ((p_n(x_0=C_{k-1} | x_t) / q(x_t | x_0=C_{k-1}) / sum_n) * a_cumulative_{t-1} + b_cumulative_{t-1}) * q(x_t | x_{t-1}=C_{k-1}) * sum_n + # c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0 ... c_cumulative_{t-1} * q(x_t | x_{t-1}=C_k) * sum_0 + log_p_x_t_min_1 = q + log_q_t_given_x_t_min_1 + q_log_sum_exp + + # For each column, there are two possible cases. + # + # Where: + # - sum(p_n(x_0))) is summing over all classes for x_0 + # - C_i is the class transitioning from (not to be confused with c_t and c_cumulative_t being used for gamma's) + # - C_j is the class transitioning to + # + # 1. x_t is masked i.e. x_t = c_k + # + # Simplifying the expression, the column vector is: + # . + # . + # . + # (c_t / c_cumulative_t) * (a_cumulative_{t-1} * p_n(x_0 = C_i | x_t) + b_cumulative_{t-1} * sum(p_n(x_0))) + # . + # . + # . + # (c_cumulative_{t-1} / c_cumulative_t) * sum(p_n(x_0)) + # + # From equation (11) stated in terms of forward probabilities, the last row is trivially verified. + # + # For the other rows, we can state the equation as ... + # + # (c_t / c_cumulative_t) * [b_cumulative_{t-1} * p(x_0=c_0) + ... + (a_cumulative_{t-1} + b_cumulative_{t-1}) * p(x_0=C_i) + ... + b_cumulative_{k-1} * p(x_0=c_{k-1})] + # + # This verifies the other rows. + # + # 2. x_t is not masked + # + # Simplifying the expression, there are two cases for the rows of the column vector, where C_j = C_i and where C_j != C_i: + # . + # . + # . + # C_j != C_i: b_t * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / b_cumulative_t) * p_n(x_0 = C_i) + ... + (b_cumulative_{t-1} / (a_cumulative_t + b_cumulative_t)) * p_n(c_0=C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1})) + # . + # . + # . + # C_j = C_i: (a_t + b_t) * ((b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_0) + ... + ((a_cumulative_{t-1} + b_cumulative_{t-1}) / (a_cumulative_t + b_cumulative_t)) * p_n(x_0 = C_i = C_j) + ... + (b_cumulative_{t-1} / b_cumulative_t) * p_n(x_0 = c_{k-1})) + # . + # . + # . + # 0 + # + # The last row is trivially verified. The other rows can be verified by directly expanding equation (11) stated in terms of forward probabilities. + return log_p_x_t_min_1 + + def log_Q_t_transitioning_to_known_class( + self, *, t: torch.int, x_t: torch.LongTensor, log_onehot_x_t: torch.FloatTensor, cumulative: bool + ): + """ + Calculates the log probabilities of the rows from the (cumulative or non-cumulative) transition matrix for each + latent pixel in `x_t`. + + Args: + t (`torch.Long`): + The timestep that determines which transition matrix is used. + x_t (`torch.LongTensor` of shape `(batch size, num latent pixels)`): + The classes of each latent pixel at time `t`. + log_onehot_x_t (`torch.FloatTensor` of shape `(batch size, num classes, num latent pixels)`): + The log one-hot vectors of `x_t`. + cumulative (`bool`): + If cumulative is `False`, the single step transition matrix `t-1`->`t` is used. 
If cumulative is + `True`, the cumulative transition matrix `0`->`t` is used. + + Returns: + `torch.FloatTensor` of shape `(batch size, num classes - 1, num latent pixels)`: + Each _column_ of the returned matrix is a _row_ of log probabilities of the complete probability + transition matrix. + + When non cumulative, returns `self.num_classes - 1` rows because the initial latent pixel cannot be + masked. + + Where: + - `q_n` is the probability distribution for the forward process of the `n`th latent pixel. + - C_0 is a class of a latent pixel embedding + - C_k is the class of the masked latent pixel + + non-cumulative result (omitting logarithms): + ``` + q_0(x_t | x_{t-1} = C_0) ... q_n(x_t | x_{t-1} = C_0) + . . . + . . . + . . . + q_0(x_t | x_{t-1} = C_k) ... q_n(x_t | x_{t-1} = C_k) + ``` + + cumulative result (omitting logarithms): + ``` + q_0_cumulative(x_t | x_0 = C_0) ... q_n_cumulative(x_t | x_0 = C_0) + . . . + . . . + . . . + q_0_cumulative(x_t | x_0 = C_{k-1}) ... q_n_cumulative(x_t | x_0 = C_{k-1}) + ``` + """ + if cumulative: + a = self.log_cumprod_at[t] + b = self.log_cumprod_bt[t] + c = self.log_cumprod_ct[t] + else: + a = self.log_at[t] + b = self.log_bt[t] + c = self.log_ct[t] + + if not cumulative: + # The values in the onehot vector can also be used as the logprobs for transitioning + # from masked latent pixels. If we are not calculating the cumulative transitions, + # we need to save these vectors to be re-appended to the final matrix so the values + # aren't overwritten. + # + # `P(x_t!=mask|x_{t-1=mask}) = 0` and 0 will be the value of the last row of the onehot vector + # if x_t is not masked + # + # `P(x_t=mask|x_{t-1=mask}) = 1` and 1 will be the value of the last row of the onehot vector + # if x_t is masked + log_onehot_x_t_transitioning_from_masked = log_onehot_x_t[:, -1, :].unsqueeze(1) + + # `index_to_log_onehot` will add onehot vectors for masked pixels, + # so the default one hot matrix has one too many rows. See the doc string + # for an explanation of the dimensionality of the returned matrix. + log_onehot_x_t = log_onehot_x_t[:, :-1, :] + + # this is a cheeky trick to produce the transition probabilities using log one-hot vectors. + # + # Don't worry about what values this sets in the columns that mark transitions + # to masked latent pixels. They are overwrote later with the `mask_class_mask`. + # + # Looking at the below logspace formula in non-logspace, each value will evaluate to either + # `1 * a + b = a + b` where `log_Q_t` has the one hot value in the column + # or + # `0 * a + b = b` where `log_Q_t` has the 0 values in the column. + # + # See equation 7 for more details. 
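+        # In non-log space: exp(log_onehot_x_t) is ~1 on the one-hot entry and ~0 elsewhere
+        # (zeros were clamped to 1e-30 before the log), so (log_onehot_x_t + a).logaddexp(b)
+        # evaluates to log(exp(a) + exp(b)) on the one-hot entry and to b everywhere else.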
+ log_Q_t = (log_onehot_x_t + a).logaddexp(b) + + # The whole column of each masked pixel is `c` + mask_class_mask = x_t == self.mask_class + mask_class_mask = mask_class_mask.unsqueeze(1).expand(-1, self.num_embed - 1, -1) + log_Q_t[mask_class_mask] = c + + if not cumulative: + log_Q_t = torch.cat((log_Q_t, log_onehot_x_t_transitioning_from_masked), dim=1) + + return log_Q_t + + def apply_cumulative_transitions(self, q, t): + bsz = q.shape[0] + a = self.log_cumprod_at[t] + b = self.log_cumprod_bt[t] + c = self.log_cumprod_ct[t] + + num_latent_pixels = q.shape[2] + c = c.expand(bsz, 1, num_latent_pixels) + + q = (q + a).logaddexp(b) + q = torch.cat((q, c), dim=1) + + return q diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/training_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/training_utils.py new file mode 100644 index 000000000..25e02a3d1 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/training_utils.py @@ -0,0 +1,453 @@ +import contextlib +import copy +import random +from typing import Any, Dict, Iterable, List, Optional, Union + +import numpy as np +import torch + +from .models import UNet2DConditionModel +from .utils import ( + convert_state_dict_to_diffusers, + convert_state_dict_to_peft, + deprecate, + is_peft_available, + is_torch_npu_available, + is_torchvision_available, + is_transformers_available, +) + + +if is_transformers_available(): + import transformers + +if is_peft_available(): + from peft import set_peft_model_state_dict + +if is_torchvision_available(): + from torchvision import transforms + +if is_torch_npu_available(): + import torch_npu # noqa: F401 + + +def set_seed(seed: int): + """ + Args: + Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`. + seed (`int`): The seed to set. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + if is_torch_npu_available(): + torch.npu.manual_seed_all(seed) + else: + torch.cuda.manual_seed_all(seed) + # ^^ safe to call this function even if cuda is not available + + +def compute_snr(noise_scheduler, timesteps): + """ + Computes SNR as per + https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 + """ + alphas_cumprod = noise_scheduler.alphas_cumprod + sqrt_alphas_cumprod = alphas_cumprod**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 + + # Expand the tensors. + # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 + sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float() + while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] + alpha = sqrt_alphas_cumprod.expand(timesteps.shape) + + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float() + while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] + sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) + + # Compute SNR. 
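+    # SNR(t) = alphas_cumprod[t] / (1 - alphas_cumprod[t]): the squared ratio of the signal
+    # scale sqrt(alphas_cumprod[t]) to the noise scale sqrt(1 - alphas_cumprod[t]).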
+ snr = (alpha / sigma) ** 2 + return snr + + +def resolve_interpolation_mode(interpolation_type: str): + """ + Maps a string describing an interpolation function to the corresponding torchvision `InterpolationMode` enum. The + full list of supported enums is documented at + https://pytorch.org/vision/0.9/transforms.html#torchvision.transforms.functional.InterpolationMode. + + Args: + interpolation_type (`str`): + A string describing an interpolation method. Currently, `bilinear`, `bicubic`, `box`, `nearest`, + `nearest_exact`, `hamming`, and `lanczos` are supported, corresponding to the supported interpolation modes + in torchvision. + + Returns: + `torchvision.transforms.InterpolationMode`: an `InterpolationMode` enum used by torchvision's `resize` + transform. + """ + if not is_torchvision_available(): + raise ImportError( + "Please make sure to install `torchvision` to be able to use the `resolve_interpolation_mode()` function." + ) + + if interpolation_type == "bilinear": + interpolation_mode = transforms.InterpolationMode.BILINEAR + elif interpolation_type == "bicubic": + interpolation_mode = transforms.InterpolationMode.BICUBIC + elif interpolation_type == "box": + interpolation_mode = transforms.InterpolationMode.BOX + elif interpolation_type == "nearest": + interpolation_mode = transforms.InterpolationMode.NEAREST + elif interpolation_type == "nearest_exact": + interpolation_mode = transforms.InterpolationMode.NEAREST_EXACT + elif interpolation_type == "hamming": + interpolation_mode = transforms.InterpolationMode.HAMMING + elif interpolation_type == "lanczos": + interpolation_mode = transforms.InterpolationMode.LANCZOS + else: + raise ValueError( + f"The given interpolation mode {interpolation_type} is not supported. Currently supported interpolation" + f" modes are `bilinear`, `bicubic`, `box`, `nearest`, `nearest_exact`, `hamming`, and `lanczos`." + ) + + return interpolation_mode + + +def unet_lora_state_dict(unet: UNet2DConditionModel) -> Dict[str, torch.Tensor]: + r""" + Returns: + A state dict containing just the LoRA parameters. + """ + lora_state_dict = {} + + for name, module in unet.named_modules(): + if hasattr(module, "set_lora_layer"): + lora_layer = getattr(module, "lora_layer") + if lora_layer is not None: + current_lora_layer_sd = lora_layer.state_dict() + for lora_layer_matrix_name, lora_param in current_lora_layer_sd.items(): + # The matrix name can either be "down" or "up". + lora_state_dict[f"{name}.lora.{lora_layer_matrix_name}"] = lora_param + + return lora_state_dict + + +def cast_training_params(model: Union[torch.nn.Module, List[torch.nn.Module]], dtype=torch.float32): + if not isinstance(model, list): + model = [model] + for m in model: + for param in m.parameters(): + # only upcast trainable parameters into fp32 + if param.requires_grad: + param.data = param.to(dtype) + + +def _set_state_dict_into_text_encoder( + lora_state_dict: Dict[str, torch.Tensor], prefix: str, text_encoder: torch.nn.Module +): + """ + Sets the `lora_state_dict` into `text_encoder` coming from `transformers`. + + Args: + lora_state_dict: The state dictionary to be set. + prefix: String identifier to retrieve the portion of the state dict that belongs to `text_encoder`. + text_encoder: Where the `lora_state_dict` is to be set. 
+ """ + + text_encoder_state_dict = { + f'{k.replace(prefix, "")}': v for k, v in lora_state_dict.items() if k.startswith(prefix) + } + text_encoder_state_dict = convert_state_dict_to_peft(convert_state_dict_to_diffusers(text_encoder_state_dict)) + set_peft_model_state_dict(text_encoder, text_encoder_state_dict, adapter_name="default") + + +# Adapted from torch-ema https://github.com/fadel/pytorch_ema/blob/master/torch_ema/ema.py#L14 +class EMAModel: + """ + Exponential Moving Average of models weights + """ + + def __init__( + self, + parameters: Iterable[torch.nn.Parameter], + decay: float = 0.9999, + min_decay: float = 0.0, + update_after_step: int = 0, + use_ema_warmup: bool = False, + inv_gamma: Union[float, int] = 1.0, + power: Union[float, int] = 2 / 3, + model_cls: Optional[Any] = None, + model_config: Dict[str, Any] = None, + **kwargs, + ): + """ + Args: + parameters (Iterable[torch.nn.Parameter]): The parameters to track. + decay (float): The decay factor for the exponential moving average. + min_decay (float): The minimum decay factor for the exponential moving average. + update_after_step (int): The number of steps to wait before starting to update the EMA weights. + use_ema_warmup (bool): Whether to use EMA warmup. + inv_gamma (float): + Inverse multiplicative factor of EMA warmup. Default: 1. Only used if `use_ema_warmup` is True. + power (float): Exponential factor of EMA warmup. Default: 2/3. Only used if `use_ema_warmup` is True. + device (Optional[Union[str, torch.device]]): The device to store the EMA weights on. If None, the EMA + weights will be stored on CPU. + + @crowsonkb's notes on EMA Warmup: + If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan + to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps), + gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 + at 215.4k steps). + """ + + if isinstance(parameters, torch.nn.Module): + deprecation_message = ( + "Passing a `torch.nn.Module` to `ExponentialMovingAverage` is deprecated. " + "Please pass the parameters of the module instead." + ) + deprecate( + "passing a `torch.nn.Module` to `ExponentialMovingAverage`", + "1.0.0", + deprecation_message, + standard_warn=False, + ) + parameters = parameters.parameters() + + # set use_ema_warmup to True if a torch.nn.Module is passed for backwards compatibility + use_ema_warmup = True + + if kwargs.get("max_value", None) is not None: + deprecation_message = "The `max_value` argument is deprecated. Please use `decay` instead." + deprecate("max_value", "1.0.0", deprecation_message, standard_warn=False) + decay = kwargs["max_value"] + + if kwargs.get("min_value", None) is not None: + deprecation_message = "The `min_value` argument is deprecated. Please use `min_decay` instead." + deprecate("min_value", "1.0.0", deprecation_message, standard_warn=False) + min_decay = kwargs["min_value"] + + parameters = list(parameters) + self.shadow_params = [p.clone().detach() for p in parameters] + + if kwargs.get("device", None) is not None: + deprecation_message = "The `device` argument is deprecated. Please use `to` instead." 
+ deprecate("device", "1.0.0", deprecation_message, standard_warn=False) + self.to(device=kwargs["device"]) + + self.temp_stored_params = None + + self.decay = decay + self.min_decay = min_decay + self.update_after_step = update_after_step + self.use_ema_warmup = use_ema_warmup + self.inv_gamma = inv_gamma + self.power = power + self.optimization_step = 0 + self.cur_decay_value = None # set in `step()` + + self.model_cls = model_cls + self.model_config = model_config + + @classmethod + def from_pretrained(cls, path, model_cls) -> "EMAModel": + _, ema_kwargs = model_cls.load_config(path, return_unused_kwargs=True) + model = model_cls.from_pretrained(path) + + ema_model = cls(model.parameters(), model_cls=model_cls, model_config=model.config) + + ema_model.load_state_dict(ema_kwargs) + return ema_model + + def save_pretrained(self, path): + if self.model_cls is None: + raise ValueError("`save_pretrained` can only be used if `model_cls` was defined at __init__.") + + if self.model_config is None: + raise ValueError("`save_pretrained` can only be used if `model_config` was defined at __init__.") + + model = self.model_cls.from_config(self.model_config) + state_dict = self.state_dict() + state_dict.pop("shadow_params", None) + + model.register_to_config(**state_dict) + self.copy_to(model.parameters()) + model.save_pretrained(path) + + def get_decay(self, optimization_step: int) -> float: + """ + Compute the decay factor for the exponential moving average. + """ + step = max(0, optimization_step - self.update_after_step - 1) + + if step <= 0: + return 0.0 + + if self.use_ema_warmup: + cur_decay_value = 1 - (1 + step / self.inv_gamma) ** -self.power + else: + cur_decay_value = (1 + step) / (10 + step) + + cur_decay_value = min(cur_decay_value, self.decay) + # make sure decay is not smaller than min_decay + cur_decay_value = max(cur_decay_value, self.min_decay) + return cur_decay_value + + @torch.no_grad() + def step(self, parameters: Iterable[torch.nn.Parameter]): + if isinstance(parameters, torch.nn.Module): + deprecation_message = ( + "Passing a `torch.nn.Module` to `ExponentialMovingAverage.step` is deprecated. " + "Please pass the parameters of the module instead." + ) + deprecate( + "passing a `torch.nn.Module` to `ExponentialMovingAverage.step`", + "1.0.0", + deprecation_message, + standard_warn=False, + ) + parameters = parameters.parameters() + + parameters = list(parameters) + + self.optimization_step += 1 + + # Compute the decay factor for the exponential moving average. + decay = self.get_decay(self.optimization_step) + self.cur_decay_value = decay + one_minus_decay = 1 - decay + + context_manager = contextlib.nullcontext + if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled(): + import deepspeed + + for s_param, param in zip(self.shadow_params, parameters): + if is_transformers_available() and transformers.deepspeed.is_deepspeed_zero3_enabled(): + context_manager = deepspeed.zero.GatheredParameters(param, modifier_rank=None) + + with context_manager(): + if param.requires_grad: + s_param.sub_(one_minus_decay * (s_param - param)) + else: + s_param.copy_(param) + + def copy_to(self, parameters: Iterable[torch.nn.Parameter]) -> None: + """ + Copy current averaged parameters into given collection of parameters. + + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored moving averages. If `None`, the parameters with which this + `ExponentialMovingAverage` was initialized will be used. 
+ """ + parameters = list(parameters) + for s_param, param in zip(self.shadow_params, parameters): + param.data.copy_(s_param.to(param.device).data) + + def to(self, device=None, dtype=None) -> None: + r"""Move internal buffers of the ExponentialMovingAverage to `device`. + + Args: + device: like `device` argument to `torch.Tensor.to` + """ + # .to() on the tensors handles None correctly + self.shadow_params = [ + p.to(device=device, dtype=dtype) if p.is_floating_point() else p.to(device=device) + for p in self.shadow_params + ] + + def state_dict(self) -> dict: + r""" + Returns the state of the ExponentialMovingAverage as a dict. This method is used by accelerate during + checkpointing to save the ema state dict. + """ + # Following PyTorch conventions, references to tensors are returned: + # "returns a reference to the state and not its copy!" - + # https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict + return { + "decay": self.decay, + "min_decay": self.min_decay, + "optimization_step": self.optimization_step, + "update_after_step": self.update_after_step, + "use_ema_warmup": self.use_ema_warmup, + "inv_gamma": self.inv_gamma, + "power": self.power, + "shadow_params": self.shadow_params, + } + + def store(self, parameters: Iterable[torch.nn.Parameter]) -> None: + r""" + Args: + Save the current parameters for restoring later. + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.temp_stored_params = [param.detach().cpu().clone() for param in parameters] + + def restore(self, parameters: Iterable[torch.nn.Parameter]) -> None: + r""" + Args: + Restore the parameters stored with the `store` method. Useful to validate the model with EMA parameters without: + affecting the original optimization process. Store the parameters before the `copy_to()` method. After + validation (or model saving), use this to restore the former parameters. + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. If `None`, the parameters with which this + `ExponentialMovingAverage` was initialized will be used. + """ + if self.temp_stored_params is None: + raise RuntimeError("This ExponentialMovingAverage has no `store()`ed weights " "to `restore()`") + for c_param, param in zip(self.temp_stored_params, parameters): + param.data.copy_(c_param.data) + + # Better memory-wise. + self.temp_stored_params = None + + def load_state_dict(self, state_dict: dict) -> None: + r""" + Args: + Loads the ExponentialMovingAverage state. This method is used by accelerate during checkpointing to save the + ema state dict. + state_dict (dict): EMA state. Should be an object returned + from a call to :meth:`state_dict`. 
+ """ + # deepcopy, to be consistent with module API + state_dict = copy.deepcopy(state_dict) + + self.decay = state_dict.get("decay", self.decay) + if self.decay < 0.0 or self.decay > 1.0: + raise ValueError("Decay must be between 0 and 1") + + self.min_decay = state_dict.get("min_decay", self.min_decay) + if not isinstance(self.min_decay, float): + raise ValueError("Invalid min_decay") + + self.optimization_step = state_dict.get("optimization_step", self.optimization_step) + if not isinstance(self.optimization_step, int): + raise ValueError("Invalid optimization_step") + + self.update_after_step = state_dict.get("update_after_step", self.update_after_step) + if not isinstance(self.update_after_step, int): + raise ValueError("Invalid update_after_step") + + self.use_ema_warmup = state_dict.get("use_ema_warmup", self.use_ema_warmup) + if not isinstance(self.use_ema_warmup, bool): + raise ValueError("Invalid use_ema_warmup") + + self.inv_gamma = state_dict.get("inv_gamma", self.inv_gamma) + if not isinstance(self.inv_gamma, (float, int)): + raise ValueError("Invalid inv_gamma") + + self.power = state_dict.get("power", self.power) + if not isinstance(self.power, (float, int)): + raise ValueError("Invalid power") + + shadow_params = state_dict.get("shadow_params", None) + if shadow_params is not None: + self.shadow_params = shadow_params + if not isinstance(self.shadow_params, list): + raise ValueError("shadow_params must be a list") + if not all(isinstance(p, torch.Tensor) for p in self.shadow_params): + raise ValueError("shadow_params must all be Tensors") diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/__init__.py new file mode 100644 index 000000000..4e2f07f2b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/__init__.py @@ -0,0 +1,124 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +from packaging import version + +from .. 
import __version__ +from .constants import ( + CONFIG_NAME, + DEPRECATED_REVISION_ARGS, + DIFFUSERS_DYNAMIC_MODULE_NAME, + FLAX_WEIGHTS_NAME, + HF_MODULES_CACHE, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + MIN_PEFT_VERSION, + ONNX_EXTERNAL_WEIGHTS_NAME, + ONNX_WEIGHTS_NAME, + SAFETENSORS_FILE_EXTENSION, + SAFETENSORS_WEIGHTS_NAME, + USE_PEFT_BACKEND, + WEIGHTS_NAME, +) +from .deprecation_utils import deprecate +from .doc_utils import replace_example_docstring +from .dynamic_modules_utils import get_class_from_dynamic_module +from .export_utils import export_to_gif, export_to_obj, export_to_ply, export_to_video +from .hub_utils import ( + PushToHubMixin, + _add_variant, + _get_model_file, + extract_commit_hash, + http_user_agent, +) +from .import_utils import ( + BACKENDS_MAPPING, + DIFFUSERS_SLOW_IMPORT, + ENV_VARS_TRUE_AND_AUTO_VALUES, + ENV_VARS_TRUE_VALUES, + USE_JAX, + USE_TF, + USE_TORCH, + DummyObject, + OptionalDependencyNotAvailable, + _LazyModule, + get_objects_from_module, + is_accelerate_available, + is_accelerate_version, + is_bs4_available, + is_flax_available, + is_ftfy_available, + is_inflect_available, + is_invisible_watermark_available, + is_k_diffusion_available, + is_k_diffusion_version, + is_librosa_available, + is_note_seq_available, + is_onnx_available, + is_peft_available, + is_scipy_available, + is_tensorboard_available, + is_torch_available, + is_torch_npu_available, + is_torch_version, + is_torch_xla_available, + is_torchsde_available, + is_torchvision_available, + is_transformers_available, + is_transformers_version, + is_unidecode_available, + is_wandb_available, + is_xformers_available, + requires_backends, +) +from .loading_utils import load_image +from .logging import get_logger +from .outputs import BaseOutput +from .peft_utils import ( + check_peft_version, + delete_adapter_layers, + get_adapter_name, + get_peft_kwargs, + recurse_remove_peft_layers, + scale_lora_layers, + set_adapter_layers, + set_weights_and_activate_adapters, + unscale_lora_layers, +) +from .pil_utils import PIL_INTERPOLATION, make_image_grid, numpy_to_pil, pt_to_pil +from .state_dict_utils import ( + convert_all_state_dict_to_peft, + convert_state_dict_to_diffusers, + convert_state_dict_to_kohya, + convert_state_dict_to_peft, + convert_unet_state_dict_to_peft, +) + + +logger = get_logger(__name__) + + +def check_min_version(min_version): + if version.parse(__version__) < version.parse(min_version): + if "dev" in min_version: + error_message = ( + "This example requires a source install from HuggingFace diffusers (see " + "`https://huggingface.co/docs/diffusers/installation#install-from-source`)," + ) + else: + error_message = f"This example requires a minimum version of {min_version}," + error_message += f" but the version found is {__version__}.\n" + raise ImportError(error_message) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/accelerate_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/accelerate_utils.py new file mode 100644 index 000000000..99a8b3a47 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/accelerate_utils.py @@ -0,0 +1,48 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Accelerate utilities: Utilities related to accelerate +""" + +from packaging import version + +from .import_utils import is_accelerate_available + + +if is_accelerate_available(): + import accelerate + + +def apply_forward_hook(method): + """ + Decorator that applies a registered CpuOffload hook to an arbitrary function rather than `forward`. This is useful + for cases where a PyTorch module provides functions other than `forward` that should trigger a move to the + appropriate acceleration device. This is the case for `encode` and `decode` in [`AutoencoderKL`]. + + This decorator looks inside the internal `_hf_hook` property to find a registered offload hook. + + :param method: The method to decorate. This method should be a method of a PyTorch module. + """ + if not is_accelerate_available(): + return method + accelerate_version = version.parse(accelerate.__version__).base_version + if version.parse(accelerate_version) < version.parse("0.17.0"): + return method + + def wrapper(self, *args, **kwargs): + if hasattr(self, "_hf_hook") and hasattr(self._hf_hook, "pre_forward"): + self._hf_hook.pre_forward(self) + return method(self, *args, **kwargs) + + return wrapper diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/constants.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/constants.py new file mode 100644 index 000000000..bc4268a32 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/constants.py @@ -0,0 +1,55 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
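The `apply_forward_hook` decorator above targets module entry points other than `forward`, such as `encode`/`decode` on a VAE. A minimal sketch of how it is attached, assuming the package layout introduced by this patch (the class and method below are illustrative):

    import torch
    from diffusers.utils.accelerate_utils import apply_forward_hook

    class TinyAutoencoder(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(8, 8)

        @apply_forward_hook  # runs any registered offload hook's pre_forward before encode
        def encode(self, x):
            return self.proj(x)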
+import importlib +import os + +from huggingface_hub.constants import HF_HOME +from packaging import version + +from ..dependency_versions_check import dep_version_check +from .import_utils import ENV_VARS_TRUE_VALUES, is_peft_available, is_transformers_available + + +MIN_PEFT_VERSION = "0.6.0" +MIN_TRANSFORMERS_VERSION = "4.34.0" +_CHECK_PEFT = os.environ.get("_CHECK_PEFT", "1") in ENV_VARS_TRUE_VALUES + + +CONFIG_NAME = "config.json" +WEIGHTS_NAME = "diffusion_pytorch_model.bin" +FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack" +ONNX_WEIGHTS_NAME = "model.onnx" +SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors" +SAFETENSORS_FILE_EXTENSION = "safetensors" +ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb" +HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co") +DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" +HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(HF_HOME, "modules")) +DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"] + +# Below should be `True` if the current versions of `peft` and `transformers` are compatible with the +# PEFT backend. The PEFT backend is used automatically when compatible versions of both libraries are +# available. +# For PEFT the version has to be greater than or equal to 0.6.0, and for transformers it has to be greater than or equal to 4.34.0. +_required_peft_version = is_peft_available() and version.parse( + version.parse(importlib.metadata.version("peft")).base_version +) >= version.parse(MIN_PEFT_VERSION) +_required_transformers_version = is_transformers_available() and version.parse( + version.parse(importlib.metadata.version("transformers")).base_version +) >= version.parse(MIN_TRANSFORMERS_VERSION) + +USE_PEFT_BACKEND = _required_peft_version and _required_transformers_version + +if USE_PEFT_BACKEND and _CHECK_PEFT: + dep_version_check("peft") diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/deprecation_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/deprecation_utils.py new file mode 100644 index 000000000..f482deddd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/deprecation_utils.py @@ -0,0 +1,49 @@ +import inspect +import warnings +from typing import Any, Dict, Optional, Union + +from packaging import version + + +def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn=True, stacklevel=2): + from .. import __version__ + + deprecated_kwargs = take_from + values = () + if not isinstance(args[0], tuple): + args = (args,) + + for attribute, version_name, message in args: + if version.parse(version.parse(__version__).base_version) >= version.parse(version_name): + raise ValueError( + f"The deprecation tuple {(attribute, version_name, message)} should be removed since diffusers'" + f" version {__version__} is >= {version_name}" + ) + + warning = None + if isinstance(deprecated_kwargs, dict) and attribute in deprecated_kwargs: + values += (deprecated_kwargs.pop(attribute),) + warning = f"The `{attribute}` argument is deprecated and will be removed in version {version_name}." + elif hasattr(deprecated_kwargs, attribute): + values += (getattr(deprecated_kwargs, attribute),) + warning = f"The `{attribute}` attribute is deprecated and will be removed in version {version_name}." + elif deprecated_kwargs is None: + warning = f"`{attribute}` is deprecated and will be removed in version {version_name}."
+ + if warning is not None: + warning = warning + " " if standard_warn else "" + warnings.warn(warning + message, FutureWarning, stacklevel=stacklevel) + + if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0: + call_frame = inspect.getouterframes(inspect.currentframe())[1] + filename = call_frame.filename + line_number = call_frame.lineno + function = call_frame.function + key, value = next(iter(deprecated_kwargs.items())) + raise TypeError(f"{function} in {filename} line {line_number-1} got an unexpected keyword argument `{key}`") + + if len(values) == 0: + return + elif len(values) == 1: + return values[0] + return values diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/doc_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/doc_utils.py new file mode 100644 index 000000000..03b6b7a5a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/doc_utils.py @@ -0,0 +1,38 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Doc utilities: Utilities related to documentation +""" +import re + + +def replace_example_docstring(example_docstring): + def docstring_decorator(fn): + func_doc = fn.__doc__ + lines = func_doc.split("\n") + i = 0 + while i < len(lines) and re.search(r"^\s*Examples?:\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + lines[i] = example_docstring + func_doc = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'Examples:' in its docstring as placeholder, " + f"current docstring is:\n{func_doc}" + ) + fn.__doc__ = func_doc + return fn + + return docstring_decorator diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py new file mode 100644 index 000000000..5e65e5349 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py @@ -0,0 +1,77 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
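A hedged sketch of how the `deprecate` helper above is typically called from library code; the `resize` function, its arguments, and the `0.99.0` removal version are invented for illustration:

from diffusers.utils import deprecate


def resize(image, size=None, **kwargs):
    # If a caller still passes the old `new_size` keyword, pop it out of kwargs,
    # emit a FutureWarning, and fall back to it; any other leftover keyword in
    # kwargs makes deprecate() raise a TypeError.
    new_size = deprecate(
        "new_size", "0.99.0", "Please use `size` instead of `new_size`.", take_from=kwargs
    )
    if new_size is not None:
        size = new_size
    return image, size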
+from ..utils import DummyObject, requires_backends + + +class FlaxStableDiffusionControlNetPipeline(metaclass=DummyObject): + _backends = ["flax", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + +class FlaxStableDiffusionImg2ImgPipeline(metaclass=DummyObject): + _backends = ["flax", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + +class FlaxStableDiffusionInpaintPipeline(metaclass=DummyObject): + _backends = ["flax", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + +class FlaxStableDiffusionPipeline(metaclass=DummyObject): + _backends = ["flax", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + +class FlaxStableDiffusionXLPipeline(metaclass=DummyObject): + _backends = ["flax", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax", "transformers"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_objects.py new file mode 100644 index 000000000..5fa8dbc81 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_objects.py @@ -0,0 +1,212 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
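All of the autogenerated `dummy_*_objects.py` files that follow share one pattern: a placeholder class whose constructor, `from_config`, and `from_pretrained` call `requires_backends`, so importing the name always succeeds while actually using it is expected to raise an ImportError that names the missing optional backend. A small behavioural sketch, assuming flax is not installed:

from diffusers.utils import is_flax_available
from diffusers.utils.dummy_flax_objects import FlaxUNet2DConditionModel

if not is_flax_available():
    try:
        FlaxUNet2DConditionModel()  # requires_backends() raises here
    except ImportError as err:
        # The message tells the user which optional backend (flax) to install.
        print(f"Optional backend missing: {err}")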
+from ..utils import DummyObject, requires_backends + + +class FlaxControlNetModel(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxModelMixin(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxUNet2DConditionModel(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxAutoencoderKL(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxDiffusionPipeline(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxDDIMScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxDDPMScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxDPMSolverMultistepScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxEulerDiscreteScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxKarrasVeScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class 
FlaxLMSDiscreteScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxPNDMScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxSchedulerMixin(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + +class FlaxScoreSdeVeScheduler(metaclass=DummyObject): + _backends = ["flax"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["flax"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["flax"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_note_seq_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_note_seq_objects.py new file mode 100644 index 000000000..c02d0b015 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_note_seq_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class MidiProcessor(metaclass=DummyObject): + _backends = ["note_seq"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["note_seq"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["note_seq"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["note_seq"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_onnx_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_onnx_objects.py new file mode 100644 index 000000000..bde5f6ad0 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_onnx_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..utils import DummyObject, requires_backends + + +class OnnxRuntimeModel(metaclass=DummyObject): + _backends = ["onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["onnx"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_pt_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_pt_objects.py new file mode 100644 index 000000000..14947848a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_pt_objects.py @@ -0,0 +1,1170 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class AsymmetricAutoencoderKL(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AutoencoderKL(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AutoencoderKLTemporalDecoder(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AutoencoderTiny(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ConsistencyDecoderVAE(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ControlNetModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class I2VGenXLUNet(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class Kandinsky3UNet(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def 
from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ModelMixin(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class MotionAdapter(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class MultiAdapter(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class PriorTransformer(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class T2IAdapter(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class T5FilmDecoder(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class Transformer2DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UNet1DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UNet2DConditionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UNet2DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, 
*args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UNet3DConditionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UNetMotionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UNetSpatioTemporalConditionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UVit2DModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class VQModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +def get_constant_schedule(*args, **kwargs): + requires_backends(get_constant_schedule, ["torch"]) + + +def get_constant_schedule_with_warmup(*args, **kwargs): + requires_backends(get_constant_schedule_with_warmup, ["torch"]) + + +def get_cosine_schedule_with_warmup(*args, **kwargs): + requires_backends(get_cosine_schedule_with_warmup, ["torch"]) + + +def get_cosine_with_hard_restarts_schedule_with_warmup(*args, **kwargs): + requires_backends(get_cosine_with_hard_restarts_schedule_with_warmup, ["torch"]) + + +def get_linear_schedule_with_warmup(*args, **kwargs): + requires_backends(get_linear_schedule_with_warmup, ["torch"]) + + +def get_polynomial_decay_schedule_with_warmup(*args, **kwargs): + requires_backends(get_polynomial_decay_schedule_with_warmup, ["torch"]) + + +def get_scheduler(*args, **kwargs): + requires_backends(get_scheduler, ["torch"]) + + +class AudioPipelineOutput(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AutoPipelineForImage2Image(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AutoPipelineForInpainting(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def 
from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AutoPipelineForText2Image(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class BlipDiffusionControlNetPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class BlipDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class CLIPImageProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ConsistencyModelPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DanceDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDIMPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDPMPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DiffusionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DiTPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, 
["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ImagePipelineOutput(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class KarrasVePipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class LDMPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class LDMSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class PNDMPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class RePaintPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ScoreSdeVePipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class StableDiffusionMixin(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class AmusedScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class CMStochasticIterativeScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): 
+ requires_backends(cls, ["torch"]) + + +class DDIMInverseScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDIMParallelScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDIMScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDPMParallelScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDPMScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DDPMWuerstchenScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DEISMultistepScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DPMSolverMultistepInverseScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DPMSolverMultistepScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class DPMSolverSinglestepScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, 
["torch"]) + + +class EDMDPMSolverMultistepScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class EDMEulerScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class EulerAncestralDiscreteScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class EulerDiscreteScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class HeunDiscreteScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class IPNDMScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class KarrasVeScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class KDPM2AncestralDiscreteScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class KDPM2DiscreteScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class LCMScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class 
PNDMScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class RePaintScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class SASolverScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class SchedulerMixin(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class ScoreSdeVeScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class TCDScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UnCLIPScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class UniPCMultistepScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class VQDiffusionScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class EMAModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py new file mode 100644 index 000000000..2088bc4a7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py @@ -0,0 +1,32 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class AudioDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "librosa"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "librosa"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) + + +class Mel(metaclass=DummyObject): + _backends = ["torch", "librosa"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "librosa"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "librosa"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py new file mode 100644 index 000000000..a1ff25863 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class LMSDiscreteScheduler(metaclass=DummyObject): + _backends = ["torch", "scipy"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "scipy"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "scipy"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "scipy"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py new file mode 100644 index 000000000..a81bbb316 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..utils import DummyObject, requires_backends + + +class DPMSolverSDEScheduler(metaclass=DummyObject): + _backends = ["torch", "torchsde"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "torchsde"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "torchsde"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "torchsde"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py new file mode 100644 index 000000000..2ab00c54c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py @@ -0,0 +1,32 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class StableDiffusionKDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "k_diffusion"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "k_diffusion"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "k_diffusion"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "k_diffusion"]) + + +class StableDiffusionXLKDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "k_diffusion"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "k_diffusion"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "k_diffusion"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "k_diffusion"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py new file mode 100644 index 000000000..b7afad822 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py @@ -0,0 +1,92 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..utils import DummyObject, requires_backends + + +class OnnxStableDiffusionImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + +class OnnxStableDiffusionInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + +class OnnxStableDiffusionInpaintPipelineLegacy(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + +class OnnxStableDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + +class OnnxStableDiffusionUpscalePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + +class StableDiffusionOnnxPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "onnx"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "onnx"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "onnx"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py new file mode 100644 index 000000000..f64c15702 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -0,0 +1,1607 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
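For orientation, these dummy modules are consumed by the package's optional-dependency guards. The sketch below shows the rough shape of that pattern as used in diffusers `__init__` modules, with the real pipeline imports elided; treat the exact layout as an assumption rather than something introduced by this patch:

from diffusers.utils import (
    OptionalDependencyNotAvailable,
    is_torch_available,
    is_transformers_available,
)

try:
    if not (is_torch_available() and is_transformers_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    # Fall back to the placeholder classes defined in the file added below.
    from diffusers.utils.dummy_torch_and_transformers_objects import *  # noqa: F401,F403
else:
    ...  # import the real torch+transformers pipeline classes here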
+from ..utils import DummyObject, requires_backends + + +class AltDiffusionImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AltDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AmusedImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AmusedInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AmusedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AnimateDiffPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AudioLDM2Pipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AudioLDM2ProjectionModel(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", 
"transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AudioLDM2UNet2DConditionModel(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class AudioLDMPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class CLIPImageProjection(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class CycleDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class I2VGenXLPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFImg2ImgSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFInpaintingPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", 
"transformers"]) + + +class IFInpaintingSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class IFSuperResolutionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class ImageTextPipelineOutput(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class Kandinsky3Img2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class Kandinsky3Pipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyImg2ImgCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", 
"transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyInpaintCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyPriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22CombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22ControlnetImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22ControlnetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Img2ImgCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, 
**kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Img2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22InpaintCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22InpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Pipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22PriorEmb2EmbPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22PriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class LatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class LatentConsistencyModelPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class LDMTextToImagePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def 
__init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class LEditsPPPipelineStableDiffusion(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class LEditsPPPipelineStableDiffusionXL(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class MusicLDMPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class PaintByExamplePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class PIAPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class PixArtAlphaPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class SemanticStableDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class ShapEImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def 
from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class ShapEPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableCascadeCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableCascadeDecoderPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableCascadePriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionAdapterPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionAttendAndExcitePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionControlNetImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionControlNetInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class 
StableDiffusionControlNetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionDepth2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionDiffEditPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionGLIGENPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionGLIGENTextImagePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionImageVariationPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionInpaintPipelineLegacy(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): 
+ requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionInstructPix2PixPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionLatentUpscalePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionLDM3DPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionModelEditingPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionPanoramaPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionParadigmsPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionPipelineSafe(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", 
"transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionPix2PixZeroPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionSAGPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionUpscalePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLAdapterPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLControlNetImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLControlNetInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLControlNetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", 
"transformers"]) + + +class StableDiffusionXLInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLInstructPix2PixPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableDiffusionXLPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableUnCLIPImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableUnCLIPPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class StableVideoDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class TextToVideoSDPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class TextToVideoZeroPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class TextToVideoZeroSDXLPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, 
["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UnCLIPImageVariationPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UnCLIPPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UniDiffuserModel(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UniDiffuserPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UniDiffuserTextDecoder(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class VersatileDiffusionImageVariationPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class VersatileDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + 
requires_backends(cls, ["torch", "transformers"]) + + +class VersatileDiffusionTextToImagePipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class VideoToVideoSDPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class VQDiffusionPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class WuerstchenCombinedPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class WuerstchenDecoderPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class WuerstchenPriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py new file mode 100644 index 000000000..fbde04e33 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py @@ -0,0 +1,17 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. 
+from ..utils import DummyObject, requires_backends + + +class SpectrogramDiffusionPipeline(metaclass=DummyObject): + _backends = ["transformers", "torch", "note_seq"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["transformers", "torch", "note_seq"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["transformers", "torch", "note_seq"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["transformers", "torch", "note_seq"]) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dynamic_modules_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dynamic_modules_utils.py new file mode 100644 index 000000000..a4c704a91 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dynamic_modules_utils.py @@ -0,0 +1,452 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities to dynamically load objects from the Hub.""" + +import importlib +import inspect +import json +import os +import re +import shutil +import sys +from pathlib import Path +from typing import Dict, Optional, Union +from urllib import request + +from huggingface_hub import cached_download, hf_hub_download, model_info +from huggingface_hub.utils import validate_hf_hub_args +from packaging import version + +from .. import __version__ +from . import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging + + +COMMUNITY_PIPELINES_URL = ( + "https://raw.githubusercontent.com/huggingface/diffusers/{revision}/examples/community/{pipeline}.py" +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def get_diffusers_versions(): + url = "https://pypi.org/pypi/diffusers/json" + releases = json.loads(request.urlopen(url).read())["releases"].keys() + return sorted(releases, key=lambda x: version.Version(x)) + + +def init_hf_modules(): + """ + Creates the cache directory for modules with an init, and adds it to the Python path. + """ + # This function has already been executed if HF_MODULES_CACHE already is in the Python path. + if HF_MODULES_CACHE in sys.path: + return + + sys.path.append(HF_MODULES_CACHE) + os.makedirs(HF_MODULES_CACHE, exist_ok=True) + init_path = Path(HF_MODULES_CACHE) / "__init__.py" + if not init_path.exists(): + init_path.touch() + + +def create_dynamic_module(name: Union[str, os.PathLike]): + """ + Creates a dynamic module in the cache directory for modules. + """ + init_hf_modules() + dynamic_module_path = Path(HF_MODULES_CACHE) / name + # If the parent module does not exist yet, recursively create it. 
+ if not dynamic_module_path.parent.exists(): + create_dynamic_module(dynamic_module_path.parent) + os.makedirs(dynamic_module_path, exist_ok=True) + init_path = dynamic_module_path / "__init__.py" + if not init_path.exists(): + init_path.touch() + + +def get_relative_imports(module_file): + """ + Get the list of modules that are relatively imported in a module file. + + Args: + module_file (`str` or `os.PathLike`): The module file to inspect. + """ + with open(module_file, "r", encoding="utf-8") as f: + content = f.read() + + # Imports of the form `import .xxx` + relative_imports = re.findall(r"^\s*import\s+\.(\S+)\s*$", content, flags=re.MULTILINE) + # Imports of the form `from .xxx import yyy` + relative_imports += re.findall(r"^\s*from\s+\.(\S+)\s+import", content, flags=re.MULTILINE) + # Unique-ify + return list(set(relative_imports)) + + +def get_relative_import_files(module_file): + """ + Get the list of all files that are needed for a given module. Note that this function recurses through the relative + imports (if a imports b and b imports c, it will return module files for b and c). + + Args: + module_file (`str` or `os.PathLike`): The module file to inspect. + """ + no_change = False + files_to_check = [module_file] + all_relative_imports = [] + + # Let's recurse through all relative imports + while not no_change: + new_imports = [] + for f in files_to_check: + new_imports.extend(get_relative_imports(f)) + + module_path = Path(module_file).parent + new_import_files = [str(module_path / m) for m in new_imports] + new_import_files = [f for f in new_import_files if f not in all_relative_imports] + files_to_check = [f"{f}.py" for f in new_import_files] + + no_change = len(new_import_files) == 0 + all_relative_imports.extend(files_to_check) + + return all_relative_imports + + +def check_imports(filename): + """ + Check if the current Python environment contains all the libraries that are imported in a file. + """ + with open(filename, "r", encoding="utf-8") as f: + content = f.read() + + # Imports of the form `import xxx` + imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) + # Imports of the form `from xxx import yyy` + imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + # Only keep the top-level module + imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] + + # Unique-ify and test we got them all + imports = list(set(imports)) + missing_packages = [] + for imp in imports: + try: + importlib.import_module(imp) + except ImportError: + missing_packages.append(imp) + + if len(missing_packages) > 0: + raise ImportError( + "This modeling file requires the following packages that were not found in your environment: " + f"{', '.join(missing_packages)}. Run `pip install {' '.join(missing_packages)}`" + ) + + return get_relative_imports(filename) + + +def get_class_in_module(class_name, module_path): + """ + Import a module on the cache directory for modules and extract a class from it. + """ + module_path = module_path.replace(os.path.sep, ".") + module = importlib.import_module(module_path) + + if class_name is None: + return find_pipeline_class(module) + return getattr(module, class_name) + + +def find_pipeline_class(loaded_module): + """ + Retrieve pipeline class that inherits from `DiffusionPipeline`. Note that there has to be exactly one class + inheriting from `DiffusionPipeline`. 
+ """ + from ..pipelines import DiffusionPipeline + + cls_members = dict(inspect.getmembers(loaded_module, inspect.isclass)) + + pipeline_class = None + for cls_name, cls in cls_members.items(): + if ( + cls_name != DiffusionPipeline.__name__ + and issubclass(cls, DiffusionPipeline) + and cls.__module__.split(".")[0] != "diffusers" + ): + if pipeline_class is not None: + raise ValueError( + f"Multiple classes that inherit from {DiffusionPipeline.__name__} have been found:" + f" {pipeline_class.__name__}, and {cls_name}. Please make sure to define only one in" + f" {loaded_module}." + ) + pipeline_class = cls + + return pipeline_class + + +@validate_hf_hub_args +def get_cached_module_file( + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, +): + """ + Prepares Downloads a module from a local folder or a distant repo and returns its path inside the cached + Transformers module. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + module_file (`str`): + The name of the module file containing the class to look for. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private + or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). + + + + Returns: + `str`: The path to the module inside the cache. 
+ """ + # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + + module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) + + if os.path.isfile(module_file_or_url): + resolved_module_file = module_file_or_url + submodule = "local" + elif pretrained_model_name_or_path.count("/") == 0: + available_versions = get_diffusers_versions() + # cut ".dev0" + latest_version = "v" + ".".join(__version__.split(".")[:3]) + + # retrieve github version that matches + if revision is None: + revision = latest_version if latest_version[1:] in available_versions else "main" + logger.info(f"Defaulting to latest_version: {revision}.") + elif revision in available_versions: + revision = f"v{revision}" + elif revision == "main": + revision = revision + else: + raise ValueError( + f"`custom_revision`: {revision} does not exist. Please make sure to choose one of" + f" {', '.join(available_versions + ['main'])}." + ) + + # community pipeline on GitHub + github_url = COMMUNITY_PIPELINES_URL.format(revision=revision, pipeline=pretrained_model_name_or_path) + try: + resolved_module_file = cached_download( + github_url, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=False, + ) + submodule = "git" + module_file = pretrained_model_name_or_path + ".py" + except EnvironmentError: + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") + raise + else: + try: + # Load from URL or cache if already cached + resolved_module_file = hf_hub_download( + pretrained_model_name_or_path, + module_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + ) + submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/"))) + except EnvironmentError: + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") + raise + + # Check we have all the requirements in our environment + modules_needed = check_imports(resolved_module_file) + + # Now we move the module inside our cached dynamic modules. + full_submodule = DIFFUSERS_DYNAMIC_MODULE_NAME + os.path.sep + submodule + create_dynamic_module(full_submodule) + submodule_path = Path(HF_MODULES_CACHE) / full_submodule + if submodule == "local" or submodule == "git": + # We always copy local files (we could hash the file to see if there was a change, and give them the name of + # that hash, to only copy when there is a modification but it seems overkill for now). + # The only reason we do the copy is to avoid putting too many folders in sys.path. + shutil.copy(resolved_module_file, submodule_path / module_file) + for module_needed in modules_needed: + module_needed = f"{module_needed}.py" + shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed) + else: + # Get the commit hash + # TODO: we will get this info in the etag soon, so retrieve it from there and not here. + commit_hash = model_info(pretrained_model_name_or_path, revision=revision, token=token).sha + + # The module file will end up being placed in a subfolder with the git hash of the repo. This way we get the + # benefit of versioning. 
+ submodule_path = submodule_path / commit_hash + full_submodule = full_submodule + os.path.sep + commit_hash + create_dynamic_module(full_submodule) + + if not (submodule_path / module_file).exists(): + shutil.copy(resolved_module_file, submodule_path / module_file) + # Make sure we also have every file with relative + for module_needed in modules_needed: + if not (submodule_path / module_needed).exists(): + get_cached_module_file( + pretrained_model_name_or_path, + f"{module_needed}.py", + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + ) + return os.path.join(full_submodule, module_file) + + +@validate_hf_hub_args +def get_class_from_dynamic_module( + pretrained_model_name_or_path: Union[str, os.PathLike], + module_file: str, + class_name: Optional[str] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: bool = False, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Extracts a class from a module file, present in the local folder or repository of a model. + + + + Calling this function will execute the code in the module file found locally or downloaded from the Hub. It should + therefore only be called on trusted repos. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced + under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + module_file (`str`): + The name of the module file containing the class to look for. + class_name (`str`): + The name of the class to import in the module. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `transformers-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. 
+ local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + You may pass a token in `token` if you are not logged in (`huggingface-cli login`) and want to use private + or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models). + + + + Returns: + `type`: The class, dynamically imported from the module. + + Examples: + + ```python + # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this + # module. + cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel") + ```""" + # And lastly we get the class inside our newly created module + final_module = get_cached_module_file( + pretrained_model_name_or_path, + module_file, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + ) + return get_class_in_module(class_name, final_module.replace(".py", "")) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/export_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/export_utils.py new file mode 100644 index 000000000..bb5307756 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/export_utils.py @@ -0,0 +1,140 @@ +import io +import random +import struct +import tempfile +from contextlib import contextmanager +from typing import List, Union + +import numpy as np +import PIL.Image +import PIL.ImageOps + +from .import_utils import ( + BACKENDS_MAPPING, + is_opencv_available, +) +from .logging import get_logger + + +global_rng = random.Random() + +logger = get_logger(__name__) + + +@contextmanager +def buffered_writer(raw_f): + f = io.BufferedWriter(raw_f) + yield f + f.flush() + + +def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None, fps: int = 10) -> str: + if output_gif_path is None: + output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name + + image[0].save( + output_gif_path, + save_all=True, + append_images=image[1:], + optimize=False, + duration=1000 // fps, + loop=0, + ) + return output_gif_path + + +def export_to_ply(mesh, output_ply_path: str = None): + """ + Write a PLY file for a mesh. 
+ """ + if output_ply_path is None: + output_ply_path = tempfile.NamedTemporaryFile(suffix=".ply").name + + coords = mesh.verts.detach().cpu().numpy() + faces = mesh.faces.cpu().numpy() + rgb = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1) + + with buffered_writer(open(output_ply_path, "wb")) as f: + f.write(b"ply\n") + f.write(b"format binary_little_endian 1.0\n") + f.write(bytes(f"element vertex {len(coords)}\n", "ascii")) + f.write(b"property float x\n") + f.write(b"property float y\n") + f.write(b"property float z\n") + if rgb is not None: + f.write(b"property uchar red\n") + f.write(b"property uchar green\n") + f.write(b"property uchar blue\n") + if faces is not None: + f.write(bytes(f"element face {len(faces)}\n", "ascii")) + f.write(b"property list uchar int vertex_index\n") + f.write(b"end_header\n") + + if rgb is not None: + rgb = (rgb * 255.499).round().astype(int) + vertices = [ + (*coord, *rgb) + for coord, rgb in zip( + coords.tolist(), + rgb.tolist(), + ) + ] + format = struct.Struct("<3f3B") + for item in vertices: + f.write(format.pack(*item)) + else: + format = struct.Struct("<3f") + for vertex in coords.tolist(): + f.write(format.pack(*vertex)) + + if faces is not None: + format = struct.Struct(" str: + if is_opencv_available(): + import cv2 + else: + raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video")) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name + + if isinstance(video_frames[0], np.ndarray): + video_frames = [(frame * 255).astype(np.uint8) for frame in video_frames] + + elif isinstance(video_frames[0], PIL.Image.Image): + video_frames = [np.array(frame) for frame in video_frames] + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + h, w, c = video_frames[0].shape + video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=fps, frameSize=(w, h)) + for i in range(len(video_frames)): + img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR) + video_writer.write(img) + return output_video_path diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/hub_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/hub_utils.py new file mode 100644 index 000000000..e554b42dd --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/hub_utils.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os +import re +import sys +import tempfile +import traceback +import warnings +from pathlib import Path +from typing import Dict, List, Optional, Union +from uuid import uuid4 + +from huggingface_hub import ( + ModelCard, + ModelCardData, + create_repo, + hf_hub_download, + upload_folder, +) +from huggingface_hub.constants import HF_HUB_CACHE, HF_HUB_DISABLE_TELEMETRY, HF_HUB_OFFLINE +from huggingface_hub.file_download import REGEX_COMMIT_HASH +from huggingface_hub.utils import ( + EntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, + is_jinja_available, + validate_hf_hub_args, +) +from packaging import version +from requests import HTTPError + +from .. import __version__ +from .constants import ( + DEPRECATED_REVISION_ARGS, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_NAME, +) +from .import_utils import ( + ENV_VARS_TRUE_VALUES, + _flax_version, + _jax_version, + _onnxruntime_version, + _torch_version, + is_flax_available, + is_onnx_available, + is_torch_available, +) +from .logging import get_logger + + +logger = get_logger(__name__) + +MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "model_card_template.md" +SESSION_ID = uuid4().hex + + +def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: + """ + Formats a user-agent string with basic info about a request. + """ + ua = f"diffusers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}" + if HF_HUB_DISABLE_TELEMETRY or HF_HUB_OFFLINE: + return ua + "; telemetry/off" + if is_torch_available(): + ua += f"; torch/{_torch_version}" + if is_flax_available(): + ua += f"; jax/{_jax_version}" + ua += f"; flax/{_flax_version}" + if is_onnx_available(): + ua += f"; onnxruntime/{_onnxruntime_version}" + # CI will set this value to True + if os.environ.get("DIFFUSERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES: + ua += "; is_ci/true" + if isinstance(user_agent, dict): + ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += "; " + user_agent + return ua + + +def load_or_create_model_card( + repo_id_or_path: str = None, + token: Optional[str] = None, + is_pipeline: bool = False, + from_training: bool = False, + model_description: Optional[str] = None, + base_model: str = None, + prompt: Optional[str] = None, + license: Optional[str] = None, + widget: Optional[List[dict]] = None, + inference: Optional[bool] = None, +) -> ModelCard: + """ + Loads or creates a model card. + + Args: + repo_id_or_path (`str`): + The repo id (e.g., "runwayml/stable-diffusion-v1-5") or local path where to look for the model card. + token (`str`, *optional*): + Authentication token. Will default to the stored token. See https://huggingface.co/settings/token for more details. + is_pipeline (`bool`): + Boolean to indicate if we're adding tag to a [`DiffusionPipeline`]. + from_training: (`bool`): Boolean flag to denote if the model card is being created from a training script. + model_description (`str`, *optional*): Model description to add to the model card. Helpful when using + `load_or_create_model_card` from a training script. + base_model (`str`): Base model identifier (e.g., "stabilityai/stable-diffusion-xl-base-1.0"). Useful + for DreamBooth-like training. + prompt (`str`, *optional*): Prompt used for training. Useful for DreamBooth-like training. + license: (`str`, *optional*): License of the output artifact. Helpful when using + `load_or_create_model_card` from a training script. 
+ widget (`List[dict]`, *optional*): Widget to accompany a gallery template. + inference: (`bool`, optional): Whether to turn on inference widget. Helpful when using + `load_or_create_model_card` from a training script. + """ + if not is_jinja_available(): + raise ValueError( + "Modelcard rendering is based on Jinja templates." + " Please make sure to have `jinja` installed before using `load_or_create_model_card`." + " To install it, please run `pip install Jinja2`." + ) + + try: + # Check if the model card is present on the remote repo + model_card = ModelCard.load(repo_id_or_path, token=token) + except (EntryNotFoundError, RepositoryNotFoundError): + # Otherwise create a model card from template + if from_training: + model_card = ModelCard.from_template( + card_data=ModelCardData( # Card metadata object that will be converted to YAML block + license=license, + library_name="diffusers", + inference=inference, + base_model=base_model, + instance_prompt=prompt, + widget=widget, + ), + template_path=MODEL_CARD_TEMPLATE_PATH, + model_description=model_description, + ) + else: + card_data = ModelCardData() + component = "pipeline" if is_pipeline else "model" + if model_description is None: + model_description = f"This is the model card of a 🧨 diffusers {component} that has been pushed on the Hub. This model card has been automatically generated." + model_card = ModelCard.from_template(card_data, model_description=model_description) + + return model_card + + +def populate_model_card(model_card: ModelCard, tags: Union[str, List[str]] = None) -> ModelCard: + """Populates the `model_card` with library name and optional tags.""" + if model_card.data.library_name is None: + model_card.data.library_name = "diffusers" + + if tags is not None: + if isinstance(tags, str): + tags = [tags] + if model_card.data.tags is None: + model_card.data.tags = [] + for tag in tags: + model_card.data.tags.append(tag) + + return model_card + + +def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str] = None): + """ + Extracts the commit hash from a resolved filename toward a cache file. + """ + if resolved_file is None or commit_hash is not None: + return commit_hash + resolved_file = str(Path(resolved_file).as_posix()) + search = re.search(r"snapshots/([^/]+)/", resolved_file) + if search is None: + return None + commit_hash = search.groups()[0] + return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None + + +# Old default cache path, potentially to be migrated. +# This logic was more or less taken from `transformers`, with the following differences: +# - Diffusers doesn't use custom environment variables to specify the cache path. +# - There is no need to migrate the cache format, just move the files to the new location. 
+hf_cache_home = os.path.expanduser( + os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) +) +old_diffusers_cache = os.path.join(hf_cache_home, "diffusers") + + +def move_cache(old_cache_dir: Optional[str] = None, new_cache_dir: Optional[str] = None) -> None: + if new_cache_dir is None: + new_cache_dir = HF_HUB_CACHE + if old_cache_dir is None: + old_cache_dir = old_diffusers_cache + + old_cache_dir = Path(old_cache_dir).expanduser() + new_cache_dir = Path(new_cache_dir).expanduser() + for old_blob_path in old_cache_dir.glob("**/blobs/*"): + if old_blob_path.is_file() and not old_blob_path.is_symlink(): + new_blob_path = new_cache_dir / old_blob_path.relative_to(old_cache_dir) + new_blob_path.parent.mkdir(parents=True, exist_ok=True) + os.replace(old_blob_path, new_blob_path) + try: + os.symlink(new_blob_path, old_blob_path) + except OSError: + logger.warning( + "Could not create symlink between old cache and new cache. If you use an older version of diffusers again, files will be re-downloaded." + ) + # At this point, old_cache_dir contains symlinks to the new cache (it can still be used). + + +cache_version_file = os.path.join(HF_HUB_CACHE, "version_diffusers_cache.txt") +if not os.path.isfile(cache_version_file): + cache_version = 0 +else: + with open(cache_version_file) as f: + try: + cache_version = int(f.read()) + except ValueError: + cache_version = 0 + +if cache_version < 1: + old_cache_is_not_empty = os.path.isdir(old_diffusers_cache) and len(os.listdir(old_diffusers_cache)) > 0 + if old_cache_is_not_empty: + logger.warning( + "The cache for model files in Diffusers v0.14.0 has moved to a new location. Moving your " + "existing cached models. This is a one-time operation, you can interrupt it or run it " + "later by calling `diffusers.utils.hub_utils.move_cache()`." + ) + try: + move_cache() + except Exception as e: + trace = "\n".join(traceback.format_tb(e.__traceback__)) + logger.error( + f"There was a problem when trying to move your cache:\n\n{trace}\n{e.__class__.__name__}: {e}\n\nPlease " + "file an issue at https://github.com/huggingface/diffusers/issues/new/choose, copy paste this whole " + "message and we will do our best to help." + ) + +if cache_version < 1: + try: + os.makedirs(HF_HUB_CACHE, exist_ok=True) + with open(cache_version_file, "w") as f: + f.write("1") + except Exception: + logger.warning( + f"There was a problem when trying to write in your cache folder ({HF_HUB_CACHE}). Please, ensure " + "the directory exists and can be written to." 
+ ) + + +def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: + if variant is not None: + splits = weights_name.split(".") + splits = splits[:-1] + [variant] + splits[-1:] + weights_name = ".".join(splits) + + return weights_name + + +@validate_hf_hub_args +def _get_model_file( + pretrained_model_name_or_path: Union[str, Path], + *, + weights_name: str, + subfolder: Optional[str] = None, + cache_dir: Optional[str] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + resume_download: bool = False, + local_files_only: bool = False, + token: Optional[str] = None, + user_agent: Optional[Union[Dict, str]] = None, + revision: Optional[str] = None, + commit_hash: Optional[str] = None, +): + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isfile(pretrained_model_name_or_path): + return pretrained_model_name_or_path + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, weights_name)): + # Load from a PyTorch checkpoint + model_file = os.path.join(pretrained_model_name_or_path, weights_name) + return model_file + elif subfolder is not None and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, weights_name) + ): + model_file = os.path.join(pretrained_model_name_or_path, subfolder, weights_name) + return model_file + else: + raise EnvironmentError( + f"Error no file named {weights_name} found in directory {pretrained_model_name_or_path}." + ) + else: + # 1. First check if deprecated way of loading from branches is used + if ( + revision in DEPRECATED_REVISION_ARGS + and (weights_name == WEIGHTS_NAME or weights_name == SAFETENSORS_WEIGHTS_NAME) + and version.parse(version.parse(__version__).base_version) >= version.parse("0.22.0") + ): + try: + model_file = hf_hub_download( + pretrained_model_name_or_path, + filename=_add_variant(weights_name, revision), + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + subfolder=subfolder, + revision=revision or commit_hash, + ) + warnings.warn( + f"Loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'` is deprecated. Loading instead from `revision='main'` with `variant={revision}`. Loading model variants via `revision='{revision}'` will be removed in diffusers v1. Please use `variant='{revision}'` instead.", + FutureWarning, + ) + return model_file + except: # noqa: E722 + warnings.warn( + f"You are loading the variant {revision} from {pretrained_model_name_or_path} via `revision='{revision}'`. This behavior is deprecated and will be removed in diffusers v1. One should use `variant='{revision}'` instead. However, it appears that {pretrained_model_name_or_path} currently does not have a {_add_variant(weights_name, revision)} file in the 'main' branch of {pretrained_model_name_or_path}. \n The Diffusers team and community would be very grateful if you could open an issue: https://github.com/huggingface/diffusers/issues/new with the title '{pretrained_model_name_or_path} is missing {_add_variant(weights_name, revision)}' so that the correct variant file can be added.", + FutureWarning, + ) + try: + # 2. 
Load model file as usual + model_file = hf_hub_download( + pretrained_model_name_or_path, + filename=weights_name, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + subfolder=subfolder, + revision=revision or commit_hash, + ) + return model_file + + except RepositoryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " + "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " + "token having permission to this repo with `token` or log in with `huggingface-cli " + "login`." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " + "this model name. Check the model page at " + f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." + ) + except EntryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {weights_name}." + ) + except HTTPError as err: + raise EnvironmentError( + f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n{err}" + ) + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" + f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" + f" directory containing a file named {weights_name} or" + " \nCheckout your internet connection or see how to run the library in" + " offline mode at 'https://huggingface.co/docs/diffusers/installation#offline-mode'." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing a file named {weights_name}" + ) + + +class PushToHubMixin: + """ + A Mixin to push a model, scheduler, or pipeline to the Hugging Face Hub. + """ + + def _upload_folder( + self, + working_dir: Union[str, os.PathLike], + repo_id: str, + token: Optional[str] = None, + commit_message: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all files in `working_dir` to `repo_id`. + """ + if commit_message is None: + if "Model" in self.__class__.__name__: + commit_message = "Upload model" + elif "Scheduler" in self.__class__.__name__: + commit_message = "Upload scheduler" + else: + commit_message = f"Upload {self.__class__.__name__}" + + logger.info(f"Uploading the files of {working_dir} to {repo_id}.") + return upload_folder( + repo_id=repo_id, folder_path=working_dir, token=token, commit_message=commit_message, create_pr=create_pr + ) + + def push_to_hub( + self, + repo_id: str, + commit_message: Optional[str] = None, + private: Optional[bool] = None, + token: Optional[str] = None, + create_pr: bool = False, + safe_serialization: bool = True, + variant: Optional[str] = None, + ) -> str: + """ + Upload model, scheduler, or pipeline files to the 🤗 Hugging Face Hub. + + Parameters: + repo_id (`str`): + The name of the repository you want to push your model, scheduler, or pipeline files to. 
It should + contain your organization name when pushing to an organization. `repo_id` can also be a path to a local + directory. + commit_message (`str`, *optional*): + Message to commit while pushing. Default to `"Upload {object}"`. + private (`bool`, *optional*): + Whether or not the repository created should be private. + token (`str`, *optional*): + The token to use as HTTP bearer authorization for remote files. The token generated when running + `huggingface-cli login` (stored in `~/.huggingface`). + create_pr (`bool`, *optional*, defaults to `False`): + Whether or not to create a PR with the uploaded files or directly commit. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether or not to convert the model weights to the `safetensors` format. + variant (`str`, *optional*): + If specified, weights are saved in the format `pytorch_model.<variant>.bin`. + + Examples: + + ```python + from diffusers import UNet2DConditionModel + + unet = UNet2DConditionModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="unet") + + # Push the `unet` to your namespace with the name "my-finetuned-unet". + unet.push_to_hub("my-finetuned-unet") + + # Push the `unet` to an organization with the name "my-finetuned-unet". + unet.push_to_hub("your-org/my-finetuned-unet") + ``` + """ + repo_id = create_repo(repo_id, private=private, token=token, exist_ok=True).repo_id + + # Create a new empty model card and eventually tag it + model_card = load_or_create_model_card(repo_id, token=token) + model_card = populate_model_card(model_card) + + # Save all files. + save_kwargs = {"safe_serialization": safe_serialization} + if "Scheduler" not in self.__class__.__name__: + save_kwargs.update({"variant": variant}) + + with tempfile.TemporaryDirectory() as tmpdir: + self.save_pretrained(tmpdir, **save_kwargs) + + # Update model card if needed: + model_card.save(os.path.join(tmpdir, "README.md")) + + return self._upload_folder( + tmpdir, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/import_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/import_utils.py new file mode 100644 index 000000000..a3ee31c91 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/import_utils.py @@ -0,0 +1,726 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Import utilities: Utilities related to imports and our lazy inits. +""" + +import importlib.util +import operator as op +import os +import sys +from collections import OrderedDict +from itertools import chain +from types import ModuleType +from typing import Any, Union + +from huggingface_hub.utils import is_jinja_available # noqa: F401 +from packaging import version +from packaging.version import Version, parse + +from .
import logging + + +# The package importlib_metadata is in a different place, depending on the python version. +if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} +ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"}) + +USE_TF = os.environ.get("USE_TF", "AUTO").upper() +USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() +USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper() +USE_SAFETENSORS = os.environ.get("USE_SAFETENSORS", "AUTO").upper() +DIFFUSERS_SLOW_IMPORT = os.environ.get("DIFFUSERS_SLOW_IMPORT", "FALSE").upper() +DIFFUSERS_SLOW_IMPORT = DIFFUSERS_SLOW_IMPORT in ENV_VARS_TRUE_VALUES + +STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt} + +_torch_version = "N/A" +if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: + _torch_available = importlib.util.find_spec("torch") is not None + if _torch_available: + try: + _torch_version = importlib_metadata.version("torch") + logger.info(f"PyTorch version {_torch_version} available.") + except importlib_metadata.PackageNotFoundError: + _torch_available = False +else: + logger.info("Disabling PyTorch because USE_TORCH is set") + _torch_available = False + +_torch_xla_available = importlib.util.find_spec("torch_xla") is not None +if _torch_xla_available: + try: + _torch_xla_version = importlib_metadata.version("torch_xla") + logger.info(f"PyTorch XLA version {_torch_xla_version} available.") + except ImportError: + _torch_xla_available = False + +# check whether torch_npu is available +_torch_npu_available = importlib.util.find_spec("torch_npu") is not None +if _torch_npu_available: + try: + _torch_npu_version = importlib_metadata.version("torch_npu") + logger.info(f"torch_npu version {_torch_npu_version} available.") + except ImportError: + _torch_npu_available = False + +_jax_version = "N/A" +_flax_version = "N/A" +if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES: + _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None + if _flax_available: + try: + _jax_version = importlib_metadata.version("jax") + _flax_version = importlib_metadata.version("flax") + logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.") + except importlib_metadata.PackageNotFoundError: + _flax_available = False +else: + _flax_available = False + +if USE_SAFETENSORS in ENV_VARS_TRUE_AND_AUTO_VALUES: + _safetensors_available = importlib.util.find_spec("safetensors") is not None + if _safetensors_available: + try: + _safetensors_version = importlib_metadata.version("safetensors") + logger.info(f"Safetensors version {_safetensors_version} available.") + except importlib_metadata.PackageNotFoundError: + _safetensors_available = False +else: + logger.info("Disabling Safetensors because USE_TF is set") + _safetensors_available = False + +_transformers_available = importlib.util.find_spec("transformers") is not None +try: + _transformers_version = importlib_metadata.version("transformers") + logger.debug(f"Successfully imported transformers version {_transformers_version}") +except importlib_metadata.PackageNotFoundError: + _transformers_available = False + + +_inflect_available = importlib.util.find_spec("inflect") is not None +try: + _inflect_version = importlib_metadata.version("inflect") + 
logger.debug(f"Successfully imported inflect version {_inflect_version}") +except importlib_metadata.PackageNotFoundError: + _inflect_available = False + + +_unidecode_available = importlib.util.find_spec("unidecode") is not None +try: + _unidecode_version = importlib_metadata.version("unidecode") + logger.debug(f"Successfully imported unidecode version {_unidecode_version}") +except importlib_metadata.PackageNotFoundError: + _unidecode_available = False + + +_onnxruntime_version = "N/A" +_onnx_available = importlib.util.find_spec("onnxruntime") is not None +if _onnx_available: + candidates = ( + "onnxruntime", + "onnxruntime-gpu", + "ort_nightly_gpu", + "onnxruntime-directml", + "onnxruntime-openvino", + "ort_nightly_directml", + "onnxruntime-rocm", + "onnxruntime-training", + ) + _onnxruntime_version = None + # For the metadata, we have to look for both onnxruntime and onnxruntime-gpu + for pkg in candidates: + try: + _onnxruntime_version = importlib_metadata.version(pkg) + break + except importlib_metadata.PackageNotFoundError: + pass + _onnx_available = _onnxruntime_version is not None + if _onnx_available: + logger.debug(f"Successfully imported onnxruntime version {_onnxruntime_version}") + +# (sayakpaul): importlib.util.find_spec("opencv-python") returns None even when it's installed. +# _opencv_available = importlib.util.find_spec("opencv-python") is not None +try: + candidates = ( + "opencv-python", + "opencv-contrib-python", + "opencv-python-headless", + "opencv-contrib-python-headless", + ) + _opencv_version = None + for pkg in candidates: + try: + _opencv_version = importlib_metadata.version(pkg) + break + except importlib_metadata.PackageNotFoundError: + pass + _opencv_available = _opencv_version is not None + if _opencv_available: + logger.debug(f"Successfully imported cv2 version {_opencv_version}") +except importlib_metadata.PackageNotFoundError: + _opencv_available = False + +_scipy_available = importlib.util.find_spec("scipy") is not None +try: + _scipy_version = importlib_metadata.version("scipy") + logger.debug(f"Successfully imported scipy version {_scipy_version}") +except importlib_metadata.PackageNotFoundError: + _scipy_available = False + +_librosa_available = importlib.util.find_spec("librosa") is not None +try: + _librosa_version = importlib_metadata.version("librosa") + logger.debug(f"Successfully imported librosa version {_librosa_version}") +except importlib_metadata.PackageNotFoundError: + _librosa_available = False + +_accelerate_available = importlib.util.find_spec("accelerate") is not None +try: + _accelerate_version = importlib_metadata.version("accelerate") + logger.debug(f"Successfully imported accelerate version {_accelerate_version}") +except importlib_metadata.PackageNotFoundError: + _accelerate_available = False + +_xformers_available = importlib.util.find_spec("xformers") is not None +try: + _xformers_version = importlib_metadata.version("xformers") + if _torch_available: + _torch_version = importlib_metadata.version("torch") + if version.Version(_torch_version) < version.Version("1.12"): + raise ValueError("xformers is installed in your environment and requires PyTorch >= 1.12") + + logger.debug(f"Successfully imported xformers version {_xformers_version}") +except importlib_metadata.PackageNotFoundError: + _xformers_available = False + +_k_diffusion_available = importlib.util.find_spec("k_diffusion") is not None +try: + _k_diffusion_version = importlib_metadata.version("k_diffusion") + logger.debug(f"Successfully imported k-diffusion version 
{_k_diffusion_version}") +except importlib_metadata.PackageNotFoundError: + _k_diffusion_available = False + +_note_seq_available = importlib.util.find_spec("note_seq") is not None +try: + _note_seq_version = importlib_metadata.version("note_seq") + logger.debug(f"Successfully imported note-seq version {_note_seq_version}") +except importlib_metadata.PackageNotFoundError: + _note_seq_available = False + +_wandb_available = importlib.util.find_spec("wandb") is not None +try: + _wandb_version = importlib_metadata.version("wandb") + logger.debug(f"Successfully imported wandb version {_wandb_version }") +except importlib_metadata.PackageNotFoundError: + _wandb_available = False + + +_tensorboard_available = importlib.util.find_spec("tensorboard") +try: + _tensorboard_version = importlib_metadata.version("tensorboard") + logger.debug(f"Successfully imported tensorboard version {_tensorboard_version}") +except importlib_metadata.PackageNotFoundError: + _tensorboard_available = False + + +_compel_available = importlib.util.find_spec("compel") +try: + _compel_version = importlib_metadata.version("compel") + logger.debug(f"Successfully imported compel version {_compel_version}") +except importlib_metadata.PackageNotFoundError: + _compel_available = False + + +_ftfy_available = importlib.util.find_spec("ftfy") is not None +try: + _ftfy_version = importlib_metadata.version("ftfy") + logger.debug(f"Successfully imported ftfy version {_ftfy_version}") +except importlib_metadata.PackageNotFoundError: + _ftfy_available = False + + +_bs4_available = importlib.util.find_spec("bs4") is not None +try: + # importlib metadata under different name + _bs4_version = importlib_metadata.version("beautifulsoup4") + logger.debug(f"Successfully imported ftfy version {_bs4_version}") +except importlib_metadata.PackageNotFoundError: + _bs4_available = False + +_torchsde_available = importlib.util.find_spec("torchsde") is not None +try: + _torchsde_version = importlib_metadata.version("torchsde") + logger.debug(f"Successfully imported torchsde version {_torchsde_version}") +except importlib_metadata.PackageNotFoundError: + _torchsde_available = False + +_invisible_watermark_available = importlib.util.find_spec("imwatermark") is not None +try: + _invisible_watermark_version = importlib_metadata.version("invisible-watermark") + logger.debug(f"Successfully imported invisible-watermark version {_invisible_watermark_version}") +except importlib_metadata.PackageNotFoundError: + _invisible_watermark_available = False + + +_peft_available = importlib.util.find_spec("peft") is not None +try: + _peft_version = importlib_metadata.version("peft") + logger.debug(f"Successfully imported peft version {_peft_version}") +except importlib_metadata.PackageNotFoundError: + _peft_available = False + +_torchvision_available = importlib.util.find_spec("torchvision") is not None +try: + _torchvision_version = importlib_metadata.version("torchvision") + logger.debug(f"Successfully imported torchvision version {_torchvision_version}") +except importlib_metadata.PackageNotFoundError: + _torchvision_available = False + + +def is_torch_available(): + return _torch_available + + +def is_torch_xla_available(): + return _torch_xla_available + + +def is_torch_npu_available(): + return _torch_npu_available + + +def is_flax_available(): + return _flax_available + + +def is_transformers_available(): + return _transformers_available + + +def is_inflect_available(): + return _inflect_available + + +def is_unidecode_available(): + return _unidecode_available 
+ + +def is_onnx_available(): + return _onnx_available + + +def is_opencv_available(): + return _opencv_available + + +def is_scipy_available(): + return _scipy_available + + +def is_librosa_available(): + return _librosa_available + + +def is_xformers_available(): + return _xformers_available + + +def is_accelerate_available(): + return _accelerate_available + + +def is_k_diffusion_available(): + return _k_diffusion_available + + +def is_note_seq_available(): + return _note_seq_available + + +def is_wandb_available(): + return _wandb_available + + +def is_tensorboard_available(): + return _tensorboard_available + + +def is_compel_available(): + return _compel_available + + +def is_ftfy_available(): + return _ftfy_available + + +def is_bs4_available(): + return _bs4_available + + +def is_torchsde_available(): + return _torchsde_available + + +def is_invisible_watermark_available(): + return _invisible_watermark_available + + +def is_peft_available(): + return _peft_available + + +def is_torchvision_available(): + return _torchvision_available + + +# docstyle-ignore +FLAX_IMPORT_ERROR = """ +{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the +installation page: https://github.com/google/flax and follow the ones that match your environment. +""" + +# docstyle-ignore +INFLECT_IMPORT_ERROR = """ +{0} requires the inflect library but it was not found in your environment. You can install it with pip: `pip install +inflect` +""" + +# docstyle-ignore +PYTORCH_IMPORT_ERROR = """ +{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the +installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. +""" + +# docstyle-ignore +ONNX_IMPORT_ERROR = """ +{0} requires the onnxruntime library but it was not found in your environment. You can install it with pip: `pip +install onnxruntime` +""" + +# docstyle-ignore +OPENCV_IMPORT_ERROR = """ +{0} requires the OpenCV library but it was not found in your environment. You can install it with pip: `pip +install opencv-python` +""" + +# docstyle-ignore +SCIPY_IMPORT_ERROR = """ +{0} requires the scipy library but it was not found in your environment. You can install it with pip: `pip install +scipy` +""" + +# docstyle-ignore +LIBROSA_IMPORT_ERROR = """ +{0} requires the librosa library but it was not found in your environment. Checkout the instructions on the +installation page: https://librosa.org/doc/latest/install.html and follow the ones that match your environment. +""" + +# docstyle-ignore +TRANSFORMERS_IMPORT_ERROR = """ +{0} requires the transformers library but it was not found in your environment. You can install it with pip: `pip +install transformers` +""" + +# docstyle-ignore +UNIDECODE_IMPORT_ERROR = """ +{0} requires the unidecode library but it was not found in your environment. You can install it with pip: `pip install +Unidecode` +""" + +# docstyle-ignore +K_DIFFUSION_IMPORT_ERROR = """ +{0} requires the k-diffusion library but it was not found in your environment. You can install it with pip: `pip +install k-diffusion` +""" + +# docstyle-ignore +NOTE_SEQ_IMPORT_ERROR = """ +{0} requires the note-seq library but it was not found in your environment. You can install it with pip: `pip +install note-seq` +""" + +# docstyle-ignore +WANDB_IMPORT_ERROR = """ +{0} requires the wandb library but it was not found in your environment. 
You can install it with pip: `pip +install wandb` +""" + +# docstyle-ignore +TENSORBOARD_IMPORT_ERROR = """ +{0} requires the tensorboard library but it was not found in your environment. You can install it with pip: `pip +install tensorboard` +""" + + +# docstyle-ignore +COMPEL_IMPORT_ERROR = """ +{0} requires the compel library but it was not found in your environment. You can install it with pip: `pip install compel` +""" + +# docstyle-ignore +BS4_IMPORT_ERROR = """ +{0} requires the Beautiful Soup library but it was not found in your environment. You can install it with pip: +`pip install beautifulsoup4`. Please note that you may need to restart your runtime after installation. +""" + +# docstyle-ignore +FTFY_IMPORT_ERROR = """ +{0} requires the ftfy library but it was not found in your environment. Checkout the instructions on the +installation section: https://github.com/rspeer/python-ftfy/tree/master#installing and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. +""" + +# docstyle-ignore +TORCHSDE_IMPORT_ERROR = """ +{0} requires the torchsde library but it was not found in your environment. You can install it with pip: `pip install torchsde` +""" + +# docstyle-ignore +INVISIBLE_WATERMARK_IMPORT_ERROR = """ +{0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install invisible-watermark>=0.2.0` +""" + + +BACKENDS_MAPPING = OrderedDict( + [ + ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)), + ("flax", (is_flax_available, FLAX_IMPORT_ERROR)), + ("inflect", (is_inflect_available, INFLECT_IMPORT_ERROR)), + ("onnx", (is_onnx_available, ONNX_IMPORT_ERROR)), + ("opencv", (is_opencv_available, OPENCV_IMPORT_ERROR)), + ("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)), + ("torch", (is_torch_available, PYTORCH_IMPORT_ERROR)), + ("transformers", (is_transformers_available, TRANSFORMERS_IMPORT_ERROR)), + ("unidecode", (is_unidecode_available, UNIDECODE_IMPORT_ERROR)), + ("librosa", (is_librosa_available, LIBROSA_IMPORT_ERROR)), + ("k_diffusion", (is_k_diffusion_available, K_DIFFUSION_IMPORT_ERROR)), + ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), + ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), + ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), + ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), + ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), + ("torchsde", (is_torchsde_available, TORCHSDE_IMPORT_ERROR)), + ("invisible_watermark", (is_invisible_watermark_available, INVISIBLE_WATERMARK_IMPORT_ERROR)), + ] +) + + +def requires_backends(obj, backends): + if not isinstance(backends, (list, tuple)): + backends = [backends] + + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + checks = (BACKENDS_MAPPING[backend] for backend in backends) + failed = [msg.format(name) for available, msg in checks if not available()] + if failed: + raise ImportError("".join(failed)) + + if name in [ + "VersatileDiffusionTextToImagePipeline", + "VersatileDiffusionPipeline", + "VersatileDiffusionDualGuidedPipeline", + "StableDiffusionImageVariationPipeline", + "UnCLIPPipeline", + ] and is_transformers_version("<", "4.25.0"): + raise ImportError( + f"You need to install `transformers>=4.25` in order to use {name}: \n```\n pip install" + " --upgrade transformers \n```" + ) + + if name in ["StableDiffusionDepth2ImgPipeline", "StableDiffusionPix2PixZeroPipeline"] and is_transformers_version( + "<", "4.26.0" 
+ ): + raise ImportError( + f"You need to install `transformers>=4.26` in order to use {name}: \n```\n pip install" + " --upgrade transformers \n```" + ) + + +class DummyObject(type): + """ + Metaclass for the dummy objects. Any class inheriting from it will return the ImportError generated by + `requires_backend` each time a user tries to access any method of that class. + """ + + def __getattr__(cls, key): + if key.startswith("_") and key not in ["_load_connected_pipes", "_is_onnx"]: + return super().__getattr__(cls, key) + requires_backends(cls, cls._backends) + + +# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L319 +def compare_versions(library_or_version: Union[str, Version], operation: str, requirement_version: str): + """ + Args: + Compares a library version to some requirement using a given operation. + library_or_version (`str` or `packaging.version.Version`): + A library name or a version to check. + operation (`str`): + A string representation of an operator, such as `">"` or `"<="`. + requirement_version (`str`): + The version to compare the library version against + """ + if operation not in STR_OPERATION_TO_FUNC.keys(): + raise ValueError(f"`operation` must be one of {list(STR_OPERATION_TO_FUNC.keys())}, received {operation}") + operation = STR_OPERATION_TO_FUNC[operation] + if isinstance(library_or_version, str): + library_or_version = parse(importlib_metadata.version(library_or_version)) + return operation(library_or_version, parse(requirement_version)) + + +# This function was copied from: https://github.com/huggingface/accelerate/blob/874c4967d94badd24f893064cc3bef45f57cadf7/src/accelerate/utils/versions.py#L338 +def is_torch_version(operation: str, version: str): + """ + Args: + Compares the current PyTorch version to a given reference with an operation. + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A string version of PyTorch + """ + return compare_versions(parse(_torch_version), operation, version) + + +def is_transformers_version(operation: str, version: str): + """ + Args: + Compares the current Transformers version to a given reference with an operation. + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _transformers_available: + return False + return compare_versions(parse(_transformers_version), operation, version) + + +def is_accelerate_version(operation: str, version: str): + """ + Args: + Compares the current Accelerate version to a given reference with an operation. + operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _accelerate_available: + return False + return compare_versions(parse(_accelerate_version), operation, version) + + +def is_k_diffusion_version(operation: str, version: str): + """ + Args: + Compares the current k-diffusion version to a given reference with an operation. 
+ operation (`str`): + A string representation of an operator, such as `">"` or `"<="` + version (`str`): + A version string + """ + if not _k_diffusion_available: + return False + return compare_versions(parse(_k_diffusion_version), operation, version) + + +def get_objects_from_module(module): + """ + Args: + Returns a dict of object names and values in a module, while skipping private/internal objects + module (ModuleType): + Module to extract the objects from. + + Returns: + dict: Dictionary of object names and corresponding values + """ + + objects = {} + for name in dir(module): + if name.startswith("_"): + continue + objects[name] = getattr(module, name) + + return objects + + +class OptionalDependencyNotAvailable(BaseException): + """An error indicating that an optional dependency of Diffusers was not found in the environment.""" + + +class _LazyModule(ModuleType): + """ + Module class that surfaces all objects but only performs associated imports when the objects are requested. + """ + + # Very heavily inspired by optuna.integration._IntegrationModule + # https://github.com/optuna/optuna/blob/master/optuna/integration/__init__.py + def __init__(self, name, module_file, import_structure, module_spec=None, extra_objects=None): + super().__init__(name) + self._modules = set(import_structure.keys()) + self._class_to_module = {} + for key, values in import_structure.items(): + for value in values: + self._class_to_module[value] = key + # Needed for autocompletion in an IDE + self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values())) + self.__file__ = module_file + self.__spec__ = module_spec + self.__path__ = [os.path.dirname(module_file)] + self._objects = {} if extra_objects is None else extra_objects + self._name = name + self._import_structure = import_structure + + # Needed for autocompletion in an IDE + def __dir__(self): + result = super().__dir__() + # The elements of self.__all__ that are submodules may or may not be in the dir already, depending on whether + # they have been accessed or not. So we only add the elements of self.__all__ that are not already in the dir. + for attr in self.__all__: + if attr not in result: + result.append(attr) + return result + + def __getattr__(self, name: str) -> Any: + if name in self._objects: + return self._objects[name] + if name in self._modules: + value = self._get_module(name) + elif name in self._class_to_module.keys(): + module = self._get_module(self._class_to_module[name]) + value = getattr(module, name) + else: + raise AttributeError(f"module {self.__name__} has no attribute {name}") + + setattr(self, name, value) + return value + + def _get_module(self, module_name: str): + try: + return importlib.import_module("." 
+ module_name, self.__name__) + except Exception as e: + raise RuntimeError( + f"Failed to import {self.__name__}.{module_name} because of the following error (look up to see its" + f" traceback):\n{e}" + ) from e + + def __reduce__(self): + return (self.__class__, (self._name, self.__file__, self._import_structure)) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/loading_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/loading_utils.py new file mode 100644 index 000000000..18f6ead64 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/loading_utils.py @@ -0,0 +1,49 @@ +import os +from typing import Callable, Union + +import PIL.Image +import PIL.ImageOps +import requests + + +def load_image( + image: Union[str, PIL.Image.Image], convert_method: Callable[[PIL.Image.Image], PIL.Image.Image] = None +) -> PIL.Image.Image: + """ + Loads `image` to a PIL Image. + + Args: + image (`str` or `PIL.Image.Image`): + The image to convert to the PIL Image format. + convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], optional): + A conversion method to apply to the image after loading it. + When set to `None` the image will be converted "RGB". + + Returns: + `PIL.Image.Image`: + A PIL Image. + """ + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + image = PIL.Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + image = PIL.Image.open(image) + else: + raise ValueError( + f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {image} is not a valid path." + ) + elif isinstance(image, PIL.Image.Image): + image = image + else: + raise ValueError( + "Incorrect format used for the image. Should be a URL linking to an image, a local path, or a PIL image." + ) + + image = PIL.ImageOps.exif_transpose(image) + + if convert_method is not None: + image = convert_method(image) + else: + image = image.convert("RGB") + + return image diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/logging.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/logging.py new file mode 100644 index 000000000..37bc05d0c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/logging.py @@ -0,0 +1,339 @@ +# coding=utf-8 +# Copyright 2024 Optuna, Hugging Face +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
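As a quick reference for the `load_image` helper in `loading_utils.py` above, a short usage sketch; the URL, local path, and import path are placeholders and assumptions, not values from this patch.

```python
# Usage sketch for load_image (placeholder URL/path; assumed import path).
from diffusers.utils.loading_utils import load_image

# Remote file: fetched with requests, EXIF-transposed, converted to RGB by default.
remote = load_image("https://example.com/sample.png")

# Local file with a custom conversion applied instead of the default RGB convert.
grayscale = load_image("./sample.jpg", convert_method=lambda im: im.convert("L"))
```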
+""" Logging utilities.""" + +import logging +import os +import sys +import threading +from logging import ( + CRITICAL, # NOQA + DEBUG, # NOQA + ERROR, # NOQA + FATAL, # NOQA + INFO, # NOQA + NOTSET, # NOQA + WARN, # NOQA + WARNING, # NOQA +) +from typing import Dict, Optional + +from tqdm import auto as tqdm_lib + + +_lock = threading.Lock() +_default_handler: Optional[logging.Handler] = None + +log_levels = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + +_default_log_level = logging.WARNING + +_tqdm_active = True + + +def _get_default_logging_level() -> int: + """ + If DIFFUSERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is + not - fall back to `_default_log_level` + """ + env_level_str = os.getenv("DIFFUSERS_VERBOSITY", None) + if env_level_str: + if env_level_str in log_levels: + return log_levels[env_level_str] + else: + logging.getLogger().warning( + f"Unknown option DIFFUSERS_VERBOSITY={env_level_str}, " + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) + return _default_log_level + + +def _get_library_name() -> str: + return __name__.split(".")[0] + + +def _get_library_root_logger() -> logging.Logger: + return logging.getLogger(_get_library_name()) + + +def _configure_library_root_logger() -> None: + global _default_handler + + with _lock: + if _default_handler: + # This library has already configured the library root logger. + return + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. + _default_handler.flush = sys.stderr.flush + + # Apply our default configuration to the library root logger. + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_get_default_logging_level()) + library_root_logger.propagate = False + + +def _reset_library_root_logger() -> None: + global _default_handler + + with _lock: + if not _default_handler: + return + + library_root_logger = _get_library_root_logger() + library_root_logger.removeHandler(_default_handler) + library_root_logger.setLevel(logging.NOTSET) + _default_handler = None + + +def get_log_levels_dict() -> Dict[str, int]: + return log_levels + + +def get_logger(name: Optional[str] = None) -> logging.Logger: + """ + Return a logger with the specified name. + + This function is not supposed to be directly accessed unless you are writing a custom diffusers module. + """ + + if name is None: + name = _get_library_name() + + _configure_library_root_logger() + return logging.getLogger(name) + + +def get_verbosity() -> int: + """ + Return the current level for the 🤗 Diffusers' root logger as an `int`. + + Returns: + `int`: + Logging level integers which can be one of: + + - `50`: `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` + - `40`: `diffusers.logging.ERROR` + - `30`: `diffusers.logging.WARNING` or `diffusers.logging.WARN` + - `20`: `diffusers.logging.INFO` + - `10`: `diffusers.logging.DEBUG` + + """ + + _configure_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + """ + Set the verbosity level for the 🤗 Diffusers' root logger. 
+ + Args: + verbosity (`int`): + Logging level which can be one of: + + - `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` + - `diffusers.logging.ERROR` + - `diffusers.logging.WARNING` or `diffusers.logging.WARN` + - `diffusers.logging.INFO` + - `diffusers.logging.DEBUG` + """ + + _configure_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_info() -> None: + """Set the verbosity to the `INFO` level.""" + return set_verbosity(INFO) + + +def set_verbosity_warning() -> None: + """Set the verbosity to the `WARNING` level.""" + return set_verbosity(WARNING) + + +def set_verbosity_debug() -> None: + """Set the verbosity to the `DEBUG` level.""" + return set_verbosity(DEBUG) + + +def set_verbosity_error() -> None: + """Set the verbosity to the `ERROR` level.""" + return set_verbosity(ERROR) + + +def disable_default_handler() -> None: + """Disable the default handler of the 🤗 Diffusers' root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().removeHandler(_default_handler) + + +def enable_default_handler() -> None: + """Enable the default handler of the 🤗 Diffusers' root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().addHandler(_default_handler) + + +def add_handler(handler: logging.Handler) -> None: + """adds a handler to the HuggingFace Diffusers' root logger.""" + + _configure_library_root_logger() + + assert handler is not None + _get_library_root_logger().addHandler(handler) + + +def remove_handler(handler: logging.Handler) -> None: + """removes given handler from the HuggingFace Diffusers' root logger.""" + + _configure_library_root_logger() + + assert handler is not None and handler in _get_library_root_logger().handlers + _get_library_root_logger().removeHandler(handler) + + +def disable_propagation() -> None: + """ + Disable propagation of the library log outputs. Note that log propagation is disabled by default. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = False + + +def enable_propagation() -> None: + """ + Enable propagation of the library log outputs. Please disable the HuggingFace Diffusers' default handler to prevent + double logging if the root logger has been configured. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = True + + +def enable_explicit_format() -> None: + """ + Enable explicit formatting for every 🤗 Diffusers' logger. The explicit formatter is as follows: + ``` + [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE + ``` + All handlers currently bound to the root logger are affected by this method. + """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s") + handler.setFormatter(formatter) + + +def reset_format() -> None: + """ + Resets the formatting for 🤗 Diffusers' loggers. + + All handlers currently bound to the root logger are affected by this method. 
+ """ + handlers = _get_library_root_logger().handlers + + for handler in handlers: + handler.setFormatter(None) + + +def warning_advice(self, *args, **kwargs) -> None: + """ + This method is identical to `logger.warning()`, but if env var DIFFUSERS_NO_ADVISORY_WARNINGS=1 is set, this + warning will not be printed + """ + no_advisory_warnings = os.getenv("DIFFUSERS_NO_ADVISORY_WARNINGS", False) + if no_advisory_warnings: + return + self.warning(*args, **kwargs) + + +logging.Logger.warning_advice = warning_advice + + +class EmptyTqdm: + """Dummy tqdm which doesn't do anything.""" + + def __init__(self, *args, **kwargs): # pylint: disable=unused-argument + self._iterator = args[0] if args else None + + def __iter__(self): + return iter(self._iterator) + + def __getattr__(self, _): + """Return empty function.""" + + def empty_fn(*args, **kwargs): # pylint: disable=unused-argument + return + + return empty_fn + + def __enter__(self): + return self + + def __exit__(self, type_, value, traceback): + return + + +class _tqdm_cls: + def __call__(self, *args, **kwargs): + if _tqdm_active: + return tqdm_lib.tqdm(*args, **kwargs) + else: + return EmptyTqdm(*args, **kwargs) + + def set_lock(self, *args, **kwargs): + self._lock = None + if _tqdm_active: + return tqdm_lib.tqdm.set_lock(*args, **kwargs) + + def get_lock(self): + if _tqdm_active: + return tqdm_lib.tqdm.get_lock() + + +tqdm = _tqdm_cls() + + +def is_progress_bar_enabled() -> bool: + """Return a boolean indicating whether tqdm progress bars are enabled.""" + global _tqdm_active + return bool(_tqdm_active) + + +def enable_progress_bar() -> None: + """Enable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = True + + +def disable_progress_bar() -> None: + """Disable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = False diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/model_card_template.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/model_card_template.md new file mode 100644 index 000000000..f41b71e24 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/model_card_template.md @@ -0,0 +1,24 @@ +--- +{{ card_data }} +--- + + + +{{ model_description }} + +## Intended uses & limitations + +#### How to use + +```python +# TODO: add an example code snippet for running this diffusion pipeline +``` + +#### Limitations and bias + +[TODO: provide examples of latent issues and potential remediations] + +## Training details + +[TODO: describe the data used to train the model] diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/outputs.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/outputs.py new file mode 100644 index 000000000..6080a86b8 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/outputs.py @@ -0,0 +1,137 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Generic utilities +""" + +from collections import OrderedDict +from dataclasses import fields, is_dataclass +from typing import Any, Tuple + +import numpy as np + +from .import_utils import is_torch_available, is_torch_version + + +def is_tensor(x) -> bool: + """ + Tests if `x` is a `torch.Tensor` or `np.ndarray`. + """ + if is_torch_available(): + import torch + + if isinstance(x, torch.Tensor): + return True + + return isinstance(x, np.ndarray) + + +class BaseOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a + tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular + Python dictionary. + + + + You can't unpack a [`BaseOutput`] directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple + first. + + + """ + + def __init_subclass__(cls) -> None: + """Register subclasses as pytree nodes. + + This is necessary to synchronize gradients when using `torch.nn.parallel.DistributedDataParallel` with + `static_graph=True` with modules that output `ModelOutput` subclasses. + """ + if is_torch_available(): + import torch.utils._pytree + + if is_torch_version("<", "2.2"): + torch.utils._pytree._register_pytree_node( + cls, + torch.utils._pytree._dict_flatten, + lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)), + ) + else: + torch.utils._pytree.register_pytree_node( + cls, + torch.utils._pytree._dict_flatten, + lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)), + ) + + def __post_init__(self) -> None: + class_fields = fields(self) + + # Safety and consistency checks + if not len(class_fields): + raise ValueError(f"{self.__class__.__name__} has no fields.") + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and isinstance(first_field, dict): + for key, value in first_field.items(): + self[key] = value + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __getitem__(self, k: Any) -> Any: + if isinstance(k, str): + inner_dict = dict(self.items()) + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name: Any, value: Any) -> None: + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def __reduce__(self): + if not is_dataclass(self): + return super().__reduce__() + callable, _args, *remaining = 
super().__reduce__() + args = tuple(getattr(self, field.name) for field in fields(self)) + return callable, args, *remaining + + def to_tuple(self) -> Tuple[Any, ...]: + """ + Convert self to a tuple containing all the attributes/keys that are not `None`. + """ + return tuple(self[k] for k in self.keys()) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/peft_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/peft_utils.py new file mode 100644 index 000000000..85d16c7b5 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/peft_utils.py @@ -0,0 +1,268 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PEFT utilities: Utilities related to peft library +""" +import collections +import importlib +from typing import Optional + +from packaging import version + +from .import_utils import is_peft_available, is_torch_available + + +if is_torch_available(): + import torch + + +def recurse_remove_peft_layers(model): + r""" + Recursively replace all instances of `LoraLayer` with corresponding new layers in `model`. + """ + from peft.tuners.tuners_utils import BaseTunerLayer + + has_base_layer_pattern = False + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + has_base_layer_pattern = hasattr(module, "base_layer") + break + + if has_base_layer_pattern: + from peft.utils import _get_submodules + + key_list = [key for key, _ in model.named_modules() if "lora" not in key] + for key in key_list: + try: + parent, target, target_name = _get_submodules(model, key) + except AttributeError: + continue + if hasattr(target, "base_layer"): + setattr(parent, target_name, target.get_base_layer()) + else: + # This is for backwards compatibility with PEFT <= 0.6.2. + # TODO can be removed once that PEFT version is no longer supported. 
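+ # Legacy fallback: rebuild plain torch.nn.Linear / torch.nn.Conv2d modules in place of the LoraLayer wrappers.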
+ from peft.tuners.lora import LoraLayer + + for name, module in model.named_children(): + if len(list(module.children())) > 0: + ## compound module, go inside it + recurse_remove_peft_layers(module) + + module_replaced = False + + if isinstance(module, LoraLayer) and isinstance(module, torch.nn.Linear): + new_module = torch.nn.Linear(module.in_features, module.out_features, bias=module.bias is not None).to( + module.weight.device + ) + new_module.weight = module.weight + if module.bias is not None: + new_module.bias = module.bias + + module_replaced = True + elif isinstance(module, LoraLayer) and isinstance(module, torch.nn.Conv2d): + new_module = torch.nn.Conv2d( + module.in_channels, + module.out_channels, + module.kernel_size, + module.stride, + module.padding, + module.dilation, + module.groups, + ).to(module.weight.device) + + new_module.weight = module.weight + if module.bias is not None: + new_module.bias = module.bias + + module_replaced = True + + if module_replaced: + setattr(model, name, new_module) + del module + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return model + + +def scale_lora_layers(model, weight): + """ + Adjust the weightage given to the LoRA layers of the model. + + Args: + model (`torch.nn.Module`): + The model to scale. + weight (`float`): + The weight to be given to the LoRA layers. + """ + from peft.tuners.tuners_utils import BaseTunerLayer + + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + module.scale_layer(weight) + + +def unscale_lora_layers(model, weight: Optional[float] = None): + """ + Removes the previously passed weight given to the LoRA layers of the model. + + Args: + model (`torch.nn.Module`): + The model to scale. + weight (`float`, *optional*): + The weight to be given to the LoRA layers. If no scale is passed the scale of the lora layer will be + re-initialized to the correct value. If 0.0 is passed, we will re-initialize the scale with the correct + value. + """ + from peft.tuners.tuners_utils import BaseTunerLayer + + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + if weight is not None and weight != 0: + module.unscale_layer(weight) + elif weight is not None and weight == 0: + for adapter_name in module.active_adapters: + # if weight == 0 unscale should re-set the scale to the original value. 
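+ # A scale of 1.0 restores the adapter's default scaling (lora_alpha / r).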
+ module.set_scale(adapter_name, 1.0) + + +def get_peft_kwargs(rank_dict, network_alpha_dict, peft_state_dict, is_unet=True): + rank_pattern = {} + alpha_pattern = {} + r = lora_alpha = list(rank_dict.values())[0] + + if len(set(rank_dict.values())) > 1: + # get the rank occuring the most number of times + r = collections.Counter(rank_dict.values()).most_common()[0][0] + + # for modules with rank different from the most occuring rank, add it to the `rank_pattern` + rank_pattern = dict(filter(lambda x: x[1] != r, rank_dict.items())) + rank_pattern = {k.split(".lora_B.")[0]: v for k, v in rank_pattern.items()} + + if network_alpha_dict is not None and len(network_alpha_dict) > 0: + if len(set(network_alpha_dict.values())) > 1: + # get the alpha occuring the most number of times + lora_alpha = collections.Counter(network_alpha_dict.values()).most_common()[0][0] + + # for modules with alpha different from the most occuring alpha, add it to the `alpha_pattern` + alpha_pattern = dict(filter(lambda x: x[1] != lora_alpha, network_alpha_dict.items())) + if is_unet: + alpha_pattern = { + ".".join(k.split(".lora_A.")[0].split(".")).replace(".alpha", ""): v + for k, v in alpha_pattern.items() + } + else: + alpha_pattern = {".".join(k.split(".down.")[0].split(".")[:-1]): v for k, v in alpha_pattern.items()} + else: + lora_alpha = set(network_alpha_dict.values()).pop() + + # layer names without the Diffusers specific + target_modules = list({name.split(".lora")[0] for name in peft_state_dict.keys()}) + + lora_config_kwargs = { + "r": r, + "lora_alpha": lora_alpha, + "rank_pattern": rank_pattern, + "alpha_pattern": alpha_pattern, + "target_modules": target_modules, + } + return lora_config_kwargs + + +def get_adapter_name(model): + from peft.tuners.tuners_utils import BaseTunerLayer + + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + return f"default_{len(module.r)}" + return "default_0" + + +def set_adapter_layers(model, enabled=True): + from peft.tuners.tuners_utils import BaseTunerLayer + + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + # The recent version of PEFT needs to call `enable_adapters` instead + if hasattr(module, "enable_adapters"): + module.enable_adapters(enabled=enabled) + else: + module.disable_adapters = not enabled + + +def delete_adapter_layers(model, adapter_name): + from peft.tuners.tuners_utils import BaseTunerLayer + + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + if hasattr(module, "delete_adapter"): + module.delete_adapter(adapter_name) + else: + raise ValueError( + "The version of PEFT you are using is not compatible, please use a version that is greater than 0.6.1" + ) + + # For transformers integration - we need to pop the adapter from the config + if getattr(model, "_hf_peft_config_loaded", False) and hasattr(model, "peft_config"): + model.peft_config.pop(adapter_name, None) + # In case all adapters are deleted, we need to delete the config + # and make sure to set the flag to False + if len(model.peft_config) == 0: + del model.peft_config + model._hf_peft_config_loaded = None + + +def set_weights_and_activate_adapters(model, adapter_names, weights): + from peft.tuners.tuners_utils import BaseTunerLayer + + # iterate over each adapter, make it active and set the corresponding scaling weight + for adapter_name, weight in zip(adapter_names, weights): + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + # For backward compatbility with previous PEFT versions + if 
hasattr(module, "set_adapter"): + module.set_adapter(adapter_name) + else: + module.active_adapter = adapter_name + module.set_scale(adapter_name, weight) + + # set multiple active adapters + for module in model.modules(): + if isinstance(module, BaseTunerLayer): + # For backward compatbility with previous PEFT versions + if hasattr(module, "set_adapter"): + module.set_adapter(adapter_names) + else: + module.active_adapter = adapter_names + + +def check_peft_version(min_version: str) -> None: + r""" + Checks if the version of PEFT is compatible. + + Args: + version (`str`): + The version of PEFT to check against. + """ + if not is_peft_available(): + raise ValueError("PEFT is not installed. Please install it with `pip install peft`") + + is_peft_version_compatible = version.parse(importlib.metadata.version("peft")) > version.parse(min_version) + + if not is_peft_version_compatible: + raise ValueError( + f"The version of PEFT you are using is not compatible, please use a version that is greater" + f" than {min_version}" + ) diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/pil_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/pil_utils.py new file mode 100644 index 000000000..76678070b --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/pil_utils.py @@ -0,0 +1,67 @@ +from typing import List + +import PIL.Image +import PIL.ImageOps +from packaging import version +from PIL import Image + + +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): + PIL_INTERPOLATION = { + "linear": PIL.Image.Resampling.BILINEAR, + "bilinear": PIL.Image.Resampling.BILINEAR, + "bicubic": PIL.Image.Resampling.BICUBIC, + "lanczos": PIL.Image.Resampling.LANCZOS, + "nearest": PIL.Image.Resampling.NEAREST, + } +else: + PIL_INTERPOLATION = { + "linear": PIL.Image.LINEAR, + "bilinear": PIL.Image.BILINEAR, + "bicubic": PIL.Image.BICUBIC, + "lanczos": PIL.Image.LANCZOS, + "nearest": PIL.Image.NEAREST, + } + + +def pt_to_pil(images): + """ + Convert a torch image to a PIL image. + """ + images = (images / 2 + 0.5).clamp(0, 1) + images = images.cpu().permute(0, 2, 3, 1).float().numpy() + images = numpy_to_pil(images) + return images + + +def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + +def make_image_grid(images: List[PIL.Image.Image], rows: int, cols: int, resize: int = None) -> PIL.Image.Image: + """ + Prepares a single grid of images. Useful for visualization purposes. 
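+ The number of images must equal `rows * cols`.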
+ """ + assert len(images) == rows * cols + + if resize is not None: + images = [img.resize((resize, resize)) for img in images] + + w, h = images[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + + for i, img in enumerate(images): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/state_dict_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/state_dict_utils.py new file mode 100644 index 000000000..c4566636d --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/state_dict_utils.py @@ -0,0 +1,324 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +State dict utilities: utility methods for converting state dicts easily +""" +import enum + +from .logging import get_logger + + +logger = get_logger(__name__) + + +class StateDictType(enum.Enum): + """ + The mode to use when converting state dicts. + """ + + DIFFUSERS_OLD = "diffusers_old" + KOHYA_SS = "kohya_ss" + PEFT = "peft" + DIFFUSERS = "diffusers" + + +# We need to define a proper mapping for Unet since it uses different output keys than text encoder +# e.g. 
to_q_lora -> q_proj / to_q +UNET_TO_DIFFUSERS = { + ".to_out_lora.up": ".to_out.0.lora_B", + ".to_out_lora.down": ".to_out.0.lora_A", + ".to_q_lora.down": ".to_q.lora_A", + ".to_q_lora.up": ".to_q.lora_B", + ".to_k_lora.down": ".to_k.lora_A", + ".to_k_lora.up": ".to_k.lora_B", + ".to_v_lora.down": ".to_v.lora_A", + ".to_v_lora.up": ".to_v.lora_B", + ".lora.up": ".lora_B", + ".lora.down": ".lora_A", +} + + +DIFFUSERS_TO_PEFT = { + ".q_proj.lora_linear_layer.up": ".q_proj.lora_B", + ".q_proj.lora_linear_layer.down": ".q_proj.lora_A", + ".k_proj.lora_linear_layer.up": ".k_proj.lora_B", + ".k_proj.lora_linear_layer.down": ".k_proj.lora_A", + ".v_proj.lora_linear_layer.up": ".v_proj.lora_B", + ".v_proj.lora_linear_layer.down": ".v_proj.lora_A", + ".out_proj.lora_linear_layer.up": ".out_proj.lora_B", + ".out_proj.lora_linear_layer.down": ".out_proj.lora_A", + ".lora_linear_layer.up": ".lora_B", + ".lora_linear_layer.down": ".lora_A", +} + +DIFFUSERS_OLD_TO_PEFT = { + ".to_q_lora.up": ".q_proj.lora_B", + ".to_q_lora.down": ".q_proj.lora_A", + ".to_k_lora.up": ".k_proj.lora_B", + ".to_k_lora.down": ".k_proj.lora_A", + ".to_v_lora.up": ".v_proj.lora_B", + ".to_v_lora.down": ".v_proj.lora_A", + ".to_out_lora.up": ".out_proj.lora_B", + ".to_out_lora.down": ".out_proj.lora_A", + ".lora_linear_layer.up": ".lora_B", + ".lora_linear_layer.down": ".lora_A", +} + +PEFT_TO_DIFFUSERS = { + ".q_proj.lora_B": ".q_proj.lora_linear_layer.up", + ".q_proj.lora_A": ".q_proj.lora_linear_layer.down", + ".k_proj.lora_B": ".k_proj.lora_linear_layer.up", + ".k_proj.lora_A": ".k_proj.lora_linear_layer.down", + ".v_proj.lora_B": ".v_proj.lora_linear_layer.up", + ".v_proj.lora_A": ".v_proj.lora_linear_layer.down", + ".out_proj.lora_B": ".out_proj.lora_linear_layer.up", + ".out_proj.lora_A": ".out_proj.lora_linear_layer.down", + "to_k.lora_A": "to_k.lora.down", + "to_k.lora_B": "to_k.lora.up", + "to_q.lora_A": "to_q.lora.down", + "to_q.lora_B": "to_q.lora.up", + "to_v.lora_A": "to_v.lora.down", + "to_v.lora_B": "to_v.lora.up", + "to_out.0.lora_A": "to_out.0.lora.down", + "to_out.0.lora_B": "to_out.0.lora.up", +} + +DIFFUSERS_OLD_TO_DIFFUSERS = { + ".to_q_lora.up": ".q_proj.lora_linear_layer.up", + ".to_q_lora.down": ".q_proj.lora_linear_layer.down", + ".to_k_lora.up": ".k_proj.lora_linear_layer.up", + ".to_k_lora.down": ".k_proj.lora_linear_layer.down", + ".to_v_lora.up": ".v_proj.lora_linear_layer.up", + ".to_v_lora.down": ".v_proj.lora_linear_layer.down", + ".to_out_lora.up": ".out_proj.lora_linear_layer.up", + ".to_out_lora.down": ".out_proj.lora_linear_layer.down", +} + +PEFT_TO_KOHYA_SS = { + "lora_A": "lora_down", + "lora_B": "lora_up", + # This is not a comprehensive dict as kohya format requires replacing `.` with `_` in keys, + # adding prefixes and adding alpha values + # Check `convert_state_dict_to_kohya` for more +} + +PEFT_STATE_DICT_MAPPINGS = { + StateDictType.DIFFUSERS_OLD: DIFFUSERS_OLD_TO_PEFT, + StateDictType.DIFFUSERS: DIFFUSERS_TO_PEFT, +} + +DIFFUSERS_STATE_DICT_MAPPINGS = { + StateDictType.DIFFUSERS_OLD: DIFFUSERS_OLD_TO_DIFFUSERS, + StateDictType.PEFT: PEFT_TO_DIFFUSERS, +} + +KOHYA_STATE_DICT_MAPPINGS = {StateDictType.PEFT: PEFT_TO_KOHYA_SS} + +KEYS_TO_ALWAYS_REPLACE = { + ".processor.": ".", +} + + +def convert_state_dict(state_dict, mapping): + r""" + Simply iterates over the state dict and replaces the patterns in `mapping` with the corresponding values. + + Args: + state_dict (`dict[str, torch.Tensor]`): + The state dict to convert. 
+ mapping (`dict[str, str]`): + The mapping to use for conversion, the mapping should be a dictionary with the following structure: + - key: the pattern to replace + - value: the pattern to replace with + + Returns: + converted_state_dict (`dict`) + The converted state dict. + """ + converted_state_dict = {} + for k, v in state_dict.items(): + # First, filter out the keys that we always want to replace + for pattern in KEYS_TO_ALWAYS_REPLACE.keys(): + if pattern in k: + new_pattern = KEYS_TO_ALWAYS_REPLACE[pattern] + k = k.replace(pattern, new_pattern) + + for pattern in mapping.keys(): + if pattern in k: + new_pattern = mapping[pattern] + k = k.replace(pattern, new_pattern) + break + converted_state_dict[k] = v + return converted_state_dict + + +def convert_state_dict_to_peft(state_dict, original_type=None, **kwargs): + r""" + Converts a state dict to the PEFT format The state dict can be from previous diffusers format (`OLD_DIFFUSERS`), or + new diffusers format (`DIFFUSERS`). The method only supports the conversion from diffusers old/new to PEFT for now. + + Args: + state_dict (`dict[str, torch.Tensor]`): + The state dict to convert. + original_type (`StateDictType`, *optional*): + The original type of the state dict, if not provided, the method will try to infer it automatically. + """ + if original_type is None: + # Old diffusers to PEFT + if any("to_out_lora" in k for k in state_dict.keys()): + original_type = StateDictType.DIFFUSERS_OLD + elif any("lora_linear_layer" in k for k in state_dict.keys()): + original_type = StateDictType.DIFFUSERS + else: + raise ValueError("Could not automatically infer state dict type") + + if original_type not in PEFT_STATE_DICT_MAPPINGS.keys(): + raise ValueError(f"Original type {original_type} is not supported") + + mapping = PEFT_STATE_DICT_MAPPINGS[original_type] + return convert_state_dict(state_dict, mapping) + + +def convert_state_dict_to_diffusers(state_dict, original_type=None, **kwargs): + r""" + Converts a state dict to new diffusers format. The state dict can be from previous diffusers format + (`OLD_DIFFUSERS`), or PEFT format (`PEFT`) or new diffusers format (`DIFFUSERS`). In the last case the method will + return the state dict as is. + + The method only supports the conversion from diffusers old, PEFT to diffusers new for now. + + Args: + state_dict (`dict[str, torch.Tensor]`): + The state dict to convert. + original_type (`StateDictType`, *optional*): + The original type of the state dict, if not provided, the method will try to infer it automatically. + kwargs (`dict`, *args*): + Additional arguments to pass to the method. + + - **adapter_name**: For example, in case of PEFT, some keys will be pre-pended + with the adapter name, therefore needs a special handling. By default PEFT also takes care of that in + `get_peft_model_state_dict` method: + https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/utils/save_and_load.py#L92 + but we add it here in case we don't want to rely on that method. + """ + peft_adapter_name = kwargs.pop("adapter_name", None) + if peft_adapter_name is not None: + peft_adapter_name = "." 
+ peft_adapter_name + else: + peft_adapter_name = "" + + if original_type is None: + # Old diffusers to PEFT + if any("to_out_lora" in k for k in state_dict.keys()): + original_type = StateDictType.DIFFUSERS_OLD + elif any(f".lora_A{peft_adapter_name}.weight" in k for k in state_dict.keys()): + original_type = StateDictType.PEFT + elif any("lora_linear_layer" in k for k in state_dict.keys()): + # nothing to do + return state_dict + else: + raise ValueError("Could not automatically infer state dict type") + + if original_type not in DIFFUSERS_STATE_DICT_MAPPINGS.keys(): + raise ValueError(f"Original type {original_type} is not supported") + + mapping = DIFFUSERS_STATE_DICT_MAPPINGS[original_type] + return convert_state_dict(state_dict, mapping) + + +def convert_unet_state_dict_to_peft(state_dict): + r""" + Converts a state dict from UNet format to diffusers format - i.e. by removing some keys + """ + mapping = UNET_TO_DIFFUSERS + return convert_state_dict(state_dict, mapping) + + +def convert_all_state_dict_to_peft(state_dict): + r""" + Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` + for a valid `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft` + """ + try: + peft_dict = convert_state_dict_to_peft(state_dict) + except Exception as e: + if str(e) == "Could not automatically infer state dict type": + peft_dict = convert_unet_state_dict_to_peft(state_dict) + else: + raise + + if not any("lora_A" in key or "lora_B" in key for key in peft_dict.keys()): + raise ValueError("Your LoRA was not converted to PEFT") + + return peft_dict + + +def convert_state_dict_to_kohya(state_dict, original_type=None, **kwargs): + r""" + Converts a `PEFT` state dict to `Kohya` format that can be used in AUTOMATIC1111, ComfyUI, SD.Next, InvokeAI, etc. + The method only supports the conversion from PEFT to Kohya for now. + + Args: + state_dict (`dict[str, torch.Tensor]`): + The state dict to convert. + original_type (`StateDictType`, *optional*): + The original type of the state dict, if not provided, the method will try to infer it automatically. + kwargs (`dict`, *args*): + Additional arguments to pass to the method. + + - **adapter_name**: For example, in case of PEFT, some keys will be pre-pended + with the adapter name, therefore needs a special handling. By default PEFT also takes care of that in + `get_peft_model_state_dict` method: + https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/utils/save_and_load.py#L92 + but we add it here in case we don't want to rely on that method. + """ + try: + import torch + except ImportError: + logger.error("Converting PEFT state dicts to Kohya requires torch to be installed.") + raise + + peft_adapter_name = kwargs.pop("adapter_name", None) + if peft_adapter_name is not None: + peft_adapter_name = "." 
+ peft_adapter_name + else: + peft_adapter_name = "" + + if original_type is None: + if any(f".lora_A{peft_adapter_name}.weight" in k for k in state_dict.keys()): + original_type = StateDictType.PEFT + + if original_type not in KOHYA_STATE_DICT_MAPPINGS.keys(): + raise ValueError(f"Original type {original_type} is not supported") + + # Use the convert_state_dict function with the appropriate mapping + kohya_ss_partial_state_dict = convert_state_dict(state_dict, KOHYA_STATE_DICT_MAPPINGS[StateDictType.PEFT]) + kohya_ss_state_dict = {} + + # Additional logic for replacing header, alpha parameters `.` with `_` in all keys + for kohya_key, weight in kohya_ss_partial_state_dict.items(): + if "text_encoder_2." in kohya_key: + kohya_key = kohya_key.replace("text_encoder_2.", "lora_te2.") + elif "text_encoder." in kohya_key: + kohya_key = kohya_key.replace("text_encoder.", "lora_te1.") + elif "unet" in kohya_key: + kohya_key = kohya_key.replace("unet", "lora_unet") + kohya_key = kohya_key.replace(".", "_", kohya_key.count(".") - 2) + kohya_key = kohya_key.replace(peft_adapter_name, "") # Kohya doesn't take names + kohya_ss_state_dict[kohya_key] = weight + if "lora_down" in kohya_key: + alpha_key = f'{kohya_key.split(".")[0]}.alpha' + kohya_ss_state_dict[alpha_key] = torch.tensor(len(weight)) + + return kohya_ss_state_dict diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/testing_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/testing_utils.py new file mode 100644 index 000000000..edbf6f31a --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/testing_utils.py @@ -0,0 +1,967 @@ +import functools +import importlib +import inspect +import io +import logging +import multiprocessing +import os +import random +import re +import struct +import sys +import tempfile +import time +import unittest +import urllib.parse +from contextlib import contextmanager +from distutils.util import strtobool +from io import BytesIO, StringIO +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import PIL.ImageOps +import requests +from numpy.linalg import norm +from packaging import version + +from .import_utils import ( + BACKENDS_MAPPING, + is_compel_available, + is_flax_available, + is_note_seq_available, + is_onnx_available, + is_opencv_available, + is_peft_available, + is_torch_available, + is_torch_version, + is_torchsde_available, + is_transformers_available, +) +from .logging import get_logger + + +global_rng = random.Random() + +logger = get_logger(__name__) + +_required_peft_version = is_peft_available() and version.parse( + version.parse(importlib.metadata.version("peft")).base_version +) > version.parse("0.5") +_required_transformers_version = is_transformers_available() and version.parse( + version.parse(importlib.metadata.version("transformers")).base_version +) > version.parse("4.33") + +USE_PEFT_BACKEND = _required_peft_version and _required_transformers_version + +if is_torch_available(): + import torch + + # Set a backend environment variable for any extra module import required for a custom accelerator + if "DIFFUSERS_TEST_BACKEND" in os.environ: + backend = os.environ["DIFFUSERS_TEST_BACKEND"] + try: + _ = importlib.import_module(backend) + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + f"Failed to import `DIFFUSERS_TEST_BACKEND` '{backend}'! 
This should be the name of an installed module \ + to enable a specified backend.):\n{e}" + ) from e + + if "DIFFUSERS_TEST_DEVICE" in os.environ: + torch_device = os.environ["DIFFUSERS_TEST_DEVICE"] + try: + # try creating device to see if provided device is valid + _ = torch.device(torch_device) + except RuntimeError as e: + raise RuntimeError( + f"Unknown testing device specified by environment variable `DIFFUSERS_TEST_DEVICE`: {torch_device}" + ) from e + logger.info(f"torch_device overrode to {torch_device}") + else: + torch_device = "cuda" if torch.cuda.is_available() else "cpu" + is_torch_higher_equal_than_1_12 = version.parse( + version.parse(torch.__version__).base_version + ) >= version.parse("1.12") + + if is_torch_higher_equal_than_1_12: + # Some builds of torch 1.12 don't have the mps backend registered. See #892 for more details + mps_backend_registered = hasattr(torch.backends, "mps") + torch_device = "mps" if (mps_backend_registered and torch.backends.mps.is_available()) else torch_device + + +def torch_all_close(a, b, *args, **kwargs): + if not is_torch_available(): + raise ValueError("PyTorch needs to be installed to use this function.") + if not torch.allclose(a, b, *args, **kwargs): + assert False, f"Max diff is absolute {(a - b).abs().max()}. Diff tensor is {(a - b).abs()}." + return True + + +def numpy_cosine_similarity_distance(a, b): + similarity = np.dot(a, b) / (norm(a) * norm(b)) + distance = 1.0 - similarity.mean() + + return distance + + +def print_tensor_test(tensor, filename="test_corrections.txt", expected_tensor_name="expected_slice"): + test_name = os.environ.get("PYTEST_CURRENT_TEST") + if not torch.is_tensor(tensor): + tensor = torch.from_numpy(tensor) + + tensor_str = str(tensor.detach().cpu().flatten().to(torch.float32)).replace("\n", "") + # format is usually: + # expected_slice = np.array([-0.5713, -0.3018, -0.9814, 0.04663, -0.879, 0.76, -1.734, 0.1044, 1.161]) + output_str = tensor_str.replace("tensor", f"{expected_tensor_name} = np.array") + test_file, test_class, test_fn = test_name.split("::") + test_fn = test_fn.split()[0] + with open(filename, "a") as f: + print(";".join([test_file, test_class, test_fn, output_str]), file=f) + + +def get_tests_dir(append_path=None): + """ + Args: + append_path: optional path to append to the tests dir path + Return: + The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is + joined after the `tests` dir the former is provided. + """ + # this function caller's __file__ + caller__file__ = inspect.stack()[1][1] + tests_dir = os.path.abspath(os.path.dirname(caller__file__)) + + while not tests_dir.endswith("tests"): + tests_dir = os.path.dirname(tests_dir) + + if append_path: + return Path(tests_dir, append_path).as_posix() + else: + return tests_dir + + +def parse_flag_from_env(key, default=False): + try: + value = os.environ[key] + except KeyError: + # KEY isn't set, default to `default`. + _value = default + else: + # KEY is set, convert it to True or False. + try: + _value = strtobool(value) + except ValueError: + # More values are supported, but let's keep the message simple. 
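+ # strtobool also accepts y/n, t/f, on/off and 0/1; the error message only lists the common case.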
+ raise ValueError(f"If set, {key} must be yes or no.") + return _value + + +_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) +_run_nightly_tests = parse_flag_from_env("RUN_NIGHTLY", default=False) + + +def floats_tensor(shape, scale=1.0, rng=None, name=None): + """Creates a random float32 tensor""" + if rng is None: + rng = global_rng + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.random() * scale) + + return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous() + + +def slow(test_case): + """ + Decorator marking a test as slow. + + Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them. + + """ + return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case) + + +def nightly(test_case): + """ + Decorator marking a test that runs nightly in the diffusers CI. + + Slow tests are skipped by default. Set the RUN_NIGHTLY environment variable to a truthy value to run them. + + """ + return unittest.skipUnless(_run_nightly_tests, "test is nightly")(test_case) + + +def require_torch(test_case): + """ + Decorator marking a test that requires PyTorch. These tests are skipped when PyTorch isn't installed. + """ + return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case) + + +def require_torch_2(test_case): + """ + Decorator marking a test that requires PyTorch 2. These tests are skipped when it isn't installed. + """ + return unittest.skipUnless(is_torch_available() and is_torch_version(">=", "2.0.0"), "test requires PyTorch 2")( + test_case + ) + + +def require_torch_gpu(test_case): + """Decorator marking a test that requires CUDA and PyTorch.""" + return unittest.skipUnless(is_torch_available() and torch_device == "cuda", "test requires PyTorch+CUDA")( + test_case + ) + + +# These decorators are for accelerator-specific behaviours that are not GPU-specific +def require_torch_accelerator(test_case): + """Decorator marking a test that requires an accelerator backend and PyTorch.""" + return unittest.skipUnless(is_torch_available() and torch_device != "cpu", "test requires accelerator+PyTorch")( + test_case + ) + + +def require_torch_accelerator_with_fp16(test_case): + """Decorator marking a test that requires an accelerator with support for the FP16 data type.""" + return unittest.skipUnless(_is_torch_fp16_available(torch_device), "test requires accelerator with fp16 support")( + test_case + ) + + +def require_torch_accelerator_with_fp64(test_case): + """Decorator marking a test that requires an accelerator with support for the FP64 data type.""" + return unittest.skipUnless(_is_torch_fp64_available(torch_device), "test requires accelerator with fp64 support")( + test_case + ) + + +def require_torch_accelerator_with_training(test_case): + """Decorator marking a test that requires an accelerator with support for training.""" + return unittest.skipUnless( + is_torch_available() and backend_supports_training(torch_device), + "test requires accelerator with training support", + )(test_case) + + +def skip_mps(test_case): + """Decorator marking a test to skip if torch_device is 'mps'""" + return unittest.skipUnless(torch_device != "mps", "test requires non 'mps' device")(test_case) + + +def require_flax(test_case): + """ + Decorator marking a test that requires JAX & Flax. 
These tests are skipped when one / both are not installed + """ + return unittest.skipUnless(is_flax_available(), "test requires JAX & Flax")(test_case) + + +def require_compel(test_case): + """ + Decorator marking a test that requires compel: https://github.com/damian0815/compel. These tests are skipped when + the library is not installed. + """ + return unittest.skipUnless(is_compel_available(), "test requires compel")(test_case) + + +def require_onnxruntime(test_case): + """ + Decorator marking a test that requires onnxruntime. These tests are skipped when onnxruntime isn't installed. + """ + return unittest.skipUnless(is_onnx_available(), "test requires onnxruntime")(test_case) + + +def require_note_seq(test_case): + """ + Decorator marking a test that requires note_seq. These tests are skipped when note_seq isn't installed. + """ + return unittest.skipUnless(is_note_seq_available(), "test requires note_seq")(test_case) + + +def require_torchsde(test_case): + """ + Decorator marking a test that requires torchsde. These tests are skipped when torchsde isn't installed. + """ + return unittest.skipUnless(is_torchsde_available(), "test requires torchsde")(test_case) + + +def require_peft_backend(test_case): + """ + Decorator marking a test that requires PEFT backend, this would require some specific versions of PEFT and + transformers. + """ + return unittest.skipUnless(USE_PEFT_BACKEND, "test requires PEFT backend")(test_case) + + +def require_peft_version_greater(peft_version): + """ + Decorator marking a test that requires PEFT backend with a specific version, this would require some specific + versions of PEFT and transformers. + """ + + def decorator(test_case): + correct_peft_version = is_peft_available() and version.parse( + version.parse(importlib.metadata.version("peft")).base_version + ) > version.parse(peft_version) + return unittest.skipUnless( + correct_peft_version, f"test requires PEFT backend with the version greater than {peft_version}" + )(test_case) + + return decorator + + +def deprecate_after_peft_backend(test_case): + """ + Decorator marking a test that will be skipped after PEFT backend + """ + return unittest.skipUnless(not USE_PEFT_BACKEND, "test skipped in favor of PEFT backend")(test_case) + + +def require_python39_or_higher(test_case): + def python39_available(): + sys_info = sys.version_info + major, minor = sys_info.major, sys_info.minor + return major == 3 and minor >= 9 + + return unittest.skipUnless(python39_available(), "test requires Python 3.9 or higher")(test_case) + + +def load_numpy(arry: Union[str, np.ndarray], local_path: Optional[str] = None) -> np.ndarray: + if isinstance(arry, str): + if local_path is not None: + # local_path can be passed to correct images of tests + return Path(local_path, arry.split("/")[-5], arry.split("/")[-2], arry.split("/")[-1]).as_posix() + elif arry.startswith("http://") or arry.startswith("https://"): + response = requests.get(arry) + response.raise_for_status() + arry = np.load(BytesIO(response.content)) + elif os.path.isfile(arry): + arry = np.load(arry) + else: + raise ValueError( + f"Incorrect path or url, URLs must start with `http://` or `https://`, and {arry} is not a valid path" + ) + elif isinstance(arry, np.ndarray): + pass + else: + raise ValueError( + "Incorrect format used for numpy ndarray. Should be an url linking to an image, a local path, or a" + " ndarray." 
+ ) + + return arry + + +def load_pt(url: str): + response = requests.get(url) + response.raise_for_status() + arry = torch.load(BytesIO(response.content)) + return arry + + +def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image: + """ + Loads `image` to a PIL Image. + + Args: + image (`str` or `PIL.Image.Image`): + The image to convert to the PIL Image format. + Returns: + `PIL.Image.Image`: + A PIL Image. + """ + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + image = PIL.Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + image = PIL.Image.open(image) + else: + raise ValueError( + f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" + ) + elif isinstance(image, PIL.Image.Image): + image = image + else: + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." + ) + image = PIL.ImageOps.exif_transpose(image) + image = image.convert("RGB") + return image + + +def preprocess_image(image: PIL.Image, batch_size: int): + w, h = image.size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + image = image.resize((w, h), resample=PIL.Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = np.vstack([image[None].transpose(0, 3, 1, 2)] * batch_size) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> str: + if output_gif_path is None: + output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name + + image[0].save( + output_gif_path, + save_all=True, + append_images=image[1:], + optimize=False, + duration=100, + loop=0, + ) + return output_gif_path + + +@contextmanager +def buffered_writer(raw_f): + f = io.BufferedWriter(raw_f) + yield f + f.flush() + + +def export_to_ply(mesh, output_ply_path: str = None): + """ + Write a PLY file for a mesh. 
+ """
+ if output_ply_path is None:
+ output_ply_path = tempfile.NamedTemporaryFile(suffix=".ply").name
+
+ coords = mesh.verts.detach().cpu().numpy()
+ faces = mesh.faces.cpu().numpy()
+ rgb = np.stack([mesh.vertex_channels[x].detach().cpu().numpy() for x in "RGB"], axis=1)
+
+ with buffered_writer(open(output_ply_path, "wb")) as f:
+ f.write(b"ply\n")
+ f.write(b"format binary_little_endian 1.0\n")
+ f.write(bytes(f"element vertex {len(coords)}\n", "ascii"))
+ f.write(b"property float x\n")
+ f.write(b"property float y\n")
+ f.write(b"property float z\n")
+ if rgb is not None:
+ f.write(b"property uchar red\n")
+ f.write(b"property uchar green\n")
+ f.write(b"property uchar blue\n")
+ if faces is not None:
+ f.write(bytes(f"element face {len(faces)}\n", "ascii"))
+ f.write(b"property list uchar int vertex_index\n")
+ f.write(b"end_header\n")
+
+ if rgb is not None:
+ rgb = (rgb * 255.499).round().astype(int)
+ vertices = [
+ (*coord, *rgb)
+ for coord, rgb in zip(
+ coords.tolist(),
+ rgb.tolist(),
+ )
+ ]
+ format = struct.Struct("<3f3B")
+ for item in vertices:
+ f.write(format.pack(*item))
+ else:
+ format = struct.Struct("<3f")
+ for vertex in coords.tolist():
+ f.write(format.pack(*vertex))
+
+ if faces is not None:
+ format = struct.Struct("<B3I")
+ for tri in faces.tolist():
+ f.write(format.pack(len(tri), *tri))
+
+ return output_ply_path
+
+
+def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str:
+ if is_opencv_available():
+ import cv2
+ else:
+ raise ImportError(BACKENDS_MAPPING["opencv"][1].format("export_to_video"))
+ if output_video_path is None:
+ output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name
+
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+ h, w, c = video_frames[0].shape
+ video_writer = cv2.VideoWriter(output_video_path, fourcc, fps=8, frameSize=(w, h))
+ for i in range(len(video_frames)):
+ img = cv2.cvtColor(video_frames[i], cv2.COLOR_RGB2BGR)
+ video_writer.write(img)
+ return output_video_path
+
+
+def load_hf_numpy(path) -> np.ndarray:
+ base_url = "https://huggingface.co/datasets/fusing/diffusers-testing/resolve/main"
+
+ if not path.startswith("http://") and not path.startswith("https://"):
+ path = os.path.join(base_url, urllib.parse.quote(path))
+
+ return load_numpy(path)
+
+
+# --- pytest conf functions --- #
+
+# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once
+pytest_opt_registered = {}
+
+
+def pytest_addoption_shared(parser):
+ """
+ This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there.
+
+ It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest`
+ option.
+
+ """
+ option = "--make-reports"
+ if option not in pytest_opt_registered:
+ parser.addoption(
+ option,
+ action="store",
+ default=False,
+ help="generate report files. The value of this option is used as a prefix to report names",
+ )
+ pytest_opt_registered[option] = 1
+
+
+def pytest_terminal_summary_main(tr, id):
+ """
+ Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current
+ directory. The report files are prefixed with the test suite name.
+
+ This function emulates --duration and -rA pytest arguments.
+
+ This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined
+ there.
+
+ Args:
+ - tr: `terminalreporter` passed from `conftest.py`
+ - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is
+ needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other.
+ + NB: this functions taps into a private _pytest API and while unlikely, it could break should + pytest do internal changes - also it calls default internal methods of terminalreporter which + can be hijacked by various `pytest-` plugins and interfere. + + """ + from _pytest.config import create_terminal_writer + + if not len(id): + id = "tests" + + config = tr.config + orig_writer = config.get_terminal_writer() + orig_tbstyle = config.option.tbstyle + orig_reportchars = tr.reportchars + + dir = "reports" + Path(dir).mkdir(parents=True, exist_ok=True) + report_files = { + k: f"{dir}/{id}_{k}.txt" + for k in [ + "durations", + "errors", + "failures_long", + "failures_short", + "failures_line", + "passes", + "stats", + "summary_short", + "warnings", + ] + } + + # custom durations report + # note: there is no need to call pytest --durations=XX to get this separate report + # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66 + dlist = [] + for replist in tr.stats.values(): + for rep in replist: + if hasattr(rep, "duration"): + dlist.append(rep) + if dlist: + dlist.sort(key=lambda x: x.duration, reverse=True) + with open(report_files["durations"], "w") as f: + durations_min = 0.05 # sec + f.write("slowest durations\n") + for i, rep in enumerate(dlist): + if rep.duration < durations_min: + f.write(f"{len(dlist)-i} durations < {durations_min} secs were omitted") + break + f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") + + def summary_failures_short(tr): + # expecting that the reports were --tb=long (default) so we chop them off here to the last frame + reports = tr.getreports("failed") + if not reports: + return + tr.write_sep("=", "FAILURES SHORT STACK") + for rep in reports: + msg = tr._getfailureheadline(rep) + tr.write_sep("_", msg, red=True, bold=True) + # chop off the optional leading extra frames, leaving only the last one + longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) + tr._tw.line(longrepr) + # note: not printing out any rep.sections to keep the report short + + # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each + # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814 + # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g. 
+ # pytest-instafail does that) + + # report failures with line/short/long styles + config.option.tbstyle = "auto" # full tb + with open(report_files["failures_long"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_failures() + + # config.option.tbstyle = "short" # short tb + with open(report_files["failures_short"], "w") as f: + tr._tw = create_terminal_writer(config, f) + summary_failures_short(tr) + + config.option.tbstyle = "line" # one line per error + with open(report_files["failures_line"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_failures() + + with open(report_files["errors"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_errors() + + with open(report_files["warnings"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_warnings() # normal warnings + tr.summary_warnings() # final warnings + + tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) + with open(report_files["passes"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_passes() + + with open(report_files["summary_short"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.short_test_summary() + + with open(report_files["stats"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_stats() + + # restore: + tr._tw = orig_writer + tr.reportchars = orig_reportchars + config.option.tbstyle = orig_tbstyle + + +# Copied from https://github.com/huggingface/transformers/blob/000e52aec8850d3fe2f360adc6fd256e5b47fe4c/src/transformers/testing_utils.py#L1905 +def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, description: Optional[str] = None): + """ + To decorate flaky tests. They will be retried on failures. + + Args: + max_attempts (`int`, *optional*, defaults to 5): + The maximum number of attempts to retry the flaky test. + wait_before_retry (`float`, *optional*): + If provided, will wait that number of seconds before retrying the test. + description (`str`, *optional*): + A string to describe the situation (what / where / why is flaky, link to GH issue/PR comments, errors, + etc.) + """ + + def decorator(test_func_ref): + @functools.wraps(test_func_ref) + def wrapper(*args, **kwargs): + retry_count = 1 + + while retry_count < max_attempts: + try: + return test_func_ref(*args, **kwargs) + + except Exception as err: + print(f"Test failed with {err} at try {retry_count}/{max_attempts}.", file=sys.stderr) + if wait_before_retry is not None: + time.sleep(wait_before_retry) + retry_count += 1 + + return test_func_ref(*args, **kwargs) + + return wrapper + + return decorator + + +# Taken from: https://github.com/huggingface/transformers/blob/3658488ff77ff8d45101293e749263acf437f4d5/src/transformers/testing_utils.py#L1787 +def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None): + """ + To run a test in a subprocess. In particular, this can avoid (GPU) memory issue. + + Args: + test_case (`unittest.TestCase`): + The test that will run `target_func`. + target_func (`Callable`): + The function implementing the actual testing logic. + inputs (`dict`, *optional*, defaults to `None`): + The inputs that will be passed to `target_func` through an (input) queue. + timeout (`int`, *optional*, defaults to `None`): + The timeout (in seconds) that will be passed to the input and output queues. If not specified, the env. + variable `PYTEST_TIMEOUT` will be checked. If still `None`, its value will be set to `600`. 
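+ If the child process produces no result within this timeout, it is terminated and the test fails.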
+ """ + if timeout is None: + timeout = int(os.environ.get("PYTEST_TIMEOUT", 600)) + + start_methohd = "spawn" + ctx = multiprocessing.get_context(start_methohd) + + input_queue = ctx.Queue(1) + output_queue = ctx.JoinableQueue(1) + + # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle. + input_queue.put(inputs, timeout=timeout) + + process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout)) + process.start() + # Kill the child process if we can't get outputs from it in time: otherwise, the hanging subprocess prevents + # the test to exit properly. + try: + results = output_queue.get(timeout=timeout) + output_queue.task_done() + except Exception as e: + process.terminate() + test_case.fail(e) + process.join(timeout=timeout) + + if results["error"] is not None: + test_case.fail(f'{results["error"]}') + + +class CaptureLogger: + """ + Args: + Context manager to capture `logging` streams + logger: 'logging` logger object + Returns: + The captured output is available via `self.out` + Example: + ```python + >>> from diffusers import logging + >>> from diffusers.testing_utils import CaptureLogger + + >>> msg = "Testing 1, 2, 3" + >>> logging.set_verbosity_info() + >>> logger = logging.get_logger("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.py") + >>> with CaptureLogger(logger) as cl: + ... logger.info(msg) + >>> assert cl.out, msg + "\n" + ``` + """ + + def __init__(self, logger): + self.logger = logger + self.io = StringIO() + self.sh = logging.StreamHandler(self.io) + self.out = "" + + def __enter__(self): + self.logger.addHandler(self.sh) + return self + + def __exit__(self, *exc): + self.logger.removeHandler(self.sh) + self.out = self.io.getvalue() + + def __repr__(self): + return f"captured: {self.out}\n" + + +def enable_full_determinism(): + """ + Helper function for reproducible behavior during distributed training. See + - https://pytorch.org/docs/stable/notes/randomness.html for pytorch + """ + # Enable PyTorch deterministic mode. 
This potentially requires either the environment + # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set, + # depending on the CUDA version, so we set them both here + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + torch.use_deterministic_algorithms(True) + + # Enable CUDNN deterministic mode + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.backends.cuda.matmul.allow_tf32 = False + + +def disable_full_determinism(): + os.environ["CUDA_LAUNCH_BLOCKING"] = "0" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = "" + torch.use_deterministic_algorithms(False) + + +# Utils for custom and alternative accelerator devices +def _is_torch_fp16_available(device): + if not is_torch_available(): + return False + + import torch + + device = torch.device(device) + + try: + x = torch.zeros((2, 2), dtype=torch.float16).to(device) + _ = torch.mul(x, x) + return True + + except Exception as e: + if device.type == "cuda": + raise ValueError( + f"You have passed a device of type 'cuda' which should work with 'fp16', but 'cuda' does not seem to be correctly installed on your machine: {e}" + ) + + return False + + +def _is_torch_fp64_available(device): + if not is_torch_available(): + return False + + import torch + + device = torch.device(device) + + try: + x = torch.zeros((2, 2), dtype=torch.float64).to(device) + _ = torch.mul(x, x) + return True + + except Exception as e: + if device.type == "cuda": + raise ValueError( + f"You have passed a device of type 'cuda' which should work with 'fp64', but 'cuda' does not seem to be correctly installed on your machine: {e}" + ) + + return False + + +# Guard these lookups for when Torch is not used - alternative accelerator support is for PyTorch +if is_torch_available(): + # Behaviour flags + BACKEND_SUPPORTS_TRAINING = {"cuda": True, "cpu": True, "mps": False, "default": True} + + # Function definitions + BACKEND_EMPTY_CACHE = {"cuda": torch.cuda.empty_cache, "cpu": None, "mps": None, "default": None} + BACKEND_DEVICE_COUNT = {"cuda": torch.cuda.device_count, "cpu": lambda: 0, "mps": lambda: 0, "default": 0} + BACKEND_MANUAL_SEED = {"cuda": torch.cuda.manual_seed, "cpu": torch.manual_seed, "default": torch.manual_seed} + + +# This dispatches a defined function according to the accelerator from the function definitions. +def _device_agnostic_dispatch(device: str, dispatch_table: Dict[str, Callable], *args, **kwargs): + if device not in dispatch_table: + return dispatch_table["default"](*args, **kwargs) + + fn = dispatch_table[device] + + # Some device agnostic functions return values. Need to guard against 'None' instead at + # user level + if fn is None: + return None + + return fn(*args, **kwargs) + + +# These are callables which automatically dispatch the function specific to the accelerator +def backend_manual_seed(device: str, seed: int): + return _device_agnostic_dispatch(device, BACKEND_MANUAL_SEED, seed) + + +def backend_empty_cache(device: str): + return _device_agnostic_dispatch(device, BACKEND_EMPTY_CACHE) + + +def backend_device_count(device: str): + return _device_agnostic_dispatch(device, BACKEND_DEVICE_COUNT) + + +# These are callables which return boolean behaviour flags and can be used to specify some +# device agnostic alternative where the feature is unsupported. 
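+# Devices without an explicit entry fall back to the "default" flag in BACKEND_SUPPORTS_TRAINING.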
+def backend_supports_training(device: str): + if not is_torch_available(): + return False + + if device not in BACKEND_SUPPORTS_TRAINING: + device = "default" + + return BACKEND_SUPPORTS_TRAINING[device] + + +# Guard for when Torch is not available +if is_torch_available(): + # Update device function dict mapping + def update_mapping_from_spec(device_fn_dict: Dict[str, Callable], attribute_name: str): + try: + # Try to import the function directly + spec_fn = getattr(device_spec_module, attribute_name) + device_fn_dict[torch_device] = spec_fn + except AttributeError as e: + # If the function doesn't exist, and there is no default, throw an error + if "default" not in device_fn_dict: + raise AttributeError( + f"`{attribute_name}` not found in '{device_spec_path}' and no default fallback function found." + ) from e + + if "DIFFUSERS_TEST_DEVICE_SPEC" in os.environ: + device_spec_path = os.environ["DIFFUSERS_TEST_DEVICE_SPEC"] + if not Path(device_spec_path).is_file(): + raise ValueError(f"Specified path to device specification file is not found. Received {device_spec_path}") + + try: + import_name = device_spec_path[: device_spec_path.index(".py")] + except ValueError as e: + raise ValueError(f"Provided device spec file is not a Python file! Received {device_spec_path}") from e + + device_spec_module = importlib.import_module(import_name) + + try: + device_name = device_spec_module.DEVICE_NAME + except AttributeError: + raise AttributeError("Device spec file did not contain `DEVICE_NAME`") + + if "DIFFUSERS_TEST_DEVICE" in os.environ and torch_device != device_name: + msg = f"Mismatch between environment variable `DIFFUSERS_TEST_DEVICE` '{torch_device}' and device found in spec '{device_name}'\n" + msg += "Either unset `DIFFUSERS_TEST_DEVICE` or ensure it matches device spec name." + raise ValueError(msg) + + torch_device = device_name + + # Add one entry here for each `BACKEND_*` dictionary. + update_mapping_from_spec(BACKEND_MANUAL_SEED, "MANUAL_SEED_FN") + update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN") + update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN") + update_mapping_from_spec(BACKEND_SUPPORTS_TRAINING, "SUPPORTS_TRAINING") diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/torch_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/torch_utils.py new file mode 100644 index 000000000..cc9c050a7 --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/torch_utils.py @@ -0,0 +1,147 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PyTorch utilities: Utilities related to PyTorch +""" +from typing import List, Optional, Tuple, Union + +from . 
import logging +from .import_utils import is_torch_available, is_torch_version + + +if is_torch_available(): + import torch + from torch.fft import fftn, fftshift, ifftn, ifftshift + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +try: + from torch._dynamo import allow_in_graph as maybe_allow_in_graph +except (ImportError, ModuleNotFoundError): + + def maybe_allow_in_graph(cls): + return cls + + +def randn_tensor( + shape: Union[Tuple, List], + generator: Optional[Union[List["torch.Generator"], "torch.Generator"]] = None, + device: Optional["torch.device"] = None, + dtype: Optional["torch.dtype"] = None, + layout: Optional["torch.layout"] = None, +): + """A helper function to create random tensors on the desired `device` with the desired `dtype`. When + passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor + is always created on the CPU. + """ + # device on which tensor is created defaults to device + rand_device = device + batch_size = shape[0] + + layout = layout or torch.strided + device = device or torch.device("cpu") + + if generator is not None: + gen_device_type = generator.device.type if not isinstance(generator, list) else generator[0].device.type + if gen_device_type != device.type and gen_device_type == "cpu": + rand_device = "cpu" + if device != "mps": + logger.info( + f"The passed generator was created on 'cpu' even though a tensor on {device} was expected." + f" Tensors will be created on 'cpu' and then moved to {device}. Note that one can probably" + f" slighly speed up this function by passing a generator that was created on the {device} device." + ) + elif gen_device_type != device.type and gen_device_type == "cuda": + raise ValueError(f"Cannot generate a {device} tensor from a generator of type {gen_device_type}.") + + # make sure generator list of length 1 is treated like a non-list + if isinstance(generator, list) and len(generator) == 1: + generator = generator[0] + + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype, layout=layout) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device) + else: + latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype, layout=layout).to(device) + + return latents + + +def is_compiled_module(module) -> bool: + """Check whether the module was compiled with torch.compile()""" + if is_torch_version("<", "2.0.0") or not hasattr(torch, "_dynamo"): + return False + return isinstance(module, torch._dynamo.eval_frame.OptimizedModule) + + +def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.Tensor": + """Fourier filter as introduced in FreeU (https://arxiv.org/abs/2309.11497). 
+ + This version of the method comes from here: + https://github.com/huggingface/diffusers/pull/5164#issuecomment-1732638706 + """ + x = x_in + B, C, H, W = x.shape + + # Non-power of 2 images must be float32 + if (W & (W - 1)) != 0 or (H & (H - 1)) != 0: + x = x.to(dtype=torch.float32) + + # FFT + x_freq = fftn(x, dim=(-2, -1)) + x_freq = fftshift(x_freq, dim=(-2, -1)) + + B, C, H, W = x_freq.shape + mask = torch.ones((B, C, H, W), device=x.device) + + crow, ccol = H // 2, W // 2 + mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale + x_freq = x_freq * mask + + # IFFT + x_freq = ifftshift(x_freq, dim=(-2, -1)) + x_filtered = ifftn(x_freq, dim=(-2, -1)).real + + return x_filtered.to(dtype=x_in.dtype) + + +def apply_freeu( + resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs +) -> Tuple["torch.Tensor", "torch.Tensor"]: + """Applies the FreeU mechanism as introduced in https: + //arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU. + + Args: + resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied. + hidden_states (`torch.Tensor`): Inputs to the underlying block. + res_hidden_states (`torch.Tensor`): Features from the skip block corresponding to the underlying block. + s1 (`float`): Scaling factor for stage 1 to attenuate the contributions of the skip features. + s2 (`float`): Scaling factor for stage 2 to attenuate the contributions of the skip features. + b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features. + b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features. + """ + if resolution_idx == 0: + num_half_channels = hidden_states.shape[1] // 2 + hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b1"] + res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s1"]) + if resolution_idx == 1: + num_half_channels = hidden_states.shape[1] // 2 + hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b2"] + res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"]) + + return hidden_states, res_hidden_states diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/versions.py b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/versions.py new file mode 100644 index 000000000..945a3977c --- /dev/null +++ b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/versions.py @@ -0,0 +1,117 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
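+# Editor's note -- an illustrative sketch, not part of the upstream file: a requirement
+# string may carry several comma-separated constraints, all of which are checked, e.g.
+#
+#     require_version("tokenizers>=0.9,<0.12")
+#
+# raises ImportError if either bound is violated, and PackageNotFoundError if the
+# distribution is not installed at all.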
+""" +Utilities for working with package versions +""" + +import importlib.metadata +import operator +import re +import sys +from typing import Optional + +from packaging import version + + +ops = { + "<": operator.lt, + "<=": operator.le, + "==": operator.eq, + "!=": operator.ne, + ">=": operator.ge, + ">": operator.gt, +} + + +def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): + if got_ver is None or want_ver is None: + raise ValueError( + f"Unable to compare versions for {requirement}: need={want_ver} found={got_ver}. This is unusual. Consider" + f" reinstalling {pkg}." + ) + if not ops[op](version.parse(got_ver), version.parse(want_ver)): + raise ImportError( + f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" + ) + + +def require_version(requirement: str, hint: Optional[str] = None) -> None: + """ + Perform a runtime check of the dependency versions, using the exact same syntax used by pip. + + The installed module version comes from the *site-packages* dir via *importlib.metadata*. + + Args: + requirement (`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" + hint (`str`, *optional*): what suggestion to print in case of requirements not being met + + Example: + + ```python + require_version("pandas>1.1.2") + require_version("numpy>1.18.5", "this is important to have for whatever reason") + ```""" + + hint = f"\n{hint}" if hint is not None else "" + + # non-versioned check + if re.match(r"^[\w_\-\d]+$", requirement): + pkg, op, want_ver = requirement, None, None + else: + match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) + if not match: + raise ValueError( + "requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but" + f" got {requirement}" + ) + pkg, want_full = match[0] + want_range = want_full.split(",") # there could be multiple requirements + wanted = {} + for w in want_range: + match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) + if not match: + raise ValueError( + "requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23," + f" but got {requirement}" + ) + op, want_ver = match[0] + wanted[op] = want_ver + if op not in ops: + raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") + + # special case + if pkg == "python": + got_ver = ".".join([str(x) for x in sys.version_info[:3]]) + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) + return + + # check if any version is installed + try: + got_ver = importlib.metadata.version(pkg) + except importlib.metadata.PackageNotFoundError: + raise importlib.metadata.PackageNotFoundError( + f"The '{requirement}' distribution was not found and is required by this application. 
{hint}" + ) + + # check that the right version is installed if version number or a range was provided + if want_ver is not None: + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) + + +def require_version_core(requirement): + """require_version wrapper which emits a core-specific hint on failure""" + hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main" + return require_version(requirement, hint) diff --git a/tests/executables/stable-diffusion/init_torch.sh b/tests/executables/stable-diffusion/init_torch.sh new file mode 100644 index 000000000..37ed859e4 --- /dev/null +++ b/tests/executables/stable-diffusion/init_torch.sh @@ -0,0 +1,26 @@ +#!/bin/bash +ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" +SRC_DIR=$ROOT_DIR/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch +DATA_DIR=$ROOT_DIR/data + +# install packages +pip3 install --no-index --find-links=$DATA_DIR/packages IXPyLogger==1.0.0 +pip3 install --upgrade pillow +pip3 install huggingface_hub==0.25.1 +pip3 install $DATA_DIR/packages/addons/transformers-4.38.1-py3-none-any.whl +pip3 install -r $SRC_DIR/examples/text_to_image/requirements.txt --cache-dir=$DATA_DIR/packages +bash $SRC_DIR/build_diffusers.sh && bash $SRC_DIR/install_diffusers.sh + +# unzip dataset and checkpoints +if [[ ! -d "$DATA_DIR/datasets/pokemon-blip-captions" ]]; then + echo "Unarchive pokemon-blip-captions.tar" + tar -xvf $DATA_DIR/datasets/pokemon-blip-captions.tar -C $DATA_DIR/datasets +fi +if [[ ! -d "$DATA_DIR/model_zoo/stabilityai" ]]; then + echo "Unarchive stabilityai.tar" + tar -xvf $DATA_DIR/model_zoo/stabilityai.tar -C $DATA_DIR/model_zoo +fi +if [[ ! -d "$DATA_DIR/model_zoo/stable-diffusion-v1-5" ]]; then + echo "Unarchive stable-diffusion-v1-5.zip" + unzip $DATA_DIR/model_zoo/stable-diffusion-v1-5.zip -d $DATA_DIR/model_zoo +fi diff --git a/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh b/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh new file mode 100644 index 000000000..36ce85c69 --- /dev/null +++ b/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh @@ -0,0 +1,63 @@ +source ../_utils/global_environment_variables.sh +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +export ENABLE_FLASH_ATTENTION_WITH_IXATTNBKD=0 + + +: ${BATCH_SIZE:=8} + +ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" +SRC_DIR=$ROOT_DIR/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image +DATASET_NAME=$ROOT_DIR/data/datasets/pokemon-blip-captions +MODEL_NAME=$ROOT_DIR/data/model_zoo/stabilityai/stable-diffusion-2-1 +export DRT_MEMCPYUSEKERNEL=20000000000 + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +# Environment variables used for training; they can improve performance +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=0 +export USE_APEX_LN=1 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN + + +# Adjust the number of devices according to the BI environment +cd $SRC_DIR +actual_num_devices=$(ixsmi --list-gpus | wc -l) +num_devices=$(awk '/num_processes:/ {print $2}' $SRC_DIR/default_config.yaml) +echo $num_devices +echo $actual_num_devices +if [ "$num_devices" != "$actual_num_devices" ]; then + echo "num_devices does not match actual_num_devices."
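+    # Rewrite num_processes in default_config.yaml so that `accelerate launch` starts one
+    # process per detected GPU (e.g. "num_processes: 8" becomes "num_processes: 4" on a 4-GPU node).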
+ sed -i "s/^num_processes:.*/num_processes: $actual_num_devices/" $SRC_DIR/default_config.yaml +fi + +# 开始训练 +accelerate launch --config_file default_config.yaml --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$DATASET_NAME \ + --resolution=512 \ + --seed 42 \ + --center_crop \ + --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --NHWC \ + --dataloader_num_workers=32 \ + --apex_fused_adam "$@"; check_status + + exit ${EXIT_STATUS} -- Gitee From 954dd9256b5e3689025a4e51865dc87a7e6a9e29 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 14:52:09 +0800 Subject: [PATCH 14/20] sync resnet50 paddle all --- .../resnet50/paddlepaddle/.gitignore | 16 + .../resnet50/paddlepaddle/LICENSE | 201 +++ .../resnet50/paddlepaddle/MANIFEST.in | 7 + .../resnet50/paddlepaddle/README_ch.md | 151 ++ .../resnet50/paddlepaddle/README_en.md | 135 ++ .../resnet50/paddlepaddle/__init__.py | 17 + .../resnet50/paddlepaddle/hubconf.py | 788 +++++++++ .../resnet50/paddlepaddle/ppcls/__init__.py | 20 + .../paddlepaddle/ppcls/arch/__init__.py | 134 ++ .../ppcls/arch/backbone/__init__.py | 34 + .../ppcls/arch/backbone/base/__init__.py | 0 .../ppcls/arch/backbone/base/theseus_layer.py | 301 ++++ .../backbone/legendary_models/__init__.py | 8 + .../arch/backbone/legendary_models/resnet.py | 591 +++++++ .../ppcls/arch/backbone/model_zoo/__init__.py | 0 .../arch/backbone/model_zoo/resnet_vc.py | 309 ++++ .../arch/backbone/variant_models/__init__.py | 3 + .../backbone/variant_models/resnet_variant.py | 23 + .../paddlepaddle/ppcls/arch/gears/__init__.py | 32 + .../ppcls/arch/gears/arcmargin.py | 72 + .../ppcls/arch/gears/circlemargin.py | 59 + .../ppcls/arch/gears/cosmargin.py | 55 + .../paddlepaddle/ppcls/arch/gears/fc.py | 35 + .../ppcls/arch/gears/identity_head.py | 9 + .../ppcls/arch/gears/vehicle_neck.py | 52 + .../paddlepaddle/ppcls/arch/slim/__init__.py | 16 + .../paddlepaddle/ppcls/arch/slim/prune.py | 65 + .../paddlepaddle/ppcls/arch/slim/quant.py | 55 + .../resnet50/paddlepaddle/ppcls/arch/utils.py | 53 + .../DataAugment/ResNet50_AutoAugment.yaml | 129 ++ .../DataAugment/ResNet50_Baseline.yaml | 128 ++ .../ImageNet/DataAugment/ResNet50_Cutmix.yaml | 128 ++ .../ImageNet/DataAugment/ResNet50_Cutout.yaml | 131 ++ .../DataAugment/ResNet50_GridMask.yaml | 134 ++ .../DataAugment/ResNet50_HideAndSeek.yaml | 129 ++ .../ImageNet/DataAugment/ResNet50_Mixup.yaml | 128 ++ .../DataAugment/ResNet50_RandAugment.yaml | 131 ++ .../DataAugment/ResNet50_RandomErasing.yaml | 134 ++ .../configs/ImageNet/ResNet/ResNet50.yaml | 132 ++ .../ImageNet/ResNet/ResNet50_amp_4x8.yaml | 140 ++ .../configs/ImageNet/ResNet/ResNet50_vd.yaml | 130 ++ .../configs/quick_start/ResNet50_vd.yaml | 107 ++ .../paddlepaddle/ppcls/engine/__init__.py | 0 .../paddlepaddle/ppcls/engine/engine.py | 468 ++++++ .../ppcls/engine/evaluation/__init__.py | 16 + .../ppcls/engine/evaluation/classification.py | 174 ++ .../ppcls/engine/evaluation/retrieval.py | 179 ++ .../ppcls/engine/train/__init__.py | 14 + .../paddlepaddle/ppcls/engine/train/train.py | 82 + .../paddlepaddle/ppcls/engine/train/utils.py | 72 + .../paddlepaddle/ppcls/loss/__init__.py | 47 + .../paddlepaddle/ppcls/loss/celoss.py | 67 + .../paddlepaddle/ppcls/loss/comfunc.py | 45 + 
.../paddlepaddle/ppcls/loss/distanceloss.py | 43 + .../paddlepaddle/ppcls/metric/__init__.py | 51 + .../paddlepaddle/ppcls/metric/metrics.py | 306 ++++ .../paddlepaddle/ppcls/optimizer/__init__.py | 72 + .../ppcls/optimizer/learning_rate.py | 326 ++++ .../paddlepaddle/ppcls/optimizer/optimizer.py | 217 +++ .../paddlepaddle/ppcls/static/program.py | 449 +++++ .../paddlepaddle/ppcls/static/save_load.py | 139 ++ .../paddlepaddle/ppcls/static/train.py | 212 +++ .../paddlepaddle/ppcls/utils/__init__.py | 27 + .../paddlepaddle/ppcls/utils/check.py | 151 ++ .../paddlepaddle/ppcls/utils/config.py | 210 +++ .../paddlepaddle/ppcls/utils/download.py | 319 ++++ .../resnet50/paddlepaddle/ppcls/utils/ema.py | 63 + .../feature_maps_visualization/fm_vis.py | 97 ++ .../feature_maps_visualization/resnet.py | 535 ++++++ .../utils/feature_maps_visualization/utils.py | 85 + .../paddlepaddle/ppcls/utils/gallery2fc.py | 119 ++ .../ppcls/utils/imagenet1k_label_list.txt | 1000 ++++++++++++ .../paddlepaddle/ppcls/utils/logger.py | 138 ++ .../paddlepaddle/ppcls/utils/metrics.py | 107 ++ .../resnet50/paddlepaddle/ppcls/utils/misc.py | 63 + .../paddlepaddle/ppcls/utils/model_zoo.py | 213 +++ .../paddlepaddle/ppcls/utils/pretrained.list | 121 ++ .../paddlepaddle/ppcls/utils/profiler.py | 111 ++ .../paddlepaddle/ppcls/utils/save_load.py | 136 ++ .../paddlepaddle/ppcls_2.5/__init__.py | 20 + .../paddlepaddle/ppcls_2.5/arch/__init__.py | 134 ++ .../ppcls_2.5/arch/backbone/__init__.py | 34 + .../ppcls_2.5/arch/backbone/base/__init__.py | 0 .../arch/backbone/base/theseus_layer.py | 301 ++++ .../backbone/legendary_models/__init__.py | 8 + .../arch/backbone/legendary_models/resnet.py | 591 +++++++ .../arch/backbone/model_zoo/__init__.py | 0 .../arch/backbone/model_zoo/resnet_vc.py | 309 ++++ .../arch/backbone/variant_models/__init__.py | 3 + .../backbone/variant_models/resnet_variant.py | 23 + .../ppcls_2.5/arch/gears/__init__.py | 32 + .../ppcls_2.5/arch/gears/arcmargin.py | 72 + .../ppcls_2.5/arch/gears/circlemargin.py | 59 + .../ppcls_2.5/arch/gears/cosmargin.py | 55 + .../paddlepaddle/ppcls_2.5/arch/gears/fc.py | 35 + .../ppcls_2.5/arch/gears/identity_head.py | 9 + .../ppcls_2.5/arch/gears/vehicle_neck.py | 52 + .../ppcls_2.5/arch/slim/__init__.py | 16 + .../paddlepaddle/ppcls_2.5/arch/slim/prune.py | 65 + .../paddlepaddle/ppcls_2.5/arch/slim/quant.py | 55 + .../paddlepaddle/ppcls_2.5/arch/utils.py | 53 + .../DataAugment/ResNet50_AutoAugment.yaml | 129 ++ .../DataAugment/ResNet50_Baseline.yaml | 128 ++ .../ImageNet/DataAugment/ResNet50_Cutmix.yaml | 128 ++ .../ImageNet/DataAugment/ResNet50_Cutout.yaml | 131 ++ .../DataAugment/ResNet50_GridMask.yaml | 134 ++ .../DataAugment/ResNet50_HideAndSeek.yaml | 129 ++ .../ImageNet/DataAugment/ResNet50_Mixup.yaml | 128 ++ .../DataAugment/ResNet50_RandAugment.yaml | 131 ++ .../DataAugment/ResNet50_RandomErasing.yaml | 134 ++ .../configs/ImageNet/ResNet/ResNet50.yaml | 132 ++ .../ImageNet/ResNet/ResNet50_amp_4x8.yaml | 140 ++ .../configs/ImageNet/ResNet/ResNet50_vd.yaml | 130 ++ .../configs/quick_start/ResNet50_vd.yaml | 107 ++ .../paddlepaddle/ppcls_2.5/engine/__init__.py | 0 .../paddlepaddle/ppcls_2.5/engine/engine.py | 468 ++++++ .../ppcls_2.5/engine/evaluation/__init__.py | 16 + .../engine/evaluation/classification.py | 174 ++ .../ppcls_2.5/engine/evaluation/retrieval.py | 179 ++ .../ppcls_2.5/engine/train/__init__.py | 14 + .../ppcls_2.5/engine/train/train.py | 82 + .../ppcls_2.5/engine/train/utils.py | 72 + .../paddlepaddle/ppcls_2.5/loss/__init__.py | 47 + 
.../paddlepaddle/ppcls_2.5/loss/celoss.py | 67 + .../paddlepaddle/ppcls_2.5/loss/comfunc.py | 45 + .../ppcls_2.5/loss/distanceloss.py | 43 + .../paddlepaddle/ppcls_2.5/metric/__init__.py | 51 + .../paddlepaddle/ppcls_2.5/metric/metrics.py | 306 ++++ .../ppcls_2.5/optimizer/__init__.py | 72 + .../ppcls_2.5/optimizer/learning_rate.py | 326 ++++ .../ppcls_2.5/optimizer/optimizer.py | 217 +++ .../paddlepaddle/ppcls_2.5/static/program.py | 449 +++++ .../ppcls_2.5/static/save_load.py | 139 ++ .../paddlepaddle/ppcls_2.5/static/train.py | 212 +++ .../paddlepaddle/ppcls_2.5/utils/__init__.py | 27 + .../paddlepaddle/ppcls_2.5/utils/check.py | 151 ++ .../paddlepaddle/ppcls_2.5/utils/config.py | 210 +++ .../paddlepaddle/ppcls_2.5/utils/download.py | 319 ++++ .../paddlepaddle/ppcls_2.5/utils/ema.py | 63 + .../feature_maps_visualization/fm_vis.py | 97 ++ .../feature_maps_visualization/resnet.py | 535 ++++++ .../utils/feature_maps_visualization/utils.py | 85 + .../ppcls_2.5/utils/gallery2fc.py | 119 ++ .../ppcls_2.5/utils/imagenet1k_label_list.txt | 1000 ++++++++++++ .../paddlepaddle/ppcls_2.5/utils/logger.py | 138 ++ .../paddlepaddle/ppcls_2.5/utils/metrics.py | 107 ++ .../paddlepaddle/ppcls_2.5/utils/misc.py | 63 + .../paddlepaddle/ppcls_2.5/utils/model_zoo.py | 213 +++ .../ppcls_2.5/utils/pretrained.list | 121 ++ .../paddlepaddle/ppcls_2.5/utils/profiler.py | 111 ++ .../paddlepaddle/ppcls_2.5/utils/save_load.py | 136 ++ .../paddlepaddle/ppcls_2.6/__init__.py | 20 + .../paddlepaddle/ppcls_2.6/arch/__init__.py | 177 ++ .../ppcls_2.6/arch/backbone/__init__.py | 118 ++ .../ppcls_2.6/arch/backbone/base/__init__.py | 0 .../arch/backbone/base/dbb/dbb_block.py | 365 +++++ .../arch/backbone/base/dbb/dbb_transforms.py | 73 + .../arch/backbone/base/theseus_layer.py | 398 +++++ .../backbone/legendary_models/__init__.py | 8 + .../legendary_models/custom_devices_layers.py | 37 + .../arch/backbone/legendary_models/esnet.py | 369 +++++ .../arch/backbone/legendary_models/hrnet.py | 797 +++++++++ .../backbone/legendary_models/inception_v3.py | 559 +++++++ .../backbone/legendary_models/mobilenet_v1.py | 259 +++ .../backbone/legendary_models/mobilenet_v3.py | 591 +++++++ .../backbone/legendary_models/mobilenet_v4.py | 836 ++++++++++ .../backbone/legendary_models/pp_hgnet.py | 376 +++++ .../backbone/legendary_models/pp_hgnet_v2.py | 706 ++++++++ .../backbone/legendary_models/pp_lcnet.py | 520 ++++++ .../backbone/legendary_models/pp_lcnet_v2.py | 417 +++++ .../arch/backbone/legendary_models/resnet.py | 653 ++++++++ .../legendary_models/swin_transformer.py | 1002 ++++++++++++ .../arch/backbone/legendary_models/vgg.py | 261 +++ .../arch/backbone/model_zoo/__init__.py | 0 .../arch/backbone/model_zoo/adaface_ir_net.py | 529 ++++++ .../arch/backbone/model_zoo/alexnet.py | 170 ++ .../ppcls_2.6/arch/backbone/model_zoo/cae.py | 860 ++++++++++ .../arch/backbone/model_zoo/convnext.py | 282 ++++ .../arch/backbone/model_zoo/cspnet.py | 377 +++++ .../backbone/model_zoo/cswin_transformer.py | 651 ++++++++ .../ppcls_2.6/arch/backbone/model_zoo/cvt.py | 723 +++++++++ .../arch/backbone/model_zoo/darknet.py | 199 +++ .../arch/backbone/model_zoo/densenet.py | 346 ++++ .../model_zoo/distilled_vision_transformer.py | 273 ++++ .../ppcls_2.6/arch/backbone/model_zoo/dla.py | 529 ++++++ .../ppcls_2.6/arch/backbone/model_zoo/dpn.py | 453 ++++++ .../arch/backbone/model_zoo/dsnet.py | 701 ++++++++ .../arch/backbone/model_zoo/efficientnet.py | 1028 ++++++++++++ .../backbone/model_zoo/efficientnet_v2.py | 994 ++++++++++++ 
.../arch/backbone/model_zoo/fasternet.py | 399 +++++ .../arch/backbone/model_zoo/foundation_vit.py | 1261 ++++++++++++++ .../arch/backbone/model_zoo/ghostnet.py | 364 +++++ .../arch/backbone/model_zoo/googlenet.py | 365 +++++ .../arch/backbone/model_zoo/hardnet.py | 294 ++++ .../arch/backbone/model_zoo/inception_v4.py | 479 ++++++ .../arch/backbone/model_zoo/levit.py | 590 +++++++ .../arch/backbone/model_zoo/micronet.py | 618 +++++++ .../arch/backbone/model_zoo/mixnet.py | 812 +++++++++ .../arch/backbone/model_zoo/mobilefacenet.py | 166 ++ .../arch/backbone/model_zoo/mobilenet_v2.py | 316 ++++ .../arch/backbone/model_zoo/mobilenext.py | 262 +++ .../arch/backbone/model_zoo/mobilevit.py | 479 ++++++ .../arch/backbone/model_zoo/mobilevit_v2.py | 593 +++++++ .../arch/backbone/model_zoo/mobilevit_v3.py | 1445 +++++++++++++++++ .../arch/backbone/model_zoo/nextvit.py | 643 ++++++++ .../arch/backbone/model_zoo/peleenet.py | 264 +++ .../arch/backbone/model_zoo/pvt_v2.py | 493 ++++++ .../arch/backbone/model_zoo/rednet.py | 204 +++ .../arch/backbone/model_zoo/regnet.py | 531 ++++++ .../arch/backbone/model_zoo/repvgg.py | 451 +++++ .../arch/backbone/model_zoo/res2net.py | 266 +++ .../arch/backbone/model_zoo/res2net_vd.py | 308 ++++ .../arch/backbone/model_zoo/resnest.py | 780 +++++++++ .../arch/backbone/model_zoo/resnet_vc.py | 311 ++++ .../arch/backbone/model_zoo/resnext.py | 303 ++++ .../arch/backbone/model_zoo/resnext101_wsl.py | 506 ++++++ .../arch/backbone/model_zoo/resnext_vd.py | 319 ++++ .../arch/backbone/model_zoo/rexnet.py | 283 ++++ .../arch/backbone/model_zoo/se_resnet_vd.py | 392 +++++ .../arch/backbone/model_zoo/se_resnext.py | 369 +++++ .../arch/backbone/model_zoo/se_resnext_vd.py | 311 ++++ .../arch/backbone/model_zoo/shufflenet_v2.py | 364 +++++ .../arch/backbone/model_zoo/squeezenet.py | 196 +++ .../arch/backbone/model_zoo/starnet.py | 197 +++ .../arch/backbone/model_zoo/svtrnet.py | 699 ++++++++ .../backbone/model_zoo/swin_transformer_v2.py | 1061 ++++++++++++ .../arch/backbone/model_zoo/tinynet.py | 196 +++ .../ppcls_2.6/arch/backbone/model_zoo/tnt.py | 410 +++++ .../arch/backbone/model_zoo/twins.py | 692 ++++++++ .../arch/backbone/model_zoo/uniformer.py | 552 +++++++ .../ppcls_2.6/arch/backbone/model_zoo/van.py | 362 +++++ .../backbone/model_zoo/vision_transformer.py | 459 ++++++ .../arch/backbone/model_zoo/wideresnet.py | 236 +++ .../arch/backbone/model_zoo/xception.py | 393 +++++ .../backbone/model_zoo/xception_deeplab.py | 423 +++++ .../arch/backbone/variant_models/__init__.py | 5 + .../variant_models/efficientnet_variant.py | 44 + .../variant_models/foundation_vit_variant.py | 52 + .../variant_models/pp_lcnet_variant.py | 29 + .../variant_models/pp_lcnetv2_variant.py | 56 + .../backbone/variant_models/resnet_variant.py | 203 +++ .../swin_transformer_variant.py | 355 ++++ .../backbone/variant_models/vgg_variant.py | 28 + .../ppcls_2.6/arch/distill/afd_attention.py | 123 ++ .../ppcls_2.6/arch/gears/__init__.py | 74 + .../ppcls_2.6/arch/gears/adamargin.py | 113 ++ .../ppcls_2.6/arch/gears/arcmargin.py | 74 + .../ppcls_2.6/arch/gears/bnneck.py | 56 + .../ppcls_2.6/arch/gears/circlemargin.py | 61 + .../ppcls_2.6/arch/gears/cosmargin.py | 57 + .../paddlepaddle/ppcls_2.6/arch/gears/fc.py | 48 + .../ppcls_2.6/arch/gears/frfn_neck.py | 32 + .../ppcls_2.6/arch/gears/identity_head.py | 9 + .../ppcls_2.6/arch/gears/metabnneck.py | 122 ++ .../ppcls_2.6/arch/gears/ml_decoder.py | 124 ++ .../ppcls_2.6/arch/gears/vehicle_neck.py | 52 + .../ppcls_2.6/arch/slim/__init__.py | 16 + 
.../paddlepaddle/ppcls_2.6/arch/slim/prune.py | 64 + .../paddlepaddle/ppcls_2.6/arch/slim/quant.py | 63 + .../paddlepaddle/ppcls_2.6/arch/utils.py | 99 ++ .../PPLCNet_x1_0_pedestrian_attribute.yaml | 148 ++ .../Attr/PPLCNet_x1_0_vehicle_attribute.yaml | 149 ++ .../configs/Attr/StrongBaselineAttr.yaml | 113 ++ .../CAE/cae_base_patch16_224_finetune.yaml | 169 ++ .../CAE/cae_large_patch16_224_finetune.yaml | 169 ++ .../CLIP_vit_base_patch16_224_finetune.yaml | 162 ++ .../CLIP_vit_large_patch14_224_finetune.yaml | 162 ++ .../Cartoonface/ResNet50_icartoon.yaml | 149 ++ .../ppcls_2.6/configs/DeepHash/DCH.yaml | 141 ++ .../ppcls_2.6/configs/DeepHash/DSHSD.yaml | 142 ++ .../ppcls_2.6/configs/DeepHash/LCDSH.yaml | 138 ++ ...FaceRecognition_ArcFace_MobileFaceNet.yaml | 128 ++ .../FaceRecognition_ArcFace_ResNet50.yaml | 131 ++ .../Gallery2FC_PPLCNet_x2_5.yaml | 51 + .../GeneralRecognition_PPLCNet_x2_5.yaml | 148 ++ ...eneralRecognition_PPLCNet_x2_5_binary.yaml | 145 ++ .../GeneralRecognition_PPLCNet_x2_5_dml.yaml | 188 +++ .../GeneralRecognition_PPLCNet_x2_5_udml.yaml | 193 +++ .../GeneralRecognitionV2_CLIP_vit_base.yaml | 169 ++ .../GeneralRecognitionV2_CLIP_vit_large.yaml | 169 ++ .../GeneralRecognitionV2_PPLCNetV2_base.yaml | 209 +++ .../configs/ImageNet/CSPNet/CSPDarkNet53.yaml | 143 ++ .../CSWinTransformer_base_224.yaml | 174 ++ .../CSWinTransformer_base_384.yaml | 173 ++ .../CSWinTransformer_large_224.yaml | 173 ++ .../CSWinTransformer_large_384.yaml | 173 ++ .../CSWinTransformer_small_224.yaml | 173 ++ .../CSWinTransformer_tiny_224.yaml | 173 ++ .../ImageNet/ConvNeXt/ConvNeXt_base_224.yaml | 182 +++ .../ImageNet/ConvNeXt/ConvNeXt_base_384.yaml | 182 +++ .../ImageNet/ConvNeXt/ConvNeXt_large_224.yaml | 182 +++ .../ImageNet/ConvNeXt/ConvNeXt_large_384.yaml | 182 +++ .../ImageNet/ConvNeXt/ConvNeXt_small.yaml | 182 +++ .../ImageNet/ConvNeXt/ConvNeXt_tiny.yaml | 182 +++ .../configs/ImageNet/CvT/CvT_13_224.yaml | 174 ++ .../configs/ImageNet/CvT/CvT_13_384.yaml | 170 ++ .../configs/ImageNet/CvT/CvT_21_224.yaml | 154 ++ .../configs/ImageNet/CvT/CvT_21_384.yaml | 170 ++ .../configs/ImageNet/CvT/CvT_W24_384.yaml | 170 ++ .../configs/ImageNet/DLA/DLA102.yaml | 142 ++ .../configs/ImageNet/DLA/DLA102x.yaml | 142 ++ .../configs/ImageNet/DLA/DLA102x2.yaml | 142 ++ .../configs/ImageNet/DLA/DLA169.yaml | 142 ++ .../ppcls_2.6/configs/ImageNet/DLA/DLA34.yaml | 142 ++ .../configs/ImageNet/DLA/DLA46_c.yaml | 142 ++ .../configs/ImageNet/DLA/DLA46x_c.yaml | 142 ++ .../ppcls_2.6/configs/ImageNet/DLA/DLA60.yaml | 142 ++ .../configs/ImageNet/DLA/DLA60x.yaml | 142 ++ .../configs/ImageNet/DLA/DLA60x_c.yaml | 142 ++ .../configs/ImageNet/DPN/DPN107.yaml | 142 ++ .../configs/ImageNet/DPN/DPN131.yaml | 142 ++ .../ppcls_2.6/configs/ImageNet/DPN/DPN68.yaml | 142 ++ .../ppcls_2.6/configs/ImageNet/DPN/DPN92.yaml | 142 ++ .../ppcls_2.6/configs/ImageNet/DPN/DPN98.yaml | 142 ++ .../configs/ImageNet/DSNet/DSNet_base.yaml | 169 ++ .../configs/ImageNet/DSNet/DSNet_small.yaml | 170 ++ .../configs/ImageNet/DSNet/DSNet_tiny.yaml | 169 ++ .../configs/ImageNet/DarkNet/DarkNet53.yaml | 142 ++ .../DataAugment/ResNet50_AutoAugment.yaml | 141 ++ .../DataAugment/ResNet50_Baseline.yaml | 140 ++ .../ImageNet/DataAugment/ResNet50_Cutmix.yaml | 140 ++ .../ImageNet/DataAugment/ResNet50_Cutout.yaml | 143 ++ .../DataAugment/ResNet50_GridMask.yaml | 146 ++ .../DataAugment/ResNet50_HideAndSeek.yaml | 141 ++ .../ImageNet/DataAugment/ResNet50_Mixup.yaml | 140 ++ .../DataAugment/ResNet50_RandAugment.yaml | 143 ++ .../DataAugment/ResNet50_RandomErasing.yaml 
| 146 ++ .../DeiT/DeiT_base_distilled_patch16_224.yaml | 169 ++ .../DeiT/DeiT_base_distilled_patch16_384.yaml | 169 ++ .../ImageNet/DeiT/DeiT_base_patch16_224.yaml | 169 ++ .../ImageNet/DeiT/DeiT_base_patch16_384.yaml | 169 ++ .../DeiT_small_distilled_patch16_224.yaml | 168 ++ .../ImageNet/DeiT/DeiT_small_patch16_224.yaml | 169 ++ .../DeiT/DeiT_tiny_distilled_patch16_224.yaml | 169 ++ .../ImageNet/DeiT/DeiT_tiny_patch16_224.yaml | 169 ++ .../ImageNet/DenseNet/DenseNet121.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet161.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet169.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet201.yaml | 142 ++ .../ImageNet/DenseNet/DenseNet264.yaml | 142 ++ .../Distillation/PPLCNet_x1_0_ssld.yaml | 161 ++ .../Distillation/PPLCNet_x2_5_dml.yaml | 161 ++ .../Distillation/PPLCNet_x2_5_ssld.yaml | 160 ++ .../Distillation/PPLCNet_x2_5_udml.yaml | 171 ++ ...mv3_large_x1_0_distill_mv3_small_x1_0.yaml | 167 ++ .../res2net200_vd_distill_pphgnet_base.yaml | 171 ++ .../resnet34_distill_resnet18_afd.yaml | 211 +++ .../resnet34_distill_resnet18_dist.yaml | 164 ++ .../resnet34_distill_resnet18_dkd.yaml | 166 ++ .../resnet34_distill_resnet18_mgd.yaml | 171 ++ .../resnet34_distill_resnet18_pefd.yaml | 171 ++ .../resnet34_distill_resnet18_skd.yaml | 163 ++ .../resnet34_distill_resnet18_wsl.yaml | 164 ++ .../configs/ImageNet/ESNet/ESNet_x0_25.yaml | 141 ++ .../configs/ImageNet/ESNet/ESNet_x0_5.yaml | 141 ++ .../configs/ImageNet/ESNet/ESNet_x0_75.yaml | 141 ++ .../configs/ImageNet/ESNet/ESNet_x1_0.yaml | 141 ++ .../ImageNet/EfficientNet/EfficientNetB0.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB1.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB2.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB3.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB4.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB5.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB6.yaml | 145 ++ .../ImageNet/EfficientNet/EfficientNetB7.yaml | 145 ++ .../EfficientNetV2/EfficientNetV2_S.yaml | 147 ++ .../ImageNet/FasterNet/FasterNet_L.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_M.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_S.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_T0.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_T1.yaml | 163 ++ .../ImageNet/FasterNet/FasterNet_T2.yaml | 162 ++ .../ImageNet/GhostNet/GhostNet_x0_5.yaml | 142 ++ .../ImageNet/GhostNet/GhostNet_x1_0.yaml | 142 ++ .../ImageNet/GhostNet/GhostNet_x1_3.yaml | 142 ++ .../ImageNet/HarDNet/HarDNet39_ds.yaml | 142 ++ .../configs/ImageNet/HarDNet/HarDNet68.yaml | 142 ++ .../ImageNet/HarDNet/HarDNet68_ds.yaml | 142 ++ .../configs/ImageNet/HarDNet/HarDNet85.yaml | 142 ++ .../configs/ImageNet/Inception/GoogLeNet.yaml | 141 ++ .../ImageNet/Inception/InceptionV3.yaml | 142 ++ .../ImageNet/Inception/InceptionV4.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_128.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_128S.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_192.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_256.yaml | 142 ++ .../configs/ImageNet/LeViT/LeViT_384.yaml | 142 ++ .../ImageNet/MicroNet/MicroNet_M0.yaml | 147 ++ .../ImageNet/MicroNet/MicroNet_M1.yaml | 147 ++ .../ImageNet/MicroNet/MicroNet_M2.yaml | 147 ++ .../ImageNet/MicroNet/MicroNet_M3.yaml | 152 ++ .../configs/ImageNet/MixNet/MixNet_L.yaml | 144 ++ .../configs/ImageNet/MixNet/MixNet_M.yaml | 144 ++ .../configs/ImageNet/MixNet/MixNet_S.yaml | 144 ++ .../ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml | 160 ++ .../ImageNet/MobileNetV1/MobileNetV1.yaml | 144 ++ 
.../MobileNetV1/MobileNetV1_x0_25.yaml | 142 ++ .../MobileNetV1/MobileNetV1_x0_5.yaml | 142 ++ .../MobileNetV1/MobileNetV1_x0_75.yaml | 142 ++ .../ImageNet/MobileNetV2/MobileNetV2.yaml | 142 ++ .../MobileNetV2/MobileNetV2_x0_25.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x0_5.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x0_75.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x1_5.yaml | 140 ++ .../MobileNetV2/MobileNetV2_x2_0.yaml | 140 ++ .../MobileNetV3/MobileNetV3_large_x0_35.yaml | 142 ++ .../MobileNetV3/MobileNetV3_large_x0_5.yaml | 142 ++ .../MobileNetV3/MobileNetV3_large_x0_75.yaml | 142 ++ .../MobileNetV3/MobileNetV3_large_x1_0.yaml | 143 ++ .../MobileNetV3/MobileNetV3_large_x1_25.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x0_35.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x0_5.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x0_75.yaml | 142 ++ .../MobileNetV3/MobileNetV3_small_x1_0.yaml | 143 ++ .../MobileNetV3_small_x1_0_ampo2_ultra.yaml | 141 ++ .../MobileNetV3_small_x1_0_fp32_ultra.yaml | 143 ++ .../MobileNetV3/MobileNetV3_small_x1_25.yaml | 142 ++ .../MobileNetV4/MobileNetV4_conv_large.yaml | 181 +++ .../MobileNetV4/MobileNetV4_conv_medium.yaml | 181 +++ .../MobileNetV4/MobileNetV4_conv_small.yaml | 181 +++ .../MobileNetV4/MobileNetV4_hybrid_large.yaml | 182 +++ .../MobileNetV4_hybrid_medium.yaml | 176 ++ .../ImageNet/MobileViT/MobileViT_S.yaml | 151 ++ .../ImageNet/MobileViT/MobileViT_XS.yaml | 151 ++ .../ImageNet/MobileViT/MobileViT_XXS.yaml | 151 ++ .../MobileViTV2/MobileViTV2_x0_5.yaml | 174 ++ .../MobileViTV2/MobileViTV2_x1_0.yaml | 174 ++ .../MobileViTV2/MobileViTV2_x1_5.yaml | 174 ++ .../MobileViTV2/MobileViTV2_x2_0.yaml | 174 ++ .../ImageNet/MobileViTV3/MobileViTV3_S.yaml | 153 ++ .../MobileViTV3/MobileViTV3_S_L2.yaml | 153 ++ .../ImageNet/MobileViTV3/MobileViTV3_XS.yaml | 153 ++ .../MobileViTV3/MobileViTV3_XS_L2.yaml | 153 ++ .../ImageNet/MobileViTV3/MobileViTV3_XXS.yaml | 153 ++ .../MobileViTV3/MobileViTV3_XXS_L2.yaml | 153 ++ .../MobileViTV3/MobileViTV3_x0_5.yaml | 175 ++ .../MobileViTV3/MobileViTV3_x0_75.yaml | 175 ++ .../MobileViTV3/MobileViTV3_x1_0.yaml | 175 ++ .../ImageNet/NextViT/NextViT_base_224.yaml | 172 ++ .../ImageNet/NextViT/NextViT_base_384.yaml | 172 ++ .../ImageNet/NextViT/NextViT_large_224.yaml | 172 ++ .../ImageNet/NextViT/NextViT_large_384.yaml | 172 ++ .../ImageNet/NextViT/NextViT_small_224.yaml | 172 ++ .../ImageNet/NextViT/NextViT_small_384.yaml | 172 ++ .../ImageNet/PPHGNet/PPHGNet_base.yaml | 169 ++ .../ImageNet/PPHGNet/PPHGNet_small.yaml | 170 ++ .../ImageNet/PPHGNet/PPHGNet_tiny.yaml | 170 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml | 164 ++ .../PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml | 172 ++ .../PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml | 173 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml | 164 ++ .../ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml | 164 ++ .../ImageNet/PPLCNet/PPLCNet_x0_25.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x0_35.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x0_5.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x0_75.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x1_0.yaml | 141 ++ .../PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml | 141 ++ .../PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml | 143 ++ .../ImageNet/PPLCNet/PPLCNet_x1_5.yaml | 141 ++ .../ImageNet/PPLCNet/PPLCNet_x2_0.yaml | 140 ++ .../ImageNet/PPLCNet/PPLCNet_x2_5.yaml | 142 ++ 
.../ImageNet/PPLCNetV2/PPLCNetV2_base.yaml | 145 ++ .../ImageNet/PPLCNetV2/PPLCNetV2_large.yaml | 145 ++ .../ImageNet/PPLCNetV2/PPLCNetV2_small.yaml | 145 ++ .../configs/ImageNet/PVTV2/PVT_V2_B0.yaml | 174 ++ .../configs/ImageNet/PVTV2/PVT_V2_B1.yaml | 174 ++ .../configs/ImageNet/PVTV2/PVT_V2_B2.yaml | 174 ++ .../ImageNet/PVTV2/PVT_V2_B2_Linear.yaml | 174 ++ .../configs/ImageNet/PVTV2/PVT_V2_B3.yaml | 175 ++ .../configs/ImageNet/PVTV2/PVT_V2_B4.yaml | 175 ++ .../configs/ImageNet/PVTV2/PVT_V2_B5.yaml | 175 ++ .../configs/ImageNet/PeleeNet/PeleeNet.yaml | 148 ++ .../configs/ImageNet/ReXNet/ReXNet_1_0.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_1_3.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_1_5.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_2_0.yaml | 144 ++ .../configs/ImageNet/ReXNet/ReXNet_3_0.yaml | 144 ++ .../configs/ImageNet/RedNet/RedNet101.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet152.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet26.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet38.yaml | 142 ++ .../configs/ImageNet/RedNet/RedNet50.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_12GF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_1600MF.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_16GF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_200MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_3200MF.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_32GF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_400MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_600MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_6400MF.yaml | 142 ++ .../ImageNet/RegNet/RegNetX_800MF.yaml | 142 ++ .../configs/ImageNet/RegNet/RegNetX_8GF.yaml | 142 ++ .../configs/ImageNet/RepVGG/RepVGG_A0.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_A1.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_A2.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B0.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B1.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B1g2.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B1g4.yaml | 140 ++ .../configs/ImageNet/RepVGG/RepVGG_B2.yaml | 145 ++ .../configs/ImageNet/RepVGG/RepVGG_B2g4.yaml | 145 ++ .../configs/ImageNet/RepVGG/RepVGG_B3.yaml | 149 ++ .../configs/ImageNet/RepVGG/RepVGG_B3g4.yaml | 149 ++ .../configs/ImageNet/RepVGG/RepVGG_D2se.yaml | 149 ++ .../Res2Net/Res2Net101_vd_26w_4s.yaml | 142 ++ .../Res2Net/Res2Net200_vd_26w_4s.yaml | 142 ++ .../ImageNet/Res2Net/Res2Net50_14w_8s.yaml | 142 ++ .../ImageNet/Res2Net/Res2Net50_26w_4s.yaml | 142 ++ .../ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml | 142 ++ .../configs/ImageNet/ResNeSt/ResNeSt101.yaml | 143 ++ .../configs/ImageNet/ResNeSt/ResNeSt200.yaml | 143 ++ .../configs/ImageNet/ResNeSt/ResNeSt269.yaml | 143 ++ .../configs/ImageNet/ResNeSt/ResNeSt50.yaml | 143 ++ .../ResNeSt/ResNeSt50_fast_1s1x64d.yaml | 143 ++ .../ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml | 142 ++ .../ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet101.yaml | 144 ++ .../configs/ImageNet/ResNet/ResNet101_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet152.yaml | 144 ++ .../configs/ImageNet/ResNet/ResNet152_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet18.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet18_dbb.yaml | 153 ++ .../configs/ImageNet/ResNet/ResNet18_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet200_vd.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet34.yaml | 142 ++ .../configs/ImageNet/ResNet/ResNet34_vd.yaml | 142 ++ 
.../configs/ImageNet/ResNet/ResNet50.yaml | 145 ++ .../ImageNet/ResNet/ResNet50_amp_O1.yaml | 144 ++ .../ResNet/ResNet50_amp_O1_ultra.yaml | 150 ++ .../ResNet/ResNet50_amp_O2_ultra.yaml | 150 ++ .../ImageNet/ResNet/ResNet50_ampo2_ultra.yaml | 144 ++ .../ImageNet/ResNet/ResNet50_fp32_ultra.yaml | 146 ++ .../configs/ImageNet/ResNet/ResNet50_vd.yaml | 142 ++ .../configs/ImageNet/SENet/SENet154_vd.yaml | 142 ++ .../ImageNet/SENet/SE_ResNeXt101_32x4d.yaml | 142 ++ .../SE_ResNeXt101_32x4d_amp_O2_ultra.yaml | 144 ++ .../ImageNet/SENet/SE_ResNeXt50_32x4d.yaml | 142 ++ .../ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml | 142 ++ .../ImageNet/SENet/SE_ResNet18_vd.yaml | 142 ++ .../ImageNet/SENet/SE_ResNet34_vd.yaml | 142 ++ .../ImageNet/SENet/SE_ResNet50_vd.yaml | 142 ++ .../ShuffleNet/ShuffleNetV2_swish.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x0_25.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x0_33.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x0_5.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x1_0.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x1_5.yaml | 141 ++ .../ShuffleNet/ShuffleNetV2_x2_0.yaml | 141 ++ .../ImageNet/SqueezeNet/SqueezeNet1_0.yaml | 140 ++ .../ImageNet/SqueezeNet/SqueezeNet1_1.yaml | 140 ++ .../configs/ImageNet/StarNet/StarNet_S1.yaml | 165 ++ .../configs/ImageNet/StarNet/StarNet_S2.yaml | 165 ++ .../configs/ImageNet/StarNet/StarNet_S3.yaml | 166 ++ .../configs/ImageNet/StarNet/StarNet_S4.yaml | 165 ++ ...nTransformer_base_patch4_window12_384.yaml | 175 ++ ...inTransformer_base_patch4_window7_224.yaml | 176 ++ ...Transformer_large_patch4_window12_384.yaml | 175 ++ ...nTransformer_large_patch4_window7_224.yaml | 175 ++ ...nTransformer_small_patch4_window7_224.yaml | 175 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 175 ++ ...ransformerV2_base_patch4_window16_256.yaml | 172 ++ ...ransformerV2_base_patch4_window24_384.yaml | 172 ++ ...TransformerV2_base_patch4_window8_256.yaml | 172 ++ ...ansformerV2_large_patch4_window16_256.yaml | 172 ++ ...ansformerV2_large_patch4_window24_384.yaml | 172 ++ ...ansformerV2_small_patch4_window16_256.yaml | 172 ++ ...ransformerV2_small_patch4_window8_256.yaml | 172 ++ ...ransformerV2_tiny_patch4_window16_256.yaml | 172 ++ ...TransformerV2_tiny_patch4_window8_256.yaml | 172 ++ .../configs/ImageNet/TNT/TNT_base.yaml | 146 ++ .../configs/ImageNet/TNT/TNT_small.yaml | 146 ++ .../configs/ImageNet/TinyNet/TinyNet_A.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_B.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_C.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_D.yaml | 167 ++ .../configs/ImageNet/TinyNet/TinyNet_E.yaml | 167 ++ .../configs/ImageNet/Twins/alt_gvt_base.yaml | 174 ++ .../configs/ImageNet/Twins/alt_gvt_large.yaml | 174 ++ .../configs/ImageNet/Twins/alt_gvt_small.yaml | 174 ++ .../configs/ImageNet/Twins/pcpvt_base.yaml | 174 ++ .../configs/ImageNet/Twins/pcpvt_large.yaml | 174 ++ .../configs/ImageNet/Twins/pcpvt_small.yaml | 174 ++ .../ImageNet/UniFormer/UniFormer_base.yaml | 174 ++ .../ImageNet/UniFormer/UniFormer_base_ls.yaml | 174 ++ .../ImageNet/UniFormer/UniFormer_small.yaml | 174 ++ .../UniFormer/UniFormer_small_plus.yaml | 174 ++ .../UniFormer/UniFormer_small_plus_dim64.yaml | 174 ++ .../configs/ImageNet/VAN/VAN_B0.yaml | 170 ++ .../configs/ImageNet/VAN/VAN_B1.yaml | 170 ++ .../configs/ImageNet/VAN/VAN_B2.yaml | 170 ++ .../configs/ImageNet/VAN/VAN_B3.yaml | 170 ++ .../ViT_base_patch16_224.yaml | 142 ++ .../ViT_base_patch16_384.yaml | 142 ++ .../ViT_base_patch32_384.yaml | 142 ++ .../ViT_large_patch16_224.yaml | 142 ++ 
.../ViT_large_patch16_384.yaml | 142 ++ .../ViT_large_patch32_384.yaml | 142 ++ .../ViT_small_patch16_224.yaml | 142 ++ .../configs/ImageNet/Xception/Xception41.yaml | 141 ++ .../ImageNet/Xception/Xception41_deeplab.yaml | 141 ++ .../configs/ImageNet/Xception/Xception65.yaml | 142 ++ .../ImageNet/Xception/Xception65_deeplab.yaml | 141 ++ .../configs/ImageNet/Xception/Xception71.yaml | 142 ++ .../ppcls_2.6/configs/Logo/ResNet50_ReID.yaml | 151 ++ ...P_vit_base_patch16_448_ml_decoder_448.yaml | 172 ++ .../PP-HGNetV2-B0_ml_decoder_448.yaml | 168 ++ .../PP-HGNetV2-B4_ml_decoder_448.yaml | 168 ++ .../PP-HGNetV2-B6_ml_decoder_448.yaml | 168 ++ .../PP-LCNet_x1_0_ml_decoder_448.yaml | 170 ++ .../MultiLabelCOCO/MLDecoder/README.md | 272 ++++ .../MLDecoder/ResNet101_ml_decoder_448.yaml | 168 ++ .../MLDecoder/ResNet50_ml_decoder_448.yaml | 168 ++ .../car_exists/MobileNetV3_small_x0_35.yaml | 139 ++ .../configs/PULC/car_exists/PPLCNet_x1_0.yaml | 152 ++ .../car_exists/PPLCNet_x1_0_distillation.yaml | 169 ++ .../PULC/car_exists/PPLCNet_x1_0_search.yaml | 152 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 169 ++ .../configs/PULC/car_exists/search.yaml | 40 + .../PULC/clarity_assessment/PPLCNet_x1_0.yaml | 133 ++ .../code_exists/MobileNetV3_small_x0_35.yaml | 137 ++ .../PULC/code_exists/PPLCNet_x1_0.yaml | 145 ++ .../PPLCNet_x1_0_distillation.yaml | 167 ++ .../PULC/code_exists/PPLCNet_x1_0_search.yaml | 150 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 167 ++ .../configs/PULC/code_exists/search.yaml | 40 + .../PULC/image_orientation/PPLCNet_x1_0.yaml | 144 ++ .../MobileNetV3_small_x0_35.yaml | 132 ++ .../language_classification/PPLCNet_x1_0.yaml | 143 ++ .../PPLCNet_x1_0_distillation.yaml | 164 ++ .../PPLCNet_x1_0_search.yaml | 142 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 160 ++ .../PULC/language_classification/search.yaml | 40 + .../MobileNetV3_small_x0_35.yaml | 135 ++ .../PULC/person_attribute/PPLCNet_x1_0.yaml | 149 ++ .../PPLCNet_x1_0_Distillation.yaml | 172 ++ .../person_attribute/PPLCNet_x1_0_search.yaml | 149 ++ .../Res2Net200_vd_26w_4s.yaml | 134 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 135 ++ .../configs/PULC/person_attribute/search.yaml | 41 + .../MobileNetV3_small_x0_35.yaml | 138 ++ .../PULC/person_exists/PPLCNet_x1_0.yaml | 151 ++ .../PPLCNet_x1_0_distillation.yaml | 168 ++ .../person_exists/PPLCNet_x1_0_search.yaml | 151 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 168 ++ .../configs/PULC/person_exists/search.yaml | 40 + .../MobileNetV3_small_x0_35.yaml | 134 ++ .../PULC/safety_helmet/PPLCNet_x1_0.yaml | 148 ++ .../PPLCNet_x1_0_distillation.yaml | 185 +++ .../safety_helmet/PPLCNet_x1_0_search.yaml | 148 ++ .../safety_helmet/Res2Net200_vd_26w_4s.yaml | 137 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 159 ++ .../configs/PULC/safety_helmet/search.yaml | 36 + .../PULC/table_attribute/PPLCNet_x1_0.yaml | 133 ++ .../PPLCNet_x1_0_distillation.yaml | 155 ++ .../MobileNetV3_small_x0_35.yaml | 132 ++ .../text_image_orientation/PPLCNet_x1_0.yaml | 143 ++ .../PPLCNet_x1_0_distillation.yaml | 164 ++ .../PPLCNet_x1_0_search.yaml | 146 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 157 ++ .../PULC/text_image_orientation/search.yaml | 41 + .../MobileNetV3_small_x0_35.yaml | 134 ++ .../textline_orientation/PPLCNet_x1_0.yaml | 143 ++ .../PPLCNet_x1_0_224x224.yaml | 132 ++ .../PPLCNet_x1_0_distillation.yaml | 162 ++ .../PPLCNet_x1_0_search.yaml | 144 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 164 ++ .../PULC/textline_orientation/search.yaml | 41 + 
.../traffic_sign/MobileNetV3_samll_x0_35.yaml | 132 ++ .../PULC/traffic_sign/PPLCNet_x1_0.yaml | 148 ++ .../PPLCNet_x1_0_distillation.yaml | 172 ++ .../traffic_sign/PPLCNet_x1_0_search.yaml | 148 ++ ...inTransformer_tiny_patch4_window7_224.yaml | 170 ++ .../configs/PULC/traffic_sign/search.yaml | 41 + .../MobileNetV3_small_x0_35.yaml | 115 ++ .../PULC/vehicle_attribute/PPLCNet_x1_0.yaml | 149 ++ .../PPLCNet_x1_0_distillation.yaml | 171 ++ .../PPLCNet_x1_0_search.yaml | 129 ++ .../Res2Net200_vd_26w_4s.yaml | 122 ++ .../PULC/vehicle_attribute/ResNet50.yaml | 116 ++ .../PULC/vehicle_attribute/search.yaml | 35 + .../MV3_Large_1x_Aliproduct_DLBHC.yaml | 149 ++ .../Products/ResNet50_vd_Aliproduct.yaml | 119 ++ .../configs/Products/ResNet50_vd_Inshop.yaml | 157 ++ .../configs/Products/ResNet50_vd_SOP.yaml | 156 ++ .../configs/ResNet50_UReID_infer.yaml | 152 ++ .../ppcls_2.6/configs/SVTR/svtr_base.yml | 146 ++ .../ppcls_2.6/configs/SVTR/svtr_large.yml | 146 ++ .../ppcls_2.6/configs/SVTR/svtr_tiny.yml | 146 ++ .../configs/StrategySearch/person.yaml | 40 + .../configs/Vehicle/PPLCNet_2.5x_ReID.yaml | 158 ++ .../ppcls_2.6/configs/Vehicle/ResNet50.yaml | 130 ++ .../configs/Vehicle/ResNet50_ReID.yaml | 155 ++ .../configs/metric_learning/adaface_ir18.yaml | 105 ++ .../configs/metric_learning/xbm_resnet50.yaml | 170 ++ .../multi_scale/MobileNetV1_multi_scale.yaml | 138 ++ .../configs/practical_models/.gitkeep | 1 + .../CLIP_large_patch14_224_aesthetic.yaml | 78 + .../EfficientNetB3_watermark.yaml | 81 + .../PPHGNet_tiny_calling_halfbody.yaml | 150 ++ .../quick_start/MobileNetV1_retrieval.yaml | 157 ++ .../quick_start/MobileNetV3_large_x1_0.yaml | 130 ++ .../configs/quick_start/ResNet50_vd.yaml | 129 ++ .../kunlun/HRNet_W18_C_finetune_kunlun.yaml | 68 + .../kunlun/ResNet50_vd_finetune_kunlun.yaml | 69 + .../kunlun/VGG16_finetune_kunlun.yaml | 70 + .../kunlun/VGG19_finetune_kunlun.yaml | 70 + .../new_user/ShuffleNetV2_x0_25.yaml | 129 ++ .../professional/MobileNetV1_multilabel.yaml | 130 ++ ...ileNetV3_large_x1_0_CIFAR100_finetune.yaml | 127 ++ ...50_vd_distill_MV3_large_x1_0_CIFAR100.yaml | 151 ++ .../professional/ResNet50_vd_CIFAR100.yaml | 127 ++ .../ResNet50_vd_mixup_CIFAR100_finetune.yaml | 127 ++ .../professional/VGG19_CIFAR10_DeepHash.yaml | 147 ++ .../reid/MetaBIN_ResNet50_cross_domain.yaml | 277 ++++ .../reid/strong_baseline/baseline.yaml | 158 ++ .../reid/strong_baseline/softmax_triplet.yaml | 176 ++ .../softmax_triplet_with_center.yaml | 187 +++ ...Recognition_PPLCNet_x2_5_quantization.yaml | 154 ++ .../slim/MobileNetV3_large_x1_0_prune.yaml | 139 ++ .../MobileNetV3_large_x1_0_quantization.yaml | 138 ++ .../slim/PPLCNet_x1_0_quantization.yaml | 138 ++ .../configs/slim/ResNet50_vd_prune.yaml | 138 ++ .../slim/ResNet50_vd_quantization.yaml | 137 ++ .../slim/ResNet50_vehicle_cls_prune.yaml | 135 ++ .../ResNet50_vehicle_cls_quantization.yaml | 134 ++ .../slim/ResNet50_vehicle_reid_prune.yaml | 162 ++ .../ResNet50_vehicle_reid_quantization.yaml | 161 ++ .../FixMatchCCSSL_cifar100_10000_4gpu.yaml | 209 +++ .../FixMatchCCSSL_cifar10_4000_4gpu.yaml | 208 +++ .../ssl/FixMatch/FixMatch_cifar10_250.yaml | 175 ++ .../ssl/FixMatch/FixMatch_cifar10_40.yaml | 175 ++ .../ssl/FixMatch/FixMatch_cifar10_4000.yaml | 175 ++ .../FixMatch/FixMatch_cifar10_40_4gpu.yaml | 176 ++ .../paddlepaddle/ppcls_2.6/engine/__init__.py | 0 .../paddlepaddle/ppcls_2.6/engine/engine.py | 717 ++++++++ .../ppcls_2.6/engine/evaluation/__init__.py | 18 + .../ppcls_2.6/engine/evaluation/adaface.py | 260 +++ 
.../engine/evaluation/classification.py | 175 ++ .../engine/evaluation/face_recognition.py | 152 ++ .../ppcls_2.6/engine/evaluation/retrieval.py | 327 ++++ .../ppcls_2.6/engine/train/__init__.py | 18 + .../ppcls_2.6/engine/train/train.py | 100 ++ .../ppcls_2.6/engine/train/train_fixmatch.py | 152 ++ .../engine/train/train_fixmatch_ccssl.py | 125 ++ .../ppcls_2.6/engine/train/train_metabin.py | 251 +++ .../engine/train/train_progressive.py | 72 + .../ppcls_2.6/engine/train/utils.py | 94 ++ .../paddlepaddle/ppcls_2.6/loss/__init__.py | 91 ++ .../paddlepaddle/ppcls_2.6/loss/afdloss.py | 130 ++ .../paddlepaddle/ppcls_2.6/loss/ccssl_loss.py | 19 + .../paddlepaddle/ppcls_2.6/loss/celoss.py | 76 + .../paddlepaddle/ppcls_2.6/loss/centerloss.py | 80 + .../paddlepaddle/ppcls_2.6/loss/comfunc.py | 45 + .../ppcls_2.6/loss/contrasiveloss.py | 152 ++ .../ppcls_2.6/loss/deephashloss.py | 149 ++ .../paddlepaddle/ppcls_2.6/loss/dist_loss.py | 52 + .../ppcls_2.6/loss/distanceloss.py | 43 + .../ppcls_2.6/loss/distillationloss.py | 426 +++++ .../paddlepaddle/ppcls_2.6/loss/dkdloss.py | 68 + .../paddlepaddle/ppcls_2.6/loss/dmlloss.py | 62 + .../paddlepaddle/ppcls_2.6/loss/emlloss.py | 102 ++ .../ppcls_2.6/loss/googlenetloss.py | 43 + .../paddlepaddle/ppcls_2.6/loss/kldivloss.py | 33 + .../ppcls_2.6/loss/metabinloss.py | 206 +++ .../paddlepaddle/ppcls_2.6/loss/mgd_loss.py | 84 + .../paddlepaddle/ppcls_2.6/loss/msmloss.py | 80 + .../ppcls_2.6/loss/multilabelloss.py | 119 ++ .../paddlepaddle/ppcls_2.6/loss/npairsloss.py | 43 + .../ppcls_2.6/loss/pairwisecosface.py | 64 + .../paddlepaddle/ppcls_2.6/loss/pefdloss.py | 83 + .../paddlepaddle/ppcls_2.6/loss/rkdloss.py | 99 ++ .../paddlepaddle/ppcls_2.6/loss/skdloss.py | 72 + .../ppcls_2.6/loss/softsuploss.py | 75 + .../ppcls_2.6/loss/softtargetceloss.py | 16 + .../paddlepaddle/ppcls_2.6/loss/supconloss.py | 109 ++ .../ppcls_2.6/loss/trihardloss.py | 84 + .../paddlepaddle/ppcls_2.6/loss/triplet.py | 157 ++ .../loss/tripletangularmarginloss.py | 241 +++ .../paddlepaddle/ppcls_2.6/loss/wslloss.py | 66 + .../paddlepaddle/ppcls_2.6/loss/xbm.py | 89 + .../paddlepaddle/ppcls_2.6/metric/__init__.py | 72 + .../ppcls_2.6/metric/avg_metrics.py | 20 + .../ppcls_2.6/metric/face_metrics.py | 201 +++ .../paddlepaddle/ppcls_2.6/metric/metrics.py | 661 ++++++++ .../ppcls_2.6/optimizer/__init__.py | 137 ++ .../ppcls_2.6/optimizer/learning_rate.py | 687 ++++++++ .../ppcls_2.6/optimizer/optimizer.py | 518 ++++++ .../paddlepaddle/ppcls_2.6/static/README.md | 19 + .../paddlepaddle/ppcls_2.6/static/program.py | 445 +++++ .../paddlepaddle/ppcls_2.6/static/run_dali.sh | 8 + .../ppcls_2.6/static/save_load.py | 139 ++ .../paddlepaddle/ppcls_2.6/static/train.py | 227 +++ .../ppcls_2.6/utils/COCO2017_label_list.txt | 80 + .../utils/NUS-WIDE-SCENE_label_list.txt | 33 + .../image_orientation_label_list.txt | 4 + .../language_classification_label_list.txt | 10 + .../text_image_orientation_label_list.txt | 4 + .../textline_orientation_label_list.txt | 2 + .../traffic_sign_label_list.txt | 232 +++ .../paddlepaddle/ppcls_2.6/utils/__init__.py | 29 + .../paddlepaddle/ppcls_2.6/utils/amp.py | 61 + .../paddlepaddle/ppcls_2.6/utils/check.py | 149 ++ .../paddlepaddle/ppcls_2.6/utils/config.py | 326 ++++ .../utils/create_cls_trainval_lists.py | 111 ++ .../utils/create_coco_multilabel_lists.py | 126 ++ .../ppcls_2.6/utils/dist_utils.py | 36 + .../paddlepaddle/ppcls_2.6/utils/download.py | 304 ++++ .../paddlepaddle/ppcls_2.6/utils/ema.py | 45 + .../feature_maps_visualization/fm_vis.py | 97 ++ 
.../feature_maps_visualization/resnet.py | 535 ++++++ .../utils/feature_maps_visualization/utils.py | 85 + .../ppcls_2.6/utils/imagenet1k_label_list.txt | 1000 ++++++++++++ .../ppcls_2.6/utils/initializer.py | 318 ++++ .../paddlepaddle/ppcls_2.6/utils/logger.py | 173 ++ .../paddlepaddle/ppcls_2.6/utils/metrics.py | 107 ++ .../paddlepaddle/ppcls_2.6/utils/misc.py | 155 ++ .../paddlepaddle/ppcls_2.6/utils/model_zoo.py | 213 +++ .../utils/pedestrian_attribute_label_list.txt | 26 + .../ppcls_2.6/utils/pretrained.list | 121 ++ .../paddlepaddle/ppcls_2.6/utils/profiler.py | 129 ++ .../paddlepaddle/ppcls_2.6/utils/save_load.py | 225 +++ .../ppcls_2.6/utils/save_result.py | 102 ++ .../utils/vehicle_attribute_label_list.txt | 19 + .../resnet50/paddlepaddle/requirements.txt | 11 + .../resnet50/paddlepaddle/run_resnet50.sh | 8 + .../paddlepaddle/run_resnet50_dist.sh | 8 + .../resnet50/paddlepaddle/train.py | 40 + tests/executables/resnet/init_paddle.sh | 2 +- .../resnet/train_resnet50_dist_paddle.sh | 2 +- 837 files changed, 147131 insertions(+), 2 deletions(-) create mode 100644 cv/classification/resnet50/paddlepaddle/.gitignore create mode 100644 cv/classification/resnet50/paddlepaddle/LICENSE create mode 100644 cv/classification/resnet50/paddlepaddle/MANIFEST.in create mode 100644 cv/classification/resnet50/paddlepaddle/README_ch.md create mode 100644 cv/classification/resnet50/paddlepaddle/README_en.md create mode 100644 cv/classification/resnet50/paddlepaddle/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/hubconf.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/theseus_layer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/resnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/resnet_vc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/resnet_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/arcmargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/circlemargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/cosmargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/fc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/identity_head.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/vehicle_neck.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/prune.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/quant.py create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls/arch/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/configs/quick_start/ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/engine.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/classification.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/retrieval.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/train/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/train/train.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/engine/train/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/loss/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/loss/celoss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/loss/comfunc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/loss/distanceloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/metric/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/metric/metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/optimizer/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/optimizer/learning_rate.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/optimizer/optimizer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/static/program.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/static/save_load.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/static/train.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/check.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/config.py create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls/utils/download.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/ema.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/fm_vis.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/resnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/gallery2fc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/imagenet1k_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/logger.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/misc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/model_zoo.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/pretrained.list create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/profiler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/utils/save_load.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/theseus_layer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/resnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/resnet_vc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/resnet_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/arcmargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/circlemargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/cosmargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/fc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/identity_head.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/vehicle_neck.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/prune.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/quant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/quick_start/ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/engine.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/classification.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/retrieval.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/train.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/celoss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/comfunc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/distanceloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/learning_rate.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/optimizer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/program.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/save_load.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/train.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/check.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/config.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/download.py create 
mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/ema.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/fm_vis.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/resnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/gallery2fc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/imagenet1k_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/logger.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/misc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/model_zoo.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/pretrained.list create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/profiler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/save_load.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_block.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_transforms.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/theseus_layer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/custom_devices_layers.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/esnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/hrnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/inception_v3.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v1.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v3.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v4.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/resnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/swin_transformer.py create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/vgg.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/adaface_ir_net.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/alexnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cae.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/convnext.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cspnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cswin_transformer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cvt.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/darknet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/densenet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/distilled_vision_transformer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dla.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dpn.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dsnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/fasternet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/foundation_vit.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/ghostnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/googlenet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/hardnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/inception_v4.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/levit.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/micronet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mixnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilefacenet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenet_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenext.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v3.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/nextvit.py create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/peleenet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/pvt_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rednet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/regnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/repvgg.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net_vd.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnest.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnet_vc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext101_wsl.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext_vd.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rexnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnet_vd.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext_vd.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/shufflenet_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/squeezenet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/starnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/svtrnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/swin_transformer_v2.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tinynet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tnt.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/twins.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/uniformer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/van.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/vision_transformer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/wideresnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception_deeplab.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/efficientnet_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/foundation_vit_variant.py create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnet_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnetv2_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/resnet_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/swin_transformer_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/vgg_variant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/distill/afd_attention.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/adamargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/arcmargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/bnneck.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/circlemargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/cosmargin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/fc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/frfn_neck.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/identity_head.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/metabnneck.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/ml_decoder.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/vehicle_neck.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/prune.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/quant.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/StrongBaselineAttr.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_base_patch16_224_finetune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_large_patch16_224_finetune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Cartoonface/ResNet50_icartoon.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DCH.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DSHSD.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/LCDSH.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_MobileFaceNet.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_ResNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSPNet/CSPDarkNet53.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_W24_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA169.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA34.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46_c.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46x_c.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x_c.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN107.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN131.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN68.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN92.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN98.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_tiny.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DarkNet/DarkNet53.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet121.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet161.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet169.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet201.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet264.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB2.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB6.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB7.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_L.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_M.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_S.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet39_ds.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68_ds.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet85.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/GoogLeNet.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128S.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_192.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M3.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_L.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_M.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_S.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_S.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XS.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XXS.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PeleeNet/PeleeNet.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_2_0.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_3_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet101.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet152.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet26.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet38.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_12GF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_1600MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_16GF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_200MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_3200MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_32GF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_400MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_600MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_6400MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_800MF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_8GF.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_D2se.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt101.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt200.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt269.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_dbb.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet200_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SENet154_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml create 
mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet18_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet34_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S4.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_A.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_B.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_C.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_D.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_E.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_large.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_large.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B1.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B2.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B3.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41_deeplab.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65_deeplab.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception71.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Logo/ResNet50_ReID.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/README.md create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/image_orientation/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/MobileNetV3_samll_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/ResNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/search.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Aliproduct.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Inshop.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_SOP.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ResNet50_UReID_infer.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_base.yml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_large.yml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_tiny.yml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/StrategySearch/person.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/PPLCNet_2.5x_ReID.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50_ReID.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/adaface_ir18.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/xbm_resnet50.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/multi_scale/MobileNetV1_multi_scale.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/.gitkeep create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/EfficientNetB3_watermark.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV1_retrieval.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV3_large_x1_0.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/ResNet50_vd.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV1_multilabel.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/MetaBIN_ResNet50_cross_domain.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/baseline.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet_with_center.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_prune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_quantization.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/PPLCNet_x1_0_quantization.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_prune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_quantization.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_prune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_quantization.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_prune.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_quantization.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/engine.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/adaface.py create 
mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/classification.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/face_recognition.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/retrieval.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch_ccssl.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_metabin.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_progressive.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/afdloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/ccssl_loss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/celoss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/centerloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/comfunc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/contrasiveloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/deephashloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dist_loss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distanceloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distillationloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dkdloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dmlloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/emlloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/googlenetloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/kldivloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/metabinloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/mgd_loss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/msmloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/multilabelloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/npairsloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pairwisecosface.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pefdloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/rkdloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/skdloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softsuploss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softtargetceloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/supconloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/trihardloss.py create mode 
100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/triplet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/tripletangularmarginloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/wslloss.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/xbm.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/avg_metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/face_metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/learning_rate.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/optimizer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/README.md create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/program.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/run_dali.sh create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/save_load.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/train.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/COCO2017_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/NUS-WIDE-SCENE_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/image_orientation_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/language_classification_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/text_image_orientation_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/textline_orientation_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/traffic_sign_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/amp.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/check.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/config.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_cls_trainval_lists.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_coco_multilabel_lists.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/dist_utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/download.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/ema.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/fm_vis.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/resnet.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/utils.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/imagenet1k_label_list.txt create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/initializer.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/logger.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/metrics.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/misc.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/model_zoo.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pedestrian_attribute_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pretrained.list create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/profiler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_load.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_result.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/vehicle_attribute_label_list.txt create mode 100644 cv/classification/resnet50/paddlepaddle/requirements.txt create mode 100644 cv/classification/resnet50/paddlepaddle/run_resnet50.sh create mode 100644 cv/classification/resnet50/paddlepaddle/run_resnet50_dist.sh create mode 100644 cv/classification/resnet50/paddlepaddle/train.py diff --git a/cv/classification/resnet50/paddlepaddle/.gitignore b/cv/classification/resnet50/paddlepaddle/.gitignore new file mode 100644 index 000000000..dcf07c225 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/.gitignore @@ -0,0 +1,16 @@ +data/ +dataset/ +__pycache__/ +*.pyc +*.sw* +*/workerlog* +checkpoints/ +output*/ +pretrained/ +.ipynb_checkpoints/ +*.ipynb* +_build/ +build/ +log/ +nohup.out +.DS_Store diff --git a/cv/classification/resnet50/paddlepaddle/LICENSE b/cv/classification/resnet50/paddlepaddle/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/cv/classification/resnet50/paddlepaddle/MANIFEST.in b/cv/classification/resnet50/paddlepaddle/MANIFEST.in new file mode 100644 index 000000000..b0a4f6dc1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/MANIFEST.in @@ -0,0 +1,7 @@ +include LICENSE.txt +include README.md +include docs/en/whl_en.md +recursive-include deploy/python predict_cls.py preprocess.py postprocess.py det_preprocess.py +recursive-include deploy/utils get_image_list.py config.py logger.py predictor.py + +recursive-include ppcls/ *.py *.txt \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/README_ch.md b/cv/classification/resnet50/paddlepaddle/README_ch.md new file mode 100644 index 000000000..9219857fd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/README_ch.md @@ -0,0 +1,151 @@ +简体中文 | [English](README_en.md) + +# PaddleClas + +## 简介 + +飞桨图像识别套件PaddleClas是飞桨为工业界和学术界所准备的一个图像识别任务的工具集,助力使用者训练出更好的视觉模型和应用落地。 + +**近期更新** +- 2022.4.21 新增 CVPR2022 oral论文 [MixFormer](https://arxiv.org/pdf/2204.02557.pdf) 相关[代码](https://github.com/PaddlePaddle/PaddleClas/pull/1820/files)。 +- 2022.1.27 全面升级文档;新增[PaddleServing C++ pipeline部署方式](./deploy/paddleserving)和[18M图像识别安卓部署Demo](./deploy/lite_shitu)。 +- 2021.11.1 发布[PP-ShiTu技术报告](https://arxiv.org/pdf/2111.00775.pdf),新增饮料识别demo +- 2021.10.23 发布轻量级图像识别系统PP-ShiTu,CPU上0.2s即可完成在10w+库的图像识别。 +[点击这里](./docs/zh_CN/quick_start/quick_start_recognition.md)立即体验 +- 2021.09.17 发布PP-LCNet系列超轻量骨干网络模型, 在Intel CPU上,单张图像预测速度约5ms,ImageNet-1K数据集上Top1识别准确率达到80.82%,超越ResNet152的模型效果。PP-LCNet的介绍可以参考[论文](https://arxiv.org/pdf/2109.15099.pdf), 或者[PP-LCNet模型介绍](docs/zh_CN/models/PP-LCNet.md),相关指标和预训练权重可以从 [这里](docs/zh_CN/algorithm_introduction/ImageNet_models.md)下载。 +- [more](./docs/zh_CN/others/update_history.md) + +## 特性 + +- PP-ShiTu轻量图像识别系统:集成了目标检测、特征学习、图像检索等模块,广泛适用于各类图像识别任务。cpu上0.2s即可完成在10w+库的图像识别。 + +- PP-LCNet轻量级CPU骨干网络:专门为CPU设备打造轻量级骨干网络,速度、精度均远超竞品。 + +- 丰富的预训练模型库:提供了36个系列共175个ImageNet预训练模型,其中7个精选系列模型支持结构快速修改。 + +- 全面易用的特征学习组件:集成arcmargin, triplet loss等12度量学习方法,通过配置文件即可随意组合切换。 + +- SSLD知识蒸馏:14个分类预训练模型,精度普遍提升3%以上;其中ResNet50_vd模型在ImageNet-1k数据集上的Top-1精度达到了84.0%, +Res2Net200_vd预训练模型Top-1精度高达85.1%。 + +
+ +
+ + +## 欢迎加入技术交流群 + +* 您可以扫描下面的QQ/微信二维码(添加小助手微信并回复“C”),加入PaddleClas微信交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。 + +
+ + +
+ +## 快速体验 + +PP-ShiTu图像识别快速体验:[点击这里](./docs/zh_CN/quick_start/quick_start_recognition.md) + +## 文档教程 +- 安装说明 + - [安装Paddle](./docs/zh_CN/installation/install_paddle.md) + - [安装PaddleClas](./docs/zh_CN/installation/install_paddleclas.md) +- 快速体验 + - [PP-ShiTu图像识别快速体验](./docs/zh_CN/quick_start/quick_start_recognition.md) + - 图像分类快速体验 + - [尝鲜版](./docs/zh_CN/quick_start/quick_start_classification_new_user.md) + - [进阶版](./docs/zh_CN/quick_start/quick_start_classification_professional.md) + - [多标签分类](./docs/zh_CN/quick_start/quick_start_multilabel_classification.md) +- [PP-ShiTu图像识别系统介绍](#图像识别系统介绍) + - [主体检测](./docs/zh_CN/image_recognition_pipeline/mainbody_detection.md) + - [特征提取](./docs/zh_CN/image_recognition_pipeline/feature_extraction.md) + - [向量检索](./docs/zh_CN/image_recognition_pipeline/vector_search.md) +- [骨干网络和预训练模型库](./docs/zh_CN/algorithm_introduction/ImageNet_models.md) +- 数据准备 + - [图像分类数据集介绍](./docs/zh_CN/data_preparation/classification_dataset.md) + - [图像识别数据集介绍](./docs/zh_CN/data_preparation/recognition_dataset.md) +- 模型训练 + - [图像分类任务](./docs/zh_CN/models_training/classification.md) + - [图像识别任务](./docs/zh_CN/models_training/recognition.md) + - [训练参数调整策略](./docs/zh_CN/models_training/train_strategy.md) + - [配置文件说明](./docs/zh_CN/models_training/config_description.md) +- 模型预测部署 + - [模型导出](./docs/zh_CN/inference_deployment/export_model.md) + - Python/C++ 预测引擎 + - [基于Python预测引擎预测推理](./docs/zh_CN/inference_deployment/python_deploy.md) + - [基于C++分类预测引擎预测推理](./docs/zh_CN/inference_deployment/cpp_deploy.md)、[基于C++的PP-ShiTu预测引擎预测推理](deploy/cpp_shitu/readme.md) + - 服务化部署 + - [Paddle Serving服务化部署(推荐)](./docs/zh_CN/inference_deployment/paddle_serving_deploy.md) + - [Hub serving服务化部署](./docs/zh_CN/inference_deployment/paddle_hub_serving_deploy.md) + - [端侧部署](./deploy/lite/readme.md) + - [whl包预测](./docs/zh_CN/inference_deployment/whl_deploy.md) +- 算法介绍 + - [图像分类任务介绍](./docs/zh_CN/algorithm_introduction/image_classification.md) + - [度量学习介绍](./docs/zh_CN/algorithm_introduction/metric_learning.md) +- 高阶使用 + - [数据增广](./docs/zh_CN/advanced_tutorials/DataAugmentation.md) + - [模型量化](./docs/zh_CN/advanced_tutorials/model_prune_quantization.md) + - [知识蒸馏](./docs/zh_CN/advanced_tutorials/knowledge_distillation.md) + - [PaddleClas结构解析](./docs/zh_CN/advanced_tutorials/code_overview.md) + - [社区贡献指南](./docs/zh_CN/advanced_tutorials/how_to_contribute.md) +- FAQ + - [图像识别精选问题](docs/zh_CN/faq_series/faq_2021_s2.md) + - [图像分类精选问题](docs/zh_CN/faq_series/faq_selected_30.md) + - [图像分类FAQ第一季](docs/zh_CN/faq_series/faq_2020_s1.md) + - [图像分类FAQ第二季](docs/zh_CN/faq_series/faq_2021_s1.md) +- [许可证书](#许可证书) +- [贡献代码](#贡献代码) + + +## PP-ShiTu图像识别系统介绍 + +
+ +
+ +PP-ShiTu是一个实用的轻量级通用图像识别系统,主要由主体检测、特征学习和向量检索三个模块组成。该系统从骨干网络选择和调整、损失函数的选择、数据增强、学习率变换策略、正则化参数选择、预训练模型使用以及模型裁剪量化8个方面,采用多种策略,对各个模块的模型进行优化,最终得到在CPU上仅0.2s即可完成10w+库的图像识别的系统。更多细节请参考[PP-ShiTu技术方案](https://arxiv.org/pdf/2111.00775.pdf)。 + + + +## PP-ShiTu图像识别系统效果展示 +- 瓶装饮料识别 +
+ +
+ +- 商品识别 +
+ +
+ +- 动漫人物识别 +
+ +
+ +- logo识别 +
+ +
+ + +- 车辆识别 +
+ +
+ + + + +## 许可证书 +本项目的发布受Apache 2.0 license许可认证。 + + + +## 贡献代码 +我们非常欢迎你为PaddleClas贡献代码,也十分感谢你的反馈。 +如果想为PaddleCLas贡献代码,可以参考[贡献指南](./docs/zh_CN/advanced_tutorials/how_to_contribute.md)。 + +- 非常感谢[nblib](https://github.com/nblib)修正了PaddleClas中RandErasing的数据增广配置文件。 +- 非常感谢[chenpy228](https://github.com/chenpy228)修正了PaddleClas文档中的部分错别字。 +- 非常感谢[jm12138](https://github.com/jm12138)为PaddleClas添加ViT,DeiT系列模型和RepVGG系列模型。 diff --git a/cv/classification/resnet50/paddlepaddle/README_en.md b/cv/classification/resnet50/paddlepaddle/README_en.md new file mode 100644 index 000000000..9b0d7c85d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/README_en.md @@ -0,0 +1,135 @@ +[简体中文](README_ch.md) | English + +# PaddleClas + +## Introduction + +PaddleClas is an image recognition toolset for industry and academia, helping users train better computer vision models and apply them in real scenarios. + +**Recent updates** + +- 2022.4.21 Added the related [code](https://github.com/PaddlePaddle/PaddleClas/pull/1820/files) of the CVPR2022 oral paper [MixFormer](https://arxiv.org/pdf/2204.02557.pdf). + +- 2021.09.17 Add PP-LCNet series model developed by PaddleClas, these models show strong competitiveness on Intel CPUs. +For the introduction of PP-LCNet, please refer to [paper](https://arxiv.org/pdf/2109.15099.pdf) or [PP-LCNet model introduction](docs/en/models/PP-LCNet_en.md). The metrics and pretrained model are available [here](docs/en/ImageNet_models_en.md). + +- 2021.06.29 Add Swin-transformer series model,Highest top1 acc on ImageNet1k dataset reaches 87.2%, training, evaluation and inference are all supported. Pretrained models can be downloaded [here](docs/en/models/models_intro_en.md). +- 2021.06.16 PaddleClas release/2.2. Add metric learning and vector search modules. Add product recognition, animation character recognition, vehicle recognition and logo recognition. Added 30 pretrained models of LeViT, Twins, TNT, DLA, HarDNet, and RedNet, and the accuracy is roughly the same as that of the paper. +- [more](./docs/en/update_history_en.md) + +## Features + +- A practical image recognition system consist of detection, feature learning and retrieval modules, widely applicable to all types of image recognition tasks. +Four sample solutions are provided, including product recognition, vehicle recognition, logo recognition and animation character recognition. + +- Rich library of pre-trained models: Provide a total of 164 ImageNet pre-trained models in 35 series, among which 6 selected series of models support fast structural modification. + +- Comprehensive and easy-to-use feature learning components: 12 metric learning methods are integrated and can be combined and switched at will through configuration files. + +- SSLD knowledge distillation: The 14 classification pre-training models generally improved their accuracy by more than 3%; among them, the ResNet50_vd model achieved a Top-1 accuracy of 84.0% on the Image-Net-1k dataset and the Res2Net200_vd pre-training model achieved a Top-1 accuracy of 85.1%. + +- Data augmentation: Provide 8 data augmentation algorithms such as AutoAugment, Cutout, Cutmix, etc. with detailed introduction, code replication and evaluation of effectiveness in a unified experimental environment. + + + + +
+ +
+
+
+## Welcome to Join the Technical Exchange Group
+
+* You can also scan the QR code below to join the PaddleClas QQ group and WeChat group (add the assistant and reply "C") to get more efficient answers to your questions and to communicate with developers from all walks of life. We look forward to hearing from you.
+
+ + +
+
+## Quick Start
+Quick experience of image recognition: [Link](./docs/en/tutorials/quick_start_recognition_en.md)
+
+## Tutorials
+
+- [Quick Installation](./docs/en/tutorials/install_en.md)
+- [Quick Start of Recognition](./docs/en/tutorials/quick_start_recognition_en.md)
+- [Introduction to Image Recognition Systems](#Introduction_to_Image_Recognition_Systems)
+- [Demo images](#Demo_images)
+- Algorithms Introduction
+  - [Backbone Network and Pre-trained Model Library](./docs/en/ImageNet_models_en.md)
+  - [Mainbody Detection](./docs/en/application/mainbody_detection_en.md)
+  - [Image Classification](./docs/en/tutorials/image_classification_en.md)
+  - [Feature Learning](./docs/en/application/feature_learning_en.md)
+  - [Product Recognition](./docs/en/application/product_recognition_en.md)
+  - [Vehicle Recognition](./docs/en/application/vehicle_recognition_en.md)
+  - [Logo Recognition](./docs/en/application/logo_recognition_en.md)
+  - [Animation Character Recognition](./docs/en/application/cartoon_character_recognition_en.md)
+  - [Vector Search](./deploy/vector_search/README.md)
+- Models Training/Evaluation
+  - [Image Classification](./docs/en/tutorials/getting_started_en.md)
+  - [Feature Learning](./docs/en/tutorials/getting_started_retrieval_en.md)
+- Inference Model Prediction
+  - [Python Inference](./docs/en/inference.md)
+  - [C++ Classification Inference](./deploy/cpp/readme_en.md), [C++ PP-ShiTu Inference](deploy/cpp_shitu/readme_en.md)
+- Model Deploy (only classification is supported for now, recognition coming soon)
+  - [Hub Serving Deployment](./deploy/hubserving/readme_en.md)
+  - [Mobile Deployment](./deploy/lite/readme_en.md)
+  - [Inference Using whl](./docs/en/whl_en.md)
+- Advanced Tutorial
+  - [Knowledge Distillation](./docs/en/advanced_tutorials/distillation/distillation_en.md)
+  - [Model Quantization](./docs/en/extension/paddle_quantization_en.md)
+  - [Data Augmentation](./docs/en/advanced_tutorials/image_augmentation/ImageAugment_en.md)
+- [License](#License)
+- [Contribution](#Contribution)
+
+
+## Introduction to Image Recognition Systems
+
+ +
+
+Image recognition can be divided into three steps:
+- (1) Identify region proposals for target objects through a detection model;
+- (2) Extract features for each region proposal;
+- (3) Search the features in the retrieval database and output the results.
+
+For a new, unknown category there is no need to retrain the model: just prepare images of the new category, extract their features, and update the retrieval database, and the new category can be recognised (see the code sketch after the demo images below).
+
+## Demo images [more](https://github.com/PaddlePaddle/PaddleClas/tree/release/2.2/docs/images/recognition/more_demo_images)
+- Product recognition
+
+ +
+ +- Cartoon character recognition +
+ +
+ +- Logo recognition +
+ +
+ +- Car recognition +
+ +
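The three-step pipeline described under "Introduction to Image Recognition Systems" above boils down to a detect → extract → retrieve loop. The sketch below is a minimal illustration of that flow and is not code from this patch: the detector and feature extractor are hypothetical stubs, and only the retrieval step (cosine similarity over a small in-memory gallery) is implemented concretely.

```python
import numpy as np

def detect_regions(image: np.ndarray) -> list[np.ndarray]:
    """Stand-in for a mainbody detector: return candidate object crops."""
    h, w = image.shape[:2]
    return [image[: h // 2, : w // 2], image[h // 2 :, w // 2 :]]

def extract_feature(crop: np.ndarray, dim: int = 128) -> np.ndarray:
    """Stand-in for the feature model: map a crop to an L2-normalized embedding."""
    rng = np.random.default_rng(abs(hash(crop.tobytes())) % (2 ** 32))
    vec = rng.standard_normal(dim)
    return vec / np.linalg.norm(vec)

def search_gallery(query: np.ndarray, gallery: dict[str, np.ndarray], top_k: int = 3):
    """Rank gallery entries by cosine similarity (vectors are pre-normalized)."""
    scores = {label: float(query @ feat) for label, feat in gallery.items()}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]

if __name__ == "__main__":
    # Toy gallery; in practice the entries come from labeled product images.
    gallery = {f"sku_{i}": extract_feature(np.full((64, 64, 3), i, dtype=np.uint8))
               for i in range(5)}
    image = np.random.default_rng(0).integers(0, 256, (128, 128, 3), dtype=np.uint8)
    for crop in detect_regions(image):           # step 1: detection
        feature = extract_feature(crop)          # step 2: feature extraction
        print(search_gallery(feature, gallery))  # step 3: retrieval
```

Adding a new category then amounts to extending the gallery with features of the new images; no retraining is involved. In PP-ShiTu the stubs correspond to the mainbody detection and feature learning modules, and the brute-force dot product is replaced by the vector search module.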
+ + +## License +PaddleClas is released under the Apache 2.0 license Apache 2.0 license + + + +## Contribution +Contributions are highly welcomed and we would really appreciate your feedback!! + + +- Thank [nblib](https://github.com/nblib) to fix bug of RandErasing. +- Thank [chenpy228](https://github.com/chenpy228) to fix some typos PaddleClas. +- Thank [jm12138](https://github.com/jm12138) to add ViT, DeiT models and RepVGG models into PaddleClas. +- Thank [FutureSI](https://aistudio.baidu.com/aistudio/personalcenter/thirdview/76563) to parse and summarize the PaddleClas code. diff --git a/cv/classification/resnet50/paddlepaddle/__init__.py b/cv/classification/resnet50/paddlepaddle/__init__.py new file mode 100644 index 000000000..2128a6cc7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['PaddleClas'] +from .paddleclas import PaddleClas +from ppcls.arch.backbone import * diff --git a/cv/classification/resnet50/paddlepaddle/hubconf.py b/cv/classification/resnet50/paddlepaddle/hubconf.py new file mode 100644 index 000000000..b7f76745a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/hubconf.py @@ -0,0 +1,788 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +dependencies = ['paddle'] + +import paddle +import os +import sys + + +class _SysPathG(object): + """ + _SysPathG used to add/clean path for sys.path. Making sure minimal pkgs dependents by skiping parent dirs. 
+ + __enter__ + add path into sys.path + __exit__ + clean user's sys.path to avoid unexpect behaviors + """ + + def __init__(self, path): + self.path = path + + def __enter__(self, ): + sys.path.insert(0, self.path) + + def __exit__(self, type, value, traceback): + _p = sys.path.pop(0) + assert _p == self.path, 'Make sure sys.path cleaning {} correctly.'.format( + self.path) + + +with _SysPathG(os.path.dirname(os.path.abspath(__file__)), ): + import ppcls + import ppcls.arch.backbone as backbone + + def ppclas_init(): + if ppcls.utils.logger._logger is None: + ppcls.utils.logger.init_logger() + + ppclas_init() + + def _load_pretrained_parameters(model, name): + url = 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/{}_pretrained.pdparams'.format( + name) + path = paddle.utils.download.get_weights_path_from_url(url) + model.set_state_dict(paddle.load(path)) + return model + + def alexnet(pretrained=False, **kwargs): + """ + AlexNet + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `AlexNet` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.AlexNet(**kwargs) + + return model + + def vgg11(pretrained=False, **kwargs): + """ + VGG11 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False` + Returns: + model: nn.Layer. Specific `VGG11` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.VGG11(**kwargs) + + return model + + def vgg13(pretrained=False, **kwargs): + """ + VGG13 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False` + Returns: + model: nn.Layer. Specific `VGG13` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.VGG13(**kwargs) + + return model + + def vgg16(pretrained=False, **kwargs): + """ + VGG16 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False` + Returns: + model: nn.Layer. Specific `VGG16` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.VGG16(**kwargs) + + return model + + def vgg19(pretrained=False, **kwargs): + """ + VGG19 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False` + Returns: + model: nn.Layer. Specific `VGG19` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.VGG19(**kwargs) + + return model + + def resnet18(pretrained=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. 
Output dim of last fc layer. + input_image_channel: int=3. The number of input image channels + data_format: str='NCHW'. The data format of batch input images, should in ('NCHW', 'NHWC') + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNet18(**kwargs) + + return model + + def resnet34(pretrained=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + input_image_channel: int=3. The number of input image channels + data_format: str='NCHW'. The data format of batch input images, should in ('NCHW', 'NHWC') + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNet34(**kwargs) + + return model + + def resnet50(pretrained=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + input_image_channel: int=3. The number of input image channels + data_format: str='NCHW'. The data format of batch input images, should in ('NCHW', 'NHWC') + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNet50(**kwargs) + + return model + + def resnet101(pretrained=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + input_image_channel: int=3. The number of input image channels + data_format: str='NCHW'. The data format of batch input images, should in ('NCHW', 'NHWC') + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNet101(**kwargs) + + return model + + def resnet152(pretrained=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + input_image_channel: int=3. The number of input image channels + data_format: str='NCHW'. The data format of batch input images, should in ('NCHW', 'NHWC') + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNet152(**kwargs) + + return model + + def squeezenet1_0(pretrained=False, **kwargs): + """ + SqueezeNet1_0 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `SqueezeNet1_0` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.SqueezeNet1_0(**kwargs) + + return model + + def squeezenet1_1(pretrained=False, **kwargs): + """ + SqueezeNet1_1 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `SqueezeNet1_1` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.SqueezeNet1_1(**kwargs) + + return model + + def densenet121(pretrained=False, **kwargs): + """ + DenseNet121 + Args: + pretrained: bool=False. 
If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + dropout: float=0. Probability of setting units to zero. + bn_size: int=4. The number of channals per group + Returns: + model: nn.Layer. Specific `DenseNet121` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.DenseNet121(**kwargs) + + return model + + def densenet161(pretrained=False, **kwargs): + """ + DenseNet161 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + dropout: float=0. Probability of setting units to zero. + bn_size: int=4. The number of channals per group + Returns: + model: nn.Layer. Specific `DenseNet161` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.DenseNet161(**kwargs) + + return model + + def densenet169(pretrained=False, **kwargs): + """ + DenseNet169 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + dropout: float=0. Probability of setting units to zero. + bn_size: int=4. The number of channals per group + Returns: + model: nn.Layer. Specific `DenseNet169` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.DenseNet169(**kwargs) + + return model + + def densenet201(pretrained=False, **kwargs): + """ + DenseNet201 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + dropout: float=0. Probability of setting units to zero. + bn_size: int=4. The number of channals per group + Returns: + model: nn.Layer. Specific `DenseNet201` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.DenseNet201(**kwargs) + + return model + + def densenet264(pretrained=False, **kwargs): + """ + DenseNet264 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + dropout: float=0. Probability of setting units to zero. + bn_size: int=4. The number of channals per group + Returns: + model: nn.Layer. Specific `DenseNet264` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.DenseNet264(**kwargs) + + return model + + def inceptionv3(pretrained=False, **kwargs): + """ + InceptionV3 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `InceptionV3` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.InceptionV3(**kwargs) + + return model + + def inceptionv4(pretrained=False, **kwargs): + """ + InceptionV4 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `InceptionV4` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.InceptionV4(**kwargs) + + return model + + def googlenet(pretrained=False, **kwargs): + """ + GoogLeNet + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. 
Specific `GoogLeNet` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.GoogLeNet(**kwargs) + + return model + + def shufflenetv2_x0_25(pretrained=False, **kwargs): + """ + ShuffleNetV2_x0_25 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ShuffleNetV2_x0_25` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ShuffleNetV2_x0_25(**kwargs) + + return model + + def mobilenetv1(pretrained=False, **kwargs): + """ + MobileNetV1 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV1` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV1(**kwargs) + + return model + + def mobilenetv1_x0_25(pretrained=False, **kwargs): + """ + MobileNetV1_x0_25 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_25` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV1_x0_25(**kwargs) + + return model + + def mobilenetv1_x0_5(pretrained=False, **kwargs): + """ + MobileNetV1_x0_5 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_5` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV1_x0_5(**kwargs) + + return model + + def mobilenetv1_x0_75(pretrained=False, **kwargs): + """ + MobileNetV1_x0_75 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_75` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV1_x0_75(**kwargs) + + return model + + def mobilenetv2_x0_25(pretrained=False, **kwargs): + """ + MobileNetV2_x0_25 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV2_x0_25` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV2_x0_25(**kwargs) + + return model + + def mobilenetv2_x0_5(pretrained=False, **kwargs): + """ + MobileNetV2_x0_5 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV2_x0_5` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV2_x0_5(**kwargs) + + return model + + def mobilenetv2_x0_75(pretrained=False, **kwargs): + """ + MobileNetV2_x0_75 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV2_x0_75` model depends on args. 
+ """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV2_x0_75(**kwargs) + + return model + + def mobilenetv2_x1_5(pretrained=False, **kwargs): + """ + MobileNetV2_x1_5 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV2_x1_5` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV2_x1_5(**kwargs) + + return model + + def mobilenetv2_x2_0(pretrained=False, **kwargs): + """ + MobileNetV2_x2_0 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV2_x2_0` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV2_x2_0(**kwargs) + + return model + + def mobilenetv3_large_x0_35(pretrained=False, **kwargs): + """ + MobileNetV3_large_x0_35 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_35` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_large_x0_35(**kwargs) + + return model + + def mobilenetv3_large_x0_5(pretrained=False, **kwargs): + """ + MobileNetV3_large_x0_5 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_5` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_large_x0_5(**kwargs) + + return model + + def mobilenetv3_large_x0_75(pretrained=False, **kwargs): + """ + MobileNetV3_large_x0_75 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_75` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_large_x0_75(**kwargs) + + return model + + def mobilenetv3_large_x1_0(pretrained=False, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_large_x1_0(**kwargs) + + return model + + def mobilenetv3_large_x1_25(pretrained=False, **kwargs): + """ + MobileNetV3_large_x1_25 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_25` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_large_x1_25(**kwargs) + + return model + + def mobilenetv3_small_x0_35(pretrained=False, **kwargs): + """ + MobileNetV3_small_x0_35 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. 
Specific `MobileNetV3_small_x0_35` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_small_x0_35(**kwargs) + + return model + + def mobilenetv3_small_x0_5(pretrained=False, **kwargs): + """ + MobileNetV3_small_x0_5 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_5` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_small_x0_5(**kwargs) + + return model + + def mobilenetv3_small_x0_75(pretrained=False, **kwargs): + """ + MobileNetV3_small_x0_75 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_75` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_small_x0_75(**kwargs) + + return model + + def mobilenetv3_small_x1_0(pretrained=False, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_small_x1_0(**kwargs) + + return model + + def mobilenetv3_small_x1_25(pretrained=False, **kwargs): + """ + MobileNetV3_small_x1_25 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_25` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.MobileNetV3_small_x1_25(**kwargs) + + return model + + def resnext101_32x4d(pretrained=False, **kwargs): + """ + ResNeXt101_32x4d + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ResNeXt101_32x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNeXt101_32x4d(**kwargs) + + return model + + def resnext101_64x4d(pretrained=False, **kwargs): + """ + ResNeXt101_64x4d + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ResNeXt101_64x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNeXt101_64x4d(**kwargs) + + return model + + def resnext152_32x4d(pretrained=False, **kwargs): + """ + ResNeXt152_32x4d + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ResNeXt152_32x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNeXt152_32x4d(**kwargs) + + return model + + def resnext152_64x4d(pretrained=False, **kwargs): + """ + ResNeXt152_64x4d + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. 
Specific `ResNeXt152_64x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNeXt152_64x4d(**kwargs) + + return model + + def resnext50_32x4d(pretrained=False, **kwargs): + """ + ResNeXt50_32x4d + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ResNeXt50_32x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNeXt50_32x4d(**kwargs) + + return model + + def resnext50_64x4d(pretrained=False, **kwargs): + """ + ResNeXt50_64x4d + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ResNeXt50_64x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.ResNeXt50_64x4d(**kwargs) + + return model + + def darknet53(pretrained=False, **kwargs): + """ + DarkNet53 + Args: + pretrained: bool=False. If `True` load pretrained parameters, `False` otherwise. + kwargs: + class_dim: int=1000. Output dim of last fc layer. + Returns: + model: nn.Layer. Specific `ResNeXt50_64x4d` model depends on args. + """ + kwargs.update({'pretrained': pretrained}) + model = backbone.DarkNet53(**kwargs) + + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/__init__.py new file mode 100644 index 000000000..d6cdb6f8f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer + +from .arch import * +from .optimizer import * +from .data import * +from .utils import * diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/__init__.py new file mode 100644 index 000000000..2d5e29db8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/__init__.py @@ -0,0 +1,134 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import copy +import importlib + +import paddle.nn as nn +from paddle.jit import to_static +from paddle.static import InputSpec + +from . 
import backbone, gears +from .backbone import * +from .gears import build_gear +from .utils import * +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils import logger +from ppcls.utils.save_load import load_dygraph_pretrain +from ppcls.arch.slim import prune_model, quantize_model + +__all__ = ["build_model", "RecModel", "DistillationModel"] + + +def build_model(config): + arch_config = copy.deepcopy(config["Arch"]) + model_type = arch_config.pop("name") + mod = importlib.import_module(__name__) + arch = getattr(mod, model_type)(**arch_config) + if isinstance(arch, TheseusLayer): + prune_model(config, arch) + quantize_model(config, arch) + return arch + + +def apply_to_static(config, model): + support_to_static = config['Global'].get('to_static', False) + + if support_to_static: + specs = None + if 'image_shape' in config['Global']: + specs = [InputSpec([None] + config['Global']['image_shape'])] + model = to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format( + specs)) + return model + + +class RecModel(TheseusLayer): + def __init__(self, **config): + super().__init__() + backbone_config = config["Backbone"] + backbone_name = backbone_config.pop("name") + self.backbone = eval(backbone_name)(**backbone_config) + if "BackboneStopLayer" in config: + backbone_stop_layer = config["BackboneStopLayer"]["name"] + self.backbone.stop_after(backbone_stop_layer) + + if "Neck" in config: + self.neck = build_gear(config["Neck"]) + else: + self.neck = None + + if "Head" in config: + self.head = build_gear(config["Head"]) + else: + self.head = None + + def forward(self, x, label=None): + out = dict() + x = self.backbone(x) + out["backbone"] = x + if self.neck is not None: + x = self.neck(x) + out["neck"] = x + out["features"] = x + if self.head is not None: + y = self.head(x, label) + out["logits"] = y + return out + + +class DistillationModel(nn.Layer): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__() + assert isinstance(models, list) + self.model_list = [] + self.model_name_list = [] + if pretrained_list is not None: + assert len(pretrained_list) == len(models) + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] + model_config = model_config[key] + model_name = model_config.pop("name") + model = eval(model_name)(**model_config) + + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + if pretrained_list is not None: + for idx, pretrained in enumerate(pretrained_list): + if pretrained is not None: + load_dygraph_pretrain( + self.model_name_list[idx], path=pretrained) + + def forward(self, x, label=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + if label is None: + result_dict[model_name] = self.model_list[idx](x) + else: + result_dict[model_name] = self.model_list[idx](x, label) + return result_dict diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/__init__.py new file mode 100644 index 000000000..74d266414 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/__init__.py @@ 
-0,0 +1,34 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import inspect + +from ppcls.arch.backbone.legendary_models.resnet import ResNet50, ResNet50_vd +from ppcls.arch.backbone.model_zoo.resnet_vc import ResNet50_vc + +# help whl get all the models' api (class type) and components' api (func type) +def get_apis(): + current_func = sys._getframe().f_code.co_name + current_module = sys.modules[__name__] + api = [] + for _, obj in inspect.getmembers(current_module, + inspect.isclass) + inspect.getmembers( + current_module, inspect.isfunction): + api.append(obj.__name__) + api.remove(current_func) + return api + + +__all__ = get_apis() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/theseus_layer.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/theseus_layer.py new file mode 100644 index 000000000..908d94445 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/base/theseus_layer.py @@ -0,0 +1,301 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Tuple, List, Dict, Union, Callable, Any + +from paddle import nn +from ppcls.utils import logger + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class TheseusLayer(nn.Layer): + def __init__(self, *args, **kwargs): + super(TheseusLayer, self).__init__() + self.res_dict = {} + self.res_name = self.full_name() + self.pruner = None + self.quanter = None + + def _return_dict_hook(self, layer, input, output): + res_dict = {"output": output} + # 'list' is needed to avoid error raised by popping self.res_dict + for res_key in list(self.res_dict): + # clear the res_dict because the forward process may change according to input + res_dict[res_key] = self.res_dict.pop(res_key) + return res_dict + + def init_res(self, + stages_pattern, + return_patterns=None, + return_stages=None): + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." 
+ logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + self.update_res(return_patterns) + + def replace_sub(self, *args, **kwargs) -> None: + msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) + + def upgrade_sublayer(self, + layer_name_pattern: Union[str, List[str]], + handle_func: Callable[[nn.Layer, str], nn.Layer] + ) -> Dict[str, nn.Layer]: + """use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'. + + Args: + layer_name_pattern (Union[str, List[str]]): The name of layer to be modified by 'handle_func'. + handle_func (Callable[[nn.Layer, str], nn.Layer]): The function to modify target layer specified by 'layer_name_pattern'. The formal params are the layer(nn.Layer) and pattern(str) that is (a member of) layer_name_pattern (when layer_name_pattern is List type). And the return is the layer processed. + + Returns: + Dict[str, nn.Layer]: The key is the pattern and corresponding value is the result returned by 'handle_func()'. + + Examples: + + from paddle import nn + import paddleclas + + def rep_func(layer: nn.Layer, pattern: str): + new_layer = nn.Conv2D( + in_channels=layer._in_channels, + out_channels=layer._out_channels, + kernel_size=5, + padding=2 + ) + return new_layer + + net = paddleclas.MobileNetV1() + res = net.replace_sub(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func) + print(res) + # {'blocks[11].depthwise_conv.conv': the corresponding new_layer, 'blocks[12].depthwise_conv.conv': the corresponding new_layer} + """ + + if not isinstance(layer_name_pattern, list): + layer_name_pattern = [layer_name_pattern] + + hit_layer_pattern_list = [] + for pattern in layer_name_pattern: + # parse pattern to find target layer and its parent + layer_list = parse_pattern_str(pattern=pattern, parent_layer=self) + if not layer_list: + continue + sub_layer_parent = layer_list[-2]["layer"] if len( + layer_list) > 1 else self + + sub_layer = layer_list[-1]["layer"] + sub_layer_name = layer_list[-1]["name"] + sub_layer_index = layer_list[-1]["index"] + + new_sub_layer = handle_func(sub_layer, pattern) + + if sub_layer_index: + getattr(sub_layer_parent, + sub_layer_name)[sub_layer_index] = new_sub_layer + else: + setattr(sub_layer_parent, sub_layer_name, new_sub_layer) + + hit_layer_pattern_list.append(pattern) + return hit_layer_pattern_list + + def stop_after(self, stop_layer_name: str) -> bool: + """stop forward and backward after 'stop_layer_name'. + + Args: + stop_layer_name (str): The name of layer that stop forward and backward after this layer. + + Returns: + bool: 'True' if successful, 'False' otherwise. 
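+
+        Example (illustrative only; pattern strings use the same "blocks[i]" syntax as
+        MODEL_STAGES_PATTERN in the ResNet backbone):
+
+            net = ResNet50(pretrained=False)
+            net.stop_after("blocks[12]")  # sub-layers after blocks[12] are replaced by Identity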
+ """ + + layer_list = parse_pattern_str(stop_layer_name, self) + if not layer_list: + return False + + parent_layer = self + for layer_dict in layer_list: + name, index = layer_dict["name"], layer_dict["index"] + if not set_identity(parent_layer, name, index): + msg = f"Failed to set the layers that after stop_layer_name('{stop_layer_name}') to IdentityLayer. The error layer's name is '{name}'." + logger.warning(msg) + return False + parent_layer = layer_dict["layer"] + + return True + + def update_res( + self, + return_patterns: Union[str, List[str]]) -> Dict[str, nn.Layer]: + """update the result(s) to be returned. + + Args: + return_patterns (Union[str, List[str]]): The name of layer to return output. + + Returns: + Dict[str, nn.Layer]: The pattern(str) and corresponding layer(nn.Layer) that have been set successfully. + """ + + # clear res_dict that could have been set + self.res_dict = {} + + class Handler(object): + def __init__(self, res_dict): + # res_dict is a reference + self.res_dict = res_dict + + def __call__(self, layer, pattern): + layer.res_dict = self.res_dict + layer.res_name = pattern + if hasattr(layer, "hook_remove_helper"): + layer.hook_remove_helper.remove() + layer.hook_remove_helper = layer.register_forward_post_hook( + save_sub_res_hook) + return layer + + handle_func = Handler(self.res_dict) + + hit_layer_pattern_list = self.upgrade_sublayer( + return_patterns, handle_func=handle_func) + + if hasattr(self, "hook_remove_helper"): + self.hook_remove_helper.remove() + self.hook_remove_helper = self.register_forward_post_hook( + self._return_dict_hook) + + return hit_layer_pattern_list + + +def save_sub_res_hook(layer, input, output): + layer.res_dict[layer.res_name] = output + + +def set_identity(parent_layer: nn.Layer, + layer_name: str, + layer_index: str=None) -> bool: + """set the layer specified by layer_name and layer_index to Indentity. + + Args: + parent_layer (nn.Layer): The parent layer of target layer specified by layer_name and layer_index. + layer_name (str): The name of target layer to be set to Indentity. + layer_index (str, optional): The index of target layer to be set to Indentity in parent_layer. Defaults to None. + + Returns: + bool: True if successfully, False otherwise. + """ + + stop_after = False + for sub_layer_name in parent_layer._sub_layers: + if stop_after: + parent_layer._sub_layers[sub_layer_name] = Identity() + continue + if sub_layer_name == layer_name: + stop_after = True + + if layer_index and stop_after: + stop_after = False + for sub_layer_index in parent_layer._sub_layers[ + layer_name]._sub_layers: + if stop_after: + parent_layer._sub_layers[layer_name][ + sub_layer_index] = Identity() + continue + if layer_index == sub_layer_index: + stop_after = True + + return stop_after + + +def parse_pattern_str(pattern: str, parent_layer: nn.Layer) -> Union[ + None, List[Dict[str, Union[nn.Layer, str, None]]]]: + """parse the string type pattern. + + Args: + pattern (str): The pattern to discribe layer. + parent_layer (nn.Layer): The root layer relative to the pattern. + + Returns: + Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed. If successfully, the members are layers parsed in order: + [ + {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist}, + {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist}, + ... 
+ ] + """ + + pattern_list = pattern.split(".") + if not pattern_list: + msg = f"The pattern('{pattern}') is illegal. Please check and retry." + logger.warning(msg) + return None + + layer_list = [] + while len(pattern_list) > 0: + if '[' in pattern_list[0]: + target_layer_name = pattern_list[0].split('[')[0] + target_layer_index = pattern_list[0].split('[')[1].split(']')[0] + else: + target_layer_name = pattern_list[0] + target_layer_index = None + + target_layer = getattr(parent_layer, target_layer_name, None) + + if target_layer is None: + msg = f"Not found layer named('{target_layer_name}') specifed in pattern('{pattern}')." + logger.warning(msg) + return None + + if target_layer_index and target_layer: + if int(target_layer_index) < 0 or int(target_layer_index) >= len( + target_layer): + msg = f"Not found layer by index('{target_layer_index}') specifed in pattern('{pattern}'). The index should < {len(target_layer)} and > 0." + logger.warning(msg) + return None + + target_layer = target_layer[target_layer_index] + + layer_list.append({ + "layer": target_layer, + "name": target_layer_name, + "index": target_layer_index + }) + + pattern_list = pattern_list[1:] + parent_layer = target_layer + return layer_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/__init__.py new file mode 100644 index 000000000..550e5544c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/__init__.py @@ -0,0 +1,8 @@ +from .resnet import ResNet50 + +# from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd +# from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C +# from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1 +# from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25 +# from .inception_v3 import InceptionV3 +# from .vgg import VGG11, VGG13, VGG16, VGG19 \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/resnet.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/resnet.py new file mode 100644 index 000000000..74c5c5fa6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/legendary_models/resnet.py @@ -0,0 +1,591 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "ResNet18": ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"], + "ResNet34": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet50": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet101": ["blocks[2]", "blocks[6]", "blocks[29]", "blocks[32]"], + "ResNet152": ["blocks[2]", "blocks[10]", "blocks[46]", "blocks[49]"], + "ResNet200": ["blocks[2]", "blocks[14]", "blocks[62]", "blocks[65]"] +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
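+    For example, the "50" entry stacks 3, 4, 6 and 3 BottleneckBlocks across its four stages, which is the standard ResNet-50 layout.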
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) 
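+            # AdaptiveAvgPool2D(1) has reduced each feature map to 1x1 here,
+            # so the flatten + fc below produce the final class logits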
+ x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. 
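+
+        Example (illustrative; the local weight path is hypothetical):
+
+            model = ResNet50(pretrained=True)   # download the released ImageNet weights
+            model = ResNet50(pretrained="./ResNet50_pretrained.pdparams")  # or load a local .pdparams file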
+ """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["200"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet200"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/resnet_vc.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/resnet_vc.py new file mode 100644 index 000000000..6b972dc7b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/model_zoo/resnet_vc.py @@ -0,0 +1,309 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet50_vc": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vc_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + 
act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_vc(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(ResNet_vc, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + 
num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet50_vc(pretrained=False, use_ssld=False, **kwargs): + model = ResNet_vc(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNet50_vc"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/__init__.py new file mode 100644 index 000000000..ae9549246 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/__init__.py @@ -0,0 +1,3 @@ +from .resnet_variant import ResNet50_last_stage_stride1 +# from .vgg_variant import VGG19Sigmoid +# from .pp_lcnet_variant import PPLCNet_x2_5_Tanh \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/resnet_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/resnet_variant.py new file mode 100644 index 000000000..0219344b1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/backbone/variant_models/resnet_variant.py @@ -0,0 +1,23 @@ +from paddle.nn import Conv2D +from ppcls.arch.backbone.legendary_models.resnet import ResNet50, MODEL_URLS, _load_pretrained + +__all__ = ["ResNet50_last_stage_stride1"] + + +def ResNet50_last_stage_stride1(pretrained=False, use_ssld=False, **kwargs): + def replace_function(conv, pattern): + new_conv = Conv2D( + in_channels=conv._in_channels, + out_channels=conv._out_channels, + kernel_size=conv._kernel_size, + stride=1, + padding=conv._padding, + groups=conv._groups, + bias_attr=conv._bias_attr) + return new_conv + + pattern = ["blocks[13].conv1.conv", "blocks[13].short.conv"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/__init__.py new file mode 100644 index 000000000..75ca41d8a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .arcmargin import ArcMargin +from .cosmargin import CosMargin +from .circlemargin import CircleMargin +from .fc import FC +from .vehicle_neck import VehicleNeck + +__all__ = ['build_gear'] + + +def build_gear(config): + support_dict = [ + 'ArcMargin', 'CosMargin', 'CircleMargin', 'FC', 'VehicleNeck' + ] + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'head only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/arcmargin.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/arcmargin.py new file mode 100644 index 000000000..22cc76e1d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/arcmargin.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
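Stepping back to the backbone constructors added earlier in this hunk (ResNet50_vd and the other ResNet variants, the model-zoo ResNet50_vc, and the ResNet50_last_stage_stride1 variant), a minimal sketch of how they can be exercised, assuming the ppcls package added by this patch is on PYTHONPATH and PaddlePaddle is installed; class_num is forwarded through **kwargs to the backbone, and pretrained may be False, True, or a path to a local .pdparams file:

    import paddle
    from ppcls.arch.backbone.legendary_models.resnet import ResNet50_vd
    from ppcls.arch.backbone.model_zoo.resnet_vc import ResNet50_vc
    from ppcls.arch.backbone.variant_models import ResNet50_last_stage_stride1

    # sanity check: each variant maps a 224x224 batch to 1000 class logits
    x = paddle.randn([2, 3, 224, 224])
    for build in (ResNet50_vd, ResNet50_vc, ResNet50_last_stage_stride1):
        model = build(pretrained=False, class_num=1000)
        model.eval()
        print(build.__name__, model(x).shape)   # expected: [2, 1000]

With pretrained=True the weights are fetched from MODEL_URLS via load_dygraph_pretrain_from_url; with a string they are loaded from disk via load_dygraph_pretrain, as the _load_pretrained helpers above show.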
+ +import paddle +import paddle.nn as nn +import math + + +class ArcMargin(nn.Layer): + def __init__(self, + embedding_size, + class_num, + margin=0.5, + scale=80.0, + easy_margin=False): + super().__init__() + self.embedding_size = embedding_size + self.class_num = class_num + self.margin = margin + self.scale = scale + self.easy_margin = easy_margin + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label=None): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6) + cos_m = math.cos(self.margin) + sin_m = math.sin(self.margin) + phi = cos * cos_m - sin * sin_m + + th = math.cos(self.margin) * (-1) + mm = math.sin(self.margin) * self.margin + if self.easy_margin: + phi = self._paddle_where_more_than(cos, 0, phi, cos) + else: + phi = self._paddle_where_more_than(cos, th, phi, cos - mm) + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, phi) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output + + def _paddle_where_more_than(self, target, limit, x, y): + mask = paddle.cast(x=(target > limit), dtype='float32') + output = paddle.multiply(mask, x) + paddle.multiply((1.0 - mask), y) + return output diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/circlemargin.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/circlemargin.py new file mode 100644 index 000000000..d1bce83cb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/circlemargin.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
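A hedged sketch of how the build_gear factory and the ArcMargin head above fit together; the embedding size, class count, and margin/scale values below are illustrative and not taken from any config in this patch:

    import paddle
    from ppcls.arch.gears import build_gear

    # build_gear pops "name" and forwards the remaining keys to the constructor
    head = build_gear({
        "name": "ArcMargin",
        "embedding_size": 128,
        "class_num": 10,
        "margin": 0.5,
        "scale": 30.0,
    })

    feats = paddle.randn([4, 128])
    labels = paddle.randint(0, 10, shape=[4, 1])   # shape [N, 1]; squeezed inside forward

    train_logits = head(feats, labels)   # additive angular margin on the label class, then scaled
    head.eval()
    eval_logits = head(feats)            # no label / eval mode: plain cosine similarities
    print(train_logits.shape, eval_logits.shape)   # both [4, 10]

The same train/eval asymmetry holds for the CosMargin and CircleMargin heads that follow: when label is None or the layer is in eval mode, the raw cosine logits are returned unmodified.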
+ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CircleMargin(nn.Layer): + def __init__(self, embedding_size, class_num, margin, scale): + super(CircleMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + feat_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, feat_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + logits = paddle.matmul(input, weight) + if not self.training or label is None: + return logits + + alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.) + alpha_n = paddle.clip(logits.detach() + self.margin, min=0.) + delta_p = 1 - self.margin + delta_n = self.margin + + m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1]) + + logits_p = alpha_p * (logits - delta_p) + logits_n = alpha_n * (logits - delta_n) + pre_logits = logits_p * m_hot + logits_n * (1 - m_hot) + pre_logits = self.scale * pre_logits + + return pre_logits diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/cosmargin.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/cosmargin.py new file mode 100644 index 000000000..578b64c2b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/cosmargin.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
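To make the CircleMargin re-weighting above concrete, here is a small self-contained numeric sketch (plain Python, no ppcls import) that mirrors the alpha_p/alpha_n and delta_p/delta_n formulas from CircleMargin.forward; the margin and scale values are illustrative only:

    # mirrors CircleMargin.forward: within-class scores use (alpha_p, delta_p),
    # between-class scores use (alpha_n, delta_n)
    margin, scale = 0.25, 64.0

    def reweight(cos, is_target):
        if is_target:
            alpha = max(0.0, 1.0 + margin - cos)   # alpha_p
            delta = 1.0 - margin                   # delta_p
        else:
            alpha = max(0.0, cos + margin)         # alpha_n
            delta = margin                         # delta_n
        return scale * alpha * (cos - delta)

    print(reweight(0.9, True))    #  ~3.4: well-learned target, barely adjusted
    print(reweight(0.4, True))    # ~-19.0: hard target, strongly penalized
    print(reweight(0.8, False))   #  ~37.0: hard non-target kept large, so the loss focuses on it
    print(reweight(0.1, False))   #  ~-3.4: easy non-target, already suppressed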
+ +import paddle +import math +import paddle.nn as nn + + +class CosMargin(paddle.nn.Layer): + def __init__(self, embedding_size, class_num, margin=0.35, scale=64.0): + super(CosMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + label.stop_gradient = True + + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + + cos_m = cos - self.margin + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, cos_m) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/fc.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/fc.py new file mode 100644 index 000000000..b32474195 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/fc.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class FC(nn.Layer): + def __init__(self, embedding_size, class_num): + super(FC, self).__init__() + self.embedding_size = embedding_size + self.class_num = class_num + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal()) + self.fc = paddle.nn.Linear( + self.embedding_size, self.class_num, weight_attr=weight_attr) + + def forward(self, input, label=None): + out = self.fc(input) + return out diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/identity_head.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/identity_head.py new file mode 100644 index 000000000..7d11e5742 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/identity_head.py @@ -0,0 +1,9 @@ +from paddle import nn + + +class IdentityHead(nn.Layer): + def __init__(self): + super(IdentityHead, self).__init__() + + def forward(self, x, label=None): + return {"features": x, "logits": None} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/vehicle_neck.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/vehicle_neck.py new file mode 100644 index 000000000..05f4e333f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/gears/vehicle_neck.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn + + +class VehicleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + self.flatten = nn.Flatten() + + def forward(self, x): + x = self.conv(x) + x = self.flatten(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/__init__.py new file mode 100644 index 000000000..3733059ce --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppcls.arch.slim.prune import prune_model +from ppcls.arch.slim.quant import quantize_model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/prune.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/prune.py new file mode 100644 index 000000000..c0c9d220b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/prune.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ppcls.utils import logger + + +def prune_model(config, model): + if config.get("Slim", False) and config["Slim"].get("prune", False): + import paddleslim + prune_method_name = config["Slim"]["prune"]["name"].lower() + assert prune_method_name in [ + "fpgm", "l1_norm" + ], "The prune methods only support 'fpgm' and 'l1_norm'" + if prune_method_name == "fpgm": + model.pruner = paddleslim.dygraph.FPGMFilterPruner( + model, [1] + config["Global"]["image_shape"]) + else: + model.pruner = paddleslim.dygraph.L1NormFilterPruner( + model, [1] + config["Global"]["image_shape"]) + + # prune model + _prune_model(config, model) + else: + model.pruner = None + + + +def _prune_model(config, model): + from paddleslim.analysis import dygraph_flops as flops + logger.info("FLOPs before pruning: {}GFLOPs".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9)) + model.eval() + + params = [] + for sublayer in model.sublayers(): + for param in sublayer.parameters(include_sublayers=False): + if isinstance(sublayer, paddle.nn.Conv2D): + params.append(param.name) + ratios = {} + for param in params: + ratios[param] = config["Slim"]["prune"]["pruned_ratio"] + plan = model.pruner.prune_vars(ratios, [0]) + + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9, + plan.pruned_flops)) + + for param in model.parameters(): + if "conv2d" in param.name: + logger.info("{}\t{}".format(param.name, param.shape)) + + model.train() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/quant.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/quant.py new file mode 100644 index 000000000..b8f59a78f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/slim/quant.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ppcls.utils import logger + +QUANT_CONFIG = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. + 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], +} + + +def quantize_model(config, model): + if config.get("Slim", False) and config["Slim"].get("quant", False): + from paddleslim.dygraph.quant import QAT + assert config["Slim"]["quant"]["name"].lower( + ) == 'pact', 'Only PACT quantization method is supported now' + QUANT_CONFIG["activation_preprocess_type"] = "PACT" + model.quanter = QAT(config=QUANT_CONFIG) + model.quanter.quantize(model) + logger.info("QAT model summary:") + paddle.summary(model, (1, 3, 224, 224)) + else: + model.quanter = None + return diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/arch/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls/arch/utils.py new file mode 100644 index 000000000..308475d7d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/arch/utils.py @@ -0,0 +1,53 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import types +from difflib import SequenceMatcher + +from . 
import backbone + + +def get_architectures(): + """ + get all of model architectures + """ + names = [] + for k, v in backbone.__dict__.items(): + if isinstance(v, (types.FunctionType, six.class_types)): + names.append(k) + return names + + +def get_blacklist_model_in_static_mode(): + from ppcls.arch.backbone import distilled_vision_transformer + from ppcls.arch.backbone import vision_transformer + blacklist = distilled_vision_transformer.__all__ + vision_transformer.__all__ + return blacklist + + +def similar_architectures(name='', names=[], thresh=0.1, topk=10): + """ + inferred similar architectures + """ + scores = [] + for idx, n in enumerate(names): + if n.startswith('__'): + continue + score = SequenceMatcher(None, n.lower(), name.lower()).quick_ratio() + if score > thresh: + scores.append((idx, score)) + scores.sort(key=lambda x: x[1], reverse=True) + similar_names = [names[s[0]] for s in scores[:min(topk, len(scores))]] + return similar_names diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml new file mode 100644 index 000000000..ab4c29c30 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + 
class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml new file mode 100644 index 000000000..d75fede9e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml new file mode 100644 index 000000000..2fefb9f4b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: 
./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - CutmixOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml new file mode 100644 index 000000000..4bf530664 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - Cutout: + n_holes: 1 + length: 112 + + sampler: + name: 
DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml new file mode 100644 index 000000000..c0016aa00 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - GridMask: + d1: 96 + d2: 224 + rotate: 1 + ratio: 0.5 + mode: 0 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True 
+ channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml new file mode 100644 index 000000000..12e4ac8db --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - HideAndSeek: + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml new file mode 100644 index 000000000..3434cab5a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + 
pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml new file mode 100644 index 000000000..153451e13 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - 
DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - RandAugment: + num_layers: 2 + magnitude: 5 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml new file mode 100644 index 000000000..8e89c5ca1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50.yaml new file mode 100644 index 000000000..c2da23fb3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml new file mode 100644 index 000000000..2d6de3b21 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: False + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.875 + multi_precision: True + lr: + name: Cosine + warmup_epoch: 8 + learning_rate: 8.192 + regularizer: + name: 'L2' + coeff: 2.5e-05 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml new file 
mode 100644 index 000000000..be7b2d9db --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/ImageNet/ResNet/ResNet50_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/configs/quick_start/ResNet50_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls/configs/quick_start/ResNet50_vd.yaml new file mode 100644 index 000000000..30d745599 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/configs/quick_start/ResNet50_vd.yaml @@ -0,0 +1,107 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 5 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + 
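+# note: warmup_epoch (5) equals Global.epochs (5) above, so the cosine
+# learning rate is still warming up for the entire quick-start run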
+# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./data/datasets/flowers102/ + cls_label_path: ./data/datasets/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./data/datasets/flowers102/ + cls_label_path: ./data/datasets/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/engine.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/engine.py new file mode 100644 index 000000000..019cf1650 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/engine.py @@ -0,0 +1,468 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
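+
+# Engine builds every training component from the parsed YAML config
+# (dataloaders, model, loss, metrics, optimizer, AMP and distributed setup)
+# and exposes the train / eval / infer / export entry points.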
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import platform +import paddle +import paddle.distributed as dist +from visualdl import LogWriter +from paddle import nn +import numpy as np +import random + +from ppcls.utils.check import check_gpu +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.utils.config import print_config +from ppcls.data import build_dataloader +from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer +from ppcls.arch import apply_to_static +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url +from ppcls.utils.save_load import init_model +from ppcls.utils import save_load + +from ppcls.data.utils.get_image_list import get_image_list +from ppcls.data.postprocess import build_postprocess +from ppcls.data import create_operators +from ppcls.engine.train import train_epoch +from ppcls.engine import evaluation +from ppcls.arch.gears.identity_head import IdentityHead + + +class Engine(object): + def __init__(self, config, mode="train"): + assert mode in ["train", "eval", "infer", "export"] + self.mode = mode + self.config = config + self.eval_mode = self.config["Global"].get("eval_mode", + "classification") + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], + f"{mode}.log") + init_logger(log_file=log_file) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in ["classification", "retrieval"], logger.error( + "Invalid eval mode: {}".format(self.eval_mode)) + self.train_epoch_func = train_epoch + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = os.path.join(self.output_dir, "vdl") + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu", "mlu"] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # AMP training + self.amp = True if "AMP" in self.config and self.mode == "train" else False + if self.amp and self.config["AMP"] is not None: + self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) + self.use_dynamic_loss_scaling = self.config["AMP"].get( + "use_dynamic_loss_scaling", False) + else: + self.scale_loss = 1.0 + self.use_dynamic_loss_scaling = False + if self.amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_max_inplace_grad_add': 8, + } + if paddle.is_compiled_with_cuda(): + AMP_RELATED_FLAGS_SETTING.update({ + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + }) + 
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." + logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + # build dataloader + if self.mode == 'train': + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode == "classification": + self.eval_dataloader = build_dataloader( + self.config["DataLoader"], "Eval", self.device, + self.use_dali) + elif self.eval_mode == "retrieval": + self.gallery_query_dataloader = None + if len(self.config["DataLoader"]["Eval"].keys()) == 1: + key = list(self.config["DataLoader"]["Eval"].keys())[0] + self.gallery_query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], key, self.device, + self.use_dali) + else: + self.gallery_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Gallery", + self.device, self.use_dali) + self.query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Query", + self.device, self.use_dali) + + # build loss + if self.mode == "train": + loss_info = self.config["Loss"]["Train"] + self.train_loss_func = build_loss(loss_info) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + loss_config = self.config.get("Loss", None) + if loss_config is not None: + loss_config = loss_config.get("Eval") + if loss_config is not None: + self.eval_loss_func = build_loss(loss_config) + else: + self.eval_loss_func = None + else: + self.eval_loss_func = None + + # build metric + if self.mode == 'train': + metric_config = self.config.get("Metric") + if metric_config is not None: + metric_config = metric_config.get("Train") + if metric_config is not None: + if hasattr( + self.train_dataloader, "collate_fn" + ) and self.train_dataloader.collate_fn is not None: + for m_idx, m in enumerate(metric_config): + if "TopkAcc" in m: + msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed." 
+ logger.warning(msg) + break + metric_config.pop(m_idx) + self.train_metric_func = build_metrics(metric_config) + else: + self.train_metric_func = None + else: + self.train_metric_func = None + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + metric_config = self.config.get("Metric") + if self.eval_mode == "classification": + if metric_config is not None: + metric_config = metric_config.get("Eval") + if metric_config is not None: + self.eval_metric_func = build_metrics(metric_config) + elif self.eval_mode == "retrieval": + if metric_config is None: + metric_config = [{"name": "Recallk", "topk": (1, 5)}] + else: + metric_config = metric_config["Eval"] + self.eval_metric_func = build_metrics(metric_config) + else: + self.eval_metric_func = None + + # build model + self.model = build_model(self.config) + # set @to_static for benchmark, skip this by default. + apply_to_static(self.config, self.model) + + # load_pretrain + if self.config["Global"]["pretrained_model"] is not None: + if self.config["Global"]["pretrained_model"].startswith("http"): + load_dygraph_pretrain_from_url( + self.model, self.config["Global"]["pretrained_model"]) + else: + load_dygraph_pretrain( + self.model, self.config["Global"]["pretrained_model"]) + + # build optimizer + if self.mode == 'train': + self.optimizer, self.lr_sch = build_optimizer( + self.config["Optimizer"], self.config["Global"]["epochs"], + len(self.train_dataloader), [self.model]) + + # for amp training + if self.amp: + self.scaler = paddle.amp.GradScaler( + init_loss_scaling=self.scale_loss, + use_dynamic_loss_scaling=self.use_dynamic_loss_scaling) + amp_level = self.config['AMP'].get("level", "O1") + if amp_level not in ["O1", "O2"]: + msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." + logger.warning(msg) + self.config['AMP']["level"] = "O1" + amp_level = "O1" + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=amp_level, + save_dtype='float32') + + # for distributed + world_size = dist.get_world_size() + self.config["Global"]["distributed"] = world_size != 1 + if world_size != 4 and self.mode == "train": + msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train." 
+ logger.warning(msg) + if self.config["Global"]["distributed"]: + dist.init_parallel_env() + self.model = paddle.DataParallel(self.model) + + # build postprocess for infer + if self.mode == 'infer': + self.preprocess_func = create_operators(self.config["Infer"][ + "transforms"]) + self.postprocess_func = build_postprocess(self.config["Infer"][ + "PostProcess"]) + + def train(self): + assert self.mode == "train" + print_batch_step = self.config['Global']['print_batch_step'] + save_interval = self.config["Global"]["save_interval"] + best_metric = { + "metric": 0.0, + "epoch": 0, + } + # key: + # val: metrics list word + self.output_info = dict() + self.time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + # global iter counter + self.global_step = 0 + + if self.config["Global"]["checkpoints"] is not None: + metric_info = init_model(self.config["Global"], self.model, + self.optimizer) + if metric_info is not None: + best_metric.update(metric_info) + + self.max_iter = len(self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # for one epoch train + self.train_epoch_func(self, epoch_id, print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, self.output_info[key].avg) + for key in self.output_info + ]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0: + acc = self.eval(epoch_id) + if acc > best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id + save_load.save_model( + self.model, + self.optimizer, + best_metric, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="best_model") + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + self.model.train() + + # save model + if epoch_id % save_interval == 0: + save_load.save_model( + self.model, + self.optimizer, {"metric": acc, + "epoch": epoch_id}, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="epoch_{}".format(epoch_id)) + # save the latest model + save_load.save_model( + self.model, + self.optimizer, {"metric": acc, + "epoch": epoch_id}, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="latest") + + if self.vdl_writer is not None: + self.vdl_writer.close() + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + self.model.eval() + eval_result = self.eval_func(self, epoch_id) + self.model.train() + return eval_result + + @paddle.no_grad() + def infer(self): + assert self.mode == "infer" and self.eval_mode == "classification" + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + image_list = get_image_list(self.config["Infer"]["infer_imgs"]) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = 
f.read() + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + out = self.model(batch_tensor) + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + result = self.postprocess_func(out, image_file_list) + print(result) + batch_data.clear() + image_file_list.clear() + + def export(self): + assert self.mode == "export" + use_multilabel = self.config["Global"].get("use_multilabel", False) + model = ExportModel(self.config["Arch"], self.model, use_multilabel) + if self.config["Global"]["pretrained_model"] is not None: + load_dygraph_pretrain(model.base_model, + self.config["Global"]["pretrained_model"]) + + model.eval() + save_path = os.path.join(self.config["Global"]["save_inference_dir"], + "inference") + if model.quanter: + model.quanter.save_quantized_model( + model.base_model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + else: + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + paddle.jit.save(model, save_path) + + +class ExportModel(TheseusLayer): + """ + ExportModel: add softmax onto the model + """ + + def __init__(self, config, model, use_multilabel): + super().__init__() + self.base_model = model + # we should choose a final model to export + if isinstance(self.base_model, DistillationModel): + self.infer_model_name = config["infer_model_name"] + else: + self.infer_model_name = None + + self.infer_output_key = config.get("infer_output_key", None) + if self.infer_output_key == "features" and isinstance(self.base_model, + RecModel): + self.base_model.head = IdentityHead() + if use_multilabel: + self.out_act = nn.Sigmoid() + else: + if config.get("infer_add_softmax", True): + self.out_act = nn.Softmax(axis=-1) + else: + self.out_act = None + + def eval(self): + self.training = False + for layer in self.sublayers(): + layer.training = False + layer.eval() + + def forward(self, x): + x = self.base_model(x) + if isinstance(x, list): + x = x[0] + if self.infer_model_name is not None: + x = x[self.infer_model_name] + if self.infer_output_key is not None: + x = x[self.infer_output_key] + if self.out_act is not None: + x = self.out_act(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/__init__.py new file mode 100644 index 000000000..e0cd77888 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
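+# Evaluation entry points; Engine resolves the right one at runtime via
+# getattr(evaluation, eval_mode + "_eval") for the "classification" and
+# "retrieval" modes.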
+ +from ppcls.engine.evaluation.classification import classification_eval +from ppcls.engine.evaluation.retrieval import retrieval_eval diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/classification.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/classification.py new file mode 100644 index 000000000..79fb1d692 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/classification.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import platform +import paddle + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def classification_eval(engine, epoch_id=0): + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + metric_key = None + tic = time.time() + accum_samples = 0 + total_samples = len( + engine.eval_dataloader. + dataset) if not engine.use_dali else engine.eval_dataloader.size + max_iter = len(engine.eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(engine.eval_dataloader) + for iter_id, batch in enumerate(engine.eval_dataloader): + if iter_id >= max_iter: + break + if iter_id == 5: + for key in time_info: + time_info[key].reset() + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + time_info["reader_cost"].update(time.time() - tic) + batch_size = batch[0].shape[0] + batch[0] = paddle.to_tensor(batch[0]).astype("float32") + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + + # image input + if engine.amp: + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = engine.model(batch[0]) + else: + out = engine.model(batch[0]) + + # just for DistributedBatchSampler issue: repeat sampling + current_samples = batch_size * paddle.distributed.get_world_size() + accum_samples += current_samples + + # gather Tensor when distributed + if paddle.distributed.get_world_size() > 1: + label_list = [] + paddle.distributed.all_gather(label_list, batch[1]) + labels = paddle.concat(label_list, 0) + + if isinstance(out, dict): + if "Student" in out: + out = out["Student"] + if isinstance(out, dict): + out = out["logits"] + elif "logits" in out: + out = out["logits"] + else: + msg = "Error: Wrong key in out!" 
+ raise Exception(msg) + if isinstance(out, list): + preds = [] + for x in out: + pred_list = [] + paddle.distributed.all_gather(pred_list, x) + pred_x = paddle.concat(pred_list, 0) + preds.append(pred_x) + else: + pred_list = [] + paddle.distributed.all_gather(pred_list, out) + preds = paddle.concat(pred_list, 0) + + if accum_samples > total_samples and not engine.use_dali: + preds = preds[:total_samples + current_samples - accum_samples] + labels = labels[:total_samples + current_samples - + accum_samples] + current_samples = total_samples + current_samples - accum_samples + else: + labels = batch[1] + preds = out + + # calc loss + if engine.eval_loss_func is not None: + if engine.amp and engine.config["AMP"].get("use_fp16_test", False): + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + loss_dict = engine.eval_loss_func(preds, labels) + else: + loss_dict = engine.eval_loss_func(preds, labels) + + for key in loss_dict: + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + output_info[key].update(loss_dict[key].numpy()[0], + current_samples) + # calc metric + if engine.eval_metric_func is not None: + metric_dict = engine.eval_metric_func(preds, labels) + for key in metric_dict: + if metric_key is None: + metric_key = key + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + + output_info[key].update(metric_dict[key].numpy()[0], + current_samples) + + time_info["batch_cost"].update(time.time() - tic) + + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + if engine.use_dali: + engine.eval_dataloader.reset() + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return output_info[metric_key].avg diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/retrieval.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/retrieval.py new file mode 100644 index 000000000..b481efae1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/evaluation/retrieval.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
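+
+# Retrieval evaluation: cal_feature() extracts gallery/query features, the
+# similarity matrix is computed block by block, and the configured metrics
+# (e.g. Recallk, mAP) are accumulated over all query blocks.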
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import platform +import paddle +from ppcls.utils import logger + + +def retrieval_eval(engine, epoch_id=0): + engine.model.eval() + # step1. build gallery + if engine.gallery_query_dataloader is not None: + gallery_feas, gallery_img_id, gallery_unique_id = cal_feature( + engine, name='gallery_query') + query_feas, query_img_id, query_query_id = gallery_feas, gallery_img_id, gallery_unique_id + else: + gallery_feas, gallery_img_id, gallery_unique_id = cal_feature( + engine, name='gallery') + query_feas, query_img_id, query_query_id = cal_feature( + engine, name='query') + + # step2. do evaluation + sim_block_size = engine.config["Global"].get("sim_block_size", 64) + sections = [sim_block_size] * (len(query_feas) // sim_block_size) + if len(query_feas) % sim_block_size: + sections.append(len(query_feas) % sim_block_size) + fea_blocks = paddle.split(query_feas, num_or_sections=sections) + if query_query_id is not None: + query_id_blocks = paddle.split( + query_query_id, num_or_sections=sections) + image_id_blocks = paddle.split(query_img_id, num_or_sections=sections) + metric_key = None + + if engine.eval_loss_func is None: + metric_dict = {metric_key: 0.} + else: + metric_dict = dict() + for block_idx, block_fea in enumerate(fea_blocks): + similarity_matrix = paddle.matmul( + block_fea, gallery_feas, transpose_y=True) + if query_query_id is not None: + query_id_block = query_id_blocks[block_idx] + query_id_mask = (query_id_block != gallery_unique_id.t()) + + image_id_block = image_id_blocks[block_idx] + image_id_mask = (image_id_block != gallery_img_id.t()) + + keep_mask = paddle.logical_or(query_id_mask, image_id_mask) + similarity_matrix = similarity_matrix * keep_mask.astype( + "float32") + else: + keep_mask = None + + metric_tmp = engine.eval_metric_func(similarity_matrix, + image_id_blocks[block_idx], + gallery_img_id, keep_mask) + + for key in metric_tmp: + if key not in metric_dict: + metric_dict[key] = metric_tmp[key] * block_fea.shape[ + 0] / len(query_feas) + else: + metric_dict[key] += metric_tmp[key] * block_fea.shape[ + 0] / len(query_feas) + + metric_info_list = [] + for key in metric_dict: + if metric_key is None: + metric_key = key + metric_info_list.append("{}: {:.5f}".format(key, metric_dict[key])) + metric_msg = ", ".join(metric_info_list) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + return metric_dict[metric_key] + + +def cal_feature(engine, name='gallery'): + has_unique_id = False + all_unique_id = None + + if name == 'gallery': + dataloader = engine.gallery_dataloader + elif name == 'query': + dataloader = engine.query_dataloader + elif name == 'gallery_query': + dataloader = engine.gallery_query_dataloader + else: + raise RuntimeError("Only support gallery or query dataset") + + batch_feas_list = [] + img_id_list = [] + unique_id_list = [] + max_iter = len(dataloader) - 1 if platform.system() == "Windows" else len( + dataloader) + for idx, batch in enumerate(dataloader): # load is very time-consuming + if idx >= max_iter: + break + if idx % engine.config["Global"]["print_batch_step"] == 0: + logger.info( + f"{name} feature calculation process: [{idx}/{len(dataloader)}]" + ) + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + batch = [paddle.to_tensor(x) for x in batch] + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + if len(batch) == 3: + has_unique_id = 
True + batch[2] = batch[2].reshape([-1, 1]).astype("int64") + out = engine.model(batch[0], batch[1]) + if "Student" in out: + out = out["Student"] + batch_feas = out["features"] + + # do norm + if engine.config["Global"].get("feature_normalize", True): + feas_norm = paddle.sqrt( + paddle.sum(paddle.square(batch_feas), axis=1, keepdim=True)) + batch_feas = paddle.divide(batch_feas, feas_norm) + + # do binarize + if engine.config["Global"].get("feature_binarize") == "round": + batch_feas = paddle.round(batch_feas).astype("float32") * 2.0 - 1.0 + + if engine.config["Global"].get("feature_binarize") == "sign": + batch_feas = paddle.sign(batch_feas).astype("float32") + + if paddle.distributed.get_world_size() > 1: + batch_feas_gather = [] + img_id_gather = [] + unique_id_gather = [] + paddle.distributed.all_gather(batch_feas_gather, batch_feas) + paddle.distributed.all_gather(img_id_gather, batch[1]) + batch_feas_list.append(paddle.concat(batch_feas_gather)) + img_id_list.append(paddle.concat(img_id_gather)) + if has_unique_id: + paddle.distributed.all_gather(unique_id_gather, batch[2]) + unique_id_list.append(paddle.concat(unique_id_gather)) + else: + batch_feas_list.append(batch_feas) + img_id_list.append(batch[1]) + if has_unique_id: + unique_id_list.append(batch[2]) + + if engine.use_dali: + dataloader.reset() + + all_feas = paddle.concat(batch_feas_list) + all_img_id = paddle.concat(img_id_list) + if has_unique_id: + all_unique_id = paddle.concat(unique_id_list) + + # just for DistributedBatchSampler issue: repeat sampling + total_samples = len( + dataloader.dataset) if not engine.use_dali else dataloader.size + all_feas = all_feas[:total_samples] + all_img_id = all_img_id[:total_samples] + if has_unique_id: + all_unique_id = all_unique_id[:total_samples] + + logger.info("Build {} done, all feat shape: {}, begin to eval..".format( + name, all_feas.shape)) + return all_feas, all_img_id, all_unique_id diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/__init__.py new file mode 100644 index 000000000..800d3a41e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ppcls.engine.train.train import train_epoch diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/train.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/train.py new file mode 100644 index 000000000..b15c1088a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/train.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info +from ppcls.utils import profiler + + +def train_epoch(engine, epoch_id, print_batch_step): + tic = time.time() + for iter_id, batch in enumerate(engine.train_dataloader): + if iter_id >= engine.max_iter: + break + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + batch_size = batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([batch_size, -1]) + engine.global_step += 1 + + # image input + if engine.amp: + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + else: + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + + # step opt and lr + if engine.amp: + scaled = engine.scaler.scale(loss_dict["loss"]) + scaled.backward() + engine.scaler.minimize(engine.optimizer, scaled) + else: + loss_dict["loss"].backward() + engine.optimizer.step() + engine.optimizer.clear_grad() + engine.lr_sch.step() + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + +def forward(engine, batch): + if not engine.is_rec: + return engine.model(batch[0]) + else: + return engine.model(batch[0], batch[1]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/utils.py new file mode 100644 index 000000000..92eb35d75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/engine/train/utils.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
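+
+# Logging helpers for the training loop: update_metric / update_loss keep
+# running AverageMeter values, and log_info prints lr, metrics, ips and ETA.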
+from __future__ import absolute_import, division, print_function + +import datetime +from ppcls.utils import logger +from ppcls.utils.misc import AverageMeter + + +def update_metric(trainer, out, batch, batch_size): + # calc metric + if trainer.train_metric_func is not None: + metric_dict = trainer.train_metric_func(out, batch[-1]) + for key in metric_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(metric_dict[key].numpy()[0], + batch_size) + + +def update_loss(trainer, loss_dict, batch_size): + # update_output_info + for key in loss_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(loss_dict[key].numpy()[0], batch_size) + + +def log_info(trainer, batch_size, epoch_id, iter_id): + lr_msg = "lr: {:.5f}".format(trainer.lr_sch.get_lr()) + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, trainer.output_info[key].avg) + for key in trainer.output_info + ]) + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, trainer.time_info[key].avg) + for key in trainer.time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / trainer.time_info["batch_cost"].avg) + eta_sec = ((trainer.config["Global"]["epochs"] - epoch_id + 1 + ) * len(trainer.train_dataloader) - iter_id + ) * trainer.time_info["batch_cost"].avg + eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec)))) + logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format( + epoch_id, trainer.config["Global"]["epochs"], iter_id, + len(trainer.train_dataloader), lr_msg, metric_msg, time_msg, ips_msg, + eta_msg)) + + logger.scaler( + name="lr", + value=trainer.lr_sch.get_lr(), + step=trainer.global_step, + writer=trainer.vdl_writer) + for key in trainer.output_info: + logger.scaler( + name="train_{}".format(key), + value=trainer.output_info[key].avg, + step=trainer.global_step, + writer=trainer.vdl_writer) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/loss/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/loss/__init__.py new file mode 100644 index 000000000..7c50ff76f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/loss/__init__.py @@ -0,0 +1,47 @@ +import copy + +import paddle +import paddle.nn as nn +from ppcls.utils import logger + +from .celoss import CELoss, MixCELoss +from .distanceloss import DistanceLoss + +class CombinedLoss(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.loss_func = [] + self.loss_weight = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + name = list(config)[0] + param = config[name] + assert "weight" in param, "weight must be in param, but param just contains {}".format( + param.keys()) + self.loss_weight.append(param.pop("weight")) + self.loss_func.append(eval(name)(**param)) + + def __call__(self, input, batch): + loss_dict = {} + # just for accelerate classification traing speed + if len(self.loss_func) == 1: + loss = self.loss_func[0](input, batch) + loss_dict.update(loss) + loss_dict["loss"] = list(loss.values())[0] + else: + for idx, loss_func in enumerate(self.loss_func): + loss = loss_func(input, batch) + weight = self.loss_weight[idx] + loss = {key: loss[key] * weight for key in loss} + loss_dict.update(loss) + loss_dict["loss"] = paddle.add_n(list(loss_dict.values())) + return loss_dict + + 
+def build_loss(config): + module_class = CombinedLoss(copy.deepcopy(config)) + logger.debug("build loss {} success.".format(module_class)) + return module_class diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/loss/celoss.py b/cv/classification/resnet50/paddlepaddle/ppcls/loss/celoss.py new file mode 100644 index 000000000..a78926170 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/loss/celoss.py @@ -0,0 +1,67 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.utils import logger + + +class CELoss(nn.Layer): + """ + Cross entropy loss + """ + + def __init__(self, epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + + def _labelsmoothing(self, target, class_num): + if len(target.shape) == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + if self.epsilon is not None: + class_num = x.shape[-1] + label = self._labelsmoothing(label, class_num) + x = -F.log_softmax(x, axis=-1) + loss = paddle.sum(x * label, axis=-1) + else: + if label.shape[-1] == x.shape[-1]: + label = F.softmax(label, axis=-1) + soft_label = True + else: + soft_label = False + loss = F.cross_entropy(x, label=label, soft_label=soft_label) + loss = loss.mean() + return {"CELoss": loss} + + +class MixCELoss(object): + def __init__(self, *args, **kwargs): + msg = "\"MixCELos\" is deprecated, please use \"CELoss\" instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/loss/comfunc.py b/cv/classification/resnet50/paddlepaddle/ppcls/loss/comfunc.py new file mode 100644 index 000000000..277bdd6b5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/loss/comfunc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
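+# rerange_index builds, for each anchor in a batch, an index order that puts
+# the anchor itself first, then the other samples of its class, then all
+# remaining (negative) samples.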
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def rerange_index(batch_size, samples_each_class): + tmp = np.arange(0, batch_size * batch_size) + tmp = tmp.reshape(-1, batch_size) + rerange_index = [] + + for i in range(batch_size): + step = i // samples_each_class + start = step * samples_each_class + end = (step + 1) * samples_each_class + + pos_idx = [] + neg_idx = [] + for j, k in enumerate(tmp[i]): + if j >= start and j < end: + if j == i: + pos_idx.insert(0, k) + else: + pos_idx.append(k) + else: + neg_idx.append(k) + rerange_index += (pos_idx + neg_idx) + + rerange_index = np.array(rerange_index).astype(np.int32) + return rerange_index diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/loss/distanceloss.py b/cv/classification/resnet50/paddlepaddle/ppcls/loss/distanceloss.py new file mode 100644 index 000000000..0a09f0cb2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/loss/distanceloss.py @@ -0,0 +1,43 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + self.mode = mode + + def forward(self, x, y): + loss = self.loss_func(x, y) + return {"loss_{}".format(self.mode): loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/metric/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/metric/__init__.py new file mode 100644 index 000000000..94721235b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/metric/__init__.py @@ -0,0 +1,51 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
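+# CombinedMetrics instantiates every metric listed in the config and merges
+# their results into a single ordered dict; build_metrics is the factory
+# called by Engine.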
+ +from paddle import nn +import copy +from collections import OrderedDict + +from .metrics import TopkAcc, mAP, mINP, Recallk, Precisionk +from .metrics import DistillationTopkAcc +from .metrics import GoogLeNetTopkAcc +from .metrics import HammingDistance, AccuracyScore + + +class CombinedMetrics(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.metric_func_list = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + metric_name = list(config)[0] + metric_params = config[metric_name] + if metric_params is not None: + self.metric_func_list.append( + eval(metric_name)(**metric_params)) + else: + self.metric_func_list.append(eval(metric_name)()) + + def __call__(self, *args, **kwargs): + metric_dict = OrderedDict() + for idx, metric_func in enumerate(self.metric_func_list): + metric_dict.update(metric_func(*args, **kwargs)) + return metric_dict + + +def build_metrics(config): + metrics_list = CombinedMetrics(copy.deepcopy(config)) + return metrics_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/metric/metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls/metric/metrics.py new file mode 100644 index 000000000..03e742082 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/metric/metrics.py @@ -0,0 +1,306 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
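+# Metric implementations: TopkAcc for classification, mAP / mINP / Recallk /
+# Precisionk for retrieval (computed from a query-gallery similarity matrix),
+# and HammingDistance / AccuracyScore for multi-label outputs (scikit-learn based).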
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.preprocessing import binarize + + +class TopkAcc(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + + metric_dict = dict() + for k in self.topk: + metric_dict["top{}".format(k)] = paddle.metric.accuracy( + x, label, k=k) + return metric_dict + + +class mAP(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + div = paddle.arange(acc_sum.shape[1]).astype("float32") + 1 + precision = paddle.divide(acc_sum, div) + + #calc map + precision_mask = paddle.multiply(equal_flag, precision) + ap = paddle.sum(precision_mask, axis=1) / paddle.sum(equal_flag, + axis=1) + metric_dict["mAP"] = paddle.mean(ap).numpy()[0] + return metric_dict + + +class mINP(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + #do accumulative sum + div = paddle.arange(equal_flag.shape[1]).astype("float32") + 2 + minus = 
paddle.divide(equal_flag, div) + auxilary = paddle.subtract(equal_flag, minus) + hard_index = paddle.argmax(auxilary, axis=1).astype("float32") + all_INP = paddle.divide(paddle.sum(equal_flag, axis=1), hard_index) + mINP = paddle.mean(all_INP) + metric_dict["mINP"] = mINP.numpy()[0] + return metric_dict + + +class Recallk(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + real_query_num = paddle.sum(equal_flag, axis=1) + real_query_num = paddle.sum( + paddle.greater_than(real_query_num, paddle.to_tensor(0.)).astype( + "float32")) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + mask = paddle.greater_than(acc_sum, + paddle.to_tensor(0.)).astype("float32") + all_cmc = (paddle.sum(mask, axis=0) / real_query_num).numpy() + + for k in self.topk: + metric_dict["recall{}".format(k)] = all_cmc[k - 1] + return metric_dict + + +class Precisionk(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + Ns = paddle.arange(gallery_img_id.shape[0]) + 1 + equal_flag_cumsum = paddle.cumsum(equal_flag, axis=1) + Precision_at_k = (paddle.mean(equal_flag_cumsum, axis=0) / Ns).numpy() + + for k in self.topk: + metric_dict["precision@{}".format(k)] = Precision_at_k[k - 1] + + return metric_dict + + +class DistillationTopkAcc(TopkAcc): + def __init__(self, model_key, feature_key=None, topk=(1, 5)): + super().__init__(topk=topk) + self.model_key = model_key + self.feature_key = feature_key + + def forward(self, x, label): + if isinstance(x, dict): + x = x[self.model_key] + if self.feature_key is not None: + x = x[self.feature_key] + return super().forward(x, label) + + +class GoogLeNetTopkAcc(TopkAcc): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, 
(int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + return super().forward(x[0], label) + + +class MutiLabelMetric(object): + def __init__(self): + pass + + def _multi_hot_encode(self, logits, threshold=0.5): + return binarize(logits, threshold=threshold) + + def __call__(self, output): + output = F.sigmoid(output) + preds = self._multi_hot_encode(logits=output.numpy(), threshold=0.5) + return preds + + +class HammingDistance(MutiLabelMetric): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + def __init__(self): + super().__init__() + + def __call__(self, output, target): + preds = super().__call__(output) + metric_dict = dict() + metric_dict["HammingDistance"] = paddle.to_tensor( + hamming_loss(target, preds)) + return metric_dict + + +class AccuracyScore(MutiLabelMetric): + """ + Hard metric for multilabel classification + Args: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. + Returns: + accuracy: + """ + + def __init__(self, base="label"): + super().__init__() + assert base in ["sample", "label" + ], 'must be one of ["sample", "label"]' + self.base = base + + def __call__(self, output, target): + preds = super().__call__(output) + metric_dict = dict() + if self.base == "sample": + accuracy = accuracy_metric(target, preds) + elif self.base == "label": + mcm = multilabel_confusion_matrix(target, preds) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + accuracy = (sum(tps) + sum(tns)) / ( + sum(tps) + sum(tns) + sum(fns) + sum(fps)) + metric_dict["AccuracyScore"] = paddle.to_tensor(accuracy) + return metric_dict diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/__init__.py new file mode 100644 index 000000000..61db39f89 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/__init__.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import paddle + +from ppcls.utils import logger + +from . import optimizer + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from . 
import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + if 'name' in lr_config: + lr_name = lr_config.pop('name') + lr = getattr(learning_rate, lr_name)(**lr_config) + if isinstance(lr, paddle.optimizer.lr.LRScheduler): + return lr + else: + return lr() + else: + lr = lr_config['learning_rate'] + return lr + + +# model_list is None in static graph +def build_optimizer(config, epochs, step_each_epoch, model_list=None): + config = copy.deepcopy(config) + # step1 build lr + lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch) + logger.debug("build lr ({}) success..".format(lr)) + # step2 build regularization + if 'regularizer' in config and config['regularizer'] is not None: + if 'weight_decay' in config: + logger.warning( + "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored." + ) + reg_config = config.pop('regularizer') + reg_name = reg_config.pop('name') + 'Decay' + reg = getattr(paddle.regularizer, reg_name)(**reg_config) + config["weight_decay"] = reg + logger.debug("build regularizer ({}) success..".format(reg)) + # step3 build optimizer + optim_name = config.pop('name') + if 'clip_norm' in config: + clip_norm = config.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim = getattr(optimizer, optim_name)(learning_rate=lr, + grad_clip=grad_clip, + **config)(model_list=model_list) + logger.debug("build optimizer ({}) success..".format(optim)) + return optim, lr diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/learning_rate.py b/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/learning_rate.py new file mode 100644 index 000000000..b59387dd9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/learning_rate.py @@ -0,0 +1,326 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +from paddle.optimizer import lr +from paddle.optimizer.lr import LRScheduler + +from ppcls.utils import logger + + +class Linear(object): + """ + Linear learning rate decay + Args: + lr (float): The initial learning rate. It is a python float number. + epochs(int): The decay step size. It determines the decay cycle. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + learning_rate, + epochs, + step_each_epoch, + end_lr=0.0, + power=1.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.steps = (epochs - warmup_epoch) * step_each_epoch + self.end_lr = end_lr + self.power = power + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.steps, + end_lr=self.end_lr, + power=self.power, + last_epoch=self. + last_epoch) if self.steps > 0 else self.learning_rate + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Cosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + eta_min(float): Minimum learning rate. Default: 0.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + eta_min=0.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.T_max = (epochs - warmup_epoch) * step_each_epoch + self.eta_min = eta_min + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + eta_min=self.eta_min, + last_epoch=self. + last_epoch) if self.T_max > 0 else self.learning_rate + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Step(object): + """ + Piecewise learning rate decay + Args: + step_each_epoch(int): steps each epoch + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. 
Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_size, + step_each_epoch, + epochs, + gamma, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.step_size = step_each_epoch * step_size + self.learning_rate = learning_rate + self.gamma = gamma + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Piecewise(object): + """ + Piecewise learning rate decay + Args: + boundaries(list): A list of steps numbers. The type of element in the list is python int. + values(list): A list of learning rate values that will be picked during different epoch boundaries. + The type of element in the list is python float. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + step_each_epoch, + decay_epochs, + values, + epochs, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.boundaries = [step_each_epoch * e for e in decay_epochs] + self.values = values + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries, + values=self.values, + last_epoch=self.last_epoch) + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.values[0], + last_epoch=self.last_epoch) + return learning_rate + + +class MultiStepDecay(LRScheduler): + """ + Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. + The algorithm can be described as the code below. + .. code-block:: text + learning_rate = 0.5 + milestones = [30, 50] + gamma = 0.1 + if epoch < 30: + learning_rate = 0.5 + elif epoch < 50: + learning_rate = 0.05 + else: + learning_rate = 0.005 + Args: + learning_rate (float): The initial learning rate. It is a python float number. + milestones (tuple|list): List or tuple of each boundaries. Must be increasing. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. 
+ last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``MultiStepDecay`` instance to schedule learning rate. + Examples: + + .. code-block:: python + import paddle + import numpy as np + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(5): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + # train on static graph mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(5): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + """ + + def __init__(self, + learning_rate, + milestones, + epochs, + step_each_epoch, + gamma=0.1, + last_epoch=-1, + verbose=False): + if not isinstance(milestones, (tuple, list)): + raise TypeError( + "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." + % type(milestones)) + if not all([ + milestones[i] < milestones[i + 1] + for i in range(len(milestones) - 1) + ]): + raise ValueError('The elements of milestones must be incremented') + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + self.milestones = [x * step_each_epoch for x in milestones] + self.gamma = gamma + super().__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + for i in range(len(self.milestones)): + if self.last_epoch < self.milestones[i]: + return self.base_lr * (self.gamma**i) + return self.base_lr * (self.gamma**len(self.milestones)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/optimizer.py b/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/optimizer.py new file mode 100644 index 000000000..4422ea70d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/optimizer/optimizer.py @@ -0,0 +1,217 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import optimizer as optim +import paddle + +from ppcls.utils import logger + + +class Momentum(object): + """ + Simple Momentum optimizer with velocity state. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + multi_precision=True): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters) + if hasattr(opt, '_use_multi_tensor'): + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters, + use_multi_tensor=True) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + multi_precision=False): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + multi_precision=self.multi_precision, + parameters=parameters) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. 
+ """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + multi_precision=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=parameters) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + + # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work. + if model_list is None: + if self.one_dim_param_no_weight_decay or len( + self.no_weight_decay_name_list) != 0: + msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph." + logger.error(Exception(msg)) + raise Exception(msg) + + self.no_weight_decay_param_name_list = [ + p.name for model in model_list for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] if model_list else [] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name for model in model_list + for n, p in model.named_parameters() if len(p.shape) == 1 + ] if model_list else [] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/static/program.py b/cv/classification/resnet50/paddlepaddle/ppcls/static/program.py new file mode 100644 index 000000000..b3534a2cf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/static/program.py @@ -0,0 +1,449 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import numpy as np + +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F + +from paddle.distributed import fleet +from paddle.distributed.fleet import DistributedStrategy + +# from ppcls.optimizer import OptimizerBuilder +# from ppcls.optimizer.learning_rate import LearningRateBuilder + +from ppcls.arch import build_model +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.optimizer import build_lr_scheduler + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger, profiler + + +def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"): + """ + Create feeds as model input + + Args: + image_shape(list[int]): model input shape, such as [3, 224, 224] + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + class_num(int): the class number of network, required if use_mix + + Returns: + feeds(dict): dict of model input variables + """ + feeds = OrderedDict() + feeds['data'] = paddle.static.data( + name="data", shape=[None] + image_shape, dtype=dtype) + + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." + logger.error(msg) + raise Exception(msg) + feeds['target'] = paddle.static.data( + name="target", shape=[None, class_num], dtype="float32") + else: + feeds['label'] = paddle.static.data( + name="label", shape=[None, 1], dtype="int64") + + return feeds + + +def create_fetchs(out, + feeds, + architecture, + topk=5, + epsilon=None, + class_num=None, + use_mix=False, + config=None, + mode="Train"): + """ + Create fetchs as model outputs(included loss and measures), + will call create_loss and create_metric(if use_mix). + Args: + out(variable): model output variable + feeds(dict): dict of model input variables. + If use mix_up, it will not include label. + architecture(dict): architecture information, + name(such as ResNet50) is needed + topk(int): usually top5 + epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0 + class_num(int): the class number of network, required if use_mix + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + config(dict): model config + + Returns: + fetchs(dict): dict of model outputs(included loss and measures) + """ + fetchs = OrderedDict() + # build loss + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." 
+ logger.error(msg) + raise Exception(msg) + target = paddle.reshape(feeds['target'], [-1, class_num]) + else: + target = paddle.reshape(feeds['label'], [-1, 1]) + + loss_func = build_loss(config["Loss"][mode]) + loss_dict = loss_func(out, target) + + loss_out = loss_dict["loss"] + fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True)) + + # build metric + if not use_mix: + metric_func = build_metrics(config["Metric"][mode]) + + metric_dict = metric_func(out, target) + + for key in metric_dict: + if mode != "Train" and paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce( + metric_dict[key], op=paddle.distributed.ReduceOp.SUM) + metric_dict[key] = metric_dict[ + key] / paddle.distributed.get_world_size() + + fetchs[key] = (metric_dict[key], AverageMeter( + key, '7.4f', need_avg=True)) + + return fetchs + + +def create_optimizer(config, step_each_epoch): + # create learning_rate instance + optimizer, lr_sch = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], step_each_epoch) + return optimizer, lr_sch + + +def create_strategy(config): + """ + Create build strategy and exec strategy. + + Args: + config(dict): config + + Returns: + build_strategy: build strategy + exec_strategy: exec strategy + """ + build_strategy = paddle.static.BuildStrategy() + exec_strategy = paddle.static.ExecutionStrategy() + + exec_strategy.num_threads = 1 + exec_strategy.num_iteration_per_drop_scope = ( + 10000 + if 'AMP' in config and config.AMP.get("level", "O1") == "O2" else 10) + + fuse_op = True if 'AMP' in config else False + + fuse_bn_act_ops = config.get('fuse_bn_act_ops', fuse_op) + fuse_elewise_add_act_ops = config.get('fuse_elewise_add_act_ops', fuse_op) + fuse_bn_add_act_ops = config.get('fuse_bn_add_act_ops', fuse_op) + enable_addto = config.get('enable_addto', fuse_op) + + build_strategy.fuse_bn_act_ops = fuse_bn_act_ops + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_bn_add_act_ops = fuse_bn_add_act_ops + build_strategy.enable_addto = enable_addto + + return build_strategy, exec_strategy + + +def dist_optimizer(config, optimizer): + """ + Create a distributed optimizer based on a normal optimizer + + Args: + config(dict): + optimizer(): a normal optimizer + + Returns: + optimizer: a distributed optimizer + """ + build_strategy, exec_strategy = create_strategy(config) + + dist_strategy = DistributedStrategy() + dist_strategy.execution_strategy = exec_strategy + dist_strategy.build_strategy = build_strategy + + dist_strategy.nccl_comm_num = 1 + dist_strategy.fuse_all_reduce_ops = True + dist_strategy.fuse_grad_size_in_MB = 16 + optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) + + return optimizer + + +def mixed_precision_optimizer(config, optimizer): + if 'AMP' in config: + amp_cfg = config.AMP if config.AMP else dict() + scale_loss = amp_cfg.get('scale_loss', 1.0) + use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling', + False) + use_pure_fp16 = amp_cfg.get("level", "O1") == "O2" + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling, + use_pure_fp16=use_pure_fp16, + use_fp16_guard=True) + + return optimizer + + +def build(config, + main_prog, + startup_prog, + class_num=None, + step_each_epoch=100, + is_train=True, + is_distributed=True): + """ + Build a program using a model and an optimizer + 1. create feeds + 2. create a dataloader + 3. create a model + 4. create fetchs + 5. 
create an optimizer + + Args: + config(dict): config + main_prog(): main program + startup_prog(): startup program + class_num(int): the class number of network, required if use_mix + is_train(bool): train or eval + is_distributed(bool): whether to use distributed training method + + Returns: + dataloader(): a bridge between the model and the data + fetchs(dict): dict of model outputs(included loss and measures) + """ + with paddle.static.program_guard(main_prog, startup_prog): + with paddle.utils.unique_name.guard(): + mode = "Train" if is_train else "Eval" + use_mix = "batch_transform_ops" in config["DataLoader"][mode][ + "dataset"] + feeds = create_feeds( + config["Global"]["image_shape"], + use_mix, + class_num=class_num, + dtype="float32") + + # build model + # data_format should be assigned in arch-dict + input_image_channel = config["Global"]["image_shape"][ + 0] # default as [3, 224, 224] + model = build_model(config) + out = model(feeds["data"]) + # end of build model + + fetchs = create_fetchs( + out, + feeds, + config["Arch"], + epsilon=config.get('ls_epsilon'), + class_num=class_num, + use_mix=use_mix, + config=config, + mode=mode) + lr_scheduler = None + optimizer = None + if is_train: + optimizer, lr_scheduler = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], + step_each_epoch) + optimizer = mixed_precision_optimizer(config, optimizer) + if is_distributed: + optimizer = dist_optimizer(config, optimizer) + optimizer.minimize(fetchs['loss'][0]) + return fetchs, lr_scheduler, feeds, optimizer + + +def compile(config, program, loss_name=None, share_prog=None): + """ + Compile the program + + Args: + config(dict): config + program(): the program which is wrapped by + loss_name(str): loss name + share_prog(): the shared program, used for evaluation during training + + Returns: + compiled_program(): a compiled program + """ + build_strategy, exec_strategy = create_strategy(config) + + compiled_program = paddle.static.CompiledProgram( + program).with_data_parallel( + share_vars_from=share_prog, + loss_name=loss_name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + return compiled_program + + +total_step = 0 + + +def run(dataloader, + exe, + program, + feeds, + fetchs, + epoch=0, + mode='train', + config=None, + vdl_writer=None, + lr_scheduler=None, + profiler_options=None): + """ + Feed data to the model and fetch the measures and loss + + Args: + dataloader(paddle io dataloader): + exe(): + program(): + fetchs(dict): dict of measures and the loss + epoch(int): epoch of training or evaluation + model(str): log only + + Returns: + """ + fetch_list = [f[0] for f in fetchs.values()] + metric_dict = OrderedDict([("lr", AverageMeter( + 'lr', 'f', postfix=",", need_avg=False))]) + + for k in fetchs: + metric_dict[k] = fetchs[k][1] + + metric_dict["batch_time"] = AverageMeter( + 'batch_cost', '.5f', postfix=" s,") + metric_dict["reader_time"] = AverageMeter( + 'reader_cost', '.5f', postfix=" s,") + + for m in metric_dict.values(): + m.reset() + + use_dali = config["Global"].get('use_dali', False) + tic = time.time() + + if not use_dali: + dataloader = dataloader() + + idx = 0 + batch_size = None + while True: + # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG + try: + batch = next(dataloader) + except StopIteration: + break + except RuntimeError: + logger.warning( + "Except RuntimeError when reading data from dataloader, try to read once again..." 
+ ) + continue + idx += 1 + # ignore the warmup iters + if idx == 5: + metric_dict["batch_time"].reset() + metric_dict["reader_time"].reset() + + metric_dict['reader_time'].update(time.time() - tic) + + profiler.add_profiler_step(profiler_options) + + if use_dali: + batch_size = batch[0]["data"].shape()[0] + feed_dict = batch[0] + else: + batch_size = batch[0].shape()[0] + feed_dict = { + key.name: batch[idx] + for idx, key in enumerate(feeds.values()) + } + + metrics = exe.run(program=program, + feed=feed_dict, + fetch_list=fetch_list) + + for name, m in zip(fetchs.keys(), metrics): + metric_dict[name].update(np.mean(m), batch_size) + metric_dict["batch_time"].update(time.time() - tic) + if mode == "train": + metric_dict['lr'].update(lr_scheduler.get_lr()) + + fetchs_str = ' '.join([ + str(metric_dict[key].mean) + if "time" in key else str(metric_dict[key].value) + for key in metric_dict + ]) + ips_info = " ips: {:.5f} images/sec.".format( + batch_size / metric_dict["batch_time"].avg) + fetchs_str += ips_info + + if lr_scheduler is not None: + lr_scheduler.step() + + if vdl_writer: + global total_step + logger.scaler('loss', metrics[0][0], total_step, vdl_writer) + total_step += 1 + if mode == 'eval': + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} step:{:<4d} {:s}".format(mode, idx, + fetchs_str)) + else: + epoch_str = "epoch:{:<3d}".format(epoch) + step_str = "{:s} step:{:<4d}".format(mode, idx) + + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} {:s} {:s}".format(epoch_str, step_str, + fetchs_str)) + + tic = time.time() + + end_str = ' '.join([str(m.mean) for m in metric_dict.values()] + + [metric_dict["batch_time"].total]) + ips_info = "ips: {:.5f} images/sec.".format(batch_size / + metric_dict["batch_time"].avg) + if mode == 'eval': + logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info)) + else: + end_epoch_str = "END epoch:{:<3d}".format(epoch) + logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str, + ips_info)) + if use_dali: + dataloader.reset() + + # return top1_acc in order to save the best model + if mode == 'eval': + return fetchs["top1"][1].avg diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/static/save_load.py b/cv/classification/resnet50/paddlepaddle/ppcls/static/save_load.py new file mode 100644 index 000000000..13badfddc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/static/save_load.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
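+#
+# Note: a minimal usage sketch of this module (paths and config keys below are
+# illustrative, following how ppcls/static/train.py calls these helpers):
+#
+#     # resume full training state from a saved prefix:
+#     #     Global.checkpoints: ./output/ResNet50/0/ppcls
+#     # or load pretrained weights only (optimizer state is ignored):
+#     #     Global.pretrained_model: ./pretrained/ResNet50_pretrained
+#     init_model(config["Global"], train_prog, exe)
+#     save_model(train_prog, "./output/ResNet50", epoch_id, prefix="ppcls")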
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import re +import shutil +import tempfile + +import paddle + +from ppcls.utils import logger + +__all__ = ['init_model', 'save_model'] + + +def _mkdir_if_not_exist(path): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def _load_state(path): + if os.path.exists(path + '.pdopt'): + # XXX another hack to ignore the optimizer state + tmp = tempfile.mkdtemp() + dst = os.path.join(tmp, os.path.basename(os.path.normpath(path))) + shutil.copy(path + '.pdparams', dst + '.pdparams') + state = paddle.static.load_program_state(dst) + shutil.rmtree(tmp) + else: + state = paddle.static.load_program_state(path) + return state + + +def load_params(exe, prog, path, ignore_params=None): + """ + Load model from the given path. + Args: + exe (fluid.Executor): The fluid.Executor object. + prog (fluid.Program): load weight to which Program object. + path (string): URL string or loca model path. + ignore_params (list): ignore variable to load when finetuning. + It can be specified by finetune_exclude_pretrained_params + and the usage can refer to the document + docs/advanced_tutorials/TRANSFER_LEARNING.md + """ + if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) + + logger.info("Loading parameters from {}...".format(path)) + + ignore_set = set() + state = _load_state(path) + + # ignore the parameter which mismatch the shape + # between the model and pretrain weight. 
+ all_var_shape = {} + for block in prog.blocks: + for param in block.all_parameters(): + all_var_shape[param.name] = param.shape + ignore_set.update([ + name for name, shape in all_var_shape.items() + if name in state and shape != state[name].shape + ]) + + if ignore_params: + all_var_names = [var.name for var in prog.list_vars()] + ignore_list = filter( + lambda var: any([re.match(name, var) for name in ignore_params]), + all_var_names) + ignore_set.update(list(ignore_list)) + + if len(ignore_set) > 0: + for k in ignore_set: + if k in state: + logger.warning( + 'variable {} is already excluded automatically'.format(k)) + del state[k] + + paddle.static.set_program_state(prog, state) + + +def init_model(config, program, exe): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints: + paddle.static.load(program, checkpoints, exe) + logger.info("Finish initing model from {}".format(checkpoints)) + return + + pretrained_model = config.get('pretrained_model') + if pretrained_model: + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + for pretrain in pretrained_model: + load_params(exe, program, pretrain) + logger.info("Finish initing model from {}".format(pretrained_model)) + + +def save_model(program, model_path, epoch_id, prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, str(epoch_id)) + _mkdir_if_not_exist(model_path) + model_prefix = os.path.join(model_path, prefix) + paddle.static.save(program, model_prefix) + logger.info("Already save model in {}".format(model_path)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/static/train.py b/cv/classification/resnet50/paddlepaddle/ppcls/static/train.py new file mode 100644 index 000000000..dd16cdb4c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/static/train.py @@ -0,0 +1,212 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
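+#
+# Note: illustrative launch commands for this static-graph trainer (config paths
+# and GPU ids are examples, not fixed values):
+#
+#     # single process:
+#     #     python3 ppcls/static/train.py -c configs/ResNet/ResNet50.yaml -o Global.epochs=1
+#     # multi card via fleet (is_distributed defaults to True):
+#     #     python3 -m paddle.distributed.launch --gpus "0,1" ppcls/static/train.py \
+#     #         -c configs/ResNet/ResNet50.yaml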
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +import paddle +from paddle.distributed import fleet +from visualdl import LogWriter + +from ppcls.data import build_dataloader +from ppcls.utils.config import get_config, print_config +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.static.save_load import init_model, save_model +from ppcls.static import program + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleClas train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/ResNet/ResNet50.yaml', + help='config file path') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + args = parser.parse_args() + return args + + +def main(args): + """ + all the config of training paradigm should be in config["Global"] + """ + config = get_config(args.config, overrides=args.override, show=False) + global_config = config["Global"] + + mode = "train" + + log_file = os.path.join(global_config['output_dir'], + config["Arch"]["name"], f"{mode}.log") + init_logger(log_file=log_file) + print_config(config) + + if global_config.get("is_distributed", True): + fleet.init(is_collective=True) + # assign the device + use_gpu = global_config.get("use_gpu", True) + # amp related config + if 'AMP' in config: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_conv_workspace_size_limit': 1500, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + + use_xpu = global_config.get("use_xpu", False) + use_npu = global_config.get("use_npu", False) + use_mlu = global_config.get("use_mlu", False) + assert ( + use_gpu and use_xpu and use_npu and use_mlu + ) is not True, "gpu, xpu, npu and mlu can not be true in the same time in static mode!" 
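+    # The device flags above are expected to be mutually exclusive. The branch
+    # below selects the runtime place via paddle.set_device(); the static
+    # Executor created further down runs on this device.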
+ + if use_gpu: + device = paddle.set_device('gpu') + elif use_xpu: + device = paddle.set_device('xpu') + elif use_npu: + device = paddle.set_device('npu') + elif use_mlu: + device = paddle.set_device('mlu') + else: + device = paddle.set_device('cpu') + + # visualDL + vdl_writer = None + if global_config["use_visualdl"]: + vdl_dir = os.path.join(global_config["output_dir"], "vdl") + vdl_writer = LogWriter(vdl_dir) + + # build dataloader + eval_dataloader = None + use_dali = global_config.get('use_dali', False) + + class_num = config["Arch"].get("class_num", None) + config["DataLoader"].update({"class_num": class_num}) + train_dataloader = build_dataloader( + config["DataLoader"], "Train", device=device, use_dali=use_dali) + if global_config["eval_during_train"]: + eval_dataloader = build_dataloader( + config["DataLoader"], "Eval", device=device, use_dali=use_dali) + + step_each_epoch = len(train_dataloader) + + # startup_prog is used to do some parameter init work, + # and train prog is used to hold the network + startup_prog = paddle.static.Program() + train_prog = paddle.static.Program() + + best_top1_acc = 0.0 # best top1 acc record + + train_fetchs, lr_scheduler, train_feeds, optimizer = program.build( + config, + train_prog, + startup_prog, + class_num, + step_each_epoch=step_each_epoch, + is_train=True, + is_distributed=global_config.get("is_distributed", True)) + + if global_config["eval_during_train"]: + eval_prog = paddle.static.Program() + eval_fetchs, _, eval_feeds, _ = program.build( + config, + eval_prog, + startup_prog, + is_train=False, + is_distributed=global_config.get("is_distributed", True)) + # clone to prune some content which is irrelevant in eval_prog + eval_prog = eval_prog.clone(for_test=True) + + # create the "Executor" with the statement of which device + exe = paddle.static.Executor(device) + # Parameter initialization + exe.run(startup_prog) + # load pretrained models or checkpoints + init_model(global_config, train_prog, exe) + + if 'AMP' in config and config.AMP.get("level", "O1") == "O2": + optimizer.amp_init( + device, + scope=paddle.static.global_scope(), + test_program=eval_prog + if global_config["eval_during_train"] else None) + + if not global_config.get("is_distributed", True): + compiled_train_prog = program.compile( + config, train_prog, loss_name=train_fetchs["loss"][0].name) + else: + compiled_train_prog = train_prog + + if eval_dataloader is not None: + compiled_eval_prog = program.compile(config, eval_prog) + + for epoch_id in range(global_config["epochs"]): + # 1. train with train dataset + program.run(train_dataloader, exe, compiled_train_prog, train_feeds, + train_fetchs, epoch_id, 'train', config, vdl_writer, + lr_scheduler, args.profiler_options) + # 2. evaate with eval dataset + if global_config["eval_during_train"] and epoch_id % global_config[ + "eval_interval"] == 0: + top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog, + eval_feeds, eval_fetchs, epoch_id, "eval", + config) + if top1_acc > best_top1_acc: + best_top1_acc = top1_acc + message = "The best top1 acc {:.5f}, in epoch: {:d}".format( + best_top1_acc, epoch_id) + logger.info(message) + if epoch_id % global_config["save_interval"] == 0: + + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, "best_model") + + # 3. 
save the persistable model + if epoch_id % global_config["save_interval"] == 0: + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, epoch_id) + + +if __name__ == '__main__': + paddle.enable_static() + args = parse_args() + main(args) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/__init__.py new file mode 100644 index 000000000..632cc7882 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import misc +from . import model_zoo +from . import metrics + +from .save_load import init_model, save_model +from .config import get_config +from .misc import AverageMeter +from .metrics import multi_hot_encode +from .metrics import hamming_distance +from .metrics import accuracy_score +from .metrics import precision_recall_fscore +from .metrics import mean_average_precision diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/check.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/check.py new file mode 100644 index 000000000..bc7030818 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/check.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import paddle +from paddle import is_compiled_with_cuda + +from ppcls.arch import get_architectures +from ppcls.arch import similar_architectures +from ppcls.arch import get_blacklist_model_in_static_mode +from ppcls.utils import logger + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.8.0 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." + try: + pass + # paddle.utils.require_version('0.0.0') + except Exception: + logger.error(err) + sys.exit(1) + + +def check_gpu(): + """ + Log error and exit when using paddlepaddle cpu version. + """ + err = "You are using paddlepaddle cpu version! Please try to " \ + "install paddlepaddle-gpu to run model on GPU." 
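+    # paddle.is_compiled_with_cuda() is False on CPU-only builds, so the check
+    # below fails fast instead of erroring later at paddle.set_device('gpu').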
+
+    try:
+        assert is_compiled_with_cuda()
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_architecture(architecture):
+    """
+    check architecture and recommend similar architectures
+    """
+    assert isinstance(architecture, dict), \
+        ("the type of architecture({}) should be dict". format(architecture))
+    assert "name" in architecture, \
+        ("name must be in the architecture keys, just contains: {}". format(
+            architecture.keys()))
+
+    similar_names = similar_architectures(architecture["name"],
+                                           get_architectures())
+    model_list = ', '.join(similar_names)
+    err = "Architecture [{}] does not exist! Maybe you want: [{}]" \
+          "".format(architecture["name"], model_list)
+    try:
+        assert architecture["name"] in similar_names
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_model_with_running_mode(architecture):
+    """
+    check whether the model is consistent with the operating mode
+    """
+    # some models are not supported in the static mode
+    blacklist = get_blacklist_model_in_static_mode()
+    if not paddle.in_dynamic_mode() and architecture["name"] in blacklist:
+        logger.error("Model: {} is not supported in the static mode.".format(
+            architecture["name"]))
+        sys.exit(1)
+    return
+
+
+def check_mix(architecture, use_mix=False):
+    """
+    check mix parameter
+    """
+    err = "Cannot use mix processing in GoogLeNet, " \
+          "please set use_mix = False."
+    try:
+        if architecture["name"] == "GoogLeNet":
+            assert use_mix is not True
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_classes_num(classes_num):
+    """
+    check classes_num
+    """
+    err = "classes_num({}) should be a positive integer " \
+          "and larger than 1".format(classes_num)
+    try:
+        assert isinstance(classes_num, int)
+        assert classes_num > 1
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_data_dir(path):
+    """
+    check data_dir
+    """
+    err = "Data path {} does not exist, " \
+          "please provide a valid path.".format(path)
+    try:
+        assert os.path.isdir(path)
+    except AssertionError:
+        logger.error(err)
+        sys.exit(1)
+
+
+def check_function_params(config, key):
+    """
+    check the specified config
+    """
+    k_config = config.get(key)
+    assert k_config is not None, \
+        ('{} is required in config'.format(key))
+
+    assert k_config.get('function'), \
+        ('function is required in {} config'.format(key))
+    params = k_config.get('params')
+    assert params is not None, \
+        ('params is required in {} config'.format(key))
+    assert isinstance(params, dict), \
+        ('the params in {} config should be a dict'.format(key))
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/config.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/config.py
new file mode 100644
index 000000000..e3277c480
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/config.py
@@ -0,0 +1,210 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
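+#
+# Note: a short sketch of how these helpers are typically combined (the YAML
+# path and override keys are examples only):
+#
+#     cfg = get_config("configs/ResNet/ResNet50.yaml",
+#                      overrides=["Global.epochs=1", "Global.output_dir=./output"],
+#                      show=True)
+#
+# Each "a.b.c=value" entry is split on "=" and then on "." by override_config()
+# and written into the nested AttrDict returned by parse_config().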
+ +import os +import copy +import argparse +import yaml +from ppcls.utils import logger +from ppcls.utils import check +__all__ = ['get_config'] + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + def __deepcopy__(self, content): + return copy.deepcopy(dict(self)) + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. + """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", k, v)) + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + logger.advertise() + print_dict(config) + + +def check_config(config): + """ + Check config + """ + check.check_version() + use_gpu = config.get('use_gpu', True) + if use_gpu: + check.check_gpu() + architecture = config.get('ARCHITECTURE') + #check.check_architecture(architecture) + use_mix = config.get('use_mix', False) + check.check_mix(architecture, use_mix) + classes_num = config.get('classes_num') + check.check_classes_num(classes_num) + mode = config.get('mode', 'train') + if mode.lower() == 'train': + check.check_function_params(config, 'LEARNING_RATE') + check.check_function_params(config, 'OPTIMIZER') + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + print('A new filed ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + 'topk=2', + 'VALID.transforms.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert 
isinstance(opt, str), ( + "option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + return config + + +def get_config(fname, overrides=None, show=False): + """ + Read config from file + """ + assert os.path.exists(fname), ( + 'config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + # check_config(config) + return config + + +def parse_args(): + parser = argparse.ArgumentParser("generic-image-rec train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/config.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + args = parser.parse_args() + return args diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/download.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/download.py new file mode 100644 index 000000000..9c4575048 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/download.py @@ -0,0 +1,319 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import os.path as osp +import shutil +import requests +import hashlib +import tarfile +import zipfile +import time +from collections import OrderedDict +from tqdm import tqdm + +from ppcls.utils import logger + +__all__ = ['get_weights_path_from_url'] + +WEIGHTS_HOME = osp.expanduser("~/.paddleclas/weights") + +DOWNLOAD_RETRY_LIMIT = 3 + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith('http://') or path.startswith('https://') + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + + Examples: + .. 
code-block:: python + + from paddle.utils.download import get_weights_path_from_url + + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.fluid.dygraph.parallel import ParallelEnv + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different ips will download + # data, and the same ip will only download data once. + unique_endpoints = _get_unique_endpoints(ParallelEnv() + .trainer_endpoints[:]) + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if ParallelEnv().current_endpoint in unique_endpoints: + fullpath = _download(url, root_dir, md5sum) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if ParallelEnv().current_endpoint in unique_endpoints: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. " + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info( + "Downloading {} from {} failed {} times with exception {}". 
+ format(fname, url, retry_cnt + 1, str(e))) + time.sleep(1) + continue + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + for item in file_list: + files.extract(item, os.path.join(file_dir, 
rootpath)) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if '/' in file_path: + file_path = file_path.replace('/', os.sep) + elif '\\' in file_path: + file_path = file_path.replace('\\', os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/ema.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/ema.py new file mode 100644 index 000000000..b54cdb1b2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/ema.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np + + +class ExponentialMovingAverage(): + """ + Exponential Moving Average + Code was heavily based on https://github.com/Wanger-SJTU/SegToolbox.Pytorch/blob/master/lib/utils/ema.py + """ + + def __init__(self, model, decay, thres_steps=True): + self._model = model + self._decay = decay + self._thres_steps = thres_steps + self._shadow = {} + self._backup = {} + + def register(self): + self._update_step = 0 + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + self._shadow[name] = param.numpy().copy() + + def update(self): + decay = min(self._decay, (1 + self._update_step) / ( + 10 + self._update_step)) if self._thres_steps else self._decay + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + assert name in self._shadow + new_val = np.array(param.numpy().copy()) + old_val = np.array(self._shadow[name]) + new_average = decay * old_val + (1 - decay) * new_val + self._shadow[name] = new_average + self._update_step += 1 + return decay + + def apply(self): + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + assert name in self._shadow + self._backup[name] = np.array(param.numpy().copy()) + param.set_value(np.array(self._shadow[name])) + + def restore(self): + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + assert name in self._backup + param.set_value(self._backup[name]) + self._backup = {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/fm_vis.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/fm_vis.py new file mode 100644 index 000000000..a5368b10e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/fm_vis.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
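For context, a rough usage sketch for the ExponentialMovingAverage helper added above. It assumes paddle is installed and that ppcls.utils.ema is importable from the repository root; the tiny Linear model, the decay value, and the omitted optimizer step are placeholders.

    import paddle
    from ppcls.utils.ema import ExponentialMovingAverage  # path assumes the ppcls package root is on sys.path

    model = paddle.nn.Linear(8, 2)
    ema = ExponentialMovingAverage(model, decay=0.999)
    ema.register()                  # snapshot the current weights as the shadow copy

    for step in range(10):
        x = paddle.randn([4, 8])
        loss = model(x).mean()
        loss.backward()
        # ... optimizer.step() / optimizer.clear_grad() would go here ...
        ema.update()                # fold the updated weights into the shadow copy

    ema.apply()                     # evaluate with the averaged weights
    # ... run validation here ...
    ema.restore()                   # switch back to the raw training weights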
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import cv2 +import utils +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../..'))) + +import paddle +from paddle.distributed import ParallelEnv + +from resnet import ResNet50 +from ppcls.utils.save_load import load_dygraph_pretrain + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--image_file", required=True, type=str) + parser.add_argument("-c", "--channel_num", type=int) + parser.add_argument("-p", "--pretrained_model", type=str) + parser.add_argument("--show", type=str2bool, default=False) + parser.add_argument("--interpolation", type=int, default=1) + parser.add_argument("--save_path", type=str, default=None) + parser.add_argument("--use_gpu", type=str2bool, default=True) + + return parser.parse_args() + + +def create_operators(interpolation=1): + size = 224 + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + img_scale = 1.0 / 255.0 + + resize_op = utils.ResizeImage( + resize_short=256, interpolation=interpolation) + crop_op = utils.CropImage(size=(size, size)) + normalize_op = utils.NormalizeImage( + scale=img_scale, mean=img_mean, std=img_std) + totensor_op = utils.ToTensor() + + return [resize_op, crop_op, normalize_op, totensor_op] + + +def preprocess(data, ops): + for op in ops: + data = op(data) + return data + + +def main(): + args = parse_args() + operators = create_operators(args.interpolation) + # assign the place + place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu' + place = paddle.set_device(place) + + net = ResNet50() + load_dygraph_pretrain(net, args.pretrained_model) + + img = cv2.imread(args.image_file, cv2.IMREAD_COLOR) + data = preprocess(img, operators) + data = np.expand_dims(data, axis=0) + data = paddle.to_tensor(data) + net.eval() + _, fm = net(data) + assert args.channel_num >= 0 and args.channel_num <= fm.shape[ + 1], "the channel is out of the range, should be in {} but got {}".format( + [0, fm.shape[1]], args.channel_num) + + fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8) + fm = cv2.resize(fm, (img.shape[1], img.shape[0])) + if args.save_path is not None: + print("the feature map is saved in path: {}".format(args.save_path)) + cv2.imwrite(args.save_path, fm) + + +if __name__ == "__main__": + main() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/resnet.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/resnet.py new file mode 100644 index 000000000..b75881414 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/resnet.py @@ -0,0 +1,535 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
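For context, a small sketch of the channel-to-heatmap conversion that fm_vis.py performs on the feature map returned by the network. Only numpy and cv2 are assumed; the random array stands in for a real activation and the output filename is arbitrary.

    import numpy as np
    import cv2

    fm = np.random.rand(1, 64, 112, 112).astype('float32')  # fake [N, C, H, W] activation in [0, 1]
    channel_num = 5
    orig_h, orig_w = 480, 640                                # resolution of the original input image

    heatmap = (np.squeeze(fm[0][channel_num]) * 255).astype(np.uint8)
    heatmap = cv2.resize(heatmap, (orig_w, orig_h))          # cv2.resize takes (width, height)
    cv2.imwrite('feature_map.png', heatmap)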
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + if return_patterns is not None: + self.update_res(return_patterns) + self.register_forward_post_hook(self._return_dict_hook) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + fm = x + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = 
self.flatten(x) + x = self.fc(x) + return x, fm + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/utils.py new file mode 100644 index 000000000..7c7014932 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/feature_maps_visualization/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
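For reference, a rough sketch of how the visualization-oriented ResNet above is meant to be driven. It assumes paddle and the ppcls package are importable; the import path shown mirrors the file location, whereas fm_vis.py itself does `from resnet import ResNet50` after adjusting sys.path.

    import paddle
    # assumed import path; adjust to however the module is exposed on sys.path
    from ppcls.utils.feature_maps_visualization.resnet import ResNet50

    net = ResNet50(pretrained=False)        # NET_CONFIG["50"], version "vb"
    net.eval()

    x = paddle.randn([1, 3, 224, 224])
    logits, fm = net(x)                     # forward() returns (class scores, stem feature map)
    print(logits.shape)                     # expected [1, 1000]
    print(fm.shape)                         # expected [1, 64, 112, 112]: stem output before max-pool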
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np + + +class DecodeImage(object): + def __init__(self, to_rgb=True): + self.to_rgb = to_rgb + + def __call__(self, img): + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + return img + + +class ResizeImage(object): + def __init__(self, resize_short=None, interpolation=1): + self.resize_short = resize_short + self.interpolation = interpolation + + def __call__(self, img): + img_h, img_w = img.shape[:2] + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + return cv2.resize(img, (w, h), interpolation=self.interpolation) + + +class CropImage(object): + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class NormalizeImage(object): + def __init__(self, scale=None, mean=None, std=None): + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, img): + return (img.astype('float32') * self.scale - self.mean) / self.std + + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = img.transpose((2, 0, 1)) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/gallery2fc.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/gallery2fc.py new file mode 100644 index 000000000..67b08529e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/gallery2fc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
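For reference, a minimal sketch chaining the preprocessing ops defined above on a dummy image. cv2 and numpy are assumed, the import path mirrors the file location (fm_vis.py imports this module simply as `utils`), and the 256/224 sizes follow the defaults used in fm_vis.py.

    import numpy as np
    # assumed import path; fm_vis.py adds this directory to sys.path and imports `utils` directly
    from ppcls.utils.feature_maps_visualization.utils import (
        ResizeImage, CropImage, NormalizeImage, ToTensor)

    ops = [
        ResizeImage(resize_short=256, interpolation=1),
        CropImage(size=224),
        NormalizeImage(scale=1.0 / 255.0,
                       mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225]),
        ToTensor(),
    ]

    img = (np.random.rand(480, 640, 3) * 255).astype('uint8')  # fake HWC image
    for op in ops:
        img = op(img)
    print(img.shape, img.dtype)  # (3, 224, 224) float32, ready for paddle.to_tensor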
+ +import os +import paddle +import cv2 + +from ppcls.arch import build_model +from ppcls.utils.config import parse_config, parse_args +from ppcls.utils.save_load import load_dygraph_pretrain +from ppcls.utils.logger import init_logger +from ppcls.data import create_operators +from ppcls.arch.slim import quantize_model + + +class GalleryLayer(paddle.nn.Layer): + def __init__(self, configs): + super().__init__() + self.configs = configs + embedding_size = self.configs["Arch"]["Head"]["embedding_size"] + self.batch_size = self.configs["IndexProcess"]["batch_size"] + self.image_shape = self.configs["Global"]["image_shape"].copy() + self.image_shape.insert(0, self.batch_size) + + image_root = self.configs["IndexProcess"]["image_root"] + data_file = self.configs["IndexProcess"]["data_file"] + delimiter = self.configs["IndexProcess"]["delimiter"] + self.gallery_images = [] + gallery_docs = [] + gallery_labels = [] + + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + for ori_line in lines: + line = ori_line.strip().split(delimiter) + text_num = len(line) + assert text_num >= 2, f"line({ori_line}) must be splitted into at least 2 parts, but got {text_num}" + image_file = os.path.join(image_root, line[0]) + + self.gallery_images.append(image_file) + gallery_docs.append(ori_line.strip()) + gallery_labels.append(line[1].strip()) + self.gallery_layer = paddle.nn.Linear(embedding_size, len(self.gallery_images), bias_attr=False) + self.gallery_layer.skip_quant = True + output_label_str = "" + for i, label_i in enumerate(gallery_labels): + output_label_str += "{} {}\n".format(i, label_i) + output_path = configs["Global"]["save_inference_dir"] + "_label.txt" + with open(output_path, "w") as f: + f.write(output_label_str) + + def forward(self, x, label=None): + x = paddle.nn.functional.normalize(x) + x = self.gallery_layer(x) + return x + + def build_gallery_layer(self, feature_extractor): + transform_configs = self.configs["IndexProcess"]["transform_ops"] + preprocess_ops = create_operators(transform_configs) + embedding_size = self.configs["Arch"]["Head"]["embedding_size"] + batch_index = 0 + input_tensor = paddle.zeros(self.image_shape) + gallery_feature = paddle.zeros((len(self.gallery_images), embedding_size)) + for i, image_path in enumerate(self.gallery_images): + image = cv2.imread(image_path)[:, :, ::-1] + for op in preprocess_ops: + image = op(image) + input_tensor[batch_index] = image + batch_index += 1 + if batch_index == self.batch_size or i == len(self.gallery_images) - 1: + batch_feature = feature_extractor(input_tensor)["features"] + for j in range(batch_index): + feature = batch_feature[j] + norm_feature = paddle.nn.functional.normalize(feature, axis=0) + gallery_feature[i - batch_index + j + 1] = norm_feature + self.gallery_layer.set_state_dict({"_layer.weight": gallery_feature.T}) + + +def export_fuse_model(configs): + slim_config = configs["Slim"].copy() + configs["Slim"] = None + fuse_model = build_model(configs) + fuse_model.head = GalleryLayer(configs) + configs["Slim"] = slim_config + quantize_model(configs, fuse_model) + load_dygraph_pretrain(fuse_model, configs["Global"]["pretrained_model"]) + fuse_model.eval() + fuse_model.head.build_gallery_layer(fuse_model) + save_path = configs["Global"]["save_inference_dir"] + fuse_model.quanter.save_quantized_model( + fuse_model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + configs["Global"]["image_shape"], + dtype='float32') + ]) + + +def main(): + args = parse_args() + configs = 
parse_config(args.config) + init_logger(name='gallery2fc') + export_fuse_model(configs) + + +if __name__ == '__main__': + main() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/imagenet1k_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls/utils/imagenet1k_label_list.txt new file mode 100644 index 000000000..376e18021 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/imagenet1k_label_list.txt @@ -0,0 +1,1000 @@ +0 tench, Tinca tinca +1 goldfish, Carassius auratus +2 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +3 tiger shark, Galeocerdo cuvieri +4 hammerhead, hammerhead shark +5 electric ray, crampfish, numbfish, torpedo +6 stingray +7 cock +8 hen +9 ostrich, Struthio camelus +10 brambling, Fringilla montifringilla +11 goldfinch, Carduelis carduelis +12 house finch, linnet, Carpodacus mexicanus +13 junco, snowbird +14 indigo bunting, indigo finch, indigo bird, Passerina cyanea +15 robin, American robin, Turdus migratorius +16 bulbul +17 jay +18 magpie +19 chickadee +20 water ouzel, dipper +21 kite +22 bald eagle, American eagle, Haliaeetus leucocephalus +23 vulture +24 great grey owl, great gray owl, Strix nebulosa +25 European fire salamander, Salamandra salamandra +26 common newt, Triturus vulgaris +27 eft +28 spotted salamander, Ambystoma maculatum +29 axolotl, mud puppy, Ambystoma mexicanum +30 bullfrog, Rana catesbeiana +31 tree frog, tree-frog +32 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +33 loggerhead, loggerhead turtle, Caretta caretta +34 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +35 mud turtle +36 terrapin +37 box turtle, box tortoise +38 banded gecko +39 common iguana, iguana, Iguana iguana +40 American chameleon, anole, Anolis carolinensis +41 whiptail, whiptail lizard +42 agama +43 frilled lizard, Chlamydosaurus kingi +44 alligator lizard +45 Gila monster, Heloderma suspectum +46 green lizard, Lacerta viridis +47 African chameleon, Chamaeleo chamaeleon +48 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +49 African crocodile, Nile crocodile, Crocodylus niloticus +50 American alligator, Alligator mississipiensis +51 triceratops +52 thunder snake, worm snake, Carphophis amoenus +53 ringneck snake, ring-necked snake, ring snake +54 hognose snake, puff adder, sand viper +55 green snake, grass snake +56 king snake, kingsnake +57 garter snake, grass snake +58 water snake +59 vine snake +60 night snake, Hypsiglena torquata +61 boa constrictor, Constrictor constrictor +62 rock python, rock snake, Python sebae +63 Indian cobra, Naja naja +64 green mamba +65 sea snake +66 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +67 diamondback, diamondback rattlesnake, Crotalus adamanteus +68 sidewinder, horned rattlesnake, Crotalus cerastes +69 trilobite +70 harvestman, daddy longlegs, Phalangium opilio +71 scorpion +72 black and gold garden spider, Argiope aurantia +73 barn spider, Araneus cavaticus +74 garden spider, Aranea diademata +75 black widow, Latrodectus mactans +76 tarantula +77 wolf spider, hunting spider +78 tick +79 centipede +80 black grouse +81 ptarmigan +82 ruffed grouse, partridge, Bonasa umbellus +83 prairie chicken, prairie grouse, prairie fowl +84 peacock +85 quail +86 partridge +87 African grey, African gray, Psittacus erithacus +88 macaw +89 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +90 lorikeet +91 coucal +92 bee eater +93 hornbill +94 hummingbird +95 jacamar +96 
toucan +97 drake +98 red-breasted merganser, Mergus serrator +99 goose +100 black swan, Cygnus atratus +101 tusker +102 echidna, spiny anteater, anteater +103 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +104 wallaby, brush kangaroo +105 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +106 wombat +107 jellyfish +108 sea anemone, anemone +109 brain coral +110 flatworm, platyhelminth +111 nematode, nematode worm, roundworm +112 conch +113 snail +114 slug +115 sea slug, nudibranch +116 chiton, coat-of-mail shell, sea cradle, polyplacophore +117 chambered nautilus, pearly nautilus, nautilus +118 Dungeness crab, Cancer magister +119 rock crab, Cancer irroratus +120 fiddler crab +121 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +122 American lobster, Northern lobster, Maine lobster, Homarus americanus +123 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +124 crayfish, crawfish, crawdad, crawdaddy +125 hermit crab +126 isopod +127 white stork, Ciconia ciconia +128 black stork, Ciconia nigra +129 spoonbill +130 flamingo +131 little blue heron, Egretta caerulea +132 American egret, great white heron, Egretta albus +133 bittern +134 crane +135 limpkin, Aramus pictus +136 European gallinule, Porphyrio porphyrio +137 American coot, marsh hen, mud hen, water hen, Fulica americana +138 bustard +139 ruddy turnstone, Arenaria interpres +140 red-backed sandpiper, dunlin, Erolia alpina +141 redshank, Tringa totanus +142 dowitcher +143 oystercatcher, oyster catcher +144 pelican +145 king penguin, Aptenodytes patagonica +146 albatross, mollymawk +147 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +148 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +149 dugong, Dugong dugon +150 sea lion +151 Chihuahua +152 Japanese spaniel +153 Maltese dog, Maltese terrier, Maltese +154 Pekinese, Pekingese, Peke +155 Shih-Tzu +156 Blenheim spaniel +157 papillon +158 toy terrier +159 Rhodesian ridgeback +160 Afghan hound, Afghan +161 basset, basset hound +162 beagle +163 bloodhound, sleuthhound +164 bluetick +165 black-and-tan coonhound +166 Walker hound, Walker foxhound +167 English foxhound +168 redbone +169 borzoi, Russian wolfhound +170 Irish wolfhound +171 Italian greyhound +172 whippet +173 Ibizan hound, Ibizan Podenco +174 Norwegian elkhound, elkhound +175 otterhound, otter hound +176 Saluki, gazelle hound +177 Scottish deerhound, deerhound +178 Weimaraner +179 Staffordshire bullterrier, Staffordshire bull terrier +180 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +181 Bedlington terrier +182 Border terrier +183 Kerry blue terrier +184 Irish terrier +185 Norfolk terrier +186 Norwich terrier +187 Yorkshire terrier +188 wire-haired fox terrier +189 Lakeland terrier +190 Sealyham terrier, Sealyham +191 Airedale, Airedale terrier +192 cairn, cairn terrier +193 Australian terrier +194 Dandie Dinmont, Dandie Dinmont terrier +195 Boston bull, Boston terrier +196 miniature schnauzer +197 giant schnauzer +198 standard schnauzer +199 Scotch terrier, Scottish terrier, Scottie +200 Tibetan terrier, chrysanthemum dog +201 silky terrier, Sydney silky +202 soft-coated wheaten terrier +203 West Highland white terrier +204 Lhasa, Lhasa apso +205 flat-coated retriever +206 curly-coated retriever +207 golden retriever +208 Labrador retriever +209 Chesapeake Bay retriever +210 German short-haired pointer +211 
vizsla, Hungarian pointer +212 English setter +213 Irish setter, red setter +214 Gordon setter +215 Brittany spaniel +216 clumber, clumber spaniel +217 English springer, English springer spaniel +218 Welsh springer spaniel +219 cocker spaniel, English cocker spaniel, cocker +220 Sussex spaniel +221 Irish water spaniel +222 kuvasz +223 schipperke +224 groenendael +225 malinois +226 briard +227 kelpie +228 komondor +229 Old English sheepdog, bobtail +230 Shetland sheepdog, Shetland sheep dog, Shetland +231 collie +232 Border collie +233 Bouvier des Flandres, Bouviers des Flandres +234 Rottweiler +235 German shepherd, German shepherd dog, German police dog, alsatian +236 Doberman, Doberman pinscher +237 miniature pinscher +238 Greater Swiss Mountain dog +239 Bernese mountain dog +240 Appenzeller +241 EntleBucher +242 boxer +243 bull mastiff +244 Tibetan mastiff +245 French bulldog +246 Great Dane +247 Saint Bernard, St Bernard +248 Eskimo dog, husky +249 malamute, malemute, Alaskan malamute +250 Siberian husky +251 dalmatian, coach dog, carriage dog +252 affenpinscher, monkey pinscher, monkey dog +253 basenji +254 pug, pug-dog +255 Leonberg +256 Newfoundland, Newfoundland dog +257 Great Pyrenees +258 Samoyed, Samoyede +259 Pomeranian +260 chow, chow chow +261 keeshond +262 Brabancon griffon +263 Pembroke, Pembroke Welsh corgi +264 Cardigan, Cardigan Welsh corgi +265 toy poodle +266 miniature poodle +267 standard poodle +268 Mexican hairless +269 timber wolf, grey wolf, gray wolf, Canis lupus +270 white wolf, Arctic wolf, Canis lupus tundrarum +271 red wolf, maned wolf, Canis rufus, Canis niger +272 coyote, prairie wolf, brush wolf, Canis latrans +273 dingo, warrigal, warragal, Canis dingo +274 dhole, Cuon alpinus +275 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +276 hyena, hyaena +277 red fox, Vulpes vulpes +278 kit fox, Vulpes macrotis +279 Arctic fox, white fox, Alopex lagopus +280 grey fox, gray fox, Urocyon cinereoargenteus +281 tabby, tabby cat +282 tiger cat +283 Persian cat +284 Siamese cat, Siamese +285 Egyptian cat +286 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +287 lynx, catamount +288 leopard, Panthera pardus +289 snow leopard, ounce, Panthera uncia +290 jaguar, panther, Panthera onca, Felis onca +291 lion, king of beasts, Panthera leo +292 tiger, Panthera tigris +293 cheetah, chetah, Acinonyx jubatus +294 brown bear, bruin, Ursus arctos +295 American black bear, black bear, Ursus americanus, Euarctos americanus +296 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +297 sloth bear, Melursus ursinus, Ursus ursinus +298 mongoose +299 meerkat, mierkat +300 tiger beetle +301 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +302 ground beetle, carabid beetle +303 long-horned beetle, longicorn, longicorn beetle +304 leaf beetle, chrysomelid +305 dung beetle +306 rhinoceros beetle +307 weevil +308 fly +309 bee +310 ant, emmet, pismire +311 grasshopper, hopper +312 cricket +313 walking stick, walkingstick, stick insect +314 cockroach, roach +315 mantis, mantid +316 cicada, cicala +317 leafhopper +318 lacewing, lacewing fly +319 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +320 damselfly +321 admiral +322 ringlet, ringlet butterfly +323 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +324 cabbage butterfly +325 sulphur butterfly, sulfur butterfly +326 lycaenid, lycaenid butterfly +327 starfish, sea star +328 sea urchin +329 
sea cucumber, holothurian +330 wood rabbit, cottontail, cottontail rabbit +331 hare +332 Angora, Angora rabbit +333 hamster +334 porcupine, hedgehog +335 fox squirrel, eastern fox squirrel, Sciurus niger +336 marmot +337 beaver +338 guinea pig, Cavia cobaya +339 sorrel +340 zebra +341 hog, pig, grunter, squealer, Sus scrofa +342 wild boar, boar, Sus scrofa +343 warthog +344 hippopotamus, hippo, river horse, Hippopotamus amphibius +345 ox +346 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +347 bison +348 ram, tup +349 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +350 ibex, Capra ibex +351 hartebeest +352 impala, Aepyceros melampus +353 gazelle +354 Arabian camel, dromedary, Camelus dromedarius +355 llama +356 weasel +357 mink +358 polecat, fitch, foulmart, foumart, Mustela putorius +359 black-footed ferret, ferret, Mustela nigripes +360 otter +361 skunk, polecat, wood pussy +362 badger +363 armadillo +364 three-toed sloth, ai, Bradypus tridactylus +365 orangutan, orang, orangutang, Pongo pygmaeus +366 gorilla, Gorilla gorilla +367 chimpanzee, chimp, Pan troglodytes +368 gibbon, Hylobates lar +369 siamang, Hylobates syndactylus, Symphalangus syndactylus +370 guenon, guenon monkey +371 patas, hussar monkey, Erythrocebus patas +372 baboon +373 macaque +374 langur +375 colobus, colobus monkey +376 proboscis monkey, Nasalis larvatus +377 marmoset +378 capuchin, ringtail, Cebus capucinus +379 howler monkey, howler +380 titi, titi monkey +381 spider monkey, Ateles geoffroyi +382 squirrel monkey, Saimiri sciureus +383 Madagascar cat, ring-tailed lemur, Lemur catta +384 indri, indris, Indri indri, Indri brevicaudatus +385 Indian elephant, Elephas maximus +386 African elephant, Loxodonta africana +387 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +388 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +389 barracouta, snoek +390 eel +391 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +392 rock beauty, Holocanthus tricolor +393 anemone fish +394 sturgeon +395 gar, garfish, garpike, billfish, Lepisosteus osseus +396 lionfish +397 puffer, pufferfish, blowfish, globefish +398 abacus +399 abaya +400 academic gown, academic robe, judge's robe +401 accordion, piano accordion, squeeze box +402 acoustic guitar +403 aircraft carrier, carrier, flattop, attack aircraft carrier +404 airliner +405 airship, dirigible +406 altar +407 ambulance +408 amphibian, amphibious vehicle +409 analog clock +410 apiary, bee house +411 apron +412 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +413 assault rifle, assault gun +414 backpack, back pack, knapsack, packsack, rucksack, haversack +415 bakery, bakeshop, bakehouse +416 balance beam, beam +417 balloon +418 ballpoint, ballpoint pen, ballpen, Biro +419 Band Aid +420 banjo +421 bannister, banister, balustrade, balusters, handrail +422 barbell +423 barber chair +424 barbershop +425 barn +426 barometer +427 barrel, cask +428 barrow, garden cart, lawn cart, wheelbarrow +429 baseball +430 basketball +431 bassinet +432 bassoon +433 bathing cap, swimming cap +434 bath towel +435 bathtub, bathing tub, bath, tub +436 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +437 beacon, lighthouse, beacon light, pharos +438 beaker +439 bearskin, busby, shako +440 beer bottle +441 beer glass +442 bell cote, bell cot +443 bib +444 bicycle-built-for-two, tandem bicycle, tandem +445 
bikini, two-piece +446 binder, ring-binder +447 binoculars, field glasses, opera glasses +448 birdhouse +449 boathouse +450 bobsled, bobsleigh, bob +451 bolo tie, bolo, bola tie, bola +452 bonnet, poke bonnet +453 bookcase +454 bookshop, bookstore, bookstall +455 bottlecap +456 bow +457 bow tie, bow-tie, bowtie +458 brass, memorial tablet, plaque +459 brassiere, bra, bandeau +460 breakwater, groin, groyne, mole, bulwark, seawall, jetty +461 breastplate, aegis, egis +462 broom +463 bucket, pail +464 buckle +465 bulletproof vest +466 bullet train, bullet +467 butcher shop, meat market +468 cab, hack, taxi, taxicab +469 caldron, cauldron +470 candle, taper, wax light +471 cannon +472 canoe +473 can opener, tin opener +474 cardigan +475 car mirror +476 carousel, carrousel, merry-go-round, roundabout, whirligig +477 carpenter's kit, tool kit +478 carton +479 car wheel +480 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +481 cassette +482 cassette player +483 castle +484 catamaran +485 CD player +486 cello, violoncello +487 cellular telephone, cellular phone, cellphone, cell, mobile phone +488 chain +489 chainlink fence +490 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +491 chain saw, chainsaw +492 chest +493 chiffonier, commode +494 chime, bell, gong +495 china cabinet, china closet +496 Christmas stocking +497 church, church building +498 cinema, movie theater, movie theatre, movie house, picture palace +499 cleaver, meat cleaver, chopper +500 cliff dwelling +501 cloak +502 clog, geta, patten, sabot +503 cocktail shaker +504 coffee mug +505 coffeepot +506 coil, spiral, volute, whorl, helix +507 combination lock +508 computer keyboard, keypad +509 confectionery, confectionary, candy store +510 container ship, containership, container vessel +511 convertible +512 corkscrew, bottle screw +513 cornet, horn, trumpet, trump +514 cowboy boot +515 cowboy hat, ten-gallon hat +516 cradle +517 crane +518 crash helmet +519 crate +520 crib, cot +521 Crock Pot +522 croquet ball +523 crutch +524 cuirass +525 dam, dike, dyke +526 desk +527 desktop computer +528 dial telephone, dial phone +529 diaper, nappy, napkin +530 digital clock +531 digital watch +532 dining table, board +533 dishrag, dishcloth +534 dishwasher, dish washer, dishwashing machine +535 disk brake, disc brake +536 dock, dockage, docking facility +537 dogsled, dog sled, dog sleigh +538 dome +539 doormat, welcome mat +540 drilling platform, offshore rig +541 drum, membranophone, tympan +542 drumstick +543 dumbbell +544 Dutch oven +545 electric fan, blower +546 electric guitar +547 electric locomotive +548 entertainment center +549 envelope +550 espresso maker +551 face powder +552 feather boa, boa +553 file, file cabinet, filing cabinet +554 fireboat +555 fire engine, fire truck +556 fire screen, fireguard +557 flagpole, flagstaff +558 flute, transverse flute +559 folding chair +560 football helmet +561 forklift +562 fountain +563 fountain pen +564 four-poster +565 freight car +566 French horn, horn +567 frying pan, frypan, skillet +568 fur coat +569 garbage truck, dustcart +570 gasmask, respirator, gas helmet +571 gas pump, gasoline pump, petrol pump, island dispenser +572 goblet +573 go-kart +574 golf ball +575 golfcart, golf cart +576 gondola +577 gong, tam-tam +578 gown +579 grand piano, grand +580 greenhouse, nursery, glasshouse +581 grille, radiator grille +582 grocery store, grocery, food market, market +583 guillotine +584 hair 
slide +585 hair spray +586 half track +587 hammer +588 hamper +589 hand blower, blow dryer, blow drier, hair dryer, hair drier +590 hand-held computer, hand-held microcomputer +591 handkerchief, hankie, hanky, hankey +592 hard disc, hard disk, fixed disk +593 harmonica, mouth organ, harp, mouth harp +594 harp +595 harvester, reaper +596 hatchet +597 holster +598 home theater, home theatre +599 honeycomb +600 hook, claw +601 hoopskirt, crinoline +602 horizontal bar, high bar +603 horse cart, horse-cart +604 hourglass +605 iPod +606 iron, smoothing iron +607 jack-o'-lantern +608 jean, blue jean, denim +609 jeep, landrover +610 jersey, T-shirt, tee shirt +611 jigsaw puzzle +612 jinrikisha, ricksha, rickshaw +613 joystick +614 kimono +615 knee pad +616 knot +617 lab coat, laboratory coat +618 ladle +619 lampshade, lamp shade +620 laptop, laptop computer +621 lawn mower, mower +622 lens cap, lens cover +623 letter opener, paper knife, paperknife +624 library +625 lifeboat +626 lighter, light, igniter, ignitor +627 limousine, limo +628 liner, ocean liner +629 lipstick, lip rouge +630 Loafer +631 lotion +632 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +633 loupe, jeweler's loupe +634 lumbermill, sawmill +635 magnetic compass +636 mailbag, postbag +637 mailbox, letter box +638 maillot +639 maillot, tank suit +640 manhole cover +641 maraca +642 marimba, xylophone +643 mask +644 matchstick +645 maypole +646 maze, labyrinth +647 measuring cup +648 medicine chest, medicine cabinet +649 megalith, megalithic structure +650 microphone, mike +651 microwave, microwave oven +652 military uniform +653 milk can +654 minibus +655 miniskirt, mini +656 minivan +657 missile +658 mitten +659 mixing bowl +660 mobile home, manufactured home +661 Model T +662 modem +663 monastery +664 monitor +665 moped +666 mortar +667 mortarboard +668 mosque +669 mosquito net +670 motor scooter, scooter +671 mountain bike, all-terrain bike, off-roader +672 mountain tent +673 mouse, computer mouse +674 mousetrap +675 moving van +676 muzzle +677 nail +678 neck brace +679 necklace +680 nipple +681 notebook, notebook computer +682 obelisk +683 oboe, hautboy, hautbois +684 ocarina, sweet potato +685 odometer, hodometer, mileometer, milometer +686 oil filter +687 organ, pipe organ +688 oscilloscope, scope, cathode-ray oscilloscope, CRO +689 overskirt +690 oxcart +691 oxygen mask +692 packet +693 paddle, boat paddle +694 paddlewheel, paddle wheel +695 padlock +696 paintbrush +697 pajama, pyjama, pj's, jammies +698 palace +699 panpipe, pandean pipe, syrinx +700 paper towel +701 parachute, chute +702 parallel bars, bars +703 park bench +704 parking meter +705 passenger car, coach, carriage +706 patio, terrace +707 pay-phone, pay-station +708 pedestal, plinth, footstall +709 pencil box, pencil case +710 pencil sharpener +711 perfume, essence +712 Petri dish +713 photocopier +714 pick, plectrum, plectron +715 pickelhaube +716 picket fence, paling +717 pickup, pickup truck +718 pier +719 piggy bank, penny bank +720 pill bottle +721 pillow +722 ping-pong ball +723 pinwheel +724 pirate, pirate ship +725 pitcher, ewer +726 plane, carpenter's plane, woodworking plane +727 planetarium +728 plastic bag +729 plate rack +730 plow, plough +731 plunger, plumber's helper +732 Polaroid camera, Polaroid Land camera +733 pole +734 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +735 poncho +736 pool table, billiard table, snooker table +737 pop bottle, soda bottle +738 pot, flowerpot +739 potter's wheel 
+740 power drill +741 prayer rug, prayer mat +742 printer +743 prison, prison house +744 projectile, missile +745 projector +746 puck, hockey puck +747 punching bag, punch bag, punching ball, punchball +748 purse +749 quill, quill pen +750 quilt, comforter, comfort, puff +751 racer, race car, racing car +752 racket, racquet +753 radiator +754 radio, wireless +755 radio telescope, radio reflector +756 rain barrel +757 recreational vehicle, RV, R.V. +758 reel +759 reflex camera +760 refrigerator, icebox +761 remote control, remote +762 restaurant, eating house, eating place, eatery +763 revolver, six-gun, six-shooter +764 rifle +765 rocking chair, rocker +766 rotisserie +767 rubber eraser, rubber, pencil eraser +768 rugby ball +769 rule, ruler +770 running shoe +771 safe +772 safety pin +773 saltshaker, salt shaker +774 sandal +775 sarong +776 sax, saxophone +777 scabbard +778 scale, weighing machine +779 school bus +780 schooner +781 scoreboard +782 screen, CRT screen +783 screw +784 screwdriver +785 seat belt, seatbelt +786 sewing machine +787 shield, buckler +788 shoe shop, shoe-shop, shoe store +789 shoji +790 shopping basket +791 shopping cart +792 shovel +793 shower cap +794 shower curtain +795 ski +796 ski mask +797 sleeping bag +798 slide rule, slipstick +799 sliding door +800 slot, one-armed bandit +801 snorkel +802 snowmobile +803 snowplow, snowplough +804 soap dispenser +805 soccer ball +806 sock +807 solar dish, solar collector, solar furnace +808 sombrero +809 soup bowl +810 space bar +811 space heater +812 space shuttle +813 spatula +814 speedboat +815 spider web, spider's web +816 spindle +817 sports car, sport car +818 spotlight, spot +819 stage +820 steam locomotive +821 steel arch bridge +822 steel drum +823 stethoscope +824 stole +825 stone wall +826 stopwatch, stop watch +827 stove +828 strainer +829 streetcar, tram, tramcar, trolley, trolley car +830 stretcher +831 studio couch, day bed +832 stupa, tope +833 submarine, pigboat, sub, U-boat +834 suit, suit of clothes +835 sundial +836 sunglass +837 sunglasses, dark glasses, shades +838 sunscreen, sunblock, sun blocker +839 suspension bridge +840 swab, swob, mop +841 sweatshirt +842 swimming trunks, bathing trunks +843 swing +844 switch, electric switch, electrical switch +845 syringe +846 table lamp +847 tank, army tank, armored combat vehicle, armoured combat vehicle +848 tape player +849 teapot +850 teddy, teddy bear +851 television, television system +852 tennis ball +853 thatch, thatched roof +854 theater curtain, theatre curtain +855 thimble +856 thresher, thrasher, threshing machine +857 throne +858 tile roof +859 toaster +860 tobacco shop, tobacconist shop, tobacconist +861 toilet seat +862 torch +863 totem pole +864 tow truck, tow car, wrecker +865 toyshop +866 tractor +867 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +868 tray +869 trench coat +870 tricycle, trike, velocipede +871 trimaran +872 tripod +873 triumphal arch +874 trolleybus, trolley coach, trackless trolley +875 trombone +876 tub, vat +877 turnstile +878 typewriter keyboard +879 umbrella +880 unicycle, monocycle +881 upright, upright piano +882 vacuum, vacuum cleaner +883 vase +884 vault +885 velvet +886 vending machine +887 vestment +888 viaduct +889 violin, fiddle +890 volleyball +891 waffle iron +892 wall clock +893 wallet, billfold, notecase, pocketbook +894 wardrobe, closet, press +895 warplane, military plane +896 washbasin, handbasin, washbowl, lavabo, wash-hand basin +897 washer, automatic washer, washing machine 
+898 water bottle +899 water jug +900 water tower +901 whiskey jug +902 whistle +903 wig +904 window screen +905 window shade +906 Windsor tie +907 wine bottle +908 wing +909 wok +910 wooden spoon +911 wool, woolen, woollen +912 worm fence, snake fence, snake-rail fence, Virginia fence +913 wreck +914 yawl +915 yurt +916 web site, website, internet site, site +917 comic book +918 crossword puzzle, crossword +919 street sign +920 traffic light, traffic signal, stoplight +921 book jacket, dust cover, dust jacket, dust wrapper +922 menu +923 plate +924 guacamole +925 consomme +926 hot pot, hotpot +927 trifle +928 ice cream, icecream +929 ice lolly, lolly, lollipop, popsicle +930 French loaf +931 bagel, beigel +932 pretzel +933 cheeseburger +934 hotdog, hot dog, red hot +935 mashed potato +936 head cabbage +937 broccoli +938 cauliflower +939 zucchini, courgette +940 spaghetti squash +941 acorn squash +942 butternut squash +943 cucumber, cuke +944 artichoke, globe artichoke +945 bell pepper +946 cardoon +947 mushroom +948 Granny Smith +949 strawberry +950 orange +951 lemon +952 fig +953 pineapple, ananas +954 banana +955 jackfruit, jak, jack +956 custard apple +957 pomegranate +958 hay +959 carbonara +960 chocolate sauce, chocolate syrup +961 dough +962 meat loaf, meatloaf +963 pizza, pizza pie +964 potpie +965 burrito +966 red wine +967 espresso +968 cup +969 eggnog +970 alp +971 bubble +972 cliff, drop, drop-off +973 coral reef +974 geyser +975 lakeside, lakeshore +976 promontory, headland, head, foreland +977 sandbar, sand bar +978 seashore, coast, seacoast, sea-coast +979 valley, vale +980 volcano +981 ballplayer, baseball player +982 groom, bridegroom +983 scuba diver +984 rapeseed +985 daisy +986 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +987 corn +988 acorn +989 hip, rose hip, rosehip +990 buckeye, horse chestnut, conker +991 coral fungus +992 agaric +993 gyromitra +994 stinkhorn, carrion fungus +995 earthstar +996 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +997 bolete +998 ear, spike, capitulum +999 toilet tissue, toilet paper, bathroom tissue diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/logger.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/logger.py new file mode 100644 index 000000000..bc8de3640 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/logger.py @@ -0,0 +1,138 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import logging +import datetime +import paddle.distributed as dist + +_logger = None + + +def init_logger(name='ppcls', log_file=None, log_level=logging.INFO): + """Initialize and get a logger by name. + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. 
If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + Returns: + logging.Logger: The expected logger. + """ + global _logger + assert _logger is None, "logger should not be initialized twice or more." + _logger = logging.getLogger(name) + + formatter = logging.Formatter( + '[%(asctime)s] %(name)s %(levelname)s: %(message)s', + datefmt="%Y/%m/%d %H:%M:%S") + + stream_handler = logging.StreamHandler(stream=sys.stdout) + stream_handler.setFormatter(formatter) + _logger.addHandler(stream_handler) + if log_file is not None and dist.get_rank() == 0: + log_file_folder = os.path.split(log_file)[0] + os.makedirs(log_file_folder, exist_ok=True) + file_handler = logging.FileHandler(log_file, 'a') + file_handler.setFormatter(formatter) + _logger.addHandler(file_handler) + if dist.get_rank() == 0: + _logger.setLevel(log_level) + else: + _logger.setLevel(logging.ERROR) + _logger.propagate = False + + +def log_at_trainer0(log): + """ + logs will print multi-times when calling Fleet API. + Only display single log and ignore the others. + """ + + def wrapper(fmt, *args): + if dist.get_rank() == 0: + log(fmt, *args) + + return wrapper + + +@log_at_trainer0 +def info(fmt, *args): + _logger.info(fmt, *args) + + +@log_at_trainer0 +def debug(fmt, *args): + _logger.debug(fmt, *args) + + +@log_at_trainer0 +def warning(fmt, *args): + _logger.warning(fmt, *args) + + +@log_at_trainer0 +def error(fmt, *args): + _logger.error(fmt, *args) + + +def scaler(name, value, step, writer): + """ + This function will draw a scalar curve generated by the visualdl. + Usage: Install visualdl: pip3 install visualdl==2.0.0b4 + and then: + visualdl --logdir ./scalar --host 0.0.0.0 --port 8830 + to preview loss corve in real time. + """ + if writer is None: + return + writer.add_scalar(tag=name, step=step, value=value) + + +def advertise(): + """ + Show the advertising message like the following: + + =========================================================== + == PaddleClas is powered by PaddlePaddle ! == + =========================================================== + == == + == For more info please go to the following website. == + == == + == https://github.com/PaddlePaddle/PaddleClas == + =========================================================== + + """ + copyright = "PaddleClas is powered by PaddlePaddle !" + ad = "For more info please go to the following website." + website = "https://github.com/PaddlePaddle/PaddleClas" + AD_LEN = 6 + len(max([copyright, ad, website], key=len)) + + info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format( + "=" * (AD_LEN + 4), + "=={}==".format(copyright.center(AD_LEN)), + "=" * (AD_LEN + 4), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(ad.center(AD_LEN)), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(website.center(AD_LEN)), + "=" * (AD_LEN + 4), )) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/metrics.py new file mode 100644 index 000000000..b0db68a75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/metrics.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import precision_recall_fscore_support +from sklearn.metrics import average_precision_score +from sklearn.preprocessing import binarize + +import numpy as np + +__all__ = ["multi_hot_encode", "hamming_distance", "accuracy_score", "precision_recall_fscore", "mean_average_precision"] + + +def multi_hot_encode(logits, threshold=0.5): + """ + Encode logits to multi-hot by elementwise for multilabel + """ + + return binarize(logits, threshold=threshold) + + +def hamming_distance(output, target): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + return hamming_loss(target, output) + + +def accuracy_score(output, target, base="sample"): + """ + Hard metric for multilabel classification + Args: + output: + target: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. + Returns: + accuracy: + """ + + assert base in ["sample", "label"], 'must be one of ["sample", "label"]' + + if base == "sample": + accuracy = accuracy_metric(target, output) + elif base == "label": + mcm = multilabel_confusion_matrix(target, output) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + + accuracy = (sum(tps) + sum(tns)) / (sum(tps) + sum(tns) + sum(fns) + sum(fps)) + + return accuracy + + +def precision_recall_fscore(output, target): + """ + Metric based label for multilabel classification + Returns: + precisions: + recalls: + fscores: + """ + + precisions, recalls, fscores, _ = precision_recall_fscore_support(target, output) + + return precisions, recalls, fscores + + +def mean_average_precision(logits, target): + """ + Calculate average precision + Args: + logits: probability from network before sigmoid or softmax + target: ground truth, 0 or 1 + """ + if not (isinstance(logits, np.ndarray) and isinstance(target, np.ndarray)): + raise TypeError("logits and target should be np.ndarray.") + + aps = [] + for i in range(target.shape[1]): + ap = average_precision_score(target[:, i], logits[:, i]) + aps.append(ap) + + return np.mean(aps) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/misc.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/misc.py new file mode 100644 index 000000000..08ab7b6f7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/misc.py @@ -0,0 +1,63 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['AverageMeter'] + + +class AverageMeter(object): + """ + Computes and stores the average and current value + Code was based on https://github.com/pytorch/examples/blob/master/imagenet/main.py + """ + + def __init__(self, name='', fmt='f', postfix="", need_avg=True): + self.name = name + self.fmt = fmt + self.postfix = postfix + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}{self.postfix}'.format( + self=self) + + @property + def total_minute(self): + return '{self.name} {s:{self.fmt}}{self.postfix} min'.format( + s=self.sum / 60, self=self) + + @property + def mean(self): + return '{self.name}: {self.avg:{self.fmt}}{self.postfix}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}{self.postfix}'.format( + self=self) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/model_zoo.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/model_zoo.py new file mode 100644 index 000000000..fc527f6a1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/model_zoo.py @@ -0,0 +1,213 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import requests +import shutil +import tarfile +import tqdm +import zipfile + +from ppcls.arch import similar_architectures +from ppcls.utils import logger + +__all__ = ['get'] + +DOWNLOAD_RETRY_LIMIT = 3 + + +class UrlError(Exception): + """ UrlError + """ + + def __init__(self, url='', code=''): + message = "Downloading from {} failed with code {}!".format(url, code) + super(UrlError, self).__init__(message) + + +class ModelNameError(Exception): + """ ModelNameError + """ + + def __init__(self, message=''): + super(ModelNameError, self).__init__(message) + + +class RetryError(Exception): + """ RetryError + """ + + def __init__(self, url='', times=''): + message = "Download from {} failed. Retry({}) limit reached".format( + url, times) + super(RetryError, self).__init__(message) + + +def _get_url(architecture, postfix="pdparams"): + prefix = "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/" + fname = architecture + "_pretrained." 
+ postfix + return prefix + fname + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not os.path.exists(dst): + shutil.move(src, dst) + elif os.path.isfile(src): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = os.path.join(src, fp) + dst_fp = os.path.join(dst, fp) + if os.path.isdir(src_fp): + if os.path.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif os.path.isfile(src_fp) and \ + not os.path.isfile(dst_fp): + shutil.move(src_fp, dst_fp) + + +def _download(url, path): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + """ + if not os.path.exists(path): + os.makedirs(path) + + fname = os.path.split(url)[-1] + fullname = os.path.join(path, fname) + retry_cnt = 0 + + while not os.path.exists(fullname): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RetryError(url, DOWNLOAD_RETRY_LIMIT) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise UrlError(url, req.status_code) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + fpath = os.path.split(fname)[0] + fpath_tmp = os.path.join(fpath, 'tmp') + if os.path.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + fs = os.listdir(fpath_tmp) + assert len( + fs + ) == 1, "There should just be 1 pretrained path in an archive file but got {}.".format( + len(fs)) + + f = fs[0] + src_dir = os.path.join(fpath_tmp, f) + dst_dir = os.path.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + + return f + + +def _get_pretrained(): + with open('./ppcls/utils/pretrained.list') as flist: + pretrained = [line.strip() for line in flist] + return pretrained + + +def _check_pretrained_name(architecture): + assert isinstance(architecture, str), \ + ("the type of architecture({}) should be str". format(architecture)) + pretrained = _get_pretrained() + similar_names = similar_architectures(architecture, pretrained) + model_list = ', '.join(similar_names) + err = "{} is not exist! 
Maybe you want: [{}]" \ + "".format(architecture, model_list) + if architecture not in similar_names: + raise ModelNameError(err) + + +def list_models(): + pretrained = _get_pretrained() + msg = "All avialable pretrained models are as follows: {}".format( + pretrained) + logger.info(msg) + return + + +def get(architecture, path, decompress=False, postfix="pdparams"): + """ + Get the pretrained model. + """ + _check_pretrained_name(architecture) + url = _get_url(architecture, postfix=postfix) + fname = _download(url, path) + if postfix == "tar" and decompress: + _decompress(fname) + logger.info("download {} finished ".format(fname)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/pretrained.list b/cv/classification/resnet50/paddlepaddle/ppcls/utils/pretrained.list new file mode 100644 index 000000000..36d70f5a2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/pretrained.list @@ -0,0 +1,121 @@ +ResNet18 +ResNet34 +ResNet50 +ResNet101 +ResNet152 +ResNet50_vc +ResNet18_vd +ResNet34_vd +ResNet50_vd +ResNet50_vd_v2 +ResNet101_vd +ResNet152_vd +ResNet200_vd +ResNet50_vd_ssld +ResNet50_vd_ssld_v2 +Fix_ResNet50_vd_ssld_v2 +ResNet101_vd_ssld +MobileNetV3_large_x0_35 +MobileNetV3_large_x0_5 +MobileNetV3_large_x0_75 +MobileNetV3_large_x1_0 +MobileNetV3_large_x1_25 +MobileNetV3_small_x0_35 +MobileNetV3_small_x0_5 +MobileNetV3_small_x0_75 +MobileNetV3_small_x1_0 +MobileNetV3_small_x1_25 +MobileNetV3_large_x1_0_ssld +MobileNetV3_large_x1_0_ssld_int8 +MobileNetV3_small_x1_0_ssld +MobileNetV2_x0_25 +MobileNetV2_x0_5 +MobileNetV2_x0_75 +MobileNetV2 +MobileNetV2_x1_5 +MobileNetV2_x2_0 +MobileNetV2_ssld +MobileNetV1_x0_25 +MobileNetV1_x0_5 +MobileNetV1_x0_75 +MobileNetV1 +MobileNetV1_ssld +ShuffleNetV2_x0_25 +ShuffleNetV2_x0_33 +ShuffleNetV2_x0_5 +ShuffleNetV2 +ShuffleNetV2_x1_5 +ShuffleNetV2_x2_0 +ShuffleNetV2_swish +ResNeXt50_32x4d +ResNeXt50_64x4d +ResNeXt101_32x4d +ResNeXt101_64x4d +ResNeXt152_32x4d +ResNeXt152_64x4d +ResNeXt50_vd_32x4d +ResNeXt50_vd_64x4d +ResNeXt101_vd_32x4d +ResNeXt101_vd_64x4d +ResNeXt152_vd_32x4d +ResNeXt152_vd_64x4d +SE_ResNet18_vd +SE_ResNet34_vd +SE_ResNet50_vd +SE_ResNeXt50_32x4d +SE_ResNeXt101_32x4d +SE_ResNeXt50_vd_32x4d +SENet154_vd +Res2Net50_26w_4s +Res2Net50_vd_26w_4s +Res2Net50_14w_8s +Res2Net101_vd_26w_4s +Res2Net200_vd_26w_4s +GoogLeNet +InceptionV4 +Xception41 +Xception41_deeplab +Xception65 +Xception65_deeplab +Xception71 +HRNet_W18_C +HRNet_W30_C +HRNet_W32_C +HRNet_W40_C +HRNet_W44_C +HRNet_W48_C +HRNet_W64_C +DPN68 +DPN92 +DPN98 +DPN107 +DPN131 +DenseNet121 +DenseNet161 +DenseNet169 +DenseNet201 +DenseNet264 +EfficientNetB0_small +EfficientNetB0 +EfficientNetB1 +EfficientNetB2 +EfficientNetB3 +EfficientNetB4 +EfficientNetB5 +EfficientNetB6 +EfficientNetB7 +ResNeXt101_32x8d_wsl +ResNeXt101_32x16d_wsl +ResNeXt101_32x32d_wsl +ResNeXt101_32x48d_wsl +Fix_ResNeXt101_32x48d_wsl +AlexNet +SqueezeNet1_0 +SqueezeNet1_1 +VGG11 +VGG13 +VGG16 +VGG19 +DarkNet53_ImageNet1k +ResNet50_ACNet_deploy +CSPResNet50_leaky diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/profiler.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/profiler.py new file mode 100644 index 000000000..7cf945a26 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/profiler.py @@ -0,0 +1,111 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports the following key-value pairs: + batch_range - an integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave'. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses an independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + options_str - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler( + _profiler_options['state'], _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/utils/save_load.py b/cv/classification/resnet50/paddlepaddle/ppcls/utils/save_load.py new file mode 100644 index 000000000..625a28483 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/utils/save_load.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import re +import shutil +import tempfile + +import paddle +from ppcls.utils import logger +from .download import get_weights_path_from_url + +__all__ = ['init_model', 'save_model', 'load_dygraph_pretrain'] + + +def _mkdir_if_not_exist(path): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def load_dygraph_pretrain(model, path=None): + if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) + param_state_dict = paddle.load(path + ".pdparams") + model.set_dict(param_state_dict) + return + + +def load_dygraph_pretrain_from_url(model, pretrained_url, use_ssld=False): + if use_ssld: + pretrained_url = pretrained_url.replace("_pretrained", + "_ssld_pretrained") + local_weight_path = get_weights_path_from_url(pretrained_url).replace( + ".pdparams", "") + load_dygraph_pretrain(model, path=local_weight_path) + return + + +def load_distillation_model(model, pretrained_model): + logger.info("In distillation mode, teacher model will be " + "loaded firstly before student model.") + + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + + teacher = model.teacher if hasattr(model, + "teacher") else model._layers.teacher + student = model.student if hasattr(model, + "student") else model._layers.student + load_dygraph_pretrain(teacher, path=pretrained_model[0]) + logger.info("Finish initing teacher model from {}".format( + pretrained_model)) + # load student model + if len(pretrained_model) >= 2: + 
load_dygraph_pretrain(student, path=pretrained_model[1]) + logger.info("Finish initing student model from {}".format( + pretrained_model)) + + +def init_model(config, net, optimizer=None): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints and optimizer is not None: + assert os.path.exists(checkpoints + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(checkpoints) + assert os.path.exists(checkpoints + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(checkpoints) + para_dict = paddle.load(checkpoints + ".pdparams") + opti_dict = paddle.load(checkpoints + ".pdopt") + metric_dict = paddle.load(checkpoints + ".pdstates") + net.set_dict(para_dict) + optimizer.set_state_dict(opti_dict) + logger.info("Finish load checkpoints from {}".format(checkpoints)) + return metric_dict + + pretrained_model = config.get('pretrained_model') + use_distillation = config.get('use_distillation', False) + if pretrained_model: + if use_distillation: + load_distillation_model(net, pretrained_model) + else: # common load + load_dygraph_pretrain(net, path=pretrained_model) + logger.info( + logger.coloring("Finish load pretrained model from {}".format( + pretrained_model), "HEADER")) + + +def save_model(net, + optimizer, + metric_info, + model_path, + model_name="", + prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, model_name) + _mkdir_if_not_exist(model_path) + model_path = os.path.join(model_path, prefix) + + paddle.save(net.state_dict(), model_path + ".pdparams") + paddle.save(optimizer.state_dict(), model_path + ".pdopt") + paddle.save(metric_info, model_path + ".pdstates") + logger.info("Already save model in {}".format(model_path)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/__init__.py new file mode 100644 index 000000000..d6cdb6f8f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer + +from .arch import * +from .optimizer import * +from .data import * +from .utils import * diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/__init__.py new file mode 100644 index 000000000..2d5e29db8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/__init__.py @@ -0,0 +1,134 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import copy +import importlib + +import paddle.nn as nn +from paddle.jit import to_static +from paddle.static import InputSpec + +from . import backbone, gears +from .backbone import * +from .gears import build_gear +from .utils import * +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils import logger +from ppcls.utils.save_load import load_dygraph_pretrain +from ppcls.arch.slim import prune_model, quantize_model + +__all__ = ["build_model", "RecModel", "DistillationModel"] + + +def build_model(config): + arch_config = copy.deepcopy(config["Arch"]) + model_type = arch_config.pop("name") + mod = importlib.import_module(__name__) + arch = getattr(mod, model_type)(**arch_config) + if isinstance(arch, TheseusLayer): + prune_model(config, arch) + quantize_model(config, arch) + return arch + + +def apply_to_static(config, model): + support_to_static = config['Global'].get('to_static', False) + + if support_to_static: + specs = None + if 'image_shape' in config['Global']: + specs = [InputSpec([None] + config['Global']['image_shape'])] + model = to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format( + specs)) + return model + + +class RecModel(TheseusLayer): + def __init__(self, **config): + super().__init__() + backbone_config = config["Backbone"] + backbone_name = backbone_config.pop("name") + self.backbone = eval(backbone_name)(**backbone_config) + if "BackboneStopLayer" in config: + backbone_stop_layer = config["BackboneStopLayer"]["name"] + self.backbone.stop_after(backbone_stop_layer) + + if "Neck" in config: + self.neck = build_gear(config["Neck"]) + else: + self.neck = None + + if "Head" in config: + self.head = build_gear(config["Head"]) + else: + self.head = None + + def forward(self, x, label=None): + out = dict() + x = self.backbone(x) + out["backbone"] = x + if self.neck is not None: + x = self.neck(x) + out["neck"] = x + out["features"] = x + if self.head is not None: + y = self.head(x, label) + out["logits"] = y + return out + + +class DistillationModel(nn.Layer): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__() + assert isinstance(models, list) + self.model_list = [] + self.model_name_list = [] + if pretrained_list is not None: + assert len(pretrained_list) == len(models) + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] + model_config = model_config[key] + model_name = model_config.pop("name") + model = eval(model_name)(**model_config) + + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + if pretrained_list is not None: + for idx, pretrained in enumerate(pretrained_list): + if pretrained is not None: + load_dygraph_pretrain( + self.model_name_list[idx], path=pretrained) + + def forward(self, x, 
label=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + if label is None: + result_dict[model_name] = self.model_list[idx](x) + else: + result_dict[model_name] = self.model_list[idx](x, label) + return result_dict diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/__init__.py new file mode 100644 index 000000000..74d266414 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/__init__.py @@ -0,0 +1,34 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import inspect + +from ppcls.arch.backbone.legendary_models.resnet import ResNet50, ResNet50_vd +from ppcls.arch.backbone.model_zoo.resnet_vc import ResNet50_vc + +# help whl get all the models' api (class type) and components' api (func type) +def get_apis(): + current_func = sys._getframe().f_code.co_name + current_module = sys.modules[__name__] + api = [] + for _, obj in inspect.getmembers(current_module, + inspect.isclass) + inspect.getmembers( + current_module, inspect.isfunction): + api.append(obj.__name__) + api.remove(current_func) + return api + + +__all__ = get_apis() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/theseus_layer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/theseus_layer.py new file mode 100644 index 000000000..908d94445 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/base/theseus_layer.py @@ -0,0 +1,301 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
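# A hedged illustration, not taken from the patch itself: the minimal "Arch" config that
# build_model()/RecModel above are written around. The import path, the backbone name and
# the commented-out keys are assumptions used only for illustration.
from ppcls.arch import build_model  # assumed import path once this package is installed

config = {
    "Arch": {
        "name": "RecModel",                # popped by build_model() to select the class
        "Backbone": {"name": "ResNet50"},  # resolved via eval() inside RecModel.__init__
        # "BackboneStopLayer": {"name": "blocks[12]"},  # optional: triggers backbone.stop_after()
        # "Neck": {...}, "Head": {...},                 # optional gears built by build_gear()
    }
}
model = build_model(config)  # prune_model()/quantize_model() are assumed to no-op without a "Slim" section
# out = model(images)        # forward returns a dict with "backbone" and "features"
#                            # (plus "logits" when a Head is configured)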
+ +from typing import Tuple, List, Dict, Union, Callable, Any + +from paddle import nn +from ppcls.utils import logger + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class TheseusLayer(nn.Layer): + def __init__(self, *args, **kwargs): + super(TheseusLayer, self).__init__() + self.res_dict = {} + self.res_name = self.full_name() + self.pruner = None + self.quanter = None + + def _return_dict_hook(self, layer, input, output): + res_dict = {"output": output} + # 'list' is needed to avoid error raised by popping self.res_dict + for res_key in list(self.res_dict): + # clear the res_dict because the forward process may change according to input + res_dict[res_key] = self.res_dict.pop(res_key) + return res_dict + + def init_res(self, + stages_pattern, + return_patterns=None, + return_stages=None): + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + self.update_res(return_patterns) + + def replace_sub(self, *args, **kwargs) -> None: + msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) + + def upgrade_sublayer(self, + layer_name_pattern: Union[str, List[str]], + handle_func: Callable[[nn.Layer, str], nn.Layer] + ) -> Dict[str, nn.Layer]: + """use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'. + + Args: + layer_name_pattern (Union[str, List[str]]): The name of layer to be modified by 'handle_func'. + handle_func (Callable[[nn.Layer, str], nn.Layer]): The function to modify target layer specified by 'layer_name_pattern'. The formal params are the layer(nn.Layer) and pattern(str) that is (a member of) layer_name_pattern (when layer_name_pattern is List type). And the return is the layer processed. + + Returns: + Dict[str, nn.Layer]: The key is the pattern and corresponding value is the result returned by 'handle_func()'. 
+ + Examples: + + from paddle import nn + import paddleclas + + def rep_func(layer: nn.Layer, pattern: str): + new_layer = nn.Conv2D( + in_channels=layer._in_channels, + out_channels=layer._out_channels, + kernel_size=5, + padding=2 + ) + return new_layer + + net = paddleclas.MobileNetV1() + res = net.upgrade_sublayer(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func) + print(res) + # {'blocks[11].depthwise_conv.conv': the corresponding new_layer, 'blocks[12].depthwise_conv.conv': the corresponding new_layer} + """ + + if not isinstance(layer_name_pattern, list): + layer_name_pattern = [layer_name_pattern] + + hit_layer_pattern_list = [] + for pattern in layer_name_pattern: + # parse pattern to find target layer and its parent + layer_list = parse_pattern_str(pattern=pattern, parent_layer=self) + if not layer_list: + continue + sub_layer_parent = layer_list[-2]["layer"] if len( + layer_list) > 1 else self + + sub_layer = layer_list[-1]["layer"] + sub_layer_name = layer_list[-1]["name"] + sub_layer_index = layer_list[-1]["index"] + + new_sub_layer = handle_func(sub_layer, pattern) + + if sub_layer_index: + getattr(sub_layer_parent, + sub_layer_name)[sub_layer_index] = new_sub_layer + else: + setattr(sub_layer_parent, sub_layer_name, new_sub_layer) + + hit_layer_pattern_list.append(pattern) + return hit_layer_pattern_list + + def stop_after(self, stop_layer_name: str) -> bool: + """stop forward and backward after 'stop_layer_name'. + + Args: + stop_layer_name (str): The name of the layer after which forward and backward stop. + + Returns: + bool: 'True' if successful, 'False' otherwise. + """ + + layer_list = parse_pattern_str(stop_layer_name, self) + if not layer_list: + return False + + parent_layer = self + for layer_dict in layer_list: + name, index = layer_dict["name"], layer_dict["index"] + if not set_identity(parent_layer, name, index): + msg = f"Failed to set the layers after stop_layer_name('{stop_layer_name}') to IdentityLayer. The error layer's name is '{name}'." + logger.warning(msg) + return False + parent_layer = layer_dict["layer"] + + return True + + def update_res( + self, + return_patterns: Union[str, List[str]]) -> Dict[str, nn.Layer]: + """update the result(s) to be returned. + + Args: + return_patterns (Union[str, List[str]]): The name(s) of the layer(s) whose output should be returned. + + Returns: + Dict[str, nn.Layer]: The pattern(str) and corresponding layer(nn.Layer) that have been set successfully. 
+ """ + + # clear res_dict that could have been set + self.res_dict = {} + + class Handler(object): + def __init__(self, res_dict): + # res_dict is a reference + self.res_dict = res_dict + + def __call__(self, layer, pattern): + layer.res_dict = self.res_dict + layer.res_name = pattern + if hasattr(layer, "hook_remove_helper"): + layer.hook_remove_helper.remove() + layer.hook_remove_helper = layer.register_forward_post_hook( + save_sub_res_hook) + return layer + + handle_func = Handler(self.res_dict) + + hit_layer_pattern_list = self.upgrade_sublayer( + return_patterns, handle_func=handle_func) + + if hasattr(self, "hook_remove_helper"): + self.hook_remove_helper.remove() + self.hook_remove_helper = self.register_forward_post_hook( + self._return_dict_hook) + + return hit_layer_pattern_list + + +def save_sub_res_hook(layer, input, output): + layer.res_dict[layer.res_name] = output + + +def set_identity(parent_layer: nn.Layer, + layer_name: str, + layer_index: str=None) -> bool: + """set the layer specified by layer_name and layer_index to Indentity. + + Args: + parent_layer (nn.Layer): The parent layer of target layer specified by layer_name and layer_index. + layer_name (str): The name of target layer to be set to Indentity. + layer_index (str, optional): The index of target layer to be set to Indentity in parent_layer. Defaults to None. + + Returns: + bool: True if successfully, False otherwise. + """ + + stop_after = False + for sub_layer_name in parent_layer._sub_layers: + if stop_after: + parent_layer._sub_layers[sub_layer_name] = Identity() + continue + if sub_layer_name == layer_name: + stop_after = True + + if layer_index and stop_after: + stop_after = False + for sub_layer_index in parent_layer._sub_layers[ + layer_name]._sub_layers: + if stop_after: + parent_layer._sub_layers[layer_name][ + sub_layer_index] = Identity() + continue + if layer_index == sub_layer_index: + stop_after = True + + return stop_after + + +def parse_pattern_str(pattern: str, parent_layer: nn.Layer) -> Union[ + None, List[Dict[str, Union[nn.Layer, str, None]]]]: + """parse the string type pattern. + + Args: + pattern (str): The pattern to discribe layer. + parent_layer (nn.Layer): The root layer relative to the pattern. + + Returns: + Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed. If successfully, the members are layers parsed in order: + [ + {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist}, + {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist}, + ... + ] + """ + + pattern_list = pattern.split(".") + if not pattern_list: + msg = f"The pattern('{pattern}') is illegal. Please check and retry." + logger.warning(msg) + return None + + layer_list = [] + while len(pattern_list) > 0: + if '[' in pattern_list[0]: + target_layer_name = pattern_list[0].split('[')[0] + target_layer_index = pattern_list[0].split('[')[1].split(']')[0] + else: + target_layer_name = pattern_list[0] + target_layer_index = None + + target_layer = getattr(parent_layer, target_layer_name, None) + + if target_layer is None: + msg = f"Not found layer named('{target_layer_name}') specifed in pattern('{pattern}')." + logger.warning(msg) + return None + + if target_layer_index and target_layer: + if int(target_layer_index) < 0 or int(target_layer_index) >= len( + target_layer): + msg = f"Not found layer by index('{target_layer_index}') specifed in pattern('{pattern}'). 
The index should < {len(target_layer)} and > 0." + logger.warning(msg) + return None + + target_layer = target_layer[target_layer_index] + + layer_list.append({ + "layer": target_layer, + "name": target_layer_name, + "index": target_layer_index + }) + + pattern_list = pattern_list[1:] + parent_layer = target_layer + return layer_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/__init__.py new file mode 100644 index 000000000..550e5544c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/__init__.py @@ -0,0 +1,8 @@ +from .resnet import ResNet50 + +# from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd +# from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C +# from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1 +# from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25 +# from .inception_v3 import InceptionV3 +# from .vgg import VGG11, VGG13, VGG16, VGG19 \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/resnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/resnet.py new file mode 100644 index 000000000..74c5c5fa6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/legendary_models/resnet.py @@ -0,0 +1,591 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
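# A hedged illustration, not taken from the patch itself: how the TheseusLayer pattern
# helpers above interact with the ResNet definition that follows. The import path and the
# exact pattern strings are assumptions; the stage patterns mirror MODEL_STAGES_PATTERN below.
import paddle
from ppcls.arch.backbone.legendary_models.resnet import (  # assumed import path
    ResNet, NET_CONFIG, MODEL_STAGES_PATTERN)

net = ResNet(
    config=NET_CONFIG["50"],
    stages_pattern=MODEL_STAGES_PATTERN["ResNet50"],
    version="vb",
    return_patterns=["blocks[2]", "blocks[15]"])  # init_res() -> update_res() hooks
out = net(paddle.rand([1, 3, 224, 224]))
# With return_patterns set, the registered forward hook returns a dict: the final output
# under "output" plus the features captured at "blocks[2]" and "blocks[15]".
# Calling net.stop_after("blocks[12]") instead replaces every layer after block 12
# (including avg_pool and fc) with Identity, truncating the backbone at that stage.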
+ +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "ResNet18": ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"], + "ResNet34": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet50": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet101": ["blocks[2]", "blocks[6]", "blocks[29]", "blocks[32]"], + "ResNet152": ["blocks[2]", "blocks[10]", "blocks[46]", "blocks[49]"], + "ResNet200": ["blocks[2]", "blocks[14]", "blocks[62]", "blocks[65]"] +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None, + return_stages=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) 
+ x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. 
+ """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["200"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet200"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/resnet_vc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/resnet_vc.py new file mode 100644 index 000000000..6b972dc7b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/model_zoo/resnet_vc.py @@ -0,0 +1,309 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet50_vc": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vc_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + 
stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_vc(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(ResNet_vc, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), 
+ BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet50_vc(pretrained=False, use_ssld=False, **kwargs): + model = ResNet_vc(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNet50_vc"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/__init__.py new file mode 100644 index 000000000..ae9549246 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/__init__.py @@ -0,0 +1,3 @@ +from .resnet_variant import ResNet50_last_stage_stride1 +# from .vgg_variant import VGG19Sigmoid +# from .pp_lcnet_variant import PPLCNet_x2_5_Tanh \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/resnet_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/resnet_variant.py new file mode 100644 index 000000000..0219344b1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/backbone/variant_models/resnet_variant.py @@ -0,0 +1,23 @@ +from paddle.nn import Conv2D +from ppcls.arch.backbone.legendary_models.resnet import ResNet50, MODEL_URLS, _load_pretrained + +__all__ = ["ResNet50_last_stage_stride1"] + + +def ResNet50_last_stage_stride1(pretrained=False, use_ssld=False, **kwargs): + def replace_function(conv, pattern): + new_conv = Conv2D( + in_channels=conv._in_channels, + out_channels=conv._out_channels, + kernel_size=conv._kernel_size, + stride=1, + padding=conv._padding, + groups=conv._groups, + bias_attr=conv._bias_attr) + return new_conv + + pattern = ["blocks[13].conv1.conv", "blocks[13].short.conv"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/__init__.py new file mode 100644 index 000000000..75ca41d8a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/__init__.py @@ -0,0 +1,32 @@ +# Copyright 
(c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .arcmargin import ArcMargin +from .cosmargin import CosMargin +from .circlemargin import CircleMargin +from .fc import FC +from .vehicle_neck import VehicleNeck + +__all__ = ['build_gear'] + + +def build_gear(config): + support_dict = [ + 'ArcMargin', 'CosMargin', 'CircleMargin', 'FC', 'VehicleNeck' + ] + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'head only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/arcmargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/arcmargin.py new file mode 100644 index 000000000..22cc76e1d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/arcmargin.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
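The `build_gear` factory in `gears/__init__.py` above simply pops `name` from the config and instantiates the matching head with the remaining keys as keyword arguments. A small self-contained sketch of that dispatch follows; the dict registry stands in for the `eval` lookup used in the file, and the stub classes stand in for the real paddle heads defined later in this patch (both substitutions are assumptions made only for illustration).

```python
class ArcMargin:
    """Stub standing in for the ArcMargin head defined in arcmargin.py."""
    def __init__(self, embedding_size, class_num, margin=0.5, scale=80.0):
        self.embedding_size, self.class_num = embedding_size, class_num
        self.margin, self.scale = margin, scale


class FC:
    """Stub standing in for the FC head defined in fc.py."""
    def __init__(self, embedding_size, class_num):
        self.embedding_size, self.class_num = embedding_size, class_num


SUPPORT = {"ArcMargin": ArcMargin, "FC": FC}


def build_gear(config):
    cfg = dict(config)                 # avoid mutating the caller's config
    name = cfg.pop("name")
    assert name in SUPPORT, "head only support {}".format(list(SUPPORT))
    return SUPPORT[name](**cfg)        # same effect as eval(module_name)(**config)


head = build_gear({"name": "ArcMargin", "embedding_size": 512,
                   "class_num": 1000, "margin": 0.5, "scale": 80.0})
print(type(head).__name__, head.margin, head.scale)   # ArcMargin 0.5 80.0
```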
+ +import paddle +import paddle.nn as nn +import math + + +class ArcMargin(nn.Layer): + def __init__(self, + embedding_size, + class_num, + margin=0.5, + scale=80.0, + easy_margin=False): + super().__init__() + self.embedding_size = embedding_size + self.class_num = class_num + self.margin = margin + self.scale = scale + self.easy_margin = easy_margin + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label=None): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6) + cos_m = math.cos(self.margin) + sin_m = math.sin(self.margin) + phi = cos * cos_m - sin * sin_m + + th = math.cos(self.margin) * (-1) + mm = math.sin(self.margin) * self.margin + if self.easy_margin: + phi = self._paddle_where_more_than(cos, 0, phi, cos) + else: + phi = self._paddle_where_more_than(cos, th, phi, cos - mm) + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, phi) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output + + def _paddle_where_more_than(self, target, limit, x, y): + mask = paddle.cast(x=(target > limit), dtype='float32') + output = paddle.multiply(mask, x) + paddle.multiply((1.0 - mask), y) + return output diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/circlemargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/circlemargin.py new file mode 100644 index 000000000..d1bce83cb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/circlemargin.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
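The `ArcMargin` head above applies an additive angular margin: after L2 normalisation the logits are cosines, and for the labelled class `cos(theta)` is replaced by `cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)` before everything is scaled by `s`. A quick numpy check of that identity, using the defaults from the class (`margin=0.5`, `scale=80.0`); this is an illustration only, not part of the patch.

```python
import numpy as np

m, s = 0.5, 80.0                            # margin and scale defaults of ArcMargin
theta = np.arccos(0.7)                      # angle whose cosine is the raw logit
cos, sin = np.cos(theta), np.sin(theta)

phi = cos * np.cos(m) - sin * np.sin(m)     # what the layer computes for the target class
print(np.isclose(phi, np.cos(theta + m)))   # True: identical to cos(theta + m)
print(s * phi, s * cos)                     # ~21.75 vs 56.0: the margin lowers the target logit
```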
+ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CircleMargin(nn.Layer): + def __init__(self, embedding_size, class_num, margin, scale): + super(CircleMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + feat_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, feat_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + logits = paddle.matmul(input, weight) + if not self.training or label is None: + return logits + + alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.) + alpha_n = paddle.clip(logits.detach() + self.margin, min=0.) + delta_p = 1 - self.margin + delta_n = self.margin + + m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1]) + + logits_p = alpha_p * (logits - delta_p) + logits_n = alpha_n * (logits - delta_n) + pre_logits = logits_p * m_hot + logits_n * (1 - m_hot) + pre_logits = self.scale * pre_logits + + return pre_logits diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/cosmargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/cosmargin.py new file mode 100644 index 000000000..578b64c2b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/cosmargin.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
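`CircleMargin` above follows the Circle-loss style re-weighting: the target similarity and the remaining similarities receive separate adaptive weights (`alpha_p`, `alpha_n`) and offsets (`delta_p`, `delta_n`) before scaling. A numpy-only illustration for a single sample follows; the margin and scale values are picked for the example and are not taken from any config in this patch.

```python
import numpy as np

margin, scale = 0.25, 64.0                  # example values for this sketch only
logits = np.array([0.35, 0.80, 0.10])       # cosine similarities to 3 classes
label = 1                                   # class 1 is the target

alpha_p = np.maximum(-logits + 1 + margin, 0.0)   # mirrors paddle.clip(..., min=0.)
alpha_n = np.maximum(logits + margin, 0.0)
delta_p, delta_n = 1 - margin, margin

m_hot = np.eye(len(logits))[label]          # one-hot vector for the target class
logits_p = alpha_p * (logits - delta_p)     # used only at the target position
logits_n = alpha_n * (logits - delta_n)     # used at all other positions
pre_logits = scale * (logits_p * m_hot + logits_n * (1 - m_hot))
print(pre_logits)                           # [ 3.84  1.44 -3.36]
```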
+ +import paddle +import math +import paddle.nn as nn + + +class CosMargin(paddle.nn.Layer): + def __init__(self, embedding_size, class_num, margin=0.35, scale=64.0): + super(CosMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + label.stop_gradient = True + + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + + cos_m = cos - self.margin + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, cos_m) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/fc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/fc.py new file mode 100644 index 000000000..b32474195 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/fc.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
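`CosMargin` above is the large-margin-cosine rule: the margin is subtracted from the cosine logit of the labelled class only, and then every logit is scaled. A short numpy check using the class defaults (`margin=0.35`, `scale=64.0`); illustration only, not part of the patch.

```python
import numpy as np

m, s = 0.35, 64.0                  # CosMargin defaults above
cos = np.array([0.30, 0.75, 0.10]) # cosine logits for 3 classes
one_hot = np.eye(3)[1]             # label = class 1

output = s * (one_hot * (cos - m) + (1.0 - one_hot) * cos)
print(output)                      # [19.2 25.6  6.4]
```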
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class FC(nn.Layer): + def __init__(self, embedding_size, class_num): + super(FC, self).__init__() + self.embedding_size = embedding_size + self.class_num = class_num + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal()) + self.fc = paddle.nn.Linear( + self.embedding_size, self.class_num, weight_attr=weight_attr) + + def forward(self, input, label=None): + out = self.fc(input) + return out diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/identity_head.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/identity_head.py new file mode 100644 index 000000000..7d11e5742 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/identity_head.py @@ -0,0 +1,9 @@ +from paddle import nn + + +class IdentityHead(nn.Layer): + def __init__(self): + super(IdentityHead, self).__init__() + + def forward(self, x, label=None): + return {"features": x, "logits": None} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/vehicle_neck.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/vehicle_neck.py new file mode 100644 index 000000000..05f4e333f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/gears/vehicle_neck.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn + + +class VehicleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=weight_attr, + data_format=data_format) + self.flatten = nn.Flatten() + + def forward(self, x): + x = self.conv(x) + x = self.flatten(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/__init__.py new file mode 100644 index 000000000..3733059ce --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ppcls.arch.slim.prune import prune_model +from ppcls.arch.slim.quant import quantize_model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/prune.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/prune.py new file mode 100644 index 000000000..c0c9d220b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/prune.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ppcls.utils import logger + + +def prune_model(config, model): + if config.get("Slim", False) and config["Slim"].get("prune", False): + import paddleslim + prune_method_name = config["Slim"]["prune"]["name"].lower() + assert prune_method_name in [ + "fpgm", "l1_norm" + ], "The prune methods only support 'fpgm' and 'l1_norm'" + if prune_method_name == "fpgm": + model.pruner = paddleslim.dygraph.FPGMFilterPruner( + model, [1] + config["Global"]["image_shape"]) + else: + model.pruner = paddleslim.dygraph.L1NormFilterPruner( + model, [1] + config["Global"]["image_shape"]) + + # prune model + _prune_model(config, model) + else: + model.pruner = None + + + +def _prune_model(config, model): + from paddleslim.analysis import dygraph_flops as flops + logger.info("FLOPs before pruning: {}GFLOPs".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9)) + model.eval() + + params = [] + for sublayer in model.sublayers(): + for param in sublayer.parameters(include_sublayers=False): + if isinstance(sublayer, paddle.nn.Conv2D): + params.append(param.name) + ratios = {} + for param in params: + ratios[param] = config["Slim"]["prune"]["pruned_ratio"] + plan = model.pruner.prune_vars(ratios, [0]) + + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9, + plan.pruned_flops)) + + for param in model.parameters(): + if "conv2d" in param.name: + logger.info("{}\t{}".format(param.name, param.shape)) + + model.train() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/quant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/quant.py new file mode 100644 index 000000000..b8f59a78f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/slim/quant.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ppcls.utils import logger + +QUANT_CONFIG = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. + 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], +} + + +def quantize_model(config, model): + if config.get("Slim", False) and config["Slim"].get("quant", False): + from paddleslim.dygraph.quant import QAT + assert config["Slim"]["quant"]["name"].lower( + ) == 'pact', 'Only PACT quantization method is supported now' + QUANT_CONFIG["activation_preprocess_type"] = "PACT" + model.quanter = QAT(config=QUANT_CONFIG) + model.quanter.quantize(model) + logger.info("QAT model summary:") + paddle.summary(model, (1, 3, 224, 224)) + else: + model.quanter = None + return diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/utils.py new file mode 100644 index 000000000..308475d7d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/arch/utils.py @@ -0,0 +1,53 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import types +from difflib import SequenceMatcher + +from . 
import backbone + + +def get_architectures(): + """ + get all of model architectures + """ + names = [] + for k, v in backbone.__dict__.items(): + if isinstance(v, (types.FunctionType, six.class_types)): + names.append(k) + return names + + +def get_blacklist_model_in_static_mode(): + from ppcls.arch.backbone import distilled_vision_transformer + from ppcls.arch.backbone import vision_transformer + blacklist = distilled_vision_transformer.__all__ + vision_transformer.__all__ + return blacklist + + +def similar_architectures(name='', names=[], thresh=0.1, topk=10): + """ + inferred similar architectures + """ + scores = [] + for idx, n in enumerate(names): + if n.startswith('__'): + continue + score = SequenceMatcher(None, n.lower(), name.lower()).quick_ratio() + if score > thresh: + scores.append((idx, score)) + scores.sort(key=lambda x: x[1], reverse=True) + similar_names = [names[s[0]] for s in scores[:min(topk, len(scores))]] + return similar_names diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml new file mode 100644 index 000000000..ab4c29c30 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + 
topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml new file mode 100644 index 000000000..d75fede9e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml new file mode 100644 index 000000000..2fefb9f4b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 
224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - CutmixOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml new file mode 100644 index 000000000..4bf530664 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - Cutout: + n_holes: 1 + 
length: 112 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml new file mode 100644 index 000000000..c0016aa00 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - GridMask: + d1: 96 + d2: 224 + rotate: 1 + ratio: 0.5 + mode: 0 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 
+ transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml new file mode 100644 index 000000000..12e4ac8db --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - HideAndSeek: + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml new file mode 100644 index 000000000..3434cab5a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml 
@@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml new file mode 100644 index 000000000..153451e13 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - RandAugment: + num_layers: 2 + magnitude: 5 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml new file mode 100644 index 000000000..8e89c5ca1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50.yaml new file mode 100644 index 000000000..c2da23fb3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 
10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml new file mode 100644 index 000000000..2d6de3b21 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_amp_4x8.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 90 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: False + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.875 + multi_precision: True + lr: + name: Cosine + warmup_epoch: 8 + learning_rate: 8.192 + regularizer: + name: 'L2' + coeff: 2.5e-05 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_vd.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_vd.yaml new file mode 100644 index 000000000..be7b2d9db --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/ImageNet/ResNet/ResNet50_vd.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/quick_start/ResNet50_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/quick_start/ResNet50_vd.yaml new file mode 100644 index 000000000..30d745599 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/configs/quick_start/ResNet50_vd.yaml @@ -0,0 +1,107 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 5 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 
0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./data/datasets/flowers102/ + cls_label_path: ./data/datasets/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./data/datasets/flowers102/ + cls_label_path: ./data/datasets/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/engine.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/engine.py new file mode 100644 index 000000000..019cf1650 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/engine.py @@ -0,0 +1,468 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
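The YAML files added above all share the same Global / Arch / Loss / Optimizer / DataLoader / Infer / Metric layout, so a reduced smoke-test variant can be derived mechanically rather than copied by hand. A minimal sketch with PyYAML; the path and the override values are illustrative assumptions, not part of the patch:

import yaml

# Load one of the configs added above and derive a short smoke-test run.
with open("ppcls_2.5/configs/ImageNet/ResNet/ResNet50.yaml") as f:
    cfg = yaml.safe_load(f)

cfg["Global"]["epochs"] = 1                               # full run uses 120
cfg["DataLoader"]["Train"]["sampler"]["batch_size"] = 32  # default above is 64

with open("ResNet50_smoke.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)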
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import platform +import paddle +import paddle.distributed as dist +from visualdl import LogWriter +from paddle import nn +import numpy as np +import random + +from ppcls.utils.check import check_gpu +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.utils.config import print_config +from ppcls.data import build_dataloader +from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer +from ppcls.arch import apply_to_static +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url +from ppcls.utils.save_load import init_model +from ppcls.utils import save_load + +from ppcls.data.utils.get_image_list import get_image_list +from ppcls.data.postprocess import build_postprocess +from ppcls.data import create_operators +from ppcls.engine.train import train_epoch +from ppcls.engine import evaluation +from ppcls.arch.gears.identity_head import IdentityHead + + +class Engine(object): + def __init__(self, config, mode="train"): + assert mode in ["train", "eval", "infer", "export"] + self.mode = mode + self.config = config + self.eval_mode = self.config["Global"].get("eval_mode", + "classification") + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, self.config["Arch"]["name"], + f"{mode}.log") + init_logger(log_file=log_file) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in ["classification", "retrieval"], logger.error( + "Invalid eval mode: {}".format(self.eval_mode)) + self.train_epoch_func = train_epoch + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = os.path.join(self.output_dir, "vdl") + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu", "mlu"] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # AMP training + self.amp = True if "AMP" in self.config and self.mode == "train" else False + if self.amp and self.config["AMP"] is not None: + self.scale_loss = self.config["AMP"].get("scale_loss", 1.0) + self.use_dynamic_loss_scaling = self.config["AMP"].get( + "use_dynamic_loss_scaling", False) + else: + self.scale_loss = 1.0 + self.use_dynamic_loss_scaling = False + if self.amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_max_inplace_grad_add': 8, + } + if paddle.is_compiled_with_cuda(): + AMP_RELATED_FLAGS_SETTING.update({ + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + }) + 
paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." + logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + # build dataloader + if self.mode == 'train': + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode == "classification": + self.eval_dataloader = build_dataloader( + self.config["DataLoader"], "Eval", self.device, + self.use_dali) + elif self.eval_mode == "retrieval": + self.gallery_query_dataloader = None + if len(self.config["DataLoader"]["Eval"].keys()) == 1: + key = list(self.config["DataLoader"]["Eval"].keys())[0] + self.gallery_query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], key, self.device, + self.use_dali) + else: + self.gallery_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Gallery", + self.device, self.use_dali) + self.query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Query", + self.device, self.use_dali) + + # build loss + if self.mode == "train": + loss_info = self.config["Loss"]["Train"] + self.train_loss_func = build_loss(loss_info) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + loss_config = self.config.get("Loss", None) + if loss_config is not None: + loss_config = loss_config.get("Eval") + if loss_config is not None: + self.eval_loss_func = build_loss(loss_config) + else: + self.eval_loss_func = None + else: + self.eval_loss_func = None + + # build metric + if self.mode == 'train': + metric_config = self.config.get("Metric") + if metric_config is not None: + metric_config = metric_config.get("Train") + if metric_config is not None: + if hasattr( + self.train_dataloader, "collate_fn" + ) and self.train_dataloader.collate_fn is not None: + for m_idx, m in enumerate(metric_config): + if "TopkAcc" in m: + msg = f"'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed." 
+ logger.warning(msg) + break + metric_config.pop(m_idx) + self.train_metric_func = build_metrics(metric_config) + else: + self.train_metric_func = None + else: + self.train_metric_func = None + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + metric_config = self.config.get("Metric") + if self.eval_mode == "classification": + if metric_config is not None: + metric_config = metric_config.get("Eval") + if metric_config is not None: + self.eval_metric_func = build_metrics(metric_config) + elif self.eval_mode == "retrieval": + if metric_config is None: + metric_config = [{"name": "Recallk", "topk": (1, 5)}] + else: + metric_config = metric_config["Eval"] + self.eval_metric_func = build_metrics(metric_config) + else: + self.eval_metric_func = None + + # build model + self.model = build_model(self.config) + # set @to_static for benchmark, skip this by default. + apply_to_static(self.config, self.model) + + # load_pretrain + if self.config["Global"]["pretrained_model"] is not None: + if self.config["Global"]["pretrained_model"].startswith("http"): + load_dygraph_pretrain_from_url( + self.model, self.config["Global"]["pretrained_model"]) + else: + load_dygraph_pretrain( + self.model, self.config["Global"]["pretrained_model"]) + + # build optimizer + if self.mode == 'train': + self.optimizer, self.lr_sch = build_optimizer( + self.config["Optimizer"], self.config["Global"]["epochs"], + len(self.train_dataloader), [self.model]) + + # for amp training + if self.amp: + self.scaler = paddle.amp.GradScaler( + init_loss_scaling=self.scale_loss, + use_dynamic_loss_scaling=self.use_dynamic_loss_scaling) + amp_level = self.config['AMP'].get("level", "O1") + if amp_level not in ["O1", "O2"]: + msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." + logger.warning(msg) + self.config['AMP']["level"] = "O1" + amp_level = "O1" + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=amp_level, + save_dtype='float32') + + # for distributed + world_size = dist.get_world_size() + self.config["Global"]["distributed"] = world_size != 1 + if world_size != 4 and self.mode == "train": + msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train." 
+ logger.warning(msg) + if self.config["Global"]["distributed"]: + dist.init_parallel_env() + self.model = paddle.DataParallel(self.model) + + # build postprocess for infer + if self.mode == 'infer': + self.preprocess_func = create_operators(self.config["Infer"][ + "transforms"]) + self.postprocess_func = build_postprocess(self.config["Infer"][ + "PostProcess"]) + + def train(self): + assert self.mode == "train" + print_batch_step = self.config['Global']['print_batch_step'] + save_interval = self.config["Global"]["save_interval"] + best_metric = { + "metric": 0.0, + "epoch": 0, + } + # key: + # val: metrics list word + self.output_info = dict() + self.time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + # global iter counter + self.global_step = 0 + + if self.config["Global"]["checkpoints"] is not None: + metric_info = init_model(self.config["Global"], self.model, + self.optimizer) + if metric_info is not None: + best_metric.update(metric_info) + + self.max_iter = len(self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # for one epoch train + self.train_epoch_func(self, epoch_id, print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, self.output_info[key].avg) + for key in self.output_info + ]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0: + acc = self.eval(epoch_id) + if acc > best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id + save_load.save_model( + self.model, + self.optimizer, + best_metric, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="best_model") + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + self.model.train() + + # save model + if epoch_id % save_interval == 0: + save_load.save_model( + self.model, + self.optimizer, {"metric": acc, + "epoch": epoch_id}, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="epoch_{}".format(epoch_id)) + # save the latest model + save_load.save_model( + self.model, + self.optimizer, {"metric": acc, + "epoch": epoch_id}, + self.output_dir, + model_name=self.config["Arch"]["name"], + prefix="latest") + + if self.vdl_writer is not None: + self.vdl_writer.close() + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + self.model.eval() + eval_result = self.eval_func(self, epoch_id) + self.model.train() + return eval_result + + @paddle.no_grad() + def infer(self): + assert self.mode == "infer" and self.eval_mode == "classification" + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + image_list = get_image_list(self.config["Infer"]["infer_imgs"]) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = 
f.read() + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + out = self.model(batch_tensor) + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + result = self.postprocess_func(out, image_file_list) + print(result) + batch_data.clear() + image_file_list.clear() + + def export(self): + assert self.mode == "export" + use_multilabel = self.config["Global"].get("use_multilabel", False) + model = ExportModel(self.config["Arch"], self.model, use_multilabel) + if self.config["Global"]["pretrained_model"] is not None: + load_dygraph_pretrain(model.base_model, + self.config["Global"]["pretrained_model"]) + + model.eval() + save_path = os.path.join(self.config["Global"]["save_inference_dir"], + "inference") + if model.quanter: + model.quanter.save_quantized_model( + model.base_model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + else: + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + paddle.jit.save(model, save_path) + + +class ExportModel(TheseusLayer): + """ + ExportModel: add softmax onto the model + """ + + def __init__(self, config, model, use_multilabel): + super().__init__() + self.base_model = model + # we should choose a final model to export + if isinstance(self.base_model, DistillationModel): + self.infer_model_name = config["infer_model_name"] + else: + self.infer_model_name = None + + self.infer_output_key = config.get("infer_output_key", None) + if self.infer_output_key == "features" and isinstance(self.base_model, + RecModel): + self.base_model.head = IdentityHead() + if use_multilabel: + self.out_act = nn.Sigmoid() + else: + if config.get("infer_add_softmax", True): + self.out_act = nn.Softmax(axis=-1) + else: + self.out_act = None + + def eval(self): + self.training = False + for layer in self.sublayers(): + layer.training = False + layer.eval() + + def forward(self, x): + x = self.base_model(x) + if isinstance(x, list): + x = x[0] + if self.infer_model_name is not None: + x = x[self.infer_model_name] + if self.infer_output_key is not None: + x = x[self.infer_output_key] + if self.out_act is not None: + x = self.out_act(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/__init__.py new file mode 100644 index 000000000..e0cd77888 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
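Engine ties the pipeline together: the mode argument decides which dataloaders, losses and metrics get built, and train/eval/infer/export are the matching entry points. A usage sketch, assuming the config is read with the usual PaddleClas get_config helper, that the package is installed as "ppcls", and that the dataset paths in the YAML exist:

from ppcls.utils.config import get_config   # assumed upstream helper
from ppcls.engine.engine import Engine

cfg = get_config("configs/ImageNet/ResNet/ResNet50.yaml", overrides=None, show=False)

Engine(cfg, mode="train").train()      # train, with periodic eval and checkpoints
# Engine(cfg, mode="eval").eval()      # standalone evaluation
# Engine(cfg, mode="export").export()  # save an inference model under Global.save_inference_dir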
+ +from ppcls.engine.evaluation.classification import classification_eval +from ppcls.engine.evaluation.retrieval import retrieval_eval diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/classification.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/classification.py new file mode 100644 index 000000000..79fb1d692 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/classification.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import platform +import paddle + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def classification_eval(engine, epoch_id=0): + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + metric_key = None + tic = time.time() + accum_samples = 0 + total_samples = len( + engine.eval_dataloader. + dataset) if not engine.use_dali else engine.eval_dataloader.size + max_iter = len(engine.eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(engine.eval_dataloader) + for iter_id, batch in enumerate(engine.eval_dataloader): + if iter_id >= max_iter: + break + if iter_id == 5: + for key in time_info: + time_info[key].reset() + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + time_info["reader_cost"].update(time.time() - tic) + batch_size = batch[0].shape[0] + batch[0] = paddle.to_tensor(batch[0]).astype("float32") + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + + # image input + if engine.amp: + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = engine.model(batch[0]) + else: + out = engine.model(batch[0]) + + # just for DistributedBatchSampler issue: repeat sampling + current_samples = batch_size * paddle.distributed.get_world_size() + accum_samples += current_samples + + # gather Tensor when distributed + if paddle.distributed.get_world_size() > 1: + label_list = [] + paddle.distributed.all_gather(label_list, batch[1]) + labels = paddle.concat(label_list, 0) + + if isinstance(out, dict): + if "Student" in out: + out = out["Student"] + if isinstance(out, dict): + out = out["logits"] + elif "logits" in out: + out = out["logits"] + else: + msg = "Error: Wrong key in out!" 
+ raise Exception(msg) + if isinstance(out, list): + preds = [] + for x in out: + pred_list = [] + paddle.distributed.all_gather(pred_list, x) + pred_x = paddle.concat(pred_list, 0) + preds.append(pred_x) + else: + pred_list = [] + paddle.distributed.all_gather(pred_list, out) + preds = paddle.concat(pred_list, 0) + + if accum_samples > total_samples and not engine.use_dali: + preds = preds[:total_samples + current_samples - accum_samples] + labels = labels[:total_samples + current_samples - + accum_samples] + current_samples = total_samples + current_samples - accum_samples + else: + labels = batch[1] + preds = out + + # calc loss + if engine.eval_loss_func is not None: + if engine.amp and engine.config["AMP"].get("use_fp16_test", False): + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + loss_dict = engine.eval_loss_func(preds, labels) + else: + loss_dict = engine.eval_loss_func(preds, labels) + + for key in loss_dict: + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + output_info[key].update(loss_dict[key].numpy()[0], + current_samples) + # calc metric + if engine.eval_metric_func is not None: + metric_dict = engine.eval_metric_func(preds, labels) + for key in metric_dict: + if metric_key is None: + metric_key = key + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + + output_info[key].update(metric_dict[key].numpy()[0], + current_samples) + + time_info["batch_cost"].update(time.time() - tic) + + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + if engine.use_dali: + engine.eval_dataloader.reset() + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return output_info[metric_key].avg diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/retrieval.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/retrieval.py new file mode 100644 index 000000000..b481efae1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/evaluation/retrieval.py @@ -0,0 +1,179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
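The all_gather plus trimming logic in classification_eval compensates for DistributedBatchSampler padding the last batch so that every rank receives the same number of samples; without the trim, the padded duplicates would inflate the metric denominators. The arithmetic in isolation, with illustrative values:

total_samples = 1000          # len(eval dataset)
world_size = 4
batch_size = 64
accum_samples = 0
kept = 0
for _ in range(4):            # each rank sees ceil(1000 / 4 / 64) = 4 batches
    current = batch_size * world_size
    accum_samples += current
    if accum_samples > total_samples:
        # same correction as in classification_eval above
        current = total_samples + current - accum_samples
    kept += current
print(kept)                   # 1000 -- the 24 padded duplicates are dropped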
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import platform +import paddle +from ppcls.utils import logger + + +def retrieval_eval(engine, epoch_id=0): + engine.model.eval() + # step1. build gallery + if engine.gallery_query_dataloader is not None: + gallery_feas, gallery_img_id, gallery_unique_id = cal_feature( + engine, name='gallery_query') + query_feas, query_img_id, query_query_id = gallery_feas, gallery_img_id, gallery_unique_id + else: + gallery_feas, gallery_img_id, gallery_unique_id = cal_feature( + engine, name='gallery') + query_feas, query_img_id, query_query_id = cal_feature( + engine, name='query') + + # step2. do evaluation + sim_block_size = engine.config["Global"].get("sim_block_size", 64) + sections = [sim_block_size] * (len(query_feas) // sim_block_size) + if len(query_feas) % sim_block_size: + sections.append(len(query_feas) % sim_block_size) + fea_blocks = paddle.split(query_feas, num_or_sections=sections) + if query_query_id is not None: + query_id_blocks = paddle.split( + query_query_id, num_or_sections=sections) + image_id_blocks = paddle.split(query_img_id, num_or_sections=sections) + metric_key = None + + if engine.eval_loss_func is None: + metric_dict = {metric_key: 0.} + else: + metric_dict = dict() + for block_idx, block_fea in enumerate(fea_blocks): + similarity_matrix = paddle.matmul( + block_fea, gallery_feas, transpose_y=True) + if query_query_id is not None: + query_id_block = query_id_blocks[block_idx] + query_id_mask = (query_id_block != gallery_unique_id.t()) + + image_id_block = image_id_blocks[block_idx] + image_id_mask = (image_id_block != gallery_img_id.t()) + + keep_mask = paddle.logical_or(query_id_mask, image_id_mask) + similarity_matrix = similarity_matrix * keep_mask.astype( + "float32") + else: + keep_mask = None + + metric_tmp = engine.eval_metric_func(similarity_matrix, + image_id_blocks[block_idx], + gallery_img_id, keep_mask) + + for key in metric_tmp: + if key not in metric_dict: + metric_dict[key] = metric_tmp[key] * block_fea.shape[ + 0] / len(query_feas) + else: + metric_dict[key] += metric_tmp[key] * block_fea.shape[ + 0] / len(query_feas) + + metric_info_list = [] + for key in metric_dict: + if metric_key is None: + metric_key = key + metric_info_list.append("{}: {:.5f}".format(key, metric_dict[key])) + metric_msg = ", ".join(metric_info_list) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + return metric_dict[metric_key] + + +def cal_feature(engine, name='gallery'): + has_unique_id = False + all_unique_id = None + + if name == 'gallery': + dataloader = engine.gallery_dataloader + elif name == 'query': + dataloader = engine.query_dataloader + elif name == 'gallery_query': + dataloader = engine.gallery_query_dataloader + else: + raise RuntimeError("Only support gallery or query dataset") + + batch_feas_list = [] + img_id_list = [] + unique_id_list = [] + max_iter = len(dataloader) - 1 if platform.system() == "Windows" else len( + dataloader) + for idx, batch in enumerate(dataloader): # load is very time-consuming + if idx >= max_iter: + break + if idx % engine.config["Global"]["print_batch_step"] == 0: + logger.info( + f"{name} feature calculation process: [{idx}/{len(dataloader)}]" + ) + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + batch = [paddle.to_tensor(x) for x in batch] + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + if len(batch) == 3: + has_unique_id = 
True + batch[2] = batch[2].reshape([-1, 1]).astype("int64") + out = engine.model(batch[0], batch[1]) + if "Student" in out: + out = out["Student"] + batch_feas = out["features"] + + # do norm + if engine.config["Global"].get("feature_normalize", True): + feas_norm = paddle.sqrt( + paddle.sum(paddle.square(batch_feas), axis=1, keepdim=True)) + batch_feas = paddle.divide(batch_feas, feas_norm) + + # do binarize + if engine.config["Global"].get("feature_binarize") == "round": + batch_feas = paddle.round(batch_feas).astype("float32") * 2.0 - 1.0 + + if engine.config["Global"].get("feature_binarize") == "sign": + batch_feas = paddle.sign(batch_feas).astype("float32") + + if paddle.distributed.get_world_size() > 1: + batch_feas_gather = [] + img_id_gather = [] + unique_id_gather = [] + paddle.distributed.all_gather(batch_feas_gather, batch_feas) + paddle.distributed.all_gather(img_id_gather, batch[1]) + batch_feas_list.append(paddle.concat(batch_feas_gather)) + img_id_list.append(paddle.concat(img_id_gather)) + if has_unique_id: + paddle.distributed.all_gather(unique_id_gather, batch[2]) + unique_id_list.append(paddle.concat(unique_id_gather)) + else: + batch_feas_list.append(batch_feas) + img_id_list.append(batch[1]) + if has_unique_id: + unique_id_list.append(batch[2]) + + if engine.use_dali: + dataloader.reset() + + all_feas = paddle.concat(batch_feas_list) + all_img_id = paddle.concat(img_id_list) + if has_unique_id: + all_unique_id = paddle.concat(unique_id_list) + + # just for DistributedBatchSampler issue: repeat sampling + total_samples = len( + dataloader.dataset) if not engine.use_dali else dataloader.size + all_feas = all_feas[:total_samples] + all_img_id = all_img_id[:total_samples] + if has_unique_id: + all_unique_id = all_unique_id[:total_samples] + + logger.info("Build {} done, all feat shape: {}, begin to eval..".format( + name, all_feas.shape)) + return all_feas, all_img_id, all_unique_id diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/__init__.py new file mode 100644 index 000000000..800d3a41e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ppcls.engine.train.train import train_epoch diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/train.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/train.py new file mode 100644 index 000000000..b15c1088a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/train.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info +from ppcls.utils import profiler + + +def train_epoch(engine, epoch_id, print_batch_step): + tic = time.time() + for iter_id, batch in enumerate(engine.train_dataloader): + if iter_id >= engine.max_iter: + break + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + if engine.use_dali: + batch = [ + paddle.to_tensor(batch[0]['data']), + paddle.to_tensor(batch[0]['label']) + ] + batch_size = batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([batch_size, -1]) + engine.global_step += 1 + + # image input + if engine.amp: + amp_level = engine.config['AMP'].get("level", "O1").upper() + with paddle.amp.auto_cast( + custom_black_list={ + "flatten_contiguous_range", "greater_than" + }, + level=amp_level): + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + else: + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + + # step opt and lr + if engine.amp: + scaled = engine.scaler.scale(loss_dict["loss"]) + scaled.backward() + engine.scaler.minimize(engine.optimizer, scaled) + else: + loss_dict["loss"].backward() + engine.optimizer.step() + engine.optimizer.clear_grad() + engine.lr_sch.step() + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + +def forward(engine, batch): + if not engine.is_rec: + return engine.model(batch[0]) + else: + return engine.model(batch[0], batch[1]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/utils.py new file mode 100644 index 000000000..92eb35d75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/engine/train/utils.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
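train_epoch follows the standard Paddle dynamic-graph AMP recipe: run the forward pass under auto_cast, scale the loss, backpropagate the scaled value, then let the scaler unscale the gradients and step the optimizer. The same pattern on a toy model; shapes, data and hyper-parameters are illustrative only:

import paddle

model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.Momentum(learning_rate=0.1, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=128.0)

x = paddle.randn([8, 4])
y = paddle.randint(0, 2, [8, 1])

with paddle.amp.auto_cast(level="O1"):
    loss = paddle.nn.functional.cross_entropy(model(x), y)

scaled = scaler.scale(loss)    # scale, then backprop the scaled loss
scaled.backward()
scaler.minimize(opt, scaled)   # unscale, step the optimizer, update the scale
opt.clear_grad()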
+from __future__ import absolute_import, division, print_function + +import datetime +from ppcls.utils import logger +from ppcls.utils.misc import AverageMeter + + +def update_metric(trainer, out, batch, batch_size): + # calc metric + if trainer.train_metric_func is not None: + metric_dict = trainer.train_metric_func(out, batch[-1]) + for key in metric_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(metric_dict[key].numpy()[0], + batch_size) + + +def update_loss(trainer, loss_dict, batch_size): + # update_output_info + for key in loss_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(loss_dict[key].numpy()[0], batch_size) + + +def log_info(trainer, batch_size, epoch_id, iter_id): + lr_msg = "lr: {:.5f}".format(trainer.lr_sch.get_lr()) + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, trainer.output_info[key].avg) + for key in trainer.output_info + ]) + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, trainer.time_info[key].avg) + for key in trainer.time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / trainer.time_info["batch_cost"].avg) + eta_sec = ((trainer.config["Global"]["epochs"] - epoch_id + 1 + ) * len(trainer.train_dataloader) - iter_id + ) * trainer.time_info["batch_cost"].avg + eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec)))) + logger.info("[Train][Epoch {}/{}][Iter: {}/{}]{}, {}, {}, {}, {}".format( + epoch_id, trainer.config["Global"]["epochs"], iter_id, + len(trainer.train_dataloader), lr_msg, metric_msg, time_msg, ips_msg, + eta_msg)) + + logger.scaler( + name="lr", + value=trainer.lr_sch.get_lr(), + step=trainer.global_step, + writer=trainer.vdl_writer) + for key in trainer.output_info: + logger.scaler( + name="train_{}".format(key), + value=trainer.output_info[key].avg, + step=trainer.global_step, + writer=trainer.vdl_writer) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/__init__.py new file mode 100644 index 000000000..7c50ff76f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/__init__.py @@ -0,0 +1,47 @@ +import copy + +import paddle +import paddle.nn as nn +from ppcls.utils import logger + +from .celoss import CELoss, MixCELoss +from .distanceloss import DistanceLoss + +class CombinedLoss(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.loss_func = [] + self.loss_weight = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + name = list(config)[0] + param = config[name] + assert "weight" in param, "weight must be in param, but param just contains {}".format( + param.keys()) + self.loss_weight.append(param.pop("weight")) + self.loss_func.append(eval(name)(**param)) + + def __call__(self, input, batch): + loss_dict = {} + # just for accelerate classification traing speed + if len(self.loss_func) == 1: + loss = self.loss_func[0](input, batch) + loss_dict.update(loss) + loss_dict["loss"] = list(loss.values())[0] + else: + for idx, loss_func in enumerate(self.loss_func): + loss = loss_func(input, batch) + weight = self.loss_weight[idx] + loss = {key: loss[key] * weight for key in loss} + loss_dict.update(loss) + loss_dict["loss"] = paddle.add_n(list(loss_dict.values())) + return 
loss_dict + + +def build_loss(config): + module_class = CombinedLoss(copy.deepcopy(config)) + logger.debug("build loss {} success.".format(module_class)) + return module_class diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/celoss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/celoss.py new file mode 100644 index 000000000..a78926170 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/celoss.py @@ -0,0 +1,67 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.utils import logger + + +class CELoss(nn.Layer): + """ + Cross entropy loss + """ + + def __init__(self, epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + + def _labelsmoothing(self, target, class_num): + if len(target.shape) == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + if self.epsilon is not None: + class_num = x.shape[-1] + label = self._labelsmoothing(label, class_num) + x = -F.log_softmax(x, axis=-1) + loss = paddle.sum(x * label, axis=-1) + else: + if label.shape[-1] == x.shape[-1]: + label = F.softmax(label, axis=-1) + soft_label = True + else: + soft_label = False + loss = F.cross_entropy(x, label=label, soft_label=soft_label) + loss = loss.mean() + return {"CELoss": loss} + + +class MixCELoss(object): + def __init__(self, *args, **kwargs): + msg = "\"MixCELos\" is deprecated, please use \"CELoss\" instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/comfunc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/comfunc.py new file mode 100644 index 000000000..277bdd6b5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/comfunc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
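build_loss above consumes the YAML Loss section as a list of single-key dicts, each carrying a mandatory "weight" plus the loss's own keyword arguments. A small sketch of that contract, assuming the patched ppcls package is importable; note that on the single-loss fast path of CombinedLoss.__call__ the weight is popped but the loss value is used as-is.

import paddle
from ppcls.loss import build_loss

# Mirrors a "Loss.Train" config entry: one CELoss with label smoothing.
loss_cfg = [{"CELoss": {"weight": 1.0, "epsilon": 0.1}}]
loss_func = build_loss(loss_cfg)

logits = paddle.randn([8, 1000])          # model output
labels = paddle.randint(0, 1000, [8, 1])  # integer class ids
loss_dict = loss_func(logits, labels)
print(loss_dict["CELoss"], loss_dict["loss"])  # individual term and the total used for backward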
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def rerange_index(batch_size, samples_each_class): + tmp = np.arange(0, batch_size * batch_size) + tmp = tmp.reshape(-1, batch_size) + rerange_index = [] + + for i in range(batch_size): + step = i // samples_each_class + start = step * samples_each_class + end = (step + 1) * samples_each_class + + pos_idx = [] + neg_idx = [] + for j, k in enumerate(tmp[i]): + if j >= start and j < end: + if j == i: + pos_idx.insert(0, k) + else: + pos_idx.append(k) + else: + neg_idx.append(k) + rerange_index += (pos_idx + neg_idx) + + rerange_index = np.array(rerange_index).astype(np.int32) + return rerange_index diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/distanceloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/distanceloss.py new file mode 100644 index 000000000..0a09f0cb2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/loss/distanceloss.py @@ -0,0 +1,43 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + self.mode = mode + + def forward(self, x, y): + loss = self.loss_func(x, y) + return {"loss_{}".format(self.mode): loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/__init__.py new file mode 100644 index 000000000..94721235b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/__init__.py @@ -0,0 +1,51 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
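rerange_index above flattens a batch_size x batch_size similarity matrix and, for each row, reorders the column indices so the anchor comes first, then the rest of its class, then all negatives. A quick check of that layout on a toy batch, assuming the patched ppcls package is importable:

from ppcls.loss.comfunc import rerange_index

# 4 samples, 2 per class: row i leads with i's own index, then its class mate,
# then the indices belonging to the other class.
idx = rerange_index(batch_size=4, samples_each_class=2)
print(idx.reshape(4, 4))
# [[ 0  1  2  3]
#  [ 5  4  6  7]
#  [10 11  8  9]
#  [15 14 12 13]]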
+ +from paddle import nn +import copy +from collections import OrderedDict + +from .metrics import TopkAcc, mAP, mINP, Recallk, Precisionk +from .metrics import DistillationTopkAcc +from .metrics import GoogLeNetTopkAcc +from .metrics import HammingDistance, AccuracyScore + + +class CombinedMetrics(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.metric_func_list = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + metric_name = list(config)[0] + metric_params = config[metric_name] + if metric_params is not None: + self.metric_func_list.append( + eval(metric_name)(**metric_params)) + else: + self.metric_func_list.append(eval(metric_name)()) + + def __call__(self, *args, **kwargs): + metric_dict = OrderedDict() + for idx, metric_func in enumerate(self.metric_func_list): + metric_dict.update(metric_func(*args, **kwargs)) + return metric_dict + + +def build_metrics(config): + metrics_list = CombinedMetrics(copy.deepcopy(config)) + return metrics_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/metrics.py new file mode 100644 index 000000000..03e742082 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/metric/metrics.py @@ -0,0 +1,306 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
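CombinedMetrics and build_metrics follow the same single-key-dict convention as the loss builder. A minimal classification sketch, assuming the patched ppcls package is importable and scikit-learn is installed (metrics.py imports it at module level):

import paddle
from ppcls.metric import build_metrics

# Mirrors a "Metric.Train" config entry.
metric_cfg = [{"TopkAcc": {"topk": [1, 5]}}]
metric_func = build_metrics(metric_cfg)

logits = paddle.randn([16, 10])
labels = paddle.randint(0, 10, [16, 1])
print(metric_func(logits, labels))  # OrderedDict with "top1" and "top5" accuracies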
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.preprocessing import binarize + + +class TopkAcc(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + + metric_dict = dict() + for k in self.topk: + metric_dict["top{}".format(k)] = paddle.metric.accuracy( + x, label, k=k) + return metric_dict + + +class mAP(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + div = paddle.arange(acc_sum.shape[1]).astype("float32") + 1 + precision = paddle.divide(acc_sum, div) + + #calc map + precision_mask = paddle.multiply(equal_flag, precision) + ap = paddle.sum(precision_mask, axis=1) / paddle.sum(equal_flag, + axis=1) + metric_dict["mAP"] = paddle.mean(ap).numpy()[0] + return metric_dict + + +class mINP(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + #do accumulative sum + div = paddle.arange(equal_flag.shape[1]).astype("float32") + 2 + minus = 
paddle.divide(equal_flag, div) + auxilary = paddle.subtract(equal_flag, minus) + hard_index = paddle.argmax(auxilary, axis=1).astype("float32") + all_INP = paddle.divide(paddle.sum(equal_flag, axis=1), hard_index) + mINP = paddle.mean(all_INP) + metric_dict["mINP"] = mINP.numpy()[0] + return metric_dict + + +class Recallk(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + real_query_num = paddle.sum(equal_flag, axis=1) + real_query_num = paddle.sum( + paddle.greater_than(real_query_num, paddle.to_tensor(0.)).astype( + "float32")) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + mask = paddle.greater_than(acc_sum, + paddle.to_tensor(0.)).astype("float32") + all_cmc = (paddle.sum(mask, axis=0) / real_query_num).numpy() + + for k in self.topk: + metric_dict["recall{}".format(k)] = all_cmc[k - 1] + return metric_dict + + +class Precisionk(nn.Layer): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=True) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + Ns = paddle.arange(gallery_img_id.shape[0]) + 1 + equal_flag_cumsum = paddle.cumsum(equal_flag, axis=1) + Precision_at_k = (paddle.mean(equal_flag_cumsum, axis=0) / Ns).numpy() + + for k in self.topk: + metric_dict["precision@{}".format(k)] = Precision_at_k[k - 1] + + return metric_dict + + +class DistillationTopkAcc(TopkAcc): + def __init__(self, model_key, feature_key=None, topk=(1, 5)): + super().__init__(topk=topk) + self.model_key = model_key + self.feature_key = feature_key + + def forward(self, x, label): + if isinstance(x, dict): + x = x[self.model_key] + if self.feature_key is not None: + x = x[self.feature_key] + return super().forward(x, label) + + +class GoogLeNetTopkAcc(TopkAcc): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, 
(int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + return super().forward(x[0], label) + + +class MutiLabelMetric(object): + def __init__(self): + pass + + def _multi_hot_encode(self, logits, threshold=0.5): + return binarize(logits, threshold=threshold) + + def __call__(self, output): + output = F.sigmoid(output) + preds = self._multi_hot_encode(logits=output.numpy(), threshold=0.5) + return preds + + +class HammingDistance(MutiLabelMetric): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + def __init__(self): + super().__init__() + + def __call__(self, output, target): + preds = super().__call__(output) + metric_dict = dict() + metric_dict["HammingDistance"] = paddle.to_tensor( + hamming_loss(target, preds)) + return metric_dict + + +class AccuracyScore(MutiLabelMetric): + """ + Hard metric for multilabel classification + Args: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. + Returns: + accuracy: + """ + + def __init__(self, base="label"): + super().__init__() + assert base in ["sample", "label" + ], 'must be one of ["sample", "label"]' + self.base = base + + def __call__(self, output, target): + preds = super().__call__(output) + metric_dict = dict() + if self.base == "sample": + accuracy = accuracy_metric(target, preds) + elif self.base == "label": + mcm = multilabel_confusion_matrix(target, preds) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + accuracy = (sum(tps) + sum(tns)) / ( + sum(tps) + sum(tns) + sum(fns) + sum(fps)) + metric_dict["AccuracyScore"] = paddle.to_tensor(accuracy) + return metric_dict diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/__init__.py new file mode 100644 index 000000000..61db39f89 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/__init__.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import paddle + +from ppcls.utils import logger + +from . import optimizer + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from . 
import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + if 'name' in lr_config: + lr_name = lr_config.pop('name') + lr = getattr(learning_rate, lr_name)(**lr_config) + if isinstance(lr, paddle.optimizer.lr.LRScheduler): + return lr + else: + return lr() + else: + lr = lr_config['learning_rate'] + return lr + + +# model_list is None in static graph +def build_optimizer(config, epochs, step_each_epoch, model_list=None): + config = copy.deepcopy(config) + # step1 build lr + lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch) + logger.debug("build lr ({}) success..".format(lr)) + # step2 build regularization + if 'regularizer' in config and config['regularizer'] is not None: + if 'weight_decay' in config: + logger.warning( + "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored." + ) + reg_config = config.pop('regularizer') + reg_name = reg_config.pop('name') + 'Decay' + reg = getattr(paddle.regularizer, reg_name)(**reg_config) + config["weight_decay"] = reg + logger.debug("build regularizer ({}) success..".format(reg)) + # step3 build optimizer + optim_name = config.pop('name') + if 'clip_norm' in config: + clip_norm = config.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim = getattr(optimizer, optim_name)(learning_rate=lr, + grad_clip=grad_clip, + **config)(model_list=model_list) + logger.debug("build optimizer ({}) success..".format(optim)) + return optim, lr diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/learning_rate.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/learning_rate.py new file mode 100644 index 000000000..b59387dd9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/learning_rate.py @@ -0,0 +1,326 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +from paddle.optimizer import lr +from paddle.optimizer.lr import LRScheduler + +from ppcls.utils import logger + + +class Linear(object): + """ + Linear learning rate decay + Args: + lr (float): The initial learning rate. It is a python float number. + epochs(int): The decay step size. It determines the decay cycle. + end_lr(float, optional): The minimum final learning rate. Default: 0.0001. + power(float, optional): Power of polynomial. Default: 1.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. 
+ """ + + def __init__(self, + learning_rate, + epochs, + step_each_epoch, + end_lr=0.0, + power=1.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.steps = (epochs - warmup_epoch) * step_each_epoch + self.end_lr = end_lr + self.power = power + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.steps, + end_lr=self.end_lr, + power=self.power, + last_epoch=self. + last_epoch) if self.steps > 0 else self.learning_rate + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Cosine(object): + """ + Cosine learning rate decay + lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1) + Args: + lr(float): initial learning rate + step_each_epoch(int): steps each epoch + epochs(int): total training epochs + eta_min(float): Minimum learning rate. Default: 0.0. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + eta_min=0.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.learning_rate = learning_rate + self.T_max = (epochs - warmup_epoch) * step_each_epoch + self.eta_min = eta_min + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + eta_min=self.eta_min, + last_epoch=self. + last_epoch) if self.T_max > 0 else self.learning_rate + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Step(object): + """ + Piecewise learning rate decay + Args: + step_each_epoch(int): steps each epoch + learning_rate (float): The initial learning rate. It is a python float number. + step_size (int): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. 
Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + learning_rate, + step_size, + step_each_epoch, + epochs, + gamma, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.step_size = step_each_epoch * step_size + self.learning_rate = learning_rate + self.gamma = gamma + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch) + return learning_rate + + +class Piecewise(object): + """ + Piecewise learning rate decay + Args: + boundaries(list): A list of steps numbers. The type of element in the list is python int. + values(list): A list of learning rate values that will be picked during different epoch boundaries. + The type of element in the list is python float. + warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0. + warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + """ + + def __init__(self, + step_each_epoch, + decay_epochs, + values, + epochs, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + **kwargs): + super().__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.boundaries = [step_each_epoch * e for e in decay_epochs] + self.values = values + self.last_epoch = last_epoch + self.warmup_steps = round(warmup_epoch * step_each_epoch) + self.warmup_start_lr = warmup_start_lr + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries, + values=self.values, + last_epoch=self.last_epoch) + if self.warmup_steps > 0: + learning_rate = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.values[0], + last_epoch=self.last_epoch) + return learning_rate + + +class MultiStepDecay(LRScheduler): + """ + Update the learning rate by ``gamma`` once ``epoch`` reaches one of the milestones. + The algorithm can be described as the code below. + .. code-block:: text + learning_rate = 0.5 + milestones = [30, 50] + gamma = 0.1 + if epoch < 30: + learning_rate = 0.5 + elif epoch < 50: + learning_rate = 0.05 + else: + learning_rate = 0.005 + Args: + learning_rate (float): The initial learning rate. It is a python float number. + milestones (tuple|list): List or tuple of each boundaries. Must be increasing. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` . + It should be less than 1.0. Default: 0.1. 
+ last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``MultiStepDecay`` instance to schedule learning rate. + Examples: + + .. code-block:: python + import paddle + import numpy as np + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(20): + for batch_id in range(5): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + # train on static graph mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(20): + for batch_id in range(5): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch + """ + + def __init__(self, + learning_rate, + milestones, + epochs, + step_each_epoch, + gamma=0.1, + last_epoch=-1, + verbose=False): + if not isinstance(milestones, (tuple, list)): + raise TypeError( + "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." + % type(milestones)) + if not all([ + milestones[i] < milestones[i + 1] + for i in range(len(milestones) - 1) + ]): + raise ValueError('The elements of milestones must be incremented') + if gamma >= 1.0: + raise ValueError('gamma should be < 1.0.') + self.milestones = [x * step_each_epoch for x in milestones] + self.gamma = gamma + super().__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + for i in range(len(self.milestones)): + if self.last_epoch < self.milestones[i]: + return self.base_lr * (self.gamma**i) + return self.base_lr * (self.gamma**len(self.milestones)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/optimizer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/optimizer.py new file mode 100644 index 000000000..4422ea70d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/optimizer/optimizer.py @@ -0,0 +1,217 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from paddle import optimizer as optim +import paddle + +from ppcls.utils import logger + + +class Momentum(object): + """ + Simple Momentum optimizer with velocity state. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + multi_precision=True): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters) + if hasattr(opt, '_use_multi_tensor'): + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters, + use_multi_tensor=True) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + multi_precision=False): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + multi_precision=self.multi_precision, + parameters=parameters) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. 
+ """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + multi_precision=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=parameters) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + + # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work. + if model_list is None: + if self.one_dim_param_no_weight_decay or len( + self.no_weight_decay_name_list) != 0: + msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph." + logger.error(Exception(msg)) + raise Exception(msg) + + self.no_weight_decay_param_name_list = [ + p.name for model in model_list for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] if model_list else [] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name for model in model_list + for n, p in model.named_parameters() if len(p.shape) == 1 + ] if model_list else [] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/program.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/program.py new file mode 100644 index 000000000..b3534a2cf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/program.py @@ -0,0 +1,449 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import numpy as np + +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F + +from paddle.distributed import fleet +from paddle.distributed.fleet import DistributedStrategy + +# from ppcls.optimizer import OptimizerBuilder +# from ppcls.optimizer.learning_rate import LearningRateBuilder + +from ppcls.arch import build_model +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.optimizer import build_lr_scheduler + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger, profiler + + +def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"): + """ + Create feeds as model input + + Args: + image_shape(list[int]): model input shape, such as [3, 224, 224] + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + class_num(int): the class number of network, required if use_mix + + Returns: + feeds(dict): dict of model input variables + """ + feeds = OrderedDict() + feeds['data'] = paddle.static.data( + name="data", shape=[None] + image_shape, dtype=dtype) + + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." + logger.error(msg) + raise Exception(msg) + feeds['target'] = paddle.static.data( + name="target", shape=[None, class_num], dtype="float32") + else: + feeds['label'] = paddle.static.data( + name="label", shape=[None, 1], dtype="int64") + + return feeds + + +def create_fetchs(out, + feeds, + architecture, + topk=5, + epsilon=None, + class_num=None, + use_mix=False, + config=None, + mode="Train"): + """ + Create fetchs as model outputs(included loss and measures), + will call create_loss and create_metric(if use_mix). + Args: + out(variable): model output variable + feeds(dict): dict of model input variables. + If use mix_up, it will not include label. + architecture(dict): architecture information, + name(such as ResNet50) is needed + topk(int): usually top5 + epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0 + class_num(int): the class number of network, required if use_mix + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + config(dict): model config + + Returns: + fetchs(dict): dict of model outputs(included loss and measures) + """ + fetchs = OrderedDict() + # build loss + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." 
+ logger.error(msg) + raise Exception(msg) + target = paddle.reshape(feeds['target'], [-1, class_num]) + else: + target = paddle.reshape(feeds['label'], [-1, 1]) + + loss_func = build_loss(config["Loss"][mode]) + loss_dict = loss_func(out, target) + + loss_out = loss_dict["loss"] + fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True)) + + # build metric + if not use_mix: + metric_func = build_metrics(config["Metric"][mode]) + + metric_dict = metric_func(out, target) + + for key in metric_dict: + if mode != "Train" and paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce( + metric_dict[key], op=paddle.distributed.ReduceOp.SUM) + metric_dict[key] = metric_dict[ + key] / paddle.distributed.get_world_size() + + fetchs[key] = (metric_dict[key], AverageMeter( + key, '7.4f', need_avg=True)) + + return fetchs + + +def create_optimizer(config, step_each_epoch): + # create learning_rate instance + optimizer, lr_sch = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], step_each_epoch) + return optimizer, lr_sch + + +def create_strategy(config): + """ + Create build strategy and exec strategy. + + Args: + config(dict): config + + Returns: + build_strategy: build strategy + exec_strategy: exec strategy + """ + build_strategy = paddle.static.BuildStrategy() + exec_strategy = paddle.static.ExecutionStrategy() + + exec_strategy.num_threads = 1 + exec_strategy.num_iteration_per_drop_scope = ( + 10000 + if 'AMP' in config and config.AMP.get("level", "O1") == "O2" else 10) + + fuse_op = True if 'AMP' in config else False + + fuse_bn_act_ops = config.get('fuse_bn_act_ops', fuse_op) + fuse_elewise_add_act_ops = config.get('fuse_elewise_add_act_ops', fuse_op) + fuse_bn_add_act_ops = config.get('fuse_bn_add_act_ops', fuse_op) + enable_addto = config.get('enable_addto', fuse_op) + + build_strategy.fuse_bn_act_ops = fuse_bn_act_ops + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_bn_add_act_ops = fuse_bn_add_act_ops + build_strategy.enable_addto = enable_addto + + return build_strategy, exec_strategy + + +def dist_optimizer(config, optimizer): + """ + Create a distributed optimizer based on a normal optimizer + + Args: + config(dict): + optimizer(): a normal optimizer + + Returns: + optimizer: a distributed optimizer + """ + build_strategy, exec_strategy = create_strategy(config) + + dist_strategy = DistributedStrategy() + dist_strategy.execution_strategy = exec_strategy + dist_strategy.build_strategy = build_strategy + + dist_strategy.nccl_comm_num = 1 + dist_strategy.fuse_all_reduce_ops = True + dist_strategy.fuse_grad_size_in_MB = 16 + optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) + + return optimizer + + +def mixed_precision_optimizer(config, optimizer): + if 'AMP' in config: + amp_cfg = config.AMP if config.AMP else dict() + scale_loss = amp_cfg.get('scale_loss', 1.0) + use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling', + False) + use_pure_fp16 = amp_cfg.get("level", "O1") == "O2" + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling, + use_pure_fp16=use_pure_fp16, + use_fp16_guard=True) + + return optimizer + + +def build(config, + main_prog, + startup_prog, + class_num=None, + step_each_epoch=100, + is_train=True, + is_distributed=True): + """ + Build a program using a model and an optimizer + 1. create feeds + 2. create a dataloader + 3. create a model + 4. create fetchs + 5. 
create an optimizer + + Args: + config(dict): config + main_prog(): main program + startup_prog(): startup program + class_num(int): the class number of network, required if use_mix + is_train(bool): train or eval + is_distributed(bool): whether to use distributed training method + + Returns: + dataloader(): a bridge between the model and the data + fetchs(dict): dict of model outputs(included loss and measures) + """ + with paddle.static.program_guard(main_prog, startup_prog): + with paddle.utils.unique_name.guard(): + mode = "Train" if is_train else "Eval" + use_mix = "batch_transform_ops" in config["DataLoader"][mode][ + "dataset"] + feeds = create_feeds( + config["Global"]["image_shape"], + use_mix, + class_num=class_num, + dtype="float32") + + # build model + # data_format should be assigned in arch-dict + input_image_channel = config["Global"]["image_shape"][ + 0] # default as [3, 224, 224] + model = build_model(config) + out = model(feeds["data"]) + # end of build model + + fetchs = create_fetchs( + out, + feeds, + config["Arch"], + epsilon=config.get('ls_epsilon'), + class_num=class_num, + use_mix=use_mix, + config=config, + mode=mode) + lr_scheduler = None + optimizer = None + if is_train: + optimizer, lr_scheduler = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], + step_each_epoch) + optimizer = mixed_precision_optimizer(config, optimizer) + if is_distributed: + optimizer = dist_optimizer(config, optimizer) + optimizer.minimize(fetchs['loss'][0]) + return fetchs, lr_scheduler, feeds, optimizer + + +def compile(config, program, loss_name=None, share_prog=None): + """ + Compile the program + + Args: + config(dict): config + program(): the program which is wrapped by + loss_name(str): loss name + share_prog(): the shared program, used for evaluation during training + + Returns: + compiled_program(): a compiled program + """ + build_strategy, exec_strategy = create_strategy(config) + + compiled_program = paddle.static.CompiledProgram( + program).with_data_parallel( + share_vars_from=share_prog, + loss_name=loss_name, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + return compiled_program + + +total_step = 0 + + +def run(dataloader, + exe, + program, + feeds, + fetchs, + epoch=0, + mode='train', + config=None, + vdl_writer=None, + lr_scheduler=None, + profiler_options=None): + """ + Feed data to the model and fetch the measures and loss + + Args: + dataloader(paddle io dataloader): + exe(): + program(): + fetchs(dict): dict of measures and the loss + epoch(int): epoch of training or evaluation + model(str): log only + + Returns: + """ + fetch_list = [f[0] for f in fetchs.values()] + metric_dict = OrderedDict([("lr", AverageMeter( + 'lr', 'f', postfix=",", need_avg=False))]) + + for k in fetchs: + metric_dict[k] = fetchs[k][1] + + metric_dict["batch_time"] = AverageMeter( + 'batch_cost', '.5f', postfix=" s,") + metric_dict["reader_time"] = AverageMeter( + 'reader_cost', '.5f', postfix=" s,") + + for m in metric_dict.values(): + m.reset() + + use_dali = config["Global"].get('use_dali', False) + tic = time.time() + + if not use_dali: + dataloader = dataloader() + + idx = 0 + batch_size = None + while True: + # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG + try: + batch = next(dataloader) + except StopIteration: + break + except RuntimeError: + logger.warning( + "Except RuntimeError when reading data from dataloader, try to read once again..." 
+ ) + continue + idx += 1 + # ignore the warmup iters + if idx == 5: + metric_dict["batch_time"].reset() + metric_dict["reader_time"].reset() + + metric_dict['reader_time'].update(time.time() - tic) + + profiler.add_profiler_step(profiler_options) + + if use_dali: + batch_size = batch[0]["data"].shape()[0] + feed_dict = batch[0] + else: + batch_size = batch[0].shape()[0] + feed_dict = { + key.name: batch[idx] + for idx, key in enumerate(feeds.values()) + } + + metrics = exe.run(program=program, + feed=feed_dict, + fetch_list=fetch_list) + + for name, m in zip(fetchs.keys(), metrics): + metric_dict[name].update(np.mean(m), batch_size) + metric_dict["batch_time"].update(time.time() - tic) + if mode == "train": + metric_dict['lr'].update(lr_scheduler.get_lr()) + + fetchs_str = ' '.join([ + str(metric_dict[key].mean) + if "time" in key else str(metric_dict[key].value) + for key in metric_dict + ]) + ips_info = " ips: {:.5f} images/sec.".format( + batch_size / metric_dict["batch_time"].avg) + fetchs_str += ips_info + + if lr_scheduler is not None: + lr_scheduler.step() + + if vdl_writer: + global total_step + logger.scaler('loss', metrics[0][0], total_step, vdl_writer) + total_step += 1 + if mode == 'eval': + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} step:{:<4d} {:s}".format(mode, idx, + fetchs_str)) + else: + epoch_str = "epoch:{:<3d}".format(epoch) + step_str = "{:s} step:{:<4d}".format(mode, idx) + + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} {:s} {:s}".format(epoch_str, step_str, + fetchs_str)) + + tic = time.time() + + end_str = ' '.join([str(m.mean) for m in metric_dict.values()] + + [metric_dict["batch_time"].total]) + ips_info = "ips: {:.5f} images/sec.".format(batch_size / + metric_dict["batch_time"].avg) + if mode == 'eval': + logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info)) + else: + end_epoch_str = "END epoch:{:<3d}".format(epoch) + logger.info("{:s} {:s} {:s} {:s}".format(end_epoch_str, mode, end_str, + ips_info)) + if use_dali: + dataloader.reset() + + # return top1_acc in order to save the best model + if mode == 'eval': + return fetchs["top1"][1].avg diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/save_load.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/save_load.py new file mode 100644 index 000000000..13badfddc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/save_load.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
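program.run above drives the compiled program purely through a name-keyed feed dict and a fetch_list, which is the standard static-graph executor pattern. A stripped-down sketch of that mechanism on a toy network; the fc layer and SGD optimizer here are placeholders, not part of this patch.

import numpy as np
import paddle
import paddle.nn.functional as F

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    data = paddle.static.data(name="data", shape=[None, 8], dtype="float32")
    label = paddle.static.data(name="label", shape=[None, 1], dtype="int64")
    logits = paddle.static.nn.fc(data, 2)
    loss = F.cross_entropy(logits, label)
    paddle.optimizer.SGD(learning_rate=0.1).minimize(loss)

exe = paddle.static.Executor()
exe.run(startup_prog)

# create_feeds() names the inputs; program.run keys the feed dict by those names
# and fetches whatever create_fetchs() registered. Same call, in miniature:
feed = {"data": np.random.randn(4, 8).astype("float32"),
        "label": np.random.randint(0, 2, size=(4, 1)).astype("int64")}
loss_val, = exe.run(main_prog, feed=feed, fetch_list=[loss])
print(loss_val)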
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import re +import shutil +import tempfile + +import paddle + +from ppcls.utils import logger + +__all__ = ['init_model', 'save_model'] + + +def _mkdir_if_not_exist(path): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def _load_state(path): + if os.path.exists(path + '.pdopt'): + # XXX another hack to ignore the optimizer state + tmp = tempfile.mkdtemp() + dst = os.path.join(tmp, os.path.basename(os.path.normpath(path))) + shutil.copy(path + '.pdparams', dst + '.pdparams') + state = paddle.static.load_program_state(dst) + shutil.rmtree(tmp) + else: + state = paddle.static.load_program_state(path) + return state + + +def load_params(exe, prog, path, ignore_params=None): + """ + Load model from the given path. + Args: + exe (fluid.Executor): The fluid.Executor object. + prog (fluid.Program): load weight to which Program object. + path (string): URL string or loca model path. + ignore_params (list): ignore variable to load when finetuning. + It can be specified by finetune_exclude_pretrained_params + and the usage can refer to the document + docs/advanced_tutorials/TRANSFER_LEARNING.md + """ + if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) + + logger.info("Loading parameters from {}...".format(path)) + + ignore_set = set() + state = _load_state(path) + + # ignore the parameter which mismatch the shape + # between the model and pretrain weight. 
+ all_var_shape = {} + for block in prog.blocks: + for param in block.all_parameters(): + all_var_shape[param.name] = param.shape + ignore_set.update([ + name for name, shape in all_var_shape.items() + if name in state and shape != state[name].shape + ]) + + if ignore_params: + all_var_names = [var.name for var in prog.list_vars()] + ignore_list = filter( + lambda var: any([re.match(name, var) for name in ignore_params]), + all_var_names) + ignore_set.update(list(ignore_list)) + + if len(ignore_set) > 0: + for k in ignore_set: + if k in state: + logger.warning( + 'variable {} is already excluded automatically'.format(k)) + del state[k] + + paddle.static.set_program_state(prog, state) + + +def init_model(config, program, exe): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints: + paddle.static.load(program, checkpoints, exe) + logger.info("Finish initing model from {}".format(checkpoints)) + return + + pretrained_model = config.get('pretrained_model') + if pretrained_model: + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + for pretrain in pretrained_model: + load_params(exe, program, pretrain) + logger.info("Finish initing model from {}".format(pretrained_model)) + + +def save_model(program, model_path, epoch_id, prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, str(epoch_id)) + _mkdir_if_not_exist(model_path) + model_prefix = os.path.join(model_path, prefix) + paddle.static.save(program, model_prefix) + logger.info("Already save model in {}".format(model_path)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/train.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/train.py new file mode 100644 index 000000000..dd16cdb4c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/static/train.py @@ -0,0 +1,212 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
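save_model and init_model above lean on paddle.static.save / paddle.static.load for full checkpoints and on load_program_state / set_program_state for the weights-only restore done in _load_state (which deliberately skips the .pdopt optimizer file). A compact sketch of those calls on a toy program; the output_demo directory is made up, while the per-epoch subdirectory and the "ppcls" prefix mirror save_model's layout.

import os
import paddle

paddle.enable_static()

prog = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(prog, startup):
    x = paddle.static.data(name="x", shape=[None, 8], dtype="float32")
    y = paddle.static.nn.fc(x, 2)

exe = paddle.static.Executor()
exe.run(startup)

# save_model(): <output_dir>/<epoch_id>/<prefix> -> prefix.pdparams / .pdopt / .pdmodel
model_prefix = os.path.join("output_demo", "0", "ppcls")
os.makedirs(os.path.dirname(model_prefix), exist_ok=True)
paddle.static.save(prog, model_prefix)

# init_model() checkpoint path: restore parameters (and optimizer state, if any).
paddle.static.load(prog, model_prefix, exe)

# load_params()/_load_state() path: parameters only, via program state dicts.
state = paddle.static.load_program_state(model_prefix)
paddle.static.set_program_state(prog, state)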
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +import paddle +from paddle.distributed import fleet +from visualdl import LogWriter + +from ppcls.data import build_dataloader +from ppcls.utils.config import get_config, print_config +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.static.save_load import init_model, save_model +from ppcls.static import program + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleClas train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/ResNet/ResNet50.yaml', + help='config file path') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + args = parser.parse_args() + return args + + +def main(args): + """ + all the config of training paradigm should be in config["Global"] + """ + config = get_config(args.config, overrides=args.override, show=False) + global_config = config["Global"] + + mode = "train" + + log_file = os.path.join(global_config['output_dir'], + config["Arch"]["name"], f"{mode}.log") + init_logger(log_file=log_file) + print_config(config) + + if global_config.get("is_distributed", True): + fleet.init(is_collective=True) + # assign the device + use_gpu = global_config.get("use_gpu", True) + # amp related config + if 'AMP' in config: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_conv_workspace_size_limit': 1500, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING) + + use_xpu = global_config.get("use_xpu", False) + use_npu = global_config.get("use_npu", False) + use_mlu = global_config.get("use_mlu", False) + assert ( + use_gpu and use_xpu and use_npu and use_mlu + ) is not True, "gpu, xpu, npu and mlu can not be true in the same time in static mode!" 
+ + if use_gpu: + device = paddle.set_device('gpu') + elif use_xpu: + device = paddle.set_device('xpu') + elif use_npu: + device = paddle.set_device('npu') + elif use_mlu: + device = paddle.set_device('mlu') + else: + device = paddle.set_device('cpu') + + # visualDL + vdl_writer = None + if global_config["use_visualdl"]: + vdl_dir = os.path.join(global_config["output_dir"], "vdl") + vdl_writer = LogWriter(vdl_dir) + + # build dataloader + eval_dataloader = None + use_dali = global_config.get('use_dali', False) + + class_num = config["Arch"].get("class_num", None) + config["DataLoader"].update({"class_num": class_num}) + train_dataloader = build_dataloader( + config["DataLoader"], "Train", device=device, use_dali=use_dali) + if global_config["eval_during_train"]: + eval_dataloader = build_dataloader( + config["DataLoader"], "Eval", device=device, use_dali=use_dali) + + step_each_epoch = len(train_dataloader) + + # startup_prog is used to do some parameter init work, + # and train prog is used to hold the network + startup_prog = paddle.static.Program() + train_prog = paddle.static.Program() + + best_top1_acc = 0.0 # best top1 acc record + + train_fetchs, lr_scheduler, train_feeds, optimizer = program.build( + config, + train_prog, + startup_prog, + class_num, + step_each_epoch=step_each_epoch, + is_train=True, + is_distributed=global_config.get("is_distributed", True)) + + if global_config["eval_during_train"]: + eval_prog = paddle.static.Program() + eval_fetchs, _, eval_feeds, _ = program.build( + config, + eval_prog, + startup_prog, + is_train=False, + is_distributed=global_config.get("is_distributed", True)) + # clone to prune some content which is irrelevant in eval_prog + eval_prog = eval_prog.clone(for_test=True) + + # create the "Executor" with the statement of which device + exe = paddle.static.Executor(device) + # Parameter initialization + exe.run(startup_prog) + # load pretrained models or checkpoints + init_model(global_config, train_prog, exe) + + if 'AMP' in config and config.AMP.get("level", "O1") == "O2": + optimizer.amp_init( + device, + scope=paddle.static.global_scope(), + test_program=eval_prog + if global_config["eval_during_train"] else None) + + if not global_config.get("is_distributed", True): + compiled_train_prog = program.compile( + config, train_prog, loss_name=train_fetchs["loss"][0].name) + else: + compiled_train_prog = train_prog + + if eval_dataloader is not None: + compiled_eval_prog = program.compile(config, eval_prog) + + for epoch_id in range(global_config["epochs"]): + # 1. train with train dataset + program.run(train_dataloader, exe, compiled_train_prog, train_feeds, + train_fetchs, epoch_id, 'train', config, vdl_writer, + lr_scheduler, args.profiler_options) + # 2. evaate with eval dataset + if global_config["eval_during_train"] and epoch_id % global_config[ + "eval_interval"] == 0: + top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog, + eval_feeds, eval_fetchs, epoch_id, "eval", + config) + if top1_acc > best_top1_acc: + best_top1_acc = top1_acc + message = "The best top1 acc {:.5f}, in epoch: {:d}".format( + best_top1_acc, epoch_id) + logger.info(message) + if epoch_id % global_config["save_interval"] == 0: + + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, "best_model") + + # 3. 
save the persistable model + if epoch_id % global_config["save_interval"] == 0: + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, epoch_id) + + +if __name__ == '__main__': + paddle.enable_static() + args = parse_args() + main(args) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/__init__.py new file mode 100644 index 000000000..632cc7882 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/__init__.py @@ -0,0 +1,27 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import misc +from . import model_zoo +from . import metrics + +from .save_load import init_model, save_model +from .config import get_config +from .misc import AverageMeter +from .metrics import multi_hot_encode +from .metrics import hamming_distance +from .metrics import accuracy_score +from .metrics import precision_recall_fscore +from .metrics import mean_average_precision diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/check.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/check.py new file mode 100644 index 000000000..bc7030818 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/check.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import paddle +from paddle import is_compiled_with_cuda + +from ppcls.arch import get_architectures +from ppcls.arch import similar_architectures +from ppcls.arch import get_blacklist_model_in_static_mode +from ppcls.utils import logger + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.8.0 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." + try: + pass + # paddle.utils.require_version('0.0.0') + except Exception: + logger.error(err) + sys.exit(1) + + +def check_gpu(): + """ + Log error and exit when using paddlepaddle cpu version. + """ + err = "You are using paddlepaddle cpu version! Please try to " \ + "install paddlepaddle-gpu to run model on GPU." 
+ + try: + assert is_compiled_with_cuda() + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_architecture(architecture): + """ + check architecture and recommend similar architectures + """ + assert isinstance(architecture, dict), \ + ("the type of architecture({}) should be dict". format(architecture)) + assert "name" in architecture, \ + ("name must be in the architecture keys, just contains: {}". format( + architecture.keys())) + + similar_names = similar_architectures(architecture["name"], + get_architectures()) + model_list = ', '.join(similar_names) + err = "Architecture [{}] is not exist! Maybe you want: [{}]" \ + "".format(architecture["name"], model_list) + try: + assert architecture["name"] in similar_names + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_model_with_running_mode(architecture): + """ + check whether the model is consistent with the operating mode + """ + # some model are not supported in the static mode + blacklist = get_blacklist_model_in_static_mode() + if not paddle.in_dynamic_mode() and architecture["name"] in blacklist: + logger.error("Model: {} is not supported in the staic mode.".format( + architecture["name"])) + sys.exit(1) + return + + +def check_mix(architecture, use_mix=False): + """ + check mix parameter + """ + err = "Cannot use mix processing in GoogLeNet, " \ + "please set use_mix = False." + try: + if architecture["name"] == "GoogLeNet": + assert use_mix is not True + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_classes_num(classes_num): + """ + check classes_num + """ + err = "classes_num({}) should be a positive integer" \ + "and larger than 1".format(classes_num) + try: + assert isinstance(classes_num, int) + assert classes_num > 1 + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_data_dir(path): + """ + check cata_dir + """ + err = "Data path is not exist, please given a right path" \ + "".format(path) + try: + assert os.isdir(path) + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_function_params(config, key): + """ + check specify config + """ + k_config = config.get(key) + assert k_config is not None, \ + ('{} is required in config'.format(key)) + + assert k_config.get('function'), \ + ('function is required {} config'.format(key)) + params = k_config.get('params') + assert params is not None, \ + ('params is required in {} config'.format(key)) + assert isinstance(params, dict), \ + ('the params in {} config should be a dict'.format(key)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/config.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/config.py new file mode 100644 index 000000000..e3277c480 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/config.py @@ -0,0 +1,210 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
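One note on check_data_dir above: it asserts os.isdir(path), but the os module has no isdir attribute (it lives in os.path), so a missing directory raises AttributeError instead of the AssertionError the except branch expects, and the error string calls .format(path) without a {} placeholder. A corrected sketch that keeps the same log-and-exit pattern:

    import os
    import sys

    from ppcls.utils import logger


    def check_data_dir(path):
        """
        check data_dir
        """
        err = "Data path {} does not exist, please provide a valid path".format(path)
        try:
            assert os.path.isdir(path)  # os.path.isdir, not os.isdir
        except AssertionError:
            logger.error(err)
            sys.exit(1)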
+ +import os +import copy +import argparse +import yaml +from ppcls.utils import logger +from ppcls.utils import check +__all__ = ['get_config'] + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + def __deepcopy__(self, content): + return copy.deepcopy(dict(self)) + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. + """ + placeholder = "-" * 60 + for k, v in sorted(d.items()): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", k, v)) + if k.isupper(): + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + logger.advertise() + print_dict(config) + + +def check_config(config): + """ + Check config + """ + check.check_version() + use_gpu = config.get('use_gpu', True) + if use_gpu: + check.check_gpu() + architecture = config.get('ARCHITECTURE') + #check.check_architecture(architecture) + use_mix = config.get('use_mix', False) + check.check_mix(architecture, use_mix) + classes_num = config.get('classes_num') + check.check_classes_num(classes_num) + mode = config.get('mode', 'train') + if mode.lower() == 'train': + check.check_function_params(config, 'LEARNING_RATE') + check.check_function_params(config, 'OPTIMIZER') + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + print('A new filed ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + 'topk=2', + 'VALID.transforms.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert 
isinstance(opt, str), ( + "option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + return config + + +def get_config(fname, overrides=None, show=False): + """ + Read config from file + """ + assert os.path.exists(fname), ( + 'config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + # check_config(config) + return config + + +def parse_args(): + parser = argparse.ArgumentParser("generic-image-rec train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/config.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + args = parser.parse_args() + return args diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/download.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/download.py new file mode 100644 index 000000000..9c4575048 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/download.py @@ -0,0 +1,319 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import os.path as osp +import shutil +import requests +import hashlib +import tarfile +import zipfile +import time +from collections import OrderedDict +from tqdm import tqdm + +from ppcls.utils import logger + +__all__ = ['get_weights_path_from_url'] + +WEIGHTS_HOME = osp.expanduser("~/.paddleclas/weights") + +DOWNLOAD_RETRY_LIMIT = 3 + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith('http://') or path.startswith('https://') + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + + Examples: + .. 
code-block:: python + + from paddle.utils.download import get_weights_path_from_url + + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def _get_unique_endpoints(trainer_endpoints): + # Sorting is to avoid different environmental variables for each card + trainer_endpoints.sort() + ips = set() + unique_endpoints = set() + for endpoint in trainer_endpoints: + ip = endpoint.split(":")[0] + if ip in ips: + continue + ips.add(ip) + unique_endpoints.add(endpoint) + logger.info("unique_endpoints {}".format(unique_endpoints)) + return unique_endpoints + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.fluid.dygraph.parallel import ParallelEnv + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different ips will download + # data, and the same ip will only download data once. + unique_endpoints = _get_unique_endpoints(ParallelEnv() + .trainer_endpoints[:]) + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if ParallelEnv().current_endpoint in unique_endpoints: + fullpath = _download(url, root_dir, md5sum) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if ParallelEnv().current_endpoint in unique_endpoints: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. " + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info( + "Downloading {} from {} failed {} times with exception {}". 
+ format(fname, url, retry_cnt + 1, str(e))) + time.sleep(1) + continue + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + for item in file_list: + files.extract(item, os.path.join(file_dir, 
rootpath)) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if '/' in file_path: + file_path = file_path.replace('/', os.sep) + elif '\\' in file_path: + file_path = file_path.replace('\\', os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/ema.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/ema.py new file mode 100644 index 000000000..b54cdb1b2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/ema.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np + + +class ExponentialMovingAverage(): + """ + Exponential Moving Average + Code was heavily based on https://github.com/Wanger-SJTU/SegToolbox.Pytorch/blob/master/lib/utils/ema.py + """ + + def __init__(self, model, decay, thres_steps=True): + self._model = model + self._decay = decay + self._thres_steps = thres_steps + self._shadow = {} + self._backup = {} + + def register(self): + self._update_step = 0 + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + self._shadow[name] = param.numpy().copy() + + def update(self): + decay = min(self._decay, (1 + self._update_step) / ( + 10 + self._update_step)) if self._thres_steps else self._decay + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + assert name in self._shadow + new_val = np.array(param.numpy().copy()) + old_val = np.array(self._shadow[name]) + new_average = decay * old_val + (1 - decay) * new_val + self._shadow[name] = new_average + self._update_step += 1 + return decay + + def apply(self): + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + assert name in self._shadow + self._backup[name] = np.array(param.numpy().copy()) + param.set_value(np.array(self._shadow[name])) + + def restore(self): + for name, param in self._model.named_parameters(): + if param.stop_gradient is False: + assert name in self._backup + param.set_value(self._backup[name]) + self._backup = {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/fm_vis.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/fm_vis.py new file mode 100644 index 000000000..a5368b10e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/fm_vis.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
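The ExponentialMovingAverage class above keeps numpy shadow copies of every trainable parameter; the intended call pattern (not exercised elsewhere in this patch) is register once, update after each optimizer step, then apply/restore around evaluation. A small dygraph sketch under that assumption, with a toy model standing in for a real backbone:

    import paddle
    from ppcls.utils.ema import ExponentialMovingAverage  # assumed import path for the class above

    model = paddle.nn.Linear(4, 2)
    opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

    ema = ExponentialMovingAverage(model, decay=0.999)
    ema.register()                       # snapshot the initial parameters

    for _ in range(3):                   # toy training steps
        loss = model(paddle.randn([8, 4])).mean()
        loss.backward()
        opt.step()
        opt.clear_grad()
        ema.update()                     # shadow = decay * shadow + (1 - decay) * param

    ema.apply()                          # swap the EMA weights in for evaluation
    # ... run evaluation here ...
    ema.restore()                        # put the raw training weights back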
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import cv2 +import utils +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../..'))) + +import paddle +from paddle.distributed import ParallelEnv + +from resnet import ResNet50 +from ppcls.utils.save_load import load_dygraph_pretrain + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--image_file", required=True, type=str) + parser.add_argument("-c", "--channel_num", type=int) + parser.add_argument("-p", "--pretrained_model", type=str) + parser.add_argument("--show", type=str2bool, default=False) + parser.add_argument("--interpolation", type=int, default=1) + parser.add_argument("--save_path", type=str, default=None) + parser.add_argument("--use_gpu", type=str2bool, default=True) + + return parser.parse_args() + + +def create_operators(interpolation=1): + size = 224 + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + img_scale = 1.0 / 255.0 + + resize_op = utils.ResizeImage( + resize_short=256, interpolation=interpolation) + crop_op = utils.CropImage(size=(size, size)) + normalize_op = utils.NormalizeImage( + scale=img_scale, mean=img_mean, std=img_std) + totensor_op = utils.ToTensor() + + return [resize_op, crop_op, normalize_op, totensor_op] + + +def preprocess(data, ops): + for op in ops: + data = op(data) + return data + + +def main(): + args = parse_args() + operators = create_operators(args.interpolation) + # assign the place + place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu' + place = paddle.set_device(place) + + net = ResNet50() + load_dygraph_pretrain(net, args.pretrained_model) + + img = cv2.imread(args.image_file, cv2.IMREAD_COLOR) + data = preprocess(img, operators) + data = np.expand_dims(data, axis=0) + data = paddle.to_tensor(data) + net.eval() + _, fm = net(data) + assert args.channel_num >= 0 and args.channel_num <= fm.shape[ + 1], "the channel is out of the range, should be in {} but got {}".format( + [0, fm.shape[1]], args.channel_num) + + fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8) + fm = cv2.resize(fm, (img.shape[1], img.shape[0])) + if args.save_path is not None: + print("the feature map is saved in path: {}".format(args.save_path)) + cv2.imwrite(args.save_path, fm) + + +if __name__ == "__main__": + main() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/resnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/resnet.py new file mode 100644 index 000000000..b75881414 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/resnet.py @@ -0,0 +1,535 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + if return_patterns is not None: + self.update_res(return_patterns) + self.register_forward_post_hook(self._return_dict_hook) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + fm = x + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = 
self.flatten(x) + x = self.fc(x) + return x, fm + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/utils.py new file mode 100644 index 000000000..7c7014932 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/feature_maps_visualization/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
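Unlike the stock backbone, the forward pass of this visualization copy of ResNet returns a (logits, feature map) pair, where the second element is the stem output that fm_vis.py later slices by channel. A short dygraph sketch of calling one of the factory functions above and checking both outputs; the shapes noted are for a standard 224x224 input:

    import paddle
    from resnet import ResNet50  # the feature_maps_visualization copy defined above

    net = ResNet50(pretrained=False, class_num=1000)
    net.eval()

    x = paddle.randn([1, 3, 224, 224])
    logits, fm = net(x)        # this variant returns (logits, stem feature map)
    print(logits.shape)        # [1, 1000]
    print(fm.shape)            # [1, 64, 112, 112], output of the 7x7, stride-2 stem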
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np + + +class DecodeImage(object): + def __init__(self, to_rgb=True): + self.to_rgb = to_rgb + + def __call__(self, img): + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + return img + + +class ResizeImage(object): + def __init__(self, resize_short=None, interpolation=1): + self.resize_short = resize_short + self.interpolation = interpolation + + def __call__(self, img): + img_h, img_w = img.shape[:2] + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + return cv2.resize(img, (w, h), interpolation=self.interpolation) + + +class CropImage(object): + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class NormalizeImage(object): + def __init__(self, scale=None, mean=None, std=None): + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, img): + return (img.astype('float32') * self.scale - self.mean) / self.std + + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = img.transpose((2, 0, 1)) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/gallery2fc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/gallery2fc.py new file mode 100644 index 000000000..67b08529e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/gallery2fc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
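The operator classes above implement the usual classification preprocessing chain: resize the short side, center-crop, normalize with ImageNet statistics, then transpose to CHW. A small sketch chaining them on a synthetic image, mirroring how create_operators/preprocess in fm_vis.py use them; it assumes the working directory is the feature_maps_visualization folder so the module imports as utils:

    import numpy as np
    from utils import ResizeImage, CropImage, NormalizeImage, ToTensor  # module defined above

    ops = [
        ResizeImage(resize_short=256, interpolation=1),
        CropImage(size=(224, 224)),
        NormalizeImage(scale=1.0 / 255.0,
                       mean=[0.485, 0.456, 0.406],
                       std=[0.229, 0.224, 0.225]),
        ToTensor(),
    ]

    img = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # stand-in for a decoded HWC image
    for op in ops:
        img = op(img)
    print(img.shape, img.dtype)  # (3, 224, 224) float32, ready for paddle.to_tensor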
+ +import os +import paddle +import cv2 + +from ppcls.arch import build_model +from ppcls.utils.config import parse_config, parse_args +from ppcls.utils.save_load import load_dygraph_pretrain +from ppcls.utils.logger import init_logger +from ppcls.data import create_operators +from ppcls.arch.slim import quantize_model + + +class GalleryLayer(paddle.nn.Layer): + def __init__(self, configs): + super().__init__() + self.configs = configs + embedding_size = self.configs["Arch"]["Head"]["embedding_size"] + self.batch_size = self.configs["IndexProcess"]["batch_size"] + self.image_shape = self.configs["Global"]["image_shape"].copy() + self.image_shape.insert(0, self.batch_size) + + image_root = self.configs["IndexProcess"]["image_root"] + data_file = self.configs["IndexProcess"]["data_file"] + delimiter = self.configs["IndexProcess"]["delimiter"] + self.gallery_images = [] + gallery_docs = [] + gallery_labels = [] + + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + for ori_line in lines: + line = ori_line.strip().split(delimiter) + text_num = len(line) + assert text_num >= 2, f"line({ori_line}) must be splitted into at least 2 parts, but got {text_num}" + image_file = os.path.join(image_root, line[0]) + + self.gallery_images.append(image_file) + gallery_docs.append(ori_line.strip()) + gallery_labels.append(line[1].strip()) + self.gallery_layer = paddle.nn.Linear(embedding_size, len(self.gallery_images), bias_attr=False) + self.gallery_layer.skip_quant = True + output_label_str = "" + for i, label_i in enumerate(gallery_labels): + output_label_str += "{} {}\n".format(i, label_i) + output_path = configs["Global"]["save_inference_dir"] + "_label.txt" + with open(output_path, "w") as f: + f.write(output_label_str) + + def forward(self, x, label=None): + x = paddle.nn.functional.normalize(x) + x = self.gallery_layer(x) + return x + + def build_gallery_layer(self, feature_extractor): + transform_configs = self.configs["IndexProcess"]["transform_ops"] + preprocess_ops = create_operators(transform_configs) + embedding_size = self.configs["Arch"]["Head"]["embedding_size"] + batch_index = 0 + input_tensor = paddle.zeros(self.image_shape) + gallery_feature = paddle.zeros((len(self.gallery_images), embedding_size)) + for i, image_path in enumerate(self.gallery_images): + image = cv2.imread(image_path)[:, :, ::-1] + for op in preprocess_ops: + image = op(image) + input_tensor[batch_index] = image + batch_index += 1 + if batch_index == self.batch_size or i == len(self.gallery_images) - 1: + batch_feature = feature_extractor(input_tensor)["features"] + for j in range(batch_index): + feature = batch_feature[j] + norm_feature = paddle.nn.functional.normalize(feature, axis=0) + gallery_feature[i - batch_index + j + 1] = norm_feature + self.gallery_layer.set_state_dict({"_layer.weight": gallery_feature.T}) + + +def export_fuse_model(configs): + slim_config = configs["Slim"].copy() + configs["Slim"] = None + fuse_model = build_model(configs) + fuse_model.head = GalleryLayer(configs) + configs["Slim"] = slim_config + quantize_model(configs, fuse_model) + load_dygraph_pretrain(fuse_model, configs["Global"]["pretrained_model"]) + fuse_model.eval() + fuse_model.head.build_gallery_layer(fuse_model) + save_path = configs["Global"]["save_inference_dir"] + fuse_model.quanter.save_quantized_model( + fuse_model, + save_path, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + configs["Global"]["image_shape"], + dtype='float32') + ]) + + +def main(): + args = parse_args() + configs = 
parse_config(args.config) + init_logger(name='gallery2fc') + export_fuse_model(configs) + + +if __name__ == '__main__': + main() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/imagenet1k_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/imagenet1k_label_list.txt new file mode 100644 index 000000000..376e18021 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/imagenet1k_label_list.txt @@ -0,0 +1,1000 @@ +0 tench, Tinca tinca +1 goldfish, Carassius auratus +2 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +3 tiger shark, Galeocerdo cuvieri +4 hammerhead, hammerhead shark +5 electric ray, crampfish, numbfish, torpedo +6 stingray +7 cock +8 hen +9 ostrich, Struthio camelus +10 brambling, Fringilla montifringilla +11 goldfinch, Carduelis carduelis +12 house finch, linnet, Carpodacus mexicanus +13 junco, snowbird +14 indigo bunting, indigo finch, indigo bird, Passerina cyanea +15 robin, American robin, Turdus migratorius +16 bulbul +17 jay +18 magpie +19 chickadee +20 water ouzel, dipper +21 kite +22 bald eagle, American eagle, Haliaeetus leucocephalus +23 vulture +24 great grey owl, great gray owl, Strix nebulosa +25 European fire salamander, Salamandra salamandra +26 common newt, Triturus vulgaris +27 eft +28 spotted salamander, Ambystoma maculatum +29 axolotl, mud puppy, Ambystoma mexicanum +30 bullfrog, Rana catesbeiana +31 tree frog, tree-frog +32 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +33 loggerhead, loggerhead turtle, Caretta caretta +34 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +35 mud turtle +36 terrapin +37 box turtle, box tortoise +38 banded gecko +39 common iguana, iguana, Iguana iguana +40 American chameleon, anole, Anolis carolinensis +41 whiptail, whiptail lizard +42 agama +43 frilled lizard, Chlamydosaurus kingi +44 alligator lizard +45 Gila monster, Heloderma suspectum +46 green lizard, Lacerta viridis +47 African chameleon, Chamaeleo chamaeleon +48 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +49 African crocodile, Nile crocodile, Crocodylus niloticus +50 American alligator, Alligator mississipiensis +51 triceratops +52 thunder snake, worm snake, Carphophis amoenus +53 ringneck snake, ring-necked snake, ring snake +54 hognose snake, puff adder, sand viper +55 green snake, grass snake +56 king snake, kingsnake +57 garter snake, grass snake +58 water snake +59 vine snake +60 night snake, Hypsiglena torquata +61 boa constrictor, Constrictor constrictor +62 rock python, rock snake, Python sebae +63 Indian cobra, Naja naja +64 green mamba +65 sea snake +66 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +67 diamondback, diamondback rattlesnake, Crotalus adamanteus +68 sidewinder, horned rattlesnake, Crotalus cerastes +69 trilobite +70 harvestman, daddy longlegs, Phalangium opilio +71 scorpion +72 black and gold garden spider, Argiope aurantia +73 barn spider, Araneus cavaticus +74 garden spider, Aranea diademata +75 black widow, Latrodectus mactans +76 tarantula +77 wolf spider, hunting spider +78 tick +79 centipede +80 black grouse +81 ptarmigan +82 ruffed grouse, partridge, Bonasa umbellus +83 prairie chicken, prairie grouse, prairie fowl +84 peacock +85 quail +86 partridge +87 African grey, African gray, Psittacus erithacus +88 macaw +89 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +90 lorikeet +91 coucal +92 bee eater +93 hornbill +94 hummingbird +95 
jacamar +96 toucan +97 drake +98 red-breasted merganser, Mergus serrator +99 goose +100 black swan, Cygnus atratus +101 tusker +102 echidna, spiny anteater, anteater +103 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +104 wallaby, brush kangaroo +105 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +106 wombat +107 jellyfish +108 sea anemone, anemone +109 brain coral +110 flatworm, platyhelminth +111 nematode, nematode worm, roundworm +112 conch +113 snail +114 slug +115 sea slug, nudibranch +116 chiton, coat-of-mail shell, sea cradle, polyplacophore +117 chambered nautilus, pearly nautilus, nautilus +118 Dungeness crab, Cancer magister +119 rock crab, Cancer irroratus +120 fiddler crab +121 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +122 American lobster, Northern lobster, Maine lobster, Homarus americanus +123 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +124 crayfish, crawfish, crawdad, crawdaddy +125 hermit crab +126 isopod +127 white stork, Ciconia ciconia +128 black stork, Ciconia nigra +129 spoonbill +130 flamingo +131 little blue heron, Egretta caerulea +132 American egret, great white heron, Egretta albus +133 bittern +134 crane +135 limpkin, Aramus pictus +136 European gallinule, Porphyrio porphyrio +137 American coot, marsh hen, mud hen, water hen, Fulica americana +138 bustard +139 ruddy turnstone, Arenaria interpres +140 red-backed sandpiper, dunlin, Erolia alpina +141 redshank, Tringa totanus +142 dowitcher +143 oystercatcher, oyster catcher +144 pelican +145 king penguin, Aptenodytes patagonica +146 albatross, mollymawk +147 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +148 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +149 dugong, Dugong dugon +150 sea lion +151 Chihuahua +152 Japanese spaniel +153 Maltese dog, Maltese terrier, Maltese +154 Pekinese, Pekingese, Peke +155 Shih-Tzu +156 Blenheim spaniel +157 papillon +158 toy terrier +159 Rhodesian ridgeback +160 Afghan hound, Afghan +161 basset, basset hound +162 beagle +163 bloodhound, sleuthhound +164 bluetick +165 black-and-tan coonhound +166 Walker hound, Walker foxhound +167 English foxhound +168 redbone +169 borzoi, Russian wolfhound +170 Irish wolfhound +171 Italian greyhound +172 whippet +173 Ibizan hound, Ibizan Podenco +174 Norwegian elkhound, elkhound +175 otterhound, otter hound +176 Saluki, gazelle hound +177 Scottish deerhound, deerhound +178 Weimaraner +179 Staffordshire bullterrier, Staffordshire bull terrier +180 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +181 Bedlington terrier +182 Border terrier +183 Kerry blue terrier +184 Irish terrier +185 Norfolk terrier +186 Norwich terrier +187 Yorkshire terrier +188 wire-haired fox terrier +189 Lakeland terrier +190 Sealyham terrier, Sealyham +191 Airedale, Airedale terrier +192 cairn, cairn terrier +193 Australian terrier +194 Dandie Dinmont, Dandie Dinmont terrier +195 Boston bull, Boston terrier +196 miniature schnauzer +197 giant schnauzer +198 standard schnauzer +199 Scotch terrier, Scottish terrier, Scottie +200 Tibetan terrier, chrysanthemum dog +201 silky terrier, Sydney silky +202 soft-coated wheaten terrier +203 West Highland white terrier +204 Lhasa, Lhasa apso +205 flat-coated retriever +206 curly-coated retriever +207 golden retriever +208 Labrador retriever +209 Chesapeake Bay retriever +210 German short-haired 
pointer +211 vizsla, Hungarian pointer +212 English setter +213 Irish setter, red setter +214 Gordon setter +215 Brittany spaniel +216 clumber, clumber spaniel +217 English springer, English springer spaniel +218 Welsh springer spaniel +219 cocker spaniel, English cocker spaniel, cocker +220 Sussex spaniel +221 Irish water spaniel +222 kuvasz +223 schipperke +224 groenendael +225 malinois +226 briard +227 kelpie +228 komondor +229 Old English sheepdog, bobtail +230 Shetland sheepdog, Shetland sheep dog, Shetland +231 collie +232 Border collie +233 Bouvier des Flandres, Bouviers des Flandres +234 Rottweiler +235 German shepherd, German shepherd dog, German police dog, alsatian +236 Doberman, Doberman pinscher +237 miniature pinscher +238 Greater Swiss Mountain dog +239 Bernese mountain dog +240 Appenzeller +241 EntleBucher +242 boxer +243 bull mastiff +244 Tibetan mastiff +245 French bulldog +246 Great Dane +247 Saint Bernard, St Bernard +248 Eskimo dog, husky +249 malamute, malemute, Alaskan malamute +250 Siberian husky +251 dalmatian, coach dog, carriage dog +252 affenpinscher, monkey pinscher, monkey dog +253 basenji +254 pug, pug-dog +255 Leonberg +256 Newfoundland, Newfoundland dog +257 Great Pyrenees +258 Samoyed, Samoyede +259 Pomeranian +260 chow, chow chow +261 keeshond +262 Brabancon griffon +263 Pembroke, Pembroke Welsh corgi +264 Cardigan, Cardigan Welsh corgi +265 toy poodle +266 miniature poodle +267 standard poodle +268 Mexican hairless +269 timber wolf, grey wolf, gray wolf, Canis lupus +270 white wolf, Arctic wolf, Canis lupus tundrarum +271 red wolf, maned wolf, Canis rufus, Canis niger +272 coyote, prairie wolf, brush wolf, Canis latrans +273 dingo, warrigal, warragal, Canis dingo +274 dhole, Cuon alpinus +275 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +276 hyena, hyaena +277 red fox, Vulpes vulpes +278 kit fox, Vulpes macrotis +279 Arctic fox, white fox, Alopex lagopus +280 grey fox, gray fox, Urocyon cinereoargenteus +281 tabby, tabby cat +282 tiger cat +283 Persian cat +284 Siamese cat, Siamese +285 Egyptian cat +286 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +287 lynx, catamount +288 leopard, Panthera pardus +289 snow leopard, ounce, Panthera uncia +290 jaguar, panther, Panthera onca, Felis onca +291 lion, king of beasts, Panthera leo +292 tiger, Panthera tigris +293 cheetah, chetah, Acinonyx jubatus +294 brown bear, bruin, Ursus arctos +295 American black bear, black bear, Ursus americanus, Euarctos americanus +296 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +297 sloth bear, Melursus ursinus, Ursus ursinus +298 mongoose +299 meerkat, mierkat +300 tiger beetle +301 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +302 ground beetle, carabid beetle +303 long-horned beetle, longicorn, longicorn beetle +304 leaf beetle, chrysomelid +305 dung beetle +306 rhinoceros beetle +307 weevil +308 fly +309 bee +310 ant, emmet, pismire +311 grasshopper, hopper +312 cricket +313 walking stick, walkingstick, stick insect +314 cockroach, roach +315 mantis, mantid +316 cicada, cicala +317 leafhopper +318 lacewing, lacewing fly +319 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +320 damselfly +321 admiral +322 ringlet, ringlet butterfly +323 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +324 cabbage butterfly +325 sulphur butterfly, sulfur butterfly +326 lycaenid, lycaenid butterfly +327 starfish, sea star +328 sea 
urchin +329 sea cucumber, holothurian +330 wood rabbit, cottontail, cottontail rabbit +331 hare +332 Angora, Angora rabbit +333 hamster +334 porcupine, hedgehog +335 fox squirrel, eastern fox squirrel, Sciurus niger +336 marmot +337 beaver +338 guinea pig, Cavia cobaya +339 sorrel +340 zebra +341 hog, pig, grunter, squealer, Sus scrofa +342 wild boar, boar, Sus scrofa +343 warthog +344 hippopotamus, hippo, river horse, Hippopotamus amphibius +345 ox +346 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +347 bison +348 ram, tup +349 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +350 ibex, Capra ibex +351 hartebeest +352 impala, Aepyceros melampus +353 gazelle +354 Arabian camel, dromedary, Camelus dromedarius +355 llama +356 weasel +357 mink +358 polecat, fitch, foulmart, foumart, Mustela putorius +359 black-footed ferret, ferret, Mustela nigripes +360 otter +361 skunk, polecat, wood pussy +362 badger +363 armadillo +364 three-toed sloth, ai, Bradypus tridactylus +365 orangutan, orang, orangutang, Pongo pygmaeus +366 gorilla, Gorilla gorilla +367 chimpanzee, chimp, Pan troglodytes +368 gibbon, Hylobates lar +369 siamang, Hylobates syndactylus, Symphalangus syndactylus +370 guenon, guenon monkey +371 patas, hussar monkey, Erythrocebus patas +372 baboon +373 macaque +374 langur +375 colobus, colobus monkey +376 proboscis monkey, Nasalis larvatus +377 marmoset +378 capuchin, ringtail, Cebus capucinus +379 howler monkey, howler +380 titi, titi monkey +381 spider monkey, Ateles geoffroyi +382 squirrel monkey, Saimiri sciureus +383 Madagascar cat, ring-tailed lemur, Lemur catta +384 indri, indris, Indri indri, Indri brevicaudatus +385 Indian elephant, Elephas maximus +386 African elephant, Loxodonta africana +387 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +388 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +389 barracouta, snoek +390 eel +391 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +392 rock beauty, Holocanthus tricolor +393 anemone fish +394 sturgeon +395 gar, garfish, garpike, billfish, Lepisosteus osseus +396 lionfish +397 puffer, pufferfish, blowfish, globefish +398 abacus +399 abaya +400 academic gown, academic robe, judge's robe +401 accordion, piano accordion, squeeze box +402 acoustic guitar +403 aircraft carrier, carrier, flattop, attack aircraft carrier +404 airliner +405 airship, dirigible +406 altar +407 ambulance +408 amphibian, amphibious vehicle +409 analog clock +410 apiary, bee house +411 apron +412 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +413 assault rifle, assault gun +414 backpack, back pack, knapsack, packsack, rucksack, haversack +415 bakery, bakeshop, bakehouse +416 balance beam, beam +417 balloon +418 ballpoint, ballpoint pen, ballpen, Biro +419 Band Aid +420 banjo +421 bannister, banister, balustrade, balusters, handrail +422 barbell +423 barber chair +424 barbershop +425 barn +426 barometer +427 barrel, cask +428 barrow, garden cart, lawn cart, wheelbarrow +429 baseball +430 basketball +431 bassinet +432 bassoon +433 bathing cap, swimming cap +434 bath towel +435 bathtub, bathing tub, bath, tub +436 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +437 beacon, lighthouse, beacon light, pharos +438 beaker +439 bearskin, busby, shako +440 beer bottle +441 beer glass +442 bell cote, bell cot +443 bib +444 bicycle-built-for-two, tandem bicycle, 
tandem +445 bikini, two-piece +446 binder, ring-binder +447 binoculars, field glasses, opera glasses +448 birdhouse +449 boathouse +450 bobsled, bobsleigh, bob +451 bolo tie, bolo, bola tie, bola +452 bonnet, poke bonnet +453 bookcase +454 bookshop, bookstore, bookstall +455 bottlecap +456 bow +457 bow tie, bow-tie, bowtie +458 brass, memorial tablet, plaque +459 brassiere, bra, bandeau +460 breakwater, groin, groyne, mole, bulwark, seawall, jetty +461 breastplate, aegis, egis +462 broom +463 bucket, pail +464 buckle +465 bulletproof vest +466 bullet train, bullet +467 butcher shop, meat market +468 cab, hack, taxi, taxicab +469 caldron, cauldron +470 candle, taper, wax light +471 cannon +472 canoe +473 can opener, tin opener +474 cardigan +475 car mirror +476 carousel, carrousel, merry-go-round, roundabout, whirligig +477 carpenter's kit, tool kit +478 carton +479 car wheel +480 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +481 cassette +482 cassette player +483 castle +484 catamaran +485 CD player +486 cello, violoncello +487 cellular telephone, cellular phone, cellphone, cell, mobile phone +488 chain +489 chainlink fence +490 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +491 chain saw, chainsaw +492 chest +493 chiffonier, commode +494 chime, bell, gong +495 china cabinet, china closet +496 Christmas stocking +497 church, church building +498 cinema, movie theater, movie theatre, movie house, picture palace +499 cleaver, meat cleaver, chopper +500 cliff dwelling +501 cloak +502 clog, geta, patten, sabot +503 cocktail shaker +504 coffee mug +505 coffeepot +506 coil, spiral, volute, whorl, helix +507 combination lock +508 computer keyboard, keypad +509 confectionery, confectionary, candy store +510 container ship, containership, container vessel +511 convertible +512 corkscrew, bottle screw +513 cornet, horn, trumpet, trump +514 cowboy boot +515 cowboy hat, ten-gallon hat +516 cradle +517 crane +518 crash helmet +519 crate +520 crib, cot +521 Crock Pot +522 croquet ball +523 crutch +524 cuirass +525 dam, dike, dyke +526 desk +527 desktop computer +528 dial telephone, dial phone +529 diaper, nappy, napkin +530 digital clock +531 digital watch +532 dining table, board +533 dishrag, dishcloth +534 dishwasher, dish washer, dishwashing machine +535 disk brake, disc brake +536 dock, dockage, docking facility +537 dogsled, dog sled, dog sleigh +538 dome +539 doormat, welcome mat +540 drilling platform, offshore rig +541 drum, membranophone, tympan +542 drumstick +543 dumbbell +544 Dutch oven +545 electric fan, blower +546 electric guitar +547 electric locomotive +548 entertainment center +549 envelope +550 espresso maker +551 face powder +552 feather boa, boa +553 file, file cabinet, filing cabinet +554 fireboat +555 fire engine, fire truck +556 fire screen, fireguard +557 flagpole, flagstaff +558 flute, transverse flute +559 folding chair +560 football helmet +561 forklift +562 fountain +563 fountain pen +564 four-poster +565 freight car +566 French horn, horn +567 frying pan, frypan, skillet +568 fur coat +569 garbage truck, dustcart +570 gasmask, respirator, gas helmet +571 gas pump, gasoline pump, petrol pump, island dispenser +572 goblet +573 go-kart +574 golf ball +575 golfcart, golf cart +576 gondola +577 gong, tam-tam +578 gown +579 grand piano, grand +580 greenhouse, nursery, glasshouse +581 grille, radiator grille +582 grocery store, grocery, food market, market +583 
guillotine +584 hair slide +585 hair spray +586 half track +587 hammer +588 hamper +589 hand blower, blow dryer, blow drier, hair dryer, hair drier +590 hand-held computer, hand-held microcomputer +591 handkerchief, hankie, hanky, hankey +592 hard disc, hard disk, fixed disk +593 harmonica, mouth organ, harp, mouth harp +594 harp +595 harvester, reaper +596 hatchet +597 holster +598 home theater, home theatre +599 honeycomb +600 hook, claw +601 hoopskirt, crinoline +602 horizontal bar, high bar +603 horse cart, horse-cart +604 hourglass +605 iPod +606 iron, smoothing iron +607 jack-o'-lantern +608 jean, blue jean, denim +609 jeep, landrover +610 jersey, T-shirt, tee shirt +611 jigsaw puzzle +612 jinrikisha, ricksha, rickshaw +613 joystick +614 kimono +615 knee pad +616 knot +617 lab coat, laboratory coat +618 ladle +619 lampshade, lamp shade +620 laptop, laptop computer +621 lawn mower, mower +622 lens cap, lens cover +623 letter opener, paper knife, paperknife +624 library +625 lifeboat +626 lighter, light, igniter, ignitor +627 limousine, limo +628 liner, ocean liner +629 lipstick, lip rouge +630 Loafer +631 lotion +632 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +633 loupe, jeweler's loupe +634 lumbermill, sawmill +635 magnetic compass +636 mailbag, postbag +637 mailbox, letter box +638 maillot +639 maillot, tank suit +640 manhole cover +641 maraca +642 marimba, xylophone +643 mask +644 matchstick +645 maypole +646 maze, labyrinth +647 measuring cup +648 medicine chest, medicine cabinet +649 megalith, megalithic structure +650 microphone, mike +651 microwave, microwave oven +652 military uniform +653 milk can +654 minibus +655 miniskirt, mini +656 minivan +657 missile +658 mitten +659 mixing bowl +660 mobile home, manufactured home +661 Model T +662 modem +663 monastery +664 monitor +665 moped +666 mortar +667 mortarboard +668 mosque +669 mosquito net +670 motor scooter, scooter +671 mountain bike, all-terrain bike, off-roader +672 mountain tent +673 mouse, computer mouse +674 mousetrap +675 moving van +676 muzzle +677 nail +678 neck brace +679 necklace +680 nipple +681 notebook, notebook computer +682 obelisk +683 oboe, hautboy, hautbois +684 ocarina, sweet potato +685 odometer, hodometer, mileometer, milometer +686 oil filter +687 organ, pipe organ +688 oscilloscope, scope, cathode-ray oscilloscope, CRO +689 overskirt +690 oxcart +691 oxygen mask +692 packet +693 paddle, boat paddle +694 paddlewheel, paddle wheel +695 padlock +696 paintbrush +697 pajama, pyjama, pj's, jammies +698 palace +699 panpipe, pandean pipe, syrinx +700 paper towel +701 parachute, chute +702 parallel bars, bars +703 park bench +704 parking meter +705 passenger car, coach, carriage +706 patio, terrace +707 pay-phone, pay-station +708 pedestal, plinth, footstall +709 pencil box, pencil case +710 pencil sharpener +711 perfume, essence +712 Petri dish +713 photocopier +714 pick, plectrum, plectron +715 pickelhaube +716 picket fence, paling +717 pickup, pickup truck +718 pier +719 piggy bank, penny bank +720 pill bottle +721 pillow +722 ping-pong ball +723 pinwheel +724 pirate, pirate ship +725 pitcher, ewer +726 plane, carpenter's plane, woodworking plane +727 planetarium +728 plastic bag +729 plate rack +730 plow, plough +731 plunger, plumber's helper +732 Polaroid camera, Polaroid Land camera +733 pole +734 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +735 poncho +736 pool table, billiard table, snooker table +737 pop bottle, soda bottle +738 pot, flowerpot 
+739 potter's wheel +740 power drill +741 prayer rug, prayer mat +742 printer +743 prison, prison house +744 projectile, missile +745 projector +746 puck, hockey puck +747 punching bag, punch bag, punching ball, punchball +748 purse +749 quill, quill pen +750 quilt, comforter, comfort, puff +751 racer, race car, racing car +752 racket, racquet +753 radiator +754 radio, wireless +755 radio telescope, radio reflector +756 rain barrel +757 recreational vehicle, RV, R.V. +758 reel +759 reflex camera +760 refrigerator, icebox +761 remote control, remote +762 restaurant, eating house, eating place, eatery +763 revolver, six-gun, six-shooter +764 rifle +765 rocking chair, rocker +766 rotisserie +767 rubber eraser, rubber, pencil eraser +768 rugby ball +769 rule, ruler +770 running shoe +771 safe +772 safety pin +773 saltshaker, salt shaker +774 sandal +775 sarong +776 sax, saxophone +777 scabbard +778 scale, weighing machine +779 school bus +780 schooner +781 scoreboard +782 screen, CRT screen +783 screw +784 screwdriver +785 seat belt, seatbelt +786 sewing machine +787 shield, buckler +788 shoe shop, shoe-shop, shoe store +789 shoji +790 shopping basket +791 shopping cart +792 shovel +793 shower cap +794 shower curtain +795 ski +796 ski mask +797 sleeping bag +798 slide rule, slipstick +799 sliding door +800 slot, one-armed bandit +801 snorkel +802 snowmobile +803 snowplow, snowplough +804 soap dispenser +805 soccer ball +806 sock +807 solar dish, solar collector, solar furnace +808 sombrero +809 soup bowl +810 space bar +811 space heater +812 space shuttle +813 spatula +814 speedboat +815 spider web, spider's web +816 spindle +817 sports car, sport car +818 spotlight, spot +819 stage +820 steam locomotive +821 steel arch bridge +822 steel drum +823 stethoscope +824 stole +825 stone wall +826 stopwatch, stop watch +827 stove +828 strainer +829 streetcar, tram, tramcar, trolley, trolley car +830 stretcher +831 studio couch, day bed +832 stupa, tope +833 submarine, pigboat, sub, U-boat +834 suit, suit of clothes +835 sundial +836 sunglass +837 sunglasses, dark glasses, shades +838 sunscreen, sunblock, sun blocker +839 suspension bridge +840 swab, swob, mop +841 sweatshirt +842 swimming trunks, bathing trunks +843 swing +844 switch, electric switch, electrical switch +845 syringe +846 table lamp +847 tank, army tank, armored combat vehicle, armoured combat vehicle +848 tape player +849 teapot +850 teddy, teddy bear +851 television, television system +852 tennis ball +853 thatch, thatched roof +854 theater curtain, theatre curtain +855 thimble +856 thresher, thrasher, threshing machine +857 throne +858 tile roof +859 toaster +860 tobacco shop, tobacconist shop, tobacconist +861 toilet seat +862 torch +863 totem pole +864 tow truck, tow car, wrecker +865 toyshop +866 tractor +867 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +868 tray +869 trench coat +870 tricycle, trike, velocipede +871 trimaran +872 tripod +873 triumphal arch +874 trolleybus, trolley coach, trackless trolley +875 trombone +876 tub, vat +877 turnstile +878 typewriter keyboard +879 umbrella +880 unicycle, monocycle +881 upright, upright piano +882 vacuum, vacuum cleaner +883 vase +884 vault +885 velvet +886 vending machine +887 vestment +888 viaduct +889 violin, fiddle +890 volleyball +891 waffle iron +892 wall clock +893 wallet, billfold, notecase, pocketbook +894 wardrobe, closet, press +895 warplane, military plane +896 washbasin, handbasin, washbowl, lavabo, wash-hand basin +897 washer, automatic 
washer, washing machine +898 water bottle +899 water jug +900 water tower +901 whiskey jug +902 whistle +903 wig +904 window screen +905 window shade +906 Windsor tie +907 wine bottle +908 wing +909 wok +910 wooden spoon +911 wool, woolen, woollen +912 worm fence, snake fence, snake-rail fence, Virginia fence +913 wreck +914 yawl +915 yurt +916 web site, website, internet site, site +917 comic book +918 crossword puzzle, crossword +919 street sign +920 traffic light, traffic signal, stoplight +921 book jacket, dust cover, dust jacket, dust wrapper +922 menu +923 plate +924 guacamole +925 consomme +926 hot pot, hotpot +927 trifle +928 ice cream, icecream +929 ice lolly, lolly, lollipop, popsicle +930 French loaf +931 bagel, beigel +932 pretzel +933 cheeseburger +934 hotdog, hot dog, red hot +935 mashed potato +936 head cabbage +937 broccoli +938 cauliflower +939 zucchini, courgette +940 spaghetti squash +941 acorn squash +942 butternut squash +943 cucumber, cuke +944 artichoke, globe artichoke +945 bell pepper +946 cardoon +947 mushroom +948 Granny Smith +949 strawberry +950 orange +951 lemon +952 fig +953 pineapple, ananas +954 banana +955 jackfruit, jak, jack +956 custard apple +957 pomegranate +958 hay +959 carbonara +960 chocolate sauce, chocolate syrup +961 dough +962 meat loaf, meatloaf +963 pizza, pizza pie +964 potpie +965 burrito +966 red wine +967 espresso +968 cup +969 eggnog +970 alp +971 bubble +972 cliff, drop, drop-off +973 coral reef +974 geyser +975 lakeside, lakeshore +976 promontory, headland, head, foreland +977 sandbar, sand bar +978 seashore, coast, seacoast, sea-coast +979 valley, vale +980 volcano +981 ballplayer, baseball player +982 groom, bridegroom +983 scuba diver +984 rapeseed +985 daisy +986 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +987 corn +988 acorn +989 hip, rose hip, rosehip +990 buckeye, horse chestnut, conker +991 coral fungus +992 agaric +993 gyromitra +994 stinkhorn, carrion fungus +995 earthstar +996 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +997 bolete +998 ear, spike, capitulum +999 toilet tissue, toilet paper, bathroom tissue diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/logger.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/logger.py new file mode 100644 index 000000000..bc8de3640 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/logger.py @@ -0,0 +1,138 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +import logging +import datetime +import paddle.distributed as dist + +_logger = None + + +def init_logger(name='ppcls', log_file=None, log_level=logging.INFO): + """Initialize and get a logger by name. + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. 
During initialization, a StreamHandler will always be + added. If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + Returns: + logging.Logger: The expected logger. + """ + global _logger + assert _logger is None, "logger should not be initialized twice or more." + _logger = logging.getLogger(name) + + formatter = logging.Formatter( + '[%(asctime)s] %(name)s %(levelname)s: %(message)s', + datefmt="%Y/%m/%d %H:%M:%S") + + stream_handler = logging.StreamHandler(stream=sys.stdout) + stream_handler.setFormatter(formatter) + _logger.addHandler(stream_handler) + if log_file is not None and dist.get_rank() == 0: + log_file_folder = os.path.split(log_file)[0] + os.makedirs(log_file_folder, exist_ok=True) + file_handler = logging.FileHandler(log_file, 'a') + file_handler.setFormatter(formatter) + _logger.addHandler(file_handler) + if dist.get_rank() == 0: + _logger.setLevel(log_level) + else: + _logger.setLevel(logging.ERROR) + _logger.propagate = False + + +def log_at_trainer0(log): + """ + logs will print multi-times when calling Fleet API. + Only display single log and ignore the others. + """ + + def wrapper(fmt, *args): + if dist.get_rank() == 0: + log(fmt, *args) + + return wrapper + + +@log_at_trainer0 +def info(fmt, *args): + _logger.info(fmt, *args) + + +@log_at_trainer0 +def debug(fmt, *args): + _logger.debug(fmt, *args) + + +@log_at_trainer0 +def warning(fmt, *args): + _logger.warning(fmt, *args) + + +@log_at_trainer0 +def error(fmt, *args): + _logger.error(fmt, *args) + + +def scaler(name, value, step, writer): + """ + This function will draw a scalar curve generated by the visualdl. + Usage: Install visualdl: pip3 install visualdl==2.0.0b4 + and then: + visualdl --logdir ./scalar --host 0.0.0.0 --port 8830 + to preview loss corve in real time. + """ + if writer is None: + return + writer.add_scalar(tag=name, step=step, value=value) + + +def advertise(): + """ + Show the advertising message like the following: + + =========================================================== + == PaddleClas is powered by PaddlePaddle ! == + =========================================================== + == == + == For more info please go to the following website. == + == == + == https://github.com/PaddlePaddle/PaddleClas == + =========================================================== + + """ + copyright = "PaddleClas is powered by PaddlePaddle !" + ad = "For more info please go to the following website." + website = "https://github.com/PaddlePaddle/PaddleClas" + AD_LEN = 6 + len(max([copyright, ad, website], key=len)) + + info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format( + "=" * (AD_LEN + 4), + "=={}==".format(copyright.center(AD_LEN)), + "=" * (AD_LEN + 4), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(ad.center(AD_LEN)), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(website.center(AD_LEN)), + "=" * (AD_LEN + 4), )) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/metrics.py new file mode 100644 index 000000000..b0db68a75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/metrics.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import precision_recall_fscore_support +from sklearn.metrics import average_precision_score +from sklearn.preprocessing import binarize + +import numpy as np + +__all__ = ["multi_hot_encode", "hamming_distance", "accuracy_score", "precision_recall_fscore", "mean_average_precision"] + + +def multi_hot_encode(logits, threshold=0.5): + """ + Encode logits to multi-hot by elementwise for multilabel + """ + + return binarize(logits, threshold=threshold) + + +def hamming_distance(output, target): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + return hamming_loss(target, output) + + +def accuracy_score(output, target, base="sample"): + """ + Hard metric for multilabel classification + Args: + output: + target: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. + Returns: + accuracy: + """ + + assert base in ["sample", "label"], 'must be one of ["sample", "label"]' + + if base == "sample": + accuracy = accuracy_metric(target, output) + elif base == "label": + mcm = multilabel_confusion_matrix(target, output) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + + accuracy = (sum(tps) + sum(tns)) / (sum(tps) + sum(tns) + sum(fns) + sum(fps)) + + return accuracy + + +def precision_recall_fscore(output, target): + """ + Metric based label for multilabel classification + Returns: + precisions: + recalls: + fscores: + """ + + precisions, recalls, fscores, _ = precision_recall_fscore_support(target, output) + + return precisions, recalls, fscores + + +def mean_average_precision(logits, target): + """ + Calculate average precision + Args: + logits: probability from network before sigmoid or softmax + target: ground truth, 0 or 1 + """ + if not (isinstance(logits, np.ndarray) and isinstance(target, np.ndarray)): + raise TypeError("logits and target should be np.ndarray.") + + aps = [] + for i in range(target.shape[1]): + ap = average_precision_score(target[:, i], logits[:, i]) + aps.append(ap) + + return np.mean(aps) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/misc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/misc.py new file mode 100644 index 000000000..08ab7b6f7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/misc.py @@ -0,0 +1,63 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
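The multilabel metrics module just added is a thin wrapper over scikit-learn. A short usage sketch, assuming the in-tree ppcls package is importable under this path (the arrays below are made up for illustration):

    import numpy as np
    from ppcls.utils import metrics

    logits = np.array([[0.9, 0.2, 0.7],
                       [0.1, 0.8, 0.4]])   # per-label scores in [0, 1]
    target = np.array([[1, 0, 1],
                       [0, 1, 1]])         # ground-truth multi-hot labels

    preds = metrics.multi_hot_encode(logits, threshold=0.5)     # binarize scores
    print(metrics.hamming_distance(preds, target))              # lower is better
    print(metrics.accuracy_score(preds, target, base="label"))  # label-based accuracy
    print(metrics.mean_average_precision(logits, target))       # needs raw scores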
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['AverageMeter'] + + +class AverageMeter(object): + """ + Computes and stores the average and current value + Code was based on https://github.com/pytorch/examples/blob/master/imagenet/main.py + """ + + def __init__(self, name='', fmt='f', postfix="", need_avg=True): + self.name = name + self.fmt = fmt + self.postfix = postfix + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}{self.postfix}'.format( + self=self) + + @property + def total_minute(self): + return '{self.name} {s:{self.fmt}}{self.postfix} min'.format( + s=self.sum / 60, self=self) + + @property + def mean(self): + return '{self.name}: {self.avg:{self.fmt}}{self.postfix}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}{self.postfix}'.format( + self=self) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/model_zoo.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/model_zoo.py new file mode 100644 index 000000000..fc527f6a1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/model_zoo.py @@ -0,0 +1,213 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import requests +import shutil +import tarfile +import tqdm +import zipfile + +from ppcls.arch import similar_architectures +from ppcls.utils import logger + +__all__ = ['get'] + +DOWNLOAD_RETRY_LIMIT = 3 + + +class UrlError(Exception): + """ UrlError + """ + + def __init__(self, url='', code=''): + message = "Downloading from {} failed with code {}!".format(url, code) + super(UrlError, self).__init__(message) + + +class ModelNameError(Exception): + """ ModelNameError + """ + + def __init__(self, message=''): + super(ModelNameError, self).__init__(message) + + +class RetryError(Exception): + """ RetryError + """ + + def __init__(self, url='', times=''): + message = "Download from {} failed. Retry({}) limit reached".format( + url, times) + super(RetryError, self).__init__(message) + + +def _get_url(architecture, postfix="pdparams"): + prefix = "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/" + fname = architecture + "_pretrained." 
+ postfix + return prefix + fname + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not os.path.exists(dst): + shutil.move(src, dst) + elif os.path.isfile(src): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = os.path.join(src, fp) + dst_fp = os.path.join(dst, fp) + if os.path.isdir(src_fp): + if os.path.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif os.path.isfile(src_fp) and \ + not os.path.isfile(dst_fp): + shutil.move(src_fp, dst_fp) + + +def _download(url, path): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + """ + if not os.path.exists(path): + os.makedirs(path) + + fname = os.path.split(url)[-1] + fullname = os.path.join(path, fname) + retry_cnt = 0 + + while not os.path.exists(fullname): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RetryError(url, DOWNLOAD_RETRY_LIMIT) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise UrlError(url, req.status_code) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + fpath = os.path.split(fname)[0] + fpath_tmp = os.path.join(fpath, 'tmp') + if os.path.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + fs = os.listdir(fpath_tmp) + assert len( + fs + ) == 1, "There should just be 1 pretrained path in an archive file but got {}.".format( + len(fs)) + + f = fs[0] + src_dir = os.path.join(fpath_tmp, f) + dst_dir = os.path.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + + return f + + +def _get_pretrained(): + with open('./ppcls/utils/pretrained.list') as flist: + pretrained = [line.strip() for line in flist] + return pretrained + + +def _check_pretrained_name(architecture): + assert isinstance(architecture, str), \ + ("the type of architecture({}) should be str". format(architecture)) + pretrained = _get_pretrained() + similar_names = similar_architectures(architecture, pretrained) + model_list = ', '.join(similar_names) + err = "{} is not exist! 
Maybe you want: [{}]" \ + "".format(architecture, model_list) + if architecture not in similar_names: + raise ModelNameError(err) + + +def list_models(): + pretrained = _get_pretrained() + msg = "All avialable pretrained models are as follows: {}".format( + pretrained) + logger.info(msg) + return + + +def get(architecture, path, decompress=False, postfix="pdparams"): + """ + Get the pretrained model. + """ + _check_pretrained_name(architecture) + url = _get_url(architecture, postfix=postfix) + fname = _download(url, path) + if postfix == "tar" and decompress: + _decompress(fname) + logger.info("download {} finished ".format(fname)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/pretrained.list b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/pretrained.list new file mode 100644 index 000000000..36d70f5a2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/pretrained.list @@ -0,0 +1,121 @@ +ResNet18 +ResNet34 +ResNet50 +ResNet101 +ResNet152 +ResNet50_vc +ResNet18_vd +ResNet34_vd +ResNet50_vd +ResNet50_vd_v2 +ResNet101_vd +ResNet152_vd +ResNet200_vd +ResNet50_vd_ssld +ResNet50_vd_ssld_v2 +Fix_ResNet50_vd_ssld_v2 +ResNet101_vd_ssld +MobileNetV3_large_x0_35 +MobileNetV3_large_x0_5 +MobileNetV3_large_x0_75 +MobileNetV3_large_x1_0 +MobileNetV3_large_x1_25 +MobileNetV3_small_x0_35 +MobileNetV3_small_x0_5 +MobileNetV3_small_x0_75 +MobileNetV3_small_x1_0 +MobileNetV3_small_x1_25 +MobileNetV3_large_x1_0_ssld +MobileNetV3_large_x1_0_ssld_int8 +MobileNetV3_small_x1_0_ssld +MobileNetV2_x0_25 +MobileNetV2_x0_5 +MobileNetV2_x0_75 +MobileNetV2 +MobileNetV2_x1_5 +MobileNetV2_x2_0 +MobileNetV2_ssld +MobileNetV1_x0_25 +MobileNetV1_x0_5 +MobileNetV1_x0_75 +MobileNetV1 +MobileNetV1_ssld +ShuffleNetV2_x0_25 +ShuffleNetV2_x0_33 +ShuffleNetV2_x0_5 +ShuffleNetV2 +ShuffleNetV2_x1_5 +ShuffleNetV2_x2_0 +ShuffleNetV2_swish +ResNeXt50_32x4d +ResNeXt50_64x4d +ResNeXt101_32x4d +ResNeXt101_64x4d +ResNeXt152_32x4d +ResNeXt152_64x4d +ResNeXt50_vd_32x4d +ResNeXt50_vd_64x4d +ResNeXt101_vd_32x4d +ResNeXt101_vd_64x4d +ResNeXt152_vd_32x4d +ResNeXt152_vd_64x4d +SE_ResNet18_vd +SE_ResNet34_vd +SE_ResNet50_vd +SE_ResNeXt50_32x4d +SE_ResNeXt101_32x4d +SE_ResNeXt50_vd_32x4d +SENet154_vd +Res2Net50_26w_4s +Res2Net50_vd_26w_4s +Res2Net50_14w_8s +Res2Net101_vd_26w_4s +Res2Net200_vd_26w_4s +GoogLeNet +InceptionV4 +Xception41 +Xception41_deeplab +Xception65 +Xception65_deeplab +Xception71 +HRNet_W18_C +HRNet_W30_C +HRNet_W32_C +HRNet_W40_C +HRNet_W44_C +HRNet_W48_C +HRNet_W64_C +DPN68 +DPN92 +DPN98 +DPN107 +DPN131 +DenseNet121 +DenseNet161 +DenseNet169 +DenseNet201 +DenseNet264 +EfficientNetB0_small +EfficientNetB0 +EfficientNetB1 +EfficientNetB2 +EfficientNetB3 +EfficientNetB4 +EfficientNetB5 +EfficientNetB6 +EfficientNetB7 +ResNeXt101_32x8d_wsl +ResNeXt101_32x16d_wsl +ResNeXt101_32x32d_wsl +ResNeXt101_32x48d_wsl +Fix_ResNeXt101_32x48d_wsl +AlexNet +SqueezeNet1_0 +SqueezeNet1_1 +VGG11 +VGG13 +VGG16 +VGG19 +DarkNet53_ImageNet1k +ResNet50_ACNet_deploy +CSPResNet50_leaky diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/profiler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/profiler.py new file mode 100644 index 000000000..7cf945a26 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/profiler.py @@ -0,0 +1,111 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
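The model_zoo helpers above resolve an architecture name against pretrained.list, download the weights from the bcebos bucket with retry protection, and optionally decompress them. A hedged sketch of the intended call pattern, assuming it is run from the repository root (so ./ppcls/utils/pretrained.list resolves) and that the ppcls logger is initialized first, since the download helpers log their progress:

    from ppcls.utils import logger, model_zoo

    logger.init_logger()                 # the download helpers call logger.info
    model_zoo.list_models()              # prints every name in pretrained.list
    # Downloads ResNet50_vd_pretrained.pdparams into ./pretrained/
    model_zoo.get("ResNet50_vd", path="./pretrained")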
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None + + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. 
+ ''' + if options_str is None: + return + + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + + if _profiler_step_id == _profiler_options['batch_range'][0]: + paddle.utils.profiler.start_profiler( + _profiler_options['state'], _profiler_options['tracer_option']) + elif _profiler_step_id == _profiler_options['batch_range'][1]: + paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'], + _profiler_options['profile_path']) + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/save_load.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/save_load.py new file mode 100644 index 000000000..625a28483 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.5/utils/save_load.py @@ -0,0 +1,136 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import re +import shutil +import tempfile + +import paddle +from ppcls.utils import logger +from .download import get_weights_path_from_url + +__all__ = ['init_model', 'save_model', 'load_dygraph_pretrain'] + + +def _mkdir_if_not_exist(path): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def load_dygraph_pretrain(model, path=None): + if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) + param_state_dict = paddle.load(path + ".pdparams") + model.set_dict(param_state_dict) + return + + +def load_dygraph_pretrain_from_url(model, pretrained_url, use_ssld=False): + if use_ssld: + pretrained_url = pretrained_url.replace("_pretrained", + "_ssld_pretrained") + local_weight_path = get_weights_path_from_url(pretrained_url).replace( + ".pdparams", "") + load_dygraph_pretrain(model, path=local_weight_path) + return + + +def load_distillation_model(model, pretrained_model): + logger.info("In distillation mode, teacher model will be " + "loaded firstly before student model.") + + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + + teacher = model.teacher if hasattr(model, + "teacher") else model._layers.teacher + student = model.student if hasattr(model, + "student") else model._layers.student + load_dygraph_pretrain(teacher, path=pretrained_model[0]) + logger.info("Finish initing teacher model from {}".format( + pretrained_model)) + # load student model + if len(pretrained_model) >= 2: + 
load_dygraph_pretrain(student, path=pretrained_model[1]) + logger.info("Finish initing student model from {}".format( + pretrained_model)) + + +def init_model(config, net, optimizer=None): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints and optimizer is not None: + assert os.path.exists(checkpoints + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(checkpoints) + assert os.path.exists(checkpoints + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(checkpoints) + para_dict = paddle.load(checkpoints + ".pdparams") + opti_dict = paddle.load(checkpoints + ".pdopt") + metric_dict = paddle.load(checkpoints + ".pdstates") + net.set_dict(para_dict) + optimizer.set_state_dict(opti_dict) + logger.info("Finish load checkpoints from {}".format(checkpoints)) + return metric_dict + + pretrained_model = config.get('pretrained_model') + use_distillation = config.get('use_distillation', False) + if pretrained_model: + if use_distillation: + load_distillation_model(net, pretrained_model) + else: # common load + load_dygraph_pretrain(net, path=pretrained_model) + logger.info( + logger.coloring("Finish load pretrained model from {}".format( + pretrained_model), "HEADER")) + + +def save_model(net, + optimizer, + metric_info, + model_path, + model_name="", + prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, model_name) + _mkdir_if_not_exist(model_path) + model_path = os.path.join(model_path, prefix) + + paddle.save(net.state_dict(), model_path + ".pdparams") + paddle.save(optimizer.state_dict(), model_path + ".pdopt") + paddle.save(metric_info, model_path + ".pdstates") + logger.info("Already save model in {}".format(model_path)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/__init__.py new file mode 100644 index 000000000..d6cdb6f8f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import optimizer + +from .arch import * +from .optimizer import * +from .data import * +from .utils import * diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/__init__.py new file mode 100644 index 000000000..798df62eb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/__init__.py @@ -0,0 +1,177 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
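A small round-trip sketch of load_dygraph_pretrain from the save_load module above; the Linear layer and the ./demo path are illustrative and assume a working paddle install plus an importable ppcls package:

    import paddle
    from ppcls.utils.save_load import load_dygraph_pretrain

    net = paddle.nn.Linear(4, 2)
    paddle.save(net.state_dict(), "./demo.pdparams")

    # The helper appends ".pdparams" itself, so pass the bare prefix.
    load_dygraph_pretrain(net, path="./demo")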
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import copy +import importlib +import paddle.nn as nn +from paddle.jit import to_static +from paddle.static import InputSpec + +from . import backbone, gears +from .backbone import * +from .gears import build_gear, add_ml_decoder_head +from .utils import * +from .backbone.base.theseus_layer import TheseusLayer +from ..utils import logger +from ..utils.save_load import load_dygraph_pretrain +from .slim import prune_model, quantize_model +from .distill.afd_attention import LinearTransformStudent, LinearTransformTeacher + +__all__ = ["build_model", "RecModel", "DistillationModel", "AttentionModel"] + + +def build_model(config, mode="train"): + arch_config = copy.deepcopy(config["Arch"]) + model_type = arch_config.pop("name") + use_sync_bn = arch_config.pop("use_sync_bn", False) + use_ml_decoder = arch_config.pop("use_ml_decoder", False) + mod = importlib.import_module(__name__) + arch = getattr(mod, model_type)(**arch_config) + if use_sync_bn: + if config["Global"]["device"] == "gpu": + arch = nn.SyncBatchNorm.convert_sync_batchnorm(arch) + else: + msg = "SyncBatchNorm can only be used on GPU device. The releated setting has been ignored." + logger.warning(msg) + + if use_ml_decoder: + add_ml_decoder_head(arch, config.get("MLDecoder", {})) + + if isinstance(arch, TheseusLayer): + prune_model(config, arch) + quantize_model(config, arch, mode) + + return arch + + +def apply_to_static(config, model, is_rec): + support_to_static = config['Global'].get('to_static', False) + + if support_to_static: + specs = None + if 'image_shape' in config['Global']: + specs = [InputSpec([None] + config['Global']['image_shape'])] + specs[0].stop_gradient = True + if is_rec: + specs.append(InputSpec([None, 1], 'int64', stop_gradient=True)) + model = to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format( + specs)) + return model + + +class RecModel(TheseusLayer): + def __init__(self, **config): + super().__init__() + backbone_config = config["Backbone"] + backbone_name = backbone_config.pop("name") + self.backbone = eval(backbone_name)(**backbone_config) + self.head_feature_from = config.get('head_feature_from', 'neck') + + if "BackboneStopLayer" in config: + backbone_stop_layer = config["BackboneStopLayer"]["name"] + self.backbone.stop_after(backbone_stop_layer) + + if "Neck" in config: + self.neck = build_gear(config["Neck"]) + else: + self.neck = None + + if "Head" in config: + self.head = build_gear(config["Head"]) + else: + self.head = None + + def forward(self, x, label=None): + + out = dict() + x = self.backbone(x) + out["backbone"] = x + if self.neck is not None: + feat = self.neck(x) + out["neck"] = feat + out["features"] = out['neck'] if self.neck else x + if self.head is not None: + if self.head_feature_from == 'backbone': + y = self.head(out['backbone'], label) + elif self.head_feature_from == 'neck': + y = self.head(out['features'], label) + out["logits"] = y + return out + + +class DistillationModel(nn.Layer): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__() + assert 
isinstance(models, list) + self.model_list = [] + self.model_name_list = [] + if pretrained_list is not None: + assert len(pretrained_list) == len(models) + + if freeze_params_list is None: + freeze_params_list = [False] * len(models) + assert len(freeze_params_list) == len(models) + for idx, model_config in enumerate(models): + assert len(model_config) == 1 + key = list(model_config.keys())[0] + model_config = model_config[key] + model_name = model_config.pop("name") + model = eval(model_name)(**model_config) + + if freeze_params_list[idx]: + for param in model.parameters(): + param.trainable = False + self.model_list.append(self.add_sublayer(key, model)) + self.model_name_list.append(key) + + if pretrained_list is not None: + for idx, pretrained in enumerate(pretrained_list): + if pretrained is not None: + load_dygraph_pretrain( + self.model_name_list[idx], path=pretrained) + + def forward(self, x, label=None): + result_dict = dict() + for idx, model_name in enumerate(self.model_name_list): + if label is None: + result_dict[model_name] = self.model_list[idx](x) + else: + result_dict[model_name] = self.model_list[idx](x, label) + return result_dict + + +class AttentionModel(DistillationModel): + def __init__(self, + models=None, + pretrained_list=None, + freeze_params_list=None, + **kargs): + super().__init__(models, pretrained_list, freeze_params_list, **kargs) + + def forward(self, x, label=None): + result_dict = dict() + out = x + for idx, model_name in enumerate(self.model_name_list): + if label is None: + out = self.model_list[idx](out) + result_dict.update(out) + else: + out = self.model_list[idx](out, label) + result_dict.update(out) + return result_dict \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/__init__.py new file mode 100644 index 000000000..f79dccdfe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/__init__.py @@ -0,0 +1,118 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
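A note on the factory above: `build_model` resolves `Arch.name` with `getattr` against this module, so every backbone imported below becomes selectable from the YAML config alone. A minimal sketch of that call, assuming the package is importable as `ppcls` (as in upstream PaddleClas; this vendored copy lives under `ppcls_2.6/`) and that no `Slim` section is present so the prune/quantize hooks stay no-ops; the dict literal is illustrative, not a shipped config:

    from ppcls.arch import build_model  # assumed import path

    config = {
        "Arch": {"name": "ResNet50", "class_num": 1000},  # "name" is resolved via getattr() above
        "Global": {"device": "gpu"},                      # only consulted when use_sync_bn is set
    }
    model = build_model(config)   # returns a paddle.nn.Layer, here a ResNet50
    print(type(model).__name__)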
+ +import sys +import inspect + +from .legendary_models.mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1 +from .legendary_models.mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25 +from .legendary_models.mobilenet_v4 import MobileNetV4_conv_small, MobileNetV4_conv_medium, MobileNetV4_conv_large, MobileNetV4_hybrid_medium, MobileNetV4_hybrid_large +from .model_zoo.fasternet import FasterNet_T0, FasterNet_T1, FasterNet_T2, FasterNet_S, FasterNet_M, FasterNet_L +from .model_zoo.starnet import StarNet_S1, StarNet_S2, StarNet_S3, StarNet_S4 +from .legendary_models.resnet import ResNet18, ResNet18_vd, ResNet34, ResNet34_vd, ResNet50, ResNet50_vd, ResNet101, ResNet101_vd, ResNet152, ResNet152_vd, ResNet200_vd +from .legendary_models.vgg import VGG11, VGG13, VGG16, VGG19 +from .legendary_models.inception_v3 import InceptionV3 +from .legendary_models.hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W60_C, HRNet_W64_C, SE_HRNet_W64_C +from .legendary_models.pp_lcnet import PPLCNetBaseNet, PPLCNet_x0_25, PPLCNet_x0_35, PPLCNet_x0_5, PPLCNet_x0_75, PPLCNet_x1_0, PPLCNet_x1_5, PPLCNet_x2_0, PPLCNet_x2_5 +from .legendary_models.pp_lcnet_v2 import PPLCNetV2_small, PPLCNetV2_base, PPLCNetV2_large +from .legendary_models.esnet import ESNet_x0_25, ESNet_x0_5, ESNet_x0_75, ESNet_x1_0 +from .legendary_models.pp_hgnet import PPHGNet_tiny, PPHGNet_small, PPHGNet_base +from .legendary_models.pp_hgnet_v2 import PPHGNetV2_B0, PPHGNetV2_B1, PPHGNetV2_B2, PPHGNetV2_B3, PPHGNetV2_B4, PPHGNetV2_B5, PPHGNetV2_B6 + +from .model_zoo.resnet_vc import ResNet50_vc +from .model_zoo.resnext import ResNeXt50_32x4d, ResNeXt50_64x4d, ResNeXt101_32x4d, ResNeXt101_64x4d, ResNeXt152_32x4d, ResNeXt152_64x4d +from .model_zoo.resnext_vd import ResNeXt50_vd_32x4d, ResNeXt50_vd_64x4d, ResNeXt101_vd_32x4d, ResNeXt101_vd_64x4d, ResNeXt152_vd_32x4d, ResNeXt152_vd_64x4d +from .model_zoo.res2net import Res2Net50_26w_4s, Res2Net50_14w_8s +from .model_zoo.res2net_vd import Res2Net50_vd_26w_4s, Res2Net101_vd_26w_4s, Res2Net200_vd_26w_4s +from .model_zoo.se_resnet_vd import SE_ResNet18_vd, SE_ResNet34_vd, SE_ResNet50_vd +from .model_zoo.se_resnext_vd import SE_ResNeXt50_vd_32x4d, SE_ResNeXt50_vd_32x4d, SENet154_vd +from .model_zoo.se_resnext import SE_ResNeXt50_32x4d, SE_ResNeXt101_32x4d, SE_ResNeXt152_64x4d +from .model_zoo.dpn import DPN68, DPN92, DPN98, DPN107, DPN131 +from .model_zoo.dsnet import DSNet_tiny, DSNet_small, DSNet_base +from .model_zoo.densenet import DenseNet121, DenseNet161, DenseNet169, DenseNet201, DenseNet264 +from .model_zoo.efficientnet import EfficientNetB0, EfficientNetB1, EfficientNetB2, EfficientNetB3, EfficientNetB4, EfficientNetB5, EfficientNetB6, EfficientNetB7, EfficientNetB0_small +from .model_zoo.efficientnet_v2 import EfficientNetV2_S +from .model_zoo.resnest import ResNeSt50_fast_1s1x64d, ResNeSt50, ResNeSt101, ResNeSt200, ResNeSt269 +from .model_zoo.googlenet import GoogLeNet +from .model_zoo.mobilenet_v2 import MobileNetV2_x0_25, MobileNetV2_x0_5, MobileNetV2_x0_75, MobileNetV2, MobileNetV2_x1_5, MobileNetV2_x2_0 +from .model_zoo.mobilefacenet import MobileFaceNet +from .model_zoo.shufflenet_v2 import ShuffleNetV2_x0_25, ShuffleNetV2_x0_33, ShuffleNetV2_x0_5, ShuffleNetV2_x1_0, ShuffleNetV2_x1_5, 
ShuffleNetV2_x2_0, ShuffleNetV2_swish +from .model_zoo.ghostnet import GhostNet_x0_5, GhostNet_x1_0, GhostNet_x1_3 +from .model_zoo.alexnet import AlexNet +from .model_zoo.inception_v4 import InceptionV4 +from .model_zoo.xception import Xception41, Xception65, Xception71 +from .model_zoo.xception_deeplab import Xception41_deeplab, Xception65_deeplab +from .model_zoo.resnext101_wsl import ResNeXt101_32x8d_wsl, ResNeXt101_32x16d_wsl, ResNeXt101_32x32d_wsl, ResNeXt101_32x48d_wsl +from .model_zoo.squeezenet import SqueezeNet1_0, SqueezeNet1_1 +from .model_zoo.darknet import DarkNet53 +from .model_zoo.regnet import RegNetX_200MF, RegNetX_400MF, RegNetX_600MF, RegNetX_800MF, RegNetX_1600MF, RegNetX_3200MF, RegNetX_4GF, RegNetX_6400MF, RegNetX_8GF, RegNetX_12GF, RegNetX_16GF, RegNetX_32GF +from .model_zoo.vision_transformer import ViT_small_patch16_224, ViT_base_patch16_224, ViT_base_patch16_384, ViT_base_patch32_384, ViT_large_patch16_224, ViT_large_patch16_384, ViT_large_patch32_384 +from .model_zoo.distilled_vision_transformer import DeiT_tiny_patch16_224, DeiT_small_patch16_224, DeiT_base_patch16_224, DeiT_tiny_distilled_patch16_224, DeiT_small_distilled_patch16_224, DeiT_base_distilled_patch16_224, DeiT_base_patch16_384, DeiT_base_distilled_patch16_384 +from .legendary_models.swin_transformer import SwinTransformer_tiny_patch4_window7_224, SwinTransformer_small_patch4_window7_224, SwinTransformer_base_patch4_window7_224, SwinTransformer_base_patch4_window12_384, SwinTransformer_large_patch4_window7_224, SwinTransformer_large_patch4_window12_384 +from .model_zoo.swin_transformer_v2 import SwinTransformerV2_tiny_patch4_window8_256, SwinTransformerV2_small_patch4_window8_256, SwinTransformerV2_base_patch4_window8_256, SwinTransformerV2_tiny_patch4_window16_256, SwinTransformerV2_small_patch4_window16_256, SwinTransformerV2_base_patch4_window16_256, SwinTransformerV2_base_patch4_window24_384, SwinTransformerV2_large_patch4_window16_256, SwinTransformerV2_large_patch4_window24_384 +from .model_zoo.cswin_transformer import CSWinTransformer_tiny_224, CSWinTransformer_small_224, CSWinTransformer_base_224, CSWinTransformer_large_224, CSWinTransformer_base_384, CSWinTransformer_large_384 +from .model_zoo.mixnet import MixNet_S, MixNet_M, MixNet_L +from .model_zoo.rexnet import ReXNet_1_0, ReXNet_1_3, ReXNet_1_5, ReXNet_2_0, ReXNet_3_0 +from .model_zoo.twins import pcpvt_small, pcpvt_base, pcpvt_large, alt_gvt_small, alt_gvt_base, alt_gvt_large +from .model_zoo.levit import LeViT_128S, LeViT_128, LeViT_192, LeViT_256, LeViT_384 +from .model_zoo.dla import DLA34, DLA46_c, DLA46x_c, DLA60, DLA60x, DLA60x_c, DLA102, DLA102x, DLA102x2, DLA169 +from .model_zoo.rednet import RedNet26, RedNet38, RedNet50, RedNet101, RedNet152 +from .model_zoo.tnt import TNT_small, TNT_base +from .model_zoo.hardnet import HarDNet68, HarDNet85, HarDNet39_ds, HarDNet68_ds +from .model_zoo.cspnet import CSPDarkNet53 +from .model_zoo.pvt_v2 import PVT_V2_B0, PVT_V2_B1, PVT_V2_B2_Linear, PVT_V2_B2, PVT_V2_B3, PVT_V2_B4, PVT_V2_B5 +from .model_zoo.mobilevit import MobileViT_XXS, MobileViT_XS, MobileViT_S +from .model_zoo.repvgg import RepVGG_A0, RepVGG_A1, RepVGG_A2, RepVGG_B0, RepVGG_B1, RepVGG_B2, RepVGG_B1g2, RepVGG_B1g4, RepVGG_B2g4, RepVGG_B3, RepVGG_B3g4, RepVGG_D2se +from .model_zoo.van import VAN_B0, VAN_B1, VAN_B2, VAN_B3 +from .model_zoo.peleenet import PeleeNet +from .model_zoo.foundation_vit import CLIP_vit_base_patch32_224, CLIP_vit_base_patch16_224, CLIP_vit_large_patch14_336, CLIP_vit_large_patch14_224, 
BEiTv2_vit_base_patch16_224, BEiTv2_vit_large_patch16_224, CAE_vit_base_patch16_224, EVA_vit_giant_patch14, MOCOV3_vit_small, MOCOV3_vit_base, MAE_vit_huge_patch14, MAE_vit_large_patch16, MAE_vit_base_patch16 +from .model_zoo.convnext import ConvNeXt_tiny, ConvNeXt_small, ConvNeXt_base_224, ConvNeXt_base_384, ConvNeXt_large_224, ConvNeXt_large_384 +from .model_zoo.nextvit import NextViT_small_224, NextViT_base_224, NextViT_large_224, NextViT_small_384, NextViT_base_384, NextViT_large_384 +from .model_zoo.cae import cae_base_patch16_224, cae_large_patch16_224 +from .model_zoo.cvt import CvT_13_224, CvT_13_384, CvT_21_224, CvT_21_384, CvT_W24_384 +from .model_zoo.micronet import MicroNet_M0, MicroNet_M1, MicroNet_M2, MicroNet_M3 +from .model_zoo.mobilenext import MobileNeXt_x0_35, MobileNeXt_x0_5, MobileNeXt_x0_75, MobileNeXt_x1_0, MobileNeXt_x1_4 +from .model_zoo.mobilevit_v2 import MobileViTV2_x0_5, MobileViTV2_x0_75, MobileViTV2_x1_0, MobileViTV2_x1_25, MobileViTV2_x1_5, MobileViTV2_x1_75, MobileViTV2_x2_0 +from .model_zoo.tinynet import TinyNet_A, TinyNet_B, TinyNet_C, TinyNet_D, TinyNet_E +from .model_zoo.mobilevit_v3 import MobileViTV3_XXS, MobileViTV3_XS, MobileViTV3_S, MobileViTV3_XXS_L2, MobileViTV3_XS_L2, MobileViTV3_S_L2, MobileViTV3_x0_5, MobileViTV3_x0_75, MobileViTV3_x1_0 +from .model_zoo.svtrnet import SVTR_tiny, SVTR_base, SVTR_large + +from .variant_models.resnet_variant import ResNet50_last_stage_stride1 +from .variant_models.resnet_variant import ResNet50_adaptive_max_pool2d +from .variant_models.resnet_variant import ResNet50_metabin +from .variant_models.vgg_variant import VGG19Sigmoid +from .variant_models.pp_lcnet_variant import PPLCNet_x2_5_Tanh +from .variant_models.pp_lcnetv2_variant import PPLCNetV2_base_ShiTu +from .variant_models.efficientnet_variant import EfficientNetB3_watermark +from .variant_models.foundation_vit_variant import CLIP_large_patch14_224_aesthetic +from .variant_models.swin_transformer_variant import SwinTransformer_tiny_patch4_window7_224_SOLIDER, SwinTransformer_small_patch4_window7_224_SOLIDER, SwinTransformer_base_patch4_window7_224_SOLIDER +from .model_zoo.adaface_ir_net import AdaFace_IR_18, AdaFace_IR_34, AdaFace_IR_50, AdaFace_IR_101, AdaFace_IR_152, AdaFace_IR_SE_50, AdaFace_IR_SE_101, AdaFace_IR_SE_152, AdaFace_IR_SE_200 +from .model_zoo.wideresnet import WideResNet +from .model_zoo.uniformer import UniFormer_small, UniFormer_small_plus, UniFormer_small_plus_dim64, UniFormer_base, UniFormer_base_ls + + +# help whl get all the models' api (class type) and components' api (func type) +def get_apis(): + current_func = sys._getframe().f_code.co_name + current_module = sys.modules[__name__] + api = [] + for _, obj in inspect.getmembers(current_module, + inspect.isclass) + inspect.getmembers( + current_module, inspect.isfunction): + api.append(obj.__name__) + api.remove(current_func) + return api + + +__all__ = get_apis() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_block.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_block.py new file mode 100644 index 000000000..f38c5c257 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_block.py @@ -0,0 +1,365 @@ +# copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2103.13425, https://github.com/DingXiaoH/DiverseBranchBlock + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from .dbb_transforms import * + + +def conv_bn(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros'): + conv_layer = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=False, + padding_mode=padding_mode) + bn_layer = nn.BatchNorm2D(num_features=out_channels) + se = nn.Sequential() + se.add_sublayer('conv', conv_layer) + se.add_sublayer('bn', bn_layer) + return se + + +class IdentityBasedConv1x1(nn.Conv2D): + def __init__(self, channels, groups=1): + super(IdentityBasedConv1x1, self).__init__( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False) + + assert channels % groups == 0 + input_dim = channels // groups + id_value = np.zeros((channels, input_dim, 1, 1)) + for i in range(channels): + id_value[i, i % input_dim, 0, 0] = 1 + self.id_tensor = paddle.to_tensor(id_value) + self.weight.set_value(paddle.zeros_like(self.weight)) + + def forward(self, input): + kernel = self.weight + self.id_tensor + result = F.conv2d( + input, + kernel, + None, + stride=1, + padding=0, + dilation=self._dilation, + groups=self._groups) + return result + + def get_actual_kernel(self): + return self.weight + self.id_tensor + + +class BNAndPad(nn.Layer): + def __init__(self, + pad_pixels, + num_features, + epsilon=1e-5, + momentum=0.1, + last_conv_bias=None, + bn=nn.BatchNorm2D): + super().__init__() + self.bn = bn(num_features, momentum=momentum, epsilon=epsilon) + self.pad_pixels = pad_pixels + self.last_conv_bias = last_conv_bias + + def forward(self, input): + output = self.bn(input) + if self.pad_pixels > 0: + bias = -self.bn._mean + if self.last_conv_bias is not None: + bias += self.last_conv_bias + pad_values = self.bn.bias + self.bn.weight * ( + bias / paddle.sqrt(self.bn._variance + self.bn._epsilon)) + ''' pad ''' + # TODO: n,h,w,c format is not supported yet + n, c, h, w = output.shape + values = pad_values.reshape([1, -1, 1, 1]) + w_values = values.expand([n, -1, self.pad_pixels, w]) + x = paddle.concat([w_values, output, w_values], axis=2) + h = h + self.pad_pixels * 2 + h_values = values.expand([n, -1, h, self.pad_pixels]) + x = paddle.concat([h_values, x, h_values], axis=3) + output = x + return output + + @property + def weight(self): + return self.bn.weight + + @property + def bias(self): + return self.bn.bias + + @property + def _mean(self): + return self.bn._mean + + @property + def _variance(self): + return self.bn._variance + + @property + def _epsilon(self): + return self.bn._epsilon + + +class DiverseBranchBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, 
+ filter_size, + stride=1, + groups=1, + act=None, + is_repped=False, + single_init=False, + **kwargs): + super().__init__() + + padding = (filter_size - 1) // 2 + dilation = 1 + + in_channels = num_channels + out_channels = num_filters + kernel_size = filter_size + internal_channels_1x1_3x3 = None + nonlinear = act + + self.is_repped = is_repped + + if nonlinear is None: + self.nonlinear = nn.Identity() + else: + self.nonlinear = nn.ReLU() + + self.kernel_size = kernel_size + self.out_channels = out_channels + self.groups = groups + assert padding == kernel_size // 2 + + if is_repped: + self.dbb_reparam = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=True) + else: + self.dbb_origin = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups) + + self.dbb_avg = nn.Sequential() + if groups < out_channels: + self.dbb_avg.add_sublayer( + 'conv', + nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False)) + self.dbb_avg.add_sublayer( + 'bn', + BNAndPad( + pad_pixels=padding, num_features=out_channels)) + self.dbb_avg.add_sublayer( + 'avg', + nn.AvgPool2D( + kernel_size=kernel_size, stride=stride, padding=0)) + self.dbb_1x1 = conv_bn( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=0, + groups=groups) + else: + self.dbb_avg.add_sublayer( + 'avg', + nn.AvgPool2D( + kernel_size=kernel_size, + stride=stride, + padding=padding)) + + self.dbb_avg.add_sublayer('avgbn', nn.BatchNorm2D(out_channels)) + + if internal_channels_1x1_3x3 is None: + internal_channels_1x1_3x3 = in_channels if groups < out_channels else 2 * in_channels # For mobilenet, it is better to have 2X internal channels + + self.dbb_1x1_kxk = nn.Sequential() + if internal_channels_1x1_3x3 == in_channels: + self.dbb_1x1_kxk.add_sublayer( + 'idconv1', + IdentityBasedConv1x1( + channels=in_channels, groups=groups)) + else: + self.dbb_1x1_kxk.add_sublayer( + 'conv1', + nn.Conv2D( + in_channels=in_channels, + out_channels=internal_channels_1x1_3x3, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + bias_attr=False)) + self.dbb_1x1_kxk.add_sublayer( + 'bn1', + BNAndPad( + pad_pixels=padding, + num_features=internal_channels_1x1_3x3)) + self.dbb_1x1_kxk.add_sublayer( + 'conv2', + nn.Conv2D( + in_channels=internal_channels_1x1_3x3, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=0, + groups=groups, + bias_attr=False)) + self.dbb_1x1_kxk.add_sublayer('bn2', nn.BatchNorm2D(out_channels)) + + # The experiments reported in the paper used the default initialization of bn.weight (all as 1). But changing the initialization may be useful in some cases. + if single_init: + # Initialize the bn.weight of dbb_origin as 1 and others as 0. This is not the default setting. 
+ self.single_init() + + def forward(self, inputs): + if self.is_repped: + return self.nonlinear(self.dbb_reparam(inputs)) + + out = self.dbb_origin(inputs) + if hasattr(self, 'dbb_1x1'): + out += self.dbb_1x1(inputs) + out += self.dbb_avg(inputs) + out += self.dbb_1x1_kxk(inputs) + return self.nonlinear(out) + + def init_gamma(self, gamma_value): + if hasattr(self, "dbb_origin"): + paddle.nn.init.constant_(self.dbb_origin.bn.weight, gamma_value) + if hasattr(self, "dbb_1x1"): + paddle.nn.init.constant_(self.dbb_1x1.bn.weight, gamma_value) + if hasattr(self, "dbb_avg"): + paddle.nn.init.constant_(self.dbb_avg.avgbn.weight, gamma_value) + if hasattr(self, "dbb_1x1_kxk"): + paddle.nn.init.constant_(self.dbb_1x1_kxk.bn2.weight, gamma_value) + + def single_init(self): + self.init_gamma(0.0) + if hasattr(self, "dbb_origin"): + paddle.nn.init.constant_(self.dbb_origin.bn.weight, 1.0) + + def get_equivalent_kernel_bias(self): + k_origin, b_origin = transI_fusebn(self.dbb_origin.conv.weight, + self.dbb_origin.bn) + + if hasattr(self, 'dbb_1x1'): + k_1x1, b_1x1 = transI_fusebn(self.dbb_1x1.conv.weight, + self.dbb_1x1.bn) + k_1x1 = transVI_multiscale(k_1x1, self.kernel_size) + else: + k_1x1, b_1x1 = 0, 0 + + if hasattr(self.dbb_1x1_kxk, 'idconv1'): + k_1x1_kxk_first = self.dbb_1x1_kxk.idconv1.get_actual_kernel() + else: + k_1x1_kxk_first = self.dbb_1x1_kxk.conv1.weight + k_1x1_kxk_first, b_1x1_kxk_first = transI_fusebn(k_1x1_kxk_first, + self.dbb_1x1_kxk.bn1) + k_1x1_kxk_second, b_1x1_kxk_second = transI_fusebn( + self.dbb_1x1_kxk.conv2.weight, self.dbb_1x1_kxk.bn2) + k_1x1_kxk_merged, b_1x1_kxk_merged = transIII_1x1_kxk( + k_1x1_kxk_first, + b_1x1_kxk_first, + k_1x1_kxk_second, + b_1x1_kxk_second, + groups=self.groups) + + k_avg = transV_avg(self.out_channels, self.kernel_size, self.groups) + k_1x1_avg_second, b_1x1_avg_second = transI_fusebn(k_avg, + self.dbb_avg.avgbn) + if hasattr(self.dbb_avg, 'conv'): + k_1x1_avg_first, b_1x1_avg_first = transI_fusebn( + self.dbb_avg.conv.weight, self.dbb_avg.bn) + k_1x1_avg_merged, b_1x1_avg_merged = transIII_1x1_kxk( + k_1x1_avg_first, + b_1x1_avg_first, + k_1x1_avg_second, + b_1x1_avg_second, + groups=self.groups) + else: + k_1x1_avg_merged, b_1x1_avg_merged = k_1x1_avg_second, b_1x1_avg_second + + return transII_addbranch( + (k_origin, k_1x1, k_1x1_kxk_merged, k_1x1_avg_merged), + (b_origin, b_1x1, b_1x1_kxk_merged, b_1x1_avg_merged)) + + def re_parameterize(self): + if self.is_repped: + return + + kernel, bias = self.get_equivalent_kernel_bias() + self.dbb_reparam = nn.Conv2D( + in_channels=self.dbb_origin.conv._in_channels, + out_channels=self.dbb_origin.conv._out_channels, + kernel_size=self.dbb_origin.conv._kernel_size, + stride=self.dbb_origin.conv._stride, + padding=self.dbb_origin.conv._padding, + dilation=self.dbb_origin.conv._dilation, + groups=self.dbb_origin.conv._groups, + bias_attr=True) + + self.dbb_reparam.weight.set_value(kernel) + self.dbb_reparam.bias.set_value(bias) + + self.__delattr__('dbb_origin') + self.__delattr__('dbb_avg') + if hasattr(self, 'dbb_1x1'): + self.__delattr__('dbb_1x1') + self.__delattr__('dbb_1x1_kxk') + self.is_repped = True diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_transforms.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_transforms.py new file mode 100644 index 000000000..70f55fb09 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/dbb/dbb_transforms.py @@ -0,0 +1,73 @@ +# copyright (c) 2023 
PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2103.13425, https://github.com/DingXiaoH/DiverseBranchBlock + +import numpy as np +import paddle +import paddle.nn.functional as F + + +def transI_fusebn(kernel, bn): + gamma = bn.weight + std = (bn._variance + bn._epsilon).sqrt() + return kernel * ( + (gamma / std).reshape([-1, 1, 1, 1])), bn.bias - bn._mean * gamma / std + + +def transII_addbranch(kernels, biases): + return sum(kernels), sum(biases) + + +def transIII_1x1_kxk(k1, b1, k2, b2, groups): + if groups == 1: + k = F.conv2d(k2, k1.transpose([1, 0, 2, 3])) + b_hat = (k2 * b1.reshape([1, -1, 1, 1])).sum((1, 2, 3)) + else: + k_slices = [] + b_slices = [] + k1_T = k1.transpose([1, 0, 2, 3]) + k1_group_width = k1.shape[0] // groups + k2_group_width = k2.shape[0] // groups + for g in range(groups): + k1_T_slice = k1_T[:, g * k1_group_width:(g + 1) * + k1_group_width, :, :] + k2_slice = k2[g * k2_group_width:(g + 1) * k2_group_width, :, :, :] + k_slices.append(F.conv2d(k2_slice, k1_T_slice)) + b_slices.append((k2_slice * b1[g * k1_group_width:( + g + 1) * k1_group_width].reshape([1, -1, 1, 1])).sum((1, 2, 3 + ))) + k, b_hat = transIV_depthconcat(k_slices, b_slices) + return k, b_hat + b2 + + +def transIV_depthconcat(kernels, biases): + return paddle.cat(kernels, axis=0), paddle.cat(biases) + + +def transV_avg(channels, kernel_size, groups): + input_dim = channels // groups + k = paddle.zeros((channels, input_dim, kernel_size, kernel_size)) + k[np.arange(channels), np.tile(np.arange(input_dim), + groups), :, :] = 1.0 / kernel_size**2 + return k + + +# This has not been tested with non-square kernels (kernel.shape[2] != kernel.shape[3]) nor even-size kernels +def transVI_multiscale(kernel, target_kernel_size): + H_pixels_to_pad = (target_kernel_size - kernel.shape[2]) // 2 + W_pixels_to_pad = (target_kernel_size - kernel.shape[3]) // 2 + return F.pad( + kernel, + [H_pixels_to_pad, H_pixels_to_pad, W_pixels_to_pad, W_pixels_to_pad]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/theseus_layer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/theseus_layer.py new file mode 100644 index 000000000..30c5a9f8b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/base/theseus_layer.py @@ -0,0 +1,398 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
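The DBB helpers above all build on the standard BN-folding identity W' = W * (gamma / sigma), b' = beta - mu * gamma / sigma, which `transI_fusebn` implements directly. A minimal numerical check of that identity, assuming an eval-mode `BatchNorm2D` (so the running statistics are used) and the upstream import path `ppcls.arch.backbone.base.dbb.dbb_transforms`; the randomized statistics just make the check non-trivial:

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F
    from ppcls.arch.backbone.base.dbb.dbb_transforms import transI_fusebn  # assumed import path

    conv = nn.Conv2D(8, 16, kernel_size=3, padding=1, bias_attr=False)
    bn = nn.BatchNorm2D(16)
    # give the BN non-trivial affine parameters and running statistics
    bn.weight.set_value(paddle.rand([16]) + 0.5)
    bn.bias.set_value(paddle.rand([16]))
    bn._mean.set_value(paddle.rand([16]))
    bn._variance.set_value(paddle.rand([16]) + 0.5)
    conv.eval(); bn.eval()

    x = paddle.rand([2, 8, 32, 32])
    ref = bn(conv(x))                          # conv followed by batch norm
    k, b = transI_fusebn(conv.weight, bn)      # folded kernel and bias
    fused = F.conv2d(x, k, bias=b, padding=1)  # a single conv reproduces the pair
    print(float((ref - fused).abs().max()))    # expected to be ~1e-6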
+ +from typing import Tuple, List, Dict, Union, Callable, Any + +from paddle import nn +from ....utils import logger + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +class TheseusLayer(nn.Layer): + def __init__(self, *args, **kwargs): + super().__init__() + self.res_dict = {} + self.res_name = self.full_name() + self.pruner = None + self.quanter = None + + self.init_net(*args, **kwargs) + + def _return_dict_hook(self, layer, input, output): + res_dict = {"logits": output} + # 'list' is needed to avoid error raised by popping self.res_dict + for res_key in list(self.res_dict): + # clear the res_dict because the forward process may change according to input + res_dict[res_key] = self.res_dict.pop(res_key) + return res_dict + + def init_net(self, + stages_pattern=None, + return_patterns=None, + return_stages=None, + freeze_befor=None, + stop_after=None, + *args, + **kwargs): + # init the output of net + if return_patterns or return_stages: + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + # call update_res function after the __init__ of the object has completed execution, that is, the contructing of layer or model has been completed. + def update_res_hook(layer, input): + self.update_res(return_patterns) + + self.register_forward_pre_hook(update_res_hook) + + # freeze subnet + if freeze_befor is not None: + self.freeze_befor(freeze_befor) + + # set subnet to Identity + if stop_after is not None: + self.stop_after(stop_after) + + def init_res(self, + stages_pattern, + return_patterns=None, + return_stages=None): + msg = "\"init_res\" will be deprecated, please use \"init_net\" instead." + logger.warning(DeprecationWarning(msg)) + + if return_patterns and return_stages: + msg = f"The 'return_patterns' would be ignored when 'return_stages' is set." + logger.warning(msg) + return_stages = None + + if return_stages is True: + return_patterns = stages_pattern + # return_stages is int or bool + if type(return_stages) is int: + return_stages = [return_stages] + if isinstance(return_stages, list): + if max(return_stages) > len(stages_pattern) or min( + return_stages) < 0: + msg = f"The 'return_stages' set error. Illegal value(s) have been ignored. The stages' pattern list is {stages_pattern}." + logger.warning(msg) + return_stages = [ + val for val in return_stages + if val >= 0 and val < len(stages_pattern) + ] + return_patterns = [stages_pattern[i] for i in return_stages] + + if return_patterns: + self.update_res(return_patterns) + + def replace_sub(self, *args, **kwargs) -> None: + msg = "The function 'replace_sub()' is deprecated, please use 'upgrade_sublayer()' instead." 
+ logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) + + def upgrade_sublayer(self, + layer_name_pattern: Union[str, List[str]], + handle_func: Callable[[nn.Layer, str], nn.Layer] + ) -> Dict[str, nn.Layer]: + """use 'handle_func' to modify the sub-layer(s) specified by 'layer_name_pattern'. + + Args: + layer_name_pattern (Union[str, List[str]]): The name of layer to be modified by 'handle_func'. + handle_func (Callable[[nn.Layer, str], nn.Layer]): The function to modify target layer specified by 'layer_name_pattern'. The formal params are the layer(nn.Layer) and pattern(str) that is (a member of) layer_name_pattern (when layer_name_pattern is List type). And the return is the layer processed. + + Returns: + Dict[str, nn.Layer]: The key is the pattern and corresponding value is the result returned by 'handle_func()'. + + Examples: + + from paddle import nn + import paddleclas + + def rep_func(layer: nn.Layer, pattern: str): + new_layer = nn.Conv2D( + in_channels=layer._in_channels, + out_channels=layer._out_channels, + kernel_size=5, + padding=2 + ) + return new_layer + + net = paddleclas.MobileNetV1() + res = net.upgrade_sublayer(layer_name_pattern=["blocks[11].depthwise_conv.conv", "blocks[12].depthwise_conv.conv"], handle_func=rep_func) + print(res) + # {'blocks[11].depthwise_conv.conv': the corresponding new_layer, 'blocks[12].depthwise_conv.conv': the corresponding new_layer} + """ + + if not isinstance(layer_name_pattern, list): + layer_name_pattern = [layer_name_pattern] + + hit_layer_pattern_list = [] + for pattern in layer_name_pattern: + # parse pattern to find target layer and its parent + layer_list = parse_pattern_str(pattern=pattern, parent_layer=self) + if not layer_list: + continue + + sub_layer_parent = layer_list[-2]["layer"] if len( + layer_list) > 1 else self + sub_layer = layer_list[-1]["layer"] + sub_layer_name = layer_list[-1]["name"] + sub_layer_index_list = layer_list[-1]["index_list"] + + new_sub_layer = handle_func(sub_layer, pattern) + + if sub_layer_index_list: + if len(sub_layer_index_list) > 1: + sub_layer_parent = getattr( + sub_layer_parent, + sub_layer_name)[sub_layer_index_list[0]] + for sub_layer_index in sub_layer_index_list[1:-1]: + sub_layer_parent = sub_layer_parent[sub_layer_index] + sub_layer_parent[sub_layer_index_list[-1]] = new_sub_layer + else: + getattr(sub_layer_parent, sub_layer_name)[ + sub_layer_index_list[0]] = new_sub_layer + else: + setattr(sub_layer_parent, sub_layer_name, new_sub_layer) + + hit_layer_pattern_list.append(pattern) + return hit_layer_pattern_list + + def stop_after(self, stop_layer_name: str) -> bool: + """stop forward and backward after 'stop_layer_name'. + + Args: + stop_layer_name (str): The name of layer that stop forward and backward after this layer. + + Returns: + bool: 'True' if successful, 'False' otherwise. + """ + + layer_list = parse_pattern_str(stop_layer_name, self) + if not layer_list: + return False + + parent_layer = self + for layer_dict in layer_list: + name, index_list = layer_dict["name"], layer_dict["index_list"] + if not set_identity(parent_layer, name, index_list): + msg = f"Failed to set the layers that after stop_layer_name('{stop_layer_name}') to IdentityLayer. The error layer's name is '{name}'." + logger.warning(msg) + return False + parent_layer = layer_dict["layer"] + + return True + + def freeze_befor(self, layer_name: str) -> bool: + """freeze the layer named layer_name and its previous layer. + + Args: + layer_name (str): The name of layer that would be freezed. 
+ + Returns: + bool: 'True' if successful, 'False' otherwise. + """ + + def stop_grad(layer, pattern): + class StopGradLayer(nn.Layer): + def __init__(self): + super().__init__() + self.layer = layer + + def forward(self, x): + x = self.layer(x) + x.stop_gradient = True + return x + + new_layer = StopGradLayer() + return new_layer + + res = self.upgrade_sublayer(layer_name, stop_grad) + if len(res) == 0: + msg = "Failed to stop the gradient befor the layer named '{layer_name}'" + logger.warning(msg) + return False + return True + + def update_res( + self, + return_patterns: Union[str, List[str]]) -> Dict[str, nn.Layer]: + """update the result(s) to be returned. + + Args: + return_patterns (Union[str, List[str]]): The name of layer to return output. + + Returns: + Dict[str, nn.Layer]: The pattern(str) and corresponding layer(nn.Layer) that have been set successfully. + """ + + # clear res_dict that could have been set + self.res_dict = {} + + class Handler(object): + def __init__(self, res_dict): + # res_dict is a reference + self.res_dict = res_dict + + def __call__(self, layer, pattern): + layer.res_dict = self.res_dict + layer.res_name = pattern + if hasattr(layer, "hook_remove_helper"): + layer.hook_remove_helper.remove() + layer.hook_remove_helper = layer.register_forward_post_hook( + save_sub_res_hook) + return layer + + handle_func = Handler(self.res_dict) + + hit_layer_pattern_list = self.upgrade_sublayer( + return_patterns, handle_func=handle_func) + + if hasattr(self, "hook_remove_helper"): + self.hook_remove_helper.remove() + self.hook_remove_helper = self.register_forward_post_hook( + self._return_dict_hook) + + return hit_layer_pattern_list + + +def save_sub_res_hook(layer, input, output): + layer.res_dict[layer.res_name] = output + + +def set_identity(parent_layer: nn.Layer, + layer_name: str, + layer_index_list: str=None) -> bool: + """set the layer specified by layer_name and layer_index_list to Indentity. + + Args: + parent_layer (nn.Layer): The parent layer of target layer specified by layer_name and layer_index_list. + layer_name (str): The name of target layer to be set to Indentity. + layer_index_list (str, optional): The index of target layer to be set to Indentity in parent_layer. Defaults to None. + + Returns: + bool: True if successfully, False otherwise. + """ + + stop_after = False + for sub_layer_name in parent_layer._sub_layers: + if stop_after: + parent_layer._sub_layers[sub_layer_name] = Identity() + continue + if sub_layer_name == layer_name: + stop_after = True + + if layer_index_list and stop_after: + layer_container = parent_layer._sub_layers[layer_name] + for num, layer_index in enumerate(layer_index_list): + stop_after = False + for i in range(num): + layer_container = layer_container[layer_index_list[i]] + for sub_layer_index in layer_container._sub_layers: + if stop_after: + parent_layer._sub_layers[layer_name][ + sub_layer_index] = Identity() + continue + if layer_index == sub_layer_index: + stop_after = True + + return stop_after + + +def parse_pattern_str(pattern: str, parent_layer: nn.Layer) -> Union[ + None, List[Dict[str, Union[nn.Layer, str, None]]]]: + """parse the string type pattern. + + Args: + pattern (str): The pattern to discribe layer. + parent_layer (nn.Layer): The root layer relative to the pattern. + + Returns: + Union[None, List[Dict[str, Union[nn.Layer, str, None]]]]: None if failed. 
If successfully, the members are layers parsed in order: + [ + {"layer": first layer, "name": first layer's name parsed, "index": first layer's index parsed if exist}, + {"layer": second layer, "name": second layer's name parsed, "index": second layer's index parsed if exist}, + ... + ] + """ + + pattern_list = pattern.split(".") + if not pattern_list: + msg = f"The pattern('{pattern}') is illegal. Please check and retry." + logger.warning(msg) + return None + + layer_list = [] + while len(pattern_list) > 0: + if '[' in pattern_list[0]: + target_layer_name = pattern_list[0].split('[')[0] + target_layer_index_list = list( + index.split(']')[0] + for index in pattern_list[0].split('[')[1:]) + else: + target_layer_name = pattern_list[0] + target_layer_index_list = None + + target_layer = getattr(parent_layer, target_layer_name, None) + + if target_layer is None: + msg = f"Not found layer named('{target_layer_name}') specifed in pattern('{pattern}')." + logger.warning(msg) + return None + + if target_layer_index_list: + for target_layer_index in target_layer_index_list: + if int(target_layer_index) < 0 or int( + target_layer_index) >= len(target_layer): + msg = f"Not found layer by index('{target_layer_index}') specifed in pattern('{pattern}'). The index should < {len(target_layer)} and > 0." + logger.warning(msg) + return None + target_layer = target_layer[target_layer_index] + + layer_list.append({ + "layer": target_layer, + "name": target_layer_name, + "index_list": target_layer_index_list + }) + + pattern_list = pattern_list[1:] + parent_layer = target_layer + + return layer_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/__init__.py new file mode 100644 index 000000000..4a4d48e71 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/__init__.py @@ -0,0 +1,8 @@ +from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd +from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C +from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1 +from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25 +from .mobilenet_v4 import MobileNetV4_conv_small, MobileNetV4_conv_medium, MobileNetV4_conv_large, MobileNetV4_hybrid_medium, MobileNetV4_hybrid_large +from .inception_v3 import InceptionV3 +from .vgg import VGG11, VGG13, VGG16, VGG19 +from .pp_lcnet import PPLCNetBaseNet, PPLCNet_x0_25, PPLCNet_x0_35, PPLCNet_x0_5, PPLCNet_x0_75, PPLCNet_x1_0, PPLCNet_x1_5, PPLCNet_x2_0, PPLCNet_x2_5 diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/custom_devices_layers.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/custom_devices_layers.py new file mode 100644 index 000000000..547646b96 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/custom_devices_layers.py @@ -0,0 +1,37 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["AdaptiveAvgPool2D"] + + +class 
AdaptiveAvgPool2D(nn.AdaptiveAvgPool2D): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if paddle.device.get_device().startswith("npu"): + self.device = "npu" + else: + self.device = None + + if isinstance(self._output_size, int) and self._output_size == 1: + self._gap = True + elif isinstance(self._output_size, tuple) and self._output_size[ + 0] == 1 and self._output_size[1] == 1: + self._gap = True + else: + self._gap = False + + def forward(self, x): + if self.device == "npu" and self._gap: + # Global Average Pooling + N, C, _, _ = x.shape + x_mean = paddle.mean(x, axis=[2, 3]) + x_mean = paddle.reshape(x_mean, [N, C, 1, 1]) + return x_mean + else: + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name, ) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/esnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/esnet.py new file mode 100644 index 000000000..8d2872eae --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/esnet.py @@ -0,0 +1,369 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
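The NPU branch in `AdaptiveAvgPool2D` above swaps global average pooling for a plain mean over the spatial axes, which is exactly what `adaptive_avg_pool2d` computes when `output_size == 1`. A small sketch of that equivalence (shapes are arbitrary; `keepdim` stands in for the explicit reshape used above):

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([2, 4, 7, 7])
    gap = paddle.mean(x, axis=[2, 3], keepdim=True)  # the mean-based fast path
    ref = F.adaptive_avg_pool2d(x, output_size=1)    # the generic pooling operator
    print(float((gap - ref).abs().max()))            # expected to be ~0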
+ +from __future__ import absolute_import, division, print_function +import math +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D +from paddle.nn.initializer import KaimingNormal +from paddle.regularizer import L2Decay + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ESNet_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_25_pretrained.pdparams", + "ESNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_5_pretrained.pdparams", + "ESNet_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x0_75_pretrained.pdparams", + "ESNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ESNet_x1_0_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = {"ESNet": ["blocks[2]", "blocks[9]", "blocks[12]"]} + +__all__ = list(MODEL_URLS.keys()) + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + channels_per_group = num_channels // groups + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + if_act=True): + super().__init__() + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm( + out_channels, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.hardswish = nn.Hardswish() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.hardswish(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class ESBlock1(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + self.pw_1_1 = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + self.dw_1 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=1, + groups=out_channels // 2, + if_act=False) + self.se = SEModule(out_channels) + + self.pw_1_2 = ConvBNLayer( + 
in_channels=out_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + + def forward(self, x): + x1, x2 = split( + x, num_or_sections=[x.shape[1] // 2, x.shape[1] // 2], axis=1) + x2 = self.pw_1_1(x2) + x3 = self.dw_1(x2) + x3 = concat([x2, x3], axis=1) + x3 = self.se(x3) + x3 = self.pw_1_2(x3) + x = concat([x1, x3], axis=1) + return channel_shuffle(x, 2) + + +class ESBlock2(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + + # branch1 + self.dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + if_act=False) + self.pw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1) + # branch2 + self.pw_2_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1) + self.dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=2, + groups=out_channels // 2, + if_act=False) + self.se = SEModule(out_channels // 2) + self.pw_2_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1) + self.concat_dw = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + groups=out_channels) + self.concat_pw = ConvBNLayer( + in_channels=out_channels, out_channels=out_channels, kernel_size=1) + + def forward(self, x): + x1 = self.dw_1(x) + x1 = self.pw_1(x1) + x2 = self.pw_2_1(x) + x2 = self.dw_2(x2) + x2 = self.se(x2) + x2 = self.pw_2_2(x2) + x = concat([x1, x2], axis=1) + x = self.concat_dw(x) + x = self.concat_pw(x) + return x + + +class ESNet(TheseusLayer): + def __init__(self, + stages_pattern, + class_num=1000, + scale=1.0, + dropout_prob=0.2, + class_expand=1280, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + self.class_num = class_num + self.class_expand = class_expand + stage_repeats = [3, 7, 3] + stage_out_channels = [ + -1, 24, make_divisible(116 * scale), make_divisible(232 * scale), + make_divisible(464 * scale), 1024 + ] + + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2) + self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = ESBlock2( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2]) + else: + block = ESBlock1( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2]) + block_list.append(block) + self.blocks = nn.Sequential(*block_list) + + self.conv2 = ConvBNLayer( + in_channels=stage_out_channels[-2], + out_channels=stage_out_channels[-1], + kernel_size=1) + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=stage_out_channels[-1], + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = Linear(self.class_expand, self.class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv1(x) + x = self.max_pool(x) + x = self.blocks(x) + x = self.conv2(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = 
self.hardswish(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ESNet_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_25` model depends on args. + """ + model = ESNet( + scale=0.25, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_25"], use_ssld) + return model + + +def ESNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_5` model depends on args. + """ + model = ESNet( + scale=0.5, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_5"], use_ssld) + return model + + +def ESNet_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x0_75` model depends on args. + """ + model = ESNet( + scale=0.75, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x0_75"], use_ssld) + return model + + +def ESNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + ESNet_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ESNet_x1_0` model depends on args. + """ + model = ESNet( + scale=1.0, stages_pattern=MODEL_STAGES_PATTERN["ESNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ESNet_x1_0"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/hrnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/hrnet.py new file mode 100644 index 000000000..fd6d32557 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/hrnet.py @@ -0,0 +1,797 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1908.07919 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +from paddle import nn +from paddle import ParamAttr +from paddle.nn.functional import upsample +from paddle.nn.initializer import Uniform + +from ..base.theseus_layer import TheseusLayer, Identity +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "HRNet_W18_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W18_C_pretrained.pdparams", + "HRNet_W30_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W30_C_pretrained.pdparams", + "HRNet_W32_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W32_C_pretrained.pdparams", + "HRNet_W40_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W40_C_pretrained.pdparams", + "HRNet_W44_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W44_C_pretrained.pdparams", + "HRNet_W48_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W48_C_pretrained.pdparams", + "HRNet_W64_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W64_C_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = {"HRNet": ["st4"]} + +__all__ = list(MODEL_URLS.keys()) + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return Identity() + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act="relu"): + super().__init__() + + self.conv = nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False) + self.bn = nn.BatchNorm(num_filters, act=None) + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + has_se, + stride=1, + downsample=False): + super().__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu") + self.conv3 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if self.downsample: + self.conv_down = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + act=None) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16) + self.relu = nn.ReLU() + + def forward(self, x, 
res_dict=None): + residual = x + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + if self.downsample: + residual = self.conv_down(residual) + if self.has_se: + x = self.se(x) + x = paddle.add(x=residual, y=x) + x = self.relu(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, num_channels, num_filters, has_se=False): + super().__init__() + + self.has_se = has_se + + self.conv1 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=1, + act="relu") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=1, + act=None) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + x = self.conv1(x) + x = self.conv2(x) + + if self.has_se: + x = self.se(x) + + x = paddle.add(x=residual, y=x) + x = self.relu(x) + return x + + +class SELayer(TheseusLayer): + def __init__(self, num_channels, num_filters, reduction_ratio): + super().__init__() + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.fc_squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.fc_excitation = nn.Linear( + med_ch, + num_filters, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + self.sigmoid = nn.Sigmoid() + + def forward(self, x, res_dict=None): + residual = x + x = self.avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self.fc_squeeze(x) + x = self.relu(x) + x = self.fc_excitation(x) + x = self.sigmoid(x) + x = paddle.unsqueeze(x, axis=[2, 3]) + x = residual * x + return x + + +class Stage(TheseusLayer): + def __init__(self, num_modules, num_filters, has_se=False): + super().__init__() + + self._num_modules = num_modules + + self.stage_func_list = nn.LayerList() + for i in range(num_modules): + self.stage_func_list.append( + HighResolutionModule( + num_filters=num_filters, has_se=has_se)) + + def forward(self, x, res_dict=None): + x = x + for idx in range(self._num_modules): + x = self.stage_func_list[idx](x) + return x + + +class HighResolutionModule(TheseusLayer): + def __init__(self, num_filters, has_se=False): + super().__init__() + + self.basic_block_list = nn.LayerList() + + for i in range(len(num_filters)): + self.basic_block_list.append( + nn.Sequential(* [ + BasicBlock( + num_channels=num_filters[i], + num_filters=num_filters[i], + has_se=has_se) for j in range(4) + ])) + + self.fuse_func = FuseLayers( + in_channels=num_filters, out_channels=num_filters) + + def forward(self, x, res_dict=None): + out = [] + for idx, xi in enumerate(x): + basic_block_list = self.basic_block_list[idx] + for basic_block_func in basic_block_list: + xi = basic_block_func(xi) + out.append(xi) + out = self.fuse_func(out) + return out + + +class FuseLayers(TheseusLayer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self._actual_ch = len(in_channels) + self._in_channels = in_channels + + self.residual_func_list = nn.LayerList() + self.relu = nn.ReLU() + for i in range(len(in_channels)): + for j in range(len(in_channels)): + if j > i: + self.residual_func_list.append( + ConvBNLayer( + num_channels=in_channels[j], + num_filters=out_channels[i], + filter_size=1, + stride=1, + act=None)) + elif j < i: + 
pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + self.residual_func_list.append( + ConvBNLayer( + num_channels=pre_num_filters, + num_filters=out_channels[i], + filter_size=3, + stride=2, + act=None)) + pre_num_filters = out_channels[i] + else: + self.residual_func_list.append( + ConvBNLayer( + num_channels=pre_num_filters, + num_filters=out_channels[j], + filter_size=3, + stride=2, + act="relu")) + pre_num_filters = out_channels[j] + + def forward(self, x, res_dict=None): + out = [] + residual_func_idx = 0 + for i in range(len(self._in_channels)): + residual = x[i] + for j in range(len(self._in_channels)): + if j > i: + xj = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + xj = upsample(xj, scale_factor=2**(j - i), mode="nearest") + residual = paddle.add(x=residual, y=xj) + elif j < i: + xj = x[j] + for k in range(i - j): + xj = self.residual_func_list[residual_func_idx](xj) + residual_func_idx += 1 + + residual = paddle.add(x=residual, y=xj) + + residual = self.relu(residual) + out.append(residual) + + return out + + +class LastClsOut(TheseusLayer): + def __init__(self, + num_channel_list, + has_se, + num_filters_list=[32, 64, 128, 256]): + super().__init__() + + self.func_list = nn.LayerList() + for idx in range(len(num_channel_list)): + self.func_list.append( + BottleneckBlock( + num_channels=num_channel_list[idx], + num_filters=num_filters_list[idx], + has_se=has_se, + downsample=True)) + + def forward(self, x, res_dict=None): + out = [] + for idx, xi in enumerate(x): + xi = self.func_list[idx](xi) + out.append(xi) + return out + + +class HRNet(TheseusLayer): + """ + HRNet + Args: + width: int=18. Base channel number of HRNet. + has_se: bool=False. If 'True', add se module to HRNet. + class_num: int=1000. Output num of last fc layer. + Returns: + model: nn.Layer. Specific HRNet model depends on args. 
+ """ + + def __init__(self, + stages_pattern, + width=18, + has_se=False, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.width = width + self.has_se = has_se + self._class_num = class_num + + channels_2 = [self.width, self.width * 2] + channels_3 = [self.width, self.width * 2, self.width * 4] + channels_4 = [ + self.width, self.width * 2, self.width * 4, self.width * 8 + ] + + self.conv_layer1_1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act="relu") + + self.conv_layer1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=2, + act="relu") + + self.layer1 = nn.Sequential(* [ + BottleneckBlock( + num_channels=64 if i == 0 else 256, + num_filters=64, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False) for i in range(4) + ]) + + self.conv_tr1_1 = ConvBNLayer( + num_channels=256, num_filters=width, filter_size=3) + self.conv_tr1_2 = ConvBNLayer( + num_channels=256, num_filters=width * 2, filter_size=3, stride=2) + + self.st2 = Stage( + num_modules=1, num_filters=channels_2, has_se=self.has_se) + + self.conv_tr2 = ConvBNLayer( + num_channels=width * 2, + num_filters=width * 4, + filter_size=3, + stride=2) + self.st3 = Stage( + num_modules=4, num_filters=channels_3, has_se=self.has_se) + + self.conv_tr3 = ConvBNLayer( + num_channels=width * 4, + num_filters=width * 8, + filter_size=3, + stride=2) + + self.st4 = Stage( + num_modules=3, num_filters=channels_4, has_se=self.has_se) + + # classification + num_filters_list = [32, 64, 128, 256] + self.last_cls = LastClsOut( + num_channel_list=channels_4, + has_se=self.has_se, + num_filters_list=num_filters_list) + + last_num_filters = [256, 512, 1024] + self.cls_head_conv_list = nn.LayerList() + for idx in range(3): + self.cls_head_conv_list.append( + ConvBNLayer( + num_channels=num_filters_list[idx] * 4, + num_filters=last_num_filters[idx], + filter_size=3, + stride=2)) + + self.conv_last = ConvBNLayer( + num_channels=1024, num_filters=2048, filter_size=1, stride=1) + + self.avg_pool = nn.AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(2048 * 1.0) + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + self.fc = nn.Linear( + 2048, + class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv_layer1_1(x) + x = self.conv_layer1_2(x) + + x = self.layer1(x) + + tr1_1 = self.conv_tr1_1(x) + tr1_2 = self.conv_tr1_2(x) + x = self.st2([tr1_1, tr1_2]) + + tr2 = self.conv_tr2(x[-1]) + x.append(tr2) + x = self.st3(x) + + tr3 = self.conv_tr3(x[-1]) + x.append(tr3) + x = self.st4(x) + + x = self.last_cls(x) + + y = x[0] + for idx in range(3): + y = paddle.add(x[idx + 1], self.cls_head_conv_list[idx](y)) + + y = self.conv_last(y) + y = self.avg_pool(y) + y = self.flatten(y) + y = self.fc(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W18_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. 
+ If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W18_C` model depends on args. + """ + model = HRNet( + width=18, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W18_C"], use_ssld) + return model + + +def HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W30_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W30_C` model depends on args. + """ + model = HRNet( + width=30, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W30_C"], use_ssld) + return model + + +def HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W32_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W32_C` model depends on args. + """ + model = HRNet( + width=32, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W32_C"], use_ssld) + return model + + +def HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W40_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W40_C` model depends on args. + """ + model = HRNet( + width=40, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W40_C"], use_ssld) + return model + + +def HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W44_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W44_C` model depends on args. + """ + model = HRNet( + width=44, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W44_C"], use_ssld) + return model + + +def HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W48_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W48_C` model depends on args. + """ + model = HRNet( + width=48, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W48_C"], use_ssld) + return model + + +def HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W60_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W60_C` model depends on args. + """ + model = HRNet( + width=60, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W60_C"], use_ssld) + return model + + +def HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs): + """ + HRNet_W64_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `HRNet_W64_C` model depends on args. + """ + model = HRNet( + width=64, stages_pattern=MODEL_STAGES_PATTERN["HRNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HRNet_W64_C"], use_ssld) + return model + + +def SE_HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W18_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W18_C` model depends on args. + """ + model = HRNet( + width=18, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W18_C"], use_ssld) + return model + + +def SE_HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W30_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W30_C` model depends on args. + """ + model = HRNet( + width=30, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W30_C"], use_ssld) + return model + + +def SE_HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W32_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W32_C` model depends on args. + """ + model = HRNet( + width=32, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W32_C"], use_ssld) + return model + + +def SE_HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W40_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W40_C` model depends on args. + """ + model = HRNet( + width=40, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W40_C"], use_ssld) + return model + + +def SE_HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W44_C + Args: + pretrained: bool=False or str. 
If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W44_C` model depends on args. + """ + model = HRNet( + width=44, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W44_C"], use_ssld) + return model + + +def SE_HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W48_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W48_C` model depends on args. + """ + model = HRNet( + width=48, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W48_C"], use_ssld) + return model + + +def SE_HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W60_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W60_C` model depends on args. + """ + model = HRNet( + width=60, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W60_C"], use_ssld) + return model + + +def SE_HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs): + """ + SE_HRNet_W64_C + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `SE_HRNet_W64_C` model depends on args. + """ + model = HRNet( + width=64, + stages_pattern=MODEL_STAGES_PATTERN["HRNet"], + has_se=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W64_C"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/inception_v3.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/inception_v3.py new file mode 100644 index 000000000..b01887020 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/inception_v3.py @@ -0,0 +1,559 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
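All of the HRNet and SE_HRNet factories above share the same loading convention, implemented by `_load_pretrained`: `pretrained=False` leaves the weights randomly initialized, `pretrained=True` loads the checkpoint referenced in `MODEL_URLS`, and a string is treated as a local `.pdparams` path; `use_ssld` only matters when `pretrained=True`. A minimal usage sketch, illustrative only and not part of the patch (it assumes the factory is importable from the package):

# Hedged usage sketch for the factory API defined above (illustrative only).
import paddle

model = HRNet_W48_C(pretrained=False, class_num=1000)        # random initialization
# model = HRNet_W48_C(pretrained=True, use_ssld=False)       # load pretrained weights from MODEL_URLS
# model = HRNet_W48_C(pretrained="HRNet_W48_C_pretrained.pdparams")  # local checkpoint path

x = paddle.randn([1, 3, 224, 224])
logits = model(x)                                            # shape [1, 1000]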
+ +# reference: https://arxiv.org/abs/1512.00567v3 + +from __future__ import absolute_import, division, print_function +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "InceptionV3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/InceptionV3_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "InceptionV3": [ + "inception_block_list[2]", "inception_block_list[3]", + "inception_block_list[7]", "inception_block_list[8]", + "inception_block_list[10]" + ] +} + +__all__ = MODEL_URLS.keys() +''' +InceptionV3 config: dict. + key: inception blocks of InceptionV3. + values: conv num in different blocks. +''' +NET_CONFIG = { + "inception_a": [[192, 256, 288], [32, 64, 64]], + "inception_b": [288], + "inception_c": [[768, 768, 768, 768], [128, 160, 160, 192]], + "inception_d": [768], + "inception_e": [1280, 2048] +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act="relu"): + super().__init__() + self.act = act + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + self.bn = BatchNorm(num_filters) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class InceptionStem(TheseusLayer): + def __init__(self): + super().__init__() + self.conv_1a_3x3 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act="relu") + self.conv_2a_3x3 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act="relu") + self.conv_2b_3x3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + padding=1, + act="relu") + + self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self.conv_3b_1x1 = ConvBNLayer( + num_channels=64, num_filters=80, filter_size=1, act="relu") + self.conv_4a_3x3 = ConvBNLayer( + num_channels=80, num_filters=192, filter_size=3, act="relu") + + def forward(self, x): + x = self.conv_1a_3x3(x) + x = self.conv_2a_3x3(x) + x = self.conv_2b_3x3(x) + x = self.max_pool(x) + x = self.conv_3b_1x1(x) + x = self.conv_4a_3x3(x) + x = self.max_pool(x) + return x + + +class InceptionA(TheseusLayer): + def __init__(self, num_channels, pool_features): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch5x5_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=48, + filter_size=1, + act="relu") + self.branch5x5_2 = ConvBNLayer( + num_channels=48, + num_filters=64, + filter_size=5, + padding=2, + act="relu") + + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=64, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3 = ConvBNLayer( + num_channels=96, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + 
num_channels=num_channels, + num_filters=pool_features, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + x = paddle.concat( + [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1) + return x + + +class InceptionB(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch3x3 = ConvBNLayer( + num_channels=num_channels, + num_filters=384, + filter_size=3, + stride=2, + act="relu") + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=64, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=64, + num_filters=96, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3 = ConvBNLayer( + num_channels=96, + num_filters=96, + filter_size=3, + stride=2, + act="relu") + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = self.branch_pool(x) + + x = paddle.concat([branch3x3, branch3x3dbl, branch_pool], axis=1) + + return x + + +class InceptionC(TheseusLayer): + def __init__(self, num_channels, channels_7x7): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + self.branch7x7_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=channels_7x7, + filter_size=1, + stride=1, + act="relu") + self.branch7x7_2 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(1, 7), + stride=1, + padding=(0, 3), + act="relu") + self.branch7x7_3 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=192, + filter_size=(7, 1), + stride=1, + padding=(3, 0), + act="relu") + + self.branch7x7dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=channels_7x7, + filter_size=1, + act="relu") + self.branch7x7dbl_2 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7dbl_3 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + self.branch7x7dbl_4 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=channels_7x7, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7dbl_5 = ConvBNLayer( + num_channels=channels_7x7, + num_filters=192, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = self.branch_pool(x) + branch_pool = 
self.branch_pool_conv(branch_pool) + + x = paddle.concat( + [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1) + + return x + + +class InceptionD(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch3x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + self.branch3x3_2 = ConvBNLayer( + num_channels=192, + num_filters=320, + filter_size=3, + stride=2, + act="relu") + self.branch7x7x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + self.branch7x7x3_2 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=(1, 7), + padding=(0, 3), + act="relu") + self.branch7x7x3_3 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=(7, 1), + padding=(3, 0), + act="relu") + self.branch7x7x3_4 = ConvBNLayer( + num_channels=192, + num_filters=192, + filter_size=3, + stride=2, + act="relu") + self.branch_pool = MaxPool2D(kernel_size=3, stride=2) + + def forward(self, x): + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = self.branch_pool(x) + + x = paddle.concat([branch3x3, branch7x7x3, branch_pool], axis=1) + return x + + +class InceptionE(TheseusLayer): + def __init__(self, num_channels): + super().__init__() + self.branch1x1 = ConvBNLayer( + num_channels=num_channels, + num_filters=320, + filter_size=1, + act="relu") + self.branch3x3_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=384, + filter_size=1, + act="relu") + self.branch3x3_2a = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(1, 3), + padding=(0, 1), + act="relu") + self.branch3x3_2b = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(3, 1), + padding=(1, 0), + act="relu") + + self.branch3x3dbl_1 = ConvBNLayer( + num_channels=num_channels, + num_filters=448, + filter_size=1, + act="relu") + self.branch3x3dbl_2 = ConvBNLayer( + num_channels=448, + num_filters=384, + filter_size=3, + padding=1, + act="relu") + self.branch3x3dbl_3a = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(1, 3), + padding=(0, 1), + act="relu") + self.branch3x3dbl_3b = ConvBNLayer( + num_channels=384, + num_filters=384, + filter_size=(3, 1), + padding=(1, 0), + act="relu") + self.branch_pool = AvgPool2D( + kernel_size=3, stride=1, padding=1, exclusive=False) + self.branch_pool_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=192, + filter_size=1, + act="relu") + + def forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = paddle.concat(branch3x3, axis=1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = paddle.concat(branch3x3dbl, axis=1) + + branch_pool = self.branch_pool(x) + branch_pool = self.branch_pool_conv(branch_pool) + + x = paddle.concat( + [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1) + return x + + +class Inception_V3(TheseusLayer): + """ + Inception_V3 + Args: + config: dict. config of Inception_V3. + class_num: int=1000. The number of classes. + pretrained: (True or False) or path of pretrained_model. 
Whether to load the pretrained model. + Returns: + model: nn.Layer. Specific Inception_V3 model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.inception_a_list = config["inception_a"] + self.inception_c_list = config["inception_c"] + self.inception_b_list = config["inception_b"] + self.inception_d_list = config["inception_d"] + self.inception_e_list = config["inception_e"] + + self.inception_stem = InceptionStem() + + self.inception_block_list = nn.LayerList() + for i in range(len(self.inception_a_list[0])): + inception_a = InceptionA(self.inception_a_list[0][i], + self.inception_a_list[1][i]) + self.inception_block_list.append(inception_a) + + for i in range(len(self.inception_b_list)): + inception_b = InceptionB(self.inception_b_list[i]) + self.inception_block_list.append(inception_b) + + for i in range(len(self.inception_c_list[0])): + inception_c = InceptionC(self.inception_c_list[0][i], + self.inception_c_list[1][i]) + self.inception_block_list.append(inception_c) + + for i in range(len(self.inception_d_list)): + inception_d = InceptionD(self.inception_d_list[i]) + self.inception_block_list.append(inception_d) + + for i in range(len(self.inception_e_list)): + inception_e = InceptionE(self.inception_e_list[i]) + self.inception_block_list.append(inception_e) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.dropout = Dropout(p=0.2, mode="downscale_in_infer") + stdv = 1.0 / math.sqrt(2048 * 1.0) + self.fc = Linear( + 2048, + class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr()) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.inception_stem(x) + for inception_block in self.inception_block_list: + x = inception_block(x) + x = self.avg_pool(x) + x = paddle.reshape(x, shape=[-1, 2048]) + x = self.dropout(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def InceptionV3(pretrained=False, use_ssld=False, **kwargs): + """ + InceptionV3 + Args: + pretrained: bool=false or str. if `true` load pretrained parameters, `false` otherwise. + if str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `InceptionV3` model + """ + model = Inception_V3( + NET_CONFIG, + stages_pattern=MODEL_STAGES_PATTERN["InceptionV3"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["InceptionV3"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v1.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v1.py new file mode 100644 index 000000000..4e6706382 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v1.py @@ -0,0 +1,259 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1704.04861 + +from __future__ import absolute_import, division, print_function + +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, ReLU, Flatten +from paddle.nn import AdaptiveAvgPool2D +from paddle.nn.initializer import KaimingNormal + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV1_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_25_pretrained.pdparams", + "MobileNetV1_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_5_pretrained.pdparams", + "MobileNetV1_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_75_pretrained.pdparams", + "MobileNetV1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "MobileNetV1": ["blocks[0]", "blocks[2]", "blocks[4]", "blocks[10]"] +} + +__all__ = MODEL_URLS.keys() + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + num_groups=1): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + self.bn = BatchNorm(num_filters) + self.relu = ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class DepthwiseSeparable(TheseusLayer): + def __init__(self, num_channels, num_filters1, num_filters2, num_groups, + stride, scale): + super().__init__() + + self.depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=3, + stride=stride, + padding=1, + num_groups=int(num_groups * scale)) + + self.pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x + + +class MobileNet(TheseusLayer): + """ + MobileNet + Args: + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific MobileNet model depends on args. 
+ """ + + def __init__(self, + stages_pattern, + scale=1.0, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + self.scale = scale + + self.conv = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + #num_channels, num_filters1, num_filters2, num_groups, stride + self.cfg = [[int(32 * scale), 32, 64, 32, 1], + [int(64 * scale), 64, 128, 64, 2], + [int(128 * scale), 128, 128, 128, 1], + [int(128 * scale), 128, 256, 128, 2], + [int(256 * scale), 256, 256, 256, 1], + [int(256 * scale), 256, 512, 256, 2], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 512, 512, 1], + [int(512 * scale), 512, 1024, 512, 2], + [int(1024 * scale), 1024, 1024, 1024, 1]] + + self.blocks = nn.Sequential(* [ + DepthwiseSeparable( + num_channels=params[0], + num_filters1=params[1], + num_filters2=params[2], + num_groups=params[3], + stride=params[4], + scale=scale) for params in self.cfg + ]) + + self.avg_pool = AdaptiveAvgPool2D(1) + self.flatten = Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear( + int(1024 * scale), + class_num, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNetV1_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_25` model depends on args. + """ + model = MobileNet( + scale=0.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_25"], + use_ssld) + return model + + +def MobileNetV1_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1_x0_5` model depends on args. + """ + model = MobileNet( + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_5"], + use_ssld) + return model + + +def MobileNetV1_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. 
Specific `MobileNetV1_x0_75` model depends on args. + """ + model = MobileNet( + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_75"], + use_ssld) + return model + + +def MobileNetV1(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV1 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV1` model depends on args. + """ + model = MobileNet( + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV1"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v3.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v3.py new file mode 100644 index 000000000..a50c2884d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v3.py @@ -0,0 +1,591 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
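In the MobileNet (V1) definition above, every row of `self.cfg` is unpacked as `num_channels, num_filters1, num_filters2, num_groups, stride` and turned into one `DepthwiseSeparable` block, with each width multiplied by `scale`. A standalone sketch of that expansion for the first row at `scale=0.5`, illustrative only and not part of the patch:

# How one cfg row maps onto a DepthwiseSeparable block (values shown for scale=0.5).
scale = 0.5
row = [int(32 * scale), 32, 64, 32, 1]   # num_channels, num_filters1, num_filters2, num_groups, stride
block = DepthwiseSeparable(
    num_channels=row[0],   # 16 input channels after scaling
    num_filters1=row[1],   # depthwise 3x3 keeps int(32 * scale) = 16 channels
    num_filters2=row[2],   # pointwise 1x1 expands to int(64 * scale) = 32 channels
    num_groups=row[3],     # groups are scaled to 16 as well, i.e. a true depthwise conv
    stride=row[4],
    scale=scale)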
+ +# reference: https://arxiv.org/abs/1905.02244 + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import BatchNorm, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay + +from .custom_devices_layers import AdaptiveAvgPool2D +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV3_small_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_35_pretrained.pdparams", + "MobileNetV3_small_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_5_pretrained.pdparams", + "MobileNetV3_small_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_75_pretrained.pdparams", + "MobileNetV3_small_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_pretrained.pdparams", + "MobileNetV3_small_x1_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_25_pretrained.pdparams", + "MobileNetV3_large_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_35_pretrained.pdparams", + "MobileNetV3_large_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_5_pretrained.pdparams", + "MobileNetV3_large_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_75_pretrained.pdparams", + "MobileNetV3_large_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_pretrained.pdparams", + "MobileNetV3_large_x1_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_25_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "MobileNetV3_small": + ["blocks[0]", "blocks[2]", "blocks[7]", "blocks[10]"], + "MobileNetV3_large": + ["blocks[0]", "blocks[2]", "blocks[5]", "blocks[11]", "blocks[14]"] +} + +__all__ = MODEL_URLS.keys() + +# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively. +# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s. 
+# k: kernel_size +# exp: middle channel number in depthwise block +# c: output channel number in depthwise block +# se: whether to use SE block +# act: which activation to use +# s: stride in depthwise block +NET_CONFIG = { + "large": [ + # k, exp, c, se, act, s + [3, 16, 16, False, "relu", 1], + [3, 64, 24, False, "relu", 2], + [3, 72, 24, False, "relu", 1], + [5, 72, 40, True, "relu", 2], + [5, 120, 40, True, "relu", 1], + [5, 120, 40, True, "relu", 1], + [3, 240, 80, False, "hardswish", 2], + [3, 200, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 184, 80, False, "hardswish", 1], + [3, 480, 112, True, "hardswish", 1], + [3, 672, 112, True, "hardswish", 1], + [5, 672, 160, True, "hardswish", 2], + [5, 960, 160, True, "hardswish", 1], + [5, 960, 160, True, "hardswish", 1], + ], + "small": [ + # k, exp, c, se, act, s + [3, 16, 16, True, "relu", 2], + [3, 72, 24, False, "relu", 2], + [3, 88, 24, False, "relu", 1], + [5, 96, 40, True, "hardswish", 2], + [5, 240, 40, True, "hardswish", 1], + [5, 240, 40, True, "hardswish", 1], + [5, 120, 48, True, "hardswish", 1], + [5, 144, 48, True, "hardswish", 1], + [5, 288, 96, True, "hardswish", 2], + [5, 576, 96, True, "hardswish", 1], + [5, 576, 96, True, "hardswish", 1], + ] +} +# first conv output channel number in MobileNetV3 +STEM_CONV_NUMBER = 16 +# last second conv output channel for "small" +LAST_SECOND_CONV_SMALL = 576 +# last second conv output channel for "large" +LAST_SECOND_CONV_LARGE = 960 +# last conv output channel number for "large" and "small" +LAST_CONV = 1280 + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class MobileNetV3(TheseusLayer): + """ + MobileNetV3 + Args: + config: list. MobileNetV3 depthwise blocks config. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=16. The output channel number of first convolution layer. + class_squeeze: int=960. The output channel number of penultimate convolution layer. + class_expand: int=1280. The output channel number of last convolution layer. + dropout_prob: float=0.2. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV3 model depends on args. 
+ """ + + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=1000, + inplanes=STEM_CONV_NUMBER, + class_squeeze=LAST_SECOND_CONV_LARGE, + class_expand=LAST_CONV, + dropout_prob=0.2, + return_patterns=None, + return_stages=None, + **kwargs): + super().__init__() + + self.cfg = config + self.scale = scale + self.inplanes = inplanes + self.class_squeeze = class_squeeze + self.class_expand = class_expand + self.class_num = class_num + + self.conv = ConvBNLayer( + in_c=3, + out_c=_make_divisible(self.inplanes * self.scale), + filter_size=3, + stride=2, + padding=1, + num_groups=1, + if_act=True, + act="hardswish") + + self.blocks = nn.Sequential(*[ + ResidualUnit( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 else + self.cfg[i - 1][2] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + use_se=se, + act=act) for i, (k, exp, c, se, act, s) in enumerate(self.cfg) + ]) + + self.last_second_conv = ConvBNLayer( + in_c=_make_divisible(self.cfg[-1][2] * self.scale), + out_c=_make_divisible(self.scale * self.class_squeeze), + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act="hardswish") + + self.avg_pool = AdaptiveAvgPool2D(1) + + self.last_conv = Conv2D( + in_channels=_make_divisible(self.scale * self.class_squeeze), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + + self.hardswish = nn.Hardswish() + if dropout_prob is not None: + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.dropout = None + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + self.fc = Linear(self.class_expand, class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.last_second_conv(x) + x = self.avg_pool(x) + x = self.last_conv(x) + x = self.hardswish(x) + if self.dropout is not None: + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + + return x + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + if_act=True, + act=None): + super().__init__() + + self.conv = Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm( + num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + return x + + +class ResidualUnit(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + use_se, + act=None): + super().__init__() + self.if_shortcut = stride == 1 and in_c == out_c + self.if_se = use_se + + self.expand_conv = ConvBNLayer( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.bottleneck_conv = ConvBNLayer( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + if self.if_se: + self.mid_se = SEModule(mid_c) + self.linear_conv = ConvBNLayer( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + + def forward(self, x): + identity = x 
+ x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = paddle.add(identity, x) + return x + + +# nn.Hardsigmoid can't transfer "slope" and "offset" in nn.functional.hardsigmoid +class Hardsigmoid(TheseusLayer): + def __init__(self, slope=0.2, offset=0.5): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return nn.functional.hardsigmoid( + x, slope=self.slope, offset=self.offset) + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + return paddle.multiply(x=identity, y=x) + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNetV3_small_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_35` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.35, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_35"], + use_ssld) + return model + + +def MobileNetV3_small_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_5` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_5"], + use_ssld) + return model + + +def MobileNetV3_small_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x0_75 + Args: + pretrained: bool=false or str. if `true` load pretrained parameters, `false` otherwise. + if str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x0_75` model depends on args. 
+ """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_75"], + use_ssld) + return model + + +def MobileNetV3_small_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_0"], + use_ssld) + return model + + +def MobileNetV3_small_x1_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_small_x1_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_small_x1_25` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["small"], + scale=1.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_SMALL, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_25"], + use_ssld) + return model + + +def MobileNetV3_large_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_35` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.35, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_small"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_35"], + use_ssld) + return model + + +def MobileNetV3_large_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_5` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.5, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_5"], + use_ssld) + return model + + +def MobileNetV3_large_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `MobileNetV3_large_x0_75` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=0.75, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_75"], + use_ssld) + return model + + +def MobileNetV3_large_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_0"], + use_ssld) + return model + + +def MobileNetV3_large_x1_25(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV3_large_x1_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV3_large_x1_25` model depends on args. + """ + model = MobileNetV3( + config=NET_CONFIG["large"], + scale=1.25, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV3_large"], + class_squeeze=LAST_SECOND_CONV_LARGE, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_25"], + use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v4.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v4.py new file mode 100644 index 000000000..cad766c86 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/mobilenet_v4.py @@ -0,0 +1,836 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
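Before the MobileNetV4 code, a short note on the MobileNetV3 config layout documented earlier: each `NET_CONFIG` row `[k, exp, c, se, act, s]` becomes one `ResidualUnit`, whose input width is the previous row's `c` (or `inplanes` for the first row), with all widths scaled and rounded by `_make_divisible`. A standalone sketch of that expansion for the second "large" row at `scale=1.0`, illustrative only and not part of the patch:

# Expanding NET_CONFIG["large"][1] == [3, 64, 24, False, "relu", 2] into a block,
# mirroring the loop in MobileNetV3.__init__.
scale = 1.0
prev_c = 16                                # output channels of the previous row, [3, 16, 16, ...]
k, exp, c, se, act, s = 3, 64, 24, False, "relu", 2
unit = ResidualUnit(
    in_c=_make_divisible(prev_c * scale),  # 16
    mid_c=_make_divisible(exp * scale),    # 64, expansion width inside the block
    out_c=_make_divisible(c * scale),      # 24, block output width
    filter_size=k,
    stride=s,
    use_se=se,
    act=act)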
+ +# reference: https://arxiv.org/abs/2404.10518 + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import BatchNorm, Conv2D, Dropout, Linear, Identity, Flatten +from paddle.regularizer import L2Decay + +from .custom_devices_layers import AdaptiveAvgPool2D +from ..base.theseus_layer import TheseusLayer +from ..model_zoo.vision_transformer import DropPath +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV4_conv_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_conv_large_pretrained.pdparams", + "MobileNetV4_conv_medium": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_conv_medium_pretrained.pdparams", + "MobileNetV4_conv_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_conv_small_pretrained.pdparams", + "MobileNetV4_hybrid_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_hybrid_large_pretrained.pdparams", + "MobileNetV4_hybrid_medium": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV4_hybrid_medium_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) +STEM_CONV_NUMBER = 32 +LAST_CONV = 1280 + +NET_CONFIG = { + "conv_small": [ + # stage 0, 112x112 + # type, out, kernal_size, act, stride + ["cn", 32, 3, "relu", 2], + ["cn", 32, 1, "relu", 1], + # stage 1, 56x56 + ["cn", 96, 3, "relu", 2], + ["cn", 64, 1, "relu", 1], + # stage 2, 28x28 + # type, out, mid_c, first_kernal_size, mid_kernal_size, act, stride + ["uir", 96, 192, 5, 5, "relu", 2], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 192, 0, 3, "relu", 1], + ["uir", 96, 384, 3, 0, "relu", 1], + # stage 3, 14x14 + ["uir", 128, 576, 3, 3, "relu", 2], + ["uir", 128, 512, 5, 5, "relu", 1], + ["uir", 128, 512, 0, 5, "relu", 1], + ["uir", 128, 384, 0, 5, "relu", 1], + ["uir", 128, 512, 0, 3, "relu", 1], + ["uir", 128, 512, 0, 3, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "conv_medium": [ + # stage 0, 112x112 + ["er", 48, 128, 3, "relu", 2], + # stage 1, 56x56 + ["uir", 80, 192, 3, 5, "relu", 2], + ["uir", 80, 160, 3, 3, "relu", 1], + # stage 2, 28x28 + ["uir", 160, 480, 3, 5, "relu", 2], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 5, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + ["uir", 160, 320, 0, 0, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + # stage 3, 14x14 + ["uir", 256, 960, 5, 5, "relu", 2], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["uir", 256, 1024, 3, 0, "relu", 1], + ["uir", 256, 512, 3, 5, "relu", 1], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["uir", 256, 512, 5, 0, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "conv_large": [ + # stem_size = 24 + ["er", 48, 96, 3, "relu", 2], + # stage 1, 56x56 + ["uir", 96, 192, 3, 5, "relu", 2], + ["uir", 96, 384, 3, 3, "relu", 1], + # stage 2, 28x28 in + ["uir", 192, 384, 3, 5, "relu", 2], + ["uir", 192, 768, 3, 3, "relu", 1], + ["uir", 192, 768, 3, 3, "relu", 1], + ["uir", 192, 
768, 3, 3, "relu", 1], + ["uir", 192, 768, 3, 5, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 5, 3, "relu", 1], + ["uir", 192, 768, 3, 0, "relu", 1], + # stage 3, 14x14 in + ["uir", 512, 768, 5, 5, "relu", 2], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 3, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 3, "relu", 1], + ["uir", 512, 2048, 5, 5, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + ["uir", 512, 2048, 5, 0, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "hybrid_medium": [ + # stem_size = 32 + ["er", 48, 128, 3, "relu", 2], + # stage 1, 56x56 + ["uir", 80, 192, 3, 5, "relu", 2], + ["uir", 80, 160, 3, 3, "relu", 1], + # stage 2, 28x28 + ["uir", 160, 480, 3, 5, "relu", 2], + ["uir", 160, 320, 0, 0, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["uir", 160, 640, 3, 5, "relu", 1], + # type, out, kv_dim, kernal_size, kv_stride, act, stride + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 3, "relu", 1], + ["mqa", 160, 64, 3, 4, 2, "relu", 1], + ["uir", 160, 640, 3, 0, "relu", 1], + # stage 3, 14x14 + ["uir", 256, 960, 5, 5, "relu", 2], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 1024, 3, 5, "relu", 1], + ["uir", 256, 512, 0, 0, "relu", 1], + ["uir", 256, 512, 3, 5, "relu", 1], + ["uir", 256, 512, 0, 0, "relu", 1], + ["uir", 256, 1024, 0, 0, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 3, 0, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 5, 5, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 5, 0, "relu", 1], + ["mqa", 256, 64, 3, 4, 1, "relu", 1], + ["uir", 256, 1024, 5, 0, "relu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "relu", 1], + ], + "hybrid_large": [ + # stem_size = 24 + ["er", 48, 96, 3, "gelu", 2], + # stage 1, 56x56 + ["uir", 96, 192, 3, 5, "gelu", 2], + ["uir", 96, 384, 3, 3, "gelu", 1], + # stage 2, 28x28 in + ["uir", 192, 384, 3, 5, "gelu", 2], + ["uir", 192, 768, 3, 3, "gelu", 1], + ["uir", 192, 768, 3, 3, "gelu", 1], + ["uir", 192, 768, 3, 3, "gelu", 1], + ["uir", 192, 768, 3, 5, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 5, 3, "gelu", 1], + ["mqa", 192, 48, 3, 8, 2, "gelu", 1], + ["uir", 192, 768, 3, 0, "gelu", 1], + # stage 3, 14x14 + ["uir", 512, 768, 5, 5, "gelu", 2], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["uir", 512, 2048, 5, 3, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["uir", 512, 2048, 5, 3, "gelu", 1], + ["uir", 512, 2048, 5, 5, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, 
"gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + ["mqa", 512, 64, 3, 8, 1, "gelu", 1], + ["uir", 512, 2048, 5, 0, "gelu", 1], + # stage 4, 7x7 + ["cn", 960, 1, "gelu", 1], + ] +} + +MODEL_STAGES_PATTERN = { + "MobileNetV4_conv_small": + ["blocks[1]", "blocks[3]", "blocks[9]", "blocks[15]", "blocks[16]"], + "MobileNetV4_conv_medium": + ["blocks[0]", "blocks[2]", "blocks[10]", "blocks[21]", "blocks[22]"], + "MobileNetV4_conv_large": + ["blocks[0]", "blocks[2]", "blocks[13]", "blocks[26]", "blocks[27]"], + "MobileNetV4_hybrid_medium": + ["blocks[0]", "blocks[2]", "blocks[14]", "blocks[30]", "blocks[31]"], + "MobileNetV4_hybrid_large": + ["blocks[0]", "blocks[2]", "blocks[17]", "blocks[35]", "blocks[36]"], +} + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act == "gelu": + return nn.GELU(approximate=False) + elif act is None: + return None + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class ConvBnAct(TheseusLayer): + def __init__(self, + in_c, + out_c, + filter_size, + stride, + padding, + num_groups=1, + drop_path_rate=0.0, + if_act=True, + act=None): + super().__init__() + + self.drop_path_rate = drop_path_rate + self.conv = Conv2D( + in_channels=in_c, + out_channels=out_c, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias_attr=False) + self.bn = BatchNorm( + num_channels=out_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.if_act = if_act + if self.if_act: + self.act = _create_act(act) + if self.drop_path_rate > 0: + self.drop_path = DropPath(drop_path_rate) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.if_act: + x = self.act(x) + if self.drop_path_rate > 0: + x = self.drop_path(x) + return x + + +class EdgeResidual(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stride, + drop_path_rate=0.0, + if_act=False, + act=None): + super(EdgeResidual, self).__init__() + + self.if_shortcut = stride == 1 and in_c == out_c + self.conv_exp = ConvBnAct( + in_c=in_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + if_act=True, + act=act) + self.conv_pwl = ConvBnAct( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=act) + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else Identity() + + def forward(self, x): + identity = x + x = self.conv_exp(x) + x = self.conv_pwl(x) + if self.if_shortcut: + x = paddle.add(identity, self.drop_path(x)) + return x + + +class UniversalInvertedResidual(TheseusLayer): + def __init__(self, + in_c, + mid_c, + out_c, + filter_size, + stem_kernel_size=None, + stride=1, + drop_path_rate=0.0, + layer_scale_init_value=0.0, + if_act=False, + act=None): + super().__init__() + + self.if_shortcut = stride == 1 and in_c == out_c + self.layer_scale_init_value = layer_scale_init_value + if stem_kernel_size: + self.dw_start = ConvBnAct( + in_c=in_c, + out_c=in_c, + filter_size=stem_kernel_size, + stride=1, + padding=int((stem_kernel_size - 1) // 2), + num_groups=in_c, + if_act=False, + 
act=None) + else: + self.dw_start = Identity() + + self.pw_exp = ConvBnAct( + in_c=in_c, + out_c=mid_c, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + if_act=True, + act=act) + + if filter_size: + self.dw_mid = ConvBnAct( + in_c=mid_c, + out_c=mid_c, + filter_size=filter_size, + stride=stride, + padding=int((filter_size - 1) // 2), + num_groups=mid_c, + if_act=True, + act=act) + else: + self.dw_mid = Identity() + self.pw_proj = ConvBnAct( + in_c=mid_c, + out_c=out_c, + filter_size=1, + stride=1, + padding=0, + if_act=False, + act=None) + if layer_scale_init_value > 0.0: + self.layer_scale = LayerScale2D(out_c, layer_scale_init_value) + + self.drop_path = DropPath( + drop_path_rate) if drop_path_rate else Identity() + + def forward(self, x): + identity = x + x = self.dw_start(x) + x = self.pw_exp(x) + x = self.dw_mid(x) + x = self.pw_proj(x) + if self.layer_scale_init_value > 0.0: + x = self.layer_scale(x) + if self.if_shortcut: + x = paddle.add(identity, self.drop_path(x)) + return x + + +class LayerScale2D(nn.Layer): + def __init__(self, dim, init_values=1e-05, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = paddle.create_parameter( + shape=[dim], + dtype='float32', + default_initializer=nn.initializer.Constant(init_values)) + + def forward(self, x): + gamma = self.gamma.reshape([1, -1, 1, 1]) + return (x.multiply_(y=paddle.to_tensor(gamma)) + if self.inplace else x * gamma) + + +class MobileAttention(nn.Layer): + def __init__(self, + in_c, + out_c, + filter_size=3, + stride=1, + num_head=4, + query_dim=256, + kv_dim=64, + kv_stride=1, + drop_path_rate=0.0, + attn_drop_rate=0.0, + dropout_prob=0.0, + layer_scale_init_value=0.0, + if_act=True, + act=None, + use_fused_attn=False): + super(MobileAttention, self).__init__() + + self.if_shortcut = stride == 1 and in_c == out_c + self.kv_stride = kv_stride + self.kv_dim = kv_dim + self.num_head = num_head + self.query_dim = query_dim + self.attn_drop_rate = attn_drop_rate + self.use_fused_attn = use_fused_attn + self.norm = BatchNorm( + num_channels=in_c, + act=None, + param_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.query_proj = Conv2D( + in_channels=in_c, + out_channels=query_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + if kv_stride > 1: + self.key_down_proj = ConvBnAct( + in_c=in_c, + out_c=in_c, + filter_size=filter_size, + stride=kv_stride, + padding=int((filter_size - 1) // 2), + num_groups=in_c, + if_act=False) + self.value_down_proj = ConvBnAct( + in_c=in_c, + out_c=in_c, + filter_size=filter_size, + stride=kv_stride, + padding=int((filter_size - 1) // 2), + num_groups=in_c, + if_act=False) + self.key_proj = Conv2D( + in_channels=in_c, + out_channels=kv_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + self.value_proj = Conv2D( + in_channels=in_c, + out_channels=kv_dim, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + self.proj = Conv2D( + in_channels=query_dim, + out_channels=out_c, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias_attr=False) + if not self.use_fused_attn: + self.scale = query_dim**-0.5 + self.softmax = nn.Softmax(-1) + self.attn_drop = Dropout(self.attn_drop_rate) + self.drop = Dropout(dropout_prob) + self.layer_scale_init_value = layer_scale_init_value + if layer_scale_init_value > 0.0: + self.layer_scale = LayerScale2D(out_c, layer_scale_init_value) + self.drop_path = (DropPath(drop_path_rate) + if drop_path_rate else 
Identity()) + + def forward(self, x, attn_mask=None): + identity = x + x = self.norm(x) + B, C, H, W = tuple(x.shape) + q = self.query_proj(x).reshape( + [B, self.num_head, self.query_dim // self.num_head, H * W]) + q = q.transpose([0, 3, 1, 2]) + if self.kv_stride > 1: + k = self.key_proj(self.key_down_proj(x)) + v = self.value_proj(self.value_down_proj(x)) + else: + k = self.key_proj(x) + v = self.value_proj(x) + k = k.reshape( + [B, self.kv_dim, 1, H // self.kv_stride * W // self.kv_stride]) + k = k.transpose([0, 3, 2, 1]) + v = v.reshape( + [B, self.kv_dim, 1, H // self.kv_stride * W // self.kv_stride]) + v = v.transpose([0, 3, 2, 1]) + if self.use_fused_attn: + attn = F.scaled_dot_product_attention( + query=q, + key=k, + value=v, + attn_mask=attn_mask, + dropout_p=self.attn_drop_rate if self.training else 0.0) + else: + q = q.transpose([0, 2, 1, 3]) * self.scale + v = v.transpose([0, 2, 1, 3]) + attn = q @ k.transpose([0, 2, 3, 1]) + attn = self.softmax(attn) + attn = self.attn_drop(attn) + attn = attn @ v + attn = attn.transpose([0, 2, 1, 3]) + + attn = attn.reshape([B, H, W, self.query_dim]) + x = self.proj(attn.transpose([0, 3, 1, 2])) + x = self.drop(x) + if self.layer_scale_init_value > 0.0: + x = self.layer_scale(x) + if self.if_shortcut: + x = paddle.add(identity, self.drop_path(x)) + return x + + +class MobileNetV4(TheseusLayer): + """ + MobileNetV4 + Args: + config: list. MobileNetV4 depthwise blocks config. + stages_pattern: list. The pattern of each stage blocks. + scale: float=1.0. The coefficient that controls the size of network parameters. + class_num: int=1000. The number of classes. + inplanes: int=32. The output channel number of first convolution layer. + act: str="relu". The activation function. + class_expand: int=960. The output channel number of last convolution layer. + drop_path_rate: float=0.0. Probability of dropping path. + drop_rate: float=0.0. Probability of setting units to zero. + Returns: + model: nn.Layer. Specific MobileNetV4 model depends on args. 
+ """ + def __init__(self, + config, + stages_pattern, + scale=1.0, + class_num=1000, + inplanes=STEM_CONV_NUMBER, + class_expand=LAST_CONV, + act="relu", + drop_path_rate=0.0, + drop_rate=0.0, + layer_scale_init_value=0.0, + return_patterns=None, + return_stages=None, + use_fused_attn=False, + **kwargs): + super(MobileNetV4, self).__init__() + self.cfg = config + self.scale = scale + self.drop_path_rate = drop_path_rate + self.inplanes = inplanes + self.class_expand = class_expand + self.class_num = class_num + self.conv_stem = ConvBnAct( + in_c=3, + out_c=_make_divisible(self.inplanes * self.scale), + filter_size=3, + stride=2, + padding=1, + if_act=True, + act=act) + + blocks = [] + block_count = len(self.cfg) + for i in range(block_count): + type = self.cfg[i][0] + if type == "cn": + _, exp, k, act, s = self.cfg[i] + block = ConvBnAct( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + out_c=_make_divisible(exp * self.scale), + filter_size=k, + stride=s, + padding=int((k - 1) // 2), + num_groups=1, + drop_path_rate=self.drop_path_rate * i / block_count, + if_act=True, + act=act) + elif type == "uir": + _, c, exp, k_start, k, act, s = self.cfg[i] + block = UniversalInvertedResidual( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stem_kernel_size=k_start, + stride=s, + drop_path_rate=self.drop_path_rate * i / block_count, + layer_scale_init_value=layer_scale_init_value, + if_act=True, + act=act) + elif type == "er": + _, c, exp, k, act, s = self.cfg[i] + block = EdgeResidual( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + mid_c=_make_divisible(self.scale * exp), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + drop_path_rate=self.drop_path_rate * i / block_count, + if_act=True, + act=act) + elif type == "mqa": + # type, out,kv_dim, kernal_size, kv_stride, act, stride + _, c, dim, k, head, kv_stride, act, s = self.cfg[i] + block = MobileAttention( + in_c=_make_divisible(self.inplanes * self.scale if i == 0 + else self.cfg[i - 1][1] * self.scale), + out_c=_make_divisible(self.scale * c), + filter_size=k, + stride=s, + num_head=head, + query_dim=_make_divisible(self.scale * head * dim), + kv_dim=_make_divisible(self.scale * dim), + kv_stride=kv_stride, + drop_path_rate=self.drop_path_rate * i / block_count, + layer_scale_init_value=layer_scale_init_value, + if_act=True, + act=act, + use_fused_attn=use_fused_attn) + blocks.append(block) + self.blocks = nn.Sequential(*blocks) + self.global_pool = AdaptiveAvgPool2D(1) + self.conv_head = ConvBnAct( + in_c=_make_divisible(self.scale * self.cfg[-1][1]), + out_c=self.class_expand, + filter_size=1, + stride=1, + padding=0, + if_act=True, + act=act) + self.flatten = Flatten(start_axis=1, stop_axis=-1) + self.dropout = Dropout(drop_rate) + self.classifier = Linear(self.class_expand, + class_num) if class_num > 0 else Identity() + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.blocks(x) + return x + + def forward_head(self, x, pre_logits=False): + x = self.global_pool(x) + x = self.conv_head(x) + x = self.flatten(x) + if pre_logits: + return x + return self.classifier(x) + + def forward(self, x): + x = self.forward_features(x) + x = 
self.forward_head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError("pretrained type is not available. ") + + +def MobileNetV4_conv_small(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_conv_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_conv_small` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["conv_small"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_small"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_conv_small"], + use_ssld) + return model + + +def MobileNetV4_conv_medium(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_conv_medium + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_conv_medium` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["conv_medium"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_medium"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_conv_medium"], + use_ssld) + return model + + +def MobileNetV4_conv_large(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_conv_large + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_conv_large` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["conv_large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_conv_large"], + inplanes=24, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_conv_large"], + use_ssld) + return model + + +def MobileNetV4_hybrid_medium(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_hybrid_medium + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `MobileNetV4_hybrid_medium` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["hybrid_medium"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_hybrid_medium"], + layer_scale_init_value=1e-05, + **kwargs) + _load_pretrained(pretrained, model, + MODEL_URLS["MobileNetV4_hybrid_medium"], use_ssld) + return model + + +def MobileNetV4_hybrid_large(pretrained=False, use_ssld=False, **kwargs): + """ + MobileNetV4_hybrid_large + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. 
+ Returns: + model: nn.Layer. Specific `MobileNetV4_hybrid_large` model depends on args. + """ + model = MobileNetV4( + config=NET_CONFIG["hybrid_large"], + scale=1.0, + stages_pattern=MODEL_STAGES_PATTERN["MobileNetV4_hybrid_large"], + inplanes=24, + act="gelu", + layer_scale_init_value=1e-05, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV4_hybrid_large"], + use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet.py new file mode 100644 index 000000000..04a936f85 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet.py @@ -0,0 +1,376 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal, Constant +from paddle.nn import Conv2D, BatchNorm2D, ReLU, MaxPool2D +from .custom_devices_layers import AdaptiveAvgPool2D +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPHGNet_tiny": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_tiny_pretrained.pdparams", + "PPHGNet_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNet_small_pretrained.pdparams", + "PPHGNet_base": "" +} + +__all__ = list(MODEL_URLS.keys()) + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
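+
+# Channel bookkeeping for HG_Block below (illustrative): the block concatenates its input
+# with the output of every 3x3 ConvBNAct it stacks, so the 1x1 aggregation_conv sees
+#     total_channels = in_channels + layer_num * mid_channels
+# e.g. PPHGNet_tiny stage1 ([96, 96, 224, 1, False] with layer_num=5) aggregates
+#     96 + 5 * 96 = 576 channels and squeezes them to out_channels=224.
+# A minimal smoke test, assuming a working paddle install:
+#     model = PPHGNet_tiny()
+#     logits = model(paddle.rand([1, 3, 224, 224]))   # -> shape [1, 1000]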
+ + +class ConvBNAct(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + use_act=True): + super().__init__() + self.use_act = use_act + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias_attr=False) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.use_act: + self.act = ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + return x + + +class ESEModule(TheseusLayer): + def __init__(self, channels): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv = Conv2D( + in_channels=channels, + out_channels=channels, + kernel_size=1, + stride=1, + padding=0) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv(x) + x = self.sigmoid(x) + return paddle.multiply(x=identity, y=x) + + +class HG_Block(TheseusLayer): + def __init__( + self, + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False, ): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + self.layers.append( + ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + for _ in range(layer_num - 1): + self.layers.append( + ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=1)) + + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels, + kernel_size=1, + stride=1) + self.att = ESEModule(out_channels) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_conv(x) + x = self.att(x) + if self.identity: + x += identity + return x + + +class HG_Stage(TheseusLayer): + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + downsample=True): + super().__init__() + self.downsample = downsample + if downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + use_act=False) + + blocks_list = [] + blocks_list.append( + HG_Block( + in_channels, + mid_channels, + out_channels, + layer_num, + identity=False)) + for _ in range(block_num - 1): + blocks_list.append( + HG_Block( + out_channels, + mid_channels, + out_channels, + layer_num, + identity=True)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNet(TheseusLayer): + """ + PPHGNet + Args: + stem_channels: list. Stem channel list of PPHGNet. + stage_config: dict. The configuration of each stage of PPHGNet. such as the number of channels, stride, etc. + layer_num: int. Number of layers of HG_Block. + use_last_conv: boolean. Whether to use a 1x1 convolutional layer before the classification layer. + class_expand: int=2048. Number of channels for the last 1x1 convolutional layer. + dropout_prob: float. Parameters of dropout, 0.0 means dropout is not used. + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific PPHGNet model depends on args. 
+ """ + + def __init__(self, + stem_channels, + stage_config, + layer_num, + use_last_conv=True, + class_expand=2048, + dropout_prob=0.0, + class_num=1000, + **kwargs): + super().__init__() + self.use_last_conv = use_last_conv + self.class_expand = class_expand + + # stem + stem_channels.insert(0, 3) + self.stem = nn.Sequential(* [ + ConvBNAct( + in_channels=stem_channels[i], + out_channels=stem_channels[i + 1], + kernel_size=3, + stride=2 if i == 0 else 1) for i in range( + len(stem_channels) - 1) + ]) + self.pool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) + + # stages + self.stages = nn.LayerList() + for k in stage_config: + in_channels, mid_channels, out_channels, block_num, downsample = stage_config[ + k] + self.stages.append( + HG_Stage(in_channels, mid_channels, out_channels, block_num, + layer_num, downsample)) + + self.avg_pool = AdaptiveAvgPool2D(1) + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=out_channels, + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = nn.ReLU() + self.dropout = nn.Dropout( + p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = nn.Linear(self.class_expand + if self.use_last_conv else out_channels, class_num) + + self._init_weights() + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + x = self.pool(x) + + for stage in self.stages: + x = stage(x) + + x = self.avg_pool(x) + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPHGNet_tiny(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNet_tiny + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_tiny` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [96, 96, 224, 1, False], + "stage2": [224, 128, 448, 1, True], + "stage3": [448, 160, 512, 2, True], + "stage4": [512, 192, 768, 1, True], + } + + model = PPHGNet( + stem_channels=[48, 48, 96], + stage_config=stage_config, + layer_num=5, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNet_tiny"], use_ssld) + return model + + +def PPHGNet_small(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNet_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_small` model depends on args. 
+ """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [128, 128, 256, 1, False], + "stage2": [256, 160, 512, 1, True], + "stage3": [512, 192, 768, 2, True], + "stage4": [768, 224, 1024, 1, True], + } + + model = PPHGNet( + stem_channels=[64, 64, 128], + stage_config=stage_config, + layer_num=6, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNet_small"], use_ssld) + return model + + +def PPHGNet_base(pretrained=False, use_ssld=True, **kwargs): + """ + PPHGNet_base + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPHGNet_base` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, blocks, downsample + "stage1": [160, 192, 320, 1, False], + "stage2": [320, 224, 640, 2, True], + "stage3": [640, 256, 960, 3, True], + "stage4": [960, 288, 1280, 2, True], + } + + model = PPHGNet( + stem_channels=[96, 96, 160], + stage_config=stage_config, + layer_num=7, + dropout_prob=0.2, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNet_base"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet_v2.py new file mode 100644 index 000000000..3f554c82c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_hgnet_v2.py @@ -0,0 +1,706 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
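+
+# Two notes on the building blocks defined in this file (illustrative):
+#   * LearnableAffineBlock is simply y = scale * x + bias with two learnable scalars,
+#     enabled per layer via use_lab.
+#   * Each stage_config row used by the PPHGNetV2_Bx factories is unpacked as
+#       in_channels, mid_channels, out_channels, block_num, is_downsample, light_block, kernel_size, layer_num
+#     e.g. PPHGNetV2_B0 "stage3": [256, 64, 512, 2, True, True, 5, 3] builds a stride-2
+#     depthwise downsample followed by two HGV2_Blocks, each stacking three LightConvBNAct
+#     layers with 5x5 depthwise kernels, taking the stage from 256 to 512 channels.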
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingNormal, Constant +from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D +from paddle.regularizer import L2Decay +from paddle import ParamAttr + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPHGNetV2_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B0_ssld_pretrained.pdparams", + "PPHGNetV2_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B1_ssld_pretrained.pdparams", + "PPHGNetV2_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B2_ssld_pretrained.pdparams", + "PPHGNetV2_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B3_ssld_pretrained.pdparams", + "PPHGNetV2_B4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B4_ssld_pretrained.pdparams", + "PPHGNetV2_B5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B5_ssld_pretrained.pdparams", + "PPHGNetV2_B6": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPHGNetV2_B6_ssld_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +kaiming_normal_ = KaimingNormal() +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +class LearnableAffineBlock(TheseusLayer): + """ + Create a learnable affine block module. This module can significantly improve accuracy on smaller models. + + Args: + scale_value (float): The initial value of the scale parameter, default is 1.0. + bias_value (float): The initial value of the bias parameter, default is 0.0. + lr_mult (float): The learning rate multiplier, default is 1.0. + lab_lr (float): The learning rate, default is 0.01. + """ + + def __init__(self, + scale_value=1.0, + bias_value=0.0, + lr_mult=1.0, + lab_lr=0.01): + super().__init__() + self.scale = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=scale_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("scale", self.scale) + self.bias = self.create_parameter( + shape=[1, ], + default_initializer=Constant(value=bias_value), + attr=ParamAttr(learning_rate=lr_mult * lab_lr)) + self.add_parameter("bias", self.bias) + + def forward(self, x): + return self.scale * x + self.bias + + +class ConvBNAct(TheseusLayer): + """ + ConvBNAct is a combination of convolution and batchnorm layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act: (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=1, + use_act=True, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.use_act = use_act + self.use_lab = use_lab + self.conv = Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding=padding + if isinstance(padding, str) else (kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False) + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult)) + if self.use_act: + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock(lr_mult=lr_mult) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.use_act: + x = self.act(x) + if self.use_lab: + x = self.lab(x) + return x + + +class LightConvBNAct(TheseusLayer): + """ + LightConvBNAct is a combination of pw and dw layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the depth-wise convolution kernel. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + use_lab=False, + lr_mult=1.0, + **kwargs): + super().__init__() + self.conv1 = ConvBNAct( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult) + self.conv2 = ConvBNAct( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=kernel_size, + groups=out_channels, + use_act=True, + use_lab=use_lab, + lr_mult=lr_mult) + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class StemBlock(TheseusLayer): + """ + StemBlock for PP-HGNetV2. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. 
+ """ + + def __init__(self, + in_channels, + mid_channels, + out_channels, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.stem1 = ConvBNAct( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult) + self.stem2a = ConvBNAct( + in_channels=mid_channels, + out_channels=mid_channels // 2, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult) + self.stem2b = ConvBNAct( + in_channels=mid_channels // 2, + out_channels=mid_channels, + kernel_size=2, + stride=1, + padding="SAME", + use_lab=use_lab, + lr_mult=lr_mult) + self.stem3 = ConvBNAct( + in_channels=mid_channels * 2, + out_channels=mid_channels, + kernel_size=3, + stride=2, + use_lab=use_lab, + lr_mult=lr_mult) + self.stem4 = ConvBNAct( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + self.pool = nn.MaxPool2D( + kernel_size=2, stride=1, ceil_mode=True, padding="SAME") + + def forward(self, x): + x = self.stem1(x) + x2 = self.stem2a(x) + x2 = self.stem2b(x2) + x1 = self.pool(x) + x = paddle.concat([x1, x2], 1) + x = self.stem3(x) + x = self.stem4(x) + + return x + + +class HGV2_Block(TheseusLayer): + """ + HGV2_Block, the basic unit that constitutes the HGV2_Stage. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. + out_channels (int): Number of output channels. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + layer_num (int): Number of layers in the HGV2 block. Defaults to 6. + stride (int): Stride of the convolution. Defaults to 1. + padding (int/str): Padding or padding type for the convolution. Defaults to 1. + groups (int): Number of groups for the convolution. Defaults to 1. + use_act (bool): Whether to use activation function. Defaults to True. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + lr_mult (float): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__(self, + in_channels, + mid_channels, + out_channels, + kernel_size=3, + layer_num=6, + identity=False, + light_block=True, + use_lab=False, + lr_mult=1.0): + super().__init__() + self.identity = identity + + self.layers = nn.LayerList() + block_type = "LightConvBNAct" if light_block else "ConvBNAct" + for i in range(layer_num): + self.layers.append( + eval(block_type)(in_channels=in_channels + if i == 0 else mid_channels, + out_channels=mid_channels, + stride=1, + kernel_size=kernel_size, + use_lab=use_lab, + lr_mult=lr_mult)) + # feature aggregation + total_channels = in_channels + layer_num * mid_channels + self.aggregation_squeeze_conv = ConvBNAct( + in_channels=total_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + self.aggregation_excitation_conv = ConvBNAct( + in_channels=out_channels // 2, + out_channels=out_channels, + kernel_size=1, + stride=1, + use_lab=use_lab, + lr_mult=lr_mult) + + def forward(self, x): + identity = x + output = [] + output.append(x) + for layer in self.layers: + x = layer(x) + output.append(x) + x = paddle.concat(output, axis=1) + x = self.aggregation_squeeze_conv(x) + x = self.aggregation_excitation_conv(x) + if self.identity: + x += identity + return x + + +class HGV2_Stage(TheseusLayer): + """ + HGV2_Stage, the basic unit that constitutes the PPHGNetV2. + + Args: + in_channels (int): Number of input channels. + mid_channels (int): Number of middle channels. 
+ out_channels (int): Number of output channels. + block_num (int): Number of blocks in the HGV2 stage. + layer_num (int): Number of layers in the HGV2 block. Defaults to 6. + is_downsample (bool): Whether to use downsampling operation. Defaults to False. + light_block (bool): Whether to use light block. Defaults to True. + kernel_size (int): Size of the convolution kernel. Defaults to 3. + use_lab (bool, optional): Whether to use the LAB operation. Defaults to False. + lr_mult (float, optional): Learning rate multiplier for the layer. Defaults to 1.0. + """ + + def __init__(self, + in_channels, + mid_channels, + out_channels, + block_num, + layer_num=6, + is_downsample=True, + light_block=True, + kernel_size=3, + use_lab=False, + lr_mult=1.0): + + super().__init__() + self.is_downsample = is_downsample + if self.is_downsample: + self.downsample = ConvBNAct( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=2, + groups=in_channels, + use_act=False, + use_lab=use_lab, + lr_mult=lr_mult) + + blocks_list = [] + for i in range(block_num): + blocks_list.append( + HGV2_Block( + in_channels=in_channels if i == 0 else out_channels, + mid_channels=mid_channels, + out_channels=out_channels, + kernel_size=kernel_size, + layer_num=layer_num, + identity=False if i == 0 else True, + light_block=light_block, + use_lab=use_lab, + lr_mult=lr_mult)) + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + if self.is_downsample: + x = self.downsample(x) + x = self.blocks(x) + return x + + +class PPHGNetV2(TheseusLayer): + """ + PPHGNetV2 + + Args: + stage_config (dict): Config for PPHGNetV2 stages. such as the number of channels, stride, etc. + stem_channels: (list): Number of channels of the stem of the PPHGNetV2. + use_lab (bool): Whether to use the LAB operation. Defaults to False. + use_last_conv (bool): Whether to use the last conv layer as the output channel. Defaults to True. + class_expand (int): Number of channels for the last 1x1 convolutional layer. + drop_prob (float): Dropout probability for the last 1x1 convolutional layer. Defaults to 0.0. + class_num (int): The number of classes for the classification layer. Defaults to 1000. + lr_mult_list (list): Learning rate multiplier for the stages. Defaults to [1.0, 1.0, 1.0, 1.0, 1.0]. + Returns: + model: nn.Layer. Specific PPHGNetV2 model depends on args. 
+ """ + + def __init__(self, + stage_config, + stem_channels=[3, 32, 64], + use_lab=False, + use_last_conv=True, + class_expand=2048, + dropout_prob=0.0, + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + **kwargs): + super().__init__() + self.use_lab = use_lab + self.use_last_conv = use_last_conv + self.class_expand = class_expand + self.class_num = class_num + + # stem + self.stem = StemBlock( + in_channels=stem_channels[0], + mid_channels=stem_channels[1], + out_channels=stem_channels[2], + use_lab=use_lab, + lr_mult=lr_mult_list[0]) + + # stages + self.stages = nn.LayerList() + for i, k in enumerate(stage_config): + in_channels, mid_channels, out_channels, block_num, is_downsample, light_block, kernel_size, layer_num = stage_config[ + k] + self.stages.append( + HGV2_Stage( + in_channels, + mid_channels, + out_channels, + block_num, + layer_num, + is_downsample, + light_block, + kernel_size, + use_lab, + lr_mult=lr_mult_list[i + 1])) + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=out_channels, + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = ReLU() + if self.use_lab: + self.lab = LearnableAffineBlock() + self.dropout = nn.Dropout( + p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = nn.Linear(self.class_expand if self.use_last_conv else + out_channels, self.class_num) + + self._init_weights() + + def _init_weights(self): + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight) + elif isinstance(m, (nn.BatchNorm2D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + + for stage in self.stages: + x = stage(x) + x = self.avg_pool(x) + + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + if self.use_lab: + x = self.lab(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_ssld_stage1_pretrained=True) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPHGNetV2_B0(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B0 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B0` model depends on args. 
+ """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [16, 16, 64, 1, False, False, 3, 3], + "stage2": [64, 32, 256, 1, True, False, 3, 3], + "stage3": [256, 64, 512, 2, True, True, 5, 3], + "stage4": [512, 128, 1024, 1, True, True, 5, 3], + } + + model = PPHGNetV2( + stem_channels=[3, 16, 16], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B0"], use_ssld) + return model + + +def PPHGNetV2_B1(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B1 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B1` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 64, 1, False, False, 3, 3], + "stage2": [64, 48, 256, 1, True, False, 3, 3], + "stage3": [256, 96, 512, 2, True, True, 5, 3], + "stage4": [512, 192, 1024, 1, True, True, 5, 3], + } + + model = PPHGNetV2( + stem_channels=[3, 24, 32], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B1"], use_ssld) + return model + + +def PPHGNetV2_B2(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B2 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B2` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 96, 1, False, False, 3, 4], + "stage2": [96, 64, 384, 1, True, False, 3, 4], + "stage3": [384, 128, 768, 3, True, True, 5, 4], + "stage4": [768, 256, 1536, 1, True, True, 5, 4], + } + + model = PPHGNetV2( + stem_channels=[3, 24, 32], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B2"], use_ssld) + return model + + +def PPHGNetV2_B3(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B3 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B3` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [32, 32, 128, 1, False, False, 3, 5], + "stage2": [128, 64, 512, 1, True, False, 3, 5], + "stage3": [512, 128, 1024, 3, True, True, 5, 5], + "stage4": [1024, 256, 2048, 1, True, True, 5, 5], + } + + model = PPHGNetV2( + stem_channels=[3, 24, 32], + stage_config=stage_config, + use_lab=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B3"], use_ssld) + return model + + +def PPHGNetV2_B4(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B4 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. 
+ use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B4` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [48, 48, 128, 1, False, False, 3, 6], + "stage2": [128, 96, 512, 1, True, False, 3, 6], + "stage3": [512, 192, 1024, 3, True, True, 5, 6], + "stage4": [1024, 384, 2048, 1, True, True, 5, 6], + } + + model = PPHGNetV2( + stem_channels=[3, 32, 48], + stage_config=stage_config, + use_lab=False, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B4"], use_ssld) + return model + + +def PPHGNetV2_B5(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B5 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B5` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [64, 64, 128, 1, False, False, 3, 6], + "stage2": [128, 128, 512, 2, True, False, 3, 6], + "stage3": [512, 256, 1024, 5, True, True, 5, 6], + "stage4": [1024, 512, 2048, 2, True, True, 5, 6], + } + + model = PPHGNetV2( + stem_channels=[3, 32, 64], + stage_config=stage_config, + use_lab=False, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B5"], use_ssld) + return model + + +def PPHGNetV2_B6(pretrained=False, use_ssld=False, **kwargs): + """ + PPHGNetV2_B6 + Args: + pretrained (bool/str): If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld (bool) Whether using ssld pretrained model when pretrained is True. + Returns: + model: nn.Layer. Specific `PPHGNetV2_B6` model depends on args. + """ + stage_config = { + # in_channels, mid_channels, out_channels, num_blocks, is_downsample, light_block, kernel_size, layer_num + "stage1": [96, 96, 192, 2, False, False, 3, 6], + "stage2": [192, 192, 512, 3, True, False, 3, 6], + "stage3": [512, 384, 1024, 6, True, True, 5, 6], + "stage4": [1024, 768, 2048, 3, True, True, 5, 6], + } + + model = PPHGNetV2( + stem_channels=[3, 48, 96], + stage_config=stage_config, + use_lab=False, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPHGNetV2_B6"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet.py new file mode 100644 index 000000000..cc672b0f4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet.py @@ -0,0 +1,520 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
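+
+# Rough usage sketch for the PPLCNet_x* factories defined below, assuming a working
+# paddle install (illustrative only):
+#     model = PPLCNet_x1_0(pretrained=False, class_num=1000)
+#     logits = model(paddle.rand([1, 3, 224, 224]))   # -> shape [1, 1000]
+# Channel widths scale per block through make_divisible(out_c * scale); for example the
+# stem conv of PPLCNet_x0_25 ends up with make_divisible(16 * 0.25) = 8 output channels.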
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +from paddle import ParamAttr +from paddle.nn import BatchNorm2D, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal + +from .custom_devices_layers import AdaptiveAvgPool2D +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPLCNet_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_25_pretrained.pdparams", + "PPLCNet_x0_35": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_35_pretrained.pdparams", + "PPLCNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_5_pretrained.pdparams", + "PPLCNet_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x0_75_pretrained.pdparams", + "PPLCNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_0_pretrained.pdparams", + "PPLCNet_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x1_5_pretrained.pdparams", + "PPLCNet_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_0_pretrained.pdparams", + "PPLCNet_x2_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNet_x2_5_pretrained.pdparams" +} + +MODEL_STAGES_PATTERN = { + "PPLCNet": ["blocks2", "blocks3", "blocks4", "blocks5", "blocks6"] +} + +__all__ = list(MODEL_URLS.keys()) + +# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. +# k: kernel_size +# in_c: input channel number in depthwise block +# out_c: output channel number in depthwise block +# s: stride in depthwise block +# use_se: whether to use SE block + +NET_CONFIG = { + # [k, in_c, out_c, s, use_se] + "blocks2": [[3, 16, 32, 1, False]], + "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], + "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], + "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False], + [5, 256, 256, 1, False], [5, 256, 256, 1, False]], + "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act == "relu6": + return nn.ReLU6() + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +def _create_model_urls(model_scale): + model_scale_str = "PPLCNet_x" + str(model_scale).replace('.', '_') + if model_scale_str in MODEL_URLS: + return MODEL_URLS[model_scale_str] + else: + return None + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + num_groups=1, + lr_mult=1.0, + act="hardswish"): + super().__init__() + + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=num_groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), learning_rate=lr_mult), + bias_attr=False) + + self.bn = 
BatchNorm2D( + num_filters, + weight_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult), + bias_attr=ParamAttr( + regularizer=L2Decay(0.0), learning_rate=lr_mult)) + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + return x + + +class DepthwiseSeparable(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + dw_size=3, + use_se=False, + lr_mult=1.0, + act="hardswish"): + super().__init__() + self.use_se = use_se + self.dw_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_channels, + filter_size=dw_size, + stride=stride, + num_groups=num_channels, + lr_mult=lr_mult, + act=act) + if use_se: + self.se = SEModule(num_channels, lr_mult=lr_mult) + self.pw_conv = ConvBNLayer( + num_channels=num_channels, + filter_size=1, + num_filters=num_filters, + stride=1, + lr_mult=lr_mult, + act=act) + + def forward(self, x): + x = self.dw_conv(x) + if self.use_se: + x = self.se(x) + x = self.pw_conv(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4, lr_mult=1.0): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.relu = nn.ReLU() + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult)) + self.hardsigmoid = nn.Hardsigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class PPLCNet(TheseusLayer): + def __init__(self, + scale=1.0, + class_num=1000, + dropout_prob=0.2, + class_expand=1280, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + stride_list=[2, 2, 2, 2, 2], + use_last_conv=True, + act="hardswish", + **kwargs): + super().__init__(**kwargs) + self.scale = scale + self.class_expand = class_expand + self.lr_mult_list = lr_mult_list + self.use_last_conv = use_last_conv + self.stride_list = stride_list + self.net_config = NET_CONFIG + if isinstance(self.lr_mult_list, str): + self.lr_mult_list = eval(self.lr_mult_list) + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 6, "lr_mult_list length should be 6 but got {}".format( + len(self.lr_mult_list)) + + assert isinstance(self.stride_list, ( + list, tuple + )), "stride_list should be in (list, tuple) but got {}".format( + type(self.stride_list)) + assert len(self.stride_list + ) == 5, "stride_list length should be 5 but got {}".format( + len(self.stride_list)) + + for i, stride in enumerate(stride_list[1:]): + self.net_config["blocks{}".format(i + 3)][0][3] = stride + self.conv1 = ConvBNLayer( + num_channels=3, + filter_size=3, + num_filters=make_divisible(16 * scale), + stride=stride_list[0], + lr_mult=self.lr_mult_list[0], + act=act) + + self.blocks2 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[1], + act=act) + for i, (k, in_c, out_c, s, se + ) in 
enumerate(self.net_config["blocks2"]) + ]) + + self.blocks3 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[2], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks3"]) + ]) + + self.blocks4 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[3], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks4"]) + ]) + + self.blocks5 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[4], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks5"]) + ]) + + self.blocks6 = nn.Sequential(*[ + DepthwiseSeparable( + num_channels=make_divisible(in_c * scale), + num_filters=make_divisible(out_c * scale), + dw_size=k, + stride=s, + use_se=se, + lr_mult=self.lr_mult_list[5], + act=act) + for i, (k, in_c, out_c, s, se + ) in enumerate(self.net_config["blocks6"]) + ]) + + self.avg_pool = AdaptiveAvgPool2D(1) + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=make_divisible(self.net_config["blocks6"][-1][2] * + scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = _create_act(act) + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + else: + self.last_conv = None + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + self.fc = Linear( + self.class_expand if self.use_last_conv else + make_divisible(self.net_config["blocks6"][-1][2] * scale), + class_num) + + def forward(self, x): + x = self.conv1(x) + + x = self.blocks2(x) + x = self.blocks3(x) + x = self.blocks4(x) + x = self.blocks5(x) + x = self.blocks6(x) + + x = self.avg_pool(x) + if self.last_conv is not None: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPLCNetBaseNet(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNetBaseNet + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. + """ + if "scale" in kwargs: + scale = kwargs["scale"] + kwargs.pop("scale") + else: + scale = 1.0 + + model = PPLCNet(scale=scale, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + model_url = _create_model_urls(scale) + _load_pretrained(pretrained, model, model_url, use_ssld) + return model + + +def PPLCNet_x0_25(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_25 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. 
Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_25` model depends on args. + """ + model = PPLCNet( + scale=0.25, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_25"], use_ssld) + return model + + +def PPLCNet_x0_35(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_35 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_35` model depends on args. + """ + model = PPLCNet( + scale=0.35, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_35"], use_ssld) + return model + + +def PPLCNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_5` model depends on args. + """ + model = PPLCNet( + scale=0.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_5"], use_ssld) + return model + + +def PPLCNet_x0_75(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x0_75 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x0_75` model depends on args. + """ + model = PPLCNet( + scale=0.75, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x0_75"], use_ssld) + return model + + +def PPLCNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x1_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x1_0` model depends on args. + """ + model = PPLCNet( + scale=1.0, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x1_0"], use_ssld) + return model + + +def PPLCNet_x1_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x1_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x1_5` model depends on args. + """ + model = PPLCNet( + scale=1.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x1_5"], use_ssld) + return model + + +def PPLCNet_x2_0(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x2_0 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. 
Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x2_0` model depends on args. + """ + model = PPLCNet( + scale=2.0, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x2_0"], use_ssld) + return model + + +def PPLCNet_x2_5(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNet_x2_5 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNet_x2_5` model depends on args. + """ + model = PPLCNet( + scale=2.5, stages_pattern=MODEL_STAGES_PATTERN["PPLCNet"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNet_x2_5"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet_v2.py new file mode 100644 index 000000000..85be0a63f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/pp_lcnet_v2.py @@ -0,0 +1,417 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Dropout, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import KaimingNormal + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PPLCNetV2_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_small_pretrained.pdparams", + "PPLCNetV2_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_base_pretrained.pdparams", + "PPLCNetV2_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/PPLCNetV2_large_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +NET_CONFIG = { + # in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut + "stage1": [64, 3, False, False, False, False], + "stage2": [128, 3, False, False, False, False], + "stage3": [256, 5, True, True, True, False], + "stage4": [512, 5, False, True, False, True], +} + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + +def _create_act(act): + if act == "hardswish": + return nn.Hardswish() + elif act == "relu": + return nn.ReLU() + elif act == "relu6": + return nn.ReLU6() + elif act == "prelu": + return nn.PReLU() + elif act == "leaky_relu": + return nn.LeakyReLU(negative_slope=0.1) + else: + raise RuntimeError( + "The activation function is not supported: {}".format(act)) + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1, + act="relu"): + super().__init__() + self.act = act + self.conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingNormal()), + bias_attr=False) + + self.bn = BatchNorm2D( + out_channels, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + if self.act is not None: + self.act = _create_act(act) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.act(x) + return x + + +class SEModule(TheseusLayer): + def __init__(self, channel, reduction=4, act='relu'): + super().__init__() + self.avg_pool = AdaptiveAvgPool2D(1) + self.conv1 = Conv2D( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0) + self.act = _create_act(act) + self.conv2 = Conv2D( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0) + self.hardsigmoid = nn.Sigmoid() + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv1(x) + x = self.act(x) + x = self.conv2(x) + x = self.hardsigmoid(x) + x = paddle.multiply(x=identity, y=x) + return x + + +class RepDepthwiseSeparable(TheseusLayer): + def __init__(self, + in_channels, + out_channels, + stride, + dw_size=3, + split_pw=False, + use_rep=False, + use_se=False, + use_shortcut=False, + act="relu"): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.is_repped = False + + self.dw_size = dw_size + self.split_pw = 
split_pw + self.use_rep = use_rep + self.use_se = use_se + self.use_shortcut = True if use_shortcut and stride == 1 and in_channels == out_channels else False + + if self.use_rep: + self.dw_conv_list = nn.LayerList() + for kernel_size in range(self.dw_size, 0, -2): + if kernel_size == 1 and stride != 1: + continue + dw_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + act=None) + self.dw_conv_list.append(dw_conv) + self.dw_conv = nn.Conv2D( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + padding=(dw_size - 1) // 2, + groups=in_channels) + else: + self.dw_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=dw_size, + stride=stride, + groups=in_channels) + + self.act = _create_act(act) + + if use_se: + self.se = SEModule(in_channels, act=act) + + if self.split_pw: + pw_ratio = 0.5 + self.pw_conv_1 = ConvBNLayer( + in_channels=in_channels, + kernel_size=1, + out_channels=int(out_channels * pw_ratio), + stride=1, + act=act) + self.pw_conv_2 = ConvBNLayer( + in_channels=int(out_channels * pw_ratio), + kernel_size=1, + out_channels=out_channels, + stride=1, + act=act) + else: + self.pw_conv = ConvBNLayer( + in_channels=in_channels, + kernel_size=1, + out_channels=out_channels, + stride=1, + act=act) + + def forward(self, x): + if self.use_rep: + input_x = x + if self.is_repped: + x = self.act(self.dw_conv(x)) + else: + y = self.dw_conv_list[0](x) + for dw_conv in self.dw_conv_list[1:]: + y += dw_conv(x) + x = self.act(y) + else: + x = self.dw_conv(x) + + if self.use_se: + x = self.se(x) + if self.split_pw: + x = self.pw_conv_1(x) + x = self.pw_conv_2(x) + else: + x = self.pw_conv(x) + if self.use_shortcut: + x = x + input_x + return x + + def re_parameterize(self): + if self.use_rep: + self.is_repped = True + kernel, bias = self._get_equivalent_kernel_bias() + self.dw_conv.weight.set_value(kernel) + self.dw_conv.bias.set_value(bias) + + def _get_equivalent_kernel_bias(self): + kernel_sum = 0 + bias_sum = 0 + for dw_conv in self.dw_conv_list: + kernel, bias = self._fuse_bn_tensor(dw_conv) + kernel = self._pad_tensor(kernel, to_size=self.dw_size) + kernel_sum += kernel + bias_sum += bias + return kernel_sum, bias_sum + + def _fuse_bn_tensor(self, branch): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + def _pad_tensor(self, tensor, to_size): + from_size = tensor.shape[-1] + if from_size == to_size: + return tensor + pad = (to_size - from_size) // 2 + return F.pad(tensor, [pad, pad, pad, pad]) + + +class PPLCNetV2(TheseusLayer): + def __init__(self, + scale, + depths, + class_num=1000, + dropout_prob=0, + use_last_conv=True, + class_expand=1280, + act="relu", + **kwargs): + super().__init__(**kwargs) + self.scale = scale + self.use_last_conv = use_last_conv + self.class_expand = class_expand + + self.stem = nn.Sequential(* [ + ConvBNLayer( + in_channels=3, + kernel_size=3, + out_channels=make_divisible(32 * scale), + stride=2, + act=act), RepDepthwiseSeparable( + in_channels=make_divisible(32 * scale), + out_channels=make_divisible(64 * scale), + stride=1, + dw_size=3, + act=act) + ]) + + # stages + self.stages = nn.LayerList() + for depth_idx, k in 
enumerate(NET_CONFIG): + in_channels, kernel_size, split_pw, use_rep, use_se, use_shortcut = NET_CONFIG[ + k] + self.stages.append( + nn.Sequential(* [ + RepDepthwiseSeparable( + in_channels=make_divisible((in_channels if i == 0 else + in_channels * 2) * scale), + out_channels=make_divisible(in_channels * 2 * scale), + stride=2 if i == 0 else 1, + dw_size=kernel_size, + split_pw=split_pw, + use_rep=use_rep, + use_se=use_se, + use_shortcut=use_shortcut, + act=act) + for i in range(depths[depth_idx]) + ])) + + self.avg_pool = AdaptiveAvgPool2D(1) + + if self.use_last_conv: + self.last_conv = Conv2D( + in_channels=make_divisible(NET_CONFIG["stage4"][0] * 2 * + scale), + out_channels=self.class_expand, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.act = _create_act(act) + self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer") + + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + in_features = self.class_expand if self.use_last_conv else make_divisible( + NET_CONFIG["stage4"][0] * 2 * scale) + self.fc = Linear(in_features, class_num) + + def forward(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + x = self.avg_pool(x) + if self.use_last_conv: + x = self.last_conv(x) + x = self.act(x) + x = self.dropout(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PPLCNetV2_small(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNetV2_small + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNetV2_base` model depends on args. + """ + model = PPLCNetV2( + scale=0.75, depths=[2, 2, 4, 2], dropout_prob=0.2, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_small"], + use_ssld) + return model + + +def PPLCNetV2_base(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNetV2_base + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNetV2_base` model depends on args. + """ + model = PPLCNetV2( + scale=1.0, depths=[2, 2, 6, 2], dropout_prob=0.2, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_base"], use_ssld) + return model + + +def PPLCNetV2_large(pretrained=False, use_ssld=False, **kwargs): + """ + PPLCNetV2_large + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `PPLCNetV2_base` model depends on args. 
+ """ + model = PPLCNetV2( + scale=1.25, depths=[2, 2, 8, 2], dropout_prob=0.2, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_large"], + use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/resnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/resnet.py new file mode 100644 index 000000000..e38206ee4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/resnet.py @@ -0,0 +1,653 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/pdf/1512.03385 + +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, BatchNorm2D +from paddle.nn import MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +from paddle.regularizer import L2Decay +import math + +from .custom_devices_layers import AdaptiveAvgPool2D +from ....utils import logger +from ..base.theseus_layer import TheseusLayer +from ..base.dbb.dbb_block import DiverseBranchBlock +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_dbb": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_dbb_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "ResNet18": ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"], + "ResNet34": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + 
"ResNet50": ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"], + "ResNet101": ["blocks[2]", "blocks[6]", "blocks[29]", "blocks[32]"], + "ResNet152": ["blocks[2]", "blocks[10]", "blocks[46]", "blocks[49]"], + "ResNet200": ["blocks[2]", "blocks[14]", "blocks[62]", "blocks[65]"] +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. +''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, + stride=stride, + padding="SAME", + ceil_mode=True, + data_format=data_format) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=1 if is_vd_mode else stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + layer=ConvBNLayer, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.conv0 = layer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = layer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = layer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = 
self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + layer=ConvBNLayer, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = layer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = layer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + version="vb", + stem_act="relu", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + stride_list=[2, 2, 2, 2, 2], + max_pool=True, + data_format="NCHW", + input_image_channel=3, + return_patterns=None, + return_stages=None, + layer_type="ConvBNLayer", + use_first_short_conv=True, + **kargs): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.stride_list = stride_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + if layer_type == "ConvBNLayer": + layer = ConvBNLayer + elif layer_type == "DiverseBranchBlock": + layer = DiverseBranchBlock + else: + raise Exception() + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + if len(self.lr_mult_list) != 5: + msg = "lr_mult_list length should be 5 but got {}, default lr_mult_list used".format( + len(self.lr_mult_list)) + logger.warning(msg) + self.lr_mult_list = [1.0, 1.0, 1.0, 1.0, 1.0] + + assert isinstance(self.stride_list, ( + list, tuple + )), "stride_list should be in (list, tuple) but got {}".format( + type(self.stride_list)) + assert len(self.stride_list + ) == 5, "stride_list length should be 5 but got {}".format( + len(self.stride_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, self.stride_list[0]]], + "vd": [[input_image_channel, 32, 3, self.stride_list[0]], + [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(*[ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act=stem_act, + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + 
self.max_pool = max_pool + if max_pool: + self.max_pool = MaxPool2D( + kernel_size=3, + stride=stride_list[1], + padding=1, + data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + # paddleclas' special improvement version + shortcut = False + # official resnet_vb version + if not use_first_short_conv and block_idx == 0: + shortcut = True + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=self.stride_list[block_idx + 1] + if i == 0 and (block_idx != 0 or not max_pool) else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + layer=layer, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + return self._forward(x) + + def _forward(self, x): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + if self.max_pool: + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, + use_ssld=False, + layer_type="ConvBNLayer", + **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vb", + layer_type=layer_type, + **kwargs) + if layer_type == "DiverseBranchBlock": + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_dbb"], + use_ssld) + else: + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. 
+ """ + model = ResNet( + config=NET_CONFIG["18"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet18"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["34"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet34"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["50"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet50"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. 
Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["101"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet101"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vb", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["152"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet152"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet( + config=NET_CONFIG["200"], + stages_pattern=MODEL_STAGES_PATTERN["ResNet200"], + version="vd", + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/swin_transformer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/swin_transformer.py new file mode 100644 index 000000000..9a464e9dd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/swin_transformer.py @@ -0,0 +1,1002 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Code was based on https://github.com/microsoft/Swin-Transformer +# reference: https://arxiv.org/abs/2103.14030 + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant + +from ..model_zoo.vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain +from ....utils import logger + + +MODEL_URLS = { + "SwinTransformer_tiny_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_small_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_small_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_base_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_base_patch4_window12_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window12_384_pretrained.pdparams", + "SwinTransformer_large_patch4_window7_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window7_224_pretrained.pdparams", + "SwinTransformer_large_patch4_window12_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window12_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +Linear = nn.Linear + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def check_support_fused_op(use_fused_linear): + if use_fused_linear: + if paddle.device.cuda.get_device_capability()[0] >= 8: + return True + else: + logger.warning("The current device don't support Fused OP! 
Using the general Linear instead.") + return False + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def pading_for_not_divisible(pixel_values, + height, + width, + patch_size, + format="BCHW", + function="split"): + if isinstance(patch_size, int): + patch_size = (patch_size, patch_size) + if height % patch_size[0] == 0 and width % patch_size[1] == 0: + return pixel_values, (0, 0, 0, 0, 0, 0, 0, 0) + if function == "split": + pading_width = patch_size[1] - width % patch_size[1] + pading_height = patch_size[0] - height % patch_size[0] + elif function == "merge": + pading_width = width % 2 + pading_height = height % 2 + if format == "BCHW": + pad_index = (0, 0, 0, 0, 0, pading_height, 0, pading_width) + elif format == "BHWC": + pad_index = (0, 0, 0, pading_height, 0, pading_width, 0, 0) + else: + assert ("vaild format") + + return F.pad(pixel_values, pad_index), pad_index + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W, C): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + x = windows.reshape( + [-1, H // window_size, W // window_size, window_size, window_size, C]) + x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) + return x + + +class WindowAttention(nn.Layer): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0., + proj_drop=0., + use_fused_attn=False): + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + # 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = self.create_parameter( + shape=((2 * window_size[0] - 1) * (2 * window_size[1] - 1), + num_heads), + default_initializer=zeros_) + self.add_parameter("relative_position_bias_table", + self.relative_position_bias_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + + coords_flatten_1 = coords_flatten.unsqueeze(axis=2) + coords_flatten_2 = coords_flatten.unsqueeze(axis=1) + relative_coords = coords_flatten_1 - coords_flatten_2 + + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + + self.register_buffer("relative_position_index", + relative_position_index) + + self.qkv = Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table) + self.softmax = nn.Softmax(axis=-1) + + self.use_fused_attn = use_fused_attn + + def eval(self, ): + # this is used to re-param swin for model export + relative_position_bias_table = self.relative_position_bias_table + window_size = self.window_size + index = self.relative_position_index.reshape([-1]) + + relative_position_bias = paddle.index_select( + relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + window_size[0] * window_size[1], window_size[0] * window_size[1], + -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + relative_position_bias = relative_position_bias.unsqueeze(0) + self.register_buffer("relative_position_bias", relative_position_bias) + + def get_relative_position_bias(self): + if self.training or not hasattr(self, "relative_position_bias"): + index = self.relative_position_index.reshape([-1]) + + relative_position_bias = paddle.index_select( + self.relative_position_bias_table, index) + relative_position_bias = relative_position_bias.reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + return relative_position_bias.unsqueeze(0) + else: + return self.relative_position_bias + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape( + [B_, N, 3, self.num_heads, C // self.num_heads]) + + if self.use_fused_attn: + qkv = qkv.transpose((2, 0, 1, 3, 4)) + q, k, 
v = qkv[0], qkv[1], qkv[2] + attn_mask = self.get_relative_position_bias() + if mask is not None: + nW = mask.shape[0] + mask = mask.reshape((1, nW, 1, N, N)).expand((B_ // nW, -1, self.num_heads, -1, -1)) + attn_mask = attn_mask + mask.reshape((-1, self.num_heads, N, N)) + attn_mask = attn_mask.expand((B_, -1, -1, -1)) + attn = paddle.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop.p if self.training else 0., attn_mask=attn_mask) + else: + qkv = qkv.transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + q = q * self.scale + attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) + attn = attn + self.get_relative_position_bias() + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([B_, self.num_heads, N, N]) + attn = self.softmax(attn) + attn = self.attn_drop(attn) + attn = paddle.mm(attn, v) + attn = attn.transpose([0, 2, 1, 3]) + x = attn.reshape([B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self): + return "dim={}, window_size={}, num_heads={}".format( + self.dim, self.window_size, self.num_heads) + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Layer): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + use_fused_attn=False): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + use_fused_attn=use_fused_attn) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + H, W = self.input_resolution + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def get_attn_mask(self, height, width, dtype): + if self.shift_size > 0: + # calculate attention mask for shifted window multihead self attention + img_mask = paddle.zeros((1, height, width, 1), dtype=dtype) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.reshape( + (-1, self.window_size * self.window_size)) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = masked_fill(attn_mask, attn_mask != 0, float(-100.0)) + attn_mask = masked_fill(attn_mask, attn_mask == 0, float(0.0)) + else: + attn_mask = None + return attn_mask + + def forward(self, x, input_dimensions): + H, W = input_dimensions + B, L, C = x.shape + + shortcut = x + x = self.norm1(x) + x = x.reshape([B, H, W, C]) + + x, pad_values = pading_for_not_divisible(x, H, W, self.window_size, + "BHWC") + _, height_pad, width_pad, _ = x.shape + + padding_state = pad_values[3] > 0 or pad_values[ + 5] > 0 # change variable name + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, + C]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + #check did it need to calculate again + attn_mask = self.get_attn_mask(height_pad, width_pad, x.dtype) + + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape( + [-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, height_pad, + width_pad, C) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + + if padding_state: + x = x[:, :H, :W, :] + x = x.reshape([B, H * W, C]) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self): + return "dim={}, input_resolution={}, num_heads={}, window_size={}, shift_size={}, mlp_ratio={}".format( + self.dim, self.input_resolution, self.num_heads, self.window_size, + self.shift_size, self.mlp_ratio) + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Layer): + r""" Patch Merging Layer. 
+ + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, input_dimensions): + """ + x: B, H*W, C + """ + H, W = input_dimensions + B, L, C = x.shape + x = x.reshape([B, H, W, C]) + x, _ = pading_for_not_divisible(x, H, W, 2, "BHWC", function="merge") + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = paddle.reshape(x, x.shape) + x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + + # x = x.reshape([B, H // 2, 2, W // 2, 2, C]) + # x = x.transpose((0, 1, 3, 4, 2, 5)) + + x = x.reshape([B, -1, 4 * C]) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self): + return "input_resolution={}, dim={}".format(self.input_resolution, + self.dim) + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + use_fused_attn=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + use_fused_attn=use_fused_attn) for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, input_dimensions): + H, W = input_dimensions + for blk in self.blocks: + x = blk(x, input_dimensions) + if self.downsample is not None: + H, W = (H + 1) // 2, (W + 1) // 2 + x = self.downsample(x, input_dimensions) + return x, (H, W) + + def extra_repr(self): + return "dim={}, input_resolution={}, depth={}".format( + self.dim, self.input_resolution, self.depth) + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Layer, optional): Normalization layer. Default: None + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + x, _ = pading_for_not_divisible(x, H, W, self.patch_size, "BCHW") + x = self.proj(x) + _, _, height, width = x.shape + output_dimensions = (height, width) + x = x.flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x, output_dimensions + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(TheseusLayer): + """ Swin Transformer + A PaddlePaddle impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. 
Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + use_checkpoint=False, + **kwargs): + super(SwinTransformer, self).__init__() + + self.num_classes = num_classes = class_num + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + use_fused_attn = check_support_fused_op(kwargs.get('use_fused_attn', False)) + use_fused_linear = kwargs.get('use_fused_linear', False) + global Linear + Linear = paddle.incubate.nn.FusedLinear if use_fused_linear else nn.Linear + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + self.add_parameter("absolute_pos_embed", self.absolute_pos_embed) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = np.linspace(0, drop_path_rate, + sum(depths)).tolist() # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + 
use_fused_attn=use_fused_attn) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1D(1) + + self.head = Linear( + self.num_features, + num_classes) if self.num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, (nn.Linear, paddle.incubate.nn.FusedLinear)): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x, output_dimensions = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x, output_dimensions = layer(x, output_dimensions) + + x = self.norm(x) # B L C + x = self.avgpool(x.transpose([0, 2, 1])) # B C 1 + x = paddle.flatten(x, 1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for _, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[ + 0] * self.patches_resolution[1] // (2**self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def _load_pretrained(pretrained, + model, + model_url, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def SwinTransformer_tiny_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.2, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.1 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_tiny_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_small_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.3, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_small_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_base_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + drop_path_rate=0.5, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_base_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_base_patch4_window12_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformer( + img_size=384, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12, + drop_path_rate=0.5, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_base_patch4_window12_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_large_patch4_window7_224( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformer( + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformer_large_patch4_window7_224"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformer_large_patch4_window12_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformer( + img_size=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=12, + **kwargs) + _load_pretrained( + pretrained, + model, + 
MODEL_URLS["SwinTransformer_large_patch4_window12_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/vgg.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/vgg.py new file mode 100644 index 000000000..8a0f0156e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/legendary_models/vgg.py @@ -0,0 +1,261 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1409.1556 + +from __future__ import absolute_import, division, print_function + +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import MaxPool2D + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "VGG11": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG11_pretrained.pdparams", + "VGG13": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG13_pretrained.pdparams", + "VGG16": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG16_pretrained.pdparams", + "VGG19": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG19_pretrained.pdparams", +} + +MODEL_STAGES_PATTERN = { + "VGG": [ + "conv_block_1", "conv_block_2", "conv_block_3", "conv_block_4", + "conv_block_5" + ] +} + +__all__ = MODEL_URLS.keys() + +# VGG config +# key: VGG network depth +# value: conv num in different blocks +NET_CONFIG = { + 11: [1, 1, 2, 2, 2], + 13: [2, 2, 2, 2, 2], + 16: [2, 2, 3, 3, 3], + 19: [2, 2, 4, 4, 4] +} + + +class ConvBlock(TheseusLayer): + def __init__(self, input_channels, output_channels, groups): + super().__init__() + + self.groups = groups + self.conv1 = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 2 or groups == 3 or groups == 4: + self.conv2 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 3 or groups == 4: + self.conv3 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if groups == 4: + self.conv4 = Conv2D( + in_channels=output_channels, + out_channels=output_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + + self.max_pool = MaxPool2D(kernel_size=2, stride=2, padding=0) + self.relu = nn.ReLU() + + def forward(self, inputs): + x = self.conv1(inputs) + x = self.relu(x) + if self.groups == 2 or self.groups == 3 or self.groups == 4: + x = self.conv2(x) + x = self.relu(x) + if self.groups == 3 or self.groups == 4: + x = self.conv3(x) + x = self.relu(x) 
+ if self.groups == 4: + x = self.conv4(x) + x = self.relu(x) + x = self.max_pool(x) + return x + + +class VGGNet(TheseusLayer): + """ + VGGNet + Args: + config: list. VGGNet config. + stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False` + class_num: int=1000. The number of classes. + Returns: + model: nn.Layer. Specific VGG model depends on args. + """ + + def __init__(self, + config, + stages_pattern, + stop_grad_layers=0, + class_num=1000, + return_patterns=None, + return_stages=None): + super().__init__() + + self.stop_grad_layers = stop_grad_layers + + self.conv_block_1 = ConvBlock(3, 64, config[0]) + self.conv_block_2 = ConvBlock(64, 128, config[1]) + self.conv_block_3 = ConvBlock(128, 256, config[2]) + self.conv_block_4 = ConvBlock(256, 512, config[3]) + self.conv_block_5 = ConvBlock(512, 512, config[4]) + + self.relu = nn.ReLU() + self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) + + for idx, block in enumerate([ + self.conv_block_1, self.conv_block_2, self.conv_block_3, + self.conv_block_4, self.conv_block_5 + ]): + if self.stop_grad_layers >= idx + 1: + for param in block.parameters(): + param.trainable = False + + self.drop = Dropout(p=0.5, mode="downscale_in_infer") + self.fc1 = Linear(7 * 7 * 512, 4096) + self.fc2 = Linear(4096, 4096) + self.fc3 = Linear(4096, class_num) + + super().init_res( + stages_pattern, + return_patterns=return_patterns, + return_stages=return_stages) + + def forward(self, inputs): + x = self.conv_block_1(inputs) + x = self.conv_block_2(x) + x = self.conv_block_3(x) + x = self.conv_block_4(x) + x = self.conv_block_5(x) + x = self.flatten(x) + x = self.fc1(x) + x = self.relu(x) + x = self.drop(x) + x = self.fc2(x) + x = self.relu(x) + x = self.drop(x) + x = self.fc3(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def VGG11(pretrained=False, use_ssld=False, **kwargs): + """ + VGG11 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG11` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[11], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG11"], use_ssld) + return model + + +def VGG13(pretrained=False, use_ssld=False, **kwargs): + """ + VGG13 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG13` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[13], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG13"], use_ssld) + return model + + +def VGG16(pretrained=False, use_ssld=False, **kwargs): + """ + VGG16 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. 
+ If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG16` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[16], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG16"], use_ssld) + return model + + +def VGG19(pretrained=False, use_ssld=False, **kwargs): + """ + VGG19 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `VGG19` model depends on args. + """ + model = VGGNet( + config=NET_CONFIG[19], + stages_pattern=MODEL_STAGES_PATTERN["VGG"], + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["VGG19"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/adaface_ir_net.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/adaface_ir_net.py new file mode 100644 index 000000000..47de152b6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/adaface_ir_net.py @@ -0,0 +1,529 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
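+# NOTE (editorial comment, not part of the upstream patch): this module adds
+# the IR / IR-SE backbones used by AdaFace.  Backbone() builds the stem plus
+# the blocks returned by get_blocks(num_layers), using BasicBlockIR(SE) for
+# depths up to 100 and BottleneckIR(SE) for the deeper variants; the
+# AdaFace_IR_* helpers at the end of the file only pick depth and mode, e.g.
+#     model = AdaFace_IR_50()   # 112x112 input, 512-d embedding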
+# this code is based on AdaFace(https://github.com/mk-minchul/AdaFace) +from collections import namedtuple +import paddle +import paddle.nn as nn +from paddle.nn import Dropout +from paddle.nn import MaxPool2D +from paddle.nn import Sequential +from paddle.nn import Conv2D, Linear +from paddle.nn import BatchNorm1D, BatchNorm2D +from paddle.nn import ReLU, Sigmoid +from paddle.nn import Layer +from paddle.nn import PReLU + +# from ppcls.arch.backbone.legendary_models.resnet import _load_pretrained + + +class Flatten(Layer): + """ Flat tensor + """ + + def forward(self, input): + return paddle.reshape(input, [input.shape[0], -1]) + + +class LinearBlock(Layer): + """ Convolution block without no-linear activation layer + """ + + def __init__(self, + in_c, + out_c, + kernel=(1, 1), + stride=(1, 1), + padding=(0, 0), + groups=1): + super(LinearBlock, self).__init__() + self.conv = Conv2D( + in_c, + out_c, + kernel, + stride, + padding, + groups=groups, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=None) + weight_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=0.0)) + self.bn = BatchNorm2D( + out_c, weight_attr=weight_attr, bias_attr=bias_attr) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +class GNAP(Layer): + """ Global Norm-Aware Pooling block + """ + + def __init__(self, in_c): + super(GNAP, self).__init__() + self.bn1 = BatchNorm2D(in_c, weight_attr=False, bias_attr=False) + self.pool = nn.AdaptiveAvgPool2D((1, 1)) + self.bn2 = BatchNorm1D(in_c, weight_attr=False, bias_attr=False) + + def forward(self, x): + x = self.bn1(x) + x_norm = paddle.norm(x, 2, 1, True) + x_norm_mean = paddle.mean(x_norm) + weight = x_norm_mean / x_norm + x = x * weight + x = self.pool(x) + x = x.view(x.shape[0], -1) + feature = self.bn2(x) + return feature + + +class GDC(Layer): + """ Global Depthwise Convolution block + """ + + def __init__(self, in_c, embedding_size): + super(GDC, self).__init__() + self.conv_6_dw = LinearBlock( + in_c, + in_c, + groups=in_c, + kernel=(7, 7), + stride=(1, 1), + padding=(0, 0)) + self.conv_6_flatten = Flatten() + self.linear = Linear( + in_c, + embedding_size, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False) + self.bn = BatchNorm1D( + embedding_size, weight_attr=False, bias_attr=False) + + def forward(self, x): + x = self.conv_6_dw(x) + x = self.conv_6_flatten(x) + x = self.linear(x) + x = self.bn(x) + return x + + +class SELayer(Layer): + """ SE block + """ + + def __init__(self, channels, reduction): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierUniform()) + self.fc1 = Conv2D( + channels, + channels // reduction, + kernel_size=1, + padding=0, + weight_attr=weight_attr, + bias_attr=False) + + self.relu = ReLU() + self.fc2 = Conv2D( + channels // reduction, + channels, + kernel_size=1, + padding=0, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False) + + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + + return module_input * x + + +class BasicBlockIR(Layer): + """ BasicBlock for IRNet + """ + + def __init__(self, in_channel, depth, stride): + super(BasicBlockIR, self).__init__() + + weight_attr = paddle.ParamAttr( + regularizer=None, 
initializer=nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=0.0)) + if in_channel == depth: + self.shortcut_layer = MaxPool2D(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2D( + in_channel, + depth, (1, 1), + stride, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + self.res_layer = Sequential( + BatchNorm2D( + in_channel, weight_attr=weight_attr, bias_attr=bias_attr), + Conv2D( + in_channel, + depth, (3, 3), (1, 1), + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr), + PReLU(depth), + Conv2D( + depth, + depth, (3, 3), + stride, + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class BottleneckIR(Layer): + """ BasicBlock with bottleneck for IRNet + """ + + def __init__(self, in_channel, depth, stride): + super(BottleneckIR, self).__init__() + reduction_channel = depth // 4 + weight_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=0.0)) + if in_channel == depth: + self.shortcut_layer = MaxPool2D(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2D( + in_channel, + depth, (1, 1), + stride, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + self.res_layer = Sequential( + BatchNorm2D( + in_channel, weight_attr=weight_attr, bias_attr=bias_attr), + Conv2D( + in_channel, + reduction_channel, (1, 1), (1, 1), + 0, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + reduction_channel, + weight_attr=weight_attr, + bias_attr=bias_attr), + PReLU(reduction_channel), + Conv2D( + reduction_channel, + reduction_channel, (3, 3), (1, 1), + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + reduction_channel, + weight_attr=weight_attr, + bias_attr=bias_attr), + PReLU(reduction_channel), + Conv2D( + reduction_channel, + depth, (1, 1), + stride, + 0, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + depth, weight_attr=weight_attr, bias_attr=bias_attr)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + + return res + shortcut + + +class BasicBlockIRSE(BasicBlockIR): + def __init__(self, in_channel, depth, stride): + super(BasicBlockIRSE, self).__init__(in_channel, depth, stride) + self.res_layer.add_sublayer("se_block", SELayer(depth, 16)) + + +class BottleneckIRSE(BottleneckIR): + def __init__(self, in_channel, depth, stride): + super(BottleneckIRSE, self).__init__(in_channel, depth, stride) + self.res_layer.add_sublayer("se_block", SELayer(depth, 16)) + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + '''A named tuple describing a ResNet block.''' + + +def get_block(in_channel, depth, num_units, stride=2): + + return [Bottleneck(in_channel, depth, stride)] +\ + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 18: + blocks = [ + get_block( + in_channel=64, depth=64, 
num_units=2), get_block( + in_channel=64, depth=128, num_units=2), get_block( + in_channel=128, depth=256, num_units=2), get_block( + in_channel=256, depth=512, num_units=2) + ] + elif num_layers == 34: + blocks = [ + get_block( + in_channel=64, depth=64, num_units=3), get_block( + in_channel=64, depth=128, num_units=4), get_block( + in_channel=128, depth=256, num_units=6), get_block( + in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 50: + blocks = [ + get_block( + in_channel=64, depth=64, num_units=3), get_block( + in_channel=64, depth=128, num_units=4), get_block( + in_channel=128, depth=256, num_units=14), get_block( + in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block( + in_channel=64, depth=64, num_units=3), get_block( + in_channel=64, depth=128, num_units=13), get_block( + in_channel=128, depth=256, num_units=30), get_block( + in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block( + in_channel=64, depth=256, num_units=3), get_block( + in_channel=256, depth=512, num_units=8), get_block( + in_channel=512, depth=1024, num_units=36), get_block( + in_channel=1024, depth=2048, num_units=3) + ] + elif num_layers == 200: + blocks = [ + get_block( + in_channel=64, depth=256, num_units=3), get_block( + in_channel=256, depth=512, num_units=24), get_block( + in_channel=512, depth=1024, num_units=36), get_block( + in_channel=1024, depth=2048, num_units=3) + ] + + return blocks + + +class Backbone(Layer): + def __init__(self, input_size, num_layers, mode='ir'): + """ Args: + input_size: input_size of backbone + num_layers: num_layers of backbone + mode: support ir or irse + """ + super(Backbone, self).__init__() + assert input_size[0] in [112, 224], \ + "input_size should be [112, 112] or [224, 224]" + assert num_layers in [18, 34, 50, 100, 152, 200], \ + "num_layers should be 18, 34, 50, 100 or 152" + assert mode in ['ir', 'ir_se'], \ + "mode should be ir or ir_se" + weight_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + regularizer=None, initializer=nn.initializer.Constant(value=0.0)) + self.input_layer = Sequential( + Conv2D( + 3, + 64, (3, 3), + 1, + 1, + weight_attr=nn.initializer.KaimingNormal(), + bias_attr=False), + BatchNorm2D( + 64, weight_attr=weight_attr, bias_attr=bias_attr), + PReLU(64)) + blocks = get_blocks(num_layers) + if num_layers <= 100: + if mode == 'ir': + unit_module = BasicBlockIR + elif mode == 'ir_se': + unit_module = BasicBlockIRSE + output_channel = 512 + else: + if mode == 'ir': + unit_module = BottleneckIR + elif mode == 'ir_se': + unit_module = BottleneckIRSE + output_channel = 2048 + + if input_size[0] == 112: + self.output_layer = Sequential( + BatchNorm2D( + output_channel, + weight_attr=weight_attr, + bias_attr=bias_attr), + Dropout(0.4), + Flatten(), + Linear( + output_channel * 7 * 7, + 512, + weight_attr=nn.initializer.KaimingNormal()), + BatchNorm1D( + 512, weight_attr=False, bias_attr=False)) + else: + self.output_layer = Sequential( + BatchNorm2D( + output_channel, + weight_attr=weight_attr, + bias_attr=bias_attr), + Dropout(0.4), + Flatten(), + Linear( + output_channel * 14 * 14, + 512, + weight_attr=nn.initializer.KaimingNormal()), + BatchNorm1D( + 512, weight_attr=False, bias_attr=False)) + + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = 
Sequential(*modules) + + # initialize_weights(self.modules()) + + def forward(self, x): + + # current code only supports one extra image + # it comes with a extra dimension for number of extra image. We will just squeeze it out for now + x = self.input_layer(x) + + for idx, module in enumerate(self.body): + x = module(x) + + x = self.output_layer(x) + # norm = paddle.norm(x, 2, 1, True) + # output = paddle.divide(x, norm) + # return output, norm + return x + + +def AdaFace_IR_18(input_size=(112, 112)): + """ Constructs a ir-18 model. + """ + model = Backbone(input_size, 18, 'ir') + return model + + +def AdaFace_IR_34(input_size=(112, 112)): + """ Constructs a ir-34 model. + """ + model = Backbone(input_size, 34, 'ir') + + return model + + +def AdaFace_IR_50(input_size=(112, 112)): + """ Constructs a ir-50 model. + """ + model = Backbone(input_size, 50, 'ir') + + return model + + +def AdaFace_IR_101(input_size=(112, 112)): + """ Constructs a ir-101 model. + """ + model = Backbone(input_size, 100, 'ir') + + return model + + +def AdaFace_IR_152(input_size=(112, 112)): + """ Constructs a ir-152 model. + """ + model = Backbone(input_size, 152, 'ir') + + return model + + +def AdaFace_IR_200(input_size=(112, 112)): + """ Constructs a ir-200 model. + """ + model = Backbone(input_size, 200, 'ir') + + return model + + +def AdaFace_IR_SE_50(input_size=(112, 112)): + """ Constructs a ir_se-50 model. + """ + model = Backbone(input_size, 50, 'ir_se') + + return model + + +def AdaFace_IR_SE_101(input_size=(112, 112)): + """ Constructs a ir_se-101 model. + """ + model = Backbone(input_size, 100, 'ir_se') + + return model + + +def AdaFace_IR_SE_152(input_size=(112, 112)): + """ Constructs a ir_se-152 model. + """ + model = Backbone(input_size, 152, 'ir_se') + + return model + + +def AdaFace_IR_SE_200(input_size=(112, 112)): + """ Constructs a ir_se-200 model. + """ + model = Backbone(input_size, 200, 'ir_se') + + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/alexnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/alexnet.py new file mode 100644 index 000000000..7020d5db7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/alexnet.py @@ -0,0 +1,170 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout, ReLU +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "AlexNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/AlexNet_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvPoolLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + stdv, + groups=1, + act=None, + name=None): + super(ConvPoolLayer, self).__init__() + + self.relu = ReLU() if act == "relu" else None + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name=name + "_offset", initializer=Uniform(-stdv, stdv))) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + + def forward(self, inputs): + x = self._conv(inputs) + if self.relu is not None: + x = self.relu(x) + x = self._pool(x) + return x + + +class AlexNetDY(nn.Layer): + def __init__(self, class_num=1000): + super(AlexNetDY, self).__init__() + + stdv = 1.0 / math.sqrt(3 * 11 * 11) + self._conv1 = ConvPoolLayer( + 3, 64, 11, 4, 2, stdv, act="relu", name="conv1") + stdv = 1.0 / math.sqrt(64 * 5 * 5) + self._conv2 = ConvPoolLayer( + 64, 192, 5, 1, 2, stdv, act="relu", name="conv2") + stdv = 1.0 / math.sqrt(192 * 3 * 3) + self._conv3 = Conv2D( + 192, + 384, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name="conv3_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="conv3_offset", initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(384 * 3 * 3) + self._conv4 = Conv2D( + 384, + 256, + 3, + stride=1, + padding=1, + weight_attr=ParamAttr( + name="conv4_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="conv4_offset", initializer=Uniform(-stdv, stdv))) + stdv = 1.0 / math.sqrt(256 * 3 * 3) + self._conv5 = ConvPoolLayer( + 256, 256, 3, 1, 1, stdv, act="relu", name="conv5") + stdv = 1.0 / math.sqrt(256 * 6 * 6) + + self._drop1 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc6 = Linear( + in_features=256 * 6 * 6, + out_features=4096, + weight_attr=ParamAttr( + name="fc6_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc6_offset", initializer=Uniform(-stdv, stdv))) + + self._drop2 = Dropout(p=0.5, mode="downscale_in_infer") + self._fc7 = Linear( + in_features=4096, + out_features=4096, + weight_attr=ParamAttr( + name="fc7_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc7_offset", initializer=Uniform(-stdv, stdv))) + self._fc8 = Linear( + in_features=4096, + out_features=class_num, + weight_attr=ParamAttr( + name="fc8_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr( + name="fc8_offset", initializer=Uniform(-stdv, stdv))) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + x = F.relu(x) + x = self._conv4(x) + x = F.relu(x) + x = self._conv5(x) + x = paddle.flatten(x, start_axis=1, stop_axis=-1) + x = self._drop1(x) + x = self._fc6(x) + x = F.relu(x) + x = 
self._drop2(x) + x = self._fc7(x) + x = F.relu(x) + x = self._fc8(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def AlexNet(pretrained=False, use_ssld=False, **kwargs): + model = AlexNetDY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["AlexNet"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cae.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cae.py new file mode 100644 index 000000000..ac0f044bb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cae.py @@ -0,0 +1,860 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/PaddlePaddle/VIMER/blob/main/CAE/models/modeling_finetune.py +# reference: https://arxiv.org/abs/2202.03026 + +import collections +from itertools import repeat +import math +import numpy as np +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.download import get_weights_path_from_url + +MODEL_URLS = { + "cae_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/cae_base_patch16_224_pretrained.pdparams", + "cae_large_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/cae_large_patch16_224_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +def trunc_normal_(tensor, mean=0., std=1.): + nn.initializer.TruncatedNormal(mean=mean, std=std)(tensor) + + +def drop_path(x, drop_prob: float=0., training: bool=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor.floor_() # binarize + output = x / keep_prob * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return 'p={}'.format(self.drop_prob) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias_attr=True) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias_attr=True) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + # x = self.drop(x) + # commit this for the orignal BERT implement + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + window_size=None, + attn_head_dim=None): + super().__init__() + + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.zeros_ = nn.initializer.Constant(value=0.) 
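+ # NOTE (editorial comment, not part of the upstream patch): the qkv
+ # projection below is created without a bias; q_bias and v_bias are kept as
+ # separate parameters and assembled in forward() as
+ # concat([q_bias, zeros_like(v_bias), v_bias]).  The key bias can be omitted
+ # because adding a constant vector b to every key shifts each logit by q.b,
+ # the same amount for every key of a given query, which the softmax cancels.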
+ + self.qkv = nn.Linear(dim, all_head_dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = self.create_parameter( + [all_head_dim], default_initializer=self.zeros_) + self.v_bias = self.create_parameter( + [all_head_dim], default_initializer=self.zeros_) + else: + self.q_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=self.zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim, bias_attr=True) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, rel_pos_bias=None): + B, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + k_bias = paddle.zeros_like(self.v_bias) + k_bias.stop_gradient = True + qkv_bias = paddle.concat((self.q_bias, k_bias, self.v_bias)) + # qkv = self.qkv(x).reshape([B, N, 3, self.num_heads, C // self.num_heads]).transpose([2, 0, 3, 1, 4]) + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape([B, N, 3, self.num_heads, -1]).transpose( + [2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @k.transpose([0, 1, 3, 2])) + + if self.relative_position_bias_table is not None: + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, -1]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + init_values=None, 
+ act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + window_size=None, + attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + window_size=window_size, + attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + if init_values > 0: + self.gamma_1 = self.create_parameter( + [dim], + default_initializer=nn.initializer.Constant(value=init_values)) + self.gamma_2 = self.create_parameter( + [dim], + default_initializer=nn.initializer.Constant(value=init_values)) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is None: + x = x + self.drop_path( + self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + to_2tuple = _ntuple(2) + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.patch_shape = (img_size[0] // patch_size[0], + img_size[1] // patch_size[1]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.in_chans = in_chans + self.out_chans = embed_dim + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias_attr=True) + + def forward(self, x, **kwargs): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose([0, 2, 1]) + return x + + def _init_weights(self): + fan_out = self.out_chans + fan_in = self.patch_size[0] * self.patch_size[1] * self.in_chans + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.XavierUniform(fan_in, fan_out)) # MAE + bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0)) + return weight_attr, bias_attr + + +class RelativePositionBias(nn.Layer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.zeros_ = nn.initializer.Constant(value=0.) 
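+ # NOTE (editorial comment, not part of the upstream patch): relative offsets
+ # along each axis take 2*W-1 distinct values, so the table created below has
+ # (2*Wh-1)*(2*Ww-1) rows for patch-to-patch pairs plus 3 extra rows for
+ # cls-to-token, token-to-cls and cls-to-cls.  For the default 14x14 patch
+ # grid (224 / 16) that is 27*27 + 3 = 732 rows per attention head.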
+ self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=self.zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +def get_sinusoid_encoding_table(n_position, d_hid, token=False): + ''' Sinusoid position encoding table ''' + + def get_position_angle_vec(position): + return [ + position / np.power(10000, 2 * (hid_j // 2) / d_hid) + for hid_j in range(d_hid) + ] + + sinusoid_table = np.array( + [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + if token: + sinusoid_table = np.concatenate( + [sinusoid_table, np.zeros([1, d_hid])], dim=0) + + return paddle.to_tensor(sinusoid_table).unsqueeze(0) + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + init_values=None, + use_abs_pos_emb=True, + use_rel_pos_bias=False, + use_shared_rel_pos_bias=False, + use_mean_pooling=True, + init_scale=0.001, + lin_probe=False, + sin_pos_emb=True, + args=None): + super().__init__() + self.class_num = class_num + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.use_mean_pooling = use_mean_pooling + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.zeros_ = nn.initializer.Constant(value=0.) + self.ones_ = nn.initializer.Constant(value=1.) 
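+ # NOTE (editorial comment, not part of the upstream patch): a learnable
+ # cls_token is prepended to the patch sequence, so the positional tables
+ # below are sized num_patches + 1.  With sin_pos_emb=True the table is
+ # filled by build_2d_sincos_position_embedding() and frozen with
+ # stop_gradient = True, i.e. it acts as a fixed encoding rather than a
+ # trained parameter.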
+ + self.cls_token = self.create_parameter( + [1, 1, embed_dim], default_initializer=self.zeros_) + + self.use_abs_pos_emb = use_abs_pos_emb + if use_abs_pos_emb: + self.pos_embed = self.create_parameter( + [1, num_patches + 1, embed_dim], + default_initializer=self.zeros_) + elif sin_pos_emb: + # sine-cosine positional embeddings is on the way + self.pos_embed = self.create_parameter( + [1, num_patches + 1, embed_dim], + default_initializer=self.zeros_) + self.pos_embed.set_value( + self.build_2d_sincos_position_embedding(embed_dim)) + self.pos_embed.stop_gradient = True # fixed sin-cos embedding + else: + self.pos_embed = None + + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias( + window_size=self.patch_embed.patch_shape, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.use_rel_pos_bias = use_rel_pos_bias + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + init_values=init_values, + window_size=self.patch_embed.patch_shape + if use_rel_pos_bias else None) for i in range(depth) + ]) + self.norm = nn.Identity() if use_mean_pooling else norm_layer( + embed_dim) + + self.lin_probe = lin_probe + # NOTE: batch norm + if lin_probe: + # TODO + from models.lincls_bn import LP_BatchNorm + self.fc_norm = LP_BatchNorm(embed_dim, affine=False) + else: + if use_mean_pooling: + self.fc_norm = norm_layer(embed_dim) + else: + self.fc_norm = None + self.head = nn.Linear(embed_dim, + class_num) if class_num > 0 else nn.Identity() + + if self.pos_embed is not None and use_abs_pos_emb: + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + # trunc_normal_(self.mask_token, std=.02) + trunc_normal_(self.head.weight, std=.02) + self.apply(self._init_weights) + self.fix_init_weight() + + self.head.weight.set_value(self.head.weight * init_scale) + self.head.bias.set_value(self.head.bias * init_scale) + + def build_2d_sincos_position_embedding(self, + embed_dim=768, + temperature=10000.): + h, w = self.patch_embed.patch_shape + grid_w = paddle.arange(w, dtype=paddle.float32) + grid_h = paddle.arange(h, dtype=paddle.float32) + grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) + assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' + pos_dim = embed_dim // 4 + omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim + omega = 1. 
/ (temperature**omega) + out_w = paddle.einsum('m,d->md', grid_w.flatten(), omega) + out_h = paddle.einsum('m,d->md', grid_h.flatten(), omega) + pos_emb = paddle.concat( + [ + paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), + paddle.cos(out_h) + ], + axis=1)[None, :, :] + + # if not self.use_mean_pooling: + pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) + pos_emb = paddle.concat([pe_token, pos_emb], axis=1) + return pos_emb + + def fix_init_weight(self): + def rescale(param, layer_id): + param.set_value(param / math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight, layer_id + 1) + rescale(layer.mlp.fc2.weight, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + self.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + self.zeros_(m.bias) + self.ones_(m.weight) + + def get_num_layers(self): + return len(self.blocks) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def get_classifier(self): + return self.head + + def reset_classifier(self, class_num, global_pool=''): + self.class_num = class_num + self.head = nn.Linear(self.embed_dim, + class_num) if class_num > 0 else nn.Identity() + + def forward_features(self, x, is_train=True): + x = self.patch_embed(x) + batch_size, seq_len, _ = x.shape + + cls_tokens = self.cls_token.expand([ + batch_size, -1, -1 + ]).astype(x.dtype) # stole cls_tokens impl from Phil Wang, thanks + x = paddle.concat((cls_tokens, x), axis=1) + if self.pos_embed is not None: + if self.use_abs_pos_emb: + x = x + self.pos_embed.expand( + [batch_size, -1, -1]).astype(x.dtype).clone().detach() + else: + x = x + self.pos_embed.expand( + [batch_size, -1, -1]).astype(x.dtype).clone().detach() + + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias( + ) if self.rel_pos_bias is not None else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + x = self.norm(x) + if self.fc_norm is not None: + t = x[:, 1:, :] + if self.lin_probe: + if self.use_mean_pooling: + return self.fc_norm(t.mean(1), is_train=is_train) + else: + return self.fc_norm(x[:, 0], is_train=is_train) + else: + return self.fc_norm(t.mean(1)) + + else: + return x[:, 0] + + def forward(self, x, is_train=True): + x = self.forward_features(x, is_train) + x = self.head(x) + return x + + +def _enable_linear_eval(model): + zeros_ = nn.initializer.Constant(value=0.) 
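+ # NOTE (editorial comment, not part of the upstream patch): linear probing
+ # freezes the backbone and trains only the classification head (and its
+ # batch norm): every parameter whose name is not head.weight / head.bias and
+ # does not contain 'fc_norm' gets stop_gradient = True below, and the head
+ # is re-initialized with Normal(0, 0.01) weights and zero bias.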
+ normal_ = nn.initializer.Normal(mean=0.0, std=0.01) + linear_keyword = 'head' + head_norm = 'fc_norm' + requires_grad = [] + for name, param in model.named_parameters(): + if name not in [ + '%s.weight' % linear_keyword, '%s.bias' % linear_keyword + ] and head_norm not in name: + param.stop_gradient = True + else: + requires_grad.append(name) + # init the fc layer + normal_(getattr(model, linear_keyword).weight) + zeros_(getattr(model, linear_keyword).bias) + + return + + +def _load_pretrained(pretrained, + pretrained_url, + model, + model_keys, + model_ema_configs, + use_abs_pos_emb, + use_rel_pos_bias, + use_ssld=False): + if pretrained is False: + return + elif pretrained is True: + local_weight_path = get_weights_path_from_url(pretrained_url).replace( + ".pdparams", "") + checkpoint = paddle.load(local_weight_path + ".pdparams") + elif isinstance(pretrained, str): + checkpoint = paddle.load(pretrained + ".pdparams") + + checkpoint_model = None + for model_key in model_keys.split('|'): + if model_key in checkpoint: + checkpoint_model = checkpoint[model_key] + break + + if checkpoint_model is None: + checkpoint_model = checkpoint + state_dict = model.state_dict() + all_keys = list(checkpoint_model.keys()) + # NOTE: remove all decoder keys + all_keys = [key for key in all_keys if key.startswith('encoder.')] + for key in all_keys: + new_key = key.replace('encoder.', '') + checkpoint_model[new_key] = checkpoint_model[key] + checkpoint_model.pop(key) + + for key in list(checkpoint_model.keys()): + if key.startswith('regressor_and_decoder.'): + checkpoint_model.pop(key) + if key.startswith('teacher_network.'): + checkpoint_model.pop(key) + + # NOTE: replace norm with fc_norm + for key in list(checkpoint_model.keys()): + if key.startswith('norm.'): + new_key = key.replace('norm.', 'fc_norm.') + checkpoint_model[new_key] = checkpoint_model[key] + checkpoint_model.pop(key) + + for k in ['head.weight', 'head.bias']: + if k in checkpoint_model and checkpoint_model[k].shape != state_dict[ + k].shape: + del checkpoint_model[k] + + if model.use_rel_pos_bias and "rel_pos_bias.relative_position_bias_table" in checkpoint_model: + num_layers = model.get_num_layers() + rel_pos_bias = checkpoint_model[ + "rel_pos_bias.relative_position_bias_table"] + for i in range(num_layers): + checkpoint_model["blocks.%d.attn.relative_position_bias_table" % + i] = rel_pos_bias.clone() + + checkpoint_model.pop("rel_pos_bias.relative_position_bias_table") + + all_keys = list(checkpoint_model.keys()) + + for key in all_keys: + if "relative_position_index" in key: + checkpoint_model.pop(key) + + if "relative_position_bias_table" in key and use_rel_pos_bias: + rel_pos_bias = checkpoint_model[key] + src_num_pos, num_attn_heads = rel_pos_bias.shape + dst_num_pos, _ = model.state_dict()[key].shape + dst_patch_shape = model.patch_embed.patch_shape + if dst_patch_shape[0] != dst_patch_shape[1]: + raise NotImplementedError() + num_extra_tokens = dst_num_pos - (dst_patch_shape[0] * 2 - 1) * ( + dst_patch_shape[1] * 2 - 1) + src_size = int((src_num_pos - num_extra_tokens)**0.5) + dst_size = int((dst_num_pos - num_extra_tokens)**0.5) + if src_size != dst_size: + extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r**n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + 
dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q**(i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, + src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + paddle.Tensor(f(dx, dy)).astype('float32').reshape( + [-1, 1])) + + rel_pos_bias = paddle.concat(all_rel_pos_bias, axis=-1) + + new_rel_pos_bias = paddle.concat( + (rel_pos_bias, extra_tokens), axis=0) + checkpoint_model[key] = new_rel_pos_bias + + # interpolate position embedding + if 'pos_embed' in checkpoint_model and use_abs_pos_emb: + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens)** + 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches**0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, + embedding_size).permute(0, 3, 1, 2) + pos_tokens = paddle.nn.functional.interpolate( + pos_tokens, + size=(new_size, new_size), + mode='bicubic', + align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = paddle.concat((extra_tokens, pos_tokens), axis=1) + checkpoint_model['pos_embed'] = new_pos_embed + msg = model.set_state_dict(checkpoint_model) + + model_without_ddp = model + n_parameters = sum(p.numel() for p in model.parameters() + if not p.stop_gradient).item() + + return + + +def cae_base_patch16_224(pretrained=True, use_ssld=False, **kwargs): + config = kwargs.copy() + enable_linear_eval = config.pop('enable_linear_eval') + model_keys = config.pop('model_key') + model_ema_configs = config.pop('model_ema') + use_abs_pos_emb = config.get('use_abs_pos_emb', False) + use_rel_pos_bias = config.get('use_rel_pos_bias', True) + if pretrained in config: + pretrained = config.pop('pretrained') + + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + **config) + + if enable_linear_eval: + _enable_linear_eval(model) + + _load_pretrained( + pretrained, + MODEL_URLS["cae_base_patch16_224"], + model, + model_keys, + model_ema_configs, + use_abs_pos_emb, + use_rel_pos_bias, + use_ssld=False) + + return model + + +def cae_large_patch16_224(pretrained=True, use_ssld=False, **kwargs): + config = kwargs.copy() + enable_linear_eval = config.pop('enable_linear_eval') + model_keys = config.pop('model_key') + model_ema_configs = config.pop('model_ema') + use_abs_pos_emb = config.get('use_abs_pos_emb', False) + use_rel_pos_bias = config.get('use_rel_pos_bias', True) + if pretrained in config: + pretrained = config.pop('pretrained') + + model = VisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + 
**config) + + if enable_linear_eval: + _enable_linear_eval(model) + + _load_pretrained( + pretrained, + MODEL_URLS["cae_large_patch16_224"], + model, + model_keys, + model_ema_configs, + use_abs_pos_emb, + use_rel_pos_bias, + use_ssld=False) + + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/convnext.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/convnext.py new file mode 100644 index 000000000..3773fac56 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/convnext.py @@ -0,0 +1,282 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Code was heavily based on https://github.com/facebookresearch/ConvNeXt + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ConvNeXt_tiny": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_tiny_pretrained.pdparams", + "ConvNeXt_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_small_pretrained.pdparams", + "ConvNeXt_base_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_base_224_pretrained.pdparams", + "ConvNeXt_base_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_base_384_pretrained.pdparams", + "ConvNeXt_large_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_large_224_pretrained.pdparams", + "ConvNeXt_large_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ConvNeXt_large_384_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class ChannelsFirstLayerNorm(nn.Layer): + r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. + The ordering of the dimensions in the inputs. 
channels_last corresponds to inputs with + shape (batch_size, height, width, channels) while channels_first corresponds to inputs + with shape (batch_size, channels, height, width). + """ + + def __init__(self, normalized_shape, epsilon=1e-5): + super().__init__() + self.weight = self.create_parameter( + shape=[normalized_shape], default_initializer=ones_) + self.bias = self.create_parameter( + shape=[normalized_shape], default_initializer=zeros_) + self.epsilon = epsilon + self.normalized_shape = [normalized_shape] + + def forward(self, x): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / paddle.sqrt(s + self.epsilon) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class Block(nn.Layer): + r""" ConvNeXt Block. There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): + super().__init__() + self.dwconv = nn.Conv2D( + dim, dim, 7, padding=3, groups=dim) # depthwise conv + self.norm = nn.LayerNorm(dim, epsilon=1e-6) + # pointwise/1x1 convs, implemented with linear layers + self.pwconv1 = nn.Linear(dim, 4 * dim) + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + if layer_scale_init_value > 0: + self.gamma = self.create_parameter( + shape=[dim], + default_initializer=Constant(value=layer_scale_init_value)) + else: + self.gamma = None + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = x.transpose([0, 2, 3, 1]) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.transpose([0, 3, 1, 2]) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class ConvNeXt(nn.Layer): + r""" ConvNeXt + A PaddlePaddle impl of : `A ConvNet for the 2020s` - + https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + class_num (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_path_rate (float): Stochastic depth rate. Default: 0. + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
+ """ + + def __init__(self, + in_chans=3, + class_num=1000, + depths=[3, 3, 9, 3], + dims=[96, 192, 384, 768], + drop_path_rate=0., + layer_scale_init_value=1e-6, + head_init_scale=1.): + super().__init__() + + # stem and 3 intermediate downsampling conv layers + self.downsample_layers = nn.LayerList() + stem = nn.Sequential( + nn.Conv2D( + in_chans, dims[0], 4, stride=4), + ChannelsFirstLayerNorm( + dims[0], epsilon=1e-6)) + self.downsample_layers.append(stem) + for i in range(3): + downsample_layer = nn.Sequential( + ChannelsFirstLayerNorm( + dims[i], epsilon=1e-6), + nn.Conv2D( + dims[i], dims[i + 1], 2, stride=2), ) + self.downsample_layers.append(downsample_layer) + + # 4 feature resolution stages, each consisting of multiple residual blocks + self.stages = nn.LayerList() + dp_rates = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] + cur = 0 + for i in range(4): + stage = nn.Sequential(* [ + Block( + dim=dims[i], + drop_path=dp_rates[cur + j], + layer_scale_init_value=layer_scale_init_value) + for j in range(depths[i]) + ]) + self.stages.append(stage) + cur += depths[i] + + self.norm = nn.LayerNorm(dims[-1], epsilon=1e-6) # final norm layer + self.head = nn.Linear(dims[-1], class_num) + + self.apply(self._init_weights) + self.head.weight.set_value(self.head.weight * head_init_scale) + self.head.bias.set_value(self.head.bias * head_init_scale) + + def _init_weights(self, m): + if isinstance(m, (nn.Conv2D, nn.Linear)): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + + def forward_features(self, x): + for i in range(4): + x = self.downsample_layers[i](x) + x = self.stages[i](x) + # global average pooling, (N, C, H, W) -> (N, C) + return self.norm(x.mean([-2, -1])) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ConvNeXt_tiny(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt(depths=[3, 3, 9, 3], dims=[96, 192, 384, 768], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_tiny"], use_ssld=use_ssld) + return model + + +def ConvNeXt_small(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_small"], use_ssld=use_ssld) + return model + + +def ConvNeXt_base_224(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_base_224"], use_ssld=use_ssld) + return model + + +def ConvNeXt_base_384(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_base_384"], use_ssld=use_ssld) + return model + + +def ConvNeXt_large_224(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_large_224"], use_ssld=use_ssld) + return model + + +def ConvNeXt_large_384(pretrained=False, use_ssld=False, **kwargs): + model = ConvNeXt( + depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ConvNeXt_large_384"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cspnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cspnet.py new file mode 100644 index 000000000..c62206328 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cspnet.py @@ -0,0 +1,377 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
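[Editor's note] A quick way to sanity-check the ChannelsFirstLayerNorm added in convnext.py above is to compare it against paddle.nn.LayerNorm applied in channels-last layout. A minimal sketch, assuming the class from the new convnext.py module is in scope and using illustrative shapes:

import paddle
import paddle.nn as nn

x = paddle.randn([2, 64, 8, 8])                     # (N, C, H, W)
cf_norm = ChannelsFirstLayerNorm(64, epsilon=1e-6)  # normalizes over the channel axis
ln = nn.LayerNorm(64, epsilon=1e-6)                 # normalizes over the last axis

y_cf = cf_norm(x)
y_ln = ln(x.transpose([0, 2, 3, 1])).transpose([0, 3, 1, 2])
# With both layers at their default affine parameters (weight=1, bias=0),
# the two paths should agree up to numerical tolerance.
print(paddle.allclose(y_cf, y_ln, atol=1e-5).item())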
+ +# Code was heavily based on https://github.com/rwightman/pytorch-image-models +# reference: https://arxiv.org/abs/1911.11929 + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "CSPDarkNet53": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSPDarkNet53_pretrained.pdparams" +} + +MODEL_CFGS = { + "CSPDarkNet53": dict( + stem=dict( + out_chs=32, kernel_size=3, stride=1, pool=''), + stage=dict( + out_chs=(64, 128, 256, 512, 1024), + depth=(1, 2, 8, 8, 4), + stride=(2, ) * 5, + exp_ratio=(2., ) + (1., ) * 4, + bottle_ratio=(0.5, ) + (1.0, ) * 4, + block_ratio=(1., ) + (0.5, ) * 4, + down_growth=True, )) +} + +__all__ = ['CSPDarkNet53' + ] # model_registry will add each entrypoint fn to this + + +class ConvBnAct(nn.Layer): + def __init__(self, + input_channels, + output_channels, + kernel_size=1, + stride=1, + padding=None, + dilation=1, + groups=1, + act_layer=nn.LeakyReLU, + norm_layer=nn.BatchNorm2D): + super().__init__() + if padding is None: + padding = (kernel_size - 1) // 2 + self.conv = nn.Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr(), + bias_attr=False) + + self.bn = norm_layer(num_features=output_channels) + self.act = act_layer() + + def forward(self, inputs): + x = self.conv(inputs) + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + + +def create_stem(in_chans=3, + out_chs=32, + kernel_size=3, + stride=2, + pool='', + act_layer=None, + norm_layer=None): + stem = nn.Sequential() + if not isinstance(out_chs, (tuple, list)): + out_chs = [out_chs] + assert len(out_chs) + in_c = in_chans + for i, out_c in enumerate(out_chs): + conv_name = f'conv{i + 1}' + stem.add_sublayer( + conv_name, + ConvBnAct( + in_c, + out_c, + kernel_size, + stride=stride if i == 0 else 1, + act_layer=act_layer, + norm_layer=norm_layer)) + in_c = out_c + last_conv = conv_name + if pool: + stem.add_sublayer( + 'pool', nn.MaxPool2D( + kernel_size=3, stride=2, padding=1)) + return stem, dict( + num_chs=in_c, reduction=stride, module='.'.join(['stem', last_conv])) + + +class DarkBlock(nn.Layer): + def __init__(self, + in_chs, + out_chs, + dilation=1, + bottle_ratio=0.5, + groups=1, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2D, + attn_layer=None, + drop_block=None): + super(DarkBlock, self).__init__() + mid_chs = int(round(out_chs * bottle_ratio)) + ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + self.conv1 = ConvBnAct(in_chs, mid_chs, kernel_size=1, **ckwargs) + self.conv2 = ConvBnAct( + mid_chs, + out_chs, + kernel_size=3, + dilation=dilation, + groups=groups, + **ckwargs) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + x = x + shortcut + return x + + +class CrossStage(nn.Layer): + def __init__(self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + exp_ratio=1., + groups=1, + first_dilation=None, + down_growth=False, + cross_linear=False, + block_dpr=None, + block_fn=DarkBlock, + **block_kwargs): + super(CrossStage, self).__init__() + first_dilation = first_dilation or dilation + down_chs = out_chs if down_growth else in_chs + exp_chs = int(round(out_chs * exp_ratio)) + block_out_chs = int(round(out_chs * block_ratio)) + conv_kwargs = dict( + act_layer=block_kwargs.get('act_layer'), + 
norm_layer=block_kwargs.get('norm_layer')) + + if stride != 1 or first_dilation != dilation: + self.conv_down = ConvBnAct( + in_chs, + down_chs, + kernel_size=3, + stride=stride, + dilation=first_dilation, + groups=groups, + **conv_kwargs) + prev_chs = down_chs + else: + self.conv_down = None + prev_chs = in_chs + + self.conv_exp = ConvBnAct( + prev_chs, exp_chs, kernel_size=1, **conv_kwargs) + prev_chs = exp_chs // 2 # output of conv_exp is always split in two + + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_sublayer( + str(i), + block_fn(prev_chs, block_out_chs, dilation, bottle_ratio, + groups, **block_kwargs)) + prev_chs = block_out_chs + + # transition convs + self.conv_transition_b = ConvBnAct( + prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs) + self.conv_transition = ConvBnAct( + exp_chs, out_chs, kernel_size=1, **conv_kwargs) + + def forward(self, x): + if self.conv_down is not None: + x = self.conv_down(x) + x = self.conv_exp(x) + split = x.shape[1] // 2 + xs, xb = x[:, :split], x[:, split:] + xb = self.blocks(xb) + xb = self.conv_transition_b(xb) + out = self.conv_transition(paddle.concat([xs, xb], axis=1)) + return out + + +class DarkStage(nn.Layer): + def __init__(self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + groups=1, + first_dilation=None, + block_fn=DarkBlock, + block_dpr=None, + **block_kwargs): + super().__init__() + first_dilation = first_dilation or dilation + + self.conv_down = ConvBnAct( + in_chs, + out_chs, + kernel_size=3, + stride=stride, + dilation=first_dilation, + groups=groups, + act_layer=block_kwargs.get('act_layer'), + norm_layer=block_kwargs.get('norm_layer')) + + prev_chs = out_chs + block_out_chs = int(round(out_chs * block_ratio)) + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_sublayer( + str(i), + block_fn(prev_chs, block_out_chs, dilation, bottle_ratio, + groups, **block_kwargs)) + prev_chs = block_out_chs + + def forward(self, x): + x = self.conv_down(x) + x = self.blocks(x) + return x + + +def _cfg_to_stage_args(cfg, curr_stride=2, output_stride=32): + # get per stage args for stage and containing blocks, calculate strides to meet target output_stride + num_stages = len(cfg['depth']) + if 'groups' not in cfg: + cfg['groups'] = (1, ) * num_stages + if 'down_growth' in cfg and not isinstance(cfg['down_growth'], + (list, tuple)): + cfg['down_growth'] = (cfg['down_growth'], ) * num_stages + stage_strides = [] + stage_dilations = [] + stage_first_dilations = [] + dilation = 1 + for cfg_stride in cfg['stride']: + stage_first_dilations.append(dilation) + if curr_stride >= output_stride: + dilation *= cfg_stride + stride = 1 + else: + stride = cfg_stride + curr_stride *= stride + stage_strides.append(stride) + stage_dilations.append(dilation) + cfg['stride'] = stage_strides + cfg['dilation'] = stage_dilations + cfg['first_dilation'] = stage_first_dilations + stage_args = [ + dict(zip(cfg.keys(), values)) for values in zip(*cfg.values()) + ] + return stage_args + + +class CSPNet(nn.Layer): + def __init__(self, + cfg, + in_chans=3, + class_num=1000, + output_stride=32, + global_pool='avg', + drop_rate=0., + act_layer=nn.LeakyReLU, + norm_layer=nn.BatchNorm2D, + zero_init_last_bn=True, + stage_fn=CrossStage, + block_fn=DarkBlock): + super().__init__() + self.class_num = class_num + self.drop_rate = drop_rate + assert output_stride in (8, 16, 32) + layer_args = dict(act_layer=act_layer, norm_layer=norm_layer) + + # Construct the stem + self.stem, 
stem_feat_info = create_stem(in_chans, **cfg['stem'], + **layer_args) + self.feature_info = [stem_feat_info] + prev_chs = stem_feat_info['num_chs'] + curr_stride = stem_feat_info[ + 'reduction'] # reduction does not include pool + if cfg['stem']['pool']: + curr_stride *= 2 + + # Construct the stages + per_stage_args = _cfg_to_stage_args( + cfg['stage'], curr_stride=curr_stride, output_stride=output_stride) + self.stages = nn.LayerList() + for i, sa in enumerate(per_stage_args): + self.stages.add_sublayer( + str(i), + stage_fn( + prev_chs, **sa, **layer_args, block_fn=block_fn)) + prev_chs = sa['out_chs'] + curr_stride *= sa['stride'] + self.feature_info += [ + dict( + num_chs=prev_chs, + reduction=curr_stride, + module=f'stages.{i}') + ] + + # Construct the head + self.num_features = prev_chs + + self.pool = nn.AdaptiveAvgPool2D(1) + self.flatten = nn.Flatten(1) + self.fc = nn.Linear( + prev_chs, + class_num, + weight_attr=ParamAttr(), + bias_attr=ParamAttr()) + + def forward(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + x = self.pool(x) + x = self.flatten(x) + x = self.fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def CSPDarkNet53(pretrained=False, use_ssld=False, **kwargs): + model = CSPNet(MODEL_CFGS["CSPDarkNet53"], block_fn=DarkBlock, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CSPDarkNet53"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cswin_transformer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cswin_transformer.py new file mode 100644 index 000000000..9d6d26cbe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cswin_transformer.py @@ -0,0 +1,651 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
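[Editor's note] The stride/dilation bookkeeping in _cfg_to_stage_args above can be observed by feeding it the CSPDarkNet53 stage config with a reduced target output stride. A minimal sketch, assuming the names from the new cspnet.py module are in scope; the curr_stride/output_stride values and the expected printout (traced by hand from the function) are illustrative:

import copy

stage_cfg = copy.deepcopy(MODEL_CFGS["CSPDarkNet53"]["stage"])
stage_args = _cfg_to_stage_args(stage_cfg, curr_stride=2, output_stride=16)
for sa in stage_args:
    print(sa["out_chs"], sa["stride"], sa["dilation"])
# Once the running stride reaches output_stride, later stages switch to
# stride 1 and grow the dilation instead:
#   strides   -> 2, 2, 2, 1, 1
#   dilations -> 1, 1, 1, 2, 4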
+ +# Code was based on https://github.com/BR-IDL/PaddleViT/blob/develop/image_classification/CSwin/cswin.py +# reference: https://arxiv.org/abs/2107.00652 + +import copy +import numpy as np +import paddle +import paddle.nn as nn +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "CSWinTransformer_tiny_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_tiny_224_pretrained.pdparams", + "CSWinTransformer_small_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_small_224_pretrained.pdparams", + "CSWinTransformer_base_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_base_224_pretrained.pdparams", + "CSWinTransformer_large_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_large_224_pretrained.pdparams", + "CSWinTransformer_base_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_base_384_pretrained.pdparams", + "CSWinTransformer_large_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CSWinTransformer_large_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class PatchEmbedding(nn.Layer): + """CSwin Patch Embedding + This patch embedding has a 7x7 conv + layernorm, the output tensor + is reshaped to [Batch, H*W, embed_dim]. Note that the patch is applied + by a conv with overlap (using patch_stride). + Args: + patch_stride: int, patch stride size, default: 4 + in_channels: int, number of channels of input image, default: 3 + embed_dim: int, output feature dimension, default: 96 + """ + + def __init__(self, patch_stride=4, in_channels=3, embed_dim=96): + super().__init__() + self.patch_embed = nn.Conv2D( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=7, + stride=patch_stride, + padding=2) + + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.patch_embed( + x) # [batch, embed_dim, h, w], h = w = image_size / 4 + x = x.flatten(start_axis=2, stop_axis=-1) # [batch, embed_dim, h*w] + x = x.transpose([0, 2, 1]) # [batch, h*w, embed_dim] + x = self.norm(x) + return x + + +class Mlp(nn.Layer): + """ MLP module + Impl using nn.Linear and activation is GELU, dropout is applied. 
+ Ops: fc -> act -> dropout -> fc -> dropout + Attributes: + fc1: nn.Linear + fc2: nn.Linear + act: GELU + dropout1: dropout after fc1 + dropout2: dropout after fc2 + """ + + def __init__(self, in_features, hidden_features, dropout): + super().__init__() + self.fc1 = nn.Linear(in_features, hidden_features) + self.fc2 = nn.Linear(hidden_features, in_features) + self.act = nn.GELU() + self.dropout = nn.Dropout(dropout) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout(x) + x = self.fc2(x) + x = self.dropout(x) + return x + + +def img2windows(img, h_split, w_split): + """Convert input tensor into split stripes + Args: + img: tensor, image tensor with shape [B, C, H, W] + h_split: int, splits width in height direction + w_split: int, splits width in width direction + Returns: + out: tensor, splitted image + """ + B, C, H, W = img.shape + out = img.reshape([B, C, H // h_split, h_split, W // w_split, w_split]) + out = out.transpose( + [0, 2, 4, 3, 5, 1]) # [B, H//h_split, W//w_split, h_split, w_split, C] + out = out.reshape([-1, h_split * w_split, + C]) # [B, H//h_split, W//w_split, h_split*w_split, C] + return out + + +def windows2img(img_splits, h_split, w_split, img_h, img_w): + """Convert splitted stripes back + Args: + img_splits: tensor, image tensor with shape [B, C, H, W] + h_split: int, splits width in height direction + w_split: int, splits width in width direction + img_h: int, original tensor height + img_w: int, original tensor width + Returns: + img: tensor, original tensor + """ + B = paddle.to_tensor(img_splits.shape[0] // + (img_h // h_split * img_w // w_split), "int32") + img = img_splits.reshape([ + B, img_h // h_split, img_w // w_split, h_split, w_split, + img_splits.shape[-1] + ]) + img = img.transpose( + [0, 1, 3, 2, 4, + 5]) #[B,img_h//h_split, h_split, img_w//w_split, w_split,C] + img = img.reshape( + [B, img_h, img_w, img_splits.shape[-1]]) # [B, img_h, img_w, C] + return img + + +class LePEAttention(nn.Layer): + """Cross Shaped Window self-attention with Locally enhanced positional encoding""" + + def __init__(self, + dim, + resolution, + h_split=7, + w_split=7, + num_heads=8, + attention_dropout=0., + dropout=0., + qk_scale=None): + super().__init__() + self.dim = dim + self.resolution = resolution + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.scale = qk_scale or self.dim_head**-0.5 + self.h_split = h_split + self.w_split = w_split + + self.get_v = nn.Conv2D( + in_channels=dim, + out_channels=dim, + kernel_size=3, + stride=1, + padding=1, + groups=dim) + + self.softmax = nn.Softmax(axis=-1) + self.attn_dropout = nn.Dropout(attention_dropout) + + def im2cswin(self, x): + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + x = x.transpose([0, 2, 1]) # [B, C, H*W] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = img2windows(x, self.h_split, self.w_split) + x = x.reshape( + [-1, self.h_split * self.w_split, self.num_heads, self.dim_head]) + x = x.transpose([0, 2, 1, 3]) + return x + + def get_lepe(self, x, func): + """Locally Enhanced Positional Encoding (LePE) + This module applies a depthwise conv on V and returns the lepe + Args: + x: tensor, the input tensor V + func: nn.Layer, a depth wise conv of kernel 3 stride 1 and padding 1 + """ + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + h_split = self.h_split + w_split = self.w_split + + x = x.transpose([0, 2, 1]) # [B, C, H*W] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = x.reshape([B, C, H // h_split, h_split, W // w_split, w_split]) + x = x.transpose( + [0, 2, 
4, 1, 3, + 5]) # [B, H//h_split, W//w_split, C, h_split, w_split] + x = x.reshape( + [-1, C, h_split, + w_split]) # [B*(H//h_split)*(W//w_split), h_split, w_split] + + lepe = func(x) # depth wise conv does not change shape + #lepe = lepe.reshape([-1, self.num_heads, C // self.num_heads, h_split * w_split]) + lepe = lepe.reshape( + [-1, self.num_heads, self.dim_head, h_split * w_split]) + lepe = lepe.transpose( + [0, 1, 3, 2]) # [B, num_heads, h_spllit*w_split, dim_head] + + x = x.reshape([-1, self.num_heads, self.dim_head, h_split * w_split]) + x = x.transpose( + [0, 1, 3, 2]) # [B, num_heads, h_split*wsplit, dim_head] + return x, lepe + + def forward(self, q, k, v): + B, HW, C = q.shape + H = W = self.resolution + q = self.im2cswin(q) + k = self.im2cswin(k) + v, lepe = self.get_lepe(v, self.get_v) + + q = q * self.scale + attn = paddle.matmul(q, k, transpose_y=True) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + + z = paddle.matmul(attn, v) + z = z + lepe + z = z.transpose([0, 2, 1, 3]) + z = z.reshape([-1, self.h_split * self.w_split, C]) + + z = windows2img(z, self.h_split, self.w_split, H, W) + z = z.reshape([B, z.shape[1] * z.shape[2], C]) + return z + + +class CSwinBlock(nn.Layer): + """CSwin Block + CSwin block contains a LePE attention modual, a linear projection, + a mlp layer, and related norms layers. In the first 3 stages, the + LePE attention moduals used 2 branches, where horizontal and + vertical split stripes are used for self attention and a concat + op is applied to combine the outputs. The last stage does not + have branche in LePE attention. + Args: + dim: int, input feature dimension + input_resolution: int, input feature spatial size. + num_heads: int, num of attention heads in current stage + split_size: int, the split size in current stage + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + split_heads: bool, if True, split heads is applied (True for 1,2,3 stages), default: True + """ + + def __init__(self, + dim, + input_resolution, + num_heads, + split_size=7, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + attention_dropout=0., + dropout=0., + droppath=0., + split_heads=True): + super().__init__() + self.dim = dim + # NOTE: here assume image_h == imgae_w + self.input_resolution = (input_resolution, input_resolution) + self.num_heads = num_heads + self.dim_head = dim // num_heads + self.mlp_ratio = mlp_ratio + self.split_size = split_size + self.norm1 = nn.LayerNorm(dim) + self.qkv = nn.Linear( + dim, dim * 3, bias_attr=None if qkv_bias else False) + self.attns = nn.LayerList() + self.split_heads = split_heads + + num_branches = 2 if split_heads else 1 + if split_heads: # first 3 stages + splits = [self.input_resolution[0], + self.split_size] # horizantal splits + else: # last stage + splits = [self.input_resolution[0], self.input_resolution[0]] + for _ in range(num_branches): + attn = LePEAttention( + dim=dim // num_branches, + resolution=input_resolution, + h_split=splits[0], + w_split=splits[1], + num_heads=num_heads // num_branches, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout) + self.attns.append(copy.deepcopy(attn)) + # switch splits from horizantal to vertical + # NOTE: may need to change for different H and W + splits[0], splits[1] = splits[1], splits[0] + + self.proj = nn.Linear(dim, dim) + self.drop_path = DropPath(droppath) if droppath > 0. 
else Identity() + + self.norm2 = nn.LayerNorm(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + dropout=dropout) + + def chunk_qkv(self, x, chunks=1, axis=-1): + x = x.chunk(chunks, axis=axis) + return x + + def forward(self, x): + H, W = self.input_resolution + B, HW, C = x.shape + # cswin attention + h = x + x = self.norm1(x) + qkv = self.qkv(x).chunk(3, axis=-1) # qkv is a tuple of [q, k, v] + + if self.split_heads: + q, k, v = map(self.chunk_qkv, qkv, + (2, 2, 2)) # map requries list/tuple inputs + else: + q, k, v = map(lambda x: [x], qkv) + + if self.split_heads: # first 3 stages + h_attn = self.attns[0](q[0], k[0], v[0]) + w_attn = self.attns[1](q[1], k[1], v[1]) + attn = paddle.concat([h_attn, w_attn], axis=2) + else: # last stage + attn = self.attns[0](q[0], k[0], v[0]) + attn = self.proj(attn) + attn = self.drop_path(attn) + x = h + attn + # mlp + residual + h = x + x = self.norm2(x) + x = self.mlp(x) + x = self.drop_path(x) + x = h + x + return x + + +class MergeBlock(nn.Layer): + def __init__(self, dim_in, dim_out): + super().__init__() + self.conv = nn.Conv2D( + in_channels=dim_in, + out_channels=dim_out, + kernel_size=3, + stride=2, + padding=1) + self.norm = nn.LayerNorm(dim_out) + + def forward(self, x): + B, HW, C = x.shape + H = W = int(np.sqrt(HW)) + x = x.transpose([0, 2, 1]) # [B, C, HW] + x = x.reshape([B, C, H, W]) # [B, C, H, W] + x = self.conv(x) + new_shape = [x.shape[0], x.shape[1], + x.shape[2] * x.shape[3]] # [B, C', H*W] + x = x.reshape(new_shape) # [B, C', H*W] + x = x.transpose([0, 2, 1]) # [B, H*W, C'] + x = self.norm(x) + return x + + +class CSwinStage(nn.Layer): + """ CSwin Stage, each stage contains multi blocks + CSwin has 4 stages, the first 3 stages are using head split. The last + stage does not have head split. There is a merge block between each + 2 stages. + Args: + dim: int, input feature dimension + depth: int, number of blocks in current stage + num_heads: int, num of attention heads in current stage + split_size: int, the split size in current stage + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + last_stage: bool, if current stage is the last stage, default: False + """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + split_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0., + last_stage=False): + super().__init__() + self.blocks = nn.LayerList() + for i in range(depth): + block = CSwinBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + split_size=split_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attention_dropout=attention_dropout, + dropout=dropout, + droppath=droppath[i] + if isinstance(droppath, list) else droppath, + split_heads=not last_stage) + self.blocks.append(copy.deepcopy(block)) + # last stage does not need merge layer + self.merge = MergeBlock( + dim_in=dim, dim_out=dim * 2) if not last_stage else Identity() + + def forward(self, x): + for block in self.blocks: + x = block(x) + x = self.merge(x) + return x + + +class CSwinTransformer(nn.Layer): + """CSwin Transformer class + Args: + image_size: int, input image size, default: 224 + patch_stride: int, stride for patch embedding, default: 4 + in_channels: int, num of channels of input image, default: 3 + num_classes: int, num of classes, default: 1000 + embed_dim: int, embedding dim (patch embed out dim), default: 96 + depths: list/tuple(int), number of blocks in each stage, default: [2, 4, 32, 2] + splits: list/tuple(int), the split number in each stage, default: [1, 2, 7, 7] + num_heads: list/tuple(int), num of attention heads in each stage, default: [4, 8, 16, 32] + mlp_ratio: float, mlp ratio, mlp_hidden_dim = mlp_ratio * mlp_in_dim, default: 4. 
+ qkv_bias: bool, if set True, qkv projection will have bias, default: True + qk_scale: float, if set, replace the orig qk_scale (dim_head ** -0.5), default: None + dropout: float, dropout rate for linear projection, default: 0 + attention_dropout: float, dropout rate for attention, default: 0 + droppath: float, drop path rate, default: 0 + """ + + def __init__(self, + image_size=224, + patch_stride=4, + in_channels=3, + class_num=1000, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[4, 8, 16, 32], + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + dropout=0., + attention_dropout=0., + droppath=0.): + super().__init__() + # token embedding + self.patch_embedding = PatchEmbedding( + patch_stride=patch_stride, + in_channels=in_channels, + embed_dim=embed_dim) + # drop path decay by stage + depth_decay = [ + x.item() for x in paddle.linspace(0, droppath, sum(depths)) + ] + dim = embed_dim + resolution = image_size // 4 + self.stages = nn.LayerList() + num_stages = len(depths) + # construct CSwin stages: each stage has multiple blocks + for stage_idx in range(num_stages): + stage = CSwinStage( + dim=dim, + input_resolution=resolution, + depth=depths[stage_idx], + num_heads=num_heads[stage_idx], + split_size=splits[stage_idx], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=depth_decay[sum(depths[:stage_idx]):sum( + depths[:stage_idx + 1])], + last_stage=stage_idx == num_stages - 1) + self.stages.append(stage) + if stage_idx != num_stages - 1: + dim = dim * 2 + resolution = resolution // 2 + # last norm and classification head layers + self.norm = nn.LayerNorm(dim) + self.head = nn.Linear(dim, class_num) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embedding(x) + for stage in self.stages: + x = stage(x) + x = self.norm(x) + return paddle.mean(x, axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def CSWinTransformer_tiny_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=64, + depths=[1, 2, 21, 1], + splits=[1, 2, 7, 7], + num_heads=[2, 4, 8, 16], + droppath=0.2, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_tiny_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_small_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=64, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[2, 4, 8, 16], + droppath=0.4, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_small_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_base_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[4, 8, 16, 32], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_base_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_base_384(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=384, + embed_dim=96, + depths=[2, 4, 32, 2], + splits=[1, 2, 12, 12], + num_heads=[4, 8, 16, 32], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_base_384"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_large_224(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=224, + embed_dim=144, + depths=[2, 4, 32, 2], + splits=[1, 2, 7, 7], + num_heads=[6, 12, 24, 24], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_large_224"], + use_ssld=use_ssld) + return model + + +def CSWinTransformer_large_384(pretrained=False, use_ssld=False, **kwargs): + model = CSwinTransformer( + image_size=384, + embed_dim=144, + depths=[2, 4, 32, 2], + splits=[1, 2, 12, 12], + num_heads=[6, 12, 24, 24], + droppath=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CSWinTransformer_large_384"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cvt.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cvt.py new file mode 100644 index 000000000..0092fb3d7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/cvt.py @@ -0,0 +1,723 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
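[Editor's note] The stripe split/merge helpers img2windows and windows2img in cswin_transformer.py above are exact inverses up to an NCHW-to-NHWC transpose, which a small round-trip check confirms. A minimal sketch with illustrative shapes, assuming the helpers from the new module are in scope:

import paddle

x = paddle.randn([2, 32, 14, 14])              # [B, C, H, W]
windows = img2windows(x, 7, 7)                 # [B * (14//7) * (14//7), 7*7, C]
restored = windows2img(windows, 7, 7, 14, 14)  # [B, H, W, C]
print(paddle.allclose(restored, x.transpose([0, 2, 3, 1])).item())  # expected: True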
+# +# Code was heavily based on https://github.com/microsoft/CvT +# reference: https://arxiv.org/abs/2103.15808 + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import XavierUniform, TruncatedNormal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "CvT_13_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_13_224_pretrained.pdparams", + "CvT_13_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_13_384_pretrained.pdparams", + "CvT_21_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_21_224_pretrained.pdparams", + "CvT_21_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_21_384_pretrained.pdparams", + "CvT_W24_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/CvT_W24_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +xavier_uniform_ = XavierUniform() +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + def extra_repr(self): + return f'drop_prob={self.drop_prob:.3f}' + + +def rearrange(x, pattern, **axes_lengths): + if 'b (h w) c -> b c h w' == pattern: + b, _, c = x.shape + h, w = axes_lengths.pop('h', -1), axes_lengths.pop('w', -1) + return x.transpose([0, 2, 1]).reshape([b, c, h, w]) + if 'b c h w -> b (h w) c' == pattern: + b, c, h, w = x.shape + return x.reshape([b, c, h * w]).transpose([0, 2, 1]) + if 'b t (h d) -> b h t d' == pattern: + b, t, h_d = x.shape + h = axes_lengths['h'] + return x.reshape([b, t, h, h_d // h]).transpose([0, 2, 1, 3]) + if 'b h t d -> b t (h d)' == pattern: + b, h, t, d = x.shape + return x.transpose([0, 2, 1, 3]).reshape([b, t, h * d]) + + raise NotImplementedError( + f"Rearrangement '{pattern}' has not been implemented.") + + +class Rearrange(nn.Layer): + def __init__(self, pattern, **axes_lengths): + super().__init__() + self.pattern = pattern + self.axes_lengths = axes_lengths + + def forward(self, x): + return rearrange(x, self.pattern, **self.axes_lengths) + + def extra_repr(self): + return self.pattern + + +class QuickGELU(nn.Layer): + def forward(self, x): + return x * nn.functional.sigmoid(1.702 * x) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim_in, + dim_out, + num_heads, + qkv_bias=False, + attn_drop=0., + proj_drop=0., + method='dw_bn', + kernel_size=3, + stride_kv=1, + stride_q=1, + padding_kv=1, + padding_q=1, + with_cls_token=True, + **kwargs): + super().__init__() + self.stride_kv = stride_kv + self.stride_q = stride_q + self.dim = dim_out + self.num_heads = num_heads + # head_dim = self.qkv_dim // num_heads + self.scale = dim_out**-0.5 + self.with_cls_token = with_cls_token + + self.conv_proj_q = self._build_projection( + dim_in, dim_out, kernel_size, padding_q, stride_q, 'linear' + if method == 'avg' else method) + self.conv_proj_k = self._build_projection( + dim_in, dim_out, kernel_size, padding_kv, stride_kv, method) + self.conv_proj_v = self._build_projection( + dim_in, dim_out, kernel_size, padding_kv, stride_kv, method) + + self.proj_q = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + self.proj_k = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + self.proj_v = nn.Linear(dim_in, dim_out, bias_attr=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim_out, dim_out) + self.proj_drop = nn.Dropout(proj_drop) + + def _build_projection(self, dim_in, dim_out, kernel_size, padding, stride, + method): + if method == 'dw_bn': + proj = nn.Sequential( + ('conv', nn.Conv2D( + dim_in, + dim_in, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias_attr=False, + groups=dim_in)), ('bn', nn.BatchNorm2D(dim_in)), + ('rearrage', Rearrange('b c h w -> b (h w) c'))) + elif method == 'avg': + proj = nn.Sequential( + ('avg', nn.AvgPool2D( + kernel_size=kernel_size, + 
stride=stride, + padding=padding, + ceil_mode=True)), + ('rearrage', Rearrange('b c h w -> b (h w) c'))) + elif method == 'linear': + proj = None + else: + raise ValueError('Unknown method ({})'.format(method)) + + return proj + + def forward_conv(self, x, h, w): + if self.with_cls_token: + cls_token, x = paddle.split(x, [1, h * w], 1) + + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w) + + if self.conv_proj_q is not None: + q = self.conv_proj_q(x) + else: + q = rearrange(x, 'b c h w -> b (h w) c') + + if self.conv_proj_k is not None: + k = self.conv_proj_k(x) + else: + k = rearrange(x, 'b c h w -> b (h w) c') + + if self.conv_proj_v is not None: + v = self.conv_proj_v(x) + else: + v = rearrange(x, 'b c h w -> b (h w) c') + + if self.with_cls_token: + q = paddle.concat((cls_token, q), axis=1) + k = paddle.concat((cls_token, k), axis=1) + v = paddle.concat((cls_token, v), axis=1) + + return q, k, v + + def forward(self, x, h, w): + if (self.conv_proj_q is not None or self.conv_proj_k is not None or + self.conv_proj_v is not None): + q, k, v = self.forward_conv(x, h, w) + + q = rearrange(self.proj_q(q), 'b t (h d) -> b h t d', h=self.num_heads) + k = rearrange(self.proj_k(k), 'b t (h d) -> b h t d', h=self.num_heads) + v = rearrange(self.proj_v(v), 'b t (h d) -> b h t d', h=self.num_heads) + + attn_score = (q @k.transpose([0, 1, 3, 2])) * self.scale + attn = nn.functional.softmax(attn_score, axis=-1) + attn = self.attn_drop(attn) + + x = attn @v + x = rearrange(x, 'b h t d -> b t (h d)') + + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim_in, + dim_out, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + **kwargs): + super().__init__() + + self.with_cls_token = kwargs['with_cls_token'] + + self.norm1 = norm_layer(dim_in) + self.attn = Attention(dim_in, dim_out, num_heads, qkv_bias, attn_drop, + drop, **kwargs) + + self.drop_path = DropPath(drop_path) \ + if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim_out) + + dim_mlp_hidden = int(dim_out * mlp_ratio) + self.mlp = Mlp(in_features=dim_out, + hidden_features=dim_mlp_hidden, + act_layer=act_layer, + drop=drop) + + def forward(self, x, h, w): + x = x + self.drop_path(self.attn(self.norm1(x), h, w)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ConvEmbed(nn.Layer): + def __init__(self, + patch_size=7, + in_chans=3, + embed_dim=64, + stride=4, + padding=2, + norm_layer=None): + super().__init__() + self.patch_size = patch_size + + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=padding) + self.norm = norm_layer(embed_dim) if norm_layer else None + + def forward(self, x): + x = self.proj(x) + + B, C, H, W = x.shape + x = rearrange(x, 'b c h w -> b (h w) c') + if self.norm: + x = self.norm(x) + x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W) + + return x + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + patch_size=16, + patch_stride=16, + patch_padding=0, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + init='trunc_norm', + **kwargs): + super().__init__() + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + + self.rearrage = None + + self.patch_embed = ConvEmbed( + # img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + stride=patch_stride, + padding=patch_padding, + embed_dim=embed_dim, + norm_layer=norm_layer) + + with_cls_token = kwargs['with_cls_token'] + if with_cls_token: + self.cls_token = self.create_parameter( + shape=[1, 1, embed_dim], default_initializer=trunc_normal_) + else: + self.cls_token = None + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [x.item() for x in paddle.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + blocks = [] + for j in range(depth): + blocks.append( + Block( + dim_in=embed_dim, + dim_out=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[j], + act_layer=act_layer, + norm_layer=norm_layer, + **kwargs)) + self.blocks = nn.LayerList(blocks) + + if init == 'xavier': + self.apply(self._init_weights_xavier) + else: + self.apply(self._init_weights_trunc_normal) + + def _init_weights_trunc_normal(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2D)): + zeros_(m.bias) + ones_(m.weight) + + def _init_weights_xavier(self, m): + if isinstance(m, nn.Linear): + xavier_uniform_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2D)): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + x = self.patch_embed(x) + B, C, H, W = x.shape + + x = rearrange(x, 'b c h w -> b (h w) c') + + cls_tokens = None + if self.cls_token is not None: + # stole cls_tokens impl from Phil Wang, thanks + cls_tokens = self.cls_token.expand([B, -1, -1]) + x = paddle.concat((cls_tokens, x), axis=1) + + x = self.pos_drop(x) + + for i, blk in enumerate(self.blocks): + x = blk(x, H, W) + + if self.cls_token is not None: + cls_tokens, x = paddle.split(x, [1, H * W], 1) + x = rearrange(x, 'b (h w) c -> b c h w', h=H, w=W) + + return x, cls_tokens + 
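[Editor's note] The local rearrange helper in cvt.py above is a small stand-in for einops that supports only the four patterns used by CvT; it can be exercised on its own. A minimal sketch with illustrative shapes, assuming the helper from the new module is in scope:

import paddle

x = paddle.randn([2, 64, 7, 7])                            # [B, C, H, W]
tokens = rearrange(x, 'b c h w -> b (h w) c')              # [2, 49, 64]
back = rearrange(tokens, 'b (h w) c -> b c h w', h=7, w=7)
print(paddle.allclose(back, x).item())                     # expected: True

heads = rearrange(tokens, 'b t (h d) -> b h t d', h=8)     # [2, 8, 49, 8]
merged = rearrange(heads, 'b h t d -> b t (h d)')          # back to [2, 49, 64]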
+ +class ConvolutionalVisionTransformer(nn.Layer): + def __init__(self, + in_chans=3, + class_num=1000, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + init='trunc_norm', + spec=None): + super().__init__() + self.class_num = class_num + + self.num_stages = spec['NUM_STAGES'] + for i in range(self.num_stages): + kwargs = { + 'patch_size': spec['PATCH_SIZE'][i], + 'patch_stride': spec['PATCH_STRIDE'][i], + 'patch_padding': spec['PATCH_PADDING'][i], + 'embed_dim': spec['DIM_EMBED'][i], + 'depth': spec['DEPTH'][i], + 'num_heads': spec['NUM_HEADS'][i], + 'mlp_ratio': spec['MLP_RATIO'][i], + 'qkv_bias': spec['QKV_BIAS'][i], + 'drop_rate': spec['DROP_RATE'][i], + 'attn_drop_rate': spec['ATTN_DROP_RATE'][i], + 'drop_path_rate': spec['DROP_PATH_RATE'][i], + 'with_cls_token': spec['CLS_TOKEN'][i], + 'method': spec['QKV_PROJ_METHOD'][i], + 'kernel_size': spec['KERNEL_QKV'][i], + 'padding_q': spec['PADDING_Q'][i], + 'padding_kv': spec['PADDING_KV'][i], + 'stride_kv': spec['STRIDE_KV'][i], + 'stride_q': spec['STRIDE_Q'][i], + } + + stage = VisionTransformer( + in_chans=in_chans, + init=init, + act_layer=act_layer, + norm_layer=norm_layer, + **kwargs) + setattr(self, f'stage{i}', stage) + + in_chans = spec['DIM_EMBED'][i] + + dim_embed = spec['DIM_EMBED'][-1] + self.norm = norm_layer(dim_embed) + self.cls_token = spec['CLS_TOKEN'][-1] + + # Classifier head + self.head = nn.Linear(dim_embed, + class_num) if class_num > 0 else nn.Identity() + trunc_normal_(self.head.weight) + + bound = 1 / dim_embed**.5 + nn.initializer.Uniform(-bound, bound)(self.head.bias) + + def no_weight_decay(self): + layers = set() + for i in range(self.num_stages): + layers.add(f'stage{i}.pos_embed') + layers.add(f'stage{i}.cls_token') + return layers + + def forward_features(self, x): + for i in range(self.num_stages): + x, cls_tokens = getattr(self, f'stage{i}')(x) + + if self.cls_token: + x = self.norm(cls_tokens) + x = paddle.squeeze(x, axis=1) + else: + x = rearrange(x, 'b c h w -> b (h w) c') + x = self.norm(x) + x = paddle.mean(x, axis=1) + + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, + model, + model_url, + use_ssld=False, + use_imagenet22kto1k_pretrained=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def CvT_13_224(pretrained=False, use_ssld=False, **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 2, 10], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CvT_13_224"], use_ssld=use_ssld) + return model + + +def CvT_13_384(pretrained=False, + use_ssld=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 2, 10], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CvT_13_384"], + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def CvT_21_224(pretrained=False, use_ssld=False, **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 4, 16], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["CvT_21_224"], use_ssld=use_ssld) + return model + + +def CvT_21_384(pretrained=False, + use_ssld=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[64, 192, 384], + NUM_HEADS=[1, 3, 6], + DEPTH=[1, 4, 16], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.1], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + 
STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CvT_21_384"], + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def CvT_W24_384(pretrained=False, use_ssld=False, **kwargs): + msvit_spec = dict( + INIT='trunc_norm', + NUM_STAGES=3, + PATCH_SIZE=[7, 3, 3], + PATCH_STRIDE=[4, 2, 2], + PATCH_PADDING=[2, 1, 1], + DIM_EMBED=[192, 768, 1024], + NUM_HEADS=[3, 12, 16], + DEPTH=[2, 2, 20], + MLP_RATIO=[4.0, 4.0, 4.0], + ATTN_DROP_RATE=[0.0, 0.0, 0.0], + DROP_RATE=[0.0, 0.0, 0.0], + DROP_PATH_RATE=[0.0, 0.0, 0.3], + QKV_BIAS=[True, True, True], + CLS_TOKEN=[False, False, True], + POS_EMBED=[False, False, False], + QKV_PROJ_METHOD=['dw_bn', 'dw_bn', 'dw_bn'], + KERNEL_QKV=[3, 3, 3], + PADDING_KV=[1, 1, 1], + STRIDE_KV=[2, 2, 2], + PADDING_Q=[1, 1, 1], + STRIDE_Q=[1, 1, 1]) + model = ConvolutionalVisionTransformer( + in_chans=3, + act_layer=QuickGELU, + init=msvit_spec.get('INIT', 'trunc_norm'), + spec=msvit_spec, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["CvT_W24_384"], + use_ssld=use_ssld, + use_imagenet22kto1k_pretrained=True) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/darknet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/darknet.py new file mode 100644 index 000000000..2474c1587 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/darknet.py @@ -0,0 +1,199 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
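Every backbone added in this patch shares the same _load_pretrained convention: pretrained=False leaves the weights at their random initialization, pretrained=True downloads the matching MODEL_URLS entry, a string is treated as a local checkpoint path, and anything else raises RuntimeError. A minimal sketch using the DarkNet53 factory added just below; the import path mirrors the PaddleClas 2.6 layout and the local checkpoint filename is hypothetical.

# Hedged sketch, not part of the patch: the shared pretrained-loading convention.
import paddle
# Assumption: the file lands at ppcls.arch.backbone.model_zoo.darknet.
from ppcls.arch.backbone.model_zoo.darknet import DarkNet53

model = DarkNet53(pretrained=False, class_num=1000)      # random init
# model = DarkNet53(pretrained=True)                     # fetch MODEL_URLS["DarkNet53"]
# model = DarkNet53(pretrained="DarkNet53_pretrained.pdparams")  # hypothetical local file

model.eval()
with paddle.no_grad():
    logits = model(paddle.randn([1, 3, 224, 224]))
print(logits.shape)   # [1, 1000]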
+ +# reference: https://arxiv.org/abs/1804.02767 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DarkNet53": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DarkNet53_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride, + padding, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + ".conv.weights"), + bias_attr=False) + + bn_name = name + ".bn" + self._bn = BatchNorm( + num_channels=output_channels, + act="relu", + param_attr=ParamAttr(name=bn_name + ".scale"), + bias_attr=ParamAttr(name=bn_name + ".offset"), + moving_mean_name=bn_name + ".mean", + moving_variance_name=bn_name + ".var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class BasicBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name=None): + super(BasicBlock, self).__init__() + + self._conv1 = ConvBNLayer( + input_channels, output_channels, 1, 1, 0, name=name + ".0") + self._conv2 = ConvBNLayer( + output_channels, output_channels * 2, 3, 1, 1, name=name + ".1") + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + return paddle.add(x=inputs, y=x) + + +class DarkNet(nn.Layer): + def __init__(self, class_num=1000): + super(DarkNet, self).__init__() + + self.stages = [1, 2, 8, 8, 4] + self._conv1 = ConvBNLayer(3, 32, 3, 1, 1, name="yolo_input") + self._conv2 = ConvBNLayer( + 32, 64, 3, 2, 1, name="yolo_input.downsample") + + self._basic_block_01 = BasicBlock(64, 32, name="stage.0.0") + self._downsample_0 = ConvBNLayer( + 64, 128, 3, 2, 1, name="stage.0.downsample") + + self._basic_block_11 = BasicBlock(128, 64, name="stage.1.0") + self._basic_block_12 = BasicBlock(128, 64, name="stage.1.1") + self._downsample_1 = ConvBNLayer( + 128, 256, 3, 2, 1, name="stage.1.downsample") + + self._basic_block_21 = BasicBlock(256, 128, name="stage.2.0") + self._basic_block_22 = BasicBlock(256, 128, name="stage.2.1") + self._basic_block_23 = BasicBlock(256, 128, name="stage.2.2") + self._basic_block_24 = BasicBlock(256, 128, name="stage.2.3") + self._basic_block_25 = BasicBlock(256, 128, name="stage.2.4") + self._basic_block_26 = BasicBlock(256, 128, name="stage.2.5") + self._basic_block_27 = BasicBlock(256, 128, name="stage.2.6") + self._basic_block_28 = BasicBlock(256, 128, name="stage.2.7") + self._downsample_2 = ConvBNLayer( + 256, 512, 3, 2, 1, name="stage.2.downsample") + + self._basic_block_31 = BasicBlock(512, 256, name="stage.3.0") + self._basic_block_32 = BasicBlock(512, 256, name="stage.3.1") + self._basic_block_33 = BasicBlock(512, 256, name="stage.3.2") + self._basic_block_34 = BasicBlock(512, 256, name="stage.3.3") + self._basic_block_35 = BasicBlock(512, 256, name="stage.3.4") + self._basic_block_36 = BasicBlock(512, 256, name="stage.3.5") + self._basic_block_37 = BasicBlock(512, 256, name="stage.3.6") + self._basic_block_38 = BasicBlock(512, 256, name="stage.3.7") + self._downsample_3 = ConvBNLayer( + 512, 1024, 3, 
2, 1, name="stage.3.downsample") + + self._basic_block_41 = BasicBlock(1024, 512, name="stage.4.0") + self._basic_block_42 = BasicBlock(1024, 512, name="stage.4.1") + self._basic_block_43 = BasicBlock(1024, 512, name="stage.4.2") + self._basic_block_44 = BasicBlock(1024, 512, name="stage.4.3") + + self._pool = AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(1024.0) + self._out = Linear( + 1024, + class_num, + weight_attr=ParamAttr( + name="fc_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + + x = self._basic_block_01(x) + x = self._downsample_0(x) + + x = self._basic_block_11(x) + x = self._basic_block_12(x) + x = self._downsample_1(x) + + x = self._basic_block_21(x) + x = self._basic_block_22(x) + x = self._basic_block_23(x) + x = self._basic_block_24(x) + x = self._basic_block_25(x) + x = self._basic_block_26(x) + x = self._basic_block_27(x) + x = self._basic_block_28(x) + x = self._downsample_2(x) + + x = self._basic_block_31(x) + x = self._basic_block_32(x) + x = self._basic_block_33(x) + x = self._basic_block_34(x) + x = self._basic_block_35(x) + x = self._basic_block_36(x) + x = self._basic_block_37(x) + x = self._basic_block_38(x) + x = self._downsample_3(x) + + x = self._basic_block_41(x) + x = self._basic_block_42(x) + x = self._basic_block_43(x) + x = self._basic_block_44(x) + + x = self._pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DarkNet53(pretrained=False, use_ssld=False, **kwargs): + model = DarkNet(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DarkNet53"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/densenet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/densenet.py new file mode 100644 index 000000000..314579a88 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/densenet.py @@ -0,0 +1,346 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
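The DenseNet implementation that follows tracks its channel count explicitly: each dense block adds num_layers * growth_rate features, and each transition layer halves the running total. For the 121-layer configuration (64 initial features, growth rate 32, blocks of 6/12/24/16) that arithmetic lands on 1024 features in front of the final BatchNorm and Linear head. A small sketch of the bookkeeping, plain Python only:

# Hedged sketch, not part of the patch: mirrors the feature-width arithmetic
# in DenseNet.__init__ for the DenseNet121 spec.
num_init_features, growth_rate, block_config = 64, 32, [6, 12, 24, 16]

num_features = num_init_features
for i, num_layers in enumerate(block_config):
    num_features += num_layers * growth_rate   # dense block i
    if i != len(block_config) - 1:
        num_features //= 2                     # transition layer after block i
print(num_features)   # 1024 -> input width of the classifier head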
+ +# reference: https://arxiv.org/abs/1608.06993 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DenseNet121": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet121_pretrained.pdparams", + "DenseNet161": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet161_pretrained.pdparams", + "DenseNet169": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet169_pretrained.pdparams", + "DenseNet201": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet201_pretrained.pdparams", + "DenseNet264": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DenseNet264_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class BNACConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(BNACConvLayer, self).__init__() + + self._batch_norm = BatchNorm( + num_channels, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, input): + y = self._batch_norm(input) + y = self._conv(y) + return y + + +class DenseLayer(nn.Layer): + def __init__(self, num_channels, growth_rate, bn_size, dropout, name=None): + super(DenseLayer, self).__init__() + self.dropout = dropout + + self.bn_ac_func1 = BNACConvLayer( + num_channels=num_channels, + num_filters=bn_size * growth_rate, + filter_size=1, + pad=0, + stride=1, + name=name + "_x1") + + self.bn_ac_func2 = BNACConvLayer( + num_channels=bn_size * growth_rate, + num_filters=growth_rate, + filter_size=3, + pad=1, + stride=1, + name=name + "_x2") + + if dropout: + self.dropout_func = Dropout(p=dropout, mode="downscale_in_infer") + + def forward(self, input): + conv = self.bn_ac_func1(input) + conv = self.bn_ac_func2(conv) + if self.dropout: + conv = self.dropout_func(conv) + conv = paddle.concat([input, conv], axis=1) + return conv + + +class DenseBlock(nn.Layer): + def __init__(self, + num_channels, + num_layers, + bn_size, + growth_rate, + dropout, + name=None): + super(DenseBlock, self).__init__() + self.dropout = dropout + + self.dense_layer_func = [] + + pre_channel = num_channels + for layer in range(num_layers): + self.dense_layer_func.append( + self.add_sublayer( + "{}_{}".format(name, layer + 1), + DenseLayer( + num_channels=pre_channel, + growth_rate=growth_rate, + bn_size=bn_size, + dropout=dropout, + name=name + '_' + str(layer + 1)))) + pre_channel = pre_channel + growth_rate + + def forward(self, input): + conv = input + for func in self.dense_layer_func: + conv = func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, num_channels, num_output_features, name=None): + super(TransitionLayer, self).__init__() + + self.conv_ac_func = BNACConvLayer( + num_channels=num_channels, 
+ num_filters=num_output_features, + filter_size=1, + pad=0, + stride=1, + name=name) + + self.pool2d_avg = AvgPool2D(kernel_size=2, stride=2, padding=0) + + def forward(self, input): + y = self.conv_ac_func(input) + y = self.pool2d_avg(y) + return y + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, input): + y = self._conv(input) + y = self._batch_norm(y) + return y + + +class DenseNet(nn.Layer): + def __init__(self, layers=60, bn_size=4, dropout=0, class_num=1000): + super(DenseNet, self).__init__() + + supported_layers = [121, 161, 169, 201, 264] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + densenet_spec = { + 121: (64, 32, [6, 12, 24, 16]), + 161: (96, 48, [6, 12, 36, 24]), + 169: (64, 32, [6, 12, 32, 32]), + 201: (64, 32, [6, 12, 48, 32]), + 264: (64, 32, [6, 12, 64, 48]) + } + num_init_features, growth_rate, block_config = densenet_spec[layers] + + self.conv1_func = ConvBNLayer( + num_channels=3, + num_filters=num_init_features, + filter_size=7, + stride=2, + pad=3, + act='relu', + name="conv1") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_config = block_config + + self.dense_block_func_list = [] + self.transition_func_list = [] + pre_num_channels = num_init_features + num_features = num_init_features + for i, num_layers in enumerate(block_config): + self.dense_block_func_list.append( + self.add_sublayer( + "db_conv_{}".format(i + 2), + DenseBlock( + num_channels=pre_num_channels, + num_layers=num_layers, + bn_size=bn_size, + growth_rate=growth_rate, + dropout=dropout, + name='conv' + str(i + 2)))) + + num_features = num_features + num_layers * growth_rate + pre_num_channels = num_features + + if i != len(block_config) - 1: + self.transition_func_list.append( + self.add_sublayer( + "tr_conv{}_blk".format(i + 2), + TransitionLayer( + num_channels=pre_num_channels, + num_output_features=num_features // 2, + name='conv' + str(i + 2) + "_blk"))) + pre_num_channels = num_features // 2 + num_features = num_features // 2 + + self.batch_norm = BatchNorm( + num_features, + act="relu", + param_attr=ParamAttr(name='conv5_blk_bn_scale'), + bias_attr=ParamAttr(name='conv5_blk_bn_offset'), + moving_mean_name='conv5_blk_bn_mean', + moving_variance_name='conv5_blk_bn_variance') + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + stdv = 1.0 / math.sqrt(num_features * 1.0) + + self.out = Linear( + num_features, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, input): + conv = self.conv1_func(input) + conv = self.pool2d_max(conv) + + for i, num_layers in enumerate(self.block_config): + conv = self.dense_block_func_list[i](conv) + if i != len(self.block_config) - 1: + conv = self.transition_func_list[i](conv) + + conv = self.batch_norm(conv) + y = 
self.pool2d_avg(conv) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DenseNet121(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=121, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet121"], use_ssld=use_ssld) + return model + + +def DenseNet161(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=161, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet161"], use_ssld=use_ssld) + return model + + +def DenseNet169(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=169, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet169"], use_ssld=use_ssld) + return model + + +def DenseNet201(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=201, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet201"], use_ssld=use_ssld) + return model + + +def DenseNet264(pretrained=False, use_ssld=False, **kwargs): + model = DenseNet(layers=264, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DenseNet264"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/distilled_vision_transformer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/distilled_vision_transformer.py new file mode 100644 index 000000000..1805b1d03 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/distilled_vision_transformer.py @@ -0,0 +1,273 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
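The distilled DeiT variant defined below prepends both a class token and a distillation token to the patch sequence (hence the num_patches + 2 position embeddings) and, at inference time, averages the outputs of the two classifier heads. A minimal forward-pass sketch, assuming the companion vision_transformer.py from this patch is importable alongside it and that the module path follows the PaddleClas 2.6 layout:

# Hedged sketch, not part of the patch: inference with a distilled DeiT model.
import paddle
# Assumption: PaddleClas 2.6 module layout.
from ppcls.arch.backbone.model_zoo.distilled_vision_transformer import (
    DeiT_base_distilled_patch16_224)

model = DeiT_base_distilled_patch16_224(pretrained=False)
model.eval()
with paddle.no_grad():
    out = model(paddle.randn([1, 3, 224, 224]))
print(out.shape)   # [1, 1000] == (head(cls_token) + head_dist(dist_token)) / 2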
+ +# Code was heavily based on https://github.com/facebookresearch/deit +# reference: https://arxiv.org/abs/2012.12877 + +import paddle +import paddle.nn as nn +from .vision_transformer import VisionTransformer, Identity, trunc_normal_, zeros_ + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DeiT_tiny_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_tiny_patch16_224_pretrained.pdparams", + "DeiT_small_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_small_patch16_224_pretrained.pdparams", + "DeiT_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_patch16_224_pretrained.pdparams", + "DeiT_tiny_distilled_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_tiny_distilled_patch16_224_pretrained.pdparams", + "DeiT_small_distilled_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_small_distilled_patch16_224_pretrained.pdparams", + "DeiT_base_distilled_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_distilled_patch16_224_pretrained.pdparams", + "DeiT_base_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_patch16_384_pretrained.pdparams", + "DeiT_base_distilled_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DeiT_base_distilled_patch16_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class DistilledVisionTransformer(VisionTransformer): + def __init__(self, + img_size=224, + patch_size=16, + class_num=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + **kwargs): + super().__init__( + img_size=img_size, + patch_size=patch_size, + class_num=class_num, + embed_dim=embed_dim, + depth=depth, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + epsilon=epsilon, + **kwargs) + self.pos_embed = self.create_parameter( + shape=(1, self.patch_embed.num_patches + 2, self.embed_dim), + default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + + self.dist_token = self.create_parameter( + shape=(1, 1, self.embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.head_dist = nn.Linear( + self.embed_dim, + self.class_num) if self.class_num > 0 else Identity() + + trunc_normal_(self.dist_token) + trunc_normal_(self.pos_embed) + self.head_dist.apply(self._init_weights) + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed(x) + + cls_tokens = self.cls_token.expand((B, -1, -1)).astype(x.dtype) + dist_token = self.dist_token.expand((B, -1, -1)).astype(x.dtype) + x = paddle.concat((cls_tokens, dist_token, x), axis=1) + + x = x + self.pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + return x[:, 0], x[:, 1] + + def forward(self, x): + x, x_dist = self.forward_features(x) + x = self.head(x) + x_dist = self.head_dist(x_dist) + return (x + x_dist) / 2 + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def DeiT_tiny_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=192, + depth=12, + num_heads=3, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_tiny_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_small_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_small_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_tiny_distilled_patch16_224(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=192, + depth=12, + num_heads=3, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_tiny_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_small_distilled_patch16_224(pretrained=False, + use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_small_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_distilled_patch16_224(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_distilled_patch16_224"], + use_ssld=use_ssld) + return model + + +def DeiT_base_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_patch16_384"], + use_ssld=use_ssld) + return model + + +def DeiT_base_distilled_patch16_384(pretrained=False, use_ssld=False, + **kwargs): + model = DistilledVisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["DeiT_base_distilled_patch16_384"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dla.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dla.py new file mode 100644 index 000000000..03a73bcfb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dla.py @@ -0,0 +1,529 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/ucbdrive/dla +# reference: https://arxiv.org/abs/1707.06484 + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn.initializer import Normal, Constant + +from ..base.theseus_layer import Identity +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DLA34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA34_pretrained.pdparams", + "DLA46_c": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA46_c_pretrained.pdparams", + "DLA46x_c": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA46x_c_pretrained.pdparams", + "DLA60": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60_pretrained.pdparams", + "DLA60x": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60x_pretrained.pdparams", + "DLA60x_c": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA60x_c_pretrained.pdparams", + "DLA102": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102_pretrained.pdparams", + "DLA102x": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102x_pretrained.pdparams", + "DLA102x2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA102x2_pretrained.pdparams", + "DLA169": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DLA169_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
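The DLA network built from the classes below keeps stride-1 features at level0 and halves resolution at every subsequent level, which is what the feature_info entries (reductions 1, 2, 4, 8, 16, 32) record. A quick end-to-end sketch with the DLA34 factory defined at the bottom of this file; the import path is assumed to follow the PaddleClas 2.6 layout.

# Hedged sketch, not part of the patch: end-to-end check of DLA34.
import paddle
# Assumption: PaddleClas 2.6 module layout.
from ppcls.arch.backbone.model_zoo.dla import DLA34

model = DLA34(pretrained=False, class_num=1000)
model.eval()
with paddle.no_grad():
    x = paddle.randn([1, 3, 224, 224])
    feats = model.forward_features(x)   # deepest level, stride 32
    logits = model(x)
print(feats.shape)    # [1, 512, 7, 7]
print(logits.shape)   # [1, 1000]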
+ + +class DlaBasic(nn.Layer): + def __init__(self, inplanes, planes, stride=1, dilation=1, **cargs): + super(DlaBasic, self).__init__() + self.conv1 = nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias_attr=False, + dilation=dilation) + self.bn1 = nn.BatchNorm2D(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2D( + planes, + planes, + kernel_size=3, + stride=1, + padding=dilation, + bias_attr=False, + dilation=dilation) + self.bn2 = nn.BatchNorm2D(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class DlaBottleneck(nn.Layer): + expansion = 2 + + def __init__(self, + inplanes, + outplanes, + stride=1, + dilation=1, + cardinality=1, + base_width=64): + super(DlaBottleneck, self).__init__() + self.stride = stride + mid_planes = int( + math.floor(outplanes * (base_width / 64)) * cardinality) + mid_planes = mid_planes // self.expansion + + self.conv1 = nn.Conv2D( + inplanes, mid_planes, kernel_size=1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(mid_planes) + self.conv2 = nn.Conv2D( + mid_planes, + mid_planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias_attr=False, + dilation=dilation, + groups=cardinality) + self.bn2 = nn.BatchNorm2D(mid_planes) + self.conv3 = nn.Conv2D( + mid_planes, outplanes, kernel_size=1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(outplanes) + self.relu = nn.ReLU() + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class DlaRoot(nn.Layer): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(DlaRoot, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + 1, + stride=1, + bias_attr=False, + padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2D(out_channels) + self.relu = nn.ReLU() + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(paddle.concat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class DlaTree(nn.Layer): + def __init__(self, + levels, + block, + in_channels, + out_channels, + stride=1, + dilation=1, + cardinality=1, + base_width=64, + level_root=False, + root_dim=0, + root_kernel_size=1, + root_residual=False): + super(DlaTree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + + self.downsample = nn.MaxPool2D( + stride, stride=stride) if stride > 1 else Identity() + self.project = Identity() + cargs = dict( + dilation=dilation, cardinality=cardinality, base_width=base_width) + + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, **cargs) + self.tree2 = block(out_channels, out_channels, 1, **cargs) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False), + nn.BatchNorm2D(out_channels)) + else: + cargs.update( + dict( + root_kernel_size=root_kernel_size, + root_residual=root_residual)) + self.tree1 = DlaTree( + levels - 1, + block, + in_channels, + out_channels, + 
stride, + root_dim=0, + **cargs) + self.tree2 = DlaTree( + levels - 1, + block, + out_channels, + out_channels, + root_dim=root_dim + out_channels, + **cargs) + + if levels == 1: + self.root = DlaRoot(root_dim, out_channels, root_kernel_size, + root_residual) + + self.level_root = level_root + self.root_dim = root_dim + self.levels = levels + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) + residual = self.project(bottom) + + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Layer): + def __init__(self, + levels, + channels, + in_chans=3, + cardinality=1, + base_width=64, + block=DlaBottleneck, + residual_root=False, + drop_rate=0.0, + class_num=1000, + with_pool=True): + super(DLA, self).__init__() + self.channels = channels + self.class_num = class_num + self.with_pool = with_pool + self.cardinality = cardinality + self.base_width = base_width + self.drop_rate = drop_rate + + self.base_layer = nn.Sequential( + nn.Conv2D( + in_chans, + channels[0], + kernel_size=7, + stride=1, + padding=3, + bias_attr=False), + nn.BatchNorm2D(channels[0]), + nn.ReLU()) + + self.level0 = self._make_conv_level(channels[0], channels[0], + levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + + cargs = dict( + cardinality=cardinality, + base_width=base_width, + root_residual=residual_root) + + self.level2 = DlaTree( + levels[2], + block, + channels[1], + channels[2], + 2, + level_root=False, + **cargs) + self.level3 = DlaTree( + levels[3], + block, + channels[2], + channels[3], + 2, + level_root=True, + **cargs) + self.level4 = DlaTree( + levels[4], + block, + channels[3], + channels[4], + 2, + level_root=True, + **cargs) + self.level5 = DlaTree( + levels[5], + block, + channels[4], + channels[5], + 2, + level_root=True, + **cargs) + + self.feature_info = [ + # rare to have a meaningful stride 1 level + dict( + num_chs=channels[0], reduction=1, module='level0'), + dict( + num_chs=channels[1], reduction=2, module='level1'), + dict( + num_chs=channels[2], reduction=4, module='level2'), + dict( + num_chs=channels[3], reduction=8, module='level3'), + dict( + num_chs=channels[4], reduction=16, module='level4'), + dict( + num_chs=channels[5], reduction=32, module='level5'), + ] + + self.num_features = channels[-1] + + if with_pool: + self.global_pool = nn.AdaptiveAvgPool2D(1) + + if class_num > 0: + self.fc = nn.Conv2D(self.num_features, class_num, 1) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_ = Normal(mean=0.0, std=math.sqrt(2. 
/ n)) + normal_(m.weight) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2D( + inplanes, + planes, + kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, + bias_attr=False, + dilation=dilation), nn.BatchNorm2D(planes), nn.ReLU() + ]) + inplanes = planes + return nn.Sequential(*modules) + + def forward_features(self, x): + x = self.base_layer(x) + + x = self.level0(x) + x = self.level1(x) + x = self.level2(x) + x = self.level3(x) + x = self.level4(x) + x = self.level5(x) + + return x + + def forward(self, x): + x = self.forward_features(x) + + if self.with_pool: + x = self.global_pool(x) + + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + + if self.class_num > 0: + x = self.fc(x) + x = x.flatten(1) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def DLA34(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 128, 256, 512), + block=DlaBasic, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA34"]) + return model + + +def DLA46_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA46_c"]) + return model + + +def DLA46x_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 2, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA46x_c"]) + return model + + +def DLA60(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60"]) + return model + + +def DLA60x(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60x"]) + return model + + +def DLA60x_c(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 2, 3, 1), + channels=(16, 32, 64, 64, 128, 256), + block=DlaBottleneck, + cardinality=32, + base_width=4, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA60x_c"]) + return model + + +def DLA102(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102"]) + return model + + +def DLA102x(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + cardinality=32, + base_width=4, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102x"]) + return model + + +def DLA102x2(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 1, 3, 4, 1), + channels=(16, 32, 128, 256, 512, 1024), + 
block=DlaBottleneck, + cardinality=64, + base_width=4, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA102x2"]) + return model + + +def DLA169(pretrained=False, **kwargs): + model = DLA(levels=(1, 1, 2, 3, 5, 1), + channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottleneck, + residual_root=True, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DLA169"]) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dpn.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dpn.py new file mode 100644 index 000000000..ef322dea9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dpn.py @@ -0,0 +1,453 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1707.01629 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import sys +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DPN68": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN68_pretrained.pdparams", + "DPN92": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN92_pretrained.pdparams", + "DPN98": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN98_pretrained.pdparams", + "DPN107": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN107_pretrained.pdparams", + "DPN131": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DPN131_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + def forward(self, input): + y = self._conv(input) + y = self._batch_norm(y) + return y + + +class BNACConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + pad=0, + groups=1, + act="relu", + name=None): + super(BNACConvLayer, self).__init__() + self.num_channels = num_channels + + self._batch_norm = BatchNorm( + num_channels, + act=act, + param_attr=ParamAttr(name=name + '_bn_scale'), + 
bias_attr=ParamAttr(name + '_bn_offset'), + moving_mean_name=name + '_bn_mean', + moving_variance_name=name + '_bn_variance') + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=pad, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + + def forward(self, input): + y = self._batch_norm(input) + y = self._conv(y) + return y + + +class DualPathFactory(nn.Layer): + def __init__(self, + num_channels, + num_1x1_a, + num_3x3_b, + num_1x1_c, + inc, + G, + _type='normal', + name=None): + super(DualPathFactory, self).__init__() + + self.num_1x1_c = num_1x1_c + self.inc = inc + self.name = name + + kw = 3 + kh = 3 + pw = (kw - 1) // 2 + ph = (kh - 1) // 2 + + # type + if _type == 'proj': + key_stride = 1 + self.has_proj = True + elif _type == 'down': + key_stride = 2 + self.has_proj = True + elif _type == 'normal': + key_stride = 1 + self.has_proj = False + else: + print("not implemented now!!!") + sys.exit(1) + + data_in_ch = sum(num_channels) if isinstance(num_channels, + list) else num_channels + + if self.has_proj: + self.c1x1_w_func = BNACConvLayer( + num_channels=data_in_ch, + num_filters=num_1x1_c + 2 * inc, + filter_size=(1, 1), + pad=(0, 0), + stride=(key_stride, key_stride), + name=name + "_match") + + self.c1x1_a_func = BNACConvLayer( + num_channels=data_in_ch, + num_filters=num_1x1_a, + filter_size=(1, 1), + pad=(0, 0), + name=name + "_conv1") + + self.c3x3_b_func = BNACConvLayer( + num_channels=num_1x1_a, + num_filters=num_3x3_b, + filter_size=(kw, kh), + pad=(pw, ph), + stride=(key_stride, key_stride), + groups=G, + name=name + "_conv2") + + self.c1x1_c_func = BNACConvLayer( + num_channels=num_3x3_b, + num_filters=num_1x1_c + inc, + filter_size=(1, 1), + pad=(0, 0), + name=name + "_conv3") + + def forward(self, input): + # PROJ + if isinstance(input, list): + data_in = paddle.concat([input[0], input[1]], axis=1) + else: + data_in = input + + if self.has_proj: + c1x1_w = self.c1x1_w_func(data_in) + data_o1, data_o2 = paddle.split( + c1x1_w, num_or_sections=[self.num_1x1_c, 2 * self.inc], axis=1) + else: + data_o1 = input[0] + data_o2 = input[1] + + c1x1_a = self.c1x1_a_func(data_in) + c3x3_b = self.c3x3_b_func(c1x1_a) + c1x1_c = self.c1x1_c_func(c3x3_b) + + c1x1_c1, c1x1_c2 = paddle.split( + c1x1_c, num_or_sections=[self.num_1x1_c, self.inc], axis=1) + + # OUTPUTS + summ = paddle.add(x=data_o1, y=c1x1_c1) + dense = paddle.concat([data_o2, c1x1_c2], axis=1) + # tensor, channels + return [summ, dense] + + +class DPN(nn.Layer): + def __init__(self, layers=68, class_num=1000): + super(DPN, self).__init__() + + self._class_num = class_num + + args = self.get_net_args(layers) + bws = args['bw'] + inc_sec = args['inc_sec'] + rs = args['r'] + k_r = args['k_r'] + k_sec = args['k_sec'] + G = args['G'] + init_num_filter = args['init_num_filter'] + init_filter_size = args['init_filter_size'] + init_padding = args['init_padding'] + + self.k_sec = k_sec + + self.conv1_x_1_func = ConvBNLayer( + num_channels=3, + num_filters=init_num_filter, + filter_size=init_filter_size, + stride=2, + pad=init_padding, + act='relu', + name="conv1") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + num_channel_dpn = init_num_filter + + self.dpn_func_list = [] + #conv2 - conv5 + match_list, num = [], 0 + for gc in range(4): + bw = bws[gc] + inc = inc_sec[gc] + R = (k_r * bw) // rs[gc] + if gc == 0: + _type1 = 'proj' + _type2 = 'normal' + match = 1 + else: + _type1 = 'down' + _type2 
= 'normal' + match = match + k_sec[gc - 1] + match_list.append(match) + self.dpn_func_list.append( + self.add_sublayer( + "dpn{}".format(match), + DualPathFactory( + num_channels=num_channel_dpn, + num_1x1_a=R, + num_3x3_b=R, + num_1x1_c=bw, + inc=inc, + G=G, + _type=_type1, + name="dpn" + str(match)))) + num_channel_dpn = [bw, 3 * inc] + + for i_ly in range(2, k_sec[gc] + 1): + num += 1 + if num in match_list: + num += 1 + self.dpn_func_list.append( + self.add_sublayer( + "dpn{}".format(num), + DualPathFactory( + num_channels=num_channel_dpn, + num_1x1_a=R, + num_3x3_b=R, + num_1x1_c=bw, + inc=inc, + G=G, + _type=_type2, + name="dpn" + str(num)))) + + num_channel_dpn = [ + num_channel_dpn[0], num_channel_dpn[1] + inc + ] + + out_channel = sum(num_channel_dpn) + + self.conv5_x_x_bn = BatchNorm( + num_channels=sum(num_channel_dpn), + act="relu", + param_attr=ParamAttr(name='final_concat_bn_scale'), + bias_attr=ParamAttr('final_concat_bn_offset'), + moving_mean_name='final_concat_bn_mean', + moving_variance_name='final_concat_bn_variance') + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + stdv = 0.01 + + self.out = Linear( + out_channel, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, input): + conv1_x_1 = self.conv1_x_1_func(input) + convX_x_x = self.pool2d_max(conv1_x_1) + + dpn_idx = 0 + for gc in range(4): + convX_x_x = self.dpn_func_list[dpn_idx](convX_x_x) + dpn_idx += 1 + for i_ly in range(2, self.k_sec[gc] + 1): + convX_x_x = self.dpn_func_list[dpn_idx](convX_x_x) + dpn_idx += 1 + + conv5_x_x = paddle.concat(convX_x_x, axis=1) + conv5_x_x = self.conv5_x_x_bn(conv5_x_x) + + y = self.pool2d_avg(conv5_x_x) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + def get_net_args(self, layers): + if layers == 68: + k_r = 128 + G = 32 + k_sec = [3, 4, 12, 3] + inc_sec = [16, 32, 32, 64] + bw = [64, 128, 256, 512] + r = [64, 64, 64, 64] + init_num_filter = 10 + init_filter_size = 3 + init_padding = 1 + elif layers == 92: + k_r = 96 + G = 32 + k_sec = [3, 4, 20, 3] + inc_sec = [16, 32, 24, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 64 + init_filter_size = 7 + init_padding = 3 + elif layers == 98: + k_r = 160 + G = 40 + k_sec = [3, 6, 20, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 96 + init_filter_size = 7 + init_padding = 3 + elif layers == 107: + k_r = 200 + G = 50 + k_sec = [4, 8, 20, 3] + inc_sec = [20, 64, 64, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + elif layers == 131: + k_r = 160 + G = 40 + k_sec = [4, 8, 28, 3] + inc_sec = [16, 32, 32, 128] + bw = [256, 512, 1024, 2048] + r = [256, 256, 256, 256] + init_num_filter = 128 + init_filter_size = 7 + init_padding = 3 + else: + raise NotImplementedError + net_arg = { + 'k_r': k_r, + 'G': G, + 'k_sec': k_sec, + 'inc_sec': inc_sec, + 'bw': bw, + 'r': r + } + net_arg['init_num_filter'] = init_num_filter + net_arg['init_filter_size'] = init_filter_size + net_arg['init_padding'] = init_padding + + return net_arg + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained 
type is not available. Please use `string` or `boolean` type." + ) + + +def DPN68(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=68, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN68"]) + return model + + +def DPN92(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=92, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN92"]) + return model + + +def DPN98(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=98, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN98"]) + return model + + +def DPN107(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=107, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN107"]) + return model + + +def DPN131(pretrained=False, use_ssld=False, **kwargs): + model = DPN(layers=131, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["DPN131"]) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dsnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dsnet.py new file mode 100644 index 000000000..20b28c2b0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/dsnet.py @@ -0,0 +1,701 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
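The DPN factories above wrap a dual-path design: after the stem, every DualPathFactory block hands its successor a two-element list, a residual sum plus a densely concatenated path, and the final stage concatenates that pair before batch norm, pooling, and the classifier. A minimal sketch running the smallest variant; the import path is assumed to follow the PaddleClas 2.6 layout.

# Hedged sketch, not part of the patch: forward pass through DPN68.
import paddle
# Assumption: PaddleClas 2.6 module layout.
from ppcls.arch.backbone.model_zoo.dpn import DPN68

model = DPN68(pretrained=False, class_num=1000)
model.eval()
with paddle.no_grad():
    logits = model(paddle.randn([1, 3, 224, 224]))
print(logits.shape)   # [1, 1000]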
+ +# reference: https://arxiv.org/abs/2105.14734v4 + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .vision_transformer import to_2tuple, zeros_, ones_, VisionTransformer, Identity, zeros_ +from functools import partial +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "DSNet_tiny": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DSNet_tiny_pretrained.pdparams", + "DSNet_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DSNet_small_pretrained.pdparams", + "DSNet_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/DSNet_base_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2D(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2 = nn.Conv2D(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super(DWConv, self).__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias=True, groups=dim) + + def forward(self, x): + x = self.dwconv(x) + return x + + +class DWConvMlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2D(in_features, hidden_features, 1) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Conv2D(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.dwconv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + C = int(C // 3) + qkv = x.reshape( + (B, N, 3, self.num_heads, C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((B, N, C)) + x = self.proj_drop(x) + return x + + +class Cross_Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, tokens_q, memory_k, memory_v, shape=None): + assert shape is not None + attn = (tokens_q.matmul(memory_k.transpose((0, 1, 3, 2)))) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(memory_v)).transpose((0, 2, 1, 3)).reshape( + (shape[0], shape[1], shape[2])) + x = self.proj_drop(x) + return x + + +class MixBlock(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + downsample=2, + conv_ffn=False): + super().__init__() + self.pos_embed = nn.Conv2D(dim, dim, 3, padding=1, groups=dim) + self.dim = dim + self.norm1 = nn.BatchNorm2D(dim) + self.conv1 = nn.Conv2D(dim, dim, 1) + self.conv2 = nn.Conv2D(dim, dim, 1) + self.dim_conv = int(dim * 0.5) + self.dim_sa = dim - self.dim_conv + self.norm_conv1 = nn.BatchNorm2D(self.dim_conv) + self.norm_sa1 = nn.LayerNorm(self.dim_sa) + self.conv = nn.Conv2D( + self.dim_conv, self.dim_conv, 3, padding=1, groups=self.dim_conv) + self.channel_up = nn.Linear(self.dim_sa, 3 * self.dim_sa) + self.cross_channel_up_conv = nn.Conv2D(self.dim_conv, + 3 * self.dim_conv, 1) + self.cross_channel_up_sa = nn.Linear(self.dim_sa, 3 * self.dim_sa) + self.fuse_channel_conv = nn.Linear(self.dim_conv, self.dim_conv) + self.fuse_channel_sa = nn.Linear(self.dim_sa, self.dim_sa) + self.num_heads = num_heads + self.attn = Attention( + self.dim_sa, + num_heads=self.num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=0.1, + proj_drop=drop) + self.cross_attn = Cross_Attention( + self.dim_sa, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=0.1, + proj_drop=drop) + self.norm_conv2 = nn.BatchNorm2D(self.dim_conv) + self.norm_sa2 = nn.LayerNorm(self.dim_sa) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = nn.BatchNorm2D(dim) + self.downsample = downsample + mlp_hidden_dim = int(dim * mlp_ratio) + if conv_ffn: + self.mlp = DWConvMlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + else: + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + _, _, H, W = x.shape + residual = x + x = self.norm1(x) + x = self.conv1(x) + + qkv = x[:, :self.dim_sa, :] + conv = x[:, self.dim_sa:, :, :] + residual_conv = conv + conv = residual_conv + self.conv(self.norm_conv1(conv)) + + sa = F.interpolate( + qkv, + size=(H // self.downsample, W // self.downsample), + mode='bilinear') + B, _, H_down, W_down = sa.shape + sa = sa.flatten(2).transpose([0, 2, 1]) + residual_sa = sa + sa = self.norm_sa1(sa) + sa = self.channel_up(sa) + sa = residual_sa + self.attn(sa) + + # cross attention + residual_conv_co = conv + residual_sa_co = sa + conv_qkv = self.cross_channel_up_conv(self.norm_conv2(conv)) + conv_qkv = conv_qkv.flatten(2).transpose([0, 2, 1]) + + sa_qkv = self.cross_channel_up_sa(self.norm_sa2(sa)) + + B_conv, N_conv, C_conv = conv_qkv.shape + C_conv = int(C_conv // 3) + conv_qkv = conv_qkv.reshape((B_conv, N_conv, 3, self.num_heads, + C_conv // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + conv_q, conv_k, conv_v = conv_qkv[0], conv_qkv[1], conv_qkv[2] + + B_sa, N_sa, C_sa = sa_qkv.shape + C_sa = int(C_sa // 3) + sa_qkv = sa_qkv.reshape( + (B_sa, N_sa, 3, self.num_heads, C_sa // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + sa_q, sa_k, sa_v = sa_qkv[0], sa_qkv[1], sa_qkv[2] + + # sa -> conv + conv = self.cross_attn( + conv_q, sa_k, sa_v, shape=(B_conv, N_conv, C_conv)) + conv = self.fuse_channel_conv(conv) + conv = conv.reshape((B, H, W, C_conv)).transpose((0, 3, 1, 2)) + conv = residual_conv_co + conv + + # conv -> sa + sa = self.cross_attn(sa_q, conv_k, conv_v, shape=(B_sa, N_sa, C_sa)) + sa = residual_sa_co + self.fuse_channel_sa(sa) + sa = sa.reshape((B, H_down, W_down, C_sa)).transpose((0, 3, 1, 2)) + sa = F.interpolate(sa, size=(H, W), mode='bilinear') + x = paddle.concat([conv, sa], axis=1) + x = residual + self.drop_path(self.conv2(x)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x) + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Overlapping Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x) + return x + + +class MixVisionTransformer(nn.Layer): + """ Mixed Vision Transformer for DSNet + A PaddlePaddle impl of : `Dual-stream Network for Visual Recognition` - https://arxiv.org/abs/2105.14734v4 + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=[64, 128, 320, 512], + depth=[2, 2, 4, 1], + num_heads=[1, 2, 5, 8], + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + representation_size=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=None, + overlap_embed=False, + conv_ffn=False): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + class_num (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer: (nn.Layer): normalization layer + overlap_embed (bool): enable overlapped patch embedding if True + conv_ffn (bool): enable depthwise convolution for mlp if True + """ + super().__init__() + self.class_num = class_num + self.num_features = self.embed_dim = embed_dim + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + downsamples = [8, 4, 2, 2] + if overlap_embed: + self.patch_embed1 = OverlapPatchEmbed( + img_size=img_size, + patch_size=7, + stride=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = OverlapPatchEmbed( + img_size=img_size // 4, + patch_size=3, + stride=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = OverlapPatchEmbed( + img_size=img_size // 8, + patch_size=3, + stride=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = OverlapPatchEmbed( + img_size=img_size // 16, + patch_size=3, + stride=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + else: + self.patch_embed1 = PatchEmbed( + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + self.mixture = False + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depth)) + ] + self.blocks1 = nn.LayerList([ + MixBlock( + dim=embed_dim[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + downsample=downsamples[0], + conv_ffn=conv_ffn) for i in range(depth[0]) + ]) + + self.blocks2 = nn.LayerList([ + MixBlock( + dim=embed_dim[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + 
downsample=downsamples[1], + conv_ffn=conv_ffn) for i in range(depth[1]) + ]) + + self.blocks3 = nn.LayerList([ + MixBlock( + dim=embed_dim[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + downsample=downsamples[2], + conv_ffn=conv_ffn) for i in range(depth[2]) + ]) + + if self.mixture: + self.blocks4 = nn.LayerList([ + Block( + dim=embed_dim[3], + num_heads=16, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + downsample=downsamples[3], + conv_ffn=conv_ffn) for i in range(depth[3]) + ]) + self.norm = norm_layer(embed_dim[-1]) + else: + self.blocks4 = nn.LayerList([ + MixBlock( + dim=embed_dim[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + downsample=downsamples[3], + conv_ffn=conv_ffn) for i in range(depth[3]) + ]) + self.norm = nn.BatchNorm2D(embed_dim[-1]) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh())])) + else: + self.pre_logits = Identity() + + # Classifier head + self.head = nn.Linear(embed_dim[-1], + class_num) if class_num > 0 else Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + TruncatedNormal(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def get_classifier(self): + return self.head + + def reset_classifier(self, class_num, global_pool=''): + self.class_num = class_num + self.head = nn.Linear(self.embed_dim, + class_num) if class_num > 0 else Identity() + + def forward_features(self, x): + B = x.shape[0] + x = self.patch_embed1(x) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + x = self.patch_embed2(x) + for blk in self.blocks2: + x = blk(x) + x = self.patch_embed3(x) + for blk in self.blocks3: + x = blk(x) + x = self.patch_embed4(x) + if self.mixture: + x = x.flatten(2).transpose([0, 2, 1]) + for blk in self.blocks4: + x = blk(x) + x = self.norm(x) + x = self.pre_logits(x) + return x + + def forward(self, x): + x = self.forward_features(x) + if self.mixture: + x = x.mean(1) + else: + x = x.flatten(2).mean(-1) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def DSNet_tiny(pretrained=False, use_ssld=False, **kwargs): + model = MixVisionTransformer( + patch_size=16, + depth=[2, 2, 4, 1], + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, eps=1e-6), + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DSNet_tiny"], use_ssld=use_ssld) + return model + + +def DSNet_small(pretrained=False, use_ssld=False, **kwargs): + model = MixVisionTransformer( + patch_size=16, + depth=[3, 4, 8, 3], + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, eps=1e-6), + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DSNet_small"], use_ssld=use_ssld) + return model + + +def DSNet_base(pretrained=False, use_ssld=False, **kwargs): + model = MixVisionTransformer( + patch_size=16, + depth=[3, 4, 28, 3], + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, eps=1e-6), + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["DSNet_base"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet.py new file mode 100644 index 000000000..9217efd4f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet.py @@ -0,0 +1,1028 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
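Reviewer note: as with the DPN factories earlier in the patch, a small hedged sketch of the DSNet entry points added above. The three factories differ only in the per-stage `depth` argument (tiny [2, 2, 4, 1], small [3, 4, 8, 3], base [3, 4, 28, 3]); everything else falls back to the MixVisionTransformer defaults. The import path and `class_num` keyword below are illustrative assumptions, not part of the change.

    import paddle
    from ppcls.arch.backbone.model_zoo.dsnet import DSNet_tiny  # import path is an assumption

    model = DSNet_tiny(pretrained=False, class_num=1000)
    x = paddle.randn([1, 3, 224, 224])   # default img_size is 224
    logits = model(x)                    # expected shape: [1, 1000]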
+ +# Code was based on https://github.com/lukemelas/EfficientNet-PyTorch +# reference: https://arxiv.org/abs/1905.11946 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +import math +import collections +import re +import copy + +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "EfficientNetB0_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB0_small_pretrained.pdparams", + "EfficientNetB0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB0_pretrained.pdparams", + "EfficientNetB1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB1_pretrained.pdparams", + "EfficientNetB2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB2_pretrained.pdparams", + "EfficientNetB3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB3_pretrained.pdparams", + "EfficientNetB4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB4_pretrained.pdparams", + "EfficientNetB5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB5_pretrained.pdparams", + "EfficientNetB6": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB6_pretrained.pdparams", + "EfficientNetB7": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetB7_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +GlobalParams = collections.namedtuple('GlobalParams', [ + 'batch_norm_momentum', + 'batch_norm_epsilon', + 'dropout_rate', + 'num_classes', + 'width_coefficient', + 'depth_coefficient', + 'depth_divisor', + 'depth_trunc', + 'min_depth', + 'drop_connect_rate', +]) + +BlockArgs = collections.namedtuple('BlockArgs', [ + 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', + 'expand_ratio', 'id_skip', 'stride', 'se_ratio' +]) + +GlobalParams.__new__.__defaults__ = (None, ) * len(GlobalParams._fields) +BlockArgs.__new__.__defaults__ = (None, ) * len(BlockArgs._fields) + + +def efficientnet_params(model_name): + """ Map EfficientNet model name to parameter coefficients. """ + params_dict = { + # Coefficients: width,depth,resolution,dropout + 'efficientnet-b0-small': (1.0, 1.0, 224, 0.2), + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + } + return params_dict[model_name] + + +def efficientnet(width_coefficient=None, + depth_coefficient=None, + dropout_rate=0.2, + drop_connect_rate=0.2): + """ Get block arguments according to parameter and coefficients. 
""" + blocks_args = [ + 'r1_k3_s11_e1_i32_o16_se0.25', + 'r2_k3_s22_e6_i16_o24_se0.25', + 'r2_k5_s22_e6_i24_o40_se0.25', + 'r3_k3_s22_e6_i40_o80_se0.25', + 'r3_k5_s11_e6_i80_o112_se0.25', + 'r4_k5_s22_e6_i112_o192_se0.25', + 'r1_k3_s11_e6_i192_o320_se0.25', + ] + blocks_args = BlockDecoder.decode(blocks_args) + + global_params = GlobalParams( + batch_norm_momentum=0.99, + batch_norm_epsilon=1e-3, + dropout_rate=dropout_rate, + drop_connect_rate=drop_connect_rate, + num_classes=1000, + width_coefficient=width_coefficient, + depth_coefficient=depth_coefficient, + depth_divisor=8, + depth_trunc='ceil', + min_depth=None) + + return blocks_args, global_params + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('efficientnet'): + w, d, _, p = efficientnet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, depth_coefficient=d, dropout_rate=p) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +def round_filters(filters, global_params): + """ Calculate and round number of filters based on depth multiplier. """ + multiplier = global_params.width_coefficient + if not multiplier: + return filters + divisor = global_params.depth_divisor + min_depth = global_params.min_depth + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + if new_filters < 0.9 * filters: # prevent rounding by more than 10% + new_filters += divisor + return int(new_filters) + + +def round_repeats(repeats, global_params): + """ Round number of filters based on depth multiplier. """ + multiplier = global_params.depth_coefficient + if not multiplier: + return repeats + if global_params.depth_trunc == 'round': + return max(1, round(multiplier * repeats)) + else: + return int(math.ceil(multiplier * repeats)) + + +class BlockDecoder(object): + """ + Block Decoder, straight from the official TensorFlow repository. + """ + + @staticmethod + def _decode_block_string(block_string): + """ Gets a block through a string notation of arguments. 
""" + assert isinstance(block_string, str) + + ops = block_string.split('_') + options = {} + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # Check stride + cond_1 = ('s' in options and len(options['s']) == 1) + cond_2 = ((len(options['s']) == 2) and + (options['s'][0] == options['s'][1])) + assert (cond_1 or cond_2) + + return BlockArgs( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + input_filters=int(options['i']), + output_filters=int(options['o']), + expand_ratio=int(options['e']), + id_skip=('noskip' not in block_string), + se_ratio=float(options['se']) if 'se' in options else None, + stride=[int(options['s'][0])]) + + @staticmethod + def _encode_block_string(block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, 'k%d' % block.kernel_size, 's%d%d' % + (block.strides[0], block.strides[1]), 'e%s' % block.expand_ratio, + 'i%d' % block.input_filters, 'o%d' % block.output_filters + ] + if 0 < block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + if block.id_skip is False: + args.append('noskip') + return '_'.join(args) + + @staticmethod + def decode(string_list): + """ + Decode a list of string notations to specify blocks in the network. + + string_list: list of strings, each string is a notation of block + return + list of BlockArgs namedtuples of block args + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(BlockDecoder._decode_block_string(block_string)) + return blocks_args + + @staticmethod + def encode(blocks_args): + """ + Encodes a list of BlockArgs to a list of strings. + + :param blocks_args: a list of BlockArgs namedtuples of block args + :return: a list of strings, each string is a notation of block + """ + block_strings = [] + for block in blocks_args: + block_strings.append(BlockDecoder._encode_block_string(block)) + return block_strings + + +def initial_type(name, use_bias=False): + param_attr = ParamAttr(name=name + "_weights") + if use_bias: + bias_attr = ParamAttr(name=name + "_offset") + else: + bias_attr = False + return param_attr, bias_attr + + +def init_batch_norm_layer(name="batch_norm"): + param_attr = ParamAttr(name=name + "_scale") + bias_attr = ParamAttr(name=name + "_offset") + return param_attr, bias_attr + + +def init_fc_layer(name="fc"): + param_attr = ParamAttr(name=name + "_weights") + bias_attr = ParamAttr(name=name + "_offset") + return param_attr, bias_attr + + +def cal_padding(img_size, stride, filter_size, dilation=1): + """Calculate padding size.""" + if img_size % stride == 0: + out_size = max(filter_size - stride, 0) + else: + out_size = max(filter_size - (img_size % stride), 0) + return out_size // 2, out_size - out_size // 2 + + +inp_shape = { + "b0_small": [224, 112, 112, 56, 28, 14, 14, 7], + "b0": [224, 112, 112, 56, 28, 14, 14, 7], + "b1": [240, 120, 120, 60, 30, 15, 15, 8], + "b2": [260, 130, 130, 65, 33, 17, 17, 9], + "b3": [300, 150, 150, 75, 38, 19, 19, 10], + "b4": [380, 190, 190, 95, 48, 24, 24, 12], + "b5": [456, 228, 228, 114, 57, 29, 29, 15], + "b6": [528, 264, 264, 132, 66, 33, 33, 17], + "b7": [600, 300, 300, 150, 75, 38, 38, 19] +} + + +def _drop_connect(inputs, prob, is_test): + if is_test: + output = inputs + else: + keep_prob = 1.0 - prob + inputs_shape = inputs.shape + random_tensor = keep_prob + paddle.rand( + shape=[inputs_shape[0], 1, 1, 1]) + binary_tensor = paddle.floor(random_tensor) + output = paddle.multiply(inputs, 
binary_tensor) / keep_prob + return output + + +class Conv2ds(TheseusLayer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + padding=0, + groups=None, + name="conv2d", + act=None, + use_bias=False, + padding_type=None, + model_name=None, + cur_stage=None): + super(Conv2ds, self).__init__() + assert act in [None, "swish", "sigmoid"] + self.act = act + + param_attr, bias_attr = initial_type(name=name, use_bias=use_bias) + + def get_padding(filter_size, stride=1, dilation=1): + padding = ((stride - 1) + dilation * (filter_size - 1)) // 2 + return padding + + self.need_crop = False + if padding_type == "SAME": + inps = 1 if model_name == None and cur_stage == None else inp_shape[ + model_name][cur_stage] + top_padding, bottom_padding = cal_padding(inps, stride, + filter_size) + left_padding, right_padding = cal_padding(inps, stride, + filter_size) + height_padding = bottom_padding + width_padding = right_padding + if top_padding != bottom_padding or left_padding != right_padding: + height_padding = top_padding + stride + width_padding = left_padding + stride + self.need_crop = True + padding = [height_padding, width_padding] + elif padding_type == "VALID": + height_padding = 0 + width_padding = 0 + padding = [height_padding, width_padding] + elif padding_type == "DYNAMIC": + padding = get_padding(filter_size, stride) + else: + padding = padding_type + + groups = 1 if groups is None else groups + self._conv = Conv2D( + input_channels, + output_channels, + filter_size, + groups=groups, + stride=stride, + # act=act, + padding=padding, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + x = self._conv(inputs) + if self.act == "swish": + x = F.swish(x) + elif self.act == "sigmoid": + x = F.sigmoid(x) + + if self.need_crop: + x = x[:, :, 1:, 1:] + return x + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + input_channels, + filter_size, + output_channels, + stride=1, + num_groups=1, + global_params=None, + padding_type="SAME", + conv_act=None, + bn_act="swish", + use_bn=True, + use_bias=False, + name=None, + conv_name=None, + bn_name=None, + model_name=None, + cur_stage=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2ds( + input_channels=input_channels, + output_channels=output_channels, + filter_size=filter_size, + stride=stride, + groups=num_groups, + act=conv_act, + padding_type=padding_type, + name=conv_name, + use_bias=use_bias, + model_name=model_name, + cur_stage=cur_stage) + self.use_bn = use_bn + if use_bn is True: + bn_name = name + bn_name + param_attr, bias_attr = init_batch_norm_layer(bn_name) + momentum = global_params.batch_norm_momentum + epsilon = global_params.batch_norm_epsilon + + self._bn = BatchNorm( + num_channels=output_channels, + act=bn_act, + momentum=momentum, + epsilon=epsilon, + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance", + param_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + if self.use_bn: + x = self._conv(inputs) + x = self._bn(x) + return x + else: + return self._conv(inputs) + + +class ExpandConvNorm(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(ExpandConvNorm, self).__init__() + + self.oup = block_args.input_filters * block_args.expand_ratio + self.expand_ratio = block_args.expand_ratio + + if self.expand_ratio != 1: + self._conv = ConvBNLayer( + input_channels, + 1, + self.oup, + 
global_params=global_params, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_expand_conv", + bn_name="_bn0", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + if self.expand_ratio != 1: + return self._conv(inputs) + else: + return inputs + + +class DepthwiseConvNorm(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(DepthwiseConvNorm, self).__init__() + + self.k = block_args.kernel_size + self.s = block_args.stride + if isinstance(self.s, list) or isinstance(self.s, tuple): + self.s = self.s[0] + oup = block_args.input_filters * block_args.expand_ratio + + self._conv = ConvBNLayer( + input_channels, + self.k, + oup, + self.s, + num_groups=input_channels, + global_params=global_params, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_depthwise_conv", + bn_name="_bn1", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class ProjectConvNorm(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(ProjectConvNorm, self).__init__() + + self.final_oup = block_args.output_filters + + self._conv = ConvBNLayer( + input_channels, + 1, + self.final_oup, + global_params=global_params, + bn_act=None, + padding_type=padding_type, + name=name, + conv_name=name + "_project_conv", + bn_name="_bn2", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class SEBlock(TheseusLayer): + def __init__(self, + input_channels, + num_squeezed_channels, + oup, + padding_type, + name=None, + model_name=None, + cur_stage=None): + super(SEBlock, self).__init__() + + self._pool = AdaptiveAvgPool2D(1) + self._conv1 = Conv2ds( + input_channels, + num_squeezed_channels, + 1, + use_bias=True, + padding_type=padding_type, + act="swish", + name=name + "_se_reduce") + + self._conv2 = Conv2ds( + num_squeezed_channels, + oup, + 1, + act="sigmoid", + use_bias=True, + padding_type=padding_type, + name=name + "_se_expand") + + def forward(self, inputs): + x = self._pool(inputs) + x = self._conv1(x) + x = self._conv2(x) + out = paddle.multiply(inputs, x) + return out + + +class MbConvBlock(TheseusLayer): + def __init__(self, + input_channels, + block_args, + global_params, + padding_type, + use_se, + name=None, + drop_connect_rate=None, + model_name=None, + cur_stage=None): + super(MbConvBlock, self).__init__() + + oup = block_args.input_filters * block_args.expand_ratio + self.block_args = block_args + self.has_se = use_se and (block_args.se_ratio is not None) and ( + 0 < block_args.se_ratio <= 1) + self.id_skip = block_args.id_skip + self.expand_ratio = block_args.expand_ratio + self.drop_connect_rate = drop_connect_rate + + if self.expand_ratio != 1: + self._ecn = ExpandConvNorm( + input_channels, + block_args, + global_params, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self._dcn = DepthwiseConvNorm( + input_channels * block_args.expand_ratio, + block_args, + global_params, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + if self.has_se: + num_squeezed_channels = max( + 1, int(block_args.input_filters * block_args.se_ratio)) + self._se = SEBlock( + input_channels * block_args.expand_ratio, + num_squeezed_channels, + oup, + 
padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self._pcn = ProjectConvNorm( + input_channels * block_args.expand_ratio, + block_args, + global_params, + padding_type=padding_type, + name=name, + model_name=model_name, + cur_stage=cur_stage) + + self.final_oup = self._pcn.final_oup + + def forward(self, inputs): + x = inputs + if self.expand_ratio != 1: + x = self._ecn(x) + x = F.swish(x) + + x = self._dcn(x) + x = F.swish(x) + if self.has_se: + x = self._se(x) + x = self._pcn(x) + + if self.id_skip and \ + self.block_args.stride == 1 and \ + self.block_args.input_filters == self.block_args.output_filters: + if self.drop_connect_rate: + x = _drop_connect(x, self.drop_connect_rate, not self.training) + x = paddle.add(x, inputs) + return x + + +class ConvStemNorm(TheseusLayer): + def __init__(self, + input_channels, + padding_type, + _global_params, + name=None, + model_name=None, + fix_stem=False, + cur_stage=None): + super(ConvStemNorm, self).__init__() + + output_channels = 32 if fix_stem else round_filters(32, _global_params) + self._conv = ConvBNLayer( + input_channels, + filter_size=3, + output_channels=output_channels, + stride=2, + global_params=_global_params, + bn_act=None, + padding_type=padding_type, + name="", + conv_name="_conv_stem", + bn_name="_bn0", + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, inputs): + return self._conv(inputs) + + +class ExtractFeatures(TheseusLayer): + def __init__(self, + input_channels, + _block_args, + _global_params, + padding_type, + use_se, + model_name=None, + fix_stem=False): + super(ExtractFeatures, self).__init__() + + self._global_params = _global_params + + self._conv_stem = ConvStemNorm( + input_channels, + padding_type=padding_type, + _global_params=_global_params, + model_name=model_name, + fix_stem=fix_stem, + cur_stage=0) + + self.block_args_copy = copy.deepcopy(_block_args) + idx = 0 + block_size = 0 + for block_arg in self.block_args_copy: + block_arg = block_arg._replace( + input_filters=round_filters(block_arg.input_filters, + _global_params), + output_filters=round_filters(block_arg.output_filters, + _global_params), + num_repeat=round_repeats(block_arg.num_repeat, _global_params)) + block_size += 1 + for _ in range(block_arg.num_repeat - 1): + block_size += 1 + + self.final_oup = None + self.conv_seq = [] + cur_stage = 1 + for block_idx, block_args in enumerate(_block_args): + if not (fix_stem and block_idx == 0): + block_args = block_args._replace(input_filters=round_filters( + block_args.input_filters, _global_params)) + block_args = block_args._replace( + output_filters=round_filters(block_args.output_filters, + _global_params), + num_repeat=round_repeats(block_args.num_repeat, + _global_params)) + + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / block_size + + _mc_block = self.add_sublayer( + "_blocks." + str(idx) + ".", + MbConvBlock( + block_args.input_filters, + block_args=block_args, + global_params=_global_params, + padding_type=padding_type, + use_se=use_se, + name="_blocks." 
+ str(idx) + ".", + drop_connect_rate=drop_connect_rate, + model_name=model_name, + cur_stage=cur_stage)) + self.conv_seq.append(_mc_block) + self.final_oup = _mc_block.final_oup + idx += 1 + if block_args.num_repeat > 1: + block_args = block_args._replace( + input_filters=block_args.output_filters, stride=1) + for _ in range(block_args.num_repeat - 1): + drop_connect_rate = self._global_params.drop_connect_rate + if drop_connect_rate: + drop_connect_rate *= float(idx) / block_size + _mc_block = self.add_sublayer( + "block." + str(idx) + ".", + MbConvBlock( + block_args.input_filters, + block_args, + global_params=_global_params, + padding_type=padding_type, + use_se=use_se, + name="_blocks." + str(idx) + ".", + drop_connect_rate=drop_connect_rate, + model_name=model_name, + cur_stage=cur_stage)) + self.conv_seq.append(_mc_block) + self.final_oup = _mc_block.final_oup + idx += 1 + cur_stage += 1 + + def forward(self, inputs): + x = self._conv_stem(inputs) + x = F.swish(x) + for _mc_block in self.conv_seq: + x = _mc_block(x) + return x + + +class EfficientNet(TheseusLayer): + def __init__(self, + block_args, + global_params, + name="b0", + padding_type="SAME", + use_se=True, + fix_stem=False, + num_features=None, + class_num=1000): + super(EfficientNet, self).__init__() + + self.name = name + self.fix_stem = fix_stem + self._block_args = block_args + self._global_params = global_params + self.padding_type = padding_type + self.use_se = use_se + + self._ef = ExtractFeatures( + 3, + self._block_args, + self._global_params, + self.padding_type, + self.use_se, + model_name=self.name, + fix_stem=self.fix_stem) + + output_channels = num_features or round_filters(1280, + self._global_params) + self._conv = ConvBNLayer( + self._ef.final_oup, + 1, + output_channels, + global_params=self._global_params, + bn_act="swish", + padding_type=self.padding_type, + name="", + conv_name="_conv_head", + bn_name="_bn1", + model_name=self.name, + cur_stage=7) + self._pool = AdaptiveAvgPool2D(1) + + if self._global_params.dropout_rate: + self._drop = Dropout( + p=self._global_params.dropout_rate, mode="upscale_in_train") + + param_attr, bias_attr = init_fc_layer("_fc") + self._fc = Linear( + output_channels, + class_num, + weight_attr=param_attr, + bias_attr=bias_attr) + + def forward(self, inputs): + x = self._ef(inputs) + x = self._conv(x) + x = self._pool(x) + if self._global_params.dropout_rate: + x = self._drop(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def EfficientNetB0_small(padding_type='DYNAMIC', + override_params=None, + use_se=False, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b0-small", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b0', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB0_small"]) + return model + + +def EfficientNetB0(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b0", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b0', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB0"]) + return model + + +def EfficientNetB1(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b1", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b1', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB1"]) + return model + + +def EfficientNetB2(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b2", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b2', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB2"]) + return model + + +def EfficientNetB3(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b3", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b3', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB3"]) + return model + + +def EfficientNetB4(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b4", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b4', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB4"]) + return model + + +def EfficientNetB5(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b5", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b5', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB5"]) + return model + + +def EfficientNetB6(padding_type='SAME', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b6", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b6', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB6"]) + return model + + +def EfficientNetB7(padding_type='SAME', + override_params=None, + use_se=True, + 
pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("efficientnet-b7", + override_params) + model = EfficientNet( + block_args, + global_params, + name='b7', + padding_type=padding_type, + use_se=use_se, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB7"]) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet_v2.py new file mode 100644 index 000000000..f620d895d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/efficientnet_v2.py @@ -0,0 +1,994 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/lukemelas/EfficientNet-PyTorch +# reference: https://arxiv.org/abs/1905.11946 + +import math +import re + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import Constant, Normal, Uniform +from paddle.regularizer import L2Decay + +from ....utils.config import AttrDict + +from ....utils.save_load import (load_dygraph_pretrain, + load_dygraph_pretrain) + +MODEL_URLS = { + "EfficientNetV2_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_S_pretrained.pdparams", + "EfficientNetV2_M": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_M_pretrained.pdparams", + "EfficientNetV2_L": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_L_pretrained.pdparams", + "EfficientNetV2_XL": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/EfficientNetV2_XL_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +inp_shape = { + "efficientnetv2-s": [384, 192, 192, 96, 48, 24, 24, 12], + "efficientnetv2-m": [384, 192, 192, 96, 48, 24, 24, 12], + "efficientnetv2-l": [384, 192, 192, 96, 48, 24, 24, 12], + "efficientnetv2-xl": [384, 192, 192, 96, 48, 24, 24, 12], +} + + +def cal_padding(img_size, stride, kernel_size): + """Calculate padding size.""" + if img_size % stride == 0: + out_size = max(kernel_size - stride, 0) + else: + out_size = max(kernel_size - (img_size % stride), 0) + return out_size // 2, out_size - out_size // 2 + + +class Conv2ds(nn.Layer): + """Customed Conv2D with tensorflow's padding style + + Args: + input_channels (int): input channels + output_channels (int): output channels + kernel_size (int): filter size + stride (int, optional): stride. Defaults to 1. + padding (int, optional): padding. Defaults to 0. + groups (int, optional): groups. Defaults to None. + act (str, optional): act. Defaults to None. + use_bias (bool, optional): use_bias. Defaults to None. + padding_type (str, optional): padding_type. Defaults to None. + model_name (str, optional): model name. Defaults to None. + cur_stage (int, optional): current stage. 
Defaults to None. + + Returns: + nn.Layer: Customed Conv2D instance + """ + + def __init__(self, + input_channels: int, + output_channels: int, + kernel_size: int, + stride=1, + padding=0, + groups=None, + act=None, + use_bias=None, + padding_type=None, + model_name=None, + cur_stage=None): + super(Conv2ds, self).__init__() + assert act in [None, "swish", "sigmoid"] + self._act = act + + def get_padding(kernel_size, stride=1, dilation=1): + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + inps = inp_shape[model_name][cur_stage] + self.need_crop = False + if padding_type == "SAME": + top_padding, bottom_padding = cal_padding(inps, stride, + kernel_size) + left_padding, right_padding = cal_padding(inps, stride, + kernel_size) + height_padding = bottom_padding + width_padding = right_padding + if top_padding != bottom_padding or left_padding != right_padding: + height_padding = top_padding + stride + width_padding = left_padding + stride + self.need_crop = True + padding = [height_padding, width_padding] + elif padding_type == "VALID": + height_padding = 0 + width_padding = 0 + padding = [height_padding, width_padding] + elif padding_type == "DYNAMIC": + padding = get_padding(kernel_size, stride) + else: + padding = padding_type + + groups = 1 if groups is None else groups + self._conv = nn.Conv2D( + input_channels, + output_channels, + kernel_size, + groups=groups, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=use_bias + if not use_bias else ParamAttr(regularizer=L2Decay(0.0))) + + def forward(self, inputs): + x = self._conv(inputs) + if self._act == "swish": + x = F.swish(x) + elif self._act == "sigmoid": + x = F.sigmoid(x) + + if self.need_crop: + x = x[:, :, 1:, 1:] + return x + + +class BlockDecoder(object): + """Block Decoder for readability.""" + + def _decode_block_string(self, block_string): + """Gets a block through a string notation of arguments.""" + assert isinstance(block_string, str) + ops = block_string.split('_') + options = AttrDict() + for op in ops: + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + t = AttrDict( + kernel_size=int(options['k']), + num_repeat=int(options['r']), + in_channels=int(options['i']), + out_channels=int(options['o']), + expand_ratio=int(options['e']), + se_ratio=float(options['se']) if 'se' in options else None, + strides=int(options['s']), + conv_type=int(options['c']) if 'c' in options else 0, ) + return t + + def _encode_block_string(self, block): + """Encodes a block to a string.""" + args = [ + 'r%d' % block.num_repeat, + 'k%d' % block.kernel_size, + 's%d' % block.strides, + 'e%s' % block.expand_ratio, + 'i%d' % block.in_channels, + 'o%d' % block.out_channels, + 'c%d' % block.conv_type, + 'f%d' % block.fused_conv, + ] + if block.se_ratio > 0 and block.se_ratio <= 1: + args.append('se%s' % block.se_ratio) + return '_'.join(args) + + def decode(self, string_list): + """Decodes a list of string notations to specify blocks inside the network. + + Args: + string_list: a list of strings, each string is a notation of block. + + Returns: + A list of namedtuples to represent blocks arguments. + """ + assert isinstance(string_list, list) + blocks_args = [] + for block_string in string_list: + blocks_args.append(self._decode_block_string(block_string)) + return blocks_args + + def encode(self, blocks_args): + """Encodes a list of Blocks to a list of strings. + + Args: + blocks_args: A list of namedtuples to represent blocks arguments. 
+ Returns: + a list of strings, each string is a notation of block. + """ + block_strings = [] + for block in blocks_args: + block_strings.append(self._encode_block_string(block)) + return block_strings + + +#################### EfficientNet V2 configs #################### +v2_base_block = [ # The baseline config for v2 models. + "r1_k3_s1_e1_i32_o16_c1", + "r2_k3_s2_e4_i16_o32_c1", + "r2_k3_s2_e4_i32_o48_c1", + "r3_k3_s2_e4_i48_o96_se0.25", + "r5_k3_s1_e6_i96_o112_se0.25", + "r8_k3_s2_e6_i112_o192_se0.25", +] + +v2_s_block = [ # about base * (width1.4, depth1.8) + "r2_k3_s1_e1_i24_o24_c1", + "r4_k3_s2_e4_i24_o48_c1", + "r4_k3_s2_e4_i48_o64_c1", + "r6_k3_s2_e4_i64_o128_se0.25", + "r9_k3_s1_e6_i128_o160_se0.25", + "r15_k3_s2_e6_i160_o256_se0.25", +] + +v2_m_block = [ # about base * (width1.6, depth2.2) + "r3_k3_s1_e1_i24_o24_c1", + "r5_k3_s2_e4_i24_o48_c1", + "r5_k3_s2_e4_i48_o80_c1", + "r7_k3_s2_e4_i80_o160_se0.25", + "r14_k3_s1_e6_i160_o176_se0.25", + "r18_k3_s2_e6_i176_o304_se0.25", + "r5_k3_s1_e6_i304_o512_se0.25", +] + +v2_l_block = [ # about base * (width2.0, depth3.1) + "r4_k3_s1_e1_i32_o32_c1", + "r7_k3_s2_e4_i32_o64_c1", + "r7_k3_s2_e4_i64_o96_c1", + "r10_k3_s2_e4_i96_o192_se0.25", + "r19_k3_s1_e6_i192_o224_se0.25", + "r25_k3_s2_e6_i224_o384_se0.25", + "r7_k3_s1_e6_i384_o640_se0.25", +] + +v2_xl_block = [ # only for 21k pretraining. + "r4_k3_s1_e1_i32_o32_c1", + "r8_k3_s2_e4_i32_o64_c1", + "r8_k3_s2_e4_i64_o96_c1", + "r16_k3_s2_e4_i96_o192_se0.25", + "r24_k3_s1_e6_i192_o256_se0.25", + "r32_k3_s2_e6_i256_o512_se0.25", + "r8_k3_s1_e6_i512_o640_se0.25", +] +efficientnetv2_params = { + # params: (block, width, depth, dropout) + "efficientnetv2-s": + (v2_s_block, 1.0, 1.0, np.linspace(0.1, 0.3, 4).tolist()), + "efficientnetv2-m": (v2_m_block, 1.0, 1.0, 0.3), + "efficientnetv2-l": (v2_l_block, 1.0, 1.0, 0.4), + "efficientnetv2-xl": (v2_xl_block, 1.0, 1.0, 0.4), +} + + +def efficientnetv2_config(model_name: str): + """EfficientNetV2 model config.""" + block, width, depth, dropout = efficientnetv2_params[model_name] + + cfg = AttrDict(model=AttrDict( + model_name=model_name, + blocks_args=BlockDecoder().decode(block), + width_coefficient=width, + depth_coefficient=depth, + dropout_rate=dropout, + feature_size=1280, + bn_momentum=0.9, + bn_epsilon=1e-3, + depth_divisor=8, + min_depth=8, + act_fn="silu", + survival_prob=0.8, + local_pooling=False, + conv_dropout=0, + num_classes=1000)) + return cfg + + +def get_model_config(model_name: str): + """Main entry for model name to config.""" + if model_name.startswith("efficientnetv2-"): + return efficientnetv2_config(model_name) + raise ValueError(f"Unknown model_name {model_name}") + + +################################################################################ + + +def round_filters(filters, + width_coefficient, + depth_divisor, + min_depth, + skip=False): + """Round number of filters based on depth multiplier.""" + multiplier = width_coefficient + divisor = depth_divisor + min_depth = min_depth + if skip or not multiplier: + return filters + + filters *= multiplier + min_depth = min_depth or divisor + new_filters = max(min_depth, + int(filters + divisor / 2) // divisor * divisor) + return int(new_filters) + + +def round_repeats(repeats, multiplier, skip=False): + """Round number of filters based on depth multiplier.""" + if skip or not multiplier: + return repeats + return int(math.ceil(multiplier * repeats)) + + +def activation_fn(act_fn: str): + """Customized non-linear activation type.""" + if not act_fn: + return nn.Silu() + elif 
act_fn in ("silu", "swish"): + return nn.Swish() + elif act_fn == "relu": + return nn.ReLU() + elif act_fn == "relu6": + return nn.ReLU6() + elif act_fn == "elu": + return nn.ELU() + elif act_fn == "leaky_relu": + return nn.LeakyReLU() + elif act_fn == "selu": + return nn.SELU() + elif act_fn == "mish": + return nn.Mish() + else: + raise ValueError("Unsupported act_fn {}".format(act_fn)) + + +def drop_path(x, training=False, survival_prob=1.0): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if not training: + return x + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + keep_prob = paddle.to_tensor(survival_prob, dtype=x.dtype) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class SE(nn.Layer): + """Squeeze-and-excitation layer. + + Args: + local_pooling (bool): local_pooling + act_fn (str): act_fn + in_channels (int): in_channels + se_channels (int): se_channels + out_channels (int): out_channels + cur_stage (int): cur_stage + padding_type (str): padding_type + model_name (str): model_name + """ + + def __init__(self, + local_pooling: bool, + act_fn: str, + in_channels: int, + se_channels: int, + out_channels: int, + cur_stage: int, + padding_type: str, + model_name: str): + super(SE, self).__init__() + + self._local_pooling = local_pooling + self._act = activation_fn(act_fn) + + # Squeeze and Excitation layer. + self._se_reduce = Conv2ds( + in_channels, + se_channels, + 1, + stride=1, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._se_expand = Conv2ds( + se_channels, + out_channels, + 1, + stride=1, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + + def forward(self, x): + if self._local_pooling: + se_tensor = F.adaptive_avg_pool2d(x, output_size=1) + else: + se_tensor = paddle.mean(x, axis=[2, 3], keepdim=True) + se_tensor = self._se_expand(self._act(self._se_reduce(se_tensor))) + return F.sigmoid(se_tensor) * x + + +class MBConvBlock(nn.Layer): + """A class of MBConv: Mobile Inverted Residual Bottleneck. 
+ + Args: + se_ratio (int): se_ratio + in_channels (int): in_channels + expand_ratio (int): expand_ratio + kernel_size (int): kernel_size + strides (int): strides + out_channels (int): out_channels + bn_momentum (float): bn_momentum + bn_epsilon (float): bn_epsilon + local_pooling (bool): local_pooling + conv_dropout (float): conv_dropout + cur_stage (int): cur_stage + padding_type (str): padding_type + model_name (str): model_name + """ + + def __init__(self, + se_ratio: int, + in_channels: int, + expand_ratio: int, + kernel_size: int, + strides: int, + out_channels: int, + bn_momentum: float, + bn_epsilon: float, + local_pooling: bool, + conv_dropout: float, + cur_stage: int, + padding_type: str, + model_name: str): + super(MBConvBlock, self).__init__() + + self.se_ratio = se_ratio + self.in_channels = in_channels + self.expand_ratio = expand_ratio + self.kernel_size = kernel_size + self.strides = strides + self.out_channels = out_channels + + self.bn_momentum = bn_momentum + self.bn_epsilon = bn_epsilon + + self._local_pooling = local_pooling + self.act_fn = None + self.conv_dropout = conv_dropout + + self._act = activation_fn(None) + self._has_se = (self.se_ratio is not None and 0 < self.se_ratio <= 1) + """Builds block according to the arguments.""" + expand_channels = self.in_channels * self.expand_ratio + kernel_size = self.kernel_size + + # Expansion phase. Called if not using fused convolutions and expansion + # phase is necessary. + if self.expand_ratio != 1: + self._expand_conv = Conv2ds( + self.in_channels, + expand_channels, + 1, + stride=1, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm0 = nn.BatchNorm2D( + expand_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + # Depth-wise convolution phase. Called if not using fused convolutions. + self._depthwise_conv = Conv2ds( + expand_channels, + expand_channels, + kernel_size, + padding=kernel_size // 2, + stride=self.strides, + groups=expand_channels, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + + self._norm1 = nn.BatchNorm2D( + expand_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + if self._has_se: + num_reduced_filters = max(1, int(self.in_channels * self.se_ratio)) + self._se = SE(self._local_pooling, None, expand_channels, + num_reduced_filters, expand_channels, cur_stage, + padding_type, model_name) + else: + self._se = None + + # Output phase. + self._project_conv = Conv2ds( + expand_channels, + self.out_channels, + 1, + stride=1, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm2 = nn.BatchNorm2D( + self.out_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.drop_out = nn.Dropout(self.conv_dropout) + + def residual(self, inputs, x, survival_prob): + if (self.strides == 1 and self.in_channels == self.out_channels): + # Apply only if skip connection presents. + if survival_prob: + x = drop_path(x, self.training, survival_prob) + x = paddle.add(x, inputs) + + return x + + def forward(self, inputs, survival_prob=None): + """Implementation of call(). + + Args: + inputs: the inputs tensor. + survival_prob: float, between 0 to 1, drop connect rate. 
+ + Returns: + A output tensor. + """ + x = inputs + if self.expand_ratio != 1: + x = self._act(self._norm0(self._expand_conv(x))) + + x = self._act(self._norm1(self._depthwise_conv(x))) + + if self.conv_dropout and self.expand_ratio > 1: + x = self.drop_out(x) + + if self._se: + x = self._se(x) + + x = self._norm2(self._project_conv(x)) + x = self.residual(inputs, x, survival_prob) + + return x + + +class FusedMBConvBlock(MBConvBlock): + """Fusing the proj conv1x1 and depthwise_conv into a conv2d.""" + + def __init__(self, se_ratio, in_channels, expand_ratio, kernel_size, + strides, out_channels, bn_momentum, bn_epsilon, local_pooling, + conv_dropout, cur_stage, padding_type, model_name): + """Builds block according to the arguments.""" + super(MBConvBlock, self).__init__() + self.se_ratio = se_ratio + self.in_channels = in_channels + self.expand_ratio = expand_ratio + self.kernel_size = kernel_size + self.strides = strides + self.out_channels = out_channels + + self.bn_momentum = bn_momentum + self.bn_epsilon = bn_epsilon + + self._local_pooling = local_pooling + self.act_fn = None + self.conv_dropout = conv_dropout + + self._act = activation_fn(None) + self._has_se = (self.se_ratio is not None and 0 < self.se_ratio <= 1) + + expand_channels = self.in_channels * self.expand_ratio + kernel_size = self.kernel_size + if self.expand_ratio != 1: + # Expansion phase: + self._expand_conv = Conv2ds( + self.in_channels, + expand_channels, + kernel_size, + padding=kernel_size // 2, + stride=self.strides, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm0 = nn.BatchNorm2D( + expand_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + + if self._has_se: + num_reduced_filters = max(1, int(self.in_channels * self.se_ratio)) + self._se = SE(self._local_pooling, None, expand_channels, + num_reduced_filters, expand_channels, cur_stage, + padding_type, model_name) + else: + self._se = None + + # Output phase: + self._project_conv = Conv2ds( + expand_channels, + self.out_channels, + 1 if (self.expand_ratio != 1) else kernel_size, + padding=(1 if (self.expand_ratio != 1) else kernel_size) // 2, + stride=1 if (self.expand_ratio != 1) else self.strides, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm1 = nn.BatchNorm2D( + self.out_channels, + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self.drop_out = nn.Dropout(conv_dropout) + + def forward(self, inputs, survival_prob=None): + """Implementation of call(). + + Args: + inputs: the inputs tensor. + training: boolean, whether the model is constructed for training. + survival_prob: float, between 0 to 1, drop connect rate. + + Returns: + A output tensor. + """ + x = inputs + if self.expand_ratio != 1: + x = self._act(self._norm0(self._expand_conv(x))) + + if self.conv_dropout and self.expand_ratio > 1: + x = self.drop_out(x) + + if self._se: + x = self._se(x) + + x = self._norm1(self._project_conv(x)) + if self.expand_ratio == 1: + x = self._act(x) # add act if no expansion. 
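+        # residual() applies the skip connection (with optional stochastic
+        # depth) only when strides == 1 and in_channels == out_channels.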
+ + x = self.residual(inputs, x, survival_prob) + return x + + +class Stem(nn.Layer): + """Stem layer at the begining of the network.""" + + def __init__(self, width_coefficient, depth_divisor, min_depth, skip, + bn_momentum, bn_epsilon, act_fn, stem_channels, cur_stage, + padding_type, model_name): + super(Stem, self).__init__() + self._conv_stem = Conv2ds( + 3, + round_filters(stem_channels, width_coefficient, depth_divisor, + min_depth, skip), + 3, + padding=1, + stride=2, + use_bias=False, + padding_type=padding_type, + model_name=model_name, + cur_stage=cur_stage) + self._norm = nn.BatchNorm2D( + round_filters(stem_channels, width_coefficient, depth_divisor, + min_depth, skip), + bn_momentum, + bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self._act = activation_fn(act_fn) + + def forward(self, inputs): + return self._act(self._norm(self._conv_stem(inputs))) + + +class Head(nn.Layer): + """Head layer for network outputs.""" + + def __init__(self, + in_channels, + feature_size, + bn_momentum, + bn_epsilon, + act_fn, + dropout_rate, + local_pooling, + width_coefficient, + depth_divisor, + min_depth, + skip=False): + super(Head, self).__init__() + self.in_channels = in_channels + self.feature_size = feature_size + self.bn_momentum = bn_momentum + self.bn_epsilon = bn_epsilon + self.dropout_rate = dropout_rate + self._local_pooling = local_pooling + self._conv_head = nn.Conv2D( + in_channels, + round_filters(self.feature_size or 1280, width_coefficient, + depth_divisor, min_depth, skip), + kernel_size=1, + stride=1, + bias_attr=False) + self._norm = nn.BatchNorm2D( + round_filters(self.feature_size or 1280, width_coefficient, + depth_divisor, min_depth, skip), + self.bn_momentum, + self.bn_epsilon, + weight_attr=ParamAttr(regularizer=L2Decay(0.0)), + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + self._act = activation_fn(act_fn) + + self._avg_pooling = nn.AdaptiveAvgPool2D(output_size=1) + + if isinstance(self.dropout_rate, + (list, tuple)) or self.dropout_rate > 0: + self._dropout = nn.Dropout(self.dropout_rate[0] if isinstance( + self.dropout_rate, (list, tuple)) else self.dropout_rate) + else: + self._dropout = None + + def forward(self, x): + """Call the layer.""" + outputs = self._act(self._norm(self._conv_head(x))) + + if self._local_pooling: + outputs = F.adaptive_avg_pool2d(outputs, output_size=1) + if self._dropout: + outputs = self._dropout(outputs) + if self._fc: + outputs = paddle.squeeze(outputs, axis=[2, 3]) + outputs = self._fc(outputs) + else: + outputs = self._avg_pooling(outputs) + if self._dropout: + outputs = self._dropout(outputs) + return paddle.flatten(outputs, start_axis=1) + + +class EfficientNetV2(nn.Layer): + """A class implements tf.keras.Model. + + Reference: https://arxiv.org/abs/1807.11626 + """ + + def __init__(self, + model_name, + blocks_args=None, + mconfig=None, + include_top=True, + class_num=1000, + padding_type="SAME"): + """Initializes an `Model` instance. + + Args: + model_name: A string of model name. + model_config: A dict of model configurations or a string of hparams. + Raises: + ValueError: when blocks_args is not specified as a list. + """ + super(EfficientNetV2, self).__init__() + self.blocks_args = blocks_args + self.mconfig = mconfig + """Builds a model.""" + self._blocks = nn.LayerList() + + cur_stage = 0 + # Stem part. 
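+        # The stem is a 3x3 stride-2 convolution over the RGB input,
+        # followed by batch norm and the configured activation.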
+ self._stem = Stem( + self.mconfig.width_coefficient, + self.mconfig.depth_divisor, + self.mconfig.min_depth, + False, + self.mconfig.bn_momentum, + self.mconfig.bn_epsilon, + self.mconfig.act_fn, + stem_channels=self.blocks_args[0].in_channels, + cur_stage=cur_stage, + padding_type=padding_type, + model_name=model_name) + cur_stage += 1 + + # Builds blocks. + for block_args in self.blocks_args: + assert block_args.num_repeat > 0 + # Update block input and output filters based on depth multiplier. + in_channels = round_filters( + block_args.in_channels, self.mconfig.width_coefficient, + self.mconfig.depth_divisor, self.mconfig.min_depth, False) + out_channels = round_filters( + block_args.out_channels, self.mconfig.width_coefficient, + self.mconfig.depth_divisor, self.mconfig.min_depth, False) + + repeats = round_repeats(block_args.num_repeat, + self.mconfig.depth_coefficient) + block_args.update( + dict( + in_channels=in_channels, + out_channels=out_channels, + num_repeat=repeats)) + + # The first block needs to take care of stride and filter size increase. + conv_block = { + 0: MBConvBlock, + 1: FusedMBConvBlock + }[block_args.conv_type] + self._blocks.append( + conv_block(block_args.se_ratio, block_args.in_channels, + block_args.expand_ratio, block_args.kernel_size, + block_args.strides, block_args.out_channels, + self.mconfig.bn_momentum, self.mconfig.bn_epsilon, + self.mconfig.local_pooling, self.mconfig. + conv_dropout, cur_stage, padding_type, model_name)) + if block_args.num_repeat > 1: # rest of blocks with the same block_arg + block_args.in_channels = block_args.out_channels + block_args.strides = 1 + for _ in range(block_args.num_repeat - 1): + self._blocks.append( + conv_block( + block_args.se_ratio, block_args.in_channels, + block_args.expand_ratio, block_args.kernel_size, + block_args.strides, block_args.out_channels, + self.mconfig.bn_momentum, self.mconfig.bn_epsilon, + self.mconfig.local_pooling, self.mconfig.conv_dropout, + cur_stage, padding_type, model_name)) + cur_stage += 1 + + # Head part. + self._head = Head( + self.blocks_args[-1].out_channels, self.mconfig.feature_size, + self.mconfig.bn_momentum, self.mconfig.bn_epsilon, + self.mconfig.act_fn, self.mconfig.dropout_rate, + self.mconfig.local_pooling, self.mconfig.width_coefficient, + self.mconfig.depth_divisor, self.mconfig.min_depth, False) + + # top part for classification + if include_top and class_num: + self._fc = nn.Linear( + self.mconfig.feature_size, + class_num, + bias_attr=ParamAttr(regularizer=L2Decay(0.0))) + else: + self._fc = None + + # initialize weight + def _init_weights(m): + if isinstance(m, nn.Conv2D): + out_filters, in_channels, kernel_height, kernel_width = m.weight.shape + if in_channels == 1 and out_filters > in_channels: + out_filters = in_channels + fan_out = int(kernel_height * kernel_width * out_filters) + Normal(mean=0.0, std=np.sqrt(2.0 / fan_out))(m.weight) + elif isinstance(m, nn.Linear): + init_range = 1.0 / np.sqrt(m.weight.shape[1]) + Uniform(-init_range, init_range)(m.weight) + Constant(0.0)(m.bias) + + self.apply(_init_weights) + + def forward(self, inputs): + # Calls Stem layers + outputs = self._stem(inputs) + # print(f"stem: {outputs.mean().item():.10f}") + + # Calls blocks. 
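+        # Stochastic depth: the survival probability decays linearly with
+        # the block index, from 1.0 at the first block towards
+        # mconfig.survival_prob at the last one.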
+ for idx, block in enumerate(self._blocks): + survival_prob = self.mconfig.survival_prob + if survival_prob: + drop_rate = 1.0 - survival_prob + survival_prob = 1.0 - drop_rate * float(idx) / len( + self._blocks) + outputs = block(outputs, survival_prob=survival_prob) + + # Head to obtain the final feature. + outputs = self._head(outputs) + # Calls final dense layers and returns logits. + if self._fc: + outputs = self._fc(outputs) + + return outputs + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def EfficientNetV2_S(include_top=True, pretrained=False, **kwargs): + """Get a V2 model instance. + + Returns: + nn.Layer: A single model instantce + """ + model_name = "efficientnetv2-s" + model_config = efficientnetv2_config(model_name) + model = EfficientNetV2(model_name, model_config.model.blocks_args, + model_config.model, include_top, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_S"]) + return model + + +def EfficientNetV2_M(include_top=True, pretrained=False, **kwargs): + """Get a V2 model instance. + + Returns: + nn.Layer: A single model instantce + """ + model_name = "efficientnetv2-m" + model_config = efficientnetv2_config(model_name) + model = EfficientNetV2(model_name, model_config.model.blocks_args, + model_config.model, include_top, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_M"]) + return model + + +def EfficientNetV2_L(include_top=True, pretrained=False, **kwargs): + """Get a V2 model instance. + + Returns: + nn.Layer: A single model instantce + """ + model_name = "efficientnetv2-l" + model_config = efficientnetv2_config(model_name) + model = EfficientNetV2(model_name, model_config.model.blocks_args, + model_config.model, include_top, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_L"]) + return model + + +def EfficientNetV2_XL(include_top=True, pretrained=False, **kwargs): + """Get a V2 model instance. + + Returns: + nn.Layer: A single model instantce + """ + model_name = "efficientnetv2-xl" + model_config = efficientnetv2_config(model_name) + model = EfficientNetV2(model_name, model_config.model.blocks_args, + model_config.model, include_top, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetV2_XL"]) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/fasternet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/fasternet.py new file mode 100644 index 000000000..4389d7c29 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/fasternet.py @@ -0,0 +1,399 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2303.03667 + +import os +import math +import copy +import warnings + +import paddle +import paddle.nn as nn + +from .vision_transformer import trunc_normal_, zeros_, ones_ +from ....utils.save_load import load_dygraph_pretrain +from ..model_zoo.vision_transformer import DropPath + +MODEL_URLS = { + "FasterNet_T0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_T0_pretrained.pdparams", + "FasterNet_T1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_T1_pretrained.pdparams", + "FasterNet_T2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_T2_pretrained.pdparams", + "FasterNet_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_S_pretrained.pdparams", + "FasterNet_M": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_M_pretrained.pdparams", + "FasterNet_L": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/FasterNet_L_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() + +NET_CONFIG = { + "FasterNet_T0": + [3, 40, [1, 2, 8, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.0, 0, 'BN', 'GELU'], + "FasterNet_T1": [ + 3, 64, [1, 2, 8, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.02, 0, 'BN', + 'GELU' + ], + "FasterNet_T2": [ + 3, 96, [1, 2, 8, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.05, 0, 'BN', + 'RELU' + ], + "FasterNet_S": [ + 3, 128, [1, 2, 13, 2], 2, 4, 4, 4, 2, 2, True, 1280, 0.1, 0, 'BN', + 'RELU' + ], + "FasterNet_M": [ + 3, 144, [3, 4, 18, 3], 2, 4, 4, 4, 2, 2, True, 1280, 0.2, 0, 'BN', + 'RELU' + ], + "FasterNet_L": [ + 3, 192, [3, 4, 18, 3], 2, 4, 4, 4, 2, 2, True, 1280, 0.3, 0, 'BN', + 'RELU' + ], +} + + +class PartialConv(nn.Layer): + def __init__(self, dim: int, n_div: int, forward: str): + super().__init__() + self.dim_conv3 = dim // n_div + self.dim_untouched = dim - self.dim_conv3 + self.partial_conv3 = nn.Conv2D( + in_channels=self.dim_conv3, + out_channels=self.dim_conv3, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + if forward == 'slicing': + self.forward = self.forward_slicing + elif forward == 'split_cat': + self.forward = self.forward_split_cat + else: + raise NotImplementedError( + f"Forward method '{forward}' is not implemented.") + + def forward_slicing(self, x): + x = x.clone() + x[:, :self.dim_conv3, :, :] = self.partial_conv3( + x[:, :self.dim_conv3, :, :]) + return x + + def forward_split_cat(self, x): + x1, x2 = paddle.split( + x=x, num_or_sections=[self.dim_conv3, self.dim_untouched], axis=1) + x1 = self.partial_conv3(x1) + x = paddle.concat(x=(x1, x2), axis=1) + return x + + +class MLPBlock(nn.Layer): + def __init__(self, dim, n_div, mlp_ratio, drop_path, + layer_scale_init_value, act_layer, norm_layer, pconv_fw_type): + super().__init__() + self.dim = dim + self.mlp_ratio = mlp_ratio + if drop_path > 0.: + self.drop_path = DropPath(drop_path) + else: + self.drop_path = nn.Identity() + self.n_div = n_div + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_layer = [ + nn.Conv2D( + in_channels=dim, + out_channels=mlp_hidden_dim, + kernel_size=1, + bias_attr=False), norm_layer(mlp_hidden_dim), act_layer(), + nn.Conv2D( + in_channels=mlp_hidden_dim, + out_channels=dim, + kernel_size=1, + bias_attr=False) + ] + self.mlp = nn.Sequential(*mlp_layer) + self.spatial_mixing = PartialConv(dim, n_div, pconv_fw_type) + if layer_scale_init_value > 0: + self.layer_scale = ( + paddle.base.framework.EagerParamBase.from_tensor( 
+ tensor=layer_scale_init_value * paddle.ones(shape=dim), + trainable=True)) + self.forward = self.forward_layer_scale + else: + self.forward = self.forward + + def forward(self, x): + shortcut = x + x = self.spatial_mixing(x) + x = shortcut + self.drop_path(self.mlp(x)) + return x + + def forward_layer_scale(self, x): + shortcut = x + x = self.spatial_mixing(x) + x = shortcut + self.drop_path( + self.layer_scale.unsqueeze(axis=-1).unsqueeze(axis=-1) * + self.mlp(x)) + return x + + +class BasicStage(nn.Layer): + def __init__(self, dim, depth, n_div, mlp_ratio, drop_path, + layer_scale_init_value, norm_layer, act_layer, pconv_fw_type): + super().__init__() + blocks_list = [ + MLPBlock( + dim=dim, + n_div=n_div, + mlp_ratio=mlp_ratio, + drop_path=drop_path[i], + layer_scale_init_value=layer_scale_init_value, + norm_layer=norm_layer, + act_layer=act_layer, + pconv_fw_type=pconv_fw_type) for i in range(depth) + ] + self.blocks = nn.Sequential(*blocks_list) + + def forward(self, x): + x = self.blocks(x) + return x + + +class PatchEmbed(nn.Layer): + def __init__(self, patch_size, patch_stride, in_chans, embed_dim, + norm_layer): + super().__init__() + self.proj = nn.Conv2D( + in_channels=in_chans, + out_channels=embed_dim, + kernel_size=patch_size, + stride=patch_stride, + bias_attr=False) + if norm_layer is not None: + self.norm = norm_layer(embed_dim, momentum=0.1) + else: + self.norm = nn.Identity() + + def forward(self, x): + x = self.norm(self.proj(x)) + return x + + +class PatchMerging(nn.Layer): + def __init__(self, patch_size_t, patch_stride_t, dim, norm_layer): + super().__init__() + self.reduction = nn.Conv2D( + in_channels=dim, + out_channels=2 * dim, + kernel_size=patch_size_t, + stride=patch_stride_t, + bias_attr=False) + if norm_layer is not None: + self.norm = norm_layer(2 * dim) + else: + self.norm = nn.Identity() + + def forward(self, x): + x = self.norm(self.reduction(x)) + return x + + +class FasterNet(nn.Layer): + """ + FasterNet + Args: + in_chans: int=3. Number of input channels. Default value is 3. + embed_dim: int=96. The dimension of embedding. Default value is 96. + depths: tuple=(1, 2, 8, 2). The depth of each stage. Default value is (1, 2, 8, 2). + mlp_ratio: float=2.0. The ratio of hidden dimension to embedding dimension. Default value is 2.0. + n_div: int=4. The number of divisions in the spatial dimension. Default value is 4. + patch_size: int=4. The size of patch. Default value is 4. + patch_stride: int=4. The stride of patch. Default value is 4. + patch_size_t: int=2. The size of patch for merging. Default value is 2. + patch_stride_t: int=2. The stride of patch for merging. Default value is 2. + patch_norm: bool=True. Whether to use patch normalization. Default value is True. + feature_dim: int=1280. The dimension of feature. Default value is 1280. + drop_path_rate: float=0.1. The drop path rate. Default value is 0.1. + layer_scale_init_value: float=0.0. The initial value of layer scale. Default value is 0.0. + norm_layer: str='BN'. The type of normalization layer. Default value is 'BN'. + act_layer: str='RELU'. The type of activation layer. Default value is 'RELU'. + class_num: int=1000. The number of classes. Default value is 1000. + fork_feat: bool=False. Whether to return feature maps. Default value is False. + pretrained: str=None. The path of pretrained model. Default value is None. + pconv_fw_type: str='split_cat'. The type of partial convolution forward. Default value is 'split_cat'. + scale: float=1.0. 
The coefficient that controls the size of network parameters. + Returns: + model: nn.Layer. Specific FasterNet model depends on args. + """ + def __init__(self, + in_chans=3, + embed_dim=96, + depths=(1, 2, 8, 2), + mlp_ratio=2.0, + n_div=4, + patch_size=4, + patch_stride=4, + patch_size_t=2, + patch_stride_t=2, + patch_norm=True, + feature_dim=1280, + drop_path_rate=0.1, + layer_scale_init_value=0, + norm_layer='BN', + act_layer='RELU', + class_num=1000, + fork_feat=False, + pretrained=None, + pconv_fw_type='split_cat', + **kwargs): + super().__init__() + if norm_layer == 'BN': + norm_layer = nn.BatchNorm2D + else: + raise NotImplementedError + if act_layer == 'GELU': + act_layer = nn.GELU + elif act_layer == 'RELU': + act_layer = nn.ReLU + else: + raise NotImplementedError + if not fork_feat: + self.class_num = class_num + self.num_stages = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_stages - 1)) + self.mlp_ratio = mlp_ratio + self.depths = depths + self.patch_embed = PatchEmbed( + patch_size=patch_size, + patch_stride=patch_stride, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + + dpr = [ + x.item() + for x in paddle.linspace( + start=0, stop=drop_path_rate, num=sum(depths)) + ] + stages_list = [] + for i_stage in range(self.num_stages): + stage = BasicStage( + dim=int(embed_dim * 2**i_stage), + n_div=n_div, + depth=depths[i_stage], + mlp_ratio=self.mlp_ratio, + drop_path=dpr[sum(depths[:i_stage]):sum(depths[:i_stage + 1])], + layer_scale_init_value=layer_scale_init_value, + norm_layer=norm_layer, + act_layer=act_layer, + pconv_fw_type=pconv_fw_type) + stages_list.append(stage) + if i_stage < self.num_stages - 1: + stages_list.append( + PatchMerging( + patch_size_t=patch_size_t, + patch_stride_t=patch_stride_t, + dim=int(embed_dim * 2**i_stage), + norm_layer=norm_layer)) + self.stages = nn.Sequential(*stages_list) + self.avgpool_pre_head = nn.Sequential( + nn.AdaptiveAvgPool2D(output_size=1), + nn.Conv2D( + in_channels=self.num_features, + out_channels=feature_dim, + kernel_size=1, + bias_attr=False), + act_layer()) + self.head = (nn.Linear( + in_features=feature_dim, out_features=class_num) + if class_num > 0 else nn.Identity()) + self.apply(self.cls_init_weights) + + + def cls_init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.Conv1D, nn.Conv2D)): + trunc_normal_(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, (nn.LayerNorm, nn.GroupNorm)): + zeros_(m.bias) + ones_(m.weight) + + def forward(self, x): + x = self.patch_embed(x) + x = self.stages(x) + x = self.avgpool_pre_head(x) + x = paddle.flatten(x=x, start_axis=1) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def FasterNet_T0(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_T0"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_T0"], use_ssld) + return model + + +def FasterNet_T1(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_T1"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_T1"], use_ssld) + return model + + +def FasterNet_T2(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_T2"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_T2"], use_ssld) + return model + + +def FasterNet_S(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_S"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_S"], use_ssld) + return model + + +def FasterNet_M(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_M"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_M"], use_ssld) + return model + + +def FasterNet_L(pretrained=False, use_ssld=False, **kwargs): + model = FasterNet(*NET_CONFIG["FasterNet_L"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["FasterNet_L"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/foundation_vit.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/foundation_vit.py new file mode 100644 index 000000000..35146960c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/foundation_vit.py @@ -0,0 +1,1261 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# reference: https://arxiv.org/abs/2010.11929 + +from collections.abc import Callable, Iterable + +import numpy as np +import paddle +import paddle.nn as nn +import sys +from paddle.nn.initializer import TruncatedNormal, Constant, Normal, Assign + +from ....utils import logger +from ....utils.save_load import load_dygraph_pretrain +from ..base.theseus_layer import TheseusLayer + +MODEL_URLS = { + "CLIP_vit_base_patch32_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_base_patch32_224.pdparams", + "CLIP_vit_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_base_patch16_224.pdparams", + "CLIP_vit_large_patch14_336": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_large_patch14_336.pdparams", + "CLIP_vit_large_patch14_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CLIP_vit_large_patch14_224.pdparams", + "BEiTv2_vit_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/BEiTv2_vit_base_patch16_224.pdparams", + "BEiTv2_vit_large_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/BEiTv2_vit_large_patch16_224.pdparams", + "CAE_vit_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/CAE_vit_base_patch16_224.pdparams", + 'EVA_vit_giant_patch14': + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/EVA_vit_giant_patch14.pdparams", + "MOCOV3_vit_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MOCOV3_vit_small.pdparams", + "MOCOV3_vit_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MOCOV3_vit_base.pdparams", + "MAE_vit_huge_patch14": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MAE_vit_huge_patch14.pdparams", + "MAE_vit_large_patch16": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MAE_vit_large_patch16.pdparams", + "MAE_vit_base_patch16": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/foundation_models/MAE_vit_base_patch16.pdparams", +} + + +def check_support_fused_op(use_fused_linear): + if use_fused_linear: + if paddle.device.cuda.get_device_capability()[0] >= 8: + return True + else: + logger.warning("The current device don't support Fused OP! Using the general Linear instead.") + return False + + +def resize_pos_embed(pos_embed, + src_shape, + dst_shape, + mode='bicubic', + num_extra_tokens=1): + """Resize pos_embed weights. + + Args: + pos_embed (torch.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (H, W). + dst_shape (tuple): The resolution of downsampled new training + image, in format (H, W). + mode (str): Algorithm used for upsampling. Choose one from 'nearest', + 'linear', 'bilinear', 'bicubic' and 'trilinear'. + Defaults to 'bicubic'. + num_extra_tokens (int): The number of extra tokens, such as cls_token. + Defaults to 1. 
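+            The extra tokens are not interpolated; they are re-attached in
+            front of the resized patch grid.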
+ + Returns: + torch.Tensor: The resized pos_embed of shape [1, L_new, C] + """ + if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1]: + return pos_embed + assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]' + _, L, C = pos_embed.shape + src_h, src_w = src_shape + assert L == src_h * src_w + num_extra_tokens, \ + f"The length of `pos_embed` ({L}) doesn't match the expected " \ + f'shape ({src_h}*{src_w}+{num_extra_tokens}). Please check the' \ + '`img_size` argument.' + extra_tokens = pos_embed[:, :num_extra_tokens] + + src_weight = pos_embed[:, num_extra_tokens:] + src_weight = src_weight.reshape([-1, src_h, src_w, C]).transpose( + [0, 3, 1, 2]) + + # The cubic interpolate algorithm only accepts float32 + dst_weight = paddle.nn.functional.interpolate( + paddle.cast(src_weight, paddle.float32), + size=dst_shape, + align_corners=False, + mode=mode) + dst_weight = paddle.flatten(dst_weight, 2).transpose([0, 2, 1]) + dst_weight = paddle.cast(dst_weight, src_weight.dtype) + + return paddle.concat((extra_tokens, dst_weight), axis=1) + + +def pading_for_not_divisible(pixel_values, + height, + width, + patch_size, + format="BCHW", + function="split"): + if isinstance(patch_size, int): + patch_size = (patch_size, patch_size) + if height % patch_size[0] == 0 and width % patch_size[1] == 0: + return pixel_values, None + if function == "split": + pading_width = patch_size[1] - width % patch_size[1] + pading_height = patch_size[0] - height % patch_size[0] + elif function == "merge": + pading_width = width % 2 + pading_height = height % 2 + if format == "BCHW": + pad_index = (0, 0, 0, 0, 0, pading_height, 0, pading_width) + elif format == "BHWC": + pad_index = (0, 0, 0, pading_height, 0, pading_width, 0, 0) + else: + assert ("vaild format") + + return paddle.nn.functional.pad(pixel_values, pad_index), pad_index + + +__all__ = list(MODEL_URLS.keys()) + +_model_size = None +_model_diff = None + +_CLIP_diff = { + 'add_layer_norm_before_encoder': [ + 'vit_base_patch32_224', 'vit_base_patch16_224', + 'vit_large_patch14_336', 'vit_large_patch14_224' + ], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': ['vit_base_patch16_224'], + }, + 'remove_cls_token_in_forward': ['vit_base_patch16_224'], +} + + +_MOCOV3_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_CoCa_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_BEiTv2_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': + ['vit_base_patch16_224', 'vit_large_patch16_224'], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': + ['vit_base_patch16_224', 
'vit_large_patch16_224'], + 'remove_cls_token': [], + 'remove_abs_pos_emb': ['vit_base_patch16_224', 'vit_large_patch16_224'], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_CAE_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': ['vit_base_patch16_224'], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': ['vit_base_patch16_224'], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': [], # 3 x 197 x 786 + 'return_all_tokens': [], # 3 x 197 x 1000 + 'return_patch_tokens': [], # 3 x 196 x 1000 + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_EVA_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': ['vit_huge_patch14'], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +_MAE_diff = { + 'add_layer_norm_before_encoder': [], + 'add_relative_position_bias_in_msa': [], + 'add_shared_rel_pos_bias': [], + 'add_mul_gamma_to_msa_mlp': [], + 'remove_cls_token': [], + 'remove_abs_pos_emb': [], + 'replace_mlp_GELU': [], + 'head': { + 'fc_norm': ['vit_huge_patch14'], + 'return_all_tokens': [], + 'return_patch_tokens': [], + 'return_tokens_mean': [], + }, + 'remove_cls_token_in_forward': [], +} + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(TheseusLayer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
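+
+    Wraps the functional drop_path above so the drop probability is stored
+    on the layer and only applied while the layer is in training mode.
+    A minimal usage sketch (shapes chosen purely for illustration):
+
+        drop = DropPath(drop_prob=0.1)
+        y = drop(paddle.ones([4, 197, 768]))
+        # while training, each sample is zeroed with probability 0.1 and the
+        # survivors are rescaled by 1 / 0.9; in eval mode y equals the input.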
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class QuickGELU(TheseusLayer): + def forward(self, x): + return x * nn.functional.sigmoid(1.702 * x) + + +class Mlp(TheseusLayer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + Linear=nn.Linear): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = act_layer() if _model_size not in _model_diff[ + 'replace_mlp_GELU'] else QuickGELU() + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(TheseusLayer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + model_name=None, + window_size=None, + use_fused_attn=False, + Linear=nn.Linear): + super().__init__() + self._model_name = model_name + + if _model_size in _model_diff['add_relative_position_bias_in_msa']: + assert isinstance( + window_size, Iterable + ), f'window_size must be iterable, should not be {type(window_size)}' + self.window_size = window_size + self._register_relative_position_index( + window_size=window_size, + num_heads=num_heads, ) + + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.use_fused_attn = use_fused_attn + # TODO: support mask + if use_fused_attn: + if hasattr(self, 'relative_position_bias_table') or (_model_size in _model_diff['add_shared_rel_pos_bias'] and rel_pos_bias is not None): + logger.warning("The fused attn don't support `relative_position` yet, so fused attn will not be used.") + self.use_fused_attn = False + + def _register_relative_position_index( + self, + window_size, + num_heads, ): + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + 
relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + def forward(self, x, rel_pos_bias=None): + # B= x.shape[0] + N, C = x.shape[1], x.shape[2] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // self.num_heads)) + + if not self.use_fused_attn: + qkv = qkv.transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + if hasattr(self, 'relative_position_bias_table'): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + [2, 0, 1]) # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if _model_size in _model_diff[ + 'add_shared_rel_pos_bias'] and rel_pos_bias is not None: + attn = attn + rel_pos_bias + + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn).matmul(v) + attn = attn.transpose((0, 2, 1, 3)) + else: + qkv = qkv.transpose((2, 0, 1, 3, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + # TODO: support mask + attn = paddle.nn.functional.scaled_dot_product_attention(q, k, v, dropout_p=self.attn_drop.p if self.training else 0.) + + x = attn.reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(TheseusLayer): + def __init__(self, + dim, + num_heads, + model_name, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + init_values=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5, + window_size=None, + use_fused_attn=False, + use_fused_linear=False): + super().__init__() + global _model_size + global _model_diff + self._model_name = model_name + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + Linear = paddle.incubate.nn.FusedLinear if use_fused_linear else nn.Linear + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + model_name=self._model_name, + window_size=window_size, + use_fused_attn=use_fused_attn, + Linear=Linear) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + + if _model_size in _model_diff['add_mul_gamma_to_msa_mlp']: + self.gamma_1 = self.create_parameter( + [dim], + default_initializer=nn.initializer.Constant(value=init_values)) + self.gamma_2 = self.create_parameter( + [dim], + default_initializer=nn.initializer.Constant(value=init_values)) + else: + self.gamma_1 = None + self.gamma_2 = None + + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + Linear=Linear) + + def forward(self, x, rel_pos_bias=None): + if self.gamma_1 is not None: + x = x + self.drop_path(self.gamma_1 * self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + atten_result = self.drop_path( + self.attn( + self.norm1(x), rel_pos_bias=rel_pos_bias)) + x = x + atten_result + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(TheseusLayer): + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * ( + 2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = self.create_parameter( + [self.num_relative_distance, num_heads], + default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(window_size[0]) + coords_w = paddle.arange(window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = \ + paddle.zeros((window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum( + -1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = self.num_relative_distance - 3 + relative_position_index[0:, 0] = self.num_relative_distance - 2 + relative_position_index[0, 0] = self.num_relative_distance - 1 + + self.register_buffer("relative_position_index", + relative_position_index) + + # trunc_normal_(self.relative_position_bias_table, std=.02) + + def forward(self): + relative_position_bias = \ + self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.transpose([2, 0, 1]) # nH, Wh*Ww, Wh*Ww + + +class PatchEmbed(TheseusLayer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + conv_bias=False): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * \ + (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = 
patch_size + self.num_patches = num_patches + if conv_bias: + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + else: + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_size, + bias_attr=False) + + def forward(self, x): + B, C, H, W = x.shape + x, _ = pading_for_not_divisible(x, H, W, patch_size=self.patch_size) + + x = self.proj(x) + _, _, H, W = x.shape + + x = x.flatten(2).transpose((0, 2, 1)) + return x, (H, W) + + +class Head(TheseusLayer): + def __init__(self, embed_dim, class_num, norm_layer, model_size, setting): + super().__init__() + self.model_size = model_size + self.setting = setting + + self.fc_norm = eval(norm_layer)( + embed_dim, + epsilon=1e-5) if model_size in setting['fc_norm'] else None + self.return_all_tokens = model_size in setting['return_all_tokens'] + self.return_patch_tokens = model_size in setting['return_patch_tokens'] + self.return_tokens_mean = model_size in setting['return_tokens_mean'] + + self.fc_head = nn.Linear(embed_dim, + class_num) if class_num > 0 else Identity() + + def forward(self, x): + if self.fc_norm is not None: + if self.return_all_tokens: + x = self.fc_norm(x) + else: + t = x[:, 1:] + if self.return_patch_tokens: + x = self.fc_norm(t) + else: + x = self.fc_norm(t.mean(1)) + elif isinstance(self.fc_head, Identity): + if self.return_all_tokens: + x = x + elif self.return_patch_tokens: + x = x[:, 1:] + elif self.return_tokens_mean: + x = x.mean(1) + else: + x = x[:, 0] + else: + x = x + return self.fc_head(x) + + +class VisionTransformer(TheseusLayer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + model_name, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=768, + output_dim=512, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + image_project=False, + conv_bias=False, + feature_frame=False, + hugging_face_framework=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + head_init_scale=0.001, + **kwargs): + super().__init__() + global _model_diff + global _model_size + _model_split = model_name.split('_') + self.model_name = _model_split[0] + self.feature_frame = feature_frame + self.model_size = '_'.join(_model_split[1:]) + _model_size = self.model_size + _model_diff = eval(f'_{self.model_name}_diff') + + self.class_num = class_num + self.return_embed = kwargs.get('return_embed', False) + self.return_mean_embed = kwargs.get('return_mean_embed', False) and self.return_embed + self.num_features = self.embed_dim = embed_dim + use_fused_attn = check_support_fused_op(kwargs.get('use_fused_attn', False)) + use_fused_linear = check_support_fused_op(kwargs.get('use_fused_linear', False)) + _img_size = to_2tuple(img_size) + _patch_size = to_2tuple(patch_size) + self.window_size = (_img_size[0] // _patch_size[0], + _img_size[1] // _patch_size[1]) + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + conv_bias=conv_bias) + num_patches = self.patch_embed.num_patches + + if _model_size in _model_diff['add_shared_rel_pos_bias']: + self.rel_pos_bias = RelativePositionBias( + window_size=self.window_size, num_heads=num_heads) + + #self.ln_pre = nn.LayerNorm(embed_dim) if _model_size in _model_diff[ + # 'add_layer_norm_before_encoder'] else nn.Identity() + + if _model_size in _model_diff['remove_cls_token'] or self.feature_frame: + self.pos_embed = 
self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + self.cls_token = None + else: + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), + default_initializer=zeros_) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.pos_drop = nn.Dropout(p=drop_rate) + # for LaClip + if image_project: + image_projection = self.create_parameter( + shape=(img_size, embed_dim), + default_initializer=Assign( + paddle.empty((img_size, embed_dim)))) + self.add_parameter("image_projection", image_projection) + else: + self.image_projection = None + self.hugging_face_framework = hugging_face_framework + #for path size hugging face plan + if hugging_face_framework: + self.ln_pre = nn.LayerNorm(embed_dim) + self.add_parameter("pos_embed", self.pos_embed) + else: + self.ln_pre = nn.Identity() if _model_size not in _model_diff[ + 'add_layer_norm_before_encoder'] else nn.LayerNorm(embed_dim) + if _model_size in _model_diff['remove_abs_pos_emb']: + self.pos_embed = None + else: + self.add_parameter("pos_embed", self.pos_embed) + + #proj + proj = self.create_parameter( + shape=(embed_dim, ), + default_initializer=Assign((embed_dim**-0.5) * paddle.randn(( + (embed_dim, output_dim))))) + self.add_parameter("proj", proj) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + model_name=self.model_name, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon, + window_size=self.window_size, + use_fused_attn=use_fused_attn, + use_fused_linear=use_fused_linear) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + self.head = Identity() if self.return_embed else Head( + embed_dim, class_num, norm_layer, self.model_size, + _model_diff['head']) + + if self.pos_embed is not None: + trunc_normal_(self.pos_embed) + if not _model_size in _model_diff['remove_cls_token'] and self.feature_frame == False: + trunc_normal_(self.cls_token) + + self.apply(self._init_weights) + + if feature_frame: + self.feature = nn.Sequential( + nn.Linear(embed_dim * self.patch_embed.num_patches, embed_dim,bias_attr=False), + nn.BatchNorm1D(embed_dim, epsilon=2e-5), + nn.Linear(embed_dim, output_dim, bias_attr=False), + nn.BatchNorm1D(output_dim, epsilon=2e-5)) + self.pos_drop = Identity() + self.cls_token = None + self.image_projection = Identity() + self.proj = None + self.ln_pre = Identity() + + if head_init_scale != 1: + if not self.return_embed and class_num > 0: + self.head.fc_head.weight.set_value( + self.head.fc_head.weight * + paddle.to_tensor(head_init_scale)) + self.head.fc_head.bias.set_value( + self.head.fc_head.bias * paddle.to_tensor(head_init_scale)) + else: + logger.warning( + "Because the head or head.fc_head of ViT is Identity() class, the argument head_init_scale is invalid." 
+ ) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def get_num_layers(self): + return len(self.blocks) + + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def forward_features(self, x): + B = x.shape[0] + x, output_dimensions = self.patch_embed(x) + if not _model_size in _model_diff['remove_cls_token'] and (self.feature_frame==False): + cls_tokens = self.cls_token.expand((B, -1, -1)) + x = paddle.concat((cls_tokens, x), axis=1) + + if self.pos_embed is not None: + x = x + resize_pos_embed(self.pos_embed, self.window_size, + output_dimensions) + + x = self.ln_pre(x) + x = self.pos_drop(x) + rel_pos_bias = self.rel_pos_bias() if hasattr(self, + 'rel_pos_bias') else None + for blk in self.blocks: + x = blk(x, rel_pos_bias=rel_pos_bias) + + if _model_size in _model_diff['remove_cls_token_in_forward']: + x = x[:, 1:, :] + if self.hugging_face_framework or self.return_embed == False: + pooled, token = x[:, 0], x[:, 1:] + else: + pooled = x + x = self.norm(pooled) + return x + + def forward(self, x): + x = self.forward_features(x) + + if self.feature_frame: + B, L, C = x.shape + x = paddle.reshape(x,[B, -1]) + x = self.feature(x) + + x = self.head(x) + + if self.proj is not None and isinstance(self.head,Identity): + x = x @self.proj + if self.return_mean_embed: + x = x.mean(1) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def CLIP_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CLIP_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def React_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "React_vit_base_patch16_224" + model = VisionTransformer( + model_name=model_name.replace("React","CLIP"), + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + return model + + +def React_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "React_vit_base_patch32_224" + model = VisionTransformer( + model_name=model_name.replace("React","CLIP"), + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + return model + + +def LaCLIP_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "LaCLIP_vit_base_patch32_224" + model = VisionTransformer( + model_name=model_name.replace("LaCLIP","CLIP"), + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + + return model + + +def LaCLIP_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "LaCLIP_vit_base_patch16_224" + model = VisionTransformer( + model_name=model_name.replace("LaCLIP","CLIP"), + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + hugging_face_framework=True, + epsilon=1e-5, + **kwargs, ) + + return model + + +def Unicom_vit_base_patch32_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "Unicom_vit_base_patch32_224" + model = VisionTransformer( + model_name=model_name.replace("Unicom","CLIP"), + img_size=224, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + conv_bias=True, + feature_frame=True, + hugging_face_framework=False, + image_project=False, + epsilon=1e-5, + **kwargs, ) + + return model + + +def Unicom_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = "Unicom_vit_base_patch16_224" + model = VisionTransformer( + model_name=model_name.replace("Unicom","CLIP"), + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + hugging_face_framework=False, + image_project=False, + feature_frame=True, + conv_bias=True, + epsilon=1e-5, + **kwargs, ) + + return model + + +def CLIP_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + 
embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CLIP_vit_large_patch14_336(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=336, + patch_size=14, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CLIP_vit_large_patch14_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=14, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-5, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def BEiTv2_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def BEiTv2_vit_large_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MOCOV3_vit_small(pretrained=False, use_ssld=False, **kwargs): + """ + vit small in mocov3 + """ + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=384, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MOCOV3_vit_base(pretrained=False, use_ssld=False, **kwargs): + """ + vit base in mocov3 + """ + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MAE_vit_base_patch16(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MAE_vit_large_patch16(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def MAE_vit_huge_patch14(pretrained=False, use_ssld=False, **kwargs): + 
model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=14, + embed_dim=1280, + depth=32, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def EVA_vit_giant_patch14(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + patch_size=14, + embed_dim=1408, + depth=40, + num_heads=16, + init_values=None, + mlp_ratio=4.3637, + qkv_bias=True, + class_num=0, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model + + +def CAE_vit_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model_name = sys._getframe().f_code.co_name + model = VisionTransformer( + model_name=model_name, + img_size=224, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs, ) + _load_pretrained( + pretrained, model, MODEL_URLS[model_name], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/ghostnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/ghostnet.py new file mode 100644 index 000000000..8a3960827 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/ghostnet.py @@ -0,0 +1,364 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
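+
+# A minimal usage sketch for the GhostNet builders defined below (GhostNet_x0_5,
+# GhostNet_x1_0, GhostNet_x1_3). It assumes paddle is installed and that this
+# module is loaded as part of the ppcls package (the file relies on a package-
+# relative import of load_dygraph_pretrain):
+#
+#     import paddle
+#     model = GhostNet_x1_0(pretrained=False, class_num=1000)
+#     model.eval()
+#     x = paddle.randn([1, 3, 224, 224])   # dummy NCHW ImageNet-sized batch
+#     logits = model(x)                    # shape [1, 1000]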
+ +# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/ghostnet_pytorch +# reference: https://arxiv.org/abs/1911.11907 + +import math +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, AdaptiveAvgPool2D, Linear +from paddle.regularizer import L2Decay +from paddle.nn.initializer import Uniform, KaimingNormal + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "GhostNet_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x0_5_pretrained.pdparams", + "GhostNet_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x1_0_pretrained.pdparams", + "GhostNet_x1_3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GhostNet_x1_3_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + groups=1, + act="relu", + name=None): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + + self._batch_norm = BatchNorm( + num_channels=out_channels, + act=act, + param_attr=ParamAttr( + name=bn_name + "_scale", regularizer=L2Decay(0.0)), + bias_attr=ParamAttr( + name=bn_name + "_offset", regularizer=L2Decay(0.0)), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SEBlock(nn.Layer): + def __init__(self, num_channels, reduction_ratio=4, name=None): + super(SEBlock, self).__init__() + self.pool2d_gap = AdaptiveAvgPool2D(1) + self._num_channels = num_channels + stdv = 1.0 / math.sqrt(num_channels * 1.0) + med_ch = num_channels // reduction_ratio + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_1_weights"), + bias_attr=ParamAttr(name=name + "_1_offset")) + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_channels, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_2_weights"), + bias_attr=ParamAttr(name=name + "_2_offset")) + + def forward(self, inputs): + pool = self.pool2d_gap(inputs) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = paddle.clip(x=excitation, min=0, max=1) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(inputs, excitation) + return out + + +class GhostModule(nn.Layer): + def __init__(self, + in_channels, + output_channels, + kernel_size=1, + ratio=2, + dw_size=3, + stride=1, + relu=True, + name=None): + super(GhostModule, self).__init__() + init_channels = int(math.ceil(output_channels / ratio)) + new_channels = int(init_channels * (ratio - 1)) + self.primary_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=init_channels, + kernel_size=kernel_size, + stride=stride, + groups=1, + act="relu" if relu else None, + name=name + "_primary_conv") + self.cheap_operation = ConvBNLayer( + in_channels=init_channels, + out_channels=new_channels, + kernel_size=dw_size, + stride=1, + groups=init_channels, 
+ act="relu" if relu else None, + name=name + "_cheap_operation") + + def forward(self, inputs): + x = self.primary_conv(inputs) + y = self.cheap_operation(x) + out = paddle.concat([x, y], axis=1) + return out + + +class GhostBottleneck(nn.Layer): + def __init__(self, + in_channels, + hidden_dim, + output_channels, + kernel_size, + stride, + use_se, + name=None): + super(GhostBottleneck, self).__init__() + self._stride = stride + self._use_se = use_se + self._num_channels = in_channels + self._output_channels = output_channels + self.ghost_module_1 = GhostModule( + in_channels=in_channels, + output_channels=hidden_dim, + kernel_size=1, + stride=1, + relu=True, + name=name + "_ghost_module_1") + if stride == 2: + self.depthwise_conv = ConvBNLayer( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=kernel_size, + stride=stride, + groups=hidden_dim, + act=None, + name=name + + "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + if use_se: + self.se_block = SEBlock(num_channels=hidden_dim, name=name + "_se") + self.ghost_module_2 = GhostModule( + in_channels=hidden_dim, + output_channels=output_channels, + kernel_size=1, + relu=False, + name=name + "_ghost_module_2") + if stride != 1 or in_channels != output_channels: + self.shortcut_depthwise = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + stride=stride, + groups=in_channels, + act=None, + name=name + + "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. + ) + self.shortcut_conv = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act=None, + name=name + "_shortcut_conv") + + def forward(self, inputs): + x = self.ghost_module_1(inputs) + if self._stride == 2: + x = self.depthwise_conv(x) + if self._use_se: + x = self.se_block(x) + x = self.ghost_module_2(x) + if self._stride == 1 and self._num_channels == self._output_channels: + shortcut = inputs + else: + shortcut = self.shortcut_depthwise(inputs) + shortcut = self.shortcut_conv(shortcut) + return paddle.add(x=x, y=shortcut) + + +class GhostNet(nn.Layer): + def __init__(self, scale, class_num=1000): + super(GhostNet, self).__init__() + self.cfgs = [ + # k, t, c, SE, s + [3, 16, 16, 0, 1], + [3, 48, 24, 0, 2], + [3, 72, 24, 0, 1], + [5, 72, 40, 1, 2], + [5, 120, 40, 1, 1], + [3, 240, 80, 0, 2], + [3, 200, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 480, 112, 1, 1], + [3, 672, 112, 1, 1], + [5, 672, 160, 1, 2], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1], + [5, 960, 160, 0, 1], + [5, 960, 160, 1, 1] + ] + self.scale = scale + output_channels = int(self._make_divisible(16 * self.scale, 4)) + self.conv1 = ConvBNLayer( + in_channels=3, + out_channels=output_channels, + kernel_size=3, + stride=2, + groups=1, + act="relu", + name="conv1") + # build inverted residual blocks + idx = 0 + self.ghost_bottleneck_list = [] + for k, exp_size, c, use_se, s in self.cfgs: + in_channels = output_channels + output_channels = int(self._make_divisible(c * self.scale, 4)) + hidden_dim = int(self._make_divisible(exp_size * self.scale, 4)) + ghost_bottleneck = self.add_sublayer( + name="_ghostbottleneck_" + str(idx), + sublayer=GhostBottleneck( + in_channels=in_channels, + hidden_dim=hidden_dim, + output_channels=output_channels, + kernel_size=k, + stride=s, + use_se=use_se, + name="_ghostbottleneck_" + str(idx))) + self.ghost_bottleneck_list.append(ghost_bottleneck) + idx += 1 + # build last 
several layers + in_channels = output_channels + output_channels = int(self._make_divisible(exp_size * self.scale, 4)) + self.conv_last = ConvBNLayer( + in_channels=in_channels, + out_channels=output_channels, + kernel_size=1, + stride=1, + groups=1, + act="relu", + name="conv_last") + self.pool2d_gap = AdaptiveAvgPool2D(1) + in_channels = output_channels + self._fc0_output_channels = 1280 + self.fc_0 = ConvBNLayer( + in_channels=in_channels, + out_channels=self._fc0_output_channels, + kernel_size=1, + stride=1, + act="relu", + name="fc_0") + self.dropout = nn.Dropout(p=0.2) + stdv = 1.0 / math.sqrt(self._fc0_output_channels * 1.0) + self.fc_1 = Linear( + self._fc0_output_channels, + class_num, + weight_attr=ParamAttr( + name="fc_1_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_1_offset")) + + def forward(self, inputs): + x = self.conv1(inputs) + for ghost_bottleneck in self.ghost_bottleneck_list: + x = ghost_bottleneck(x) + x = self.conv_last(x) + x = self.pool2d_gap(x) + x = self.fc_0(x) + x = self.dropout(x) + x = paddle.reshape(x, shape=[-1, self._fc0_output_channels]) + x = self.fc_1(x) + return x + + def _make_divisible(self, v, divisor, min_value=None): + """ + This function is taken from the original tf repo. + It ensures that all layers have a channel number that is divisible by 8 + It can be seen here: + https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py + """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def GhostNet_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x0_5"], use_ssld=use_ssld) + return model + + +def GhostNet_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x1_0"], use_ssld=use_ssld) + return model + + +def GhostNet_x1_3(pretrained=False, use_ssld=False, **kwargs): + model = GhostNet(scale=1.3, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GhostNet_x1_3"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/googlenet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/googlenet.py new file mode 100644 index 000000000..fcd52c923 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/googlenet.py @@ -0,0 +1,365 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1409.4842 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "GoogLeNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/GoogLeNet_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def xavier(channels, filter_size, name): + stdv = (3.0 / (filter_size**2 * channels))**0.5 + param_attr = ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_weights") + return param_attr + + +class ConvLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + return y + + +class Inception(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter1, + filter3R, + filter3, + filter5R, + filter5, + proj, + name=None, + data_format="NCHW"): + super(Inception, self).__init__() + self.data_format = data_format + + self._conv1 = ConvLayer( + input_channels, + filter1, + 1, + name="inception_" + name + "_1x1", + data_format=data_format) + self._conv3r = ConvLayer( + input_channels, + filter3R, + 1, + name="inception_" + name + "_3x3_reduce", + data_format=data_format) + self._conv3 = ConvLayer( + filter3R, + filter3, + 3, + name="inception_" + name + "_3x3", + data_format=data_format) + self._conv5r = ConvLayer( + input_channels, + filter5R, + 1, + name="inception_" + name + "_5x5_reduce", + data_format=data_format) + self._conv5 = ConvLayer( + filter5R, + filter5, + 5, + name="inception_" + name + "_5x5", + data_format=data_format) + self._pool = MaxPool2D( + kernel_size=3, stride=1, padding=1, data_format=data_format) + + self._convprj = ConvLayer( + input_channels, + proj, + 1, + name="inception_" + name + "_3x3_proj", + data_format=data_format) + + def forward(self, inputs): + conv1 = self._conv1(inputs) + + conv3r = self._conv3r(inputs) + conv3 = self._conv3(conv3r) + + conv5r = self._conv5r(inputs) + conv5 = self._conv5(conv5r) + + pool = self._pool(inputs) + convprj = self._convprj(pool) + + if self.data_format == "NHWC": + cat = paddle.concat([conv1, conv3, conv5, convprj], axis=3) + else: + cat = paddle.concat([conv1, conv3, conv5, convprj], axis=1) + cat = F.relu(cat) + return cat + + +class GoogLeNetDY(nn.Layer): + def __init__(self, class_num=1000, data_format="NCHW"): + super(GoogLeNetDY, self).__init__() + self.data_format = data_format + self._conv = ConvLayer( + 3, 64, 7, 2, name="conv1", data_format=data_format) + self._pool = MaxPool2D( + 
kernel_size=3, stride=2, data_format=data_format) + self._conv_1 = ConvLayer( + 64, 64, 1, name="conv2_1x1", data_format=data_format) + self._conv_2 = ConvLayer( + 64, 192, 3, name="conv2_3x3", data_format=data_format) + + self._ince3a = Inception( + 192, + 192, + 64, + 96, + 128, + 16, + 32, + 32, + name="ince3a", + data_format=data_format) + self._ince3b = Inception( + 256, + 256, + 128, + 128, + 192, + 32, + 96, + 64, + name="ince3b", + data_format=data_format) + + self._ince4a = Inception( + 480, + 480, + 192, + 96, + 208, + 16, + 48, + 64, + name="ince4a", + data_format=data_format) + self._ince4b = Inception( + 512, + 512, + 160, + 112, + 224, + 24, + 64, + 64, + name="ince4b", + data_format=data_format) + self._ince4c = Inception( + 512, + 512, + 128, + 128, + 256, + 24, + 64, + 64, + name="ince4c", + data_format=data_format) + self._ince4d = Inception( + 512, + 512, + 112, + 144, + 288, + 32, + 64, + 64, + name="ince4d", + data_format=data_format) + self._ince4e = Inception( + 528, + 528, + 256, + 160, + 320, + 32, + 128, + 128, + name="ince4e", + data_format=data_format) + + self._ince5a = Inception( + 832, + 832, + 256, + 160, + 320, + 32, + 128, + 128, + name="ince5a", + data_format=data_format) + self._ince5b = Inception( + 832, + 832, + 384, + 192, + 384, + 48, + 128, + 128, + name="ince5b", + data_format=data_format) + + self._pool_5 = AdaptiveAvgPool2D(1, data_format=data_format) + + self._drop = Dropout(p=0.4, mode="downscale_in_infer") + self.flatten = nn.Flatten() + self._fc_out = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out"), + bias_attr=ParamAttr(name="out_offset")) + self._pool_o1 = AvgPool2D( + kernel_size=5, stride=3, data_format=data_format) + self._conv_o1 = ConvLayer( + 512, 128, 1, name="conv_o1", data_format=data_format) + self._fc_o1 = Linear( + 1152, + 1024, + weight_attr=xavier(2048, 1, "fc_o1"), + bias_attr=ParamAttr(name="fc_o1_offset")) + self._drop_o1 = Dropout(p=0.7, mode="downscale_in_infer") + self._out1 = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out1"), + bias_attr=ParamAttr(name="out1_offset")) + self._pool_o2 = AvgPool2D( + kernel_size=5, stride=3, data_format=data_format) + self._conv_o2 = ConvLayer( + 528, 128, 1, name="conv_o2", data_format=data_format) + self._fc_o2 = Linear( + 1152, + 1024, + weight_attr=xavier(2048, 1, "fc_o2"), + bias_attr=ParamAttr(name="fc_o2_offset")) + self._drop_o2 = Dropout(p=0.7, mode="downscale_in_infer") + self._out2 = Linear( + 1024, + class_num, + weight_attr=xavier(1024, 1, "out2"), + bias_attr=ParamAttr(name="out2_offset")) + + def forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + x = self._conv(inputs) + x = self._pool(x) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._pool(x) + + x = self._ince3a(x) + x = self._ince3b(x) + x = self._pool(x) + + ince4a = self._ince4a(x) + x = self._ince4b(ince4a) + x = self._ince4c(x) + ince4d = self._ince4d(x) + x = self._ince4e(ince4d) + x = self._pool(x) + + x = self._ince5a(x) + ince5b = self._ince5b(x) + + x = self._pool_5(ince5b) + x = self._drop(x) + x = self.flatten(x) + out = self._fc_out(x) + + x = self._pool_o1(ince4a) + x = self._conv_o1(x) + x = self.flatten(x) + x = self._fc_o1(x) + x = F.relu(x) + x = self._drop_o1(x) + out1 = self._out1(x) + + x = self._pool_o2(ince4d) + x = self._conv_o2(x) + x = self.flatten(x) + x = self._fc_o2(x) + x = self._drop_o2(x) + out2 = self._out2(x) + return [out, out1, out2] + + +def 
_load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def GoogLeNet(pretrained=False, use_ssld=False, **kwargs): + model = GoogLeNetDY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["GoogLeNet"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/hardnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/hardnet.py new file mode 100644 index 000000000..fa3399e3e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/hardnet.py @@ -0,0 +1,294 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/PingoLH/Pytorch-HarDNet +# reference: https://arxiv.org/abs/1909.00948 + +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + 'HarDNet39_ds': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet39_ds_pretrained.pdparams', + 'HarDNet68_ds': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet68_ds_pretrained.pdparams', + 'HarDNet68': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet68_pretrained.pdparams', + 'HarDNet85': + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/HarDNet85_pretrained.pdparams' +} + +__all__ = MODEL_URLS.keys() + + +def ConvLayer(in_channels, + out_channels, + kernel_size=3, + stride=1, + bias_attr=False): + layer = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=1, + bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), + ('relu', nn.ReLU6())) + return layer + + +def DWConvLayer(in_channels, + out_channels, + kernel_size=3, + stride=1, + bias_attr=False): + layer = nn.Sequential( + ('dwconv', nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=1, + groups=out_channels, + bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) + return layer + + +def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): + layer = nn.Sequential( + ('layer1', ConvLayer( + in_channels, out_channels, kernel_size=kernel_size)), + ('layer2', DWConvLayer( + out_channels, out_channels, stride=stride))) + return layer + + +class HarDBlock(nn.Layer): + def __init__(self, + in_channels, + growth_rate, + grmul, + n_layers, + keepBase=False, + residual_out=False, + dwconv=False): + super().__init__() + self.keepBase = keepBase + self.links = [] + layers_ = [] + self.out_channels = 0 # if upsample else in_channels + for i in range(n_layers): + 
outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, + grmul) + self.links.append(link) + if dwconv: + layers_.append(CombConvLayer(inch, outch)) + else: + layers_.append(ConvLayer(inch, outch)) + + if (i % 2 == 0) or (i == n_layers - 1): + self.out_channels += outch + # print("Blk out =",self.out_channels) + self.layers = nn.LayerList(layers_) + + def get_link(self, layer, base_ch, growth_rate, grmul): + if layer == 0: + return base_ch, 0, [] + out_channels = growth_rate + + link = [] + for i in range(10): + dv = 2**i + if layer % dv == 0: + k = layer - dv + link.append(k) + if i > 0: + out_channels *= grmul + + out_channels = int(int(out_channels + 1) / 2) * 2 + in_channels = 0 + + for i in link: + ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) + in_channels += ch + + return out_channels, in_channels, link + + def forward(self, x): + layers_ = [x] + + for layer in range(len(self.layers)): + link = self.links[layer] + tin = [] + for i in link: + tin.append(layers_[i]) + if len(tin) > 1: + x = paddle.concat(tin, 1) + else: + x = tin[0] + out = self.layers[layer](x) + layers_.append(out) + + t = len(layers_) + out_ = [] + for i in range(t): + if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): + out_.append(layers_[i]) + out = paddle.concat(out_, 1) + + return out + + +class HarDNet(nn.Layer): + def __init__(self, + depth_wise=False, + arch=85, + class_num=1000, + with_pool=True): + super().__init__() + first_ch = [32, 64] + second_kernel = 3 + max_pool = True + grmul = 1.7 + drop_rate = 0.1 + + # HarDNet68 + ch_list = [128, 256, 320, 640, 1024] + gr = [14, 16, 20, 40, 160] + n_layers = [8, 16, 16, 16, 4] + downSamp = [1, 0, 1, 1, 0] + + if arch == 85: + # HarDNet85 + first_ch = [48, 96] + ch_list = [192, 256, 320, 480, 720, 1280] + gr = [24, 24, 28, 36, 48, 256] + n_layers = [8, 16, 16, 16, 16, 4] + downSamp = [1, 0, 1, 0, 1, 0] + drop_rate = 0.2 + + elif arch == 39: + # HarDNet39 + first_ch = [24, 48] + ch_list = [96, 320, 640, 1024] + grmul = 1.6 + gr = [16, 20, 64, 160] + n_layers = [4, 16, 8, 4] + downSamp = [1, 1, 1, 0] + + if depth_wise: + second_kernel = 1 + max_pool = False + drop_rate = 0.05 + + blks = len(n_layers) + self.base = nn.LayerList([]) + + # First Layer: Standard Conv3x3, Stride=2 + self.base.append( + ConvLayer( + in_channels=3, + out_channels=first_ch[0], + kernel_size=3, + stride=2, + bias_attr=False)) + + # Second Layer + self.base.append( + ConvLayer( + first_ch[0], first_ch[1], kernel_size=second_kernel)) + + # Maxpooling or DWConv3x3 downsampling + if max_pool: + self.base.append(nn.MaxPool2D(kernel_size=3, stride=2, padding=1)) + else: + self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) + + # Build all HarDNet blocks + ch = first_ch[1] + for i in range(blks): + blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) + ch = blk.out_channels + self.base.append(blk) + + if i == blks - 1 and arch == 85: + self.base.append(nn.Dropout(0.1)) + + self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) + ch = ch_list[i] + if downSamp[i] == 1: + if max_pool: + self.base.append(nn.MaxPool2D(kernel_size=2, stride=2)) + else: + self.base.append(DWConvLayer(ch, ch, stride=2)) + + ch = ch_list[blks - 1] + + layers = [] + + if with_pool: + layers.append(nn.AdaptiveAvgPool2D((1, 1))) + + if class_num > 0: + layers.append(nn.Flatten()) + layers.append(nn.Dropout(drop_rate)) + layers.append(nn.Linear(ch, class_num)) + + self.base.append(nn.Sequential(*layers)) + + def forward(self, x): + for layer in self.base: + x = 
layer(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def HarDNet39_ds(pretrained=False, **kwargs): + model = HarDNet(arch=39, depth_wise=True, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet39_ds"]) + return model + + +def HarDNet68_ds(pretrained=False, **kwargs): + model = HarDNet(arch=68, depth_wise=True, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet68_ds"]) + return model + + +def HarDNet68(pretrained=False, **kwargs): + model = HarDNet(arch=68, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet68"]) + return model + + +def HarDNet85(pretrained=False, **kwargs): + model = HarDNet(arch=85, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["HarDNet85"]) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/inception_v4.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/inception_v4.py new file mode 100644 index 000000000..476330004 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/inception_v4.py @@ -0,0 +1,479 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
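+
+# A minimal usage sketch for the InceptionV4 builder defined below. It assumes
+# paddle is installed and this module is loaded as part of the ppcls package.
+# `pretrained` may be a bool (True downloads weights from MODEL_URLS) or a string
+# path to a local .pdparams file; the path shown is only a placeholder:
+#
+#     import paddle
+#     model = InceptionV4(pretrained=False, class_num=1000)
+#     model.eval()
+#     x = paddle.randn([1, 3, 299, 299])   # InceptionV4 is typically run at 299x299
+#     logits = model(x)                    # shape [1, 1000]
+#
+#     # alternatively, load weights from a local file (placeholder path):
+#     # model = InceptionV4(pretrained="./InceptionV4_pretrained.pdparams")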
+ +# reference: https://arxiv.org/abs/1602.07261 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "InceptionV4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/InceptionV4_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + padding=0, + groups=1, + act='relu', + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = name + "_bn" + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InceptionStem(nn.Layer): + def __init__(self): + super(InceptionStem, self).__init__() + self._conv_1 = ConvBNLayer( + 3, 32, 3, stride=2, act="relu", name="conv1_3x3_s2") + self._conv_2 = ConvBNLayer(32, 32, 3, act="relu", name="conv2_3x3_s1") + self._conv_3 = ConvBNLayer( + 32, 64, 3, padding=1, act="relu", name="conv3_3x3_s1") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2 = ConvBNLayer( + 64, 96, 3, stride=2, act="relu", name="inception_stem1_3x3_s2") + self._conv1_1 = ConvBNLayer( + 160, 64, 1, act="relu", name="inception_stem2_3x3_reduce") + self._conv1_2 = ConvBNLayer( + 64, 96, 3, act="relu", name="inception_stem2_3x3") + self._conv2_1 = ConvBNLayer( + 160, 64, 1, act="relu", name="inception_stem2_1x7_reduce") + self._conv2_2 = ConvBNLayer( + 64, + 64, (7, 1), + padding=(3, 0), + act="relu", + name="inception_stem2_1x7") + self._conv2_3 = ConvBNLayer( + 64, + 64, (1, 7), + padding=(0, 3), + act="relu", + name="inception_stem2_7x1") + self._conv2_4 = ConvBNLayer( + 64, 96, 3, act="relu", name="inception_stem2_3x3_2") + self._conv3 = ConvBNLayer( + 192, 192, 3, stride=2, act="relu", name="inception_stem3_3x3_s2") + + def forward(self, inputs): + conv = self._conv_1(inputs) + conv = self._conv_2(conv) + conv = self._conv_3(conv) + + pool1 = self._pool(conv) + conv2 = self._conv2(conv) + concat = paddle.concat([pool1, conv2], axis=1) + + conv1 = self._conv1_1(concat) + conv1 = self._conv1_2(conv1) + + conv2 = self._conv2_1(concat) + conv2 = self._conv2_2(conv2) + conv2 = self._conv2_3(conv2) + conv2 = self._conv2_4(conv2) + + concat = paddle.concat([conv1, conv2], axis=1) + + conv1 = self._conv3(concat) + pool1 = self._pool(concat) + + concat = paddle.concat([conv1, pool1], axis=1) + return concat + + +class InceptionA(nn.Layer): + def __init__(self, name): + super(InceptionA, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 384, 96, 1, act="relu", name="inception_a" + name + "_1x1") + self._conv2 = ConvBNLayer( + 384, 96, 1, act="relu", name="inception_a" + name + "_1x1_2") + self._conv3_1 = ConvBNLayer( + 384, 64, 1, act="relu", name="inception_a" + name + 
"_3x3_reduce") + self._conv3_2 = ConvBNLayer( + 64, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3") + self._conv4_1 = ConvBNLayer( + 384, + 64, + 1, + act="relu", + name="inception_a" + name + "_3x3_2_reduce") + self._conv4_2 = ConvBNLayer( + 64, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3_2") + self._conv4_3 = ConvBNLayer( + 96, + 96, + 3, + padding=1, + act="relu", + name="inception_a" + name + "_3x3_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + + conv4 = self._conv4_1(inputs) + conv4 = self._conv4_2(conv4) + conv4 = self._conv4_3(conv4) + + concat = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + return concat + + +class ReductionA(nn.Layer): + def __init__(self): + super(ReductionA, self).__init__() + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2 = ConvBNLayer( + 384, 384, 3, stride=2, act="relu", name="reduction_a_3x3") + self._conv3_1 = ConvBNLayer( + 384, 192, 1, act="relu", name="reduction_a_3x3_2_reduce") + self._conv3_2 = ConvBNLayer( + 192, 224, 3, padding=1, act="relu", name="reduction_a_3x3_2") + self._conv3_3 = ConvBNLayer( + 224, 256, 3, stride=2, act="relu", name="reduction_a_3x3_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv2 = self._conv2(inputs) + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + concat = paddle.concat([pool1, conv2, conv3], axis=1) + return concat + + +class InceptionB(nn.Layer): + def __init__(self, name=None): + super(InceptionB, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 1024, 128, 1, act="relu", name="inception_b" + name + "_1x1") + self._conv2 = ConvBNLayer( + 1024, 384, 1, act="relu", name="inception_b" + name + "_1x1_2") + self._conv3_1 = ConvBNLayer( + 1024, + 192, + 1, + act="relu", + name="inception_b" + name + "_1x7_reduce") + self._conv3_2 = ConvBNLayer( + 192, + 224, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7") + self._conv3_3 = ConvBNLayer( + 224, + 256, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1") + self._conv4_1 = ConvBNLayer( + 1024, + 192, + 1, + act="relu", + name="inception_b" + name + "_7x1_2_reduce") + self._conv4_2 = ConvBNLayer( + 192, + 192, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7_2") + self._conv4_3 = ConvBNLayer( + 192, + 224, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1_2") + self._conv4_4 = ConvBNLayer( + 224, + 224, (1, 7), + padding=(0, 3), + act="relu", + name="inception_b" + name + "_1x7_3") + self._conv4_5 = ConvBNLayer( + 224, + 256, (7, 1), + padding=(3, 0), + act="relu", + name="inception_b" + name + "_7x1_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + + conv4 = self._conv4_1(inputs) + conv4 = self._conv4_2(conv4) + conv4 = self._conv4_3(conv4) + conv4 = self._conv4_4(conv4) + conv4 = self._conv4_5(conv4) + + concat = paddle.concat([conv1, conv2, conv3, conv4], axis=1) + return concat + + +class ReductionB(nn.Layer): + def __init__(self): + super(ReductionB, self).__init__() + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv2_1 
= ConvBNLayer( + 1024, 192, 1, act="relu", name="reduction_b_3x3_reduce") + self._conv2_2 = ConvBNLayer( + 192, 192, 3, stride=2, act="relu", name="reduction_b_3x3") + self._conv3_1 = ConvBNLayer( + 1024, 256, 1, act="relu", name="reduction_b_1x7_reduce") + self._conv3_2 = ConvBNLayer( + 256, + 256, (1, 7), + padding=(0, 3), + act="relu", + name="reduction_b_1x7") + self._conv3_3 = ConvBNLayer( + 256, + 320, (7, 1), + padding=(3, 0), + act="relu", + name="reduction_b_7x1") + self._conv3_4 = ConvBNLayer( + 320, 320, 3, stride=2, act="relu", name="reduction_b_3x3_2") + + def forward(self, inputs): + pool1 = self._pool(inputs) + + conv2 = self._conv2_1(inputs) + conv2 = self._conv2_2(conv2) + + conv3 = self._conv3_1(inputs) + conv3 = self._conv3_2(conv3) + conv3 = self._conv3_3(conv3) + conv3 = self._conv3_4(conv3) + + concat = paddle.concat([pool1, conv2, conv3], axis=1) + + return concat + + +class InceptionC(nn.Layer): + def __init__(self, name=None): + super(InceptionC, self).__init__() + self._pool = AvgPool2D(kernel_size=3, stride=1, padding=1) + self._conv1 = ConvBNLayer( + 1536, 256, 1, act="relu", name="inception_c" + name + "_1x1") + self._conv2 = ConvBNLayer( + 1536, 256, 1, act="relu", name="inception_c" + name + "_1x1_2") + self._conv3_0 = ConvBNLayer( + 1536, 384, 1, act="relu", name="inception_c" + name + "_1x1_3") + self._conv3_1 = ConvBNLayer( + 384, + 256, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3") + self._conv3_2 = ConvBNLayer( + 384, + 256, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1") + self._conv4_0 = ConvBNLayer( + 1536, 384, 1, act="relu", name="inception_c" + name + "_1x1_4") + self._conv4_00 = ConvBNLayer( + 384, + 448, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3_2") + self._conv4_000 = ConvBNLayer( + 448, + 512, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1_2") + self._conv4_1 = ConvBNLayer( + 512, + 256, (1, 3), + padding=(0, 1), + act="relu", + name="inception_c" + name + "_1x3_3") + self._conv4_2 = ConvBNLayer( + 512, + 256, (3, 1), + padding=(1, 0), + act="relu", + name="inception_c" + name + "_3x1_3") + + def forward(self, inputs): + pool1 = self._pool(inputs) + conv1 = self._conv1(pool1) + + conv2 = self._conv2(inputs) + + conv3 = self._conv3_0(inputs) + conv3_1 = self._conv3_1(conv3) + conv3_2 = self._conv3_2(conv3) + + conv4 = self._conv4_0(inputs) + conv4 = self._conv4_00(conv4) + conv4 = self._conv4_000(conv4) + conv4_1 = self._conv4_1(conv4) + conv4_2 = self._conv4_2(conv4) + + concat = paddle.concat( + [conv1, conv2, conv3_1, conv3_2, conv4_1, conv4_2], axis=1) + + return concat + + +class InceptionV4DY(nn.Layer): + def __init__(self, class_num=1000): + super(InceptionV4DY, self).__init__() + self._inception_stem = InceptionStem() + + self._inceptionA_1 = InceptionA(name="1") + self._inceptionA_2 = InceptionA(name="2") + self._inceptionA_3 = InceptionA(name="3") + self._inceptionA_4 = InceptionA(name="4") + self._reductionA = ReductionA() + + self._inceptionB_1 = InceptionB(name="1") + self._inceptionB_2 = InceptionB(name="2") + self._inceptionB_3 = InceptionB(name="3") + self._inceptionB_4 = InceptionB(name="4") + self._inceptionB_5 = InceptionB(name="5") + self._inceptionB_6 = InceptionB(name="6") + self._inceptionB_7 = InceptionB(name="7") + self._reductionB = ReductionB() + + self._inceptionC_1 = InceptionC(name="1") + self._inceptionC_2 = InceptionC(name="2") + self._inceptionC_3 = InceptionC(name="3") + + self.avg_pool = 
AdaptiveAvgPool2D(1) + self._drop = Dropout(p=0.2, mode="downscale_in_infer") + stdv = 1.0 / math.sqrt(1536 * 1.0) + self.out = Linear( + 1536, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="final_fc_weights"), + bias_attr=ParamAttr(name="final_fc_offset")) + + def forward(self, inputs): + x = self._inception_stem(inputs) + + x = self._inceptionA_1(x) + x = self._inceptionA_2(x) + x = self._inceptionA_3(x) + x = self._inceptionA_4(x) + x = self._reductionA(x) + + x = self._inceptionB_1(x) + x = self._inceptionB_2(x) + x = self._inceptionB_3(x) + x = self._inceptionB_4(x) + x = self._inceptionB_5(x) + x = self._inceptionB_6(x) + x = self._inceptionB_7(x) + x = self._reductionB(x) + + x = self._inceptionC_1(x) + x = self._inceptionC_2(x) + x = self._inceptionC_3(x) + + x = self.avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._drop(x) + x = self.out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def InceptionV4(pretrained=False, use_ssld=False, **kwargs): + model = InceptionV4DY(**kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["InceptionV4"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/levit.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/levit.py new file mode 100644 index 000000000..47734cc9c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/levit.py @@ -0,0 +1,590 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
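+
+# A minimal usage sketch for the LeViT builders defined below (LeViT_128S through
+# LeViT_384), assuming paddle is installed and this module is loaded as part of
+# the ppcls package. With distillation=True the classifier has two heads
+# (head / head_dist) whose logits are averaged in eval mode, so inference yields
+# a single [N, class_num] tensor either way:
+#
+#     import paddle
+#     model = LeViT_128S(pretrained=False, class_num=1000, distillation=False)
+#     model.eval()
+#     x = paddle.randn([1, 3, 224, 224])   # dummy NCHW input
+#     logits = model(x)                    # shape [1, 1000]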
+ +# Code was based on https://github.com/facebookresearch/LeViT +# reference: https://openaccess.thecvf.com/content/ICCV2021/html/Graham_LeViT_A_Vision_Transformer_in_ConvNets_Clothing_for_Faster_Inference_ICCV_2021_paper.html + +import itertools +import math +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant +from paddle.regularizer import L2Decay + +from .vision_transformer import trunc_normal_, zeros_, ones_, Identity + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "LeViT_128S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_128S_pretrained.pdparams", + "LeViT_128": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_128_pretrained.pdparams", + "LeViT_192": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_192_pretrained.pdparams", + "LeViT_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_256_pretrained.pdparams", + "LeViT_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/LeViT_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def cal_attention_biases(attention_biases, attention_bias_idxs): + gather_list = [] + attention_bias_t = paddle.transpose(attention_biases, (1, 0)) + nums = attention_bias_idxs.shape[0] + for idx in range(nums): + gather = paddle.gather(attention_bias_t, attention_bias_idxs[idx]) + gather_list.append(gather) + shape0, shape1 = attention_bias_idxs.shape + gather = paddle.concat(gather_list) + return paddle.transpose(gather, (1, 0)).reshape((0, shape0, shape1)) + + +class Conv2d_BN(nn.Sequential): + def __init__(self, + a, + b, + ks=1, + stride=1, + pad=0, + dilation=1, + groups=1, + bn_weight_init=1, + resolution=-10000): + super().__init__() + self.add_sublayer( + 'c', + nn.Conv2D( + a, b, ks, stride, pad, dilation, groups, bias_attr=False)) + bn = nn.BatchNorm2D(b) + ones_(bn.weight) + zeros_(bn.bias) + self.add_sublayer('bn', bn) + + +class Linear_BN(nn.Sequential): + def __init__(self, a, b, bn_weight_init=1): + super().__init__() + self.add_sublayer('c', nn.Linear(a, b, bias_attr=False)) + bn = nn.BatchNorm1D(b) + if bn_weight_init == 0: + zeros_(bn.weight) + else: + ones_(bn.weight) + zeros_(bn.bias) + self.add_sublayer('bn', bn) + + def forward(self, x): + l, bn = self._sub_layers.values() + x = l(x) + return paddle.reshape(bn(x.flatten(0, 1)), x.shape) + + +class BN_Linear(nn.Sequential): + def __init__(self, a, b, bias=True, std=0.02): + super().__init__() + self.add_sublayer('bn', nn.BatchNorm1D(a)) + l = nn.Linear(a, b, bias_attr=bias) + trunc_normal_(l.weight) + if bias: + zeros_(l.bias) + self.add_sublayer('l', l) + + +def b16(n, activation, resolution=224): + return nn.Sequential( + Conv2d_BN( + 3, n // 8, 3, 2, 1, resolution=resolution), + activation(), + Conv2d_BN( + n // 8, n // 4, 3, 2, 1, resolution=resolution // 2), + activation(), + Conv2d_BN( + n // 4, n // 2, 3, 2, 1, resolution=resolution // 4), + activation(), + Conv2d_BN( + n // 2, n, 3, 2, 1, resolution=resolution // 8)) + + +class Residual(nn.Layer): + def __init__(self, m, drop): + super().__init__() + self.m = m + self.drop = drop + + def forward(self, x): + if self.training and self.drop > 0: + y = paddle.rand( + shape=[x.shape[0], 1, 1]).__ge__(self.drop).astype("float32") + y = y.divide(paddle.full_like(y, 1 - self.drop)) + return paddle.add(x, y) + else: + return paddle.add(x, self.m(x)) + + +class Attention(nn.Layer): + def __init__(self, 
+ dim, + key_dim, + num_heads=8, + attn_ratio=4, + activation=None, + resolution=14): + super().__init__() + self.num_heads = num_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + self.h = self.dh + nh_kd * 2 + self.qkv = Linear_BN(dim, self.h) + self.proj = nn.Sequential( + activation(), Linear_BN( + self.dh, dim, bn_weight_init=0)) + points = list(itertools.product(range(resolution), range(resolution))) + N = len(points) + attention_offsets = {} + idxs = [] + for p1 in points: + for p2 in points: + offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = self.create_parameter( + shape=(num_heads, len(attention_offsets)), + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + tensor_idxs = paddle.to_tensor(idxs, dtype='int64') + self.register_buffer('attention_bias_idxs', + paddle.reshape(tensor_idxs, [N, N])) + + @paddle.no_grad() + def train(self, mode=True): + if mode: + super().train() + else: + super().eval() + if mode and hasattr(self, 'ab'): + del self.ab + else: + self.ab = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + + def forward(self, x): + self.training = True + B, N, C = x.shape + qkv = self.qkv(x) + qkv = paddle.reshape(qkv, + [B, N, self.num_heads, self.h // self.num_heads]) + q, k, v = paddle.split( + qkv, [self.key_dim, self.key_dim, self.d], axis=3) + q = paddle.transpose(q, perm=[0, 2, 1, 3]) + k = paddle.transpose(k, perm=[0, 2, 1, 3]) + v = paddle.transpose(v, perm=[0, 2, 1, 3]) + k_transpose = paddle.transpose(k, perm=[0, 1, 3, 2]) + + if self.training: + attention_biases = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + else: + attention_biases = self.ab + attn = (paddle.matmul(q, k_transpose) * self.scale + attention_biases) + attn = F.softmax(attn) + x = paddle.transpose(paddle.matmul(attn, v), perm=[0, 2, 1, 3]) + x = paddle.reshape(x, [B, N, self.dh]) + x = self.proj(x) + return x + + +class Subsample(nn.Layer): + def __init__(self, stride, resolution): + super().__init__() + self.stride = stride + self.resolution = resolution + + def forward(self, x): + B, N, C = x.shape + x = paddle.reshape(x, [B, self.resolution, self.resolution, C]) + end1, end2 = x.shape[1], x.shape[2] + x = x[:, 0:end1:self.stride, 0:end2:self.stride] + x = paddle.reshape(x, [B, -1, C]) + return x + + +class AttentionSubsample(nn.Layer): + def __init__(self, + in_dim, + out_dim, + key_dim, + num_heads=8, + attn_ratio=2, + activation=None, + stride=2, + resolution=14, + resolution_=7): + super().__init__() + self.num_heads = num_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * self.num_heads + self.attn_ratio = attn_ratio + self.resolution_ = resolution_ + self.resolution_2 = resolution_**2 + self.training = True + h = self.dh + nh_kd + self.kv = Linear_BN(in_dim, h) + + self.q = nn.Sequential( + Subsample(stride, resolution), Linear_BN(in_dim, nh_kd)) + self.proj = nn.Sequential(activation(), Linear_BN(self.dh, out_dim)) + + self.stride = stride + self.resolution = resolution + points = list(itertools.product(range(resolution), range(resolution))) + points_ = 
list( + itertools.product(range(resolution_), range(resolution_))) + + N = len(points) + N_ = len(points_) + attention_offsets = {} + idxs = [] + i = 0 + j = 0 + for p1 in points_: + i += 1 + for p2 in points: + j += 1 + size = 1 + offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2), + abs(p1[1] * stride - p2[1] + (size - 1) / 2)) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = self.create_parameter( + shape=(num_heads, len(attention_offsets)), + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + + tensor_idxs_ = paddle.to_tensor(idxs, dtype='int64') + self.register_buffer('attention_bias_idxs', + paddle.reshape(tensor_idxs_, [N_, N])) + + @paddle.no_grad() + def train(self, mode=True): + if mode: + super().train() + else: + super().eval() + if mode and hasattr(self, 'ab'): + del self.ab + else: + self.ab = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + + def forward(self, x): + self.training = True + B, N, C = x.shape + kv = self.kv(x) + kv = paddle.reshape(kv, [B, N, self.num_heads, -1]) + k, v = paddle.split(kv, [self.key_dim, self.d], axis=3) + k = paddle.transpose(k, perm=[0, 2, 1, 3]) # BHNC + v = paddle.transpose(v, perm=[0, 2, 1, 3]) + q = paddle.reshape( + self.q(x), [B, self.resolution_2, self.num_heads, self.key_dim]) + q = paddle.transpose(q, perm=[0, 2, 1, 3]) + + if self.training: + attention_biases = cal_attention_biases(self.attention_biases, + self.attention_bias_idxs) + else: + attention_biases = self.ab + + attn = (paddle.matmul( + q, paddle.transpose( + k, perm=[0, 1, 3, 2]))) * self.scale + attention_biases + attn = F.softmax(attn) + + x = paddle.reshape( + paddle.transpose( + paddle.matmul(attn, v), perm=[0, 2, 1, 3]), [B, -1, self.dh]) + x = self.proj(x) + return x + + +class LeViT(nn.Layer): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=[192], + key_dim=[64], + depth=[12], + num_heads=[3], + attn_ratio=[2], + mlp_ratio=[2], + hybrid_backbone=None, + down_ops=[], + attention_activation=nn.Hardswish, + mlp_activation=nn.Hardswish, + distillation=True, + drop_path=0): + super().__init__() + + self.class_num = class_num + self.num_features = embed_dim[-1] + self.embed_dim = embed_dim + self.distillation = distillation + + self.patch_embed = hybrid_backbone + + self.blocks = [] + down_ops.append(['']) + resolution = img_size // patch_size + for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate( + zip(embed_dim, key_dim, depth, num_heads, attn_ratio, + mlp_ratio, down_ops)): + for _ in range(dpth): + self.blocks.append( + Residual( + Attention( + ed, + kd, + nh, + attn_ratio=ar, + activation=attention_activation, + resolution=resolution, ), + drop_path)) + if mr > 0: + h = int(ed * mr) + self.blocks.append( + Residual( + nn.Sequential( + Linear_BN(ed, h), + mlp_activation(), + Linear_BN( + h, ed, bn_weight_init=0), ), + drop_path)) + if do[0] == 'Subsample': + #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + resolution_ = (resolution - 1) // do[5] + 1 + self.blocks.append( + AttentionSubsample( + *embed_dim[i:i + 2], + key_dim=do[1], + num_heads=do[2], + attn_ratio=do[3], + activation=attention_activation, + stride=do[5], + resolution=resolution, + resolution_=resolution_)) + resolution = resolution_ + if do[4] > 0: # mlp_ratio + h = int(embed_dim[i + 1] * 
do[4]) + self.blocks.append( + Residual( + nn.Sequential( + Linear_BN(embed_dim[i + 1], h), + mlp_activation(), + Linear_BN( + h, embed_dim[i + 1], bn_weight_init=0), ), + drop_path)) + self.blocks = nn.Sequential(*self.blocks) + + # Classifier head + self.head = BN_Linear(embed_dim[-1], + class_num) if class_num > 0 else Identity() + if distillation: + self.head_dist = BN_Linear( + embed_dim[-1], class_num) if class_num > 0 else Identity() + + def forward(self, x): + x = self.patch_embed(x) + x = x.flatten(2) + x = paddle.transpose(x, perm=[0, 2, 1]) + x = self.blocks(x) + x = x.mean(1) + + x = paddle.reshape(x, [-1, self.embed_dim[-1]]) + if self.distillation: + x = self.head(x), self.head_dist(x) + if not self.training: + x = (x[0] + x[1]) / 2 + else: + x = self.head(x) + return x + + +def model_factory(C, D, X, N, drop_path, class_num, distillation): + embed_dim = [int(x) for x in C.split('_')] + num_heads = [int(x) for x in N.split('_')] + depth = [int(x) for x in X.split('_')] + act = nn.Hardswish + model = LeViT( + patch_size=16, + embed_dim=embed_dim, + num_heads=num_heads, + key_dim=[D] * 3, + depth=depth, + attn_ratio=[2, 2, 2], + mlp_ratio=[2, 2, 2], + down_ops=[ + #('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + ['Subsample', D, embed_dim[0] // D, 4, 2, 2], + ['Subsample', D, embed_dim[1] // D, 4, 2, 2], + ], + attention_activation=act, + mlp_activation=act, + hybrid_backbone=b16(embed_dim[0], activation=act), + class_num=class_num, + drop_path=drop_path, + distillation=distillation) + + return model + + +specification = { + 'LeViT_128S': { + 'C': '128_256_384', + 'D': 16, + 'N': '4_6_8', + 'X': '2_3_4', + 'drop_path': 0 + }, + 'LeViT_128': { + 'C': '128_256_384', + 'D': 16, + 'N': '4_8_12', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_192': { + 'C': '192_288_384', + 'D': 32, + 'N': '3_5_6', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_256': { + 'C': '256_384_512', + 'D': 32, + 'N': '4_6_8', + 'X': '4_4_4', + 'drop_path': 0 + }, + 'LeViT_384': { + 'C': '384_512_768', + 'D': 32, + 'N': '6_9_12', + 'X': '4_4_4', + 'drop_path': 0.1 + }, +} + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def LeViT_128S(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_128S'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_128S"], use_ssld=use_ssld) + return model + + +def LeViT_128(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_128'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_128"], use_ssld=use_ssld) + return model + + +def LeViT_192(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_192'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_192"], use_ssld=use_ssld) + return model + + +def LeViT_256(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_256'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_256"], use_ssld=use_ssld) + return model + + +def LeViT_384(pretrained=False, + use_ssld=False, + class_num=1000, + distillation=False, + **kwargs): + model = model_factory( + **specification['LeViT_384'], + class_num=class_num, + distillation=distillation) + _load_pretrained( + pretrained, model, MODEL_URLS["LeViT_384"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/micronet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/micronet.py new file mode 100644 index 000000000..8069bb860 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/micronet.py @@ -0,0 +1,618 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
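+#
+# The factory functions at the bottom of this file (MicroNet_M0 ... MicroNet_M3)
+# pair a NET_CONFIG entry with its matching ACTIVATION_CONFIG entry and forward
+# any extra keyword arguments to the MicroNet constructor, e.g. (illustrative)
+# MicroNet_M3(class_num=1000).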
+# +# Code was heavily based on https://github.com/liyunsheng13/micronet +# reference: https://arxiv.org/pdf/2108.05894 + +import math + +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MicroNet_M0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M0_pretrained.pdparams", + "MicroNet_M1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M1_pretrained.pdparams", + "MicroNet_M2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M2_pretrained.pdparams", + "MicroNet_M3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MicroNet_M3_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() + +NET_CONFIG = { + "msnx_dy6_exp4_4M_221": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4,y1,y2,y3,r + [2, 1, 8, 3, 2, 2, 0, 4, 8, 2, 2, 2, 0, 1, + 1], # 6 ->12(0,0) ->24 ->8(4,2) ->8 + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 2, 1, + 1], # 8 ->16(0,0) ->32 ->16(4,4) ->12 + [2, 1, 16, 5, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, + 1], # 16 ->32(0,0) ->64 ->16(8,2) ->16 + [1, 1, 32, 5, 1, 4, 4, 4, 32, 4, 4, 2, 2, 1, + 1], # 16 ->16(2,8) ->96 ->32(8,4) ->32 + [2, 1, 64, 5, 1, 4, 8, 8, 64, 8, 8, 2, 2, 1, + 1], # 32 ->32(2,16) ->192 ->64(12,4) ->64 + [1, 1, 96, 3, 1, 4, 8, 8, 96, 8, 8, 2, 2, 1, + 2], # 64 ->64(3,16) ->384 ->96(16,6) ->96 + [1, 1, 384, 3, 1, 4, 12, 12, 0, 0, 0, 2, 2, 1, + 2], # 96 ->96(4,24) ->384 + ], + "msnx_dy6_exp6_6M_221": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 8, 3, 2, 2, 0, 6, 8, 2, 2, 2, 0, 1, + 1], # 6 ->12(0,0) ->24 ->8(4,2) ->8 + [2, 1, 16, 3, 2, 2, 0, 8, 16, 4, 4, 2, 2, 1, + 1], # 8 ->16(0,0) ->32 ->16(4,4) ->16 + [2, 1, 16, 5, 2, 2, 0, 16, 16, 4, 4, 2, 2, 1, + 1], # 16 ->32(0,0) ->64 ->16(8,2) ->16 + [1, 1, 32, 5, 1, 6, 4, 4, 32, 4, 4, 2, 2, 1, + 1], # 16 ->16(2,8) ->96 ->32(8,4) ->32 + [2, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, + 1], # 32 ->32(2,16) ->192 ->64(12,4) ->64 + [1, 1, 96, 3, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, + 2], # 64 ->64(3,16) ->384 ->96(16,6) ->96 + [1, 1, 576, 3, 1, 6, 12, 12, 0, 0, 0, 2, 2, 1, + 2], # 96 ->96(4,24) ->576 + ], + "msnx_dy9_exp6_12M_221": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 12, 3, 2, 2, 0, 8, 12, 4, 4, 2, 0, 1, + 1], # 8 ->16(0,0) ->32 ->12(4,3) ->12 + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 2, 2, 1, + 1], # 12 ->24(0,0) ->48 ->16(8,2) ->16 + [1, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 2, 2, 1, + 1], # 16 ->16(0,0) ->64 ->24(8,3) ->24 + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 2, 2, 1, + 1], # 24 ->24(2,12) ->144 ->32(16,2) ->32 + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 2, 2, 1, + 2], # 32 ->32(2,16) ->192 ->32(16,2) ->32 + [1, 1, 64, 5, 1, 6, 8, 8, 64, 8, 8, 2, 2, 1, + 2], # 32 ->32(2,16) ->192 ->64(12,4) ->64 + [2, 1, 96, 5, 1, 6, 8, 8, 96, 8, 8, 2, 2, 1, + 2], # 64 ->64(4,12) ->384 ->96(16,5) ->96 + [1, 1, 128, 3, 1, 6, 12, 12, 128, 8, 8, 2, 2, 1, + 2], # 96 ->96(5,16) ->576 ->128(16,8) ->128 + [1, 1, 768, 3, 1, 6, 16, 16, 0, 0, 0, 2, 2, 1, + 2], # 128 ->128(4,32) ->768 + ], + "msnx_dy12_exp6_20M_020": [ + #s, n, c, ks, c1, c2, g1, g2, c3, g3, g4 + [2, 1, 16, 3, 2, 2, 0, 12, 16, 4, 4, 0, 2, 0, + 1], # 12 ->24(0,0) ->48 ->16(8,2) ->16 + [2, 1, 24, 3, 2, 2, 0, 16, 24, 4, 4, 0, 2, 0, + 1], # 16 ->32(0,0) ->64 ->24(8,3) ->24 + [1, 1, 24, 3, 2, 2, 0, 24, 24, 4, 4, 0, 2, 0, + 1], # 24 ->48(0,0) ->96 ->24(8,3) ->24 + [2, 1, 32, 5, 1, 6, 6, 6, 32, 4, 4, 0, 2, 0, + 1], # 24 ->24(2,12) ->144 ->32(16,2) ->32 + [1, 1, 32, 5, 1, 6, 8, 8, 32, 4, 4, 0, 2, 0, + 2], # 32 ->32(2,16) ->192 ->32(16,2) ->32 + [1, 1, 64, 5, 1, 6, 8, 
8, 48, 8, 8, 0, 2, 0, + 2], # 32 ->32(2,16) ->192 ->48(12,4) ->64 + [1, 1, 80, 5, 1, 6, 8, 8, 80, 8, 8, 0, 2, 0, + 2], # 48 ->48(3,16) ->288 ->80(16,5) ->80 + [1, 1, 80, 5, 1, 6, 10, 10, 80, 8, 8, 0, 2, 0, + 2], # 80 ->80(4,20) ->480 ->80(20,4) ->80 + [2, 1, 120, 5, 1, 6, 10, 10, 120, 10, 10, 0, 2, 0, + 2], # 80 ->80(4,20) ->480 ->128(16,8) ->120 + [1, 1, 120, 5, 1, 6, 12, 12, 120, 10, 10, 0, 2, 0, + 2], # 120 ->128(4,32) ->720 ->128(32,4) ->120 + [1, 1, 144, 3, 1, 6, 12, 12, 144, 12, 12, 0, 2, 0, + 2], # 120 ->128(4,32) ->720 ->160(32,5) ->144 + [1, 1, 864, 3, 1, 6, 12, 12, 0, 0, 0, 0, 2, 0, + 2], # 144 ->144(5,32) ->864 + ], +} + +ACTIVATION_CONFIG = { + "msnx_dy6_exp4_4M_221": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 1.0], + "init_b": [0.0, 0.0], + }, + "msnx_dy6_exp6_6M_221": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 1.0], + "init_b": [0.0, 0.0], + }, + "msnx_dy9_exp6_12M_221": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 1.0], + "init_b": [0.0, 0.0], + }, + "msnx_dy12_exp6_20M_020": { + "act_max": 2.0, + "reduction": 8, + "init_ab3": [1.0, 0.0], + "init_a": [1.0, 0.5], + "init_b": [0.0, 0.5], + }, +} + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class MaxGroupPooling(nn.Layer): + def __init__(self, channel_per_group=2): + super().__init__() + self.channel_per_group = channel_per_group + + def forward(self, x): + if self.channel_per_group == 1: + return x + + # max op + b, c, h, w = x.shape + + # reshape + y = x.reshape([b, c // self.channel_per_group, -1, h, w]) + out, _ = paddle.max(y, axis=2) + return out + + +class SwishLinear(nn.Layer): + def __init__(self, inp, oup): + super().__init__() + self.linear = nn.Sequential( + nn.Linear(inp, oup), nn.BatchNorm1D(oup), nn.Hardswish()) + + def forward(self, x): + return self.linear(x) + + +class StemLayer(nn.Layer): + def __init__(self, inp, oup, stride, groups=(4, 4)): + super().__init__() + g1, g2 = groups + self.stem = nn.Sequential( + SpatialSepConvSF(inp, groups, 3, stride), + MaxGroupPooling(2) if g1 * g2 == 2 * oup else nn.ReLU6()) + + def forward(self, x): + out = self.stem(x) + return out + + +class GroupConv(nn.Layer): + def __init__(self, inp, oup, groups=2): + super().__init__() + self.inp = inp + self.oup = oup + self.groups = groups + self.conv = nn.Sequential( + nn.Conv2D( + inp, oup, 1, groups=self.groups[0], bias_attr=False), + nn.BatchNorm2D(oup)) + + def forward(self, x): + x = self.conv(x) + return x + + +class ChannelShuffle(nn.Layer): + def __init__(self, groups): + super().__init__() + self.groups = groups + + def forward(self, x): + b, c, h, w = x.shape + + channels_per_group = c // self.groups + + # reshape + x = x.reshape([b, self.groups, channels_per_group, h, w]) + x = x.transpose([0, 2, 1, 3, 4]) + out = x.reshape([b, c, h, w]) + + return out + + +class SpatialSepConvSF(nn.Layer): + def __init__(self, inp, oups, kernel_size, stride): + super().__init__() + + oup1, oup2 = oups + self.conv = nn.Sequential( + nn.Conv2D( + inp, + oup1, (kernel_size, 1), (stride, 1), (kernel_size // 2, 0), + groups=1, + bias_attr=False), + nn.BatchNorm2D(oup1), + nn.Conv2D( + oup1, + oup1 * oup2, (1, kernel_size), (1, stride), + (0, kernel_size // 2), + groups=oup1, + bias_attr=False), + nn.BatchNorm2D(oup1 * oup2), + 
ChannelShuffle(oup1)) + + def forward(self, x): + out = self.conv(x) + return out + + +class DepthSpatialSepConv(nn.Layer): + def __init__(self, inp, expand, kernel_size, stride): + super().__init__() + + exp1, exp2 = expand + hidden_dim = inp * exp1 + oup = inp * exp1 * exp2 + + self.conv = nn.Sequential( + nn.Conv2D( + inp, + inp * exp1, (kernel_size, 1), (stride, 1), + (kernel_size // 2, 0), + groups=inp, + bias_attr=False), + nn.BatchNorm2D(inp * exp1), + nn.Conv2D( + hidden_dim, + oup, (1, kernel_size), (1, stride), (0, kernel_size // 2), + groups=hidden_dim, + bias_attr=False), + nn.BatchNorm2D(oup)) + + def forward(self, x): + out = self.conv(x) + return out + + +class DYShiftMax(nn.Layer): + def __init__(self, + inp, + oup, + reduction=4, + act_max=1.0, + act_relu=True, + init_a=[0.0, 0.0], + init_b=[0.0, 0.0], + relu_before_pool=False, + g=None, + expansion=False): + super().__init__() + self.oup = oup + self.act_max = act_max * 2 + self.act_relu = act_relu + self.avg_pool = nn.Sequential(nn.ReLU() if relu_before_pool == True + else nn.Identity(), + nn.AdaptiveAvgPool2D(1)) + + self.exp = 4 if act_relu else 2 + self.init_a = init_a + self.init_b = init_b + + # determine squeeze + squeeze = _make_divisible(inp // reduction, 4) + if squeeze < 4: + squeeze = 4 + + self.fc = nn.Sequential( + nn.Linear(inp, squeeze), + nn.ReLU(), nn.Linear(squeeze, oup * self.exp), nn.Hardsigmoid()) + if g is None: + g = 1 + self.g = g[1] + if self.g != 1 and expansion: + self.g = inp // self.g + self.gc = inp // self.g + index = paddle.to_tensor(list(range(inp))).reshape([1, inp, 1, 1]) + index = index.reshape([1, self.g, self.gc, 1, 1]) + indexgs = paddle.split(index, [1, self.g - 1], axis=1) + indexgs = paddle.concat((indexgs[1], indexgs[0]), axis=1) + indexs = paddle.split(indexgs, [1, self.gc - 1], axis=2) + indexs = paddle.concat((indexs[1], indexs[0]), axis=2) + self.index = indexs.reshape([inp]).astype(paddle.int64) + self.expansion = expansion + + def forward(self, x): + x_in = x + x_out = x + + b, c, _, _ = x_in.shape + y = self.avg_pool(x_in).reshape([b, c]) + y = self.fc(y).reshape([b, self.oup * self.exp, 1, 1]) + y = (y - 0.5) * self.act_max + + n2, c2, h2, w2 = x_out.shape + x2 = paddle.index_select(x_out, self.index, axis=1) + + if self.exp == 4: + a1, b1, a2, b2 = paddle.split(y, 4, axis=1) + + a1 = a1 + self.init_a[0] + a2 = a2 + self.init_a[1] + + b1 = b1 + self.init_b[0] + b2 = b2 + self.init_b[1] + + z1 = x_out * a1 + x2 * b1 + z2 = x_out * a2 + x2 * b2 + + out = paddle.maximum(z1, z2) + + elif self.exp == 2: + a1, b1 = paddle.split(y, 2, axis=1) + a1 = a1 + self.init_a[0] + b1 = b1 + self.init_b[0] + out = x_out * a1 + x2 * b1 + + return out + + +class DYMicroBlock(nn.Layer): + def __init__(self, + inp, + oup, + kernel_size=3, + stride=1, + ch_exp=(2, 2), + ch_per_group=4, + groups_1x1=(1, 1), + dy=[0, 0, 0], + ratio=1.0, + activation_cfg=None): + super().__init__() + + self.identity = stride == 1 and inp == oup + + y1, y2, y3 = dy + act_max = activation_cfg["act_max"] + act_reduction = activation_cfg["reduction"] * ratio + init_a = activation_cfg["init_a"] + init_b = activation_cfg["init_b"] + init_ab3 = activation_cfg["init_ab3"] + + t1 = ch_exp + gs1 = ch_per_group + hidden_fft, g1, g2 = groups_1x1 + + hidden_dim1 = inp * t1[0] + hidden_dim2 = inp * t1[0] * t1[1] + + if gs1[0] == 0: + self.layers = nn.Sequential( + DepthSpatialSepConv(inp, t1, kernel_size, stride), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=True if y2 == 2 else False, + 
init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]), + ChannelShuffle(hidden_dim2 // 2) if y2 != 0 else nn.Identity(), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=act_max, + act_relu=False, + init_a=[init_ab3[0], 0.0], + reduction=act_reduction // 2, + init_b=[init_ab3[1], 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Identity(), + ChannelShuffle(g2), + ChannelShuffle(oup // 2) + if oup % 2 == 0 and y3 != 0 else nn.Identity()) + elif g2 == 0: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=False, + init_a=[init_ab3[0], 0.0], + reduction=act_reduction, + init_b=[init_ab3[1], 0.0], + g=gs1, + expansion=False) if y3 > 0 else nn.Identity()) + else: + self.layers = nn.Sequential( + GroupConv(inp, hidden_dim2, gs1), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=True if y1 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=False) if y1 > 0 else nn.ReLU6(), + ChannelShuffle(gs1[1]), + DepthSpatialSepConv(hidden_dim2, (1, 1), kernel_size, stride), + nn.Identity(), + DYShiftMax( + hidden_dim2, + hidden_dim2, + act_max=act_max, + act_relu=True if y2 == 2 else False, + init_a=init_a, + reduction=act_reduction, + init_b=init_b, + g=gs1, + expansion=True) if y2 > 0 else nn.ReLU6(), + ChannelShuffle(hidden_dim2 // 4) + if y1 != 0 and y2 != 0 else nn.Identity() + if y1 == 0 and y2 == 0 else ChannelShuffle(hidden_dim2 // 2), + GroupConv(hidden_dim2, oup, (g1, g2)), + DYShiftMax( + oup, + oup, + act_max=act_max, + act_relu=False, + init_a=[init_ab3[0], 0.0], + reduction=act_reduction // 2 + if oup < hidden_dim2 else act_reduction, + init_b=[init_ab3[1], 0.0], + g=(g1, g2), + expansion=False) if y3 > 0 else nn.Identity(), + ChannelShuffle(g2), + ChannelShuffle(oup // 2) if y3 != 0 else nn.Identity()) + + def forward(self, x): + out = self.layers(x) + if self.identity: + out = out + x + return out + + +class MicroNet(nn.Layer): + def __init__(self, + net_cfg, + activation_cfg, + input_size=224, + class_num=1000, + stem_ch=16, + stem_groups=[4, 8], + out_ch=1024, + dropout_rate=0.0): + super().__init__() + + # building first layer + assert input_size % 32 == 0 + input_channel = stem_ch + layers = [StemLayer(3, input_channel, stride=2, groups=stem_groups)] + + for s, n, c, ks, c1, c2, g1, g2, c3, g3, g4, y1, y2, y3, r in net_cfg: + for i in range(n): + layers.append( + DYMicroBlock( + input_channel, + c, + kernel_size=ks, + stride=s if i == 0 else 1, + ch_exp=(c1, c2), + ch_per_group=(g1, g2), + groups_1x1=(c3, g3, g4), + dy=[y1, y2, y3], + ratio=r, + activation_cfg=activation_cfg)) + input_channel = c + self.features = nn.Sequential(*layers) + + self.avgpool = nn.Sequential(nn.ReLU6(), + nn.AdaptiveAvgPool2D(1), nn.Hardswish()) + + # building last several layers + self.classifier = nn.Sequential( + SwishLinear(input_channel, out_ch), + nn.Dropout(dropout_rate), SwishLinear(out_ch, class_num)) + + self.apply(self._initialize_weights) + + def _initialize_weights(self, m): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + nn.initializer.Normal(std=math.sqrt(2. 
/ n))(m.weight) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(std=0.01)(m.weight) + + def forward(self, x): + x = self.features(x) + x = self.avgpool(x) + x = self.classifier(x.flatten(1)) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MicroNet_M0(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy6_exp4_4M_221"], + ACTIVATION_CONFIG["msnx_dy6_exp4_4M_221"], + stem_ch=4, + stem_groups=[2, 2], + out_ch=640, + dropout_rate=0.05, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M0"], use_ssld) + return model + + +def MicroNet_M1(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy6_exp6_6M_221"], + ACTIVATION_CONFIG["msnx_dy6_exp6_6M_221"], + stem_ch=6, + stem_groups=[3, 2], + out_ch=960, + dropout_rate=0.05, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M1"], use_ssld) + return model + + +def MicroNet_M2(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy9_exp6_12M_221"], + ACTIVATION_CONFIG["msnx_dy9_exp6_12M_221"], + stem_ch=8, + stem_groups=[4, 2], + out_ch=1024, + dropout_rate=0.1, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M2"], use_ssld) + return model + + +def MicroNet_M3(pretrained=False, use_ssld=False, **kwargs): + model = MicroNet( + NET_CONFIG["msnx_dy12_exp6_20M_020"], + ACTIVATION_CONFIG["msnx_dy12_exp6_20M_020"], + stem_ch=12, + stem_groups=[4, 3], + out_ch=1024, + dropout_rate=0.1, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["MicroNet_M3"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mixnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mixnet.py new file mode 100644 index 000000000..201630cf9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mixnet.py @@ -0,0 +1,812 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
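+#
+# MixConv (defined below) splits the input channels into groups and applies a
+# different convolution kernel size to each group. MixNet_S / MixNet_M / MixNet_L
+# only differ in the per-stage width, kernel-count and expansion tables built in
+# get_mixnet(); MixNet_L reuses the "m" configuration with width_scale=1.3.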
+ +# reference: https://arxiv.org/abs/1907.09595 + +import os +from inspect import isfunction +from functools import reduce +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MixNet_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_S_pretrained.pdparams", + "MixNet_M": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_M_pretrained.pdparams", + "MixNet_L": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MixNet_L_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class Identity(nn.Layer): + """ + Identity block. + """ + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def round_channels(channels, divisor=8): + """ + Round weighted channel number (make divisible operation). + + Parameters: + ---------- + channels : int or float + Original number of channels. + divisor : int, default 8 + Alignment value. + + Returns: + ------- + int + Weighted number of channels. + """ + rounded_channels = max( + int(channels + divisor / 2.0) // divisor * divisor, divisor) + if float(rounded_channels) < 0.9 * channels: + rounded_channels += divisor + return rounded_channels + + +def get_activation_layer(activation): + """ + Create activation layer from string/function. + + Parameters: + ---------- + activation : function, or str, or nn.Module + Activation function or name of activation function. + + Returns: + ------- + nn.Module + Activation layer. + """ + assert activation is not None + if isfunction(activation): + return activation() + elif isinstance(activation, str): + if activation == "relu": + return nn.ReLU() + elif activation == "relu6": + return nn.ReLU6() + elif activation == "swish": + return nn.Swish() + elif activation == "hswish": + return nn.Hardswish() + elif activation == "sigmoid": + return nn.Sigmoid() + elif activation == "hsigmoid": + return nn.Hardsigmoid() + elif activation == "identity": + return Identity() + else: + raise NotImplementedError() + else: + assert isinstance(activation, nn.Layer) + return activation + + +class ConvBlock(nn.Layer): + """ + Standard convolution block with Batch normalization and activation. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int, or tuple/list of 2 int, or tuple/list of 4 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str or None, default nn.ReLU() + Activation function or name of activation function. 
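+
+    Example (illustrative):
+        block = ConvBlock(in_channels=3, out_channels=16, kernel_size=3,
+                          stride=2, padding=1, activation="relu")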
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + super(ConvBlock, self).__init__() + self.activate = (activation is not None) + self.use_bn = use_bn + self.use_pad = (isinstance(padding, (list, tuple)) and + (len(padding) == 4)) + + if self.use_pad: + self.pad = padding + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + weight_attr=None) + if self.use_bn: + self.bn = nn.BatchNorm2D(num_features=out_channels, epsilon=bn_eps) + if self.activate: + self.activ = get_activation_layer(activation) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn(x) + if self.activate: + x = self.activ(x) + return x + + +class SEBlock(nn.Layer): + def __init__(self, + channels, + reduction=16, + mid_channels=None, + round_mid=False, + use_conv=True, + mid_activation=nn.ReLU(), + out_activation=nn.Sigmoid()): + super(SEBlock, self).__init__() + self.use_conv = use_conv + if mid_channels is None: + mid_channels = channels // reduction if not round_mid else round_channels( + float(channels) / reduction) + + self.pool = nn.AdaptiveAvgPool2D(output_size=1) + if use_conv: + self.conv1 = nn.Conv2D( + in_channels=channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + groups=1, + bias_attr=True, + weight_attr=None) + + else: + self.fc1 = nn.Linear( + in_features=channels, out_features=mid_channels) + self.activ = get_activation_layer(mid_activation) + if use_conv: + self.conv2 = nn.Conv2D( + in_channels=mid_channels, + out_channels=channels, + kernel_size=1, + stride=1, + groups=1, + bias_attr=True, + weight_attr=None) + else: + self.fc2 = nn.Linear( + in_features=mid_channels, out_features=channels) + self.sigmoid = get_activation_layer(out_activation) + + def forward(self, x): + w = self.pool(x) + if not self.use_conv: + w = w.reshape(shape=[w.shape[0], -1]) + w = self.conv1(w) if self.use_conv else self.fc1(w) + w = self.activ(w) + w = self.conv2(w) if self.use_conv else self.fc2(w) + w = self.sigmoid(w) + if not self.use_conv: + w = w.unsqueeze(2).unsqueeze(3) + x = x * w + return x + + +class MixConv(nn.Layer): + """ + Mixed convolution layer from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + axis : int, default 1 + The axis on which to concatenate the outputs. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + axis=1): + super(MixConv, self).__init__() + kernel_size = kernel_size if isinstance(kernel_size, + list) else [kernel_size] + padding = padding if isinstance(padding, list) else [padding] + kernel_count = len(kernel_size) + self.splitted_in_channels = self.split_channels(in_channels, + kernel_count) + splitted_out_channels = self.split_channels(out_channels, kernel_count) + for i, kernel_size_i in enumerate(kernel_size): + in_channels_i = self.splitted_in_channels[i] + out_channels_i = splitted_out_channels[i] + padding_i = padding[i] + _ = self.add_sublayer( + name=str(i), + sublayer=nn.Conv2D( + in_channels=in_channels_i, + out_channels=out_channels_i, + kernel_size=kernel_size_i, + stride=stride, + padding=padding_i, + dilation=dilation, + groups=(out_channels_i + if out_channels == groups else groups), + bias_attr=bias, + weight_attr=None)) + self.axis = axis + + def forward(self, x): + xx = paddle.split(x, self.splitted_in_channels, axis=self.axis) + xx = paddle.split(x, self.splitted_in_channels, axis=self.axis) + out = [ + conv_i(x_i) for x_i, conv_i in zip(xx, self._sub_layers.values()) + ] + x = paddle.concat(tuple(out), axis=self.axis) + return x + + @staticmethod + def split_channels(channels, kernel_count): + splitted_channels = [channels // kernel_count] * kernel_count + splitted_channels[0] += channels - sum(splitted_channels) + return splitted_channels + + +class MixConvBlock(nn.Layer): + """ + Mixed convolution block with Batch normalization and activation. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_size : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Convolution window size. + stride : int or tuple/list of 2 int + Strides of the convolution. + padding : int or tuple/list of int, or tuple/list of tuple/list of 2 int + Padding value for convolution layer. + dilation : int or tuple/list of 2 int, default 1 + Dilation value for convolution layer. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str or None, default nn.ReLU() + Activation function or name of activation function. + activate : bool, default True + Whether activate the convolution block. 
+ """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + super(MixConvBlock, self).__init__() + self.activate = (activation is not None) + self.use_bn = use_bn + + self.conv = MixConv( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + if self.use_bn: + self.bn = nn.BatchNorm2D(num_features=out_channels, epsilon=bn_eps) + if self.activate: + self.activ = get_activation_layer(activation) + + def forward(self, x): + x = self.conv(x) + if self.use_bn: + x = self.bn(x) + if self.activate: + x = self.activ(x) + return x + + +def mixconv1x1_block(in_channels, + out_channels, + kernel_count, + stride=1, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU()): + """ + 1x1 version of the mixed convolution block. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + kernel_count : int + Kernel count. + stride : int or tuple/list of 2 int, default 1 + Strides of the convolution. + groups : int, default 1 + Number of groups. + bias : bool, default False + Whether the layer uses a bias vector. + use_bn : bool, default True + Whether to use BatchNorm layer. + bn_eps : float, default 1e-5 + Small float added to variance in Batch norm. + activation : function or str, or None, default nn.ReLU() + Activation function or name of activation function. + """ + return MixConvBlock( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=([1] * kernel_count), + stride=stride, + padding=([0] * kernel_count), + groups=groups, + bias=bias, + use_bn=use_bn, + bn_eps=bn_eps, + activation=activation) + + +class MixUnit(nn.Layer): + """ + MixNet unit. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. exp_channels : int + Number of middle (expanded) channels. + stride : int or tuple/list of 2 int + Strides of the second convolution layer. + exp_kernel_count : int + Expansion convolution kernel count for each unit. + conv1_kernel_count : int + Conv1 kernel count for each unit. + conv2_kernel_count : int + Conv2 kernel count for each unit. + exp_factor : int + Expansion factor for each unit. + se_factor : int + SE reduction factor for each unit. + activation : str + Activation function or name of activation function. 
+ """ + + def __init__(self, in_channels, out_channels, stride, exp_kernel_count, + conv1_kernel_count, conv2_kernel_count, exp_factor, se_factor, + activation): + super(MixUnit, self).__init__() + assert exp_factor >= 1 + assert se_factor >= 0 + self.residual = (in_channels == out_channels) and (stride == 1) + self.use_se = se_factor > 0 + mid_channels = exp_factor * in_channels + self.use_exp_conv = exp_factor > 1 + + if self.use_exp_conv: + if exp_kernel_count == 1: + self.exp_conv = ConvBlock( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=activation) + else: + self.exp_conv = mixconv1x1_block( + in_channels=in_channels, + out_channels=mid_channels, + kernel_count=exp_kernel_count, + activation=activation) + if conv1_kernel_count == 1: + self.conv1 = ConvBlock( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=3, + stride=stride, + padding=1, + dilation=1, + groups=mid_channels, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=activation) + else: + self.conv1 = MixConvBlock( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=[3 + 2 * i for i in range(conv1_kernel_count)], + stride=stride, + padding=[1 + i for i in range(conv1_kernel_count)], + groups=mid_channels, + activation=activation) + if self.use_se: + self.se = SEBlock( + channels=mid_channels, + reduction=(exp_factor * se_factor), + round_mid=False, + mid_activation=activation) + if conv2_kernel_count == 1: + self.conv2 = ConvBlock( + in_channels=mid_channels, + out_channels=out_channels, + activation=None, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5) + else: + self.conv2 = mixconv1x1_block( + in_channels=mid_channels, + out_channels=out_channels, + kernel_count=conv2_kernel_count, + activation=None) + + def forward(self, x): + if self.residual: + identity = x + if self.use_exp_conv: + x = self.exp_conv(x) + x = self.conv1(x) + if self.use_se: + x = self.se(x) + x = self.conv2(x) + if self.residual: + x = x + identity + return x + + +class MixInitBlock(nn.Layer): + """ + MixNet specific initial block. + + Parameters: + ---------- + in_channels : int + Number of input channels. + out_channels : int + Number of output channels. + """ + + def __init__(self, in_channels, out_channels): + super(MixInitBlock, self).__init__() + self.conv1 = ConvBlock( + in_channels=in_channels, + out_channels=out_channels, + stride=2, + kernel_size=3, + padding=1) + self.conv2 = MixUnit( + in_channels=out_channels, + out_channels=out_channels, + stride=1, + exp_kernel_count=1, + conv1_kernel_count=1, + conv2_kernel_count=1, + exp_factor=1, + se_factor=0, + activation="relu") + + def forward(self, x): + x = self.conv1(x) + x = self.conv2(x) + return x + + +class MixNet(nn.Layer): + """ + MixNet model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + + Parameters: + ---------- + channels : list of list of int + Number of output channels for each unit. + init_block_channels : int + Number of output channels for the initial unit. + final_block_channels : int + Number of output channels for the final block of the feature extractor. + exp_kernel_counts : list of list of int + Expansion convolution kernel count for each unit. + conv1_kernel_counts : list of list of int + Conv1 kernel count for each unit. + conv2_kernel_counts : list of list of int + Conv2 kernel count for each unit. 
+ exp_factors : list of list of int + Expansion factor for each unit. + se_factors : list of list of int + SE reduction factor for each unit. + in_channels : int, default 3 + Number of input channels. + in_size : tuple of two ints, default (224, 224) + Spatial size of the expected input image. + class_num : int, default 1000 + Number of classification classes. + """ + + def __init__(self, + channels, + init_block_channels, + final_block_channels, + exp_kernel_counts, + conv1_kernel_counts, + conv2_kernel_counts, + exp_factors, + se_factors, + in_channels=3, + in_size=(224, 224), + class_num=1000): + super(MixNet, self).__init__() + self.in_size = in_size + self.class_num = class_num + + self.features = nn.Sequential() + self.features.add_sublayer( + "init_block", + MixInitBlock( + in_channels=in_channels, out_channels=init_block_channels)) + in_channels = init_block_channels + for i, channels_per_stage in enumerate(channels): + stage = nn.Sequential() + for j, out_channels in enumerate(channels_per_stage): + stride = 2 if ((j == 0) and (i != 3)) or ( + (j == len(channels_per_stage) // 2) and (i == 3)) else 1 + exp_kernel_count = exp_kernel_counts[i][j] + conv1_kernel_count = conv1_kernel_counts[i][j] + conv2_kernel_count = conv2_kernel_counts[i][j] + exp_factor = exp_factors[i][j] + se_factor = se_factors[i][j] + activation = "relu" if i == 0 else "swish" + stage.add_sublayer( + "unit{}".format(j + 1), + MixUnit( + in_channels=in_channels, + out_channels=out_channels, + stride=stride, + exp_kernel_count=exp_kernel_count, + conv1_kernel_count=conv1_kernel_count, + conv2_kernel_count=conv2_kernel_count, + exp_factor=exp_factor, + se_factor=se_factor, + activation=activation)) + in_channels = out_channels + self.features.add_sublayer("stage{}".format(i + 1), stage) + self.features.add_sublayer( + "final_block", + ConvBlock( + in_channels=in_channels, + out_channels=final_block_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + bias=False, + use_bn=True, + bn_eps=1e-5, + activation=nn.ReLU())) + in_channels = final_block_channels + self.features.add_sublayer( + "final_pool", nn.AvgPool2D( + kernel_size=7, stride=1)) + + self.output = nn.Linear( + in_features=in_channels, out_features=class_num) + + def forward(self, x): + x = self.features(x) + reshape_dim = reduce(lambda x, y: x * y, x.shape[1:]) + x = x.reshape(shape=[x.shape[0], reshape_dim]) + x = self.output(x) + return x + + +def get_mixnet(version, width_scale, model_name=None, **kwargs): + """ + Create MixNet model with specific parameters. + + Parameters: + ---------- + version : str + Version of MobileNetV3 ('s' or 'm'). + width_scale : float + Scale factor for width of layers. + model_name : str or None, default None + Model name. 
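+
+    Example (illustrative):
+        net = get_mixnet(version="s", width_scale=1.0, model_name="MixNet_S",
+                         class_num=1000)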
+ """ + + if version == "s": + init_block_channels = 16 + channels = [[24, 24], [40, 40, 40, 40], [80, 80, 80], + [120, 120, 120, 200, 200, 200]] + exp_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 1, 1], + [2, 2, 2, 1, 1, 1]] + conv1_kernel_counts = [[1, 1], [3, 2, 2, 2], [3, 2, 2], + [3, 4, 4, 5, 4, 4]] + conv2_kernel_counts = [[2, 2], [1, 2, 2, 2], [2, 2, 2], + [2, 2, 2, 1, 2, 2]] + exp_factors = [[6, 3], [6, 6, 6, 6], [6, 6, 6], [6, 3, 3, 6, 6, 6]] + se_factors = [[0, 0], [2, 2, 2, 2], [4, 4, 4], [2, 2, 2, 2, 2, 2]] + elif version == "m": + init_block_channels = 24 + channels = [[32, 32], [40, 40, 40, 40], [80, 80, 80, 80], + [120, 120, 120, 120, 200, 200, 200, 200]] + exp_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 2, 2, 2], + [1, 2, 2, 2, 1, 1, 1, 1]] + conv1_kernel_counts = [[3, 1], [4, 2, 2, 2], [3, 4, 4, 4], + [1, 4, 4, 4, 4, 4, 4, 4]] + conv2_kernel_counts = [[2, 2], [1, 2, 2, 2], [1, 2, 2, 2], + [1, 2, 2, 2, 1, 2, 2, 2]] + exp_factors = [[6, 3], [6, 6, 6, 6], [6, 6, 6, 6], + [6, 3, 3, 3, 6, 6, 6, 6]] + se_factors = [[0, 0], [2, 2, 2, 2], [4, 4, 4, 4], + [2, 2, 2, 2, 2, 2, 2, 2]] + else: + raise ValueError("Unsupported MixNet version {}".format(version)) + + final_block_channels = 1536 + + if width_scale != 1.0: + channels = [[round_channels(cij * width_scale) for cij in ci] + for ci in channels] + init_block_channels = round_channels(init_block_channels * width_scale) + + net = MixNet( + channels=channels, + init_block_channels=init_block_channels, + final_block_channels=final_block_channels, + exp_kernel_counts=exp_kernel_counts, + conv1_kernel_counts=conv1_kernel_counts, + conv2_kernel_counts=conv2_kernel_counts, + exp_factors=exp_factors, + se_factors=se_factors, + **kwargs) + + return net + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MixNet_S(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-S model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="s", width_scale=1.0, model_name="MixNet_S", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_S"], use_ssld=use_ssld) + return model + + +def MixNet_M(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-M model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. + """ + model = get_mixnet( + version="m", width_scale=1.0, model_name="MixNet_M", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_M"], use_ssld=use_ssld) + return model + + +def MixNet_L(pretrained=False, use_ssld=False, **kwargs): + """ + MixNet-S model from 'MixConv: Mixed Depthwise Convolutional Kernels,' + https://arxiv.org/abs/1907.09595. 
+ """ + model = get_mixnet( + version="m", width_scale=1.3, model_name="MixNet_L", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MixNet_L"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilefacenet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilefacenet.py new file mode 100644 index 000000000..7fe78a074 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilefacenet.py @@ -0,0 +1,166 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Origin author: wujiyang +Modify from: https://github.com/wujiyang/Face_Pytorch/blob/master/backbone/mobilefacenet.py +''' + +import paddle +from paddle import nn +import math + +__all__ = ['MobileFaceNet'] + +MobileFaceNet_BottleNeck_Setting = [ + # t, c , n ,s + [2, 64, 5, 2], + [4, 128, 1, 2], + [2, 128, 6, 1], + [4, 128, 1, 2], + [2, 128, 2, 1] +] + + +class BottleNeck(nn.Layer): + def __init__(self, inp, oup, stride, expansion, data_format="NCHW"): + super().__init__() + self.connect = stride == 1 and inp == oup + + self.conv = nn.Sequential( + # 1*1 conv + nn.Conv2D( + inp, inp * expansion, 1, 1, 0, bias_attr=False, data_format=data_format), + nn.BatchNorm2D(inp * expansion, data_format=data_format), + nn.PReLU(inp * expansion, data_format=data_format), + + # 3*3 depth wise conv + nn.Conv2D( + inp * expansion, + inp * expansion, + 3, + stride, + 1, + groups=inp * expansion, + bias_attr=False, + data_format=data_format + ), + nn.BatchNorm2D(inp * expansion, data_format=data_format), + nn.PReLU(inp * expansion, data_format=data_format), + + # 1*1 conv + nn.Conv2D( + inp * expansion, oup, 1, 1, 0, bias_attr=False, data_format=data_format), + nn.BatchNorm2D(oup, data_format=data_format), ) + + def forward(self, x): + if self.connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class ConvBlock(nn.Layer): + def __init__(self, inp, oup, k, s, p, dw=False, linear=False, data_format="NCHW"): + super().__init__() + self.linear = linear + if dw: + self.conv = nn.Conv2D( + inp, oup, k, s, p, groups=inp, bias_attr=False, data_format=data_format) + else: + self.conv = nn.Conv2D(inp, oup, k, s, p, bias_attr=False, data_format=data_format) + + self.bn = nn.BatchNorm2D(oup, data_format=data_format) + if not linear: + self.prelu = nn.PReLU(oup, data_format=data_format) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.linear: + return x + else: + return self.prelu(x) + + +class Backbone(nn.Layer): + def __init__(self, + feature_dim=128, + bottleneck_setting=MobileFaceNet_BottleNeck_Setting, + data_format="NCHW", + **args): + super().__init__() + self.data_format = data_format + + self.conv1 = ConvBlock(3, 64, 3, 2, 1, data_format=data_format) + self.dw_conv1 = ConvBlock(64, 64, 3, 1, 1, dw=True, data_format=data_format) + + self.cur_channel = 64 + block = BottleNeck + self.blocks = 
self._make_layer(block, bottleneck_setting) + + self.conv2 = ConvBlock(128, 512, 1, 1, 0, data_format=data_format) + self.linear7 = ConvBlock(512, 512, 7, 1, 0, dw=True, linear=True, data_format=data_format) + self.linear1 = ConvBlock(512, feature_dim, 1, 1, 0, linear=True, data_format=data_format) + + for m in self.sublayers(): + if isinstance(m, nn.Conv2D): + # ks * ks * out_ch + n = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype=m.weight.dtype, + default_initializer=nn.initializer.Normal( + mean=0.0, std=math.sqrt(2.0 / n))) + + elif isinstance(m, (nn.BatchNorm, nn.BatchNorm2D, nn.GroupNorm)): + m.weight = paddle.create_parameter( + shape=m.weight.shape, + dtype=m.weight.dtype, + default_initializer=nn.initializer.Constant(value=1.0)) + m.bias = paddle.create_parameter( + shape=m.bias.shape, + dtype=m.bias.dtype, + default_initializer=nn.initializer.Constant(value=0.0)) + + def _make_layer(self, block, setting): + layers = [] + for t, c, n, s in setting: + for i in range(n): + if i == 0: + layers.append(block(self.cur_channel, c, s, t, data_format=self.data_format)) + else: + layers.append(block(self.cur_channel, c, 1, t, data_format=self.data_format)) + self.cur_channel = c + + return nn.Sequential(*layers) + + def forward(self, x): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.conv1(x) + x = self.dw_conv1(x) + x = self.blocks(x) + x = self.conv2(x) + x = self.linear7(x) + x = self.linear1(x) + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 3, 1, 2]) + x = x.reshape([x.shape[0], x.shape[1] * x.shape[2] * x.shape[3]]) + return x + + +def MobileFaceNet(num_features=128, **args): + model = Backbone(feature_dim=num_features, **args) + return model \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenet_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenet_v2.py new file mode 100644 index 000000000..5d606988f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenet_v2.py @@ -0,0 +1,316 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
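+#
+# The `scale` argument of MobileNet below is the width multiplier; the
+# MobileNetV2_x* factories at the end of this file differ only in the scale
+# they pass, e.g. (illustrative) MobileNetV2_x0_5(class_num=100).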
+ +# reference: https://arxiv.org/abs/1801.04381 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_25_pretrained.pdparams", + "MobileNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_5_pretrained.pdparams", + "MobileNetV2_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x0_75_pretrained.pdparams", + "MobileNetV2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_pretrained.pdparams", + "MobileNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x1_5_pretrained.pdparams", + "MobileNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNetV2_x2_0_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + name=None, + use_cudnn=True, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + + self._batch_norm = BatchNorm( + num_filters, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance", + data_layout=data_format) + + def forward(self, inputs, if_act=True): + y = self._conv(inputs) + y = self._batch_norm(y) + if if_act: + y = F.relu6(y) + return y + + +class InvertedResidualUnit(nn.Layer): + def __init__(self, + num_channels, + num_in_filter, + num_filters, + stride, + filter_size, + padding, + expansion_factor, + name, + data_format="NCHW"): + super(InvertedResidualUnit, self).__init__() + num_expfilter = int(round(num_in_filter * expansion_factor)) + self._expand_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=num_expfilter, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_expand", + data_format=data_format) + + self._bottleneck_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_expfilter, + filter_size=filter_size, + stride=stride, + padding=padding, + num_groups=num_expfilter, + use_cudnn=False, + name=name + "_dwise", + data_format=data_format) + + self._linear_conv = ConvBNLayer( + num_channels=num_expfilter, + num_filters=num_filters, + filter_size=1, + stride=1, + padding=0, + num_groups=1, + name=name + "_linear", + data_format=data_format) + + def forward(self, inputs, ifshortcut): + y = self._expand_conv(inputs, if_act=True) + y = self._bottleneck_conv(y, if_act=True) + y = self._linear_conv(y, if_act=False) + if ifshortcut: + y = paddle.add(inputs, y) + return y + + +class InvresiBlocks(nn.Layer): + def __init__(self, in_c, t, c, n, s, name, data_format="NCHW"): + super(InvresiBlocks, self).__init__() + + self._first_block = 
InvertedResidualUnit( + num_channels=in_c, + num_in_filter=in_c, + num_filters=c, + stride=s, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_1", + data_format=data_format) + + self._block_list = [] + for i in range(1, n): + block = self.add_sublayer( + name + "_" + str(i + 1), + sublayer=InvertedResidualUnit( + num_channels=c, + num_in_filter=c, + num_filters=c, + stride=1, + filter_size=3, + padding=1, + expansion_factor=t, + name=name + "_" + str(i + 1), + data_format=data_format)) + self._block_list.append(block) + + def forward(self, inputs): + y = self._first_block(inputs, ifshortcut=False) + for block in self._block_list: + y = block(y, ifshortcut=True) + return y + + +class MobileNet(nn.Layer): + def __init__(self, + class_num=1000, + scale=1.0, + prefix_name="", + data_format="NCHW"): + super(MobileNet, self).__init__() + self.scale = scale + self.class_num = class_num + self.data_format = data_format + + bottleneck_params_list = [ + (1, 16, 1, 1), + (6, 24, 2, 2), + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1), + ] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=int(32 * scale), + filter_size=3, + stride=2, + padding=1, + name=prefix_name + "conv1_1", + data_format=data_format) + + self.block_list = [] + i = 1 + in_c = int(32 * scale) + for layer_setting in bottleneck_params_list: + t, c, n, s = layer_setting + i += 1 + block = self.add_sublayer( + prefix_name + "conv" + str(i), + sublayer=InvresiBlocks( + in_c=in_c, + t=t, + c=int(c * scale), + n=n, + s=s, + name=prefix_name + "conv" + str(i), + data_format=data_format)) + self.block_list.append(block) + in_c = int(c * scale) + + self.out_c = int(1280 * scale) if scale > 1.0 else 1280 + self.conv9 = ConvBNLayer( + num_channels=in_c, + num_filters=self.out_c, + filter_size=1, + stride=1, + padding=0, + name=prefix_name + "conv9", + data_format=data_format) + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=data_format) + + self.out = Linear( + self.out_c, + class_num, + weight_attr=ParamAttr(name=prefix_name + "fc10_weights"), + bias_attr=ParamAttr(name=prefix_name + "fc10_offset")) + + def forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + y = self.conv1(inputs, if_act=True) + for block in self.block_list: + y = block(y) + y = self.conv9(y, if_act=True) + y = self.pool2d_avg(y) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def MobileNetV2_x0_25(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.25, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_25"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_5"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x0_75(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=0.75, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x0_75"], use_ssld=use_ssld) + return model + + +def MobileNetV2(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x1_5"], use_ssld=use_ssld) + return model + + +def MobileNetV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + model = MobileNet(scale=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNetV2_x2_0"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenext.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenext.py new file mode 100644 index 000000000..e432d2d7b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilenext.py @@ -0,0 +1,262 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/zhoudaquan/rethinking_bottleneck_design +# reference: https://arxiv.org/abs/2007.02269 + +import math +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileNeXt_x0_35": "", # TODO + "MobileNeXt_x0_5": "", # TODO + "MobileNeXt_x0_75": "", # TODO + "MobileNeXt_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileNeXt_x1_0_pretrained.pdparams", + "MobileNeXt_x1_4": "", # TODO +} + +__all__ = list(MODEL_URLS.keys()) + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def conv_3x3_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2D( + inp, oup, 3, stride, 1, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6()) + + +class SGBlock(nn.Layer): + def __init__(self, inp, oup, stride, expand_ratio, keep_3x3=False): + super(SGBlock, self).__init__() + assert stride in [1, 2] + + hidden_dim = inp // expand_ratio + if hidden_dim < oup / 6.: + hidden_dim = math.ceil(oup / 6.) 
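+        # Sandglass design: the bottleneck width is a reduction of the input
+        # (inp / expand_ratio), floored at oup / 6 above and rounded to a
+        # multiple of 16 by _make_divisible below.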
+ hidden_dim = _make_divisible(hidden_dim, 16) # + 16 + + self.identity = False + self.identity_div = 1 + self.expand_ratio = expand_ratio + + if expand_ratio == 2: + self.conv = nn.Sequential( + # dw + nn.Conv2D( + inp, inp, 3, 1, 1, groups=inp, bias_attr=False), + nn.BatchNorm2D(inp), + nn.ReLU6(), + # pw-linear + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6(), + # dw + nn.Conv2D( + oup, oup, 3, stride, 1, groups=oup, bias_attr=False), + nn.BatchNorm2D(oup)) + elif inp != oup and stride == 1 and keep_3x3 == False: + self.conv = nn.Sequential( + # pw-linear + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6()) + elif inp != oup and stride == 2 and keep_3x3 == False: + self.conv = nn.Sequential( + # pw-linear + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6(), + # dw + nn.Conv2D( + oup, oup, 3, stride, 1, groups=oup, bias_attr=False), + nn.BatchNorm2D(oup)) + else: + if keep_3x3 == False: + self.identity = True + self.conv = nn.Sequential( + # dw + nn.Conv2D( + inp, inp, 3, 1, 1, groups=inp, bias_attr=False), + nn.BatchNorm2D(inp), + nn.ReLU6(), + # pw + nn.Conv2D( + inp, hidden_dim, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(hidden_dim), + #nn.ReLU6(), + # pw + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + nn.ReLU6(), + # dw + nn.Conv2D( + oup, oup, 3, 1, 1, groups=oup, bias_attr=False), + nn.BatchNorm2D(oup)) + + def forward(self, x): + out = self.conv(x) + + if self.identity: + if self.identity_div == 1: + out = out + x + else: + shape = x.shape + id_tensor = x[:, :shape[1] // self.identity_div, :, :] + out[:, :shape[1] // self.identity_div, :, :] = \ + out[:, :shape[1] // self.identity_div, :, :] + id_tensor + + return out + + +class MobileNeXt(nn.Layer): + def __init__(self, class_num=1000, width_mult=1.00): + super().__init__() + + # setting of inverted residual blocks + self.cfgs = [ + # t, c, n, s + [2, 96, 1, 2], + [6, 144, 1, 1], + [6, 192, 3, 2], + [6, 288, 3, 2], + [6, 384, 4, 1], + [6, 576, 4, 2], + [6, 960, 3, 1], + [6, 1280, 1, 1], + ] + + # building first layer + input_channel = _make_divisible(32 * width_mult, 4 + if width_mult == 0.1 else 8) + layers = [conv_3x3_bn(3, input_channel, 2)] + # building inverted residual blocks + block = SGBlock + for t, c, n, s in self.cfgs: + output_channel = _make_divisible(c * width_mult, 4 + if width_mult == 0.1 else 8) + if c == 1280 and width_mult < 1: + output_channel = 1280 + layers.append( + block(input_channel, output_channel, s, t, n == 1 and s == 1)) + input_channel = output_channel + for _ in range(n - 1): + layers.append(block(input_channel, output_channel, 1, t)) + input_channel = output_channel + self.features = nn.Sequential(*layers) + # building last several layers + input_channel = output_channel + output_channel = _make_divisible(input_channel, 4) + self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) + self.classifier = nn.Sequential( + nn.Dropout(0.2), nn.Linear(output_channel, class_num)) + + self.apply(self._initialize_weights) + + def _initialize_weights(self, m): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + 
nn.initializer.Normal(std=math.sqrt(2. / n))(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.BatchNorm2D): + nn.initializer.Constant(1)(m.weight) + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.Normal(std=0.01)(m.weight) + nn.initializer.Constant(0)(m.bias) + + def forward(self, x): + x = self.features(x) + x = self.avgpool(x) + x = x.flatten(1) + x = self.classifier(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileNeXt_x0_35(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=0.35, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x0_35"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=0.50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x0_5"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x0_75(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=0.75, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x0_75"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=1.00, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x1_0"], use_ssld=use_ssld) + return model + + +def MobileNeXt_x1_4(pretrained=False, use_ssld=False, **kwargs): + model = MobileNeXt(width_mult=1.40, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileNeXt_x1_4"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit.py new file mode 100644 index 000000000..58adaf18e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit.py @@ -0,0 +1,479 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
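Editorial note (not part of the patch): the trickiest part of the mobilevit.py file that follows is the reshape/transpose bookkeeping in MobileViTBlock.forward, which unfolds a feature map into transformer tokens and folds it back. The sketch below mirrors that sequence under assumed example sizes (B=2, C=96, a 32x32 map, 2x2 patches); all numbers are illustrative only.

import paddle

B, C, H, W = 2, 96, 32, 32          # feature map after the local conv1/conv2
ph, pw = 2, 2                       # patch_size=(2, 2)
n_h, n_w = H // ph, W // pw         # patches along each axis
N, P = n_h * n_w, ph * pw           # number of patches, pixels per patch

x = paddle.randn([B, C, H, W])
x = x.reshape([-1, ph, n_w, pw])    # [B*C*n_h, ph, n_w, pw]
x = x.transpose([0, 2, 1, 3])       # [B*C*n_h, n_w, ph, pw]
x = x.reshape([-1, C, N, P])        # [B, C, N, P]
x = x.transpose([0, 3, 2, 1])       # [B, P, N, C]
tokens = x.reshape([-1, N, C])      # [B*P, N, C] -> fed to the Transformer
print(tokens.shape)                 # [8, 256, 96]

The folding path in the file applies the inverse transposes/reshapes to recover a [B, C, H, W] map before the fusion convolution.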
+ +# Code was based on https://github.com/BR-IDL/PaddleViT/blob/develop/image_classification/MobileViT/mobilevit.py +# and https://github.com/apple/ml-cvnets/blob/main/cvnets/models/classification/mobilevit.py +# reference: https://arxiv.org/abs/2110.02178 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import KaimingUniform, TruncatedNormal, Constant +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileViT_XXS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViT_XXS_pretrained.pdparams", + "MobileViT_XS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViT_XS_pretrained.pdparams", + "MobileViT_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViT_S_pretrained.pdparams", +} + + +def _init_weights_linear(): + weight_attr = ParamAttr(initializer=TruncatedNormal(std=.02)) + bias_attr = ParamAttr(initializer=Constant(0.0)) + return weight_attr, bias_attr + + +def _init_weights_layernorm(): + weight_attr = ParamAttr(initializer=Constant(1.0)) + bias_attr = ParamAttr(initializer=Constant(0.0)) + return weight_attr, bias_attr + + +class ConvBnAct(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1): + super().__init__() + self.in_channels = in_channels + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(initializer=KaimingUniform()), + bias_attr=bias_attr) + self.norm = nn.BatchNorm2D(out_channels) + self.act = nn.Silu() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class Identity(nn.Layer): + """ Identity layer""" + + def __init__(self): + super().__init__() + + def forward(self, inputs): + return inputs + + +class Mlp(nn.Layer): + def __init__(self, embed_dim, mlp_ratio, dropout=0.1): + super().__init__() + w_attr_1, b_attr_1 = _init_weights_linear() + self.fc1 = nn.Linear( + embed_dim, + int(embed_dim * mlp_ratio), + weight_attr=w_attr_1, + bias_attr=b_attr_1) + + w_attr_2, b_attr_2 = _init_weights_linear() + self.fc2 = nn.Linear( + int(embed_dim * mlp_ratio), + embed_dim, + weight_attr=w_attr_2, + bias_attr=b_attr_2) + + self.act = nn.Silu() + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.dropout1(x) + x = self.fc2(x) + x = self.dropout2(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + embed_dim, + num_heads, + qkv_bias=True, + dropout=0.1, + attention_dropout=0.): + super().__init__() + self.num_heads = num_heads + self.attn_head_dim = int(embed_dim / self.num_heads) + self.all_head_dim = self.attn_head_dim * self.num_heads + + w_attr_1, b_attr_1 = _init_weights_linear() + self.qkv = nn.Linear( + embed_dim, + self.all_head_dim * 3, + weight_attr=w_attr_1, + bias_attr=b_attr_1 if qkv_bias else False) + + self.scales = self.attn_head_dim**-0.5 + + w_attr_2, b_attr_2 = _init_weights_linear() + self.proj = nn.Linear( + embed_dim, embed_dim, weight_attr=w_attr_2, bias_attr=b_attr_2) + + self.attn_dropout = nn.Dropout(attention_dropout) + self.proj_dropout = nn.Dropout(dropout) + self.softmax = nn.Softmax(axis=-1) + + def transpose_multihead(self, x): + B, P, N, d = x.shape + x = x.reshape([B, P, N, 
self.num_heads, d // self.num_heads]) + x = x.transpose([0, 1, 3, 2, 4]) + return x + + def forward(self, x): + b_sz, n_patches, in_channels = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape([ + b_sz, n_patches, 3, self.num_heads, + qkv.shape[-1] // self.num_heads // 3 + ]) + qkv = qkv.transpose([0, 3, 2, 1, 4]) + query, key, value = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2] + query = query * self.scales + key = key.transpose([0, 1, 3, 2]) + # QK^T + attn = paddle.matmul(query, key) + attn = self.softmax(attn) + attn = self.attn_dropout(attn) + # weighted sum + out = paddle.matmul(attn, value) + out = out.transpose([0, 2, 1, 3]).reshape( + [b_sz, n_patches, out.shape[1] * out.shape[3]]) + out = self.proj(out) + out = self.proj_dropout(out) + return out + + +class EncoderLayer(nn.Layer): + def __init__(self, + embed_dim, + num_heads=4, + qkv_bias=True, + mlp_ratio=2.0, + dropout=0.1, + attention_dropout=0., + droppath=0.): + super().__init__() + w_attr_1, b_attr_1 = _init_weights_layernorm() + w_attr_2, b_attr_2 = _init_weights_layernorm() + + self.attn_norm = nn.LayerNorm( + embed_dim, weight_attr=w_attr_1, bias_attr=b_attr_1) + self.attn = Attention(embed_dim, num_heads, qkv_bias, dropout, + attention_dropout) + self.drop_path = DropPath(droppath) if droppath > 0. else Identity() + self.mlp_norm = nn.LayerNorm( + embed_dim, weight_attr=w_attr_2, bias_attr=b_attr_2) + self.mlp = Mlp(embed_dim, mlp_ratio, dropout) + + def forward(self, x): + h = x + x = self.attn_norm(x) + x = self.attn(x) + x = self.drop_path(x) + x = h + x + h = x + x = self.mlp_norm(x) + x = self.mlp(x) + x = self.drop_path(x) + x = x + h + return x + + +class Transformer(nn.Layer): + """Transformer block for MobileViTBlock""" + + def __init__(self, + embed_dim, + num_heads, + depth, + qkv_bias=True, + mlp_ratio=2.0, + dropout=0.1, + attention_dropout=0., + droppath=0.): + super().__init__() + depth_decay = [x.item() for x in paddle.linspace(0, droppath, depth)] + + layer_list = [] + for i in range(depth): + layer_list.append( + EncoderLayer(embed_dim, num_heads, qkv_bias, mlp_ratio, + dropout, attention_dropout, droppath)) + self.layers = nn.LayerList(layer_list) + + w_attr_1, b_attr_1 = _init_weights_layernorm() + self.norm = nn.LayerNorm( + embed_dim, weight_attr=w_attr_1, bias_attr=b_attr_1, epsilon=1e-6) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + out = self.norm(x) + return out + + +class MobileV2Block(nn.Layer): + """Mobilenet v2 InvertedResidual block""" + + def __init__(self, inp, oup, stride=1, expansion=4): + super().__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expansion)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expansion != 1: + layers.append(ConvBnAct(inp, hidden_dim, kernel_size=1)) + + layers.extend([ + # dw + ConvBnAct( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + padding=1), + # pw-linear + nn.Conv2D( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + nn.BatchNorm2D(oup), + ]) + + self.conv = nn.Sequential(*layers) + self.out_channels = oup + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + return self.conv(x) + + +class MobileViTBlock(nn.Layer): + """ MobileViTBlock for MobileViT""" + + def __init__(self, + dim, + hidden_dim, + depth, + num_heads=4, + qkv_bias=True, + mlp_ratio=2.0, + dropout=0.1, + attention_dropout=0., + droppath=0.0, + patch_size=(2, 2)): + super().__init__() + self.patch_h, self.patch_w = patch_size + + # local 
representations + self.conv1 = ConvBnAct(dim, dim, padding=1) + self.conv2 = nn.Conv2D( + dim, hidden_dim, kernel_size=1, stride=1, bias_attr=False) + # global representations + self.transformer = Transformer( + embed_dim=hidden_dim, + num_heads=num_heads, + depth=depth, + qkv_bias=qkv_bias, + mlp_ratio=mlp_ratio, + dropout=dropout, + attention_dropout=attention_dropout, + droppath=droppath) + + # fusion + self.conv3 = ConvBnAct(hidden_dim, dim, kernel_size=1) + self.conv4 = ConvBnAct(2 * dim, dim, padding=1) + + def forward(self, x): + h = x + x = self.conv1(x) + x = self.conv2(x) + + patch_h = self.patch_h + patch_w = self.patch_w + patch_area = int(patch_w * patch_h) + _, in_channels, orig_h, orig_w = x.shape + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + interpolate = False + + if new_w != orig_w or new_h != orig_h: + x = F.interpolate(x, size=[new_h, new_w], mode="bilinear") + interpolate = True + + num_patch_w, num_patch_h = new_w // patch_w, new_h // patch_h + num_patches = num_patch_h * num_patch_w + reshaped_x = x.reshape([-1, patch_h, num_patch_w, patch_w]) + transposed_x = reshaped_x.transpose([0, 2, 1, 3]) + reshaped_x = transposed_x.reshape( + [-1, in_channels, num_patches, patch_area]) + transposed_x = reshaped_x.transpose([0, 3, 2, 1]) + + x = transposed_x.reshape([-1, num_patches, in_channels]) + x = self.transformer(x) + x = x.reshape([-1, patch_h * patch_w, num_patches, in_channels]) + + _, pixels, num_patches, channels = x.shape + x = x.transpose([0, 3, 2, 1]) + x = x.reshape([-1, num_patch_w, patch_h, patch_w]) + x = x.transpose([0, 2, 1, 3]) + x = x.reshape( + [-1, channels, num_patch_h * patch_h, num_patch_w * patch_w]) + + if interpolate: + x = F.interpolate(x, size=[orig_h, orig_w]) + x = self.conv3(x) + x = paddle.concat((h, x), axis=1) + x = self.conv4(x) + return x + + +class MobileViT(nn.Layer): + """ MobileViT + A PaddlePaddle impl of : `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - + https://arxiv.org/abs/2110.02178 + """ + + def __init__(self, + in_channels=3, + dims=[16, 32, 48, 48, 48, 64, 80, 96, 384], + hidden_dims=[96, 120, 144], + mv2_expansion=4, + class_num=1000): + super().__init__() + self.conv3x3 = ConvBnAct( + in_channels, dims[0], kernel_size=3, stride=2, padding=1) + self.mv2_block_1 = MobileV2Block( + dims[0], dims[1], expansion=mv2_expansion) + self.mv2_block_2 = MobileV2Block( + dims[1], dims[2], stride=2, expansion=mv2_expansion) + self.mv2_block_3 = MobileV2Block( + dims[2], dims[3], expansion=mv2_expansion) + self.mv2_block_4 = MobileV2Block( + dims[3], dims[4], expansion=mv2_expansion) + + self.mv2_block_5 = MobileV2Block( + dims[4], dims[5], stride=2, expansion=mv2_expansion) + self.mvit_block_1 = MobileViTBlock(dims[5], hidden_dims[0], depth=2) + + self.mv2_block_6 = MobileV2Block( + dims[5], dims[6], stride=2, expansion=mv2_expansion) + self.mvit_block_2 = MobileViTBlock(dims[6], hidden_dims[1], depth=4) + + self.mv2_block_7 = MobileV2Block( + dims[6], dims[7], stride=2, expansion=mv2_expansion) + self.mvit_block_3 = MobileViTBlock(dims[7], hidden_dims[2], depth=3) + self.conv1x1 = ConvBnAct(dims[7], dims[8], kernel_size=1) + + self.pool = nn.AdaptiveAvgPool2D(1) + self.dropout = nn.Dropout(0.1) + self.linear = nn.Linear(dims[8], class_num) + + def forward(self, x): + x = self.conv3x3(x) + x = self.mv2_block_1(x) + x = self.mv2_block_2(x) + x = self.mv2_block_3(x) + x = self.mv2_block_4(x) + + x = self.mv2_block_5(x) + 
x = self.mvit_block_1(x) + + x = self.mv2_block_6(x) + x = self.mvit_block_2(x) + + x = self.mv2_block_7(x) + x = self.mvit_block_3(x) + x = self.conv1x1(x) + + x = self.pool(x) + x = x.reshape(x.shape[:2]) + + x = self.dropout(x) + x = self.linear(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileViT_XXS(pretrained=False, use_ssld=False, **kwargs): + model = MobileViT( + in_channels=3, + dims=[16, 16, 24, 24, 24, 48, 64, 80, 320], + hidden_dims=[64, 80, 96], + mv2_expansion=2, + **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViT_XXS"], use_ssld=use_ssld) + return model + + +def MobileViT_XS(pretrained=False, use_ssld=False, **kwargs): + model = MobileViT( + in_channels=3, + dims=[16, 32, 48, 48, 48, 64, 80, 96, 384], + hidden_dims=[96, 120, 144], + mv2_expansion=4, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViT_XS"], use_ssld=use_ssld) + return model + + +def MobileViT_S(pretrained=False, use_ssld=False, **kwargs): + model = MobileViT( + in_channels=3, + dims=[16, 32, 64, 64, 64, 96, 128, 160, 640], + hidden_dims=[144, 192, 240], + mv2_expansion=4, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViT_S"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v2.py new file mode 100644 index 000000000..9a57448e9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v2.py @@ -0,0 +1,593 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
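Editorial note (not part of the patch): mobilevit_v2.py below swaps the multi-head attention of v1 for a separable "linear" self-attention whose cost grows linearly with the number of patch positions N, because the query is collapsed to a single channel. The sketch mirrors the shape flow of the LinearSelfAttention.forward defined later in this file; the sizes are made-up examples and the random tensor stands in for the 1x1 qkv_proj output.

import paddle
import paddle.nn.functional as F

B, d, P, N = 2, 64, 4, 196                 # batch, embed dim, pixels per patch, patch count
qkv = paddle.randn([B, 1 + 2 * d, P, N])   # stands in for the 1x1 qkv_proj output
query, key, value = paddle.split(qkv, [1, d, d], axis=1)
scores = F.softmax(query, axis=-1)                    # [B, 1, P, N], one score per position
context = (key * scores).sum(axis=-1, keepdim=True)   # [B, d, P, 1], linear in N
out = F.relu(value) * context                         # [B, d, P, N]
print(out.shape)                                      # [2, 64, 4, 196]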
+ +# Code was based on https://github.com/apple/ml-cvnets/blob/7be93d3debd45c240a058e3f34a9e88d33c07a7d/cvnets/models/classification/mobilevit_v2.py +# reference: https://arxiv.org/abs/2206.02680 + +from functools import partial +from typing import Dict, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileViTV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x0_5_pretrained.pdparams", + "MobileViTV2_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x1_0_pretrained.pdparams", + "MobileViTV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x1_5_pretrained.pdparams", + "MobileViTV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV2_x2_0_pretrained.pdparams", +} + +layer_norm_2d = partial(nn.GroupNorm, num_groups=1) + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class InvertedResidual(nn.Layer): + """ + Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + dilation=1, + skip_connection=True): + super().__init__() + assert stride in [1, 2] + self.stride = stride + + hidden_dim = make_divisible(int(round(in_channels * expand_ratio)), 8) + self.use_res_connect = self.stride == 1 and in_channels == out_channels and skip_connection + + block = nn.Sequential() + if expand_ratio != 1: + block.add_sublayer( + name="exp_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + in_channels, hidden_dim, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(hidden_dim)), ('act', nn.Silu()))) + + block.add_sublayer( + name="conv_3x3", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, + hidden_dim, + 3, + bias_attr=False, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim)), ('norm', nn.BatchNorm2D(hidden_dim)), + ('act', nn.Silu()))) + + block.add_sublayer( + name="red_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, out_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(out_channels)))) + + self.block = block + self.in_channels = in_channels + self.out_channels = out_channels + self.exp = expand_ratio + self.dilation = dilation + + def forward(self, x): + if self.use_res_connect: + return x + self.block(x) + else: + return self.block(x) + + +class LinearSelfAttention(nn.Layer): + def __init__(self, embed_dim, attn_dropout=0.0, bias=True): + super().__init__() + self.embed_dim = embed_dim + self.qkv_proj = nn.Conv2D( + embed_dim, 1 + (2 * embed_dim), 1, bias_attr=bias) + self.attn_dropout = nn.Dropout(p=attn_dropout) + self.out_proj = nn.Conv2D(embed_dim, embed_dim, 1, bias_attr=bias) + + def forward(self, x): + # [B, C, P, N] --> [B, h + 2d, P, N] + qkv = self.qkv_proj(x) + + # Project x into query, key and value + # Query --> [B, 1, P, N] + # value, key --> [B, d, P, N] + query, key, value = paddle.split( + qkv, [1, self.embed_dim, self.embed_dim], axis=1) + + # apply softmax along N dimension + context_scores = F.softmax(query, axis=-1) + # Uncomment below line to visualize context scores + # self.visualize_context_scores(context_scores=context_scores) + context_scores = self.attn_dropout(context_scores) + + # 
Compute context vector + # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] + context_vector = key * context_scores + # [B, d, P, N] --> [B, d, P, 1] + context_vector = paddle.sum(context_vector, axis=-1, keepdim=True) + + # combine context vector with values + # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N] + out = F.relu(value) * context_vector + out = self.out_proj(out) + return out + + +class LinearAttnFFN(nn.Layer): + def __init__(self, + embed_dim, + ffn_latent_dim, + attn_dropout=0.0, + dropout=0.1, + ffn_dropout=0.0, + norm_layer=layer_norm_2d) -> None: + super().__init__() + attn_unit = LinearSelfAttention( + embed_dim=embed_dim, attn_dropout=attn_dropout, bias=True) + + self.pre_norm_attn = nn.Sequential( + norm_layer(num_channels=embed_dim), + attn_unit, + nn.Dropout(p=dropout)) + + self.pre_norm_ffn = nn.Sequential( + norm_layer(num_channels=embed_dim), + nn.Conv2D(embed_dim, ffn_latent_dim, 1), + nn.Silu(), + nn.Dropout(p=ffn_dropout), + nn.Conv2D(ffn_latent_dim, embed_dim, 1), + nn.Dropout(p=dropout)) + + def forward(self, x): + # self-attention + x = x + self.pre_norm_attn(x) + # Feed forward network + x = x + self.pre_norm_ffn(x) + return x + + +class MobileViTV2Block(nn.Layer): + """ + This class defines the `MobileViTV2 block` + """ + + def __init__(self, + in_channels, + attn_unit_dim, + ffn_multiplier=2.0, + n_attn_blocks=2, + attn_dropout=0.0, + dropout=0.0, + ffn_dropout=0.0, + patch_h=8, + patch_w=8, + conv_ksize=3, + dilation=1, + attn_norm_layer=layer_norm_2d): + super().__init__() + cnn_out_dim = attn_unit_dim + padding = (conv_ksize - 1) // 2 * dilation + conv_3x3_in = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + in_channels, + conv_ksize, + bias_attr=False, + padding=padding, + dilation=dilation, + groups=in_channels)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + conv_1x1_in = nn.Sequential(('conv', nn.Conv2D( + in_channels, cnn_out_dim, 1, bias_attr=False))) + + self.local_rep = nn.Sequential(conv_3x3_in, conv_1x1_in) + + self.global_rep, attn_unit_dim = self._build_attn_layer( + d_model=attn_unit_dim, + ffn_mult=ffn_multiplier, + n_layers=n_attn_blocks, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + attn_norm_layer=attn_norm_layer) + + self.conv_proj = nn.Sequential( + ('conv', nn.Conv2D( + cnn_out_dim, in_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(in_channels))) + + self.patch_h = patch_h + self.patch_w = patch_w + + def _build_attn_layer(self, d_model, ffn_mult, n_layers, attn_dropout, + dropout, ffn_dropout, attn_norm_layer): + # ensure that dims are multiple of 16 + ffn_dims = [ffn_mult * d_model // 16 * 16] * n_layers + + global_rep = [ + LinearAttnFFN( + embed_dim=d_model, + ffn_latent_dim=ffn_dims[block_idx], + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + norm_layer=attn_norm_layer) for block_idx in range(n_layers) + ] + global_rep.append(attn_norm_layer(num_channels=d_model)) + + return nn.Sequential(*global_rep), d_model + + def unfolding(self, feature_map): + batch_size, in_channels, img_h, img_w = feature_map.shape + + # [B, C, H, W] --> [B, C, P, N] + patches = F.unfold( + feature_map, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + n_patches = img_h * img_w // (self.patch_h * self.patch_w) + patches = patches.reshape( + [batch_size, in_channels, self.patch_h * self.patch_w, n_patches]) + + return patches, (img_h, img_w) + + def folding(self, patches, output_size): + batch_size, in_dim, patch_size, n_patches = 
patches.shape + + # [B, C, P, N] + patches = patches.reshape([batch_size, in_dim * patch_size, n_patches]) + + feature_map = F.fold( + patches, + output_size, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + + return feature_map + + def forward(self, x): + fm = self.local_rep(x) + + # convert feature map to patches + patches, output_size = self.unfolding(fm) + + # learn global representations on all patches + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, output_size=output_size) + fm = self.conv_proj(fm) + + return fm + + +class MobileViTV2(nn.Layer): + """ + MobileViTV2 + """ + + def __init__(self, mobilevit_config, class_num=1000, output_stride=None): + super().__init__() + self.round_nearest = 8 + self.dilation = 1 + + dilate_l4 = dilate_l5 = False + if output_stride == 8: + dilate_l4 = True + dilate_l5 = True + elif output_stride == 16: + dilate_l5 = True + + # store model configuration in a dictionary + in_channels = mobilevit_config["layer0"]["img_channels"] + out_channels = mobilevit_config["layer0"]["out_channels"] + self.conv_1 = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + 3, + bias_attr=False, + stride=2, + padding=1)), ('norm', nn.BatchNorm2D(out_channels)), + ('act', nn.Silu())) + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer1"]) + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer2"]) + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer3"]) + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer4"], + dilate=dilate_l4) + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer5"], + dilate=dilate_l5) + + self.conv_1x1_exp = nn.Identity() + self.classifier = nn.Sequential() + self.classifier.add_sublayer( + name="global_pool", + sublayer=nn.Sequential(nn.AdaptiveAvgPool2D(1), nn.Flatten())) + self.classifier.add_sublayer( + name="fc", sublayer=nn.Linear(out_channels, class_num)) + + # weight initialization + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + fan_in = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + bound = 1.0 / fan_in**0.5 + nn.initializer.Uniform(-bound, bound)(m.weight) + if m.bias is not None: + nn.initializer.Uniform(-bound, bound)(m.bias) + elif isinstance(m, (nn.BatchNorm2D, nn.GroupNorm)): + nn.initializer.Constant(1)(m.weight) + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + nn.initializer.XavierUniform()(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + + def _make_layer(self, input_channel, cfg, dilate=False): + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + input_channel=input_channel, cfg=cfg, dilate=dilate) + else: + return self._make_mobilenet_layer( + input_channel=input_channel, cfg=cfg) + + def _make_mit_layer(self, input_channel, cfg, dilate=False): + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = 
InvertedResidual( + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation) + + block.append(layer) + input_channel = cfg.get("out_channels") + + block.append( + MobileViTV2Block( + in_channels=input_channel, + attn_unit_dim=cfg["attn_unit_dim"], + ffn_multiplier=cfg.get("ffn_multiplier"), + n_attn_blocks=cfg.get("attn_blocks", 1), + ffn_dropout=0., + attn_dropout=0., + dilation=self.dilation, + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2))) + + return nn.Sequential(*block), input_channel + + def _make_mobilenet_layer(self, input_channel, cfg): + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def extract_features(self, x): + x = self.conv_1(x) + x = self.layer_1(x) + x = self.layer_2(x) + x = self.layer_3(x) + + x = self.layer_4(x) + x = self.layer_5(x) + x = self.conv_1x1_exp(x) + return x + + def forward(self, x): + x = self.extract_features(x) + x = self.classifier(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def get_configuration(width_multiplier): + ffn_multiplier = 2 + mv2_exp_mult = 2 # max(1.0, min(2.0, 2.0 * width_multiplier)) + + layer_0_dim = max(16, min(64, 32 * width_multiplier)) + layer_0_dim = int(make_divisible(layer_0_dim, divisor=8, min_value=16)) + config = { + "layer0": { + "img_channels": 3, + "out_channels": layer_0_dim, + }, + "layer1": { + "out_channels": int(make_divisible(64 * width_multiplier, divisor=16)), + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": int(make_divisible(128 * width_multiplier, divisor=8)), + "expand_ratio": mv2_exp_mult, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": int(make_divisible(256 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(128 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": int(make_divisible(384 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(192 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": int(make_divisible(512 * width_multiplier, divisor=8)), + "attn_unit_dim": int(make_divisible(256 * width_multiplier, divisor=8)), + "ffn_multiplier": ffn_multiplier, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + return config + + +def MobileViTV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 2.0 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x2_0"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_75(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.75 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_75"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.5 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_5"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_25(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.25 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_25"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x1_0(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 1.0 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x1_0"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x0_75(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 0.75 + model = MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x0_75"], use_ssld=use_ssld) + return model + + +def MobileViTV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + width_multiplier = 0.5 + model = 
MobileViTV2(get_configuration(width_multiplier), **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV2_x0_5"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v3.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v3.py new file mode 100644 index 000000000..652ac5626 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/mobilevit_v3.py @@ -0,0 +1,1445 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/micronDLA/MobileViTv3/blob/main/MobileViTv3-v1/cvnets/models/classification/mobilevit.py +# reference: https://arxiv.org/abs/2209.15159 + +import math +from functools import partial +from typing import Dict, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "MobileViTV3_XXS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XXS_pretrained.pdparams", + "MobileViTV3_XS": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XS_pretrained.pdparams", + "MobileViTV3_S": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_S_pretrained.pdparams", + "MobileViTV3_XXS_L2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XXS_L2_pretrained.pdparams", + "MobileViTV3_XS_L2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_XS_L2_pretrained.pdparams", + "MobileViTV3_S_L2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_S_L2_pretrained.pdparams", + "MobileViTV3_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_x0_5_pretrained.pdparams", + "MobileViTV3_x0_75": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_x0_75_pretrained.pdparams", + "MobileViTV3_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/MobileViTV3_x1_0_pretrained.pdparams", +} + +layer_norm_2d = partial(nn.GroupNorm, num_groups=1) + + +def make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class InvertedResidual(nn.Layer): + """ + Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381 + """ + + def __init__(self, + in_channels: int, + out_channels: int, + stride: int, + expand_ratio: Union[int, float], + dilation: int=1) -> None: + assert stride in [1, 2] + super(InvertedResidual, self).__init__() + self.stride = stride + + hidden_dim = make_divisible(int(round(in_channels * expand_ratio)), 8) + self.use_res_connect = self.stride == 1 and in_channels == out_channels + + block = nn.Sequential() + if expand_ratio != 1: + block.add_sublayer( + 
name="exp_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + in_channels, hidden_dim, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(hidden_dim)), ('act', nn.Silu()))) + + block.add_sublayer( + name="conv_3x3", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, + hidden_dim, + 3, + bias_attr=False, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim)), ('norm', nn.BatchNorm2D(hidden_dim)), + ('act', nn.Silu()))) + + block.add_sublayer( + name="red_1x1", + sublayer=nn.Sequential( + ('conv', nn.Conv2D( + hidden_dim, out_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(out_channels)))) + + self.block = block + self.in_channels = in_channels + self.out_channels = out_channels + self.exp = expand_ratio + self.dilation = dilation + + def forward(self, x, *args, **kwargs): + if self.use_res_connect: + return x + self.block(x) + else: + return self.block(x) + + +class MultiHeadAttention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv_proj = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.out_proj = nn.Linear(dim, dim, bias_attr=qkv_bias) + + def forward(self, x): + # B = x.shape[0] + N, C = x.shape[1:] + qkv = self.qkv_proj(x).reshape((-1, N, 3, self.num_heads, + C // self.num_heads)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.out_proj(x) + return x + + +class TransformerEncoder(nn.Layer): + """ + This class defines the Transformer encoder (pre-norm) as described in "Attention is all you need" paper + https://arxiv.org/abs/1706.03762 + """ + + def __init__(self, + embed_dim: int, + ffn_latent_dim: int, + num_heads: Optional[int]=8, + attn_dropout: Optional[float]=0.0, + dropout: Optional[float]=0.1, + ffn_dropout: Optional[float]=0.0, + transformer_norm_layer: nn.Layer=nn.LayerNorm): + super(TransformerEncoder, self).__init__() + + self.pre_norm_mha = nn.Sequential( + transformer_norm_layer(embed_dim), + MultiHeadAttention( + embed_dim, num_heads, attn_drop=attn_dropout, qkv_bias=True), + nn.Dropout(p=dropout)) + + self.pre_norm_ffn = nn.Sequential( + transformer_norm_layer(embed_dim), + nn.Linear(embed_dim, ffn_latent_dim), + nn.Silu(), + nn.Dropout(p=ffn_dropout), + nn.Linear(ffn_latent_dim, embed_dim), + nn.Dropout(p=dropout)) + self.embed_dim = embed_dim + self.ffn_dim = ffn_latent_dim + self.ffn_dropout = ffn_dropout + + def forward(self, x): + # Multi-head attention + x = x + self.pre_norm_mha(x) + + # Feed forward network + x = x + self.pre_norm_ffn(x) + return x + + +class MobileViTV3Block(nn.Layer): + """ + MobileViTV3 block + """ + + def __init__(self, + in_channels: int, + transformer_dim: int, + ffn_dim: int, + n_transformer_blocks: Optional[int]=2, + head_dim: Optional[int]=32, + attn_dropout: Optional[float]=0.1, + dropout: Optional[int]=0.1, + ffn_dropout: Optional[int]=0.1, + patch_h: Optional[int]=8, + patch_w: Optional[int]=8, + transformer_norm_layer: nn.Layer=nn.LayerNorm, + conv_ksize: Optional[int]=3, + dilation: Optional[int]=1, + var_ffn: Optional[bool]=False, + no_fusion: Optional[bool]=False): + + # For MobileViTV3: Normal 3x3 convolution --> Depthwise 
3x3 convolution + padding = (conv_ksize - 1) // 2 * dilation + conv_3x3_in = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + in_channels, + conv_ksize, + bias_attr=False, + padding=padding, + dilation=dilation, + groups=in_channels)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + conv_1x1_in = nn.Sequential(('conv', nn.Conv2D( + in_channels, transformer_dim, 1, bias_attr=False))) + + conv_1x1_out = nn.Sequential( + ('conv', nn.Conv2D( + transformer_dim, in_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(in_channels)), ('act', nn.Silu())) + conv_3x3_out = None + + # For MobileViTV3: input+global --> local+global + if not no_fusion: + #input_ch = tr_dim + in_ch + conv_3x3_out = nn.Sequential( + ('conv', nn.Conv2D( + transformer_dim + in_channels, + in_channels, + 1, + bias_attr=False)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + + super().__init__() + self.local_rep = nn.Sequential() + self.local_rep.add_sublayer(name="conv_3x3", sublayer=conv_3x3_in) + self.local_rep.add_sublayer(name="conv_1x1", sublayer=conv_1x1_in) + + assert transformer_dim % head_dim == 0 + num_heads = transformer_dim // head_dim + + ffn_dims = [ffn_dim] * n_transformer_blocks + + global_rep = [ + TransformerEncoder( + embed_dim=transformer_dim, + ffn_latent_dim=ffn_dims[block_idx], + num_heads=num_heads, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + transformer_norm_layer=transformer_norm_layer) + for block_idx in range(n_transformer_blocks) + ] + global_rep.append(transformer_norm_layer(transformer_dim)) + self.global_rep = nn.Sequential(*global_rep) + + self.conv_proj = conv_1x1_out + + self.fusion = conv_3x3_out + + self.patch_h = patch_h + self.patch_w = patch_w + self.patch_area = self.patch_w * self.patch_h + + self.cnn_in_dim = in_channels + self.cnn_out_dim = transformer_dim + self.n_heads = num_heads + self.ffn_dim = ffn_dim + self.dropout = dropout + self.attn_dropout = attn_dropout + self.ffn_dropout = ffn_dropout + self.dilation = dilation + self.ffn_max_dim = ffn_dims[0] + self.ffn_min_dim = ffn_dims[-1] + self.var_ffn = var_ffn + self.n_blocks = n_transformer_blocks + self.conv_ksize = conv_ksize + + def unfolding(self, feature_map): + patch_w, patch_h = self.patch_w, self.patch_h + patch_area = int(patch_w * patch_h) + batch_size, in_channels, orig_h, orig_w = feature_map.shape + + new_h = int(math.ceil(orig_h / self.patch_h) * self.patch_h) + new_w = int(math.ceil(orig_w / self.patch_w) * self.patch_w) + + interpolate = False + if new_w != orig_w or new_h != orig_h: + # Note: Padding can be done, but then it needs to be handled in attention function. 
+ feature_map = F.interpolate( + feature_map, + size=(new_h, new_w), + mode="bilinear", + align_corners=False) + interpolate = True + + # number of patches along width and height + num_patch_w = new_w // patch_w # n_w + num_patch_h = new_h // patch_h # n_h + num_patches = num_patch_h * num_patch_w # N + + # [B, C, H, W] --> [B * C * n_h, p_h, n_w, p_w] + reshaped_fm = feature_map.reshape([ + batch_size * in_channels * num_patch_h, patch_h, num_patch_w, + patch_w + ]) + # [B * C * n_h, p_h, n_w, p_w] --> [B * C * n_h, n_w, p_h, p_w] + transposed_fm = reshaped_fm.transpose([0, 2, 1, 3]) + # [B * C * n_h, n_w, p_h, p_w] --> [B, C, N, P] where P = p_h * p_w and N = n_h * n_w + reshaped_fm = transposed_fm.reshape( + [batch_size, in_channels, num_patches, patch_area]) + # [B, C, N, P] --> [B, P, N, C] + transposed_fm = reshaped_fm.transpose([0, 3, 2, 1]) + # [B, P, N, C] --> [BP, N, C] + patches = transposed_fm.reshape( + [batch_size * patch_area, num_patches, in_channels]) + + info_dict = { + "orig_size": (orig_h, orig_w), + "batch_size": batch_size, + "interpolate": interpolate, + "total_patches": num_patches, + "num_patches_w": num_patch_w, + "num_patches_h": num_patch_h + } + + return patches, info_dict + + def folding(self, patches, info_dict): + n_dim = patches.dim() + assert n_dim == 3, "Tensor should be of shape BPxNxC. Got: {}".format( + patches.shape) + # [BP, N, C] --> [B, P, N, C] + patches = patches.reshape([ + info_dict["batch_size"], self.patch_area, + info_dict["total_patches"], patches.shape[2] + ]) + + batch_size, pixels, num_patches, channels = patches.shape + num_patch_h = info_dict["num_patches_h"] + num_patch_w = info_dict["num_patches_w"] + + # [B, P, N, C] --> [B, C, N, P] + patches = patches.transpose([0, 3, 2, 1]) + + # [B, C, N, P] --> [B*C*n_h, n_w, p_h, p_w] + feature_map = patches.reshape([ + batch_size * channels * num_patch_h, num_patch_w, self.patch_h, + self.patch_w + ]) + # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] + feature_map = feature_map.transpose([0, 2, 1, 3]) + # [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W] + feature_map = feature_map.reshape([ + batch_size, channels, num_patch_h * self.patch_h, + num_patch_w * self.patch_w + ]) + if info_dict["interpolate"]: + feature_map = F.interpolate( + feature_map, + size=info_dict["orig_size"], + mode="bilinear", + align_corners=False) + return feature_map + + def forward(self, x): + res = x + + # For MobileViTV3: Normal 3x3 convolution --> Depthwise 3x3 convolution + fm_conv = self.local_rep(x) + + # convert feature map to patches + patches, info_dict = self.unfolding(fm_conv) + + # learn global representations + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, info_dict=info_dict) + + fm = self.conv_proj(fm) + + if self.fusion is not None: + # For MobileViTV3: input+global --> local+global + fm = self.fusion(paddle.concat((fm_conv, fm), axis=1)) + + # For MobileViTV3: Skip connection + fm = fm + res + + return fm + + +class LinearSelfAttention(nn.Layer): + def __init__(self, embed_dim, attn_dropout=0.0, bias=True): + super().__init__() + self.embed_dim = embed_dim + self.qkv_proj = nn.Conv2D( + embed_dim, 1 + (2 * embed_dim), 1, bias_attr=bias) + self.attn_dropout = nn.Dropout(p=attn_dropout) + self.out_proj = nn.Conv2D(embed_dim, embed_dim, 1, bias_attr=bias) + + def forward(self, x): + # [B, C, P, N] --> [B, h + 2d, P, N] + qkv = self.qkv_proj(x) + + # Project x into query, key and value + # Query --> [B, 1, P, N] + # 
value, key --> [B, d, P, N] + query, key, value = paddle.split( + qkv, [1, self.embed_dim, self.embed_dim], axis=1) + + # apply softmax along N dimension + context_scores = F.softmax(query, axis=-1) + # Uncomment below line to visualize context scores + # self.visualize_context_scores(context_scores=context_scores) + context_scores = self.attn_dropout(context_scores) + + # Compute context vector + # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] + context_vector = key * context_scores + # [B, d, P, N] --> [B, d, P, 1] + context_vector = paddle.sum(context_vector, axis=-1, keepdim=True) + + # combine context vector with values + # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N] + out = F.relu(value) * context_vector + out = self.out_proj(out) + return out + + +class LinearAttnFFN(nn.Layer): + def __init__(self, + embed_dim: int, + ffn_latent_dim: int, + attn_dropout: Optional[float]=0.0, + dropout: Optional[float]=0.1, + ffn_dropout: Optional[float]=0.0, + norm_layer: Optional[str]=layer_norm_2d) -> None: + super().__init__() + attn_unit = LinearSelfAttention( + embed_dim=embed_dim, attn_dropout=attn_dropout, bias=True) + + self.pre_norm_attn = nn.Sequential( + norm_layer(num_channels=embed_dim), + attn_unit, + nn.Dropout(p=dropout)) + + self.pre_norm_ffn = nn.Sequential( + norm_layer(num_channels=embed_dim), + nn.Conv2D(embed_dim, ffn_latent_dim, 1), + nn.Silu(), + nn.Dropout(p=ffn_dropout), + nn.Conv2D(ffn_latent_dim, embed_dim, 1), + nn.Dropout(p=dropout)) + + def forward(self, x): + # self-attention + x = x + self.pre_norm_attn(x) + # Feed forward network + x = x + self.pre_norm_ffn(x) + return x + + +class MobileViTV3BlockV2(nn.Layer): + """ + This class defines the `MobileViTV3 block` + """ + + def __init__(self, + in_channels: int, + attn_unit_dim: int, + ffn_multiplier: float=2.0, + n_attn_blocks: Optional[int]=2, + attn_dropout: Optional[float]=0.0, + dropout: Optional[float]=0.0, + ffn_dropout: Optional[float]=0.0, + patch_h: Optional[int]=8, + patch_w: Optional[int]=8, + conv_ksize: Optional[int]=3, + dilation: Optional[int]=1, + attn_norm_layer: Optional[str]=layer_norm_2d): + cnn_out_dim = attn_unit_dim + + padding = (conv_ksize - 1) // 2 * dilation + conv_3x3_in = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + in_channels, + conv_ksize, + bias_attr=False, + padding=padding, + dilation=dilation, + groups=in_channels)), ('norm', nn.BatchNorm2D(in_channels)), + ('act', nn.Silu())) + conv_1x1_in = nn.Sequential(('conv', nn.Conv2D( + in_channels, cnn_out_dim, 1, bias_attr=False))) + + super().__init__() + self.local_rep = nn.Sequential(conv_3x3_in, conv_1x1_in) + + self.global_rep, attn_unit_dim = self._build_attn_layer( + d_model=attn_unit_dim, + ffn_mult=ffn_multiplier, + n_layers=n_attn_blocks, + attn_dropout=attn_dropout, + dropout=dropout, + ffn_dropout=ffn_dropout, + attn_norm_layer=attn_norm_layer) + + # MobileViTV3: input changed from just global to local+global + self.conv_proj = nn.Sequential( + ('conv', nn.Conv2D( + 2 * cnn_out_dim, in_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(in_channels))) + + self.patch_h = patch_h + self.patch_w = patch_w + + def _build_attn_layer(self, + d_model: int, + ffn_mult: float, + n_layers: int, + attn_dropout: float, + dropout: float, + ffn_dropout: float, + attn_norm_layer: nn.Layer): + + # ensure that dims are multiple of 16 + ffn_dims = [ffn_mult * d_model // 16 * 16] * n_layers + + global_rep = [ + LinearAttnFFN( + embed_dim=d_model, + ffn_latent_dim=ffn_dims[block_idx], + attn_dropout=attn_dropout, + dropout=dropout, + 
ffn_dropout=ffn_dropout, + norm_layer=attn_norm_layer) for block_idx in range(n_layers) + ] + global_rep.append(attn_norm_layer(num_channels=d_model)) + + return nn.Sequential(*global_rep), d_model + + def unfolding(self, feature_map): + batch_size, in_channels, img_h, img_w = feature_map.shape + + # [B, C, H, W] --> [B, C, P, N] + patches = F.unfold( + feature_map, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + n_patches = img_h * img_w // (self.patch_h * self.patch_w) + patches = patches.reshape( + [batch_size, in_channels, self.patch_h * self.patch_w, n_patches]) + + return patches, (img_h, img_w) + + def folding(self, patches, output_size: Tuple[int, int]): + batch_size, in_dim, patch_size, n_patches = patches.shape + + # [B, C, P, N] + patches = patches.reshape([batch_size, in_dim * patch_size, n_patches]) + + feature_map = F.fold( + patches, + output_size, + kernel_sizes=[self.patch_h, self.patch_w], + strides=[self.patch_h, self.patch_w]) + + return feature_map + + def forward(self, x): + fm_conv = self.local_rep(x) + + # convert feature map to patches + patches, output_size = self.unfolding(fm_conv) + + # learn global representations on all patches + patches = self.global_rep(patches) + + # [B x Patch x Patches x C] --> [B x C x Patches x Patch] + fm = self.folding(patches=patches, output_size=output_size) + + # MobileViTV3: local+global instead of only global + fm = self.conv_proj(paddle.concat((fm, fm_conv), axis=1)) + + # MobileViTV3: skip connection + fm = fm + x + + return fm + + +class MobileViTV3(nn.Layer): + """ + MobileViTV3: + """ + + def __init__(self, + mobilevit_config: Dict, + dropout=0.1, + class_num=1000, + classifier_dropout=0.1, + output_stride=None, + mobilevit_v2_based=False): + super().__init__() + self.round_nearest = 8 + self.dilation = 1 + self.dropout = dropout + self.mobilevit_v2_based = mobilevit_v2_based + + dilate_l4 = dilate_l5 = False + if output_stride == 8: + dilate_l4 = True + dilate_l5 = True + elif output_stride == 16: + dilate_l5 = True + + # store model configuration in a dictionary + in_channels = mobilevit_config["layer0"]["img_channels"] + out_channels = mobilevit_config["layer0"]["out_channels"] + self.conv_1 = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, + out_channels, + 3, + bias_attr=False, + stride=2, + padding=1)), ('norm', nn.BatchNorm2D(out_channels)), + ('act', nn.Silu())) + + in_channels = out_channels + self.layer_1, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer1"]) + + in_channels = out_channels + self.layer_2, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer2"]) + + in_channels = out_channels + self.layer_3, out_channels = self._make_layer( + input_channel=in_channels, cfg=mobilevit_config["layer3"]) + + in_channels = out_channels + self.layer_4, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer4"], + dilate=dilate_l4) + + in_channels = out_channels + self.layer_5, out_channels = self._make_layer( + input_channel=in_channels, + cfg=mobilevit_config["layer5"], + dilate=dilate_l5) + + if self.mobilevit_v2_based: + self.conv_1x1_exp = nn.Identity() + else: + in_channels = out_channels + out_channels = min(mobilevit_config["last_layer_exp_factor"] * + in_channels, 960) + self.conv_1x1_exp = nn.Sequential( + ('conv', nn.Conv2D( + in_channels, out_channels, 1, bias_attr=False)), + ('norm', nn.BatchNorm2D(out_channels)), ('act', nn.Silu())) + + self.classifier = 
nn.Sequential() + self.classifier.add_sublayer( + name="global_pool", + sublayer=nn.Sequential(nn.AdaptiveAvgPool2D(1), nn.Flatten())) + if 0.0 < classifier_dropout < 1.0: + self.classifier.add_sublayer( + name="dropout", sublayer=nn.Dropout(p=classifier_dropout)) + self.classifier.add_sublayer( + name="fc", sublayer=nn.Linear(out_channels, class_num)) + + # weight initialization + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + fan_in = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + fan_out = m.weight.shape[0] * m.weight.shape[2] * m.weight.shape[3] + if self.mobilevit_v2_based: + bound = 1.0 / fan_in**0.5 + nn.initializer.Uniform(-bound, bound)(m.weight) + if m.bias is not None: + nn.initializer.Uniform(-bound, bound)(m.bias) + else: + nn.initializer.KaimingNormal(fan_in=fan_out)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.BatchNorm2D): + nn.initializer.Constant(1)(m.weight) + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + if self.mobilevit_v2_based: + nn.initializer.XavierUniform()(m.weight) + else: + nn.initializer.TruncatedNormal(std=.02)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + + def _make_layer(self, input_channel, cfg, dilate=False): + block_type = cfg.get("block_type", "mobilevit") + if block_type.lower() == "mobilevit": + return self._make_mit_layer( + input_channel=input_channel, cfg=cfg, dilate=dilate) + else: + return self._make_mobilenet_layer( + input_channel=input_channel, cfg=cfg) + + def _make_mit_layer(self, input_channel, cfg, dilate=False): + prev_dilation = self.dilation + block = [] + stride = cfg.get("stride", 1) + + if stride == 2: + if dilate: + self.dilation *= 2 + stride = 1 + + layer = InvertedResidual( + in_channels=input_channel, + out_channels=cfg.get("out_channels"), + stride=stride, + expand_ratio=cfg.get("mv_expand_ratio", 4), + dilation=prev_dilation) + + block.append(layer) + input_channel = cfg.get("out_channels") + + if self.mobilevit_v2_based: + block.append( + MobileViTV3BlockV2( + in_channels=input_channel, + attn_unit_dim=cfg["attn_unit_dim"], + ffn_multiplier=cfg.get("ffn_multiplier"), + n_attn_blocks=cfg.get("attn_blocks", 1), + ffn_dropout=0., + attn_dropout=0., + dilation=self.dilation, + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2))) + else: + head_dim = cfg.get("head_dim", 32) + transformer_dim = cfg["transformer_channels"] + ffn_dim = cfg.get("ffn_dim") + if head_dim is None: + num_heads = cfg.get("num_heads", 4) + if num_heads is None: + num_heads = 4 + head_dim = transformer_dim // num_heads + + assert transformer_dim % head_dim == 0, ( + "Transformer input dimension should be divisible by head dimension. 
" + "Got {} and {}.".format(transformer_dim, head_dim)) + + block.append( + MobileViTV3Block( + in_channels=input_channel, + transformer_dim=transformer_dim, + ffn_dim=ffn_dim, + n_transformer_blocks=cfg.get("transformer_blocks", 1), + patch_h=cfg.get("patch_h", 2), + patch_w=cfg.get("patch_w", 2), + dropout=self.dropout, + ffn_dropout=0., + attn_dropout=0., + head_dim=head_dim)) + + return nn.Sequential(*block), input_channel + + def _make_mobilenet_layer(self, input_channel, cfg): + output_channels = cfg.get("out_channels") + num_blocks = cfg.get("num_blocks", 2) + expand_ratio = cfg.get("expand_ratio", 4) + block = [] + + for i in range(num_blocks): + stride = cfg.get("stride", 1) if i == 0 else 1 + + layer = InvertedResidual( + in_channels=input_channel, + out_channels=output_channels, + stride=stride, + expand_ratio=expand_ratio) + block.append(layer) + input_channel = output_channels + return nn.Sequential(*block), input_channel + + def extract_features(self, x): + x = self.conv_1(x) + x = self.layer_1(x) + x = self.layer_2(x) + x = self.layer_3(x) + + x = self.layer_4(x) + x = self.layer_5(x) + x = self.conv_1x1_exp(x) + return x + + def forward(self, x): + x = self.extract_features(x) + x = self.classifier(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def MobileViTV3_S(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 128, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 256, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 320, + "transformer_channels": 240, + "ffn_dim": 480, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_S"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XS(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + 
"transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 160, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XS"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XXS(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 2 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 128, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XXS"], use_ssld=use_ssld) + return model + + +def MobileViTV3_S_L2(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 64, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 128, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 256, + "transformer_channels": 192, + "ffn_dim": 384, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 320, + "transformer_channels": 240, + "ffn_dim": 480, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + 
"mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_S_L2"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XS_L2(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 4 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 48, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 96, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 160, + "transformer_channels": 120, + "ffn_dim": 240, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 160, + "transformer_channels": 144, + "ffn_dim": 288, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XS_L2"], use_ssld=use_ssld) + return model + + +def MobileViTV3_XXS_L2(pretrained=False, use_ssld=False, **kwargs): + mv2_exp_mult = 2 + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 16, + "expand_ratio": mv2_exp_mult, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2" + }, + "layer2": { + "out_channels": 24, + "expand_ratio": mv2_exp_mult, + "num_blocks": 3, + "stride": 2, + "block_type": "mv2" + }, + "layer3": { # 28x28 + "out_channels": 64, + "transformer_channels": 64, + "ffn_dim": 128, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer4": { # 14x14 + "out_channels": 80, + "transformer_channels": 80, + "ffn_dim": 160, + "transformer_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "layer5": { # 7x7 + "out_channels": 128, + "transformer_channels": 96, + "ffn_dim": 192, + "transformer_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": mv2_exp_mult, + "head_dim": None, + "num_heads": 4, + "block_type": "mobilevit" + }, + "last_layer_exp_factor": 4 + } + + model = MobileViTV3(mobilevit_config, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_XXS_L2"], use_ssld=use_ssld) + return model + + +def MobileViTV3_x1_0(pretrained=False, use_ssld=False, **kwargs): + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 32, + }, + "layer1": { + "out_channels": 64, + "expand_ratio": 2, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": 128, + "expand_ratio": 2, + "num_blocks": 2, + "stride": 
2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": 256, + "attn_unit_dim": 128, + "ffn_multiplier": 2, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": 384, + "attn_unit_dim": 192, + "ffn_multiplier": 2, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": 512, + "attn_unit_dim": 256, + "ffn_multiplier": 2, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + model = MobileViTV3(mobilevit_config, mobilevit_v2_based=True, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_x1_0"], use_ssld=use_ssld) + return model + + +def MobileViTV3_x0_75(pretrained=False, use_ssld=False, **kwargs): + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 24, + }, + "layer1": { + "out_channels": 48, + "expand_ratio": 2, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": 96, + "expand_ratio": 2, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": 192, + "attn_unit_dim": 96, + "ffn_multiplier": 2, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": 288, + "attn_unit_dim": 144, + "ffn_multiplier": 2, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": 384, + "attn_unit_dim": 192, + "ffn_multiplier": 2, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + model = MobileViTV3(mobilevit_config, mobilevit_v2_based=True, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_x0_75"], use_ssld=use_ssld) + return model + + +def MobileViTV3_x0_5(pretrained=False, use_ssld=False, **kwargs): + mobilevit_config = { + "layer0": { + "img_channels": 3, + "out_channels": 16, + }, + "layer1": { + "out_channels": 32, + "expand_ratio": 2, + "num_blocks": 1, + "stride": 1, + "block_type": "mv2", + }, + "layer2": { + "out_channels": 64, + "expand_ratio": 2, + "num_blocks": 2, + "stride": 2, + "block_type": "mv2", + }, + "layer3": { # 28x28 + "out_channels": 128, + "attn_unit_dim": 64, + "ffn_multiplier": 2, + "attn_blocks": 2, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer4": { # 14x14 + "out_channels": 192, + "attn_unit_dim": 96, + "ffn_multiplier": 2, + "attn_blocks": 4, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "layer5": { # 7x7 + "out_channels": 256, + "attn_unit_dim": 128, + "ffn_multiplier": 2, + "attn_blocks": 3, + "patch_h": 2, + "patch_w": 2, + "stride": 2, + "mv_expand_ratio": 2, + "block_type": "mobilevit", + }, + "last_layer_exp_factor": 4, + } + + model = MobileViTV3(mobilevit_config, mobilevit_v2_based=True, **kwargs) + + _load_pretrained( + pretrained, model, MODEL_URLS["MobileViTV3_x0_5"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/nextvit.py 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/nextvit.py new file mode 100644 index 000000000..383d6d1fe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/nextvit.py @@ -0,0 +1,643 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/bytedance/Next-ViT/blob/main/classification/nextvit.py +# reference: https://arxiv.org/abs/2207.05501 + +from functools import partial + +import paddle +from paddle import nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "NextViT_small_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_small_224_pretrained.pdparams", + "NextViT_base_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_base_224_pretrained.pdparams", + "NextViT_large_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_large_224_pretrained.pdparams", + "NextViT_small_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_small_384_pretrained.pdparams", + "NextViT_base_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_base_384_pretrained.pdparams", + "NextViT_large_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/NextViT_large_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +NORM_EPS = 1e-5 + + +def rearrange(x, pattern, **axes_lengths): + if 'b (h w) c -> b c h w' == pattern: + b, n, c = x.shape + h = axes_lengths.pop('h', -1) + w = axes_lengths.pop('w', -1) + h = h if w == -1 else n // w + w = w if h == -1 else n // h + return x.transpose([0, 2, 1]).reshape([b, c, h, w]) + if 'b c h w -> b (h w) c' == pattern: + b, c, h, w = x.shape + return x.reshape([b, c, h * w]).transpose([0, 2, 1]) + if 'b t (h d) -> b h t d' == pattern: + b, t, h_d = x.shape + h = axes_lengths['h'] + return x.reshape([b, t, h, h_d // h]).transpose([0, 2, 1, 3]) + if 'b h t d -> b t (h d)' == pattern: + b, h, t, d = x.shape + return x.transpose([0, 2, 1, 3]).reshape([b, t, h * d]) + + raise NotImplementedError( + "Rearrangement '{}' has not been implemented.".format(pattern)) + + +def merge_pre_bn(layer, pre_bn_1, pre_bn_2=None): + """ Merge pre BN to reduce inference runtime. 
+ """ + weight = layer.weight + if isinstance(layer, nn.Linear): + weight = weight.transpose([1, 0]) + bias = layer.bias + if pre_bn_2 is None: + scale_invstd = (pre_bn_1._variance + pre_bn_1._epsilon).pow(-0.5) + extra_weight = scale_invstd * pre_bn_1.weight + extra_bias = pre_bn_1.bias - pre_bn_1.weight * pre_bn_1._mean * scale_invstd + else: + scale_invstd_1 = (pre_bn_1._variance + pre_bn_1._epsilon).pow(-0.5) + scale_invstd_2 = (pre_bn_2._variance + pre_bn_2._epsilon).pow(-0.5) + + extra_weight = scale_invstd_1 * pre_bn_1.weight * scale_invstd_2 * pre_bn_2.weight + extra_bias = scale_invstd_2 * pre_bn_2.weight * ( + pre_bn_1.bias - pre_bn_1.weight * pre_bn_1._mean * scale_invstd_1 - + pre_bn_2._mean) + pre_bn_2.bias + if isinstance(layer, nn.Linear): + extra_bias = weight @extra_bias + + weight = weight.multiply( + extra_weight.reshape([1, weight.shape[1]]).expand_as(weight)) + weight = weight.transpose([1, 0]) + elif isinstance(layer, nn.Conv2D): + assert weight.shape[2] == 1 and weight.shape[3] == 1 + + weight = weight.reshape([weight.shape[0], weight.shape[1]]) + extra_bias = weight @extra_bias + weight = weight.multiply( + extra_weight.reshape([1, weight.shape[1]]).expand_as(weight)) + weight = weight.reshape([weight.shape[0], weight.shape[1], 1, 1]) + bias = bias.add(extra_bias) + + layer.weight.set_value(weight) + layer.bias.set_value(bias) + + +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class ConvBNReLU(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + groups=1): + super(ConvBNReLU, self).__init__() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=1, + groups=groups, + bias_attr=False) + self.norm = nn.BatchNorm2D(out_channels, epsilon=NORM_EPS) + self.act = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.norm(x) + x = self.act(x) + return x + + +class PatchEmbed(nn.Layer): + def __init__(self, in_channels, out_channels, stride=1): + super(PatchEmbed, self).__init__() + norm_layer = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + if stride == 2: + self.avgpool = nn.AvgPool2D((2, 2), stride=2, ceil_mode=True) + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + self.norm = norm_layer(out_channels) + elif in_channels != out_channels: + self.avgpool = nn.Identity() + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=1, + stride=1, + bias_attr=False) + self.norm = norm_layer(out_channels) + else: + self.avgpool = nn.Identity() + self.conv = nn.Identity() + self.norm = nn.Identity() + + def forward(self, x): + return self.norm(self.conv(self.avgpool(x))) + + +class MHCA(nn.Layer): + """ + Multi-Head Convolutional Attention + """ + + def __init__(self, out_channels, head_dim): + super(MHCA, self).__init__() + norm_layer = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + self.group_conv3x3 = nn.Conv2D( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + groups=out_channels // head_dim, + bias_attr=False) + self.norm = norm_layer(out_channels) + self.act = nn.ReLU() + self.projection = nn.Conv2D( + out_channels, out_channels, kernel_size=1, bias_attr=False) + + def forward(self, x): + out = self.group_conv3x3(x) + out = self.norm(out) + out = 
self.act(out) + out = self.projection(out) + return out + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + out_features=None, + mlp_ratio=None, + drop=0., + bias=True): + super().__init__() + out_features = out_features or in_features + hidden_dim = _make_divisible(in_features * mlp_ratio, 32) + self.conv1 = nn.Conv2D( + in_features, + hidden_dim, + kernel_size=1, + bias_attr=None if bias == True else False) + self.act = nn.ReLU() + self.conv2 = nn.Conv2D( + hidden_dim, + out_features, + kernel_size=1, + bias_attr=None if bias == True else False) + self.drop = nn.Dropout(drop) + + def merge_bn(self, pre_norm): + merge_pre_bn(self.conv1, pre_norm) + self.is_bn_merged = True + + def forward(self, x): + x = self.conv1(x) + x = self.act(x) + x = self.drop(x) + x = self.conv2(x) + x = self.drop(x) + return x + + +class NCB(nn.Layer): + """ + Next Convolution Block + """ + + def __init__(self, + in_channels, + out_channels, + stride=1, + path_dropout=0.0, + drop=0.0, + head_dim=32, + mlp_ratio=3): + super(NCB, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + norm_layer = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + assert out_channels % head_dim == 0 + + self.patch_embed = PatchEmbed(in_channels, out_channels, stride) + self.mhca = MHCA(out_channels, head_dim) + self.attention_path_dropout = DropPath(path_dropout) + + self.norm = norm_layer(out_channels) + self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop, bias=True) + self.mlp_path_dropout = DropPath(path_dropout) + self.is_bn_merged = False + + def merge_bn(self): + if not self.is_bn_merged: + self.mlp.merge_bn(self.norm) + self.is_bn_merged = True + + def forward(self, x): + x = self.patch_embed(x) + x = x + self.attention_path_dropout(self.mhca(x)) + + if not self.is_bn_merged: + out = self.norm(x) + else: + out = x + x = x + self.mlp_path_dropout(self.mlp(out)) + return x + + +class E_MHSA(nn.Layer): + """ + Efficient Multi-Head Self Attention + """ + + def __init__(self, + dim, + out_dim=None, + head_dim=32, + qkv_bias=True, + qk_scale=None, + attn_drop=0, + proj_drop=0., + sr_ratio=1): + super().__init__() + self.dim = dim + self.out_dim = out_dim if out_dim is not None else dim + self.num_heads = self.dim // head_dim + self.scale = qk_scale or head_dim**-0.5 + self.q = nn.Linear(dim, self.dim, bias_attr=qkv_bias) + self.k = nn.Linear(dim, self.dim, bias_attr=qkv_bias) + self.v = nn.Linear(dim, self.dim, bias_attr=qkv_bias) + self.proj = nn.Linear(self.dim, self.out_dim) + self.attn_drop = nn.Dropout(attn_drop) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + self.N_ratio = sr_ratio**2 + if sr_ratio > 1: + self.sr = nn.AvgPool1D( + kernel_size=self.N_ratio, stride=self.N_ratio) + self.norm = nn.BatchNorm1D(dim, epsilon=NORM_EPS) + self.is_bn_merged = False + + def merge_bn(self, pre_bn): + merge_pre_bn(self.q, pre_bn) + if self.sr_ratio > 1: + merge_pre_bn(self.k, pre_bn, self.norm) + merge_pre_bn(self.v, pre_bn, self.norm) + else: + merge_pre_bn(self.k, pre_bn) + merge_pre_bn(self.v, pre_bn) + self.is_bn_merged = True + + def forward(self, x): + B, N, C = x.shape + q = self.q(x) + q = q.reshape( + [B, N, self.num_heads, int(C // self.num_heads)]).transpose( + [0, 2, 1, 3]) + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]) + x_ = self.sr(x_) + if not self.is_bn_merged: + x_ = self.norm(x_) + x_ = x_.transpose([0, 2, 1]) + + k = self.k(x_) + k = k.reshape( + [B, k.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 3, 1]) + v = 
self.v(x_) + v = v.reshape( + [B, v.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 1, 3]) + else: + k = self.k(x) + k = k.reshape( + [B, k.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 3, 1]) + v = self.v(x) + v = v.reshape( + [B, v.shape[1], self.num_heads, int(C // self.num_heads) + ]).transpose([0, 2, 1, 3]) + attn = (q @k) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn @v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class NTB(nn.Layer): + """ + Next Transformer Block + """ + + def __init__( + self, + in_channels, + out_channels, + path_dropout, + stride=1, + sr_ratio=1, + mlp_ratio=2, + head_dim=32, + mix_block_ratio=0.75, + attn_drop=0.0, + drop=0.0, ): + super(NTB, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.mix_block_ratio = mix_block_ratio + norm_func = partial(nn.BatchNorm2D, epsilon=NORM_EPS) + + self.mhsa_out_channels = _make_divisible( + int(out_channels * mix_block_ratio), 32) + self.mhca_out_channels = out_channels - self.mhsa_out_channels + + self.patch_embed = PatchEmbed(in_channels, self.mhsa_out_channels, + stride) + self.norm1 = norm_func(self.mhsa_out_channels) + self.e_mhsa = E_MHSA( + self.mhsa_out_channels, + head_dim=head_dim, + sr_ratio=sr_ratio, + attn_drop=attn_drop, + proj_drop=drop) + self.mhsa_path_dropout = DropPath(path_dropout * mix_block_ratio) + + self.projection = PatchEmbed( + self.mhsa_out_channels, self.mhca_out_channels, stride=1) + self.mhca = MHCA(self.mhca_out_channels, head_dim=head_dim) + self.mhca_path_dropout = DropPath(path_dropout * (1 - mix_block_ratio)) + + self.norm2 = norm_func(out_channels) + self.mlp = Mlp(out_channels, mlp_ratio=mlp_ratio, drop=drop) + self.mlp_path_dropout = DropPath(path_dropout) + + self.is_bn_merged = False + + def merge_bn(self): + if not self.is_bn_merged: + self.e_mhsa.merge_bn(self.norm1) + self.mlp.merge_bn(self.norm2) + self.is_bn_merged = True + + def forward(self, x): + x = self.patch_embed(x) + + B, C, H, W = x.shape + if not self.is_bn_merged: + out = self.norm1(x) + else: + out = x + out = rearrange(out, "b c h w -> b (h w) c") # b n c + out = self.e_mhsa(out) + out = self.mhsa_path_dropout(out) + x = x + rearrange(out, "b (h w) c -> b c h w", h=H) + + out = self.projection(x) + out = out + self.mhca_path_dropout(self.mhca(out)) + x = paddle.concat([x, out], axis=1) + + if not self.is_bn_merged: + out = self.norm2(x) + else: + out = x + x = x + self.mlp_path_dropout(self.mlp(out)) + return x + + +class NextViT(nn.Layer): + def __init__(self, + stem_chs, + depths, + path_dropout, + attn_drop=0, + drop=0, + class_num=1000, + strides=[1, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + head_dim=32, + mix_block_ratio=0.75): + super(NextViT, self).__init__() + + self.stage_out_channels = [ + [96] * (depths[0]), [192] * (depths[1] - 1) + [256], + [384, 384, 384, 384, 512] * (depths[2] // 5), + [768] * (depths[3] - 1) + [1024] + ] + + # Next Hybrid Strategy + self.stage_block_types = [[NCB] * depths[0], + [NCB] * (depths[1] - 1) + [NTB], + [NCB, NCB, NCB, NCB, NTB] * (depths[2] // 5), + [NCB] * (depths[3] - 1) + [NTB]] + + self.stem = nn.Sequential( + ConvBNReLU( + 3, stem_chs[0], kernel_size=3, stride=2), + ConvBNReLU( + stem_chs[0], stem_chs[1], kernel_size=3, stride=1), + ConvBNReLU( + stem_chs[1], stem_chs[2], kernel_size=3, stride=1), + ConvBNReLU( + stem_chs[2], stem_chs[2], kernel_size=3, stride=2), ) + 
input_channel = stem_chs[-1] + features = [] + idx = 0 + dpr = [ + x.item() for x in paddle.linspace(0, path_dropout, sum(depths)) + ] # stochastic depth decay rule + for stage_id in range(len(depths)): + numrepeat = depths[stage_id] + output_channels = self.stage_out_channels[stage_id] + block_types = self.stage_block_types[stage_id] + for block_id in range(numrepeat): + if strides[stage_id] == 2 and block_id == 0: + stride = 2 + else: + stride = 1 + output_channel = output_channels[block_id] + block_type = block_types[block_id] + if block_type is NCB: + layer = NCB(input_channel, + output_channel, + stride=stride, + path_dropout=dpr[idx + block_id], + drop=drop, + head_dim=head_dim) + features.append(layer) + elif block_type is NTB: + layer = NTB(input_channel, + output_channel, + path_dropout=dpr[idx + block_id], + stride=stride, + sr_ratio=sr_ratios[stage_id], + head_dim=head_dim, + mix_block_ratio=mix_block_ratio, + attn_drop=attn_drop, + drop=drop) + features.append(layer) + input_channel = output_channel + idx += numrepeat + self.features = nn.Sequential(*features) + + self.norm = nn.BatchNorm2D(output_channel, epsilon=NORM_EPS) + + self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) + self.proj_head = nn.Sequential(nn.Linear(output_channel, class_num), ) + + self.stage_out_idx = [ + sum(depths[:idx + 1]) - 1 for idx in range(len(depths)) + ] + self._initialize_weights() + + def merge_bn(self): + self.eval() + for idx, layer in self.named_sublayers(): + if isinstance(layer, NCB) or isinstance(layer, NTB): + layer.merge_bn() + + def _initialize_weights(self): + for n, m in self.named_sublayers(): + if isinstance(m, (nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm, + nn.BatchNorm1D)): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.Conv2D): + trunc_normal_(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + zeros_(m.bias) + + def forward(self, x): + x = self.stem(x) + for layer in self.features: + x = layer(x) + x = self.norm(x) + x = self.avgpool(x) + x = paddle.flatten(x, 1) + x = self.proj_head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def NextViT_small_224(pretrained=False, use_ssld=False, **kwargs): + model = NextViT( + stem_chs=[64, 32, 64], + depths=[3, 4, 10, 3], + path_dropout=0.1, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["NextViT_small_224"], use_ssld=use_ssld) + return model + + +def NextViT_base_224(pretrained=False, use_ssld=False, **kwargs): + model = NextViT( + stem_chs=[64, 32, 64], + depths=[3, 4, 20, 3], + path_dropout=0.2, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["NextViT_base_224"], use_ssld=use_ssld) + return model + + +def NextViT_large_224(pretrained=False, use_ssld=False, **kwargs): + model = NextViT( + stem_chs=[64, 32, 64], + depths=[3, 4, 30, 3], + path_dropout=0.2, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["NextViT_large_224"], use_ssld=use_ssld) + return model + + +def NextViT_small_384(pretrained=False, use_ssld=False, **kwargs): + model = NextViT( + stem_chs=[64, 32, 64], + depths=[3, 4, 10, 3], + path_dropout=0.1, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["NextViT_small_384"], use_ssld=use_ssld) + return model + + +def NextViT_base_384(pretrained=False, use_ssld=False, **kwargs): + model = NextViT( + stem_chs=[64, 32, 64], + depths=[3, 4, 20, 3], + path_dropout=0.2, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["NextViT_base_384"], use_ssld=use_ssld) + return model + + +def NextViT_large_384(pretrained=False, use_ssld=False, **kwargs): + model = NextViT( + stem_chs=[64, 32, 64], + depths=[3, 4, 30, 3], + path_dropout=0.2, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["NextViT_large_384"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/peleenet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/peleenet.py new file mode 100644 index 000000000..0584c9794 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/peleenet.py @@ -0,0 +1,264 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Code was heavily based on https://github.com/Robert-JunWang/PeleeNet +# reference: https://arxiv.org/pdf/1804.06882.pdf + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import Normal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PeleeNet": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PeleeNet_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + +normal_ = lambda x, mean=0, std=1: Normal(mean, std)(x) +constant_ = lambda x, value=0: Constant(value)(x) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) 
+ + +class _DenseLayer(nn.Layer): + def __init__(self, num_input_features, growth_rate, bottleneck_width, + drop_rate): + super(_DenseLayer, self).__init__() + + growth_rate = int(growth_rate / 2) + inter_channel = int(growth_rate * bottleneck_width / 4) * 4 + + if inter_channel > num_input_features / 2: + inter_channel = int(num_input_features / 8) * 4 + print('adjust inter_channel to ', inter_channel) + + self.branch1a = BasicConv2D( + num_input_features, inter_channel, kernel_size=1) + self.branch1b = BasicConv2D( + inter_channel, growth_rate, kernel_size=3, padding=1) + + self.branch2a = BasicConv2D( + num_input_features, inter_channel, kernel_size=1) + self.branch2b = BasicConv2D( + inter_channel, growth_rate, kernel_size=3, padding=1) + self.branch2c = BasicConv2D( + growth_rate, growth_rate, kernel_size=3, padding=1) + + def forward(self, x): + branch1 = self.branch1a(x) + branch1 = self.branch1b(branch1) + + branch2 = self.branch2a(x) + branch2 = self.branch2b(branch2) + branch2 = self.branch2c(branch2) + + return paddle.concat([x, branch1, branch2], 1) + + +class _DenseBlock(nn.Sequential): + def __init__(self, num_layers, num_input_features, bn_size, growth_rate, + drop_rate): + super(_DenseBlock, self).__init__() + for i in range(num_layers): + layer = _DenseLayer(num_input_features + i * growth_rate, + growth_rate, bn_size, drop_rate) + setattr(self, 'denselayer%d' % (i + 1), layer) + + +class _StemBlock(nn.Layer): + def __init__(self, num_input_channels, num_init_features): + super(_StemBlock, self).__init__() + + num_stem_features = int(num_init_features / 2) + + self.stem1 = BasicConv2D( + num_input_channels, + num_init_features, + kernel_size=3, + stride=2, + padding=1) + self.stem2a = BasicConv2D( + num_init_features, + num_stem_features, + kernel_size=1, + stride=1, + padding=0) + self.stem2b = BasicConv2D( + num_stem_features, + num_init_features, + kernel_size=3, + stride=2, + padding=1) + self.stem3 = BasicConv2D( + 2 * num_init_features, + num_init_features, + kernel_size=1, + stride=1, + padding=0) + self.pool = nn.MaxPool2D(kernel_size=2, stride=2) + + def forward(self, x): + out = self.stem1(x) + + branch2 = self.stem2a(out) + branch2 = self.stem2b(branch2) + branch1 = self.pool(out) + + out = paddle.concat([branch1, branch2], 1) + out = self.stem3(out) + + return out + + +class BasicConv2D(nn.Layer): + def __init__(self, in_channels, out_channels, activation=True, **kwargs): + super(BasicConv2D, self).__init__() + self.conv = nn.Conv2D( + in_channels, out_channels, bias_attr=False, **kwargs) + self.norm = nn.BatchNorm2D(out_channels) + self.activation = activation + + def forward(self, x): + x = self.conv(x) + x = self.norm(x) + if self.activation: + return F.relu(x) + else: + return x + + +class PeleeNetDY(nn.Layer): + r"""PeleeNet model class, based on + `"Densely Connected Convolutional Networks" and + "Pelee: A Real-Time Object Detection System on Mobile Devices" ` + + Args: + growth_rate (int or list of 4 ints) - how many filters to add each layer (`k` in paper) + block_config (list of 4 ints) - how many layers in each pooling block + num_init_features (int) - the number of filters to learn in the first convolution layer + bottleneck_width (int or list of 4 ints) - multiplicative factor for number of bottle neck layers + (i.e. 
bn_size * k features in the bottleneck layer) + drop_rate (float) - dropout rate after each dense layer + class_num (int) - number of classification classes + """ + + def __init__(self, + growth_rate=32, + block_config=[3, 4, 8, 6], + num_init_features=32, + bottleneck_width=[1, 2, 4, 4], + drop_rate=0.05, + class_num=1000): + + super(PeleeNetDY, self).__init__() + + self.features = nn.Sequential(* [('stemblock', _StemBlock( + 3, num_init_features)), ]) + + if type(growth_rate) is list: + growth_rates = growth_rate + assert len(growth_rates) == 4, \ + 'The growth rate must be the list and the size must be 4' + else: + growth_rates = [growth_rate] * 4 + + if type(bottleneck_width) is list: + bottleneck_widths = bottleneck_width + assert len(bottleneck_widths) == 4, \ + 'The bottleneck width must be the list and the size must be 4' + else: + bottleneck_widths = [bottleneck_width] * 4 + + # Each denseblock + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = _DenseBlock( + num_layers=num_layers, + num_input_features=num_features, + bn_size=bottleneck_widths[i], + growth_rate=growth_rates[i], + drop_rate=drop_rate) + setattr(self.features, 'denseblock%d' % (i + 1), block) + num_features = num_features + num_layers * growth_rates[i] + + setattr( + self.features, + 'transition%d' % (i + 1), + BasicConv2D( + num_features, + num_features, + kernel_size=1, + stride=1, + padding=0)) + + if i != len(block_config) - 1: + setattr( + self.features, + 'transition%d_pool' % (i + 1), + nn.AvgPool2D( + kernel_size=2, stride=2)) + num_features = num_features + + # Linear layer + self.classifier = nn.Linear(num_features, class_num) + self.drop_rate = drop_rate + + self.apply(self._initialize_weights) + + def forward(self, x): + features = self.features(x) + out = F.avg_pool2d( + features, kernel_size=features.shape[2:4]).flatten(1) + if self.drop_rate > 0: + out = F.dropout(out, p=self.drop_rate, training=self.training) + out = self.classifier(out) + return out + + def _initialize_weights(self, m): + if isinstance(m, nn.Conv2D): + n = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + normal_(m.weight, std=math.sqrt(2. / n)) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + ones_(m.weight) + zeros_(m.bias) + elif isinstance(m, nn.Linear): + normal_(m.weight, std=0.01) + zeros_(m.bias) + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def PeleeNet(pretrained=False, use_ssld=False, **kwargs): + model = PeleeNetDY(**kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["PeleeNet"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/pvt_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/pvt_v2.py new file mode 100644 index 000000000..9e9f8d5fd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/pvt_v2.py @@ -0,0 +1,493 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was heavily based on https://github.com/whai362/PVT +# reference: https://arxiv.org/abs/2106.13797 + +from functools import partial +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant + +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity, drop_path + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "PVT_V2_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B0_pretrained.pdparams", + "PVT_V2_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B1_pretrained.pdparams", + "PVT_V2_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B2_pretrained.pdparams", + "PVT_V2_B2_Linear": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B2_Linear_pretrained.pdparams", + "PVT_V2_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B3_pretrained.pdparams", + "PVT_V2_B4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B4_pretrained.pdparams", + "PVT_V2_B5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/PVT_V2_B5_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +@paddle.jit.not_to_static +def swapdim(x, dim1, dim2): + a = list(range(len(x.shape))) + a[dim1], a[dim2] = a[dim2], a[dim1] + return x.transpose(a) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0., + linear=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + self.linear = linear + if self.linear: + self.relu = nn.ReLU() + + def forward(self, x, H, W): + x = self.fc1(x) + if self.linear: + x = self.relu(x) + x = self.dwconv(x, H, W) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1, + linear=False): + super().__init__() + assert dim % num_heads == 0 + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.linear = linear + self.sr_ratio = sr_ratio + if not linear: + if sr_ratio > 1: + self.sr = nn.Conv2D( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.pool = nn.AdaptiveAvgPool2D(7) + self.sr = nn.Conv2D(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + + def forward(self, x, H, W): + B, N, C = 
x.shape + q = self.q(x).reshape( + [B, N, self.num_heads, C // self.num_heads]).transpose( + [0, 2, 1, 3]) + + if not self.linear: + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(x_) + h_, w_ = x_.shape[-2:] + x_ = x_.reshape([B, C, h_ * w_]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + else: + kv = self.kv(x) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + else: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + x_ = self.sr(self.pool(x_)) + x_ = x_.reshape([B, C, x_.shape[2] * x_.shape[3]]).transpose( + [0, 2, 1]) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_) + kv = kv.reshape([ + B, kv.shape[2] * kv.shape[1] // 2 // C, 2, self.num_heads, + C // self.num_heads + ]).transpose([2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = (q @swapdim(k, -2, -1)) * self.scale + attn = F.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = swapdim((attn @v), 1, 2).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1, + linear=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio, + linear=linear) + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + linear=linear) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2) + x = swapdim(x, 1, 2) + x = self.norm(x) + + return x, H, W + + +class PyramidVisionTransformerV2(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + num_stages=4, + linear=False): + super().__init__() + self.class_num = class_num + self.depths = depths + self.num_stages = num_stages + + dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur 
= 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed( + img_size=img_size if i == 0 else img_size // (2**(i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + + block = nn.LayerList([ + Block( + dim=embed_dims[i], + num_heads=num_heads[i], + mlp_ratio=mlp_ratios[i], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + j].item(), + norm_layer=norm_layer, + sr_ratio=sr_ratios[i], + linear=linear) for j in range(depths[i]) + ]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(embed_dims[3], + class_num) if class_num > 0 else Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + for blk in block: + x = blk(x, H, W) + x = norm(x) + if i != self.num_stages - 1: + x = x.reshape([B, H, W, x.shape[2]]).transpose([0, 3, 1, 2]) + + return x.mean(axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x, H, W): + B, N, C = x.shape + x = swapdim(x, 1, 2) + x = x.reshape([B, C, H, W]) + x = self.dwconv(x) + x = x.flatten(2) + x = swapdim(x, 1, 2) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def PVT_V2_B0(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[32, 64, 160, 256], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B0"], use_ssld=use_ssld) + return model + + +def PVT_V2_B1(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B1"], use_ssld=use_ssld) + return model + + +def PVT_V2_B2(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B2"], use_ssld=use_ssld) + return model + + +def PVT_V2_B3(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B3"], use_ssld=use_ssld) + return model + + +def PVT_V2_B4(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B4"], use_ssld=use_ssld) + return model + + +def PVT_V2_B5(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 6, 40, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B5"], use_ssld=use_ssld) + return model + + +def PVT_V2_B2_Linear(pretrained=False, use_ssld=False, **kwargs): + model = PyramidVisionTransformerV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + linear=True, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["PVT_V2_B2_Linear"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rednet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rednet.py new file mode 100644 index 000000000..70a8a2c67 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rednet.py @@ -0,0 +1,204 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/d-li14/involution +# reference: https://arxiv.org/abs/2103.06255 + +import paddle +import paddle.nn as nn + +from paddle.vision.models import resnet + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "RedNet26": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet26_pretrained.pdparams", + "RedNet38": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet38_pretrained.pdparams", + "RedNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet50_pretrained.pdparams", + "RedNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet101_pretrained.pdparams", + "RedNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RedNet152_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + + +class Involution(nn.Layer): + def __init__(self, channels, kernel_size, stride): + super(Involution, self).__init__() + self.kernel_size = kernel_size + self.stride = stride + self.channels = channels + reduction_ratio = 4 + self.group_channels = 16 + self.groups = self.channels // self.group_channels + self.conv1 = nn.Sequential( + ('conv', nn.Conv2D( + in_channels=channels, + out_channels=channels // reduction_ratio, + kernel_size=1, + bias_attr=False)), + ('bn', nn.BatchNorm2D(channels // reduction_ratio)), + ('activate', nn.ReLU())) + self.conv2 = nn.Sequential(('conv', nn.Conv2D( + in_channels=channels // reduction_ratio, + out_channels=kernel_size**2 * self.groups, + kernel_size=1, + stride=1))) + if stride > 1: + self.avgpool = nn.AvgPool2D(stride, stride) + + def forward(self, x): + weight = self.conv2( + self.conv1(x if self.stride == 1 else self.avgpool(x))) + b, c, h, w = weight.shape + weight = weight.reshape( + (b, self.groups, self.kernel_size**2, h, w)).unsqueeze(2) + + out = nn.functional.unfold(x, self.kernel_size, self.stride, + (self.kernel_size - 1) // 2, 1) + out = out.reshape( + (b, self.groups, self.group_channels, self.kernel_size**2, h, w)) + out = (weight * out).sum(axis=3).reshape((b, self.channels, h, w)) + return out + + +class BottleneckBlock(resnet.BottleneckBlock): + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): + super(BottleneckBlock, self).__init__(inplanes, planes, stride, + downsample, groups, base_width, + dilation, norm_layer) + width = int(planes * (base_width / 64.)) * groups + self.conv2 = Involution(width, 7, stride) + + +class RedNet(resnet.ResNet): + def __init__(self, block, depth, class_num=1000, with_pool=True): + super(RedNet, self).__init__( + block=block, depth=50, num_classes=class_num, with_pool=with_pool) + layer_cfg = { + 26: [1, 2, 4, 1], + 38: [2, 3, 5, 2], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3] + } + layers = layer_cfg[depth] + + self.conv1 = None + self.bn1 = None + self.relu = None + self.inplanes = 64 + self.class_num = class_num + self.stem = 
nn.Sequential( + nn.Sequential( + ('conv', nn.Conv2D( + in_channels=3, + out_channels=self.inplanes // 2, + kernel_size=3, + stride=2, + padding=1, + bias_attr=False)), + ('bn', nn.BatchNorm2D(self.inplanes // 2)), + ('activate', nn.ReLU())), + Involution(self.inplanes // 2, 3, 1), + nn.BatchNorm2D(self.inplanes // 2), + nn.ReLU(), + nn.Sequential( + ('conv', nn.Conv2D( + in_channels=self.inplanes // 2, + out_channels=self.inplanes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False)), ('bn', nn.BatchNorm2D(self.inplanes)), + ('activate', nn.ReLU()))) + + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + + def forward(self, x): + x = self.stem(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool: + x = self.avgpool(x) + + if self.class_num > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def RedNet26(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 26, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet26"]) + return model + + +def RedNet38(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 38, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet38"]) + return model + + +def RedNet50(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 50, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet50"]) + return model + + +def RedNet101(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 101, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet101"]) + return model + + +def RedNet152(pretrained=False, **kwargs): + model = RedNet(BottleneckBlock, 152, **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["RedNet152"]) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/regnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/regnet.py new file mode 100644 index 000000000..12f60012a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/regnet.py @@ -0,0 +1,531 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
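+# Editor's note (not part of the upstream RegNet source): generate_regnet()
+# further below derives per-block widths from (w_a, w_0, w_m, d) and snaps
+# them to multiples of q. A small worked example with hypothetical parameters
+# w_a=8, w_0=32, w_m=2, d=4, q=8:
+#   ws_cont = arange(4) * 8 + 32              -> [32, 40, 48, 56]
+#   ks      = round(log(ws_cont/32) / log(2)) -> [0, 0, 1, 1]
+#   ws      = round(32 * 2**ks / 8) * 8       -> [32, 32, 64, 64]
+# so this configuration yields num_stages = 2 distinct stage widths.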
+ +# Code was based on https://github.com/facebookresearch/pycls +# reference: https://arxiv.org/abs/1905.13214 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "RegNetX_200MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_200MF_pretrained.pdparams", + "RegNetX_400MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_400MF_pretrained.pdparams", + "RegNetX_600MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_600MF_pretrained.pdparams", + "RegNetX_800MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_800MF_pretrained.pdparams", + "RegNetX_1600MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_1600MF_pretrained.pdparams", + "RegNetX_3200MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_3200MF_pretrained.pdparams", + "RegNetX_4GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_4GF_pretrained.pdparams", + "RegNetX_6400MF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_6400MF_pretrained.pdparams", + "RegNetX_8GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_8GF_pretrained.pdparams", + "RegNetX_12GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_12GF_pretrained.pdparams", + "RegNetX_16GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_16GF_pretrained.pdparams", + "RegNetX_32GF": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RegNetX_32GF_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def quantize_float(f, q): + """Converts a float to closest non-zero int divisible by q.""" + return int(round(f / q) * q) + + +def adjust_ws_gs_comp(ws, bms, gs): + """Adjusts the compatibility of widths and groups.""" + ws_bot = [int(w * b) for w, b in zip(ws, bms)] + gs = [min(g, w_bot) for g, w_bot in zip(gs, ws_bot)] + ws_bot = [quantize_float(w_bot, g) for w_bot, g in zip(ws_bot, gs)] + ws = [int(w_bot / b) for w_bot, b in zip(ws_bot, bms)] + return ws, gs + + +def get_stages_from_blocks(ws, rs): + """Gets ws/ds of network at each stage from per block values.""" + ts = [ + w != wp or r != rp + for w, wp, r, rp in zip(ws + [0], [0] + ws, rs + [0], [0] + rs) + ] + s_ws = [w for w, t in zip(ws, ts[:-1]) if t] + s_ds = np.diff([d for d, t in zip(range(len(ts)), ts) if t]).tolist() + return s_ws, s_ds + + +def generate_regnet(w_a, w_0, w_m, d, q=8): + """Generates per block ws from RegNet parameters.""" + assert w_a >= 0 and w_0 > 0 and w_m > 1 and w_0 % q == 0 + ws_cont = np.arange(d) * w_a + w_0 + ks = np.round(np.log(ws_cont / w_0) / np.log(w_m)) + ws = w_0 * np.power(w_m, ks) + ws = np.round(np.divide(ws, q)) * q + num_stages, max_stage = len(np.unique(ws)), ks.max() + 1 + ws, ws_cont = ws.astype(int).tolist(), ws_cont.tolist() + return ws, num_stages, max_stage, ws_cont + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + padding=0, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( 
+ in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr(name=name + ".conv2d.output.1.w_0"), + bias_attr=False) + bn_name = name + "_bn" + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + ".output.1.w_0"), + bias_attr=ParamAttr(bn_name + ".output.1.b_0"), + moving_mean_name=bn_name + "_mean", + moving_variance_name=bn_name + "_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + bm, + gw, + se_on, + se_r, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + # Compute the bottleneck width + w_b = int(round(num_filters * bm)) + # Compute the number of groups + num_gs = w_b // gw + self.se_on = se_on + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=w_b, + filter_size=1, + padding=0, + act="relu", + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=w_b, + num_filters=w_b, + filter_size=3, + stride=stride, + padding=1, + groups=num_gs, + act="relu", + name=name + "_branch2b") + if se_on: + w_se = int(round(num_channels * se_r)) + self.se_block = SELayer( + num_channels=w_b, + num_filters=w_b, + reduction_ratio=w_se, + name=name + "_branch2se") + self.conv2 = ConvBNLayer( + num_channels=w_b, + num_filters=num_filters, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + if self.se_on: + conv1 = self.se_block(conv1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + "_sqz_offset")) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + "_exc_offset")) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = input * excitation + return out + + +class RegNet(nn.Layer): + def __init__(self, + w_a, + w_0, + w_m, + d, + group_w, + bot_mul, + q=8, + se_on=False, + class_num=1000): + super(RegNet, self).__init__() + + # Generate RegNet ws per block + b_ws, num_s, max_s, ws_cont = generate_regnet(w_a, w_0, w_m, d, q) + # Convert to per stage format + ws, ds = get_stages_from_blocks(b_ws, b_ws) + # Generate group widths and bot 
muls + gws = [group_w for _ in range(num_s)] + bms = [bot_mul for _ in range(num_s)] + # Adjust the compatibility of ws and gws + ws, gws = adjust_ws_gs_comp(ws, bms, gws) + # Use the same stride for each stage + ss = [2 for _ in range(num_s)] + # Use SE for RegNetY + se_r = 0.25 + # Construct the model + # Group params by stage + stage_params = list(zip(ds, ws, ss, bms, gws)) + # Construct the stem + stem_type = "simple_stem_in" + stem_w = 32 + block_type = "res_bottleneck_block" + + self.conv = ConvBNLayer( + num_channels=3, + num_filters=stem_w, + filter_size=3, + stride=2, + padding=1, + act="relu", + name="stem_conv") + + self.block_list = [] + for block, (d, w_out, stride, bm, gw) in enumerate(stage_params): + shortcut = False + for i in range(d): + num_channels = stem_w if block == i == 0 else in_channels + # Stride apply to the first block of the stage + b_stride = stride if i == 0 else 1 + conv_name = "s" + str(block + 1) + "_b" + str(i + + 1) # chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + num_channels=num_channels, + num_filters=w_out, + stride=b_stride, + bm=bm, + gw=gw, + se_on=se_on, + se_r=se_r, + shortcut=shortcut, + name=conv_name)) + in_channels = w_out + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = w_out + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv(inputs) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def RegNetX_200MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=36.44, + w_0=24, + w_m=2.49, + d=13, + group_w=8, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_200MF"], use_ssld=use_ssld) + return model + + +def RegNetX_400MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=24.48, + w_0=24, + w_m=2.54, + d=22, + group_w=16, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_400MF"], use_ssld=use_ssld) + return model + + +def RegNetX_600MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=36.97, + w_0=48, + w_m=2.24, + d=16, + group_w=24, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_600MF"], use_ssld=use_ssld) + return model + + +def RegNetX_800MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=35.73, + w_0=56, + w_m=2.28, + d=16, + group_w=16, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_800MF"], use_ssld=use_ssld) + return model + + +def RegNetX_1600MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=34.01, + w_0=80, + w_m=2.25, + d=18, + group_w=24, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_1600MF"], use_ssld=use_ssld) + return model + + +def RegNetX_3200MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=26.31, + w_0=88, + w_m=2.25, + d=25, + group_w=48, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_3200MF"], use_ssld=use_ssld) + return model + + +def RegNetX_4GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=38.65, + w_0=96, + w_m=2.43, + d=23, + group_w=40, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_4GF"], use_ssld=use_ssld) + return model + + +def RegNetX_6400MF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=60.83, + w_0=184, + w_m=2.07, + d=17, + group_w=56, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_6400MF"], use_ssld=use_ssld) + return model + + +def RegNetX_8GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=49.56, + w_0=80, + w_m=2.88, + d=23, + group_w=120, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_8GF"], use_ssld=use_ssld) + return model + + +def RegNetX_12GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=73.36, + w_0=168, + w_m=2.37, + d=19, + group_w=112, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_12GF"], use_ssld=use_ssld) + return model + + +def RegNetX_16GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=55.59, + w_0=216, + w_m=2.1, + d=22, + group_w=128, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_16GF"], use_ssld=use_ssld) + return model + + +def RegNetX_32GF(pretrained=False, use_ssld=False, **kwargs): + model = RegNet( + w_a=69.86, + w_0=320, + w_m=2.0, + d=23, + group_w=168, + bot_mul=1.0, + q=8, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RegNetX_32GF"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/repvgg.py 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/repvgg.py new file mode 100644 index 000000000..94309f578 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/repvgg.py @@ -0,0 +1,451 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/DingXiaoH/RepVGG +# reference: https://arxiv.org/abs/2101.03697 + +import paddle.nn as nn +import paddle +import paddle.nn.functional as F +import numpy as np + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "RepVGG_A0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A0_pretrained.pdparams", + "RepVGG_A1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A1_pretrained.pdparams", + "RepVGG_A2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_A2_pretrained.pdparams", + "RepVGG_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B0_pretrained.pdparams", + "RepVGG_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1_pretrained.pdparams", + "RepVGG_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B2_pretrained.pdparams", + "RepVGG_B1g2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1g2_pretrained.pdparams", + "RepVGG_B1g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B1g4_pretrained.pdparams", + "RepVGG_B2g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B2g4_pretrained.pdparams", + "RepVGG_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B3_pretrained.pdparams", + "RepVGG_B3g4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_B3g4_pretrained.pdparams", + "RepVGG_D2se": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/RepVGG_D2se_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + +optional_groupwise_layers = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26] +g2_map = {l: 2 for l in optional_groupwise_layers} +g4_map = {l: 4 for l in optional_groupwise_layers} + + +class ConvBN(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1): + super(ConvBN, self).__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias_attr=False) + self.bn = nn.BatchNorm2D(num_features=out_channels) + + def forward(self, x): + y = self.conv(x) + y = self.bn(y) + return y + + +class SEBlock(nn.Layer): + def __init__(self, input_channels, internal_neurons): + super(SEBlock, self).__init__() + self.down = nn.Conv2D( + in_channels=input_channels, + out_channels=internal_neurons, + kernel_size=1, + stride=1, + bias_attr=True) + self.up = nn.Conv2D( + in_channels=internal_neurons, + out_channels=input_channels, + kernel_size=1, + stride=1, + bias_attr=True) + 
self.input_channels = input_channels + + def forward(self, inputs): + x = F.avg_pool2d(inputs, kernel_size=inputs.shape[3]) + x = self.down(x) + x = F.relu(x) + x = self.up(x) + x = F.sigmoid(x) + x = x.reshape([-1, self.input_channels, 1, 1]) + return inputs * x + + +class RepVGGBlock(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + use_se=False): + super(RepVGGBlock, self).__init__() + self.is_repped = False + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.padding_mode = padding_mode + + assert kernel_size == 3 + assert padding == 1 + + padding_11 = padding - kernel_size // 2 + + self.nonlinearity = nn.ReLU() + + if use_se: + self.se = SEBlock( + out_channels, internal_neurons=out_channels // 16) + else: + self.se = nn.Identity() + self.rbr_identity = nn.BatchNorm2D( + num_features=in_channels + ) if out_channels == in_channels and stride == 1 else None + self.rbr_dense = ConvBN( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups) + self.rbr_1x1 = ConvBN( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + padding=padding_11, + groups=groups) + + def forward(self, inputs): + if self.is_repped: + return self.nonlinearity(self.rbr_reparam(inputs)) + + if self.rbr_identity is None: + id_out = 0 + else: + id_out = self.rbr_identity(inputs) + return self.nonlinearity( + self.se(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)) + + def re_parameterize(self): + if not hasattr(self, 'rbr_reparam'): + self.rbr_reparam = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + padding_mode=self.padding_mode) + kernel, bias = self.get_equivalent_kernel_bias() + self.rbr_reparam.weight.set_value(kernel) + self.rbr_reparam.bias.set_value(bias) + self.is_repped = True + + def get_equivalent_kernel_bias(self): + kernel3x3, bias3x3 = self._fuse_bn_tensor(self.rbr_dense) + kernel1x1, bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) + kernelid, biasid = self._fuse_bn_tensor(self.rbr_identity) + return kernel3x3 + self._pad_1x1_to_3x3_tensor( + kernel1x1) + kernelid, bias3x3 + bias1x1 + biasid + + def _pad_1x1_to_3x3_tensor(self, kernel1x1): + if kernel1x1 is None: + return 0 + else: + return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) + + def _fuse_bn_tensor(self, branch): + if branch is None: + return 0, 0 + if isinstance(branch, ConvBN): + kernel = branch.conv.weight + running_mean = branch.bn._mean + running_var = branch.bn._variance + gamma = branch.bn.weight + beta = branch.bn.bias + eps = branch.bn._epsilon + else: + assert isinstance(branch, nn.BatchNorm2D) + if not hasattr(self, 'id_tensor'): + input_dim = self.in_channels // self.groups + kernel_value = np.zeros( + (self.in_channels, input_dim, 3, 3), dtype=np.float32) + for i in range(self.in_channels): + kernel_value[i, i % input_dim, 1, 1] = 1 + self.id_tensor = paddle.to_tensor(kernel_value) + kernel = self.id_tensor + running_mean = branch._mean + running_var = branch._variance + gamma = branch.weight + beta = branch.bias + eps = branch._epsilon + std = (running_var + eps).sqrt() + t = (gamma / std).reshape((-1, 
1, 1, 1)) + return kernel * t, beta - running_mean * gamma / std + + +class RepVGG(nn.Layer): + def __init__(self, + num_blocks, + width_multiplier=None, + override_groups_map=None, + class_num=1000, + use_se=False): + super(RepVGG, self).__init__() + assert len(width_multiplier) == 4 + self.override_groups_map = override_groups_map or dict() + assert 0 not in self.override_groups_map + self.in_planes = min(64, int(64 * width_multiplier[0])) + + self.stage0 = RepVGGBlock( + in_channels=3, + out_channels=self.in_planes, + kernel_size=3, + stride=2, + padding=1, + use_se=use_se) + self.cur_layer_idx = 1 + self.stage1 = self._make_stage( + int(64 * width_multiplier[0]), + num_blocks[0], + stride=2, + use_se=use_se) + self.stage2 = self._make_stage( + int(128 * width_multiplier[1]), + num_blocks[1], + stride=2, + use_se=use_se) + self.stage3 = self._make_stage( + int(256 * width_multiplier[2]), + num_blocks[2], + stride=2, + use_se=use_se) + self.stage4 = self._make_stage( + int(512 * width_multiplier[3]), + num_blocks[3], + stride=2, + use_se=use_se) + self.gap = nn.AdaptiveAvgPool2D(output_size=1) + self.linear = nn.Linear(int(512 * width_multiplier[3]), class_num) + + def _make_stage(self, planes, num_blocks, stride, use_se=False): + strides = [stride] + [1] * (num_blocks - 1) + blocks = [] + for stride in strides: + cur_groups = self.override_groups_map.get(self.cur_layer_idx, 1) + blocks.append( + RepVGGBlock( + in_channels=self.in_planes, + out_channels=planes, + kernel_size=3, + stride=stride, + padding=1, + groups=cur_groups, + use_se=use_se)) + self.in_planes = planes + self.cur_layer_idx += 1 + return nn.Sequential(*blocks) + + def forward(self, x): + out = self.stage0(x) + out = self.stage1(out) + out = self.stage2(out) + out = self.stage3(out) + out = self.stage4(out) + out = self.gap(out) + out = paddle.flatten(out, start_axis=1) + out = self.linear(out) + return out + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def RepVGG_A0(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[0.75, 0.75, 0.75, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A0"], use_ssld=use_ssld) + return model + + +def RepVGG_A1(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[1, 1, 1, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A1"], use_ssld=use_ssld) + return model + + +def RepVGG_A2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[2, 4, 14, 1], + width_multiplier=[1.5, 1.5, 1.5, 2.75], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_A2"], use_ssld=use_ssld) + return model + + +def RepVGG_B0(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[1, 1, 1, 2.5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B0"], use_ssld=use_ssld) + return model + + +def RepVGG_B1(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1"], use_ssld=use_ssld) + return model + + +def RepVGG_B1g2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=g2_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1g2"], use_ssld=use_ssld) + return model + + +def RepVGG_B1g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2, 2, 2, 4], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B1g4"], use_ssld=use_ssld) + return model + + +def RepVGG_B2(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B2"], use_ssld=use_ssld) + return model + + +def RepVGG_B2g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B2g4"], use_ssld=use_ssld) + return model + + +def RepVGG_B3(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[3, 3, 3, 5], + override_groups_map=None, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B3"], use_ssld=use_ssld) + return model + + +def RepVGG_B3g4(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[4, 6, 16, 1], + width_multiplier=[3, 3, 3, 5], + override_groups_map=g4_map, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_B3g4"], use_ssld=use_ssld) + return model + + +def RepVGG_D2se(pretrained=False, use_ssld=False, **kwargs): + model = RepVGG( + num_blocks=[8, 14, 24, 1], + width_multiplier=[2.5, 2.5, 2.5, 5], + override_groups_map=None, + use_se=True, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["RepVGG_D2se"], use_ssld=use_ssld) + return model diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net.py new file mode 100644 index 000000000..33261fca7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net.py @@ -0,0 +1,266 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1904.01169 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Res2Net50_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_26w_4s_pretrained.pdparams", + "Res2Net50_14w_8s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_14w_8s_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels1, + num_channels2, + num_filters, + stride, + scales, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + self.stride = stride + self.scales = scales + self.conv0 = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1_list = [] + for s in range(scales - 1): + conv1 = self.add_sublayer( + name + '_branch2b_' + str(s + 1), + ConvBNLayer( + num_channels=num_filters // scales, + num_filters=num_filters // scales, + filter_size=3, + stride=stride, + act='relu', + name=name + '_branch2b_' + str(s + 1))) + self.conv1_list.append(conv1) + self.pool2d_avg = AvgPool2D(kernel_size=3, stride=stride, padding=1) + + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_channels2, + filter_size=1, + 
act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_channels2, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + xs = paddle.split(y, self.scales, 1) + ys = [] + for s, conv1 in enumerate(self.conv1_list): + if s == 0 or self.stride == 2: + ys.append(conv1(xs[s])) + else: + ys.append(conv1(paddle.add(xs[s], ys[-1]))) + if self.stride == 1: + ys.append(xs[-1]) + else: + ys.append(self.pool2d_avg(xs[-1])) + conv1 = paddle.concat(ys, axis=1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class Res2Net(nn.Layer): + def __init__(self, layers=50, scales=4, width=26, class_num=1000): + super(Res2Net, self).__init__() + + self.layers = layers + self.scales = scales + self.width = width + basic_width = self.width * self.scales + supported_layers = [50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024] + num_channels2 = [256, 512, 1024, 2048] + num_filters = [basic_width * t for t in [1, 2, 4, 8]] + + self.conv1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels1=num_channels[block] + if i == 0 else num_channels2[block], + num_channels2=num_channels2[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + scales=scales, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def Res2Net50_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net(layers=50, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Res2Net50_26w_4s"], use_ssld=use_ssld) + return model + + +def Res2Net50_14w_8s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net(layers=50, scales=8, width=14, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Res2Net50_14w_8s"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net_vd.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net_vd.py new file mode 100644 index 000000000..206d06028 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/res2net_vd.py @@ -0,0 +1,308 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1904.01169 & https://arxiv.org/abs/1812.01187 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Res2Net50_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net50_vd_26w_4s_pretrained.pdparams", + "Res2Net101_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net101_vd_26w_4s_pretrained.pdparams", + "Res2Net200_vd_26w_4s": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Res2Net200_vd_26w_4s_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def 
__init__(self, + num_channels1, + num_channels2, + num_filters, + stride, + scales, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + self.stride = stride + self.scales = scales + self.conv0 = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1_list = [] + for s in range(scales - 1): + conv1 = self.add_sublayer( + name + '_branch2b_' + str(s + 1), + ConvBNLayer( + num_channels=num_filters // scales, + num_filters=num_filters // scales, + filter_size=3, + stride=stride, + act='relu', + name=name + '_branch2b_' + str(s + 1))) + self.conv1_list.append(conv1) + self.pool2d_avg = AvgPool2D(kernel_size=3, stride=stride, padding=1) + + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_channels2, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels1, + num_filters=num_channels2, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + xs = paddle.split(y, self.scales, 1) + ys = [] + for s, conv1 in enumerate(self.conv1_list): + if s == 0 or self.stride == 2: + ys.append(conv1(xs[s])) + else: + ys.append(conv1(xs[s] + ys[-1])) + if self.stride == 1: + ys.append(xs[-1]) + else: + ys.append(self.pool2d_avg(xs[-1])) + conv1 = paddle.concat(ys, axis=1) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class Res2Net_vd(nn.Layer): + def __init__(self, layers=50, scales=4, width=26, class_num=1000, + **kwargs): + super(Res2Net_vd, self).__init__() + + self.layers = layers + self.scales = scales + self.width = width + basic_width = self.width * self.scales + supported_layers = [50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, 1024] + num_channels2 = [256, 512, 1024, 2048] + num_filters = [basic_width * t for t in [1, 2, 4, 8]] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152, 200] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels1=num_channels[block] + if i == 0 else num_channels2[block], + num_channels2=num_channels2[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + scales=scales, + shortcut=shortcut, + if_first=block == i == 
0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Res2Net50_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=50, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net50_vd_26w_4s"], + use_ssld=use_ssld) + return model + + +def Res2Net101_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=101, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net101_vd_26w_4s"], + use_ssld=use_ssld) + return model + + +def Res2Net200_vd_26w_4s(pretrained=False, use_ssld=False, **kwargs): + model = Res2Net_vd(layers=200, scales=4, width=26, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["Res2Net200_vd_26w_4s"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnest.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnest.py new file mode 100644 index 000000000..171aa1f94 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnest.py @@ -0,0 +1,780 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
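The hierarchical "split, transform, and accumulate" pattern coded in BottleneckBlock.forward of res2net.py and res2net_vd.py above is the core idea of Res2Net (arXiv:1904.01169): the output of the 1x1 conv0 is split into `scales` channel groups, each group after the first is summed with the previous group's output before its 3x3 conv, and the last group is passed through unchanged in the stride-1 case. A minimal, self-contained sketch of that stride-1 path follows; the batch size, spatial size, group width, and layer names are illustrative assumptions, not values taken from this patch.

import paddle
import paddle.nn as nn

scales = 4          # number of channel groups (the `scales` argument above)
width = 26          # channels per group -- assumed for illustration only
x = paddle.rand([2, scales * width, 56, 56])   # stand-in for the conv0 (1x1) output

# one 3x3 conv per group except the last, mirroring self.conv1_list
convs = nn.LayerList([
    nn.Conv2D(width, width, kernel_size=3, padding=1) for _ in range(scales - 1)
])

xs = paddle.split(x, scales, axis=1)   # 4 groups of `width` channels each
ys = []
for s, conv in enumerate(convs):
    # every group after the first also sees the previous group's output,
    # which widens the receptive field scale by scale
    inp = xs[s] if s == 0 else xs[s] + ys[-1]
    ys.append(conv(inp))
ys.append(xs[-1])                      # last group is passed through (stride == 1 branch)
out = paddle.concat(ys, axis=1)        # back to scales * width channels
print(out.shape)                       # [2, 104, 56, 56]

In the stride-2 branch of the patch, each group's 3x3 conv is applied to its split input directly and the last group is average-pooled instead of passed through, so all groups are downsampled consistently before the concat.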
+ +# Code was based on https://github.com/zhanghang1989/ResNeSt +# reference: https://arxiv.org/abs/2004.08955 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +import math +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.regularizer import L2Decay + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeSt50_fast_1s1x64d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt50_fast_1s1x64d_pretrained.pdparams", + "ResNeSt50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt50_pretrained.pdparams", + "ResNeSt101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt101_pretrained.pdparams", + "ResNeSt200": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt200_pretrained.pdparams", + "ResNeSt269": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeSt269_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + dilation=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + bn_decay = 0.0 + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + dilation=dilation, + groups=groups, + weight_attr=ParamAttr(name=name + "_weight"), + bias_attr=False) + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr( + name=name + "_scale", regularizer=L2Decay(bn_decay)), + bias_attr=ParamAttr( + name + "_offset", regularizer=L2Decay(bn_decay)), + moving_mean_name=name + "_mean", + moving_variance_name=name + "_variance") + + def forward(self, x): + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class rSoftmax(nn.Layer): + def __init__(self, radix, cardinality): + super(rSoftmax, self).__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + cardinality = self.cardinality + radix = self.radix + + batch, r, h, w = x.shape + if self.radix > 1: + x = paddle.reshape( + x=x, + shape=[ + batch, cardinality, radix, + int(r * h * w / cardinality / radix) + ]) + x = paddle.transpose(x=x, perm=[0, 2, 1, 3]) + x = nn.functional.softmax(x, axis=1) + x = paddle.reshape(x=x, shape=[batch, r * h * w, 1, 1]) + else: + x = nn.functional.sigmoid(x) + return x + + +class SplatConv(nn.Layer): + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + radix=2, + reduction_factor=4, + rectify_avg=False, + name=None): + super(SplatConv, self).__init__() + + self.radix = radix + + self.conv1 = ConvBNLayer( + num_channels=in_channels, + num_filters=channels * radix, + filter_size=kernel_size, + stride=stride, + groups=groups * radix, + act="relu", + name=name + "_1_weights") + + self.avg_pool2d = AdaptiveAvgPool2D(1) + + inter_channels = int(max(in_channels * radix // reduction_factor, 32)) + + # to calc gap + self.conv2 = ConvBNLayer( + num_channels=channels, + num_filters=inter_channels, + filter_size=1, + stride=1, + groups=groups, + act="relu", + name=name + "_2_weights") + + # to calc atten + self.conv3 = Conv2D( 
+ in_channels=inter_channels, + out_channels=channels * radix, + kernel_size=1, + stride=1, + padding=0, + groups=groups, + weight_attr=ParamAttr( + name=name + "_weights", initializer=KaimingNormal())) + + self.rsoftmax = rSoftmax(radix=radix, cardinality=groups) + + def forward(self, x): + x = self.conv1(x) + + if self.radix > 1: + splited = paddle.split(x, num_or_sections=self.radix, axis=1) + gap = paddle.add_n(splited) + else: + gap = x + + gap = self.avg_pool2d(gap) + gap = self.conv2(gap) + + atten = self.conv3(gap) + atten = self.rsoftmax(atten) + + if self.radix > 1: + attens = paddle.split(atten, num_or_sections=self.radix, axis=1) + y = paddle.add_n([ + paddle.multiply(split, att) + for (att, split) in zip(attens, splited) + ]) + else: + y = paddle.multiply(x, atten) + + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + inplanes, + planes, + stride=1, + radix=1, + cardinality=1, + bottleneck_width=64, + avd=False, + avd_first=False, + dilation=1, + is_first=False, + rectify_avg=False, + last_gamma=False, + avg_down=False, + name=None): + super(BottleneckBlock, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.radix = radix + self.cardinality = cardinality + self.avd = avd + self.avd_first = avd_first + self.dilation = dilation + self.is_first = is_first + self.rectify_avg = rectify_avg + self.last_gamma = last_gamma + self.avg_down = avg_down + + group_width = int(planes * (bottleneck_width / 64.)) * cardinality + + self.conv1 = ConvBNLayer( + num_channels=self.inplanes, + num_filters=group_width, + filter_size=1, + stride=1, + groups=1, + act="relu", + name=name + "_conv1") + + if avd and avd_first and (stride > 1 or is_first): + self.avg_pool2d_1 = AvgPool2D( + kernel_size=3, stride=stride, padding=1) + + if radix >= 1: + self.conv2 = SplatConv( + in_channels=group_width, + channels=group_width, + kernel_size=3, + stride=1, + padding=dilation, + dilation=dilation, + groups=cardinality, + bias=False, + radix=radix, + rectify_avg=rectify_avg, + name=name + "_splat") + else: + self.conv2 = ConvBNLayer( + num_channels=group_width, + num_filters=group_width, + filter_size=3, + stride=1, + dilation=dilation, + groups=cardinality, + act="relu", + name=name + "_conv2") + + if avd and avd_first == False and (stride > 1 or is_first): + self.avg_pool2d_2 = AvgPool2D( + kernel_size=3, stride=stride, padding=1) + + self.conv3 = ConvBNLayer( + num_channels=group_width, + num_filters=planes * 4, + filter_size=1, + stride=1, + groups=1, + act=None, + name=name + "_conv3") + + if stride != 1 or self.inplanes != self.planes * 4: + if avg_down: + if dilation == 1: + self.avg_pool2d_3 = AvgPool2D( + kernel_size=stride, stride=stride, padding=0) + else: + self.avg_pool2d_3 = AvgPool2D( + kernel_size=1, stride=1, padding=0, ceil_mode=True) + + self.conv4 = Conv2D( + in_channels=self.inplanes, + out_channels=planes * 4, + kernel_size=1, + stride=1, + padding=0, + groups=1, + weight_attr=ParamAttr( + name=name + "_weights", initializer=KaimingNormal()), + bias_attr=False) + else: + self.conv4 = Conv2D( + in_channels=self.inplanes, + out_channels=planes * 4, + kernel_size=1, + stride=stride, + padding=0, + groups=1, + weight_attr=ParamAttr( + name=name + "_shortcut_weights", + initializer=KaimingNormal()), + bias_attr=False) + + bn_decay = 0.0 + self._batch_norm = BatchNorm( + planes * 4, + act=None, + param_attr=ParamAttr( + name=name + "_shortcut_scale", + regularizer=L2Decay(bn_decay)), + bias_attr=ParamAttr( + name + 
"_shortcut_offset", regularizer=L2Decay(bn_decay)), + moving_mean_name=name + "_shortcut_mean", + moving_variance_name=name + "_shortcut_variance") + + def forward(self, x): + short = x + + x = self.conv1(x) + if self.avd and self.avd_first and (self.stride > 1 or self.is_first): + x = self.avg_pool2d_1(x) + + x = self.conv2(x) + + if self.avd and self.avd_first == False and (self.stride > 1 or + self.is_first): + x = self.avg_pool2d_2(x) + + x = self.conv3(x) + + if self.stride != 1 or self.inplanes != self.planes * 4: + if self.avg_down: + short = self.avg_pool2d_3(short) + + short = self.conv4(short) + + short = self._batch_norm(short) + + y = paddle.add(x=short, y=x) + y = F.relu(y) + return y + + +class ResNeStLayer(nn.Layer): + def __init__(self, + inplanes, + planes, + blocks, + radix, + cardinality, + bottleneck_width, + avg_down, + avd, + avd_first, + rectify_avg, + last_gamma, + stride=1, + dilation=1, + is_first=True, + name=None): + super(ResNeStLayer, self).__init__() + self.inplanes = inplanes + self.planes = planes + self.blocks = blocks + self.radix = radix + self.cardinality = cardinality + self.bottleneck_width = bottleneck_width + self.avg_down = avg_down + self.avd = avd + self.avd_first = avd_first + self.rectify_avg = rectify_avg + self.last_gamma = last_gamma + self.is_first = is_first + + if dilation == 1 or dilation == 2: + bottleneck_func = self.add_sublayer( + name + "_bottleneck_0", + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + stride=stride, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=1, + is_first=is_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=name + "_bottleneck_0")) + elif dilation == 4: + bottleneck_func = self.add_sublayer( + name + "_bottleneck_0", + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + stride=stride, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=2, + is_first=is_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=name + "_bottleneck_0")) + else: + raise RuntimeError("=>unknown dilation size") + + self.inplanes = planes * 4 + self.bottleneck_block_list = [bottleneck_func] + for i in range(1, blocks): + curr_name = name + "_bottleneck_" + str(i) + + bottleneck_func = self.add_sublayer( + curr_name, + BottleneckBlock( + inplanes=self.inplanes, + planes=planes, + radix=radix, + cardinality=cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + dilation=dilation, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + name=curr_name)) + self.bottleneck_block_list.append(bottleneck_func) + + def forward(self, x): + for bottleneck_block in self.bottleneck_block_list: + x = bottleneck_block(x) + return x + + +class ResNeSt(nn.Layer): + def __init__(self, + layers, + radix=1, + groups=1, + bottleneck_width=64, + dilated=False, + dilation=1, + deep_stem=False, + stem_width=64, + avg_down=False, + rectify_avg=False, + avd=False, + avd_first=False, + final_drop=0.0, + last_gamma=False, + class_num=1000): + super(ResNeSt, self).__init__() + + self.cardinality = groups + self.bottleneck_width = bottleneck_width + # ResNet-D params + self.inplanes = stem_width * 2 if deep_stem else 64 + self.avg_down = avg_down + self.last_gamma = last_gamma + # ResNeSt params + self.radix = radix + self.avd = avd + self.avd_first = avd_first + + 
self.deep_stem = deep_stem + self.stem_width = stem_width + self.layers = layers + self.final_drop = final_drop + self.dilated = dilated + self.dilation = dilation + + self.rectify_avg = rectify_avg + + if self.deep_stem: + self.stem = nn.Sequential( + ("conv1", ConvBNLayer( + num_channels=3, + num_filters=stem_width, + filter_size=3, + stride=2, + act="relu", + name="conv1")), ("conv2", ConvBNLayer( + num_channels=stem_width, + num_filters=stem_width, + filter_size=3, + stride=1, + act="relu", + name="conv2")), ("conv3", ConvBNLayer( + num_channels=stem_width, + num_filters=stem_width * 2, + filter_size=3, + stride=1, + act="relu", + name="conv3"))) + else: + self.stem = ConvBNLayer( + num_channels=3, + num_filters=stem_width, + filter_size=7, + stride=2, + act="relu", + name="conv1") + + self.max_pool2d = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.layer1 = ResNeStLayer( + inplanes=self.stem_width * 2 + if self.deep_stem else self.stem_width, + planes=64, + blocks=self.layers[0], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=1, + is_first=False, + name="layer1") + + # return + + self.layer2 = ResNeStLayer( + inplanes=256, + planes=128, + blocks=self.layers[1], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer2") + + if self.dilated or self.dilation == 4: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=2, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=4, + name="layer4") + elif self.dilation == 2: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + dilation=1, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=1, + dilation=2, + name="layer4") + else: + self.layer3 = ResNeStLayer( + inplanes=512, + planes=256, + blocks=self.layers[2], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + last_gamma=last_gamma, + stride=2, + name="layer3") + self.layer4 = ResNeStLayer( + inplanes=1024, + planes=512, + blocks=self.layers[3], + radix=radix, + cardinality=self.cardinality, + bottleneck_width=bottleneck_width, + avg_down=self.avg_down, + avd=avd, + avd_first=avd_first, + rectify_avg=rectify_avg, + 
last_gamma=last_gamma, + stride=2, + name="layer4") + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.out_channels = 2048 + + stdv = 1.0 / math.sqrt(self.out_channels * 1.0) + + self.out = Linear( + self.out_channels, + class_num, + weight_attr=ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv), + name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, x): + x = self.stem(x) + x = self.max_pool2d(x) + x = self.layer1(x) + x = self.layer2(x) + + x = self.layer3(x) + + x = self.layer4(x) + x = self.pool2d_avg(x) + x = paddle.reshape(x, shape=[-1, self.out_channels]) + x = self.out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeSt50_fast_1s1x64d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 6, 3], + radix=1, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=32, + avg_down=True, + avd=True, + avd_first=True, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeSt50_fast_1s1x64d"], + use_ssld=use_ssld) + return model + + +def ResNeSt50(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 6, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=32, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt50"], use_ssld=use_ssld) + return model + + +def ResNeSt101(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 4, 23, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt101"], use_ssld=use_ssld) + return model + + +def ResNeSt200(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 24, 36, 3], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt200"], use_ssld=use_ssld) + return model + + +def ResNeSt269(pretrained=False, use_ssld=False, **kwargs): + model = ResNeSt( + layers=[3, 30, 48, 8], + radix=2, + groups=1, + bottleneck_width=64, + deep_stem=True, + stem_width=64, + avg_down=True, + avd=True, + avd_first=False, + final_drop=0.0, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeSt269"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnet_vc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnet_vc.py new file mode 100644 index 000000000..ba44a2ce0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnet_vc.py @@ -0,0 +1,311 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1812.01187 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNet50_vc": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vc_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + if 
not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet_vc(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(ResNet_vc, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_0.w_0"), + bias_attr=ParamAttr(name="fc_0.b_0")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise 
RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet50_vc(pretrained=False, use_ssld=False, **kwargs): + model = ResNet_vc(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNet50_vc"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext.py new file mode 100644 index 000000000..53f65b631 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext.py @@ -0,0 +1,303 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1611.05431 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeXt50_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_32x4d_pretrained.pdparams", + "ResNeXt50_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_64x4d_pretrained.pdparams", + "ResNeXt101_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x4d_pretrained.pdparams", + "ResNeXt101_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_64x4d_pretrained.pdparams", + "ResNeXt152_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_32x4d_pretrained.pdparams", + "ResNeXt152_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format="NCHW"): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def 
__init__(self, + num_channels, + num_filters, + stride, + cardinality, + shortcut=True, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a", + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name=name + "_branch2b", + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name=name + "_branch2c", + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride, + name=name + "_branch1", + data_format=data_format) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class ResNeXt(nn.Layer): + def __init__(self, + layers=50, + class_num=1000, + cardinality=32, + input_image_channel=3, + data_format="NCHW"): + super(ResNeXt, self).__init__() + + self.layers = layers + self.data_format = data_format + self.input_image_channel = input_image_channel + self.cardinality = cardinality + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="res_conv1", + data_format=self.data_format) + self.pool2d_max = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut, + name=conv_name, + data_format=self.data_format)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), 
name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + return self._forward(inputs) + + def _forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + y = self.conv(inputs) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeXt50_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt50_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt101_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt101_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt152_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt152_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt152_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt152_64x4d"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext101_wsl.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext101_wsl.py new file mode 100644 index 000000000..a4478e70a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext101_wsl.py @@ -0,0 +1,506 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1805.00932 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeXt101_32x8d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x8d_wsl_pretrained.pdparams", + "ResNeXt101_32x16d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x16_wsl_pretrained.pdparams", + "ResNeXt101_32x32d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x32d_wsl_pretrained.pdparams", + "ResNeXt101_32x48d_wsl": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_32x48d_wsl_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + if "downsample" in name: + conv_name = name + ".0" + else: + conv_name = name + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=conv_name + ".weight"), + bias_attr=False) + if "downsample" in name: + bn_name = name[:9] + "downsample.1" + else: + if "conv1" == name: + bn_name = "bn" + name[-1] + else: + bn_name = (name[:10] if name[7:9].isdigit() else name[:9] + ) + "bn" + name[-1] + self._bn = BatchNorm( + num_channels=output_channels, + act=act, + param_attr=ParamAttr(name=bn_name + ".weight"), + bias_attr=ParamAttr(name=bn_name + ".bias"), + moving_mean_name=bn_name + ".running_mean", + moving_variance_name=bn_name + ".running_var") + + def forward(self, inputs): + x = self._conv(inputs) + x = self._bn(x) + return x + + +class ShortCut(nn.Layer): + def __init__(self, input_channels, output_channels, stride, name=None): + super(ShortCut, self).__init__() + + self.input_channels = input_channels + self.output_channels = output_channels + self.stride = stride + if input_channels != output_channels or stride != 1: + self._conv = ConvBNLayer( + input_channels, + output_channels, + filter_size=1, + stride=stride, + name=name) + + def forward(self, inputs): + if self.input_channels != self.output_channels or self.stride != 1: + return self._conv(inputs) + return inputs + + +class BottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels, stride, cardinality, + width, name): + super(BottleneckBlock, self).__init__() + + self._conv0 = ConvBNLayer( + input_channels, + output_channels, + filter_size=1, + act="relu", + name=name + ".conv1") + self._conv1 = ConvBNLayer( + output_channels, + output_channels, + filter_size=3, + act="relu", + stride=stride, + groups=cardinality, + name=name + ".conv2") + self._conv2 = ConvBNLayer( + output_channels, + output_channels // (width // 8), + filter_size=1, + act=None, + name=name + ".conv3") + self._short = ShortCut( + input_channels, + output_channels // (width // 8), + stride=stride, + name=name + ".downsample") + + def forward(self, inputs): + x = self._conv0(inputs) + x = self._conv1(x) + x = self._conv2(x) + y = self._short(inputs) + y = paddle.add(x, y) + y = F.relu(y) + return y + + +class ResNeXt101WSL(nn.Layer): + def __init__(self, 
layers=101, cardinality=32, width=48, class_num=1000): + super(ResNeXt101WSL, self).__init__() + + self.class_num = class_num + + self.layers = layers + self.cardinality = cardinality + self.width = width + self.scale = width // 8 + + self.depth = [3, 4, 23, 3] + self.base_width = cardinality * width + num_filters = [self.base_width * i + for i in [1, 2, 4, 8]] # [256, 512, 1024, 2048] + self._conv_stem = ConvBNLayer( + 3, 64, 7, stride=2, act="relu", name="conv1") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self._conv1_0 = BottleneckBlock( + 64, + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.0") + self._conv1_1 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.1") + self._conv1_2 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[0], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer1.2") + + self._conv2_0 = BottleneckBlock( + num_filters[0] // (width // 8), + num_filters[1], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer2.0") + self._conv2_1 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.1") + self._conv2_2 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.2") + self._conv2_3 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[1], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer2.3") + + self._conv3_0 = BottleneckBlock( + num_filters[1] // (width // 8), + num_filters[2], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer3.0") + self._conv3_1 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.1") + self._conv3_2 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.2") + self._conv3_3 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.3") + self._conv3_4 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.4") + self._conv3_5 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.5") + self._conv3_6 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.6") + self._conv3_7 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.7") + self._conv3_8 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.8") + self._conv3_9 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.9") + self._conv3_10 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + 
name="layer3.10") + self._conv3_11 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.11") + self._conv3_12 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.12") + self._conv3_13 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.13") + self._conv3_14 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.14") + self._conv3_15 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.15") + self._conv3_16 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.16") + self._conv3_17 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.17") + self._conv3_18 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.18") + self._conv3_19 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.19") + self._conv3_20 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.20") + self._conv3_21 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.21") + self._conv3_22 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[2], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer3.22") + + self._conv4_0 = BottleneckBlock( + num_filters[2] // (width // 8), + num_filters[3], + stride=2, + cardinality=self.cardinality, + width=self.width, + name="layer4.0") + self._conv4_1 = BottleneckBlock( + num_filters[3] // (width // 8), + num_filters[3], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer4.1") + self._conv4_2 = BottleneckBlock( + num_filters[3] // (width // 8), + num_filters[3], + stride=1, + cardinality=self.cardinality, + width=self.width, + name="layer4.2") + + self._avg_pool = AdaptiveAvgPool2D(1) + self._out = Linear( + num_filters[3] // (width // 8), + class_num, + weight_attr=ParamAttr(name="fc.weight"), + bias_attr=ParamAttr(name="fc.bias")) + + def forward(self, inputs): + x = self._conv_stem(inputs) + x = self._pool(x) + + x = self._conv1_0(x) + x = self._conv1_1(x) + x = self._conv1_2(x) + + x = self._conv2_0(x) + x = self._conv2_1(x) + x = self._conv2_2(x) + x = self._conv2_3(x) + + x = self._conv3_0(x) + x = self._conv3_1(x) + x = self._conv3_2(x) + x = self._conv3_3(x) + x = self._conv3_4(x) + x = self._conv3_5(x) + x = self._conv3_6(x) + x = self._conv3_7(x) + x = self._conv3_8(x) + x = self._conv3_9(x) + x = self._conv3_10(x) + x = self._conv3_11(x) + x = self._conv3_12(x) + x = self._conv3_13(x) + x = self._conv3_14(x) + x = self._conv3_15(x) + x = self._conv3_16(x) + x = self._conv3_17(x) + x = self._conv3_18(x) + x = self._conv3_19(x) + x = self._conv3_20(x) + x = self._conv3_21(x) + 
x = self._conv3_22(x) + + x = self._conv4_0(x) + x = self._conv4_1(x) + x = self._conv4_2(x) + + x = self._avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._out(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNeXt101_32x8d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=8, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x8d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x16d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=16, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x16d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x32d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x32d_wsl"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_32x48d_wsl(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt101WSL(cardinality=32, width=48, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_32x48d_wsl"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext_vd.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext_vd.py new file mode 100644 index 000000000..b2c7d7777 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/resnext_vd.py @@ -0,0 +1,319 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1611.05431 & https://arxiv.org/abs/1812.01187 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNeXt50_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_vd_32x4d_pretrained.pdparams", + "ResNeXt50_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt50_vd_64x4d_pretrained.pdparams", + "ResNeXt101_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_vd_32x4d_pretrained.pdparams", + "ResNeXt101_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt101_vd_64x4d_pretrained.pdparams", + "ResNeXt152_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_vd_32x4d_pretrained.pdparams", + "ResNeXt152_vd_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNeXt152_vd_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = 
inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class ResNeXt(nn.Layer): + def __init__(self, layers=50, class_num=1000, cardinality=32): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc_weights"), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ResNeXt50_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_vd_32x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt50_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ResNeXt50_vd_64x4d"], use_ssld=use_ssld) + return model + + +def ResNeXt101_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt101_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt101_vd_64x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt152_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt152_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def ResNeXt152_vd_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ResNeXt152_vd_64x4d"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rexnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rexnet.py new file mode 100644 index 000000000..098fb2876 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/rexnet.py @@ -0,0 +1,283 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/2007.00992 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from math import ceil + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ReXNet_1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_0_pretrained.pdparams", + "ReXNet_1_3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_3_pretrained.pdparams", + "ReXNet_1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_1_5_pretrained.pdparams", + "ReXNet_2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_2_0_pretrained.pdparams", + "ReXNet_3_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ReXNet_3_0_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def conv_bn_act(out, + in_channels, + channels, + kernel=1, + stride=1, + pad=0, + num_group=1, + active=True, + relu6=False): + out.append( + nn.Conv2D( + in_channels, + channels, + kernel, + stride, + pad, + groups=num_group, + bias_attr=False)) + out.append(nn.BatchNorm2D(channels)) + if active: + out.append(nn.ReLU6() if relu6 else nn.ReLU()) + + +def conv_bn_swish(out, + in_channels, + channels, + kernel=1, + stride=1, + pad=0, + num_group=1): + out.append( + nn.Conv2D( + in_channels, + channels, + kernel, + stride, + pad, + groups=num_group, + bias_attr=False)) + out.append(nn.BatchNorm2D(channels)) + out.append(nn.Swish()) + + +class SE(nn.Layer): + def __init__(self, in_channels, channels, se_ratio=12): + super(SE, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2D(1) + self.fc = nn.Sequential( + nn.Conv2D( + in_channels, channels // se_ratio, kernel_size=1, padding=0), + nn.BatchNorm2D(channels // se_ratio), + nn.ReLU(), + nn.Conv2D( + channels // se_ratio, channels, kernel_size=1, padding=0), + nn.Sigmoid()) + + def forward(self, x): + y = self.avg_pool(x) + y = self.fc(y) + return x * y + + +class LinearBottleneck(nn.Layer): + def __init__(self, + in_channels, + channels, + t, + stride, + use_se=True, + se_ratio=12, + **kwargs): + super(LinearBottleneck, self).__init__(**kwargs) + self.use_shortcut = stride == 1 and in_channels <= channels + self.in_channels = in_channels + self.out_channels = channels + + out = [] + if t != 1: + dw_channels = in_channels * t + conv_bn_swish(out, in_channels=in_channels, channels=dw_channels) + else: + dw_channels = in_channels + + conv_bn_act( + out, + in_channels=dw_channels, + channels=dw_channels, + kernel=3, + stride=stride, + pad=1, + num_group=dw_channels, + active=False) + + if use_se: + out.append(SE(dw_channels, dw_channels, se_ratio)) + + out.append(nn.ReLU6()) + conv_bn_act( + out, + in_channels=dw_channels, + channels=channels, + active=False, + relu6=True) + self.out = nn.Sequential(*out) + + def forward(self, x): + out = self.out(x) + if self.use_shortcut: + out[:, 0:self.in_channels] += x + + return out + + +class ReXNetV1(nn.Layer): + def __init__(self, + input_ch=16, + final_ch=180, + width_mult=1.0, + depth_mult=1.0, + class_num=1000, + use_se=True, + se_ratio=12, + dropout_ratio=0.2, + bn_momentum=0.9): + super(ReXNetV1, self).__init__() + + layers = [1, 2, 2, 3, 3, 5] + strides = [1, 2, 2, 2, 1, 2] + use_ses = [False, False, True, True, True, True] + + layers = [ceil(element * depth_mult) for element in layers] + strides = sum([[element] + [1] * (layers[idx] - 1) + for idx, element in 
enumerate(strides)], []) + if use_se: + use_ses = sum([[element] * layers[idx] + for idx, element in enumerate(use_ses)], []) + else: + use_ses = [False] * sum(layers[:]) + ts = [1] * layers[0] + [6] * sum(layers[1:]) + + self.depth = sum(layers[:]) * 3 + stem_channel = 32 / width_mult if width_mult < 1.0 else 32 + inplanes = input_ch / width_mult if width_mult < 1.0 else input_ch + + features = [] + in_channels_group = [] + channels_group = [] + + # The following channel configuration is a simple instance to make each layer become an expand layer. + for i in range(self.depth // 3): + if i == 0: + in_channels_group.append(int(round(stem_channel * width_mult))) + channels_group.append(int(round(inplanes * width_mult))) + else: + in_channels_group.append(int(round(inplanes * width_mult))) + inplanes += final_ch / (self.depth // 3 * 1.0) + channels_group.append(int(round(inplanes * width_mult))) + + conv_bn_swish( + features, + 3, + int(round(stem_channel * width_mult)), + kernel=3, + stride=2, + pad=1) + + for block_idx, (in_c, c, t, s, se) in enumerate( + zip(in_channels_group, channels_group, ts, strides, use_ses)): + features.append( + LinearBottleneck( + in_channels=in_c, + channels=c, + t=t, + stride=s, + use_se=se, + se_ratio=se_ratio)) + + pen_channels = int(1280 * width_mult) + conv_bn_swish(features, c, pen_channels) + + features.append(nn.AdaptiveAvgPool2D(1)) + self.features = nn.Sequential(*features) + self.output = nn.Sequential( + nn.Dropout(dropout_ratio), + nn.Conv2D( + pen_channels, class_num, 1, bias_attr=True)) + + def forward(self, x): + x = self.features(x) + x = self.output(x).squeeze(axis=-1).squeeze(axis=-1) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def ReXNet_1_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_0"], use_ssld=use_ssld) + return model + + +def ReXNet_1_3(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.3, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_3"], use_ssld=use_ssld) + return model + + +def ReXNet_1_5(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_1_5"], use_ssld=use_ssld) + return model + + +def ReXNet_2_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_2_0"], use_ssld=use_ssld) + return model + + +def ReXNet_3_0(pretrained=False, use_ssld=False, **kwargs): + model = ReXNetV1(width_mult=3.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ReXNet_3_0"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnet_vd.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnet_vd.py new file mode 100644 index 000000000..e08399eac --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnet_vd.py @@ -0,0 +1,392 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1812.01187 & https://arxiv.org/abs/1709.01507 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SE_ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet18_vd_pretrained.pdparams", + "SE_ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet34_vd_pretrained.pdparams", + "SE_ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNet50_vd_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__( + self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + reduction_ratio=16, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + name=name + "_branch2c") + self.scale = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=reduction_ratio, + name='fc_' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + reduction_ratio=16, + name=None): + super(BasicBlock, self).__init__() + self.stride = stride + 
self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', + name=name + "_branch2a") + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + name=name + "_branch2b") + + self.scale = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=reduction_ratio, + name='fc_' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + scale = self.scale(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class SE_ResNet_vd(nn.Layer): + def __init__(self, layers=50, class_num=1000): + super(SE_ResNet_vd, self).__init__() + + self.layers = layers + supported_layers = [18, 34, 50, 101, 152, 200] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + elif layers == 200: + depth = [3, 12, 48, 3] + num_channels = [64, 256, 512, + 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=32, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=32, + num_filters=32, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=32, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + 
num_channels=num_channels[block] + if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BasicBlock( + num_channels=num_channels[block] + if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + if_first=block == i == 0, + name=conv_name)) + self.block_list.append(basic_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=18, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet18_vd"], use_ssld=use_ssld) + return model + + +def SE_ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=34, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet34_vd"], use_ssld=use_ssld) + return model + + +def SE_ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + model = SE_ResNet_vd(layers=50, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNet50_vd"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext.py new file mode 100644 index 000000000..79556356f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext.py @@ -0,0 +1,369 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# reference: https://arxiv.org/abs/1611.05431 & https://arxiv.org/abs/1709.01507 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SE_ResNeXt50_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt50_32x4d_pretrained.pdparams", + "SE_ResNeXt101_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt101_32x4d_pretrained.pdparams", + "SE_ResNeXt152_64x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt152_64x4d_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None, + data_format='NCHW'): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False, + data_format=data_format) + bn_name = name + '_bn' + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance', + data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True, + if_first=False, + name=None, + data_format="NCHW"): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name='conv' + name + '_x1', + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name='conv' + name + '_x2', + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name='conv' + name + '_x3', + data_format=data_format) + self.scale = SELayer( + num_channels=num_filters * 2 if cardinality == 32 else num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + reduction_ratio=reduction_ratio, + name='fc' + name, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=stride, + name='conv' + name + '_prj', + data_format=data_format) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, + 
num_channels, + num_filters, + reduction_ratio, + name=None, + data_format="NCHW"): + super(SELayer, self).__init__() + + self.data_format = data_format + self.pool2d_gap = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + self.sigmoid = nn.Sigmoid() + + def forward(self, input): + pool = self.pool2d_gap(input) + if self.data_format == "NHWC": + pool = paddle.squeeze(pool, axis=[1, 2]) + else: + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = self.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = self.sigmoid(excitation) + if self.data_format == "NHWC": + excitation = paddle.unsqueeze(excitation, axis=[1, 2]) + else: + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = input * excitation + return out + + +class ResNeXt(nn.Layer): + def __init__(self, + layers=50, + class_num=1000, + cardinality=32, + input_image_channel=3, + data_format="NCHW"): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + self.reduction_ratio = 16 + self.data_format = data_format + self.input_image_channel = input_image_channel + + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + if layers < 152: + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act='relu', + name="conv1", + data_format=self.data_format) + else: + self.conv1_1 = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=3, + stride=2, + act='relu', + name="conv1", + data_format=self.data_format) + self.conv1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv2", + data_format=self.data_format) + self.conv1_3 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu', + name="conv3", + data_format=self.data_format) + + self.pool2d_max = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + n = 1 if layers == 50 or layers == 101 else 3 + for block in range(len(depth)): + n += 1 + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 
else 1, + cardinality=self.cardinality, + reduction_ratio=self.reduction_ratio, + shortcut=shortcut, + if_first=block == 0, + name=str(n) + '_' + str(i + 1), + data_format=self.data_format)) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1, data_format=self.data_format) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + with paddle.static.amp.fp16_guard(): + return self._forward(inputs) + + def _forward(self, inputs): + if self.data_format == "NHWC": + inputs = paddle.tensor.transpose(inputs, [0, 2, 3, 1]) + inputs.stop_gradient = True + if self.layers < 152: + y = self.conv(inputs) + else: + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for i, block in enumerate(self.block_list): + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNeXt50_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SE_ResNeXt50_32x4d"], use_ssld=use_ssld) + return model + + +def SE_ResNeXt101_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=101, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt101_32x4d"], + use_ssld=use_ssld) + return model + + +def SE_ResNeXt152_64x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt152_64x4d"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext_vd.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext_vd.py new file mode 100644 index 000000000..659e190c8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/se_resnext_vd.py @@ -0,0 +1,311 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
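When the ResNeXt backbone in se_resnext.py above is built with data_format="NHWC", its _forward first transposes the incoming NCHW batch to channels-last, and the whole forward runs inside paddle.static.amp.fp16_guard() so the wrapped ops can be treated as float16-eligible under static-graph AMP. A shape-only sketch of that transpose on a dummy batch, assuming nothing beyond a working paddle install:

import paddle

x = paddle.randn([8, 3, 224, 224])           # NCHW batch as produced by the dataloader
x_nhwc = paddle.transpose(x, [0, 2, 3, 1])   # same permutation used in ResNeXt._forward
print(x.shape, x_nhwc.shape)                 # [8, 3, 224, 224] [8, 224, 224, 3]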
+ +# reference: https://arxiv.org/abs/1611.05431 & https://arxiv.org/abs/1812.01187 & https://arxiv.org/abs/1709.01507 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform + +import math + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SE_ResNeXt50_vd_32x4d": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SE_ResNeXt50_vd_32x4d_pretrained.pdparams", + "SENet154_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SENet154_vd_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = name + '_bn' + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + '_scale'), + bias_attr=ParamAttr(bn_name + '_offset'), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True, + if_first=False, + name=None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act='relu', + name='conv' + name + '_x1') + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + groups=cardinality, + stride=stride, + act='relu', + name='conv' + name + '_x2') + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + filter_size=1, + act=None, + name='conv' + name + '_x3') + self.scale = SELayer( + num_channels=num_filters * 2 if cardinality == 32 else num_filters, + num_filters=num_filters * 2 if cardinality == 32 else num_filters, + reduction_ratio=reduction_ratio, + name='fc' + name) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 2 + if cardinality == 32 else num_filters, + filter_size=1, + stride=1, + is_vd_mode=False if if_first else True, + name='conv' + name + '_prj') + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=scale) + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels, num_filters, reduction_ratio, name=None): + super(SELayer, self).__init__() + + self.pool2d_gap = 
AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = Linear( + num_channels, + med_ch, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"), + bias_attr=ParamAttr(name=name + '_sqz_offset')) + self.relu = nn.ReLU() + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = Linear( + med_ch, + num_filters, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"), + bias_attr=ParamAttr(name=name + '_exc_offset')) + self.sigmoid = nn.Sigmoid() + + def forward(self, input): + pool = self.pool2d_gap(input) + pool = paddle.squeeze(pool, axis=[2, 3]) + squeeze = self.squeeze(pool) + squeeze = self.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = self.sigmoid(excitation) + excitation = paddle.unsqueeze(excitation, axis=[2, 3]) + out = paddle.multiply(input, excitation) + return out + + +class ResNeXt(nn.Layer): + def __init__(self, layers=50, class_num=1000, cardinality=32): + super(ResNeXt, self).__init__() + + self.layers = layers + self.cardinality = cardinality + self.reduction_ratio = 16 + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format( + supported_layers, layers) + supported_cardinality = [32, 64] + assert cardinality in supported_cardinality, \ + "supported cardinality is {} but input cardinality is {}" \ + .format(supported_cardinality, cardinality) + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [128, 256, 512, 1024] + num_filters = [128, 256, 512, + 1024] if cardinality == 32 else [256, 512, 1024, 2048] + + self.conv1_1 = ConvBNLayer( + num_channels=3, + num_filters=64, + filter_size=3, + stride=2, + act='relu', + name="conv1_1") + self.conv1_2 = ConvBNLayer( + num_channels=64, + num_filters=64, + filter_size=3, + stride=1, + act='relu', + name="conv1_2") + self.conv1_3 = ConvBNLayer( + num_channels=64, + num_filters=128, + filter_size=3, + stride=1, + act='relu', + name="conv1_3") + + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1) + + self.block_list = [] + n = 1 if layers == 50 or layers == 101 else 3 + for block in range(len(depth)): + n += 1 + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else + num_filters[block] * int(64 // self.cardinality), + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=self.cardinality, + reduction_ratio=self.reduction_ratio, + shortcut=shortcut, + if_first=block == 0, + name=str(n) + '_' + str(i + 1))) + self.block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = AdaptiveAvgPool2D(1) + + self.pool2d_avg_channels = num_channels[-1] * 2 + + stdv = 1.0 / math.sqrt(self.pool2d_avg_channels * 1.0) + + self.out = Linear( + self.pool2d_avg_channels, + class_num, + weight_attr=ParamAttr( + initializer=Uniform(-stdv, stdv), name="fc6_weights"), + bias_attr=ParamAttr(name="fc6_offset")) + + def forward(self, inputs): + y = self.conv1_1(inputs) + y = self.conv1_2(y) + y = self.conv1_3(y) + y = self.pool2d_max(y) + for block in self.block_list: + y = block(y) + y = self.pool2d_avg(y) + y = paddle.reshape(y, shape=[-1, self.pool2d_avg_channels]) + y = self.out(y) + return y + + 
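The SELayer defined above is the standard squeeze-and-excitation gate: a global average pool, a bottleneck pair of Linear layers (reduction_ratio=16), a sigmoid, and a per-channel rescale of the block output. A functional sketch of the same computation with plain paddle ops; the batch size, channel count, and spatial size are illustrative:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

N, C, r = 2, 256, 16
x = paddle.randn([N, C, 7, 7])

squeeze = nn.Linear(C, C // r)        # plays the role of SELayer.squeeze
excite = nn.Linear(C // r, C)         # plays the role of SELayer.excitation

s = paddle.squeeze(F.adaptive_avg_pool2d(x, 1), axis=[2, 3])   # [N, C] pooled descriptor
w = F.sigmoid(excite(F.relu(squeeze(s))))                      # [N, C] channel weights in (0, 1)
out = x * paddle.unsqueeze(w, axis=[2, 3])                     # [N, C, 7, 7] channel-wise rescale
print(out.shape)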
+def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SE_ResNeXt50_vd_32x4d(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=50, cardinality=32, **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SE_ResNeXt50_vd_32x4d"], + use_ssld=use_ssld) + return model + + +def SENet154_vd(pretrained=False, use_ssld=False, **kwargs): + model = ResNeXt(layers=152, cardinality=64, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SENet154_vd"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/shufflenet_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/shufflenet_v2.py new file mode 100644 index 000000000..e1058bb08 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/shufflenet_v2.py @@ -0,0 +1,364 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
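The "_vd" variant above downsamples its projection shortcut differently from the plain version: when is_vd_mode is set, ConvBNLayer applies a 2x2 average pool (stride 2, ceil_mode=True) and then a stride-1 1x1 convolution, instead of a strided 1x1 convolution. A shape-level sketch of that shortcut path; the channel counts below correspond to one possible stage transition and are illustrative:

import paddle
import paddle.nn as nn

x = paddle.randn([1, 256, 56, 56])

# vd-style projection shortcut: pool first, then a stride-1 1x1 conv
pool = nn.AvgPool2D(kernel_size=2, stride=2, padding=0, ceil_mode=True)
proj = nn.Conv2D(256, 512, kernel_size=1, stride=1, bias_attr=False)

short = proj(pool(x))
print(short.shape)   # [1, 512, 28, 28]; the spatial halving is done by the pool, not the conv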
+ +# reference: https://arxiv.org/abs/1807.11164 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import ParamAttr, reshape, transpose, concat, split +from paddle.nn import Layer, Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm, Linear +from paddle.nn.initializer import KaimingNormal +from paddle.nn.functional import swish + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ShuffleNetV2_x0_25": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_25_pretrained.pdparams", + "ShuffleNetV2_x0_33": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_33_pretrained.pdparams", + "ShuffleNetV2_x0_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x0_5_pretrained.pdparams", + "ShuffleNetV2_x1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_0_pretrained.pdparams", + "ShuffleNetV2_x1_5": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x1_5_pretrained.pdparams", + "ShuffleNetV2_x2_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_x2_0_pretrained.pdparams", + "ShuffleNetV2_swish": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ShuffleNetV2_swish_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def channel_shuffle(x, groups): + batch_size, num_channels, height, width = x.shape[0:4] + channels_per_group = num_channels // groups + + # reshape + x = reshape( + x=x, shape=[batch_size, groups, channels_per_group, height, width]) + + # transpose + x = transpose(x=x, perm=[0, 2, 1, 3, 4]) + + # flatten + x = reshape(x=x, shape=[batch_size, num_channels, height, width]) + return x + + +class ConvBNLayer(Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + groups=1, + act=None, + name=None, ): + super(ConvBNLayer, self).__init__() + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=ParamAttr( + initializer=KaimingNormal(), name=name + "_weights"), + bias_attr=False) + + self._batch_norm = BatchNorm( + out_channels, + param_attr=ParamAttr(name=name + "_bn_scale"), + bias_attr=ParamAttr(name=name + "_bn_offset"), + act=act, + moving_mean_name=name + "_bn_mean", + moving_variance_name=name + "_bn_variance") + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class InvertedResidual(Layer): + def __init__(self, + in_channels, + out_channels, + stride, + act="relu", + name=None): + super(InvertedResidual, self).__init__() + self._conv_pw = ConvBNLayer( + in_channels=in_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1, x2 = split( + inputs, + num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], + axis=1) + x2 = self._conv_pw(x2) + x2 = 
self._conv_dw(x2) + x2 = self._conv_linear(x2) + out = concat([x1, x2], axis=1) + return channel_shuffle(out, 2) + + +class InvertedResidualDS(Layer): + def __init__(self, + in_channels, + out_channels, + stride, + act="relu", + name=None): + super(InvertedResidualDS, self).__init__() + + # branch1 + self._conv_dw_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + stride=stride, + padding=1, + groups=in_channels, + act=None, + name='stage_' + name + '_conv4') + self._conv_linear_1 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv5') + # branch2 + self._conv_pw_2 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv1') + self._conv_dw_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=3, + stride=stride, + padding=1, + groups=out_channels // 2, + act=None, + name='stage_' + name + '_conv2') + self._conv_linear_2 = ConvBNLayer( + in_channels=out_channels // 2, + out_channels=out_channels // 2, + kernel_size=1, + stride=1, + padding=0, + groups=1, + act=act, + name='stage_' + name + '_conv3') + + def forward(self, inputs): + x1 = self._conv_dw_1(inputs) + x1 = self._conv_linear_1(x1) + x2 = self._conv_pw_2(inputs) + x2 = self._conv_dw_2(x2) + x2 = self._conv_linear_2(x2) + out = concat([x1, x2], axis=1) + + return channel_shuffle(out, 2) + + +class ShuffleNet(Layer): + def __init__(self, class_num=1000, scale=1.0, act="relu"): + super(ShuffleNet, self).__init__() + self.scale = scale + self.class_num = class_num + stage_repeats = [4, 8, 4] + + if scale == 0.25: + stage_out_channels = [-1, 24, 24, 48, 96, 512] + elif scale == 0.33: + stage_out_channels = [-1, 24, 32, 64, 128, 512] + elif scale == 0.5: + stage_out_channels = [-1, 24, 48, 96, 192, 1024] + elif scale == 1.0: + stage_out_channels = [-1, 24, 116, 232, 464, 1024] + elif scale == 1.5: + stage_out_channels = [-1, 24, 176, 352, 704, 1024] + elif scale == 2.0: + stage_out_channels = [-1, 24, 244, 488, 976, 2048] + else: + raise NotImplementedError("This scale size:[" + str(scale) + + "] is not implemented!") + # 1. conv1 + self._conv1 = ConvBNLayer( + in_channels=3, + out_channels=stage_out_channels[1], + kernel_size=3, + stride=2, + padding=1, + act=act, + name='stage1_conv') + self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + # 2. bottleneck sequences + self._block_list = [] + for stage_id, num_repeat in enumerate(stage_repeats): + for i in range(num_repeat): + if i == 0: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidualDS( + in_channels=stage_out_channels[stage_id + 1], + out_channels=stage_out_channels[stage_id + 2], + stride=2, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + else: + block = self.add_sublayer( + name=str(stage_id + 2) + '_' + str(i + 1), + sublayer=InvertedResidual( + in_channels=stage_out_channels[stage_id + 2], + out_channels=stage_out_channels[stage_id + 2], + stride=1, + act=act, + name=str(stage_id + 2) + '_' + str(i + 1))) + self._block_list.append(block) + # 3. last_conv + self._last_conv = ConvBNLayer( + in_channels=stage_out_channels[-2], + out_channels=stage_out_channels[-1], + kernel_size=1, + stride=1, + padding=0, + act=act, + name='conv5') + # 4. 
pool + self._pool2d_avg = AdaptiveAvgPool2D(1) + self._out_c = stage_out_channels[-1] + # 5. fc + self._fc = Linear( + stage_out_channels[-1], + class_num, + weight_attr=ParamAttr(name='fc6_weights'), + bias_attr=ParamAttr(name='fc6_offset')) + + def forward(self, inputs): + y = self._conv1(inputs) + y = self._max_pool(y) + for inv in self._block_list: + y = inv(y) + y = self._last_conv(y) + y = self._pool2d_avg(y) + y = paddle.flatten(y, start_axis=1, stop_axis=-1) + y = self._fc(y) + return y + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ShuffleNetV2_x0_25(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.25, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_25"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x0_33(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.33, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_33"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x0_5(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=0.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x0_5"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x1_0(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x1_0"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x1_5(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.5, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x1_5"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_x2_0(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=2.0, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_x2_0"], use_ssld=use_ssld) + return model + + +def ShuffleNetV2_swish(pretrained=False, use_ssld=False, **kwargs): + model = ShuffleNet(scale=1.0, act="swish", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["ShuffleNetV2_swish"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/squeezenet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/squeezenet.py new file mode 100644 index 000000000..5edfaa0ad --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/squeezenet.py @@ -0,0 +1,196 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
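channel_shuffle in the ShuffleNetV2 file above interleaves the two concatenated branch outputs with a reshape, transpose, reshape sequence on the channel axis. A tiny sketch that makes the interleaving visible on a 1x4x1x1 tensor; the values are illustrative:

import paddle

def channel_shuffle(x, groups):
    # same reshape / transpose / flatten sequence as in shufflenet_v2.py
    n, c, h, w = x.shape[0:4]
    x = paddle.reshape(x, [n, groups, c // groups, h, w])
    x = paddle.transpose(x, [0, 2, 1, 3, 4])
    return paddle.reshape(x, [n, c, h, w])

x = paddle.to_tensor([0., 1., 2., 3.]).reshape([1, 4, 1, 1])
print(channel_shuffle(x, 2).flatten().tolist())   # [0.0, 2.0, 1.0, 3.0]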
+ +# reference: https://arxiv.org/abs/1709.01507 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SqueezeNet1_0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_0_pretrained.pdparams", + "SqueezeNet1_1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SqueezeNet1_1_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +class MakeFireConv(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + padding=0, + name=None): + super(MakeFireConv, self).__init__() + self._conv = Conv2D( + input_channels, + output_channels, + filter_size, + padding=padding, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=ParamAttr(name=name + "_offset")) + + def forward(self, x): + x = self._conv(x) + x = F.relu(x) + return x + + +class MakeFire(nn.Layer): + def __init__(self, + input_channels, + squeeze_channels, + expand1x1_channels, + expand3x3_channels, + name=None): + super(MakeFire, self).__init__() + self._conv = MakeFireConv( + input_channels, squeeze_channels, 1, name=name + "_squeeze1x1") + self._conv_path1 = MakeFireConv( + squeeze_channels, expand1x1_channels, 1, name=name + "_expand1x1") + self._conv_path2 = MakeFireConv( + squeeze_channels, + expand3x3_channels, + 3, + padding=1, + name=name + "_expand3x3") + + def forward(self, inputs): + x = self._conv(inputs) + x1 = self._conv_path1(x) + x2 = self._conv_path2(x) + return paddle.concat([x1, x2], axis=1) + + +class SqueezeNet(nn.Layer): + def __init__(self, version, class_num=1000): + super(SqueezeNet, self).__init__() + self.version = version + + if self.version == "1.0": + self._conv = Conv2D( + 3, + 96, + 7, + stride=2, + weight_attr=ParamAttr(name="conv1_weights"), + bias_attr=ParamAttr(name="conv1_offset")) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv1 = MakeFire(96, 16, 64, 64, name="fire2") + self._conv2 = MakeFire(128, 16, 64, 64, name="fire3") + self._conv3 = MakeFire(128, 32, 128, 128, name="fire4") + + self._conv4 = MakeFire(256, 32, 128, 128, name="fire5") + self._conv5 = MakeFire(256, 48, 192, 192, name="fire6") + self._conv6 = MakeFire(384, 48, 192, 192, name="fire7") + self._conv7 = MakeFire(384, 64, 256, 256, name="fire8") + + self._conv8 = MakeFire(512, 64, 256, 256, name="fire9") + else: + self._conv = Conv2D( + 3, + 64, + 3, + stride=2, + padding=1, + weight_attr=ParamAttr(name="conv1_weights"), + bias_attr=ParamAttr(name="conv1_offset")) + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=0) + self._conv1 = MakeFire(64, 16, 64, 64, name="fire2") + self._conv2 = MakeFire(128, 16, 64, 64, name="fire3") + + self._conv3 = MakeFire(128, 32, 128, 128, name="fire4") + self._conv4 = MakeFire(256, 32, 128, 128, name="fire5") + + self._conv5 = MakeFire(256, 48, 192, 192, name="fire6") + self._conv6 = MakeFire(384, 48, 192, 192, name="fire7") + self._conv7 = MakeFire(384, 64, 256, 256, name="fire8") + self._conv8 = MakeFire(512, 64, 256, 256, name="fire9") + + self._drop = Dropout(p=0.5, mode="downscale_in_infer") + self._conv9 = Conv2D( + 512, + class_num, + 1, + weight_attr=ParamAttr(name="conv10_weights"), + bias_attr=ParamAttr(name="conv10_offset")) + self._avg_pool = AdaptiveAvgPool2D(1) + + def forward(self, inputs): + x = self._conv(inputs) + x = F.relu(x) + 
x = self._pool(x) + if self.version == "1.0": + x = self._conv1(x) + x = self._conv2(x) + x = self._conv3(x) + x = self._pool(x) + x = self._conv4(x) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._pool(x) + x = self._conv8(x) + else: + x = self._conv1(x) + x = self._conv2(x) + x = self._pool(x) + x = self._conv3(x) + x = self._conv4(x) + x = self._pool(x) + x = self._conv5(x) + x = self._conv6(x) + x = self._conv7(x) + x = self._conv8(x) + x = self._drop(x) + x = self._conv9(x) + x = F.relu(x) + x = self._avg_pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def SqueezeNet1_0(pretrained=False, use_ssld=False, **kwargs): + model = SqueezeNet(version="1.0", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SqueezeNet1_0"], use_ssld=use_ssld) + return model + + +def SqueezeNet1_1(pretrained=False, use_ssld=False, **kwargs): + model = SqueezeNet(version="1.1", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["SqueezeNet1_1"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/starnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/starnet.py new file mode 100644 index 000000000..4832be783 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/starnet.py @@ -0,0 +1,197 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
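Each MakeFire block in the SqueezeNet file above is a fire module: a 1x1 squeeze convolution followed by parallel 1x1 and 3x3 expand convolutions whose outputs are concatenated, so a block emits expand1x1 + expand3x3 channels. A shape-only sketch using the fire2 sizes (16, 64, 64); the 54x54 input resolution assumes the version 1.0 stem on a 224x224 image:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

x = paddle.randn([1, 96, 54, 54])           # feature map entering fire2 in SqueezeNet 1.0

squeeze = nn.Conv2D(96, 16, 1)              # fire2 squeeze1x1
expand1 = nn.Conv2D(16, 64, 1)              # fire2 expand1x1
expand3 = nn.Conv2D(16, 64, 3, padding=1)   # fire2 expand3x3

s = F.relu(squeeze(x))
y = paddle.concat([F.relu(expand1(s)), F.relu(expand3(s))], axis=1)
print(y.shape)   # [1, 128, 54, 54], which matches the 128-channel input of fire3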
+ +# reference: https://arxiv.org/abs/2403.19967 + +import paddle +import paddle.nn as nn + +from ....utils.save_load import load_dygraph_pretrain +from ..model_zoo.vision_transformer import DropPath + +MODEL_URLS = { + "StarNet_S1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S1_pretrained.pdparams", + "StarNet_S2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S2_pretrained.pdparams", + "StarNet_S3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S3_pretrained.pdparams", + "StarNet_S4": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/StarNet_S4_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() + +NET_CONFIG = { + "StarNet_S1": [24, [2, 2, 8, 3]], + "StarNet_S2": [32, [1, 2, 6, 2]], + "StarNet_S3": [32, [2, 2, 8, 4]], + "StarNet_S4": [32, [3, 3, 12, 5]], +} + + +class ConvBN(nn.Sequential): + def __init__(self, + in_planes, + out_planes, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + with_bn=True): + super().__init__() + self.add_sublayer( + name='conv', + sublayer=nn.Conv2D( + in_channels=in_planes, + out_channels=out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups)) + if with_bn: + self.add_sublayer( + name='bn', sublayer=nn.BatchNorm2D(num_features=out_planes)) + init_Constant = nn.initializer.Constant(value=1) + init_Constant(self.bn.weight) + init_Constant = nn.initializer.Constant(value=0) + init_Constant(self.bn.bias) + + +class Block(nn.Layer): + def __init__(self, dim, mlp_ratio=3, drop_path=0.0): + super().__init__() + self.dwconv = ConvBN( + dim, dim, 7, 1, (7 - 1) // 2, groups=dim, with_bn=True) + self.f1 = ConvBN(dim, mlp_ratio * dim, 1, with_bn=False) + self.f2 = ConvBN(dim, mlp_ratio * dim, 1, with_bn=False) + self.g = ConvBN(mlp_ratio * dim, dim, 1, with_bn=True) + self.dwconv2 = ConvBN( + dim, dim, 7, 1, (7 - 1) // 2, groups=dim, with_bn=False) + self.act = nn.ReLU6() + self.drop_path = (DropPath(drop_path) + if drop_path > 0. else nn.Identity()) + + def forward(self, x): + input = x + x = self.dwconv(x) + x1, x2 = self.f1(x), self.f2(x) + x = self.act(x1) * x2 + x = self.dwconv2(self.g(x)) + x = input + self.drop_path(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError("pretrained type is not available. ") + + +class StarNet(nn.Layer): + """ + StarNet: StarNet for Image Classification + Args: + base_dim: int, base dimension of the model, default 32. + depths: list, number of blocks in each stage, default [3, 3, 12, 5]. + mlp_ratio: int, ratio of hidden dim to mlp_dim, default 4. + drop_path_rate: float, default 0.0, stochastic depth rate. + class_num: int, default 1000, number of classes. 
+ """ + def __init__(self, + base_dim=32, + depths=[3, 3, 12, 5], + mlp_ratio=4, + drop_path_rate=0.0, + class_num=1000, + **kwargs): + super().__init__() + self.class_num = class_num + self.in_channel = 32 + self.stem = nn.Sequential( + ConvBN( + 3, self.in_channel, kernel_size=3, stride=2, padding=1), + nn.ReLU6()) + dpr = [ + x.item() + for x in paddle.linspace( + start=0, stop=drop_path_rate, num=sum(depths)) + ] + self.stages = nn.LayerList() + cur = 0 + for i_layer in range(len(depths)): + embed_dim = base_dim * 2**i_layer + down_sampler = ConvBN(self.in_channel, embed_dim, 3, 2, 1) + self.in_channel = embed_dim + blocks = [ + Block(self.in_channel, mlp_ratio, dpr[cur + i]) + for i in range(depths[i_layer]) + ] + cur += depths[i_layer] + self.stages.append(nn.Sequential(down_sampler, *blocks)) + self.norm = nn.BatchNorm2D(num_features=self.in_channel) + self.avgpool = nn.AdaptiveAvgPool2D(output_size=1) + self.head = nn.Linear( + in_features=self.in_channel, out_features=class_num) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear or nn.Conv2D): + pass + if isinstance(m, nn.Linear) and m.bias is not None: + init_Constant = nn.initializer.Constant(value=0) + init_Constant(m.bias) + elif isinstance(m, nn.LayerNorm or nn.BatchNorm2D): + init_Constant = nn.initializer.Constant(value=0) + init_Constant(m.bias) + init_Constant = nn.initializer.Constant(value=1.0) + init_Constant(m.weight) + + def forward(self, x): + x = self.stem(x) + for stage in self.stages: + x = stage(x) + x = paddle.flatten(x=self.avgpool(self.norm(x)), start_axis=1) + return self.head(x) + + +def StarNet_S1(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S1"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S1"], use_ssld) + return model + + +def StarNet_S2(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S2"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S2"], use_ssld) + return model + + +def StarNet_S3(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S3"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S3"], use_ssld) + return model + + +def StarNet_S4(pretrained=False, use_ssld=False, **kwargs): + model = StarNet(*NET_CONFIG["StarNet_S4"], **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["StarNet_S4"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/svtrnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/svtrnet.py new file mode 100644 index 000000000..1ee4ab9fc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/svtrnet.py @@ -0,0 +1,699 @@ +from paddle import ParamAttr +from paddle.nn.initializer import KaimingNormal +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +from paddle.nn import functional as F + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + +def resize_pos_embed(pos_embed, + src_shape, + dst_shape, + mode='bicubic', + num_extra_tokens=1): + """Resize pos_embed weights. + + Args: + pos_embed (paddle.Tensor): Position embedding weights with shape + [1, L, C]. + src_shape (tuple): The resolution of downsampled origin training + image, in format (H, W). 
+ dst_shape (tuple): The resolution of downsampled new training + image, in format (H, W). + mode (str): Algorithm used for upsampling. Choose one from 'nearest', + 'linear', 'bilinear', 'bicubic' and 'trilinear'. + Defaults to 'bicubic'. + num_extra_tokens (int): The number of extra tokens, such as cls_token. + Defaults to 1. + + Returns: + paddle.Tensor: The resized pos_embed of shape [1, L_new, C] + """ + if src_shape[0] == dst_shape[0] and src_shape[1] == dst_shape[1]: + return pos_embed + assert pos_embed.ndim == 3, 'shape of pos_embed must be [1, L, C]' + _, L, C = pos_embed.shape + src_h, src_w = src_shape + assert L == src_h * src_w + num_extra_tokens, \ + f"The length of `pos_embed` ({L}) doesn't match the expected " \ + f'shape ({src_h}*{src_w}+{num_extra_tokens}). Please check the' \ + '`img_size` argument.' + extra_tokens = pos_embed[:, :num_extra_tokens] + + src_weight = pos_embed[:, num_extra_tokens:] + src_weight = src_weight.reshape([-1, src_h, src_w, C]).transpose( + [0, 3, 1, 2]) + + # The cubic interpolate algorithm only accepts float32 + dst_weight = F.interpolate( + paddle.cast(src_weight, paddle.float32), + size=dst_shape, + align_corners=False, + mode=mode) + dst_weight = paddle.flatten(dst_weight, 2).transpose([0, 2, 1]) + dst_weight = paddle.cast(dst_weight, src_weight.dtype) + + return paddle.concat((extra_tokens, dst_weight), axis=1) + +def pading_for_not_divisible(pixel_values, + height, + width, + patch_size, + format="NCHW", + function="split"): + if isinstance(patch_size, int): + patch_size = (patch_size, patch_size) + if height % patch_size[0] == 0 and width % patch_size[1] == 0: + return pixel_values, (0, 0, 0, 0, 0, 0, 0, 0) + if function == "split": + pading_width = patch_size[1] - width % patch_size[1] + pading_height = patch_size[0] - height % patch_size[0] + elif function == "merge": + pading_width = width % 2 + pading_height = height % 2 + if format == "NCHW": + pad_index = [0, 0, 0, 0, 0, pading_height, 0, pading_width] + elif format == "NHWC": + pad_index = [0, 0, 0, pading_height, 0, pading_width, 0, 0] + else: + assert ("vaild format") + + return F.pad(pixel_values, pad_index), pad_index + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. 
or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class ConvBNLayer(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform()), + bias_attr=bias_attr) + self.norm = nn.BatchNorm2D(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ConvMixer(nn.Layer): + def __init__( + self, + dim, + num_heads=8, + HW=[8, 25], + local_k=[3, 3], ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2D( + dim, + dim, + local_k, + 1, [local_k[0] // 2, local_k[1] // 2], + groups=num_heads, + weight_attr=ParamAttr(initializer=KaimingNormal())) + + def forward(self, x, input_dimension): + h, w = input_dimension + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + mixer='Global', + HW=None, + local_k=[7, 11], + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + self.dim = dim + self.head_dim = dim // num_heads + self.scale = qk_scale or self.head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + self.local_k = local_k + self.mixer = mixer + def get_mask(self,input_dimension): + if self.HW is not None: + H = input_dimension[0] + W = input_dimension[1] + self.N = H * W + self.C = self.dim + if self.mixer == 'Local' and self.HW is not None: + hk = self.local_k[0] + wk = self.local_k[1] + mask = paddle.ones( + [H * W, H + hk - 1, W + wk - 1], dtype='float32') + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h:h + hk, w:w + wk] = 0. 
+ mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // + 2].flatten(1) + mask_inf = paddle.full([H * W, H * W], '-inf', dtype='float32') + mask = paddle.where(mask_paddle < 1, mask_paddle, mask_inf) + return mask + return None + def forward(self, x, input_dimension): + qkv = self.qkv(x).reshape( + (0, -1, 3, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) + if self.mixer == 'Local': + attn += self.get_mask(input_dimension) + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((0, -1, self.dim)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mixer='Global', + local_mixer=[7, 11], + HW=None, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-6, + prenorm=True): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == 'Global' or mixer == 'Local': + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + elif mixer == 'Conv': + self.mixer = ConvMixer( + dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.prenorm = prenorm + + def forward(self, x, input_dimension): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x,input_dimension))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x),input_dimension)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=[32, 100], + in_channels=3, + embed_dim=768, + sub_num=2, + patch_size=[4, 4], + mode='pope'): + super().__init__() + num_patches = (img_size[1] // (2 ** sub_num)) * \ + (img_size[0] // (2 ** sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.patch_size = patch_size + self.window_size = ((img_size[0] // (2 ** sub_num), (img_size[1] // (2 ** sub_num)))) + self.norm = None + if mode == 'pope': + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, 
+ bias_attr=None), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=None)) + elif mode == 'linear': + self.proj = nn.Conv2D( + 1, embed_dim, kernel_size=patch_size, stride=patch_size) + self.num_patches = img_size[0] // patch_size[0] * img_size[ + 1] // patch_size[1] + + def forward(self, x): + B, C, H, W = x.shape + + x, _ = pading_for_not_divisible(x, H, W, self.patch_size, "BCHW") + x = self.proj(x) + _, _, height, width = x.shape + output_dimensions = (height, width) + x = x.flatten(2).transpose((0, 2, 1)) + return x, output_dimensions + + +class SubSample(nn.Layer): + def __init__(self, + in_channels, + out_channels, + types='Pool', + stride=[2, 1], + sub_norm='nn.LayerNorm', + act=None): + super().__init__() + self.types = types + if types == 'Pool': + self.avgpool = nn.AvgPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.maxpool = nn.MaxPool2D( + kernel_size=[3, 5], stride=stride, padding=[1, 2]) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2D( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + weight_attr=ParamAttr(initializer=KaimingNormal())) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + + if self.types == 'Pool': + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + output_dimension = (x.shape[2],x.shape[3]) + out = self.proj(x.flatten(2).transpose((0, 2, 1))) + else: + x = self.conv(x) + output_dimension = (x.shape[2],x.shape[3]) + out = x.flatten(2).transpose((0, 2, 1)) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out, output_dimension + + +class SVTRNet(nn.Layer): + def __init__( + self, + class_num=1000, + img_size=[48, 320], + in_channels=3, + embed_dim=[192, 256, 512], + depth=[6, 6, 9], + num_heads=[6, 8, 16], + mixer=['Conv'] * 9 + ['Global'] * + 12, # Local atten, Global atten, Conv + local_mixer=[[5, 5], [5, 5], [5, 5]], + patch_merging='Conv', # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + last_drop=0.1, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + sub_norm='nn.LayerNorm', + epsilon=1e-6, + out_channels=512, + out_char_num=40, + block_unit='Block', + act='nn.GELU', + last_stage=False, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num) + num_patches = self.patch_embed.num_patches + self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = self.create_parameter( + shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.LayerList([ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0:depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, 
+ drop_path=dpr[0:depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[0]) + ]) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = patch_merging + self.blocks2 = nn.LayerList([ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0]:depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0]:depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[1]) + ]) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.LayerList([ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1]:][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1]:][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[2]) + ]) + self.flatten = nn.Flatten(start_axis=0, stop_axis=1) + self.fc = nn.Linear(embed_dim[2], class_num) + + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2D([1, out_char_num]) + self.last_conv = nn.Conv2D( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias_attr=False) + self.hardswish = nn.Hardswish() + self.dropout = nn.Dropout(p=last_drop, mode="downscale_in_infer") + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = nn.Hardswish() + self.dropout_len = nn.Dropout( + p=last_drop, mode="downscale_in_infer") + + trunc_normal_(self.pos_embed) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x,output_dimensions = self.patch_embed(x) + x = x + resize_pos_embed(self.pos_embed,self.patch_embed.window_size,output_dimensions,num_extra_tokens=0) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x, output_dimensions) + if self.patch_merging is not None: + x, output_dimensions = self.sub_sample1( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[0], output_dimensions[0], output_dimensions[1]])) + for blk in self.blocks2: + x = blk(x, output_dimensions) + if self.patch_merging is not None: + x, output_dimensions = self.sub_sample2( + x.transpose([0, 2, 1]).reshape( + [0, self.embed_dim[1], output_dimensions[0], output_dimensions[1]])) + for blk in self.blocks3: + x = blk(x, output_dimensions) + if not self.prenorm: + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = x.mean(1) + x = self.fc(x) + return x + + +def SVTR_tiny(pretrained=False, use_ssld=False, **kwargs): + model = 
SVTRNet( + img_size=[48, 320], + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=['Conv'] * 6 + ['Global'] * 6, + local_mixer=[[5, 5], [5, 5], [5, 5]], + mlp_ratio=4, + qkv_bias=True, + out_channels=256, + out_char_num=40, + epsilon=1e-6, + **kwargs) + return model + + +def SVTR_base(pretrained=False, use_ssld=False, **kwargs): + model = SVTRNet( + img_size=[48, 320], + embed_dim=[128, 256, 384], + depth=[6, 6, 6], + num_heads=[4, 8, 12], + mixer=['Conv'] * 9 + ['Global'] * 12, + local_mixer=[[5, 5], [5, 5], [5, 5]], + mlp_ratio=4, + qkv_bias=True, + out_channels=384, + out_char_num=40, + epsilon=1e-6, + **kwargs) + return model + + +def SVTR_large(pretrained=False, use_ssld=False, **kwargs): + model = SVTRNet( + img_size=[48, 320], + embed_dim=[192, 256, 512], + depth=[6, 6, 9], + num_heads=[6, 8, 16], + mixer=['Conv'] * 9 + ['Global'] * 12, + local_mixer=[[5, 5], [5, 5], [5, 5]], + mlp_ratio=4, + qkv_bias=True, + out_channels=512, + out_char_num=40, + epsilon=1e-6, + **kwargs) + return model \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/swin_transformer_v2.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/swin_transformer_v2.py new file mode 100644 index 000000000..d2f89446a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/swin_transformer_v2.py @@ -0,0 +1,1061 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
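In the SVTRNet definition above, the mixer list is consumed stage by stage (the first depth[0] entries, then the next depth[1], then the rest), and the per-block stochastic-depth rates come from np.linspace(0, drop_path_rate, sum(depth)). A small sketch of that bookkeeping for the SVTR_tiny configuration, where depth=[3, 6, 3] and mixer=['Conv'] * 6 + ['Global'] * 6:

import numpy as np

depth = [3, 6, 3]
mixer = ['Conv'] * 6 + ['Global'] * 6
assert len(mixer) == sum(depth)                 # one mixer entry per block

stage1 = mixer[0:depth[0]]                      # ['Conv', 'Conv', 'Conv']
stage2 = mixer[depth[0]:depth[0] + depth[1]]    # ['Conv'] * 3 + ['Global'] * 3
stage3 = mixer[depth[0] + depth[1]:]            # ['Global', 'Global', 'Global']

dpr = np.linspace(0, 0.1, sum(depth))           # default drop_path_rate=0.1 spread over 12 blocks
print(stage1, stage2, stage3)
print(dpr.round(3))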
+ +# Code was based on https://github.com/microsoft/Swin-Transformer +# reference: https://arxiv.org/abs/2111.09883 + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import TruncatedNormal, Constant, Normal +import numpy as np +import math + +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity +from ..base.theseus_layer import TheseusLayer +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "SwinTransformerV2_tiny_patch4_window8_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_tiny_patch4_window8_256_pretrained.pdparams", + "SwinTransformerV2_tiny_patch4_window16_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_tiny_patch4_window16_256_pretrained.pdparams", + "SwinTransformerV2_small_patch4_window8_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_small_patch4_window8_256_pretrained.pdparams", + "SwinTransformerV2_small_patch4_window16_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_small_patch4_window16_256_pretrained.pdparams", + "SwinTransformerV2_base_patch4_window8_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_base_patch4_window8_256_pretrained.pdparams", + "SwinTransformerV2_base_patch4_window16_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_base_patch4_window16_256_pretrained.pdparams", + "SwinTransformerV2_base_patch4_window24_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_base_patch4_window24_384_pretrained.pdparams", + "SwinTransformerV2_large_patch4_window16_256": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_large_patch4_window16_256_pretrained.pdparams", + "SwinTransformerV2_large_patch4_window24_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/SwinTransformerV2_large_patch4_window24_384_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def pading_for_not_divisible(pixel_values, + height, + width, + patch_size, + format="BCHW", + function="split"): + if isinstance(patch_size, int): + patch_size = (patch_size, patch_size) + if function == "split": + pading_width = patch_size[1] - width % patch_size[1] + pading_height = patch_size[0] - height % patch_size[0] + elif function == "merge": + pading_width = width % 2 + pading_height = height % 2 + if format == "BCHW": + pad_index = (0, 0, 0, 0, 0, pading_height, 0, pading_width) + elif format == "BHWC": + pad_index = (0, 0, 0, pading_height, 0, pading_width, 0, 0) + else: + assert ("vaild format") + + return F.pad(pixel_values, 
pad_index), pad_index + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose(perm=[0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def pad_patch(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.reshape( + [B, H // window_size, window_size, W // window_size, window_size, C]) + windows = x.transpose(perm=[0, 1, 3, 2, 4, 5]).reshape( + [-1, window_size, window_size, C]) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + C = windows.shape[-1] + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.reshape( + [-1, H // window_size, W // window_size, window_size, window_size, C]) + x = x.transpose(perm=[0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) + return x + + +class WindowAttention(nn.Layer): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + pretrained_window_size (tuple[int]): The height and width of the window in pre-training. 
+ """ + + def __init__(self, + dim, + window_size, + num_heads, + qkv_bias=True, + attn_drop=0., + proj_drop=0., + pretrained_window_size=[0, 0]): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.pretrained_window_size = pretrained_window_size + self.num_heads = num_heads + + self.logit_scale = self.create_parameter( + [num_heads, 1, 1], + dtype='float32', + default_initializer=Constant(math.log(10.))) + + # mlp to generate continuous relative position bias + self.cpb_mlp = nn.Sequential( + nn.Linear( + 2, 512, bias_attr=True), + nn.ReLU(), + nn.Linear( + 512, num_heads, bias_attr=False)) + + # get relative_coords_table + relative_coords_h = paddle.arange( + -(self.window_size[0] - 1), self.window_size[0], dtype='float32') + relative_coords_w = paddle.arange( + -(self.window_size[1] - 1), self.window_size[1], dtype='float32') + relative_coords_table = paddle.stack( + paddle.meshgrid([relative_coords_h, relative_coords_w])).transpose( + perm=[1, 2, 0]).unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2 + if pretrained_window_size[0] > 0: + relative_coords_table[:, :, :, 0] /= ( + pretrained_window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= ( + pretrained_window_size[1] - 1) + else: + relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1) + relative_coords_table *= 8 # normalize to -8, 8 + relative_coords_table = paddle.sign( + relative_coords_table) * paddle.log2( + paddle.abs(relative_coords_table) + 1.0) / np.log2(8) + + self.register_buffer("relative_coords_table", relative_coords_table) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid( + [coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, + None] - coords_flatten[:, + None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.transpose( + perm=[1, 2, 0]) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[ + 0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", + relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) + if qkv_bias: + self.q_bias = self.create_parameter( + [dim], dtype='float32', default_initializer=zeros_) + self.v_bias = self.create_parameter( + [dim], dtype='float32', default_initializer=zeros_) + else: + self.q_bias = None + self.v_bias = None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.softmax = nn.Softmax(axis=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = paddle.concat( + x=[self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias]) + qkv = F.linear(x=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(shape=[ + B_, N, 3, self.num_heads, qkv.shape[-1] // (3 * self.num_heads) + ]).transpose(perm=[2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[ + 2] # make paddlescript happy (cannot use tensor as tuple) + + # cosine 
attention + attn = (F.normalize( + q, axis=-1) @F.normalize( + k, axis=-1).transpose(perm=[0, 1, 3, 2])) + logit_scale = paddle.clip( + self.logit_scale, max=math.log(1. / 0.01)).exp() + attn = attn * logit_scale + + relative_position_bias_table = self.cpb_mlp( + self.relative_coords_table).reshape([-1, self.num_heads]) + relative_position_bias = relative_position_bias_table[ + self.relative_position_index.reshape([-1])].reshape([ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], -1 + ]) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.transpose( + perm=[2, 0, 1]) # nH, Wh*Ww, Wh*Ww + relative_position_bias = 16 * F.sigmoid(relative_position_bias) + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.reshape([B_ // nW, nW, self.num_heads, N, N + ]) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.reshape([-1, self.num_heads, N, N]) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @v).transpose(perm=[0, 2, 1, 3]).reshape(shape=[B_, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self): + return f'dim={self.dim}, window_size={self.window_size}, ' \ + f'pretrained_window_size={self.pretrained_window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + flops += N * self.dim * 3 * self.dim + flops += self.num_heads * N * (self.dim // self.num_heads) * N + flops += self.num_heads * N * N * (self.dim // self.num_heads) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Layer): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + pretrained_window_size (int): Window size in pre-training. 
+ """ + + def __init__(self, + dim, + input_resolution, + num_heads, + window_size=8, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + pretrained_window_size=0): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + pretrained_window_size=to_2tuple(pretrained_window_size)) + + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + """ + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = paddle.zeros([1, H, W, 1]) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.reshape( + shape=[-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = masked_fill(attn_mask, attn_mask != 0, float(-100.0)) + attn_mask = masked_fill(attn_mask, attn_mask == 0, float(0.0)) + else: + """ + H, W = self.input_resolution + attn_mask = paddle.zeros([1, H, W, 1]) + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + + x = x.reshape([B, H, W, C]) + + # cyclic shift + if self.shift_size > 0: + shifted_x = paddle.roll( + x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.reshape( + [-1, self.window_size * self.window_size, + C]) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_mask = self.get_attn_mask(height_pad, width_pad, x.dtype) + attn_windows = self.attn( + x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.reshape( + [-1, self.window_size, self.window_size, C]) + shifted_x = window_reverse(attn_windows, self.window_size, height_pad, + width_pad) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = paddle.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + axis=(1, 2)) + else: + x = shifted_x + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + x = x[:, :H, :W, :] + + x = x.reshape([B, H * W, C]) + x = shortcut + 
self.drop_path(self.norm1(x)) + + # FFN + x = x + self.drop_path(self.norm2(self.mlp(x))) + return x + + def extra_repr(self): + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Layer): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(2 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = input_dimensions + B, L, C = x.shape + + x = x.reshape([B, H // 2, 2, W // 2, 2, C]) + x = x.transpose((0, 1, 3, 4, 2, 5)) + x = x.reshape([B, H * W // 4, 4 * C]) # B H/2*W/2 4*C + x = self.reduction(x) + x = self.norm(x) + return x + + def extra_repr(self): + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + flops += H * W * self.dim // 2 + return flops + + +class BasicLayer(nn.Layer): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + pretrained_window_size (int): Local window size in pre-training. 
+ """ + + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + pretrained_window_size=0): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + pretrained_window_size=pretrained_window_size) + for i in range(depth) + ]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, input_dimensions): + H, W = input_dimensions + for blk in self.blocks: + x = blk(x, input_dimensions) + if self.downsample is not None: + H, W = (H + 1) // 2, (W + 1) // 2 + x = self.downsample(x, input_dimensions) + + return x, (H, W) + + def extra_repr(self): + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Layer): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 256. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, + img_size=256, + patch_size=4, + in_chans=3, + embed_dim=96, + norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + img_size[0] // patch_size[0], img_size[1] // patch_size[1] + ] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def maybe_pad(self, pixel_values, height, width): + if width % self.patch_size[1] != 0: + pad_values = (0, 0, 0, 0, 0, 0, 0, + self.patch_size[1] - width % self.patch_size[1]) + pixel_values = nn.functional.pad(pixel_values, pad_values) + if height % self.patch_size[0] != 0: + pad_values = ( + 0, + 0, + 0, + 0, + 0, + self.patch_size[0] - height % self.patch_size[0], + 0, + 0, ) + pixel_values = nn.functional.pad(pixel_values, pad_values) + return pixel_values + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ x = self.proj(x).flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x, output_dimensions + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformerV2(nn.Layer): + r""" Swin TransformerV2 + A PaddlePaddle impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` - + https://arxiv.org/abs/2111.09883 + + Args: + img_size (int | tuple(int)): Input image size. Default 256 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + class_num (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer. + """ + + def __init__(self, + img_size=256, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + pretrained_window_sizes=[0, 0, 0, 0], + **kwargs): + super().__init__() + + self.class_num = class_num + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.img_size = img_size + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2**(self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = self.create_parameter( + shape=(1, num_patches, embed_dim), default_initializer=zeros_) + trunc_normal_(self.absolute_pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2**i_layer), + input_resolution=(patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + 
attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging + if (i_layer < self.num_layers - 1) else None, + pretrained_window_size=pretrained_window_sizes[i_layer]) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1D(1) + self.head = nn.Linear(self.num_features, + class_num) if class_num > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x, output_dimensions = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x, output_dimensions = layer(x, input_dimensions=output_dimensions) + + x = self.norm(x) # B L C + x = self.avgpool(x.transpose([0, 2, 1])) # B C 1 + x = paddle.flatten(x, 1) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[ + 0] * self.patches_resolution[1] // (2**self.num_layers) + flops += self.num_features * self.class_num + return flops + + +def _load_pretrained(pretrained, + model, + model_url, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain( + model, + model_url, + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained, **kwargs) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def SwinTransformerV2_tiny_patch4_window8_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=8, + drop_path_rate=0.2, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_tiny_patch4_window8_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_tiny_patch4_window16_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.2, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_tiny_patch4_window16_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_small_patch4_window8_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=8, + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_small_patch4_window8_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_small_patch4_window16_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=16, + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_small_patch4_window16_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_base_patch4_window8_256(pretrained=False, + use_ssld=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=8, + drop_path_rate=0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_base_patch4_window8_256"], + use_ssld=use_ssld) + return model + + +def SwinTransformerV2_base_patch4_window16_256( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=16, + drop_path_rate=0.5, # if use imagenet22k or imagenet22kto1k, drop_path_rate=0.2 + **kwargs + ) # if use imagenet22k, set pretrained_window_sizes=[12, 12, 12, 6] + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_base_patch4_window16_256"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformerV2_base_patch4_window24_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformerV2( + img_size=384, + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=24, + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6], + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_base_patch4_window24_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformerV2_large_patch4_window16_256( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + 
use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformerV2( + img_size=256, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=16, + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6], + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_large_patch4_window16_256"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model + + +def SwinTransformerV2_large_patch4_window24_384( + pretrained=False, + use_ssld=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=True, + **kwargs): + model = SwinTransformerV2( + img_size=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=24, + drop_path_rate=0.2, + pretrained_window_sizes=[12, 12, 12, 6], + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["SwinTransformerV2_large_patch4_window24_384"], + use_ssld=use_ssld, + use_imagenet22k_pretrained=use_imagenet22k_pretrained, + use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tinynet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tinynet.py new file mode 100644 index 000000000..484004615 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tinynet.py @@ -0,0 +1,196 @@ +# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://gitee.com/mindspore/models/tree/master/research/cv/tinynet +# reference: https://arxiv.org/abs/2010.14819 + +import paddle.nn as nn + +from .efficientnet import EfficientNet, efficientnet +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "TinyNet_A": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_A_pretrained.pdparams", + "TinyNet_B": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_B_pretrained.pdparams", + "TinyNet_C": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_C_pretrained.pdparams", + "TinyNet_D": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_D_pretrained.pdparams", + "TinyNet_E": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TinyNet_E_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + + +def tinynet_params(model_name): + """ Map TinyNet model name to parameter coefficients. 
""" + params_dict = { + # Coefficients: width,depth,resolution,dropout + "tinynet-a": (1.00, 1.200, 192, 0.2), + "tinynet-b": (0.75, 1.100, 188, 0.2), + "tinynet-c": (0.54, 0.850, 184, 0.2), + "tinynet-d": (0.54, 0.695, 152, 0.2), + "tinynet-e": (0.51, 0.600, 106, 0.2), + } + return params_dict[model_name] + + +def get_model_params(model_name, override_params): + """ Get the block args and global params for a given model """ + if model_name.startswith('tinynet'): + w, d, _, p = tinynet_params(model_name) + blocks_args, global_params = efficientnet( + width_coefficient=w, depth_coefficient=d, dropout_rate=p) + else: + raise NotImplementedError('model name is not pre-defined: %s' % + model_name) + if override_params: + global_params = global_params._replace(**override_params) + return blocks_args, global_params + + +class TinyNet(EfficientNet): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + fin_in = m.weight.shape[1] * m.weight.shape[2] * m.weight.shape[3] + std = (2 / fin_in)**0.5 + nn.initializer.Normal(std=std)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + elif isinstance(m, nn.Linear): + fin_in = m.weight.shape[0] + bound = 1 / fin_in**0.5 + nn.initializer.Uniform(-bound, bound)(m.weight) + if m.bias is not None: + nn.initializer.Constant(0)(m.bias) + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def TinyNet_A(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-a", override_params) + model = TinyNet( + block_args, + global_params, + name='a', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_A"], use_ssld) + return model + + +def TinyNet_B(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-b", override_params) + model = TinyNet( + block_args, + global_params, + name='b', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_B"], use_ssld) + return model + + +def TinyNet_C(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-c", override_params) + model = TinyNet( + block_args, + global_params, + name='c', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_C"], use_ssld) + return model + + +def TinyNet_D(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-d", override_params) + model = TinyNet( + block_args, + global_params, + name='d', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_D"], use_ssld) + return model + + +def TinyNet_E(padding_type='DYNAMIC', + override_params=None, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs): + block_args, global_params = get_model_params("tinynet-e", override_params) + model = TinyNet( + block_args, + global_params, + name='e', + padding_type=padding_type, + use_se=use_se, + fix_stem=True, + num_features=1280, + **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["TinyNet_E"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tnt.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tnt.py new file mode 100644 index 000000000..a1841d367 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/tnt.py @@ -0,0 +1,410 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
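The tnt.py module below re-defines drop_path / DropPath (stochastic depth): during training it zeroes the residual branch for a random subset of samples and rescales the survivors by 1 / keep_prob so the expected value is unchanged. A minimal sketch of that behaviour under an illustrative drop probability of 0.25 and a toy tensor shape:

import paddle

def drop_path(x, drop_prob=0.0, training=False):
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    # one Bernoulli draw per sample, broadcast over the remaining dims
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = paddle.floor(keep_prob + paddle.rand(shape, dtype=x.dtype))
    return x / keep_prob * mask

x = paddle.ones([4, 3])
y = drop_path(x, drop_prob=0.25, training=True)
# each row of y is either all zeros (branch dropped) or all 1/0.75 (kept and rescaled)
print(y)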
+ +# Code was based on https://github.com/huawei-noah/CV-Backbones/tree/master/tnt_pytorch +# reference: https://arxiv.org/abs/2103.00112 + +import math +import numpy as np + +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant + +from ..base.theseus_layer import Identity +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "TNT_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TNT_small_pretrained.pdparams", + "TNT_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/TNT_base_pretrained.pdparams" +} + +__all__ = MODEL_URLS.keys() + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, inputs): + return inputs + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = paddle.add(keep_prob, paddle.rand(shape, dtype=x.dtype)) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + hidden_dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.hidden_dim = hidden_dim + self.num_heads = num_heads + head_dim = hidden_dim // num_heads + self.head_dim = head_dim + self.scale = head_dim**-0.5 + + self.qk = nn.Linear(dim, hidden_dim * 2, bias_attr=qkv_bias) + self.v = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qk = self.qk(x).reshape( + (B, N, 2, self.num_heads, self.head_dim)).transpose( + (2, 0, 3, 1, 4)) + + q, k = qk[0], qk[1] + v = self.v(x).reshape( + (B, N, self.num_heads, x.shape[-1] // self.num_heads)).transpose( + (0, 2, 1, 3)) + + attn = paddle.matmul(q, k.transpose((0, 1, 3, 2))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = paddle.matmul(attn, v) + x = x.transpose((0, 2, 1, 3)).reshape( + (B, N, x.shape[-1] * x.shape[-3])) + x = self.proj(x) + x = 
self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + in_dim, + num_pixel, + num_heads=12, + in_num_head=4, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + # Inner transformer + self.norm_in = norm_layer(in_dim) + self.attn_in = Attention( + in_dim, + in_dim, + num_heads=in_num_head, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + + self.norm_mlp_in = norm_layer(in_dim) + self.mlp_in = Mlp(in_features=in_dim, + hidden_features=int(in_dim * 4), + out_features=in_dim, + act_layer=act_layer, + drop=drop) + + self.norm1_proj = norm_layer(in_dim * num_pixel) + self.proj = nn.Linear(in_dim * num_pixel, dim, bias_attr=False) + self.norm2_proj = norm_layer(in_dim * num_pixel) + + # Outer transformer + self.norm_out = norm_layer(dim) + self.attn_out = Attention( + dim, + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + + self.norm_mlp = norm_layer(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + out_features=dim, + act_layer=act_layer, + drop=drop) + + def forward(self, pixel_embed, patch_embed): + # inner + pixel_embed = paddle.add( + pixel_embed, + self.drop_path(self.attn_in(self.norm_in(pixel_embed)))) + pixel_embed = paddle.add( + pixel_embed, + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed)))) + # outer + B, N, C = patch_embed.shape + norm1_proj = pixel_embed.reshape(shape=[B, N - 1, C]) + norm1_proj = self.norm1_proj(norm1_proj) + patch_embed[:, 1:] = paddle.add( + patch_embed[:, 1:], self.norm2_proj(self.proj(norm1_proj))) + patch_embed = paddle.add( + patch_embed, + self.drop_path(self.attn_out(self.norm_out(patch_embed)))) + patch_embed = paddle.add( + patch_embed, self.drop_path(self.mlp(self.norm_mlp(patch_embed)))) + return pixel_embed, patch_embed + + +class PixelEmbed(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + in_dim=48, + stride=4): + super().__init__() + self.patch_size = patch_size + num_patches = (img_size // patch_size)**2 + self.img_size = img_size + self.num_patches = num_patches + self.in_dim = in_dim + new_patch_size = math.ceil(patch_size / stride) + self.new_patch_size = new_patch_size + + self.proj = nn.Conv2D( + in_chans, self.in_dim, kernel_size=7, padding=3, stride=stride) + + def forward(self, x, pixel_pos): + B, C, H, W = x.shape + assert H == self.img_size and W == self.img_size, f"Input image size ({H}*{W}) doesn't match model ({self.img_size}*{self.img_size})." 
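        # Inner ("pixel") embedding: unfold extracts every non-overlapping
        # patch_size x patch_size patch, each patch is reshaped back into a small
        # image and passed through the shared 7x7 strided Conv2D (stride 4 by default),
        # and the resulting pixel tokens receive the learned pixel_pos embedding
        # before entering the inner transformer.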
+ x = nn.functional.unfold(x, self.patch_size, self.patch_size) + x = x.transpose((0, 2, 1)).reshape( + (-1, C, self.patch_size, self.patch_size)) + x = self.proj(x) + x = x.reshape((-1, self.in_dim, self.patch_size)).transpose((0, 2, 1)) + x = x + pixel_pos + return x + + +class TNT(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + in_dim=48, + depth=12, + num_heads=12, + in_num_head=4, + mlp_ratio=4., + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + first_stride=4, + class_num=1000): + super().__init__() + self.class_num = class_num + # num_features for consistency with other models + self.num_features = self.embed_dim = embed_dim + + self.pixel_embed = PixelEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + in_dim=in_dim, + stride=first_stride) + num_patches = self.pixel_embed.num_patches + self.num_patches = num_patches + new_patch_size = self.pixel_embed.new_patch_size + num_pixel = new_patch_size**2 + + self.norm1_proj = norm_layer(num_pixel * in_dim) + self.proj = nn.Linear(num_pixel * in_dim, embed_dim) + self.norm2_proj = norm_layer(embed_dim) + + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + + self.patch_pos = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("patch_pos", self.patch_pos) + + self.pixel_pos = self.create_parameter( + shape=(1, patch_size, in_dim), default_initializer=zeros_) + self.add_parameter("pixel_pos", self.pixel_pos) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth decay rule + dpr = np.linspace(0, drop_path_rate, depth) + + blocks = [] + for i in range(depth): + blocks.append( + Block( + dim=embed_dim, + in_dim=in_dim, + num_pixel=num_pixel, + num_heads=num_heads, + in_num_head=in_num_head, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer)) + self.blocks = nn.LayerList(blocks) + self.norm = norm_layer(embed_dim) + + if class_num > 0: + self.head = nn.Linear(embed_dim, class_num) + + trunc_normal_(self.cls_token) + trunc_normal_(self.patch_pos) + trunc_normal_(self.pixel_pos) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + pixel_embed = self.pixel_embed(x, self.pixel_pos) + + patch_embed = self.norm2_proj( + self.proj( + self.norm1_proj( + pixel_embed.reshape((-1, self.num_patches, pixel_embed. 
+ shape[-1] * pixel_embed.shape[-2]))))) + patch_embed = paddle.concat( + (self.cls_token.expand((B, -1, -1)).astype(patch_embed.dtype), + patch_embed), + axis=1) + patch_embed = patch_embed + self.patch_pos + patch_embed = self.pos_drop(patch_embed) + for blk in self.blocks: + pixel_embed, patch_embed = blk(pixel_embed, patch_embed) + + patch_embed = self.norm(patch_embed) + return patch_embed[:, 0] + + def forward(self, x): + x = self.forward_features(x) + + if self.class_num > 0: + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def TNT_small(pretrained=False, use_ssld=False, **kwargs): + model = TNT(patch_size=16, + embed_dim=384, + in_dim=24, + depth=12, + num_heads=6, + in_num_head=4, + qkv_bias=False, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["TNT_small"], use_ssld=use_ssld) + return model + + +def TNT_base(pretrained=False, use_ssld=False, **kwargs): + model = TNT(patch_size=16, + embed_dim=640, + in_dim=40, + depth=12, + num_heads=10, + in_num_head=4, + qkv_bias=False, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["TNT_base"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/twins.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/twins.py new file mode 100644 index 000000000..3f983d962 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/twins.py @@ -0,0 +1,692 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
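The Twins backbones below alternate locally-grouped self-attention (LSA, the GroupAttention class) with global sub-sampled attention (GSA, the Attention class). GSA keeps one query per token but shrinks the key/value set with a strided convolution controlled by sr_ratio, so the attention cost drops from roughly O((HW)^2) to O(HW * HW / sr_ratio^2). A shape-only sketch of that reduction with illustrative sizes (B=2, C=64, H=W=16, sr_ratio=4), not values from the patch:

import paddle
import paddle.nn as nn

B, C, H, W, sr = 2, 64, 16, 16, 4
x = paddle.randn([B, H * W, C])                       # token sequence, N = H*W = 256

sr_conv = nn.Conv2D(C, C, kernel_size=sr, stride=sr)  # the spatial-reduction layer
x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W])     # back to a feature map
x_ = sr_conv(x_).flatten(2).transpose([0, 2, 1])      # -> [B, (H//sr)*(W//sr), C]

print(x.shape, x_.shape)   # [2, 256, 64] vs. [2, 16, 64]: 16x fewer key/value tokens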
+ +# Code was based on https://github.com/Meituan-AutoML/Twins +# reference: https://arxiv.org/abs/2104.13840 + +from functools import partial + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.regularizer import L2Decay + +from .vision_transformer import trunc_normal_, normal_, zeros_, ones_, to_2tuple, DropPath, Identity, Mlp +from .vision_transformer import Block as ViTBlock + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "pcpvt_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_small_pretrained.pdparams", + "pcpvt_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_base_pretrained.pdparams", + "pcpvt_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/pcpvt_large_pretrained.pdparams", + "alt_gvt_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_small_pretrained.pdparams", + "alt_gvt_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_base_pretrained.pdparams", + "alt_gvt_large": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/alt_gvt_large_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class GroupAttention(nn.Layer): + """LSA: self attention within a group. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + ws=1): + super().__init__() + if ws == 1: + raise Exception("ws {ws} should not be 1") + if dim % num_heads != 0: + raise Exception( + "dim {dim} should be divided by num_heads {num_heads}.") + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.ws = ws + + def forward(self, x, H, W): + B, N, C = x.shape + h_group, w_group = H // self.ws, W // self.ws + total_groups = h_group * w_group + x = x.reshape([B, h_group, self.ws, w_group, self.ws, C]).transpose( + [0, 1, 3, 2, 4, 5]) + qkv = self.qkv(x).reshape([ + B, total_groups, self.ws**2, 3, self.num_heads, C // self.num_heads + ]).transpose([3, 0, 1, 4, 2, 5]) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = paddle.matmul(q, k.transpose([0, 1, 2, 4, 3])) * self.scale + + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + attn = paddle.matmul(attn, v).transpose([0, 1, 3, 2, 4]).reshape( + [B, h_group, w_group, self.ws, self.ws, C]) + + x = attn.transpose([0, 1, 3, 2, 4, 5]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Attention(nn.Layer): + """GSA: using a key to summarize the information for a group to be efficient. + """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0., + sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
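        # GSA setup: queries keep full resolution (self.q), while keys/values can be
        # computed from a spatially reduced map -- when sr_ratio > 1, a Conv2D with
        # kernel_size == stride == sr_ratio followed by LayerNorm shrinks the H*W
        # tokens to (H/sr_ratio)*(W/sr_ratio) before the kv projection.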
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = nn.Linear(dim, dim, bias_attr=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2D( + dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + + def forward(self, x, H, W): + B, N, C = x.shape + q = self.q(x).reshape( + [B, N, self.num_heads, C // self.num_heads]).transpose( + [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x_ = x.transpose([0, 2, 1]).reshape([B, C, H, W]) + tmp_n = H * W // self.sr_ratio**2 + x_ = self.sr(x_).reshape([B, C, tmp_n]).transpose([0, 2, 1]) + x_ = self.norm(x_) + kv = self.kv(x_).reshape( + [B, tmp_n, 2, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + else: + kv = self.kv(x).reshape( + [B, N, 2, self.num_heads, C // self.num_heads]).transpose( + [2, 0, 3, 1, 4]) + k, v = kv[0], kv[1] + + attn = paddle.matmul(q, k.transpose([0, 1, 3, 2])) * self.scale + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + + x = paddle.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + sr_ratio=sr_ratio) + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SBlock(ViTBlock): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1): + super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, + attn_drop, drop_path, act_layer, norm_layer) + + def forward(self, x, H, W): + return super().forward(x) + + +class GroupBlock(ViTBlock): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + sr_ratio=1, + ws=1): + super().__init__(dim, num_heads, mlp_ratio, qkv_bias, qk_scale, drop, + attn_drop, drop_path, act_layer, norm_layer) + del self.attn + if ws == 1: + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, + attn_drop, drop, sr_ratio) + else: + self.attn = GroupAttention(dim, num_heads, qkv_bias, qk_scale, + attn_drop, drop, ws) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding. 
+ """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + if img_size % patch_size != 0: + raise Exception( + f"img_size {img_size} should be divided by patch_size {patch_size}." + ) + + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + self.H, self.W = img_size[0] // patch_size[0], img_size[ + 1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + B, C, H, W = x.shape + x = self.proj(x).flatten(2).transpose([0, 2, 1]) + x = self.norm(x) + H, W = H // self.patch_size[0], W // self.patch_size[1] + return x, (H, W) + + +# borrow from PVT https://github.com/whai362/PVT.git +class PyramidVisionTransformer(nn.Layer): + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + block_cls=Block): + super().__init__() + self.class_num = class_num + self.depths = depths + + # patch_embed + self.patch_embeds = nn.LayerList() + self.pos_embeds = nn.ParameterList() + self.pos_drops = nn.LayerList() + self.blocks = nn.LayerList() + + for i in range(len(depths)): + if i == 0: + self.patch_embeds.append( + PatchEmbed(img_size, patch_size, in_chans, embed_dims[i])) + else: + self.patch_embeds.append( + PatchEmbed(img_size // patch_size // 2**(i - 1), 2, + embed_dims[i - 1], embed_dims[i])) + patch_num = self.patch_embeds[i].num_patches + 1 if i == len( + embed_dims) - 1 else self.patch_embeds[i].num_patches + self.pos_embeds.append( + self.create_parameter( + shape=[1, patch_num, embed_dims[i]], + default_initializer=zeros_)) + self.pos_drops.append(nn.Dropout(p=drop_rate)) + + dpr = [ + float(x) for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + cur = 0 + for k in range(len(depths)): + _block = nn.LayerList([ + block_cls( + dim=embed_dims[k], + num_heads=num_heads[k], + mlp_ratio=mlp_ratios[k], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[k]) for i in range(depths[k]) + ]) + self.blocks.append(_block) + cur += depths[k] + + self.norm = norm_layer(embed_dims[-1]) + + # cls_token + self.cls_token = self.create_parameter( + shape=[1, 1, embed_dims[-1]], + default_initializer=zeros_, + attr=paddle.ParamAttr(regularizer=L2Decay(0.0))) + + # classification head + self.head = nn.Linear(embed_dims[-1], + class_num) if class_num > 0 else Identity() + + # init weights + for pos_emb in self.pos_embeds: + trunc_normal_(pos_emb) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + B = x.shape[0] + for i in range(len(self.depths)): + x, (H, W) = self.patch_embeds[i](x) + if i == len(self.depths) - 1: + cls_tokens = self.cls_token.expand([B, -1, -1]).astype(x.dtype) + x = paddle.concat([cls_tokens, x], dim=1) + x = x + self.pos_embeds[i] + x = 
self.pos_drops[i](x) + for blk in self.blocks[i]: + x = blk(x, H, W) + if i < len(self.depths) - 1: + x = x.reshape([B, H, W, -1]).transpose( + [0, 3, 1, 2]).contiguous() + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + return x + + +# PEG from https://arxiv.org/abs/2102.10882 +class PosCNN(nn.Layer): + def __init__(self, in_chans, embed_dim=768, s=1): + super().__init__() + self.proj = nn.Sequential( + nn.Conv2D( + in_chans, + embed_dim, + 3, + s, + 1, + bias_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), + groups=embed_dim, + weight_attr=paddle.ParamAttr(regularizer=L2Decay(0.0)), )) + self.s = s + + def forward(self, x, H, W): + B, N, C = x.shape + feat_token = x + cnn_feat = feat_token.transpose([0, 2, 1]).reshape([B, C, H, W]) + if self.s == 1: + x = self.proj(cnn_feat) + cnn_feat + else: + x = self.proj(cnn_feat) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class CPVTV2(PyramidVisionTransformer): + """ + Use useful results from CPVT. PEG and GAP. + Therefore, cls token is no longer required. + PEG is used to encode the absolute position on the fly, which greatly affects the performance when input resolution + changes during the training (such as segmentation, detection) + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + num_heads=[1, 2, 4, 8], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + block_cls=Block): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + del self.pos_embeds + del self.cls_token + self.pos_block = nn.LayerList( + [PosCNN(embed_dim, embed_dim) for embed_dim in embed_dims]) + self.apply(self._init_weights) + + def _init_weights(self, m): + import math + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + normal_(0, math.sqrt(2.0 / fan_out))(m.weight) + if m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2D): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + + def forward_features(self, x): + B = x.shape[0] + + for i in range(len(self.depths)): + x, (H, W) = self.patch_embeds[i](x) + x = self.pos_drops[i](x) + + for j, blk in enumerate(self.blocks[i]): + x = blk(x, H, W) + if j == 0: + x = self.pos_block[i](x, H, W) # PEG here + + if i < len(self.depths) - 1: + x = x.reshape([B, H, W, x.shape[-1]]).transpose([0, 3, 1, 2]) + + x = self.norm(x) + return x.mean(axis=1) # GAP here + + +class PCPVT(CPVTV2): + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256], + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[4, 4, 4], + sr_ratios=[4, 2, 1], + block_cls=SBlock): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + 
sr_ratios, block_cls) + + +class ALTGVT(PCPVT): + """ + alias Twins-SVT + """ + + def __init__(self, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256], + num_heads=[1, 2, 4], + mlp_ratios=[4, 4, 4], + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[4, 4, 4], + sr_ratios=[4, 2, 1], + block_cls=GroupBlock, + wss=[7, 7, 7]): + super().__init__(img_size, patch_size, in_chans, class_num, embed_dims, + num_heads, mlp_ratios, qkv_bias, qk_scale, drop_rate, + attn_drop_rate, drop_path_rate, norm_layer, depths, + sr_ratios, block_cls) + del self.blocks + self.wss = wss + # transformer encoder + dpr = [ + float(x) for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + self.blocks = nn.LayerList() + for k in range(len(depths)): + _block = nn.LayerList([ + block_cls( + dim=embed_dims[k], + num_heads=num_heads[k], + mlp_ratio=mlp_ratios[k], + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[cur + i], + norm_layer=norm_layer, + sr_ratio=sr_ratios[k], + ws=1 if i % 2 == 1 else wss[k]) for i in range(depths[k]) + ]) + self.blocks.append(_block) + cur += depths[k] + self.apply(self._init_weights) + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def pcpvt_small(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 6, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_small"], use_ssld=use_ssld) + return model + + +def pcpvt_base(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 4, 18, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_base"], use_ssld=use_ssld) + return model + + +def pcpvt_large(pretrained=False, use_ssld=False, **kwargs): + model = CPVTV2( + patch_size=4, + embed_dims=[64, 128, 320, 512], + num_heads=[1, 2, 5, 8], + mlp_ratios=[8, 8, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 8, 27, 3], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["pcpvt_large"], use_ssld=use_ssld) + return model + + +def alt_gvt_small(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[64, 128, 256, 512], + num_heads=[2, 4, 8, 16], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 10, 4], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_small"], use_ssld=use_ssld) + return model + + +def alt_gvt_base(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[96, 192, 384, 768], + 
num_heads=[3, 6, 12, 24], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 18, 2], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_base"], use_ssld=use_ssld) + return model + + +def alt_gvt_large(pretrained=False, use_ssld=False, **kwargs): + model = ALTGVT( + patch_size=4, + embed_dims=[128, 256, 512, 1024], + num_heads=[4, 8, 16, 32], + mlp_ratios=[4, 4, 4, 4], + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 18, 2], + wss=[7, 7, 7, 7], + sr_ratios=[8, 4, 2, 1], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["alt_gvt_large"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/uniformer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/uniformer.py new file mode 100644 index 000000000..13e579c36 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/uniformer.py @@ -0,0 +1,552 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Code was based on https://github.com/Sense-X/UniFormer +# reference: https://arxiv.org/abs/2201.09450 + +from collections import OrderedDict +from functools import partial +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import math +from .vision_transformer import trunc_normal_, zeros_, ones_, to_2tuple, DropPath, Identity, Mlp + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "UniFormer_small": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_small_pretrained.pdparams", + "UniFormer_small_plus": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_small_plus_pretrained.pdparams", + "UniFormer_small_plus_dim64": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_small_plus_dim64_pretrained.pdparams", + "UniFormer_base": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_base_pretrained.pdparams", + "UniFormer_base_ls": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/UniFormer_base_ls_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +layer_scale = False +init_value = 1e-6 + + +class CMlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1_conv = nn.Conv2D(in_features, hidden_features, 1) + self.act = act_layer() + self.fc2_conv = nn.Conv2D(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1_conv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2_conv(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + 
qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape( + shape=[B, N, 3, self.num_heads, C // self.num_heads]).transpose( + perm=[2, 0, 3, 1, 4]) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @k.transpose(perm=[0, 1, 3, 2])) * self.scale + attn = nn.Softmax(axis=-1)(attn) + attn = self.attn_drop(attn) + + x = (attn @v).transpose(perm=[0, 2, 1, 3]).reshape(shape=[B, N, C]) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CBlock(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2D(dim, dim, 3, padding=1, groups=dim) + self.norm1 = nn.BatchNorm2D(dim) + self.conv1 = nn.Conv2D(dim, dim, 1) + self.conv2 = nn.Conv2D(dim, dim, 1) + self.attn = nn.Conv2D(dim, dim, 5, padding=2, groups=dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = nn.BatchNorm2D(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = CMlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.pos_embed(x) + x = x + self.drop_path( + self.conv2(self.attn(self.conv1(self.norm1(x))))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class SABlock(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super().__init__() + self.pos_embed = nn.Conv2D(dim, dim, 3, padding=1, groups=dim) + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + global layer_scale + self.ls = layer_scale + if self.ls: + global init_value + print(f"Use layer_scale: {layer_scale}, init_values: {init_value}") + self.gamma_1 = self.create_parameter( + [dim], + dtype='float32', + default_initializer=nn.initializer.Constant(value=init_value)) + self.gamma_2 = self.create_parameter( + [dim], + dtype='float32', + default_initializer=nn.initializer.Constant(value=init_value)) + + def forward(self, x): + x = x + self.pos_embed(x) + B, N, H, W = x.shape + x = x.flatten(2).transpose(perm=[0, 2, 1]) + if self.ls: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + x = x.transpose(perm=[0, 2, 1]).reshape(shape=[B, N, H, W]) + return x + + +class HeadEmbedding(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self.proj = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels // 2, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + nn.BatchNorm2D(out_channels // 2), + nn.GELU(), + nn.Conv2D( + out_channels // 2, + out_channels, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + nn.BatchNorm2D(out_channels)) + + def forward(self, x): + x = self.proj(x) + return x + + +class MiddleEmbedding(nn.Layer): + def __init__(self, in_channels, out_channels): + super().__init__() + + self.proj = nn.Sequential( + nn.Conv2D( + in_channels, + out_channels, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + nn.BatchNorm2D(out_channels)) + + def forward(self, x): + x = self.proj(x) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // + patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + self.norm = nn.LayerNorm(embed_dim) + self.proj_conv = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
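+        # Shape walk-through (e.g. stage-1 defaults img_size=224, patch_size=4,
+        # embed_dim=64): the strided conv below maps [B, 3, 224, 224] to
+        # [B, 64, 56, 56]; flatten/transpose turns the feature map into tokens so
+        # LayerNorm runs over the channel dim, and the final reshape/transpose
+        # restores NCHW for the following convolutional stage.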
+ x = self.proj_conv(x) + B, C, H, W = x.shape + x = x.flatten(2).transpose(perm=[0, 2, 1]) + x = self.norm(x) + x = x.reshape(shape=[B, H, W, C]).transpose(perm=[0, 3, 1, 2]) + return x + + +class UniFormer(nn.Layer): + """ UniFormer + A PaddlePaddle impl of : `UniFormer: Unifying Convolution and Self-attention for Visual Recognition` - + https://arxiv.org/abs/2201.09450 + """ + + def __init__(self, + depth=[3, 4, 8, 3], + img_size=224, + in_chans=3, + class_num=1000, + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + representation_size=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=None, + conv_stem=False): + """ + Args: + depth (list): depth of each stage + img_size (int, tuple): input image size + in_chans (int): number of input channels + class_num (int): number of classes for classification head + embed_dim (list): embedding dimension of each stage + head_dim (int): head dimension + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + qk_scale (float): override default qk scale of head_dim ** -0.5 if set + representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer (nn.Module): normalization layer + conv_stem (bool): whether use overlapped patch stem + """ + super().__init__() + self.class_num = class_num + self.num_features = self.embed_dim = embed_dim + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + if conv_stem: + self.patch_embed1 = HeadEmbedding( + in_channels=in_chans, out_channels=embed_dim[0]) + self.patch_embed2 = MiddleEmbedding( + in_channels=embed_dim[0], out_channels=embed_dim[1]) + self.patch_embed3 = MiddleEmbedding( + in_channels=embed_dim[1], out_channels=embed_dim[2]) + self.patch_embed4 = MiddleEmbedding( + in_channels=embed_dim[2], out_channels=embed_dim[3]) + else: + self.patch_embed1 = PatchEmbed( + img_size=img_size, + patch_size=4, + in_chans=in_chans, + embed_dim=embed_dim[0]) + self.patch_embed2 = PatchEmbed( + img_size=img_size // 4, + patch_size=2, + in_chans=embed_dim[0], + embed_dim=embed_dim[1]) + self.patch_embed3 = PatchEmbed( + img_size=img_size // 8, + patch_size=2, + in_chans=embed_dim[1], + embed_dim=embed_dim[2]) + self.patch_embed4 = PatchEmbed( + img_size=img_size // 16, + patch_size=2, + in_chans=embed_dim[2], + embed_dim=embed_dim[3]) + + self.pos_drop = nn.Dropout(p=drop_rate) + dpr = [ + x.item() for x in paddle.linspace(0, drop_path_rate, sum(depth)) + ] # stochastic depth decay rule + num_heads = [dim // head_dim for dim in embed_dim] + self.blocks1 = nn.LayerList([ + CBlock( + dim=embed_dim[0], + num_heads=num_heads[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer) for i in range(depth[0]) + ]) + self.blocks2 = nn.LayerList([ + CBlock( + dim=embed_dim[1], + num_heads=num_heads[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0]], + norm_layer=norm_layer) for i in range(depth[1]) + ]) + self.blocks3 = nn.LayerList([ + SABlock( + dim=embed_dim[2], + num_heads=num_heads[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + 
depth[0] + depth[1]], + norm_layer=norm_layer) for i in range(depth[2]) + ]) + self.blocks4 = nn.LayerList([ + SABlock( + dim=embed_dim[3], + num_heads=num_heads[3], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i + depth[0] + depth[1] + depth[2]], + norm_layer=norm_layer) for i in range(depth[3]) + ]) + self.norm = nn.BatchNorm2D(embed_dim[-1]) + + # Representation layer + if representation_size: + self.num_features = representation_size + self.pre_logits = nn.Sequential( + OrderedDict([('fc', nn.Linear(embed_dim, representation_size)), + ('act', nn.Tanh())])) + else: + self.pre_logits = nn.Identity() + + # Classifier head + self.head = nn.Linear(embed_dim[-1], + class_num) if class_num > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embed1(x) + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + x = self.patch_embed2(x) + for blk in self.blocks2: + x = blk(x) + x = self.patch_embed3(x) + for blk in self.blocks3: + x = blk(x) + x = self.patch_embed4(x) + for blk in self.blocks4: + x = blk(x) + x = self.norm(x) + x = self.pre_logits(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = x.flatten(2).mean(-1) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def UniFormer_small(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[3, 4, 8, 3], + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.1, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["UniFormer_small"], use_ssld=use_ssld) + return model + + +def UniFormer_small_plus(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[3, 5, 9, 3], + conv_stem=True, + embed_dim=[64, 128, 320, 512], + head_dim=32, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.1, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["UniFormer_small_plus"], + use_ssld=use_ssld) + return model + + +def UniFormer_small_plus_dim64(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[3, 5, 9, 3], + conv_stem=True, + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.1, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["UniFormer_small_plus_dim64"], + use_ssld=use_ssld) + return model + + +def UniFormer_base(pretrained=True, use_ssld=False, **kwargs): + model = UniFormer( + depth=[5, 8, 20, 7], + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["UniFormer_base"], use_ssld=use_ssld) + return model + + +def UniFormer_base_ls(pretrained=True, use_ssld=False, **kwargs): + global layer_scale + layer_scale = True + model = UniFormer( + depth=[5, 8, 20, 7], + embed_dim=[64, 128, 320, 512], + head_dim=64, + mlp_ratio=4, + qkv_bias=True, + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + drop_path_rate=0.3, + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["UniFormer_base_ls"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/van.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/van.py new file mode 100644 index 000000000..18ac7f7a1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/van.py @@ -0,0 +1,362 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
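+
+# A minimal usage sketch for the VAN factories defined below (assumes a standard
+# 224x224 ImageNet-style input and randomly initialized weights):
+#
+#     import paddle
+#     model = VAN_B0(pretrained=False)
+#     logits = model(paddle.randn([1, 3, 224, 224]))   # expected shape: [1, 1000]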
+ +# Code was heavily based on https://github.com/Visual-Attention-Network/VAN-Classification +# reference: https://arxiv.org/abs/2202.09741 + +from functools import partial +import math +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "VAN_B0": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B0_pretrained.pdparams", + "VAN_B1": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B1_pretrained.pdparams", + "VAN_B2": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B2_pretrained.pdparams", + "VAN_B3": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/VAN_B3_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +@paddle.jit.not_to_static +def swapdim(x, dim1, dim2): + a = list(range(len(x.shape))) + a[dim1], a[dim2] = a[dim2], a[dim1] + return x.transpose(a) + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2D(in_features, hidden_features, 1) + self.dwconv = DWConv(hidden_features) + self.act = act_layer() + self.fc2 = nn.Conv2D(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.dwconv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class LKA(nn.Layer): + def __init__(self, dim): + super().__init__() + self.conv0 = nn.Conv2D(dim, dim, 5, padding=2, groups=dim) + self.conv_spatial = nn.Conv2D( + dim, dim, 7, stride=1, padding=9, groups=dim, dilation=3) + self.conv1 = nn.Conv2D(dim, dim, 1) + + def forward(self, x): + attn = self.conv0(x) + attn = self.conv_spatial(attn) + attn = self.conv1(attn) + return x * attn + + +class Attention(nn.Layer): + def __init__(self, d_model): + super().__init__() + self.proj_1 = nn.Conv2D(d_model, d_model, 1) + self.activation = nn.GELU() + self.spatial_gating_unit = LKA(d_model) + self.proj_2 = nn.Conv2D(d_model, d_model, 1) + + def forward(self, x): + shorcut = x + x = self.proj_1(x) + x = self.activation(x) + x = self.spatial_gating_unit(x) + x = self.proj_2(x) + x = x + shorcut + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + 
mlp_ratio=4., + drop=0., + drop_path=0., + act_layer=nn.GELU): + super().__init__() + self.norm1 = nn.BatchNorm2D(dim) + self.attn = Attention(dim) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = nn.BatchNorm2D(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + layer_scale_init_value = 1e-2 + self.layer_scale_1 = self.create_parameter( + shape=[dim, 1, 1], + default_initializer=Constant(value=layer_scale_init_value)) + self.layer_scale_2 = self.create_parameter( + shape=[dim, 1, 1], + default_initializer=Constant(value=layer_scale_init_value)) + + def forward(self, x): + x = x + self.drop_path(self.layer_scale_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.layer_scale_2 * self.mlp(self.norm2(x))) + return x + + +class OverlapPatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=224, + patch_size=7, + stride=4, + in_chans=3, + embed_dim=768): + super().__init__() + self.proj = nn.Conv2D( + in_chans, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=patch_size // 2) + self.norm = nn.BatchNorm2D(embed_dim) + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = self.norm(x) + return x, H, W + + +class VAN(nn.Layer): + r""" VAN + A PaddlePaddle impl of : `Visual Attention Network` - + https://arxiv.org/pdf/2202.09741.pdf + """ + + def __init__(self, + img_size=224, + in_chans=3, + class_num=1000, + embed_dims=[64, 128, 256, 512], + mlp_ratios=[4, 4, 4, 4], + drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + depths=[3, 4, 6, 3], + num_stages=4, + flag=False): + super().__init__() + if flag == False: + self.class_num = class_num + self.depths = depths + self.num_stages = num_stages + + dpr = [x for x in paddle.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed( + img_size=img_size if i == 0 else img_size // (2**(i + 1)), + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_chans=in_chans if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i]) + + block = nn.LayerList([ + Block( + dim=embed_dims[i], + mlp_ratio=mlp_ratios[i], + drop=drop_rate, + drop_path=dpr[cur + j]) for j in range(depths[i]) + ]) + norm = norm_layer(embed_dims[i]) + cur += depths[i] + + setattr(self, f"patch_embed{i + 1}", patch_embed) + setattr(self, f"block{i + 1}", block) + setattr(self, f"norm{i + 1}", norm) + + # classification head + self.head = nn.Linear(embed_dims[3], + class_num) if class_num > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + elif isinstance(m, nn.Conv2D): + fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels + fan_out //= m._groups + m.weight.set_value( + paddle.normal( + std=math.sqrt(2.0 / fan_out), shape=m.weight.shape)) + if m.bias is not None: + zeros_(m.bias) + + def forward_features(self, x): + B = x.shape[0] + + for i in range(self.num_stages): + patch_embed = getattr(self, f"patch_embed{i + 1}") + block = getattr(self, f"block{i + 1}") + norm = getattr(self, f"norm{i + 1}") + x, H, W = patch_embed(x) + for blk in block: + x = blk(x) + + x = x.flatten(2) + x = swapdim(x, 1, 2) + x = 
norm(x) + if i != self.num_stages - 1: + x = x.reshape([B, H, W, x.shape[2]]).transpose([0, 3, 1, 2]) + + return x.mean(axis=1) + + def forward(self, x): + x = self.forward_features(x) + x = self.head(x) + + return x + + +class DWConv(nn.Layer): + def __init__(self, dim=768): + super().__init__() + self.dwconv = nn.Conv2D(dim, dim, 3, 1, 1, bias_attr=True, groups=dim) + + def forward(self, x): + x = self.dwconv(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def VAN_B0(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[32, 64, 160, 256], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 3, 5, 2], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B0"], use_ssld=use_ssld) + return model + + +def VAN_B1(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[64, 128, 320, 512], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[2, 2, 4, 2], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B1"], use_ssld=use_ssld) + return model + + +def VAN_B2(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[64, 128, 320, 512], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 3, 12, 3], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B2"], use_ssld=use_ssld) + return model + + +def VAN_B3(pretrained=False, use_ssld=False, **kwargs): + model = VAN(embed_dims=[64, 128, 320, 512], + mlp_ratios=[8, 8, 4, 4], + norm_layer=partial( + nn.LayerNorm, epsilon=1e-6), + depths=[3, 5, 27, 3], + **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["VAN_B3"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/vision_transformer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/vision_transformer.py new file mode 100644 index 000000000..5a015702c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/vision_transformer.py @@ -0,0 +1,459 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
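+
+# A minimal usage sketch for the ViT factories defined below (assumes a 224x224
+# RGB batch and randomly initialized weights):
+#
+#     import paddle
+#     model = ViT_base_patch16_224(pretrained=False)
+#     logits = model(paddle.randn([1, 3, 224, 224]))   # expected shape: [1, 1000]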
+ +# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# reference: https://arxiv.org/abs/2010.11929 + +from collections.abc import Callable + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.nn.initializer import TruncatedNormal, Constant, Normal + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ViT_small_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_small_patch16_224_pretrained.pdparams", + "ViT_base_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_224_pretrained.pdparams", + "ViT_base_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch16_384_pretrained.pdparams", + "ViT_base_patch32_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_base_patch32_384_pretrained.pdparams", + "ViT_large_patch16_224": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_224_pretrained.pdparams", + "ViT_large_patch16_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch16_384_pretrained.pdparams", + "ViT_large_patch32_384": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ViT_large_patch32_384_pretrained.pdparams", +} + +__all__ = list(MODEL_URLS.keys()) + +trunc_normal_ = TruncatedNormal(std=.02) +normal_ = Normal +zeros_ = Constant(value=0.) +ones_ = Constant(value=1.) + + +def to_2tuple(x): + return tuple([x] * 2) + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = paddle.full(shape=[], fill_value=1 - drop_prob, dtype=x.dtype) + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class DropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
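+
+    During training, each sample's residual branch is zeroed with probability
+    drop_prob and surviving samples are scaled by 1 / (1 - drop_prob), keeping
+    the expected output unchanged; when not training the layer is an identity.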
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Layer): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Layer): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Layer): + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + # B= x.shape[0] + N, C = x.shape[1:] + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // + self.num_heads)).transpose((2, 0, 3, 1, 4)) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale + attn = nn.functional.softmax(attn, axis=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Layer): + def __init__(self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-5): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm1 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) + elif isinstance(norm_layer, Callable): + self.norm2 = norm_layer(dim) + else: + raise TypeError( + "The norm_layer must be str or paddle.nn.layer.Layer class") + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Layer): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * \ + (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + self.proj = nn.Conv2D( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + + x = self.proj(x).flatten(2).transpose((0, 2, 1)) + return x + + +class VisionTransformer(nn.Layer): + """ Vision Transformer with support for patch input + """ + + def __init__(self, + img_size=224, + patch_size=16, + in_chans=3, + class_num=1000, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer='nn.LayerNorm', + epsilon=1e-5, + **kwargs): + super().__init__() + self.class_num = class_num + + self.num_features = self.embed_dim = embed_dim + + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.pos_embed = self.create_parameter( + shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_) + self.add_parameter("pos_embed", self.pos_embed) + self.cls_token = self.create_parameter( + shape=(1, 1, embed_dim), default_initializer=zeros_) + self.add_parameter("cls_token", self.cls_token) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = np.linspace(0, drop_path_rate, depth) + + self.blocks = nn.LayerList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + epsilon=epsilon) for i in range(depth) + ]) + + self.norm = eval(norm_layer)(embed_dim, epsilon=epsilon) + + # Classifier head + self.head = nn.Linear(embed_dim, + class_num) if class_num > 0 else Identity() + + trunc_normal_(self.pos_embed) + trunc_normal_(self.cls_token) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + # B = x.shape[0] + B = x.shape[0] + x = self.patch_embed(x) + cls_tokens = self.cls_token.expand((B, -1, -1)).astype(x.dtype) + x = paddle.concat((cls_tokens, x), axis=1) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks: + x = blk(x) + x = self.norm(x) + return x[:, 0] + + def forward(self, x): + x = 
self.forward_features(x) + x = self.head(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ViT_small_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=8, + num_heads=8, + mlp_ratio=3, + qk_scale=768**-0.5, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_small_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch16_384"], + use_ssld=use_ssld) + return model + + +def ViT_base_patch32_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_base_patch32_384"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch16_224(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch16_224"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch16_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=16, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch16_384"], + use_ssld=use_ssld) + return model + + +def ViT_large_patch32_384(pretrained=False, use_ssld=False, **kwargs): + model = VisionTransformer( + img_size=384, + patch_size=32, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + qkv_bias=True, + epsilon=1e-6, + **kwargs) + _load_pretrained( + pretrained, + model, + MODEL_URLS["ViT_large_patch32_384"], + use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/wideresnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/wideresnet.py new file mode 100644 index 000000000..8efd3220b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/wideresnet.py @@ -0,0 +1,236 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from ..base.theseus_layer import TheseusLayer +""" +backbone option "WideResNet" +code in this file is adpated from 
+https://github.com/kekmodel/FixMatch-pytorch/blob/master/models/wideresnet.py +thanks! +""" + + +def mish(x): + """Mish: A Self Regularized Non-Monotonic Neural Activation Function (https://arxiv.org/abs/1908.08681)""" + return x * paddle.tanh(F.softplus(x)) + + +class PSBatchNorm2D(nn.BatchNorm2D): + """How Does BN Increase Collapsed Neural Network Filters? (https://arxiv.org/abs/2001.11216)""" + + def __init__(self, + num_features, + alpha=0.1, + eps=1e-05, + momentum=0.999, + weight_attr=None, + bias_attr=None): + super().__init__(num_features, momentum, eps, weight_attr, bias_attr) + self.alpha = alpha + + def forward(self, x): + return super().forward(x) + self.alpha + + +class BasicBlock(nn.Layer): + def __init__(self, + in_planes, + out_planes, + stride, + drop_rate=0.0, + activate_before_residual=False): + super(BasicBlock, self).__init__() + self.bn1 = nn.BatchNorm2D(in_planes, momentum=0.999) + self.relu1 = nn.LeakyReLU(negative_slope=0.1) + self.conv1 = nn.Conv2D( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias_attr=False) + self.bn2 = nn.BatchNorm2D(out_planes, momentum=0.999) + self.relu2 = nn.LeakyReLU(negative_slope=0.1) + self.conv2 = nn.Conv2D( + out_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + self.drop_rate = drop_rate + self.equalInOut = (in_planes == out_planes) + self.convShortcut = (not self.equalInOut) and nn.Conv2D( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + padding=0, + bias_attr=False) or None + self.activate_before_residual = activate_before_residual + + def forward(self, x): + if not self.equalInOut and self.activate_before_residual == True: + x = self.relu1(self.bn1(x)) + else: + out = self.relu1(self.bn1(x)) + out = self.relu2(self.bn2(self.conv1(out if self.equalInOut else x))) + if self.drop_rate > 0: + out = F.dropout(out, p=self.drop_rate, training=self.training) + out = self.conv2(out) + return paddle.add(x if self.equalInOut else self.convShortcut(x), out) + + +class NetworkBlock(nn.Layer): + def __init__(self, + nb_layers, + in_planes, + out_planes, + block, + stride, + drop_rate=0.0, + activate_before_residual=False): + super(NetworkBlock, self).__init__() + self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, + stride, drop_rate, + activate_before_residual) + + def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, + drop_rate, activate_before_residual): + layers = [] + for i in range(int(nb_layers)): + layers.append( + block(i == 0 and in_planes or out_planes, out_planes, i == 0 + and stride or 1, drop_rate, activate_before_residual)) + return nn.Sequential(*layers) + + def forward(self, x): + return self.layer(x) + + +class Normalize(nn.Layer): + """ Ln normalization copied from + https://github.com/salesforce/CoMatch + """ + + def __init__(self, power=2): + super(Normalize, self).__init__() + self.power = power + + def forward(self, x): + norm = x.pow(self.power).sum(1, keepdim=True).pow(1. 
/ self.power) + out = x.divide(norm) + return out + + +class Wide_ResNet(TheseusLayer): + def __init__(self, + num_classes, + depth=28, + widen_factor=2, + drop_rate=0.0, + proj=False, + proj_after=False, + low_dim=64): + super(Wide_ResNet, self).__init__() + # prepare self values + self.widen_factor = widen_factor + self.depth = depth + self.drop_rate = drop_rate + # if use projection head + self.proj = proj + # if use the output of projection head for classification + self.proj_after = proj_after + self.low_dim = low_dim + channels = [ + 16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor + ] + assert ((depth - 4) % 6 == 0) + n = (depth - 4) / 6 + block = BasicBlock + # 1st conv before any network block + self.conv1 = nn.Conv2D( + 3, + channels[0], + kernel_size=3, + stride=1, + padding=1, + bias_attr=False) + # 1st block + self.block1 = NetworkBlock( + n, + channels[0], + channels[1], + block, + 1, + drop_rate, + activate_before_residual=True) + # 2nd block + self.block2 = NetworkBlock(n, channels[1], channels[2], block, 2, + drop_rate) + # 3rd block + self.block3 = NetworkBlock(n, channels[2], channels[3], block, 2, + drop_rate) + # global average pooling and classifier + self.bn1 = nn.BatchNorm2D(channels[3], momentum=0.999) + self.relu = nn.LeakyReLU(negative_slope=0.1) + + # if proj after means we classify after projection head + # so we must change the in channel to low_dim of laster fc + if self.proj_after: + self.fc = nn.Linear(self.low_dim, num_classes) + else: + self.fc = nn.Linear(channels[3], num_classes) + self.channels = channels[3] + # projection head + if self.proj: + self.l2norm = Normalize(2) + + self.fc1 = nn.Linear(64 * self.widen_factor, + 64 * self.widen_factor) + self.relu_mlp = nn.LeakyReLU(negative_slope=0.1) + self.fc2 = nn.Linear(64 * self.widen_factor, self.low_dim) + + def forward(self, x): + feat = self.conv1(x) + feat = self.block1(feat) + feat = self.block2(feat) + feat = self.block3(feat) + feat = self.relu(self.bn1(feat)) + feat = F.adaptive_avg_pool2d(feat, 1) + feat = paddle.reshape(feat, [-1, self.channels]) + if self.proj: + pfeat = self.fc1(feat) + pfeat = self.relu_mlp(pfeat) + pfeat = self.fc2(pfeat) + pfeat = self.l2norm(pfeat) + + # if projection after classifiy, we classify last + if self.proj_after: + out = self.fc(pfeat) + else: + out = self.fc(feat) + + return out, pfeat + + # output + out = self.fc(feat) + return out + + +def WideResNet(depth, + widen_factor, + dropout, + num_classes, + proj=False, + low_dim=64, + **kwargs): + return Wide_ResNet( + depth=depth, + widen_factor=widen_factor, + drop_rate=dropout, + num_classes=num_classes, + proj=proj, + low_dim=low_dim, + **kwargs) \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception.py new file mode 100644 index 000000000..7615cd706 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception.py @@ -0,0 +1,393 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1610.02357 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math +import sys + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Xception41": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception41_pretrained.pdparams", + "Xception65": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception65_pretrained.pdparams", + "Xception71": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception71_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class ConvBNLayer(nn.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(name=name + "_weights"), + bias_attr=False) + bn_name = "bn_" + name + self._batch_norm = BatchNorm( + num_filters, + act=act, + param_attr=ParamAttr(name=bn_name + "_scale"), + bias_attr=ParamAttr(name=bn_name + "_offset"), + moving_mean_name=bn_name + '_mean', + moving_variance_name=bn_name + '_variance') + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class SeparableConv(nn.Layer): + def __init__(self, input_channels, output_channels, stride=1, name=None): + super(SeparableConv, self).__init__() + + self._pointwise_conv = ConvBNLayer( + input_channels, output_channels, 1, name=name + "_sep") + self._depthwise_conv = ConvBNLayer( + output_channels, + output_channels, + 3, + stride=stride, + groups=output_channels, + name=name + "_dw") + + def forward(self, inputs): + x = self._pointwise_conv(inputs) + x = self._depthwise_conv(x) + return x + + +class EntryFlowBottleneckBlock(nn.Layer): + def __init__(self, + input_channels, + output_channels, + stride=2, + name=None, + relu_first=False): + super(EntryFlowBottleneckBlock, self).__init__() + self.relu_first = relu_first + + self._short = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=1, + stride=stride, + padding=0, + weight_attr=ParamAttr(name + "_branch1_weights"), + bias_attr=False) + self._conv1 = SeparableConv( + input_channels, + output_channels, + stride=1, + name=name + "_branch2a_weights") + self._conv2 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2b_weights") + self._pool = MaxPool2D(kernel_size=3, stride=stride, padding=1) + + def forward(self, inputs): + conv0 = inputs + short = self._short(inputs) + if self.relu_first: + conv0 = F.relu(conv0) + conv1 = self._conv1(conv0) + conv2 = F.relu(conv1) + conv2 = self._conv2(conv2) + pool = self._pool(conv2) + return paddle.add(x=short, y=pool) + + +class EntryFlow(nn.Layer): + def __init__(self, 
block_num=3): + super(EntryFlow, self).__init__() + + name = "entry_flow" + self.block_num = block_num + self._conv1 = ConvBNLayer( + 3, 32, 3, stride=2, act="relu", name=name + "_conv1") + self._conv2 = ConvBNLayer(32, 64, 3, act="relu", name=name + "_conv2") + if block_num == 3: + self._conv_0 = EntryFlowBottleneckBlock( + 64, 128, stride=2, name=name + "_0", relu_first=False) + self._conv_1 = EntryFlowBottleneckBlock( + 128, 256, stride=2, name=name + "_1", relu_first=True) + self._conv_2 = EntryFlowBottleneckBlock( + 256, 728, stride=2, name=name + "_2", relu_first=True) + elif block_num == 5: + self._conv_0 = EntryFlowBottleneckBlock( + 64, 128, stride=2, name=name + "_0", relu_first=False) + self._conv_1 = EntryFlowBottleneckBlock( + 128, 256, stride=1, name=name + "_1", relu_first=True) + self._conv_2 = EntryFlowBottleneckBlock( + 256, 256, stride=2, name=name + "_2", relu_first=True) + self._conv_3 = EntryFlowBottleneckBlock( + 256, 728, stride=1, name=name + "_3", relu_first=True) + self._conv_4 = EntryFlowBottleneckBlock( + 728, 728, stride=2, name=name + "_4", relu_first=True) + else: + sys.exit(-1) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + + if self.block_num == 3: + x = self._conv_0(x) + x = self._conv_1(x) + x = self._conv_2(x) + elif self.block_num == 5: + x = self._conv_0(x) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._conv_3(x) + x = self._conv_4(x) + return x + + +class MiddleFlowBottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels, name): + super(MiddleFlowBottleneckBlock, self).__init__() + + self._conv_0 = SeparableConv( + input_channels, + output_channels, + stride=1, + name=name + "_branch2a_weights") + self._conv_1 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2b_weights") + self._conv_2 = SeparableConv( + output_channels, + output_channels, + stride=1, + name=name + "_branch2c_weights") + + def forward(self, inputs): + conv0 = F.relu(inputs) + conv0 = self._conv_0(conv0) + conv1 = F.relu(conv0) + conv1 = self._conv_1(conv1) + conv2 = F.relu(conv1) + conv2 = self._conv_2(conv2) + return paddle.add(x=inputs, y=conv2) + + +class MiddleFlow(nn.Layer): + def __init__(self, block_num=8): + super(MiddleFlow, self).__init__() + + self.block_num = block_num + self._conv_0 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_0") + self._conv_1 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_1") + self._conv_2 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_2") + self._conv_3 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_3") + self._conv_4 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_4") + self._conv_5 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_5") + self._conv_6 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_6") + self._conv_7 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_7") + if block_num == 16: + self._conv_8 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_8") + self._conv_9 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_9") + self._conv_10 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_10") + self._conv_11 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_11") + self._conv_12 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_12") + self._conv_13 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_13") + self._conv_14 = MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_14") + self._conv_15 = 
MiddleFlowBottleneckBlock( + 728, 728, name="middle_flow_15") + + def forward(self, inputs): + x = self._conv_0(inputs) + x = self._conv_1(x) + x = self._conv_2(x) + x = self._conv_3(x) + x = self._conv_4(x) + x = self._conv_5(x) + x = self._conv_6(x) + x = self._conv_7(x) + if self.block_num == 16: + x = self._conv_8(x) + x = self._conv_9(x) + x = self._conv_10(x) + x = self._conv_11(x) + x = self._conv_12(x) + x = self._conv_13(x) + x = self._conv_14(x) + x = self._conv_15(x) + return x + + +class ExitFlowBottleneckBlock(nn.Layer): + def __init__(self, input_channels, output_channels1, output_channels2, + name): + super(ExitFlowBottleneckBlock, self).__init__() + + self._short = Conv2D( + in_channels=input_channels, + out_channels=output_channels2, + kernel_size=1, + stride=2, + padding=0, + weight_attr=ParamAttr(name + "_branch1_weights"), + bias_attr=False) + self._conv_1 = SeparableConv( + input_channels, + output_channels1, + stride=1, + name=name + "_branch2a_weights") + self._conv_2 = SeparableConv( + output_channels1, + output_channels2, + stride=1, + name=name + "_branch2b_weights") + self._pool = MaxPool2D(kernel_size=3, stride=2, padding=1) + + def forward(self, inputs): + short = self._short(inputs) + conv0 = F.relu(inputs) + conv1 = self._conv_1(conv0) + conv2 = F.relu(conv1) + conv2 = self._conv_2(conv2) + pool = self._pool(conv2) + return paddle.add(x=short, y=pool) + + +class ExitFlow(nn.Layer): + def __init__(self, class_num): + super(ExitFlow, self).__init__() + + name = "exit_flow" + + self._conv_0 = ExitFlowBottleneckBlock( + 728, 728, 1024, name=name + "_1") + self._conv_1 = SeparableConv(1024, 1536, stride=1, name=name + "_2") + self._conv_2 = SeparableConv(1536, 2048, stride=1, name=name + "_3") + self._pool = AdaptiveAvgPool2D(1) + stdv = 1.0 / math.sqrt(2048 * 1.0) + self._out = Linear( + 2048, + class_num, + weight_attr=ParamAttr( + name="fc_weights", initializer=Uniform(-stdv, stdv)), + bias_attr=ParamAttr(name="fc_offset")) + + def forward(self, inputs): + conv0 = self._conv_0(inputs) + conv1 = self._conv_1(conv0) + conv1 = F.relu(conv1) + conv2 = self._conv_2(conv1) + conv2 = F.relu(conv2) + pool = self._pool(conv2) + pool = paddle.flatten(pool, start_axis=1, stop_axis=-1) + out = self._out(pool) + return out + + +class Xception(nn.Layer): + def __init__(self, + entry_flow_block_num=3, + middle_flow_block_num=8, + class_num=1000): + super(Xception, self).__init__() + self.entry_flow_block_num = entry_flow_block_num + self.middle_flow_block_num = middle_flow_block_num + self._entry_flow = EntryFlow(entry_flow_block_num) + self._middle_flow = MiddleFlow(middle_flow_block_num) + self._exit_flow = ExitFlow(class_num) + + def forward(self, inputs): + x = self._entry_flow(inputs) + x = self._middle_flow(x) + x = self._exit_flow(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." 
+ ) + + +def Xception41(pretrained=False, use_ssld=False, **kwargs): + model = Xception(entry_flow_block_num=3, middle_flow_block_num=8, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception41"], use_ssld=use_ssld) + return model + + +def Xception65(pretrained=False, use_ssld=False, **kwargs): + model = Xception( + entry_flow_block_num=3, middle_flow_block_num=16, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception65"], use_ssld=use_ssld) + return model + + +def Xception71(pretrained=False, use_ssld=False, **kwargs): + model = Xception( + entry_flow_block_num=5, middle_flow_block_num=16, **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception71"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception_deeplab.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception_deeplab.py new file mode 100644 index 000000000..f5a7fa529 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/model_zoo/xception_deeplab.py @@ -0,0 +1,423 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1706.05587 + +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Conv2D, BatchNorm, Linear, Dropout +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D + +from ....utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "Xception41_deeplab": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception41_deeplab_pretrained.pdparams", + "Xception65_deeplab": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/Xception65_deeplab_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def check_data(data, number): + if type(data) == int: + return [data] * number + assert len(data) == number + return data + + +def check_stride(s, os): + if s <= os: + return True + else: + return False + + +def check_points(count, points): + if points is None: + return False + else: + if isinstance(points, list): + return (True if count in points else False) + else: + return (True if count == points else False) + + +def gen_bottleneck_params(backbone='xception_65'): + if backbone == 'xception_65': + bottleneck_params = { + "entry_flow": (3, [2, 2, 2], [128, 256, 728]), + "middle_flow": (16, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + elif backbone == 'xception_41': + bottleneck_params = { + "entry_flow": (3, [2, 2, 2], [128, 256, 728]), + "middle_flow": (8, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + elif backbone == 'xception_71': + bottleneck_params = { + "entry_flow": (5, [2, 1, 2, 1, 2], [128, 256, 256, 728, 728]), + "middle_flow": (16, 1, 728), + "exit_flow": (2, [2, 1], [[728, 1024, 1024], [1536, 1536, 2048]]) + } + else: + raise Exception( + "xception backbont 
only support xception_41/xception_65/xception_71" + ) + return bottleneck_params + + +class ConvBNLayer(nn.Layer): + def __init__(self, + input_channels, + output_channels, + filter_size, + stride=1, + padding=0, + act=None, + name=None): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=input_channels, + out_channels=output_channels, + kernel_size=filter_size, + stride=stride, + padding=padding, + weight_attr=ParamAttr(name=name + "/weights"), + bias_attr=False) + self._bn = BatchNorm( + num_channels=output_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/BatchNorm/beta"), + moving_mean_name=name + "/BatchNorm/moving_mean", + moving_variance_name=name + "/BatchNorm/moving_variance") + + def forward(self, inputs): + return self._bn(self._conv(inputs)) + + +class Seperate_Conv(nn.Layer): + def __init__(self, + input_channels, + output_channels, + stride, + filter, + dilation=1, + act=None, + name=None): + super(Seperate_Conv, self).__init__() + + self._conv1 = Conv2D( + in_channels=input_channels, + out_channels=input_channels, + kernel_size=filter, + stride=stride, + groups=input_channels, + padding=(filter) // 2 * dilation, + dilation=dilation, + weight_attr=ParamAttr(name=name + "/depthwise/weights"), + bias_attr=False) + self._bn1 = BatchNorm( + input_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/depthwise/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/depthwise/BatchNorm/beta"), + moving_mean_name=name + "/depthwise/BatchNorm/moving_mean", + moving_variance_name=name + "/depthwise/BatchNorm/moving_variance") + self._conv2 = Conv2D( + input_channels, + output_channels, + 1, + stride=1, + groups=1, + padding=0, + weight_attr=ParamAttr(name=name + "/pointwise/weights"), + bias_attr=False) + self._bn2 = BatchNorm( + output_channels, + act=act, + epsilon=1e-3, + momentum=0.99, + param_attr=ParamAttr(name=name + "/pointwise/BatchNorm/gamma"), + bias_attr=ParamAttr(name=name + "/pointwise/BatchNorm/beta"), + moving_mean_name=name + "/pointwise/BatchNorm/moving_mean", + moving_variance_name=name + "/pointwise/BatchNorm/moving_variance") + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._bn1(x) + x = self._conv2(x) + x = self._bn2(x) + return x + + +class Xception_Block(nn.Layer): + def __init__(self, + input_channels, + output_channels, + strides=1, + filter_size=3, + dilation=1, + skip_conv=True, + has_skip=True, + activation_fn_in_separable_conv=False, + name=None): + super(Xception_Block, self).__init__() + + repeat_number = 3 + output_channels = check_data(output_channels, repeat_number) + filter_size = check_data(filter_size, repeat_number) + strides = check_data(strides, repeat_number) + + self.has_skip = has_skip + self.skip_conv = skip_conv + self.activation_fn_in_separable_conv = activation_fn_in_separable_conv + if not activation_fn_in_separable_conv: + self._conv1 = Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + dilation=dilation, + name=name + "/separable_conv3") + else: + self._conv1 = 
Seperate_Conv( + input_channels, + output_channels[0], + stride=strides[0], + filter=filter_size[0], + act="relu", + dilation=dilation, + name=name + "/separable_conv1") + self._conv2 = Seperate_Conv( + output_channels[0], + output_channels[1], + stride=strides[1], + filter=filter_size[1], + act="relu", + dilation=dilation, + name=name + "/separable_conv2") + self._conv3 = Seperate_Conv( + output_channels[1], + output_channels[2], + stride=strides[2], + filter=filter_size[2], + act="relu", + dilation=dilation, + name=name + "/separable_conv3") + + if has_skip and skip_conv: + self._short = ConvBNLayer( + input_channels, + output_channels[-1], + 1, + stride=strides[-1], + padding=0, + name=name + "/shortcut") + + def forward(self, inputs): + if not self.activation_fn_in_separable_conv: + x = F.relu(inputs) + x = self._conv1(x) + x = F.relu(x) + x = self._conv2(x) + x = F.relu(x) + x = self._conv3(x) + else: + x = self._conv1(inputs) + x = self._conv2(x) + x = self._conv3(x) + if self.has_skip: + if self.skip_conv: + skip = self._short(inputs) + else: + skip = inputs + return paddle.add(x, skip) + else: + return x + + +class XceptionDeeplab(nn.Layer): + def __init__(self, backbone, class_num=1000): + super(XceptionDeeplab, self).__init__() + + bottleneck_params = gen_bottleneck_params(backbone) + self.backbone = backbone + + self._conv1 = ConvBNLayer( + 3, + 32, + 3, + stride=2, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv1") + self._conv2 = ConvBNLayer( + 32, + 64, + 3, + stride=1, + padding=1, + act="relu", + name=self.backbone + "/entry_flow/conv2") + + self.block_num = bottleneck_params["entry_flow"][0] + self.strides = bottleneck_params["entry_flow"][1] + self.chns = bottleneck_params["entry_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + + self.entry_flow = [] + self.middle_flow = [] + + self.stride = 2 + self.output_stride = 32 + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/entry_flow/block" + str(i + 1), + Xception_Block( + input_channels=64 if i == 0 else self.chns[i - 1], + output_channels=self.chns[i], + strides=[1, 1, self.stride], + name=self.backbone + "/entry_flow/block" + str(i + 1))) + self.entry_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["middle_flow"][0] + self.strides = bottleneck_params["middle_flow"][1] + self.chns = bottleneck_params["middle_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + + for i in range(self.block_num): + stride = self.strides[i] if check_stride(s * self.strides[i], + self.output_stride) else 1 + xception_block = self.add_sublayer( + self.backbone + "/middle_flow/block" + str(i + 1), + Xception_Block( + input_channels=728, + output_channels=728, + strides=[1, 1, self.strides[i]], + skip_conv=False, + name=self.backbone + "/middle_flow/block" + str(i + 1))) + self.middle_flow.append(xception_block) + s = s * stride + self.stride = s + + self.block_num = bottleneck_params["exit_flow"][0] + self.strides = bottleneck_params["exit_flow"][1] + self.chns = bottleneck_params["exit_flow"][2] + self.strides = check_data(self.strides, self.block_num) + self.chns = check_data(self.chns, self.block_num) + s = self.stride + stride = self.strides[0] if check_stride(s 
* self.strides[0], + self.output_stride) else 1 + self._exit_flow_1 = Xception_Block( + 728, + self.chns[0], [1, 1, stride], + name=self.backbone + "/exit_flow/block1") + s = s * stride + stride = self.strides[1] if check_stride(s * self.strides[1], + self.output_stride) else 1 + self._exit_flow_2 = Xception_Block( + self.chns[0][-1], + self.chns[1], [1, 1, stride], + dilation=2, + has_skip=False, + activation_fn_in_separable_conv=True, + name=self.backbone + "/exit_flow/block2") + s = s * stride + + self.stride = s + + self._drop = Dropout(p=0.5, mode="downscale_in_infer") + self._pool = AdaptiveAvgPool2D(1) + self._fc = Linear( + self.chns[1][-1], + class_num, + weight_attr=ParamAttr(name="fc_weights"), + bias_attr=ParamAttr(name="fc_bias")) + + def forward(self, inputs): + x = self._conv1(inputs) + x = self._conv2(x) + for ef in self.entry_flow: + x = ef(x) + for mf in self.middle_flow: + x = mf(x) + x = self._exit_flow_1(x) + x = self._exit_flow_2(x) + x = self._drop(x) + x = self._pool(x) + x = paddle.squeeze(x, axis=[2, 3]) + x = self._fc(x) + return x + + +def _load_pretrained(pretrained, model, model_url, use_ssld=False): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def Xception41_deeplab(pretrained=False, use_ssld=False, **kwargs): + model = XceptionDeeplab('xception_41', **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception41_deeplab"], use_ssld=use_ssld) + return model + + +def Xception65_deeplab(pretrained=False, use_ssld=False, **kwargs): + model = XceptionDeeplab("xception_65", **kwargs) + _load_pretrained( + pretrained, model, MODEL_URLS["Xception65_deeplab"], use_ssld=use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/__init__.py new file mode 100644 index 000000000..4a2716208 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/__init__.py @@ -0,0 +1,5 @@ +from .resnet_variant import ResNet50_last_stage_stride1, ResNet50_metabin +from .vgg_variant import VGG19Sigmoid +from .pp_lcnet_variant import PPLCNet_x2_5_Tanh +from .pp_lcnetv2_variant import PPLCNetV2_base_ShiTu +from .swin_transformer_variant import SwinTransformer_base_patch4_window7_224_SOLIDER,SwinTransformer_small_patch4_window7_224_SOLIDER,SwinTransformer_tiny_patch4_window7_224_SOLIDER diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/efficientnet_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/efficientnet_variant.py new file mode 100644 index 000000000..86701d4b4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/efficientnet_variant.py @@ -0,0 +1,44 @@ +import paddle +import paddle.nn as nn +from ..model_zoo.efficientnet import EfficientNetB3, _load_pretrained + +MODEL_URLS = { + "EfficientNetB3_watermark": + "https://paddleclas.bj.bcebos.com/models/practical/pretrained/EfficientNetB3_watermark_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +def EfficientNetB3_watermark(padding_type='DYNAMIC', + override_params={"batch_norm_epsilon": 0.00001}, + use_se=True, + 
pretrained=False, + use_ssld=False, + **kwargs): + def replace_function(_fc, pattern): + classifier = nn.Sequential( + # 1536 is the orginal in_features + nn.Linear( + in_features=1536, out_features=625), + nn.ReLU(), + nn.Dropout(p=0.3), + nn.Linear( + in_features=625, out_features=256), + nn.ReLU(), + nn.Linear( + in_features=256, out_features=2), ) + return classifier + + pattern = "_fc" + model = EfficientNetB3( + padding_type=padding_type, + override_params=override_params, + use_se=True, + pretrained=False, + use_ssld=False, + **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["EfficientNetB3_watermark"], + use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/foundation_vit_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/foundation_vit_variant.py new file mode 100644 index 000000000..7f79dcaf3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/foundation_vit_variant.py @@ -0,0 +1,52 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..model_zoo.foundation_vit import CLIP_vit_large_patch14_224, _load_pretrained + +MODEL_URLS = { + "CLIP_large_patch14_224_aesthetic": + "https://paddleclas.bj.bcebos.com/models/practical/pretrained/CLIP_large_patch14_224_aesthetic_pretrained.pdparams" +} + +__all__ = list(MODEL_URLS.keys()) + + +class MLP(nn.Layer): + def __init__(self, input_size): + super().__init__() + self.input_size = input_size + self.layers = nn.Sequential( + nn.Linear(self.input_size, 1024), + nn.Dropout(0.2), + nn.Linear(1024, 128), + nn.Dropout(0.2), + nn.Linear(128, 64), + nn.Dropout(0.1), nn.Linear(64, 16), nn.Linear(16, 1)) + + def forward(self, x): + return self.layers(x) + + +class Aesthetic_Score_Predictor(nn.Layer): + def __init__(self): + super().__init__() + self.model = CLIP_vit_large_patch14_224() + self.fc_head = nn.Linear(1024, 768, bias_attr=False) + self.mlp = MLP(768) + + def forward(self, x): + x = self.model(x) + x = x[:, 0, :] + x = self.fc_head(x) + x = F.normalize(x, p=2, axis=-1) + x = self.mlp(x) + return x + + +def CLIP_large_patch14_224_aesthetic(pretrained=False, + use_ssld=False, + **kwargs): + model = Aesthetic_Score_Predictor() + _load_pretrained(pretrained, model, + MODEL_URLS["CLIP_large_patch14_224_aesthetic"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnet_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnet_variant.py new file mode 100644 index 000000000..e4c25c4c6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnet_variant.py @@ -0,0 +1,29 @@ +import paddle +from paddle.nn import Sigmoid +from paddle.nn import Tanh +from ..legendary_models.pp_lcnet import PPLCNet_x2_5 + +__all__ = ["PPLCNet_x2_5_Tanh"] + + +class TanhSuffix(paddle.nn.Layer): + def __init__(self, origin_layer): + super(TanhSuffix, self).__init__() + self.origin_layer = origin_layer + self.tanh = Tanh() + + def forward(self, input, res_dict=None, **kwargs): + x = self.origin_layer(input) + x = self.tanh(x) + return x + + +def PPLCNet_x2_5_Tanh(pretrained=False, use_ssld=False, **kwargs): + def replace_function(origin_layer, pattern): + new_layer = TanhSuffix(origin_layer) + return new_layer + + pattern = "fc" + model = PPLCNet_x2_5(pretrained=pretrained, 
use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnetv2_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnetv2_variant.py new file mode 100644 index 000000000..6acccdc8e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/pp_lcnetv2_variant.py @@ -0,0 +1,56 @@ +from paddle.nn import Conv2D, Identity + +from ..legendary_models.pp_lcnet_v2 import MODEL_URLS, PPLCNetV2_base, RepDepthwiseSeparable, _load_pretrained + +__all__ = ["PPLCNetV2_base_ShiTu"] + + +def PPLCNetV2_base_ShiTu(pretrained=False, use_ssld=False, **kwargs): + """ + An variant network of PPLCNetV2_base + 1. remove ReLU layer after last_conv + 2. add bias to last_conv + 3. change stride to 1 in last two RepDepthwiseSeparable Block + """ + model = PPLCNetV2_base(pretrained=False, use_ssld=use_ssld, **kwargs) + + def remove_ReLU_function(conv, pattern): + new_conv = Identity() + return new_conv + + def add_bias_last_conv(conv, pattern): + new_conv = Conv2D( + in_channels=conv._in_channels, + out_channels=conv._out_channels, + kernel_size=conv._kernel_size, + stride=conv._stride, + padding=conv._padding, + groups=conv._groups, + bias_attr=True) + return new_conv + + def last_stride_function(rep_block, pattern): + new_conv = RepDepthwiseSeparable( + in_channels=rep_block.in_channels, + out_channels=rep_block.out_channels, + stride=1, + dw_size=rep_block.dw_size, + split_pw=rep_block.split_pw, + use_rep=rep_block.use_rep, + use_se=rep_block.use_se, + use_shortcut=rep_block.use_shortcut) + return new_conv + + pattern_act = ["act"] + pattern_lastconv = ["last_conv"] + pattern_last_stride = [ + "stages[3][0]", + "stages[3][1]", + ] + model.upgrade_sublayer(pattern_act, remove_ReLU_function) + model.upgrade_sublayer(pattern_lastconv, add_bias_last_conv) + model.upgrade_sublayer(pattern_last_stride, last_stride_function) + + # load params again after upgrade some layers + _load_pretrained(pretrained, model, MODEL_URLS["PPLCNetV2_base"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/resnet_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/resnet_variant.py new file mode 100644 index 000000000..3569cd206 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/resnet_variant.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
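+# ResNet50 variants built by patching sublayers of the stock ResNet50:
+#   ResNet50_last_stage_stride1  - sets stride 1 in the blocks[13] conv/shortcut of the last stage
+#   ResNet50_adaptive_max_pool2d - swaps the global avg_pool for AdaptiveMaxPool2D
+#   ResNet50_metabin             - replaces all BN layers with MetaBIN (gated batch/instance norm,
+#                                  https://arxiv.org/abs/2011.14670)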
+ +from collections import defaultdict +import copy +import paddle +from paddle import nn +from paddle.nn import functional as F +from ..legendary_models.resnet import ResNet50, MODEL_URLS, _load_pretrained + +__all__ = [ + "ResNet50_last_stage_stride1", "ResNet50_adaptive_max_pool2d", + 'ResNet50_metabin' +] + + +def ResNet50_last_stage_stride1(pretrained=False, use_ssld=False, **kwargs): + def replace_function(conv, pattern): + new_conv = nn.Conv2D( + in_channels=conv._in_channels, + out_channels=conv._out_channels, + kernel_size=conv._kernel_size, + stride=1, + padding=conv._padding, + groups=conv._groups, + bias_attr=conv._bias_attr) + return new_conv + + pattern = ["blocks[13].conv1.conv", "blocks[13].short.conv"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_adaptive_max_pool2d(pretrained=False, use_ssld=False, **kwargs): + def replace_function(pool, pattern): + new_pool = nn.AdaptiveMaxPool2D(output_size=1) + return new_pool + + pattern = ["avg_pool"] + model = ResNet50(pretrained=False, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_metabin(pretrained=False, + use_ssld=False, + bias_lr_factor=1.0, + **kwargs): + """ + ResNet50 which replaces all `bn` layers with MetaBIN + reference: https://arxiv.org/abs/2011.14670 + """ + + class BINGate(nn.Layer): + def __init__(self, num_features): + super().__init__() + self.gate = self.create_parameter( + shape=[num_features], + default_initializer=nn.initializer.Constant(1.0)) + self.add_parameter("gate", self.gate) + + def forward(self, opt={}): + flag_update = 'lr_gate' in opt and \ + opt.get('enable_inside_update', False) + if flag_update and self.gate.grad is not None: # update gate + lr = opt['lr_gate'] * self.gate.optimize_attr.get( + 'learning_rate', 1.0) + gate = self.gate - lr * self.gate.grad + gate.clip_(min=0, max=1) + else: + gate = self.gate + return gate + + def clip_gate(self): + self.gate.set_value(self.gate.clip(0, 1)) + + class MetaBN(nn.BatchNorm2D): + def forward(self, inputs, opt={}): + mode = opt.get("bn_mode", "general") if self.training else "eval" + if mode == "general": # update, but not apply running_mean/var + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, self.training, + self._momentum, self._epsilon) + elif mode == "hold": # not update, not apply running_mean/var + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=(0, 2, 3)), + paddle.var(inputs, axis=(0, 2, 3)), + self.weight, + self.bias, + self.training, + self._momentum, + self._epsilon) + elif mode == "eval": # fix and apply running_mean/var, + if self._mean is None: + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=(0, 2, 3)), + paddle.var(inputs, axis=(0, 2, 3)), + self.weight, + self.bias, + True, + self._momentum, + self._epsilon) + else: + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, False, + self._momentum, self._epsilon) + return result + + class MetaBIN(nn.Layer): + """ + MetaBIN (Meta Batch-Instance Normalization) + reference: https://arxiv.org/abs/2011.14670 + """ + + def __init__(self, num_features): + super().__init__() + self.batch_norm = MetaBN( + num_features=num_features, use_global_stats=True) + self.instance_norm = 
nn.InstanceNorm2D(num_features=num_features) + self.gate = BINGate(num_features=num_features) + self.opt = defaultdict() + + def forward(self, inputs): + out_bn = self.batch_norm(inputs, self.opt) + out_in = self.instance_norm(inputs) + gate = self.gate(self.opt) + gate = gate.unsqueeze([0, -1, -1]) + out = out_bn * gate + out_in * (1 - gate) + return out + + def reset_opt(self): + self.opt = defaultdict() + + def setup_opt(self, opt): + """ + Arg: + opt (dict): Optional setting to change the behavior of MetaBIN during training. + It includes three settings which are `enable_inside_update`, `lr_gate` and `bn_mode`. + """ + self.check_opt(opt) + self.opt = copy.deepcopy(opt) + + @classmethod + def check_opt(cls, opt): + assert isinstance(opt, dict), \ + TypeError('Got the wrong type of `opt`. Please use `dict` type.') + + if opt.get('enable_inside_update', False) and 'lr_gate' not in opt: + raise RuntimeError('Missing `lr_gate` in opt.') + + assert isinstance(opt.get('lr_gate', 1.0), float), \ + TypeError('Got the wrong type of `lr_gate`. Please use `float` type.') + assert isinstance(opt.get('enable_inside_update', True), bool), \ + TypeError('Got the wrong type of `enable_inside_update`. Please use `bool` type.') + assert opt.get('bn_mode', "general") in ["general", "hold", "eval"], \ + TypeError('Got the wrong value of `bn_mode`.') + + def bn2metabin(bn, pattern): + metabin = MetaBIN(bn.weight.shape[0]) + metabin.batch_norm.weight.set_value(bn.weight) + metabin.batch_norm.bias.set_value(bn.bias) + metabin.batch_norm._variance.set_value(bn._variance) + metabin.batch_norm._mean.set_value(bn._mean) + return metabin + + def setup_optimize_attr(model, bias_lr_factor): + for name, params in model.named_parameters(): + if params.stop_gradient: + continue + if "bias" in name: + params.optimize_attr['learning_rate'] = bias_lr_factor + + pattern = [] + pattern.extend(["blocks[{}].conv{}.bn".format(i, j) \ + for i in range(16) for j in range(3)]) + pattern.extend(["blocks[{}].short.bn".format(i) for i in [0, 3, 7, 13]]) + pattern.append("stem[0].bn") + + model = ResNet50_last_stage_stride1( + pretrained=pretrained, use_ssld=use_ssld, **kwargs) + + model.upgrade_sublayer(pattern, bn2metabin) + setup_optimize_attr(model=model, bias_lr_factor=bias_lr_factor) + + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/swin_transformer_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/swin_transformer_variant.py new file mode 100644 index 000000000..1e6632f6e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/swin_transformer_variant.py @@ -0,0 +1,355 @@ +import numpy as np +import paddle +import paddle.nn as nn +from ..legendary_models.swin_transformer import SwinTransformer, _load_pretrained, \ + PatchEmbed, BasicLayer, SwinTransformerBlock + +MODEL_URLS_SOLIDER = { + "SwinTransformer_tiny_patch4_window7_224_SOLIDER": + 'https://paddleclas.bj.bcebos.com/models/SOLIDER/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams', + "SwinTransformer_small_patch4_window7_224_SOLIDER": + 'https://paddleclas.bj.bcebos.com/models/SOLIDER/SwinTransformer_small_patch4_window7_224_pretrained.pdparams', + "SwinTransformer_base_patch4_window7_224_SOLIDER": + 'https://paddleclas.bj.bcebos.com/models/SOLIDER/SwinTransformer_base_patch4_window7_224_pretrained.pdparams' +} + +__all__ = list(MODEL_URLS_SOLIDER.keys()) + + +class PatchEmbed_SOLIDER(PatchEmbed): + def forward(self, 
x): + x = self.proj(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose([0, 2, 1]) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class SwinTransformerBlock_SOLIDER(SwinTransformerBlock): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, + dim, + input_resolution, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm): + super(SwinTransformerBlock_SOLIDER, self).__init__( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=shift_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + act_layer=act_layer, + norm_layer=norm_layer, + ) + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + self.check_condition() + + def check_condition(self): + if min(self.input_resolution) < self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + +class BasicLayer_SOLIDER(BasicLayer): + def __init__(self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4., + qkv_bias=True, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False): + + super(BasicLayer_SOLIDER, self).__init__( + dim=dim, + input_resolution=input_resolution, + depth=depth, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path, + norm_layer=norm_layer, + downsample=downsample, + use_checkpoint=use_checkpoint + ) + # build blocks + self.blocks = nn.LayerList([ + SwinTransformerBlock_SOLIDER( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) for i in range(depth) + ]) + + def forward(self, x): + for blk in self.blocks: + x = blk(x) + + if self.downsample is not None: + x_down = self.downsample(x) + return x_down, x + else: + return x, x + + +class PatchMerging_SOLIDER(nn.Layer): + r""" Patch 
Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.sampler = nn.Unfold(kernel_sizes=2, strides=2) + self.norm = norm_layer(4 * dim) + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, "x size ({}*{}) are not even.".format( + H, W) + + x = x.reshape([B, H, W, C]).transpose([0, 3, 1, 2]) + + x = self.sampler(x) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + x = self.reduction(x) + return x + + +class SwinTransformer_SOLIDER(SwinTransformer): + def __init__(self, + embed_dim=96, + img_size=224, + patch_size=4, + in_chans=3, + class_num=1000, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer=nn.LayerNorm, + out_indices=(0, 1, 2, 3), + semantic_weight=1.0, + use_checkpoint=False, + **kwargs): + super(SwinTransformer_SOLIDER, self).__init__() + patches_resolution = self.patch_embed.patches_resolution + self.num_classes = num_classes = class_num + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + # stochastic depth + dpr = np.linspace(0, drop_path_rate, + sum(depths)).tolist() # stochastic depth decay rule + self.patch_embed = PatchEmbed_SOLIDER( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + self.out_indices = out_indices + # build layers + self.layers = nn.LayerList() + for i_layer in range(self.num_layers): + layer = BasicLayer_SOLIDER( + dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging_SOLIDER + if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.num_features_s = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + for i in out_indices: + layer = norm_layer(self.num_features_s[i]) + layer_name = f'norm{i}' + self.add_sublayer(layer_name, layer) + self.avgpool = nn.AdaptiveAvgPool2D(1) + + # semantic embedding + self.semantic_weight = semantic_weight + if self.semantic_weight >= 0: + self.semantic_embed_w = nn.LayerList() + self.semantic_embed_b = nn.LayerList() + for i in range(len(depths)): + if i >= len(depths) - 1: + i = len(depths) - 2 + semantic_embed_w = nn.Linear(2, self.num_features_s[i + 1]) + semantic_embed_b = nn.Linear(2, self.num_features_s[i + 1]) + self._init_weights(semantic_embed_w) + self._init_weights(semantic_embed_b) + self.semantic_embed_w.append(semantic_embed_w) + self.semantic_embed_b.append(semantic_embed_b) + self.softplus = nn.Softplus() + self.head = nn.Linear( + self.num_features, + num_classes) if self.num_classes > 0 else nn.Identity() + + 
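+    # SOLIDER semantic-weight conditioning: in forward_features a 2-d weight [w, 1 - w]
+    # is embedded per stage into a scale (softplus) and a bias that modulate the features
+    # handed to the next stage.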
def forward_features(self, x, semantic_weight=None): + if self.semantic_weight >= 0 and semantic_weight is None: + w = paddle.ones((x.shape[0], 1)) * self.semantic_weight + w = paddle.concat([w, 1 - w], axis=-1) + semantic_weight = w.cuda() + x, hw_shape = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + outs = [] + + for i, layer in enumerate(self.layers): + x, out = layer(x) + if self.semantic_weight >= 0: + sw = self.semantic_embed_w[i](semantic_weight).unsqueeze(1) + sb = self.semantic_embed_b[i](semantic_weight).unsqueeze(1) + x = x * self.softplus(sw) + sb + if i in self.out_indices: + norm_layer = getattr(self, f'norm{i}') + out = norm_layer(out) + out = out.reshape([-1, *hw_shape, + self.num_features_s[i]]).transpose([0, 3, 1, 2]) + hw_shape = [item // 2 for item in hw_shape] + outs.append(out) + + x = self.avgpool(outs[-1]) # B C 1 + x = paddle.flatten(x, 1) + + return x + + +def SwinTransformer_tiny_patch4_window7_224_SOLIDER( + pretrained=False, + **kwargs): + model = SwinTransformer_SOLIDER( + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.2, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.1 + **kwargs) + _load_pretrained( + pretrained, + model=model, + model_url=MODEL_URLS_SOLIDER["SwinTransformer_tiny_patch4_window7_224_SOLIDER"], + **kwargs) + return model + + +def SwinTransformer_small_patch4_window7_224_SOLIDER( + pretrained=False, + **kwargs): + model = SwinTransformer_SOLIDER( + embed_dim=96, + depths=[2, 2, 18, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + drop_path_rate=0.3, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model=model, + model_url=MODEL_URLS_SOLIDER["SwinTransformer_small_patch4_window7_224_SOLIDER"], + **kwargs) + return model + + +def SwinTransformer_base_patch4_window7_224_SOLIDER( + pretrained=False, + **kwargs): + model = SwinTransformer_SOLIDER( + embed_dim=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=7, + drop_path_rate=0.5, # if imagenet22k or imagenet22kto1k, set drop_path_rate=0.2 + **kwargs) + _load_pretrained( + pretrained, + model=model, + model_url=MODEL_URLS_SOLIDER["SwinTransformer_base_patch4_window7_224_SOLIDER"], + **kwargs) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/vgg_variant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/vgg_variant.py new file mode 100644 index 000000000..36ce47768 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/backbone/variant_models/vgg_variant.py @@ -0,0 +1,28 @@ +import paddle +from paddle.nn import Sigmoid +from ..legendary_models.vgg import VGG19 + +__all__ = ["VGG19Sigmoid"] + + +class SigmoidSuffix(paddle.nn.Layer): + def __init__(self, origin_layer): + super().__init__() + self.origin_layer = origin_layer + self.sigmoid = Sigmoid() + + def forward(self, input, res_dict=None, **kwargs): + x = self.origin_layer(input) + x = self.sigmoid(x) + return x + + +def VGG19Sigmoid(pretrained=False, use_ssld=False, **kwargs): + def replace_function(origin_layer, pattern): + new_layer = SigmoidSuffix(origin_layer) + return new_layer + + pattern = "fc2" + model = VGG19(pretrained=pretrained, use_ssld=use_ssld, **kwargs) + model.upgrade_sublayer(pattern, replace_function) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/distill/afd_attention.py 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/distill/afd_attention.py new file mode 100644 index 000000000..63b094f31 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/distill/afd_attention.py @@ -0,0 +1,123 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F +import paddle +import numpy as np + + +class LinearBNReLU(nn.Layer): + def __init__(self, nin, nout): + super().__init__() + self.linear = nn.Linear(nin, nout) + self.bn = nn.BatchNorm1D(nout) + self.relu = nn.ReLU() + + def forward(self, x, relu=True): + if relu: + return self.relu(self.bn(self.linear(x))) + return self.bn(self.linear(x)) + + +def unique_shape(s_shapes): + n_s = [] + unique_shapes = [] + n = -1 + for s_shape in s_shapes: + if s_shape not in unique_shapes: + unique_shapes.append(s_shape) + n += 1 + n_s.append(n) + return n_s, unique_shapes + + +class LinearTransformTeacher(nn.Layer): + def __init__(self, qk_dim, t_shapes, keys): + super().__init__() + self.teacher_keys = keys + self.t_shapes = [[1] + t_i for t_i in t_shapes] + self.query_layer = nn.LayerList( + [LinearBNReLU(t_shape[1], qk_dim) for t_shape in self.t_shapes]) + + def forward(self, t_features_dict): + g_t = [t_features_dict[key] for key in self.teacher_keys] + bs = g_t[0].shape[0] + channel_mean = [f_t.mean(3).mean(2) for f_t in g_t] + spatial_mean = [] + for i in range(len(g_t)): + c, h, w = g_t[i].shape[1:] + spatial_mean.append(g_t[i].pow(2).mean(1).reshape([bs, h * w])) + query = paddle.stack( + [ + query_layer( + f_t, relu=False) + for f_t, query_layer in zip(channel_mean, self.query_layer) + ], + axis=1) + value = [F.normalize(f_s, axis=1) for f_s in spatial_mean] + return {"query": query, "value": value} + + +class LinearTransformStudent(nn.Layer): + def __init__(self, qk_dim, t_shapes, s_shapes, keys): + super().__init__() + self.student_keys = keys + self.t_shapes = [[1] + t_i for t_i in t_shapes] + self.s_shapes = [[1] + s_i for s_i in s_shapes] + self.t = len(self.t_shapes) + self.s = len(self.s_shapes) + self.qk_dim = qk_dim + self.n_t, self.unique_t_shapes = unique_shape(self.t_shapes) + self.relu = nn.ReLU() + self.samplers = nn.LayerList( + [Sample(t_shape) for t_shape in self.unique_t_shapes]) + self.key_layer = nn.LayerList([ + LinearBNReLU(s_shape[1], self.qk_dim) for s_shape in self.s_shapes + ]) + self.bilinear = LinearBNReLU(qk_dim, qk_dim * len(self.t_shapes)) + + def forward(self, s_features_dict): + g_s = [s_features_dict[key] for key in self.student_keys] + bs = g_s[0].shape[0] + channel_mean = [f_s.mean(3).mean(2) for f_s in g_s] + spatial_mean = [sampler(g_s, bs) for sampler in self.samplers] + + key = paddle.stack( + [ + key_layer(f_s) + for key_layer, f_s in zip(self.key_layer, channel_mean) + ], + axis=1).reshape([-1, self.qk_dim]) # Bs x h + bilinear_key = self.bilinear( + key, relu=False).reshape([bs, self.s, self.t, self.qk_dim]) + value = [F.normalize(s_m, axis=2) for s_m in 
spatial_mean] + return {"bilinear_key": bilinear_key, "value": value} + + +class Sample(nn.Layer): + def __init__(self, t_shape): + super().__init__() + self.t_N, self.t_C, self.t_H, self.t_W = t_shape + self.sample = nn.AdaptiveAvgPool2D((self.t_H, self.t_W)) + + def forward(self, g_s, bs): + g_s = paddle.stack( + [ + self.sample(f_s.pow(2).mean( + 1, keepdim=True)).reshape([bs, self.t_H * self.t_W]) + for f_s in g_s + ], + axis=1) + return g_s diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/__init__.py new file mode 100644 index 000000000..b2a6f46a0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .arcmargin import ArcMargin +from .cosmargin import CosMargin +from .circlemargin import CircleMargin +from .fc import FC +from .vehicle_neck import VehicleNeck +from paddle.nn import Tanh, Identity +from .bnneck import BNNeck +from .adamargin import AdaMargin +from .frfn_neck import FRFNNeck +from .metabnneck import MetaBNNeck +from .ml_decoder import MLDecoder + +__all__ = ['build_gear', 'add_ml_decoder_head'] + + +def build_gear(config): + support_dict = [ + 'ArcMargin', 'CosMargin', 'CircleMargin', 'FC', 'VehicleNeck', 'Tanh', + 'BNNeck', 'AdaMargin', 'FRFNNeck', 'MetaBNNeck' + ] + module_name = config.pop('name') + assert module_name in support_dict, Exception( + 'head only support {}'.format(support_dict)) + module_class = eval(module_name)(**config) + return module_class + + +def add_ml_decoder_head(model, config): + if 'class_num' not in config: + if hasattr(model, 'class_num'): + config['class_num'] = model.class_num + else: + raise AttributeError( + 'Please manually add parameter `class_num` ' + 'for MLDecoder in the config file.') + + # remove_layers: list of layer names that need to be deleted from backbone + if 'remove_layers' in config: + remove_layers = config.pop('remove_layers') + else: + remove_layers = ['avg_pool', 'flatten'] + for remove_layer in remove_layers: + if hasattr(model, remove_layer): + delattr(model, remove_layer) + setattr(model, remove_layer, Identity()) + else: + raise AttributeError( + f"{remove_layer} does not have attribute the model.") + + # replace_layer: layer name that need to be replaced in backbone + if 'replace_layer' in config: + replace_layer = config.pop('replace_layer') + else: + replace_layer = 'fc' + if hasattr(model, replace_layer): + delattr(model, replace_layer) + setattr(model, replace_layer, MLDecoder(**config)) + else: + raise AttributeError( + f"{replace_layer} does not have attribute the model.") diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/adamargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/adamargin.py new file mode 100644 index 000000000..98e963341 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/adamargin.py @@ -0,0 +1,113 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on AdaFace(https://github.com/mk-minchul/AdaFace) +# Paper: AdaFace: Quality Adaptive Margin for Face Recognition +from paddle.nn import Layer +import math +import paddle + + +def l2_norm(input, axis=1): + norm = paddle.norm(input, 2, axis, True) + output = paddle.divide(input, norm) + return output + + +class AdaMargin(Layer): + def __init__( + self, + embedding_size=512, + class_num=70722, + m=0.4, + h=0.333, + s=64., + t_alpha=1.0, ): + super(AdaMargin, self).__init__() + self.classnum = class_num + kernel_weight = paddle.uniform( + [embedding_size, class_num], min=-1, max=1) + kernel_weight_norm = paddle.norm( + kernel_weight, p=2, axis=0, keepdim=True) + kernel_weight_norm = paddle.where(kernel_weight_norm > 1e-5, + kernel_weight_norm, + paddle.ones_like(kernel_weight_norm)) + kernel_weight = kernel_weight / kernel_weight_norm + self.kernel = self.create_parameter( + [embedding_size, class_num], + attr=paddle.nn.initializer.Assign(kernel_weight)) + + # initial kernel + # self.kernel.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) + self.m = m + self.eps = 1e-3 + self.h = h + self.s = s + + # ema prep + self.t_alpha = t_alpha + self.register_buffer('t', paddle.zeros([1]), persistable=True) + self.register_buffer( + 'batch_mean', paddle.ones([1]) * 20, persistable=True) + self.register_buffer( + 'batch_std', paddle.ones([1]) * 100, persistable=True) + + def forward(self, embbedings, label): + if not self.training: + return embbedings + + norms = paddle.norm(embbedings, 2, 1, True) + embbedings = paddle.divide(embbedings, norms) + kernel_norm = l2_norm(self.kernel, axis=0) + cosine = paddle.mm(embbedings, kernel_norm) + cosine = paddle.clip(cosine, -1 + self.eps, + 1 - self.eps) # for stability + + safe_norms = paddle.clip(norms, min=0.001, max=100) # for stability + safe_norms = safe_norms.clone().detach() + + # update batchmean batchstd + with paddle.no_grad(): + mean = safe_norms.mean().detach() + std = safe_norms.std().detach() + self.batch_mean = mean * self.t_alpha + (1 - self.t_alpha + ) * self.batch_mean + self.batch_std = std * self.t_alpha + (1 - self.t_alpha + ) * self.batch_std + + margin_scaler = (safe_norms - self.batch_mean) / ( + self.batch_std + self.eps) # 66% between -1, 1 + margin_scaler = margin_scaler * self.h # 68% between -0.333 ,0.333 when h:0.333 + margin_scaler = paddle.clip(margin_scaler, -1, 1) + + # g_angular + m_arc = paddle.nn.functional.one_hot( + label.reshape([-1]), self.classnum) + g_angular = self.m * margin_scaler * -1 + m_arc = m_arc * g_angular + theta = paddle.acos(cosine) + theta_m = paddle.clip( + theta + m_arc, min=self.eps, max=math.pi - self.eps) + cosine = paddle.cos(theta_m) + + # g_additive + m_cos = paddle.nn.functional.one_hot( + label.reshape([-1]), self.classnum) + g_add = self.m + (self.m * margin_scaler) + 
m_cos = m_cos * g_add + cosine = cosine - m_cos + + # scale + scaled_cosine_m = cosine * self.s + return scaled_cosine_m diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/arcmargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/arcmargin.py new file mode 100644 index 000000000..6c72a71a2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/arcmargin.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1801.07698 + +import paddle +import paddle.nn as nn +import math + + +class ArcMargin(nn.Layer): + def __init__(self, + embedding_size, + class_num, + margin=0.5, + scale=80.0, + easy_margin=False): + super().__init__() + self.embedding_size = embedding_size + self.class_num = class_num + self.margin = margin + self.scale = scale + self.easy_margin = easy_margin + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label=None): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6) + cos_m = math.cos(self.margin) + sin_m = math.sin(self.margin) + phi = cos * cos_m - sin * sin_m + + th = math.cos(self.margin) * (-1) + mm = math.sin(self.margin) * self.margin + if self.easy_margin: + phi = self._paddle_where_more_than(cos, 0, phi, cos) + else: + phi = self._paddle_where_more_than(cos, th, phi, cos - mm) + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, phi) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output + + def _paddle_where_more_than(self, target, limit, x, y): + mask = paddle.cast(x=(target > limit), dtype='float32') + output = paddle.multiply(mask, x) + paddle.multiply((1.0 - mask), y) + return output diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/bnneck.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/bnneck.py new file mode 100644 index 000000000..e7abb8921 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/bnneck.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn + +from ..utils import get_param_attr_dict + + +class BNNeck(nn.Layer): + def __init__(self, num_features, **kwargs): + super().__init__() + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + trainable=False) + + if 'weight_attr' in kwargs: + weight_attr = get_param_attr_dict(kwargs['weight_attr']) + + bias_attr = None + if 'bias_attr' in kwargs: + bias_attr = get_param_attr_dict(kwargs['bias_attr']) + + use_global_stats = None + if 'use_global_stats' in kwargs: + use_global_stats = get_param_attr_dict(kwargs['use_global_stats']) + + self.feat_bn = nn.BatchNorm1D( + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=weight_attr, + bias_attr=bias_attr, + use_global_stats=use_global_stats) + + self.flatten = nn.Flatten() + + def forward(self, x): + x = self.flatten(x) + x = self.feat_bn(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/circlemargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/circlemargin.py new file mode 100644 index 000000000..c04d6618b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/circlemargin.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2002.10857 + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class CircleMargin(nn.Layer): + def __init__(self, embedding_size, class_num, margin, scale): + super(CircleMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + feat_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, feat_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + logits = paddle.matmul(input, weight) + if not self.training or label is None: + return logits + + alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.) + alpha_n = paddle.clip(logits.detach() + self.margin, min=0.) 
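+        # circle-loss margins (https://arxiv.org/abs/2002.10857): positives are measured
+        # against delta_p = 1 - m and negatives against delta_n = m, with the detached
+        # alpha_p / alpha_n above acting as the adaptive weights.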
+ delta_p = 1 - self.margin + delta_n = self.margin + + m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1]) + + logits_p = alpha_p * (logits - delta_p) + logits_n = alpha_n * (logits - delta_n) + pre_logits = logits_p * m_hot + logits_n * (1 - m_hot) + pre_logits = self.scale * pre_logits + + return pre_logits diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/cosmargin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/cosmargin.py new file mode 100644 index 000000000..d420c0ace --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/cosmargin.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/1801.09414 + +import paddle +import math +import paddle.nn as nn + + +class CosMargin(paddle.nn.Layer): + def __init__(self, embedding_size, class_num, margin=0.35, scale=64.0): + super(CosMargin, self).__init__() + self.scale = scale + self.margin = margin + self.embedding_size = embedding_size + self.class_num = class_num + + self.weight = self.create_parameter( + shape=[self.embedding_size, self.class_num], + is_bias=False, + default_initializer=paddle.nn.initializer.XavierNormal()) + + def forward(self, input, label): + label.stop_gradient = True + + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + input = paddle.divide(input, input_norm) + + weight_norm = paddle.sqrt( + paddle.sum(paddle.square(self.weight), axis=0, keepdim=True)) + weight = paddle.divide(self.weight, weight_norm) + + cos = paddle.matmul(input, weight) + if not self.training or label is None: + return cos + + cos_m = cos - self.margin + + one_hot = paddle.nn.functional.one_hot(label, self.class_num) + one_hot = paddle.squeeze(one_hot, axis=[1]) + output = paddle.multiply(one_hot, cos_m) + paddle.multiply( + (1.0 - one_hot), cos) + output = output * self.scale + return output diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/fc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/fc.py new file mode 100644 index 000000000..622b0d37d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/fc.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
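+
+# FC is the plain fully-connected gear: a single nn.Linear mapping embeddings
+# to class logits. The `label` argument of forward() exists only for interface
+# compatibility with the margin-based gears (ArcMargin/CosMargin/CircleMargin)
+# and is ignored.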
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + +from ..utils import get_param_attr_dict + + +class FC(nn.Layer): + def __init__(self, embedding_size, class_num, **kwargs): + super(FC, self).__init__() + self.embedding_size = embedding_size + self.class_num = class_num + + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.XavierNormal()) + if 'weight_attr' in kwargs: + weight_attr = get_param_attr_dict(kwargs['weight_attr']) + + bias_attr = None + if 'bias_attr' in kwargs: + bias_attr = get_param_attr_dict(kwargs['bias_attr']) + + self.fc = nn.Linear( + self.embedding_size, + self.class_num, + weight_attr=weight_attr, + bias_attr=bias_attr) + + def forward(self, input, label=None): + out = self.fc(input) + return out diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/frfn_neck.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/frfn_neck.py new file mode 100644 index 000000000..43eadbab4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/frfn_neck.py @@ -0,0 +1,32 @@ +import paddle.nn as nn + + +class Normalize(nn.Layer): + """ Ln normalization copied from + https://github.com/salesforce/CoMatch + """ + + def __init__(self, power=2): + super(Normalize, self).__init__() + self.power = power + + def forward(self, x): + norm = x.pow(self.power).sum(1, keepdim=True).pow(1. / self.power) + out = x.divide(norm) + return out + + +class FRFNNeck(nn.Layer): + def __init__(self, num_features, low_dim, **kwargs): + super(FRFNNeck, self).__init__() + self.l2norm = Normalize(2) + self.fc1 = nn.Linear(num_features, num_features) + self.relu_mlp = nn.LeakyReLU(negative_slope=0.1) + self.fc2 = nn.Linear(num_features, low_dim) + + def forward(self, x): + x = self.fc1(x) + x = self.relu_mlp(x) + x = self.fc2(x) + x = self.l2norm(x) + return x \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/identity_head.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/identity_head.py new file mode 100644 index 000000000..7d11e5742 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/identity_head.py @@ -0,0 +1,9 @@ +from paddle import nn + + +class IdentityHead(nn.Layer): + def __init__(self): + super(IdentityHead, self).__init__() + + def forward(self, x, label=None): + return {"features": x, "logits": None} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/metabnneck.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/metabnneck.py new file mode 100644 index 000000000..a27a7a852 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/metabnneck.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
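+
+# MetaBNNeck flattens its input and applies MetaBN1D, a BatchNorm1D whose
+# behaviour is selected at runtime through setup_opt(): bn_mode "general"
+# normalizes with batch statistics and updates the running statistics,
+# "hold" normalizes with batch statistics without updating them, and "eval"
+# applies the stored running statistics. It is the neck used by the MetaBIN
+# training strategy.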
+ +from __future__ import absolute_import, division, print_function + +from collections import defaultdict +import copy +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ..utils import get_param_attr_dict + + +class MetaBN1D(nn.BatchNorm1D): + def forward(self, inputs, opt={}): + mode = opt.get("bn_mode", "general") if self.training else "eval" + if mode == "general": # update, but not apply running_mean/var + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, self.training, + self._momentum, self._epsilon) + elif mode == "hold": # not update, not apply running_mean/var + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=0), + paddle.var(inputs, axis=0), + self.weight, + self.bias, + self.training, + self._momentum, + self._epsilon) + elif mode == "eval": # fix and apply running_mean/var, + if self._mean is None: + result = F.batch_norm( + inputs, + paddle.mean( + inputs, axis=0), + paddle.var(inputs, axis=0), + self.weight, + self.bias, + True, + self._momentum, + self._epsilon) + else: + result = F.batch_norm(inputs, self._mean, self._variance, + self.weight, self.bias, False, + self._momentum, self._epsilon) + return result + + +class MetaBNNeck(nn.Layer): + def __init__(self, num_features, **kwargs): + super(MetaBNNeck, self).__init__() + weight_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=1.0)) + bias_attr = paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + trainable=False) + + if 'weight_attr' in kwargs: + weight_attr = get_param_attr_dict(kwargs['weight_attr']) + + bias_attr = None + if 'bias_attr' in kwargs: + bias_attr = get_param_attr_dict(kwargs['bias_attr']) + + use_global_stats = None + if 'use_global_stats' in kwargs: + use_global_stats = get_param_attr_dict(kwargs['use_global_stats']) + + self.feat_bn = MetaBN1D( + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=weight_attr, + bias_attr=bias_attr, + use_global_stats=use_global_stats) + self.flatten = nn.Flatten() + self.opt = {} + + def forward(self, x): + x = self.flatten(x) + x = self.feat_bn(x, self.opt) + return x + + def reset_opt(self): + self.opt = defaultdict() + + def setup_opt(self, opt): + """ + Arg: + opt (dict): Optional setting to change the behavior of MetaBIN during training. + It includes three settings which are `enable_inside_update`, `lr_gate` and `bn_mode`. + """ + self.check_opt(opt) + self.opt = copy.deepcopy(opt) + + @classmethod + def check_opt(cls, opt): + assert isinstance(opt, dict), \ + TypeError('Got the wrong type of `opt`. Please use `dict` type.') + + if opt.get('enable_inside_update', False) and 'lr_gate' not in opt: + raise RuntimeError('Missing `lr_gate` in opt.') + + assert isinstance(opt.get('lr_gate', 1.0), float), \ + TypeError('Got the wrong type of `lr_gate`. Please use `float` type.') + assert isinstance(opt.get('enable_inside_update', True), bool), \ + TypeError('Got the wrong type of `enable_inside_update`. Please use `bool` type.') + assert opt.get('bn_mode', "general") in ["general", "hold", "eval"], \ + TypeError('Got the wrong value of `bn_mode`.') diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/ml_decoder.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/ml_decoder.py new file mode 100644 index 000000000..7f24bac2f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/ml_decoder.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.initializer import XavierNormal, Constant, Normal + +xavier_normal_ = XavierNormal() +normal_ = Normal +zero_ = Constant(value=0.0) + + +class MLDecoder(nn.Layer): + """ + ML-Decoder is an attention-based classification head, + which introduced by Tal Ridnik et al. in https://arxiv.org/pdf/2111.12933.pdf. + """ + + def __init__(self, + class_num=80, + in_channels=2048, + query_num=80, + embed_dim=768, + depth=1, + num_heads=8, + mlp_hidden_dim=2048, + dropout=0.1, + activation="relu", + freeze_query_embed=True, + remove_self_attn=True): + super().__init__() + self.class_num = class_num + self.in_channels = in_channels + + # 1 <= query_num <= class_num + query_num = min(max(query_num, 1), class_num) + + self.input_proj = nn.Conv2D( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=1, + stride=1) + + self.query_pos_embed = nn.Embedding( + num_embeddings=query_num, + embedding_dim=embed_dim) + if freeze_query_embed: + self.query_pos_embed.weight.stop_gradient = True + + decoder_layer = nn.TransformerDecoderLayer( + d_model=embed_dim, + nhead=num_heads, + dim_feedforward=mlp_hidden_dim, + dropout=dropout, + activation=activation, + attn_dropout=dropout, + act_dropout=dropout) + if remove_self_attn: + del decoder_layer.self_attn + decoder_layer.self_attn = self.self_attn_identity + self.decoder = nn.TransformerDecoder( + decoder_layer=decoder_layer, + num_layers=depth) + + group_factor = math.ceil(class_num / query_num) + self.group_conv = nn.Conv2D( + in_channels=query_num * embed_dim, + out_channels=query_num * group_factor, + kernel_size=1, + stride=1, + groups=query_num) + + self._init_weights() + + def _init_weights(self): + normal_(self.query_pos_embed.weight) + xavier_normal_(self.group_conv.weight) + zero_(self.group_conv.bias) + + @staticmethod + def self_attn_identity(*args): + return args[0] + + def group_fc_pool(self, x): + x = x.flatten(1)[..., None, None] + x = self.group_conv(x) + x = x.flatten(1)[:, :self.class_num] + return x + + def forward(self, x): + if x.ndim == 2: + assert x.shape[1] % self.in_channels == 0, "Wrong `in_channels` value!!!" + x = x.reshape([x.shape[0], self.in_channels, -1, 1]) + elif x.ndim == 3: + assert x.shape[1] == self.in_channels, "Wrong input shape!!!" + x = x.unsqueeze(-1) + else: + assert x.ndim == 4 and x.shape[1] == self.in_channels, "Wrong input shape!!!" 
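+        # At this point x is NCHW with C == in_channels; project it to
+        # embed_dim, flatten the spatial grid into a token sequence, and let
+        # the learned query embeddings attend over it in the transformer decoder.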
+ + feat_proj = F.relu(self.input_proj(x)) + feat_flatten = feat_proj.flatten(2).transpose([0, 2, 1]) + + query_pos_embed = self.query_pos_embed.weight[None].tile([x.shape[0], 1, 1]) + out_embed = self.decoder(query_pos_embed, feat_flatten) + + logit = self.group_fc_pool(out_embed) + return logit diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/vehicle_neck.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/vehicle_neck.py new file mode 100644 index 000000000..05f4e333f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/gears/vehicle_neck.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function + +import paddle +import paddle.nn as nn + + +class VehicleNeck(nn.Layer): + def __init__(self, + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + super().__init__() + self.conv = nn.Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=weight_attr, + data_format=data_format) + self.flatten = nn.Flatten() + + def forward(self, x): + x = self.conv(x) + x = self.flatten(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/__init__.py new file mode 100644 index 000000000..734bfefad --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .prune import prune_model +from .quant import quantize_model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/prune.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/prune.py new file mode 100644 index 000000000..59cd411a1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/prune.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ...utils import logger + + +def prune_model(config, model): + if config.get("Slim", False) and config["Slim"].get("prune", False): + import paddleslim + prune_method_name = config["Slim"]["prune"]["name"].lower() + assert prune_method_name in [ + "fpgm", "l1_norm" + ], "The prune methods only support 'fpgm' and 'l1_norm'" + if prune_method_name == "fpgm": + model.pruner = paddleslim.dygraph.FPGMFilterPruner( + model, [1] + config["Global"]["image_shape"]) + else: + model.pruner = paddleslim.dygraph.L1NormFilterPruner( + model, [1] + config["Global"]["image_shape"]) + + # prune model + _prune_model(config, model) + else: + model.pruner = None + + +def _prune_model(config, model): + from paddleslim.analysis import dygraph_flops as flops + logger.info("FLOPs before pruning: {}GFLOPs".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9)) + model.eval() + + params = [] + for sublayer in model.sublayers(): + for param in sublayer.parameters(include_sublayers=False): + if isinstance(sublayer, paddle.nn.Conv2D): + params.append(param.name) + ratios = {} + for param in params: + ratios[param] = config["Slim"]["prune"]["pruned_ratio"] + plan = model.pruner.prune_vars(ratios, [0]) + + logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( + flops(model, [1] + config["Global"]["image_shape"]) / 1e9, + plan.pruned_flops)) + + for param in model.parameters(): + if "conv2d" in param.name: + logger.info("{}\t{}".format(param.name, param.shape)) + + model.train() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/quant.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/quant.py new file mode 100644 index 000000000..3e31d9d53 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/slim/quant.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import, division, print_function +import paddle +from ...utils import logger + +QUANT_CONFIG = { + # weight preprocess type, default is None and no preprocessing is performed. + 'weight_preprocess_type': None, + # activation preprocess type, default is None and no preprocessing is performed. 
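+    # (quantize_model() below switches this to 'PACT' when PACT QAT is enabled,
+    #  and resets it to None for the "infer"/"export" modes)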
+ 'activation_preprocess_type': None, + # weight quantize type, default is 'channel_wise_abs_max' + 'weight_quantize_type': 'channel_wise_abs_max', + # activation quantize type, default is 'moving_average_abs_max' + 'activation_quantize_type': 'moving_average_abs_max', + # weight quantize bit num, default is 8 + 'weight_bits': 8, + # activation quantize bit num, default is 8 + 'activation_bits': 8, + # data type after quantization, such as 'uint8', 'int8', etc. default is 'int8' + 'dtype': 'int8', + # window size for 'range_abs_max' quantization. default is 10000 + 'window_size': 10000, + # The decay coefficient of moving average, default is 0.9 + 'moving_rate': 0.9, + # for dygraph quantization, layers of type in quantizable_layer_type will be quantized + 'quantizable_layer_type': ['Conv2D', 'Linear'], +} + + +def quantize_model(config, model, mode="train"): + if config.get("Slim", False) and config["Slim"].get("quant", False): + from paddleslim.dygraph.quant import QAT + assert config["Slim"]["quant"]["name"].lower( + ) == 'pact', 'Only PACT quantization method is supported now' + QUANT_CONFIG["activation_preprocess_type"] = "PACT" + if mode in ["infer", "export"]: + QUANT_CONFIG['activation_preprocess_type'] = None + + # for re-parameterization nets, convert to reparameterized model first + for layer in model.sublayers(): + if hasattr(layer, "re_parameterize"): + layer.re_parameterize() + + model.quanter = QAT(config=QUANT_CONFIG) + model.quanter.quantize(model) + logger.info("QAT model summary:") + paddle.summary(model, (1, 3, 224, 224)) + else: + model.quanter = None + return diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/utils.py new file mode 100644 index 000000000..785b7fbbe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/arch/utils.py @@ -0,0 +1,99 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import types +import paddle +from difflib import SequenceMatcher + +from . 
import backbone +from typing import Any, Dict, Union + + +def get_architectures(): + """ + get all of model architectures + """ + names = [] + for k, v in backbone.__dict__.items(): + if isinstance(v, (types.FunctionType, six.class_types)): + names.append(k) + return names + + +def get_blacklist_model_in_static_mode(): + from ppcls.arch.backbone import distilled_vision_transformer + from ppcls.arch.backbone import vision_transformer + blacklist = distilled_vision_transformer.__all__ + vision_transformer.__all__ + return blacklist + + +def similar_architectures(name='', names=[], thresh=0.1, topk=10): + """ + inferred similar architectures + """ + scores = [] + for idx, n in enumerate(names): + if n.startswith('__'): + continue + score = SequenceMatcher(None, n.lower(), name.lower()).quick_ratio() + if score > thresh: + scores.append((idx, score)) + scores.sort(key=lambda x: x[1], reverse=True) + similar_names = [names[s[0]] for s in scores[:min(topk, len(scores))]] + return similar_names + + +def get_param_attr_dict(ParamAttr_config: Union[None, bool, Dict[str, Dict]] + ) -> Union[None, bool, paddle.ParamAttr]: + """parse ParamAttr from an dict + + Args: + ParamAttr_config (Union[None, bool, Dict[str, Dict]]): ParamAttr configure + + Returns: + Union[None, bool, paddle.ParamAttr]: Generated ParamAttr + """ + if ParamAttr_config is None: + return None + if isinstance(ParamAttr_config, bool): + return ParamAttr_config + ParamAttr_dict = {} + if 'initializer' in ParamAttr_config: + initializer_cfg = ParamAttr_config.get('initializer') + if 'name' in initializer_cfg: + initializer_name = initializer_cfg.pop('name') + ParamAttr_dict['initializer'] = getattr( + paddle.nn.initializer, initializer_name)(**initializer_cfg) + else: + raise ValueError(f"'name' must specified in initializer_cfg") + if 'learning_rate' in ParamAttr_config: + # NOTE: only support an single value now + learning_rate_value = ParamAttr_config.get('learning_rate') + if isinstance(learning_rate_value, (int, float)): + ParamAttr_dict['learning_rate'] = learning_rate_value + else: + raise ValueError( + f"learning_rate_value must be float or int, but got {type(learning_rate_value)}" + ) + if 'regularizer' in ParamAttr_config: + regularizer_cfg = ParamAttr_config.get('regularizer') + if 'name' in regularizer_cfg: + # L1Decay or L2Decay + regularizer_name = regularizer_cfg.pop('name') + ParamAttr_dict['regularizer'] = getattr( + paddle.regularizer, regularizer_name)(**regularizer_cfg) + else: + raise ValueError(f"'name' must specified in regularizer_cfg") + return paddle.ParamAttr(**ParamAttr_dict) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml new file mode 100644 index 000000000..0aa50f88d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_pedestrian_attribute.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 26 + + +# loss function config for traing/eval process +Loss: + Train: + - 
MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pedestrian_attribute/data" + cls_label_path: "dataset/pedestrian_attribute/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.8 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pedestrian_attribute/data" + cls_label_path: "dataset/pedestrian_attribute/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/pedestrian_attribute_label_list.txt + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml new file mode 100644 index 000000000..c9a902562 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/PPLCNet_x1_0_vehicle_attribute.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + class_num: 19 + use_ssld: True + lr_mult_list: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + 
warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/vehicle_attribute_label_list.txt + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/StrongBaselineAttr.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/StrongBaselineAttr.yaml new file mode 100644 index 000000000..2324015d6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Attr/StrongBaselineAttr.yaml @@ -0,0 +1,113 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "ResNet50" + pretrained: True + class_num: 26 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Adam + lr: + name: Piecewise + decay_epochs: [12, 18, 24, 28] + values: [0.0001, 0.00001, 0.000001, 0.0000001] + regularizer: + name: 'L2' + coeff: 0.0005 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/attribute/data/" + cls_label_path: "dataset/attribute/trainval.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + 
size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/attribute/data/" + cls_label_path: "dataset/attribute/test.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_base_patch16_224_finetune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_base_patch16_224_finetune.yaml new file mode 100644 index 000000000..899deab20 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_base_patch16_224_finetune.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 20 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: cae_base_patch16_224 + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.1 + attn_drop_rate: 0.0 + + use_mean_pooling: True + init_scale: 0.001 + use_rel_pos_bias: True + use_abs_pos_emb: False + init_values: 0.1 + lin_probe: False + + sin_pos_emb: True + + enable_linear_eval: False + model_key: model|module|state_dict + model_ema: + enable_model_ema: False + model_ema_decay: 0.9999 + model_ema_force_cpu: False + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - SoftTargetCrossEntropy: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.65 + lr: + name: Cosine + learning_rate: 0.001 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + batch_transform_ops: + - MixupCutmixHybrid: + mixup_alpha: 0.8 + cutmix_alpha: 1.0 + switch_prob: 0.5 + num_classes: 1000 + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + prob: 0.5 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_large_patch16_224_finetune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_large_patch16_224_finetune.yaml new file mode 100644 index 000000000..a7fbe9002 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CAE/cae_large_patch16_224_finetune.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 20 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: cae_large_patch16_224 + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + attn_drop_rate: 0.0 + + use_mean_pooling: True + init_scale: 0.001 + use_rel_pos_bias: True + use_abs_pos_emb: False + init_values: 0.1 + lin_probe: False + + sin_pos_emb: True + + enable_linear_eval: False + model_key: model|module|state_dict + model_ema: + enable_model_ema: False + model_ema_decay: 0.9999 + model_ema_force_cpu: False + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - SoftTargetCrossEntropy: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.75 + lr: + name: Cosine + learning_rate: 0.001 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + batch_transform_ops: + - MixupCutmixHybrid: + mixup_alpha: 0.8 + cutmix_alpha: 1.0 + switch_prob: 0.5 + num_classes: 1000 + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandomResizedCrop: + size: 224 + - RandomHorizontalFlip: + prob: 0.5 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.5, 0.5, 0.5 ] + std: [ 0.5, 0.5, 0.5 ] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml new file mode 100644 index 000000000..aaddd1b15 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_base_patch16_224_finetune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O2 + use_fp16_test: True + +# model architecture +Arch: + name: CLIP_vit_base_patch16_224 + class_num: 1000 + return_embed: False + pretrained: True + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.6 + filter_bias_and_bn: True + lr: + name: Cosine + learning_rate: 0.0003 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil 
+ - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml new file mode 100644 index 000000000..c9c236799 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/CLIP/CLIP_vit_large_patch14_224_finetune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O2 + use_fp16_test: True + +# model architecture +Arch: + name: CLIP_vit_large_patch14_224 + class_num: 1000 + return_embed: False + pretrained: True + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamWDL + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + layerwise_decay: 0.6 + filter_bias_and_bn: True + lr: + name: Cosine + learning_rate: 0.0003 + eta_min: 1e-6 + warmup_epoch: 10 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Cartoonface/ResNet50_icartoon.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Cartoonface/ResNet50_icartoon.yaml new file mode 100644 index 000000000..3d1b99378 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Cartoonface/ResNet50_icartoon.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_mode: "retrieval" + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_shape: [3, 224, 224] + infer_imgs: + save_inference_dir: "./inference" + feature_normalize: True + +Arch: + name: "RecModel" + Backbone: + name: "ResNet50" + pretrained: True + BackboneStopLayer: + name: "flatten" + output_dim: 2048 + Head: + name: "FC" + class_num: 5013 + embedding_size: 2048 + # margin: 0.5 + # scale: 80 + infer_output_key: "features" + infer_add_softmax: "false" + +Loss: + Train: + - CELoss: + weight: 1.0 + # - TripletLoss: + # margin: 0.1 + # weight: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + +DataLoader: + Train: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + #num_instances: 2 + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + Gallery: + dataset: + name: ICartoonDataset + image_root: "./dataset/iCartoonFace" + cls_label_path: "./dataset/iCartoonFace/gallery.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - Recallk: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DCH.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DCH.yaml new file mode 100644 index 000000000..363d7b406 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DCH.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 15 + eval_during_train: True + eval_interval: 15 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + is_rec: True + + Backbone: + name: AlexNet + pretrained: True + class_num: 48 + +# loss function config for train/eval process +Loss: + Train: + - DCHLoss: + weight: 1.0 + gamma: 20.0 + _lambda: 0.1 + n_class: 10 + Eval: + - DCHLoss: + weight: 1.0 + gamma: 20.0 + _lambda: 0.1 + n_class: 10 + +Optimizer: + name: SGD + lr: + name: Piecewise + learning_rate: 0.005 + decay_epochs: [200] + values: [0.005, 0.0005] + regularizer: + name: 'L2' + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - mAP: {} + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DSHSD.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DSHSD.yaml new file mode 100644 index 000000000..e5b45d64a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/DSHSD.yaml @@ -0,0 +1,142 @@ +# global configs 
+Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 15 + eval_during_train: True + eval_interval: 15 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + is_rec: True + + Backbone: + name: AlexNet + pretrained: True + class_num: 48 + Neck: + name: Tanh + Head: + name: FC + class_num: 10 + embedding_size: 48 + +# loss function config for train/eval process +Loss: + Train: + - DSHSDLoss: + weight: 1.0 + alpha: 0.05 + Eval: + - DSHSDLoss: + weight: 1.0 + alpha: 0.05 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + learning_rate: 0.00001 + decay_epochs: [200] + values: [0.00001, 0.000001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - mAP: {} + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/LCDSH.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/LCDSH.yaml new file mode 100644 index 000000000..5a3349d9d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/DeepHash/LCDSH.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 15 + eval_during_train: True + eval_interval: 15 + epochs: 150 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + is_rec: True + + Backbone: + name: AlexNet + pretrained: True + class_num: 48 + +# loss function config for 
train/eval process +Loss: + Train: + - LCDSHLoss: + weight: 1.0 + _lambda: 3 + n_class: 10 + Eval: + - LCDSHLoss: + weight: 1.0 + _lambda: 3 + n_class: 10 + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + name: Piecewise + learning_rate: 0.00001 + decay_epochs: [200] + values: [0.00001, 0.000001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR10/ + cls_label_path: ./dataset/CIFAR10/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - mAP: {} + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_MobileFaceNet.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_MobileFaceNet.yaml new file mode 100644 index 000000000..e0045ac51 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_MobileFaceNet.yaml @@ -0,0 +1,128 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: output/MobileFaceNet + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 25 + print_batch_step: 20 + use_visualdl: False + eval_mode: face_recognition + retrieval_feature_from: backbone + flip_test: True + feature_normalize: False + re_ranking: False + use_dali: False + # used for static mode and model export + image_shape: [3, 112, 112] + save_inference_dir: ./inference + +AMP: + scale_loss: 27648 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: MobileFaceNet + Head: + name: ArcMargin + embedding_size: 128 + class_num: 93431 + margin: 0.5 + scale: 64 +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 1024 + eta_min: 1e-6 + warmup_epoch: 1 + warmup_start_lr: 0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + 
image_root: dataset/MS1M_v3/ + cls_label_path: dataset/MS1M_v3/label.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: cv2 + - RandFlipImage: + flip_code: 1 + - ResizeImage: + size: [112, 112] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: FiveFaceEvalDataset + val_data_path: dataset/MS1M_v3/ + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: cv2 + - ResizeImage: + size: [112, 112] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - FaceAccOnFiveDatasets: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_ResNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_ResNet50.yaml new file mode 100644 index 000000000..fcfd0cd87 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Face_Recognition/FaceRecognition_ArcFace_ResNet50.yaml @@ -0,0 +1,131 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: output/ResNet50_face + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 25 + print_batch_step: 20 + use_visualdl: False + eval_mode: face_recognition + retrieval_feature_from: backbone + flip_test: True + feature_normalize: False + re_ranking: False + use_dali: False + # used for static mode and model export + image_shape: [3, 112, 112] + save_inference_dir: ./inference + +AMP: + scale_loss: 27648.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50 + max_pool: False + stride_list: [1, 2, 2, 2, 2] + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 93431 + margin: 0.5 + scale: 64 +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 1024 + eta_min: 1e-6 + warmup_epoch: 1 + warmup_start_lr: 0 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: dataset/MS1M_v3/ + cls_label_path: dataset/MS1M_v3/label.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: cv2 + - RandFlipImage: + flip_code: 1 + - ResizeImage: + size: [112, 112] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: FiveFaceEvalDataset + val_data_path: 
dataset/MS1M_v3/ + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: cv2 + - ResizeImage: + size: [112, 112] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - BestAccOnFiveDatasets: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml new file mode 100644 index 000000000..fbaefdcf5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/Gallery2FC_PPLCNet_x2_5.yaml @@ -0,0 +1,51 @@ +# global configs +Global: + pretrained_model: ./pretrained/general_PPLCNet_x2_5_pretrained_v1.0_quant + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference/general_PPLCNet_x2_5_quant/inference + +# for quantizaiton or prune model +Slim: + ## for prune + quant: + name: pact + +# model architecture +Arch: + name: RecModel + + Backbone: + name: PPLCNet_x2_5 + pretrained: False + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# indexing engine config +IndexProcess: + image_root: "./drink_dataset_v1.0/gallery/" + data_file: "./drink_dataset_v1.0/gallery/drink_label.txt" + delimiter: "\t" + batch_size: 2 + transform_ops: + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [ 0.485, 0.456, 0.406 ] + std: [ 0.229, 0.224, 0.225 ] + order: '' + - ToCHWImage: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml new file mode 100644 index 000000000..70daa639b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: "L2" + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ 
+ cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml new file mode 100644 index 000000000..728942fe3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_binary.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + + #feature postprocess + feature_normalize: False + feature_binarize: "sign" + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5_Tanh + pretrained: True + use_ssld: True + class_num: 512 + Head: + name: FC + embedding_size: 512 + class_num: 185341 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/all_data + cls_label_path: ./dataset/all_data/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: 
True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml new file mode 100644 index 000000000..b6c45363b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_dml.yaml @@ -0,0 +1,188 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: 
./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml new file mode 100644 index 000000000..bcaea03b8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognition/GeneralRecognition_PPLCNet_x2_5_udml.yaml @@ -0,0 +1,193 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: true + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# model architecture +Arch: + name: "DistillationModel" + infer_output_key: features + infer_add_softmax: False + is_rec: True + infer_model_name: "Student" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + models: + - Teacher: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + - Student: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: PPLCNet_x2_5 + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: "logits" + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + 
key: "logits" + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "backbone" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml new file mode 100644 index 000000000..912c3cd39 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_base.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: CLIP_vit_base_patch16_224 + pretrained: True + return_embed: True + return_mean_embed: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 512 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 192613 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - 
CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 1 + regularizer: + name: "L2" + coeff: 0.00002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data_v2.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - RandomRotation: + prob: 0.3 + degrees: 90 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct/ + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml new file mode 100644 index 000000000..f7e33b3f8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_CLIP_vit_large.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: CLIP_vit_large_patch14_224 + pretrained: True + return_embed: True + return_mean_embed: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 512 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 192613 + margin: 0.2 + scale: 30 + +# loss function 
config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0025 + warmup_epoch: 1 + regularizer: + name: "L2" + coeff: 0.00002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data_v2.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - RandomRotation: + prob: 0.3 + degrees: 90 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/ppshitu_traindata/Aliproduct/ + cls_label_path: ./dataset/ppshitu_traindata/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml new file mode 100644 index 000000000..61eadb48a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/GeneralRecognitionV2/GeneralRecognitionV2_PPLCNetV2_base.yaml @@ -0,0 +1,209 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 20 + use_visualdl: False + eval_mode: retrieval + retrieval_feature_from: features # 'backbone' or 'features' + re_ranking: False + use_dali: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNetV2_base_ShiTu + pretrained: True + use_ssld: True + class_expand: &feat_dim 512 + BackboneStopLayer: + name: flatten + Neck: + name: BNNeck + 
num_features: *feat_dim + weight_attr: + initializer: + name: Constant + value: 1.0 + bias_attr: + initializer: + name: Constant + value: 0.0 + learning_rate: 1.0e-20 # NOTE: Temporarily set lr small enough to freeze the bias to zero + Head: + name: FC + embedding_size: *feat_dim + class_num: 192612 + weight_attr: + initializer: + name: Normal + std: 0.001 + bias_attr: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + - TripletAngularMarginLoss: + weight: 1.0 + feature_from: features + margin: 0.5 + reduction: mean + add_absolute: True + absolute_loss_weight: 0.1 + normalize_feature: True + ap_value: 0.8 + an_value: 0.4 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.06 # for 8gpu x 256bs + warmup_epoch: 5 + regularizer: + name: L2 + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data_v2.txt + relabel: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + backend: cv2 + - RandCropImageV2: + size: [224, 224] + - RandomRotation: + prob: 0.5 + degrees: 90 + interpolation: bilinear + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: hwc + sampler: + name: PKSampler + batch_size: 256 + sample_per_id: 4 + drop_last: False + shuffle: True + sample_method: "id_avg_prob" + id_list: [50030, 80700, 92019, 96015] # be careful when set relabel=True + ratio: [4, 4] + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + return_numpy: False + interpolation: bilinear + backend: cv2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSPNet/CSPDarkNet53.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSPNet/CSPDarkNet53.yaml new file mode 100644 index 000000000..29ca02e25 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSPNet/CSPDarkNet53.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 
null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSPDarkNet53 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml new file mode 100644 index 000000000..5f116f11a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_224.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + 
level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_base_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2.5e-4 + eta_min: 2.5e-6 + warmup_epoch: 20 + warmup_start_lr: 2.5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml new file mode 100644 index 000000000..d845d8f4d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_base_384.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + 
use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_base_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1.25e-4 + eta_min: 1.25e-6 + warmup_epoch: 20 + warmup_start_lr: 1.25e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml new file mode 100644 index 000000000..9cadcc901 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_224.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# 
mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_large_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2.5e-4 + eta_min: 2.5e-6 + warmup_epoch: 20 + warmup_start_lr: 2.5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml new file mode 100644 index 000000000..1e01bb0bb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_large_384.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 384, 384] + 
save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_large_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 6.25e-5 + eta_min: 6.25e-7 + warmup_epoch: 20 + warmup_start_lr: 6.25e-8 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 8 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml new file mode 100644 index 000000000..2c182bb53 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_small_224.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: 
False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_small_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 5e-4 + eta_min: 5e-6 + warmup_epoch: 20 + warmup_start_lr: 5e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml new file mode 100644 index 000000000..fa8986f2c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CSWinTransformer/CSWinTransformer_tiny_224.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + 
eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CSWinTransformer_tiny_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml new file mode 100644 index 000000000..591afe390 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_224.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_base_224 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml new file mode 100644 index 000000000..0adec4be5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_base_384.yaml @@ -0,0 
+1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_base_384 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml new 
file mode 100644 index 000000000..6f5b23e10 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_224.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_large_224 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml new file mode 100644 index 000000000..63a4aa1a0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_large_384.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_large_384 + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + 
PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml new file mode 100644 index 000000000..d6c0551df --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_small.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_small + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil 
+ - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml new file mode 100644 index 000000000..4d705857b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ConvNeXt/ConvNeXt_tiny.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 4 # for 8 cards + +# model ema +EMA: + decay: 0.9999 + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ConvNeXt_tiny + class_num: 1000 + drop_path_rate: 0.1 + layer_scale_init_value: 1e-6 + head_init_scale: 1.0 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 4e-3 # lr 4e-3 for total_batch_size 4096 + eta_min: 1e-6 + warmup_epoch: 20 + warmup_start_lr: 0 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - 
DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_224.yaml new file mode 100644 index 000000000..b211c0cd1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_224.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_13_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 # lr 2e-3 for total_batch_size 2048 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_384.yaml new file mode 100644 index 000000000..14e2b9d9e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_13_384.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_13_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 # lr 2e-3 for total_batch_size 2048 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 
+ use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_224.yaml new file mode 100644 index 000000000..8274a582e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_224.yaml @@ -0,0 +1,154 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_21_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 # lr 1e-3 for total_batch_size 1024 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: RASampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_384.yaml new file mode 100644 index 000000000..4aa2e27ca --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_21_384.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_21_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 # lr 1e-3 for total_batch_size 1024 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + 
backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_W24_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_W24_384.yaml new file mode 100644 index 000000000..18b6d7ff6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/CvT/CvT_W24_384.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + update_freq: 2 # for 8 cards + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: CvT_W24_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: pos_embed cls_token .bias + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 # lr 1e-3 for total_batch_size 1024 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + by_epoch: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + size: 384 + 
interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102.yaml new file mode 100644 index 000000000..c87635763 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA102 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x.yaml new file mode 100644 index 000000000..580c8ce4c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x.yaml 
@@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA102x + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x2.yaml new file mode 100644 index 000000000..0691a2afc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA102x2.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA102x2 + class_num: 1000 + +# loss function 
config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA169.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA169.yaml new file mode 100644 index 000000000..7731d361b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA169.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA169 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA34.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA34.yaml new file mode 100644 index 000000000..555716033 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA34.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA34 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + 
name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46_c.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46_c.yaml new file mode 100644 index 000000000..1fef5b861 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46_c.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA46_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - 
TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46x_c.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46x_c.yaml new file mode 100644 index 000000000..a88a940d2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA46x_c.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA46x_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60.yaml new file mode 100644 index 000000000..0a82f7d2c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and 
model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA60 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x.yaml new file mode 100644 index 000000000..19dc8ef32 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA60x + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' 
+ coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x_c.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x_c.yaml new file mode 100644 index 000000000..ebf247840 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DLA/DLA60x_c.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DLA60x_c + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: 
ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN107.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN107.yaml new file mode 100644 index 000000000..a18c341f6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN107.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN107 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + 
channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN131.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN131.yaml new file mode 100644 index 000000000..68e9479ff --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN131.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN131 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN68.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN68.yaml new file mode 100644 index 000000000..33a0e416e --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN68.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN68 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN92.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN92.yaml new file mode 100644 index 000000000..583079845 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN92.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN92 
+ class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN98.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN98.yaml new file mode 100644 index 000000000..f3bb99423 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DPN/DPN98.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DPN98 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 
0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_base.yaml new file mode 100644 index 000000000..7d4ffcb10 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_base.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DSNet_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + 
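      # A short note on the batch_transform_ops block above (a sketch of the
      # expected behaviour, inferred from the keys in this config rather than
      # from the OpSampler source): OpSampler is expected to apply at most one
      # of the listed batch-level operators per batch, chosen according to
      # prob. With MixupOperator and CutmixOperator both at prob: 0.5, every
      # training batch should receive either Mixup or Cutmix, never both;
      # verify against the OpSampler implementation bundled with this
      # ppcls_2.6 copy.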
sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_small.yaml new file mode 100644 index 000000000..e006104c9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_small.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DSNet_small + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: 
DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_tiny.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_tiny.yaml new file mode 100644 index 000000000..884620582 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DSNet/DSNet_tiny.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DSNet_tiny + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 
128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DarkNet/DarkNet53.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DarkNet/DarkNet53.yaml new file mode 100644 index 000000000..ccf05adaa --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DarkNet/DarkNet53.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DarkNet53 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + 
loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml new file mode 100644 index 000000000..127cf91e8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_AutoAugment.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml new file mode 100644 index 000000000..542ce15f8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Baseline.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml new file mode 100644 index 000000000..21ec5f88b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutmix.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and 
model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - CutmixOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml new file mode 100644 index 000000000..5f9286b82 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Cutout.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval 
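# The Train pipeline below follows the ResNet50 baseline recipe and adds a
# single Cutout transform (n_holes: 1, length: 112): one randomly placed
# 112x112 square, i.e. a quarter of the 224x224 training crop, is zeroed out
# in each image. The Eval pipeline is the same as in the baseline config.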
+DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - Cutout: + n_holes: 1 + length: 112 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml new file mode 100644 index 000000000..8e14546a3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_GridMask.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - GridMask: + d1: 96 + d2: 224 + rotate: 1 + ratio: 0.5 + mode: 0 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + 
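  # The GridMask transform in the Train pipeline above masks the image with a
  # regular grid: the grid cell size is sampled between d1 (96) and d2 (224),
  # ratio (0.5) controls how much of each cell is kept, rotate (1) allows the
  # grid to be randomly rotated, and mode (0) selects the standard masking
  # variant. This reading follows the usual GridMask formulation; confirm the
  # exact parameter semantics against the operator shipped in this tree.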
Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml new file mode 100644 index 000000000..b8bdeba2c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_HideAndSeek.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - HideAndSeek: + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - 
DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml new file mode 100644 index 000000000..176acaf3e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_Mixup.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml new file mode 100644 index 000000000..c1f8c14f6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandAugment.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - RandAugment: + num_layers: 2 + magnitude: 5 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml new file mode 100644 index 000000000..1788e529d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DataAugment/ResNet50_RandomErasing.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 
224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml new file mode 100644 index 000000000..39087d340 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 
+ beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml new file mode 100644 index 000000000..bf0ac2b7a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_distilled_patch16_384.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_distilled_patch16_384 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 
+ +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml new file mode 100644 index 000000000..cf3f1dc5e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 
+ +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml new file mode 100644 index 000000000..8f4a9a9d2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_base_patch16_384.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_base_patch16_384 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 
+ +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml new file mode 100644 index 000000000..0db9532e3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_distilled_patch16_224.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_small_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + 
epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml new file mode 100644 index 000000000..5e91973b1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_small_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_small_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 
0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml new file mode 100644 index 000000000..3068ada56 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_distilled_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_tiny_distilled_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + 
Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml new file mode 100644 index 000000000..3cd8cd06c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DeiT/DeiT_tiny_patch16_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DeiT_tiny_patch16_224 + drop_path_rate : 0.1 + drop_rate : 0.0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + 
Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token pos_embed dist_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 2e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet121.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet121.yaml new file mode 100644 index 000000000..13e57d675 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet121.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet121 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + 
+ +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet161.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet161.yaml new file mode 100644 index 000000000..a3fa6b1cf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet161.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet161 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 
0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet169.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet169.yaml new file mode 100644 index 000000000..5eb27d4dd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet169.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet169 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + 
batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet201.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet201.yaml new file mode 100644 index 000000000..6b7aad5c1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet201.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet201 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - 
TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet264.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet264.yaml new file mode 100644 index 000000000..046e3a83c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/DenseNet/DenseNet264.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: DenseNet264 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml new file mode 100644 index 000000000..a9f28674e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x1_0_ssld.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_r50_vd_distill + device: gpu + save_interval: 1 + eval_during_train: True + 
eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: True + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet50_vd + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: False + dropout_prob: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 # for bs 1024 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml new file mode 100644 index 000000000..4fb768098 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_dml.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_lcnet_x2_5_dml + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + 
use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + infer_model_name: "Student" + models: + - Teacher: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + - Student: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml new file mode 100644 index 000000000..e778cb8d8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_ssld.yaml @@ -0,0 +1,160 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_r50_vd_distill + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode 
and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: True + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet50_vd + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml new file mode 100644 index 000000000..17e020e9a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/PPLCNet_x2_5_udml.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_lcnet_x2_5_udml + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +AMP: 
+ use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + infer_model_name: "Student" + models: + - Teacher: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + - Student: + name: PPLCNet_x2_5 + class_num: *class_num + pretrained: False + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: logits + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: logits + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "blocks5" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml new file mode 100644 index 000000000..7cc99b64a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/mv3_large_x1_0_distill_mv3_small_x1_0.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + 
output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: MobileNetV3_large_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + dropout_prob: null + - Student: + name: MobileNetV3_small_x1_0 + class_num: *class_num + pretrained: False + dropout_prob: null + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml new file mode 100644 index 000000000..acc0aa380 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/res2net200_vd_distill_pphgnet_base.yaml @@ -0,0 +1,171 @@ +# global configs 
+Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: Res2Net200_vd_26w_4s + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPHGNet_base + class_num: *class_num + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml new file mode 100644 index 000000000..6816cd257 --- 
/dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_afd.yaml @@ -0,0 +1,211 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + models: + - Teacher: + name: AttentionModel + pretrained_list: + freeze_params_list: + - True + - False + models: + - ResNet34: + name: ResNet34 + pretrained: True + return_patterns: &t_keys ["blocks[0]", "blocks[1]", "blocks[2]", "blocks[3]", + "blocks[4]", "blocks[5]", "blocks[6]", "blocks[7]", + "blocks[8]", "blocks[9]", "blocks[10]", "blocks[11]", + "blocks[12]", "blocks[13]", "blocks[14]", "blocks[15]"] + - LinearTransformTeacher: + name: LinearTransformTeacher + qk_dim: 128 + keys: *t_keys + t_shapes: &t_shapes [[64, 56, 56], [64, 56, 56], [64, 56, 56], [128, 28, 28], + [128, 28, 28], [128, 28, 28], [128, 28, 28], [256, 14, 14], + [256, 14, 14], [256, 14, 14], [256, 14, 14], [256, 14, 14], + [256, 14, 14], [512, 7, 7], [512, 7, 7], [512, 7, 7]] + + - Student: + name: AttentionModel + pretrained_list: + freeze_params_list: + - False + - False + models: + - ResNet18: + name: ResNet18 + pretrained: False + return_patterns: &s_keys ["blocks[0]", "blocks[1]", "blocks[2]", "blocks[3]", + "blocks[4]", "blocks[5]", "blocks[6]", "blocks[7]"] + - LinearTransformStudent: + name: LinearTransformStudent + qk_dim: 128 + keys: *s_keys + s_shapes: &s_shapes [[64, 56, 56], [64, 56, 56], [128, 28, 28], [128, 28, 28], + [256, 14, 14], [256, 14, 14], [512, 7, 7], [512, 7, 7]] + t_shapes: *t_shapes + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + key: logits + - DistillationKLDivLoss: + weight: 0.9 + model_name_pairs: [["Student", "Teacher"]] + temperature: 4 + key: logits + - AFDLoss: + weight: 50.0 + model_name_pair: ["Student", "Teacher"] + student_keys: ["bilinear_key", "value"] + teacher_keys: ["query", "value"] + s_shapes: *s_shapes + t_shapes: *t_shapes + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.1 + milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + 
name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml new file mode 100644 index 000000000..a689f6172 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dist.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_dist + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet34 + class_num: *class_num + pretrained: True + - Student: + name: ResNet18 + class_num: *class_num + pretrained: False + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationDISTLoss: + weight: 2.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + 
batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml new file mode 100644 index 000000000..19c684535 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_dkd.yaml @@ -0,0 +1,166 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet34 + pretrained: True + + - Student: + name: ResNet18 + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationDKDLoss: + weight: 1.0 + model_name_pairs: [["Student", "Teacher"]] + temperature: 1 + alpha: 1.0 + beta: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.2 + milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 
128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml new file mode 100644 index 000000000..d501f3dd5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_mgd.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_mgd + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet34 + class_num: *class_num + pretrained: True + return_patterns: &t_stages ["blocks[2]", "blocks[6]", "blocks[12]", "blocks[15]"] + - Student: + name: ResNet18 + class_num: *class_num + pretrained: False + return_patterns: &s_stages ["blocks[1]", "blocks[3]", "blocks[5]", "blocks[7]"] + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationPairLoss: + weight: 1.0 + base_loss_name: MGDLoss + model_name_pairs: [["Student", "Teacher"]] + s_key: "blocks[7]" + t_key: "blocks[15]" + name: "loss_mgd" + student_channels: 512 + teacher_channels: 512 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml new file mode 100644 index 000000000..1c87f03ec --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_pefd.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_pefd + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + infer_model_name: "Student" + models: + - Teacher: + name: ResNet34 + class_num: *class_num + pretrained: True + return_patterns: &t_stages ["avg_pool"] + - Student: + name: ResNet18 + class_num: *class_num + pretrained: False + return_patterns: &s_stages ["avg_pool"] + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationPairLoss: + weight: 25.0 + base_loss_name: PEFDLoss + model_name_pairs: [["Student", "Teacher"]] + s_key: "avg_pool" + t_key: "avg_pool" + name: "loss_pefd" + student_channel: 512 + teacher_channel: 512 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + 
weight_decay: 1e-4 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml new file mode 100644 index 000000000..d1100a1b7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_skd.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet34 + pretrained: True + + - Student: + name: ResNet18 + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationSKDLoss: + weight: 1.0 + model_name_pairs: [["Student", "Teacher"]] + temperature: 1.0 + multiplier: 2.0 + alpha: 0.9 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.1 + 
milestones: [30, 60, 90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml new file mode 100644 index 000000000..adabc5ae0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Distillation/resnet34_distill_resnet18_wsl.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/r34_r18_wsl + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet34 + pretrained: True + + - Student: + name: ResNet18 + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + - DistillationWSLLoss: + weight: 2.5 + model_name_pairs: [["Student", "Teacher"]] + temperature: 2 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + weight_decay: 1e-4 + lr: + name: MultiStepDecay + learning_rate: 0.1 + milestones: [30, 60, 
90] + step_each_epoch: 1 + gamma: 0.1 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_25.yaml new file mode 100644 index 000000000..a1ae01f2e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_25.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + 
loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_5.yaml new file mode 100644 index 000000000..bf806d9cd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + 
batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_75.yaml new file mode 100644 index 000000000..566560795 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x0_75.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x1_0.yaml new 
file mode 100644 index 000000000..e0b32d5c5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ESNet/ESNet_x1_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ESNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB0.yaml new file mode 100644 index 000000000..8f8ce1825 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB0.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: 
True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB1.yaml new file mode 100644 index 000000000..33cdcff94 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB1.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 240, 240] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + 
image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 240 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 272 + - CropImage: + size: 240 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 272 + - CropImage: + size: 240 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB2.yaml new file mode 100644 index 000000000..3d2e15f85 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB2.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 260, 260] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 260 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: 
./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 260 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 260 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB3.yaml new file mode 100644 index 000000000..4dd71da17 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB3.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 300, 300] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 300 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 332 + - CropImage: + size: 300 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + 
- DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 332 + - CropImage: + size: 300 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB4.yaml new file mode 100644 index 000000000..a123c1267 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB4.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 380, 380] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 380 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 412 + - CropImage: + size: 380 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 412 + - CropImage: + size: 380 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB5.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB5.yaml new file mode 100644 index 000000000..8c163d64f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB5.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 456, 456] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 456 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 488 + - CropImage: + size: 456 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 488 + - CropImage: + size: 456 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB6.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB6.yaml new file mode 100644 index 000000000..9897673a3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB6.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + 
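Note on the Loss blocks above: every Train section configures CELoss with epsilon: 0.1, i.e. label smoothing, while Eval uses plain cross entropy. As a reference, here is a minimal NumPy sketch of the usual label-smoothing formulation that the epsilon field refers to; this is an assumption about the math only, the actual implementation lives in ppcls/loss and may differ in details.

import numpy as np

def smoothed_ce(logits, labels, epsilon=0.1):
    """Cross entropy against targets smoothed as (1 - eps) * one_hot + eps / C."""
    n, c = logits.shape
    z = logits - logits.max(axis=1, keepdims=True)            # stable log-softmax
    log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    one_hot = np.eye(c)[labels]
    targets = (1.0 - epsilon) * one_hot + epsilon / c
    return float(-(targets * log_probs).sum(axis=1).mean())

logits = np.random.randn(4, 1000)            # class_num: 1000 as in these configs
labels = np.array([1, 20, 300, 999])
print(smoothed_ce(logits, labels, epsilon=0.1))

With epsilon set to 0.0 this reduces to ordinary cross entropy, which is what the Eval CELoss entries use.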
image_shape: [3, 528, 528] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB6 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 528 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 560 + - CropImage: + size: 528 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 560 + - CropImage: + size: 528 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB7.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB7.yaml new file mode 100644 index 000000000..8f02cd278 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNet/EfficientNetB7.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 600, 600] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: EfficientNetB7 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + lr: + name: Cosine + 
learning_rate: 0.032 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 600 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 632 + - CropImage: + size: 600 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 632 + - CropImage: + size: 600 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml new file mode 100644 index 000000000..48257bc13 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/EfficientNetV2/EfficientNetV2_S.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 350 + print_batch_step: 20 + use_visualdl: False + train_mode: progressive # progressive training + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +EMA: + decay: 0.9999 + +# model architecture +Arch: + name: EfficientNetV2_S + class_num: 1000 + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 # 8gpux128bs + warmup_epoch: 5 + regularizer: + name: L2 + coeff: 0.00001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 171 + progress_size: [171, 214, 257, 300] + scale: [0.05, 1.0] + - RandFlipImage: + flip_code: 1 + - RandAugmentV2: + num_layers: 2 + 
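Across the EfficientNetB2 through B7 configs above, the eval and infer pipelines follow one fixed pattern: ResizeImage.resize_short is the training crop size plus 32, followed by a center CropImage at the training resolution (292/260, 332/300, 412/380, 488/456, 560/528, 632/600). A small sketch that reproduces those numbers; the +32 rule is inferred from the values in these files, not stated anywhere in them.

# Training crop sizes used by the EfficientNet configs in this patch.
TRAIN_CROP = {"B2": 260, "B3": 300, "B4": 380, "B5": 456, "B6": 528, "B7": 600}

def eval_transform_sizes(crop_size):
    """Eval-time ResizeImage/CropImage sizes implied by the configs."""
    return {"resize_short": crop_size + 32, "crop_size": crop_size}

for name, crop in TRAIN_CROP.items():
    # e.g. B4 -> resize_short=412, crop_size=380, matching EfficientNetB4.yaml
    print(name, eval_transform_sizes(crop))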
magnitude: 5.0 + progress_magnitude: [5.0, 8.3333333333, 11.66666666667, 15.0] + - NormalizeImage: + scale: 1.0 + mean: [128.0, 128.0, 128.0] + std: [128.0, 128.0, 128.0] + order: "" + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - CropImageAtRatio: + size: 384 + pad: 32 + interpolation: bilinear + - NormalizeImage: + scale: 1.0 + mean: [128.0, 128.0, 128.0] + std: [128.0, 128.0, 128.0] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - CropImageAtRatio: + size: 384 + pad: 32 + interpolation: bilinear + - NormalizeImage: + scale: 1.0 + mean: [128.0, 128.0, 128.0] + std: [128.0, 128.0, 128.0] + order: "" + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_L.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_L.yaml new file mode 100644 index 000000000..e8182839e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_L.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_L + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 0.01 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.7 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + 
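The EfficientNetV2_S config sets train_mode: progressive and lists four image sizes (progress_size) alongside four RandAugmentV2 magnitudes (progress_magnitude). A rough sketch of how such a schedule could map an epoch to a (size, magnitude) pair, assuming the 350 epochs are split evenly into four stages; the exact stage boundaries used by ppcls are not spelled out in this file.

PROGRESS_SIZE = [171, 214, 257, 300]
PROGRESS_MAGNITUDE = [5.0, 8.3333333333, 11.66666666667, 15.0]
EPOCHS = 350  # Global.epochs in EfficientNetV2_S.yaml

def stage_settings(epoch, epochs=EPOCHS):
    """Pick the progressive-training stage for a 0-based epoch index."""
    n_stages = len(PROGRESS_SIZE)
    stage = min(epoch * n_stages // epochs, n_stages - 1)
    return PROGRESS_SIZE[stage], PROGRESS_MAGNITUDE[stage]

# Early epochs train small and weakly augmented, late epochs large and strongly augmented.
assert stage_settings(0) == (171, 5.0)
assert stage_settings(349) == (300, 15.0)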
num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_M.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_M.yaml new file mode 100644 index 000000000..a08f6005e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_M.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_M + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.5 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + 
to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_S.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_S.yaml new file mode 100644 index 000000000..5d43a911a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_S.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_S + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.03 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.3 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: ##RandomResized + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] 
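The FasterNet configs apply batch-level augmentation through OpSampler, which draws one of MixupOperator (prob 0.5) or CutmixOperator (prob 0.5) for each batch. A small sketch of that sampling step, assuming the probabilities are interpreted per batch and any leftover mass means "no batch op" (with 0.5 + 0.5 there is none left here).

import random

def sample_batch_op(ops):
    """ops maps an operator name to its sampling probability, as in OpSampler."""
    names = list(ops)
    probs = [ops[n] for n in names]
    leftover = 1.0 - sum(probs)          # probability of applying no batch op
    names.append(None)
    probs.append(max(leftover, 0.0))
    return random.choices(names, weights=probs, k=1)[0]

# Per FasterNet_L.yaml: MixupOperator prob 0.5, CutmixOperator prob 0.5.
op = sample_batch_op({"MixupOperator": 0.5, "CutmixOperator": 0.5})
print("batch op for this step:", op)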
+ std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T0.yaml new file mode 100644 index 000000000..6c3c0d66e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T0.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_T0 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.005 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: null + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.05 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T1.yaml new file mode 100644 index 000000000..436f61eca --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T1.yaml @@ -0,0 +1,163 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_T1 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m3-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.05 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + 
scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T2.yaml new file mode 100644 index 000000000..eba5d0d49 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/FasterNet/FasterNet_T2.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: False + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: FasterNet_T2 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.02 + clip_grad: null + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.004 + warmup_start_lr: 0.000001 + warmup_epoch: 20 + eta_min: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m5-mstd0.5-inc1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.1 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + to_np: False + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 248 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 1 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + 
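All FasterNet variants share the same AdamW learning-rate schedule: 20 warmup epochs from warmup_start_lr up to the base learning_rate, then cosine decay down to eta_min over the remaining epochs. A sketch of the per-epoch value under the common warmup-plus-cosine formulation; PaddleClas computes this per step and its exact boundary handling may differ.

import math

def lr_at_epoch(epoch, base_lr=0.004, warmup_start_lr=1e-6,
                warmup_epochs=20, eta_min=1e-5, total_epochs=300):
    """Linear warmup followed by cosine decay, per the FasterNet_* yaml fields."""
    if epoch < warmup_epochs:
        # Linear ramp from warmup_start_lr to base_lr.
        return warmup_start_lr + (base_lr - warmup_start_lr) * epoch / warmup_epochs
    # Cosine decay from base_lr down to eta_min over the remaining epochs.
    t = (epoch - warmup_epochs) / max(total_epochs - warmup_epochs, 1)
    return eta_min + 0.5 * (base_lr - eta_min) * (1 + math.cos(math.pi * t))

print(lr_at_epoch(0), lr_at_epoch(20), lr_at_epoch(299))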
- TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml new file mode 100644 index 000000000..fe56eb53d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GhostNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml new file mode 100644 index 000000000..e063ec2b7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_0.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 
+ print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GhostNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml new file mode 100644 index 000000000..40572d4ab --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/GhostNet/GhostNet_x1_3.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GhostNet_x1_3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + 
learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet39_ds.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet39_ds.yaml new file mode 100644 index 000000000..d15ba0c75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet39_ds.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet39_ds + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: 
True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68.yaml new file mode 100644 index 000000000..f0d8d8e26 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet68 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
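Every Infer section in these files ends with the Topk post-process (topk: 5) plus a class_id_map_file. A NumPy sketch of that step, assuming the map file uses a "<class_id> <label>" one-entry-per-line layout; that layout is an assumption here, the authoritative format is whatever ppcls/utils/imagenet1k_label_list.txt contains.

import numpy as np

def topk(probs, k=5, id_to_label=None):
    """Return the k best (class_id, score[, label_name]) entries for one sample."""
    idx = np.argsort(probs)[::-1][:k]
    out = []
    for i in idx:
        entry = {"class_id": int(i), "score": float(probs[i])}
        if id_to_label is not None:
            entry["label_name"] = id_to_label.get(int(i), "")
        out.append(entry)
    return out

probs = np.random.dirichlet(np.ones(1000))   # stand-in for one softmax output
print(topk(probs, k=5))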
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68_ds.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68_ds.yaml new file mode 100644 index 000000000..dc003a88f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet68_ds.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet68_ds + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet85.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet85.yaml new file mode 100644 index 000000000..f69bc650c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/HarDNet/HarDNet85.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: HarDNet85 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/GoogLeNet.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/GoogLeNet.yaml new file mode 100644 index 000000000..6709433f9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/GoogLeNet.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# 
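Unlike the cosine schedules above, the HarDNet configs (and LeViT_128 further down) use a Piecewise schedule: decay_epochs [30, 60, 90] with values [0.1, 0.01, 0.001, 0.0001] over 120 epochs. A one-function sketch of that step decay; the convention that the boundary epoch already takes the lower value is assumed.

import bisect

def piecewise_lr(epoch, decay_epochs=(30, 60, 90),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    """values[i] applies until decay_epochs[i]; the last value applies afterwards."""
    return values[bisect.bisect_right(decay_epochs, epoch)]

assert piecewise_lr(0) == 0.1
assert piecewise_lr(30) == 0.01    # boundary epochs step down (assumed convention)
assert piecewise_lr(119) == 0.0001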
mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: GoogLeNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - GoogLeNetLoss: + weight: 1.0 + Eval: + - GoogLeNetLoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - GoogLeNetTopkAcc: + topk: [1, 5] + Eval: + - GoogLeNetTopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV3.yaml new file mode 100644 index 000000000..fe6c66a34 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV3.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: InceptionV3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset 
+ image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV4.yaml new file mode 100644 index 000000000..996049490 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Inception/InceptionV4.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: InceptionV4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + 
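The InceptionV3 and InceptionV4 configs mix every training batch with MixupOperator (alpha: 0.2), which is also why their Metric.Train block is left empty: top-k accuracy is not well defined on mixed labels. A sketch of the standard mixup step that the operator name refers to, assumed to match the usual formulation.

import numpy as np

def mixup_batch(images, labels, alpha=0.2, rng=None):
    """Blend each sample with a shuffled partner; labels become (y_a, y_b, lam)."""
    rng = np.random.default_rng() if rng is None else rng
    lam = rng.beta(alpha, alpha)
    perm = rng.permutation(len(images))
    mixed = lam * images + (1.0 - lam) * images[perm]
    return mixed, (labels, labels[perm], lam)

x = np.random.rand(8, 3, 299, 299).astype("float32")   # a batch of decoded images
y = np.random.randint(0, 1000, size=8)
mixed_x, (y_a, y_b, lam) = mixup_batch(x, y, alpha=0.2)
print(lam, mixed_x.shape)

The loss is then the lam-weighted sum of the cross entropies against y_a and y_b.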
cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128.yaml new file mode 100644 index 000000000..496155274 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_128 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + 
size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128S.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128S.yaml new file mode 100644 index 000000000..27798a673 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_128S.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_128S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_192.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_192.yaml new file mode 100644 index 000000000..7e045c65d --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_192.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_192 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_256.yaml new file mode 100644 index 000000000..2b764daed --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_256.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + 
level: O1 + + +# model architecture +Arch: + name: LeViT_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_384.yaml new file mode 100644 index 000000000..6751e066c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/LeViT/LeViT_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: LeViT_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - 
DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M0.yaml new file mode 100644 index 000000000..fe64441f8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M0.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 3e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + 
to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M1.yaml new file mode 100644 index 000000000..694793e2a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M1.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 3e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M2.yaml new file mode 100644 index 000000000..379445787 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M2.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 3e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + 
+Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M3.yaml new file mode 100644 index 000000000..7a41c05ba --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MicroNet/MicroNet_M3.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MicroNet_M3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.2 # for total batch size 512 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 4e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: bilinear + use_log_aspect: True + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_L.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_L.yaml new file mode 100644 index 000000000..fe75f2cc8 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_L.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MixNet_L + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_M.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_M.yaml new file mode 100644 index 000000000..62c039bda --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_M.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + 
scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MixNet_M + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_S.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_S.yaml new file mode 100644 index 000000000..f56f40eb1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MixNet/MixNet_S.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MixNet_S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + 
Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml new file mode 100644 index 000000000..04772014a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNeXt/MobileNeXt_x1_0.yaml @@ -0,0 +1,160 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 50 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNeXt_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: .bias + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.1 # for total batch size 512 + eta_min: 1e-5 + warmup_epoch: 3 + warmup_start_lr: 1e-4 + by_epoch: True + regularizer: + name: 'L2' + coeff: 1e-4 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0 
+ - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 # for 4 gpus + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1.yaml new file mode 100644 index 000000000..66fa53ec7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - 
DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml new file mode 100644 index 000000000..364ed851a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_25.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - 
CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml new file mode 100644 index 000000000..46fc98859 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml new file mode 100644 index 
000000000..180e97c8c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV1/MobileNetV1_x0_75.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV1_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2.yaml new file mode 100644 index 000000000..12ba90772 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: 
False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml new file mode 100644 index 000000000..c8897d8fc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_25.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml new file mode 100644 index 000000000..d6c761ba4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_5.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml new file mode 100644 index 000000000..ac33c49d4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x0_75.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml new file mode 100644 index 000000000..98fc61db4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x1_5.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml new file mode 100644 index 000000000..1fd273469 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV2/MobileNetV2_x2_0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + 
checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml new file mode 100644 index 000000000..e8c400d1d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_35.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x0_35 + class_num: 1000 + +# loss function config for traing/eval 
process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml new file mode 100644 index 000000000..b357cc1ae --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml new file mode 100644 index 000000000..b9bb092fa --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x0_75.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 
0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml new file mode 100644 index 000000000..e3b54a66b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: 
Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml new file mode 100644 index 000000000..7eab8f904 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_large_x1_25.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_large_x1_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..8957dc785 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,142 @@ +# 
global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml new file mode 100644 index 000000000..8d59196e0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x0_5 + 
class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml new file mode 100644 index 000000000..2ccb6faee --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x0_75.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: 
+ size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml new file mode 100644 index 000000000..08fb96996 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 
224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml new file mode 100644 index 000000000..2880583b9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_ampo2_ultra.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 5.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml new file mode 100644 index 000000000..f2df21425 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_0_fp32_ultra.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 5.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml new file mode 100644 index 
000000000..b401bd168 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV3/MobileNetV3_small_x1_25.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV3_small_x1_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml new file mode 100644 index 000000000..a1aa14dd1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_large.yaml @@ -0,0 +1,181 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 
128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_conv_large + drop_rate: 0.2 + drop_path_rate: 0.35 + class_num: 1000 + + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.2 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.003 ##null + eta_min: 1.0e-06 + warmup_epoch: 20 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m8-inc1-mstd1.0 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: # todo: confirm whether this is needed + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 448 + - CropImage: + size: 448 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 2 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml new file mode 100644 index 000000000..fcfe07dcb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_medium.yaml @@ -0,0 +1,181 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 500 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 320, 320] + 
save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_conv_medium + drop_rate: 0.2 + drop_path_rate: 0.1 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0 + warmup_epoch: 20 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m8-inc1-mstd1.0 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 320 + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml new file mode 100644 index 000000000..b1a0ee229 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_conv_small.yaml @@ -0,0 +1,181 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 2400 + print_batch_step: 10 + use_visualdl: False + # 
used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_conv_small + drop_rate: 0.25 + drop_path_rate: 0.03 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.06 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 ##null + eta_min: 0 + warmup_epoch: 5 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m8-inc1-mstd1.0 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml new file mode 100644 index 000000000..da7da794a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_large.yaml @@ -0,0 +1,182 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True 
+ eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: True + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_hybrid_large + drop_rate: 0.2 + drop_path_rate: 0.35 + use_fused_attn: False + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.001 + eta_min: 1.0e-06 + warmup_epoch: 20 + warmup_start_lr: 0 + + +EMA: + decay: 0.9998 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-inc1-mstd1.0 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 448 + - CropImage: + size: 448 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 12 + use_shared_memory: True + + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 448 + - CropImage: + size: 448 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml new file mode 100644 index 000000000..dbe7be411 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileNetV4/MobileNetV4_hybrid_medium.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null 
+ pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 500 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: True + use_fp16_test: True + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileNetV4_hybrid_medium + drop_rate: 0.2 + drop_path_rate: 0.1 + use_fused_attn: False + class_num: 1000 + + +# loss function config for training/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + clip_grad: 5.0 + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.001 + eta_min: 0 + warmup_epoch: 20 + warmup_start_lr: 0 + +EMA: + decay: 0.9998 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-inc1-mstd1.0 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: # todo: confirm whether this is needed + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + backend: pil + interpolation: bicubic + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 0 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_S.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_S.yaml new file mode 100644 index 000000000..bf39753e7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_S.yaml @@ -0,0
+1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileViT_S + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0.0002 + warmup_epoch: 5 + warmup_start_lr: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bilinear + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XS.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XS.yaml new file mode 100644 index 000000000..c4b6804e5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XS.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + 
scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileViT_XS + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0.0002 + warmup_epoch: 5 + warmup_start_lr: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bilinear + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XXS.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XXS.yaml new file mode 100644 index 000000000..a3611f4e3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViT/MobileViT_XXS.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: MobileViT_XXS + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + 
no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 0.002 + eta_min: 0.0002 + warmup_epoch: 5 + warmup_start_lr: 0.0002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bilinear + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml new file mode 100644 index 000000000..12891491a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x0_5.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.004 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.009 # for total batch size 1024 + eta_min: 0.0009 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml new file mode 100644 index 000000000..aeee054da --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_0.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.013 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.0075 # for total batch size 1024 + eta_min: 0.00075 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + 
image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml new file mode 100644 index 000000000..198e4333f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x1_5.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.029 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.0035 # for total batch size 1024 + eta_min: 0.00035 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: 
+ Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml new file mode 100644 index 000000000..2739ea9d1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV2/MobileViTV2_x2_0.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1024 + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# 
data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml new file mode 100644 index 000000000..ba49dbbc2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_S + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data 
loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml new file mode 100644 index 000000000..4a9ba8b01 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_S_L2.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_S_L2 + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + 
Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml new file mode 100644 index 000000000..3ac7264ac --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XS + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + 
image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml new file mode 100644 index 000000000..10b3c7f9e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XS_L2.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XS_L2 + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + 
cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml new file mode 100644 index 000000000..719a20068 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XXS + class_num: 1000 + dropout: 0.05 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml new file mode 100644 index 000000000..ff84fadd7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_XXS_L2.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_XXS_L2 + class_num: 1000 + dropout: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.01 + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 384 + eta_min: 0.0002 + warmup_epoch: 1 # 3000 iterations + warmup_start_lr: 0.0002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + 
transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bilinear + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + # support to specify width and height respectively: + # scales: [(256,256) (160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [256, 160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 48 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 48 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + interpolation: bilinear + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml new file mode 100644 index 000000000..31eef0c35 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_5.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_x0_5 + class_num: 1000 + classifier_dropout: 0. 
+ +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1020 by referring to official + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml new file mode 100644 index 000000000..eb4e3a3bd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x0_75.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + 
+# model architecture +Arch: + name: MobileViTV3_x0_75 + class_num: 1000 + classifier_dropout: 0. + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1020 by referring to official + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml new file mode 100644 index 000000000..e6b437bb4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/MobileViTV3/MobileViTV3_x1_0.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 65536 + use_dynamic_loss_scaling: True + 
use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model ema +EMA: + decay: 0.9995 + +# model architecture +Arch: + name: MobileViTV3_x1_0 + class_num: 1000 + classifier_dropout: 0. + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 0.002 # for total batch size 1020 by referring to official + eta_min: 0.0002 + warmup_epoch: 16 # 20000 iterations + warmup_start_lr: 1e-6 + clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - RandAugmentV3: + num_layers: 2 + interpolation: bicubic + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: const + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.25 + CutmixOperator: + alpha: 1.0 + prob: 0.25 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 288 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.0, 0.0, 0.0] + std: [1.0, 1.0, 1.0] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_224.yaml new file mode 100644 index 000000000..66317732e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_224.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed 
precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: NextViT_base_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_384.yaml new file mode 100644 index 000000000..d013e6470 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_base_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed 
precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: NextViT_base_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_224.yaml new file mode 100644 index 000000000..33b8dabfa --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_224.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# 
mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: NextViT_large_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_384.yaml new file mode 100644 index 000000000..a3eaf4c58 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_large_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + 
+ +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: NextViT_large_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_224.yaml new file mode 100644 index 000000000..d16151df2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_224.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: 
False + + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: NextViT_small_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_384.yaml new file mode 100644 index 000000000..daf727621 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/NextViT/NextViT_small_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + 
to_static: False + + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: NextViT_small_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.1 + no_weight_decay_name: .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_base.yaml new file mode 100644 index 000000000..873b027a9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_base.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: 
False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNet_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m15-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.4 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_small.yaml new file mode 100644 index 000000000..9cc9dd9cd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_small.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False 
+ scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNet_small + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml new file mode 100644 index 000000000..77f1e7a5f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNet/PPHGNet_tiny.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 600 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: 
mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNet_tiny + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml new file mode 100644 index 000000000..d5a7ab49e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B0.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. 
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B0 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml new file mode 100644 index 000000000..bdae7decb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B1.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. 
The ImageNet metrics in PaddleClas are not trained through this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B1 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml new file mode 100644 index 000000000..a30d1dd65 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B2.yaml @@ -0,0 +1,164 @@ +## Note: This 
config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B2 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml new file mode 100644 index 000000000..536659111 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B3.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B3 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml new file mode 100644 index 
000000000..167a48092 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B4 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml new file mode 100644 index 000000000..c8cd5fcf7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage1.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: PPHGNet_small + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPHGNetV2_B4 + class_num: *class_num + pretrained: False + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/" + # ImageNet_5M label path, the training process does not use real labels. 
+ cls_label_path: "./dataset/train_list_imagenet_5M.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml new file mode 100644 index 000000000..074a77b42 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B4_ssld_stage2.yaml @@ -0,0 +1,173 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_dali: false + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 1000 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: PPHGNet_small + class_num: *class_num + pretrained: True + use_ssld: True + - Student: + name: PPHGNetV2_B4 + class_num: *class_num + pretrained: path/to/stage1_best_model_student + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # stage2 should reduce learning rate + 
learning_rate: 0.005 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + # ImageNet-1k label path, the training process does not use real labels + cls_label_path: "./dataset/ILSVRC2012/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/ILSVRC2012/" + cls_label_path: "./dataset/ILSVRC2012/val_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 236 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: "ppcls/utils/imagenet1k_label_list.txt" + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml new file mode 100644 index 000000000..57ca91199 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B5.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. The ImageNet metrics in PaddleClas are not trained through this config. 
+# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B5 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml new file mode 100644 index 000000000..da9cb04bd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPHGNetV2/PPHGNetV2_B6.yaml @@ -0,0 +1,164 @@ +## Note: This config is only used for finetune training. 
The ImageNet metrics in PaddleClas are not trained through this config. +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B6 + class_num: 1000 + pretrained: True # ssld pretrained + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + # for global bs 1024, when finetune training, you need to reduce learning_rate manually + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 232 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml new file mode 100644 index 000000000..f700f6c6e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_25.yaml @@ -0,0 +1,141 @@ +# global configs 
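+# Note: the NormalizeImage transform used throughout the PPLCNet configs sets
+# scale 1.0/255.0 (about 0.00392157, the same value spelled out numerically in the
+# PPHGNetV2 configs): pixels are rescaled to [0, 1] and then normalized with the
+# ImageNet mean/std listed next to it.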
+Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml new file mode 100644 index 000000000..c83c504a7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_35.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_35 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + 
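+  # CELoss with epsilon 0.1 below is label-smoothed cross entropy: assuming the usual
+  # smoothing formulation, the target becomes (1 - 0.1) + 0.1/1000 = 0.9001 for the
+  # true class and 0.1/1000 = 0.0001 for each of the other 999 classes.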
Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml new file mode 100644 index 000000000..00eae55c4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 
0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml new file mode 100644 index 000000000..96a43650b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x0_75.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x0_75 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + 
shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..97291077f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml new file mode 100644 index 000000000..336fb3345 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_ampo2_ultra.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 3.2 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 1024 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml new file mode 100644 index 000000000..9f7abadd9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_0_fp32_ultra.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + 
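+  # The "ultra" variants enable DALI and raise the learning rate relative to the
+  # plain PPLCNet_x1_0 config (1.6 here vs 0.8), consistent with linear LR scaling
+  # for a larger effective global batch; the intended device count is not recorded
+  # in this file, so that part is an assumption.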
epochs: 360 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.6 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml new file mode 100644 index 000000000..367689197 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x1_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + 
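+    # Assuming the standard cosine-with-linear-warmup schedule: the LR ramps up to
+    # learning_rate over the first warmup_epoch (5) epochs and then decays as
+    # 0.5 * learning_rate * (1 + cos(pi * t / T)) over the remaining epochs.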
name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml new file mode 100644 index 000000000..87fb92eee --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x2_0 + class_num: 1000 +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + 
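+      # batch_size under DistributedBatchSampler above is per device, so the effective
+      # global batch is 512 x the number of devices; num_workers below is likewise per
+      # training process (both are properties of the usual distributed launch rather
+      # than values recorded in this file).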
num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml new file mode 100644 index 000000000..762d21fed --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNet/PPLCNet_x2_5.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNet_x2_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: 
docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml new file mode 100644 index 000000000..4195efebc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_base.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 480 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNetV2_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # support to specify width and height respectively: + # scales: [(160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 500 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + 
order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml new file mode 100644 index 000000000..1551e673f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_large.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 480 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNetV2_large + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # support to specify width and height respectively: + # scales: [(160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 250 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml new file mode 100644 index 000000000..bb937ba78 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PPLCNetV2/PPLCNetV2_small.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 480 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PPLCNetV2_small + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiScaleDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + # support to specify width and height respectively: + # scales: [(160,160), (192,192), (224,224) (288,288) (320,320)] + sampler: + name: MultiScaleSampler + scales: [160, 192, 224, 288, 320] + # first_bs: batch size for the first image resolution in the scales list + # divide_factor: to ensure the width and height dimensions can be devided by downsampling multiple + first_bs: 500 + divided_factor: 32 + is_training: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B0.yaml new file mode 100644 index 000000000..447326ac7 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B0.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B0 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B1.yaml new file mode 100644 index 
000000000..35ab6507a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B1.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B1 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2.yaml new file mode 100644 index 000000000..ec93edc94 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B2 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml new file mode 100644 index 000000000..77cfbad84 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B2_Linear.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B2_Linear + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B3.yaml new file mode 100644 index 000000000..9d4443491 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B3.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B3 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + 
PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B4.yaml new file mode 100644 index 000000000..78e08fa7a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B4.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B4 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B5.yaml new file mode 100644 index 000000000..b8f4da0d3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PVTV2/PVT_V2_B5.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PVT_V2_B5 + class_num: 1000 + drop_path_rate: 0.3 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: 1.0 + no_weight_decay_name: pos_embed1 pos_embed2 pos_embed3 pos_embed4 cls_token + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + 
scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PeleeNet/PeleeNet.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PeleeNet/PeleeNet.yaml new file mode 100644 index 000000000..06b0059bc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/PeleeNet/PeleeNet.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: PeleeNet + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.18 # for total batch size 512 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bilinear + backend: pil + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 # for 2 cards + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bilinear + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_0.yaml new file mode 100644 
index 000000000..c4fa39e9b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_3.yaml new file mode 100644 index 000000000..8bfe5c3c0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_3.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed 
precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_1_3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_5.yaml new file mode 100644 index 000000000..66a8497cb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_1_5.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + 
coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_2_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_2_0.yaml new file mode 100644 index 000000000..80a80c179 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_2_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + 
num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_3_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_3_0.yaml new file mode 100644 index 000000000..7b896d20f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ReXNet/ReXNet_3_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ReXNet_3_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: 
True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet101.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet101.yaml new file mode 100644 index 000000000..fa793e4d4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet101.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet152.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet152.yaml new file mode 100644 index 000000000..1b7bc86e9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet152.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet152 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet26.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet26.yaml new file mode 100644 index 000000000..d8e88eb90 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet26.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model 
export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet26 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet38.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet38.yaml new file mode 100644 index 000000000..b1326d647 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet38.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet38 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: 
+ name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet50.yaml new file mode 100644 index 000000000..5e2a5cf06 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RedNet/RedNet50.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RedNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + 
dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0 + mean: [123.675, 116.28, 103.53] + std: [58.395, 57.12, 57.375] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_12GF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_12GF.yaml new file mode 100644 index 000000000..7d9135427 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_12GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_12GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_1600MF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_1600MF.yaml new file mode 100644 index 000000000..b14298f5e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_1600MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_1600MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_16GF.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_16GF.yaml new file mode 100644 index 000000000..7de217298 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_16GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_16GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_200MF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_200MF.yaml new file mode 100644 index 000000000..1b87cabc0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_200MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: 
./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_200MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_3200MF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_3200MF.yaml new file mode 100644 index 000000000..702296394 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_3200MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_3200MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 
0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_32GF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_32GF.yaml new file mode 100644 index 000000000..0c770b0c3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_32GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_32GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: 
+ name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_400MF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_400MF.yaml new file mode 100644 index 000000000..aad3bb23f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_400MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_400MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + 
transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_600MF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_600MF.yaml new file mode 100644 index 000000000..54dbbcca4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_600MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_600MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_6400MF.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_6400MF.yaml new file mode 100644 index 000000000..eae2c6a4d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_6400MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_6400MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_800MF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_800MF.yaml new file mode 100644 index 000000000..8dd6d19db --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_800MF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: 
./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_800MF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_8GF.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_8GF.yaml new file mode 100644 index 000000000..3cb65e961 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RegNet/RegNetX_8GF.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RegNetX_8GF + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# 
data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A0.yaml new file mode 100644 index 000000000..ecec64644 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_A0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A1.yaml new file mode 100644 index 000000000..8ce5a8ad4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A1.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_A1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + 
mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A2.yaml new file mode 100644 index 000000000..7c8e4bc61 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_A2.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_A2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B0.yaml new file mode 100644 index 000000000..5449d698d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + 
pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1.yaml new file mode 100644 index 000000000..37f926299 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + 
name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml new file mode 100644 index 000000000..a78d60d6d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g2.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B1g2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + 
use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml new file mode 100644 index 000000000..4b2678409 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B1g4.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B1g4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True 
+ channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2.yaml new file mode 100644 index 000000000..d6c3f36a3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B2 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml new file mode 100644 index 000000000..d596211bb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B2g4.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B2g4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3.yaml new file mode 100644 index 000000000..a15cbdc6c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + 
save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B3 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml new file mode 100644 index 000000000..d2921bdb5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_B3g4.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_B3g4 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine 
+ learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_D2se.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_D2se.yaml new file mode 100644 index 000000000..0b7f105f1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/RepVGG/RepVGG_D2se.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 320, 320] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: RepVGG_D2se + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + interpolation: bicubic + backend: pil + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + interpolation: bicubic + backend: pil + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml new file mode 100644 index 000000000..155a5caee --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net101_vd_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net101_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: 
+ size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml new file mode 100644 index 000000000..db0076cf3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net200_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + 
- ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml new file mode 100644 index 000000000..e90812e03 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_14w_8s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net50_14w_8s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml new file mode 100644 index 000000000..eabf48041 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + 
pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net50_26w_4s + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml new file mode 100644 index 000000000..4f72ecbb4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Res2Net/Res2Net50_vd_26w_4s.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Res2Net50_vd_26w_4s + class_num: 1000 + +# loss function config for traing/eval process 
+Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt101.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt101.yaml new file mode 100644 index 000000000..315b7c74e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt101.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 288 + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt200.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt200.yaml new file mode 100644 index 000000000..cb3721264 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt200.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 320, 320] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt200 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 320 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + 
sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 320 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt269.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt269.yaml new file mode 100644 index 000000000..985ff98e7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt269.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 416, 416] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt269 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 416 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 416 + - CropImage: + size: 416 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 416 + - CropImage: + size: 416 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - 
TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50.yaml new file mode 100644 index 000000000..50529aecd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml new file mode 100644 index 000000000..c5db97b10 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeSt/ResNeSt50_fast_1s1x64d.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + 
print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeSt50_fast_1s1x64d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml new file mode 100644 index 000000000..e0d496e83 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x16d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x16d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: 
+ name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml new file mode 100644 index 000000000..40f82b5fd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x32d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x32d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + 
mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml new file mode 100644 index 000000000..5974ca719 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x48d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x48d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml new file mode 100644 index 000000000..d2f4cd852 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNeXt101_wsl/ResNeXt101_32x8d_wsl.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNeXt101_32x8d_wsl + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + 
PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101.yaml new file mode 100644 index 000000000..afd6329b3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet101 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101_vd.yaml new file mode 100644 index 000000000..748dfc12f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet101_vd.yaml @@ -0,0 +1,142 @@ +# global 
configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet101_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152.yaml new file mode 100644 index 000000000..993d52e5d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet152 + class_num: 1000 + +# 
loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152_vd.yaml new file mode 100644 index 000000000..8daf0f806 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet152_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet152_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18.yaml new file mode 100644 index 000000000..2c0c4c231 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet18 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_dbb.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_dbb.yaml new file mode 100644 index 000000000..35250d83a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_dbb.yaml @@ -0,0 +1,153 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet18 + class_num: 1000 + layer_type: DiverseBranchBlock + use_first_short_conv: False + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + backend: pil + interpolation: bilinear + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + saturation: 0.4 + hue: 0.4 + - PCALighting: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + backend: pil + interpolation: bilinear + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: 
[0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_vd.yaml new file mode 100644 index 000000000..591ef505b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet18_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet18_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet200_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet200_vd.yaml new file mode 100644 index 000000000..135388ae7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet200_vd.yaml @@ -0,0 +1,142 @@ +# 
global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet200_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34.yaml new file mode 100644 index 000000000..88af8fe0e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet34 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + 
Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34_vd.yaml new file mode 100644 index 000000000..feb6c4266 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet34_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet34_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50.yaml new file mode 100644 index 000000000..a6dd0f5f4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + 
std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml new file mode 100644 index 000000000..4981efe91 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] 
+ order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml new file mode 100644 index 000000000..47aa4c0e4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O1_ultra.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_channel: &image_channel 4 + # used for static mode and model export + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml new file mode 100644 index 000000000..7f4de6640 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_amp_O2_ultra.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + image_channel: &image_channel 4 + # used for static mode and model export + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: True + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml new file mode 100644 index 000000000..ee606707a --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_ampo2_ultra.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.8 + decay_epochs: [30, 60, 90] + values: [0.8, 0.08, 0.008, 0.0008] + warmup_epoch : 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml new file mode 100644 index 000000000..9fb8da0d6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_fp32_ultra.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + use_dali: True + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model 
under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.8 + decay_epochs: [30, 60, 90] + values: [0.8, 0.08, 0.008, 0.0008] + warmup_epoch : 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_vd.yaml new file mode 100644 index 000000000..5f30746b1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ResNet/ResNet50_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for 
train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SENet154_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SENet154_vd.yaml new file mode 100644 index 000000000..33e72300a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SENet154_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SENet154_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: 
ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml new file mode 100644 index 000000000..df3d9c561 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNeXt101_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - 
DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml new file mode 100644 index 000000000..835782185 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt101_32x4d_amp_O2_ultra.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_channel: &image_channel 4 + image_shape: [*image_channel, 224, 224] + save_inference_dir: ./inference + +# mixed precision training +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: SE_ResNeXt101_32x4d + class_num: 1000 + input_image_channel: *image_channel + data_format: "NHWC" + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + multi_precision: True + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + output_fp16: True + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + channel_num: *image_channel + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + 
Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml new file mode 100644 index 000000000..192b5aafd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNeXt50_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml new file mode 100644 index 000000000..e82bfe42c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNeXt50_vd_32x4d.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + 
epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNeXt50_vd_32x4d + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet18_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet18_vd.yaml new file mode 100644 index 000000000..5cfed65d2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet18_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNet18_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + 
name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet34_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet34_vd.yaml new file mode 100644 index 000000000..857300d51 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet34_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNet34_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + 
drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet50_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet50_vd.yaml new file mode 100644 index 000000000..e7b94138c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SENet/SE_ResNet50_vd.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SE_ResNet50_vd + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + 
infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml new file mode 100644 index 000000000..4b0393cce --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_swish.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_swish + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml new file mode 100644 index 000000000..a00c648cf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_25.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x0_25 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml new file mode 100644 index 000000000..e3b82cf90 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_33.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: 
./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x0_33 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml new file mode 100644 index 000000000..c228b57a6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x0_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x0_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + 
dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml new file mode 100644 index 000000000..f0d5d46f0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml new file mode 100644 index 000000000..202514c99 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x1_5.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x1_5 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.25 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - 
CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml new file mode 100644 index 000000000..d633a754e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/ShuffleNet/ShuffleNetV2_x2_0.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 240 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ShuffleNetV2_x2_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.25 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml new file mode 100644 index 000000000..7224467c7 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_0.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SqueezeNet1_0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml new file mode 100644 index 000000000..ed5ef8ecb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SqueezeNet/SqueezeNet1_1.yaml @@ -0,0 +1,140 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model 
architecture +Arch: + name: SqueezeNet1_1 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S1.yaml new file mode 100644 index 000000000..a25df46d4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S1.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S1 + drop_rate: 0 + drop_path_rate: 0 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: 
./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S2.yaml new file mode 100644 index 000000000..f57360e59 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S2.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S2 + drop_rate: 0 + drop_path_rate: 0.01 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + 
- TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S3.yaml new file mode 100644 index 000000000..af99efbc4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S3.yaml @@ -0,0 +1,166 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S3 + drop_rate: 0 + drop_path_rate: 0.01 + class_num: 1000 + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list_smallbatch.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + 
std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S4.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S4.yaml new file mode 100644 index 000000000..608f76c7f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/StarNet/StarNet_S4.yaml @@ -0,0 +1,165 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: StarNet_S4 + drop_rate: 0 + drop_path_rate: 0.02 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + clip_grad: None + no_weight_decay_name: null + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 3e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m1-mstd0.5-inc1 + interpolation: random + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + 
CutmixOperator: + alpha: 0.2 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + backend: pil + channel_first: False + - ResizeImage: + interpolation: bicubic + backend: pil + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: 'hwc' + sampler: + name: DistributedBatchSampler + batch_size: 20 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 224 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml new file mode 100644 index 000000000..c212dab12 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_base_patch4_window12_384 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml new file mode 100644 index 000000000..6941543d8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_base_patch4_window7_224 + class_num: 1000 + pretrained: True + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False 
+ - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml new file mode 100644 index 000000000..3e335b24e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_large_patch4_window12_384 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + 
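+# schedule note: the learning rate warms up linearly from warmup_start_lr (2e-6)
+# to learning_rate (1e-3) over the first warmup_epoch (20) epochs, then follows a
+# cosine decay down to eta_min (2e-5) for the remaining epochs; with 8 cards and
+# the per-card batch_size of 128 below, the effective global batch size is 8 x 128 = 1024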
+# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 438 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml new file mode 100644 index 000000000..89bdd0108 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_large_patch4_window7_224 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 
0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml new file mode 100644 index 000000000..ee134d5f0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformer_small_patch4_window7_224 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + 
use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..e359afcef --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: True + use_fp16_test: True + scale_loss: 128.0 + use_dynamic_loss_scaling: 
True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 1000 + # fused op can be used in AMP O2 mode only + use_fused_attn: False + use_fused_linear: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 20 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml new file mode 100644 index 000000000..966bb2cd1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for 
static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_base_patch4_window16_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml new file mode 100644 index 000000000..b71ad0ec4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window24_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + 
output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_base_patch4_window24_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml new file mode 100644 index 000000000..2337b5d8d --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_base_patch4_window8_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_base_patch4_window8_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml new file mode 100644 index 000000000..ed60a4b82 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_large_patch4_window16_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - 
TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml new file mode 100644 index 000000000..48972c48f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_large_patch4_window24_384.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_large_patch4_window24_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + interpolation: bicubic + backend: pil + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + 
std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml new file mode 100644 index 000000000..e8c4c7c4c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_small_patch4_window16_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml new file mode 100644 index 000000000..c0e1e0c7e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_small_patch4_window8_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_small_patch4_window8_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + 
use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml new file mode 100644 index 000000000..18678441e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window16_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_tiny_patch4_window16_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] 
+ std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml new file mode 100644 index 000000000..a1b69b3d7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/SwinTransformerV2/SwinTransformerV2_tiny_patch4_window8_256.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 256] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SwinTransformerV2_tiny_patch4_window8_256 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 256 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - 
ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 292 + interpolation: bicubic + backend: pil + - CropImage: + size: 256 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_base.yaml new file mode 100644 index 000000000..c2c7766be --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_base.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TNT_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 
224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_small.yaml new file mode 100644 index 000000000..2ab6cb60f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TNT/TNT_small.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TNT_small + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_A.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_A.yaml new file mode 100644 index 000000000..4e6c365bd --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_A.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 192] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_A + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 192 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 219 + interpolation: bicubic + backend: pil + - CropImage: + size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 219 + interpolation: bicubic + backend: pil + - CropImage: + size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_B.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_B.yaml new file mode 100644 index 000000000..96fd0d06b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_B.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + 
checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 188, 188] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_B + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.1 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 188 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 214 + interpolation: bicubic + backend: pil + - CropImage: + size: 188 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 214 + interpolation: bicubic + backend: pil + - CropImage: + size: 188 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_C.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_C.yaml new file mode 100644 index 000000000..addaec8c1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_C.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 
+ epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 184, 184] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_C + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 184 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 210 + interpolation: bicubic + backend: pil + - CropImage: + size: 184 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 210 + interpolation: bicubic + backend: pil + - CropImage: + size: 184 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_D.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_D.yaml new file mode 100644 index 000000000..a868af59b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_D.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 152, 152] + 
save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_D + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 152 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 173 + interpolation: bicubic + backend: pil + - CropImage: + size: 152 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 173 + interpolation: bicubic + backend: pil + - CropImage: + size: 152 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_E.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_E.yaml new file mode 100644 index 000000000..02617db16 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/TinyNet/TinyNet_E.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 450 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 106, 106] + save_inference_dir: ./inference + +# model ema +EMA: + decay: 0.9999 + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + 
scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: TinyNet_E + class_num: 1000 + override_params: + batch_norm_momentum: 0.9 + batch_norm_epsilon: 1e-5 + depth_trunc: round + drop_connect_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: RMSProp + momentum: 0.9 + rho: 0.9 + epsilon: 0.001 + one_dim_param_no_weight_decay: True + lr: + name: Step + learning_rate: 0.048 + step_size: 2.4 + gamma: 0.97 + warmup_epoch: 3 + warmup_start_lr: 1e-6 + regularizer: + name: 'L2' + coeff: 1e-5 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + backend: pil + - RandCropImage: + size: 106 + interpolation: bicubic + backend: pil + use_log_aspect: True + - RandFlipImage: + flip_code: 1 + - ColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_np: False + channel_first: False + backend: pil + - ResizeImage: + resize_short: 121 + interpolation: bicubic + backend: pil + - CropImage: + size: 106 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_np: False + channel_first: False + - ResizeImage: + resize_short: 121 + interpolation: bicubic + backend: pil + - CropImage: + size: 106 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_base.yaml new file mode 100644 index 000000000..216bda288 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_base.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + 
+# model architecture +Arch: + name: alt_gvt_base + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.3 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_large.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_large.yaml new file mode 100644 index 000000000..ff2e62520 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_large.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: 
True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: alt_gvt_large + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.5 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_small.yaml new file mode 100644 index 000000000..7de1133c1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/alt_gvt_small.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: 
False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: alt_gvt_small + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_base.yaml new file mode 100644 index 000000000..b3fbeca13 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_base.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under 
@to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: pcpvt_base + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.3 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_large.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_large.yaml new file mode 100644 index 000000000..4d91ea255 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_large.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 
224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: pcpvt_large + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.5 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_small.yaml new file mode 100644 index 000000000..97d5f1e19 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Twins/pcpvt_small.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: 
False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: pcpvt_small + class_num: 1000 + drop_rate: 0.0 + drop_path_rate: 0.2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: norm cls_token proj.0.weight proj.1.weight proj.2.weight proj.3.weight pos_block + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 2e-5 + warmup_epoch: 5 + warmup_start_lr: 2e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base.yaml new file mode 100644 index 000000000..58ef6931f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_base + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml new file mode 100644 index 000000000..29a3d4261 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_base_ls.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + 
device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_base_ls + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small.yaml new file mode 100644 index 000000000..5d1860d89 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 
null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_small + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml new file mode 100644 index 000000000..f08fb716a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus.yaml @@ -0,0 +1,174 @@ +# global configs 
+Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_small_plus + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml new file mode 100644 index 000000000..bff77c140 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/UniFormer/UniFormer_small_plus_dim64.yaml @@ -0,0 +1,174 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: UniFormer_small_plus_dim64 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: pos_embed cls_token .bias norm + one_dim_param_no_weight_decay: True + lr: + # for 8 cards + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B0.yaml new file mode 100644 index 
000000000..f121dc57d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B0.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B0 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B1.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B1.yaml new file mode 100644 index 000000000..fbc30f6f5 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B1.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B1 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B2.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B2.yaml new file mode 100644 index 000000000..1dca03a2d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B2.yaml @@ -0,0 +1,170 
@@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B2 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B3.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B3.yaml new file mode 100644 index 000000000..6bdef9a20 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VAN/VAN_B3.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + 
device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 300 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: VAN_B3 + class_num: 1000 + drop_path_rate: 0.1 + drop_rate: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-3 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: random + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: random + img_size: 224 + mean: [0.5, 0.5, 0.5] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 248 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml new file mode 100644 index 000000000..5b9c518a7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_224.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + 
save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_base_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml new file mode 100644 index 000000000..a8792a036 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch16_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_base_patch16_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 
1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml new file mode 100644 index 000000000..477d9edd0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_base_patch32_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_base_patch32_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml new file mode 100644 index 000000000..7174f151f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_224.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_large_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + 
std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml new file mode 100644 index 000000000..195cd2293 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch16_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_large_patch16_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: 
ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml new file mode 100644 index 000000000..afa78dacf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_large_patch32_384.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 384, 384] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_large_patch32_384 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 384 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 384 + - CropImage: + size: 384 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml new file mode 100644 index 000000000..7eafe2acd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/VisionTransformer/ViT_small_patch16_224.yaml @@ -0,0 +1,142 @@ +# global configs 
+Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: ViT_small_patch16_224 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41.yaml new file mode 100644 index 000000000..c622617f7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception41 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + 
Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41_deeplab.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41_deeplab.yaml new file mode 100644 index 000000000..d03b6bc77 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception41_deeplab.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception41_deeplab + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 
0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65.yaml new file mode 100644 index 000000000..c134331a0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception65 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + 
drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65_deeplab.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65_deeplab.yaml new file mode 100644 index 000000000..05e88336e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception65_deeplab.yaml @@ -0,0 +1,141 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception65_deeplab + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.045 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception71.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception71.yaml new file mode 100644 index 000000000..3fabd6595 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ImageNet/Xception/Xception71.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 299, 299] + save_inference_dir: ./inference + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: Xception71 + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0225 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 299 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 320 + - CropImage: + size: 299 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Logo/ResNet50_ReID.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Logo/ResNet50_ReID.yaml new file mode 100644 index 000000000..bfbedf8f9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Logo/ResNet50_ReID.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 
224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "CircleMargin" + margin: 0.35 + scale: 64 + embedding_size: 512 + class_num: 3000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - PairwiseCosface: + margin: 0.35 + gamma: 64 + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: "L2" + coeff: 0.0001 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/train/" + cls_label_path: "dataset/LogoDet-3K-crop/train_list.txt" + relabel: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + - RandomErasing: + EPSILON: 0.5 + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/val/" + cls_label_path: "dataset/LogoDet-3K-crop/query_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: "dataset/LogoDet-3K-crop/train/" + cls_label_path: "dataset/LogoDet-3K-crop/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml new file mode 100644 index 000000000..435842f89 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/CLIP_vit_base_patch16_448_ml_decoder_448.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 
128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O2 + +# model architecture +Arch: + name: CLIP_vit_base_patch16_224 + class_num: 80 + return_embed: False + use_fused_attn: False # fused attn can be used in AMP O2 mode only + pretrained: True + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 768 + remove_layers: [] + replace_layer: 'head' + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-5 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 8 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml new file mode 100644 index 000000000..00b3abb16 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B0_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: 
./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B0 + class_num: 80 + pretrained: True # ssld pretrained + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml new file mode 100644 index 000000000..a92c89bc1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B4_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for 
static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B4 + class_num: 80 + pretrained: True # ssld pretrained + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml new file mode 100644 index 000000000..d21453c31 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-HGNetV2-B6_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + 
eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPHGNetV2_B6 + class_num: 80 + pretrained: True # ssld pretrained + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 8 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml new file mode 100644 index 000000000..15aaee0a2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/PP-LCNet_x1_0_ml_decoder_448.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + 
output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 80 + pretrained: True + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + class_num: 80 + in_channels: 1280 + + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/README.md b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/README.md new file mode 100644 index 000000000..e4e4263c1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/README.md @@ -0,0 +1,272 @@ +# ML-Decoder多标签分类 + +## 目录 + +* [1. 模型介绍](#1) +* [2. 
Data and model preparation](#2)
+* [3. Model training](#3)
+* [4. Model evaluation](#4)
+* [5. Model prediction](#5)
+* [6. Prediction with the inference engine](#6)
+  * [6.1 Export the inference model](#6.1)
+  * [6.2 Inference with the Python inference engine](#6.2)
+* [7. Citation](#7)
+
+
+## 1. Model introduction
+
+ML-Decoder is a new attention-based classification head that predicts the presence of class labels through queries and makes better use of spatial features than global average pooling. Its main characteristics are:
+
+1. ML-Decoder redesigns the decoder architecture and uses a novel group-decoding scheme, which makes it efficient and scalable and lets it handle classification tasks with thousands of classes.
+2. ML-Decoder offers a consistent speed-accuracy trade-off and can deliver better performance than simply switching to a larger backbone.
+3. ML-Decoder is also versatile: it can replace various classification heads and, when word queries are used, it generalizes to unseen classes. A novel query-augmentation method further improves this generalization ability.
+
+With ML-Decoder, the authors achieved state-of-the-art results on several classification tasks:
+1. 91.4% mAP on MS-COCO multi-label classification;
+2. 31.1% ZSL mAP on NUS-WIDE zero-shot classification;
+3. a new high score of 80.7% on ImageNet single-label classification with a vanilla ResNet50 backbone, without extra data or distillation.
+
+`PaddleClas` currently supports ML-Decoder for both single-label and multi-label classification, and enabling it is straightforward: set `use_ml_decoder: True` under the `Arch` scope of the config file and provide the ML-Decoder parameters (a minimal sketch of what this head computes is given below, after the configuration examples):
+```yaml
+# model architecture
+Arch:
+  name: ResNet101
+  class_num: 80
+  pretrained: True
+  # use ml-decoder head to replace avg_pool and fc
+  use_ml_decoder: True
+
+# ml-decoder head
+MLDecoder:
+  query_num: 80 # default: 80, query_num <= class_num
+  in_channels: 2048
+  # optional args
+  # class_num: 80
+  # remove_layers: ['avg_pool', 'flatten']
+  # replace_layer: 'fc'
+```
+Notes:
+1. If the chosen backbone does not expose a `class_num` attribute, the `class_num` field must be added manually to the `MLDecoder` section of the config file.
+2. In practice, adjust `remove_layers` and `replace_layer` to the actual layer names of the chosen backbone. These names can be found by looking up the backbone's implementation under `ppcls/arch/backbone` for the selected model and checking the names of its output layers.
+
+Taking `RepVGG_A0` as an example, after checking `ppcls/arch/backbone/model_zoo/repvgg.py` and adapting `class_num`, `remove_layers` and `replace_layer` accordingly, the MLDecoder configuration looks like this:
+```yaml
+# model architecture
+Arch:
+  name: RepVGG_A0
+  class_num: 80
+  pretrained: True
+  # use ml-decoder head to replace avg_pool and fc
+  use_ml_decoder: True
+
+# ml-decoder head
+MLDecoder:
+  query_num: 80 # default: 80, query_num <= class_num
+  in_channels: 1280
+  # optional args
+  class_num: 80
+  remove_layers: ['gap']
+  replace_layer: 'linear'
+```
+
+Developers are encouraged to try combining other backbones with ML-Decoder.
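+The following is a minimal, self-contained sketch of the query-based decoding idea described above. It is not the actual PaddleClas `MLDecoder` implementation; the class and variable names are illustrative only, and it assumes a CNN backbone that outputs an `[N, C, H, W]` feature map. A fixed set of group queries cross-attends to the backbone's spatial tokens, and each query is projected to the logits of its group of classes; `query_num`, `in_channels` and `class_num` play the same roles as the config fields above.
+
+```python
+import paddle
+import paddle.nn as nn
+
+class TinyMLDecoderHead(nn.Layer):
+    """Illustrative query-based head; assumes class_num % query_num == 0."""
+    def __init__(self, in_channels=2048, embed_dim=768, query_num=80, class_num=80, num_heads=8):
+        super().__init__()
+        self.group_size = class_num // query_num          # classes predicted by each query
+        self.proj = nn.Linear(in_channels, embed_dim)     # project backbone channels to embed_dim
+        self.query = nn.Embedding(query_num, embed_dim)   # group queries (learnable here for simplicity)
+        self.attn = nn.MultiHeadAttention(embed_dim, num_heads)
+        self.fc = nn.Linear(embed_dim, self.group_size)   # per-query group classifier
+
+    def forward(self, feat):                              # feat: [N, C, H, W] backbone feature map
+        n = feat.shape[0]
+        tokens = self.proj(feat.flatten(2).transpose([0, 2, 1]))     # [N, H*W, embed_dim] spatial tokens
+        queries = self.query.weight.unsqueeze(0).expand([n, -1, -1]) # [N, query_num, embed_dim]
+        decoded = self.attn(queries, tokens, tokens)                  # queries cross-attend to the tokens
+        return self.fc(decoded).flatten(1)                            # [N, class_num] logits
+
+logits = TinyMLDecoderHead()(paddle.rand([2, 2048, 14, 14]))
+print(logits.shape)  # [2, 80]
+```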
+Current performance of several backbones combined with ML-Decoder on the COCO2017 multi-label classification task:
+
+| Model | Backbone | Resolution | mAP | Links |
+|:--------------------:|:---------:|:----------:|:---:|:-----------------------------------------:|
+| PP-LCNet_x1_0_ml_decoder_448 | PP-LCNet_x1_0 | 448x448 | 77.96% | [config](./PP-LCNet_x1_0_ml_decoder_448.yaml) |
+| PP-HGNetV2-B0_ml_decoder_448 | PP-HGNetV2-B0 | 448x448 | 80.98% | [config](./PP-HGNetV2-B0_ml_decoder_448.yaml) |
+| PP-HGNetV2-B4_ml_decoder_448 | PP-HGNetV2-B4 | 448x448 | 87.96% | [config](./PP-HGNetV2-B4_ml_decoder_448.yaml) |
+| PP-HGNetV2-B6_ml_decoder_448 | PP-HGNetV2-B6 | 448x448 | 91.25% | [config](./PP-HGNetV2-B6_ml_decoder_448.yaml) |
+| ResNet50_ml_decoder_448 | ResNet50 | 448x448 | 83.50% | [config](./ResNet50_ml_decoder_448.yaml) |
+| ResNet101_ml_decoder | ResNet101 | 448x448 | 91.40% | [config](./ResNet101_ml_decoder_448.yaml) |
+| CLIP_vit_base_patch16_448_ml_decoder_448 | CLIP_vit_base_patch16_448 | 448x448 | 89.15% | [config](./CLIP_vit_base_patch16_448_ml_decoder_448.yaml) |
+
+Based on the [COCO2017](https://cocodataset.org/) dataset, the following sections describe training, evaluation and prediction for multi-label classification with ML-Decoder. Please install PaddlePaddle and PaddleClas first; see [Environment Preparation](../installation.md) for the detailed steps.
+
+
+## 2. Data and model preparation
+
+* Enter the `PaddleClas` directory.
+
+```
+cd path_to_PaddleClas
+```
+
+* Create and enter the `dataset/COCO2017` directory, then download and extract the COCO2017 dataset.
+
+```shell
+mkdir dataset/COCO2017 && cd dataset/COCO2017
+wget http://images.cocodataset.org/zips/train2017.zip -O t.zip && unzip t.zip -d . && rm t.zip
+wget http://images.cocodataset.org/zips/val2017.zip -O t.zip && unzip t.zip -d . && rm t.zip
+wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O t.zip && unzip t.zip -d . && rm t.zip
+```
+
+* Return to the `PaddleClas` root directory.
+
+```shell
+cd ../../
+# convert the training set and generate `COCO2017_labels.txt`
+python3 ./ppcls/utils/create_coco_multilabel_lists.py \
+        --dataset_dir dataset/COCO2017 \
+        --image_dir train2017 \
+        --anno_path annotations/instances_train2017.json \
+        --save_name multilabel_train_list --save_label_name
+# convert the validation set
+python3 ./ppcls/utils/create_coco_multilabel_lists.py \
+        --dataset_dir dataset/COCO2017 \
+        --image_dir val2017 \
+        --anno_path annotations/instances_val2017.json \
+        --save_name multilabel_val_list
+```
+
+
+## 3. Model training
+
+```shell
+# multi-GPU
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    tools/train.py \
+        -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml
+# single GPU
+python3 tools/train.py \
+        -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml
+```
+
+**Note:**
+1. The loss function for multi-label classification currently defaults to `MultiLabelAsymmetricLoss`.
+2. The evaluation metric for multi-label classification currently defaults to `MultiLabelMAP(integral)`.
+
+
+## 4. Model evaluation
+
+```bash
+python3 tools/eval.py \
+    -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml \
+    -o Global.pretrained_model="./output/ResNet101_ml_decoder_448/best_model"
+```
+
+## 5. Model prediction
+
+```bash
+python3 tools/infer.py \
+    -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml \
+    -o Global.pretrained_model="./output/ResNet101_ml_decoder_448/best_model"
+```
+
+The output looks similar to the following:
+```
+[{'class_ids': [0, 2, 7, 24, 25, 26, 33, 56], 'scores': [0.99998, 0.52104, 0.51953, 0.59292, 0.64329, 0.63605, 0.99994, 0.7054], 'label_names': ['person', 'car', 'truck', 'backpack', 'umbrella', 'handbag', 'kite', 'chair'], 'file_name': 'deploy/images/coco_000000570688.jpg'}]
+```
+
+## 6. Prediction with the inference engine
+
+
+### 6.1 Export the inference model
+
+```bash
+python3 tools/export_model.py \
+    -c ./ppcls/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml \
+    -o Global.pretrained_model="./output/ResNet101_ml_decoder_448/best_model"
+```
+By default the inference model is saved to `./inference` under the current path.
+The `./inference` folder should contain the following files:
+
+```
+├── inference
+│   ├── inference.pdiparams
+│   ├── inference.pdiparams.info
+│   └── inference.pdmodel
+```
+
+
+### 6.2 Inference with the Python inference engine
+
+Switch to the `deploy` directory. Before running the scripts in `deploy` for inference, make sure paddleclas is installed as a non-local (non-editable) package; if it is not, switch the installation, otherwise package import errors will occur.
+
+```shell
+# local install
+pip install -e .
+# 非本地安装 +python setup.py install + +# 进入deploy目录下 +cd deploy +``` + + + +#### 6.2.1 预测单张图像 + +运行下面的命令,对图像 `./images/coco_000000570688.jpg` 进行分类。 + +```shell +# linux使用`python3`,windows使用`python (-m)`来执行脚本 +# 使用下面的命令使用 GPU 进行预测 +python3 python/predict_cls.py \ + -c configs/inference_cls_multilabel.yaml \ + -o Global.inference_model_dir=../inference/ \ + -o Global.infer_imgs=images/coco_000000570688.jpg \ + -o PostProcess.MultiLabelThreshOutput.class_id_map_file=../ppcls/utils/COCO2017_label_list.txt +# 使用下面的命令使用 CPU 进行预测 +python3 python/predict_cls.py \ + -c configs/inference_cls_multilabel.yaml \ + -o Global.inference_model_dir=../inference/ \ + -o Global.infer_imgs=images/coco_000000570688.jpg \ + -o PostProcess.MultiLabelThreshOutput.class_id_map_file=../ppcls/utils/COCO2017_label_list.txt \ + -o Global.use_gpu=False +``` + +输出结果如下: + +``` +coco_000000570688.jpg: class id(s): [0, 2, 3, 4, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 58, 60, 61, 62, 63, 64, 65, 67, 69, 70, 71, 72, 73, 75], score(s): [0.84, 0.68, 0.93, 0.54, 0.74, 0.90, 0.56, 0.60, 0.63, 0.77, 0.64, 0.70, 0.94, 0.82, 0.99, 0.71, 0.86, 0.81, 0.81, 0.65, 0.65, 0.92, 0.67, 0.53, 0.83, 0.63, 0.58, 0.52, 0.83, 0.55, 0.92, 0.72, 0.74, 0.59, 0.82, 0.50, 0.62, 0.77, 0.87, 0.64, 0.84, 0.67], label_name(s): ['person', 'car', 'motorcycle', 'airplane', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'donut', 'couch', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'cell phone', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +``` + + + +#### 6.2.2 基于文件夹的批量预测 + +如果希望预测文件夹内的图像,可以直接修改配置文件中的 `Global.infer_imgs` 字段,也可以通过下面的 `-o` 参数修改对应的配置。 + +```shell +# linux使用`python3`,windows使用`python (-m)`来执行脚本 +# 使用下面的命令使用 GPU 进行预测,如果希望使用 CPU 预测,可以在命令后面添加 -o Global.use_gpu=False +python3 python/predict_cls.py \ + -c configs/inference_cls_multilabel.yaml \ + -o Global.inference_model_dir=../inference/ \ + -o PostProcess.MultiLabelThreshOutput.class_id_map_file=../ppcls/utils/COCO2017_label_list.txt \ + -o Global.infer_imgs=images/ImageNet/ +``` + +终端中会输出该文件夹内所有图像的分类结果,如下所示。 + +``` +ILSVRC2012_val_00000010.jpeg: class id(s): [0, 2, 3, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 41, 45, 46, 47, 48, 49, 52, 53, 54, 58, 60, 61, 62, 63, 64, 65, 69, 70, 71, 72, 73, 75], score(s): [0.80, 0.58, 0.89, 0.74, 0.86, 0.66, 0.56, 0.60, 0.81, 0.64, 0.73, 0.94, 0.75, 0.99, 0.70, 0.86, 0.78, 0.63, 0.57, 0.76, 0.66, 0.60, 0.94, 0.65, 0.90, 0.63, 0.52, 0.79, 0.50, 0.93, 0.72, 0.70, 0.60, 0.83, 0.61, 0.75, 0.86, 0.67, 0.87, 0.64], label_name(s): ['person', 'car', 'motorcycle', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'hot dog', 'pizza', 'donut', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +ILSVRC2012_val_00010010.jpeg: class id(s): [0, 2, 3, 6, 7, 8, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75], score(s): [0.78, 0.66, 0.93, 0.54, 0.77, 0.50, 0.86, 0.69, 0.63, 0.50, 0.78, 0.56, 0.71, 0.93, 0.78, 0.99, 0.64, 0.85, 
0.80, 0.53, 0.85, 0.71, 0.66, 0.96, 0.70, 0.62, 0.85, 0.58, 0.57, 0.57, 0.78, 0.50, 0.92, 0.64, 0.73, 0.71, 0.77, 0.53, 0.66, 0.52, 0.73, 0.87, 0.69, 0.85, 0.66], label_name(s): ['person', 'car', 'motorcycle', 'train', 'truck', 'boat', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'donut', 'couch', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +ILSVRC2012_val_00020010.jpeg: class id(s): [0, 2, 3, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 41, 45, 46, 47, 48, 49, 51, 52, 53, 54, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 75], score(s): [0.85, 0.62, 0.94, 0.69, 0.89, 0.56, 0.56, 0.62, 0.77, 0.67, 0.71, 0.93, 0.78, 0.99, 0.65, 0.86, 0.75, 0.57, 0.60, 0.76, 0.66, 0.58, 0.95, 0.73, 0.50, 0.88, 0.63, 0.59, 0.62, 0.84, 0.59, 0.82, 0.74, 0.74, 0.62, 0.86, 0.53, 0.53, 0.60, 0.73, 0.88, 0.65, 0.85, 0.70], label_name(s): ['person', 'car', 'motorcycle', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'donut', 'couch', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +ILSVRC2012_val_00030010.jpeg: class id(s): [0, 2, 3, 4, 7, 9, 21, 22, 23, 24, 25, 27, 28, 29, 30, 33, 38, 39, 40, 41, 45, 46, 47, 48, 49, 51, 52, 53, 58, 60, 61, 62, 63, 64, 65, 69, 70, 71, 72, 73, 75], score(s): [0.82, 0.60, 0.92, 0.54, 0.67, 0.87, 0.57, 0.63, 0.57, 0.84, 0.70, 0.80, 0.92, 0.82, 0.99, 0.72, 0.86, 0.80, 0.59, 0.55, 0.84, 0.73, 0.60, 0.94, 0.75, 0.53, 0.89, 0.51, 0.84, 0.56, 0.90, 0.87, 0.67, 0.70, 0.85, 0.59, 0.82, 0.91, 0.62, 0.89, 0.67], label_name(s): ['person', 'car', 'motorcycle', 'airplane', 'truck', 'traffic light', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'tie', 'suitcase', 'frisbee', 'skis', 'kite', 'tennis racket', 'bottle', 'wine glass', 'cup', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'carrot', 'hot dog', 'pizza', 'potted plant', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'vase'] +``` + + +## 7. 
引用 +``` +@misc{ridnik2021mldecoder, + title={ML-Decoder: Scalable and Versatile Classification Head}, + author={Tal Ridnik and Gilad Sharir and Avi Ben-Cohen and Emanuel Ben-Baruch and Asaf Noy}, + year={2021}, + eprint={2111.12933}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml new file mode 100644 index 000000000..8bafbb23d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet101_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet101 + class_num: 80 + pretrained: True + # use ml-decoder head to replace avg_pool and fc + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + one_dim_param_no_weight_decay: True + weight_decay: 0.0001 # 1e-4 + lr: + name: OneCycleLR + max_learning_rate: 0.0001 # 1e-4 + divide_factor: 25.0 + end_learning_rate: 0.0000000001 # 1e-10 + phase_pct: 0.2 + anneal_strategy: cos + three_phase: False + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: ./dataset/COCO2017/train2017 + cls_label_path: ./dataset/COCO2017/multilabel_train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 56 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: ./dataset/COCO2017/val2017 + cls_label_path: ./dataset/COCO2017/multilabel_val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/coco_000000570688.jpg + batch_size: 10 
+ transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml new file mode 100644 index 000000000..067c5be1c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/MultiLabelCOCO/MLDecoder/ResNet50_ml_decoder_448.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 448, 448] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_multilabel: True + +# model ema +EMA: + decay: 0.9997 + +# mixed precision +AMP: + use_amp: True + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + +# model architecture +Arch: + name: ResNet50 + class_num: 80 + pretrained: True + use_ml_decoder: True + +# ml-decoder head +MLDecoder: + query_num: 80 # default: 80, query_num <= class_num + in_channels: 2048 + +# loss function config for training/eval process +Loss: + Train: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + + Eval: + - MultiLabelAsymmetricLoss: + weight: 1.0 + gamma_pos: 0 + gamma_neg: 4 + clip: 0.05 + disable_focal_loss_grad: True + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 1e-4 + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 1e-10 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - Cutout: + length: 224 + fill_value: none + - RandAugmentV4: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: dataset/coco_ml/images + cls_label_path: dataset/coco_ml/val.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: 
deploy/images/coco_000000570688.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 448 + interpolation: bilinear + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: ppcls/utils/COCO2017_label_list.txt + +Metric: + Train: + Eval: + - MultiLabelMAP: + # support list: integral, 11point + # default: integral + map_type: integral diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..911b8edec --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,139 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..247f655b5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.9 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..4c11802d6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + 
checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..c263f2309 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/PPLCNet_x1_0_search.yaml @@ -0,0 +1,152 
@@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..a75fda4b4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,169 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: 
./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 2e-6 + warmup_epoch: 5 + warmup_start_lr: 2e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/car_exists/ + cls_label_path: ./dataset/car_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/car_exists/objects365_00001507.jpeg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_car + label_1: contains_car + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + max_fpr: 0.01 + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/search.yaml new file mode 100644 index 000000000..820337c02 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/car_exists/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_person_cls +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - 
search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..0f766380f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/clarity_assessment/PPLCNet_x1_0.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 2 + use_last_conv: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.14 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: CustomLabelDataset + image_root: ./dataset/ + sample_list_path: ./dataset/ImageNet_OCR_det.txt + label_key: blur_image + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - BlurImage: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/blur/ + cls_label_path: ./dataset/blur/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./test_img/ + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TopkAcc: + 
topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..8566b0c5c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,137 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: test_images/ + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..02a960116 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0.yaml @@ -0,0 +1,145 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + 
image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: test_imags/ + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..2ba2dcd1b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + 
infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/code_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..b6b627e10 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and 
eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..7600a4361 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,167 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-5 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: 
./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/code_exists/ + cls_label_path: ./dataset/code_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: test_images + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: no_code + label_1: contains_code + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/search.yaml new file mode 100644 index 000000000..585450e5b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/code_exists/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/code_exists/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/code_exists/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_code_exists +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.005, 0.0075, 0.01, 0.015, 0.02] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git 
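
The search.yaml files above drive a hyperparameter sweep by rewriting dotted config paths such as DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob with each candidate value from search_values. The sketch below shows one way such dotted overrides can be applied to a nested config once the YAML is loaded; it is an illustrative stand-in with hypothetical helper names, not the actual ppcls search tool:

import copy

def set_by_path(config, dotted_key, value):
    """Apply an override like 'DataLoader.Train.dataset.transform_ops.1.RandCropImage.size'.
    Integer components index into lists; everything else indexes into dicts."""
    keys = dotted_key.split(".")
    node = config
    for key in keys[:-1]:
        node = node[int(key)] if key.isdigit() else node[key]
    last = keys[-1]
    node[int(last) if last.isdigit() else last] = value
    return config

def expand_search(base_config, search_values, replace_paths):
    """Yield one candidate config per search value (one axis of search_dict)."""
    for value in search_values:
        candidate = copy.deepcopy(base_config)
        for path in replace_paths:
            set_by_path(candidate, path, value)
        yield value, candidate

if __name__ == "__main__":
    cfg = {
        "Optimizer": {"lr": {"learning_rate": 0.01}},
        "DataLoader": {"Train": {"dataset": {"transform_ops": [
            {"DecodeImage": {"to_rgb": True}},
            {"RandCropImage": {"size": 224}},
        ]}}},
    }
    # "lrs" axis: sweep Optimizer.lr.learning_rate over its search_values
    for lr, cand in expand_search(cfg, [0.005, 0.0075, 0.01],
                                  ["Optimizer.lr.learning_rate"]):
        print(lr, cand["Optimizer"]["lr"]["learning_rate"])
    # list-indexed path, as used by the "resolutions" axis
    set_by_path(cfg, "DataLoader.Train.dataset.transform_ops.1.RandCropImage.size", 192)
    print(cfg["DataLoader"]["Train"]["dataset"]["transform_ops"][1])

Axes that list several replace_config paths (for example image size for both RandCropImage and TimmAutoAugment) would pass the same value to every path, which is why those entries are grouped under one search_key.
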
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/image_orientation/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/image_orientation/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..c55ad1a92 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/image_orientation/PPLCNet_x1_0.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 4 + use_last_conv: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.14 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: CustomLabelDataset + image_root: ./dataset/OrientationDataset/ + sample_list_path: ./dataset/OrientationDataset/train_list.txt + label_key: random_rot90_orientation + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + - RandomRot90: + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 12 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/OrientationDataset/ + cls_label_path: ./dataset/OrientationDataset/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: ./test_img/ + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TopkAcc: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..c3973ff42 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + 
eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + start_eval_epoch: 20 + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 10 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..081d8d23f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 80, 160] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 10 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + lr_mult_list : [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + 
regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 1.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..d792c573d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 10 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + lr_mult_list : [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - 
CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 1.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..49a5f1702 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml @@ -0,0 +1,142 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 192] + save_inference_dir: ./inference + start_eval_epoch: 20 + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 10 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: 
./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 32 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..4e1a45a9e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,160 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 10 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-4 + eta_min: 1e-5 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + 
backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/language_classification/ + cls_label_path: ./dataset/language_classification/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/language_classification/word_35404.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/language_classification_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/search.yaml new file mode 100644 index 000000000..a4b3dde56 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/language_classification/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/language_classification/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/language_classification/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_language_classification +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.2, 0.4, 0.8] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + - DataLoader.Eval.dataset.transform_ops.1.ResizeImage.size + search_values: [[192, 48], [180, 60], [160, 80]] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.5, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.5, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list \ No newline at end of 
file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..94b443832 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "MobileNetV3_small_x0_35" + pretrained: True + class_num: 26 + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + #clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..b042ad757 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 
+ eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 26 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.8 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml new file mode 100644 index 000000000..bd6503488 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + 
use_multilabel: True + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 26 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + - DistillationMultiLabelLoss: + weight: 1.0 + weight_ratio: True + model_names: ["Student"] + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.8 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.4 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..8f6b0d7fe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + 
epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 26 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml new file mode 100644 index 000000000..4f7dc273c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 256, 192] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "Res2Net200_vd_26w_4s" + pretrained: True + class_num: 26 + +# loss function config for 
traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - Padv2: + size: [212, 276] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [192, 256] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 256] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..36c3d6aae --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "SwinTransformer_tiny_patch4_window7_224" + pretrained: True + class_num: 26 + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + #clip_norm: 10 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + 
cls_label_path: "dataset/pa100k/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - Padv2: + size: [244, 244] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [224, 224] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/pa100k/" + cls_label_path: "dataset/pa100k/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_attribute/090004.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: PersonAttribute + threshold: 0.5 #default threshold + glasses_threshold: 0.3 #threshold only for glasses + hold_threshold: 0.6 #threshold only for hold + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/search.yaml new file mode 100644 index 000000000..78192d113 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_attribute/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person_attribute/PPLCNet_x1_0_Distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_attr +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0001, 0.005, 0.01, 0.02, 0.05] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size + - DataLoader.Train.dataset.transform_ops.4.RandomCropImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + search_values: [[192, 256]] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.7.RandomErasing.EPSILON + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml new file mode 100644 
index 000000000..9510ec258 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..93e9841d9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: 
PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.9 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..3d3aa3258 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + 
infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.1 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..86c25a05b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 
0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..be10d67b7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,168 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 5e-5 + eta_min: 1e-6 + warmup_epoch: 5 + warmup_start_lr: 1e-7 + + +# data loader for train and eval +DataLoader: + Train: + 
dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/person_exists/ + cls_label_path: ./dataset/person_exists/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/person_exists/objects365_02035329.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: nobody + label_1: someone + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TprAtFpr: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/search.yaml new file mode 100644 index 000000000..820337c02 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/person_exists/search.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person_exists/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_person_cls +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: 
Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..9ef4beb79 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + pretrained: True + class_num: 2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.08 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..4c3c8642d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + 
image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 2 + use_sync_bn : True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.025 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 176 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob : 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size : 176 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON : 0.1 + r1 : 0.3 + sh : 1.0/3.0 + sl : 0.02 + attempt : 10 + use_log_aspect : True + mode : pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..254db5df4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,185 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 1 + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - False + - False + use_sync_bn: True + models: 
+ - Teacher: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + return_stages: True + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + return_stages: True + return_patterns: ["blocks3", "blocks4", "blocks5", "blocks6"] + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationGTCELoss: + weight: 1.0 + key: logits + model_names: ["Student", "Teacher"] + - DistillationDMLLoss: + weight: 1.0 + key: logits + model_name_pairs: + - ["Student", "Teacher"] + - DistillationDistanceLoss: + weight: 1.0 + key: "blocks4" + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.015 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..98f63a613 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used 
for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference +# model architecture +Arch: + name: PPLCNet_x1_0 + pretrained: True + use_ssld: True + class_num: 2 + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.10 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 192 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + prob: 0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 192 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml new file mode 100644 index 000000000..5b987d510 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,137 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: Res2Net200_vd_26w_4s + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.005 + regularizer: + name: 'L2' + coeff: 0.0001 + 
+ +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Train: + - TopkAcc: + topk: [1] + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..5863ee17e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,159 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + pretrained: True + class_num: 2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-5 + eta_min: 1e-7 + warmup_epoch: 5 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + 
interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/safety_helmet/ + cls_label_path: ./dataset/safety_helmet/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/safety_helmet/safety_helmet_test_1.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: wearing_helmet + label_1: unwearing_helmet + +Metric: + Eval: + - TprAtFpr: + max_fpr: 0.0001 + - TopkAcc: + topk: [1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/search.yaml new file mode 100644 index 000000000..e8c1c933d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/safety_helmet/search.yaml @@ -0,0 +1,36 @@ +base_config_file: ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/safety_helmet/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_safety_helmet +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10, 0.11, 0.11, 0.12] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + algorithm: "udml" +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..2c1e9b253 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0.yaml @@ -0,0 +1,133 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 6 + + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/table_attribute/val_3610.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: TableAttribute + source_threshold: 0.5 + number_threshold: 0.5 + color_threshold: 0.5 + clarity_threshold : 0.5 + obstruction_threshold: 0.5 + angle_threshold: 0.5 + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..5b48f4a1f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/table_attribute/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,155 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 6 + # if not null, its 
lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + infer_model_name: "Student" + freeze_params_list: + - True + - False + use_ssld: True + models: + - Teacher: + name: ResNet50_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - DistillationMultiLabelLoss: + weight: 1.0 + model_names: ["Student"] + weight_ratio: True + size_sum: True + - DistillationDMLLoss: + weight: 1.0 + weight_ratio: True + sum_across_class_dim: False + model_name_pairs: + - ["Student", "Teacher"] + + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/table_attribute/" + cls_label_path: "dataset/table_attribute/val_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/table_attribute/val_3253.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [224, 224] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: TableAttribute + source_threshold: 0.5 + number_threshold: 0.5 + color_threshold: 0.5 + clarity_threshold : 0.5 + obstruction_threshold: 0.5 + angle_threshold: 0.5 + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..7eaff9768 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + start_eval_epoch: 40 + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 4 + pretrained: True + +# loss function config for traing/eval process +Loss: + 
Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 1.3 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 16 + use_shared_memory: True + +Infer: + infer_imgs: ddeploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..c8ded908e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 4 + pretrained: True + use_ssld: True + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - 
NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..b8fd0b108 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 4 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.4 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list_for_distill.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: 
rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..0ba788156 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + start_eval_epoch: 40 + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 4 + pretrained: True + use_ssld: True + + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: 
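+ # num_workers: worker subprocesses that prefetch and decode batches in parallel.
+ # use_shared_memory: hand prefetched batches to the trainer through shared memory
+ # instead of serializing them over pipes (faster, but needs enough /dev/shm).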
+ num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..4d123cd4b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 4 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 2.5e-4 + eta_min: 1e-5 + warmup_epoch: 20 + warmup_start_lr: 1e-6 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + 
use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/text_image_orientation/ + cls_label_path: ./dataset/text_image_orientation/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/text_image_orientation/img_rot0_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + interpolation: bicubic + backend: pil + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: ppcls/utils/PULC_label_list/text_image_orientation_label_list.txt + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/search.yaml new file mode 100644 index 000000000..d8e65f5f0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/text_image_orientation/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/text_image_orientation/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_text_image_orientation +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.1, 0.2, 0.4, 0.8] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.4.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.0, 0.3, 0.5, 0.8, 1.0] + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..040868378 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 18 + epochs: 20 + print_batch_step: 10 + use_visualdl: 
False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 2 + pretrained: True + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.13 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..3ab3657d8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0.yaml @@ -0,0 +1,143 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 80, 160] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + 
coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml new file mode 100644 index 000000000..17b9cbb15 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_224x224.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + 
sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..2cc57e637 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 80, 160] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 2 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + use_sync_bn: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.8 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - TimmAutoAugment: + prob: 1.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 
0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [160, 80] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..e9e186377 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/PPLCNet_x1_0_search.yaml @@ -0,0 +1,144 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + start_eval_epoch: 18 + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 192] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 2 + pretrained: True + use_ssld: True + stride_list: [2, [2, 1], [2, 1], [2, 1], [2, 1]] + + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.5 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 16 + use_shared_memory: True + + Eval: + dataset: + 
name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [192, 48] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..a466d5e08 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,164 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 10 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 2 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 1e-4 + eta_min: 2e-6 + warmup_epoch: 5 + warmup_start_lr: 2e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True 
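+ # batch_size above is per device: DistributedBatchSampler shards the dataset across
+ # ranks, so the effective global batch is batch_size times the number of GPUs in use.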
+ loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/textline_orientation/ + cls_label_path: ./dataset/textline_orientation/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/textline_orientation/textline_orientation_test_0_0.png + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 1 + class_id_map_file: ppcls/utils/PULC_label_list/textline_orientation_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 2] + Eval: + - TopkAcc: + topk: [1, 2] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/search.yaml new file mode 100644 index 000000000..4419949bc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/textline_orientation/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/text_direction/PPLCNet_x1_0.yaml +distill_config_file: ppcls/configs/PULC/text_direction/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_text +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.ResizeImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + - DataLoader.Eval.dataset.transform_ops.1.ResizeImage.size + search_values: [[192, 48], [180, 60], [160, 80]] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.4.RandomErasing.EPSILON + search_values: [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/MobileNetV3_samll_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/MobileNetV3_samll_x0_35.yaml new file mode 100644 index 000000000..5ebe7441e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/MobileNetV3_samll_x0_35.yaml @@ -0,0 +1,132 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used 
for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_small_x0_35 + class_num: 232 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..5362d07b7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 0 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 232 + pretrained: True + use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + 
transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.5 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/traffic_sign/99603_17806.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..b00c250e1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,172 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 232 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + pretrained: False + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + + infer_model_name: "Student" + +# loss function config for traing/eval process +Loss: + Train: + - DistillationDMLLoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: 
./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train_for_distillation.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..27fbc4b86 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml @@ -0,0 +1,148 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 0 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 232 + pretrained: True + # use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 
0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/PULC/traffic_sign/99603_17806.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml new file mode 100644 index 000000000..ae86ae622 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/SwinTransformer_tiny_patch4_window7_224.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + start_eval_epoch: 0 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: SwinTransformer_tiny_patch4_window7_224 + class_num: 232 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: AdamW + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-8 + weight_decay: 0.05 + no_weight_decay_name: absolute_pos_embed relative_position_bias_table .bias norm + one_dim_param_no_weight_decay: True + lr: + name: Cosine + learning_rate: 2e-4 + eta_min: 2e-6 + warmup_epoch: 5 + warmup_start_lr: 2e-7 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_train.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + interpolation: bicubic + backend: pil + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - 
RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.8 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/traffic_sign/label_list_test.txt + delimiter: "\t" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/PULC_label_list/traffic_sign_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/search.yaml new file mode 100644 index 000000000..029d042df --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/traffic_sign/search.yaml @@ -0,0 +1,41 @@ +base_config_file: ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/traffic_sign/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_traffic_sign +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.4.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + algorithm: "skl-ugi" + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml new file mode 100644 index 000000000..a35bc6114 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/MobileNetV3_small_x0_35.yaml @@ -0,0 +1,115 @@ +# global configs +Global: + 
checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "MobileNetV3_small_x0_35" + pretrained: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml new file mode 100644 index 000000000..a3369a9ee --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + class_num: 19 + use_ssld: True + lr_mult_list: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: 
"dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: VehicleAttribute + color_threshold: 0.5 + type_threshold: 0.5 + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml new file mode 100644 index 000000000..d098ca81f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_distillation.yaml @@ -0,0 +1,171 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "DistillationModel" + class_num: &class_num 19 + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + infer_model_name: "Student" + freeze_params_list: + - True + - False + use_ssld: True + models: + - Teacher: + name: ResNet101_vd + class_num: *class_num + - Student: + name: PPLCNet_x1_0 + class_num: *class_num + pretrained: True + use_ssld: True + +# loss function config for traing/eval process +Loss: + Train: + - DistillationMultiLabelLoss: + weight: 1.0 + model_names: ["Student"] + weight_ratio: True + size_sum: True + - DistillationDMLLoss: + weight: 1.0 + weight_ratio: True + sum_across_class_dim: False + model_name_pairs: + - ["Student", "Teacher"] + + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + 
coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Infer: + infer_imgs: ./deploy/images/PULC/vehicle_attribute/0002_c002_00030670_0.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: VehicleAttribute + color_threshold: 0.5 + type_threshold: 0.5 + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml new file mode 100644 index 000000000..5f84c2a65 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/PPLCNet_x1_0_search.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "PPLCNet_x1_0" + pretrained: True + use_ssld: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - TimmAutoAugment: + prob: 0.0 + config_str: 
rand-m9-mstd0.5-inc1 + interpolation: bicubic + img_size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.0 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml new file mode 100644 index 000000000..c6618f960 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/Res2Net200_vd_26w_4s.yaml @@ -0,0 +1,122 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/mo" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# mixed precision training +AMP: + scale_loss: 128.0 + use_dynamic_loss_scaling: True + # O1: mixed fp16 + level: O1 + +# model architecture +Arch: + name: "Res2Net200_vd_26w_4s" + pretrained: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/ResNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/ResNet50.yaml new file mode 100644 index 000000000..9218769c6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/ResNet50.yaml @@ -0,0 +1,116 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 20 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 192, 256] + save_inference_dir: "./inference" + use_multilabel: True + +# model architecture +Arch: + name: "ResNet50" + pretrained: True + class_num: 19 + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + Eval: + - MultiLabelLoss: + weight: 1.0 + weight_ratio: True + size_sum: True + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/train_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - Padv2: + size: [276, 212] + pad_mode: 1 + fill_value: 0 + - RandomCropImage: + size: [256, 192] + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + Eval: + dataset: + name: MultiLabelDataset + image_root: "dataset/VeRi/" + cls_label_path: "dataset/VeRi/test_list.txt" + label_ratio: True + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: [256, 192] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + + +Metric: + Eval: + - ATTRMetric: + + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/search.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/search.yaml new file mode 100644 index 000000000..2a16266bf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/PULC/vehicle_attribute/search.yaml @@ -0,0 +1,35 @@ +base_config_file: ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/vehicle_attr/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_vehicle_attr +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.2.TimmAutoAugment.prob + 
search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.7.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + algorithm: "skl-ugi" + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml new file mode 100644 index 000000000..ad77ea950 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/MV3_Large_1x_Aliproduct_DLBHC.yaml @@ -0,0 +1,149 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output_dlbhc/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + #eval_mode: "retrieval" + print_batch_step: 10 + use_visualdl: False + + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + #feature postprocess + feature_normalize: False + feature_binarize: "round" + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "MobileNetV3_large_x1_0" + pretrained: True + class_num: 512 + Head: + name: "FC" + class_num: 50030 + embedding_size: 512 + + infer_output_key: "features" + infer_add_softmax: "false" + +# loss function config for train/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [50, 150] + values: [0.1, 0.01, 0.001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 256 + - RandCropImage: + size: 227 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 227 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 227 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + 
topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + +# switch to metric below when eval by retrieval +# - Recallk: +# topk: [1] +# - mAP: +# - Precisionk: +# topk: [1] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Aliproduct.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Aliproduct.yaml new file mode 100644 index 000000000..70f805647 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Aliproduct.yaml @@ -0,0 +1,119 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: classification + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50_vd + pretrained: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: FC + embedding_size: 512 + class_num: 50030 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + regularizer: + name: 'L2' + coeff: 0.00007 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Inshop.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Inshop.yaml new file mode 100644 index 000000000..18ddfa3a8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_Inshop.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams" + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: 
retrieval + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: ResNet50_vd + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 3997 + margin: 0.15 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.04 + milestones: [30, 60, 70, 80, 90, 100] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/query_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/Inshop/ + cls_label_path: ./dataset/Inshop/gallery_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_SOP.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_SOP.yaml new file mode 100644 index 000000000..7728a6678 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Products/ResNet50_vd_SOP.yaml @@ -0,0 +1,156 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/product_ResNet50_vd_Aliproduct_v1.0_pretrained.pdparams" + output_dir: ./output/ + device: gpu + save_interval: 10 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + Backbone: + name: ResNet50_vd + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 2048 + class_num: 512 + Head: + name: ArcMargin + 
embedding_size: 512 + class_num: 11319 + margin: 0.15 + scale: 30 + infer_output_key: features + infer_add_softmax: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.01 + milestones: [30, 60, 70, 80, 90, 100] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/SOP/ + cls_label_path: ./dataset/SOP/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ResNet50_UReID_infer.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ResNet50_UReID_infer.yaml new file mode 100644 index 000000000..750851d7c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ResNet50_UReID_infer.yaml @@ -0,0 +1,152 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + # pretrained_model: "./pd_model_trace/ISE/ISE_M_model" # pretrained ISE model for Market1501 + # pretrained_model: "./pd_model_trace/ISE/ISE_MS_model" # pretrained ISE model for MSMT17 + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 120 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 128, 256] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "BNNeck" + num_features: 2048 + Head: + name: "FC" + embedding_size: 2048 + class_num: 751 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 
2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" # ["Market1501", "MSMT17"] + image_root: "./dataset" + cls_label_path: "bounding_box_train" + transform_ops: + - ResizeImage: + size: [128, 256] + interpolation: 'bicubic' + backend: 'pil' + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + fill: 0 + - RandomCrop: + size: [128, 256] + pad_if_needed: False + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0.485, 0.456, 0.406] + + sampler: + name: PKSampler + batch_size: 16 + sample_per_id: 4 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" # ["Market1501", "MSMT17"] + image_root: "./dataset" + cls_label_path: "query" + transform_ops: + - ResizeImage: + size: [128, 256] + interpolation: 'bicubic' + backend: 'pil' + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" # ["Market1501", "MSMT17"] + image_root: "./dataset" + cls_label_path: "bounding_box_test" + transform_ops: + - ResizeImage: + size: [128, 256] + interpolation: 'bicubic' + backend: 'pil' + - NormalizeImage: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_base.yml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_base.yml new file mode 100644 index 000000000..9ab15d5c4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_base.yml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 320] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SVTR_base + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - RandFlipImage: + flip_code: 
1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_large.yml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_large.yml new file mode 100644 index 000000000..a3b556e97 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_large.yml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 320] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SVTR_large + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' 
+ sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_tiny.yml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_tiny.yml new file mode 100644 index 000000000..21d6788af --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/SVTR/svtr_tiny.yml @@ -0,0 +1,146 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 48, 320] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + + +# mixed precision +AMP: + use_amp: False + use_fp16_test: False + scale_loss: 128.0 + use_dynamic_loss_scaling: True + use_promote: False + # O1: mixed fp16, O2: pure fp16 + level: O1 + + +# model architecture +Arch: + name: SVTR_tiny + class_num: 1000 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [30, 60, 90] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: False + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: + - 320 + - 48 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - 
TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/StrategySearch/person.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/StrategySearch/person.yaml new file mode 100644 index 000000000..906635595 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/StrategySearch/person.yaml @@ -0,0 +1,40 @@ +base_config_file: ppcls/configs/PULC/person/PPLCNet/PPLCNet_x1_0_search.yaml +distill_config_file: ppcls/configs/PULC/person/Distillation/PPLCNet_x1_0_distillation.yaml + +gpus: 0,1,2,3 +output_dir: output/search_person +search_times: 1 +search_dict: + - search_key: lrs + replace_config: + - Optimizer.lr.learning_rate + search_values: [0.0075, 0.01, 0.0125] + - search_key: resolutions + replace_config: + - DataLoader.Train.dataset.transform_ops.1.RandCropImage.size + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.img_size + search_values: [176, 192, 224] + - search_key: ra_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.3.TimmAutoAugment.prob + search_values: [0.0, 0.1, 0.5] + - search_key: re_probs + replace_config: + - DataLoader.Train.dataset.transform_ops.5.RandomErasing.EPSILON + search_values: [0.0, 0.1, 0.5] + - search_key: lr_mult_list + replace_config: + - Arch.lr_mult_list + search_values: + - [0.0, 0.2, 0.4, 0.6, 0.8, 1.0] + - [0.0, 0.4, 0.4, 0.8, 0.8, 1.0] + - [1.0, 1.0, 1.0, 1.0, 1.0, 1.0] +teacher: + rm_keys: + - Arch.lr_mult_list + search_values: + - ResNet101_vd + - ResNet50_vd +final_replace: + Arch.lr_mult_list: Arch.models.1.Student.lr_mult_list + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/PPLCNet_2.5x_ReID.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/PPLCNet_2.5x_ReID.yaml new file mode 100644 index 000000000..eb9f145a1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/PPLCNet_2.5x_ReID.yaml @@ -0,0 +1,158 @@ +# global configs +# pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_PPLCNet2.5x_VERIWild_v1.0_pretrained.pdparams +# VeriWild v1 small: recall1: 0.93736, recall5: 0.98427, mAP: 0.82125 +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output_reid/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "PPLCNet_x2_5" + pretrained: True + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: "FC" + embedding_size: 1280 + class_num: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + 
channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50.yaml new file mode 100644 index 000000000..6b6172475 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50_ReID.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50_ReID.yaml new file mode 100644 index 000000000..c13d59afd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/Vehicle/ResNet50_ReID.yaml @@ -0,0 +1,155 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: True + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/adaface_ir18.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/adaface_ir18.yaml new file mode 100644 index 000000000..2cbfe5da4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/adaface_ir18.yaml @@ -0,0 +1,105 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 26 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 112, 112] + save_inference_dir: "./inference" + eval_mode: "adaface" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "AdaFace_IR_18" + input_size: [112, 112] + Head: + name: "AdaMargin" + embedding_size: 512 + class_num: 70722 + m: 0.4 + s: 64 + h: 0.333 + t_alpha: 0.01 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.1 + decay_epochs: [12, 20, 24] + values: [0.1, 0.01, 0.001, 0.0001] + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "AdaFaceDataset" + root_dir: "dataset/face/" + label_path: "dataset/face/train_filter_label.txt" + transform: + - CropWithPadding: + prob: 0.2 + padding_num: 0 + size: [112, 112] + scale: [0.2, 1.0] + ratio: [0.75, 1.3333333333333333] + - RandomInterpolationAugment: + prob: 0.2 + - ColorJitter: + prob: 0.2 + brightness: 0.5 + contrast: 0.5 + saturation: 0.5 + hue: 0 + - RandomHorizontalFlip: + - ToTensor: + - Normalize: + mean: [0.5, 0.5, 0.5] + std: [0.5, 0.5, 0.5] + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: FiveValidationDataset + val_data_path: dataset/face/faces_emore + concat_mem_file_name: dataset/face/faces_emore/concat_validation_memfile + sampler: + name: BatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + 
num_workers: 6 + use_shared_memory: True +Metric: + Train: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/xbm_resnet50.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/xbm_resnet50.yaml new file mode 100644 index 000000000..44dcfe065 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/metric_learning/xbm_resnet50.yaml @@ -0,0 +1,170 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 35 + iter_per_epoch: &iter_per_epoch 1000 + print_batch_step: 20 + use_visualdl: False + eval_mode: retrieval + retrieval_feature_from: features # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + Backbone: + name: ResNet50_adaptive_max_pool2d + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: flatten + Neck: + name: FC + embedding_size: 2048 + class_num: &feat_dim 128 + weight_attr: + initializer: + name: KaimingNormal + fan_in: *feat_dim + negative_slope: 0.0 + nonlinearity: leaky_relu + bias_attr: + initializer: + name: Constant + value: 0.0 + +# loss function config for traing/eval process +Loss: + Train: + - ContrastiveLoss_XBM: + weight: 1.0 + xbm_size: 55000 + xbm_weight: 1.0 + start_iter: 1000 + margin: 0.5 + embedding_size: *feat_dim + epsilon: 1.0e-5 + normalize_feature: True + feature_from: features + Eval: + - ContrastiveLoss: + weight: 1.0 + margin: 0.5 + embedding_size: *feat_dim + normalize_feature: True + epsilon: 1.0e-5 + feature_from: features + +Optimizer: + name: Adam + lr: + name: ReduceOnPlateau + learning_rate: 0.0001 + mode: max + factor: 0.1 + patience: 4 + threshold: 0.001 + threshold_mode: rel + cooldown: 2 + min_lr: 0.000005 + epsilon: 1e-8 + by_epoch: True + regularizer: + name: L2 + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/SOP + cls_label_path: ./dataset/SOP/train_list.txt + backend: pil + transform_ops: + - Resize: + size: 256 + - RandomResizedCrop: + scale: [0.2, 1] + size: 224 + - RandomHorizontalFlip: + prob: 0.5 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + max_iters: *iter_per_epoch + loader: + num_workers: 8 + use_shared_memory: True + Eval: + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/SOP + cls_label_path: ./dataset/SOP/test_list.txt + backend: pil + transform_ops: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Query: + dataset: + name: VeriWild + image_root: ./dataset/SOP + cls_label_path: ./dataset/SOP/test_list.txt + backend: pil + transform_ops: + - Resize: + size: 256 + - CenterCrop: + size: 224 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: 
[0.229, 0.224, 0.225]
+      sampler:
+        name: DistributedBatchSampler
+        batch_size: 256
+        drop_last: False
+        shuffle: False
+      loader:
+        num_workers: 8
+        use_shared_memory: True
+
+Metric:
+  Eval:
+    - Recallk:
+        topk: [1, 5]
+    - mAP: {}
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/multi_scale/MobileNetV1_multi_scale.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/multi_scale/MobileNetV1_multi_scale.yaml
new file mode 100644
index 000000000..530e75075
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/multi_scale/MobileNetV1_multi_scale.yaml
@@ -0,0 +1,138 @@
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: ./output/
+  device: gpu
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  # used for static mode and model export
+  image_shape: [3, 224, 224]
+  save_inference_dir: ./inference
+  # training model under @to_static
+  to_static: False
+
+# model architecture
+Arch:
+  name: MobileNetV1
+  class_num: 1000
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.00003
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    dataset:
+      name: MultiScaleDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/train_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+
+    # support to specify width and height respectively:
+    # scales: [(160,160), (192,192), (224,224), (288,288), (320,320)]
+    sampler:
+      name: MultiScaleSampler
+      scales: [160, 192, 224, 288, 320]
+      # first_bs: batch size for the first image resolution in the scales list
+      # divided_factor: to ensure the width and height dimensions can be divided by the downsampling multiple
+      first_bs: 64
+      divided_factor: 32
+      is_training: True
+
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+  Eval:
+    dataset:
+      name: ImageNetDataset
+      image_root: ./dataset/ILSVRC2012/
+      cls_label_path: ./dataset/ILSVRC2012/val_list.txt
+      transform_ops:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+    sampler:
+      name: DistributedBatchSampler
+      batch_size: 64
+      drop_last: False
+      shuffle: False
+    loader:
+      num_workers: 4
+      use_shared_memory: True
+
+Infer:
+  infer_imgs: docs/images/whl/demo.jpg
+  batch_size: 10
+  transforms:
+    - DecodeImage:
+        to_rgb: True
+        channel_first: False
+    - ResizeImage:
+        resize_short: 256
+    - CropImage:
+        size: 224
+    - NormalizeImage:
+        scale: 1.0/255.0
+        mean: [0.485, 0.456, 0.406]
+        std: [0.229, 0.224, 0.225]
+        order: ''
+    - ToCHWImage:
+  PostProcess:
+    name: Topk
+    topk: 5
+    class_id_map_file: ppcls/utils/imagenet1k_label_list.txt
+
+Metric:
+  Train:
+    - TopkAcc:
+        topk: [1, 5]
+  Eval:
+    - TopkAcc:
+        topk: [1, 5]
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/.gitkeep b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/.gitkeep
new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/.gitkeep @@ -0,0 +1 @@ + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml new file mode 100644 index 000000000..8d0a72371 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/CLIP_large_patch14_224_aesthetic.yaml @@ -0,0 +1,78 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: CLIP_large_patch14_224_aesthetic + pretrained: True + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: deploy/images/practical/aesthetic_score_predictor/Highscore.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ScoreOutput + decimal_places: 2 + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/EfficientNetB3_watermark.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/EfficientNetB3_watermark.yaml new file mode 100644 index 000000000..2fd142ae7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/EfficientNetB3_watermark.yaml @@ -0,0 +1,81 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: EfficientNetB3_watermark + pretrained: True + class_num: 2 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + 
drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: False + +Infer: + infer_imgs: deploy/images/practical/watermark_exists/watermark_example.png + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: ThreshOutput + threshold: 0.5 + label_0: contains_watermark + label_1: no_watermark + +Metric: + Eval: + - TopkAcc: + topk: [1, 2] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml new file mode 100644 index 000000000..c6415cd47 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/practical_models/PPHGNet_tiny_calling_halfbody.yaml @@ -0,0 +1,150 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + # training model under @to_static + to_static: False + use_dali: False + +# model architecture +Arch: + name: PPHGNet_tiny + class_num: 2 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.05 + warmup_epoch: 3 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/phone_train_list_halfbody.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - TimmAutoAugment: + config_str: rand-m7-mstd0.5-inc1 + interpolation: bicubic + img_size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.25 + sl: 0.02 + sh: 1.0/3.0 + r1: 0.3 + attempt: 10 + use_log_aspect: True + mode: pixel + batch_transform_ops: + - OpSampler: + MixupOperator: + alpha: 0.2 + prob: 0.5 + CutmixOperator: + alpha: 1.0 + prob: 0.5 + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 2 + use_shared_memory: False + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/phone_val_list_halfbody.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + interpolation: bicubic + backend: pil + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 2 + use_shared_memory: False + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 1 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 
0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 2 + class_id_map_file: dataset/phone_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 1] + Eval: + - TopkAcc: + topk: [1, 1] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV1_retrieval.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV1_retrieval.yaml new file mode 100644 index 000000000..bac477392 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV1_retrieval.yaml @@ -0,0 +1,157 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 5 + eval_during_train: True + eval_interval: 1 + epochs: 50 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: MobileNetV1 + pretrained: False + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1024 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 101 + margin: 0.15 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.5 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: MultiStepDecay + learning_rate: 0.01 + milestones: [20, 30, 40] + gamma: 0.5 + verbose: False + last_epoch: -1 + regularizer: + name: "L2" + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] 
+ sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 2 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/CUB_200_2011/ + cls_label_path: ./dataset/CUB_200_2011/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV3_large_x1_0.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV3_large_x1_0.yaml new file mode 100644 index 000000000..d87dc0991 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/MobileNetV3_large_x1_0.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.00375 + warmup_epoch: 5 + last_epoch: -1 + regularizer: + name: 'L2' + coeff: 0.000001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: 
+ - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/ResNet50_vd.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/ResNet50_vd.yaml new file mode 100644 index 000000000..90b2c88d6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/ResNet50_vd.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 32 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml new file mode 100644 index 000000000..6a461ccfa --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/HRNet_W18_C_finetune_kunlun.yaml @@ -0,0 
+1,68 @@ +mode: 'train' +ARCHITECTURE: + name: 'HRNet_W18_C' +pretrained_model: "./pretrained/HRNet_W18_C_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 10 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.00375 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml new file mode 100644 index 000000000..7fad5eebe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/ResNet50_vd_finetune_kunlun.yaml @@ -0,0 +1,69 @@ +mode: 'train' +ARCHITECTURE: + name: 'ResNet50_vd' +pretrained_model: "./pretrained/ResNet50_vd_pretrained" +load_static_weights: true +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.00375 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000001 + +TRAIN: + batch_size: 20 + num_workers: 1 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 1 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml new file mode 100644 index 000000000..389a5f35f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG16_finetune_kunlun.yaml @@ -0,0 +1,70 @@ +mode: 'train' +ARCHITECTURE: + name: 'VGG16' + params: + stop_grad_layers: 5 +pretrained_model: "./pretrained/VGG16_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.0005 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.00001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml new file mode 100644 index 000000000..6ba38b974 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/kunlun/VGG19_finetune_kunlun.yaml @@ -0,0 +1,70 @@ +mode: 'train' +ARCHITECTURE: + name: 'VGG19' + params: + stop_grad_layers: 5 +pretrained_model: "./pretrained/VGG19_pretrained" +model_save_dir: "./output/" +classes_num: 102 +total_images: 1020 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 20 +topk: 5 +image_shape: [3, 224, 224] + +LEARNING_RATE: + function: 'Cosine' + params: + lr: 0.0005 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.00001 + +TRAIN: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/train_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 0 + file_list: "./dataset/flowers102/val_list.txt" + data_dir: "./dataset/flowers102/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml new file mode 100644 index 000000000..124636690 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/new_user/ShuffleNetV2_x0_25.yaml @@ -0,0 +1,129 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: cpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 20 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ShuffleNetV2_x0_25 + class_num: 102 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.0125 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/flowers102/ + cls_label_path: ./dataset/flowers102/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ./dataset/flowers102/flowers102_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV1_multilabel.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV1_multilabel.yaml new file mode 100644 index 000000000..7c64ae3b3 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV1_multilabel.yaml @@ -0,0 +1,130 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 10 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + use_multilabel: True +# model architecture +Arch: + name: MobileNetV1 + class_num: 33 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - MultiLabelLoss: + weight: 1.0 + Eval: + - MultiLabelLoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00004 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: MultiLabelDataset + image_root: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/images/ + cls_label_path: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/multilabel_train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: MultiLabelDataset + image_root: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/images/ + cls_label_path: ./dataset/NUS-WIDE-SCENE/NUS-SCENE-dataset/multilabel_test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: deploy/images/0517_2715693311.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: MultiLabelThreshOutput + threshold: 0.5 + class_id_map_file: "ppcls/utils/NUS-WIDE-SCENE_label_list.txt" + delimiter: " " + +Metric: + Train: + - AccuracyScore: + - HammingDistance: + Eval: + - AccuracyScore: + - HammingDistance: diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml new file mode 100644 index 000000000..423a45389 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/MobileNetV3_large_x1_0_CIFAR100_finetune.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: 
MobileNetV3_large_x1_0 + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml new file mode 100644 index 000000000..a27068d16 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/R50_vd_distill_MV3_large_x1_0_CIFAR100.yaml @@ -0,0 +1,151 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "DistillationModel" + # if not null, its lengths should be same as models + pretrained_list: + # if not null, its lengths should be same as models + freeze_params_list: + - True + - False + models: + - Teacher: + name: ResNet50_vd + class_num: 100 + pretrained: "./pretrained/best_model" + - Student: + name: MobileNetV3_large_x1_0 + class_num: 100 + pretrained: True + + infer_model_name: "Student" + + +# loss function config for traing/eval process +Loss: + Train: + - DistillationCELoss: + weight: 1.0 + model_name_pairs: + - ["Student", "Teacher"] + Eval: + - DistillationGTCELoss: + weight: 1.0 + model_names: ["Student"] + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval 
+DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: "./dataset/CIFAR100/" + cls_label_path: "./dataset/CIFAR100/train_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: "./dataset/CIFAR100/" + cls_label_path: "./dataset/CIFAR100/test_list.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Infer: + infer_imgs: "docs/images/inference_deployment/whl_demo.jpg" + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: DistillationPostProcess + func: Topk + topk: 5 + +Metric: + Train: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] + Eval: + - DistillationTopkAcc: + model_key: "Student" + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml new file mode 100644 index 000000000..10c326a40 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_CIFAR100.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + 
- CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml new file mode 100644 index 000000000..d8ff817f8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/ResNet50_vd_mixup_CIFAR100_finetune.yaml @@ -0,0 +1,127 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 100 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +# model architecture +Arch: + name: ResNet50_vd + class_num: 100 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.04 + regularizer: + name: 'L2' + coeff: 0.0001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 32 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/CIFAR100/ + cls_label_path: ./dataset/CIFAR100/test_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 36 + - CropImage: + size: 32 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml new file mode 100644 index 000000000..97228828a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/quick_start/professional/VGG19_CIFAR10_DeepHash.yaml @@ -0,0 +1,147 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + eval_mode: "retrieval" + epochs: 128 + print_batch_step: 10 + use_visualdl: False + + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + + #feature postprocess + feature_normalize: False + feature_binarize: "round" + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "VGG19Sigmoid" + pretrained: True + class_num: 48 + Head: + name: "FC" + class_num: 10 + embedding_size: 48 + + infer_output_key: "features" + infer_add_softmax: "false" + +# loss function config for train/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Piecewise + learning_rate: 0.01 + decay_epochs: [200] + values: [0.01, 0.001] + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/train.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 256 + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/test.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: ImageNetDataset + image_root: ./dataset/cifar10/ + cls_label_path: ./dataset/cifar10/cifar10-2/database.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2023, 0.1994, 0.2010] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 512 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - mAP: + - Precisionk: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/MetaBIN_ResNet50_cross_domain.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/MetaBIN_ResNet50_cross_domain.yaml new file mode 100644 index 000000000..19c9adafc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/MetaBIN_ResNet50_cross_domain.yaml @@ -0,0 +1,277 @@ +# global configs +Global: + checkpoints: null 
+ pretrained_model: null + output_dir: "./output/" + device: "gpu" + iter_per_epoch: &iter_per_epoch 50 + warmup_iter: 10 + save_interval: 8 + eval_during_train: True + eval_interval: 8 + epochs: &epochs 348 # 348*50 = 120*145 = 17400 iters + print_batch_step: 25 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "features" # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + train_mode: "metabin" + +AMP: + scale_loss: 65536 + use_dynamic_loss_scaling: True + +# model architecture +Arch: + name: "RecModel" + Backbone: + name: "ResNet50_metabin" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + bias_lr_factor: 2.0 + BackboneStopLayer: + name: "flatten" + Neck: + name: MetaBNNeck + num_features: &feat_dim 2048 + use_global_stats: True + Head: + name: "FC" + embedding_size: *feat_dim + class_num: 751 + weight_attr: + initializer: + name: KaimingUniform + negative_slope: 2.23606 # math.sqrt(5) + nonlinearity: "leaky_relu" + bias_attr: False + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ColorJitter: + brightness: 0.15 + contrast: 0.15 + saturation: 0.1 + hue: 0.1 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: NaiveIdentityBatchSampler + batch_size: 96 + num_instances: 4 + drop_last: True + loader: + num_workers: 4 + use_shared_memory: True + + Metalearning: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ColorJitter: + brightness: 0.15 + contrast: 0.15 + saturation: 0.1 + hue: 0.1 + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DomainShuffleBatchSampler + batch_size: 96 + num_instances: 4 + drop_last: True + camera_to_domain: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: "DukeMTMC" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "DukeMTMC" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bicubic" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +# loss function config for 
traing/eval process +Loss: + Train: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + - IntraDomainScatterLoss: + weight: 1.0 + normalize_feature: True + feature_from: "backbone" + - InterDomainShuffleLoss: + weight: 1.0 + normalize_feature: False + feature_from: "backbone" + Basic: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + MetaTrain: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + - IntraDomainScatterLoss: + weight: 1.0 + normalize_feature: True + feature_from: "backbone" + - InterDomainShuffleLoss: + weight: 1.0 + normalize_feature: False + feature_from: "backbone" + MetaTest: + - CELossForMetaBIN: + weight: 1.0 + epsilon: 0.1 + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + Eval: + - TripletLossForMetaBIN: + weight: 1.0 + margin: 0.3 + feature_from: "backbone" + +Optimizer: + - Momentum: + scope: ".*(conv|batch_norm|instance_norm|feat_bn|fc)" + lr: + name: MultiStepDecay + epochs: *epochs + learning_rate: 0.01 + step_each_epoch: *iter_per_epoch + milestones: [145, 261] + gamma: 0.1 + warmup_epoch: 29 + warmup_start_lr: 0.0001 + by_epoch: False + last_epoch: -1 + momentum: 0.9 + regularizer: + name: "L2" + coeff: 0.0005 + - SGD: + scope: "backbone.*gate" + lr: + name: Constant + learning_rate: 0.2 + last_epoch: -1 + - SGD: + scope: "RecModel" + lr: + name: Cyclic + epochs: *epochs + step_each_epoch: *iter_per_epoch + base_learning_rate: 0.001 + max_learning_rate: 0.1 + warmup_epoch: 0 + warmup_start_lr: 1 + step_size_up: 1095 + step_size_down: 1095 + by_epoch: False + last_epoch: 0 + +Metric: + Eval: + - Recallk: + topk: [1, 5, 10] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/baseline.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/baseline.yaml new file mode 100644 index 000000000..5c83b8da8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/baseline.yaml @@ -0,0 +1,158 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 40 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 20 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "backbone" # 'backbone' or 'neck' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: "flatten" + Head: + name: "FC" + embedding_size: 2048 + class_num: 751 + weight_attr: + initializer: + name: KaimingUniform + fan_in: 12288 # 6*embedding_size + bias_attr: + initializer: + name: KaimingUniform + fan_in: 12288 # 6*embedding_size + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - TripletLossV2: + weight: 1.0 + margin: 0.3 + normalize_feature: False + feature_from: "backbone" + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Adam + lr: + name: Piecewise + decay_epochs: [40, 70] 
+ values: [0.00035, 0.000035, 0.0000035] + by_epoch: True + last_epoch: 0 + regularizer: + name: "L2" + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet.yaml new file mode 100644 index 000000000..43f1de62f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 40 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 20 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "features" # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: "flatten" + Neck: + name: BNNeck + num_features: &feat_dim 2048 + weight_attr: + initializer: + name: Constant + value: 1.0 + bias_attr: + initializer: + name: Constant + value: 0.0 + learning_rate: 1.0e-20 # NOTE: Temporarily set lr small enough to freeze the bias to zero + Head: + name: "FC" + embedding_size: *feat_dim + class_num: 751 + weight_attr: + initializer: + name: Normal + std: 0.001 + bias_attr: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + - TripletLossV2: + weight: 1.0 + margin: 0.3 + 
normalize_feature: False + feature_from: "backbone" + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Adam + lr: + name: Piecewise + decay_epochs: [30, 60] + values: [0.00035, 0.000035, 0.0000035] + warmup_epoch: 10 + warmup_start_lr: 0.0000035 + by_epoch: True + last_epoch: 0 + regularizer: + name: "L2" + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0.485, 0.456, 0.406] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet_with_center.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet_with_center.yaml new file mode 100644 index 000000000..70c70a99b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/reid/strong_baseline/softmax_triplet_with_center.yaml @@ -0,0 +1,187 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: "./output/" + device: "gpu" + save_interval: 40 + eval_during_train: True + eval_interval: 10 + epochs: 120 + print_batch_step: 20 + use_visualdl: False + eval_mode: "retrieval" + retrieval_feature_from: "features" # 'backbone' or 'features' + re_ranking: False + # used for static mode and model export + image_shape: [3, 256, 128] + save_inference_dir: "./inference" + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/resnet50-19c8e357_torch2paddle.pdparams + stem_act: null + BackboneStopLayer: + name: "flatten" + Neck: + name: BNNeck + num_features: &feat_dim 2048 + weight_attr: + initializer: + name: Constant + value: 1.0 + bias_attr: + initializer: + name: Constant + value: 0.0 + learning_rate: 1.0e-20 # NOTE: Temporarily set lr small 
enough to freeze the bias to zero + Head: + name: "FC" + embedding_size: *feat_dim + class_num: &class_num 751 + weight_attr: + initializer: + name: Normal + std: 0.001 + bias_attr: False + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + - TripletLossV2: + weight: 1.0 + margin: 0.3 + normalize_feature: False + feature_from: "backbone" + - CenterLoss: + weight: 0.0005 + num_classes: *class_num + feat_dim: *feat_dim + feature_from: "backbone" + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + - Adam: + scope: RecModel + lr: + name: Piecewise + decay_epochs: [30, 60] + values: [0.00035, 0.000035, 0.0000035] + warmup_epoch: 10 + warmup_start_lr: 0.0000035 + by_epoch: True + last_epoch: 0 + regularizer: + name: "L2" + coeff: 0.0005 + - SGD: + scope: CenterLoss + lr: + name: ConstLR + learning_rate: 1000.0 # NOTE: set to ori_lr*(1/centerloss_weight) to avoid manually scaling centers' gradidents. + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_train" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - RandFlipImage: + flip_code: 1 + - Pad: + padding: 10 + - RandCropImageV2: + size: [128, 256] + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0.485, 0.456, 0.406] + sampler: + name: DistributedRandomIdentitySampler + batch_size: 64 + num_instances: 4 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + Eval: + Query: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "query" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: "Market1501" + image_root: "./dataset/" + cls_label_path: "bounding_box_test" + backend: "pil" + transform_ops: + - ResizeImage: + size: [128, 256] + return_numpy: False + interpolation: "bilinear" + backend: "pil" + - ToTensor: + - Normalize: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml new file mode 100644 index 000000000..7b21d0ba8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/GeneralRecognition_PPLCNet_x2_5_quantization.yaml @@ -0,0 +1,154 @@ +# global configs +Global: + checkpoints: null + pretrained_model: https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/general_PPLCNet_x2_5_pretrained_v1.0.pdparams + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and 
model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + eval_mode: retrieval + use_dali: False + to_static: False + +# for quantizaiton or prune model +Slim: + ## for prune + quant: + name: pact + +# model architecture +Arch: + name: RecModel + infer_output_key: features + infer_add_softmax: False + + Backbone: + name: PPLCNet_x2_5 + pretrained: False + use_ssld: True + BackboneStopLayer: + name: "flatten" + Neck: + name: FC + embedding_size: 1280 + class_num: 512 + Head: + name: ArcMargin + embedding_size: 512 + class_num: 185341 + margin: 0.2 + scale: 30 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.002 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00001 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ + cls_label_path: ./dataset/train_reg_all_data.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + Query: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + + Gallery: + dataset: + name: VeriWild + image_root: ./dataset/Aliproduct/ + cls_label_path: ./dataset/Aliproduct/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_prune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_prune.yaml new file mode 100644 index 000000000..6655c3a0a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_prune.yaml @@ -0,0 +1,139 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 360 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: 
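Editor's note: the CELoss entries with epsilon: 0.1 used throughout these configs are label-smoothed cross entropy. A minimal sketch of that computation (hypothetical helper, not the ppcls CELoss itself):

    import paddle
    import paddle.nn.functional as F

    def label_smoothed_ce(logits, labels, epsilon=0.1):
        # soft targets: (1 - eps) on the true class, eps spread uniformly over all classes
        num_classes = logits.shape[-1]
        one_hot = F.one_hot(labels, num_classes=num_classes)
        soft = (1.0 - epsilon) * one_hot + epsilon / num_classes
        log_prob = F.log_softmax(logits, axis=-1)
        return -(soft * log_prob).sum(axis=-1).mean()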
+ name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.65 + warmup_epoch: 5 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_quantization.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_quantization.yaml new file mode 100644 index 000000000..517c4677c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/MobileNetV3_large_x1_0_quantization.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: MobileNetV3_large_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.065 + warmup_epoch: 0 + regularizer: + name: 'L2' + coeff: 0.00002 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 256 + drop_last: 
False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/PPLCNet_x1_0_quantization.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/PPLCNet_x1_0_quantization.yaml new file mode 100644 index 000000000..40111a036 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/PPLCNet_x1_0_quantization.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 60 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: PPLCNet_x1_0 + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.02 + warmup_epoch: 0 + regularizer: + name: 'L2' + coeff: 0.00003 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AutoAugment: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: 
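Editor's note: each Infer section in these configs ends with a Topk PostProcess (topk: 5) that turns logits into the five most likely classes; the class_id_map_file then maps the ids to label names. A hedged sketch of the score/id part of that step (not the ppcls postprocess class):

    import paddle
    import paddle.nn.functional as F

    def topk_postprocess(logits, k=5):
        # softmax probabilities, then the k best class ids and scores per image
        probs = F.softmax(logits, axis=-1)
        scores, class_ids = paddle.topk(probs, k=k, axis=-1)
        return class_ids.numpy(), scores.numpy()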
+ - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_prune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_prune.yaml new file mode 100644 index 000000000..7bfc537b1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_prune.yaml @@ -0,0 +1,138 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 200 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantization or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.1 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_quantization.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_quantization.yaml new file mode 100644 index 000000000..f9db41020 --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vd_quantization.yaml @@ -0,0 +1,137 @@ +# global configs +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output/ + device: gpu + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 30 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +# for quantalization or prune model +Slim: + ## for quantization + quant: + name: pact + +# model architecture +Arch: + name: ResNet50_vd + class_num: 1000 + pretrained: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + epsilon: 0.1 + Eval: + - CELoss: + weight: 1.0 + + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.00007 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/train_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + batch_transform_ops: + - MixupOperator: + alpha: 0.2 + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + Eval: + dataset: + name: ImageNetDataset + image_root: ./dataset/ILSVRC2012/ + cls_label_path: ./dataset/ILSVRC2012/val_list.txt + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 4 + use_shared_memory: True + +Infer: + infer_imgs: docs/images/inference_deployment/whl_demo.jpg + batch_size: 10 + transforms: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + PostProcess: + name: Topk + topk: 5 + class_id_map_file: ppcls/utils/imagenet1k_label_list.txt + +Metric: + Train: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_prune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_prune.yaml new file mode 100644 index 000000000..1f6fea887 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_prune.yaml @@ -0,0 +1,135 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_cls_ResNet50_CompCars_v1.2_pretrained.pdparams" + output_dir: "./output_vehicle_cls_prune/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +Slim: + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: "RecModel" + 
infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_quantization.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_quantization.yaml new file mode 100644 index 000000000..026b86547 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_cls_quantization.yaml @@ -0,0 +1,134 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_cls_ResNet50_CompCars_v1.2_pretrained.pdparams" + output_dir: "./output_vehicle_cls_pact/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 80 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + +Slim: + quant: + name: pact + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 431 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.001 
+ regularizer: + name: 'L2' + coeff: 0.0005 + + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + bbox_crop: True + cls_label_path: "./dataset/CompCars/train_test_split/classification/train_label.txt" + transform_ops: + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + Eval: + dataset: + name: "CompCars" + image_root: "./dataset/CompCars/image/" + label_root: "./dataset/CompCars/label/" + cls_label_path: "./dataset/CompCars/train_test_split/classification/test_label.txt" + bbox_crop: True + transform_ops: + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 128 + drop_last: False + shuffle: False + loader: + num_workers: 8 + use_shared_memory: True + +Metric: + Train: + - TopkAcc: + topk: [1, 5] + Eval: + - TopkAcc: + topk: [1, 5] + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_prune.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_prune.yaml new file mode 100644 index 000000000..63b87f1ca --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_prune.yaml @@ -0,0 +1,162 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_ResNet50_VERIWild_v1.1_pretrained.pdparams" + output_dir: "./output_vehicle_reid_prune/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 160 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# for quantizaiton or prune model +Slim: + ## for prune + prune: + name: fpgm + pruned_ratio: 0.3 + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.01 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 
0.225] + order: '' + - RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 128 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_quantization.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_quantization.yaml new file mode 100644 index 000000000..cca9915e2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/slim/ResNet50_vehicle_reid_quantization.yaml @@ -0,0 +1,161 @@ +# global configs +Global: + checkpoints: null + pretrained_model: "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/rec/models/pretrain/vehicle_reid_ResNet50_VERIWild_v1.1_pretrained.pdparams" + output_dir: "./output_vehicle_reid_pact/" + device: "gpu" + save_interval: 1 + eval_during_train: True + eval_interval: 1 + epochs: 40 + print_batch_step: 10 + use_visualdl: False + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: "./inference" + eval_mode: "retrieval" + +# for quantizaiton or prune model +Slim: + ## for prune + quant: + name: pact + +# model architecture +Arch: + name: "RecModel" + infer_output_key: "features" + infer_add_softmax: False + Backbone: + name: "ResNet50_last_stage_stride1" + pretrained: True + BackboneStopLayer: + name: "avg_pool" + Neck: + name: "VehicleNeck" + in_channels: 2048 + out_channels: 512 + Head: + name: "ArcMargin" + embedding_size: 512 + class_num: 30671 + margin: 0.15 + scale: 32 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + - SupConLoss: + weight: 1.0 + views: 2 + Eval: + - CELoss: + weight: 1.0 + +Optimizer: + name: Momentum + momentum: 0.9 + lr: + name: Cosine + learning_rate: 0.001 + regularizer: + name: 'L2' + coeff: 0.0005 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images/" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/train_list_start0.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - AugMix: + prob: 0.5 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - 
RandomErasing: + EPSILON: 0.5 + sl: 0.02 + sh: 0.4 + r1: 0.3 + mean: [0., 0., 0.] + + sampler: + name: PKSampler + batch_size: 64 + sample_per_id: 2 + drop_last: False + shuffle: True + loader: + num_workers: 6 + use_shared_memory: True + Eval: + Query: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id_query.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + + Gallery: + dataset: + name: "VeriWild" + image_root: "./dataset/VeRI-Wild/images" + cls_label_path: "./dataset/VeRI-Wild/train_test_split/test_3000_id.txt" + transform_ops: + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: False + loader: + num_workers: 6 + use_shared_memory: True + +Metric: + Eval: + - Recallk: + topk: [1, 5] + - mAP: {} + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml new file mode 100644 index 000000000..a2382817d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar100_10000_4gpu.yaml @@ -0,0 +1,209 @@ +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: true + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: false + use_dali: false + train_mode: fixmatch_ccssl + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +SSL: + T: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +Arch: + name: RecModel + infer_output_key: logits + infer_add_softmax: false + head_feature_from: backbone + Backbone: + name: WideResNet + widen_factor: 8 + depth: 28 + dropout: 0 + num_classes: 100 + low_dim: 64 + proj: false + proj_after: false + BackboneStopLayer: + name: bn1 + Neck: + name: FRFNNeck + num_features: 512 + low_dim: 64 + Head: + name: FC + embedding_size: 512 + class_num: 100 + + use_sync_bn: true + +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 + +UnLabelLoss: + Train: + - CCSSLCELoss: + weight: 1. 
+ - SoftSupConLoss: + weight: 1.0 + temperature: 0.07 + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: true + weight_decay: 0.001 + lr: + name: 'CosineFixmatch' + learning_rate: 0.03 + num_warmup_steps: 0 + +DataLoader: + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + Train: + dataset: + name: Cifar100 + data_file: null + mode: 'train' + download: true + backend: 'pil' + sample_per_label: 100 + expand_labels: 1 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + UnLabelTrain: + dataset: + name: Cifar100 + data_file: null + mode: 'train' + backend: 'pil' + download: true + + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + + transform_ops_strong2: + - RandomResizedCrop: + size: [32, 32] + - RandFlipImage: + flip_code: 1 + - RandomApply: + transforms: + - RawColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.1 + p: 1.0 # refer to official settings + - RandomGrayscale: + p: 0.2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0., 0., 0.] + std: [1., 1., 1.] 
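Editor's note: SoftSupConLoss above (weight 1.0, temperature 0.07) is a supervised-contrastive term computed on the extra strongly augmented view; in CCSSL it is additionally weighted by pseudo-label confidence. Purely as an illustration, an unweighted supervised contrastive loss with the same temperature could look like this (assumed helper, not the ppcls code):

    import paddle
    import paddle.nn.functional as F

    def sup_con_loss(features, labels, temperature=0.07):
        feats = F.normalize(features, axis=1)
        sim = paddle.matmul(feats, feats, transpose_y=True) / temperature
        not_self = 1.0 - paddle.eye(feats.shape[0])
        pos = (labels.unsqueeze(1) == labels.unsqueeze(0)).astype('float32') * not_self
        # log-probability of each pair against all non-self pairs (stabilised)
        logits = sim - sim.max(axis=1, keepdim=True)
        log_prob = logits - paddle.log((paddle.exp(logits) * not_self).sum(axis=1, keepdim=True))
        # mean log-likelihood of positive pairs per anchor, averaged over anchors
        return -((pos * log_prob).sum(axis=1) / pos.sum(axis=1).clip(min=1.0)).mean()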
+ order: hwc + + + sampler: + name: DistributedBatchSampler + batch_size: 112 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + Eval: + dataset: + name: Cifar100 + mode: 'test' + backend: 'pil' + download: true + data_file: null + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.5071, 0.4867, 0.4408] + std: [0.2675, 0.2565, 0.2761] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: true + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml new file mode 100644 index 000000000..79667edc6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/CCSSL/FixMatchCCSSL_cifar10_4000_4gpu.yaml @@ -0,0 +1,208 @@ +Global: + checkpoints: null + pretrained_model: null + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: true + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: false + use_dali: false + train_mode: fixmatch_ccssl + image_shape: [3, 32, 32] + save_inference_dir: ./inference + +SSL: + T: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +Arch: + name: RecModel + infer_output_key: logits + infer_add_softmax: false + head_feature_from: backbone + Backbone: + name: WideResNet + widen_factor: 2 + depth: 28 + dropout: 0 + num_classes: 10 + low_dim: 64 + proj: false + proj_after: false + BackboneStopLayer: + name: bn1 + Neck: + name: FRFNNeck + num_features: 128 + low_dim: 64 + Head: + name: FC + embedding_size: 128 + class_num: 10 + + use_sync_bn: true + +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 + +UnLabelLoss: + Train: + - CCSSLCELoss: + weight: 1. 
+ - SoftSupConLoss: + weight: 1.0 + temperature: 0.07 + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: true + weight_decay: 0.001 + lr: + name: 'CosineFixmatch' + learning_rate: 0.03 + num_warmup_steps: 0 + +DataLoader: + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + Train: + dataset: + name: Cifar10 + data_file: null + mode: 'train' + download: true + backend: 'pil' + sample_per_label: 400 + expand_labels: 1 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: null + mode: 'train' + backend: 'pil' + download: true + + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + transform_ops_strong2: + - RandCropImageV2: + size: [32, 32] + - RandFlipImage: + flip_code: 1 + - RandomApply: + transforms: + - RawColorJitter: + brightness: 0.4 + contrast: 0.4 + saturation: 0.4 + hue: 0.1 + p: 1.0 # refer to official settings + - RandomGrayscale: + p: 0.2 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: true + shuffle: true + loader: + num_workers: 4 + use_shared_memory: true + + Eval: + dataset: + name: Cifar10 + mode: 'test' + backend: 'pil' + download: true + data_file: null + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: true + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml new file mode 100644 index 000000000..e8e00e8c8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_250.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: '../test/torch2paddle_cifar10' + output_dir: ./output_25 + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# 
level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 25 + expand_labels: 263 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml new file mode 100644 index 000000000..0327fcd9c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/torch2paddle_weight/torch2paddle_initialize_cifar10_WideResNet_depth28_widenfactor2_classnum10.pdparams' + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + 
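Editor's note: these FixMatch configs share one recipe: pseudo-labels come from the weakly augmented view, are kept only when the max softmax probability clears threshold: 0.95, and supervise the strongly augmented view through a per-sample CE (reduction: "none"), with labeled/unlabeled batch sizes at a 1:7 ratio (64/448, or 16/112 per GPU in the 4-GPU variants). A minimal sketch of that masked loss (hypothetical helper, not the ppcls train loop):

    import paddle
    import paddle.nn.functional as F

    def fixmatch_unlabeled_loss(logits_weak, logits_strong, threshold=0.95, temperature=1.0):
        # pseudo-labels from the weak view, with no gradient through them
        probs = F.softmax(logits_weak.detach() / temperature, axis=-1)
        max_prob = probs.max(axis=-1)
        pseudo_labels = probs.argmax(axis=-1)
        mask = (max_prob >= threshold).astype('float32')
        # per-sample CE on the strong view, masked to confident pseudo-labels only
        loss = F.cross_entropy(logits_strong, pseudo_labels, reduction='none')
        return (loss * mask).mean()

The EMA block (decay: 0.999) keeps a shadow copy of the weights, updated as shadow = 0.999 * shadow + 0.001 * param, and evaluation is typically run on that shadow model.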
+# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 4 + expand_labels: 1639 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml new file mode 100644 index 000000000..0989fc89e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_4000.yaml @@ -0,0 +1,175 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/torch2paddle_weight/torch2paddle_initialize_cifar10_WideResNet_depth28_widenfactor2_classnum10.pdparams' + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + 
save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 400 + expand_labels: 17 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 448 + drop_last: True + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml new file mode 100644 index 000000000..feeb1a9e8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/configs/ssl/FixMatch/FixMatch_cifar10_40_4gpu.yaml @@ -0,0 +1,176 @@ +# global configs +Global: + checkpoints: null + pretrained_model: 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/others/torch2paddle_weight/torch2paddle_initialize_cifar10_WideResNet_depth28_widenfactor2_classnum10.pdparams' + output_dir: ./output + device: gpu + save_interval: -1 + eval_during_train: True + eval_interval: 1 + epochs: 1024 + iter_per_epoch: 1024 + print_batch_step: 20 + use_visualdl: False + use_dali: 
False + train_mode: fixmatch + # used for static mode and model export + image_shape: [3, 224, 224] + save_inference_dir: ./inference + +SSL: + tempture: 1 + threshold: 0.95 + +EMA: + decay: 0.999 + +# AMP: +# scale_loss: 65536 +# use_dynamic_loss_scaling: True +# # O1: mixed fp16 +# level: O1 + +# model architecture +Arch: + name: WideResNet + depth: 28 + widen_factor: 2 + dropout: 0 + num_classes: 10 + use_sync_bn: True + +# loss function config for traing/eval process +Loss: + Train: + - CELoss: + weight: 1.0 + reduction: "mean" + Eval: + - CELoss: + weight: 1.0 +UnLabelLoss: + Train: + - CELoss: + weight: 1.0 + reduction: "none" + +Optimizer: + name: Momentum + momentum: 0.9 + use_nesterov: True + no_weight_decay_name: bn bias + weight_decay: 0.0005 + lr: + name: CosineFixmatch + learning_rate: 0.03 + num_warmup_steps: 0 + num_cycles: 0.4375 + +# data loader for train and eval +DataLoader: + Train: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: 4 + expand_labels: 1639 + transform_ops: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 16 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + UnLabelTrain: + dataset: + name: Cifar10 + data_file: None + mode: 'train' + download: True + backend: 'pil' + sample_per_label: None + transform_ops_weak: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + transform_ops_strong: + - RandFlipImage: + flip_code: 1 + - Pad_paddle_vision: + padding: 4 + padding_mode: reflect + - RandCropImageV2: + size: [32, 32] + - RandAugment: + num_layers: 2 + magnitude: 10 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 112 + drop_last: True + shuffle: True + loader: + num_workers: 8 + use_shared_memory: True + + + Eval: + dataset: + name: Cifar10 + data_file: None + mode: 'test' + download: True + backend: 'pil' + sample_per_label: None + transform_ops: + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.4914, 0.4822, 0.4465] + std: [0.2471, 0.2435, 0.2616] + order: hwc + sampler: + name: DistributedBatchSampler + batch_size: 64 + drop_last: False + shuffle: True + loader: + num_workers: 4 + use_shared_memory: True + + +Metric: + Eval: + - TopkAcc: + topk: [1, 5] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/engine.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/engine.py new file mode 100644 index 000000000..7cce65daf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/engine.py @@ -0,0 +1,717 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import gc +import shutil +import copy +import platform +import paddle +import paddle.distributed as dist +from visualdl import LogWriter +from paddle import nn +import numpy as np +import random + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.utils.config import print_config, dump_infer_config +from ppcls.data import build_dataloader +from ppcls.arch import build_model, RecModel, DistillationModel, TheseusLayer +from ppcls.arch import apply_to_static +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.utils.amp import AutoCast, build_scaler +from ppcls.utils.ema import ExponentialMovingAverage +from ppcls.utils.save_load import load_dygraph_pretrain +from ppcls.utils.save_load import init_model +from ppcls.utils.save_result import update_train_results +from ppcls.utils import save_load, save_predict_result + +from ppcls.data.utils.get_image_list import get_image_list +from ppcls.data.postprocess import build_postprocess +from ppcls.data import create_operators +from ppcls.engine import train as train_method +from ppcls.engine.train.utils import type_name +from ppcls.engine import evaluation +from ppcls.arch.gears.identity_head import IdentityHead + + +class Engine(object): + def __init__(self, config, mode="train"): + assert mode in ["train", "eval", "infer", "export"] + self.mode = mode + self.config = config + self.eval_mode = self.config["Global"].get("eval_mode", + "classification") + self.train_mode = self.config["Global"].get("train_mode", None) + if "Head" in self.config["Arch"] or self.config["Arch"].get("is_rec", + False): + self.is_rec = True + else: + self.is_rec = False + + # set seed + seed = self.config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" 
+ paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + # init logger + self.output_dir = self.config['Global']['output_dir'] + log_file = os.path.join(self.output_dir, f"{mode}.log") + log_ranks = self.config['Global'].get("log_ranks", "0") + init_logger(log_file=log_file, log_ranks=log_ranks) + print_config(config) + + # init train_func and eval_func + assert self.eval_mode in [ + "classification", "retrieval", "adaface", "face_recognition" + ], logger.error("Invalid eval mode: {}".format(self.eval_mode)) + if self.train_mode is None: + self.train_epoch_func = train_method.train_epoch + else: + self.train_epoch_func = getattr(train_method, + "train_epoch_" + self.train_mode) + self.eval_func = getattr(evaluation, self.eval_mode + "_eval") + + self.use_dali = self.config['Global'].get("use_dali", False) + + # for visualdl + self.vdl_writer = None + if self.config['Global'][ + 'use_visualdl'] and mode == "train" and dist.get_rank() == 0: + vdl_writer_path = self.output_dir + if not os.path.exists(vdl_writer_path): + os.makedirs(vdl_writer_path) + self.vdl_writer = LogWriter(logdir=vdl_writer_path) + + # set device + assert self.config["Global"]["device"] in [ + "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps" + ] + self.device = paddle.set_device(self.config["Global"]["device"]) + logger.info('train with paddle {} and device {}'.format( + paddle.__version__, self.device)) + + # gradient accumulation + self.update_freq = self.config["Global"].get("update_freq", 1) + + if "class_num" in config["Global"]: + global_class_num = config["Global"]["class_num"] + if "class_num" not in config["Arch"]: + config["Arch"]["class_num"] = global_class_num + msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}." + else: + msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored." + logger.warning(msg) + #TODO(gaotingquan): support rec + class_num = config["Arch"].get("class_num", None) + self.config["DataLoader"].update({"class_num": class_num}) + self.config["DataLoader"].update({ + "epochs": self.config["Global"]["epochs"] + }) + + # build dataloader + if self.mode == 'train': + self.train_dataloader = build_dataloader( + self.config["DataLoader"], "Train", self.device, self.use_dali) + if self.config["DataLoader"].get('UnLabelTrain', None) is not None: + self.unlabel_train_dataloader = build_dataloader( + self.config["DataLoader"], "UnLabelTrain", self.device, + self.use_dali) + else: + self.unlabel_train_dataloader = None + + self.iter_per_epoch = len( + self.train_dataloader) - 1 if platform.system( + ) == "Windows" else len(self.train_dataloader) + if self.config["Global"].get("iter_per_epoch", None): + # set max iteration per epoch mannualy, when training by iteration(s), such as XBM, FixMatch. + self.iter_per_epoch = self.config["Global"].get( + "iter_per_epoch") + if self.iter_per_epoch < self.update_freq: + logger.warning( + "The arg Global.update_freq greater than iter_per_epoch and has been set to 1. This may be caused by too few of batches." 
+ ) + self.update_freq = 1 + self.iter_per_epoch = self.iter_per_epoch // self.update_freq * self.update_freq + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode in ["classification", "adaface", "face_recognition"]: + self.eval_dataloader = build_dataloader( + self.config["DataLoader"], "Eval", self.device, + self.use_dali) + elif self.eval_mode == "retrieval": + self.gallery_query_dataloader = None + if len(self.config["DataLoader"]["Eval"].keys()) == 1: + key = list(self.config["DataLoader"]["Eval"].keys())[0] + self.gallery_query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], key, self.device, + self.use_dali) + else: + self.gallery_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Gallery", + self.device, self.use_dali) + self.query_dataloader = build_dataloader( + self.config["DataLoader"]["Eval"], "Query", self.device, + self.use_dali) + + # build loss + if self.mode == "train": + label_loss_info = self.config["Loss"]["Train"] + self.train_loss_func = build_loss(label_loss_info) + unlabel_loss_info = self.config.get("UnLabelLoss", {}).get("Train", + None) + self.unlabel_train_loss_func = build_loss(unlabel_loss_info) + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + loss_config = self.config.get("Loss", None) + if loss_config is not None: + loss_config = loss_config.get("Eval") + if loss_config is not None: + self.eval_loss_func = build_loss(loss_config) + else: + self.eval_loss_func = None + else: + self.eval_loss_func = None + + # build metric + if self.mode == 'train' and "Metric" in self.config and "Train" in self.config[ + "Metric"] and self.config["Metric"]["Train"]: + metric_config = self.config["Metric"]["Train"] + if hasattr(self.train_dataloader, "collate_fn" + ) and self.train_dataloader.collate_fn is not None: + for m_idx, m in enumerate(metric_config): + if "TopkAcc" in m: + msg = f"Unable to calculate accuracy when using \"batch_transform_ops\". The metric \"{m}\" has been removed." + logger.warning(msg) + metric_config.pop(m_idx) + self.train_metric_func = build_metrics(metric_config) + else: + self.train_metric_func = None + + if self.mode == "eval" or (self.mode == "train" and + self.config["Global"]["eval_during_train"]): + if self.eval_mode == "classification": + if "Metric" in self.config and "Eval" in self.config["Metric"]: + self.eval_metric_func = build_metrics(self.config["Metric"][ + "Eval"]) + else: + self.eval_metric_func = None + elif self.eval_mode == "retrieval": + if "Metric" in self.config and "Eval" in self.config["Metric"]: + metric_config = self.config["Metric"]["Eval"] + else: + metric_config = [{"name": "Recallk", "topk": (1, 5)}] + self.eval_metric_func = build_metrics(metric_config) + elif self.eval_mode == "face_recognition": + if "Metric" in self.config and "Eval" in self.config["Metric"]: + self.eval_metric_func = build_metrics(self.config["Metric"] + ["Eval"]) + else: + self.eval_metric_func = None + + # build model + self.model = build_model(self.config, self.mode) + # set @to_static for benchmark, skip this by default. 
+ apply_to_static(self.config, self.model, is_rec=self.is_rec) + + # load_pretrain + if self.config["Global"]["pretrained_model"] is not None: + load_dygraph_pretrain( + [self.model, getattr(self, 'train_loss_func', None)], + self.config["Global"]["pretrained_model"]) + + # build optimizer + if self.mode == 'train': + self.optimizer, self.lr_sch = build_optimizer( + self.config["Optimizer"], self.config["Global"]["epochs"], + self.iter_per_epoch // self.update_freq, + [self.model, self.train_loss_func]) + # amp + self._init_amp() + + # build EMA model + self.ema = "EMA" in self.config and self.mode == "train" + if self.ema: + self.model_ema = ExponentialMovingAverage( + self.model, self.config['EMA'].get("decay", 0.9999)) + + # check the gpu num + world_size = dist.get_world_size() + self.config["Global"]["distributed"] = world_size != 1 + if self.mode == "train": + std_gpu_num = 8 if isinstance( + self.config["Optimizer"], + dict) and self.config["Optimizer"]["name"] == "AdamW" else 4 + if world_size != std_gpu_num: + msg = f"The training strategy provided by PaddleClas is based on {std_gpu_num} gpus. But the number of gpu is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use this config to train." + logger.warning(msg) + + # for distributed + if self.config["Global"]["distributed"]: + dist.init_parallel_env() + self.model = paddle.DataParallel(self.model) + if self.mode == 'train' and len(self.train_loss_func.parameters( + )) > 0: + self.train_loss_func = paddle.DataParallel(self.train_loss_func) + + # set different seed in different GPU manually in distributed environment + if seed is None: + logger.warning( + "The random seed cannot be None in a distributed environment. Global.seed has been set to 42 by default" + ) + self.config["Global"]["seed"] = seed = 42 + logger.info( + f"Set random seed to ({int(seed)} + $PADDLE_TRAINER_ID) for different trainer" + ) + paddle.seed(int(seed) + dist.get_rank()) + np.random.seed(int(seed) + dist.get_rank()) + random.seed(int(seed) + dist.get_rank()) + + # build postprocess for infer + if self.mode == 'infer': + self.preprocess_func = create_operators(self.config["Infer"][ + "transforms"]) + self.postprocess_func = build_postprocess(self.config["Infer"][ + "PostProcess"]) + + def train(self): + assert self.mode == "train" + print_batch_step = self.config['Global']['print_batch_step'] + save_interval = self.config["Global"]["save_interval"] + best_metric = { + "metric": -1.0, + "epoch": 0, + } + acc_ema = -1.0 + best_metric_ema = -1.0 + ema_module = None + if self.ema: + ema_module = self.model_ema.module + # key: + # val: metrics list word + self.output_info = dict() + self.time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + # global iter counter + self.global_step = 0 + uniform_output_enabled = self.config['Global'].get( + "uniform_output_enabled", False) + + if self.config.Global.checkpoints is not None: + metric_info = init_model(self.config.Global, self.model, + self.optimizer, self.train_loss_func, + ema_module) + if metric_info is not None: + best_metric.update(metric_info) + if hasattr(self.train_dataloader.batch_sampler, "set_epoch"): + self.train_dataloader.batch_sampler.set_epoch(best_metric[ + "epoch"]) + + for epoch_id in range(best_metric["epoch"] + 1, + self.config["Global"]["epochs"] + 1): + acc = 0.0 + # for one epoch train + self.train_epoch_func(self, epoch_id, 
print_batch_step) + + if self.use_dali: + self.train_dataloader.reset() + metric_msg = ", ".join( + [self.output_info[key].avg_info for key in self.output_info]) + logger.info("[Train][Epoch {}/{}][Avg]{}".format( + epoch_id, self.config["Global"]["epochs"], metric_msg)) + self.output_info.clear() + + # eval model and save model if possible + start_eval_epoch = self.config["Global"].get("start_eval_epoch", + 0) - 1 + if self.config["Global"][ + "eval_during_train"] and epoch_id % self.config["Global"][ + "eval_interval"] == 0 and epoch_id > start_eval_epoch: + acc = self.eval(epoch_id) + + # step lr (by epoch) according to given metric, such as acc + for i in range(len(self.lr_sch)): + if getattr(self.lr_sch[i], "by_epoch", False) and \ + type_name(self.lr_sch[i]) == "ReduceOnPlateau": + self.lr_sch[i].step(acc) + + # update best_metric + if acc >= best_metric["metric"]: + best_metric["metric"] = acc + best_metric["epoch"] = epoch_id + logger.info("[Eval][Epoch {}][best metric: {}]".format( + epoch_id, best_metric["metric"])) + logger.scaler( + name="eval_acc", + value=acc, + step=epoch_id, + writer=self.vdl_writer) + + if self.ema: + ori_model, self.model = self.model, ema_module + acc_ema = self.eval(epoch_id) + self.model = ori_model + ema_module.eval() + + # update best_ema + if acc_ema > best_metric_ema: + best_metric_ema = acc_ema + logger.info("[Eval][Epoch {}][best metric ema: {}]".format( + epoch_id, best_metric_ema)) + logger.scaler( + name="eval_acc_ema", + value=acc_ema, + step=epoch_id, + writer=self.vdl_writer) + + # save best model from best_acc or best_ema_acc + if max(acc, acc_ema) >= max(best_metric["metric"], + best_metric_ema): + metric_info = { + "metric": max(acc, acc_ema), + "epoch": epoch_id + } + prefix = "best_model" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func, + save_student_model=True) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + gc.collect() + if self.ema: + ema_save_path = os.path.join( + self.output_dir, prefix, "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + gc.collect() + update_train_results( + self.config, prefix, metric_info, ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + + self.model.train() + + # save model + if save_interval > 0 and epoch_id % save_interval == 0: + metric_info = {"metric": acc, "epoch": epoch_id} + prefix = "epoch_{}".format(epoch_id) + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, + "inference") + self.export(save_path, uniform_output_enabled) + gc.collect() + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + gc.collect() + update_train_results( + self.config, + prefix, + metric_info, + done_flag=epoch_id == self.config["Global"]["epochs"], + ema=self.ema) + save_load.save_model_info(metric_info, self.output_dir, + prefix) + # save the latest model + metric_info = 
{"metric": acc, "epoch": epoch_id} + prefix = "latest" + save_load.save_model( + self.model, + self.optimizer, + metric_info, + os.path.join(self.output_dir, prefix) + if uniform_output_enabled else self.output_dir, + ema=ema_module, + model_name=self.config["Arch"]["name"], + prefix=prefix, + loss=self.train_loss_func) + if uniform_output_enabled: + save_path = os.path.join(self.output_dir, prefix, "inference") + self.export(save_path, uniform_output_enabled) + gc.collect() + if self.ema: + ema_save_path = os.path.join(self.output_dir, prefix, + "inference_ema") + self.export(ema_save_path, uniform_output_enabled) + gc.collect() + save_load.save_model_info(metric_info, self.output_dir, prefix) + self.model.train() + + if self.vdl_writer is not None: + self.vdl_writer.close() + + @paddle.no_grad() + def eval(self, epoch_id=0): + assert self.mode in ["train", "eval"] + self.model.eval() + eval_result = self.eval_func(self, epoch_id) + self.model.train() + return eval_result + + @paddle.no_grad() + def infer(self): + assert self.mode == "infer" and self.eval_mode == "classification" + results = [] + total_trainer = dist.get_world_size() + local_rank = dist.get_rank() + infer_imgs = self.config["Infer"]["infer_imgs"] + infer_list = self.config["Infer"].get("infer_list", None) + image_list = get_image_list(infer_imgs, infer_list=infer_list) + # data split + image_list = image_list[local_rank::total_trainer] + + batch_size = self.config["Infer"]["batch_size"] + self.model.eval() + batch_data = [] + image_file_list = [] + save_path = self.config["Infer"].get("save_dir", None) + for idx, image_file in enumerate(image_list): + with open(image_file, 'rb') as f: + x = f.read() + try: + for process in self.preprocess_func: + x = process(x) + batch_data.append(x) + image_file_list.append(image_file) + if len(batch_data) >= batch_size or idx == len(image_list) - 1: + batch_tensor = paddle.to_tensor(batch_data) + + with self.auto_cast(is_eval=True): + out = self.model(batch_tensor) + + if isinstance(out, list): + out = out[0] + if isinstance(out, dict) and "Student" in out: + out = out["Student"] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + if isinstance(out, dict) and "output" in out: + out = out["output"] + + result = self.postprocess_func(out, image_file_list) + if not save_path: + logger.info(result) + results.extend(result) + batch_data.clear() + image_file_list.clear() + except Exception as ex: + logger.error( + "Exception occured when parse line: {} with msg: {}".format( + image_file, ex)) + continue + if save_path: + save_predict_result(save_path, results) + return results + + def export(self, + save_path=None, + uniform_output_enabled=False, + ema_module=None): + assert self.mode == "export" or uniform_output_enabled + if paddle.distributed.get_rank() != 0: + return + use_multilabel = self.config["Global"].get( + "use_multilabel", + False) or "ATTRMetric" in self.config["Metric"]["Eval"][0] + model = self.model_ema.module if self.ema else self.model + if hasattr(model, '_layers'): + model = copy.deepcopy(model._layers) + else: + model = copy.deepcopy(model) + model = ExportModel(self.config["Arch"], model + if not ema_module else ema_module, use_multilabel) + if self.config["Global"][ + "pretrained_model"] is not None and not uniform_output_enabled: + load_dygraph_pretrain(model.base_model, + self.config["Global"]["pretrained_model"]) + model.eval() + # for re-parameterization nets + for layer in model.sublayers(): + if hasattr(layer, "re_parameterize") and not 
getattr(layer, + "is_repped"): + layer.re_parameterize() + if not save_path: + save_path = os.path.join( + self.config["Global"]["save_inference_dir"], "inference") + else: + save_path = os.path.join(save_path, "inference") + + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None] + self.config["Global"]["image_shape"], + dtype='float32') + ]) + if hasattr(model.base_model, + "quanter") and model.base_model.quanter is not None: + model.base_model.quanter.save_quantized_model(model, + save_path + "_int8") + else: + paddle.jit.save(model, save_path) + if self.config["Global"].get("export_for_fd", + False) or uniform_output_enabled: + dst_path = os.path.join(os.path.dirname(save_path), 'inference.yml') + dump_infer_config(self.config, dst_path) + logger.info( + f"Export succeeded! The inference model exported has been saved in \"{save_path}\"." + ) + + def _init_amp(self): + if self.mode == "export": + return + + amp_config = self.config.get("AMP", None) + use_amp = True if amp_config and amp_config.get("use_amp", + True) else False + + if not use_amp: + self.auto_cast = AutoCast(use_amp) + self.scaler = build_scaler(use_amp) + else: + AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, } + if paddle.is_compiled_with_cuda(): + AMP_RELATED_FLAGS_SETTING.update({ + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1 + }) + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) + + use_promote = amp_config.get("use_promote", False) + amp_level = amp_config.get("level", "O1") + if amp_level not in ["O1", "O2"]: + msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'." + logger.warning(msg) + amp_level = amp_config["level"] = "O1" + + amp_eval = self.config["AMP"].get("use_fp16_test", False) + # TODO(gaotingquan): Paddle not yet support FP32 evaluation when training with AMPO2 + if self.mode == "train" and self.config["Global"].get( + "eval_during_train", + True) and amp_level == "O2" and amp_eval == False: + msg = "PaddlePaddle only support FP16 evaluation when training with AMP O2 now. 
" + logger.warning(msg) + self.config["AMP"]["use_fp16_test"] = True + amp_eval = True + + self.auto_cast = AutoCast( + use_amp, + amp_level=amp_level, + use_promote=use_promote, + amp_eval=amp_eval) + + scale_loss = amp_config.get("scale_loss", 1.0) + use_dynamic_loss_scaling = amp_config.get( + "use_dynamic_loss_scaling", False) + self.scaler = build_scaler( + use_amp, + scale_loss=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + + if self.mode == "train": + self.model, self.optimizer = paddle.amp.decorate( + models=self.model, + optimizers=self.optimizer, + level=amp_level, + save_dtype='float32') + elif amp_eval: + self.model = paddle.amp.decorate( + models=self.model, level=amp_level, save_dtype='float32') + + if self.mode == "train" and len(self.train_loss_func.parameters( + )) > 0: + self.train_loss_func = paddle.amp.decorate( + models=self.train_loss_func, + level=self.amp_level, + save_dtype='float32') + + +class ExportModel(TheseusLayer): + """ + ExportModel: add softmax onto the model + """ + + def __init__(self, config, model, use_multilabel): + super().__init__() + self.base_model = model + # we should choose a final model to export + if isinstance(self.base_model, DistillationModel): + self.infer_model_name = config["infer_model_name"] + else: + self.infer_model_name = None + + self.infer_output_key = config.get("infer_output_key", None) + if self.infer_output_key == "features" and isinstance(self.base_model, + RecModel): + self.base_model.head = IdentityHead() + if use_multilabel: + self.out_act = nn.Sigmoid() + else: + if config.get("infer_add_softmax", True): + self.out_act = nn.Softmax(axis=-1) + else: + self.out_act = None + + def eval(self): + self.training = False + for layer in self.sublayers(): + layer.training = False + layer.eval() + + def forward(self, x): + x = self.base_model(x) + if isinstance(x, list): + x = x[0] + if self.infer_model_name is not None: + x = x[self.infer_model_name] + if self.infer_output_key is not None: + x = x[self.infer_output_key] + if self.out_act is not None: + if isinstance(x, dict): + x = x["logits"] + x = self.out_act(x) + return x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/__init__.py new file mode 100644 index 000000000..0ac783e4c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ppcls.engine.evaluation.classification import classification_eval +from ppcls.engine.evaluation.retrieval import retrieval_eval +from ppcls.engine.evaluation.adaface import adaface_eval +from ppcls.engine.evaluation.face_recognition import face_recognition_eval diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/adaface.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/adaface.py new file mode 100644 index 000000000..e62144b5c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/adaface.py @@ -0,0 +1,260 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import numpy as np +import platform +import paddle +import sklearn +from sklearn.model_selection import KFold +from sklearn.decomposition import PCA + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def fuse_features_with_norm(stacked_embeddings, stacked_norms): + assert stacked_embeddings.ndim == 3 # (n_features_to_fuse, batch_size, channel) + assert stacked_norms.ndim == 3 # (n_features_to_fuse, batch_size, 1) + pre_norm_embeddings = stacked_embeddings * stacked_norms + fused = pre_norm_embeddings.sum(axis=0) + norm = paddle.norm(fused, 2, 1, True) + fused = paddle.divide(fused, norm) + return fused, norm + + +def adaface_eval(engine, epoch_id=0): + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + metric_key = None + tic = time.time() + unique_dict = {} + for iter_id, batch in enumerate(engine.eval_dataloader): + images, labels, dataname, image_index = batch + if iter_id == 5: + for key in time_info: + time_info[key].reset() + time_info["reader_cost"].update(time.time() - tic) + batch_size = images.shape[0] + batch[0] = paddle.to_tensor(images) + embeddings = engine.model(images, labels)['features'] + norms = paddle.divide(embeddings, paddle.norm(embeddings, 2, 1, True)) + embeddings = paddle.divide(embeddings, norms) + fliped_images = paddle.flip(images, axis=[3]) + flipped_embeddings = engine.model(fliped_images, labels)['features'] + flipped_norms = paddle.divide( + flipped_embeddings, paddle.norm(flipped_embeddings, 2, 1, True)) + flipped_embeddings = paddle.divide(flipped_embeddings, flipped_norms) + stacked_embeddings = paddle.stack( + [embeddings, flipped_embeddings], axis=0) + stacked_norms = paddle.stack([norms, flipped_norms], axis=0) + embeddings, norms = fuse_features_with_norm(stacked_embeddings, + stacked_norms) + + for out, nor, label, data, idx in zip(embeddings, norms, labels, + dataname, image_index): + unique_dict[int(idx.numpy())] = { + 'output': out, + 'norm': nor, + 'target': label, + 'dataname': data + } + # 
calc metric + time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + + unique_keys = sorted(unique_dict.keys()) + all_output_tensor = paddle.stack( + [unique_dict[key]['output'] for key in unique_keys], axis=0) + all_norm_tensor = paddle.stack( + [unique_dict[key]['norm'] for key in unique_keys], axis=0) + all_target_tensor = paddle.stack( + [unique_dict[key]['target'] for key in unique_keys], axis=0) + all_dataname_tensor = paddle.stack( + [unique_dict[key]['dataname'] for key in unique_keys], axis=0) + + eval_result = cal_metric(all_output_tensor, all_norm_tensor, + all_target_tensor, all_dataname_tensor) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) for key in output_info + ]) + face_msg = ", ".join([ + "{}: {:.5f}".format(key, eval_result[key]) + for key in eval_result.keys() + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg + ", " + + face_msg)) + + # return 1st metric in the dict + return eval_result['all_test_acc'] + + +def cal_metric(all_output_tensor, all_norm_tensor, all_target_tensor, + all_dataname_tensor): + all_target_tensor = all_target_tensor.reshape([-1]) + all_dataname_tensor = all_dataname_tensor.reshape([-1]) + dataname_to_idx = { + "agedb_30": 0, + "cfp_fp": 1, + "lfw": 2, + "cplfw": 3, + "calfw": 4 + } + idx_to_dataname = {val: key for key, val in dataname_to_idx.items()} + test_logs = {} + # _, indices = paddle.unique(all_dataname_tensor, return_index=True, return_inverse=False, return_counts=False) + for dataname_idx in all_dataname_tensor.unique(): + dataname = idx_to_dataname[dataname_idx.item()] + # per dataset evaluation + embeddings = all_output_tensor[all_dataname_tensor == + dataname_idx].numpy() + labels = all_target_tensor[all_dataname_tensor == dataname_idx].numpy() + issame = labels[0::2] + tpr, fpr, accuracy, best_thresholds = evaluate_face( + embeddings, issame, nrof_folds=10) + acc, best_threshold = accuracy.mean(), best_thresholds.mean() + + num_test_samples = len(embeddings) + test_logs[f'{dataname}_test_acc'] = acc + test_logs[f'{dataname}_test_best_threshold'] = best_threshold + test_logs[f'{dataname}_num_test_samples'] = num_test_samples + + test_acc = np.mean([ + test_logs[f'{dataname}_test_acc'] + for dataname in dataname_to_idx.keys() + if f'{dataname}_test_acc' in test_logs + ]) + + test_logs['all_test_acc'] = test_acc + return test_logs + + +def evaluate_face(embeddings, actual_issame, nrof_folds=10, pca=0): + # Calculate evaluation metrics + thresholds = np.arange(0, 4, 0.01) + embeddings1 = embeddings[0::2] + embeddings2 = embeddings[1::2] + tpr, fpr, accuracy, best_thresholds = calculate_roc( + thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca) + return tpr, fpr, accuracy, best_thresholds + + +def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = 
min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + best_thresholds = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + # print('pca', pca) + dist = None + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + # print('train_set', train_set) + # print('test_set', test_set) + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + # print(_embed_train.shape) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = sklearn.preprocessing.normalize(embed1) + embed2 = sklearn.preprocessing.normalize(embed2) + # print(embed1.shape, embed2.shape) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + best_thresholds[fold_idx] = thresholds[best_threshold_index] + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, threshold_idx], fprs[ + fold_idx, threshold_idx], _ = calculate_accuracy( + threshold, dist[test_set], actual_issame[test_set]) + _, _, accuracy[fold_idx] = calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy, best_thresholds + + +def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and( + np.logical_not(predict_issame), np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/classification.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/classification.py new file mode 100644 index 000000000..d802eeeda --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/classification.py @@ -0,0 +1,175 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import time +import platform +import paddle + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger + + +def classification_eval(engine, epoch_id=0): + if hasattr(engine.eval_metric_func, "reset"): + engine.eval_metric_func.reset() + output_info = dict() + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + tic = time.time() + accum_samples = 0 + total_samples = len( + engine.eval_dataloader. + dataset) if not engine.use_dali else engine.eval_dataloader.size + max_iter = len(engine.eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(engine.eval_dataloader) + for iter_id, batch in enumerate(engine.eval_dataloader): + if iter_id >= max_iter: + break + if iter_id == 5: + for key in time_info: + time_info[key].reset() + + time_info["reader_cost"].update(time.time() - tic) + batch_size = batch[0].shape[0] + batch[0] = paddle.to_tensor(batch[0]) + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + + # image input + with engine.auto_cast(is_eval=True): + if engine.is_rec: + out = engine.model(batch[0], batch[1]) + else: + out = engine.model(batch[0]) + + # just for DistributedBatchSampler issue: repeat sampling + current_samples = batch_size * paddle.distributed.get_world_size() + accum_samples += current_samples + + if isinstance(out, dict) and "Student" in out: + out = out["Student"] + if isinstance(out, dict) and "logits" in out: + out = out["logits"] + + # gather Tensor when distributed + if paddle.distributed.get_world_size() > 1: + label_list = [] + device_id = paddle.distributed.ParallelEnv().device_id + label = batch[1].cuda(device_id) if engine.config["Global"][ + "device"] == "gpu" else batch[1] + paddle.distributed.all_gather(label_list, label) + labels = paddle.concat(label_list, 0) + + if isinstance(out, list): + preds = [] + for x in out: + pred_list = [] + paddle.distributed.all_gather(pred_list, x) + pred_x = paddle.concat(pred_list, 0) + preds.append(pred_x) + else: + pred_list = [] + paddle.distributed.all_gather(pred_list, out) + preds = paddle.concat(pred_list, 0) + + if accum_samples > total_samples and not engine.use_dali: + if isinstance(preds, list): + preds = [ + pred[:total_samples + current_samples - accum_samples] + for pred in preds + ] + else: + preds = preds[:total_samples + current_samples - + accum_samples] + labels = labels[:total_samples + current_samples - + accum_samples] + current_samples = total_samples + current_samples - accum_samples + else: + labels = batch[1] + preds = out + + # calc loss + if engine.eval_loss_func is not None: + with engine.auto_cast(is_eval=True): + loss_dict = engine.eval_loss_func(preds, labels) + + for key in loss_dict: + if key not in output_info: + output_info[key] = AverageMeter(key, '7.5f') + output_info[key].update(float(loss_dict[key]), current_samples) + + # calc metric + if engine.eval_metric_func is not None: + engine.eval_metric_func(preds, labels) + time_info["batch_cost"].update(time.time() - tic) + + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + 
]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + if "ATTRMetric" in engine.config["Metric"]["Eval"][0]: + metric_msg = "" + else: + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + if "MultiLabelMAP" not in engine.config["Metric"]["Eval"][0]: + metric_msg += ", {}".format(engine.eval_metric_func.avg_info) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + if engine.use_dali: + engine.eval_dataloader.reset() + + if "ATTRMetric" in engine.config["Metric"]["Eval"][0]: + metric_msg = ", ".join([ + "evalres: ma: {:.5f} label_f1: {:.5f} label_pos_recall: {:.5f} label_neg_recall: {:.5f} instance_f1: {:.5f} instance_acc: {:.5f} instance_prec: {:.5f} instance_recall: {:.5f}". + format(*engine.eval_metric_func.attr_res()) + ]) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return engine.eval_metric_func.attr_res()[0] + else: + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) + for key in output_info + ]) + metric_msg += ", {}".format(engine.eval_metric_func.avg_info) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return engine.eval_metric_func.avg diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/face_recognition.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/face_recognition.py new file mode 100644 index 000000000..fcedca3c1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/face_recognition.py @@ -0,0 +1,152 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time +import platform +import paddle +import paddle.nn.functional as F + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger, all_gather + + +def face_recognition_eval(engine, epoch_id=0): + # reset metric on beginning of eval + if hasattr(engine.eval_metric_func, "reset"): + engine.eval_metric_func.reset() + output_info = dict() + + # log time_info for each batch + time_info = { + "batch_cost": AverageMeter( + "batch_cost", '.5f', postfix=" s,"), + "reader_cost": AverageMeter( + "reader_cost", ".5f", postfix=" s,"), + } + print_batch_step = engine.config["Global"]["print_batch_step"] + + tic = time.time() + accum_samples = 0 + total_samples = len( + engine.eval_dataloader. 
+ dataset) if not engine.use_dali else engine.eval_dataloader.size + max_iter = len(engine.eval_dataloader) - 1 if platform.system( + ) == "Windows" else len(engine.eval_dataloader) + flip_test = engine.config["Global"].get("flip_test", False) + feature_normalize = engine.config["Global"].get("feature_normalize", False) + for iter_id, batch in enumerate(engine.eval_dataloader): + if iter_id >= max_iter: + break + if iter_id == 5: + for key in time_info: + time_info[key].reset() + time_info["reader_cost"].update(time.time() - tic) + + images_left, images_right, labels = [ + paddle.to_tensor(x) for x in batch[:3]] + batch_remains = [paddle.to_tensor(x) for x in batch[3:]] + labels = labels.astype('int64') + batch_size = images_left.shape[0] + + # flip images + if flip_test: + images_left = paddle.concat( + [images_left, paddle.flip(images_left, axis=-1)], 0) + images_right = paddle.concat( + [images_right, paddle.flip(images_right, axis=-1)], 0) + + with engine.auto_cast(is_eval=True): + out_left = engine.model(images_left) + out_right = engine.model(images_right) + + # get features + if engine.config["Global"].get("retrieval_feature_from", + "features") == "features": + # use output from neck as feature + embeddings_left = out_left["features"] + embeddings_right = out_right["features"] + else: + # use output from backbone as feature + embeddings_left = out_left["backbone"] + embeddings_right = out_right["backbone"] + + # normalize features + if feature_normalize: + embeddings_left = F.normalize(embeddings_left, p=2, axis=1) + embeddings_right = F.normalize(embeddings_right, p=2, axis=1) + + # fuse features by sum up if flip_test is True + if flip_test: + embeddings_left = embeddings_left[:batch_size] + \ + embeddings_left[batch_size:] + embeddings_right = embeddings_right[:batch_size] + \ + embeddings_right[batch_size:] + + # just for DistributedBatchSampler issue: repeat sampling + current_samples = batch_size * paddle.distributed.get_world_size() + accum_samples += current_samples + + # gather Tensor when distributed + if paddle.distributed.get_world_size() > 1: + embeddings_left = all_gather(embeddings_left) + embeddings_right = all_gather(embeddings_right) + labels = all_gather(labels) + batch_remains = [all_gather(x) for x in batch_remains] + + # discard redundant padding sample(s) in the last batch + if accum_samples > total_samples and not engine.use_dali: + rest_num = total_samples + current_samples - accum_samples + embeddings_left = embeddings_left[:rest_num] + embeddings_right = embeddings_right[:rest_num] + labels = labels[:rest_num] + batch_remains = [x[:rest_num] for x in batch_remains] + + # calc metric + if engine.eval_metric_func is not None: + engine.eval_metric_func(embeddings_left, embeddings_right, labels, + *batch_remains) + time_info["batch_cost"].update(time.time() - tic) + + if iter_id % print_batch_step == 0: + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, time_info[key].avg) + for key in time_info + ]) + + ips_msg = "ips: {:.5f} images/sec".format( + batch_size / time_info["batch_cost"].avg) + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].val) + for key in output_info + ]) + logger.info("[Eval][Epoch {}][Iter: {}/{}]{}, {}, {}".format( + epoch_id, iter_id, + len(engine.eval_dataloader), metric_msg, time_msg, ips_msg)) + + tic = time.time() + if engine.use_dali: + engine.eval_dataloader.reset() + + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, output_info[key].avg) + for key in output_info + ]) + metric_msg += ", 
{}".format(engine.eval_metric_func.avg_info) + logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg)) + + # do not try to save best eval.model + if engine.eval_metric_func is None: + return -1 + # return 1st metric in the dict + return engine.eval_metric_func.avg \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/retrieval.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/retrieval.py new file mode 100644 index 000000000..53f744f1a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/evaluation/retrieval.py @@ -0,0 +1,327 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import defaultdict + +import numpy as np +import paddle +import scipy + +from ppcls.utils import all_gather, logger + + +def retrieval_eval(engine, epoch_id=0): + engine.model.eval() + # step1. prepare query and gallery features + if engine.gallery_query_dataloader is not None: + gallery_feat, gallery_label, gallery_camera = compute_feature( + engine, "gallery_query") + query_feat, query_label, query_camera = gallery_feat, gallery_label, gallery_camera + else: + gallery_feat, gallery_label, gallery_camera = compute_feature( + engine, "gallery") + query_feat, query_label, query_camera = compute_feature(engine, + "query") + + # step2. split features into feature blocks for saving memory + num_query = len(query_feat) + block_size = engine.config["Global"].get("sim_block_size", 64) + sections = [block_size] * (num_query // block_size) + if num_query % block_size > 0: + sections.append(num_query % block_size) + + query_feat_blocks = paddle.split(query_feat, sections) + query_label_blocks = paddle.split(query_label, sections) + query_camera_blocks = paddle.split( + query_camera, sections) if query_camera is not None else None + metric_key = None + + # step3. 
compute metric + if engine.eval_loss_func is None: + metric_dict = {metric_key: 0.0} + else: + use_reranking = engine.config["Global"].get("re_ranking", False) + logger.info(f"re_ranking={use_reranking}") + if use_reranking: + # compute distance matrix + distmat = compute_re_ranking_dist( + query_feat, gallery_feat, engine.config["Global"].get( + "feature_normalize", True), 20, 6, 0.3) + # exclude illegal distance + if query_camera is not None: + camera_mask = query_camera != gallery_camera.t() + label_mask = query_label != gallery_label.t() + keep_mask = label_mask | camera_mask + distmat = keep_mask.astype(query_feat.dtype) * distmat + ( + ~keep_mask).astype(query_feat.dtype) * (distmat.max() + 1) + else: + keep_mask = None + # compute metric with all samples + metric_dict = engine.eval_metric_func(-distmat, query_label, + gallery_label, keep_mask) + else: + metric_dict = defaultdict(float) + for block_idx, block_feat in enumerate(query_feat_blocks): + # compute distance matrix + distmat = paddle.matmul( + block_feat, gallery_feat, transpose_y=True) + # exclude illegal distance + if query_camera is not None: + camera_mask = query_camera_blocks[ + block_idx] != gallery_camera.t() + label_mask = query_label_blocks[ + block_idx] != gallery_label.t() + keep_mask = label_mask | camera_mask + distmat = keep_mask.astype(query_feat.dtype) * distmat + else: + keep_mask = None + # compute metric by block + metric_block = engine.eval_metric_func( + distmat, query_label_blocks[block_idx], gallery_label, + keep_mask) + # accumulate metric + for key in metric_block: + metric_dict[key] += metric_block[key] * block_feat.shape[ + 0] / num_query + + metric_info_list = [] + for key, value in metric_dict.items(): + metric_info_list.append(f"{key}: {value:.5f}") + if metric_key is None: + metric_key = key + metric_msg = ", ".join(metric_info_list) + logger.info(f"[Eval][Epoch {epoch_id}][Avg]{metric_msg}") + + return metric_dict[metric_key] + + +def compute_feature(engine, name="gallery"): + if name == "gallery": + dataloader = engine.gallery_dataloader + elif name == "query": + dataloader = engine.query_dataloader + elif name == "gallery_query": + dataloader = engine.gallery_query_dataloader + else: + raise ValueError( + f"Only support gallery or query or gallery_query dataset, but got {name}" + ) + + all_feat = [] + all_label = [] + all_camera = [] + has_camera = False + for idx, batch in enumerate(dataloader): # load is very time-consuming + if idx % engine.config["Global"]["print_batch_step"] == 0: + logger.info( + f"{name} feature calculation process: [{idx}/{len(dataloader)}]" + ) + + batch = [paddle.to_tensor(x) for x in batch] + batch[1] = batch[1].reshape([-1, 1]).astype("int64") + if len(batch) >= 3: + has_camera = True + batch[2] = batch[2].reshape([-1, 1]).astype("int64") + with engine.auto_cast(is_eval=True): + if engine.is_rec: + out = engine.model(batch[0], batch[1]) + else: + out = engine.model(batch[0]) + if "Student" in out: + out = out["Student"] + + # get features + if engine.config["Global"].get("retrieval_feature_from", + "features") == "features": + # use output from neck as feature + batch_feat = out["features"] + else: + # use output from backbone as feature + batch_feat = out["backbone"] + + # do norm(optional) + if engine.config["Global"].get("feature_normalize", True): + batch_feat = paddle.nn.functional.normalize(batch_feat, p=2) + + # do binarize(optional) + if engine.config["Global"].get("feature_binarize") == "round": + batch_feat = paddle.round(batch_feat).astype("float32") * 
2.0 - 1.0 + elif engine.config["Global"].get("feature_binarize") == "sign": + batch_feat = paddle.sign(batch_feat).astype("float32") + + if paddle.distributed.get_world_size() > 1: + all_feat.append(all_gather(batch_feat)) + all_label.append(all_gather(batch[1])) + if has_camera: + all_camera.append(all_gather(batch[2])) + else: + all_feat.append(batch_feat) + all_label.append(batch[1]) + if has_camera: + all_camera.append(batch[2]) + + if engine.use_dali: + dataloader.reset() + + all_feat = paddle.concat(all_feat) + all_label = paddle.concat(all_label) + if has_camera: + all_camera = paddle.concat(all_camera) + else: + all_camera = None + # discard redundant padding sample(s) at the end + total_samples = dataloader.size if engine.use_dali else len( + dataloader.dataset) + all_feat = all_feat[:total_samples] + all_label = all_label[:total_samples] + if has_camera: + all_camera = all_camera[:total_samples] + + logger.info(f"Build {name} done, all feat shape: {all_feat.shape}") + return all_feat, all_label, all_camera + + +def k_reciprocal_neighbor(rank: np.ndarray, p: int, k: int) -> np.ndarray: + """Implementation of k-reciprocal nearest neighbors, i.e. R(p, k) + + Args: + rank (np.ndarray): Rank mat with shape of [N, N]. + p (int): Probe index. + k (int): Parameter k for k-reciprocal nearest neighbors algorithm. + + Returns: + np.ndarray: K-reciprocal nearest neighbors of probe p with shape of [M, ]. + """ + # use k+1 for excluding probe index itself + forward_k_neigh_index = rank[p, :k + 1] + backward_k_neigh_index = rank[forward_k_neigh_index, :k + 1] + candidate = np.where(backward_k_neigh_index == p)[0] + return forward_k_neigh_index[candidate] + + +def compute_re_ranking_dist(query_feat: paddle.Tensor, + gallery_feat: paddle.Tensor, + feature_normed: bool=True, + k1: int=20, + k2: int=6, + lamb: float=0.5) -> paddle.Tensor: + """ + Re-ranking Person Re-identification with k-reciprocal Encoding + Reference: https://arxiv.org/abs/1701.08398 + Code refernence: https://github.com/michuanhaohao/reid-strong-baseline/blob/master/utils/re_ranking.py + + Args: + query_feat (paddle.Tensor): Query features with shape of [num_query, feature_dim]. + gallery_feat (paddle.Tensor): Gallery features with shape of [num_gallery, feature_dim]. + feature_normed (bool, optional): Whether input features are normalized. + k1 (int, optional): Parameter for K-reciprocal nearest neighbors. Defaults to 20. + k2 (int, optional): Parameter for K-nearest neighbors. Defaults to 6. + lamb (float, optional): Penalty factor. Defaults to 0.5. + + Returns: + paddle.Tensor: (1 - lamb) x Dj + lamb x D, with shape of [num_query, num_gallery]. 
+ """ + num_query = query_feat.shape[0] + num_gallery = gallery_feat.shape[0] + num_all = num_query + num_gallery + feat = paddle.concat([query_feat, gallery_feat], 0) + logger.info("Using GPU to compute original distance matrix") + # use L2 distance + if feature_normed: + original_dist = 2 - 2 * paddle.matmul(feat, feat, transpose_y=True) + else: + original_dist = paddle.pow(feat, 2).sum(axis=1, keepdim=True).expand([num_all, num_all]) + \ + paddle.pow(feat, 2).sum(axis=1, keepdim=True).expand([num_all, num_all]).t() + original_dist = original_dist.addmm(feat, feat.t(), -2.0, 1.0) + original_dist = original_dist.numpy() + del feat + + original_dist = np.transpose(original_dist / np.max(original_dist, axis=0)) + V = np.zeros_like(original_dist).astype(np.float16) + initial_rank = np.argpartition(original_dist, range(1, k1 + 1)) + logger.info("Start re-ranking...") + + for p in range(num_all): + # compute R(p,k1) + p_k_reciprocal_ind = k_reciprocal_neighbor(initial_rank, p, k1) + + # compute R*(p,k1)=R(p,k1)∪R(q,k1/2) + # s.t. |R(p,k1)∩R(q,k1/2)|>=2/3|R(q,k1/2)|, ∀q∈R(p,k1) + p_k_reciprocal_exp_ind = p_k_reciprocal_ind + for _, q in enumerate(p_k_reciprocal_ind): + q_k_reciprocal_ind = k_reciprocal_neighbor(initial_rank, q, + int(np.around(k1 / 2))) + if len( + np.intersect1d( + p_k_reciprocal_ind, + q_k_reciprocal_ind, + assume_unique=True)) > 2 / 3 * len(q_k_reciprocal_ind): + p_k_reciprocal_exp_ind = np.append(p_k_reciprocal_exp_ind, + q_k_reciprocal_ind) + p_k_reciprocal_exp_ind = np.unique(p_k_reciprocal_exp_ind) + # reweight distance using gaussian kernel + weight = np.exp(-original_dist[p, p_k_reciprocal_exp_ind]) + V[p, p_k_reciprocal_exp_ind] = weight / np.sum(weight) + + # local query expansion + original_dist = original_dist[:num_query, ] + if k2 > 1: + try: + # use sparse tensor to speed up query expansion + indices = (np.repeat(np.arange(num_all), k2), + initial_rank[:, :k2].reshape([-1, ])) + values = np.array( + [1 / k2 for _ in range(num_all * k2)], dtype="float16") + V = scipy.sparse.coo_matrix( + (values, indices), V.shape, + dtype="float16") @V.astype("float16") + except Exception as e: + logger.info( + f"Failed to do local query expansion with sparse tensor for reason: \n{e}\n" + f"now use for-loop instead") + # use vanilla for-loop + V_qe = np.zeros_like(V, dtype=np.float16) + for i in range(num_all): + V_qe[i, :] = np.mean(V[initial_rank[i, :k2], :], axis=0) + V = V_qe + del V_qe + del initial_rank + + # cache k-reciprocal sets which contains gj + invIndex = [] + for gj in range(num_all): + invIndex.append(np.nonzero(V[:, gj])[0]) + + # compute jaccard distance + jaccard_dist = np.zeros_like(original_dist, dtype=np.float16) + for p in range(num_query): + sum_min = np.zeros(shape=[1, num_all], dtype=np.float16) + gj_ind = np.nonzero(V[p, :])[0] + gj_ind_inv = [invIndex[gj] for gj in gj_ind] + for j, gj in enumerate(gj_ind): + gi = gj_ind_inv[j] + sum_min[0, gi] += np.minimum(V[p, gj], V[gi, gj]) + jaccard_dist[p] = 1 - sum_min / (2 - sum_min) + + # fuse jaccard distance with original distance + final_dist = (1 - lamb) * jaccard_dist + lamb * original_dist + del original_dist + del V + del jaccard_dist + final_dist = final_dist[:num_query, num_query:] + final_dist = paddle.to_tensor(final_dist) + return final_dist diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/__init__.py new file mode 100644 index 000000000..50bf9037f --- /dev/null +++ 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from ppcls.engine.train.train import train_epoch +from ppcls.engine.train.train_fixmatch import train_epoch_fixmatch +from ppcls.engine.train.train_fixmatch_ccssl import train_epoch_fixmatch_ccssl +from ppcls.engine.train.train_progressive import train_epoch_progressive +from ppcls.engine.train.train_metabin import train_epoch_metabin diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train.py new file mode 100644 index 000000000..cf7f65734 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
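As a quick sanity check for the k_reciprocal_neighbor helper added above, here is a small standalone NumPy sketch (not part of the patch; the 4x4 distance matrix is a made-up toy example) showing how mutual top-k membership is detected:

import numpy as np

def k_reciprocal_neighbor(rank, p, k):
    # forward k-nearest neighbors of p (k + 1 so that p itself is included)
    forward_k_neigh_index = rank[p, :k + 1]
    # for each forward neighbor, look at its own k-nearest neighbors
    backward_k_neigh_index = rank[forward_k_neigh_index, :k + 1]
    # keep only the forward neighbors that also rank p among their top-k
    candidate = np.where(backward_k_neigh_index == p)[0]
    return forward_k_neigh_index[candidate]

# toy symmetric distance matrix for 4 samples
dist = np.array([[0.0, 0.1, 0.9, 0.8],
                 [0.1, 0.0, 0.7, 0.9],
                 [0.9, 0.7, 0.0, 0.2],
                 [0.8, 0.9, 0.2, 0.0]])
rank = np.argsort(dist, axis=1)               # each row: indices sorted by distance
print(k_reciprocal_neighbor(rank, p=0, k=1))  # -> [0 1]: samples 0 and 1 are mutual neighbors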
+from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name +from ppcls.utils import profiler + + +def train_epoch(engine, epoch_id, print_batch_step): + tic = time.time() + + if not hasattr(engine, "train_dataloader_iter"): + engine.train_dataloader_iter = iter(engine.train_dataloader) + + for iter_id in range(engine.iter_per_epoch): + # fetch data batch from dataloader + try: + batch = next(engine.train_dataloader_iter) + except Exception: + # NOTE: reset DALI dataloader manually + if engine.use_dali: + engine.train_dataloader.reset() + engine.train_dataloader_iter = iter(engine.train_dataloader) + batch = next(engine.train_dataloader_iter) + + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + + batch_size = batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + batch[1] = batch[1].reshape([batch_size, -1]) + engine.global_step += 1 + + # image input + with engine.auto_cast(is_eval=False): + out = forward(engine, batch) + loss_dict = engine.train_loss_func(out, batch[1]) + + # loss + loss = loss_dict["loss"] / engine.update_freq + + # backward & step opt + scaled = engine.scaler.scale(loss) + scaled.backward() + if (iter_id + 1) % engine.update_freq == 0: + for i in range(len(engine.optimizer)): + # optimizer.step() with auto amp + engine.scaler.step(engine.optimizer[i]) + engine.scaler.update() + + if (iter_id + 1) % engine.update_freq == 0: + # clear grad + for i in range(len(engine.optimizer)): + engine.optimizer[i].clear_grad() + # step lr(by step) + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], "by_epoch", False): + engine.lr_sch[i].step() + # update ema + if engine.ema: + engine.model_ema.update(engine.model) + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + # step lr(by epoch) + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], "by_epoch", False) and \ + type_name(engine.lr_sch[i]) != "ReduceOnPlateau": + engine.lr_sch[i].step() + + +def forward(engine, batch): + if not engine.is_rec: + return engine.model(batch[0]) + else: + return engine.model(batch[0], batch[1]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch.py new file mode 100644 index 000000000..26a3daa74 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
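# The train_epoch loop above combines paddle.amp loss scaling with gradient
# accumulation over engine.update_freq steps. A minimal standalone sketch of
# that pattern follows (toy model and data; assumes paddle >= 2.3 for
# GradScaler.step/update; not part of the patch):
import paddle

update_freq = 2
model = paddle.nn.Linear(4, 2)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

for step in range(4):
    x = paddle.randn([8, 4])
    with paddle.amp.auto_cast():
        loss = model(x).mean() / update_freq   # divide so accumulated grads average out
    scaler.scale(loss).backward()              # scaled backward; grads accumulate
    if (step + 1) % update_freq == 0:
        scaler.step(opt)                       # unscale gradients and apply the update
        scaler.update()                        # adjust the loss scale
        opt.clear_grad()                       # clear only after an actual update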
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import time +import paddle +from ppcls.engine.train.utils import update_loss, update_metric, log_info +from ppcls.utils import profiler +from paddle.nn import functional as F +import numpy as np + + +def train_epoch_fixmatch(engine, epoch_id, print_batch_step): + tic = time.time() + if not hasattr(engine, "train_dataloader_iter"): + engine.train_dataloader_iter = iter(engine.train_dataloader) + engine.unlabel_train_dataloader_iter = iter( + engine.unlabel_train_dataloader) + temperture = engine.config["SSL"].get("temperture", 1) + threshold = engine.config["SSL"].get("threshold", 0.95) + assert engine.iter_per_epoch is not None, "Global.iter_per_epoch need to be set." + threshold = paddle.to_tensor(threshold) + for iter_id in range(engine.iter_per_epoch): + if iter_id >= engine.iter_per_epoch: + break + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + try: + label_data_batch = engine.train_dataloader_iter.next() + except Exception: + engine.train_dataloader_iter = iter(engine.train_dataloader) + label_data_batch = engine.train_dataloader_iter.next() + try: + unlabel_data_batch = engine.unlabel_train_dataloader_iter.next() + except Exception: + engine.unlabel_train_dataloader_iter = iter( + engine.unlabel_train_dataloader) + unlabel_data_batch = engine.unlabel_train_dataloader_iter.next() + assert len(unlabel_data_batch) == 3 + assert unlabel_data_batch[0].shape == unlabel_data_batch[1].shape + engine.time_info["reader_cost"].update(time.time() - tic) + batch_size = label_data_batch[0].shape[0] + unlabel_data_batch[0].shape[0] \ + + unlabel_data_batch[1].shape[0] + engine.global_step += 1 + + # make inputs + inputs_x, targets_x = label_data_batch + inputs_u_w, inputs_u_s, targets_u = unlabel_data_batch + batch_size_label = inputs_x.shape[0] + inputs = paddle.concat([inputs_x, inputs_u_w, inputs_u_s], axis=0) + + # image input + with engine.auto_cast(is_eval=False): + loss_dict, logits_label = get_loss(engine, inputs, + batch_size_label, temperture, + threshold, targets_x) + + # loss + loss = loss_dict["loss"] + + # backward & step opt + scaled = engine.scaler.scale(loss) + scaled.backward() + + for i in range(len(engine.optimizer)): + # optimizer.step() with auto amp + engine.scaler.step(engine.optimizer[i]) + engine.scaler.update() + + # step lr(by step) + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], "by_epoch", False): + engine.lr_sch[i].step() + # clear grad + for i in range(len(engine.optimizer)): + engine.optimizer[i].clear_grad() + + # update ema + if engine.ema: + engine.model_ema.update(engine.model) + + # below code just for logging + # update metric_for_logger + update_metric(engine, logits_label, label_data_batch, batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + tic = time.time() + + # step lr(by epoch) + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], "by_epoch", False): + engine.lr_sch[i].step() + + +def get_loss(engine, inputs, batch_size_label, temperture, threshold, + targets_x): + # For pytroch version, inputs need to use interleave and de_interleave + # to reshape and transpose inputs and logits, but it dosen't affect the + # result. 
So this paddle version dose not use the two transpose func. + # inputs = interleave(inputs, inputs.shape[0] // batch_size_label) + logits = engine.model(inputs) + # logits = de_interleave(logits, inputs.shape[0] // batch_size_label) + logits_x = logits[:batch_size_label] + logits_u_w, logits_u_s = logits[batch_size_label:].chunk(2) + loss_dict_label = engine.train_loss_func(logits_x, targets_x) + probs_u_w = F.softmax(logits_u_w.detach() / temperture, axis=-1) + p_targets_u, mask = get_psuedo_label_and_mask(probs_u_w, threshold) + unlabel_celoss = engine.unlabel_train_loss_func(logits_u_s, + p_targets_u)["CELoss"] + unlabel_celoss = (unlabel_celoss * mask).mean() + loss_dict = dict() + for k, v in loss_dict_label.items(): + if k != "loss": + loss_dict[k + "_label"] = v + loss_dict["CELoss_unlabel"] = unlabel_celoss + loss_dict["loss"] = loss_dict_label['loss'] + unlabel_celoss + return loss_dict, logits_x + + +def get_psuedo_label_and_mask(probs_u_w, threshold): + max_probs = paddle.max(probs_u_w, axis=-1) + p_targets_u = paddle.argmax(probs_u_w, axis=-1) + + mask = paddle.greater_equal(max_probs, threshold).astype('float') + return p_targets_u, mask + + +def interleave(x, size): + s = list(x.shape) + return x.reshape([-1, size] + s[1:]).transpose( + [1, 0, 2, 3, 4]).reshape([-1] + s[1:]) + + +def de_interleave(x, size): + s = list(x.shape) + return x.reshape([size, -1] + s[1:]).transpose( + [1, 0, 2]).reshape([-1] + s[1:]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch_ccssl.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch_ccssl.py new file mode 100644 index 000000000..43d20519f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_fixmatch_ccssl.py @@ -0,0 +1,125 @@ +from __future__ import absolute_import, division, print_function +import time +import paddle +from ppcls.engine.train.train_fixmatch import get_loss +from ppcls.engine.train.utils import update_loss, update_metric, log_info +from ppcls.utils import profiler +from paddle.nn import functional as F +import numpy as np +import paddle + + +def train_epoch_fixmatch_ccssl(engine, epoch_id, print_batch_step): + tic = time.time() + if not hasattr(engine, 'train_dataloader_iter'): + engine.train_dataloader_iter = iter(engine.train_dataloader) + engine.unlabel_train_dataloader_iter = iter(engine.unlabel_train_dataloader) + + temperture = engine.config['SSL'].get("T", 1) + threshold = engine.config['SSL'].get("threshold", 0.95) + assert engine.iter_per_epoch is not None, "Global.iter_per_epoch need to be set" + threshold = paddle.to_tensor(threshold) + + for iter_id in range(engine.iter_per_epoch): + if iter_id >= engine.iter_per_epoch: + break + + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + + try: + label_data_batch = engine.train_dataloader_iter.next() + except Exception: + engine.train_dataloader_iter = iter(engine.train_dataloader) + label_data_batch = engine.train_dataloader_iter.next() + + try: + unlabel_data_batch = engine.unlabel_train_dataloader_iter.next() + except Exception: + engine.unlabel_train_dataloader_iter = iter(engine.unlabel_train_dataloader) + unlabel_data_batch = engine.unlabel_train_dataloader_iter.next() + + assert len(unlabel_data_batch) in [3, 4] + assert unlabel_data_batch[0].shape == unlabel_data_batch[1].shape == unlabel_data_batch[2].shape + + engine.time_info['reader_cost'].update(time.time() - tic) + batch_size = label_data_batch[0].shape[0] \ + + 
unlabel_data_batch[0].shape[0] \ + + unlabel_data_batch[1].shape[0] \ + + unlabel_data_batch[2].shape[0] + engine.global_step += 1 + + inputs_x, targets_x = label_data_batch + inputs_w, inputs_s1, inputs_s2 = unlabel_data_batch[:3] + batch_size_label = inputs_x.shape[0] + inputs = paddle.concat([inputs_x, inputs_w, inputs_s1, inputs_s2], axis=0) + + loss_dict, logits_label = get_loss(engine, inputs, batch_size_label, + temperture, threshold, targets_x, + ) + loss = loss_dict['loss'] + loss.backward() + + for i in range(len(engine.optimizer)): + engine.optimizer[i].step() + + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], 'by_epoch', False): + engine.lr_sch[i].step() + + for i in range(len(engine.optimizer)): + engine.optimizer[i].clear_grad() + + if engine.ema: + engine.model_ema.update(engine.model) + update_metric(engine, logits_label, label_data_batch, batch_size) + update_loss(engine, loss_dict, batch_size) + engine.time_info['batch_cost'].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, batch_size, epoch_id, iter_id) + + tic = time.time() + + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], 'by_epoch', False): + engine.lr_sch[i].step() + +def get_loss(engine, + inputs, + batch_size_label, + temperture, + threshold, + targets_x, + **kwargs + ): + out = engine.model(inputs) + + logits, feats = out['logits'], out['features'] + feat_w, feat_s1, feat_s2 = feats[batch_size_label:].chunk(3) + feat_x = feats[:batch_size_label] + logits_x = logits[:batch_size_label] + logits_w, logits_s1, logits_s2 = logits[batch_size_label:].chunk(3) + loss_dict_label = engine.train_loss_func(logits_x, targets_x) + probs_u_w = F.softmax(logits_w.detach() / temperture, axis=-1) + max_probs, p_targets_u_w = probs_u_w.max(axis=-1), probs_u_w.argmax(axis=-1) + mask = paddle.greater_equal(max_probs, threshold).astype('float') + + feats = paddle.concat([feat_s1.unsqueeze(1), feat_s2.unsqueeze(1)], axis=1) + batch = {'logits_w': logits_w, + 'logits_s1': logits_s1, + 'p_targets_u_w': p_targets_u_w, + 'mask': mask, + 'max_probs': max_probs, + } + unlabel_loss = engine.unlabel_train_loss_func(feats, batch) + loss_dict = {} + for k, v in loss_dict_label.items(): + if k != 'loss': + loss_dict[k] = v + for k, v in unlabel_loss.items(): + if k != 'loss': + loss_dict[k] = v + loss_dict['loss'] = loss_dict_label['loss'] + unlabel_loss['loss'] + + return loss_dict, logits_x diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_metabin.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_metabin.py new file mode 100644 index 000000000..d1d50f1d4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_metabin.py @@ -0,0 +1,251 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
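Both FixMatch-style branches above turn the weak-augmentation predictions into pseudo labels by confidence thresholding before computing the unlabeled loss. A standalone NumPy sketch of that step (toy logits, threshold 0.95, temperature 1; values are illustrative only):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

logits_u_w = np.array([[4.0, 0.1, 0.2],    # confident prediction -> kept
                       [1.0, 0.9, 0.8]])   # uncertain prediction -> masked out
threshold = 0.95

probs = softmax(logits_u_w / 1.0)              # temperature T = 1
max_probs = probs.max(axis=-1)
pseudo_labels = probs.argmax(axis=-1)
mask = (max_probs >= threshold).astype("float32")

# the per-sample CE on the strong view is multiplied by this mask, so only
# confident pseudo labels contribute to the unlabeled loss
print(pseudo_labels, mask)    # -> [0 0] [1. 0.]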
+ +# reference: https://arxiv.org/abs/2011.14670v2 + +from __future__ import absolute_import, division, print_function + +import time +import paddle +import numpy as np +from collections import defaultdict + +from ppcls.engine.train.utils import update_loss, update_metric, log_info, type_name +from ppcls.utils import profiler +from ppcls.data import build_dataloader +from ppcls.loss import build_loss + + +def train_epoch_metabin(engine, epoch_id, print_batch_step): + tic = time.time() + + if not hasattr(engine, "train_dataloader_iter"): + engine.train_dataloader_iter = iter(engine.train_dataloader) + + if not hasattr(engine, "meta_dataloader"): + engine.meta_dataloader = build_dataloader( + config=engine.config['DataLoader']['Metalearning'], + mode='Train', + device=engine.device) + engine.meta_dataloader_iter = iter(engine.meta_dataloader) + + num_domain = engine.train_dataloader.dataset.num_cams + for iter_id in range(engine.iter_per_epoch): + # fetch data batch from dataloader + try: + train_batch = next(engine.train_dataloader_iter) + except Exception: + engine.train_dataloader_iter = iter(engine.train_dataloader) + train_batch = next(engine.train_dataloader_iter) + + try: + mtrain_batch, mtest_batch = get_meta_data( + engine.meta_dataloader_iter, num_domain) + except Exception: + engine.meta_dataloader_iter = iter(engine.meta_dataloader) + mtrain_batch, mtest_batch = get_meta_data( + engine.meta_dataloader_iter, num_domain) + + profiler.add_profiler_step(engine.config["profiler_options"]) + if iter_id == 5: + for key in engine.time_info: + engine.time_info[key].reset() + engine.time_info["reader_cost"].update(time.time() - tic) + + train_batch_size = train_batch[0].shape[0] + mtrain_batch_size = mtrain_batch[0].shape[0] + mtest_batch_size = mtest_batch[0].shape[0] + if not engine.config["Global"].get("use_multilabel", False): + train_batch[1] = train_batch[1].reshape([train_batch_size, -1]) + mtrain_batch[1] = mtrain_batch[1].reshape([mtrain_batch_size, -1]) + mtest_batch[1] = mtest_batch[1].reshape([mtest_batch_size, -1]) + + engine.global_step += 1 + + if engine.global_step == 1: # update model (execpt gate) to warmup + for i in range(engine.config["Global"]["warmup_iter"] - 1): + out, basic_loss_dict = basic_update(engine, train_batch) + loss_dict = basic_loss_dict + try: + train_batch = next(engine.train_dataloader_iter) + except Exception: + engine.train_dataloader_iter = iter( + engine.train_dataloader) + train_batch = next(engine.train_dataloader_iter) + + out, basic_loss_dict = basic_update(engine=engine, batch=train_batch) + mtrain_loss_dict, mtest_loss_dict = metalearning_update( + engine=engine, mtrain_batch=mtrain_batch, mtest_batch=mtest_batch) + loss_dict = { + ** + {"train_" + key: value + for key, value in basic_loss_dict.items()}, ** { + "mtrain_" + key: value + for key, value in mtrain_loss_dict.items() + }, ** + {"mtest_" + key: value + for key, value in mtest_loss_dict.items()} + } + # step lr (by iter) + for i in range(len(engine.lr_sch)): + if not getattr(engine.lr_sch[i], "by_epoch", False): + engine.lr_sch[i].step() + # update ema + if engine.ema: + engine.model_ema.update(engine.model) + + # below code just for logging + # update metric_for_logger + update_metric(engine, out, train_batch, train_batch_size) + # update_loss_for_logger + update_loss(engine, loss_dict, train_batch_size) + engine.time_info["batch_cost"].update(time.time() - tic) + if iter_id % print_batch_step == 0: + log_info(engine, train_batch_size, epoch_id, iter_id) + tic = time.time() + + # 
step lr(by epoch) + for i in range(len(engine.lr_sch)): + if getattr(engine.lr_sch[i], "by_epoch", False) and \ + type_name(engine.lr_sch[i]) != "ReduceOnPlateau": + engine.lr_sch[i].step() + + +def setup_opt(engine, stage): + assert stage in ["train", "mtrain", "mtest"] + opt = defaultdict() + if stage == "train": + opt["bn_mode"] = "general" + opt["enable_inside_update"] = False + opt["lr_gate"] = 0.0 + elif stage == "mtrain": + opt["bn_mode"] = "hold" + opt["enable_inside_update"] = False + opt["lr_gate"] = 0.0 + elif stage == "mtest": + norm_lr = engine.lr_sch[1].last_lr + cyclic_lr = engine.lr_sch[2].get_lr() + opt["bn_mode"] = "hold" + opt["enable_inside_update"] = True + opt["lr_gate"] = norm_lr * cyclic_lr + for layer in engine.model.backbone.sublayers(): + if type_name(layer) == "MetaBIN": + layer.setup_opt(opt) + engine.model.neck.setup_opt(opt) + + +def reset_opt(model): + for layer in model.backbone.sublayers(): + if type_name(layer) == "MetaBIN": + layer.reset_opt() + model.neck.reset_opt() + + +def get_meta_data(meta_dataloader_iter, num_domain): + """ + fetch data batch from dataloader then divide the batch by domains + """ + list_all = np.random.permutation(num_domain) + list_mtrain = list(list_all[:num_domain // 2]) + batch = next(meta_dataloader_iter) + domain_idx = batch[2] + cnt = 0 + for sample in list_mtrain: + if cnt == 0: + is_mtrain_domain = domain_idx == sample + else: + is_mtrain_domain = paddle.logical_or(is_mtrain_domain, + domain_idx == sample) + cnt += 1 + + # mtrain_batch + if not any(is_mtrain_domain): + mtrain_batch = None + raise RuntimeError + else: + mtrain_batch = [batch[i][is_mtrain_domain] for i in range(len(batch))] + + # mtest_batch + is_mtest_domains = is_mtrain_domain == False + if not any(is_mtest_domains): + mtest_batch = None + raise RuntimeError + else: + mtest_batch = [batch[i][is_mtest_domains] for i in range(len(batch))] + return mtrain_batch, mtest_batch + + +def forward(engine, batch, loss_func): + batch_info = defaultdict() + batch_info = {"label": batch[1], "domain": batch[2]} + + with engine.auto_cast(is_eval=False): + out = engine.model(batch[0], batch[1]) + loss_dict = loss_func(out, batch_info) + + return out, loss_dict + + +def backward(engine, loss, optimizer): + optimizer.clear_grad() + scaled = engine.scaler.scale(loss) + scaled.backward() + + # optimizer.step() with auto amp + engine.scaler.step(optimizer) + engine.scaler.update() + + for name, layer in engine.model.backbone.named_sublayers(): + if "gate" == name.split('.')[-1]: + layer.clip_gate() + + +def basic_update(engine, batch): + setup_opt(engine, "train") + train_loss_func = build_loss(engine.config["Loss"]["Basic"]) + out, train_loss_dict = forward(engine, batch, train_loss_func) + train_loss = train_loss_dict["loss"] + backward(engine, train_loss, engine.optimizer[0]) + engine.optimizer[0].clear_grad() + reset_opt(engine.model) + return out, train_loss_dict + + +def metalearning_update(engine, mtrain_batch, mtest_batch): + # meta train + mtrain_loss_func = build_loss(engine.config["Loss"]["MetaTrain"]) + setup_opt(engine, "mtrain") + + mtrain_batch_info = defaultdict() + mtrain_batch_info = {"label": mtrain_batch[1], "domain": mtrain_batch[2]} + out = engine.model(mtrain_batch[0], mtrain_batch[1]) + mtrain_loss_dict = mtrain_loss_func(out, mtrain_batch_info) + mtrain_loss = mtrain_loss_dict["loss"] + engine.optimizer[1].clear_grad() + mtrain_loss.backward() + + # meta test + mtest_loss_func = build_loss(engine.config["Loss"]["MetaTest"]) + setup_opt(engine, "mtest") 
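# get_meta_data above splits one mini-batch into meta-train / meta-test parts by
# domain (camera) id: half of the domains are sampled for meta-train and the rest
# go to meta-test. A standalone NumPy sketch of that split (toy domain ids;
# np.isin stands in for the explicit logical_or loop in the code above):
import numpy as np

num_domain = 4
domain_idx = np.array([0, 2, 1, 3, 0, 2])           # per-sample domain id in the batch
list_mtrain = np.random.permutation(num_domain)[:num_domain // 2]

is_mtrain = np.isin(domain_idx, list_mtrain)        # samples from meta-train domains
mtrain_samples = np.where(is_mtrain)[0]
mtest_samples = np.where(~is_mtrain)[0]
print(list_mtrain, mtrain_samples, mtest_samples)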
+ + out, mtest_loss_dict = forward(engine, mtest_batch, mtest_loss_func) + engine.optimizer[1].clear_grad() + mtest_loss = mtest_loss_dict["loss"] + backward(engine, mtest_loss, engine.optimizer[1]) + + engine.optimizer[0].clear_grad() + engine.optimizer[1].clear_grad() + reset_opt(engine.model) + + return mtrain_loss_dict, mtest_loss_dict diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_progressive.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_progressive.py new file mode 100644 index 000000000..9999024b9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/train_progressive.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +from ppcls.data import build_dataloader +from ppcls.engine.train.utils import type_name +from ppcls.utils import logger + +from .train import train_epoch + + +def train_epoch_progressive(engine, epoch_id, print_batch_step): + # 1. Build training hyper-parameters for different training stage + num_stage = 4 + ratio_list = [(i + 1) / num_stage for i in range(num_stage)] + stones = [ + int(engine.config["Global"]["epochs"] * ratio_list[i]) + for i in range(num_stage) + ] + stage_id = 0 + for i in range(num_stage): + if epoch_id > stones[i]: + stage_id = i + 1 + + # 2. Adjust training hyper-parameters for different training stage + if not hasattr(engine, 'last_stage') or engine.last_stage < stage_id: + cur_dropout_rate = 0.0 + + def _change_dp_func(m): + global cur_dropout_rate + if type_name(m) == "Head" and hasattr(m, "_dropout"): + m._dropout.p = m.dropout_rate[stage_id] + cur_dropout_rate = m.dropout_rate[stage_id] + + engine.model.apply(_change_dp_func) + + cur_image_size = engine.config["DataLoader"]["Train"]["dataset"][ + "transform_ops"][1]["RandCropImage"]["progress_size"][stage_id] + cur_magnitude = engine.config["DataLoader"]["Train"]["dataset"][ + "transform_ops"][3]["RandAugmentV2"]["progress_magnitude"][ + stage_id] + engine.config["DataLoader"]["Train"]["dataset"]["transform_ops"][1][ + "RandCropImage"]["size"] = cur_image_size + engine.config["DataLoader"]["Train"]["dataset"]["transform_ops"][3][ + "RandAugmentV2"]["magnitude"] = cur_magnitude + engine.train_dataloader = build_dataloader( + engine.config["DataLoader"], + "Train", + engine.device, + engine.use_dali, + seed=epoch_id) + engine.train_dataloader_iter = iter(engine.train_dataloader) + engine.last_stage = stage_id + logger.info(f"Training stage: [{stage_id+1}/{num_stage}](" + f"random_aug_magnitude={cur_magnitude}, " + f"train_image_size={cur_image_size}, " + f"dropout_rate={cur_dropout_rate}" + f")") + + # 3. 
Train one epoch as usual at current stage + train_epoch(engine, epoch_id, print_batch_step) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/utils.py new file mode 100644 index 000000000..1d5c70b2e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/engine/train/utils.py @@ -0,0 +1,94 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function + +import paddle +import datetime +from ppcls.utils import logger +from ppcls.utils.misc import AverageMeter + + +def update_metric(trainer, out, batch, batch_size): + # calc metric + if trainer.train_metric_func is not None: + metric_dict = trainer.train_metric_func(out, batch[-1]) + for key in metric_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update( + float(metric_dict[key]), batch_size) + + +def update_loss(trainer, loss_dict, batch_size): + # update_output_info + for key in loss_dict: + if key not in trainer.output_info: + trainer.output_info[key] = AverageMeter(key, '7.5f') + trainer.output_info[key].update(float(loss_dict[key]), batch_size) + + +def log_info(trainer, batch_size, epoch_id, iter_id): + lr_msg = ", ".join([ + "lr({}): {:.8f}".format(type_name(lr), lr.get_lr()) + for i, lr in enumerate(trainer.lr_sch) + ]) + metric_msg = ", ".join([ + "{}: {:.5f}".format(key, trainer.output_info[key].avg) + for key in trainer.output_info + ]) + time_msg = "s, ".join([ + "{}: {:.5f}".format(key, trainer.time_info[key].avg) + for key in trainer.time_info + ]) + + ips_msg = "ips: {:.5f} samples/s".format( + batch_size / trainer.time_info["batch_cost"].avg) + + global_epochs = trainer.config["Global"]["epochs"] + eta_sec = ( + (trainer.config["Global"]["epochs"] - epoch_id + 1) * + trainer.iter_per_epoch - iter_id) * trainer.time_info["batch_cost"].avg + eta_msg = "eta: {:s}".format(str(datetime.timedelta(seconds=int(eta_sec)))) + max_mem_reserved_msg = "" + max_mem_allocated_msg = "" + max_mem_msg = "" + print_mem_info = trainer.config["Global"].get("print_mem_info", False) + if print_mem_info: + if paddle.device.is_compiled_with_cuda(): + max_mem_reserved_msg = f"max_mem_reserved: {format(paddle.device.cuda.max_memory_reserved() / (1024 ** 2), '.2f')} MB" + max_mem_allocated_msg = f"max_mem_allocated: {format(paddle.device.cuda.max_memory_allocated() / (1024 ** 2), '.2f')} MB" + max_mem_msg = f", {max_mem_reserved_msg}, {max_mem_allocated_msg}" + logger.info( + f"[Train][Epoch {epoch_id}/{global_epochs}][Iter: {iter_id}/{trainer.iter_per_epoch}]{lr_msg}, {metric_msg}, {time_msg}, {ips_msg}, {eta_msg}{max_mem_msg}" + ) + for key in trainer.time_info: + trainer.time_info[key].reset() + + for i, lr in enumerate(trainer.lr_sch): + logger.scaler( + name="lr({})".format(type_name(lr)), + value=lr.get_lr(), + step=trainer.global_step, + 
writer=trainer.vdl_writer) + for key in trainer.output_info: + logger.scaler( + name="train_{}".format(key), + value=trainer.output_info[key].avg, + step=trainer.global_step, + writer=trainer.vdl_writer) + + +def type_name(object: object) -> str: + """get class name of an object""" + return object.__class__.__name__ diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/__init__.py new file mode 100644 index 000000000..7ab8be4fa --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/__init__.py @@ -0,0 +1,91 @@ +import copy + +import paddle +import paddle.nn as nn +from ppcls.utils import logger + +from .celoss import CELoss, MixCELoss +from .googlenetloss import GoogLeNetLoss +from .centerloss import CenterLoss +from .contrasiveloss import ContrastiveLoss +from .contrasiveloss import ContrastiveLoss_XBM +from .emlloss import EmlLoss +from .msmloss import MSMLoss +from .npairsloss import NpairsLoss +from .trihardloss import TriHardLoss +from .triplet import TripletLoss, TripletLossV2 +from .tripletangularmarginloss import TripletAngularMarginLoss, TripletAngularMarginLoss_XBM +from .supconloss import SupConLoss +from .softsuploss import SoftSupConLoss +from .ccssl_loss import CCSSLCELoss +from .pairwisecosface import PairwiseCosface +from .dmlloss import DMLLoss +from .distanceloss import DistanceLoss +from .softtargetceloss import SoftTargetCrossEntropy +from .distillationloss import DistillationCELoss +from .distillationloss import DistillationGTCELoss +from .distillationloss import DistillationDMLLoss +from .distillationloss import DistillationDistanceLoss +from .distillationloss import DistillationRKDLoss +from .distillationloss import DistillationKLDivLoss +from .distillationloss import DistillationDKDLoss +from .distillationloss import DistillationWSLLoss +from .distillationloss import DistillationSKDLoss +from .distillationloss import DistillationMultiLabelLoss +from .distillationloss import DistillationDISTLoss +from .distillationloss import DistillationPairLoss + +from .multilabelloss import MultiLabelLoss, MultiLabelAsymmetricLoss +from .afdloss import AFDLoss + +from .deephashloss import DSHSDLoss +from .deephashloss import LCDSHLoss +from .deephashloss import DCHLoss + +from .metabinloss import CELossForMetaBIN +from .metabinloss import TripletLossForMetaBIN +from .metabinloss import InterDomainShuffleLoss +from .metabinloss import IntraDomainScatterLoss + + +class CombinedLoss(nn.Layer): + def __init__(self, config_list): + super().__init__() + self.loss_func = [] + self.loss_weight = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + name = list(config)[0] + param = config[name] + assert "weight" in param, "weight must be in param, but param just contains {}".format( + param.keys()) + self.loss_weight.append(param.pop("weight")) + self.loss_func.append(eval(name)(**param)) + self.loss_func = nn.LayerList(self.loss_func) + + def __call__(self, input, batch): + loss_dict = {} + # just for accelerate classification traing speed + if len(self.loss_func) == 1: + loss = self.loss_func[0](input, batch) + loss_dict.update(loss) + loss_dict["loss"] = list(loss.values())[0] + else: + for idx, loss_func in enumerate(self.loss_func): + loss = loss_func(input, batch) + weight = self.loss_weight[idx] + loss = {key: loss[key] * weight for key in loss} 
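# CombinedLoss above scales each sub-loss dict by its configured weight and sums
# everything under the "loss" key. A plain-Python sketch of that bookkeeping
# (hypothetical sub-loss values; weights as they would appear in the YAML config):
sub_losses = [({"CELoss": 0.5}, 1.0), ({"TripletLossV2": 0.25}, 0.5)]
loss_dict = {}
for loss, weight in sub_losses:
    loss_dict.update({k: v * weight for k, v in loss.items()})
loss_dict["loss"] = sum(loss_dict.values())
print(loss_dict)   # {'CELoss': 0.5, 'TripletLossV2': 0.125, 'loss': 0.625}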
+ loss_dict.update(loss) + loss_dict["loss"] = paddle.add_n(list(loss_dict.values())) + return loss_dict + + +def build_loss(config): + if config is None: + return None + module_class = CombinedLoss(copy.deepcopy(config)) + logger.debug("build loss {} success.".format(module_class)) + return module_class diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/afdloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/afdloss.py new file mode 100644 index 000000000..e2f457451 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/afdloss.py @@ -0,0 +1,130 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle.nn as nn +import paddle.nn.functional as F +import paddle +import numpy as np +import matplotlib.pyplot as plt +import cv2 +import warnings +warnings.filterwarnings('ignore') + + +class LinearBNReLU(nn.Layer): + def __init__(self, nin, nout): + super().__init__() + self.linear = nn.Linear(nin, nout) + self.bn = nn.BatchNorm1D(nout) + self.relu = nn.ReLU() + + def forward(self, x, relu=True): + if relu: + return self.relu(self.bn(self.linear(x))) + return self.bn(self.linear(x)) + + +def unique_shape(s_shapes): + n_s = [] + unique_shapes = [] + n = -1 + for s_shape in s_shapes: + if s_shape not in unique_shapes: + unique_shapes.append(s_shape) + n += 1 + n_s.append(n) + return n_s, unique_shapes + + +class AFDLoss(nn.Layer): + """ + AFDLoss + https://www.aaai.org/AAAI21Papers/AAAI-9785.JiM.pdf + https://github.com/clovaai/attention-feature-distillation + """ + + def __init__(self, + model_name_pair=["Student", "Teacher"], + student_keys=["bilinear_key", "value"], + teacher_keys=["query", "value"], + s_shapes=[[64, 16, 160], [128, 8, 160], [256, 4, 160], + [512, 2, 160]], + t_shapes=[[640, 48], [320, 96], [160, 192]], + qk_dim=128, + name="loss_afd"): + super().__init__() + assert isinstance(model_name_pair, list) + self.model_name_pair = model_name_pair + self.student_keys = student_keys + self.teacher_keys = teacher_keys + self.s_shapes = [[1] + s_i for s_i in s_shapes] + self.t_shapes = [[1] + t_i for t_i in t_shapes] + self.qk_dim = qk_dim + self.n_t, self.unique_t_shapes = unique_shape(self.t_shapes) + self.attention = Attention(self.qk_dim, self.t_shapes, self.s_shapes, + self.n_t, self.unique_t_shapes) + self.name = name + + def forward(self, predicts, batch): + s_features_dict = predicts[self.model_name_pair[0]] + t_features_dict = predicts[self.model_name_pair[1]] + + g_s = [s_features_dict[key] for key in self.student_keys] + g_t = [t_features_dict[key] for key in self.teacher_keys] + + loss = self.attention(g_s, g_t) + sum_loss = sum(loss) + + loss_dict = dict() + loss_dict[self.name] = sum_loss + + return loss_dict + + +class Attention(nn.Layer): + def __init__(self, qk_dim, t_shapes, s_shapes, n_t, unique_t_shapes): + super().__init__() + self.qk_dim = qk_dim + self.n_t = n_t + + self.p_t = self.create_parameter( + shape=[len(t_shapes), qk_dim], + 
default_initializer=nn.initializer.XavierNormal()) + self.p_s = self.create_parameter( + shape=[len(s_shapes), qk_dim], + default_initializer=nn.initializer.XavierNormal()) + + def forward(self, g_s, g_t): + bilinear_key, h_hat_s_all = g_s + query, h_t_all = g_t + + p_logit = paddle.matmul(self.p_t, self.p_s.t()) + + logit = paddle.add( + paddle.einsum('bstq,btq->bts', bilinear_key, query), + p_logit) / np.sqrt(self.qk_dim) + atts = F.softmax(logit, axis=2) # b x t x s + + loss = [] + + for i, (n, h_t) in enumerate(zip(self.n_t, h_t_all)): + h_hat_s = h_hat_s_all[n] + diff = self.cal_diff(h_hat_s, h_t, atts[:, i]) + loss.append(diff) + return loss + + def cal_diff(self, v_s, v_t, att): + diff = (v_s - v_t.unsqueeze(1)).pow(2).mean(2) + diff = paddle.multiply(diff, att).sum(1).mean() + return diff diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/ccssl_loss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/ccssl_loss.py new file mode 100644 index 000000000..1b3b71d56 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/ccssl_loss.py @@ -0,0 +1,19 @@ +from ppcls.engine.train.train import forward +from .softsuploss import SoftSupConLoss +import copy +import paddle.nn as nn + + +class CCSSLCELoss(nn.Layer): + def __init__(self, **kwargs): + super(CCSSLCELoss, self).__init__() + self.celoss = nn.CrossEntropyLoss(reduction='none') + + def forward(self, inputs, batch, **kwargs): + p_targets_u_w = batch['p_targets_u_w'] + logits_s1 = batch['logits_s1'] + mask = batch['mask'] + loss_u = self.celoss(logits_s1, p_targets_u_w) * mask + loss_u = loss_u.mean() + + return {'CCSSLCELoss': loss_u} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/celoss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/celoss.py new file mode 100644 index 000000000..2715dee19 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/celoss.py @@ -0,0 +1,76 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
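CCSSLCELoss above is a per-sample cross entropy on the strongly augmented view, gated by the confidence mask computed from the weak view. A standalone NumPy sketch of that reduction (toy logits, pseudo labels, and mask, chosen only for illustration):

import numpy as np

def per_sample_ce(logits, labels):
    # cross entropy with reduction='none': one loss value per sample
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels]

logits_s1 = np.array([[2.0, 0.5], [0.2, 0.1]])
p_targets_u_w = np.array([0, 1])       # pseudo labels from the weak view
mask = np.array([1.0, 0.0])            # second sample was below the threshold

loss_u = (per_sample_ce(logits_s1, p_targets_u_w) * mask).mean()
print(loss_u)   # only the first (confident) sample contributes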
+ +import warnings + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.utils import logger + + +class CELoss(nn.Layer): + """ + Cross entropy loss + """ + + def __init__(self, reduction="mean", epsilon=None): + super().__init__() + if epsilon is not None and (epsilon <= 0 or epsilon >= 1): + epsilon = None + self.epsilon = epsilon + assert reduction in ["mean", "sum", "none"] + self.reduction = reduction + + def _labelsmoothing(self, target, class_num): + if len(target.shape) == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + if self.epsilon is not None: + class_num = x.shape[-1] + label = self._labelsmoothing(label, class_num) + x = -F.log_softmax(x, axis=-1) + loss = paddle.sum(x * label, axis=-1) + if self.reduction == 'mean': + loss = loss.mean() + elif self.reduction == 'sum': + loss = loss.sum() + else: + if label.shape[-1] == x.shape[-1]: + label = F.softmax(label, axis=-1) + soft_label = True + else: + soft_label = False + loss = F.cross_entropy( + x, + label=label, + soft_label=soft_label, + reduction=self.reduction) + return {"CELoss": loss} + + +class MixCELoss(object): + def __init__(self, *args, **kwargs): + msg = "\"MixCELos\" is deprecated, please use \"CELoss\" instead." + logger.error(DeprecationWarning(msg)) + raise DeprecationWarning(msg) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/centerloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/centerloss.py new file mode 100644 index 000000000..23a86ee88 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/centerloss.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Dict + +import paddle +import paddle.nn as nn + + +class CenterLoss(nn.Layer): + """Center loss + paper : [A Discriminative Feature Learning Approach for Deep Face Recognition](https://link.springer.com/content/pdf/10.1007%2F978-3-319-46478-7_31.pdf) + code reference: https://github.com/michuanhaohao/reid-strong-baseline/blob/master/layers/center_loss.py#L7 + Args: + num_classes (int): number of classes. + feat_dim (int): number of feature dimensions. 
+ feature_from (str): feature from "backbone" or "features" + """ + + def __init__(self, + num_classes: int, + feat_dim: int, + feature_from: str="features"): + super(CenterLoss, self).__init__() + self.num_classes = num_classes + self.feat_dim = feat_dim + self.feature_from = feature_from + random_init_centers = paddle.randn( + shape=[self.num_classes, self.feat_dim]) + self.centers = self.create_parameter( + shape=(self.num_classes, self.feat_dim), + default_initializer=nn.initializer.Assign(random_init_centers)) + self.add_parameter("centers", self.centers) + + def __call__(self, input: Dict[str, paddle.Tensor], + target: paddle.Tensor) -> Dict[str, paddle.Tensor]: + """compute center loss. + + Args: + input (Dict[str, paddle.Tensor]): {'features': (batch_size, feature_dim), ...}. + target (paddle.Tensor): ground truth label with shape (batch_size, ). + + Returns: + Dict[str, paddle.Tensor]: {'CenterLoss': loss}. + """ + feats = input[self.feature_from] + labels = target + + # squeeze labels to shape (batch_size, ) + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + batch_size = feats.shape[0] + distmat = paddle.pow(feats, 2).sum(axis=1, keepdim=True).expand([batch_size, self.num_classes]) + \ + paddle.pow(self.centers, 2).sum(axis=1, keepdim=True).expand([self.num_classes, batch_size]).t() + distmat = distmat.addmm(x=feats, y=self.centers.t(), beta=1, alpha=-2) + + classes = paddle.arange(self.num_classes).astype(labels.dtype) + labels = labels.unsqueeze(1).expand([batch_size, self.num_classes]) + mask = labels.equal(classes.expand([batch_size, self.num_classes])) + + dist = distmat * mask.astype(feats.dtype) + loss = dist.clip(min=1e-12, max=1e+12).sum() / batch_size + # return loss + return {'CenterLoss': loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/comfunc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/comfunc.py new file mode 100644 index 000000000..277bdd6b5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/comfunc.py @@ -0,0 +1,45 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
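CenterLoss above builds the squared Euclidean distance between features and class centers through the ||a||^2 + ||b||^2 - 2ab^T expansion (the expand and addmm calls). The same computation in plain NumPy on toy shapes, for reference (random toy data, not part of the patch):

import numpy as np

batch_size, feat_dim, num_classes = 3, 4, 5
rng = np.random.default_rng(0)
feats = rng.standard_normal((batch_size, feat_dim))
centers = rng.standard_normal((num_classes, feat_dim))
labels = np.array([0, 2, 2])

# ||f||^2 + ||c||^2 - 2 f.c^T: one row per sample, one column per class
distmat = (feats ** 2).sum(1, keepdims=True) \
        + (centers ** 2).sum(1) \
        - 2.0 * feats @ centers.T

# keep only the distance to each sample's own class center, then average
mask = np.zeros_like(distmat)
mask[np.arange(batch_size), labels] = 1.0
center_loss = (distmat * mask).clip(1e-12, 1e12).sum() / batch_size
print(center_loss)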
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + + +def rerange_index(batch_size, samples_each_class): + tmp = np.arange(0, batch_size * batch_size) + tmp = tmp.reshape(-1, batch_size) + rerange_index = [] + + for i in range(batch_size): + step = i // samples_each_class + start = step * samples_each_class + end = (step + 1) * samples_each_class + + pos_idx = [] + neg_idx = [] + for j, k in enumerate(tmp[i]): + if j >= start and j < end: + if j == i: + pos_idx.insert(0, k) + else: + pos_idx.append(k) + else: + neg_idx.append(k) + rerange_index += (pos_idx + neg_idx) + + rerange_index = np.array(rerange_index).astype(np.int32) + return rerange_index diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/contrasiveloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/contrasiveloss.py new file mode 100644 index 000000000..d27dbe22e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/contrasiveloss.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Dict + +import paddle +import paddle.nn as nn +from ppcls.loss.xbm import CrossBatchMemory + + +class ContrastiveLoss(nn.Layer): + """ContrastiveLoss + + Args: + margin (float): margin + embedding_size (int): number of embedding's dimension + normalize_feature (bool, optional): whether to normalize embedding. Defaults to True. + epsilon (float, optional): epsilon. Defaults to 1e-5. + feature_from (str, optional): which key embedding from input dict. Defaults to "features". 
+ """ + + def __init__(self, + margin: float, + embedding_size: int, + normalize_feature=True, + epsilon: float=1e-5, + feature_from: str="features"): + super(ContrastiveLoss, self).__init__() + self.margin = margin + self.embedding_size = embedding_size + self.normalize_feature = normalize_feature + self.epsilon = epsilon + self.feature_from = feature_from + + def forward(self, input: Dict[str, paddle.Tensor], + target: paddle.Tensor) -> Dict[str, paddle.Tensor]: + feats = input[self.feature_from] + labels = target + + # normalize along feature dim + if self.normalize_feature: + feats = nn.functional.normalize(feats, p=2, axis=1) + + # squeeze labels to shape (batch_size, ) + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + loss = self._compute_loss(feats, target, feats, target) + + return {'ContrastiveLoss': loss} + + def _compute_loss(self, + inputs_q: paddle.Tensor, + targets_q: paddle.Tensor, + inputs_k: paddle.Tensor, + targets_k: paddle.Tensor) -> paddle.Tensor: + batch_size = inputs_q.shape[0] + # Compute similarity matrix + sim_mat = paddle.matmul(inputs_q, inputs_k.t()) + + loss = [] + for i in range(batch_size): + pos_pair_ = paddle.masked_select(sim_mat[i], + targets_q[i] == targets_k) + pos_pair_ = paddle.masked_select(pos_pair_, + pos_pair_ < 1 - self.epsilon) + + neg_pair_ = paddle.masked_select(sim_mat[i], + targets_q[i] != targets_k) + neg_pair = paddle.masked_select(neg_pair_, neg_pair_ > self.margin) + + pos_loss = paddle.sum(-pos_pair_ + 1) + + if len(neg_pair) > 0: + neg_loss = paddle.sum(neg_pair) + else: + neg_loss = 0 + loss.append(pos_loss + neg_loss) + + loss = sum(loss) / batch_size + return loss + + +class ContrastiveLoss_XBM(ContrastiveLoss): + """ContrastiveLoss with CrossBatchMemory + + Args: + xbm_size (int): size of memory bank + xbm_weight (int): weight of CrossBatchMemory's loss + start_iter (int): store embeddings after start_iter + margin (float): margin + embedding_size (int): number of embedding's dimension + epsilon (float, optional): epsilon. Defaults to 1e-5. + normalize_feature (bool, optional): whether to normalize embedding. Defaults to True. + feature_from (str, optional): which key embedding from input dict. Defaults to "features". 
+ """ + + def __init__(self, + xbm_size: int, + xbm_weight: int, + start_iter: int, + margin: float, + embedding_size: int, + epsilon: float=1e-5, + normalize_feature=True, + feature_from: str="features"): + super(ContrastiveLoss_XBM, self).__init__( + margin, embedding_size, normalize_feature, epsilon, feature_from) + self.xbm = CrossBatchMemory(xbm_size, embedding_size) + self.xbm_weight = xbm_weight + self.start_iter = start_iter + self.iter = 0 + + def __call__(self, input: Dict[str, paddle.Tensor], + target: paddle.Tensor) -> Dict[str, paddle.Tensor]: + feats = input[self.feature_from] + labels = target + + # normalize along feature dim + if self.normalize_feature: + feats = nn.functional.normalize(feats, p=2, axis=1) + + # squeeze labels to shape (batch_size, ) + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + loss = self._compute_loss(feats, labels, feats, labels) + + # compute contrastive loss from memory bank + self.iter += 1 + if self.iter > self.start_iter: + self.xbm.enqueue_dequeue(feats.detach(), labels.detach()) + xbm_feats, xbm_labels = self.xbm.get() + xbm_loss = self._compute_loss(feats, labels, xbm_feats, xbm_labels) + loss = loss + self.xbm_weight * xbm_loss + + return {'ContrastiveLoss_XBM': loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/deephashloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/deephashloss.py new file mode 100644 index 000000000..7dda519a8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/deephashloss.py @@ -0,0 +1,149 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
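The _compute_loss shared by both contrastive variants above mines, for each anchor, the positive pairs that are not yet saturated (similarity below 1 - epsilon) and the negative pairs that violate the margin (similarity above margin). A NumPy sketch of that mining for a single anchor (toy embeddings, margin 0.5):

import numpy as np

margin, epsilon = 0.5, 1e-5
feats = np.array([[1.0, 0.0],
                  [0.9, 0.1],
                  [0.0, 1.0]])
feats = feats / np.linalg.norm(feats, axis=1, keepdims=True)   # L2-normalize
labels = np.array([0, 0, 1])

i = 0                                              # anchor index
sim = feats @ feats.T                              # cosine similarities
pos = sim[i][(labels == labels[i]) & (sim[i] < 1 - epsilon)]   # drops the self pair
neg = sim[i][(labels != labels[i]) & (sim[i] > margin)]
anchor_loss = (1 - pos).sum() + (neg.sum() if len(neg) else 0.0)
print(anchor_loss)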
+ +import paddle +import paddle.nn as nn + + +class DSHSDLoss(nn.Layer): + """ + # DSHSD(IEEE ACCESS 2019) + # paper [Deep Supervised Hashing Based on Stable Distribution](https://ieeexplore.ieee.org/document/8648432/) + # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/DSHSD.py + """ + + def __init__(self, alpha, multi_label=False): + super(DSHSDLoss, self).__init__() + self.alpha = alpha + self.multi_label = multi_label + + def forward(self, input, label): + features = input["features"] + logits = input["logits"] + + features_temp1 = paddle.unsqueeze(features, 1) + features_temp2 = paddle.unsqueeze(features, 0) + dist = features_temp1 - features_temp2 + dist = paddle.square(dist) + dist = paddle.sum(dist, axis=2) + + n_class = logits.shape[1] + labels = paddle.nn.functional.one_hot(label, n_class) + labels = labels.squeeze().astype("float32") + + s = paddle.matmul(labels, labels, transpose_y=True) + s = (s == 0).astype("float32") + margin = 2 * features.shape[1] + Ld = (1 - s) / 2 * dist + s / 2 * (margin - dist).clip(min=0) + Ld = Ld.mean() + + if self.multi_label: + Lc_temp = (1 + (-logits).exp()).log() + Lc = (logits - labels * logits + Lc_temp).sum(axis=1) + else: + probs = paddle.nn.functional.softmax(logits) + Lc = (-probs.log() * labels).sum(axis=1) + Lc = Lc.mean() + + loss = Lc + Ld * self.alpha + return {"dshsdloss": loss} + + +class LCDSHLoss(nn.Layer): + """ + # paper [Locality-Constrained Deep Supervised Hashing for Image Retrieval](https://www.ijcai.org/Proceedings/2017/0499.pdf) + # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/LCDSH.py + """ + + def __init__(self, n_class, _lambda): + super(LCDSHLoss, self).__init__() + self._lambda = _lambda + self.n_class = n_class + + def forward(self, input, label): + features = input["features"] + labels = paddle.nn.functional.one_hot(label, self.n_class) + labels = labels.squeeze().astype("float32") + + s = paddle.matmul(labels, labels, transpose_y=True) + s = 2 * (s > 0).astype("float32") - 1 + + inner_product = paddle.matmul(features, features, transpose_y=True) + inner_product = inner_product * 0.5 + inner_product = inner_product.clip(min=-50, max=50) + L1 = paddle.log(1 + paddle.exp(-s * inner_product)) + L1 = L1.mean() + + binary_features = features.sign() + + inner_product_ = paddle.matmul( + binary_features, binary_features, transpose_y=True) + inner_product_ = inner_product_ * 0.5 + sigmoid = paddle.nn.Sigmoid() + L2 = (sigmoid(inner_product) - sigmoid(inner_product_)).pow(2) + L2 = L2.mean() + + loss = L1 + self._lambda * L2 + return {"lcdshloss": loss} + + +class DCHLoss(paddle.nn.Layer): + """ + # paper [Deep Cauchy Hashing for Hamming Space Retrieval] + URL:(http://ise.thss.tsinghua.edu.cn/~mlong/doc/deep-cauchy-hashing-cvpr18.pdf) + # code reference: https://github.com/swuxyj/DeepHash-pytorch/blob/master/DCH.py + """ + + def __init__(self, gamma, _lambda, n_class): + super(DCHLoss, self).__init__() + self.gamma = gamma + self._lambda = _lambda + self.n_class = n_class + + def distance(self, feature_i, feature_j): + assert feature_i.shape[1] == feature_j.shape[ + 1], "feature len of feature_i and feature_j is different, please check whether the featurs are right" + K = feature_i.shape[1] + inner_product = paddle.matmul(feature_i, feature_j, transpose_y=True) + + len_i = feature_i.pow(2).sum(axis=1, keepdim=True).pow(0.5) + len_j = feature_j.pow(2).sum(axis=1, keepdim=True).pow(0.5) + norm = paddle.matmul(len_i, len_j, transpose_y=True) + cos = inner_product / 
norm.clip(min=0.0001) + dist = (1 - cos.clip(max=0.99)) * K / 2 + return dist + + def forward(self, input, label): + features = input["features"] + labels = paddle.nn.functional.one_hot(label, self.n_class) + labels = labels.squeeze().astype("float32") + + s = paddle.matmul(labels, labels, transpose_y=True).astype("float32") + if (1 - s).sum() != 0 and s.sum() != 0: + positive_w = s * s.numel() / s.sum() + negative_w = (1 - s) * s.numel() / (1 - s).sum() + w = positive_w + negative_w + else: + w = 1 + + dist_matric = self.distance(features, features) + cauchy_loss = w * (s * paddle.log(dist_matric / self.gamma) + + paddle.log(1 + self.gamma / dist_matric)) + + all_one = paddle.ones_like(features, dtype="float32") + dist_to_one = self.distance(features.abs(), all_one) + quantization_loss = paddle.log(1 + dist_to_one / self.gamma) + + loss = cauchy_loss.mean() + self._lambda * quantization_loss.mean() + return {"dchloss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dist_loss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dist_loss.py new file mode 100644 index 000000000..78c8e12ff --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dist_loss.py @@ -0,0 +1,52 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def cosine_similarity(a, b, eps=1e-8): + return (a * b).sum(1) / (a.norm(axis=1) * b.norm(axis=1) + eps) + + +def pearson_correlation(a, b, eps=1e-8): + return cosine_similarity(a - a.mean(1).unsqueeze(1), + b - b.mean(1).unsqueeze(1), eps) + + +def inter_class_relation(y_s, y_t): + return 1 - pearson_correlation(y_s, y_t).mean() + + +def intra_class_relation(y_s, y_t): + return inter_class_relation(y_s.transpose([1, 0]), y_t.transpose([1, 0])) + + +class DISTLoss(nn.Layer): + # DISTLoss + # paper [Knowledge Distillation from A Stronger Teacher](https://arxiv.org/pdf/2205.10536v1.pdf) + # code reference: https://github.com/hunto/image_classification_sota/blob/d4f15a0494/lib/models/losses/dist_kd.py + def __init__(self, beta=1.0, gamma=1.0): + super().__init__() + self.beta = beta + self.gamma = gamma + + def forward(self, z_s, z_t): + y_s = F.softmax(z_s, axis=-1) + y_t = F.softmax(z_t, axis=-1) + inter_loss = inter_class_relation(y_s, y_t) + intra_loss = intra_class_relation(y_s, y_t) + kd_loss = self.beta * inter_loss + self.gamma * intra_loss + return kd_loss diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distanceloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distanceloss.py new file mode 100644 index 000000000..0a09f0cb2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distanceloss.py @@ -0,0 +1,43 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. 
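# DISTLoss above replaces pointwise KL with Pearson-correlation relations between
# the student and teacher probability matrices. A standalone NumPy sketch of the
# inter-class term (1 minus the mean per-sample correlation) with toy logits; the
# intra-class term applies the same formula to the transposed matrices:
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def pearson(a, b, eps=1e-8):
    a = a - a.mean(axis=1, keepdims=True)
    b = b - b.mean(axis=1, keepdims=True)
    return (a * b).sum(1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1) + eps)

y_s = softmax(np.array([[2.0, 0.5, 0.1], [0.1, 1.5, 0.3]]))   # student probabilities
y_t = softmax(np.array([[1.8, 0.6, 0.2], [0.2, 1.4, 0.4]]))   # teacher probabilities
inter_loss = 1 - pearson(y_s, y_t).mean()
print(inter_loss)   # small value: the two predictions are already well correlated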
+#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.nn import L1Loss +from paddle.nn import MSELoss as L2Loss +from paddle.nn import SmoothL1Loss + + +class DistanceLoss(nn.Layer): + """ + DistanceLoss: + mode: loss mode + """ + + def __init__(self, mode="l2", **kargs): + super().__init__() + assert mode in ["l1", "l2", "smooth_l1"] + if mode == "l1": + self.loss_func = nn.L1Loss(**kargs) + elif mode == "l2": + self.loss_func = nn.MSELoss(**kargs) + elif mode == "smooth_l1": + self.loss_func = nn.SmoothL1Loss(**kargs) + self.mode = mode + + def forward(self, x, y): + loss = self.loss_func(x, y) + return {"loss_{}".format(self.mode): loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distillationloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distillationloss.py new file mode 100644 index 000000000..6ccbbb840 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/distillationloss.py @@ -0,0 +1,426 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
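The DistanceLoss wrapper in distanceloss.py above simply dispatches to paddle.nn's L1/MSE/SmoothL1 losses and returns the value under a mode-derived key. A minimal usage sketch (assumptions: the module is importable as ppcls.loss.distanceloss; shapes are arbitrary):

import paddle
from ppcls.loss.distanceloss import DistanceLoss  # assumed import path

student_feat = paddle.rand([8, 128])   # e.g. a batch of student features
teacher_feat = paddle.rand([8, 128])   # matching teacher features

loss_fn = DistanceLoss(mode="l2")      # wraps paddle.nn.MSELoss
out = loss_fn(student_feat, teacher_feat)
print(out)                             # {'loss_l2': Tensor(...)}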
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .celoss import CELoss +from .dmlloss import DMLLoss +from .distanceloss import DistanceLoss +from .rkdloss import RKdAngle, RkdDistance +from .kldivloss import KLDivLoss +from .dkdloss import DKDLoss +from .wslloss import WSLLoss +from .dist_loss import DISTLoss +from .multilabelloss import MultiLabelLoss +from .mgd_loss import MGDLoss +from .skdloss import SKDLoss +from .pefdloss import PEFDLoss + + +class DistillationCELoss(CELoss): + """ + DistillationCELoss + """ + + def __init__(self, + model_name_pairs=[], + epsilon=None, + key=None, + name="loss_ce"): + super().__init__(epsilon=epsilon) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(key, pair[0], pair[1])] = loss[key] + return loss_dict + + +class DistillationGTCELoss(CELoss): + """ + DistillationGTCELoss + """ + + def __init__(self, + model_names=[], + epsilon=None, + key=None, + name="loss_gt_ce"): + super().__init__(epsilon=epsilon) + assert isinstance(model_names, list) + self.key = key + self.model_names = model_names + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for name in self.model_names: + out = predicts[name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + for key in loss: + loss_dict["{}_{}".format(key, name)] = loss[key] + return loss_dict + + +class DistillationDMLLoss(DMLLoss): + """ + """ + + def __init__(self, + model_name_pairs=[], + act="softmax", + weight_ratio=False, + sum_across_class_dim=False, + key=None, + name="loss_dml"): + super().__init__(act=act, sum_across_class_dim=sum_across_class_dim) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + self.weight_ratio = weight_ratio + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + if self.weight_ratio is True: + loss = super().forward(out1, out2, batch) + else: + loss = super().forward(out1, out2) + if isinstance(loss, dict): + for key in loss: + loss_dict["{}_{}_{}_{}".format(key, pair[0], pair[1], + idx)] = loss[key] + else: + loss_dict["{}_{}".format(self.name, idx)] = loss + return loss_dict + + +class DistillationDistanceLoss(DistanceLoss): + """ + """ + + def __init__(self, + mode="l2", + model_name_pairs=[], + act=None, + key=None, + name="loss_", + **kargs): + super().__init__(mode=mode, **kargs) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + mode + assert act in [None, "sigmoid", "softmax"] + if act == "sigmoid": + self.act = nn.Sigmoid() + elif act == "softmax": + self.act = nn.Softmax(axis=-1) + else: + self.act = None + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + 
if self.act is not None: + out1 = self.act(out1) + out2 = self.act(out2) + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(self.name, key, idx)] = loss[key] + return loss_dict + + +class DistillationRKDLoss(nn.Layer): + def __init__(self, + target_size=None, + model_name_pairs=(["Student", "Teacher"], ), + student_keepkeys=[], + teacher_keepkeys=[]): + super().__init__() + self.student_keepkeys = student_keepkeys + self.teacher_keepkeys = teacher_keepkeys + self.model_name_pairs = model_name_pairs + assert len(self.student_keepkeys) == len(self.teacher_keepkeys) + + self.rkd_angle_loss = RKdAngle(target_size=target_size) + self.rkd_dist_loss = RkdDistance(target_size=target_size) + + def __call__(self, predicts, batch): + loss_dict = {} + for m1, m2 in self.model_name_pairs: + for idx, ( + student_name, teacher_name + ) in enumerate(zip(self.student_keepkeys, self.teacher_keepkeys)): + student_out = predicts[m1][student_name] + teacher_out = predicts[m2][teacher_name] + + loss_dict[f"loss_angle_{idx}_{m1}_{m2}"] = self.rkd_angle_loss( + student_out, teacher_out) + loss_dict[f"loss_dist_{idx}_{m1}_{m2}"] = self.rkd_dist_loss( + student_out, teacher_out) + + return loss_dict + + +class DistillationKLDivLoss(KLDivLoss): + """ + DistillationKLDivLoss + """ + + def __init__(self, + model_name_pairs=[], + temperature=4, + key=None, + name="loss_kl"): + super().__init__(temperature=temperature) + assert isinstance(model_name_pairs, list) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + for key in loss: + loss_dict["{}_{}_{}".format(key, pair[0], pair[1])] = loss[key] + return loss_dict + + +class DistillationDKDLoss(DKDLoss): + """ + DistillationDKDLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + temperature=1.0, + alpha=1.0, + beta=1.0, + use_target_as_gt=False, + name="loss_dkd"): + super().__init__( + temperature=temperature, + alpha=alpha, + beta=beta, + use_target_as_gt=use_target_as_gt) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2, batch) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationWSLLoss(WSLLoss): + """ + DistillationWSLLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + temperature=2.0, + name="wsl_loss"): + super().__init__(temperature) + self.model_name_pairs = model_name_pairs + self.key = key + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2, batch) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationSKDLoss(SKDLoss): + """ + DistillationSKDLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + temperature=1.0, + multiplier=2.0, + 
alpha=0.9, + use_target_as_gt=False, + name="skd_loss"): + super().__init__(temperature, multiplier, alpha, use_target_as_gt) + self.model_name_pairs = model_name_pairs + self.key = key + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2, batch) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationMultiLabelLoss(MultiLabelLoss): + """ + DistillationMultiLabelLoss + """ + + def __init__(self, + model_names=[], + epsilon=None, + size_sum=False, + weight_ratio=False, + key=None, + name="loss_mll"): + super().__init__( + epsilon=epsilon, size_sum=size_sum, weight_ratio=weight_ratio) + assert isinstance(model_names, list) + self.key = key + self.model_names = model_names + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for name in self.model_names: + out = predicts[name] + if self.key is not None: + out = out[self.key] + loss = super().forward(out, batch) + for key in loss: + loss_dict["{}_{}".format(key, name)] = loss[key] + return loss_dict + + +class DistillationDISTLoss(DISTLoss): + """ + DistillationDISTLoss + """ + + def __init__(self, + model_name_pairs=[], + key=None, + beta=1.0, + gamma=1.0, + name="loss_dist"): + super().__init__(beta=beta, gamma=gamma) + self.key = key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if self.key is not None: + out1 = out1[self.key] + out2 = out2[self.key] + loss = super().forward(out1, out2) + loss_dict[f"{self.name}_{pair[0]}_{pair[1]}"] = loss + return loss_dict + + +class DistillationPairLoss(nn.Layer): + """ + DistillationPairLoss + """ + + def __init__(self, + base_loss_name, + model_name_pairs=[], + s_key=None, + t_key=None, + name="loss", + **kwargs): + super().__init__() + self.loss_func = eval(base_loss_name)(**kwargs) + assert type(s_key) == type(t_key) + self.s_key = s_key + self.t_key = t_key + self.model_name_pairs = model_name_pairs + self.name = name + + def forward(self, predicts, batch): + loss_dict = dict() + for idx, pair in enumerate(self.model_name_pairs): + out1 = predicts[pair[0]] + out2 = predicts[pair[1]] + if isinstance(self.s_key, str): + out1 = out1[self.s_key] + out2 = out2[self.t_key] + else: + out1 = [out1[k] if k is not None else out1 for k in self.s_key] + out2 = [out2[k] if k is not None else out2 for k in self.t_key] + + loss = self.loss_func.forward(out1, out2) + if isinstance(loss, dict): + for k in loss: + loss_dict[ + f"{self.name}_{idx}_{pair[0]}_{pair[1]}_{k}"] = loss[k] + else: + loss_dict[f"{self.name}_{idx}_{pair[0]}_{pair[1]}"] = loss + return loss_dict diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dkdloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dkdloss.py new file mode 100644 index 000000000..bf9224e31 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dkdloss.py @@ -0,0 +1,68 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class DKDLoss(nn.Layer): + """ + DKDLoss + Reference: https://arxiv.org/abs/2203.08679 + Code was heavily based on https://github.com/megvii-research/mdistiller + """ + + def __init__(self, + 
temperature=1.0, + alpha=1.0, + beta=1.0, + use_target_as_gt=False): + super().__init__() + self.temperature = temperature + self.alpha = alpha + self.beta = beta + self.use_target_as_gt = use_target_as_gt + + def forward(self, logits_student, logits_teacher, target=None): + if target is None or self.use_target_as_gt: + target = logits_teacher.argmax(axis=-1) + gt_mask = _get_gt_mask(logits_student, target) + other_mask = 1 - gt_mask + pred_student = F.softmax(logits_student / self.temperature, axis=1) + pred_teacher = F.softmax(logits_teacher / self.temperature, axis=1) + pred_student = cat_mask(pred_student, gt_mask, other_mask) + pred_teacher = cat_mask(pred_teacher, gt_mask, other_mask) + log_pred_student = paddle.log(pred_student) + tckd_loss = (F.kl_div( + log_pred_student, pred_teacher, + reduction='sum') * (self.temperature**2) / target.shape[0]) + pred_teacher_part2 = F.softmax( + logits_teacher / self.temperature - 1000.0 * gt_mask, axis=1) + log_pred_student_part2 = F.log_softmax( + logits_student / self.temperature - 1000.0 * gt_mask, axis=1) + nckd_loss = (F.kl_div( + log_pred_student_part2, pred_teacher_part2, + reduction='sum') * (self.temperature**2) / target.shape[0]) + return self.alpha * tckd_loss + self.beta * nckd_loss + + +def _get_gt_mask(logits, target): + target = target.reshape([-1]).unsqueeze(1) + updates = paddle.ones_like(target) + mask = scatter( + paddle.zeros_like(logits), target, updates.astype('float32')) + return mask + + +def cat_mask(t, mask1, mask2): + t1 = (t * mask1).sum(axis=1, keepdim=True) + t2 = (t * mask2).sum(axis=1, keepdim=True) + rt = paddle.concat([t1, t2], axis=1) + return rt + + +def scatter(x, index, updates): + i, j = index.shape + grid_x, grid_y = paddle.meshgrid(paddle.arange(i), paddle.arange(j)) + index = paddle.stack([grid_x.flatten(), index.flatten()], axis=1) + updates_index = paddle.stack([grid_x.flatten(), grid_y.flatten()], axis=1) + updates = paddle.gather_nd(updates, index=updates_index) + return paddle.scatter_nd_add(x, index, updates) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dmlloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dmlloss.py new file mode 100644 index 000000000..e8983ed08 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/dmlloss.py @@ -0,0 +1,62 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
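The DKDLoss in dkdloss.py above splits the distillation KL term into a target-class part (TCKD) and a non-target-class part (NCKD), weighted by alpha and beta. A small sketch of how it might be called (the import path, batch shape, and hyperparameter values are illustrative assumptions, not taken from the patch):

import paddle
from ppcls.loss.dkdloss import DKDLoss  # assumed import path

logits_student = paddle.rand([4, 10])
logits_teacher = paddle.rand([4, 10])
target = paddle.randint(0, 10, [4])     # ground-truth class ids

dkd = DKDLoss(temperature=4.0, alpha=1.0, beta=8.0)  # illustrative values
loss = dkd(logits_student, logits_teacher, target)
print(float(loss))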
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.loss.multilabelloss import ratio2weight + + +class DMLLoss(nn.Layer): + """ + DMLLoss + """ + + def __init__(self, act="softmax", sum_across_class_dim=False, eps=1e-12): + super().__init__() + if act is not None: + assert act in ["softmax", "sigmoid"] + if act == "softmax": + self.act = nn.Softmax(axis=-1) + elif act == "sigmoid": + self.act = nn.Sigmoid() + else: + self.act = None + self.eps = eps + self.sum_across_class_dim = sum_across_class_dim + + def _kldiv(self, x, target): + class_num = x.shape[-1] + cost = target * paddle.log( + (target + self.eps) / (x + self.eps)) * class_num + return cost + + def forward(self, x, target, gt_label=None): + if self.act is not None: + x = self.act(x) + target = self.act(target) + loss = self._kldiv(x, target) + self._kldiv(target, x) + loss = loss / 2 + + # for multi-label dml loss + if gt_label is not None: + gt_label, label_ratio = gt_label[:, 0, :], gt_label[:, 1, :] + targets_mask = paddle.cast(gt_label > 0.5, 'float32') + weight = ratio2weight(targets_mask, paddle.to_tensor(label_ratio)) + weight = weight * (gt_label > -1) + loss = loss * weight + + loss = loss.sum(1).mean() if self.sum_across_class_dim else loss.mean() + return {"DMLLoss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/emlloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/emlloss.py new file mode 100644 index 000000000..38b707fe1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/emlloss.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
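The DMLLoss in dmlloss.py above is symmetric: it averages the KL divergence in both directions between two sets of logits, optionally after a softmax or sigmoid activation. A minimal sketch, assuming the import path and using arbitrary shapes:

import paddle
from ppcls.loss.dmlloss import DMLLoss  # assumed import path

logits_a = paddle.rand([4, 10])   # logits of model A
logits_b = paddle.rand([4, 10])   # logits of its peer (or teacher) model B

dml = DMLLoss(act="softmax")
out = dml(logits_a, logits_b)
print(out["DMLLoss"])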
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import numpy as np +from .comfunc import rerange_index + + +class EmlLoss(paddle.nn.Layer): + """Ensemble Metric Learning Loss + paper: [Large Scale Strongly Supervised Ensemble Metric Learning, with Applications to Face Verification and Retrieval](https://arxiv.org/pdf/1212.6094.pdf) + code reference: https://github.com/PaddlePaddle/models/blob/develop/PaddleCV/metric_learning/losses/emlloss.py + """ + + def __init__(self, batch_size=40, samples_each_class=2): + super(EmlLoss, self).__init__() + assert (batch_size % samples_each_class == 0) + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + self.thresh = 20.0 + self.beta = 100000 + + def surrogate_function(self, beta, theta, bias): + x = theta * paddle.exp(bias) + output = paddle.log(1 + beta * x) / math.log(1 + beta) + return output + + def surrogate_function_approximate(self, beta, theta, bias): + output = ( + paddle.log(theta) + bias + math.log(beta)) / math.log(1 + beta) + return output + + def surrogate_function_stable(self, beta, theta, target, thresh): + max_gap = paddle.to_tensor(thresh, dtype='float32') + max_gap.stop_gradient = True + + target_max = paddle.maximum(target, max_gap) + target_min = paddle.minimum(target, max_gap) + + loss1 = self.surrogate_function(beta, theta, target_min) + loss2 = self.surrogate_function_approximate(beta, theta, target_max) + bias = self.surrogate_function(beta, theta, max_gap) + loss = loss1 + loss2 - bias + return loss + + def forward(self, input, target=None): + features = input["features"] + samples_each_class = self.samples_each_class + batch_size = self.batch_size + rerange_index = self.rerange_index + + #calc distance + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + rerange_index = paddle.to_tensor(rerange_index) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, shape=[-1, batch_size]) + + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[ + 1, samples_each_class - 1, batch_size - samples_each_class + ], + axis=1) + ignore.stop_gradient = True + + pos_max = paddle.max(pos, axis=1, keepdim=True) + pos = paddle.exp(pos - pos_max) + pos_mean = paddle.mean(pos, axis=1, keepdim=True) + + neg_min = paddle.min(neg, axis=1, keepdim=True) + neg = paddle.exp(neg_min - neg) + neg_mean = paddle.mean(neg, axis=1, keepdim=True) + + bias = pos_max - neg_min + theta = paddle.multiply(neg_mean, pos_mean) + + loss = self.surrogate_function_stable(self.beta, theta, bias, + self.thresh) + loss = paddle.mean(loss) + return {"emlloss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/googlenetloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/googlenetloss.py new file mode 100644 index 000000000..491311831 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/googlenetloss.py @@ -0,0 +1,43 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class GoogLeNetLoss(nn.Layer):
+    """
+    Cross entropy loss used after GoogLeNet
+    reference paper: [Going Deeper with Convolutions](https://arxiv.org/pdf/1409.4842v1.pdf)
+    """
+
+    def __init__(self, epsilon=None):
+        super().__init__()
+        assert (epsilon is None or epsilon <= 0 or
+                epsilon >= 1), "GoogLeNetLoss does not support label_smooth"
+
+    def forward(self, inputs, label):
+        input0, input1, input2 = inputs
+        if isinstance(input0, dict):
+            input0 = input0["logits"]
+        if isinstance(input1, dict):
+            input1 = input1["logits"]
+        if isinstance(input2, dict):
+            input2 = input2["logits"]
+
+        loss0 = F.cross_entropy(input0, label=label, soft_label=False)
+        loss1 = F.cross_entropy(input1, label=label, soft_label=False)
+        loss2 = F.cross_entropy(input2, label=label, soft_label=False)
+        loss = loss0 + 0.3 * loss1 + 0.3 * loss2
+        loss = loss.mean()
+        return {"GooleNetLoss": loss}
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/kldivloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/kldivloss.py
new file mode 100644
index 000000000..da6ab02fb
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/kldivloss.py
@@ -0,0 +1,33 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class KLDivLoss(nn.Layer):
+    """
+    Distilling the Knowledge in a Neural Network
+    """
+
+    def __init__(self, temperature=4):
+        super(KLDivLoss, self).__init__()
+        self.T = temperature
+
+    def forward(self, y_s, y_t):
+        p_s = F.log_softmax(y_s / self.T, axis=1)
+        p_t = F.softmax(y_t / self.T, axis=1)
+        loss = F.kl_div(p_s, p_t, reduction='sum') * (self.T**2) / y_s.shape[0]
+        return {"loss_kldiv": loss}
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/metabinloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/metabinloss.py
new file mode 100644
index 000000000..34159bdcd
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/metabinloss.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# reference: https://arxiv.org/abs/2011.14670 + +import copy +import numpy as np +import paddle +from paddle import nn +from paddle.nn import functional as F + +from .dist_loss import cosine_similarity +from .celoss import CELoss + + +def euclidean_dist(x, y): + m, n = x.shape[0], y.shape[0] + xx = paddle.pow(x, 2).sum(1, keepdim=True).expand([m, n]) + yy = paddle.pow(y, 2).sum(1, keepdim=True).expand([n, m]).t() + dist = xx + yy - 2 * paddle.matmul(x, y.t()) + dist = dist.clip(min=1e-12).sqrt() # for numerical stability + return dist + + +def hard_example_mining(dist_mat, is_pos, is_neg): + """For each anchor, find the hardest positive and negative sample. + Args: + dist_mat: pairwise distance between samples, shape [N, M] + is_pos: positive index with shape [N, M] + is_neg: negative index with shape [N, M] + Returns: + dist_ap: distance(anchor, positive); shape [N, 1] + dist_an: distance(anchor, negative); shape [N, 1] + """ + + inf = float("inf") + + def _masked_max(tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + neg_inf = paddle.zeros_like(tensor) + neg_inf.stop_gradient = True + neg_inf[paddle.logical_not(mask)] = -inf + return paddle.max(masked + neg_inf, axis=axis, keepdim=True) + + def _masked_min(tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + pos_inf = paddle.zeros_like(tensor) + pos_inf.stop_gradient = True + pos_inf[paddle.logical_not(mask)] = inf + return paddle.min(masked + pos_inf, axis=axis, keepdim=True) + + assert len(dist_mat.shape) == 2 + dist_ap = _masked_max(dist_mat, is_pos, axis=1) + dist_an = _masked_min(dist_mat, is_neg, axis=1) + return dist_ap, dist_an + + +class IntraDomainScatterLoss(nn.Layer): + """ + IntraDomainScatterLoss + + enhance intra-domain diversity and disarrange inter-domain distributions like confusing multiple styles. + + reference: https://arxiv.org/abs/2011.14670 + """ + + def __init__(self, normalize_feature, feature_from): + super(IntraDomainScatterLoss, self).__init__() + self.normalize_feature = normalize_feature + self.feature_from = feature_from + + def forward(self, input, batch): + domains = batch["domain"] + inputs = input[self.feature_from] + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + unique_label = paddle.unique(domains) + features_per_domain = list() + for i, x in enumerate(unique_label): + features_per_domain.append(inputs[x == domains]) + num_domain = len(features_per_domain) + losses = [] + for i in range(num_domain): + features_in_same_domain = features_per_domain[i] + center = paddle.mean(features_in_same_domain, 0) + cos_sim = cosine_similarity( + center.unsqueeze(0), features_in_same_domain) + losses.append(paddle.mean(cos_sim)) + loss = paddle.mean(paddle.stack(losses)) + return {"IntraDomainScatterLoss": loss} + + +class InterDomainShuffleLoss(nn.Layer): + """ + InterDomainShuffleLoss + + pull the negative sample of the interdomain and push the negative sample of the intra-domain, + so that the inter-domain distributions are shuffled. 
+ + reference: https://arxiv.org/abs/2011.14670 + """ + + def __init__(self, normalize_feature=True, feature_from="features"): + super(InterDomainShuffleLoss, self).__init__() + self.feature_from = feature_from + self.normalize_feature = normalize_feature + + def forward(self, input, batch): + target = batch["label"] + domains = batch["domain"] + inputs = input[self.feature_from] + bs = inputs.shape[0] + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + # compute distance + dist_mat = euclidean_dist(inputs, inputs) + + is_same_img = np.zeros(shape=[bs, bs], dtype=bool) + np.fill_diagonal(is_same_img, True) + is_same_img = paddle.to_tensor(is_same_img) + is_diff_instance = target.reshape([bs, 1]).expand([bs, bs])\ + .not_equal(target.reshape([bs, 1]).expand([bs, bs]).t()) + is_same_domain = domains.reshape([bs, 1]).expand([bs, bs])\ + .equal(domains.reshape([bs, 1]).expand([bs, bs]).t()) + is_diff_domain = is_same_domain == False + + is_pos = paddle.logical_or(is_same_img, is_diff_domain) + is_neg = paddle.logical_and(is_diff_instance, is_same_domain) + + dist_ap, dist_an = hard_example_mining(dist_mat, is_pos, is_neg) + + y = paddle.ones_like(dist_an) + loss = F.soft_margin_loss(dist_an - dist_ap, y) + if loss == float('Inf'): + loss = F.margin_ranking_loss(dist_an, dist_ap, y, margin=0.3) + return {"InterDomainShuffleLoss": loss} + + +class CELossForMetaBIN(CELoss): + def _labelsmoothing(self, target, class_num): + if len(target.shape) == 1 or target.shape[-1] != class_num: + one_hot_target = F.one_hot(target, class_num) + else: + one_hot_target = target + # epsilon is different from the one in original CELoss + epsilon = class_num / (class_num - 1) * self.epsilon + soft_target = F.label_smooth(one_hot_target, epsilon=epsilon) + soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) + return soft_target + + def forward(self, x, batch): + label = batch["label"] + return super().forward(x, label) + + +class TripletLossForMetaBIN(nn.Layer): + def __init__(self, + margin=1, + normalize_feature=False, + feature_from="feature"): + super(TripletLossForMetaBIN, self).__init__() + self.margin = margin + self.feature_from = feature_from + self.normalize_feature = normalize_feature + + def forward(self, input, batch): + inputs = input[self.feature_from] + targets = batch["label"] + bs = inputs.shape[0] + all_targets = targets + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + dist_mat = euclidean_dist(inputs, inputs) + + is_pos = all_targets.reshape([bs, 1]).expand([bs, bs]).equal( + all_targets.reshape([bs, 1]).expand([bs, bs]).t()) + is_neg = all_targets.reshape([bs, 1]).expand([bs, bs]).not_equal( + all_targets.reshape([bs, 1]).expand([bs, bs]).t()) + dist_ap, dist_an = hard_example_mining(dist_mat, is_pos, is_neg) + + y = paddle.ones_like(dist_an) + loss = F.margin_ranking_loss(dist_an, dist_ap, y, margin=self.margin) + return {"TripletLoss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/mgd_loss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/mgd_loss.py new file mode 100644 index 000000000..799a91431 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/mgd_loss.py @@ -0,0 +1,84 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from ppcls.utils.initializer import kaiming_normal_ + + +class MGDLoss(nn.Layer): + """Paddle version of `Masked Generative Distillation` + MGDLoss + Reference: https://arxiv.org/abs/2205.01529 + Code was heavily based on https://github.com/yzd-v/MGD + """ + + def __init__( + self, + student_channels, + teacher_channels, + alpha_mgd=1.756, + lambda_mgd=0.15, ): + super().__init__() + self.alpha_mgd = alpha_mgd + self.lambda_mgd = lambda_mgd + + if student_channels != teacher_channels: + self.align = nn.Conv2D( + student_channels, + teacher_channels, + kernel_size=1, + stride=1, + padding=0) + else: + self.align = None + + self.generation = nn.Sequential( + nn.Conv2D( + teacher_channels, teacher_channels, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2D( + teacher_channels, teacher_channels, kernel_size=3, padding=1)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Conv2D): + kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") + + def forward(self, pred_s, pred_t): + """Forward function. + Args: + pred_s(Tensor): Bs*C*H*W, student's feature map + pred_t(Tensor): Bs*C*H*W, teacher's feature map + """ + assert pred_s.shape[-2:] == pred_t.shape[-2:] + + if self.align is not None: + pred_s = self.align(pred_s) + + loss = self.get_dis_loss(pred_s, pred_t) * self.alpha_mgd + + return loss + + def get_dis_loss(self, pred_s, pred_t): + loss_mse = nn.MSELoss(reduction='mean') + N, C, _, _ = pred_t.shape + mat = paddle.rand([N, C, 1, 1]) + mat = paddle.where(mat < self.lambda_mgd, 0, 1).astype("float32") + masked_fea = paddle.multiply(pred_s, mat) + new_fea = self.generation(masked_fea) + dis_loss = loss_mse(new_fea, pred_t) + return dis_loss diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/msmloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/msmloss.py new file mode 100644 index 000000000..adf03ef8e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/msmloss.py @@ -0,0 +1,80 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
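The MGDLoss in mgd_loss.py above randomly masks the (optionally channel-aligned) student feature map and asks a small generation head to reconstruct the teacher feature map, penalizing the MSE. A usage sketch under assumed shapes and import path; the channel counts are arbitrary:

import paddle
from ppcls.loss.mgd_loss import MGDLoss  # assumed import path

feat_s = paddle.rand([2, 256, 7, 7])   # student feature map (N, C, H, W)
feat_t = paddle.rand([2, 512, 7, 7])   # teacher feature map with more channels

mgd = MGDLoss(student_channels=256, teacher_channels=512)
loss = mgd(feat_s, feat_t)             # scalar tensor, already scaled by alpha_mgd
print(float(loss))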
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle +from .comfunc import rerange_index + + +class MSMLoss(paddle.nn.Layer): + """ + paper : [Margin Sample Mining Loss: A Deep Learning Based Method for Person Re-identification](https://arxiv.org/pdf/1710.00478.pdf) + code reference: https://github.com/michuanhaohao/keras_reid/blob/master/reid_tripletcls.py + Margin Sample Mining Loss, based on triplet loss. USE P * K samples. + the batch size is fixed. Batch_size = P * K; but the K may vary between batches. + same label gather together + + supported_metrics = [ + 'euclidean', + 'sqeuclidean', + 'cityblock', + ] + only consider samples_each_class = 2 + """ + + def __init__(self, batch_size=120, samples_each_class=2, margin=0.1): + super(MSMLoss, self).__init__() + self.margin = margin + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + + def forward(self, input, target=None): + #normalization + features = input["features"] + features = self._nomalize(features) + samples_each_class = self.samples_each_class + rerange_index = paddle.to_tensor(self.rerange_index) + + #calc sm + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + #rerange + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size]) + + #split + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[1, samples_each_class - 1, -1], + axis=1) + ignore.stop_gradient = True + + hard_pos = paddle.max(pos) + hard_neg = paddle.min(neg) + + loss = hard_pos + self.margin - hard_neg + loss = paddle.nn.ReLU()(loss) + return {"msmloss": loss} + + def _nomalize(self, input): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + return paddle.divide(input, input_norm) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/multilabelloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/multilabelloss.py new file mode 100644 index 000000000..51ae97838 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/multilabelloss.py @@ -0,0 +1,119 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def ratio2weight(targets, ratio): + pos_weights = targets * (1. - ratio) + neg_weights = (1. 
- targets) * ratio
+    weights = paddle.exp(neg_weights + pos_weights)
+
+    # for the RAP dataloader, target elements may be 2 (with or without smoothing), i.e. greater than 1
+    weights = weights - weights * (targets > 1).astype(weights.dtype)
+
+    return weights
+
+
+class MultiLabelLoss(nn.Layer):
+    """
+    Multi-label loss
+    """
+
+    def __init__(self, epsilon=None, size_sum=False, weight_ratio=False):
+        super().__init__()
+        if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
+            epsilon = None
+        self.epsilon = epsilon
+        self.weight_ratio = weight_ratio
+        self.size_sum = size_sum
+
+    def _labelsmoothing(self, target, class_num):
+        if target.ndim == 1 or target.shape[-1] != class_num:
+            one_hot_target = F.one_hot(target, class_num)
+        else:
+            one_hot_target = target
+        soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)
+        soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
+        return soft_target
+
+    def _binary_crossentropy(self, input, target, class_num):
+        if self.weight_ratio:
+            target, label_ratio = target[:, 0, :], target[:, 1, :]
+        elif target.ndim == 3:
+            target = target[:, 0, :]
+        if self.epsilon is not None:
+            target = self._labelsmoothing(target, class_num)
+        cost = F.binary_cross_entropy_with_logits(
+            logit=input, label=target, reduction='none')
+
+        if self.weight_ratio:
+            targets_mask = paddle.cast(target > 0.5, 'float32')
+            weight = ratio2weight(targets_mask, paddle.to_tensor(label_ratio))
+            weight = weight * (target > -1).astype(weight.dtype)
+            cost = cost * weight
+
+        if self.size_sum:
+            cost = cost.sum(1).mean()
+
+        return cost
+
+    def forward(self, x, target):
+        if isinstance(x, dict):
+            x = x["logits"]
+        class_num = x.shape[-1]
+        loss = self._binary_crossentropy(x, target, class_num)
+        loss = loss.mean()
+        return {"MultiLabelLoss": loss}
+
+
+class MultiLabelAsymmetricLoss(nn.Layer):
+    """
+    Multi-label asymmetric loss, introduced by
+    Emanuel Ben-Baruch et al. in https://arxiv.org/pdf/2009.14119v4.pdf.
+ """ + + def __init__(self, + gamma_pos=1, + gamma_neg=4, + clip=0.05, + epsilon=1e-8, + disable_focal_loss_grad=True, + reduction="sum"): + super().__init__() + self.gamma_pos = gamma_pos + self.gamma_neg = gamma_neg + self.clip = clip + self.epsilon = epsilon + self.disable_focal_loss_grad = disable_focal_loss_grad + assert reduction in ["mean", "sum", "none"] + self.reduction = reduction + + def forward(self, x, target): + if isinstance(x, dict): + x = x["logits"] + pred_sigmoid = F.sigmoid(x) + target = target.astype(pred_sigmoid.dtype) + + # Asymmetric Clipping and Basic CE calculation + if self.clip and self.clip > 0: + pt = (1 - pred_sigmoid + self.clip).clip(max=1) \ + * (1 - target) + pred_sigmoid * target + else: + pt = (1 - pred_sigmoid) * (1 - target) + pred_sigmoid * target + + # Asymmetric Focusing + if self.disable_focal_loss_grad: + paddle.set_grad_enabled(False) + asymmetric_weight = ( + 1 - pt + ).pow(self.gamma_pos * target + self.gamma_neg * (1 - target)) + if self.disable_focal_loss_grad: + paddle.set_grad_enabled(True) + + loss = -paddle.log(pt.clip(min=self.epsilon)) * asymmetric_weight + + if self.reduction == 'mean': + loss = loss.mean() + elif self.reduction == 'sum': + loss = loss.sum() + return {"MultiLabelAsymmetricLoss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/npairsloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/npairsloss.py new file mode 100644 index 000000000..131c799a4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/npairsloss.py @@ -0,0 +1,43 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle + + +class NpairsLoss(paddle.nn.Layer): + """Npair_loss_ + paper [Improved deep metric learning with multi-class N-pair loss objective](https://dl.acm.org/doi/10.5555/3157096.3157304) + code reference: https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/contrib/losses/metric_learning/npairs_loss + """ + + def __init__(self, reg_lambda=0.01): + super(NpairsLoss, self).__init__() + self.reg_lambda = reg_lambda + + def forward(self, input, target=None): + """ + anchor and positive(should include label) + """ + features = input["features"] + reg_lambda = self.reg_lambda + batch_size = features.shape[0] + fea_dim = features.shape[1] + num_class = batch_size // 2 + + #reshape + out_feas = paddle.reshape(features, shape=[-1, 2, fea_dim]) + anc_feas, pos_feas = paddle.split(out_feas, num_or_sections=2, axis=1) + anc_feas = paddle.squeeze(anc_feas, axis=1) + pos_feas = paddle.squeeze(pos_feas, axis=1) + + #get simi matrix + similarity_matrix = paddle.matmul( + anc_feas, pos_feas, transpose_y=True) #get similarity matrix + sparse_labels = paddle.arange(0, num_class, dtype='int64') + xentloss = paddle.nn.CrossEntropyLoss()( + similarity_matrix, sparse_labels) #by default: mean + + #l2 norm + reg = paddle.mean(paddle.sum(paddle.square(features), axis=1)) + l2loss = 0.5 * reg_lambda * reg + return {"npairsloss": xentloss + l2loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pairwisecosface.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pairwisecosface.py new file mode 100644 index 000000000..f1fc73024 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pairwisecosface.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class PairwiseCosface(nn.Layer): + """ + paper: Circle Loss: A Unified Perspective of Pair Similarity Optimization + code reference: https://github.com/leoluopy/circle-loss-demonstration/blob/main/circle_loss.py + """ + + def __init__(self, margin, gamma): + super(PairwiseCosface, self).__init__() + self.margin = margin + self.gamma = gamma + + def forward(self, embedding, targets): + if isinstance(embedding, dict): + embedding = embedding['features'] + # Normalize embedding features + embedding = F.normalize(embedding, axis=1) + dist_mat = paddle.matmul(embedding, embedding, transpose_y=True) + + N = dist_mat.shape[0] + is_pos = targets.reshape([N, 1]).expand([N, N]).equal( + paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float32') + is_neg = targets.reshape([N, 1]).expand([N, N]).not_equal( + paddle.t(targets.reshape([N, 1]).expand([N, N]))).astype('float32') + + # Mask scores related to itself + is_pos = is_pos - paddle.eye(N, N) + + s_p = dist_mat * is_pos + s_n = dist_mat * is_neg + + logit_p = -self.gamma * s_p + (-99999999.) * (1 - is_pos) + logit_n = self.gamma * (s_n + self.margin) + (-99999999.) * (1 - is_neg + ) + + loss = F.softplus( + paddle.logsumexp( + logit_p, axis=1) + paddle.logsumexp( + logit_n, axis=1)).mean() + + return {"PairwiseCosface": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pefdloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pefdloss.py new file mode 100644 index 000000000..f16a8d5dc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/pefdloss.py @@ -0,0 +1,83 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
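The PairwiseCosface loss in pairwisecosface.py above builds a cosine-similarity matrix over the batch and applies a softplus(logsumexp(positives) + logsumexp(negatives)) objective. A small sketch; the import path, feature dimension, and margin/gamma values are assumptions for illustration:

import paddle
from ppcls.loss.pairwisecosface import PairwiseCosface  # assumed import path

embeddings = paddle.rand([8, 128])     # one embedding per sample
labels = paddle.randint(0, 4, [8])     # class ids, with repeats inside the batch

loss_fn = PairwiseCosface(margin=0.25, gamma=64)  # illustrative values
out = loss_fn({"features": embeddings}, labels)
print(out["PairwiseCosface"])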
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ppcls.utils.initializer import kaiming_normal_, kaiming_uniform_ + + +class Regressor(nn.Layer): + """Linear regressor""" + + def __init__(self, dim_in=1024, dim_out=1024): + super(Regressor, self).__init__() + self.conv = nn.Linear(dim_in, dim_out) + + def forward(self, x): + x = self.conv(x) + x = F.relu(x) + return x + + +class PEFDLoss(nn.Layer): + """Improved Feature Distillation via Projector Ensemble + Reference: https://arxiv.org/pdf/2210.15274.pdf + Code reference: https://github.com/chenyd7/PEFD + """ + + def __init__(self, + student_channel, + teacher_channel, + num_projectors=3, + mode="flatten"): + super().__init__() + + if num_projectors <= 0: + raise ValueError("Number of projectors must be greater than 0.") + + if mode not in ["flatten", "gap"]: + raise ValueError("Mode must be \"flatten\" or \"gap\".") + + self.mode = mode + self.projectors = nn.LayerList() + + for _ in range(num_projectors): + self.projectors.append(Regressor(student_channel, teacher_channel)) + + def forward(self, student_feature, teacher_feature): + if self.mode == "gap": + student_feature = F.adaptive_avg_pool2d(student_feature, (1, 1)) + teacher_feature = F.adaptive_avg_pool2d(teacher_feature, (1, 1)) + + student_feature = student_feature.flatten(1) + f_t = teacher_feature.flatten(1) + + q = len(self.projectors) + f_s = 0.0 + for i in range(q): + f_s += self.projectors[i](student_feature) + f_s = f_s / q + + # inner product (normalize first and inner product) + normft = f_t.pow(2).sum(1, keepdim=True).pow(1. / 2) + outft = f_t / normft + normfs = f_s.pow(2).sum(1, keepdim=True).pow(1. / 2) + outfs = f_s / normfs + + cos_theta = (outft * outfs).sum(1, keepdim=True) + loss = paddle.mean(1 - cos_theta) + + return loss diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/rkdloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/rkdloss.py new file mode 100644 index 000000000..aa6ae2324 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/rkdloss.py @@ -0,0 +1,99 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
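The PEFDLoss in pefdloss.py above averages several linear projectors applied to the student feature and maximizes its cosine similarity with the teacher feature; mode="gap" pools 4-D feature maps before projection. A sketch with assumed shapes and import path:

import paddle
from ppcls.loss.pefdloss import PEFDLoss  # assumed import path

feat_s = paddle.rand([4, 256, 7, 7])   # student feature map
feat_t = paddle.rand([4, 512, 7, 7])   # teacher feature map

pefd = PEFDLoss(student_channel=256, teacher_channel=512, mode="gap")
loss = pefd(feat_s, feat_t)            # scalar: mean(1 - cosine similarity)
print(float(loss))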
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def pdist(e, squared=False, eps=1e-12): + e_square = e.pow(2).sum(axis=1) + prod = paddle.mm(e, e.t()) + res = (e_square.unsqueeze(1) + e_square.unsqueeze(0) - 2 * prod).clip( + min=eps) + + if not squared: + res = res.sqrt() + return res + + +class RKdAngle(nn.Layer): + # paper : [Relational Knowledge Distillation](https://arxiv.org/abs/1904.05068?context=cs.LG) + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, target_size=None): + super().__init__() + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + # reshape for feature map distillation + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + td = (teacher.unsqueeze(0) - teacher.unsqueeze(1)) + norm_td = F.normalize(td, p=2, axis=2) + t_angle = paddle.bmm(norm_td, norm_td.transpose([0, 2, 1])).reshape( + [-1, 1]) + + sd = (student.unsqueeze(0) - student.unsqueeze(1)) + norm_sd = F.normalize(sd, p=2, axis=2) + s_angle = paddle.bmm(norm_sd, norm_sd.transpose([0, 2, 1])).reshape( + [-1, 1]) + loss = F.smooth_l1_loss(s_angle, t_angle, reduction='mean') + return loss + + +class RkdDistance(nn.Layer): + # paper : [Relational Knowledge Distillation](https://arxiv.org/abs/1904.05068?context=cs.LG) + # reference: https://github.com/lenscloth/RKD/blob/master/metric/loss.py + def __init__(self, eps=1e-12, target_size=1): + super().__init__() + self.eps = eps + if target_size is not None: + self.avgpool = paddle.nn.AdaptiveAvgPool2D(target_size) + else: + self.avgpool = None + + def forward(self, student, teacher): + # GAP to reduce memory + if self.avgpool is not None: + # NxC1xH1xW1 -> NxC1x1x1 + student = self.avgpool(student) + # NxC2xH2xW2 -> NxC2x1x1 + teacher = self.avgpool(teacher) + + bs = student.shape[0] + student = student.reshape([bs, -1]) + teacher = teacher.reshape([bs, -1]) + + t_d = pdist(teacher, squared=False) + mean_td = t_d.mean() + t_d = t_d / (mean_td + self.eps) + + d = pdist(student, squared=False) + mean_d = d.mean() + d = d / (mean_d + self.eps) + + loss = F.smooth_l1_loss(d, t_d, reduction="mean") + return loss diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/skdloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/skdloss.py new file mode 100644 index 000000000..fe8e8e14d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/skdloss.py @@ -0,0 +1,72 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
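The RKdAngle and RkdDistance losses in rkdloss.py above compare the relational structure (pairwise angles and normalized pairwise distances) of student and teacher batches, optionally pooling feature maps first. A sketch, assuming the import path and shapes:

import paddle
from ppcls.loss.rkdloss import RKdAngle, RkdDistance  # assumed import path

feat_s = paddle.rand([8, 64, 7, 7])    # student feature map
feat_t = paddle.rand([8, 128, 7, 7])   # teacher feature map

angle_loss = RKdAngle(target_size=1)(feat_s, feat_t)     # pool to 1x1, compare pairwise angles
dist_loss = RkdDistance(target_size=1)(feat_s, feat_t)   # pool to 1x1, compare pairwise distances
print(float(angle_loss), float(dist_loss))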
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SKDLoss(nn.Layer): + """ + Spherical Knowledge Distillation + paper: https://arxiv.org/pdf/2010.07485.pdf + code reference: https://github.com/forjiuzhou/Spherical-Knowledge-Distillation + """ + + def __init__(self, + temperature, + multiplier=2.0, + alpha=0.9, + use_target_as_gt=False): + super().__init__() + self.temperature = temperature + self.multiplier = multiplier + self.alpha = alpha + self.use_target_as_gt = use_target_as_gt + + def forward(self, logits_student, logits_teacher, target=None): + """Compute Spherical Knowledge Distillation loss. + Args: + logits_student: student's logits with shape (batch_size, num_classes) + logits_teacher: teacher's logits with shape (batch_size, num_classes) + """ + if target is None or self.use_target_as_gt: + target = logits_teacher.argmax(axis=-1) + + target = F.one_hot( + target.reshape([-1]), num_classes=logits_student[0].shape[0]) + + logits_student = F.layer_norm( + logits_student, + logits_student.shape[1:], + weight=None, + bias=None, + epsilon=1e-7) * self.multiplier + logits_teacher = F.layer_norm( + logits_teacher, + logits_teacher.shape[1:], + weight=None, + bias=None, + epsilon=1e-7) * self.multiplier + + kd_loss = -paddle.sum(F.softmax(logits_teacher / self.temperature) * + F.log_softmax(logits_student / self.temperature), + axis=1) + + kd_loss = paddle.mean(kd_loss) * self.temperature**2 + + ce_loss = paddle.mean(-paddle.sum( + target * F.log_softmax(logits_student), axis=1)) + + return kd_loss * self.alpha + ce_loss * (1 - self.alpha) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softsuploss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softsuploss.py new file mode 100644 index 000000000..b1389a0ba --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softsuploss.py @@ -0,0 +1,75 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +class SoftSupConLoss(nn.Layer): + """ + Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + It also supports the unsupervised contrastive loss in SimCLR + """ + def __init__(self, temperature=0.07, contrast_mode='all', base_temperature=0.07): + super(SoftSupConLoss, self).__init__() + self.temperature = temperature + self.contrast_mode = contrast_mode + self.base_temperature = base_temperature + + def __call__(self, feat, batch, max_probs=None, labels=None, mask=None, reduction="mean", select_matrix=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + + Args: + feat: hidden vector of shape [batch_size, n_views, ...]. + labels: ground truth of shape [batch_size]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. Can be asymmetric. + Returns: + A loss scalar. 
+ """ + max_probs = batch['max_probs'] + labels = batch['p_targets_u_w'] + # reduction = batch['reduction'] + batch_size = feat.shape[0] + if labels is not None: + labels = labels.reshape((-1, 1)) + mask = paddle.equal(labels, labels.T).astype('float32') + max_probs = max_probs.reshape((-1, 1)) + score_mask = paddle.matmul(max_probs, max_probs.T) + mask = paddle.multiply(mask, score_mask) + + contrast_count = feat.shape[1] + contrast_feat = paddle.concat(paddle.unbind(feat, axis=1), axis=0) # (2n, d) + if self.contrast_mode == 'all': + anchor_feat = contrast_feat + anchor_count = contrast_count + anchor_dot_contrast = paddle.matmul(anchor_feat, contrast_feat.T) / self.temperature + logits_max = anchor_dot_contrast.max(axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + mask = paddle.concat([mask, mask], axis=0) + mask = paddle.concat([mask, mask], axis=1) + + logits_mask = 1 - paddle.eye(batch_size * contrast_count, dtype=paddle.float64) + mask = mask * logits_mask + exp_logits = paddle.exp(logits) * logits_mask + log_prob = logits - paddle.log(exp_logits.sum(axis=1, keepdim=True)) + + mean_log_prob_pos = (mask * log_prob).sum(axis=1) / mask.sum(axis=1) + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = loss.reshape((anchor_count, batch_size)) + if reduction == 'mean': + loss = loss.mean() + + return {"SoftSupConLoss": loss} \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softtargetceloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softtargetceloss.py new file mode 100644 index 000000000..351db50e3 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/softtargetceloss.py @@ -0,0 +1,16 @@ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class SoftTargetCrossEntropy(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x, target): + loss = paddle.sum(-target * F.log_softmax(x, axis=-1), axis=-1) + loss = loss.mean() + return {"SoftTargetCELoss": loss} + + def __str__(self, ): + return type(self).__name__ diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/supconloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/supconloss.py new file mode 100644 index 000000000..753ceaf41 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/supconloss.py @@ -0,0 +1,109 @@ +import paddle +from paddle import nn + + +class SupConLoss(nn.Layer): + """Supervised Contrastive Learning: https://arxiv.org/pdf/2004.11362.pdf. + code reference: https://github.com/HobbitLong/SupContrast/blob/master/losses.py + It also supports the unsupervised contrastive loss in SimCLR""" + + def __init__(self, + views=16, + temperature=0.07, + contrast_mode='all', + base_temperature=0.07, + normalize_feature=True): + super(SupConLoss, self).__init__() + self.temperature = paddle.to_tensor(temperature) + self.contrast_mode = contrast_mode + self.base_temperature = paddle.to_tensor(base_temperature) + self.num_ids = None + self.views = views + self.normalize_feature = normalize_feature + + def forward(self, features, labels, mask=None): + """Compute loss for model. If both `labels` and `mask` are None, + it degenerates to SimCLR unsupervised loss: + https://arxiv.org/pdf/2002.05709.pdf + Args: + features: hidden vector of shape [bsz, n_views, ...]. + labels: ground truth of shape [bsz]. + mask: contrastive mask of shape [bsz, bsz], mask_{i,j}=1 if sample j + has the same class as sample i. 
Can be asymmetric. + Returns: + A loss scalar. + """ + features = features["features"] + if self.num_ids is None: + self.num_ids = int(features.shape[0] / self.views) + + if self.normalize_feature: + features = 1. * features / (paddle.expand_as( + paddle.norm( + features, p=2, axis=-1, keepdim=True), features) + 1e-12) + features = features.reshape([self.num_ids, self.views, -1]) + labels = labels.reshape([self.num_ids, self.views])[:, 0] + + if len(features.shape) < 3: + raise ValueError('`features` needs to be [bsz, n_views, ...],' + 'at least 3 dimensions are required') + if len(features.shape) > 3: + features = features.reshape( + [features.shape[0], features.shape[1], -1]) + + batch_size = features.shape[0] + if labels is not None and mask is not None: + raise ValueError('Cannot define both `labels` and `mask`') + elif labels is None and mask is None: + mask = paddle.eye(batch_size, dtype='float32') + elif labels is not None: + labels = labels.reshape([-1, 1]) + if labels.shape[0] != batch_size: + raise ValueError( + 'Num of labels does not match num of features') + mask = paddle.cast( + paddle.equal(labels, paddle.t(labels)), 'float32') + else: + mask = paddle.cast(mask, 'float32') + + contrast_count = features.shape[1] + contrast_feature = paddle.concat( + paddle.unbind( + features, axis=1), axis=0) + if self.contrast_mode == 'one': + anchor_feature = features[:, 0] + anchor_count = 1 + elif self.contrast_mode == 'all': + anchor_feature = contrast_feature + anchor_count = contrast_count + else: + raise ValueError('Unknown mode: {}'.format(self.contrast_mode)) + + # compute logits + anchor_dot_contrast = paddle.divide( + paddle.matmul(anchor_feature, paddle.t(contrast_feature)), + self.temperature) + # for numerical stability + logits_max = paddle.max(anchor_dot_contrast, axis=1, keepdim=True) + logits = anchor_dot_contrast - logits_max.detach() + + # tile mask + mask = paddle.tile(mask, [anchor_count, contrast_count]) + + logits_mask = 1 - paddle.eye(batch_size * anchor_count) + mask = mask * logits_mask + + # compute log_prob + exp_logits = paddle.exp(logits) * logits_mask + log_prob = logits - paddle.log( + paddle.sum(exp_logits, axis=1, keepdim=True)) + + # compute mean of log-likelihood over positive + mean_log_prob_pos = paddle.sum((mask * log_prob), + axis=1) / paddle.sum(mask, axis=1) + + # loss + loss = -(self.temperature / self.base_temperature) * mean_log_prob_pos + loss = paddle.mean(loss.reshape([anchor_count, batch_size])) + + return {"SupConLoss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/trihardloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/trihardloss.py new file mode 100644 index 000000000..96cb42cb4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/trihardloss.py @@ -0,0 +1,84 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
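A minimal usage sketch for the SupConLoss defined above. The shapes, the two-view batch layout, and the variable names are illustrative assumptions, not part of this patch:

    import paddle

    # hypothetical batch: 4 identities x 2 views each, 128-d embeddings,
    # ordered as (id0 view0, id0 view1, id1 view0, id1 view1, ...)
    feats = paddle.randn([8, 128])
    labels = paddle.to_tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype='int64')

    loss_fn = SupConLoss(views=2, temperature=0.07, normalize_feature=True)
    out = loss_fn({"features": feats}, labels)  # -> {"SupConLoss": scalar tensor}

The consecutive-view ordering matters because forward() reshapes the features to [num_ids, views, dim] and keeps only the first view's label per identity.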
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from .comfunc import rerange_index + + +class TriHardLoss(paddle.nn.Layer): + """ + paper: In Defense of the Triplet Loss for Person Re-Identification + code reference: https://github.com/VisualComputingInstitute/triplet-reid/blob/master/loss.py + TriHard Loss, based on triplet loss. USE P * K samples. + the batch size is fixed. Batch_size = P * K; but the K may vary between batches. + same label gather together + + supported_metrics = [ + 'euclidean', + 'sqeuclidean', + 'cityblock', + ] + only consider samples_each_class = 2 + """ + + def __init__(self, batch_size=120, samples_each_class=2, margin=0.1): + super(TriHardLoss, self).__init__() + self.margin = margin + self.samples_each_class = samples_each_class + self.batch_size = batch_size + self.rerange_index = rerange_index(batch_size, samples_each_class) + + def forward(self, input, target=None): + features = input["features"] + assert (self.batch_size == features.shape[0]) + + #normalization + features = self._nomalize(features) + samples_each_class = self.samples_each_class + rerange_index = paddle.to_tensor(self.rerange_index) + + #calc sm + diffs = paddle.unsqueeze( + features, axis=1) - paddle.unsqueeze( + features, axis=0) + similary_matrix = paddle.sum(paddle.square(diffs), axis=-1) + + #rerange + tmp = paddle.reshape(similary_matrix, shape=[-1, 1]) + tmp = paddle.gather(tmp, index=rerange_index) + similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size]) + + #split + ignore, pos, neg = paddle.split( + similary_matrix, + num_or_sections=[1, samples_each_class - 1, -1], + axis=1) + + ignore.stop_gradient = True + hard_pos = paddle.max(pos, axis=1) + hard_neg = paddle.min(neg, axis=1) + + loss = hard_pos + self.margin - hard_neg + loss = paddle.nn.ReLU()(loss) + loss = paddle.mean(loss) + return {"trihardloss": loss} + + def _nomalize(self, input): + input_norm = paddle.sqrt( + paddle.sum(paddle.square(input), axis=1, keepdim=True)) + return paddle.divide(input, input_norm) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/triplet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/triplet.py new file mode 100644 index 000000000..0da7cc5df --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/triplet.py @@ -0,0 +1,157 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn + + +class TripletLossV2(nn.Layer): + """Triplet loss with hard positive/negative mining. + paper : [Facenet: A unified embedding for face recognition and clustering](https://arxiv.org/pdf/1503.03832.pdf) + code reference: https://github.com/okzhili/Cartoon-face-recognition/blob/master/loss/triplet_loss.py + Args: + margin (float): margin for triplet. 
+ """ + + def __init__(self, + margin=0.5, + normalize_feature=True, + feature_from="features"): + super(TripletLossV2, self).__init__() + self.margin = margin + self.feature_from = feature_from + self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin) + self.normalize_feature = normalize_feature + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (num_classes) + """ + inputs = input[self.feature_from] + + if self.normalize_feature: + inputs = 1. * inputs / (paddle.expand_as( + paddle.norm( + inputs, p=2, axis=-1, keepdim=True), inputs) + 1e-12) + + bs = inputs.shape[0] + + # compute distance + dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs]) + dist = dist + dist.t() + dist = paddle.addmm( + input=dist, x=inputs, y=inputs.t(), alpha=-2.0, beta=1.0) + dist = paddle.clip(dist, min=1e-12).sqrt() + + # hard negative mining + is_pos = paddle.expand(target, ( + bs, bs)).equal(paddle.expand(target, (bs, bs)).t()) + is_neg = paddle.expand(target, ( + bs, bs)).not_equal(paddle.expand(target, (bs, bs)).t()) + + # `dist_ap` means distance(anchor, positive) + ## both `dist_ap` and `relative_p_inds` with shape [N, 1] + ''' + dist_ap, relative_p_inds = paddle.max( + paddle.reshape(dist[is_pos], (bs, -1)), axis=1, keepdim=True) + # `dist_an` means distance(anchor, negative) + # both `dist_an` and `relative_n_inds` with shape [N, 1] + dist_an, relative_n_inds = paddle.min( + paddle.reshape(dist[is_neg], (bs, -1)), axis=1, keepdim=True) + ''' + dist_ap = paddle.max(paddle.reshape( + paddle.masked_select(dist, is_pos), (bs, -1)), + axis=1, + keepdim=True) + # `dist_an` means distance(anchor, negative) + # both `dist_an` and `relative_n_inds` with shape [N, 1] + dist_an = paddle.min(paddle.reshape( + paddle.masked_select(dist, is_neg), (bs, -1)), + axis=1, + keepdim=True) + # shape [N] + dist_ap = paddle.squeeze(dist_ap, axis=1) + dist_an = paddle.squeeze(dist_an, axis=1) + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_an, dist_ap, y) + return {"TripletLossV2": loss} + + +class TripletLoss(nn.Layer): + """Triplet loss with hard positive/negative mining. + Reference: + Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737. + Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py. + Args: + margin (float): margin for triplet. 
+ """ + + def __init__(self, margin=1.0): + super(TripletLoss, self).__init__() + self.margin = margin + self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin) + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (num_classes) + """ + inputs = input["features"] + + bs = inputs.shape[0] + # Compute pairwise distance, replace by the official when merged + dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs]) + dist = dist + dist.t() + dist = paddle.addmm( + input=dist, x=inputs, y=inputs.t(), alpha=-2.0, beta=1.0) + dist = paddle.clip(dist, min=1e-12).sqrt() + + mask = paddle.equal( + target.expand([bs, bs]), target.expand([bs, bs]).t()) + mask_numpy_idx = mask.numpy() + dist_ap, dist_an = [], [] + for i in range(bs): + # dist_ap_i = paddle.to_tensor(dist[i].numpy()[mask_numpy_idx[i]].max(),dtype='float64').unsqueeze(0) + # dist_ap_i.stop_gradient = False + # dist_ap.append(dist_ap_i) + dist_ap.append( + max([ + dist[i][j] if mask_numpy_idx[i][j] == True else float( + "-inf") for j in range(bs) + ]).unsqueeze(0)) + # dist_an_i = paddle.to_tensor(dist[i].numpy()[mask_numpy_idx[i] == False].min(), dtype='float64').unsqueeze(0) + # dist_an_i.stop_gradient = False + # dist_an.append(dist_an_i) + dist_an.append( + min([ + dist[i][k] if mask_numpy_idx[i][k] == False else float( + "inf") for k in range(bs) + ]).unsqueeze(0)) + + dist_ap = paddle.concat(dist_ap, axis=0) + dist_an = paddle.concat(dist_an, axis=0) + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_an, dist_ap, y) + return {"TripletLoss": loss} diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/tripletangularmarginloss.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/tripletangularmarginloss.py new file mode 100644 index 000000000..df03489f2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/tripletangularmarginloss.py @@ -0,0 +1,241 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +from ppcls.loss.xbm import CrossBatchMemory + + +class TripletAngularMarginLoss(nn.Layer): + """A more robust triplet loss with hard positive/negative mining on angular margin instead of relative distance between d(a,p) and d(a,n). + + Args: + margin (float, optional): angular margin. Defaults to 0.5. + normalize_feature (bool, optional): whether to apply L2-norm in feature before computing distance(cos-similarity). Defaults to True. + reduction (str, optional): reducing option within an batch . Defaults to "mean". + add_absolute (bool, optional): whether add absolute loss within d(a,p) or d(a,n). Defaults to False. + absolute_loss_weight (float, optional): weight for absolute loss. Defaults to 1.0. 
+ ap_value (float, optional): weight for d(a, p). Defaults to 0.9. + an_value (float, optional): weight for d(a, n). Defaults to 0.5. + feature_from (str, optional): which key feature from. Defaults to "features". + """ + + def __init__(self, + margin=0.5, + normalize_feature=True, + reduction="mean", + add_absolute=False, + absolute_loss_weight=1.0, + ap_value=0.9, + an_value=0.5, + feature_from="features"): + super(TripletAngularMarginLoss, self).__init__() + self.margin = margin + self.feature_from = feature_from + self.ranking_loss = paddle.nn.loss.MarginRankingLoss( + margin=margin, reduction=reduction) + self.normalize_feature = normalize_feature + self.add_absolute = add_absolute + self.ap_value = ap_value + self.an_value = an_value + self.absolute_loss_weight = absolute_loss_weight + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (batch_size) + """ + inputs = input[self.feature_from] + + if self.normalize_feature: + inputs = paddle.divide( + inputs, paddle.norm( + inputs, p=2, axis=-1, keepdim=True)) + + bs = inputs.shape[0] + + # compute distance(cos-similarity) + dist = paddle.matmul(inputs, inputs.t()) + + # hard negative mining + is_pos = paddle.expand(target, ( + bs, bs)).equal(paddle.expand(target, (bs, bs)).t()) + is_neg = paddle.expand(target, ( + bs, bs)).not_equal(paddle.expand(target, (bs, bs)).t()) + + # `dist_ap` means distance(anchor, positive) + # both `dist_ap` and `relative_p_inds` with shape [N, 1] + dist_ap = paddle.min(paddle.reshape( + paddle.masked_select(dist, is_pos), (bs, -1)), + axis=1, + keepdim=True) + # `dist_an` means distance(anchor, negative) + # both `dist_an` and `relative_n_inds` with shape [N, 1] + dist_an = paddle.max(paddle.reshape( + paddle.masked_select(dist, is_neg), (bs, -1)), + axis=1, + keepdim=True) + # shape [N] + dist_ap = paddle.squeeze(dist_ap, axis=1) + dist_an = paddle.squeeze(dist_an, axis=1) + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_ap, dist_an, y) + + if self.add_absolute: + absolut_loss_ap = self.ap_value - dist_ap + absolut_loss_ap = paddle.where(absolut_loss_ap > 0, + absolut_loss_ap, + paddle.zeros_like(absolut_loss_ap)) + + absolut_loss_an = dist_an - self.an_value + absolut_loss_an = paddle.where(absolut_loss_an > 0, + absolut_loss_an, + paddle.ones_like(absolut_loss_an)) + + loss = (absolut_loss_an.mean() + absolut_loss_ap.mean() + ) * self.absolute_loss_weight + loss.mean() + + return {"TripletAngularMarginLoss": loss} + + +class TripletAngularMarginLoss_XBM(TripletAngularMarginLoss): + """TripletAngularMarginLoss combined with CrossBatchMemory + + Args: + start_iter: (int): from which step CrossBatchMemory is enabled + xbm_size: (int): Size of CrossBatchMemory + xbm_weight: (float): Weight of CrossBatchMemory loss + feat_dim: (int): Channels of features in CrossBatchMemory + margin (float, optional): angular margin. Defaults to 0.5. + normalize_feature (bool, optional): whether to apply L2-norm in feature before computing distance(cos-similarity). Defaults to True. + reduction (str, optional): reducing option within an batch . Defaults to "mean". + add_absolute (bool, optional): whether add absolute loss within d(a,p) or d(a,n). Defaults to False. + absolute_loss_weight (float, optional): weight for absolute loss. Defaults to 1.0. + ap_value (float, optional): weight for d(a, p). Defaults to 0.9. + an_value (float, optional): weight for d(a, n). Defaults to 0.5. 
+ feature_from (str, optional): which key feature from. Defaults to "features". + """ + + def __init__(self, + start_iter: int, + xbm_size: int, + xbm_weight: float, + feat_dim: int, + margin=0.5, + normalize_feature=True, + reduction="mean", + add_absolute=False, + absolute_loss_weight=1.0, + ap_value=0.9, + an_value=0.5, + feature_from="features"): + super(TripletAngularMarginLoss_XBM, self).__init__( + margin, normalize_feature, reduction, add_absolute, + absolute_loss_weight, ap_value, an_value, feature_from) + self.start_iter = start_iter + self.xbm = CrossBatchMemory(xbm_size, feat_dim) + self.xbm_weight = xbm_weight + self.inf = 10 # 10 is big enough as inf for cos-similarity + self.register_buffer("iter", paddle.to_tensor(0, dtype="int64")) + + def forward(self, input, target): + """ + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + target: ground truth labels with shape (batch_size) + """ + feats = input[self.feature_from] + if self.normalize_feature: + feats = nn.functional.normalize(feats, p=2, axis=1) + + labels = target + if labels.ndim >= 2 and labels.shape[-1] == 1: + labels = paddle.squeeze(labels, axis=[-1]) + + loss = self._compute_loss(feats, labels, feats, labels) + + # XBM loss below + self.iter += 1 + if self.iter.item() > self.start_iter: + self.xbm.enqueue_dequeue(feats.detach(), labels.detach()) + xbm_feats, xbm_labels = self.xbm.get() + xbm_loss = self._compute_loss(feats, labels, xbm_feats, xbm_labels) + loss = loss + self.xbm_weight * xbm_loss + + return {"TripletAngularMarginLoss_XBM": loss} + + def _masked_max(self, tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + neg_inf = paddle.zeros_like(tensor) + neg_inf.stop_gradient = True + neg_inf[paddle.logical_not(mask)] = -self.inf + return paddle.max(masked + neg_inf, axis=axis, keepdim=True) + + def _masked_min(self, tensor, mask, axis): + masked = paddle.multiply(tensor, mask.astype(tensor.dtype)) + pos_inf = paddle.zeros_like(tensor) + pos_inf.stop_gradient = True + pos_inf[paddle.logical_not(mask)] = self.inf + return paddle.min(masked + pos_inf, axis=axis, keepdim=True) + + def _compute_loss(self, + inputs_q: paddle.Tensor, + targets_q: paddle.Tensor, + inputs_k: paddle.Tensor, + targets_k: paddle.Tensor) -> paddle.Tensor: + Q = inputs_q.shape[0] + K = inputs_k.shape[0] + + # compute distance(cos-similarity) + dist = paddle.matmul(inputs_q, inputs_k.t()) # [Q, K] + + # hard negative mining + is_pos = paddle.expand(paddle.unsqueeze(targets_q, 1), (Q, K)).equal( + paddle.expand(paddle.unsqueeze(targets_k, 1), + (K, Q)).t()) # [Q, K] + is_neg = paddle.expand(paddle.unsqueeze(targets_q, 1), + (Q, K)).not_equal( + paddle.expand( + paddle.unsqueeze(targets_k, 1), + (K, Q)).t()) # [Q, K] + + dist_ap = self._masked_min(dist, is_pos, axis=1) # [Q, ] + dist_an = self._masked_max(dist, is_neg, axis=1) # [Q, ] + + # Compute ranking hinge loss + y = paddle.ones_like(dist_an) + loss = self.ranking_loss(dist_ap, dist_an, y) + + if self.add_absolute: + absolut_loss_ap = self.ap_value - dist_ap + absolut_loss_ap = paddle.where(absolut_loss_ap > 0, + absolut_loss_ap, + paddle.zeros_like(absolut_loss_ap)) + + absolut_loss_an = dist_an - self.an_value + absolut_loss_an = paddle.where(absolut_loss_an > 0, + absolut_loss_an, + paddle.ones_like(absolut_loss_an)) + + loss = (absolut_loss_an.mean() + absolut_loss_ap.mean() + ) * self.absolute_loss_weight + loss.mean() + + return loss diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/wslloss.py 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/wslloss.py new file mode 100644 index 000000000..8bdfaf8cc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/wslloss.py @@ -0,0 +1,66 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class WSLLoss(nn.Layer): + """ + Weighted Soft Labels Loss + paper: https://arxiv.org/pdf/2102.00650.pdf + code reference: https://github.com/bellymonster/Weighted-Soft-Label-Distillation + """ + + def __init__(self, temperature=2.0, use_target_as_gt=False): + super().__init__() + self.temperature = temperature + self.use_target_as_gt = use_target_as_gt + + def forward(self, logits_student, logits_teacher, target=None): + """Compute weighted soft labels loss. + Args: + logits_student: student's logits with shape (batch_size, num_classes) + logits_teacher: teacher's logits with shape (batch_size, num_classes) + target: ground truth labels with shape (batch_size) + """ + if target is None or self.use_target_as_gt: + target = logits_teacher.argmax(axis=-1) + + target = F.one_hot( + target.reshape([-1]), num_classes=logits_student[0].shape[0]) + + s_input_for_softmax = logits_student / self.temperature + t_input_for_softmax = logits_teacher / self.temperature + + ce_loss_s = -paddle.sum(target * + F.log_softmax(logits_student.detach()), + axis=1) + ce_loss_t = -paddle.sum(target * + F.log_softmax(logits_teacher.detach()), + axis=1) + + ratio = ce_loss_s / (ce_loss_t + 1e-7) + ratio = paddle.maximum(ratio, paddle.zeros_like(ratio)) + + kd_loss = -paddle.sum(F.softmax(t_input_for_softmax) * + F.log_softmax(s_input_for_softmax), + axis=1) + weight = 1 - paddle.exp(-ratio) + + weighted_kd_loss = (self.temperature**2) * paddle.mean(kd_loss * + weight) + + return weighted_kd_loss diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/xbm.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/xbm.py new file mode 100644 index 000000000..d63583e44 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/loss/xbm.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
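A short, hedged sketch of how the WSLLoss above would be called; the batch size, class count, and tensor names are invented for illustration:

    import paddle

    # hypothetical: 4 samples, 10 classes
    logits_student = paddle.randn([4, 10])
    logits_teacher = paddle.randn([4, 10])
    target = paddle.randint(0, 10, [4])  # ground-truth class indices

    loss_fn = WSLLoss(temperature=2.0)
    loss = loss_fn(logits_student, logits_teacher, target)  # scalar KD loss

Per sample, the KD term is scaled by 1 - exp(-ce_student / ce_teacher), so examples the student already fits well contribute less, which is the weighting idea described in the docstring.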
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Tuple + +import paddle + + +class CrossBatchMemory(paddle.nn.Layer): + """ + CrossBatchMemory Implementation. refer to "Cross-Batch Memory for Embedding Learning". + + code heavily based on https://github.com/msight-tech/research-xbm/blob/master/ret_benchmark/modeling/xbm.py + + Args: + size (int): Size of memory bank + embedding_size (int): number of embedding dimension for memory bank + """ + + def __init__(self, size: int, embedding_size: int): + super().__init__() + self.size = size + self.embedding_size = embedding_size + + # initialize and register feature queue for resume training + feats = paddle.zeros([self.size, self.embedding_size]) + self.register_buffer("feats", feats) + + # initialize and register label queue for resume training + targets = paddle.zeros([self.size, ], dtype="int64") + self.register_buffer("targets", targets) + + self.ptr = 0 + # self.accumulated_size = 0 + + @property + def _is_full(self) -> bool: + # return self.accumulated_size >= self.size + return self.targets[-1].item() != 0 # author's usage + + def get(self) -> Tuple[paddle.Tensor, paddle.Tensor]: + """return features and targets in memory bank + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: [features, targets] + """ + if self._is_full: + return self.feats, self.targets + else: + return self.feats[:self.ptr], self.targets[:self.ptr] + + def enqueue_dequeue(self, feats: paddle.Tensor, + targets: paddle.Tensor) -> None: + """put newest feats and targets into memory bank and pop oldest feats and targets from momory bank + + Args: + feats (paddle.Tensor): features to enque + targets (paddle.Tensor): targets to enque + """ + input_size = len(targets) + if self.ptr + input_size > self.size: + self.feats[-input_size:] = feats + self.targets[-input_size:] = targets + self.ptr = 0 + else: + self.feats[self.ptr:self.ptr + input_size] = feats + self.targets[self.ptr:self.ptr + input_size] = targets + self.ptr += input_size + # self.accumulated_size += input_size + + def forward(self, *kargs, **kwargs): + raise NotImplementedError( + "CrossBatchMemory module is for memory-bank, forward method is not needed" + ) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/__init__.py new file mode 100644 index 000000000..fc720b349 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/__init__.py @@ -0,0 +1,72 @@ +#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
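A rough sketch of the enqueue/query cycle the CrossBatchMemory above is meant for; the bank size, tensor shapes, and names are illustrative assumptions:

    import paddle

    xbm = CrossBatchMemory(size=1024, embedding_size=128)  # hypothetical bank size

    feats = paddle.randn([32, 128])        # embeddings from the current step
    labels = paddle.randint(0, 100, [32])  # matching labels

    xbm.enqueue_dequeue(feats.detach(), labels)
    bank_feats, bank_labels = xbm.get()    # partial bank until full, then the whole ring

TripletAngularMarginLoss_XBM above follows this pattern: once training passes start_iter, it enqueues the current batch and adds a loss term computed against the returned bank.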
+ +import copy +from collections import OrderedDict + +from .avg_metrics import AvgMetrics +from .metrics import TopkAcc, mAP, mINP, Recallk, Precisionk +from .metrics import DistillationTopkAcc +from .metrics import GoogLeNetTopkAcc +from .metrics import HammingDistance, AccuracyScore +from .metrics import ATTRMetric +from .metrics import TprAtFpr, MultilabelMeanAccuracy +from .metrics import MultiLabelMAP +from .face_metrics import FaceAccuracy, FaceAccOnFiveDatasets + + +class CombinedMetrics(AvgMetrics): + def __init__(self, config_list): + super().__init__() + self.metric_func_list = [] + assert isinstance(config_list, list), ( + 'operator config should be a list') + for config in config_list: + assert isinstance(config, + dict) and len(config) == 1, "yaml format error" + metric_name = list(config)[0] + metric_params = config[metric_name] + if metric_params is not None: + self.metric_func_list.append( + eval(metric_name)(**metric_params)) + else: + self.metric_func_list.append(eval(metric_name)()) + self.reset() + + def forward(self, *args, **kwargs): + metric_dict = OrderedDict() + for idx, metric_func in enumerate(self.metric_func_list): + metric_dict.update(metric_func(*args, **kwargs)) + return metric_dict + + @property + def avg_info(self): + return ", ".join([metric.avg_info for metric in self.metric_func_list]) + + @property + def avg(self): + return self.metric_func_list[0].avg + + def attr_res(self): + return self.metric_func_list[0].attrmeter.res() + + def reset(self): + for metric in self.metric_func_list: + if hasattr(metric, "reset"): + metric.reset() + + +def build_metrics(config): + metrics_list = CombinedMetrics(copy.deepcopy(config)) + return metrics_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/avg_metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/avg_metrics.py new file mode 100644 index 000000000..6f4b62290 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/avg_metrics.py @@ -0,0 +1,20 @@ +from paddle import nn + + +class AvgMetrics(nn.Layer): + def __init__(self): + super().__init__() + self.avg_meters = {} + + def reset(self): + self.avg_meters = {} + + @property + def avg(self): + if self.avg_meters: + for metric_key in self.avg_meters: + return self.avg_meters[metric_key].avg + + @property + def avg_info(self): + return ", ".join([self.avg_meters[key].avg_info for key in self.avg_meters]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/face_metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/face_metrics.py new file mode 100644 index 000000000..cad9553c8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/face_metrics.py @@ -0,0 +1,201 @@ +# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
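To illustrate the config format CombinedMetrics expects, here is a hedged sketch; each list entry is a single-key dict mapping a metric class name to its keyword arguments (or None for defaults), and the values below are made up:

    metric_config = [
        {"TopkAcc": {"topk": [1, 5]}},
    ]
    metrics = build_metrics(metric_config)

    # model_output and labels are placeholders for a logits tensor and a label tensor
    metric_dict = metrics(model_output, labels)  # e.g. {"top1": ..., "top5": ...}
    print(metrics.avg_info)                      # running averages across calls

Because the metric name is resolved with eval(), it must match one of the classes imported at the top of this file.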
+ +from cmath import nan +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from sklearn.model_selection import KFold +from sklearn.decomposition import PCA +from sklearn.preprocessing import normalize + +from ppcls.utils import logger + + +class FaceAccuracy(nn.Layer): + """ + This code is modified from https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/eval/verification.py + """ + def __init__(self): + super().__init__() + self.embedding_left_list = [] + self.embedding_right_list = [] + self.label_list = [] + self.best_acc = 0. + + def forward(self, embeddings_left, embeddings_right, labels, *args): + assert len(embeddings_left) == len(embeddings_right) == len(labels) + self.embedding_left_list.append(normalize(embeddings_left.numpy())) + self.embedding_right_list.append(normalize(embeddings_right.numpy())) + self.label_list.append(labels.numpy()) + + return {} + + def reset(self): + self.embedding_left_list = [] + self.embedding_right_list = [] + self.label_list = [] + self.best_acc = 0. + + @property + def avg(self): + return self.best_acc + + @property + def avg_info(self): + embeddings_left = np.concatenate(self.embedding_left_list) + embeddings_right = np.concatenate(self.embedding_right_list) + labels = np.concatenate(self.label_list) + num_samples = len(embeddings_left) + + thresholds = np.arange(0, 4, 0.01) + _, _, accuracy, best_thresholds = self.calculate_roc(thresholds, + embeddings_left, + embeddings_right, + labels) + self.best_acc = accuracy.mean() + return "best_threshold: {:.4f}, acc: {:.4f}, num_samples: {}".format( + best_thresholds.mean(), accuracy.mean(), num_samples) + + @staticmethod + def calculate_roc(thresholds, + embeddings1, + embeddings2, + actual_issame, + nrof_folds=10, + pca=0): + assert (embeddings1.shape[0] == embeddings2.shape[0]) + assert (embeddings1.shape[1] == embeddings2.shape[1]) + nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) + nrof_thresholds = len(thresholds) + k_fold = KFold(n_splits=nrof_folds, shuffle=False) + + tprs = np.zeros((nrof_folds, nrof_thresholds)) + fprs = np.zeros((nrof_folds, nrof_thresholds)) + accuracy = np.zeros((nrof_folds)) + best_thresholds = np.zeros((nrof_folds)) + indices = np.arange(nrof_pairs) + # print('pca', pca) + dist = None + + if pca == 0: + diff = np.subtract(embeddings1, embeddings2) + dist = np.sum(np.square(diff), 1) + + for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): + if pca > 0: + print('doing pca on', fold_idx) + embed1_train = embeddings1[train_set] + embed2_train = embeddings2[train_set] + _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) + pca_model = PCA(n_components=pca) + pca_model.fit(_embed_train) + embed1 = pca_model.transform(embeddings1) + embed2 = pca_model.transform(embeddings2) + embed1 = normalize(embed1) + embed2 = normalize(embed2) + diff = np.subtract(embed1, embed2) + dist = np.sum(np.square(diff), 1) + + # Find the best threshold for the fold + acc_train = np.zeros((nrof_thresholds)) + for threshold_idx, threshold in enumerate(thresholds): + _, _, acc_train[threshold_idx] = FaceAccuracy.calculate_accuracy( + threshold, dist[train_set], actual_issame[train_set]) + best_threshold_index = np.argmax(acc_train) + best_thresholds[fold_idx] = thresholds[best_threshold_index] + for threshold_idx, threshold in enumerate(thresholds): + tprs[fold_idx, threshold_idx], fprs[ + fold_idx, threshold_idx], _ = FaceAccuracy.calculate_accuracy( + threshold, dist[test_set], 
actual_issame[test_set]) + _, _, accuracy[fold_idx] = FaceAccuracy.calculate_accuracy( + thresholds[best_threshold_index], dist[test_set], + actual_issame[test_set]) + + tpr = np.mean(tprs, 0) + fpr = np.mean(fprs, 0) + return tpr, fpr, accuracy, best_thresholds + + + @staticmethod + def calculate_accuracy(threshold, dist, actual_issame): + predict_issame = np.less(dist, threshold) + tp = np.sum(np.logical_and(predict_issame, actual_issame)) + fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) + tn = np.sum( + np.logical_and( + np.logical_not(predict_issame), np.logical_not(actual_issame))) + fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) + + tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) + fpr = 0 if (fp + tn == 0) else float(fp) / float(fp + tn) + acc = float(tp + tn) / dist.size + return tpr, fpr, acc + + +class FaceAccOnFiveDatasets(FaceAccuracy): + dataname_to_idx = { + "agedb_30": 0, + "cfp_fp": 1, + "lfw": 2, + "cplfw": 3, + "calfw": 4 + } + idx_to_dataname = {v: k for k, v in dataname_to_idx.items()} + + def __init__(self): + super().__init__() + self.dataname_idx_list = [] + + def forward(self, embeddings_left, embeddings_right, labels, + dataname_idxs, *args): + assert len(embeddings_left) == len(dataname_idxs) + dataname_idxs = dataname_idxs.astype('int64').numpy() + self.dataname_idx_list.append(dataname_idxs) + + return super().forward(embeddings_left, embeddings_right, labels) + + def reset(self): + super().reset() + self.dataname_idx_list = [] + + @property + def avg_info(self): + results = {} + all_embeddings_left = np.concatenate(self.embedding_left_list) + all_embeddings_right = np.concatenate(self.embedding_right_list) + all_labels = np.concatenate(self.label_list) + dataname_idxs = np.concatenate(self.dataname_idx_list) + + acc = [] + for dataname_idx in np.unique(dataname_idxs): + dataname = self.idx_to_dataname[dataname_idx] + mask = dataname_idxs == dataname_idx + embeddings_left = all_embeddings_left[mask] + embeddings_right = all_embeddings_right[mask] + labels = all_labels[mask] + + thresholds = np.arange(0, 4, 0.01) + _, _, accuracy, best_thresholds = self.calculate_roc( + thresholds, embeddings_left, embeddings_right, labels) + acc.append(accuracy.mean()) + results[f'{dataname}-best_threshold'] = f'{best_thresholds.mean():.4f}' + results[f'{dataname}-acc'] = f'{accuracy.mean():.4f}' + results[f'{dataname}-num_samples'] = f'{len(embeddings_left)}' + self.best_acc = np.mean(acc) + results['avg_acc'] = f'{self.best_acc:.4f}' + + info = ", ".join([f"{k}: {v}" for k, v in results.items()]) + return info diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/metrics.py new file mode 100644 index 000000000..e39aafbe1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/metric/metrics.py @@ -0,0 +1,661 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from cmath import nan +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.preprocessing import binarize + +from easydict import EasyDict + +from ppcls.metric.avg_metrics import AvgMetrics +from ppcls.utils.misc import AverageMeter, AttrMeter +from ppcls.utils import logger + + +class TopkAcc(AvgMetrics): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + self.reset() + self.warned = False + + def reset(self): + self.avg_meters = { + f"top{k}": AverageMeter(f"top{k}") + for k in self.topk + } + + def forward(self, x, label): + if isinstance(x, dict): + x = x["logits"] + + output_dims = x.shape[-1] + + metric_dict = dict() + for idx, k in enumerate(self.topk): + if output_dims < k: + if not self.warned: + msg = f"The output dims({output_dims}) is less than k({k}), so the Top-{k} metric is meaningless." + logger.warning(msg) + self.warned = True + metric_dict[f"top{k}"] = 1 + else: + metric_dict[f"top{k}"] = paddle.metric.accuracy(x, label, k=k) + self.avg_meters[f"top{k}"].update(metric_dict[f"top{k}"], + x.shape[0]) + return metric_dict + + +class mAP(nn.Layer): + def __init__(self, descending=True): + super().__init__() + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + num_rel = paddle.sum(equal_flag, axis=1) + num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.)) + num_rel_index = paddle.nonzero(num_rel.astype("int")) + num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]]) + + if paddle.numel(num_rel_index).item() == 0: + metric_dict["mAP"] = np.nan + return metric_dict + + equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + div = paddle.arange(acc_sum.shape[1]).astype("float32") + 1 + precision = paddle.divide(acc_sum, div) + + #calc map + precision_mask = paddle.multiply(equal_flag, precision) + ap = paddle.sum(precision_mask, axis=1) / paddle.sum(equal_flag, + axis=1) + metric_dict["mAP"] = float(paddle.mean(ap)) + return metric_dict + + +class mINP(nn.Layer): + def __init__(self, descending=True): + super().__init__() + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = 
paddle.transpose(gallery_img_id, [1, 0])
+        gallery_labels_transpose = paddle.broadcast_to(
+            gallery_labels_transpose,
+            shape=[
+                choosen_indices.shape[0], gallery_labels_transpose.shape[1]
+            ])
+        choosen_label = paddle.index_sample(gallery_labels_transpose,
+                                            choosen_indices)
+        equal_flag = paddle.equal(choosen_label, query_img_id)
+        if keep_mask is not None:
+            keep_mask = paddle.index_sample(
+                keep_mask.astype('float32'), choosen_indices)
+            equal_flag = paddle.logical_and(equal_flag,
+                                            keep_mask.astype('bool'))
+        equal_flag = paddle.cast(equal_flag, 'float32')
+
+        num_rel = paddle.sum(equal_flag, axis=1)
+        num_rel = paddle.greater_than(num_rel, paddle.to_tensor(0.))
+        num_rel_index = paddle.nonzero(num_rel.astype("int"))
+        num_rel_index = paddle.reshape(num_rel_index, [num_rel_index.shape[0]])
+        equal_flag = paddle.index_select(equal_flag, num_rel_index, axis=0)
+
+        #do accumulative sum
+        div = paddle.arange(equal_flag.shape[1]).astype("float32") + 2
+        minus = paddle.divide(equal_flag, div)
+        auxilary = paddle.subtract(equal_flag, minus)
+        hard_index = paddle.argmax(auxilary, axis=1).astype("float32")
+        all_INP = paddle.divide(paddle.sum(equal_flag, axis=1), hard_index)
+        mINP = paddle.mean(all_INP)
+        metric_dict["mINP"] = float(mINP)
+        return metric_dict
+
+
+class TprAtFpr(nn.Layer):
+    def __init__(self, max_fpr=1 / 1000.):
+        super().__init__()
+        self.gt_pos_score_list = []
+        self.gt_neg_score_list = []
+        self.softmax = nn.Softmax(axis=-1)
+        self.max_fpr = max_fpr
+        self.max_tpr = 0.
+
+    def forward(self, x, label):
+        if isinstance(x, dict):
+            x = x["logits"]
+        x = self.softmax(x)
+        for i, label_i in enumerate(label):
+            if label_i[0] == 0:
+                self.gt_neg_score_list.append(x[i][1].numpy())
+            else:
+                self.gt_pos_score_list.append(x[i][1].numpy())
+        return {}
+
+    def reset(self):
+        self.gt_pos_score_list = []
+        self.gt_neg_score_list = []
+        self.max_tpr = 0.
+
+    @property
+    def avg(self):
+        return self.max_tpr
+
+    @property
+    def avg_info(self):
+        max_tpr = 0.
+        result = ""
+        gt_pos_score_list = np.array(self.gt_pos_score_list)
+        gt_neg_score_list = np.array(self.gt_neg_score_list)
+        for i in range(0, 10000):
+            threshold = i / 10000.
+            if len(gt_pos_score_list) == 0:
+                continue
+            tpr = np.sum(
+                gt_pos_score_list > threshold) / len(gt_pos_score_list)
+            if len(gt_neg_score_list) == 0 and tpr > max_tpr:
+                max_tpr = tpr
+                result = "threshold: {}, fpr: 0.0, tpr: {:.5f}".format(
+                    threshold, tpr)
+                msg = f"The number of negative samples is 0, please add negative samples."
+                logger.warning(msg)
+            fpr = np.sum(
+                gt_neg_score_list > threshold) / len(gt_neg_score_list)
+            if fpr <= self.max_fpr and tpr > max_tpr:
+                max_tpr = tpr
+                result = "threshold: {}, fpr: {}, tpr: {:.5f}".format(
+                    threshold, fpr, tpr)
+        self.max_tpr = max_tpr
+        return result
+
+
+class MultilabelMeanAccuracy(nn.Layer):
+    def __init__(self,
+                 start_threshold=0.4,
+                 num_iterations=10,
+                 end_threshold=0.9):
+        super().__init__()
+        self.start_threshold = start_threshold
+        self.num_iterations = num_iterations
+        self.end_threshold = end_threshold
+        self.gt_all_score_list = []
+        self.gt_label_score_list = []
+        self.max_acc = 0.
+
+    def forward(self, x, label):
+        if isinstance(x, dict):
+            x = x["logits"]
+        x = F.sigmoid(x)
+        label = label[:, 0, :]
+        for i in range(len(x)):
+            self.gt_all_score_list.append(x[i].numpy())
+            self.gt_label_score_list.append(label[i].numpy())
+        return {}
+
+    def reset(self):
+        self.gt_all_score_list = []
+        self.gt_label_score_list = []
+        self.max_acc = 0.
+ + @property + def avg(self): + return self.max_acc + + @property + def avg_info(self): + max_acc = 0. + result = "" + gt_all_score_list = np.array(self.gt_all_score_list) + gt_label_score_list = np.array(self.gt_label_score_list) + for i in range(self.num_iterations): + threshold = self.start_threshold + i * (self.end_threshold - + self.start_threshold + ) / self.num_iterations + pred_label = (gt_all_score_list > threshold).astype(int) + TP = np.sum( + (gt_label_score_list == 1) * (pred_label == 1)).astype(float) + TN = np.sum( + (gt_label_score_list == 0) * (pred_label == 0)).astype(float) + acc = (TP + TN) / len(gt_all_score_list) + if max_acc <= acc: + max_acc = acc + result = "threshold: {}, mean_acc: {}".format( + threshold, max_acc / len(gt_label_score_list[0])) + self.max_acc = max_acc / len(gt_label_score_list[0]) + return result + + +class Recallk(nn.Layer): + def __init__(self, topk=(1, 5), descending=True): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + # get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = gallery_img_id.t() + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype("float32"), choosen_indices) + equal_flag = equal_flag & keep_mask.astype("bool") + equal_flag = paddle.cast(equal_flag, "float32") + real_query_num = paddle.sum(equal_flag, axis=1) + real_query_num = paddle.sum((real_query_num > 0.0).astype("float32")) + + acc_sum = paddle.cumsum(equal_flag, axis=1) + mask = (acc_sum > 0.0).astype("float32") + all_cmc = (paddle.sum(mask, axis=0) / real_query_num).numpy() + + for k in self.topk: + metric_dict["recall{}".format(k)] = all_cmc[k - 1] + return metric_dict + + +class Precisionk(nn.Layer): + def __init__(self, topk=(1, 5), descending=True): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + self.descending = descending + + def forward(self, similarities_matrix, query_img_id, gallery_img_id, + keep_mask): + metric_dict = dict() + + #get cmc + choosen_indices = paddle.argsort( + similarities_matrix, axis=1, descending=self.descending) + gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0]) + gallery_labels_transpose = paddle.broadcast_to( + gallery_labels_transpose, + shape=[ + choosen_indices.shape[0], gallery_labels_transpose.shape[1] + ]) + choosen_label = paddle.index_sample(gallery_labels_transpose, + choosen_indices) + equal_flag = paddle.equal(choosen_label, query_img_id) + if keep_mask is not None: + keep_mask = paddle.index_sample( + keep_mask.astype('float32'), choosen_indices) + equal_flag = paddle.logical_and(equal_flag, + keep_mask.astype('bool')) + equal_flag = paddle.cast(equal_flag, 'float32') + + Ns = paddle.arange(gallery_img_id.shape[0]) + 1 + equal_flag_cumsum = paddle.cumsum(equal_flag, axis=1) + Precision_at_k = (paddle.mean(equal_flag_cumsum, axis=0) / Ns).numpy() + + for k in self.topk: + metric_dict["precision@{}".format(k)] 
= Precision_at_k[k - 1] + + return metric_dict + + +class DistillationTopkAcc(TopkAcc): + def __init__(self, model_key, feature_key=None, topk=(1, 5)): + super().__init__(topk=topk) + self.model_key = model_key + self.feature_key = feature_key + + def forward(self, x, label): + if isinstance(x, dict): + x = x[self.model_key] + if self.feature_key is not None: + x = x[self.feature_key] + return super().forward(x, label) + + +class GoogLeNetTopkAcc(TopkAcc): + def __init__(self, topk=(1, 5)): + super().__init__() + assert isinstance(topk, (int, list, tuple)) + if isinstance(topk, int): + topk = [topk] + self.topk = topk + + def forward(self, x, label): + return super().forward(x[0], label) + + +class MultiLabelMetric(AvgMetrics): + def __init__(self, bi_threshold=0.5): + super().__init__() + self.bi_threshold = bi_threshold + + def _multi_hot_encode(self, output): + logits = F.sigmoid(output).numpy() + return binarize(logits, threshold=self.bi_threshold) + + +class HammingDistance(MultiLabelMetric): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + def __init__(self): + super().__init__() + self.reset() + + def reset(self): + self.avg_meters = {"HammingDistance": AverageMeter("HammingDistance")} + + def forward(self, output, target): + preds = super()._multi_hot_encode(output) + metric_dict = dict() + metric_dict["HammingDistance"] = paddle.to_tensor( + hamming_loss(target, preds)) + self.avg_meters["HammingDistance"].update( + float(metric_dict["HammingDistance"]), output.shape[0]) + return metric_dict + + +class AccuracyScore(MultiLabelMetric): + """ + Hard metric for multilabel classification + Args: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. 
+ Returns: + accuracy: + """ + + def __init__(self, base="label"): + super().__init__() + assert base in ["sample", "label" + ], 'must be one of ["sample", "label"]' + self.base = base + self.reset() + + def reset(self): + self.avg_meters = {"AccuracyScore": AverageMeter("AccuracyScore")} + + def forward(self, output, target): + preds = super()._multi_hot_encode(output) + metric_dict = dict() + if self.base == "sample": + accuracy = accuracy_metric(target, preds) + elif self.base == "label": + mcm = multilabel_confusion_matrix(target, preds) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + accuracy = (sum(tps) + sum(tns)) / ( + sum(tps) + sum(tns) + sum(fns) + sum(fps)) + metric_dict["AccuracyScore"] = paddle.to_tensor(accuracy) + self.avg_meters["AccuracyScore"].update( + float(metric_dict["AccuracyScore"]), output.shape[0]) + return metric_dict + + +def get_attr_metrics(gt_label, preds_probs, threshold): + """ + index: evaluated label index + adapted from "https://github.com/valencebond/Rethinking_of_PAR/blob/master/metrics/pedestrian_metrics.py" + """ + pred_label = (preds_probs > threshold).astype(int) + + eps = 1e-20 + result = EasyDict() + + has_fuyi = gt_label == -1 + pred_label[has_fuyi] = -1 + + ############################### + # label metrics + # TP + FN + result.gt_pos = np.sum((gt_label == 1), axis=0).astype(float) + # TN + FP + result.gt_neg = np.sum((gt_label == 0), axis=0).astype(float) + # TP + result.true_pos = np.sum((gt_label == 1) * (pred_label == 1), + axis=0).astype(float) + # TN + result.true_neg = np.sum((gt_label == 0) * (pred_label == 0), + axis=0).astype(float) + # FP + result.false_pos = np.sum(((gt_label == 0) * (pred_label == 1)), + axis=0).astype(float) + # FN + result.false_neg = np.sum(((gt_label == 1) * (pred_label == 0)), + axis=0).astype(float) + + ################ + # instance metrics + result.gt_pos_ins = np.sum((gt_label == 1), axis=1).astype(float) + result.true_pos_ins = np.sum((pred_label == 1), axis=1).astype(float) + # true positive + result.intersect_pos = np.sum((gt_label == 1) * (pred_label == 1), + axis=1).astype(float) + # IOU + result.union_pos = np.sum(((gt_label == 1) + (pred_label == 1)), + axis=1).astype(float) + + return result + + +class ATTRMetric(nn.Layer): + def __init__(self, threshold=0.5): + super().__init__() + self.threshold = threshold + + def reset(self): + self.attrmeter = AttrMeter(threshold=0.5) + + def forward(self, output, target): + metric_dict = get_attr_metrics(target[:, 0, :].numpy(), + output.numpy(), self.threshold) + self.attrmeter.update(metric_dict) + return metric_dict + + +class MultiLabelMAP(nn.Layer): + """ + Calculate multi-label classification mean average precision. + Currently, support two types: 11point and integral + + The code base on: + https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/metrics/map_utils.py + + Args: + map_type (str): Calculation method of mean average. 
+ """ + + def __init__(self, map_type='integral'): + super().__init__() + assert map_type in ['11point', 'integral'], \ + "map_type currently only support '11point' and 'integral'" + self.map_type = map_type + + self.reset() + + def reset(self): + self.is_latest = True + self.class_score_poss = None + self.class_gt_counts = None + self.mAP = 0.0 + + def one_class_update(self, score, gt_label, class_idx): + topk_idx = np.argsort(score)[::-1] + topk_score = score[topk_idx] + topk_gt_label = gt_label[topk_idx] + for s, l in zip(topk_score, topk_gt_label): + if int(l) == 1: + self.class_score_poss[class_idx].append([s, 1.]) + self.class_gt_counts[class_idx] += 1 + else: + self.class_score_poss[class_idx].append([s, 0.]) + + @staticmethod + def get_tp_fp_accum(score_pos_list): + """ + Calculate accumulating true/false positive results from + [score, pos] records + """ + sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) + + accum_tp = 0 + accum_fp = 0 + accum_tp_list = [] + accum_fp_list = [] + for (score, pos) in sorted_list: + accum_tp += int(pos) + accum_tp_list.append(accum_tp) + accum_fp += 1 - int(pos) + accum_fp_list.append(accum_fp) + + return accum_tp_list, accum_fp_list + + def compute_mAP(self): + if not self.is_latest: + mAP = 0. + valid_cnt = 0 + for score_pos, count in zip(self.class_score_poss, + self.class_gt_counts): + if count == 0: + continue + + if len(score_pos) == 0: + valid_cnt += 1 + continue + + accum_tp_list, accum_fp_list = \ + self.get_tp_fp_accum(score_pos) + precision = [] + recall = [] + for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): + precision.append(float(ac_tp) / (ac_tp + ac_fp)) + recall.append(float(ac_tp) / count) + + one_class_ap = 0.0 + if self.map_type == '11point': + max_precisions = [0.] * 11 + start_idx = len(precision) - 1 + for j in range(10, -1, -1): + for i in range(start_idx, -1, -1): + if recall[i] < float(j) / 10.: + start_idx = i + if j > 0: + max_precisions[j - 1] = max_precisions[j] + break + else: + if max_precisions[j] < precision[i]: + max_precisions[j] = precision[i] + one_class_ap = sum(max_precisions) / 11. + mAP += one_class_ap + valid_cnt += 1 + elif self.map_type == 'integral': + import math + prev_recall = 0. 
+ for i in range(len(precision)): + recall_gap = math.fabs(recall[i] - prev_recall) + if recall_gap > 1e-6: + one_class_ap += precision[i] * recall_gap + prev_recall = recall[i] + mAP += one_class_ap + valid_cnt += 1 + else: + raise NotImplementedError( + f"Unsupported mAP type {self.map_type}") + + self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP + + self.is_latest = True + + def forward(self, output, target): + scores = F.sigmoid(output).numpy() + gt_labels = target.numpy() + + if self.class_score_poss is None: + self.class_score_poss = [[] for _ in range(scores.shape[-1])] + if self.class_gt_counts is None: + self.class_gt_counts = [0] * scores.shape[-1] + + for class_idx in range(scores.shape[-1]): + score = scores[:, class_idx] + gt_label = gt_labels[:, class_idx] + self.one_class_update(score, gt_label, class_idx) + + self.is_latest = False + + return {} + + @property + def avg_info(self): + self.compute_mAP() + return f"MultiLabelMAP({self.map_type}): {self.mAP:.3f}" + + @property + def avg(self): + self.compute_mAP() + return self.mAP diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/__init__.py new file mode 100644 index 000000000..992bc57a1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/__init__.py @@ -0,0 +1,137 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import copy +import paddle +from typing import Dict, List + +from ppcls.engine.train.utils import type_name +from ppcls.utils import logger + +from . import optimizer + +__all__ = ['build_optimizer'] + + +def build_lr_scheduler(lr_config, epochs, step_each_epoch): + from . import learning_rate + lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch}) + if 'name' in lr_config: + lr_name = lr_config.pop('name') + lr = getattr(learning_rate, lr_name)(**lr_config) + if isinstance(lr, paddle.optimizer.lr.LRScheduler): + return lr + else: + return lr() + else: + lr = lr_config['learning_rate'] + return lr + + +# model_list is None in static graph +def build_optimizer(config, epochs, step_each_epoch, model_list=None): + optim_config = copy.deepcopy(config) + if isinstance(optim_config, dict): + # convert {'name': xxx, **optim_cfg} to [{name: {scope: xxx, **optim_cfg}}] + optim_name = optim_config.pop("name") + optim_config: List[Dict[str, Dict]] = [{ + optim_name: { + 'scope': "all", + ** + optim_config + } + }] + optim_list = [] + lr_list = [] + """NOTE: + Currently only support optim objets below. + 1. single optimizer config. + 2. next level uner Arch, such as Arch.backbone, Arch.neck, Arch.head. + 3. loss which has parameters, such as CenterLoss. 
+ """ + for optim_item in optim_config: + # optim_cfg = {optim_name: {scope: xxx, **optim_cfg}} + # step1 build lr + optim_name = list(optim_item.keys())[0] # get optim_name + optim_scope = optim_item[optim_name].pop('scope') # get optim_scope + optim_cfg = optim_item[optim_name] # get optim_cfg + + lr = build_lr_scheduler(optim_cfg.pop('lr'), epochs, step_each_epoch) + logger.debug("build lr ({}) for scope ({}) success..".format( + lr, optim_scope)) + # step2 build regularization + if 'regularizer' in optim_cfg and optim_cfg['regularizer'] is not None: + if 'weight_decay' in optim_cfg: + logger.warning( + "ConfigError: Only one of regularizer and weight_decay can be set in Optimizer Config. \"weight_decay\" has been ignored." + ) + reg_config = optim_cfg.pop('regularizer') + reg_name = reg_config.pop('name') + 'Decay' + reg = getattr(paddle.regularizer, reg_name)(**reg_config) + optim_cfg["weight_decay"] = reg + logger.debug("build regularizer ({}) for scope ({}) success..". + format(reg, optim_scope)) + # step3 build optimizer + if 'clip_norm' in optim_cfg: + clip_norm = optim_cfg.pop('clip_norm') + grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm) + else: + grad_clip = None + optim_model = [] + + # for static graph + if model_list is None: + optim = getattr(optimizer, optim_name)( + learning_rate=lr, grad_clip=grad_clip, + **optim_cfg)(model_list=optim_model) + return optim, lr + + # for dynamic graph + for i in range(len(model_list)): + if len(model_list[i].parameters()) == 0: + continue + if optim_scope == "all": + # optimizer for all + optim_model.append(model_list[i]) + else: + if optim_scope.endswith("Loss"): + # optimizer for loss + for m in model_list[i].sublayers(True): + if type_name(m) == optim_scope: + optim_model.append(m) + else: + # opmizer for module in model, such as backbone, neck, head... + if optim_scope == type_name(model_list[i]): + optim_model.append(model_list[i]) + elif hasattr(model_list[i], optim_scope): + optim_model.append(getattr(model_list[i], optim_scope)) + else: + for name, layer in model_list[i].named_sublayers(): + if len(layer.parameters()) != 0 \ + and re.fullmatch(optim_scope, name): + optim_model.append(layer) + + optim = getattr(optimizer, optim_name)( + learning_rate=lr, grad_clip=grad_clip, + **optim_cfg)(model_list=optim_model) + logger.debug("build optimizer ({}) for scope ({}) success..".format( + optim, optim_scope)) + optim_list.append(optim) + lr_list.append(lr) + return optim_list, lr_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/learning_rate.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/learning_rate.py new file mode 100644 index 000000000..168bf5e39 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/learning_rate.py @@ -0,0 +1,687 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import (absolute_import, division, print_function, + unicode_literals) +import math +import types +from abc import abstractmethod +from typing import Union +from paddle.optimizer import lr +from ppcls.utils import logger + + +class LRBase(object): + """Base class for custom learning rates + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + warmup_epoch (int): number of warmup epoch(s) + warmup_start_lr (float): start learning rate within warmup + last_epoch (int): last epoch + by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter + verbose (bool): If True, prints a message to stdout for each update. Defaults to False + """ + + def __init__(self, + epochs: int, + step_each_epoch: int, + learning_rate: float, + warmup_epoch: int, + warmup_start_lr: float, + last_epoch: int, + by_epoch: bool, + verbose: bool=False) -> None: + """Initialize and record the necessary parameters + """ + super(LRBase, self).__init__() + if warmup_epoch >= epochs: + msg = f"When using warm up, the value of \"Global.epochs\" must be greater than value of \"Optimizer.lr.warmup_epoch\". The value of \"Optimizer.lr.warmup_epoch\" has been set to {epochs}." + logger.warning(msg) + warmup_epoch = epochs + self.epochs = epochs + self.step_each_epoch = step_each_epoch + self.learning_rate = learning_rate + self.warmup_epoch = warmup_epoch + self.warmup_steps = self.warmup_epoch if by_epoch else round( + self.warmup_epoch * self.step_each_epoch) + self.warmup_start_lr = warmup_start_lr + self.last_epoch = last_epoch + self.by_epoch = by_epoch + self.verbose = verbose + + @abstractmethod + def __call__(self, *kargs, **kwargs) -> lr.LRScheduler: + """generate an learning rate scheduler + + Returns: + lr.LinearWarmup: learning rate scheduler + """ + pass + + def linear_warmup( + self, + learning_rate: Union[float, lr.LRScheduler]) -> lr.LinearWarmup: + """Add an Linear Warmup before learning_rate + + Args: + learning_rate (Union[float, lr.LRScheduler]): original learning rate without warmup + + Returns: + lr.LinearWarmup: learning rate scheduler with warmup + """ + warmup_lr = lr.LinearWarmup( + learning_rate=learning_rate, + warmup_steps=self.warmup_steps, + start_lr=self.warmup_start_lr, + end_lr=self.learning_rate, + last_epoch=self.last_epoch, + verbose=self.verbose) + return warmup_lr + + +class Constant(lr.LRScheduler): + """Constant learning rate Class implementation + + Args: + learning_rate (float): The initial learning rate + last_epoch (int, optional): The index of last epoch. Default: -1. 
+ """ + + def __init__(self, learning_rate, last_epoch=-1, **kwargs): + self.learning_rate = learning_rate + self.last_epoch = last_epoch + super(Constant, self).__init__() + + def get_lr(self) -> float: + """always return the same learning rate + """ + return self.learning_rate + + +class ConstLR(LRBase): + """Constant learning rate + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + warmup_epoch (int): number of warmup epoch(s) + warmup_start_lr (float): start learning rate within warmup + last_epoch (int): last epoch + by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(ConstLR, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, + last_epoch, by_epoch) + + def __call__(self): + learning_rate = Constant( + learning_rate=self.learning_rate, last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Linear(LRBase): + """Linear learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + end_lr (float, optional): The minimum final learning rate. Defaults to 0.0. + power (float, optional): Power of polynomial. Defaults to 1.0. + warmup_epoch (int): number of warmup epoch(s) + warmup_start_lr (float): start learning rate within warmup + last_epoch (int): last epoch + by_epoch (bool): learning rate decays by epoch when by_epoch is True, else by iter + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + end_lr=0.0, + power=1.0, + cycle=False, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(Linear, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, + by_epoch) + self.decay_steps = (epochs - self.warmup_epoch) * step_each_epoch + self.end_lr = end_lr + self.power = power + self.cycle = cycle + self.warmup_steps = round(self.warmup_epoch * step_each_epoch) + if self.by_epoch: + self.decay_steps = self.epochs - self.warmup_epoch + + def __call__(self): + learning_rate = lr.PolynomialDecay( + learning_rate=self.learning_rate, + decay_steps=self.decay_steps, + end_lr=self.end_lr, + power=self.power, + cycle=self.cycle, + last_epoch=self.last_epoch) if self.decay_steps > 0 else Constant( + self.learning_rate) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Cosine(LRBase): + """Cosine learning rate decay + + ``lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)`` + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + eta_min (float, optional): Minimum learning rate. Defaults to 0.0. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. 
Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + eta_min=0.0, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(Cosine, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, + by_epoch) + self.T_max = (self.epochs - self.warmup_epoch) * self.step_each_epoch + self.eta_min = eta_min + if self.by_epoch: + self.T_max = self.epochs - self.warmup_epoch + + def __call__(self): + learning_rate = lr.CosineAnnealingDecay( + learning_rate=self.learning_rate, + T_max=self.T_max, + eta_min=self.eta_min, + last_epoch=self.last_epoch) if self.T_max > 0 else Constant( + self.learning_rate) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Cyclic(LRBase): + """Cyclic learning rate decay + + Args: + epochs (int): Total epoch(s). + step_each_epoch (int): Number of iterations within an epoch. + base_learning_rate (float): Initial learning rate, which is the lower boundary in the cycle. The paper recommends + that set the base_learning_rate to 1/3 or 1/4 of max_learning_rate. + max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above. + Since there is some scaling operation during process of learning rate adjustment, + max_learning_rate may not actually be reached. + warmup_epoch (int): Number of warmup epoch(s). + warmup_start_lr (float): Start learning rate within warmup. + step_size_up (int): Number of training steps, which is used to increase learning rate in a cycle. + The step size of one cycle will be defined by step_size_up + step_size_down. According to the paper, step + size should be set as at least 3 or 4 times steps in one epoch. + step_size_down (int, optional): Number of training steps, which is used to decrease learning rate in a cycle. + If not specified, it's value will initialize to `` step_size_up `` . Default: None. + mode (str, optional): One of 'triangular', 'triangular2' or 'exp_range'. + If scale_fn is specified, this argument will be ignored. Default: 'triangular'. + exp_gamma (float): Constant in 'exp_range' scaling function: exp_gamma**iterations. Used only when mode = 'exp_range'. Default: 1.0. + scale_fn (function, optional): A custom scaling function, which is used to replace three build-in methods. + It should only have one argument. For all x >= 0, 0 <= scale_fn(x) <= 1. + If specified, then 'mode' will be ignored. Default: None. + scale_mode (str, optional): One of 'cycle' or 'iterations'. Defines whether scale_fn is evaluated on cycle + number or cycle iterations (total iterations since start of training). Default: 'cycle'. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + by_epoch (bool): Learning rate decays by epoch when by_epoch is True, else by iter. + verbose: (bool, optional): If True, prints a message to stdout for each update. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + base_learning_rate, + max_learning_rate, + warmup_epoch, + warmup_start_lr, + step_size_up, + step_size_down=None, + mode='triangular', + exp_gamma=1.0, + scale_fn=None, + scale_mode='cycle', + by_epoch=False, + last_epoch=-1, + verbose=False): + super(Cyclic, self).__init__( + epochs, step_each_epoch, base_learning_rate, warmup_epoch, + warmup_start_lr, last_epoch, by_epoch, verbose) + self.base_learning_rate = base_learning_rate + self.max_learning_rate = max_learning_rate + self.step_size_up = step_size_up + self.step_size_down = step_size_down + self.mode = mode + self.exp_gamma = exp_gamma + self.scale_fn = scale_fn + self.scale_mode = scale_mode + + def __call__(self): + learning_rate = lr.CyclicLR( + base_learning_rate=self.base_learning_rate, + max_learning_rate=self.max_learning_rate, + step_size_up=self.step_size_up, + step_size_down=self.step_size_down, + mode=self.mode, + exp_gamma=self.exp_gamma, + scale_fn=self.scale_fn, + scale_mode=self.scale_mode, + last_epoch=self.last_epoch, + verbose=self.verbose) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Step(LRBase): + """Step learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + step_size (int|float): the interval to update. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma``. It should be less than 1.0. Default: 0.1. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + step_size, + gamma, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(Step, self).__init__(epochs, step_each_epoch, learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, + by_epoch) + self.step_size = int(step_size * step_each_epoch) + self.gamma = gamma + if self.by_epoch: + self.step_size = step_size + + def __call__(self): + learning_rate = lr.StepDecay( + learning_rate=self.learning_rate, + step_size=self.step_size, + gamma=self.gamma, + last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class Piecewise(LRBase): + """Piecewise learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + decay_epochs (List[int]): A list of steps numbers. The type of element in the list is python int. + values (List[float]): A list of learning rate values that will be picked during different epoch boundaries. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + decay_epochs, + values, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + learning_rate=None, + **kwargs): + if learning_rate: + decay_epochs = list(range(0, epochs, 30)) + values = [ + learning_rate * (0.1**i) for i in range(len(decay_epochs)) + ] + # when total epochs < 30, decay_epochs and values should be + # [] and [lr] respectively, but paddle dont support. + if len(decay_epochs) == 1: + decay_epochs = [epochs] + values = [values[0], values[0]] + else: + decay_epochs = decay_epochs[1:] + logger.warning( + "When 'learning_rate' of Piecewise has beed set, " + "the learning rate scheduler would be set by the rule that lr decay 10 times every 30 epochs. " + f"So, the 'decay_epochs' and 'values' have been set to {decay_epochs} and {values} respectively." + ) + super(Piecewise, + self).__init__(epochs, step_each_epoch, values[0], warmup_epoch, + warmup_start_lr, last_epoch, by_epoch) + + self.values = values + self.boundaries_steps = [e * step_each_epoch for e in decay_epochs] + if self.by_epoch is True: + self.boundaries_steps = decay_epochs + + def __call__(self): + learning_rate = lr.PiecewiseDecay( + boundaries=self.boundaries_steps, + values=self.values, + last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class MultiStepDecay(LRBase): + """MultiStepDecay learning rate decay + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + milestones (List[int]): List of each boundaries. Must be increasing. + gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma``. It should be less than 1.0. Defaults to 0.1. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + milestones, + gamma=0.1, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(MultiStepDecay, self).__init__( + epochs, step_each_epoch, learning_rate, warmup_epoch, + warmup_start_lr, last_epoch, by_epoch) + self.milestones = [x * step_each_epoch for x in milestones] + self.gamma = gamma + if self.by_epoch: + self.milestones = milestones + + def __call__(self): + learning_rate = lr.MultiStepDecay( + learning_rate=self.learning_rate, + milestones=self.milestones, + gamma=self.gamma, + last_epoch=self.last_epoch) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class ReduceOnPlateau(LRBase): + """ReduceOnPlateau learning rate decay + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + mode (str, optional): ``'min'`` or ``'max'`` can be selected. Normally, it is ``'min'`` , which means that the + learning rate will reduce when ``loss`` stops descending. Specially, if it's set to ``'max'``, the learning + rate will reduce when ``loss`` stops ascending. Defaults to ``'min'``. 
+ factor (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * factor`` . + It should be less than 1.0. Defaults to 0.1. + patience (int, optional): When ``loss`` doesn't improve for this number of epochs, learing rate will be reduced. + Defaults to 10. + threshold (float, optional): ``threshold`` and ``threshold_mode`` will determine the minimum change of ``loss`` . + This make tiny changes of ``loss`` will be ignored. Defaults to 1e-4. + threshold_mode (str, optional): ``'rel'`` or ``'abs'`` can be selected. In ``'rel'`` mode, the minimum change of ``loss`` + is ``last_loss * threshold`` , where ``last_loss`` is ``loss`` in last epoch. In ``'abs'`` mode, the minimum + change of ``loss`` is ``threshold`` . Defaults to ``'rel'`` . + cooldown (int, optional): The number of epochs to wait before resuming normal operation. Defaults to 0. + min_lr (float, optional): The lower bound of the learning rate after reduction. Defaults to 0. + epsilon (float, optional): Minimal decay applied to lr. If the difference between new and old lr is smaller than epsilon, + the update is ignored. Defaults to 1e-8. + warmup_epoch (int, optional): The epoch numbers for LinearWarmup. Defaults to 0. + warmup_start_lr (float, optional): start learning rate within warmup. Defaults to 0.0. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. + """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + mode='min', + factor=0.1, + patience=10, + threshold=1e-4, + threshold_mode='rel', + cooldown=0, + min_lr=0, + epsilon=1e-8, + warmup_epoch=0, + warmup_start_lr=0.0, + last_epoch=-1, + by_epoch=False, + **kwargs): + super(ReduceOnPlateau, self).__init__( + epochs, step_each_epoch, learning_rate, warmup_epoch, + warmup_start_lr, last_epoch, by_epoch) + self.mode = mode + self.factor = factor + self.patience = patience + self.threshold = threshold + self.threshold_mode = threshold_mode + self.cooldown = cooldown + self.min_lr = min_lr + self.epsilon = epsilon + + def __call__(self): + learning_rate = lr.ReduceOnPlateau( + learning_rate=self.learning_rate, + mode=self.mode, + factor=self.factor, + patience=self.patience, + threshold=self.threshold, + threshold_mode=self.threshold_mode, + cooldown=self.cooldown, + min_lr=self.min_lr, + epsilon=self.epsilon) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + # NOTE: Implement get_lr() method for class `ReduceOnPlateau`, + # which is called in `log_info` function + def get_lr(self): + return self.last_lr + + learning_rate.get_lr = types.MethodType(get_lr, learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class CosineFixmatch(LRBase): + """Cosine decay in FixMatch style + + Args: + epochs (int): total epoch(s) + step_each_epoch (int): number of iterations within an epoch + learning_rate (float): learning rate + num_warmup_steps (int): the number warmup steps. + warmunum_cycles (float, optional): the factor for cosine in FixMatch learning rate. Defaults to 7 / 16. + last_epoch (int, optional): last epoch. Defaults to -1. + by_epoch (bool, optional): learning rate decays by epoch when by_epoch is True, else by iter. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + learning_rate, + num_warmup_steps, + num_cycles=7 / 16, + last_epoch=-1, + by_epoch=False): + self.epochs = epochs + self.step_each_epoch = step_each_epoch + self.learning_rate = learning_rate + self.num_warmup_steps = num_warmup_steps + self.num_cycles = num_cycles + self.last_epoch = last_epoch + self.by_epoch = by_epoch + + def __call__(self): + def _lr_lambda(current_step): + if current_step < self.num_warmup_steps: + return float(current_step) / float( + max(1, self.num_warmup_steps)) + no_progress = float(current_step - self.num_warmup_steps) / \ + float(max(1, self.epochs * self.step_each_epoch - self.num_warmup_steps)) + return max(0., math.cos(math.pi * self.num_cycles * no_progress)) + + learning_rate = lr.LambdaDecay( + learning_rate=self.learning_rate, + lr_lambda=_lr_lambda, + last_epoch=self.last_epoch) + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate + + +class OneCycleLR(LRBase): + """OneCycleLR learning rate decay + + Args: + epochs (int): Total epoch(s). + step_each_epoch (int): Number of iterations within an epoch. + max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above. + Since there is some scaling operation during process of learning rate adjustment, + max_learning_rate may not actually be reached. + warmup_epoch (int): Number of warmup epoch(s). + warmup_start_lr (float): Start learning rate within warmup. + divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. + end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. + anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'. + three_phase (bool, optional): Whether to use three-phase. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + by_epoch (bool): Learning rate decays by epoch when by_epoch is True, else by iter. + verbose: (bool, optional): If True, prints a message to stdout for each update. Defaults to False. 
+ """ + + def __init__(self, + epochs, + step_each_epoch, + max_learning_rate, + warmup_epoch=0, + warmup_start_lr=0.0, + divide_factor=25.0, + end_learning_rate=1e-10, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + by_epoch=False, + last_epoch=-1, + verbose=False): + super().__init__( + epochs, step_each_epoch, max_learning_rate, + warmup_epoch, warmup_start_lr, last_epoch, by_epoch, verbose) + self.max_learning_rate = max_learning_rate + self.total_steps = epochs * step_each_epoch + self.divide_factor = divide_factor + self.end_learning_rate = end_learning_rate + self.phase_pct = phase_pct + self.anneal_strategy = anneal_strategy + self.three_phase = three_phase + self.last_epoch = last_epoch + + def __call__(self): + learning_rate = lr.OneCycleLR( + max_learning_rate=self.max_learning_rate, + total_steps=self.total_steps, + end_learning_rate=self.end_learning_rate, + divide_factor=self.divide_factor, + phase_pct=self.phase_pct, + anneal_strategy=self.anneal_strategy, + three_phase=self.three_phase, + last_epoch=self.last_epoch, + verbose=self.verbose) + + if self.warmup_steps > 0: + learning_rate = self.linear_warmup(learning_rate) + + setattr(learning_rate, "by_epoch", self.by_epoch) + return learning_rate diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/optimizer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/optimizer.py new file mode 100644 index 000000000..adc417c87 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/optimizer/optimizer.py @@ -0,0 +1,518 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import paddle +from paddle import optimizer as optim +from ppcls.utils import logger +from functools import partial + + +class SGD(object): + """ + Args: + learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. + It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . 
There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): The default value is None. Normally there is no need for user + to set this property. + """ + + def __init__(self, + learning_rate=0.001, + weight_decay=None, + grad_clip=None, + multi_precision=False, + name=None): + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + self.name = name + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + argspec = inspect.getfullargspec(optim.SGD.__init__).args + if 'multi_precision' in argspec: + opt = optim.SGD(learning_rate=self.learning_rate, + parameters=parameters, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + name=self.name) + else: + opt = optim.SGD(learning_rate=self.learning_rate, + parameters=parameters, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name) + return opt + + +class Momentum(object): + """ + Simple Momentum optimizer with velocity state. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum, + weight_decay=None, + grad_clip=None, + use_nesterov=False, + multi_precision=True, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.multi_precision = multi_precision + self.use_nesterov = use_nesterov + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = None + if model_list: + # TODO(gaotingquan): to avoid cause issues for unset no_weight_decay models + if len(self.no_weight_decay_name_list) > 0: + params_with_decay = [] + params_without_decay = [] + for m in model_list: + for n, p in m.named_parameters(): + if any(nd in n for nd in self.no_weight_decay_name_list) \ + or (self.one_dim_param_no_weight_decay and len(p.shape) == 1): + params_without_decay.append(p) + else: + params_with_decay.append(p) + parameters = [{ + "params": params_with_decay, + "weight_decay": self.weight_decay + }, { + "params": params_without_decay, + "weight_decay": 0.0 + }] + else: + parameters = sum([m.parameters() for m in model_list], []) + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + use_nesterov=self.use_nesterov, + parameters=parameters) + if hasattr(opt, '_use_multi_tensor'): + opt = optim.Momentum( + learning_rate=self.learning_rate, + momentum=self.momentum, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + multi_precision=self.multi_precision, + parameters=parameters, + use_nesterov=self.use_nesterov, + 
use_multi_tensor=True) + return opt + + +class Adam(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + parameter_list=None, + weight_decay=None, + grad_clip=None, + name=None, + lazy_mode=False, + multi_precision=False): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.parameter_list = parameter_list + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.name = name + self.lazy_mode = lazy_mode + self.multi_precision = multi_precision + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + opt = optim.Adam( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + name=self.name, + lazy_mode=self.lazy_mode, + multi_precision=self.multi_precision, + parameters=parameters) + return opt + + +class RMSProp(object): + """ + Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method. + Args: + learning_rate (float|Variable) - The learning rate used to update parameters. + Can be a float value or a Variable with one float value as data element. + momentum (float) - Momentum factor. + rho (float) - rho value in equation. + epsilon (float) - avoid division by zero, default is 1e-6. + regularization (WeightDecayRegularizer, optional) - The strategy of regularization. + """ + + def __init__(self, + learning_rate, + momentum=0.0, + rho=0.95, + epsilon=1e-6, + weight_decay=None, + grad_clip=None, + multi_precision=False, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False): + super().__init__() + self.learning_rate = learning_rate + self.momentum = momentum + self.rho = rho + self.epsilon = epsilon + self.weight_decay = weight_decay + self.grad_clip = grad_clip + self.no_weight_decay_name_list = no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = None + if model_list: + params_with_decay = [] + params_without_decay = [] + for m in model_list: + for n, p in m.named_parameters(): + if any(nd in n for nd in self.no_weight_decay_name_list) \ + or (self.one_dim_param_no_weight_decay and len(p.shape) == 1): + params_without_decay.append(p) + else: + params_with_decay.append(p) + if params_without_decay: + parameters = [{ + "params": params_with_decay, + "weight_decay": self.weight_decay + }, { + "params": params_without_decay, + "weight_decay": 0.0 + }] + else: + parameters = params_with_decay + opt = optim.RMSProp( + learning_rate=self.learning_rate, + momentum=self.momentum, + rho=self.rho, + epsilon=self.epsilon, + weight_decay=self.weight_decay, + grad_clip=self.grad_clip, + parameters=parameters) + return opt + + +class AdamW(object): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + no_weight_decay_name=None, + one_dim_param_no_weight_decay=False, + **args): + super().__init__() + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.no_weight_decay_name_list = 
no_weight_decay_name.split( + ) if no_weight_decay_name else [] + self.one_dim_param_no_weight_decay = one_dim_param_no_weight_decay + + def __call__(self, model_list): + # model_list is None in static graph + parameters = sum([m.parameters() for m in model_list], + []) if model_list else None + + # TODO(gaotingquan): model_list is None when in static graph, "no_weight_decay" not work. + if model_list is None: + if self.one_dim_param_no_weight_decay or len( + self.no_weight_decay_name_list) != 0: + msg = "\"AdamW\" does not support setting \"no_weight_decay\" in static graph. Please use dynamic graph." + logger.error(Exception(msg)) + raise Exception(msg) + + self.no_weight_decay_param_name_list = [ + p.name for model in model_list for n, p in model.named_parameters() + if any(nd in n for nd in self.no_weight_decay_name_list) + ] if model_list else [] + + if self.one_dim_param_no_weight_decay: + self.no_weight_decay_param_name_list += [ + p.name + for model in model_list for n, p in model.named_parameters() + if len(p.shape) == 1 + ] if model_list else [] + + opt = optim.AdamW( + learning_rate=self.learning_rate, + beta1=self.beta1, + beta2=self.beta2, + epsilon=self.epsilon, + parameters=parameters, + weight_decay=self.weight_decay, + multi_precision=self.multi_precision, + grad_clip=self.grad_clip, + apply_decay_param_fun=self._apply_decay_param_fun) + return opt + + def _apply_decay_param_fun(self, name): + return name not in self.no_weight_decay_param_name_list + + +class AdamWDL(object): + """ + The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. + Generally it's used for transformer model. + """ + + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + weight_decay=None, + multi_precision=False, + grad_clip=None, + layerwise_decay=None, + filter_bias_and_bn=True, + **args): + self.learning_rate = learning_rate + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.grad_clip = grad_clip + self.weight_decay = weight_decay + self.multi_precision = multi_precision + self.layerwise_decay = layerwise_decay + self.filter_bias_and_bn = filter_bias_and_bn + + class AdamWDLImpl(optim.AdamW): + def __init__(self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=0.01, + apply_decay_param_fun=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + layerwise_decay=1.0, + n_layers=12, + name_dict=None, + name=None): + if not isinstance(layerwise_decay, float) and \ + not isinstance(layerwise_decay, paddle.static.Variable): + raise TypeError("coeff should be float or Tensor.") + self.layerwise_decay = layerwise_decay + self.name_dict = name_dict + self.n_layers = n_layers + self._coeff = weight_decay + self._lr_to_coeff = dict() + self.set_param_lr_func = partial( + self._layerwise_lr_decay, layerwise_decay, name_dict, n_layers) + super().__init__( + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + apply_decay_param_fun=apply_decay_param_fun, + weight_decay=weight_decay, + lazy_mode=lazy_mode, + multi_precision=multi_precision,) + + # Layerwise decay + def _layerwise_lr_decay(self, decay_rate, name_dict, n_layers, param): + """ + Args: + decay_rate (float): + The layer-wise decay ratio. + name_dict (dict): + The keys of name_dict is dynamic name of model while the value + of name_dict is static name. + Use model.named_parameters() to get name_dict. 
+ n_layers (int): + Total number of layers in the transformer encoder. + """ + ratio = 1.0 + static_name = name_dict[param.name] + if "blocks" in static_name: + idx = static_name.find("blocks.") + layer = int(static_name[idx:].split(".")[1]) + ratio = decay_rate**(n_layers - layer) + elif any([ + key in static_name + for key in ["embed", "token", "conv1", "ln_pre"] + ]): + ratio = decay_rate**(n_layers + 1) + # param.optimize_attr["learning_rate"] *= ratio + return ratio + def _append_decoupled_weight_decay(self, block, param_and_grad): + """ + Add decoupled weight decay op. + parameter = parameter - parameter * coeff * lr + Args: + block: block in which variable is to be created + param_and_grad: (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. + """ + if isinstance(param_and_grad, dict): + param_and_grad = self._update_param_group(param_and_grad) + param, grad = param_and_grad + + if self._apply_decay_param_fun is not None and not self._apply_decay_param_fun(param.name): + return + + if isinstance(self._learning_rate, float): + learning_rate = self._learning_rate + else: + # NOTE. We add this function to the _append_optimize_op(), + # for we must make sure _create_param_lr() be called after + # optimizer._create_global_learning_rate(). + learning_rate = self._create_param_lr(param_and_grad) + + with block.program._optimized_guard([param, grad]), paddle.static.name_scope("weight decay"): + self._params_name.add(param.name) + + # If it has been calculated, the result will be reused. + # NOTE(wangxi): In dygraph mode, apply_gradient will be executed + # every step, so need clear _lr_to_coeff every step, + # we do this in _create_optimization_pass + decay_coeff = self._lr_to_coeff.get(learning_rate, None) + if decay_coeff is None: + # NOTE(wangxi): for pipeline to set device:all + with paddle.static.device_guard(None): + decay_coeff = 1.0 - learning_rate * self._coeff + self._lr_to_coeff[learning_rate] = decay_coeff + + find_master = self._multi_precision and param.dtype == paddle.float16 + if find_master: + master_weight = self._master_weights[param.name] + scaled_param = master_weight * decay_coeff + paddle.assign(scaled_param, output=master_weight) + else: + scaled_param = param * decay_coeff + paddle.assign(scaled_param, output=param) + + def _append_optimize_op(self, block, param_and_grad): + if self.set_param_lr_func is None: + return super()._append_optimize_op(block, param_and_grad) + + self._append_decoupled_weight_decay(block, param_and_grad) + prev_lr = param_and_grad[0].optimize_attr["learning_rate"] + ratio = self.set_param_lr_func(param_and_grad[0]) + param_and_grad[0].optimize_attr["learning_rate"] *= ratio + + # excute Adam op + res = super()._append_optimize_op(block, param_and_grad) + param_and_grad[0].optimize_attr["learning_rate"] = prev_lr + return res + + def __call__(self, model_list): + model = model_list[0] + if self.weight_decay and self.filter_bias_and_bn: + skip = {} + if hasattr(model, 'no_weight_decay'): + skip = model.no_weight_decay() + decay_dict = { + param.name: not (len(param.shape) == 1 or + name.endswith(".bias") or name in skip) + for name, param in model.named_parameters() + if not 'teacher' in name + } + parameters = [ + param for param in model.parameters() + if 'teacher' not in param.name + ] + weight_decay = 0. 
+        else:
+            decay_dict = None  # stays None when weight-decay filtering is skipped, so the check below is safe
+            parameters = model.parameters()
+        opt_args = dict(
+            learning_rate=self.learning_rate, weight_decay=self.weight_decay)
+        opt_args['parameters'] = parameters
+        if decay_dict is not None:
+            opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n]
+        opt_args['epsilon'] = self.epsilon
+        opt_args['beta1'] = self.beta1
+        opt_args['beta2'] = self.beta2
+        if self.layerwise_decay and self.layerwise_decay < 1.0:
+            opt_args['layerwise_decay'] = self.layerwise_decay
+            name_dict = dict()
+            for n, p in model.named_parameters():
+                name_dict[p.name] = n
+            opt_args['name_dict'] = name_dict
+            opt_args['n_layers'] = model.get_num_layers()
+        optimizer = self.AdamWDLImpl(**opt_args)
+
+        return optimizer
\ No newline at end of file
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/README.md b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/README.md
new file mode 100644
index 000000000..24093f4d0
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/README.md
@@ -0,0 +1,19 @@
+# Training in Static Graph Mode
+
+PaddlePaddle supports both dynamic graph and static graph modes. Dynamic graph training covers most scenarios, and after several releases of optimization its performance is now comparable to static graph training. If static graph training is really required, we recommend the dynamic-to-static feature first: build the model in the easier dynamic graph style and then convert it to static graph mode for training. For compatibility with static graph training, PaddleClas also provides a native static graph training entry.
+
+## 1. Dynamic-to-Static Training
+
+Simply set `Global.to_static` to `True` in the training configuration file and provide the input data shape via `Global.image_shape`, e.g. `[3, 224, 224]`.
+
+## 2. Static Graph Training
+
+In PaddleClas, static graph training works like dynamic graph training: training parameters are specified through a configuration file, and the entry script is `ppcls/static/train.py`. Taking ResNet50 as an example, the launch command is as follows:
+
+```bash
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+python3.7 -m paddle.distributed.launch \
+    --gpus="0,1,2,3" \
+    ppcls/static/train.py \
+    -c ./ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml
+```
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/program.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/program.py
new file mode 100644
index 000000000..0e8d0990d
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/program.py
@@ -0,0 +1,445 @@
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
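
To make the layer-wise decay used by `AdamWDL` above concrete, here is a small stand-alone sketch of the ratio computed by `_layerwise_lr_decay`; the parameter names, `decay_rate`, and `n_layers` are illustrative assumptions, not values taken from this patch.

```python
def layerwise_lr_ratio(static_name, decay_rate=0.75, n_layers=12):
    # Deeper transformer blocks keep a larger share of the base LR;
    # embedding/stem parameters get the smallest share.
    if "blocks" in static_name:
        layer = int(static_name.split("blocks.")[1].split(".")[0])
        return decay_rate ** (n_layers - layer)
    if any(key in static_name for key in ["embed", "token", "conv1", "ln_pre"]):
        return decay_rate ** (n_layers + 1)
    return 1.0

print(layerwise_lr_ratio("blocks.11.attn.qkv.weight"))  # 0.75 ** 1
print(layerwise_lr_ratio("patch_embed.proj.weight"))    # 0.75 ** 13
print(layerwise_lr_ratio("head.weight"))                # 1.0
```

The last block gets a ratio close to 1.0 while embedding/stem parameters are scaled down the most, which is the usual layer-wise decay behaviour for transformer fine-tuning.
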
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import time +import numpy as np + +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F + +from paddle.distributed import fleet +from paddle.distributed.fleet import DistributedStrategy + +# from ppcls.optimizer import OptimizerBuilder +# from ppcls.optimizer.learning_rate import LearningRateBuilder + +from ppcls.arch import build_model +from ppcls.loss import build_loss +from ppcls.metric import build_metrics +from ppcls.optimizer import build_optimizer +from ppcls.optimizer import build_lr_scheduler + +from ppcls.utils.misc import AverageMeter +from ppcls.utils import logger, profiler + + +def create_feeds(image_shape, use_mix=False, class_num=None, dtype="float32"): + """ + Create feeds as model input + + Args: + image_shape(list[int]): model input shape, such as [3, 224, 224] + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + class_num(int): the class number of network, required if use_mix + + Returns: + feeds(dict): dict of model input variables + """ + feeds = OrderedDict() + feeds['data'] = paddle.static.data( + name="data", shape=[None] + image_shape, dtype=dtype) + + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." + logger.error(msg) + raise Exception(msg) + feeds['target'] = paddle.static.data( + name="target", shape=[None, class_num], dtype="float32") + else: + feeds['label'] = paddle.static.data( + name="label", shape=[None, 1], dtype="int64") + + return feeds + + +def create_fetchs(out, + feeds, + architecture, + topk=5, + epsilon=None, + class_num=None, + use_mix=False, + config=None, + mode="Train"): + """ + Create fetchs as model outputs(included loss and measures), + will call create_loss and create_metric(if use_mix). + Args: + out(variable): model output variable + feeds(dict): dict of model input variables. + If use mix_up, it will not include label. + architecture(dict): architecture information, + name(such as ResNet50) is needed + topk(int): usually top5 + epsilon(float): parameter for label smoothing, 0.0 <= epsilon <= 1.0 + class_num(int): the class number of network, required if use_mix + use_mix(bool): whether to use mix(include mixup, cutmix, fmix) + config(dict): model config + + Returns: + fetchs(dict): dict of model outputs(included loss and measures) + """ + fetchs = OrderedDict() + # build loss + if use_mix: + if class_num is None: + msg = "When use MixUp, CutMix and so on, you must set class_num." 
+ logger.error(msg) + raise Exception(msg) + target = paddle.reshape(feeds['target'], [-1, class_num]) + else: + target = paddle.reshape(feeds['label'], [-1, 1]) + + loss_func = build_loss(config["Loss"][mode]) + loss_dict = loss_func(out, target) + + loss_out = loss_dict["loss"] + fetchs['loss'] = (loss_out, AverageMeter('loss', '7.4f', need_avg=True)) + + # build metric + if not use_mix: + metric_func = build_metrics(config["Metric"][mode]) + + metric_dict = metric_func(out, target) + + for key in metric_dict: + if mode != "Train" and paddle.distributed.get_world_size() > 1: + paddle.distributed.all_reduce( + metric_dict[key], op=paddle.distributed.ReduceOp.SUM) + metric_dict[key] = metric_dict[ + key] / paddle.distributed.get_world_size() + + fetchs[key] = (metric_dict[key], AverageMeter( + key, '7.4f', need_avg=True)) + + return fetchs + + +def create_optimizer(config, step_each_epoch): + # create learning_rate instance + optimizer, lr_sch = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], step_each_epoch) + return optimizer, lr_sch + + +def create_strategy(config): + """ + Create build strategy and exec strategy. + + Args: + config(dict): config + + Returns: + build_strategy: build strategy + exec_strategy: exec strategy + """ + build_strategy = paddle.static.BuildStrategy() + + fuse_op = True if 'AMP' in config and config['AMP'].get('use_amp', + True) else False + + fuse_bn_act_ops = config.get('fuse_bn_act_ops', fuse_op) + fuse_elewise_add_act_ops = config.get('fuse_elewise_add_act_ops', fuse_op) + fuse_bn_add_act_ops = config.get('fuse_bn_add_act_ops', fuse_op) + enable_addto = config.get('enable_addto', fuse_op) + + build_strategy.fuse_bn_act_ops = fuse_bn_act_ops + build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops + build_strategy.fuse_bn_add_act_ops = fuse_bn_add_act_ops + build_strategy.enable_addto = enable_addto + + return build_strategy + + +def dist_optimizer(config, optimizer): + """ + Create a distributed optimizer based on a normal optimizer + + Args: + config(dict): + optimizer(): a normal optimizer + + Returns: + optimizer: a distributed optimizer + """ + build_strategy = create_strategy(config) + + dist_strategy = DistributedStrategy() + dist_strategy.build_strategy = build_strategy + + dist_strategy.nccl_comm_num = 1 + dist_strategy.fuse_all_reduce_ops = True + dist_strategy.fuse_grad_size_in_MB = 16 + optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) + + return optimizer + + +def mixed_precision_optimizer(config, optimizer): + if 'AMP' in config and config['AMP'].get('use_amp', True): + amp_cfg = config.AMP if config.AMP else dict() + scale_loss = amp_cfg.get('scale_loss', 1.0) + use_dynamic_loss_scaling = amp_cfg.get('use_dynamic_loss_scaling', + False) + use_pure_fp16 = amp_cfg.get("level", "O1") == "O2" + optimizer = paddle.static.amp.decorate( + optimizer, + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling, + use_pure_fp16=use_pure_fp16, + use_fp16_guard=True) + + return optimizer + + +def build(config, + main_prog, + startup_prog, + class_num=None, + step_each_epoch=100, + is_train=True, + is_distributed=True): + """ + Build a program using a model and an optimizer + 1. create feeds + 2. create a dataloader + 3. create a model + 4. create fetchs + 5. 
create an optimizer + + Args: + config(dict): config + main_prog(): main program + startup_prog(): startup program + class_num(int): the class number of network, required if use_mix + is_train(bool): train or eval + is_distributed(bool): whether to use distributed training method + + Returns: + dataloader(): a bridge between the model and the data + fetchs(dict): dict of model outputs(included loss and measures) + """ + with paddle.static.program_guard(main_prog, startup_prog): + with paddle.utils.unique_name.guard(): + mode = "Train" if is_train else "Eval" + use_mix = "batch_transform_ops" in config["DataLoader"][mode][ + "dataset"] + data_dtype = "float32" + if 'AMP' in config and config['AMP'].get( + 'use_amp', True) and config["AMP"]["level"] == 'O2': + data_dtype = "float16" + feeds = create_feeds( + config["Global"]["image_shape"], + use_mix, + class_num=class_num, + dtype=data_dtype) + + # build model + # data_format should be assigned in arch-dict + input_image_channel = config["Global"]["image_shape"][ + 0] # default as [3, 224, 224] + model = build_model(config) + out = model(feeds["data"]) + # end of build model + + fetchs = create_fetchs( + out, + feeds, + config["Arch"], + epsilon=config.get('ls_epsilon'), + class_num=class_num, + use_mix=use_mix, + config=config, + mode=mode) + lr_scheduler = None + optimizer = None + if is_train: + optimizer, lr_scheduler = build_optimizer( + config["Optimizer"], config["Global"]["epochs"], + step_each_epoch) + optimizer = mixed_precision_optimizer(config, optimizer) + if is_distributed: + optimizer = dist_optimizer(config, optimizer) + optimizer.minimize(fetchs['loss'][0]) + return fetchs, lr_scheduler, feeds, optimizer + + +def compile(config, program, loss_name=None, share_prog=None): + """ + Compile the program + + Args: + config(dict): config + program(): the program which is wrapped by + loss_name(str): loss name + share_prog(): the shared program, used for evaluation during training + + Returns: + compiled_program(): a compiled program + """ + build_strategy = create_strategy(config) + + compiled_program = paddle.static.CompiledProgram( + program, build_strategy=build_strategy) + + return compiled_program + + +total_step = 0 + + +def run(dataloader, + exe, + program, + feeds, + fetchs, + epoch=0, + mode='train', + config=None, + vdl_writer=None, + lr_scheduler=None, + profiler_options=None): + """ + Feed data to the model and fetch the measures and loss + + Args: + dataloader(paddle io dataloader): + exe(): + program(): + fetchs(dict): dict of measures and the loss + epoch(int): epoch of training or evaluation + model(str): log only + + Returns: + """ + fetch_list = [f[0] for f in fetchs.values()] + metric_dict = OrderedDict([("lr", AverageMeter( + 'lr', 'f', postfix=",", need_avg=False))]) + + for k in fetchs: + metric_dict[k] = fetchs[k][1] + + metric_dict["batch_time"] = AverageMeter( + 'batch_cost', '.5f', postfix=" s,") + metric_dict["reader_time"] = AverageMeter( + 'reader_cost', '.5f', postfix=" s,") + + for m in metric_dict.values(): + m.reset() + + use_dali = config["Global"].get('use_dali', False) + tic = time.time() + + if not use_dali: + dataloader = dataloader() + + idx = 0 + batch_size = None + while True: + # The DALI maybe raise RuntimeError for some particular images, such as ImageNet1k/n04418357_26036.JPEG + try: + batch = next(dataloader) + except StopIteration: + break + except RuntimeError: + logger.warning( + "Except RuntimeError when reading data from dataloader, try to read once again..." 
+ ) + continue + except IndexError: + logger.warning( + "Except IndexError when reading data from dataloader, try to read once again..." + ) + continue + idx += 1 + # ignore the warmup iters + if idx == 5: + metric_dict["batch_time"].reset() + metric_dict["reader_time"].reset() + + metric_dict['reader_time'].update(time.time() - tic) + + profiler.add_profiler_step(profiler_options) + + batch_size = batch[0].shape()[0] + feed_dict = { + key.name: batch[idx] + for idx, key in enumerate(feeds.values()) + } + + metrics = exe.run(program=program, + feed=feed_dict, + fetch_list=fetch_list) + + for name, m in zip(fetchs.keys(), metrics): + metric_dict[name].update(np.mean(m), batch_size) + metric_dict["batch_time"].update(time.time() - tic) + if mode == "train": + metric_dict['lr'].update(lr_scheduler.get_lr()) + + fetchs_str = ' '.join([ + str(metric_dict[key].mean) + if "time" in key else str(metric_dict[key].value) + for key in metric_dict + ]) + ips_info = " ips: {:.5f} samples/sec.".format( + batch_size / metric_dict["batch_time"].avg) + fetchs_str += ips_info + + if lr_scheduler is not None: + lr_scheduler.step() + + if vdl_writer: + global total_step + logger.scaler('loss', metrics[0][0], total_step, vdl_writer) + total_step += 1 + if mode == 'eval': + if idx % config.get('print_interval', 10) == 0: + logger.info("{:s} step:{:<4d} {:s}".format(mode, idx, + fetchs_str)) + else: + epoch_str = "epoch:{:<3d}".format(epoch) + step_str = "{:s} step:{:<4d}".format(mode, idx) + max_mem_reserved_str = f"max_mem_reserved: {paddle.device.cuda.max_memory_reserved()}" + max_mem_allocated_str = f"max_mem_allocated: {paddle.device.cuda.max_memory_allocated()}" + if idx % config.get('print_interval', 10) == 0: + logger.info( + f"{epoch_str} {step_str} {fetchs_str} {max_mem_reserved_str} {max_mem_allocated_str}" + ) + + tic = time.time() + + end_str = ' '.join([str(m.mean) for m in metric_dict.values()] + + [metric_dict["batch_time"].total]) + ips_info = "ips: {:.5f} samples/sec.".format(batch_size / + metric_dict["batch_time"].avg) + if mode == 'eval': + logger.info("END {:s} {:s} {:s}".format(mode, end_str, ips_info)) + else: + end_epoch_str = "END epoch:{:<3d}".format(epoch) + logger.info("{:s} {:s} {:s}".format(end_epoch_str, mode, end_str)) + if use_dali: + dataloader.reset() + + # return top1_acc in order to save the best model + if mode == 'eval': + return fetchs["top1"][1].avg diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/run_dali.sh b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/run_dali.sh new file mode 100644 index 000000000..5bf0ef4ca --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/run_dali.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +export CUDA_VISIBLE_DEVICES="0,1,2,3" + +python3.7 -m paddle.distributed.launch \ + --gpus="0,1,2,3" \ + ppcls/static/train.py \ + -c ./ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/save_load.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/save_load.py new file mode 100644 index 000000000..5d124fcf7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/save_load.py @@ -0,0 +1,139 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import re +import shutil +import tempfile + +import paddle + +from ppcls.utils import logger + +__all__ = ['init_model', 'save_model'] + + +def _mkdir_if_not_exist(path): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def _load_state(path): + if os.path.exists(path + '.pdopt'): + # XXX another hack to ignore the optimizer state + tmp = tempfile.mkdtemp() + dst = os.path.join(tmp, os.path.basename(os.path.normpath(path))) + shutil.copy(path + '.pdparams', dst + '.pdparams') + state = paddle.static.load_program_state(dst) + shutil.rmtree(tmp) + else: + state = paddle.static.load_program_state(path) + return state + + +def load_params(exe, prog, path, ignore_params=None): + """ + Load model from the given path. + Args: + exe (paddle.static.Executor): The paddle.static.Executor object. + prog (paddle.static.Program): load weight to which Program object. + path (string): URL string or loca model path. + ignore_params (list): ignore variable to load when finetuning. + It can be specified by finetune_exclude_pretrained_params + and the usage can refer to the document + docs/advanced_tutorials/TRANSFER_LEARNING.md + """ + if not (os.path.isdir(path) or os.path.exists(path + '.pdparams')): + raise ValueError("Model pretrain path {} does not " + "exists.".format(path)) + + logger.info("Loading parameters from {}...".format(path)) + + ignore_set = set() + state = _load_state(path) + + # ignore the parameter which mismatch the shape + # between the model and pretrain weight. 
+ all_var_shape = {} + for block in prog.blocks: + for param in block.all_parameters(): + all_var_shape[param.name] = param.shape + ignore_set.update([ + name for name, shape in all_var_shape.items() + if name in state and shape != state[name].shape + ]) + + if ignore_params: + all_var_names = [var.name for var in prog.list_vars()] + ignore_list = filter( + lambda var: any([re.match(name, var) for name in ignore_params]), + all_var_names) + ignore_set.update(list(ignore_list)) + + if len(ignore_set) > 0: + for k in ignore_set: + if k in state: + logger.warning( + 'variable {} is already excluded automatically'.format(k)) + del state[k] + + paddle.static.set_program_state(prog, state) + + +def init_model(config, program, exe): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints: + paddle.static.load(program, checkpoints, exe) + logger.info("Finish initing model from {}".format(checkpoints)) + return + + pretrained_model = config.get('pretrained_model') + if pretrained_model: + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + for pretrain in pretrained_model: + load_params(exe, program, pretrain) + logger.info("Finish initing model from {}".format(pretrained_model)) + + +def save_model(program, model_path, epoch_id, prefix='ppcls'): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + model_path = os.path.join(model_path, str(epoch_id)) + _mkdir_if_not_exist(model_path) + model_prefix = os.path.join(model_path, prefix) + paddle.static.save(program, model_prefix) + logger.info("Already save model in {}".format(model_path)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/train.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/train.py new file mode 100644 index 000000000..4f85fe16f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/static/train.py @@ -0,0 +1,227 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
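As a quick, non-authoritative sketch of how the save_load helpers above are typically driven (the output directory, config keys and epoch value below are illustrative, not taken from this patch):

# Illustrative usage of init_model()/save_model() from save_load.py above.
# Assumes the file is importable as ppcls.static.save_load, as the train.py that
# follows does; all paths and config values here are placeholders.
import paddle
from ppcls.static.save_load import init_model, save_model

paddle.enable_static()
startup_prog = paddle.static.Program()
train_prog = paddle.static.Program()
exe = paddle.static.Executor(paddle.set_device("cpu"))
exe.run(startup_prog)

# Either resume from a checkpoint or warm-start from pretrained weights.
global_config = {"pretrained_model": "./pretrained/ResNet50_pretrained"}
init_model(global_config, train_prog, exe)

# Persist epoch 0 under <output_dir>/<epoch_id>/ppcls.*
save_model(train_prog, "./output/ResNet50", 0)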
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../'))) + +import numpy as np +import random + +import paddle +from paddle.distributed import fleet +from visualdl import LogWriter + +from ppcls.data import build_dataloader +from ppcls.utils.config import get_config, print_config +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from ppcls.static.save_load import init_model, save_model +from ppcls.static import program + + +def parse_args(): + parser = argparse.ArgumentParser("PaddleClas train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/ResNet/ResNet50.yaml', + help='config file path') + parser.add_argument( + '-p', + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + args = parser.parse_args() + return args + + +def main(args): + """ + all the config of training paradigm should be in config["Global"] + """ + config = get_config(args.config, overrides=args.override, show=False) + + # set seed + seed = config["Global"].get("seed", False) + if seed or seed == 0: + assert isinstance(seed, int), "The 'seed' must be a integer!" + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + + global_config = config["Global"] + + mode = "train" + + log_file = os.path.join(global_config['output_dir'], + config["Arch"]["name"], f"{mode}.log") + log_ranks = config["Global"].get("log_ranks", "0") + init_logger(log_file=log_file, log_ranks=log_ranks) + print_config(config) + + if global_config.get("is_distributed", True): + fleet.init(is_collective=True) + + # assign the device + assert global_config["device"] in [ + "cpu", "gpu", "xpu", "npu", "mlu", "ascend", "intel_gpu", "mps" + ] + device = paddle.set_device(global_config["device"]) + + # amp related config + amp_config = config.get("AMP", None) + use_amp = True if amp_config and amp_config.get("use_amp", True) else False + if use_amp: + AMP_RELATED_FLAGS_SETTING = { + 'FLAGS_cudnn_exhaustive_search': 1, + 'FLAGS_conv_workspace_size_limit': 1500, + 'FLAGS_cudnn_batchnorm_spatial_persistent': 1, + 'FLAGS_max_inplace_grad_add': 8, + } + os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1' + paddle.set_flags(AMP_RELATED_FLAGS_SETTING) + + # visualDL + vdl_writer = None + if global_config["use_visualdl"]: + vdl_dir = global_config["output_dir"] + vdl_writer = LogWriter(vdl_dir) + + # build dataloader + eval_dataloader = None + use_dali = global_config.get('use_dali', False) + + class_num = config["Arch"].get("class_num", None) + config["DataLoader"].update({"class_num": class_num}) + train_dataloader = build_dataloader( + config["DataLoader"], "Train", device=device, use_dali=use_dali) + if global_config["eval_during_train"]: + eval_dataloader = build_dataloader( + config["DataLoader"], "Eval", device=device, use_dali=use_dali) + + step_each_epoch = len(train_dataloader) + + # startup_prog is used to do some parameter init work, + # and train prog is used to hold the network + startup_prog = paddle.static.Program() + train_prog = paddle.static.Program() + + best_top1_acc = 0.0 # best top1 acc record + + 
train_fetchs, lr_scheduler, train_feeds, optimizer = program.build( + config, + train_prog, + startup_prog, + class_num, + step_each_epoch=step_each_epoch, + is_train=True, + is_distributed=global_config.get("is_distributed", True)) + + if global_config["eval_during_train"]: + eval_prog = paddle.static.Program() + eval_fetchs, _, eval_feeds, _ = program.build( + config, + eval_prog, + startup_prog, + is_train=False, + is_distributed=global_config.get("is_distributed", True)) + # clone to prune some content which is irrelevant in eval_prog + eval_prog = eval_prog.clone(for_test=True) + + # create the "Executor" with the statement of which device + exe = paddle.static.Executor(device) + # Parameter initialization + exe.run(startup_prog) + # load pretrained models or checkpoints + init_model(global_config, train_prog, exe) + + if use_amp: + # for AMP O2 + if config["AMP"].get("level", "O1").upper() == "O2": + use_fp16_test = True + msg = "Only support FP16 evaluation when AMP O2 is enabled." + logger.warning(msg) + # for AMP O1 + else: + use_fp16_test = config["AMP"].get("use_fp16_test", False) + + optimizer.amp_init( + device, + scope=paddle.static.global_scope(), + test_program=eval_prog + if global_config["eval_during_train"] else None, + use_fp16_test=use_fp16_test) + + if not global_config.get("is_distributed", True): + compiled_train_prog = program.compile( + config, train_prog, loss_name=train_fetchs["loss"][0].name) + else: + compiled_train_prog = train_prog + + if eval_dataloader is not None: + if not global_config.get("is_distributed", True): + compiled_eval_prog = program.compile(config, eval_prog) + else: + compiled_eval_prog = eval_prog + + for epoch_id in range(global_config["epochs"]): + # 1. train with train dataset + program.run(train_dataloader, exe, compiled_train_prog, train_feeds, + train_fetchs, epoch_id, 'train', config, vdl_writer, + lr_scheduler, args.profiler_options) + # 2. evaluate with eval dataset + if global_config["eval_during_train"] and epoch_id % global_config[ + "eval_interval"] == 0: + top1_acc = program.run(eval_dataloader, exe, compiled_eval_prog, + eval_feeds, eval_fetchs, epoch_id, "eval", + config) + if top1_acc > best_top1_acc: + best_top1_acc = top1_acc + message = "The best top1 acc {:.5f}, in epoch: {:d}".format( + best_top1_acc, epoch_id) + logger.info(message) + if epoch_id % global_config["save_interval"] == 0: + + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, "best_model") + + # 3. 
save the persistable model + if epoch_id % global_config["save_interval"] == 0: + model_path = os.path.join(global_config["output_dir"], + config["Arch"]["name"]) + save_model(train_prog, model_path, epoch_id) + + +if __name__ == '__main__': + paddle.enable_static() + args = parse_args() + main(args) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/COCO2017_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/COCO2017_label_list.txt new file mode 100644 index 000000000..51dba82b4 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/COCO2017_label_list.txt @@ -0,0 +1,80 @@ +0 person +1 bicycle +2 car +3 motorcycle +4 airplane +5 bus +6 train +7 truck +8 boat +9 traffic light +10 fire hydrant +11 stop sign +12 parking meter +13 bench +14 bird +15 cat +16 dog +17 horse +18 sheep +19 cow +20 elephant +21 bear +22 zebra +23 giraffe +24 backpack +25 umbrella +26 handbag +27 tie +28 suitcase +29 frisbee +30 skis +31 snowboard +32 sports ball +33 kite +34 baseball bat +35 baseball glove +36 skateboard +37 surfboard +38 tennis racket +39 bottle +40 wine glass +41 cup +42 fork +43 knife +44 spoon +45 bowl +46 banana +47 apple +48 sandwich +49 orange +50 broccoli +51 carrot +52 hot dog +53 pizza +54 donut +55 cake +56 chair +57 couch +58 potted plant +59 bed +60 dining table +61 toilet +62 tv +63 laptop +64 mouse +65 remote +66 keyboard +67 cell phone +68 microwave +69 oven +70 toaster +71 sink +72 refrigerator +73 book +74 clock +75 vase +76 scissors +77 teddy bear +78 hair drier +79 toothbrush \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/NUS-WIDE-SCENE_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/NUS-WIDE-SCENE_label_list.txt new file mode 100644 index 000000000..f4ee66ba7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/NUS-WIDE-SCENE_label_list.txt @@ -0,0 +1,33 @@ +0 airport +1 beach +2 bridge +3 buildings +4 castle +5 cityscape +6 clouds +7 frost +8 garden +9 glacier +10 grass +11 harbor +12 house +13 lake +14 moon +15 mountain +16 nighttime +17 ocean +18 plants +19 railroad +20 rainbow +21 reflection +22 road +23 sky +24 snow +25 street +26 sunset +27 temple +28 town +29 valley +30 water +31 waterfall +32 window diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/image_orientation_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/image_orientation_label_list.txt new file mode 100644 index 000000000..f6c8ef322 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/image_orientation_label_list.txt @@ -0,0 +1,4 @@ +0 0° +1 90° +2 180° +3 270° \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/language_classification_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/language_classification_label_list.txt new file mode 100644 index 000000000..8d9ee9dd8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/language_classification_label_list.txt @@ -0,0 +1,10 @@ +0 arabic +1 chinese_cht +2 cyrillic +3 devanagari +4 japan +5 ka +6 korean +7 ta +8 te +9 latin \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/text_image_orientation_label_list.txt 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/text_image_orientation_label_list.txt new file mode 100644 index 000000000..051944a92 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/text_image_orientation_label_list.txt @@ -0,0 +1,4 @@ +0 0 +1 90 +2 180 +3 270 diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/textline_orientation_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/textline_orientation_label_list.txt new file mode 100644 index 000000000..207b70c6b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/textline_orientation_label_list.txt @@ -0,0 +1,2 @@ +0 0_degree +1 180_degree diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/traffic_sign_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/traffic_sign_label_list.txt new file mode 100644 index 000000000..c1e41d539 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/PULC_label_list/traffic_sign_label_list.txt @@ -0,0 +1,232 @@ +0 pl80 +1 w9 +2 p6 +3 ph4.2 +4 i8 +5 w14 +6 w33 +7 pa13 +8 im +9 w58 +10 pl90 +11 il70 +12 p5 +13 pm55 +14 pl60 +15 ip +16 p11 +17 pdd +18 wc +19 i2r +20 w30 +21 pmr +22 p23 +23 pl15 +24 pm10 +25 pss +26 w1 +27 p4 +28 w38 +29 w50 +30 w34 +31 pw3.5 +32 iz +33 w39 +34 w11 +35 p1n +36 pr70 +37 pd +38 pnl +39 pg +40 ph5.3 +41 w66 +42 il80 +43 pb +44 pbm +45 pm5 +46 w24 +47 w67 +48 w49 +49 pm40 +50 ph4 +51 w45 +52 i4 +53 w37 +54 ph2.6 +55 pl70 +56 ph5.5 +57 i14 +58 i11 +59 p7 +60 p29 +61 pne +62 pr60 +63 pm13 +64 ph4.5 +65 p12 +66 p3 +67 w40 +68 pl5 +69 w13 +70 pr10 +71 p14 +72 i4l +73 pr30 +74 pw4.2 +75 w16 +76 p17 +77 ph3 +78 i9 +79 w15 +80 w35 +81 pa8 +82 pt +83 pr45 +84 w17 +85 pl30 +86 pcs +87 pctl +88 pr50 +89 ph4.4 +90 pm46 +91 pm35 +92 i15 +93 pa12 +94 pclr +95 i1 +96 pcd +97 pbp +98 pcr +99 w28 +100 ps +101 pm8 +102 w18 +103 w2 +104 w52 +105 ph2.9 +106 ph1.8 +107 pe +108 p20 +109 w36 +110 p10 +111 pn +112 pa14 +113 w54 +114 ph3.2 +115 p2 +116 ph2.5 +117 w62 +118 w55 +119 pw3 +120 pw4.5 +121 i12 +122 ph4.3 +123 phclr +124 i10 +125 pr5 +126 i13 +127 w10 +128 p26 +129 w26 +130 p8 +131 w5 +132 w42 +133 il50 +134 p13 +135 pr40 +136 p25 +137 w41 +138 pl20 +139 ph4.8 +140 pnlc +141 ph3.3 +142 w29 +143 ph2.1 +144 w53 +145 pm30 +146 p24 +147 p21 +148 pl40 +149 w27 +150 pmb +151 pc +152 i6 +153 pr20 +154 p18 +155 ph3.8 +156 pm50 +157 pm25 +158 i2 +159 w22 +160 w47 +161 w56 +162 pl120 +163 ph2.8 +164 i7 +165 w12 +166 pm1.5 +167 pm2.5 +168 w32 +169 pm15 +170 ph5 +171 w19 +172 pw3.2 +173 pw2.5 +174 pl10 +175 il60 +176 w57 +177 w48 +178 w60 +179 pl100 +180 pr80 +181 p16 +182 pl110 +183 w59 +184 w64 +185 w20 +186 ph2 +187 p9 +188 il100 +189 w31 +190 w65 +191 ph2.4 +192 pr100 +193 p19 +194 ph3.5 +195 pa10 +196 pcl +197 pl35 +198 p15 +199 w7 +200 pa6 +201 phcs +202 w43 +203 p28 +204 w6 +205 w3 +206 w25 +207 pl25 +208 il110 +209 p1 +210 w46 +211 pn-2 +212 w51 +213 w44 +214 w63 +215 w23 +216 pm20 +217 w8 +218 pmblr +219 w4 +220 i5 +221 il90 +222 w21 +223 p27 +224 pl50 +225 pl65 +226 w61 +227 ph2.2 +228 pm2 +229 i3 +230 pa18 +231 pw4 diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/__init__.py new file mode 100644 index 000000000..294fbb125 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/__init__.py @@ -0,0 +1,29 @@ +# Copyright 
(c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import logger +from . import metrics +from . import misc +from . import model_zoo + +from .config import get_config, convert_to_dict +from .dist_utils import all_gather +from .metrics import accuracy_score +from .metrics import hamming_distance +from .metrics import mean_average_precision +from .metrics import multi_hot_encode +from .metrics import precision_recall_fscore +from .misc import AverageMeter +from .save_load import init_model, save_model +from .save_result import save_predict_result diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/amp.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/amp.py new file mode 100644 index 000000000..7ada7a404 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/amp.py @@ -0,0 +1,61 @@ +from functools import partial +import contextlib +import paddle + + +class AutoCast: + def __init__(self, + use_amp=False, + amp_level="O1", + use_promote=False, + amp_eval=False): + self.use_amp = use_amp + self.amp_eval = amp_eval + + if self.use_amp: + # compatible with paddle 2.5 and older version + paddle_version = paddle.__version__[:3] + # paddle version >= 2.5.0 or develop + if paddle_version in ["2.5", "0.0"]: + self.cast_context = partial( + paddle.amp.auto_cast, + level=amp_level, + use_promote=use_promote) + # paddle version <= 2.4.x and not develop + else: + self.cast_context = partial( + paddle.amp.auto_cast, level=amp_level) + + def __call__(self, is_eval=False): + if self.use_amp: + # not is_eval: cast for all training + # is_eval and self.amp_eval: cast for evaluation only when amp_eval is True + if not is_eval or (is_eval and self.amp_eval): + return self.cast_context() + + return contextlib.nullcontext() + + +def build_scaler(use_amp=False, scale_loss=1.0, + use_dynamic_loss_scaling=False): + class Foo: + def __init__(self): + pass + + def scale(self, loss): + return loss + + def step(self, optimizer): + optimizer.step() + + def update(self): + return + + def minimize(self, optimizer, loss): + optimizer.step() + + if use_amp: + return paddle.amp.GradScaler( + init_loss_scaling=scale_loss, + use_dynamic_loss_scaling=use_dynamic_loss_scaling) + return Foo() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/check.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/check.py new file mode 100644 index 000000000..8b743335e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/check.py @@ -0,0 +1,149 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import paddle +from paddle import is_compiled_with_cuda + +from ..arch.utils import get_architectures, similar_architectures, get_blacklist_model_in_static_mode +from . import logger + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.8.0 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." + try: + pass + # paddle.utils.require_version('0.0.0') + except Exception: + logger.error(err) + sys.exit(1) + + +def check_gpu(): + """ + Log error and exit when using paddlepaddle cpu version. + """ + err = "You are using paddlepaddle cpu version! Please try to " \ + "install paddlepaddle-gpu to run model on GPU." + + try: + assert is_compiled_with_cuda() + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_architecture(architecture): + """ + check architecture and recommend similar architectures + """ + assert isinstance(architecture, dict), \ + ("the type of architecture({}) should be dict". format(architecture)) + assert "name" in architecture, \ + ("name must be in the architecture keys, just contains: {}". format( + architecture.keys())) + + similar_names = similar_architectures(architecture["name"], + get_architectures()) + model_list = ', '.join(similar_names) + err = "Architecture [{}] is not exist! Maybe you want: [{}]" \ + "".format(architecture["name"], model_list) + try: + assert architecture["name"] in similar_names + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_model_with_running_mode(architecture): + """ + check whether the model is consistent with the operating mode + """ + # some model are not supported in the static mode + blacklist = get_blacklist_model_in_static_mode() + if not paddle.in_dynamic_mode() and architecture["name"] in blacklist: + logger.error("Model: {} is not supported in the staic mode.".format( + architecture["name"])) + sys.exit(1) + return + + +def check_mix(architecture, use_mix=False): + """ + check mix parameter + """ + err = "Cannot use mix processing in GoogLeNet, " \ + "please set use_mix = False." 
+ try: + if architecture["name"] == "GoogLeNet": + assert use_mix is not True + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_classes_num(classes_num): + """ + check classes_num + """ + err = "classes_num({}) should be a positive integer" \ + "and larger than 1".format(classes_num) + try: + assert isinstance(classes_num, int) + assert classes_num > 1 + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_data_dir(path): + """ + check cata_dir + """ + err = "Data path is not exist, please given a right path" \ + "".format(path) + try: + assert os.isdir(path) + except AssertionError: + logger.error(err) + sys.exit(1) + + +def check_function_params(config, key): + """ + check specify config + """ + k_config = config.get(key) + assert k_config is not None, \ + ('{} is required in config'.format(key)) + + assert k_config.get('function'), \ + ('function is required {} config'.format(key)) + params = k_config.get('params') + assert params is not None, \ + ('params is required in {} config'.format(key)) + assert isinstance(params, dict), \ + ('the params in {} config should be a dict'.format(key)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/config.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/config.py new file mode 100644 index 000000000..35fe7d428 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/config.py @@ -0,0 +1,326 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import argparse +import yaml +from . import logger +from . import check +from collections import OrderedDict + +__all__ = ['get_config', 'convert_to_dict'] + + +def convert_to_dict(obj): + if isinstance(obj, dict): + return {k: convert_to_dict(v) for k, v in obj.items()} + elif isinstance(obj, list): + return [convert_to_dict(i) for i in obj] + else: + return obj + + +class AttrDict(dict): + def __getattr__(self, key): + return self[key] + + def __setattr__(self, key, value): + if key in self.__dict__: + self.__dict__[key] = value + else: + self[key] = value + + def __deepcopy__(self, content): + return AttrDict(copy.deepcopy(dict(self))) + + +def create_attr_dict(yaml_config): + from ast import literal_eval + for key, value in yaml_config.items(): + if type(value) is dict: + yaml_config[key] = value = AttrDict(value) + if isinstance(value, str): + try: + value = literal_eval(value) + except BaseException: + pass + if isinstance(value, AttrDict): + create_attr_dict(yaml_config[key]) + else: + yaml_config[key] = value + + +def parse_config(cfg_file): + """Load a config file into AttrDict""" + with open(cfg_file, 'r') as fopen: + yaml_config = AttrDict(yaml.load(fopen, Loader=yaml.SafeLoader)) + create_attr_dict(yaml_config) + return yaml_config + + +def print_dict(d, delimiter=0): + """ + Recursively visualize a dict and + indenting acrrording by the relationship of keys. 
+ """ + placeholder = "-" * 60 + for k, v in d.items(): + if isinstance(v, dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + print_dict(v, delimiter + 4) + elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict): + logger.info("{}{} : ".format(delimiter * " ", k)) + for value in v: + print_dict(value, delimiter + 4) + else: + logger.info("{}{} : {}".format(delimiter * " ", k, v)) + + if k[0].isupper() and delimiter == 0: + logger.info(placeholder) + + +def print_config(config): + """ + visualize configs + Arguments: + config: configs + """ + logger.advertise() + print_dict(config) + + +def check_config(config): + """ + Check config + """ + check.check_version() + use_gpu = config.get('use_gpu', True) + if use_gpu: + check.check_gpu() + architecture = config.get('ARCHITECTURE') + #check.check_architecture(architecture) + use_mix = config.get('use_mix', False) + check.check_mix(architecture, use_mix) + classes_num = config.get('classes_num') + check.check_classes_num(classes_num) + mode = config.get('mode', 'train') + if mode.lower() == 'train': + check.check_function_params(config, 'LEARNING_RATE') + check.check_function_params(config, 'OPTIMIZER') + + +def override(dl, ks, v): + """ + Recursively replace dict of list + Args: + dl(dict or list): dict or list to be replaced + ks(list): list of keys + v(str): value to be replaced + """ + + def str2num(v): + try: + return eval(v) + except Exception: + return v + + assert isinstance(dl, (list, dict)), ("{} should be a list or a dict") + assert len(ks) > 0, ('lenght of keys should larger than 0') + if isinstance(dl, list): + k = str2num(ks[0]) + if len(ks) == 1: + assert k < len(dl), ('index({}) out of range({})'.format(k, dl)) + dl[k] = str2num(v) + else: + override(dl[k], ks[1:], v) + else: + if len(ks) == 1: + # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl)) + if not ks[0] in dl: + print('A new field ({}) detected!'.format(ks[0], dl)) + dl[ks[0]] = str2num(v) + else: + if ks[0] not in dl.keys(): + dl[ks[0]] = {} + print("A new Series field ({}) detected!".format(ks[0], dl)) + override(dl[ks[0]], ks[1:], v) + + +def override_config(config, options=None): + """ + Recursively override the config + Args: + config(dict): dict to be replaced + options(list): list of pairs(key0.key1.idx.key2=value) + such as: [ + 'topk=2', + 'VALID.transforms.1.ResizeImage.resize_short=300' + ] + Returns: + config(dict): replaced config + """ + if options is not None: + for opt in options: + assert isinstance(opt, str), ( + "option({}) should be a str".format(opt)) + assert "=" in opt, ( + "option({}) should contain a =" + "to distinguish between key and value".format(opt)) + pair = opt.split('=') + assert len(pair) == 2, ("there can be only a = in the option") + key, value = pair + keys = key.split('.') + override(config, keys, value) + return config + + +def get_config(fname, overrides=None, show=False): + """ + Read config from file + """ + assert os.path.exists(fname), ('config file({}) is not exist'.format(fname)) + config = parse_config(fname) + override_config(config, overrides) + if show: + print_config(config) + # check_config(config) + return config + + +def parse_args(): + parser = argparse.ArgumentParser("generic-image-rec train script") + parser.add_argument( + '-c', + '--config', + type=str, + default='configs/config.yaml', + help='config file path') + parser.add_argument( + '-o', + '--override', + action='append', + default=[], + help='config options to be overridden') + parser.add_argument( + '-p', + 
'--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + args = parser.parse_args() + return args + + +def represent_dictionary_order(self, dict_data): + return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) + + +def setup_orderdict(): + yaml.add_representer(OrderedDict, represent_dictionary_order) + + +def dump_infer_config(inference_config, path): + setup_orderdict() + infer_cfg = OrderedDict() + config = copy.deepcopy(inference_config) + if config["Global"].get("pdx_model_name", None): + infer_cfg["Global"] = {"model_name": config["Global"]["pdx_model_name"]} + if config.get("Infer"): + transforms = config["Infer"]["transforms"] + elif config["DataLoader"]["Eval"].get("Query"): + transforms = config["DataLoader"]["Eval"]["Query"]["dataset"][ + "transform_ops"] + transforms.append({"ToCHWImage": None}) + elif config["DataLoader"]["Eval"].get("dataset"): + transforms = config["DataLoader"]["Eval"]["dataset"]["transform_ops"] + transforms.append({"ToCHWImage": None}) + else: + logger.error("This config does not support dump transform config!") + transform = next((item for item in transforms if 'CropImage' in item), None) + if transform: + dynamic_shapes = transform["CropImage"]["size"] + else: + transform = next((item for item in transforms + if 'ResizeImage' in item), None) + if transform: + if isinstance(transform["ResizeImage"]["size"], list): + dynamic_shapes = transform["ResizeImage"]["size"][0] + elif isinstance(transform["ResizeImage"]["size"], int): + dynamic_shapes = transform["ResizeImage"]["size"] + else: + raise ValueError( + "ResizeImage size must be either a list or an int.") + else: + raise ValueError("No valid transform found.") + # Configuration required config for high-performance inference. 
+ if config["Global"].get("hpi_config_path", None): + hpi_config = convert_to_dict( + parse_config(config["Global"]["hpi_config_path"])) + if hpi_config["Hpi"]["backend_config"].get("paddle_tensorrt", None): + hpi_config["Hpi"]["backend_config"]["paddle_tensorrt"][ + "dynamic_shapes"]["x"] = [[ + 1, 3, dynamic_shapes, dynamic_shapes + ] for i in range(3)] + hpi_config["Hpi"]["backend_config"]["paddle_tensorrt"][ + "max_batch_size"] = 1 + if hpi_config["Hpi"]["backend_config"].get("tensorrt", None): + hpi_config["Hpi"]["backend_config"]["tensorrt"]["dynamic_shapes"][ + "x"] = [[1, 3, dynamic_shapes, dynamic_shapes] + for i in range(3)] + hpi_config["Hpi"]["backend_config"]["tensorrt"][ + "max_batch_size"] = 1 + infer_cfg["Hpi"] = hpi_config["Hpi"] + for transform in transforms: + if "NormalizeImage" in transform: + transform["NormalizeImage"]["channel_num"] = 3 + scale_str = transform["NormalizeImage"]["scale"] + numerator, denominator = scale_str.split('/') + numerator, denominator = float(numerator), float(denominator) + transform["NormalizeImage"]["scale"] = float(numerator / + denominator) + infer_cfg["PreProcess"] = { + "transform_ops": [ + infer_preprocess for infer_preprocess in transforms + if "DecodeImage" not in infer_preprocess + ] + } + if config.get("Infer"): + postprocess_dict = config["Infer"]["PostProcess"] + + with open(postprocess_dict["class_id_map_file"], 'r') as f: + label_id_maps = f.readlines() + label_names = [] + for line in label_id_maps: + line = line.strip().split(' ', 1) + label_names.append(line[1:][0]) + + postprocess_name = postprocess_dict.get("name", None) + postprocess_dict.pop("class_id_map_file") + postprocess_dict.pop("name") + dic = OrderedDict() + for item in postprocess_dict.items(): + dic[item[0]] = item[1] + dic['label_list'] = label_names + + if postprocess_name: + infer_cfg["PostProcess"] = {postprocess_name: dic} + else: + raise ValueError("PostProcess name is not specified") + else: + infer_cfg["PostProcess"] = {"NormalizeFeatures": None} + with open(path, 'w') as f: + yaml.dump(infer_cfg, f) + logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_cls_trainval_lists.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_cls_trainval_lists.py new file mode 100644 index 000000000..11aa32f10 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_cls_trainval_lists.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import random +import string + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_path', type=str, default='./data') + parser.add_argument('--save_img_list_path', type=str, default='train.txt') + parser.add_argument( + '--train', action='store_true', help='Create train list.') + parser.add_argument('--val', action='store_true', help='Create val list.') + + args = parser.parse_args() + return args + + +def parse_class_id_map(class_id_map_file): + class_id_map = {} + with open(class_id_map_file, "r") as f: + lines = f.readlines() + for line in lines: + partition = line.split("\n")[0].partition(" ") + class_id_map[str(partition[-1])] = int(partition[0]) + return class_id_map + + +def main(args): + img_list = [] + label_list = [] + img_end = ['jpg', 'JPG', 'png', 'PNG', 'jpeg', 'JPEG', 'bmp'] + if args.dataset_path[-1] == "/": + args.dataset_path = args.dataset_path[:-1] + + if not os.path.exists(args.dataset_path): + raise Exception(f"The data path {args.dataset_path} not exists.") + else: + label_name_list = [ + label for label in os.listdir(args.dataset_path) + if os.path.isdir(os.path.join(args.dataset_path, label)) + ] + + if not os.path.exists( + os.path.join(os.path.dirname(args.dataset_path), + 'label.txt')) and args.val: + raise Exception( + 'The label file is not exist. Please set "--train" first.') + + for index, label_name in enumerate(label_name_list): + for root, dirs, files in os.walk( + os.path.join(args.dataset_path, label_name)): + for single_file in files: + if single_file.split('.')[-1] in img_end: + img_path = os.path.relpath( + os.path.join(root, single_file), + os.path.dirname(args.dataset_path)) + if args.val: + class_id_map = parse_class_id_map( + os.path.join( + os.path.dirname(args.dataset_path), + 'label.txt')) + img_list.append( + f'{img_path} {class_id_map[label_name]}') + else: + img_list.append(f'{img_path} {index}') + else: + print( + f'WARNING: File {os.path.join(root, single_file)} end with {single_file.split(".")[-1]} is not supported.' + ) + label_list.append(f'{index} {label_name}') + + if len(img_list) == 0: + raise Exception(f"Not found any images file in {args.dataset_path}.") + + with open( + os.path.join( + os.path.dirname(args.dataset_path), args.save_img_list_path), + 'w') as f: + f.write('\n'.join(img_list)) + print( + f'Already save {args.save_img_list_path} in {os.path.join(os.path.dirname(args.dataset_path), args.save_img_list_path)}.' + ) + + if not args.val: + with open( + os.path.join(os.path.dirname(args.dataset_path), 'label.txt'), + 'w') as f: + f.write('\n'.join(label_list)) + print( + f'Already save label.txt in {os.path.join(os.path.dirname(args.dataset_path), "label.txt")}.' + ) + + +if __name__ == '__main__': + args = parse_args() + main(args) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_coco_multilabel_lists.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_coco_multilabel_lists.py new file mode 100644 index 000000000..3082b4a6c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/create_coco_multilabel_lists.py @@ -0,0 +1,126 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +from ppcls.utils import logger +from ppcls.utils.logger import init_logger +from pycocotools.coco import COCO +from tqdm import tqdm + +init_logger() + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + '--dataset_dir', + required=True, + help='root directory for dataset') + parser.add_argument( + '--image_dir', + required=True, + help='directory for images') + parser.add_argument( + '--anno_path', + required=True, + help='coco annotation file path') + parser.add_argument( + '--save_name', + default=None, + help='will same as anno_path if got None') + parser.add_argument( + '--output_dir', + default=None, + help='output directory, and will same as ' + 'dataset_dir if got None') + parser.add_argument( + '--save_label_name', + action='store_true', + help='save label name file') + + args = parser.parse_args() + if args.output_dir is None: + args.output_dir = args.dataset_dir + else: + os.makedirs(args.dataset_dir, exist_ok=True) + if args.save_name is None: + args.save_name = os.path.splitext(os.path.basename(args.anno_path))[0] + + image_dir = os.path.join(args.dataset_dir, args.image_dir) + anno_path = os.path.join(args.dataset_dir, args.anno_path) + assert os.path.exists(image_dir) and os.path.exists(anno_path), \ + ValueError("The dataset is not Found or " + "the folder structure is non-conformance.") + coco = COCO(anno_path) + cat_id_map = { + old_cat_id: new_cat_id + for new_cat_id, old_cat_id in enumerate(coco.getCatIds()) + } + num_classes = len(list(cat_id_map.keys())) + + assert 'annotations' in coco.dataset, \ + 'Annotation file: {} does not contains ground truth!!!'.format(anno_path) + + save_path = os.path.join(args.dataset_dir, args.save_name + '.txt') + logger.info("Start converting {}:".format(anno_path)) + with open(save_path, 'w') as fp: + lines = [] + for img_id in tqdm(sorted(coco.getImgIds())): + img_info = coco.loadImgs([img_id])[0] + img_filename = img_info['file_name'] + img_w = img_info['width'] + img_h = img_info['height'] + + img_filepath = os.path.join(image_dir, img_filename) + if not os.path.exists(img_filepath): + logger.warning('Illegal image file: {}, ' + 'and it will be ignored'.format(img_filepath)) + continue + + if img_w < 0 or img_h < 0: + logger.warning( + 'Illegal width: {} or height: {} in annotation, ' + 'and im_id: {} will be ignored'.format(img_w, img_h, img_id)) + continue + + ins_anno_ids = coco.getAnnIds(imgIds=[img_id]) + instances = coco.loadAnns(ins_anno_ids) + + label = [0] * num_classes + for instance in instances: + label[cat_id_map[instance['category_id']]] = 1 + lines.append(img_filename + '\t' + ','.join(map(str, label))) + + fp.write('\n'.join(lines)) + fp.close() + logger.info("Conversion completed, save to {}:".format(save_path)) + + if args.save_label_name: + label_txt_save_name = os.path.basename( + os.path.abspath(args.dataset_dir)) + '_labels.txt' + label_txt_save_path = os.path.join(args.dataset_dir, label_txt_save_name) + with open(label_txt_save_path, 'w') as fp: + label_name_list = [] + for cat in 
coco.cats.values(): + label_name_list.append(cat['name']) + fp.write('\n'.join(label_name_list)) + fp.close() + logger.info("Save label names to {}.".format(label_txt_save_path)) + + +if __name__ == '__main__': + main() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/dist_utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/dist_utils.py new file mode 100644 index 000000000..6b7d889d6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/dist_utils.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +import paddle + + +def all_gather(tensor: paddle.Tensor, concat: bool=True, + axis: int=0) -> Union[paddle.Tensor, List[paddle.Tensor]]: + """Gather tensor from all devices, concatenate them along given axis if specified. + + Args: + tensor (paddle.Tensor): Tensor to be gathered from all GPUs. + concat (bool, optional): Whether to concatenate gathered Tensors. Defaults to True. + axis (int, optional): Axis which concatenated along. Defaults to 0. + + Returns: + Union[paddle.Tensor, List[paddle.Tensor]]: Gathered Tensors + """ + result = [] + paddle.distributed.all_gather(result, tensor) + if concat: + return paddle.concat(result, axis) + return result diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/download.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/download.py new file mode 100644 index 000000000..decad654a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/download.py @@ -0,0 +1,304 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import os.path as osp +import shutil +import requests +import hashlib +import tarfile +import zipfile +import time +from collections import OrderedDict +from tqdm import tqdm + +from . import logger + +__all__ = ['get_weights_path_from_url'] + +WEIGHTS_HOME = osp.expanduser("~/.paddleclas/weights") + +DOWNLOAD_RETRY_LIMIT = 3 + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith('http://') or path.startswith('https://') + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. 
+ + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + + Examples: + .. code-block:: python + + from paddle.utils.download import get_weights_path_from_url + + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True): + """ Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + from paddle.distributed import ParallelEnv + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + # Mainly used to solve the problem of downloading data from different + # machines in the case of multiple machines. Different nodes will download + # data, and the same node will only download data once. + rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) + + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + if rank_id_curr_node == 0: + fullpath = _download(url, root_dir, md5sum) + else: + while not os.path.exists(fullpath): + time.sleep(1) + + if rank_id_curr_node == 0: + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): + fullpath = _decompress(fullpath) + + return fullpath + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + + url (str): download url + path (str): download to given path + """ + if not osp.exists(path): + os.makedirs(path) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. " + "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info( + "Downloading {} from {} failed {} times with exception {}". 
+ format(fname, url, retry_cnt + 1, str(e))) + time.sleep(1) + continue + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, 'rb') as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " + "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + + if tarfile.is_tarfile(fname): + uncompressed_path = _uncompress_file_tar(fname) + elif zipfile.is_zipfile(fname): + uncompressed_path = _uncompress_file_zip(fname) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + return uncompressed_path + + +def _uncompress_file_zip(filepath): + files = zipfile.ZipFile(filepath, 'r') + file_list = files.namelist() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + + for item in file_list: + files.extract(item, file_dir) + + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + for item in file_list: + files.extract(item, os.path.join(file_dir, rootpath)) + + files.close() + + return uncompressed_path + + +def _uncompress_file_tar(filepath, mode="r:*"): + files = tarfile.open(filepath, mode) + file_list = files.getnames() + + file_dir = os.path.dirname(filepath) + + if _is_a_single_file(file_list): + rootpath = file_list[0] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + elif _is_a_single_dir(file_list): + rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + for item in file_list: + files.extract(item, file_dir) + else: + rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1] + uncompressed_path = os.path.join(file_dir, rootpath) + if not os.path.exists(uncompressed_path): + os.makedirs(uncompressed_path) + + for item in file_list: + files.extract(item, os.path.join(file_dir, 
rootpath)) + + files.close() + + return uncompressed_path + + +def _is_a_single_file(file_list): + if len(file_list) == 1 and file_list[0].find(os.sep) < -1: + return True + return False + + +def _is_a_single_dir(file_list): + new_file_list = [] + for file_path in file_list: + if '/' in file_path: + file_path = file_path.replace('/', os.sep) + elif '\\' in file_path: + file_path = file_path.replace('\\', os.sep) + new_file_list.append(file_path) + + file_name = new_file_list[0].split(os.sep)[0] + for i in range(1, len(new_file_list)): + if file_name != new_file_list[i].split(os.sep)[0]: + return False + return True diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/ema.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/ema.py new file mode 100644 index 000000000..8cdb3dfab --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/ema.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy + +import paddle + + +class ExponentialMovingAverage(): + """ + Exponential Moving Average + Code was heavily based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/utils/model_ema.py + """ + + def __init__(self, model, decay=0.9999): + super().__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + + @paddle.no_grad() + def _update(self, model, update_fn): + for ema_v, model_v in zip(self.module.state_dict().values(), + model.state_dict().values()): + paddle.assign(update_fn(ema_v, model_v), ema_v) + + def update(self, model): + self._update( + model, + update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) + + def set(self, model): + self._update(model, update_fn=lambda e, m: m) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/fm_vis.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/fm_vis.py new file mode 100644 index 000000000..a5368b10e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/fm_vis.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
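A brief usage sketch for the ExponentialMovingAverage helper above; the stand-in model and toy loop are assumptions, not code from this patch:

# Illustrative EMA usage for the class defined in ema.py above.
# Assumes the file is importable as ppcls.utils.ema; the model and loop are toys.
import paddle
from ppcls.utils.ema import ExponentialMovingAverage

model = paddle.nn.Linear(8, 2)                 # stand-in for a real backbone
optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())
ema = ExponentialMovingAverage(model, decay=0.9999)

for _ in range(10):                            # toy training loop
    x = paddle.randn([4, 8])
    loss = model(x).mean()
    loss.backward()
    optimizer.step()
    optimizer.clear_grad()
    ema.update(model)                          # fold current weights into the shadow copy

ema.module.eval()                              # evaluate with the averaged weights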
+import numpy as np +import cv2 +import utils +import argparse +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(__dir__) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../../..'))) + +import paddle +from paddle.distributed import ParallelEnv + +from resnet import ResNet50 +from ppcls.utils.save_load import load_dygraph_pretrain + + +def parse_args(): + def str2bool(v): + return v.lower() in ("true", "t", "1") + + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--image_file", required=True, type=str) + parser.add_argument("-c", "--channel_num", type=int) + parser.add_argument("-p", "--pretrained_model", type=str) + parser.add_argument("--show", type=str2bool, default=False) + parser.add_argument("--interpolation", type=int, default=1) + parser.add_argument("--save_path", type=str, default=None) + parser.add_argument("--use_gpu", type=str2bool, default=True) + + return parser.parse_args() + + +def create_operators(interpolation=1): + size = 224 + img_mean = [0.485, 0.456, 0.406] + img_std = [0.229, 0.224, 0.225] + img_scale = 1.0 / 255.0 + + resize_op = utils.ResizeImage( + resize_short=256, interpolation=interpolation) + crop_op = utils.CropImage(size=(size, size)) + normalize_op = utils.NormalizeImage( + scale=img_scale, mean=img_mean, std=img_std) + totensor_op = utils.ToTensor() + + return [resize_op, crop_op, normalize_op, totensor_op] + + +def preprocess(data, ops): + for op in ops: + data = op(data) + return data + + +def main(): + args = parse_args() + operators = create_operators(args.interpolation) + # assign the place + place = 'gpu:{}'.format(ParallelEnv().dev_id) if args.use_gpu else 'cpu' + place = paddle.set_device(place) + + net = ResNet50() + load_dygraph_pretrain(net, args.pretrained_model) + + img = cv2.imread(args.image_file, cv2.IMREAD_COLOR) + data = preprocess(img, operators) + data = np.expand_dims(data, axis=0) + data = paddle.to_tensor(data) + net.eval() + _, fm = net(data) + assert args.channel_num >= 0 and args.channel_num <= fm.shape[ + 1], "the channel is out of the range, should be in {} but got {}".format( + [0, fm.shape[1]], args.channel_num) + + fm = (np.squeeze(fm[0][args.channel_num].numpy()) * 255).astype(np.uint8) + fm = cv2.resize(fm, (img.shape[1], img.shape[0])) + if args.save_path is not None: + print("the feature map is saved in path: {}".format(args.save_path)) + cv2.imwrite(args.save_path, fm) + + +if __name__ == "__main__": + main() diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/resnet.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/resnet.py new file mode 100644 index 000000000..0bea3401a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/resnet.py @@ -0,0 +1,535 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
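For reference, a small sketch of driving the preprocessing pipeline that fm_vis.py builds in create_operators(); the image path is hypothetical and the shape comment assumes the default 224x224 crop:

# Illustrative only: applying the resize/crop/normalize/to-tensor operators
# returned by create_operators() in fm_vis.py above. "demo.jpg" is a placeholder.
import cv2
import numpy as np

ops = create_operators(interpolation=1)         # ResizeImage -> CropImage -> NormalizeImage -> ToTensor
img = cv2.imread("demo.jpg", cv2.IMREAD_COLOR)  # HWC BGR uint8
data = preprocess(img, ops)                     # normalized CHW float array
batch = np.expand_dims(data, axis=0)            # add batch dim, e.g. (1, 3, 224, 224)
# batch can then be wrapped with paddle.to_tensor(batch) and fed to the network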
+ +from __future__ import absolute_import, division, print_function + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +from paddle.nn import Conv2D, BatchNorm, Linear +from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D +from paddle.nn.initializer import Uniform +import math + +from ppcls.arch.backbone.base.theseus_layer import TheseusLayer +from ppcls.utils.save_load import load_dygraph_pretrain + +MODEL_URLS = { + "ResNet18": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams", + "ResNet18_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams", + "ResNet34": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams", + "ResNet34_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams", + "ResNet50": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams", + "ResNet50_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams", + "ResNet101": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams", + "ResNet101_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams", + "ResNet152": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams", + "ResNet152_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams", + "ResNet200_vd": + "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams", +} + +__all__ = MODEL_URLS.keys() +''' +ResNet config: dict. + key: depth of ResNet. + values: config's dict of specific model. + keys: + block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional. + block_depth: The number of blocks in different stages in ResNet. + num_channels: The number of channels to enter the next stage. 
+''' +NET_CONFIG = { + "18": { + "block_type": "BasicBlock", + "block_depth": [2, 2, 2, 2], + "num_channels": [64, 64, 128, 256] + }, + "34": { + "block_type": "BasicBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 64, 128, 256] + }, + "50": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 6, 3], + "num_channels": [64, 256, 512, 1024] + }, + "101": { + "block_type": "BottleneckBlock", + "block_depth": [3, 4, 23, 3], + "num_channels": [64, 256, 512, 1024] + }, + "152": { + "block_type": "BottleneckBlock", + "block_depth": [3, 8, 36, 3], + "num_channels": [64, 256, 512, 1024] + }, + "200": { + "block_type": "BottleneckBlock", + "block_depth": [3, 12, 48, 3], + "num_channels": [64, 256, 512, 1024] + }, +} + + +class ConvBNLayer(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + is_vd_mode=False, + act=None, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + self.is_vd_mode = is_vd_mode + self.act = act + self.avg_pool = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self.conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + weight_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=False, + data_format=data_format) + self.bn = BatchNorm( + num_filters, + param_attr=ParamAttr(learning_rate=lr_mult), + bias_attr=ParamAttr(learning_rate=lr_mult), + data_layout=data_format) + self.relu = nn.ReLU() + + def forward(self, x): + if self.is_vd_mode: + x = self.avg_pool(x) + x = self.conv(x) + x = self.bn(x) + if self.act: + x = self.relu(x) + return x + + +class BottleneckBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv2 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act=None, + lr_mult=lr_mult, + data_format=data_format) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.relu = nn.ReLU() + self.shortcut = shortcut + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + x = self.conv2(x) + + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class BasicBlock(TheseusLayer): + def __init__(self, + num_channels, + num_filters, + stride, + shortcut=True, + if_first=False, + lr_mult=1.0, + data_format="NCHW"): + super().__init__() + + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + lr_mult=lr_mult, + data_format=data_format) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + act=None, + lr_mult=lr_mult, + data_format=data_format) + if not shortcut: + self.short = ConvBNLayer( + 
num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride if if_first else 1, + is_vd_mode=False if if_first else True, + lr_mult=lr_mult, + data_format=data_format) + self.shortcut = shortcut + self.relu = nn.ReLU() + + def forward(self, x): + identity = x + x = self.conv0(x) + x = self.conv1(x) + if self.shortcut: + short = identity + else: + short = self.short(identity) + x = paddle.add(x=x, y=short) + x = self.relu(x) + return x + + +class ResNet(TheseusLayer): + """ + ResNet + Args: + config: dict. config of ResNet. + version: str="vb". Different version of ResNet, version vd can perform better. + class_num: int=1000. The number of classes. + lr_mult_list: list. Control the learning rate of different stages. + Returns: + model: nn.Layer. Specific ResNet model depends on args. + """ + + def __init__(self, + config, + version="vb", + class_num=1000, + lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], + data_format="NCHW", + input_image_channel=3, + return_patterns=None): + super().__init__() + + self.cfg = config + self.lr_mult_list = lr_mult_list + self.is_vd_mode = version == "vd" + self.class_num = class_num + self.num_filters = [64, 128, 256, 512] + self.block_depth = self.cfg["block_depth"] + self.block_type = self.cfg["block_type"] + self.num_channels = self.cfg["num_channels"] + self.channels_mult = 1 if self.num_channels[-1] == 256 else 4 + + assert isinstance(self.lr_mult_list, ( + list, tuple + )), "lr_mult_list should be in (list, tuple) but got {}".format( + type(self.lr_mult_list)) + assert len(self.lr_mult_list + ) == 5, "lr_mult_list length should be 5 but got {}".format( + len(self.lr_mult_list)) + + self.stem_cfg = { + #num_channels, num_filters, filter_size, stride + "vb": [[input_image_channel, 64, 7, 2]], + "vd": + [[input_image_channel, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]] + } + + self.stem = nn.Sequential(* [ + ConvBNLayer( + num_channels=in_c, + num_filters=out_c, + filter_size=k, + stride=s, + act="relu", + lr_mult=self.lr_mult_list[0], + data_format=data_format) + for in_c, out_c, k, s in self.stem_cfg[version] + ]) + + self.max_pool = MaxPool2D( + kernel_size=3, stride=2, padding=1, data_format=data_format) + block_list = [] + for block_idx in range(len(self.block_depth)): + shortcut = False + for i in range(self.block_depth[block_idx]): + block_list.append(globals()[self.block_type]( + num_channels=self.num_channels[block_idx] if i == 0 else + self.num_filters[block_idx] * self.channels_mult, + num_filters=self.num_filters[block_idx], + stride=2 if i == 0 and block_idx != 0 else 1, + shortcut=shortcut, + if_first=block_idx == i == 0 if version == "vd" else True, + lr_mult=self.lr_mult_list[block_idx + 1], + data_format=data_format)) + shortcut = True + self.blocks = nn.Sequential(*block_list) + + self.avg_pool = AdaptiveAvgPool2D(1, data_format=data_format) + self.flatten = nn.Flatten() + self.avg_pool_channels = self.num_channels[-1] * 2 + stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0) + self.fc = Linear( + self.avg_pool_channels, + self.class_num, + weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) + + self.data_format = data_format + if return_patterns is not None: + self.update_res(return_patterns) + self.register_forward_post_hook(self._return_dict_hook) + + def forward(self, x): + with paddle.static.amp.fp16_guard(): + if self.data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) + x.stop_gradient = True + x = self.stem(x) + fm = x + x = self.max_pool(x) + x = self.blocks(x) + x = self.avg_pool(x) + x = 
self.flatten(x) + x = self.fc(x) + return x, fm + + +def _load_pretrained(pretrained, model, model_url, use_ssld): + if pretrained is False: + pass + elif pretrained is True: + load_dygraph_pretrain(model, model_url, use_ssld=use_ssld) + elif isinstance(pretrained, str): + load_dygraph_pretrain(model, pretrained) + else: + raise RuntimeError( + "pretrained type is not available. Please use `string` or `boolean` type." + ) + + +def ResNet18(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld) + return model + + +def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet18_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet18_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld) + return model + + +def ResNet34(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld) + return model + + +def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet34_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet34_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld) + return model + + +def ResNet50(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet50` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld) + return model + + +def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet50_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. 
Specific `ResNet50_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld) + return model + + +def ResNet101(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld) + return model + + +def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet101_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet101_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld) + return model + + +def ResNet152(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152 + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld) + return model + + +def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet152_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet152_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld) + return model + + +def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs): + """ + ResNet200_vd + Args: + pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise. + If str, means the path of the pretrained model. + use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True. + Returns: + model: nn.Layer. Specific `ResNet200_vd` model depends on args. + """ + model = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs) + _load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld) + return model diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/utils.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/utils.py new file mode 100644 index 000000000..7c7014932 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/feature_maps_visualization/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import numpy as np + + +class DecodeImage(object): + def __init__(self, to_rgb=True): + self.to_rgb = to_rgb + + def __call__(self, img): + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + return img + + +class ResizeImage(object): + def __init__(self, resize_short=None, interpolation=1): + self.resize_short = resize_short + self.interpolation = interpolation + + def __call__(self, img): + img_h, img_w = img.shape[:2] + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + return cv2.resize(img, (w, h), interpolation=self.interpolation) + + +class CropImage(object): + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class NormalizeImage(object): + def __init__(self, scale=None, mean=None, std=None): + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, img): + return (img.astype('float32') * self.scale - self.mean) / self.std + + +class ToTensor(object): + def __init__(self): + pass + + def __call__(self, img): + img = img.transpose((2, 0, 1)) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/imagenet1k_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/imagenet1k_label_list.txt new file mode 100644 index 000000000..376e18021 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/imagenet1k_label_list.txt @@ -0,0 +1,1000 @@ +0 tench, Tinca tinca +1 goldfish, Carassius auratus +2 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +3 tiger shark, Galeocerdo cuvieri +4 hammerhead, hammerhead shark +5 electric ray, crampfish, numbfish, torpedo +6 stingray +7 cock +8 hen +9 ostrich, Struthio camelus +10 brambling, Fringilla montifringilla +11 goldfinch, Carduelis carduelis +12 house finch, linnet, Carpodacus mexicanus +13 junco, snowbird +14 indigo bunting, indigo finch, indigo bird, Passerina cyanea +15 robin, American robin, Turdus migratorius +16 bulbul +17 jay +18 magpie +19 chickadee +20 water ouzel, dipper +21 kite +22 bald eagle, American eagle, Haliaeetus leucocephalus +23 vulture +24 great grey owl, great gray owl, Strix nebulosa +25 European fire salamander, Salamandra salamandra +26 common newt, Triturus vulgaris +27 eft +28 
spotted salamander, Ambystoma maculatum +29 axolotl, mud puppy, Ambystoma mexicanum +30 bullfrog, Rana catesbeiana +31 tree frog, tree-frog +32 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +33 loggerhead, loggerhead turtle, Caretta caretta +34 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +35 mud turtle +36 terrapin +37 box turtle, box tortoise +38 banded gecko +39 common iguana, iguana, Iguana iguana +40 American chameleon, anole, Anolis carolinensis +41 whiptail, whiptail lizard +42 agama +43 frilled lizard, Chlamydosaurus kingi +44 alligator lizard +45 Gila monster, Heloderma suspectum +46 green lizard, Lacerta viridis +47 African chameleon, Chamaeleo chamaeleon +48 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +49 African crocodile, Nile crocodile, Crocodylus niloticus +50 American alligator, Alligator mississipiensis +51 triceratops +52 thunder snake, worm snake, Carphophis amoenus +53 ringneck snake, ring-necked snake, ring snake +54 hognose snake, puff adder, sand viper +55 green snake, grass snake +56 king snake, kingsnake +57 garter snake, grass snake +58 water snake +59 vine snake +60 night snake, Hypsiglena torquata +61 boa constrictor, Constrictor constrictor +62 rock python, rock snake, Python sebae +63 Indian cobra, Naja naja +64 green mamba +65 sea snake +66 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +67 diamondback, diamondback rattlesnake, Crotalus adamanteus +68 sidewinder, horned rattlesnake, Crotalus cerastes +69 trilobite +70 harvestman, daddy longlegs, Phalangium opilio +71 scorpion +72 black and gold garden spider, Argiope aurantia +73 barn spider, Araneus cavaticus +74 garden spider, Aranea diademata +75 black widow, Latrodectus mactans +76 tarantula +77 wolf spider, hunting spider +78 tick +79 centipede +80 black grouse +81 ptarmigan +82 ruffed grouse, partridge, Bonasa umbellus +83 prairie chicken, prairie grouse, prairie fowl +84 peacock +85 quail +86 partridge +87 African grey, African gray, Psittacus erithacus +88 macaw +89 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +90 lorikeet +91 coucal +92 bee eater +93 hornbill +94 hummingbird +95 jacamar +96 toucan +97 drake +98 red-breasted merganser, Mergus serrator +99 goose +100 black swan, Cygnus atratus +101 tusker +102 echidna, spiny anteater, anteater +103 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +104 wallaby, brush kangaroo +105 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +106 wombat +107 jellyfish +108 sea anemone, anemone +109 brain coral +110 flatworm, platyhelminth +111 nematode, nematode worm, roundworm +112 conch +113 snail +114 slug +115 sea slug, nudibranch +116 chiton, coat-of-mail shell, sea cradle, polyplacophore +117 chambered nautilus, pearly nautilus, nautilus +118 Dungeness crab, Cancer magister +119 rock crab, Cancer irroratus +120 fiddler crab +121 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +122 American lobster, Northern lobster, Maine lobster, Homarus americanus +123 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +124 crayfish, crawfish, crawdad, crawdaddy +125 hermit crab +126 isopod +127 white stork, Ciconia ciconia +128 black stork, Ciconia nigra +129 spoonbill +130 flamingo +131 little blue heron, Egretta caerulea +132 American egret, great white heron, Egretta albus +133 bittern +134 crane +135 limpkin, Aramus pictus +136 European 
gallinule, Porphyrio porphyrio +137 American coot, marsh hen, mud hen, water hen, Fulica americana +138 bustard +139 ruddy turnstone, Arenaria interpres +140 red-backed sandpiper, dunlin, Erolia alpina +141 redshank, Tringa totanus +142 dowitcher +143 oystercatcher, oyster catcher +144 pelican +145 king penguin, Aptenodytes patagonica +146 albatross, mollymawk +147 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +148 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +149 dugong, Dugong dugon +150 sea lion +151 Chihuahua +152 Japanese spaniel +153 Maltese dog, Maltese terrier, Maltese +154 Pekinese, Pekingese, Peke +155 Shih-Tzu +156 Blenheim spaniel +157 papillon +158 toy terrier +159 Rhodesian ridgeback +160 Afghan hound, Afghan +161 basset, basset hound +162 beagle +163 bloodhound, sleuthhound +164 bluetick +165 black-and-tan coonhound +166 Walker hound, Walker foxhound +167 English foxhound +168 redbone +169 borzoi, Russian wolfhound +170 Irish wolfhound +171 Italian greyhound +172 whippet +173 Ibizan hound, Ibizan Podenco +174 Norwegian elkhound, elkhound +175 otterhound, otter hound +176 Saluki, gazelle hound +177 Scottish deerhound, deerhound +178 Weimaraner +179 Staffordshire bullterrier, Staffordshire bull terrier +180 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +181 Bedlington terrier +182 Border terrier +183 Kerry blue terrier +184 Irish terrier +185 Norfolk terrier +186 Norwich terrier +187 Yorkshire terrier +188 wire-haired fox terrier +189 Lakeland terrier +190 Sealyham terrier, Sealyham +191 Airedale, Airedale terrier +192 cairn, cairn terrier +193 Australian terrier +194 Dandie Dinmont, Dandie Dinmont terrier +195 Boston bull, Boston terrier +196 miniature schnauzer +197 giant schnauzer +198 standard schnauzer +199 Scotch terrier, Scottish terrier, Scottie +200 Tibetan terrier, chrysanthemum dog +201 silky terrier, Sydney silky +202 soft-coated wheaten terrier +203 West Highland white terrier +204 Lhasa, Lhasa apso +205 flat-coated retriever +206 curly-coated retriever +207 golden retriever +208 Labrador retriever +209 Chesapeake Bay retriever +210 German short-haired pointer +211 vizsla, Hungarian pointer +212 English setter +213 Irish setter, red setter +214 Gordon setter +215 Brittany spaniel +216 clumber, clumber spaniel +217 English springer, English springer spaniel +218 Welsh springer spaniel +219 cocker spaniel, English cocker spaniel, cocker +220 Sussex spaniel +221 Irish water spaniel +222 kuvasz +223 schipperke +224 groenendael +225 malinois +226 briard +227 kelpie +228 komondor +229 Old English sheepdog, bobtail +230 Shetland sheepdog, Shetland sheep dog, Shetland +231 collie +232 Border collie +233 Bouvier des Flandres, Bouviers des Flandres +234 Rottweiler +235 German shepherd, German shepherd dog, German police dog, alsatian +236 Doberman, Doberman pinscher +237 miniature pinscher +238 Greater Swiss Mountain dog +239 Bernese mountain dog +240 Appenzeller +241 EntleBucher +242 boxer +243 bull mastiff +244 Tibetan mastiff +245 French bulldog +246 Great Dane +247 Saint Bernard, St Bernard +248 Eskimo dog, husky +249 malamute, malemute, Alaskan malamute +250 Siberian husky +251 dalmatian, coach dog, carriage dog +252 affenpinscher, monkey pinscher, monkey dog +253 basenji +254 pug, pug-dog +255 Leonberg +256 Newfoundland, Newfoundland dog +257 Great Pyrenees +258 Samoyed, Samoyede +259 Pomeranian +260 chow, chow chow +261 keeshond +262 Brabancon griffon +263 Pembroke, 
Pembroke Welsh corgi +264 Cardigan, Cardigan Welsh corgi +265 toy poodle +266 miniature poodle +267 standard poodle +268 Mexican hairless +269 timber wolf, grey wolf, gray wolf, Canis lupus +270 white wolf, Arctic wolf, Canis lupus tundrarum +271 red wolf, maned wolf, Canis rufus, Canis niger +272 coyote, prairie wolf, brush wolf, Canis latrans +273 dingo, warrigal, warragal, Canis dingo +274 dhole, Cuon alpinus +275 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +276 hyena, hyaena +277 red fox, Vulpes vulpes +278 kit fox, Vulpes macrotis +279 Arctic fox, white fox, Alopex lagopus +280 grey fox, gray fox, Urocyon cinereoargenteus +281 tabby, tabby cat +282 tiger cat +283 Persian cat +284 Siamese cat, Siamese +285 Egyptian cat +286 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +287 lynx, catamount +288 leopard, Panthera pardus +289 snow leopard, ounce, Panthera uncia +290 jaguar, panther, Panthera onca, Felis onca +291 lion, king of beasts, Panthera leo +292 tiger, Panthera tigris +293 cheetah, chetah, Acinonyx jubatus +294 brown bear, bruin, Ursus arctos +295 American black bear, black bear, Ursus americanus, Euarctos americanus +296 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +297 sloth bear, Melursus ursinus, Ursus ursinus +298 mongoose +299 meerkat, mierkat +300 tiger beetle +301 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +302 ground beetle, carabid beetle +303 long-horned beetle, longicorn, longicorn beetle +304 leaf beetle, chrysomelid +305 dung beetle +306 rhinoceros beetle +307 weevil +308 fly +309 bee +310 ant, emmet, pismire +311 grasshopper, hopper +312 cricket +313 walking stick, walkingstick, stick insect +314 cockroach, roach +315 mantis, mantid +316 cicada, cicala +317 leafhopper +318 lacewing, lacewing fly +319 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +320 damselfly +321 admiral +322 ringlet, ringlet butterfly +323 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +324 cabbage butterfly +325 sulphur butterfly, sulfur butterfly +326 lycaenid, lycaenid butterfly +327 starfish, sea star +328 sea urchin +329 sea cucumber, holothurian +330 wood rabbit, cottontail, cottontail rabbit +331 hare +332 Angora, Angora rabbit +333 hamster +334 porcupine, hedgehog +335 fox squirrel, eastern fox squirrel, Sciurus niger +336 marmot +337 beaver +338 guinea pig, Cavia cobaya +339 sorrel +340 zebra +341 hog, pig, grunter, squealer, Sus scrofa +342 wild boar, boar, Sus scrofa +343 warthog +344 hippopotamus, hippo, river horse, Hippopotamus amphibius +345 ox +346 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +347 bison +348 ram, tup +349 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +350 ibex, Capra ibex +351 hartebeest +352 impala, Aepyceros melampus +353 gazelle +354 Arabian camel, dromedary, Camelus dromedarius +355 llama +356 weasel +357 mink +358 polecat, fitch, foulmart, foumart, Mustela putorius +359 black-footed ferret, ferret, Mustela nigripes +360 otter +361 skunk, polecat, wood pussy +362 badger +363 armadillo +364 three-toed sloth, ai, Bradypus tridactylus +365 orangutan, orang, orangutang, Pongo pygmaeus +366 gorilla, Gorilla gorilla +367 chimpanzee, chimp, Pan troglodytes +368 gibbon, Hylobates lar +369 siamang, Hylobates syndactylus, Symphalangus syndactylus +370 guenon, guenon monkey +371 patas, hussar monkey, Erythrocebus patas +372 baboon +373 
macaque +374 langur +375 colobus, colobus monkey +376 proboscis monkey, Nasalis larvatus +377 marmoset +378 capuchin, ringtail, Cebus capucinus +379 howler monkey, howler +380 titi, titi monkey +381 spider monkey, Ateles geoffroyi +382 squirrel monkey, Saimiri sciureus +383 Madagascar cat, ring-tailed lemur, Lemur catta +384 indri, indris, Indri indri, Indri brevicaudatus +385 Indian elephant, Elephas maximus +386 African elephant, Loxodonta africana +387 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +388 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +389 barracouta, snoek +390 eel +391 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +392 rock beauty, Holocanthus tricolor +393 anemone fish +394 sturgeon +395 gar, garfish, garpike, billfish, Lepisosteus osseus +396 lionfish +397 puffer, pufferfish, blowfish, globefish +398 abacus +399 abaya +400 academic gown, academic robe, judge's robe +401 accordion, piano accordion, squeeze box +402 acoustic guitar +403 aircraft carrier, carrier, flattop, attack aircraft carrier +404 airliner +405 airship, dirigible +406 altar +407 ambulance +408 amphibian, amphibious vehicle +409 analog clock +410 apiary, bee house +411 apron +412 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +413 assault rifle, assault gun +414 backpack, back pack, knapsack, packsack, rucksack, haversack +415 bakery, bakeshop, bakehouse +416 balance beam, beam +417 balloon +418 ballpoint, ballpoint pen, ballpen, Biro +419 Band Aid +420 banjo +421 bannister, banister, balustrade, balusters, handrail +422 barbell +423 barber chair +424 barbershop +425 barn +426 barometer +427 barrel, cask +428 barrow, garden cart, lawn cart, wheelbarrow +429 baseball +430 basketball +431 bassinet +432 bassoon +433 bathing cap, swimming cap +434 bath towel +435 bathtub, bathing tub, bath, tub +436 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +437 beacon, lighthouse, beacon light, pharos +438 beaker +439 bearskin, busby, shako +440 beer bottle +441 beer glass +442 bell cote, bell cot +443 bib +444 bicycle-built-for-two, tandem bicycle, tandem +445 bikini, two-piece +446 binder, ring-binder +447 binoculars, field glasses, opera glasses +448 birdhouse +449 boathouse +450 bobsled, bobsleigh, bob +451 bolo tie, bolo, bola tie, bola +452 bonnet, poke bonnet +453 bookcase +454 bookshop, bookstore, bookstall +455 bottlecap +456 bow +457 bow tie, bow-tie, bowtie +458 brass, memorial tablet, plaque +459 brassiere, bra, bandeau +460 breakwater, groin, groyne, mole, bulwark, seawall, jetty +461 breastplate, aegis, egis +462 broom +463 bucket, pail +464 buckle +465 bulletproof vest +466 bullet train, bullet +467 butcher shop, meat market +468 cab, hack, taxi, taxicab +469 caldron, cauldron +470 candle, taper, wax light +471 cannon +472 canoe +473 can opener, tin opener +474 cardigan +475 car mirror +476 carousel, carrousel, merry-go-round, roundabout, whirligig +477 carpenter's kit, tool kit +478 carton +479 car wheel +480 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +481 cassette +482 cassette player +483 castle +484 catamaran +485 CD player +486 cello, violoncello +487 cellular telephone, cellular phone, cellphone, cell, mobile phone +488 chain +489 chainlink fence +490 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +491 chain saw, chainsaw +492 
chest +493 chiffonier, commode +494 chime, bell, gong +495 china cabinet, china closet +496 Christmas stocking +497 church, church building +498 cinema, movie theater, movie theatre, movie house, picture palace +499 cleaver, meat cleaver, chopper +500 cliff dwelling +501 cloak +502 clog, geta, patten, sabot +503 cocktail shaker +504 coffee mug +505 coffeepot +506 coil, spiral, volute, whorl, helix +507 combination lock +508 computer keyboard, keypad +509 confectionery, confectionary, candy store +510 container ship, containership, container vessel +511 convertible +512 corkscrew, bottle screw +513 cornet, horn, trumpet, trump +514 cowboy boot +515 cowboy hat, ten-gallon hat +516 cradle +517 crane +518 crash helmet +519 crate +520 crib, cot +521 Crock Pot +522 croquet ball +523 crutch +524 cuirass +525 dam, dike, dyke +526 desk +527 desktop computer +528 dial telephone, dial phone +529 diaper, nappy, napkin +530 digital clock +531 digital watch +532 dining table, board +533 dishrag, dishcloth +534 dishwasher, dish washer, dishwashing machine +535 disk brake, disc brake +536 dock, dockage, docking facility +537 dogsled, dog sled, dog sleigh +538 dome +539 doormat, welcome mat +540 drilling platform, offshore rig +541 drum, membranophone, tympan +542 drumstick +543 dumbbell +544 Dutch oven +545 electric fan, blower +546 electric guitar +547 electric locomotive +548 entertainment center +549 envelope +550 espresso maker +551 face powder +552 feather boa, boa +553 file, file cabinet, filing cabinet +554 fireboat +555 fire engine, fire truck +556 fire screen, fireguard +557 flagpole, flagstaff +558 flute, transverse flute +559 folding chair +560 football helmet +561 forklift +562 fountain +563 fountain pen +564 four-poster +565 freight car +566 French horn, horn +567 frying pan, frypan, skillet +568 fur coat +569 garbage truck, dustcart +570 gasmask, respirator, gas helmet +571 gas pump, gasoline pump, petrol pump, island dispenser +572 goblet +573 go-kart +574 golf ball +575 golfcart, golf cart +576 gondola +577 gong, tam-tam +578 gown +579 grand piano, grand +580 greenhouse, nursery, glasshouse +581 grille, radiator grille +582 grocery store, grocery, food market, market +583 guillotine +584 hair slide +585 hair spray +586 half track +587 hammer +588 hamper +589 hand blower, blow dryer, blow drier, hair dryer, hair drier +590 hand-held computer, hand-held microcomputer +591 handkerchief, hankie, hanky, hankey +592 hard disc, hard disk, fixed disk +593 harmonica, mouth organ, harp, mouth harp +594 harp +595 harvester, reaper +596 hatchet +597 holster +598 home theater, home theatre +599 honeycomb +600 hook, claw +601 hoopskirt, crinoline +602 horizontal bar, high bar +603 horse cart, horse-cart +604 hourglass +605 iPod +606 iron, smoothing iron +607 jack-o'-lantern +608 jean, blue jean, denim +609 jeep, landrover +610 jersey, T-shirt, tee shirt +611 jigsaw puzzle +612 jinrikisha, ricksha, rickshaw +613 joystick +614 kimono +615 knee pad +616 knot +617 lab coat, laboratory coat +618 ladle +619 lampshade, lamp shade +620 laptop, laptop computer +621 lawn mower, mower +622 lens cap, lens cover +623 letter opener, paper knife, paperknife +624 library +625 lifeboat +626 lighter, light, igniter, ignitor +627 limousine, limo +628 liner, ocean liner +629 lipstick, lip rouge +630 Loafer +631 lotion +632 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +633 loupe, jeweler's loupe +634 lumbermill, sawmill +635 magnetic compass +636 mailbag, postbag +637 mailbox, letter box +638 
maillot +639 maillot, tank suit +640 manhole cover +641 maraca +642 marimba, xylophone +643 mask +644 matchstick +645 maypole +646 maze, labyrinth +647 measuring cup +648 medicine chest, medicine cabinet +649 megalith, megalithic structure +650 microphone, mike +651 microwave, microwave oven +652 military uniform +653 milk can +654 minibus +655 miniskirt, mini +656 minivan +657 missile +658 mitten +659 mixing bowl +660 mobile home, manufactured home +661 Model T +662 modem +663 monastery +664 monitor +665 moped +666 mortar +667 mortarboard +668 mosque +669 mosquito net +670 motor scooter, scooter +671 mountain bike, all-terrain bike, off-roader +672 mountain tent +673 mouse, computer mouse +674 mousetrap +675 moving van +676 muzzle +677 nail +678 neck brace +679 necklace +680 nipple +681 notebook, notebook computer +682 obelisk +683 oboe, hautboy, hautbois +684 ocarina, sweet potato +685 odometer, hodometer, mileometer, milometer +686 oil filter +687 organ, pipe organ +688 oscilloscope, scope, cathode-ray oscilloscope, CRO +689 overskirt +690 oxcart +691 oxygen mask +692 packet +693 paddle, boat paddle +694 paddlewheel, paddle wheel +695 padlock +696 paintbrush +697 pajama, pyjama, pj's, jammies +698 palace +699 panpipe, pandean pipe, syrinx +700 paper towel +701 parachute, chute +702 parallel bars, bars +703 park bench +704 parking meter +705 passenger car, coach, carriage +706 patio, terrace +707 pay-phone, pay-station +708 pedestal, plinth, footstall +709 pencil box, pencil case +710 pencil sharpener +711 perfume, essence +712 Petri dish +713 photocopier +714 pick, plectrum, plectron +715 pickelhaube +716 picket fence, paling +717 pickup, pickup truck +718 pier +719 piggy bank, penny bank +720 pill bottle +721 pillow +722 ping-pong ball +723 pinwheel +724 pirate, pirate ship +725 pitcher, ewer +726 plane, carpenter's plane, woodworking plane +727 planetarium +728 plastic bag +729 plate rack +730 plow, plough +731 plunger, plumber's helper +732 Polaroid camera, Polaroid Land camera +733 pole +734 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +735 poncho +736 pool table, billiard table, snooker table +737 pop bottle, soda bottle +738 pot, flowerpot +739 potter's wheel +740 power drill +741 prayer rug, prayer mat +742 printer +743 prison, prison house +744 projectile, missile +745 projector +746 puck, hockey puck +747 punching bag, punch bag, punching ball, punchball +748 purse +749 quill, quill pen +750 quilt, comforter, comfort, puff +751 racer, race car, racing car +752 racket, racquet +753 radiator +754 radio, wireless +755 radio telescope, radio reflector +756 rain barrel +757 recreational vehicle, RV, R.V. 
+758 reel +759 reflex camera +760 refrigerator, icebox +761 remote control, remote +762 restaurant, eating house, eating place, eatery +763 revolver, six-gun, six-shooter +764 rifle +765 rocking chair, rocker +766 rotisserie +767 rubber eraser, rubber, pencil eraser +768 rugby ball +769 rule, ruler +770 running shoe +771 safe +772 safety pin +773 saltshaker, salt shaker +774 sandal +775 sarong +776 sax, saxophone +777 scabbard +778 scale, weighing machine +779 school bus +780 schooner +781 scoreboard +782 screen, CRT screen +783 screw +784 screwdriver +785 seat belt, seatbelt +786 sewing machine +787 shield, buckler +788 shoe shop, shoe-shop, shoe store +789 shoji +790 shopping basket +791 shopping cart +792 shovel +793 shower cap +794 shower curtain +795 ski +796 ski mask +797 sleeping bag +798 slide rule, slipstick +799 sliding door +800 slot, one-armed bandit +801 snorkel +802 snowmobile +803 snowplow, snowplough +804 soap dispenser +805 soccer ball +806 sock +807 solar dish, solar collector, solar furnace +808 sombrero +809 soup bowl +810 space bar +811 space heater +812 space shuttle +813 spatula +814 speedboat +815 spider web, spider's web +816 spindle +817 sports car, sport car +818 spotlight, spot +819 stage +820 steam locomotive +821 steel arch bridge +822 steel drum +823 stethoscope +824 stole +825 stone wall +826 stopwatch, stop watch +827 stove +828 strainer +829 streetcar, tram, tramcar, trolley, trolley car +830 stretcher +831 studio couch, day bed +832 stupa, tope +833 submarine, pigboat, sub, U-boat +834 suit, suit of clothes +835 sundial +836 sunglass +837 sunglasses, dark glasses, shades +838 sunscreen, sunblock, sun blocker +839 suspension bridge +840 swab, swob, mop +841 sweatshirt +842 swimming trunks, bathing trunks +843 swing +844 switch, electric switch, electrical switch +845 syringe +846 table lamp +847 tank, army tank, armored combat vehicle, armoured combat vehicle +848 tape player +849 teapot +850 teddy, teddy bear +851 television, television system +852 tennis ball +853 thatch, thatched roof +854 theater curtain, theatre curtain +855 thimble +856 thresher, thrasher, threshing machine +857 throne +858 tile roof +859 toaster +860 tobacco shop, tobacconist shop, tobacconist +861 toilet seat +862 torch +863 totem pole +864 tow truck, tow car, wrecker +865 toyshop +866 tractor +867 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +868 tray +869 trench coat +870 tricycle, trike, velocipede +871 trimaran +872 tripod +873 triumphal arch +874 trolleybus, trolley coach, trackless trolley +875 trombone +876 tub, vat +877 turnstile +878 typewriter keyboard +879 umbrella +880 unicycle, monocycle +881 upright, upright piano +882 vacuum, vacuum cleaner +883 vase +884 vault +885 velvet +886 vending machine +887 vestment +888 viaduct +889 violin, fiddle +890 volleyball +891 waffle iron +892 wall clock +893 wallet, billfold, notecase, pocketbook +894 wardrobe, closet, press +895 warplane, military plane +896 washbasin, handbasin, washbowl, lavabo, wash-hand basin +897 washer, automatic washer, washing machine +898 water bottle +899 water jug +900 water tower +901 whiskey jug +902 whistle +903 wig +904 window screen +905 window shade +906 Windsor tie +907 wine bottle +908 wing +909 wok +910 wooden spoon +911 wool, woolen, woollen +912 worm fence, snake fence, snake-rail fence, Virginia fence +913 wreck +914 yawl +915 yurt +916 web site, website, internet site, site +917 comic book +918 crossword puzzle, crossword +919 street sign +920 traffic light, 
traffic signal, stoplight +921 book jacket, dust cover, dust jacket, dust wrapper +922 menu +923 plate +924 guacamole +925 consomme +926 hot pot, hotpot +927 trifle +928 ice cream, icecream +929 ice lolly, lolly, lollipop, popsicle +930 French loaf +931 bagel, beigel +932 pretzel +933 cheeseburger +934 hotdog, hot dog, red hot +935 mashed potato +936 head cabbage +937 broccoli +938 cauliflower +939 zucchini, courgette +940 spaghetti squash +941 acorn squash +942 butternut squash +943 cucumber, cuke +944 artichoke, globe artichoke +945 bell pepper +946 cardoon +947 mushroom +948 Granny Smith +949 strawberry +950 orange +951 lemon +952 fig +953 pineapple, ananas +954 banana +955 jackfruit, jak, jack +956 custard apple +957 pomegranate +958 hay +959 carbonara +960 chocolate sauce, chocolate syrup +961 dough +962 meat loaf, meatloaf +963 pizza, pizza pie +964 potpie +965 burrito +966 red wine +967 espresso +968 cup +969 eggnog +970 alp +971 bubble +972 cliff, drop, drop-off +973 coral reef +974 geyser +975 lakeside, lakeshore +976 promontory, headland, head, foreland +977 sandbar, sand bar +978 seashore, coast, seacoast, sea-coast +979 valley, vale +980 volcano +981 ballplayer, baseball player +982 groom, bridegroom +983 scuba diver +984 rapeseed +985 daisy +986 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +987 corn +988 acorn +989 hip, rose hip, rosehip +990 buckeye, horse chestnut, conker +991 coral fungus +992 agaric +993 gyromitra +994 stinkhorn, carrion fungus +995 earthstar +996 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +997 bolete +998 ear, spike, capitulum +999 toilet tissue, toilet paper, bathroom tissue diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/initializer.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/initializer.py new file mode 100644 index 000000000..b044e8088 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/initializer.py @@ -0,0 +1,318 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
+""" + +import math +import numpy as np + +import paddle +import paddle.nn as nn + +__all__ = [ + 'uniform_', + 'normal_', + 'constant_', + 'ones_', + 'zeros_', + 'xavier_uniform_', + 'xavier_normal_', + 'kaiming_uniform_', + 'kaiming_normal_', + 'linear_init_', + 'conv_init_', + 'reset_initialized_parameter', +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0., std=1.): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0., std=1.): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. + Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1., reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ['fan_in', 'fan_out'] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == 'fan_in' else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = [ + 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', + 'conv_transpose2d', 'conv_transpose3d' + ] + if nonlinearity in linear_fns or nonlinearity == 'sigmoid': + return 1 + elif nonlinearity == 'tanh': + return 5.0 / 3 + elif nonlinearity == 'relu': + return math.sqrt(2.0) + elif nonlinearity == 'leaky_relu': + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format( + param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == 'selu': + return 3.0 / 4 + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, + a=0, + mode='fan_in', + nonlinearity='leaky_relu', + reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, + a=0, + mode='fan_in', + nonlinearity='leaky_relu', + reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1. / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0., std=1.) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.) + if hasattr(m, 'bias') and getattr(m, 'bias') is not None: + _no_grad_fill_(m.bias, 0) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/logger.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/logger.py new file mode 100644 index 000000000..b8d1ebaf8 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/logger.py @@ -0,0 +1,173 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import datetime +import logging +import os +import sys + +import paddle.distributed as dist + +_logger = None + + +class LoggerHook(object): + """ + logs will print multi-times when calling Fleet API. + Commonly, only need to display single log at rank0 and ignore the others. + """ + block = False + + def __init__(self, log): + self.log = log + + def __call__(self, *args, **kwargs): + if not self.block: + self.log(*args, **kwargs) + + +def init_logger(name='ppcls', + log_file=None, + log_level=logging.INFO, + log_ranks="0"): + """Initialize and get a logger by name. 
+ If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified a FileHandler will also be added. + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + log_ranks (str): The ids of gpu to log which are separated by "," when more than 1, "0" by default. + Returns: + logging.Logger: The expected logger. + """ + global _logger + + # solve mutiple init issue when using paddleclas.py and engin.engin + init_flag = False + if _logger is None: + _logger = logging.getLogger(name) + init_flag = True + + formatter = logging.Formatter( + '[%(asctime)s] %(name)s %(levelname)s: %(message)s', + datefmt="%Y/%m/%d %H:%M:%S") + + stream_handler = logging.StreamHandler(stream=sys.stdout) + stream_handler.setFormatter(formatter) + stream_handler._name = 'stream_handler' + + # add stream_handler when _logger dose not contain stream_handler + for i, h in enumerate(_logger.handlers): + if h.get_name() == stream_handler.get_name(): + break + if i == len(_logger.handlers) - 1: + _logger.addHandler(stream_handler) + if init_flag: + _logger.addHandler(stream_handler) + + if log_file is not None and dist.get_rank() == 0: + log_file_folder = os.path.split(log_file)[0] + os.makedirs(log_file_folder, exist_ok=True) + file_handler = logging.FileHandler(log_file, 'a') + file_handler.setFormatter(formatter) + file_handler._name = 'file_handler' + + # add file_handler when _logger dose not contain same file_handler + for i, h in enumerate(_logger.handlers): + if h.get_name() == file_handler.get_name() and \ + h.baseFilename == file_handler.baseFilename: + break + if i == len(_logger.handlers) - 1: + _logger.addHandler(file_handler) + + if isinstance(log_ranks, str): + log_ranks = [int(i) for i in log_ranks.split(',')] + elif isinstance(log_ranks, int): + log_ranks = [log_ranks] + if dist.get_rank() in log_ranks: + _logger.setLevel(log_level) + LoggerHook.block = False + else: + _logger.setLevel(logging.ERROR) + LoggerHook.block = True + _logger.propagate = False + + +@LoggerHook +def info(fmt, *args): + _logger.info(fmt, *args) + + +@LoggerHook +def debug(fmt, *args): + _logger.debug(fmt, *args) + + +@LoggerHook +def warning(fmt, *args): + _logger.warning(fmt, *args) + + +@LoggerHook +def error(fmt, *args): + _logger.error(fmt, *args) + + +def scaler(name, value, step, writer): + """ + This function will draw a scalar curve generated by the visualdl. + Usage: Install visualdl: pip3 install visualdl==2.0.0b4 + and then: + visualdl --logdir ./scalar --host 0.0.0.0 --port 8830 + to preview loss corve in real time. + """ + if writer is None: + return + writer.add_scalar(tag=name, step=step, value=value) + + +def advertise(): + """ + Show the advertising message like the following: + + =========================================================== + == PaddleClas is powered by PaddlePaddle ! == + =========================================================== + == == + == For more info please go to the following website. 
== + == == + == https://github.com/PaddlePaddle/PaddleClas == + =========================================================== + + """ + copyright = "PaddleClas is powered by PaddlePaddle !" + ad = "For more info please go to the following website." + website = "https://github.com/PaddlePaddle/PaddleClas" + AD_LEN = 6 + len(max([copyright, ad, website], key=len)) + + info("\n{0}\n{1}\n{2}\n{3}\n{4}\n{5}\n{6}\n{7}\n".format( + "=" * (AD_LEN + 4), + "=={}==".format(copyright.center(AD_LEN)), + "=" * (AD_LEN + 4), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(ad.center(AD_LEN)), + "=={}==".format(' ' * AD_LEN), + "=={}==".format(website.center(AD_LEN)), + "=" * (AD_LEN + 4), )) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/metrics.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/metrics.py new file mode 100644 index 000000000..b0db68a75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/metrics.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from sklearn.metrics import hamming_loss +from sklearn.metrics import accuracy_score as accuracy_metric +from sklearn.metrics import multilabel_confusion_matrix +from sklearn.metrics import precision_recall_fscore_support +from sklearn.metrics import average_precision_score +from sklearn.preprocessing import binarize + +import numpy as np + +__all__ = ["multi_hot_encode", "hamming_distance", "accuracy_score", "precision_recall_fscore", "mean_average_precision"] + + +def multi_hot_encode(logits, threshold=0.5): + """ + Encode logits to multi-hot by elementwise for multilabel + """ + + return binarize(logits, threshold=threshold) + + +def hamming_distance(output, target): + """ + Soft metric based label for multilabel classification + Returns: + The smaller the return value is, the better model is. + """ + + return hamming_loss(target, output) + + +def accuracy_score(output, target, base="sample"): + """ + Hard metric for multilabel classification + Args: + output: + target: + base: ["sample", "label"], default="sample" + if "sample", return metric score based sample, + if "label", return metric score based label. 
+ Returns: + accuracy: + """ + + assert base in ["sample", "label"], 'must be one of ["sample", "label"]' + + if base == "sample": + accuracy = accuracy_metric(target, output) + elif base == "label": + mcm = multilabel_confusion_matrix(target, output) + tns = mcm[:, 0, 0] + fns = mcm[:, 1, 0] + tps = mcm[:, 1, 1] + fps = mcm[:, 0, 1] + + accuracy = (sum(tps) + sum(tns)) / (sum(tps) + sum(tns) + sum(fns) + sum(fps)) + + return accuracy + + +def precision_recall_fscore(output, target): + """ + Metric based label for multilabel classification + Returns: + precisions: + recalls: + fscores: + """ + + precisions, recalls, fscores, _ = precision_recall_fscore_support(target, output) + + return precisions, recalls, fscores + + +def mean_average_precision(logits, target): + """ + Calculate average precision + Args: + logits: probability from network before sigmoid or softmax + target: ground truth, 0 or 1 + """ + if not (isinstance(logits, np.ndarray) and isinstance(target, np.ndarray)): + raise TypeError("logits and target should be np.ndarray.") + + aps = [] + for i in range(target.shape[1]): + ap = average_precision_score(target[:, i], logits[:, i]) + aps.append(ap) + + return np.mean(aps) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/misc.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/misc.py new file mode 100644 index 000000000..b63da7c5f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/misc.py @@ -0,0 +1,155 @@ +# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
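A minimal usage sketch of the logger module added above, assuming the package is importable as ppcls.utils (the exact prefix depends on how the ppcls_2.6 directory is exposed on PYTHONPATH); the log file path and rank list are illustrative, not part of the patch:

# Minimal sketch, not part of the patch: import path is an assumption.
from ppcls.utils import logger

# Rank 0 logs at INFO and also writes to the file; other ranks are silenced
# by LoggerHook and the ERROR level set in init_logger.
logger.init_logger(name="ppcls",
                   log_file="./output/train.log",   # illustrative path
                   log_ranks="0")
logger.info("epoch %d, top1 acc: %.4f", 1, 0.7612)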
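Likewise, a small sketch of how the multilabel helpers in metrics.py above fit together; the probabilities and labels are made up, and the import path is the same assumption as above:

# Minimal sketch, not part of the patch: illustrative inputs only.
import numpy as np
from ppcls.utils.metrics import (multi_hot_encode, hamming_distance,
                                 accuracy_score, mean_average_precision)

probs = np.array([[0.9, 0.2, 0.7],
                  [0.1, 0.8, 0.4]])      # 2 samples x 3 labels, after sigmoid
target = np.array([[1, 0, 1],
                   [0, 1, 1]])

preds = multi_hot_encode(probs, threshold=0.5)          # 0/1 predictions
print(hamming_distance(preds, target))                  # lower is better
print(accuracy_score(preds, target, base="label"))      # label-based accuracy
print(mean_average_precision(probs, target))            # uses raw scores, not preds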
+ +import paddle + +__all__ = ['AverageMeter'] + + +class AverageMeter(object): + """ + Computes and stores the average and current value + Code was based on https://github.com/pytorch/examples/blob/master/imagenet/main.py + """ + + def __init__(self, name='', fmt='f', postfix="", need_avg=True): + self.name = name + self.fmt = fmt + self.postfix = postfix + self.need_avg = need_avg + self.reset() + + def reset(self): + """ reset """ + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """ update """ + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + @property + def avg_info(self): + if isinstance(self.avg, paddle.Tensor): + self.avg = float(self.avg) + return "{}: {:.5f}".format(self.name, self.avg) + + @property + def total(self): + return '{self.name}_sum: {self.sum:{self.fmt}}{self.postfix}'.format( + self=self) + + @property + def total_minute(self): + return '{self.name} {s:{self.fmt}}{self.postfix} min'.format( + s=self.sum / 60, self=self) + + @property + def mean(self): + return '{self.name}: {self.avg:{self.fmt}}{self.postfix}'.format( + self=self) if self.need_avg else '' + + @property + def value(self): + return '{self.name}: {self.val:{self.fmt}}{self.postfix}'.format( + self=self) + + +class AttrMeter(object): + """ + Computes and stores the average and current value + Code was based on https://github.com/pytorch/examples/blob/master/imagenet/main.py + """ + + def __init__(self, threshold=0.5): + self.threshold = threshold + self.reset() + + def reset(self): + self.gt_pos = 0 + self.gt_neg = 0 + self.true_pos = 0 + self.true_neg = 0 + self.false_pos = 0 + self.false_neg = 0 + + self.gt_pos_ins = [] + self.true_pos_ins = [] + self.intersect_pos = [] + self.union_pos = [] + + def update(self, metric_dict): + self.gt_pos += metric_dict['gt_pos'] + self.gt_neg += metric_dict['gt_neg'] + self.true_pos += metric_dict['true_pos'] + self.true_neg += metric_dict['true_neg'] + self.false_pos += metric_dict['false_pos'] + self.false_neg += metric_dict['false_neg'] + + self.gt_pos_ins += metric_dict['gt_pos_ins'].tolist() + self.true_pos_ins += metric_dict['true_pos_ins'].tolist() + self.intersect_pos += metric_dict['intersect_pos'].tolist() + self.union_pos += metric_dict['union_pos'].tolist() + + def res(self): + import numpy as np + eps = 1e-20 + label_pos_recall = 1.0 * self.true_pos / ( + self.gt_pos + eps) # true positive + label_neg_recall = 1.0 * self.true_neg / ( + self.gt_neg + eps) # true negative + # mean accuracy + label_ma = (label_pos_recall + label_neg_recall) / 2 + + label_pos_recall = np.mean(label_pos_recall) + label_neg_recall = np.mean(label_neg_recall) + label_prec = (self.true_pos / (self.true_pos + self.false_pos + eps)) + label_acc = (self.true_pos / + (self.true_pos + self.false_pos + self.false_neg + eps)) + label_f1 = np.mean(2 * label_prec * label_pos_recall / + (label_prec + label_pos_recall + eps)) + + ma = (np.mean(label_ma)) + + self.gt_pos_ins = np.array(self.gt_pos_ins) + self.true_pos_ins = np.array(self.true_pos_ins) + self.intersect_pos = np.array(self.intersect_pos) + self.union_pos = np.array(self.union_pos) + instance_acc = self.intersect_pos / (self.union_pos + eps) + instance_prec = self.intersect_pos / (self.true_pos_ins + eps) + instance_recall = self.intersect_pos / (self.gt_pos_ins + eps) + instance_f1 = 2 * instance_prec * instance_recall / ( + instance_prec + instance_recall + eps) + + instance_acc = np.mean(instance_acc) + instance_prec = 
np.mean(instance_prec) + instance_recall = np.mean(instance_recall) + instance_f1 = 2 * instance_prec * instance_recall / ( + instance_prec + instance_recall + eps) + + instance_acc = np.mean(instance_acc) + instance_prec = np.mean(instance_prec) + instance_recall = np.mean(instance_recall) + instance_f1 = np.mean(instance_f1) + + res = [ + ma, label_f1, label_pos_recall, label_neg_recall, instance_f1, + instance_acc, instance_prec, instance_recall + ] + return res diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/model_zoo.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/model_zoo.py new file mode 100644 index 000000000..e9ab5992d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/model_zoo.py @@ -0,0 +1,213 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import requests +import shutil +import tarfile +import tqdm +import zipfile + +from ..arch.utils import similar_architectures +from . import logger + +__all__ = ['get'] + +DOWNLOAD_RETRY_LIMIT = 3 + + +class UrlError(Exception): + """ UrlError + """ + + def __init__(self, url='', code=''): + message = "Downloading from {} failed with code {}!".format(url, code) + super(UrlError, self).__init__(message) + + +class ModelNameError(Exception): + """ ModelNameError + """ + + def __init__(self, message=''): + super(ModelNameError, self).__init__(message) + + +class RetryError(Exception): + """ RetryError + """ + + def __init__(self, url='', times=''): + message = "Download from {} failed. Retry({}) limit reached".format( + url, times) + super(RetryError, self).__init__(message) + + +def _get_url(architecture, postfix="pdparams"): + prefix = "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/" + fname = architecture + "_pretrained." + postfix + return prefix + fname + + +def _move_and_merge_tree(src, dst): + """ + Move src directory to dst, if dst is already exists, + merge src to dst + """ + if not os.path.exists(dst): + shutil.move(src, dst) + elif os.path.isfile(src): + shutil.move(src, dst) + else: + for fp in os.listdir(src): + src_fp = os.path.join(src, fp) + dst_fp = os.path.join(dst, fp) + if os.path.isdir(src_fp): + if os.path.isdir(dst_fp): + _move_and_merge_tree(src_fp, dst_fp) + else: + shutil.move(src_fp, dst_fp) + elif os.path.isfile(src_fp) and \ + not os.path.isfile(dst_fp): + shutil.move(src_fp, dst_fp) + + +def _download(url, path): + """ + Download from url, save to path. 
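For reference, a tiny sketch of the AverageMeter bookkeeping defined in misc.py above, with made-up per-batch losses; the import path is again an assumption:

# Minimal sketch, not part of the patch: tracks a running loss the way the
# trainer uses AverageMeter from misc.py above.
from ppcls.utils.misc import AverageMeter   # import path is an assumption

loss_meter = AverageMeter("CELoss", fmt=".4f", postfix=",")
for step, loss in enumerate([0.92, 0.85, 0.80]):   # made-up per-batch losses
    loss_meter.update(loss, n=32)                  # n = batch size
print(loss_meter.mean)    # formatted string, e.g. "CELoss: 0.8567,"
print(loss_meter.avg)     # raw running average as a float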
+ url (str): download url + path (str): download to given path + """ + if not os.path.exists(path): + os.makedirs(path) + + fname = os.path.split(url)[-1] + fullname = os.path.join(path, fname) + retry_cnt = 0 + + while not os.path.exists(fullname): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RetryError(url, DOWNLOAD_RETRY_LIMIT) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise UrlError(url, req.status_code) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + for chunk in tqdm.tqdm( + req.iter_content(chunk_size=1024), + total=(int(total_size) + 1023) // 1024, + unit='KB'): + f.write(chunk) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. + fpath = os.path.split(fname)[0] + fpath_tmp = os.path.join(fpath, 'tmp') + if os.path.isdir(fpath_tmp): + shutil.rmtree(fpath_tmp) + os.makedirs(fpath_tmp) + + if fname.find('tar') >= 0: + with tarfile.open(fname) as tf: + tf.extractall(path=fpath_tmp) + elif fname.find('zip') >= 0: + with zipfile.ZipFile(fname) as zf: + zf.extractall(path=fpath_tmp) + else: + raise TypeError("Unsupport compress file type {}".format(fname)) + + fs = os.listdir(fpath_tmp) + assert len( + fs + ) == 1, "There should just be 1 pretrained path in an archive file but got {}.".format( + len(fs)) + + f = fs[0] + src_dir = os.path.join(fpath_tmp, f) + dst_dir = os.path.join(fpath, f) + _move_and_merge_tree(src_dir, dst_dir) + + shutil.rmtree(fpath_tmp) + os.remove(fname) + + return f + + +def _get_pretrained(): + with open('./ppcls/utils/pretrained.list') as flist: + pretrained = [line.strip() for line in flist] + return pretrained + + +def _check_pretrained_name(architecture): + assert isinstance(architecture, str), \ + ("the type of architecture({}) should be str". format(architecture)) + pretrained = _get_pretrained() + similar_names = similar_architectures(architecture, pretrained) + model_list = ', '.join(similar_names) + err = "{} is not exist! Maybe you want: [{}]" \ + "".format(architecture, model_list) + if architecture not in similar_names: + raise ModelNameError(err) + + +def list_models(): + pretrained = _get_pretrained() + msg = "All avialable pretrained models are as follows: {}".format( + pretrained) + logger.info(msg) + return + + +def get(architecture, path, decompress=False, postfix="pdparams"): + """ + Get the pretrained model. 
+ """ + _check_pretrained_name(architecture) + url = _get_url(architecture, postfix=postfix) + fname = _download(url, path) + if postfix == "tar" and decompress: + _decompress(fname) + logger.info("download {} finished ".format(fname)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pedestrian_attribute_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pedestrian_attribute_label_list.txt new file mode 100644 index 000000000..af6e0df1c --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pedestrian_attribute_label_list.txt @@ -0,0 +1,26 @@ +0 Hat(帽子) +1 Glasses(眼镜) +2 ShortSleeve(短袖) +3 LongSleeve(长袖) +4 UpperStride(上衣条纹) +5 UpperLogo(上衣有标志) +6 UpperPlaid(上衣格子) +7 UpperSplice(上衣拼接) +8 LowerStripe(裤子条纹) +9 LowerPattern(裤子图案) +10 LongCoat(长外套) +11 Trousers(长裤) +12 Shorts(短裤) +13 Skirt&Dress(裙子或连衣裙) +14 Boots(靴子) +15 HandBag(手提包) +16 ShoulderBag(单肩包) +17 Backpack(背包) +18 HoldObjectsInFront(手持物品在前) +19 AgeLess18(年龄小于18岁) +20 Age18-60(年龄在18-60岁之间) +21 AgeOver60(年龄大于60岁) +22 Female(女性) +23 Front(面朝前) +24 Side(侧面) +25 Back(背面) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pretrained.list b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pretrained.list new file mode 100644 index 000000000..36d70f5a2 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/pretrained.list @@ -0,0 +1,121 @@ +ResNet18 +ResNet34 +ResNet50 +ResNet101 +ResNet152 +ResNet50_vc +ResNet18_vd +ResNet34_vd +ResNet50_vd +ResNet50_vd_v2 +ResNet101_vd +ResNet152_vd +ResNet200_vd +ResNet50_vd_ssld +ResNet50_vd_ssld_v2 +Fix_ResNet50_vd_ssld_v2 +ResNet101_vd_ssld +MobileNetV3_large_x0_35 +MobileNetV3_large_x0_5 +MobileNetV3_large_x0_75 +MobileNetV3_large_x1_0 +MobileNetV3_large_x1_25 +MobileNetV3_small_x0_35 +MobileNetV3_small_x0_5 +MobileNetV3_small_x0_75 +MobileNetV3_small_x1_0 +MobileNetV3_small_x1_25 +MobileNetV3_large_x1_0_ssld +MobileNetV3_large_x1_0_ssld_int8 +MobileNetV3_small_x1_0_ssld +MobileNetV2_x0_25 +MobileNetV2_x0_5 +MobileNetV2_x0_75 +MobileNetV2 +MobileNetV2_x1_5 +MobileNetV2_x2_0 +MobileNetV2_ssld +MobileNetV1_x0_25 +MobileNetV1_x0_5 +MobileNetV1_x0_75 +MobileNetV1 +MobileNetV1_ssld +ShuffleNetV2_x0_25 +ShuffleNetV2_x0_33 +ShuffleNetV2_x0_5 +ShuffleNetV2 +ShuffleNetV2_x1_5 +ShuffleNetV2_x2_0 +ShuffleNetV2_swish +ResNeXt50_32x4d +ResNeXt50_64x4d +ResNeXt101_32x4d +ResNeXt101_64x4d +ResNeXt152_32x4d +ResNeXt152_64x4d +ResNeXt50_vd_32x4d +ResNeXt50_vd_64x4d +ResNeXt101_vd_32x4d +ResNeXt101_vd_64x4d +ResNeXt152_vd_32x4d +ResNeXt152_vd_64x4d +SE_ResNet18_vd +SE_ResNet34_vd +SE_ResNet50_vd +SE_ResNeXt50_32x4d +SE_ResNeXt101_32x4d +SE_ResNeXt50_vd_32x4d +SENet154_vd +Res2Net50_26w_4s +Res2Net50_vd_26w_4s +Res2Net50_14w_8s +Res2Net101_vd_26w_4s +Res2Net200_vd_26w_4s +GoogLeNet +InceptionV4 +Xception41 +Xception41_deeplab +Xception65 +Xception65_deeplab +Xception71 +HRNet_W18_C +HRNet_W30_C +HRNet_W32_C +HRNet_W40_C +HRNet_W44_C +HRNet_W48_C +HRNet_W64_C +DPN68 +DPN92 +DPN98 +DPN107 +DPN131 +DenseNet121 +DenseNet161 +DenseNet169 +DenseNet201 +DenseNet264 +EfficientNetB0_small +EfficientNetB0 +EfficientNetB1 +EfficientNetB2 +EfficientNetB3 +EfficientNetB4 +EfficientNetB5 +EfficientNetB6 +EfficientNetB7 +ResNeXt101_32x8d_wsl +ResNeXt101_32x16d_wsl +ResNeXt101_32x32d_wsl +ResNeXt101_32x48d_wsl +Fix_ResNeXt101_32x48d_wsl +AlexNet +SqueezeNet1_0 +SqueezeNet1_1 +VGG11 +VGG13 +VGG16 +VGG19 +DarkNet53_ImageNet1k +ResNet50_ACNet_deploy +CSPResNet50_leaky diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/profiler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/profiler.py new file mode 100644 index 000000000..28ac46736 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/profiler.py @@ -0,0 +1,129 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import paddle +import paddle.profiler as profiler + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None +_prof = None + +class ProfilerOptions(object): + ''' + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + ''' + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + 'batch_range': [10, 20], + 'state': 'All', + 'sorted_key': 'total', + 'tracer_option': 'Default', + 'profile_path': '/tmp/profile', + 'exit_on_finished': True, + 'timer_only': True + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(' ', '').split(';'): + key, value = kv.split('=') + if key == 'batch_range': + value_list = value.replace('[', '').replace(']', '').split(',') + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[ + 1] > value_list[0]: + self._options[key] = value_list + elif key == 'exit_on_finished': + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in [ + 'state', 'sorted_key', 'tracer_option', 'profile_path' + ]: + self._options[key] = value + elif key == 'timer_only': + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError( + "ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + ''' + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. 
+ Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + ''' + if options_str is None: + return + + global _prof + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options['timer_only']) == str(True) + _prof = profiler.Profiler( + scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), + on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), + timer_only = _timer_only) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options['batch_range'][1]: + _prof.stop() + _prof.summary( + op_detail=True, + thread_sep=False, + time_unit='ms') + _prof = None + if _profiler_options['exit_on_finished']: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_load.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_load.py new file mode 100644 index 000000000..13ccab2ad --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_load.py @@ -0,0 +1,225 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import errno +import os +import json + +import paddle +from . 
import logger +from .download import get_weights_path_from_url + +__all__ = ['init_model', 'save_model', 'load_dygraph_pretrain'] + + +def _mkdir_if_not_exist(path): + """ + mkdir if not exists, ignore the exception when multiprocess mkdir together + """ + if not os.path.exists(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno == errno.EEXIST and os.path.isdir(path): + logger.warning( + 'be happy if some process has already created {}'.format( + path)) + else: + raise OSError('Failed to mkdir {}'.format(path)) + + +def _extract_student_weights(all_params, student_prefix="Student."): + s_params = { + key[len(student_prefix):]: all_params[key] + for key in all_params if student_prefix in key + } + return s_params + + +def _set_ssld_pretrained(pretrained_path, + use_ssld=False, + use_ssld_stage1_pretrained=False): + if use_ssld and "ssld" not in pretrained_path: + pretrained_path = pretrained_path.replace("_pretrained", + "_ssld_pretrained") + if use_ssld_stage1_pretrained and "ssld" in pretrained_path: + pretrained_path = pretrained_path.replace("ssld_pretrained", + "ssld_stage1_pretrained") + return pretrained_path + + +def load_dygraph_pretrain(model, + pretrained_path, + use_ssld=False, + use_ssld_stage1_pretrained=False, + use_imagenet22k_pretrained=False, + use_imagenet22kto1k_pretrained=False): + if pretrained_path.startswith(("http://", "https://")): + pretrained_path = _set_ssld_pretrained( + pretrained_path, + use_ssld=use_ssld, + use_ssld_stage1_pretrained=use_ssld_stage1_pretrained) + if use_imagenet22k_pretrained: + pretrained_path = pretrained_path.replace("_pretrained", + "_22k_pretrained") + if use_imagenet22kto1k_pretrained: + pretrained_path = pretrained_path.replace("_pretrained", + "_22kto1k_pretrained") + pretrained_path = get_weights_path_from_url(pretrained_path) + if not pretrained_path.endswith('.pdparams'): + pretrained_path = pretrained_path + '.pdparams' + if not os.path.exists(pretrained_path): + raise ValueError("Model pretrain path {} does not " + "exists.".format(pretrained_path)) + param_state_dict = paddle.load(pretrained_path) + if isinstance(model, list): + for m in model: + if hasattr(m, 'set_dict'): + m.set_dict(param_state_dict) + else: + model.set_dict(param_state_dict) + logger.info("Finish load pretrained model from {}".format(pretrained_path)) + return + + +def load_distillation_model(model, pretrained_model): + logger.info("In distillation mode, teacher model will be " + "loaded firstly before student model.") + + if not isinstance(pretrained_model, list): + pretrained_model = [pretrained_model] + + teacher = model.teacher if hasattr(model, + "teacher") else model._layers.teacher + student = model.student if hasattr(model, + "student") else model._layers.student + load_dygraph_pretrain(teacher, path=pretrained_model[0]) + logger.info("Finish initing teacher model from {}".format(pretrained_model)) + # load student model + if len(pretrained_model) >= 2: + load_dygraph_pretrain(student, path=pretrained_model[1]) + logger.info("Finish initing student model from {}".format( + pretrained_model)) + + +def init_model(config, + net, + optimizer=None, + loss: paddle.nn.Layer=None, + ema=None): + """ + load model from checkpoint or pretrained_model + """ + checkpoints = config.get('checkpoints') + if checkpoints and optimizer is not None: + assert os.path.exists(checkpoints + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(checkpoints) + assert os.path.exists(checkpoints + ".pdopt"), \ + "Given dir {}.pdopt not 
exist.".format(checkpoints) + # load state dict + opti_dict = paddle.load(checkpoints + ".pdopt") + metric_dict = paddle.load(checkpoints + ".pdstates") + if ema is not None: + assert os.path.exists(checkpoints + ".pdema"), \ + "Given dir {}.pdema not exist.".format(checkpoints) + para_dict = paddle.load(checkpoints + ".pdema") + para_ema_dict = paddle.load(checkpoints + ".pdparams") + ema.set_state_dict(para_ema_dict) + else: + para_dict = paddle.load(checkpoints + ".pdparams") + metric_dict["metric"] = 0.0 + # set state dict + net.set_state_dict(para_dict) + loss.set_state_dict(para_dict) + for i in range(len(optimizer)): + optimizer[i].set_state_dict(opti_dict[i] if isinstance( + opti_dict, list) else opti_dict) + logger.info("Finish load checkpoints from {}".format(checkpoints)) + return metric_dict + + pretrained_model = config.get('pretrained_model') + use_distillation = config.get('use_distillation', False) + if pretrained_model: + if use_distillation: + load_distillation_model(net, pretrained_model) + else: # common load + load_dygraph_pretrain(net, path=pretrained_model) + logger.info("Finish load pretrained model from {}".format( + pretrained_model)) + + +def save_model(net, + optimizer, + metric_info, + model_path, + ema=None, + model_name="", + prefix='ppcls', + loss: paddle.nn.Layer=None, + save_student_model=False): + """ + save model to the target path + """ + if paddle.distributed.get_rank() != 0: + return + + if prefix == 'best_model': + best_model_path = os.path.join(model_path, 'best_model') + _mkdir_if_not_exist(best_model_path) + + _mkdir_if_not_exist(model_path) + model_path = os.path.join(model_path, prefix) + + params_state_dict = net.state_dict() + if loss is not None: + loss_state_dict = loss.state_dict() + keys_inter = set(params_state_dict.keys()) & set(loss_state_dict.keys()) + assert len(keys_inter) == 0, \ + f"keys in model and loss state_dict must be unique, but got intersection {keys_inter}" + params_state_dict.update(loss_state_dict) + + if save_student_model: + s_params = _extract_student_weights(params_state_dict) + if len(s_params) > 0: + paddle.save(s_params, model_path + "_student.pdparams") + if ema is not None: + paddle.save(params_state_dict, model_path + ".pdema") + paddle.save(ema.state_dict(), model_path + ".pdparams") + else: + paddle.save(params_state_dict, model_path + ".pdparams") + + if prefix == 'best_model': + best_model_path = os.path.join(best_model_path, 'model') + paddle.save(params_state_dict, best_model_path + ".pdparams") + paddle.save([opt.state_dict() for opt in optimizer], model_path + ".pdopt") + paddle.save(metric_info, model_path + ".pdstates") + logger.info("Already save model in {}".format(model_path)) + + +def save_model_info(model_info, save_path, prefix): + """ + save model info to the target path + """ + if paddle.distributed.get_rank() != 0: + return + save_path = os.path.join(save_path, prefix) + if not os.path.exists(save_path): + os.makedirs(save_path) + with open(os.path.join(save_path, f'{prefix}.info.json'), 'w') as f: + json.dump(model_info, f) + logger.info("Already save model info in {}".format(save_path)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_result.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_result.py new file mode 100644 index 000000000..f7613db36 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/save_result.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import yaml +import paddle + +from . import logger + + +def save_predict_result(save_path, result): + if os.path.splitext(save_path)[-1] == '': + if save_path[-1] == "/": + save_path = save_path[:-1] + save_path = save_path + '.json' + elif os.path.splitext(save_path)[-1] == '.json': + save_path = save_path + else: + raise Exception( + f"{save_path} is invalid input path, only files in json format are supported." + ) + + if os.path.exists(save_path): + logger.warning(f"The file {save_path} will be overwritten.") + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(result, f) + + +def update_train_results(config, + prefix, + metric_info, + done_flag=False, + last_num=5, + ema=False): + + if paddle.distributed.get_rank() != 0: + return + + assert last_num >= 1 + train_results_path = os.path.join(config["Global"]["output_dir"], + "train_results.json") + save_model_tag = ["pdparams", "pdopt", "pdstates"] + save_inference_tag = [ + "inference_config", "pdmodel", "pdiparams", "pdiparams.info" + ] + if ema: + save_model_tag.append("pdema") + if os.path.exists(train_results_path): + with open(train_results_path, "r") as fp: + train_results = json.load(fp) + else: + train_results = {} + train_results["model_name"] = config["Global"].get("pdx_model_name", + None) + if config.get("infer", None): + train_results["label_dict"] = config["Infer"]["PostProcess"].get( + "class_id_map_file", "") + else: + train_results["label_dict"] = "" + train_results["train_log"] = "train.log" + train_results["visualdl_log"] = "" + train_results["config"] = "config.yaml" + train_results["models"] = {} + for i in range(1, last_num + 1): + train_results["models"][f"last_{i}"] = {} + train_results["models"]["best"] = {} + train_results["done_flag"] = done_flag + if prefix == "best_model": + train_results["models"]["best"]["score"] = metric_info["metric"] + for tag in save_model_tag: + train_results["models"]["best"][tag] = os.path.join( + prefix, f"{prefix}.{tag}") + for tag in save_inference_tag: + train_results["models"]["best"][tag] = os.path.join( + prefix, "inference", f"inference.{tag}" + if tag != "inference_config" else "inference.yml") + else: + for i in range(last_num - 1, 0, -1): + train_results["models"][f"last_{i + 1}"] = train_results["models"][ + f"last_{i}"].copy() + train_results["models"][f"last_{1}"]["score"] = metric_info["metric"] + for tag in save_model_tag: + train_results["models"][f"last_{1}"][tag] = os.path.join( + prefix, f"{prefix}.{tag}") + for tag in save_inference_tag: + train_results["models"][f"last_{1}"][tag] = os.path.join( + prefix, "inference", f"inference.{tag}" + if tag != "inference_config" else "inference.yml") + + with open(train_results_path, "w") as fp: + json.dump(train_results, fp) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/vehicle_attribute_label_list.txt b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/vehicle_attribute_label_list.txt new file mode 100644 index 
000000000..03ad382fe --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/utils/vehicle_attribute_label_list.txt @@ -0,0 +1,19 @@ +0 yellow(黄色) +1 orange(橙色) +2 green(绿色) +3 gray(灰色) +4 red(红色) +5 blue(蓝色) +6 white(白色) +7 golden(金色) +8 brown(棕色) +9 black(黑色) +10 sedan(轿车) +11 suv(SUV) +12 van(厢式车) +13 hatchback(掀背车) +14 mpv(多用途车) +15 pickup(皮卡) +16 bus(公共汽车) +17 truck(卡车) +18 estate(旅行车) diff --git a/cv/classification/resnet50/paddlepaddle/requirements.txt b/cv/classification/resnet50/paddlepaddle/requirements.txt new file mode 100644 index 000000000..9d896504f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/requirements.txt @@ -0,0 +1,11 @@ +prettytable +ujson +opencv-python<=4.6.0.66 +pillow>=9.0.0 +tqdm +PyYAML>=5.1 +visualdl>=2.2.0 +scipy>=1.0.0 +scikit-learn>=0.21.0 +gast==0.3.3 +easydict \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/run_resnet50.sh b/cv/classification/resnet50/paddlepaddle/run_resnet50.sh new file mode 100644 index 000000000..d1edf8e97 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/run_resnet50.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export PYTHONPATH=./:${PYTHONPATH} + +pip3 install -r requirements.txt + +python3 train.py -c ./ppcls/configs/quick_start/ResNet50_vd.yaml +exit $? \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/run_resnet50_dist.sh b/cv/classification/resnet50/paddlepaddle/run_resnet50_dist.sh new file mode 100644 index 000000000..8b5ad479e --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/run_resnet50_dist.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +export PYTHONPATH=./:${PYTHONPATH} + +pip3 install -r requirements.txt + +python3 -m paddle.distributed.launch -ips=127.0.0.1 train.py -c ./ppcls/configs/quick_start/ResNet50_vd.yaml +exit $? \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/train.py b/cv/classification/resnet50/paddlepaddle/train.py new file mode 100644 index 000000000..3eab7fdd0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/train.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
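The options string accepted by add_profiler_step() in profiler.py above can be exercised on its own; a minimal sketch, with illustrative option values and an assumed import path:

# Minimal sketch, not part of the patch: parses a profiler options string the
# same way add_profiler_step() does. Values are illustrative.
from ppcls.utils.profiler import ProfilerOptions   # import path is an assumption

opts = ProfilerOptions("batch_range=[10, 20];timer_only=False;exit_on_finished=True")
print(opts['batch_range'])        # [10, 20]
print(opts['timer_only'])         # note: kept as the raw string "False"
print(opts['exit_on_finished'])   # True (parsed into a boolean)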
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import sys +__dir__ = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.abspath(os.path.join(__dir__, '../'))) + +from ppcls.utils import config +from ppcls.engine.engine import Engine + +if __name__ == "__main__": + args = config.parse_args() + config = config.get_config( + args.config, overrides=args.override, show=False) + config.profiler_options = args.profiler_options + if "BATCH_SIZE" in os.environ: + config["DataLoader"]["sampler"]["batch_size"] = int(os.environ["BATCH_SIZE"]) + + try: + from dltest import show_training_arguments + show_training_arguments([args, config]) + except: + pass + engine = Engine(config, mode="train") + engine.train() diff --git a/tests/executables/resnet/init_paddle.sh b/tests/executables/resnet/init_paddle.sh index 39528b7a4..fdb475bd8 100644 --- a/tests/executables/resnet/init_paddle.sh +++ b/tests/executables/resnet/init_paddle.sh @@ -13,7 +13,7 @@ if [ ! -d "${DATASET_DIR}/flowers102" ]; then tar zxf ${DATASET_DIR}/flowers102.tgz -C ${DATASET_DIR} fi -RESNET_PADDLE_DIR=${PRJ_DIR}/official/cv/classification/resnet/paddle +RESNET_PADDLE_DIR=${PRJ_DIR}/cv/classification/resnet50/paddlepaddle cd ${RESNET_PADDLE_DIR} pip3 install -r requirements.txt diff --git a/tests/executables/resnet/train_resnet50_dist_paddle.sh b/tests/executables/resnet/train_resnet50_dist_paddle.sh index e3aaf503c..b4dccfde7 100644 --- a/tests/executables/resnet/train_resnet50_dist_paddle.sh +++ b/tests/executables/resnet/train_resnet50_dist_paddle.sh @@ -8,7 +8,7 @@ if [[ -d ${OUTPUT_DIR} ]]; then mkdir -p ${OUTPUT_DIR} fi -RESNET_PADDLE_DIR=${PROJECT_DIR}/official/cv/classification/resnet/paddle +RESNET_PADDLE_DIR=${PROJECT_DIR}/cv/classification/resnet50/paddlepaddle/ cd ${RESNET_PADDLE_DIR} ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \ -- Gitee From 4d292e3a7126b984321e5f96415b31333703f94e Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 14:56:03 +0800 Subject: [PATCH 15/20] sync ssd pytorch all --- .../ssd/pytorch/{ci/prepare.sh => Dockerfile} | 52 +- cv/detection/ssd/pytorch/async_evaluator.py | 95 +++ cv/detection/ssd/pytorch/base/BaseDockerfile | 58 -- .../ssd/pytorch/base/config/__init__.py | 4 - cv/detection/ssd/pytorch/base/config/_base.py | 143 ----- .../ssd/pytorch/base/config/config_manager.py | 165 ------ .../ssd/pytorch/base/config/mutable_params.py | 23 - .../ssd/pytorch/base/dataloaders/__init__.py | 1 - .../pytorch/base/dataloaders/dataloader.py | 13 - .../base/dataloaders/native_pipeline.py | 119 ---- .../ssd/pytorch/base/model/__init__.py | 22 - .../ssd/pytorch/base/model/layers/__init__.py | 3 - .../ssd/pytorch/base/model/losses/__init__.py | 2 - .../ssd/pytorch/base/model/models/__init__.py | 0 .../ssd/pytorch/base/model/models/resnet.py | 291 --------- .../ssd/pytorch/base/model/models/ssd300.py | 141 ----- .../ssd/pytorch/base/optimizers/__init__.py | 1 - .../ssd/pytorch/base/optimizers/factory.py | 11 - cv/detection/ssd/pytorch/base/prepare.py | 316 ---------- cv/detection/ssd/pytorch/base/run_train.py | 123 ---- cv/detection/ssd/pytorch/base/run_training.sh | 43 -- .../ssd/pytorch/base/run_with_docker.sh | 224 ------- cv/detection/ssd/pytorch/base/setup.py | 102 ---- .../pytorch/base/test/dali_dataloader_test.py | 48 -- .../ssd/pytorch/base/test/dataloader_test.py | 47 -- .../ssd/pytorch/base/train/__init__.py | 0 .../ssd/pytorch/base/train/evaluator.py | 
116 ---- .../ssd/pytorch/base/train/event/__init__.py | 4 - .../ssd/pytorch/base/train/event/base.py | 64 -- .../pytorch/base/train/event/base_adapter.py | 30 - .../ssd/pytorch/base/train/event/compose.py | 109 ---- .../ssd/pytorch/base/train/event/log.py | 145 ----- .../ssd/pytorch/base/train/trainer.py | 229 -------- .../ssd/pytorch/base/train/training_state.py | 53 -- .../ssd/pytorch/base/utils/__init__.py | 17 - cv/detection/ssd/pytorch/base/utils/check.py | 72 --- cv/detection/ssd/pytorch/base/utils/dist.py | 165 ------ .../ssd/pytorch/base/utils/logging.py | 235 -------- cv/detection/ssd/pytorch/base/utils/paths.py | 18 - .../model/losses/loss.py => base_model.py} | 2 +- cv/detection/ssd/pytorch/bind.sh | 212 +++++++ .../ssd/pytorch/{base => }/bind_launch.py | 70 +-- .../ssd/pytorch/{base => }/box_coder.py | 119 ++-- cv/detection/ssd/pytorch/build_ssd.sh | 25 + cv/detection/ssd/pytorch/clean_ssd.sh | 10 + .../csrc => csrc_pt1}/box_encoder_cuda.cu | 6 +- .../{iluvatar/csrc => csrc_pt1}/interface.cpp | 0 .../csrc => csrc_pt1}/nhwc/Descriptors.cpp | 0 .../csrc => csrc_pt1}/nhwc/Descriptors.h | 1 + .../csrc => csrc_pt1}/nhwc/Exceptions.h | 0 .../csrc => csrc_pt1}/nhwc/ParamsHash.h | 0 .../csrc => csrc_pt1}/nhwc/batch_norm.cu | 24 +- .../{nvidia/csrc => csrc_pt1}/nhwc/conv.cpp | 24 +- .../csrc => csrc_pt1}/nhwc/max_pool.cu | 2 +- .../pytorch/{nvidia/csrc => csrc_pt1}/nms.cu | 5 +- .../csrc => csrc_pt1}/random_horiz_flip.cu | 6 +- .../csrc => csrc_pt2}/box_encoder_cuda.cu | 6 +- .../{nvidia/csrc => csrc_pt2}/interface.cpp | 0 .../csrc => csrc_pt2}/nhwc/Descriptors.cpp | 2 +- .../csrc => csrc_pt2}/nhwc/Descriptors.h | 3 + .../csrc => csrc_pt2}/nhwc/Exceptions.h | 0 .../csrc => csrc_pt2}/nhwc/ParamsHash.h | 0 .../csrc => csrc_pt2}/nhwc/batch_norm.cu | 24 +- .../{iluvatar/csrc => csrc_pt2}/nhwc/conv.cpp | 26 +- .../csrc => csrc_pt2}/nhwc/max_pool.cu | 2 +- .../{iluvatar/csrc => csrc_pt2}/nms.cu | 5 +- .../csrc => csrc_pt2}/random_horiz_flip.cu | 8 +- .../dataloaders => data}/build_pipeline.py | 37 +- .../dataloaders => data}/dali_iterator.py | 17 +- .../dataloaders => data}/dali_pipeline.py | 33 +- .../dataloaders => data}/input_iterators.py | 4 + .../ssd/pytorch/data/native_pipeline.py | 165 ++++++ .../{base/dataloaders => data}/prefetcher.py | 0 .../{base/dataloaders => data}/sampler.py | 0 .../pytorch/default/config/config_V100x1x1.py | 32 - .../pytorch/default/config/config_V100x1x8.py | 31 - .../default/config/config_nv_V100x1x8.py | 32 - .../pytorch/default/config/training_event.py | 79 --- cv/detection/ssd/pytorch/download_dataset.sh | 25 + cv/detection/ssd/pytorch/eval.py | 258 ++++++++ .../pytorch/{base => }/fused_color_jitter.py | 0 .../iluvatar/config/config_V100x1x1.py | 44 -- .../iluvatar/config/config_V100x1x8.py | 44 -- .../iluvatar/config/config_V100x1x8_2.1.0.py | 44 -- .../iluvatar/config/config_V100x1x8_wsl.py | 44 -- .../iluvatar/config/config_nodali_V100x1x8.py | 44 -- .../iluvatar/config/config_nv_V100x1x8.py | 44 -- .../ssd/pytorch/iluvatar/config/converter.py | 15 - .../iluvatar/config/environment_variables.sh | 10 - .../ssd/pytorch/iluvatar/config/nhwc/conv.py | 99 ---- .../iluvatar/config/nhwc/test_bn_cudnn.py | 93 --- .../ssd/pytorch/iluvatar/config/ssd300.py | 230 -------- .../pytorch/iluvatar/config/training_event.py | 103 ---- cv/detection/ssd/pytorch/iluvatar/reset.sh | 22 - cv/detection/ssd/pytorch/install_ssd.sh | 33 ++ .../nhwc/test_conv.py => master_params.py} | 64 +- cv/detection/ssd/pytorch/mlperf_log_utils.py | 34 ++ 
cv/detection/ssd/pytorch/mlperf_logger.py | 101 ++++ .../{iluvatar/config => }/nhwc/batch_norm.py | 0 cv/detection/ssd/pytorch/nhwc/cifar10_nhwc.py | 229 ++++++++ .../pytorch/{nvidia/config => }/nhwc/conv.py | 12 +- .../{iluvatar/config => }/nhwc/max_pool.py | 0 cv/detection/ssd/pytorch/nhwc/mnist_nhwc.py | 133 +++++ .../ssd/pytorch/nhwc/resnet_nhwc_cifar10.py | 135 +++++ .../{nvidia/config => }/nhwc/test_bn_cudnn.py | 8 +- .../{nvidia/config => }/nhwc/test_conv.py | 6 +- .../config => }/nhwc/test_max_pool.py | 7 +- .../ssd/pytorch/nvidia/config/Dockerfile | 14 - .../pytorch/nvidia/config/config_V100x1x1.py | 44 -- .../pytorch/nvidia/config/config_V100x1x8.py | 44 -- .../nvidia/config/config_nodali_V100x1x8.py | 44 -- .../ssd/pytorch/nvidia/config/converter.py | 15 - .../nvidia/config/environment_variables.sh | 10 - .../pytorch/nvidia/config/nhwc/batch_norm.py | 77 --- .../pytorch/nvidia/config/nhwc/max_pool.py | 60 -- .../nvidia/config/nhwc/test_max_pool.py | 69 --- .../ssd/pytorch/nvidia/config/resnet.py | 236 -------- .../pytorch/nvidia/config/training_event.py | 103 ---- cv/detection/ssd/pytorch/nvidia/reset.sh | 9 - cv/detection/ssd/pytorch/nvidia/setup.py | 105 ---- .../{base/model/losses => }/opt_loss.py | 14 + cv/detection/ssd/pytorch/parse_config.py | 172 ++++++ .../prepare_json.py => prepare-json.py} | 0 .../ssd/pytorch/{base => }/requirements.txt | 4 +- .../pytorch/{iluvatar/config => }/resnet.py | 39 +- cv/detection/ssd/pytorch/run.sub | 62 ++ cv/detection/ssd/pytorch/run_and_time.sh | 82 +++ cv/detection/ssd/pytorch/run_ddp_mm.sh | 126 ++++ cv/detection/ssd/pytorch/run_with_docker.sh | 68 +++ .../ssd/pytorch/{iluvatar => }/setup.py | 0 .../ssd/pytorch/{nvidia/config => }/ssd300.py | 9 +- cv/detection/ssd/pytorch/test.py | 204 +++++++ .../pytorch/{base => }/test/box_coder_test.py | 34 +- .../{base => }/test/cuda_encoder_test.py | 30 +- .../pytorch/{base => }/test/opt_loss_test.py | 30 +- cv/detection/ssd/pytorch/train.py | 554 ++++++++++++++++++ .../{base/dataloaders/util.py => utils.py} | 416 +++++++++---- cv/detection/ssd/pytorch/visualize.py | 174 ++++++ tests/executables/ssd/init_torch.sh | 2 +- tests/executables/ssd/train_ssd_amp_torch.sh | 2 +- 140 files changed, 3561 insertions(+), 5488 deletions(-) rename cv/detection/ssd/pytorch/{ci/prepare.sh => Dockerfile} (32%) create mode 100644 cv/detection/ssd/pytorch/async_evaluator.py delete mode 100644 cv/detection/ssd/pytorch/base/BaseDockerfile delete mode 100644 cv/detection/ssd/pytorch/base/config/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/config/_base.py delete mode 100644 cv/detection/ssd/pytorch/base/config/config_manager.py delete mode 100644 cv/detection/ssd/pytorch/base/config/mutable_params.py delete mode 100644 cv/detection/ssd/pytorch/base/dataloaders/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/dataloaders/dataloader.py delete mode 100644 cv/detection/ssd/pytorch/base/dataloaders/native_pipeline.py delete mode 100644 cv/detection/ssd/pytorch/base/model/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/model/layers/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/model/losses/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/model/models/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/model/models/resnet.py delete mode 100644 cv/detection/ssd/pytorch/base/model/models/ssd300.py delete mode 100644 cv/detection/ssd/pytorch/base/optimizers/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/optimizers/factory.py delete mode 
100644 cv/detection/ssd/pytorch/base/prepare.py delete mode 100644 cv/detection/ssd/pytorch/base/run_train.py delete mode 100644 cv/detection/ssd/pytorch/base/run_training.sh delete mode 100644 cv/detection/ssd/pytorch/base/run_with_docker.sh delete mode 100644 cv/detection/ssd/pytorch/base/setup.py delete mode 100644 cv/detection/ssd/pytorch/base/test/dali_dataloader_test.py delete mode 100644 cv/detection/ssd/pytorch/base/test/dataloader_test.py delete mode 100644 cv/detection/ssd/pytorch/base/train/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/train/evaluator.py delete mode 100644 cv/detection/ssd/pytorch/base/train/event/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/train/event/base.py delete mode 100644 cv/detection/ssd/pytorch/base/train/event/base_adapter.py delete mode 100644 cv/detection/ssd/pytorch/base/train/event/compose.py delete mode 100644 cv/detection/ssd/pytorch/base/train/event/log.py delete mode 100644 cv/detection/ssd/pytorch/base/train/trainer.py delete mode 100644 cv/detection/ssd/pytorch/base/train/training_state.py delete mode 100644 cv/detection/ssd/pytorch/base/utils/__init__.py delete mode 100644 cv/detection/ssd/pytorch/base/utils/check.py delete mode 100644 cv/detection/ssd/pytorch/base/utils/dist.py delete mode 100644 cv/detection/ssd/pytorch/base/utils/logging.py delete mode 100644 cv/detection/ssd/pytorch/base/utils/paths.py rename cv/detection/ssd/pytorch/{base/model/losses/loss.py => base_model.py} (100%) create mode 100644 cv/detection/ssd/pytorch/bind.sh rename cv/detection/ssd/pytorch/{base => }/bind_launch.py (75%) rename cv/detection/ssd/pytorch/{base => }/box_coder.py (76%) create mode 100644 cv/detection/ssd/pytorch/build_ssd.sh create mode 100644 cv/detection/ssd/pytorch/clean_ssd.sh rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt1}/box_encoder_cuda.cu (99%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt1}/interface.cpp (100%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt1}/nhwc/Descriptors.cpp (100%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt1}/nhwc/Descriptors.h (99%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt1}/nhwc/Exceptions.h (100%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt1}/nhwc/ParamsHash.h (100%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt1}/nhwc/batch_norm.cu (96%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt1}/nhwc/conv.cpp (98%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt1}/nhwc/max_pool.cu (99%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt1}/nms.cu (99%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt1}/random_horiz_flip.cu (98%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt2}/box_encoder_cuda.cu (99%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt2}/interface.cpp (100%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt2}/nhwc/Descriptors.cpp (99%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt2}/nhwc/Descriptors.h (99%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt2}/nhwc/Exceptions.h (100%) rename cv/detection/ssd/pytorch/{nvidia/csrc => csrc_pt2}/nhwc/ParamsHash.h (100%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt2}/nhwc/batch_norm.cu (96%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt2}/nhwc/conv.cpp (98%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt2}/nhwc/max_pool.cu (99%) rename cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt2}/nms.cu (99%) rename 
cv/detection/ssd/pytorch/{iluvatar/csrc => csrc_pt2}/random_horiz_flip.cu (97%) rename cv/detection/ssd/pytorch/{base/dataloaders => data}/build_pipeline.py (58%) rename cv/detection/ssd/pytorch/{base/dataloaders => data}/dali_iterator.py (94%) rename cv/detection/ssd/pytorch/{base/dataloaders => data}/dali_pipeline.py (42%) rename cv/detection/ssd/pytorch/{base/dataloaders => data}/input_iterators.py (99%) create mode 100644 cv/detection/ssd/pytorch/data/native_pipeline.py rename cv/detection/ssd/pytorch/{base/dataloaders => data}/prefetcher.py (100%) rename cv/detection/ssd/pytorch/{base/dataloaders => data}/sampler.py (100%) delete mode 100644 cv/detection/ssd/pytorch/default/config/config_V100x1x1.py delete mode 100644 cv/detection/ssd/pytorch/default/config/config_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/default/config/config_nv_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/default/config/training_event.py create mode 100644 cv/detection/ssd/pytorch/download_dataset.sh create mode 100644 cv/detection/ssd/pytorch/eval.py rename cv/detection/ssd/pytorch/{base => }/fused_color_jitter.py (100%) delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x1.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_2.1.0.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_wsl.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/config_nodali_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/config_nv_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/converter.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/environment_variables.sh delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/nhwc/conv.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_bn_cudnn.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/ssd300.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/config/training_event.py delete mode 100644 cv/detection/ssd/pytorch/iluvatar/reset.sh create mode 100644 cv/detection/ssd/pytorch/install_ssd.sh rename cv/detection/ssd/pytorch/{iluvatar/config/nhwc/test_conv.py => master_params.py} (33%) create mode 100644 cv/detection/ssd/pytorch/mlperf_log_utils.py create mode 100644 cv/detection/ssd/pytorch/mlperf_logger.py rename cv/detection/ssd/pytorch/{iluvatar/config => }/nhwc/batch_norm.py (100%) create mode 100644 cv/detection/ssd/pytorch/nhwc/cifar10_nhwc.py rename cv/detection/ssd/pytorch/{nvidia/config => }/nhwc/conv.py (92%) rename cv/detection/ssd/pytorch/{iluvatar/config => }/nhwc/max_pool.py (100%) create mode 100644 cv/detection/ssd/pytorch/nhwc/mnist_nhwc.py create mode 100644 cv/detection/ssd/pytorch/nhwc/resnet_nhwc_cifar10.py rename cv/detection/ssd/pytorch/{nvidia/config => }/nhwc/test_bn_cudnn.py (97%) rename cv/detection/ssd/pytorch/{nvidia/config => }/nhwc/test_conv.py (96%) rename cv/detection/ssd/pytorch/{iluvatar/config => }/nhwc/test_max_pool.py (96%) delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/Dockerfile delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/config_V100x1x1.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/config_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/config_nodali_V100x1x8.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/converter.py delete mode 100644 
cv/detection/ssd/pytorch/nvidia/config/environment_variables.sh delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/nhwc/batch_norm.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/nhwc/max_pool.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/nhwc/test_max_pool.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/resnet.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/config/training_event.py delete mode 100644 cv/detection/ssd/pytorch/nvidia/reset.sh delete mode 100644 cv/detection/ssd/pytorch/nvidia/setup.py rename cv/detection/ssd/pytorch/{base/model/losses => }/opt_loss.py (77%) create mode 100644 cv/detection/ssd/pytorch/parse_config.py rename cv/detection/ssd/pytorch/{base/data_preprocessing/prepare_json.py => prepare-json.py} (100%) rename cv/detection/ssd/pytorch/{base => }/requirements.txt (35%) rename cv/detection/ssd/pytorch/{iluvatar/config => }/resnet.py (88%) create mode 100644 cv/detection/ssd/pytorch/run.sub create mode 100644 cv/detection/ssd/pytorch/run_and_time.sh create mode 100644 cv/detection/ssd/pytorch/run_ddp_mm.sh create mode 100644 cv/detection/ssd/pytorch/run_with_docker.sh rename cv/detection/ssd/pytorch/{iluvatar => }/setup.py (100%) rename cv/detection/ssd/pytorch/{nvidia/config => }/ssd300.py (98%) create mode 100644 cv/detection/ssd/pytorch/test.py rename cv/detection/ssd/pytorch/{base => }/test/box_coder_test.py (53%) rename cv/detection/ssd/pytorch/{base => }/test/cuda_encoder_test.py (88%) rename cv/detection/ssd/pytorch/{base => }/test/opt_loss_test.py (64%) create mode 100644 cv/detection/ssd/pytorch/train.py rename cv/detection/ssd/pytorch/{base/dataloaders/util.py => utils.py} (48%) create mode 100644 cv/detection/ssd/pytorch/visualize.py diff --git a/cv/detection/ssd/pytorch/ci/prepare.sh b/cv/detection/ssd/pytorch/Dockerfile similarity index 32% rename from cv/detection/ssd/pytorch/ci/prepare.sh rename to cv/detection/ssd/pytorch/Dockerfile index 7a9d113e8..dab6ec428 100644 --- a/cv/detection/ssd/pytorch/ci/prepare.sh +++ b/cv/detection/ssd/pytorch/Dockerfile @@ -1,10 +1,8 @@ -#!/bin/bash -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # @@ -14,22 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-set -x -## install libGL -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx -elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL -else - echo "Not Support Os" -fi +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3 +FROM ${FROM_IMAGE_NAME} -mkdir -p /home/data/perf/ssd -ln -s /mnt/deepspark/data/datasets/coco /home/data/perf/ssd/ -cp /mnt/deepspark/data/checkpoints/resnet34-333f7ec4.pth /home/data/perf/ssd/ +# Install dependencies for system configuration logger +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + infiniband-diags \ + pciutils \ + && rm -rf /var/lib/apt/lists/* -cd base -source ../iluvatar/config/environment_variables.sh -python3 prepare.py --name iluvatar --data_dir /home/data/perf/ssd -timeout 1800 bash run_training.sh --name iluvatar --config V100x1x8 --data_dir /home/data/perf/ssd --backbone_path /home/data/perf/ssd/resnet34-333f7ec4.pth \ No newline at end of file +WORKDIR opt + +# Install Python dependencies +WORKDIR /workspace/single_stage_detector + +COPY requirements.txt . +RUN pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip \ + && pip install --no-cache-dir -r requirements.txt + +# Copy SSD code and build +COPY . . +RUN pip install . + +# Configure environment variables +ENV OMP_NUM_THREADS=1 +ENV OPENCV_FOR_THREADS_NUM=1 +ENV TORCH_HOME=/workspace/single_stage_detector/torch-model-cache diff --git a/cv/detection/ssd/pytorch/async_evaluator.py b/cv/detection/ssd/pytorch/async_evaluator.py new file mode 100644 index 000000000..8c8343fff --- /dev/null +++ b/cv/detection/ssd/pytorch/async_evaluator.py @@ -0,0 +1,95 @@ +import torch + +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor + +class AsyncEvaluator(): + """ + Creates a threaded evaluator for a given device. 
+ If device == None then the current active device is used + """ + def __init__(self, num_threads=1, device=None): + self.num_threads = num_threads + # self.pool = ThreadPoolExecutor(num_threads) + self.pool = ProcessPoolExecutor(num_threads) + + self.events = {} + + def __del__(self): + for t, e in self.events.items(): + e.cancel() + + # submit given function and its arguments with an + # associated tag for bookkeeping + def submit_task(self, tag, fn, *args, **kwargs): + + # launch work + e = self.pool.submit(fn, *args, **kwargs) + + # record work + self.events[tag] = e + + # check if a task has completed + def task_done(self, tag): + return self.events[tag].done() + + # get the result of a task: + # Note: will block until completed + def task_result(self, tag): + return self.events[tag].result(timeout=None) + + # Get all currently finished tasks in a dict of + # { tag : result } + def finished_tasks(self): + ret = {} + to_remove = [] + # Check all existing tasks + for t in self.events.keys(): + done = self.events[t].done() + + if done: + ret[t] = self.task_result(t) + to_remove.append(t) + + # As soon as a task is finished we want to remove it + for t in to_remove: + self.task_remove(t) + + return ret + + # remove a task from the outstanding list + # Note: will cancel task if not completed + def task_remove(self, tag): + done = self.events[tag].done() + + # cancel task if necessary + if not done: + self.events[tag].cancel() + + # remove the entry + del self.events[tag] + + # return list of tags outstanding + def task_tags(self): + return self.events.keys() + + # wait for everything to finish + def get_all_tasks(self): + ret = {} + to_remove = [] + # Check all existing tasks + for t in self.events.keys(): + ret[t] = self.task_result(t) + to_remove.append(t) + + # As soon as a task is finished we want to remove it + for t in to_remove: + self.task_remove(t) + return ret + + # wait for everything to finish + def wait_all_tasks(self): + for t in self.events.keys(): + y = self.task_result(t) + print('task {} finished'.format(t)) + + + diff --git a/cv/detection/ssd/pytorch/base/BaseDockerfile b/cv/detection/ssd/pytorch/base/BaseDockerfile deleted file mode 100644 index 6f036a1c0..000000000 --- a/cv/detection/ssd/pytorch/base/BaseDockerfile +++ /dev/null @@ -1,58 +0,0 @@ -FROM ubuntu:18.04 - -ENV DEBIAN_FRONTEND=noninteractive -ENV PATH /root/miniconda/bin:$PATH - - -RUN apt-get update -y -RUN apt-get install -y \ - apt-utils \ - sudo \ - openssh-server \ - vim \ - git \ - curl \ - wget \ - tree \ - perl \ - kmod \ - make \ - pciutils \ - build-essential \ - python3.8-dev \ - python3-pip \ - libjpeg-dev \ - zlib1g-dev \ - unzip \ - cmake \ - bzip2 \ - cabextract \ - iputils-ping \ - pbzip2 \ - pv \ - numactl - -# Configure anaconda -RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ - bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ -# eval "$(/root/miniconda/bin/conda shell.bash hook)" && \ - /root/miniconda/bin/conda clean -tipsy && \ - ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". 
/root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc && \ - conda config --set always_yes yes --set changeps1 no - - -RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`" - -# TODO: Remove pip source -RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" - -COPY requirements.txt requirements.txt -RUN /bin/bash -c "pip3 install -r requirements.txt" - - -WORKDIR /workspace/baai-perf - - - diff --git a/cv/detection/ssd/pytorch/base/config/__init__.py b/cv/detection/ssd/pytorch/base/config/__init__.py deleted file mode 100644 index 258fb17d7..000000000 --- a/cv/detection/ssd/pytorch/base/config/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from ._base import * - -from .config_manager import activate_config_env - diff --git a/cv/detection/ssd/pytorch/base/config/_base.py b/cv/detection/ssd/pytorch/base/config/_base.py deleted file mode 100644 index 4262ad1a7..000000000 --- a/cv/detection/ssd/pytorch/base/config/_base.py +++ /dev/null @@ -1,143 +0,0 @@ -from typing import ClassVar -from train.event.base import BaseTrainingEventInterface - -# 1.Basic Configurations -# The train dir. Should contain coco datasets. -data_dir: str = None - -# The backbone dir. The path to pretrained backbone weights file of resnet34-333f7ec4.pth, 'default is to get it from online torchvision repository'. -backbone_path: str = None - -# Whether to run training. -do_train: bool = False - -# Whether to read local rank from ENVVAR -use_env: bool = False - -# device -device: str = None - -# total gpu count -n_gpu: int = 1 - -distributed: bool = False - -# local_rank for distributed training on gpus -local_rank: int = -1 - -# Communication backend for distributed training on gpus -dist_backend: str = "nccl" - -# Stop training after reaching this Masked-LM accuracy -threshold: float = 0.23 - -# NMS threshold -nms_valid_thresh: float = 0.05 - -# Total number of training epochs to perform. -epochs: int = 80 - -# Start epoch, use for training from checkpoint -epoch: int = 0 - -# Start iteration, use for training from checkpoint -iteration: int = 0 - -# Sample to begin performing eval. -evaluation: list = [5, 10, 40, 50, 55, 60, 65, 70, 75, 80] - -# Whether to save checkpoints -save_checkpoint: bool = False - -# Where to save checkpoints -output: str = "./output" - -# path to model checkpoint file -checkpoint: str = None - -# random seed for initialization -seed: int = 42 - -# frequency of logging loss. If not positive, no logging is provided for training loss -log_freq: int = 20 - -# Number of updates steps to accumualte before performing a backward/update pass. -num_classes: int = 81 - -# Input images size -input_size: int = 300 - - -# 2.Model Training Configurations -gradient_accumulation_steps: int = 1 - -# Total batch size for training. -train_batch_size: int = 120 - -# Total batch size for training. -eval_batch_size: int = 160 - -# The initial learning rate. -learning_rate: float = 2.92e-3 - -# weight decay rate. -weight_decay_rate: float = 1.6e-4 - -# decay rate of learning rate. default is 0.1. -lr_decay_factor: float = 0.1 - -# epochs at which to decay the learning rate. -lr_decay_epochs: list = [40, 50] - -# How long the learning rate will be warmed up in fraction of epochs. -warmup: int = 650 - -# MLperf rule parameter for controlling warmup curve. -warmup_factor: int = 0 - -# Loss scaling, positive power of 2 values can improve fp16 convergence. -loss_scale: float = 0.0 - - -# 3. 
Optimizer Configurations -# A object to provide some core components in training -training_event: ClassVar[BaseTrainingEventInterface] = None - -# Dataloader workers -num_workers: int = 4 - -# Whether to use 16-bit float precision instead of 32-bit -fp16: bool = False - -# Control training mode(FP32 or FP16) by opt_level using apex -# ToDo enum 0,1,2 -opt_level: int = 0 - -delay_allreduce: bool = False - -# Group of processes to collaborate on BatchNorm ops -# ToDo enum 1,2,4,8 -bn_group: int = 1 - -fast_nms: bool = False - -# Use fused color jitter -fast_cj: bool = False - -# -use_coco_ext: bool = False - -# Whether use Dali -dali: bool = False - -# Run dali in synchronous mode instead of the (default) asynchronous -dali_sync: bool = False - -# cache size (in GB) for Dali's nvjpeg caching -dali_cache: int = 0 - -# The following 4 optimization configurations must using Dali -nhwc: bool = False -pad_input: bool = False -jit: bool = False -use_nvjpeg: bool = False diff --git a/cv/detection/ssd/pytorch/base/config/config_manager.py b/cv/detection/ssd/pytorch/base/config/config_manager.py deleted file mode 100644 index e8bd00a25..000000000 --- a/cv/detection/ssd/pytorch/base/config/config_manager.py +++ /dev/null @@ -1,165 +0,0 @@ -import copy -import importlib -import inspect -import os -import sys -from argparse import ArgumentParser -from typing import Iterable, Mapping - -import config as global_config -from . import _base as base_config -from .mutable_params import mutable_params - -mutable_params = copy.copy(mutable_params) -immutable_params = set(global_config.__dict__.keys()) - set(mutable_params) - - -def get_config(config_path: str): - if os.path.exists(config_path): - abs_path = config_path - sys.path.append(os.path.dirname(abs_path)) - config_path = os.path.basename(config_path).replace(".py", "") - try: - module = importlib.import_module(config_path) - except Exception as ex: - sys.path.pop(-1) - raise ex - sys.path.pop(-1) - else: - raise FileNotFoundError("Not found config:", config_path) - - return module - - -def get_annotations(other_modules: list=None): - annotations = dict() - - if "__annotations__" in base_config.__dict__: - annotations.update(base_config.__dict__["__annotations__"]) - - if other_modules is not None: - for mod in other_modules: - if isinstance(mod, str): - mod = get_config(mod) - if "__annotations__" in mod.__dict__: - annotations.update(mod.__dict__["__annotations__"]) - - return annotations - - -def is_property(name: str, value): - status = [ - not name.startswith('__'), - not callable(value), - not inspect.isclass(value), - not inspect.ismodule(value), - not inspect.ismethod(value), - not inspect.isfunction(value), - not inspect.isbuiltin(value), - ] - - return all(status) - - -def get_properties_from_config(config): - if not isinstance(config, Mapping): - config = config.__dict__ - properties = dict() - for name, value in config.items(): - if is_property(name, value): - properties[name] = value - - return properties - - -def add_to_argparser(config: dict, parser: ArgumentParser, other_modules: list=None): - annotations = get_annotations(other_modules) - - def get_property_type(name, value): - if value is not None: - return type(value) - if name in annotations: - return annotations[name] - return str - - def add_args(parser, name, value, prefix=''): - dtype = get_property_type(prefix + name, value) - - if dtype == str: - parser.add_argument('--' + prefix + name, type=str, default=None) - elif dtype == int: - parser.add_argument('--' + prefix + name, 
type=int, default=None) - elif dtype == float: - parser.add_argument('--' + prefix + name, type=float, default=None) - elif dtype == bool: - parser.add_argument('--' + prefix + name, action=f"store_{str(not value).lower()}", default=None) - elif isinstance(value, Mapping): - for k, v in value.items(): - add_args(parser, k, v, prefix=prefix + name + ".") - elif isinstance(value, Iterable) and not isinstance(value, Mapping): - parser.add_argument('--' + prefix + name, type=type(value[0]), nargs='+', default=None) - # else: - # print(f'WARN: Cannot parse key {prefix + name} of type {type(value)}.') - - for name, value in config.items(): - if not is_property(name, value): - continue - add_args(parser, name, value) - - -def _merge_dict_to_config(src: dict, dist: dict, ignore_none=True): - for arg, value in src.items(): - if ignore_none and value is None: - continue - dist[arg] = value - - -def parse_from_args(config: dict, parser=None, other_modules: list=None, with_config_env_name: bool=False): - if parser is None: - parser = ArgumentParser() - - add_to_argparser(config, parser, other_modules) - if with_config_env_name: - parser.add_argument("config", type=str, help="Config name") - - args = parser.parse_args() - return args - - -def activate_config_env(name=None, parser=None, parse_args=True, with_config_env_name: bool=False): - global_config_copy_ = copy.copy(global_config.__dict__) - - if parse_args: - args_dict = dict() - for mutable_param in mutable_params: - args_dict[mutable_param] = global_config_copy_[mutable_param] - args = parse_from_args(args_dict, parser, with_config_env_name=with_config_env_name) - del args_dict - if name is None and with_config_env_name: - name = args.config - - if name is None: - raise RuntimeError("Argument `name` must be given.") - - external_module_params = copy.copy(get_config(name).__dict__) - for immutable_param in immutable_params: - if immutable_param in external_module_params: - external_module_params.pop(immutable_param) - - _merge_dict_to_config(global_config_copy_, global_config.__dict__) - _merge_dict_to_config(external_module_params, global_config.__dict__) - if parse_args: - _merge_dict_to_config(args.__dict__, global_config.__dict__) - - -def print_config(config=None): - if config is None: - config = global_config - properties = get_properties_from_config(config) - config_fields = [] - for name, value in properties.items(): - config_fields.append(f"{name}={value}") - - config_fields = ", ".join(config_fields) - config_str = f"Config({config_fields})" - print(config_str) \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/config/mutable_params.py b/cv/detection/ssd/pytorch/base/config/mutable_params.py deleted file mode 100644 index b5bc1d41f..000000000 --- a/cv/detection/ssd/pytorch/base/config/mutable_params.py +++ /dev/null @@ -1,23 +0,0 @@ -mutable_params = [ - "dist_backend", "seed", - "train_batch_size", "eval_batch_size", "learning_rate", "weight_decay_rate", "lr_decay_factor", "lr_decay_epochs", - "warmup", "warmup_factor", "loss_scale", - "gradient_accumulation_steps", "fp16", "opt_level", "delay_allreduce", "fast_nms", "fast_cj", - "dali", "dali_cache", "nhwc", "pad_input", "jit", "use_nvjpeg", - "training_event", -] - -mutable_params += [ - "local_rank", - "do_train", - "data_dir", - "backbone_path", - "log_freq", -] - -# only use for debug and fine tune. 
-mutable_params += [ - "save_checkpoint", - "output", - "checkpoint" -] \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/dataloaders/__init__.py b/cv/detection/ssd/pytorch/base/dataloaders/__init__.py deleted file mode 100644 index 01ffe4e37..000000000 --- a/cv/detection/ssd/pytorch/base/dataloaders/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .dataloader import create_train_dataloader, create_eval_dataloader \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/dataloaders/dataloader.py b/cv/detection/ssd/pytorch/base/dataloaders/dataloader.py deleted file mode 100644 index a05c764b5..000000000 --- a/cv/detection/ssd/pytorch/base/dataloaders/dataloader.py +++ /dev/null @@ -1,13 +0,0 @@ -import torch - -from .build_pipeline import prebuild_pipeline, build_pipeline - -def create_train_dataloader(config): - train_pipe = prebuild_pipeline(config) - train_loader, epoch_size, train_sampler = build_pipeline(config, training=True, pipe=train_pipe) - return train_loader, epoch_size, train_sampler - - -def create_eval_dataloader(config): - val_loader, inv_map, cocoGt = build_pipeline(config, training=False) - return val_loader, inv_map, cocoGt diff --git a/cv/detection/ssd/pytorch/base/dataloaders/native_pipeline.py b/cv/detection/ssd/pytorch/base/dataloaders/native_pipeline.py deleted file mode 100644 index 27afce830..000000000 --- a/cv/detection/ssd/pytorch/base/dataloaders/native_pipeline.py +++ /dev/null @@ -1,119 +0,0 @@ -import torch -import os - -from functools import partial - -from torch.utils.data import DataLoader - -from box_coder import dboxes300_coco -from .util import COCODetection, SSDTransformer -from .sampler import GeneralDistributedSampler - -from pycocotools.coco import COCO -import numpy as np - - -def SSDCollator(batch, dali): - """ - :param batch: batch data, [image, image_id, (htot, wtot), bboxes, labels] - if Dali is False: - a batch is like: - [ - [torch.Size([3, 300, 300]), 152915, (262, 386), torch.Size([8732, 4]), torch.Size([8732])], - [torch.Size([3, 300, 300]), 260111, (480, 640), torch.Size([8732, 4]), torch.Size([8732])] - .... - ] - if Dali is True: - This function will not be called. - :param dali: whether use Dali - :return: - """ - images = [] - image_ids = [] - image_sizes = [] - bboxes = [] - bbox_offsets = [0] - labels = [] - for img, img_id, img_size, bbox, label in batch: - images.append(img.view(1, *img.shape)) - image_ids.append(img_id) - image_sizes.append(img_size) - bboxes.append(bbox) - labels.append(label) - bbox_offsets.append(bbox_offsets[-1] + bbox.shape[0]) - - images = torch.cat(images) - bboxes = torch.cat(bboxes) - labels = torch.cat(labels) - if dali: - bbox_offsets = np.array(bbox_offsets).astype(np.int32) - return [images, bboxes, labels, torch.tensor(bbox_offsets)] - else: - N = images.shape[0] - return [images, bboxes.view(N, -1, 4), labels.view(N, -1)] - - -def build_train_pipe(config): - input_size = config.input_size - train_coco_root = os.path.join(config.data_dir, "train2017") - if config.dali: - # Default model, this branch is not be executed, and the alternative branch is dataloaders.dali_pipeline.build_dali_pipeline. 
- train_annotate = os.path.join(config.data_dir, "annotations/bbox_only_instances_train2017.json") - train_trans = SSDTransformer((input_size, input_size), dali=True, - fast_nms=config.fast_nms, fast_cj=config.fast_cj, val=False) - else: - train_annotate = os.path.join(config.data_dir, "annotations/instances_train2017.json") - dboxes = dboxes300_coco() - train_trans = SSDTransformer((input_size, input_size), dboxes=dboxes, dali=False, - fast_nms=config.fast_nms, fast_cj=config.fast_cj, val=False) - - train_coco = COCODetection(train_coco_root, train_annotate, train_trans) - - if config.distributed: - train_sampler = GeneralDistributedSampler(train_coco, pad=False) - else: - train_sampler = None - - train_loader = DataLoader(train_coco, - batch_size=config.train_batch_size, - shuffle=(train_sampler is None), - sampler=train_sampler, - num_workers=config.num_workers, - collate_fn=partial(SSDCollator, dali=config.dali)) - return train_loader, len(train_loader), train_sampler - - -def build_eval_pipe(config): - # Paths - input_size = config.input_size - val_coco_root = os.path.join(config.data_dir, "val2017") - val_annotate = os.path.join(config.data_dir, "annotations/bbox_only_instances_val2017.json") - val_trans = SSDTransformer((input_size, input_size), dali=True, - fast_nms=config.fast_nms, fast_cj=config.fast_cj, val=True) - if config.use_coco_ext: - cocoGt = COCO(annotation_file=val_annotate, use_ext=True) - else: - cocoGt = COCO(annotation_file=val_annotate) - val_coco = COCODetection(val_coco_root, val_annotate, val_trans, cocoGt.dataset) - - if config.distributed: - val_sampler = GeneralDistributedSampler(val_coco, pad=False) - else: - val_sampler = None - - val_dataloader = DataLoader(val_coco, - batch_size=config.eval_batch_size, - shuffle=False, # Note: distributed sampler is shuffled :( - sampler=val_sampler, - num_workers=config.num_workers) - - inv_map = {v:k for k,v in val_coco.label_map.items()} - - return val_dataloader, inv_map, cocoGt - - -def build_native_pipeline(config, training=True, pipe=None): - if training: - return build_train_pipe(config) - else: - return build_eval_pipe(config) diff --git a/cv/detection/ssd/pytorch/base/model/__init__.py b/cv/detection/ssd/pytorch/base/model/__init__.py deleted file mode 100644 index 7f31eba74..000000000 --- a/cv/detection/ssd/pytorch/base/model/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from .models.ssd300 import SSD300 - - -def load_checkpoint(model, checkpoint): - print("loading model checkpoint", checkpoint) - od = torch.load(checkpoint) - - # remove proceeding 'module' from checkpoint - saved_model = od["model"] - for k in list(saved_model.keys()): - if k.startswith('module.'): - saved_model[k[7:]] = saved_model.pop(k) - model.load_state_dict(saved_model) - return od - - -def create_model(config): - model = SSD300(config) - model.train() - model.cuda() - return model \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/model/layers/__init__.py b/cv/detection/ssd/pytorch/base/model/layers/__init__.py deleted file mode 100644 index d5bdef535..000000000 --- a/cv/detection/ssd/pytorch/base/model/layers/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -import torch - - diff --git a/cv/detection/ssd/pytorch/base/model/losses/__init__.py b/cv/detection/ssd/pytorch/base/model/losses/__init__.py deleted file mode 100644 index 71d4a7969..000000000 --- a/cv/detection/ssd/pytorch/base/model/losses/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .loss import Loss -from .opt_loss import OptLoss \ No newline at end 
of file diff --git a/cv/detection/ssd/pytorch/base/model/models/__init__.py b/cv/detection/ssd/pytorch/base/model/models/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/cv/detection/ssd/pytorch/base/model/models/resnet.py b/cv/detection/ssd/pytorch/base/model/models/resnet.py deleted file mode 100644 index afa119cd9..000000000 --- a/cv/detection/ssd/pytorch/base/model/models/resnet.py +++ /dev/null @@ -1,291 +0,0 @@ -from typing import Type, Any, Callable, Union, List, Optional - -#from torchvision.models.vgg import vgg16 -import torch.nn as nn -from torch import Tensor -import torch.utils.model_zoo as model_zoo -import torch.nn.functional as F -import torch -from torch.autograd import Variable -from collections import OrderedDict - -# from torchvision.models.resnet import resnet34 - - -model_urls = { - 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', -} - - -def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d: - """3x3 convolution with padding""" - return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=False, dilation=dilation) - - -def _ModifyConvStrideDilation(conv: nn.Conv2d, stride: tuple = (1, 1), padding=None): - conv.stride = stride - - if padding is not None: - conv.padding = padding - - -def _ModifyBlock(block, bottleneck=False, **kwargs): - for m in list(block.children()): - if bottleneck: - _ModifyConvStrideDilation(m.conv2, **kwargs) - else: - _ModifyConvStrideDilation(m.conv1, **kwargs) - - if m.downsample is not None: - # need to make sure no padding for the 1x1 residual connection - _ModifyConvStrideDilation(list(m.downsample.children())[0], **kwargs) - - -class BasicBlock(nn.Module): - expansion: int = 1 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError('BasicBlock only supports groups=1 and base_width=64') - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2) - # while original implementation places the stride at the first 1x1 convolution(self.conv1) - # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385. 
- # This variant is also known as ResNet V1.5 and improves accuracy according to - # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch. - - expansion: int = 4 - - def __init__( - self, - inplanes: int, - planes: int, - stride: int = 1, - downsample: Optional[nn.Module] = None, - groups: int = 1, - base_width: int = 64, - dilation: int = 1, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x: Tensor) -> Tensor: - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Module): - - def __init__( - self, - block: Type[Union[BasicBlock, Bottleneck]], - layers: List[int], - num_classes: int = 1000, - zero_init_residual: bool = False, - groups: int = 1, - width_per_group: int = 64, - replace_stride_with_dilation: Optional[List[bool]] = None, - norm_layer: Optional[Callable[..., nn.Module]] = None - ) -> None: - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = [False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError("replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation)) - self.groups = groups - self.base_width = width_per_group - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, - bias=False) - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2, - dilate=replace_stride_with_dilation[0]) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2, - dilate=replace_stride_with_dilation[1]) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2, - dilate=replace_stride_with_dilation[2]) - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
- # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type] - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type] - - def _make_layer(self, block: Type[Union[BasicBlock, Bottleneck]], planes: int, blocks: int, - stride: int = 1, dilate: bool = False) -> nn.Sequential: - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - - layers = [] - layers.append(block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append(block(self.inplanes, planes, groups=self.groups, - base_width=self.base_width, dilation=self.dilation, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - def _forward_impl(self, x: Tensor) -> Tensor: - # See note [TorchScript super()] - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool(x) - x = torch.flatten(x, 1) - x = self.fc(x) - - return x - - def forward(self, x: Tensor) -> Tensor: - return self._forward_impl(x) - - -class ResNet34(nn.Module): - def __init__(self, model_path=None): - super().__init__() - # rn34 = resnet34(pretrained=(model_path is None)) - rn34 = ResNet(BasicBlock, [3, 4, 6, 3]) - if model_path is None: - state_dict = model_zoo.load_url(model_urls['resnet34'], - progress=True) - rn34.load_state_dict(state_dict) - else: - rn34.load_state_dict(torch.load(model_path)) - - # discard last Resnet block, avrpooling and classification FC - self.layer1 = nn.Sequential(*list(rn34.children())[:6]) - self.layer2 = nn.Sequential(*list(rn34.children())[6:7]) - # modify conv4 if necessary - # Always deal with stride in first block - modulelist = list(self.layer2.children()) - _ModifyBlock(modulelist[0], stride=(1,1)) - - def forward(self, data): - layer1_activation = self.layer1(data) - x = layer1_activation - layer2_activation = self.layer2(x) - - return [layer2_activation] - diff --git a/cv/detection/ssd/pytorch/base/model/models/ssd300.py b/cv/detection/ssd/pytorch/base/model/models/ssd300.py deleted file mode 100644 index 5bcb3f6a6..000000000 --- a/cv/detection/ssd/pytorch/base/model/models/ssd300.py +++ /dev/null @@ -1,141 +0,0 @@ -import torch -import torch.nn as nn -from .resnet import ResNet34 - -class SSD300(nn.Module): - """ - Build a SSD module to take 300x300 image input, - and output 8732 per class bounding boxes - - vggt: pretrained vgg16 (partial) model - label_num: number of classes (including background 0) - """ - def __init__(self, config): - - super(SSD300, self).__init__() - - self.label_num = config.num_classes - - if not config.checkpoint and config.backbone_path: - self.backbone = ResNet34(model_path=config.backbone_path) - else: - self.backbone = ResNet34() - out_channels = 256 - out_size = 38 - self.out_chan = [out_channels, 512, 512, 256, 256, 256] - - self._build_additional_features(out_size, self.out_chan) - - # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2 - # 
classifer 1, 2, 3, 4, 5 ,6 - - self.num_defaults = [4, 6, 6, 6, 4, 4] - self.loc = [] - self.conf = [] - - for nd, oc in zip(self.num_defaults, self.out_chan): - self.loc.append(nn.Conv2d(oc, nd*4, kernel_size=3, padding=1)) - self.conf.append(nn.Conv2d(oc, nd*self.label_num, kernel_size=3, padding=1)) - - self.loc = nn.ModuleList(self.loc) - self.conf = nn.ModuleList(self.conf) - # intitalize all weights - self._init_weights() - - def _build_additional_features(self, input_size, input_channels): - idx = 0 - if input_size == 38: - idx = 0 - elif input_size == 19: - idx = 1 - elif input_size == 10: - idx = 2 - - self.additional_blocks = [] - - if input_size == 38: - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=2), - nn.ReLU(inplace=True), - )) - idx += 1 - - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 256, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(256, input_channels[idx+1], kernel_size=3, padding=1, stride=2), - nn.ReLU(inplace=True), - )) - idx += 1 - - # conv9_1, conv9_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3, padding=1, stride=2), - nn.ReLU(inplace=True), - )) - idx += 1 - - # conv10_1, conv10_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3), - nn.ReLU(inplace=True), - )) - idx += 1 - - # Only necessary in VGG for now - if input_size >= 19: - # conv11_1, conv11_2 - self.additional_blocks.append(nn.Sequential( - nn.Conv2d(input_channels[idx], 128, kernel_size=1), - nn.ReLU(inplace=True), - nn.Conv2d(128, input_channels[idx+1], kernel_size=3), - nn.ReLU(inplace=True), - )) - - self.additional_blocks = nn.ModuleList(self.additional_blocks) - - def _init_weights(self): - - layers = [ - *self.additional_blocks, - *self.loc, *self.conf] - - for layer in layers: - for param in layer.parameters(): - if param.dim() > 1: nn.init.xavier_uniform_(param) - - # Shape the classifier to the view of bboxes - def bbox_view(self, src, loc, conf): - ret = [] - for s, l, c in zip(src, loc, conf): - ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1))) - - locs, confs = list(zip(*ret)) - locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous() - return locs, confs - - def forward(self, data): - - layers = self.backbone(data) - - # last result from network goes into additional blocks - x = layers[-1] - additional_results = [] - for i, l in enumerate(self.additional_blocks): - x = l(x) - additional_results.append(x) - - src = [*layers, *additional_results] - # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 - - locs, confs = self.bbox_view(src, self.loc, self.conf) - - # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results - return locs, confs - diff --git a/cv/detection/ssd/pytorch/base/optimizers/__init__.py b/cv/detection/ssd/pytorch/base/optimizers/__init__.py deleted file mode 100644 index 8ff379afc..000000000 --- a/cv/detection/ssd/pytorch/base/optimizers/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .factory import create_optimizer \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/optimizers/factory.py 
b/cv/detection/ssd/pytorch/base/optimizers/factory.py deleted file mode 100644 index 343b3a1da..000000000 --- a/cv/detection/ssd/pytorch/base/optimizers/factory.py +++ /dev/null @@ -1,11 +0,0 @@ -import torch - - -def create_optimizer(name: str, params, config): - name = name.lower() - - if name == "sgd": - return torch.optim.SGD(params, lr=config.learning_rate, - momentum=0.9, - weight_decay=config.weight_decay_rate) - raise RuntimeError(f"Not found optimier {name}.") diff --git a/cv/detection/ssd/pytorch/base/prepare.py b/cv/detection/ssd/pytorch/base/prepare.py deleted file mode 100644 index 4b181436e..000000000 --- a/cv/detection/ssd/pytorch/base/prepare.py +++ /dev/null @@ -1,316 +0,0 @@ -import os -import shutil -import subprocess -from typing import List, Optional, Union -import os.path as ospath -from argparse import ArgumentParser, REMAINDER -from functools import partial, wraps -from typing import NamedTuple -# import platform # python3.5已弃用 - - -# ========================================================= -# Define arguments -# ========================================================= - -def parse_args(): - parser = ArgumentParser("Prepare") - parser.add_argument("--name", type=str, default=None, help="The name of submitter") - parser.add_argument("--data_dir", type=str, default=None, help="Data directory") - # WARN: Don't delete this argument - parser.add_argument('other_args', nargs=REMAINDER) - args = parser.parse_args() - return args - - -# ========================================================= -# Constants -# ========================================================= -args = parse_args() - -APT = "apt" -YUM = "yum" -PIP_INSTALL = "pip3 install " -PYTHON = "python3" -DOWNLOAD = "wget" - -APT_PKGS = ["numactl"] -YUM_PKGS = ["numactl"] -SUPPORTED_WHEELS = ["torch", "apex"] - - -MODEL_DIR = ospath.abspath( - ospath.join( - __file__, - "../../../" - ) -) -PROJ_DIR = ospath.abspath( - ospath.join( - MODEL_DIR, - "pytorch" - ) -) -MODEL_IMPL_DIR = ospath.join(PROJ_DIR, "base") -CURRENT_MODEL_NAME = ospath.basename(MODEL_DIR) - -PACKAGE_DIR_NAME = "packages" -SOURCE_DIR_NAME = "csrc" -SDK_DIR_NAME = "sdk_installers" -PACKAGE_LIST_NAME = "files.txt" -SDK_LIST_NAME = "files.txt" - -SUBMITTER = args.name -DATA_DIR = args.data_dir -SUBMITTER_DIR = ospath.join(PROJ_DIR, SUBMITTER) - -EXTENSION_SOURCE_DIR_ENV = "EXTENSION_SOURCE_DIR" -SDK_ARGUMENTS_ENV = "SDK_ARGUMENTS" - - -# ========================================================= -# Helpers -# ========================================================= - -class ShellResult(NamedTuple): - - returncode: int - result: Union[subprocess.CompletedProcess, str] = None - - -def _exec_cmd(cmd: Union[str, List], *args, **kwargs): - args_str = " ".join(args) - args_str += " ".join([f"{name}={value}" for name, value in kwargs.items()]) - cmd_str = cmd - if isinstance(cmd, (tuple, list)): - cmd_str = " ".join(cmd) - print(f"Commands: {cmd_str}") - - result = subprocess.run(cmd, *args, **kwargs, stdout=None, stderr=subprocess.STDOUT) - - if result.returncode > 0: - msg = f"ERROR: {cmd} {args_str}" - return ShellResult(returncode=result.returncode, result=msg) - - return ShellResult(returncode=result.returncode, result=result) - - -def exec_shell_cmd(cmd: str, *args, **kwargs): - return _exec_cmd(cmd, shell=True, *args, **kwargs) - - -def exec_shell_cmds(cmds: List[str], *args, **kwargs): - cmds = "\n".join(cmds) - return exec_shell_cmd(cmds, *args, **kwargs) - - -def warning(*args, **kwargs): - print("WARN:", *args, **kwargs) - - -def 
find_file_by_match(dir, file_patterns): - if ospath.exists(dir): - dir_files = os.listdir(dir) - else: - return file_patterns - - for i, pattern in enumerate(file_patterns): - pattern = pattern.strip() - if len(pattern) <= 1 or not pattern.endswith("*"): - continue - - pattern = pattern[:-1] - - for dir_file in dir_files: - if dir_file.startswith(pattern): - file_patterns[i] = dir_file - break - return file_patterns - -# ========================================================= -# Pipelines -# ========================================================= - -def install_apt_packages(): - if len(APT_PKGS) == 0: - return - return exec_shell_cmd(f"{APT} install -y {' '.join(APT_PKGS)}") - - -def install_yum_packages(): - if len(YUM_PKGS) == 0: - return - return exec_shell_cmd(f"{YUM} install {' '.join(YUM_PKGS)}") - - -def prepare_data(): - checked_files = ["bbox_only_instances_train2017.json", "bbox_only_instances_val2017.json"] - path_join = ospath.join - exist_preprocessed_data = all([ospath.exists(path_join(DATA_DIR, "annotations", name)) for name in checked_files]) - - if exist_preprocessed_data: - return - # current_dir = os.path.join(MODEL_DIR, "pytorch") - cmds = [ - # f"cd {current_dir}", - f"python3 data_preprocessing/prepare_json.py --keep-keys {DATA_DIR}/annotations/instances_val2017.json {DATA_DIR}/annotations/bbox_only_instances_val2017.json", - f"python3 data_preprocessing/prepare_json.py {DATA_DIR}/annotations/instances_train2017.json {DATA_DIR}/annotations/bbox_only_instances_train2017.json" - ] - return exec_shell_cmds(cmds) - - -def install_sdk(): - def get_sdk_args(): - sdk_args = dict() - if SDK_ARGUMENTS_ENV in os.environ: - sdk_args_str = os.environ[SDK_ARGUMENTS_ENV] - - sdk_args_segments = sdk_args_str.split(';') - for sdk_arg in sdk_args_segments: - sdk, arg = sdk_arg.split('=', maxsplit=1) - sdk_args[sdk] = arg - return sdk_args - - sdk_args_dict = get_sdk_args() - print("SDK Arguments:", sdk_args_dict) - - sdk_installer_dir = ospath.join(SUBMITTER_DIR, SDK_DIR_NAME) - if not ospath.exists(sdk_installer_dir): - sdk_installer_dir = ospath.join(MODEL_IMPL_DIR, SUBMITTER, SDK_DIR_NAME) - if not ospath.exists(sdk_installer_dir): - warning("Not found sdk\'s dir, skip run installer") - return - - # Find sdk installers - sdk_list_file = ospath.join(sdk_installer_dir, SDK_LIST_NAME) - if ospath.exists(sdk_list_file): - with open(sdk_list_file) as f: - sdk_installers = f.readlines() - sdk_installers_pattern = [sdk.strip() for sdk in sdk_installers] - sdk_installers = find_file_by_match(sdk_installer_dir, sdk_installers_pattern) - else: - sdk_installers = os.listdir(sdk_installer_dir) - sdk_installers.sort() - - sdk_installers_cmds = [] - for sdk in sdk_installers: - if sdk.endswith(".run"): - sdk_arg = "" - for sdk_args_key in sdk_args_dict: - if sdk.startswith(sdk_args_key): - sdk_arg = sdk_args_dict[sdk_args_key] - sdk_installers_cmds.append("sh " + ospath.join(sdk_installer_dir, sdk) + f" {sdk_arg}") - - if len(sdk_installers_cmds) == 0: - warning("Not found installer in", sdk_installer_dir, ", skip run installer") - return - - return exec_shell_cmds(sdk_installers_cmds) - - -def install_requirements(): - return exec_shell_cmd( - f"{PIP_INSTALL} -r requirements.txt" - ) - - -def install_wheel_pkgs(filter_packages: bool=False): - wheel_dir = ospath.join(SUBMITTER_DIR, PACKAGE_DIR_NAME) - if not ospath.exists(wheel_dir): - warning("Not found package\'s dir, skip install wheel package") - return - - # Find packages - package_list_file = ospath.join(wheel_dir, PACKAGE_LIST_NAME) - 
if ospath.exists(package_list_file): - with open(package_list_file) as f: - packages = f.readlines() - packages_pattern = [pkg.strip() for pkg in packages] - packages = find_file_by_match(wheel_dir, packages_pattern) - else: - packages = os.listdir(wheel_dir) - packages.sort() - - def _filter_packages(name: str): - for support_pkg in SUPPORTED_WHEELS: - if name.startswith(support_pkg): - return True - return False - - if filter_packages: - packages = list(filter(_filter_packages, packages)) - - if len(packages) == 0: - warning("Not found wheel packages in", wheel_dir) - return - - install_packages_cmds = [f"{PIP_INSTALL} {ospath.join(wheel_dir, pkg)}" for pkg in packages] - return exec_shell_cmds(install_packages_cmds) - - -def install_extensions(): - source_dir = ospath.join(SUBMITTER_DIR, SOURCE_DIR_NAME) - if not ospath.exists(source_dir): - warning("Not found source dir:", source_dir) - return - - sandbox_dir = os.path.join(MODEL_DIR, "pytorch", 'sandbox', "extension") - if os.path.exists(sandbox_dir): - shutil.rmtree(sandbox_dir) - print("sandbox_dir: ", sandbox_dir) - cmds = [ - f"export {EXTENSION_SOURCE_DIR_ENV}={source_dir}", - f"mkdir -p {sandbox_dir}", - f"cd {sandbox_dir}", - f"{PYTHON} {SUBMITTER_DIR}/setup.py install", - f"rm -rf {sandbox_dir}", - ] - - return exec_shell_cmds(cmds) - -def get_linux_distro(): - try: - with open('/etc/os-release') as f: - for line in f: - if line.startswith('NAME='): - # 提取 NAME 字段的值(例如 "Ubuntu" 或 "CentOS") - name = line.split('=')[1].strip().strip('"') - if 'Ubuntu' in name: - return 'Ubuntu' - elif 'CentOS' in name: - return 'CentOS' - else: - return name # 返回其他发行版名称 - except FileNotFoundError: - return 'Unknown Linux distribution' - -def pipelines(): - plat = get_linux_distro().lower() - if "centos" == plat: - res = [install_yum_packages] - elif "ubuntu" == plat: - res = [install_apt_packages] - else: - raise Exception("Invalid Platform, only support Centos and Debian!") - res.extend([ - install_requirements, - install_sdk, - partial(install_wheel_pkgs, filter_packages=True), - install_extensions, - prepare_data, - ]) - return res - - -if __name__ == '__main__': - for pipeline in pipelines(): - result = pipeline() - if result is not None and result.returncode > 0: - print(result.result) - print("Fail:", pipeline) - exit(result.returncode) - - - - - diff --git a/cv/detection/ssd/pytorch/base/run_train.py b/cv/detection/ssd/pytorch/base/run_train.py deleted file mode 100644 index 5b72b9591..000000000 --- a/cv/detection/ssd/pytorch/base/run_train.py +++ /dev/null @@ -1,123 +0,0 @@ -"""SSD training""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -from copy import copy -import os -import random -import time -from concurrent.futures import ProcessPoolExecutor - -import numpy as np -import torch -from torch.cuda.amp import GradScaler - -import utils -from dataloaders.dataloader import create_train_dataloader -from train.evaluator import Evaluator -from train.trainer import Trainer -from train.training_state import TrainingState -from train.event import TrainingEventCompose, TrainingLogger - -logger = None - - -def main(): - import config - parser = argparse.ArgumentParser("SSD") - config.activate_config_env(parser=parser, with_config_env_name=True) - - if config.use_env and 'LOCAL_RANK' in os.environ: - config.local_rank = int(os.environ['LOCAL_RANK']) - - device, num_gpus = utils.init_dist_training_env(config) - config.device = device - config.n_gpu = num_gpus 
- - utils.check_config(config) - - torch.backends.cudnn.benchmark = True - interface = config.training_event(config) - events = [ - TrainingLogger(config, log_freq=config.log_freq) - ] - training_event = TrainingEventCompose(interface, events) - training_event.launch() - - global logger - logger = events[0].logger - - utils.barrier() - training_event.on_init_start() - init_start_time = logger.previous_log_time - utils.setup_seeds(config) - - evaluator = Evaluator(config) - grad_scaler = GradScaler(init_scale=float(os.getenv("INIT_LOSS_SCALE", 2 ** 20)), growth_interval=2000) - training_state = TrainingState() - trainer = Trainer(config, training_event, evaluator, training_state, grad_scaler, device=device) - training_state._trainer = trainer - - train_dataloader, epoch_size, train_sampler = create_train_dataloader(config) - utils.barrier() - trainer.init() - - utils.barrier() - init_evaluation_start = time.time() - eval_ap = evaluator.evaluate(trainer) - training_state.eval_ap = eval_ap - init_evaluation_end = time.time() - init_evaluation_info = dict( - eval_ap=eval_ap, - time=init_evaluation_end - init_evaluation_start - ) - training_event.on_init_evaluate(init_evaluation_info) - - if not config.do_train: - return config, training_state, init_evaluation_info["time"] - - training_event.on_init_end() - init_end_time = logger.previous_log_time - training_state.init_time = (init_end_time - init_start_time) / 1e+3 - - utils.barrier() - - training_event.on_train_begin() - raw_train_start_time = logger.previous_log_time - while training_state.epoch < config.epochs and not training_state.end_training: - training_state.epoch += 1 - trainer.train_one_epoch(train_dataloader) - if config.distributed and not config.dali: - train_sampler.set_epoch(training_state.epoch) - training_event.on_train_end() - raw_train_end_time = logger.previous_log_time - training_state.raw_train_time = (raw_train_end_time - raw_train_start_time) / 1e+3 - return config, training_state, epoch_size - - -if __name__ == "__main__": - now = time.time() - config, training_state, epoch_size = main() - - if not utils.is_main_process(): - print("process {} exit!".format(config.local_rank)) - exit() - - gpu_count = config.n_gpu - e2e_time = time.time() - now - training_perf = (epoch_size * training_state.epoch) / training_state.raw_train_time - if config.do_train: - finished_info = { - "e2e_time": e2e_time, - "training_samples_per_second": training_perf, - "converged": training_state.converged, - "final_ap": training_state.eval_ap, - "raw_train_time": training_state.raw_train_time, - "init_time": training_state.init_time, - } - else: - finished_info = {"e2e_time": e2e_time} - logger.log("FINISHED", finished_info, stacklevel=0) diff --git a/cv/detection/ssd/pytorch/base/run_training.sh b/cv/detection/ssd/pytorch/base/run_training.sh deleted file mode 100644 index 94ab4b9c4..000000000 --- a/cv/detection/ssd/pytorch/base/run_training.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the -# License for the specific language governing permissions and limitations -# under the License. -get_lscpu_value() { - awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" -} -lscpu_out=$(lscpu) - -n_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}") -n_cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") - -echo "Number of CPU sockets on a node: ${n_sockets}" -echo "Number of CPU cores per socket: ${n_cores_per_socket}" - -EXIT_STATUS=0 -check_status() -{ - if ((${PIPESTATUS[0]} != 0)); then - EXIT_STATUS=1 - fi -} - -export PYTHONPATH=../:$PYTHONPATH - -python3 -u -m bind_launch \ - --nsockets_per_node ${n_sockets} \ - --ncores_per_socket ${n_cores_per_socket} \ - --no_hyperthreads \ - --no_membind ./run_train.py --do_train "$@" ; check_status - -exit ${EXIT_STATUS} \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/run_with_docker.sh b/cv/detection/ssd/pytorch/base/run_with_docker.sh deleted file mode 100644 index 5ac6404b7..000000000 --- a/cv/detection/ssd/pytorch/base/run_with_docker.sh +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
-# ================================================= -# Constants -# ================================================= - -MODEL="ssd" -export MODEL -DOCKER_IMAGE="model_zoo:${MODEL}" -NEXP=1 - -# TODO: Add to Dockerfile -WORK_DIR="/workspace/model_zoo/ssd/pytorch" -MODEL_DIR="${WORK_DIR}/base" - -CURRENT_DIR=$(cd `dirname $0`; pwd) -PROJ_DIR="${CURRENT_DIR}/../" -BUILD_EXTENSION_DIR="${CURRENT_DIR}/build" -BUILD_EXTENSION_PACKAGE_NAME="ext_ops" - -BASE_DOCKERFILE_PATH="${CURRENT_DIR}/BaseDockerfile" -HOST_DOCKERFILE_PATH="${CURRENT_DIR}/Dockerfile" - -SOURCE_DATA_DIR="" -MAP_DATA_DIR="/mnt/dataset/model_zoo/${MODEL}" -MAP_BACKBONE_DIR="/mnt/dataset/model_zoo/${MODEL}/resnet34-333f7ec4.pth" -SUBMITTER="iluvatar" -CONFIG="" - -: "${CLEAR_CACHES:=1}" -SHM_SIZE="32g" - - -# ================================================= -# Parse arguments -# ================================================= - -i=2 -TRAINING_SCRIPT_ARGS="$@" -for arg in "$@" -do - if [[ $arg =~ "--data_dir" ]]; then - if [[ $arg =~ "=" ]]; then - kv=(${arg//=/ }) - SOURCE_DATA_DIR=${kv[1]} - TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/$arg/"--data_dir ${MAP_DATA_DIR} --backbone_path ${MAP_BACKBONE_DIR}"} - else - SOURCE_DATA_DIR=${!i} - TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/"--data_dir ${!i}"/"--data_dir ${MAP_DATA_DIR} --backbone_path ${MAP_BACKBONE_DIR}"} - fi - - elif [[ $arg =~ "--name" ]]; then - if [[ $arg =~ "=" ]]; then - kv=(${arg//=/ }) - SUBMITTER=${kv[1]} - else - SUBMITTER=${!i} - fi - - elif [[ $arg =~ "--config" ]]; then - if [[ $arg =~ "=" ]]; then - kv=(${arg//=/ }) - CONFIG=${kv[1]} - else - CONFIG=${!i} - fi - fi - - let i++ -done - - -# ================================================= -# Check arguments -# ================================================= - -if [[ "${SOURCE_DATA_DIR}" == "" ]]; then - echo "ERROR: data_dir is not given, please set --data_dir " - exit 1 -fi - -if [[ "${CONFIG}" == "" ]]; then - echo "ERROR: config is not given, please set --config " - exit 1 -fi - -CONTAINER_SUBMITTER_DIR="${WORK_DIR}/${SUBMITTER}" -HOST_SUBMITTER_DIR="${PROJ_DIR}/${SUBMITTER}" - -CONTAINER_ENVIRONMENT_VARIABLES_PATH=${CONTAINER_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh -HOST_ENVIRONMENT_VARIABLES_PATH="${HOST_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh" - -HOST_SUBMITTER_DOCKERFILE="${PROJ_DIR}/${SUBMITTER}/${MODEL}/config/Dockerfile" -CONTAINER_NAME="model_zoo-${MODEL}-${SUBMITTER}-container" - -if [ ! 
-f "${HOST_ENVIRONMENT_VARIABLES_PATH}" ]; then - touch "${HOST_ENVIRONMENT_VARIABLES_PATH}" -fi - -source ${HOST_ENVIRONMENT_VARIABLES_PATH} - -RESULTS_DIR="${PROJ_DIR}/${SUBMITTER}/${MODEL}/results" -LOG_FILE_BASE="${RESULTS_DIR}/config_${CONFIG}_experiment" - -echo "======================================" -echo "Arguments" -echo "---------" - -echo "CLEAR_CACHES = ${CLEAR_CACHES}" -echo "CLEAR_CONTAINER = ${CLEAR_CNT}" -echo "MODEL = ${MODEL}" -echo "CONTAINER_NAME = ${CONTAINER_NAME}" -echo "DOCKER_IMAGE = ${DOCKER_IMAGE}" -echo "MODEL_DIR = ${MODEL_DIR}" -echo "SUBMITTER = ${SUBMITTER}" -echo "CONTAINER_SUBMITTER_DIR = ${CONTAINER_SUBMITTER_DIR}" -echo "HOST_SUBMITTER_DOCKERFILE = ${HOST_SUBMITTER_DOCKERFILE}" -echo "CONFIG = ${CONFIG}" -echo "CONTAINER_MOUNTS = ${CONTAINER_MOUNTS}" -echo "TRAINING_SCRIPT_ARGS = ${TRAINING_SCRIPT_ARGS[*]}" -echo "CURRENT_DIR = ${CURRENT_DIR}" -echo "CONTAINER_ENVIRONMENT_VARIABLES_PATH = ${CONTAINER_ENVIRONMENT_VARIABLES_PATH}" -echo "RESULTS_DIR = ${RESULTS_DIR}" -echo "LOG_FILE_BASE = ${LOG_FILE_BASE}" -echo "SHM_SIZE = ${SHM_SIZE}" -echo "======================================" - - -# ================================================= -# Training -# ================================================= - -# Cleanup container -cleanup_docker() { - docker container rm -f "${CONTAINER_NAME}" || true -} -if [ "${CLEAR_CNT}" -eq 1 ]; then - cleanup_docker - trap 'set -eux; cleanup_docker' EXIT -fi - - -container_status=`docker ps | grep ${CONTAINER_NAME}` -if [[ ! ${container_status} =~ ${CONTAINER_NAME} ]]; then - # Clean built extension - if [ -d "${BUILD_EXTENSION_DIR}" ]; then - echo "WARN: Delete built extension" - rm -rf "${BUILD_EXTENSION_DIR}" - rm -rf ${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so - echo "extension file: "${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so"" - fi - - - # Build image - if [ -f "${HOST_DOCKERFILE_PATH}" ]; then - echo "WARN: Remove previous Dockerfile" - rm -f "${HOST_DOCKERFILE_PATH}" - fi - - echo "WARN: cp BaseDockerfile to Dockerfile" - cp "${BASE_DOCKERFILE_PATH}" "${HOST_DOCKERFILE_PATH}" - - if [ ${SUBMITTER} = "nvidia" ]; then - echo "Nvidia Dockerfile build from Nvidia NGC Images." 
- cat "${HOST_SUBMITTER_DOCKERFILE}" > "${HOST_DOCKERFILE_PATH}" - elif [ -f "${HOST_SUBMITTER_DOCKERFILE}" ]; then - echo "WARN: Found submitter's Dockerfile, merging submitter's Dockerfile to Dockerfile" - cat "${HOST_SUBMITTER_DOCKERFILE}" >> "${HOST_DOCKERFILE_PATH}" - fi - - docker build -t ${DOCKER_IMAGE} ./ - - # Setup container by Dockerfile - docker run --rm --init --detach \ - --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \ - --privileged=true \ - --ulimit=stack=67108864 --ulimit=memlock=-1 \ - -w ${MODEL_DIR} \ - --shm-size="${SHM_SIZE}" \ - --volume ${SOURCE_DATA_DIR}:${MAP_DATA_DIR} \ - --volume ${PROJ_DIR}:${WORK_DIR} \ - --name="${CONTAINER_NAME}" ${CONTAINER_MOUNTS} \ - "${DOCKER_IMAGE}" sleep infinity - - # make sure container has time to finish initialization - # TODO: Uncomment - sleep 5 - docker exec -it "${CONTAINER_NAME}" true - - mkdir -p ${RESULTS_DIR} - docker exec -it "${CONTAINER_NAME}" sh -c "chmod 777 run_training.sh" - - # TODO: Remove pip source - docker exec -it "${CONTAINER_NAME}" /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" - - docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};python3 prepare.py --name ${SUBMITTER} --data_dir ${MAP_DATA_DIR}" -fi - -# Run experiments -for _experiment_index in $(seq 1 "${NEXP}"); do - ( - echo "Beginning trial ${_experiment_index} of ${NEXP}" - echo "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};bash ./run_training.sh ${TRAINING_SCRIPT_ARGS[*]}" - - if [ "${CLEAR_CACHES}" -eq 1 ]; then - sync && sudo /sbin/sysctl vm.drop_caches=3 - fi - - # Run experiment - docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};bash ./run_training.sh ${TRAINING_SCRIPT_ARGS[*]}" - ) |& tee "${LOG_FILE_BASE}_${_experiment_index}.log" -done \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/setup.py b/cv/detection/ssd/pytorch/base/setup.py deleted file mode 100644 index f006678c0..000000000 --- a/cv/detection/ssd/pytorch/base/setup.py +++ /dev/null @@ -1,102 +0,0 @@ -import glob -import os -import os.path as ospath - -from setuptools import setup, Extension -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -PACKAGE_NAME = "ext_ops" - -SOURCE_FILE_EXT = ["c", "cpp", "cu"] -HEADER_FILE_EXT = ["h", "hpp", "cuh"] - -SUPPORT_EXTENSIONS = SOURCE_FILE_EXT + HEADER_FILE_EXT -SOURCE_DIR_KEY = "extension_source_dir" -NVCC_ARGUMENTS_KEY = "NVCC_ARGUMENTS" - - -def get_value_from_environ(name: str, default=None): - if name in os.environ: - return os.environ[name] - if name.upper() in os.environ: - return os.environ[name.upper()] - - return default - - -def check_source_dir(): - source_dir = get_value_from_environ(SOURCE_DIR_KEY) - if source_dir in [None, ""]: - raise ValueError(f"Invaild `source_dir` argument: {source_dir}.") - - return source_dir - - -def find_source_files() -> dict: - source_dir = check_source_dir() - - if not ospath.exists(source_dir): - return dict() - - # Search source files - sources = dict() - for ext in SOURCE_FILE_EXT: - sources[ext] = glob.glob(ospath.join(source_dir, "**", f"*.{ext}"), recursive=True) - - return sources - - -def find_include_dirs() -> list: - source_dir = check_source_dir() - if not ospath.exists(source_dir): - return [] - return glob.glob(ospath.join(source_dir, "**", "include"), recursive=True) - - -def get_nvcc_arguments() -> list: - arguments = get_value_from_environ(NVCC_ARGUMENTS_KEY) - if arguments is None: - 
return [] - arguments = arguments.split(" ") - return arguments - - -source_files = find_source_files() -include_dirs = find_include_dirs() -c_sources = source_files.pop("c") -other_sources = [] -for _sources in source_files.values(): - other_sources.extend(_sources) - -nvcc_arguments = get_nvcc_arguments() - -ext_modules = [] - -if len(c_sources) != 0: - ext_modules.append(Extension( - name=PACKAGE_NAME, - sources=c_sources, - include_dirs=include_dirs, - extra_compile_args={ - 'c': ['-O3'] - } - )) - -if len(other_sources) != 0: - ext_modules.append(CUDAExtension( - name=PACKAGE_NAME, - sources=other_sources, - extra_compile_args={ - 'cxx': ['-O3', ], - 'nvcc': ['-O3'] + nvcc_arguments - } - )) - -setup( - name=PACKAGE_NAME, - version="0.1", - ext_modules=ext_modules, - cmdclass={ - 'build_ext': BuildExtension - } -) diff --git a/cv/detection/ssd/pytorch/base/test/dali_dataloader_test.py b/cv/detection/ssd/pytorch/base/test/dali_dataloader_test.py deleted file mode 100644 index 6b1d3b484..000000000 --- a/cv/detection/ssd/pytorch/base/test/dali_dataloader_test.py +++ /dev/null @@ -1,48 +0,0 @@ -import sys -sys.path.append("..") - -import torch -from dataloaders import create_train_dataloader, create_eval_dataloader - -if __name__ == "__main__": - class Config(object): - def __init__(self): - pass - - config = Config() - config.data_dir = "/home/data/perf/ssd" - config.input_size = 300 - config.n_gpu = 1 - config.distributed = False - config.local_rank = 0 - config.local_seed = 32 - config.num_workers = 4 - config.train_batch_size = 32 - config.eval_batch_size = 32 - config.fp16 = True - config.fast_nms = True - config.fast_cj = True - config.use_coco_ext = False - config.dali = True - config.dali_sync = False - config.dali_cache = -1 - config.nhwc = True - config.pad_input = True - config.jit = True - config.use_nvjpeg = False - - train_loader, epoch_size, train_sampler = create_train_dataloader(config) - for batch in train_loader: - print(len(batch)) - break - val_loader, inv_map, cocoGt = create_eval_dataloader(config) - from dataloaders.prefetcher import eval_prefetcher - val_loader = eval_prefetcher(iter(val_loader), - torch.cuda.current_device(), - config.pad_input, - config.nhwc, - config.fp16) - for batch in val_loader: - print(len(batch)) - break - print("finished!") \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/test/dataloader_test.py b/cv/detection/ssd/pytorch/base/test/dataloader_test.py deleted file mode 100644 index 570c7a256..000000000 --- a/cv/detection/ssd/pytorch/base/test/dataloader_test.py +++ /dev/null @@ -1,47 +0,0 @@ -import sys -sys.path.append("..") - -import torch -from dataloaders import create_train_dataloader, create_eval_dataloader - -if __name__ == "__main__": - class Config(object): - def __init__(self): - pass - - config = Config() - config.data_dir = "/home/data/perf/ssd" - config.input_size = 300 - config.n_gpu = 1 - config.distributed = False - config.local_rank = 0 - config.local_seed = 32 - config.num_workers = 4 - config.train_batch_size = 32 - config.eval_batch_size = 32 - config.fp16 = True - config.fast_nms = False - config.fast_cj = False - config.dali = False - config.dali_sync = False - config.dali_cache = 0 - config.nhwc = False - config.pad_input = False - config.jit = False - config.use_nvjpeg = False - - train_loader, epoch_size, train_sampler = create_train_dataloader(config) - for batch in train_loader: - print(len(batch)) - break - val_loader, inv_map, cocoGt = create_eval_dataloader(config) - from 
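The setup.py deleted above discovers extension sources and extra NVCC flags from environment variables instead of hard-coding them. A minimal sketch of that discovery step, assuming the same extension_source_dir and NVCC_ARGUMENTS variables (the real file goes on to build torch CUDAExtension objects from the result):

    import glob
    import os
    import os.path as ospath

    SOURCE_FILE_EXT = ["c", "cpp", "cu"]

    def env_value(name, default=None):
        # Accept both lower-case and upper-case variable names, as the removed helper did.
        return os.environ.get(name, os.environ.get(name.upper(), default))

    def find_sources(source_dir):
        # Map each extension to the matching files under source_dir (recursive search).
        return {ext: glob.glob(ospath.join(source_dir, "**", "*." + ext), recursive=True)
                for ext in SOURCE_FILE_EXT}

    def nvcc_args():
        raw = env_value("nvcc_arguments")
        return raw.split(" ") if raw else []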
dataloaders.prefetcher import eval_prefetcher - val_loader = eval_prefetcher(iter(val_loader), - torch.cuda.current_device(), - config.pad_input, - config.nhwc, - config.fp16) - for batch in val_loader: - print(len(batch)) - break - print("finished!") \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/train/__init__.py b/cv/detection/ssd/pytorch/base/train/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/cv/detection/ssd/pytorch/base/train/evaluator.py b/cv/detection/ssd/pytorch/base/train/evaluator.py deleted file mode 100644 index 1132827e7..000000000 --- a/cv/detection/ssd/pytorch/base/train/evaluator.py +++ /dev/null @@ -1,116 +0,0 @@ -from concurrent.futures import ProcessPoolExecutor -import sys - -import numpy as np -from pycocotools.cocoeval import COCOeval -import torch -import torch.distributed as dist - - -import utils -from dataloaders.dataloader import create_eval_dataloader -from dataloaders.prefetcher import eval_prefetcher - -import config -from box_coder import build_ssd300_coder - -class Evaluator: - - def __init__(self, config): - self.config = config - self.eval_count = 0 - - self._dataloader = None - self.fetch_dataloader() - - self.ret = [] - - self.overlap_threshold = 0.50 - self.nms_max_detections = 200 - self.encoder = build_ssd300_coder(config.fast_nms) - - def fetch_dataloader(self): - if self._dataloader is None: - self._dataloader, self.inv_map, self.cocoGt = create_eval_dataloader(config) - return self._dataloader - - def evaluate_coco(self, final_results, cocoGt): - if self.config.use_coco_ext: - cocoDt = cocoGt.loadRes(final_results, use_ext=True) - E = COCOeval(cocoGt, cocoDt, iouType='bbox', use_ext=True) - else: - cocoDt = cocoGt.loadRes(final_results) - E = COCOeval(cocoGt, cocoDt, iouType='bbox') - E.evaluate() - E.accumulate() - E.summarize() - print("Current AP: {:.5f} AP".format(E.stats[0])) - return E.stats[0] - - def evaluate(self, trainer): - self.eval_count += 1 - eval_dataloader = eval_prefetcher(iter(self._dataloader), - torch.cuda.current_device(), - config.pad_input, - config.nhwc, - config.fp16) - trainer.model.eval() - ret = [] - with torch.no_grad(): - for batch in eval_dataloader: - img, img_id, img_size = batch - _, ploc, plabel = trainer.inference(img) - - # torch.save({ - # "bbox": ploc, - # "scores": plabel, - # "criteria": self.overlap_threshold, - # "max_output": self.nms_max_detections, - # }, "decode_inputs_{}.pth".format(config.local_rank)) - # exit() - - for idx in range(ploc.shape[0]): - # ease-of-use for specific predictions - ploc_i = ploc[idx, :, :].unsqueeze(0) - plabel_i = plabel[idx, :, :].unsqueeze(0) - - result = self.encoder.decode_batch(ploc_i, plabel_i, self.overlap_threshold, self.nms_max_detections)[0] - - htot, wtot = img_size[0][idx].item(), img_size[1][idx].item() - loc, label, prob = [r.cpu().numpy() for r in result] - for loc_, label_, prob_ in zip(loc, label, prob): - ret.append([img_id[idx], loc_[0] * wtot, \ - loc_[1] * htot, - (loc_[2] - loc_[0]) * wtot, - (loc_[3] - loc_[1]) * htot, - prob_, - self.inv_map[label_]]) - - trainer.model.train() - ret = np.array(ret).astype(np.float32) - if self.config.distributed: - ret_copy = torch.tensor(ret).cuda() - ret_sizes = [torch.tensor(0).cuda() for _ in range(config.n_gpu)] - torch.distributed.all_gather(ret_sizes, torch.tensor(ret_copy.shape[0]).cuda()) - max_size = 0 - sizes = [] - for s in ret_sizes: - max_size = max(max_size, s.item()) - sizes.append(s.item()) - ret_pad = torch.cat([ret_copy, torch.zeros(max_size 
- ret_copy.shape[0], 7, dtype=torch.float32).cuda()]) - other_ret = [torch.zeros(max_size, 7, dtype=torch.float32).cuda() for i in range(config.n_gpu)] - torch.distributed.all_gather(other_ret, ret_pad) - cat_tensors = [] - for i in range(config.n_gpu): - cat_tensors.append(other_ret[i][:sizes[i]][:]) - - final_results = torch.cat(cat_tensors).cpu().numpy() - else: - final_results = ret - - if utils.is_main_process(): - eval_ap = self.evaluate_coco(final_results, self.cocoGt) - return eval_ap - else: - return 0 - diff --git a/cv/detection/ssd/pytorch/base/train/event/__init__.py b/cv/detection/ssd/pytorch/base/train/event/__init__.py deleted file mode 100644 index 2cd8c9e60..000000000 --- a/cv/detection/ssd/pytorch/base/train/event/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .base import BaseTrainingEventInterface -from .base_adapter import BaseTrainingEventAdapter -from .compose import TrainingEventCompose -from .log import TrainingLogger \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/train/event/base.py b/cv/detection/ssd/pytorch/base/train/event/base.py deleted file mode 100644 index d9d76fafa..000000000 --- a/cv/detection/ssd/pytorch/base/train/event/base.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import Tuple, List - -import torch.nn -from torch import Tensor -from torch.cuda.amp import GradScaler -from torch.optim import Optimizer - -from train.training_state import TrainingState - - -SSD_MODEL = torch.nn.Module -BatchType = Tuple[Tensor, Tensor, Tensor] - -class BaseTrainingEventInterface(object): - - def __init__(self, config): - self.config = config - - def save_checkpoint(self, path: str, training_state: TrainingState): - pass - - def load_checkpoint(self, checkpoint): - pass - - def convert_model(self, model: SSD_MODEL) -> SSD_MODEL: - return model - - def create_optimizer(self, model: SSD_MODEL) -> Optimizer: - raise NotImplementedError() - - def model_to_fp16(self, model: SSD_MODEL, optimizer: Optimizer) -> Tuple[SSD_MODEL, Optimizer]: - return model, optimizer - - def model_to_ddp(self, model: SSD_MODEL) -> SSD_MODEL: - return model - - def on_init_start(self): - pass - - def on_init_end(self): - pass - - def on_backward(self, step: int, loss: Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): - pass - - def on_train_begin(self): - pass - - def on_train_end(self): - pass - - def on_epoch_begin(self, epoch: int): - pass - - def on_epoch_end(self, epoch: int): - pass - - def on_step_begin(self, step: int): - pass - - def on_step_end(self, step: int): - pass - - diff --git a/cv/detection/ssd/pytorch/base/train/event/base_adapter.py b/cv/detection/ssd/pytorch/base/train/event/base_adapter.py deleted file mode 100644 index 5c6b9b1c0..000000000 --- a/cv/detection/ssd/pytorch/base/train/event/base_adapter.py +++ /dev/null @@ -1,30 +0,0 @@ -from torch.optim import Optimizer - -from .base import BaseTrainingEventInterface - - -class BaseTrainingEventMix: - - def launch(self): - pass - - def create_optimizer(self, optimizer: Optimizer): - pass - - def on_init_evaluate(self, result: dict): - pass - - def on_evaluate(self, result: dict): - pass - - def on_step_end(self, step: int, result: dict = None): - pass - - -class BaseTrainingEventAdapter(BaseTrainingEventMix, BaseTrainingEventInterface): - pass - - - - - diff --git a/cv/detection/ssd/pytorch/base/train/event/compose.py b/cv/detection/ssd/pytorch/base/train/event/compose.py deleted file mode 100644 index a112f32e1..000000000 --- a/cv/detection/ssd/pytorch/base/train/event/compose.py +++ 
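The evaluator above gathers a different number of detection rows from every rank, which torch.distributed.all_gather cannot do directly, so it first exchanges row counts, pads each local tensor to the global maximum, gathers the padded tensors, and trims the padding back off. A sketch of that pattern as a standalone helper (illustrative name; assumes an initialized process group and tensors already on the correct device):

    import torch
    import torch.distributed as dist

    def gather_variable_rows(local_rows: torch.Tensor, n_cols: int = 7) -> torch.Tensor:
        world_size = dist.get_world_size()
        # 1) Exchange per-rank row counts.
        size_t = torch.tensor(local_rows.shape[0], device=local_rows.device)
        sizes = [torch.zeros_like(size_t) for _ in range(world_size)]
        dist.all_gather(sizes, size_t)
        sizes = [int(s.item()) for s in sizes]
        max_size = max(sizes)
        # 2) Pad to the maximum so every rank contributes an equally shaped tensor.
        pad = torch.zeros(max_size - local_rows.shape[0], n_cols,
                          dtype=local_rows.dtype, device=local_rows.device)
        gathered = [torch.zeros(max_size, n_cols, dtype=local_rows.dtype,
                                device=local_rows.device) for _ in range(world_size)]
        dist.all_gather(gathered, torch.cat([local_rows, pad]))
        # 3) Trim each gathered tensor back to its true length and concatenate.
        return torch.cat([g[:s] for g, s in zip(gathered, sizes)])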
/dev/null @@ -1,109 +0,0 @@ -from typing import List, Union, Callable, Tuple - -from torch import Tensor -from torch.cuda.amp import GradScaler -from torch.optim import Optimizer - -from .base import BaseTrainingEventInterface as TrainingEventInterface, SSD_MODEL -from .base_adapter import BaseTrainingEventMix, BaseTrainingEventAdapter -from train.training_state import TrainingState - - -class TrainingEventCompose(BaseTrainingEventAdapter): - - def __init__(self, interface: TrainingEventInterface, events: List[BaseTrainingEventAdapter]): - super(TrainingEventCompose, self).__init__(interface.config) - - self.interface = interface - self.events = events - - def launch(self): - self._call_events_func(self.launch, with_interface=False) - - def save_checkpoint(self, path: str, training_state: TrainingState): - self.interface.save_checkpoint(path, training_state) - self._call_events_func(self.save_checkpoint, with_interface=False, path=path, training_state=training_state) - - def load_checkpoint(self, checkpoint): - self.interface.load_checkpoint(checkpoint) - self._call_events_func(self.load_checkpoint, with_interface=False, checkpoint=checkpoint) - - def convert_model(self, model: SSD_MODEL) -> SSD_MODEL: - model = self.interface.convert_model(model) - self._call_events_func(self.convert_model, with_interface=False, model=model) - return model - - def create_optimizer(self, model: SSD_MODEL) -> Optimizer: - optimizer = self.interface.create_optimizer(model) - self._call_events_func(self.create_optimizer, with_interface=False, optimizer=optimizer) - return optimizer - - def model_to_fp16(self, model: SSD_MODEL, optimizer: Optimizer) -> Tuple[SSD_MODEL, Optimizer]: - model, optimizer = self.interface.model_to_fp16(model, optimizer) - self._call_events_func(self.model_to_fp16, with_interface=False, model=model, optimizer=optimizer) - return model, optimizer - - def model_to_ddp(self, model: SSD_MODEL) -> SSD_MODEL: - model = self.interface.model_to_ddp(model) - self._call_events_func(self.model_to_ddp, with_interface=False, model=model) - return model - - def on_init_evaluate(self, result: dict): - self._call_events_func(self.on_init_evaluate, with_interface=False, result=result) - - def on_evaluate(self, result: dict): - self._call_events_func(self.on_evaluate, with_interface=False, result=result) - - def on_init_start(self): - self._call_events_func(self.on_init_start, with_interface=True) - - def on_init_end(self): - self._call_events_func(self.on_init_end, with_interface=True) - - def on_backward(self, step: int, loss: Tensor, optimizer: Optimizer, grad_scaler: GradScaler = None): - return self.interface.on_backward(step, loss, optimizer, grad_scaler) - - def on_train_begin(self): - self._call_events_func(self.on_train_begin, with_interface=True) - - def on_train_end(self): - self._call_events_func(self.on_train_end, with_interface=True) - - def on_epoch_begin(self, epoch: int): - self._call_events_func(self.on_epoch_begin, with_interface=True, epoch=epoch) - - def on_epoch_end(self, epoch: int): - self._call_events_func(self.on_epoch_end, with_interface=True, epoch=epoch) - - def on_step_begin(self, step: int): - self._call_events_func(self.on_step_begin, with_interface=True, step=step) - - def on_step_end(self, step: int, result: dict = None): - self.interface.on_step_end(step) - self._call_events_func(self.on_step_end, with_interface=False, step=step, result=result) - - def _call_events_func(self, func: Union[str, Callable], with_interface=False, *args, **kwargs): - func_name = 
self._get_func_name(func) - events = self.events - if with_interface: - events = [self.interface] + events - - result = [] - for event in events: - ret = None - if hasattr(event, func_name): - ret = getattr(event, func_name)(*args, **kwargs) - result.append(ret) - return result - - def _get_func_name(self, func: Union[str, Callable]): - if isinstance(func, str): - return func - - if callable(func): - return func.__name__ - - return None - - - diff --git a/cv/detection/ssd/pytorch/base/train/event/log.py b/cv/detection/ssd/pytorch/base/train/event/log.py deleted file mode 100644 index 083ded561..000000000 --- a/cv/detection/ssd/pytorch/base/train/event/log.py +++ /dev/null @@ -1,145 +0,0 @@ -import copy -import inspect -import os -import os.path as ospath -from typing import Tuple, Union, Iterable - -from torch import Tensor -from torch.cuda.amp import GradScaler -from torch.optim import Optimizer - -from config.config_manager import get_properties_from_config -from utils.logging import PerfLogger, LogEvent, PerfLogLevel -from .base import SSD_MODEL -from .base_adapter import BaseTrainingEventAdapter - - -STACKLEVEL = 4 - - -class TrainingLogger(BaseTrainingEventAdapter): - - def __init__(self, config, logger: PerfLogger=None, log_freq: int = 0): - super(TrainingLogger, self).__init__(config) - self.config = config - self.log_freq = log_freq - level = PerfLogLevel.INFO if log_freq > 0 else PerfLogLevel.SUBMITTION - if logger is None: - logger = PerfLogger.get_default_logger(rank=config.local_rank, level=level) - self.logger = logger - - self.model = None - self.submitter = None - - def launch(self): - self.logger.log(LogEvent.launch_training, "Launch training", stacklevel=STACKLEVEL) - config_path: str = self.config.config - config_dict = get_properties_from_config(self.config) - for key, value in config_dict.items(): - if type(value) not in [int, float, str, bool] and not isinstance(value, Iterable): - config_dict[key] = str(value) - - # Extract definition of training event - try: - training_event_class = self.config.training_event - if not inspect.isclass(training_event_class): - training_event_class = training_event_class.__class__ - training_event_class_define = inspect.getabsfile(training_event_class) - training_event_class_define = training_event_class_define.rsplit(".py", maxsplit=1)[0] - training_event_class_define += ":" + training_event_class.__name__ - except: - training_event_class_define = str(self.config.training_event) - config_dict['training_event'] = training_event_class_define - - # Like /path/to/proj/submitter/model/config/config_xxx.py - if config_path.startswith("."): - config_path = ospath.abspath(config_path) - - config_path_nodes = config_path.rsplit(sep="/", maxsplit=4) - submitter = config_path_nodes[1] - model = config_path_nodes[2] - self.logger.init_logger(submitter=submitter, - model=model, - config_path=config_path, - config=config_dict, - stacklevel=STACKLEVEL) - - self.model = model - self.submitter = submitter - - def convert_model(self, model: SSD_MODEL): - model_class = type(model) - model_info = dict( - type = model_class.__name__, - module = model_class.__module__ if hasattr(model_class, "__module__") else "None" - ) - self._log_event(LogEvent.convert_model, model_info) - - def create_optimizer(self, optimizer: Optimizer): - optimizer_class = type(optimizer) - optimizer_info = dict( - type=optimizer_class.__name__, - module=optimizer_class.__module__ if hasattr(optimizer_class, "__module__") else "None" - ) - self._log_event(LogEvent.create_optimizer, 
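TrainingEventCompose forwards each hook to every registered event object that actually implements it and collects the per-event return values; that dispatch is the core of _call_events_func. A stripped-down sketch of the same idea (class names here are illustrative, not the removed API):

    class Compose:
        def __init__(self, events):
            self.events = events

        def call(self, func_name, *args, **kwargs):
            # Invoke the named hook on every event that defines it; keep results aligned.
            results = []
            for event in self.events:
                hook = getattr(event, func_name, None)
                results.append(hook(*args, **kwargs) if callable(hook) else None)
            return results

    class PrintLogger:
        def on_epoch_begin(self, epoch):
            print("epoch {} begins".format(epoch))

    Compose([PrintLogger()]).call("on_epoch_begin", epoch=1)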
optimizer_info) - - def model_to_fp16(self, model: SSD_MODEL, optimizer: Optimizer): - fp16_info = dict( - fp16 = self.config.fp16 if hasattr(self.config, "fp16") else False - ) - self._log_event(LogEvent.model_to_fp16, fp16_info) - - def model_to_ddp(self, model: SSD_MODEL): - model_class = type(model) - model_info = dict( - type=model_class.__name__, - module=model_class.__module__ if hasattr(model_class, "__module__") else None - ) - self._log_event(LogEvent.model_to_ddp, model_info) - - def on_init_evaluate(self, result: dict): - self._log_event(LogEvent.init_evaluation, result) - - def on_evaluate(self, result: dict): - self._log_event(LogEvent.evaluation, result) - - def on_init_start(self): - self._log_event(LogEvent.init_start) - - def on_init_end(self): - self._log_event(LogEvent.init_end, "Finish initialization") - - def on_backward(self, step: int, loss: Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): - pass - - def on_train_begin(self): - self._log_event(LogEvent.train_begin) - - def on_train_end(self): - self._log_event(LogEvent.train_end) - - def on_epoch_begin(self, epoch: int): - epoch_info = dict(epoch=epoch) - self._log_event(LogEvent.epoch_begin, epoch_info) - - def on_epoch_end(self, epoch: int): - epoch_info = dict(epoch=epoch) - self._log_event(LogEvent.epoch_end, epoch_info) - - def on_step_begin(self, step: int): - pass - - def on_step_end(self, step: int, result: dict=None): - if (self.log_freq <= 0 or step % self.log_freq != 0) and step != 1: - return - if result is None: - step_info = dict() - else: - step_info = copy.copy(result) - - step_info['step'] = step - self._log_event(LogEvent.step_end, step_info) - - def _log_event(self, event, *args, **kwargs): - self.logger.log(event, stacklevel=STACKLEVEL, *args, **kwargs) - diff --git a/cv/detection/ssd/pytorch/base/train/trainer.py b/cv/detection/ssd/pytorch/base/train/trainer.py deleted file mode 100644 index e4739b26e..000000000 --- a/cv/detection/ssd/pytorch/base/train/trainer.py +++ /dev/null @@ -1,229 +0,0 @@ -import time -import os -import sys -from bisect import bisect -from typing import Union - -import numpy as np -import torch -from torch.cuda.amp import GradScaler - -import utils -from model import create_model -from train.evaluator import Evaluator -from train.training_state import TrainingState -from train.event import TrainingEventCompose as TrainingEvent -from box_coder import dboxes300_coco - -from model.losses import OptLoss, Loss - - -Device = Union[torch.device, str, None] - - -def lr_warmup(optim, warmup_iter, iter_num, base_lr, args): - if iter_num < warmup_iter: - warmup_step = base_lr / (warmup_iter * (2 ** args.warmup_factor)) - new_lr = base_lr - (warmup_iter - iter_num) * warmup_step - for param_group in optim.param_groups: - param_group['lr'] = new_lr - return new_lr - else: - return base_lr - - -class Trainer(object): - - def __init__(self, config, training_event: TrainingEvent, - evaluator: Evaluator, - training_state: TrainingState, - grad_scaler: GradScaler, - device: Device): - super(Trainer, self).__init__() - self.config = config - self.training_event = training_event - self.training_state = training_state - self.grad_scaler = grad_scaler - - self.device = device - self.optimizer = None - self.model = None - self.evaluator = evaluator - self.success = torch.zeros(1).cuda() - dboxes = dboxes300_coco() - if self.config.dali: - self.loss_fun = OptLoss().cuda() - else: - self.loss_fun = Loss(dboxes).cuda() - - def init(self): - self.model = create_model(self.config) - 
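The lr_warmup helper in the removed trainer above ramps the learning rate linearly: the step size is base_lr / (warmup_iter * 2**warmup_factor), so with warmup_factor 0 the rate starts at zero and reaches base_lr exactly at warmup_iter. A small worked example with illustrative numbers:

    def warmup_lr(base_lr, warmup_iter, iter_num, warmup_factor=0):
        # Mirrors the linear ramp in lr_warmup(): past warmup_iter the base rate is used as-is.
        if iter_num >= warmup_iter:
            return base_lr
        step = base_lr / (warmup_iter * (2 ** warmup_factor))
        return base_lr - (warmup_iter - iter_num) * step

    # Illustrative values: base_lr=2.5e-3, 650 warmup iterations, warmup_factor=0.
    for it in (0, 325, 650):
        print(it, warmup_lr(2.5e-3, 650, it))   # 0.0, 1.25e-3, 2.5e-3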
self.model = self.training_event.convert_model(self.model) - self.optimizer = self.training_event.create_optimizer(self.model) - self.model, self.optimizer = self.training_event.model_to_fp16(self.model, self.optimizer) - self.model = self.training_event.model_to_ddp(self.model) - # self.training_state.base_lr = self.optimizer.param_groups[0]['lr'] - self.training_state.base_lr = self.optimizer.defaults['lr'] - if utils.is_main_process(): - print("==="*20) - print("config lr: {}, optimizer lr: {}".format(self.config.learning_rate, self.training_state.base_lr)) - print("==="*20) - - self._init_model() - self.model.train() - self._verify_model() - - def _init_model(self): - if self.config.checkpoint: - checkpoint = torch.load(self.config.checkpoint, map_location="cpu") - self.training_event.load_checkpoint(checkpoint) - self.training_state.iter_num = self.config.iteration - - def _verify_model(self): - input_c = 4 if self.config.pad_input else 3 - if self.config.nhwc: - example_shape = [self.config.train_batch_size, 300, 300, input_c] - else: - example_shape = [self.config.train_batch_size, input_c, 300, 300] - example_input = torch.randn(*example_shape).cuda() - if self.config.fp16: - example_input = example_input.half() - if self.config.jit: - # DDP has some Python-side control flow. If we JIT the entire DDP-wrapped module, - # the resulting ScriptModule will elide this control flow, resulting in allreduce - # hooks not being called. If we're running distributed, we need to extract and JIT - # the wrapped .module. - # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks - # to go out of scope, and therefore silently disappear. - module_to_jit = self.model.module if self.config.distributed else self.model - if self.config.distributed: - self.model.module = torch.jit.trace(module_to_jit, example_input, check_trace=False) - else: - self.model = torch.jit.trace(module_to_jit, example_input, check_trace=False) - ploc, plabel = self.model(example_input) - loss = ploc[0, 0, 0] + plabel[0, 0, 0] - dloss = torch.randn_like(loss) - # Cause cudnnFind for dgrad, wgrad to run - loss.backward(dloss) - for p in self.model.parameters(): - p.grad = None - - def train_one_epoch(self, train_dataloader): - if self.training_state.epoch in self.config.lr_decay_epochs: - self.training_state.base_lr *= self.config.lr_decay_factor - print(self.config.local_rank, "base_lr decay step #" + str(bisect(self.config.lr_decay_epochs, self.training_state.epoch))) - for param_group in self.optimizer.param_groups: - param_group['lr'] = self.training_state.base_lr - if self.training_state.epoch <= self.config.epoch: - print("Start continue training from epoch: {}, iter: {}, skip epoch {}".format(self.config.epoch, self.config.iteration, self.training_state.epoch)) - return - - self.training_event.on_epoch_begin(self.training_state.epoch) - step_start_time = time.time() - for batch in train_dataloader: - # print([len(x) for x in batch]) - img, bbox, label = batch - if not self.config.dali: - img = img.cuda() - bbox = bbox.cuda() - label = label.cuda() - - self.training_state.lr = lr_warmup(self.optimizer, self.config.warmup, self.training_state.iter_num, self.training_state.base_lr, self.config) - if (img is None) or (bbox is None) or (label is None): - print("No labels in batch") - continue - self.training_event.on_step_begin(self.training_state.iter_num) - self.train_one_step(img, bbox, label) - - other_state = dict() - if self.training_state.iter_num % 
self.config.gradient_accumulation_steps == 0: - step_end_time = time.time() - step_total_time = step_end_time - step_start_time - fps = (utils.global_batch_size(self.config) * self.config.gradient_accumulation_steps) / step_total_time - other_state["avg_samples/s"] = fps - step_start_time = step_end_time - - step_info = self.training_state.to_dict(**other_state) - self.training_event.on_step_end(self.training_state.iter_num, result=step_info) - self.training_state.iter_num += 1 - - if self.config.dali: - train_dataloader.reset() - if self.training_state.epoch in self.config.evaluation: - if self.config.distributed: - world_size = float(utils.get_world_size()) - for bn_name, bn_buf in self.model.module.named_buffers(recurse=True): - if ('running_mean' in bn_name) or ('running_var' in bn_name): - torch.distributed.all_reduce(bn_buf, op=torch.distributed.ReduceOp.SUM) - bn_buf /= world_size - - eval_start = time.time() - self.training_state.eval_ap = self.evaluator.evaluate(self) - eval_end = time.time() - eval_result = dict(epoch=self.training_state.epoch, - eval_ap=self.training_state.eval_ap, - time=eval_end - eval_start) - self.training_event.on_evaluate(eval_result) - if utils.is_main_process(): - if self.config.save_checkpoint: - print("saving model...") - if not os.path.isdir(self.config.output): - os.mkdir(self.config.output) - self.training_event.save_checkpoint(self.config.output, self.training_state) - self.detect_training_status(self.training_state) - if self.training_state.converged: - self.success = torch.ones(1).cuda() - if self.config.distributed: - torch.distributed.broadcast(self.success, 0) - if self.success[0]: - print("Process {} train success!".format(self.config.local_rank)) - self.training_state.end_training = True - self.training_event.on_epoch_end(self.training_state.epoch) - - def train_one_step(self, img, bbox, label): - self.training_state.loss, _, _ = self.forward(img, bbox, label) - if self.training_state.epoch == self.config.epoch + 1 and self.training_state.iter_num == self.config.iteration: - self.training_state.avg_loss = self.training_state.loss.item() - else: - if np.isfinite(self.training_state.loss.item()): - self.training_state.avg_loss = 0.999 * self.training_state.avg_loss + 0.001 * self.training_state.loss.item() - else: - print("model exploded (corrupted by Inf or Nan)") - sys.exit() - self.training_event.on_backward(self.training_state.iter_num, self.training_state.loss, self.optimizer, self.grad_scaler) - - def forward(self, img, bbox, label, training=True): - # origin input shape is: (bs, 3, 300, 300) - # using dali and nhwc input shape is: (bs, 300, 300, 4) - ploc, plabel = self.model(img) - ploc, plabel = ploc.float(), plabel.float() - if training: - N = img.shape[0] - bbox.requires_grad = False - label.requires_grad = False - # reshape (N*8732X4 -> Nx8732x4) and transpose (Nx8732x4 -> Nx4x8732) - bbox = bbox.view(N, -1, 4).transpose(1, 2).contiguous() - # reshape (N*8732 -> Nx8732) and cast to Long - label = label.view(N, -1).long() - - # torch.save({ - # "ploc": ploc, - # "plabel": plabel, - # "gloc": bbox, - # "glabel": label, - # }, "loss.pth_{}".format(self.config.local_rank)) - # exit() - loss = self.loss_fun(ploc, plabel, bbox, label) - return loss, None, None - else: - return None, ploc, plabel - - def inference(self, img): - return self.forward(img, None, None, False) - - def detect_training_status(self, training_state: TrainingState): - if training_state.eval_ap >= self.config.threshold: - training_state.converged_success() - if 
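train_one_step above keeps a smoothed loss as an exponential moving average with a 0.999/0.001 split and aborts on a non-finite loss rather than letting it poison the average. A compact sketch of that bookkeeping:

    import math
    import sys

    def update_avg_loss(avg_loss, loss_value, first_step=False):
        if first_step:
            return loss_value                 # the first recorded loss seeds the average
        if not math.isfinite(loss_value):
            print("model exploded (corrupted by Inf or Nan)")
            sys.exit(1)
        return 0.999 * avg_loss + 0.001 * loss_value

    print(update_avg_loss(0.0, 12.5, first_step=True))   # 12.5
    print(update_avg_loss(12.5, 10.0))                   # 12.4975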
training_state.converged or training_state.epoch >= self.config.epochs: - training_state.end_training = True - return training_state.end_training \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/train/training_state.py b/cv/detection/ssd/pytorch/base/train/training_state.py deleted file mode 100644 index a33cb19d0..000000000 --- a/cv/detection/ssd/pytorch/base/train/training_state.py +++ /dev/null @@ -1,53 +0,0 @@ - -import torch -import utils - - -class TrainingState: - _trainer = None - _status = 'aborted' # later set to 'success' if termination criteria met - - iter_num = 0 - - loss: float = 0.0 - avg_loss: float = 0.0 - base_lr: float = 0.0 - lr: float = 0.0 - - epoch: int = 0 - end_training: bool = False - converged: bool = False - - eval_ap = 0 - - init_time = 0 - raw_train_time = 0 - - def status(self): - if self.converged: - self._status = "success" - return self._status - - def converged_success(self): - self.end_training = True - self.converged = True - - def to_dict(self, **kwargs): - state_dict = dict() - - for var_name, value in self.__dict__.items(): - if not var_name.startswith("_") and utils.is_property(value): - state_dict[var_name] = value - - exclude = ["eval_ap", "converged", "init_time", "raw_train_time"] - for exkey in exclude: - if exkey in state_dict: - state_dict.pop(exkey) - - state_dict.update(kwargs) - - for k in state_dict.keys(): - if torch.is_tensor(state_dict[k]): - state_dict[k] = state_dict[k].item() - - return state_dict diff --git a/cv/detection/ssd/pytorch/base/utils/__init__.py b/cv/detection/ssd/pytorch/base/utils/__init__.py deleted file mode 100644 index 9c0ed2c68..000000000 --- a/cv/detection/ssd/pytorch/base/utils/__init__.py +++ /dev/null @@ -1,17 +0,0 @@ -import inspect - -from .check import check_config -from .dist import * - -def is_property(value): - status = [ - not callable(value), - not inspect.isclass(value), - not inspect.ismodule(value), - not inspect.ismethod(value), - not inspect.isfunction(value), - not inspect.isbuiltin(value), - "classmethod object" not in str(value) - ] - - return all(status) \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/base/utils/check.py b/cv/detection/ssd/pytorch/base/utils/check.py deleted file mode 100644 index 9b5f9ea19..000000000 --- a/cv/detection/ssd/pytorch/base/utils/check.py +++ /dev/null @@ -1,72 +0,0 @@ -import os - -import torch - - -def get_config_arg(config, name): - if hasattr(config, name): - value = getattr(config, name) - if value is not None: - return value - - if name in os.environ: - return os.environ[name] - - return None - - -def check_config(config): - print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - config.device, config.n_gpu, config.local_rank != -1, config.fp16)) - - if config.gradient_accumulation_steps < 1: - raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( - config.gradient_accumulation_steps)) - - # if config.fp16: - # assert config.opt_level == 2 - - # nhwc can only be used with fp16 - if config.nhwc: - assert config.fp16 - - # input padding can only be used with NHWC - if config.pad_input: - assert config.nhwc - - # no dali can only be used with NCHW and no padding - if not config.dali: - assert (not config.nhwc) - assert (not config.pad_input) - assert (not config.use_nvjpeg) - assert (not config.dali_cache) - - if config.dali_cache > 0: - assert config.use_nvjpeg - - if config.jit: - assert config.nhwc # jit can not be applied with apex::syncbn used for 
non-nhwc - - -# Check that the run is valid for specified group BN arg -def validate_group_bn(bn_groups): - if torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size() - else: - world_size = 1 - - # Can't have larger group than ranks - assert(bn_groups <= world_size) - - # must have only complete groups - assert(world_size % bn_groups == 0) - - - - - - - - - - diff --git a/cv/detection/ssd/pytorch/base/utils/dist.py b/cv/detection/ssd/pytorch/base/utils/dist.py deleted file mode 100644 index e3549c125..000000000 --- a/cv/detection/ssd/pytorch/base/utils/dist.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -import numpy as np -import torch -import torch.distributed as dist - -from contextlib import contextmanager -import random - -from .check import validate_group_bn - - -def generate_seeds(rng, size): - """ - Generate list of random seeds - - :param rng: random number generator - :param size: length of the returned list - """ - seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)] - return seeds - - -def broadcast_seeds(seeds, device): - """ - Broadcasts random seeds to all distributed workers. - Returns list of random seeds (broadcasted from workers with rank 0). - - :param seeds: list of seeds (integers) - :param device: torch.device - """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - seeds_tensor = torch.LongTensor(seeds).to(device) - torch.distributed.broadcast(seeds_tensor, 0) - seeds = seeds_tensor.tolist() - return seeds - - -def setup_seeds(config): - torch.cuda.set_device(config.local_rank) - config.local_seed = (config.seed + get_rank()) % 2**32 - print(get_rank(), "Using seed = {}".format(config.local_seed)) - random.seed(config.local_seed) - torch.manual_seed(config.local_seed) - np.random.seed(seed=config.local_seed) - return - - -def barrier(): - """ - Works as a temporary distributed barrier, currently pytorch - doesn't implement barrier for NCCL backend. - Calls all_reduce on dummy tensor and synchronizes with GPU. - """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) - torch.cuda.synchronize() - - -def get_rank(default=0): - """ - Gets distributed rank or returns zero if distributed is not initialized. - """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - rank = torch.distributed.get_rank() - else: - rank = default - return rank - - -def get_world_size(): - """ - Gets total number of distributed workers or returns one if distributed is - not initialized. 
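setup_seeds in dist.py derives a per-rank seed from the shared base seed, so every worker draws different augmentation randomness while runs stay reproducible. A short sketch of that convention:

    import random
    import numpy as np
    import torch

    def seed_worker(base_seed: int, rank: int) -> int:
        # Same convention as setup_seeds(): offset the shared seed by the rank, wrap to 32 bits.
        local_seed = (base_seed + rank) % 2**32
        random.seed(local_seed)
        torch.manual_seed(local_seed)
        np.random.seed(local_seed)
        return local_seed

    print(seed_worker(1234, rank=0), seed_worker(1234, rank=1))   # 1234 1235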
- """ - if torch.distributed.is_available() and torch.distributed.is_initialized(): - world_size = torch.distributed.get_world_size() - else: - world_size = 1 - return world_size - - -def main_proc_print(*args, **kwargs): - if is_main_process(): - print(*args, **kwargs) - - -def set_device(cuda, local_rank): - """ - Sets device based on local_rank and returns instance of torch.device. - - :param cuda: if True: use cuda - :param local_rank: local rank of the worker - """ - if cuda: - torch.cuda.set_device(local_rank) - device = torch.device('cuda') - else: - device = torch.device('cpu') - return device - - -def init_dist_training_env(config): - if config.distributed is False: - device = torch.device("cuda") - num_gpus = 1 - else: - torch.cuda.set_device(config.local_rank) - device = torch.device("cuda", config.local_rank) - host_addr_full = 'tcp://' + os.environ["MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"] - rank = int(os.environ["RANK"]) - world_size = int(os.environ["WORLD_SIZE"]) - torch.distributed.init_process_group(backend=config.dist_backend, init_method=host_addr_full, rank=rank, world_size=world_size) - num_gpus = torch.distributed.get_world_size() - validate_group_bn(config.bn_group) - return device, num_gpus - - -def global_batch_size(config): - return config.train_batch_size * config.n_gpu - - -@contextmanager -def sync_workers(): - """ - Yields distributed rank and synchronizes all workers on exit. - """ - rank = get_rank() - yield rank - barrier() - - -def is_main_process(): - if dist.is_initialized(): - if "LOCAL_RANK" in os.environ: - return int(os.environ["LOCAL_RANK"]) == 0 - else: - return get_rank() == 0 - - return True - - -def format_step(step): - if isinstance(step, str): - return step - s = "" - if len(step) > 0: - s += "Training Epoch: {} ".format(step[0]) - if len(step) > 1: - s += "Training Iteration: {} ".format(step[1]) - if len(step) > 2: - s += "Validation Iteration: {} ".format(step[2]) - return s diff --git a/cv/detection/ssd/pytorch/base/utils/logging.py b/cv/detection/ssd/pytorch/base/utils/logging.py deleted file mode 100644 index 2f576619b..000000000 --- a/cv/detection/ssd/pytorch/base/utils/logging.py +++ /dev/null @@ -1,235 +0,0 @@ -import os -import sys -import time -import logging -import json -from logging import currentframe -from typing import NamedTuple, Union, Tuple, Optional -from collections import OrderedDict - -from enum import IntEnum - - -_srcfile = os.path.normcase(logging.addLevelName.__code__.co_filename) - - -class LogKeys: - default_logger_name = "PerfLogger" - - # Log format - log_header = "PerfLog" - log_template = "[{header}] {message}" - - # Submitted info - submmiter: str = "submmiter" - model: str = "model" - optimizer_type: str = "optimizer_type" - config: str = "config" - config_path: str = "config_path" - - # Event - event: str = "event" - value: str = "value" - - # Metadata - metadata: str = "metadata" - called_log_file = "file" - called_log_file_lineno = "lineno" - time_ms = "time_ms" - rank = "rank" - - # Other message - other_message: str = "other" - - -class PerfLogLevel(IntEnum): - - INFO = 100 - SUBMITTION = 101 - - @staticmethod - def from_string(level: str): - return PerfLogLevel.__dict__[level.upper()] - - @classmethod - def register_to_logging(cls, logging): - for level_name, level in PerfLogLevel.__dict__.items(): - if isinstance(level, cls): - logging.addLevelName(level.value, level_name) - - -PerfLogLevel.register_to_logging(logging) - - -class LogEventField(NamedTuple): - - name: str - rank: Union[int, 
list] = -1 - level: PerfLogLevel = PerfLogLevel.SUBMITTION - - -class LogEvent: - - submitted_info = LogEventField("SUBMITTED_INFO", rank=0) - launch_training = LogEventField("LAUNCH_TRAINING") - convert_model = LogEventField("CONVERT_MODEL", rank=0) - create_optimizer = LogEventField("CREATE_OPTIMIZER", rank=0) - model_to_fp16 = LogEventField("MODEL_TO_FP16", rank=0) - model_to_ddp = LogEventField("MODEL_TO_DDP", rank=0) - init_start = LogEventField("INIT_START", rank=0) - init_end = LogEventField("INIT_END", rank=0) - train_begin = LogEventField("TRAIN_BEGIN", rank=0) - train_end = LogEventField("TRAIN_END", rank=0) - epoch_begin = LogEventField("EPOCH_BEGIN", rank=0, level=PerfLogLevel.INFO) - epoch_end = LogEventField("EPOCH_END", rank=0, level=PerfLogLevel.INFO) - step_begin = LogEventField("STEP_BEGIN", rank=0, level=PerfLogLevel.INFO) - step_end = LogEventField("STEP_END", rank=0, level=PerfLogLevel.INFO) - init_evaluation = LogEventField("INIT_EVALUATION", rank=0) - evaluation = LogEventField("EVALUATION", rank=0) - finished = LogEventField("FINISHED", rank=0) - - @staticmethod - def from_string(key: str): - return LogEvent.__dict__[key.lower()] - - -class PerfLogger: - - _singleton = None - - def __init__(self, rank: int, - level: Union[str, PerfLogLevel]=PerfLogLevel.SUBMITTION, - logger: logging.Logger=None): - self.rank = rank - - if isinstance(level, str): - level = PerfLogLevel.from_string(level) - self.level = level - - if logger is None: - logger = logging.Logger(LogKeys.default_logger_name) - - self.logger = logger - - self.previous_log_time = None - - @property - def _current_time_ms(self): - current = int(time.time() * 1e3) - self.previous_log_time = current - return current - - def init_logger(self, submitter: str, model: str, config_path: str, config: dict, *args, **kwargs): - message = { - LogKeys.submmiter: submitter, - LogKeys.model: model, - LogKeys.config_path: config_path, - LogKeys.config: config - } - - self.log(LogEvent.submitted_info, message, *args, **kwargs) - - - def log(self, event: Union[str, LogEventField], message: Optional[Union[str, dict]]=None, *args, **kwargs): - if isinstance(event, str): - event = LogEvent.from_string(event) - - show_log = any([ - event.rank == 0 and self.rank == 0, - event.rank == -1, - ]) and any([ - event.level == PerfLogLevel.SUBMITTION, - event.level == self.level - ]) - - if not show_log: - return - - stacklevel = 1 - if "stacklevel" in kwargs: - stacklevel = kwargs.pop("stacklevel") - - call_info = self.get_caller(stacklevel=stacklevel) - - message = self._encode_message(event, message, call_info) - self.logger.log(self.level.value, message, *args, **kwargs) - - def _encode_message(self, event: LogEventField, - message: Union[str, dict], - call_info: Tuple[str, int]) -> str: - if isinstance(message, str): - message ={LogKeys.other_message: message} - message = OrderedDict({ - LogKeys.event: event.name, - LogKeys.value: message - }) - called_file, lineno = call_info - metadata = { - LogKeys.called_log_file: called_file, - LogKeys.called_log_file_lineno: lineno, - LogKeys.time_ms: self._current_time_ms, - LogKeys.rank: self.rank - } - - message[LogKeys.metadata] = metadata - message = json.dumps(message) - - return self._log_template(message) - - def _log_template(self, message: str): - return LogKeys.log_template.format(header=LogKeys.log_header, message=message) - - def get_caller(self, stacklevel=1) -> Tuple[str, int]: - f = currentframe() - - if stacklevel == 0: - default_file_name = f.f_code.co_filename - default_lineno 
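Each record the PerfLogger emits is a single "[PerfLog] {json}" line: the event name, a value dict, and metadata carrying the calling file, line number, timestamp in milliseconds, and rank. A sketch of that line shape with field names taken from LogKeys above (the helper name is illustrative):

    import json
    import time

    def perf_log_line(event, value, rank, path, lineno):
        payload = {
            "event": event,
            "value": value,
            "metadata": {
                "file": path,
                "lineno": lineno,
                "time_ms": int(time.time() * 1e3),
                "rank": rank,
            },
        }
        return "[PerfLog] " + json.dumps(payload)

    print(perf_log_line("EPOCH_END", {"epoch": 3}, rank=0, path="trainer.py", lineno=42))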
= f.f_lineno - return (default_file_name, default_lineno) - - # On some versions of IronPython, currentframe() returns None if - # IronPython isn't run with -X:Frames. - if f is not None: - f = f.f_back - orig_f = f - while f and stacklevel > 1: - f = f.f_back - stacklevel -= 1 - if not f: - f = orig_f - rv = ("(unknown file)", -1) - - while hasattr(f, "f_code"): - co = f.f_code - filename = os.path.normcase(co.co_filename) - if filename == _srcfile: - f = f.f_back - continue - rv = (co.co_filename, f.f_lineno) - break - return rv - - - @classmethod - def get_default_logger(cls, rank: int=-1, - level: Union[str, PerfLogLevel]=PerfLogLevel.SUBMITTION, - logger: logging.Logger=None): - if cls._singleton is None: - cls._singleton = cls(rank=rank, level=level, logger=logger) - - return cls._singleton - - - - - - - - - - - - - - - diff --git a/cv/detection/ssd/pytorch/base/utils/paths.py b/cv/detection/ssd/pytorch/base/utils/paths.py deleted file mode 100644 index 37691d743..000000000 --- a/cv/detection/ssd/pytorch/base/utils/paths.py +++ /dev/null @@ -1,18 +0,0 @@ -import os.path as ospath - - -MODEL_DIR = ospath.abspath( - ospath.join( - __file__, - "../../../../" - ) -) - -CURRENT_MODEL_NAME = ospath.basename(MODEL_DIR) - -PROJ_DIR = ospath.abspath( - ospath.join( - MODEL_DIR, - "pytorch" - ) -) diff --git a/cv/detection/ssd/pytorch/base/model/losses/loss.py b/cv/detection/ssd/pytorch/base_model.py similarity index 100% rename from cv/detection/ssd/pytorch/base/model/losses/loss.py rename to cv/detection/ssd/pytorch/base_model.py index ae2db5d13..c177b5c0c 100644 --- a/cv/detection/ssd/pytorch/base/model/losses/loss.py +++ b/cv/detection/ssd/pytorch/base_model.py @@ -1,5 +1,5 @@ -import torch.nn as nn import torch +import torch.nn as nn class Loss(nn.Module): diff --git a/cv/detection/ssd/pytorch/bind.sh b/cv/detection/ssd/pytorch/bind.sh new file mode 100644 index 000000000..dd100a3de --- /dev/null +++ b/cv/detection/ssd/pytorch/bind.sh @@ -0,0 +1,212 @@ +#! /bin/bash +set -euo pipefail + +print_usage() { + cat << EOF +${0} [options] [--] COMMAND [ARG...] + +Control binding policy for each task. Assumes one rank will be launched for each GPU. + +Options: + --cpu=MODE + * exclusive -- bind each rank to an exclusive set of cores near its GPU + * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading + * node -- bind each rank to all cores in the NUMA node nearest its GPU [default] + * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file + * off -- don't bind + --mem=MODE + * node -- bind each rank to the nearest NUMA node [default] + * *.sh -- bind each rank using the bash associative array bind_mem from a file + * off -- don't bind + --ib=MODE + * single -- bind each rank to a single IB device near its GPU + * off -- don't bind [default] + --cluster=CLUSTER + Select which cluster is being used. May be required if system params cannot be detected. 
+EOF +} + +################################################################################ +# Argument parsing +################################################################################ + +cpu_mode='node' +mem_mode='node' +ib_mode='off' +cluster='' +while [ $# -gt 0 ]; do + case "$1" in + -h|--help) print_usage ; exit 0 ;; + --cpu=*) cpu_mode="${1/*=/}"; shift ;; + --cpu) cpu_mode="$2"; shift 2 ;; + --mem=*) mem_mode="${1/*=/}"; shift ;; + --mem) mem_mode="$2"; shift 2 ;; + --ib=*) ib_mode="${1/*=/}"; shift ;; + --ib) ib_mode="$2"; shift 2 ;; + --cluster=*) cluster="${1/*=/}"; shift ;; + --cluster) cluster="$2"; shift 2 ;; + --) shift; break ;; + *) break ;; + esac +done +if [ $# -lt 1 ]; then + echo 'ERROR: no command given' 2>&1 + print_usage + exit 1 +fi + +################################################################################ +# Get system params +################################################################################ + +# LOCAL_RANK is set with an enroot hook for Pytorch containers +# SLURM_LOCALID is set by Slurm +# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun +readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}" +if [ -z "${local_rank}" ]; then + echo 'ERROR: cannot read LOCAL_RANK from env' >&2 + exit 1 +fi + +num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits) +if [ "${local_rank}" -ge "${num_gpus}" ]; then + echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2 + exit 1 +fi + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} +lscpu_out=$(lscpu) +num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}") +num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}") +cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}" + +readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes )) +if [ ${num_gpus} -gt 1 ]; then + readonly gpus_per_node=$(( num_gpus / num_nodes )) +else + readonly gpus_per_node=1 +fi +readonly cores_per_gpu=$(( cores_per_node / gpus_per_node )) +readonly local_node=$(( local_rank / gpus_per_node )) + + +declare -a ibdevs=() +case "${cluster}" in + circe) + # Need to specialize for circe because IB detection is hard + ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10) + ;; + selene) + # Need to specialize for selene because IB detection is hard + ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9) + ;; + '') + if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then + mapfile -t ibdevs <<< "${ibstat_out}" + fi + ;; + *) + echo "ERROR: Unknown cluster '${cluster}'" >&2 + exit 1 + ;; +esac +readonly num_ibdevs="${#ibdevs[@]}" + +################################################################################ +# Setup for exec +################################################################################ + +declare -a numactl_args=() + +case "${cpu_mode}" in + exclusive) + numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \ + $(( local_rank * cores_per_gpu )) \ + $(( (local_rank + 1) * cores_per_gpu - 1 )) \ + $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \ + $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \ + )" ) + ;; + exclusive,nosmt) + numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \ + $(( local_rank * cores_per_gpu )) \ + 
$(( (local_rank + 1) * cores_per_gpu - 1 )) \ + )" ) + ;; + node) + numactl_args+=( "--cpunodebind=${local_node}" ) + ;; + *.sh) + source "${cpu_mode}" + if [ -n "${bind_cpu_cores:-}" ]; then + numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" ) + elif [ -n "${bind_cpu_nodes:-}" ]; then + numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" ) + else + echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2 + exit 1 + fi + ;; + off|'') + ;; + *) + echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +case "${mem_mode}" in + node) + numactl_args+=( "--membind=${local_node}" ) + ;; + *.sh) + source "${mem_mode}" + if [ -z "${bind_mem:-}" ]; then + echo "ERROR: invalid memory affinity file ${mem_mode}." >&2 + exit 1 + fi + numactl_args+=( "--membind=${bind_mem[${local_rank}]}" ) + ;; + off|'') + ;; + *) + echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +case "${ib_mode}" in + single) + if [ "${num_ibdevs}" -eq 0 ]; then + echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1 + else + readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}" + export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}" + export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}" + fi + ;; + off|'') + ;; + *) + echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +################################################################################ +# Exec +################################################################################ + +if [ "${#numactl_args[@]}" -gt 0 ] ; then + set -x + exec numactl "${numactl_args[@]}" -- "${@}" +else + exec "${@}" +fi diff --git a/cv/detection/ssd/pytorch/base/bind_launch.py b/cv/detection/ssd/pytorch/bind_launch.py similarity index 75% rename from cv/detection/ssd/pytorch/base/bind_launch.py rename to cv/detection/ssd/pytorch/bind_launch.py index f1fb7423e..0e985a8f2 100644 --- a/cv/detection/ssd/pytorch/base/bind_launch.py +++ b/cv/detection/ssd/pytorch/bind_launch.py @@ -1,32 +1,17 @@ import sys import subprocess import os -import os.path as ospath -from argparse import ArgumentParser +import socket +from argparse import ArgumentParser, REMAINDER - -MODEL_DIR = ospath.abspath( - ospath.join( - __file__, - "../../../" - ) -) - -MODEL = ospath.basename(MODEL_DIR) - -PROJ_DIR = ospath.abspath( - ospath.join( - MODEL_DIR, - "pytorch" - ) -) - - -def _parse_known_args(parser, *args, **kwargs): - return parser.parse_known_args(*args, **kwargs) +import torch def parse_args(): + """ + Helper function parsing the command line options + @retval ArgumentParser + """ parser = ArgumentParser(description="PyTorch distributed training launch " "helper utilty that will spawn up " "multiple distributed processes") @@ -70,35 +55,16 @@ def parse_args(): "followed by all the arguments for the " "training script") - parser.add_argument("--config", type=str, required=True) - parser.add_argument("--name", type=str, required=True) - - args, training_script_args = _parse_known_args(parser) - args.training_script_args = training_script_args - - return args - - -def get_cuda_visible_devices(gpus=1): - if "CUDA_VISIBLE_DEVICES" in os.environ: - return os.environ['CUDA_VISIBLE_DEVICES'] - return ','.join([str(gpu_id) for gpu_id in range(gpus)]) - + # rest from the training program + parser.add_argument('training_script_args', nargs=REMAINDER) + return parser.parse_args() def main(): args = 
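The "exclusive" CPU mode in bind.sh above gives local rank r the physical cores [r*cores_per_gpu, (r+1)*cores_per_gpu - 1] closest to its GPU and, when SMT is kept, the sibling hardware threads, which the script assumes are numbered after all physical cores. A small Python sketch of that arithmetic (illustrative function name):

    def exclusive_cpu_range(local_rank, cores_per_gpu, gpus_per_node, num_nodes, smt=True):
        lo = local_rank * cores_per_gpu
        hi = (local_rank + 1) * cores_per_gpu - 1
        if not smt:
            return "{}-{}".format(lo, hi)
        # SMT siblings start after the last physical core under typical Linux numbering.
        offset = cores_per_gpu * gpus_per_node * num_nodes
        return "{}-{},{}-{}".format(lo, hi, lo + offset, hi + offset)

    # Example: one node, 8 GPUs, 16 physical cores per GPU.
    print(exclusive_cpu_range(0, 16, 8, 1))   # 0-15,128-143
    print(exclusive_cpu_range(1, 16, 8, 1))   # 16-31,144-159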
parse_args() - config_full_name = f"config_{args.config}.py" - config_path = ospath.join(PROJ_DIR, args.name, "config", config_full_name) - - _, args.nnodes, args.nproc_per_node = args.config.split("x") - - args.nnodes = int(args.nnodes) - args.nproc_per_node = int(args.nproc_per_node) # variables for numactrl binding - NSOCKETS = args.nsockets_per_node - NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0) + NGPUS_PER_SOCKET = args.nproc_per_node // args.nsockets_per_node NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET NCORES_PER_GPU_REMAIN=args.ncores_per_socket - NGPUS_PER_SOCKET*NCORES_PER_GPU @@ -110,8 +76,6 @@ def main(): current_env["MASTER_ADDR"] = args.master_addr current_env["MASTER_PORT"] = str(args.master_port) current_env["WORLD_SIZE"] = str(dist_world_size) - current_env["NODE_RANK"] = str(args.node_rank) - current_env["CUDA_VISIBLE_DEVICES"] = get_cuda_visible_devices(args.nproc_per_node) processes = [] @@ -119,7 +83,6 @@ def main(): # each process's rank dist_rank = args.nproc_per_node * args.node_rank + local_rank current_env["RANK"] = str(dist_rank) - current_env["LOCAL_RANK"] = str(local_rank) # Instead of binding to a set of cores which this task has exclusive access to, # bind to all cores on the local NUMA node (may share them with other ranks) @@ -150,17 +113,18 @@ def main(): args.training_script, "--local_rank={}".format(local_rank) ] \ - + args.training_script_args + [f"{config_path}"] + + args.training_script_args - print("=" * 80) - print("= numactlargs_flag") - print(cmd) - print("=" * 80) process = subprocess.Popen(cmd, env=current_env) processes.append(process) + proc_status = [] + for process in processes: process.wait() + proc_status.append(process.returncode != 0) + + exit(all(proc_status)) if __name__ == "__main__": diff --git a/cv/detection/ssd/pytorch/base/box_coder.py b/cv/detection/ssd/pytorch/box_coder.py similarity index 76% rename from cv/detection/ssd/pytorch/base/box_coder.py rename to cv/detection/ssd/pytorch/box_coder.py index d4b7e6871..1f6722ad3 100644 --- a/cv/detection/ssd/pytorch/base/box_coder.py +++ b/cv/detection/ssd/pytorch/box_coder.py @@ -2,12 +2,38 @@ import random import torch import torch.nn.functional as F +from SSD import _C as C import numpy as np import itertools from math import sqrt +def generate_bbox_stacks(bboxes): + offsets = [0] + + for bbox in bboxes: + offsets.append(bbox.shape[0] + offsets[-1]) + + offsets = torch.tensor(np.array(offsets).astype(np.int32)).cuda() + + return len(bboxes), torch.cat(bboxes), offsets, bboxes + + +def load_bboxes(box_list, random_rows=True): + tensor_list = [] + + for b in box_list: + if random_rows: + n_rows = random.randint(1, b.shape[0]) + t = torch.tensor(np.array(b[0:n_rows, :]).astype(np.float32)).cuda() + else: + t = torch.tensor(np.array(b).astype(np.float32)).cuda() + tensor_list.append(t) + + return generate_bbox_stacks(tensor_list) + + def calc_iou_tensor(box1, box2): """ Calculation of IoU based on two boxes tensor, Reference to https://github.com/kuangliu/pytorch-ssd @@ -102,7 +128,6 @@ class DefaultBoxes(object): if order == "ltrb": return self.dboxes_ltrb if order == "xywh": return self.dboxes - # This class is from https://github.com/kuangliu/pytorch-ssd class Encoder(object): """ @@ -125,14 +150,13 @@ class Encoder(object): max_output : maximum number of output bboxes """ - def __init__(self, dboxes, fast_nms=False): + def __init__(self, dboxes): self.dboxes = dboxes(order="ltrb") 
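Both the default-box matching in Encoder.encode and the NMS decode paths reduce to pairwise IoU between two sets of ltrb boxes, which is what calc_iou_tensor above computes. A self-contained sketch of that quantity:

    import torch

    def pairwise_iou(box1: torch.Tensor, box2: torch.Tensor) -> torch.Tensor:
        # box1: (N, 4), box2: (M, 4), boxes as (left, top, right, bottom); returns (N, M).
        lt = torch.max(box1[:, None, :2], box2[None, :, :2])   # intersection top-left
        rb = torch.min(box1[:, None, 2:], box2[None, :, 2:])   # intersection bottom-right
        wh = (rb - lt).clamp(min=0)
        inter = wh[..., 0] * wh[..., 1]
        area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
        area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
        return inter / (area1[:, None] + area2[None, :] - inter)

    a = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
    b = torch.tensor([[0.5, 0.5, 1.5, 1.5], [2.0, 2.0, 3.0, 3.0]])
    print(pairwise_iou(a, b))   # roughly [[0.1429, 0.0000]]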
self.dboxes_xywh = dboxes(order="xywh").unsqueeze(dim=0) self.nboxes = self.dboxes.size(0) #print("# Bounding boxes: {}".format(self.nboxes)) self.scale_xy = dboxes.scale_xy self.scale_wh = dboxes.scale_wh - self.fast_nms = fast_nms # self.dboxes = self.dboxes.cuda() # self.dboxes_xywh = self.dboxes_xywh.cuda() @@ -140,6 +164,12 @@ class Encoder(object): def encode(self, bboxes_in, labels_in, criteria = 0.5): try: + # source boxes + # N, bboxes_cat, offsets, bboxes = load_bboxes([bboxes_in, self.dboxes]) + # # target boxes + # _, _, _, targets = load_bboxes([self.dboxes]) + # ious = C.calc_ious(N, bboxes_cat, offsets, *targets) + ious = calc_iou_tensor(bboxes_in, self.dboxes) best_dbox_ious, best_dbox_idx = ious.max(dim=0) best_bbox_ious, best_bbox_idx = ious.max(dim=1) @@ -155,6 +185,15 @@ class Encoder(object): labels_out = torch.zeros(self.nboxes, dtype=torch.long) #print(maxloc.shape, labels_in.shape, labels_out.shape) + #print("labels_out") + #print(labels_out.shape) + #print("masks") + #print(masks.shape) + #print("labels_in") + #print(labels_in.shape) + #print("best_dbox_idx") + #print(best_dbox_idx.shape) + labels_out[masks] = labels_in[best_dbox_idx[masks]] bboxes_out = self.dboxes.clone() bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :] @@ -205,7 +244,7 @@ class Encoder(object): return bboxes_in, F.softmax(scores_in, dim=-1) - def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200, nms_valid_thresh=0.05): + def decode_batch(self, bboxes_in, scores_in, criteria = 0.45, max_output=200): bboxes, probs = self.scale_back_batch(bboxes_in, scores_in) output = [] @@ -213,77 +252,13 @@ class Encoder(object): for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)): bbox = bbox.squeeze(0) prob = prob.squeeze(0) - if self.fast_nms: - output.append(self.fast_decode_single(bbox, prob, criteria, max_output, - nms_valid_thresh=nms_valid_thresh)) - else: - try: - output.append(self.decode_single(bbox, prob, criteria, max_output, - nms_valid_thresh=nms_valid_thresh)) - except: - output.append([ - torch.Tensor([]).reshape(0, 4).to(bbox.device), \ - torch.tensor([], dtype=torch.long), \ - torch.Tensor([]).to(bbox.device) - ]) - return output + output.append(self.decode_single(bbox, prob, criteria, max_output)) #print(output[-1]) return output # perform non-maximum suppression - def decode_single(self, bboxes_in, scores_in, criteria, max_output, - max_num=200, nms_valid_thresh=0.05): - # Reference to https://github.com/amdegroot/ssd.pytorch - bboxes_out = [] - scores_out = [] - labels_out = [] - - for i, score in enumerate(scores_in.split(1, 1)): - # skip background - # print(score[score>0.90]) - if i == 0: continue - # print(i) - - score = score.squeeze(1) - mask = score > nms_valid_thresh - - bboxes, score = bboxes_in[mask, :], score[mask] - if score.size(0) == 0: continue - - score_sorted, score_idx_sorted = score.sort(dim=0) - - # select max_output indices - score_idx_sorted = score_idx_sorted[-max_num:] - candidates = [] - # maxdata, maxloc = scores_in.sort() - - while score_idx_sorted.numel() > 0: - idx = score_idx_sorted[-1].item() - bboxes_sorted = bboxes[score_idx_sorted, :] - bboxes_idx = bboxes[idx, :].unsqueeze(dim=0) - iou_sorted = calc_iou_tensor(bboxes_sorted, - bboxes_idx).squeeze() - # we only need iou < criteria - score_idx_sorted = score_idx_sorted[iou_sorted < criteria] - candidates.append(idx) - - bboxes_out.append(bboxes[candidates, :]) - scores_out.append(score[candidates]) - labels_out.extend([i] * len(candidates)) - - bboxes_out, labels_out, 
scores_out = torch.cat(bboxes_out, dim=0), \ - torch.tensor(labels_out, - dtype=torch.long), \ - torch.cat(scores_out, dim=0) - - _, max_ids = scores_out.sort(dim=0) - max_ids = max_ids[-max_output:] - return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids] - - # perform non-maximum suppression - def fast_decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200, nms_valid_thresh=0.05): + def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200): # Reference to https://github.com/amdegroot/ssd.pytorch - from SSD import _C as C bboxes_out = [] scores_out = [] @@ -336,5 +311,5 @@ def dboxes300_coco(): dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) return dboxes -def build_ssd300_coder(fast_nms): - return Encoder(dboxes300_coco(), fast_nms) +def build_ssd300_coder(): + return Encoder(dboxes300_coco()) diff --git a/cv/detection/ssd/pytorch/build_ssd.sh b/cv/detection/ssd/pytorch/build_ssd.sh new file mode 100644 index 000000000..084de47ad --- /dev/null +++ b/cv/detection/ssd/pytorch/build_ssd.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +PYTHON_PATH=$(which python3) +${PYTHON_PATH} -m pip list | grep "^torch .*+corex" || { + echo "ERROR: building SSD requries the corex torch has been installed." + exit 1 +} + +a=$(pip3 show torch|awk '/Version:/ {print $NF}'); b=(${a//+/ }); c=(${b//./ }) +if [[ ${c[0]} -eq 1 ]]; then + rm -rf csrc && ln -s csrc_pt1 csrc +elif [[ ${c[0]} -eq 2 ]]; then + rm -rf csrc && ln -s csrc_pt2 csrc +else + echo "ERROR: torch version ${a} is not as expected, please check." + exit 1 +fi + + +${PYTHON_PATH} setup.py build 2>&1 | tee compile.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit +${PYTHON_PATH} setup.py bdist_wheel -d build_pip || exit +rm -rf SSD.egg-info + +# Return 0 status if all finished +exit 0 diff --git a/cv/detection/ssd/pytorch/clean_ssd.sh b/cv/detection/ssd/pytorch/clean_ssd.sh new file mode 100644 index 000000000..569634adc --- /dev/null +++ b/cv/detection/ssd/pytorch/clean_ssd.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +PYTHON_PATH=$(which python3) + +rm -rf build +${PYTHON_PATH} setup.py clean || true +rm -rf build_pip + +# Return 0 status if all finished +exit 0 diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/box_encoder_cuda.cu b/cv/detection/ssd/pytorch/csrc_pt1/box_encoder_cuda.cu similarity index 99% rename from cv/detection/ssd/pytorch/iluvatar/csrc/box_encoder_cuda.cu rename to cv/detection/ssd/pytorch/csrc_pt1/box_encoder_cuda.cu index e9b311fab..641af7ab3 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/box_encoder_cuda.cu +++ b/cv/detection/ssd/pytorch/csrc_pt1/box_encoder_cuda.cu @@ -19,14 +19,14 @@ #include #include -#include -#include +// #include +// #include #include #include #include - +#define THCudaCheck(x) C10_CUDA_CHECK(x) //#define DEBUG // calculate the IoU of a single box against another box diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/interface.cpp b/cv/detection/ssd/pytorch/csrc_pt1/interface.cpp similarity index 100% rename from cv/detection/ssd/pytorch/iluvatar/csrc/interface.cpp rename to cv/detection/ssd/pytorch/csrc_pt1/interface.cpp diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Descriptors.cpp b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/Descriptors.cpp similarity index 100% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Descriptors.cpp rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/Descriptors.cpp diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Descriptors.h b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/Descriptors.h similarity 
index 99% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Descriptors.h rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/Descriptors.h index f8c03619b..280d069a7 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Descriptors.h +++ b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/Descriptors.h @@ -21,6 +21,7 @@ #include "Exceptions.h" +#include #include #include #include diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Exceptions.h b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/Exceptions.h similarity index 100% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Exceptions.h rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/Exceptions.h diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/ParamsHash.h b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/ParamsHash.h similarity index 100% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/ParamsHash.h rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/ParamsHash.h diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/batch_norm.cu b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/batch_norm.cu similarity index 96% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/batch_norm.cu rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/batch_norm.cu index b1e88a47d..8294132bc 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/batch_norm.cu +++ b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/batch_norm.cu @@ -20,13 +20,13 @@ #include #include #include -#include +// #include #include #include -#include "THC/THC.h" - +// #include "THC/THC.h" +#include #include "Descriptors.h" #include @@ -41,19 +41,23 @@ const float BN_MIN_EPSILON = 1e-4; // tensor instead. struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size); + // data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + dataPtr = allocator.allocate(size); + data = dataPtr.get(); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; - ~Workspace() { - if (data) { - THCudaFree(at::globalContext().lazyInitCUDA(), data); - } - } - + // ~Workspace() { + // if (data) { + // THCudaFree(at::globalContext().lazyInitCUDA(), data); + // } + // } + ~Workspace() = default; size_t size; void* data; + c10::DataPtr dataPtr; }; // Return {y. save_mean, save_var, reserve} diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/conv.cpp b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/conv.cpp similarity index 98% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/conv.cpp rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/conv.cpp index d8c975570..3c9a29b17 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/conv.cpp +++ b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/conv.cpp @@ -23,8 +23,8 @@ #include #include -#include "THC/THC.h" - +// #include "THC/THC.h" +#include #include #include "Descriptors.h" // #include @@ -41,7 +41,7 @@ #include #include #include - +#define THCudaCheck(x) C10_CUDA_CHECK(x) namespace at { namespace native { namespace nhwc { // TODO: Go through all the checking code again and make sure @@ -358,19 +358,23 @@ BenchmarkCache bwd_filter_algos; // tensor instead. 
struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - data = THCudaMalloc(globalContext().lazyInitCUDA(), size); + // data = THCudaMalloc(globalContext().lazyInitCUDA(), size); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + dataPtr = allocator.allocate(size); + data = dataPtr.get(); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; - ~Workspace() { - if (data) { - THCudaFree(globalContext().lazyInitCUDA(), data); - } - } - + // ~Workspace() { + // if (data) { + // THCudaFree(globalContext().lazyInitCUDA(), data); + // } + // } + ~Workspace() = default; size_t size; void* data; + c10::DataPtr dataPtr; }; template diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/max_pool.cu b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/max_pool.cu similarity index 99% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/max_pool.cu rename to cv/detection/ssd/pytorch/csrc_pt1/nhwc/max_pool.cu index 5bd12266b..a4b39f1c4 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/max_pool.cu +++ b/cv/detection/ssd/pytorch/csrc_pt1/nhwc/max_pool.cu @@ -20,7 +20,7 @@ #include #include #include -#include +// #include #include "Descriptors.h" diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nms.cu b/cv/detection/ssd/pytorch/csrc_pt1/nms.cu similarity index 99% rename from cv/detection/ssd/pytorch/nvidia/csrc/nms.cu rename to cv/detection/ssd/pytorch/csrc_pt1/nms.cu index 3cf8bac4b..930af4313 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/nms.cu +++ b/cv/detection/ssd/pytorch/csrc_pt1/nms.cu @@ -19,13 +19,14 @@ #include #include -#include -#include +// #include +// #include #include #include #include +#define THCudaCheck(x) C10_CUDA_CHECK(x) namespace nms_internal { diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/random_horiz_flip.cu b/cv/detection/ssd/pytorch/csrc_pt1/random_horiz_flip.cu similarity index 98% rename from cv/detection/ssd/pytorch/nvidia/csrc/random_horiz_flip.cu rename to cv/detection/ssd/pytorch/csrc_pt1/random_horiz_flip.cu index ff906e1f6..68bb33b0e 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/random_horiz_flip.cu +++ b/cv/detection/ssd/pytorch/csrc_pt1/random_horiz_flip.cu @@ -19,9 +19,9 @@ #include #include -#include -#include - +// #include +// #include +#define THCudaCheck(x) C10_CUDA_CHECK(x) #include #include diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/box_encoder_cuda.cu b/cv/detection/ssd/pytorch/csrc_pt2/box_encoder_cuda.cu similarity index 99% rename from cv/detection/ssd/pytorch/nvidia/csrc/box_encoder_cuda.cu rename to cv/detection/ssd/pytorch/csrc_pt2/box_encoder_cuda.cu index e9b311fab..641af7ab3 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/box_encoder_cuda.cu +++ b/cv/detection/ssd/pytorch/csrc_pt2/box_encoder_cuda.cu @@ -19,14 +19,14 @@ #include #include -#include -#include +// #include +// #include #include #include #include - +#define THCudaCheck(x) C10_CUDA_CHECK(x) //#define DEBUG // calculate the IoU of a single box against another box diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/interface.cpp b/cv/detection/ssd/pytorch/csrc_pt2/interface.cpp similarity index 100% rename from cv/detection/ssd/pytorch/nvidia/csrc/interface.cpp rename to cv/detection/ssd/pytorch/csrc_pt2/interface.cpp diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Descriptors.cpp b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/Descriptors.cpp similarity index 99% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Descriptors.cpp rename to 
cv/detection/ssd/pytorch/csrc_pt2/nhwc/Descriptors.cpp index afe0466c9..f3143f923 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/Descriptors.cpp +++ b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/Descriptors.cpp @@ -20,7 +20,7 @@ #include "Descriptors.h" #include -#include + #include #include #include diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Descriptors.h b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/Descriptors.h similarity index 99% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Descriptors.h rename to cv/detection/ssd/pytorch/csrc_pt2/nhwc/Descriptors.h index c8556f293..280d069a7 100644 --- a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Descriptors.h +++ b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/Descriptors.h @@ -21,6 +21,7 @@ #include "Exceptions.h" +#include #include #include #include @@ -29,6 +30,8 @@ #if !defined(TORCH_CUDA_API) && defined(AT_CUDA_API) #define TORCH_CUDA_API AT_CUDA_API +#else +#define TORCH_CUDA_API #endif namespace at { namespace native { namespace nhwc { diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Exceptions.h b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/Exceptions.h similarity index 100% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/Exceptions.h rename to cv/detection/ssd/pytorch/csrc_pt2/nhwc/Exceptions.h diff --git a/cv/detection/ssd/pytorch/nvidia/csrc/nhwc/ParamsHash.h b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/ParamsHash.h similarity index 100% rename from cv/detection/ssd/pytorch/nvidia/csrc/nhwc/ParamsHash.h rename to cv/detection/ssd/pytorch/csrc_pt2/nhwc/ParamsHash.h diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/batch_norm.cu b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/batch_norm.cu similarity index 96% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/batch_norm.cu rename to cv/detection/ssd/pytorch/csrc_pt2/nhwc/batch_norm.cu index b1e88a47d..8294132bc 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/batch_norm.cu +++ b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/batch_norm.cu @@ -20,13 +20,13 @@ #include #include #include -#include +// #include #include #include -#include "THC/THC.h" - +// #include "THC/THC.h" +#include #include "Descriptors.h" #include @@ -41,19 +41,23 @@ const float BN_MIN_EPSILON = 1e-4; // tensor instead. struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size); + // data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + dataPtr = allocator.allocate(size); + data = dataPtr.get(); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; - ~Workspace() { - if (data) { - THCudaFree(at::globalContext().lazyInitCUDA(), data); - } - } - + // ~Workspace() { + // if (data) { + // THCudaFree(at::globalContext().lazyInitCUDA(), data); + // } + // } + ~Workspace() = default; size_t size; void* data; + c10::DataPtr dataPtr; }; // Return {y. 
save_mean, save_var, reserve} diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/conv.cpp b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/conv.cpp similarity index 98% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/conv.cpp rename to cv/detection/ssd/pytorch/csrc_pt2/nhwc/conv.cpp index d8c975570..b3b010a63 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/conv.cpp +++ b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/conv.cpp @@ -23,8 +23,8 @@ #include #include -#include "THC/THC.h" - +// #include "THC/THC.h" +#include #include #include "Descriptors.h" // #include @@ -41,7 +41,7 @@ #include #include #include - +#define THCudaCheck(x) C10_CUDA_CHECK(x) namespace at { namespace native { namespace nhwc { // TODO: Go through all the checking code again and make sure @@ -358,19 +358,23 @@ BenchmarkCache bwd_filter_algos; // tensor instead. struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - data = THCudaMalloc(globalContext().lazyInitCUDA(), size); + // data = THCudaMalloc(globalContext().lazyInitCUDA(), size); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + dataPtr = allocator.allocate(size); + data = dataPtr.get(); } Workspace(const Workspace&) = delete; Workspace(Workspace&&) = default; Workspace& operator=(Workspace&&) = default; - ~Workspace() { - if (data) { - THCudaFree(globalContext().lazyInitCUDA(), data); - } - } - + // ~Workspace() { + // if (data) { + // THCudaFree(globalContext().lazyInitCUDA(), data); + // } + // } + ~Workspace() = default; size_t size; void* data; + c10::DataPtr dataPtr; }; template @@ -429,7 +433,7 @@ size_t getMaxWorkspaceSize( int device; THCudaCheck(cudaGetDevice(&device)); - c10::cuda::CUDACachingAllocator::cacheInfo(device, &tmp_bytes, &max_block_size); + c10::cuda::CUDACachingAllocator::cacheInfo(device, &max_block_size); for (int i = 0; i < n_algo; i++) { cudnnStatus_t err; diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/max_pool.cu b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/max_pool.cu similarity index 99% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/max_pool.cu rename to cv/detection/ssd/pytorch/csrc_pt2/nhwc/max_pool.cu index 5bd12266b..a4b39f1c4 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/nhwc/max_pool.cu +++ b/cv/detection/ssd/pytorch/csrc_pt2/nhwc/max_pool.cu @@ -20,7 +20,7 @@ #include #include #include -#include +// #include #include "Descriptors.h" diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/nms.cu b/cv/detection/ssd/pytorch/csrc_pt2/nms.cu similarity index 99% rename from cv/detection/ssd/pytorch/iluvatar/csrc/nms.cu rename to cv/detection/ssd/pytorch/csrc_pt2/nms.cu index 3cf8bac4b..930af4313 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/nms.cu +++ b/cv/detection/ssd/pytorch/csrc_pt2/nms.cu @@ -19,13 +19,14 @@ #include #include -#include -#include +// #include +// #include #include #include #include +#define THCudaCheck(x) C10_CUDA_CHECK(x) namespace nms_internal { diff --git a/cv/detection/ssd/pytorch/iluvatar/csrc/random_horiz_flip.cu b/cv/detection/ssd/pytorch/csrc_pt2/random_horiz_flip.cu similarity index 97% rename from cv/detection/ssd/pytorch/iluvatar/csrc/random_horiz_flip.cu rename to cv/detection/ssd/pytorch/csrc_pt2/random_horiz_flip.cu index ff906e1f6..cb43a49b4 100644 --- a/cv/detection/ssd/pytorch/iluvatar/csrc/random_horiz_flip.cu +++ b/cv/detection/ssd/pytorch/csrc_pt2/random_horiz_flip.cu @@ -19,9 +19,9 @@ #include #include -#include -#include - +// #include +// #include +#define THCudaCheck(x) C10_CUDA_CHECK(x) #include #include @@ -141,7 +141,7 
@@ std::vector random_horiz_flip( auto stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - img.type(), + img.scalar_type(), "HorizFlipImagesAndBoxes", [&] { HorizFlipImagesAndBoxes<<>>( diff --git a/cv/detection/ssd/pytorch/base/dataloaders/build_pipeline.py b/cv/detection/ssd/pytorch/data/build_pipeline.py similarity index 58% rename from cv/detection/ssd/pytorch/base/dataloaders/build_pipeline.py rename to cv/detection/ssd/pytorch/data/build_pipeline.py index 17ae06e71..1f4e1b0ee 100644 --- a/cv/detection/ssd/pytorch/base/dataloaders/build_pipeline.py +++ b/cv/detection/ssd/pytorch/data/build_pipeline.py @@ -15,18 +15,20 @@ import torch from .native_pipeline import build_native_pipeline -from .input_iterators import ConvertDaliInputIterator +from .input_iterators import ConvertDaliInputIterator, RateMatcher, FakeInputIterator +from mlperf_logger import log_event +from mlperf_logging.mllog import constants """ Build a train pipe for training (without touching the data) returns train_pipe """ -def prebuild_pipeline(config): - if config.dali: +def prebuild_pipeline(args): + if args.dali: from .dali_pipeline import prebuild_dali_pipeline - return prebuild_dali_pipeline(config) + return prebuild_dali_pipeline(args) else: return None @@ -36,19 +38,28 @@ Build a data pipeline for either training or eval Training : returns loader, epoch_size Eval : returns loader, inv_class_map, cocoGt """ -def build_pipeline(config, training=True, pipe=None): +def build_pipeline(args, training=True, pipe=None): # Handle training / testing differently due to different # outputs. But still want to do this to abstract out the # use of EncodingInputIterator and RateMatcher if training: - if config.dali: + if args.dali: from .dali_pipeline import build_dali_pipeline - train_loader, epoch_size = build_dali_pipeline(config, training=True, pipe=pipe) - train_sampler = None - train_loader = ConvertDaliInputIterator(train_loader) + builder_fn = build_dali_pipeline else: - train_loader, epoch_size, train_sampler = build_native_pipeline(config, training=True, pipe=pipe) - return train_loader, epoch_size, train_sampler - else: - return build_native_pipeline(config, training=False) + builder_fn = build_native_pipeline + train_loader, epoch_size = builder_fn(args, training=True, pipe=pipe) + log_event(key=constants.TRAIN_SAMPLES, value=epoch_size) + + if args.dali: + train_loader = ConvertDaliInputIterator(train_loader) + if args.fake_input: + train_loader = FakeInputIterator(train_loader, epoch_size, args.N_gpu) + + if args.input_batch_multiplier > 1: + train_loader = RateMatcher(input_it=train_loader, output_size=args.batch_size) + + return train_loader, epoch_size + else: + return build_native_pipeline(args, training=False) diff --git a/cv/detection/ssd/pytorch/base/dataloaders/dali_iterator.py b/cv/detection/ssd/pytorch/data/dali_iterator.py similarity index 94% rename from cv/detection/ssd/pytorch/base/dataloaders/dali_iterator.py rename to cv/detection/ssd/pytorch/data/dali_iterator.py index 42ba88166..266ebda80 100644 --- a/cv/detection/ssd/pytorch/base/dataloaders/dali_iterator.py +++ b/cv/detection/ssd/pytorch/data/dali_iterator.py @@ -27,10 +27,12 @@ from nvidia.dali.pipeline import Pipeline import nvidia.dali.ops as ops import nvidia.dali.types as types +import time + # Defines the pipeline for a single GPU for _training_ class COCOPipeline(Pipeline): - def __init__(self, batch_size, device_id, file_root, annotations_file, num_gpus, - anchors_ltrb_list, meta_files_path=None, + def 
__init__(self, batch_size, device_id, file_root, meta_files_path, annotations_file, num_gpus, + anchors_ltrb_list, output_fp16=False, output_nhwc=False, pad_output=False, num_threads=1, seed=15, dali_cache=-1, dali_async=True, use_nvjpeg=False): @@ -40,10 +42,15 @@ class COCOPipeline(Pipeline): exec_async=dali_async) self.use_nvjpeg = use_nvjpeg - try: - shard_id = torch.distributed.get_rank() + #try: + #shard_id = torch.distributed.get_rank() # Note: <= 19.05 was a RuntimeError, 19.06 is now throwing AssertionError - except (RuntimeError, AssertionError): + #except (RuntimeError, AssertionError): + #shard_id = 0 + import torch.distributed as dist + if dist.is_available() and dist.is_initialized(): + shard_id = dist.get_rank() + else: shard_id = 0 if meta_files_path == None: diff --git a/cv/detection/ssd/pytorch/base/dataloaders/dali_pipeline.py b/cv/detection/ssd/pytorch/data/dali_pipeline.py similarity index 42% rename from cv/detection/ssd/pytorch/base/dataloaders/dali_pipeline.py rename to cv/detection/ssd/pytorch/data/dali_pipeline.py index 6d19450d8..b5e5cc26c 100644 --- a/cv/detection/ssd/pytorch/base/dataloaders/dali_pipeline.py +++ b/cv/detection/ssd/pytorch/data/dali_pipeline.py @@ -7,32 +7,27 @@ from box_coder import dboxes300_coco anchors_ltrb_list = dboxes300_coco()("ltrb").numpy().flatten().tolist() -def prebuild_dali_pipeline(config): - """ - Equivalent to SSD Transformer. - :param config: configuration - :return: Dali Pipeline - """ - train_annotate = os.path.join(config.data_dir, "annotations/bbox_only_instances_train2017.json") - train_coco_root = os.path.join(config.data_dir, "train2017") - pipe = COCOPipeline(config.train_batch_size, - config.local_rank, train_coco_root, - train_annotate, config.n_gpu, +def prebuild_dali_pipeline(args): + train_annotate = os.path.join(os.path.dirname(__file__), "../../../../../bbox_only_instances_train2017.json") + train_coco_root = os.path.join(args.data, "train2017") + pipe = COCOPipeline(args.batch_size * args.input_batch_multiplier, + args.local_rank, train_coco_root, + args.meta_files_path, train_annotate, args.N_gpu, anchors_ltrb_list, - num_threads=config.num_workers, - output_fp16=config.fp16, output_nhwc=config.nhwc, - pad_output=config.pad_input, seed=config.local_seed - 2**31, - use_nvjpeg=config.use_nvjpeg, - dali_cache=config.dali_cache, - dali_async=(not config.dali_sync)) + num_threads=args.num_workers, + output_fp16=args.use_fp16, output_nhwc=args.nhwc, + pad_output=args.pad_input, seed=args.local_seed - 2**31, + use_nvjpeg=args.use_nvjpeg, + dali_cache=args.dali_cache, + dali_async=(not args.dali_sync)) pipe.build() return pipe -def build_dali_pipeline(config, training=True, pipe=None): +def build_dali_pipeline(args, training=True, pipe=None): # pipe is prebuilt without touching the data from nvidia.dali.plugin.pytorch import DALIGenericIterator train_loader = DALIGenericIterator(pipelines=[pipe], output_map= ['image', 'bbox', 'label'], - size=pipe.epoch_size()['train_reader'] // config.n_gpu, + size=pipe.epoch_size()['train_reader'] // args.N_gpu, auto_reset=True) return train_loader, pipe.epoch_size()['train_reader'] diff --git a/cv/detection/ssd/pytorch/base/dataloaders/input_iterators.py b/cv/detection/ssd/pytorch/data/input_iterators.py similarity index 99% rename from cv/detection/ssd/pytorch/base/dataloaders/input_iterators.py rename to cv/detection/ssd/pytorch/data/input_iterators.py index fda527e2b..59d86fc81 100644 --- a/cv/detection/ssd/pytorch/base/dataloaders/input_iterators.py +++ 
b/cv/detection/ssd/pytorch/data/input_iterators.py @@ -12,6 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch +from SSD import _C as C + + class ConvertDaliInputIterator(object): def __init__(self, dali_it): self._dali_it = dali_it diff --git a/cv/detection/ssd/pytorch/data/native_pipeline.py b/cv/detection/ssd/pytorch/data/native_pipeline.py new file mode 100644 index 000000000..ebb7b5a30 --- /dev/null +++ b/cv/detection/ssd/pytorch/data/native_pipeline.py @@ -0,0 +1,165 @@ +import torch +import os + +from functools import partial + +from torch.utils.data import DataLoader +from mlperf_logger import log_event +from mlperf_logging.mllog import constants + +from utils import COCODetection, SSDCropping, SSDTransformer, SSDTransformerNoDali +from box_coder import dboxes300_coco +from .sampler import GeneralDistributedSampler + +from pycocotools.coco import COCO +import numpy as np + + +def SSDCollator(batch, is_training=False): + # batch is: [image (300x300) Tensor, image_id, (htot, wtot), bboxes (8732, 4) Tensor, labels (8732) Tensor] + images = [] + image_ids = [] + image_sizes = [] + bboxes = [] + bbox_offsets = [0] + labels = [] + + for item in batch: + images.append(item[0].view(1, *item[0].shape)) + image_ids.append(item[1]) + image_sizes.append(item[2]) + bboxes.append(item[3]) + labels.append(item[4]) + + bbox_offsets.append(bbox_offsets[-1] + item[3].shape[0]) + + images = torch.cat(images) + bbox_offsets = np.array(bbox_offsets).astype(np.int32) + + if is_training: + return [images, torch.cat(bboxes), torch.cat(labels), torch.tensor(bbox_offsets)] + else: + return [images, torch.tensor(image_ids), image_sizes, torch.cat(bboxes), torch.cat(labels), torch.tensor(bbox_offsets)] + + +def SSDCollatorNoDali(batch, is_training=False): + # batch is: [image (300x300) Tensor, image_id, (htot, wtot), bboxes (8732, 4) Tensor, labels (8732) Tensor] + images = [] + image_ids = [] + image_sizes = [] + bboxes = [] + bbox_offsets = [0] + labels = [] + + for img, img_id, img_size, bbox, label in batch: + images.append(img.view(1, *img.shape)) + image_ids.append(img_id) + image_sizes.append(img_size) + bboxes.append(bbox) + labels.append(label) + bbox_offsets.append(bbox_offsets[-1] + bbox.shape[0]) + + images = torch.cat(images) + N = images.shape[0] + bboxes = torch.cat(bboxes).view(N, -1, 4) + labels = torch.cat(labels).view(N, -1) + if is_training: + res = [images, bboxes, labels] + else: + res = [images, torch.tensor(image_ids), image_sizes, torch.cat(bboxes), torch.cat(labels), torch.tensor(bbox_offsets)] + return res + + +def generate_mean_std(args): + mean_val = [0.485, 0.456, 0.406] + std_val = [0.229, 0.224, 0.225] + + if args.pad_input: + mean_val.append(0.) + std_val.append(1.) 
+ mean = torch.tensor(mean_val).cuda() + std = torch.tensor(std_val).cuda() + + if args.nhwc: + view = [1, 1, 1, len(mean_val)] + else: + view = [1, len(mean_val), 1, 1] + + mean = mean.view(*view) + std = std.view(*view) + + if args.use_fp16: + mean = mean.half() + std = std.half() + + return mean, std + +def build_train_pipe(args): + if args.dali: + train_annotate = os.path.join(os.path.dirname(__file__), "../../../../../bbox_only_instances_train2017.json") + else: + train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") + train_coco_root = os.path.join(args.data, "train2017") + + input_size = args.input_size + if args.dali: + train_trans = SSDTransformer((input_size, input_size), val=False) + else: + dboxes = dboxes300_coco() + train_trans = SSDTransformerNoDali(dboxes, (input_size, input_size), val=False) + train_coco = COCODetection(train_coco_root, train_annotate, train_trans) + + if args.distributed: + train_sampler = GeneralDistributedSampler(train_coco, pad=args.pad_input) + else: + train_sampler = None + + if args.dali: + train_loader = DataLoader(train_coco, + batch_size=args.batch_size*args.input_batch_multiplier, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + collate_fn=partial(SSDCollator, is_training=True)) + else: + train_loader = DataLoader(train_coco, + batch_size=args.batch_size, + shuffle=(train_sampler is None), + sampler=train_sampler, + num_workers=args.num_workers, + collate_fn=partial(SSDCollatorNoDali, is_training=True) + ) + return train_loader, len(train_loader) + + +def build_eval_pipe(args): + # Paths + val_annotate = os.path.join(os.path.dirname(__file__), "../../../../../bbox_only_instances_val2017.json") + val_coco_root = os.path.join(args.data, "val2017") + + input_size = args.input_size + val_trans = SSDTransformer((input_size, input_size), val=True) + cocoGt = COCO(annotation_file=val_annotate) + val_coco = COCODetection(val_coco_root, val_annotate, val_trans, cocoGt.dataset) + log_event(key=constants.EVAL_SAMPLES, value=len(val_coco)) + + if args.distributed: + val_sampler = GeneralDistributedSampler(val_coco, pad=args.pad_input) + else: + val_sampler = None + + val_dataloader = DataLoader(val_coco, + batch_size=args.eval_batch_size, + shuffle=False, # Note: distributed sampler is shuffled :( + sampler=val_sampler, + num_workers=args.num_workers) + + inv_map = {v:k for k,v in val_coco.label_map.items()} + + return val_dataloader, inv_map, cocoGt + +def build_native_pipeline(args, training=True, pipe=None): + if training: + return build_train_pipe(args) + else: + return build_eval_pipe(args) diff --git a/cv/detection/ssd/pytorch/base/dataloaders/prefetcher.py b/cv/detection/ssd/pytorch/data/prefetcher.py similarity index 100% rename from cv/detection/ssd/pytorch/base/dataloaders/prefetcher.py rename to cv/detection/ssd/pytorch/data/prefetcher.py diff --git a/cv/detection/ssd/pytorch/base/dataloaders/sampler.py b/cv/detection/ssd/pytorch/data/sampler.py similarity index 100% rename from cv/detection/ssd/pytorch/base/dataloaders/sampler.py rename to cv/detection/ssd/pytorch/data/sampler.py diff --git a/cv/detection/ssd/pytorch/default/config/config_V100x1x1.py b/cv/detection/ssd/pytorch/default/config/config_V100x1x1.py deleted file mode 100644 index 11b12e1df..000000000 --- a/cv/detection/ssd/pytorch/default/config/config_V100x1x1.py +++ /dev/null @@ -1,32 +0,0 @@ -from training_event import DefaultTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = False 
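A hedged sketch of the intended call pattern for the pipeline builders above; `args` is assumed to be the namespace produced by parse_config.parse_args(), and the return shapes follow build_pipeline() and build_eval_pipe() as defined in this patch.

    from data.build_pipeline import prebuild_pipeline, build_pipeline

    pipe = prebuild_pipeline(args)                 # a prebuilt DALI pipe, or None when args.dali is false
    train_loader, epoch_size = build_pipeline(args, training=True, pipe=pipe)
    val_loader, inv_map, cocoGt = build_pipeline(args, training=False)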
-dist_backend = "nccl" - -save_checkpoint = False - -seed = 42 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [40, 50] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = False -delay_allreduce = False - - -training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/default/config/config_V100x1x8.py b/cv/detection/ssd/pytorch/default/config/config_V100x1x8.py deleted file mode 100644 index 3932faacf..000000000 --- a/cv/detection/ssd/pytorch/default/config/config_V100x1x8.py +++ /dev/null @@ -1,31 +0,0 @@ -from training_event import DefaultTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 128 -eval_batch_size = 160 -learning_rate = 2.5e-3 -weight_decay_rate = 5e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [40, 50] -warmup = 300 -warmup_factor = 0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = False -delay_allreduce = False - - -training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/default/config/config_nv_V100x1x8.py b/cv/detection/ssd/pytorch/default/config/config_nv_V100x1x8.py deleted file mode 100644 index 80d62b686..000000000 --- a/cv/detection/ssd/pytorch/default/config/config_nv_V100x1x8.py +++ /dev/null @@ -1,32 +0,0 @@ -from training_event import DefaultTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. 
Optimizer Configurations -num_workers = 4 -fp16 = False -delay_allreduce = True - - -training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/default/config/training_event.py b/cv/detection/ssd/pytorch/default/config/training_event.py deleted file mode 100644 index 0c14b73e0..000000000 --- a/cv/detection/ssd/pytorch/default/config/training_event.py +++ /dev/null @@ -1,79 +0,0 @@ -from typing import Tuple - -import torch -import torch.distributed as dist -from torch.cuda.amp import GradScaler -from torch.cuda.amp import autocast -from torch.nn.parallel import DistributedDataParallel as NativeDDP -from torch.optim import Optimizer - -from optimizers import create_optimizer -from train.event.base import BaseTrainingEventInterface, SSD_MODEL -from train.training_state import TrainingState - - -class DefaultTrainingEvent(BaseTrainingEventInterface): - - def __init__(self, config): - super(DefaultTrainingEvent, self).__init__(config) - self.model = None - self.optimizer = None - self.autocast_ctx = None - - def save_checkpoint(self, path: str, training_state: TrainingState): - torch.save({ - "model": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - "epoch": training_state.epoch, - "iter_num": training_state.iter_num, - }, "{}/epoch{}_{}.pt".format(path, training_state.epoch, round(training_state.eval_ap, 5))) - - def load_checkpoint(self, checkpoint): - self.model.load_state_dict(checkpoint["model"], strict=True) - self.optimizer.load_state_dict(checkpoint["optimizer"]) - self.config.iteration = checkpoint["iter_num"] - self.config.epoch = checkpoint["epoch"] - - def create_optimizer(self, model: SSD_MODEL) -> Optimizer: - config = self.config - current_momentum = 0.9 - current_lr = config.learning_rate * (config.train_batch_size * config.n_gpu / 32) - self.optimizer = torch.optim.SGD(model.parameters(), lr=current_lr, - momentum=current_momentum, - weight_decay=config.weight_decay_rate) - return self.optimizer - - def model_to_fp16(self, model: SSD_MODEL, optimizer: Optimizer) -> Tuple[SSD_MODEL, Optimizer]: - self.model = model - self.optimizer = optimizer - return self.model, self.optimizer - - def model_to_ddp(self, model: SSD_MODEL) -> SSD_MODEL: - if self.config.distributed: - self.model = NativeDDP(model, - device_ids=[self.config.local_rank]) - else: - self.model = model - return self.model - - def on_step_begin(self, step: int): - self.autocast_ctx = autocast(self.config.fp16) - self.autocast_ctx.__enter__() - - def on_backward(self, step: int, loss: torch.Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): - self.autocast_ctx.__exit__(None, None, None) - - scaled_loss = grad_scaler.scale(loss) - scaled_loss.backward() - update_step = step % self.config.gradient_accumulation_steps == 0 - if update_step: - self.update_model_params(optimizer, grad_scaler) - - def update_model_params(self, optimizer: Optimizer, grad_scaler: GradScaler=None): - grad_scaler.step(optimizer) - grad_scaler.update() - - for param in self.model.parameters(): - param.grad = None - - diff --git a/cv/detection/ssd/pytorch/download_dataset.sh b/cv/detection/ssd/pytorch/download_dataset.sh new file mode 100644 index 000000000..b3f96d04f --- /dev/null +++ b/cv/detection/ssd/pytorch/download_dataset.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Get COCO 2017 data sets +dir=$(pwd) +mkdir /coco; cd /coco +curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip +curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip +curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip +${dir}/prepare-json.py --keep-keys annotations/instances_val2017.json annotations/bbox_only_instances_val2017.json +${dir}/prepare-json.py annotations/instances_train2017.json annotations/bbox_only_instances_train2017.json +cd $dir diff --git a/cv/detection/ssd/pytorch/eval.py b/cv/detection/ssd/pytorch/eval.py new file mode 100644 index 000000000..a4c459163 --- /dev/null +++ b/cv/detection/ssd/pytorch/eval.py @@ -0,0 +1,258 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
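A hedged sketch of the asynchronous-evaluation pattern that coco_eval()/run_eval() below rely on; AsyncEvaluator's semantics are assumed only from that usage, and `epoch`, `final_results`, `cocoGt`, `local_rank`, `threshold`, and `lr` are placeholders here.

    from async_evaluator import AsyncEvaluator

    evaluator = AsyncEvaluator(num_threads=1)
    # rank 0 hands the finished detections to a worker thread, as done below
    evaluator.submit_task(epoch, evaluate_coco, epoch, final_results, cocoGt,
                          local_rank, threshold, lr)
    ap = evaluator.task_result(epoch)              # later retrieve the mAP submitted under that tag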
+ +import os +import torch +import time +import numpy as np +import io + +from ssd300 import SSD300 +from box_coder import dboxes300_coco, build_ssd300_coder +from parse_config import parse_args, validate_arguments, validate_group_bn +from data.build_pipeline import build_pipeline +from data.prefetcher import eval_prefetcher +from async_evaluator import AsyncEvaluator + +import sys + +# necessary pytorch imports +import torch.utils.data.distributed +import torch.distributed as dist + +# Apex imports +try: + import apex_C + import apex + from apex.parallel import DistributedDataParallel as DDP + from apex.fp16_utils import * + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C +except ImportError: + raise ImportError("Please install APEX from https://github.com/nvidia/apex") + +from SSD import _C as C + +def print_message(rank, *print_args): + if rank == 0: + print(*print_args) + +""" +Take results and produce mAP on COCO + +Intended to be used with an async evaluator, and run on a single +node -- calling code is responsible for that delegation +""" +def evaluate_coco(epoch, final_results, cocoGt, local_rank, threshold, lr): + from pycocotools.cocoeval import COCOeval + cocoDt = cocoGt.loadRes(final_results) + + E = COCOeval(cocoGt, cocoDt, iouType='bbox') + E.evaluate() + E.accumulate() + E.summarize() + print("Epoch: {}, Current AP: {:.5f} AP goal: {:.5f}, lr: {}".format(epoch, E.stats[0], threshold, lr)) + sys.stdout.flush() + + return E.stats[0] + +def coco_eval(args, model, coco, cocoGt, encoder, inv_map, epoch, iteration, lr, evaluator=None): + from pycocotools.cocoeval import COCOeval + model.eval() + threshold = args.threshold + batch_size = args.eval_batch_size + use_fp16 = args.use_fp16 + local_rank = args.local_rank + N_gpu = args.N_gpu + use_nhwc = args.nhwc + pad_input = args.pad_input + distributed = args.distributed + + ret = [] + overlap_threshold = 0.50 + nms_max_detections = 200 + start = time.time() + + # Wrap dataloader for prefetching + coco = eval_prefetcher(iter(coco), + torch.cuda.current_device(), + args.pad_input, + args.nhwc, + args.use_fp16) + + for nbatch, (img, img_id, img_size) in enumerate(coco): + with torch.no_grad(): + # Get predictions + ploc, plabel = model(img) + ploc, plabel = ploc.float(), plabel.float() + + # Handle the batch of predictions produced + # This is slow, but consistent with old implementation. 
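The distributed branch later in this function gathers a variable number of detections per rank by padding each rank's result to the global maximum row count before all_gather and stripping the padding afterwards. A self-contained sketch of that pattern, with illustrative names; `local_rows` is assumed to be a 2-D float tensor on the GPU (e.g. the Nx7 detection array built below).

    import torch
    import torch.distributed as dist

    def gather_variable_rows(local_rows, n_gpu):
        # Exchange row counts, pad every rank to the largest count, all_gather,
        # then slice the padding off so only real detections remain.
        sizes = [torch.tensor(0).cuda() for _ in range(n_gpu)]
        dist.all_gather(sizes, torch.tensor(local_rows.shape[0]).cuda())
        sizes = [int(s.item()) for s in sizes]
        max_size = max(sizes)
        pad = torch.zeros(max_size - local_rows.shape[0], local_rows.shape[1],
                          dtype=local_rows.dtype, device=local_rows.device)
        gathered = [torch.zeros(max_size, local_rows.shape[1],
                                dtype=local_rows.dtype, device=local_rows.device)
                    for _ in range(n_gpu)]
        dist.all_gather(gathered, torch.cat([local_rows, pad]))
        return torch.cat([g[:n] for g, n in zip(gathered, sizes)])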
+ for idx in range(ploc.shape[0]): + # ease-of-use for specific predictions + ploc_i = ploc[idx, :, :].unsqueeze(0) + plabel_i = plabel[idx, :, :].unsqueeze(0) + + result = encoder.decode_batch(ploc_i, plabel_i, overlap_threshold, nms_max_detections)[0] + + htot, wtot = img_size[0][idx].item(), img_size[1][idx].item() + loc, label, prob = [r.cpu().numpy() for r in result] + for loc_, label_, prob_ in zip(loc, label, prob): + ret.append([img_id[idx], loc_[0]*wtot, \ + loc_[1]*htot, + (loc_[2] - loc_[0])*wtot, + (loc_[3] - loc_[1])*htot, + prob_, + inv_map[label_]]) + + # Now we have all predictions from this rank, gather them all together + # if necessary + ret = np.array(ret).astype(np.float32) + + # Multi-GPU eval + if distributed: + # NCCL backend means we can only operate on GPU tensors + ret_copy = torch.tensor(ret).cuda() + + # Everyone exchanges the size of their results + ret_sizes = [torch.tensor(0).cuda() for _ in range(N_gpu)] + torch.distributed.all_gather(ret_sizes, torch.tensor(ret_copy.shape[0]).cuda()) + + # Get the maximum results size, as all tensors must be the same shape for + # the all_gather call we need to make + max_size = 0 + sizes = [] + for s in ret_sizes: + max_size = max(max_size, s.item()) + sizes.append(s.item()) + + # Need to pad my output to max_size in order to use in all_gather + ret_pad = torch.cat([ret_copy, torch.zeros(max_size-ret_copy.shape[0], 7, dtype=torch.float32).cuda()]) + + # allocate storage for results from all other processes + other_ret = [torch.zeros(max_size, 7, dtype=torch.float32).cuda() for i in range(N_gpu)] + # Everyone exchanges (padded) results + torch.distributed.all_gather(other_ret, ret_pad) + + # Now need to reconstruct the _actual_ results from the padded set using slices. + cat_tensors = [] + for i in range(N_gpu): + cat_tensors.append(other_ret[i][:sizes[i]][:]) + + final_results = torch.cat(cat_tensors).cpu().numpy() + else: + # Otherwise full results are just our results + final_results = ret + + print_message(args.rank, "Predicting Ended, total time: {:.2f} s".format(time.time()-start)) + + # All results are assembled -- if rank == 0 start async evaluation (if enabled) + if args.rank == 0 and (evaluator is not None): + evaluator.submit_task(epoch, evaluate_coco, epoch, final_results, cocoGt, local_rank, threshold, lr) + + model.train() + return + + +def load_checkpoint(model, checkpoint): + print("loading model checkpoint", checkpoint) + od = torch.load(checkpoint) + + # remove proceeding 'module' from checkpoint + saved_model = od["model"] + for k in list(saved_model.keys()): + if k.startswith('module.'): + saved_model[k[7:]] = saved_model.pop(k) + model.load_state_dict(saved_model) + +def setup_distributed(args): + # Setup multi-GPU if necessary + args.distributed = False + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) > 1 + + if args.distributed: + torch.cuda.set_device(args.local_rank) + torch.distributed.init_process_group(backend='nccl', + init_method='env://') + args.local_seed = 0 # set_seeds(args) + # start timing here + if args.distributed: + args.N_gpu = torch.distributed.get_world_size() + args.rank = torch.distributed.get_rank() + else: + args.N_gpu = 1 + args.rank = 0 + + validate_group_bn(args.bn_group) + + return args + +# setup everything (model, etc) to run eval +def run_eval(args, lr): + args = setup_distributed(args) + + from pycocotools.coco import COCO + + local_seed = args.local_seed + + encoder = build_ssd300_coder() + + val_annotate = os.path.join(args.data, 
"annotations/instances_val2017.json") + val_coco_root = os.path.join(args.data, "val2017") + + cocoGt = COCO(annotation_file=val_annotate) + + val_loader, inv_map = build_pipeline(args, training=False) + + model_options = { + 'use_nhwc' : args.nhwc, + 'pad_input' : args.pad_input, + 'bn_group' : args.bn_group, + 'pretrained' : False, + } + + ssd300_eval = SSD300(args, args.num_classes, **model_options).cuda() + if args.use_fp16: + convert_network(ssd300_eval, torch.half) + ssd300_eval.eval() + + if args.checkpoint is not None: + load_checkpoint(ssd300_eval, args.checkpoint) + + evaluator = AsyncEvaluator(num_threads=1) + + coco_eval(args, + ssd300_eval, + val_loader, + cocoGt, + encoder, + inv_map, + 0, # epoch + 0, # iter_num + lr, + evaluator=evaluator) + + res = evaluator.task_result(0) + +if __name__ == "__main__": + args = parse_args() + validate_arguments(args) + + torch.backends.cudnn.benchmark = True + torch.set_num_threads(1) + lr = 0.001 + run_eval(args, lr) + + diff --git a/cv/detection/ssd/pytorch/base/fused_color_jitter.py b/cv/detection/ssd/pytorch/fused_color_jitter.py similarity index 100% rename from cv/detection/ssd/pytorch/base/fused_color_jitter.py rename to cv/detection/ssd/pytorch/fused_color_jitter.py diff --git a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x1.py b/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x1.py deleted file mode 100644 index 714de8849..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x1.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = False -dist_backend = "nccl" - -save_checkpoint = False - -seed = 1769250163 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 160 -eval_batch_size = 160 -learning_rate = 5.2e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.2 -lr_decay_epochs = [34, 45] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = False -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8.py b/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8.py deleted file mode 100644 index 281fd970f..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. 
Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = False -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_2.1.0.py b/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_2.1.0.py deleted file mode 100644 index b5dd65e1b..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_2.1.0.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 1769250163 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 160 -eval_batch_size = 160 -learning_rate = 5.2e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.2 -lr_decay_epochs = [34, 45] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = False -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_wsl.py b/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_wsl.py deleted file mode 100644 index fb1638a61..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/config_V100x1x8_wsl.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 2651384829 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 200 -eval_batch_size = 200 -learning_rate = 5.15e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.2 -lr_decay_epochs = [34, 45] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = False -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/config_nodali_V100x1x8.py b/cv/detection/ssd/pytorch/iluvatar/config/config_nodali_V100x1x8.py deleted file mode 100644 index 279669912..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/config_nodali_V100x1x8.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. 
Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = False -dali = False -dali_sync = False -dali_cache = 0 -nhwc = False -pad_input = False -jit = False -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/config_nv_V100x1x8.py b/cv/detection/ssd/pytorch/iluvatar/config/config_nv_V100x1x8.py deleted file mode 100644 index 281fd970f..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/config_nv_V100x1x8.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = False -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/converter.py b/cv/detection/ssd/pytorch/iluvatar/config/converter.py deleted file mode 100644 index e61209dcb..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/converter.py +++ /dev/null @@ -1,15 +0,0 @@ -import copy -import math -from torch.utils import checkpoint - -from ssd300 import SSD300 - - -def convert_model(model, config): - model_options = { - 'use_nhwc': config.nhwc, - 'pad_input': config.pad_input, - 'bn_group': config.bn_group, - } - model = SSD300(config, config.num_classes, **model_options).cuda() - return model diff --git a/cv/detection/ssd/pytorch/iluvatar/config/environment_variables.sh b/cv/detection/ssd/pytorch/iluvatar/config/environment_variables.sh deleted file mode 100644 index e28aba1a3..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/environment_variables.sh +++ /dev/null @@ -1,10 +0,0 @@ -# ================================================= -# Export variables -# ================================================= - -export CONTAINER_MOUNTS="--gpus all" -NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --expt-relaxed-constexpr -ftemplate-depth=1024 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80" -if [[ "$PYTORCH_BUILD_VERSION" == 1.8* ]]; then - NVCC_ARGUMENTS="${NVCC_ARGUMENTS} -D_PYTORCH18" -fi -export NVCC_ARGUMENTS \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/conv.py b/cv/detection/ssd/pytorch/iluvatar/config/nhwc/conv.py deleted file mode 100644 index 1bdddc757..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/conv.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch.nn.modules.conv import _ConvNd - -import SSD._C as C - -import collections -from itertools import repeat - - -def _ntuple(n): - def parse(x): - if isinstance(x, collections.Iterable): - return x - return tuple(repeat(x, n)) - return parse - -_single = _ntuple(1) -_pair = _ntuple(2) -_triple = _ntuple(3) -_quadruple = _ntuple(4) - -class conv2d_NHWC_impl(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, w, bias=None, stride=(1,1), padding=(0,0), dilation=(1,1), groups=1): - # Save constants for bprop - ctx.stride = stride - ctx.padding = padding - ctx.dilation = dilation - ctx.groups = groups - ctx.need_bias_grad = bias is not None - ctx.save_for_backward(x, w) - - if bias is None: - return C.cudnn_convolution_nhwc(x, w, - padding, stride, dilation, - groups, - torch.backends.cudnn.benchmark, False) - else: - return C.cudnn_convolution_with_bias_nhwc(x, w, bias, - padding, stride, dilation, - groups, - torch.backends.cudnn.benchmark, False) - - @staticmethod - def backward(ctx, grad_y): - x, w = ctx.saved_variables - - if ctx.need_bias_grad: - dx, dw, db = C.cudnn_convolution_backward_with_bias_nhwc(x, grad_y, w, - ctx.padding, ctx.stride, ctx.dilation, ctx.groups, - torch.backends.cudnn.benchmark, False, - list(ctx.needs_input_grad[0:3])) - if ctx.needs_input_grad[0]: - return dx, dw, db, None, None, None, None - else: - return None, dw, db, None, None, None, None - else: - dx, dw = C.cudnn_convolution_backward_nhwc(x, grad_y, w, - ctx.padding, ctx.stride, ctx.dilation, ctx.groups, - torch.backends.cudnn.benchmark, False, - list(ctx.needs_input_grad[0:2])) - if ctx.needs_input_grad[0]: - return dx, dw, None, None, None, None, None - else: - return None, dw, None, None, None, None, None - -class Conv2d_NHWC(_ConvNd): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - dilation = _pair(dilation) - super(Conv2d_NHWC, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - False, _pair(0), groups, bias=bias, padding_mode='zeros') - - # permute filters - self.weight = torch.nn.Parameter(self.weight.permute(0, 2, 3, 1).contiguous()) - - def forward(self, x): - return conv2d_NHWC_impl.apply(x, self.weight, self.bias, self.stride, - self.padding, self.dilation, self.groups) - diff --git a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_bn_cudnn.py b/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_bn_cudnn.py deleted file mode 100644 index f74edfbe3..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_bn_cudnn.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
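
The Conv2d_NHWC class above wraps a custom autograd.Function around the project's SSD._C extension and stores its weights pre-permuted to NHWC. As a hedged aside, not the kernel path this patch uses: on recent PyTorch the same NHWC goal can usually be reached with the built-in channels_last memory format, which needs no extension. A small sketch (requires a CUDA build):

import torch
import torch.nn as nn

# Ordinary conv, but with channels_last strides so cuDNN can pick NHWC kernels.
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1, bias=False)
conv = conv.cuda().half().to(memory_format=torch.channels_last)

x = torch.randn(8, 64, 56, 56, device="cuda", dtype=torch.half)
x = x.to(memory_format=torch.channels_last)   # NHWC strides, logical NCHW shape

y = conv(x)
print(y.is_contiguous(memory_format=torch.channels_last))  # usually True on cuDNN builds
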
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -try: - from .batch_norm import * -except: - from batch_norm import * - - -torch.backends.cudnn.benchmark=True - -N = 64 -C = 256 -H = 56 -W = 56 - -fuse_relu = True -fuse_add = True - -bn = BatchNorm2d_NHWC(C, fuse_relu=fuse_relu).cuda() - -# Make a NCHW copy of everything -bn_nchw = torch.nn.BatchNorm2d(C).cuda() -# Need to set this for consistency -bn_nchw.eps = 1e-4 - -# copy the parameters -bn_nchw.weight = torch.nn.Parameter(bn.weight.clone()) -bn_nchw.bias = torch.nn.Parameter(bn.bias.clone()) - -# generate random inputs -x = torch.randn(N, C, H, W).cuda().half() -z = torch.randn(N, C, H, W).cuda().half() - -# Copy input tensors -x_copy = x.clone() -z_copy = z.clone() -x_copy.requires_grad = True -z_copy.requires_grad = True - - -# Transpose -> NHWC -x = x.permute(0,2,3,1).contiguous() -z = z.permute(0,2,3,1).contiguous() -x.requires_grad = True -z.requires_grad = True - -# generate a random signal to backprop, copy -g0 = torch.randn(N, H, W, C).cuda().half() -g0_nchw = g0.clone().permute(0,3,1,2).contiguous() - -# Run NHWC fwd -out = bn(x, z if fuse_add else None) -# Run NHWC bwd -out.backward(g0) - -# Run NCHW fwd -out_nchw = bn_nchw(x_copy) -if fuse_add: - out_nchw += z_copy -if fuse_relu: - out_nchw = out_nchw.relu() - -# Run NCHW bwd -out_nchw.backward(g0_nchw) - -# Permute NHWC results -> NCHW for comparison -out_nhwc = out.permute(0,3,1,2) -x_grad_nhwc = x.grad.permute(0,3,1,2) -if fuse_add: - z_grad_nhwc = z.grad.permute(0,3,1,2) - -atol = 1e-5 -rtol = 1e-3 -print('X: ', torch.allclose(out_nhwc, out_nchw, atol=atol, rtol=rtol)) -print('dS: ', torch.allclose(bn.weight.grad, bn_nchw.weight.grad, atol=atol, rtol=rtol)) -print('dB: ', torch.allclose(bn.bias.grad, bn_nchw.bias.grad, atol=atol, rtol=rtol)) -print('dX: ', torch.allclose(x_grad_nhwc, x_copy.grad, atol=atol, rtol=rtol)) -if fuse_add: - print('dZ: ', torch.allclose(z_grad_nhwc, z_copy.grad, atol=atol, rtol=rtol)) diff --git a/cv/detection/ssd/pytorch/iluvatar/config/ssd300.py b/cv/detection/ssd/pytorch/iluvatar/config/ssd300.py deleted file mode 100644 index 7e550beb4..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/ssd300.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
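
The deleted test above checks NHWC results against an NCHW reference by permuting back and calling torch.allclose with atol=1e-5, rtol=1e-3. A tiny helper in the same spirit; the function name is illustrative and not part of the removed test:

import torch

def nhwc_matches_nchw(out_nhwc, out_nchw, atol=1e-5, rtol=1e-3):
    # out_nhwc is laid out N, H, W, C; permute to N, C, H, W before comparing.
    return torch.allclose(out_nhwc.permute(0, 3, 1, 2), out_nchw,
                          atol=atol, rtol=rtol)

# Usage mirrors the prints in test_bn_cudnn.py:
# print('X: ', nhwc_matches_nchw(out, out_nchw))
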
- -import torch -import torch.nn as nn -# from base_model import L2Norm, ResNet - -try: - from .resnet import ResNet, resnet34 - from .nhwc.conv import Conv2d_NHWC -except: - from resnet import ResNet, resnet34 - from nhwc.conv import Conv2d_NHWC - - -class SSD300(nn.Module): - """ - Build a SSD module to take 300x300 image input, - and output 8732 per class bounding boxes - - label_num: number of classes (including background 0) - """ - def __init__(self, args, label_num, use_nhwc=False, pad_input=False, bn_group=1, pretrained=True): - - super(SSD300, self).__init__() - - self.label_num = label_num - self.use_nhwc = use_nhwc - self.pad_input = pad_input - self.bn_group = bn_group - - # Explicitly RN34 all the time - out_channels = 256 - out_size = 38 - self.out_chan = [out_channels, 512, 512, 256, 256, 256] - - # self.model = ResNet(self.use_nhwc, self.pad_input, self.bn_group) - - rn_args = { - 'bn_group' : bn_group, - 'pad_input' : pad_input, - 'nhwc' : use_nhwc, - 'pretrained' : pretrained, - 'ssd_mods' : True, - } - - self.model = resnet34(**rn_args) - - self._build_additional_features() - - padding_channels_to = 8 - self._build_multibox_heads(use_nhwc, padding_channels_to) - - # after l2norm, conv7, conv8_2, conv9_2, conv10_2, conv11_2 - # classifer 1, 2, 3, 4, 5 ,6 - - # intitalize all weights - with torch.no_grad(): - self._init_weights() - - def _build_multibox_heads(self, use_nhwc, padding_channels_to=8): - self.num_defaults = [4, 6, 6, 6, 4, 4] - self.mbox = [] - self.padding_amounts = [] - - if self.use_nhwc: - conv_fn = Conv2d_NHWC - else: - conv_fn = nn.Conv2d - # Multiple to pad channels to - for nd, oc in zip(self.num_defaults, self.out_chan): - # Horizontally fuse loc and conf convolutions - my_num_channels = nd*(4+self.label_num) - if self.use_nhwc: - # Want to manually pad to get HMMA kernels in NHWC case - padding_amount = padding_channels_to - (my_num_channels % padding_channels_to) - else: - padding_amount = 0 - self.padding_amounts.append(padding_amount) - self.mbox.append(conv_fn(oc, my_num_channels + padding_amount, kernel_size=3, padding=1)) - - self.mbox = nn.ModuleList(self.mbox) - - - """ - Output size from RN34 is always 38x38 - """ - def _build_additional_features(self): - self.additional_blocks = [] - - if self.use_nhwc: - conv_fn = Conv2d_NHWC - else: - conv_fn = nn.Conv2d - - def build_block(input_channels, inter_channels, out_channels, stride=1, pad=0): - return nn.Sequential( - conv_fn(input_channels, inter_channels, kernel_size=1), - nn.ReLU(inplace=True), - conv_fn(inter_channels, out_channels, kernel_size=3, stride=stride, padding=pad), - nn.ReLU(inplace=True) - ) - - strides = [2, 2, 2, 1, 1] - intermediates = [256, 256, 128, 128, 128] - paddings = [1, 1, 1, 0, 0] - - for i, im, o, stride, pad in zip(self.out_chan[:-1], intermediates, self.out_chan[1:], strides, paddings): - self.additional_blocks.append(build_block(i, im, o, stride=stride, pad=pad)) - - self.additional_blocks = nn.ModuleList(self.additional_blocks) - - def _init_additional_weights(self): - addn_blocks = [*self.additional_blocks] - # Need to handle additional blocks differently in NHWC case due to xavier initialization - for layer in addn_blocks: - for param in layer.parameters(): - if param.dim() > 1: - if self.use_nhwc: - # xavier_uniform relies on fan-in/-out, so need to use NCHW here to get - # correct values (K, R) instead of the correct (K, C) - nchw_param_data = param.data.permute(0, 3, 1, 2).contiguous() - nn.init.xavier_uniform_(nchw_param_data) - # Now permute 
correctly-initialized param back to NHWC - param.data.copy_(nchw_param_data.permute(0, 2, 3, 1).contiguous()) - else: - nn.init.xavier_uniform_(param) - - def _init_multibox_weights(self): - layers = [ *self.mbox ] - - for layer, default, padding in zip(layers, self.num_defaults, self.padding_amounts): - for param in layer.parameters(): - if param.dim() > 1 and self.use_nhwc: - # Need to be careful - we're initialising [loc, conf, pad] with - # all 3 needing to be treated separately - conf_channels = default * self.label_num - loc_channels = default * 4 - pad_channels = padding - # Split the parameter into separate parts along K dimension - conf, loc, pad = param.data.split([conf_channels, loc_channels, pad_channels], dim=0) - - # Padding should be zero - pad_data = torch.zeros_like(pad.data) - - def init_loc_conf(p): - p_data = p.data.permute(0, 3, 1, 2).contiguous() - nn.init.xavier_uniform_(p_data) - p_data = p_data.permute(0, 2, 3, 1).contiguous() - return p_data - - # Location and confidence data - loc_data = init_loc_conf(loc) - conf_data = init_loc_conf(conf) - - # Put the full weight together again along K and copy - param.data.copy_(torch.cat([conf_data, loc_data, pad_data], dim=0)) - elif param.dim() > 1: - nn.init.xavier_uniform_(param) - - def _init_weights(self): - self._init_additional_weights() - self._init_multibox_weights() - - # Shape the classifier to the view of bboxes - def bbox_view(self, src, mbox): - locs = [] - confs = [] - for s, m, num_defaults, pad in zip(src, mbox, self.num_defaults, self.padding_amounts): - mm = m(s) - conf_channels = num_defaults * self.label_num - loc_channels = num_defaults * 4 - - if self.use_nhwc: - conf, loc, _ = mm.split([conf_channels, loc_channels, pad], dim=3) - conf, loc = conf.contiguous(), loc.contiguous() - # We now have unfused [N, H, W, C] - # Layout is a little awkward here. - # Take C = c * d, then we actually have: - # [N, H, W, c*d] - # flatten HW first: - # [N, H, W, c*d] -> [N, HW, c*d] - locs.append( - loc.view(s.size(0), -1, 4 * num_defaults).permute(0, 2, 1).contiguous().view(loc.size(0), 4, -1)) - confs.append( - conf.view(s.size(0), -1, self.label_num * num_defaults).permute(0, 2, 1).contiguous().view(conf.size(0), self.label_num, -1)) - else: - conf, loc = mm.split([conf_channels, loc_channels], dim=1) - conf, loc = conf.contiguous(), loc.contiguous() - # flatten the anchors for this layer - locs.append(loc.view(s.size(0), 4, -1)) - confs.append(conf.view(s.size(0), self.label_num, -1)) - - cat_dim = 2 - locs, confs = torch.cat(locs, cat_dim), torch.cat(confs, cat_dim) - - return locs, confs - - def forward(self, data): - - layers = self.model(data) - - # last result from network goes into additional blocks - x = layers - # If necessary, transpose back to NCHW - additional_results = [] - for i, l in enumerate(self.additional_blocks): - x = l(x) - additional_results.append(x) - - # do we need the l2norm on the first result? 
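
The head construction in _build_multibox_heads above pads the fused loc+conf channel count up to a multiple of 8 so the NHWC convolutions can hit Tensor Core (HMMA) kernels. A small worked restatement of that arithmetic; the example numbers are illustrative (e.g. 81 labels for a COCO-style setup), not taken from this patch:

def fused_head_channels(num_defaults, label_num, padding_channels_to=8):
    # 4 box offsets + label_num confidences per default box, padded as above.
    channels = num_defaults * (4 + label_num)
    padding_amount = padding_channels_to - (channels % padding_channels_to)
    return channels + padding_amount, padding_amount

print(fused_head_channels(4, 81))   # (344, 4): 4 * (4 + 81) = 340, padded to 344
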
- src = [layers, *additional_results] - # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4 - - locs, confs = self.bbox_view(src, self.mbox) - - # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results - return locs, confs - diff --git a/cv/detection/ssd/pytorch/iluvatar/config/training_event.py b/cv/detection/ssd/pytorch/iluvatar/config/training_event.py deleted file mode 100644 index 3fdb8fba4..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/config/training_event.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -from typing import Tuple - -import torch -import apex -from torch.cuda.amp import GradScaler -from torch.optim import Optimizer - -from train.event.base import BaseTrainingEventInterface -from train.event.base import BatchType, SSD_MODEL -from train.training_state import TrainingState - -from converter import convert_model - - -class ApexTrainingEvent(BaseTrainingEventInterface): - - def __init__(self, config): - super(ApexTrainingEvent, self).__init__(config) - self.model = None - self.optimizer = None - self.overflow_buf = None - - def save_checkpoint(self, path: str, training_state: TrainingState): - torch.save({ - "model": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - "amp": apex.amp.state_dict(), - "master params": list(apex.amp.master_params(self.optimizer)), - "epoch": training_state.epoch, - "iter_num": training_state.iter_num, - }, "{}/epoch{}_{}.pt".format(path, training_state.epoch, round(training_state.eval_ap, 5))) - - def load_checkpoint(self, checkpoint): - self.model.load_state_dict(checkpoint["model"], strict=True) - self.optimizer.load_state_dict(checkpoint["optimizer"]) - self.config.iteration = checkpoint["iter_num"] - self.config.epoch = checkpoint["epoch"] - if checkpoint.get("amp", None): - apex.amp.load_state_dict(checkpoint["amp"]) - if checkpoint.get("master params", None): - for param, saved_param in zip(apex.amp.master_params(self.optimizer), checkpoint["master params"]): - param.data.copy_(saved_param.data) - - def on_init_start(self): - pass - - def convert_model(self, model: SSD_MODEL) -> SSD_MODEL: - self.model = convert_model(model, self.config) - return self.model - - def create_optimizer(self, model: SSD_MODEL) -> Optimizer: - config = self.config - base_lr = 2.5e-3 - requested_lr_multiplier = config.learning_rate / base_lr - adjusted_multiplier = max(1, round(requested_lr_multiplier * config.train_batch_size * config.n_gpu / 32)) - - current_lr = base_lr * adjusted_multiplier - current_weight_decay = config.weight_decay_rate - - self.optimizer = apex.optimizers.FusedSGD(model.parameters(), - lr=current_lr, - momentum=0.9, - weight_decay=current_weight_decay) - return self.optimizer - - def model_to_fp16(self, model: SSD_MODEL, optimizer: Optimizer) -> Tuple[SSD_MODEL, Optimizer]: - self.model, self.optimizer = apex.amp.initialize(model, optimizer, opt_level="O{}".format(self.config.opt_level), loss_scale=128.) 
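
create_optimizer above derives the effective learning rate from a 2.5e-3 base and a multiplier that scales with batch size and GPU count. A hedged restatement with an arbitrary example; the numbers are not a claim about any particular config in this patch:

def scaled_lr(requested_lr, train_batch_size, n_gpu, base_lr=2.5e-3):
    # Mirrors the formula in ApexTrainingEvent.create_optimizer above.
    requested_multiplier = requested_lr / base_lr
    adjusted_multiplier = max(1, round(requested_multiplier * train_batch_size * n_gpu / 32))
    return base_lr * adjusted_multiplier

print(scaled_lr(2.92e-3, train_batch_size=120, n_gpu=1))  # 2.5e-3 * 4 = 0.01
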
- return self.model, self.optimizer - - def model_to_ddp(self, model: SSD_MODEL) -> SSD_MODEL: - config = self.config - if config.distributed: - if config.delay_allreduce: - print(config.local_rank, "Delaying allreduces to the end of backward()") - self.model = apex.parallel.DistributedDataParallel(model, - gradient_predivide_factor=config.n_gpu / 8.0, - delay_allreduce=config.delay_allreduce, - retain_allreduce_buffers=config.fp16) - else: - self.model = model - return self.model - - def on_step_begin(self, step: int): - pass - - def on_step_end(self, step: int): - pass - - def on_backward(self, step: int, loss: torch.Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): - with apex.amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - update_step = step % self.config.gradient_accumulation_steps == 0 - if update_step: - self.update_model_params(optimizer, grad_scaler) - - def update_model_params(self, optimizer: Optimizer, grad_scaler: GradScaler=None): - optimizer.step() - for param in self.model.parameters(): - param.grad = None - - - diff --git a/cv/detection/ssd/pytorch/iluvatar/reset.sh b/cv/detection/ssd/pytorch/iluvatar/reset.sh deleted file mode 100644 index 0380b4762..000000000 --- a/cv/detection/ssd/pytorch/iluvatar/reset.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Copyright (c) 2022, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -# clean cache for host memory -echo 3 > /proc/sys/vm/drop_caches - -# reset BI -ixsmi -r - - diff --git a/cv/detection/ssd/pytorch/install_ssd.sh b/cv/detection/ssd/pytorch/install_ssd.sh new file mode 100644 index 000000000..a65d5c927 --- /dev/null +++ b/cv/detection/ssd/pytorch/install_ssd.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +TARGET_DIR=${TARGET_DIR:-} + +PYTHON_PATH=$(which python3) +PYTHON_DIST_PATH=${TARGET_DIR}/lib/python3/dist-packages + +PKG_DIR="build_pip" +PKG_NAME="ssd" + +if [[ ! -d ${PKG_DIR} ]]; then + echo "ERROR: Package directory ${PKG_DIR} doesn't exist" + exit 1 +fi + +latest_pkg="$(ls -t ${PKG_DIR} | grep -i ${PKG_NAME} | head -1)" +if [[ "${latest_pkg}" == "" ]]; then + echo "ERROR: Cannot find latest ${PKG_NAME} package" + exit 1 +else + echo "INFO: Found latest package ${latest_pkg} in directory ${PKG_DIR}" +fi + +if [[ "${TARGET_DIR}" != "" ]]; then + ${PYTHON_PATH} -m pip install --upgrade --no-deps -t ${PYTHON_DIST_PATH} ${PKG_DIR}/${latest_pkg} || exit + echo "SSD-0.1 installed in ${PYTHON_DIST_PATH}; please add it to your PYTHONPATH." 
+else + ${PYTHON_PATH} -m pip uninstall ${PKG_NAME} -y + ${PYTHON_PATH} -m pip install --no-deps ${PKG_DIR}/${latest_pkg} || exit +fi + +# Return 0 status if all finished +exit 0 diff --git a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_conv.py b/cv/detection/ssd/pytorch/master_params.py similarity index 33% rename from cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_conv.py rename to cv/detection/ssd/pytorch/master_params.py index 2ccf96905..a7defbf39 100644 --- a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_conv.py +++ b/cv/detection/ssd/pytorch/master_params.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,53 +13,27 @@ # limitations under the License. import torch +import apex_C -try: - from .conv import * -except: - from conv import * +def check_type_split(buckets): + for bucket in buckets: + this_type = bucket[0].type() + for param in bucket: + if param.type() != this_type: + raise ValueError("Each bucket must contain only params of the same type.") -torch.backends.cudnn.benchmark=True -N = 64 -C = 32 -K = 256 -H = 56 -W = 56 +def create_flat_master(model_buckets): + # Ideally, we'd like to flatten the model params as well, and reset the float params' .data + # attributes to point directly into the flattened master buffers. However, my version that does + # so is yielding CUDNN_STATUS_BAD_PARAM errors when running with distributed and nhwc. + # I ended up making the safe choice of not altering what the params' .data members point to. + check_type_split(model_buckets) -conv = Conv2d_NHWC(C, K, 1).cuda().half() -weight_orig = conv.weight.clone() -conv.weight = torch.nn.Parameter(conv.weight.permute(0,2,3,1).contiguous()) + flat_master_buckets = [apex_C.flatten([p.detach().clone().float() for p in model_bucket]) + for model_bucket in model_buckets] -# Make a NCHW copy of everything -conv_nchw = torch.nn.Conv2d(C, K, 1, bias=False).cuda().half() -conv_nchw.weight = torch.nn.Parameter(weight_orig) + for flat_master in flat_master_buckets: + flat_master.requires_grad_() -x = torch.randn(N, C, H, W).cuda().half() - -# Copy input tensor -x_copy = x.clone() -x_copy.requires_grad = True - -# Transpose -> NHWC -x = x.permute(0,2,3,1).contiguous() -x.requires_grad = True - -g0 = torch.randn(N, H, W, K).cuda().half() -g0_nchw = g0.clone().permute(0,3,1,2).contiguous() - -out = conv(x) -out = out.relu_() -out.backward(g0) - -out_nchw = conv_nchw(x_copy) -out_nchw = out_nchw.relu_() - -out_nchw.backward(g0_nchw) - -out_nhwc = out.permute(0,3,1,2) -#print(out_nhwc) -#print(out_nchw) - -print(torch.allclose(out_nhwc, out_nchw, atol=1e-5, rtol=1e-3)) -print(torch.allclose(conv.weight.grad.permute(0,3,1,2), conv_nchw.weight.grad, atol=1e-5, rtol=1e-3)) + return flat_master_buckets diff --git a/cv/detection/ssd/pytorch/mlperf_log_utils.py b/cv/detection/ssd/pytorch/mlperf_log_utils.py new file mode 100644 index 000000000..be94d97ce --- /dev/null +++ b/cv/detection/ssd/pytorch/mlperf_log_utils.py @@ -0,0 +1,34 @@ +import collections +import os +import subprocess + +from mlperf_logging.mllog import constants +from mlperf_logger import log_event, configure_logger + + +def mlperf_submission_log(benchmark): + + num_nodes = os.environ.get('SLURM_JOB_NUM_NODES', 1) + + configure_logger(benchmark) + + log_event( + key=constants.SUBMISSION_BENCHMARK, + 
value=benchmark, + ) + + log_event( + key=constants.SUBMISSION_ORG, + value='NVIDIA') + + log_event( + key=constants.SUBMISSION_DIVISION, + value='closed') + + log_event( + key=constants.SUBMISSION_STATUS, + value='onprem') + + log_event( + key=constants.SUBMISSION_PLATFORM, + value=f'{num_nodes}xSUBMISSION_PLATFORM_PLACEHOLDER') diff --git a/cv/detection/ssd/pytorch/mlperf_logger.py b/cv/detection/ssd/pytorch/mlperf_logger.py new file mode 100644 index 000000000..3a7f22d4f --- /dev/null +++ b/cv/detection/ssd/pytorch/mlperf_logger.py @@ -0,0 +1,101 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os +import subprocess +import torch +import numpy as np +from mlperf_logging import mllog +import random + +mllogger = mllog.get_mllogger() + +def log_start(*args, **kwargs): + _log_print(mllogger.start, *args, **kwargs) +def log_end(*args, **kwargs): + _log_print(mllogger.end, *args, **kwargs) +def log_event(*args, **kwargs): + _log_print(mllogger.event, *args, **kwargs) +def _log_print(logger, *args, **kwargs): + """ + Wrapper for MLPerf compliance logging calls. + All arguments but 'log_all_ranks' are passed to + mlperf_logging.mllog. + If 'log_all_ranks' is set to True then all distributed workers will print + logging message, if set to False then only worker with rank=0 will print + the message. + """ + if 'stack_offset' not in kwargs: + kwargs['stack_offset'] = 3 + if 'value' not in kwargs: + kwargs['value'] = None + + if kwargs.pop('log_all_ranks', False): + log = True + else: + log = (get_rank() == 0) + + if log: + logger(*args, **kwargs) + + +def configure_logger(benchmark): + mllog.config(filename=os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{benchmark}.log')) + mllogger = mllog.get_mllogger() + mllogger.logger.propagate = False + + +def barrier(): + """ + Works as a temporary distributed barrier, currently pytorch + doesn't implement barrier for NCCL backend. + Calls all_reduce on dummy tensor and synchronizes with GPU. + """ + if torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + + +def get_rank(): + """ + Gets distributed rank or returns zero if distributed is not initialized. 
+ """ + if torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank + +def broadcast_seeds(seed, device): + if torch.distributed.is_initialized(): + seeds_tensor = torch.LongTensor([seed]).to(device) + torch.distributed.broadcast(seeds_tensor, 0) + seed = seeds_tensor.item() + return seed + +def set_seeds(args): + torch.cuda.set_device(args.local_rank) + device = torch.device('cuda') + + # make sure that all workers has the same master seed + log_event(key=mllog.constants.SEED, value=args.seed) + args.seed = broadcast_seeds(args.seed, device) + + local_seed = (args.seed + get_rank()) % 2**32 + print(get_rank(), "Using seed = {}".format(local_seed)) + random.seed(local_seed) + torch.manual_seed(local_seed) + np.random.seed(seed=local_seed) + return local_seed diff --git a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/batch_norm.py b/cv/detection/ssd/pytorch/nhwc/batch_norm.py similarity index 100% rename from cv/detection/ssd/pytorch/iluvatar/config/nhwc/batch_norm.py rename to cv/detection/ssd/pytorch/nhwc/batch_norm.py diff --git a/cv/detection/ssd/pytorch/nhwc/cifar10_nhwc.py b/cv/detection/ssd/pytorch/nhwc/cifar10_nhwc.py new file mode 100644 index 000000000..e3b24c68b --- /dev/null +++ b/cv/detection/ssd/pytorch/nhwc/cifar10_nhwc.py @@ -0,0 +1,229 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import torch +import torchvision +import torchvision.transforms as transforms +from tqdm import tqdm +import math +import torch.optim as optim +import torch.optim.lr_scheduler as lr_scheduler +import matplotlib.pyplot as plt +import pylab +import numpy as np +from torch.autograd import Variable + +from argparse import ArgumentParser +from apex.fp16_utils import * +from apex.parallel import DistributedDataParallel as DDP + +from vgg_nhwc import VggBN_NHWC +from resnet_nhwc_cifar10 import * + +def parse_args(): + parser = ArgumentParser(description="NHWC CIFAR10 test") + parser.add_argument('--local_rank', '--local-rank', default=0, type=int, + help='Used for multi-process training. 
Can either be manually set ' + + 'or automatically set by using \'python -m multiproc\'.') + parser.add_argument('--net', type=str, choices=['vgg', 'rn18', 'rn34'], default='vgg') + + return parser.parse_args() + +def train_and_test(args): + # Setup multi-GPU if necessary + args.distributed = False + args.world_size = 1 + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) > 1 + args.world_size = int(os.environ['WORLD_SIZE']) + + if args.distributed: + torch.cuda.set_device(args.local_rank) + + torch.distributed.init_process_group(backend='nccl', + init_method='env://') + + + torch.manual_seed(0) + torch.cuda.manual_seed(0) + + batch_size = 32 + learning_rate = 0.01 + learning_rate_decay = 0.0005 + momentum = 0.9 + epoch_step = 25 + max_epoch = 300 + + transform_train = transforms.Compose( + [transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + transform_test = transforms.Compose( + [transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + trainset = torchvision.datasets.CIFAR10(root='./data', train=True, + download=True, transform=transform_train) + testset = torchvision.datasets.CIFAR10(root='./data', train=False, + download=True, transform=transform_test) + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(trainset) + val_sampler = torch.utils.data.distributed.DistributedSampler(testset) + else: + train_sampler = None + val_sampler = None + + trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, + shuffle=train_sampler is None, + sampler=train_sampler, num_workers=4) + + testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, + shuffle=val_sampler is None, + sampler=val_sampler, num_workers=4) + + save_dir = "./save" + if not os.path.exists(save_dir) and args.local_rank == 0: + os.mkdir(save_dir) + + + def weights_init(m): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2./n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + if args.net == 'vgg': + net = VggBN_NHWC() + elif args.net == 'rn18': + net = ResNet18_NHWC() + elif args.net == 'rn34': + net = ResNet34_NHWC() + + net = network_to_half(net.cuda()) + net.apply(weights_init) + net.cuda() + + + criterion = nn.CrossEntropyLoss() + base_lr = learning_rate * args.world_size + optimizer = optim.SGD(net.parameters(), lr=base_lr, momentum=0.9, weight_decay=learning_rate_decay) + scheduler = lr_scheduler.StepLR(optimizer, step_size=epoch_step, gamma=0.5) + optimizer = FP16_Optimizer(optimizer, static_loss_scale=128.) + + if args.distributed: + net = DDP(net) + + test_accuracies = np.zeros(max_epoch) + for epoch in range(max_epoch): # loop over the dataset multiple times + #pbar = tqdm(trainloader) + #pbar.mininterval = 1 # update the processing bar at least 1 second + + """ + Initial Check + """ + """ + net.eval() + + if epoch == 0: + print('\033[0;31mInitial Check: \033[0m') + running_loss, correct, total = 0., 0., 0. + for i, data in enumerate(testloader, 0): + images, labels = data + images, labels = Variable(images.cuda()), Variable(labels.cuda()) + images = images.half().permute(0, 2, 3, 1).contiguous() + outputs = net(images) + loss = criterion(outputs, labels) + running_loss = running_loss * (i/(i+1.)) + loss.data[0] * (1./(i+1.) 
) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels.data).sum() + print('Loss on the test images: %f ...... should be 2.3' % running_loss) + print('Accuracy on the test images: %f %% ...... should be 10%%' % (100. * correct / total)) + """ + + """ + Training ... + """ + net.train() + + running_loss, correct, total = 0., 0., 0. + scheduler.step() + + for i, data in enumerate(trainloader, 0): + # get the inputs + inputs, labels = data + # wrap them in Variable + inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda()) + inputs = inputs.half().permute(0, 2, 3, 1).contiguous() + inputs.require_grad = False + labels.require_grad = False + # zero the parameter gradients + optimizer.zero_grad() + # forward + backward + optimize + outputs = net(inputs) + loss = criterion(outputs, labels) + optimizer.backward(loss) + optimizer.step() + + # update statistics + running_loss = running_loss * (i/(i+1.)) + loss.data[0] * (1./(i+1.) ) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels.data).sum() + print('\033[0;32m Statistics on epoch :%d learning rate: %f\033[0m' %(epoch, scheduler.get_lr()[0])) + print('Train Loss : %f Train Accuracy: %f %%' % (running_loss, 100. * correct / total)) + + """ + Testing ... + """ + net.eval() + + correct, total = 0., 0. + for data in testloader: + images, labels = data + images = Variable(images.cuda()).half().permute(0, 2, 3, 1).contiguous() + outputs = net(images) + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels.cuda()).sum() + + print('[{}] {} / {} correct'.format(args.local_rank, correct, total)) + print('Test Accuracy: \033[1;33m%f %%\033[0m' % (100. * correct / total)) + test_accuracies[epoch] = 100. * correct / total + + """ + Saving model and accuracies, and ploting + """ + #np.save('./save/accuracies.npy', test_accuracies) + #torch.save(net.state_dict(), './save/model.%d.pkl' %epoch) + + #plt.figure() + #pylab.xlim(0, max_epoch + 1) + #pylab.ylim(0, 100) + #plt.plot(range(1, max_epoch +1), test_accuracies) + #plt.savefig('./save/accuracies.png') + #plt.close() + +if __name__ == "__main__": + torch.backends.cudnn.benchmark = True + + args = parse_args() + + train_and_test(args) + + diff --git a/cv/detection/ssd/pytorch/nvidia/config/nhwc/conv.py b/cv/detection/ssd/pytorch/nhwc/conv.py similarity index 92% rename from cv/detection/ssd/pytorch/nvidia/config/nhwc/conv.py rename to cv/detection/ssd/pytorch/nhwc/conv.py index 1bdddc757..94b81b255 100644 --- a/cv/detection/ssd/pytorch/nvidia/config/nhwc/conv.py +++ b/cv/detection/ssd/pytorch/nhwc/conv.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
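
The running_loss update in the CIFAR-10 training loop above is a streaming arithmetic mean: after i+1 batches it equals the average of the per-batch losses seen so far. A tiny numeric check of that identity:

losses = [2.5, 2.1, 1.9, 1.7]
running_loss = 0.0
for i, loss in enumerate(losses):
    running_loss = running_loss * (i / (i + 1.0)) + loss * (1.0 / (i + 1.0))
print(running_loss, sum(losses) / len(losses))   # both approximately 2.05
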
+import sys import torch from torch.nn.modules.conv import _ConvNd @@ -23,9 +24,14 @@ from itertools import repeat def _ntuple(n): def parse(x): - if isinstance(x, collections.Iterable): - return x - return tuple(repeat(x, n)) + if sys.version_info < (3, 10, 0): + if isinstance(x, collections.Iterable): + return x + return tuple(repeat(x, n)) + else: + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) return parse _single = _ntuple(1) diff --git a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/max_pool.py b/cv/detection/ssd/pytorch/nhwc/max_pool.py similarity index 100% rename from cv/detection/ssd/pytorch/iluvatar/config/nhwc/max_pool.py rename to cv/detection/ssd/pytorch/nhwc/max_pool.py diff --git a/cv/detection/ssd/pytorch/nhwc/mnist_nhwc.py b/cv/detection/ssd/pytorch/nhwc/mnist_nhwc.py new file mode 100644 index 000000000..f35f04159 --- /dev/null +++ b/cv/detection/ssd/pytorch/nhwc/mnist_nhwc.py @@ -0,0 +1,133 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms + +from nhwc.conv import Conv2d_NHWC +from nhwc.max_pool import MaxPool2d_NHWC + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = Conv2d_NHWC(1, 10, kernel_size=5) + self.pool1 = MaxPool2d_NHWC(2) + self.conv2 = Conv2d_NHWC(10, 20, kernel_size=5) + self.pool2 = MaxPool2d_NHWC(2) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(self.pool1(self.conv1(x))) + x = F.relu(self.pool2(self.conv2_drop(self.conv2(x)))) + x = x.permute(0, 3, 1, 2).contiguous() + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + data = data.permute(0, 2, 3, 1).contiguous().half() + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. 
* batch_idx / len(train_loader), loss.item())) + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + data = data.permute(0, 2, 3, 1).contiguous().half() + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. * correct / len(test_loader.dataset))) + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + + torch.manual_seed(args.seed) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + datasets.MNIST('../data', train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST('../data', train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.test_batch_size, shuffle=True, **kwargs) + + + model = Net().to(device).half() + optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) + + torch.backends.cudnn.benchmark = True + + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(args, model, device, test_loader) + + +if __name__ == '__main__': + main() diff --git a/cv/detection/ssd/pytorch/nhwc/resnet_nhwc_cifar10.py b/cv/detection/ssd/pytorch/nhwc/resnet_nhwc_cifar10.py new file mode 100644 index 000000000..f6f5c3b2d --- /dev/null +++ b/cv/detection/ssd/pytorch/nhwc/resnet_nhwc_cifar10.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
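
The _ntuple change in the nhwc/conv.py hunk above exists because Python 3.10 removed the long-deprecated collections.Iterable alias; going through collections.abc works on both old and new interpreters. A minimal sketch of the same guard without the version check:

import collections.abc
from itertools import repeat

def _ntuple(n):
    def parse(x):
        if isinstance(x, collections.abc.Iterable):
            return x
        return tuple(repeat(x, n))
    return parse

_pair = _ntuple(2)
print(_pair(3), _pair((1, 2)))   # (3, 3) (1, 2)
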
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +'''ResNet in PyTorch. + +For Pre-activation ResNet, see 'preact_resnet.py'. + +Reference: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + +from conv import Conv2d_NHWC +from batch_norm import BatchNorm2d_NHWC +from max_pool import MaxPool2d_NHWC + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = Conv2d_NHWC(in_planes, planes, kernel_size=3, stride=stride, padding=1) + self.bn1 = BatchNorm2d_NHWC(planes, fuse_relu=True) + self.conv2 = Conv2d_NHWC(planes, planes, kernel_size=3, stride=1, padding=1) + self.bn2 = BatchNorm2d_NHWC(planes, fuse_relu=False) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + Conv2d_NHWC(in_planes, self.expansion*planes, kernel_size=1, stride=stride), + BatchNorm2d_NHWC(self.expansion*planes, fuse_relu=False) + ) + + def forward(self, x): + out = self.bn1(self.conv1(x)) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = Conv2d_NHWC(in_planes, planes, kernel_size=1) + self.bn1 = BatchNorm2d_NHWC(planes, fuse_relu=True) + self.conv2 = Conv2d_NHWC(planes, planes, kernel_size=3, stride=stride, padding=1) + self.bn2 = BatchNorm2d_NHWC(planes, fuse_relu=True) + self.conv3 = Conv2d_NHWC(planes, self.expansion*planes, kernel_size=1) + self.bn3 = BatchNorm2d_NHWC(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + Conv2d_NHWC(in_planes, self.expansion*planes, kernel_size=1, stride=stride), + BatchNorm2d_NHWC(self.expansion*planes) + ) + + def forward(self, x): + out = self.bn1(self.conv1(x)) + out = self.bn2(self.conv2(out)) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet_NHWC(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet_NHWC, self).__init__() + self.in_planes = 64 + + self.conv1 = Conv2d_NHWC(3, 64, kernel_size=3, stride=1, padding=1) + self.bn1 = BatchNorm2d_NHWC(64, fuse_relu=True) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = self.bn1(self.conv1(x)) + out = 
self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + + # Move back to NCHW for final parts + out = out.permute(0, 3, 1, 2).contiguous() + + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def ResNet18_NHWC(): + return ResNet_NHWC(BasicBlock, [2,2,2,2]) + +def ResNet34_NHWC(): + return ResNet_NHWC(BasicBlock, [3,4,6,3]) + +def ResNet50_NHWC(): + return ResNet_NHWC(Bottleneck, [3,4,6,3]) + +def ResNet101_NHWC(): + return ResNet_NHWC(Bottleneck, [3,4,23,3]) + +def ResNet152(): + return ResNet(Bottleneck, [3,8,36,3]) + diff --git a/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_bn_cudnn.py b/cv/detection/ssd/pytorch/nhwc/test_bn_cudnn.py similarity index 97% rename from cv/detection/ssd/pytorch/nvidia/config/nhwc/test_bn_cudnn.py rename to cv/detection/ssd/pytorch/nhwc/test_bn_cudnn.py index f74edfbe3..a1350d2e8 100644 --- a/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_bn_cudnn.py +++ b/cv/detection/ssd/pytorch/nhwc/test_bn_cudnn.py @@ -12,14 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from conv import * +from batch_norm import * import torch -try: - from .batch_norm import * -except: - from batch_norm import * - - torch.backends.cudnn.benchmark=True N = 64 diff --git a/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_conv.py b/cv/detection/ssd/pytorch/nhwc/test_conv.py similarity index 96% rename from cv/detection/ssd/pytorch/nvidia/config/nhwc/test_conv.py rename to cv/detection/ssd/pytorch/nhwc/test_conv.py index 2ccf96905..c080ff349 100644 --- a/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_conv.py +++ b/cv/detection/ssd/pytorch/nhwc/test_conv.py @@ -12,13 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from nhwc.conv import * import torch -try: - from .conv import * -except: - from conv import * - torch.backends.cudnn.benchmark=True N = 64 diff --git a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_max_pool.py b/cv/detection/ssd/pytorch/nhwc/test_max_pool.py similarity index 96% rename from cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_max_pool.py rename to cv/detection/ssd/pytorch/nhwc/test_max_pool.py index 3212b76c1..77fbc6dcc 100644 --- a/cv/detection/ssd/pytorch/iluvatar/config/nhwc/test_max_pool.py +++ b/cv/detection/ssd/pytorch/nhwc/test_max_pool.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
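
ResNet_NHWC.forward above permutes activations back to NCHW before F.avg_pool2d and the final nn.Linear, which expect the default layout. A shape-only sketch of that handoff, with illustrative sizes and no custom kernels:

import torch
import torch.nn.functional as F

n, h, w, c = 8, 4, 4, 512
feat_nhwc = torch.randn(n, h, w, c)

feat_nchw = feat_nhwc.permute(0, 3, 1, 2).contiguous()   # N, C, H, W
pooled = F.avg_pool2d(feat_nchw, 4)                      # N, C, 1, 1
flat = pooled.view(pooled.size(0), -1)                   # N, C
print(flat.shape)                                        # torch.Size([8, 512])
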
+from nhwc.conv import * +from nhwc.max_pool import * import torch -try: - from .max_pool import * -except: - from max_pool import * - torch.backends.cudnn.benchmark=True N = 64 diff --git a/cv/detection/ssd/pytorch/nvidia/config/Dockerfile b/cv/detection/ssd/pytorch/nvidia/config/Dockerfile deleted file mode 100644 index 77de5e1b4..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:20.06-py3 -FROM ${FROM_IMAGE_NAME} - -# Install dependencies for system configuration logger -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - infiniband-diags \ - pciutils \ - && rm -rf /var/lib/apt/lists/* - -# Configure environment variables -ENV OMP_NUM_THREADS=1 -ENV OPENCV_FOR_THREADS_NUM=1 - diff --git a/cv/detection/ssd/pytorch/nvidia/config/config_V100x1x1.py b/cv/detection/ssd/pytorch/nvidia/config/config_V100x1x1.py deleted file mode 100644 index 584f5ac91..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/config_V100x1x1.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = False -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4043767865 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = True -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = True - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/nvidia/config/config_V100x1x8.py b/cv/detection/ssd/pytorch/nvidia/config/config_V100x1x8.py deleted file mode 100644 index 07f345a48..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/config_V100x1x8.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. 
Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = True -dali = True -dali_sync = False -dali_cache = -1 -nhwc = True -pad_input = True -jit = True -use_nvjpeg = True - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/nvidia/config/config_nodali_V100x1x8.py b/cv/detection/ssd/pytorch/nvidia/config/config_nodali_V100x1x8.py deleted file mode 100644 index 20d801472..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/config_nodali_V100x1x8.py +++ /dev/null @@ -1,44 +0,0 @@ -from training_event import ApexTrainingEvent - -# 1.Basic Configurations -n_gpu = 1 -distributed = True -dist_backend = "nccl" - -save_checkpoint = False - -seed = 4230048668 -log_freq = 20 - - -# 2.Model Training Configurations -gradient_accumulation_steps = 1 -train_batch_size = 120 -eval_batch_size = 160 -learning_rate = 2.92e-3 -weight_decay_rate = 1.6e-4 -lr_decay_factor = 0.1 -lr_decay_epochs = [44, 55] -warmup = 650 -warmup_factor = 0 -loss_scale = 0.0 - -# 3. Optimizer Configurations -num_workers = 4 -fp16 = True -opt_level = 2 -delay_allreduce = True -bn_group = 1 -fast_nms = True -fast_cj = True -use_coco_ext = True -dali = False -dali_sync = False -dali_cache = 0 -nhwc = False -pad_input = False -jit = False -use_nvjpeg = False - - -training_event = ApexTrainingEvent \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/nvidia/config/converter.py b/cv/detection/ssd/pytorch/nvidia/config/converter.py deleted file mode 100644 index e61209dcb..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/converter.py +++ /dev/null @@ -1,15 +0,0 @@ -import copy -import math -from torch.utils import checkpoint - -from ssd300 import SSD300 - - -def convert_model(model, config): - model_options = { - 'use_nhwc': config.nhwc, - 'pad_input': config.pad_input, - 'bn_group': config.bn_group, - } - model = SSD300(config, config.num_classes, **model_options).cuda() - return model diff --git a/cv/detection/ssd/pytorch/nvidia/config/environment_variables.sh b/cv/detection/ssd/pytorch/nvidia/config/environment_variables.sh deleted file mode 100644 index e28aba1a3..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/environment_variables.sh +++ /dev/null @@ -1,10 +0,0 @@ -# ================================================= -# Export variables -# ================================================= - -export CONTAINER_MOUNTS="--gpus all" -NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ --expt-relaxed-constexpr -ftemplate-depth=1024 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80" -if [[ "$PYTORCH_BUILD_VERSION" == 1.8* ]]; then - NVCC_ARGUMENTS="${NVCC_ARGUMENTS} -D_PYTORCH18" -fi -export NVCC_ARGUMENTS \ No newline at end of file diff --git a/cv/detection/ssd/pytorch/nvidia/config/nhwc/batch_norm.py b/cv/detection/ssd/pytorch/nvidia/config/nhwc/batch_norm.py deleted file mode 100644 index b0e0402a9..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/nhwc/batch_norm.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from torch.nn.modules.batchnorm import _BatchNorm - -import SSD._C as C - -import collections -from itertools import repeat - -class bn_NHWC_impl(torch.autograd.Function): - @staticmethod - def forward(ctx, x, s, b, rm, riv, mom, epsilon, fuse_relu=False, is_train=True, z=None): - if is_train: - ctx.epsilon = epsilon - ctx.momentum = mom - ctx.fuse_relu = fuse_relu - - ctx.fuse_add = False if z is None else True - - if z is not None: - y, save_mean, save_var, reserve = C.bn_add_fwd_nhwc_cudnn(x, z, s, b, rm, riv, mom, epsilon, fuse_relu) - else: - y, save_mean, save_var, reserve = C.bn_fwd_nhwc_cudnn(x, s, b, rm, riv, mom, epsilon, fuse_relu) - - ctx.save_for_backward(x, y, s, b, rm, riv, save_mean, save_var, reserve) - - return y - else: - if z is not None: - return C.bn_add_fwd_eval_nhwc_cudnn(x, z, s, b, rm, riv, mom, epsilon, fuse_relu) - else: - return C.bn_fwd_eval_nhwc_cudnn(x, s, b, rm, riv, mom, epsilon, fuse_relu) - - @staticmethod - def backward(ctx, grad_y): - x, y, s, b, rm, riv, save_mean, save_var, reserve = ctx.saved_variables - epsilon = ctx.epsilon - mom = ctx.momentum - fuse_relu = ctx.fuse_relu - fuse_add = ctx.fuse_add - - if ctx.fuse_add: - dx, dz, dscale, dbias = C.bn_add_bwd_nhwc_cudnn(x, y, grad_y, s, b, rm, riv, save_mean, save_var, reserve, mom, epsilon, fuse_relu) - else: - dx, _, dscale, dbias = C.bn_bwd_nhwc_cudnn(x, y, grad_y, s, b, rm, riv, save_mean, save_var, reserve, mom, epsilon, fuse_relu) - dz = None - - return dx, dscale, dbias, None, None, None, None, None, None, dz - - - -class BatchNorm2d_NHWC(_BatchNorm): - def __init__(self, num_features, fuse_relu=False): - super(BatchNorm2d_NHWC, self).__init__(num_features) - - self.fuse_relu = fuse_relu - - def forward(self, x, z=None): - return bn_NHWC_impl.apply(x, - self.weight, self.bias, - self.running_mean, self.running_var, - self.momentum, - self.eps, self.fuse_relu, self.training, z) - diff --git a/cv/detection/ssd/pytorch/nvidia/config/nhwc/max_pool.py b/cv/detection/ssd/pytorch/nvidia/config/nhwc/max_pool.py deleted file mode 100644 index 90035222f..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/nhwc/max_pool.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import torch -from torch.nn.modules.pooling import MaxPool2d - -import SSD._C as C - -class max_pool_NHWC_impl(torch.autograd.Function): - @staticmethod - def forward(ctx, x, kernel_size, stride, padding, dilation): - ctx.kernel_size = kernel_size - ctx.stride = stride if stride is not None else 0 - ctx.padding = padding - ctx.dilation = dilation - - y = C.max_pool_fwd_nhwc(x, kernel_size, stride, padding, dilation) - - # Need to save y as well - ctx.save_for_backward(x, y) - return y - - @staticmethod - def backward(ctx, y_grad): - x, y = ctx.saved_variables - - kernel = ctx.kernel_size - stride = ctx.stride - padding = ctx.padding - dilation = ctx.dilation - - return C.max_pool_bwd_nhwc(x, - y, - y_grad, - kernel, - stride, - padding, - dilation), None, None, None, None - -class MaxPool2d_NHWC(MaxPool2d): - def __init__(self, kernel_size, stride=None, padding=0): - super(MaxPool2d_NHWC, self).__init__(kernel_size, stride=stride, padding=padding) - - def forward(self, x): - return max_pool_NHWC_impl.apply(x, - self.kernel_size, - self.stride, - self.padding, - self.dilation) diff --git a/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_max_pool.py b/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_max_pool.py deleted file mode 100644 index 3212b76c1..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/nhwc/test_max_pool.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -try: - from .max_pool import * -except: - from max_pool import * - -torch.backends.cudnn.benchmark=True - -N = 64 -C = 256 -H = 56 -W = 56 - -kernel = 3 -stride = 1 -padding = 1 - -H_out = 1 + (H + 2*padding - kernel) // stride -W_out = 1 + (W + 2*padding - kernel) // stride - -pool = MaxPool2d_NHWC(kernel_size=kernel, stride=stride, padding=padding) - -# Make a NCHW copy of everything -pool_nchw = torch.nn.MaxPool2d(kernel_size=kernel, stride=stride, padding=padding).cuda() - -x = torch.randn(N, C, H, W).cuda().half() - -# Copy input tensor -x_copy = x.clone() -x_copy.requires_grad = True - -# Transpose -> NHWC -x = x.permute(0,2,3,1).contiguous() -x.requires_grad = True - -g0 = torch.randn(N, H_out, W_out, C).cuda().half() -g0_nchw = g0.clone().permute(0,3,1,2).contiguous() - -out = pool(x) -#out = out.relu_() -out.backward(g0) - -out_nchw = pool_nchw(x_copy) -#out_nchw = out_nchw.relu_() - -out_nchw.backward(g0_nchw) - -out_nhwc = out.permute(0,3,1,2) -#print(out_nhwc) -#print(out_nchw) - -print(torch.allclose(out_nhwc, out_nchw, atol=1e-5, rtol=1e-3)) -print(torch.allclose(x.grad.permute(0,3,1,2), x_copy.grad, atol=1e-5, rtol=1e-3)) - diff --git a/cv/detection/ssd/pytorch/nvidia/config/resnet.py b/cv/detection/ssd/pytorch/nvidia/config/resnet.py deleted file mode 100644 index f81f344c4..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/resnet.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
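
test_max_pool.py above computes the expected output size with the standard pooling formula. A quick numeric check against torch for the same kernel, stride, and padding (same-size pooling at H = W = 56):

import torch
import torch.nn.functional as F

H, kernel, stride, padding = 56, 3, 1, 1
H_out = 1 + (H + 2 * padding - kernel) // stride
print(H_out)   # 56

x = torch.randn(1, 1, 56, 56)
print(F.max_pool2d(x, kernel, stride, padding).shape)   # torch.Size([1, 1, 56, 56])
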
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch # for torch.cat and torch.zeros -import torch.nn as nn -import torch.utils.model_zoo as model_zoo - -# Group batch norm -from apex.parallel import SyncBatchNorm as gbn -# Persistent group BN for NHWC case -import apex.parallel -# from apex.contrib.groupbn.batch_norm import BatchNorm2d_NHWC as gbn_persistent - - -try: - from .nhwc.batch_norm import BatchNorm2d_NHWC as gbn_persistent - from .nhwc.conv import Conv2d_NHWC - from .nhwc.max_pool import MaxPool2d_NHWC -except: - from nhwc.batch_norm import BatchNorm2d_NHWC as gbn_persistent - from nhwc.conv import Conv2d_NHWC - from nhwc.max_pool import MaxPool2d_NHWC - - -__all__ = ['resnet'] - -model_urls = { - 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', -} - - -class Layers_NCHW: - Conv2d = nn.Conv2d - MaxPool = nn.MaxPool2d - BnAddRelu = None # will be assigned at construction - - def __init__(self, bn_group, **kwargs): - super(Layers_NCHW, self).__init__() - self.nhwc = False - self.bn_group = bn_group - - if (bn_group > 1): - bn_base = gbn - else: - bn_base = nn.BatchNorm2d - - class BnAddRelu_(bn_base): - def __init__(self, planes, fuse_relu=False, bn_group=1): - if (bn_group > 1): - super(BnAddRelu_, self).__init__( - planes, - process_group=apex.parallel.create_syncbn_process_group(bn_group)) - else: - super(BnAddRelu_, self).__init__(planes) - - self.fuse_relu_flag = fuse_relu - - def forward(self, x, z=None): - out = super().forward(x) - if z is not None: - out = out.add_(z) - if self.fuse_relu_flag: - out = out.relu_() - return out - - # this is still Layers_NCHW::__init__ - self.BnAddRelu = BnAddRelu_ - - def build_bn(self, planes, fuse_relu=False): - return self.BnAddRelu(planes, fuse_relu, self.bn_group) - - -class Layers_NHWC: - Conv2d = Conv2d_NHWC - MaxPool = MaxPool2d_NHWC - - class BnAddRelu(gbn_persistent): - def __init__(self, planes, fuse_relu=False, bn_group=1): - super(Layers_NHWC.BnAddRelu, self).__init__(planes, - fuse_relu) - - def __init__(self, bn_group, **kwargs): - super(Layers_NHWC, self).__init__() - self.nhwc = True - self.bn_group = bn_group - - def build_bn(self, planes, fuse_relu): - return self.BnAddRelu(planes, fuse_relu, self.bn_group) - - - -def conv1x1(layer_types, in_planes, out_planes, stride=1): - """1x1 convolution""" - return layer_types.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, - bias=False) - -def conv3x3(layer_types, in_planes, out_planes, stride=1): - """3x3 convolution with padding""" - return layer_types.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, - padding=1, bias=False) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, layerImpls, inplanes, planes, stride=1, downsample=None): - super(BasicBlock, self).__init__() - self.conv1 = conv3x3(layerImpls, inplanes, planes, stride=stride) - self.bn1 = layerImpls.build_bn(planes, fuse_relu=True) - self.conv2 = conv3x3(layerImpls, planes, planes) - self.bn2 = layerImpls.build_bn(planes, fuse_relu=True) - 
self.downsample = downsample - self.stride = stride - - def forward(self, x): - residual = x - if self.downsample is not None: - residual = self.downsample(x) - - out = self.conv1(x) - out = self.bn1(out) - - out = self.conv2(out) - out = self.bn2(out, residual) - - return out - -class ResNet(nn.Module): - - def __init__(self, layerImpls, block, layers, num_classes=1000, - pad_input=False, ssd_mods=False, use_nhwc=False, - bn_group=1): - self.inplanes = 64 - super(ResNet, self).__init__() - if pad_input: - input_channels = 4 - else: - input_channels = 3 - self.conv1 = layerImpls.Conv2d(input_channels, 64, kernel_size=7, stride=2, - padding=3, bias=False) - self.bn1 = layerImpls.build_bn(64, fuse_relu=True) - self.maxpool = layerImpls.MaxPool(kernel_size=3, stride=2, padding=1) - - # Add conv{2,3,4} - self.layer1 = self._make_layer(layerImpls, block, 64, layers[0]) - self.layer2 = self._make_layer(layerImpls, block, 128, layers[1], stride=2) - self.layer3 = self._make_layer(layerImpls, block, 256, layers[2], stride=1) - - # FIXME! This (a) fails for nhwc, and (b) is irrelevant if the user is - # also loading pretrained data (which we don't know about here, but - # know about in the caller (the "resnet()" function below). - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') - elif isinstance(m, nn.BatchNorm2d): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - def _make_layer(self, layerImpls, block, planes, blocks, stride=1): - downsample = None - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - layerImpls.Conv2d(self.inplanes, planes * block.expansion, - kernel_size=1, stride=stride, bias=False), - layerImpls.build_bn(planes * block.expansion, fuse_relu=False), - ) - - layers = [] - layers.append(block(layerImpls, self.inplanes, planes, stride, downsample)) - self.inplanes = planes * block.expansion - for i in range(1, blocks): - layers.append(block(layerImpls, self.inplanes, planes)) - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.classifier(x) - - return x - -def _transpose_state(state, pad_input=False): - for k in state.keys(): - if len(state[k].shape) == 4: - if pad_input and "conv1.weight" in k and not 'layer' in k: - s = state[k].shape - state[k] = torch.cat([state[k], torch.zeros([s[0], 1, s[2], s[3]])], dim=1) - state[k] = state[k].permute(0, 2, 3, 1).contiguous() - return state - -def resnet34(pretrained=False, nhwc=False, ssd_mods=False, **kwargs): - """Constructs a ResNet model. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - """ - if nhwc: - layerImpls = Layers_NHWC(**kwargs) - else: - layerImpls = Layers_NCHW(**kwargs) - - block = BasicBlock - layer_list = [3, 4, 6, 3] - model = ResNet(layerImpls, block, layer_list, ssd_mods=ssd_mods, use_nhwc=nhwc, **kwargs) - - if pretrained: - orig_state_dict = model_zoo.load_url(model_urls['resnet34']) - - # Modify the state dict to remove conv5 / layer4 - state_dict = {k:orig_state_dict[k] for k in orig_state_dict if (not k.startswith('layer4') and not k.startswith('fc'))} - - pad_input = kwargs.get('pad_input', False) - if nhwc: - state_dict = _transpose_state(state_dict, pad_input) - - model.load_state_dict(state_dict) - return nn.Sequential(model.conv1, model.bn1, model.maxpool, model.layer1, model.layer2, model.layer3) diff --git a/cv/detection/ssd/pytorch/nvidia/config/training_event.py b/cv/detection/ssd/pytorch/nvidia/config/training_event.py deleted file mode 100644 index 3fdb8fba4..000000000 --- a/cv/detection/ssd/pytorch/nvidia/config/training_event.py +++ /dev/null @@ -1,103 +0,0 @@ -import os -from typing import Tuple - -import torch -import apex -from torch.cuda.amp import GradScaler -from torch.optim import Optimizer - -from train.event.base import BaseTrainingEventInterface -from train.event.base import BatchType, SSD_MODEL -from train.training_state import TrainingState - -from converter import convert_model - - -class ApexTrainingEvent(BaseTrainingEventInterface): - - def __init__(self, config): - super(ApexTrainingEvent, self).__init__(config) - self.model = None - self.optimizer = None - self.overflow_buf = None - - def save_checkpoint(self, path: str, training_state: TrainingState): - torch.save({ - "model": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - "amp": apex.amp.state_dict(), - "master params": list(apex.amp.master_params(self.optimizer)), - "epoch": training_state.epoch, - "iter_num": training_state.iter_num, - }, "{}/epoch{}_{}.pt".format(path, training_state.epoch, round(training_state.eval_ap, 5))) - - def load_checkpoint(self, checkpoint): - self.model.load_state_dict(checkpoint["model"], strict=True) - self.optimizer.load_state_dict(checkpoint["optimizer"]) - self.config.iteration = checkpoint["iter_num"] - self.config.epoch = checkpoint["epoch"] - if checkpoint.get("amp", None): - apex.amp.load_state_dict(checkpoint["amp"]) - if checkpoint.get("master params", None): - for param, saved_param in zip(apex.amp.master_params(self.optimizer), checkpoint["master params"]): - param.data.copy_(saved_param.data) - - def on_init_start(self): - pass - - def convert_model(self, model: SSD_MODEL) -> SSD_MODEL: - self.model = convert_model(model, self.config) - return self.model - - def create_optimizer(self, model: SSD_MODEL) -> Optimizer: - config = self.config - base_lr = 2.5e-3 - requested_lr_multiplier = config.learning_rate / base_lr - adjusted_multiplier = max(1, round(requested_lr_multiplier * config.train_batch_size * config.n_gpu / 32)) - - current_lr = base_lr * adjusted_multiplier - current_weight_decay = config.weight_decay_rate - - self.optimizer = apex.optimizers.FusedSGD(model.parameters(), - lr=current_lr, - momentum=0.9, - weight_decay=current_weight_decay) - return self.optimizer - - def model_to_fp16(self, model: SSD_MODEL, optimizer: Optimizer) -> Tuple[SSD_MODEL, Optimizer]: - self.model, self.optimizer = apex.amp.initialize(model, optimizer, opt_level="O{}".format(self.config.opt_level), loss_scale=128.) 
- return self.model, self.optimizer - - def model_to_ddp(self, model: SSD_MODEL) -> SSD_MODEL: - config = self.config - if config.distributed: - if config.delay_allreduce: - print(config.local_rank, "Delaying allreduces to the end of backward()") - self.model = apex.parallel.DistributedDataParallel(model, - gradient_predivide_factor=config.n_gpu / 8.0, - delay_allreduce=config.delay_allreduce, - retain_allreduce_buffers=config.fp16) - else: - self.model = model - return self.model - - def on_step_begin(self, step: int): - pass - - def on_step_end(self, step: int): - pass - - def on_backward(self, step: int, loss: torch.Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): - with apex.amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - update_step = step % self.config.gradient_accumulation_steps == 0 - if update_step: - self.update_model_params(optimizer, grad_scaler) - - def update_model_params(self, optimizer: Optimizer, grad_scaler: GradScaler=None): - optimizer.step() - for param in self.model.parameters(): - param.grad = None - - - diff --git a/cv/detection/ssd/pytorch/nvidia/reset.sh b/cv/detection/ssd/pytorch/nvidia/reset.sh deleted file mode 100644 index a11f3253f..000000000 --- a/cv/detection/ssd/pytorch/nvidia/reset.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# clean cache for host memory -echo 3 > /proc/sys/vm/drop_caches - -# reset GPU -nvidia-smi -r - - diff --git a/cv/detection/ssd/pytorch/nvidia/setup.py b/cv/detection/ssd/pytorch/nvidia/setup.py deleted file mode 100644 index a802a5560..000000000 --- a/cv/detection/ssd/pytorch/nvidia/setup.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import glob -import os - -import torch -from torch.utils.cpp_extension import CUDA_HOME -from torch.utils.cpp_extension import CppExtension -from torch.utils.cpp_extension import CUDAExtension - -from setuptools import find_packages -from setuptools import setup - -requirements = ["torch", "torchvision"] - - -def get_extensions(): - this_dir = os.path.dirname(os.path.abspath(__file__)) - extensions_dir = os.path.join(this_dir, "csrc") - - source_cpu = glob.glob(os.path.join(extensions_dir, "*.cpp")) - source_nhwc = glob.glob(os.path.join(extensions_dir, "nhwc", "*.cpp")) - source_nhwc += glob.glob(os.path.join(extensions_dir, "nhwc", "*.cu")) - source_cuda = glob.glob(os.path.join(extensions_dir, "*.cu")) - - print('c++: ', source_cpu) - print('NHWC: ', source_nhwc) - print('cuda: ', source_cuda) - sources = source_cpu + source_nhwc - extension = CppExtension - - define_macros = [] - - if CUDA_HOME is not None: - extension = CUDAExtension - sources += source_cuda - define_macros += [("WITH_CUDA", None)] - - sources = [os.path.join(extensions_dir, s) for s in sources] - - include_dirs = [extensions_dir] - extra_compile_flags= {'cxx' : []} - extra_compile_flags['nvcc'] = ['-DCUDA_HAS_FP16=1','-D__CUDA_NO_HALF_OPERATORS__','-D__CUDA_NO_HALF_CONVERSIONS__','-D__CUDA_NO_HALF2_OPERATORS__'] - - gencodes = [ - #'-gencode', 'arch=compute_50,code=sm_50', - #'-gencode', 'arch=compute_52,code=sm_52', - #'-gencode', 'arch=compute_60,code=sm_60', - #'-gencode', 'arch=compute_61,code=sm_61', - '-gencode', 'arch=compute_70,code=sm_70', - '-gencode', 'arch=compute_70,code=compute_70',] - - extra_compile_flags['nvcc'] += gencodes - - extra_compile_flags= {'cxx' : []} - extra_compile_flags['nvcc'] = ['-DCUDA_HAS_FP16=1','-D__CUDA_NO_HALF_OPERATORS__','-D__CUDA_NO_HALF_CONVERSIONS__','-D__CUDA_NO_HALF2_OPERATORS__'] - - gencodes = [ - #'-gencode', 'arch=compute_50,code=sm_50', - #'-gencode', 'arch=compute_52,code=sm_52', - #'-gencode', 'arch=compute_60,code=sm_60', - #'-gencode', 'arch=compute_61,code=sm_61', - '-gencode', 'arch=compute_70,code=sm_70', - '-gencode', 'arch=compute_70,code=compute_70',] - - extra_compile_flags['nvcc'] += gencodes - - ext_modules = [ - extension( - "SSD._C", - sources, - include_dirs=include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_flags, - ) - ] - - return ext_modules - - -setup( - name="SSD", - version="0.1", - author="slayton", - url="", - description="SSD in pytorch", - packages=find_packages(exclude=("configs", "examples", "test",)), - # install_requires=requirements, - ext_modules=get_extensions(), - cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, -) diff --git a/cv/detection/ssd/pytorch/base/model/losses/opt_loss.py b/cv/detection/ssd/pytorch/opt_loss.py similarity index 77% rename from cv/detection/ssd/pytorch/base/model/losses/opt_loss.py rename to cv/detection/ssd/pytorch/opt_loss.py index 05eaeca33..b482811bc 100644 --- a/cv/detection/ssd/pytorch/base/model/losses/opt_loss.py +++ b/cv/detection/ssd/pytorch/opt_loss.py @@ -1,3 +1,17 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import torch class OptLoss(torch.jit.ScriptModule): diff --git a/cv/detection/ssd/pytorch/parse_config.py b/cv/detection/ssd/pytorch/parse_config.py new file mode 100644 index 000000000..146888e5f --- /dev/null +++ b/cv/detection/ssd/pytorch/parse_config.py @@ -0,0 +1,172 @@ +import torch +import os # for getenv() + +from argparse import ArgumentParser + +import random + +# adds mutually exclusive "--name" and "--no-name" command line arguments, with +# the result stored in a variable named "name" (with any dashes in "name" +# replaced by underscores) +# inspired by https://stackoverflow.com/a/31347222/2209313 +def add_bool_arg(group, name, default=False, help=''): + subgroup = group.add_mutually_exclusive_group(required=False) + name_with_underscore = name.replace('-', '_').replace(' ', '_') + + truehelp = help + falsehelp = help + if help != '': + falsehelp = 'do not ' + falsehelp + if default is True: + if truehelp != '': + truehelp = truehelp + ' ' + truehelp = truehelp + '(default)' + else: + if falsehelp != '': + falsehelp = falsehelp + ' ' + falsehelp = falsehelp + '(default)' + + subgroup.add_argument('--' + name, dest=name_with_underscore, action='store_true', help=truehelp) + subgroup.add_argument('--no-' + name, dest=name_with_underscore, action='store_false', help=falsehelp) + group.set_defaults(**{name_with_underscore:default}) + +def parse_args(): + parser = ArgumentParser(description="Train Single Shot MultiBox Detector" + " on COCO") + + data_group = parser.add_argument_group('data', 'data-related options') + # Data-related + data_group.add_argument('--data', '-d', type=str, default='/coco', + help='path to test and training data files') + data_group.add_argument('--meta_files_path', type=str, default=None, + help='path to COCO meta files') + data_group.add_argument('--batch-size', '-b', type=int, default=32, + help='number of examples for each iteration') + data_group.add_argument('--eval-batch-size', type=int, default=32, + help='number of examples for each evaluation iteration') + # input pipeline stuff + add_bool_arg(data_group, 'dali', default=True) # --dali (default) and --no-dali + data_group.add_argument('--fake-input', action='store_true', + help='run input pipeline with fake data (avoid all i/o and work except on very first call)') + data_group.add_argument('--input-batch-multiplier', type=int, default=1, + help='run input pipeline at batch size times larger than that given in --batch-size') + data_group.add_argument('--dali-sync', action='store_true', + help='run dali in synchronous mode instead of the (default) asynchronous') + data_group.add_argument('--dali-cache', type=int, default=-1, + help="cache size (in GB) for Dali's nvjpeg caching") + data_group.add_argument('--use-nvjpeg', action='store_true') + data_group.add_argument('--use-roi-decode', action='store_true', + help="DEPRECATED: Dali input pipeline uses roi decode if and only if --dali-cache is not set" ) + + # model-related + model_group = parser.add_argument_group('model', 'Model-related options') + model_group.add_argument('--model-path', type=str, default='./vgg16n.pth') + 
model_group.add_argument('--backbone', type=str, choices=['vgg16', 'vgg16bn', 'resnet18', 'resnet34', 'resnet50'], default='resnet34') + model_group.add_argument('--num-workers', type=int, default=4) + model_group.add_argument('--use-fp16', action='store_true') + model_group.add_argument('--print-interval', type=int, default=20) + model_group.add_argument('--jit', action='store_true') + model_group.add_argument('--nhwc', action='store_true') + model_group.add_argument('--pad-input', action='store_true') + model_group.add_argument('--num-classes', type=int, default=81) + model_group.add_argument('--input-size', type=int, default=300) + model_group.add_argument('--verify-checkpoint', action='store_true') + + # Solver-related + solver_group = parser.add_argument_group('solver', 'Solver-related options') + solver_group.add_argument('--epochs', '-e', type=int, default=800, + help='number of epochs for training') + add_bool_arg(solver_group, 'allreduce-running-stats', default=True, + help='allreduce batch norm running stats before evaluation') + solver_group.add_argument('--seed', '-s', type=int, default=random.SystemRandom().randint(0, 2**32 - 1), + help='manually set random seed for torch') + solver_group.add_argument('--threshold', '-t', type=float, default=0.212, + help='stop training early at threshold') + solver_group.add_argument('--iteration', type=int, default=0, + help='iteration to start from') + solver_group.add_argument('--checkpoint', type=str, default=None, + help='path to model checkpoint file') + add_bool_arg(solver_group, 'save', default=True, + help='save model checkpoints') + solver_group.add_argument('--evaluation', nargs='*', type=int, + default=[5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80], + help='epochs at which to evaluate') + solver_group.add_argument('--warmup', type=int, default=None) + solver_group.add_argument('--warmup-factor', type=int, default=1, + help='mlperf rule parameter for controlling warmup curve') + solver_group.add_argument('--lr', type=float, default=2.68e-3) + solver_group.add_argument('--wd', type=float, default=5e-4) + solver_group.add_argument('--lr-decay-factor', type=float, default=0.1, + help='decay rate of learning rate. default is 0.1.') + parser.add_argument('--lr-decay-epochs', type=int, nargs='+', default=[44,55], + help='epochs at which learning rate decays. 
default is 44,55.') + solver_group.add_argument('--delay-allreduce', action='store_true') + solver_group.add_argument('--opt-loss', action='store_true', help='deprecated option, does nothing (loss is always optimized)') + solver_group.add_argument('--bn-group', type=int, default=1, choices=[1, 2, 4, 8], help='Group of processes to collaborate on BatchNorm ops') + + # Profiling + profiling_group = parser.add_argument_group('profiling', 'Profiling options') + profiling_group.add_argument('--profile', type=int, default=None, + help='iteration at which to early terminate') + profiling_group.add_argument('--profile-start', type=int, default=None, + help='iteration at which to turn on cuda and/or pytorch nvtx profiling') + profiling_group.add_argument('--profile-nvtx', action='store_true', + help='turn on pytorch nvtx annotations in addition to cuda profiling') + profiling_group.add_argument('--profile-gc-off', action='store_true', + help='call gc.disable() (useful for eliminating gc noise while profiling)') + profiling_group.add_argument('--profile-cudnn-get', action='store_true', + help='use cudnnGet() rather than cudnnFind() to eliminate a possible source of perf non-determinism') + profiling_group.add_argument('--profile-fake-optim', action='store_true', + help='turn off optimizer to get more accurate timing of the rest of the training pipe') + + # Distributed stuff + parser.add_argument('--local_rank', '--local-rank', default=os.getenv('LOCAL_RANK',0), type=int, + help='Used for multi-process training. Can either be manually set ' + + 'or automatically set by using \'python -m multiproc\'.') + parser.add_argument('--backend', default='nccl', type=str, help='choose the distributed backend (nccl or gloo)') + solver_group.add_argument('--opt-level', type=str, default="O0", choices=["O0", "O1", "O2"], help='control training mode (FP32 or FP16) by opt_level') + + return parser.parse_args() + +# make sure that arguments are all self-consistent +def validate_arguments(args): + # nhwc can only be used with fp16 + if args.nhwc: + assert(args.use_fp16) + + # input padding can only be used with NHWC + if args.pad_input: + assert(args.nhwc) + + # no dali can only be used with NCHW and no padding + if not args.dali: + assert(not args.nhwc) + assert(not args.pad_input) + assert(not args.use_nvjpeg) + assert(not args.dali_cache) + assert(not args.use_roi_decode) + + if args.use_roi_decode: + assert(args.dali_cache<=0) # roi decode also crops every epoch, so can't cache + + if args.dali_cache>0: + assert(args.use_nvjpeg) + + if args.jit: + assert(args.nhwc) #jit cannot be applied with apex::syncbn used for non-nhwc + + return + +# Check that the run is valid for specified group BN arg +def validate_group_bn(bn_groups): + if torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + + # Can't have larger group than ranks + assert(bn_groups <= world_size) + + # must have only complete groups + assert(world_size % bn_groups == 0) + diff --git a/cv/detection/ssd/pytorch/base/data_preprocessing/prepare_json.py b/cv/detection/ssd/pytorch/prepare-json.py similarity index 100% rename from cv/detection/ssd/pytorch/base/data_preprocessing/prepare_json.py rename to cv/detection/ssd/pytorch/prepare-json.py diff --git a/cv/detection/ssd/pytorch/base/requirements.txt b/cv/detection/ssd/pytorch/requirements.txt similarity index 35% rename from cv/detection/ssd/pytorch/base/requirements.txt rename to cv/detection/ssd/pytorch/requirements.txt index 37473d262..dd9e53d11 
100644 --- a/cv/detection/ssd/pytorch/base/requirements.txt +++ b/cv/detection/ssd/pytorch/requirements.txt @@ -1,4 +1,2 @@ -# progress bars in model download and training scripts -pycocotools==2.0.2 ujson==1.35 - +pycocotools==2.0.2 diff --git a/cv/detection/ssd/pytorch/iluvatar/config/resnet.py b/cv/detection/ssd/pytorch/resnet.py similarity index 88% rename from cv/detection/ssd/pytorch/iluvatar/config/resnet.py rename to cv/detection/ssd/pytorch/resnet.py index 1c77b8a0c..739b95759 100644 --- a/cv/detection/ssd/pytorch/iluvatar/config/resnet.py +++ b/cv/detection/ssd/pytorch/resnet.py @@ -16,28 +16,50 @@ import torch # for torch.cat and torch.zeros import torch.nn as nn import torch.utils.model_zoo as model_zoo +from nhwc.conv import Conv2d_NHWC +from nhwc.batch_norm import BatchNorm2d_NHWC as gbn_persistent +from nhwc.max_pool import MaxPool2d_NHWC + # Group batch norm from apex.parallel import SyncBatchNorm as gbn # Persistent group BN for NHWC case from apex.contrib.groupbn.batch_norm import BatchNorm2d_NHWC import apex.parallel -try: - from .nhwc.conv import Conv2d_NHWC - from .nhwc.batch_norm import BatchNorm2d_NHWC as gbn_persistent - from .nhwc.max_pool import MaxPool2d_NHWC -except: - from nhwc.conv import Conv2d_NHWC - from nhwc.batch_norm import BatchNorm2d_NHWC as gbn_persistent - from nhwc.max_pool import MaxPool2d_NHWC +import subprocess +import os __all__ = ['resnet'] model_urls = { + 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', + 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', + 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', + 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', } +def download_with_wget(url, save_path): + filename = os.path.basename(url) + print(f"downloading file: {filename}, save to: {save_path}") + if os.path.exists(save_path+filename): + print(f"file {save_path+filename} already exists") + return + + os.makedirs(save_path, exist_ok=True) + cmd = "" + cmd = [ + 'wget', + '-q', + '--show-progress', + '-O', save_path+filename, + url + ] + + # Run the command and wait for it to complete + subprocess.check_call(cmd) + class Layers_NCHW: Conv2d = nn.Conv2d MaxPool = nn.MaxPool2d @@ -221,6 +243,7 @@ def resnet34(pretrained=False, nhwc=False, ssd_mods=False, **kwargs): model = ResNet(layerImpls, block, layer_list, ssd_mods=ssd_mods, use_nhwc=nhwc, **kwargs) if pretrained: + download_with_wget(model_urls['resnet34'], '/root/.cache/torch/hub/checkpoints/') orig_state_dict = model_zoo.load_url(model_urls['resnet34']) # Modify the state dict to remove conv5 / layer4 diff --git a/cv/detection/ssd/pytorch/run.sub b/cv/detection/ssd/pytorch/run.sub new file mode 100644 index 000000000..bd97fee5e --- /dev/null +++ b/cv/detection/ssd/pytorch/run.sub @@ -0,0 +1,62 @@ +#!/bin/bash +#SBATCH --job-name single_stage_detector +set -euxo pipefail + +# Vars without defaults +: "${DGXSYSTEM:?DGXSYSTEM not set}" +: "${CONT:?CONT not set}" + +# Vars with defaults +: "${NEXP:=5}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${CLEAR_CACHES:=1}" +: "${DATADIR:=/raid/datasets/coco/coco-2017}" +: "${LOGDIR:=./results}" + +# Other vars +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +readonly _cont_name=single_stage_detector +_cont_mounts="${DATADIR}:/data,${LOGDIR}:/results" + +# MLPerf vars +MLPERF_HOST_OS=$(srun -N1 -n1 bash < ${LOG_DIR}/ssd_${#ADDR_ARRAY[@]}_machine_8card_${FORMAT,,}_batch_${BATCH_SIZE}_${BACKEND,,}_fps.log 2>&1 + done + 
fi + done + +} + + +function exec_ssh_by_master() { +# only at master host, start all other non master hosts run +if [ "$HOST_IP" == "${ADDR_ARRAY[0]}" ] +then + for i in "${!ADDR_ARRAY[@]}" + do + if [ "$i" != "0" ] + then + scp ${CUR_SCR} ${ADDR_ARRAY[$i]}:${CUR_DIR} + init_docker_container_by_non_master ${ADDR_ARRAY[$i]} + ssh ${ADDR_ARRAY[$i]} "docker exec -i ${CONTAINER_NAME} bash -c \"cd ${CUR_DIR}; bash ${CUR_SCR} \"" & + fi + done +fi + +} + + +function init_docker_container_by_non_master +{ + ADDR=$1 + matched_containers=`ssh ${ADDR} "docker ps -a"|grep "${CONTAINER_NAME}$"|wc -l` + if [ ${matched_containers} -gt "0" ] + then + echo "Warning: Found container ${CONTAINER_NAME} exists! Will delete it." + ssh ${ADDR} "docker stop ${CONTAINER_NAME}; docker rm ${CONTAINER_NAME}" + fi + ssh ${ADDR} "docker run -itd --name ${CONTAINER_NAME} ${CONTAINER_INIT_OPT} ${IMAGE} /bin/bash" + if [ "$?" != "0" ] + then + echo "Error: Init container ${CONTAINER_NAME} at ${ADDR} failed!" + exit -1 + fi +} + + + +function run_multi_machine_8card_end2end() +{ + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export GLOO_SOCKET_IFNAME="ib0" + FORMAT=$1 + BACKEND=$2 + BATCH_SIZE=$3 + num_epochs=90 + if [ "${FORMAT}" = "nchw" ]; then + args="" + else + args=${NHWC_PARAMS} + fi + + # do actual run when IP matched + for i in "${!ADDR_ARRAY[@]}" + do + if [ "$HOST_IP" == "${ADDR_ARRAY[$i]}" ] + then + ../../../../tools/reset.sh + echo "nodes: ${#ADDR_ARRAY[@]}, rank: $i, IP: $HOST_IP, MASTER_IP: ${ADDR_ARRAY[0]}" + python3 -u -m bind_launch --nnodes ${#ADDR_ARRAY[@]} --node_rank $i --nproc_per_node 8 --master_addr ${ADDR_ARRAY[0]} --nsockets_per_node 2 --ncores_per_socket ${CORES_PER_SOCKET} --no_membind \ + ./train.py --dali --data=${DATASET_DIR} --batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.68e-3 --threshold=0.23 --no-save --epochs ${num_epochs} --eval-batch-size=160 --wd=1.6e-4 --use-fp16 --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level "O2" ${args} --backend ${BACKEND,,} > ${LOG_DIR}/ssd_${#ADDR_ARRAY[@]}_machine_8card_${FORMAT,,}_batch_${BATCH_SIZE}_${BACKEND,,}_fps.log 2>&1 + + fi + + done +} + +date +%m%d%H%M%S >> ${LOG_DIR}/time.log +exec_ssh_by_master +#run_multi_machine_8card_end2end nhwc gloo 128 +run_multi_machine_8card_end2end nhwc nccl 112 +#run_multi_machine_8card_end2end nhwc gloo 96 +#run_multi_machine_8card_end2end nhwc gloo 80 +#run_multi_machine_8card_end2end nhwc gloo 64 +#run_multi_machine_8card_end2end nhwc gloo 56 +date +%m%d%H%M%S >> ${LOG_DIR}/time.log diff --git a/cv/detection/ssd/pytorch/run_with_docker.sh b/cv/detection/ssd/pytorch/run_with_docker.sh new file mode 100644 index 000000000..c30e5051f --- /dev/null +++ b/cv/detection/ssd/pytorch/run_with_docker.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -euxo pipefail + +# Vars without defaults +: "${DGXSYSTEM:?DGXSYSTEM not set}" +: "${CONT:?CONT not set}" + +# Vars with defaults +: "${NEXP:=5}" +: "${DATESTAMP:=$(date +'%y%m%d%H%M%S%N')}" +: "${CLEAR_CACHES:=1}" +: "${DATADIR:=/raid/datasets/coco/coco-2017}" +: "${LOGDIR:=$(pwd)/results}" + +# Other vars +readonly _config_file="./config_${DGXSYSTEM}.sh" +readonly _logfile_base="${LOGDIR}/${DATESTAMP}" +readonly _cont_name=single_stage_detector +_cont_mounts=("--volume=${DATADIR}:/data" "--volume=${LOGDIR}:/results") + +# Setup directories +mkdir -p "${LOGDIR}" + +# Get list of envvars to pass to docker +source "${_config_file}" +mapfile -t _config_env < <(env -i bash -c ". 
${_config_file} && compgen -e" | grep -E -v '^(PWD|SHLVL)') +_config_env+=(MLPERF_HOST_OS) +mapfile -t _config_env < <(for v in "${_config_env[@]}"; do echo "--env=$v"; done) + +# Cleanup container +cleanup_docker() { + docker container rm -f "${_cont_name}" || true +} +cleanup_docker +trap 'set -eux; cleanup_docker' EXIT + +# Setup container +nvidia-docker run --rm --init --detach \ + --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \ + --ulimit=stack=67108864 --ulimit=memlock=-1 \ + --name="${_cont_name}" "${_cont_mounts[@]}" \ + "${CONT}" sleep infinity +docker exec -it "${_cont_name}" true + +# Run experiments +for _experiment_index in $(seq 1 "${NEXP}"); do + ( + echo "Beginning trial ${_experiment_index} of ${NEXP}" + + # Print system info + docker exec -it "${_cont_name}" python -c " +import mlperf_log_utils +from mlperf_logging.mllog import constants +mlperf_log_utils.mlperf_submission_log(constants.SSD)" + + # Clear caches + if [ "${CLEAR_CACHES}" -eq 1 ]; then + sync && sudo /sbin/sysctl vm.drop_caches=3 + docker exec -it "${_cont_name}" python -c " +from mlperf_logging.mllog import constants +from mlperf_logger import log_event +log_event(key=constants.CACHE_CLEAR, value=True)" + fi + + # Run experiment + docker exec -it "${_config_env[@]}" "${_cont_name}" ./run_and_time.sh + ) |& tee "${_logfile_base}_${_experiment_index}.log" +done diff --git a/cv/detection/ssd/pytorch/iluvatar/setup.py b/cv/detection/ssd/pytorch/setup.py similarity index 100% rename from cv/detection/ssd/pytorch/iluvatar/setup.py rename to cv/detection/ssd/pytorch/setup.py diff --git a/cv/detection/ssd/pytorch/nvidia/config/ssd300.py b/cv/detection/ssd/pytorch/ssd300.py similarity index 98% rename from cv/detection/ssd/pytorch/nvidia/config/ssd300.py rename to cv/detection/ssd/pytorch/ssd300.py index 6ad66af9f..a97647c8d 100644 --- a/cv/detection/ssd/pytorch/nvidia/config/ssd300.py +++ b/cv/detection/ssd/pytorch/ssd300.py @@ -15,14 +15,9 @@ import torch import torch.nn as nn # from base_model import L2Norm, ResNet +from resnet import ResNet, resnet34 -try: - from .resnet import resnet34 - from .nhwc.conv import Conv2d_NHWC -except: - from resnet import resnet34 - from nhwc.conv import Conv2d_NHWC - +from nhwc.conv import Conv2d_NHWC class SSD300(nn.Module): """ diff --git a/cv/detection/ssd/pytorch/test.py b/cv/detection/ssd/pytorch/test.py new file mode 100644 index 000000000..183c43dfe --- /dev/null +++ b/cv/detection/ssd/pytorch/test.py @@ -0,0 +1,204 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from argparse import ArgumentParser +from utils import DefaultBoxes, Encoder, COCODetection +from base_model import Loss +from utils import SSDTransformer +from ssd300 import SSD300 +import torch +from torch.utils.data import DataLoader +import time +import numpy as np +import os + +# necessary pytorch imports +import torch.utils.data.distributed +import torch.distributed as dist +from torch.autograd import Variable + +# Apex imports +try: + from apex.parallel import DistributedDataParallel as DDP + from apex.fp16_utils import * +except ImportError: + raise ImportError("Please install APEX from https://github.com/nvidia/apex") + +# DALI import +from coco_pipeline import COCOPipeline, DALICOCOIterator + +from SSD import _C as C + + +def parse_args(): + parser = ArgumentParser(description="Train Single Shot MultiBox Detector" + " on COCO") + parser.add_argument('--data', '-d', type=str, default='/coco/coco2017', + help='path to test and training data files') + parser.add_argument('--batch-size', '-b', type=int, default=128, + help='number of examples for each iteration') + #parser.add_argument('--checkpoint', type=str, default=None, + # help='path to model checkpoint file', required=True) + parser.add_argument('--backbone', type=str, choices=['vgg16', 'vgg16bn', + 'resnet18', 'resnet34', 'resnet50'], default='resnet34') + parser.add_argument('--num-workers', type=int, default=3) + parser.add_argument('--fbu', type=int, default=1) + parser.add_argument('--use-fp16', action='store_true') + parser.add_argument('--use-train-dataset', action='store_true') + + # Distributed stuff + parser.add_argument('--local_rank', '--local-rank', default=0, type=int, + help='Used for multi-process training. Can either be manually set ' + + 'or automatically set by using \'python -m multiproc\'.') + + return parser.parse_args() + +def dboxes300_coco(): + figsize = 300 + feat_size = [38, 19, 10, 5, 3, 1] + steps = [8, 16, 32, 64, 100, 300] + # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py + scales = [21, 45, 99, 153, 207, 261, 315] + aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] + dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios) + return dboxes + +def test_coco(args): + # For testing purposes we have to use CUDA + use_cuda = True + + # Setup multi-GPU if necessary + args.distributed = False + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) > 1 + + if args.distributed: + torch.cuda.set_device(args.local_rank) + + torch.distributed.init_process_group(backend='nccl', + init_method='env://') + + if args.distributed: + N_gpu = torch.distributed.get_world_size() + else: + N_gpu = 1 + + # Setup data, defaults + dboxes = dboxes300_coco() + encoder = Encoder(dboxes) + + if args.use_train_dataset: + annotate = os.path.join(args.data, "annotations/instances_train2017.json") + coco_root = os.path.join(args.data, "train2017") + img_number = 118287 + else: + annotate = os.path.join(args.data, "annotations/instances_val2017.json") + coco_root = os.path.join(args.data, "val2017") + img_number = 5000 + + pipe = COCOPipeline(args.batch_size, args.local_rank, coco_root, + annotate, N_gpu, num_threads=args.num_workers) + pipe.build() + test_run = pipe.run() + dataloader = DALICOCOIterator(pipe, img_number / N_gpu) + + # Build the model + ssd300 = SSD300(81, backbone=args.backbone, model_path='', dilation=False) + + """ + # Note: args.checkpoint is required, so this can never be false + if args.checkpoint is not 
None: + print("loading model checkpoint", args.checkpoint) + od = torch.load(args.checkpoint) + + # remove proceeding 'module' from checkpoint + model = od["model"] + for k in list(model.keys()): + if k.startswith('module.'): + model[k[7:]] = model.pop(k) + ssd300.load_state_dict(model) + """ + + + ssd300.cuda() + ssd300.eval() + loss_func = Loss(dboxes) + loss_func.cuda() + + # parallelize + if args.distributed: + ssd300 = DDP(ssd300) + + if args.use_fp16: + ssd300 = network_to_half(ssd300) + + if args.use_train_dataset and args.local_rank == 0: + print('Image 000000320612.jpg is in fact PNG and it will cause fail if ' + + 'used with nvJPEGDecoder in coco_pipeline') + + for epoch in range(2): + if epoch == 1 and args.local_rank == 0: + print("Performance computation starts") + s = time.time() + for i, data in enumerate(dataloader): + + with torch.no_grad(): + # Get data from pipeline + img = data[0][0][0] + bbox = data[0][1][0] + label = data[0][2][0] + label = label.type(torch.cuda.LongTensor) + bbox_offsets = data[0][3][0] + bbox_offsets = bbox_offsets.cuda() + + # Encode labels + N = img.shape[0] + if bbox_offsets[-1].item() == 0: + print("No labels in batch") + continue + bbox, label = C.box_encoder(N, bbox, bbox_offsets, label, + encoder.dboxes.cuda(), 0.5) + + # Prepare tensors for computing loss + M = bbox.shape[0] // N + bbox = bbox.view(N, M, 4) + label = label.view(N, M) + trans_bbox = bbox.transpose(1,2).contiguous() + gloc, glabel = Variable(trans_bbox, requires_grad=False), \ + Variable(label, requires_grad=False) + + if args.use_fp16: + img = img.half() + + for _ in range(args.fbu): + ploc, plabel = ssd300(img) + ploc, plabel = ploc.float(), plabel.float() + loss = loss_func(ploc, plabel, gloc, glabel) + + if epoch == 1 and args.local_rank == 0: + e = time.time() + print("Performance achieved: {:.2f} img/sec".format(img_number / (e - s))) + + dataloader.reset() + +def main(): + args = parse_args() + + torch.backends.cudnn.benchmark = True + + test_coco(args) + +if __name__ == "__main__": + main() diff --git a/cv/detection/ssd/pytorch/base/test/box_coder_test.py b/cv/detection/ssd/pytorch/test/box_coder_test.py similarity index 53% rename from cv/detection/ssd/pytorch/base/test/box_coder_test.py rename to cv/detection/ssd/pytorch/test/box_coder_test.py index 1516dbe83..4c3613739 100644 --- a/cv/detection/ssd/pytorch/base/test/box_coder_test.py +++ b/cv/detection/ssd/pytorch/test/box_coder_test.py @@ -1,9 +1,24 @@ -import sys -sys.path.append("..") - -from box_coder import DefaultBoxes, Encoder +import os +from argparse import ArgumentParser +from utils import DefaultBoxes, Encoder, COCODetection, SSDCropping +from PIL import Image +from base_model import Loss +from utils import SSDTransformer +from ssd300 import SSD300 +from sampler import GeneralDistributedSampler +from master_params import create_flat_master import torch +from torch.autograd import Variable +from torch.utils.data import DataLoader +import time +import numpy as np +import io +import random +import torchvision.transforms as transforms + +import sys +from SSD import _C as C def dboxes300_coco(): figsize = 300 @@ -18,8 +33,7 @@ def dboxes300_coco(): if __name__ == "__main__": dboxes = dboxes300_coco() - encoder = Encoder(dboxes, fast_nms=False) - encoder_fast = Encoder(dboxes, fast_nms=True) + encoder = Encoder(dboxes) saved_inputs = torch.load('inputs.pth') @@ -31,9 +45,5 @@ if __name__ == "__main__": print('bboxes: {}, scores: {}'.format(bboxes.shape, scores.shape)) for i in range(bboxes.shape[0]): - box1, 
label1, score1 = encoder.decode_batch(bboxes[i, :, :].unsqueeze(0), scores[i, :, :].unsqueeze(0), criteria, max_num)[0] - - box2, label2, score2 = \ - encoder_fast.decode_batch(bboxes[i, :, :].unsqueeze(0), scores[i, :, :].unsqueeze(0), criteria, max_num)[0] - - print('label: {}, fast label: {}'.format(label1, label2)) + box, label, score = encoder.decode_batch(bboxes[i, :, :].unsqueeze(0), scores[i, :, :].unsqueeze(0), criteria, max_num)[0] + print('r: {}'.format(label)) diff --git a/cv/detection/ssd/pytorch/base/test/cuda_encoder_test.py b/cv/detection/ssd/pytorch/test/cuda_encoder_test.py similarity index 88% rename from cv/detection/ssd/pytorch/base/test/cuda_encoder_test.py rename to cv/detection/ssd/pytorch/test/cuda_encoder_test.py index f4a9d0e24..ba10edf2f 100644 --- a/cv/detection/ssd/pytorch/base/test/cuda_encoder_test.py +++ b/cv/detection/ssd/pytorch/test/cuda_encoder_test.py @@ -1,11 +1,23 @@ -import sys -sys.path.append("..") - +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import torch +from torch.autograd import Variable from SSD import _C as C -from box_coder import dboxes300_coco, Encoder +from train import dboxes300_coco +from utils import Encoder import numpy as np @@ -150,7 +162,7 @@ def test_box_encoder(): N, bboxes_cat, offsets, bboxes = load_bboxes(box_list, True) # N, bboxes_cat, offsets, bboxes = load_bboxes([b1[:2,:], b1[:2,:]]) - # print(N, bboxes_cat, offsets) + print(N, bboxes_cat, offsets) label_numpy = np.random.randn(offsets[-1])*10 labels = torch.tensor(label_numpy.astype(np.int64)).cuda() @@ -163,8 +175,6 @@ def test_box_encoder(): start = time.time() bbox_out, label_out = C.box_encoder(N, bboxes_cat, offsets, labels, dboxes.cuda(), 0.5) - bbox_out = bbox_out.reshape(bbox_out.shape[0] * bbox_out.shape[1], -1) - label_out = label_out.reshape(label_out.shape[0] * label_out.shape[1], -1) torch.cuda.synchronize() end = time.time() @@ -178,7 +188,7 @@ def test_box_encoder(): # reference dboxes = dboxes300_coco() - encoder = Encoder(dboxes, fast_nms=True) + encoder = Encoder(dboxes) labels_ref = torch.tensor(label_numpy.astype(np.int64)) start = time.time() @@ -206,7 +216,7 @@ def test_box_encoder(): for i, res in enumerate(r): if not res.any(): num_fail += 1 - # print(i, res, ref_boxes[i,:], bbox_out[i, :]) + print(i, res, ref_boxes[i,:], bbox_out[i, :]) print('{} bboxes failed'.format(num_fail)) @@ -218,7 +228,7 @@ def test_box_encoder(): for i, res in enumerate(r2): if not res: num_fail += 1 - # print('label: ', i, res, label_out[i], ref_labels.numpy()[i]) + print('label: ', i, res, label_out[i], ref_labels.numpy()[i]) print('{} labels failed'.format(num_fail)) diff --git a/cv/detection/ssd/pytorch/base/test/opt_loss_test.py b/cv/detection/ssd/pytorch/test/opt_loss_test.py similarity index 64% rename from cv/detection/ssd/pytorch/base/test/opt_loss_test.py rename to cv/detection/ssd/pytorch/test/opt_loss_test.py index a422ee358..7411eafee 100644 --- 
a/cv/detection/ssd/pytorch/base/test/opt_loss_test.py +++ b/cv/detection/ssd/pytorch/test/opt_loss_test.py @@ -1,11 +1,8 @@ -import sys -sys.path.append("..") - import torch -from box_coder import dboxes300_coco -from model.losses.opt_loss import OptLoss -# from model.losses.loss import Loss +from base_model import Loss +from train import dboxes300_coco +from opt_loss import OptLoss # In: # ploc : N x 8732 x 4 @@ -21,9 +18,9 @@ glabel = data['glabel'].cuda() dboxes = dboxes300_coco() # loss = Loss(dboxes).cuda() -opt_loss = OptLoss().cuda() +loss = OptLoss(dboxes).cuda() -opt_loss = torch.jit.trace(opt_loss, (ploc, plabel, gloc, glabel)) +loss = torch.jit.trace(loss, (ploc, plabel, gloc, glabel)) # print(traced_loss.graph) # timing @@ -33,22 +30,20 @@ import time # Dry run to eliminate JIT compile overhead dl = torch.tensor([1.], device="cuda") -l1 = opt_loss(ploc, plabel, gloc, glabel) -print("opt loss: {}".format(l1)) -l1.backward() - +l = loss(ploc, plabel, gloc, glabel) +l.backward(dl) # fprop torch.cuda.synchronize() start = time.time() with torch.no_grad(): for _ in range(timing_iterations): - l1 = opt_loss(ploc, plabel, gloc, glabel) + l = loss(ploc, plabel, gloc, glabel) +print('loss: {}'.format(l)) torch.cuda.synchronize() end = time.time() -print('opt loss: {}'.format(l1)) time_per_fprop = (end - start) / timing_iterations print('took {} seconds per iteration (fprop)'.format(time_per_fprop)) @@ -57,15 +52,14 @@ print('took {} seconds per iteration (fprop)'.format(time_per_fprop)) torch.cuda.synchronize() start = time.time() for _ in range(timing_iterations): - l1 = opt_loss(ploc, plabel, gloc, glabel) - l1.backward() + l = loss(ploc, plabel, gloc, glabel) + l.backward(dl) torch.cuda.synchronize() end = time.time() -print('opt loss: {}'.format(l1)) time_per_fprop_bprop = (end - start) / timing_iterations print('took {} seconds per iteration (fprop + bprop)'.format(time_per_fprop_bprop)) -# print(loss.graph_for(ploc, plabel, gloc, glabel)) +print(loss.graph_for(ploc, plabel, gloc, glabel)) diff --git a/cv/detection/ssd/pytorch/train.py b/cv/detection/ssd/pytorch/train.py new file mode 100644 index 000000000..cfb05e108 --- /dev/null +++ b/cv/detection/ssd/pytorch/train.py @@ -0,0 +1,554 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import nvidia.dali +import os +from base_model import Loss +from opt_loss import OptLoss +from mlperf_logger import configure_logger, log_start, log_end, log_event, set_seeds, get_rank, barrier +from mlperf_logging.mllog import constants +import torch +from torch.autograd import Variable +import time +import numpy as np +import io +from bisect import bisect # for lr_scheduler + +from ssd300 import SSD300 +from master_params import create_flat_master +from parse_config import parse_args, validate_arguments, validate_group_bn +from data.build_pipeline import prebuild_pipeline, build_pipeline +from box_coder import dboxes300_coco, build_ssd300_coder +from async_evaluator import AsyncEvaluator +from eval import coco_eval + +import sys +import gc + +# necessary pytorch imports +import torch.utils.data.distributed +import torch.distributed as dist + +# Apex imports +try: + import apex_C + import apex + from apex.parallel.LARC import LARC + from apex.parallel import DistributedDataParallel as DDP + from apex.fp16_utils import * + from apex.multi_tensor_apply import multi_tensor_applier + import amp_C +except ImportError: + raise ImportError("Please install APEX from https://github.com/nvidia/apex") + +from contextlib import redirect_stdout + +from SSD import _C as C + +def print_message(rank, *print_args): + if rank == 0: + print(*print_args) + +def load_checkpoint(model, checkpoint, rank_id): + print("loading model checkpoint", checkpoint) + od = torch.load(checkpoint, map_location=torch.device('cuda:'+str(rank_id))) + # od = torch.load(checkpoint) + + # strip the leading 'module.' prefix from checkpoint keys + saved_model = od["model"] + for k in list(saved_model.keys()): + if k.startswith('module.'): + saved_model[k[7:]] = saved_model.pop(k) + model.load_state_dict(saved_model) + return od + +def check_async_evals(args, evaluator, threshold): + finished = 0 + # Note: only one rank does COCOEval, so we need to check there if we've + # finished -- we'll broadcast that to a "finished" tensor to determine + # if we should stop + # Note2: ssd_print contains a barrier() call, implemented with all_reduce + # If we condition on rank 0, then an ssd_print all_reduce matches with + # the finished all_reduce and all hell breaks loose. 
+ if args.rank == 0: + for epoch, current_accuracy in evaluator.finished_tasks().items(): + # Note: Move to per-iter check + # EVAL_START should be prior to the accuracy/score evaluation but adding the missing EVAL_START here for now + log_start(key=constants.EVAL_START, metadata={'epoch_num' : epoch}) + log_event(key=constants.EVAL_ACCURACY, + value=current_accuracy, + metadata={'epoch_num' : epoch}) + log_end(key=constants.EVAL_STOP, metadata={'epoch_num' : epoch}) + if current_accuracy >= threshold: + finished = 1 + + # handle the non-distributed case -- don't need to bcast, just take local result + if not args.distributed: + return finished == 1 + + # Now we know from all ranks if they're done - reduce result + # Note: Already caught the non-distributed case above, can assume broadcast is available + with torch.no_grad(): + finish_tensor = torch.tensor([finished], dtype=torch.int32, device=torch.device('cuda')) + # torch.distributed.all_reduce(finish_tensor) + torch.distributed.broadcast(finish_tensor, src=0) + + # >= 1 rank has seen the final accuracy + if finish_tensor.item() >= 1: + return True + + # Default case: no results, or no sufficiently accurate results + return False + +def check_async_evals_block(args, evaluator, threshold): + finished = 0 + # Note: only one rank does COCOEval, so we need to check there if we've + # finished -- we'll broadcast that to a "finished" tensor to determine + # if we should stop + # Note2: ssd_print contains a barrier() call, implemented with all_reduce + # If we condition on rank 0, then an ssd_print all_reduce matches with + # the finished all_reduce and all hell breaks loose. + if args.rank == 0: + for epoch, current_accuracy in evaluator.get_all_tasks().items(): + # Note: Move to per-iter check + # EVAL_START should be prior to the accuracy/score evaluation but adding the missing EVAL_START here for now + log_start(key=constants.EVAL_START, metadata={'epoch_num' : epoch}) + log_event(key=constants.EVAL_ACCURACY, + value=current_accuracy, + metadata={'epoch_num' : epoch}) + log_end(key=constants.EVAL_STOP, metadata={'epoch_num' : epoch}) + if current_accuracy >= threshold: + finished = 1 + + # handle the non-distributed case -- don't need to bcast, just take local result + if not args.distributed: + return finished == 1 + + # Now we know from all ranks if they're done - reduce result + # Note: Already caught the non-distributed case above, can assume broadcast is available + with torch.no_grad(): + finish_tensor = torch.tensor([finished], dtype=torch.int32, device=torch.device('cuda')) + # torch.distributed.all_reduce(finish_tensor) + torch.distributed.broadcast(finish_tensor, src=0) + + # >= 1 rank has seen the final accuracy + if finish_tensor.item() >= 1: + return True + + # Default case: no results, or no sufficiently accurate results + return False + +def lr_warmup(optim, warmup_iter, iter_num, epoch, base_lr, args): + if iter_num < warmup_iter: + # new_lr = 1. 
* base_lr / warmup_iter * iter_num + + # mlperf warmup rule + warmup_step = base_lr / (warmup_iter * (2 ** args.warmup_factor)) + new_lr = base_lr - (warmup_iter - iter_num) * warmup_step + + for param_group in optim.param_groups: + param_group['lr'] = new_lr + return new_lr + else: + return base_lr + +def setup_distributed(args): + # Setup multi-GPU if necessary + args.distributed = False + if 'WORLD_SIZE' in os.environ: + args.distributed = int(os.environ['WORLD_SIZE']) > 1 + + if args.distributed: + torch.cuda.set_device(args.local_rank) + if "MASTER_ADDR" in os.environ: + host_addr_full = 'tcp://' + os.environ["MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"] + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + torch.distributed.init_process_group(backend=args.backend, init_method=host_addr_full, rank=rank, + world_size=world_size) + else: + torch.distributed.init_process_group(backend=args.backend, init_method='env://') + + args.local_seed = set_seeds(args) + # start timing here + if args.distributed: + args.N_gpu = torch.distributed.get_world_size() + args.rank = args.local_rank #torch.distributed.get_rank() + else: + args.N_gpu = 1 + args.rank = 0 + + validate_group_bn(args.bn_group) + + return args + +def train300_mlperf_coco(args): + + args = setup_distributed(args) + + # Build the model + model_options = { + 'use_nhwc' : args.nhwc, + 'pad_input' : args.pad_input, + 'bn_group' : args.bn_group, + } + + ssd300 = SSD300(args, args.num_classes, **model_options) + if args.checkpoint is not None: + od = load_checkpoint(ssd300, args.checkpoint, args.rank) + else: + od = None + + ssd300.train() + ssd300.cuda() + dboxes = dboxes300_coco() + # Note: No reason not to use optimised loss + + if args.dali: + loss_func = OptLoss() + else: + loss_func = Loss(dboxes) + loss_func.cuda() + + # Create optimizer. This must also be done after network_to_half. + global_batch_size = (args.N_gpu * args.batch_size) + log_event(key=constants.MODEL_BN_SPAN, value=args.bn_group*args.batch_size) + log_event(key=constants.GLOBAL_BATCH_SIZE, value=global_batch_size) + + # mlperf only allows base_lr scaled by an integer + base_lr = 2.5e-3 + requested_lr_multiplier = args.lr / base_lr + adjusted_multiplier = max(1, round(requested_lr_multiplier * global_batch_size / 32)) + + current_lr = base_lr * adjusted_multiplier + current_momentum = 0.9 + current_weight_decay = args.wd + static_loss_scale = 128. + + optim = apex.optimizers.FusedSGD(ssd300.parameters(), + lr=current_lr, + momentum=current_momentum, + weight_decay=current_weight_decay) + + ssd300, optim = apex.amp.initialize(ssd300, optim, opt_level=args.opt_level, loss_scale=static_loss_scale) + if od is not None and od.get("optimizer", None): + optim.load_state_dict(od["optimizer"]) + for param, saved_param in zip(apex.amp.master_params(optim), od["master params"]): + param.data.copy_(saved_param.data) + + # Parallelize. Need to do this after network_to_half. 
+ if args.distributed: + if args.delay_allreduce: + print_message(args.local_rank, "Delaying allreduces to the end of backward()") + ssd300 = DDP(ssd300, + gradient_predivide_factor=args.N_gpu/8.0, + delay_allreduce=args.delay_allreduce, + retain_allreduce_buffers=args.use_fp16) + + log_event(key=constants.OPT_BASE_LR, value=current_lr) + log_event(key=constants.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_epochs) + log_event(key=constants.OPT_LR_DECAY_STEPS, value=args.lr_decay_epochs) + log_event(key=constants.OPT_WEIGHT_DECAY, value=current_weight_decay) + if args.warmup is not None: + log_event(key=constants.OPT_LR_WARMUP_STEPS, value=args.warmup) + log_event(key=constants.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) + + print_message(args.local_rank, "epoch", "nbatch", "loss") + + if od and od.get("iter_num", None): + args.iteration = od["iter_num"] + iter_num = args.iteration + avg_loss = 0.0 + + start_elapsed_time = time.time() + last_printed_iter = args.iteration + num_elapsed_samples = 0 + + input_c = 4 if args.pad_input else 3 + example_shape = [args.batch_size, 300, 300, input_c] if args.nhwc else [args.batch_size, input_c, 300, 300] + example_input = torch.randn(*example_shape).cuda() + + if args.use_fp16: + example_input = example_input.half() + if args.jit: + # DDP has some Python-side control flow. If we JIT the entire DDP-wrapped module, + # the resulting ScriptModule will elide this control flow, resulting in allreduce + # hooks not being called. If we're running distributed, we need to extract and JIT + # the wrapped .module. + # Replacing a DDP-ed ssd300 with a script_module might also cause the AccumulateGrad hooks + # to go out of scope, and therefore silently disappear. + module_to_jit = ssd300.module if args.distributed else ssd300 + if args.distributed: + ssd300.module = torch.jit.trace(module_to_jit, example_input, check_trace=False) + else: + ssd300 = torch.jit.trace(module_to_jit, example_input, check_trace=False) + + # do a dummy fprop & bprop to make sure cudnnFind etc. 
are timed here + ploc, plabel = ssd300(example_input) + + # produce a single dummy "loss" to make things easier + loss = ploc[0,0,0] + plabel[0,0,0] + dloss = torch.randn_like(loss) + # Cause cudnnFind for dgrad, wgrad to run + loss.backward(dloss) + + # Necessary import in init + from pycocotools.coco import COCO + + encoder = build_ssd300_coder() + + evaluator = AsyncEvaluator(num_threads=1) + + log_end(key=constants.INIT_STOP) + + ##### END INIT + + # This is the first place we touch anything related to data + ##### START DATA TOUCHING + barrier() + log_start(key=constants.RUN_START) + barrier() + + train_pipe = prebuild_pipeline(args) + + train_loader, epoch_size = build_pipeline(args, training=True, pipe=train_pipe) + if args.rank == 0: + print("epoch size is: ", epoch_size, " images") + + val_loader, inv_map, cocoGt = build_pipeline(args, training=False) + if args.profile_gc_off: + gc.disable() + gc.collect() + + ##### END DATA TOUCHING + i_eval = 0 + block_start_epoch = 0 + log_start(key=constants.BLOCK_START, + metadata={'first_epoch_num': block_start_epoch, + 'epoch_count': args.evaluation[i_eval]}) + + if args.verify_checkpoint: + print("verify initial checkpoint....") + # Note: No longer returns, evaluation is abstracted away inside evaluator + if od and od.get("epoch", None): + epoch = od["epoch"] + else: + epoch = 0 + coco_eval(args, + ssd300, + val_loader, + cocoGt, + encoder, + inv_map, + epoch, + iter_num, current_lr, + evaluator=evaluator) + finished = check_async_evals_block(args, evaluator, args.threshold) + if finished: + return True + + torch.distributed.barrier() + for epoch in range(1, args.epochs + 1): + if epoch in args.lr_decay_epochs: + current_lr *= args.lr_decay_factor + print_message(args.rank, "lr decay step #" + str(bisect(args.lr_decay_epochs, epoch))) + for param_group in optim.param_groups: + param_group['lr'] = current_lr + if od and od.get("epoch", None): + if epoch <= od["epoch"]: + if epoch in args.evaluation: + if epoch != max(args.evaluation): + i_eval += 1 + block_start_epoch = epoch + print("Start continue training from epoch: {}, iter: {}, skip epoch {}".format(od["epoch"], iter_num, epoch)) + continue + for p in ssd300.parameters(): + p.grad = None + + if epoch in args.evaluation: + # Get the existant state from the train model + # * if we use distributed, then we want .module + train_model = ssd300.module if args.distributed else ssd300 + + if args.distributed and args.allreduce_running_stats: + if args.rank == 0: print("averaging bn running means and vars") + # make sure every node has the same running bn stats before + # using them to evaluate, or saving the model for inference + world_size = float(torch.distributed.get_world_size()) + for bn_name, bn_buf in train_model.named_buffers(recurse=True): + if ('running_mean' in bn_name) or ('running_var' in bn_name): + torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) + bn_buf /= world_size + + # Note: No longer returns, evaluation is abstracted away inside evaluator + coco_eval(args, + ssd300, + val_loader, + cocoGt, + encoder, + inv_map, + epoch, + iter_num, current_lr, + evaluator=evaluator) + log_end(key=constants.BLOCK_STOP, metadata={'first_epoch_num': block_start_epoch}) + if epoch != max(args.evaluation): + i_eval += 1 + block_start_epoch = epoch + log_start(key=constants.BLOCK_START, + metadata={'first_epoch_num': block_start_epoch, + 'epoch_count': (args.evaluation[i_eval] - + args.evaluation[i_eval - 1])}) + + if args.rank == 0: + if args.save: + print("saving model...") + if 
not os.path.isdir('./models'): + os.mkdir('./models') + torch.save({ + "model": ssd300.state_dict(), + "optimizer": optim.state_dict(), + "master params": list(apex.amp.master_params(optim)), + "epoch": epoch, + "iter_num": iter_num, + }, "./models/iter_{}.pt".format(iter_num)) + + finished = check_async_evals_block(args, evaluator, args.threshold) + if finished: + return True + if args.distributed: + torch.distributed.barrier() + + log_start(key=constants.EPOCH_START, + metadata={'epoch_num': epoch, + 'current_iter_num': iter_num}) + for i, (img, bbox, label) in enumerate(train_loader): + if not args.dali: + img = img.cuda() + bbox = bbox.cuda() + label = label.cuda() + + if args.profile_start is not None and iter_num == args.profile_start: + torch.cuda.profiler.start() + torch.cuda.synchronize() + if args.profile_nvtx: + torch.autograd._enable_profiler(torch.autograd.ProfilerState.NVTX) + + if args.profile is not None and iter_num == args.profile: + if args.profile_start is not None and iter_num >=args.profile_start: + # we turned cuda and nvtx profiling on, better turn it off too + if args.profile_nvtx: + torch.autograd._disable_profiler() + torch.cuda.profiler.stop() + return + + new_lr = current_lr + if args.warmup is not None: + new_lr = lr_warmup(optim, args.warmup, iter_num, epoch, current_lr, args) + + if (img is None) or (bbox is None) or (label is None): + print("No labels in batch") + continue + + ploc, plabel = ssd300(img) + ploc, plabel = ploc.float(), plabel.float() + + N = img.shape[0] + bbox.requires_grad = False + label.requires_grad = False + # reshape (N*8732X4 -> Nx8732x4) and transpose (Nx8732x4 -> Nx4x8732) + bbox = bbox.view(N, -1, 4).transpose(1,2).contiguous() + # reshape (N*8732 -> Nx8732) and cast to Long + label = label.view(N, -1).long() + loss = loss_func(ploc, plabel, bbox, label) + + if i == 0 and epoch == 0: + avg_loss = loss.item() + else: + if np.isfinite(loss.item()): + avg_loss = 0.999*avg_loss + 0.001*loss.item() + else: + print("model exploded (corrupted by Inf or Nan)") + sys.exit(1) + + num_elapsed_samples += N + if args.rank == 0 and iter_num % args.print_interval == 0: + end_elapsed_time = time.time() + elapsed_time = end_elapsed_time - start_elapsed_time + + avg_samples_per_sec = num_elapsed_samples * args.N_gpu / elapsed_time + + print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}, avg. samples / sec: {:.2f} lr: {:.6f}"\ + .format(iter_num, loss.item(), avg_loss, avg_samples_per_sec, new_lr), end="\n") + + last_printed_iter = iter_num + start_elapsed_time = time.time() + num_elapsed_samples = 0 + + with apex.amp.scale_loss(loss, optim) as scaled_loss: + scaled_loss.backward() + + if not args.profile_fake_optim: + optim.step() + + # Likely a decent skew here, let's take this opportunity to set the + # gradients to None. After DALI integration, playing with the + # placement of this is worth trying. 
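# Illustrative sketch, not part of the patch: the dictionary saved above is the
# counterpart of the resume path at the top of train300_mlperf_coco(), which reads the
# same keys back ("model" via load_checkpoint, then "optimizer", "master params",
# "epoch" and "iter_num"). The function name and path below are examples only; the
# patch's own load_checkpoint / apex.amp handling is the authoritative path.
import torch

def load_for_resume_example(ssd300, optim, path="./models/iter_1000.pt"):
    od = torch.load(path, map_location="cpu")
    ssd300.load_state_dict(od["model"])
    optim.load_state_dict(od["optimizer"])
    # apex.amp master params are copied separately, as in the amp.initialize block earlier
    return od["epoch"], od["iter_num"]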
+ for p in ssd300.parameters(): + p.grad = None + + # Don't check every iteration due to cost of broadcast + if iter_num % 20 == 0: + finished = check_async_evals(args, evaluator, args.threshold) + + if finished: + return True + + iter_num += 1 + + if args.dali: + train_loader.reset() + log_end(key=constants.EPOCH_STOP, metadata={'epoch_num': epoch}) + + finished = check_async_evals_block(args, evaluator, args.threshold) + if finished: + return True + return False + +def main(): + + configure_logger(constants.SSD) + log_start(key=constants.INIT_START, log_all_ranks=True) + args = parse_args() + try: + from dltest import show_training_arguments + show_training_arguments(args) + except: + pass + # make sure the epoch lists are in sorted order + args.evaluation.sort() + args.lr_decay_epochs.sort() + + validate_arguments(args) + + torch.set_num_threads(1) + torch.backends.cudnn.benchmark = not args.profile_cudnn_get + + success = train300_mlperf_coco(args) + status = 'success' if success else 'aborted' + + # end timing here + log_end(key=constants.RUN_STOP, metadata={'status': status}) + +if __name__ == "__main__": + main() diff --git a/cv/detection/ssd/pytorch/base/dataloaders/util.py b/cv/detection/ssd/pytorch/utils.py similarity index 48% rename from cv/detection/ssd/pytorch/base/dataloaders/util.py rename to cv/detection/ssd/pytorch/utils.py index f65f5f981..6862a0687 100644 --- a/cv/detection/ssd/pytorch/base/dataloaders/util.py +++ b/cv/detection/ssd/pytorch/utils.py @@ -20,6 +20,45 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # +# ------------------------------------------------------------------------------ +# +# MIT License +# +# Copyright (c) 2017 Max deGroot, Ellis Brown +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ------------------------------------------------------------------------------ +# +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
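# Illustrative note, not part of the patch: main() sorts args.evaluation and
# args.lr_decay_epochs because the epoch loop above derives the printed decay step
# with bisect(), which assumes a sorted list. The milestone values below are examples only.
from bisect import bisect

lr_decay_epochs_example = [40, 50]
for epoch in (39, 40, 50, 55):
    print(epoch, "-> lr decay step #", bisect(lr_decay_epochs_example, epoch))
# decay step numbers: 39 -> 0, 40 -> 1, 50 -> 2, 55 -> 2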
import torch import torchvision @@ -40,73 +79,159 @@ except ImportError: import json import gc import time -# import bz2 -# import pickle -from math import sqrt - -from box_coder import calc_iou_tensor, Encoder - - -class DefaultBoxes(object): - def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, \ - scale_xy=0.1, scale_wh=0.2): - - self.feat_size = feat_size - self.fig_size = fig_size - - self.scale_xy_ = scale_xy - self.scale_wh_ = scale_wh - - # According to https://github.com/weiliu89/caffe - # Calculation method slightly different from paper - self.steps = steps - self.scales = scales - - fk = fig_size / np.array(steps) - self.aspect_ratios = aspect_ratios - - self.default_boxes = [] - # size of feature and number of feature - for idx, sfeat in enumerate(self.feat_size): - - sk1 = scales[idx] / fig_size - sk2 = scales[idx + 1] / fig_size - sk3 = sqrt(sk1 * sk2) - all_sizes = [(sk1, sk1), (sk3, sk3)] - - for alpha in aspect_ratios[idx]: - w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha) - all_sizes.append((w, h)) - all_sizes.append((h, w)) - for w, h in all_sizes: - for i, j in itertools.product(range(sfeat), repeat=2): - cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx] - self.default_boxes.append((cx, cy, w, h)) - - self.dboxes = torch.tensor(self.default_boxes, dtype=torch.float) - self.dboxes.clamp_(min=0, max=1) - # For IoU calculation - self.dboxes_ltrb = self.dboxes.clone() - self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2] - self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3] - self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2] - self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3] +#import bz2 +import pickle +from math import sqrt, ceil, cos, sin, pi +from mlperf_logging.mllog import constants +from mlperf_logger import log_event + +from SSD import _C as C + +from fused_color_jitter import FusedColorJitter +from box_coder import Encoder + +# This function is from https://github.com/kuangliu/pytorch-ssd +def calc_iou_tensor(box1, box2): + """ Calculation of IoU based on two boxes tensor, + Reference to https://github.com/kuangliu/pytorch-ssd + input: + box1 (N, 4) + box2 (M, 4) + output: + IoU (N, M) + """ + N = box1.size(0) + M = box2.size(0) + + be1 = box1.unsqueeze(1).expand(-1, M, -1) + be2 = box2.unsqueeze(0).expand(N, -1, -1) + + # Left Top & Right Bottom + lt = torch.max(be1[:,:,:2], be2[:,:,:2]) + #mask1 = (be1[:,:, 0] < be2[:,:, 0]) ^ (be1[:,:, 1] < be2[:,:, 1]) + #mask1 = ~mask1 + rb = torch.min(be1[:,:,2:], be2[:,:,2:]) + #mask2 = (be1[:,:, 2] < be2[:,:, 2]) ^ (be1[:,:, 3] < be2[:,:, 3]) + #mask2 = ~mask2 + + delta = rb - lt + delta[delta < 0] = 0 + intersect = delta[:,:,0]*delta[:,:,1] + #*mask1.float()*mask2.float() + + delta1 = be1[:,:,2:] - be1[:,:,:2] + area1 = delta1[:,:,0]*delta1[:,:,1] + delta2 = be2[:,:,2:] - be2[:,:,:2] + area2 = delta2[:,:,0]*delta2[:,:,1] + + iou = intersect/(area1 + area2 - intersect) + return iou + +# This class is from https://github.com/chauhan-utk/ssd.DomainAdaptation +class SSDCropping(object): + """ Cropping for SSD, according to original paper + Choose between following 3 conditions: + 1. Preserve the original image + 2. Random crop minimum IoU is among 0.1, 0.3, 0.5, 0.7, 0.9 + 3. 
Random crop + Reference to https://github.com/chauhan-utk/ssd.DomainAdaptation + """ + def __init__(self): - @property - def scale_xy(self): - return self.scale_xy_ + self.sample_options = ( + # Do nothing + None, + # min IoU, max IoU + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + # no IoU requirements + (None, None), + ) + # Implementation uses 1 iteration to find a possible candidate, this + # was shown to produce the same mAP as using more iterations. + self.num_cropping_iterations = 1 + log_event(key=constants.MAX_SAMPLES, + value=self.num_cropping_iterations) - @property - def scale_wh(self): - return self.scale_wh_ + def __call__(self, img, img_size, bboxes, labels): - def __call__(self, order="ltrb"): - if order == "ltrb": return self.dboxes_ltrb - if order == "xywh": return self.dboxes + # Ensure always return cropped image + while True: + mode = random.choice(self.sample_options) + if mode is None: + return img, img_size, bboxes, labels -# This function is from https://github.com/chauhan-utk/ssd.DomainAdaptation. -class SSDCropping(object): + htot, wtot = img_size + + min_iou, max_iou = mode + min_iou = float("-inf") if min_iou is None else min_iou + max_iou = float("+inf") if max_iou is None else max_iou + + # Implementation use 50 iteration to find possible candidate + for _ in range(self.num_cropping_iterations): + # suze of each sampled path in [0.1, 1] 0.3*0.3 approx. 0.1 + w = random.uniform(0.3 , 1.0) + h = random.uniform(0.3 , 1.0) + + if w/h < 0.5 or w/h > 2: + continue + + # left 0 ~ wtot - w, top 0 ~ htot - h + left = random.uniform(0, 1.0 - w) + top = random.uniform(0, 1.0 - h) + + right = left + w + bottom = top + h + + ious = calc_iou_tensor(bboxes, torch.tensor([[left, top, right, bottom]])) + + # tailor all the bboxes and return + if not ((ious > min_iou) & (ious < max_iou)).all(): + continue + + # discard any bboxes whose center not in the cropped image + xc = 0.5*(bboxes[:, 0] + bboxes[:, 2]) + yc = 0.5*(bboxes[:, 1] + bboxes[:, 3]) + + masks = (xc > left) & (xc < right) & (yc > top) & (yc < bottom) + + # if no such boxes, continue searching again + if not masks.any(): + continue + + bboxes[bboxes[:, 0] < left, 0] = left + bboxes[bboxes[:, 1] < top, 1] = top + bboxes[bboxes[:, 2] > right, 2] = right + bboxes[bboxes[:, 3] > bottom, 3] = bottom + + #print(left, top, right, bottom) + #print(labels, bboxes, masks) + bboxes = bboxes[masks, :] + labels = labels[masks] + + left_idx = int(left*wtot) + top_idx = int(top*htot) + right_idx = int(right*wtot) + bottom_idx = int(bottom*htot) + #print(left_idx,top_idx,right_idx,bottom_idx) + #img = img[:, top_idx:bottom_idx, left_idx:right_idx] + img = img.crop((left_idx, top_idx, right_idx, bottom_idx)) + + bboxes[:, 0] = (bboxes[:, 0] - left)/w + bboxes[:, 1] = (bboxes[:, 1] - top)/h + bboxes[:, 2] = (bboxes[:, 2] - left)/w + bboxes[:, 3] = (bboxes[:, 3] - top)/h + + htot = bottom_idx - top_idx + wtot = right_idx - left_idx + return img, (htot, wtot), bboxes, labels + + +class SSDCroppingNoDali(object): """ Cropping for SSD, according to original paper Choose between following 3 conditions: 1. 
Preserve the original image @@ -207,7 +332,7 @@ class SSDCropping(object): wtot = right_idx - left_idx return img, (htot, wtot), bboxes, labels - +# Don't need to cast to float, already there (from FusedColorJitter) class ToTensor(object): def __init__(self): pass @@ -215,10 +340,9 @@ class ToTensor(object): def __call__(self, img): img = torch.Tensor(np.array(img)) # Transform from HWC to CHW - img = img.permute(2, 0, 1).div(255) + img = img.permute(2, 0 ,1).div(255) return img - class RandomHorizontalFlip(object): def __init__(self, p=0.5): self.p = p @@ -229,7 +353,6 @@ class RandomHorizontalFlip(object): return image.transpose(Image.FLIP_LEFT_RIGHT), bboxes return image, bboxes - # Do data augumentation class SSDTransformer(object): """ SSD Data Augumentation, according to original paper @@ -239,39 +362,27 @@ class SSDTransformer(object): Flipping Jittering """ - def __init__(self, size=(300, 300), dboxes=None, dali=False, fast_nms=False, fast_cj=False, val=False, num_cropping_iterations=1): + def __init__(self, size = (300, 300), val=False): # define vgg16 mean self.size = size self.val = val - self.dali = dali - self.crop = SSDCropping(num_cropping_iterations=num_cropping_iterations) - if self.dali: - self.dboxes_ = None - self.encoder = None - else: - self.dboxes_ = dboxes # DefaultBoxes300() - self.encoder = Encoder(self.dboxes_, fast_nms) - - if fast_cj: - from fused_color_jitter import FusedColorJitter - self.img_trans = transforms.Compose([ - transforms.Resize(self.size), - FusedColorJitter(), - ToTensor(), - ]) - else: - self.img_trans = transforms.Compose([ - transforms.Resize(self.size), - transforms.ColorJitter(brightness=0.125, contrast=0.5, - saturation=0.5, hue=0.05 - ), - transforms.ToTensor() - ]) + + self.crop = SSDCropping() + self.img_trans = transforms.Compose([ + transforms.Resize(self.size), + #transforms.ColorJitter(brightness=0.125, contrast=0.5, + # saturation=0.5, hue=0.05 + #), + #transforms.ToTensor(), + FusedColorJitter(), + ToTensor(), + ]) self.hflip = RandomHorizontalFlip() # All Pytorch Tensor will be normalized # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683 + normalization_mean = [0.485, 0.456, 0.406] normalization_std = [0.229, 0.224, 0.225] self.normalize = transforms.Normalize(mean=normalization_mean, @@ -280,7 +391,75 @@ class SSDTransformer(object): self.trans_val = transforms.Compose([ transforms.Resize(self.size), transforms.ToTensor(), - self.normalize]) + self.normalize,]) + + def __call__(self, img, img_size, bbox=None, label=None, max_num=200): + #img = torch.tensor(img) + if self.val: + bbox_out = torch.zeros(max_num, 4) + label_out = torch.zeros(max_num, dtype=torch.long) + bbox_out[:bbox.size(0), :] = bbox + label_out[:label.size(0)] = label + return self.trans_val(img), img_size, bbox_out, label_out + + # random crop + img, img_size, bbox, label = self.crop(img, img_size, bbox, label) + + # random horiz. 
flip + img, bbox = self.hflip(img, bbox) + + # [Resize, ColorJitter, ToTensor] + img = self.img_trans(img).contiguous() + + img = self.normalize(img) + + return img, img_size, bbox, label + + +class SSDTransformerNoDali(object): + """ SSD Data Augumentation, according to original paper + Composed by several steps: + Cropping + Resize + Flipping + Jittering + """ + + def __init__(self, dboxes, size=(300, 300), val=False, num_cropping_iterations=1): + # define vgg16 mean + self.size = size + self.val = val + + self.dboxes_ = dboxes # DefaultBoxes300() + self.encoder = Encoder(self.dboxes_) + + self.crop = SSDCroppingNoDali(num_cropping_iterations=num_cropping_iterations) + self.img_trans = transforms.Compose([ + transforms.Resize(self.size), + # transforms.Resize((300, 300)), + # transforms.RandomHorizontalFlip(), + transforms.ColorJitter(brightness=0.125, contrast=0.5, + saturation=0.5, hue=0.05 + ), + transforms.ToTensor() + # LightingNoice(), + ]) + self.hflip = RandomHorizontalFlip() + + # All Pytorch Tensor will be normalized + # https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683 + normalization_mean = [0.485, 0.456, 0.406] + normalization_std = [0.229, 0.224, 0.225] + self.normalize = transforms.Normalize(mean=normalization_mean, + std=normalization_std) + # self.normalize = transforms.Normalize(mean = [104.0, 117.0, 123.0], + # std = [1.0, 1.0, 1.0]) + + self.trans_val = transforms.Compose([ + transforms.Resize(self.size), + transforms.ToTensor(), + # ToTensor(), + self.normalize, ]) @property def dboxes(self): @@ -295,23 +474,19 @@ class SSDTransformer(object): label_out[:label.size(0)] = label return self.trans_val(img), img_size, bbox_out, label_out - # random crop + # print("before", img.size, bbox) img, img_size, bbox, label = self.crop(img, img_size, bbox, label) - - # random horiz. 
flip + # print("after", img.size, bbox) img, bbox = self.hflip(img, bbox) - # [Resize, ColorJitter, ToTensor] img = self.img_trans(img).contiguous() - + # img = img.contiguous().div(255) img = self.normalize(img) - if not self.dali: - bbox, label = self.encoder.encode(bbox, label) + bbox, label = self.encoder.encode(bbox, label) return img, img_size, bbox, label - # Implement a datareader for COCO dataset class COCODetection(data.Dataset): def __init__(self, img_folder, annotate_file, transform=None, data=None): @@ -330,14 +505,13 @@ class COCODetection(data.Dataset): self.data = json.load(fin) - if gc_old: - gc.enable() + if gc_old: gc.enable() self.images = {} self.label_map = {} self.label_info = {} - # print("Parsing COCO data...") + #print("Parsing COCO data...") start_time = time.time() # 0 stand for the background cnt = 0 @@ -351,8 +525,8 @@ class COCODetection(data.Dataset): for img in self.data["images"]: img_id = img["id"] img_name = img["file_name"] - img_size = (img["height"], img["width"]) - # print(img_name) + img_size = (img["height"],img["width"]) + #print(img_name) if img_id in self.images: raise Exception("dulpicated image record") self.images[img_id] = (img_name, img_size, []) @@ -366,28 +540,29 @@ class COCODetection(data.Dataset): for k, v in list(self.images.items()): if len(v[2]) == 0: - # print("empty image: {}".format(k)) + #print("empty image: {}".format(k)) self.images.pop(k) self.img_keys = list(self.images.keys()) self.transform = transform - # print("End parsing COCO data, total time {}".format(time.time()-start_time)) + #print("End parsing COCO data, total time {}".format(time.time()-start_time)) @property def labelnum(self): return len(self.label_info) - # @staticmethod - # def load(pklfile): - # # print("Loading from {}".format(pklfile)) - # with bz2.open(pklfile, "rb") as fin: - # ret = pickle.load(fin) - # return ret + @staticmethod + def load(pklfile): + #print("Loading from {}".format(pklfile)) + with bz2.open(pklfile, "rb") as fin: + ret = pickle.load(fin) + return ret + + def save(self, pklfile): + #print("Saving to {}".format(pklfile)) + with bz2.open(pklfile, "wb") as fout: + pickle.dump(self, fout) - # def save(self, pklfile): - # # print("Saving to {}".format(pklfile)) - # with bz2.open(pklfile, "wb") as fout: - # pickle.dump(self, fout) def __len__(self): return len(self.images) @@ -406,8 +581,8 @@ class COCODetection(data.Dataset): bbox_sizes = [] bbox_labels = [] - # for (xc, yc, w, h), bbox_label in img_data[2]: - for (l, t, w, h), bbox_label in img_data[2]: + #for (xc, yc, w, h), bbox_label in img_data[2]: + for (l,t,w,h), bbox_label in img_data[2]: r = l + w b = t + h #l, t, r, b = xc - 0.5*w, yc - 0.5*h, xc + 0.5*w, yc + 0.5*h @@ -428,4 +603,5 @@ class COCODetection(data.Dataset): else: pass # img = transforms.ToTensor()(img) - return img, img_id, (htot, wtot), bbox_sizes, bbox_labels \ No newline at end of file + return img, img_id, (htot, wtot), bbox_sizes, bbox_labels + diff --git a/cv/detection/ssd/pytorch/visualize.py b/cv/detection/ssd/pytorch/visualize.py new file mode 100644 index 000000000..29c40ec5d --- /dev/null +++ b/cv/detection/ssd/pytorch/visualize.py @@ -0,0 +1,174 @@ +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from nvidia.dali.pipeline import Pipeline +import nvidia.dali.ops as ops +import nvidia.dali.types as types + +import numpy as np +from time import time +import os +import random +import time +import io +import json + +import torch +from PIL import Image +from torchvision import transforms +from torch.utils.data import DataLoader + +import matplotlib.pyplot as plt +import matplotlib.patches as patches + +from argparse import ArgumentParser +from utils import DefaultBoxes, Encoder, COCODetection +from utils import SSDTransformer +from ssd300 import SSD300 +from train import load_checkpoint, dboxes300_coco + +def parse_args(): + parser = ArgumentParser(description="Visualize models predictions on image") + parser.add_argument('--images', '-i', nargs='*', type=str, + help='path to jpg image') + parser.add_argument('--model', '-m', type=str, default='iter_240000.pt', + help='path to trained model') + parser.add_argument('--threshold', '-t', type=float, default=0.10, + help='threshold for predictions probabilities') + parser.add_argument('--annotations', '-a', type=str, + default='/coco/annotations/instances_val2017.json', + help='path to json with annotations') + return parser.parse_args() + +def print_image(image, model, encoder, inv_map, name_map, category_id_to_color, threshold): + # Open image for printing + im = Image.open(image) + W, H = im.size + + # Prepare tensor input for model + tmp = im.copy() + tmp = tmp.resize((300, 300)) + img = transforms.ToTensor()(tmp) + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + img = normalize(img).unsqueeze(dim = 0) + + # Find predictions + with torch.no_grad(): + ploc, plabel = model(img) + ploc, plabel = ploc.float(), plabel.float() + + ret = [] + for idx in range(ploc.shape[0]): + # ease-of-use for specific predictions + ploc_i = ploc[idx, :, :].unsqueeze(0) + plabel_i = plabel[idx, :, :].unsqueeze(0) + + try: + result = encoder.decode_batch(ploc_i, plabel_i, 0.50, 200)[0] + except: + print("No object detected in image {}".format(image)) + continue + + htot, wtot = (H, W) + loc, label, prob = [r.cpu().numpy() for r in result] + for loc_, label_, prob_ in zip(loc, label, prob): + ret.append([0, loc_[0]*wtot, \ + loc_[1]*htot, + (loc_[2] - loc_[0])*wtot, + (loc_[3] - loc_[1])*htot, + prob_, + inv_map[label_]]) + + ret = np.array(ret).astype(np.float32) + + # Choose bounding boxes for printing + bboxes = [] + for re in ret: + if re[5] > threshold: + bboxes.append(re) + + print("Bounding boxes detected in image {}:".format(image)) + print(bboxes) + + # Prepare image for plotting + img = transforms.ToTensor()(im) + img = img.permute(1, 2, 0) + H = img.shape[0] + W = img.shape[1] + fig,ax = plt.subplots(1) + ax.imshow(img) + + # Add bboxes with labels + used = set() + for bbox in bboxes: + if (bbox[6] in used): + rect = patches.Rectangle((bbox[1], bbox[2]), bbox[3], bbox[4], + edgecolor=category_id_to_color[bbox[6]], + linewidth=2, facecolor='none') + else: + rect = patches.Rectangle((bbox[1], bbox[2]), bbox[3], bbox[4], + label = name_map[bbox[6]], + edgecolor=category_id_to_color[bbox[6]], + 
linewidth=2, facecolor='none') + used.add(bbox[6]) + ax.add_patch(rect) + + # Show image + plt.legend(ncol=1, bbox_to_anchor=(1.04,1), loc="upper left") + plt.show() + +def main(): + # Parse arguments + args = parse_args() + + # Get categories names + with open(args.annotations,'r') as anno: + js = json.loads(anno.read()) + coco_names = js['categories'] + + # Prepare map of COCO labels to COCO names + name_map = {} + for name in coco_names: + name_map[name['id']] = name['name'] + + # Prepare map of SSD to COCO labels + deleted = [12, 26, 29, 30, 45, 66, 68, 69, 71, 83] + inv_map = {} + cnt = 0 + for i in range(1, 81): + while i + cnt in deleted: + cnt += 1 + inv_map[i] = i + cnt + + # Prepare colors for categories + category_id_to_color = dict([(cat_id, [random.uniform(0, 1) ,random.uniform(0, 1), random.uniform(0, 1)]) for cat_id in range(1, 91)]) + + # Set math plot lib size + plt.rcParams["figure.figsize"] = (12, 8) + + # Build and load SSD model + ssd300 = SSD300(81, backbone="resnet34", model_path=None, dilation=None) + load_checkpoint(ssd300, args.model) + ssd300.eval() + + # Prepare encoder + dboxes = dboxes300_coco() + encoder = Encoder(dboxes) + + # Print images + for image in args.images: + print_image(image, ssd300, encoder, inv_map, name_map, category_id_to_color, args.threshold) + +if __name__ == "__main__": + main() diff --git a/tests/executables/ssd/init_torch.sh b/tests/executables/ssd/init_torch.sh index 5065ae4b7..19211d91a 100644 --- a/tests/executables/ssd/init_torch.sh +++ b/tests/executables/ssd/init_torch.sh @@ -51,7 +51,7 @@ if [[ "$(uname -m)" == "aarch64" ]]; then source /opt/rh/gcc-toolset-11/enable fi -cd ../../research/cv/detection/ssd && bash ./clean_ssd.sh && bash ./build_ssd.sh && bash ./install_ssd.sh "$@"; check_status +cd ../../cv/detection/ssd/pytorch/ && bash ./clean_ssd.sh && bash ./build_ssd.sh && bash ./install_ssd.sh "$@"; check_status DATA_PATH_BBOX=../../../.. 
python3 prepare-json.py --keep-keys ${DATA_PATH}/annotations/instances_val2017.json ${DATA_PATH_BBOX}/bbox_only_instances_val2017.json "$@"; check_status diff --git a/tests/executables/ssd/train_ssd_amp_torch.sh b/tests/executables/ssd/train_ssd_amp_torch.sh index 6d348a988..719664e12 100644 --- a/tests/executables/ssd/train_ssd_amp_torch.sh +++ b/tests/executables/ssd/train_ssd_amp_torch.sh @@ -11,7 +11,7 @@ check_status() fi } -cd ../../research/cv/detection/ssd +cd ../../cv/detection/ssd/pytorch/ echo "python3 train.py --no-dali --dali-cache 0 --data=${COCO_PATH} \ --batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \ -- Gitee From 66387b946415c0e914c5886f2f298fcc5f75b3c0 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 15:17:05 +0800 Subject: [PATCH 16/20] sync bert pytorch all --- .../bert_sample/pytorch/base/BaseDockerfile | 58 + .../bert_sample/pytorch/base/bind_pyt.py | 201 + .../pytorch/base/config/__init__.py | 4 + .../bert_sample/pytorch/base/config/_base.py | 189 + .../pytorch/base/config/config_manager.py | 176 + .../pytorch/base/config/mutable_params.py | 24 + .../pytorch/base/create_container.sh | 184 + .../pytorch/base/create_contaner.sh | 183 + .../pytorch/base/create_contaner_bi.sh | 184 + .../2048_shards_varlength.chk | 2048 ++++ .../4320_shards_varlength.chk | 4320 +++++++ .../data_preprocessing/chop_hdf5_files.py | 150 + .../chop_hdf5_files_to_varlength.py | 154 + .../pytorch/base/data_preprocessing/clean.sh | 86 + .../base/data_preprocessing/cleanup_file.py | 83 + .../convert_fixed2variable.py | 71 + .../convert_tf_checkpoint.py | 93 + .../create_pretraining_data.py | 455 + .../create_pretraining_data_wrapper.sh | 30 + .../base/data_preprocessing/do_gather.py | 95 + .../do_sentence_segmentation.py | 74 + .../pytorch/base/data_preprocessing/eval.md5 | 10000 ++++++++++++++++ .../data_preprocessing/eval_varlength.chk | 1 + .../base/data_preprocessing/hdf5_md5.py | 29 + .../parallel_create_hdf5.sh | 75 + .../data_preprocessing/pick_eval_samples.py | 83 + .../pick_eval_samples_varlength.py | 76 + .../base/data_preprocessing/prepare_data.sh | 167 + .../base/data_preprocessing/process_wiki.sh | 34 + .../data_preprocessing/seperate_test_set.py | 120 + .../base/data_preprocessing/tokenization.py | 413 + .../pytorch/base/dataloaders/__init__.py | 3 + .../pytorch/base/dataloaders/dataloader.py | 235 + .../pytorch/base/dataloaders/dataset.py | 159 + .../pytorch/base/model/__init__.py | 19 + .../pytorch/base/model/layers/__init__.py | 6 + .../pytorch/base/model/layers/activations.py | 82 + .../pytorch/base/model/layers/embeddings.py | 59 + .../pytorch/base/model/layers/layernorm.py | 36 + .../pytorch/base/model/layers/padding.py | 125 + .../pytorch/base/model/losses/__init__.py | 0 .../pytorch/base/model/models/__init__.py | 0 .../pytorch/base/model/models/modeling.py | 1394 +++ .../pytorch/base/optimizers/__init__.py | 1 + .../pytorch/base/optimizers/factory.py | 33 + .../pytorch/base/optimizers/lamb.py | 102 + .../bert_sample/pytorch/base/prepare.py | 288 + .../bert_sample/pytorch/base/requirements.txt | 3 + .../pytorch/base/run_pretraining.py | 173 + .../bert_sample/pytorch/base/run_training.sh | 39 + .../pytorch/base/run_with_docker.sh | 208 + .../pytorch/base/schedulers/__init__.py | 1 + .../pytorch/base/schedulers/base.py | 49 + .../pytorch/base/schedulers/factory.py | 33 + .../linear_warmup_poly_scheduler.py | 57 + .../schedulers/linear_warmup_scheduler.py | 33 + 
.../bert_sample/pytorch/base/setup.py | 98 + .../pytorch/base/train/__init__.py | 0 .../pytorch/base/train/evaluator.py | 99 + .../pytorch/base/train/event/__init__.py | 4 + .../pytorch/base/train/event/base.py | 65 + .../pytorch/base/train/event/base_adapter.py | 40 + .../pytorch/base/train/event/compose.py | 110 + .../pytorch/base/train/event/log.py | 155 + .../bert_sample/pytorch/base/train/trainer.py | 222 + .../pytorch/base/train/training_state.py | 73 + .../pytorch/base/utils/__init__.py | 17 + .../bert_sample/pytorch/base/utils/check.py | 94 + .../pytorch/base/utils/checkpoint.py | 77 + .../bert_sample/pytorch/base/utils/dist.py | 202 + .../bert_sample/pytorch/base/utils/logging.py | 245 + .../bert_sample/pytorch/base/utils/paths.py | 18 + .../pytorch/base/utils/tokenization.py | 428 + tests/executables/bert/init_torch.sh | 72 + .../train_bert_default_amp_dist_1x8_torch.sh | 41 + ...ain_bert_pretraining_amp_dist_1x8_torch.sh | 31 + 76 files changed, 25089 insertions(+) create mode 100644 nlp/language_model/bert_sample/pytorch/base/BaseDockerfile create mode 100644 nlp/language_model/bert_sample/pytorch/base/bind_pyt.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/config/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/config/_base.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/config/config_manager.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/config/mutable_params.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/create_container.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/create_contaner.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/create_contaner_bi.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/2048_shards_varlength.chk create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/4320_shards_varlength.chk create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files_to_varlength.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/clean.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/cleanup_file.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_fixed2variable.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_tf_checkpoint.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data_wrapper.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_gather.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_sentence_segmentation.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval.md5 create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval_varlength.chk create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/hdf5_md5.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/parallel_create_hdf5.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples.py create mode 100644 
nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples_varlength.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/prepare_data.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/process_wiki.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/seperate_test_set.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/data_preprocessing/tokenization.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/dataloaders/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/dataloaders/dataloader.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/dataloaders/dataset.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/layers/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/layers/activations.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/layers/embeddings.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/layers/layernorm.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/layers/padding.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/losses/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/models/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/model/models/modeling.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/optimizers/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/optimizers/factory.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/optimizers/lamb.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/prepare.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/requirements.txt create mode 100644 nlp/language_model/bert_sample/pytorch/base/run_pretraining.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/run_training.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/run_with_docker.sh create mode 100644 nlp/language_model/bert_sample/pytorch/base/schedulers/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/schedulers/base.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/schedulers/factory.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_poly_scheduler.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_scheduler.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/setup.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/evaluator.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/event/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/event/base.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/event/base_adapter.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/event/compose.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/event/log.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/trainer.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/train/training_state.py create mode 100644 
nlp/language_model/bert_sample/pytorch/base/utils/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/utils/check.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/utils/checkpoint.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/utils/dist.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/utils/logging.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/utils/paths.py create mode 100644 nlp/language_model/bert_sample/pytorch/base/utils/tokenization.py create mode 100644 tests/executables/bert/init_torch.sh create mode 100644 tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh create mode 100644 tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh diff --git a/nlp/language_model/bert_sample/pytorch/base/BaseDockerfile b/nlp/language_model/bert_sample/pytorch/base/BaseDockerfile new file mode 100644 index 000000000..6f036a1c0 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/BaseDockerfile @@ -0,0 +1,58 @@ +FROM ubuntu:18.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PATH /root/miniconda/bin:$PATH + + +RUN apt-get update -y +RUN apt-get install -y \ + apt-utils \ + sudo \ + openssh-server \ + vim \ + git \ + curl \ + wget \ + tree \ + perl \ + kmod \ + make \ + pciutils \ + build-essential \ + python3.8-dev \ + python3-pip \ + libjpeg-dev \ + zlib1g-dev \ + unzip \ + cmake \ + bzip2 \ + cabextract \ + iputils-ping \ + pbzip2 \ + pv \ + numactl + +# Configure anaconda +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \ + bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \ +# eval "$(/root/miniconda/bin/conda shell.bash hook)" && \ + /root/miniconda/bin/conda clean -tipsy && \ + ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc && \ + conda config --set always_yes yes --set changeps1 no + + +RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`" + +# TODO: Remove pip source +RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +COPY requirements.txt requirements.txt +RUN /bin/bash -c "pip3 install -r requirements.txt" + + +WORKDIR /workspace/baai-perf + + + diff --git a/nlp/language_model/bert_sample/pytorch/base/bind_pyt.py b/nlp/language_model/bert_sample/pytorch/base/bind_pyt.py new file mode 100644 index 000000000..827787c6c --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/bind_pyt.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
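# Illustrative sketch, not part of the patch: the launcher below starts one process per
# GPU and passes rank information purely through environment variables (MASTER_ADDR,
# MASTER_PORT, WORLD_SIZE, RANK, LOCAL_RANK), so a worker only needs the standard
# env:// initialization to join the job. The backend and variable names mirror the code
# below; everything else is an assumption for the example.
import os
import torch

def init_worker_example(backend="nccl"):
    local_rank = int(os.environ["LOCAL_RANK"])   # exported per spawned process by the launcher
    torch.cuda.set_device(local_rank)
    torch.distributed.init_process_group(backend=backend, init_method="env://")
    return local_rank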
+ +import sys +import subprocess +import os +import os.path as ospath +import socket +from argparse import ArgumentParser, REMAINDER +import random +import psutil + +MODEL_DIR = ospath.abspath( + ospath.join( + __file__, + "../../" + ) +) + +PROJ_DIR = MODEL_DIR + +MODEL = ospath.basename(MODEL_DIR) + + +def _parse_known_args(parser, *args, **kwargs): + return parser.parse_known_args(*args, **kwargs) + + +def parse_args(): + parser = ArgumentParser(description="PyTorch distributed training launch " + "helper utilty that will spawn up " + "multiple distributed processes") + + parser.add_argument("--node_rank", type=int, default=0, + help="The rank of the node for multi-node distributed " + "training") + parser.add_argument("--master_addr", default="127.0.0.1", type=str, + help="Master node (rank 0)'s address, should be either " + "the IP address or the hostname of node 0, for " + "single node multi-proc training, the " + "--master_addr can simply be 127.0.0.1") + parser.add_argument("--master_port", default=random.randint(10000,65534), type=int, + help="Master node (rank 0)'s free port that needs to " + "be used for communciation during distributed " + "training") + parser.add_argument('--no_hyperthreads', action='store_true', + help='Flag to disable binding to hyperthreads') + parser.add_argument('--no_membind', action='store_true', + help='Flag to disable memory binding') + + # non-optional arguments for binding + parser.add_argument("--nsockets_per_node", type=int, required=True, + help="Number of CPU sockets on a node") + parser.add_argument("--ncores_per_socket", type=int, required=True, + help="Number of CPU cores per socket") + + parser.add_argument("--training_script", type=str, required=True, + help="The full path to the single GPU training " + "program/script to be launched in parallel, " + "followed by all the arguments for the " + "training script") + + parser.add_argument("--config", type=str, required=True) + parser.add_argument("--name", type=str, required=True) + + args, training_script_args = _parse_known_args(parser) + args.training_script_args = training_script_args + + return args + + +def get_cuda_visible_devices(gpus=1): + if "CUDA_VISIBLE_DEVICES" in os.environ: + return os.environ['CUDA_VISIBLE_DEVICES'] + return ','.join([str(gpu_id) for gpu_id in range(gpus)]) + +def is_port_in_use(port): + for conn in psutil.net_connections(): + if conn.laddr.port == port: + return True + + return False + +def get_process_info_by_port(port): + for conn in psutil.net_connections(kind='inet'): + if conn.laddr.port == port: + process = psutil.Process(conn.pid) + return { + "pid": conn.pid, + "name": process.name(), + "cmdline": process.cmdline(), + } + return None + +def main(): + args = parse_args() + config_full_name = f"config_{args.config}.py" + config_path = ospath.join(PROJ_DIR, args.name, "config", config_full_name) + + _, args.nnodes, args.nproc_per_node = args.config.split("x") + + args.nnodes = int(args.nnodes) + args.nproc_per_node = int(args.nproc_per_node) + + # variables for numactrl binding + + NSOCKETS = args.nsockets_per_node + NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (args.nproc_per_node % args.nsockets_per_node) else 0) + NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET + + # world size in terms of number of processes + dist_world_size = args.nproc_per_node * args.nnodes + + if is_port_in_use(args.master_port): + process_info = get_process_info_by_port(args.master_port) + if process_info: + print(f"端口:{args.master_port} 
被进程 {process_info['name']} 占用") + print(f"进程ID: {process_info['pid']}") + print(f"进程命令行: {' '.join(process_info['cmdline'])}") + else: + print(f"端口:{args.master_port} 被占用,请先关闭占用此端口进程 !") + return + else: + print(f"master port: {args.master_port} is ok") + + # set PyTorch distributed related environmental variables + current_env = os.environ.copy() + current_env["MASTER_ADDR"] = args.master_addr + current_env["MASTER_PORT"] = str(args.master_port) + current_env["WORLD_SIZE"] = str(dist_world_size) + current_env["NODE_RANK"] = str(args.node_rank) + current_env["CUDA_VISIBLE_DEVICES"] = get_cuda_visible_devices(args.nproc_per_node) + + print(args.master_addr) + print(args.master_port) + + processes = [] + + for local_rank in range(0, args.nproc_per_node): + # each process's rank + dist_rank = args.nproc_per_node * args.node_rank + local_rank + current_env["RANK"] = str(dist_rank) + current_env["LOCAL_RANK"] = str(local_rank) + + # form numactrl binding command + cpu_ranges = [local_rank * NCORES_PER_GPU, + (local_rank + 1) * NCORES_PER_GPU - 1, + local_rank * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS), + (local_rank + 1) * NCORES_PER_GPU + (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1] + + numactlargs = [] + if args.no_hyperthreads: + numactlargs += [ "--physcpubind={}-{}".format(*cpu_ranges[0:2]) ] + else: + numactlargs += [ "--physcpubind={}-{},{}-{}".format(*cpu_ranges) ] + + if not args.no_membind: + memnode = local_rank // NGPUS_PER_SOCKET + numactlargs += [ "--membind={}".format(memnode) ] + + # spawn the processes + cmd = [ "/usr/bin/numactl" ] \ + + numactlargs \ + + [ sys.executable, + "-u", + args.training_script, + "--local_rank={}".format(local_rank) + ] \ + + args.training_script_args + [f"{config_path}"] + + print("=" * 80) + print("= numactlargs_flag") + print(cmd) + print("=" * 80) + process = subprocess.Popen(cmd, env=current_env) + processes.append(process) + + proc_status = [] + + for process in processes: + process.wait() + proc_status.append(process.returncode != 0) + + exit(all(proc_status)) + + +if __name__ == "__main__": + main() + + diff --git a/nlp/language_model/bert_sample/pytorch/base/config/__init__.py b/nlp/language_model/bert_sample/pytorch/base/config/__init__.py new file mode 100644 index 000000000..258fb17d7 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/config/__init__.py @@ -0,0 +1,4 @@ +from ._base import * + +from .config_manager import activate_config_env + diff --git a/nlp/language_model/bert_sample/pytorch/base/config/_base.py b/nlp/language_model/bert_sample/pytorch/base/config/_base.py new file mode 100644 index 000000000..cecb8d69e --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/config/_base.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from typing import ClassVar +from train.event.base import BaseTrainingEventInterface + +# The train dir. 
Should contain train_dir, eval_dir, init_checkpoint, bert_config_path for the task. +data_dir: str = None + +# The train dir. Should contain .hdf5 files for the task. +train_dir: str = None + +# Bert pre-trained model selected in the list: +# bert-base-uncased, bert-large-uncased, bert-base-cased, +# bert-base-multilingual, bert-base-chinese. +bert_model: str = "bert-large-uncased" + +# The output directory where the model checkpoints will be written. +output_dir: str = None + +# The eval data dir. Should contain .hdf5 files for the task. +eval_dir: str = None + +# Sample to begin performing eval. +eval_iter_start_samples: int = 150000 + +# If set to -1, disable eval, else evaluate every eval_iter_samples during training +eval_iter_samples: int = 150000 + +# number of eval examples to run eval on +num_eval_examples: int = 10000 + +# whether to cache evaluation data on GPU +cache_eval_data: bool = False + +# The initial checkpoint to start training from. +init_checkpoint: str = None + +# The initial TF checkpoint to start training from. +init_tf_checkpoint: str = None + +# Whether to verify init checkpoint. +verify_checkpoint: bool = True + +# The maximum total input sequence length after WordPiece tokenization. +# Sequences longer than this will be truncated, and sequences shorter +# than this will be padded. +max_seq_length: int = 512 + +# The maximum total of masked tokens in input sequence +max_predictions_per_seq: int = 76 + +# Total batch size for training. +train_batch_size: int = 18 + +# Total batch size for training. +eval_batch_size: int = 128 + +# The initial learning rate for LAMB. +learning_rate: float = 4e-05 + +# weight decay rate for LAMB. +weight_decay_rate: float = 0.01 + +# LAMB beta1. +opt_lamb_beta_1: float = 0.9 + +# LAMB beta2. +opt_lamb_beta_2: float = 0.999 + +# Total number of training steps to perform. +max_steps: int = 1536 + +# Total number of training samples to run. +max_samples_termination: float = 14000000 + +# Proportion of optimizer update steps to perform linear learning rate warmup for. +# Typically 1/8th of steps for Phase2 +warmup_proportion: float = 0.01 + +# Number of optimizer update steps to perform linear learning rate warmup for. +# Typically 1/8th of steps for Phase2 +warmup_steps: int = 0 + +# Starting step for warmup. +start_warmup_step: int = 0 + +# local_rank for distributed training on gpus +local_rank: int = -1 + +# Communication backend for distributed training on gpus +dist_backend: str = "nccl" + +# random seed for initialization +seed: int = 42 + +# Number of updates steps to accumualte before performing a backward/update pass. +gradient_accumulation_steps: int = 1 + +# Whether to use 16-bit float precision instead of 32-bit +fp16: bool = False + +# Loss scaling, positive power of 2 values can improve fp16 convergence. +loss_scale: float = 0.0 + +# frequency of logging loss. If not positive, no logging is provided for training loss +log_freq: int = 1 + +# Whether to use gradient checkpointing +checkpoint_activations: bool = False + +# Whether to resume training from checkpoint. +# If set, precedes init_checkpoint/init_tf_checkpoint +resume_from_checkpoint: bool = False + +# The initial checkpoint to start continue training from. +resume_init_checkpoint: str = None + +# Number of checkpoints to keep (rolling basis). +keep_n_most_recent_checkpoints: int = 20 + +# Number of update steps until a model checkpoint is saved to disk. 
+num_samples_per_checkpoint: int = 500000 + +# Number of update steps until model checkpoints start saving to disk. +min_samples_to_start_checkpoints: int = 3000000 + +# Whether to save checkpoints +save_checkpoint: bool = False + +# Whether to run training. +do_train: bool = False + +# Whether to run with unpadding. +exchange_padding: bool = False + +# Whether to disable fusion of attention mask to softmax and dropout. +enable_fuse_dropout: bool = False + +# Whether to disable fusion of the attention mask to softmax. +disable_fuse_mask: bool = False + +# Whether to run with optimizations. +fused_gelu_bias: bool = False + +# Whether to run with optimizations. +fused_dropout_add: bool = False + +# Whether to run with optimizations. +dense_seq_output: bool = False + +# Whether to read local rank from ENVVAR +use_env: bool = False + +# Path bert_config.json is located in +bert_config_path: str = None + +# Stop training after reaching this Masked-LM accuracy +target_mlm_accuracy: float = 0.720 + +# Average accuracy over this amount of batches before performing a stopping criterion test +train_mlm_accuracy_window_size: int = 0 + +# Number of epochs to plan seeds for. Same set across all workers. +# num_epochs_to_generate_seeds_for: int = 2 +num_epochs_to_generate_seeds_for: int = 3 + +# Enable DDP. +use_ddp: bool = False + +# Turn ON gradient_as_bucket_view optimization in native DDP. +use_gradient_as_bucket_view: bool = False + +# A object to provide some core components in training +training_event: ClassVar[BaseTrainingEventInterface] = None + +# device +device: str = None +n_gpu: int = 1 + +eval_interval_samples: int = 0 +eval_steps = 1000 diff --git a/nlp/language_model/bert_sample/pytorch/base/config/config_manager.py b/nlp/language_model/bert_sample/pytorch/base/config/config_manager.py new file mode 100644 index 000000000..69520ad43 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/config/config_manager.py @@ -0,0 +1,176 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + + +import copy +import importlib +import inspect +import os +import sys +from argparse import ArgumentParser +from typing import Iterable, Mapping + +import config as global_config +from . 
import _base as base_config +from .mutable_params import mutable_params + +mutable_params = copy.copy(mutable_params) +immutable_params = set(global_config.__dict__.keys()) - set(mutable_params) + + +def get_config(config_path: str): + if os.path.exists(config_path): + abs_path = config_path + sys.path.append(os.path.dirname(abs_path)) + config_path = os.path.basename(config_path).replace(".py", "") + try: + module = importlib.import_module(config_path) + except Exception as ex: + sys.path.pop(-1) + raise ex + sys.path.pop(-1) + else: + raise FileNotFoundError("Not found config:", config_path) + + return module + + +def get_annotations(other_modules: list=None): + annotations = dict() + + if "__annotations__" in base_config.__dict__: + annotations.update(base_config.__dict__["__annotations__"]) + + if other_modules is not None: + for mod in other_modules: + if isinstance(mod, str): + mod = get_config(mod) + if "__annotations__" in mod.__dict__: + annotations.update(mod.__dict__["__annotations__"]) + + return annotations + + +def is_property(name: str, value): + status = [ + not name.startswith('__'), + not callable(value), + not inspect.isclass(value), + not inspect.ismodule(value), + not inspect.ismethod(value), + not inspect.isfunction(value), + not inspect.isbuiltin(value), + ] + + return all(status) + + +def get_properties_from_config(config): + if not isinstance(config, Mapping): + config = config.__dict__ + properties = dict() + for name, value in config.items(): + if is_property(name, value): + properties[name] = value + + return properties + + +def add_to_argparser(config: dict, parser: ArgumentParser, other_modules: list=None): + annotations = get_annotations(other_modules) + + def get_property_type(name, value): + if value is not None: + return type(value) + if name in annotations: + return annotations[name] + return str + + def add_args(parser, name, value, prefix=''): + dtype = get_property_type(prefix + name, value) + + if dtype == str: + parser.add_argument('--' + prefix + name, type=str, default=None) + elif dtype == int: + parser.add_argument('--' + prefix + name, type=int, default=None) + elif dtype == float: + parser.add_argument('--' + prefix + name, type=float, default=None) + elif dtype == bool: + parser.add_argument('--' + prefix + name, action=f"store_{str(not value).lower()}", default=None) + elif isinstance(value, Mapping): + for k, v in value.items(): + add_args(parser, k, v, prefix=prefix + name + ".") + elif isinstance(value, Iterable) and not isinstance(value, Mapping): + parser.add_argument('--' + prefix + name, type=type(value[0]), nargs='+', default=None) + # else: + # print(f'WARN: Cannot parse key {prefix + name} of type {type(value)}.') + + for name, value in config.items(): + if not is_property(name, value): + continue + add_args(parser, name, value) + + +def _merge_dict_to_config(src: dict, dist: dict, ignore_none=True): + for arg, value in src.items(): + if ignore_none and value is None: + continue + dist[arg] = value + + +def parse_from_args(config: dict, parser=None, other_modules: list=None, with_config_env_name: bool=False): + if parser is None: + parser = ArgumentParser() + + add_to_argparser(config, parser, other_modules) + if with_config_env_name: + parser.add_argument("config", type=str, help="Config name") + + args = parser.parse_args() + return args + + +def activate_config_env(name=None, parser=None, parse_args=True, with_config_env_name: bool=False): + global_config_copy_ = copy.copy(global_config.__dict__) + + if parse_args: + args_dict = 
dict() + for mutable_param in mutable_params: + args_dict[mutable_param] = global_config_copy_[mutable_param] + args = parse_from_args(args_dict, parser, with_config_env_name=with_config_env_name) + del args_dict + if name is None and with_config_env_name: + name = args.config + + if name is None: + raise RuntimeError("Argument `name` must be given.") + + external_module_params = copy.copy(get_config(name).__dict__) + for immutable_param in immutable_params: + if immutable_param in external_module_params: + external_module_params.pop(immutable_param) + + _merge_dict_to_config(global_config_copy_, global_config.__dict__) + _merge_dict_to_config(external_module_params, global_config.__dict__) + if parse_args: + _merge_dict_to_config(args.__dict__, global_config.__dict__) + + +def print_config(config=None): + if config is None: + config = global_config + properties = get_properties_from_config(config) + config_fields = [] + for name, value in properties.items(): + config_fields.append(f"{name}={value}") + + config_fields = ", ".join(config_fields) + config_str = f"Config({config_fields})" + print(config_str) \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/config/mutable_params.py b/nlp/language_model/bert_sample/pytorch/base/config/mutable_params.py new file mode 100644 index 000000000..517668dea --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/config/mutable_params.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+ + +mutable_params = [ + "train_batch_size", "eval_batch_size", "learning_rate", "weight_decay_rate", "opt_lamb_beta_1", + "opt_lamb_beta_2", "max_steps", "max_samples_termination", "warmup_proportion", "warmup_steps", + "start_warmup_step", "dist_backend", "seed", "gradient_accumulation_steps", "fp16", + "loss_scale", "exchange_padding", "enable_fuse_dropout", "disable_fuse_mask", "fused_gelu_bias", + "fused_dropout_add", "dense_seq_output", "cache_eval_data", "training_event","output_dir","save_checkpoint","eval_steps","init_checkpoint","target_mlm_accuracy" +] + +mutable_params += [ + "local_rank", + "do_train", + "data_dir", + "log_freq" +] \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/create_container.sh b/nlp/language_model/bert_sample/pytorch/base/create_container.sh new file mode 100644 index 000000000..23e41bd2c --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/create_container.sh @@ -0,0 +1,184 @@ +# ================================================= +# Constants +# ================================================= + +MODEL="bert" +export MODEL +DOCKER_IMAGE="perf:${MODEL}" +NEXP=1 + +# TODO: Add to Dockerfile +WORK_DIR="/workspace/baai-perf" +MODEL_DIR="${WORK_DIR}/benchmarks/${MODEL}/pytorch" + +CURRENT_DIR=$(cd `dirname $0`; pwd) +PROJ_DIR="${CURRENT_DIR}/../../../" +BUILD_EXTENSION_DIR="${CURRENT_DIR}/build" +BUILD_EXTENSION_PACKAGE_NAME="ext_ops" + +BASE_DOCKERFILE_PATH="${CURRENT_DIR}/BaseDockerfile" +HOST_DOCKERFILE_PATH="${CURRENT_DIR}/Dockerfile" + +SOURCE_DATA_DIR="" +MAP_DATA_DIR="/mnt/dataset/perf/${MODEL}" +SUBMITTER="default" +CONFIG="" + +: "${CLEAR_CACHES:=1}" +SHM_SIZE="32g" + + +# ================================================= +# Parse arguments +# ================================================= + +i=2 +TRAINING_SCRIPT_ARGS="$@" +for arg in "$@" +do + if [[ $arg =~ "--data_dir" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SOURCE_DATA_DIR=${kv[1]} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/$arg/"--data_dir ${MAP_DATA_DIR}"} + else + SOURCE_DATA_DIR=${!i} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/"--data_dir ${!i}"/"--data_dir ${MAP_DATA_DIR}"} + fi + + elif [[ $arg =~ "--name" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SUBMITTER=${kv[1]} + else + SUBMITTER=${!i} + fi + + elif [[ $arg =~ "--config" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + CONFIG=${kv[1]} + else + CONFIG=${!i} + fi + fi + + let i++ +done + + +# ================================================= +# Check arguments +# ================================================= + +if [[ "${SOURCE_DATA_DIR}" == "" ]]; then + echo "ERROR: data_dir is not given, please set --data_dir " + exit 1 +fi + +if [[ "${CONFIG}" == "" ]]; then + echo "ERROR: config is not given, please set --config " + exit 1 +fi + +CONTAINER_SUBMITTER_DIR="${WORK_DIR}/${SUBMITTER}" +HOST_SUBMITTER_DIR="${PROJ_DIR}/${SUBMITTER}" + +CONTAINER_ENVIRONMENT_VARIABLES_PATH=${CONTAINER_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh +HOST_ENVIRONMENT_VARIABLES_PATH="${HOST_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh" + +HOST_SUBMITTER_DOCKERFILE="${PROJ_DIR}/${SUBMITTER}/${MODEL}/config/Dockerfile" +CONTAINER_NAME="bert_ckpt" + +if [ ! 
-f "${HOST_ENVIRONMENT_VARIABLES_PATH}" ]; then + touch "${HOST_ENVIRONMENT_VARIABLES_PATH}" +fi + +source ${HOST_ENVIRONMENT_VARIABLES_PATH} + +RESULTS_DIR="${PROJ_DIR}/${SUBMITTER}/${MODEL}/results" +LOG_FILE_BASE="${RESULTS_DIR}/config_${CONFIG}_experiment" + +echo "======================================" +echo "Arguments" +echo "---------" + +echo "MODEL = ${MODEL}" +echo "CONTAINER_NAME = ${CONTAINER_NAME}" +echo "DOCKER_IMAGE = ${DOCKER_IMAGE}" +echo "MODEL_DIR = ${MODEL_DIR}" +echo "SUBMITTER = ${SUBMITTER}" +echo "CONTAINER_SUBMITTER_DIR = ${CONTAINER_SUBMITTER_DIR}" +echo "HOST_SUBMITTER_DOCKERFILE = ${HOST_SUBMITTER_DOCKERFILE}" +echo "CONFIG = ${CONFIG}" +echo "CONTAINER_MOUNTS = ${CONTAINER_MOUNTS}" +echo "TRAINING_SCRIPT_ARGS = ${TRAINING_SCRIPT_ARGS[*]}" +echo "CURRENT_DIR = ${CURRENT_DIR}" +echo "CONTAINER_ENVIRONMENT_VARIABLES_PATH = ${CONTAINER_ENVIRONMENT_VARIABLES_PATH}" +echo "RESULTS_DIR = ${RESULTS_DIR}" +echo "LOG_FILE_BASE = ${LOG_FILE_BASE}" +echo "SHM_SIZE = ${SHM_SIZE}" +echo "======================================" + + +# ================================================= +# Training +# ================================================= + +# Cleanup container +# cleanup_docker() { +# docker container rm -f "${CONTAINER_NAME}" || true +# } +# cleanup_docker +# trap 'set -eux; cleanup_docker' EXIT + +# Clean built extension +if [ -d "${BUILD_EXTENSION_DIR}" ]; then + echo "WARN: Delete built extension" + rm -rf "${BUILD_EXTENSION_DIR}" + rm -rf ${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so + echo "extension file: "${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so"" +fi + + +# Build image +if [ -f "${HOST_DOCKERFILE_PATH}" ]; then + echo "WARN: Remove previous Dockerfile" + rm -f "${HOST_DOCKERFILE_PATH}" +fi + +echo "WARN: cp BaseDockerfile to Dockerfile" +cp "${BASE_DOCKERFILE_PATH}" "${HOST_DOCKERFILE_PATH}" + +if [ -f "${HOST_SUBMITTER_DOCKERFILE}" ]; then + echo "WARN: Found submitter's Dockerfile, merging submitter's Dockerfile to Dockerfile" + cat "${HOST_SUBMITTER_DOCKERFILE}" >> "${HOST_DOCKERFILE_PATH}" +fi + +docker build -t ${DOCKER_IMAGE} ./ + +# Setup container by Dockerfile +docker run --init --detach \ + --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \ + --privileged=true \ + --ulimit=stack=67108864 --ulimit=memlock=-1 \ + -w ${MODEL_DIR} \ + --shm-size="${SHM_SIZE}" \ + --volume ${SOURCE_DATA_DIR}:${MAP_DATA_DIR} \ + --volume ${PROJ_DIR}:${WORK_DIR} \ + --name="${CONTAINER_NAME}" ${CONTAINER_MOUNTS} \ + "${DOCKER_IMAGE}" sleep infinity + +# make sure container has time to finish initialization +# TODO: Uncomment +#sleep 30 +docker exec -it "${CONTAINER_NAME}" true + +mkdir -p ${RESULTS_DIR} +docker exec -it "${CONTAINER_NAME}" sh -c "chmod 777 run_training.sh" + +# TODO: Remove pip source +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};python3 prepare.py --name ${SUBMITTER} --data_dir ${MAP_DATA_DIR}" + diff --git a/nlp/language_model/bert_sample/pytorch/base/create_contaner.sh b/nlp/language_model/bert_sample/pytorch/base/create_contaner.sh new file mode 100644 index 000000000..b6b97a037 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/create_contaner.sh @@ -0,0 +1,183 @@ +# ================================================= +# Constants +# ================================================= + +MODEL="bert" +export MODEL 
+DOCKER_IMAGE="perf:${MODEL}" +NEXP=5 + +# TODO: Add to Dockerfile +WORK_DIR="/workspace/baai-perf" +MODEL_DIR="${WORK_DIR}/benchmarks/${MODEL}/pytorch" + +CURRENT_DIR=$(cd `dirname $0`; pwd) +PROJ_DIR="${CURRENT_DIR}/../../../" +BUILD_EXTENSION_DIR="${CURRENT_DIR}/build" +BUILD_EXTENSION_PACKAGE_NAME="ext_ops" + +BASE_DOCKERFILE_PATH="${CURRENT_DIR}/BaseDockerfile" +HOST_DOCKERFILE_PATH="${CURRENT_DIR}/Dockerfile" + +SOURCE_DATA_DIR="" +MAP_DATA_DIR="/mnt/dataset/perf/${MODEL}" +SUBMITTER="iluvatar" +CONFIG="" + +: "${CLEAR_CACHES:=1}" +SHM_SIZE="32g" + + +# ================================================= +# Parse arguments +# ================================================= + +i=2 +TRAINING_SCRIPT_ARGS="$@" +for arg in "$@" +do + if [[ $arg =~ "--data_dir" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SOURCE_DATA_DIR=${kv[1]} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/$arg/"--data_dir ${MAP_DATA_DIR}"} + else + SOURCE_DATA_DIR=${!i} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/"--data_dir ${!i}"/"--data_dir ${MAP_DATA_DIR}"} + fi + + elif [[ $arg =~ "--name" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SUBMITTER=${kv[1]} + else + SUBMITTER=${!i} + fi + + elif [[ $arg =~ "--config" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + CONFIG=${kv[1]} + else + CONFIG=${!i} + fi + fi + + let i++ +done + + +# ================================================= +# Check arguments +# ================================================= + +if [[ "${SOURCE_DATA_DIR}" == "" ]]; then + echo "ERROR: data_dir is not given, please set --data_dir " + exit 1 +fi + +if [[ "${CONFIG}" == "" ]]; then + echo "ERROR: config is not given, please set --config " + exit 1 +fi + +CONTAINER_SUBMITTER_DIR="${WORK_DIR}/${SUBMITTER}" +HOST_SUBMITTER_DIR="${PROJ_DIR}/${SUBMITTER}" + +CONTAINER_ENVIRONMENT_VARIABLES_PATH=${CONTAINER_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh +HOST_ENVIRONMENT_VARIABLES_PATH="${HOST_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh" + +HOST_SUBMITTER_DOCKERFILE="${PROJ_DIR}/${SUBMITTER}/${MODEL}/config/Dockerfile" +CONTAINER_NAME="bert_test" + +if [ ! 
-f "${HOST_ENVIRONMENT_VARIABLES_PATH}" ]; then + touch "${HOST_ENVIRONMENT_VARIABLES_PATH}" +fi + +source ${HOST_ENVIRONMENT_VARIABLES_PATH} + +RESULTS_DIR="${PROJ_DIR}/${SUBMITTER}/${MODEL}/results" +LOG_FILE_BASE="${RESULTS_DIR}/config_${CONFIG}_experiment" + +echo "======================================" +echo "Arguments" +echo "---------" + +echo "MODEL = ${MODEL}" +echo "CONTAINER_NAME = ${CONTAINER_NAME}" +echo "DOCKER_IMAGE = ${DOCKER_IMAGE}" +echo "MODEL_DIR = ${MODEL_DIR}" +echo "SUBMITTER = ${SUBMITTER}" +echo "CONTAINER_SUBMITTER_DIR = ${CONTAINER_SUBMITTER_DIR}" +echo "HOST_SUBMITTER_DOCKERFILE = ${HOST_SUBMITTER_DOCKERFILE}" +echo "CONFIG = ${CONFIG}" +echo "CONTAINER_MOUNTS = ${CONTAINER_MOUNTS}" +echo "TRAINING_SCRIPT_ARGS = ${TRAINING_SCRIPT_ARGS[*]}" +echo "CURRENT_DIR = ${CURRENT_DIR}" +echo "CONTAINER_ENVIRONMENT_VARIABLES_PATH = ${CONTAINER_ENVIRONMENT_VARIABLES_PATH}" +echo "RESULTS_DIR = ${RESULTS_DIR}" +echo "LOG_FILE_BASE = ${LOG_FILE_BASE}" +echo "SHM_SIZE = ${SHM_SIZE}" +echo "======================================" + + +# ================================================= +# Training +# ================================================= + +# Cleanup container +# cleanup_docker() { +# docker container rm -f "${CONTAINER_NAME}" || true +# } +# cleanup_docker +# trap 'set -eux; cleanup_docker' EXIT + +# Clean built extension +if [ -d "${BUILD_EXTENSION_DIR}" ]; then + echo "WARN: Delete built extension" + rm -rf "${BUILD_EXTENSION_DIR}" + rm -rf ${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so + echo "extension file: "${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so"" +fi + + +# Build image +if [ -f "${HOST_DOCKERFILE_PATH}" ]; then + echo "WARN: Remove previous Dockerfile" + rm -f "${HOST_DOCKERFILE_PATH}" +fi + +echo "WARN: cp BaseDockerfile to Dockerfile" +cp "${BASE_DOCKERFILE_PATH}" "${HOST_DOCKERFILE_PATH}" + +if [ -f "${HOST_SUBMITTER_DOCKERFILE}" ]; then + echo "WARN: Found submitter's Dockerfile, merging submitter's Dockerfile to Dockerfile" + cat "${HOST_SUBMITTER_DOCKERFILE}" >> "${HOST_DOCKERFILE_PATH}" +fi + +docker build -t ${DOCKER_IMAGE} ./ + +# Setup container by Dockerfile +docker run -itd \ +--gpus all \ + --net=host --uts=host --ipc=host \ + --privileged \ + -w ${MODEL_DIR} \ + --shm-size="${SHM_SIZE}" \ + --volume ${SOURCE_DATA_DIR}:${MAP_DATA_DIR} \ + --volume ${PROJ_DIR}:${WORK_DIR} \ + --name="${CONTAINER_NAME}" ${CONTAINER_MOUNTS} \ + "${DOCKER_IMAGE}" + +# make sure container has time to finish initialization +# TODO: Uncomment +#sleep 30 +docker exec -it "${CONTAINER_NAME}" true + +mkdir -p ${RESULTS_DIR} +docker exec -it "${CONTAINER_NAME}" sh -c "chmod 777 run_training.sh" + +# TODO: Remove pip source +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};python3 prepare.py --name ${SUBMITTER} --data_dir ${MAP_DATA_DIR}" \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/create_contaner_bi.sh b/nlp/language_model/bert_sample/pytorch/base/create_contaner_bi.sh new file mode 100644 index 000000000..0c736a36e --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/create_contaner_bi.sh @@ -0,0 +1,184 @@ +# ================================================= +# Constants +# ================================================= + +source ../../../iluvatar/bert/config/environment_variables.sh + +MODEL="bert" +export MODEL 
+DOCKER_IMAGE="perf:${MODEL}" +NEXP=5 + +# TODO: Add to Dockerfile +WORK_DIR="/workspace/baai-perf" +MODEL_DIR="${WORK_DIR}/benchmarks/${MODEL}/pytorch" + +CURRENT_DIR=$(cd `dirname $0`; pwd) +PROJ_DIR="${CURRENT_DIR}/../../../" +BUILD_EXTENSION_DIR="${CURRENT_DIR}/build" +BUILD_EXTENSION_PACKAGE_NAME="ext_ops" + +BASE_DOCKERFILE_PATH="${CURRENT_DIR}/BaseDockerfile" +HOST_DOCKERFILE_PATH="${CURRENT_DIR}/Dockerfile" + +SOURCE_DATA_DIR="" +MAP_DATA_DIR="/mnt/dataset/perf/${MODEL}" +SUBMITTER="iluvatar" +CONFIG="" + +: "${CLEAR_CACHES:=1}" +SHM_SIZE="32g" + + +# ================================================= +# Parse arguments +# ================================================= + +i=2 +TRAINING_SCRIPT_ARGS="$@" +for arg in "$@" +do + if [[ $arg =~ "--data_dir" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SOURCE_DATA_DIR=${kv[1]} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/$arg/"--data_dir ${MAP_DATA_DIR}"} + else + SOURCE_DATA_DIR=${!i} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/"--data_dir ${!i}"/"--data_dir ${MAP_DATA_DIR}"} + fi + + elif [[ $arg =~ "--name" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SUBMITTER=${kv[1]} + else + SUBMITTER=${!i} + fi + + elif [[ $arg =~ "--config" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + CONFIG=${kv[1]} + else + CONFIG=${!i} + fi + fi + + let i++ +done + + +# ================================================= +# Check arguments +# ================================================= + +if [[ "${SOURCE_DATA_DIR}" == "" ]]; then + echo "ERROR: data_dir is not given, please set --data_dir " + exit 1 +fi + +if [[ "${CONFIG}" == "" ]]; then + echo "ERROR: config is not given, please set --config " + exit 1 +fi + +CONTAINER_SUBMITTER_DIR="${WORK_DIR}/${SUBMITTER}" +HOST_SUBMITTER_DIR="${PROJ_DIR}/${SUBMITTER}" + +CONTAINER_ENVIRONMENT_VARIABLES_PATH=${CONTAINER_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh +HOST_ENVIRONMENT_VARIABLES_PATH="${HOST_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh" + +HOST_SUBMITTER_DOCKERFILE="${PROJ_DIR}/${SUBMITTER}/${MODEL}/config/Dockerfile" +CONTAINER_NAME="bert_test" + +if [ ! 
-f "${HOST_ENVIRONMENT_VARIABLES_PATH}" ]; then + touch "${HOST_ENVIRONMENT_VARIABLES_PATH}" +fi + +source ${HOST_ENVIRONMENT_VARIABLES_PATH} + +RESULTS_DIR="${PROJ_DIR}/${SUBMITTER}/${MODEL}/results" +LOG_FILE_BASE="${RESULTS_DIR}/config_${CONFIG}_experiment" + +echo "======================================" +echo "Arguments" +echo "---------" + +echo "MODEL = ${MODEL}" +echo "CONTAINER_NAME = ${CONTAINER_NAME}" +echo "DOCKER_IMAGE = ${DOCKER_IMAGE}" +echo "MODEL_DIR = ${MODEL_DIR}" +echo "SUBMITTER = ${SUBMITTER}" +echo "CONTAINER_SUBMITTER_DIR = ${CONTAINER_SUBMITTER_DIR}" +echo "HOST_SUBMITTER_DOCKERFILE = ${HOST_SUBMITTER_DOCKERFILE}" +echo "CONFIG = ${CONFIG}" +echo "CONTAINER_MOUNTS = ${CONTAINER_MOUNTS}" +echo "TRAINING_SCRIPT_ARGS = ${TRAINING_SCRIPT_ARGS[*]}" +echo "CURRENT_DIR = ${CURRENT_DIR}" +echo "CONTAINER_ENVIRONMENT_VARIABLES_PATH = ${CONTAINER_ENVIRONMENT_VARIABLES_PATH}" +echo "RESULTS_DIR = ${RESULTS_DIR}" +echo "LOG_FILE_BASE = ${LOG_FILE_BASE}" +echo "SHM_SIZE = ${SHM_SIZE}" +echo "======================================" + + +# ================================================= +# Training +# ================================================= + +# Cleanup container +# cleanup_docker() { +# docker container rm -f "${CONTAINER_NAME}" || true +# } +# cleanup_docker +# trap 'set -eux; cleanup_docker' EXIT + +# Clean built extension +if [ -d "${BUILD_EXTENSION_DIR}" ]; then + echo "WARN: Delete built extension" + rm -rf "${BUILD_EXTENSION_DIR}" + rm -rf ${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so + echo "extension file: "${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so"" +fi + + +# Build image +if [ -f "${HOST_DOCKERFILE_PATH}" ]; then + echo "WARN: Remove previous Dockerfile" + rm -f "${HOST_DOCKERFILE_PATH}" +fi + +echo "WARN: cp BaseDockerfile to Dockerfile" +cp "${BASE_DOCKERFILE_PATH}" "${HOST_DOCKERFILE_PATH}" + +if [ -f "${HOST_SUBMITTER_DOCKERFILE}" ]; then + echo "WARN: Found submitter's Dockerfile, merging submitter's Dockerfile to Dockerfile" + cat "${HOST_SUBMITTER_DOCKERFILE}" >> "${HOST_DOCKERFILE_PATH}" +fi + +docker build -t ${DOCKER_IMAGE} ./ + +# Setup container by Dockerfile +docker run -itd \ + --net=host --uts=host --ipc=host \ + --privileged \ + -w ${MODEL_DIR} \ + --shm-size="${SHM_SIZE}" \ + --volume ${SOURCE_DATA_DIR}:${MAP_DATA_DIR} \ + --volume ${PROJ_DIR}:${WORK_DIR} \ + --name="${CONTAINER_NAME}" ${CONTAINER_MOUNTS} \ + "${DOCKER_IMAGE}" + +# make sure container has time to finish initialization +# TODO: Uncomment +#sleep 30 +docker exec -it "${CONTAINER_NAME}" true + +mkdir -p ${RESULTS_DIR} +docker exec -it "${CONTAINER_NAME}" sh -c "chmod 777 run_training.sh" + +# TODO: Remove pip source +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};python3 prepare.py --name ${SUBMITTER} --data_dir ${MAP_DATA_DIR}" \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/2048_shards_varlength.chk b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/2048_shards_varlength.chk new file mode 100644 index 000000000..830fb5eb5 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/2048_shards_varlength.chk @@ -0,0 +1,2048 @@ +part_00000_of_02048.hdf5 ac517231f748d9ae0dad04cce21ab01d +part_00001_of_02048.hdf5 72eb0206ee55007fef7697887a9a9854 +part_00002_of_02048.hdf5 
3a32e5dbc7f778f103a46e63f829b2a7 +part_00003_of_02048.hdf5 b0d3b517a6fdf06cdba6eabc5353ac69 +part_00004_of_02048.hdf5 2bd928b45c8e3eb21263ed561c636101 +part_00005_of_02048.hdf5 feeb99a68cf1f7b46cec3798ad7039bd +part_00006_of_02048.hdf5 7b37c353db09c52eb097691ae5c31d8c +part_00007_of_02048.hdf5 3adef5c0b25c407ff705832c646eaa5b +part_00008_of_02048.hdf5 f843a3f740659ef85240887829e2cd0b +part_00009_of_02048.hdf5 33f02eb4e489a790b57cd61ea49cb247 +part_00010_of_02048.hdf5 2188a09df8316ead693d45db26c93289 +part_00011_of_02048.hdf5 d5ed18add059adb40167df7b6d9b78e2 +part_00012_of_02048.hdf5 c5848e8d70f104c3d838eb5944d3c891 +part_00013_of_02048.hdf5 c53bf0fe33509cc8183faf2b7a9b8929 +part_00014_of_02048.hdf5 7d0f79579baf22b3537d15b749865896 +part_00015_of_02048.hdf5 873189f098e7f86d07a493cc966fab4b +part_00016_of_02048.hdf5 c7017402069bc19dd5a8d7a7ce92f04c +part_00017_of_02048.hdf5 5a016fcc2dc8b59d7caaacac8525331e +part_00018_of_02048.hdf5 c3721c16dffec8a213efc382a8702692 +part_00019_of_02048.hdf5 5bc52bcba2465b174b19249a70b679bc +part_00020_of_02048.hdf5 0724c25f8eadab720b4d34433ba85a22 +part_00021_of_02048.hdf5 8522fd94f477609b3c90a99bd58a2d62 +part_00022_of_02048.hdf5 d974a0a6fdb86c8661b3c06cde955c9e +part_00023_of_02048.hdf5 dd54cdbc5539ea41a0f3a05e68218631 +part_00024_of_02048.hdf5 742cecbcf1c9fdd6960d7c237a63b602 +part_00025_of_02048.hdf5 7d684040ac11bff2f624684524437a46 +part_00026_of_02048.hdf5 e9b5626339242dc61e63370f8b7c7e52 +part_00027_of_02048.hdf5 adf4175da231082a5e9f2b97fccdcedf +part_00028_of_02048.hdf5 f92fefaa5288a1b331f50ab0b5d9b016 +part_00029_of_02048.hdf5 b25111930d986497d6e00cad19ec5c30 +part_00030_of_02048.hdf5 15ee4cd55ada1579f880e3a0288b014e +part_00031_of_02048.hdf5 90220b6fab5ffb1120a28e0bebb92139 +part_00032_of_02048.hdf5 c6f405d259ab900d0dec701ec3f33191 +part_00033_of_02048.hdf5 6f2c6e395d4925df8f88c001052f311e +part_00034_of_02048.hdf5 773eb1e8349ac73d038c9377acf742a5 +part_00035_of_02048.hdf5 400851d03afcbad124cc639f79dfd161 +part_00036_of_02048.hdf5 4bb8a508fb09a1b923fd852604ad4d57 +part_00037_of_02048.hdf5 f8639af21ec6c93638b84c5c9f46263d +part_00038_of_02048.hdf5 30e8d4f33732e25268c53d36dc51e394 +part_00039_of_02048.hdf5 0c60280a020ffb350851f1e6f100a564 +part_00040_of_02048.hdf5 3fb33476f560019ebc76095e3e2ef432 +part_00041_of_02048.hdf5 1deddaac3ac84a41bcbbd596cf1790c1 +part_00042_of_02048.hdf5 3c66716ab83b27c1048c371b584de08b +part_00043_of_02048.hdf5 0df7450ce4f356c418134c341228588c +part_00044_of_02048.hdf5 e08de48adce90607c074552ca231fb55 +part_00045_of_02048.hdf5 2d86c4ccd2436a19f062ecebfe7cb3bf +part_00046_of_02048.hdf5 d7a9f3978bef69ccb0a6b3b0c7ebf784 +part_00047_of_02048.hdf5 5f5861661687330df56e40e5ae8d10c8 +part_00048_of_02048.hdf5 480a1d72986b4976898f25b3cbeb8a37 +part_00049_of_02048.hdf5 312466a38e51d445d6c3578db65801db +part_00050_of_02048.hdf5 23616caa8bfbe61988bf5ac69751f45a +part_00051_of_02048.hdf5 1b847150e7ea88db2f6b6f2171b616a9 +part_00052_of_02048.hdf5 c02bb625bdc466a0a189d6847ab3e770 +part_00053_of_02048.hdf5 6501231222bbcac3ffbb5cebc3468299 +part_00054_of_02048.hdf5 f21ffafbd44504da49f5b995b9bd25ce +part_00055_of_02048.hdf5 b3135f5688dd46b5332ca61bf90cd26e +part_00056_of_02048.hdf5 0d6c66bfdbe7b3d575cbe5ba4a04014f +part_00057_of_02048.hdf5 706187a92b002d190a964d22d1c75779 +part_00058_of_02048.hdf5 c3a5a7c3883f1d674204e2b4327c489c +part_00059_of_02048.hdf5 a04c10a4e9c875c19bcf8d33abaf8d53 +part_00060_of_02048.hdf5 fd89fa6cc873ae5c9576304f0ca1180c +part_00061_of_02048.hdf5 063e958a56189a4c912e69103309c218 +part_00062_of_02048.hdf5 
844111d3923ea463a7c135ca2219ef41 +part_00063_of_02048.hdf5 348b058cc8465460070d1b1d78dcc87d +part_00064_of_02048.hdf5 12dbfbf88a48785d46cd69139bd1dc75 +part_00065_of_02048.hdf5 692f2e0f99bde94750498c5665158cdc +part_00066_of_02048.hdf5 041f140df3f3dad66c6b47b40a0296e3 +part_00067_of_02048.hdf5 eeb836f177be6f7afa572fca8cd36b52 +part_00068_of_02048.hdf5 dec3a6143f422a50ceed7fa9924d53f3 +part_00069_of_02048.hdf5 277a7885e36eaec7c7d425aa1baf9275 +part_00070_of_02048.hdf5 0b4e913fc88544a5cf8081e5ff4c6227 +part_00071_of_02048.hdf5 84870168c76045746b066d86f4048892 +part_00072_of_02048.hdf5 154934587a785e4fc5d0a6df9b4a3be2 +part_00073_of_02048.hdf5 1151aa05f7bd724bbdc889aee546e840 +part_00074_of_02048.hdf5 faa07c8e13b9ba89bc0f001653beeec9 +part_00075_of_02048.hdf5 699f68cab3474af86a91262e22a251c3 +part_00076_of_02048.hdf5 8545df227bc6114b539cd4cb647e10c0 +part_00077_of_02048.hdf5 b3f5dfe52489ab274aae8f8b148853d9 +part_00078_of_02048.hdf5 a58f058988e038d0cf77715069f85ef7 +part_00079_of_02048.hdf5 a1f77de1318e09fa82ae3e268ca20213 +part_00080_of_02048.hdf5 bb5c3cce7540964e484868e082e3d03d +part_00081_of_02048.hdf5 8ae1b874af3ec9ce591e3c35764b5bcc +part_00082_of_02048.hdf5 2fd796e3d021eeed98cb8a29cef5f1ea +part_00083_of_02048.hdf5 bafaef8e03b21f2d459164acac82b608 +part_00084_of_02048.hdf5 afc1a7ae7c0887641991c7ba4eea4f4b +part_00085_of_02048.hdf5 3b143f182fc5e603fb0961f2253f0e9a +part_00086_of_02048.hdf5 a6e771b920611cd93c7b46f15aecec1b +part_00087_of_02048.hdf5 478fb4b4d2a99deade384f01513a8d2d +part_00088_of_02048.hdf5 c97b4f4f17c72bebce9ee946ce06a5e0 +part_00089_of_02048.hdf5 dfa8282917495d4aa425e9eb435502e7 +part_00090_of_02048.hdf5 d800d623e77b225f3a9f4ae89e17636e +part_00091_of_02048.hdf5 0bed36e17602bbd02001333f64c49b69 +part_00092_of_02048.hdf5 b471888097d7af7e48e8c208bd46fa0f +part_00093_of_02048.hdf5 62d596aebd4510efeb8c7a366605e885 +part_00094_of_02048.hdf5 a21bc7aa3ddda68b657a4b7c5a2db3df +part_00095_of_02048.hdf5 18e609a838e80150031e27293e669ecd +part_00096_of_02048.hdf5 c64dcb84005149182394e0236513ed0e +part_00097_of_02048.hdf5 86e178ea04f6c951e04d3540429b5e9a +part_00098_of_02048.hdf5 885d8406335fa23a60d09e7e642817ce +part_00099_of_02048.hdf5 4e9172e3ee20481d492edc372023095e +part_00100_of_02048.hdf5 4a0d09d0e230bad0e98db011989cac3c +part_00101_of_02048.hdf5 dd8225ccc3a5829680bc4e69aac28185 +part_00102_of_02048.hdf5 244778a226da1cd8480c07f2c755eb6f +part_00103_of_02048.hdf5 d16a816ef9522c36e49fc54adcb19fe1 +part_00104_of_02048.hdf5 319a5bcc2cff604804b7986434f4b826 +part_00105_of_02048.hdf5 acdfcfbadf08b03b0b2bb2390564816b +part_00106_of_02048.hdf5 3f3db1affc6ca2c435d94df09fe48d5b +part_00107_of_02048.hdf5 0eac91209c742c5ab4e411d210136fe7 +part_00108_of_02048.hdf5 5aa9731c683e405fedf5e792a8afcbb3 +part_00109_of_02048.hdf5 9be685644423a9352eb86ef7a3ac1cd8 +part_00110_of_02048.hdf5 7e1bfd32a9958bc145305c2c8d431c92 +part_00111_of_02048.hdf5 d4fe42db38e011b98a26a9fa65ed3ac0 +part_00112_of_02048.hdf5 dbff9603ef52ad8a886f18842ce7acdd +part_00113_of_02048.hdf5 6e32afb3eb59cb8037977ce5c667a7a5 +part_00114_of_02048.hdf5 68b435e78759f1657f5ea2a193f32eb8 +part_00115_of_02048.hdf5 7267a99ea4f6de96ca9e15328070585c +part_00116_of_02048.hdf5 5757d8d2da49f568d8409aa29f59aa8a +part_00117_of_02048.hdf5 979131096be11ec1bbc51b81466487f9 +part_00118_of_02048.hdf5 5f4124110f29f7325d97175d10da674e +part_00119_of_02048.hdf5 6de0ffb0777854343266f20e01af4c50 +part_00120_of_02048.hdf5 55b3b9ecf01328c3d21adc6ea58473e8 +part_00121_of_02048.hdf5 a9d0012db09885f7a71f9a9e649979d4 +part_00122_of_02048.hdf5 
c858bc793766b37b4937693296b4140f +part_00123_of_02048.hdf5 0eff9fdf967716f7a887193b23d89ad1 +part_00124_of_02048.hdf5 1f6d88c8e76a40f273c5949a137ff029 +part_00125_of_02048.hdf5 44a82be59c8159c75646fdd7efdddd70 +part_00126_of_02048.hdf5 20f1e4b1bb9a3ced4bf4d9baeb75c97e +part_00127_of_02048.hdf5 247d7b975ecdcd78b8468f3934341e27 +part_00128_of_02048.hdf5 a22a18ee9b769481ffe269de8e9c8ffd +part_00129_of_02048.hdf5 76e982f7c9d3d56aa98d01ae1e5b4a7b +part_00130_of_02048.hdf5 5c1afaca79589a6a8edbe6d0d97d7e99 +part_00131_of_02048.hdf5 ec9075de97c393262983e408320be20a +part_00132_of_02048.hdf5 990e225a693461af1f31bc790441500b +part_00133_of_02048.hdf5 2e714eaf77ddf60ad1bb7f04041d7de4 +part_00134_of_02048.hdf5 5267fea23f6d837ca90fc385146ac998 +part_00135_of_02048.hdf5 0eb11d482153ce8b40c1a78304f9b0e5 +part_00136_of_02048.hdf5 27aafeaeaace540327954ddb770ca193 +part_00137_of_02048.hdf5 2341f737f6c5b9a5c9a127701172a56e +part_00138_of_02048.hdf5 448c2c6899001245087aa5d31e4ef6e7 +part_00139_of_02048.hdf5 6268386262d6f5f0caff3a4040aff1f3 +part_00140_of_02048.hdf5 794a91abc82a772ec36927eb833b76d8 +part_00141_of_02048.hdf5 3f44f428c02fda180c76fe6fc9465ab4 +part_00142_of_02048.hdf5 d0ab7052124a5a94ebe2da56437845e7 +part_00143_of_02048.hdf5 e93d48a4892da729c4ffbc7cebd39193 +part_00144_of_02048.hdf5 719fce7ab632f9e0c3909d7013c48475 +part_00145_of_02048.hdf5 6136b920b33b3419958bc4c2349c7cac +part_00146_of_02048.hdf5 34e812868aece378e0c4fba49e0741cc +part_00147_of_02048.hdf5 da1fae99e89ecab958629416bd89e716 +part_00148_of_02048.hdf5 80fb7d76c1223bb2f3aeccec0355faf9 +part_00149_of_02048.hdf5 b3c2833d36004ae7f6d78ad26e583dff +part_00150_of_02048.hdf5 45f74f224c598ee7ff6a86e0ddc8c091 +part_00151_of_02048.hdf5 1fc784799883101e76f6efffb691fc76 +part_00152_of_02048.hdf5 91714d2831018894887ccce53d2ba0ba +part_00153_of_02048.hdf5 264c576e8e8eb5065144ee7b7fcdc348 +part_00154_of_02048.hdf5 bb38ddd7f2458a08387c3a6646b254a4 +part_00155_of_02048.hdf5 8abd1f47fc9c26f5db684c957577dbe7 +part_00156_of_02048.hdf5 2cb5e296cccf685c111a50594777b88f +part_00157_of_02048.hdf5 e62b850549a34ac9d4ab9ed80748f2c2 +part_00158_of_02048.hdf5 1f3399d8b40a86fcaff8eb21835e5033 +part_00159_of_02048.hdf5 793642f3157e8b8449c7cbad3617bb04 +part_00160_of_02048.hdf5 a43c7b1a21d80237bffe045cf822d237 +part_00161_of_02048.hdf5 94f0403c2a1b9c56c99e4f29400dfafe +part_00162_of_02048.hdf5 ab1d9fb3c1561e6b5123b5a3c1909306 +part_00163_of_02048.hdf5 d7dde3559804675eb574af6fefb30e0f +part_00164_of_02048.hdf5 2840dd95f77addcf3cedaf5942d5560d +part_00165_of_02048.hdf5 766e2bb9819228910f510e3440ea15fa +part_00166_of_02048.hdf5 77d2ab00e8c70aa2c9d4bc72b53bb510 +part_00167_of_02048.hdf5 10038e1a397360ebc2bd3531382a61fc +part_00168_of_02048.hdf5 f7d76ed48216ad5b1d11d7aea3376120 +part_00169_of_02048.hdf5 f8775d0cd22c1a2948a241002e550b0a +part_00170_of_02048.hdf5 163454767bc7dcd1499c9ee34103b1ec +part_00171_of_02048.hdf5 0c6e853c572d5684ec8486d176cdb506 +part_00172_of_02048.hdf5 963b029aec6e9e65d8e28fc973575d31 +part_00173_of_02048.hdf5 a1f4881ca1939b34faf0104998b828f3 +part_00174_of_02048.hdf5 19b3a8a546ab8e25db5d2a339d060fe3 +part_00175_of_02048.hdf5 2ed4be6dd62472e2082552ab4036eca3 +part_00176_of_02048.hdf5 2b91afd5c9ff7b36e0d8858e78337c13 +part_00177_of_02048.hdf5 7a4e1b9dcfc3693b28a64aabea78b44b +part_00178_of_02048.hdf5 d5dbcc8187e09bd4ae61e360688d8b49 +part_00179_of_02048.hdf5 7f840bc906183f4a9d2e457ac8fb2ea8 +part_00180_of_02048.hdf5 e4e8202bfabdd2b2f78616a14c3aeab7 +part_00181_of_02048.hdf5 c005ba8800fb8bd5d09af21159ca3361 +part_00182_of_02048.hdf5 
a1095e071c86f907b312618be885413e +part_00183_of_02048.hdf5 c442863a70e331cea7cf0df951503925 +part_00184_of_02048.hdf5 ccd0456d81f3c6ef5adf42bacbde14b4 +part_00185_of_02048.hdf5 1a9806191c20097b01b1855fc94ca2d7 +part_00186_of_02048.hdf5 58caf9461dc9430e849e9b760c31d05d +part_00187_of_02048.hdf5 ef9e1cac4a1ab02ca3a727531fda4b03 +part_00188_of_02048.hdf5 7d83babd6e25d5f522ef6b3d77f19ed0 +part_00189_of_02048.hdf5 917a5e41d3285194cb9663eeba33e42d +part_00190_of_02048.hdf5 3bf24d7f655e543cfe99e3faa7085909 +part_00191_of_02048.hdf5 4815e50f9f1d0c4910df7a6c7a6c28ed +part_00192_of_02048.hdf5 e74cffb54aeffdbb43fdfa93d0bf23fa +part_00193_of_02048.hdf5 0e80c1f5fd09f6c8f349b3117cf5ad76 +part_00194_of_02048.hdf5 63f660ac7a1dbe7590602e3cceb85512 +part_00195_of_02048.hdf5 999fe1e2d2dba70b9ac72c34304ddca8 +part_00196_of_02048.hdf5 d9754b7e752bf9ccd7732ba5301c926c +part_00197_of_02048.hdf5 f02c51f4c036b947daf70ba82c781d0a +part_00198_of_02048.hdf5 7348b7cc4cf3f7331004a911843bd247 +part_00199_of_02048.hdf5 74586c4d4219fe507d3faa46d93fd82a +part_00200_of_02048.hdf5 bb9928a1665ddee4ba576ddb1723f1f4 +part_00201_of_02048.hdf5 ccc981db5c5bff00386cadc198d46a80 +part_00202_of_02048.hdf5 0690a40ccbdbce25070a0176cd624440 +part_00203_of_02048.hdf5 535524e286e1fb46a44d01c1cb5419a2 +part_00204_of_02048.hdf5 ca6ad434f8cf0c4268ca3c18b97809c9 +part_00205_of_02048.hdf5 a90788f16e0ae6021765253a91d00dcc +part_00206_of_02048.hdf5 ed1070da26aadb5f5ebf531263425782 +part_00207_of_02048.hdf5 2decd52b27f9c9a1d58e87554e39eb00 +part_00208_of_02048.hdf5 b7d7fdb2c2c18dee591e49c10fd751e9 +part_00209_of_02048.hdf5 15f423b5170222d20bd0392c32947408 +part_00210_of_02048.hdf5 88d8a64a1468852d5a6584bd060951ec +part_00211_of_02048.hdf5 4a5ed5896ce5d906f354f22b7fde4f14 +part_00212_of_02048.hdf5 89a2b7b572a2d4135c2a8d834737b7f5 +part_00213_of_02048.hdf5 c561f029a7f4b3970df2c0ec80ace10e +part_00214_of_02048.hdf5 216fa4abd7108f2ffaa8fef75db0928f +part_00215_of_02048.hdf5 b75da3c93e6dbb33beee78803051db6d +part_00216_of_02048.hdf5 a7f0f1261f0b2cfa4a37386b8513a6c4 +part_00217_of_02048.hdf5 a44324846631f44d7abc4f146fbd8cf3 +part_00218_of_02048.hdf5 c1cd59c8706f17b32e032788e68d509d +part_00219_of_02048.hdf5 2c993a0cb1cd7f4603385b2f8e1e52aa +part_00220_of_02048.hdf5 76bfd04bb75337b488b287a5eb2df4a7 +part_00221_of_02048.hdf5 96fd48b3c4160c9fc1770c987e2da3a9 +part_00222_of_02048.hdf5 f24891eaec4a9affe6b3ca521b2ea485 +part_00223_of_02048.hdf5 9a55330440de7d21b556e619929adffe +part_00224_of_02048.hdf5 33a7fd1f1c4fdf9bd3f8ba5bb9c8e39a +part_00225_of_02048.hdf5 83644a826f2897e468afcb418c3a0577 +part_00226_of_02048.hdf5 be8339607a9edefa50d69bce28343980 +part_00227_of_02048.hdf5 4bd2b3128661d72873ebf2b3579c686a +part_00228_of_02048.hdf5 d17904f1067a8f272964ab407ed9e62e +part_00229_of_02048.hdf5 317ccc5ead4ee6711a358216f331837a +part_00230_of_02048.hdf5 a61320fc706bf1db1e71e62a13d7bd58 +part_00231_of_02048.hdf5 e92ca138ea8a8f4a57bac2e26d8091e2 +part_00232_of_02048.hdf5 8536228bab1f27fe385cf2d9aeea7168 +part_00233_of_02048.hdf5 70ad197b1e226880ccbbd761c5f95223 +part_00234_of_02048.hdf5 51215462d19e7c7a7fcc25d87bba896b +part_00235_of_02048.hdf5 47f87fd6e8ed59a387965830def68690 +part_00236_of_02048.hdf5 f7eeae103d9466c2490d2c4690a9d202 +part_00237_of_02048.hdf5 207b0bc8683f98e91b89d4b70af3b04f +part_00238_of_02048.hdf5 62cc5eea13ffb957d61547c035d59f02 +part_00239_of_02048.hdf5 9143b3b5fb8ecd226aa3266134650300 +part_00240_of_02048.hdf5 28d7eccc2f0f9df06bfbd1ae46860039 +part_00241_of_02048.hdf5 ee3e982f1b0bf613e87c5c07fb4e61a6 +part_00242_of_02048.hdf5 
6a5d79c065502ae7fe82e3179737208d +part_00243_of_02048.hdf5 2f979f1d7dc8d30606695bdc129455db +part_00244_of_02048.hdf5 5ebd1ff73defe4d67afa596c6781fe3d +part_00245_of_02048.hdf5 151f7a7b642f393d46be7b5f99681a60 +part_00246_of_02048.hdf5 d9835809e8f4a08e1280c9d608403e67 +part_00247_of_02048.hdf5 31f021ce68aa092d697e91e394590a63 +part_00248_of_02048.hdf5 4dcb59d9aa4e8d5330af41e7d3a865b6 +part_00249_of_02048.hdf5 7fce4d9e07b75123b1dc8cc50b6c34f0 +part_00250_of_02048.hdf5 763fa0ffe95d91c67ac82d1703cd63f1 +part_00251_of_02048.hdf5 07e94719048190b00f6fafe8308aa780 +part_00252_of_02048.hdf5 fd23c2b8d4130e5a313f0f364e89c4f6 +part_00253_of_02048.hdf5 4ef0c018798ec77524cea123c62f1ea6 +part_00254_of_02048.hdf5 34671a86532cf41b370172a25ba22dd3 +part_00255_of_02048.hdf5 7c402275715bf45e50e05415d52d8e8f +part_00256_of_02048.hdf5 bb0a6a4eb8afd971e3ced22efc634497 +part_00257_of_02048.hdf5 81566ec1830119138e293761ca9776fc +part_00258_of_02048.hdf5 848a566d5b792253c9eba5f21d999501 +part_00259_of_02048.hdf5 83c4dacffd04cd2735f94c468107a8ba +part_00260_of_02048.hdf5 258cdd0acc54e8c5e20d17b4313a0c03 +part_00261_of_02048.hdf5 6e25ba2f1a18a83e23583465f51acc20 +part_00262_of_02048.hdf5 2229bdbc4e0dbdfba786be3afcbbf5a5 +part_00263_of_02048.hdf5 83c4747e5c23f568150fc84f4a75aabd +part_00264_of_02048.hdf5 e9f2bc5b003cd86922e83f64c26656ea +part_00265_of_02048.hdf5 fe75a45747ae38e3b558680fa45b3e61 +part_00266_of_02048.hdf5 c32c437ab98b37b7b416c2163a1553bd +part_00267_of_02048.hdf5 9c1c50fb78f7848d298fb0b50209b204 +part_00268_of_02048.hdf5 fb12cd3db64b9c1f04b5e07de30fb294 +part_00269_of_02048.hdf5 d11806c036ef4f6976eef4c1518ff85b +part_00270_of_02048.hdf5 4b320ba8d5eb59ea017435a3cf9ffe54 +part_00271_of_02048.hdf5 2310b56b29aa46254c9757c9ebc5be7f +part_00272_of_02048.hdf5 9f3ca96046f3ebce77b3b383e04ad07e +part_00273_of_02048.hdf5 31b87884496d31bd9508542ea2440554 +part_00274_of_02048.hdf5 a05e7009138f0d0802595e15078a9351 +part_00275_of_02048.hdf5 eeac7ebfa66f1c689bfb51ac7da966ce +part_00276_of_02048.hdf5 3aaf5d7e3cdd0df6958b6df815154142 +part_00277_of_02048.hdf5 8f80e00891fccf61c450db2225edd8fb +part_00278_of_02048.hdf5 a74569099ea181e97d0cc369d51c1ba7 +part_00279_of_02048.hdf5 53fcab8b56c15e5b4ca3b99cb703d91f +part_00280_of_02048.hdf5 69cc80597f6440d8321f643f3df91701 +part_00281_of_02048.hdf5 7d2101202e1d7cefbac6778feda75d88 +part_00282_of_02048.hdf5 d262b384d050666421a3c3fdbd7e7e62 +part_00283_of_02048.hdf5 8e0cbf5a267d88adaa751974d2a5d31d +part_00284_of_02048.hdf5 76f21e93e66615869c19f08283c74a5b +part_00285_of_02048.hdf5 445624b144da304fb3b3882ac5f22e87 +part_00286_of_02048.hdf5 90ee4f16b167cb79d7363478176cfecc +part_00287_of_02048.hdf5 804409ee87786377397e8fdf9dc69709 +part_00288_of_02048.hdf5 e03aafbc157cf02d767fd6b2f3f55e91 +part_00289_of_02048.hdf5 1131f6da9b720347f702b512abd680f7 +part_00290_of_02048.hdf5 7b5733ca657ce434cfa82ab59df47e46 +part_00291_of_02048.hdf5 27895101e9cb0ce362f124b5675818df +part_00292_of_02048.hdf5 3910dbc5a374947680906efe41b35fb4 +part_00293_of_02048.hdf5 83f44a5cf208a82cd71d46a6bd97532d +part_00294_of_02048.hdf5 09b6f1ff752ac288a5a08b64b7c9e9bf +part_00295_of_02048.hdf5 1b9de1bc26767e019d44b6062b80275b +part_00296_of_02048.hdf5 3df51c0ee39c3b57dea9ae949ddc4e63 +part_00297_of_02048.hdf5 c477b264b0adf829b3e4fe529a12ac8c +part_00298_of_02048.hdf5 d981bdadb5ef33965f5b681f95cacd73 +part_00299_of_02048.hdf5 1ecb87fc3edab4d34016bb1861f537c0 +part_00300_of_02048.hdf5 b03aedbdcec33caaffe656573791344e +part_00301_of_02048.hdf5 22cd7fb91ce62a8fec360c71d188082b +part_00302_of_02048.hdf5 
865d68725a89ffe6f6242ebb14cd462b +part_00303_of_02048.hdf5 5d39662cdead79ae7b545b23fd2704ef +part_00304_of_02048.hdf5 c75d2daa4b56e5dac90a55bcad1c00d3 +part_00305_of_02048.hdf5 982953831b61fc2c890e5b49694e2094 +part_00306_of_02048.hdf5 1b3fb6e2b4340c4240a1ff7b8914efff +part_00307_of_02048.hdf5 648560c3860935d1aaa6e3a05879b091 +part_00308_of_02048.hdf5 3d4ce928098d1c20887534fed3f9d9b4 +part_00309_of_02048.hdf5 ca7b48a1f655d2022dbb05485ad28ffb +part_00310_of_02048.hdf5 5f8cf62b9ac2647d91df82e267e70778 +part_00311_of_02048.hdf5 eb45b0497ac72ebf0443f2cde9b75b07 +part_00312_of_02048.hdf5 a9c7902925e152e9753711dd4a29e862 +part_00313_of_02048.hdf5 1aed5b2c68d0f58026d5006845912540 +part_00314_of_02048.hdf5 a67a4c25c4246c6ef4db47a43784be9a +part_00315_of_02048.hdf5 607109fbf48df9bedb64e615ad3b3b8f +part_00316_of_02048.hdf5 be54c95efb0c94af94527e6fa49b0118 +part_00317_of_02048.hdf5 41c73a777850fb0b3e0d571f576055cf +part_00318_of_02048.hdf5 46c686ec70280d375ebf55784176d0f1 +part_00319_of_02048.hdf5 8790601aca11bfe8c6553116f040c977 +part_00320_of_02048.hdf5 705655db72b045b7215fb0c5e0bf175f +part_00321_of_02048.hdf5 23ecfa80389bf22c7d401aaa4ee78077 +part_00322_of_02048.hdf5 e188e3b8ebce88975b1b1bfae768833b +part_00323_of_02048.hdf5 f03d67bad22275eb62071de93fde69ba +part_00324_of_02048.hdf5 56f7d4f116442e5001d9a53407b42afd +part_00325_of_02048.hdf5 1f3850ada69ae07b56e443ab00fdb86f +part_00326_of_02048.hdf5 f2230dbcc4df9c9eefcf9a062350bcc7 +part_00327_of_02048.hdf5 3772c580b0e6253e302100fd12004a3a +part_00328_of_02048.hdf5 17bf1a9bd4d931fe8817c4d6394c0573 +part_00329_of_02048.hdf5 f7b79262d006b56522da02a9440613ee +part_00330_of_02048.hdf5 a8bb2655cbb85b64f6a528f84c68fe42 +part_00331_of_02048.hdf5 62ccca89634d7419bea1aa13b7eee8cb +part_00332_of_02048.hdf5 587eeb366d246725210c0401355456f1 +part_00333_of_02048.hdf5 950cd096c7551a5b85a1ff81ac53b844 +part_00334_of_02048.hdf5 83915826604de84123ad64595a3c077a +part_00335_of_02048.hdf5 842e14b658c1727757fdd7da0b80905a +part_00336_of_02048.hdf5 58b782be61b007971ec1c32089aab3b5 +part_00337_of_02048.hdf5 030c04c1b7342f8e4ac954a248a50e2c +part_00338_of_02048.hdf5 fe3a7b20f78b847e08f6c6057aefd829 +part_00339_of_02048.hdf5 624f8d8b0959e8cb7426b7afa2eee7e1 +part_00340_of_02048.hdf5 ea07266aa6f9c20e6cd7c6643d51bd1d +part_00341_of_02048.hdf5 58c2db58aba3fcfd5d8b2c8e31b3560c +part_00342_of_02048.hdf5 639d0cbf932ac9adff5e8ee780bdd594 +part_00343_of_02048.hdf5 294ac9cc1e4a2c9b4834d3e593a5386e +part_00344_of_02048.hdf5 19f42bce8dc6f12ab77d9c60ef72edb4 +part_00345_of_02048.hdf5 31adc38a6025f1a95c6fbd9c6a7cd566 +part_00346_of_02048.hdf5 09198087b3a4476da4242fee4fbae990 +part_00347_of_02048.hdf5 c671b1323142bdc9eaca48892590108d +part_00348_of_02048.hdf5 1b0e7b3f252ee7f08306bd74db49d49b +part_00349_of_02048.hdf5 ecccd992727ec19d215c8fef594711f5 +part_00350_of_02048.hdf5 1d78049af90f8f9c826099fd3fd82bd5 +part_00351_of_02048.hdf5 f0a8658cfa7343038039595e670fe839 +part_00352_of_02048.hdf5 ff40c6e6ba5bfad1166025c89b80aa67 +part_00353_of_02048.hdf5 459f8aece676fdddd5913db4a5257864 +part_00354_of_02048.hdf5 141850c4a85d45d8ec074c15374d8666 +part_00355_of_02048.hdf5 b5be1b02b56866eb093f6aad952ac5d3 +part_00356_of_02048.hdf5 1d0e2d3f56667b11db71917aad30e503 +part_00357_of_02048.hdf5 210e1093e84c37095f7f50785d106554 +part_00358_of_02048.hdf5 72929728e78d9cc5cad338706344f5c4 +part_00359_of_02048.hdf5 3a1aaf0848fc1d3b91f42e4b2d3a7cd7 +part_00360_of_02048.hdf5 948aa83da5757d8a0c265f1241652b17 +part_00361_of_02048.hdf5 f437c3e0a02d7a8baf72068b6461736b +part_00362_of_02048.hdf5 
0093570384b5e8d0e4855e3d2572633d +part_00363_of_02048.hdf5 cc6d0312e1bb2c0caec0afa7e2609b1d +part_00364_of_02048.hdf5 df79005e3f39391b4b75cfb9f10454fe +part_00365_of_02048.hdf5 9be3f27eda89dd5c2b8193f4af9f6d68 +part_00366_of_02048.hdf5 5fbbd684fb0fddbebd641d866f7fa327 +part_00367_of_02048.hdf5 da30d1829ffc43da9afd22f7a3eee93d +part_00368_of_02048.hdf5 ac33336ec857b34830afb689a05b7ec9 +part_00369_of_02048.hdf5 7361c801e3ecba0dc77d995138eee9d9 +part_00370_of_02048.hdf5 7ff7340068075aeedbd3ee99e00a3747 +part_00371_of_02048.hdf5 64f1153eaacc0e9098cc52dc63cca2ba +part_00372_of_02048.hdf5 b40f2ae62c60cae33c80c05a73a90709 +part_00373_of_02048.hdf5 90a62a0273f1cbd41e6c178036031199 +part_00374_of_02048.hdf5 4024d7785b48a7def7688b7dda30b529 +part_00375_of_02048.hdf5 8b26c809be57179ec5815cc53f3a0e10 +part_00376_of_02048.hdf5 6ddb77ae024fe2d5b3c66de7e0496b38 +part_00377_of_02048.hdf5 5a939e53f15bba1734450d6112de58db +part_00378_of_02048.hdf5 b107b3de9758de60d0e97fe04e512dde +part_00379_of_02048.hdf5 b467ae689bc91203528ea72e0e056d5c +part_00380_of_02048.hdf5 4a234ef5c499297556e988df9721c153 +part_00381_of_02048.hdf5 38821f669aec0da13e9d0099801490a1 +part_00382_of_02048.hdf5 d5c73ea739246cafaf0050a8f2512157 +part_00383_of_02048.hdf5 baa04b0acf42aeb0d55f81bdaefd9851 +part_00384_of_02048.hdf5 eb235edadcf56f69485da6875ff87421 +part_00385_of_02048.hdf5 def1cd23ce87e88e77be042f8f57bd07 +part_00386_of_02048.hdf5 022016273bfed9ba6495b2415baa1005 +part_00387_of_02048.hdf5 cacb713e204358104584b613099e1b7f +part_00388_of_02048.hdf5 980fd410d6ec7caf53b010616a132a17 +part_00389_of_02048.hdf5 4c5db52db3987b26842a047506167123 +part_00390_of_02048.hdf5 b6c6aed8c390a4bafef4ebce13113a11 +part_00391_of_02048.hdf5 ce8d22d12fda59f5ba3a121444561321 +part_00392_of_02048.hdf5 d9b7c069d486606c30f2d8ed7376bebe +part_00393_of_02048.hdf5 333c3c63df584e7127084a33b7563301 +part_00394_of_02048.hdf5 aa55042bfad5254e735b45d902f2ed96 +part_00395_of_02048.hdf5 58a4b9f45f11da469208a3e7ca2bc3a9 +part_00396_of_02048.hdf5 1068978ad9d0377efd24ce2b063791fd +part_00397_of_02048.hdf5 8213d65b8fceb5e97abaf506ad8459dc +part_00398_of_02048.hdf5 2d1e56dd8e1f72f39aa72a3426a6fa8f +part_00399_of_02048.hdf5 070bb8bd3c3115bd191fc64887d939b0 +part_00400_of_02048.hdf5 ef52b224d2d461255117d4f91e107177 +part_00401_of_02048.hdf5 3041dc1cd6c12f3cca955417fa36e323 +part_00402_of_02048.hdf5 31ad6d9a4f6bfb2bf65f8cbd813a0952 +part_00403_of_02048.hdf5 e947f0a4e02f01bcc319229e384c5394 +part_00404_of_02048.hdf5 fb5537ddec91e3fe6e78095b3ae20c7f +part_00405_of_02048.hdf5 15c755a0c0cdfd38e4479e3fbfdc5c09 +part_00406_of_02048.hdf5 b2b57b0febb638910dbc547334728c7e +part_00407_of_02048.hdf5 95dbdc7724ba9612b135a1e91812014a +part_00408_of_02048.hdf5 7754c7e2d95122975a07158dff39154e +part_00409_of_02048.hdf5 4e08a4f1cb21d024a3225d60713d1ec5 +part_00410_of_02048.hdf5 ba14fcda44f5edbc0d60b1bcb1059fa5 +part_00411_of_02048.hdf5 3bd4fbaea97600a0458c761ee0015eba +part_00412_of_02048.hdf5 7ab4b4a19d3190740eb208feeb1579ff +part_00413_of_02048.hdf5 3712b47381d2fbc569e75294f8b022e6 +part_00414_of_02048.hdf5 04b53648a64780a3d986810c20ab2ef9 +part_00415_of_02048.hdf5 8e0e0cb7592d565105aa2c9c3ea843ed +part_00416_of_02048.hdf5 c5c25ecead484cb051833028503c3725 +part_00417_of_02048.hdf5 2f1465115ca872a34746679d494903ed +part_00418_of_02048.hdf5 7a86828d889888ac250dca75b1ce5ce1 +part_00419_of_02048.hdf5 a98c8d25da75295cbc88bd47d7cb88cc +part_00420_of_02048.hdf5 e18f2cc83726fde04be5f759f0f41db5 +part_00421_of_02048.hdf5 b97cf80563b09c7ba65b0f99c02bdecb +part_00422_of_02048.hdf5 
9aa124d54665bc62c747bd57b3520837 +part_00423_of_02048.hdf5 55f9c465456999230ea1d5941069673a +part_00424_of_02048.hdf5 eadd712c895ea1e0e8b4b7c9c3f76715 +part_00425_of_02048.hdf5 426357da0ff53bba97bbbca27a23ff30 +part_00426_of_02048.hdf5 f9bb81b75c4fdf84ea25d3690f86302b +part_00427_of_02048.hdf5 4f32a32914bbdcc702548ef7257e4828 +part_00428_of_02048.hdf5 88b51adc1a2e7ac65efa9e88a47fa247 +part_00429_of_02048.hdf5 9e4d13eac6102eb7223a183ea08781eb +part_00430_of_02048.hdf5 eed3ac3a795a6a88eef8b217a92816e0 +part_00431_of_02048.hdf5 1b8f4cc554adbd33b9ec9224e5588d4a +part_00432_of_02048.hdf5 00d2b917f5d2768ad9d2bb06e095f0b8 +part_00433_of_02048.hdf5 d5bca239bc2414d73cea33c3422b083e +part_00434_of_02048.hdf5 62f159fcdcd02d522cdb34d324e3fb03 +part_00435_of_02048.hdf5 4cad05ada881959b0ca7fcf0c2511099 +part_00436_of_02048.hdf5 516d01f6f46ca0ac3873371ea149b897 +part_00437_of_02048.hdf5 ba29ab2df65bcde9ac8800674bc75db5 +part_00438_of_02048.hdf5 8d2a4db392898025b972af00906e9903 +part_00439_of_02048.hdf5 958ce7ca401f6b646570ddecf32a6b77 +part_00440_of_02048.hdf5 7b70f374cecb9b4710ebf35b69e37187 +part_00441_of_02048.hdf5 5955845c000916ff95278bb53dda089c +part_00442_of_02048.hdf5 46d31a9015723fda2ed489c0ac9e67f7 +part_00443_of_02048.hdf5 899e30878fdc90861f11be8803d60427 +part_00444_of_02048.hdf5 e48aa7b47a24b9f3e1d035ad479b8750 +part_00445_of_02048.hdf5 a8c15571f31fe9ca0a1ad07a2ae8682d +part_00446_of_02048.hdf5 4bc92b3651384441eb93810688ecba4c +part_00447_of_02048.hdf5 246c5141dfc8fdbdad607c7824ce46d0 +part_00448_of_02048.hdf5 dc43820ddb293437a2ca08cf4db14fd7 +part_00449_of_02048.hdf5 c515707630e2cd01c7a08c2692ac1ec8 +part_00450_of_02048.hdf5 e0a1e9c121557281709361d9ae4c38cd +part_00451_of_02048.hdf5 0069822c064f5bfb5878b83389d8c968 +part_00452_of_02048.hdf5 197c4b5e9d76f8339034e19926af8d81 +part_00453_of_02048.hdf5 937efafc970b2478177ca6931ce1b704 +part_00454_of_02048.hdf5 c464cb464d7b045beb42a7b13a2a6cb8 +part_00455_of_02048.hdf5 8c9ebd6bd897a983ffdc651b75e6828e +part_00456_of_02048.hdf5 f7799a9f9ed5bf223682cfefd23bb2cb +part_00457_of_02048.hdf5 6e568ddcd95b996b2f68c9f95b49d0ad +part_00458_of_02048.hdf5 9f847bc8ce2e5c7894054a25395002e3 +part_00459_of_02048.hdf5 d7f7f1b646f220d20984243630cc05fb +part_00460_of_02048.hdf5 c21f764c2c31fd035227b5dfb3d75f75 +part_00461_of_02048.hdf5 bd634d68aea28deec2b4c723a5366892 +part_00462_of_02048.hdf5 cb8e1aaea054522e3f9b4013df328877 +part_00463_of_02048.hdf5 2875388f33f2e03b9eaeef573023f815 +part_00464_of_02048.hdf5 b280d7f00ee4630602cda945124a756b +part_00465_of_02048.hdf5 093b1917a75f85950c3555758c7431eb +part_00466_of_02048.hdf5 5738735abe98a3327220ef87af0b88c7 +part_00467_of_02048.hdf5 8414971dadac7539537877c9e9ea1fa6 +part_00468_of_02048.hdf5 9e0711ca8942677c2f70cb123ebb4149 +part_00469_of_02048.hdf5 9ba97e9d73093e72880f0c9f0b5c0a3e +part_00470_of_02048.hdf5 75431a66dbb3d3ecd9ced2948934c4a1 +part_00471_of_02048.hdf5 68bd7848e7522505a02ac43069dadf4c +part_00472_of_02048.hdf5 ebe3be50a83a0946553f2eec49604c26 +part_00473_of_02048.hdf5 bae8c72a836016a32bf21ead64540ef9 +part_00474_of_02048.hdf5 98fa64de4b0e4780f067f6c1fd7d4965 +part_00475_of_02048.hdf5 2bac579035b32b68c48b6bae2defde8b +part_00476_of_02048.hdf5 0a0a5257ad86c4bcafc3ca31a380246d +part_00477_of_02048.hdf5 f08708b3afde8b83ade92f7376a3d081 +part_00478_of_02048.hdf5 6cb386ba840e430a411869a0894f762a +part_00479_of_02048.hdf5 17b1013b6f51a8b86275751ec047602a +part_00480_of_02048.hdf5 fb362e0b7f81a00c16e64f84bd1582e4 +part_00481_of_02048.hdf5 b03f719e0fe60f3a8caa91782d354835 +part_00482_of_02048.hdf5 
b7845cae9cf4801c7e06edaa268c71b5 +part_00483_of_02048.hdf5 95e0f7bab75412f6fb6c4a322be96fb5 +part_00484_of_02048.hdf5 cde5cc326b76ed5157b12db29ed6b828 +part_00485_of_02048.hdf5 0fad2461fc1cfc130450a161b7bb5562 +part_00486_of_02048.hdf5 c60aeaababf91b3c9b053d5ca7e57584 +part_00487_of_02048.hdf5 02c5224bb1a21ac7a8c2adcc9fd3e9f6 +part_00488_of_02048.hdf5 cd1132e8acc676bdc1cdc9d7bf638f51 +part_00489_of_02048.hdf5 be87186dccfa2ccf5ef9bb4e8a58a969 +part_00490_of_02048.hdf5 4704c7f1e40ec8a84ecd1b47a03af2f8 +part_00491_of_02048.hdf5 f3f6f7c23f8bf320eea2f8eceb9d63de +part_00492_of_02048.hdf5 9859e7bf4d7a1526e89ad3608dd504e9 +part_00493_of_02048.hdf5 3f4f20501b29fea4ded3709f5bade5ae +part_00494_of_02048.hdf5 abfffb06cc43715091d8b5b7fd2b59eb +part_00495_of_02048.hdf5 c6886d60bd7bda4be2f2e04d8011162d +part_00496_of_02048.hdf5 edf6109244352cc84c28ffbf4bcd8f44 +part_00497_of_02048.hdf5 5653fb912b0a0f21da07602bbac7cfa0 +part_00498_of_02048.hdf5 84610b0f29809f3a24edc3e725376ffb +part_00499_of_02048.hdf5 5a607103eed3e5123150ee9c912e875c +part_00500_of_02048.hdf5 dc8340c804f9a25189d1085eb2d366f4 +part_00501_of_02048.hdf5 3a4b13f6b3f63cfeaa01cbfe418f6be0 +part_00502_of_02048.hdf5 85051be49c88014410f0beedfa5f2eda +part_00503_of_02048.hdf5 bce927972ac27c25835c1f15e8734ced +part_00504_of_02048.hdf5 14ae1e3d3868c137408d25548a8604cc +part_00505_of_02048.hdf5 cee5ff73dc8340a7956f7846e89817b6 +part_00506_of_02048.hdf5 8d1ac53e890de9b1a544d9f6e6374acb +part_00507_of_02048.hdf5 eac09de69589d67f70855dd3737f0cee +part_00508_of_02048.hdf5 ecfdebff5c69c69ac34d3a648f5a8c6f +part_00509_of_02048.hdf5 e97a644b9fac44e4d1adabcb64b7f0d2 +part_00510_of_02048.hdf5 8e38c66a7b4e1b16b265f70fb00485aa +part_00511_of_02048.hdf5 239dda3f4d71421674a4d7f9358cf1a7 +part_00512_of_02048.hdf5 f0746e1cae74d8d9285b6ea6cc9f1f80 +part_00513_of_02048.hdf5 c4c865560b084f30ab6a94d158d8db12 +part_00514_of_02048.hdf5 acb8f7e0c4fd7613d0b27142725164ba +part_00515_of_02048.hdf5 183ab3490377e07ff82b7047bf1f8dee +part_00516_of_02048.hdf5 8c5907d90ecdbb4038c974781a9805ee +part_00517_of_02048.hdf5 7d3a69bec3f15e56e9ff130d3e8f70c7 +part_00518_of_02048.hdf5 4302f4de86e4cc4129fbf5ff462ab340 +part_00519_of_02048.hdf5 6ac7f0987c8cb9bc9c1d0bcbf2a7e8b8 +part_00520_of_02048.hdf5 5bf90ce81b1049afd3da9269fd22d45f +part_00521_of_02048.hdf5 dc50a4945ed125b73b4f66725ff7b95d +part_00522_of_02048.hdf5 aec4269baeea27576cf1e27d6a154db4 +part_00523_of_02048.hdf5 8e62945438b0c93fff47adcf7171097d +part_00524_of_02048.hdf5 e1fb2b3ddab186db29f008477d066eff +part_00525_of_02048.hdf5 3f5c55e32271525e0bff95843f74cba7 +part_00526_of_02048.hdf5 e4e5d073c044b23ffd5d53df036a966d +part_00527_of_02048.hdf5 233d585d9fb8350cec072f2198ad6d74 +part_00528_of_02048.hdf5 3ed6c4bb5ad145d41b25d79e3ecefa09 +part_00529_of_02048.hdf5 8ff2bcdb970df82e215b333f7f72b758 +part_00530_of_02048.hdf5 b6dd7c4852ee7eaee7b4fa3a2fff72c0 +part_00531_of_02048.hdf5 1c2c79ab4d7ef08566e1efd4fb906cba +part_00532_of_02048.hdf5 e6c1487094b971c730f43009f8555a45 +part_00533_of_02048.hdf5 3f5e708ef3b8513d66d999962d12fbda +part_00534_of_02048.hdf5 cc713b68b1ea437177d39ac8ffe39119 +part_00535_of_02048.hdf5 cbaab26785d687f22fb0a5cca05ab73d +part_00536_of_02048.hdf5 f4c38bf4560a0f8bf6dd628eb8783498 +part_00537_of_02048.hdf5 79a4619463ba534ef94a88b0c0a76d31 +part_00538_of_02048.hdf5 b098640a952661016d383b7480173509 +part_00539_of_02048.hdf5 b106564ab876ef8fa593970050b855a9 +part_00540_of_02048.hdf5 d0a12ef253a3f4c1880d98bd3bc74446 +part_00541_of_02048.hdf5 94036717328b4119afdbb656eb6113e3 +part_00542_of_02048.hdf5 
570b471b3b30f20e5763be013046ccf5 +part_00543_of_02048.hdf5 1d3efa2d1502ab24a6b68f94cde90dc7 +part_00544_of_02048.hdf5 870eb0bc53994eb02fefac00b89e4fa0 +part_00545_of_02048.hdf5 629f1911d289bf93caa2fdd3c1497fb5 +part_00546_of_02048.hdf5 ab6ec159673611dc06c5554a3a7d112e +part_00547_of_02048.hdf5 90274377c93f6536e0afb857fd50e6a6 +part_00548_of_02048.hdf5 3eb09e140d84665efcad834f63630a2d +part_00549_of_02048.hdf5 07cf53cfe4e2bab58c9b3934601ea1b9 +part_00550_of_02048.hdf5 502ee2980eb055a590ad9b7f9c3add2b +part_00551_of_02048.hdf5 92b107c138871f1ab546116ead05c667 +part_00552_of_02048.hdf5 86296469c93c20a4122dc1eab7e40792 +part_00553_of_02048.hdf5 00f3f5143ad70642677ecdbf469be22c +part_00554_of_02048.hdf5 1484644e91944c8d50ab1a66ab78e36f +part_00555_of_02048.hdf5 74046a041568ecf9eaa12cb0e840e76b +part_00556_of_02048.hdf5 5b99d6e9a55b80745378a4ac54ace8a7 +part_00557_of_02048.hdf5 39732c273ee85828007657c490e2adc8 +part_00558_of_02048.hdf5 9a3acc8a4e0fc550ffc0d357f7f44052 +part_00559_of_02048.hdf5 93b97a37ef0f97524c1fc18b26412ab1 +part_00560_of_02048.hdf5 d41205cd411a0790f8d3f76353313550 +part_00561_of_02048.hdf5 366b698ca0e090891873cf9d025d0de3 +part_00562_of_02048.hdf5 15eae257577f4164c2401ef66547bd24 +part_00563_of_02048.hdf5 8dc497a872f930b977445e767435170a +part_00564_of_02048.hdf5 221cebaa9ba480e64db24b0443bb4ce3 +part_00565_of_02048.hdf5 7dc0849fd843cc0f24dc4ff455a2ee04 +part_00566_of_02048.hdf5 ce539113746dcfc3e47c7aff14ba2645 +part_00567_of_02048.hdf5 22ff246e01222b0fdde358120e8c3f13 +part_00568_of_02048.hdf5 4824f30fa800c8a5a1c09519ff75a65b +part_00569_of_02048.hdf5 fe02372fb6c29d8d9c3194f5c2f0f1fa +part_00570_of_02048.hdf5 cbefba5933b89966d585fb0c1978b093 +part_00571_of_02048.hdf5 6d08e8d3ab6de47fd1dd872221c02392 +part_00572_of_02048.hdf5 80dab5ddc5cf79394954ffd90544c387 +part_00573_of_02048.hdf5 8c8d3848a6285ec58d7843c25f141cff +part_00574_of_02048.hdf5 1d5a4a46af1a3efe64157e038c503892 +part_00575_of_02048.hdf5 e1a5f3785077908a31b6681c727c72a9 +part_00576_of_02048.hdf5 cd48300db8967bedbeec5a449900f01c +part_00577_of_02048.hdf5 a1c7017208d65fc13e49ccbd6a963445 +part_00578_of_02048.hdf5 55dfd402b572c33fd2d872ad039976c1 +part_00579_of_02048.hdf5 60a15bdb232176a2d0d18e5ccc2363b9 +part_00580_of_02048.hdf5 b624c99f4bdb013d9b769c7042599fe2 +part_00581_of_02048.hdf5 490f729f7636392cb211a274cfa2e3d0 +part_00582_of_02048.hdf5 26135a3d063e7b0f51b12b8277c038f2 +part_00583_of_02048.hdf5 fd18325887b37701caaf067173656093 +part_00584_of_02048.hdf5 b849038708b07ff002733a522477b9a3 +part_00585_of_02048.hdf5 48dc06918237981117c52615bdc9eca4 +part_00586_of_02048.hdf5 acafbddf9773424c6b565d25e4974924 +part_00587_of_02048.hdf5 9b3c4dc0c46c4a6f64deb3de091fd526 +part_00588_of_02048.hdf5 782ea624c2593badd744861aae1d4b0d +part_00589_of_02048.hdf5 14ca4b8c205af102a16fb6c9352e98f2 +part_00590_of_02048.hdf5 fc80ba4fd4d6557d15c29aa163921577 +part_00591_of_02048.hdf5 cc03b173a39540fd61c9bebbef442f7c +part_00592_of_02048.hdf5 3198e228e8d20a51db6cb3c1ed89d521 +part_00593_of_02048.hdf5 76e4e1441531f86338b3cca62ba545cf +part_00594_of_02048.hdf5 34adb8e3bfba951d05ac5eb1b57bf585 +part_00595_of_02048.hdf5 800ab43f857546d39fc0588f79024974 +part_00596_of_02048.hdf5 bff8943abcd2a0f7fddfc53c31b249d3 +part_00597_of_02048.hdf5 fe3d870165e099bc721a683c10cc9622 +part_00598_of_02048.hdf5 3cb26ac43c27af69ea08e11d2c801b24 +part_00599_of_02048.hdf5 1a587bdc1c02d604e4a2c45ccb076f86 +part_00600_of_02048.hdf5 646f1b8c9b222d5c1648b483e9f0e3d0 +part_00601_of_02048.hdf5 35372896cdf5086366e19138dd24d499 +part_00602_of_02048.hdf5 
ed6fc27de9c29abcfc998f132e5cb7ef +part_00603_of_02048.hdf5 71f3f2799f05978cc6caca3efc36780c +part_00604_of_02048.hdf5 738b779a0df990bce147d8d59660e988 +part_00605_of_02048.hdf5 33e71f378ef012178a94c959b7ee7d94 +part_00606_of_02048.hdf5 8c95211ddf9b85d4ab9049b28f1f9caf +part_00607_of_02048.hdf5 6cf4637a2698482d5a73a09d33a42da6 +part_00608_of_02048.hdf5 b1980b3064e550890cb70e826c90889a +part_00609_of_02048.hdf5 ef3247331a650132ab0ad8959e5ca97a +part_00610_of_02048.hdf5 477d5eedf7c5ec4acd1878398e8e1d54 +part_00611_of_02048.hdf5 406dd09e0c9efd503effded622d1803c +part_00612_of_02048.hdf5 49bf6658c245b341c81e62d6079336f3 +part_00613_of_02048.hdf5 baa4935479dd2eaef1a63e3e69ffff03 +part_00614_of_02048.hdf5 66b9aa60721aac7d01ea0b2b484fe068 +part_00615_of_02048.hdf5 ed8b12e8ae00e4c90bac250b240b2213 +part_00616_of_02048.hdf5 62796e0a3535a4d740128ccc200f3699 +part_00617_of_02048.hdf5 b98c39484725861760f014d6b4b645f7 +part_00618_of_02048.hdf5 5ee530e5aa103aacdb2cd6d77160c2a5 +part_00619_of_02048.hdf5 d4695de61dd2e844953f927bcd95c24a +part_00620_of_02048.hdf5 ab655f63396363a60def4d18f2341bf0 +part_00621_of_02048.hdf5 4dd8bb04782b340e6ffcc23636e5354f +part_00622_of_02048.hdf5 b10fb9d38c842628365b8e34bbdfb248 +part_00623_of_02048.hdf5 a0ad5bef600f69738e30cf43289ed72f +part_00624_of_02048.hdf5 7c508c55140d2b1f616555acd6318116 +part_00625_of_02048.hdf5 f7d9f8c4309bc9ff246abedf805420af +part_00626_of_02048.hdf5 9274b7492fef6e92209af51f53ca61cb +part_00627_of_02048.hdf5 f96a63e46fce441763e68bcfd0cd0733 +part_00628_of_02048.hdf5 f2d6e9ee0b15383c680ea91f9611705a +part_00629_of_02048.hdf5 0b9b524177783a086e0392252ae75aad +part_00630_of_02048.hdf5 326d16a7e87323d77326e4e33b81961a +part_00631_of_02048.hdf5 583535122e9336d0d298e52cdae57899 +part_00632_of_02048.hdf5 1550f1a786f3901d344676f4cc57b712 +part_00633_of_02048.hdf5 6463976389942a1de7a9e87beaf3a996 +part_00634_of_02048.hdf5 7f12641e374b34bd7567b65f02bee3c8 +part_00635_of_02048.hdf5 40f1c5b30cf95bf29db73812c9865b0f +part_00636_of_02048.hdf5 7198712ced642336d57f046703c6532a +part_00637_of_02048.hdf5 2716945b8f111c8ef370d2b3daa0bcd1 +part_00638_of_02048.hdf5 ac588d46625aae1235933699276a2b58 +part_00639_of_02048.hdf5 5edb7c2fcfdb51ef04d9223304831435 +part_00640_of_02048.hdf5 19c505967983405c9ab43385427ad9dd +part_00641_of_02048.hdf5 8511bd398f969a3d525f163e1dff833b +part_00642_of_02048.hdf5 be425b527f7e2c97a9a53bb448d025a0 +part_00643_of_02048.hdf5 e19abafebdc4a1b8504d32fd93bb9f5b +part_00644_of_02048.hdf5 1f3758e17bd7b6c3d94ee934eba624fa +part_00645_of_02048.hdf5 1d3eadeeb9d3d5510361816a8d156b00 +part_00646_of_02048.hdf5 40ad2a390e4eb9c94e938de46d99690a +part_00647_of_02048.hdf5 a2019735bda06a275f4c6134d4d21315 +part_00648_of_02048.hdf5 80d02ae1e234d0b9bae2ce51272c09c1 +part_00649_of_02048.hdf5 c1123b846a5af62bae7667b3a12ca709 +part_00650_of_02048.hdf5 103c3aeeff136246ced38581f83e88a5 +part_00651_of_02048.hdf5 a2a09cb9822088db5deb10392cdca11f +part_00652_of_02048.hdf5 9dd6db496a20f6997a8c1d929b8b184e +part_00653_of_02048.hdf5 a1f5d79249387b0e6551e9766abce3e9 +part_00654_of_02048.hdf5 58e0ca7c9e52bbc726557dec7b779685 +part_00655_of_02048.hdf5 d1fea551c4e6a154ad1e9df829efff44 +part_00656_of_02048.hdf5 84e53a58c28306971c295fb8d75dbd6b +part_00657_of_02048.hdf5 e47009bb0a4ca2dcd3572d950d151056 +part_00658_of_02048.hdf5 0157c81561e6eec7962fa7f86beb039b +part_00659_of_02048.hdf5 e09ffeff0bb9385e15047d56d5e5166a +part_00660_of_02048.hdf5 8a54248fc5ffe6e485e9c842aae5d1e2 +part_00661_of_02048.hdf5 62ab7bd437ef1f39b8acfb01a1aa3226 +part_00662_of_02048.hdf5 
9dc024464fa757cfc95d74df7f37743a +part_00663_of_02048.hdf5 3a2e5e969ed7d81015588f02f8818419 +part_00664_of_02048.hdf5 284be32f103f6788c6f1add8536711c7 +part_00665_of_02048.hdf5 5b0debfc6ca4a74c5fd3aa6ced5fd20d +part_00666_of_02048.hdf5 73d8d9d5938f733cc562f333db504806 +part_00667_of_02048.hdf5 b4f62edf72e7f47738f8c1e9e9ad5272 +part_00668_of_02048.hdf5 88a85f30a2d45d8e266ca362b5c826b8 +part_00669_of_02048.hdf5 8f271721478ea6e4a0db8fe8aa412bbd +part_00670_of_02048.hdf5 3b6ef1591a04961b802b3d75bb6e5990 +part_00671_of_02048.hdf5 8b76bf59697d3e5229d02e8038b10b9a +part_00672_of_02048.hdf5 f00d0a22cac4a7dfd5b2d3d24f854b34 +part_00673_of_02048.hdf5 9a4331db0b8173dcc065e5f56ef6851c +part_00674_of_02048.hdf5 c23e45044055b00df0a28d18c5d5adfb +part_00675_of_02048.hdf5 0a9146fd83512491b342a7af84ca7cf0 +part_00676_of_02048.hdf5 656bc7458db70a390d270859713e99ab +part_00677_of_02048.hdf5 82474dfbae4cd981116df869ed64d89c +part_00678_of_02048.hdf5 c0655705ade8e1061999236b642f90c2 +part_00679_of_02048.hdf5 6c3ecdae9057bf879a02b8a069895368 +part_00680_of_02048.hdf5 db77d18eea558d8b00ca7084f635fbfc +part_00681_of_02048.hdf5 af8279d915d052527cd75abc594640fe +part_00682_of_02048.hdf5 35401c68dab0a90788c799ec923e6fb4 +part_00683_of_02048.hdf5 4ca7e46198117df437d98580500e6ce0 +part_00684_of_02048.hdf5 0b38a80de4c1a04468bc27b6896ecfaf +part_00685_of_02048.hdf5 e6a73705720999996bd4ac1847e4bb15 +part_00686_of_02048.hdf5 a34f9afafef152c8ce50ec8398d57550 +part_00687_of_02048.hdf5 45ec8c2e1d2b17e2d5e658e615280dd6 +part_00688_of_02048.hdf5 5c373890c1c59ac1ba9b384c912c0f03 +part_00689_of_02048.hdf5 07dbe72e569965eea3d15f753ac0abcc +part_00690_of_02048.hdf5 a77815370a03bea6c1f9b850ab12edfa +part_00691_of_02048.hdf5 f68c42c92b187f748a7672f0a78530d8 +part_00692_of_02048.hdf5 53843a274af9dd1b879e4031bc09c813 +part_00693_of_02048.hdf5 0beded723f4243914938c2948c4644bc +part_00694_of_02048.hdf5 a3edb5655a370606c3a2d01c7b37cea9 +part_00695_of_02048.hdf5 be629759584b4f1c48e780e8242b19c1 +part_00696_of_02048.hdf5 1fbbe616ad583e5f24b32bd7e4dd5ad1 +part_00697_of_02048.hdf5 5b29d389648d3783ec8b040030e294c1 +part_00698_of_02048.hdf5 cf29c605d40c36733cf7ebd26a7f35d4 +part_00699_of_02048.hdf5 87253a6f912d2af137fd3bc2cbd883c3 +part_00700_of_02048.hdf5 c04515db149637914122507cd512317c +part_00701_of_02048.hdf5 e3c2e64fb20462eb73c71f686fb4cf0f +part_00702_of_02048.hdf5 e23ae2acf82670c86cea5d322cbb03e1 +part_00703_of_02048.hdf5 ba63c5d931b644dc751c4732553df1c1 +part_00704_of_02048.hdf5 32549aecdbe9a1194636690e8f5f7415 +part_00705_of_02048.hdf5 36720b374b95f07a0b349c5b8f763dc6 +part_00706_of_02048.hdf5 ba17385120831b41d74ae296b68c2bc0 +part_00707_of_02048.hdf5 7d8ee251495ce69b6dfdd07e78bb387f +part_00708_of_02048.hdf5 2bb51fa739d0c159f633faf1100bdcda +part_00709_of_02048.hdf5 2f4c2ad8f3cd943dee13134481568b2a +part_00710_of_02048.hdf5 6d023501c31347a16361ab2e7f94fa94 +part_00711_of_02048.hdf5 9e24fcce2a1649261861affb84207521 +part_00712_of_02048.hdf5 4f6e559501a0e5145c7e3656841a1f0e +part_00713_of_02048.hdf5 ac30f55d33e1fa3dce9e74f65df2a2b9 +part_00714_of_02048.hdf5 bbcb1369a1281636f626ec5318b42613 +part_00715_of_02048.hdf5 252269d88ebafe542d5cf6367e2398f7 +part_00716_of_02048.hdf5 a27846e66e0d65d04ac04f378c863be1 +part_00717_of_02048.hdf5 742b3d6e94f44d315df536f13c7c2cc5 +part_00718_of_02048.hdf5 61e9e12ee615cf853124fbf207490606 +part_00719_of_02048.hdf5 fac4e0efa519b8ea538f98a03a5bb2ff +part_00720_of_02048.hdf5 3e10e5fe6d5ef8f56593bf7e34d2d028 +part_00721_of_02048.hdf5 56cf9958c002abe0f2efa2c185ac015f +part_00722_of_02048.hdf5 
064ece5ff7c313430e376874e1aa13a1 +part_00723_of_02048.hdf5 83c4066d268603954fa131652ddfc78b +part_00724_of_02048.hdf5 61c7763245c8804f29a198c8083f401c +part_00725_of_02048.hdf5 e3fb199fb569ff0892ad65ac27ef10d3 +part_00726_of_02048.hdf5 036111ea19724f8b8914b1f0cf6c4a69 +part_00727_of_02048.hdf5 fc572523224b77670704c57cc4837f77 +part_00728_of_02048.hdf5 618ca924620852ad18cfcd9b4c05fab1 +part_00729_of_02048.hdf5 479433f05f7d87a2c27b14fd891b7381 +part_00730_of_02048.hdf5 693ed7f214b2fe223a4f77a8b25534b7 +part_00731_of_02048.hdf5 9e3bc49ab492393d4468a79c66585183 +part_00732_of_02048.hdf5 058f784043e884ca4ecf7cc9c5209c71 +part_00733_of_02048.hdf5 0d8c906647d96bf14548980466d64bf0 +part_00734_of_02048.hdf5 5a086bd65b93eb7eaa4e49dcc1289dcb +part_00735_of_02048.hdf5 7b3e8a616e4c3d4768441cf9934892df +part_00736_of_02048.hdf5 effc54a1c6780b119f244d51ba0fc66d +part_00737_of_02048.hdf5 a4fb0b258a5d5c136f2b373e8edd4bfd +part_00738_of_02048.hdf5 4eb8172f3c09c9a3f338747b987aecc7 +part_00739_of_02048.hdf5 9dd23a50c84e6c805816c9c5d86f6da9 +part_00740_of_02048.hdf5 245cd8624bf7f69fe72d590e1d07618e +part_00741_of_02048.hdf5 48f1d0df04252d2ca69d36c2648d9f91 +part_00742_of_02048.hdf5 bbe9e328b0a5786e089ca10ee04d7a89 +part_00743_of_02048.hdf5 8fbe5c5cd09f23884d23a00ba0d0fe2f +part_00744_of_02048.hdf5 693c83f68233b599f80ed2a384d952f6 +part_00745_of_02048.hdf5 7820ac7d95fc366d9797fa9c6c4f3e2e +part_00746_of_02048.hdf5 0ab11c62e469bcae57914d5c401a4a76 +part_00747_of_02048.hdf5 ada31fe9022b23fcc4b9a48aeeb59961 +part_00748_of_02048.hdf5 9ace4eb724cb89fc9aff11f71f6a344e +part_00749_of_02048.hdf5 1c48566fb945a397fb1ffad186e63134 +part_00750_of_02048.hdf5 adaf4e17fab84b9e22c1ffae10735a74 +part_00751_of_02048.hdf5 39dd88ec599a865d85cd0a0a91a87071 +part_00752_of_02048.hdf5 64b331c56a79045562a96929ae1acbfa +part_00753_of_02048.hdf5 824fb2ea00e2dbb3f45be57793671d4c +part_00754_of_02048.hdf5 2f1b4d7f29fa9b94d29d0b7fa0bed472 +part_00755_of_02048.hdf5 e0a80b5df482ffd2dfc76e6ac4a70859 +part_00756_of_02048.hdf5 b442f2764ee0828d283ec4c6072e64a6 +part_00757_of_02048.hdf5 5626eb5cdd5ad1857e5e87b5eed97b1f +part_00758_of_02048.hdf5 9185285e91876c62af1083066b50568b +part_00759_of_02048.hdf5 f9cac134bbae62439d721bd667f9a9d0 +part_00760_of_02048.hdf5 6caa743f5f08cf4d04c6a5149e4b2611 +part_00761_of_02048.hdf5 74466883d64e7dfc037a95ac6b8f5b36 +part_00762_of_02048.hdf5 5e6671abc0743026121b9907bbbd15ea +part_00763_of_02048.hdf5 4eab7eee525ad19816148783bed29d09 +part_00764_of_02048.hdf5 0d440121520b2e813e7a22d57fdff435 +part_00765_of_02048.hdf5 93db5611806b75752f20adc9f31a3550 +part_00766_of_02048.hdf5 b3d81fc98f6c1866eacc3ba743136452 +part_00767_of_02048.hdf5 8f91c14280764476c7b1f59e85a3fbc0 +part_00768_of_02048.hdf5 0ff66e0d3087138eaffd9ae9562dc6cf +part_00769_of_02048.hdf5 88643887d90dda0240c89382c14ff8fb +part_00770_of_02048.hdf5 dfd4f82c34491a5189fb658aabce53e2 +part_00771_of_02048.hdf5 6b69387406da01e178f851f1a7dda4dc +part_00772_of_02048.hdf5 e6a25935cd724dbd63a4d6ed0de8cefe +part_00773_of_02048.hdf5 9b23b25cad33c7cab21608e6a99665b5 +part_00774_of_02048.hdf5 3692d92e7ecea54a3705afcdca2a3c1b +part_00775_of_02048.hdf5 5e34bef6e20da8a7a340b20d759b1408 +part_00776_of_02048.hdf5 0e004dd5fe186928b56e39f78dffb572 +part_00777_of_02048.hdf5 ecf85e484103881bcffd5c80af7f00f0 +part_00778_of_02048.hdf5 b1100da72afdb461fe05b611c19823ad +part_00779_of_02048.hdf5 2719b12cb4100e63f15bbf455ce0493b +part_00780_of_02048.hdf5 e4d721f0015defeb9e182f2916ee7ec9 +part_00781_of_02048.hdf5 8c46b006542b1fd1aaeebda200a14ea1 +part_00782_of_02048.hdf5 
d07b494baee98d6a1631c49be50340a7 +part_00783_of_02048.hdf5 80ff97a0ab4bccfb46b1169252574cb5 +part_00784_of_02048.hdf5 23d99ab12ccd6bfeaa1432ab00e14a01 +part_00785_of_02048.hdf5 790d6031b03c5cd72a5fd53eb2ac74b6 +part_00786_of_02048.hdf5 096225ef571c5ab33795351db3bd0e56 +part_00787_of_02048.hdf5 c73d9f06292afa4936fe33a868687e13 +part_00788_of_02048.hdf5 70a3618533630eb56f13352f07faa8b3 +part_00789_of_02048.hdf5 1275a999da12306bc007706f93dd68fb +part_00790_of_02048.hdf5 ea862a26a327c6c7d3e79b60420a1f2b +part_00791_of_02048.hdf5 a007f2c20f2508bb7f3f21b42a7537bc +part_00792_of_02048.hdf5 1176ec357bdf18bacb1e8a15132a1144 +part_00793_of_02048.hdf5 ef89a7e09b97f416f6a4fb9bfd69d1f2 +part_00794_of_02048.hdf5 ae6353113603cdd57e86d8e46e211603 +part_00795_of_02048.hdf5 e6c50018a53d760b55eb6c94c9529192 +part_00796_of_02048.hdf5 f8d4764eaa584f656b3ceae19784eec9 +part_00797_of_02048.hdf5 9646a2632e191b76d86db20911fc6a1c +part_00798_of_02048.hdf5 d5701776b01d78e2c553e905f1658ada +part_00799_of_02048.hdf5 63cd32baaa9c4abd6a350f596e47d41c +part_00800_of_02048.hdf5 daae7de8751c2894f20a10074e4848fb +part_00801_of_02048.hdf5 6fb6bcd1d1eac9727bd10e101d6b636f +part_00802_of_02048.hdf5 fe9ff8c30e7bb33266e9cbf6d7063460 +part_00803_of_02048.hdf5 d94395dbed551e0ec969616b620480c0 +part_00804_of_02048.hdf5 7b8e44c4fc0160295337a3353cf209d9 +part_00805_of_02048.hdf5 30ae6c74264e017793056eb2796712c1 +part_00806_of_02048.hdf5 20ebaf5ecb3d6f741cec90bc404fef7f +part_00807_of_02048.hdf5 4d3fdb40da202023d29edaff466fcef5 +part_00808_of_02048.hdf5 4a4633f6359947297f1a556347c2139d +part_00809_of_02048.hdf5 92d89bd8352e356197cb0c2e3782edbf +part_00810_of_02048.hdf5 c2efa5059024689fb448c88045c5b881 +part_00811_of_02048.hdf5 a471a9752c48e60a57dde6bc4f014b5e +part_00812_of_02048.hdf5 dd2d40a6cc78a85dab71872aebd20929 +part_00813_of_02048.hdf5 3a5cdf49a496f8ad18c109f1e2466038 +part_00814_of_02048.hdf5 048bf6f5b1838ca2e098b1e4a5d129c4 +part_00815_of_02048.hdf5 b4bbc7ccb11ea49d296eee047083569e +part_00816_of_02048.hdf5 78703867f960c12acb56999355cdd85e +part_00817_of_02048.hdf5 47457f29ee54eb7886669e35a8c0f17e +part_00818_of_02048.hdf5 33c8e7e0c0756bbdc7927e9235289bbf +part_00819_of_02048.hdf5 644a254ab79463cce75e96e1da7c5bbc +part_00820_of_02048.hdf5 7b4fdbaa7b40768cf47878b606e22ae2 +part_00821_of_02048.hdf5 c8522edcbae149a627e39135346e8053 +part_00822_of_02048.hdf5 b7cad409a3d1f135ac5b856651bf5a55 +part_00823_of_02048.hdf5 7807b43c439cefe3026c56d4752d31c0 +part_00824_of_02048.hdf5 ea66304813d8cfc26b0c9cbed1eb7367 +part_00825_of_02048.hdf5 08701a0aea588f40bbd5bbaa837fadcb +part_00826_of_02048.hdf5 5a643c7f77376fc7fb77ecd3a2d42463 +part_00827_of_02048.hdf5 de3ff64c93eb6e39cc7eb66a54a8c540 +part_00828_of_02048.hdf5 9e7c639556ccf4a4ca383378521227e3 +part_00829_of_02048.hdf5 6f1d2f3c9757ff5a5bebf97d86dfd7e4 +part_00830_of_02048.hdf5 dc130956e60719e6098e310f28fd8f5d +part_00831_of_02048.hdf5 83edd8578f657f8bd8bd49ca7f867649 +part_00832_of_02048.hdf5 03a1cf8b5a5b1522ecafd9d63c509723 +part_00833_of_02048.hdf5 3e0ecc5972c70948f07bdacb880229d6 +part_00834_of_02048.hdf5 7f835ebe5edc3eaba7a2363d44d83ac1 +part_00835_of_02048.hdf5 159f7452593bce0f919727f4f5c1e123 +part_00836_of_02048.hdf5 078cf894ffcc16a13871eec2e3543ab0 +part_00837_of_02048.hdf5 0ee3f6c9bd2bfb31381d51b91c67f471 +part_00838_of_02048.hdf5 5de7977e74328b2b8730e3c92631d0af +part_00839_of_02048.hdf5 ae2ac832926d73d7b23dc22bd75223a2 +part_00840_of_02048.hdf5 8f24fd7c1e3741bc8525de851f155c47 +part_00841_of_02048.hdf5 4de1fd8ffce5d8f89a085750d4373c83 +part_00842_of_02048.hdf5 
2c20141375c86af122e70dbc223daaec +part_00843_of_02048.hdf5 983bccf9c704795431fe691b925adbb2 +part_00844_of_02048.hdf5 865db78726a68c3cbf4f9885ec9c40f3 +part_00845_of_02048.hdf5 d3d978f94eccddff69f84447e92e7b08 +part_00846_of_02048.hdf5 4e91b204a37a518e42986d16ae88a8e0 +part_00847_of_02048.hdf5 155fd9f8e91919d47414a9dca3496ed6 +part_00848_of_02048.hdf5 590c2cfa28cd6f7ec78cc6eeab866158 +part_00849_of_02048.hdf5 2ba072b967ed465da8f202e48d678a4a +part_00850_of_02048.hdf5 abb987a0fdd9bd74b6b1383f492efe7b +part_00851_of_02048.hdf5 c2949e4f6729c0924b57fb00a2cedfb4 +part_00852_of_02048.hdf5 f5c0b2ff5ae0c3f4823026fe4c280285 +part_00853_of_02048.hdf5 3445ed4df787645245ac6f387bc04321 +part_00854_of_02048.hdf5 2a44cedf52f403814b692762faba82ea +part_00855_of_02048.hdf5 c5a53e22e4f0a977134f1555b08b095f +part_00856_of_02048.hdf5 d3754722a557d0900e8bc9eeee35856c +part_00857_of_02048.hdf5 a3e4fa6186222695fd685e64df1963ff +part_00858_of_02048.hdf5 ddda746e2f2d1d0bbd1096b2bb524f85 +part_00859_of_02048.hdf5 33290a77a614cc6e3a2903577452733f +part_00860_of_02048.hdf5 acf0a2ec16cb92a9a621093ec4c3cf52 +part_00861_of_02048.hdf5 60b7817366c7f7f928adfdcced1b8b8b +part_00862_of_02048.hdf5 61b0fd7bac1f780d46f5370d5ca60482 +part_00863_of_02048.hdf5 fc0534a7930ad9f81e60389f0ab0561e +part_00864_of_02048.hdf5 38e238d4255ce64cdae0a63eeff42aa4 +part_00865_of_02048.hdf5 e7202f450720ff12b693a917f8d9a221 +part_00866_of_02048.hdf5 8db06132c23ea5bbc3e2b406ef030730 +part_00867_of_02048.hdf5 155ceb0e4b4011fc62c0ccf5ac3d82f3 +part_00868_of_02048.hdf5 e04f9308045b7d93bfdfae5d6c736c82 +part_00869_of_02048.hdf5 fe23b70ffac264ea8b085d29a4dad85a +part_00870_of_02048.hdf5 fab62362f91688ea043f66cd8bcce795 +part_00871_of_02048.hdf5 0d8646ad1096cb45d818bda8f6c727f1 +part_00872_of_02048.hdf5 80bae5b5e4207455d4320183e1b7560f +part_00873_of_02048.hdf5 33dd2356b48df84efc574aaa055279db +part_00874_of_02048.hdf5 3a390ae845499c13eaf12fb39546ff1e +part_00875_of_02048.hdf5 cc730a6b967a2bf6f49e3bc84ca95358 +part_00876_of_02048.hdf5 cad67ac2c9dee48213758c3a91a89604 +part_00877_of_02048.hdf5 d9d81c4f8aef8435cc9e1707422b6cf5 +part_00878_of_02048.hdf5 c5fc2264b3ca3d53fe533fb840cf687c +part_00879_of_02048.hdf5 76770c2ed8f2262825c1e866e1ddc6c4 +part_00880_of_02048.hdf5 77f3c60072968610cb1c01cdad3a4506 +part_00881_of_02048.hdf5 b44c4a96e1d9600b62c28f06a83a5da1 +part_00882_of_02048.hdf5 554792113f652e71be688b90858308b0 +part_00883_of_02048.hdf5 0db03c0f0a5f9d8d885473f0951afcc6 +part_00884_of_02048.hdf5 aa6c7496fb09bc25417099d81b1efc32 +part_00885_of_02048.hdf5 dc93bc2909df94db67f71e7bc5375d7a +part_00886_of_02048.hdf5 a5aacbb12d6977a8068366381a777c60 +part_00887_of_02048.hdf5 502a2e6113da00c4972fe09369851610 +part_00888_of_02048.hdf5 21f22af1a7b9b9f782a4b3d3bde7d898 +part_00889_of_02048.hdf5 2bec072f7bd2f78e895201366f826fdc +part_00890_of_02048.hdf5 ee6c9ad59f29651d2954b173a2e2f834 +part_00891_of_02048.hdf5 f4ae93bc7864d90b7c3e426400ba9070 +part_00892_of_02048.hdf5 524e28eb3e8d03809c0e681126e544b6 +part_00893_of_02048.hdf5 24e7d8e732c2069743ad5d37e0babfe7 +part_00894_of_02048.hdf5 f6b8f291e2709870a512b96f172ae17b +part_00895_of_02048.hdf5 c8a0fd04d284bdfdd55b8d2d03c12139 +part_00896_of_02048.hdf5 e21817887765b57091e73dac53d00e6f +part_00897_of_02048.hdf5 7b8812a7046b527fb725a2aaeb393e63 +part_00898_of_02048.hdf5 c2975cf01d3af28216e083905d607651 +part_00899_of_02048.hdf5 bfbf8921aa28a695af99dc629df8576c +part_00900_of_02048.hdf5 d37d6634f0728a7d81fe6ce794f8bdd3 +part_00901_of_02048.hdf5 d2dc5caf113ef56ca9a542970380bc82 +part_00902_of_02048.hdf5 
ea9fc4e611ba981b00ae48b8d4eceac0 +part_00903_of_02048.hdf5 bac947ee39561cd76514b0c6a4dadeb0 +part_00904_of_02048.hdf5 5c960276bbaa0491a05be488a62ecc05 +part_00905_of_02048.hdf5 945170f6f60989e4eae770997207147e +part_00906_of_02048.hdf5 aeb1455a1c083bd001c633f44e5c4a20 +part_00907_of_02048.hdf5 0049eba63bd76a4761751031ecd76ed3 +part_00908_of_02048.hdf5 46805535ba757b2d9d9aaac1cbb755a8 +part_00909_of_02048.hdf5 57c941df26c3648c2fc44dd5cd8554da +part_00910_of_02048.hdf5 f99cf21382e39aef0158720313e497df +part_00911_of_02048.hdf5 dd6f21e60dc7e3d790b129e21480ff11 +part_00912_of_02048.hdf5 a183c71fe4a6b40ed974a7ab4c0174ea +part_00913_of_02048.hdf5 833e4e28c78922aee54b842372d69c54 +part_00914_of_02048.hdf5 3ff14034fde07e7cbe176bf119f8eb13 +part_00915_of_02048.hdf5 e2244fbbb1adee92abbb40236595cae3 +part_00916_of_02048.hdf5 db557898a2a1d78aaa42764f5ad2119a +part_00917_of_02048.hdf5 afeeaf011e498b423d7a5a5af7f6cbf6 +part_00918_of_02048.hdf5 36be08521ac3dc5b422e889be42ba119 +part_00919_of_02048.hdf5 44afa89a845347527a8cb6087679361a +part_00920_of_02048.hdf5 cb5e410404ba0b85235796152421d7b8 +part_00921_of_02048.hdf5 91fd3f0635166d010a44dedbf870ed93 +part_00922_of_02048.hdf5 eafb2235649675df2b9ab8eebf956285 +part_00923_of_02048.hdf5 35b7d9358ad0d5fc8d80493209cb5018 +part_00924_of_02048.hdf5 d75fae395f0a828048b61f0cf5173f02 +part_00925_of_02048.hdf5 03078ae7a09724a0100ca573af3e279e +part_00926_of_02048.hdf5 b741d8cab05ad4ab9f5c270876224411 +part_00927_of_02048.hdf5 f56880a28ab47ce113604b01d1113667 +part_00928_of_02048.hdf5 08f0ff9074feb875230f657accc8ed58 +part_00929_of_02048.hdf5 960554fa344892fd9cf6da901fde73e3 +part_00930_of_02048.hdf5 9487c8ff641cc99d599678a165a81e72 +part_00931_of_02048.hdf5 32cb45c1c4e6e4fe80a683e2008f6a6f +part_00932_of_02048.hdf5 0c15788fcfecdffc3587de0cac75cc08 +part_00933_of_02048.hdf5 9378e0bf9c4b05dc0b3fabf66df955f1 +part_00934_of_02048.hdf5 85deaf030555d96aba8bdc0419c3e1f0 +part_00935_of_02048.hdf5 43c55597746df6dac26345f81504d19f +part_00936_of_02048.hdf5 acd41de29e848f4aa59fae4aaf391e77 +part_00937_of_02048.hdf5 596c8faf8222503be3ef5b9990056669 +part_00938_of_02048.hdf5 cb9caa093e91fc52668963d7ccd09472 +part_00939_of_02048.hdf5 cb9e5535069db80fb775e5b1251b2448 +part_00940_of_02048.hdf5 515339ea91d4ee3a0b0c066f1c65a5f4 +part_00941_of_02048.hdf5 c8ff19f1e837732cc4b3e79b3b8a8c97 +part_00942_of_02048.hdf5 e0e799e3dae46ec66d2e4c42c55cf0e5 +part_00943_of_02048.hdf5 f7473d316add42ed7799864f550e2557 +part_00944_of_02048.hdf5 f7d2ed2cde907cf29d85ae0fc7257afd +part_00945_of_02048.hdf5 81898861f59d0d9ae2082de17cf42199 +part_00946_of_02048.hdf5 198bb5f1a8226790ed433ec355544c8c +part_00947_of_02048.hdf5 276cc477e79e0f1b62387e193c18a8b8 +part_00948_of_02048.hdf5 41ab36a2ea30c92b70fb4db31b55544f +part_00949_of_02048.hdf5 59a578d46f6058f6ccd106096999f8f1 +part_00950_of_02048.hdf5 4dca7e9454539f072d5332e9d12adccc +part_00951_of_02048.hdf5 f28d5147d889a44e2a92bb464feffb55 +part_00952_of_02048.hdf5 fdcbebb24f3e7cfa710e8723e3dd1401 +part_00953_of_02048.hdf5 f9f71234e914e4eae73d392bbf62d224 +part_00954_of_02048.hdf5 e506314f575e064a7f777c2eeafadbbf +part_00955_of_02048.hdf5 bf6d600b916105498ca22ae07768ba27 +part_00956_of_02048.hdf5 4a2d4cf153a46c9eba95b5d95c5fd667 +part_00957_of_02048.hdf5 f0e180989460123985e15f8388494471 +part_00958_of_02048.hdf5 10a852a046acb6be6b94fbb75a35a67a +part_00959_of_02048.hdf5 fa1e817620fba16923b0dac690c632e5 +part_00960_of_02048.hdf5 31bf4d1eabb7f011755b4c5645f0e6b5 +part_00961_of_02048.hdf5 d5ca936aa058afd7c03b4fdd6d5f114b +part_00962_of_02048.hdf5 
e411ea5cdc7887b23a6250d2fb3aea32 +part_00963_of_02048.hdf5 297c9362661a97dff2e0dbf9d442841c +part_00964_of_02048.hdf5 a41121694f63280c7b319aac504d7457 +part_00965_of_02048.hdf5 755bd367f5c6d67f205d9e3866635824 +part_00966_of_02048.hdf5 78cb4d293d442cfb6f0be6022627a997 +part_00967_of_02048.hdf5 a6d43a0379ac5f4f40cb0c458e1815e9 +part_00968_of_02048.hdf5 bf77ab4e0fca12bc0554f55489400c06 +part_00969_of_02048.hdf5 a851bb972042f18f1e65a40961d5d322 +part_00970_of_02048.hdf5 3be29c96d1643253aa8585bbfdae80d0 +part_00971_of_02048.hdf5 4d7838599302821cb2dffc1b3a5d6c48 +part_00972_of_02048.hdf5 30623d62793f694c54a520fd99d3a068 +part_00973_of_02048.hdf5 8ad1b3ecc4df49c76b5bd2b912b6a3a9 +part_00974_of_02048.hdf5 6739d82cdbc7128ccb8239d3443a84cd +part_00975_of_02048.hdf5 ee90adf2f2a61f96428cd1b16cc16e84 +part_00976_of_02048.hdf5 67fbec91ceb5f72102e78fbe949c4b18 +part_00977_of_02048.hdf5 ea0c9abda530a702024877e71e470830 +part_00978_of_02048.hdf5 d6e71de3ccc1beabf2404ba8e63d0aba +part_00979_of_02048.hdf5 c291b816e4a9077da0807a3600bdbb88 +part_00980_of_02048.hdf5 6d886d788fda14e9869d67927bfe94e2 +part_00981_of_02048.hdf5 21f64a0df41d3dddd894f9cf780b4819 +part_00982_of_02048.hdf5 a0d5c73560c5e06ea666896d9801282c +part_00983_of_02048.hdf5 1c3b30e47dcca76adbd6ca7f9e768704 +part_00984_of_02048.hdf5 592bf985115dcbd3c020525cff19cb64 +part_00985_of_02048.hdf5 3bff0abde53100a1d4cff9f9373bc832 +part_00986_of_02048.hdf5 944bda7865f01f3b1f6dc4b38a1c2223 +part_00987_of_02048.hdf5 3ab6ff47ecd24d2e73b6dd0a653f31e7 +part_00988_of_02048.hdf5 a556cfcae43736b89004ba84c062b1cf +part_00989_of_02048.hdf5 122ae6e8cc7d843f96d3fd0eadf34397 +part_00990_of_02048.hdf5 679ad3495e52d86d7fe533ea153fa8ec +part_00991_of_02048.hdf5 87f2a71224a5e999a869c6a6aa0642b3 +part_00992_of_02048.hdf5 948545d9faf02afa1e95ce18c8e8f321 +part_00993_of_02048.hdf5 985a773d415983e4ad88d2d7ae24d8d1 +part_00994_of_02048.hdf5 e14b9616a5b890c1132a2d59eb95eca5 +part_00995_of_02048.hdf5 7d337d60f219950d6558ae896ae81547 +part_00996_of_02048.hdf5 77f36279ec56936a825b7bde0e9bddca +part_00997_of_02048.hdf5 282d9e490be62f27c2fd10b44857495d +part_00998_of_02048.hdf5 984408e3c6b0b21ad7e59e87c9deb0ba +part_00999_of_02048.hdf5 91f41182c6e3f91ff8db8fc717ed3bd1 +part_01000_of_02048.hdf5 eb4944d638ef35c6a2f7d6c612c29303 +part_01001_of_02048.hdf5 4ecb6f9177bd114be05658b717f3a278 +part_01002_of_02048.hdf5 8daec54aa51233c33d39f9cce79d2e20 +part_01003_of_02048.hdf5 7dfd20714bd1d563d29908b823db75cc +part_01004_of_02048.hdf5 bb8de8fb4fa6dde3ddcb8053f419209d +part_01005_of_02048.hdf5 66b15ded614961781128926d639263fe +part_01006_of_02048.hdf5 22d056ab268ef16732f8b198cdc15f57 +part_01007_of_02048.hdf5 c45306a6d1ac54719c4f7246739c6bbe +part_01008_of_02048.hdf5 2b5f317256c7c446020a6f6001ebed6a +part_01009_of_02048.hdf5 f74d8ddef7d2da531a5bf20981681991 +part_01010_of_02048.hdf5 a834ed755255062d95566c765344a778 +part_01011_of_02048.hdf5 b7e90c0c8d1ae6e4b35ad7cedf2f1a4e +part_01012_of_02048.hdf5 4bfc91946f47aa1aeae98496956e5576 +part_01013_of_02048.hdf5 6e9cfcf4b8247f8c716114863db74059 +part_01014_of_02048.hdf5 f7e1a95a48fb62ea5d02b01c2a3827eb +part_01015_of_02048.hdf5 ccbb7c859d30ea69afe81842e213fa4e +part_01016_of_02048.hdf5 b54bbb6729f931d468ee906225888a72 +part_01017_of_02048.hdf5 d11c18c3acd28b8fbc2430700364a2a8 +part_01018_of_02048.hdf5 b74b7617716fa6815a8f808b2fc4f080 +part_01019_of_02048.hdf5 d4e8990d9a04b1991424ce61161e754b +part_01020_of_02048.hdf5 6bdb85b832c7e03a76b35c7750922ef7 +part_01021_of_02048.hdf5 f2d6a17e7b801850135c65fedb964c2d +part_01022_of_02048.hdf5 
5affb73b3c8f45cef4cfb955c7152ef0 +part_01023_of_02048.hdf5 69ad1b753f956b0d3b22e3b2685a5b80 +part_01024_of_02048.hdf5 b141ece7d6dec7ef834d1ec6cfdc59fa +part_01025_of_02048.hdf5 10c1b930952f19565553e9a3a6e13bcd +part_01026_of_02048.hdf5 360ac7a68f321c12e5a4e2ffa7e20e68 +part_01027_of_02048.hdf5 9b44ad5f9ef8aaab3ec11130420d0895 +part_01028_of_02048.hdf5 2af5337f9e13db07a10ac6322f7a2591 +part_01029_of_02048.hdf5 83cc49b717be7c1daf02aa67b2a78116 +part_01030_of_02048.hdf5 236d63ae14242182b70c4695ecebee61 +part_01031_of_02048.hdf5 a4005be83ede8f4d5047a2a15e3ddebf +part_01032_of_02048.hdf5 37a296c584a20393303a2ab4ea325c7d +part_01033_of_02048.hdf5 784ccc5e273fbfcfdb9331ccaa8f99f4 +part_01034_of_02048.hdf5 3112646c93dcba64adfaa0a09038d255 +part_01035_of_02048.hdf5 d70e0d942102320307cb1e3b80ba6429 +part_01036_of_02048.hdf5 2003e22e9a8a7930742cef68ebff40b2 +part_01037_of_02048.hdf5 601e729ab123198b39ac44a85361d4f3 +part_01038_of_02048.hdf5 fd912992295549e2703fdb0105f81717 +part_01039_of_02048.hdf5 56b072590fa2a0f38756fbd01fca5366 +part_01040_of_02048.hdf5 49e35701edf866153b531fbef95ed625 +part_01041_of_02048.hdf5 3136ac6bf0442c9345863d2b17306a9f +part_01042_of_02048.hdf5 8ea65a9a624e1fd6e42f1543af214125 +part_01043_of_02048.hdf5 662999254b703c7d7ae476f58319f298 +part_01044_of_02048.hdf5 80f5f9ce0120ebacc80e775cb8c8eb01 +part_01045_of_02048.hdf5 d4b88a67165c6167680e8ab1f452db12 +part_01046_of_02048.hdf5 bdba4699fd6451266e883a0936109bf9 +part_01047_of_02048.hdf5 d4eaba83802bfc5d153c31103e98c479 +part_01048_of_02048.hdf5 3a2be5a79f624e3fa07a69cb8a223817 +part_01049_of_02048.hdf5 cdb3fd9d61083d569b52f0b120bdaad9 +part_01050_of_02048.hdf5 36d17db8c1e67577e4b8a63feeac876b +part_01051_of_02048.hdf5 8f1bd5ec1ebca7a69c456f63a588fea3 +part_01052_of_02048.hdf5 81bc408294f94c6791abbcf0e4971b53 +part_01053_of_02048.hdf5 04cec5a07a728439948a3bf9ea75070c +part_01054_of_02048.hdf5 78b6455c29a211e77bbbb4913c3f9779 +part_01055_of_02048.hdf5 a026bda2389f2026a34539ffb2fb68a3 +part_01056_of_02048.hdf5 5db8688e2f408fb547dace49be32c521 +part_01057_of_02048.hdf5 63904c403a786df23ac43d04d4263275 +part_01058_of_02048.hdf5 d2e7bdbf45c09fdccfca27199ef13329 +part_01059_of_02048.hdf5 067f1f3f8506c9d42fe6328d972a451e +part_01060_of_02048.hdf5 8299064690009e8942e1aee9bc441ceb +part_01061_of_02048.hdf5 75bf59d24b0e93bb4d6d6aee6e321dad +part_01062_of_02048.hdf5 74f00d6f0d887d2ede8c57eb3d9e0c8b +part_01063_of_02048.hdf5 8c7f381d9879e8651a111042d0c1b65a +part_01064_of_02048.hdf5 9fe0ae79e14cd17b4a6a2f808894fd1f +part_01065_of_02048.hdf5 349e49daca6154ce8f4b505039be947c +part_01066_of_02048.hdf5 2e95e2e46a4556c1c9a7e3b101c64408 +part_01067_of_02048.hdf5 c0b7de6cf99162b25fc83ba34df9b811 +part_01068_of_02048.hdf5 3fe0b0c1e7972113d1d37a8414265d8b +part_01069_of_02048.hdf5 88c0dadcd583ac05f570f47c5b93a309 +part_01070_of_02048.hdf5 8a3b5085992318516319d3c5516d6f36 +part_01071_of_02048.hdf5 fce4b8cba2702ad176ae7036c9e85ad4 +part_01072_of_02048.hdf5 b3eaa253c5ec8aac9ee3a2c8fda286f0 +part_01073_of_02048.hdf5 82edc7abd21a8725615ba5bf3e3120a3 +part_01074_of_02048.hdf5 3c89a25c15fa36400c3fea169d120b2f +part_01075_of_02048.hdf5 34b60782bdb1eb84c15550cd514fd7c1 +part_01076_of_02048.hdf5 d826520674f80d6b740c6229200fbd7e +part_01077_of_02048.hdf5 fe756bdbb07a25905b0d60f43f597297 +part_01078_of_02048.hdf5 eb3d127b128b81b432f7979a446a0970 +part_01079_of_02048.hdf5 549d083282192c69729147d3b16bb47a +part_01080_of_02048.hdf5 f78e1aedc0ad3fc6135514984694298f +part_01081_of_02048.hdf5 454482e58b755c7a4a66d2540519f378 +part_01082_of_02048.hdf5 
3fea0aaa9f0411670da830fc0a580a1d +part_01083_of_02048.hdf5 7eb4fd98fb4b14fb51d7f92f73db782b +part_01084_of_02048.hdf5 9a5120774ba2f5a3bf207b40a117def2 +part_01085_of_02048.hdf5 16f206aaebf13eaa1c6fdfcc1868c11c +part_01086_of_02048.hdf5 a2998bf87b2534bda620fdef15856e19 +part_01087_of_02048.hdf5 06725c4a210ac8ceda8158ba46f30834 +part_01088_of_02048.hdf5 da0f6162fdc685d248ec8312601e3ae9 +part_01089_of_02048.hdf5 85a3e8dac662184337534b22467b6160 +part_01090_of_02048.hdf5 311eb30dcd5ecdcc85fb149a2aef8d14 +part_01091_of_02048.hdf5 c1ee41b4ed7fde41863d57fccc7ebf75 +part_01092_of_02048.hdf5 5e5a7d57f67d396b1de72ffdc58f474c +part_01093_of_02048.hdf5 2950d7620fe8f4edc58b4dbabc6de7e6 +part_01094_of_02048.hdf5 60d0507b9d08f387d3fbe5bbca8efd45 +part_01095_of_02048.hdf5 6c694df92e8e747ae6703f6d1885f05b +part_01096_of_02048.hdf5 22cdda2cab1824f7575270c84cca03b3 +part_01097_of_02048.hdf5 55160510b72d792cfc13943ed7a93b28 +part_01098_of_02048.hdf5 5fd0f9970e0802c129c518fd85e62fef +part_01099_of_02048.hdf5 a60ce740fa39fa4553d6e92746cf9a4f +part_01100_of_02048.hdf5 b6f83dab9e109d47f4657bcbb6883e3c +part_01101_of_02048.hdf5 745628c993fa6d6e3dc4883ae50982b3 +part_01102_of_02048.hdf5 5d4729f2232c2134e4e969d74da9a552 +part_01103_of_02048.hdf5 94446302ef1ca3c4b4aa5e9c5c76554f +part_01104_of_02048.hdf5 e55d41821f02db21c32509ec4f5c0aaa +part_01105_of_02048.hdf5 19174a5e8c161cc8d672f0d748238d51 +part_01106_of_02048.hdf5 b661752b018c8caeb6a1aa0aa31b4181 +part_01107_of_02048.hdf5 04d7bdd8c0b402c7a2a28571366cb7da +part_01108_of_02048.hdf5 ee8d02155fef1a6b6dce6b55725d6a3c +part_01109_of_02048.hdf5 883a3067b781faccccd81aa474d190ca +part_01110_of_02048.hdf5 19a3f2330e8b240ff110e34db6afecca +part_01111_of_02048.hdf5 fbc9f411ad7ffe96be34377513014903 +part_01112_of_02048.hdf5 74ea23bc051fa28e04d3c12fae62676b +part_01113_of_02048.hdf5 d5f53ada0f1c61c1b8ca782c3d441e73 +part_01114_of_02048.hdf5 eec96f163ddd0f93fe09d30b99d583d9 +part_01115_of_02048.hdf5 198d54047d5f43e9bb8538f2baca24b9 +part_01116_of_02048.hdf5 0f3253f432368d012d4050571f469583 +part_01117_of_02048.hdf5 d9c5cf3c6700362c5f30bab2e8e28d77 +part_01118_of_02048.hdf5 9d8f4c65605a07b7ddda69f4889a8506 +part_01119_of_02048.hdf5 7044805e7f9ccdee516a8a359b099700 +part_01120_of_02048.hdf5 108776606725d5e0f47a6e668203c80a +part_01121_of_02048.hdf5 37b7092a83a04de06196d6b52c68d393 +part_01122_of_02048.hdf5 8ce002d5dd777c013cb063736f3b9997 +part_01123_of_02048.hdf5 4b7c1caabbfea4fd056eb58a4e7a681f +part_01124_of_02048.hdf5 1452cf5923d2b2c1805e19a72c477047 +part_01125_of_02048.hdf5 3f8a4147a19d9cc8e3b0d3668d3121c2 +part_01126_of_02048.hdf5 457a75860b22c3189d8b7511cde5f77b +part_01127_of_02048.hdf5 170bcde395cff9fa2dacd2cd7410336a +part_01128_of_02048.hdf5 c49aea25531188e975c97807041a49b6 +part_01129_of_02048.hdf5 da5b8432daf01a56467a3f245f797e93 +part_01130_of_02048.hdf5 cb53c0f208f2da1fb81b731f525819fe +part_01131_of_02048.hdf5 faf20a6748baca0fc572d4a476ed658f +part_01132_of_02048.hdf5 7266d15d777ed7b6774b06363dcb5f09 +part_01133_of_02048.hdf5 5ad8dcdadbe8c3e26b7123f0ebc5a1d3 +part_01134_of_02048.hdf5 c8a75d3b67d0136a6ee8fb15ddc635fa +part_01135_of_02048.hdf5 9e82ef30d32d6a4e91ab392245844b39 +part_01136_of_02048.hdf5 bccaacb6584037e52b77d74aac6cc5eb +part_01137_of_02048.hdf5 93ddb2ef80d7d29665d3c05a0cfa799a +part_01138_of_02048.hdf5 b07d4e40fcb9154460034be7fd946e56 +part_01139_of_02048.hdf5 2c1db5d22f028729d81be33643c3bcc4 +part_01140_of_02048.hdf5 257f63f7d71422eb58fc1f513e271d16 +part_01141_of_02048.hdf5 e4afec7be02dbd06c5ec019f8165f158 +part_01142_of_02048.hdf5 
4ce4ef4f30f889a4c2b966622f8b8b68 +part_01143_of_02048.hdf5 bad075e13f75cc409d8c1f80e1f88a90 +part_01144_of_02048.hdf5 e132ed386a0ed493ae3f877ef4a350e4 +part_01145_of_02048.hdf5 f8bbd125337c2aedac48779521a2e691 +part_01146_of_02048.hdf5 5e6fa1c38c753976d83cdf2cde8d63da +part_01147_of_02048.hdf5 f660ad03f42932f3ce2d71dd6f1e1f1b +part_01148_of_02048.hdf5 285feb36c31b24611e71151cde8b7f49 +part_01149_of_02048.hdf5 ee38536e6a0b6b71ea3f228f58e3e228 +part_01150_of_02048.hdf5 f4021c38fbc1f8feace2306f1ec7ebbd +part_01151_of_02048.hdf5 ce74da5e3a6880e772905c4160d522e4 +part_01152_of_02048.hdf5 4615c0c1c2c639b789901b577304fc2a +part_01153_of_02048.hdf5 54b3e7f7ee15952d84b2fcb21e1cb0ab +part_01154_of_02048.hdf5 7c62c22f3135f61a62972dad8cf9ba6e +part_01155_of_02048.hdf5 47948f16d8359f297c6434278bec96e9 +part_01156_of_02048.hdf5 db1b3c11b70efb2c9fda14185861e30c +part_01157_of_02048.hdf5 f9848157a4ed3d783112971bbda45014 +part_01158_of_02048.hdf5 51a5678e9f2adada08ffeb8096b43b05 +part_01159_of_02048.hdf5 a037196ee04a00ec563b5a6f69576918 +part_01160_of_02048.hdf5 b06ffe0556ce1fa892f161d453cfcde7 +part_01161_of_02048.hdf5 40ea1e439a5b8f9a30ab04c857a50ab6 +part_01162_of_02048.hdf5 100144ddc0cd9e2668b5544244020609 +part_01163_of_02048.hdf5 83c76787fcc399242c0c3afadce3cbb7 +part_01164_of_02048.hdf5 1f80662b61ccaa4cab8ea428b8de8ad7 +part_01165_of_02048.hdf5 21cca365441698fafde522db0e88f474 +part_01166_of_02048.hdf5 be3a85a18bbcb8c692e9c936d3fae151 +part_01167_of_02048.hdf5 12e052fbb240c78b1b36fcce30bba558 +part_01168_of_02048.hdf5 6a4b787feba85528eb039274e995e150 +part_01169_of_02048.hdf5 a766268f658227494bdffd884455b75d +part_01170_of_02048.hdf5 f816a002f6ce1245b0b83f90878198d3 +part_01171_of_02048.hdf5 3105ff386aa31b2ec3c54743da1795ce +part_01172_of_02048.hdf5 0c488420a62a48e79bd170f9dccdaa61 +part_01173_of_02048.hdf5 6e12f4397ab0a19ceb78302b24f49016 +part_01174_of_02048.hdf5 48a0ed9340afd9cc866405d47a51b9b5 +part_01175_of_02048.hdf5 a52d889d81aa3664456fe6ce85b09f82 +part_01176_of_02048.hdf5 feb3e8f3a23ba9cff1224c91974147c0 +part_01177_of_02048.hdf5 8a0abc748a08445807d48b9b96219126 +part_01178_of_02048.hdf5 073417c986c5cd4de66c754e4487820e +part_01179_of_02048.hdf5 48c0e496a6e3c694ac9b9abe4b21a1e8 +part_01180_of_02048.hdf5 766138ea57024561bc405577057f7870 +part_01181_of_02048.hdf5 671632541f12d86f5962cf9c1f4bc59d +part_01182_of_02048.hdf5 c1e8baf9beba1319d1c3b40c1fb4ab28 +part_01183_of_02048.hdf5 9229fcba08f29d253e64d6ac11a187f0 +part_01184_of_02048.hdf5 6cd3ebd7a55b54d97041c61514da740c +part_01185_of_02048.hdf5 a50cc119ab1be86f8f76d3a28c31cc94 +part_01186_of_02048.hdf5 a2c39aa7b96ea08a04eb65dcbeef97cc +part_01187_of_02048.hdf5 38033ceea48882a7c4646828724e7869 +part_01188_of_02048.hdf5 c8d50e243029d84354620fbfedc34c68 +part_01189_of_02048.hdf5 b800c49dc8984f54baa03e491a1b9b9b +part_01190_of_02048.hdf5 75dd85039b0f1af77baa6d8826244b76 +part_01191_of_02048.hdf5 1a478d383731f7f23a1fc441d67044e3 +part_01192_of_02048.hdf5 375274955d96784892302bfc5d63595b +part_01193_of_02048.hdf5 1b0ba2d11efe69f61d03777de2a893ec +part_01194_of_02048.hdf5 dc40cd760e4be6b36aefd38e63731dba +part_01195_of_02048.hdf5 951c9f9a47db0bdef20a951b919a1a59 +part_01196_of_02048.hdf5 32181e84eadeba3ba3f05122f6e1edb7 +part_01197_of_02048.hdf5 76e60b2c568f5a54430df3eee2fe86c8 +part_01198_of_02048.hdf5 424f8e0be37929ab2307fdff3b7701a5 +part_01199_of_02048.hdf5 e96f8a4eb7f9eb747b986c8c1326b688 +part_01200_of_02048.hdf5 46c5ec35809fd9433f987b29a5b128d4 +part_01201_of_02048.hdf5 9961d326715619d6fc6d0657af09076c +part_01202_of_02048.hdf5 
dab5f328675e586959b43723d9798f9e +part_01203_of_02048.hdf5 eeb7e6f9d6753a6f6eb27efb2281e342 +part_01204_of_02048.hdf5 bfe9a3f166e5faa12aaee2c9b9306866 +part_01205_of_02048.hdf5 a579d123ad4eba9a3077d1e5c40a59a1 +part_01206_of_02048.hdf5 dd06c4571dfaa2bb69d37017b0ec6be5 +part_01207_of_02048.hdf5 6683e398b2f6314aede109753c252e72 +part_01208_of_02048.hdf5 83743c9ad94e185eeeed8b27b6e600a4 +part_01209_of_02048.hdf5 34ecc523bc238775d6cc7862766aded8 +part_01210_of_02048.hdf5 2ee095472b769ed1fbbe4b2c5e596c22 +part_01211_of_02048.hdf5 1ebfb46da8ada458c24637578afca65e +part_01212_of_02048.hdf5 9cf3941bd04cfc7ceab0ce1cbdbbc8df +part_01213_of_02048.hdf5 26a8022022107a499468cf9270846e27 +part_01214_of_02048.hdf5 027d4395d4ae8086e84501f7e3d9fcae +part_01215_of_02048.hdf5 2273d9c4d9cef8c7f024942bfb62a646 +part_01216_of_02048.hdf5 50d99a184e5f412f0242dc1066a18a5c +part_01217_of_02048.hdf5 d4cc54ac847e668c1e3c82a9b8681129 +part_01218_of_02048.hdf5 08b5d1846133d7974589f21e8b29f59c +part_01219_of_02048.hdf5 89d9679061df71b83f2f27f27fe2dce9 +part_01220_of_02048.hdf5 2677d4ad7ad3113cd632cab231106f14 +part_01221_of_02048.hdf5 f3736c8cfafdfebcee26b7bcc3400655 +part_01222_of_02048.hdf5 5941fc0d884fc669b5e0d72472ddf8ca +part_01223_of_02048.hdf5 4d1e200bdca802c5901fdb91410e6c00 +part_01224_of_02048.hdf5 0b2f84f7b743eb53bda7beecb1f5e262 +part_01225_of_02048.hdf5 8f9a43afae2246a30dcad6d011d192a4 +part_01226_of_02048.hdf5 e61389952ecac75f6c5e03cf4aeee5b9 +part_01227_of_02048.hdf5 7ade615414096876a9d34a5aaf7bdb8c +part_01228_of_02048.hdf5 7e2ec382541c94a6531dde1539e60629 +part_01229_of_02048.hdf5 97538b268d9c988607314e31c74b8105 +part_01230_of_02048.hdf5 aa8e099ba50641873a4ae53af9bd99a4 +part_01231_of_02048.hdf5 a7d16032614f724b996334fac62cd620 +part_01232_of_02048.hdf5 dd848e21025d63d00d8ccc8faf0fa585 +part_01233_of_02048.hdf5 c52d1e1724f67951fc88f90f0ac83cd8 +part_01234_of_02048.hdf5 9dd8f5a99f3059b51472e3196fb1a9ce +part_01235_of_02048.hdf5 26c86f41c120630b6a8af197a09ed7c9 +part_01236_of_02048.hdf5 2b1a554da2f48d137eec2f9d85842e97 +part_01237_of_02048.hdf5 d27b9a445b76ddc15aec1e1f13f4d75f +part_01238_of_02048.hdf5 643898daacc03f4e1024786c8b65f4d0 +part_01239_of_02048.hdf5 7784bbdf28d3d48505b939e176897367 +part_01240_of_02048.hdf5 da5b8354e2aa6ab3da1f50f5e64bdb3d +part_01241_of_02048.hdf5 36dd432640d6834f993264c1c95a8229 +part_01242_of_02048.hdf5 f689b7471d349d4f922a1849f1c57b81 +part_01243_of_02048.hdf5 8bca7e391fe5b93b3044fc7d1c30ceed +part_01244_of_02048.hdf5 3afaa5d6b83b2bd80a3dc37216eb4457 +part_01245_of_02048.hdf5 574c700f77a4dac713be6ac481a6af28 +part_01246_of_02048.hdf5 3c52baab58d13a0dfb22a247c0448b30 +part_01247_of_02048.hdf5 a8899f0e065d8e67ade803e57ccf49c5 +part_01248_of_02048.hdf5 56d85119f82967d93c5124b119a240f5 +part_01249_of_02048.hdf5 aa9d7eff72dacb1dfca2e776199d5fc5 +part_01250_of_02048.hdf5 9dddff5b9fca267f0931ea27cea48d32 +part_01251_of_02048.hdf5 e63908a10bd888d26163b4130295c638 +part_01252_of_02048.hdf5 103753e040dbc696609c60b91d21d223 +part_01253_of_02048.hdf5 8cb36a24486299e5541891041e787fa7 +part_01254_of_02048.hdf5 95d734e3089b39296ff0c4286dd3f950 +part_01255_of_02048.hdf5 d018b9b4ef8a475f9c4edff675df56da +part_01256_of_02048.hdf5 1a555bcdd7f0a862964ae375c1b22bf8 +part_01257_of_02048.hdf5 24a3cf9c33cc25bf3269e21b5658f24e +part_01258_of_02048.hdf5 45a472f598e43f6872dbbac6b71f0446 +part_01259_of_02048.hdf5 69da9df4669ef6b4a19b913cc9a31bd7 +part_01260_of_02048.hdf5 cc86b2f7e6481d11033c2a398e318a89 +part_01261_of_02048.hdf5 28b15f18b99ab747d26bacdde07d0da0 +part_01262_of_02048.hdf5 
96728fb85639096228807125a5c40506 +part_01263_of_02048.hdf5 1ca48f2a890a194ffd19d4339f0e3be5 +part_01264_of_02048.hdf5 bf11dca036e9935c3fe75565bec41d80 +part_01265_of_02048.hdf5 f44b6dae01b661c5e1f46cbb76844d5f +part_01266_of_02048.hdf5 6f6cbc6e6bf242daa7051a9d1f8bd72a +part_01267_of_02048.hdf5 20c0f0f903508a03436b67e636f8b33a +part_01268_of_02048.hdf5 f632b2d20f8376fc7a3f5ced805bd2df +part_01269_of_02048.hdf5 5a1f80e304557974ba10661b956ae6e7 +part_01270_of_02048.hdf5 1191e7454dca9fd45d077ceb45e3ce9c +part_01271_of_02048.hdf5 66fb3535de1ac37a759fcaed2d3edda1 +part_01272_of_02048.hdf5 47812fce936cd14f7e3550e782032452 +part_01273_of_02048.hdf5 6a1aaed35c4f274b1baee03b0d82d207 +part_01274_of_02048.hdf5 26800924b6618807f7abe93619f7cc59 +part_01275_of_02048.hdf5 a7c39164099fbc2c469c5ca4f740c824 +part_01276_of_02048.hdf5 f237fdec353fb8886119071de2efbe91 +part_01277_of_02048.hdf5 89bf56e49012f9c4be319540ed29aa74 +part_01278_of_02048.hdf5 621d07eb28432bac0ccdc3fe7037b670 +part_01279_of_02048.hdf5 eb77e9b60d55c1ac72dfed8e600a3de5 +part_01280_of_02048.hdf5 23d7cfb28a49748ea7c5fab0675f7283 +part_01281_of_02048.hdf5 fb2ce058ad89dfbb47b9772d19892af6 +part_01282_of_02048.hdf5 9d12c1bbed81973e093654e378122a96 +part_01283_of_02048.hdf5 405a3bc1bdd8aea8700969cbc2e3a91e +part_01284_of_02048.hdf5 c807432cb2cb5e5a5e3a01d97bdf8713 +part_01285_of_02048.hdf5 ad0a87efa8559f989e6dddb0ccbdda26 +part_01286_of_02048.hdf5 df914cb6caa99d817624f07d0e5cf580 +part_01287_of_02048.hdf5 ba6d2a73f75b36429ed6f61e8993ed54 +part_01288_of_02048.hdf5 48271ca2dc156f3b8bdc91a9f0633c63 +part_01289_of_02048.hdf5 049ad730dd701351422def6da4e3bd88 +part_01290_of_02048.hdf5 9e3e5b282d68721d17f9f2aa0bdbfec5 +part_01291_of_02048.hdf5 fb027eb6c2c30c2b69f912ad81b51034 +part_01292_of_02048.hdf5 67e0be12da4c765f4a8f5669f4593b19 +part_01293_of_02048.hdf5 8443276b192dfb1fe8d95501e536d382 +part_01294_of_02048.hdf5 c89342feade773196847601ba402db02 +part_01295_of_02048.hdf5 afb2b592a34704f1253505bf1148d062 +part_01296_of_02048.hdf5 59615ef73e383a3bfc2701e1982bed7d +part_01297_of_02048.hdf5 bd7ed53af748752e1a38dfa0c647d5a5 +part_01298_of_02048.hdf5 5b70e0ec0f2c68bb2c25f9bca73d4ae2 +part_01299_of_02048.hdf5 e67172adfa177abbc292f9c9ac931c50 +part_01300_of_02048.hdf5 5b968e66ea91573ce822717f176206ab +part_01301_of_02048.hdf5 612c305d47b22e574995621e478f1d66 +part_01302_of_02048.hdf5 6377d2b4505208b3d55080af301770a9 +part_01303_of_02048.hdf5 c57643e25d46d906a7c3f99ad144d1bc +part_01304_of_02048.hdf5 b9db7575eff138f39cfbde05bf19a018 +part_01305_of_02048.hdf5 b87b09f14df1ca80f9975abd58bcd30b +part_01306_of_02048.hdf5 c09b60ff5a1c9305339b96af9a2e5bda +part_01307_of_02048.hdf5 1586026fdd4a0399c7fd67b152a5b11d +part_01308_of_02048.hdf5 9aeaec318f6bc71ad613152e5b693bfc +part_01309_of_02048.hdf5 21feb217502cd9937421cc857d502b0f +part_01310_of_02048.hdf5 cc3c30dc20790ba3923a992c6496977a +part_01311_of_02048.hdf5 2c288a1e9786ff949a19601f30ba3836 +part_01312_of_02048.hdf5 7afef7ffa0f4bf2e97444fe23becf457 +part_01313_of_02048.hdf5 803355b6e79fea94fdb5bad3929e43b6 +part_01314_of_02048.hdf5 0853977bfc9cc7e57ab2ab8cdd543bab +part_01315_of_02048.hdf5 b7522ce8a58e205b54ac0db002abb60c +part_01316_of_02048.hdf5 03368ce7fa79717a32c5cb875f69fb6b +part_01317_of_02048.hdf5 a7c1e1b53b7ac851386e83c7b8ad8a78 +part_01318_of_02048.hdf5 4882836c54f0d98fe8d1978b3908b5b0 +part_01319_of_02048.hdf5 26360d926765ebe3ffa9e4bdf6911b7e +part_01320_of_02048.hdf5 c4b771a008b8fa698cde754c94d7663c +part_01321_of_02048.hdf5 eb71fecd2596087f2bfdd0363cfee24f +part_01322_of_02048.hdf5 
980f1c2af0ff85cb9d816cc0f6d7819f +part_01323_of_02048.hdf5 f03911986a0494634422af9e5b846e02 +part_01324_of_02048.hdf5 44fb704c585ca76602fcf5f71e575d6a +part_01325_of_02048.hdf5 bd5a358abe430ea37d9c9ba800307f25 +part_01326_of_02048.hdf5 cc69f851205b3760fbdf7724a5239488 +part_01327_of_02048.hdf5 1618733215fdd9922b52415a1a937469 +part_01328_of_02048.hdf5 9abfbfd4f3b2de2bf6c1dc54ad9fd625 +part_01329_of_02048.hdf5 53de752292b5aabb2cfce812f88ca11a +part_01330_of_02048.hdf5 892f4ad58f1684e20dae77b523a56ee4 +part_01331_of_02048.hdf5 94a2e23801dbd188f41ea6493796df1a +part_01332_of_02048.hdf5 2a4ef58e27e870650f04405793cf542c +part_01333_of_02048.hdf5 52ebf696901b1377273453ead9c38ad9 +part_01334_of_02048.hdf5 78f216b57cad3dcc7c653bb157df6b64 +part_01335_of_02048.hdf5 d6daae29be7b0d670694f1643845a316 +part_01336_of_02048.hdf5 9007c840b60e0f0d26a20184568ab3c1 +part_01337_of_02048.hdf5 7aa50cc234bc1bed3bc044d205484ca2 +part_01338_of_02048.hdf5 90544a22415b16ff276e06e8a5b4cc4e +part_01339_of_02048.hdf5 95a6ae14ea03943f8ea215d723149158 +part_01340_of_02048.hdf5 4e5a2e1190b649cc6d2a144006a29169 +part_01341_of_02048.hdf5 1cb635a3a54f7aca315c5fe54d1ff5a0 +part_01342_of_02048.hdf5 0984591bf6b474fa1b96f61625e4ea66 +part_01343_of_02048.hdf5 baeae783cc6748e273e2d8b2709938a1 +part_01344_of_02048.hdf5 9ae28ec85c85adc2b7a907f234d3e616 +part_01345_of_02048.hdf5 f249e8578dac1c1c3095f854cb8cb8d2 +part_01346_of_02048.hdf5 35b3402273dde425dc3cf447cacdb648 +part_01347_of_02048.hdf5 7271a7b0c9b52624439396e1f9466ec5 +part_01348_of_02048.hdf5 0bf36f8bc5566d16238d049a73d94d12 +part_01349_of_02048.hdf5 bd35a57dbdb3393dc14c9f1fa0a8b1e4 +part_01350_of_02048.hdf5 237466fb2a0974c3f7dff2ebf83d05fe +part_01351_of_02048.hdf5 4e5f8402bbffd8df2830ee60e6b4826e +part_01352_of_02048.hdf5 82f1d457b57de4cace571ef3b1a32077 +part_01353_of_02048.hdf5 37e719fc74696c274953fd05546bb92a +part_01354_of_02048.hdf5 846a07a09c62edf4883d11c5613f1900 +part_01355_of_02048.hdf5 06a63b7b89d7f0e230b03b93da033933 +part_01356_of_02048.hdf5 0369d875c795b88304d2e3805d266df8 +part_01357_of_02048.hdf5 7d9b157212383e1d47b5cdb3bb9870a7 +part_01358_of_02048.hdf5 7856d04702a94410e10080a9e26391f9 +part_01359_of_02048.hdf5 d1cb333acaa3b8dfb6bb997b7b028d2c +part_01360_of_02048.hdf5 ba699a3b6223a08d1785bddbd645a263 +part_01361_of_02048.hdf5 235c833c6309e882e8d59868b638d404 +part_01362_of_02048.hdf5 c5b4f912616fe3580ca576ceb1329bc8 +part_01363_of_02048.hdf5 f36d2694d210156444b1300b492cb4a0 +part_01364_of_02048.hdf5 8ffc08122dc47f188df87dfbce156606 +part_01365_of_02048.hdf5 992d1e5d3355c84031e3a0aedc04df27 +part_01366_of_02048.hdf5 a078c643a22c6622a77623d258ac1d54 +part_01367_of_02048.hdf5 19f2e3da76a1b100587c3e0d9293b980 +part_01368_of_02048.hdf5 29d6cbfe4af21360f2b9395c804ce79d +part_01369_of_02048.hdf5 75183122f98417a170b142815c144c8b +part_01370_of_02048.hdf5 8a60da88451917da6e83e8a5feecf006 +part_01371_of_02048.hdf5 3c056e3695334e9b555d8b71556c604f +part_01372_of_02048.hdf5 bbee3c3e15627fb75d69fe5921ab542d +part_01373_of_02048.hdf5 fd82b7d1b69a79577a6f6dfa392cf3cd +part_01374_of_02048.hdf5 48bbc78f12b12d46d45aaa1f14a8dc6a +part_01375_of_02048.hdf5 e17811f0d36aa999d74d42101c2f4ca6 +part_01376_of_02048.hdf5 307e9e0314d464b53fa5a747505e648f +part_01377_of_02048.hdf5 b8ea5d93cc67e2408d4e48bdf79c6b61 +part_01378_of_02048.hdf5 f25aefb0441e355b00e5685c640c8da6 +part_01379_of_02048.hdf5 d926cacd755b29ef4433ea0296677b6f +part_01380_of_02048.hdf5 fe7585fdfbb794d615ec99caa0ece54c +part_01381_of_02048.hdf5 0161f7e0c87131c03549db75a12641f2 +part_01382_of_02048.hdf5 
8632cadc67b576ab3e5bc83c20099478 +part_01383_of_02048.hdf5 b60deaf0a8f8f1b2c14dddb3c0b44220 +part_01384_of_02048.hdf5 4ef862ff3bb5a8eba9fc541b90049f39 +part_01385_of_02048.hdf5 d522093158df2ac834666a5b7652931e +part_01386_of_02048.hdf5 8cd1e45ae8fddbe73a1f95da2c7147d6 +part_01387_of_02048.hdf5 cb69cce7c38562545f2421972830318f +part_01388_of_02048.hdf5 696ac1cf32c6722f25d46ceff05b4325 +part_01389_of_02048.hdf5 2bd30de9e1be67680587327ff24110bc +part_01390_of_02048.hdf5 d21ad2cf13bfa063471d0211933ce198 +part_01391_of_02048.hdf5 221ceab04b352978ad728218e86c51eb +part_01392_of_02048.hdf5 0adb55f24d24976b1abd48c3fb94e5d1 +part_01393_of_02048.hdf5 e32c890dd5c2aa3b892ca0217708f3f9 +part_01394_of_02048.hdf5 0495098fe4025d1fb037a49a93d49346 +part_01395_of_02048.hdf5 e0f5a4aa05b03b432417e913c75343fa +part_01396_of_02048.hdf5 81d412aaac1cdf7d88f89ca4848c521a +part_01397_of_02048.hdf5 9b2c646658c7388b426dea0bdc80c9fb +part_01398_of_02048.hdf5 417d1a40930857f30a430342b194e446 +part_01399_of_02048.hdf5 3219353623fb14d4d9ddb2dd53a6138a +part_01400_of_02048.hdf5 0716bf9267446570740333b81e2ec95b +part_01401_of_02048.hdf5 471585e5b1573e7d9de166a5627d0cbc +part_01402_of_02048.hdf5 afbb1f808cab59d59df7c6d0f22a0f45 +part_01403_of_02048.hdf5 c68bb386ebd4fb7a3a24d90797bde6c5 +part_01404_of_02048.hdf5 e68662a5151a238b6d0a93ae1f78d8d6 +part_01405_of_02048.hdf5 84da5f53f9a33b2f3099885f6f90457c +part_01406_of_02048.hdf5 8116249e1ae357d2a92588e1959a1da2 +part_01407_of_02048.hdf5 a57c9cdaf06b1ed54bf4ac1bd35bb13c +part_01408_of_02048.hdf5 2c024a8168be2dad01deb3f298d3a4b5 +part_01409_of_02048.hdf5 ccde73a1f1a8fc634a4b312c42146f2f +part_01410_of_02048.hdf5 8d5154cab5e4a4a61f74fdd7f9d9fdc0 +part_01411_of_02048.hdf5 48f8977f655404969e7317818285320d +part_01412_of_02048.hdf5 c95d1048343111a31997194e320c1adb +part_01413_of_02048.hdf5 454734e3ca19ec744ea9f796d6bf35ac +part_01414_of_02048.hdf5 0aae7ece14ff81be595ff75ecfcdafb4 +part_01415_of_02048.hdf5 efbfcaf976e4d71551e5289f762bb600 +part_01416_of_02048.hdf5 9f9c8044793b40fe1874ea9ef6784c52 +part_01417_of_02048.hdf5 ab1f35bd52ba5aba76b89922cc23a3a8 +part_01418_of_02048.hdf5 b64c985b44780ba36c1933f11abf039f +part_01419_of_02048.hdf5 6cf9cd746acc9c9ceeb8ef3b10dfc557 +part_01420_of_02048.hdf5 4be09cacd92562743c8841d05ad46af7 +part_01421_of_02048.hdf5 cb24d2c80033ce82ca2fe25646787d62 +part_01422_of_02048.hdf5 066819c714d5f71b47c3012ff15d6e3c +part_01423_of_02048.hdf5 cb2d4beeff28e483a7b9e94a7e1a1f0f +part_01424_of_02048.hdf5 47d4b83c69da1442017fde5da7ddbb0e +part_01425_of_02048.hdf5 fe827db52961c0c8ef9ee65fe5fcd090 +part_01426_of_02048.hdf5 828caeb40d76becff20199f7c16c0e12 +part_01427_of_02048.hdf5 183f1f75ecdfabb9db90fb5af1287067 +part_01428_of_02048.hdf5 de726593c810606038e8a764950f3621 +part_01429_of_02048.hdf5 5ff1ab82d1b695fdb2ed3faa15e01e84 +part_01430_of_02048.hdf5 12bde6cb376d1254bddc53096b4c94a4 +part_01431_of_02048.hdf5 3702f799ffd5fecbbf9a5e91c1565e74 +part_01432_of_02048.hdf5 b9dd0ac5a8aa7cf9e9a773eace5e5a9f +part_01433_of_02048.hdf5 9a1612fc3045f1ba1a96b53e4c7d3db5 +part_01434_of_02048.hdf5 4a8a5ccedad8fe03cb4057fc29ce0618 +part_01435_of_02048.hdf5 bc3e9e2f6e2b9fc2957b12e47c7b836f +part_01436_of_02048.hdf5 2e6dc72824eeea1755cac28ef6deb90f +part_01437_of_02048.hdf5 e25eb94feee39a3c679f74ba52034b4f +part_01438_of_02048.hdf5 05765de5af1a27d96bd3a7519042a649 +part_01439_of_02048.hdf5 0e8d7c827d26bb4785fe335c07c8384b +part_01440_of_02048.hdf5 2793f5a45139ff99fccb835b888a057d +part_01441_of_02048.hdf5 77342feae55379e366980ac505f624b3 +part_01442_of_02048.hdf5 
a5cfdba01f21a2b34b62819fb070bdb2 +part_01443_of_02048.hdf5 3d91749e2f087e5fcecf72b923bc82cf +part_01444_of_02048.hdf5 a581662a0cd87c0976411d3bcacaa9c2 +part_01445_of_02048.hdf5 2ba21b2ad09fcedda120837137e230ce +part_01446_of_02048.hdf5 c53cc892436823be7976d26242e7425a +part_01447_of_02048.hdf5 564e08f6e41d50583de6cc1c9147a8c4 +part_01448_of_02048.hdf5 284e9a3a495fc1338fe572997202c74f +part_01449_of_02048.hdf5 7b2f313f47779109f137cedcf2bb6a6e +part_01450_of_02048.hdf5 f82bef0c32e6976db8d59bf714f6aaf0 +part_01451_of_02048.hdf5 dffc97d2451d90071330bcd953f3d3d1 +part_01452_of_02048.hdf5 047ec0afdbe85750175a32b4ad6965d1 +part_01453_of_02048.hdf5 6b2f9a931f51d77ad452f3e7d25090d6 +part_01454_of_02048.hdf5 531ff3b05b2b50ee9a4be344eac1b7b6 +part_01455_of_02048.hdf5 5dc586a1f1e4b9edac68871341d56911 +part_01456_of_02048.hdf5 c059afb15cc175138d7a8503050520a3 +part_01457_of_02048.hdf5 62eea019202b27b614f6fad436b5b840 +part_01458_of_02048.hdf5 715ac633cd4c6dd85cd9a1eb1a4a64b4 +part_01459_of_02048.hdf5 ae3fcaa4adbe71932b1ded9f36d4a351 +part_01460_of_02048.hdf5 243d7102fffbb1d3ff0966e6c754c340 +part_01461_of_02048.hdf5 1ce07c792f26bbda86d11b0e00071799 +part_01462_of_02048.hdf5 bc5909a34b5354ddc0bd8102e163c369 +part_01463_of_02048.hdf5 b0c3583ef57d0ec9c876c21f25fd09dc +part_01464_of_02048.hdf5 d3c8f1302319d502e8c6e52d88bb3cfe +part_01465_of_02048.hdf5 591ad09aa00ceaa1478d266043cb526c +part_01466_of_02048.hdf5 d166db9c4f1b82a022f65f5ab0712fc5 +part_01467_of_02048.hdf5 546ef2a3c5cd8e9c1c8f6ee222676691 +part_01468_of_02048.hdf5 8b2638288d87eea57586e726abf4a320 +part_01469_of_02048.hdf5 5338499fea901fed5c1005f774aa5b5b +part_01470_of_02048.hdf5 72c18e6c3d3bb34d710d5ca4c4f5562c +part_01471_of_02048.hdf5 179a4cf7a34205fdadb303359e038de2 +part_01472_of_02048.hdf5 cef2fbbe5b8a1344e4c36989a437175b +part_01473_of_02048.hdf5 c55356f14b7c65a2df193869748d61f7 +part_01474_of_02048.hdf5 fa18fadd52a8b5ed78acfe2bcfc130c9 +part_01475_of_02048.hdf5 434c9d51651ad8f2acae6c48c120921b +part_01476_of_02048.hdf5 ae6b4cc2d39dafc3f7b3dad42d20c700 +part_01477_of_02048.hdf5 46e138083734fe6da14a0fc0816e4836 +part_01478_of_02048.hdf5 44383ee726cfba0300c53f1cec1235c8 +part_01479_of_02048.hdf5 30bb347ffe0e50ca3c1a573655e7d5bf +part_01480_of_02048.hdf5 82d45171841128aa02def03a529c50b5 +part_01481_of_02048.hdf5 56bcc6f89141360b9ecd3e366fceffc7 +part_01482_of_02048.hdf5 ca1aa300444797276fa641d37029a6ed +part_01483_of_02048.hdf5 cf5e97ea6b5dbbd3932ae132aa2836f1 +part_01484_of_02048.hdf5 e4fcfb9c8921ba2a1096cc4591b94400 +part_01485_of_02048.hdf5 2523e9fba417211afde946bea99109a7 +part_01486_of_02048.hdf5 8eb71fd846962d25d701977acb603506 +part_01487_of_02048.hdf5 9a97da503fd7b115b0bed8db011ed88f +part_01488_of_02048.hdf5 6b5ff841c64beae796b2f82494264a41 +part_01489_of_02048.hdf5 70d56cef6673e41e0109cae2cf7a6e96 +part_01490_of_02048.hdf5 2d65618d0bd1d310a153eaeea8b937a4 +part_01491_of_02048.hdf5 ea0dbd6fb4d2dbb6dc683478cfa8bbf8 +part_01492_of_02048.hdf5 0b982f789fe2af48a18b7b6753f80775 +part_01493_of_02048.hdf5 39437c4ad90c3a028c0429eb9b65899e +part_01494_of_02048.hdf5 81cad023b65f5f8500bf49ed7f153a9a +part_01495_of_02048.hdf5 8b362987102129dd0edd9db01a856d10 +part_01496_of_02048.hdf5 b0c3d514ff77013d6ccb5d934fee1e7b +part_01497_of_02048.hdf5 b7b2a48c929ff8b5e2a227843cef2856 +part_01498_of_02048.hdf5 fb054ee92a996a57959f053529a567a1 +part_01499_of_02048.hdf5 dfbf136fb3d787b8a501f3163f4c1cc5 +part_01500_of_02048.hdf5 27fadfe35e49aa02b94758b9ab63e3f8 +part_01501_of_02048.hdf5 7cf985d817bf3bab3e316b396e0fb1d7 +part_01502_of_02048.hdf5 
22aab2810bc157a8ed7ef2edc42336c5 +part_01503_of_02048.hdf5 0b3bf9269cd243291d5e3e5d3053714c +part_01504_of_02048.hdf5 68447d19b8e5c6804dd3e87cf570dec9 +part_01505_of_02048.hdf5 ff91a4f546e3c420785df8690bd2ae55 +part_01506_of_02048.hdf5 e72370c4ae564dd5278e8b6781605bf9 +part_01507_of_02048.hdf5 cdfb56a371af545676ddbcc89c90b371 +part_01508_of_02048.hdf5 d19641267050bf7de7cba5d7a83b2c46 +part_01509_of_02048.hdf5 720c8df2a92d0b42b000456484e3c92c +part_01510_of_02048.hdf5 9dd6c1eb8241c311a487b2a4b209fa4c +part_01511_of_02048.hdf5 0987d90585ede88ed7c4dbdbb6bbe19c +part_01512_of_02048.hdf5 ecae97fcad4575ed433ff8a1371dc2c2 +part_01513_of_02048.hdf5 7cf56cbc0c61af4b9511259a8ef30d0c +part_01514_of_02048.hdf5 bbce9cb109f8572086c8122655a731b1 +part_01515_of_02048.hdf5 67bfa6c9ff3fc687b63d4e39815809e0 +part_01516_of_02048.hdf5 afff8164ada1a81e932846f1610cb1b7 +part_01517_of_02048.hdf5 ec582f637f8de7b9b080ff5d3e970620 +part_01518_of_02048.hdf5 59272c86752e2f2c02234d4005cb96b3 +part_01519_of_02048.hdf5 77f33f380fee21051cf3953c957c29fd +part_01520_of_02048.hdf5 cf04fb39b877693d84afdf3f13f535a5 +part_01521_of_02048.hdf5 202ba0be38a8f3bc0811807cb39192e6 +part_01522_of_02048.hdf5 1ba5fb023a526cea7cfb3ed452a81022 +part_01523_of_02048.hdf5 27a70d331990ce0d5e8197645d42d9e3 +part_01524_of_02048.hdf5 db6d4a47a82abfd2bfb12c164ca251bd +part_01525_of_02048.hdf5 5bd8cd0b2f3278c75361b8ea931209c7 +part_01526_of_02048.hdf5 bebe0b342634b6409f47cb279e94d495 +part_01527_of_02048.hdf5 9ac291e99c30ffb3b18079c2a0db411b +part_01528_of_02048.hdf5 534d1018628dd7e230037e6dd5c5956d +part_01529_of_02048.hdf5 c48db89c4f010b67bc3072db7981f4bd +part_01530_of_02048.hdf5 652f360d4ddeb690fa77395913dcb610 +part_01531_of_02048.hdf5 9b1f2326d94ccee0b4278d4307761d45 +part_01532_of_02048.hdf5 4acd9e1ce7a64fd35b4f8fd78035d43c +part_01533_of_02048.hdf5 c4065853da4547aa5150617c93bbe0f6 +part_01534_of_02048.hdf5 f80486eb8ac962c954967d37d646b45d +part_01535_of_02048.hdf5 2de7bb62a81afd8dc020445a73cac4e8 +part_01536_of_02048.hdf5 3356c174771f0bd168680d71d1f15c44 +part_01537_of_02048.hdf5 ab367b9573d7039c5b6615037377a900 +part_01538_of_02048.hdf5 3f2a34a77719b251e4cce8f88ffcd469 +part_01539_of_02048.hdf5 939803c6e045599a6ca9a71ef2801299 +part_01540_of_02048.hdf5 b006d710f736123608b2cdb10d28465f +part_01541_of_02048.hdf5 40c84c5b0501c774c8d7ac5b5b7cb1e7 +part_01542_of_02048.hdf5 2b1778830499511f3566d2052ad69219 +part_01543_of_02048.hdf5 041cd840b361a4b0bc33e5c055c65f78 +part_01544_of_02048.hdf5 a5ba9a69285924abbce4deaad584b975 +part_01545_of_02048.hdf5 fda454daf01bab49e49e39811269d65a +part_01546_of_02048.hdf5 52e12633144c723d2cae35a749b50c2f +part_01547_of_02048.hdf5 85bd5d90b887337b114cbab10c29b69b +part_01548_of_02048.hdf5 21434d90e10d474c7f33fa7452cc9eeb +part_01549_of_02048.hdf5 308e31c5f678e740f14e6cffa6b4b489 +part_01550_of_02048.hdf5 54e3363a2d08b41add6b44f44476d6ee +part_01551_of_02048.hdf5 8c68c23f6b8c4ee9aa250298b8657639 +part_01552_of_02048.hdf5 3a11c4068b67a3ab56da20062307f564 +part_01553_of_02048.hdf5 43e27401bdd1a781f3e52a62ef1c10b1 +part_01554_of_02048.hdf5 0894f85362521d6ab43295417e73b18c +part_01555_of_02048.hdf5 97d9bb0a61478e169f5888ba9f8df466 +part_01556_of_02048.hdf5 194ce59c86102d7ebcd875dc5b9cebc2 +part_01557_of_02048.hdf5 420b34f3769fb247a8f0b5f5a3e8fe4e +part_01558_of_02048.hdf5 36a6620aade4a417a93f79c902edc61c +part_01559_of_02048.hdf5 1b63c085bdf5c9a15b5aab31aca5619f +part_01560_of_02048.hdf5 bc56cdeb140e2615800037253c8679a4 +part_01561_of_02048.hdf5 fd4449fd9fde4302f6b54e8b9a586be0 +part_01562_of_02048.hdf5 
896222438e1068c94ad43a02d7bc9e5d +part_01563_of_02048.hdf5 28f31cf17b144a23098dac883a74a8c2 +part_01564_of_02048.hdf5 13efdd394c398e3946d25c20748264b2 +part_01565_of_02048.hdf5 d41471273f572c7d4da81a00626e3c2a +part_01566_of_02048.hdf5 9a4dd3b4afaecfa05da2222ace14f1df +part_01567_of_02048.hdf5 e6abc18f268e0678e0ef12eb30ace9c6 +part_01568_of_02048.hdf5 92691786a59195a91780e4187d9ea6f9 +part_01569_of_02048.hdf5 5307b284158a7ace58bb7963275ee12b +part_01570_of_02048.hdf5 7f3e42b1839244f52ee70570e6213721 +part_01571_of_02048.hdf5 b1969914860a82555d430b146fd863b0 +part_01572_of_02048.hdf5 a880736e68df94dcb9f0000f1140d57f +part_01573_of_02048.hdf5 755ed03676cc31f447c11cf3aabf5744 +part_01574_of_02048.hdf5 6761b92ca5d1aa3a008f95ef3f1f92c2 +part_01575_of_02048.hdf5 9adc3a35c1f68163c768df3d3205e4b1 +part_01576_of_02048.hdf5 b888c31889e696d246740f097bd113bf +part_01577_of_02048.hdf5 4fdd063d6ac7a2167ab1b738256901d9 +part_01578_of_02048.hdf5 5acaa98ecb44188c66f8d48622d9fba9 +part_01579_of_02048.hdf5 95802d0fbac6708a63da245f05e57035 +part_01580_of_02048.hdf5 fbfd99ba70a660a07feea18436173fbf +part_01581_of_02048.hdf5 bbec2f2948fbfc528b11465ab08c0947 +part_01582_of_02048.hdf5 2950faca288119bef8164cf220830b64 +part_01583_of_02048.hdf5 2133d932e616cae55833ced0f6eb5083 +part_01584_of_02048.hdf5 26e10367a6d405773499fb73359314eb +part_01585_of_02048.hdf5 182506596a039e6ceaade98c0e4fc5c6 +part_01586_of_02048.hdf5 c70ca9cbecb1e7037fa60546cf7fad12 +part_01587_of_02048.hdf5 8835960dee55ee55e6c347b6613174aa +part_01588_of_02048.hdf5 32764306648a33d71935b99563f136f8 +part_01589_of_02048.hdf5 e31adf86693c1a3c9e2546a572a8fe62 +part_01590_of_02048.hdf5 53a3a6acebebf2d62e848c3695cde159 +part_01591_of_02048.hdf5 8a5882d032ab91ce8e132670ff87e0ee +part_01592_of_02048.hdf5 3d64ba642c038f7ff0c155a3d0b553d6 +part_01593_of_02048.hdf5 3292448af3b85bd46efb864acee615f3 +part_01594_of_02048.hdf5 659c43bd6a825b23a5e6b5551187e396 +part_01595_of_02048.hdf5 30df24b985a9191002d21d1c7ae0c5d7 +part_01596_of_02048.hdf5 fb9bd1cda7efd2c3d6e504d75abe5711 +part_01597_of_02048.hdf5 c63aab62942d03c72ba48e1a6c558963 +part_01598_of_02048.hdf5 4f164e2404d594cba7efddba95b6b8c5 +part_01599_of_02048.hdf5 d8781f4ede7c290a9bb14b793ba81ab4 +part_01600_of_02048.hdf5 02a7407cd823de3099d1527cdea1674e +part_01601_of_02048.hdf5 7159b431198b07b119fe85bb3ff7f9a7 +part_01602_of_02048.hdf5 fdbd6a63410884b4cf20dfa78f5309e9 +part_01603_of_02048.hdf5 b7e2cdcbbaaa91cc4fc07534467973db +part_01604_of_02048.hdf5 2e184962ae5d95e8cacf8fb4a57d1104 +part_01605_of_02048.hdf5 8bf1ad88c12193f708b1312086819628 +part_01606_of_02048.hdf5 4a40da30bc811a989141a932fc2e00c9 +part_01607_of_02048.hdf5 55cd3031453d5f4f8aeec1d2bbd80a35 +part_01608_of_02048.hdf5 5a2c56fe7709414be8becb2160f9a90a +part_01609_of_02048.hdf5 86929a3c6ce61ef17d9a8318795c82f9 +part_01610_of_02048.hdf5 f7461c152939ab8add690ea679c5f338 +part_01611_of_02048.hdf5 35da52f1b035c593311542e04c3b5a3d +part_01612_of_02048.hdf5 e1305937fda5cd08b8b613c960082f65 +part_01613_of_02048.hdf5 078486eff9e4405e2a17a9a5eb95dcca +part_01614_of_02048.hdf5 d94341e111318b682254c4a71f7b864c +part_01615_of_02048.hdf5 ee89289f48919c36f97b91da5857141a +part_01616_of_02048.hdf5 15684af2368b73cf3b1cc11336b9244a +part_01617_of_02048.hdf5 b6e32b935d416610c9a822087c134496 +part_01618_of_02048.hdf5 ebcf1992ad00456cd522bfbfeb917be0 +part_01619_of_02048.hdf5 9402b94602168a6474f7eee9b95301fd +part_01620_of_02048.hdf5 7f0a98f968207161205df40128d5661f +part_01621_of_02048.hdf5 d33a0d24f333e7563d3df75ce529528a +part_01622_of_02048.hdf5 
cf92b792c7eedc95b4c8e5a8ec61446d +part_01623_of_02048.hdf5 a727e4b2726b6eede5992feaa64b2090 +part_01624_of_02048.hdf5 bf73ee941ad85f67c924efb09c065be5 +part_01625_of_02048.hdf5 f5e3ba5d0478e0cb12b57e7eab933a5d +part_01626_of_02048.hdf5 3a5a7b671ab4d66e937d358dba91776b +part_01627_of_02048.hdf5 c5701163aa75be840db6b238693eaeb9 +part_01628_of_02048.hdf5 b2620e2821a993c05925c31200a3803e +part_01629_of_02048.hdf5 2c6196500d05b5d25e75d1a0c9166a3e +part_01630_of_02048.hdf5 ae2a5f829728c68cf5e446ba3b6e852a +part_01631_of_02048.hdf5 e1c12974aec5dbe80039396700b376c8 +part_01632_of_02048.hdf5 44f352fa829bd25d82cc1dcfa14cb7f6 +part_01633_of_02048.hdf5 2761ce8042e28ee5e33bdfbdc2a48156 +part_01634_of_02048.hdf5 e77c23191519446ff190b52db5f5a95e +part_01635_of_02048.hdf5 16958ec6c247047db49bcafeff36e643 +part_01636_of_02048.hdf5 37006aba43bc5e6e5f540259978c4875 +part_01637_of_02048.hdf5 b9ddd5a82011be709a8d67e45e3a0e4f +part_01638_of_02048.hdf5 87f811b2b7d60a2bbacc2a5ac800e095 +part_01639_of_02048.hdf5 8852c14c1528efadf189294b07c347ad +part_01640_of_02048.hdf5 092a7cca9c7ae12b0cd7fb17fad1c5ac +part_01641_of_02048.hdf5 994fff14c28ce1c0adcf377c0d64a40d +part_01642_of_02048.hdf5 603231620bf1e205bae3404ba51f0f09 +part_01643_of_02048.hdf5 34751f5d99d2d707e647be0b6253425e +part_01644_of_02048.hdf5 299540554b386e923b10d41164d7e315 +part_01645_of_02048.hdf5 7abeb93e28c42171f28e5c9767feba80 +part_01646_of_02048.hdf5 da5ec42db945886a463c142af167499d +part_01647_of_02048.hdf5 bf95e345e37ce2726fa0a24151a013c2 +part_01648_of_02048.hdf5 f6753c41c2c8237d134986d3b8dee9f8 +part_01649_of_02048.hdf5 4face88f6b70dcdcf15b1a00dc8b62dd +part_01650_of_02048.hdf5 baaa4fa27e45c56530a1af42da26f386 +part_01651_of_02048.hdf5 ec37b66ca34e76e1715c8c305ec6bf7a +part_01652_of_02048.hdf5 f169be00fca6847ae4cd37f100af2067 +part_01653_of_02048.hdf5 13ee42bdecd5bc500eb2a6e3704f0b65 +part_01654_of_02048.hdf5 cf431d39eaf45c4f9e622092fb68bb85 +part_01655_of_02048.hdf5 b61dc1da9cac07bf7959d9667987c3e3 +part_01656_of_02048.hdf5 67f7d5a45d83d0cf319aef19ffdb22c6 +part_01657_of_02048.hdf5 60ce884ecc61e443b7b2c183bae68674 +part_01658_of_02048.hdf5 bcf1dd49bbd556e7f8f3fd0ad6d0d5c4 +part_01659_of_02048.hdf5 1b6e68510d25c24c86dbdd594a54b661 +part_01660_of_02048.hdf5 e3d778042906e5e20a1d93fd79079ae5 +part_01661_of_02048.hdf5 805a26887c792c6f2ef5ea0c18058723 +part_01662_of_02048.hdf5 1ede28cb8cba255465507444f45d4ec0 +part_01663_of_02048.hdf5 8a7995d26ba9272b1d0c804c5bd3ab9b +part_01664_of_02048.hdf5 47db4e266271d46d6792987ab9c25184 +part_01665_of_02048.hdf5 193f90708063bf1d2175a5e80e973435 +part_01666_of_02048.hdf5 d68b7983652e4c02b54cae1bdb48bfec +part_01667_of_02048.hdf5 482ed1c0c828dd0b627e1f433fef5e94 +part_01668_of_02048.hdf5 2e73cc7f61d09d5540158a9339ad3604 +part_01669_of_02048.hdf5 2f9254729131b435de9b0c9b07abf2b5 +part_01670_of_02048.hdf5 9a72ba3b91474eac6d03bf6195925864 +part_01671_of_02048.hdf5 b1b15560715eb4998a747c5b57d6f8ed +part_01672_of_02048.hdf5 afadcc50f633f662e14aa1db3529e88d +part_01673_of_02048.hdf5 7b5c16a7781ccf9117d345b4dd4254c3 +part_01674_of_02048.hdf5 bf26349938a260d6a377cd5288d30355 +part_01675_of_02048.hdf5 6767536b95cf002bdc7a593dc081d542 +part_01676_of_02048.hdf5 6d6ad20a961568a28d656d968e9f5d06 +part_01677_of_02048.hdf5 7e873ddd2bf184c4849b69376818360c +part_01678_of_02048.hdf5 3b8c1e889e769cc8f30be303246eec7f +part_01679_of_02048.hdf5 181273ba6da9f994b10b819bc486edb8 +part_01680_of_02048.hdf5 33e45b440887a6ea0f023c1ac09e8e31 +part_01681_of_02048.hdf5 d9ce37a4f2bf228aa630e00418c2b635 +part_01682_of_02048.hdf5 
1ad896f58c4f1e8e66d3dbbe3df01bc2 +part_01683_of_02048.hdf5 2fad3168787a3e663b3f38cc6df92a3d +part_01684_of_02048.hdf5 486285c6835285521393b1b1c284b555 +part_01685_of_02048.hdf5 596bd8caa719badef00b763a533ae4bb +part_01686_of_02048.hdf5 0ac8b3b6cac546c5356117f505828517 +part_01687_of_02048.hdf5 7af7112cb0008b66ceb3e998ce4baac0 +part_01688_of_02048.hdf5 24e45cbdf481b080c5477a3a3d0936e2 +part_01689_of_02048.hdf5 b98f70affa33d72af7426ba2c2d4386d +part_01690_of_02048.hdf5 ab4ee495cfdf94cdd59f3ea59122714b +part_01691_of_02048.hdf5 35354ab40ab17d2c3665caa395b55d5f +part_01692_of_02048.hdf5 027fe5ee51497feb7ba0bbe45944b70b +part_01693_of_02048.hdf5 698328a452d38d6e1822cefb350db505 +part_01694_of_02048.hdf5 882473e21e9a0a46b0aaecc85c8f8c2d +part_01695_of_02048.hdf5 9ce45ec4365dc13eca1a36b59f3978f5 +part_01696_of_02048.hdf5 9cfec80f6f8eeb641d94a432642927f9 +part_01697_of_02048.hdf5 315915f5933af0bf275643cb10381fc0 +part_01698_of_02048.hdf5 b65068f212f853a6fde829855908479e +part_01699_of_02048.hdf5 4bb925b371380f9997ab5b9244179309 +part_01700_of_02048.hdf5 7662b5c281dbf78562a3522b092b69c8 +part_01701_of_02048.hdf5 cc6093ecd78c3af14473354cd1918247 +part_01702_of_02048.hdf5 0c2cfb2fce54866687b5b23f2a58fe09 +part_01703_of_02048.hdf5 f115c367be8dfdebd635452c88e10572 +part_01704_of_02048.hdf5 1f05be8ede1dee9f11d6948e8a6f5a31 +part_01705_of_02048.hdf5 89543596dcc66bd6c798440a73a8fe7d +part_01706_of_02048.hdf5 61d10306c575793ec65c6a64283463a0 +part_01707_of_02048.hdf5 86a322be827cbb5603a96bf91f03fe9a +part_01708_of_02048.hdf5 39fff1d9c2b21cd5743525bd63aecfde +part_01709_of_02048.hdf5 3915ad99572319a75e34c2e2546dc315 +part_01710_of_02048.hdf5 86f8adce268db75c2fbfec2b8c259d3a +part_01711_of_02048.hdf5 f0023a86cea70c7e68dff1b91f23a4f9 +part_01712_of_02048.hdf5 0587d4091d30b138249a695f4e957b98 +part_01713_of_02048.hdf5 04d39661566ed7e0d066c6f32e5bd717 +part_01714_of_02048.hdf5 cce158e21e8f0def950dfd465f147dc9 +part_01715_of_02048.hdf5 bc9eb39ef4ac4802b4d49db9d6da35d8 +part_01716_of_02048.hdf5 aafb683e8b0c679857d8a67c0d2bcfd0 +part_01717_of_02048.hdf5 67cbae04baa9aa3243e195a109543c02 +part_01718_of_02048.hdf5 f7f5bac9b6911cded24ae0174a6b2756 +part_01719_of_02048.hdf5 3fb6e64b04c7a1eb94b898475e71e320 +part_01720_of_02048.hdf5 bfaf0232e28f73b32314787de0144b14 +part_01721_of_02048.hdf5 4215359a16760ba0501f03a4f9a2e1d2 +part_01722_of_02048.hdf5 0a7f88f815834a1b461db9e4f9b226b3 +part_01723_of_02048.hdf5 de31039a537a54e80b1926c6b6506029 +part_01724_of_02048.hdf5 3aafda8202435492a56c3707c95d7fb0 +part_01725_of_02048.hdf5 5d27ea506273a381b069a8f617066fb0 +part_01726_of_02048.hdf5 c30a9741b553c86f288ba63f068178a1 +part_01727_of_02048.hdf5 14c72e6b337b82ca8c50e4565b8662ee +part_01728_of_02048.hdf5 06b7064bf065217d859eeec696431c1d +part_01729_of_02048.hdf5 90ad7fe26fd54ee6df676948068b6335 +part_01730_of_02048.hdf5 0843f1f03cdb3bd7078f938f19c7453c +part_01731_of_02048.hdf5 22ff6390d56ea9da9373c59aa23dd0a3 +part_01732_of_02048.hdf5 9336d3a1606148f3905608b0da0c992a +part_01733_of_02048.hdf5 30a4fc0703bf41a42c411ffc590be626 +part_01734_of_02048.hdf5 103578b099994e91424d1d9b8c049c80 +part_01735_of_02048.hdf5 8945548037a9d727ec3bc1714d4b8feb +part_01736_of_02048.hdf5 07fce8adada57d80d692e901f9f39af4 +part_01737_of_02048.hdf5 ec646677bbc759732a80dc2cecbb03a3 +part_01738_of_02048.hdf5 ba9b6ffd7b00f51f80e1c909e3392fe1 +part_01739_of_02048.hdf5 6296832cecdbf5787c8c297327312d46 +part_01740_of_02048.hdf5 b24cf16eff14f6d54ecc5bc95a6c5e99 +part_01741_of_02048.hdf5 99b6297959408b8078551e2da18a8ffc +part_01742_of_02048.hdf5 
0afb493cede5f0b2dde498a6f178e4f1 +part_01743_of_02048.hdf5 ff17ac1c0421e6366a91ab49e4e1a8a4 +part_01744_of_02048.hdf5 7347d2b5fb2dac48531af78e38de1601 +part_01745_of_02048.hdf5 10ad6fb125532290484b057a58e1da4a +part_01746_of_02048.hdf5 7e805affd2abcf3dd0ebec0b00ceabec +part_01747_of_02048.hdf5 11d95a7e53f311a40b18c1ac0e57060c +part_01748_of_02048.hdf5 1eae6ffa046c932cefd32bc46be02231 +part_01749_of_02048.hdf5 8b2bc9a71d1bcb851c0e13ed2cf1f4a7 +part_01750_of_02048.hdf5 63e984a47f1e3eb69c7ff88d3701ce78 +part_01751_of_02048.hdf5 40d8d685e78c00e790db3ce47dad1629 +part_01752_of_02048.hdf5 7b56f5e86d02c3b28949daeaaebfae10 +part_01753_of_02048.hdf5 e25bc7aa8991b7bb9a88b5665fbbc6eb +part_01754_of_02048.hdf5 f64c6ce43511dcd10c511793e64d8189 +part_01755_of_02048.hdf5 85524234c15023f1f5d901f976cadbd7 +part_01756_of_02048.hdf5 8cc5d245b239ab1eff41aa2db0afa2e9 +part_01757_of_02048.hdf5 25e9c061854e7be4ff434dc70dce686f +part_01758_of_02048.hdf5 0e252557ad4c9c935a7c920ba589889b +part_01759_of_02048.hdf5 97f171d966c46c54ea34ee538af517ee +part_01760_of_02048.hdf5 31ff3d744a33f41efab39d5795496fb3 +part_01761_of_02048.hdf5 7f54aa9cf7b40335eddaa9f76a858dea +part_01762_of_02048.hdf5 63bfd9fe38f1806c49f796a85974ecae +part_01763_of_02048.hdf5 e0ece0f6f8a363c0cf3596525e215c53 +part_01764_of_02048.hdf5 aa2ecb72b69c5f6b9cbf341412c4eea9 +part_01765_of_02048.hdf5 a19461ab82f0f61626785127de846f3c +part_01766_of_02048.hdf5 b864faf89ef1a51747f4956c30aa7fda +part_01767_of_02048.hdf5 d5c6af00d2b0c40b53851d014acb97cd +part_01768_of_02048.hdf5 a5bb2b573bd13d68c620c6d79ae02c5f +part_01769_of_02048.hdf5 bb016244800c4e5daf5894b40d5c67e2 +part_01770_of_02048.hdf5 616fe25536d6c3e46e757b2a75671729 +part_01771_of_02048.hdf5 ef19308b03df9b5cb8ac61f0cac03c32 +part_01772_of_02048.hdf5 fd7927b64d13ae46088be67fb74af300 +part_01773_of_02048.hdf5 bef8c9b9846f4006494e1b01ec0589a1 +part_01774_of_02048.hdf5 86e8eca9ee38ce70fd37a8d064061d76 +part_01775_of_02048.hdf5 181fe7f52e5261f891c7be86e54bae11 +part_01776_of_02048.hdf5 afb39ae73fdea6608f41193856058f7d +part_01777_of_02048.hdf5 d9fb6d48b9ba11b027b59741f6010f1d +part_01778_of_02048.hdf5 a51039407ad52bc8463d7870bda0b22c +part_01779_of_02048.hdf5 c53bb93287d9b849613f973caa518841 +part_01780_of_02048.hdf5 03751b66c30b0a86920f31b1b767999f +part_01781_of_02048.hdf5 bcaa303a867e5b4b012b989a1996ad21 +part_01782_of_02048.hdf5 98abb55af177cf2f5a259ab9c16c89d4 +part_01783_of_02048.hdf5 8d2faf63098fb4928989ddabca0ee624 +part_01784_of_02048.hdf5 e091fd13c6ba72c8abf2bb145916d030 +part_01785_of_02048.hdf5 2dc54cd0a07f329c29139d52c30e2ca6 +part_01786_of_02048.hdf5 a3372c4ecb17131089cf8dbff8b28207 +part_01787_of_02048.hdf5 874aef4107b9e5f2e2efb3faeecc7f8c +part_01788_of_02048.hdf5 1c9c2e8ec0fb7be12c9df332a4f62bca +part_01789_of_02048.hdf5 fc1df51ec576f6e798b78650f3c73a6b +part_01790_of_02048.hdf5 f8c29c2055b976e1fa9d5f95cec8de99 +part_01791_of_02048.hdf5 bffe62f912a696c716875d25ac13385f +part_01792_of_02048.hdf5 fe0765b1e92728bdb62d21a2436742b8 +part_01793_of_02048.hdf5 9fc6b806e14fbfae7dce9266830c00a4 +part_01794_of_02048.hdf5 984ddea719ae9ea77e2e8e2ae7aa3f52 +part_01795_of_02048.hdf5 235a6a6b76b83b3907380391c11324dd +part_01796_of_02048.hdf5 1c6b6c470eefaedec9ae54ddbdc92164 +part_01797_of_02048.hdf5 561e8b625f7f277a2863582179343721 +part_01798_of_02048.hdf5 d213165cf8f20edeb086debe4fee059f +part_01799_of_02048.hdf5 f1b850b39236f8318ddace446a1d9c56 +part_01800_of_02048.hdf5 f6d385350f66e013b5efdd64843bfdc2 +part_01801_of_02048.hdf5 2235dbb9f27e123aedb471ba50a5294d +part_01802_of_02048.hdf5 
33b4c853e468431a22172d96b30f5336 +part_01803_of_02048.hdf5 20c9869c2178ea64d84082d0f62e9efb +part_01804_of_02048.hdf5 c9b1545b29b2f0966a349109b30c5606 +part_01805_of_02048.hdf5 627dd6f44d1e04ff77b0262c39d0d62d +part_01806_of_02048.hdf5 1bd3a016fbbb7e91078ce55400ce9bb9 +part_01807_of_02048.hdf5 a2d90a5da33649859f9cab08c76b5432 +part_01808_of_02048.hdf5 2cc7aaa8dfa4a43681520ff7b3a3edf4 +part_01809_of_02048.hdf5 1be842d8fe9ced0b2a60b8d47ef4e56a +part_01810_of_02048.hdf5 4bd64dcba90c79c3ef8a1a09d3cb0ebf +part_01811_of_02048.hdf5 b0f5015e0192ada1055f9365601b7cc1 +part_01812_of_02048.hdf5 1de07e82201d0ffcf10d2705d3fde97a +part_01813_of_02048.hdf5 87356d178575c588ccc611fc0619d0fd +part_01814_of_02048.hdf5 496c7108c6f59fa276e8b7fc8d059b4c +part_01815_of_02048.hdf5 588d9b3928c69165fcc7ec4cb903cf88 +part_01816_of_02048.hdf5 0eefb473446db7650ba37e8f115a8956 +part_01817_of_02048.hdf5 f180314df1978e8f860118c0c03a6f87 +part_01818_of_02048.hdf5 65424ad4c7efe62e2e69a8a8e95abb2b +part_01819_of_02048.hdf5 dca51a2fba8972f7e0431a3eb6037b21 +part_01820_of_02048.hdf5 e2735da1cd7596fcee50478494fca3d9 +part_01821_of_02048.hdf5 4e7f71385ecf26f66bff28df19c24a46 +part_01822_of_02048.hdf5 dee499406975b283ce39bc472c6d3a58 +part_01823_of_02048.hdf5 ed6ed68a366f0336af6c5e9e80bf83dc +part_01824_of_02048.hdf5 41bf1e22cc0bd71c8aae809bd63f6391 +part_01825_of_02048.hdf5 891b9a1cc6760a90c4ddfbcbb4a764ba +part_01826_of_02048.hdf5 1f2b2042dbaa3cc5fe7d55d8950ea8a7 +part_01827_of_02048.hdf5 7bdc6a59c5e84f31941241114b7e7fd2 +part_01828_of_02048.hdf5 ca24e1db14969a81513357b56aa838b9 +part_01829_of_02048.hdf5 470f38750551da20472465a834428b07 +part_01830_of_02048.hdf5 0dae98d02adbd6e8d4489a58213903df +part_01831_of_02048.hdf5 827e13795d01043b559f584f3d6c0da2 +part_01832_of_02048.hdf5 7de926bd9aab525922f1c576aa52b1f4 +part_01833_of_02048.hdf5 c7fae5ba17507087ec7ac1969bb67e8a +part_01834_of_02048.hdf5 453a5081c0c61fabb03b5ed585652fed +part_01835_of_02048.hdf5 2eade111a358eb8baa547117497ee8a8 +part_01836_of_02048.hdf5 9a3b45f45d923e6f7d03398c89a7e306 +part_01837_of_02048.hdf5 cc58d9f612852dfb202c92dbddc26182 +part_01838_of_02048.hdf5 6fea7302c15391ce122dbe646ba989cb +part_01839_of_02048.hdf5 dd8d66d0479075dbf596b8f34fc31f3a +part_01840_of_02048.hdf5 d717b6a1c60d4d1b74772b8736ea5381 +part_01841_of_02048.hdf5 3d3b3c9271c398adf7fa877cf831319e +part_01842_of_02048.hdf5 69563f3b169a2798b8c07f10362f8acb +part_01843_of_02048.hdf5 2d2d131422b3eb6f0891e056a8d0dbd6 +part_01844_of_02048.hdf5 813d408079617ccae9473807b4c0200f +part_01845_of_02048.hdf5 2cf993d271a2f6edefea275d17658535 +part_01846_of_02048.hdf5 900be99c68039e7ff37157e8efaf8e17 +part_01847_of_02048.hdf5 c6d0ff46d847e7e5becd550d420a5c81 +part_01848_of_02048.hdf5 edeec89079c47bbab5f936e3237c66fd +part_01849_of_02048.hdf5 014ead6c2cc6b399b99856876b4ba32c +part_01850_of_02048.hdf5 12c294293e46b83871c5e3e85bece25e +part_01851_of_02048.hdf5 9949ed925b3b76a979be69a1f84d9a99 +part_01852_of_02048.hdf5 aeb8d5b89b3676940a64c938e37455da +part_01853_of_02048.hdf5 f3cf1b3ccf849e5fbacf4d0a72f9e68c +part_01854_of_02048.hdf5 2e9333fe5c33050c6fc04882398b8921 +part_01855_of_02048.hdf5 d880bfc57a86be17596544d4032c3ab1 +part_01856_of_02048.hdf5 656e168c626d4548b8033e4b3cb47add +part_01857_of_02048.hdf5 fa72b103a7672f7f9193a476b94f4d6d +part_01858_of_02048.hdf5 cd09705657e5e98c7ec6d823d68175ef +part_01859_of_02048.hdf5 19d30aa11bf86245cdb89260ce89b920 +part_01860_of_02048.hdf5 8a1c2609b3949563fc4603e5c4e6b218 +part_01861_of_02048.hdf5 bc0795d6a7353569a2c620bbe6d11e77 +part_01862_of_02048.hdf5 
cc6b3467293761ca3a1f95adbcc8eb3a +part_01863_of_02048.hdf5 07ba8b7808b6a4f4fe466dd85ed00a0f +part_01864_of_02048.hdf5 306700b2a8f68c1170f09a561baa587c +part_01865_of_02048.hdf5 4946ff017c5de55c8befbdd823595314 +part_01866_of_02048.hdf5 f0e991820497d125eae56fb9546279d1 +part_01867_of_02048.hdf5 7945a064dc8f62df5b8d1ce8bb17dbc6 +part_01868_of_02048.hdf5 59d7f22f729e262ea4862624e7fcbaed +part_01869_of_02048.hdf5 51e6de124077707bb7621a77ce49aa9d +part_01870_of_02048.hdf5 bbea43427a89872663eb805e38b793a1 +part_01871_of_02048.hdf5 e0567bd6ba1890b0f3859fa76f67155f +part_01872_of_02048.hdf5 616d44662e4c2fdd19007ccfa8e28ff4 +part_01873_of_02048.hdf5 f0cc8c0d425d22bdb9db3b114f9728b0 +part_01874_of_02048.hdf5 8e1b2d947b4e79c99f704d6a3c78c675 +part_01875_of_02048.hdf5 080e84ceea86a0e307cb2a54f44edd5a +part_01876_of_02048.hdf5 a31a74a22ca68742b2bf0844541237ff +part_01877_of_02048.hdf5 fd0dcafad62bf35b43a0bac63b3be667 +part_01878_of_02048.hdf5 daa543d2f1ff274add19aeb9e08ac7ab +part_01879_of_02048.hdf5 b14bc72ceec636145580a99b01e6ae74 +part_01880_of_02048.hdf5 8a8255ddf0dff78bed67ba5204ad4ce2 +part_01881_of_02048.hdf5 aa5c674a0567680322e38d2a76b95190 +part_01882_of_02048.hdf5 6b69751ec34144db6e301fc1b08f202a +part_01883_of_02048.hdf5 49d64db91d16cf9224c6ed7a2fbc1a76 +part_01884_of_02048.hdf5 a5bc69d234b621fca01154a5ccaaccb4 +part_01885_of_02048.hdf5 9b207f25a243fa607845e6b45ab4db6e +part_01886_of_02048.hdf5 ad41735d50db1a6c164eaa76c7428447 +part_01887_of_02048.hdf5 1cc7c4f1e01cd5399db43c0494d29369 +part_01888_of_02048.hdf5 0a421220a5de79ad912a8fcd1a8b7acb +part_01889_of_02048.hdf5 bf006bb00a726cea01bd7b6ec0679076 +part_01890_of_02048.hdf5 0dccdce0bdfe14ead0ba1e6e45f875bb +part_01891_of_02048.hdf5 18b23e85d4452c218dbcae5499d4356f +part_01892_of_02048.hdf5 ccfba42fc56255afcbd6e597247d93f1 +part_01893_of_02048.hdf5 8d10f35fe173873fea8412df22b64d73 +part_01894_of_02048.hdf5 88c0e838ff4e9d284a9b2be6f848c98b +part_01895_of_02048.hdf5 bec64959a193c9e47afa1bc9931bb7b5 +part_01896_of_02048.hdf5 e07bdd276aeedf39b233279da2e20b7f +part_01897_of_02048.hdf5 cd86b180bfb0dcaac6345546a9afc1f6 +part_01898_of_02048.hdf5 49173746dd2ad982a6652574556f23c0 +part_01899_of_02048.hdf5 1590a2b8ffb1afcdf3bd5f47148325b7 +part_01900_of_02048.hdf5 9657f3666487d904c5006541eb279654 +part_01901_of_02048.hdf5 a1197be049efc774b02bcd3d899d684a +part_01902_of_02048.hdf5 8283643244d76f491bf72bdc682ce9e1 +part_01903_of_02048.hdf5 80bb60fa154f0b9d1b1c58938721803f +part_01904_of_02048.hdf5 38b6da267f066eba1dae774acb386e57 +part_01905_of_02048.hdf5 4ca868ec8edd8985aec03b0c88043043 +part_01906_of_02048.hdf5 898701acc4637c421cd7a3029c6a9c9d +part_01907_of_02048.hdf5 2d976333b412d84e99c73231a4753092 +part_01908_of_02048.hdf5 25f7d594118f17458c4b1ef9f5d0276b +part_01909_of_02048.hdf5 6239690a7d97a390d2aeb9570938752d +part_01910_of_02048.hdf5 e206816141941c8980dc0523d43f842a +part_01911_of_02048.hdf5 86a9345c40d783c9a62848774fb99a26 +part_01912_of_02048.hdf5 92b12512e6d161b8670b21617debf1a2 +part_01913_of_02048.hdf5 9407c68e489b7b8c8c65803932a0c1af +part_01914_of_02048.hdf5 ca6824f533d7664cb8919179b22be46e +part_01915_of_02048.hdf5 a1b5cd0dc1282b66539b5bbeff6c9bfc +part_01916_of_02048.hdf5 45b8a0e199ed16fd883d37cc685745fe +part_01917_of_02048.hdf5 09ce3226804bef98e6f09f632968beaa +part_01918_of_02048.hdf5 77262b84c87576c41cae8994f79a9894 +part_01919_of_02048.hdf5 2fb10cf0fd2482af6129d33c8d6aef65 +part_01920_of_02048.hdf5 f89a9277a757ea78634c8caaa54e47fc +part_01921_of_02048.hdf5 842861017dd84569c2ca41d388dba8a8 +part_01922_of_02048.hdf5 
7ceff2d4006bca27eaf36aaaf4fc5edc +part_01923_of_02048.hdf5 412f9d1f47e759f9f7ac6f059a254dad +part_01924_of_02048.hdf5 d6d5601c7ac43bf6c3c9b67fdc680699 +part_01925_of_02048.hdf5 0f8bfa2b5fb2e2a61341cc9d43177740 +part_01926_of_02048.hdf5 4bf2103164cbe8ba6563fec99643ebf8 +part_01927_of_02048.hdf5 22896e9c90c8873667cb9cfbc96d3548 +part_01928_of_02048.hdf5 3d10b164daa30c2c0513011d43971cc2 +part_01929_of_02048.hdf5 4b8cc61dc4d34e69ab44a05036e5f13d +part_01930_of_02048.hdf5 c642593eebcb0eae94729d30e10d0dd8 +part_01931_of_02048.hdf5 4b6b2cbc7f64178da52a51b10e64977a +part_01932_of_02048.hdf5 bda7e9b39a1a87c2dc4d01baf859bdbb +part_01933_of_02048.hdf5 bc0de7753f0b9d177d9c1db4ceaa5bb4 +part_01934_of_02048.hdf5 4e889b3459498dc46aeb1b30e5967434 +part_01935_of_02048.hdf5 37f48b973d17c9af15c8d604da9892ae +part_01936_of_02048.hdf5 a3c43bb62b00059213967737100e0161 +part_01937_of_02048.hdf5 12217812e0346db94a3c3c5044bd0865 +part_01938_of_02048.hdf5 cf4ed02058fd098074eda423b9cbbb53 +part_01939_of_02048.hdf5 25848d2fdef82408de246d01f0db38ab +part_01940_of_02048.hdf5 f71be4ade49b2deeed04f208265c0576 +part_01941_of_02048.hdf5 2d07de30f7f16574a5b9480a1b8b6aa6 +part_01942_of_02048.hdf5 5583c0c4a67484553b35fb42af5ef66c +part_01943_of_02048.hdf5 c5f34a3625c3a40a0a0a64b9bd8332c8 +part_01944_of_02048.hdf5 798f822d17d46b88e32a815e0ee78f97 +part_01945_of_02048.hdf5 30ee498c4955562c30fed8d39e6a9741 +part_01946_of_02048.hdf5 2a67f2df07d9dfb6566cc23fe23b721f +part_01947_of_02048.hdf5 48b3a307b81f16f582a76444219bfdd6 +part_01948_of_02048.hdf5 37255e9467bd5b14824a3f0bb51d312f +part_01949_of_02048.hdf5 e4f890437b18bd9023c74dd31b12e64e +part_01950_of_02048.hdf5 cf3fac2710bd4557a51c9a508751ca33 +part_01951_of_02048.hdf5 639019c0d695a18f5acff0f5268a12d6 +part_01952_of_02048.hdf5 b17f234c9f3b704020d22de803bf0447 +part_01953_of_02048.hdf5 b40252cdce57be628ffac3b12e6f9efc +part_01954_of_02048.hdf5 3ddabeaf8385608c0ff13dd5ec629c57 +part_01955_of_02048.hdf5 1941f1f756e909322e084836ad635935 +part_01956_of_02048.hdf5 190503679dd60665e35288ef550792ac +part_01957_of_02048.hdf5 70b48e27d21aca71ec67deb8895e7740 +part_01958_of_02048.hdf5 d11a3976f3b670214e7935f5a8f761b3 +part_01959_of_02048.hdf5 f2eff575e0e9ab0ee03fe66c277679c7 +part_01960_of_02048.hdf5 a3375c5916cf42e4d70154e837f13e0b +part_01961_of_02048.hdf5 5b9bb0a78facd9e71d45f2b99e62a4e7 +part_01962_of_02048.hdf5 ffe4bad33ad71aec3a18689b8550a136 +part_01963_of_02048.hdf5 253cecee537f081c1aee74d6473273f5 +part_01964_of_02048.hdf5 019f329114f8e1e8f5f88e933155322d +part_01965_of_02048.hdf5 c053e906999ec660ef10dcc64686cce2 +part_01966_of_02048.hdf5 f9285db0aafde3fb2e362b4fd83ea0fc +part_01967_of_02048.hdf5 d2ce5c5185e65da6b543c5c019dd754b +part_01968_of_02048.hdf5 737c320b5c89d0e7317057bda6d1a753 +part_01969_of_02048.hdf5 5e1480e58b6601ab6ccbb8403ee2801c +part_01970_of_02048.hdf5 9ca8725fa7d3ab383c830e80bcc29128 +part_01971_of_02048.hdf5 f38f96fd2c9ad4d6455dbe63de3198ee +part_01972_of_02048.hdf5 97b53bb6c1a5bdfcd90755ae21621c5c +part_01973_of_02048.hdf5 1ff5b1fa69cb2577a4e4c6c4a64b8872 +part_01974_of_02048.hdf5 1f6703b3efffcc7fb8e236d2f453cf15 +part_01975_of_02048.hdf5 b668ffb96ccf2ae51d7f47402751a4ca +part_01976_of_02048.hdf5 2ba4ab80b4405b589c884e38ac7986d8 +part_01977_of_02048.hdf5 029cb44af593f59fb941fccfca1c5959 +part_01978_of_02048.hdf5 65e994b4a89852775a9e3a58b7b1b379 +part_01979_of_02048.hdf5 c8033283c392307990eb39394e8fde03 +part_01980_of_02048.hdf5 003fd3daf8262d524fae9e2d6f6a2811 +part_01981_of_02048.hdf5 a16488d1a6e3ae9b2e5966791f552762 +part_01982_of_02048.hdf5 
7b0a9b876ac15b80414503fae8a13f0d +part_01983_of_02048.hdf5 9464f735b05a692b81d9aef518e16794 +part_01984_of_02048.hdf5 a16a01f72120346383ac993ba0ffa102 +part_01985_of_02048.hdf5 ac86b435f8424e7e5ab05a470481d316 +part_01986_of_02048.hdf5 b49dbdaa73e7d7be41525efa49658193 +part_01987_of_02048.hdf5 13f47dce7002a2b523a41a2191fcfb89 +part_01988_of_02048.hdf5 af8af8739b2b0345a132771bba78951d +part_01989_of_02048.hdf5 34d90359c9e4b5f990efff65cd52cb87 +part_01990_of_02048.hdf5 717194b1dcdb68e24173eba23b529f55 +part_01991_of_02048.hdf5 9f33c1bc074a7aafa25839f62ec3484b +part_01992_of_02048.hdf5 2f0795f20eca1519572af6a1a34ada89 +part_01993_of_02048.hdf5 10a0e2314b80d0801a1e23d68250f579 +part_01994_of_02048.hdf5 ba3b7ea24ac4833830e03fda10b80b44 +part_01995_of_02048.hdf5 ea1a7abb04f05691422845fcc02ad2f1 +part_01996_of_02048.hdf5 14f74ecb91b06cda2ab30fa907121c6f +part_01997_of_02048.hdf5 354893f700873c242ae6b20776f21b48 +part_01998_of_02048.hdf5 34007bf3c46d61e6d9f2683e4135227f +part_01999_of_02048.hdf5 008ac60cdb8a0f9818d44889a305de5b +part_02000_of_02048.hdf5 32d14831a20b37dd1948c62975c63e4d +part_02001_of_02048.hdf5 9f049a4f35543d44559ef3c3e525c5aa +part_02002_of_02048.hdf5 c0d9324bb67d6fb2fd22661d0dcb979c +part_02003_of_02048.hdf5 03faf5e57a148ff93a6f23c9bb1d6ed6 +part_02004_of_02048.hdf5 3a73a1a8b50928921fc369bdcc31d6ae +part_02005_of_02048.hdf5 468b1ef7f82a3cc5958d65f1c9b6b12b +part_02006_of_02048.hdf5 3e410e4d5dc33cf751206459184a109a +part_02007_of_02048.hdf5 3e1351399b96e27bae6d4c80668aa824 +part_02008_of_02048.hdf5 910e9185d02c87cf46016d6b33ed97fa +part_02009_of_02048.hdf5 e9c77a3de25484621b86937ce9d6d8ae +part_02010_of_02048.hdf5 a3858d63e008b22c1ae358c29ecbac53 +part_02011_of_02048.hdf5 55638099a566f58adf70a8c0f74052f6 +part_02012_of_02048.hdf5 d088e7abf0e7514df21bea02f76f0052 +part_02013_of_02048.hdf5 63418092f9dc3ceeb42d13ec68b3928f +part_02014_of_02048.hdf5 b91bd9ca0c0dc0d12b7bf8c520d3c007 +part_02015_of_02048.hdf5 8acb790295e7a86ba5093b1730f50d36 +part_02016_of_02048.hdf5 422b73bd62b92aad5980b312dba0ef96 +part_02017_of_02048.hdf5 07e851c31ca5360a2e6c18dd364d2084 +part_02018_of_02048.hdf5 a3ed98a0e5aaf297d008a6c96b656e99 +part_02019_of_02048.hdf5 043f581ebc10b36a8f8c37371ed68be4 +part_02020_of_02048.hdf5 ac6d4adf47ae457777c8e89edc6fee28 +part_02021_of_02048.hdf5 90f87b20c80d10f04f65e0b34dd1a3e5 +part_02022_of_02048.hdf5 6f877a0899e56c195da45904b3e3095d +part_02023_of_02048.hdf5 91d1bec3965f43827321f6578bd8eb30 +part_02024_of_02048.hdf5 072ba9ace12660ce3cd4a8c7ba828d35 +part_02025_of_02048.hdf5 0a1aa1d6350b57a120086a48a65f419e +part_02026_of_02048.hdf5 6ac5e2eededca1b4540352004a99f0b8 +part_02027_of_02048.hdf5 5c1d3e9fd478febb5303e93f98ba1322 +part_02028_of_02048.hdf5 ee81ad2f6e9e53f275718776846c1b63 +part_02029_of_02048.hdf5 fc58f42e556366a79d659bb8ff4cec16 +part_02030_of_02048.hdf5 4de7736ade66ed0960f05b95718d0714 +part_02031_of_02048.hdf5 2f5f9da3402e73a380c915cf9c5ae520 +part_02032_of_02048.hdf5 4776c75fb148c33ea27d69c8d98b143e +part_02033_of_02048.hdf5 e1d5050a46f118b1c3e2d17d3bf93dc3 +part_02034_of_02048.hdf5 04da804973b0ce05765f225ac7ad2647 +part_02035_of_02048.hdf5 7df04d61ba0a7827560ccc3ea7dd4b83 +part_02036_of_02048.hdf5 b113b455a554b19b4db9e20d9b9175c2 +part_02037_of_02048.hdf5 1688eae9276722f68dcf61ec5fc7bf74 +part_02038_of_02048.hdf5 8f067ab77026682b3182d5132a11f4ef +part_02039_of_02048.hdf5 9520103dd92a4289b64c05b1664e059f +part_02040_of_02048.hdf5 cfd6ec8684e6b9ebfa851c052ed40e60 +part_02041_of_02048.hdf5 fd71332d50d8a210244e1d14ede8982b +part_02042_of_02048.hdf5 
db45c3e4d0ff1a51e7ab7d2e0575f24d +part_02043_of_02048.hdf5 18c76d4ef7aa25917ff660f6796caac1 +part_02044_of_02048.hdf5 86394565e33b034101a6d7db3391459b +part_02045_of_02048.hdf5 0c49d26c3b04a3d5f90206d07eb79681 +part_02046_of_02048.hdf5 7e02b581f991cbf9404db65839b86b9f +part_02047_of_02048.hdf5 7460162aaa6952ee5192e6fb3c5d5805 diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/4320_shards_varlength.chk b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/4320_shards_varlength.chk new file mode 100644 index 000000000..b259aeeaa --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/4320_shards_varlength.chk @@ -0,0 +1,4320 @@ +part_00000_of_04320.hdf5 e25b1a34b8d14179b2774d1dbf3b7561 +part_00001_of_04320.hdf5 f9455f720575b25ec923a71caa657f84 +part_00002_of_04320.hdf5 f1869d6460efcfa5eaee73a08fde097c +part_00003_of_04320.hdf5 f1d7681e60083964b161303e0a256dd5 +part_00004_of_04320.hdf5 75aac8516a7a829cbaed67bad36cbc28 +part_00005_of_04320.hdf5 b2bd9b31850f99e64e6680cc8932148c +part_00006_of_04320.hdf5 fc58f25b2adf87536b1a08714c71f754 +part_00007_of_04320.hdf5 c9282fd31d177aa73d0f0a32180d6369 +part_00008_of_04320.hdf5 53bd94f5447b257d4922a9c11f5bd5c6 +part_00009_of_04320.hdf5 33b2242fc9480ae9cf249cf1db1c3867 +part_00010_of_04320.hdf5 3b9605ac0fc4a1a04bab12435277d560 +part_00011_of_04320.hdf5 1cd23247637a2bcd2063ec700a7f1356 +part_00012_of_04320.hdf5 06de420a3134e36395080fe46440b158 +part_00013_of_04320.hdf5 989e82ab2250e6d29dcac5b4f46fca87 +part_00014_of_04320.hdf5 48ed1af82b21b3b5082ff16b902fd6b3 +part_00015_of_04320.hdf5 92174da07d07ab2a2123ed73ff37f157 +part_00016_of_04320.hdf5 b0d8a4f8d93ff5f272fb99a6a2ebf8f0 +part_00017_of_04320.hdf5 2d3e8cd590abbd29aac4b3ff9059f933 +part_00018_of_04320.hdf5 97232601baf199d083e4e4f7a4889e65 +part_00019_of_04320.hdf5 6add965d541a2983afc497ede51907b0 +part_00020_of_04320.hdf5 3f24191bcbec6f3911698f30f75a71fb +part_00021_of_04320.hdf5 0c447ce3615f12528186f8e27aa930df +part_00022_of_04320.hdf5 afdea4b784d11101b3356f06abd45b57 +part_00023_of_04320.hdf5 4b1450b2f51c6e8f0053afb91852fce2 +part_00024_of_04320.hdf5 08acee7e13a65180acdb6867e6b5c886 +part_00025_of_04320.hdf5 fbf67820211c18a780f2f788773d617e +part_00026_of_04320.hdf5 1132dba5d6d262d3dfd07ae02176e1d8 +part_00027_of_04320.hdf5 0ccae432ec9d1d2c3a3a1fc9ab430f1b +part_00028_of_04320.hdf5 e3846c5d3690e9e18c88bd93af80b140 +part_00029_of_04320.hdf5 0e2982e29095d9c4c7ce42a2cbc927fb +part_00030_of_04320.hdf5 f531138dc9dda7f92c623b744717305a +part_00031_of_04320.hdf5 5646b9f3d2d0cbe4220fd9bb657ea797 +part_00032_of_04320.hdf5 9bf1b18a5fba9f554c0f4229b8dc6248 +part_00033_of_04320.hdf5 075256e562fb7d19d8755ccfcd91d25a +part_00034_of_04320.hdf5 f86167071c48fdf9555de3dd904a46e8 +part_00035_of_04320.hdf5 3ed3da292471a7fa365cbabfa6f5ffbb +part_00036_of_04320.hdf5 76d04c157f331f714aef108a50feea43 +part_00037_of_04320.hdf5 65b5978805538396f340144e56c07006 +part_00038_of_04320.hdf5 91e0b49dc321cb35a911d13ff0c5d47c +part_00039_of_04320.hdf5 5d151e5616f5214d6a4abbefe99e033c +part_00040_of_04320.hdf5 3b569a0a8761f59764ca6773c93d3a92 +part_00041_of_04320.hdf5 47eebb1738dd0c693b823588a2186045 +part_00042_of_04320.hdf5 243eeb9da6bb36c8ba6029d1d4bec890 +part_00043_of_04320.hdf5 02ab253e5b7888257ed3046ef076d10a +part_00044_of_04320.hdf5 de0751538fede407a0946789361af8d1 +part_00045_of_04320.hdf5 61b80c66a8fb3aeeb6d35e2576c67deb +part_00046_of_04320.hdf5 c6ae2efd9a0baf399f0a0b94cb27d1e6 +part_00047_of_04320.hdf5 b90743bd3b188b094445bd69768514f6 +part_00048_of_04320.hdf5 
c99bead2307d9ab13e7638bf4936d82e +part_00049_of_04320.hdf5 d9930b22acb3b33b13ddf530bf5bb555 +part_00050_of_04320.hdf5 401d616f075abf222dcb7c1dcd74d3cb +part_00051_of_04320.hdf5 ac5181741cc0fab2f1727f45b04110ee +part_00052_of_04320.hdf5 3310185e8aa42e06bd5b146294aef9c0 +part_00053_of_04320.hdf5 ff8fdc9bcdef4cd69c677b5afb865933 +part_00054_of_04320.hdf5 574c4715cd7291628394f48fe872f301 +part_00055_of_04320.hdf5 4523a702e987d813295c1111faafd87a +part_00056_of_04320.hdf5 86ca07acecb16aa5418046a190a911db +part_00057_of_04320.hdf5 c978b91ee84aece86421ea43e006aac8 +part_00058_of_04320.hdf5 4e1ce3db897d44cc6771353e68697d02 +part_00059_of_04320.hdf5 8ee42610d87b463d81a56439be868668 +part_00060_of_04320.hdf5 208b6b83c460e01b33b2fc6f0ac2f7ba +part_00061_of_04320.hdf5 6770fe918250632ae3b7f96ff1e74eb2 +part_00062_of_04320.hdf5 623a22d4e6dca4c35e3e62170fe8a3cf +part_00063_of_04320.hdf5 841e13b5e0996a4519cc62b7216fcc78 +part_00064_of_04320.hdf5 1fa39f65542df27aa85fe0c2a9f7976b +part_00065_of_04320.hdf5 40639d444689cbf22c8fa5cd79feeb9e +part_00066_of_04320.hdf5 f1f639e0a819ecc5d46990c0b337d5a9 +part_00067_of_04320.hdf5 2443f2f77a4dbdadd87e6c7c038dcf67 +part_00068_of_04320.hdf5 38a60d4cdb9806a169cce2b0eb24fdfd +part_00069_of_04320.hdf5 9b47cc8ad792e912260f1ca708c2a3b2 +part_00070_of_04320.hdf5 2407d9d2946818bfa713982151ef72c3 +part_00071_of_04320.hdf5 52123d586e608142be1bead1f8517c4d +part_00072_of_04320.hdf5 5da6cd9c8618d327299e0cbb9dba28ac +part_00073_of_04320.hdf5 4972499b37f6573d923f0d569dee24a0 +part_00074_of_04320.hdf5 d7d92a688059b726cea20f59a98e6495 +part_00075_of_04320.hdf5 3c2b1021296ec44535e3060c544fe94d +part_00076_of_04320.hdf5 ddd0a1b7bb8992fd0c1995d7e21a27c6 +part_00077_of_04320.hdf5 330fdd327901ab31f0d76163e6239552 +part_00078_of_04320.hdf5 0df7bbfecb3149a2a7a683f3836ac3bd +part_00079_of_04320.hdf5 a5024429b4d83ff30bbd8b9e43422b61 +part_00080_of_04320.hdf5 48616e8e1eab2c0ca071b2a2a26d69ba +part_00081_of_04320.hdf5 e7068c828447ba5f381897f6c5cce1e6 +part_00082_of_04320.hdf5 6cf6e9451992ec035dcf7d745f01531b +part_00083_of_04320.hdf5 8283396780202430c7a0e8c0c40610b1 +part_00084_of_04320.hdf5 9c5e34b47afb5d5ea4157712a2f7c2c0 +part_00085_of_04320.hdf5 b5c118945567d95aa060a46bee565d9b +part_00086_of_04320.hdf5 7f599edc96b7a391d4d75af2de8a5098 +part_00087_of_04320.hdf5 3affa53b9ff0363360f9e776bfbe985a +part_00088_of_04320.hdf5 81f2b73578fc90a075f224ef844ea70c +part_00089_of_04320.hdf5 cc4bead0ce2ef4be55fb7c2174ab2d08 +part_00090_of_04320.hdf5 d9f0bc7d4009c7e7b46552c064516e44 +part_00091_of_04320.hdf5 b7a9180add4ca8511e7a8465ee14d37e +part_00092_of_04320.hdf5 2e2f89bb852c692184d7ca5492e05840 +part_00093_of_04320.hdf5 42de1e45439475a6b5efd64320a7a5e7 +part_00094_of_04320.hdf5 d0bc2f8826cd006956bbb070d87ab138 +part_00095_of_04320.hdf5 aaeb51ee885848251c0e163f9d318d0d +part_00096_of_04320.hdf5 6446db369475510acdd2abf8069bc8a2 +part_00097_of_04320.hdf5 cfb2bb254e8dad8e0b03649a4a391b49 +part_00098_of_04320.hdf5 ea09e112722c822cfe5fca0686593ab6 +part_00099_of_04320.hdf5 e1d1d337131a6110a6f2807b01bf420f +part_00100_of_04320.hdf5 e967bea11271c854933c08b07e2bfa7a +part_00101_of_04320.hdf5 4a2864505fb1cde933f8c50ced2a2a89 +part_00102_of_04320.hdf5 22ac7a896f49ff5b168e7841f38ea81b +part_00103_of_04320.hdf5 9eb925afb8b3908e513d94147f7d3094 +part_00104_of_04320.hdf5 de2acf866f1b6b6a46b6515f7374efe0 +part_00105_of_04320.hdf5 468843434371b13204e27932175147ee +part_00106_of_04320.hdf5 32231875fe2a0052108e674f9dd7fd52 +part_00107_of_04320.hdf5 a468e13bb06210398411363807600449 +part_00108_of_04320.hdf5 
b941404f7ec965a01be46f256c1a36ed +part_00109_of_04320.hdf5 c81b803a20d98c96b998cf3570f5e496 +part_00110_of_04320.hdf5 7cd1301094d79e8ee68e04153150e1e6 +part_00111_of_04320.hdf5 9693b2392b7587ccbc5f14c7e6087700 +part_00112_of_04320.hdf5 b748889c5f85fe471cbf6bd2b64cbe80 +part_00113_of_04320.hdf5 9cd07e597f9dd75ba39d1796c67cd643 +part_00114_of_04320.hdf5 77d7263960a40fcca131da1f826fcbfb +part_00115_of_04320.hdf5 4a323456afb77118cf9ce0fb3e299023 +part_00116_of_04320.hdf5 0d16e21fcf71d89a8cf1b4b484100ff4 +part_00117_of_04320.hdf5 4d4662748380141895a158504d429c1a +part_00118_of_04320.hdf5 b72250c3f0a9f64758922e33280678f6 +part_00119_of_04320.hdf5 78bf668e39f88e27dea7fa52903e6ab2 +part_00120_of_04320.hdf5 d96aa3615c58a5140d45b4ef6d4bf24e +part_00121_of_04320.hdf5 72fbe981134c707ffb613b1f39bb8cd8 +part_00122_of_04320.hdf5 646bafcd4f732dd02ac95df0a21ef56c +part_00123_of_04320.hdf5 d827d736a4a65a74b1171bbb3839aa81 +part_00124_of_04320.hdf5 45dbd74b1b996657eff5344f2494ad37 +part_00125_of_04320.hdf5 264a76bdd91c1f5c28edb38227d445c1 +part_00126_of_04320.hdf5 0b4c43365cc4337caec44582cdb22966 +part_00127_of_04320.hdf5 5efc8293b1ecbedc4635e4868c7b05aa +part_00128_of_04320.hdf5 243c5b59ae4c887a81e9c7010cb3e66b +part_00129_of_04320.hdf5 ced647656b15fc1268c8292d561285f3 +part_00130_of_04320.hdf5 966613bd41ce7c33fca891e4c7b99b5a +part_00131_of_04320.hdf5 47f5a79f1cdd99592e505a39cf5c7097 +part_00132_of_04320.hdf5 2efe43e931759986fa50a70be9202194 +part_00133_of_04320.hdf5 afc01a74de0e9c1b8a73d30673f1afd4 +part_00134_of_04320.hdf5 1e915f8ac89962bb473bb09dc1793b42 +part_00135_of_04320.hdf5 e85cac2ca107c44e5374184f60ff7ebb +part_00136_of_04320.hdf5 ef22b4a6cf5b19699a185cebd213f96e +part_00137_of_04320.hdf5 26d9c409624c19afb8b45330bb9081fd +part_00138_of_04320.hdf5 abd1e8cf7526c164b4bef32b63a67be7 +part_00139_of_04320.hdf5 bdd4a120a76e801e7e09ad4b5be05d33 +part_00140_of_04320.hdf5 a3bf4caae27a8036275d42354a3b5019 +part_00141_of_04320.hdf5 e29e9d5ee3b3d4250c9dfb0c454dc0d7 +part_00142_of_04320.hdf5 708f07e17ea5cffbb45742474a98bb55 +part_00143_of_04320.hdf5 29e17700c4457ded2bbeb268c7aaebf1 +part_00144_of_04320.hdf5 70a9b19799d860703dcc62f0f425512c +part_00145_of_04320.hdf5 8ce6da8f19189ea41b5c43f3e0212adc +part_00146_of_04320.hdf5 991603f016e07361635fd8392ea12982 +part_00147_of_04320.hdf5 d81fe5d0bf228665d6d5df9010feca5b +part_00148_of_04320.hdf5 a87d99474da6b3f2c686df67d7e8b04d +part_00149_of_04320.hdf5 e6ed8fa6f29aedf658bded44a1b8593f +part_00150_of_04320.hdf5 255387b8e882ae64645cf133eacb0aca +part_00151_of_04320.hdf5 dd739642e8bc436c9aab0c6dae8a1159 +part_00152_of_04320.hdf5 265e94f0c34a2366bfb88283731bcc77 +part_00153_of_04320.hdf5 ac397b9964648ddfed19bde7cfb9033f +part_00154_of_04320.hdf5 2b8803417bd7fcbd6a6ab27fe8868f2c +part_00155_of_04320.hdf5 5754ce84ea90bdd0a72ce48c554ae42f +part_00156_of_04320.hdf5 e22de3c459acdfeb0e9d1b291f53e4f7 +part_00157_of_04320.hdf5 d5603cb68f15d00068b03c795a3232fe +part_00158_of_04320.hdf5 0a02ac2d97b2241473fe49bf2714bace +part_00159_of_04320.hdf5 432dbb49bae3d1f04ba7137c063ec46a +part_00160_of_04320.hdf5 1a5ad65c82b23380050ec10970f57ace +part_00161_of_04320.hdf5 6e1a311e9b471e6c6ff2d3afa586ea97 +part_00162_of_04320.hdf5 b2c5dfabe9685546e815a3632c10a835 +part_00163_of_04320.hdf5 7423f21dea856c7f6a30ed7564524355 +part_00164_of_04320.hdf5 77d75764269eb10c1b3052d30b01448e +part_00165_of_04320.hdf5 265de81dd66da39252f8676121ec6059 +part_00166_of_04320.hdf5 2e60dfee293c731c1d067473fc9fa954 +part_00167_of_04320.hdf5 0bcd772ce3feeb4bca9b10f4eaac732c +part_00168_of_04320.hdf5 
69fdf9abc4a99f0eae54cedc193b89ef +part_00169_of_04320.hdf5 0dc7576099588814da3d6b08c6936e25 +part_00170_of_04320.hdf5 214287504028f892d6e042bcf71e8ccd +part_00171_of_04320.hdf5 4824c586f5b179c125eaf3102f5c3295 +part_00172_of_04320.hdf5 340dd66e910d8fa31eb0a5b0f33517ce +part_00173_of_04320.hdf5 564e9775f556e921996f18df1e5091b2 +part_00174_of_04320.hdf5 45aeeb486fbd31c98677bbc8f5cfdb15 +part_00175_of_04320.hdf5 d114c76776d634774d30a91392f4adfd +part_00176_of_04320.hdf5 10a7c888aae098a35e0bd337ddcec32a +part_00177_of_04320.hdf5 e24a81410964d3f3999b25d3d3aaab06 +part_00178_of_04320.hdf5 f265ab82e078c1e86520818e4175fdd1 +part_00179_of_04320.hdf5 c7ad5f5bad555946673b1ee03565103c +part_00180_of_04320.hdf5 92fa5571a8cc0b1b62ee3245ceb2ef87 +part_00181_of_04320.hdf5 7916e37df259dba26ed8f88ad4cd5d2e +part_00182_of_04320.hdf5 6bf9c255b55021ff6766d7b34a74f4f1 +part_00183_of_04320.hdf5 4dbab19c3552408adc07eebb98bc8798 +part_00184_of_04320.hdf5 6bed6eb51dc2f8d145fd8388fcaa3a9c +part_00185_of_04320.hdf5 9fe81bcc23365b335dc2530334e3a1ba +part_00186_of_04320.hdf5 3436c8effad5f5115b0db1576f2a11bd +part_00187_of_04320.hdf5 f416e311b04c75848f975e84059ee8d4 +part_00188_of_04320.hdf5 7271eced0b5868ccfb97258f165bda1b +part_00189_of_04320.hdf5 59ca23598fcbe63696ff476e74848278 +part_00190_of_04320.hdf5 25e7f4ff9a729b83a97ae1ae2774ba31 +part_00191_of_04320.hdf5 42d3e24802b945f52009871bc421ba1f +part_00192_of_04320.hdf5 573040cdd6d51c54d07407864bb881da +part_00193_of_04320.hdf5 f8e26be2485553690e0ed0a4cf89394d +part_00194_of_04320.hdf5 bd32a7e9b68cb00b363edafbc4c13ca1 +part_00195_of_04320.hdf5 468a84cc6ba3ad403011972e5d357593 +part_00196_of_04320.hdf5 7fbf28f7b1f4eaed6024b12fc240e63f +part_00197_of_04320.hdf5 6a630193c61c40d864f85be58b24a414 +part_00198_of_04320.hdf5 d724f892fde9bc8979c0e4ed03e1967c +part_00199_of_04320.hdf5 0be5d99f7a75f45b65904aeabcd0d5e7 +part_00200_of_04320.hdf5 bad20bd59fe939f6ca4b9a5f77ff24ae +part_00201_of_04320.hdf5 073899c9d62f4d1121fb9d60c2ce0eb8 +part_00202_of_04320.hdf5 6048232411eda733df1b2722a009d328 +part_00203_of_04320.hdf5 317cadacc4904c448450d35225d641ac +part_00204_of_04320.hdf5 9cd9a64e622c89593f3126a112461c6c +part_00205_of_04320.hdf5 e027b0079f295dae997062443eced07c +part_00206_of_04320.hdf5 2813176adc6ab66b7a4e8bad62083efb +part_00207_of_04320.hdf5 3c5d2ea3b3e637a83d002128399a222a +part_00208_of_04320.hdf5 f150dc1557ed9c1ac528d461c7ac1e3b +part_00209_of_04320.hdf5 fc79a94c924a1768f6ea95335d6f9ef1 +part_00210_of_04320.hdf5 9c91727210350740c2e5228a0fee6fbe +part_00211_of_04320.hdf5 481e8be50d11b012af02305991317f34 +part_00212_of_04320.hdf5 ced679a978771dafaf7266683d094fb7 +part_00213_of_04320.hdf5 fe9daaf9461c5b5b3cd83ce8118de4d2 +part_00214_of_04320.hdf5 009e61478dde75a42a2a18cd02f48199 +part_00215_of_04320.hdf5 5475c065c81485723eb3e0e05824a71e +part_00216_of_04320.hdf5 0f7f3464b19ece93c37fdefac1bc4dbb +part_00217_of_04320.hdf5 58b5bfc9c50148ddac81cea1b9e1bf26 +part_00218_of_04320.hdf5 39709abe5d492de4a68f33046b78dc85 +part_00219_of_04320.hdf5 08bf06fb3e0075bd03a05afa71a9875c +part_00220_of_04320.hdf5 1c8e96e6facdc09ae1ae207d509cb735 +part_00221_of_04320.hdf5 b20f5faddcbd0df4541b4d8fec9b9c49 +part_00222_of_04320.hdf5 ede49671ac6221cfc0c35ba0d280d96d +part_00223_of_04320.hdf5 15b9e3bbd37902943512a267218f7538 +part_00224_of_04320.hdf5 d22308ac8a2674d31de95d80b77d2c1b +part_00225_of_04320.hdf5 1550e09f4764ad8184d257bd64558ef4 +part_00226_of_04320.hdf5 fcb00fc647fb7ddadd54e9c1121ab72a +part_00227_of_04320.hdf5 51c4b332d1a8d0cdb563886fa1b3490d +part_00228_of_04320.hdf5 
ead794d47aaca72cf3225fdd05e12cd4 +part_00229_of_04320.hdf5 5e65cd2cfc38e1d0264a99ff916626ca +part_00230_of_04320.hdf5 e96b9121e36d67d08bed6c9a8453a75e +part_00231_of_04320.hdf5 3d5f376d124af0110fccdba16abfb6f3 +part_00232_of_04320.hdf5 f203570ddd5dea5f208210baf7bee4e2 +part_00233_of_04320.hdf5 fda9cbeddbee35acc74534c664600d37 +part_00234_of_04320.hdf5 a3324194889aa2e95d48a4cf15954792 +part_00235_of_04320.hdf5 835cba720b7fa426c1f03a027fed75b6 +part_00236_of_04320.hdf5 c5f925b5946bc777d6da5b2c48f7a274 +part_00237_of_04320.hdf5 d9f4eb059bdf272399e6b4763e74ffb7 +part_00238_of_04320.hdf5 5302baecf31f27438b891413fecf5d0b +part_00239_of_04320.hdf5 79b9f12baca2f271e0c7a86b85034ada +part_00240_of_04320.hdf5 8bf4118433ffad5b492221c9c31ca3ac +part_00241_of_04320.hdf5 beede6a440000518bbb4061f2ac9aff5 +part_00242_of_04320.hdf5 a53d97e0f760c768e867768baf104a64 +part_00243_of_04320.hdf5 88f6364663fcb8d3d197f6d22c1441c2 +part_00244_of_04320.hdf5 8e3eba78696c2d80aaad25c52a158113 +part_00245_of_04320.hdf5 c84db5dd0d850c852f48715512ef42be +part_00246_of_04320.hdf5 e75d5e166118535b4a9b1e2cd4a1d362 +part_00247_of_04320.hdf5 0bef41606a06070b5c051134e378559c +part_00248_of_04320.hdf5 043ca3fae1adda11dca0db171ecbf345 +part_00249_of_04320.hdf5 5cf20b58660d8a9eaaaf1d97da2c5688 +part_00250_of_04320.hdf5 c7932c84027e1f722e6d7f51ac757094 +part_00251_of_04320.hdf5 c2dae497c7aa5e456160c84cb9ad4591 +part_00252_of_04320.hdf5 9c0950b26348747cc8235a425c5d003f +part_00253_of_04320.hdf5 5ac8b8c452859bfa06aff22431edda05 +part_00254_of_04320.hdf5 0117bc8f8e533fc886ba9ea50aa4e360 +part_00255_of_04320.hdf5 514ad57c9a86da4f27a3e327e71cf266 +part_00256_of_04320.hdf5 58808d08196f7a2db00e73e7d1467a44 +part_00257_of_04320.hdf5 6fd4289dfdcc445ea6164d23ecfe0e78 +part_00258_of_04320.hdf5 865e6482f7d4a92bd33c1f5096099f50 +part_00259_of_04320.hdf5 cd4a899dd40ff1b163be79358a6650c7 +part_00260_of_04320.hdf5 c284dbfa8c23bb2093ac5429af3c3d42 +part_00261_of_04320.hdf5 3b9e0228b5b05ab4f42b5b1fb68d9553 +part_00262_of_04320.hdf5 b27fa1c3278804fe4e0ab3b0bc2f23b2 +part_00263_of_04320.hdf5 31f1b45ac151f1b791095bf757afe08b +part_00264_of_04320.hdf5 11b46a2cfc902d3be75a56eefd40c302 +part_00265_of_04320.hdf5 2750f1f9ed37f930ae965f0ffaf2af9f +part_00266_of_04320.hdf5 ae08e62d208127c5e8bd7fc5f6e8c5ae +part_00267_of_04320.hdf5 aa261d1f4ee4b5d7eb02c87c8844cdcc +part_00268_of_04320.hdf5 f402d8e69f6c432611794eefdf4e162b +part_00269_of_04320.hdf5 7ec553c84f93abb8b12af9709afd8988 +part_00270_of_04320.hdf5 7c038b5952b8bc0440a6fd15dc98c8f2 +part_00271_of_04320.hdf5 b64737cdd3a32c1663932f678b747847 +part_00272_of_04320.hdf5 6d5f7b798b463ebe5a1d4d2547362653 +part_00273_of_04320.hdf5 998d7baf3392fac73ff029a5d139d97a +part_00274_of_04320.hdf5 6f7535881985025d37ecfecacc235637 +part_00275_of_04320.hdf5 70f7c506750096f289ef12d885bef338 +part_00276_of_04320.hdf5 ba3d21e94db7eb1807aa660b145cdc96 +part_00277_of_04320.hdf5 c1fe52fc99a31f57536378507cdc582b +part_00278_of_04320.hdf5 2462a85bd3e2d0bb86d72195715a6de0 +part_00279_of_04320.hdf5 cd97aa15172d275c025712ebed6d18ff +part_00280_of_04320.hdf5 08f8c182c2d67bd5176ee8a66b53ee84 +part_00281_of_04320.hdf5 2d5b3a7937e9c4810021ecb0363b475c +part_00282_of_04320.hdf5 d8508004987b6c599f62181f72c33831 +part_00283_of_04320.hdf5 b808f192152fdd64b03040fb6467d5b2 +part_00284_of_04320.hdf5 36a319ed6b2bb0bc39587584c3ee2c10 +part_00285_of_04320.hdf5 ad55122068f3c131ba6258ade37fd833 +part_00286_of_04320.hdf5 000bdc800c121dfb3d876892c3714648 +part_00287_of_04320.hdf5 0dba7615fb3a20b8bc4e31418cdbbbe4 +part_00288_of_04320.hdf5 
3ad3c0f68a0ddc2ebad865ae2bdfa07b +part_00289_of_04320.hdf5 42a79047196407ebcd76a720b4ff5c8d +part_00290_of_04320.hdf5 9cc39b2b357f18e6f2f75bd3ee1b1959 +part_00291_of_04320.hdf5 4737be949b2aad177f855f1ee8278318 +part_00292_of_04320.hdf5 d0d478e67bba5358b96175cebd2cae21 +part_00293_of_04320.hdf5 23de4c6ddd1d1eb04e0948b8278bb44c +part_00294_of_04320.hdf5 ce070a637a1ff764241021d9d0aa5d5f +part_00295_of_04320.hdf5 581ae7f1d7f0730177de5a2815f8ff24 +part_00296_of_04320.hdf5 1ca602871919540a1f795d81ba90e1ce +part_00297_of_04320.hdf5 9cf97a6e3b0c8761dfa6b36f023fd8c1 +part_00298_of_04320.hdf5 8d473529e5ec8cdf8d9344006715a1d9 +part_00299_of_04320.hdf5 9207ea3ed4cf3f95d6eca31e1a8c11af +part_00300_of_04320.hdf5 bfaae7ca3559def77194084bc46d5e7c +part_00301_of_04320.hdf5 ea3a979cc50f2cb715af35e65bdf6798 +part_00302_of_04320.hdf5 368d677ac9fa9ad2455524b51203577d +part_00303_of_04320.hdf5 c4fd1bb037a93f59652f2d2f64d8f2cc +part_00304_of_04320.hdf5 f84039684442a8992c4b760a3249c8fb +part_00305_of_04320.hdf5 33a49d5953016e3f346527f51a328784 +part_00306_of_04320.hdf5 5fdcf0f24a87841b7ff0a1d5f1acf095 +part_00307_of_04320.hdf5 48cf95ea6eb90043031042384501f7ac +part_00308_of_04320.hdf5 1f020beadac895e90598036bc879a256 +part_00309_of_04320.hdf5 c30c3dfd2e6d5f3d278af07634c78fab +part_00310_of_04320.hdf5 f81596a0abf95d89bc2d7563a5397ac6 +part_00311_of_04320.hdf5 ec5dbd0a5f8c749297d3d024797e4e7b +part_00312_of_04320.hdf5 9a2eba9799c599cfb26a9e06d3327e52 +part_00313_of_04320.hdf5 7211fa141c99f6ef7b8bb6c3e727c4c9 +part_00314_of_04320.hdf5 f03fa1a88c6d71adda5b4f418b041ecc +part_00315_of_04320.hdf5 d88a87bb1c594ead9afb386ab4194d80 +part_00316_of_04320.hdf5 75f125fd1a5fa342abfec89b81f6b952 +part_00317_of_04320.hdf5 3a94830a66b83dc7dae054f258c28082 +part_00318_of_04320.hdf5 e76308de9711468d806bc878da5ba70e +part_00319_of_04320.hdf5 e2c6679dadb148a4a3e77a5e4a6da45a +part_00320_of_04320.hdf5 e604a00e6a53fcdd8f950aa379b6e632 +part_00321_of_04320.hdf5 7b8bdbecad24636e26b862ce759abd4d +part_00322_of_04320.hdf5 c476476e1d53fae0372f8b84e6bfced0 +part_00323_of_04320.hdf5 2dbe64e71b310e5b9af14d5ae0216b7a +part_00324_of_04320.hdf5 f30750b4645c92caed12c8b1a4ccd1dd +part_00325_of_04320.hdf5 7b55ffafbb8ca2b8434bff6485d23781 +part_00326_of_04320.hdf5 0284e433a4c16c55a54ee53d52904c20 +part_00327_of_04320.hdf5 be1750ed61ca7674fc73156df9743f52 +part_00328_of_04320.hdf5 c9e3e23f9d348c20b7f7f697ff6f7779 +part_00329_of_04320.hdf5 043199f8e686c79c78bb5054aacdfa60 +part_00330_of_04320.hdf5 110f7e5b38ccbd2e2eaee2dbd93dd463 +part_00331_of_04320.hdf5 399892ac31ce5188a8aa5f18c2d3aa6f +part_00332_of_04320.hdf5 f86e384e53e292f80e7c6578cc864cab +part_00333_of_04320.hdf5 562bb2917463d77aae19fcd7f6a59f89 +part_00334_of_04320.hdf5 b502c0acef28730b956dd16cff48f03e +part_00335_of_04320.hdf5 b91a1a0593ba751004f499469c44afc0 +part_00336_of_04320.hdf5 59466a04c50b3dac333a03944e28397b +part_00337_of_04320.hdf5 9fb48d02b097f048eeb77887a54f730e +part_00338_of_04320.hdf5 2b1d99eb929c78fbbdc052730b0e91c7 +part_00339_of_04320.hdf5 4d93141d49fda0315ae744615c31970d +part_00340_of_04320.hdf5 f4c741e7886c3967b8a4d0aa917837cb +part_00341_of_04320.hdf5 84507eab8d10a3076e47d60bd721cf4d +part_00342_of_04320.hdf5 65e96de3a3880b846bc26c366849e349 +part_00343_of_04320.hdf5 070a0ddf9ae1b9dbee8020e992e3bd09 +part_00344_of_04320.hdf5 5b3e35b7c012e3fc6f11a0ac149ea709 +part_00345_of_04320.hdf5 55617c7c3732d8d6e6261934018e2c5d +part_00346_of_04320.hdf5 3f237ca0ec377b65f15821ef174bdee8 +part_00347_of_04320.hdf5 6a48c7637ef0b3867c4477328f43fab9 +part_00348_of_04320.hdf5 
0442df68fcaf2f840f980590456d702d +part_00349_of_04320.hdf5 2af842ce86f922899876ec7e0f4951d3 +part_00350_of_04320.hdf5 f4a1bb1090f961d721543c1267390e90 +part_00351_of_04320.hdf5 5bac6b26d8427376e6a1cb0fa97b69fb +part_00352_of_04320.hdf5 a64d2ec7e80846f2f10d7beb8b6e038d +part_00353_of_04320.hdf5 6e7a73440670a4181c3fe66dfcaecdd7 +part_00354_of_04320.hdf5 261069f5f55b247936585428767d3bf9 +part_00355_of_04320.hdf5 4225cc74f75372f6c273e9eeb4aa976b +part_00356_of_04320.hdf5 892001e46890bf0af16834a500b2652b +part_00357_of_04320.hdf5 eb41058b7b9040e46900c90029f6b95b +part_00358_of_04320.hdf5 2c698ed50caeeb346480b15fe772ef39 +part_00359_of_04320.hdf5 3a7c55a0d4b52da05d802970810ad145 +part_00360_of_04320.hdf5 eb30ec9bbfbd3b28e48718f17f71e496 +part_00361_of_04320.hdf5 2ea1e9b3eed66d99ada051ef8b138de6 +part_00362_of_04320.hdf5 82814d40b6d1580899310466275d28f8 +part_00363_of_04320.hdf5 b02e857fe3c8472f151cf4db8754ab06 +part_00364_of_04320.hdf5 1d173b4b5cc5103081e5ab09321b3f7a +part_00365_of_04320.hdf5 8ee6a3c1b66c08a2a0f6b55db2ebb6ea +part_00366_of_04320.hdf5 d8c2fac248ffd58d6fe8df529d3755c9 +part_00367_of_04320.hdf5 24e2e4c61d368d234b78717f07340244 +part_00368_of_04320.hdf5 8e55b936846f147f3cbf25163810bef5 +part_00369_of_04320.hdf5 d7e5b3c446abc028e6367d4b4a143f0a +part_00370_of_04320.hdf5 d0f4020f3529224cb9c3ce21b55eb96f +part_00371_of_04320.hdf5 18f7d6f5384fc97b83711c53fb877e5d +part_00372_of_04320.hdf5 aa50994a34506abb7e85af9edf0a73c6 +part_00373_of_04320.hdf5 4f004b9e17958de29995663d340dee46 +part_00374_of_04320.hdf5 f3d52f60f90278157b62da7c3d99681b +part_00375_of_04320.hdf5 18a254fdb20d7070acad394eb21aa742 +part_00376_of_04320.hdf5 6a1f9228a7a7732da63dffe42ecff612 +part_00377_of_04320.hdf5 40fef5eff5a3a4761f7c590edf82bcc9 +part_00378_of_04320.hdf5 dd49596b2b11d4bd52373dcd5cda6966 +part_00379_of_04320.hdf5 55ef262d5cd5688e0ffd2c870d0b4380 +part_00380_of_04320.hdf5 5347ca8485f9d6786700dd1112006f22 +part_00381_of_04320.hdf5 f7d792e26f27b4418a52c1699ca94416 +part_00382_of_04320.hdf5 9ed88907da41a1fe4734c474ea2d5057 +part_00383_of_04320.hdf5 5d6325dfb88df016740c8ac5c3842447 +part_00384_of_04320.hdf5 929767100e82231de95db2c07f033537 +part_00385_of_04320.hdf5 ae7a38f16f231f43a9195d13265f6c69 +part_00386_of_04320.hdf5 422dc65db63a99a3e0bd9c4e68745014 +part_00387_of_04320.hdf5 6d2c7394bb8986c4c751416b170db4d8 +part_00388_of_04320.hdf5 f6e9502410c26bca48064398c8fceba2 +part_00389_of_04320.hdf5 143c74bce6751367878daf9ad98a8ccd +part_00390_of_04320.hdf5 8ad4ba9b3049e8827f60c8e2e258b1f7 +part_00391_of_04320.hdf5 646452858499055424eee16714280d85 +part_00392_of_04320.hdf5 1c58b110b5483b875a1d835feb532bd6 +part_00393_of_04320.hdf5 3f96b30449248ca46acf61f5835b3cad +part_00394_of_04320.hdf5 2b65bea38b31e06700168391f8c1ead3 +part_00395_of_04320.hdf5 7df2e632d56094718304111fd7a235a9 +part_00396_of_04320.hdf5 63f84be70b423d4ec74f8ea10ed7dfa0 +part_00397_of_04320.hdf5 ef01bcff48b4602d4ab7714c9d018bcd +part_00398_of_04320.hdf5 917b7728877fef1bab9c3039473ed657 +part_00399_of_04320.hdf5 a15566defc56b56808c77f46a9e66151 +part_00400_of_04320.hdf5 291c7927314991f45e4de8087f91309c +part_00401_of_04320.hdf5 cb46867032c6788cc69c1e7e1d94dd0d +part_00402_of_04320.hdf5 be296c4ec316a0596bbb6da2c7780059 +part_00403_of_04320.hdf5 99fada3e5b07e340b90588fc0dc4e082 +part_00404_of_04320.hdf5 d34f04e858b7785230a3e2ebea74b99a +part_00405_of_04320.hdf5 656f5f547dce92d6c5c9b80cb485de6a +part_00406_of_04320.hdf5 523c04016d8259104bac5c186d4ad96c +part_00407_of_04320.hdf5 01df8f4cb29e3f2aa65dee86d694737c +part_00408_of_04320.hdf5 
95f464d7e1c4276986c946cfa0a75348 +part_00409_of_04320.hdf5 8d3cd83d3054635d57ceea4710011bc0 +part_00410_of_04320.hdf5 f4220cb41acf5ea93fb67a9554a9495e +part_00411_of_04320.hdf5 4e314ca1cb5ec7ac46c53de93dc3ebe4 +part_00412_of_04320.hdf5 a2521ba6554cc2047c6bd0e648970db5 +part_00413_of_04320.hdf5 e1d3cfc16216387c1cd32c7c90e4fbf6 +part_00414_of_04320.hdf5 8209a3f97737eb907724cfe0d5caff6e +part_00415_of_04320.hdf5 4efa1ca32d97a6e72b2aa124787a845f +part_00416_of_04320.hdf5 52898dd2811c48c427547745f2b49b80 +part_00417_of_04320.hdf5 699fa959d6ddbaa3754e9d5e2bab6928 +part_00418_of_04320.hdf5 e059a1151a035febd3c4748fac5fd9e7 +part_00419_of_04320.hdf5 6eacea8200da3869be9a1ad1f61c1b4d +part_00420_of_04320.hdf5 3bfe8798980c374d416d6aaf8499e535 +part_00421_of_04320.hdf5 8e7b4746301340de114b293d2bd18009 +part_00422_of_04320.hdf5 77e826d25243f44e6100c2452226bdc6 +part_00423_of_04320.hdf5 0593178e2a6033e618495b6d0cc9869d +part_00424_of_04320.hdf5 e779310eb494423e84a9cc5eed6fb090 +part_00425_of_04320.hdf5 c0c421634f466431f8c0c86f450a4ad1 +part_00426_of_04320.hdf5 e052afdfeb3628c6e84056157a1c75de +part_00427_of_04320.hdf5 9fb40669ef4ae65e60001ca6b58aeb2f +part_00428_of_04320.hdf5 7d6d9399cb135c65766f8d3068ee5684 +part_00429_of_04320.hdf5 ff966d85af1fb15bd32e0763d94ccce6 +part_00430_of_04320.hdf5 533c970e305becf1343c2dfb97bf4a89 +part_00431_of_04320.hdf5 cf2b601cd09cd1c1baafac31a04ebc8a +part_00432_of_04320.hdf5 81412d89dc5e2287bd8664e6ae8814b2 +part_00433_of_04320.hdf5 33f1b48aa0d348730ef1454441420e2f +part_00434_of_04320.hdf5 7c9dfaa6c83b5b98b06977b3cb2fbd04 +part_00435_of_04320.hdf5 3bac1017246e68941958e8205440bd53 +part_00436_of_04320.hdf5 a02ae931e1c5821041e02ac091e2493b +part_00437_of_04320.hdf5 01644549273ba15baca114e8ef263249 +part_00438_of_04320.hdf5 eaa51fba84874372ddf13e80d59fb8b0 +part_00439_of_04320.hdf5 64a3604f847b272b2554da9296400346 +part_00440_of_04320.hdf5 4925c08962d3e8eea6d47c6da42cc6a4 +part_00441_of_04320.hdf5 d38ddd5dce8df81d676bd91daca0b505 +part_00442_of_04320.hdf5 5418bd00d194aca1cd404d08fb6a596c +part_00443_of_04320.hdf5 3f86b611bd6ef5ec83a79e7134e475d6 +part_00444_of_04320.hdf5 20a15dad60b490f4480ec67e3c96a424 +part_00445_of_04320.hdf5 93b3b1d0f2e00063e3fbba7688ecb15d +part_00446_of_04320.hdf5 4605c3b6ec152396274fb2b3f546a595 +part_00447_of_04320.hdf5 f83ee1cac5029afe7263c710c894ee2a +part_00448_of_04320.hdf5 07a90fd533e36ef97a63f58cf56bcb47 +part_00449_of_04320.hdf5 b5bde1e2caa09202c760de516a509524 +part_00450_of_04320.hdf5 14eeccaf2983ed1249d638aa4a6f14a3 +part_00451_of_04320.hdf5 fc46802cb55bf8c1f2b2a8b8871db29b +part_00452_of_04320.hdf5 88cd589efbc9fdccd11ddafd3f6abb18 +part_00453_of_04320.hdf5 e54188f54ead90db09118eaed3a576f3 +part_00454_of_04320.hdf5 48769ef8944821edd5efed59f8d37830 +part_00455_of_04320.hdf5 2f035634ea11a52fa3b36c3da00a159f +part_00456_of_04320.hdf5 38817530f1f58c2d64d84139ae3f2b5f +part_00457_of_04320.hdf5 b2c81d2c593ac9d142979e49ca21d06b +part_00458_of_04320.hdf5 cbf448e16e6690f96d46b58fd334e2fd +part_00459_of_04320.hdf5 d3ea7635b7ec74a2701d85fd3077202b +part_00460_of_04320.hdf5 82d2e482be526144dd5fbb8f09488f20 +part_00461_of_04320.hdf5 c491fa230a516418b8fcf84804e25cd5 +part_00462_of_04320.hdf5 b19664f2f737cdd756b9614967d57f40 +part_00463_of_04320.hdf5 792af45a93393c18da5361480d75bb87 +part_00464_of_04320.hdf5 d535c405f5883ab1cc2aed1ac564303d +part_00465_of_04320.hdf5 4668069988f54b92d92a004c6b7c42c9 +part_00466_of_04320.hdf5 b64fa60751d3858d157c5332399090f3 +part_00467_of_04320.hdf5 9d534979c5653c8491e702be5ef7fa2d +part_00468_of_04320.hdf5 
38971a3bfd96a6bb5e55817cad24a757 +part_00469_of_04320.hdf5 6e6fbcae502710b1095adac3a8b138a3 +part_00470_of_04320.hdf5 896f6d35953ecec7fcfe43103e929e8b +part_00471_of_04320.hdf5 357d7ba738d31b39c4be35c1872a4003 +part_00472_of_04320.hdf5 b6a9926f54c6d80474dd7a9145dd26d4 +part_00473_of_04320.hdf5 c562fed765a6db7ebff4fb7c5b3c1c05 +part_00474_of_04320.hdf5 8a5c5e26dc6b2ededaab88cf6a638cd0 +part_00475_of_04320.hdf5 d899b3716657968cf398acaf74ae3997 +part_00476_of_04320.hdf5 7b193c7e06b462ae02870ec92dbdc153 +part_00477_of_04320.hdf5 29d2633c003d12ea29f688add4b8d278 +part_00478_of_04320.hdf5 70c14c538eea9341aaabcc8e5a2fca1c +part_00479_of_04320.hdf5 0d1ab203e79882d6254c092f5531922b +part_00480_of_04320.hdf5 25e6625de6872e4c919ec10c3ac27467 +part_00481_of_04320.hdf5 f5ffbe387b401355e008e1673a5e4466 +part_00482_of_04320.hdf5 de935495a1d384b3189bcaac1b164736 +part_00483_of_04320.hdf5 eedababda293ba74540e34e5136f01f0 +part_00484_of_04320.hdf5 7895583e6ec4a6337486bfe297287533 +part_00485_of_04320.hdf5 9bef72d393524c378a2d7970f268a7c1 +part_00486_of_04320.hdf5 484da7d2e2b810e9210c91619ce4fd00 +part_00487_of_04320.hdf5 30dd1ada4ab9a79415c1fc8e3b6e5e1e +part_00488_of_04320.hdf5 b12692b5141a25383cf8ae29cccd2f4c +part_00489_of_04320.hdf5 16a1ad181d5ee289f72d0f91eddcb543 +part_00490_of_04320.hdf5 18563d16de463be525358842290b1629 +part_00491_of_04320.hdf5 8a8ac5656b800e35d286689681bd5d4c +part_00492_of_04320.hdf5 7aad8d17635bf8f987ee50de26369f28 +part_00493_of_04320.hdf5 564e69701613ad981f3669d03afa4f48 +part_00494_of_04320.hdf5 1982942c6b128eecf3b1302172dca2da +part_00495_of_04320.hdf5 dc547c214d23c8893b169f81b3e199cb +part_00496_of_04320.hdf5 31ed3ae9799471e1de80c6f98dc5cd14 +part_00497_of_04320.hdf5 c938fa6ca7f793fc40633fb5e932928c +part_00498_of_04320.hdf5 f7020194c4ffcf8524047cdd4dd95727 +part_00499_of_04320.hdf5 55ae9e135577c282863aea51d236ed0a +part_00500_of_04320.hdf5 f9a5f048b4137d74652f8d9f65da13a8 +part_00501_of_04320.hdf5 618cfda42d66b7a3a3a3f2f50d70f869 +part_00502_of_04320.hdf5 e3b355c20c5606fbf3e7132aa43599c9 +part_00503_of_04320.hdf5 903cffc44a776ac10ff3ff09690e5966 +part_00504_of_04320.hdf5 1b7daacc8b4c5196bf2b20114c747cdf +part_00505_of_04320.hdf5 aab161a3f5e9e504f3433dc19fc3c689 +part_00506_of_04320.hdf5 01efb239875d9c93b52b006ea817c997 +part_00507_of_04320.hdf5 f28f506874031473978f515597524b9a +part_00508_of_04320.hdf5 1c8ea833813f7c0cb657ca15886451d6 +part_00509_of_04320.hdf5 f4cdf0079018317db3b72fe69160492d +part_00510_of_04320.hdf5 c5b312abf6466b30514ed4dc80e60ba7 +part_00511_of_04320.hdf5 9c4f3cb25813b553e61bde8b2a5ced54 +part_00512_of_04320.hdf5 31d59784ccf18c2105756cbde0326088 +part_00513_of_04320.hdf5 d9b3d88afa5358f4c3d58166ffecc873 +part_00514_of_04320.hdf5 26490bfd1e6a90341ddd9061c9ec0468 +part_00515_of_04320.hdf5 c3efe1a02e0eac579758711094bf05ec +part_00516_of_04320.hdf5 165b60507054db0b8e94ba9e9314644b +part_00517_of_04320.hdf5 76b74ce821de09ba7e3176ee3aea3738 +part_00518_of_04320.hdf5 c7a86ba292005d81674a271f7dcc32f4 +part_00519_of_04320.hdf5 82e274caf15ff2b515ac6beaa48313cd +part_00520_of_04320.hdf5 520bff78bced2b1d59519839ad672c0e +part_00521_of_04320.hdf5 bde06a59f0ebf27f3d7e5ccb0eb91134 +part_00522_of_04320.hdf5 46a4dbd6fec3fe1c4c4f779db0a4d8b2 +part_00523_of_04320.hdf5 e2be7fd85e1ac05df20ee2c7337dcf67 +part_00524_of_04320.hdf5 b12d83d61c51acce628723b2634ab7f9 +part_00525_of_04320.hdf5 925dd958425dbed4b9e24cee68ee47f8 +part_00526_of_04320.hdf5 3963cf18cd8805ee0f583a0f205f15a9 +part_00527_of_04320.hdf5 a51cb7fb07021824154918b01f97e2f8 +part_00528_of_04320.hdf5 
7a8fb4ae0cb3722c53c8bce63f7e116e +part_00529_of_04320.hdf5 2354917782e605ea18d03305116eaf9e +part_00530_of_04320.hdf5 632799588989b7c0b4519cc76210c48b +part_00531_of_04320.hdf5 f85f3086ed7ffec1c00d9c51be53527e +part_00532_of_04320.hdf5 a58b94fb379845eb97f476dbc99c1c41 +part_00533_of_04320.hdf5 211389b72b92d6462c7e53182227043f +part_00534_of_04320.hdf5 e5cff5057c036e940097b360087b8bd2 +part_00535_of_04320.hdf5 4f6d90151896e7598ccef66e91d8b4d5 +part_00536_of_04320.hdf5 eb85d8d1f84ef88bb8e3669e0e8c26cc +part_00537_of_04320.hdf5 b34a379672055e131f6203f6f650dbde +part_00538_of_04320.hdf5 2b1fae27592eff3da6cf531be956c5c4 +part_00539_of_04320.hdf5 1f97a309c998576a611861598c66ecc3 +part_00540_of_04320.hdf5 310905a93095b292b3b83deb7f835215 +part_00541_of_04320.hdf5 28dfc675dc3c42de0268f39fe8fbf2c1 +part_00542_of_04320.hdf5 5423dfcf4ec2c23a9944433cf400d62c +part_00543_of_04320.hdf5 8ae94450a06c8364f4fb1a8ee2e430a7 +part_00544_of_04320.hdf5 19658095b6d8585a81a311852d8a96c6 +part_00545_of_04320.hdf5 f90f4ce94f747564056752f3b9ed55a2 +part_00546_of_04320.hdf5 5413c53ce043cddea6775c423f111ab3 +part_00547_of_04320.hdf5 aeebbb77695d69257b2916955bbe86b8 +part_00548_of_04320.hdf5 4630d76e245ce6dbb8427ebad14c97dd +part_00549_of_04320.hdf5 b9f6e338949ed66b0bd586fb5eefd9b8 +part_00550_of_04320.hdf5 fb07bc91ccbffb1017c5d70384a32ad2 +part_00551_of_04320.hdf5 3c5f71d6b061453f935250cdbbed9f36 +part_00552_of_04320.hdf5 5f8785c78bf27a31d43788c7ee3b38ca +part_00553_of_04320.hdf5 ad167c28a73f88fb10e9a2bc5e869b27 +part_00554_of_04320.hdf5 0c69788f66906210d06a3d13e6949bf4 +part_00555_of_04320.hdf5 e411e44b15502bf1dca4811748391d1f +part_00556_of_04320.hdf5 43e608aa97bcef7eeba550e8b350e0d5 +part_00557_of_04320.hdf5 f2b03d91dc4ec2c566d72df44421c03e +part_00558_of_04320.hdf5 125533fec3ef05084248b70fca08828b +part_00559_of_04320.hdf5 148873d36a92d2f185daf5964b936f3b +part_00560_of_04320.hdf5 53744df927a679f053c6cf141ec616d8 +part_00561_of_04320.hdf5 84350d882304c2e34d67b2355e11798b +part_00562_of_04320.hdf5 5f328c70cd7cc35ff85da6f851928605 +part_00563_of_04320.hdf5 de9fcaaeaaa00c67f5df269e4ba302f0 +part_00564_of_04320.hdf5 e116073bf23b745526449a86ed542f2a +part_00565_of_04320.hdf5 4fa07d70a5e920b00caa2ccd0bee25ba +part_00566_of_04320.hdf5 ace370c93c22a4e3bd5c633d08af6d30 +part_00567_of_04320.hdf5 ce27f147963284337f969bd5fda99d39 +part_00568_of_04320.hdf5 82e7e452dec0928a688696facb910c9a +part_00569_of_04320.hdf5 df8cd7262af54b758ead7d06a701e1ec +part_00570_of_04320.hdf5 2614d41ecce328f3cd0ee6f4c1d1e06d +part_00571_of_04320.hdf5 fe543255c6077756ba243e3d1fb03b91 +part_00572_of_04320.hdf5 6a011834dd76c8b50c50cacfff1f2535 +part_00573_of_04320.hdf5 1826a5d25e3d2e3becaa6a059e2b988b +part_00574_of_04320.hdf5 416357c1e71a444b455c38b50a6c8fff +part_00575_of_04320.hdf5 6a3bf927f34cb768876455c1a3e62455 +part_00576_of_04320.hdf5 f4c3fc23588db434c3a772b182e04531 +part_00577_of_04320.hdf5 dce30500f85c7a21c7c9ddf56e1d6d1b +part_00578_of_04320.hdf5 fc77b4fe99ab2b55d61ba25ed0a80fa8 +part_00579_of_04320.hdf5 448ee2eedd68584e88fed32788ea80dc +part_00580_of_04320.hdf5 55ee484d42db8496e09d2a27307bf94b +part_00581_of_04320.hdf5 f80e93f7e19272137d590cb0c95a4b47 +part_00582_of_04320.hdf5 84a7960e62e620c4a6fe845be999a11e +part_00583_of_04320.hdf5 20f630e9b870cb4028921b2be980c2c3 +part_00584_of_04320.hdf5 46126bb5e4da323a3a17ea3836b98bc7 +part_00585_of_04320.hdf5 44a63b23164db8f2ed866727c9046f74 +part_00586_of_04320.hdf5 fb14a71c3045b66799c4949107932114 +part_00587_of_04320.hdf5 30d563f839a2264414c1e2eadf7e5cb8 +part_00588_of_04320.hdf5 
7fe4f6e1232f7f5ec3365511987e86a8 +part_00589_of_04320.hdf5 24da743ffb17fb35f9144231872ec3b7 +part_00590_of_04320.hdf5 5329ffdba7e6a3c0bf0f72dd2ea69938 +part_00591_of_04320.hdf5 49c0fd58da4a3dc42185aa8dfcd1531b +part_00592_of_04320.hdf5 2fb37cd3083326ec5b91bbbcad144326 +part_00593_of_04320.hdf5 af6f34c9c0d229b069bdcdff3951e095 +part_00594_of_04320.hdf5 5635e23cc86512717058609a2e5b9855 +part_00595_of_04320.hdf5 78a2addf19d3afdbcead567b7b467772 +part_00596_of_04320.hdf5 2ca23afa2e9b3f4faa6a6b8e9f7f4206 +part_00597_of_04320.hdf5 d19d002f459d8b9ae63c1172de3f523e +part_00598_of_04320.hdf5 2e7cac10e690094ea59305ec3d9cff28 +part_00599_of_04320.hdf5 1f13b22755eed37660e0e467523b3816 +part_00600_of_04320.hdf5 41dd79b357307ce88a30fb7dde9c3c1c +part_00601_of_04320.hdf5 aa467a0a923a875950f0fc8737c7e63d +part_00602_of_04320.hdf5 2f322e696a1f572cac10c9045c5353c1 +part_00603_of_04320.hdf5 e913f2e9088360220923e9b685f6a0c2 +part_00604_of_04320.hdf5 01409100f531994785925703e291a18b +part_00605_of_04320.hdf5 7c305c932cdabb1b4ca8f52ff7e0e71a +part_00606_of_04320.hdf5 243b6109db9a90ba8958d7a2c3240c60 +part_00607_of_04320.hdf5 171e08b9fe53fbe19333229111bc3758 +part_00608_of_04320.hdf5 9e66808579dd8c7a5cdf49b103968b30 +part_00609_of_04320.hdf5 99aaec057be4990c1a5d06f42203b246 +part_00610_of_04320.hdf5 e15686185f163429618e70d0c30be969 +part_00611_of_04320.hdf5 39e545852d852c1861f1908965e33b93 +part_00612_of_04320.hdf5 ab12581f10cb86a3dd999685bfaccbe7 +part_00613_of_04320.hdf5 d087510c329ccd47a14357db806c85b4 +part_00614_of_04320.hdf5 e4883031e34ca4cdb96a8df51c5105b1 +part_00615_of_04320.hdf5 a8ae12b486f8187167130c286b5f8ff3 +part_00616_of_04320.hdf5 65bd7c3e1457f890bcb1b24c62637f2f +part_00617_of_04320.hdf5 c7798d24ddd1afc45edeeac95b5c133f +part_00618_of_04320.hdf5 48551973d0ef8495035176b751a16cee +part_00619_of_04320.hdf5 4b273ab16fafb0599ca3b97c9684b203 +part_00620_of_04320.hdf5 a5f0485f6a340b10402759acdecc0005 +part_00621_of_04320.hdf5 e9ac6497a15e0774c6ffff3fb249ab68 +part_00622_of_04320.hdf5 f215cfca00fcdc9d7088a9b9066cf08c +part_00623_of_04320.hdf5 75824afd48a3fed694c84a8e5f721549 +part_00624_of_04320.hdf5 20e5aa63db38706312a60ec72c42a86d +part_00625_of_04320.hdf5 fb3734785f54dc8238b5786f4dc2c5c3 +part_00626_of_04320.hdf5 bcfb7cd12e2c4c0e7988d8539c8b8b25 +part_00627_of_04320.hdf5 655c386ba947e40abd8c68c31511c678 +part_00628_of_04320.hdf5 d0a5c5ef52c46009c3e3475765cfb50c +part_00629_of_04320.hdf5 d5b3dcce264e9579fb4d8d24bfa03d62 +part_00630_of_04320.hdf5 d19d2faadd7d4cd18a9be816d087dba0 +part_00631_of_04320.hdf5 335a069660f72489d62e3bdc06521bb0 +part_00632_of_04320.hdf5 2d054fa82ce48036e381bbab9e101840 +part_00633_of_04320.hdf5 2e06cdafb47b1a6afb93472b0c7fd35f +part_00634_of_04320.hdf5 c0d02be495050bc59e79d524e9f9e7d2 +part_00635_of_04320.hdf5 4b9ce29bf844039f96aa56a4820e43d8 +part_00636_of_04320.hdf5 4a0fe20dab2896f81c2daed00e675e97 +part_00637_of_04320.hdf5 44d05821ea70497ced08f602fdae92e1 +part_00638_of_04320.hdf5 a1dffdb2e51726aca321f43608c9088f +part_00639_of_04320.hdf5 71222cfaf0cb43719d351372e224f8a2 +part_00640_of_04320.hdf5 c1862d525ac08971b0a4c04f6b45dd0b +part_00641_of_04320.hdf5 03a2bf6f0047b7cdf656375c870b2799 +part_00642_of_04320.hdf5 95a4a839927a7c665585701f050e92a1 +part_00643_of_04320.hdf5 be0b35410d2aa20a7912b1a3d36fc1a2 +part_00644_of_04320.hdf5 c46eacd49d0b14001720150d5aa8d6ea +part_00645_of_04320.hdf5 c7ca0eb72b963c37444158fc21902ff0 +part_00646_of_04320.hdf5 85f2de67bd5789e571284380348b571c +part_00647_of_04320.hdf5 0aa5ff1b6ab6ac2f8b22630572702f8e +part_00648_of_04320.hdf5 
44bdc1b202cbce7e5e50907c0f749086 +part_00649_of_04320.hdf5 26cf25c35288bc47a095175c1476ce16 +part_00650_of_04320.hdf5 6de6a809093ad2eb5b943812e5c05d79 +part_00651_of_04320.hdf5 a86f1fe4b18b3485de6bf212de101cdc +part_00652_of_04320.hdf5 dd5682002cfeeb459ae3af071c0faef4 +part_00653_of_04320.hdf5 cba96c50212ba0f86453f8b4ad6d2330 +part_00654_of_04320.hdf5 d663e32275e017638c6dcefb3f7c18ce +part_00655_of_04320.hdf5 d01dae078d4ed5c2620f7ffb364ef47f +part_00656_of_04320.hdf5 27b47eaf71604beec118093a85ec35ff +part_00657_of_04320.hdf5 88a6fcc7cd6fa0a834d2e69b27f6ce58 +part_00658_of_04320.hdf5 eabdfa71f98dffe5ca6aa1fb4b051052 +part_00659_of_04320.hdf5 b05a8ee7fcde527257b42adb03d3f426 +part_00660_of_04320.hdf5 4d0fd4c77e4aec9cb874d1e019898638 +part_00661_of_04320.hdf5 5c70713cf555b8d1686b179d5ccd41a8 +part_00662_of_04320.hdf5 959ed9c4956752262200d2d4e0c206f0 +part_00663_of_04320.hdf5 a6fcce60ce313ceea5a751c26c0978be +part_00664_of_04320.hdf5 d29dc05a69ba54a750ca976a89f7af4e +part_00665_of_04320.hdf5 9eb02e76ca92f819354e7b935ac91654 +part_00666_of_04320.hdf5 2e2ffe1c13ff2a11dd772a5812894258 +part_00667_of_04320.hdf5 92d68d43c2d54e08ff7eafa0ce04c1e2 +part_00668_of_04320.hdf5 eb9b4f6379f8ed3cc4c33ff20b7e35ea +part_00669_of_04320.hdf5 04a62d4529c2e6f5b2da977bb1a3a4a8 +part_00670_of_04320.hdf5 4988b6595b6c966b2003ae10d493f28a +part_00671_of_04320.hdf5 b727a047061297981af826dff25e0e3d +part_00672_of_04320.hdf5 adebb29a8bdc474d7b6aaba812b59414 +part_00673_of_04320.hdf5 ba4a7a1e09d9f889d697096cf97aa5a0 +part_00674_of_04320.hdf5 23bc50d6686365c438f8f1c015206bae +part_00675_of_04320.hdf5 9baefe36e00bd238b57c702d6dfd98a7 +part_00676_of_04320.hdf5 d2ca06bcd4ad5ff72089e7eaf167ee6c +part_00677_of_04320.hdf5 bb3b6ea8006687cd3948d0fe141829d0 +part_00678_of_04320.hdf5 7f7d55e0f513b3984fbf556a1ba80a62 +part_00679_of_04320.hdf5 b72a0b0694ca5b93e2388f71813d0042 +part_00680_of_04320.hdf5 9607b72d26cdef2d4101699ef946ff2a +part_00681_of_04320.hdf5 6899870f76783a6a550f371d7586bdc8 +part_00682_of_04320.hdf5 2015aba041a67744dc01abec64099f6b +part_00683_of_04320.hdf5 af24eed7d545eb52dc6d87a7a8e44275 +part_00684_of_04320.hdf5 33705cb6474d694174b06d98a5ccc05b +part_00685_of_04320.hdf5 f53d882ce68fab2c526bbe2b6508d520 +part_00686_of_04320.hdf5 3565c370ee16357fd534897d2a7b4bf4 +part_00687_of_04320.hdf5 f88ac6cd7e534b5d386249c43bff873e +part_00688_of_04320.hdf5 659a58cd7f2d6fa880790ee2b116945e +part_00689_of_04320.hdf5 ff020db883b7525c370ee4c3ce706e74 +part_00690_of_04320.hdf5 6c5a3ec1f1c116f87e61bfa7d3b80c79 +part_00691_of_04320.hdf5 3109da1daede20a9bba509a70954b455 +part_00692_of_04320.hdf5 58d146f86bba9e1fd4978c45e9cdc0a7 +part_00693_of_04320.hdf5 2bf27172e3256a0846da207afe6a8a13 +part_00694_of_04320.hdf5 68999cfa1864c431caa7a2c83ce1a05f +part_00695_of_04320.hdf5 d31199adc005359730fe6b0303132756 +part_00696_of_04320.hdf5 11f3ca91b5ba8f35d0dce64e02f88787 +part_00697_of_04320.hdf5 89a564558550cdf2f6e6be0aa2c0bab7 +part_00698_of_04320.hdf5 914a31f51063c6ccfe5f0ee7c48e0242 +part_00699_of_04320.hdf5 ed0eddd98874db911dfd1e43e7515eaf +part_00700_of_04320.hdf5 db90b9c2bd07c55d3b8ade5edc216050 +part_00701_of_04320.hdf5 44621a76334746cb9073e1513dd898b5 +part_00702_of_04320.hdf5 0f15493fa330cbe5db838cd11faf5c51 +part_00703_of_04320.hdf5 d526b86c7c7989d75202e61afa6e5f76 +part_00704_of_04320.hdf5 1acc686cc0ed8fcaee19fc29b47b7e94 +part_00705_of_04320.hdf5 bf65ec19969d94ef04a1fd0c06e6e038 +part_00706_of_04320.hdf5 4b44a217263975accd19dce6e0370bdb +part_00707_of_04320.hdf5 589c981426d250ceefb25abe3479af1d +part_00708_of_04320.hdf5 
32983bac8d9be7bf9857b52e1c5d5072 +part_00709_of_04320.hdf5 4addde5d1aa454ecf71a9b6e7abb587e +part_00710_of_04320.hdf5 2594379570269c096652b87ee3244c54 +part_00711_of_04320.hdf5 1044a1091dadd41452a09c433615897b +part_00712_of_04320.hdf5 54934d8f39c22d87daa9b0e30d368169 +part_00713_of_04320.hdf5 95b28839dd9f53450392fcfdfef53cc5 +part_00714_of_04320.hdf5 de151dce0c83aca65e6cb74a6ba775f1 +part_00715_of_04320.hdf5 9551543dc53bf321267e6d7d4d354236 +part_00716_of_04320.hdf5 c4a2af844c5896ee43329fc9fdcceba6 +part_00717_of_04320.hdf5 f09a73eab148448560cd3f73519e4bd2 +part_00718_of_04320.hdf5 507613c21a07d5ca59f5ad940a8c0b56 +part_00719_of_04320.hdf5 803745c8542e4985465fe784f18bb859 +part_00720_of_04320.hdf5 f4aae55b18944bbd9bc44128fa1bdfbf +part_00721_of_04320.hdf5 8093771aaefce57f0fbb3dac428f4954 +part_00722_of_04320.hdf5 e42055fab5b8fda897b7c1f81c8c4a5a +part_00723_of_04320.hdf5 3a82ad22a2581b9ec85da7b1ed68bd67 +part_00724_of_04320.hdf5 3f62cfc91f1b0c60bff24a4783cff771 +part_00725_of_04320.hdf5 b3ed928083079dab9c1d03f919e09893 +part_00726_of_04320.hdf5 bbbdcf1199a4b95d808ee3be07c09b3f +part_00727_of_04320.hdf5 cd56f0d000b060ec84fcefe87b484617 +part_00728_of_04320.hdf5 95cee8847f041c267461adb067b91330 +part_00729_of_04320.hdf5 3d8c04ae067afe9b15f00f9f70aed2ee +part_00730_of_04320.hdf5 45a62b6f6cc6ccdff784614c7f9c70fb +part_00731_of_04320.hdf5 69dd89cf6b2bdcfb46eb56a92f41a82d +part_00732_of_04320.hdf5 27dfce6704e52ce250fd93dfe5e5d34e +part_00733_of_04320.hdf5 70a16d0c23d060893e85398e773bee62 +part_00734_of_04320.hdf5 ab04565b16d7a117643fcd936a803cdb +part_00735_of_04320.hdf5 247b0617278dc3d3929b284aed2c3033 +part_00736_of_04320.hdf5 c684bfa74b94eba79b3530fdbb298bf2 +part_00737_of_04320.hdf5 6c273c8b5a5e297151d3a86684d17aad +part_00738_of_04320.hdf5 fa979a1e4e518585f8cd68b7972217fc +part_00739_of_04320.hdf5 5510d1248ae0dd843f7ed4cd28775861 +part_00740_of_04320.hdf5 feaa381d99a2e86825a7af15a55efd02 +part_00741_of_04320.hdf5 dca85fc13bd4fdb7e64250f94c6aaa42 +part_00742_of_04320.hdf5 092b103dc1c83ee695f4d31df0b51860 +part_00743_of_04320.hdf5 9e6546de86ac23f56912e28e50a648b2 +part_00744_of_04320.hdf5 de7b9c23fd1eefdb380de11f42bd61b2 +part_00745_of_04320.hdf5 2c91e5c63026fb3806dc9419d10ce061 +part_00746_of_04320.hdf5 57f620f0d052de351143dc55cd180bee +part_00747_of_04320.hdf5 0ac0c5e6532823b074bc12469fda8c4a +part_00748_of_04320.hdf5 59ae525d21cdb4479fb640e4bb659000 +part_00749_of_04320.hdf5 40c9bcef6f56a0d5ec8a0f6576850106 +part_00750_of_04320.hdf5 335de89f2a6a6c9894d5111214cfc6bc +part_00751_of_04320.hdf5 1b3242cc6f1eac719fa1a270b2b17b84 +part_00752_of_04320.hdf5 e6229f4a02d4bbe4d2aa6f6754cf8b50 +part_00753_of_04320.hdf5 337f24ff427170daeca7bd1c55f8795f +part_00754_of_04320.hdf5 06bb4d581143cf5d01dfc2bc36b78850 +part_00755_of_04320.hdf5 b53877c206f82e66dc0a340512a6112b +part_00756_of_04320.hdf5 745c09ef9ef79e2a236860b2c1ca6ec9 +part_00757_of_04320.hdf5 805b703fcba02df8c072ad3ef3e594e8 +part_00758_of_04320.hdf5 47e001f9ac6a8d91eae07c09b630f798 +part_00759_of_04320.hdf5 6bcc35101d03e450c28e2581e7242b27 +part_00760_of_04320.hdf5 f08574aa64b0fc3e4e914da7ca3bbf9a +part_00761_of_04320.hdf5 41af4127bf1e836c0b2249b6bf0105a4 +part_00762_of_04320.hdf5 b33c462b1544bf08fd51c27341731971 +part_00763_of_04320.hdf5 70d12aa6691370fac9fe0804bdf06b98 +part_00764_of_04320.hdf5 116f599a970f58e5aa1a6657838bb01a +part_00765_of_04320.hdf5 0ed938a8a506d25a6f8fb2413b7ab5c6 +part_00766_of_04320.hdf5 64a9cfd2f0f8c1cad8ca4a15af69efe2 +part_00767_of_04320.hdf5 82f67da2c3cdfd43b5cb94c6dfc6bd6d +part_00768_of_04320.hdf5 
99aec697af090419f329eeba4307380e +part_00769_of_04320.hdf5 af6b2f9badda8ee86affd45388dfbaf1 +part_00770_of_04320.hdf5 41b082822196abdf19f476fbaee4ab7a +part_00771_of_04320.hdf5 cf66a82bc40e29e80984353dbd2bc6b6 +part_00772_of_04320.hdf5 7913c7a87739092302d486ed25e90535 +part_00773_of_04320.hdf5 9639dcb1df49692b9c380b24dbef2086 +part_00774_of_04320.hdf5 1199923111bcf137d4e0bbeb74268e45 +part_00775_of_04320.hdf5 185159436d8d2de9aa4923b80fc11495 +part_00776_of_04320.hdf5 a1732767425d09b5304cc6a2c1aee1dc +part_00777_of_04320.hdf5 80864f4d5b063c149bea273a7851cf61 +part_00778_of_04320.hdf5 b4a4fba532241467fbdab78fde015cef +part_00779_of_04320.hdf5 4d3c621971cc0c89b41c6a7099080c67 +part_00780_of_04320.hdf5 addf81f140bc2b549001601f4e0a26da +part_00781_of_04320.hdf5 dd78b8c513b5ef6b8236e2f2058b5360 +part_00782_of_04320.hdf5 fbf90ae2910b67ba27db40241c00fe4a +part_00783_of_04320.hdf5 532e9dfacb860dbc2effb11b5d91242e +part_00784_of_04320.hdf5 fad9419963996014b407e2f1a1e2c401 +part_00785_of_04320.hdf5 f94ace9d7304663c68135fca8437c064 +part_00786_of_04320.hdf5 4f030d79b67a4f4a0f8073c93fcca053 +part_00787_of_04320.hdf5 38cf3af4ef51166f8feb701b45fc78dd +part_00788_of_04320.hdf5 255d7521d337438e6ec0dfa8dc7aa355 +part_00789_of_04320.hdf5 754947f8d6a2167ec57003ba63f48122 +part_00790_of_04320.hdf5 a36f2c57cea4a957df4eaa36c722effa +part_00791_of_04320.hdf5 ff733e6c254c33cf8e1ff0a3e39f0924 +part_00792_of_04320.hdf5 c7789bc42f657f7b2d29471c6784e408 +part_00793_of_04320.hdf5 d72fc67a4a39b5fd807e2b05c8b7d3f3 +part_00794_of_04320.hdf5 a4a704d4c8236adf5d84da22487d996f +part_00795_of_04320.hdf5 fad632c19231cfcdfd8b2a044d3779bc +part_00796_of_04320.hdf5 2ab09f12fe3707113cb9cf9f804ec6e8 +part_00797_of_04320.hdf5 58fd8ec15263ec49065c685be0de4471 +part_00798_of_04320.hdf5 d32ffc5f5af034c3ac19494ab6df5c93 +part_00799_of_04320.hdf5 944c87e7512933cb1684206d685c6fbd +part_00800_of_04320.hdf5 697bc5c252a3561615a2680e81beb127 +part_00801_of_04320.hdf5 d5e5d87cc5794ea03e3a5010b26284f4 +part_00802_of_04320.hdf5 0cd417a685fdc0221e8579f3abd5cf30 +part_00803_of_04320.hdf5 43d163bd576f13065201e93fa2b24761 +part_00804_of_04320.hdf5 0eb42a7ca95bd81a3e9dc93994ab97d7 +part_00805_of_04320.hdf5 0f2eef231868cf75c2f26e8aa40630a4 +part_00806_of_04320.hdf5 0e23b79ee7be1d5c84c93595f339e4e8 +part_00807_of_04320.hdf5 f8d6f54fac02dd1258940e2a5c2f5b4a +part_00808_of_04320.hdf5 809399f8266b3762016015b7fc577f28 +part_00809_of_04320.hdf5 e37be363b6568e515ddae00fb984b610 +part_00810_of_04320.hdf5 eaba436c602a580b5667dbf739228223 +part_00811_of_04320.hdf5 55b8d586cc82371a2d56ea3779d4a7aa +part_00812_of_04320.hdf5 8ae3eb688b93d74f48fc0103505c7277 +part_00813_of_04320.hdf5 61a28da88c1192996ca273947ad005c2 +part_00814_of_04320.hdf5 1e7bee9132ec7740464ddf029de2348e +part_00815_of_04320.hdf5 8e0c758459956fe5612a42304cd158e0 +part_00816_of_04320.hdf5 8cb46b4499158faa94d1d5b2391d13f8 +part_00817_of_04320.hdf5 319d258757681472f60df5ead9896533 +part_00818_of_04320.hdf5 cc983a10727ba9bf9a4a96a507e17400 +part_00819_of_04320.hdf5 33aaa7a990f0bf7aefe5dcd4bddc71e0 +part_00820_of_04320.hdf5 5529c5e914a16ef7b45e86cb532a4eee +part_00821_of_04320.hdf5 cd34c5754950615eb27d6ded5b7f711e +part_00822_of_04320.hdf5 7d47e58339e2c0e395c473a21563b183 +part_00823_of_04320.hdf5 9b3ad986d3fc4572a7d6c959abee36ec +part_00824_of_04320.hdf5 e6e2ca244f035552298e178baf9e6ed3 +part_00825_of_04320.hdf5 f73d01d3da168831bd73cfe1322b1315 +part_00826_of_04320.hdf5 d468f3aa685a36d02d3397823e93bdfc +part_00827_of_04320.hdf5 88b2f643d93ab8417d35fc6d81c1193f +part_00828_of_04320.hdf5 
d07afc929e688e1d1be8bd27fa28fb6d +part_00829_of_04320.hdf5 40dd5aaeb08678f314abcd340693f030 +part_00830_of_04320.hdf5 4d6990547ad6814d36c6eff98ac32e01 +part_00831_of_04320.hdf5 c9d00b97d3b03d9d2c7547d5fd1613e1 +part_00832_of_04320.hdf5 830b547448070a7d10c9a98c0e883c94 +part_00833_of_04320.hdf5 933b603ce1f736c87859f7da284a5859 +part_00834_of_04320.hdf5 f86112eae423d4fdfdee5f17cccad122 +part_00835_of_04320.hdf5 5f93ac00320beb936085df3d33003af5 +part_00836_of_04320.hdf5 539a9f08e878ce9a07d2d26d99074ce0 +part_00837_of_04320.hdf5 b245aca62b7498527228288155ee2c21 +part_00838_of_04320.hdf5 8fb3b842fdcf3755a93227932609d023 +part_00839_of_04320.hdf5 ceb7c8dfd243426e367c2870a031e718 +part_00840_of_04320.hdf5 0d50b9bdfe056e164fa769fcdeae98e3 +part_00841_of_04320.hdf5 3ea3d110d418b66799aa60c004681581 +part_00842_of_04320.hdf5 3c942ea58276616b76f29bf955a71f35 +part_00843_of_04320.hdf5 2d587d87d62b1dc6d1d250b784eba52c +part_00844_of_04320.hdf5 89d98a8e489c54fb902a73a167d6908b +part_00845_of_04320.hdf5 9128230b69b774c8c2373eba88c15fdc +part_00846_of_04320.hdf5 decde8d2c9454c6bc4c3063b52939d0e +part_00847_of_04320.hdf5 19fbb682429da0b88b13ccb234d18d21 +part_00848_of_04320.hdf5 d572618c3d9b0e83e6ec815ed5a4378e +part_00849_of_04320.hdf5 cc7439013cd14863ee0d832e58b0db4e +part_00850_of_04320.hdf5 73efccd65bc897249efb0d2c44c211ea +part_00851_of_04320.hdf5 ca7aa8fb2885e4f78cc624b57c5ea393 +part_00852_of_04320.hdf5 123beac6769e7a054360d6403aa6c519 +part_00853_of_04320.hdf5 18c2f0b9aff8b12fd822a01428de1345 +part_00854_of_04320.hdf5 ca1d3e8b63e1171c72be171b8c06b228 +part_00855_of_04320.hdf5 7d7f64253c4c0b507c1df4a3a2059513 +part_00856_of_04320.hdf5 7ac2020676962e6aca14240586eda2a4 +part_00857_of_04320.hdf5 563d037bf332e24aef9c0df81417b693 +part_00858_of_04320.hdf5 7db41c721bb1d265a863a50020f945a2 +part_00859_of_04320.hdf5 1a3f31b8090f3d29582c9846f78b988c +part_00860_of_04320.hdf5 1e8f87465dac6725bf186af885329888 +part_00861_of_04320.hdf5 c35beac966ba6458055119e6fbc90daf +part_00862_of_04320.hdf5 d2b6a81bc8f310dfd2531f274d2c40f3 +part_00863_of_04320.hdf5 fb70aa3c4d8fd5d6fcd686a468611e67 +part_00864_of_04320.hdf5 ba5565a33f51f207eb360d9c52eaf17b +part_00865_of_04320.hdf5 2bfbaada52b976522986d0045c753ffc +part_00866_of_04320.hdf5 44aa8f11925428d90cf7329adf8d6395 +part_00867_of_04320.hdf5 fd616cc32a942e2fc4060bdcc3a4c339 +part_00868_of_04320.hdf5 e9b8146831a44a229c01a9ca89a700d9 +part_00869_of_04320.hdf5 bc29ee8b303447c61dccfdbad52c5b4c +part_00870_of_04320.hdf5 6f11281fbf38e87a3b200a807a1f8153 +part_00871_of_04320.hdf5 13691b04b5f60f6434e202156a286ef5 +part_00872_of_04320.hdf5 295fa49838d96f4ed93b6063d774cfa7 +part_00873_of_04320.hdf5 ff1413ca094ca25b542f8c388aa2b9ac +part_00874_of_04320.hdf5 639287d199b474c12db70eb12f2446b7 +part_00875_of_04320.hdf5 be4cdfc73f5f898f86eb9d0c0ed71271 +part_00876_of_04320.hdf5 6479f64e72ce5b2684e4baec12e9de6e +part_00877_of_04320.hdf5 9679db69ddc1c1120aee78a4e2efdca6 +part_00878_of_04320.hdf5 9816dfd158260d9dafe00370c0154813 +part_00879_of_04320.hdf5 4ee63960479cc95315f333f582d02a4b +part_00880_of_04320.hdf5 d85dde0a3ec79c3f56158ea0ffe44297 +part_00881_of_04320.hdf5 900ab876ced9ca235c40da1621159b87 +part_00882_of_04320.hdf5 88a0ad5528303e2465c232f4580dce5e +part_00883_of_04320.hdf5 19eb6859e7a3cc2a465696f7433479d2 +part_00884_of_04320.hdf5 559eafa6903ddaa231101b8c1e501733 +part_00885_of_04320.hdf5 83101080b206703883e1d00534addc8f +part_00886_of_04320.hdf5 349b4d173911eeba923a9ebcc8d61f05 +part_00887_of_04320.hdf5 d93d9b70b640818c77db618afe4b1825 +part_00888_of_04320.hdf5 
a0103bf2f2321d37302c6fa59f1b11d6 +part_00889_of_04320.hdf5 1cb035f78f2b68721595c62e5c79ebd7 +part_00890_of_04320.hdf5 df78f7dceff200ae0ea48adf9078327d +part_00891_of_04320.hdf5 5e6192da103fd3df4f66ad75c1859f1c +part_00892_of_04320.hdf5 e7e89df3e4b37d42a3f10cf3f3c8339b +part_00893_of_04320.hdf5 97b95b30885f1307b483b04cde3a7870 +part_00894_of_04320.hdf5 bb24dc8f97cc3581d13d69fc57881ed9 +part_00895_of_04320.hdf5 855b0ba8051727a5928058268c530e42 +part_00896_of_04320.hdf5 09e8dd2c8ea004a0b4e8954b4f0012fd +part_00897_of_04320.hdf5 554c0d6745f96cb5639581357bf57cd9 +part_00898_of_04320.hdf5 a2fa0e671aadda006324f168678e0295 +part_00899_of_04320.hdf5 d9b7c5f28dbfc20c3ef9cd8f8d75ff18 +part_00900_of_04320.hdf5 a3edb0152fe5aefe0bf3be202ee68d27 +part_00901_of_04320.hdf5 d74ec53e571d0a8e2b4cfad454c0bbc9 +part_00902_of_04320.hdf5 d1ea4c2d4bc9ae24088996b371cb7995 +part_00903_of_04320.hdf5 c118c73c97729fd78ee1c8c60f3b9535 +part_00904_of_04320.hdf5 d825ac943d3cb9f2d9bd17aec89b3f91 +part_00905_of_04320.hdf5 acf4fffda118b0fdc8fdff91cda8d7d6 +part_00906_of_04320.hdf5 147501f16d7e2f10d235daf28c223ebb +part_00907_of_04320.hdf5 8378c2a3d4d490500f1af7651db4b4bf +part_00908_of_04320.hdf5 eb0e6e10258d513733100d05172eb7ba +part_00909_of_04320.hdf5 0e79b33191826743b20e192d7516b66d +part_00910_of_04320.hdf5 2f1a664fdd3cdda3115a783a852793aa +part_00911_of_04320.hdf5 27c0a8ab213b6c8f1b6f7ae8fece0f09 +part_00912_of_04320.hdf5 7c7049f39814004167abe5042226fdf1 +part_00913_of_04320.hdf5 ae23b7ce4d3c7e3f61efaa1bd00d6310 +part_00914_of_04320.hdf5 6b57fd5469d49d3016ddc2538c8bbb30 +part_00915_of_04320.hdf5 88ae0ccb93eda7622631a3b470e46185 +part_00916_of_04320.hdf5 1ad6cd3e41c0f80cf808d027e4fde955 +part_00917_of_04320.hdf5 c6ab69cd3c2f8f0749ad88effb773fae +part_00918_of_04320.hdf5 418d65b422880041b629665e6dc8a9cd +part_00919_of_04320.hdf5 d5d43198e3f96438fb064bb2b3e03df0 +part_00920_of_04320.hdf5 f808803cf9917fc8474cf6836f292074 +part_00921_of_04320.hdf5 b0ecbb36f40f9ab173a077583109b605 +part_00922_of_04320.hdf5 46bd5e699249e9847bff045ea4376593 +part_00923_of_04320.hdf5 343e419ee6a8adbca077d00d40f4b36c +part_00924_of_04320.hdf5 926db74c3f999160546b79e2cc9deb83 +part_00925_of_04320.hdf5 f411a88990b497202a0902d6043847d4 +part_00926_of_04320.hdf5 6709c8f52183763733ca5307c0100b41 +part_00927_of_04320.hdf5 999b7704337cfbe1d00292057ab9ab8a +part_00928_of_04320.hdf5 7d7c68d7225d6767cc95190057f910be +part_00929_of_04320.hdf5 6a2dc1305ba0fc974554660ba47bcfe7 +part_00930_of_04320.hdf5 1739b7f094119fefc064d807d6071c03 +part_00931_of_04320.hdf5 92ff162afd464ae51f1d005f4283415c +part_00932_of_04320.hdf5 79a5a07cf2d00ed0d80575925bb9bf75 +part_00933_of_04320.hdf5 0a0e8a4569335f08f997a0738fced927 +part_00934_of_04320.hdf5 0e5883cbae53b3269a6523d6e939ac39 +part_00935_of_04320.hdf5 7b1524f1bc32f27ed4ad7f7fa79e1c5b +part_00936_of_04320.hdf5 8b38ca4734a518f82d3d81d3055b13bd +part_00937_of_04320.hdf5 7ed7ebe9bc1d681651a23d8ddf0bc9fd +part_00938_of_04320.hdf5 c9352ca11e89ef2c2e75e8294b7aa989 +part_00939_of_04320.hdf5 bb1ad5890e53b847d21ebb882532d166 +part_00940_of_04320.hdf5 c8db380c45c4c674edbe1509fb043f7c +part_00941_of_04320.hdf5 35f0c0056dc7c78253a8ca2f84b42c6a +part_00942_of_04320.hdf5 de5d33ede6919ff02cdc14eafbfc84e2 +part_00943_of_04320.hdf5 987c2c92636c43d81a4a52891add3127 +part_00944_of_04320.hdf5 7fcb30c373192b89580476076d8cde47 +part_00945_of_04320.hdf5 0ea360badeed59dce33a9f109e07ab17 +part_00946_of_04320.hdf5 e522785516da1302a84985ff078cf8c9 +part_00947_of_04320.hdf5 8d6bc17dd777bf550f30e10b05ce018a +part_00948_of_04320.hdf5 
dcfa28e5b201fcf3e62b30e834791791 +part_00949_of_04320.hdf5 ebdba0695a9a849a391a2c1e5af8ae1c +part_00950_of_04320.hdf5 6aa7029c67f23410e7879e5d0c84d1f0 +part_00951_of_04320.hdf5 55570c973b6e8670aaf58706930b01d4 +part_00952_of_04320.hdf5 2597951926fc1889978ed49fe5cd595c +part_00953_of_04320.hdf5 a6b03d6273e1e8cdd5979c006a9dbc93 +part_00954_of_04320.hdf5 562be67295c3bc44b92e4ff6903d63a6 +part_00955_of_04320.hdf5 0e3de11ddf158c51125d9553e8d7d337 +part_00956_of_04320.hdf5 8351cf69c8b7e82a1c5982aeebdfe30d +part_00957_of_04320.hdf5 067a8b2321a492bd8464c7759577e135 +part_00958_of_04320.hdf5 47967fedd91a4753e9e84140fef833b5 +part_00959_of_04320.hdf5 d1cf6d8948e38fc10e044a5f460bb151 +part_00960_of_04320.hdf5 36fbb2bd5de97adc90944914c19164a3 +part_00961_of_04320.hdf5 5f6c4a5ce9e2683c3ba8286d105e4748 +part_00962_of_04320.hdf5 1b094b59de5822766ced1fa2b2cbf9c2 +part_00963_of_04320.hdf5 e04c754abe585e988ecd58d2b0922808 +part_00964_of_04320.hdf5 11dd868f4581ce478d3756e475027fde +part_00965_of_04320.hdf5 a44bbbbb5ee1e9d6a3fcec9c5dcee6c9 +part_00966_of_04320.hdf5 f9750d9d75a5634e6becf1679beade53 +part_00967_of_04320.hdf5 85b7aa6792e1b7d4c549e1c0a145eedc +part_00968_of_04320.hdf5 1ec78741228f067ee61660d40851dcbf +part_00969_of_04320.hdf5 d565fdae68df964e6908e7fa5acfccc8 +part_00970_of_04320.hdf5 1e2d4a11e635adfe340be16cfba5813e +part_00971_of_04320.hdf5 ba35f4af7895a55dfb10c161f579bfba +part_00972_of_04320.hdf5 42984e5fdcaead16386b50eefbaacd0b +part_00973_of_04320.hdf5 73f536d56d7e63ab047410139e8d26a6 +part_00974_of_04320.hdf5 8288e606d03ad1b028d7b87d736218fb +part_00975_of_04320.hdf5 f0c2a0f7d216cb24e7bfd940d0dd904e +part_00976_of_04320.hdf5 233adcd6e887ac1ac21a8f7835c570e8 +part_00977_of_04320.hdf5 ff73c44a114545db6ce855571dbd8665 +part_00978_of_04320.hdf5 9bc892248a6ba5b0b8e8ab688b39027e +part_00979_of_04320.hdf5 e742b14ce2bbd9c7687551303bb22af1 +part_00980_of_04320.hdf5 af6659b5a4e6dc8e38f816b55f219c6c +part_00981_of_04320.hdf5 f5d8b67ae66d0dffdb4cc1cbb3e296fc +part_00982_of_04320.hdf5 6cbad0cd67088ea1ce54250a8a41cef4 +part_00983_of_04320.hdf5 f6800747b1619d4ecd91ded13f1c896f +part_00984_of_04320.hdf5 ebf95bab051bcb23d7c8d84ce2d43b8f +part_00985_of_04320.hdf5 cd00ac552dd48c67d4b16e36364dc429 +part_00986_of_04320.hdf5 c6fed79f63d7623e1defa8772ba605e1 +part_00987_of_04320.hdf5 c79d54d15860791329f63191e55da038 +part_00988_of_04320.hdf5 632def306f888f5d0207ae5a0ee053e0 +part_00989_of_04320.hdf5 8b481305f2ce47afc4198a1a582435c3 +part_00990_of_04320.hdf5 e88c9c8b0e1e6afa571abccdb7fb9d12 +part_00991_of_04320.hdf5 5ed8ba2b577fe0fa3b03d03e60b906b6 +part_00992_of_04320.hdf5 c43368a3efb658fba443833607e33829 +part_00993_of_04320.hdf5 ce2b2a5104f490390db3d7addd80c721 +part_00994_of_04320.hdf5 869fe89f038bb7c9b92063f1c331a361 +part_00995_of_04320.hdf5 cb49aca0b02f740f0473b2e2705e9e79 +part_00996_of_04320.hdf5 7f449fd8f460efc3b9aa56185f6703fb +part_00997_of_04320.hdf5 0f4f54f555f65c53784c1895c41693e6 +part_00998_of_04320.hdf5 4f4e9216b54cbb137e13e4dc08028114 +part_00999_of_04320.hdf5 07d34965687e0ffd7b602c2670e7b227 +part_01000_of_04320.hdf5 460827e5d17caa5882d966a3922fbe33 +part_01001_of_04320.hdf5 ea527245b662453e2d7c5df74f28741c +part_01002_of_04320.hdf5 cf8789c135108960ab334f76ea9a18db +part_01003_of_04320.hdf5 96bec2674f25fc99379cd1a692b8d5fd +part_01004_of_04320.hdf5 74ee9279d2a663207b0a439c5a210d45 +part_01005_of_04320.hdf5 d516a76a35a5a329c7d6d201f8d9ed5c +part_01006_of_04320.hdf5 276a716ca47ddb752e1b84e0f7a54099 +part_01007_of_04320.hdf5 e2efeec204685be601b586ec7a336906 +part_01008_of_04320.hdf5 
a48df2ae27a81444cb931a2bd6d1dc20 +part_01009_of_04320.hdf5 0bda09bb08687aeb05925ddfb2bc0745 +part_01010_of_04320.hdf5 4982571a2e3707d1ae47b4012d3aea1d +part_01011_of_04320.hdf5 7c386d1f9394dbd6139ab5cce5bfd800 +part_01012_of_04320.hdf5 7ee1467b3f41941a755aa214441bec14 +part_01013_of_04320.hdf5 f65bf5dcb89cd1d7395fe89db0aa5d53 +part_01014_of_04320.hdf5 fa35e52236ff10dd815dbfd0c7a9add6 +part_01015_of_04320.hdf5 8e5c2ac67c6653e86ac745ee3d078bce +part_01016_of_04320.hdf5 9f8b404fde96053654f44ed8382166f2 +part_01017_of_04320.hdf5 d9005bfd2abd687f8b75838c7571d173 +part_01018_of_04320.hdf5 b52906fd056295594d83500ef44a47f1 +part_01019_of_04320.hdf5 c2920c4db541cb7a35c4dda99afa9961 +part_01020_of_04320.hdf5 f704d7ad068b3590fb694a99bff4ab4a +part_01021_of_04320.hdf5 62e912ce6d9e1699040192207389f996 +part_01022_of_04320.hdf5 241221377669c0b687bed22f48bc575e +part_01023_of_04320.hdf5 7ef62f58c546ffd20144db1a38d75834 +part_01024_of_04320.hdf5 e6f4c50bb2c4d92376d4cf930f795883 +part_01025_of_04320.hdf5 3059cf351918dd07e6b6c4dd4c9033f8 +part_01026_of_04320.hdf5 41d49050f3a086f79734c83e021b21da +part_01027_of_04320.hdf5 ab39979ab8bd6d5b695630e9585bb5af +part_01028_of_04320.hdf5 e4bd51de168d480553e9664513ca7703 +part_01029_of_04320.hdf5 4f116af4bbc9c1553d411ede91746f70 +part_01030_of_04320.hdf5 daf7c581749f90870611ff1d9d2268e3 +part_01031_of_04320.hdf5 5cb5d1e1dab1a68b65bab4c7c544c37e +part_01032_of_04320.hdf5 82b0b2fde79dbcf2a0f71ecd34d8bafd +part_01033_of_04320.hdf5 2f444a3b6a552cd69c0ac65e10473439 +part_01034_of_04320.hdf5 83b9c09ebc89dc657e4d6919f3673b2c +part_01035_of_04320.hdf5 f7fced7f614b743b46501168c101a2a7 +part_01036_of_04320.hdf5 13942b6570e4a3559ad848009fcc7265 +part_01037_of_04320.hdf5 a1ba60cae91c6f28fbc62f1255921243 +part_01038_of_04320.hdf5 9f92190b67e3921a5cb44e5e83b5c338 +part_01039_of_04320.hdf5 961dbf75e61ed90050fd442a68aab3d8 +part_01040_of_04320.hdf5 fb37e4cbb7e2ab7a4d58f43c780253cd +part_01041_of_04320.hdf5 bca49086ba4dae83e763042f7339eb0b +part_01042_of_04320.hdf5 fb8553dca361c5a120c4e7c5bacc0e5e +part_01043_of_04320.hdf5 606fb08b1a86038593f334e0a8984e7b +part_01044_of_04320.hdf5 878bd5aae6597cd77c7297b524715d8e +part_01045_of_04320.hdf5 b51e4b57ce4bad51d444ead8af593c9a +part_01046_of_04320.hdf5 34db83eae28d0f0ceaeb0267c48f9a99 +part_01047_of_04320.hdf5 9d868c2f07c44f3b93041ab4cd023f56 +part_01048_of_04320.hdf5 3c109ee395d67c1180f36b91d2ca37db +part_01049_of_04320.hdf5 3be8ad1c8a115856c6af74a0625175cd +part_01050_of_04320.hdf5 b469277b29eb39b3df3aff496064558c +part_01051_of_04320.hdf5 a68229ee9f4ad522548060e9aed387bc +part_01052_of_04320.hdf5 9dfb44141c7a0798ecb3ba19d4e276b1 +part_01053_of_04320.hdf5 c3bfec974c2f0faf45521cb848549629 +part_01054_of_04320.hdf5 c6a3ee4218cea2d3f7f8fad39947740e +part_01055_of_04320.hdf5 17f4e7a7a83156cb7a4462492f445bb3 +part_01056_of_04320.hdf5 f94f0d72a00c8ca3b38c6c235a700d51 +part_01057_of_04320.hdf5 d09a647e0eed9b621aed0d58d7c4ce4a +part_01058_of_04320.hdf5 1413d7f9328a4e43a2ca2c7022d93947 +part_01059_of_04320.hdf5 1460ca4e5a9dd54f2a3095421fa38860 +part_01060_of_04320.hdf5 cf46f5f1a94b35d2504b75c5449e5699 +part_01061_of_04320.hdf5 e560750171a34d5488730812f64c85b3 +part_01062_of_04320.hdf5 b52034d5f90406bd035889b90172343f +part_01063_of_04320.hdf5 95a687321eaa41d5726e3e5d5b56cbf8 +part_01064_of_04320.hdf5 80a7b22750e0a3fa89f72079c6a4b828 +part_01065_of_04320.hdf5 023fb58e27e4a3b396d439ca46bb2706 +part_01066_of_04320.hdf5 fd5e7c80279e8c1200ab620855beceea +part_01067_of_04320.hdf5 7e993b14ae4a290d84b627a35e0f8b3e +part_01068_of_04320.hdf5 
64b62ff047d2d95c508c5745b69e5bdc +part_01069_of_04320.hdf5 8797ca8b60e2b4ebdf59d094504d1021 +part_01070_of_04320.hdf5 9752fb077f83106627471525b5ecb774 +part_01071_of_04320.hdf5 2a7a3245ae8703362de261e47f588e40 +part_01072_of_04320.hdf5 a135371373e22eb1581cd21b46afe141 +part_01073_of_04320.hdf5 40e20a2fc21c404f9a927ce9149be58e +part_01074_of_04320.hdf5 9edf70f827be22d23dcdcb85668b12fe +part_01075_of_04320.hdf5 4c8a57d917914e60ee7bde85c7b7deda +part_01076_of_04320.hdf5 9b506149c16961cef15207ddb522fccb +part_01077_of_04320.hdf5 6c28d0008c745a0277f4358f475c14d6 +part_01078_of_04320.hdf5 b44384aa983db24c9cf5315116d32f8d +part_01079_of_04320.hdf5 9d219cb12577b539e300bf85c97c39b0 +part_01080_of_04320.hdf5 6ce75a30a1d6d4939d5217c450874f2f +part_01081_of_04320.hdf5 79ae2814531282ade91e59afdefe511a +part_01082_of_04320.hdf5 e854798bb6c101e993e7e732798eff5e +part_01083_of_04320.hdf5 3bc211b17eb8b8509eb0ca33f770c219 +part_01084_of_04320.hdf5 2c83654bf1104a58c621872242f82246 +part_01085_of_04320.hdf5 f723c38be634ad6e5a2c4bd8a3ae7e22 +part_01086_of_04320.hdf5 6a2386759b187d39409d4a33b28f5791 +part_01087_of_04320.hdf5 1ea61306b2e1c86ec9513ca81731c21e +part_01088_of_04320.hdf5 871ca4ee8d420fe6dfe6b74051b305d0 +part_01089_of_04320.hdf5 161bf18832583e6a34e1d44b5d619c8b +part_01090_of_04320.hdf5 e1559f226beb321fc485ebeed9ddecf6 +part_01091_of_04320.hdf5 1e33e627e476e15c73e5d11406b8475b +part_01092_of_04320.hdf5 066e1a5a79562ce4d4a22b662e82c0e3 +part_01093_of_04320.hdf5 a2873e1eb648f1bd50edce8e8f709771 +part_01094_of_04320.hdf5 4bec36059613148107ac1b08036528a6 +part_01095_of_04320.hdf5 0129069064371886c19109fbf13ebcb0 +part_01096_of_04320.hdf5 16fc7880068a309b6f8cee06f8dce6a9 +part_01097_of_04320.hdf5 d2bc7cfad7331fc652665dd93f9b5fa9 +part_01098_of_04320.hdf5 70e51d573370bd242226e5bd19cce6a8 +part_01099_of_04320.hdf5 f8116c743284d7731929a8b04edad50b +part_01100_of_04320.hdf5 13e6a7b802eba443eda5f27787800eb0 +part_01101_of_04320.hdf5 64e14adcedacd022b8fdf146a669b5cf +part_01102_of_04320.hdf5 2bdec2ab70b11f8bf2a8cb62d6ffa957 +part_01103_of_04320.hdf5 bf7af7758ab9ec3ae97cc4df5b86f186 +part_01104_of_04320.hdf5 abf6ff015a522058381d3095503a1909 +part_01105_of_04320.hdf5 5c040b58a3f149be06209fad89d9e35d +part_01106_of_04320.hdf5 663ec7b3e8655587581aa16c4fd2c728 +part_01107_of_04320.hdf5 8706a58892ba8a17c7c5ae7a5faf30e8 +part_01108_of_04320.hdf5 af582bcb0231dbad58693fa4bc7403b3 +part_01109_of_04320.hdf5 00ba7dca8cc22f8e0414758735fbb879 +part_01110_of_04320.hdf5 7d3f719abcd310c8db7fa44b929ce919 +part_01111_of_04320.hdf5 3a5bc499fdec50e89f8c056b7cdfa4b1 +part_01112_of_04320.hdf5 40571dbc6b101cfd4437ff3903309dd9 +part_01113_of_04320.hdf5 8c62ee563c6d96426d453319f475e50a +part_01114_of_04320.hdf5 15cfd6ab7e66645f939950f058014e82 +part_01115_of_04320.hdf5 ff1efa672428f1ed1cacd9b835d3ad69 +part_01116_of_04320.hdf5 ce7c10f80ed1db8ca27bb0e0d998b7d3 +part_01117_of_04320.hdf5 52fef4fa8f16b792aa965fc01bb50621 +part_01118_of_04320.hdf5 6fc019ce8bac48ca649833d34852cc82 +part_01119_of_04320.hdf5 e3c8b026d383ce466f61bee12e13b9dd +part_01120_of_04320.hdf5 f0d67acdf8a9ffd80398975deb6e41f2 +part_01121_of_04320.hdf5 dbcb14aa4e787222cac9393d7360c169 +part_01122_of_04320.hdf5 280c31f0470c3ec17e207da4f0758a71 +part_01123_of_04320.hdf5 0170df810fff2a2fb3b1f7abc18ea3f3 +part_01124_of_04320.hdf5 8da572c1c15a7c6cccbae8b0b03d8904 +part_01125_of_04320.hdf5 e1b0810b91a0e6418381b3ae571147cd +part_01126_of_04320.hdf5 88608cdda66a2b51b4dafed8b3affec8 +part_01127_of_04320.hdf5 cfb79debe800970690ce861e6b4e7092 +part_01128_of_04320.hdf5 
345d2b1cfa1a3a392911f79f3a562059 +part_01129_of_04320.hdf5 3f315b7c27e0f053a71b090dc9b0e18e +part_01130_of_04320.hdf5 389c57946c45e2971af4b9896c6b553d +part_01131_of_04320.hdf5 619f6168c255c428acbf7e60e47d16a2 +part_01132_of_04320.hdf5 1a5d20a90bbe3389262bd1c3facf0bb4 +part_01133_of_04320.hdf5 48d012079e36a6b8534bfe95969c35d9 +part_01134_of_04320.hdf5 fab745f3666aeb65071c99f152bea1fc +part_01135_of_04320.hdf5 33038614ea35ef33ec4f7ca5dbb9f928 +part_01136_of_04320.hdf5 2a3a6d5396df3dab4569eef320ec2c10 +part_01137_of_04320.hdf5 b28744f603b872586a86541ac605103b +part_01138_of_04320.hdf5 49df8ae938d8d81d137df02dad5c870b +part_01139_of_04320.hdf5 e8514416eddac75d44c261bdf7c02443 +part_01140_of_04320.hdf5 52b2c814d6abd99f5890b3c35864dfd0 +part_01141_of_04320.hdf5 438701aea8f11828d7387093e101475b +part_01142_of_04320.hdf5 5881dcc168acfcf3b9bdfbacc2be26c9 +part_01143_of_04320.hdf5 02a2a42c4ee4e61cedb9c5add1e5cac7 +part_01144_of_04320.hdf5 37236907fcd1847933ccd64e414f3795 +part_01145_of_04320.hdf5 a192d0e885bc86a2887e8879c865aa73 +part_01146_of_04320.hdf5 ce6682fdd513b63c90e8eb5e88830fae +part_01147_of_04320.hdf5 8a8e9ee8faf0e9e906d8d96b9f41116e +part_01148_of_04320.hdf5 6ea92025be4e28ba64b5904d33410fab +part_01149_of_04320.hdf5 5173e0fb9fc3715e19a02f261b1de4d8 +part_01150_of_04320.hdf5 baaa919ae93da4185bd49fbab5d3ae5c +part_01151_of_04320.hdf5 a1be43556d251d276909e94708af5c74 +part_01152_of_04320.hdf5 c6fa74e647ccb2fa48f557394360432b +part_01153_of_04320.hdf5 6830d862cf9e67655ce82f053963a72c +part_01154_of_04320.hdf5 588bac2e29df1d917416ff99bf85fc3c +part_01155_of_04320.hdf5 2a88a10f5bf22e67006f3cd67d5094e1 +part_01156_of_04320.hdf5 5e59f00271b750b50e20e6926dabb208 +part_01157_of_04320.hdf5 a76644c1940f430d8d927355eec79dc2 +part_01158_of_04320.hdf5 a595e028b432483d33be7bc8e96a1f0d +part_01159_of_04320.hdf5 79089d6dce500cb290d9707bd45cf6ed +part_01160_of_04320.hdf5 29d13e55435f6a88ebe971a985a4e158 +part_01161_of_04320.hdf5 f30ee88ac525c95bb3c1fafbd9ae3157 +part_01162_of_04320.hdf5 44305750ca4df6714ac445dc84ec94f2 +part_01163_of_04320.hdf5 154d1ddfee9b19611626d89752e6bdd5 +part_01164_of_04320.hdf5 2479c182818084959faf843eacef0af5 +part_01165_of_04320.hdf5 bc0538df3f915764cb46b744b8562664 +part_01166_of_04320.hdf5 3e7870ff41028f7b5533869aac18de2c +part_01167_of_04320.hdf5 99f7729418679d36797cac8671223b5f +part_01168_of_04320.hdf5 4b789037323d6a986af59e7b7c013466 +part_01169_of_04320.hdf5 5e685dd4a22e2ed65d7929a671dfa2f3 +part_01170_of_04320.hdf5 2a328dadd93003539c87e832846f7cb9 +part_01171_of_04320.hdf5 0457b5c9e4efadda98b7a5306126dc60 +part_01172_of_04320.hdf5 f61ea594632319f3ae5b81ef685961c7 +part_01173_of_04320.hdf5 ee8da336b8aab0e001316e46381fd7da +part_01174_of_04320.hdf5 e07ee692a5837fb18e3ddefac057e228 +part_01175_of_04320.hdf5 c94245de5b3f8e5e379f0838cc688e1e +part_01176_of_04320.hdf5 4735243201fe0cb3dc78163788cca193 +part_01177_of_04320.hdf5 23c764b11ea2845011ee8a87bdf1855a +part_01178_of_04320.hdf5 358404d78db4cbc9b1f76e04bec5d473 +part_01179_of_04320.hdf5 fb88350de22c4d4a416284fab02743ae +part_01180_of_04320.hdf5 748e6c44d328f32098a2fe6320a199f2 +part_01181_of_04320.hdf5 25d2ec2ce05fa87a2563007db0bf93c3 +part_01182_of_04320.hdf5 0c5228e38e71083c468d40cf0f41c015 +part_01183_of_04320.hdf5 45a071f99b933b9efb2d7156b260711a +part_01184_of_04320.hdf5 242ca7816dde0474a383f23ffde78519 +part_01185_of_04320.hdf5 3ebb4d0c1a8bbf034248c71118cca1a9 +part_01186_of_04320.hdf5 231103ab764ebda00c2b784b39e6e00f +part_01187_of_04320.hdf5 afd95609d4ae7c8a7181f2d0ee7bdc57 +part_01188_of_04320.hdf5 
98b4ae2e0a641bd863bf3d140f092f1f +part_01189_of_04320.hdf5 5e632f7d102ad706af9b43ef4087d47e +part_01190_of_04320.hdf5 f4f5cf64a88394d38cab5e7d15f9a3d7 +part_01191_of_04320.hdf5 10bb8163b210bfbc9248f89717415ce6 +part_01192_of_04320.hdf5 346c08449b10c43d9f8658bbf3d8c7e2 +part_01193_of_04320.hdf5 a6a29727e3d5cd5fc7f99c3e77a5157c +part_01194_of_04320.hdf5 dc29b9828db085b38cf40ed513672932 +part_01195_of_04320.hdf5 8f7dda7a010a94a294cf93cf89016d6c +part_01196_of_04320.hdf5 09be3baa9b697878d71b839dbfb513fd +part_01197_of_04320.hdf5 8f40b3e2650a59eb5ba282ef543a543f +part_01198_of_04320.hdf5 71640bfb0e7292df0b3395cc2fc8dfbd +part_01199_of_04320.hdf5 4bff89c663c4bdd995ee5c8503371ac8 +part_01200_of_04320.hdf5 35978eedcefbe0e5603f0542cfebfc00 +part_01201_of_04320.hdf5 f1cbcde381008aea43f4b2cf5293af9e +part_01202_of_04320.hdf5 35263c14792f78614e1c7340a1859d9a +part_01203_of_04320.hdf5 7cebf2a64d9d241b03764e0a15be7436 +part_01204_of_04320.hdf5 d622c8ce07f8c0660d06a974fe78f60e +part_01205_of_04320.hdf5 6a9cc3f0f92b7f703bae6f6cb01cdec5 +part_01206_of_04320.hdf5 3f1de6eedbb8e9b179f37d6fc3fcd0f9 +part_01207_of_04320.hdf5 121b5cbcf10054b1b30f2902adf49ad8 +part_01208_of_04320.hdf5 cad3861a46be9bdbdc5ba5515648c3e8 +part_01209_of_04320.hdf5 09663f71df41ae7163c2be495644dfb5 +part_01210_of_04320.hdf5 61e769ee1f2efb25ff2da4e5afc2115a +part_01211_of_04320.hdf5 63c5838f32212024e58ffa7d9402537f +part_01212_of_04320.hdf5 da61b18771ac0ebc1eb3beeab6898339 +part_01213_of_04320.hdf5 29632db44eff98310c3c0ca8fb60a128 +part_01214_of_04320.hdf5 9ad4e4c3c77f16f0ec9020136b92b3da +part_01215_of_04320.hdf5 068265dbb6d8d659c9000ff137e7e05f +part_01216_of_04320.hdf5 6185148e262aa5476c3c43fa60a40fde +part_01217_of_04320.hdf5 cda88a0b516f228929b7454e154ef49c +part_01218_of_04320.hdf5 6bc7a9090fa33adc99408d5dff0f3ff0 +part_01219_of_04320.hdf5 523931623ffd8763b7d8d896de3ada7b +part_01220_of_04320.hdf5 3ca9b673db742d21bf113289db556bc4 +part_01221_of_04320.hdf5 b40778e32fda838483039fa394f9b8ec +part_01222_of_04320.hdf5 c154ca831b48a2c455aba2bfe071882e +part_01223_of_04320.hdf5 e9b3101e2654890ac536905af9eb3a10 +part_01224_of_04320.hdf5 7ac33959202522acd442266fc4113e5b +part_01225_of_04320.hdf5 744a59bf5a71b23ca0e8f43f4f1d6cac +part_01226_of_04320.hdf5 02c1f833379f8d2bf40127029c9547e6 +part_01227_of_04320.hdf5 1b83ab24694855461ecd1b5870adc082 +part_01228_of_04320.hdf5 09350f6fcb48e8cbc94a7658afea43f6 +part_01229_of_04320.hdf5 2b56d953e6effcb71f60be0f06397fb1 +part_01230_of_04320.hdf5 ff9ef793d4682d23fd1884c0b5949b2d +part_01231_of_04320.hdf5 f0432bba11a326c5a22e73823388cdef +part_01232_of_04320.hdf5 91662035df6250fd075934e9b8892b45 +part_01233_of_04320.hdf5 d6dd0adbb66dd8bee27de85f063103b9 +part_01234_of_04320.hdf5 bcee979ee780e895a4fba6864bae3ed8 +part_01235_of_04320.hdf5 664bbda5fc84ae026326bf543c141c55 +part_01236_of_04320.hdf5 396426bf4f22c8e27006dc2459119e16 +part_01237_of_04320.hdf5 42655d5e19c82703c8988bbc2289678b +part_01238_of_04320.hdf5 73d385297009e9db7d60bf37d1d7af28 +part_01239_of_04320.hdf5 a5c5bd47bc342ce624717600bb0fcde6 +part_01240_of_04320.hdf5 e4e5d0e30313c6e7df38569ed1116ab9 +part_01241_of_04320.hdf5 35c1890cc07bdaab34b6d7e7834532fc +part_01242_of_04320.hdf5 1c18eb6edf9a09be8f60983a4d930ed1 +part_01243_of_04320.hdf5 87fe90378c6d618a3e1d06d2baba2a82 +part_01244_of_04320.hdf5 390a8dd9a306fed96420bc6ca2ea2fa5 +part_01245_of_04320.hdf5 433bec421595e0586de4d79b57824112 +part_01246_of_04320.hdf5 575fa2a9d3462ab01e97c0fa2bb7158d +part_01247_of_04320.hdf5 3d7a8fc7ad8cc49e79a0b496c4790b20 +part_01248_of_04320.hdf5 
217c5efba8b609f9cec3dc5fea89dd2e +part_01249_of_04320.hdf5 83270caefcc8a2b258091d0d6aa76b5f +part_01250_of_04320.hdf5 9b7304e6997965ebfee284980d3aaca4 +part_01251_of_04320.hdf5 758241c3929b4b224cebbae8f808264d +part_01252_of_04320.hdf5 2206a67c561bc9f0b59924c72b857657 +part_01253_of_04320.hdf5 ada52cd36efcaec583d6146b0da8232e +part_01254_of_04320.hdf5 a64493aeee165c0a5f94ab69cde86f0d +part_01255_of_04320.hdf5 dc988d207fe8774d03107e920f2048a5 +part_01256_of_04320.hdf5 a6bdbe192aa24bf379dcbedea1c2213b +part_01257_of_04320.hdf5 d4d1011f704febdbb8a2b94f9c8802b4 +part_01258_of_04320.hdf5 c0cbcf047db3a4aee7ec7fe5b2abe643 +part_01259_of_04320.hdf5 7095044e2b6a1bb3c0a77e82b2ab367a +part_01260_of_04320.hdf5 929efc63b4eb115bab008128c92eb169 +part_01261_of_04320.hdf5 5325a3da29dfb306f4ce31e06316dd8d +part_01262_of_04320.hdf5 8a86194c814118d484bcb6b159c90b93 +part_01263_of_04320.hdf5 12bf203c35acd43cb5866fa1d45246e1 +part_01264_of_04320.hdf5 2a1f689ac4230901a8f4188b14e0d4bd +part_01265_of_04320.hdf5 6fe55757b6ba94e44518c22eb5804b88 +part_01266_of_04320.hdf5 5956d1bfa69a927e9e6d77a08352b076 +part_01267_of_04320.hdf5 a49aac7efdfb7c37cfb1540ababb996b +part_01268_of_04320.hdf5 c86fea7ad5790743f2de29c54ba27fde +part_01269_of_04320.hdf5 5e6352e1f8c88ea1730a30602533781b +part_01270_of_04320.hdf5 9ebcfab3bdaac5237a7e8db41a02e00e +part_01271_of_04320.hdf5 b2ce0df75fd10c2c31639aab3fe66ad0 +part_01272_of_04320.hdf5 073b9e7beab1403c5b1a076dadc6ad28 +part_01273_of_04320.hdf5 3de9af1e2e66b1905e6b1067b0c3b7e8 +part_01274_of_04320.hdf5 add64a7b49069b542b5cd9ca0588b7ee +part_01275_of_04320.hdf5 2fab2f1b0baa01a3a2105197c3a06ae0 +part_01276_of_04320.hdf5 55e7373f0627cf1e567c6181fa5b63ea +part_01277_of_04320.hdf5 2251b63f34956b685b13f72475c24948 +part_01278_of_04320.hdf5 383e90f690e2609934046819b7d15169 +part_01279_of_04320.hdf5 cb7f79b28ac785382e33cfd6eab00e47 +part_01280_of_04320.hdf5 b54675ba00e4cc89a151f310557fe81a +part_01281_of_04320.hdf5 a33765619f2e4f98736f459c9bc00e2d +part_01282_of_04320.hdf5 b8475422fa473b1c82c5b84b2ff87a59 +part_01283_of_04320.hdf5 422a55453879ca7c33f49078e89839b6 +part_01284_of_04320.hdf5 4ce451aa4e63046c85cde9281669052d +part_01285_of_04320.hdf5 49fbaa646e178a1526e96a49c9599b84 +part_01286_of_04320.hdf5 758a12b565dcc6b0c124e0d1a665df3f +part_01287_of_04320.hdf5 565edcc394710a327047e323901deb75 +part_01288_of_04320.hdf5 d111bb89f19cbaeb3016aaa70def8daf +part_01289_of_04320.hdf5 bc59e54513cce04cd5e98ef7d7701061 +part_01290_of_04320.hdf5 84f00a41cadf38afcd28f5bd48c24131 +part_01291_of_04320.hdf5 dfa33b6f59e0a3c9d3f02dabd4fa16bd +part_01292_of_04320.hdf5 5c5cbbb528deeb6b4f75d2f497cfd1f4 +part_01293_of_04320.hdf5 c530e311dee678725b749edb1b96c900 +part_01294_of_04320.hdf5 34e92223424d3e469fd33011d6c916a7 +part_01295_of_04320.hdf5 2a57dbe4f593fd8dbc6392c551169fde +part_01296_of_04320.hdf5 35d193f95044202fec90369ba68a74a9 +part_01297_of_04320.hdf5 3c7f5d5b3c093ecb9cf8a0c0566f7f1d +part_01298_of_04320.hdf5 8035eaa8d26ba629d49f760705ff85b5 +part_01299_of_04320.hdf5 f715ea6f67a89f3b4905d9323299a37c +part_01300_of_04320.hdf5 1b7fdf65445b9366056d1ebb3bce872c +part_01301_of_04320.hdf5 4d037f114992ed83e038128ed612fe48 +part_01302_of_04320.hdf5 f5674adfeb199a9b1bbf15f9794def7d +part_01303_of_04320.hdf5 c4c7706d700a1c93da22d76619de8d82 +part_01304_of_04320.hdf5 4cca651ebe9319081443e68033ca8a21 +part_01305_of_04320.hdf5 cd13acdf362a4f1f58f867323bc6ad58 +part_01306_of_04320.hdf5 864e54bcd7f30938c877c9f216976f82 +part_01307_of_04320.hdf5 38110bcf9cfbed231e1eef8c3c979b7a +part_01308_of_04320.hdf5 
9c882875e21506a8252be6c0fabd7747 +part_01309_of_04320.hdf5 0adce4a6fd7e78bb23842eb55bc0b525 +part_01310_of_04320.hdf5 3e2febf16676334fbc6ce1d8cb02a533 +part_01311_of_04320.hdf5 d5def9c26a9750429aa44a75f85e8421 +part_01312_of_04320.hdf5 3443a5f6bdc0d373ed6597d48cb1e008 +part_01313_of_04320.hdf5 4c44b2e2ab03de61e53bec2afab83924 +part_01314_of_04320.hdf5 1612565266a4e1493014a6769a67d2f9 +part_01315_of_04320.hdf5 ecd583015c3ac116d2c6c90d3db0f90a +part_01316_of_04320.hdf5 245b0024e767ac51a7799cb024240199 +part_01317_of_04320.hdf5 282043063c3053f6ac9efd63832c451b +part_01318_of_04320.hdf5 d0f2593ebcda2c3a0bd0559b1591648b +part_01319_of_04320.hdf5 e6ac2ab07db2a6321cef6797769bb70d +part_01320_of_04320.hdf5 55d176ac4a384a998577e3a841bbf8ed +part_01321_of_04320.hdf5 c1c3371aafdd7d5c495dc64d363ae863 +part_01322_of_04320.hdf5 9fa5a54c48b998f8a4a47fd36d5a42e8 +part_01323_of_04320.hdf5 df7c420f33365db9b76c0f5893038bbe +part_01324_of_04320.hdf5 3c82d01203e405db65f40ede30f08f4e +part_01325_of_04320.hdf5 a2f15b2a45e3d1e824d35668f6eeec64 +part_01326_of_04320.hdf5 50e477ffa73285710dcf0e2d60ed27e2 +part_01327_of_04320.hdf5 8654110d1522176832198e9c1500fa63 +part_01328_of_04320.hdf5 3b7371cf15580db653944b599e64df2d +part_01329_of_04320.hdf5 77573fcb7874f39d8c29b4cb3d53f274 +part_01330_of_04320.hdf5 95ac1d03d86b897a5fcefd2dd06cbcec +part_01331_of_04320.hdf5 517af1568426c2755779fdf905488e07 +part_01332_of_04320.hdf5 8c8bf4159142453838313ee13ddccf7a +part_01333_of_04320.hdf5 834823acb5e0108bf8ce607f6f247c65 +part_01334_of_04320.hdf5 0a981cb8583722428ef6fc803aca1d93 +part_01335_of_04320.hdf5 cbdac07c7b9f2b8880563d269eb943f3 +part_01336_of_04320.hdf5 c594c3ec4fe83b62167178d5f72b3c5e +part_01337_of_04320.hdf5 7d25a02af0600555b7e93dbd4055a8f7 +part_01338_of_04320.hdf5 aa8434a0d5927df8e11a04a4e51baed6 +part_01339_of_04320.hdf5 20dd9e781fe3daa65fce1af96f8b705e +part_01340_of_04320.hdf5 a140ffa39f49f32c3fce9fa3c3367a6e +part_01341_of_04320.hdf5 089153cc8aa2f2aa2519b837c7e00441 +part_01342_of_04320.hdf5 3f292cbe3e934c3436f71429ab01d815 +part_01343_of_04320.hdf5 a49031ec75936d3e61ea95d44ca50497 +part_01344_of_04320.hdf5 bfa88282557420a34dba0810d9ab8435 +part_01345_of_04320.hdf5 9a86e6ba4b62641412a65a7b5cb4c744 +part_01346_of_04320.hdf5 9471ce91df4afbedb56ff65da6c972ea +part_01347_of_04320.hdf5 5fce8c869a7f3806cab2e92e12b06c8e +part_01348_of_04320.hdf5 014dcab14b69521a3101236fd84e63ef +part_01349_of_04320.hdf5 ced6cb53b55e69de4a69fedbfd5d4877 +part_01350_of_04320.hdf5 d78da55daf38763342d8ad91a3cffa4e +part_01351_of_04320.hdf5 6e66bebcf7b93740de4c845447370aa5 +part_01352_of_04320.hdf5 113f512b057d1fd54bc9fcf8cbb1631f +part_01353_of_04320.hdf5 caa0520562511fa407518e097fafb143 +part_01354_of_04320.hdf5 c388a57581b200196bc568e0f02d34cb +part_01355_of_04320.hdf5 03b1f4ae61034cea96433e215bca5b66 +part_01356_of_04320.hdf5 1d280597b5bce49082270755df4ca943 +part_01357_of_04320.hdf5 23e16b4bef7b8cfbef812ba03261755e +part_01358_of_04320.hdf5 e8daeacd5bd696378078976dfa954899 +part_01359_of_04320.hdf5 e1d598ce4001dbcfdd3ca3939f1a82ae +part_01360_of_04320.hdf5 32a079b2b929837f6c6bf0edf085a31b +part_01361_of_04320.hdf5 b5aae61aab5a05a4afa8569108ec2031 +part_01362_of_04320.hdf5 08c8b75e942c8b9f95d8816a6050ee8f +part_01363_of_04320.hdf5 4cec2f9aa88479646cb3189036350251 +part_01364_of_04320.hdf5 620cb794c74766d490bd6b6d80eef490 +part_01365_of_04320.hdf5 3c96418ea0a6ba004949f0dd87c54ba6 +part_01366_of_04320.hdf5 51567252148fbb857b4df7edab883eb8 +part_01367_of_04320.hdf5 6621296242325c09bd0f3f2036b33b28 +part_01368_of_04320.hdf5 
968a692754ae86b32938805fdc7bba45 +part_01369_of_04320.hdf5 1d898cb9dc5a6df1ffbc5b4d00e93504 +part_01370_of_04320.hdf5 dc98470b889062411d4ff29cd62fddfe +part_01371_of_04320.hdf5 142a828ed5a1ce8a2ed2560ddd00669f +part_01372_of_04320.hdf5 148ca4c74cb0c446d8df9ce942ceb5ae +part_01373_of_04320.hdf5 1541ff6490cbb5c80506eab6e100e00a +part_01374_of_04320.hdf5 12ca09c47ac75f0393aaf522d4c65269 +part_01375_of_04320.hdf5 e1fd92e887f5bc5de2e8cd3ac7e41be8 +part_01376_of_04320.hdf5 68a822f328f948691da895d5bacf10e7 +part_01377_of_04320.hdf5 6071f24c3148df2a46162df48b98dc4d +part_01378_of_04320.hdf5 3050973776817c0cbaf3d8da944c94b8 +part_01379_of_04320.hdf5 1718299284a9e216f42797ff13436805 +part_01380_of_04320.hdf5 c971d23032be8039e12df692fd41c7e2 +part_01381_of_04320.hdf5 795cafa379fba1d234426693d58f0c08 +part_01382_of_04320.hdf5 1d4e0b72caa67e66203af1d70a5885b6 +part_01383_of_04320.hdf5 7a20ea561152b605d663527e09285a39 +part_01384_of_04320.hdf5 3746e731950fce82e5009a1670092cc7 +part_01385_of_04320.hdf5 d055e9fb8f4a45d0db3cb332aca7adf6 +part_01386_of_04320.hdf5 024fca157e14ed0486e2d766337686ed +part_01387_of_04320.hdf5 58192b81d602439042589c0a4da510ee +part_01388_of_04320.hdf5 dabde2c24b5e934255824c8bec2b70d3 +part_01389_of_04320.hdf5 c62aa6aa9ed848378b09afa19dc307fa +part_01390_of_04320.hdf5 6ca8970c999ca7774ba111a6ee2c195f +part_01391_of_04320.hdf5 4438fe3b092ec8cf0fc1373cd850f562 +part_01392_of_04320.hdf5 2697e93cc286102b2180808e15289159 +part_01393_of_04320.hdf5 0a5d4afe278b1f491286037dd9e532ed +part_01394_of_04320.hdf5 a510e2173a1c8361e6b3ccc19c2500ea +part_01395_of_04320.hdf5 0b20a39af26bbd70f2089c36647ae6e3 +part_01396_of_04320.hdf5 decd84e17a46e96254b912d822db0c7f +part_01397_of_04320.hdf5 3e5d91c4d0836c95357818b899a6bd4e +part_01398_of_04320.hdf5 55e4ef6b5ad8eff948358388003804cd +part_01399_of_04320.hdf5 d96e25173917df792cb00d508e119f37 +part_01400_of_04320.hdf5 8cd792e8b111b4fbd49520d38f7cf1a2 +part_01401_of_04320.hdf5 2753e26747dbd3bae61e8b2fb3f81a02 +part_01402_of_04320.hdf5 8c27dc106c6ba8bebd0da14cc6dbc371 +part_01403_of_04320.hdf5 7439494f1fe08a3f287083dde5ac296c +part_01404_of_04320.hdf5 59f493f0a68317d9a5fdff939345d88d +part_01405_of_04320.hdf5 7983c020a1d81c73f1cd03249958d64f +part_01406_of_04320.hdf5 92f4dfaa87984d684500ee601c300de6 +part_01407_of_04320.hdf5 4ae45d789a261d33dc0dccd54221c823 +part_01408_of_04320.hdf5 8b8c4455249671db2af14f1c46be7602 +part_01409_of_04320.hdf5 df2d977ade42ac0b7c55a1e481c47670 +part_01410_of_04320.hdf5 9f3f66bfff584f0e2d36ee482605c246 +part_01411_of_04320.hdf5 9eaaf96330d378397bb87ba4cdfe2ece +part_01412_of_04320.hdf5 8e64ff6174b360241addfeb8c8f9c625 +part_01413_of_04320.hdf5 6559aa6902a991fa6a2f06f8e4de6b73 +part_01414_of_04320.hdf5 0ec6d337ed5e33a2b4e59ee2f8f27af5 +part_01415_of_04320.hdf5 7058491e86159e0f9b2b11ceaed4350b +part_01416_of_04320.hdf5 5aac6d0d13fa4b8cee6b2b7b9bbf27bd +part_01417_of_04320.hdf5 99828cdaa171d30d72dbe8f5f3b02f85 +part_01418_of_04320.hdf5 dc7ca0b7764662fe7fdb14ef484a298e +part_01419_of_04320.hdf5 b433f355cfee1fc52b5ef0952c8b22ab +part_01420_of_04320.hdf5 247468d43babdd5dd227572b9f8803d3 +part_01421_of_04320.hdf5 20c3ebcc0b6d7032c574f75298e6ed52 +part_01422_of_04320.hdf5 3384f91a011976cc411faf34b01158d5 +part_01423_of_04320.hdf5 a7ea154575f33a20fb24000771a844de +part_01424_of_04320.hdf5 384247cf59321b1ca798153d4fc33c19 +part_01425_of_04320.hdf5 1534a93a65af746327a48323fb203fbf +part_01426_of_04320.hdf5 6a4f73d16af202a7f49afb77e99e9e8a +part_01427_of_04320.hdf5 1ab3ec518c586d114c84c78a8fd34da0 +part_01428_of_04320.hdf5 
19a2f6801f31a4cf1b5fed9f2ad10c93 +part_01429_of_04320.hdf5 e84fbe0e79e2f13761ad6060b35d6941 +part_01430_of_04320.hdf5 f574021b2a3df1bf3954849b2b1ca6ac +part_01431_of_04320.hdf5 550f6845e3cf4ed9b3eae8847bb44a47 +part_01432_of_04320.hdf5 1bbaff61d06027527c0f1bd51b47dc03 +part_01433_of_04320.hdf5 632570628579255ec88dd9f33ddf3ad7 +part_01434_of_04320.hdf5 ca121956d88204e07b8ad07b194c9a96 +part_01435_of_04320.hdf5 1f33da810e9a976fe5966514264ae4c8 +part_01436_of_04320.hdf5 ae29ed76e0935f7a96661c90766fe35c +part_01437_of_04320.hdf5 850270eaeac1889710413d79b73ed443 +part_01438_of_04320.hdf5 065499795b705822e7cec700b68f36fa +part_01439_of_04320.hdf5 2e0723ee5118773eaaf21146e71a474f +part_01440_of_04320.hdf5 6e827f72652612125f3372c49943b7d3 +part_01441_of_04320.hdf5 c91e17550d69b77c0012cb4c81c9793f +part_01442_of_04320.hdf5 9a1967096672167fa39118ed51c0f251 +part_01443_of_04320.hdf5 6bc8abe2e3cce09368114cf424e7f782 +part_01444_of_04320.hdf5 0d7feefa68bc93540c87cd94e619666a +part_01445_of_04320.hdf5 c2177f881e38d6eacc910ecb9da9cc9e +part_01446_of_04320.hdf5 8efb92c8eaca423ea6ce1ae915135c32 +part_01447_of_04320.hdf5 66ac50bdf847b63e9427daf8f7cc86a5 +part_01448_of_04320.hdf5 896efb08191238855d4139eb8708682f +part_01449_of_04320.hdf5 93b4d1fcfcdf714a2c2cd8c4fbfb0de0 +part_01450_of_04320.hdf5 e57b4c8704448e384385e8c3ec525644 +part_01451_of_04320.hdf5 b74596a966251465013903418bfc91ad +part_01452_of_04320.hdf5 8c1a440c6b484d36639179386b9e2379 +part_01453_of_04320.hdf5 f773bbe281a8b52cd277754fd313fcba +part_01454_of_04320.hdf5 c0bef7b5902b9f865e957dd2ab1101e3 +part_01455_of_04320.hdf5 2b206f66d639da781255d7f2e768f0e4 +part_01456_of_04320.hdf5 885f4b65267621ca4f967096ce4b7031 +part_01457_of_04320.hdf5 5ef6d7fa6bbeb17025282db75f720286 +part_01458_of_04320.hdf5 f8a89011eab5083d4c9613d8d7cf70ce +part_01459_of_04320.hdf5 44ea7a0b89a0326ba8a511f1a91a5bd5 +part_01460_of_04320.hdf5 f78f39c187f736a580b4d5316baf9590 +part_01461_of_04320.hdf5 98273240744aeaff449f2f9de535a008 +part_01462_of_04320.hdf5 1ddf2b89d3d4dc8e9d4c3ba3e760eda0 +part_01463_of_04320.hdf5 176b1f20e1de8ddb4ef7486e135ece36 +part_01464_of_04320.hdf5 0fb92b9efe00396f14efa2b0a42deefb +part_01465_of_04320.hdf5 76bb4ba296b7ae7ecfb01807ff1cdca7 +part_01466_of_04320.hdf5 3ad16f0fc4d9d4ef5c72fa2612ff5140 +part_01467_of_04320.hdf5 344cf6b884f715af97e77ae0c2a79723 +part_01468_of_04320.hdf5 48c3fa55f13349265b3e3270dfc79416 +part_01469_of_04320.hdf5 3d4023f6a27a0dc7fd8cd58b0a8d0996 +part_01470_of_04320.hdf5 1936ae2e883ccccc3fb9b05f2fb911fb +part_01471_of_04320.hdf5 66a39b2d75a94d283ef2301330848f30 +part_01472_of_04320.hdf5 ae40d2b5f541fbbccacebfeb29431de5 +part_01473_of_04320.hdf5 6330eb7005bcf6a826fad7056b6dbb75 +part_01474_of_04320.hdf5 69347b7101fdbf4d327c2b6697a63fe6 +part_01475_of_04320.hdf5 67dd3ccba2a3365738bf884ab7748369 +part_01476_of_04320.hdf5 338f66dbc279235c16e8a21c31badbc2 +part_01477_of_04320.hdf5 97458e818203d8139eb5d5c617d36f3d +part_01478_of_04320.hdf5 e607cc64fa6b57e76a9b33ba7b6bbae8 +part_01479_of_04320.hdf5 0b5702cad6b3e2c58951876f965ffa59 +part_01480_of_04320.hdf5 713449319b4925daa37832c193e9d3bd +part_01481_of_04320.hdf5 e53fda2d1dbf4d523ba4da282dfca20c +part_01482_of_04320.hdf5 30da27ac375750314009d41b9a8ea8e2 +part_01483_of_04320.hdf5 56ab61295435d9739730b7e8ac088a13 +part_01484_of_04320.hdf5 f06632152e2b8a7207b87cfe2927a824 +part_01485_of_04320.hdf5 fa85b4bf976ba558a8ed0d4b6181af96 +part_01486_of_04320.hdf5 736abcedbfca2ee69c3794c58a2c8cdf +part_01487_of_04320.hdf5 f962631e7dc0f0d6fb56942d2c7f7418 +part_01488_of_04320.hdf5 
31591e01d34dbce7a32df6724ab9a8ac +part_01489_of_04320.hdf5 21135548e7b2917048950ba91711a9c3 +part_01490_of_04320.hdf5 fa394dc79745998c64067f31b3fa1fd7 +part_01491_of_04320.hdf5 545ed3f7e7c648e73e0a015f652b52ec +part_01492_of_04320.hdf5 337ef10553f981a9cc998c0775172ffa +part_01493_of_04320.hdf5 f2275efddd2d1b423bd5793cb7c3f76c +part_01494_of_04320.hdf5 9f1db46eec241287e4ffa63f7ec11281 +part_01495_of_04320.hdf5 ce3d89b818f42640b91b234868471acc +part_01496_of_04320.hdf5 1f22536ddc8de843d8cae3b73dfe44c2 +part_01497_of_04320.hdf5 f6d3f9b13c6946870e8a7e56b235166e +part_01498_of_04320.hdf5 dfd41cfe889e976c57a6710be2bf7e46 +part_01499_of_04320.hdf5 1a896711af0d15beef0e76aa7928fb5e +part_01500_of_04320.hdf5 58c838b031bcc58a84b6b8004249ea34 +part_01501_of_04320.hdf5 d2f2875848df3b1664c490c188a2f5ab +part_01502_of_04320.hdf5 e1bb70d3d97c12389d7ed0faae83684f +part_01503_of_04320.hdf5 3207f23b8c4fa0922ba3240d5efee4da +part_01504_of_04320.hdf5 6f3780f6bc921df5b148d029c4a46c91 +part_01505_of_04320.hdf5 27718923d48d5725f29f360112d723a5 +part_01506_of_04320.hdf5 596d20349bb5457bec8c50e10ec18549 +part_01507_of_04320.hdf5 ac21c2caca7bab447140b6ee1db4235f +part_01508_of_04320.hdf5 ceebff01fdee8e1ebf2a4504945e0621 +part_01509_of_04320.hdf5 6d82ced780424bbef8472499b67016be +part_01510_of_04320.hdf5 f8a4c540023a997277187734d6e1632c +part_01511_of_04320.hdf5 a9da2f34a6ab1d7c2585e1fc2dbdc868 +part_01512_of_04320.hdf5 d2ef8632033dd390a714d9101dffae89 +part_01513_of_04320.hdf5 29efc14f0271e2c1d21bd1b1e025f03e +part_01514_of_04320.hdf5 65e52478aee7a88b6ba01217930b4d3a +part_01515_of_04320.hdf5 eec4d5c0df1766c357e370db0f5eaba2 +part_01516_of_04320.hdf5 bda6f75dc605516303b116e10c86051c +part_01517_of_04320.hdf5 6e26c635f58d206f5581cc3f0f757666 +part_01518_of_04320.hdf5 5b8af273e755ae6d4e7bf537b8878d2b +part_01519_of_04320.hdf5 4abc97b4c358ef6354ee4b83b52789e5 +part_01520_of_04320.hdf5 a6c11fae9261358bc0527d2d91d2e1aa +part_01521_of_04320.hdf5 df6a6640c2cd3cce4ea77d0d16260184 +part_01522_of_04320.hdf5 617176e19d223b67aa5fd53b7044969b +part_01523_of_04320.hdf5 d84586efbb4f4c7122b566b88106dab7 +part_01524_of_04320.hdf5 d5c3e210d49ffc6308b55efcd1eae460 +part_01525_of_04320.hdf5 44d6d1b4183e91c86d405adec67e0208 +part_01526_of_04320.hdf5 9f9ed19ae9aeee1f9b49ae25753acc81 +part_01527_of_04320.hdf5 7fdf6da413b3321948138ba831825ba5 +part_01528_of_04320.hdf5 87c6c33b4ef15f7c168929aa504313ef +part_01529_of_04320.hdf5 ab09343357642cd73b1d315774b0a281 +part_01530_of_04320.hdf5 e0dae68eb6eda8fc0e5d4b6c5b5781e7 +part_01531_of_04320.hdf5 1ab3c5a1f9e73ebccbbcebee00092458 +part_01532_of_04320.hdf5 0660dfa808edc7c453d81c3a656c6256 +part_01533_of_04320.hdf5 eb86fadc72e0adf43897f5da1406d05f +part_01534_of_04320.hdf5 b1517cda54b9127e812faa6028f9b0b0 +part_01535_of_04320.hdf5 cb9309e6b2700adc41408bd1280e1e91 +part_01536_of_04320.hdf5 96d5e91f3c5bc714d8153e04bd3df516 +part_01537_of_04320.hdf5 b64fa6b8da21b158c7a4f374b2e1e6ee +part_01538_of_04320.hdf5 fd6296cf26424b6a3e0ad814d0cc43d6 +part_01539_of_04320.hdf5 03b5e064784e23ad593469a54702a186 +part_01540_of_04320.hdf5 db55c8a71ac6149508cd55686427e388 +part_01541_of_04320.hdf5 5bdced51b57bfc4e41c1fd4a1a931ae5 +part_01542_of_04320.hdf5 ea08d1a9cb47447cc1342e90a9c7f2a7 +part_01543_of_04320.hdf5 5c3f90ac8a2dd83149348fda128cc922 +part_01544_of_04320.hdf5 5421bae39afab618dbd40678edd2bb36 +part_01545_of_04320.hdf5 ff411c9889e28ea4abbcde4266acf6ea +part_01546_of_04320.hdf5 7b7f146583ed024c6b2ddb4c891cf9e9 +part_01547_of_04320.hdf5 2768c1441d65e0b90298799b32c64b89 +part_01548_of_04320.hdf5 
b8ae4655bca83206b286a6aeee6ee320 +part_01549_of_04320.hdf5 b6d7b5a8e999c2ffd450bad11c1333f5 +part_01550_of_04320.hdf5 2b9913ade19e6e792ba3e51d86df847b +part_01551_of_04320.hdf5 b8fb9259c364c6ac8f4113e3ad240ce8 +part_01552_of_04320.hdf5 9e2c71653aec9878ae3cbbd02a806384 +part_01553_of_04320.hdf5 385bc0b9b3a811beec4538dde20dd4b2 +part_01554_of_04320.hdf5 171aa48f60b6eef346b9af1e7379e02a +part_01555_of_04320.hdf5 7fc876549d484c0fc38ef59fb7efca03 +part_01556_of_04320.hdf5 bb4ee3e890aada6a2698316a6c0ce060 +part_01557_of_04320.hdf5 893e6d7ba9a0253d90997123afb539e8 +part_01558_of_04320.hdf5 3781ac8fb599796c4f2e0d38acb06f22 +part_01559_of_04320.hdf5 566ce7a53072f39cc78061a0c7955641 +part_01560_of_04320.hdf5 758785acf9d9bb6bb5fc5b7267505e5f +part_01561_of_04320.hdf5 3c001bcd1b7aeb9729ed2af8259efd43 +part_01562_of_04320.hdf5 a00f6df9579e26a2d1a27e164c4af493 +part_01563_of_04320.hdf5 9693b4c0b6256c605a587bcf36db126a +part_01564_of_04320.hdf5 8ff74524271c15c9c0c6de3f6eead81a +part_01565_of_04320.hdf5 04036387ebdc13eb775fcbb6c8576418 +part_01566_of_04320.hdf5 1eb145b5cce8dff88e3fb258828b458d +part_01567_of_04320.hdf5 bfaa76d2f37b36195ab5a6c4472a5c69 +part_01568_of_04320.hdf5 f866115b82aa0bff357cb137caf4f780 +part_01569_of_04320.hdf5 401aeac6b21043bdc32464e3cd979541 +part_01570_of_04320.hdf5 7dadc48a6f2b160c1efb915253eaa830 +part_01571_of_04320.hdf5 50d07d40e69c41f2dc5f76d46cd0ae1d +part_01572_of_04320.hdf5 2325183fa57788670fcbdaa3c2741dad +part_01573_of_04320.hdf5 7cea9aa4d254c37e424f6e86eec116b4 +part_01574_of_04320.hdf5 e714540812134bca3f8ba9da36807c7f +part_01575_of_04320.hdf5 77e705e580f401adc25cdebf4d78eb64 +part_01576_of_04320.hdf5 d222a790f990d24bde95f1cb633a4b05 +part_01577_of_04320.hdf5 8371d9046af0880949eba752fe7bda78 +part_01578_of_04320.hdf5 d18ebd21d3bb2821185195214b6c70fb +part_01579_of_04320.hdf5 10eb11cc71c8b3be4d7e01ccf54b0706 +part_01580_of_04320.hdf5 860dca352b3b566d04688e43ab80413b +part_01581_of_04320.hdf5 20326cfc4b68d3bc8187a58e80a2c879 +part_01582_of_04320.hdf5 c2f2ece0dd961bee3ff9ea4342aa0a3b +part_01583_of_04320.hdf5 291ffd1ac57628ed094381eb4a2f20ad +part_01584_of_04320.hdf5 357b828f2f263023efd31744fc3ec3cc +part_01585_of_04320.hdf5 5d5f188ff681593a2270df995b3cbb72 +part_01586_of_04320.hdf5 6123850f706fb482edab4ce0d44d51d9 +part_01587_of_04320.hdf5 601f5d6b4ad4ac982780c0c611f2b091 +part_01588_of_04320.hdf5 c30ad0753eddea3e4e36509a2d6d3f0a +part_01589_of_04320.hdf5 1427cb876f7596e06bf693fc45ba94eb +part_01590_of_04320.hdf5 b911239b43cd087c5604e1586e20a2bf +part_01591_of_04320.hdf5 909feb06bdacd02033dbfe2b764ecf7b +part_01592_of_04320.hdf5 87265c14eb46f6121fe51f565239b77f +part_01593_of_04320.hdf5 d74ccc8607d87a70fd559a9109ee4b82 +part_01594_of_04320.hdf5 17aaa796aa6fe34723bba71c13033d9f +part_01595_of_04320.hdf5 027edea96c4add9faa7535f117d89c03 +part_01596_of_04320.hdf5 ed171f709eaec57f711b8c1171cbd9ab +part_01597_of_04320.hdf5 b99463d6df204af2377a685fc5afba14 +part_01598_of_04320.hdf5 377af9d6d90c35d001134a8ae687abd7 +part_01599_of_04320.hdf5 7b4d2fef765ae719e3dfa18d4b9181b7 +part_01600_of_04320.hdf5 163b3bb8ddc51a8003406d04088c521f +part_01601_of_04320.hdf5 7ee3d8d0911e54482c0eddba54bfecf1 +part_01602_of_04320.hdf5 e56a86d2abcc4f6d16334668121be8fe +part_01603_of_04320.hdf5 460a7d4d73d93a541088e88c63308a21 +part_01604_of_04320.hdf5 0f798464e056410e6bab3782a98dedb5 +part_01605_of_04320.hdf5 feb4c3bff75ea2cea1430d553064716c +part_01606_of_04320.hdf5 fd16570db7489892494fec66fe8b6604 +part_01607_of_04320.hdf5 9e1a33d66f468583f07e3e8b7290d1bc +part_01608_of_04320.hdf5 
60d331e2a87e8fe1788b23206d1b8a2c +part_01609_of_04320.hdf5 6a6bcf2fdb7595a51e8a5efb2a072f3d +part_01610_of_04320.hdf5 7b23f0b13db0811b4ed1e0c7783ad6f9 +part_01611_of_04320.hdf5 abb851988a53ae2326b0b7cf38866dc2 +part_01612_of_04320.hdf5 6fd4fefbf45dded407f994a8abebe36d +part_01613_of_04320.hdf5 907d017bea80e8e7ffcc6f608ec539d4 +part_01614_of_04320.hdf5 cf4fdedad0871c25e31c25ce756eb90f +part_01615_of_04320.hdf5 9ceea17c6ddd8884323639371ce33b8b +part_01616_of_04320.hdf5 8b04b869f76e1a4cf6a1f2a89e7aafd0 +part_01617_of_04320.hdf5 c8a4341f4001a6ea84d20bb3e87b5fa9 +part_01618_of_04320.hdf5 cb9d132a37df650eb2bb2deaf21bc27f +part_01619_of_04320.hdf5 338e4d1e15b704694418dcc8732beb75 +part_01620_of_04320.hdf5 af406b9ade77a866c98041914bfdf6b5 +part_01621_of_04320.hdf5 b5024fa50622fc2528ac4435bcf33cab +part_01622_of_04320.hdf5 c734481e2f934410927700a47f54871f +part_01623_of_04320.hdf5 9164fcd95cf44d7bfbbb022f0153bb5f +part_01624_of_04320.hdf5 388e8f793d72c399d4b6a29b4ac27e29 +part_01625_of_04320.hdf5 e190f26094074cefefc57b97adc1316a +part_01626_of_04320.hdf5 bb5c848d03f3e0b366ba0b4d8d983be5 +part_01627_of_04320.hdf5 823aa5aaf6e48f691e3bb5ad2512ce7c +part_01628_of_04320.hdf5 1b10017d865e0d969ab861bced461766 +part_01629_of_04320.hdf5 5c506d8cfb1f1a32e5eb4dc9df2734f4 +part_01630_of_04320.hdf5 acf64031762ecf421d0ef9e841dfacbc +part_01631_of_04320.hdf5 86d8f963433f188e0ee68aa54c3f7ebf +part_01632_of_04320.hdf5 e6553ed16d9d42a6aef98abd63b8548c +part_01633_of_04320.hdf5 a8500cc45995c9c7351ad23d581334ad +part_01634_of_04320.hdf5 e79003a51545460ac39b99d332b918dd +part_01635_of_04320.hdf5 5c9645d1f39b090a4d76cca6276c8de9 +part_01636_of_04320.hdf5 a0d8a080fceae10f4a428a4bf7b60d02 +part_01637_of_04320.hdf5 71a875fa393ab9c2537768bf42214821 +part_01638_of_04320.hdf5 483effeb1681841c5a74198bd11ef153 +part_01639_of_04320.hdf5 385e8637cbb00f713c07c6c4ce26938b +part_01640_of_04320.hdf5 4f41bb1cff38c7db9bea8e9bda663d4d +part_01641_of_04320.hdf5 d7e515f8120580a43e13176c5b309b74 +part_01642_of_04320.hdf5 9a52a4d3c980c82e4c8d546e63c88ef6 +part_01643_of_04320.hdf5 caa4c4b9ed006553b479cd67f039db0c +part_01644_of_04320.hdf5 30d3e5813e4488100f3e58c359b8bbfd +part_01645_of_04320.hdf5 c7c6e4135cf9505087cd8108f2f126da +part_01646_of_04320.hdf5 b2b8090cc215f8a580c5ae4cb33b05aa +part_01647_of_04320.hdf5 06493e376423b8bded1365165e96a949 +part_01648_of_04320.hdf5 4ef9f52ec8cbc0534f1047ef91f7e6ef +part_01649_of_04320.hdf5 cf88b47f19b0536fabc19b0422eab7c3 +part_01650_of_04320.hdf5 e4e6354f7aed08bc1bfb4e622cb9c26e +part_01651_of_04320.hdf5 c3b717bccd8fed83ce549c5eabc9342c +part_01652_of_04320.hdf5 19a195971685a20e52d9e21f2b05bec9 +part_01653_of_04320.hdf5 d6469afade1f6ce70233eb082bd80bca +part_01654_of_04320.hdf5 e46ce4efe35a96549cbf8fd24a517fa2 +part_01655_of_04320.hdf5 23e9d833d5f43bc59adb3717f630aff7 +part_01656_of_04320.hdf5 61e7acb68aaae4db1bab9a8766b0b0cb +part_01657_of_04320.hdf5 07f78f0914681fc602499a0f5d74589f +part_01658_of_04320.hdf5 c07573e92ce0c504d30956e998bc9226 +part_01659_of_04320.hdf5 ce9be6c3198659225cea2b0a951afc2f +part_01660_of_04320.hdf5 4f76d5d9c66572547a4744fc6610feba +part_01661_of_04320.hdf5 e1f816980b77262c333204308ce8b2cb +part_01662_of_04320.hdf5 a4e92f238796afec2cdc8c89721e069b +part_01663_of_04320.hdf5 a43b14830589b45d211406bb0a7570c0 +part_01664_of_04320.hdf5 6f2fc28c3bb55aa3726ef49f37197d1a +part_01665_of_04320.hdf5 318f0b41538399c05d741619ea5f97a5 +part_01666_of_04320.hdf5 e1fa46eff076b71aedb3e05c9750dad3 +part_01667_of_04320.hdf5 f9a4e64149c64e55c1d3fad025cb3b44 +part_01668_of_04320.hdf5 
098bce10edb3274253faa58441a4518e +part_01669_of_04320.hdf5 02e09736de6c4194679e33104f6f764b +part_01670_of_04320.hdf5 8d60fd67ebe7713ef3594ac0dcfed55d +part_01671_of_04320.hdf5 a413827ec4dd522654cf7f52b97cfc53 +part_01672_of_04320.hdf5 4b0ea6a63dca6f2cd3dc0c9571dcc2c5 +part_01673_of_04320.hdf5 37045138780cfa999a6a530520134538 +part_01674_of_04320.hdf5 08a4dcabfc1c98637f3d73d5b9fb0d1a +part_01675_of_04320.hdf5 5ece0bb7cde489cae4620a5678411ea7 +part_01676_of_04320.hdf5 37afd67f8ffaedf684458ff6a2e9a6bd +part_01677_of_04320.hdf5 d88e75a1fbd30b8800c55aab22909307 +part_01678_of_04320.hdf5 f06cee90b0b55ab21d7b5d91f69bc4bc +part_01679_of_04320.hdf5 d152678c5e61c3be36090b35ca693962 +part_01680_of_04320.hdf5 12cd90758b16d214a85dc0341df1c5ac +part_01681_of_04320.hdf5 41320862e9efb7913615de76f151ce25 +part_01682_of_04320.hdf5 821c9e198f6433cd687d099caaa85ecd +part_01683_of_04320.hdf5 0d71ceee816e5f89c4ce9c604d023efa +part_01684_of_04320.hdf5 cf6a83b55200963b96343355fd0c7673 +part_01685_of_04320.hdf5 b2f1ac3086f4f761f7ea8e13c92c15be +part_01686_of_04320.hdf5 30c0c61507f74c73577c7fb7d0145bfb +part_01687_of_04320.hdf5 1674d0884b2b9f52321257d78fc04a75 +part_01688_of_04320.hdf5 f82d132e65f034755f63fafa63fb1e22 +part_01689_of_04320.hdf5 41f11c8eb76c1f1afea38465068a2905 +part_01690_of_04320.hdf5 22bc8221024119dccc2ff51e045141bc +part_01691_of_04320.hdf5 22d945fe6f2389ac47badda4f644e85b +part_01692_of_04320.hdf5 43bd3d92770494a918c88898dabf2943 +part_01693_of_04320.hdf5 67b3ea30aa20f46f8a5502c43fee3239 +part_01694_of_04320.hdf5 777e2c3470f85d7216f6715c33938205 +part_01695_of_04320.hdf5 1cb44ecb12523a85f49918cfc9159885 +part_01696_of_04320.hdf5 123520fd2cf7729fa08b2be4f9c643bd +part_01697_of_04320.hdf5 ba8698b605b015e82bc3caabb3f08c51 +part_01698_of_04320.hdf5 5287bce4b7d2ae582e689406ea68e981 +part_01699_of_04320.hdf5 08420620932477d95a8af6db818f0caf +part_01700_of_04320.hdf5 ba5e94fc829866a9c2aca1821a0384a8 +part_01701_of_04320.hdf5 fd365803155c27dd4385d0a0fafabec3 +part_01702_of_04320.hdf5 17e1e11d4ea92db95ce998b5af2d58f8 +part_01703_of_04320.hdf5 3bf0e43505ed8ea5bcb8269faff1b06e +part_01704_of_04320.hdf5 b8270f96b74a681dd121dd8d10b4a1c1 +part_01705_of_04320.hdf5 ab8055b69340cd7101d50ef6cd445811 +part_01706_of_04320.hdf5 e3a0550f972a872e1e81e80c519a3506 +part_01707_of_04320.hdf5 6aadd47b17856374d29b6b888b142417 +part_01708_of_04320.hdf5 f12c2e838640010849cd5bac0f9276e5 +part_01709_of_04320.hdf5 747d2d8211be0b523a602f82c24d166e +part_01710_of_04320.hdf5 723e584f530c42e3600d073490286ac0 +part_01711_of_04320.hdf5 2fbc1f50aaeb447d5c752a6fd16b2fbe +part_01712_of_04320.hdf5 381cd182837164c83b41a407568f995a +part_01713_of_04320.hdf5 40dde5423eefdbe41ffedd75cbd727aa +part_01714_of_04320.hdf5 c9eadbcdf829ff357b41c9265b9f5e5a +part_01715_of_04320.hdf5 b50a536c72e205d0c5c4e07faaa19e3e +part_01716_of_04320.hdf5 bfe4d17855c17d8f49b787f33ac3aba0 +part_01717_of_04320.hdf5 551d1f7797ccf44d54f2a7742ecc118b +part_01718_of_04320.hdf5 ac9fbebaec92bc45e6ae3e5474799777 +part_01719_of_04320.hdf5 41fae03074b8d525d4b1c2325c3481dc +part_01720_of_04320.hdf5 b4752ac1536de8063fbc1e95c7e428cc +part_01721_of_04320.hdf5 cd9e7ebcffbbf5407ba83dce2eebcaf8 +part_01722_of_04320.hdf5 caf6b4c2fec3b580d63e7eeaec5c667b +part_01723_of_04320.hdf5 9d93acf0441ab1c660111851b0ff7e7c +part_01724_of_04320.hdf5 fb47f7b1eabd251b207a90d7eceb7ef3 +part_01725_of_04320.hdf5 5fc4b51a2f188d9388455eb32c7c67ec +part_01726_of_04320.hdf5 dc1c7cede7fddc7dd7a917191352185b +part_01727_of_04320.hdf5 d0fba5da107ce7f2904c59eded579564 +part_01728_of_04320.hdf5 
05d223394fa4a415079100ac91237240 +part_01729_of_04320.hdf5 8787878288ef40d92472c23214d862d2 +part_01730_of_04320.hdf5 8c16c08578047b4b80c2724ec25ccdb3 +part_01731_of_04320.hdf5 4fd110697d3abd198c9e0fdfe71102aa +part_01732_of_04320.hdf5 dc13d8935b13791811bb7aaef04373e0 +part_01733_of_04320.hdf5 1d1255e6b17f6b442353e2024ab56127 +part_01734_of_04320.hdf5 d5fe8552fd8e7feed5980821a50b483f +part_01735_of_04320.hdf5 0ac3301dd1c38c368be5a139e38f25c3 +part_01736_of_04320.hdf5 546d02536438cd13076b0becabd70482 +part_01737_of_04320.hdf5 c77097d8eb7a8361e2c4045b6fe41280 +part_01738_of_04320.hdf5 0b6c5c596fbe2ffbc9f12028e9c6711d +part_01739_of_04320.hdf5 3f9c56838e63a802e0111ca2840ac0ca +part_01740_of_04320.hdf5 384e7c9c5d5d827dbd54629d5b1fdce0 +part_01741_of_04320.hdf5 ce3db02e180a7071156891b6fb5f8754 +part_01742_of_04320.hdf5 6f7f2893f9eada6f31757db3e4f2c26f +part_01743_of_04320.hdf5 6f88a0b87881db356adaddc617769259 +part_01744_of_04320.hdf5 a46ca8d3933760b470431f2d364af9dd +part_01745_of_04320.hdf5 d0dfe57b5a457eb8e6a733e392279a91 +part_01746_of_04320.hdf5 d70cbfff2f2ab10326f32d5a18fe5c4b +part_01747_of_04320.hdf5 3fbda3877a80a8ca9cbac52b23b53cf5 +part_01748_of_04320.hdf5 46818e7961714edb50582e18a2eb6e55 +part_01749_of_04320.hdf5 28fcf862e37e44c60b8de9d41e217e96 +part_01750_of_04320.hdf5 b99c66ec38079311f26e6f0abb68d964 +part_01751_of_04320.hdf5 43096de4d276c867b6d55902fcdac67e +part_01752_of_04320.hdf5 e5ca94dee329876855f2e7c281379482 +part_01753_of_04320.hdf5 281c07519ba6045b7619e655a0101548 +part_01754_of_04320.hdf5 8c1dc86505ff060778eff33814032d43 +part_01755_of_04320.hdf5 eb4fbab2c44b8deb41634064d59f6fa2 +part_01756_of_04320.hdf5 1d646634d7ab0b3913175bedb50b88b9 +part_01757_of_04320.hdf5 85843d4d49e458cc4bbed0e640326f3b +part_01758_of_04320.hdf5 6ab6167c8996c562c7db049ba2ff71c5 +part_01759_of_04320.hdf5 1519a8c604ce7fee89bdc991e6f26159 +part_01760_of_04320.hdf5 bb5b17cc4809398c5cf35d07e1d3c468 +part_01761_of_04320.hdf5 ff36a512548037ead602cfdd8befabd4 +part_01762_of_04320.hdf5 f633f2944fd5b39a51c2ebacfe0147da +part_01763_of_04320.hdf5 8b77d5799dcd3ed9379ee71c5c6bee9d +part_01764_of_04320.hdf5 2e0b9afe68f04279ebe98b102a6791d0 +part_01765_of_04320.hdf5 75246ac17c39b49f0723e2ae51b77fac +part_01766_of_04320.hdf5 2c4c1a2928102d0d238bc95de42e3b30 +part_01767_of_04320.hdf5 b36ea7c7a263341a02268987f6f5168e +part_01768_of_04320.hdf5 5ad71280d925a254e4d6bfdb6baebb41 +part_01769_of_04320.hdf5 a8245cdbde8d5c81a333019db17f695b +part_01770_of_04320.hdf5 46e0214848309bab4129d76021ec3222 +part_01771_of_04320.hdf5 37a1091b017d96f4c79d7db67645d6ee +part_01772_of_04320.hdf5 b076b5150c2709f53b281359ac0d0693 +part_01773_of_04320.hdf5 8892a5dc588929f8cf68d8d862b9d599 +part_01774_of_04320.hdf5 618028b6f5aef1794a848b6c54aad3fa +part_01775_of_04320.hdf5 4eb3c7d52c884bb2febdac8e6254e628 +part_01776_of_04320.hdf5 61cfe8eb146e029d20d3b94c2e905ee4 +part_01777_of_04320.hdf5 3812a5fccf0ba1ba74415cf475c5f289 +part_01778_of_04320.hdf5 dbd482202f5a784a4ea98c62662c59c6 +part_01779_of_04320.hdf5 92d3dc7717510d1d2570ebc096cf5e00 +part_01780_of_04320.hdf5 da52c1270b0d5199388f4d26e913a627 +part_01781_of_04320.hdf5 0f87a752a309202eb6063b9bd0f47276 +part_01782_of_04320.hdf5 0f7d4fe0be65daf6b573487bfd5b59d9 +part_01783_of_04320.hdf5 2b781e921802dece0e9ad6bf70bd118f +part_01784_of_04320.hdf5 a66333a08a225265ed3495d7fc88640e +part_01785_of_04320.hdf5 46a0b3fbe7b1c609a46f3c1474bbb064 +part_01786_of_04320.hdf5 ce66cfe08f5d1cfdce5dc6b16fcc0f72 +part_01787_of_04320.hdf5 e8ea32b7c41abd1c7cbb1b09eb66e25e +part_01788_of_04320.hdf5 
eddda70b654146d55d8f1fb7fa4cd652 +part_01789_of_04320.hdf5 4df7e6067de690e67316836b68905344 +part_01790_of_04320.hdf5 485bb46b02d4da77622b5cb29fc5ff7e +part_01791_of_04320.hdf5 fa7e9d5261ad67d1a336b8000fa29a18 +part_01792_of_04320.hdf5 9282e2ae64b7f642bbc8e913f44945a9 +part_01793_of_04320.hdf5 08e906ef89b20516485f0859e8f47cd6 +part_01794_of_04320.hdf5 798e26e442fde73da7d2d94f16f00a4e +part_01795_of_04320.hdf5 30c5142c5b7cd7f466bdfd6cf102e1e4 +part_01796_of_04320.hdf5 85199f72f6f7507751528f0f6803cb94 +part_01797_of_04320.hdf5 69f089ab2b95e617703591cfd879144e +part_01798_of_04320.hdf5 1264d31bf2ceff42df1a7127c949eca7 +part_01799_of_04320.hdf5 2dedc4b848fc8d5e36eb57621ce12bb2 +part_01800_of_04320.hdf5 7f2b865b60514605025f4bcf2b6e1eca +part_01801_of_04320.hdf5 9876dd966e45026dcc99840176c7c44b +part_01802_of_04320.hdf5 3fc8056e511f1273d8fef4f3affb616a +part_01803_of_04320.hdf5 fbc7f81f1e25d0ac3acac22126be4678 +part_01804_of_04320.hdf5 bc70401e9381c11f5cd31609b8e53aae +part_01805_of_04320.hdf5 22aa53091bf24d1b17d0b97d9fb2a7db +part_01806_of_04320.hdf5 b9be848b54e260873ac09dba23c57e8f +part_01807_of_04320.hdf5 802b84c682caceefc284bb7b0408ffbb +part_01808_of_04320.hdf5 9bead2f03643112ab17e046e8b849d33 +part_01809_of_04320.hdf5 1812e04e6c764e73b2bb21489b047e09 +part_01810_of_04320.hdf5 9df2a7ee8e13451dc72be8c3d461ba00 +part_01811_of_04320.hdf5 8f6d863a0a136c30b80e2652dd489dba +part_01812_of_04320.hdf5 eba4ab388e24e6622825292d7a65cfef +part_01813_of_04320.hdf5 86378030271088cc0b11be1c41fa508c +part_01814_of_04320.hdf5 8bfa8ade9f22fa26e01ccbf72c901a48 +part_01815_of_04320.hdf5 3a55f70787444a7dae392b4a95cfbdbc +part_01816_of_04320.hdf5 709d4a7f39404587253b616333f9817d +part_01817_of_04320.hdf5 5564b140061e71438336d0b055c56a64 +part_01818_of_04320.hdf5 9c5099cf5e0367ebb7eb437c9cbea2c8 +part_01819_of_04320.hdf5 ce24d5e971a0ceb60ed734158739871b +part_01820_of_04320.hdf5 43d47049b67286bf96055ae04338b903 +part_01821_of_04320.hdf5 bc115cbe455e6c95ec34a58b8f0af223 +part_01822_of_04320.hdf5 8fdebf31ffd6d386dd26c3a36a2c787a +part_01823_of_04320.hdf5 974b5920aa958ff7ba1abc11c7129623 +part_01824_of_04320.hdf5 2bc3a6b7939df252a57e383c25a10e46 +part_01825_of_04320.hdf5 3295d64a1685aaff448e5c02a6137e6f +part_01826_of_04320.hdf5 99f42f03ebb2683262074f7c6c85446a +part_01827_of_04320.hdf5 bb5f93a1094ecd2141f130ed6e0abc07 +part_01828_of_04320.hdf5 8ea3a789d1d6137668457e1d9c00dbe2 +part_01829_of_04320.hdf5 0970cdb1d8d254e371a8e0e249ceb468 +part_01830_of_04320.hdf5 353d7e737d8d93099a8b2b6df9b14f0e +part_01831_of_04320.hdf5 bbcd4f0ae80acb7386c2537d15df2e8f +part_01832_of_04320.hdf5 d97dcdc22e337ed4fb68b7e0635abd66 +part_01833_of_04320.hdf5 98918098ec6d3d33b0bcf745b8f9f01e +part_01834_of_04320.hdf5 a5b391d0c58fbda63710f83d36ce3a81 +part_01835_of_04320.hdf5 9fafc0504033b85d99e9ae25c3817176 +part_01836_of_04320.hdf5 c15c6310222207e3aa5c028ee59ddd2d +part_01837_of_04320.hdf5 0ebdaac0bb8304007d8b320c31a4379b +part_01838_of_04320.hdf5 20298de05b9387a93f56c28f496aa316 +part_01839_of_04320.hdf5 ea87a920dc0dc97ff86bf87ffba7ec5f +part_01840_of_04320.hdf5 c037f52ecb8af032828111a5ad4f1ca6 +part_01841_of_04320.hdf5 9490744fa669c00d861898ed3793790f +part_01842_of_04320.hdf5 39495ba97e551f9eb1ccbbcb8370c678 +part_01843_of_04320.hdf5 c91e251fb8aa5d4a707365534429b448 +part_01844_of_04320.hdf5 0cfdd6c46f70de70e127f0e09ad122a3 +part_01845_of_04320.hdf5 f8634d9c54e1f809d46286ebcbdc761a +part_01846_of_04320.hdf5 2c9cc80c39b8e0ac3b839410dc6280fc +part_01847_of_04320.hdf5 1269709471cf8b75ddc3d925de0f02d4 +part_01848_of_04320.hdf5 
6f80d65e1b061765875c2bdd6daa2684 +part_01849_of_04320.hdf5 3479a1fe0a15763f83a64371c209133a +part_01850_of_04320.hdf5 9e531106def9dc7953144d611c8449f2 +part_01851_of_04320.hdf5 366b2c12cb2357f12e52fab643160610 +part_01852_of_04320.hdf5 3f796833bddfdc229fb5ee9ea94e2027 +part_01853_of_04320.hdf5 871ff313beb98c7fd94d2cb95f497ba3 +part_01854_of_04320.hdf5 1d25a5dc7003c12202f08a62409b31b5 +part_01855_of_04320.hdf5 81648a9fe165f1cea4cf4b61977dc925 +part_01856_of_04320.hdf5 622d21c4982858f9aa0d87031d6bd2b7 +part_01857_of_04320.hdf5 585646718419e3fa99e7299b2c70b1fd +part_01858_of_04320.hdf5 6dd691b4ef236d5da6946f4f214c8522 +part_01859_of_04320.hdf5 a9121855577c24a62a45a3f54dba063d +part_01860_of_04320.hdf5 46066380447beee717c52640d9935df3 +part_01861_of_04320.hdf5 b629e4fde7c216ffdb4d37858438e8fb +part_01862_of_04320.hdf5 7abf404e854e9dd1e5a4d46125c74d80 +part_01863_of_04320.hdf5 1c75d30b44eff64c2c4e7d358b1e43cb +part_01864_of_04320.hdf5 d47ba11b3cff0672dd6a786730fa8308 +part_01865_of_04320.hdf5 c12478b09309aad38b453aeaec0d333e +part_01866_of_04320.hdf5 e354cdb92802fe3a6fc2f4add2ee66bf +part_01867_of_04320.hdf5 59c9733651f434c1bdc6c67329739bd6 +part_01868_of_04320.hdf5 810bdd038e17680dfcb7997e4fb08ce1 +part_01869_of_04320.hdf5 8f41f247f69b5917d7a3ea0bc6aa4232 +part_01870_of_04320.hdf5 dfc222561ac7cc314e9d3c8329b088b9 +part_01871_of_04320.hdf5 1680ec952f4dda83167e8fc54694bf64 +part_01872_of_04320.hdf5 412f7b3df9e14e4d440e835f73f02687 +part_01873_of_04320.hdf5 8e3a2c32dc0f7de55a5eadab167fad9e +part_01874_of_04320.hdf5 4e1f56c36a55407b1d61f9b460a947aa +part_01875_of_04320.hdf5 5451eab9a5a7ef16a0d26250e21bb1c9 +part_01876_of_04320.hdf5 d85e2a520d0735def97e88c2c32093bc +part_01877_of_04320.hdf5 748161d584e0e9f0338bcd33bd76e092 +part_01878_of_04320.hdf5 24aab56379fc4285ce494ef25350b09c +part_01879_of_04320.hdf5 affa515b970a940e037b99820fb9f15c +part_01880_of_04320.hdf5 3268646399192aa48b9d049f3ddc4b3a +part_01881_of_04320.hdf5 3aaffb633ce8c55333a56271a0b5b6b3 +part_01882_of_04320.hdf5 1bb699aee2247f2607654be4d62326a0 +part_01883_of_04320.hdf5 f6363c37a959a97a124ec4d80c8c96ec +part_01884_of_04320.hdf5 a3c88b95e66f2afd78e587fe7687003a +part_01885_of_04320.hdf5 cacb49d63a0b61ccad8e2a9398f71d23 +part_01886_of_04320.hdf5 456a3f7b4604414e6db09e061211cd07 +part_01887_of_04320.hdf5 fa1e3a175f17e50fc945f34e6b9e0a4a +part_01888_of_04320.hdf5 1a23f9991ea736ebf15e802ef3a48b13 +part_01889_of_04320.hdf5 d9ce0d1b9a8e882b96d33731c5a7f95b +part_01890_of_04320.hdf5 beb6bd6fdb84c4f572543f8c70b4fcf5 +part_01891_of_04320.hdf5 bb4eacbd3a70a136231790a6124f6fbb +part_01892_of_04320.hdf5 5c82796167cce283c24112bab4a58984 +part_01893_of_04320.hdf5 1e7ef651337e01cf302940da53d2b86e +part_01894_of_04320.hdf5 a3e274261c3e133fbe4ba39456846da1 +part_01895_of_04320.hdf5 8aa4b29536c6d9f617621eb551755e17 +part_01896_of_04320.hdf5 1877afe37708e0e0ea4e937480bde304 +part_01897_of_04320.hdf5 194e286aa8745627b8653bdd3b8b13d7 +part_01898_of_04320.hdf5 51dfc212e59f3c836fa19e574e714ad3 +part_01899_of_04320.hdf5 6e630b9779d3eb41ce971d139cc00782 +part_01900_of_04320.hdf5 b6385131fd7052f08113ece773da33c3 +part_01901_of_04320.hdf5 808728e32b745df842b8eef13abb13b6 +part_01902_of_04320.hdf5 858ecb93f2e310524735197796a3af12 +part_01903_of_04320.hdf5 2bfaeaa6c2233591b859783987493ae2 +part_01904_of_04320.hdf5 c6314299f91e1fb046c7121497d1eabc +part_01905_of_04320.hdf5 7ce35858f2e91e27bacabed9d494ee8e +part_01906_of_04320.hdf5 f50786fc3c52969855176440c508c542 +part_01907_of_04320.hdf5 358359e0cf01425399e5ba5d66ba3c71 +part_01908_of_04320.hdf5 
5609dfcac5e274b329569dbcbb3ff01d +part_01909_of_04320.hdf5 e4efaddcfbcfddb47a133dff7fb25adc +part_01910_of_04320.hdf5 5d778d1fb027ce0da209bf46e701ddaa +part_01911_of_04320.hdf5 23575ff0c11cf37962b4665d2ba6495a +part_01912_of_04320.hdf5 e7a9258846e4e50449eddb79a6df5a8e +part_01913_of_04320.hdf5 82495912a1e80e45ee68cc054c612add +part_01914_of_04320.hdf5 4befa7897a33124f2d2b4e8f79e1599b +part_01915_of_04320.hdf5 a07641a0f0ac1f6d30294b534273aac8 +part_01916_of_04320.hdf5 b8489ca58a1e9738dea561717bcdd94f +part_01917_of_04320.hdf5 5881a792e4217f7195ad961b01c8afc9 +part_01918_of_04320.hdf5 67ef62d21ac95376c2fcb0ff3e968696 +part_01919_of_04320.hdf5 48522d4d9ce69ca3cf5209b9e4f26694 +part_01920_of_04320.hdf5 121af1f390c4e146742b80d8f40ab4da +part_01921_of_04320.hdf5 823b363c2fe66552d811d5ce301ba13f +part_01922_of_04320.hdf5 8dfcf95379f1afe2ed7e3d5f0f815748 +part_01923_of_04320.hdf5 b2b4043b19a4bfe20786930b1a97711e +part_01924_of_04320.hdf5 508d38b7aafa2864d9545a349b29da28 +part_01925_of_04320.hdf5 73de7cb5a08447d5441ba5292ad589e0 +part_01926_of_04320.hdf5 cc95ea08b58be6827a44f937fe4850d3 +part_01927_of_04320.hdf5 6cc6e7af7eb1eaaf9735c15931ceb703 +part_01928_of_04320.hdf5 0d89c715ede73515a91c8319b0b76a2a +part_01929_of_04320.hdf5 e981bf2e004870a2af813c94940070f5 +part_01930_of_04320.hdf5 712a86ac28d20d64d5908782ef97c579 +part_01931_of_04320.hdf5 0a64ac22453657fe9f824f9334d27f25 +part_01932_of_04320.hdf5 25008aadbff088be8cd04aaa3d7d08bf +part_01933_of_04320.hdf5 3915591c7ca1558000be2047c306bce3 +part_01934_of_04320.hdf5 3e5f2739b2e520489211b289807647de +part_01935_of_04320.hdf5 4a757ade5e350f39079fe0a03331a261 +part_01936_of_04320.hdf5 6239e641b9d9f3341b9ed202b8bd776d +part_01937_of_04320.hdf5 73f577d9568e02c1400b0b0c27fb13b9 +part_01938_of_04320.hdf5 7e3e26d6a9f1f9d48545428019bfc408 +part_01939_of_04320.hdf5 be856e039449c9e521f62edea0853238 +part_01940_of_04320.hdf5 19cb3324e8e9b0b010b0b9aede2edb38 +part_01941_of_04320.hdf5 9440b1cfbc1a970d60f843a86e3fcd21 +part_01942_of_04320.hdf5 c04ea8d0fc5d4f281c62852cb3a5b79b +part_01943_of_04320.hdf5 ccf39fabc8eb656d3a212cec9ec8d564 +part_01944_of_04320.hdf5 4aa730c4d5e6ecd0e034f6d754fc443e +part_01945_of_04320.hdf5 9c306c9185d3694cecf94de5dfe51450 +part_01946_of_04320.hdf5 f5d2b6577764e89019dc776d214f94e3 +part_01947_of_04320.hdf5 139b210d11d8188ad7e61c065043dadb +part_01948_of_04320.hdf5 c0303dce60b428141bac3c68b215b3c0 +part_01949_of_04320.hdf5 b4a06003640e9fd2cc8bbf5816b7c6cb +part_01950_of_04320.hdf5 47dd08dfd15ecdb802d3d213e9e1ff42 +part_01951_of_04320.hdf5 c19fc7ecd64f9dea26bbde9f4aac949e +part_01952_of_04320.hdf5 e60487788bdbae66482d3a563b2a8c29 +part_01953_of_04320.hdf5 f36322e4d98e997b518337ea5f50d783 +part_01954_of_04320.hdf5 6e0f8e9627f248e19e65b82fd1dd4260 +part_01955_of_04320.hdf5 623763ca8f980f200385b350c9ea79ad +part_01956_of_04320.hdf5 860430e85f5bd55520aa24fad6ce00fa +part_01957_of_04320.hdf5 40beb0f45294e67968382d8a9503c970 +part_01958_of_04320.hdf5 d97cb776fcd4ca801624e40bbc2ac865 +part_01959_of_04320.hdf5 cc09728379e4cc3169d3c47a1e9a84aa +part_01960_of_04320.hdf5 d348e9ad3d8435504e829246aac6e85f +part_01961_of_04320.hdf5 9f67519716f084367f7d576c6a3a7925 +part_01962_of_04320.hdf5 edcbde998a35cc0f5ff48428d5f914bf +part_01963_of_04320.hdf5 c45abdae3ecd129e34ec7c9199e5c922 +part_01964_of_04320.hdf5 3254286c7ec5d3911e6e469e1c5fbe70 +part_01965_of_04320.hdf5 563bea1149014e803126677a5adb81b8 +part_01966_of_04320.hdf5 1cd8ec35502e63242c078f0d773bd663 +part_01967_of_04320.hdf5 fb18ab42666679a07a18c74f6a3af4af +part_01968_of_04320.hdf5 
d08e79f6ea5bdfe2d0d7aedde2d49f3a +part_01969_of_04320.hdf5 9c88ff461f860172f8d74f397e35ccca +part_01970_of_04320.hdf5 978acc4e634caaffdf572ce9ca11faa6 +part_01971_of_04320.hdf5 712270fd90542c7400789c2bef49fb88 +part_01972_of_04320.hdf5 eeaa36b4a608219007af7977af69364e +part_01973_of_04320.hdf5 3325f4fbe7ff0c13119ec195bc245d1f +part_01974_of_04320.hdf5 a3f369bac6a651cffe487d3a43ebfbc8 +part_01975_of_04320.hdf5 722ea1b55092a2b9c0ccb513d7be4ca4 +part_01976_of_04320.hdf5 0a42be37dce26233131c97acd31b2bca +part_01977_of_04320.hdf5 c4a14a3e5d774378a57701e56712ef7d +part_01978_of_04320.hdf5 22c16c8702be86641d483c7c580e68c9 +part_01979_of_04320.hdf5 be1fb7e32063c93459447086724f5651 +part_01980_of_04320.hdf5 a94749307be5a75fd1eba377620d8188 +part_01981_of_04320.hdf5 bed861c1013e8f497fd47d92375e0f61 +part_01982_of_04320.hdf5 43f02cba5af795ce69f55f6b599d107e +part_01983_of_04320.hdf5 01679adfbb30c11a9dd874413241040c +part_01984_of_04320.hdf5 6aac1eab46bfb8b06f2ac09cc7551da5 +part_01985_of_04320.hdf5 0823c8b3b83b4cea018fbb20b2c72b95 +part_01986_of_04320.hdf5 a0b1cbd3a4dcac875222ee77714969ec +part_01987_of_04320.hdf5 a678227550487b8bca3f1757edd1427f +part_01988_of_04320.hdf5 94b499157f1fe6f8c98aa72c5e5da110 +part_01989_of_04320.hdf5 6def8970ef35851e1bf72373b2c17f40 +part_01990_of_04320.hdf5 a76fd748fd9b4c6cd5e669e43c25d3d5 +part_01991_of_04320.hdf5 07abdd76988e2c5617aa8694de82e412 +part_01992_of_04320.hdf5 9c979eb755ed90dc3a33af37aa30de36 +part_01993_of_04320.hdf5 5757d031305e0398b84f3c053e3ecdd7 +part_01994_of_04320.hdf5 6e67f3bfc4ce9142b5f052beb0b1812c +part_01995_of_04320.hdf5 9e9f1c6a183267783ccdede6ccb11e8e +part_01996_of_04320.hdf5 fcb34fd18132e57126f7bb8ec919dbcd +part_01997_of_04320.hdf5 0f5807e82f131c5e66f704873f5428db +part_01998_of_04320.hdf5 e272fd983614dfcfe1689a18b74296a8 +part_01999_of_04320.hdf5 de158890fa6e8c75be2ab0b73da198f8 +part_02000_of_04320.hdf5 76aac5d9d44e4c7737b13eaedd31b516 +part_02001_of_04320.hdf5 de101ffbb78e2cfdb08b2237fd01426d +part_02002_of_04320.hdf5 c9e0fcae090935af2bdcdab140751402 +part_02003_of_04320.hdf5 ff3ba8b9f66b88c89cd05d8c6863c138 +part_02004_of_04320.hdf5 9a042363b46ef5b841ce8e71b42dad69 +part_02005_of_04320.hdf5 444d238d19dfde63cf0cccc739154a1e +part_02006_of_04320.hdf5 0b4e9e69362aebd8c1983c8c35775154 +part_02007_of_04320.hdf5 7afc6ab0c0904bc650b4c4595b1bf07d +part_02008_of_04320.hdf5 d76d52045b406bd3ba0b655186c75892 +part_02009_of_04320.hdf5 04d7aa127c3ab20b12b1f2c3a02143e2 +part_02010_of_04320.hdf5 e9def7023d1d28aa5681f8f51e7a86ea +part_02011_of_04320.hdf5 853c393b593947f319af784e513c40ff +part_02012_of_04320.hdf5 e1e86ff5e5477cc3f4a7ea02d49d45fd +part_02013_of_04320.hdf5 ffc6499f02a6a229cb8c41819c8f8953 +part_02014_of_04320.hdf5 61b74dd0778162c6d40486a94fed2d86 +part_02015_of_04320.hdf5 3f48ab38d85d0cbc640b07325cf51460 +part_02016_of_04320.hdf5 04ba7783baddb2dc31cd97139b8ed5e2 +part_02017_of_04320.hdf5 a2deff15bdc39c8f1f18b909fd0fed6f +part_02018_of_04320.hdf5 73b8e6779c33055fe9eeb388b6b851fc +part_02019_of_04320.hdf5 9c4d34dfbbf5782bbee8852fdd6cbacd +part_02020_of_04320.hdf5 384e370371e7215281bf8ce4d84317c6 +part_02021_of_04320.hdf5 0b4bf220599440d632a033319fa5d7e6 +part_02022_of_04320.hdf5 99b09d9ebc3a65b7289a33458bfdccad +part_02023_of_04320.hdf5 98fa0dc5ebc02f10d42ae48c51ef58b6 +part_02024_of_04320.hdf5 afc269135538f4389bb4d66867d76324 +part_02025_of_04320.hdf5 c5b0501ca881f85f18810aa3a4197e89 +part_02026_of_04320.hdf5 a0a48bb0906c4110a2039c2b166e788a +part_02027_of_04320.hdf5 9c2438f9ae724d45432363a77978391b +part_02028_of_04320.hdf5 
7a86031b589202129c14a080306e2383 +part_02029_of_04320.hdf5 c7ee65c4050bbac20af4fd163584b7f8 +part_02030_of_04320.hdf5 df1086ac132c3a3bb9dda3b54c3d0540 +part_02031_of_04320.hdf5 251a27cbcf02b340ac1c4766eae19dec +part_02032_of_04320.hdf5 9015d17158364767a0a38481b3a71e70 +part_02033_of_04320.hdf5 3e433193f8c7b4c671e38afb70c3fd01 +part_02034_of_04320.hdf5 32f13902f3a10ededb925ee6a301763e +part_02035_of_04320.hdf5 483e90da158247aebcf0a4cfc06560b0 +part_02036_of_04320.hdf5 a74b7071954df41ba8b0559398cb5cfd +part_02037_of_04320.hdf5 601356f7da8ff9a064aa998db33c805a +part_02038_of_04320.hdf5 1438e3ce793e54fa4deb1e6ccf936aa8 +part_02039_of_04320.hdf5 796c42de238609c9f10a956913cf4bfd +part_02040_of_04320.hdf5 064e140d4704441c35c883766656c5ac +part_02041_of_04320.hdf5 323c64b1eaa3df42da3f8af5e3b67245 +part_02042_of_04320.hdf5 6c31b7d3415a5497b0d58cc3fdd84445 +part_02043_of_04320.hdf5 88b11d44e12eb0dbb6bfd7636dde702f +part_02044_of_04320.hdf5 c8a70a6b6b8fcf98919de94edf8d841d +part_02045_of_04320.hdf5 3a665edbb6d9c67ddbdcdf695f286530 +part_02046_of_04320.hdf5 167d9e885b62f4edb2a9f478dc1fa4b6 +part_02047_of_04320.hdf5 80274ca1f9a763217ee08d4b2cbc2c71 +part_02048_of_04320.hdf5 5b26e544e752b647653979bf7dbdcf51 +part_02049_of_04320.hdf5 ef078912598daaa43a3c0aef84becb4d +part_02050_of_04320.hdf5 244dccfdc641e1dce4a9f1e62c1a7fc0 +part_02051_of_04320.hdf5 bba973b80a4caf817d1571d2871b65f7 +part_02052_of_04320.hdf5 5cfb33c5a5454adbc8017c965f796edc +part_02053_of_04320.hdf5 1e7565d3fd28a577e8e8ce2b8f52b280 +part_02054_of_04320.hdf5 d882ea04aaee89ff714f33b0dce2aa6c +part_02055_of_04320.hdf5 e7c91dcf5933bc1aa51a3d38c539fdae +part_02056_of_04320.hdf5 0433fc8eac3a32c917762a727263a208 +part_02057_of_04320.hdf5 34e945631a03fdf308945a974c24241a +part_02058_of_04320.hdf5 026cb6331d8e00dbd9c2daeb10be3440 +part_02059_of_04320.hdf5 c67ae07d9a968e42a745f0a4ad8d2174 +part_02060_of_04320.hdf5 8caab2cd003d088fbd1d6b763c7efb66 +part_02061_of_04320.hdf5 c1b5b335d47a7831d8c15c05afd278ae +part_02062_of_04320.hdf5 6a7b1811de76ea6f926276f95895fd45 +part_02063_of_04320.hdf5 6f73c6779a35a759b4934dc86a7e8413 +part_02064_of_04320.hdf5 23a6504536ce98e32cc38a86cad0970d +part_02065_of_04320.hdf5 e17f4dfb1f031f38099a44572f0afe7c +part_02066_of_04320.hdf5 7296c9b1d8814960a2ccde2a1c23846f +part_02067_of_04320.hdf5 6994bcdf0a88a8516789fd55fbe9a043 +part_02068_of_04320.hdf5 2369b66028471344307558e9dc069914 +part_02069_of_04320.hdf5 1055aa25239328ec70c795ab332b2e8e +part_02070_of_04320.hdf5 bb7b7a796018e7ec2b91f495258f62ad +part_02071_of_04320.hdf5 721c6b0ccefa7b3cd7c15b78ed66c659 +part_02072_of_04320.hdf5 ee3788d736f2699df502e92b896d318c +part_02073_of_04320.hdf5 cb43d0f38164fe3baf211467290da225 +part_02074_of_04320.hdf5 01b31badfe40ed7a3733c35a11aa6034 +part_02075_of_04320.hdf5 e6350a1378b085ed699235fb851acb3c +part_02076_of_04320.hdf5 6088dcbaf33de3f0b0651d18600cdf51 +part_02077_of_04320.hdf5 a75c9a9ecaff412ea1590779311cdc95 +part_02078_of_04320.hdf5 e4a957ce2167a9040ecf74379fd06224 +part_02079_of_04320.hdf5 37f0608578435151fd9637683cb236b7 +part_02080_of_04320.hdf5 96c72b827b8701a058f8d7722998f132 +part_02081_of_04320.hdf5 ee1221aae7c552a69117a9314c1a770d +part_02082_of_04320.hdf5 60ed91247463886968163759118ce5fc +part_02083_of_04320.hdf5 d6a8ddebddea21e894a6437201564d15 +part_02084_of_04320.hdf5 7ab3f041df895fa463c5449b1eb840d4 +part_02085_of_04320.hdf5 b69987e7bc42c7c57487e03b87f78c4d +part_02086_of_04320.hdf5 32d7572e9d48a308f9990474a9a996b6 +part_02087_of_04320.hdf5 92ba51dc8cde2d180781d343cb1b2182 +part_02088_of_04320.hdf5 
a4140415be708b0df1dd5218bd1e021b +part_02089_of_04320.hdf5 375e7c4f7f58c6fd9200baa1c75a469a +part_02090_of_04320.hdf5 89c06db38f79328309bc4807c4217c4e +part_02091_of_04320.hdf5 0a181d42b618e7cc7e674d0bdd2d890d +part_02092_of_04320.hdf5 4c7d10464f4a68c20ccce2d577924560 +part_02093_of_04320.hdf5 0d7804219e12f8f11d417cda9ece7c9d +part_02094_of_04320.hdf5 709ae007d89d54386587dd2bfd045bc4 +part_02095_of_04320.hdf5 f091cbf1f0002697e4fd9c6efaba08be +part_02096_of_04320.hdf5 b5a3cb85b07fa4d438206291b8b30b77 +part_02097_of_04320.hdf5 c6057115cbf9b45afa21202f35548270 +part_02098_of_04320.hdf5 835ccd793239f96f25d8d05b9dd6cc7b +part_02099_of_04320.hdf5 77a3aab5bdcd159ba4ee0765341e7580 +part_02100_of_04320.hdf5 cc8785bc3f76f60dd16e0815a67bc604 +part_02101_of_04320.hdf5 8440e479cbc84341a0addbec7dbb1885 +part_02102_of_04320.hdf5 9593ebcee4a0abe553fd8e47bac7e663 +part_02103_of_04320.hdf5 7995be9bea0fe854e4c0265fd9f1b400 +part_02104_of_04320.hdf5 ab0fc2d0c745f907638412a389d489fd +part_02105_of_04320.hdf5 a988b23b69401db5e348387909c04318 +part_02106_of_04320.hdf5 ed5017f26b07e92b2c7aaf3bd132a727 +part_02107_of_04320.hdf5 3df8ca14013503cb9666dfa61b2da3aa +part_02108_of_04320.hdf5 a3f21c0fc5f5b60238e3307f4103f69c +part_02109_of_04320.hdf5 1141087f485bac0b1908f061b9464616 +part_02110_of_04320.hdf5 375e1f7de8df9768d71b9ef232289e09 +part_02111_of_04320.hdf5 8ac9827359aca6621f6d260dc20b4c15 +part_02112_of_04320.hdf5 1215bbfba267f2cf7da2e6e8519642b7 +part_02113_of_04320.hdf5 e518e818f543633441844077cae2540d +part_02114_of_04320.hdf5 e2a3d56bf8d5faa265d7e77ec38475b5 +part_02115_of_04320.hdf5 21c20fd696cdb5d3c30bd4a4fd31428a +part_02116_of_04320.hdf5 8abc6175c305d04fd92f5a4288e285ef +part_02117_of_04320.hdf5 100a145eecc03c369d60e6ce72112bef +part_02118_of_04320.hdf5 2c8e63987d62a725062d1f30b0d63777 +part_02119_of_04320.hdf5 1fd5e6fd0732bd1b063771b49a0aa1d6 +part_02120_of_04320.hdf5 faa2bef348cadde865ef9eadb996d885 +part_02121_of_04320.hdf5 a5cd85b3bff9fe04cd76eab727c1a750 +part_02122_of_04320.hdf5 adcc9e985e7a05fa4b9d0881d1b975c8 +part_02123_of_04320.hdf5 21886ea99ffdc86c6f7682fea5545eec +part_02124_of_04320.hdf5 d55b247e96cf6b7e491ee64cff267602 +part_02125_of_04320.hdf5 bcfd6e1df8b600559522450a493bac5c +part_02126_of_04320.hdf5 5dad146759a28f09d580ba52ed3de9b0 +part_02127_of_04320.hdf5 e63c637c83921f40e41de724918f0be9 +part_02128_of_04320.hdf5 b96e40f7c4330cfc492eb22a9ab9cbe9 +part_02129_of_04320.hdf5 74688e03353c81264cb3107770c45cf8 +part_02130_of_04320.hdf5 12f03016d5b0823623e175dab8a88e61 +part_02131_of_04320.hdf5 0c0c2bbbd378e20706ae7bb8b063a98d +part_02132_of_04320.hdf5 b9ac133d31634088b3a6ec3ede229390 +part_02133_of_04320.hdf5 499b46de044efd271ec88c857ba1d920 +part_02134_of_04320.hdf5 b5ca14e53e6c00bf64724d86bf0c1704 +part_02135_of_04320.hdf5 1cc85794d58e468fba94a601dd11eaef +part_02136_of_04320.hdf5 50d69f20e0e18efe1ed03667388564b4 +part_02137_of_04320.hdf5 e62885a68ecd7b4a57f7c0c7bffccc75 +part_02138_of_04320.hdf5 c1fa260362d12946d8a9f817a682051e +part_02139_of_04320.hdf5 b4c17385c221134f65177b7595ae901d +part_02140_of_04320.hdf5 bb8e24debc2a2f446f046d4ad7d87077 +part_02141_of_04320.hdf5 011c87f663b446be9a7ec9c868a0ae68 +part_02142_of_04320.hdf5 0e82ad99c548f014eac6691f3bf7d868 +part_02143_of_04320.hdf5 eb8366b6ae3fc5d7f85556c406efde53 +part_02144_of_04320.hdf5 804875df8f76383e2d7707741a56667b +part_02145_of_04320.hdf5 869a432eabed268f5ac2c48111406576 +part_02146_of_04320.hdf5 3cce59b91684dcec94fc017b50736d2d +part_02147_of_04320.hdf5 06c7dd8952ea3677cc0ee5eb5d0e8744 +part_02148_of_04320.hdf5 
d76aa6801a0c9716bd134670528257b8 +part_02149_of_04320.hdf5 108361278136fe06c2a623b323955c77 +part_02150_of_04320.hdf5 24aec13f051c9d886efea1c3db963769 +part_02151_of_04320.hdf5 21e1bc557cf2b9e9292d1ab3a7795714 +part_02152_of_04320.hdf5 34ee62cfb903fd1d97dcfc1d86fc8af0 +part_02153_of_04320.hdf5 787751028ad904e181654be2ee358bc8 +part_02154_of_04320.hdf5 be6001c9882707b84dd77b4effd8efb2 +part_02155_of_04320.hdf5 fe61b68ab6c8d49d0448be14fef0a3a3 +part_02156_of_04320.hdf5 708495db6560c6b5be322c952152ecca +part_02157_of_04320.hdf5 f7184c5769f85923c9e547832016c2c5 +part_02158_of_04320.hdf5 8d2bac1e9465651e68ab124af76a1bc8 +part_02159_of_04320.hdf5 f0746de9bece83f2ea84ff7e5f08182e +part_02160_of_04320.hdf5 36e9a9fc6c6ef84461b499ac36141f0c +part_02161_of_04320.hdf5 1dae4847ab866f74852063622f58dc50 +part_02162_of_04320.hdf5 fb21fb59b775d3d76b5ff9055ff75685 +part_02163_of_04320.hdf5 3a3e681f8291c32f2aa1483f282d6a3d +part_02164_of_04320.hdf5 daf875d07190c334727f4a2c24eca4cd +part_02165_of_04320.hdf5 28f720ee00bc1f632cd8edb27f7c1dfc +part_02166_of_04320.hdf5 5dd949fe6fa83e43e946287793f91e0e +part_02167_of_04320.hdf5 5ff87a63e17d4e8c8a1e324c9a2d31e6 +part_02168_of_04320.hdf5 38c5823f90db070ae97422cbcb013e5b +part_02169_of_04320.hdf5 3424fea77f91fbb8f3b3157fe56e840c +part_02170_of_04320.hdf5 4068717499156c602f8a61d7162f14f2 +part_02171_of_04320.hdf5 25ff168349d030766bb7ce3c0ec8ba84 +part_02172_of_04320.hdf5 4293b537fe08f137484f36b40117fa54 +part_02173_of_04320.hdf5 fcb9a201b9e898c9b72f10f44948a4a2 +part_02174_of_04320.hdf5 b143944cfe4ba58a438c75adae358189 +part_02175_of_04320.hdf5 a65aec2cdef241264aff104247c4dd2c +part_02176_of_04320.hdf5 338bcb78d335fe290913f22120edf66b +part_02177_of_04320.hdf5 c2a699a00ca22c5ecdaa2aa55f0de779 +part_02178_of_04320.hdf5 176a5919bd9b8fdc6f9113bd00102300 +part_02179_of_04320.hdf5 30d71e1e5221fb93b30670467e7e053b +part_02180_of_04320.hdf5 496f89c120751cb71535eda202c9c4b9 +part_02181_of_04320.hdf5 e9de0a21a72f129ffc51294ee88eac67 +part_02182_of_04320.hdf5 2d320ae7df0dca4988c40a854bf74bfd +part_02183_of_04320.hdf5 c823e861571b13fc2fa2cfcb8c6eb243 +part_02184_of_04320.hdf5 8b96a2ce61e87fd816500da9020f5c34 +part_02185_of_04320.hdf5 39ec93c0f9cf8ee7e82ef8e5880aa93c +part_02186_of_04320.hdf5 80d09c430751c44c557c3f8d3359503c +part_02187_of_04320.hdf5 0431f009fb69db4aff032746336430d9 +part_02188_of_04320.hdf5 e763208019dfb831336bce5a51db6999 +part_02189_of_04320.hdf5 db7163c4fb682ed1903021c9c79c6bd0 +part_02190_of_04320.hdf5 cc5e9f72ee880ef782532d9e961b5418 +part_02191_of_04320.hdf5 bacbd9a67baf1b2c37439a2c51be8e64 +part_02192_of_04320.hdf5 3882a5c4177b56116984e5fe8b0b34a3 +part_02193_of_04320.hdf5 690970131020799bc26dffc9b5ec65d6 +part_02194_of_04320.hdf5 271e87e06819534ca1f2550cd0d3f5da +part_02195_of_04320.hdf5 e1a33bd8ae7acad3bcce4ad6db365288 +part_02196_of_04320.hdf5 452562ed7fd7371b7bc96bb22f4f31a3 +part_02197_of_04320.hdf5 2e4259b76ef2b4c62ce481b4d38fc145 +part_02198_of_04320.hdf5 39e969f5873317b88978921f1eab419d +part_02199_of_04320.hdf5 22776fa618307292b7e79c61006db2e9 +part_02200_of_04320.hdf5 30ca3644f26c99710cb9dcf33c2b3638 +part_02201_of_04320.hdf5 fdb25b0fd198a041a63d1fcd4e3c1e12 +part_02202_of_04320.hdf5 eb3ba00936caa0d3d6b61af48d67ebdf +part_02203_of_04320.hdf5 c6d679c1baa3eac867a47c4f99835b8f +part_02204_of_04320.hdf5 55d4c2db2652a93a8bf814b0501ac00e +part_02205_of_04320.hdf5 3264d75eab8293a43d9a5e748885271b +part_02206_of_04320.hdf5 1b9ca0e81b442c9116ecc965471ca52a +part_02207_of_04320.hdf5 57b71d8988b8536c1655b6fafbe3e08a +part_02208_of_04320.hdf5 
79f70d1d633c631dacbaef6ae26bef3a +part_02209_of_04320.hdf5 867a726597229c3cf0a76d12c6462adf +part_02210_of_04320.hdf5 2b2890ae005b11792fe2c6236d3577f6 +part_02211_of_04320.hdf5 e506c00c3c45f49e7bcfaec79d6fd99c +part_02212_of_04320.hdf5 f4d85b769c691714f8bb699925054394 +part_02213_of_04320.hdf5 5d48d98925c350dc9d96f7f060110214 +part_02214_of_04320.hdf5 9a2220d5b5ab3e44afb58e97efda8487 +part_02215_of_04320.hdf5 175a73d126311ac906a18f9e9a3252e8 +part_02216_of_04320.hdf5 378c378874cadb9d44de7b9e6c1ca739 +part_02217_of_04320.hdf5 48340e7e636827e633a827b3a7346321 +part_02218_of_04320.hdf5 c8a02832bdfce8c78167188482f3bbaa +part_02219_of_04320.hdf5 68f14a0dd263c80c9f3274e92ea4e38d +part_02220_of_04320.hdf5 d7532d363d0fbc6169e61b895811bd90 +part_02221_of_04320.hdf5 9fd7866bac01905d9e8e955d21656a35 +part_02222_of_04320.hdf5 850d08918f35c83e9faa38854f526e15 +part_02223_of_04320.hdf5 2ca4d319e80712704ad06c6878b8ef97 +part_02224_of_04320.hdf5 7fc30b3d6cccf2c363b473d8b4d969bf +part_02225_of_04320.hdf5 c35a852aff21442402e4dedbbbf33e31 +part_02226_of_04320.hdf5 5aa626f0c09dbd71148c17b5fb96c787 +part_02227_of_04320.hdf5 d4acec0b1194d9406b1a74a7153de982 +part_02228_of_04320.hdf5 9477ffdc8e5bf8a7c4b8c702cb9fd60f +part_02229_of_04320.hdf5 8790cdda1084180d694fe5358da1300d +part_02230_of_04320.hdf5 75e0910f31dfc8c169dc68a7efdbecee +part_02231_of_04320.hdf5 a26e7a38a7472b7ebd180c1a2f824cf8 +part_02232_of_04320.hdf5 40fc0029860a00f92c641487006be5c1 +part_02233_of_04320.hdf5 67d9d759838cd0d89b25a08afe71feb5 +part_02234_of_04320.hdf5 d7403bfd037200dd2973f0184fc5c381 +part_02235_of_04320.hdf5 9eb20842ceca246a11b209e031d90d19 +part_02236_of_04320.hdf5 2d181dee048ef5496a1390659af02657 +part_02237_of_04320.hdf5 8ac589fbff3c085294a60c4aa5ceac00 +part_02238_of_04320.hdf5 0d01655692c6092723e2e6f4666f8f54 +part_02239_of_04320.hdf5 c0bf1d6fac0712dcefb1907720548409 +part_02240_of_04320.hdf5 a3cdbafdec0e885b73db0866bc0a2093 +part_02241_of_04320.hdf5 be22058570d43405f921e9398dd8d1ec +part_02242_of_04320.hdf5 f34e3d7514a47f2b68e343871135fc78 +part_02243_of_04320.hdf5 06c3fe2041bdfc21daab59cf8cdc097b +part_02244_of_04320.hdf5 7cecd3c0360255abfc126eedf70294d7 +part_02245_of_04320.hdf5 e5806b0ce8c87635f543cae3a4a7a141 +part_02246_of_04320.hdf5 2912dbe4cda02d93a76d68c16bfcf954 +part_02247_of_04320.hdf5 db0bc5719656d19ac7833f1e10fd8409 +part_02248_of_04320.hdf5 79fba5575f2fe5c30b88952289c16179 +part_02249_of_04320.hdf5 c1f7d72f4022bbca6b562b57c3453654 +part_02250_of_04320.hdf5 0399592a73e98f512f642d5dc7a909f4 +part_02251_of_04320.hdf5 792d2ad699dc97439eb7c1b6a4e1bacb +part_02252_of_04320.hdf5 df2e999ea5f59ab39c4fe733f21c5244 +part_02253_of_04320.hdf5 d0da07be0eef6c36adaf6c2c7d916b9c +part_02254_of_04320.hdf5 798b99c453a065d7762412e8d4acb894 +part_02255_of_04320.hdf5 db128c183869e0bb93338ac7918765aa +part_02256_of_04320.hdf5 10e5f2821a8c6472e52835a7bd2fd007 +part_02257_of_04320.hdf5 c9c79c9baaf4a3a8c273760ae44c84ab +part_02258_of_04320.hdf5 db404c91085508033b75ca9593f5da34 +part_02259_of_04320.hdf5 33f41879d82596b9367152016327bfdf +part_02260_of_04320.hdf5 9ddf2ce02bd6e53237c4a8891238af75 +part_02261_of_04320.hdf5 13073fd2a3870831e116b85d39890b76 +part_02262_of_04320.hdf5 0528c325bc0e338761c71ed21739e691 +part_02263_of_04320.hdf5 717dab4e7e6016a1a3bbbf7e018de44c +part_02264_of_04320.hdf5 ebf5fa80d48fdfca07c9bf70d9703b05 +part_02265_of_04320.hdf5 c68ee6749d341bee92505e4049958abf +part_02266_of_04320.hdf5 763afc85390bc9c74575a1c4d95da5e1 +part_02267_of_04320.hdf5 ca3b20687191b184a683f34d9a5da5be +part_02268_of_04320.hdf5 
91e6d5550103d3f160f4dfa81ae3e34f +part_02269_of_04320.hdf5 f39c09b8827748873ae2aa6e64606165 +part_02270_of_04320.hdf5 bbc4fa01090eced3dc0c16825afb1b7a +part_02271_of_04320.hdf5 ad8a81d7b86dcdf2a95f1691108c1e39 +part_02272_of_04320.hdf5 6ea1bc6d07d860690adc794cdb0d58b7 +part_02273_of_04320.hdf5 32dbda0982cf9bec31c51c8483a234a0 +part_02274_of_04320.hdf5 c3cbfbc3342b1e027528b0c2eb9c404b +part_02275_of_04320.hdf5 ba781768f0edc6b7c74f4f6b06c48c14 +part_02276_of_04320.hdf5 b07c4939dc66e03b2d1b584d2110b483 +part_02277_of_04320.hdf5 76bec46ed030c226196f1fd4bc0af2c0 +part_02278_of_04320.hdf5 0cc180a31e4f37f77461cecf8dc52831 +part_02279_of_04320.hdf5 d58d89b32fca98a7d9fc03c1226da670 +part_02280_of_04320.hdf5 aafac64764003808b5f8469b072e04e4 +part_02281_of_04320.hdf5 0909134df98cdd1c5b877c2317049310 +part_02282_of_04320.hdf5 2d56495b5aa09ae33725c284acf2da8c +part_02283_of_04320.hdf5 784b189f320c3ca29b8b97dd78102fff +part_02284_of_04320.hdf5 258fd4f149e7682b1e73919a76cd1766 +part_02285_of_04320.hdf5 fb8c0877ddbf16bb9c99a71ca64c0aed +part_02286_of_04320.hdf5 507b3fe9d3b67964709aa21c30871ec4 +part_02287_of_04320.hdf5 fbecf7f30bb673df05a21bd1b865de4b +part_02288_of_04320.hdf5 f1f93ad5933a32d37f35dec1273cea4e +part_02289_of_04320.hdf5 c86763e7b510c5f7beba53e965f6bf64 +part_02290_of_04320.hdf5 44ded8e43b4660e4e5cf8cccd2ca59cd +part_02291_of_04320.hdf5 1953ec29241b2dc7154f1ff0c538a4d1 +part_02292_of_04320.hdf5 e477fd92b3cd557b863953e39265fe09 +part_02293_of_04320.hdf5 a2c4e953c9db0192c9cbd6b3406d5bca +part_02294_of_04320.hdf5 6ae9fddc80c7990d7f83b7a95a27bb47 +part_02295_of_04320.hdf5 a147b71198876dcb60ab7a0eff69f436 +part_02296_of_04320.hdf5 2e686f8b2077673b62d48bc3426eb409 +part_02297_of_04320.hdf5 7e1dee09232b2387b2d3146b646720b3 +part_02298_of_04320.hdf5 7e84f1a3ecb3e508dcff53cb501c7e42 +part_02299_of_04320.hdf5 880b9f6a9b62fe7ea4d4a2408111f9e3 +part_02300_of_04320.hdf5 369d2fa0d3bd767cdaf59bcc0d3ea1a3 +part_02301_of_04320.hdf5 84678dc0f593630d0356ab76bfb5c505 +part_02302_of_04320.hdf5 25c1bfcbc432dd1c0fe9a00d62b88c0e +part_02303_of_04320.hdf5 478fe28e514262b78b9798d8c6a7c42e +part_02304_of_04320.hdf5 b23cbc3d526cbbce1d0451d8357f4953 +part_02305_of_04320.hdf5 8810d8683f8d325aed01bc2cb2cca90b +part_02306_of_04320.hdf5 066c67aff9c5f6d918e580070288d5c5 +part_02307_of_04320.hdf5 d66776103a8a41a9b5b1efed1efae49d +part_02308_of_04320.hdf5 e071c5b73a3476b904469d8dd1c3da1d +part_02309_of_04320.hdf5 48ebca04d6f59d71e69f2c51bc713b1f +part_02310_of_04320.hdf5 8d3b0d86d9f832337e24126d7e60d8aa +part_02311_of_04320.hdf5 cd8643b1319de76903cda24f3cb472c0 +part_02312_of_04320.hdf5 e0d5c0069db1d166e19685a58801a9cb +part_02313_of_04320.hdf5 2033c5b9cc021512b33fd3b24fb1f442 +part_02314_of_04320.hdf5 70a916f52eafca43269d012cb5bafbc8 +part_02315_of_04320.hdf5 e67ef839391d5f553ec68a929b82d28d +part_02316_of_04320.hdf5 b83af2b141f9e3acaac862df0102274e +part_02317_of_04320.hdf5 12465ef6cf1f81adbbae1145ae5ef0b3 +part_02318_of_04320.hdf5 d43a809d5c72fd4802ce87ddc166ba2b +part_02319_of_04320.hdf5 b505a1f77db00d3ee90a9286f3171672 +part_02320_of_04320.hdf5 94f5c890aba94fce1d78bc38e38b6ca7 +part_02321_of_04320.hdf5 dacb14807db150d614ed11cbe18902bc +part_02322_of_04320.hdf5 8bf876cdeff05dac7e601257b178f4d6 +part_02323_of_04320.hdf5 17eaea903ac089d04a231f22b8b3f086 +part_02324_of_04320.hdf5 d73fbb9e97447fec22aa8d3851baeeee +part_02325_of_04320.hdf5 49772fe37a9790e91afbffa93ec0a499 +part_02326_of_04320.hdf5 179c126cb8049efb1c143c20778d40b8 +part_02327_of_04320.hdf5 8a7608e83ebb2a1eeb15296b10dc5782 +part_02328_of_04320.hdf5 
2098a32c5be7ecb070aadf1cd900957f +part_02329_of_04320.hdf5 54b78e1b142045c487b6394c62f292a7 +part_02330_of_04320.hdf5 afacd1c544782d9e3c1c1256afac4e2e +part_02331_of_04320.hdf5 d063b481cd2f297d669bc93a78ab664a +part_02332_of_04320.hdf5 c0e0b78a068d9fba74e29178236c47b6 +part_02333_of_04320.hdf5 e231494243c9f333e30fbebb60b1ccd0 +part_02334_of_04320.hdf5 0e4b6874f0b33bd0b4508bced8560724 +part_02335_of_04320.hdf5 ee445e300bfbb26a0af85ff357535b7c +part_02336_of_04320.hdf5 3dfa81e7d60a176210123d97bc0c9216 +part_02337_of_04320.hdf5 622412ea5dd2687ac5bc6f1536734905 +part_02338_of_04320.hdf5 f3b36977c465c60849db89e95a5af8c2 +part_02339_of_04320.hdf5 39f55582bf674c76abe92d69eb0ca282 +part_02340_of_04320.hdf5 34949991667f94a2236629896d52cd35 +part_02341_of_04320.hdf5 c13cb2d3cd4f52ed5f98fbee5f5c96b8 +part_02342_of_04320.hdf5 63526d948cd9268162d5061c6180689f +part_02343_of_04320.hdf5 f175ca83f02342d152849e29e2482f78 +part_02344_of_04320.hdf5 23fbd280c52010a001510623389501ea +part_02345_of_04320.hdf5 9cab064f94c1319e6d551797a8189616 +part_02346_of_04320.hdf5 bd6530aec2b4d990eb9f8e241553cf7e +part_02347_of_04320.hdf5 ea6ebc2fdb6ccff43053c076a23756a0 +part_02348_of_04320.hdf5 eb4ae8d94511205edebbe252def66cf4 +part_02349_of_04320.hdf5 8b95ab227700d592715d537bf87e1896 +part_02350_of_04320.hdf5 712add3d4eb873da959dc4b2e579db47 +part_02351_of_04320.hdf5 73c054f8ea657694066248c27fb9ec62 +part_02352_of_04320.hdf5 eac586ea2a6268c58b16425072589d64 +part_02353_of_04320.hdf5 e0cf8e2e48a3aca9de36f272ddc895af +part_02354_of_04320.hdf5 f5e1c095691e84761f3a3d6fdb848e48 +part_02355_of_04320.hdf5 5dafd39914f0df02f3bf2be30f0352c8 +part_02356_of_04320.hdf5 e894f4f045c9ba5ad4ee967f23ed4d88 +part_02357_of_04320.hdf5 26fbadef3fbdc78f3c4744fed7a3255b +part_02358_of_04320.hdf5 d1a2109e8bcaf4ed471205b2f53c9f53 +part_02359_of_04320.hdf5 5c142d0aedf20ecd50591282d75dcd91 +part_02360_of_04320.hdf5 0a19d83160134505cf52763a7cbc1ac3 +part_02361_of_04320.hdf5 0e1d55c6088189931d33c43e975c62b2 +part_02362_of_04320.hdf5 af48d610ce188ca2a1093f85135fe318 +part_02363_of_04320.hdf5 637d3f253b23806fcd2467ec37cb5ca7 +part_02364_of_04320.hdf5 aac684dc8a95645bde7d08143b53178c +part_02365_of_04320.hdf5 b9c4276b17154374b655fa53c764708e +part_02366_of_04320.hdf5 845764797ed1b2f14cf3227645ad3f21 +part_02367_of_04320.hdf5 c03d1e310a9a21eb28ddfcb9c4b0bb81 +part_02368_of_04320.hdf5 6ae767243579076225dff4fc2d2d56da +part_02369_of_04320.hdf5 3c343a2a46a92d077d21efea0fa99a44 +part_02370_of_04320.hdf5 3d0c590495b3d7d39544c2fe82bdf850 +part_02371_of_04320.hdf5 763807706e0cb44c5489f43caed0bd7c +part_02372_of_04320.hdf5 db0b1b28594e3a481b459ec03df26e29 +part_02373_of_04320.hdf5 15979ce517dbcbe686e84cfed7d212b3 +part_02374_of_04320.hdf5 14329c3d3ee7209d1de51f1fa49db68c +part_02375_of_04320.hdf5 226c79d97aa85c330671ae1fb9ec2355 +part_02376_of_04320.hdf5 d21f0647a8c15c7da5bb9a4782b8b333 +part_02377_of_04320.hdf5 9123d40440eea3a582ad9d6acc173acb +part_02378_of_04320.hdf5 603bc50360f6e906ebd5bc46745fb6f9 +part_02379_of_04320.hdf5 639f4142bff2236af7a958f0fafe6eda +part_02380_of_04320.hdf5 2ef68f66efe6d6c070bcb863079adf12 +part_02381_of_04320.hdf5 d89ef01d42fbf0ded79fa0fc960eafa6 +part_02382_of_04320.hdf5 c633a2320c81a7d7c7c4c1973c97ee24 +part_02383_of_04320.hdf5 f6f2bdc29f24a42aa51c5efc11abcc86 +part_02384_of_04320.hdf5 dbb874a2ed93af8666e0fc87f4724869 +part_02385_of_04320.hdf5 8666aba075e87d01802206f804478204 +part_02386_of_04320.hdf5 3d88866a2cc4f8e20638644e6d6402c5 +part_02387_of_04320.hdf5 fe329edeadc5d363bb671d0d628175c0 +part_02388_of_04320.hdf5 
013780866758161419f92dec6df34720 +part_02389_of_04320.hdf5 82e7709f7e66180bc37c81bce5aea4f4 +part_02390_of_04320.hdf5 9c33f52b24ed4f139b40e579a2883b1e +part_02391_of_04320.hdf5 712eb79aa5e617ff735917554b093be0 +part_02392_of_04320.hdf5 101a338c432ad5ef31d1e1787e7cdefc +part_02393_of_04320.hdf5 8fd6d0d8013ace43022d8a6a5f3fd473 +part_02394_of_04320.hdf5 311a44aa32b5617c4b3d1c2cae47bd7a +part_02395_of_04320.hdf5 0497015e87cd40db318d51497d8a261f +part_02396_of_04320.hdf5 c8a46205a80bb8e94561846e5bda4a15 +part_02397_of_04320.hdf5 23698f9052defe1b8e1dd5d398702918 +part_02398_of_04320.hdf5 d68c6d2d9bd17719d5cbd2a4d77ee7ed +part_02399_of_04320.hdf5 29e92065ebdb0a5a61944cbc47c18932 +part_02400_of_04320.hdf5 bb2d31ba3d05186de74130dbaa5d6557 +part_02401_of_04320.hdf5 874b4a2b07993c3446688ca1042d2964 +part_02402_of_04320.hdf5 867d50ce2ca6adc80facec76e181c2f8 +part_02403_of_04320.hdf5 6176242b8761d119f0a0ff7b24a28614 +part_02404_of_04320.hdf5 99245dc9d5754e1be9e581857813568c +part_02405_of_04320.hdf5 9476966a597c25c8c410f5b60e2a9c2f +part_02406_of_04320.hdf5 f5e7b2f862d7781d48209ac143f33609 +part_02407_of_04320.hdf5 286622e9611a98abfc13c61460993048 +part_02408_of_04320.hdf5 3ff753b3142a56c12eca429ccd1c6878 +part_02409_of_04320.hdf5 71f5ae73517e5fac3580c33ce890076c +part_02410_of_04320.hdf5 046550bccb41d7ff94a4417036656247 +part_02411_of_04320.hdf5 909a7fa08bfac550170aea15e8f9c2c3 +part_02412_of_04320.hdf5 fbb57944ad8c52dc329d0e4dbadf8ba9 +part_02413_of_04320.hdf5 79865df7bfca3abff64cc39727beb257 +part_02414_of_04320.hdf5 fdcbc10b1102edebc9b696cefb4b8922 +part_02415_of_04320.hdf5 3e47a00c16fad5501b57c8013c5a5dd8 +part_02416_of_04320.hdf5 c6d895c5d2de333424a7dba52098f27d +part_02417_of_04320.hdf5 21851d63342d767b833174ec16d8d4f4 +part_02418_of_04320.hdf5 dcfcb565ffa2b894236e2a3c44e3321a +part_02419_of_04320.hdf5 01650a0007daa651556dbbf948caddf4 +part_02420_of_04320.hdf5 bc7f60844c7cfb99807d73d17577080f +part_02421_of_04320.hdf5 3d360399cf31dc92ab43aa4649ffea20 +part_02422_of_04320.hdf5 794d789d4a0b32e2372a69994782e3b7 +part_02423_of_04320.hdf5 921fb97e9e618d8bfb4c8e2d1400bdb1 +part_02424_of_04320.hdf5 cfef7da4a125dc8c8a23d197a55d6746 +part_02425_of_04320.hdf5 54c30f9140f6f16ab4f20ad1f019dc3a +part_02426_of_04320.hdf5 63586e22851e996a28eee4753a8980b2 +part_02427_of_04320.hdf5 9e4188cdabf9d6a6c11bc04df658de37 +part_02428_of_04320.hdf5 548badfa1b65232c5195dfde62201ae1 +part_02429_of_04320.hdf5 761a0653cf9d4585d1c20517a32e7f27 +part_02430_of_04320.hdf5 9d488cf9b05c0b26677d8063c3a7e02d +part_02431_of_04320.hdf5 c50fbe061826c63e9d10ca695c955148 +part_02432_of_04320.hdf5 6df9e085c95fe0a9545df3fd020fa57a +part_02433_of_04320.hdf5 38b7ac3c2f2c4e45d78ec5b87bb9b99e +part_02434_of_04320.hdf5 f398872e4df3fde3c1eadd14b0ffd031 +part_02435_of_04320.hdf5 d493d01708dd24f6242a0a1453a4129b +part_02436_of_04320.hdf5 1e95a3ca1d66f785063fb1d5833043ac +part_02437_of_04320.hdf5 5ed5d9e10162803489399e3d331fc8eb +part_02438_of_04320.hdf5 f5b927b5901bacbe27d0335599d9eae0 +part_02439_of_04320.hdf5 1a97c5861e19beb4e75b753abf9101e2 +part_02440_of_04320.hdf5 144141ae3b07a99327b686aae3e275b2 +part_02441_of_04320.hdf5 5dcad80891b599ac5d1980aade5a5b1e +part_02442_of_04320.hdf5 f9db640d45d3a9c0a23c0d65e2b7c6ca +part_02443_of_04320.hdf5 73ee3364c06a6c8369bc4aa9986c429c +part_02444_of_04320.hdf5 8b93bc07735fbfd447554f16e730fffd +part_02445_of_04320.hdf5 7ad9ef8f55ee4ca2a6a12b84cc777b63 +part_02446_of_04320.hdf5 166ec052a459316baee36fefa28537b4 +part_02447_of_04320.hdf5 6c86af2f22a980bd61a0edf18ed6ec66 +part_02448_of_04320.hdf5 
248ec8475949014bb926eff1425e7079 +part_02449_of_04320.hdf5 7607a08710f4544400e86a77d0e32fc9 +part_02450_of_04320.hdf5 2d003e66b0117c0de80d2344c3b8edb9 +part_02451_of_04320.hdf5 14b5bdac7d2f40277aca180649ac8f17 +part_02452_of_04320.hdf5 60335a1a8a81ffa8c345a7b8d6156a7b +part_02453_of_04320.hdf5 07c51863414494dafb14f2849fae9b1a +part_02454_of_04320.hdf5 33e8d0341dd281d0aeeb42a3db3d2c11 +part_02455_of_04320.hdf5 08882fc47cd4e2403b6712e98b24d29c +part_02456_of_04320.hdf5 652d8c0b37e2fda38671872426696726 +part_02457_of_04320.hdf5 ff9e8078ca38d4f8205615b7ab0ff0f5 +part_02458_of_04320.hdf5 46bf9590a8bc74169b224a3f356ab3c2 +part_02459_of_04320.hdf5 3c795022a9d3312b661cb8f28459b0cd +part_02460_of_04320.hdf5 5766a6249267ec82a4ddf126e386cf13 +part_02461_of_04320.hdf5 391d50dfb4adbae86e8180954ffe19a7 +part_02462_of_04320.hdf5 8a3d7bc90322f5306fad79c11d41f83e +part_02463_of_04320.hdf5 0db110f1c29a5f7a4c2395c4b034093d +part_02464_of_04320.hdf5 b950961ae53389f0b47ef73be30b1fd1 +part_02465_of_04320.hdf5 4c9aee8bd13e9e43005a7e09dac250ad +part_02466_of_04320.hdf5 be75484836d37278638ab9cc8ef44b93 +part_02467_of_04320.hdf5 1f292c31419b1a1a78cabdbe9eadaa05 +part_02468_of_04320.hdf5 5d3aca7784f9a345156c77e6a5fc20ba +part_02469_of_04320.hdf5 5afcd3a4b5a6f4d5fd80ccccccf68c45 +part_02470_of_04320.hdf5 572edf179687ed207e20edf1a102b1f8 +part_02471_of_04320.hdf5 e1127c3d149e83aa4b9ce35c6a4eaee8 +part_02472_of_04320.hdf5 fabc382ac01dbf5095f48af1a09bce66 +part_02473_of_04320.hdf5 b637c725158b37a44d6b9516c17b841c +part_02474_of_04320.hdf5 f3af7cc4ef2fbcad381a6f08cf1b3507 +part_02475_of_04320.hdf5 18e638e89e83587af9537af00dc63670 +part_02476_of_04320.hdf5 a5878ca49db864bc3f4f1a7b2531d79a +part_02477_of_04320.hdf5 0ffbc9396f474e7debff34b5dc1d7444 +part_02478_of_04320.hdf5 8f313cc9ba97acae45d34d95c27a7fe4 +part_02479_of_04320.hdf5 1fb19711fe5dbe78a75ef49b78cb8534 +part_02480_of_04320.hdf5 5eb11f43a3a6db5d9ed637159af23ca1 +part_02481_of_04320.hdf5 07be5f3afdd9742e4b0c9542fa9853fc +part_02482_of_04320.hdf5 03d5cb3383d43deaef8dcb7aa7bd41e1 +part_02483_of_04320.hdf5 32fdd0032b2485d1f8a7724e8722eff2 +part_02484_of_04320.hdf5 911406cb2e391d1ad44b9521a1fd5228 +part_02485_of_04320.hdf5 8f329fa432d236dd08e84f5d591c14c3 +part_02486_of_04320.hdf5 eec1a3b0390c7854bdba1e50cd2b2053 +part_02487_of_04320.hdf5 8aff820948d477b7d2ae3ac39e8538a6 +part_02488_of_04320.hdf5 f1072ed65516a556c817dfd6b9be9234 +part_02489_of_04320.hdf5 4034153be0dccca5a71a7b54ec2360d1 +part_02490_of_04320.hdf5 40c108540bc5fa5d73918abec9df48d3 +part_02491_of_04320.hdf5 d3f29be14dced188a286841e25bc9a3f +part_02492_of_04320.hdf5 9fe462ca80226ce89336b76147e472c2 +part_02493_of_04320.hdf5 263954ede01e15eb295682b5d90b5f4d +part_02494_of_04320.hdf5 cb80a0542e7eb50481db3783d9e46ccd +part_02495_of_04320.hdf5 79a652e3af9e3943e6888e22fdc10d26 +part_02496_of_04320.hdf5 e69d30023da52d517ac15a30717f3953 +part_02497_of_04320.hdf5 925762b402fbd4f68ae3ff0fc69fcb35 +part_02498_of_04320.hdf5 3a8d1df7a468b152793ecfaa20de877f +part_02499_of_04320.hdf5 9cb85dfb71c93db34453765ec2e3f15a +part_02500_of_04320.hdf5 4dc2e9abb1e0bcb096f9ad9663732db1 +part_02501_of_04320.hdf5 4e6c7e5e395be3399286440b301f2103 +part_02502_of_04320.hdf5 2f1cf47c08919d2540c41617b99a1367 +part_02503_of_04320.hdf5 545cbb12745d12cdd90c2e04bfc42e23 +part_02504_of_04320.hdf5 0ad5148af6368601904181328b0bd3b8 +part_02505_of_04320.hdf5 4d2583d528858ad4bf14828b803fd8d9 +part_02506_of_04320.hdf5 9df36064bfe938767ab1855b5867839b +part_02507_of_04320.hdf5 976ac7f7fba0b8038950d176d4bb598f +part_02508_of_04320.hdf5 
9be731c042a0463d8e2abb75c9257604 +part_02509_of_04320.hdf5 3711d5049dbcc3fd76c460265fbf1fcb +part_02510_of_04320.hdf5 4e08bb4e53d61c702c7fd7445db83202 +part_02511_of_04320.hdf5 92d18483bab24c98e7376ef53e1029ce +part_02512_of_04320.hdf5 f1a473cc078f00d35edda022347e4ce5 +part_02513_of_04320.hdf5 7a1e587372eaa3c152bb55bbf1f299a4 +part_02514_of_04320.hdf5 a298dd30758532a5234999e061c9116f +part_02515_of_04320.hdf5 69eaf7dc9fba803a8069c90542b018f1 +part_02516_of_04320.hdf5 94d84aead1e4b70df112a54382eaba89 +part_02517_of_04320.hdf5 3c32f085e839862bd9d624a0c302529e +part_02518_of_04320.hdf5 7617bee4f445918b4aac1d9c32db45ed +part_02519_of_04320.hdf5 9dd6e395474b50db5025a8af7cfeba9a +part_02520_of_04320.hdf5 d194d73809ccf11180fdc5021b6c50ab +part_02521_of_04320.hdf5 c29eeb7210503e74dfd805549ba79d21 +part_02522_of_04320.hdf5 f03aa9e376a02cd322a19aefa8f30eb9 +part_02523_of_04320.hdf5 ddd5b9ac4d23665f0db08d24ad8e41a0 +part_02524_of_04320.hdf5 d60c4516180ee394f69fa81e7369fad7 +part_02525_of_04320.hdf5 144db70db3dba67db59578a2f60fb5e4 +part_02526_of_04320.hdf5 013b5f68dfd54da45a1df134a0c871be +part_02527_of_04320.hdf5 1750d1f9332645a0720e8c4bda06093a +part_02528_of_04320.hdf5 4f533abea76debe2f06273212ef19c14 +part_02529_of_04320.hdf5 943c215aceb60db2f0f9242a7c2bc386 +part_02530_of_04320.hdf5 2172b0f4499aa91cc479c773e40973c5 +part_02531_of_04320.hdf5 b2762f81e3e482341ba6537fc8b23f55 +part_02532_of_04320.hdf5 4ca29bf6758f2bf2c5afdef47a7314ae +part_02533_of_04320.hdf5 719b9e70ddd8bae691ab8c43e3078e9c +part_02534_of_04320.hdf5 71068e69813c3838f20105e08c7433ce +part_02535_of_04320.hdf5 779c3bbe68e4c6df6de4d19008db7ef4 +part_02536_of_04320.hdf5 f514a7d89e7159a9b773c0047cd8e1a6 +part_02537_of_04320.hdf5 cfca96ff3e1552d104d1b32a98ba2dc0 +part_02538_of_04320.hdf5 7f88c0e79453befbb00f8c37c59b55f8 +part_02539_of_04320.hdf5 0ca1f7297e0f3e6f4dcffe0338b1b95e +part_02540_of_04320.hdf5 750b086c7102bdcc640215f507f0bbd0 +part_02541_of_04320.hdf5 003f8d549ebdc334ffb497e3a2aa91e6 +part_02542_of_04320.hdf5 16590df25f2f83cde4c5c5dba729d0ce +part_02543_of_04320.hdf5 4dc441051e2f34c959442d865ca956e1 +part_02544_of_04320.hdf5 76d6e92afe069d3e0f619c174d649e2e +part_02545_of_04320.hdf5 35c1dfa3f2d970f08ec40cac19a01194 +part_02546_of_04320.hdf5 c701fe17aada842402ae96589b3b9653 +part_02547_of_04320.hdf5 09e7f0439b57407511d959d0699359b6 +part_02548_of_04320.hdf5 f8e03f7d66f3bf0466cc09a9447ad79d +part_02549_of_04320.hdf5 5d85b55167ec3b2525fb300a6e269f49 +part_02550_of_04320.hdf5 1591bf196ae9bfdb1f6bf9916d1a6534 +part_02551_of_04320.hdf5 70b727884ae5a741d886f2b5bb48fe0a +part_02552_of_04320.hdf5 70dc4b4132c15b2f47a4c28f0b8301b2 +part_02553_of_04320.hdf5 1ddf10ef9fe2644c778278274de3e7ee +part_02554_of_04320.hdf5 f0445a9a3b6b3c709c8903a4a741463f +part_02555_of_04320.hdf5 5a03599d7343dae8b8190c89a79788bd +part_02556_of_04320.hdf5 82c9fa58f6b092f3bb8806d73f9deb1e +part_02557_of_04320.hdf5 0002576d7067dca1c54994aee8d16bcf +part_02558_of_04320.hdf5 7afd4d01eea21e792e2540aba20675ab +part_02559_of_04320.hdf5 da7dcdcacfaed69f6c0b79bb3482c760 +part_02560_of_04320.hdf5 239cd3868f8235b6cd0213b9244975cc +part_02561_of_04320.hdf5 451fcc6efc7cad3a7db3dff09d45b8e5 +part_02562_of_04320.hdf5 291f48f15ad6789bf7161666fc1ed8c3 +part_02563_of_04320.hdf5 dcab22a819370cc8605282d714743fe8 +part_02564_of_04320.hdf5 2c788578a354881b8b92b873659af013 +part_02565_of_04320.hdf5 3274c1ae1afd0888098e01bfa2868a03 +part_02566_of_04320.hdf5 3fc81d963b5255a232ab5c0d8a01441f +part_02567_of_04320.hdf5 d93d0b3139d1bc35e68a6e760df22a6f +part_02568_of_04320.hdf5 
a06c2a84f7e7923a49ee6606e4c25791 +part_02569_of_04320.hdf5 927b8dbd0fdb3fd0406fc2449746da37 +part_02570_of_04320.hdf5 02ba8a654cd890fcc449e4b1466a8d10 +part_02571_of_04320.hdf5 666b9917f06ffcfcba4c961f4dcc89f8 +part_02572_of_04320.hdf5 fa83c2f3f06f003868caed764c129424 +part_02573_of_04320.hdf5 c4fd1e7b291b4d5f83fcb090c673e6de +part_02574_of_04320.hdf5 99493842d6b811d7ed23277173cc9e4a +part_02575_of_04320.hdf5 6e1912521f20cf3a586c61aef50cfd53 +part_02576_of_04320.hdf5 6dae780aadb2079cf5ec697cb840c7f1 +part_02577_of_04320.hdf5 28c8798694ac1421dcff96d399b8c9b3 +part_02578_of_04320.hdf5 9138a722e8b9a09f57097dbb3807708d +part_02579_of_04320.hdf5 d4070e365321c902b3e4e23ee53095de +part_02580_of_04320.hdf5 9f5c7b305a7c08beb18ca84a6da5bccb +part_02581_of_04320.hdf5 d0deae47a54d070c80a94d545b6f0fd2 +part_02582_of_04320.hdf5 e21b2f048f2272bd454afa3f114865dc +part_02583_of_04320.hdf5 5fbbe26e92773de28bd2a890084ca8e4 +part_02584_of_04320.hdf5 2becb6ccd6a1b3e18d730141cf14979a +part_02585_of_04320.hdf5 3a79f8610f2e0e202ac1da3d98811975 +part_02586_of_04320.hdf5 2a36ab972e3f1e01cfb714374cb1a27a +part_02587_of_04320.hdf5 dd356bbccd7cea5d7197b6814c2414fc +part_02588_of_04320.hdf5 c793eba65efd3fc33c0fc22a68b1e1f4 +part_02589_of_04320.hdf5 60b24f9542960fe0ca102600e427c551 +part_02590_of_04320.hdf5 419839125fc355a207a47e1017e1cc66 +part_02591_of_04320.hdf5 6d55aa53cbdfabc4e9f71413bd43bb91 +part_02592_of_04320.hdf5 d8e4cfb97327f3b34493cd25f804e8da +part_02593_of_04320.hdf5 114557c4be8684b41eaa1b3c543432ab +part_02594_of_04320.hdf5 f261daeae4cea914592dcb4be2831b94 +part_02595_of_04320.hdf5 2f0e46aefdfe4722036478c496f3dbf9 +part_02596_of_04320.hdf5 1e3ba034cb07cfdc01eebea1e6676e56 +part_02597_of_04320.hdf5 ea2b4d2e21aee08a26e04ab25c398731 +part_02598_of_04320.hdf5 49c6839dd7827a88dfe043c985c6abcf +part_02599_of_04320.hdf5 b3e79d7fb7e5e41a907bdf44aa96f66c +part_02600_of_04320.hdf5 d6d92040a094506180b380729de0e2fe +part_02601_of_04320.hdf5 5ab500b94ecfb78d6b85697c81bf4f1c +part_02602_of_04320.hdf5 1aaf640d9d875edd83396ebe4a3915b2 +part_02603_of_04320.hdf5 3944848a711c19b92620e831b4d9001f +part_02604_of_04320.hdf5 abf974799c9d1c2047407fbb17ca35b8 +part_02605_of_04320.hdf5 c639b865ec294273fa49be718dbd324a +part_02606_of_04320.hdf5 01b750f4f0854c290d4b243e6d73d06c +part_02607_of_04320.hdf5 b01261549fef01a6916396aab440a635 +part_02608_of_04320.hdf5 db6f913b7fe2a8d7a85cf102098f004e +part_02609_of_04320.hdf5 b3dfaf16b6731055ee1476f7a827c55e +part_02610_of_04320.hdf5 81911cc8b7234f5b376d96e32da953db +part_02611_of_04320.hdf5 174b3d2d2bb33822aede75c17856e14b +part_02612_of_04320.hdf5 78780c238265e99b9d56b83e351458b5 +part_02613_of_04320.hdf5 d7f84ca99867e69e6440178f8812adbf +part_02614_of_04320.hdf5 c6a1eff6b8f2df56d43e062241a04fd0 +part_02615_of_04320.hdf5 a4d5b28ff898fb8bcd1181aec8cd6476 +part_02616_of_04320.hdf5 1af97d9565c998b0e6e798e72d77a997 +part_02617_of_04320.hdf5 4bdf356aad4fcc501ce2ae6f79fee3f7 +part_02618_of_04320.hdf5 e649334e48bf04771a73a5fd3c04d107 +part_02619_of_04320.hdf5 15f2b0ce3a6611c45895aea64090ce23 +part_02620_of_04320.hdf5 2527416b0f5635147f91b648a7cca710 +part_02621_of_04320.hdf5 220f9fa7cb9f0d1ef556f4fc1cb2b414 +part_02622_of_04320.hdf5 093c8fb21931a9343e59ccdbc702deff +part_02623_of_04320.hdf5 ed97fcec71e2c0a8320fabd7b0fec6c7 +part_02624_of_04320.hdf5 40da2a3e2346810d21c1157b24ea8117 +part_02625_of_04320.hdf5 90ba9db930c59dcd1e534bc777cd4ed3 +part_02626_of_04320.hdf5 dde385f4d7d5c06ba09dcfb6bb2c38df +part_02627_of_04320.hdf5 9ee9b95075a9377abcda6da474f4bd75 +part_02628_of_04320.hdf5 
3e308d2d743c193e123722563231f1d2 +part_02629_of_04320.hdf5 d590c5b9b419ccdd6b2ecd1625210233 +part_02630_of_04320.hdf5 557a1ea48efa7820c25fe3ea7c73261e +part_02631_of_04320.hdf5 2e37bf4d8f3f2329e357227082fe7a4b +part_02632_of_04320.hdf5 24d1677d549c1f1fe4cf3fea543076f4 +part_02633_of_04320.hdf5 8c759b96f250c78aa4cf5fa41d4935bd +part_02634_of_04320.hdf5 50d0ed6d3b83981f841fe6b67cd63c1c +part_02635_of_04320.hdf5 a48b1d24f402f473ca94d078a5b26b53 +part_02636_of_04320.hdf5 c732b17aa971e328280992d054838114 +part_02637_of_04320.hdf5 5431017e937bd83bb042ae409d54b9e0 +part_02638_of_04320.hdf5 09f85a68625be080da6de54ced63f32e +part_02639_of_04320.hdf5 4a62ca59114803671bd0a32276129aa9 +part_02640_of_04320.hdf5 54a8da9b9fa3c221b125559957d0db98 +part_02641_of_04320.hdf5 2a5714b613af73f0545cb7ba7e026ae1 +part_02642_of_04320.hdf5 1dfe18f120a1dfdff90bdda98d0b109f +part_02643_of_04320.hdf5 436bbd588e087ae8127bfeaf9f3bc631 +part_02644_of_04320.hdf5 7c2608e5b532b5a1d5bfa53f26e9ef48 +part_02645_of_04320.hdf5 da48c07f1474c4b1bff20be3e1ca00f5 +part_02646_of_04320.hdf5 8d4108382a6eff0394dc4b7c5282094c +part_02647_of_04320.hdf5 3ba06cb34dad1ba369d32e0606b8c734 +part_02648_of_04320.hdf5 2fe05322c0341e22cf07bf1c33f31a7a +part_02649_of_04320.hdf5 196ad9f6423f77fa93fe3358f1d8616b +part_02650_of_04320.hdf5 1bd7acf399ddffb69f2fe85a317a44e7 +part_02651_of_04320.hdf5 5320f56a6a273da32f410ca6e7e1caa0 +part_02652_of_04320.hdf5 b72540939c2a09a05d6ec02019779196 +part_02653_of_04320.hdf5 848a9a363be802fdc6eeefed5b36452d +part_02654_of_04320.hdf5 e4c872a5601c69dcd40f1113fb5b67e7 +part_02655_of_04320.hdf5 c8c3fec5816db529ff639b9b3da49377 +part_02656_of_04320.hdf5 1561ff6afe88f19d7ef08c0e9958d894 +part_02657_of_04320.hdf5 2ceadfe4bcdfcdeff2c6dc83e6264f6b +part_02658_of_04320.hdf5 afc691e32171db1ebb300914f2917844 +part_02659_of_04320.hdf5 1527e0bb2badb0e4a849c6ac1d7db6f7 +part_02660_of_04320.hdf5 5c29c07f83d7629d0e6359f260898a24 +part_02661_of_04320.hdf5 d4af7830b5c8da1e423041dd2dbaebcf +part_02662_of_04320.hdf5 8e7198cef896a390c99981f148dbdd85 +part_02663_of_04320.hdf5 17837c1905c4d61e4255acc1ff06ed8e +part_02664_of_04320.hdf5 eb7b5c1478794e9a0fa7117608d64a52 +part_02665_of_04320.hdf5 ad5404b126c118504dffa8193e9d752d +part_02666_of_04320.hdf5 c448179a5fd68b3e331f84d3c55f8c84 +part_02667_of_04320.hdf5 7a8a261fc1f2a54405ba426300dda499 +part_02668_of_04320.hdf5 184024161ac5708e3aef9c9958a173e9 +part_02669_of_04320.hdf5 3fe2c3f50d5094883903b3d62219755c +part_02670_of_04320.hdf5 e5ec4259aa1377419042c761e7bae5a1 +part_02671_of_04320.hdf5 6800f4900f34f63761f5cdc9c33e3c93 +part_02672_of_04320.hdf5 f381cc1459e106315e648b9dfe9e24cf +part_02673_of_04320.hdf5 c814669ead39e23b045b0969e0cd9c15 +part_02674_of_04320.hdf5 7d0775ba4921b4a4bf0d128622d7a630 +part_02675_of_04320.hdf5 982f3eda55bc30356a8c942976447e5c +part_02676_of_04320.hdf5 8274e9c9bc8a53b5df6ff8cde38c03c2 +part_02677_of_04320.hdf5 e16eacc350ae1944651abb89c3fe0460 +part_02678_of_04320.hdf5 8d81a784c613513b807a44bcfda53b4b +part_02679_of_04320.hdf5 ef8bd6359c092cdb83f4c502460b2655 +part_02680_of_04320.hdf5 277157358c2bfc47a532ec22e2f5537b +part_02681_of_04320.hdf5 03f3d4e92244dc7850b08844eb54fe0f +part_02682_of_04320.hdf5 3b67c209b78b96ff6489760aea5b5d60 +part_02683_of_04320.hdf5 94065a75eda7bc472aaee64b46ccc92c +part_02684_of_04320.hdf5 67fd6e8055452deb085499871e78a7b7 +part_02685_of_04320.hdf5 17e0eb189f2d11449d1e05cc9712c1c1 +part_02686_of_04320.hdf5 7ac4386666053c24e4d90d44210aa472 +part_02687_of_04320.hdf5 82848ccd5216916472993fa7c6f1f383 +part_02688_of_04320.hdf5 
1432c2338c8133762096e2b1403001ce +part_02689_of_04320.hdf5 0a9a43c888dc09dcf4f4a71010fd658b +part_02690_of_04320.hdf5 ae367b47bab09bf6753167eca70a4f7b +part_02691_of_04320.hdf5 0d04839b03513c046e6ca86d9b0c7fc5 +part_02692_of_04320.hdf5 19b8c6bd06f5dc59b35973cb58b1e890 +part_02693_of_04320.hdf5 839fc1fad65b616acd97426c6c332268 +part_02694_of_04320.hdf5 8a385358c4d298e92d9cdd2345aecc07 +part_02695_of_04320.hdf5 91ba9a9976adcbe80729577ba6ff4b01 +part_02696_of_04320.hdf5 bee49873a060df365614c866c2e7252a +part_02697_of_04320.hdf5 f5acf6c757d0805ae5c47a32b1087b98 +part_02698_of_04320.hdf5 480455b4b4c782a703fb89f953b37c27 +part_02699_of_04320.hdf5 98bf9cc4b97c494fc5336d838931b985 +part_02700_of_04320.hdf5 286c6069dd546c54ba9b34a7f3c06f0c +part_02701_of_04320.hdf5 6bc7bc6425561ba3424598d5642b444c +part_02702_of_04320.hdf5 08b7e4df9a8b6398656f57a0cc2ba999 +part_02703_of_04320.hdf5 74204cf7452116e76f80dabccaf3a2c4 +part_02704_of_04320.hdf5 05e76e7f403cb5ff88d62113d0fac166 +part_02705_of_04320.hdf5 227e8b3084bf3d0ec756b9e02f85c3b7 +part_02706_of_04320.hdf5 7a796f59c7fc79b9fbb6ec23972ce2d0 +part_02707_of_04320.hdf5 ef6c7d569c71e70e01ef05bb7f0c738b +part_02708_of_04320.hdf5 55a7587352a434e282090dad6cb88b83 +part_02709_of_04320.hdf5 f26c2f464ec5e7b6c09c8fd36c3ce767 +part_02710_of_04320.hdf5 929b3dcd5b4ce53edfb018aa778a0715 +part_02711_of_04320.hdf5 bd833bee11095ef45906a68e4d020a18 +part_02712_of_04320.hdf5 c6e0f3fc13dc354c234972edb1da719d +part_02713_of_04320.hdf5 9c27e79ed9c7ee40117a37cc76d6f6dd +part_02714_of_04320.hdf5 cb2364ede4f83ec1821d670605543a2c +part_02715_of_04320.hdf5 c01609e4a368f95bf10a1d281580271b +part_02716_of_04320.hdf5 1e9dfdfbc67eb5709dff3577137db053 +part_02717_of_04320.hdf5 e6abe09417f029863272a8d882d5725c +part_02718_of_04320.hdf5 8e5a9389606b7f7fb264447eea9999d0 +part_02719_of_04320.hdf5 eff46e1a1d22abddeec07e1f56fe6af3 +part_02720_of_04320.hdf5 425497da9c08c7f7083ab3a627d158d4 +part_02721_of_04320.hdf5 b553845b02bb3927d5debc3122dbf83b +part_02722_of_04320.hdf5 7099bd3aff5aa6667e58eb9a8b30ca79 +part_02723_of_04320.hdf5 64c160abb3f7336429ca74977cd93afe +part_02724_of_04320.hdf5 696cbee310dcdd4270d618b04f940862 +part_02725_of_04320.hdf5 af4ace72d1879a71535e5d56031fb37c +part_02726_of_04320.hdf5 d3f7b3b25b738fd1a22ae09dc7fed4f6 +part_02727_of_04320.hdf5 63fbd040984818dc5e1da719b5c76b48 +part_02728_of_04320.hdf5 aed6ca7c68754915495ab0d1f39c8213 +part_02729_of_04320.hdf5 4fe767fbf37a364b640503f956a6925c +part_02730_of_04320.hdf5 582811292ff8b321bd8326f4b9674610 +part_02731_of_04320.hdf5 6d3f7b07c9f7e93486144a0200e5e929 +part_02732_of_04320.hdf5 c9b068bf3f04505a855a4a5aa70fb52e +part_02733_of_04320.hdf5 d544548e015554670c1495a8e58fe453 +part_02734_of_04320.hdf5 6ebac97769c3482ff74c5fbfc7266de6 +part_02735_of_04320.hdf5 7997f5d9311bfac427e795bbda7dda72 +part_02736_of_04320.hdf5 f8f14ecbedd891d0fb1109f95082ea88 +part_02737_of_04320.hdf5 c86b4a76f5625e9ea6c5ef0bcea8be72 +part_02738_of_04320.hdf5 af00326908e0b15bfa0c039615ea92e2 +part_02739_of_04320.hdf5 4b70f0e32aeca5725f68b70f1f5055f8 +part_02740_of_04320.hdf5 4cbbdbba92287835d56ff36ca4859bf2 +part_02741_of_04320.hdf5 ecfbd0144e15e0c62e604085ee097dd9 +part_02742_of_04320.hdf5 5a9d4176dd63558e146848e2d6ffe045 +part_02743_of_04320.hdf5 7d4f4b405b0cbd80bd37ca5407da4856 +part_02744_of_04320.hdf5 ca2ae2453132d495f50367f620ffb6ae +part_02745_of_04320.hdf5 21ffff9efd2c9c529629f903f24b2595 +part_02746_of_04320.hdf5 6f82d7f3599df99e4a777a9fb89a1965 +part_02747_of_04320.hdf5 801164833d6470a595c76aae7d76a3ad +part_02748_of_04320.hdf5 
f4ebf719ea3da014f456b898b2c3063e +part_02749_of_04320.hdf5 bd4d0d0dc8dab6b7e4392c8f31dc298a +part_02750_of_04320.hdf5 6d7a9795a89b66f45e6aed0769f81698 +part_02751_of_04320.hdf5 832b64ee0053f04be43524fd3ba41adf +part_02752_of_04320.hdf5 e9f249439afe08aa40afbd700e380392 +part_02753_of_04320.hdf5 963824f6e614ade28978782d1075746f +part_02754_of_04320.hdf5 3ea59560fe1d78d6749006436fa155ba +part_02755_of_04320.hdf5 32bd314491807bad33a528f4409964ad +part_02756_of_04320.hdf5 e94684d82ce5339f464dd11a5dea73cd +part_02757_of_04320.hdf5 eca935cd007e1f3aa6bde2e9aced1681 +part_02758_of_04320.hdf5 345477785683cf6f5eea5230e1338709 +part_02759_of_04320.hdf5 ebfd113b80fc568dc868466fd4510ce9 +part_02760_of_04320.hdf5 cc38293819146a38c322ebf33c5e0c23 +part_02761_of_04320.hdf5 1b44b3bf224036d0241d1b6839851738 +part_02762_of_04320.hdf5 afde8ff0d87e5da919d377fd2e91bb2d +part_02763_of_04320.hdf5 c713d7bdaa3ea4e3ab3271ece0926e48 +part_02764_of_04320.hdf5 dd50ac6041910bf44a8b79846f66dbe3 +part_02765_of_04320.hdf5 9236fcda9d5a61699c6eb97104abe93b +part_02766_of_04320.hdf5 d9161bbb9560f3150136ad875c0aa188 +part_02767_of_04320.hdf5 2e58fa2887ed33ced6eb8ceeb6cd729e +part_02768_of_04320.hdf5 3a1bbce8efcda9198b3212611011faf7 +part_02769_of_04320.hdf5 3cf14fea22341029a6e3eab2e8578634 +part_02770_of_04320.hdf5 58bb452e31d5737fb174613f2e3c299e +part_02771_of_04320.hdf5 3928b658f88b17b972d1098180d5ac1e +part_02772_of_04320.hdf5 d829061c61259841fa0957939a97f928 +part_02773_of_04320.hdf5 60f0fb7e58161668b431e8018f1780a2 +part_02774_of_04320.hdf5 c82ff76ec41cbc5ae12a35fd726496af +part_02775_of_04320.hdf5 d35bb82cf4f647673df9882f7bd5a5a0 +part_02776_of_04320.hdf5 e3d489109cdbe01ad37167cb00e11cd6 +part_02777_of_04320.hdf5 7ae3dcb71e65f0edab109b1af1739789 +part_02778_of_04320.hdf5 a997de6747c46588dbad2280dfd9d1c7 +part_02779_of_04320.hdf5 5362c57d8d161c9bdf6fb71c104c91dd +part_02780_of_04320.hdf5 e9924fd57413adf1b959b692462d832d +part_02781_of_04320.hdf5 f3ffebef3f87da3cb1198dac39ef1ba4 +part_02782_of_04320.hdf5 b31b5ae23099af9cb1af60d3640d15dd +part_02783_of_04320.hdf5 6690cc98be2aeb077501663e2c8f75a7 +part_02784_of_04320.hdf5 126f079d11ce4c4d8c9eecedf9d6a0e8 +part_02785_of_04320.hdf5 8976f10ad4f4789dc710676bb198ca35 +part_02786_of_04320.hdf5 a924296e83e66bbaa8fb5188382a0725 +part_02787_of_04320.hdf5 ca8c74b65a89d45d34efa9785e0ecdbd +part_02788_of_04320.hdf5 a1804efaf1c7ab8c211ff9368f22685d +part_02789_of_04320.hdf5 a222f0460f581c90bda9317a8ee0f786 +part_02790_of_04320.hdf5 f8c1d937d058078041f5b59ba3af4704 +part_02791_of_04320.hdf5 3db8e255a62e35b6ca0fa8d6c72d5ceb +part_02792_of_04320.hdf5 f6b951e6465590ae0cbad0d068a3e525 +part_02793_of_04320.hdf5 a4704989b3a8808bcbdf97ce69752815 +part_02794_of_04320.hdf5 3d66340c5d60f4dee2a93f3a57bdff82 +part_02795_of_04320.hdf5 bc951a41f43b876b13ed301fdacf25d2 +part_02796_of_04320.hdf5 a4ccb045b57b68533e66a8f7d794e375 +part_02797_of_04320.hdf5 cc7098f7b3c47f30b7537060f143997d +part_02798_of_04320.hdf5 e07696cb1309a7f4c43c4f6e22fbdb53 +part_02799_of_04320.hdf5 3a6e7af0626c6918d987ea47cca83209 +part_02800_of_04320.hdf5 cb8f2a56b76fc46310e384b98f116030 +part_02801_of_04320.hdf5 51ae3c3b05c6569c7fce0aee1d1fa464 +part_02802_of_04320.hdf5 a6f99b56554d04ae94d5ed021839ca73 +part_02803_of_04320.hdf5 f2b8ba8073997cad4529afe89c7767e9 +part_02804_of_04320.hdf5 f664840ffe1679eba30878fb3941f01a +part_02805_of_04320.hdf5 71b3e264a3c66e5e024b643b53362fe2 +part_02806_of_04320.hdf5 4bbb517f81262f536a48528c21c6136f +part_02807_of_04320.hdf5 249457fcd2a4cac7ce9032a4215eca09 +part_02808_of_04320.hdf5 
d8352dd6054c7a0b3b7f375275cc845f +part_02809_of_04320.hdf5 ed154e11c99fcee0ebcb3b6e5a497d60 +part_02810_of_04320.hdf5 61f403716d62d282deb97a487f003dee +part_02811_of_04320.hdf5 8c6250ec42dd14576645f2234fd75795 +part_02812_of_04320.hdf5 d1a10ae958f7927c769d46622912fc02 +part_02813_of_04320.hdf5 723f8e0e77b641dae93554a5a5cf3583 +part_02814_of_04320.hdf5 bfb7f13ef5aeae4c13d6bf908c01ddee +part_02815_of_04320.hdf5 77fb4c1bff637237b9a53ec2ca82bf16 +part_02816_of_04320.hdf5 52d4df5c894d47922bda531016502c60 +part_02817_of_04320.hdf5 fd676b790dbcff9cc0c2245dcd8acaf4 +part_02818_of_04320.hdf5 6cff25034c9f5348a58ab0240cda256b +part_02819_of_04320.hdf5 252d7643658a8d55c4f9a1230748de71 +part_02820_of_04320.hdf5 e098d19d642b8f248c79682d67c73770 +part_02821_of_04320.hdf5 47772a8dc7b1de88634e28af1a14e5ac +part_02822_of_04320.hdf5 0bcb57ace2e441b8c5680ff65c448c3f +part_02823_of_04320.hdf5 30cbaf1381a25a926a99284d93d1771d +part_02824_of_04320.hdf5 0abd7325708e69391cbbe2f28125e758 +part_02825_of_04320.hdf5 742526ca8fc8125c1c57539d397c6cde +part_02826_of_04320.hdf5 eaa4f52ef3245d4aeba44c10e5f2d2a5 +part_02827_of_04320.hdf5 d674997ceec58914f11913c33f884aab +part_02828_of_04320.hdf5 27f9194ef77338ee81c5a4da9531ae6d +part_02829_of_04320.hdf5 ac43aa55f86825b4c60b7965ef8a98ed +part_02830_of_04320.hdf5 d510382872e8fed0dea8ede51b9f3268 +part_02831_of_04320.hdf5 536db7106553b08d6bf6091963ba5170 +part_02832_of_04320.hdf5 620027d81c130910d0e698f381a9a785 +part_02833_of_04320.hdf5 3dc02a05ab91c70d3ed50c4b2c550c6b +part_02834_of_04320.hdf5 b8200fe886da36a81d461dbf0bc618db +part_02835_of_04320.hdf5 849276fbda122c439a6627bf1166a095 +part_02836_of_04320.hdf5 5b13fd0b91b5e44210d11736a3c2ad57 +part_02837_of_04320.hdf5 417891c22c102938088e96307991478f +part_02838_of_04320.hdf5 f487b72936ee30904e28dd42bdfa9d6f +part_02839_of_04320.hdf5 a57b3a966bcbf10c144b1676f36c7eb0 +part_02840_of_04320.hdf5 1e4f8c04897ecb5d661994d183921b51 +part_02841_of_04320.hdf5 3d5915f0c5c20cc75ea53c30594de39a +part_02842_of_04320.hdf5 63ce350021cce6a2c896aa4fdab7377c +part_02843_of_04320.hdf5 cbdbf3010531e595cb650ca3fe74a7fe +part_02844_of_04320.hdf5 d26bf383cb5ffe7d2b8de21fe89a7ee2 +part_02845_of_04320.hdf5 ae359ce1262738e069c5b27e657e1289 +part_02846_of_04320.hdf5 3fad65aa779deda174b10ce3598f624d +part_02847_of_04320.hdf5 0a2bd6de3a6a843a336765dd578247e5 +part_02848_of_04320.hdf5 984180ed74ec30e67b5c6eb947ecfd99 +part_02849_of_04320.hdf5 2a558cd0bc640fe5678f3b323bfed066 +part_02850_of_04320.hdf5 290084be36ad334ebd8e33744b9005d4 +part_02851_of_04320.hdf5 7b9dc8ed3c11b97129d47522b75f4c29 +part_02852_of_04320.hdf5 14e59f0099f3fecc4edcf5abfb671c30 +part_02853_of_04320.hdf5 d7da88c511b4fa3f9aac0c62a2c715f0 +part_02854_of_04320.hdf5 d3ac2dde6f2708793752519a75203377 +part_02855_of_04320.hdf5 a06fafa3a3306c2ee692ae58d04e9b45 +part_02856_of_04320.hdf5 c0e342e1110dabe3bfc00b168f283fa3 +part_02857_of_04320.hdf5 d072ffeffef8d3b71eefb395abd0f399 +part_02858_of_04320.hdf5 d5f133d4c9e6b7bcc550ebbeea5f067b +part_02859_of_04320.hdf5 1760598bbb09b765f17c5d3ea1e2f319 +part_02860_of_04320.hdf5 c5f985fc7783ad1f8903b74242b80c89 +part_02861_of_04320.hdf5 01a6a9feea64f627a6ad78ae3c43e8b5 +part_02862_of_04320.hdf5 a539d4e6d2f7dfcfcf14074668e48323 +part_02863_of_04320.hdf5 91aff7d4c4186a7fec2e27cead8afae9 +part_02864_of_04320.hdf5 47673f5f4a2473e278cfb20a9e85ae5a +part_02865_of_04320.hdf5 a35a06cb33853c6446b3d48719d703be +part_02866_of_04320.hdf5 b6db803464e83c1e229ef055fb67d233 +part_02867_of_04320.hdf5 4aaadf8b0a8561a0503901831f24948e +part_02868_of_04320.hdf5 
f683f25764a03ef990b5734b8eb7c136 +part_02869_of_04320.hdf5 688061f3e81201ef15b9c2b1d68a0200 +part_02870_of_04320.hdf5 322ff8e40f1e1daba57ed7569d7918d0 +part_02871_of_04320.hdf5 4d265901a6338c27c78f8fc26214e47b +part_02872_of_04320.hdf5 d088f044ff2bddb2edbd472d36f32d9b +part_02873_of_04320.hdf5 48eb8527b6628e7a42b51bacdb412803 +part_02874_of_04320.hdf5 fa997eb475de9f7ed6cccdc36b6a010b +part_02875_of_04320.hdf5 df1eeef800a7f905ead5890846cd41d0 +part_02876_of_04320.hdf5 9e1868d79c7fee39e4cb9bf7482f3447 +part_02877_of_04320.hdf5 d60d53ca4ea6d7381aaa29112e595b4c +part_02878_of_04320.hdf5 92320a52e9b44e5dafc5179271af5eba +part_02879_of_04320.hdf5 dd1e310323e7044feff96d7dbf627844 +part_02880_of_04320.hdf5 9641535856ccbfd44811286bed3dd754 +part_02881_of_04320.hdf5 9eec07fc5c2e3379f9dc42aa0289a13e +part_02882_of_04320.hdf5 f538ded5a6efd3bf7fc23339b7ce4940 +part_02883_of_04320.hdf5 8d435ea2ed9fd03cfbba5c86438a2805 +part_02884_of_04320.hdf5 94a8f1fadb2160b6affd2d0b939f3486 +part_02885_of_04320.hdf5 1a65037c7477ad998cc141596224c801 +part_02886_of_04320.hdf5 8e098d6eefc17b0eb78ade5448cfc244 +part_02887_of_04320.hdf5 3b877bf4dec86c377af67a29f2b5190b +part_02888_of_04320.hdf5 5d7182de58ce6c85335510a5c08fca2f +part_02889_of_04320.hdf5 7685ae87bbc98602bc11d9b1bd4ab1b3 +part_02890_of_04320.hdf5 46e93fc52e91294120dfdf70c6350f56 +part_02891_of_04320.hdf5 8f195116a81e7136374f5f25780869e6 +part_02892_of_04320.hdf5 67efb2943a838abab36c267902a0f425 +part_02893_of_04320.hdf5 2893d8f82435f7639d83876f91820a36 +part_02894_of_04320.hdf5 469b2823dcdcf909195ee1d2a65c34c6 +part_02895_of_04320.hdf5 71535d5da6a654f7c80f485ee86331fd +part_02896_of_04320.hdf5 f8183cf69e134c615ffba4b142423694 +part_02897_of_04320.hdf5 15032fc4e92c8ba5407b61ea9938c0cf +part_02898_of_04320.hdf5 13a535bfccff287802ae87300819496a +part_02899_of_04320.hdf5 283317c2deaef83331273cbf85432394 +part_02900_of_04320.hdf5 c941a8dda65bbf16ea2c7fb069373301 +part_02901_of_04320.hdf5 c5b8830940a9d0ffef0e7927cd9230a0 +part_02902_of_04320.hdf5 19d8b563c1a734d78210d4f5d1330c32 +part_02903_of_04320.hdf5 2923777ff8e0681383583f55b61f8466 +part_02904_of_04320.hdf5 b403e4e22cf3244c065939f52ccd6420 +part_02905_of_04320.hdf5 2b9935269af85333f2df398fe645eef0 +part_02906_of_04320.hdf5 b507a348a0c6fb74a68a5e0f903892d7 +part_02907_of_04320.hdf5 a450e823aef0038d6d5a733d193a6d56 +part_02908_of_04320.hdf5 9be3a3b1488f9e0de8cb90410ae202b0 +part_02909_of_04320.hdf5 e5ed4da84b654e548ce0cff4d5e66b6d +part_02910_of_04320.hdf5 69a281e37b578674214ecae1d5ff87e2 +part_02911_of_04320.hdf5 92d6dff5c664fb0872efcb5b747c94e4 +part_02912_of_04320.hdf5 056286ceea4c7b079d60e2ecb3998290 +part_02913_of_04320.hdf5 85ae432cd5ba8d0e7f6fd92dbe1d4a6b +part_02914_of_04320.hdf5 3d80013df9e42522cb80fc4e687bff6e +part_02915_of_04320.hdf5 223d82b0461a504fda0299445857f9f3 +part_02916_of_04320.hdf5 c80b689d767d0bf89bad154e4916cd09 +part_02917_of_04320.hdf5 453dab1b191128cd18675b4933f6bf67 +part_02918_of_04320.hdf5 db1ea4c22e0d8abf2f6fad0338e46ef4 +part_02919_of_04320.hdf5 4565c248c2e22a0afb6dee5159cc8591 +part_02920_of_04320.hdf5 a877e59ca7d17fb8849532c0de558785 +part_02921_of_04320.hdf5 52af82766b185da29d0879f4940fdd41 +part_02922_of_04320.hdf5 eb444b8b70b8d0e7378242c8edfee76d +part_02923_of_04320.hdf5 8a439c7a83b6d4bd1bb314146380f832 +part_02924_of_04320.hdf5 d1b680f5182c5ae62f80a5622330dbc4 +part_02925_of_04320.hdf5 2b872178d73bffc6d64b760ceaf1147d +part_02926_of_04320.hdf5 542412a7b1fc16f9ef2b8dbf6bd11c02 +part_02927_of_04320.hdf5 b1a14a57acc05d3ad5d1cd356ac95182 +part_02928_of_04320.hdf5 
51f468c665dc784274bcaa4a73527146 +part_02929_of_04320.hdf5 4e6e7a6a2a40dead6e27d3899a8a41ce +part_02930_of_04320.hdf5 85cb78404cfd54645a6ea7bf1c651f64 +part_02931_of_04320.hdf5 5a61c2618948736af0175f28220d17c8 +part_02932_of_04320.hdf5 acdca99d099fa764a8520ce29cc6df54 +part_02933_of_04320.hdf5 32802318701d234642946f70f2b7d00b +part_02934_of_04320.hdf5 d25aa99ae406f77dac54a426173ca705 +part_02935_of_04320.hdf5 86d011ae58df68c882ade0e947df3ce7 +part_02936_of_04320.hdf5 3efddecb9a7a0e2e3bcc518c88d52388 +part_02937_of_04320.hdf5 c6e222a850f0ac7a2db5f3d5554c8fbb +part_02938_of_04320.hdf5 dcfb66f4477ad42dcc2c63fc32e35691 +part_02939_of_04320.hdf5 13b365a2e0f8dbf847de5d4c60792df1 +part_02940_of_04320.hdf5 c1242cf6c2d751e8dea25a43374f2e8a +part_02941_of_04320.hdf5 0e12cde704ad22764ce375e43c248f8e +part_02942_of_04320.hdf5 b89520e4ac3644a79fecd6f1da6b33cd +part_02943_of_04320.hdf5 d734b3780d676a2efabe870f197a3cb9 +part_02944_of_04320.hdf5 bdefc95762a1b6a86df49c8074c73340 +part_02945_of_04320.hdf5 b6216d24b6b3c76eae5edf58f94ad0f4 +part_02946_of_04320.hdf5 c97c3a460b5f5e8891c92caa95a16d93 +part_02947_of_04320.hdf5 f51bbb3a8388998b442ed620f5b17f1c +part_02948_of_04320.hdf5 4641e2a9adfe1cb05c8a1311ad932eb7 +part_02949_of_04320.hdf5 36ef5684123327d760d12cc791b8d486 +part_02950_of_04320.hdf5 bac15e48b926a54759adcb48fdb343da +part_02951_of_04320.hdf5 27695536e1d9479ae13a2b42c8077cfb +part_02952_of_04320.hdf5 607c9d7fc289763c18d4962f6942fb6d +part_02953_of_04320.hdf5 9c63d3fb17b0259d85b712bc2f2a5104 +part_02954_of_04320.hdf5 5d07d94d7d24eaf3936fb918aca5ec29 +part_02955_of_04320.hdf5 1ded95ddfdf55327841deaf23aa27541 +part_02956_of_04320.hdf5 8ef8edb3a6cbeb4a8d5b825d7acb66a2 +part_02957_of_04320.hdf5 814f446e3e80e4d96f71cfbe3c5caa0b +part_02958_of_04320.hdf5 0e518741d02aec61a9be917bc84e0a30 +part_02959_of_04320.hdf5 1afcc2aa5a6b26bd400616e44867f78d +part_02960_of_04320.hdf5 d81cfb24eccf80b9b14ad829d25317de +part_02961_of_04320.hdf5 eebd869101e0359adc4815db063a91ee +part_02962_of_04320.hdf5 4546ec61dc842ea7a66d40cf64f13364 +part_02963_of_04320.hdf5 9bd2dbd99ec176ef444452b78def861d +part_02964_of_04320.hdf5 afbd470d299a54400a0dcb10b9444f1f +part_02965_of_04320.hdf5 3a8471f58b6525d68f5fac1a06d43e99 +part_02966_of_04320.hdf5 82f9ca231c3d4469f21acd1d9fb03071 +part_02967_of_04320.hdf5 0b835d72787e2247e89080c225226a3e +part_02968_of_04320.hdf5 d263d6c3839b7ea10dd647988930e78a +part_02969_of_04320.hdf5 19f05e5f8b71f7236bdfeb5b931a1f74 +part_02970_of_04320.hdf5 a8d721797ea0c2490f90cb14a64f8b5e +part_02971_of_04320.hdf5 ac9cae714a28feea54fc181ab77acd94 +part_02972_of_04320.hdf5 6f723a033cb6cd95f54aa87ccaa44df2 +part_02973_of_04320.hdf5 c1e272cf441b65232be519f831966158 +part_02974_of_04320.hdf5 7cf2bcd279a725736a7aabceb4df9547 +part_02975_of_04320.hdf5 c8fadd6162807edd1f49d39a8b546b85 +part_02976_of_04320.hdf5 0636740b41539e4ef4e3d2e646d77359 +part_02977_of_04320.hdf5 b6b6b288e4864d316bf3b79d3ac49b95 +part_02978_of_04320.hdf5 05ab7c06e7f9abebb0bcd4579d190665 +part_02979_of_04320.hdf5 2494d792607b1c4ea62ed5b4b8e795bd +part_02980_of_04320.hdf5 00f881881be0d169b936c7c455956f63 +part_02981_of_04320.hdf5 4ba005bc7a948c2611c2e35fe7a995db +part_02982_of_04320.hdf5 015694d94fda31ea2b6fb672a34267d9 +part_02983_of_04320.hdf5 bf31ceb8bebc49c66ad0b03ccaa4c13a +part_02984_of_04320.hdf5 5cb4a3832ea0cd603af4cced69e3c5bd +part_02985_of_04320.hdf5 c2fb8a6ebce540a80180f1da55d5cc62 +part_02986_of_04320.hdf5 b928bc936b97bf4eff706104ae0626c2 +part_02987_of_04320.hdf5 add08d84fc08014bf67658ad7108af51 +part_02988_of_04320.hdf5 
18289770eaac0d0fd0f6b8855a3da6a6 +part_02989_of_04320.hdf5 2ab7d5b12bf030e84c7b729559fa76c4 +part_02990_of_04320.hdf5 a75bd4f8420cdf58d117b741dacd7edb +part_02991_of_04320.hdf5 5832eaba5385758f3a46f898ca1a73cb +part_02992_of_04320.hdf5 a6af4972defa59349bb13d4205ac20d6 +part_02993_of_04320.hdf5 d3222c433a4e3657480158b6cbd298cb +part_02994_of_04320.hdf5 e6cfe56d3ab6943649c2fc8f5ea2e32e +part_02995_of_04320.hdf5 65052a9055de4cb37d347016f761fd7c +part_02996_of_04320.hdf5 f913f56243d6dfbc3477be96850991d9 +part_02997_of_04320.hdf5 76d786717e01b3ac02b150ab7ae700b4 +part_02998_of_04320.hdf5 7757bf965be6270ef0654fc0fc7c259c +part_02999_of_04320.hdf5 b386b45ccb09d43831aa54cde3b44e18 +part_03000_of_04320.hdf5 e0892f2cd707b9448bbba513794def71 +part_03001_of_04320.hdf5 bb69460e54b92628d2f3fafa374f22a9 +part_03002_of_04320.hdf5 6287ce7cc44146ffb9eeae6e4586ea22 +part_03003_of_04320.hdf5 22177d8786d2ea0150c6501c34ce670d +part_03004_of_04320.hdf5 03d0fb9d313de0613fbb2b5d4bc11b66 +part_03005_of_04320.hdf5 9feeaf74accb0bddfb8394eb14adceff +part_03006_of_04320.hdf5 7f38bd9ca93b01a690143cadb390717c +part_03007_of_04320.hdf5 09c8e70bdd33d583da803021f77ff0bc +part_03008_of_04320.hdf5 ce4537ba444d8d01199c44ab7d31b581 +part_03009_of_04320.hdf5 9fa8a50a244c942a754a6f5ad87701f8 +part_03010_of_04320.hdf5 485f5ba22a8dd4824d2c20d84a9221b4 +part_03011_of_04320.hdf5 7d4936cb2de8f393e63219e44c1c474e +part_03012_of_04320.hdf5 ee8cd6de37dadf688a9ac6554ab1aff0 +part_03013_of_04320.hdf5 fd611321fdc541da3eba2564fff0d0c3 +part_03014_of_04320.hdf5 1b157810ac1e5805d0ebc2627fd76dba +part_03015_of_04320.hdf5 98ce2e2925244f142a627b0050eebf93 +part_03016_of_04320.hdf5 7262548cdfc5f08830d9d917f30975a5 +part_03017_of_04320.hdf5 57be611e936776db7f443a33011e135f +part_03018_of_04320.hdf5 cb5a9ead01ab6a37d050215171f67fbd +part_03019_of_04320.hdf5 61b264f854e7368d47f23252604b863b +part_03020_of_04320.hdf5 1c660aa72590efed5025df732050c97c +part_03021_of_04320.hdf5 3070ad648647d05b0371fe34bc06c619 +part_03022_of_04320.hdf5 d84ddbbf3d262b9c44a3d4f3074003a5 +part_03023_of_04320.hdf5 1bdd3edeab5b13069d63cafc7547bb27 +part_03024_of_04320.hdf5 7d037d7ed79345c9c21f23f9d0c29ba3 +part_03025_of_04320.hdf5 b3bfffc38a0ccbf82372d9e2eb1fad4a +part_03026_of_04320.hdf5 9ff1a92cdbb6737f3e569dfcf022f5ca +part_03027_of_04320.hdf5 35d2d66306d4cbaa01df168e6d9ae4cd +part_03028_of_04320.hdf5 983632670481907928ecb3cf9c22c5c6 +part_03029_of_04320.hdf5 22c09934e8608911b98a1688f2f57364 +part_03030_of_04320.hdf5 3e0e623edcef69bd186b3f18f95790b1 +part_03031_of_04320.hdf5 7f343e0a5143bf8fbba1a7a163adfeb4 +part_03032_of_04320.hdf5 4e77e76f38f5aa7576ce97d40d50dfb0 +part_03033_of_04320.hdf5 6c3cf5047699218c308f9b47eb3316fb +part_03034_of_04320.hdf5 7e1ed5ff1371159250c35aa8b8db4370 +part_03035_of_04320.hdf5 b37d59e531eeec4d4dd97bc71b9d3606 +part_03036_of_04320.hdf5 e5627d8c7eb3a6648c001b8351f62490 +part_03037_of_04320.hdf5 9e220025ea26f32e50ea2fbab5357820 +part_03038_of_04320.hdf5 feb64528bc1b9d9d77c3b91708c6c50d +part_03039_of_04320.hdf5 46695ee2a2c7b9a5b1285beba467dd97 +part_03040_of_04320.hdf5 3707af1a8f020c3cf9150ef7ea976eb4 +part_03041_of_04320.hdf5 b5bd494b61e61caecfe74116fd456a38 +part_03042_of_04320.hdf5 00fc4119df78c01d0992bfd330b099cb +part_03043_of_04320.hdf5 533b38d3ccc55e4ac132e94c5246e55e +part_03044_of_04320.hdf5 9a9b93c89bdd2eac89f9d7cab82bb797 +part_03045_of_04320.hdf5 d3022bb7e3a3c7967362027440d0f760 +part_03046_of_04320.hdf5 d308c134a1528faea8f56791ab2360f1 +part_03047_of_04320.hdf5 8d2c24aced8972f636eda8fc999f473b +part_03048_of_04320.hdf5 
b869f2654e4c0688f59ac256b8097506 +part_03049_of_04320.hdf5 da499a0991a4aa2c80eded0405fa79dd +part_03050_of_04320.hdf5 d020932962a0c48d8809399dd7ed13b2 +part_03051_of_04320.hdf5 74f35b45993a10ef7f16873dd4c4221d +part_03052_of_04320.hdf5 eb8494222701f6e25d334ddefc53d18e +part_03053_of_04320.hdf5 a7d419ea90d239e2dd661911c6e7eaf2 +part_03054_of_04320.hdf5 1f5dde25faeb3e4867f6fd6dffb2f2f4 +part_03055_of_04320.hdf5 8e72b730a7380611833f344050ef1908 +part_03056_of_04320.hdf5 bf043666a76e737261f774c5695d695e +part_03057_of_04320.hdf5 f70f2deeada1a60ec6b28e0e8a1fcabc +part_03058_of_04320.hdf5 6372334b6915aaeae2bf7d50d4a154e0 +part_03059_of_04320.hdf5 8838b4895617d66795c2cb4ab540fd7c +part_03060_of_04320.hdf5 72919ba0fc73c4799155533c778d0bee +part_03061_of_04320.hdf5 d80510896b5a4a71b118c6ca6856b864 +part_03062_of_04320.hdf5 d501c71c0f1df28799a902c104f58406 +part_03063_of_04320.hdf5 a22519632e9d32c4f631b20c30f9b601 +part_03064_of_04320.hdf5 cbdd2006390c547acc18f212e7b37717 +part_03065_of_04320.hdf5 7cd2a53a2be46e219d74f20e9951936d +part_03066_of_04320.hdf5 54a218a04a7e435c2c5f33a93e5905a1 +part_03067_of_04320.hdf5 d9319ef5c112b4399e6f81e53b613a66 +part_03068_of_04320.hdf5 10835297613804083a72852cbe88fa1e +part_03069_of_04320.hdf5 69cbda312a2db519131db7653ec0ed37 +part_03070_of_04320.hdf5 fe7d354bb5c6038e0c1438050a72209f +part_03071_of_04320.hdf5 9b286f2b0f5e7b1ef9fdac37b0874d2d +part_03072_of_04320.hdf5 1c7efe14060600e17098a4af7b5f844f +part_03073_of_04320.hdf5 7c8d2ac965185ba6ff955fc28069ce6e +part_03074_of_04320.hdf5 76129fc0aabffac5b39083bb73df947b +part_03075_of_04320.hdf5 3e39779897beff4085b47819e01213ae +part_03076_of_04320.hdf5 14dbb867a222151706cccde663115fbc +part_03077_of_04320.hdf5 53433021ee6d6ed11e49e5dabecde7d4 +part_03078_of_04320.hdf5 cc23557b2b0f671d7db983b90d77ed45 +part_03079_of_04320.hdf5 4a13a937771e22c4e85873a5905cea55 +part_03080_of_04320.hdf5 41d5aba690081d7ddbf13fe2cae47471 +part_03081_of_04320.hdf5 d2e2352502debf8bdfa5b7e6b1932140 +part_03082_of_04320.hdf5 c38e77c80b4e13ff631f1e3391917b2f +part_03083_of_04320.hdf5 36692b37e1e4c0bf283ee41a34a43707 +part_03084_of_04320.hdf5 99f82f0e74c480d9ff985f0c7aab9af7 +part_03085_of_04320.hdf5 6bee12ad7fba4e005e7d9f6b50be3ea6 +part_03086_of_04320.hdf5 be6e95f252c040e6f9bc6447568af149 +part_03087_of_04320.hdf5 6900e34fd10433da222c19a10188412b +part_03088_of_04320.hdf5 b1211bbfdf034541f0a129104d36de8d +part_03089_of_04320.hdf5 c206c93ff68d6e3559da883d98471418 +part_03090_of_04320.hdf5 6855e95260bb4fb6c0e63f4cde9966c4 +part_03091_of_04320.hdf5 b6ea39171149bea294fac2fc3b1d059a +part_03092_of_04320.hdf5 b0bd2a72e3ad23d2f4fbf21ab5819cde +part_03093_of_04320.hdf5 92b0a88279be9a6e7afb30bb2882b437 +part_03094_of_04320.hdf5 7369682b068869957e52ab7b2a5906aa +part_03095_of_04320.hdf5 edbaf7716b24a29c27da0bba654b9c48 +part_03096_of_04320.hdf5 5bb4f8fda0ab2d3110aaec068cff12cf +part_03097_of_04320.hdf5 574d560e555d78b10d617ca69bec87ab +part_03098_of_04320.hdf5 cb791902e4bbdef254d996f77e33d394 +part_03099_of_04320.hdf5 902d3240c6e58e83368f782009e86ce6 +part_03100_of_04320.hdf5 1f74f84978f634ce0f96f70aba3c77a4 +part_03101_of_04320.hdf5 779817f840f1eb2ca662fec499683557 +part_03102_of_04320.hdf5 b3e07b3fd4d6d993ae0bbf61597a9055 +part_03103_of_04320.hdf5 34f23c560dfd04d6b10c830e53ac8512 +part_03104_of_04320.hdf5 297be38e321bc2517bf6ac123c95584e +part_03105_of_04320.hdf5 b9c13b304d8d9d3b66cfb2361ce14b9e +part_03106_of_04320.hdf5 cbba67d6b9a34e182815e6484cf7fb75 +part_03107_of_04320.hdf5 f8ba581e76ebfca72f5b2296674aab48 +part_03108_of_04320.hdf5 
2f515afec387ac57adb158e2d6fd8299 +part_03109_of_04320.hdf5 093bb37f7233e0154ccadcd8350f80b2 +part_03110_of_04320.hdf5 11534a93b2b01362d2c777a786714c9d +part_03111_of_04320.hdf5 d8a051232d6bada89bc666d59abab459 +part_03112_of_04320.hdf5 810a50708933a24b9c9c0acdc5383d7d +part_03113_of_04320.hdf5 8c08a56c29b674408872b3d8f31e7dd8 +part_03114_of_04320.hdf5 312450e23bd9f9ad2dbabf98b8d49a3e +part_03115_of_04320.hdf5 e67f97e3b0801aad959512dc79fa6bf7 +part_03116_of_04320.hdf5 c4e7e21ed4b221631513ec60b21ddf77 +part_03117_of_04320.hdf5 a49f3050066c6e9964c0ab0ee3060645 +part_03118_of_04320.hdf5 a0e4623ecf16bcb933b120011e781fef +part_03119_of_04320.hdf5 b7b430a6994117408e14bc2fa8c69ca5 +part_03120_of_04320.hdf5 7790520cbbf9ab8ac195aea909ffe643 +part_03121_of_04320.hdf5 c5ea172bc1a666e5036f40e309994364 +part_03122_of_04320.hdf5 0c4568557d572fcb63680bf7a1d11a59 +part_03123_of_04320.hdf5 c4e25f14272edcfd83b8fbe924b74a7f +part_03124_of_04320.hdf5 a3c730a81ef43731535af9636d557e58 +part_03125_of_04320.hdf5 1940fc0c37f3af5729c0014e2340ca02 +part_03126_of_04320.hdf5 f3b58309577fa62b47d03e12b16c1f04 +part_03127_of_04320.hdf5 108b34c49235ab0f6ae4606668cc98d9 +part_03128_of_04320.hdf5 6a344cbd4e39e3578f3dedb6cf248841 +part_03129_of_04320.hdf5 e64ab0c5f68683b5b9346e3a3f5cba0e +part_03130_of_04320.hdf5 a242a65ef6866fac77cd9e8b9f0e218e +part_03131_of_04320.hdf5 358bd656af150e2ea06fb3ec0a4804c9 +part_03132_of_04320.hdf5 958a9207baee90f8bcc81bd1a31d8b20 +part_03133_of_04320.hdf5 5a2552001733ffa862523de7f7324dce +part_03134_of_04320.hdf5 2ac2915a4fe2ed84f0b88cd2d49a4103 +part_03135_of_04320.hdf5 536b8a22bceaf7b1027850a0ee42c0d8 +part_03136_of_04320.hdf5 b2e32db8dbaba69fe8ab5259e792dac1 +part_03137_of_04320.hdf5 b0eabdf9431555d43f0834bf354e1d02 +part_03138_of_04320.hdf5 aee3ad73fe264d20781f3e6fb2284d7d +part_03139_of_04320.hdf5 2b7ca08095cbd4f99bd7959ae856b3cc +part_03140_of_04320.hdf5 686e4705de939b04a3cfb4742401c919 +part_03141_of_04320.hdf5 67a1b3107701af2a7949ec1d7573bbd3 +part_03142_of_04320.hdf5 11d54e7adc4377394f18815d9020c457 +part_03143_of_04320.hdf5 79d42e0c4545df8c6035730836c3a610 +part_03144_of_04320.hdf5 236b077c8b85fc04805ae7a288c1bd02 +part_03145_of_04320.hdf5 49b539d6a5de8a1ef6aac8e626c652d5 +part_03146_of_04320.hdf5 58ed7ab08c5197f9b527d794d28091c1 +part_03147_of_04320.hdf5 f0ce6bbf5bc0c286e624ece02f89a291 +part_03148_of_04320.hdf5 a62a0cf6ec7d7eda4e8e1887074cd5b6 +part_03149_of_04320.hdf5 59bfb89ad442cfb02bcd06d472a5f67b +part_03150_of_04320.hdf5 31546255ee50b4aa186b9709e49d913e +part_03151_of_04320.hdf5 9434fae1501688b38adf72afb19ba6d6 +part_03152_of_04320.hdf5 7535388a2cdeef67b8030d7b0ebccead +part_03153_of_04320.hdf5 6acd93a361c5382562e4e4e4a8a94d54 +part_03154_of_04320.hdf5 32e46c187dcd20a2a006a8c7a2fa769b +part_03155_of_04320.hdf5 6008912e41241c8ff71a98ca0ae69fc6 +part_03156_of_04320.hdf5 68111c59552a87fc9d4acbb268e3ec8a +part_03157_of_04320.hdf5 f29b1c6ff9d9c191e34d7825de40fc3a +part_03158_of_04320.hdf5 4084cdbdb12e3dd3e75f1d3867c105d9 +part_03159_of_04320.hdf5 52c13c558c2c626e2a627a97e3a91cc3 +part_03160_of_04320.hdf5 a46d325eddd26c8a196e201d1ad9cb61 +part_03161_of_04320.hdf5 72da29cba44dca229ec7a7d7871973ce +part_03162_of_04320.hdf5 b1adcf6c4c61055821d4a1da744acc7b +part_03163_of_04320.hdf5 a4a548efaf67eab59d610a7a2c36246b +part_03164_of_04320.hdf5 96ec776c61c8afc52625f49f7cfe4065 +part_03165_of_04320.hdf5 72be3e91ee9b4a32ca1f040337128200 +part_03166_of_04320.hdf5 f2392e10d218e975fd709244bff5d88e +part_03167_of_04320.hdf5 519013dc4fc0c29ee9f3f1adbc2851e4 +part_03168_of_04320.hdf5 
b6c56896757b0b2d983872aeb5b010c7 +part_03169_of_04320.hdf5 935160c6fddf9201ef9beabb44c71cc4 +part_03170_of_04320.hdf5 f2f494264dfd7912981a4620ea13cf9f +part_03171_of_04320.hdf5 7ff5341b5db7200b86615e175028e217 +part_03172_of_04320.hdf5 f2c1ed35c668578fe5d6d12c4eb33ef7 +part_03173_of_04320.hdf5 da141a0e4fdb9d9422f0c5b09da73c91 +part_03174_of_04320.hdf5 e313da22564b1741f6b95d155d210fd9 +part_03175_of_04320.hdf5 8e91fc7a4cff9e903f220bf8e89eac6a +part_03176_of_04320.hdf5 cf692bd1f34f76d163c7acc43e6f92a7 +part_03177_of_04320.hdf5 893e25ffb2954be3eb0b0c06aa41d5dd +part_03178_of_04320.hdf5 f58e320366c20320af8a2656f77800d8 +part_03179_of_04320.hdf5 403a6c2d8ae764875b2d305460f45711 +part_03180_of_04320.hdf5 d39de18ee21edd34ca057633f4e6474b +part_03181_of_04320.hdf5 d9cea76a683e23503497c7d2d619f654 +part_03182_of_04320.hdf5 af9060dcaa81f815853ba34067e8414d +part_03183_of_04320.hdf5 54ed70a0e3be9e423fab86a70907f5b6 +part_03184_of_04320.hdf5 cef795b8a433bd36d8e4ec6475e80e20 +part_03185_of_04320.hdf5 78b5ac8ec79248fdf1f306e53b5d6475 +part_03186_of_04320.hdf5 db121ee7606ad6ee3f97c9e677c046b2 +part_03187_of_04320.hdf5 20a72bea3f794b1db588a0eb1bd4288c +part_03188_of_04320.hdf5 ac8a1c4973138b6cb0a5259fa4d1500f +part_03189_of_04320.hdf5 8ebba2aac99355e398e9da283451c463 +part_03190_of_04320.hdf5 a980bef9248bee4c54f05fb05bda7e5a +part_03191_of_04320.hdf5 2059a37059debf694d101dd4cc78294e +part_03192_of_04320.hdf5 04aca7dce6d624327e700a4de1857674 +part_03193_of_04320.hdf5 687d1160bf980bf9a0f021dd6152896a +part_03194_of_04320.hdf5 5d7133f9aca3828736dcff646831aa4f +part_03195_of_04320.hdf5 f950a8b419d65a9045b881b877d5e244 +part_03196_of_04320.hdf5 b6f5f7e93c3dc710d4f057228c2e174f +part_03197_of_04320.hdf5 36e7f9141debb6326006b72044bc6ce3 +part_03198_of_04320.hdf5 bb47c8e08a4312c3cc4b1e4c17a1c2a1 +part_03199_of_04320.hdf5 7100cc9aaea78e68ace69b7d12350e9a +part_03200_of_04320.hdf5 57f3cb7de705ee1ce508da237849ba28 +part_03201_of_04320.hdf5 1a4217ca47d8658c03f88b9046063650 +part_03202_of_04320.hdf5 48ee7b4bdad536f618157b8f2076daef +part_03203_of_04320.hdf5 582425144bd649a8c4df6615729915f2 +part_03204_of_04320.hdf5 bf639ea9d0986b53c8029c97ec41240a +part_03205_of_04320.hdf5 07ab98526a1ff3625c8b7487df9fb3f4 +part_03206_of_04320.hdf5 c641eee2b02acd85f217c820b336c392 +part_03207_of_04320.hdf5 bd4b9abebb56818d16904e762e0a90d9 +part_03208_of_04320.hdf5 5842b6eb50bca669f73271d9714aa36a +part_03209_of_04320.hdf5 035d22626aede89e71906cf18725840a +part_03210_of_04320.hdf5 beb85629b623ffba67e5e2faea3a7f38 +part_03211_of_04320.hdf5 3bb7fa87a10de4cc28b851d12cce1342 +part_03212_of_04320.hdf5 b070ea3c42062b75e28a735d4730f313 +part_03213_of_04320.hdf5 2fd66e73c204d95db9b13196da4e7941 +part_03214_of_04320.hdf5 b80472e831ba1066ceab57141c7917f5 +part_03215_of_04320.hdf5 f87577a445ccc3891d742fc7cd8b796c +part_03216_of_04320.hdf5 cc579e4e6a1f982f1b305a4aef3af5c3 +part_03217_of_04320.hdf5 b8b35799fe91a534cc29021725a8ee6d +part_03218_of_04320.hdf5 e1e31ca27578634cd01272d47e0ecba2 +part_03219_of_04320.hdf5 77fbd30889ffe2cca8a26baf3177b4a8 +part_03220_of_04320.hdf5 ad85f2e4f967c787aa05baac7f7bceb9 +part_03221_of_04320.hdf5 d9941a583611b657609fb48b073f9262 +part_03222_of_04320.hdf5 e20dece573c5751e2695f212d109bebd +part_03223_of_04320.hdf5 09a9089455d49f6c025e028b839f33c1 +part_03224_of_04320.hdf5 8a6b1921bede647a53deecf1ccf123a3 +part_03225_of_04320.hdf5 095b2a7090644a8f6ec3c13e5ec256c2 +part_03226_of_04320.hdf5 8d9acc59e3737d67dda5059f2614b734 +part_03227_of_04320.hdf5 098519d7c3a672ba23e82bcb9c930336 +part_03228_of_04320.hdf5 
ff7e6931f79245ecc2a03b4ec5a32d7f +part_03229_of_04320.hdf5 559d1ad1093ad502ab3e3d0deab86e3c +part_03230_of_04320.hdf5 804a01789bba76cfb934ccd24612319c +part_03231_of_04320.hdf5 f8c6209fde4e2bf421fcc3137a244f85 +part_03232_of_04320.hdf5 6def167526c8f36981855bd373b98222 +part_03233_of_04320.hdf5 b0e61d5e917c4666d38a669a3b437308 +part_03234_of_04320.hdf5 5cd636c880ac35f4f2fa88efc24e037b +part_03235_of_04320.hdf5 6197cfc7a6246deab4d09c59b20b5710 +part_03236_of_04320.hdf5 959d90455685b1b32023a193b1acc68c +part_03237_of_04320.hdf5 f0c3e1f27faefabe9186943922219cff +part_03238_of_04320.hdf5 8a8767600967d3b27d2c63d7d3c17440 +part_03239_of_04320.hdf5 2e701eb6328718d9d9e7ab9ea52f3a7e +part_03240_of_04320.hdf5 22ed4a861eb6f3899ecef53ccb9a4bd6 +part_03241_of_04320.hdf5 63464e0a567a055c9d7012c899bc25b5 +part_03242_of_04320.hdf5 d54cfbccc4dce9f68306c7b19577a885 +part_03243_of_04320.hdf5 23465ab391fbb418615606f4061ad5db +part_03244_of_04320.hdf5 675c8122391c8e24c8fde3e502721f13 +part_03245_of_04320.hdf5 ebff0350dc9a51eab9139c50679877a5 +part_03246_of_04320.hdf5 dab03b1e916673bbf49f145812df9b79 +part_03247_of_04320.hdf5 958783ee9c78d6a6d2b47a9242916fcf +part_03248_of_04320.hdf5 a1a763265f94c69dd614d0b683f0b069 +part_03249_of_04320.hdf5 f5e0cd0f0c404e05444bec0e9b965cf7 +part_03250_of_04320.hdf5 d62b5af99ac4f2c3f1e738dfcb51bee9 +part_03251_of_04320.hdf5 55c5cbd91c16f07316a31bdcd7d1e281 +part_03252_of_04320.hdf5 024aa2c41d21bd53afcf8fcfd5788911 +part_03253_of_04320.hdf5 5ca097748c6670cc28718c4519f5b256 +part_03254_of_04320.hdf5 8e18bba1e10367d07bf6a623fefb18b3 +part_03255_of_04320.hdf5 6ba2fceaa3ac5a3ca4b5a9977cce1647 +part_03256_of_04320.hdf5 d3204666b6fd1ef34b7f5126686d03bc +part_03257_of_04320.hdf5 4bb53373a42cc3497b4028b9cd6c63b1 +part_03258_of_04320.hdf5 56649e50f85fed1cbde6b2cad06d3c00 +part_03259_of_04320.hdf5 32b343cb47098656cc0c3437c5033715 +part_03260_of_04320.hdf5 346ed478a05f96210ea1a581d2d5abbc +part_03261_of_04320.hdf5 07c30b6c70e8c7e35aa1b12d6e8c0766 +part_03262_of_04320.hdf5 671a68626c576d03ae6a01fc18cfbbfc +part_03263_of_04320.hdf5 2bca9d27b32bcb73f1c936d901124353 +part_03264_of_04320.hdf5 44afa90e4f41dd8c7afec343e21743db +part_03265_of_04320.hdf5 2dd212dd2b84d2778260b4ec004c835f +part_03266_of_04320.hdf5 1007dabfc731893f5bd6a60dc43fb525 +part_03267_of_04320.hdf5 ae4a9a7b4f4a97f7b6fadd749b99851c +part_03268_of_04320.hdf5 711e856cefc7e0135084c162aec5947d +part_03269_of_04320.hdf5 19d803ed2d7f4a874c4ae22de0f2f880 +part_03270_of_04320.hdf5 8d9e10849714b88ca35b32d0f13edf4c +part_03271_of_04320.hdf5 a27cde131b840e7c7ecb5e046a8ad8a2 +part_03272_of_04320.hdf5 2e1b718df1be6cf59a9a5906d8b7d472 +part_03273_of_04320.hdf5 d3797b0af3692a9f72466aa850c0e318 +part_03274_of_04320.hdf5 0906f91e9d8e74c43b5ed5a32db4ed68 +part_03275_of_04320.hdf5 b6863346ecd5863a23807a8758109a19 +part_03276_of_04320.hdf5 5c4ac0621d66e6fb1a5d159bb0b8c9fa +part_03277_of_04320.hdf5 252e4d988f27e0257beac5d08a484a1f +part_03278_of_04320.hdf5 0ea0f6d67d49f6593b1abf1c4b02a995 +part_03279_of_04320.hdf5 7ff6548bedf3f7aea523779e24be64dc +part_03280_of_04320.hdf5 925b89d0f7f359eab06edc7b6f962b2d +part_03281_of_04320.hdf5 4eabea6b44e8289806de062ef735ea40 +part_03282_of_04320.hdf5 b01ac331647738687b1a47e19456946f +part_03283_of_04320.hdf5 5cb36ae1441a915d5ec22707367512a1 +part_03284_of_04320.hdf5 1530e720c5e1b1d34c7e11c216bdbe11 +part_03285_of_04320.hdf5 a18a2003fdd2038c8923da6b214d6a8f +part_03286_of_04320.hdf5 f802e72723cfd2a0270fbceccfee92bc +part_03287_of_04320.hdf5 3f7cf5f7cf7db0801f2e4601cac7de81 +part_03288_of_04320.hdf5 
087a5974226a4c6a23fcc3b8a1a5075e +part_03289_of_04320.hdf5 807a2a914141b93b0ce9bb29518e2487 +part_03290_of_04320.hdf5 07e76c09f4bbc5c83dcc1c8304dcf1c6 +part_03291_of_04320.hdf5 6dcb4525567eb4ef1df7a183a3b7adb1 +part_03292_of_04320.hdf5 458222c2eb9389659477aac3fccb6c4f +part_03293_of_04320.hdf5 6333ed754c4d4aaca7f1ea18be232793 +part_03294_of_04320.hdf5 04f16db285fa292c1e544f8190909bca +part_03295_of_04320.hdf5 6c70b1e037a3cddc81b6f4bdb8dc01be +part_03296_of_04320.hdf5 686e37acd971f55c285f5e039a66cf70 +part_03297_of_04320.hdf5 61ca38f81a0d5771e0a808c7254759fe +part_03298_of_04320.hdf5 1a09f92d1cec20dad3169f5e5297bbbb +part_03299_of_04320.hdf5 a8554a3065445e4e7ee418169ec3d3c0 +part_03300_of_04320.hdf5 88ecbf2402efb2269c119c8f5f3b8426 +part_03301_of_04320.hdf5 6e9945b53c84f42a9a912766eb14eb1a +part_03302_of_04320.hdf5 d92fc588caff1368f04e38d48c4486d8 +part_03303_of_04320.hdf5 ecdcb1f532bd59826e27243e309eb53a +part_03304_of_04320.hdf5 4e0021bd0c073b69e52ef49562b89935 +part_03305_of_04320.hdf5 07f7c536e23739fdc31979e29bd6047f +part_03306_of_04320.hdf5 027592bbdd59fce959b2ecdbc8577c25 +part_03307_of_04320.hdf5 8dfce5b3812fc4a15f5164f80bef4c5e +part_03308_of_04320.hdf5 ece94af43dfc8b214b4d051a9fb7332e +part_03309_of_04320.hdf5 5cd8ddba5fc448ae91fdee53ad06ce21 +part_03310_of_04320.hdf5 e84aa4150b2fcd1c6a9de25bdceeea96 +part_03311_of_04320.hdf5 64e399a0183279aa58d95bc2ba9a8cba +part_03312_of_04320.hdf5 b873110ad6e9d3efd0f90ce5e9af8e47 +part_03313_of_04320.hdf5 ce2d943a08622fef0af6b63ec96b97cc +part_03314_of_04320.hdf5 ace0cdb927f80361c1d9c0bbd8ad0b06 +part_03315_of_04320.hdf5 a32d6eb7ae147f48c80c7dc3d322ef22 +part_03316_of_04320.hdf5 4d964ae50fe6674bd206799368a7236c +part_03317_of_04320.hdf5 ca0c985f317591043b1f18014e8165e2 +part_03318_of_04320.hdf5 acff857de937f1108c279d620895e855 +part_03319_of_04320.hdf5 385ddc1204f047464c82c75275f381d8 +part_03320_of_04320.hdf5 f56bea2b1b03846c4292f65106acbb7c +part_03321_of_04320.hdf5 9f08ae363ae9cd4d2abfd1431cb1cc85 +part_03322_of_04320.hdf5 571311f83a28e84e38b57a4fae415531 +part_03323_of_04320.hdf5 52a331cbd365b366643edf04106c2a7d +part_03324_of_04320.hdf5 4add9fd4851b3c48b07c469218b35c46 +part_03325_of_04320.hdf5 e0956b512df61f6b60a12357bc43773e +part_03326_of_04320.hdf5 62ff3975ebfdfb47f1cf91659e0e7b08 +part_03327_of_04320.hdf5 f41f4ebabfe4c661d363e9697eb471a1 +part_03328_of_04320.hdf5 dc7413ddad775ab669caaaba29409ced +part_03329_of_04320.hdf5 11abadd7823d3dad130cc32a53bcdda9 +part_03330_of_04320.hdf5 3924ebbd234b6d0b8f4a199cd10e34e8 +part_03331_of_04320.hdf5 6ffab846a77d90e37135493f1448eaf1 +part_03332_of_04320.hdf5 6f8db0af08b839adced3f7ef64b9cbf1 +part_03333_of_04320.hdf5 ec0234bfc9a37f581b192a7a886bf3e7 +part_03334_of_04320.hdf5 dab2f604d7dd278079b2170ec2d9e880 +part_03335_of_04320.hdf5 f31c8a701810bf3854770f6906986f03 +part_03336_of_04320.hdf5 553286b2bbef1a7f8a0581db6975e837 +part_03337_of_04320.hdf5 ab6fe8306aa1d301d62cbcd4e35df117 +part_03338_of_04320.hdf5 07b8b245408285fefc753307b75f81c2 +part_03339_of_04320.hdf5 2bef8d13d4e5c6e765e4ee085674a701 +part_03340_of_04320.hdf5 3ef0c8462205840d90d6cfbbfd9b9b56 +part_03341_of_04320.hdf5 011b84c9cd845688cc3028c39c378408 +part_03342_of_04320.hdf5 ce46feaa09bb122f3a4eab8fb29d7d2f +part_03343_of_04320.hdf5 ac3572664688f76ac8aba1d2dc785f03 +part_03344_of_04320.hdf5 7e9c4bd4826e3231b8b3d39f6be22651 +part_03345_of_04320.hdf5 aa2e45f324d99942d4830994c42c58ee +part_03346_of_04320.hdf5 303ea924550fdbea58f7502d1c58818a +part_03347_of_04320.hdf5 10d1e2996d4f475168e95f94dd34da6f +part_03348_of_04320.hdf5 
e47fb46fd22c3a037873d41029eb5a92 +part_03349_of_04320.hdf5 f1743f9770183780aee63d778222be53 +part_03350_of_04320.hdf5 caebf3f57b24be2b6548deeea0dd7ce1 +part_03351_of_04320.hdf5 83a4733dc1d989ab069fbb6057f08ad0 +part_03352_of_04320.hdf5 136fb11d14e383a088bfc604f0edd77c +part_03353_of_04320.hdf5 6df68619bc9c416dadb6b4af5ffccc2f +part_03354_of_04320.hdf5 efc4638d95e6c7884053cfb727e0a60f +part_03355_of_04320.hdf5 73b99469fdef13c751d1ec99535c0318 +part_03356_of_04320.hdf5 df3ff813b74582721280b1653c189adc +part_03357_of_04320.hdf5 92bd8a5c2d3f71983e720c9e94cd4a1f +part_03358_of_04320.hdf5 9fa7049d69e84f1ea1fa0ca64701fbba +part_03359_of_04320.hdf5 2f9de475dd7e2a5a9ccd67e0d9d58103 +part_03360_of_04320.hdf5 c8fbc97bb533b8eb42c88c2c946d5c21 +part_03361_of_04320.hdf5 95ccd410e3d718f614bb8f4565b3380a +part_03362_of_04320.hdf5 bb172d898d8185c5e1c891477070abc8 +part_03363_of_04320.hdf5 8bc9e57239d1b8fa5065f96de668cfdb +part_03364_of_04320.hdf5 a1fb43be67cc4fc6f2b1244341820be1 +part_03365_of_04320.hdf5 a1fc47448723bfa79dd15d2604a8c6db +part_03366_of_04320.hdf5 7cbf11fe9dd73d5cfd09496e776117a4 +part_03367_of_04320.hdf5 ff324196fd3002cadcc3fd382c0f50a8 +part_03368_of_04320.hdf5 071f1d813d2c016cac363dcf0c02cdcc +part_03369_of_04320.hdf5 9d029a28f8b88f1949a9d43ee0869645 +part_03370_of_04320.hdf5 0b4b648346a99a85fc58d89fcb86b38e +part_03371_of_04320.hdf5 d1000a4b1d10993036f957bf06ac7511 +part_03372_of_04320.hdf5 5cfd81e9bd3dac82f1c90046b7820ee5 +part_03373_of_04320.hdf5 ad0142827488f103e6fa62d3273095a7 +part_03374_of_04320.hdf5 ca3b01cbb236a526de60e8615c9540b1 +part_03375_of_04320.hdf5 dc8e89ada0ac4a04b04d970d134b2c0c +part_03376_of_04320.hdf5 31f8ace477771ea68a3acb7bb4ceaf24 +part_03377_of_04320.hdf5 1c51673cdc55fa2b8e54ca9698833a0b +part_03378_of_04320.hdf5 34ad443ab45114cf28da41738bc5cb07 +part_03379_of_04320.hdf5 dcb0ff23ec7244f8ae767ec1255fa5fa +part_03380_of_04320.hdf5 394e348939ebaadd61d05a7cc7f35a0c +part_03381_of_04320.hdf5 67d6cd7c2d19066b6dd7f0b5594a0cfe +part_03382_of_04320.hdf5 43a7f0d1a561cca99d5227af8fbbeb8c +part_03383_of_04320.hdf5 586a4b5095255cfb515255ef5973c5ca +part_03384_of_04320.hdf5 c47b4e563ee909efd79087c44c04793f +part_03385_of_04320.hdf5 d98a0182ea054ff145d5442ac3a7065a +part_03386_of_04320.hdf5 3d2eef467c68b184d9c6b82f3a52603a +part_03387_of_04320.hdf5 8b47e2f3f3ee2bbe4462dac6dbd5b4ab +part_03388_of_04320.hdf5 89f622f73034c7008e9316a9a550132b +part_03389_of_04320.hdf5 dd6ada0336bbf2174f363a8495c8eb97 +part_03390_of_04320.hdf5 f8b5cfdcbcfbc857008a807a7534cc15 +part_03391_of_04320.hdf5 5a173d94cfacc56b10a67c1f26fe1e7a +part_03392_of_04320.hdf5 e3bcba06ad295608530c177e5269b467 +part_03393_of_04320.hdf5 d6ef57a52378c910f041f9a6cacb5700 +part_03394_of_04320.hdf5 422f5f804eee6fd658167b4874b26f5e +part_03395_of_04320.hdf5 b14b97bdf2110fb09a099e39ab88f242 +part_03396_of_04320.hdf5 dd5e18014a6027e275f7467c791ed164 +part_03397_of_04320.hdf5 7cbaa17c21045e0ebe48af8f0796cc1a +part_03398_of_04320.hdf5 152a7aa951e59ad08409898615031fab +part_03399_of_04320.hdf5 78f3f49795dcdc9742d9c2fe1f25dd76 +part_03400_of_04320.hdf5 3d16066d6004046e57dd2c085e020bb0 +part_03401_of_04320.hdf5 b61dfcc0f03d077e147ea721e38731b2 +part_03402_of_04320.hdf5 e23b4d9efcbfa0132a6b1513e0447dfc +part_03403_of_04320.hdf5 7e1a486550f0fa504817de1d8614c20e +part_03404_of_04320.hdf5 173819da4d476a3c00c823cf7aa2465b +part_03405_of_04320.hdf5 3634b613d6ff13cea6db8b1f5ff0c64c +part_03406_of_04320.hdf5 0862459bcd505d77dcb3e912370a4206 +part_03407_of_04320.hdf5 68ee8d6632f6c232623cd64c500e6ca1 +part_03408_of_04320.hdf5 
8298ccc49c64ffcfde8c81eefb6cc7bc +part_03409_of_04320.hdf5 7725f5b39c24444acb748da679e48934 +part_03410_of_04320.hdf5 d9a3887038d974295592c655c968b7c6 +part_03411_of_04320.hdf5 b7e109a28be530cedde347eacdb94286 +part_03412_of_04320.hdf5 f327760effb983415283d0bdfce99167 +part_03413_of_04320.hdf5 de29350b4eada270ce0fea81a43e2b8d +part_03414_of_04320.hdf5 d3c9a294627f4f09f62ed5786ece0faf +part_03415_of_04320.hdf5 3690b806e1da020b7939373adc1e3a33 +part_03416_of_04320.hdf5 2ba6d64d104ea4473c40b5717665f37c +part_03417_of_04320.hdf5 af04b2045bf409ac0e042d926d84d60d +part_03418_of_04320.hdf5 7bd81e51b20ebde22d8d9170a5f2a4f1 +part_03419_of_04320.hdf5 2bd5b29626c21fd01abc2aa24a8baeb3 +part_03420_of_04320.hdf5 b7ebf33e6656dc65e3322c5994e5f7e2 +part_03421_of_04320.hdf5 bc14c26e3814117b0614c92ea5b6de2e +part_03422_of_04320.hdf5 675c739a40c0531afe59d0fe193333ef +part_03423_of_04320.hdf5 0b9c5ce2ac2882777084347fa3e45897 +part_03424_of_04320.hdf5 f2d26fe38ff2e5f9210799553f5a4004 +part_03425_of_04320.hdf5 f35cf6ccd3fd60d3517e297464cbaa88 +part_03426_of_04320.hdf5 b5d1869a59c7570dc4a65130c836ffe5 +part_03427_of_04320.hdf5 551b804fc5b055f136559c70c125553d +part_03428_of_04320.hdf5 0785418bfaaf8a959871244604c0f3c2 +part_03429_of_04320.hdf5 d562cd3e0f995a35443d65959f9a4b23 +part_03430_of_04320.hdf5 3ab80aad7ade1ebbaffa64f9b1237f4c +part_03431_of_04320.hdf5 443273787a30723faca0f489d89c6223 +part_03432_of_04320.hdf5 512cdc86d445254cdda5ee8e7adf77e7 +part_03433_of_04320.hdf5 94dcf3a0482e618d9b584dacff8be140 +part_03434_of_04320.hdf5 130ccecb2ee9108b3219a942558f8bab +part_03435_of_04320.hdf5 8050cd27806bc3d9f379de655e51d173 +part_03436_of_04320.hdf5 288239e942e894ec37f10079da6809d4 +part_03437_of_04320.hdf5 5631b52af4af1f4a9cbab9ad8a7202e0 +part_03438_of_04320.hdf5 8384b6a82f987d977e3e0abed48639fa +part_03439_of_04320.hdf5 1fcdafcd82efb3661f4d00fcd1490088 +part_03440_of_04320.hdf5 29efe102a3d4e93c95225aa41c6aed51 +part_03441_of_04320.hdf5 872fe08f588040579959500b92117d62 +part_03442_of_04320.hdf5 2d5cc8dbeeae30ebc13909bd25664f78 +part_03443_of_04320.hdf5 f0b216cab42bd7276ad1dded7eb25e09 +part_03444_of_04320.hdf5 e73a924b6a6a579a84ec0acb086c1aa2 +part_03445_of_04320.hdf5 036a46914d5f8742f50d8535a5c1bbc8 +part_03446_of_04320.hdf5 dbade371aa4e8ac8269686af98abe8e1 +part_03447_of_04320.hdf5 0735b1764f96aaa17570cf7d32756221 +part_03448_of_04320.hdf5 fc7a8a9f9d1ed5c3d9a28216be57587f +part_03449_of_04320.hdf5 737a2ed19020a38f6edf5c3230e0ccc7 +part_03450_of_04320.hdf5 8438c2f4cf2e9e0f5c69f43002e94181 +part_03451_of_04320.hdf5 ab34ea161d869ab00303074028ac53c1 +part_03452_of_04320.hdf5 04d65a0399451d009cd92ce693a6335f +part_03453_of_04320.hdf5 f17d0e35dcb1288cfb6ec59bf9e351ed +part_03454_of_04320.hdf5 595ad109cf913238a53ced6d14dfc8a3 +part_03455_of_04320.hdf5 9676b2de284df99479213affd3bfb0ba +part_03456_of_04320.hdf5 b10ef639bb63ad025f0920326dad3529 +part_03457_of_04320.hdf5 7b1b5418410e87ba4d530ac68029856a +part_03458_of_04320.hdf5 cd21b7be9c77e59a045ea931f5a2eff1 +part_03459_of_04320.hdf5 e7e2c9acd972c92dd80eea19c401c387 +part_03460_of_04320.hdf5 8c7d6a79a42459a0ab629193ff3aba64 +part_03461_of_04320.hdf5 35f962fc1a6262494425af2cdaec6b33 +part_03462_of_04320.hdf5 647e5def2dc349e187cd89b84db88f35 +part_03463_of_04320.hdf5 d3cb09bad476413f20e60eeff50d3b60 +part_03464_of_04320.hdf5 1ec6d75f3ba5c5dbe02a9adc3a1d6b02 +part_03465_of_04320.hdf5 914ceb97c52b34b5cf89aba9ada59aec +part_03466_of_04320.hdf5 0ebcac9f377e7ad513f298371e68d7aa +part_03467_of_04320.hdf5 ddb835e40645fb6ff0997ee98e547a61 +part_03468_of_04320.hdf5 
df02c23da4591d351605cf7f68d9ad12 +part_03469_of_04320.hdf5 6cff36cf52ce25792704e424cf86c6b9 +part_03470_of_04320.hdf5 d99f20712224b20cd5ec2c14b8c74421 +part_03471_of_04320.hdf5 33b896be50b4261c227616847b2bdd1e +part_03472_of_04320.hdf5 ccc496edd617aff09334b60edf1314f6 +part_03473_of_04320.hdf5 a8e249e465096ddd1d91862cf740032d +part_03474_of_04320.hdf5 2d76aff9bb4f18caa8b3409f7b784819 +part_03475_of_04320.hdf5 c7f581a2b2d67e094f64b05ceb14079b +part_03476_of_04320.hdf5 406e8f8d2c2d73bc71beae2193d850dd +part_03477_of_04320.hdf5 775b52a48c489f2c12e83b9b543a8ac4 +part_03478_of_04320.hdf5 38b0ee27c5abb6a7e00b9744e468a8ad +part_03479_of_04320.hdf5 ed8c44b00307618797d6bd4d8b9ae26b +part_03480_of_04320.hdf5 d7f05bfd7f57e5ae121a3dca8511178a +part_03481_of_04320.hdf5 6d298f9efb0f46f77939923b128403e4 +part_03482_of_04320.hdf5 bfbb93b9e006ba66248b1144024fdaee +part_03483_of_04320.hdf5 a5c5c2dece992644bf5549217530ced5 +part_03484_of_04320.hdf5 d29cb1c4b130032a86fb649581b2b7c0 +part_03485_of_04320.hdf5 08a6ae1201faa6cb7797ab10b47a92e7 +part_03486_of_04320.hdf5 1cab9fc2c1408bca4ed36b6b21d5af48 +part_03487_of_04320.hdf5 9401c5dad62ebc5dce33864b0a644610 +part_03488_of_04320.hdf5 645b9c6fc23208ff5b1e16eca7fb84a3 +part_03489_of_04320.hdf5 844f662aaff8c2bcfc1ce6bbdfbdaf92 +part_03490_of_04320.hdf5 a96f897b268e4fa84764a36ed9dfaed3 +part_03491_of_04320.hdf5 b8588aa7f3336f5fee52ed8502f78417 +part_03492_of_04320.hdf5 9024befe7d76d00d6120baad9545ea2d +part_03493_of_04320.hdf5 622a22b514d420116da819e22df78b2b +part_03494_of_04320.hdf5 f2083e5b56b80a6b9e6dbe99ddf4859c +part_03495_of_04320.hdf5 fea37bd1b108e1ac9400deb7241e0df0 +part_03496_of_04320.hdf5 1729b9ed3fdfce6c36568e356806525c +part_03497_of_04320.hdf5 78118cbbfa858853a9f0cf3d3668d308 +part_03498_of_04320.hdf5 97fa8050e9dd11256f4b19ee506b1558 +part_03499_of_04320.hdf5 e263a0da7a83922646062649d61ec6a3 +part_03500_of_04320.hdf5 9ce70b95f5250be6b3c1a328dbdd7bd0 +part_03501_of_04320.hdf5 9b9350b72902ce2a24e2d3e6887b94d7 +part_03502_of_04320.hdf5 abe73a538bcee40a11c97cd8a5d76317 +part_03503_of_04320.hdf5 c62949165759395dc2a4af5e7194f690 +part_03504_of_04320.hdf5 cba47938c25c9cca32696aeb57009862 +part_03505_of_04320.hdf5 2d1297263c6b296a8520d808f8f3c5ca +part_03506_of_04320.hdf5 44b9ebeab326ad8ab0a79867488adaa5 +part_03507_of_04320.hdf5 4cf827b3ae3d2b3870602e764d095de8 +part_03508_of_04320.hdf5 9b8cee4dbaa57ea3403f04f5a32eee2d +part_03509_of_04320.hdf5 3dbacc11401ddb291c66ccb4b9ff683d +part_03510_of_04320.hdf5 df8b5185b5dfc69c3c0d0a5c8861749a +part_03511_of_04320.hdf5 3845aa9c41c934695a57490737c6fb41 +part_03512_of_04320.hdf5 fcd9791f428d9b01d9a4cd6f925f77f4 +part_03513_of_04320.hdf5 5a3eade56192e5bbfe375c5c8d6c6d6e +part_03514_of_04320.hdf5 4da2f1b51d934295ac036902c5cf54ad +part_03515_of_04320.hdf5 2ce2334d19c473db5dd6e897d599e1d6 +part_03516_of_04320.hdf5 bb6515d4e13e23e1fc9770246e2e0698 +part_03517_of_04320.hdf5 c03f32138de7d1369c82e95228914220 +part_03518_of_04320.hdf5 b337a34e4e910585bfb0650b5382eebe +part_03519_of_04320.hdf5 89631da8cd0752d532b0ee4e0861943f +part_03520_of_04320.hdf5 76dffe5009296276de3398ced80c4c76 +part_03521_of_04320.hdf5 37341962abe3afb990ed2b26ba75ee0b +part_03522_of_04320.hdf5 be38182bcd6832ca109a94c69e0609ef +part_03523_of_04320.hdf5 3edc123745bf3ac893c46b112f66a258 +part_03524_of_04320.hdf5 d3ea12b78ab6f7c7a5c1b9a85936816c +part_03525_of_04320.hdf5 fedb0184ee4b3f17da96eec27a54cbc6 +part_03526_of_04320.hdf5 edc815080a7109215bc519477933a089 +part_03527_of_04320.hdf5 70b792d0f59a9da56e1278f12f360ad0 +part_03528_of_04320.hdf5 
16c68c195c739329f11d5101d5440dca +part_03529_of_04320.hdf5 b8a1c52baac42f2dda3a98d45ee4c19a +part_03530_of_04320.hdf5 267aabc9628c37550210c8b4cb33199d +part_03531_of_04320.hdf5 d54be980c9571904ee123fd29cd4b90e +part_03532_of_04320.hdf5 708f7ec39faf64c5d51d7a894d539ba3 +part_03533_of_04320.hdf5 de4a7360ad53d1fca06af5a7fd46982f +part_03534_of_04320.hdf5 948527b31de525045397963e5ff5fe5e +part_03535_of_04320.hdf5 3ff80d666ee65958a6d48855cdec612e +part_03536_of_04320.hdf5 121ed39d5255f17b8404d3713787ddcc +part_03537_of_04320.hdf5 ab036ba4c4b742adc4c34b28d599a26d +part_03538_of_04320.hdf5 5e4be3879c088f99ad35f44a9aa0ca72 +part_03539_of_04320.hdf5 25c692f729d5e880344b61f07685310e +part_03540_of_04320.hdf5 344eef0bf22fc242d3402b63d78a25b9 +part_03541_of_04320.hdf5 5904e69c4036ad6c19de36aee2d584b3 +part_03542_of_04320.hdf5 49337d1378ebe7d55663770b23526811 +part_03543_of_04320.hdf5 a0e4e110fe13fc53ebdb16b10cdedd12 +part_03544_of_04320.hdf5 5cb48dd603972efe019a865be1e6dfd1 +part_03545_of_04320.hdf5 f725bef0a65517fe37a97f5be29e04ff +part_03546_of_04320.hdf5 ea443e6a530e84543de8ca0ce14d9c48 +part_03547_of_04320.hdf5 012ee852d7c76fef1c968a01a411fdd2 +part_03548_of_04320.hdf5 517a47144f49ff825ed5456be856a648 +part_03549_of_04320.hdf5 21f5a109b138493de4850782e95f1ce1 +part_03550_of_04320.hdf5 5dff3774b97272dcbbf3a510856c572e +part_03551_of_04320.hdf5 b50390584dbd6f5c5f4544586c3ab1c5 +part_03552_of_04320.hdf5 02f6a96a4462470b0d69d522df82bce2 +part_03553_of_04320.hdf5 5d8b7818e424af70f6a642fdfda3fc7f +part_03554_of_04320.hdf5 7e6c0332f9f65e3ff043a9e54e913c0d +part_03555_of_04320.hdf5 68276b004932a155f68f2fa15aa4b9da +part_03556_of_04320.hdf5 f1e754cc8e010547fbc53015d980443f +part_03557_of_04320.hdf5 2ca5cce30c46a04ae8490fde18bf2ff6 +part_03558_of_04320.hdf5 89a9fc421ed805629881662bcd082953 +part_03559_of_04320.hdf5 2c52cb483bee47d4b81994564863f2a6 +part_03560_of_04320.hdf5 281623b8301f1b53fbd5eb174b3e378f +part_03561_of_04320.hdf5 de73a6177f9b6a29a7f31b4b1d34e4f5 +part_03562_of_04320.hdf5 370eb0e5f33aa3538c04df2f268138f7 +part_03563_of_04320.hdf5 37d693926dff66f1f5e977b76fe22cb3 +part_03564_of_04320.hdf5 0d583fb492078f3f313c815783d18245 +part_03565_of_04320.hdf5 5f3670b5672843ee99e97a20d6b57574 +part_03566_of_04320.hdf5 fe6dde02691cb228fce1ec2e21b7b2f0 +part_03567_of_04320.hdf5 199c0c7548ed989724ecef4af24481a4 +part_03568_of_04320.hdf5 6e69f1c6241be5cc0931fb6d8985f2bb +part_03569_of_04320.hdf5 e866fc10ad7eafa9e312ed69f208c09e +part_03570_of_04320.hdf5 d8c9513b16ee5a689e3ab8a605295335 +part_03571_of_04320.hdf5 289575d029d2740cf0d8afcd1df65b74 +part_03572_of_04320.hdf5 9b4d0dca61d274a465b1774de1555ebb +part_03573_of_04320.hdf5 84672e428dd5d55f662484c8fd7071c8 +part_03574_of_04320.hdf5 ffb81a719e59e3884862f5fa27780545 +part_03575_of_04320.hdf5 70b85ea11e51a2aa5a7168f0ef412053 +part_03576_of_04320.hdf5 3e422446ce00278adcff6ecbb2dcc76f +part_03577_of_04320.hdf5 60967de2464fbbc4438edbeb31c7f6bd +part_03578_of_04320.hdf5 1bff90d48b93595d4926cad9f7c59566 +part_03579_of_04320.hdf5 cfd4167a58687d2704dd310062c60f52 +part_03580_of_04320.hdf5 ddfbf08239bc2a2d3d02d0c79d4d7622 +part_03581_of_04320.hdf5 b1a475f1bee01337ac9f5cf0e7f380ab +part_03582_of_04320.hdf5 8da75d86ba3a943ef218b0634967ca7c +part_03583_of_04320.hdf5 eb2e0b8eb8a3112cce8f6e2aabf3c3e1 +part_03584_of_04320.hdf5 1027dbb5eeed83b9541398732c27dd05 +part_03585_of_04320.hdf5 dcd67884686b8223da670538dd7f426b +part_03586_of_04320.hdf5 f6d2ccfdeba1d2565eb373aeacb1b128 +part_03587_of_04320.hdf5 f5348b4f6e0c4b6d683da35a4a42dc1b +part_03588_of_04320.hdf5 
685937d126e2bf95fe61604cd6b19fc9 +part_03589_of_04320.hdf5 66f3709619654874119aa4ca0630caa4 +part_03590_of_04320.hdf5 ba7033ea73ca4ef4f6ba6fb0bdff9198 +part_03591_of_04320.hdf5 08b6ca7a72256e2583a81d2f0004786a +part_03592_of_04320.hdf5 6d57b75970dfb2e3a771358d77141cce +part_03593_of_04320.hdf5 109e9c9272ceae5b7bae12e3cbc2fb3f +part_03594_of_04320.hdf5 f81456145d561aaa3a69f633af53bd6d +part_03595_of_04320.hdf5 99ae865cf21731ae3171ecba55f6abe0 +part_03596_of_04320.hdf5 83dc72592f7b69b3b5a12bf113fe96ac +part_03597_of_04320.hdf5 ee7ac3466590a5f216678230890aab94 +part_03598_of_04320.hdf5 914db9f0a4d6c78ba86936c3fccfe23e +part_03599_of_04320.hdf5 d7f1b833f9d24c9f37124173d0301186 +part_03600_of_04320.hdf5 7af074268021ce5fc5c6f2c37e6758a4 +part_03601_of_04320.hdf5 794f4476fc92598881981ee1469e8b84 +part_03602_of_04320.hdf5 02695523004475e8007686a251ef6d12 +part_03603_of_04320.hdf5 ddc3a1534063fb90688bebc0d30658f4 +part_03604_of_04320.hdf5 92b5125f35c2cbb27a1f40fcd4c34d87 +part_03605_of_04320.hdf5 a8c223f20bf34438ce5c4959f0cd785a +part_03606_of_04320.hdf5 c53bca2e6a4cf61eb484635554eef22f +part_03607_of_04320.hdf5 ada9276a8119d1eec317f2afded194cc +part_03608_of_04320.hdf5 60434a76b86f10f3e5ec5c6bef5e16cd +part_03609_of_04320.hdf5 b6e92e63ea077dc2eaec094e8f739cbb +part_03610_of_04320.hdf5 4ba507ef42fe8712285387d6c1cfdc9f +part_03611_of_04320.hdf5 df1e55c22952b017ceb330148592694f +part_03612_of_04320.hdf5 c8dff6ac6fba7a1cd7be3114fdf124bd +part_03613_of_04320.hdf5 f9d344f13cb9344a8d6b6c595c41a4bc +part_03614_of_04320.hdf5 e043c23c553f1679e8a6c10f4e4e91bf +part_03615_of_04320.hdf5 97869b24874cbbdfc390ee3e815f8059 +part_03616_of_04320.hdf5 41bf8ad4188b2705fba31e592d4085fb +part_03617_of_04320.hdf5 c71af35c562136d5e7399c7881d14fac +part_03618_of_04320.hdf5 0b204b89d93e67c374903e35619b38ad +part_03619_of_04320.hdf5 7d4a5ed0a6734764487cabd143409b22 +part_03620_of_04320.hdf5 2512590c77b589c08bc4d3627a11813f +part_03621_of_04320.hdf5 ee68e2e10f3f3133011577fc8dabbc81 +part_03622_of_04320.hdf5 8c969847f6a7c543ac641961a04ec3b3 +part_03623_of_04320.hdf5 f6d7c681b0d3cbe668e069cc50160026 +part_03624_of_04320.hdf5 52537d4969ca2a426ac4c470742e7559 +part_03625_of_04320.hdf5 b98b3b6daf12b8995f284835577f8c1d +part_03626_of_04320.hdf5 d5dacbfdaaec9a905fe17960af3a0ad8 +part_03627_of_04320.hdf5 7fc9fa94915d6fd0f9b11762d39c6cb8 +part_03628_of_04320.hdf5 f50bfd170baab70ec36aaa185e37d802 +part_03629_of_04320.hdf5 4cd699f810c9ca9106cf50991b444368 +part_03630_of_04320.hdf5 d426ad1e64c8f18ba1877a6095e2745c +part_03631_of_04320.hdf5 145eeec5c289fbf881cd0c4cf9d85d39 +part_03632_of_04320.hdf5 c37889e37ec021e261308bb1e0298f9f +part_03633_of_04320.hdf5 00235d6aba14013766ab634f1647e06c +part_03634_of_04320.hdf5 2036d2cee866f3bad4aadc8923c0c9ea +part_03635_of_04320.hdf5 52fb6c8431359d54decc24314f3ff937 +part_03636_of_04320.hdf5 60946e1222fa599ca06af426af45e7ba +part_03637_of_04320.hdf5 a05864e5a34af785f925cf224ca505a8 +part_03638_of_04320.hdf5 f4c574c28f9aeae8cc9ca1df851d304a +part_03639_of_04320.hdf5 1c5e8f97cdffe581fb369b874f55c6ae +part_03640_of_04320.hdf5 68f18d300ffb9bbc2cedea30f99e501e +part_03641_of_04320.hdf5 7f771ccd2abacf0635942e5b4e12a7f2 +part_03642_of_04320.hdf5 a07289b6c7db22a51b277fca0ac993b9 +part_03643_of_04320.hdf5 80769358ea512658f1edbc48f6cf8475 +part_03644_of_04320.hdf5 2c8b254ffa01e0087a77ad7a3420840e +part_03645_of_04320.hdf5 221346cb54c344f5129e00b15925fe18 +part_03646_of_04320.hdf5 203c46fe4e19e86cd19edb788515195c +part_03647_of_04320.hdf5 2d7cc129d0f57f3ba802c64c1748a3a6 +part_03648_of_04320.hdf5 
0528681d1e4b37b9ba4b424e6a9ad5fa +part_03649_of_04320.hdf5 41619f0b8f43875f755ba873d130171b +part_03650_of_04320.hdf5 8b23e3df40c0a92c846b8810babe1d6a +part_03651_of_04320.hdf5 1606e3d90257cf8a5cf2e2b37116312f +part_03652_of_04320.hdf5 ffa5105faf180dffb94da55bd7cd8145 +part_03653_of_04320.hdf5 a53c4ba3d67618bb00f944f0f9422f85 +part_03654_of_04320.hdf5 929f5971e37bcebd099f99fc187fc68a +part_03655_of_04320.hdf5 fcf9bb85b9163fa608ad34f674b7a3c1 +part_03656_of_04320.hdf5 ce552d47e7a0e9ebb03622c201af1d50 +part_03657_of_04320.hdf5 ea48a98b5af0ffec46479b8f600bc28a +part_03658_of_04320.hdf5 a4e0f9779d7fb40cf3ab11208d4c15b2 +part_03659_of_04320.hdf5 e7dbcd5deefbb66a9efcb8dd298ea122 +part_03660_of_04320.hdf5 2c2f3e2505f07cb9f0610b80d3c509bc +part_03661_of_04320.hdf5 dcf922bd4c183a06175756793b5846be +part_03662_of_04320.hdf5 9d069f14f8f11293307166b21d27907e +part_03663_of_04320.hdf5 2faf02f3ad7db179246b7e6d9803a9f3 +part_03664_of_04320.hdf5 1c7e51e2efa8e85285c65af8dbf05723 +part_03665_of_04320.hdf5 513fe580a8e763fa1f71b4fe4cd27989 +part_03666_of_04320.hdf5 b336adc1b3463f42a414d7a1a92efcaa +part_03667_of_04320.hdf5 d86a1c3173224ae9133a5b268d66106a +part_03668_of_04320.hdf5 86c05a5650e3e24c6e98e1febb68d8db +part_03669_of_04320.hdf5 30b2ffba180121c0ea6d9003d327dae8 +part_03670_of_04320.hdf5 7a3914a176354d3f3b6dc55993dcf7cd +part_03671_of_04320.hdf5 0269d64848ea80a5729f23d1d322c5bc +part_03672_of_04320.hdf5 49da20fc8dc1bce8c4f421fec2c9bac7 +part_03673_of_04320.hdf5 ecc8d19a04abd33c0c759e8647a541c3 +part_03674_of_04320.hdf5 380f032d71b9b59f60e03b5c3e6085df +part_03675_of_04320.hdf5 bf05c1e22d5dd116273ae4076fcd7db5 +part_03676_of_04320.hdf5 6f770553616427a65e6508aeb8825363 +part_03677_of_04320.hdf5 0ec86c46e99f49abf3cd9997e8f15e3e +part_03678_of_04320.hdf5 611cea87e5fd37142c264e11a4bb71c2 +part_03679_of_04320.hdf5 127d2b0c9082ea8f34535649d61ecfc7 +part_03680_of_04320.hdf5 57e70bbc01e2a433b10f966a5c6c0b04 +part_03681_of_04320.hdf5 7ccb8d7adc7ca191ab8bd3c93d31cba6 +part_03682_of_04320.hdf5 2f17091941a59e3e00c55af96926763e +part_03683_of_04320.hdf5 4bd06d971f507e77bd06849da2bd1c7a +part_03684_of_04320.hdf5 08011e319bf4a56eb69306c210917aad +part_03685_of_04320.hdf5 199ad3e2ce8c79674c33f99bbe95e183 +part_03686_of_04320.hdf5 9989bc17e61fa3f16e5b045bd7a0080d +part_03687_of_04320.hdf5 57e516bc46b8fe2fd5dd83ce9aad8cf0 +part_03688_of_04320.hdf5 89bbcf1f4798f68b9699b2a05abc09a4 +part_03689_of_04320.hdf5 19d52fde058c6ebc7c0b51494701b4b7 +part_03690_of_04320.hdf5 c82c014da160ca5747e8171ff84967db +part_03691_of_04320.hdf5 ac8961057c509d4c4127b9580c58a434 +part_03692_of_04320.hdf5 cdf3018037a86443b3fa013beee13e78 +part_03693_of_04320.hdf5 6d319bf0fd60d6eb0f00605bf5b6de9b +part_03694_of_04320.hdf5 9e1a6684d502094ffc15d7582e7ccc54 +part_03695_of_04320.hdf5 c9bbae56a78374c17f904ad9f95a2736 +part_03696_of_04320.hdf5 07af4ee85133bc897254d6b3c9cbab23 +part_03697_of_04320.hdf5 5d1713b5832e35fbc04fe7b7b62f86b6 +part_03698_of_04320.hdf5 b1262d14e3a865194828f396ff5c5277 +part_03699_of_04320.hdf5 f0aec2c9f136c94dee40f39d573881d6 +part_03700_of_04320.hdf5 573e18c4dc4179edb6c27ff78ff2857c +part_03701_of_04320.hdf5 5127f658c2aa409cc8125291124154b0 +part_03702_of_04320.hdf5 eb104c3c491de4353f9312d95c50f9ab +part_03703_of_04320.hdf5 f3f521eb79c60fb2b082b560284711b9 +part_03704_of_04320.hdf5 8a02362153a6bb53677812357fa3bede +part_03705_of_04320.hdf5 e3d9354b3f37555e340972c5eaba3758 +part_03706_of_04320.hdf5 f25845941f84e20832b3ff2bab448d15 +part_03707_of_04320.hdf5 5bc22478bded4af90588279bb765d056 +part_03708_of_04320.hdf5 
7a978c331daa53c4c40096cd757af340 +part_03709_of_04320.hdf5 850026e97ec7240930e036668d1e6dcb +part_03710_of_04320.hdf5 575212e8637f905d3bddcb19592429b5 +part_03711_of_04320.hdf5 0d7a4462e572c18b3e1cd95b7d8d3b50 +part_03712_of_04320.hdf5 146aa4b417f7ea46e56ffe76e05c6c96 +part_03713_of_04320.hdf5 efdd60038200242c55c945f538f1f6db +part_03714_of_04320.hdf5 764b44a6c131b90da888f1659cb9cab8 +part_03715_of_04320.hdf5 400de88c353caa43ab256569fc880627 +part_03716_of_04320.hdf5 1e732be2eb9fdf623248a6917d831bf0 +part_03717_of_04320.hdf5 50abef0671d9ac3efff7c1a7a5f298e2 +part_03718_of_04320.hdf5 652424ae3d35d90fc280b6c52e0c3229 +part_03719_of_04320.hdf5 98686a54daaa6758eaef12d01a1e681f +part_03720_of_04320.hdf5 f93c78249e9733e04cd07a5f1a85f12a +part_03721_of_04320.hdf5 3a4881814a695aa1458db67a637ea312 +part_03722_of_04320.hdf5 2aac63ecf893aade72321ca4c7454912 +part_03723_of_04320.hdf5 f895ddb568d1914ad17b254fd9f564a8 +part_03724_of_04320.hdf5 75aabfe71f6504ae95ec79d7fdb0d254 +part_03725_of_04320.hdf5 52447ff5e9636adf3a0f1b2cae5eeb0f +part_03726_of_04320.hdf5 1d27e9c32c282404e07d2560fe1e7c2a +part_03727_of_04320.hdf5 c445c807716586d629240734c4023756 +part_03728_of_04320.hdf5 107e88330b8b2843c0d840f93d88957e +part_03729_of_04320.hdf5 bff4fa7bebd5b53fcfbb62aef8c8e630 +part_03730_of_04320.hdf5 25b2a1f3a44bc9d93fe0ea1bedb6f380 +part_03731_of_04320.hdf5 cafff664a49ed6632538c351c7abdacc +part_03732_of_04320.hdf5 de306e9c4c815ae16eae7d5a3f3ab55f +part_03733_of_04320.hdf5 8dc55254a00ff2d39b0bf9f62aed6dd5 +part_03734_of_04320.hdf5 8adffa582ace1bb8f20e6101d1ca45df +part_03735_of_04320.hdf5 548c9af3681c7d361c7634b3720d2a69 +part_03736_of_04320.hdf5 ea02cf00749b28591518123c45424a0a +part_03737_of_04320.hdf5 a26e588bdfa292ecef0b34eaaabae37e +part_03738_of_04320.hdf5 89b8ffc84a48aff9fe2251ad924666ca +part_03739_of_04320.hdf5 d3bab0dcfbc647705df9a496a4a4b77e +part_03740_of_04320.hdf5 3dcdf3c97ec3fa96f16d4a70ce42051c +part_03741_of_04320.hdf5 3cf62dc185cc22b8941d1f963765fbe6 +part_03742_of_04320.hdf5 b9814280d7e0d76ebb8d7e4b15fc31af +part_03743_of_04320.hdf5 cb6f20a3940981c211285f64d942c45e +part_03744_of_04320.hdf5 8ee18ae2f4d22b68415c8a20897d3733 +part_03745_of_04320.hdf5 cf26e4a6c3005c3eb6d22de0b16d9b90 +part_03746_of_04320.hdf5 66f054855313d5a0169732c98cf2e775 +part_03747_of_04320.hdf5 d0a5e2c9384021977fc38cabff784814 +part_03748_of_04320.hdf5 f2d3b20edc0f9ae4ebc8ee33cb6adfec +part_03749_of_04320.hdf5 aa7c694aff12817677f1d138397b1030 +part_03750_of_04320.hdf5 2f0647eeb41c7ac74c22456e879faeff +part_03751_of_04320.hdf5 b404a6d214058adceecb912b417099cc +part_03752_of_04320.hdf5 8108713c8b471d49f59c0a93b7c5488d +part_03753_of_04320.hdf5 c326bb2e3d5bf7343309959be65a4cda +part_03754_of_04320.hdf5 af1ebad2e80476e7ab5c4e4c992a25d2 +part_03755_of_04320.hdf5 e3b25da8ab0f00bbe59ac8aab0e97b96 +part_03756_of_04320.hdf5 24fd836b812a203ffaee8fa78ef1d602 +part_03757_of_04320.hdf5 03c134f23ed186eb65013d855813bad7 +part_03758_of_04320.hdf5 cc10a22c6fe44a97ca3edde1ffbbd14a +part_03759_of_04320.hdf5 966c22f86f94f5c1e8bf7ec9f35d5fe6 +part_03760_of_04320.hdf5 aa4fbd3485c63bc8aabd6a5472e4856f +part_03761_of_04320.hdf5 473ffe69a597869e932c7c28389209db +part_03762_of_04320.hdf5 5d7ea88939a4392ce3b6c1a314a1ba8e +part_03763_of_04320.hdf5 81cceaf84ca27655da7dd17cf009fc1c +part_03764_of_04320.hdf5 e2997d752b62440ad0a986cc83fcc04d +part_03765_of_04320.hdf5 c8ac11f6588084e79b9dae36b31cf833 +part_03766_of_04320.hdf5 ff17069e9e478019453a4efa50e111b0 +part_03767_of_04320.hdf5 67277c5ce6e0986f922026cab4ea14e5 +part_03768_of_04320.hdf5 
f030b1fcb91432d7f1bc0f273fb01f7f +part_03769_of_04320.hdf5 6bb26691140a8dd1c7787de245ed4626 +part_03770_of_04320.hdf5 282365e75a1a4d4897eae56e0b52f37a +part_03771_of_04320.hdf5 612e34aea6b332db91d0c2eae460cf80 +part_03772_of_04320.hdf5 d5a70c2b61c761507cbc4c2b77059854 +part_03773_of_04320.hdf5 20fa54576a2ae4a09d94372fd5a74a54 +part_03774_of_04320.hdf5 de1f94a350ef641fb7b9bd0d9ee8b885 +part_03775_of_04320.hdf5 dc1fe6e90f7cde22eb16d783d1363037 +part_03776_of_04320.hdf5 96839b009644fbf88546c71ac8b2507e +part_03777_of_04320.hdf5 fd7525e35d9703058cff821120f959a0 +part_03778_of_04320.hdf5 7b39b987dc0bdbfc30640c87238ceb9b +part_03779_of_04320.hdf5 fa9c259152d96a406f3b312a190bb0a0 +part_03780_of_04320.hdf5 ab39c80f8f19f142b9360cf0aa32541b +part_03781_of_04320.hdf5 6f74dca2e54c04f268eaa88b79b75d8c +part_03782_of_04320.hdf5 da2eaffd6fdda28078d5c28b480aba29 +part_03783_of_04320.hdf5 77c11eb7bdd2542bf0b84b33278dc61f +part_03784_of_04320.hdf5 b79f0a1c110d6db153794a05dcc519cd +part_03785_of_04320.hdf5 f2ebe60455427bca71633ddd7176d59c +part_03786_of_04320.hdf5 6b5a7ba2d79e3d51e144887457367e6a +part_03787_of_04320.hdf5 ef84b55709ee2dab373d4b553c6ed784 +part_03788_of_04320.hdf5 c6fe902e8da592bd28aaef04efc150a5 +part_03789_of_04320.hdf5 11fe130f0f4b442d1085f0c7edf8e99a +part_03790_of_04320.hdf5 439cab28b08e7e93af67f42271664687 +part_03791_of_04320.hdf5 cd45ce1f109a7c30e13b300f591f9eff +part_03792_of_04320.hdf5 06828b3fdbdeb0cfed39ac0f6d62bed3 +part_03793_of_04320.hdf5 42a66110069c4ad21c3b4a26c7acc403 +part_03794_of_04320.hdf5 063668321910eb3a9b322ffc533e4c0c +part_03795_of_04320.hdf5 1fcd6973ad07150381255e14cc7ba0be +part_03796_of_04320.hdf5 fee209799604b4eb5008efa76ae0601a +part_03797_of_04320.hdf5 4dfd556603f8053785e3303b28db5300 +part_03798_of_04320.hdf5 b687a84fc9b96c4729fd6822f0f228b5 +part_03799_of_04320.hdf5 a7d5af4af1795b6fc3f19c27b1c4e092 +part_03800_of_04320.hdf5 2cf9b7a4e5cce7b25bd31e55612e1a0f +part_03801_of_04320.hdf5 84fa231608e40e7e229e8f32e311facf +part_03802_of_04320.hdf5 fcaab7e2f8a308bbef14549e0b089350 +part_03803_of_04320.hdf5 2d21c9d79f5e0ffee78b9e3ba47b2868 +part_03804_of_04320.hdf5 a48b3d852f087923fdda7950b5c612f9 +part_03805_of_04320.hdf5 8e6e7c9592eb974a25d5eb60ad9f3ee3 +part_03806_of_04320.hdf5 c08b8e66fd3fc60b50fd87ed57031540 +part_03807_of_04320.hdf5 b13122e27ddd601e27f106a966adf2f2 +part_03808_of_04320.hdf5 7ad4aae0f77b5ed5c1f2c07d702cfe5f +part_03809_of_04320.hdf5 e3d353acc491bb56037840b073b08703 +part_03810_of_04320.hdf5 02eb4240b063204a223c90d6898eee63 +part_03811_of_04320.hdf5 724de2bdfa7ff67fdfdedefa7e31d33a +part_03812_of_04320.hdf5 4bea82db6fb627ae591d5df3e6838c57 +part_03813_of_04320.hdf5 e38c6747dc7b5ba8ef83bca7d0d8a2c1 +part_03814_of_04320.hdf5 a5be54796acd0c36e648c92dcf42c47a +part_03815_of_04320.hdf5 25888f3e39d6ac34df0d04ef6cfbb64a +part_03816_of_04320.hdf5 35c15e9275ac926e85f171f29c53cfe2 +part_03817_of_04320.hdf5 b32ef5214d56a24daa0b4f2418ab5c1f +part_03818_of_04320.hdf5 8cc008c397680cf5321c83da6f9a1984 +part_03819_of_04320.hdf5 ef09eae43823904e72a12fb0c26dec36 +part_03820_of_04320.hdf5 1a637559c36913207779ccf0b2b256ae +part_03821_of_04320.hdf5 8473f79709ba8e21fa81ceb4bc5a03a0 +part_03822_of_04320.hdf5 dadb7c1ce10007647dd94acff765ee40 +part_03823_of_04320.hdf5 3d821d87ac816993a0e03c7645dc0bbb +part_03824_of_04320.hdf5 40159b2da0f215d0c593451417430c06 +part_03825_of_04320.hdf5 b96cdbca42f77909428c44aa1baac81b +part_03826_of_04320.hdf5 7a64e3af8c1c65cf6c2e9636fab11cb5 +part_03827_of_04320.hdf5 357e85c1277a454c442ce3360180a5bc +part_03828_of_04320.hdf5 
a9b3fc3f7cbc96279f844f1ff925e937 +part_03829_of_04320.hdf5 9bb895c463dc5d503a2e50abc9c4929e +part_03830_of_04320.hdf5 fe781650839a1ce5e514b83b058fed10 +part_03831_of_04320.hdf5 3f109512bc579d2c0f34579e6c190553 +part_03832_of_04320.hdf5 ab50e37c62f81b02a8b0c94bab79aac4 +part_03833_of_04320.hdf5 c8653acd0c752486199e141f2140d8f2 +part_03834_of_04320.hdf5 c80c2b7bd22103c21b12850f09406ade +part_03835_of_04320.hdf5 b7d4a6da03fd221986dc8def39a18f2c +part_03836_of_04320.hdf5 3b1eba5632993e67ccce8c7504d3ab10 +part_03837_of_04320.hdf5 446dad578c02c582a6644d0da9bd4c85 +part_03838_of_04320.hdf5 e7cb9395697955462294e63cc9946bb3 +part_03839_of_04320.hdf5 974458cc6964bd057c786fad763182d2 +part_03840_of_04320.hdf5 0b4c2e65bfaf9b045fa888eb648ef497 +part_03841_of_04320.hdf5 f001c252ab79d8f94d7e65e94a0a63dd +part_03842_of_04320.hdf5 9160859c94cc916193842ee9b375e6d3 +part_03843_of_04320.hdf5 998fa2f50d11bc627f2ee18206bca788 +part_03844_of_04320.hdf5 1d21c637bd731e8954c6b400a6f08e53 +part_03845_of_04320.hdf5 80e8a442edd8befa12a624a3ef626718 +part_03846_of_04320.hdf5 ebef716816f2dfd28b0458421c7ff4f5 +part_03847_of_04320.hdf5 3c8a76f57b5b5c1dda26094017324ad0 +part_03848_of_04320.hdf5 d6a546f57d785edea06f7389ccf92c0e +part_03849_of_04320.hdf5 902e6dc34da5117b8ac4be07ef179b24 +part_03850_of_04320.hdf5 d5c0c5fbe5ce63e90f32ed7e1ab53873 +part_03851_of_04320.hdf5 4b23264ea1d119820bcccda4a0d683f5 +part_03852_of_04320.hdf5 1793da96b4c1ef7b967709d85c3048d5 +part_03853_of_04320.hdf5 7ddc0e2ff6505986982bd566ae37ee1e +part_03854_of_04320.hdf5 891482f7620df2b96825943363c8b4b0 +part_03855_of_04320.hdf5 cf6bf6edf71253c06f495ecbcf0b7f91 +part_03856_of_04320.hdf5 67bf14bc7a06a0d22158b05b5d4eb350 +part_03857_of_04320.hdf5 267b91c54cbf08a939decb22787f0ab9 +part_03858_of_04320.hdf5 ec4ddb40143bfd6060c69c25b744f548 +part_03859_of_04320.hdf5 2ade5890936f10e6146c3340050a3be1 +part_03860_of_04320.hdf5 a6e773aad24562d7e0f65ba714f6795c +part_03861_of_04320.hdf5 ee43263d1428e622450424f3a0d9fbc5 +part_03862_of_04320.hdf5 6f342ee1fd14399c66230bc69e9edb08 +part_03863_of_04320.hdf5 77db37919f098e2795ef238447691eac +part_03864_of_04320.hdf5 65cc10807edc28c387873046ed06839c +part_03865_of_04320.hdf5 f19b43ba30b3ad61c8660ccbc8add07a +part_03866_of_04320.hdf5 be3f1fccf90a07aec5da3d84435eaee7 +part_03867_of_04320.hdf5 91949119bab133a4d935fed20d3d425c +part_03868_of_04320.hdf5 aca9699ef50cdd15ff4164d4c9f45c6c +part_03869_of_04320.hdf5 da337dee031278616c36b62fb364c171 +part_03870_of_04320.hdf5 ef5d0ae2c476d718bc75299ac7f18559 +part_03871_of_04320.hdf5 7282d43e8d1d504e128a0dbb1d121d32 +part_03872_of_04320.hdf5 66b745f09110011010ad4069b1bb88e1 +part_03873_of_04320.hdf5 52ce67df74756ffcdbe43e89c22ba76b +part_03874_of_04320.hdf5 69003ca0e5d39f85405871619ac4bf1f +part_03875_of_04320.hdf5 6833a43969fac89a94cf3c8471d4f9b7 +part_03876_of_04320.hdf5 e867cb50338eee99f6c402f45761a7a4 +part_03877_of_04320.hdf5 ae00529427f82042fd9a4abd30e0134e +part_03878_of_04320.hdf5 ce4b2de8f9b8f197693f03bd2681e524 +part_03879_of_04320.hdf5 d6df8242c77fb6610d639aeb1975816a +part_03880_of_04320.hdf5 305b23fe329121d96a521a08b29d083f +part_03881_of_04320.hdf5 cdf647804f1005130d664736c1984680 +part_03882_of_04320.hdf5 0ec6a0d3183ecd6860aed86b5dd8fd2e +part_03883_of_04320.hdf5 f1b6b9769c12805ee5039136ca7bb05a +part_03884_of_04320.hdf5 4cb9b58d729823d4a97a1107a4042f0e +part_03885_of_04320.hdf5 b2b198d0e64ecedb9798f73f7822d573 +part_03886_of_04320.hdf5 02a83ed4f27429206438df6c6c2ce8e3 +part_03887_of_04320.hdf5 ddd9b7beb6d3c647c2624f31a7b38d6c +part_03888_of_04320.hdf5 
1445f1bc616e43dc243bcddfe98aa58e +part_03889_of_04320.hdf5 6b4a2b051c7797ee859db8431c83b67a +part_03890_of_04320.hdf5 84e35b784c83c45f0d007238e625ca6d +part_03891_of_04320.hdf5 4a459445bd0ff7632fc6b2be6a48060f +part_03892_of_04320.hdf5 3c49e64a4a18f61d7d476602eec6a9c9 +part_03893_of_04320.hdf5 d95fc55dd3db3e1ad0f45dcbc0316384 +part_03894_of_04320.hdf5 ad362c072df7f0986d064010ee1311a6 +part_03895_of_04320.hdf5 57c2726e91d33ef75dd0998e1bb017aa +part_03896_of_04320.hdf5 e3c13aeb28cddba191561c3532df85d9 +part_03897_of_04320.hdf5 0cc3eefc42c7b7281e161778463fc956 +part_03898_of_04320.hdf5 e20a61bc8934453e28ed56d01464f18d +part_03899_of_04320.hdf5 ee266e141817617139ae1feeb0457e97 +part_03900_of_04320.hdf5 2d252e4e69c7fead57fb2de406cbc891 +part_03901_of_04320.hdf5 bbb8ddbe1c11c78f57c16352e4afcf88 +part_03902_of_04320.hdf5 54cf7bb7ecf3789f11b91bb8333e22b0 +part_03903_of_04320.hdf5 b994df306d91755cbeaea556e8a5163e +part_03904_of_04320.hdf5 48695cbecd50980c8dcc4c743d539898 +part_03905_of_04320.hdf5 c927b71947da8c832f7a5d0a9d16f180 +part_03906_of_04320.hdf5 a9351d052f0685e8cb1337eab6aa9cb4 +part_03907_of_04320.hdf5 4bf3ec7ca3c3bd09f214b6d8a4799d0e +part_03908_of_04320.hdf5 5ac608cb94fb0a485703df08177279aa +part_03909_of_04320.hdf5 54e9aebe805a9309c13f0d0691dd0c0d +part_03910_of_04320.hdf5 90d579c850bf9ec34c39b77edb78d749 +part_03911_of_04320.hdf5 24c705a7e2c50c2ca94b7bf166defbc3 +part_03912_of_04320.hdf5 83c016003126564379986a0c767ab0f4 +part_03913_of_04320.hdf5 d223ae7b8bdea3f5a49ac909052e21e8 +part_03914_of_04320.hdf5 522f8d5e059cae70e4f627baeed143f3 +part_03915_of_04320.hdf5 3940d85acaaefacf145747631f261696 +part_03916_of_04320.hdf5 b7a29bc304eebee545e8fa2cfd4f7868 +part_03917_of_04320.hdf5 0e2521fd44b397c2d01b85cc4d74ef0a +part_03918_of_04320.hdf5 c60b19fe665f1f26882fbc451bc1e186 +part_03919_of_04320.hdf5 70e41005fa7792cdc98a4bd7fdf288f1 +part_03920_of_04320.hdf5 37a4ad25df287ad014ff22a6f820caf5 +part_03921_of_04320.hdf5 e4ce52cd4154f50695f09b209cfd8ca0 +part_03922_of_04320.hdf5 6227a65d5e4ab69d1febfb7ad22647ed +part_03923_of_04320.hdf5 72ff3e68ccc15cbe4f8bbeec0e5831f6 +part_03924_of_04320.hdf5 b9e194c27435336bf8a40fce567f8d8a +part_03925_of_04320.hdf5 823039a09faabeb5358c22e89ebf0223 +part_03926_of_04320.hdf5 0327739a6efdd6e11c8bb5d8de366108 +part_03927_of_04320.hdf5 4beeb476a7ef9be21b847dbccddb9828 +part_03928_of_04320.hdf5 296cced2f0f2131c7febd28dc6f929c1 +part_03929_of_04320.hdf5 ae5655ac38d0dc1256dade38af176e09 +part_03930_of_04320.hdf5 ff2f0aac02b92f40bd4d72fa131f65ba +part_03931_of_04320.hdf5 a2dc11c8ceb5225e309d4505f2e1046a +part_03932_of_04320.hdf5 731c7560e9c99644cb90cf3d517f9d8f +part_03933_of_04320.hdf5 450ac7f8d98b718b78beda9c12da8a3a +part_03934_of_04320.hdf5 83587c74593148f6f19705ae271baa7a +part_03935_of_04320.hdf5 f777fbcf0d2e44d9cb3eaf9f00a6a816 +part_03936_of_04320.hdf5 a01f2294545325ca14e0647a43390ab2 +part_03937_of_04320.hdf5 7b0d4d3b7f8076f6bd08d7c2a7ff05bf +part_03938_of_04320.hdf5 ff9e559080c8ec90dc86e69f9804ca0c +part_03939_of_04320.hdf5 31459514d7397111083c64951ab31d00 +part_03940_of_04320.hdf5 49329dc25c6dc6690a5dbc58a199e96e +part_03941_of_04320.hdf5 23d17ddbc26bd0db6d99e8ecd1c60793 +part_03942_of_04320.hdf5 4ad2254912dc4f5b62b81df44dc106e9 +part_03943_of_04320.hdf5 2f779d067209b482f4a2cd54f51b3d7a +part_03944_of_04320.hdf5 243b6f32f0c0a469c14670c72ef9ff23 +part_03945_of_04320.hdf5 9e6f8476cac6b216365900b7448c9952 +part_03946_of_04320.hdf5 25283f9f2f90f73afdb4b623198c6a94 +part_03947_of_04320.hdf5 04fde3ecbf8f69e02993940d5d6c9a38 +part_03948_of_04320.hdf5 
54f913a9c7bb25630adcaacd7058b3d9 +part_03949_of_04320.hdf5 3ab385156861c5785bc76bc49c9e15bd +part_03950_of_04320.hdf5 9d3c86c1c07c5ca7a6594428f9c9eccb +part_03951_of_04320.hdf5 e0cd7869ed852de714a0c479af6aae21 +part_03952_of_04320.hdf5 fab2b13b0beee8df07ac8132b1155a74 +part_03953_of_04320.hdf5 31d6c56a222bc7ce889b43e25fcdd544 +part_03954_of_04320.hdf5 876fb6f41500929f5bd8bec16af16cab +part_03955_of_04320.hdf5 ae88abe55891f7fa9fb6dccc556ec607 +part_03956_of_04320.hdf5 0fd33213b911f8aabeea1e80d8a1cd99 +part_03957_of_04320.hdf5 f8d25ec6b0bd1ac170cbdb47edad5fab +part_03958_of_04320.hdf5 18947109501da9a44be3126f39168fc2 +part_03959_of_04320.hdf5 1195e47ebb142afa1957e59008be44b5 +part_03960_of_04320.hdf5 4e1c7ae6e1133ba26bd8163cc4924a32 +part_03961_of_04320.hdf5 da2ff07cfba78721a27cf882f49cdaf3 +part_03962_of_04320.hdf5 7543f321beedb621aca20e44a017ea48 +part_03963_of_04320.hdf5 ae524196fda2a6dbb78766e543f9549c +part_03964_of_04320.hdf5 b69d1aebbe3d0a12eca0e6de2490a624 +part_03965_of_04320.hdf5 d9dda8f3fcacc800d8c6e0e1a9ccb139 +part_03966_of_04320.hdf5 881349f396d5fa6f2f028a00e9c0589d +part_03967_of_04320.hdf5 e7b84de803fd01da9620f5bc340a0e37 +part_03968_of_04320.hdf5 7b68cdd64e6913590499c6b3ada0381d +part_03969_of_04320.hdf5 d2ef50c5e4683e2b54a13846b8561c5b +part_03970_of_04320.hdf5 5fafdca2372b336207662cd510611fdb +part_03971_of_04320.hdf5 e16c32dc1f2db6cbde257407aedd2602 +part_03972_of_04320.hdf5 8949b488eac79e8fa639efe2c74484fd +part_03973_of_04320.hdf5 6eb212407638a57fc0214a3a6b85e167 +part_03974_of_04320.hdf5 a1e5966a99eeac31a9aae46836e0895e +part_03975_of_04320.hdf5 c2925765d736f300c99c3a1607076c48 +part_03976_of_04320.hdf5 fdf8e923c6730acc4d85436abd6681a5 +part_03977_of_04320.hdf5 bbb8e33d66791b3620d183d536dc0343 +part_03978_of_04320.hdf5 268d4f4960c6b11e90e990b0fb9df4b1 +part_03979_of_04320.hdf5 a2f4debdfdcee5d014b6e92b91c1e6d7 +part_03980_of_04320.hdf5 82dc64052fdd9d0ef3b9034e96b0154d +part_03981_of_04320.hdf5 c499d5454a01591bea303cb92d7c789e +part_03982_of_04320.hdf5 a334bb810c73f9364fbf4b0a516d4e5a +part_03983_of_04320.hdf5 d36847e16f6ac494bc0a60ceb02c1e40 +part_03984_of_04320.hdf5 95ee66b5af3b887eddc4e3bf5faa753a +part_03985_of_04320.hdf5 b2022612be6371eea53aaaa2c1455f81 +part_03986_of_04320.hdf5 f638a704843dc1d8b27a5e30e31c8e7c +part_03987_of_04320.hdf5 641af197b856bb4ff0b75dd127a01223 +part_03988_of_04320.hdf5 9de367b52f0b8cd122b6a0f2ce950fd8 +part_03989_of_04320.hdf5 b54b11f9623a6032e704e22807dfa0dc +part_03990_of_04320.hdf5 0a6e490eb1f87da0a7d55f188723c046 +part_03991_of_04320.hdf5 b8c6048dd971cd40916daa2c8f8b3ea9 +part_03992_of_04320.hdf5 f111704fbfd40cf8e348620be9515b67 +part_03993_of_04320.hdf5 834e936a0f3d754d88995c5b56abf0f1 +part_03994_of_04320.hdf5 de6634311fc8afa724fd6c588206d7a0 +part_03995_of_04320.hdf5 56e822d6386cb7b7472b7aec1e731bc8 +part_03996_of_04320.hdf5 6f49f6343d54cdb673cc8a3216845b88 +part_03997_of_04320.hdf5 3b5402a11eb529979610347b3e23df9c +part_03998_of_04320.hdf5 c1ca97e52c50a5415ec71195c03d0647 +part_03999_of_04320.hdf5 b9234fbe8896a6e33dea73ce077b4cdb +part_04000_of_04320.hdf5 445559cc470c33bf137547dad6589d56 +part_04001_of_04320.hdf5 4dbc59293be47601dd0fb040118b2946 +part_04002_of_04320.hdf5 ea9d5b64c7bc599d167de2021a2c6f89 +part_04003_of_04320.hdf5 2d4f04d8d13a416c8314ddd833fc3015 +part_04004_of_04320.hdf5 d1d36ecd83027d9ef056a1fe41a12be4 +part_04005_of_04320.hdf5 8112e1580d7af87ea9df5a29e267ddfc +part_04006_of_04320.hdf5 9198d16b729d60a6a6f5d720901ab697 +part_04007_of_04320.hdf5 c074ab10f1c9a2898ab1fc69c0f78a5a +part_04008_of_04320.hdf5 
d901f75802a7bfa4056ba33adc42b7e4 +part_04009_of_04320.hdf5 4834f4c8423c69a18b533bd4f41bd9a2 +part_04010_of_04320.hdf5 912359ca328d04ea9aa4bc1e8107f2ab +part_04011_of_04320.hdf5 487d6e643de7682d89c099f6b12d7972 +part_04012_of_04320.hdf5 0460c860bf1fba4d1650a36755359503 +part_04013_of_04320.hdf5 4308e775bc036eb0fe079ead91908875 +part_04014_of_04320.hdf5 ad6a5686e798afcfd7b4474e7bacf53a +part_04015_of_04320.hdf5 8649a3b3c30ea783a8b4f724ff55285a +part_04016_of_04320.hdf5 a7e7e33086a606db945d3d80cb36ad77 +part_04017_of_04320.hdf5 5fdb4b410db4a68de6afb0435eea7c01 +part_04018_of_04320.hdf5 ff1fec0e6a6e0498507dcfdba3d32994 +part_04019_of_04320.hdf5 faf22eda779277c31e5da3d9ab199c24 +part_04020_of_04320.hdf5 58c9f0e5ee9df9bb9b7694483399d656 +part_04021_of_04320.hdf5 3727f7a6ff68c1767700d374843a0f2f +part_04022_of_04320.hdf5 6dee4bb9bc6712c34bd4ba16bd8e9db1 +part_04023_of_04320.hdf5 1aef754fc5507f4955ba5995beb08f83 +part_04024_of_04320.hdf5 2ca3413732560eedc19eb3140a1fa6bc +part_04025_of_04320.hdf5 8e08d6e193f2781a948d922b440d14e4 +part_04026_of_04320.hdf5 3ec919fd6a8c4522056d238b604c258a +part_04027_of_04320.hdf5 1ac3f4da8f44e777cd8ba244a378092c +part_04028_of_04320.hdf5 047b1bf4c275fbff9b351b1810aa18f9 +part_04029_of_04320.hdf5 6240e679c8256dde65f8f2ecb9866bdf +part_04030_of_04320.hdf5 97f007690270ab1db3dff9f9e33cd435 +part_04031_of_04320.hdf5 cb568d27e18b89f58a82b0f2b40773f4 +part_04032_of_04320.hdf5 cfb2b5ba8a36df4f60924a1343d2370a +part_04033_of_04320.hdf5 f988b3e01876e5eaa6b0c6a54bdd53ae +part_04034_of_04320.hdf5 b43041a1e57aa583599717f77eb957d3 +part_04035_of_04320.hdf5 82f06807c5ed906b9fcb6ee231abb20e +part_04036_of_04320.hdf5 99640fc7029b49ffbb951aead4c985c2 +part_04037_of_04320.hdf5 e0f10caf7b28a2212dcc6139be520e30 +part_04038_of_04320.hdf5 975de381e52f7721fe8259c03a1bf066 +part_04039_of_04320.hdf5 70a86179d61851fca3e5f2ca9458e769 +part_04040_of_04320.hdf5 1e1723bec6815c8397f955672749b32e +part_04041_of_04320.hdf5 dd10eb724fb75324d60286f0975fdf97 +part_04042_of_04320.hdf5 4548bc5d761a075b70434d52f4511eb1 +part_04043_of_04320.hdf5 88c21a4166155578efc2440d45fd8957 +part_04044_of_04320.hdf5 59334f3415d16339a4607736d9fa2b1e +part_04045_of_04320.hdf5 360cbd55a0b76a1353d5458bb6296dee +part_04046_of_04320.hdf5 77fdc4e6b00e5de3be9a981cf04cdbcf +part_04047_of_04320.hdf5 5349f23388892fe20c39ac8e748a7911 +part_04048_of_04320.hdf5 48d1a36e056505657ca9d009ff7262ea +part_04049_of_04320.hdf5 234b92022ef5119b2f7b1c97d6d56c5c +part_04050_of_04320.hdf5 1937e05c6098c04bfcca75922b52bc3d +part_04051_of_04320.hdf5 5315e4c4249b07f5d94fbeb14a7560ce +part_04052_of_04320.hdf5 671349a66862dd7f613e031a73633d3c +part_04053_of_04320.hdf5 1b218a440823d551be856605370f2697 +part_04054_of_04320.hdf5 33d6cb33e551133ed3a89826fa51e51c +part_04055_of_04320.hdf5 b9c53403f9eb73e14902e175df104474 +part_04056_of_04320.hdf5 977598d38f9b1ddc5c19300b1a3707c2 +part_04057_of_04320.hdf5 b5fa2fb8314e0c63de4413057f3ac986 +part_04058_of_04320.hdf5 b84b91dc8cdf73a780a23e40a74646c9 +part_04059_of_04320.hdf5 09d3cd782f37112943b1bf31094b0879 +part_04060_of_04320.hdf5 f73b2d2120a9fbb60c7387379a32169b +part_04061_of_04320.hdf5 7c8bf4379f426c60706f8698a20874cd +part_04062_of_04320.hdf5 b5d15cc14aa51f3a5141dd78d18ec170 +part_04063_of_04320.hdf5 b95565704cd61d5b88688592c75aaaa3 +part_04064_of_04320.hdf5 94916ddc47cd1486f31b45bbed0ab6ca +part_04065_of_04320.hdf5 aedb244d01e71bc828ae7b599af63a8a +part_04066_of_04320.hdf5 2e8dbc21e46a9edee180e51e4caaf39e +part_04067_of_04320.hdf5 0a3e051f5ff22b90c3b683a5d7356a23 +part_04068_of_04320.hdf5 
75d5903bc23481c2a104e4905271a2c8 +part_04069_of_04320.hdf5 8027d3ca86ff6e24c0088d614e16e7c9 +part_04070_of_04320.hdf5 c8e25dff4f461ac682e79d68a6dbc51f +part_04071_of_04320.hdf5 f9a18e90449b1094ecc225df9e19650b +part_04072_of_04320.hdf5 138df9b10eb3ab83215726575857ae67 +part_04073_of_04320.hdf5 38d5225488b555c8483df1fdf386ec00 +part_04074_of_04320.hdf5 c3684935f04155c3017212e974d7ee6a +part_04075_of_04320.hdf5 5a6889c41e9d4ee492633639a9c9f8c7 +part_04076_of_04320.hdf5 61524dfe2df922ed001cd4a78f642b7e +part_04077_of_04320.hdf5 bf74b853297e869ecd3c72783c6e58f4 +part_04078_of_04320.hdf5 cb475ca835ce8d87169f112456e9e568 +part_04079_of_04320.hdf5 05a2f30f86ecc314a9e9b3f6e38862c3 +part_04080_of_04320.hdf5 8ad6d7b7ca54dbb54bf784da7cd2e6bf +part_04081_of_04320.hdf5 cf430f56f7b8979e4152149ac1175930 +part_04082_of_04320.hdf5 1e27ace1f2a4356fa07eb78261c97b15 +part_04083_of_04320.hdf5 7befc54e80eee0db2e88e58eb589ad97 +part_04084_of_04320.hdf5 3529c93ef89454366165582594d6ac81 +part_04085_of_04320.hdf5 055fea98f02bdbe300a19ba24b0f9294 +part_04086_of_04320.hdf5 cb6fc5ce880c3f4588fefac4318ee974 +part_04087_of_04320.hdf5 01114db705ee069f458fe726c7a86c9c +part_04088_of_04320.hdf5 12cd2bc3ce621e253dac4d10fa742d61 +part_04089_of_04320.hdf5 cbbb8414dda5174036bf908c19c32e05 +part_04090_of_04320.hdf5 4a8c83f6ffde761cac9a05f9eeb3e9d0 +part_04091_of_04320.hdf5 a55aebeb8fefd2bebba03946deca7834 +part_04092_of_04320.hdf5 8d0acf94be1f881d624d313066c1e776 +part_04093_of_04320.hdf5 a3558f5dd0caf1a2e25b417809e41f29 +part_04094_of_04320.hdf5 a38d051fbcc823e7fe10e2ca486a9962 +part_04095_of_04320.hdf5 8f97bf44a37669bd8a9b347532ee4c76 +part_04096_of_04320.hdf5 f45051997238e1360119b2b2d0df961d +part_04097_of_04320.hdf5 13f47b5085e967f01b3dbec707975639 +part_04098_of_04320.hdf5 f0b3ccdfe5c4075b0e96ca58c958aeb3 +part_04099_of_04320.hdf5 b7efa48b6b3bb904077162129b8416c3 +part_04100_of_04320.hdf5 c3663631a2a650bd99423f47752e49e8 +part_04101_of_04320.hdf5 6de554befb6c883d57dc306cb2f97631 +part_04102_of_04320.hdf5 152b30ec7ff57c07761a19ec8a3afb6b +part_04103_of_04320.hdf5 dba67394f1d34e0a680edca1e6f9267c +part_04104_of_04320.hdf5 1cabd3c3f566ea5de135971a18227160 +part_04105_of_04320.hdf5 ed187e430f77d7c093e5ccaee0b3bfb2 +part_04106_of_04320.hdf5 708f2ffe58a93ddbe54522febb3c327c +part_04107_of_04320.hdf5 cb2b853c7592a40b780283e524deb397 +part_04108_of_04320.hdf5 4e28fd270b45a6d659a85692513e1d87 +part_04109_of_04320.hdf5 f2b39ad19bc5d505a92018f611d9caea +part_04110_of_04320.hdf5 30cdb0d12ec6c3946344e672eaf3695f +part_04111_of_04320.hdf5 37406cf8be9643cddabb0270fb7dde63 +part_04112_of_04320.hdf5 e732ae090749aa68bdf2e77faaf79581 +part_04113_of_04320.hdf5 33a86880493ada2030f292e6603e98f8 +part_04114_of_04320.hdf5 a97fecc0a0d9666d31ecc06f85519af3 +part_04115_of_04320.hdf5 33cba9ba7bd3d99ea20ccdad818697e8 +part_04116_of_04320.hdf5 c22dc8cfaa44082f11f8a2114c6bd986 +part_04117_of_04320.hdf5 7a4a512e090abce8429aa78fc85343c7 +part_04118_of_04320.hdf5 c28de92c616719a777deb1128e5bc89e +part_04119_of_04320.hdf5 95fa1fea55df1439dce883395855931c +part_04120_of_04320.hdf5 c8fd4bdb50a84389626dde1ada364391 +part_04121_of_04320.hdf5 09211ed98bc07e0dc4f525fd19ea0d9c +part_04122_of_04320.hdf5 0d93363b6d648d4e848193bbc00fef91 +part_04123_of_04320.hdf5 4be108bf927ac970d1207cf1d9fde70f +part_04124_of_04320.hdf5 f5424f2c1ea63446a940a0c584d80394 +part_04125_of_04320.hdf5 137b8d7c077fb2c9783605c5f8be1d83 +part_04126_of_04320.hdf5 b99c65f3fd814930fcfe40bdcb522950 +part_04127_of_04320.hdf5 9815d64278ace0c00ab942b523745fcd +part_04128_of_04320.hdf5 
e3b723265e5f9afca4a52b6d93cc67dc +part_04129_of_04320.hdf5 78acdf39d6776a3f27dfa0b2d9adb20d +part_04130_of_04320.hdf5 f1a314a106f278b55fd505205bf8b203 +part_04131_of_04320.hdf5 0594ee14353dc365cc7a9e512b60760d +part_04132_of_04320.hdf5 afc7d02ea8ead454acfeef4f86795273 +part_04133_of_04320.hdf5 67643a9bbbb4fbc38b4070a152687366 +part_04134_of_04320.hdf5 25edba6fb3369d1bd6f5e45a6ae8e57f +part_04135_of_04320.hdf5 4529d627b263c791f0bd559fdf327c8b +part_04136_of_04320.hdf5 19a176355cbd296b4ec78ba6abf7402e +part_04137_of_04320.hdf5 9e42ae3dd58989b62bf357e045551e1f +part_04138_of_04320.hdf5 c3fa5c7ad1a6abd7c755e94e275e3430 +part_04139_of_04320.hdf5 29044e0cbdb698c40ed36789f3a7fae5 +part_04140_of_04320.hdf5 4c1d498d8a37eb39bb3dbf7d6e412f9a +part_04141_of_04320.hdf5 49dbd28623f7d21a8fe3694efe957474 +part_04142_of_04320.hdf5 58b6af78daa92fe2d6ad44b09939d129 +part_04143_of_04320.hdf5 2a16200fccbeb4dc83ee15c04b648d71 +part_04144_of_04320.hdf5 f1565f18a6b177d852df01a751ba38d0 +part_04145_of_04320.hdf5 45d0960d201e050cb05f2d5cb850d0db +part_04146_of_04320.hdf5 69080cb65bb9e651bf15154ea2760822 +part_04147_of_04320.hdf5 7bec3581a1be853a2927377014085bae +part_04148_of_04320.hdf5 c2a90499f20f04faf0bc239f5f896fca +part_04149_of_04320.hdf5 e392c47e3cbfe84deb0ddd5264face43 +part_04150_of_04320.hdf5 33651e6d9762dc3c11c2e560133cd83c +part_04151_of_04320.hdf5 fd1755ca35a1aad9f13a5e71b5bd0396 +part_04152_of_04320.hdf5 43ccc25aa8bf12267078bb30f53cee2b +part_04153_of_04320.hdf5 d85471c754943424f32db139e3747b29 +part_04154_of_04320.hdf5 6f2538ca20d344d86f1522093759bde9 +part_04155_of_04320.hdf5 c0f5a21d2004b46e2c5c9f3a7225143d +part_04156_of_04320.hdf5 2e1d0936f327d90946489c0a1ed29d42 +part_04157_of_04320.hdf5 32a7ac5dfcfe0deb2f50def3d5c5397a +part_04158_of_04320.hdf5 6915dbd84467398f47be5d602be14ac6 +part_04159_of_04320.hdf5 91ea6ed0bb408720e6529a1f03380fcd +part_04160_of_04320.hdf5 4df8bafdcc5c11787e5f8a1b3457ba24 +part_04161_of_04320.hdf5 93ffa71dca2858b19d08d1f82c1733be +part_04162_of_04320.hdf5 b921077bfd27a98065f1d9c5470d0c09 +part_04163_of_04320.hdf5 1724ebdda3db7cf0f29bc7d050e00d49 +part_04164_of_04320.hdf5 04bde4fd48b8089c57f127e42bae42c7 +part_04165_of_04320.hdf5 f2ed6467e1eba711a605f10652f3f575 +part_04166_of_04320.hdf5 821f0f2cf947302738cf8d142bcd4b06 +part_04167_of_04320.hdf5 78ab179d36bd0606dfae8236b3f4aee1 +part_04168_of_04320.hdf5 fd2de6ae50ff48b5de69b21f10557c6b +part_04169_of_04320.hdf5 09c458671ac02c1ff241b83ec93f21b3 +part_04170_of_04320.hdf5 33f3337165deb3f8375a8a1a90435184 +part_04171_of_04320.hdf5 c4a7dc0636d8795269ac92f63c87272d +part_04172_of_04320.hdf5 b0fdacda0330c66c709e4a238aef1a6a +part_04173_of_04320.hdf5 43851e92cfa9f1267ade03a2a4428164 +part_04174_of_04320.hdf5 b25214580b7aadebd688e411f250d069 +part_04175_of_04320.hdf5 b58048f83a4e3dd0daeb8af125579aed +part_04176_of_04320.hdf5 7db53acbf71176ee7a6d28f16b75e3bf +part_04177_of_04320.hdf5 98dd4de77e16c8ea3fb83768f36028c6 +part_04178_of_04320.hdf5 2e6854d2b7f30e427931920b50a82e82 +part_04179_of_04320.hdf5 d1d789988d7768e8183d81b3f19968ae +part_04180_of_04320.hdf5 4176c0a0185ebb2ed054476ed35bf8a1 +part_04181_of_04320.hdf5 ffb2413bde6f3ed2a9d9a5e5cd4a3e0e +part_04182_of_04320.hdf5 1f4235b7e4b34206cc9029bf6a1f2b5c +part_04183_of_04320.hdf5 86f3452e56df7daf9a6bbaf1fe38e8bc +part_04184_of_04320.hdf5 f4572d6143962b79a2e66c2f37f7fbdd +part_04185_of_04320.hdf5 fad0c90fb6f6cbe9b6e9576a49957da2 +part_04186_of_04320.hdf5 b6aae9068c4abcc379d55f30cf9206b5 +part_04187_of_04320.hdf5 3f106c6afd1ce1b668b016085de7ea42 +part_04188_of_04320.hdf5 
a14cfcc5634f47be1fd95c67386f5690 +part_04189_of_04320.hdf5 ed356ccef3ebe4190384e489152a278d +part_04190_of_04320.hdf5 d89c6535306318b52ee80938b7aaaeb2 +part_04191_of_04320.hdf5 4e497d625c7327d2d0e785fe01abfc2e +part_04192_of_04320.hdf5 ed38dd346ba7e150642a690a7d3c83a3 +part_04193_of_04320.hdf5 ef16f841f0264ce2bb912eecd742b081 +part_04194_of_04320.hdf5 6764d63502fdf6e08ee3f79a2f8ece2a +part_04195_of_04320.hdf5 1db948c7d8104480d6c795adbfa77228 +part_04196_of_04320.hdf5 a4a40f45a8fa1a41ecbca78754214a9f +part_04197_of_04320.hdf5 629c79dee81fbf584f9498dd0776f618 +part_04198_of_04320.hdf5 8861523ef98cd2531df0ec22cc7a609e +part_04199_of_04320.hdf5 3f6eca8c4374427f89d54c80af9b8277 +part_04200_of_04320.hdf5 c6e29427ad48246035201b9edf90f8dc +part_04201_of_04320.hdf5 27dab3bd04ac923e229d0953d625c97c +part_04202_of_04320.hdf5 a2c12a8f66d67f1be03108cb507ab509 +part_04203_of_04320.hdf5 861f690f22890d2575700b83e992dfb9 +part_04204_of_04320.hdf5 9704f0e381bee0794bf04829facf4ac3 +part_04205_of_04320.hdf5 a29343e330c52237c7c73d96f038b7c5 +part_04206_of_04320.hdf5 4d7b7c9b6ac1aa81a1d9ad92db84d05f +part_04207_of_04320.hdf5 d56ccde8d54d8a2bb22f0f5569317602 +part_04208_of_04320.hdf5 605af3ac67542bad1c5843c90642f83b +part_04209_of_04320.hdf5 7942acef0a3551ce166147ea3fdf74cb +part_04210_of_04320.hdf5 ef353f4c5e5e6d139cc5babadc118822 +part_04211_of_04320.hdf5 5327dfb1aecad76db491fd8c0645c7b7 +part_04212_of_04320.hdf5 04df2414017c626629e77908a1126c4a +part_04213_of_04320.hdf5 b20451d9b67d53a1007cc98c5f75e1d5 +part_04214_of_04320.hdf5 5c6e1bf7abb64b95f1d1699737c13ba2 +part_04215_of_04320.hdf5 ca2874f090c21614bc4466a0e0a79dac +part_04216_of_04320.hdf5 5017505477dd89717e6f0ed1f4e326d6 +part_04217_of_04320.hdf5 cd0a43ea63ec74869d20b89e225d6c9a +part_04218_of_04320.hdf5 1be0ff95acbcffb3d5efa46d94646e38 +part_04219_of_04320.hdf5 82b8805f35f271c9da2067b62e2b066b +part_04220_of_04320.hdf5 f1cfd8e05d6122f9435738dc308b8d15 +part_04221_of_04320.hdf5 8b2b951ad739e991187e19d7fd788897 +part_04222_of_04320.hdf5 c9304e4e5c8463105c1c626e33c89f79 +part_04223_of_04320.hdf5 e89db5406da6251164b90c93a7d03a2f +part_04224_of_04320.hdf5 fccdf91a5a73584f7dae0b30e2a21d01 +part_04225_of_04320.hdf5 ed13bad064e021732573890d4ddf31a2 +part_04226_of_04320.hdf5 b8c2079904d51c3cda6ba45094ae8ec9 +part_04227_of_04320.hdf5 092462d158e84483147af43a9528c492 +part_04228_of_04320.hdf5 a055f4850cfd4f42557904739f18b644 +part_04229_of_04320.hdf5 9fe9427c3a6d6b9d7115f19303474e9a +part_04230_of_04320.hdf5 4c6db3e6facb56dbcd33b92ccde99bd9 +part_04231_of_04320.hdf5 0378c7beb43639332040b6a6e8156e98 +part_04232_of_04320.hdf5 64c48c558ef33dee410488acb92457ee +part_04233_of_04320.hdf5 47a775f6047e7fc6c1611bbfc0379025 +part_04234_of_04320.hdf5 7a5a070084749299e238ecdbfe57908e +part_04235_of_04320.hdf5 80e5f4f5b54012eaf28103f43c98aa16 +part_04236_of_04320.hdf5 5225d80dd5bb713ec275eb256384401c +part_04237_of_04320.hdf5 4b6cf4af37277f40133a6812b95cbd8e +part_04238_of_04320.hdf5 1e5203d7ab593859c3db623ea685837d +part_04239_of_04320.hdf5 dc02740f2c2365f06e051bb82fb2bd75 +part_04240_of_04320.hdf5 b095fef8409d46bca3b3a776a4e6a338 +part_04241_of_04320.hdf5 e0d7d0977fc44c38d3d789fe7fda5d63 +part_04242_of_04320.hdf5 c64c2baec3645be4fd6a5eef76d4ce52 +part_04243_of_04320.hdf5 d94b1d148503e8ba2f3a68fdad93fb0f +part_04244_of_04320.hdf5 1d9c80a34c8108f24748658b4c093184 +part_04245_of_04320.hdf5 40c57c257089346dee09b1629519b852 +part_04246_of_04320.hdf5 8118cef2d4d7077e48bf9199efeced51 +part_04247_of_04320.hdf5 e1b4e4eaa22ba8dce2b7d3fc478bf370 +part_04248_of_04320.hdf5 
767c048e290d8d19962056c040290555 +part_04249_of_04320.hdf5 b67339c27de1d7e8dd62d1b026473c3d +part_04250_of_04320.hdf5 d1e6afae6c8066c4167354f02ee3a569 +part_04251_of_04320.hdf5 cfedcf839502235c972303e6393c5606 +part_04252_of_04320.hdf5 215fcb399e8ccde189277cc064404a0d +part_04253_of_04320.hdf5 dd2a528edb3252b4d499c0e833c0662e +part_04254_of_04320.hdf5 0781d6e437c34b823b17b2bb1d07374f +part_04255_of_04320.hdf5 5cc2ece7fea8ff6d4e69c449509b0eb8 +part_04256_of_04320.hdf5 8e77a2e66893d46f73fe78403ae17262 +part_04257_of_04320.hdf5 c9ad7732adaeb3c08179ae506a4acc55 +part_04258_of_04320.hdf5 b007e03f87fa80892160f83142122a87 +part_04259_of_04320.hdf5 2b552e7a0d7f3be916b4433726dad788 +part_04260_of_04320.hdf5 8fb97dfe73e3b7101f4ad2427fd8af0b +part_04261_of_04320.hdf5 d0b19791aab28563d3644c65d904c5ba +part_04262_of_04320.hdf5 a9f2749417c7326980f2fce7b853c1db +part_04263_of_04320.hdf5 688ea9ebd59bec5474f5a5565c3c9dfb +part_04264_of_04320.hdf5 1c99c2f627ccf5d9eae9cb6e17cfbdae +part_04265_of_04320.hdf5 25a2af9cd558583de616f2d286eca204 +part_04266_of_04320.hdf5 443ccc30c61797b0828d3e2c15b8a123 +part_04267_of_04320.hdf5 5776e15dfbde3c52b7b179666d486332 +part_04268_of_04320.hdf5 0234d049d41b7758487dc3c903f78abc +part_04269_of_04320.hdf5 6d917af51029c421ba699af5d8856016 +part_04270_of_04320.hdf5 91f82d74462ef5ce5f51c6c9e8795e6a +part_04271_of_04320.hdf5 56044a0f3795c23c9fad1c48720077e6 +part_04272_of_04320.hdf5 a370719fcb1658d2e23afc0a2aa2427e +part_04273_of_04320.hdf5 0f231177d2de34c317f50f1206b03632 +part_04274_of_04320.hdf5 191d49d69616d8ead6845a4355db5008 +part_04275_of_04320.hdf5 0b46368d317dbbeec28f7fe0c0fe43e5 +part_04276_of_04320.hdf5 c17b703366a42bf6acb07d1314bc24d7 +part_04277_of_04320.hdf5 a32dd264b3723270c3fb2f624458a97e +part_04278_of_04320.hdf5 513ed59f585d51a7c773c82cbe987f27 +part_04279_of_04320.hdf5 229cef03aa8fb8d33057ff1189196a42 +part_04280_of_04320.hdf5 2102666dcbbf86ba5b8b2e1db6dafa86 +part_04281_of_04320.hdf5 97783afc0f2960a8facb2055a3b4fd1b +part_04282_of_04320.hdf5 568089b7dea8fe6a05cfafef0be025ea +part_04283_of_04320.hdf5 d54012bc8eae80f60f4428ef002d2913 +part_04284_of_04320.hdf5 9168ae7a59359d0da51948a72d3a6220 +part_04285_of_04320.hdf5 843dff1e13aa52ae7ff163b0904c8d3d +part_04286_of_04320.hdf5 7b98123b9e895c09344ac43c9ef175ce +part_04287_of_04320.hdf5 98c6b10cbe8d01f54212c3283ec90e6d +part_04288_of_04320.hdf5 d36eeffb29f543b576ffb938f0c88d3c +part_04289_of_04320.hdf5 d6e0771d42e75a4339f18094a29e36fb +part_04290_of_04320.hdf5 852e87adad3cf4b91579eb6d150ded93 +part_04291_of_04320.hdf5 58eac227c78ffa1b660dd8e42c203051 +part_04292_of_04320.hdf5 39747b9b19b9575f37773bc5db5579d8 +part_04293_of_04320.hdf5 9776218fa66a91bdf5258317dfce928e +part_04294_of_04320.hdf5 55acbff973b8db68c42f3d272efcc93b +part_04295_of_04320.hdf5 65b2466a5d3eb35cbdf3eb52fcf47a5f +part_04296_of_04320.hdf5 44ab0caff01bce2a3ad1e9ba8aed8692 +part_04297_of_04320.hdf5 b1c757ce2fbe9e3f0ab4f094fbb8e352 +part_04298_of_04320.hdf5 ee5b942d471551bd4795cf3f9a841927 +part_04299_of_04320.hdf5 8582be6403d5910a4caf41988b7e9ae7 +part_04300_of_04320.hdf5 c90b3a15acd6fc503cc99a609f16383c +part_04301_of_04320.hdf5 741095f9bde59c1a2366ec3185c1f8e7 +part_04302_of_04320.hdf5 1640baf6f37ac61b8215895ce829f057 +part_04303_of_04320.hdf5 d3ba5eb8b1e1f1c85ca25410248aa39f +part_04304_of_04320.hdf5 1add5fdc720c520ba2ec54f6b88bd954 +part_04305_of_04320.hdf5 4af5c47bc539728421cb99a4ee436e94 +part_04306_of_04320.hdf5 6e62063ad415548daf84f956bbaef1e1 +part_04307_of_04320.hdf5 489ca988430f8d59c1d0f0369d1f5248 +part_04308_of_04320.hdf5 
03ccec05c12df20508ff90d8959ce9b1 +part_04309_of_04320.hdf5 155bac59ce2085513cefeb8f56bede0a +part_04310_of_04320.hdf5 3bc4250f03b44c358714eda0542d29af +part_04311_of_04320.hdf5 86c2b6108ee7161c24fe66864b48c2c5 +part_04312_of_04320.hdf5 ee014a26063d1cd79c23af081165acaf +part_04313_of_04320.hdf5 587f8f6096a63e1d2f20973977a2f20e +part_04314_of_04320.hdf5 0004f3a2e8720f7491468b128d6f275f +part_04315_of_04320.hdf5 ae4cdb58d5dbc59c477e35ab690a6019 +part_04316_of_04320.hdf5 d273804a20c4789331b38ae667c34fb7 +part_04317_of_04320.hdf5 7efe3d976ffc9831f076fc9b1c718c1d +part_04318_of_04320.hdf5 b05aa2704fe1028c132bb42b2bd27230 +part_04319_of_04320.hdf5 76985d700c2ea35390a32549492fe914 diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files.py new file mode 100644 index 000000000..321389c27 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files.py @@ -0,0 +1,150 @@ +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import h5py +import multiprocessing +import numpy as np +from os import path, makedirs +from tqdm import tqdm +import argparse +import logging + +parser = argparse.ArgumentParser( + description="Training data sharding for BERT.") +parser.add_argument( + '--input_hdf5_dir', + type=str, + default='hdf5', + help='Input hdf5_file path') +parser.add_argument( + '--output_hdf5_dir', + type=str, + default='', + help='Output hdf5_file path') +parser.add_argument( + '--num_shards', + type=int, + default=2048, + help='Number of output shards (default 2048)') +parser.add_argument( + '--max_seq_length', + type=int, + default=512, + help='The maximum number of tokens within a sequence. (default 512)') +parser.add_argument( + '--max_predictions_per_seq', + type=int, + default=76, + help='The maximum number of predictions within a sequence. (default 76)') +args = parser.parse_args() + +max_seq_length = args.max_seq_length +max_predictions_per_seq = args.max_predictions_per_seq +n_output_shards = args.num_shards +input_path = args.input_hdf5_dir +logging.basicConfig(level=logging.INFO) + +hdf5_compression_method = None + +input_files = sorted(glob.glob(input_path + '/part-00???-of-00500.hdf5', recursive=False)) +logging.info('n_input_shards = {}'.format(len(input_files))) +logging.info('n_output_shards = {}'.format(n_output_shards)) + +output_shards_dir = path.join(args.output_hdf5_dir,'hdf5_{}_shards_uncompressed'.format(n_output_shards)) +try: + makedirs(output_shards_dir) +except OSError as error: + logging.info('Output directory : {} already exists. 
Overwriting ...'.format(output_shards_dir)) + +ofile_prefix = path.join(output_shards_dir, 'part_') +ofile_suffix = '_of_{:05d}.hdf5'.format(n_output_shards) + + +# First pass over data to get sample count (read only the smallest array to get count) +n_samples = 0 +for ifile in tqdm(input_files, total=len(input_files)): + h5_ifile = h5py.File(ifile, 'r') + n_samples += h5_ifile['next_sentence_labels'].shape[0] + h5_ifile.close() + +# Find a "nominal" number of samples per shard (calculated to always go over by one shard size) +# Find excess samples in last shard and distribute removal of excess over first "N" shards (could be done over last, but it doesn't matter and math is easier this way) +# (since 0 <= excess < nominal_shard_size, the max imbalance will be 1 sample to minimize the straggler effect) +n_sample_per_ofile_nominal = (n_samples + n_output_shards - 1) // n_output_shards +n_excess = n_output_shards * n_sample_per_ofile_nominal - n_samples # Always a positive number +logging.info('Total number of samples: {}. Sample per shard {}/{}'.format(n_samples, n_sample_per_ofile_nominal-1, n_sample_per_ofile_nominal)) + +logging.info('creating {} output file handles. This could take a while.'.format(n_output_shards)) +ofile_handles = [h5py.File('{}{:05d}{}'.format(ofile_prefix, shard, ofile_suffix), 'w') for shard in range(n_output_shards)] + +ofile_idx = 0 # which output file +ofile_entry_idx = 0 # index into an individual data element of an output file +ifile_entry_idx = 0 + +n_samples_in_this_shard = n_sample_per_ofile_nominal - 1 +o_input_ids = np.ndarray((n_samples_in_this_shard, max_seq_length)) +o_input_masks = np.ndarray((n_samples_in_this_shard, max_seq_length)) +o_segment_ids = np.ndarray((n_samples_in_this_shard, max_seq_length)) +o_masked_lm_positions = np.ndarray((n_samples_in_this_shard, max_predictions_per_seq)) +o_masked_lm_ids = np.ndarray((n_samples_in_this_shard, max_predictions_per_seq)) +o_next_sentence_labels = np.ndarray((n_samples_in_this_shard)) + +for ifile in tqdm(input_files, total=len(input_files)): + h5_ifile = h5py.File(ifile, 'r') + + ifile_entry_idx = 0 + f_input_ids = h5_ifile['input_ids'][:] + f_input_masks = h5_ifile['input_mask'][:] + f_segment_ids = h5_ifile['segment_ids'][:] + f_masked_lm_positions = h5_ifile['masked_lm_positions'][:] + f_masked_lm_ids = h5_ifile['masked_lm_ids'][:] + f_next_sentence_labels = h5_ifile['next_sentence_labels'][:] + + h5_ifile.close() + + # This could be vectorized but keeping it simple due to lack of time + while ifile_entry_idx < f_input_ids.shape[0]: + if ofile_entry_idx == n_samples_in_this_shard: + ofile_handles[ofile_idx].create_dataset("input_ids", data=o_input_ids, dtype='i2', compression=hdf5_compression_method) + ofile_handles[ofile_idx].create_dataset("input_mask", data=o_input_masks, dtype='i1', compression=hdf5_compression_method) + ofile_handles[ofile_idx].create_dataset("segment_ids", data=o_segment_ids, dtype='i1', compression=hdf5_compression_method) + ofile_handles[ofile_idx].create_dataset("masked_lm_positions", data=o_masked_lm_positions, dtype='i2', compression=hdf5_compression_method) + ofile_handles[ofile_idx].create_dataset("masked_lm_ids", data=o_masked_lm_ids, dtype='i2', compression=hdf5_compression_method) + ofile_handles[ofile_idx].create_dataset("next_sentence_labels", data=o_next_sentence_labels, dtype='i1', compression=hdf5_compression_method) + ofile_handles[ofile_idx].flush() + ofile_handles[ofile_idx].close() + + ofile_entry_idx = 0 + ofile_idx += 1 + + n_samples_in_this_shard = 
n_sample_per_ofile_nominal + if ofile_idx < n_excess: + n_samples_in_this_shard -= 1 + + o_input_ids = np.ndarray((n_samples_in_this_shard, max_seq_length)) + o_input_masks = np.ndarray((n_samples_in_this_shard, max_seq_length)) + o_segment_ids = np.ndarray((n_samples_in_this_shard, max_seq_length)) + o_masked_lm_positions = np.ndarray((n_samples_in_this_shard, max_predictions_per_seq)) + o_masked_lm_ids = np.ndarray((n_samples_in_this_shard, max_predictions_per_seq)) + o_next_sentence_labels = np.ndarray((n_samples_in_this_shard)) + + o_input_ids[ofile_entry_idx] = f_input_ids[ifile_entry_idx] + o_input_masks[ofile_entry_idx] = f_input_masks[ifile_entry_idx] + o_segment_ids[ofile_entry_idx] = f_segment_ids[ifile_entry_idx] + o_masked_lm_positions[ofile_entry_idx] = f_masked_lm_positions[ifile_entry_idx] + o_masked_lm_ids[ofile_entry_idx] = f_masked_lm_ids[ifile_entry_idx] + o_next_sentence_labels[ofile_entry_idx] = f_next_sentence_labels[ifile_entry_idx] + ofile_entry_idx += 1 + + ifile_entry_idx += 1 diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files_to_varlength.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files_to_varlength.py new file mode 100644 index 000000000..b2fe8b502 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/chop_hdf5_files_to_varlength.py @@ -0,0 +1,154 @@ +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import h5py +import multiprocessing +import numpy as np
 +from os import path, makedirs +from tqdm import tqdm +import argparse +import logging + +parser = argparse.ArgumentParser( + description="Training data sharding for BERT.") +parser.add_argument( + '--input_hdf5_dir', + type=str, + default='hdf5', + help='Input hdf5_file path') +parser.add_argument( + '--output_hdf5_dir', + type=str, + default='', + help='Output hdf5_file path') +parser.add_argument( + '--num_shards', + type=int, + default=2048, + help='Number of output shards (default 2048)') +parser.add_argument( + '--max_seq_length', + type=int, + default=512, + help='The maximum number of tokens within a sequence. (default 512)') +parser.add_argument( + '--max_predictions_per_seq', + type=int, + default=76, + help='The maximum number of predictions within a sequence. 
(default 76)') +args = parser.parse_args() + +max_seq_length = args.max_seq_length +max_predictions_per_seq = args.max_predictions_per_seq +n_output_shards = args.num_shards +input_path = args.input_hdf5_dir +logging.basicConfig(level=logging.INFO) + +hdf5_compression_method = None + +input_files = sorted( + glob.glob(input_path + '/part-00???-of-00500.hdf5', recursive=False)) +logging.info('n_input_shards = {}'.format(len(input_files))) +logging.info('n_output_shards = {}'.format(n_output_shards)) + +output_shards_dir = path.join( + args.output_hdf5_dir, '{}_shards_varlength'.format(n_output_shards)) +try: + makedirs(output_shards_dir) +except OSError as error: + logging.info('Output directory : {} already exists. Overwriting ...'.format( + output_shards_dir)) + +ofile_prefix = path.join(output_shards_dir, 'part_') +ofile_suffix = '_of_{:05d}.hdf5'.format(n_output_shards) + + +# First pass over data to get sample count (read only the smallest array to get count) +n_samples = 0 +n_samples_written = 0 +for ifile in tqdm(input_files, total=len(input_files)): + h5_ifile = h5py.File(ifile, 'r') + n_samples += h5_ifile['next_sentence_labels'].shape[0] + h5_ifile.close() + +# Find a "nominal" number of samples per shard (calculated to always go over by one shard size) +# Find excess samples in last shard and distribute removal of excess over first "N" shards (could be done over last, but it doesn't matter and math is easier this way) +# (since 0 <= excess < nominal_shard_size, the max imbalance will be 1 sample to minimize the straggler effect) +n_sample_per_ofile_nominal = ( + n_samples + n_output_shards - 1) // n_output_shards +n_excess = n_output_shards * n_sample_per_ofile_nominal - \ + n_samples # Always a positive number +logging.info('Total number of samples: {}. Sample per shard {}/{}'.format( + n_samples, n_sample_per_ofile_nominal-1, n_sample_per_ofile_nominal)) + +logging.info('creating {} output file handles. 
This could take a while.'.format( + n_output_shards)) + +ofile_idx = 0 # which output file +ofile_entry_idx = 0 # index into the currently written data element of an output file +ifile_entry_idx = 0 # index into the currently read data element of an input file + +n_samples_in_this_shard = n_sample_per_ofile_nominal - 1 + +# open first output shard +ofile_handle = h5py.File('{}{:05d}{}'.format(ofile_prefix, ofile_idx, ofile_suffix), 'w') +input_ids = ofile_handle.create_dataset('input_ids', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) +segment_ids = ofile_handle.create_dataset('segment_ids', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int8')), compression=hdf5_compression_method) +masked_lm_positions = ofile_handle.create_dataset('masked_lm_positions', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) +masked_lm_ids = ofile_handle.create_dataset('masked_lm_ids', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) +next_sentence_labels = ofile_handle.create_dataset('next_sentence_labels', data=np.zeros(n_samples_in_this_shard, dtype="int8"), dtype='i1', compression=hdf5_compression_method) + +for ifile in tqdm(input_files, total=len(input_files)): + h5_ifile = h5py.File(ifile, 'r') + f_input_ids = h5_ifile['input_ids'][:] + f_input_masks = h5_ifile['input_mask'][:] + f_segment_ids = h5_ifile['segment_ids'][:] + f_masked_lm_positions = h5_ifile['masked_lm_positions'][:] + f_masked_lm_ids = h5_ifile['masked_lm_ids'][:] + f_next_sentence_labels = h5_ifile['next_sentence_labels'][:] + h5_ifile.close() + + ifile_entry_idx = 0 # reset input reading index + + while ifile_entry_idx < f_input_ids.shape[0]: + if ofile_entry_idx == n_samples_in_this_shard: # shard is filled up, prepare the next one + + ofile_handle.flush() + ofile_handle.close() + + ofile_entry_idx = 0 # reset output writing index + ofile_idx += 1 # next output file + + n_samples_in_this_shard = n_sample_per_ofile_nominal + if ofile_idx < n_excess: + n_samples_in_this_shard -= 1 + + ofile_handle = h5py.File('{}{:05d}{}'.format(ofile_prefix, ofile_idx, ofile_suffix), 'w') + input_ids = ofile_handle.create_dataset('input_ids', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + segment_ids = ofile_handle.create_dataset('segment_ids', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int8')), compression=hdf5_compression_method) + masked_lm_positions = ofile_handle.create_dataset('masked_lm_positions', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + masked_lm_ids = ofile_handle.create_dataset('masked_lm_ids', (n_samples_in_this_shard,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + next_sentence_labels = ofile_handle.create_dataset('next_sentence_labels', data=np.zeros(n_samples_in_this_shard, dtype="int8"), dtype='i1', compression=hdf5_compression_method) + + input_ids[ofile_entry_idx] = f_input_ids[ifile_entry_idx, :sum(f_input_masks[ifile_entry_idx])] + segment_ids[ofile_entry_idx] = f_segment_ids[ifile_entry_idx, :sum(f_input_masks[ifile_entry_idx])] + masked_lm_positions[ofile_entry_idx] = f_masked_lm_positions[ifile_entry_idx, :sum(f_masked_lm_positions[ifile_entry_idx] != 0)] + masked_lm_ids[ofile_entry_idx] = f_masked_lm_ids[ifile_entry_idx, :sum(f_masked_lm_positions[ifile_entry_idx] != 
0)] + next_sentence_labels[ofile_entry_idx] = f_next_sentence_labels[ifile_entry_idx] + + ofile_entry_idx += 1 + ifile_entry_idx += 1 + n_samples_written += 1 + +logging.info("{} samples wriiten.".format(n_samples_written)) \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/clean.sh b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/clean.sh new file mode 100644 index 000000000..e8a634a57 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/clean.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +filein=$1 +fileout=$2 + +echo "Further clean up: $filein => $fileout" + +cmd="cat $filein " +cmd+="| grep -v '^]*>$' " +cmd+="| grep -vE '\[\[Category:[^][]*\]\]' " +cmd+="| sed 's/\[\[\([^]|[]*\)\]\]/\1/g' " +cmd+="| sed 's/\[\[\([^]|[]*\)\]\]/\1/g' " +cmd+="| sed 's/\[\[[^]|[]*|\([^]|[]*\)\]\]/\1/g' " +cmd+="| sed 's/\[\[[^]|[]*|\([^]|[]*\)\]\]/\1/g' " +cmd+="| sed 's/\[\[[:]*[Ff]ile:[^][]*\]\]//g' " +cmd+="| sed 's/\[\[[Mm]edia:[^][]*\]\]//g' " +cmd+="| sed 's/\[\[[Ii]mage:[^][]*\]\]//g' " +cmd+="| sed 's/\[\([^]|[]*\)\]/\1/g' " +cmd+="| sed 's/\[\[\([^][]*\)\]\]//g' " +cmd+="| sed 's/alt=//g' " +cmd+="| sed 's/<\/doc>/\r/g' " +cmd+="| sed 's//\1/g' " +cmd+="| sed 's//\1/g' " +cmd+="| sed 's/<\, ref \([^<]*\)<\/ref>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's///g' " +cmd+="| sed 's///g' " +cmd+="| sed 's///g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/]*\)>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/]*\)>//g' " +cmd+="| sed 's/]*\)>//g' " +cmd+="| sed 's/]*\)>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/]*\)>//g' " +cmd+="| sed 's/<\/mapframe>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/poem>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/math>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/ref>//g' " +cmd+="| sed 's/]*\)>//g' " +cmd+="| sed 's/<\/div\([^>]*\)>//g' " +cmd+="| sed 's/<\/div style>//g' " +cmd+="| sed 's/<\/div>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/sup>//g' " +cmd+="| sed 's/
//g' " +cmd+="| sed 's/<\/br>//g' " +cmd+="| sed 's/
//g' " +cmd+="| sed 's/<\/BR>//g' " +cmd+="| sed 's/
//g' " +cmd+="| sed 's/<\/Br>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/del>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/nowiki>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/NOWIKI>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/onlyinclude>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/includeonly>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/small>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/chem>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/noinclude>//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/<\/gallery>//g' " +cmd+="| sed 's/{//g' " +cmd+="| sed 's///g' " +cmd+="| sed 's/}<\/graph>//g' " +cmd+="| sed 's/<\/graph>//g' " +cmd+="| sed 's/<\/references>//g' " +cmd+="| sed 's/]*\)>//g' " +# cmd+="| grep -v '^[ \t]*$' " +cmd+="> $fileout" + +# echo "bash -c ${cmd[@]}" + bash -c "${cmd[@]}" + + diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/cleanup_file.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/cleanup_file.py new file mode 100644 index 000000000..1d4e4d010 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/cleanup_file.py @@ -0,0 +1,83 @@ +# Lint as: python3 +"""Script to clean up input wiki dump for BERT input.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import glob +import io +import logging +import multiprocessing +import os +import time + + +parser = argparse.ArgumentParser(description='Wiki clean up for BERT.') +parser.add_argument( + '--data', + type=str, + default='./wiki_??', + help='Input files. Default is "./wiki_??"') +parser.add_argument( + '--input_suffix', + type=str, + default='', + help='Suffix for input files. Default is ""') +parser.add_argument( + '--output_suffix', + type=str, + default='.1', + help='Suffix for output files. Default is ".1"') +parser.add_argument( + '--nworker', + type=int, + default=72, + help='Number of workers for parallel processing.') +args = parser.parse_args() + + +def process_one_file(one_input): + """Remove tag and title of pages, for one file.""" + input_filename = one_input + args.input_suffix + output_filename = one_input + args.output_suffix + logging.info('Processing %s => %s', input_filename, output_filename) + + with io.open(input_filename, 'r', encoding='utf-8') as fin: + with io.open(output_filename, 'w', encoding='utf-8') as fout: + + keep_next_line = True + for line in fin: + if not keep_next_line: + keep_next_line = True + continue + + if '' in line: + continue + + if len(line) == 1: + continue + + # line = line.replace('', '').replace('', '') + + fout.write(line) + + +if __name__ == '__main__': + input_files = sorted(glob.glob(os.path.expanduser(args.data))) + num_files = len(input_files) + num_workers = args.nworker + logging.basicConfig(level=logging.INFO) + logging.info('Number of input files to process = %d', num_files) + + tic = time.time() + p = multiprocessing.Pool(num_workers) + p.map(process_one_file, input_files) + toc = time.time() + logging.info('Processed %s in %.2f sec', args.data, toc - tic) diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_fixed2variable.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_fixed2variable.py new file mode 100644 index 000000000..afe3db06b --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_fixed2variable.py @@ -0,0 +1,71 @@ +# coding=utf-8 +# Copyright (c) 2019-2021 NVIDIA CORPORATION. 
All rights reserved. +# Copyright 2020 MLBenchmark Group. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + + +import argparse +import time +import logging +import collections +import h5py +import numpy as np +from tqdm import tqdm + +parser = argparse.ArgumentParser( + description="Eval sample picker for BERT.") +parser.add_argument( + '--input_hdf5_file', + type=str, + default='', + help='Input hdf5_file path') +parser.add_argument( + '--output_hdf5_file', + type=str, + default='', + help='Output hdf5_file path') +args = parser.parse_args() + +logging.basicConfig(level=logging.INFO) + +if __name__ == '__main__': + tic = time.time() + print(args.input_hdf5_file) + h5_ifile = h5py.File(args.input_hdf5_file, 'r') + num_examples = h5_ifile.get('next_sentence_labels').shape[0] + +# hdf5_compression_method = "gzip" + hdf5_compression_method = None + + h5_writer = h5py.File(args.output_hdf5_file, 'w') + input_ids = h5_writer.create_dataset('input_ids', (num_examples,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + segment_ids = h5_writer.create_dataset('segment_ids', (num_examples,), dtype=h5py.vlen_dtype(np.dtype('int8')), compression=hdf5_compression_method) + masked_lm_positions = h5_writer.create_dataset('masked_lm_positions', (num_examples,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + masked_lm_ids = h5_writer.create_dataset('masked_lm_ids', (num_examples,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + next_sentence_labels = h5_writer.create_dataset('next_sentence_labels', data=np.zeros(num_examples, dtype="int8"), dtype='i1', compression=hdf5_compression_method) + + for i in tqdm(range(num_examples), total=num_examples): + input_ids[i] = h5_ifile['input_ids'][i, :sum(h5_ifile['input_mask'][i])] + segment_ids[i] = h5_ifile['segment_ids'][i, :sum(h5_ifile['input_mask'][i])] + masked_lm_positions[i] = h5_ifile['masked_lm_positions'][i, :sum(h5_ifile['masked_lm_positions'][i]!=0)] + masked_lm_ids[i] = h5_ifile['masked_lm_ids'][i, :sum(h5_ifile['masked_lm_positions'][i]!=0)] + next_sentence_labels[i] = h5_ifile['next_sentence_labels'][i] + + h5_writer.flush() + h5_writer.close() + + toc = time.time() + logging.info("Converted {} examples in {:.2} sec".format(num_examples, toc - tic)) \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_tf_checkpoint.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_tf_checkpoint.py new file mode 100644 index 000000000..2a2568e26 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/convert_tf_checkpoint.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +pkg_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.append(pkg_path) + + +import torch +import argparse + +from model import BertForPretraining, BertConfig + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument("--bert_model", default="bert-large-uncased", type=str, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") + parser.add_argument('--tf_checkpoint', + type=str, + default="/google_bert_data", + help="Path to directory containing TF checkpoint") + parser.add_argument('--bert_config_path', + type=str, + default="/workspace/phase1", + help="Path bert_config.json is located in") + parser.add_argument('--output_checkpoint', type=str, + default='./checkpoint.pt', + help="Path to output PyT checkpoint") + + return parser.parse_args() + +def prepare_model(args, device): + + # Prepare model + config = BertConfig.from_json_file(args.bert_config_path) + + # Padding for divisibility by 8 + if config.vocab_size % 8 != 0: + config.vocab_size += 8 - (config.vocab_size % 8) + print('padded vocab size to: {}'.format(config.vocab_size)) + + # Set some options that the config file is expected to have (but don't need to be set properly + # at this point) + config.pad = False + config.unpad = False + config.dense_seq_output = False + config.fused_mha = False + config.fused_gelu_bias = False + config.fuse_qkv = False + config.fuse_scale = False + config.fuse_mask = False + config.fuse_dropout = False + config.apex_softmax = False + config.enable_stream = False + config.unpad_fmha = False + config.pad_fmha = False + config.fused_bias_mha = False + config.fused_dropout_add = False + config.fused_bias_fc = False + if config.fuse_mask == True: config.apex_softmax = True + if config.pad == False: config.enable_stream = True + if config.unpad == True: config.fused_mha = False + + #Load from TF checkpoint + model = BertForPretraining.from_pretrained(args.tf_checkpoint, from_tf=True, config=config) + + return model + +def main(): + args = parse_arguments() + device = torch.device("cuda") + + model = prepare_model(args, device) + + torch.save({'model' : model.state_dict() }, args.output_checkpoint) + + +if __name__ == "__main__": + main() + diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data.py new file mode 100644 index 000000000..73cff8892 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data.py @@ -0,0 +1,455 @@ +# coding=utf-8 +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Copyright 2020 MLBenchmark Group. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +import h5py +import numpy as np + +hdf5_compression_method = None + +#flags = tf.flags +flags = tf.compat.v1.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + h5_writers = [] + + expected_instances_per_file = len(instances) // len(output_files) + 500 # Over-allocation to avoid resizing + for output_file in output_files: + h5_writers.append({ + 'handle' : h5py.File(output_file + ".hdf5", 'w'), + 'input_ids' : np.zeros([expected_instances_per_file, max_seq_length], dtype="int32"), + 'input_mask' : np.zeros([expected_instances_per_file, max_seq_length], dtype="int32"), + 'segment_ids' : np.zeros([expected_instances_per_file, max_seq_length], dtype="int32"), + 'masked_lm_positions' : 
np.zeros([expected_instances_per_file, max_predictions_per_seq], dtype="int32"), + 'masked_lm_ids' : np.zeros([expected_instances_per_file, max_predictions_per_seq], dtype="int32"), + 'next_sentence_labels' : np.zeros(expected_instances_per_file, dtype="int32"), + 'len' : 0 }) + + writer_index = 0 + + total_written = 0 + + features_h5 = collections.OrderedDict() + + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + h5_writers[writer_index]['input_ids'][inst_index] = input_ids + h5_writers[writer_index]['input_mask'][inst_index] = input_mask + h5_writers[writer_index]['segment_ids'][inst_index] = segment_ids + h5_writers[writer_index]['masked_lm_positions'][inst_index] = masked_lm_positions + h5_writers[writer_index]['masked_lm_ids'][inst_index] = masked_lm_ids + h5_writers[writer_index]['next_sentence_labels'][inst_index] = next_sentence_label + h5_writers[writer_index]['len'] += 1 + + writer_index = (writer_index + 1) % len(h5_writers) + + total_written += 1 + + if inst_index < 20: + tf.compat.v1.logging.info("*** Example ***") + tf.compat.v1.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + print("saving data") + for h5_writer in h5_writers: + my_size = h5_writer['len'] + h5_writer['handle'].create_dataset('input_ids', data=h5_writer['input_ids'][:my_size], dtype='i2', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('input_mask', data=h5_writer['input_mask'][:my_size], dtype='i1', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('segment_ids', data=h5_writer['segment_ids'][:my_size], dtype='i1', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('masked_lm_positions', data=h5_writer['masked_lm_positions'][:my_size], dtype='i2', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('masked_lm_ids', data=h5_writer['masked_lm_ids'][:my_size], dtype='i2', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('next_sentence_labels', data=h5_writer['next_sentence_labels'][:my_size], dtype='i1', compression=hdf5_compression_method) + h5_writer['handle'].flush() + h5_writer['handle'].close() + + tf.compat.v1.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from 
raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.compat.v1.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. 
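+          # Note (descriptive comment, not in the upstream script): if all 10
+          # draws below happen to return the current document (only possible
+          # for a very small corpus), the loop simply falls through and the
+          # "random next" segment is taken from the same document.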
+ for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + cand_indexes.append(i) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + if index in covered_indexes: + continue + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
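+    # Descriptive note: each pass trims one token from whichever sequence is
+    # currently longer, dropping its first or last token with equal probability,
+    # until the pair fits within max_num_tokens.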
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.compat.v1.gfile.Glob(input_pattern)) + + tf.compat.v1.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.compat.v1.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.compat.v1.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.compat.v1.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.compat.v1.app.run() diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data_wrapper.sh b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data_wrapper.sh new file mode 100644 index 000000000..a31f97111 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/create_pretraining_data_wrapper.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +INPUT=${1} +OUTPUT=${2}/$(basename $INPUT) +VOCAB=${3} + +python3 ${SCRIPT_DIR}/create_pretraining_data.py \ + --input_file=${INPUT} \ + --output_file=${OUTPUT} \ + --vocab_file=${VOCAB} \ + --do_lower_case=True \ + --max_seq_length=512 \ + --max_predictions_per_seq=76 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=10 diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_gather.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_gather.py new file mode 100644 index 000000000..0227924bd --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_gather.py @@ -0,0 +1,95 @@ +"""Script to package BERT dataset into files with approcimate size. + +Copied and modified from https://github.com/eric-haibin-lin/text-proc.git +""" +import argparse +import glob +import io +import logging +import multiprocessing +import os +import time +# import collections +# from multiprocessing import Pool + +parser = argparse.ArgumentParser(description='BERT data packaging') +parser.add_argument( + '--data', + type=str, + default='~/book-corpus-feb-stn/*/*.txt', + help='Input files. 
Default is "*.txt"') +parser.add_argument( + '--nworker', + type=int, + default=1, + help='Number of workers for parallel processing.') +parser.add_argument( + '--out_dir', + type=str, + default='~/book-corpus-large-gather/', + help='Output dir. Default is ~/book-corpus-large-gather/') +parser.add_argument( + '--num_outputs', type=int, default=500, help='number of output files') +parser.add_argument( + '--input_suffix', type=str, default='.3', help='Suffix for input filenames') +parser.add_argument( + '--block_size', + type=float, + default=32.0, + help='Block size for each output (MB)') + +args = parser.parse_args() + +input_files = sorted(glob.glob(os.path.expanduser(args.data))) +out_dir = os.path.expanduser(args.out_dir) +num_files = len(input_files) +num_workers = args.nworker +logging.basicConfig(level=logging.INFO) +logging.info('Number of input files to process = %d', num_files) + +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + +def worker_fn(x): + """Workload for one worker.""" + file_split, worker_id = x + count = 0 + out_file = None + total_size = 0 + for in_path in file_split: + in_file = io.open(in_path + args.input_suffix, 'r', encoding='utf-8-sig') + curr_size = os.path.getsize(in_path) + if args.block_size * 1024 * 1024 < total_size + curr_size: + out_file.close() + out_file = None + count += 1 + total_size = 0 + if not out_file: + out_path = os.path.join( + out_dir, 'part-{}-of-{}'.format( + str(count + 1000 * worker_id).zfill(5), + str(args.num_outputs).zfill(5))) + out_file = io.open(out_path, 'w', encoding='utf-8') + total_size += curr_size + content = in_file.read() + if content[-1] == content[-2] and content[-1] == '\n': + content = content[:-1] + out_file.write(content) + + +if __name__ == '__main__': + p = multiprocessing.Pool(num_workers) + + # calculate the number of splits + file_splits = [] + split_size = (len(input_files) + num_workers - 1) // num_workers + for i in range(num_workers - 1): + file_splits.append((input_files[i * split_size:(i + 1) * split_size], i)) + file_splits.append( + (input_files[(num_workers - 1) * split_size:], num_workers - 1)) + + tic = time.time() + p.map(worker_fn, file_splits) + toc = time.time() + logging.info('Processed %s in %.2f sec', args.data, toc - tic) diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_sentence_segmentation.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_sentence_segmentation.py new file mode 100644 index 000000000..e907c0920 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/do_sentence_segmentation.py @@ -0,0 +1,74 @@ +"""Script for sentence segmentation. + +Copied and modified from https://github.com/eric-haibin-lin/text-proc.git +""" +import argparse +import glob +import io +import logging +import multiprocessing +import os +import time +import nltk + +from nltk.tokenize import sent_tokenize + +parser = argparse.ArgumentParser( + description='Sentence segmentation for BERT documents.') +parser.add_argument( + '--data', + type=str, + default='./*/*.compact', + help='Input files. Default is "./*/*.compact"') +parser.add_argument( + '--input_suffix', + type=str, + default='.2', + help='Suffix for input files. Default is ".2"') +parser.add_argument( + '--output_suffix', + type=str, + default='.3', + help='Suffix for output files. 
Default is ".3"') +parser.add_argument( + '--nworker', + type=int, + default=72, + help='Number of workers for parallel processing.') +args = parser.parse_args() + +# download package +nltk.download('punkt') + +# arguments +input_files = sorted(glob.glob(os.path.expanduser(args.data))) +num_files = len(input_files) +num_workers = args.nworker +logging.basicConfig(level=logging.INFO) +logging.info('Number of input files to process = %d', num_files) + + +def process_one_file(one_input): + """Separate paragraphs into sentences, for one file.""" + input_filename = one_input + args.input_suffix + output_filename = one_input + args.output_suffix + logging.info('Processing %s => %s', input_filename, output_filename) + with io.open(input_filename, 'r', encoding='utf-8') as fin: + with io.open(output_filename, 'w', encoding='utf-8') as fout: + for line in fin: + if len(line) == 1: + fout.write(u'\n') + sents = sent_tokenize(line) + for sent in sents: + sent_str = sent.strip() + # if sent_str: + fout.write('%s\n' % sent_str) + fout.write(u'\n') + + +if __name__ == '__main__': + tic = time.time() + p = multiprocessing.Pool(num_workers) + p.map(process_one_file, input_files) + toc = time.time() + logging.info('Processed %s in %.2f sec', args.data, toc - tic) diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval.md5 b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval.md5 new file mode 100644 index 000000000..bf8d906ce --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval.md5 @@ -0,0 +1,10000 @@ +e4515c4f17370f7418328077c61404a3 +cb03ca49918da9884997c2437b74a2e5 +947ae6147770e5bd1bf37343387b72cc +781dad6ca9b16d785c9e48b41aecee19 +8ad954fd2e7c2c19f8338a6ef7f73eff +0cc0c81a9735b7e92795417b9a3905da +508f75ff13c377310ca70d43f6c61c45 +093d706899aad4f1567b97fd6d04c0cd +66c7570301df7b632362467a35dfafbf +c696bb084ed1209a89e947a57a8324fe +27c9733c912554d505374bb7a49c4160 +a815f46241379e61b69e3e82dd32aa2d +e8e28c3378e15d45606dd81213c2e5b7 +faf45e144913e73da53b2d7e9458d667 +abd94678805debd2f677114a76b78961 +0864f74b0d6b996add81d55cf7545081 +3ed0d5bd91f4e34558ca5ebafb360b33 +873d73cf3c4c6f0d96d6595501c1cd32 +7c36b59496cadba791102077305a89e7 +c92292ffb66323be2988f89b3f3be98e +7fa33e23cc71b244c23f8ccd5ccd8cc6 +d55ca45c41bb79f86f7c9f4aa12b43de +8229bbbfc50c1608ff52d55702b0d0e5 +08e339d278b7f050e53dc2fd78ecb1ce +7fefe3b13b9fce509a80717e60fe17e8 +c5907568ba53dd66b251534053db2739 +3839dde112a5b5c09f2f5db5aec8029b +79d48014146c86acdfd5283289690460 +5d0ccd19e05191750b3f7bbaac0c775f +af2a1ef225b4d60a0bf03de84e354ba1 +ecee3ee49e2aaf774b876141c91bdc7e +957d907f0355485156c3a8ce0920a8ed +019d4de0e3d639592d1eb2e3903d5185 +a8478a88c5eb833e482a400987bf3a79 +d41d8cd98f00b204e9800998ecf8427e +7725cc5b58ef6bef0a6e00a8382e52d5 +9aa449b2265a1c3229ab54eeb51836d9 +f8d7b4f8bf3e62b4df4047692e5efa15 +b95fa421432461502f066f1191e3d1b4 +bb05863b1bdf727acd53ab8dcc589c42 +8aeab1332f832a1809435dbb53ece62c +3aae29ba345110afd9f62ee192bcee29 +a2474f9c45ffe966159e75743120cfb8 +c2ede108de22e546ac1a5db1f2ecb07a +4245c87d5c7a74221436d1fd072339c2 +ae148f40ac9e9134066340b5dd834a89 +3f80b3c7c9bd50aa061f159de1a9661b +9606087adc65a5070f3a4eeb23f1f0b3 +ecaa1fb034959fac41e3a2f0379e1c99 +05c1439f860223cd46b0c054ae18ca60 +8fc712a435c38c9eadcd745b4e148aa3 +ffc0af255030c5b2322adaadaeb2e22b +53ef3f7f97bb613ebbfdbb95c0dac251 +9c4c201b2e9f60d189836bdd0d2010ad +3e48e71ce6c1a5fd03466b07a82f796b +2054f64f1263720188d261f499f51cc3 +064041e2aab298edf9a62b98e7657a8f 
+098428953d35ff5a9870236a4411a8cd +016eb5248d0ae5c200ed41a1bb142404 +032a04e7a239c16d508f48701adeb78f +9df0a8c8d405fc0f935b6168428e76d5 +ac54c8ba0a81a5e027b097511b5c5c6e +eef5ff65e33e370dd491140307878ca5 +a4ca2f9cbfbfb25f006fc219a4939ccd +2b82d1490d965e7bcac58bab206555a3 +c97ee56278c01d4edf4530eeeab0df1d +93c24ad4bc93fe8e9bdcbd78eed6107f +9985eae5f5f1c9ba64740bb784755e29 +2d13f6fedc9fb5b2f91065a8e33c2f94 +b4807511692724fb83a76a39bbde141e +b916b3a8025e512011338c0ed40bef87 +6edd12d4c8df99817b445b9e35300633 +1bc92b1a22d261bff24dc899c195bd68 +a390540b707796ec07abfa010c48ef44 +2a6384f009123e06d7b6b644754a314c +020638edf98cba4089ff7e22fa3ba4ab +9d6403692ee427a8c2de5b85cced6cf1 +f35a18f8c5ba7dd5b745bde291457d2b +5ade9b817973e904e60908aa71b22f64 +761cff770ed4fa2b7d8a1dd2e74d571f +018bf65cb09a59ea9297f99eea072f91 +699444530a0b002e57e1e61540eed121 +ef4b196605cc91fbc6ba84ccd3c8d19b +3d49bf243d85ffe5449b01824d014411 +9ef43abbd82ae12f699998caa4ad6705 +1e76898011cfe256d4d7ef7df8edac7a +1463da0e0c51c48c4afa7c80784e24c2 +79c1caeeeaa6b839da0c9a8659ef4b53 +c10981aeabd6a45e94bbd99f488de599 +d3ef7dd596b820cba948b53ce11830b8 +d41d8cd98f00b204e9800998ecf8427e +f986d2b16c0e7630663d5babde8a9684 +a8c8fb3f07c6ec2ba9dc152e689fd4da +f947ef9aa46502051180a8bae8da569c +d41d8cd98f00b204e9800998ecf8427e +c6aaaeaa929e9bf26d83bb62166af8ff +ec2508d810abfd922d52fa13fa9794e4 +6009ffa7a814784a6f5d8dd57081969d +d18412fb659feb7e7ec3178ac6fa988d +54d31b471b0e19a10cbc1db8957f9dd0 +7116f875e5605503152c21cfee2b5ace +b501813e40d2d0c391bd8875f9d85b73 +63eef4b11fc02b250d4ea5aba63d8a4a +1d5ba0becef41d894cedbc9fb069d74a +534d041f991e09a4eaf14e33e418e3ab +35db1856e7f5353adfb70f9656efcf40 +ec522cf077272d135336d27181f5276c +964be8dd1164c1c6a22ef7cecb1aea47 +22454d415afb7ea0c6b91103b4e92df3 +ddb46101921bbe3f3fc5ac8106ef83ef +f81d9ce95913381ddb2c381a23463155 +d17ed0849cb434015bc20be31bbe06a8 +4cc5aec4147e58f3354773f288787004 +3be9098148759f82680147ddb2c468a1 +a8d04e89a0e423f1dbd5caa245909dd2 +af680b47bc38ff23a095affcda726184 +65a654f0ddec814b22b2eb05cf10fb4e +88edc883306eeaa828a1b0845520625c +4c6a47e7d2662edcb60bc7d00e5c9002 +5b1f8dbcd1e5a6f35494ef05a892e955 +040b2d46fe48dcd8d463df2632f9b738 +b914bc4089ca48de29ada6dd6033aaf6 +9595c22d5193ac583d01c33a3822b271 +ec1361c2fedb4ee25141f5bd5eb4f6c5 +d41d8cd98f00b204e9800998ecf8427e +753b1988966fb3e838b7afb8f4559ce6 +d41d8cd98f00b204e9800998ecf8427e +b99dba26a6d078073dc73bf9795adbee +5d1974e3969cf73aee3f7d7e42a6629b +dc0662669ee4fd2c40d9b983e71bc7af +d9901bfb178222624e1e2a80d3088bc5 +e9f0e33872c02e920740715c47740f7a +e6557cd111ebd0a2fb697dd32ea19176 +27f9674cbd3cc9431f61331c6bcbaed0 +0d4d1dc106bf6227d3300344245a2146 +f1b4297bc0250b25096f1379dc4bd8e0 +55062b58c270da524b0271c93e394f68 +668c1e6e35882914f9650e9fe4109d23 +15ba44fdc857517693d02559580b217c +6eafbccd38860664056c6a3f998d824a +9d21d99ed9ec5c08b84fd740106f94b2 +cd67fb2fb2683fe7860431ec9fc996cf +bc4dd130ceb1a4bef2a580b2667b48d5 +604deb1bda61485410d19df98dd590b3 +1461663d2edead70b8abd5b6d3fd229f +29667235686a8fb1a9fa03f17196f3dc +e20f6d3c5b32566b7051c0e4d263c494 +bd0732d28992eaa26a2da493591ca005 +58aeeb716a2eff3a79fd8f096ed24b45 +77481e625067aab9752695a9312cee9d +006cedf8a3b672294e8b2f7d182dd93f +e779b8c927a3a394ff8197bb5faf613c +9d94c9483569788ff622d5d55c6a16f7 +4ed9de010c0c155e1dbab045d77c2ba2 +e316264a234329159e10e40dabc93df3 +808b27e73c570483018c92c2162e4b2b +52bc89b2ba44cef4aa6eaba996866aad +2b6b61fafb17b328b58403064b04aae7 +225e512992a60fafd5645f5f67b03a73 +774863c0375af2ddd825748995f0e274 +6876e1411bd225590016cb2714725633 
+c4d392c5579054189574ca584fb3138e +f930d4ae7529362d13945837298ab5d0 +e33e79f6b4944aabb3bca8e019141399 +bba5b787f342c232cf9e551743b174e3 +21e67e9baec6c7c1aca1de379e9d23f8 +6c0eeb70ebf13912673f414b5d269f8b +1a6b3ba2b2c7b1ae8a5bd8463182dcf0 +426adede287982223630b652883e6bda +f31b2ce4af2dab13cb1fcc13bc555eb9 +4c40c47d349aa4dcea2354ebc992a5a8 +86f4a9562a2f5e97e96562bade3ef632 +afedebe0f16adb819cf3280b9f9623cb +c52f3943565bcb9f7ab08dcae6847809 +63c2e4ef68162c6a636f1be566c5be14 +936ccf111900cbcd385138426ccda5a5 +8911115e59be8fe9108cc925fba25842 +b464e74bb5d59b48cadbabb6a4b4774e +b416f81909fc841ce59fefbeefc138cb +9e0c5c3a02617ef7653a1312aff1bff1 +5f6828ff6b2c74743980fcef36f0ffbe +71a2ad5484b41fd7433c4b52ba4263af +431df30f62aaa5912ddc7e434c3c7648 +6edb323fd74a5ae5aabc2b9856d46943 +fddce22bdc15c2e7ad282f8a51900cf7 +1ad30e25b803d53e55eb54fc90799eba +2df58b441666be9cac5820dfd94393fd +75a558848969278aad8a05a00a3bb93f +0a2c8395428cd4b12376a738d6e437a2 +63fdf2b4e6f69ee5c8249fce65700b07 +d89a9ea021fc6b8c71cd6f2d95582420 +2202e0c69a133395e20ce3e08ba34a47 +d986900b8f454760c2c55f42b48b37c5 +8410842a5dfbd08a9c86a0b907234b5a +278c91b2999d0fbea42d6c0222225cfe +f343fad3bbd44cfec27dab2f10da9055 +c3829aee4c01c7eac45cec55671689ed +d41d8cd98f00b204e9800998ecf8427e +e789eeae15abe40d5f740cf5ac1cca1f +d41d8cd98f00b204e9800998ecf8427e +312a20cd247a895da8ed2ed738061e8d +b83f210c10a3191682ebb258391cef23 +19a46d859cafde19a3619f17c6b05d9c +66ea898ea65d6ad8afb2421c3f3cb55d +cc47bfda019692e4ac18f22aa1a0a2ba +a3f38e24c3a2532139d18775376635df +ff72aa70914862ee3926db7238784837 +656237a3d0507e3cb556b59bdbefde72 +05a11c9a9e1c43ddc4d5c76aed989fb7 +20a2805417aff9283210b15ab58a806d +515cb5a0acd2e49c81f3a47beec36db8 +ccf156efe4945c5feb044f89838aa3eb +15813c22904d421b7fdb4181a807b568 +ae5e656012ab5fb962fc082b7f4f6810 +c32a4c5775fce9f405c494c8283b88c7 +bcfe6e00605070ed51b24519b956745e +437302b9025c1d7122dacf5b14d817ae +e4fa5ae547a003e7a8ef5b5f7d1ec136 +d086ce8bf6be4eb7365bf09c2ef60824 +094b05008c48d3b9ca3364dd40efea38 +c94c670ebd72c4ce9764a5a1465bda12 +b02d1bcb635edffb446b10cf5e7d1c8e +f30447455d05f41b978edfbbfb13bb47 +1e3312ba7dc4546682a75e6d3598cf8f +e54e28dc37d204061009fbba08a06cef +45cf1d8bab0fe8ab2b53c3ac228f0385 +085e4ed6cdf29a52f18d8f9313edba94 +43b7db299dc33eb4b2cf049ba9e23396 +ee2a8929295253a84bd4516bf04bf797 +fed5f9e9760e22baa74bf87920e39370 +7be2397c410ca8835fbbac82d60be419 +8476bead7c2d42a7fb7ed56459157127 +98264659f5bbbaf7ff7055b93906ce3f +5174844a4a993bf74258d4d0f99de007 +7f2978743e65fc3eca33d9a38ec6bedd +d19b338a37b41c7be1e10aaa9f7d745d +6557255f147977f46ff3dc1e133056e5 +0effdfeea2be8a3b97487d18e1299f9b +59023eb3b3a9c2da69f62417770ec0e5 +ebdd2fc42a5a8f303ed1dd4a39211bf1 +fe76154177e384de429a822722c9357f +b3a475c2906e4819ae34a26e17b4edd6 +192a276e59f8e7c704da640bbd04a074 +8b967d38b27e2be153f409d48993772a +a501f0b3f9976145dc5bf807fbc3bc06 +1a221931b038a7e64cb07646eff64800 +34325ba1f722c24885014dc129777978 +9b80623fd97fec7407aa278bb3a50cdc +d41d8cd98f00b204e9800998ecf8427e +ec39df3e9daeead9896161c875bd81f2 +f472fd836a61aada647b5b2e67221408 +5f69add393d9428fe00a62998ba353c1 +f4b3f8d7db994e7cfe85d25ab743664d +d41d8cd98f00b204e9800998ecf8427e +8f306bc0e4fb3748e6ffe4898381cc5c +e5fbe6a5a4f6bc8a325997e9e1aa9502 +2e075cc9489734f06b8d2a3276576f9b +fa7122a9dfbc37da5fdb7324d417b2f8 +b9c1d9712de9df894c76107e1306d426 +a5c4f330acf15f51675847f1601a0731 +96309cb58434a47cd46a5291cfba9fe4 +03cf776c1c231ee508de58443b5af3dc +fa13e4e7a63b15bfd4cefc227c23d5d3 +d4c373061c702eb0f9450d36f989fda2 +c44efc9db4fa10c874f87940fd5f2186 
+098dab377d378ab5f7b8b8e9600877f6 +dc1f7534be4f742ed9308c202ae983f7 +74dc0736489450fe528899e5bd36b5c8 +62c5d1ffb2b90ebe38be062455ddfef2 +518a635ed54c5be03fac42ceb6787c93 +cd57a4149cc20cf89eb36e0467e54a14 +cbc2ea589b483d4eb9ff9981b7b46546 +cd058b487175ad6198ab71b97747b88f +b89661a4a93a19b604a4370e7c1415c6 +3dfbb0dff0a310adeafcf5f874bbdb60 +63b61b32562cc5e505dc5b52058bbc50 +d49c382df9f3b422586caf2641ad7417 +c009eca72953fb9eb19c1c8d0b565d3b +3c84a39a94c49c7a093fb7a53d2f3ae6 +6a192ead9e794bfa54ad2aa1118b73a1 +1ca5b76eb95b94e7b146eb56ef7669e7 +0089643155593021a5ef21c170efbe62 +d7abaee2bccdebe037c197dcb602c6a8 +8fc7ea7fbc9214a3e39d299d3cc327c4 +37e3620bb29c2afc4de62d5c8b7670db +e32b44d46134781dc00b48aee5eaf6b0 +770102c7fb9f421ea6e05f271592e7bb +c5afff1c59d7bd7bbeb1ad1fcfaca341 +68cdd34a8c339004d589a48ce50aec9a +a135132c70e892a9308f1a220cee2d33 +d41d8cd98f00b204e9800998ecf8427e +073bfbae3e44fc720f757f34371aa94a +65b6c0d940d8cd6e34eb8560ac233bef +80aaea68a8cd5de7fac850b14b0fe4b1 +554f3e4066996237c60398d3fc753455 +d0d4bfd6d5542b8ef96d6bcda9d5f9af +b54cfb7320a19f0edecd9f57a0239332 +aa7cd0556d561a631ffec8b52d78a808 +2858e3f57dd00fb04ba0da40bae28b8e +91c0767943996300864fb6103bf95a7d +b3a703894308af2cef638f2adf0ef00a +60f3ccaf4bdd5695c62bcc8cd8aed4d6 +156d6a3d9560b46f42f5d87164a3d6ac +d41d8cd98f00b204e9800998ecf8427e +aacc5d04213714249e410a913f551a63 +a1b8304bb1a984b40c78eca26beb5ead +c1307e7e4e50c63090a8159e0da80497 +85fcbca78a15e98792804321cdc2a50f +d49e910a45ee6484db7cb8021f93d33b +b2e903b9e7829b192170a24829e2d326 +1c1300eb1ae2b48e83416719af7e100b +51873e473ef17feb4126d178dc55771f +b78ea6cc53e857a214c09dbd51438cd6 +4680e91b20b3ff40f743e8f4b8754276 +17fba0d6586779a7a24208b24dd877b2 +ba26b02a234d244f8a241604a01828c8 +d6ec644568fa8cf970feddac1e6bd9bd +4baa8bc4221b10b42e2ed4f8221bd020 +06186e80a74d442cd1d83531b7247042 +354132687d3abc6deb2240265e603ecc +13f2b3f46942eea3ae8f7b2e49f6045c +e2d3b487b0c799f42f27df384871f57f +df3afec2b85bb48cae4eae9ffc31ca93 +3d026abb6db4720623779fa2bb445e03 +aeed47a9c6db4c3a1364426e450c4619 +56db6c77ec92cefb24b947b0b2b820a0 +5f9c18ba59498b2313eaa820a002910b +e4b895468236acb386b9df445e88ea50 +ef68a3340a38baa8efbef9e236ca49d9 +a2b957f3796fd29e8a6109554ca1fe59 +e24d33c57485a446868f34809a3d2ae5 +97dbc9662f2b70ad1037d207e9cdfa2b +5046847a27f90f4fdc451923704fdb5e +321dc9d429f5054602da66b6167a03d3 +b4e5dfe8456f18dbfca49e6b20b64516 +37056ea65add73a53d6eead4e5cfca0f +2501042a5c19cfc6df00c2d891ee4be0 +3796df2afb9ed593698188715a93c0af +7278c9cb56f676c56fc135b17e008c2c +5a5c9b8fc09d372e18438ba3db170a00 +75cce23f7cb8f808100ef81187f767a1 +e58469bd1462381549127d3bb0802914 +ce933fb9bf3b0f572f868b01353555dc +b9dffca7d3df1e8d671343654beac150 +b12a598aad8fcfd3c3ca9f1acce858ee +9c85e1990554ea2ae59d31f71a5b8f8f +dc43b246fe073cb48e3ed2b71d4e18fc +174a3b0da8b993e49e1cb2c104d0d225 +c3b5d8d53249474c16e2efaab88d34df +d61d644a96891375b6920c4316537b46 +685569e48f0556b2c45ef95bc03b1355 +f8eb504e31e2ea3e890b6d3e929d1211 +1f7cc5a81f3b3dcd21fb51b97d22763d +841abac714f2a878d0c1434846217fbe +4b12e7c6dc5be46e3e77ae0140854083 +772fa3cf125fd9a06692ed4b1232f2ce +73a1158d9574fbcbbe14c9494609dc87 +b83701f526528fa1b2e12573ba67ed2d +b6d1dc0c4f5c655c7cf62c4524579960 +82d0483dc15f4148b4e7d0ca96994ed8 +b8e8aa41bc1424fec2b073926d3b8788 +683fa0e0cd39a76ed16da3aed078ae0e +8efed4ee605d391d2003d00c27f22fef +d41d8cd98f00b204e9800998ecf8427e +d41d8cd98f00b204e9800998ecf8427e +1656687c4437593456c033073531bbbf +5dcddebbc884a14ebcbc9a247b64de99 +14c97ac709e07d5d683edc43211808e9 +9f7c4e356a8832bb14d85334d31ddd9c 
+c85cb60aabd296b9f539f3757f5b33c5 +8a25f8f5f7a8df0f8f587abca022e69e +99ad20e06a51da8baea1a8254ab3d47a +da19ec495ae72a5e41e4c764799fb0b5 +208f5aea407ee530c415a458a96631a0 +45dd7a394cf7c71d4286cd3f21c6aea4 +5c831698179a60e01d767c01f42ec08d +ea835d079292301163f27f8edca255a4 +889f8f6e35e7e733d38a1e4e72d5425d +4bf908ca00e0652a273933618a548289 +0d162774d98affa01a9fdda9eb6828a1 +7edcc41c66f064c836955c3a2c00b15d +e6f3dcfd9b65f348a6aa74f8ea6b9a96 +050ab268ca51fb3f60bb210823d84425 +49009c0da3bf1399cbb2313f245b6ea3 +61c705f63bb5e4f85845622cc7776360 +8865734668b23ac70ab6d093b25e1e1c +2ffe4102e1e0baaebfa9ad0584e5f7dd +108bb1370241bad0b2e9a23a8bc17cd0 +685328b925ffbfcb81f9f3a67e57b0de +24de0ae6d647f6bf8cb645ca212effa5 +1f8d26a8a42f08a0e6032198ff08e605 +596bf3f8a56b10af886bec38728d13fa +119b3fb61aed601c6444ba2357d6f2a8 +55c35844d97891860321eeea3ec0e562 +f379728e5b5215f4db57bed021e67df6 +c5c95e8678c50b68d022ac259d3fdf5e +c5b3672bc14557bf02a0a487ee336603 +65800e90eed41800225c614581e5ca85 +9c4756a2b881070db0f4fcafc299f5ab +0d0c4af54f179eadfa902d887336762c +74a816521f760202a129e29a37a38214 +19584b0e3dc86a9f2df7f504dc60398b +235afcb1239513108854ba48ef9c27fc +b47cd2182eefb63fb033d71a7a7bff45 +37b1509c3b35e52559baddfda1a673f3 +f2c556afa6043bcaf2a2999c7f34a781 +ed9e5ee7cd88994b7400ddcb94c31c95 +3912738dd201bc218ae0e0d70238e16e +b3c85adaffa5a0537b36dfb39f1ac6e9 +5c747da80e1bb90791bdab861f653fc4 +325ffb842a94eef34b60fc0947a26f65 +4314f354247c07abe938b1ad5a8e1429 +078e0a25897d6286a5ef6f20c7f7ead0 +59d5e5dd0cc57a4af5be09056428ccbb +8c2a304d0d1d7744f22509af94a6d3f2 +5b5938f2bb201d01d2fef0196dc33199 +40eb13411370f186ca615f62bc5b079e +8265ad1769a9a735a20673104751c68e +d0f0d5991e1b2e5e0e04935c7f057d27 +ab08a0e0b82d192c4a6f9a68d90b1ae5 +11fb792590bc8d80d81dbcbf880152ac +9fd9af6c9dfc95c018e4371aee715916 +f697157f9118aa085540c87fe23b4e21 +fe2dce33c482742cae202077189f49ef +1645605cb97bb79cee970158f4cdb21e +e4e90ee4eca5b1d4994d19b180f11e8c +577d89861ef7300e34b2d68fd34ca026 +a795e460155958525dc017e8703268d1 +d429885c3de34a795e44b3c32ccdb335 +41bcd8c0b2cb44bd9efe99313f55c940 +b1f27754047c00d57fcb80e45d29c8c4 +df8a5f3f302e4b73e24ba68c031593dd +304864d48e5e526fe3f04a7d501ad8cd +9b3527f2b3bdbe1ecd5a0a213ad2563b +f661f196ddc9c405c237cf8c2f61be3b +6c479e1509613826555cd81399d99091 +2e04bea9a5b2abc184a41c4ef9920ce0 +c1061783b7508380171f74c898f020a4 +5a145907193cfb11666e21a54607b328 +f3398d1d993ea2a1b11aba068f6ddfab +7e2c3fca910d006f913825d62526f9d7 +437f8dcfbf960f9bd8321b60cadb5396 +ed739c79ed326b2e07c6ad044b24de1b +a6c8d3d00d9580b98da17c38d1f45be8 +0fa6698411c7c0e1c727d72b313d1ee9 +3a5889ed99ad270ba5938a8e76897551 +6530658bd6552987ab90fd114598bb6a +5aaa8f1ab7d0c70f1223894973bb3226 +98c7f4bbd4b55e0422756a476c1b5c5a +f922a895e8a04486dac904017dfe04df +d6cc95b3c71e9836a838755a16e04c61 +dab6f3c2572687eff15183391418785f +37bd17b3664e4c3d588ac47bdc0d21aa +c5e01ea41fcbd51c54d7052d13ef9b3a +69d00d5844a1c77ac7e1f0432b3e37bd +6d476fa4d362247e4938c59dd5bad082 +a3135fee1f73f93c67927023aa99a3ad +75dbe378c0fa24f070308880f69b24bb +c7b4cbf7a73e567b2bf5c5ce4bc185ad +d99c1ca52911dcbe27c997e61506d827 +7a5a7c47e317e064a82949e17e123495 +224d8c27cfef4b3a915fa38451810b91 +7cd8f467f3ea83f98fa4b6d3ca6ac12d +61168adbf0e13c10c5fa3501dc35617e +52621939587a34b5917be73ec617e306 +a547fd7f9522922942c2e623a98a47a0 +429b8ac61536cfb7d72b739d5b86f2d0 +9a8830915741c273302dfa7dd2d97236 +19ee36db67eefb1e769029f69b715ff3 +101b24e90cb53b64ecd89ec3f1053df6 +f1e0d51e7d4ef0ab91ae5ea5acda3e01 +169003f67baac1d21ed930c6707889f4 +79cefe2a92811ae00b5f0c3d261a9e8c 
+48801333d1092034e94ae900cec18d38 +fad8252008f2783be0466a50fe874eb2 +3ee8f8f189032fa8ae357428caccaa36 +d5cfa9ce554e5dceaaaf98108f3f85a2 +66e82525b91fc858345305b5c401f778 +d71a8417d2703380683c92dd1c1a32d6 +24329a9c2bfbdfafee3c94fe4e6a187f +3aa729778cc13b0bc831ad078c906141 +d60166745714faef18c5fa8956dcd713 +8de97b5a3e37af4ddfd746549b23240f +ccacd02f43e9eb19e0fcc040cf840058 +c7b5d4bde58c0e5a70f86915e083d97d +10a2183877a7db7f2a26df478ad7c558 +972eea24efcc04a1959afc014f10495f +3237a8844f394e80646b93644a61effa +fdab3ccb7d37a3b1bc4b2ad2a7490227 +f0d585d41fff837f9fca4ed714af95b0 +b13005308300e9a525383bba1cab3e89 +4691c26943b91dacbecc251b88f97e89 +3475f84eb4965fee386570414da29f1f +ca37b3f84d6951483c591c265108a019 +364929ed273e99a3fcc844620986f032 +3e21f82da173d4ef1612dea5a2d73169 +a78f570462889ba774dc61c9390ac211 +c3637cfae8fd560f528d577f134fdf70 +537d80467188de1d1b741f85456904b2 +0f2e22278353f627543edad48e63f33c +a945139b9f3400da3c35bc8fd343d637 +d3a7698414bef92525256f53c8db2b4b +5031d6082ee65be6aae3053deaaa25ad +7a6986269e298b6fe58f9ab16077edee +3a88aa9e0bfb96e49e6ecd327894f93d +d76577ec1b544a160bf507800d9d6ff9 +d713fde2dc3d018fb291ef15332b1a88 +6544ce09551d2e5372d53999db27f50b +1d1b2a981e2a73cb820463b5ce3cc298 +1e1c5b548c60a8fdc018039d0f0467f8 +650db430ac3cc142f523e10334ef65ae +961150d98ac4301a446205f1ed276a63 +8a153f7170a6817cb98023996e2ab86c +bf37a363e683fa1cea026b28fc64c744 +bc52445312189d4001c40e91246027c5 +086c6f846812889e2bee1a7fa1c069e8 +b06011c1eb62f9cc18a7d91b34b2fb4d +6bd71ed86e183f35759a6e96b2ec8d4f +cce81ba2bd521e6d6e2d0daae18a21ea +f1febc045e73de3ff483563d89eab567 +4c4114f1735a8b156c3598bee80e1238 +ccf4f7368f75b51b6e34ed6fabb682ff +9b40ce0362a1f8359e13edecd82fc913 +589a16adbadd7c3f015458f79b8b8077 +0ebac7a1a7b7a1a1b8af9832d622e2ce +7488077fa3ff2d53b964515249f79a50 +1e1bf8fdf4656335720472ff3aaf033f +5dd4b41ffc03f3d8bc48bb170f7d3b9b +86b42717141338f9695578bb19e8410f +ca0dbe14205fa1e768c3e703e53553d1 +a0efef8721ee1a96cf28697d41c1790c +e143937c6b34430ea1a3ffa575982412 +ddc7af5ee483c3e0fa26a521d60cc5a0 +1f91770d86ac2a0f93a497c9fedf2d45 +803f94e5da482b49a87ded95de229927 +972b0a337d7ee4190ccd472fa181f942 +2c9c5302b3d339660a097d542ae04350 +084c45bb6408771b0d6a8f2242727d65 +f3a70e0be9b34bf4eb1bf537f87de03e +7a910a5f09c7e57769af0a6940b0ec5d +819bf4b001bf2f9f50a36bc6146a89fe +7eecdc64d906232cbfdb86fad1bda0cd +01e8100a9788b6e63c6763614f13fca0 +1612473e58383d4ab3b540c17a8fedad +c0f6d246ee8d3323cb2d55c558370bf9 +70ed3fabb3c8687b8265f77deeeef87c +58b8e51eb8f6c210359aa7adcede0c76 +1d4ed329896a36bb2ae6db89086abd58 +b3d3c805a1c8f70b6a76b663e15bb87b +cde666de7a0669904e5ff5efa8f99c73 +c1dd186f08607d74c57a1dd20a73e220 +caaf1c2b6e229a95ff2a81e7f330159c +e98be9d670d1ce97f9d0d059ba6cb80f +06e0ec29b19e06e7982ab3350d287244 +03e1557ae562a238d84553ae17836a53 +9cbe12826c711424e03f043640ba133d +48504748c7d743fa2126bcdac473e0da +2478bb339be497ac0d66b3a364dee407 +9f8f031dc97428621722de7354be80b4 +2e913550bfaa2c7af3d3b28364f3c20c +86de923679b339769f375bb90b4ab256 +ac7d6fa5b98aa095679efdb1de9e7d63 +2ec08eb71bd845a1fe89d29dfe91133c +29ae3733b8f7dc6bf3931978d3cecf34 +25e4aca18f8ea8ebe5c143fad83bfb42 +1ac1c1024652f320f3d3d8f49e15733f +54b0d4ed51d489ceb3419729d81538de +20c2429fcfca358a0d104fbf703ccdfb +3c450097286f9deea35a172fb1f086c5 +394f30d79815c4b4cb133f112320fc22 +789644faa3b5595a436c423d351bf5b0 +e4f68ee9c9ef9b3d7d01da6577d50df9 +f5364e11f786f33ca2482c335f1b60c2 +d700f412ef2727890bf70d92a562e066 +0427870bb7b7ebef86f22b7c17e356af +14c34cbc600cffaef158afeb87eab09a +a59b6dad7a299ab3f5dbf6783ab73dcf 
+69b1793c1649baf1f922fe7c42d3a18d +2e4f0f95278368872d07c27765050e43 +6f6f37634f2c5b599b11375d0e80d00b +c8604472ce9e82d81e115578b932f0b6 +1e9347af7fa6736ec12cb27ef17bcfff +3ebf5db5096723fb893f71b6436903c8 +1755dfb8fd1f329064222327f958e24b +8c0b9e17d07785e7f0ab5f720645130a +1da9615d6a4613cf7e8b7e3a74fe6bfc +a07496a83383695b21bf420f722306d3 +ce878a6af85dc8f2ae0c84a94facdfd4 +c5b807e0f5b1a5c6d618457b1d2086f9 +233e758e660a04bc2fb6081cbc520625 +e967f87c95b1c927b4132cbe28913830 +27414ab13d9881c774777ac178dea897 +c7cb467e32732f846f3db3a6c77c9f36 +13dad90dacba9f1e85f6765dc80c82a0 +42b0d6f23b60c779484d7c1090deea0c +0c7e191c68473070bb40acd09aa4a6d6 +11e99362c24bc62ea60411a00f033747 +5e72e545d6bf7126a78c312a576ae166 +9f52505bcf750a2c39c73415bcff8740 +5e15155d77280da83b3be5d276ec82ab +7c2ffbcad2404be6b561f3993359c79a +0e7b691ed0a86f86f142b5feb78f7c07 +c354afdafd7df91caa72a19fba302be8 +364de4e1153bf7e944377c00f89afc3f +483df9a21455c44651500fd239949e82 +c67f1af58b2bf55af5c16de13f86bee9 +5c60cce185c97f564e0ece1d03bc553f +fe4d770a3c57ec397ed8d5f29b40e6c2 +0de59cd550b3a8cbfbf3e6dc063cb81d +902f43b865a0e5f1d41e84211bf22935 +17243dbe6946c349ba8dd168905db426 +b4c341f4be352e631719c2024027ba9c +4e28fe19f404dc7ec817094f1519e080 +972861a3a214130546e24d8e7c696706 +9ec982d9afe6552296c041a9ba166647 +841ac1c12b30f47072a360b0a6d86a62 +eac5ad1c62f0eb79982c0524992930dc +fd1aef8f82ce816916a23dd4946cee6a +9d9685a4a2b6c9d33949ff9c78228bd5 +fa26ab86001c598421f2b6d003fa6023 +6831638d3bcdac4cdefe95d59c109a92 +23b4b25734f34ef1f8a6c2f00d74ea89 +bd28fa1f63265eeace6ea4b380630f36 +26b34ce567624804085b3e7f3707e259 +7ea5913a2e39fcb0c63f315b2a79c51b +3b7c8a062b8f083a1d508e7a49740979 +d6e88f464701e068c23487172cc50968 +1f855a06220c7ff24869cacacf2eb9b8 +72ca69ed41b8eba3fe0d7281b517bea3 +42f90a1d1a5f6ef566548201d2e1af3d +14b436af9ffac81896bae99e64a13896 +9e4715ba2f6dd1de4489071ac490a0ec +7bda3ea34adc5fe08190d64fa5582065 +2a8406d6f05f9ac0f668c1765aa55fde +35e6c1dab568c16f041ee21013a61316 +40654738907127554ffed84429afada1 +22305248a0a55423f1aba8d5b65b6e8c +806489b41d49ef2c318e4c1be8509ba3 +a761bed95e194ad3beb7e32eaba6d78a +d15aecf55343667ff179d3ec3948ec6e +34e9907b5f602661bb0b7ad5574240e2 +bdfddddcddc398a5d4257c9ee7458610 +1ba8867657a25275f20dd01e3e525b7d +318213048b2d55f2c1e20984ac11596f +687f31b28da702b8b1c61c01c43bd0b9 +370e45c031a5b9055e454ec642333394 +120ded38b14add7aae85add6ae9c3c61 +27d7a144af9320bec669e8010fe996e7 +b761781ba87e4644fc838cc8282cd869 +155240e22e5d8204c28bb4873f6b7dd4 +3c38c658ae93ed5526214b426bf86d80 +7272e69ff42aecb1ea18a4a83d330227 +8a6e741644cedc254326e1f55555250f +2834d240c44c2653e401334971c1d6fc +a8b2c1fe0ae11e5fa54c3b4dea349439 +ed63efb88d224cd11ea4fcd046722b1c +74b3f173209b641533baa64c1fae41b4 +ef87a13f960d9a45e42afac57409a718 +077076c3dcd943b6c3984012f6b263a6 +a79992de54c09c3255141d05689c911a +0c637b363fede615a825df01725d7b11 +099861a50b3f43d493ce94ddc0ebba69 +34cbc35bb7ddf3bc8e73acdf3cee54d3 +c2b406227a5d030b985160deff67ace6 +a690640df03de4ba427e9e0536cdedb7 +4221d0df6c780646536d435a40bb4338 +e888ef4673e7887336d8582a12af34d6 +99fa831b3ecdd5e6770c08a1ba47cf7a +c77978342f73634709419139d9ba1238 +432bdea62dda07fc73ccf10ccc5bbffd +7b5e25aa7990951eac6cf537fb36c741 +534753ed605faa374463db2d847b2d65 +d3db2f6b18013f98937663ce5544f2c9 +6562a5bfd16152ac34a8f4af4b3f4649 +85a4a2feb016fb489cf17694791b1b86 +6fdfd72db37e042de446f3b5d6679e56 +2b16f8702ed0b7678c54a52498778773 +42aaf6b14980e691e6b459200d5e5d01 +7b4fbbae61b97fcdc3c04a51ebc1610a +4af7a208a0d684d5b18b5b1f50349354 +8de339486049acf1fbe1a32139fd82ac 
+bbce5dd7cc922abc29182f35a8630ee9 +a71e245e4a16f2395c8d5a368e37ed43 +95d48a671adb5a20d66c015f60194213 +75aa19768fdd9515ce74c155c8cae058 +19a10360b6a5cffe02e83c65ab41ad90 +ebb420449c8b74a8a22bba380e41dc85 +268a5e49b330b778d5b5a273686d10da +06772bf881bdf34a0f00dc911254c3c6 +e8aa0470ff132780e5fc745d04adb093 +09a279e936bb169a7533730861246dc8 +2692f05a28b161434a0614713d715da8 +07da48d9e0664ee830e78c2c262f7656 +2b87f4ffe30a6c10a1d78b4d57fb9a9d +e6de3d3505227e67247eb2bd17b6159e +472e3e75059d2b347c6c97e86d675734 +de0bae2f01b5c2d981bdd6bc255428d2 +bd2d5117dd524d021871712d59efc8ec +c73c7d0facc265671f3177aa8b849364 +c62102d3b7bcc510fcf3368af83f14de +ae4bee0defe5373385bfacf368b132b7 +3d9599934fab0777bc65a78d40b0e0b4 +6b7dac3acd795a47effcec5f40438056 +7e396047931eca9a9c3d1f466f9e1906 +f9c5b22c5d93dc02a27df6de36aebbea +4728dd1827de424ebe4dceb82e45ce5f +41baecbcb0344c8b48922a6bcb5e7792 +56b25f2b825eb78eb26c6feb942d9457 +99ac46bd4ab81b79e77d88c3f44292cc +92639f13cb9a59f8930069bb95ba4fa6 +379230687fa3a69aa9375df34b92da6a +2ab569a59df3756957cad464d30f8d4e +614ba62577b2af15870c8c4070c11a4e +834197148a917de4c9e309dbc5cc97d9 +ec4854154c13d48afa9936aaf0beb0d4 +63c718ecafb1c5c35a82612382c64a0f +ef7fb4dd08836b74aa2e5a5cf8a2810b +e2592696987e0362655978a9b2d5fe4e +62bd34109b6a8a8e3a6c787deb01704c +1e1366333a37701636db1fe4cb45cfff +1d5d754d9fe5535de7e96709ea183e06 +4d733f0ecac5b74ab6cd5c994f385671 +6b93d706a7fefd4bbf0fd07421484c9c +249560e7fb457fe3f085c73e1c5c9a4b +86e8b48dcc818224cd9842f1ad3f8158 +f8902ada35a3053288c713bbd4df006a +76fdf3beaa111635fd309e8072840109 +de73616c11785d5cb107ae48949459a3 +14082ab58d2adf306b33e7c3b2cd70fa +72e947cd77fbf2d776d69e044c5bbac5 +6cfd43896cba01bc41428411e5f7a1f8 +92a9e2ae6c80cb2fdbf4587a1c986344 +5aef133650db2e26d3f75cedf2c03f3d +a8256a2912b4a7921ea80ba6741f2af2 +d41d8cd98f00b204e9800998ecf8427e +09e3c54c6cd483a3a58d51f4207cbc08 +0f9d831efc231e0f41d53f008bcd06a3 +6cdeb602ddaaa81ffd79b7c0639631a9 +8255d1b1c4b89f7aefd08605f477744b +4f4bb5c3da0eef842bd88d9ccd9d5780 +1d6d31468115c6f06492beff8e6b6c79 +7e2247818a0cd30ce6daf50c7b591880 +522b882a1010893f2f13b0426b0d6a4e +929eca3e780b2e1d6a5190e525f60893 +134fc82fa131a5e9253f2b5ec6f3bde4 +9ac9186ea7ff7272defe13aa15dae56b +850c3e60193ba7a4b90d1dca958ed494 +219e4b3da878121a4c5056334eb69e13 +38f8cb72863e8f1fd3d596f84f1847ce +81d6004e2fdd78dcf4db37c587189cea +64a3fd87a18f72b0170f71bacb07d5b5 +3b85e763eb59bf8840c54f3af85088db +eb5313fd3f7e30abec28b9ae907ebb97 +aef157efa7e3fc4b304dba2f5f906c0b +95c0e94cc4bc8c11048c06f7f1720876 +ebf15dc4be5f6d85f6a2c9bfe4144adf +ae6ee0d386dc7987545641db76cbce41 +056774a414940edfa52ffe093b2452e0 +3d7825369249882b5fe04d399b3c9f85 +4b817950c0e2a3dfd5429ab019ad90d1 +d9249e93c464d90298690dc895712d95 +d154904cfcd986037343b8fa87fe4b8b +e816ea630855c9c3ba60444a75014817 +8bfe2b01c63db278ccc805d93ca5884e +70a66d8ad058bd04898b7e18e81a67cf +efdafe68a42a595f56cef82f925424fb +8d0aec591bfee33c33851f35aa61f474 +9cfbe436a56adc5188b6422a421b868f +25ae93935c33e1d9852096d2cffaedc5 +76b855c96c1f58d0b564db1dc47fe17e +e3e18fcf49743ef15bd1a46c6f188d66 +fa4cdc589136175ab3edbb109e92dca0 +7f9f16c222386b814c1bf90a9d9b32cb +06d6d2efbeecb37c22054f4b8235150a +83fa3674b8189586691ded279ae46339 +8a2948d3acfe101143bb3a46b2d357b9 +10fe7f3673192a0dff622176db46c72c +f04919eacbd9211df7c190318f31797c +88f41f9c7af403e2e0ff3a0a96aee0c7 +3530dd97fce74f75f34cb7b94d121f95 +ca85087354021345e19d816df23b05d7 +564d0c3c6fc21d91a8fe59fff2ab6f0c +a67e249ae0fb34baff1be34e6ce189d9 +8613517e4cfb20238115e64df93c479a +82d1ba3d875b6c6c982e50edc2f8d8e8 
+c8753fb619da1c35b1c1597bfc3cd7bb +81d6afb8014ed129b0683d2cba4ef9bf +64998c2ca41f08c28726b17698d14464 +205e9c24b645b01be40252d8e5de7a93 +25b823904d953aab26c9136a41713461 +177c85a7defdcb46a7475198f650ab78 +074373b3bcf0f794153c5938e5f305f3 +d466866e4d5f9328b2ea4133f0cf2d2d +1235454ab531f5c5c735eee462bda4dc +213e90c2bee5229c50c7426c879b185e +d43de7f3505023c27888a17958669b4d +4418df16b216b091ac9cca1b8d9de074 +a6cab0b6be5be45f52e330bde9cbf86e +80f4812c18b6344248fc32b5815060bf +d0785af48e469685db5a4185f22ac1c7 +7caeeeabf9de39e148158d4b5f6bf9b6 +913114033995fa8d94b910623242b7fa +d1c1ef84919ba7e3e374e79e30093df9 +44bf5692aff5f47b8cf9efff9d7dc0a2 +9232d3d76468d524d982884e22d46451 +9b1af1afebb1fb1b8837a84df4d1b6b1 +81ffa7287722527ef6451f470571e5c5 +8308edf485e99d1c65e83b61b2b400db +4488feb6b405a42a149ac3b146e5958b +8d84487ff114bde3883c30feae6b9454 +b4c40b853d02b15a90a2224ee5eea1d1 +b102fa7f3888406e4ef6cbf5c0e3482f +27dec5fd24a7cd3232d69c6b3a8a88d1 +d41d8cd98f00b204e9800998ecf8427e +bc7a9522574d067397af4858bad91eb3 +284a7a898f5dc30082ad764a91a43ea1 +6fc5fd6d0cf8867c746d40fb2fd2b42b +a6019132fcf6b26e4daed65c677cdc16 +94121f56d8cdbdc413ddc0701c6fab77 +2b7d3a3417a70de8306567914bd237de +b2681f633fa93a4d91d8b9ef9d3a2d6d +2d0e0cb25fc7bc9cbf67058ec6479842 +110a2761d94ca8c1cb05bf255ff3786c +1ae95fd1d7e422e4fe179a1cffba1f09 +b4e9c9710c581f91135313117adcde55 +486d90686c83fcd2ef73869315c6f42e +c585fc9386bfb2a04d01397f79391e62 +8c7e6c75c0ddde9eee2842933d313035 +dc9a6ad326c75d4dbcb4badd146d5c01 +fc62a072cd89eb0372ac5d9d0ea76c22 +12f6955c35f432ae25009de09371c489 +63017b37a15cfd35b5fa04190031487d +0b4179f1cc0256cafe8ac66b6a3c9212 +3f6556cf032a925826710e241926d7fd +8c6f3e966d7684f437dde4161214b792 +1ddced66bc2f3066132a28965b03103c +d0d84f41bc78e025b45f851eafc2dbdf +c446b5477156ab804c56653e14a7c3c7 +383e6bbe78dd2b188441ffa1acd11006 +e77e45f2b9bb09ba8652dfc64a9acb89 +569fbfc2194b201e54266f51a189f5fc +964cda83e3f6678ec81ee445dde25d9e +185c2fcb5c25bdaba12555a54edf7b36 +770bddcd663485de9594ff1a653da863 +5c18adf7798c8401f775ef3cfdd4c4e6 +70164b44808623ce3cc1188ae8118859 +8e6e75fdd868e2ab62942021911dcb2c +c39bb415e50b7e9a436b83a456165d04 +053fd7c8789afab9ef979fe1723c206d +c80174caff05b31ba18e98632bf7e03f +298970ddb969f875a63508253c96d7cd +99b31865b966782d1e50202d8cec11fd +3de6276b0223fc01c593f1ae19c8fd31 +f3a67d9ccf72f6761c223b01ac185109 +7aac8bad71c641761e5c59fee53a2be7 +0a77231faf19a2da0b7763716f77f50f +1770c0c31dfc509450cb0a5c037b18df +e2826539a80779fa5358394ee603d73c +b5b61565a37bd2a94cf2fd81bf2de215 +e735546e97a22b90178bcd8f1d555400 +355ce4a5adf854d5ae6fb266b4194d46 +d94882f02f1055a59dff4212d494f055 +79986308452e0ae3a781cf46d812912f +2fb19e649416295d0edd9cf0195d9524 +8de4b9771c470967b45d0fc402e5fe01 +9d972f67fdf9317ff38cc85ef8a5719e +28ee5339f1f5ba56a02d986ce9619f34 +abfdfe2589d1027473eb9438deac16c8 +03510eaa057c302ce9a605a069b3411a +92ade3233b70e2485f258716db1331e6 +cf2e830f7c2c6d011caa0ca225f32c1c +3e655a0b33e4be2ccdb40efa4fc15092 +e587a6f67db53493e81422af10ef4b03 +72a241a460b794912ba9b5ad44a33318 +3b1c188de9a104afba5261f4cb303f1e +be72422804df3da9091d020f2f2e2c44 +c0cd9d78ea51574e286e14876a93c0be +eb0fc8ef454c2f90bab885564181e54e +7b660ed115a1bc33fb16a664852ba4ff +b9d3b4671d795f84d4fb22d29fa57403 +6d67865dbff20793427acccd4f307fb4 +0ec919ab4ef416b255f5778db983b61a +d558951ca3b38195a9cd56b5269a6e2e +3b2fcf51575559f03cb2cb7673d80252 +b7342e925c0726e883718e53943d569d +11411db8f42a339b83efdbc683ad6740 +888ee0a8ca9d978665e00e41b6c93c69 +1a79786400e4612837784e504c673bd3 +f346d14da95eb76e588483abda75f73a 
+47d59eb3d694f78161de7ed1e08eec49 +6868862e7883476f75077090c4f1630b +1957a1ee117fe120e5eec2d537f77629 +b63d8b597e0ffcaa6d57c66c7b7b011d +d06f7a857f881a72e938ede632f967a6 +1ca1d0a273188c1bc3af651dd0d44c83 +5e66f1a7cf6e833dc6131573257fead0 +1f5055dabdca23e5d96fd8e08cda7971 +a1740663d056c6f9f2e8371bd5b29ee8 +99916d4ed168f8bb0b31afe6c88ba8ed +7a79989e9b9a227c3780cc0ef0a220c6 +6db57e4d27217c27da7f0575d40b609d +93485f4cc6c4331bfd8c25ea4c24138d +37709e93ef66abaddf5cdc86d5d5775b +a6afe96162a6f5e3ec1778bc6fa0c412 +a237475cc579739c8cedbeb2c910ba96 +0b3c4251d03cb65b9ab9d41e0964b23e +4602d92519ef8d6f11d0e0339334336d +b74deeb22a65317feffe8bf653e81f13 +d0a82462d6c1384222a56c9c63d19103 +dc136ea97e39227b405ea53c06806004 +565658af71c61b2562d523bd3d7d6f96 +b0ce4747112658cfe2cc4c472e0bde47 +342f7a77fce6b5c7542c2d68f02513c9 +b0cef7288f0b9e1e05ce7cd579cec0ba +2c02f0b930b5cd479a9a308dfb5a1be5 +d4aa4491f7e3d148717a4323d4c5fba3 +9469ce8347179294eeeecb9b6f485e1c +47bd4708605a2b919f9ecd423e25a136 +3637abb7f3aaf21395a3f8c184435e92 +f51efa0c010cf6f5b8fd8ecf16f9e344 +b30ae5bd89b06644b0c7606b121e339c +df7f4d1126002f39fd0ce48d563e5f2c +7dffec1bb1d2bbb003a2acb9839d8a89 +b1ee4ef2efc837af2615f7503784c14a +5962eb8dd1ca83c5301c8d7b66be904b +62faeddd99ef74918b2d2e65975e7c63 +f45c8511ee890d6973031dcab7a4d912 +b0f8d0c83908bc5717b340d43bf045d0 +394a289ea4373a6eebe10f7f2e655215 +328f5a3eb8e8e727d5dea80a3fa8f15c +7ae8034c6ba8714e89b170571184fd8f +f7f70061bc31474116e3305c10bcf593 +1df1a21eb706d2ca64acb478df9b364c +fd6a32a7fd2e9dc39e9791c29d2ed5f5 +ca47e02144e8dbf0229b2580d340ec56 +00eaf344b983d8b429631056e250916e +12fe4cc34cdc669638f9066537759584 +2cc7e395588c6bbd250f3542af3f13ac +948b1b20e39b5cab38a79d2a20ff9704 +8ad5db31ad174d17ce59d90fbb0d2e94 +80de7119e1cede96b9ceb629396d902f +f338217b4422cefcb4b285d3e74632e5 +f65b7320946c510e34da62c534421fe0 +28a696ed97ed191ae2406ed857fd1842 +402c23f06f01d154ef11084a572933d1 +e70e72df55c9ccb1957e73bd6abe6c9a +884c04342b5165ab369ab02edcf2629c +0db7ada6afbd0c73d730c3ba69c19122 +4e7ffb87c240f71593d196a4a4b51737 +98283fa4552b1ae9ecd028df35867fa2 +106d914ef402c31beb35ce7a91fea8f1 +023f22b402870f4fd751436b4f2c797d +f38fb5f4b669851641f1ef5c8770009d +c147bf62a80e624ae9c387d718e7a00a +acbe8a705cff212a73b99fa3ccae949c +017f6394170ca7b23098f6fec953e73f +06789bec116073e7dd120d887c597455 +2240266e2cf441940888514697f92ee7 +e37f9ba99b5f0d99c3ad514dafbba816 +fb99ad6ff8bbc271c7f396f7539ee0ab +21e6c2b7e80d1427b019fd4732d4d138 +22496e3c9d18b3c58ac52e58f23be3c5 +0a096ef72b51d94a45e88b2311290c96 +0e22749453d7e963859c670fe4e49b9d +c78092f2b45b1de3611fa389f0bdae28 +a956aa9b76d844cded5cd7abd0cc9fb9 +4b4a131d9be5e6ea253d346a00eaccde +f1f184b2659921d4fbad562e4dd6e2b9 +42c2cabbe4b029339db30d202c796e8e +00e6c862c03451b34acbef24e33e8715 +bec9f4ae6f6d7079e28c0195e5793afe +397f219d572f03eb26b88cf062025114 +f1e288d3a60185e59e3ae78198c0c290 +c2851317369046f8079c849d91d447f5 +cf1cced88f51dc197dce7b01a94dcd2c +82956e6ecb9050a0a70eae6ff7111a1b +7521317ac96a8f37d324c5892aade397 +a78dc68ed077be06a4ffb01980a1fbb9 +2b66640d0435b064b1f837226b069c7b +de289e72c19e24012b43bedd43b12e99 +33b5809ae107e90ebd3f171fa0036afa +5b9770aa9be28a691593e42821458630 +f96d1553400bead3bf3e7bb73d8f72ef +e94c2acb56a5259433455b014ab7add0 +65029ebab5d954c9e10f52f30bc35748 +8fc815c34b60270d8c2995da18ba1949 +d4d662034bf20aa65d96c3ebeda6c5c1 +a31ab248ed5b15c324c768155ddedd74 +6924f51fd8ca2078da21e16479ff94a3 +04d29c09bef73318158abf5a65c26cec +be0286048d9667289c16250ddb55ac45 +e46efcceafc0a8724498a6117dc9d060 +7788dd19281f4e39beca0f3220d4d3b7 
+d5c6290152a322aec2baab9a3ea9325c +60e20a6e503a6e0c721d374a8e67d408 +3da48b2c1ad45ed9a0e94ed2f2b8cf98 +a60b5c269a8150515b15b54d1248e541 +f22d2a2dc7b48c5345e7a766fd8dbd6d +d3d0179a0f2e82e898620671e601c5b8 +b12103a1bee37efe908438a757b9d261 +1920d35b072cacdc8ccd0c4c37cdf404 +07ad9d9a7625577ebf1b0e5e7f17cdbb +271e1679794f1624c1bf674bbd5ded36 +079d7ad4074201b7ccc5d6364561db37 +333a387d13cf17fa2d3b3b4e4485234a +499d20ad2a17c96a3829bbd45fa44ef7 +91d125baed01b6ddb0f9c501a182c760 +327d8e9c5f49b343bc5f8c9d4f7425a3 +fd7ef3cb1dfbca956e199b4605c4fa8d +b7ecc052b13c4f269bee86c37f707ec9 +a48e2ffb4b2c866a70436acb3421690e +def23ed8eada869c4f9e0cd934c29864 +4cd18d5b475fad1140baab39ba101d29 +5d0aff977363500ebc6dfaf272f35cc3 +0268de9e4aaf8a47f97579729cb062e8 +4df34f718b5f102f2e12b54f4584c457 +6c78252079ef8f74d4a1ee052f102891 +571e6d0595ec149fece9811f1b70b9ec +7f51187a0d2187be3033e507d2355f2b +6438d5f606133fad099ffa162e0753fc +982de025317d3010ace60525e637ad1c +e2bf31c31e00fc3a903a91079af74587 +7d34259e19414d199eda7355c7768551 +7cef70acafe5066aa8a7205f071e3067 +c7d5c6836293e3a6e6c105a8c142ba8c +82e3f54ff060f2905a34d19d4b5d08d2 +016ae999477467de6443a3aeec3e034c +a31aaf55ed0ab75037fcd8344b5ed0db +95e51ad0563edb9d362677c46070beae +54ec2a9f0082c9e40fee1fdf9a56ede1 +b5d6995e643c656cb51c53ab076b8065 +f5a368d4d65a4953e88cde759c50ca78 +ba0e978e853f42b6afdd1286367306e5 +64543cbe751d4e0541932d5224c2fcf4 +8ee1f5628c34895ef6ce845a076b79f4 +6892c0bff7856c79fd2b9ae7a46f9bca +26c3e5cfe8d1ea6bfc33fe9633bc087c +ab40a76e96f8606426731451234351b1 +7b9792d3ad11d9d18837917d7d34a70b +77e5750a43036465221918c4e722f897 +b14760266f4bc7fe5732d4a223f7b137 +bcf47a5eb80e0b88eb2337e72114d7dc +a2d569d5ef6e7dea7f4014d380393192 +ce4628f0f1368c8ff895b77ab2ce2863 +0a698e3371758a89756d2d21e460a85c +e3fe476f75a197fd85ec93f6f3b2f4c0 +2590cb241e7d4eab080116d58e4a1b8e +94ef997830ccea7f24c39e9ca8020c51 +0b7efe93556db0e42a3d0655a7d4f310 +db5f792e19238565348b0f02a780911c +357ce0b4bb439c8583071fed3fabf860 +0e85a890c02d792468dbcf61c11945b2 +cf67f4474cbee464c8737f63a7e2dc49 +be6f44a8159a5ec3a5dd9fe05fd92407 +71247dd6455f38911ee89efba7eb9980 +38905f6c91c5cc9ddeac07f7d9e0243f +62d50232492b9a14ac341f9e3db13405 +3b679e35dbaa8296c618dd3aa1034d86 +e177e55d743ac4f601d75cec8dfe06e0 +65142ed22b65b98aad3636dd7ef709bf +cc0148d774558b390287471e8e9e2ffa +f54b78364a49ac4251f62700c4b0e07c +a371ae12b26c5013c9f565f7bf2c881f +ae946d68b6a473a8bd3db6d9ce67e919 +9bc9be4ccc5b056c638612a872927db1 +db8ae0b0c06761d990edf0f2a5021de9 +a2cdc16e41130d09705d7b200adcf506 +76d692c19c60779701e8601cab811540 +904a7dca6a2bcc8523dc2d3b686013d1 +93c80521572e84eda679c9ec556ef0f2 +d8a9f31e6015f3a9ce98961e497df36c +851ab2d3acae6327588a677d14fa6366 +71e3599fec4e1c32ab1001f11818f885 +9468ee90d23f471f01f39449c4fba2ff +137abf7b279b333cdb8f0ccad9a8b7d6 +2f16feddbf30dd652bf9c3c0790fec75 +8a87a75345a51ac6eb8437b793c980f3 +2a1371bd3e53f1cebcf17095c779af44 +e52e8effebe6224e9bbd15bda107a314 +6533990ac9c4495733489760f27aca0d +e7830ebe4d8dd6ac878f713544d47ef5 +f8aedae879945b151c805a6228031473 +639fcb3e9cd819a6451e51e898d08a20 +a1ed6444c9c5705abbc3bc39455c60d7 +a603a61e1428f552bfc76b30104978b0 +e3554bf767b906d9b804f313e69dc836 +9f708f52ddc4c92bcd72552e66f17cfe +52798e6007933d6ef6092cfd08970e50 +78f8faf92ebca0e2fdbce292ed294625 +21f87c83ce04d8e5e3a596cd9fff87f5 +6420ee9ae778cfabf7e5c62083d1745c +502a4e1790dc80c3a3d1b04a9c9b12d7 +fd7222116e38cf536008639d120d7f5b +6564c7d6e5ccdcc4d02881c31ed11c3f +4399bd116742186b137e6daed2f85d74 +64eaea4deeebcdf1d2e331ee170c1d1b +a4b96b5b354801a94e98ad2c441c4601 
+fe0c38bbef2fd394dc54bb218b19d4f3 +5e432908c3734e50d94260eed6d0c8b0 +df28dc234513285bad0208285da50733 +8407c93fa449bf331dfbca8cf7fc695a +7dc5c3c29272e29438d8dd3944274dbe +d3177860f7528878aa99d86794d0a0f0 +7a983cc138579813f7240601590c5fb4 +32ac57635e18e75acee9b7abf4299881 +b77844f883d4c403f57f87973297ecf0 +fe46a8292e963bdbd7510297317ca047 +fef40daa87a8be117f2de905077843b0 +5fa612f06149138fc3fc7f66b17b9bf4 +e70cdc81b8e155cc7af46b7a278f8657 +ea622b313958296545dde97bd3a98a10 +56b7e6938f5a658a76b79b1467a8a0b0 +91fd15edb5595b19a1ac4e8c8c630868 +986c6e15a93ce1419f34bff234d96873 +129e7e61ece9242cb69ae669facf9451 +82dfa66ba554b8b27575b941e3e0dda9 +cae7608f284101e2e11eade1c40220e7 +8441f2d1a6f173cef3ce4f3fd756bfdd +6f866a638d8e501d39b139d3bf5da13b +4de8dfac0e46bcaa6328b96b27006803 +021e071a52cf66e2964fdc99cf053da9 +63863d35a3f9d8827d2c8a014e1f1d73 +516cd157b4b5124cd3ca0a58ece52d11 +fa861c321188c143cc9ee5be0183ad24 +af2d30875dbff7187d36f370d0ce8dc5 +b2e7f7c0449f9f58f7f329cc9270de56 +2cc85fa8ec9d6a069b84539d35c2af88 +ecca69140ca0640822dafc6beff83b6a +3a36d4f68a6bb5a8b045e67d11d6346c +90510c769742d1d3e5141961fe28fc08 +c836c0aa796b03c63bb70ec4c79ceddb +376a3004e19d5bd1966551fa82fb90e5 +f520dfe228a51781f501fedc680057b5 +18fa0e4740f890a65034b71425603174 +1ba40b966593a2ee7baac915b87b0985 +f194fefb8c922e2b9c2c48f49f33e2a1 +b8db46aca77023baaf6680ed4b1b3a04 +6503eb157d5c01b0dedb4f12b9230cda +e97cb8cc364e555f7afe8f31697a6937 +02b01455d31dee15bdd673afcbe22e95 +921ea47ab68b38761824628054d7a8c8 +8ceb9aaae7b72aa95f391a544d57fff8 +a3f360f6dfe96adead7f98f9159e3a72 +0e67e0cf976408bf0cbaf1b54c97ba38 +a843fe191a4d233154d8bce3a37a9031 +98c5a5b09260c2026f70c537c2c532b8 +6c91ffb899393d9f1eb437eb39825f01 +c10e34e8f4baee21f4a40a4011059d60 +543c5498360a3c8f3be635ffc01e501c +96279ff3f7f33f10de3575202f3f55e2 +c5e825b00e5974efc91a0e9917172bc2 +a7353cc024b1b24b46631661adba6939 +f385084902cd457506053f98d623b79f +af1e821c8702d3c3b4b9190db5e4e822 +fe2510bd01222e8c39bd5a49481be822 +d341802aef6290a15ab4d32e54b5313e +40d1437c42401c34c89b562a2b0839a5 +59137c5b150f4051ed4cc692e3b1a0f9 +70d87428acf5aa5dfb3dcb24bb7edf32 +c21aa7c64083c281fbb1cd6df572a7a4 +a5a4d2edf42a99dd3560add9b939cb17 +845ab2edf6b52b9e63024fee2dfe3afc +0b7502d7d11bcba60aa5f08e3ae04e4b +751b0ef478e3572b27628c203572d5a4 +790feb9ec0f13ccb24b1e64ea7f49e4d +79caaa192fec3897061ae460300d4578 +60a7302dc844949095eb15536c93098e +5e21432b6b96947f26e966cc14c30924 +f84f212202dcf2e96d4537990e2bf0e4 +fddb63d73e67692020696f754354ddde +da9354ce2f4bb12fd3024f6bcc0641b2 +a370504ffa6841027d7c25e6148cf113 +a33153425d7e609b5422ab3b6f3230a4 +e83fc71d19f2cb68db444e4491623be0 +3cfb1ee48d6aa16713d98b980c70e3f7 +62361cc37c0dea3b151d9597cb94ad29 +d41d8cd98f00b204e9800998ecf8427e +154b8c325b4eb7c295819168022d98b8 +326110398e39509a8a7f3228b0cc01bc +cb7635e6e1215ee18d4711c554bf7f3b +9169947d2247e046a33b87e5465438fc +7741b61993303870d1ab591abc6a8ac3 +d00a331224bb78cfd4f010d0f9fa1308 +72e959de642077421e3342cb012ceb9a +4fdb45e88b0037ee0968051b9b98a806 +52a0bd9862b5a849ea8f3297d9bc3e5a +db9cf2447587ee018b801217a7d0a1f3 +cc283cf5e8608637504f6753f318cee6 +655b4d902f683dce461291f23053af71 +809bac122522ef4adef9f95528179d4d +5cb10e1b57052ea0ee4afe1172e2c0e6 +edd2c77afb8ada27d4f51091d86bb07d +9d8a512f28a95ae89cc701e95db531d7 +3d11d0d8c776f7039bf3a11b6c24430c +1b88f85c751295a77bc97e3228d4d6ab +804e11617f088acfdcbb377d3cdd721c +61f3c0ed69fc5cb0de157031fd169619 +0643f7849a1369be54cecb95f3212a8f +e833fa507a1569868c6cd89f963fe35d +493d8a36f85770196686e8e7182ae6b4 +e7807646e787e08b7f085bc7680284a0 
+365adfc15f4d54a3bd0a3cad4fc7b672 +71342dc963274ef677e45a3f70e77bd1 +9d10b4dbe6319c212326386ad57280bf +bb71bafa3a8bbb04f7fa265d8a2e41bc +0cbffed59380da43601180241f6d505e +23264e8ff302554bab5789778863cdd6 +fabe40182ba42c47a2a2b61bbc27549d +db37ab86a0ba47b47c24223e1bda2361 +325ece1ec1a4ef39c85cc81c40554d99 +19ac8b35e051da54a11c473113ddf59f +a184bddd31b69defc10d504510cbd042 +5a3ed175715297539e0a11df56053083 +b90070bbe5caaa634a9c121d25395441 +194777ec9edce56888ef69652c0febf9 +823acb5555d73bd1a00eead5de509a3e +34e2d72df4e7705482773c7f34dccb26 +be950051c705a1f0735dbc8663e15009 +f8e4a7a7e1fbef5a7ded52b224dfd141 +eb12ab4913799e2cb9f6a968c57480e9 +50ed99735b6a4bf628738857083e2c6e +df1cb7cc85f363258446ae9f73a24fa8 +2b0bf69f1993fdee55edb71127b790ba +a6e5cde3bd90752d3e3aec1567e06aa2 +0039442ecb43c91c4385b95a346d6035 +85ad9e8aae3b0b2d1089d804ef5712f9 +752c0d266978a8c62c63cd2e747a962f +f9c4e595a19ce010a4bab036813eebd8 +5d7ebea9c98fc8d23f03ff27db3d2ec7 +c111f77d25ba12260a77f6059811a501 +7685520023958f43949943d5891743a6 +ae3f72ec9f60ea1417c09e8546924583 +764c387629eb61a4687920616e6b7f3d +b6bdd823ebee6d281a13e1fc7d3a85c5 +d41d8cd98f00b204e9800998ecf8427e +4291eb6a06d9c9b69ad7f1fda068da6f +df741e21f39cfcc06ab854c23b8bb489 +6b46817d728fed2c14d52ad6973b5d08 +16f6608d961839e6a629fa80ca7bdc3f +09eeaff5b6422725a1d75b955b713156 +af62877d3fec877884618e1be8ce3335 +53c76737bbffecd1d896a18815276731 +3ef044e4d4f818e5ad8d04d51e924bec +673b33e1cdc24328efaa2484c38bd44c +062e60c6af9fa4cd9a578fe2e0a1678b +4f3362e9908d2f35e1f079055c48cc5f +3e495285c5b93503ca12960f64b0c4ce +bd785ad06dfed6e7a569c14c5a0d1d2e +ed70880daff23f4549e810e8b275c14e +121d5ee79b56b068215b0e30cdedecf0 +ff151bb5834673659552bb9343f4dc66 +254b2e1dab883b7cbce35eacfbb53429 +db19c5ec26681e49a25a1e181636de13 +3a1b84c47641a161a9c9528ea9b8f675 +129c421b96fcbb1ee75034035bbde429 +707c355b49f43eb9ff9d7af993d1a038 +57f396be555c9fa90618bd896e5e4511 +dabd112d9420e7f91dcaa3ba4bf80031 +9c439aa2047d0543b8b2ca04b0b24aa4 +0bb4562aa34bd2610435e3add4036e6c +e63c3ae63d20b15ef07e3becd7178c41 +9c1c938593ff4d3c0e4ab7649dcd9b5a +3a2b9179de93189d9426c3d74596ee66 +a2205df6696738fab61b8b3786c427f0 +f391f17e56b8e1ffbcd1a43bc70101dc +e6f3cbe57e5f31b059ebc595265e6021 +4f2f6dd49ab305e3d36bf3698d248eb3 +2341fb0d6d339410832fe35ce86101e1 +1ee721dbf2f524fdf4a82aac08723e28 +d4625a042fd2a6a8ab122ecb92a52c68 +f1612e5122f1a722ae03b8f2f79b04a3 +db7c16f5ab1b9f73a863848ed2f6e234 +d4b5014e67491e84be9fd51466eaa1a8 +9f97a799f6cecb65298b355d2668ebd9 +14dc06403035ed48921242bf445554c2 +863a6b15dae07573042e7ed725efcfc5 +d430683b9d3d56dcf79ddde67ba9e333 +ab438ff75c6d0ce5f191c86a4ef9e78b +de332f60d01e7791fb2b7ba094f2fee1 +fff9336c16920b672e8e223099b6a38e +ed92b2b749f29103a116121c849a2932 +a7411c849af9a9c3d0614f17ab4dc5ba +dfbf3e1889c8d75078eb4c11ea2e5423 +31a7b4b1227f5bfd7fb58d66677c0bcf +d392be91e18e5ce74cbe12e3a93c5a37 +c2cfe696dfa22d9d1304a3afff5aada5 +7c25d933a13241ef4819b8808574ec69 +9f3ad6197f9b95da40bf3a1d9b71464c +7691c65628d15f2b114e6ac1d5eae7eb +c08a05fcad6461c429c7bc0210d30bfb +0b8dd0e8b0d4cce2878cd3fd81b38c42 +c652fda6bb5c35413574898f81a3ebe1 +c0e015f8d59590e61a74be4749c3cabf +f10b35ddb7d97a61628b34166896f775 +279cdc8e1add726bfc3464cc1716df6a +a397c14ba4f7e2ed4a8c5569353b18d8 +910fb98baf13b436a3bab4860a639098 +beee7dc6cb000bcfd596f4bed996f2ce +6eb066949c56e2768c5f6dcded0131b4 +091e7e1975836d19463ea31692c52241 +628ed0a8fbe877cb211da173fc710b6a +76de419ca7d4fdebd931a9a018b272ab +eacacd93887b26b1f5344e919f010bd7 +d37360ec32481b7b8a26d6981af94725 +10e25f4e0e2d1fffd902e8a0adb444e5 
+90fd303592117d92fe06585bb8841d58 +afd86077b90cee8b2e4782d618d24b82 +4567733ce01aa86745d379ce7582e7f7 +842fb887cadde9a5e89ef7413620a582 +20b55012c74c5c3749fd418d3241de45 +917ba69b21316208ae78e3334a63323c +41a0f4a90d3cbd1b7e4f69613c34a6c9 +fed45e1b18f341433fd5e184fe094010 +75524e0d225d251004a94e9e04963521 +5afd6e15462a5a00354b35cce45a2507 +d41d8cd98f00b204e9800998ecf8427e +52fcbd0bfd2df13be8e29550feb3ce8f +4619f558d01687e0c014059def6760d6 +43cbf708ea8e7988af5766a3c7f2e3b1 +f749d319457103ef3fa8e302f53226eb +aa4a022f0d6f76a976bfdb1781e9287a +06fcb312a06ea529ef8ebb18899caa2f +fce066e97ccd6db72d6d4f3f97ad570d +dbab1e077f39a6b08d88585770d08168 +2d9f37417fc870419d3a7fe191be669d +0492414730e83c805f4a720d42b2ddf5 +c5010840fbe53fb16fb7efefc5dc5bfa +33fab9d40efea755ba0027fc62bce94a +370d68774ab5d46e6cb099240526f50f +192f12e979cb61b192182a2209bb6a18 +07ecacd34c974eb47331a58868e59a5f +821e1d97a14cdfb0e6877def1450a8e5 +c7f7ef08dc00010bf98e375bc46d457b +4be1d98cf73673a5ddcaefa25bfbf8a1 +b8d84d76e461319dc916f9c46489e4ec +dbcf30526e06f4374194298b2c509949 +5344588dfda82f612f07a2a19deadb75 +a267bea458e0a4c1b37a2962f765aa6d +5302b44b54c0c473641bf59bf4d4fe9e +42b813ba1d5ea9dcc1df8f83a226dccb +6ec9fb4cf25c6626bd985c9301f472fd +41868a3fee4e6b2f595bd55caad93d6b +d60853c1c796929b87d058683b659daf +92828cd739934c70e06e2ca88144de1c +6f3db1ae3d0c9131a23642fa70494b67 +29ca1a0e003304590a81efda6e673825 +ee5cd24b698c2a9d5e1e186177c0c3cd +fd4bd769fce7eda9f5eac1c871ce4373 +f1447f3669b1039f742f01d5485bb9e4 +94d5891c363ce385a5fffb3e4c55305b +76865bd47da11068cf4323fb703eb9b8 +bc35369746a8030b814d24bd7bd8ac6e +9ae68f9f2a10e2d268bb7226ade2f73c +9a531ec730ff38d444c3d49f6f45ffa6 +695539d93fc10af2b940c4b15fb4ea56 +243e0c25d9cbe924ea92a573068bd53b +c7704a7de6354bac6e84c5ef010f829d +50408fe8125580ff6a1d3ca390a0dec7 +94f0b26de71e6331d38afacc18eacbbf +f07d26aeb4ee94ee066e9b0bf6638533 +bf09257b14dbb5df8fea5a70786bf54c +afafc15af48021b1e207d22465fbb31f +facf6a65b7ebf95b97e7637a8b6ba4e1 +f06843540279270dc10c5a214f3b378a +390b14ba97ef28b90c2db5f395e90cf4 +b9226baf911017f13f33013d90d1c62e +11e6c0257e2b8adb2e166a51b0fd3b15 +92c7a4dfbbb40596db3c460ec8c74996 +d73e5b1d27937beadcd144884235c13b +56501232d1ad0969cb2a1fd90a3cdceb +4063a7d538271663fac848999d76947b +7ca6620c4fbc1ec08031feb08ffbd982 +5d7e94c4e079387787ba0b33312c8de3 +e6587e7501e37397e5cd78d2dc7f5f63 +ad90a0fc9f1e94170968855cb658ddb2 +523d1647553da35b56cd6ff94ca4c865 +9c02e002a6b4097a91f6979b5c214125 +fc20d439f52879f40d1fadedb549e2f8 +21de77f9cd1a0311aa992f21502eb680 +72111398919a9679f1c8827fdc726350 +e1b8bc4ae863983bd24c4ccc18a60385 +58002193ec0b226fbe4a3374c06abb6c +a7749fc31936129dffebf7d9c595a0cd +15ffae6d4088101d7becb80b6ee1ef2e +96873004b8708519f01aac257e935756 +0822eb791b9e43197ca683a150232537 +0638330b58522e4361c63b6ecb574c3d +c91369766380355321089c290fc35c04 +67c32a6fddbf9d5200ae726a4949ecf7 +624115e00b6adb23d40ea49e25c14617 +9c957a05a902024a25fb52fe4948e8d0 +3a10884ff7e17f537c9fa008a81d9381 +df0b98dcae1fc3ccc4493fa0805bc513 +dc2fca462cbd1591e3795d46a920508f +84173000761f4d48ca42886b83520d7c +38b8b96a38e482908bad7f0603eb6052 +f72b639b6ab14362e6e51423f9d28584 +c1e51056bc86b0d30a6ef7859ef8ce41 +1ac6770cf04a809933f6f3e032c3b74f +7afffa8806538bad5cd5d23dd216dc7b +8ef433460f19da45bd9ba95fc1bb0513 +03e7ef22e9dfed239d1e031d33044b7b +07ce9aedb0025a8ef109a571b83f2f5c +bee3778f20cd8306c3b4d39e3e920f59 +c943daa6d164953dedaf164e2ca72009 +d2240e0d80fda2638f38b667382d6cbe +5257731a8edf3894302be28b0b0a95f9 +68753a411274e6db8b8bfcf492e27fe2 +db69a4862d5cf9f70bad86c71c93e7c3 
+bb7d302237f48bb1167f7f9dad9a841f +8292c4aa713ad31a6a9a858fcefcba68 +d41d8cd98f00b204e9800998ecf8427e +d41d8cd98f00b204e9800998ecf8427e +e33112d47972e7b508b543b5559b080a +57ec8b2f4ae3c52e56df6dde9ff8a730 +68326340325524842342e81c2c1edb78 +466e3a9fc2a3ac2b012a7dcee494c6b6 +cb934d8452b975b9b0bf14622bb26410 +ef24925ad91070876d68a2c8f6757530 +a7ee42b0dd1dec9c83e68933d842c679 +fc91c97539381ed93953645eb1041328 +e0e738556aa5df197feb1fea217ffcb7 +e5585fbe36d9ca3f3adba5295ad1fd62 +34504a7737edb2847121dfb414193a87 +e62c15d50728be1fa999aadb0bc14106 +339f0d7cbc8e02571e6ae0ca9bda26b0 +2750aae6423849c47fa65c93845e08d8 +2dba45cc10aed6e47b54ad737521c348 +af6f08b6145cbf0a74766b4741ff0870 +ba2ff09c90fffcb9066a184d96cff8f1 +197272908a2d3abacf50b586b78419f7 +baf17ac21ea7f9993f71ab11e6522ffa +ba34b417d6b79aa30c315d6bf00d0374 +69bb83ce2cc2a12d6e57a0a5cc3c588b +3308f5ead8c9471539d6b374a4757757 +ed69506c639b026433b4b17e8b9f035f +728f4ae3959cea51515fda05fe3b96f4 +33a44cfe131096cfe122929afb3e20f0 +652ddc6fab349ae77716c7c4fb893c33 +de9fb6bcb5b2c66bfe548d8aa0240c3f +ae06ae0e33ebf54e6befa08c353095bc +bb569bd03ddc90710846c429bafedda4 +48f075e133fa4fe1f22d583a0fa6d91c +92973da89722d177a1b007fa26463fac +eacbbfb64dd59da51ea75df98a6d6f7d +1df933770056db899f563e0b35fce5d7 +caa58c1ee57e0d2af66ab917f63ec62f +d87a66dd99f9caef4ee60cea5d37861e +fef031ff341630f860883b50e741534b +cde5fa8e76eef0ea6e219f64638f0d52 +603d75ebd94e27c412f62894077405a9 +c550b559f07b6ed0dc275e63ebcd6b9b +9cc48b1f01e55dcf6f59443003570fd4 +a63a1d13b26c2c938383befe009552e8 +d423ecdefe67235f7c67ee3f1ea61aa6 +5172c1b754dbd95e9ff3f7478956a069 +8fe828d2851fc49abcd37202d982dc3f +b15082e99c91ac92cd16ccda845b6ffb +fb2d0044c8605a25483473195e0c4ecd +5f20c8c93b91734248ed1f0cacdce225 +65f3d11d0c7b6d013daba730c8b94592 +d538d1836863b4f18a0f2f78c392f25a +2d72988b1ae75bfa672f8c6e64937847 +37bdd921577dfdb4cabd42efb87a2921 +11271ddf3e413103a544ee5e70867b74 +1a40bf139d2fd746a8a9879b8280575c +91b1b8a2aa9cf6ed50f029122d304b05 +25ce938804f23f36154a92a6cc1aeaf8 +691c454d9873ad3d09c496d53ee0c6bc +8544ff29d677a3577b4af4b3d21a9b64 +2e856e61df11f27cd803363266a0350a +ac50a383d18b3c7c9b058d240bdda8a4 +2abc9e79d6edd868c9a8639b390fca97 +1dc919144cda17381cf1af68b3825eb0 +4fcb974c1b0c3f1949b4ce4f16742a4b +4570b638dc797ee79420d78023507908 +f96c4f504e0aa915de1ea6a2143e3653 +8a4471adc01358dbe4ba365e37d4a5fc +41ca8189e6ea95dfb0ec1dd36ff7ed2b +f2c0191e0f7a1dea3e44cd190b97be37 +dfc140d72a06025c288ba689e2d5e15e +03a4669244fb99f882e5e17ef4f8bc44 +ad73026b5f15d9f4768152a6e7252c35 +7ef2bb23d19e7bf3414682bf16d17c9f +9f29169db670431833443d5bec1b241d +5f7e5514f86950735e9fe561cff570a7 +bade19a387ea40286fd817ab526016f3 +51815c0eed00389804cceaa7d3eb18d4 +e7bf7355c103d4a86c97585e71e519a2 +f92e0b2b37b1c6b945c9d9ee2e2160ed +267ae58c836a9cdfc9713f69ca4b32a0 +085eeb66b320fdab5bbfa255a4f2769d +799727960343678811377031e631c987 +a9b3f5b80261c3a185b8278c1eb1dc1f +958dd614831207caf20c19d96ceabd5a +fa4d832ac13f49bab77cd032cea08687 +6a99e7147169829b476c5f69ae80a690 +2d3d9b69a530c4bb0af75e1a7677f941 +b55fd38f933194ceb9c124775ee65d82 +f2c5c9a41d180dbf7abf54dfc7b053b9 +553fcabf13cab220d077ff0581d22cd5 +3a03662b37811d253c7d62174a25d2a2 +fdab9f85b0beb686ce6eb2c27e9f6606 +c296cb017770e75f4126217ba7a94289 +38eb002ae5db0c83a1737a76ee4c82a2 +c0bfed2be3ee64515847aeb3cd098655 +abdb7a7920b35adaec0b2408325137a9 +f7622881c3c0d9bbf04125d62a9323b3 +0c139aac9e9989de56f128172f9d1d6f +881ab5b63c6cffd9c10caf5c46fd777e +b07d0f3e2230af1db06cc189bbe830bb +fe13752c6b71d513897287ed1925fae9 +db365912c76b67a712d9ebbf6ec33531 
+6c572d0d489d3937089a79bb201af793 +90f3c37d06ea9577e77c95a6aa968784 +69f70098f9ed429028dd0a6c43de80d3 +426a2509e3e854b2197ae47461d45ebd +68d69d7bd4fd72a87b6cddc4324ee7ee +faf50a3e9d536fd732f3df53fa2aeac1 +2ccfe139f827e25223b889bf63d0122a +d508939d2caf753c9bd62b509008e4ec +30ac361ea6874f683621b0ac86b03043 +ddb9eb46be20d5deac9b467cb8221deb +1af6a340a64986fa22f81379c8df1507 +8e7339237462c5ba1e4e4cf14706b683 +ac6146657d71b931dd4d070384588b99 +85e86e6b255adae6354af14b503bc9f2 +67f7f0b4e6915e1d204b6e471431a9cb +33138685fe5d5d96cbd7b92f69eb76e8 +91dca4eb509fcf1d5a61bb3c138ee90f +0e00674cb873c9102642870634cc2d49 +e08ba17e7f164309a9a7c79352087a77 +83f28f86721799be4ffc9e85e09e32c3 +c09e08292356b3b505b246b17fc1db8c +e6505a948900512d1c3adcf2a298e439 +ce3ec9380e67365a9985ec1f9979d707 +3936c6565a8d7212ec9a4ab8f17b5e3b +12c4313826e3381b7f4189bc61e81dd5 +ade29e0dcc327fdfe7c90686901f2e6c +872e2249a1ae0f35245ecf9189312e04 +1f5f8cdb1acc76149dc1d70c0400872d +b72fe4698e3f09d962e19b16af33552b +c5ef52a5d1e45110ce9e5ebdd8c26c23 +8080d59958e8578d210dec8848b65a56 +1b9305c4bb7dc5f66188f8c1690d955c +b9e05c3b8e395840e8c587307c4e7fd4 +c87523ce8edc618ca5a7349ac723471b +905838004686f8d4655784033e64c86c +7bb8daf550be49532a912c0bbecaf4ef +8f40b661f6b1fca00a2e40111adafa43 +3f80f9a0472bd438eaed469173703f45 +908cad4a994e1f709fcc7f415dfaa1d4 +bf8dbb99e35ef6f85a0b0ad50bbfa194 +80441fc50014e3083cd2b4cdca095712 +f5045cd83d37dadd75fbfbf9b8458773 +bbba771b1cc40c065188b6b1b7990541 +85cd3ff1fcc79aaea72c3922ef53538b +26b817140ec401702759d89aaf9b5e05 +405ee9ecb2f677a05245fc9e97df2128 +4f1db54aebe73c847985a5a297f79a0e +c2a07942c06752c062ebcb9baa76f1f6 +b334ab38b959f661ae62166134332a20 +9cf1c20f442eb90fce13976a64c9eaba +c568d54cc1061edbe55bb36a8916e027 +ec76f18607e55828b45c6f413d2f24b2 +f36bacf723a704b2b5895732e3a7b30c +e74225ba9ea6fa3bd95f64183e4b0da8 +6b4328000c3a20612f1346dbeb27af9e +60d30810aac3634ba0640cf2dc947938 +5382c9b94c6706f4fcb7f4db93089296 +9c04a898b9d91d16bb581a83d2484a52 +2c4648e624b15114bafe5241d9cff7b8 +8b71c8e0b15598004c015022aae657f2 +5d6ec64bc4cc4f1b71b865c9b61da82b +4e6bdc1bb14b67f93e9aa8cc691e5a25 +974810e9823b25b8c8055c2b83e33ace +c5559a4013408ff9093bc4411fe263c0 +9f061fccca0d7df7876bab38905a9311 +14221eb6d068b956e9d5a3ba61a2406d +40c483cc248df08ffdb485da6c3ba881 +b6f1114f353a993f1e42437724b9fccf +2ddfec536ad8c28111001c897884b538 +e74abcd35126a8f7d6e366a1f5d0c33c +35368fea903015656fde751b5c945d5c +a3f2784bea3ef896ba44e486266bdc32 +8700ea3945d9080599de4289d67f55c0 +1b8e5f9790354e7cb09c0995806122ad +14eabdb253654da6482a1bbf84d531e5 +ecc6de89421151703ec56a23ebfd8b66 +8558f4053ec1339ca73b66ef409ebce7 +f909cc7d81ed1670f0cf6f83f9157cc9 +e309d7afd1d6cc56393bb4bda7b0e06f +89f5079cadfa2086278ea0a62733e71c +1a6e26e5d9d5fb42e57d7ba5a29e4762 +d819950997da2ea73efd10eb447ef331 +bc0558ae85f9b61e65b72af15fdbb938 +d46e136a1c3f51db26f3950d3d7d102e +80faf8161c9c1305383c0b6702095cf7 +572253c608e56682dcd3ae2caaa8c220 +535006152b03683b7b16107a1f287e66 +20524f25eead028a20325fc8da1ad778 +6dda649b6293c978260f1c7c647a63f9 +d62d10d98942ebb052f133a0a3950e92 +69efcfe2b68a94b3b0dc8ec93e13b046 +94a825e64c404ab3b945d89c4f6260c3 +cd75f57322ed2afe34a57c195ddf93d1 +5c7738dbbf4d4098a740423ad9b63453 +ef26eb0225518f21cb24790e86bbcfd1 +981c3f2f09c1d9c7fa4e52d12454a9b1 +b5c5324e6688fefbc13d67bcdbea1c49 +830cb5a5ff3f7255d3134506bc32ebca +07f48df5644c00532f55a34fd211d2bb +d41d8cd98f00b204e9800998ecf8427e +8378e7432662aa500de71d4dc79da62b +b6907b090e9e94a77048383c17621a1b +eb8ebb171ee9d30f775eb08b718854be +c43ee3f2bb372312a697442ab071cad2 
+b2266056e8d9128999f0a1e395a6b073 +509ab76038d676070347f8463b7da0c6 +fe1f59a2ef2ce377c51e2cf1b54ea01b +d10d5c1f04f459acd896cb677ad0aa4d +a7336d759f0082baf711dacad477e1c0 +c47114e6c27805a8c838c4d262217e3d +e68d6d80b1bd84e44bbd408a866107e6 +22443c2e2bbfdf6431aa5939fe8e02c2 +b4771b628534b8b33118634ad3bc2457 +b993948dbcaa8b8725d0d3724cbc363e +88c05f2e6cedc31c93e3e1c5c9fdaee3 +2611f98829842d975eb8094af5fc98a9 +e7da74d0aa9915cc71a3f24cab964b1f +242977327df1fd9191471ca8e6788b0f +16b28acadad26dab0fdf9a37bc1e812c +3d7778d9cbc90ce41425c80a02ee3c32 +4d856c2a268edd04637309156514d258 +4b93963b996b74b79d642241e835f53e +b4063b02a7b5cdde15a6afa6bb70622e +fff51c7d357189288f82000b06f912d7 +ee1609e984c788456f9dbbc8aa499f4e +cecab21dd607f0c66e22433965f10013 +bd4c071abec7618bbbf6ddd143f2ef83 +46aee9dcd1344d6c0b448ceed8e8c553 +4bf6ff2b2ed95e12f2424ede8355541d +cc69af614b95dc3b77a031cc91d31b1f +fa255ac72f18f4e53aede3aa57f6cb35 +751388163a44b375acc016e4a31e487a +41d1b0008a55a182e4ef809c63e48aa9 +5321d8a91f9503a4ec28856330894bc4 +cb03a78d2dc79f3ceab95f15c5e27c53 +8f5774d29590f00a90e1201850a448a4 +26ddeffdd9e869941df3df7b14912e24 +e372228fa57ccfe48c741f610b3e0a72 +3bbb3d03de175d0b9dcf445a72c2cb9f +62840f803da220149f5c760ad1d24804 +42cb3688430ba4714883bf00ebf57120 +ff39af1af82d6714755cece3147e4969 +bb0e82fc5c180938aa5e7492f625e6e3 +9b7f93a9b2e357ec016779fc480f3a86 +d8a54cfcb6965fdc690be7d5d1051f2f +d023af40c3612f5c470d5c5527b414d1 +cf0eeda9424d2e09b04bd47b6584f47d +36b7340031d13a5c12fcdc78b6abf0db +fcb53152d54db96b060248c274660974 +14956508fe1a41d8f9cfdded72ff13d4 +7128c0ad5b233767d0b66810a06d1467 +1e6b7be8f0311c5d5083532b89dbf39c +709d780b4cba1ae0ee9030145453d9c6 +d4184f082205866f0f00b73eeee5c1ff +1568a32c4c007b215c7d34a5d791d390 +4e2c7571f646a00f3c8a7fcaaf1ebcd1 +4c8636afba12349c509ef4612f7635ca +549b262f98a316cdac87d62c0c8d6c7d +ecd536e7c6547da1cf571fa70637e4b1 +0dabf4afc6577a3f1977729a5772792b +cb30e9fb1922cebe3d40b35e34cdea4c +b1db2e43cb00ce9aa902078c63e3f93a +e222484cfd17f486ba5b9d9e1e26e1a9 +4c0e69639e856c18ba725b6a845462aa +d44005c7c1c30a1de733c8d758ef3ce7 +d6980d4a84b81e7037619d0fd4398335 +0aff872d7cfaf3b0719e28f5b5bad2ae +adc7417540c7a156bd14f759f6ed35bd +e9d7e43e1efd50cfd1c901282e27e58f +e4ca16d54f9a8c79ebd106780757f8b3 +2be2466b8fe9e684327396d2d21d3b1e +4faf9ab766a0482e4aff51e4c97dff8d +b8e6a6bc8e6fdf4aec6f3dcc3fb8cd35 +10c25310df42652069aa95df3f7693bc +aaaeace4a213d9a99f31b16fe381b6f5 +e9f1ffc7830430228fba238e6188ad7b +942ec532ce3bb0f9679fafb244e250ec +de5ab3d9cdb43de723970d548c01a111 +0f8f55ae6459b048cd80a820da68a2d2 +51e4e8bc8eb005d2cdd114946d2943b4 +68454b05f5f0306544c374a9b3670d7f +6411b47e78b971f82785cf11d53c0e7a +a275adec3e88e0c9206171a970598447 +db5115e81dc0b665b3bfa393507dbaeb +30c2f3d52cbb644df039c7467b7ea31c +0c2000eb5402b4efcf01588f02b53e8c +3c9ad89359a2d7a5a4ee87ed1cf8cc30 +46858554fe215fde02214d88fab2b615 +1c19f6f0ddb0e853ead9a4f40519be3f +9b2d9967c07ba5261c1ec22948eff33a +0e6bba43ab0146e663d33c44eb109803 +403b6165291786215ef7e3a182af68ac +2933817df03362b6bb9236e0bf392ec0 +07898bd9fba589320ec0ae1fb11f575c +4d3826b8158e87db85aceba3bae8b6d8 +038bab2006f95837e9012337595be2cc +afd279c86ba0b90e257a91c5d9f7c03e +5287a16963286e3852144602c63e3285 +1ba240f559dd46fbdf69cb3698ce372a +ff6db609de5d386b9fcfc8e1dbbb509f +f050129d222fe58500e775c6b567ff82 +1ad8b6149e8cc460e00a5804280df91c +8e55223047e86295bf014b11b2433417 +a6235184b5f6fa512ee9491493fea835 +46768935fdce98d98b625a2a3fcb68c8 +b06a06aec0a06e253d86d17073b6ec77 +dbc4e088fed7099e726dbe0199f72288 +07302bd8eb997206f00bcd914748cef5 
+cdab8994ccc40a72b2522081a8873f16 +1ae8710248577e50a93f79a412e7cd84 +859da56a540d87cf707d9ef3183ffb24 +29ac180be036e41e9259046ac7bd3f54 +82b452ec5d7fb0de6b9041de205f51a7 +73d10fe1e1ed62b06386c901dcfabfa1 +2f050e54ff5c78ae402f855f441ea47e +d231c9600044c8d7d2facfb88ec0607e +10c20f840e9fa6367ff5d87d529e0541 +af9b1e793c1b5c95c7731f0de5389fd7 +08c0f0be2f8ce190319d7bed1593ef3f +e8ed5224c7a4ce4869d5ca25220f0487 +7b80df548bec5f43cdc0c893e403d8e1 +74278519c0de0234d23fdb1795219c46 +df2691d0729bd23d1a70fdcc5c5c6c6c +747887d01c59a153bcad1ac067285368 +ffd6e283d45aefae3b04a1ac0ad0b708 +3aa1961c10b4398bfb1002b6c2eb1613 +d9cb55b46648f56d74abcda991e28fcd +1bcf3f73c3814e50f9036381fb295513 +dea3d72d51c9c90a48a82672bc39ea4b +55f807c9835d01643ddf735322e418f2 +89cd84069451c616827057748cfee0bb +ff6c14b642f8ce5d9bcd8fbd4c444077 +10487e061a31b8326d1e654fcbed779b +d2ffe9ccf00e450a4a615db51951406c +a4b6996e32e716a486ad880282b64d9f +22f6c5eef0b1705161315a2e5ca5671d +f80b847134e87863eb854dd09ce392c7 +60ebbd2f76c97aa4be55c9d782eb967d +3e459a3fe1d976da62ce747ba4de9cb7 +32b728f5c0f18ebd5efc3d8e3f5d5536 +f50523a5e65ddccb0250d620687d0071 +b1551f36f10d0df03f653508100c35d9 +3e121986fd21e2be0f0f5896fcffc01c +bff6d5d141c58b97f8d72e54c44ae314 +80bee783ce1340cdca5358f8c5114bac +6f66cecd030cf04ec09399c25875e57f +35ca2d77d957f8b4d3b2bb572ac59ffe +8a1143aa8f57e7827b737b6205bbf0a3 +b69d009d8451dae02a2d5bd19322b2b0 +d075acad6a8c5dd96931982f0b710ccf +33849725f20b410d9418d2d2602f9a6b +e8be62521a8d69ee87dc63e76bc90dc2 +22a981399a379383667e1963b2ede71d +ee17eb82f2ee16e910a0aee241e6edd9 +d579f596fd48cc2220a436a7a02c8e58 +74781967e95f6d7a43fb74f54e3f1145 +91db00c0e3c1e7c2617be6b1a4863b24 +60b597bfe2b0a05fdc20fa7c4aa32340 +833e235cc647b7d643c2dc6126adecce +f3e6e4467e1d9a755ca92365377b9fc3 +afd06de5346c3df1e5fd1eb1f32af1e2 +dff409c7287f3fff1059dfe17628b6d0 +de0a340db86c2e21d86b4141ed9929ac +e68a53ecf87fe5d41920c915180d1c26 +90a82e3b4c3ab8db3b1a5e6ff9fe71bb +00f6a5b678242ac9f0349e754646fe6a +4ee10bb2d977c45302cf3fe4d001c2c0 +c232d9127e72e45f1b328effcabb9383 +efb633387ec02cbef8fc8a0d7c75c03a +d54f7996727f6bf0bc56bff067e9e563 +aafe26340d3fbe2be3cbb723dffbda31 +6fceb0e03dacf6f1b3e1ac184a4d0920 +1d35d6f6def536be533e8ffbb237d658 +00d629e07e0d96c6b664a183de245f35 +5528c8ec021797f8077be6ae16c51234 +2494c5df043cebf05127afe63039b535 +9e42da4807e8b853c29b2a1e5d8eb4df +eaa53f2dee42c6d66e80d6fd0c1ba410 +8702e95167390596f4362b58470485fa +9298f497edb46ee8a6760dc1907ec5ce +ed34c80e40976e394ba58693e344c480 +93450a24ce5b2b3d116c8eedb4e6b266 +14297a904da64078155e3beae363b550 +423aa20afd23684ae6f3b8f263c11f97 +ffa1e12030a264fa1328bb3d44ea8ea2 +0d816eaa08f702f844182251124ff556 +f057c14fda68693377db31a74c1cca2e +e8bc29367c6783e21ec009bd923b372a +6f680c48e563596981585d26605b7ad0 +fc290da80bb650ab21f22ea66d01cd4a +d99eb2cae8293e1db6ef59965af3fc0d +ab157e48ff7fbce709325aa58cdd3554 +d93076cca82c0cfd5bf01d8688162c6b +0ba23fd736a330b2002e7c4c2b2a1406 +28fa375f47d0d1df3a82f0170fd72993 +f027c543ceebcb5b1cffcbe7131ee4b1 +c6948be12c9888c40d338e41959d1cd2 +58f63666dd2c825fc3a9df40d9c0887c +00b041580b505cb9877d819ca84a149c +2c89078126e1ff6825eb0864f7ca4195 +acd83aaa123c6c02c93b00b4563b0e6b +0b8758b50eadaac5cbf9461a3f0d9329 +f568034794f2320e0a554363b4d73bbb +18b7d5b3cde0d6bc2119b946e7fc71d7 +abb155ec82334949af32ff1904da8921 +00a8e2ed03195bb6a14807ae5d33d9df +6cfa94418920175ae808cc66c0c64500 +86e48bf1636994d81826b19442694440 +ab1b139e0327be25c8379abaa809bcfa +26144f2cfd6a3db85b20345961228ae4 +08205240eeb8e52093abeae8fe3145e9 +664861698b5da01e11d716c1f9cbdb7c 
+f54045cb313a5515d4721f5a683cea45 +30c4b286a9fccc70ba9d88417f96c096 +ad84c1f4233e1bd60128b4b284beb6eb +79cf4adf5508d27a616578ed29349d10 +85013779f34b6c0295088ecf4f73549d +26255dfccd7ad2085412c019a1adc47f +9c3b7f4d5d02268693d651b567b4fc3b +e03e3ad6cd086defc0d53cc42850a9a5 +9fa86500bdbc1b89083e631ef64e14d7 +e4b520886ef505764b8419df7ef8c5cb +4bd4391fb16140e9db103cd3be3e9568 +6ad1bccfae06aad7a67b28826a801234 +ceb45e4badf3a8c3e06fdcb6cd252163 +611c8012f0897c6bc0c297987da30a5b +45af573f0dd1940817045026ecdc93b3 +3c951c96f5bc25cead20a9a089eee649 +017ea20529f9980c69137462475e3872 +f47973b722804477e44a2dce6546d758 +0ebb74763ad2b0c0ee2733dad86bd5c1 +7dc0870d8648064ae5b3ab6555d2a9ef +12ecfa3aec0837aef051d2cb044fdbcd +e5344be9caa73ef6c687db8f5577643d +ad0c54ab1951c953ab72c6468ff1873f +ec7b071a2a85789d23f159cf55c1385a +78d231ee522ad1c279b1347a5cd9dc1c +243b3af8e8707780d08cf5f4b96ba2c7 +9c4f224df6789706772f7eeafc4b5eaa +35ca867a52ce607b8302a188786b26e2 +eb6aa9aa1fcf8bdde936adda0a5a4a6a +b0c4b541026fd15974803898cbd22d0f +7a3db845ee76d1400e1685f70c3c8121 +3a0b121669d17d848cc91c59aefce98b +cd14cf27b2c159c7f6d883eaa5764932 +862600d1149d3b8bb899321f20fcf269 +b2523fd86afe29905b65511a7a100a5c +328325aef5554f2c394d8dea75209523 +74a4ecab9a380e940feca6993e0ba085 +7e9186f5a74112de6820ed942c5bf2ff +d8410f7f9c13c4ac53a340cafb480216 +685364c204c1f6c6802b5502ba021915 +bb215eeafd69278b3e9780b3a9ab6cd0 +d78b7e61bc32b91306ba658fc255d164 +46f28f2cd2aea7361e0b7af91f6f31d0 +e3a8414b311427317ea85bb32c4c6496 +d982b7e41482e9d630cc8ee39dfbc6fa +2f10e52c47c74eb9bfa922fa1a468d83 +0fc89448160b6f966a91bb888b036006 +46683ef4fca4c67b5cde932ed2b0f2d8 +7140d522fdee22fbd7c138fc304aacf6 +e6b12e3f7b0a6ebf9441c504c0c76399 +84fd02d7ad2bcb58a05cdfce4ff0810c +e8b0f7c70ba713e69551502a7feafd3d +fe268461aeb24833c2d1a525c33569c7 +bf361d555c05c1e20f25d4d4f962030b +96e8df84f0462baa8c6eeb2813a4cdac +e6ad79d235abb04255af16ca175e5ad4 +28a8d33706de61117cc7f1db2a318383 +d11d90601b5cdae17f38d825060db863 +8fa4003b16258d0de639a5a7357b78b8 +aaf2323297a036c2c5d3b05cb907bc56 +83727ecbdab2fc87545115d4cd8868e2 +7394a10eeada97845900767c6dbdf6c6 +815bcec299eb5c30e0139869cd1a7df2 +8ee6fcad62788467a7268ace590db6e2 +4bef2d709d3df29cc26bf8e2699e1c6f +7854ac26cccabf07f8edbb596ec345b9 +33271a5d4ff8dea29ed4dd1931563991 +842cd82ccf0aac0c4079bf654cd141a0 +246907acc8c6066f664b973bea22500f +950e778d97797c3dc6aac910a6e4c757 +1ef40ede6ba92b7a7e9fb344d816da3e +96bdf13eadc48819130941c5e5bfb2f4 +c6c13ec69c56203a4f0bf100d982309c +bd11e8e6916d20b7b3dc0729334ccc1c +14322af9ba299b0293ce20eec7d2365f +c8e45480c6ccec77852d5a34182c49a2 +3666ec2d0d4fa00fc2303e0373b85fda +7eef7fd60f9a0ab2f6fe2d31cfe7c7cc +9b1ac4955ff63c630d3e37d029a52e5e +a1afadfac0daa7ed6e1b1c611ce5d6ce +e1e6a6edd861543c0091d6dced0a2d0e +cad63c582789830f22eeb33e38ed7434 +6efab923621dd4bb48b5825e43f3242e +ba44a5869325943296e3f934333d3afb +8d8345f4d2a4169c2e61a1184081fb0d +477ac25ba902b6a05d97dc618d087784 +f716265bba2d0e2c01fb5bdbc510b4fa +1368c3c376577ff06f2f5f93b2d89df3 +1a9030b1dc9803b3dec3b3cdf8bb91b0 +40da7d16be1945961e7ca73689fbf70d +5b2e0236983ef6c82c9ab224473a9386 +ad125c3f0129d782bc23d7524ad88bb1 +8d22a8f28e1c9041b721956f9a66b439 +47b1ab0ccc1c522b1046360a24204660 +cbc5fdfbbce3dcaf8e767b943059ce9b +afed3bd4b77d38ceab503b6bf8d48f13 +d41d8cd98f00b204e9800998ecf8427e +bde74f6ee13decd6fc4be87ec4569265 +87c132f809bf3fa2552a57e74210979c +72a02a12792f1ccbc980feed3f85afe8 +219bbf777cf814da234e31e0de42e6cf +4902d2e72ad605c4b834b6ea538dc845 +35971811a2a4036765f14137e18ef795 +98abe18177697ab480fde044003fe959 
+38ec84252404c33c064f050062ce6a7b +0c0070b1499e158443b4d3b27afa0667 +23f49c36fe06883fb2536c302d94e9d6 +7bb7e76bf4c6bd97df3a311118ba3d8b +6bc8df57fc69dcc23001ff9facbde13a +4fe6415b5011cc6bb9a8e6ee707ffe17 +d3c897593370f265c7ef9cedf6ebe49a +dd231ec5ddb6210d21e45365a77262f0 +03ebd9b477d269df67e2cd08f65b163f +1b227f84396da4460c5a0c82f75fd1da +def4f6921d4176d717d977ca128ee1fe +6e0f9d5d405cb71ca62797bfa0561503 +be7976468e06bd8706376dced80cc03f +4b46e4e9c17da6090b06257a114e2a81 +107addb78a8fc78ba69575daf084c8df +dcc4cd7add8f9a1df4188fc0cf32577c +5eb39a63c9bf801bfd792f135d18635e +8b8ff5571f8ef9ddfde91ade70b1399d +c0d2861364381332da36035038dae91c +53773135c25ceee428aa57823f96b0ca +d45f3a281aee57bc31604d14172c1d6a +f11473f378336ab798cce9a16f59700e +2ad88b662265b39df85f5689611f93f1 +f0fdf9456985b89b4593cc8a97d25e02 +a3fddf1d91c9015a51ca012f75685106 +016b7f14fbe4d5fdaa6de305b0ccc72e +3b787b92e733fc5d796d5613095d3689 +430a48d960ef3d166cc6674318d34430 +f2a72e2fc6323eb7dbf2623633ffd42a +7b0b8f55dd7a23468b51efdc8b5be6ab +ccceb1a7f490519a8a0f4655673dca9d +8344243281d8791d37716a9fd4a28a86 +927859801cc15a4d44b06ec114e791d4 +8cc618918473efe9c6305008d617e00e +d182a416e23bcaecaa63e749629a7cf5 +b5cc8f0ae93ea4ae444d30b68bd04889 +54f87337526f34f04d84a13a8480d736 +c7b6d71fda2bca1510cfb37ce3ef4770 +f93137ccb3c03d75e67a68c4a6083970 +c081c934a0b572fe5af940b9b3a7ac95 +e3fc94748e672c8d2a875aaec190ced6 +3a8284d521b46bcb8eebb25a8c5f67b7 +c0e752533f16d3b0aa04157cf09cd44a +831fa6cf72631d933431bdafa679b94f +3bfeba5cc5b9063a5e0d33386187faa1 +f152c6ff80f7931ef3b2e78fefc36873 +2ddfce188ad69ffa6774ea8d0c2f0972 +d1dd566360a56997d8ec2d9b982bf14f +fd3230bbe5e3b48a9e023253dbeb0af6 +3303f0b5a809fd8dc806c4e1edd753ef +3fe59e55b395112223d984d449793b8c +fbc7eaed192c484f60fa5f6dde078f61 +be54c3216c196b7d7cd44adff5ec4b42 +9d9b6fa6ff7260e1e5bf5a0640510815 +c475b2de0ba884e1cf4b271fecdb42ff +0bcc235d059f9dbbc13b1dd102860c5d +f19fd53a32d7bd84614bf5aafe13bcc7 +8b3a355f4d28a7b4e0470ab1545abb7f +4577eb10609d00831602f3bbda0baaab +dc346b25252c248c7b9e32788ed12096 +8f7dcb5a4167bcc1429283b3fe3695de +f3b8b0d3fc4d2608416beba15432c889 +30a2737eeee6b9b5c46032a7fb6b3a13 +f5b56dd7c3144690125ddc6f7423cf63 +bbe95aca7d5b48d5140df2d3c86bdead +583df8e747de78a5c43ddde6e5a428f7 +06b1029ed359b6de920d23d56b153b99 +ecf3dd7e10834b95a5938a952916ae68 +969a050d8129c1550d6d6cf62422229f +a9b6eb341adf2d81d1933f293085b77f +8b8fa6081ed91bf8ede4f75a99b3b9c5 +318fcf95ec10c58bb1f81dd7d856648d +2ac22899638b73bd37b862b19af562ec +cd5b7a2be8e8cd08a76184f6cd244898 +46b75d305c054ed0ef3e8a147d49301d +b85a25d1821ccc593917add296849fc6 +2315e20139173573ce34e7cba2e380b6 +ec3cb10bfd83cf046a594ea119181434 +5d50ffd9b7fa7b472411de8be9fa9434 +1185b538940a527269eb93181e8d5754 +8e065b5e92064b27ed7d08201ede21a0 +dd33f68f485867c301a23a5b00371243 +3e26caa0a56bda7ed7c1465bb35b5308 +8ff2e68c10854d7468410a393f05ceca +574c68a2e80cf8e39adcd1605b93d3fe +e9a32940a4ecde19457643a83b24db10 +64f0e75bf0a3e295a05c6ad50c475f36 +271d30c6d0f6ab0b8f7cd40c0b64549d +aa5136bb1c0981747e29744f0fe20d47 +d4663117573af0aea7eeb6e5380dfffb +62e1cbae659fb430a5dfee42efc3602e +b87c62a427f0b40c3aec6e46684d2f71 +62f00c432fd8834fdc47e8b22bd0133a +7f917b7513f1c200a2eeed034675245a +12bc8b4ee3626863ebca7f11b6a48cdf +75f6e14bfb5d03c61e6bdea3ad226354 +dddd2b02fc7e52a20ceff2561faaf00a +d44beee5a984587c465f52d1a3041efd +a4b97688dc56b13a8294f0830963706c +22f57961726448bf3cc32c2b75c69c38 +42ee4a5d5af88e7a4252597562c87e94 +707a261dfb24f44250a59db313163068 +137c5dee952735c27f3c64b60cd65329 +2a5f268c0347fa0c6cf0f2253d044174 
+3c016bbf3ff201005631a1ebab4a4a0f +fd3a7bd1a763383f001dc37645420dcf +81930ac20a7e7351709f36ca18935248 +ffef8a212820b73024faf0822220b9e9 +14b39db364d654034c7048e7328e1ced +5d0d261875778e38c2b8806c5dc3b936 +bcdfbbfa43696a968fc18eaecb5f1f8f +24e91ebbbe458fc3227c55ca09ebf5b8 +e34b0e5264a2f4e6bd947f609bc1c5ba +46670f252523f88b9d98749916c2bd9f +ea6267249939973404173e3306026f7f +bc796416e1d1d44eed23dc4e8c94f0ac +5fcb25f386d77450c6cabcb52d3f827b +887ae9921dcaa5c13d6697dd9defa830 +6b966c5dee2e22b9a6036485cc601596 +7a2880cfd6c48b0577e71d87a6b012fc +1b579b7e3c2efb956968a91589fd889c +f7d6de4cb9d64012c3003e0c7635d20e +c40b3a9ca87f0d52ee1114c86001eb70 +e4a684afb41360b2f4565eb29263f587 +e7daac23d75a0007b5ccc36210b6064f +fbddebde90098af52bea177e32b0d512 +8a9ae8f188e871a290e2186319581464 +44ff53af95214e14ba4ffcd5eed8e41c +b5e81f6d2781c5bf2ac1ade162eeb989 +3f783c0602d97686e563c864c0ad597e +40b2c217e915b5e852fcc7aa90810391 +f3d981428c5b5b4fa1a0d3a25282f8c9 +3b77bf3d42b32f99c21a1676197d4221 +219df77f8d80358eba79102ab6fdff66 +023cf0321d1a19a52b58408b08a5b5fc +f1c9bb46112a9a09a29d75a3edd307a7 +257e16fd6d1ce5cfaf3179573cbbd8a9 +2b72e0532701b5cf726bf8e88ee837dd +ad49ba5349412574b34cb7a96de082f3 +17256047f78a34c3903f9f496302ad54 +face68b410fb88139bcd5b42fe3b22c1 +b295bc9222fb686f102819a022e95533 +68266c37f47faff688438c5c5e6ec3a7 +5ef5a9e35149bfcae7352708e8cfee61 +da2a017e456f48f7c376b6fd7b9363b2 +e5cfdd3498a2650addb5af9d243d7782 +10f8b633242b8ff1ca53e51940b4c07f +817710b9f349345164fef05921920e51 +7151565394e97bb45757bdc020fee071 +aeacf5a19696751b3d6f55234e92ee3f +9a9d2ce7df4b74b09c1b65422b77f9d5 +3e1d517ac6dc7d25059927e6329248eb +95f7989891cfc839a50568bcf4c0f144 +e144038d2edfd35bb27ecc307dbe0252 +d41d8cd98f00b204e9800998ecf8427e +d959c85a309f913d8ce7333aa73087f7 +a4ac2cfc05d24d045da18b186d44a565 +c2d0a43e997d924a1e14590ec6b791de +8b72b028485b23a0e358c14803062928 +ecc1477e11e219f4998dd19c38dad515 +394938beed40dd7509793cc5312967bb +34f2309b44abbb39d225f0612ff03e53 +26048ca674d707c794bdfb5123b18405 +4480c16a017f57bc36d4a0453d4ac8f9 +90ecf5ca383d4fa69a948f01345123c6 +b570c3fb1f22cd42c50fae14493c83ce +28c215fea4552ebc30c7782e7eb19f1f +3a033a7f7ab54ffb4d729107e75ac187 +b4c7dbe202c7d1e2b8c6633b411eb86b +0e166c43a8e6487864457eb219caa2b8 +3bde1f3fb552fc1d6b15f50e99cdc079 +6a5d4f363b7fdb6cb600086f672655d3 +6ae93240d108765fdc5f0f778a668ff3 +e7e23817292df420e2507921cfdddf98 +d155f412f82c54f752021bf9b91a8a4b +c3cb29f59d052c97a047fc1e76b78e05 +f0cc478a884e02388d6330cd662ac015 +b0d829f71ff80a0f039e905673543ff3 +db9f4f9dc520e94f797ab24618dbbbff +4a4952db98c00a08e341228c9cea44b4 +dd598c104fac71d695959ee2007abfc4 +bec42d4764b59d3dbf071fba3fad9399 +b6a385df11f52d23ade84c8c43eef8a3 +02cde845b63214277128da6afa75c69e +f867a1ba0f250359fd5a2a06e3aecd6e +88c79025f8f7590bc6b4fc04d682b6c8 +43fea3216c31a54a85dbbea4bb32f860 +79aa09aa9acb50bef5a1408fb45ef778 +113ffceb04ee5a02d4040dc7caa9b6d2 +b67b8fe74ffb5e37edf9cdc6a226c980 +9af9510863a4bff704031ec7566178ef +7157fce8b6864949a10e83b0f5d69f0d +28729e781809279d6d679feb38b9c4f5 +a92bda9dcfd6751e0f6cf90a87312fff +cb3cbe11aa7ec694d44da849bb150fc9 +e732db84be3e51f7528639130fd25d6b +e1dcb9c29a764f4cf9b2d9767b633c0a +fec99e8664eb9a7a9bbf279596e5d59d +5408d447bf05c93318d8df8b3a24acf5 +0d440c05e8d66083b3492d4722b0ec49 +610871d04fa0eea35396eb85a96dbc5b +371bfcddcad76cd99dbf897b07196159 +91693a69317ce4d14c43d8ac297d9907 +1d5c5e373b714d90bc69f0bfc344f26e +9ff0fd591e105cbbcb60931cf21aa83b +e6c9f231c311cc52173edaba423940cb +c5bee3f3523daeb62f915af7a42ff3ec +3c10d7304d3ab6f37289bf9a321a9b86 
+dc133bdf457374b4f64aa53f6770d5e5 +1e3863f99f4ac2a61009cea062cd76d2 +f8776402d972484cff0533cbfbac2615 +72db22840b2678dba363a49476a32dd8 +94e837baeb0fd71d69859565bf53c324 +aa8a4c70d6628330291294c688b4368c +f90fb644e5676537fd518af238e5b53d +01ae385b32316a293b12a24cef98aec7 +8147c04c4bdb1cd13c80f1c2ccd53fef +78aea2f4deb7a5ab076432bc27498b4f +e62ad3f1124858000b896cb81dc8022c +93a33a655890b2e0a2d1b2378a627310 +44ac1154767ea5f3812f7d9c20e5b2e2 +22469605ee7700caa5fac81a504071af +4332dbff0b486e4399ffe606bd01d96a +9acb4d933b7035ef3817c41a68398f6b +5b5e129ae6f6811c49765b006e57b0e6 +4cd8d85dd53a4f0cb755cb42c9e21fed +ada978ecd6b8d875f5666c331a16666b +85e4b7238ca7ef49d71b9d45c1ed4255 +ae3349ec74d4a76a814cea551e04880b +054038d11ecf38ad011d34b5f4ede979 +98dd62c349c117b9921f9cf65599139d +6315a018a672981a1032936ae2bf1e04 +5727db41fa9fd79080a90e3f8b64ff2a +79a03b2437ccec25abb84582a3951ef8 +e3150e995bc1d2f068cd7ae9010cb2c8 +cfd170af226147d800597f96da18bc48 +5593771169a202b88e1a9228c99951fb +d5feb8d16b73cb57b44db2c84611bcf7 +189727a05768f18c5d8218141ae0849b +77bf4ef7a275bb272b5c6e87beb2c11d +ca50a1b794e077cd1d52f9b0ccfda1da +737e2b5c2ec75bd9f73054c9b09872db +c23d91ee387e15c570f1b522f0ea6e0f +a2a8bde00c33e94cac9659ada3336fc2 +383fe980ef6f174c2079e5763d884423 +f15ec1e04ca3bb2b48fb2fefde3e1467 +38e6688ba3a0255f9ba134884948dfa8 +e4cc26b9dafb9aaaf809fc3853fd3c21 +da02ac481f0cb1249fdf26c1d21bdef0 +9e7ddabd3e8dc2ec5b169fe68f1c1f11 +cfe9e678a4a36844b029c5a69e3f4bb6 +6268533806269aa5de8cc2629361c3f0 +7787d176e6bee8c25a146193910d57f3 +fd5ffe08b3e30d35aae8d398ba484fef +ad08c3740e9d9e79f8659ef3f60b5255 +fa10c1929f1daa8cebc07802af57c3af +653f49d4ab59285b849463e49bec1479 +c49a162163905045ba9bd6221ac27fee +670b9c8b89e2f6887b91aa94f349d67b +4ecfbab7887ff2c806a05e7fb32739a3 +b96ce8bcac6a83ec4058e01da8524854 +9e58cc77d42564e86b08b9667c05eb5a +bcf2bd25fbcdc4f5e6c5ce42aff601bb +5d7ad2d52cad15301c45195991169c59 +b40077fdae1bf82b47aa1190569a8bc2 +23e228a8618b945f5f1ceee23a402ac7 +8bdab8e83081ed1dc75b957bff0e22e1 +f8cabd026dd1ce2cfe1961214113fa64 +40d7e199015a219695c793c210062cbd +0626bb19b28deb280937eca709e20fc3 +0ba98201041fafaf2c904df79a976ac1 +8992dcc64a645cfead9f1e0b17e3f5a5 +3f9a15829c33c1042f2fef0c2f3294f2 +752d2cf728930527a8a8899bafd5e546 +0d7d601375473adef0b9be12adc7c6f2 +6ceba6bebc589305b8a8c0324c3fc9c4 +3bc3a1e5e2573d7cf0c6c430fd8a9a4b +0598cebb1925418f8287a9bf1bbda994 +b8d6dd12ebce8929056fee66f233eca9 +b9b5ca2aaaa4032b8782e879a08bbfb1 +5afee565dbdd18af82b49052046eb34c +42050db8e2ac516ba9ab115d6dea1f63 +5fa14cb542b88a406f8f345add11243e +13d610a98e76eb788fd275dc3f289dd9 +1f74b1a919499b5a0893283e5813a758 +761050bf69bcd2e862516bcee4b66af5 +60c521addffc03d4bd65c88f64b98f14 +020ece14e25d21c415dd1df6dd45e0d6 +17996763192849834eea731cde7cf784 +b8a26988c0ab944b95ab841a39ebce2a +a2e67a66909edc63adbbc5953da85252 +203316834d4cc8a47b203009fdf3bee3 +9907b7acad69e6404607779cfd6e486e +9ad8b054be919babb73430ca37c44cce +b1dd94386e43688665d79b9efc6760a0 +def5709e40acc22cf8de0258937bcd22 +abdf944fe1bdc4d1dda0200b2c16f9c3 +f88b5484c8f5f330e07859e8e774b777 +83e3d26a2dabb6e75400e89c0df9db1d +9b6c7e8b544f1687774e9fbe2ad637e7 +f721ec406508eef96dd7a61c55158897 +284f1990f74aae22b578ddce6a2aebf5 +be9deb71ccb58fa606263db0b09d0885 +2a42430693dc774072232e65bdc243f0 +20e3d74223c2e33998ae95ef77207d7a +9a677b660c533b5eadf4fb2ebdd4976c +c58fba60d258c8bd19c9707a8d820aca +ef4dc2919e50d27350d03d147a0f8510 +758ef0bf1af93c820061f3170365f0c9 +2797a9a4bcda45912d73f48df0d23ad3 +bbdaa69ac9d974a94af49225b4240d6e +89ccee15c13515dd36e62d302e0f3cca 
+87b715c4baaa9aac82927a54a80b466f +ee0bf6bbe42e952e37e67d1b3fb62329 +9879b1cb6ac541379794bef06e6eda9b +d264d3f7066e8a628de5af40c626dae8 +410d8a298b6c9e492915eaa6d8e71e3a +0062a143bd9226248e81a850af05f2c7 +0a64894b356373b295bc734bbdf84e85 +2ef67236467f38fa17a13cc9ca3baa68 +5fe5733ebfc80a7b99c62aa6b6e2fc91 +dda943abd01141d6bb376c288ed158df +9a06df989606035c3647a1a89eb04ea7 +a6dbc89ee7fbf30dff0a289a88a7bc11 +91b57e6fb40556c735a7fdaa1174c6c1 +38d4e7dd7160a5f9c1c52885b389a758 +3f1c7bea79803d1e650e0298df089b8f +b8b09f7a4577d598bb42aa2fe8dbe631 +63a65ff7f216571a222b46732f18fdf5 +88c4ace4699bf1b5ef27c1a0d3c06b44 +6a97d4c4980e306e1de0224917678e46 +fb7428c91e23f81894189cf75b6813ed +b254bfbb0de9e4070d3b215f86243998 +4f8664451c2cebb274814ab27b66272e +165c30b5c06eea47bf2a8a0c211e0623 +54a1774c71b1a77faeb095777ec0aca9 +c31112a220e3873dab4286a9e7d85b23 +47140b6a4bb4a9f3bee2ccbc0f2c0497 +f6fba2b480cafa09fcfec83e4de7bc4a +041beebe432bb1e0e40df840c63bcaa4 +18e1e0ac009495629b57dc0315cb1147 +26ded3ce83f3ada2a3a2912fca15c76a +cc5025c93702a7df191bcffb2fb26ee9 +546cd59f9385693ec097594e8ec2b436 +c391eca607495c15f65b6bd985eb4664 +025311d62cfe0bf5229057efd6f98266 +163cad743c5cbf46cbbadbef79335ad7 +f310e6ab2e63bff6a0d2a28c06e5a2ac +329aa76700da47430a6f7d2734163b5c +4bfd4b739d6b082140fee30739fe62d5 +59c07aa8759f0fe76b7d0fa2d1693800 +ba2a64afce073003ad051c860326573f +74c882684aebc17ecef1b392de9558d1 +81c79a400a312a09d51cb13c0397ff99 +da469bff4f0a9ced4dd4578c4fab50e7 +24e479124b1919f8ae1b2d0e515cbd1c +2f9d9e7a1bb2a7537cb596411b0f2918 +d02d5b9c81aecc13ddd7c558f4bd12ab +028ef6ee5b02017e32c3400f17778f01 +b4740d741000d8fc68998cf9c49e10a9 +98eb3cf5fdb88b74ec9c933fa58793a5 +a5269574a1dbb3e6561b0806b7cb3c20 +c7505cd1be825f907abbd8db3e01d39e +13cc87629cfe7e4e4dd23623be6ded40 +27920bd6c76c37170f2492fed6421bf7 +73f986f04297c8d179914c1a7dc9a405 +73b4c4b43b0e421803453b1f0c4d9c98 +18ef3d0e189ea0921710b97fd6184aa7 +8dd54670e8f9b5517aaa7737fc9400bc +657ebcb3c2dbe49dee30d0545ed5c245 +6afced216dc53685a47755f79f06b7e9 +93f66dde36e6c2552343b73dc2c0182d +3f80b78d63f12ca5a478cca42cfe9c39 +f6378e861d24527135d332cbd6912bc9 +ed5ce0d31cd6e4fad99e7402969d98f0 +3148d4706d35989a3616ded3daaef933 +5867a43ca3df57f89c63cfa1f729dd1b +d7effa2e5629666328cfa1032729d81f +86471cc0a058419f457de4e45a596b97 +fbea43f7e520035a2433b050da0a0d05 +52768460b5082c79faa90e6a458d1fc8 +1225d9bb53ae88b28f053b6207310afb +e27df3b8ccbcbce48e721a08708342cf +de806db6f40a059132a65bf9b6e905ac +a63d925e08dd02621a3b801b1cd3adca +5181b5dd0e9991d83db7e6707d4ae930 +8f0c3e9221568b5f0d46f55fe39d24ec +fe8e905420713d222e77c7ca35cb0a87 +19fd126e9ed57bc44b97a3f1bf26ece5 +d2513afc5570bbecf83dc926fd262a8c +dad08e40fd33b959cfb919da66e428ec +3aaae26a2b60d70306e66e14add18186 +e516043b549128b2b516d34abda856d4 +bfcb3f35335f7f29edf7ef9d81450296 +300b872c446ec0207a4118546468ce77 +13c51a717b3b52a8ab2e3e8ae6eb7d6c +5cc33942cdc72a9f8730c877d0b8a9df +bb8e5448d92a06a013c47fd1b3d47552 +914d2aac89f505d46a60963da82849fe +aee9ea8604b7818797f94c5341b7cddb +cbb7a9929c7dee6a76db790078cae835 +01adfb4b31117023b41e7c9bbc1258ab +d30ac7dec26f8fc990a46f967d8484ee +7b1b6dc6b52c8989e88186e872500dc7 +c8bc42f83eb8f07b2a873372a40bbe65 +4dacd6fdbca2158bbfea458ab037764d +17eaf9aa97001e6d808e9eec05e5680b +df3431aea42f83771325eb687b1618cb +ee84be277d71ecb89a0649c85768a9d6 +853f82b9c7ab3044d4dfc810f60751f1 +d4224df7223bc39da117228be0f424da +1b03572eecee5ac0bca685cb0adc8ace +609424a5c3aaa78b7031fb158ee37059 +0e958c3497650f535c9b577f487ab1e1 +8bab8913aabbffd3a0d368dad085b7dd +fc142944caa722df2ce61d077cd499f6 
+1fa87b7d4665a3385d4d45c61c72a58b +0916ca61df22c4889aa5ea91fe923adf +a9842c9ee02c4f3beabb8548e96699f6 +5a4989ecc8fc9ca65a4a9e14bbaec26f +298eda161a181449c6823dd0ab886ad6 +e6f83f495d3e97fca7e3318a10e97d90 +121d0d0d3bfd0e9933b565d3c2c76694 +742fe401a12fb8b84354b41235c3e51e +d86857b53e30cd820aa539d22897d6cf +828cdc5961c6058fb865582c2f337152 +ec70364182e30f417bd9fda4d0b48386 +31e14ebaa86e987d8acef404c692ce0c +a7192d17953b3ddd8b47d45dd2a1c00d +8fbb3e67f6a4897dae613659faff132b +8983545868718109f4754877c3b3a008 +ca3a8ace82f4a1803ebfacfb45afcc4a +7534ead7b7e60bcc2d2468dd84ee27e3 +0c0426eed150759001d7bc81e8996563 +8f23dc27eac11da067927a679e46de98 +1b1b0ac8127c055f86c3b2b20e821291 +8a42f8a59e37f95e245fcec7744be9c7 +cb375701abd743979be6a779be727869 +4be0a9426a783bdc2f66cd5669c24669 +72d1daf05343e30aaeae251f8b509c9c +43519e7f85521bedeb317d280aaa8a22 +986fcf8f6988941a13edc9f1da52469c +a0193ca0cb0fe3006f43dd25bf9feab3 +677b1e0dcce6943e391049cb4730ec1c +4ca9e5295a72bd4d53e6de963685e7b7 +272ec08d6052f83dc959c72a1995d019 +13b8a049f10744e96d5a3d7e4cbce37f +956afcce869e3a42ae36c02e728fe91d +3258bf5db3b835d0d5518813210b6273 +7aad7298ff64e2fe5b1eb01e97ccab29 +427c67a1e9bac4f1d76a206beea4e5a4 +7d580879af13cade4e9b0695dca06913 +a11d42380cbe85a5f2e376340fbaaf54 +50aada191e16eee2e94744ad99ae508b +0a12ead41c2b98cdb60bdf7262070657 +c110c16a61f9f8bc3745d80b4482007a +1b1864f878416b79e305f2d016a8fdff +a1867c2fc9ee89405f6323f144979a0e +47dbe36c54ca84cdeee0d66f0c83fd5e +62317c14f6443946512f1f7d34864182 +d41d8cd98f00b204e9800998ecf8427e +be706d7ee426258c1a2d0d93c9480826 +e0aaea997f7c85eaed48ef7b1ff1f0cd +eae69d8466dc0ea932e5b12cb7e33859 +4bdf90e39ee52ea0ac04ee42daf95ef1 +d41d8cd98f00b204e9800998ecf8427e +95d620c47df3e6f94ed5aee991fe6553 +ebd6712a8e97c2bde3924cd3adf08378 +297b9279e350dc5c3bc0aa47a3ae6917 +79fdc0b5ea0d68709b95a1871fb664cd +ef582c7075316268a183077137d1e0cd +d51d05e9f8770c2a35bc74b735716090 +6a288fa96b84f63988019ccac1dd8a9a +27679b19b943fb991fe1b4ee049c0c03 +01782c522ee3acd121abe9e03d577524 +160cdae6078be56bd86db8f57e5d63aa +0f2c80ed2977f60955563fbd48e71671 +dbc8dd362d0bcced13ae7d9798d7dbce +d15bb5e24ee911ee79ac5ba5f92c4d90 +958d54585f1a77b5ffd5c9a38bf0ab05 +0e5a2f2520772b562b06dedcb54b6dd5 +b37d02cef6b170c72fe2166c6573b402 +55bd123a417b0e68a2c1f010508127e0 +00ca19bc81d279d2475bde4a9f5c6355 +af2c601351df2ccfac9544319e90fd56 +c6150c835b78e933ce378000aea13faa +955945686f5ebaf87d96e0f10a4ac309 +381d09e9de818504c770f0228977a976 +be529c19d3fc336031a166768444534c +cea12a2a19944cdbd2346b2d3eff9847 +6a7a83c656f3e3ceeaded7d979027ba1 +3b73b67d52530d1b1ae25ee4aa468020 +2429fba670161305eb4b78082113fe44 +6db7f93421dc7d648bc871f2c7b24cb0 +5419b3e00bcf8e24db18b0dcf2a2b546 +fd2245ec3c4ed1dd4775c5f3eb9afd1a +59a22efd26c04187e9423bc37bea78e3 +5a932442675c1069f6fcb013db9e0a8d +bdc35af4bafe1c2ba55ae91783583085 +3fb9d130de159d6eb1eb5e09936066f3 +817723d4a852f647910a283649e3f09a +edb17316da6bd54496db7f3b590b9023 +83814c2d3d3538bfad02b71f212fcd9d +1511fa41526999d92f5cc9bf0f991fef +4b06bc74d08a68c5a2dd60469bfe0423 +317956baae0f92b36f618948db66a0cf +f33d170e0b976da30d50e1d4c9123e0e +2f960b93243216ed00c667641fd89783 +1ae9982f68aaeab92945342b0e2a1d28 +d1841e6ec9b1b606fd7462175ee239e9 +f61c2d0320ad1e4edfb58b51368f1d94 +4627688b15d9e0a752e7387b49a34d0d +5a0c7f590525afbc94a928b43967c300 +d041f557fcd8c945672c841b552af38d +b7da5f6c5dd18960d0f0fbd95499dd0a +c0c0f7f8d51d0b2f042594c712ad947c +85ff74c88eb3d544effbcc2c2fd9c354 +b8237e0a57e0d05119df3f9fa013c810 +e4ac30c3c8cc29b63442ae9ec06ebab1 +30145705ec0414e01e2e16c4943b9a60 
+e52583d035a95e5bf75dd76bb083d845 +295af57065a78f73273633a3e83bbc38 +6d7dd270655aee5d431e9472edc01663 +08797704b553c2ab372e9f3d5142d7cc +3402474b9e0d60b30ac41e708d1abfc0 +57a9b30defe7d362717af354d227b5d9 +76adbc5b3227dc6fbd6c1db9d906ec36 +d9e71b36738e6ff746e4f68a37bbc836 +4408539d74a4f2e4ec0a055455dfec26 +38149d886c0761561ab6657c267683d3 +13d668c7e4fce5c8f3858eb58ce0a85c +1a82f6d48d2698e9412ee6465eae7cbf +cb1bfaec374614bdb39d991d1dc6e420 +bb26e440fca9fc893954395c5e257213 +2188f5c434dc2f19f9b87f2928ea26d3 +866997cd24398355b2023317f96353e8 +a454d77f89ab9455b9958b74df4e11c7 +4b36c6d28316d2ef767c583937584b16 +51f02993bec8b8e9927ea482c8b58caf +5eff22bbfbe421856d01dc9ee3a5ef18 +cc419ddfc7f443bfe1dd920057e116d7 +67b7588558f9fd67ce4a8cad97fd54de +a9f8e340af334e57c264e4b47c4258b8 +47260ce95f1a915abe2ecc87d130ade4 +ac63d849ff6716352cc4b54153bd2273 +1551d54d8026b2d00a97171881f03ed5 +7de7557c2a97d6f407f55585b2626bfe +c8e178c17b047cb6d65e28a1acb4b0fe +6452c6ce111382edf8042305c599003d +d1e16c963bc95ecadbe66e1228a96016 +a704447dca0dff5e61c0e46fb62e6fdf +29a9f378941de5b25d88b856261af128 +d602adf9e9f9299c03a0c9c2bb6159dc +3f11b431bbf38983b6a35f4a22975fae +391277c4bc03c0481ad485bec7c039f4 +71322d0061c0cc5ee1270681f3bc2e2b +0ea6c25b1ef6b7eb440ea60d4dcff404 +8e027f56c06ed9e43465d70c56c86435 +08d0f4b49030b7633dfe2f1eb7103a21 +3862a463e5e47178a88d188f4ef90d7c +691b52ab475604eb2b186fbdf7cdb09a +2cbe23603f30a41dd1d2aaffe4377f14 +af88e10ec6e763cdbd59ae81fc15e1bc +3c445fd1a12a7b2fb8771f983a3dc4ee +d1e84f1d20292803b295d65d35616e7a +90f4a325b82e96ff72949e1946b46f3b +4819aadbed8aad5cb1040a8d3e3b85e6 +bf197c16b7a153dc83c10d19f89ddf08 +325b2d4e00b546f19f7f666639f89f77 +45d712bb089bcea4cb39978ba3a19a46 +4b3c8b0c18aa54ba53f498a0d8573eab +e5f79219cb635e70f1717c5215edf4cd +b9e587dc11a4df3ea6bc639e917650fc +7adabebe524c2be0980e6a5321e77d28 +2cb48e5707389b2e664c15685d6a96ce +fc55d0ca4b689182d87ce7fd433c57e1 +03ff4bc3eac84c0b5ea61d449b740dc5 +523d87f0f79a63c969ed41300f280983 +b6a0edcff378d091c81414766969252a +731e1671a8fe9d33f5d7b057ece0faf3 +aa3d8a70f235e13e711a53f041b5baa9 +207ae0ed245b1ead4f70c09c825be993 +fe58bfffb092653ad9d17d977ae69e25 +9c991bed0148b50135a9e05e92a1646a +c904797d8ad4e2fbdec579c88b9f25be +a50a2c4346324dd019ae6ef3a7f74b75 +f60f34010a480a8b43f2d5c9ebabe81d +6b11c4599d7ceaaf4cbb4555539c7be4 +5075bc94b4d9d064d27f9573037f0e49 +a479d484e0ed9ee4d3b777d22890134a +42ec348597bfca3af8dea159dfa1fcb9 +2fb5b54921b2282b5b9319e6bd495f55 +914a033878015c07bfc1ce15ce1b366a +ac2140f976eb07579f2dcc63e27aa5fe +116c46e663400818e6249568e19775bc +81af1b533c64ccd744cb2118478392d0 +7da90c32c7ecfbbc4d2e44785a3a267e +83e4cf38addcc54325a295abb568739c +e40ee7b4106f6fb41cccb3fa9dcec130 +05652aa24e2f0d5f24fecf441dd81906 +54ec44a2dc1a6746f920e517196f3617 +1ad823491f98cb99e391c8a5ee073276 +9e6de6ffcf70d830b046c718ae136470 +3d7449283559ffff7740fdfdb9e689d4 +9604b4fbbf2dc5325524e0df8d3e1941 +aae90502c49a1bf38bb3c4b38e14e3cb +2c5aa2698158d343c67b6e513f05db25 +64af7d958138c1c3708586ebb531ccb6 +83b6085f803e0f57bd0ce5ecd1404762 +b6e11592cfaf1c1f4be598c3076ce88e +7324ed4aa05a3471eadaba3b7438447c +ab742edb10ff2c3cbc83d2c6d4538637 +54723755a49d373316dcc919bf24c4f6 +6cec8cf410bd0e7045cf924a9edcdab7 +823af8222b1c23abca6d5132a0af90ab +c27144ad5be27c28a3bb5871c3475092 +572a18156f9a88a49456edf3bc3ac277 +c79dbc5120a4b87145493859c0aa9c6c +4c92906d91e4c504a3d1f757dbbd46e2 +c362be0acf8ec2ebeb121b4ae8aa615a +fce704f40a7895e6a1d7d9c4d44f5ae5 +f43cff3887f0f4a3f4993478f67ff8c2 +d841e3a96834b9f17e99b37d7dc7c9ba +139a458453c2c7c4d528ea7b4ce80682 
+574ccae2241c47c4bc6f16052fd3b27d +180c381edb2580277852d5ed139f2481 +721054d1008f0218531ec90eb2dc359c +b99ebdc3a59a455cc0123df30803ac76 +6b635d722a90a5a4047427394f1d04d6 +c91fe693bd71d6afcfc234e03e51e4c3 +a9fdc981adedda498f375197dd384e16 +335d044120bd2e5de0cd42e38e36898c +696f8f723a4224b9f6d944717133b153 +f241c5c11de8d6b1b5b402ef7ca12a73 +467f7fdc6defc148033206749e5a4f90 +010fdaebfb75355d7fb0383e1ec47a1d +b4715bd9f0298b029b00d86e96e7ffe1 +4d2921d8c497f5456a7a397950292045 +e3fe198dc8ad7b30dbecc2c68740ede0 +8e37e9c21cfb1067a664c0da198901bc +956815a587ee7902db5b6a0572659255 +0c0efcac31149033a9879b4fe4c9f72e +c0a49e728b0871fd66d7e91a61e3a985 +0480ce6e86fa76657d0eaa2b0fc7a5b1 +4551e5ed099c081509c9e5c90326a39e +ab02ce00f110bf5d31f21ed52bd6645d +0f80069fe9e767d6556df3b7c3238d68 +414e15eed4f26eee9b9a9aa78245bec7 +2686ad745217f8cfe29f4675d3246114 +6937b41c92636a9947b143ee6494a0df +ae54aa6ac3af741ced23dc1af42d3274 +24bcf0cf697447579f72331fc018cd0b +a04cc329e903d437d61c976d95ec70a0 +3dbf66dd47268de96da5f48b70acb95e +cd8f4acda3ce871e9358c96bdbb2bd5b +15f2054134ee3f97ee798f6de1405549 +53c172aaca94b39211423d2948962e6e +14ab8d01473e74f72f734c1c5c3798fe +4b7c31f79bef4b4e45d2a50bf21abac8 +b47356b219ecb00fef7cdc3c46bec2a1 +c3e74088a21dc0a097de0b09e6dded50 +0828d74ffb4e9eb28908723cedd0fa58 +343005ac77ee718c1eb7ce3967071765 +bb103acec3cc7518f38f1dfd236c3b6a +ef18ca317ec1be47108704ad26a9ddb8 +6209dfd54e637e1e99cedf909f8819a7 +d796664f3d5f561ffcbdb4f5a09d88f2 +6d79d5024bbc51476a883bc23ef32167 +94b7e975922bcf50fac0a0af6575c0c8 +1e9400f9c5c7ab55df1ddd1ec252b17f +88222b45f50daeef93a762cfe54541ea +25f0cdde9efa9ec0600d84c3a2793626 +5b54c5bea0484de0cba3a49d3ba35453 +80e83f78e09f1e4755a25f976e6333ce +96620790726200f951ede48d59d091bd +844724bd8d458cd6c3a571493a29f27a +fda9cd76be2685b8307d5c3bca152b90 +f8c6f79ad80862dd3e19aeb2344b23b3 +0d040f9c1e477f09f928070d0ee266c3 +869cd2171413c5a2dc21947d039b1976 +cb81d7bef17d96a50243444e36077c74 +8c3cc78e9e40a08d61fb817c87105c4a +362cf6d658c0395dfc420f8dc4fa8d22 +abb1c8bb7bc7691902335626d74cac2b +70dc9a9eb64d5f9e6007503f0c21003f +d1453aaa5b05b7f314af63ad75bdfd99 +ed490c6515478c8254612eca47006d56 +a0c45a04fde40dd2d9c01e5aab30b526 +61d08e5ad4bdc750e07eb29c01078f98 +48cc57f026943ac7a1e8f077183decbd +fe7df947f2619d30344e96a4f6f0c87d +113fae229642098dca4c638127ada743 +b62d517db57faa729c5a941c9762ba1d +9480de0ca7ca4a80abe1102ce2f610bb +2add5cc06a6eb5653bfa0c23222ed632 +fdc777174287fa385228449930cfe862 +06152ea9146ac173be206e742db77c34 +9cd10bbf8d2277d1d35128c121b31c8e +9e9a07c2d6e73be51467d8cdbd2c75a8 +69abd4ab7acc4cbbf2b297e2c3708742 +8f5049c4e85628cd8f1682ebf34f1e16 +650e314a6ee395c16d310fb5a3c8893a +3b9c866548e9141e2f6fafdcc765b83a +c5af4993a8630b89f527e01b3e339efb +02fed860a5c4cbafa5396d20186a62bf +42ae6a11e2e0cb2638785f7662f749c5 +ccdce7ae45d89471eb9ddb8b26b74823 +34b3d5baa10287dc6e1eac51233f31fe +f1a506dcee3290487be34248ab74ba6d +a32b7cad35044154938cbc5183288e38 +0e0e86e38da0a428fc27e0d39f889ef5 +10052e8cc3503aec16b0473c3110e1aa +7c64fbf37f23e5117cef47178abe149d +9c7993c92b001930668adc8495f6d188 +c9407344ce4137fef8de66e62f3f719c +656db0c9220e92e5ae17ae73a955dee0 +e2ef15a69a5de6406f080f2d3697e212 +370894d9d20ec96a54efeb57199147fd +c15488d9303fa2b87b44ae93825ac38a +840acbb3c7a042a5f56a0423cd330fd5 +4eba0366eae673ea84482cc8cb7ee4b1 +4ccd856a39fb2f601b0822a23517dc57 +db28235707452091300c5c9ec3456062 +a51c9c810fc48d111ec457142b041801 +1ce3bb37b4d07fc1ef549bffa370325c +ca8d1fdfd18e06e40036776e705bde58 +50bfe6d5f7a2f88c1517ae9154824baa +c2ddaad71b84905e4362383414f60908 
+20a9f7dc9f2c8b5d51fb695dc0c74a93 +d4eb101c20feb0d2541a9f2976b91fb6 +a5bb0a6781e2644e2560245297a7f6b6 +7fcbee4f03b09f995273f2afea158bf5 +9c926263d35fe49e9863e2de8763eab8 +e5d7396bf7eeb12b3d4a1dd583494c8e +4271cd9ffb5aca4e7d12596ff2815f97 +6ab0c762f3856194529c3086ec505719 +daf8ffcede5195012c1ab8c21855ec37 +d587b5e363c5fe13cbc782b8a433f374 +173c18ec2b71496b07a7580cce48ec58 +cd930f987158e0ead977fd2e1db461bf +38600cfb8b0562241a3fce98b1386b65 +5ac6e4283ec1960e5ea69d830e97c21b +8bd949d9a26c85e1cb3a59caf8403138 +640960549e2e5e52df9fd6210f880658 +70fb539819a5d28829dc75d384adb8a7 +a695e0f6401f391d3c86ddeebfe44244 +4d3f29345633e277688657246104301c +daa45899c7344081623db53d330f4bfd +e053bc37b236978c0fdd25c3cda3bf35 +cd74d515f333b93cde34015ce065c596 +ccd12f5d8c679ac6d0403bcf2a3f5abb +2667ecd0bebabfffd80413fdea70c140 +74bf3d6198633de5514d1daa1584b405 +0e2558f68606ba5a27a757636077154c +fcf197acbe61b80f9f54fc2381e9a4b5 +834b1abff4edfec6d797b36911b7a50c +acdf7000f89471915e384a7d66a8fb54 +ad7b14947d8eb10eb48e7a0dbf4c9c0d +31c6bf11f59f603a28005113319bb7fa +96f6c1604c51ecfb03883a87895a23d0 +9684b86c622dc78c627ed07a12abb55b +8a4f3c660ae05b0c63a732530e3b36d2 +a45dd32389bc6e025859742aef855256 +f218f8ebcf07228f9a274253d100e6c5 +47f490dabc2761cb2e48cd75527072be +e39fdc0c9aac34c2150f7ff966bff5fa +382c02c722a319435b1b87bf45793c69 +9f2508f194b974d7348b1bc98335955a +c362f4c16ea42f189f3bd4be7facfd8c +21b3119667fc79ae19abf0b23fe2eab3 +e6834440fefa21652d98abe9b900d495 +ef386116122b5020d6e2649aa137359d +9e9d682bd46e1fbaf5cb75977f1e7aab +9a415f1bdaac1d194e7e8c0247ddc6bb +872d9c36ce5b3345d41b703fcc9ed5a8 +d592daf19ed6e7fc614ff45c215ab940 +0d512a953ab122f179f7a19d0c6c4bbe +cccfe2d7f54f5a402e33917ce73cc5b3 +94d8b557d5b8ee181e9ab8cd3d0260ed +e437afc23af3df36ce8ecb0d73b6b0f8 +afda706fb885bb5260925943d4942643 +d0d133438b09c0022ea604e9c76bbf62 +357ea46d4613bbc043a445ed698b16b7 +86015e7589d3802cea9444306cb7d1f1 +ce9a00ab3470802171abe58d4ba8ad68 +df1fcbb320ee9d7780ce3e2f2df53495 +bee65b4ebe85317ad75f321c9c112a84 +04a6fa4d7b183ccefe3c102838d88fe0 +3b3d668187d23db02cc2b19bf264ba10 +c5bab8de3c2b48c623c7130c38286d18 +ab9e01a0d24e56337e9383ef61169a96 +c608738cde90ed463c3b0e4e81611f05 +d1de09eebfc83a5b161b7b933a86e9c7 +d659c15c81c1d886e8ca7effce594fad +23e7bdb66d509cb66bee53495b284782 +4cbc471859014c0c255f57ee38ae53df +4b4b93efbd4dfbf6cf4b376f845431ae +8409866e35c7d0d402c359030c42cda5 +51f3b94e1d052d860bc0760cd503dc8c +5010762b4900b381a440ba94c29fbf7d +9a8b31ebe105d177336c930eef13c548 +aba363ab4a1728ca88d01e38917dbe95 +82c4bcd4f7dc564b02356655beb22046 +968ed146e407d19fcbb0e0a7e43937e7 +a4d21022e4a7d536fd0bae156029d054 +b75888adee7e42222ac580ddb5e3b504 +41a72d69d8ece3575c5524ffd43cfd10 +5296d988322357f7be2c6e73bd1d21b3 +55b751bc23cce0b97d51cb0bb7ae8e90 +4c25f84228a2aa19c0e257fada92e076 +6cf0bbcdc028e414db3e1ca8424ce99a +636ad9df3f435df7db29bdf59ca54869 +05de108c7dd546e3db8c9cc01b2d92e3 +5849497f4991860960db2e870316b7c0 +80dd7c666a8ccc3fdccedc75670eed6e +7555d8cd4bab1340380555158f6443c8 +33eff2755a38bf2a046cf0ed7848dee1 +fdb572201ae17665a3f20054f3338e0f +96526e2306cd9a464f9e7f1e789602a1 +a9bee1c9208c5ee48d1bc8496c91565f +a3657858fb9f40fda9ede357b739fb4d +c1758a4bb8c3c90dd7db6b67bcaed827 +84a7950bbd6d8e7b1ab6449c1ad9f763 +d41d8cd98f00b204e9800998ecf8427e +479191d89d0ec2a048b5a6c08a3805bd +f4fdaeea77d8232402e6a5380e653723 +c7d6462086cfaf3a1a37ff4c74c8ecf3 +7417700dfa23ef81a00e45ca28064945 +74769e1d252f72cb7771e343dca761cf +75bf97489dfaa915e345a23cb513ac6b +f4b8ddaa1bd2babb8eed7bdf7bcc821c +d41d8cd98f00b204e9800998ecf8427e 
+f1060f076d8494d8fe2f52535ffb828b +6f7cb8276a2babf56404bf1c359702d6 +89bfd59f41413116bf39b263135726af +4b378477c8af74232ba691d3369be558 +5c713edb2e0f8d713b96c4ed9e3df317 +0d3897c2c142abd5d47ca3865b4b6caf +6c1ba6af85e1efdf32a994be8f676ee6 +1102fb85ff10fe3549cddaf39fc39cfa +2f500664cefb0f69124a738f27e971e3 +84b912ff8a5372f712d538d242e77691 +1c8cfeadf3b4a2840f2f6800f6a996b5 +6127b8cf899db470f2e17543a36aa0da +aa50447c4e7088ef36df1844019f912c +8bd5aea4bc8068a81795ef641594ba01 +abe3d5633eb5c6c881fbd7adaa8912c9 +5d46c2bdf943416af31c6650c28a6fc7 +26fd597e6bf4f1abf726700295ab56cf +ba42cd2e0e4f8fe5080b4946ede71006 +89f361fa7a234ee85df88bfa494aa875 +107e719bb11582124f0b706f88accfef +7606ef8e75cc1708c98495bbef242695 +cb33fa3ea9fdeadf7c7ef0bdbaa06567 +a2eea2d156e3ddc802fb185d4a4d7a22 +03e3954d48f152b86e269ec706a90cba +5fcf57801e2b4b256ff248240b3b9c34 +edf1c871e0ed341d24a420708a6f98cd +a25f05a633a97741ae0ccade75c4dcb1 +70383aaf5bbdb8168106755bfb5fcf5f +b77bf0e97960c5fa659b2b11d7d8d3e8 +e0c3b5e5ea1a13d06c131f1ac98ea9b3 +870a58ae9b9f6862d7a4bbb623be915d +8179d234356325be264d3bf596411573 +62583b5f61f7705f39b11d2c9d79ccb6 +8c56d650c0bc6216fb7414149e591c9b +3824bc31194e9317c08a7d733e38857a +b29c72a17ae06486ce649798c89ab266 +8d93cc15f77ad3fa0d40e61599cf881a +e98b305c9c18620ee538263538417067 +2def001d72b3d7d1711b33522d45d9eb +785874b14c12bbdb7a8637f5004ca86d +5bb14329e2cf7112d93d7f89990de8bc +d827e5d1f651c9155b89e41a51d90406 +002df920258723c82e2c6bf22854b780 +08e5ce8ed1bcbec7c4a5cd4529b6b7a1 +4239519c2757bfe036a68cc4d9670478 +4defa5b38e5375c578d7e1561d58e563 +b7bdbfd3db32134ef4f16417ace87494 +2eccb54525abb7d19302bf7b079e68f9 +f24623dcfd27016b5ed7ea2e9adeff7b +023dd2b9e718e11dc258e69de09c591f +18a120736b28d8ce7a612bacbdcc099d +c4a311fd37327c1bbc222f827f0f1c1c +4453edbb9c8cd8634aa785057f170856 +6b06911a64f95d5f9b49ceb70c0eb71d +662543f34d49c3d6d4d7ce296e668526 +9344b9ecb3ce8709361b7a0180903036 +b4f233b116572a74cb0d90ee93ebbb0f +40790b9972ecff045591a0bd869a3ed8 +84c435d023351705bf97d8d8656d6be0 +86600c80e714099fb1f1a799876c5a8e +cb28d9951810a183ebfa03357068220b +3ee47cddaff0d6e0ad43fbaaedcc15a9 +f5b294a8682676315133d5549518c515 +53c8bf11f19bd4084c25fb18fea75a59 +3648e41b5162dc5255f2a7ffd220e429 +5a3842bf510531c210c96e7f183ed15a +ba5e2b6e1e36b1c4c15ec59695fa46d4 +f0f0589c9b81db057f1da95395b6100f +1683a562fd34464932e174a70193a00a +ff7aa2e8eb5c4bf6399697aea0276ecf +4c0c23910a31793e2876d4953eb143e4 +287d6db738307e38fdf1e117ad14a474 +1d5865023e1da201bd43543a4f69d58f +b253a638f2a49106ee2c9690ae505445 +f31b884d359cc7077237b4c7cb47aba4 +443056c2b77fc7725259a7a87ffb37d2 +c4d2257083430bf299378bfe42586cc7 +c135eff5619c63e5accdf3a47035d952 +d6472d24f6726ecb5cc846a652b2a4cd +11bfb13aaf692ac023306823fc376d51 +840f1e396da7ba1cf01bad4ef7baa3a7 +e233d88455de55fa92b6cd77d410f36e +6334414d5f93ed6baf8c3cf8e4bc23c7 +e498a60b11e9c8c5465b51e437256aa2 +828978a2a425c23b4e5bd6696966a351 +ec3be355be12c5429e8e33d74191f20b +66a3682f3c90dbc136270991373e6123 +f88bc6eafd126228df852d30527fff31 +e6d10ef8df1ab4905ad046b49f131078 +32d09306c201297dd17e8b37eb32e5a9 +59dbff02992854b402ed0db653fd3277 +779575bc926cdd886a6a283419ee3c41 +5bfdb6a2a02ce14733b86bc4c1760d37 +608ca8c9b7c003ba766666c2b197eb30 +2c7a2416e613ff2f007211100e2a3095 +c8dc4bc9d788f47d977c0e5030b4943c +39b3012dca6b21ba02e9bd9b4c3e1e61 +0f336d244a0bf4cb42d7e15bbc6ad7bc +5d43a68789423bbee418831869e42edc +8a4421444caa950fc5e2ba451544fc29 +85c474435803a508609ac0a3e5a44d99 +3a4fe741e293fda320045d8e3763066e +519524a4f401a47b2a1cd92a9ea45db2 +ac15367a30f9f9f8f6495bb449b04666 
+cdcc6e5f69307a3b27662395b992d5b8 +0c026895fb04b0d74d50c2d22f7d8356 +7b99d5ba87da05f56150057a2d5b1a74 +6a02ef195316963f8717b0628122a426 +ba385fa2a374eec583bb568de8064833 +b076cae7d697b132e324f675f6c48765 +9b9889e0ebc7afe9f23a2886764cfedc +4b32da81eff0ead40ae0f986ca48361c +b58c75b5ccc54024fb6706377a871916 +e1d1ff568671eeaba9757f1b3e531397 +afa48389ff69fb1c26ca79f07eaf0084 +6fe01a29930d1ba9533ea9c43c485043 +9b57102849560a0d3a61d54eeeb0a1e6 +a57194eb4484e3278031ed072bd27e6a +77ca618a7f2e10ebc809f90715932775 +c63973bb6cff6d8a59ce4989e3b897dc +cf910dfe0be23320a87fc46cb844f8fe +4fdee135fda2126623c2ab4f19ceb05f +c4e4b22c9956532b8899b643d3ed5475 +b76428383bc99825efdc5aac936bac83 +0f2c9fad35fea3a02a0f2b2c74988842 +9ceb9641436e0faf1d0e962ba1f4d3e2 +37f0d72d5ac6745bff1ad9ad19f251f2 +3e78d04f04be5df3ee5774555c9d45f6 +6f38888d5ac513b19875dfdd045023eb +ecf473a36595a415409ba20194d8c170 +2a0ecc57b9b4674fb98d0925a0711d1f +eea9624a25557bd5e13a22bfdd7dd742 +0a58a0beaa2b49ef4ffc2a68ebaee683 +42437d312dd7a68da6d3e623ee01b2af +79a7aa468ec835523bc8d8e2da64274f +4e2c56c5e17aec6e5d4f26053143a3fd +98927ecb272004c1f45eefc1e6698264 +3a3cb9ccffdc235cac1582d6ec8088fb +cccda001e52e372cfd8357504fd231f0 +a359c6c648781bc8b669c729c0ea9532 +2b507e51c20520744412b97a5c167ce6 +3dab09ce4db82219f59322c8068597e5 +2e569ba0e7c35e23a9a2b62377b70d30 +447ed3eafa19625475120f0ceb526883 +017895177cbaeb6441205584f7018e7e +779564082718e3fd92a32069bbd12c06 +271da3bfe787590fea6ee99090d6f9a5 +0bb0fc5df07da3663d418e2a8bb13f5c +a81ebb1389c5f75f150a07e59bd2a27a +73f347b4c2b8e218504e1f37ba8451a8 +74a2de90012ea85e082b5f08955a75bd +becae1d1d9ee277ea134e3f6d98b7e5d +e3c5937281c74058ab9c7430b0c5b61e +c15cdff210d8250aacdc6747c4823dd8 +1315728a52d23d0b9b9719bae34ab8bd +8197e78288ef662089e23a3c19988a92 +9bd5642372cef5d3d915b32eccf59edd +6705fab5575bd4b8ed5fffda6304220c +54a2407df47cef13f7c6879ff7b88468 +d41d8cd98f00b204e9800998ecf8427e +757d9b432cc4b31d284f0ba9e1918c2e +6d0130ac751d8b55cf9853410d8a76d4 +4d99b5c21bf8759f2ec78cdf0742ac68 +9c2265c54e15d6f127d269d13c02c98d +68f9ca9a08b33ca85910c4d0d955aba2 +94bac64a9ded743fb00abfd9b9517db3 +cd1f4293f589aebf2cea5c39dd2a48e8 +3f1a91f5426d555bd71396f70f4be656 +833b17b1674efb04684ab17cf11ae215 +55b1e3e0284ad230cbf9eee634fb3f1d +99a8752336092bec824ee7ec2640a9c2 +158bf81b0f935984e7a05a94285c8589 +0249125fa9c9784fe591d89bf3d61908 +a529b033c7ae3eb41d42de5d53f76058 +3072524fbafe110d46e24bff3085708a +a7274b38aed0da9bc3223067132fcec2 +8340baa99f55b5525b0ee59936b2d01b +184cf578257d9aa934afa4098e7f7d7d +70353f77c95f25b7d3e66f2d97b10623 +6f18b9e3f1b95d359dff94a96fd8ad97 +c1c13c7d93ffd5d500cd7aa3dbd306b8 +b0b6c072c8f4b868238b20d167322e53 +e36131e5df6d3cacfbf60e144f00e907 +759361f3f4a18443d719cc27d593267a +e86849c168691fd865881538f6c90c04 +fc94234d28256700857388213f927c12 +8f5835ada9526445e98b4a79e027592a +14a0965e2085ca3b38bcfea33325a6dc +d0573072831b02be0b2ee32acbfa82d4 +d7e3693a24ce0c174d653100373f358e +664f42a44ba55d298f0d5b90c6aa9d75 +373666d0a4e7b040e6b1fdf9e5d77cab +45305c86c110b07b7634a89eb9884028 +ee160377752f5c42f391246a473984ca +45fa76b5dda321c5ff12d20e142b0286 +116b04e908df9c2ce65f1f9a3f68ae2b +a826d5d048ba3e312a937feee812f441 +38f975217a1a0d1a8aaaa527bd6176be +b2740f8d0195ef25dc5b10ba946bb4b1 +5f5282121bb806547994cd392d3a3d02 +da52510a6dd0437860efc9bef64c859e +279bd310c02fb195c2bddd936af3c6ba +139cda641e95056a296b9c30a90851af +8c498f88eacfe0298d7257461829d5da +d92a757856d946607bd4cdbb95183a43 +c1b49086a71fa8e4ce1b4b479af1c62c +674996585877ba3959b2927b3cc69871 +b45a4b629b1713871744d9ce10a72007 
+dfb9057664a234ef370c001e5db38593 +cc188bdb21516d6c45df666a6716595d +6f53b4f3686168377bf5e67c1a29e356 +4f44922d87763192d9d449c8f31c298f +4220ee17832e8df40538bdafbbaf7a93 +2a45a8f98c8e69ca5ace62898c785f1a +17aae32fb8ac7900b9f1e721902d2363 +cba9dd412ddee11506d5052ebdff2300 +b282411548fa7dce1319734524e32ee0 +67dbb26870b9d022cfd248ef30c5341e +753d666e38b6517e30ec3ec7b4d43269 +bdc6dcf1d96ad0f9084a00ea716473e7 +d45527a8f7a8cbf6880144255f2d7442 +c2e69cd32d94ec17c79e3a9bd30c2e72 +14c7e6ef9de7f4421da3b51a0aa4d88b +55e52c0ba6c9e96219c8ac2e58bdab56 +2dbb85d34324fc60c77b9f13356e304e +4ddb4a24981463e3daef7e5c7e0cd0e5 +d649986670d18b01ba24dfb57ca38bcb +44c93678331019b3abb5a28b563558d1 +4167cffaf2008e2b9ed3fd2789fd031b +87356505134a00da7dc1c24ef128fc42 +7922f52af57e3934e77b458b167769c8 +ee2cbd530738e9f6e7d29e9d85aca09f +a0d13e78759433fa31cb78a12031299d +629c2a3a93c965c46722fb541ba36bf4 +869b84304f84a7c681884b475ea9c2d8 +1476870d2e20d5196213aa1add995c33 +f68027bfa07e535a3248726917f10b93 +cd8509c1631d64763d6ea2c3eb208e5d +42f5deda5500d4e8ac7f0ed4f52f12be +c1003c5d63bff748f9b99e815e709071 +841346eea81e8f15af79d00e0e51e8fd +bbe21f040c35ba0ce78a11a1ba51dabc +4158b6f68874d7fdecadc6c9c6802888 +c45a57660d9718648e9c276f26e2e24b +45185089743aba8ea447e29e17134b06 +00a3feacbfad8e495de8c198ab7ee2c8 +1c9650d1c846985630a111a4acb0fe0c +58b691e142354f389414c5821e91fb03 +70d1aea9f9ee46032eabcf3c031a9850 +e974a17fe7cd23f6cfb240011733afbf +da8432bd4a41add2535cf0ca7d795649 +0e326a1d0fd11d3529250e7091916486 +75c0d1d0e521effddf700bebed125485 +4bafaf99a44787b68c515000d65cf11d +270150e870b329b7a89b5999495749ed +9a7cd3ab86fba3293c0b7faeb5ce96db +c826fae6659cdd8d92cdc914d384517c +27c6c0cea4702e8ea02891cf0c709226 +2af7ff4a61f1d14ced8061186c936835 +754f68e373086de1d735e0d6d5269f2e +b77e4120b0390c02af58c6e338022ac1 +9485ea666a090bc27c58b6d57ae6a25f +edfaeb668444a2d013848f916f915671 +b631b5761aa77860113844c1cdb339e5 +bd5bba39473edffdd5212b4ed81934cc +2f5d48238886c68fe5251d9361510013 +5365f3cd68f5f2da96e4e3bfb6b05878 +ea9f07b7c8e63ceb4ff1cb7ce3f0d50e +60be5111874ed17f0f8c851c3527413d +1bba32cc48e5b45cd7262798863b5d50 +a877a91145cb097ede194c1c1abae3ba +4ec98d42b504d8a6206cf635ae7cfd36 +3fae788a1df09523f1ac850ecd6d2f50 +af3c193b275e12e6531759384c6578d0 +27c90e412f4dcd812099bad8f8b8248f +37dcc5c8d5de8aee72d101148a1c1a6f +ae5141378821390cef63c1910a9cc2ac +4bc8a6979b90d851eb40b4519471032c +b1d5a1ca4a0e4c6a38a623fa19eda5f7 +7b89a0216bb05d54a000d90f6eaa4504 +e5bed5fb7c9c1bc9df7a471b8bedcdc6 +d393564d4827019130526a65012c66d7 +449add3972b0b1940648220fbe380e46 +2071d1143e37f66e24b286cc7ed2b3f2 +b8fc5b7bd9d66b12af265ebc4a55e18b +e36c303a9371b779b5e873c060baabb6 +929ac5fcc0e0146e866e634f77fb9bcb +ddcd0373a9fae0e7ea1cd0f2902d369c +4e2bafe5a7db3d8db69ed4814c18a7a3 +014c440a5c9cee34d8ad3cb3bc223a25 +4dbd15cc29b0a1d5a988239e1518c88f +5c717f6fd8de2d2d2771d145eb1cd738 +f4d4755bbf48e9fe75093dfbc50deae4 +769cc0958e9469173cc102d3c8408c69 +d41d8cd98f00b204e9800998ecf8427e +b335f56bc30ffb4191d48b053785d6b6 +5fd96e810fa0706f5c983734e3fb257b +b1ec03c42fc6c533b802254ab1f0625b +a1a99badb1233a404e9d5dbe923d36ba +f3e7d5a0f08e551a7ccb7ae3754655fb +db5ff96e02bf12bea93a361ff5d2dfae +389afca673b2e868d195ef3164b3c605 +1e8281f1a3a69c0691cbe85933fb213e +aa564151cc37087c11ac1d671a50f2ae +d2f3ebfb0588b7139a0316c4660ee5bd +5ca39a489397aa5024a8c9c5114cf2a7 +e5f6a9499c052a872046d84a9018e98b +8e5672281f58765335dec71130e46af0 +3ab0ddb3fcf698d6b083187478c29b7b +b49f7e088905a8c0e30f07a4f0797591 +a16d7a16e0599688a93ac68e9c53502f +844a61193e7e6fc4c142fffd2a538892 
+d73a3dce9eab627fb34f3ed7335887be +f9b0ba5e9488d39f3667ff9951ce4d76 +a525f097a9a2d1bc11a5ec0b8df77da6 +cfe6b3af97f99f7545f1973046e96397 +c1c4ec02cffb80a89355ffc08a0ef09c +5a9df61a25bc19b989bf838b4074eb4d +225a6749c8ab373f1c20674e9d4a4ff6 +fec20d268a6306dc1840321d5c9b7efe +3b6f7fd2f094bfed1584db393dd4f309 +bf81edaa45376518dd0ca33106ce3e6c +2e3e328d7587f034bf7d7fc0954a2d60 +18c27f11f5f4d87a265f3762476b729b +e9082ac74630d176c9d9f564d89b7a31 +facbe35fa48ed77fc3970725f8654c68 +ea0be514ba58e5ed69592f10fff5a588 +426bcfa064630fbf84d197448533e702 +d64c08813e3dd3206c11faaf26011c43 +dbdaf6a7b3a986655f645fe86f448305 +4613719d9ea2b9b354e127285d64e070 +d163e84e816d9c77b578e0132e3a0c55 +a8e08b260543ce1d67de1f2e6a416fa5 +7f605ed5527ec5b586753b81e0e6b416 +bc2b1400666b9bf2fdcb159c5b4e5c9b +91466aafb845ee434e9ced2663d34d7b +d57cc0f11e762cf4067b3a2a3b5c4afd +59066ee941b32e5cc4a51fb0b9933c73 +f0cd30dc49d7810523bf215a924e0d5c +f5f9bb6531852a10154439106fdf6f69 +5a4d8360b654e1893d1869b7fdef88e8 +e06e62e939ba439d5eea457ecdc12c25 +56edb6a3b332448a1f28dc65185234ec +ed53fcdfea14a7cf5f34481ddd4573e1 +113c54a50793f3cc12ca3d8d2e32b693 +84f10598f6eb522cc8e10e897fc16ba2 +9a5715cfe60d86d846b76f9032551c44 +086975ab42fda3387c6869b909108667 +a1f13858e608d303a93fb1070fc52bde +af49386da8cb09b4aec5bebfe12d4a67 +45ec4fe4ca1c75f7d35ad17913d6f504 +31ab1ff676197eaf05a9e238de02fa01 +1bde697a670e6b9dbd29f959898cd7e9 +57cfed66f26f2d84a4bf65e4b2c93d34 +adb601c67aa01f883ebf20ca5364d197 +51ab1b14a6362d832020bbec17242257 +f454ec2fbcd6076ef238be8cf091cefc +51b39066e5d23f731d143f0651981d95 +581be8a2f8a6ff926451e102a44cbe72 +fc82a9f729a195c0061a5f3686fbde73 +8e809e76fd2fb12062776d5f36a61272 +d3cc74445cc7a6d24feb3e340579e990 +edcd13ec509967bcb7461ab6b433df36 +599716575146723b05fdbbac23f2ffe5 +2529cc9e732bedeb04cb0ff1f867a7d8 +8152359826ae10f8cd00f298ded26d8e +4e77505a0b95c92846942c6166a4a1b6 +127259bdf8bf63f4109fda811938ccb7 +8e381c662d33ffedb0ccf4f44da4fab7 +56f5e0baed918e67dac35b7258cbac6b +45189951aeb904f66cb73d917a73a61f +4ca9b0ab42e86b6cf57e1c16187ed0f0 +aea5c033fb649ddc68b20c0a8e35ee91 +60e5ba738476041728440331f296423e +447ad57dcc75168644cb5bc68d69159d +594e88f699d326cd2816d7e078a92ec0 +a5aceccb24d345af283799441c990cdd +d4038f03cbf863bfca65035ae0302e9b +fe4941bbbf09c27abba70eed3aa37baf +c5d34506e05fab782e8e5b94b75727ae +c975fa56d69552fb9ce9c7e7ba53b7e2 +c0797ec6d5c0dbcd5035a175725eb601 +568a8ad9b5285a42d6a2709e01e5038f +0d754f5bf0d4208f68d878a82de109db +ab4d141e00cfbad95b917f2f56d3fe61 +02b0c9f81cf9af9a9de7ad8c9ff9e5fc +894c56c6f76f33aa6a178da5885aa6ef +b5a7131b92ca3776c897fd7e999ef6b6 +75ae00f20a8083b660e95bfd7d665ee9 +d74870512754c80dcc7fa7ec9bd68893 +8e13aba0322852c674d549df994a7667 +26c5b51f3cfc92e6603f571b2bd285a4 +494a4c85dcc017f07ee1eac991939306 +95aad84f6b9f3fa10972b3385bec5596 +78f9f6af1283a53e7655baff5f8b6b9f +a31d282fcf831498a0d600fd4fc304cf +5c74ac3e9b6dfec1ff9322ad4e18bace +55f370a931d0517f211bad390132c956 +3a6dd052d6540c03456dd8675fb7162a +ed3cde5855fd209d9a4a15c51ac09f38 +a96650e9e70f3b257e4f672d08a3a068 +dbcc2d29ae7394df6576eb5b70b795fd +9196b56bf7fa11526cae4533d65e3522 +38c86dc91b997c616511b6d5bbe7c559 +51147edb892b832921a19af1e7f8e365 +07a7ecf76fe5c74bf11fe67a19106725 +af732667924f6402efb2f8be04e9c5e5 +270497ecb27150b246ca9c82873e0135 +7c9fb666f730e052a464acf457da8b4c +0c1d10e7d5a641c01dbf0520baefc370 +56a3784f551e3b08222e085fab152585 +b7710ce344984f65663f63b5c3bcb6d9 +91b50d26d91fd11120ecca538b6a6f6c +fe69aa16d462bec1dfac7b3764340291 +8eaf994e06b75c499f2c5dd7ad1874e9 +7599ff79a6e790210763c8ec2d37a467 
+9e6b3ebf81d32b10b5c09762978bb377 +de79d1f9660083979a7745cf4eeebf20 +0797c95657c7efd825ae63e8c4ee422f +d7533d498ad3fc4fd7dd4a1252da4cf5 +aeb7b6428ad0dbf35338581c11608e24 +f7ef5574a0c8f3ba688f285b769c422e +51362d852bead3b9394e953f917aafc0 +a78fee20fcf890ca83cdeb9ea050bcd5 +bc9d82cc5f14569df75360ab4e1eaf88 +b9528db09da566de9baa619dc44660d6 +88348bab47f678f0e916e0a9f9f0f1c9 +72406314a50d8b8b9578daa4296c3e11 +878c0d208433f5e150b562c262a9d228 +592b41bbaffc18ac51b4cda700ab8b26 +d8ace0a38d42e3e7059307f7165a478a +8868988b2fdb959b4a72998af4472c52 +332897213e627995867083ea355d0f16 +7f1d3c43299b01dfb0972cc2e4e31373 +9b640ded5e5cd58c20eb2592c01b3a6f +4741a9b34957c06004ac9db5bfc45cdf +9a4fdf316498de594a5c3591a58f2a89 +7131c8b9d2b9e67f688a0819428fe386 +2071afc5867f85bcaa6a9f114ee83d45 +bd23bfec64fd9e783b08203b9d398132 +36d8c6a284995f05228b65d0559967ea +a58f141144a520c4abba54c3218cb6c6 +ed8d6a3f44401aa8e226e832a3810b08 +407b89e60a406a8bb697930a0bdf60bb +39f4c5d51af727823bcf7abdfba5ce70 +9599316412ee2f061047e72e957b83a5 +7b622d420b89868d4c802fdcc42d2b7b +533f06133c7cc0a5507af2808e9329e4 +351995cc34f2f88b2e4590bbe3676755 +0c0c23d3d53977df4c43dcb423288e14 +f7972a9536b9cc52628cdc20bdd3800c +32a774c54a28f00b69cdbea7fb997536 +97d40419f3a1989a0820792b030dde2b +f61b324dfd433323acef5569ac820355 +fbe5a5f55c155f71ed3a2e5b357c8001 +1974db51bf3b08252d7ca8a7e3068209 +b0b981e11f2e6ca2b8ca04cf3350c869 +a73edba9f8b020a90a6b50d544d51157 +4d5ffa1d84aa0ae3a09b1a421f115831 +3a6e9e0fb316b458c102ab3b051021af +e36eeccf39dd4222ba7b03a295c5ed50 +bafe3547d0a163b13792fa170737bf18 +9ec27664cb0b5934f249a1e92e9871e4 +b82086fe535c84ba1ecf7727b847eb71 +96695e3ee157b8c356b737077af7bdc0 +bc426537f022a52ebec468916ccdc1a5 +44c77fea322430b0d50833ea6aafc7a0 +cefe0dd6d6741dba166a31bea0d3da13 +f99bcfb51fd0f98c17ea806da2e31087 +cffd324c59dc28f0aa8ef50c24f16b15 +61644b0c6af730dc5e89e341dd9385f2 +2e3a2d79cf1cabc8e533f9b6c8ae8451 +0678a7bc52b1328e62edb3cb004f12c4 +5375a4f3d963581dad00e57131f11ffe +c996bba26321f7e49122d43856faf96d +018a0a45b65057624cb7117ca5dfe21e +4a230a45faa1d11236e5cd0a86a8e4ae +3ebbfcd4d774ae916df0c7740834bca8 +cc62d26a0d80f066341c32bd384a3873 +c2a36a115a47f91237ea5eec62f029ac +d0b6e28fd730ec397c2c5ac60732ae29 +b52bad90a2f49d485350b3c95d14c2e9 +10d66a28face105e28fded89b6132114 +d7645347184e74b8e0421804c1da2f77 +622811e6999827e2cf5c0faf5fc9184c +b50a7d890c2974a06603b8424cd34654 +d42a5c13fa1bf6f4e74a1b9c8d25e637 +ec3505670adf0345defe33a9f691c2d0 +bf8c72534261120f6ec44aca903577c1 +81c6ed02b0f8b4ddb0a82e9ed6447463 +e414c75bcef3a10efdf18ae011d2a856 +273386e07f994d804a86db1808d5f22f +53f158dd9f41141fd5340bcdd6137262 +0fde82acff6cce753b41d534a87ae6f0 +69a148387b91bce347cb02b2d2bbb8ee +dedb98c00fcb2c202a61d11c0cbc3f5f +3a9784a37a4bff9fe8107a486ac43f7b +55b23adfd90a859ef70ef4db50279994 +26d452490a452eac08ff3d592e1ba414 +d07ff9f2b956d86a19498e496b0e03b7 +0a8a72185c2fa7e65aeb8e0913ba782a +245a3c56f415b8e56931274e1e510723 +38dc2c68e88b1985622105e8bd809967 +13d4e439feee54f92b05fcaa22a08dff +91213fec7fbcdc88b777f4f3ecd19ac3 +d27a84ee329dc2f19882fd8bb0f7730c +693502c9100282d8d139cd0f051f236c +abecd37a48b713f2114bef382fd35c35 +570536719170ffa5943742d04900b8fb +413be8294c37d5749dde656cfb441e8b +49f1947348ff9d04bee02e03ad0bb4e4 +e696a0c56607e865841d31aa10e40148 +4772d56f03e669cd99af2b869ebb5a69 +2a81f3594cd65dd668593a7f88e1515a +a123ca250a58bd96b50f0801c675321c +13a623e57017097439d2395405bc972c +cc0f953f4371d7c84a37fa3705c9bc93 +8ade5a3cea85bf8a115b67d560a40590 +b8ea9549b0efd2081b852481b5f1f494 +38327d6c2d2c1e7698f82a3478a6ad55 
+b6dad78677bfef5d7a0fb96f617e2121 +16766e363da074961804bc61040d5bdb +44977c997acab8afb05cc5529518ac58 +63f14c08886fb6eb72b289f1998cba30 +84ffc47424026c19ba9757a134c7bfae +716f8d14261a04fd8cfa1b0e19293e47 +d75e08cad8f6424e8ceb1f6c5f05fb99 +64b7d14d3ebe3aeb1c91eeeb0e638d56 +85e23f8c30e279a13660566c5c3fc007 +8d6bf6b42a0e67eeea72fc61d2774df1 +34624ebf01c5f8835b72365d70c61efd +159ec7e07c39336e774526b81d35afe0 +e6e80f4efcbe2c0e98c6b19a743d6948 +b06550fefb265ec191ec922895208aa9 +e517f208e1a3edf1e164b69275e1bb2f +f9ccf5ba95f667e0824b90d2c82ca4a9 +7d3ec2f43852e224d702044285254c57 +1ccdef52e05d6e14574b99019d2fff47 +63f37ba11ee13238d8dce0d8349cd03e +d025158f0dc120803945fb55ea0dba97 +5154a5aca560b828f0539eb8c22bb224 +75f142bd9c88eb8ce9b7a1e8313000a0 +1e989e367e633ff9dd9b90107983a492 +d630863a22458617b7ddd7a48c831cd6 +04572bc2c2501925252e1adb8e06bfd8 +22d1084b8733762a4ded07fd60fc192c +db54f43a4f04da33e810f9659fd6f621 +4bf8d424ebe22a4d8941a6ffcdee8923 +2dd8e8c1db484b752c69b4ebbaaf565e +d9f974b8241dfc09ddfc1a73df266724 +b598747c3f660f741e65fe0a982bb9ed +a5d1dd43357fce065f8242ed168c7cf5 +4e3bffd48c06679b04a0dbd6624d1940 +dfe8cbc625a979553ef7c2ca0766de17 +c988285918a5d3c6d09cfba6a7c13c56 +af6a0586a1521c28b9e796db2cd5b1ab +cee25dd1c71f2bbe44edd24d452fb088 +46a1eab3e948461f57cc0734466f3afe +633e0ac9ba0fc86e831064d947ba6fa9 +42c3a247f7a105e96332a908fa7b41c1 +17ef97aa99653da5ff0411936666da36 +08e9ef2c89ee7628bd1daec83bddec4d +002c88ccc7253a4c4b383d6fe8e1cf01 +99e8ef845c50af0700377b34a1f21996 +1e4c02ed9422d83a5680d6d9c4929c0a +53a0db99373a1a2a0811c28f71062682 +04236ff31fe6948cac94fe31804947ca +b514157806dbd7d9b4a1f8a1cb3fdeaa +7b956a9d8c3c1d0353fc0d2243cc4064 +872f7fe258a5c859d6ed0e7c0f62f78a +b68de0f85fcc9ae71a20dca129027a7e +bd39aace9d85ad9008c53cac75538c68 +7462d883d955e79296b7f86fbfda8a6e +3c28ec7f27c24de48ea09b0ab873ee10 +e3fdb0fc988102671919105515a27860 +c63a3b06d00f7d5d5c8bada44964f3b0 +807585414db4fcbd18fed8fb1d84f4a0 +225fbb4827d328d77480770fadc5ad24 +8561b2a1f39c3b424d4bce673665827c +155303442225fb2bee9bec52e58a26ab +4d6243a34bdb09140c877fe953725f98 +1f9d975f1f182ceb56fccb16b7b20f44 +7a91ee3252a707dd2d7e6c3fd0e516b8 +572197bde43070ea4377804a0e191bc0 +3819443cea2200f6c8bf6a92d0087d5d +926d89dc06cae3bea26f2f59e8130a5f +9cc7ee15805e55657b581c54cdb0ac2a +7db8e59ab2dc45b319ea544e3fefa868 +e730d71f8cfd00892c4ea3e672110900 +6b10e0d69135c0ed13e6f1ae3d6be739 +646f082d86ebc4ae45d2d130012beb1e +49e5c570f14d763bc1ac5be171defa15 +38327ac28175bd3d591cbf4c77f18a4c +948b74574422baa09b5db8671fee2902 +1155befaa9d0b89ebd34cdb8e63cf512 +989f5daad6e7d52ac73927bb33ec98ff +9b6898b6ad020b6351350da25818caa2 +eee5dd2dee818162df88f2496263a32d +7683368c78baf2f103676660407ede5c +14e5fa91800d0ccfa277871d4d6b33d5 +2cac0b3913b6b81bc7b249bca280e06c +e2f85ccc0a9a1f60ec67f0d1ae2a71d6 +225491e0907f241b95e365710977b054 +6e754b93dc278ea9dd71c748f1f39790 +637cd10d3b72f6eab11e4ccc21572499 +aaa9cecda4008731dfe8094d4a3a1087 +b42d8bcafc4b9657025f3ee2401fe5d2 +b9240af7b25df6cde79f328d63eb9e69 +1b2bf3834f2cecc0ff162982fd4c3e1c +b0915985e5ba978cf5234c8bc27cb622 +66922a8dd4a1fe368f67ee2ceff8afe0 +df06bb8d2bd1d999047d6029e4276364 +fcc00235e0cec877a50f0dea1e02875c +847efbe8be642875ad8f6879d04a2abf +ae61b4868bdb7214c5dd5a337669ecd1 +0bb55792884e6c714fe42969ab429931 +f1812e266ba4149fae82e74c555decad +188cffb8e0b8e74fd52cb2ba1af28c7d +d4d8e42e9c4d498191be5df2e39e43d9 +0609ae38bd2faa75c25e38498f5b51b8 +f84e9e6b868b04820622e2387532cc91 +1bfbce595972ef48d001051f223757d5 +e18808813b0c3524b9544bed2a3d9840 +bddf5885939f450328062a160e5bc0a4 
+a7d114e96325028bc57817383b504f92 +fdd4946dda7468c60574d47f3af8e7e4 +00cbc21ba46b55bd0f6abefcbc359633 +f86e04ad60558d890b22edcbc3125528 +2fb18c786f0392405cd14b45507468c8 +5923bf8dd8b36038a0b860e480d5e7d9 +d4e881ad33ff6e4faab5c45193045ec4 +09d63f950f0a55b53fbe4fb0f2dc0709 +56756afbec2bc12c9bfe04152f1c1f8d +8ee42eb96ab047566c3a4eb760c2079e +2c36c9c58c93b78197a98386b701c908 +0bc242d2fb3f5a3c394f126c8f23f707 +76ca698b28bcdcfb94482204e5858d4c +4b775b0b003d538819f924621cc10693 +ef18a102d4f21fe78d4f2dd6012e114b +ea885235c0859d0608322d492bcc08ec +dca8c428c4b6762bb9a302137d89472b +095e19b11e42f5d489e567b86cafefc5 +3ffa773f3e5bd429f24b9dff36f75a48 +ffa2d7456bb6f8efea9c9ea69aec49c7 +b58606f6eef8c4bc1170ecf25401ef7e +15fe53694b016bcde58f161d4137220e +a9bfb0a41d59a011a60fc0d9cab5b21b +c207ec884a1186d32f3254d6503a93eb +faf2472d81ea9d1fa8023164c8a929e9 +9b4500394fa5ed53dc26c53577387c85 +d2e6391304d0eaa0fb3ebb00f2a766dc +39dd1fc8c4557f38071387615aa111c2 +0a6773a25d251f82400374bd31fd8333 +e9c7aac36da0316857c963d85b7e8269 +245f2dc4a6147703e4a19c6abfbb1cb3 +0e2bd39ee515a4b7d7b67f5afcf5b520 +a5eecc99f540be7d922dd9348014a905 +a0c13d65e76dacfb141e08377d3cc934 +493f1900528f60af6b44322258cff20d +d9e3aa368bd3de7ddda54fe08a55a3b3 +4c66114295edaa054ebc87f67281babf +8ef587a4dfd6670e96c0b27a609dab23 +680b42364b269c30cebd938796424ab1 +f975059c748229ce353593cff971dee6 +8e13d6c140759ddab9516e79fa8336cc +d2e92cae36bc06450f9909414087df9a +798d80dadb507f85808ba020c0725db7 +e5400b7b51e4229041b2a2a19a1b2025 +1c41fad554dd3dced58eade3debf3f0a +a77d7b811081cc2b43d1f51165cd4298 +d382c37bf3b7c049d0dadafdf0ac4146 +0594ba4ecc0af31493692f9577a51587 +775690a1143537a1289449648ee0ac00 +17479077cc68245e52e9c6d426a935dc +bc52c00b91e096b98e4cb603dcd3331a +e40151232c8183c6785d58ee6073a1a0 +25e02f42c3e64752b6de1e38e85757b2 +80de4b02674e56bda177afb0b1c75a12 +a07e6807496b868b8bdf4adb55a8b75f +50ddb871a181ec5175662ae8a430c943 +8ab05486d5e6fcb96b45aa7f6eb964ef +0bb3b6770bc46c414efca0cb9da113b9 +3fdd637b66e81752094a16c93c36b720 +7fa92c5c4f8a6607781f04e14a807538 +f4eac2e9d3249aad3b6797c884f8745a +62b431be2efe6b96a78218d6e234f47a +a01de84bf52923ea19aa6fa48a658a60 +3488856ed4c0764bea392a7d898b8206 +a3dfcaa72c0e7b0e73baff03a298cfad +dcf3707eef78124ad06377d5746db440 +cf4e695892788b4945871ae2c732cfc0 +048c226e663dee5ad0ebbb41030ddbd6 +8d376de4f7208b474e0befe3477ddfaf +0f6b6e18a34bc70d69ad1c27a5e8a6b5 +8f234dab0bc9777fa7c0c0bd0d843a5f +071008efc472d51e02b752aab9362fea +ecc1ad14c873ad669c6fb87d0c65fb0c +0fe58bb2e2ea15eab34b630f2da8057b +e8dd747f51fdcb3042fcf6c9f1ed6dc7 +a5736dd2d01a1bf1a867274ce8082e1a +3caa28d9754f25b9460f63373e494f17 +8649a0a08ca424328c6d63c6f6178665 +fb1d693199e6a24ccd505f8c170f7778 +6d2341a25bab0b042158ccaa53c04709 +5529873e890ff9cc9004dbde0a40f5e2 +ca5ca912c1102999ca76cd3ede288ff4 +49da77039aa784a203d25690e14668d3 +fa0713647cb3863900b14f5907c5804c +72a9a1d28cc96d350421b6cb24d7e3d1 +f5eccb1aea4dc63069512e7ca03bb9a7 +d574bd865c550497468e0efedf9be1c7 +3a11cf043ddd78c9b5d9dd2a313897d5 +c1f70f9599b761e475d3dbc579472ca5 +1a35ace806637abced3942daf89095ae +d56c7ebbb7a27b1ad7dc0208fab56d98 +d4a0690062f18c43e784ef40dfac151d +0a9713a8e5ff18fe495ea70ea6a7e856 +fd07e0a6cc8864b2245b73b835fc7c0c +4dbce60c290e663892b60af6ef4ed299 +8299c63e57c4b8ff4b6f009529dea7b0 +413ffca38b92f29911ee184f0ad700a3 +e4ddba8c15cba13763e5850a46914af0 +5eec33afa573f097c55efd032109a565 +3387f8aca7f11c07d4f693ae3e8b0657 +75ac34b67b569eee61f7a178b12c1ec4 +e68dc2b511c5757c793f3e9a6d5be6ac +4dd509af7a4be810c02b5a3c9b9fd306 +cbe3d7787ce0aca901895daa05f2962e 
+09a253346247c071785dae78769e21ff +b8463f643866045d3b2a9201ad614199 +bbe6a36cd1647e80108091235c7a48e1 +5641e24ebbd1227a79ec3937efa11947 +65db6e811d5487742825d21d6beebc5c +75f9c3c32696e03a3781c2e8bddcabc1 +82d12a23cbd777a501dc536f2226beb6 +0088209b0cbe94e88f2e9006fb490c2f +d025fabdffcbd16e5ae203338954ebd9 +63cc3e311b2b96a3f8414efb041f16d0 +1ae167e1a553cf01a6e0f014b8ab164a +995d80ef72a4b133bdc230202beff8d0 +7f846a349e0e91ff10708f7c04c1c288 +4c0eecaa60703ae8ab20b87c65517479 +2c2a649640e7739b18eac842efec68e4 +1854c6946c77d7569429f9f46c8357d3 +bca2b09c8e5d6669ca7b2d740d005e90 +dcea279098673103bc1750ff27959320 +83505785ddf4ff7c03e937b03d738390 +7a26e6677f66e108afe7cdbe888f52f8 +798aac52bee0cb14d2061b9c127bcd49 +560861679979b653c6a1c16e6b2efe9a +14f718df499386f77118f3bc63dbe41a +08d6f6801aaf02b833c1d353f32f2e58 +15e69fc468e7cddab9888ad74cc176b6 +e38a2383fe1046fdf38a143cd2c6c7cf +decd11b1196bea417bfd22c605e00967 +3dc682eaa9fb114db6f0cd71419dbb62 +bc2795711bde8873dde44475f36d14c2 +0d088ee43b6ba84b3dd7d092a9733126 +da25a712889f491cddd777a2f3660cdb +fad3d91be3a81eaa01c6f28aff3a5685 +58bda025895f515de24196318fa516af +a9f1c1d087c87c08244066684fd9d113 +ee50bb13ef28e9ea13ba85c39da7f619 +c6490bb66afbf19c54a99b24c1c38b3b +8cc6eefa0b0abbb89fe299552615159d +d85354073ddaca033bad164a6eeda940 +e7bf57a1c28adcf244b89ec8fccfcb78 +a235cf7106eebfceeb033d9f367bbb7c +22a8966bc1a9e9199d869140548d8b60 +3f6b5f6bd0bf1a835d8c0d86a98cd58f +d484b710c3b296bbb0238b32d5054ba4 +eda5467299d95f6f8c929f7c6ce5d10d +a62d1b6afa53431fead60eb8b294b42b +929dcadab2122ebc4c6e9ab5eaf0e5c8 +49b753ff486d4418a510a0bb70f98d25 +0619a3a15312549cb86c4b129447bb5f +1775b79bcfe045e724ff72a1ee3338a1 +bc41037f97c04513ac93f6ef65a5e152 +24e0e5ffcf59dba566118c00dc4fa2e8 +89b124d5a52acb71a18af70fa29a4f76 +add2264a666294c152ed61abc385e3de +c1ed04cc75f7ab0bb766e01479cbcbf3 +43f78d3089f6ec656820c526e7647e1e +ba645ae73fb4af3ecabc09a5e4db9ce8 +2517365b86b3d31879f2e1b62315ee0f +08a1656d11025404110a75406a606163 +e5cbb28e6124acb13623ac6684f8d6f4 +d913d4b8ea8d2cc7a3f599c17657fba1 +c1db4b416ff07cdb0cd62e1798b1d71a +5020a01e4de7a7a4195305704866b3dc +f33902ef8265d7484a4f2492e532719f +e9745d57357642699ce410da4a049b31 +deb305a2070726651268adafab82cc4a +15b9fd18c1d0e563020eea62b84cc204 +e4e0bd877f908c0459b318af4fd89659 +e3f6756cc33909796b6329bad67c511f +0c22b9c31d3534563778574f8c94ed95 +98b9140afa960da25e04370ade25aa9b +b0369f68aa6d157cff20c99b646e7d1b +dc61152d6ffc75fc9dcec83af0eb29b2 +240b00e617ee4620849794dae965e393 +3732dcff41cbdf35c84f25eba37d3ef2 +10e069d7197e73828739f93d7a8064b3 +397de72212172e2b7f77261e5635276d +ab941f4c954577c6abcc393d72d20ff5 +9fb50e6d1090fdc6e6053b874fcd39b7 +7e82a63ce77c5e9dfb863531ce298dbc +103431b17f728ea3561ab3f504cc37df +3e16bd064f042e2f1aa3ac59382cdb6b +8c1b469e4aa0a202fd02d97afb0fa424 +87a595da8fbbc0b36323a3e316256dce +6bdac42396088aa88e84466e366a362e +0b8b04c36c6c4250ce2a0cae1e3a6d55 +8738d9c98665ce38c7ea68df44c82fd1 +fbb6301a16d20d14d9a7a3dfc229e0ea +a3d6e7efe691d861f581f191945c35b4 +89bbc67fbae78ef0a5719441d2101446 +29c0bc6a5ec2a30b7d136efea515a877 +1ecdd305b2700a9d367e2af4e8fa78c9 +264526789907322145a2cd1f7b7810ec +d689c479749be6faa359ce29b2351428 +e0e0338f84575002dd4f37cf7f87dda1 +e61f2b87ae41de763e39a2ce2c8b6bfd +66c0c6841b188d197a98a4223af1e5da +6d0af215f6cea1f525190f802a4ced2c +985d0408b069e4f86c292e44c927222d +9ba7d1a9b92773b196cd66d4d978f75a +142aab52cbf86009b18ae61455e871d9 +a0c383562c780a8f349b7ad5bc33742a +24e8ed74d675ab52482f713d93c3b878 +d57e52de710b2775530a0ed85c2671b0 +9070e0d4e42f281471876447135287eb 
+fb79d572d7c83ec324b18805b79de0b8 +5f98238991c3e4212343ce0bace8ff6d +0204f049088dea8fe440695bcbede8d5 +cd7a1a5ed6db94e37cbcdc33f8831989 +5b858c92fdedc4b730c031db2886c94c +f1de10984dee49ba56bd4e7af509f322 +7925ec4c15595de02e1539f91484a6eb +356a4ec327a1ca30af130a69baa27e62 +732a97a31a17b62730dedd117d415d09 +15886066de9438681c2f57c38d211339 +092452313c8bbd4ead01e2b56a7667a9 +1ecb2270eaf296c83be88330bce0369c +be06bf58fb168bbd91a82e8d7eb0bfbd +a432aa58c8dd3719ed494aba5210ce6a +34822c9565da5e3e3ddd670e994f8bf6 +74d69ca984292f045889d38d83ab0b7b +6184cb3a7c5f7c385661fce687281e5c +6ca4f1c79dd431a3c8c7c9c072242762 +09a0d7e5e34f43eb26c2521ddfcdee99 +5f9a7eb614084d1d71e243ad31c1a059 +74f42b91931e665776349c9061d2d616 +8f29f6b2c83921472c9030834491c4ef +2c17b6bfac2009419614c133194a42c1 +c25d2643c1b383a2643d14e79e11fbdb +130f9b099358e741336795e3e481e3a4 +cc1632a95c961587383e73928211d492 +e65ea3bf612badcf4cf555a136b0aa67 +ba84cbdd39b8e477b54b93c9928a25be +1d8ee9a7c604e794c9b759ee72ba513a +d4341ecf54c520088efad6ba6138ed8a +b23a1312363b69a8c36d1e7eb37b3802 +535fd2fb75507e66b7adf8a4dafc04e9 +2eadedce639e4e8667e5e3b0df79b314 +056a220da86bdf5733d9481837a9b220 +6cea5abe8394c11de02d6fb9eed77404 +f32e8e1a054eab6d65634ca8b0221d34 +c0e6c0adceb6878f4ad266785b3ba433 +0e27e936f62de320f00ece2e75a81565 +931e4be679125ba677d8aef7f08b10d2 +c7d93629d29c1d7dd2ad64f3ea59cf84 +fab48723820d852f78f471016b24646f +c47bd48bce5cb645e7125d99302c9830 +1bdd97644c02f4ba1480bba6cf1e7586 +9a621ea159955cf9ec055d117797cb28 +02bec5d77668e9d7341b9c47a1ee2c1c +95f279c0bb7358b3cff6cae33689a498 +706d757335e9a25dd49bd228ae1d0571 +6dd925ef73218dc9d1af69c849737906 +c12f023b761733eb67d6c057a2959ddb +3fd594c37e1bf5578acf80aade552375 +983403394628b2f6a6d42c40fa923c92 +2fdcf016c26ade8e9dc0504c1c9a1fd6 +d036d89812fc77d870515b86296cb233 +e70a288e291d690a7a5f989be391c5e0 +1137d3d9fcc09b5388ca8eb6119ef6e0 +cc441b41ad87619da59ce6b376288659 +6d76f46191b6633500c653b25898d820 +1521d3cb42bc0bda47d48f2279e2dc71 +6e4c5a77131904d8ace5dff265daed83 +91cc141263465ec2618090440d98dc59 +a70340353777f7f980d907649ee1f2fc +c1f82d923fa48d0c4ee878e621d5dec6 +b31851eec43b505d586173b2dca876a6 +81a58f8ba094be56425eb10a7ede47b3 +f2b923f1fd10f8803138cf3dc266808b +0a9e533a10396fe661af2034f0a1f275 +2beb491d07043d475a7b4547d8afa7fb +b4d8a6d93ebb1449af58b13a814d674e +5e1595d0cb536052e2cfd202526bf783 +a78a3d90fcf6570c52b676e191a36a9d +977cd1ca31da70fa79de4a60977163e1 +3d4c87e84e74532424b936e78e12846b +e7baff598af12a4ceb9cbb7172719d6d +af017c65cbf042854982aa0f20a9b84e +41ba287a0f83f351f86629643d4a473f +72c05e8fb8b613b260424752999a11e3 +282cae562f35a6a765ba4705d5048700 +22eededa76b160778f637631e363432a +1b380f05977758c5b1c78d2201858301 +689043490b4f2d8d5979741c107ec9d2 +8efb58697498989f6167ae8a665588f8 +22442ad12588d92fe26a11225245d6d1 +ae08acfd3be3e401fcadb3c2f7913ad4 +b74f756f48666d8f4c83eee9cabc14ad +0296db3c6c8b0cd4f6bc8127e6488c44 +17c48ea4ef80dc3fd9e684307cdc5247 +942f84eb8e94f866b160ffae797dd9b1 +8160941b6a7496585f416fd645bb4c62 +51f6ded60111f3d47adb3fc8c6fc544e +946acce8f68fe6fa0c9ff80f8d35e737 +8c6f332c0563ec4015ac6701d05654d4 +7a9234eae3306669ed3262090d24453d +64f60fcf98d5066b16da018b7bb3415a +4efb4c4641287cf482c294bcccf13b69 +a64e97679ee13cd98f0c118f2b459877 +a43ec9b0150551bd428f4784eb5a2b4a +c37a278704435ba43e67e6e5741fdaeb +df3aca87d4cb0a58b6f9a542b02e4577 +c8f7f313cca2ea5cd7de21199c288c98 +d2648cdfd749c82c3df33fd6781ab494 +3f28fe9acacfa1a96b749c309893cc33 +80ea6e5c98fd8119b2443db4121863e2 +2db734fbc142ab350a373ac7d6e08a8b +9441762b8bfb44bcdf7384ff33e8153d 
+0658e43105c1cca6d246ffb5db771cb6 +159861295ea9c1f4a459942d1945f3af +8767f7495c4fba2305c72f1af15777ff +19ea3ebb15790bbd165f27bdffac10e4 +0fca6551538430aff17c6f5086d2f36b +20113a67b689155299326bb8cafd0086 +306486ec4c676ef2d3c41992867ff0b2 +a323f39318f9d45f346195ff8ab15ab9 +9f8ca244f98683b14ab2d3f3ad0554c3 +b316210352cff0d602e248fa6c9b2933 +e6d3a45e46ba43cd6d32544074eef5fe +caf628a3244939c4531ba820ef4f58c3 +65a19536947191ad41599ed0ab468146 +fc6d541769bbf7ae62ad91ffa9cfe804 +b7f2500b2800e9735e2348e0ef1c1be9 +c093cc72f3edc837884e6dd0d3b8d9dd +e9e871be8b79c7f8a175fe39a615cfa0 +5b214e83c153d3c3251692e407d86987 +f6f402e3ca9bbd44dde8ed1c9eb244f5 +b6fd5dcdf5011fdfccda7391451c0905 +ef688f871ebb5ab790c4ecb7125ef1b7 +9f99f623999a4bf1e1c1c141119b8021 +1ec45a5dd7d00323a4b43e97464aa82f +6f6cb0ff69532cfa0bca7baf0fbc3e84 +77384709079d00ea0fcf93b81e52d9a6 +5a3b441618ac691337ceab6780e477f8 +86e7919549aecc0d5c974c6c4f5b5978 +82ea04c754068fb8e7183b016afd6044 +4d75948d20c01c3060382b0287fba9ad +6b79fef3aa95b44da09ba85ec5ace705 +5e8807d41a660be63c95838e34464c04 +a5563accb5b9f51299cf917fd0072140 +3932c236f0c9639321fb271c6a86f2eb +40bbcb4d1bc3d69c8e797973209b18c8 +99bade726e47ba89e871218dc13027ce +0a1d70c829d4c608c16854b0aa93dc01 +04a377cc374945e453935934c2aa410e +91f4abc34706cc22e489862523fdee26 +209e432f9fd356a65b4cd4e81e951592 +2e616a23d119e211873de94704ed9539 +b34d868df20363b201c05e9b1ec53d2f +fc4cfe03e953087ed29e687506c88415 +9b2acef544f3cd0e506207a716f6e066 +f689c567d99ba7ba6f159c2db3e12507 +ec0ffbbd2167cc7439b9267752b13f39 +f691177d15c725a51d348b97de740356 +d4b321b738f7bc172294873b5af7d7dd +9af32f010e5f01ab43e4b742234c88b6 +9dd48e794a0c8000c5ff08c4f2f4e27d +33a8b0680cddbac84edabe45414e0bfa +bc52515b635a19b8f02378a916d7c778 +a4f317cde779c8aa100bb2f1814b93ee +fa7b6205cca1442dc21995e47e1f0294 +3fe458d2b238f55ca27bd2ebaf5b6933 +c821596720cd8145872857b2c761984d +aa495ae25ecf4f71bdfc1ec45423e539 +957a7879c4a12a9728cc20399f50da94 +183f556584d60deec0995d63f3a28ecd +2191018ff246a615635926f12ed26f3a +4c57b53865ef1b65ffa58de5bbbda772 +2b269a26367cffaa3e3282614e42f08b +6f09ba238b602e17834593c427e0c853 +70c508bdb1fe6dfc281d79b5697540e8 +e065e29927f7e10e2a9891770c0aaa9e +c19323a86466e8848df7b3438b4d01c9 +80b30c75c08dfdb4489dd8aeef361d9e +f37411b19279b4a3557ca15cb37d5e60 +4e3bea232546550b9c59dc228680dcc8 +27780c369bbf4118311360f8a972bf24 +63f33e53a8e941db9b7bf0cb8f5c3a22 +04a8cfb383247f8ab5ad0e8dfd0a55cf +ddf680cbd847af2e5e996e7969e11031 +15d7d1505e031f8204eb34fac715529c +a6fc0e1bca10cbf861565efa0702f769 +ad0a835c540ed73acf0b4091dac2684f +c16b58d33c56d6781c6abbc811ea8296 +e6d3c1353b3b8b573f34625d64b09e82 +90184cf95219286674c048d2c8b51508 +c5e69b4cafdd7de742545b02fa84618f +831fd3dfaa041d4af3867b9465d4b4b5 +87c4a8f0300c79efe8ad0903f55477eb +055336da96fb3217bc669b5091109af6 +d07fe87b0811cb068f2fb2bc349c242b +c713085686d552e977367f221cf98724 +f326549ed01ab2d3ef0ade37b9b74b75 +eb490ab4d5136f9650660a16d3ecc9be +860f4cc339a2bd492b18f5c8e16cde6c +eaf7ea76c4c6916da5e60e1f876f7248 +b378828959d90cf29424e4274b165d61 +a6c381042bc945f7328374fae29a7162 +ff40446d62131b20a5804da5a9ce3489 +27cbbe74ae4b66ba937808a6819fdbc5 +ef3d8f6a160819492522de1a4a259a32 +023bf0a28fe2c289a245bb1cbc1742ad +462f9dbd99f127d04945153a455524b6 +319b8ee01412fa581c77fc773f4e18cf +b53470e373c62628dde2b300e8a042b0 +409fa4a185c44607a5ac065612ffcea8 +6c352105e5b68cf96b62e9315c1cb3aa +c184fc9cca14e91daf0b8e9ed05a9851 +32c237cb3c18129a5d2792ed61517a8a +ceca8a0904ce4a9cead564f15a6bc4ab +4bc51faba05c392bb6299e7bf9cafef0 +60c0c897b7af3882027da1de0b3195db 
+1ab1967759ddda124a7472228e9172b1 +32972a92b6761cf94647632ea437732c +a444115e9d300ef07a6a5d7c97078909 +d4c0a612142b4de5f727bcb91e342bb1 +cced9ec8e081b2acf82ac079d8903a82 +3cd7a7c0b63db02bb1ee9e265a978a59 +884f678af97bd6de7a8fc31d24ebdc64 +d132f17f4a861321ffd9cd9dcfe7c44b +4f73be1240b6625515f244e7db87132c +45820b4cc8998087ee17c437649ca36b +1b9eb67fe49e6e1d392d80f483cb3e31 +8dba311c55d73468dd17a38f959f4001 +a6c8b155fc567918967570537125ea07 +53f877589f8f9fcb911a91ccbcd75918 +60cac85e9fbbf37a8c1bbbeb91ca1ad5 +8731d18a12fe528cf08d0e878ccebb3c +10a8afe327de4e2b3e808d71601f181d +0094b030c6e695f048d10daa6c273854 +9d77eef0f5a896eb7ddcf54a27ab6e98 +710abb73d36f3df0d5be1e171bf8fb3b +feb077c26d8806abba48c1e05109d26d +6f249dd8dfe6cf31b00200f0431bb66a +65625a287816418307bb871e5f79d631 +8a5b66e27bd73e2f6a3ad55a7661f07a +88b00f54cb1b841146a1675e7226812f +307136c3c6ed2aee0ed444f8c8bb18c6 +308acdfa8554a5e2f7282bbd255dd01a +4ad7a8a0cf06630c728126074dbccad1 +82b1c7c97110565533597aed7970544e +703545cf5e366f03a7c5656779f90cd2 +1182ff6f9bdfbb459baf7abe2ed4942c +414eff12892a60353de3c36d6e01e01c +5422031b2863d1a8ba605ba1e76bef12 +5ccd9c7f36eee2bc941e25fdf59e4034 +2c076e87be048219eabc1a2110651d47 +65ab1ffbe4250c25a45513ddec906a1b +db6b87d596b2bddbb0fdfa8733b347bd +5fa80c5107f7416ebb2b21242db27679 +b442d6d18fb7a5d193f05b4cf85e5f19 +7da25c1ad449443d55c41ba2fe1411de +8b205af1aa42374e4783eb7887eae71b +7da59b9eddbcdfbf87f27f5b197d8268 +044b9f39b2ca14762c657242c70a52c8 +cb54e6e458d11d75dca7d6fbf9e19ddd +07ca12a98a8adb4e52729921141d7572 +44b74dd6fbda1d7bb4efb9edaa18697b +d15fb3349673b6b19ec0b6ece655984e +a36b6717fd18e1146e71c6756d48ff41 +fb8d92dac67b5e497e39ad416790cdcd +5147da25c17bd96f505c60d4263e17fa +499413cec8d393f95f755621eedb7baa +010fe3e325cd6fa19faebb3dff0269fe +a93bee3188d3766165dd615cd19f9156 +98e6a3d19f182dfb54551839d307e456 +de4dc904daf4b836583a6c00a0671bda +0bce8d0452ac234342d4048edfe9a9b0 +995f4aacd4e3d52e9be498eadde17a2c +e677f52c235f31a2b47e35714ae4690e +b75964758e255b905c5dd1828fae9738 +6e349ed25c0e40d63536f139b3fd10b4 +5c35e0b9453a53de35b3424771c424ce +161c8881635ca0b30157730a0a865583 +5aadb4c79c9e13e5eae794b9086581f8 +27983188fe5b8fdab994225dcead1392 +6f25eeb867ab5e10ec3bfcc44c768cd6 +b25be6bc6fb0e483471948056f84c591 +d1b62bd4296631371e3bbfe7116e19f4 +42d179b751d2f71262c136dce18863ef +9dd3bd01d767944361a885dc9fa157f4 +44308b646da7c1aa687856d5c9fc3dd1 +a953f9c371a3c89a71586631bf1104a2 +318367f0762ce3d3f0e4c684751b337e +ae03b4ca45edc2b741e3b9965ff3505f +e687b867d6d226415aaf2be7367f192e +e3f849fc8695a2f0545c530d0957f3b2 +8db167c2aad932fbe73f465ea181d913 +3ea8f41dcfccf3139c793bf6d458480d +da4e975913a951a72905e37f382b6251 +17d9e98e0e01581602b2b4a5c236ed15 +9fca849f8dc54158f489aacb5e944e84 +4804afcbb3d81ed268878a3a1aa30307 +fdf31278aca0a328053a6e0d49c0cf23 +c509fccd20c6981e713bc8b4f814e381 +37fd755568ffc52991610cd0c0899721 +60833448bdc7a2ee9147bce40bdd69c6 +6fba9c49df68775f4021904093d9e72d +1d363a7d2812d1c040a8a4322c2bb503 +9cfdb753eb6b5ed81d77c3ecf46fee89 +f79866f1a6afa9638ebcfcfe79628a3b +a24bf48716499c4fdb732c6c2bd84f5e +a696e87a60767184c5e47235ef7ba11f +a780e3d05b51133c3aa2b2202a18f7ec +b6763177c6ed57fa0cea9355a195bc20 +bcfcd9fb03962ce355966eb3e2bbe46e +0b740c86966e4c59ce2f4bcfa4dde40d +48d8c34258495e7050a899b14ebf1ef4 +dc6c475af1b299cec896704a508f1fc0 +5b917492741f0420f7ff84218d25621b +65a022b5f61ebffac1ccefa0670e0839 +693463fe8537c75e08c0d8a9723e3507 +970d12e2ff2ed977a2c09f6cd63a9d76 +4d4538546bd366446a4e79bfb9630f6c +fb3a793e3f9d1f18c4cd3f7aecb16ae8 +381a67e7e0dbfd1dcc5cd46d3bad07f5 
+43cbdd9e088d1b9a2ccae8c9eea4eabb +b53cc79d6863333474bb3bf76f89e114 +39f5c0c2394b3c3076f79beaca068a56 +1150d19d7aad7fea5673d5e5c2daab17 +c3f8aab2759e51de8e8482eb4e825a63 +294663ccc03765ca9dfb24bc5e8e2b3d +48b77ada0c596e5f335f059e19bf4f6d +a0df473d11367d5e1fd4b4963e604469 +59d5c20c7e3cf4dc1cd00459f4fb2f5e +1fad86b390d161b64aa41863aa7f63b4 +e8de0c6af906cbd831528cd4cac55f3d +8f69f16ed7d4a56d7fe6f68300783cdd +fdb53f5f9d73183bf77edf92eaa88198 +683adc9797a10a960f9a50e5b9d6d423 +21ce467b09e79d919c009e5e3e0cb050 +703cc23dab37eb9b9337b153ac1ede7c +265a88417dcf633707a877a5d34f4d59 +695286c9a3f46c2523022b5404f0eaae +133a0fb1ed349ef7aecd8bca3adad434 +5b444c387039b6b7e0935baeefcc0522 +bda6508553137ebc4059704a481623a6 +b1ea5ea507796ffcd4df76670bef3e70 +7703caf7bc84ab18f3b6c64aa85beaf4 +40f39c916054ece72f71d760b084c55a +f41ce2571850ad07ad6bd5d6663e0298 +afdfd7d36593ca72ea81c09be25bd626 +1f98b727a798af7ea429fc4109fe052c +acee84d91317e4650fbdfb9300934a7c +e9e8de368cb4e6d2f316ec7ae62d38d9 +54c3be224b2088d61c8cd1c61603e7f8 +3457ad8cf8f02bd8ee7a68a6b3332aaa +a263259500d8582c979828f0e6651c4d +24cd05dd7f89e15a344177cda29cb3ff +0e84b4e1f64966ad2901f3ef92047275 +01a3750ff38800edeb12c4bad2cdead6 +c2a234e91361e93d4a4d23d574152b3d +048e0f33b854b2a3c00c3bdced0c36ad +f60e77617188af6b7dfd6d291da2af35 +99b2c5c15251cb7b5fb00ed14e03a3a5 +23c0ea1c49666513593d4e3019007ecb +cf5b065eca0d29a18bb4147965af335a +68cdd271a2fed4dde6fe440501284b85 +343d4312c806a2db1c44387578ef7227 +a8412d028dff011f04e583d72b086568 +d41d8cd98f00b204e9800998ecf8427e +a8a4918a950159dc7c6130a0e835ab39 +2042c50a049d5d4dc0bf4d851c6da63a +0b087c626a7bee5c285815e3e0253343 +ed0719f8f9c8e4edd85ae6f2a5eb0918 +06938d12955a3b1afbfd98cd92f2800f +7158e368711745a1b17611dac6ac8b80 +aa0827fdbe214b09a44546ae5ef9ef28 +cebe5fbc96fcdd06ecf261f566cfae4a +57915d20ce6f5ba84cbcb30262ed92c4 +7bf7aa3e9f16153b52f3d1a21801942b +28603d55a61fa386af302294cfa8e020 +ba6194c70a22b918953af8c7e2f3b559 +53fad99ae21425dcdf7cd2b6e5ae0f67 +bda0572959ea4c35b5b31c18c1e544fe +a041b23c4da80d40a25527ccaa9bc78f +3894d05fa9b56fb159fe579e35fb19fd +f2ac8d6f7969085aa22150b3cb19a9db +4e27eb71e4487a0dcae936514095d233 +5753cfd4fe6f8fb3a65bb57d087ad519 +583cd9d994db6f251f742675559ea151 +8a2ed2b6617b6623311258059f1bdeb7 +17be8954e025a02e3660c82e59bc77d2 +9b1f88e6c0b2d7f3ae92fcd7607529da +3a3d7eacb9dcedb43c400c0414bd45d3 +a32317bd9c77bf379d923cd4510631d7 +c33cd6ada24d2ba1c74a348b876e4de4 +f8f6ec59493c6e309d5b6cb44c7104ab +b0ff2a333c7b194f04551c5b95be0ea5 +e64261ca6eed48520ae7c7e23dee7551 +aa3d22b6544ff1825763be46fa74087e +67a213fee5b9a6ba8a0e4f0bd1e49bb6 +644363a0f40c2e08dc7d742fc2a3634d +e64051a74d443b7f9f3cc38b49929275 +161d6d7d2a3dc54682a9f26a3f236c27 +41da0f881edb4677cf310947cd304e34 +d938faa2215e5981e2a810dc01100bb3 +ea13fd4df7d00e08ac7e03bcc4ed1c58 +c7aa6c6628b3ec4d0bee969ee14a5be7 +1864473e9efeb68ab2c5fdebaa5ee28e +aaff6f0ea1fed7760f9bd42ea6044a53 +5b49a95eac215342bb350fa90eb20a7f +3cae083fc130599acdaef4409fb45c99 +cf569fbef3ce4535b080d6067933691e +eba00f93285eb5ecc089567c0a27fbf0 +ca088215119f03154aa4b3484b8024ae +629ff976967712e0456ef7c2f25b4302 +033e92c406a9664d11c45d6ef3184d59 +72b258950eff5fc71466beb4fb78c105 +0d8009bc3ec56c347875b488ac0df482 +2945a8d99eb9cf8b5b73ded92c49a2f0 +1701b0553d884a6b7dd334e23e8ee57a +aab64fb40985b64bbbbcb88383b1c420 +427c5cbbffb18d06fb09af91d1a84cfe +23ab0929e0b690f8252fbd03550e4658 +5c02891d54651b573730f86f5a8f13d7 +c2c9d6a9dbbb3f370163ac1db36e1d4d +0e5399e6d7710f672d6fa1df9011c61c +050192c1c6acd415cb65f3b2e2699e5a +1eb1d53d350840480c6b4d2b6cba61b2 
+f1fff97e5cf9c75d1df3b7cb7f9c4668 +5068c713972d501b10f4ecd037029ef8 +f611334b3d07d470c40edfb75c3e34fd +f6fe9dd9b81d6441cbb7a621fb80fd85 +2cb13e3b0b23bf9ce2f8ee654a7f423e +25c96636fef4920874290983da71ed27 +e10c347cdfbe043098c654dd536f2924 +268b122e49ee0c0bb2b599ff16893ece +b7f46c0548af84ce8bbe0ee48cf7f7fb +fc22d2bbd3e074592e97e5f6cd39f0c5 +9b905fa23df8cbd7ffb54f5686010663 +6e7d6adf07b453f64574ce5a9c9dc306 +73a954868e242429122ce512d8158ba4 +6b2e8ac4454932ef252d8bb171134138 +00774f5cc5941b4131142a1d5b6435c9 +4ddb271357efffae7a1a7d2fba761c5e +115a0b969d9b8e8a40405051575e65cd +5b68d3963f74768c49b5c88adff79625 +c49c6b9e75b0d941ec3be6ec004edef1 +d1da85c82f32b23c56f2aa93b60750d5 +8719f82749fba5aaad3e0856779a4100 +731f4c14de2c3ae835a2dcb4c496044e +b9b9266ef4900324c7beee1320a160f3 +3a804034d95c426480b4e0868f951025 +73d2a1bddcf1d95eca646fd3f67a751b +83a01122dc485e51988eadf492970ab9 +815aa77bd2b7982ac3fcf01c68178c18 +409ee3cd09b33235f4fa58c216f5e443 +7248d850f9f2c04e7135606d9bbb0402 +750e844d5577262bd6995b7ff1b6223b +4671aa6abcd2b1c1ebe1d68d90bd034e +af110386f8e6936b79734fe493636f0d +8cc62c2f1c79a1aeb55a05c422230984 +6eb59f666d21700969dd81c851bb7a98 +f1eb57e9fa4186cf668de23ea58774cb +07f00e6068e843aea79cebc08b9797ad +4dba8df78e1769300db1f202465393a3 +6981db626b5b6e446042045c0c475d56 +25cd63cfd19d3d339747669ab33e98f5 +3995e2783556e3c5a3939835167b2ded +c25dcf29140ac0f1d8a425b57c2ec381 +c8802624f08f0d8ae9e563070f143002 +0083ba674d91012a3d137619c6b2c7b9 +c02e16ac7521fd51ac0709dea2fad291 +ff8ab0865396b6e837a184d0ae911bb7 +2add9e5fbfb13bec94c935325bbff406 +93b11670e881dda9bff91f82a4442e0f +8b2a71225ddf418ad2d7b7b36a42e5ed +98b8e2cf57fee2be549f17380a7cca16 +e7498dbd153e213d8e7a26d985180cdc +7efbd2066e7d7d55ae87cf91c0a45dca +94bd03cac01e600bce964afc348f8daa +ec11e2a865b98be2911b7eee415d279f +c2b1ae09efa9192fd19bb9d98c8e3c54 +1ab1b0d77301ce63b7aae3397b3e64a7 +60db1bb9fe10bfbc3c91d941e53e9070 +a1432529b92a611a678c5980579d0afe +0cf1c862fced8cf8fa067dfda639d7f3 +31c6ad0d67476bbf01fb1d5d2574206c +d308cf247a9f358150969c0977dca989 +67e238c80f83ce39c119d18c966be2cc +3cc2c898a0a341c466a82f4c3b2f52c8 +3b386164cb7f2fab50487550419b4eca +a1c0bca5d683dbfdb9ad9e3220f5d8ea +e9a8e22b0a6691796d6f90e1962b4de4 +4546d31288608cd2e168397811aeeef4 +52e75365194e6e238154ada4a22744ea +ee67fcbd22879f1c457cb9bceb46d809 +c2b9da7f293b34af91048113a9d8004c +1681069a084ad22e73de24e473478986 +f8574e9d75dc5ad8a19500898096a566 +b38a7fb5c08cc7f19339214f567a5c8b +a8f39edca828d5034ac3287dd76e65c7 +6e93a0749812cdbcaef7734c05fae12b +f9c8a501d6c5ac1e47d9f16c35c94128 +04302bd4a99b82008737ece14e68edb2 +b0fbf3b51120ea480e94a0e2a0e3abce +b0bf8fe908d052be44bffdad02608760 +8c2c7616416685138af4909a6812daf6 +5b4d7a5b29a49ec758551dd2f7e19fb3 +235b2a7d4c6b0750b7047aa797b9dfcc +2152e8013c64d680d72d6a805f92b4b6 +f1165bbcf719ee4695aa778aee40ae00 +6d3da9adbf4fecb56a599005c3527b56 +1bae283c1b9efa99138a86d6a657eea2 +49ef53aaa3754969e601729644922ed6 +bbbec8ad294d043045df8fe6cb136fc5 +f57c7bb465fb974ee3466a070a74c635 +cd8741c1cb7195691023dc090259e51d +d5d3fdc13a884fd4c65f1b71ca6e4dac +78838feef7cbf65e0f27a25a3aef255d +306712bd703140068c8ece05169a4502 +e6d22e883fcd71308730e12f32a1eaa2 +58c0f2a3869f804769bd4e3bd00ca851 +74fb49ea70f71315f84087ee91eb29a2 +4b20ba027980f9771ff5adff426edbe2 +98dfb86abd564aede142a4695d6aeca5 +7b94d3c1009002f050108adf277492d0 +7fe42de2b32120e4d0812de4ecc4b24a +36bf9e42ae60dcdc6d65f50eba9e085e +7935afe6a28e2c090c0c917b622793cb +b15ae83b34ae2f527b3bfc0247abf17c +91ae595bf3828d57a4d17b3efe289941 +c876344d90a2f4d09de5c3756e54cda8 
+3c0a0aaaa4e9072dd7d0e513e9b1869c +5ca0f75f8a31929fa5ba0650b3b3db03 +7598d98a6cf982efc25ad72a5b9eb648 +912e5995c0a4ac2bc0b9b9b12d87c03f +e2dc1502815c439fa5ee81b77a01d039 +6b749f3abdb63dd74dddecc15db74615 +f325da87db73abb7e3a4d3c993533b26 +1868b109ec42d5d0d3812b1dfa50c6ca +d1b6a9f6ad803b96346d0f868e239a64 +b0a4eb98ec4f38757f1b1d42ec3b1621 +8ff2a6565c4186296cdf3b177a46e5e8 +00a6be2741e2dfeb05f7db114e87e7b8 +e2d57095101931fd58169b569a8044f9 +6d57a77384ae02ec54e47b50578afeee +07891c8ca6338e0045395a44e83d35f2 +cb6b152bf06d27fe9254fa8ae06f7892 +105e3f58c481197e0ca61fe15fb058c3 +1c6523d8bf6dcfe200959561bb0e645d +b62b5b503c3d10a7ee5b4fff09e99704 +f8fbb40dfeb7413047c2849a60b95c5f +a64535531dabdc066eec664116c11dde +8a0950a4d60790aef36bb37737d70ec3 +be8c2a158e16fa4b7b5b821b3b93f40e +2324505a737d0b3cc71b1c03cbdb5bf8 +2f758001edc0e79d73c5ed9e9e02d14d +785508bfa8aaf0f01a9ca65478444d5e +72dd7ce19d4735639c57b73045095a7f +d9f8f71ea4fdae4d93fb54d038294cec +fdb40db62f9682427f2cad5b0529374f +6ede02c4092e6462e38a9abe09ee1aaa +84b53d2d7743c11b606e297523d54eca +4806c351cf865bf58ae56418b6905dbc +bd16902dfe0f5da38e3697741ecf9c85 +83b3634f944da3187fed5d88f8eb3706 +22330530436eacbbfe8fdef46befce6d +eb2f484edc82a9148e2b3241108b2db9 +4575191d879fe737486a6e937a681fb8 +36517d336094e4c9cef7f4b0b82d2c0e +8c1bcfa5da28415cfe31a0e5e12104ec +306dfe3f9fcc9a746ef8da891188ce5a +ec7daa9dd73b2d1729afed56a039b4e1 +83453eb1a8f51483f3a81a1e1b366578 +fede1d8bf7ab93b86b989e9720c46dfb +265581df5e1ab4b33125c766eddb7544 +d5595a7f84ed1343025c3a547e137124 +a85fa75c919c9ca354ea12b534695f65 +2c1eb96e82be063e0ca22ed5184bf126 +72288dd0dbd4fd56903ec1377e522357 +cab188ef67c0060f157dac4a1f9f69f6 +1d3e52057a4d8d7ceeafa02ef2eb5fca +ef57edbf158eca2f21f3d9daa9e27d97 +1145a10b19d5b5b8f073e27602ec28de +efac85ff714c306300115d4dcac7b9da +aad03d9b97ea830220a2383388d32fbe +6c614ed154b31b9ad38447e73701f879 +094edeb5ae81d2e1c25eaa8a7fdc0284 +1ae7cd56a16262fab84bb455908fe748 +ef88e7839e4954115c4e484247b7740b +1592536af228cf3a9266fa85fda9101d +345301028fbaf61d67454e25f19b21b3 +47e0bfa0ba252b48e04152f1547d460a +327738226c643398c561b4c3e5e80ab7 +1ce4856440de1b5d2edc5b0061bf0e4f +3d88ab87cf40a67d030cac3564056fba +9d0d2e78111eae4d7970c4759533a88d +36c68b7049f6fe5726a50e13cf15d54e +27098fd06d9920d93c975894f04a9222 +d1a6bd144fd7b0844793411f6df647b0 +d4e5a2940570e3ac6ab5ad2683b3b8c5 +7918cebb6013580da4070cd58a27e31f +c6dbd40624b7640ffa0686abea87f8bd +129a8f6a334c2413dd8464b0dd95d567 +e52c12cfb44341aa93405475333fe7e0 +971383078673bcedcaf1bec2447afa4d +3a1412bff9c84908de4cfa9449c09418 +df44e7aa1c2692aa6fecb4f93b4a5ab1 +0639ce3be7280c98677c5d257ec072e6 +f97ab740bab8634a04b4ca4fc28b6f97 +3de3712718a81b8380faebbad156f7d2 +bfc8228054ed46328e18f54d1e6cdeca +1aaaa749427fec1fe6a3bfadf801b81b +2dec7eba94138b1724d7b7a2e0dd9692 +d9f980d1f9b0ed16e398b4ffb43be585 +fa123e54964ea41e33969bdc4ed723ae +19c2411bfed0a185de68ef1381ea7797 +9c982139da4d145ea6d24c1aa24fd58d +779e40dd3847e976641b4d75788648c2 +19605deed9937ff331c353c4d3df5d1f +b4f88682e86c7534440b671b01e0dc3d +c51123a68d80f4e1d85362b35fa3fbe3 +68bd8cfae30a137fe6c166ebc8fe1afe +e5310bb3e41c871c74b9cd402edc0d74 +aceb63d87a085f7578537e01ed2d6315 +d2b540f8a3b2218a874bfadb3b4fbed6 +4b2620209ac18d2ca8341be35d73e033 +cb9cbd9a994927de338fb3e0a1da09a8 +10f94da841d64b5861ea9649be05b32d +5fb33dfaa3e06838c43cea1336e04ae9 +d860aa81afe3cf2db3dfb5bd3ea12054 +fec2b98e0066774c3c73dee22e3bad8c +dd1944eb872eb8b1833a429beea5ef9e +ce1cbea363a67d5370a914cda138b18a +8a649edc185774107ea9ffcc1434db7b +3af484aacbc0af0c38c0f19b74c51aae 
+0db433319c47f728549e121c95b1dd81 +9bc5419786fed6c51aa55f7f05edf862 +bc69aa685c6467634d76730e479eef00 +19aefb8f70a6a2d6fc3182e0c7407825 +5894ded8fc79589b97b6d66616838bf0 +7fd8cc7bf9b39d0ffeff4b2a9680eb4a +9d40a84650eaa1082fb5d2ae238eb820 +2b0760328c3759dd0458ba4077c83ee0 +625e5525043bbf110552ad6e152703fb +1c6f12eec0dec677eec891cdfef24e0d +e5275814a602ddc9b761c6b77c476d4a +3ecfd426148b2df5e831d84aa13c5851 +9baf4217e46c9a4416abeb999ca7b919 +e6f4f5b01a32e906cc43c9d596ba86f1 +9af326d41d9e42f2a07fb6ba98dd22b9 +f32884b0b08b23abe603de73ffcb563d +ce910a250d42a36d52d045c635ab2e57 +862d33a2eff9ed0ac5965b2d054cafdd +21bdc0790e84749ab61a77b48c4d0a56 +4eb8b62e90d8496af010ae856250f183 +7f0afafb4e10efb419d32e0f1ba61c4a +acd3b3576e9ce8c4c49db51d95046605 +ff21dea36f362464abc0afb9f1d4ad84 +e8d6697bda9e640f587198591b61b97c +f9b5df934cbe4ff797a00da2a8ad4e8f +e023a73ac3570ae266e4a17919c12624 +02115ce1286ac70e24ff859caf97f6e1 +90e38437d53ccab4cb420074d320dfd8 +26420ec1dc8fd4b85cbc2adeef875f28 +9237dba3abbeb2e7fd29f0c5469445f1 +a1f3a64e93bf6ad9abe7b3b4023378e6 +1980e18b86bbe2dd45a5bcf9ff5f5043 +d9679464ba0ef8723e2ccaf094a25284 +78ee16efa0fae6ae4f29d029a61fb489 +9f049b46228ea63ca8e4c1bcfe34deb9 +4650c904288869b324fdeb4a62a5878c +c88628b22a0e61da7e364d6323a84c9f +862fdd90b1f1e822db99c66187de0a73 +b2ba4065dbb9a8c6132c54cb9b05b926 +ec4641776943211727d6867e1f25f70f +0c414f0903fed3a95d0093e93f0cbd1d +05a1a8dda980f164fcc70a86e5c6c393 +b10cdbcfd19e5018bbf1a319a4ca3e14 +3d2bd69a995c16c598b119c17dc0e88e +4bea5530c27c98fd488d22496e570be8 +3cd50674e0584134aadb3f6ef1df9280 +b3877ef1ad35b4f6309a672fc6e8099c +9c4729a96737dfe0d409a881a40e2c93 +5fa62a8f4fdf8dd20bb9aa593f9b04c1 +e11fe9e9aed4f03688f7e634a42ea4fa +15fd07539265a6d96f16afc81a8cb93e +b9e21e71ccfd4c687d66e418eb740d90 +a3b5dd30a846ae451843d541fdd89f84 +4e7a5fb938f3b07bbdb9ade8ed218059 +7323c6bb75b2b10b64a9a75222f263ad +0386942d0ca26c9e38ef0851dd7e88fc +730c3bd7f044c3ae7033481640286c2c +3f297f7b519a8d82e1d84fc609348dab +6bb1afc5f633f9518dac90b40d75a9f8 +6c45ec2c7ef266d8970da39a6ad1a956 +372161b035d0086d848222ce9898040e +3d1eb956e2ae5d52ae272ac08de05c31 +363d11e3bf433edd518a81e73c606a72 +5e147138957ff01fe45475425ba36e27 +10eec6d8f059db7443797ccd8a09283e +7743ceabeeeafc0b62d82facf29b21cf +a151f55384375e809614095f85c6b474 +8f95b67b6150a65cababec403dae15ee +174a98a40aaa441c8eb215df9c6d08f4 +90fa4bb3a6cbdabe4aa52b6de5acb952 +0898dad4d60515d53faa405e7ed1f98c +1521951de32e93d3f6476bb0caa8a212 +d062b3d54975f080e9ed30ceccb47353 +e81c02b9d4351137de950510d2c32814 +61551d432774627a9cac6972270d5b61 +cd24291eec0019eda8ac62bd45d00a3d +ef0d45aec4848cab74199b1714542725 +40a78f212321db054665dea53acfa28d +a17ff5c5a3e79b3b929bef7bc7860e9f +a2dc4ad32e222bc9a03fdbd53ce7c4c6 +69540fc2431fb9d1afab40d3cc11ff93 +8e4d037749b60ebb23c90dd7de15d635 +0c91aaae754261b5bbbd1b2d2a11db9e +8c5743abd5dce05ec43ce8db591a5ae8 +2a1d45deea9518126d6978d9ae877b82 +545e931dadde28c9ee9ac912def5c3b4 +692d8c6fceabae5dfc5aed48039eed3e +36c0a4cf4bbf4ed58e54ad01a002d9ef +47867fdd37f3ebb4f05bf9633825cb66 +465fb330834b9d40ce3b6fed6cddd8a0 +4de9fb8fa273661844484ce63f74c5de +75e1a46a9cbd9ca6587a5d555a60db6f +e1fea7f3a0ed2d303eaab61bceb6c5ee +ed42023cca6cf5270aadf7ae4c8f130e +04a6526b42ec30f6471b20526228e58b +5b9fb51831406beca7750860afc2a45d +9afe59c50f390f08a7e99b47e53316ed +65936eafe247e904c79dc3127fbd028c +fd073b15b34a9ddc379fb9f25ad34068 +ef385f5a9c80e57d7c7b918103221d0f +e1de30d5a42e6d9bc56f0634f3b6f884 +1500b0604cd9d2590703a71a0bfa0486 +2a213fe5bfb12027ded321c010b76d35 +f5ba7d17740708c401b19ec322b6febd 
+458f7acfe8265bf9671a81d57415f1db +808d79cf20340af8912dd6a0ed049310 +82e6a52ade540a98eef606fdfb287b56 +adc2c9a417854b5f4e4b68f2e90bb44d +438dc46942f196f7ade45325ff760791 +f9e77be3d1d4cbba9e75ae95044dcfc1 +e17bcbf361d32fb271449138a4b76495 +ec131c23ab23bd68b0a5694ff4f9a517 +94c63b8ac5dc9f7d9af3a138f2704519 +267a360566cab59b0cfaf8eea20b59ec +7e25cb30855b197c75d9980fd71d4681 +f53624d2c8c42c9f08435f76bf7f2eba +583bb47c351413359d37b682742967bd +19b721fb1cba695447a511b34a842fe6 +74c5eb229d1d1254031e6f53b4571489 +d308ea37f706c136b3f28e20e89082d0 +77e18e6b43f811f3cbf7ad4620c93cb6 +d2cb839e95e8252c391118002e621930 +fdf7ce55ee75e54f1a38618b762080e1 +254e4535257efe378a5d120de164f41b +3a73ca5028eff0e78ee780da59d573cf +27434c4487302ec77b18e221448b8c78 +daeca7d7a03739b516a518279776a5d1 +d3a14f9086fff9b27d1a9b7b0ea91c2b +d49ebf21b9440512b12d96bc34d57035 +8b4d6f2d5719cc2f5e35ad564ae320ec +e4843b2289edcb25330ba74904f239e7 +2aa3101e07bd9222b65fb9b9dc9a6ed7 +8207c7e47e8bbb20980438cc6cf2bfa5 +9be49f44f18bfb7fa9a79948b9217bfa +469f0f6d11d85945001b6afd97eebdca +b63a7e3173ccf0b7a4c994ecdf4dbdb7 +61eeb6f17d398dd63df4508aab2d8447 +3a90ebd75088a74e4564ad645fb8c58b +b44924eb99b3e81c07364facb0b79e1d +8833d99b27ae6f97741f502179a1f9a8 +896746942c61d25da30428bb968c4b63 +cc70d91abf9648711a37b9b031e1e382 +9437a180220d1428915c0187d0b01589 +0fe386036126c07a8f3af6b572d2484c +f43daaecf9d318f2727b80b1b383cc93 +0911a3aa8ebc7b52278043696d8acc7a +879b8b3cf6c192422ac0b660239b2b94 +5e41953ab9b7241b8d9c2b77bc19be55 +baff3d44dd39e7e9c96bb8a7f7499189 +80febe5399f5003b6a2c444c3c199fbf +d41d8cd98f00b204e9800998ecf8427e +4e5234d662a45c8e3c9b0e25defabec3 +e03a34b223a3a7a1a4872b940a0b18d1 +b5dcf8d51add424ffff31aa3a48f6a40 +85b35edb36df502391bfa5517605d25a +92217bfcb52336c0a7427ea137ba511e +2ebf9228cf2d49f883b66c15ffd50027 +20203de06146c67591e4c4b213bd97e0 +77ecf782d071a4bfcf2c768ccabbdca0 +5691fb20fb890c22619bdae7a68f647d +494308fb7eb97ebd21210d6cf666d241 +72662dc25bbb686f1f807e722ac6701b +e7f1d26c3944ad720a2f2d86f3a1369e +98e1e7237bd7d8f946a0733778029d1b +0ca4232b2f599e01898386a9888cae0c +15fdc865ae8e5b2a521832c6861efa52 +4833cda85cf3253a6080327541980901 +ab3a7c743317856828d205a62a5a1f37 +6f4801a57867befab12e4d8d566aad57 +ceb77d9ddf9cc0f2c25962056450f732 +46bdac4f1d4e1ac7feb8c4d9e01ad7c2 +64401f9aa59aa9e079e16ac8367733e4 +6c0eb65404264f9956db5e1b8245e0d7 +4641cfe3a5068f1e406464e798d10665 +b1d679e7ae15a76b8fe4c46310c17a65 +65a6537b19af251617760c6fe85de64c +3afd3b18bf68799c37a9bc5e91a727d2 +42b964916ae84a603e2909969b3b944e +bb49621126e716d8b8279aace3d8bfba +02c7f2287a2e48c29920cda144bcdd7a +a08d5f489d856609bf0928b4078efc3e +2e31727565624403e44f19227865427f +0941614f6c26ba8cd51bdddcd3f73c70 +748ba27a2b40571bd9834cd7c03a5e95 +dca9d352a15e0142cccb1eae4b5c4588 +8d5a603ed3d0c5276a427df01adc418e +1f90abe2796e0f2a90843642ecd05ec9 +4eca96d566d8da885bb37d260ed32828 +c1762cb987174c2612275b11f2353fdc +b002c9c2520be157b68bdd1705820e2a +3387b93a694c2f07343b9a178294893b +7842c19a55c5e2a9d5ddb79c34f53fa7 +d644020edfd297d6fc06c125ccc14283 +5fac3aeeffbf07fc0a481e85a41df2de +715cdb68cf9e0b60369f49eeacf0c85c +e0d5bd78a5817f223cd945b3cca0ed38 +59fdfa94fead17d06d593b6e2f27fcd6 +f842353ea544e33250bc6d20f7dcb987 +8a0ef45cb4f90f054ca70ad2bd21499a +e1b377e3928f38c1ae09f1021b77a928 +19d165160512689ffc1be568661fa5fc +5451b4d17c42d28ab567460f09bd9571 +b6745feb6d8da51fd27b9946523aa437 +60b801a754278426d88c330bdc6044ce +1aa6cff4f4facc2ccb656506c785e2f8 +73ca2356bf5d8a984408e64e5438860c +623ec99f712ed0879265c44e7cbbde4c +7e466bf0e1cc7ef49f89e800ff463685 
+16bfa64077fa3446450baf314fecf177 +ac63deac7c910dbca4960dc950fc00fa +4470072b63d36b30fae967c543914166 +38f2a5ae9e4e5eb51d024cbd6454c78a +fd98c043a8154dff6d058fc3c14e29a4 +69ed119c9b4f93ba8c1798d9750b1612 +fa0cb557082a6e9ea0f546a6c0134d0d +20f65094028ed3258a6ab6151f81cc8c +83aff94aba8e503671cf49476b12560b +34413b6a7783ad90fececff9d5f65512 +93ef0a26804315e282f63898db985a87 +f7adf6d3f32cfb92c8e9d3c813bab0c4 +a026244655601397b37fd0fe7884797e +3fce89f8f6d4bb070208257e9b72cce3 +b44639df73ec7102383fe711b123d75f +0100181dd8e9dcaec2a92b0573f5f11e +a682911f2ad3798e887f3f10c9cf16dc +4bf95e2563e1bea33ebb3215e0d4df1a +df1e565e3f70cd6de9d5e684dd66f6fd +0963cfab5415de0ca18cf5ab0c41446b +5dda3c52631507c7570e578010ee9904 +a0bb750192bdf8bedb88cf6862684c59 +3b7796f2365f587f6b61cfe322b56895 +6542c06cbc4e9ac3428e99cce2501b56 +13cd58046d9676bf66b31d417746b753 +fc3a4a4d8aaf857931ba35f65963de02 +98cef74df1b8461f000527169f0bf935 +b84ada0b7e62eefde01f204eb87d2a6a +ac18f3dac21671c4db5e6b6cb4813e93 +a4035f27adc61943bee97ae7403de4b6 +9bde0fb00efd337f909286127e46559e +2645d0949913986d448ed39c4e3930b5 +1166062430827f05d6582d3d1f4fdcaf +fc96c1b4bed1c7746640b4be43f8bf4a +370d5e39242bc2c6351291f1700a345d +68824a4f2d7da8a68843eb5edd9bcd75 +eba55a9e71b048966272e3599cd70f15 +dc2d4bdba8b839f84524ae0388ebbce0 +6b9f539f0fabbcdfab6631d98617e217 +b330059c8186b60cc8104e1a3ab553b0 +ab9c2a29b9d447f1f2f6437eb7d6a124 +2da772b183f8d837aba758efe68e46a8 +5b4c435546286a194a28a0a12c6260d2 +4dad96894594196742f8f151d81bfd5e +1f3e1960cd7c8bee9831ae0e8218fa90 +aa3040ba694f6d71145e476a082501c1 +5571085546afffb18f062fff9746f3f6 +b65f01d171087ad172b4b37b76240661 +579e5d7c189bbe407386ad8aab99a88d +fcb9978a97e8b8cb7f38482deb8fcc65 +3e81c68ae1765a07d55adf99a54f4ef4 +3bef65c8dfab393a04c8b00ad7298499 +df651af4eca9951a467bf33745cd0792 +4df79038725c610e4c4f0b2332815f95 +5208c5bb9c34ecbfbeef4f38aafb589c +f3017cf5a96cb3f61bc94023526ef940 +22bd6df82dfbc03ab9addc0a7e48cd15 +9a20ed8b17eeb27416ce27940a0ae80d +a14103a318fca64ce4ed43dd7b7d3d2e +92c7a0c56048f2aa191978a0a52eb1a3 +2536be1ddb8385aba08e86ae42cab623 +3d53840a4183f4536589e1e1cba269dc +942455ef5eabe707131145df68a45300 +3e7b89af05948ea90a0e05956238bedc +71419425a5ebf5f8f1eacf54d59ed5cd +4ceda2e7c654f0898031dd1f4cd28b23 +8fdd7708463b35e222efd57a2f149edf +a2dd35042933e251e4014a9689583fdc +56591654162276bbfb4676b6dfc601c4 +9040695f2f8caf2e72c25113b34ad2b7 +4f022738f2259a2492e0acfa0224f130 +b66e2d6fb3432b5d281e78793aabab07 +12a71f138035587605d5f1f587fb6c58 +4655a2ec7989229c790a548c604019c5 +c47d11b068ed06f137d5b59139fbd57a +70f68d937c1accc04efcfafe2ca15f3e +15f972a4acfe8b8f0c44abf6f3fec91b +2cc71a98188a93a6cbca0a094663e24d +18201c31af277e738f6cd8b56086c7e9 +14fdd80772b1e7c3b17a01409b6c76d3 +4bbb6345c48b0a1a238885eea6c407a5 +67984481de4a645763ef22fa3f8c3106 +987878fc0eeb8bf9e57cd361b187f2e7 +13151b79dfe92ddd15d6fbdb82455a98 +18c89e9d3c6c37501b95e993f24e30e1 +098b462c6a57c741d16017e1581b3742 +3545f29d0effecff2083479c8d5f8cf3 +ea9aaf400100862b4d2ce0534cb0ec37 +e84098625df55fc8be83a446c325c99b +d5796d5ce1291e75440ef5f02f81b683 +ceeb73edb3dfc6cf9a50ca325ea5192b +c96ad66efa6e8b85fc04a207b20cc8c2 +e773e775afc8caf56ecfa52369838c91 +c51bbb2f09fc9fe4fbc9e09f55b663fb +a9480ac14734159ab11b26778a0d9768 +ade9171290bac2863583c9cb1059d841 +ae54e5efc2cca516e0696547597110cb +6011e1bca03d9f07ee3077dc42c394d4 +f3e570dcbb067f6b62a0739a554047cb +9a066380b1a3d515785c56c953cab099 +e32ce8f3a9c7b5bbe99fd40ef294bc8b +6d093f093fc81f8d107f72ebe8e13dd0 +a32f2e82617fd3af54a84a215a36bb85 +fd9fcff6054fba1ac351618075788c25 
+4b1b64557a63956d3f3abe1091dfe7af +7b0c542ee2a51bf20de386a946e29357 +287f91d0e4058870135c3f6ea1f1b6fd +7acb6337d7f98d1088422a87e9feb82d +299b5d784bb238921e203bbf8ca35207 +1f1b09d000010a9579c749a2f98300cf +d41d8cd98f00b204e9800998ecf8427e +b12aaca2997cec15f5010311139a76b3 +ffebf014cb085aecdbec5992b391eaa0 +501ba5e9de7cca56e136ecd0aff96159 +797fc01320e598642aedb273885e48c9 +41a16b4ffe9bfd3c496b337e6de222bd +227e01a3739b750dd481cb1331b2268c +e950f4c4f5085667c6879c65120e91ed +794f40c009c3581ee5cf63c7545cf39a +88045d8850e1803093ff6704a0ec631d +c0a60d437691e78eef2974859e85c6de +aad2e32c0564bc4eed35d3bf353800ff +1441485c8b4219455c2a975c1d239210 +952438ddeea5cf00e87c8a413c3ca5b3 +1467353c019533b552c69c757ddb605b +eb5ebfd9985484d7975833d7e0ce240d +be95ce125038e0e87b93e10f19c904ff +5cfd0d1227e65edd29ecc076f2291a94 +ec14c8d760034dbdde003451852706ae +9d5580f7ec001902b649ffcbaad8028b +c905a383dccb6695fb638268aaa77cd3 +76f9b7aea11e41dbbb2b7c9ef082529b +37e0685d0bfbb50e64f4d62a18b2ddaa +af106d776d773f7aa298ed48912500fe +06aa1f86c793a12280e620b2d32fe82b +971122ac0f58f2540abc53d1f6b15c82 +c5fde7da3761ca207d383c06b071f022 +21fc177d9e7ae481e849580ff105a9a0 +4fe9dc543c815aa150b2c597e80483e0 +28112f13b287ae304678890f67d551b4 +6343044929da1ae04c412fce91426a6b +09ded9375ef5d351c4eaa345ab54b031 +80036a8fd2ff736ef6defcfdd74b32ab +1435973d7fac10fd314603f285397cfc +2745a998c0ef6c3121a68501c504b2ab +c1a104215bf2d47be130902dfc64a6aa +3d1086d673b3679cc388a314e756a0f0 +12f498b141bb82a734251f7684cfb217 +8cb7030f6e6382add75de6ee29ae02e5 +312d7bd4b1cd9401ad724d6aef8f3c22 +8cacad94af1920831c085ede78c0caf2 +2ac109fa4c09347aaee6753d6d7ae7b7 +8ff9faee2d393df594168ec0a0be3b1b +460be07f6a60d8d9fca3bb8ca7bd6b05 +72e6b350491f3e551684860056bfaab1 +81bc378b90737f55edb16526ba538913 +f363db1eb9ba5fc4217e602fca4b57bd +d6949eeb7b0e8f5675b2d5e8dbeac3e2 +d41d8cd98f00b204e9800998ecf8427e +7bb36e766a9c2fb1e33d280df5d0f006 +52dbe039b4a3322c5ef44871e1fe5643 +847d37b48e902864ed3f13af8b468004 +b5463dc337ac7bebc450eaf67397f720 +a1268fdd663bdf60c1c828aa25a182a0 +05d13a474ded2094149a82e75f815f13 +825c4d1fda1772ede0f32a8bb4afaacc +2eb7c15c0cd1dfe5e28e67ce62a8f19a +c9e8d701013e5719761681e7f087cbf0 +e753c51de1cadd17302e46bb6d2edbc6 +45b332839ffeeb65ba3922edd068b6a1 +cf325e3457c1df35e95b3d3526517969 +e4bc10f5ff9d6339ec715cb4eccdf15e +f67101743d4146f2c85b5fae9824ae78 +3cc4fd24665bd72005bfe457bd2877e3 +b28416c5cbfe74c88939f9880940a038 +35f483b573f37d680b0505687444e153 +6c63240f4ba30526b8aad6d17319551b +a32c7c774c0ae6444cabb18e70e87ad8 +036af8e53ca92b580e3eaf62ec15cfb7 +9bd94d9b5b64f4bb918b39254fe76fcc +0756758f0461f1829f5f56dc242e174e +0df56a6404bacdc2a58c929917760a5d +d1e6b40209fcea33dedfce1bc2cdb8c3 +638356a26aa70141192c84db2c12fba2 +bb821404d019702062496205a53f93f1 +ef82fa83db273a3096df31aaa9538987 +3a858b72c620fcd5ac8c75c599cb9e5f +17b9391bad6f9f6c3a2a265753720ba1 +e144168ae0334edc419300a96499c44b +fda9b35a3c7b8e0da55ff61669da08fd +574e44715e196f8c3fb56a7a952471e5 +e0da3504ab0ad788c9f82487d5712179 +4855b048114be00e3f214c6691c29f52 +7d8d06f115f9cda0a79150a1d5523aa6 +8bfb7179a428c849591734a3d9bc5e8b +48a75b3f10a344c246afff802dc4fbe1 +7d7614f1b7af10e803842ceb44023ba2 +0d02971cc4d4f6d398c454f222f8c00c +71a231c124f4bf4e084f5361aca66586 +5e50b32d86d4df1ab2f0409b7a75bc7d +a75be02a32bf6ef6f2273b82025c30b9 +15a5398628dd54f30aa6ee64e080e884 +c70f27490e659b726015425ea031ec22 +96f861c2b1402203461e04cc4325c179 +3ea06f6cfe4ff2ae780a518b32af33b9 +28d41bec786b9d828e004ec6ca7bc6d2 +30b5072b093c12310f0a263be96b3947 +ba1c77584073b205e5a240e711f427aa 
+d7ac3924b604f54013d0a6964eab4043 +9716998325c115b8d561e22ec20f1dd9 +ed158e94afd9556a3d17b327b67ba809 +acaa23fc6e1657d9b1ca3d9ac29aad92 +e5ef3f233b82b477e1357b6148df7f20 +dbb82d77c51678d159fd4fc1561ab849 +67410355d538806ddcd30cb04bc59524 +265ddacbeb524726b0cd64450f493bd0 +548772aa793ba06d1a4b471c71dde120 +f23c807daff05944eba55c8b4e6edebd +2d84c91abfbc7cf958b6d0d3f387c5ee +79c85db4ca3d43bdfc60de08830a48fd +9f25cbeef848dcb310ca2a729da1f587 +fbc40325ef4de0252ca8defff068e50d +7de6777f5fa637ef70a51c89e358e520 +53cc8462945cc5322da69e978c52aa59 +62aa0d04bd06a6f199b76a0126eb4ac2 +3aca28ba1c467977c0f3d6cfca005666 +2f1fb47d0936e4c8bf74391b5abc12c0 +db16f4ed1457fffee72ecc26088885d9 +4ca24b499feea0aa24dcfb64bb042c56 +35ae9566db96d52886d7a2f0237fd78d +c697b952f01ab6d1b33941a18e4bf240 +263de5a6bdc460db063290d21bd7785a +2ca7f566d7dbabef8ca9ef7ad2febd24 +87481000d64801883daff0aca6ef15f0 +bfa6479a4cd0d1d809fc67683c7f90c7 +80f4016b33391c15c2169f943a55f32c +33d57848f07330cacdb7bf1e18d80139 +18c80f8117d83f8f4195ea1dbc1b4d56 +9dab4f2f633d24647e0099df9d485d5d +64dc3fb1fc0c416201751831f13ce3b2 +f1f4a9fb91331ce62e70bc1a4bf3bcad +cac7097b5318ad204db63ff4354da5ee +31dd913476155fc39d51b44025a55e2f +f7212e70f285bcb4991625975d348058 +59ddf8cddb0862582aa775ce64172fd6 +9d68bdbcae3c17e6c42818df96970d06 +34644a72f7f73e048a801030e4c4384e +e2c2e922851bdff0096c1a98ec8dd4f4 +4e39410828c8277403818782de7b7b3d +2d33b2f3ed20cfdf5d8463ec9690b306 +852748529f44b75381af71cb42834715 +b9e834efa77d5e7b37a432bc420e8155 +969f383b7c137dc5bf14c6685931eb0e +5f1b058439f1ee55d15b94bbf9a81478 +65a6b22da03188f54795838dd4a105c7 +34c4fff07e2f449c387524d3574f5fbd +276260737c1d9ffb0d0ae1a38c1b52fd +07373ead31d9513564e650a137d26714 +bcf986c88fb64291a8713f52cf360f92 +f86affbbf60cf49e41f2499d19c06ef0 +3113f36281d34c5f8ef61b473873c8bf +1647ac676adfa442ed54bd70786420cb +4870fb606a5335fb6ebf3163b4ddd160 +61d1c62518e6ad5a711699c3c2484c8c +5183da6e83915373bf18fa0ad75118dd +8c092336fc54a480881a419e7770ba60 +7f361a3b2954e235499588d1135109c1 +f1b1fee1f9993cdbd4108318b33ac93d +d6559136eba595b203a7c37efa2700fc +aee75eecdabf3d42d5630fc8cfdcc802 +b94973e5a9a50eaf7bd23bfc0e1ee8e9 +864fd1327ee0744a283152e8ff47faee +4eabf3bc8e38b79e7f795ef3eeaa037d +432eca7e256a64dacc0c3803a007f3bc +86597182f676571e5116be9212c7c83a +a345f6183db05e135111e788b29a5f8b +2fad7eb1a6b401d87df4136576ff5b60 +fb7bbf1583867701d0bb43d6b57c2770 +3725a639eb7464d1d17f94bb0dad10fc +98a23c1a603838c6e27a780ae8d91c70 +52ad9fce88bb6e6c3de6188dca9f23f8 +967b3eadaf44845c4605d054caecd02e +7e80253f03c8ecf11e2f492e198939cd +0a77b80fa8d4b6760bd814e67d1c0c03 +9aab467c2c47d8c2b62a5e567c1cffde +09f30034068061d347bed61b21daeab3 +bc4d57843e3875f69edad146a997acb8 +0238904a89e4459beba2a0cfd1263324 +cffa9080f58269d6291eee30dca4e8a9 +602548b7a9945f70aece2374af552ba0 +a607b1310f97cd50d828934d6192e66f +f501cc0d602d22e5443a64f9bb9d9c0b +dd14de77a7d28d78031691ba646a9058 +9bd13b70a446b101527b43fe6dc21592 +4909b1bdd6cb44285cb2504f09b3a616 +3af4b6717462fc4981ec0193c28ed5dc +fc62da3ed345f33e295757bdd8951345 +ed07a35db3ece3982a3dad0d7c646dd9 +9eba4031aba8147615c10572175e71e5 +8c6d8111b939333633ec8255541a27ca +da90baeb8a769c91caab9b9f15c1ed0f +9fbdb46f775576474974321c3c8e1c0a +b9aff6abde3d96a59492e9773c72d571 +b5c045cf487219c3daa204edf3a468e7 +118c1f514d443a105ac5e704026c5361 +af4f50abfd45a26915ecff94c54d3f58 +294755c94e7f18a9500248d0760fa084 +5ba75edf1b0f921a70cca75e6fe260ff +12d9a4d595803ce837ba6cea39422ece +0e6d344abfcce8f723edce23c12e043f +17fd96941fd56b3d856c2310dca7fe40 +f1d901fc87d7548e1a83d3ec50a51a23 
+fef802d08c48b1e879efb3a4d8ef899d +880d0aa5921aebc7febca38a443f6a66 +ea73336131cd7e1220c14c9a1dd104d1 +cb3da05034d8a69ae49fc3031fd8e80f +9a9686792a4611d96871e93a81b1f9c3 +5651c8640b2835c52128859a051493f8 +92eaf4c2960e0dc98836793dbdf66b03 +a882db39abaf8b4ed547ebc955eaf6b3 +783ab15fef590f86e180ff405e93545f +1c27c170d9e9827c372faaab15176905 +7f41c8d7e0c30d3c63167bb5cbecf5ea +b7f55877a8ce551863a707932111ebfa +48dce791db154a99401d56471156c287 +465c9c774416e2275f86b644936b868b +1d5d7d7ab422d1582cf8fd8ebd2d7bf2 +5f1b96b40d6d9db064ab2fcbf3df4e62 +b6cacb277e72f3a2084f9023375438e3 +30ba67fe8c78cdd462924a5d403be210 +128d181e2868271ad141822e0f0cec14 +711f74747c30e6507f8ad136fc3ab7f0 +9f81d3e43bb8b455867695ea5307492c +0e0e55c72222bc3b34d472c2d236e723 +e1929f89bc9ee2e70b3021bda4f5b300 +9afb317aa6a94bbeaf50b13126d1fe56 +a5b4903203287f7bf9a70d406e6651fb +a1d4be01f018075b7743b76ae514d412 +e6931c1c3e9719a010e0ff8a243c11bd +63979ca0a32c293465280de39ef50846 +7094c467aed1064e7e84b37cc65e39a8 +d16f8b9d397e42ed0255d18ba72b554b +ce4e4b1c96609d98ad588c567e756286 +10c9f09cf40af307c1fa6fa1e2ab0a99 +b6cccee2f937c4de3a50c21075536928 +f95e8d463f41b6e8b0366a7d07acc856 +fa053eac71110bd75b6e9af27808a6c5 +ccc1f81d2324bbed2e624f6f7f5865a5 +19320e5507e58346073f3d632d5b555a +0cbb2ac5d0f562056e94d29419f531e8 +04948d1213a12da9ab521c0bbf4c2278 +aacec606f77778b71f224453452cfb78 +8ad1942fe88f6daf061fd0ad1a691285 +9105c904ee70eed1b5458429766ada94 +9a4912dc12ece981fe4500186b96ece7 +b5520777d6019e521b2b8b6027514790 +6d34e3e635b6da159b976f294fd887d9 +ac432c9d6a5468826e8d5bbfe6884e0b +7d7d20a695cefc76e95cfbba5422e1fd +902e146c116c751d41cb118e1e048dae +6f7e011ed00f23f1999d44c982dd2eef +82010d8a73234d00daf532c11830b388 +663d9b89c36713f5122dd58080b457ce +1d662739229f68a020f58777f9ecedba +7fbb699ec4977e63c112150c6d4a0896 +5ae9bfba1983692379b383fd5559100f +3df86d35cb84be914ef5bab6171536d9 +f40566aceb83df6eda18903f60ded451 +748e7958a0615e0e9d912f900a536521 +c9cf6d8f1e75be55529e9b7595674243 +2833afc71ebbc6bc01fbf16b07bc7db4 +086bfb4f08174f895ff712e7bca3985b +939cb598854a66d11bb408bc5547ba4d +91d39bc8ff230a0a14fbf3039cfc6228 +1b94800572cb91d7b90e824ef5a7c724 +0d6c65d3048e460a849f86c3c9d98836 +ff4b5cbda4c314f072649b53bc7880d5 +bf2adc1b65d92af42dc4c75dd52cf8e9 +dbcde3d84b11a238ab27830d4cadba2e +c4485a95298871e4948c276cd8f2991e +e0253a399a100dec5cb48e935dd28e64 +f0c8659523426c9ac2a3ddcc182c13d0 +5c2fd00e627716dad7aa229b90badbb8 +8079080985a9e965ebe0e16c4b62a9f8 +1b4ccc71fcf04cab7717346bf7adaa48 +28639d23784c0f5150ccb4a40e8c5499 +a122a6913416d88365964478ffc96055 +ba9db83de58692b4250d666fdee1bcd1 +2f11667fcd77ef3ea3398b37ed217d5a +1b64eb05bebd96becba8b7d9a99961d9 +53359a2c8046bc39e1e18354a49ecff7 +879169c7d803255d29cdc03f98391c01 +5d581663d0189deb4c52f2a62bde5c8e +db03462a68c7b51c0176fa1eda6511e5 +cd3fd5169d7ca7f0f631ed1791bd5550 +2a68259dc7d231016a80996b80eccac4 +c2406590d8aecdb8a7846c6cbf2d78b7 +84607588735822d341fdcbf1d6ee94c2 +76669b161078ed6a07ade8e727f6cd10 +1c8d00ccf9d3eb9ff92eaa5469268d11 +972bca9bd6f838f2be1d895aa9caf505 +fee2b2e4624cabf8ddb1a1d3aa77b6ba +c10d42188a208550c04b791ab666dbdd +8a2e570b4880faea5efa86e88afad0ca +b7589d27989fd909f1aeb6e94f331686 +f7ec74078f29037c45780aeca2b4e856 +f8dc36019ddc92636005c6d13346427c +c9e0f0ab2e1a4e20fa3f6470034dd5e8 +495f9b06f68a3c695e5c88cbfe9e55a3 +4e10cc16a27741c2c19f600d4d3a99cb +e2202d471f0047745dfb57247af990e2 +e3feb843f3122fffa07696758b68b9ca +44c2ae5a2e19517c874cbbe999304908 +61bd5a3ba06f3d108419482ea286960d +2f5a9dafc4134f7d98b4c3c700c6ea1f +15f64c3963287d22f60ff401c3f4f045 
+619a95b9f50517d214390236d4f6231f +9b73e1f002685be818223a3904a5c9be +789232700c2380104bbae1d191328278 +f16279a893f2b8d5384fd91dfc3cdcc1 +b49f822c8297b7ee750ad902714f7c64 +d63adcbdb642fa5e5efe0dce229ea723 +2754dee56b87186b0c8ac3830a8a8a83 +9f6ed3076b58d5e19ef52d36deea0503 +407c3e1769bada353615444469109a12 +2a9e98ad88580352061b932bb09f20b7 +74865c9f5b0baf833c3c4250d8867019 +c491c413e6f86a6477d3bc8bc04d03de +1a078ec21a7036427dfe5649eedf69b3 +512732f23f5d0c91fca1d363ceade86e +312d33d58215fbf15b0316eb0d4128ab +0781ac1c8004d1e47fec00ec3c28a95a +a0885d53b2caeb113098b5051f34f613 +fdfccff63894a202757323ec768d7eee +a79d934fffdf1c012b54ae10ec262c8e +5013678cb16a8f244347db271518ce78 +3427ae4e40d07c5f362a8d000e26b5fa +faa4b9e85345d8e27176574038e15209 +c5c6bd18438e98f8c13712a5fbde7268 +257702261dcc181933df46f572573c72 +80f4342a5e4f26270abd179956a53e21 +543486ace6c8094eeb886957c106e5f0 +0c4853f96745085cced384585b1080f2 +4ca9ce35201903f42f418246fe6b1a7c +4f695218e9e333c6fc7685c9e5d51e7c +fb069fe5b8ad641abe35ba70bc1489df +c39e156239a0a98783b551f993033973 +5ba2ab65fa19f8e444e70e9e35b149f6 +40e60c1151f3e668d9e53313f472c490 +c1149c964499b3e034b2d1643bddfde0 +8e9596988d1e9d40726dc840145b3ae3 +40e0e840f814f689077123d55b7a71a5 +8443084f0bc16d1e1e4b0acee3e3c791 +3678ee9925205b400e9c6fb6978f3ed3 +e26c166fe5784b7bc69b6eeaed083f0a +135f213b81cc60acc65dd2154c180c81 +42e3e14accdbda699c1b117d790ab3ac +820f75126dbfbf77a7c70e5cdc6a2b8a +25ef1353453a0b817ed33bf7c5e07fa9 +a1a94c7086b0defc70e8ccea2ac659c0 +b661d7f13af2622cfd6ba708d85f015c +9f6afa626af9cd40e3e52f6af1d683db +f0155cb6ca3b08530057fb9330b54302 +6c946882d0dd2f67e3f212638f54c206 +e7c21c0240e81081fa8685d969bdeb1c +ccc1d9cbf8fe70896cfd53dd889a50a2 +a2de2eda4e97a09b005bcf39bdd1a56d +0f9277baee0b863edca0ae2756616609 +d236b28ef28ba0e1c9851740051b85bc +b2958a30a9aaaf1be5204e180ccb9412 +258fb2300f14a82ef12f4a0ca6ee2444 +09e8164414477c148aac573185ed5ac2 +2b539a13831868e896d7f18f6c1c250c +50686c59b8477e29318ed825be52aa34 +03a8c348e628df4f15a0ab42a8b7290b +1bd6a78a394b27f9981908fdd94a0a38 +f9f79ccdfd8460381c8cf68ba055492a +00c60ae4dfe5b4391a6a58a2c8f42baa +d25d7d16638320468c39ed6b2153ad73 +df67d7b25245f0e88170aacc0a3f32da +d7c0ff819a62cc98222cffb518ebd305 +936b1ccd4afe0d5bc686eb492280356b +cd8889184862aa18090399b224620492 +e262d909b4577668df3afa2edf30fa02 +37979fc8fcb6088ddaa78e073cf07933 +2941b9f187606caf4180ddce0564f640 +f5a437ad5514032b3d911e7048b15fa7 +791925b3d7fc2072665d0090cefd0087 +0ad628ff3d90f2b978f3635ac82a581c +7d148e85e9b40f40082c34c43d707c48 +41cf63d9403da69087102a4ae5ffb75c +f4f63a358ac0cb56f86634fdff999291 +50f8fa988ed4bd4d18db83b43bf82efb +4ad5059fcc1fbf73f6f8cd101cddef88 +5c5c2fccbcecc15c3f78eaebf43e5a49 +421ea07627ffecc6a418a8ac1a664506 +9282758c4d12316c2aaf055f46e23a3a +ec50047b858709989b83f5a01896c646 +3b61cdf92cbd66b2fb824d1da488476b +d3c9018ca77aa152b2b7d3092f8eea8e +406e38333a02b9737a7d17e95b3a9cbd +148008039d100377fb12c5791a88c0e7 +ab67470f2debca1a16411e70e3c6d7b2 +fc2d3ed4f8ee7199377db14576815adf +7888352c576f33b5c90ebfdfb372d86c +1638f25b3664acc721434534cbe6094f +600a557eaca5515bf37676e220ee9a86 +cc6f65539a61c9f57b76bba2798480fb +708bccaf71d5b3167b062c5f1909c7ac +b8f829f2ab768154fdd0fe5f1d8d2236 +44425eb77e774ff8e3bfb9306804f5b3 +31d0217b2e8006dee903867d371f971e +95929326d434ef0bd049e3e6ab21ff06 +3307034edaf0b3e9aae79a0c8286608e +dacc99b7d7990d887942e2ffaa67e910 +7e79626061448fc41827dac12744d20a +f7e8229a271763a8c9318bac7fd2cf0e +2ae5e3d603120d52fa66430fbbed73dc +0d21acc9a406ab40dd99d535d674c7de +aa551dab754a0bc2042ce25656ffd476 
+58d3fb3907cc15166f249fa119ad15b5 +95ae7e67dd1135cf1c0761388471e7ee +813dbae8f56466ef140e2cccf8d2f761 +e610563c8f693ffcfd5f1cbed4386984 +0554a8c56e73c78e83e65ac11f752911 +1f6589971327328ee16c3df68c0515bf +6b70db2c0044a33f39f3d6899a0cd1f3 +dd3059a2fecc8c87a55efe7cc53cc880 +15e26ea5034e09a40d9e1e7f3b13a8b9 +a0fc4c4763cb463ad647323bbd462557 +1abb6a543692a89081c6351c574a6d8e +6fc6d9d94274908c8cb764a4789b9c00 +0980272d9e6ac0a4c6060155855a091d +e7be7d2785bed1d39c1ad52fe1227922 +4d48d65ce5400887ff4d2901745b426f +19159b14eab65c7dfa682f4098e4aece +4945da109c19838bd2c465341dbd55b5 +f878f51c621e60675a0933ab7806f753 +b9d344090ded42bb4673dadbd1648cb5 +24bac23e91f35b3c22c72d964c7bd8ae +367827ea8e0547795b759e25daae2960 +b37dd1ca734a92d422b68d74a3ea4bfe +6191386eec12f9052def3d4e75b82cd4 +2d026fcc106f64a86825677f9544eeaa +3a7aebc4796b5bcf92c2d719ef04c2a0 +7709d3bcd4e6baf2bd231b6a75d96647 +e99de5326fb2231b8fc6dd9dfbedcb70 +0a2a60f44199cf19a1005df334a95f9e +6566f0dd8e441e12828c208771feb9fe +cc31995abc9f3c4401a58e9c2b37128e +469f4c3d42315a8cfe85590734c24f15 +2e3ae8d444fdb6aadfc6080d0e5dccc4 +a8f52f08893b0c55511a1a8af52303ef +c38fed233ac83ee22985d7bb011d8c24 +74337300486368d98c050bfc9d88860f +90eb66b09c1470a134dd9d005ed333d9 +3560e945e96196906fac38a94e5e9413 +ee307a19bfa63cea0ecd77e80f46466e +866fb80e584bb0b16abfa9771fc6d412 +7de3f7ffcadfb59e97e43c91cbec228a +4e9b1407a9c6178774c230b0becefd24 +26b32639be4a0efee004b28e2b36d7ab +233641e6d10f787c6b8757504dda4af9 +c66f76f5c81921d4c721351a46171bc2 +b816573c0c73eb1dd1100e05b44e459f +b1a9dfb893e6fc2faa872e6677c9c438 +86ca93714b475785252149fa75fa4b20 +43e732e0e405c7f83181677d94ff15c9 +a4629b6b39eac5e9a8a67d93870bab18 +07828e988d5c061415864f13dae128e2 +2ac709aaa136551bc00dc0a7498b4f75 +f14791b6126556661a8609fcf9cf8d9a +81c5651a19e7520e07eece0c47d548a9 +c787ad665bb8d73d45ccc115bda02a2a +9a23728c1185e48a60665e7f09a77c0c +88f98ab0c23811d119db30e65ad10bab +b4fddcb2bee3f5123691b3d547d0a81d +afbe879b6661b3cf4490aa11b58a6adc +ba334b97cac0beaf4de820a15f8a7b5b +b97450a3a4d544f4076c7657bf4d741c +34108610c14bee58682b0e1f22ddf07f +b75b1dbc7ba21812f2ee5e1fd5d0d1ac +1eb3fac473fa449ef5a9f7c7cb559d36 +45d3ffd8ce0a41aed76800acba97b5c3 +b87966e34f606507b124cdf4f582b05b +85e52d1c4018c2e5f4bc5702385e9502 +a5e54d2c7d4ccb4f29458984c00a1d0c +d1c74f7b763d81e0cf7e6ba6ff154d29 +de060a0494d229760554f0975173819a +09fe7db517c7be7a1d29962f2c6425ec +ce3e1a0195510b289533f8208c920168 +40cf8e2a61b2fb90f3694e571b999057 +f1d38db80ad63441cf3fab462f75bedc +6b6a4414e5e6d63fca7e37b2a4c4cef7 +36690bc162233445d8d80efb2e3fbe1d +bc455bb7e7416d9429fbf45581b16f05 +7582d4c6e99c6f95abe9650e0e31c6ab +fd5038d4878aeb91c916ef57cf162d93 +84926ca60833467d66442fef3e817293 +5ddee6cc345c5cc4b38596a4bb32df3c +5fe9ad532730f4d4bdb1207f0a659dc0 +ff0e40b13d998f85fd8127adf6a06870 +c2c8fbe47e6c0909b35bfe4ea89a335d +907073a310ab2683ed1d024c4cb71183 +850714880bfd5b95f45b362ac53540bd +c2f96ee8d9679fa8a19e3430a6189681 +3be0eeff3280b8093b699e2b8934d6ef +0a3b0ca37773f282925f722d00a25ed7 +b25136c1999a179cf832885f6b80bb34 +d86da1a8a1a690e3ec665bd58e2bc444 +b8e193d30c1b8a440ddaa5aaaf5eb41a +942df9a2c4d5e0cb90592976cc9e0930 +be9469719aa6efa2fc3bd5768bc406d7 +4411dfc21769da817be549cae022c270 +3c5249f2fbf9a97ab20ba027f7d4eee4 +c15b5d003c679b1e4d28a1c3061f58b9 +df24fa9c66a0b1efd33de33ee840178e +673c5b19c0eab46c6fe9a9c8d72467be +79515923453cd7f9de800edca612cea3 +03a5d737e4a8b305f89d9c00af336f6e +04e1d6e1f6be8e5946e29058c4ba816a +d31d4660eda23a6a2f91fa651e4735ae +8ba6895e9a912f024d710ed65b1a3337 +f03eba65d9a3a359744b3e7e80e80f2f 
+c2fe9bef89d05198e0058ac8cbf10767 +5e3291440eb8e99ecf9f18edc507f220 +49c5fc7e131ac465755a868b1264b2c9 +3619c739584e4328b17ba3888ccae9fa +1498d87906ad9f3c89a520b670991a31 +bbf4e4f42c748123f97e0b911abc4101 +17946e10f80965651676a2effa8e6bfd +e5416bdca358655d1111b0a5fc018875 +1708e92a876e363af38ac3d07f666be9 +adabc298237c9957d992632a3e57ec2a +b3c443e2e77449c46d65f249f670cb16 +eccc899fb5fd843e5f1aab0a8eab0e97 +8b3fcf46b1db902313b24bbfef322da9 +06677b2b0aa7fdb2054217462bd74178 +a5b8612c2f1a6a487f4575a4ec06ecef +54b6dda339dbb0dfb3e31ba80a4ea65d +afd64030efc458bd94f99754c0a75937 +06529e5e27b7f39b735366453311cb8e +73dfc3add3e688e1b2aef14349cef68f +82a8ebf740280004d41d8d369ddfe22e +01bc4ea7525228339d9bf1acfcc45cde +63718efb29598d7a98cb088afe8cc2d7 +f2eaa42fbd1005ef619f5b0e7f04d907 +d93a617cdb44d96cde26aee9b9173b64 +e17fde92e5b67f253d6cc2a0a68ee497 +163b9ebe600874ca0f9f4f4892be8218 +cad3fc9dfeacb0bf062776b0d09d71d3 +02ae6c0d672c27c4f5c5bf3187252df5 +1b63e68404c5b96e701d6cf276e23925 +9206be8cbffdb9d911337ece834bb19f +673741ffd40830244238455eff1b68c9 +013fb3d2c6e688f6a7f8d8e310d54ea6 +5a62f39f8bac1f5ebf64558e50722bef +be508ae76dbacce12ffe3845921ad366 +ffc3e057d3c6fd7dfe22b517c0a5f460 +436a8518e3cd28d1af1b6c45da2d1b20 +830db50fe0efa4b4e0eb49ccc27a3d6d +231725b83c8fe8045c6820a61c64adc7 +e53ea8bff2e36b3d7956824b1172b037 +e229ec3c2f98b30a38e372a9925410d8 +c52aecafd67619a18416960a69b4064b +187d5b402f530bce5af869b616c76178 +a5310c7dc053ec1d619438085b2bb4de +039cdb6eb1252931ad1d440cc4833d04 +8769bc4dbc5210b1b68684da3efaad3d +4d8be5dd2b8ace90b23cde1d5a6e1824 +6e323bb70a1618bafed880cb13abeb0e +4627d9330987962140caa27243e0a19a +077c63d9232fda5f89dd66640fe9f696 +d7157325c1cd2ea0a1e5f7746fde3e66 +5223d0d2d4c416fe813a9069b963e7b6 +ee2957f52a9a1ebc084f3efd0356d5b6 +24d5024366fd70f1a38a30a55a985ab3 +57c6c338981b328de32d307b5041faca +afac564e2683d7371e438524e7f54aa6 +1d14af645f6e3eb9d6dc333aae31d601 +b8cf0ff0c7527c084d288cd0a69f0bce +e9eceeeee445e69b09f5325344dc4193 +c834519bd7d6bf9dfe3c142f31c89770 +0eccfcfbdd494773a3edacbeae318ff9 +81fb518a04343acff27b5466993de1b2 +f587adfc661209fa8a7a6b6f1eee2229 +a82450ddda326fc6b441872cdf25c5c4 +b209548b3e24695e802f2ca6f40e9b6e +d808888c79c4410f4ae5d9651a3a0911 +ed7846417ed36d78f98420b2594099eb +846ba1462d1b03210b176734b229be2f +76c9478b5430c1d4ff6fb26788a326b1 +273b29402146de8d8e434ddd25f950e3 +6eea1f41443f84723e206aa11527ed8b +9e273e0042b30e1e567be91d527c5d52 +1de6cc8795798019cf5b7915b995859d +c2091dcf653a2c57c043d05ccf79a317 +419ac155fff72132d1520b61fc372217 +3f5358f12ee26473c2cf64bbad06a1d9 +2f77aa76ea2d2c320b96c1e042a7f262 +36b92e925b56174eebebb4fd3af46377 +145883609839e99b76480010f2c70980 +f299266446739f624a3ea4eb9aab7cae +ed1fa250c86b3791bbb4148011280889 +f689611c0777b189528d43d1c52e8270 +883545e60f715ea5c76ad1b56fd2e0f5 +158fa6fc928fcedabd3c98f813dfbede +b29ca94f244c0a2562fd45ec65117d87 +a6e1502c1ac78f0a5005df81bb7d3c72 +7da2b139af813165f735e15504196b27 +ba2bca2510962a335e1474664f89306c +842776ff5dea0dddf9c3dba334276581 +62ca28a86b6f57a167726f370de98be1 +036dfa5a307083a4d3fdeb05857cf9cf +5c30356d61627f29b22c5e06ec32d6ab +6f4078b0bba469a2a8e107cf5112f7d1 +3d9a82c46f98009c6be2cc0faa30a4c4 +f2afbc2bd73e1f57fc9236a0bf6658d9 +541b503cbb36c17997c086ab58c5c8a2 +2b79c2f716fbc7341d4514258e70930f +455a112a5e40610e9bf6ef00ff469d31 +d3e77fd91b1ae241c1d2a663f1061da5 +08a1baf3d3049e58e5914dfc2607d676 +710a4cbce90c32e37f59c0dfdec6d13f +4d3d49400313f0f93a4348c8afc62fe1 +170e269070a75c871d5d62b5bff5a222 +61641a6546c4120a3c5a969795889d92 +7daff7b9fcf625b051b5a22074e35037 
+059d7d6cd3d4f813ed7d834a560b0456 +2f8e210ce33847e2d2543a5ee8e393d0 +d8de9e747f3cfd4ca800e42c706f1c56 +25c5ed66b7ddc39a82133e44d45b7c91 +e3eec04616715e96afe9a79397ce6254 +44c333686e5e0bf3388f1b1435a0c632 +7116dc93db27d87a765fe170f2a78ccd +5140a49652595180f00fecf93819281f +d64f35b386609d7d36417fe5d65ddc91 +df9c725bc42797a7505de45c69cef3d7 +619669472a44e549aba89b7db278fd02 +4401aa34a7f099069709aa458ef7417d +2f2d9109e62ceff4834227bf9bf85637 +7635d7795f8d86b03ab45206848f5126 +285e5320f6e062ad421bf69b5c1d263c +dc58acc116a819cc133ae897a9520173 +a8ba483fe8824e21dd237ed4cfe97d31 +6c9f9e7e6e8e55c5bfe9f6f442ae2d95 +7e2303a5b43c74aeab99b729e7da3a9c +bd7d3322edb04e382e28b6208dad9ae7 +aa2a2ef9e0eb580fc8aec78d8c9f7a40 +6c7d65aa1b8c46936874bbb3c150a285 +cdd8a7ecbbac2a8f3d1f205338219e8e +d23a21002bf1cb18d25ae515a22ea4d3 +b8afdba7aa55dd52f680ddb45bb51c16 +e746a70844ee17fa5d89a4551df74165 +f45e2fab33603826706f6b7ef78c6768 +2eae5526e08b2d26f6aa372acee62ddf +65e2fb76ffb25e54b2c6f6a5871304a5 +c43a356339e9c8dac6dee6108f5ff551 +18c28ca30e133b6d40a4df3c8f6f5995 +f79d4e705f5a0986cf0f17359c03cfac +e8ed26b1b29dada57ceb318cee243d0e +70f2a9f5eda7537fb117fa51558235e4 +3b88129a9096746186732dce8729f03c +874585e1076c8eb7e29b34e49172be69 +e73a9c8d1d633733718487aecf75ffe5 +02a10a7d57bf1225566adf2d36682c20 +1d3e7ec268f643346d7dba75d24c67a7 +29f64868b6143c136470d0d08e594baf +28c309d3e532d440cafe684ce532a549 +1e8db1f80f6a0731310cf4d4878cc483 +a77c094b8e983bff61251eb879523462 +c597217ceaf8e3eaafb5d96795f6905b +53177de08ddc05f1b6ef3bae1bff6c51 +d41d8cd98f00b204e9800998ecf8427e +97a8a6730314abda0d36d15bd89a82f9 +8e2fa162024524c0682de92c40bd1c98 +ece0e93eb2e5883915ed7d9217279d92 +63d3be7315f1216bcccadcef7e174a90 +c9d502e8c6730c238c176904dbf02127 +337f2c4053dfc60ad249e5e325c3fcc1 +736e60c6498a72aae23ef74450097b5c +57ce5106101cbfde84b66867aad7ca6a +5ceb980398f64b4bef1b27c039a18383 +e9ec73b30a8bf84c76ffa1d285b18640 +c3245a6e40f53873da20c7bf3f7a8fe4 +c4abd2192e089a53b444f3d816ae4ba0 +fc03f7572d44187bbf3615ef090ed9f8 +f290b89d98ef6af6572fc7b0af686bec +fb8205612e55efa78a7362100df145c7 +7c75667c367ef91f92d885c8036a2dd1 +7078308a69f8f75d3fa2989eb1bbb28c +d41d8cd98f00b204e9800998ecf8427e +602e7740b62640f4a9d365eca00f61b9 +09fe402465ace4327e2d1df594efbae4 +dd5191d0cc32442a939d3c730d8d7d1b +fbee8c1b708b3c8eefa477517ad6ac22 +45e04bed69a43493dcd175cef21cf0a1 +aeef2cfb73596bc60ce2ea7970dbb842 +9121129230a6b2a01a5aed288e7cfc8c +62f5853289420fd4e182426495469d91 +e184a3afbe1ebacf56169c18a0aa1940 +a5fbc7bad1bd76b96472cf9bf1aca7ff +525d42b2936e8b5e975d80ef64a0ad14 +674e516b693bf1378d4e575aed3a1b45 +130c16f3c9f16dc331e85735db299dc7 +5d3c4ab8057979dfe2d42993843330e4 +7914f2b356a89cb1f7fa3cb70473080d +a529f6bab62f052edca66ea7a75a159c +d21edbcdd48c96c213e5f2610303f5f5 +3cee85c2482e2c1e7bbdbc8d5c6ef071 +3ab74d2eb950af01880ca22068d363a0 +f48ee7afdeb40e4eae188b6546fe9533 +472247fb3ba328c09aecf722b6fb7226 +4396157e4134af62b899603bf84a877d +4e40a5ec58775faf3792dc510f7df5b4 +1b27d4f77e981b3edae64bcbb01b9493 +27bad79919d58ed5855d9ac21e727d75 +0e27ae61abdb240be729cc5993f9062c +8d5b8bc5b3fbfe717dc93f9c9534e0ac +ac937bad15c76147d0a5e8c48ebb2c54 +2f5762e8aeae63eaf7a8b559819d261f +540a36e7e0de6148cb49d74d75afafa1 +54f4abbf98e57af8f00292a8e765370d +0ec0ce7e452d38d2599fe2ceebe31a97 +4f1235e055ed7fd08a517c82890a7056 +ffd0834ea82997e0a01cd396e6380280 +7a3180c94c7ec6468d82fbf9e2a7279d +93257d0579174b017c3f6854a15797eb +19efc67c11487676713bb80de3fdd4a5 +782db9da8d2c41958c2795ec7d97505c +4a7a8e391f2781cf83b7aed0a00e9c46 +0001ddd02f5b61915e94ae9736f88057 
+1a36f75907f844ecb73d2975ae685027 +42a661c55b1494307cd67e73c1fee23b +006c830bb67d5b8dd61f87ddf9cd4607 +33c5972ba5e98a7fb9f2e05f9bf98c37 +56521902e28f564d2f9fe70f427a11b5 +7b12471d5d3fe3b1c6465b88c67811d6 +8f66c004e40f0d9544e8c7e5352d7f53 +cc4422362ef0f0d1fb0161482d717736 +7e87b58ba4e36ffb04d1b0a0e7eccdf0 +41e31d915f4b4fd7678c3ed39477d16d +a12530b07c6ad4f5e299cd4d66146719 +6e0c2fc9953fdb791256cf681dfa4e8d +f682236bf243aa17d08cd2269fbe0b42 +c6a1bee94729b38dde9f608af064ee91 +51f4cb206aee1b6086cca6cb88586481 +f55575c0e639b158dab393fc1a2a2821 +758a0cb5f25bd4002367bee44fd5904f +331524d6f29a189be6d03ba342d8f185 +72047dde350a4a4f69899fe1b6ff56a6 +e4e7cbf20011ba853d7f19d23031519d +99d2c1f0f42f8f5c57b07f864ecd654a +5eda3e801f710e77daf176957f7649b7 +8b279982513c2f892d600131d0801844 +5a9a8fff968a3f96c943af62b10e80a3 +c4ce736d97556dcc3c180ab3540b034c +d41d8cd98f00b204e9800998ecf8427e +0a956d71668ddbd22b595194c8ad06b4 +18c96622bb9ae1175450a06bfa01569c +4819b8dd8dfd0cb7e10021f3ea88e3fb +88f4c3384e5f65ad98e14a102208a5c8 +39d833cda2d4513f19d47caf26611e4d +9b190cc2d3d711b38da15e81eecef6b8 +4e218c68d13f9bbe04d20f5ff5f2961b +83cf4ecafdcd9d0b51602ca9e584f728 +62ecd647528b7a00d45d219c29c29efe +d01190db7d84360bf1f1f909b11b86b0 +5ba081c8022beff1e440c6e126ff217e +1cd4db0a832ee94c6faab490020de1e8 +eca3e3f52bffcde84df927712a74e7cc +7c370c0cc5e24f0a63267d57924fc4fc +4e0b3a94ba52d61d0a7981d70411db8e +4498373c6bb417f85522c916c5156f32 +e68c467fffaec526ed3004738e437a71 +de979200a23f2d610d6b00e6ead86301 +67e1a319475dd1d7f3fb78a3ffb55227 +7d0d013cf75d148c6ea0bb77c117a75e +5196423778cd8fded2c39a873578f536 +188da6655b338daf88537d8cb708b0de +d768b1ea05947d97bb55817d9e0782be +4b4bd89706b6366154b2c88d03cec3fa +24dff5acc7e4b89d47e4c690d931909f +8935b4d6d495a6488a7b2420c278c1c2 +993539ffa04dee088963276a79832a3e +2f033f1169473e7eb12f01b9fcac81ca +eba431e41b8a8c7d87327eaed4708a36 +0a0754abb47714d1921de1767a2180b0 +b3a6952034d88fc2a8c00d440255c5b8 +abb95454ba58261f1bd5d5d8d9a81d96 +98dcdaad52d04209d944cfbbb732e8c5 +b9440145a44f8303f92fa721c2fe7614 +f17933d9000f48122fcab89a9f8fa95c +7083ac1a76cfafa368e2ff8e99385cbc +38c9f4a086c9c7475d20809fa9df1cb1 +5d30368d1c795ef2f3ddb4cda3ecfabb +1cc055a881f479d6e9246f0defb3f436 +8202fccdf8485525dca0c27cbd1e251f +1a990a90f2261b6fe556d505e38385de +157bdedd1886a38e72437ba620020528 +6d259214baaf2bcfd8e9a5d09d1d1861 +ffa88c345f1816b454baa72df3fc31f3 +1611c6e835c52c2286f929e17a195d5f +3e222dd2c5fcdb5e9b8188c21cd1cf1a +276aeba3884b6fc1415570cebae0607e +aea03dd2ed2393348525329e4765e69c +50308bd790cf2a4a4bfa31ccc85379ea +af10189e5d7b4dc3b11276af992207d0 +de384d2987c8a1e863a951dc13624d35 +ca5fe74fe068f405215fc74c40b4e402 +2212cd96c055ebbff660a497750ca59c +72bd3776a666de5480d5eff4e21da294 +cd5363eab1ad2f627c109074188dee80 +2141f98d28158e19e4017a72b597feaf +0e5ef57b0b56755d11e3b562839ccefb +823727dd5b1367a8106d821b4f13a3f4 +637a436a679a0e5f749e5a6a7ff7e1dd +2806f28cc9d16c4785c0306be3e4f48e +9090cf67b72ade3bfbc598da4d6dc09e +9920f3580fe9392a0309d6bc0e0edc8d +36558a5d9838a175e2268217608dc0c2 +c790df6dd6f9789707668587414aa9c4 +f05c6b0c8dd7b3989bd088ec4c639cd1 +74f8c07f190e859a5394c110d96605f0 +789c8abe1f385d658832e75fad188f23 +4e909610790ab63cdada6d2767488ad2 +94e20482fb1b7433f5d56c27d3c89206 +c9337d8ac178ce47372ad9104113868f +a69edab4165e1e608867da7e15b226b8 +ec758e39b57632a35cbdf28dbd0218df +88c20a509f20d684d0cf8da47358f9ac +e3f6fcdee6f68ae154a53df59ae68eb7 +2e3b5e81c90a0914d1d278d8de0f1133 +31561abc5bcf6bf5e91aadcecdf2729e +c7b74ac88191a2038d420176195780b6 +2c1193e60fe6fb6c27d332d1382baa33 
+0540b3bcedcca3cac5e0ed93ac4d37b0 +e8135c0683f9c4ecaca16f7e6cb5f93e +33403cd90ac1ec29299b7d8be007eaf7 +385d310ba6a72db2d1fb5027c739694f +a895130283f4a81917a23fb048e054ad +10f6f6c3f1625309154b5f4e6be7cb0c +4d5213d93d524878e5e2f6d90ac21d28 +329f4b1b823b9225085028a7ccda77a1 +939c689d69e807bf4d4382a423396255 +c6bdb95ae39dd5d0c9da38eca8602b63 +10bc4ea1775cb5d89a36d6e1ca4b7c14 +1f3a44704481685dbafb3e579b5d3978 +d41d8cd98f00b204e9800998ecf8427e +969c1bc0a9de01494ab7d5a42b0a974a +b015f77e1523ca63adc14a3f1a2d5cbc +eb01e6972fd2d642689f5d3827fa6fb7 +1f2d06e041b9e480067e5b5c6b3fc21b +e4e8d7eeb7e515a7d87a8442353cb707 +c45cfdde702e22fe6e7a24a9ce39979b +6b692f0d6a7aec63c00d32eefd6466ab +80ca418df7604f6c251af921e21ca35f +c4324e510d50f74c978c61a385d45594 +86c0ad2fbc28a11d480fb59f9139bd66 +3bfde42cbb4c17d581209cbdae1e9d6f +f837a2bd92444e2111f1ea8df121fa0d +6b08a61a880849c935f4f9f69b22f52e +133191cb2a556d4592b004ae633be70b +92571c18fcbd5662d9065f9d4b05f0ba +a82f094baf7a006043cd355578c11034 +c4103df69877142fb49c8e3feb6eea46 +236306c7f43a1e78744fb9ab4ac6c896 +08c37e6f7d59ce6c6e4096455924bfdd +b2119e6f6409264ab2304567b6584c19 +c0feb91bdfb90dc0cdee121295a47b10 +a3a70b1f454fa517ff4fef07b6a3df5d +ac0f5c154abf85fb276f2c048c8cc833 +58b54c520a88decb53f53ef0ad29f4a5 +89c052b6d81c1f2fbd44f310f46bd400 +ff8de59f2b4833287a6f0e78370432fd +640d70c72f7f78180f033a7d2d20f41f +7145557ab368bce363ba9d468d0825a5 +a3fc07bbfbbb26c8a17e52f70193d5cc +8d8bdc542900f0fbe65d4df33bddb818 +7b605cd0ed194a386971a2c76a0242f0 +94b5b3502c6059dd169def9e803361bb +1ef34c3ba3579a518cdff9654b31ba5f +726e93b10942d29ddfc8ad8a5f95bcc7 +a357fdcd17f47785554ae92406f8333c +6082f0ae50c99b8ed7cf9952603498a5 +602f1e636abb9c1f0b709b4106752a8f +f573365f6d7c9038df90e3fed76023e8 +7553e6bc0733134f80af4c279b81e9fb +6a65982e062d713a725eac6ca3189d5c +5628da9b1d5abd89a70e34843109b0dc +e6c4c5b31ad7bc24caf88c97033496a3 +c09e8ea89bb7757a0dc78f264c43a022 +939866a3189c73506de64562af21ce8f +f5ec67daa330a2f60dd56295c0d7c0a7 +2532ad02034895b62d5e3f7656b99bd2 +e99355b3d5040da46f5053697f0afc66 +968e33d5e886db57328124b018d25cb7 +cbc17c50c806de645d3e7147f15b77be +fe7e27dfbaa37ddfdfaebc58514927f6 +ec639292ea827e4ddb65786aaf3f6c9e +c390fe77eb876cb08510e4e66890ce33 +abcb52fc056dd9f9d68a21aaaa483fd6 +459e01170a669df337cf3a7d0b9e152c +3129e9546ab643d10cf573e138c7f01a +f10b7cc9e7e77559007c3d6eb149a489 +f3167620ae797cf62cb7f055609de4f6 +b0d9aec585baa36397b5aebb62fc2c90 +586ad35f08acbdd5e6f16f9e9cca0591 +ac0dca788e0061140689a2a85b348aa2 +07725fcd3059147c94391addd7fc9036 +1c804a137877a543137cb46fe120fbed +1d54bc8bbdda19d44eaddc13153daeb9 +79692bde4e803c99028ed13b9a20c71c +e2dbb5cd68053dc53d14ad48590dc487 +6ba7baeedaa79d95a001cfe52989893a +4e89b539fb338d8fc68c851a4988c41e +868f74d4cf2764684ab49d689d5551ee +f6083ff32b5e32a42b5839b8b8bf359c +22663efd4a322036dbf23ea05cf0f3cd +bdd32885fcfe8cfe2f81546afff7251d +68a77f96764650f30398b11ff94f6a7c +7bd24b5c1124b1d703082d2c3d0a568b +74ada87bab26893ad01238ff854216c2 +beaa387024756852a656bb400f2bdfe0 +348b03bc7e83443e75d37135bf27867b +bed7918c8002e7f06e1547ed25cd6e48 +1cce866b69e92a40dcf4799a1bb24369 +7fe22101da301959eb94089c81d2c575 +494242a26be35edb02361104e6b23b90 +8592f675a816019f79fb7cc39907f493 +323d9b8ad0f423e4e32928db81eef159 +830296b674205b25dc10664bca927cc1 +5e405a0e9bdb78df171dcd60557e9196 +b73a7fe0f20f6e8da25280cf541adcbf +b2c5292b3445c3698d8c70fdc6ef9abe +e4bbb379b8df82271bedad79dd8af2b8 +40dda55b0ac7a1bda0068f8393a6b879 +b68dd7210a67eb778faf3affad3aedda +73806305de20dd837dceb0979d0a5ef2 +2cc2b2d79009859b9259af56c7a92404 
+34b5c98a316684bbd7f2e81be07f3c01 +3b9cc788d55f0bd1671cab7612f192a7 +54ab8e644bd148abff3eef20137d2a67 +2ecb6789ff278e498381ce9428a74689 +ca4bf67ae9e2fa7a11ef78ea75591726 +90dfe5ab3ddec301858be01c89218b76 +bc9f0bc319b27420eb2280315e47c9de +eb07bb5115d89568a1e2c07c73bf29e9 +ea88fcc9b7f7e07542eb86b745495a54 +901d3b336a6f91b504a76aa59ce9e8b0 +3eea2b0406e4238efe51b50ed07a8af2 +f9189b71eba04ce2a531d3be76445f53 +52ae47111cc1b6c49c6d74a5c85178a0 +fc102c1f58c8c261934b03bfd86a5860 +df573074b46388abf7a6d85311af0d31 +89a9bf16aa0fb4cb3ef2816ed84425e4 +b74160aeede8fe10ad82d7138408aa92 +a9c78f767baa8b6c47ffdb626142859f +05368b5964bc7240857b574768d35202 +d6091c9e9f468278dc35fd54bc33c890 +8c4ea3367866c4fb9956b65afbaa3a55 +8a0b4d4de91fcc4fac7b51a3242e2447 +6dd6b5b98b91c976450dd1255b91e220 +e28e60884b2c6b59209c93402cdea067 +ace7fd136d81977a8f5dcba91a9d3c07 +22e5289c6d955140ae835c698239333c +b728b50e5e25fc246f0780cfb6b2a53b +8005c464fbb83ea4bead9ef92dd6bd29 +72221006d93f64eddf8c0042e3a56fc9 +d8d17e2ee3fb09e259385dc4161a919d +2f35fe335846e630f8fb53101f272e34 +caddaa43eb08b6f90174c3e6629c7d5d +3b2b777c8fbad698524c7808c9b76d63 +88d0d95d33e0394b8d894db0b9b93a5b +eb9356302191c9d9112cc9e26a102679 +f239bb983942061876ad6cfdb4af597b +41bfae21bce313f7e72707231dcfecc9 +10331f5aa2ddc3ebed66669fb4c0f06f +d41e3711a573396097ba0df19a441676 +082377f124de597348da2aec4ebcb0f1 +b79f1d4fb660fb13fa8743d9bf6c1e05 +c1709c9b8a5f41ccbfea63faf8498921 +009b81d5384a3df7e6ccf5fc4d3dcc3d +c8de16f6b3f11e6b610103b260e32267 +24fb5cd80fe11627ddc1a0bed5e8ddba +6c666425a425e6e3b140587aff1ef0e5 +9110497cc09777eba1ca05f82674beed +e7165e5ca77fb2c5f6dd37ea91e622fe +0d3de6579739541f457d199f1f93e0c9 +301dbe3da485e3743d13295c4bedf15d +de81b2afebf99fc165e6f6a7d6e100b1 +6ddc7c4b4373cb7510ac6cff3d362502 +9576f0edc821d674187d13523d79db2b +c3cddf960e7b1754d2c3601529f4e40b +b760d08ecdda0fb899604b3821a5e71e +f6fce42e0ceff4d44af2ab6c3bc434e4 +8a9b9e58d746546f6785e0fe5c841df2 +7fc626cd03c6d66059cc6295312bf466 +e5df5c7bdfab5a873ad46209237392ac +b7df5d17a8ef9bb8c29d28c3059c8956 +d70a1122a2dbeed22bcbcc868a9d4aea +bbed738387f2c48d96da841cfe612e3f +9a85b189ce6adeaa4d0a243aac02a85f +fccbefeabc27788594873fef9f7f5c0b +bfc2a791fa9a6adb51730180d8fc4d61 +df75d55ae79d90848d331f043f46da94 +4e7159dc22fa7f60799dd41000e47b58 +5a6d47e536935fd706d691bda183e08d +4284c657e48111f5cbcf834e3098ecb3 +4f8dc069e8d15de56b0756f6b0a64369 +7bbed8a381752e1db15f6a1818c9d890 +760a82a6f45877e4ff82028ed39aa908 +c7fb41e35f07281c47fbc6905575dea8 +4a4f161085bb0e752562ddb729dadde6 +8a76184b236341f5f42ad77d2504160a +2ea2afc519ae405c531a7894392e87c3 +f220cbd294423b9088b8f9280d44afe8 +5811583721ce4a4c2f3867f8bbcdc076 +5bc961cdb6b0ff06ffd7ef680676093a +d5d4f7cecaefa9f5fa1875879f10b4ea +efc4ca77f73c22a36e87b2c61a23cf4d +1c32bbba50600b4593368736187f619a +574723b960f63c52da97f349e8cd3f10 +e8d6c8678844dc55c5dc6472e9eb0cdb +4cd93ecfa096a4ff8a329ed67baf801c +58953c92246b79fe5e490486b19ea633 +a03ba481174e489580c6c122e7d45d54 +2dce5cdd52c6c074bf06f73ba30c81c8 +a1fe90b526d4e5f4c673acfbe85e1d72 +642b02b33bef57d01c7698a8be86f8d3 +c5e2bd10071294a9d32bfb409a1e9c20 +e79604c478c0523724972a936ad6e69b +866773852c991da1c7707db2cf9132b4 +f6263a5dbb95f9d1b1d04f3caa6ba805 +2257ce94e17a1c80ddef1f303e28723e +11d4992010c407674ba7d6c6af83528f +c1eb277232d2596a2ceb62e466f2da36 +add917be60e4dba3dec4275b7b6de6fe +73ae40bd1760bc0f2c29b6013f587243 +7d114db66f6ca0edfe296310ddd629b7 +9a8a3d99e8a09bb0b3bd24d59413573e +1932ee0be6c412b30e1fa7cd114f2538 +e3e0fd4d7a9de1ae76be14a45a25510c +b6688db35a3f569182d039ac9247a172 
+3762c55474d0040df7524cf7862ae7ad +b2757464316ab8f8104f0d28e229cdf2 +14c1344ce2798817ebd66d5e8640071c +8f5b8422b03507635276a26a54e8e757 +a65c7e89c0700e2d7b2dcd3f4debba35 +75f696c7e8c829e8c34ece5a353ff6a0 +22ac583f463385bba74c420676c7c418 +029bc7b4cbae932623650368aa65e51c +c8f8c2d78f9bb8351b0d84402390996c +691b50819795db9595951249507297f4 +768333278e5b26173ea3446be597add0 +461297f1147867e59a28af8378bf7361 +d9ad6953146b97cbe57ecdfac7d0072b +b6ebda29dc94b7cd2513f859788895d7 +e6f2de877e433ff5af4d1d3b091d306c +296504fd41f47e159d769648c06ebdce +88068b367cab81753b72da2d14c66c93 +61a079a7f21cec1c90df3872475014a8 +ed894a80168234da9c3ed1fd1910882a +fd7d5114bb083109184f258c3d91733e +b32fed06a7a30c9458048b7fd6c7e1b6 +18167793391010422fdf013abf500e7b +926f648787ca65934ec85f1910589866 +f9a29ee6c973355f9a0d39839fb30b87 +68f8288f55ebedd8e091396004aea917 +4abfe4e85ca1c6936f6979ef3dfd8530 +e1da7c6855cc529e36ccb44dc7a49442 +6886688f2c612167306a0f1a115efa85 +4c96e85ee57bbd19ea962744454456de +b4e22391105612164e2fd215355a8673 +a4048424d891b6b1e1e699fdb774bb71 +2fe2a8ece0021b318fd98125ddd579b1 +fee5cd43dfc0f2ea3d7442951578a0df +fbd3afca19ff4ba98fca0c2198a65a83 +ed7e7308a031bbef2160f87fb86ce3e0 +36dcf70a8ed97a700a1df6cc99884893 +32422c1a69f95874e2a8ff46d622f71e +ed759b61993b77eda1b086ebdfffc0f9 +2f75c3975ebb05e63f1a44cce3510b3d +0c5140b4ea14a5c67908bcb0346fbf8f +6b7b1402606c01fc5c64036e8537a445 +5b4c2e5982a72b06aa1571a0aa7bbd8a +96fb6f21b885a10b469dcf5fa3ffa22b +7ae78ebeabe4fc9faff3f50839c7cca3 +5b72be0d44cef72feb7a3d3f0118e67c +84cdfd49a3fc7b2a167e2c6858584e23 +8da436dcd1c9363ec9d58ce407364c7c +712a3c44d440fb87df50253833522b43 +d41d8cd98f00b204e9800998ecf8427e +e3545446cd4af65fed53c322989e1324 +5110dde7769813e01c17deec258bf463 +c6af50c71c84cba5f5790fca923a43b8 +3c9741af154cd23a7c3f8839970d1fd4 +ed7e0a1b88527de2ebaa0feba81617d0 +55d1b2ef749bbb2ed43680fd9edb4453 +ba8cd0ba79694ae2c8c3801283684bd0 +bcbad1eedebf1f34e89a482d557b23f1 +da8bff83d559c6a3fff954b82eb59c21 +8c8282c9172c224b78a11eb2ed15d847 +ddac1fce45b90b41e5b5610e95eb5338 +0a1520f5cd592370aa92f24ada273c3c +3ee21d3d11e7e4b2575177d910f62355 +5acdf3b3bd900baab0705beb6656340e +22e71eb092855ce7dd85b3f1808cc36a +a39e29a3cff74431917ee870a01f5f9c +b46324e50da91d3b2075d806380b74f8 +a38a403d1db7c0ebd151ea063cf63cd0 +dffc39e8b7da5d2c1537df3bbec36c29 +fb005bcd9103415894bfb3b9522c6d06 +a17611775cc73e53812f3fb59b903cd7 +dee3106fb5be832fcb5cbe68f812267f +45fa998119db3f62fd75cfa6c060c7b8 +5c928921700ad05c97b7e5361f863fbe +79164b63f6fa36419f85d2b412eeb5c5 +808ba0d0ecbc4caab528ccf70ec32451 +787f76a37a8b700617448d39f07005e7 +580c709eb1fc3c91cfffa7cb29e27625 +cece116123523dbcfc8e30eb6b997aef +cd0a3975d363c6bae6cfd009d500dcda +ef62d7e27e5a610ae7294e4016b04a1f +973591ec4d6f10baf79743e3e8a361e1 +ee8781e78afebc6d9d7bf838d5ed9804 +064d84f4191b9962171b6e6e8c8b86c9 +729e550f0ef26be0f18013af0653540a +79c5b0849da364d4af61995dd45cb409 +2e0520a5df78f76de83ac5fba174f512 +8fe18a9490f176f6898f665ae2e9fc6f +584ce6ec4c35b3874cac3f066aca7626 +d41d8cd98f00b204e9800998ecf8427e +6b9de5fab007c4c0a9a51f000544ba98 +842e8880ae8747274f75b6a9115c10c6 +c2c6984c4237cae262f80dd4a9bae2a2 +fa3bb15742366a55462806a940119054 +617d8ae2a11f835724db24046810b82f +311a9f4c041fc0a56d29545de621d2b8 +020456a0e30ffe62f63bd38786a369bc +942a43ec06a1bfcfe5278c8e76fecb13 +e6f9e9296e655bf3456d523d776706ba +c51b15d81129d8948a8566cc8634ee3b +2dd02c79399e1f22497807fd5c75c443 +ebeb0e0b63855644cc87db254bda8f05 +5d0bb1e4fc9367f0b314ae3b3707764e +d74d4417447ddaa99a0c68a16e5c82ca +c603cb2131f1403d92904172decb1df1 
+bb985a7c5c102078a7824fe54d8bec68 +086287e38be751f51302078beb205d58 +4c66ace3d90918bb4eb53ec8957f8d13 +63a3bc9d017d90e358ef3174d288fd5a +0def4092b28bac86388f554d3a594f96 +813a69929b73ebc701a94b4c109e7d4d +e1e04e7aae11e21ae99b4e425aa4afdc +81c2d674999b4f589acf4cf8d9757df1 +1deec4fac809b4191945b8f345b22c9b +615d28774960191a17e63dc9120b1424 +5ec0f861876a67b28ded6add3459023d +65ba21ad8539b8c31e68f57b2108b392 +2fce06e38cb4621fa25d6464887576ba +e513d214e6af0d0cb60d79a9b07ce945 +594048eccc11350942358b41dfbe78f8 +117ec30643a8170d058cc4581dfa5f63 +ee1106edb0be8cdb8245cb10af0f9a3f +686ea323eb2858db9ce088a021993bb7 +0ec41cfeed12bb4a766f58bcb7d50a86 +995d7fa89f2623cba76ac418caa35a03 +d8de0913476a0afe2e927456ea5e10a4 +7c7f0b615d18d97a3a0890b4e10e8f3c +d2141580c54c790d29f11ea85417fab4 +6abf4f3c45de08fbe2968dca3fb98ec4 +f7806bd589b2681fec6c784c81a8fe7a +542a5ec70662c4aa20614ab460b9ef61 +20137c05bea6a6c36b01fa30dbb40df4 +8894d7ef26d046fbbf8e449caa150cc4 +e889fa69255ceb5c723dbc82e1b540f4 +476e8b8b7d77a9c2c40fe601ea6b080f +d41d8cd98f00b204e9800998ecf8427e +19284825ed85f92e90407850fabe34ac +c15f2b924a498407a4c402c78c09c890 +86ab3fbaff80bfc68969401c8c390567 +84a658d3813aee7fa910ac9af539e041 +767c4703e78253157e0e479ebb424108 +4ab537ea120f531b5bcc7606a504e6a3 +869b2e2ff2a108bcd0079c10851dddee +ca3118603183d2c24b4981782d710d25 +7626834f36e25e71695d1d1f223ab532 +0794d28cb5ef942046013a6ca207840b +61380866738d2c67c3efd5d5268ec90a +e95f361ba68fbca06c51e06c18949df6 +e3817734c23da4e7056a6b82bc5691a5 +19e75579adf38b6d5958b8dbbbe7ce45 +8a1ee8d04032f78f390c1c4692c20de5 +bc09a2f12d47c91684a59cc96035e8cc +a3f1fe3c6b7d7e52f2bad0782e4bda6d +6519c37fd554e9e8ad02205784726f4e +5c5782eb3cf8d4fa57ca961e77f0481c +1687d7e39483d9b1e5410c13cbf68ad3 +3354f7050da5409c202ae9533a148921 +b28a73f020194a630a17caf31a19bd5e +82d64b3352e32e3fbc347f28a5e4b7ef +3fe14324442510aa58c73e7096771a98 +dc64b980cc0c3d3aaad2ab07d91ed1f7 +0afa8dcc227f5ed32fc928512e34dcb6 +f7fb6594c4b255f1e6b9e2fc71b31cbe +f0c60ba0a1fa5a0a6a36257545b85f17 +4a4ef6b4070f9c899b31a001bfe32a0b +b3250e2e75ae5d29ce66fc0c0ec6f6fa +1e3f7abcfb294650e2ffc63ce8870b79 +03fbe0e3473185fd2632cd42ca3ff636 +50655bef521e22835af34f1fb06b1390 +dccb6a898a49fd91b4e9b6f227cb1834 +7f04308510cbd32d07834677af7ed17e +25ea2db382b03216a293ec512d6e45fb +27ecd173a8d57124bb4c0e2b9d3847ee +d91fb37bba1aa1ce4c18884b0e01005a +ec8d3571eb9f4a74ae9e81d3827ab50e +73417850390f05c3ff886abaf405145d +e3e08137d355060a374004f512ce4180 +73e7a512592f1535addf9af65fd34ec0 +e980689450ede8f2d1218a730cb1ecd5 +642f369f7f55dfd38d33b3f61bbd9d21 +ea6efba8f88a29a81a8b52b7b18091a7 +b49821fd5879e082445162ad8ec4f6d6 +cf93fe580df516fbaa0ebf115bd7df23 +90ac0487d268a52b95ed1e629f5cbab5 +8f246926207507ae5afd44aedabcfa57 +a762ed5322d64866a896dec6160a768a +6f5a8ad64e9db1c62dfd349627028ca2 +b89186f62f7fc6d6958e20da0785d44b +872c76b95a80ebff03e68fdf075af38d +7453c78c2d290a71567d5f3f2259ca8a +81c6e29ec04cdd94145c5177125a26a9 +5238c613a901067c1e747e2296dd0fe8 +c41befc8de622b1d5bec875739216bc6 +6a8952e9ab762148ef6fcf1e2b6a07cc +96efc7a03c87624fa400a2d5f7d9af5d +2fb790059699a8255d15a09e687d9d3e +6b98f5b9125b41b6045edf3068667b9f +51b0a4714e70536276cda9d02bce4b1a +a6da6c895528f9923ba0bde1438526a1 +271f4ab4dc651996ad4da1cb1ddce8f9 +54be80e9ce779dc23540b725b4ff2dcf +462bc79b6584f19581f433042f761505 +ae5a3d39fb4aae4376af0eee146e1bba +d3094bed037ed34346528da0315d60c7 +0a30b84c8165eb7f173534b27592c153 +65681f866a47997ac0a5c0867f36e010 +5dbde3ccd0b34c20f460cdbe7c0a1515 +c45328779299bfe8ba0939b3219bfe43 +4477ebb38e4663d45d257b9121b344c6 
+4dad8edd9115271f52411ef353cc937c +0652090a9fb3067ecff144f2e17ea26e +2f16d9179bd485da39a488e98ad62984 +e210eb2f0543c5a49fe6cfd6fc536c41 +f8b7f0d1521c3b3b047049725b371f0e +35e1a99fd8353dce1152d358787b1ab3 +bab7870b5dc731ab624847ceabad20a0 +7cb1bfe202dc5efff5fdaf4b4c96a24b +b6acfb6bf335a25082898900aa0425f2 +e8aadd1ebb4557cd59c06bf2dbb16007 +b370e946f7c4f03fd8d2d462da7ec7bc +8cbfaebfd05e5fc13edf4ba28c75b87c +971dada654f118021701c4a04cbb3a29 +7d681d8d027e95d328f851a4e493c830 +35506360400b4f21a2db1fac7e467ccc +b33243b6212317a2fad35e2a4fd5938b +58647860b27ff13d8d02cd5110bd6841 +51d4367783c20bc57ba01a090621da8c +2513e413a528c41d5cef96fe10257a36 +38d5bd432aa379728c94e18cb6e26883 +3f882f184408a428b3a69ac678d958f7 +52acc6fc1264bc9e8d129cb786df1160 +25c267a0b466556c2b35b3aecd5c6235 +795fcf771e02c98764c5d43524341796 +0a83e7f102b54a328aa241e12d925315 +756268e231946404abe982956235b0c7 +fe4f28703f73568456e1a1beb32a5e88 +c25ac11e39a207e1ddf9ff65f37ab426 +8a35b2ed4e56da50e938810422c7f560 +ef8c45517a739fa5a2bbcd4e69d957cd +c4f1ccbe2f86a0ab189f8242ee1bc511 +e46d367d815a108746db9a87fb98aeca +3cfeddd2b3e63b4b2d3ebf1e4fdcb212 +ffa4f40292498dd18efb5312dc81e5a8 +94e2bce96fe1e5b1f704ded0ef969e52 +81581066f0c7b7f59e381abec7b0710f +d6b92fefc325251878685a9a1301a5b4 +14a0d48fa771f3d682a4be43a131b976 +f5508adc859920a03b24655645e1a737 +a138ba6ad42f6b8273cf097eca56d6fc +20307a053780870198081db97997b3fa +6285d4abc7e854e475570efa0149b92d +5ec212656a99ea2b30106c713be528bf +540d180db2d41eb3120081c9662038e8 +8bed65992eac4ebeace7a0a00bc1f137 +6a60ad795eeccd5dd383ddd5d366b849 +eea312dc49acbd9fdcd70baa4603ef3b +ac40332e9b28156cb82a894f70f3b914 +75d4ba712a5dcc1d7123e38e8a7474c7 +74a4ab216048a6d64ceffe0f92f90287 +6b54dbf1ef40b53e836971a1900bea34 +0aef067780fe62c9115d9a395450b6aa +99d19f78ac3ad06469791c5e60be597e +93b3f6e78232791ab215c4d092d40db6 +67acb553241a9cea0cda68ab9bc8abf8 +2b9f2d6c552ef123c685c73bd61dcddc +534113ec3d88360918cb80e7781aaebd +4549c2dd51ffa5c86385e35a65d3889b +5684e227d88fb1e4209122e70755ba66 +a0be555408b5a03b857b86ab12d8f8cb +5d7df8424f0cd9227208f454ac38f04a +afd1e90053bed1e205d8caf55a114911 +a06c2f79b18577e369bf515ef840805d +5c85f81c36b99cb21c2ce6d52fae0954 +acef97a0ba258c04bdc78c7933f9d12b +55ff7d29e9cc2a9cae7e117445ec2af2 +ac040b253196f2f71fafa0e12c0dda94 +b702d4f72e34ae46049ca38fb35ae5e6 +6adacefdfabeea71f77a65da1c012870 +fe2e0daea77284283021fd379430ecc2 +c15649a50a9494cc0ad553e061d5b2fa +6de19f30ccf186d6cba8af2b47427c5d +8b77ce4b5642a295c861979695ef8ae6 +2cffe11861213c771a3c6495a1c6d8d4 +819aaca44ab95835659d576caa575b39 +e41e3d8f2685a64be4a5e649adf6b547 +8a87bfe5a44452b86589f8312d168a2b +7ec3cfc808ec31762686f4a855c20e70 +3ebb187c99ddca3ac21520f17e2c76de +bee6e08f02072b48c303b801350b3b3a +2ff0a5522798effca1f9c4044a6a6e3f +dc28e3ccd5338c7962f7466f634eb615 +232181ec82f80997e57526353cf258f0 +63c2e8793f6ab37201635e46ba951a11 +55026c183dca75c30f22973c171fa91e +169c11e7d35758125de32b2cbaca11d4 +4a02ab882a8fe258366eb47d869dd505 +d8f8439ea73f4a9b67108165af8cc6a2 +e60df20791eda7e41108208baf20fa2d +75c9738f0bfad7306dc46ede7ffbb3db +2347baa7494881de2265b48f4df043d4 +cd73cb4cd1d2ee9a3a3fecbecf6507c2 +eb60ca3dafb4061cfaf17b77822d644a +4076caa4ed51e45eef0a49f7ed0c7be7 +4f2b73bbdb6feba30c021fed162e3230 +612d24bf39b0d0b82e48371bfd0c7bc9 +b89078e40ca35f1a59022333360d5ab3 +b2b0c55681b840c1f6ad76a57cd2908b +9b7c65fed3dd5394ebb66bc3199cbd97 +a6fe526eddd29f025ae10095d31af6ed +7c4b1063d360c7bfdc5abb0ba803c924 +b86e810ccb3a07ee03941f15837bfb35 +09952e69da88b8bc814d6420095d49ea +9674ab992d6bf5e1950e56f0bb3e7795 
+b2d0e03f153cf617447e1711191be346 +513d1d21e76c6f3d7a26d1937864ca09 +b7ae441389fa5bf1a749a444417bb89d +f80ba0c3bcd685ebea6f3c1776ff264b +36b47cce3927d0335267de9dd44af37a +570a4ee8758a25856d67cce0b8764a83 +2e0f6e47a235693317907abf21b61f25 +8adcf9d09871466ac68384b261908cde +95c61aab39dee746d5fa9d141dc533eb +bcfb8f69331dbb931dded5225827727e +16fec16abe4aa240ca63e812e012a5be +d140f4cabafcfa2e60f2c3cd47b947b1 +ff9a2e2fa9acaef7b1e82728401d0649 +cf1cf3375b1c6ac27d7cb424159fbb11 +d2a365c26504fce9e766b1b29fdfef65 +a1b3188553b656f24fcefc9d9f2e6541 +04aa7e80753cfb39eb8111e6e08e5675 +da02b12c62159b4cbea11a38cfcc5236 +7a3a6388503765d7b8e9cb1aa4be8eb0 +cb01efea76ca47791e0386c753249fc9 +bafd870edb3bb192c797d71a4ae5858b +8a89f7ffc91fabd9381630125b980cf5 +cba6a04deaf91360d4bd65b81e3e9f23 +4561b4348d3c3b0bd506d460b0da9f6e +d6f890f3d67d5ef924e893c7188618d4 +88abce1ddfa228379e23ed0e2479c679 +c10e344bda762585faaf5142e58de8aa +165cffcf5cb145ec2e46ceb950c27450 +024c334a457bce0366205658e90a75b9 +184b3c99f1ad68a2dc0a119133cdc4dc +46945aab6d2536490a77ba1cd0f645cd +33b8f33af56b1466969bc5789784a3a0 +23e84f72bc6062841c6c3b756e601a32 +ebad2fd5cc121d37f489c8f46555ff68 +4d72e592e01b1aa0616c4cda47076d78 +c7f7fa64bd28d2cf9b19900fc8a50076 +3ef5d9f053d91b6467a7bbac463fe14a +edd3d1dbb0f15780c8ad5056e523bda9 +363f90810453835f0f66a01a2eae9baf +9a609ed9858a3aa97632605ce86314f1 +aa43b3ba38888edcfabe6e76b3ee64d5 +7416e404161d5efae4aabe2250a856e8 +4547fa0c5f8e3ad986b5c7c1bf661976 +4d40e22ff16d0ff2be14b8ce5974afd4 +c785a9ddb0523622cb293f7281fb647c +6be8a533391fb5f33a32a1801b6f1b60 +e188d0f31dd187076d06dcd0c47e517f +ca1176d23e0a248384c71b7e51971db7 +3361617251b5346c57a53dd38dfc15ef +024e3441fa9a2838033c6ce703e38d91 +83279e3142acadec21b235d94a510e49 +7c28a998a25c5954c80f6b365d42529d +6f7378914b9150c08c50e6887f03f827 +d251e908b5680302bf2eadd34d2fabfd +40dcf1dd3f72a1f1574d0deba11a630e +9e7ca534d5b3c246548ed0c4c45ff35d +e09cc8a1ca1bab3709082eabc53818c5 +78a3412250128d889c7d5a5b3d700152 +506f3f610c246d9f9aae0d41ed326ce2 +7cbed69b3e41f80dad4836322d173690 +fd9a3adc60b669a42ecc2dbb52cc4461 +7fab4b1e803723ebbbf893fcacab7058 +9c58a6ba8d0aaebffc4ea9fb6b028eb2 +1d30a0da4a109ef22fc1e93f0ee4c420 +2daf73157524150f4e332577d9262882 +9745a3b08982e1d6913d6ebdf2ff4d69 +d4b7ce0fb96365f9bc16ef62b694d259 +b8660c67030ea79dd9b9b186d01116f9 +a875706c97466642c4350cb802e39026 +7af78c263d4e0d2c9d52d23d4c8fbd70 +42c6842acb75901171f1fe6c4ac0bd60 +7403c7ed9390f128605aac7541819157 +e30c3264011b4ca431653713bb6ccec4 +f78403119d29b807a8dab4ec8b949d48 +724ea9d5bbd3723b2a24925c5b2a84f8 +752ff23317ced2be296951b33de6058c +b430dccf182ff97fbb784f3a21b05a36 +e548610d655c60de0b210f51867f3344 +69dbcce1ff8043d87adfc224c079cd7f +04f5223155b1499b21ba535f133b6dc5 +1f3024d03f530dc8e926e3fcfb6c78c4 +5b8280768d04f7da5430a2a46ab771b3 +50ccd2eb5215ed0aab7abe56afb9e3cc +475b9c4f2653c6903f3bddbd203084e0 +f0603a57251a01f3a875fae78a7f63f5 +a8a4d9710bcc53b5eb0f38d0152abb81 +559c455609b8e56df1b7d48f69232b8a +c7883e04223dda808756d2ddc533007b +421e6dfc0696764bb9ec9567790303d2 +2d83be94d9aaeac57bf91b06a1efbc68 +e1a903ac72d7764fe4eb5c6afe21256d +ec2ca25d57fa63b071990939cbf2ec56 +d41d8cd98f00b204e9800998ecf8427e +a574d23d3596e1b4ab1d282975aad638 +9719e67bc20e812f051aecb58afd46ae +589a8e554b66cdc0ea8e86c7e32c77cc +7428cb8bb43b574626dc9b0d3b79823b +720735a866d9ae080b5c01c993a6dc30 +541ff42e454d6b53a2d15889b303ff38 +7172ccf127e51039245bfe7c92276bef +76ce2c185071d05918b8c87bcedde183 +4f0ff8078089d328bdd9019976e34291 +6fb6fc10998fa81baee1088156ad9e9e +c016a915a9587e21242528edfd78ef44 
+a1ae3c5f480435b98938f257c0874f3a +343ff050d2d172596d377126b46fcff0 +5687149f81695ef2c6ee6f625b9eb1cf +64db42623fffd5d948d26d2f44eb92dc +0ba76415e1b9dd96dc84a553d1314efc +967c8c410e19c0afd862cefd55311662 +e17cfcc4d85470d9042c23ba99df86dc +eed8b673f96f13b31b7e2e421657c040 +2c4f3aafb774b823cfe51ce6c268a0fd +cf3f96943dbc0ffa9d6b13ef82ff270b +f72742df058f1d1d3c2a517763f2e775 +af8f23c15c0cfacead6bab0a396ffe5f +6c4175f49238593308d2341669fcd69e +24150ddde13e957562d975759e3b68e0 +efc491af74f7158739cd4f5e638c42dc +b7051038eccc2802f0b1d465bd7224ba +03366bb827ff250571d94a3a7812051f +1fb2ce8337bdd76f2aa9dcf49358ca70 +3747897faae3b8a170783f510a8a4b17 +04d47a20af1e41ed9d52bdb1a09e7ae7 +3c2aafa7f28bbc837d886c5b5baeb2bd +9b9824cf0d4f19db39b620409855f2a1 +13cf34337971081ad3eef9efc2510ccc +bc1917a7f57908c7ce93578e44070cd1 +a8f3ec3cf2b4ef34e8be6ed8c373a227 +318be22b280c5335b0e59e2389e3f591 +ec3c066867fb324f4dc1774a824d297a +55415fd578f99e0fd4fa997e57364b7d +f751ecd31998f5caaf6eec0f1c7b20a0 +e92b5b59ee64b86817021e66d73b8719 +d06fff632ef7c699c43535d5619745f8 +04f13dc3184cca1e9de4b335c0a8d03e +7c12ce0c29850af19777354f057ecdff +8393933f8c748dd3b03ec76fa12284a7 +fdc798b00fcada2d7d33b1c466b5d462 +5238a9ba8c549eb107a2b994ac928f73 +34f8363a64cc4c8614539676402403bd +2fd7853500be96714d93fce194e794b7 +7a305f7f0cf03309a1040344df348619 +b2e8aa817970ad36b66474a1498a77d1 +ce8038cc94ffbd0ef559183708abc73d +0e9df29e8091a01b8f5bd8196d1e38ba +25fd4b906cf1461730f08031c2b9361d +928303fdc1427fdb3649a6acb4a0ec77 +8c254e59d91daf0d7edb1fcf941e7678 +84d090500c38a4304617315ffb2956f5 +f32ff7d407a2265bfe7a6f164d555e06 +5f39b662466be3ab17fc4d5bc7cf1217 +a6e849b44a7cb0aacb4ee5562911d2c0 +be59d844147d68dd863903647ec23cec +29dc50707af80637bd8621a92c4f1903 +3ec1536df49c6ff41807d2da5be3033b +9f8f15e6645b95c5ef2c8a710bd1b403 +661b62d0975731f3b3eefaff3142a32f +412635f60376ecd534dc022b0692dfed +49b2cca2605cd72705d04fbac09fac28 +ad980fae2c268f4424041953678b88bf +b2313cfbf70f3b9e5430e2daeb4ee828 +99c16d82c4ed81a74bd111ef19620159 +b2fb903d02ba9623c8db14b5b51b4a15 +454d1298fe8762765ad251a10e35c4b0 +5bb6a6bc3e2606323c36060eb954606e +35e93d8ac68cbea85282c9546c80d13f +2a5c9ee0b0587fa54253b96b464a2f07 +e61eb3042caef7a7519b7b60790ca6b8 +d14e5c22c0bca438139a86df71487557 +8bb1f73b81c52ed872a04f20da256164 +3e0b6d311054f6e68f582dcf00b5ca67 +ffc386589a93b394e2b5be9800d24042 +b0facec33d1f61fba7638523b9bed212 +4c4f18783a575b92fb0943030d90d2d4 +9856beb4c721afb7e00e67affc10711d +18b3da1e4d9157ab6e9706a9d88d2d9f +b96ddde13bfeb6822e3512fb335748d4 +191c3e02edb1d753f490a411b251ce3a +3eb1c8cbb4c3ccad43fa5de198490dc4 +b325176ff0bb67ac5ef6e9862fea28ba +de3d0d218abb4103bc2d11057cb94803 +c7e502772e79803383b2d5d706523f32 +ba180213e38aa932320416cdba7a2921 +b2d622ea6af1636f897f6f919099fd88 +d73726cd47a86af5cf0ac2ae08aa29ab +cbba3629ff6f8818eab47391855fabde +05b909b9c733cf01784e32bcdbf420fe +86ac9e85a1be146a17df81028f348de5 +7797d0eacc831e34192c37963b797383 +0f15420c0792ed6c04461fafef1920e5 +4190aad8f52106154452d4987a2471da +729ecc9e7929500d34a33111e50997fc +e28035446408171100922a39ecffdfed +bd55bdf914330376f692c97fb35e86cb +80d09e1a4cf708cedd4b7b3bf7ae56b9 +4791d44a5913907384e25a3b667c67fb +7d65b2cf38a9671fc87aaa3e583388bc +edb3cd668b9456b1f96f3f15818105fa +8e729f986138b1376c83c27611f8d5ad +9cf4c53b7ca1efd444a05368182693ed +a88556f2f78fca0d7d9f4daba8eeb72d +bb89a5a13aab5223e9cf9a3c7c937e6c +6f85277a8615ade68bfedf2cefafd483 +c7b51e4860d2cda25d24e26c5baacdb3 +6e0793b24bf5ee23a2f02c4af6707888 +820bd6760b8553d84ed541cef13aa492 +17420c643906bc71f48a1f9e4129f54c 
+248a1b5bf9b846695f67facf31c969a8 +4406f8e13838a88d9137aaf90d4784c4 +94963c247437a2a0321b9f3122b89ee7 +e6998474cb54233b455cd43b4d0d6b1e +eb0cd26a84332fd8aad2c97dab602519 +2e8e6df51c063e513a8610d997c7ae75 +67fbbe89ffb34c680ba3e290ffbb4662 +6e7e6dd83e1934f492b48e809796292c +0ea751fb39bfc32fbb6a80e8a5cdce40 +eebf2816b3173db9f71db4c57aec4f5d +0a7ece842a143a88803f02d33416585d +fe00f24c9cddc3e591ab0dfad940fa42 +ad93377138d952ef9447917416ad7f80 +6588a485f91255dbe04127ea3eeabdea +cfa45db1b095d64786f413aab5db97b3 +24bdfa902ffd0f10d8ec1fcb76ece6da +fda45cc5b7c2345d9db1cdfa34bf77e8 +d0dfd207b651e4b89085cab9fc76a13d +4cb354f2ddb5e52849130a8d08fb1b3e +fbb6013ed3bd4cede1f5c13c5d74fbab +5c4269d258b4ef70265319a91aa49e3d +0ffd7a752460957cc998c4932e51f31f +b45bfec7cf02e5d7c26cd4f20ed6c02f +3655265d5d7cca8c4740c39ff6a9de7a +16823ee3c22cbe522fd60286e8eacef0 +5d8e0926ce0ff9c7dbf2e8d100a141be +3e6ca19ee3d3d1f538dc09c7edf2ec32 +4b9bd5af359247fb5c73c5f1c25f672a +e023eeaec2e6ad3c869a2bd2823dfd5d +f40a4adcb4be700d51cb1955c19b566b +dcf90197cbbdaa3279b1aace86491e04 +f6905c9ed93b57393947452558e10031 +111c39a99781c5b61ad4a754ac72715c +63493d9fd03a39e282986ca636201086 +ac224b50b3915b3ae8dd696efec6f41c +874c7ca93c05288aa9eeb97ddf65582c +8ed6dd7337a5d652eff78b22f6473410 +eb258e4afb915492adc7d69e17d3f088 +1f428e265d734a7ea228dbeb5dcb98cb +39eb2c02f77ab8455007ebeb055034d3 +bf70179ec85506e6295a411760a08d66 +2fff297f8f68665b78e7356916866889 +81ef76f2946e59e4fb4622a813f8fcce +bb2bb6ad6e689c378d00d93e828bc780 +4f99005dc924bb17e338f7e7a9fb562f +c5a729e225658d9effb9a99c52f33006 +8483349fb5ff09c5978703adba3727a3 +50191f7e96139ed9dd2a4ade41d423c9 +fcad5ef757a5be8f05e15f5a787907c4 +095b64f78ce8bf5956f84aaf191595f5 +f9a1cc194e80cb6715ee61164d3dfa4f +5382358f3941e801f2e266ebb224e3c2 +c2558eafb98c5ec5d3cefeab3cff325e +92349774488d804e96fa8fa2eed325a3 +2afc0190b90cea14c5a716d571a0a139 +786349ae1cb20f6e05f2bd8f1d5c6dab +837700ff5d6207ef227d649d945997ff +e52e567179ff7c6c8d1090960d69450f +afad950cc5b4513d5b90b03eddc30339 +07a7a1c94e40ddc1ead7813d0afd54a0 +23dfebf8fec0b9eef2811a3df1fd6e0c +ce1bab72b4b737c60fd09168840569e4 +dbd54915af3d9e9b22453b1d4a7a1c10 +3c26926b84765da1e234116b65d64512 +4ee63cc922a581adc17241569e90dc9a +68891912130a823f0c87fdb3354c1dc6 +cc1ff896a960097a2d900f522166ae9f +84f6bb11a88e6e5cec616e570218f935 +85cb14681e62ec5505c774734efd5dc8 +c76e6e63795d2e0863e5ff4f6c047b54 +0c965abc895cb60422d53e6c062ddd88 +e5b8a8de65abadfe9cf578ad77a95a2e +60a7a9e345b9c2365160a377f48f1c35 +b01b05d61fabe4f6fdf3eeb5b8b180b9 +e08732ac999c9d62e77ab31bf359e400 +46f0d604d8dc8c867a62216d20a89ef9 +5626278ba823c38157c4a7c416b65478 +91b96623501cb7599e528915d1321eab +cdcbfb987955ad93711f22f77972af9c +ab81e7b51e1c2b0d579edb2098432fdf +b391dca4260cc0090e9d4afa1daa4de6 +fe0acacbaf1a778e40b30e06ea0b013d +5e1ce04ee017cd441e7fb7b173d1e70c +7163c7c186a38e5e1f08d53dfad84667 +7faa2d63d58d20dd39de141f78f1c63d +74bf8cad064462e43aaf9c6803708786 +a8ca117e116e50892eca2b2c0da31fff +62a86d7ead8e934e64233f96b1219dc4 +ba6be1e5a91298cfe7bf75e5f88a6084 +eceeecbfd9a9c128539503305c110908 +ac68d49138187eb2915d291723273529 +c6ba3ef85ea436bd45a6cefd46608662 +99438c6e987b9205948cd9b882a0b08a +2515fe0a59c0feb43363751929f1d827 +a14a75cadd2eeed824fdde5a97e2442d +c1b62df20bea02a5e8504d251589696b +4270095fbc8779e8bca38436bb630df9 +3efc5ae6d8cf98ac3b319265f40908a5 +e902d74a3a7aa5ce6b8bc1bfd6a8d5cc +fc467d126c2cb5239933b3a74dd956fc +df8f375c7be629ff2186f2f58edc1dcb +1c0d245d5b8b35279632ff40f33c332c +1673de403d79010676b5d92c75fd204e +8f3f797a56d38f671fc8d1a5a7217959 
+b76258cf8faaf7b35486df771b70e72c +9e29ca0e6e7208674b5f235b17b46d39 +9f0322af541df599c111210d3286be38 +560091095c67d59859a8f2b575b6c2df +badc9f1f747b69f4f7a8eed97ebf2df8 +d35f21290c9fa01686d05525fbc0a18a +cd7d4912aa5fba4ce5390eac051f8b75 +f52d029ba4ebe2aa3a5f5eeeda0d455b +382e88010e26d8fb5a260061a66d86b9 +28ef7bfdbf48b818d9dd837e2606fee3 +cbde3f1d4520675b2506422114ef341a +64a8d8faadefdc6e350e8ecbbc722131 +e2eaa5d498d40d77c35a09ecd09dfdac +b85db1f1878a4940bdfac1b34c6d4e7f +c0fd40cb560e2b0499a2d167d8a755fa +fe34b6d6e1f948aff884f1f7006f10b6 +d092976882a4c6cc30eddfe1e41422c7 +4a6abeb51c4bd5a119b7fb62c5394c53 +011fb657a74c41a86b47f83ec1947b86 +71c8d0adf6fc92d638592627c6e44841 +6e1f4db622d185d1c727afcfac36e7b8 +457924a1442ffa5cfb91a38dcbead9d8 +29f692f0284a61c171df55d9f111fbbc +5365850c218bf59d9bd746e7b7421d29 +4cc462aad936034753fa7a377ccf8512 +b870d627975c2b958bcec5a6e911d86c +54846205cd855dade0e1e0a44d6ce3a5 +5961e3108669be833aaacd95f339ed78 +3848909e4a48662eece0d95e0f141510 +8dc98bbe7c5294ea918b442f1943f1e6 +86341d6698b7749cf9ce8f2fe87da6f3 +03ac44a68d8db04fdd807a80f6a59446 +d4e7b17fb6484df71d96863c48ad4a75 +fc83adbd8cb9872c22e8fa36156dd45f +61408e3455428ded0778dba8a0e3887d +be7073e377eae061f79b805f51bc467b +e20c856e0c403f4bf94da58c3f019a68 +75ed291b640df9fcfbdd3bbbd9ee82f6 +2c15039bc9f13c1828905e63a2d45566 +dbbd55337db640c0369d2fb21c151677 +00897c5f59507348e44037e3721e50f3 +1b80acbd68a10e697c86dfa67dd05f1e +bd5ccdada6054f21332b566cce86045a +3d8ca37b350320d5ef2c34e89c3b7cc2 +8b13ec7071557461c93bbd9327d424ef +8963ce38b7e11b2a543d9176eb169ca8 +220e601168d5b2339d1cc6dcf3589a7b +56a1e906a6494cb17302bb2ed0615894 +ba05fe64fd2fbc5741f5b674c0273cfb +fada6568fce466ed5b1930c51c9327e9 +75963a4f096608b8ac5d79b4d2dc2503 +e76a00a72537065b97a82abf2261a0dc +3006f508b865f2af8e48c19e87aa551f +a45a3bf3a634b3c5df356324c599f5f6 +8ccd66639ed4dcc90bbc177e2229149f +ba296ebdbb9da36882246d97a3c9ece4 +c66595aecf06f46de17d2467a1ef0869 +f8bce5d21d4cd535e60369969180813a +2f60a287c964303eac8d30ee42776441 +11ea0146c55d8ddcf5c9e563af56ed78 +2a0e3aed6faca24e9be06cf585ed1ba0 +b536c082a8b74ee1f84ca947e2e88954 +d19f5d6e9ed1ca7d1bbccfe02b9052af +8377cef920bf50987269703dae23739d +a83aa7ff0e0873dd00439da960301280 +880280f09094f4229dbc929f967920f2 +3b9d76129e9328515ce39e728a8b180f +8127ec2ed7d6bd1a79c55b45e9b3e604 +231c81d3bb7e82c7efb2afd0be2e5162 +268106c36b0a068ab748535af7fcd84a +913f2a5a2a286afb3aa81b408fd5d0b5 +5c56d7e40a06f0e4fc79b7e9e3b714d4 +e827c4660b42c4cf83c0118e714051c8 +bf28a2b5c0fb2aeb1f1f1ef2975004bd +8a6a7fded4e9ce0a32df35d10c0ac3c5 +00055af6fb33463c16aa806a680f195a +a5452ba985b7999ef13a58c959e729a5 +9e34828fd2cbf34aa87375d63c4cecd5 +fae927d76d9f998d9988ae2ce71802c2 +6beecb4c3839bb79969ee0ea4ddaa17d +de344436819af8221d408bdce462a013 +d560afa6987114b88234c799f4fecae8 +1ddebbb9a35c9005eab88991ba9e9d25 +9315bcf7a50db259105845f76e5781bb +9a8fc373cd8be213e6d027af216f4a54 +37ea07f0994970e6919676b583e5b1cc +3ac4d44747988a472ba464ff57573280 +b08f7854091abf8e24b9203b86502633 +7d1f5e9bf42bbd250ccf427d7893216a +60bfcaa51c2d4831a7fbec9f8724e8ea +5ddbb2aa02607d4484a0343a4f5e9a1d +445c9a3e98b897caf0fdc994c6cd3fab +7956dbd5fcd1a4f7a11d0add5b2e118b +137d8ca6f772fd749c41e112e3952ab6 +194cc7929a1a37a352a95c70c4d5e7e9 +4d48b74523c1e4913b10b9106dbf8066 +e301b4412fe210b0f9d6b960455cf7ea +875da1d7bd6669699ffd862526a92bfe +4c4ed36652ae12eef92f5daa24b6ed94 +597e77bd280a17cb63eafde38592412d +c441bc2917969077b021ea76faa6ba16 +993eac3100cadef1e433c5b269a65359 +078ed98216f7dac5a0ff9bd7983c974c +4e7207afdfabd4a094b6a82ce552d517 
+6778185aa0c457bf10488e8c76f6f0c1 +e4c1a1f6f34a04bc4d10e2aaa83217ec +2bd77f818cbda0ec00780062fb743c0a +e4ee0de04992967dda0225c09739f9c5 +8d47b9985e8b07c3a2823201d54350b7 +bff1f0737cfea692a11aeff2c8c0ab24 +fd9848326b9c42fe61105dc48b2251ac +c26d049c180a518cdcdb05060da85924 +a059f92e0cda1baa637acfc4f679711c +6c9fe76be62d7935c9bdc16d45211c81 +aa714bf3f10ceea5ee383f521c47bba3 +37dd9c7d0194bb159b47fb042ca90298 +a17c2b313b2217bcfa3b7a7c142c73d4 +9dd0d03934e71ee13c298f01fa8ff2ac +d3dd50ab4aac31b47e769b81d4b6a84c +a48d82d71c9cdf0be545a0535ef1c937 +e3eb232a8073b1e0e24747612878f997 +966658b90def87ac168dd4d6431b99f5 +9d43654bb270915eb0115affbab7873c +859977f8563c88a86f4a36ff80363821 +af203a0ed56b9c839ff58b84b7be4b1a +e9d62965308ee4809b45433e659e9341 +b94bb919b381d3e18211007ae8227ada +47568819223e3e3967cd5a4cb5975eb9 +be609812bfbced98a641f2b00ff3fd35 +5869d7394c8790877cffa9b10337cbd1 +fbb0ecc801779ab7c77259cc9df10a03 +bd7354ed9945fc05bc6e388c3dd7c9d9 +a2a54de5c854dcb56953f5407f8c5b15 +8ddac008bb9939c619c08506de70558c +56237e9aa797d86565a28b5f636335e8 +f188ab4accfe7741a258f21b3d313a49 +bfccad8e927547fe9adfa1e016170529 +0ad2f73b4e16a30bb31048509ca1fd3c +e9c90aad416c38ed235b20a1df2093b3 +7eaa7b54f7e273849b001966498d6252 +44109828700f9270185d230f71c1019a +8d4f12fe53cc443f6cd560b6be47ceed +f8df3781216438cf71c27c16d0f948b4 +458978ca7d3f8fc43d2108854c45d453 +2dcb2501fe2991556db609138216423f +ef623e7e2c61542ce696c90b4aafdc85 +eb8069e3723e125fb192e1020534e966 +efe60a5807e804a22be0d2dc91ac1956 +4ef75453056622c9e20f3e4aad1d9a0f +90b45d2b7bea8a08ae97b6cbfc650cfd +65b4464a7ca0e9e99e4ff10cc2a60d5d +3b713ec5e965e5f9b4e4a6059022e813 +ccdcf000cf2a98e370524fc1d54a03f9 +e6a2eaadcf193405a752eade08e0cdff +dd0b34b50c69af5dc231b8e37f74f536 +6651a945523b0b627ff81fee4fcac2ba +a26025ef000af35da46c46167f283169 +ef3734521daf5b747cd2c31e36022058 +7b7e0494d8081ffc20ac0202e6de591c +898c327751710d708480fa4ed43c42d7 +0648aa52c9d4a2fa78f7d4ad20df1fac +a30eb55a4a0f39359b858a7f879182a6 +7de9f67250cb29f95b060ff85625adc1 +519821e89034b51a15b4762e3bf0bd38 +405f4f0109c33cf53cf3c9b2eb3f2a29 +0281f84f34ba13139942d45aeb21914f +68174b435ffdf244773108d61dc53ca7 +cf227bf9a4c97de472950d5e36f5fb63 +ef4850c589524d350f91984f12a69e72 +d42942e61c952519e4681253b63b86a9 +7204e2eb9113aeb067a6ae30ae9ba2c7 +3cbc34dbdc6fb1ce92bca0db522ebf86 +b6e7422df11f7a5469841361f2029d88 +e0f73869de30ca974b0f84f9f5b3393f +d41d8cd98f00b204e9800998ecf8427e +6d1d8d22e4afc378b3243a57a80cdc45 +2cfd39028d07604aa5c5727a8e23f1da +e7d0243fe48cdcce24a468bc6725b5c2 +763ab0abad61e193a991640ad05deb02 +211683ccd1b7f77f978b9402d5aad918 +f763fd34908f08308daddf2b2c3e0b76 +72bc48d45233b1029d5581415f00882f +36a59054ce14c4978965d976efae46af +5172c8170c68d3f78dfc0dcc328cb607 +e7b07ebe5f21f00e38750ba1ce7804d2 +68c328eb7ee1e8e51c7e71e9d7ca6887 +12f787423dedbf3b8bbb86aa379ce2d2 +deeb6f1c3219b9fd8c2d8a4710abd928 +f9164f5f3671c6df4a876f79a469cc2a +a50ec6fe59961ea2589dcd1966984c4e +14ad69b85c5680e8f695c9958c145725 +d8630eb68be27a2e4de913bbc916364a +0c66ef0bd73956d6dafecb303410cb9a +057dfb5d356adac34b441ceafa4eec91 +d63001d25353eee66e6b82e720021c86 +895ac04bd7bfb9bd2292a1211df48921 +c8b7ea505761350437318cc5acf337c9 +05316c27946e60a03f255ee6f1f932a9 +58b8b5baa12c22eac5e3a6ab3a69ed61 +1d3edb9f9609230a783b7bcbc0878d22 +6f27ea38161fe8e610ac35767a980cb6 +90e5f5e04cd1b9c199a781f46c38decb +00c39eb2bfa287353804bb8b23f0efe5 +1e5b2638b26b004190f342777d204333 +62480226e50c3503a1594d89c9f8420b +1a7ba1eabb067b9df7e8782ea9113e29 +f5620b3eb548e2db1ce1eff8dbb08ee2 +1e1bf32069c6441bbc3b4c78a5995292 
+ebbc06ce4f9a4c7b5947c53b2c224a34 +6fedfbeed6cffbfeaac88e9228ddb799 +e6e10c0f8cbd4e6ead25728d70dc25c2 +f80fd653f63ec2fa196906c62f2fbdec +daa667bbb50ea1f245e5f5667c402cf7 +d18034fbcf6058fef7bdcb6917fbf7e3 +c0d7d694e3b393df9c07e3664bbb32f9 +a8ab222471af79501fb5e9031f55822a +1f2a37dc55dace5729272d0691143166 +e5439cbf939c3b3e09fedabc77f82154 +886413caccbb3f4ee41e6e9a6e4659e1 +7c52e65a1f7f1cfb646734532b433795 +eaa5129d137fe384d33ac12153aa593b +fb761f3c689baeb1d7b9dcf6ca357bda +83a75a5d1d1efe9ff23b23de0c340fd9 +483afca5ca1dbf532e550a3f168dbe85 +346fb22fb2f786025daf614c72dc1f48 +aaa067b99a416a93e95fb77a99abf36e +5a50beeb7298a019fa9573d94a825598 +26023ffa3427bd139ceae7b785d55339 +ae06e1156a0eb72b38f43d6b0cecd024 +581ccb0b750ccaebb3196dd14657ad55 +b2db8e20637b650f36c609b2efdfea46 +e5d7b1f31806801bbb776b6e248479c4 +6407959c2931164639b07814a30aaa33 +5c06350d91eaee806fd62578da0eb4f3 +74376f0717a5e81c496745c5481b9534 +f0f63ec82bff702cc4858fad78568b56 +46b49e7f8001293ea59e5c991fac85d1 +71bd5ea87c3523a431451bef1949063b +941db9ac142ee2312a3aa758a0c07049 +06767a2b5f92d47d601c6235333b399e +59dfed6060b228b82f84f83de81b416c +e2a4bfd0188fde2a84d2fd0605f13b5d +805c55f9cf33dfe4bc3a0cc66ddf523a +a28d1bfa46ec80669c3052f42b7eaba6 +9ab741a0de0bc28f4eda9509e8ba8d93 +e727b363cf09e1f1a37f2d3e7884dfe6 +07b4b2ccfd7e50b957748d273201b8c1 +21f075c82f973a2ed32d30715d655f4a +ee4a45b4037071fb15d7e52ba57a6f2d +47f5df89b56d4077bafa74fdec2c867c +b7af0897fa651c3e9395d5632684f565 +746391b4e5c8c9a69bcd09ab9eaf8702 +0cb31388d764ee4cf133fda06002d424 +61ff1eada56193cde330ed55bac92eb9 +9a9a0e431b6c38333bd14ba8508c7ec3 +48b6f84e5603f2ad2329540ed8a0c3ba +8a55c02de2484c0f8b4261701a44a96b +58b26b5667ca527a309d72a3a4070cf2 +b0df46ee625446d4a7790ecef072c642 +2351edcc86c9dfff39f49190a5532d1e +796d97ae041eaed884a5001a2f8ff73d +d45c58d83ed00f315eb6f3e18b5dba5c +771f44740f3d1ce2d8956426c2ad2473 +77a6d4337a1f6d3c2d05719274f7c3a9 +21f2959b6dd317fe5bff5ff82b774f1b +3ccc32bcb693415490ff385f0858efc2 +e2f590909a4d8843d9025712316a1dcf +1f6067da105455f4fff52333793624cd +a806d50a24ae23affe950a78c7078365 +252940c576a3fd0d6833fb633e010fed +4473e7b1020ef3d9b95d3c1a8a61d478 +7c846016e5cebff488cc68d5fe7fff84 +604dcaddba4bce8870187d1c774199df +dad430e08e42d8e6ee1d132aab1a6db3 +f6a72777ed2a3e513bbd6e31acee4c5c +3330d1cdbaa1a926163704dc88a79c5f +8e52011180f2231dc2a7c9c0b1f8a1d6 +2eb278f0d2616858c07a5c1ce0455500 +16207f3bc7831a1650faf423a3edc170 +820162001e1798c608aab874f4b672e2 +5192a1e62cde44a16966de8c04abb76d +36f7422b95a65d5697506b9115f6172a +42f4f568b4b729bc2b9f991b60136bf4 +53e77894554ad2cda27b3e0f3c74588f +eb9eac3752782ce606cbcaf5e6f6d486 +00c7cfb5a1f9b8831fd1dfd59c8c048c +c4ef874c1ab352dcc18f09ecc3b9ef57 +33d1f78069857482b58da5f2d93c47af +0f3a96e6df02258fcad5eae688ffa0ab +60543612ce2298ab1618cc89d641ccb9 +e46796d3de230688bbd5fd6f1d73b269 +b36bba112f683be744963256f777dc07 +8b3c66d58caf2bbaed1df27d20848043 +7154fda4b68bca83f6ffc205c13f3eb0 +eafd71951ef720eaf6454f9967f08f80 +ea0bb03f734b7fc4d46f495860530e44 +e18208369d00086ace33b638b814a81b +9ee84ba8e65c51d9743235d23452c079 +ef5232de491745f69a0f5d503759497b +c8f80fb5574d8924339f09011a98f0a7 +ac850473a1a8db004413fc750b44cbb6 +3fd4c40549a254a23bc5a8fde4bdf186 +dd41122b979837de77eae7048ac67492 +34b88fb6be0d46f5442da41eaa19bcc8 +dd9a10d5417b6a091f79d0891821bcf5 +11e6c5cf70bf5f694ef08251931c2c82 +6c562bed6d47aa6d07f9dd853ff8a52a +b942ffb22adc4e4de74ae0dce0c087e6 +81e4e51516c0226ddfd8ec00d3a4ee2b +0f32d1b5ba58a6b85c0679d2955183a5 +b2ad17a9649e0c389afec45fc864567c +60c6bfef7a0a50806d5b0ccb990d1c7d 
+f7d9de7051114a82146aba43b412dcec +c67c22c17d9505284a783e16d4a868da +169a5970e430b55887de994c1fa075b0 +ffc11d2cd12696fa8adc9da0bb9af252 +002476bc3d34971bb8f780ba4401c1ce +12a0e88c696e0b1f7c3aba18a99f65e6 +05a2a645da2303309d85b173505647f6 +2ebb437707bba3ac8d08cddfdc9be8d6 +6928625f6ff50c045b98ffa7b89af73f +83bc7c1699f97444c31c0ccb725f9a9d +b604e85d510b5408828479c3ac25df41 +b0a0d71150764c1f17b554d293a6df08 +28274530a94a945ced36e5e98e224ba7 +6769e0b88320af7f4331c6a4c68401e1 +9240836f66f1a1d9da3919973bb242b6 +ef35856628695e4564400d4f582f5ace +dea4f3ed6ba37e05d8787a900ac66917 +9574bd70b98fd773aa002fc3855fd3a8 +931eeef1aaadaf1acd9c5a63f3e1c603 +b9e487441b516a0e7115455763290c10 +5679926a0b5e5824484c75eecd590324 +6bb90787c697d2a64f931f27cdb8e5a1 +1486b7d6856a7a20c35047936aa2396c +e28c46efa26219ef558c97f20ed17dc7 +5d229ce1bfdbb3e3d0d31c969cc9ab1b +df30d0091d652dccb434e21f69a98771 +715f17e451bfde4fe7dc4e4d704a8ca8 +d6f5d8411a6ec0afa73ef5579e567a57 +5d968eb396a7c38f2bc43867194fe913 +35e092a07af227191a67ec28c50a5410 +30d15008a352063d18d5c2ffece00500 +561c9fbc02e8b58de2be4b53b71cc0c8 +24f7afb38fc24c07fa92ef3d3a69bb46 +685d218c976fc88bc6a7b55130dfdf0f +48630d0c39310d6823bff7f486198067 +271be356679455218722174cfba26a2d +4c4386cb7f1ad25baefb31306b748cee +2f4e846f27f7f8bac6d3076b2e8bab93 +86231d00efde51eeae6fad0448a8687f +07585ca66580559401b20669b2cb1144 +2589e835d39bfafd27cc9ff7a1f1a126 +c407e43fc747fefcf39f086ddf46f75c +6c756343132fb6126e211eb82e4bc0e2 +26272b6393aea4547dce563f13cbed0c +7159fc40c3442e4abdd0f0d06f9fc20f +57fd7a9b56d69dbdeaeeecf7613ed02c +39f1ed93b0dd88e5eb9c3860c17bf3bb +70bd3a44c951b04767e312c6dafaa3b0 +3170467b59b5e5d7d5af325d8ff8f5f5 +43fc6099791dff2003876cd6a3814a97 +f2e872710beeed6a12a97e134fb5fd59 +0db81a486b00a9349ab4bcf37c9b464a +7f9e241d0378ab27aa6b86c68f3e07ca +ae62a1122b8a6c65caf9233ea2dc7dfc +96831ea0c3bb3ee0133c2075358f7bad +3e2c87d612aece0643031c18a543066a +53ce7c35009c5f69fad5ec6f4f0d6ff8 +311973ef2e072243cb72f2d890bf2db5 +28100de2174b14846fc79cce4daaed21 +0dbf099421307dec0ee22640645e29ca +ad576d0a4e5f2e6ab4afce61854b2928 +95421a796eb3d32a2a9ae3c96644e4fb +5f470e8c7c1101f9d510d628bf611613 +9349cda214c6ab540248a95a8fabba5e +3f50c79db08c438e463cf75f76df407a +b149c33ae151d63e9c182659dbfb40ba +ae523b2549723c5687a5adcf48f5addb +cdb442f9bc43207e1b1cb3c80378b938 +8a076393404edc96b5883799c6cba741 +190a3523a47fdfdf94fd331e507959fa +06b68f1813aa162e02a0c6594106832c +e1a07069c78e495e88865e7784301cea +1de827f2984789483738cf5e9aee1837 +27d03123bc9b9ced76f961f381b3d9b6 +58cd601c11e2a7896c5252b9db0d345c +b5d4d9b37f104bb476df95cd0d86f504 +363497ab67a09a0baedfc7506fcdbbe0 +b26995c8adb6bc911e9a0aba7a656da0 +7f46e41c7d1909cfa61fec8dc08a3a98 +b51986ef1667aab62ffe7670bd9fe502 +f437f61809b5c80f494647cec6ea4551 +fde1f359d31324070922e88bba3dc5c0 +41a9ba31540db0e4e4b0678f8ba80153 +053c6b0cc26295fc9b74b9bb9cf919a6 +d3fc70254ce4ecfe69c2dd2239316027 +420a9792b5e7aabab5730880db65ba19 +e38c842bd54f66e6abdf3eb73c1d4185 +47adb7d199e1932263fa99900a68b2ed +86975bfa628b8313c927e06062199acb +b4f2d43c1c3ebf0090d094b9b3ce8645 +66e273f30cb115642c6b619e47b3f6f8 +1a0bcdc66673eb3e32c643a95ec3d52d +d24bc400bd9233ca85693a84a00f6ce1 +a104779fcfb6aeb4304568e232a11ac8 +f20f9b6caa1ddc7288e84a801a636e92 +9ba1c49b36efa7778c6cd6d06f63ad34 +449bac648103339d16de48915f8ef82e +e19a0ad0a55a21fed22ff5a0d8f8b534 +cee75956090f16319c74d92f9ec6f086 +e71bcba2ca56af52c43a365ae0d3f5f3 +93e7b5e7667284694c6f1caa687499e3 +00232de0b7ba28269ef091acc1e3aa2c +7f8982add39cd4a6fe0dcdc1e5fb6551 +d711377b466abdd32aecf1c1d4706986 
+50dafb59bcf1dfce3f4620018e6196e0 +95b55fef58ed5c495405682d2310b4f9 +835ee5f732fdcec57ca3f132a430a3f8 +6ebea99af5fa69eab407e629823a7405 +a1fe23130317f5e54e1f5dd40f58ef3a +0ef71f7ffd9849b846c64f758b695bde +640aa1525b29830bc9984a7a9319c787 +d70d70fb7b556598b46aa89508448e34 +733be0c91e68bc8b4a66e88dde7a2afa +0a4efb51082d6a47df59c07f210ddadb +545816c7e0530f0ae1f5ed5923fa013f +fd30215e770a609f813884cba3c41d6d +a915f3f5337a47c7c9e7821e6fcc2420 +8f20cce183a616d5055decbc444fc8c7 +76a9318370952af13474031c4acea809 +b0a9ed2549ce86e3e8d217b728e59fd3 +37aa160475ea8f1683fb815c30f46414 +540485e699ce5a5226ab212f299b5b4b +233988a25a121978710030eef3966b9b +9982fe2d363defa9b43a28c9e6961b50 +f620fcfee98add87881760f077061af6 +5c02c1c94ef9f68c16a155e62439bafe +7973511d1f1be1298944a46f979b7bba +ddb2816f1c1f8e0741ab863c1b9d68d5 +6845d791131e438d7ad25c52a5dc9674 +e3bdb892ba8748851442258246e604f1 +f426478ebf67238455b4fb961ce61bbd +dd0a5185abfd7266a16cbe5252acce4d +f4e4c0ba926e0cfabc21f3813c2a19cb +31d654f18685d2a7918d142da5b7b30f +b625225b52d8aa224b46088a8c321ec0 +56a5608f12b0a3cd38c4e39aa7597a0b +ba06c752ad6be92b8a586510815b2a7b +73d17458cc141741f0e73b8c760d542c +d902aff70d3a1088eddaac978753b1fd +e466b8b5c1bbca8c1d1a0dc20212d339 +b97d5b4dbc562c93c10a5a33dd52a0bd +39c4f530a572fc82c81f23d2590ebe52 +a6a97e55dd738dbacdd9822df486a5be +276c3ebf05ee1b31a924a2327abc0762 +d41d8cd98f00b204e9800998ecf8427e +c6e6bc16928cea4ce4f2a6f72af17eb3 +0df4fc79590a41d3853d4e75acc37b13 +825c3867ad27805d5a11ae4cd5994d25 +369e1cbbd4917b691d61d9e459733291 +779715032cdb0382754b2497a93679e7 +73b902da829be27d0353d37e93976460 +b75e86641130003f272dbc640dccd3c6 +1d5c1a92089e094260e2e7da67a380e2 +c7651be82c6424df9d6a855c22a4108a +fb2d13c1f6fa6031fa0ad3a6f072f461 +382d284fd3ce4a529b2494895d644df5 +adcc4af7582fd612c6d9f9cd8438cb37 +38c340b6dfe7b1eaa531258c838da8a8 +667fc78200ab82e30d283e3d9b8c35a2 +5b3a2235b2611b254d577ce5300f9fd8 +d04a412b34ff5f79950fc2dc879c5da9 +48b3d9544ad7399ad5d8480f6eab894f +483e5d985fa98797b584315f81aa1a1c +777be93da670a41b6cc5cc1d2fd4b73d +a7570db15b6c6f53e77f7fd91deab65c +9218a42dfe37c0ed7e1ca5dca406d4bc +75780e443237a7d52cb0dd35311ac8f3 +c34dd3dba8bfa835b23f961f8e814212 +224a220a97d2ec043314778bbfe29e93 +99cb7a508da937c562bd94689b2036f2 +0be92afad41530ad6d7264c48b02d408 +1af26722e5ff0f622943d5eb72f113a6 +2db1058cdeceb572961f2c886c04ecf7 +9b386bea8d87cd178234cb060b114d06 +970c68ee639ccc7a189b8ada52d2e874 +8ddce6d7cfde8ed5f815c383367eb2e4 +0894d2033a527c52e46394cd2ef58af5 +2cf0eaeef49ed59b15bfb7187afbe8b8 +60ade4c6a65e3d6fc6f71b978f6dfcc5 +71e4da487ca6497ff9badc5554e0f679 +c6a3d74f25b9e50684206cb3af4d207b +dea067c9122bc96b897dcf45ebfffe08 +5a7b628c80b41bf3cf0c62ae6c5113db +d3b7373def7b18a753a5302693727c57 +46874a3e4ba01ba69b3182429837c34a +1c45514ab7c13bbf0cf8d45a97be2933 +7efd8ee08786824e0ded62a770a11926 +83d1caec7ab6f5cacda01501d056137e +492f5a4095c1f63b223e38f8dd10e9bb +a7e0f43b9c1b245a472a5a60a632f74c +4dd1b8d31c75a173fb1e3f95903fa1aa +5cc8cda4cf10e3e9bc201a4dd07ec4d7 +9836f2107cc2c32053282e0f314a72f6 +71111f6e18e41350c8b62da245d02e67 +1c7d57c31392bbf55e31c523e5b25a89 +c17f5191a5ca2e1c3a13357126c31941 +01d904bc69587a03f9d06cfa0b0cb8b8 +9548f45535d5fcfe2c2c1ad33ff96cbe +0c0bfcc516071cab8c26e008abe5d8c4 +1b807e1233fa9d132788fc4ae96cb06b +c500daf19a89019305b0563257415c8e +40a161b8ccb37ed546bdfb059a8def63 +26de23d80525999b2363a49596b4817b +917672e9616e792152aa75a784977f22 +f05da931bf18c24182e09b8cb1ac21e1 +8420e72d6ecf46408908877d513248eb +3cd8d2c81a70e7f9f6e32fbdf3e010f5 +066933cb3a06fcbbe7ca0245c4faf5e1 
+a51019175fb10c9719e8b355414ba8ed +6257d13cdaf350c9e97ecd8fdf625d54 +95a00f05e87c304bc7f71beffffff782 +8e232ba53025d5620819df1410a71c8b +f69d37ac3007ed1dc3da06ca658fb3a1 +367cb22a233d6f99a856df5ccf6a76d3 +568775baea23e9c4af2852a8abb8b86c +3a0417a5bc52275b0e3932804fc05179 +2e5c5a67bb7bc39a4ea87db23276e9e6 +eb7c84d7dc3a3f58ca953ebe737f03cd +61cdd8ecc778e9c520e05aa8f7f8db5f +bdcd1c9c74ff432ff60536a8050c8ca3 +bceccb2e95ac44965ded6fcb179de994 +fa20034abf5e7bfbbca3909c104e3a29 +211277a458f8014340dd1b5759a24026 +1229d2f3aa05285838132010c1faa20a +2563e3c4e05bd30827bcf4d22a6247b2 +b20ce305c146748149f6f2f4e4d8e67f +4b71c6bcd70c77b03b2c160316924bb5 +5c18263d6a62ea711eafdbd68dbb5f9f +d99a81dee08bfa1367eba393bf692ab8 +4bf4abdedd1341095709a07fb7337688 +58591b22dec9f0a6bd02a898c88a35de +914db820c3dcc6bcc98ea872a4bac282 +e92f431030ccc604b4f667a78e635e34 +dd024218655d5fa669f895f52ed9fb78 +b58d80262d38288ac3fd200378118bb8 +4d5df73bcaa5644df7bc4a1591fb76a8 +cfe1399765c007eb7442bd8d47b40190 +146ef15c0935555b18aa8bc95efb3bd9 +458c7edb1199257b58263ede35729487 +b256c13ef5a590a546ebd3df04b08f1b +24b9feb96f7a68daa9892787620240b9 +b32c04e08144ee9054bf31d547bb33f5 +8e5d2c110ac4d7b0ab713407dead4957 +96f33e8f72d814f80878d4af78234931 +3adbae70384f7fd3262db2b57052d9f2 +428cd773c5b907be2180ebbbd550cb95 +c485a8cb6b7113001d7d113b949aadb5 +f8e129bea0be242d6ee978fcdfd00439 +979e0c39a7efefccebaac86d480f292b +4e215903498c792a344d3dd0c66bbcc2 +0421899d0066a64a0e35d5c14e563770 +87c99c1500ff977f229adc1d0d639731 +c181e0c6aba3b90add59662ce5a64554 +478fcd55ed52ec1138403fb5b241b05f +730e08e10ed14f7a4df849fe1fa634a4 +c077e311c5cdaf4bc2da686609e79d9b +826df42183271d2d91536e47aa5e9cf9 +341fa12b72f2cb647d8cf352c84f7572 +452102c01001890dec6b75eb1c0c354b +8dc282c91b100a47aca065f5e51d28d2 +10897a20cf0833e58575e9cf28870f2d +3f81bcb2ee77db68cda2d5574e10dfb3 +23cdc70a5a5f19e6cbd9f6e09b23dc1c +a1cef1cb1f8dd0ac15bf0fa8c72bddd6 +a846d91b38c6f795d401258abaa033a2 +b6992b4eff175476dea7bf536694449a +a22177d212f94aa581e1cd5f957f0a1c +ae7214a3b6410078e6dccffd24b0788d +331d693bdfa2fa1a9b07e7e5023404c4 +ae73f3178ad57dd082914bfe9f043325 +d8e749050d41ba322eee631a366fa20b +11b2d418963fce1b4701152ac0f0190a +f22d70a6f124c8d6782d041a2bd6c6c6 +f1f7561c7f80be2f7850f773cbdc1a86 +106f6cde972bd9a339e4ae463774b7cd +6ab251a3c620b72d098ed019548c4dd3 +698eda314f0e18e32966571e03284eea +68e52b33f62f368293872ba4ca2d550a +a4b4aa6bc52bd6cf958d356f3c13e4c2 +31d0aa589af3f18edead86fc75776ac3 +814dd00f92c6e9288185e01d64f8a23e +fd122f237b71540a9244e499b013fdfd +69f719502d40807d7f7c4652feba4db3 +a85ecfc7413cf521a263f2ed4e8421f7 +4d38faf2c191532322027a6a36652ff0 +4a11fba7164adba57cb1cb274045eb85 +8aaaf2b40458ffe2aad28ab7ba839808 +0c5ceaebf49b1f096ce5f174affd7452 +5f9c96e35ed85992c56e676458a4b331 +aa2c254d785e14674ec160721e3a4969 +4009e101310f3f463a83189dd6fb2b67 +400df096c7ed4da8af97c23731f95f1c +8efcd1931dbdd1f94ddb40712d505b3c +046e2a9bb87b90bad31a47baecb62f4e +700850cfc9ced8ad8f9cae1731760d99 +801c9960b064a60fdf73e3dba21c5272 +549115d804ba65d89c1923290d1d2b56 +fe32c143e455ed540d697a279fc314f8 +845c91d1447cdc91faf9868bc8bdfe6b +6013e9b680bc1d1c96291d154f553809 +7bad5da6ad3a889ca6b02d6624d99467 +1d19f04014d0a4330cabca3754aa31fb +98a87d745c08f692a5530906e6a9ef2d +3411ba8b95fcfecf740ef9fd79848805 +ddde985dc78e8304fa1928b9e759dfef +28ee79cb3e6bf92701ada502a4d4bf19 +7d5eac4d5e8c97a0552af4801f863eef +c9fbd30bd4c6ec34ac3a5c65dcb99626 +edd59c3982a31c0a288a16fb41f0567e +5bbce8a90f1bc79584c8c2b8e2cf9909 +bbe656130eb95d65c2cf0f26f0fab030 +a8c1f244c196f65132de0a5040a75592 
+41f0f216464ae7c3cfcc0795f7da5403 +2bb92e1d44e04fc7fbfe8b5eae52162c +c2f881dcc6c78fb3210aaff1d2113804 +69dd25561d6d7eb8a70f3401469004e3 +da149aabec16f0884f53cc1e084a5062 +e9299624690c8448cfcbf9d71b69954a +dcbf540c3fcb65c428c943043de63b94 +6dbd1ff4ad2c1b9cea9d791d5362ba2b +890978a48a7e65ecb2a296839af7188f +dc5b2f9b7caaef9fa5d7e3e768473f4a +187fa8de169057860c92a00c6f2d7471 +d5dcb9c4b13cae5531313034bd191ea0 +b80e62f4fa5be81b78126c24d22062e4 +34b47efad15f2bf482ce46edc6943b1e +5cc49622ea6dad305b321afab0aee31d +64859dc543a2805339fd63957a3a9c5c +e54bad6624eb71a2cb847d86389384f7 +ff1fb6d1454993fab667fbac63885694 +98b06498ce0b446927439eccb1ba71ce +e75d97e64eb36144d60ee142b5d455ec +036a11112fc6245efaf85d60304bbb41 +3fcd75242f4a51f1537f799c8ebe5495 +9ea96800b16f6243c5898087a999b00c +b4359b453d6943594bc04551f3de4de5 +5ba506e06f9e5660568bf45478017980 +490e7189076c3a263db15d52bacba953 +8c627610f76444d49d745d55c4457928 +424d11f3b8f2d7d5d14249356edf475a +087f662bfcb0b22cab2b312791986a5c +24b945e8d65952e2c7c3a6bfb787201f +c0ccd1aa3c3e625f7ae6d0733a7d7d66 +80ac8ef8d7647294ccf249d4c26a859a +48d30db27ec3907f4b5e33fc7559a67e +edde5842beeb2100f2e2b845c7601281 +7a90a17bcc2d356a4b53e1ce119ced17 +3632fecb702e7828d1c92dcd8e4a7906 +9476c41a85d4d40d0d6819b45993a0be +57ec530e6b9e949c8a861ab173da8262 +95d19c2cf5d78f76f377fb136fd742dd +91e79e8d31b9097f231d60e95f918563 +8287504bc5f075059e394a11302213a5 +0971acfda3a09627d9135e95131029e7 +2dd459acd69aa46b1b13d9a04ac42624 +558251ffd4a7422b84aeb4a487618af8 +d1e7876b6f677decd2062b7448e98e76 +94a5d798605fb5605119e0a3af607a43 +ad2e5cab72661889a6f3fb97bb8c3e45 +d429df6c9ae6cb26fd9a1d81b6452ef5 +36537c9e3a5dbf3783009b07c4059300 +5a3765737e6db2000fb990521cea228f +bbffae67be83c6d5b3631db86e0d6e98 +a9f05aff50294a0372c083ae55f4cea2 +67f324926c27c58212ae49d04cf5d7ec +c2996900b6a0521af8d130d9b588942e +b72377e7da0f194daccec7a562f7eab9 +dda2ee83afc3559edcbba15c9425ee60 +5ce735356f998297bd300b26654d8fe1 +d3b547f8aa89af33da474f2c758049df +c8fcc030e2e26365b1f119228b35e778 +b6a5f63a02142791ecd3906921ab2dc5 +8224dca724dd35962c756c0df4fe46a4 +c7f9638b5d08b6cda17e235f5c1fa993 +1981c77f3296fce79d897bbb0290c300 +0c1b14d84c9ec10658191b364e9c2222 +0e51a25a48f885b91f52223c1cfecd97 +1c7debc4af2e22ef64616b49bfedc39a +e16a738cc95fef14d80968de465ec1d9 +008a8cb802bc87dadf2b341a088298cf +9978c5a8330318d5e448d843f429059e +323636ca16103b9c6edc6ec913c84e95 +37b7154b3a8899e76334630f4b80a95a +f05aef7dd91ff8b807d7a14d5637854e +ce895f33e382023a81562aa6e4610d2f +0beefacd6f5dd5933e3e56bdfd0fefc5 +1be2367fe1ee8d86e59c0a774ea79aa3 +7cf52629cc8c45ef31d87b4730cc1ff8 +3378906b53d413b9448a38a45a995ac5 +bb4d9bdb0009d2ccd6dcf7b4cb861c66 +308bed39a600934cd5d7b98073e67025 +33ca01f46b5ee3553841827abf43efcb +3f9232eacebae815edf01704cffbdaf0 +884f705759b46449bd292b9e88f1b949 +0f06a105a65a44446545eaa16161c77c +44e1d5f8f55d341f3f67bf51726b6f09 +c11dbb7e49cead8a72dacb07146ba57d +f601d74c38e02c81ab361a5c548922d3 +f2875d525532a64eb40168569c08d5f0 +54eb6c4475789ab2f00908d8e3f72ae1 +0f0a8f640dfe4f106d57f1383aca24c4 +8cb16b7ea48708405b18d72a63fa5b25 +8280e33504d98b3d9e75b352220205a8 +cd6c7edae0211ff4721d1fa12cb35f7f +576806462f1cc4a6c2f3a1e94727702d +f08fcfe0c4af19f745d5db94ee8f718f +cf369054fbb560af148c059974833371 +29856a6a97e68eb3b3d190253872572b +cdfd70d1b7b6151c14cd3b845b0d3196 +cc501855a1f93bdeee7e936f24d79726 +8174da22c4d51b66951b8e60e43a230f +2f01d4d2d61b72fa0849a377d5a673ce +d386cd13027953b4c29b12dd5a8101c8 +718c44b2948e540c3c2c617f0494c58e +e0082c662e93ac3631dab8f42b0fbb0c +9c09eaf6553f4880927959acc88e5b9d 
+1167544435419af1f84bf34fc5594c52 +d1c571501436f1299527d479e8fad2bc +e54a5e248184b17ef13f2caae43014d6 +ab1d9406225ca392d391a124698f0eae +9c6eb337cfa1acf62e6fd6e9a703468b +83424113e24a1bce137e98a90ff09ae2 +7b946b9d8b8e4a4a703d19d42ecaf4c5 +c75bcaa7912d1ad65ad5a527431e8994 +6e71bb1c872e3688bcae344edec9e95a +4e5e81bf82e0d6d7b49675cf4ad3b270 +c23ada8dbdae470d14044ab1deaba21d +e6e7d9c97d1358a18f1d88038cf65e5e +cefdf90e89d0f677a89f136e7dfc6e54 +80aa464ea2070712b75043c6a94839e1 +b2a556452ac73b68a997aae0f1fda65f +03eed3ac0ae9f22e0f98dd1dba452f05 +7957b45a6ecd53485c8903e7c5021588 +2fa3c3db69ead07e10b4469dd15bdac7 +7ea3cc9b44b7d3b5b3b73a3cd90cb5ae +de40090127f4b705f2a2da7a1a7491b1 +720f83c32d01bffa791f0ae3f2957c87 +7d691d3022d042deab4d175943c67c51 +cb594909295491051b11b224cf6b331e +ce5439931bf52ffcbb7a8328aeb40d10 +a24ab42b1ece8dbf04b91fa0235b9be0 +4c03ae93ffa7146ccf02f00961731641 +9ef5e700f010b50c6650ad3b52518154 +52bb7c051025119de156d56fd72e4944 +3190dedba40ff311f2b1687715330c5a +1f1fc4012aa358fc3bbee6c21c11a089 +46587380e14ba063f1a471ff2059b895 +4056bb2cdc7540d37a53d2c707779b55 +7e22043f4e7441f9aeb4aeed483a2ec6 +7b4ba805a3e708a779a174c3169d35c4 +8af2602699423b7ebd2decb5c41630ac +7d87a26aad45f6ce987efa25717045b6 +2c3849a0ce9edceb29a81c9531d0363d +30ae366d476469528ed2a7c596574251 +33ee2b91a614ce9b38111e3565e6d19f +81ff6d18c1e541dfff8a46af46ed3bf7 +7c2e294112220b60f441a478df7927be +55da8d76bd01287f46222dd751216c52 +767986017ffff029870f30cf0832bb42 +1b1ade3050c084466da4d24b5fc52dfe +a9b81becc838ad275f53ab0e5678fdf6 +0fd3e7c7e3e601041ffaa351ab172bab +a4c3956c0f079c1ecfef1f29ad7433e5 +f891b3366266162f00e4d6f72b640ea6 +531cf6f2112a9d39ce951189f67a3735 +7d6f4e30f66eafc98074376d1a717f68 +7268bb737cd0bcb64994ebb338282890 +7d42e897f5e98c55edf7915acd4e63e4 +8548be52c795ecdfba92d071afe0b72f +ff5705c3f546a4d540d14a4045f1f55f +f4e530b11aaee28198f82e5ce38e6d29 +eaebd82a941bcf7df1881bc1c370c51a +c20062fc223d5c4d5a5b3614a8fa2633 +1c773cae784803bccbf490824ac4a4de +d24e2e8b62b2253250f56ea0544001ef +c5df9568c8d54065551506253c7fc523 +eec0f4812600da1b236769b7c936801b +c47def009fe0a8e206a15469c36f389a +65a96068350e1d708c5645dac368dd16 +3ebd2ce067dc54d8acf5eba1d9dfd6cb +6ed152d8526338f03888f153caddbffa +3439fbe02af492cb664263ff58cba418 +6225d81fbd7c1a2cd91f2d166e8e0aa5 +8fe89d1ececf5aa9cc1ceb6506e3d1d5 +2749f8d783ec679b15ee2c4a6260c497 +d22c49e8237780b4754095acdf32b41b +1dc080830cc6b58fca67b1a59b873274 +23bd299cbc10f8921f227f78dc556d7c +d7099eb6eab21e6985ad4b63598d9a48 +a58ab6361a3458027f634ef6ac7e153a +2f353cf86663ecb2887d4f35832aa0fc +7f49f807768cf594eef32342b0e37924 +d95494286cb08011ab48b1d05c153f3c +698623f2d8f95f234f08035f6eec7f57 +ef21c025346aa20e1d4d6ace5e4deb70 +004ba3d14f7f8cee966bfd35a774efdc +cd855e1563518f5cfb322e6d90d256de +19c4fe55a265f0368ff7394a61daeefa +9c70572ac970866c5637b5a28f2b3956 +23e6b69b02fbec866ad84c674abea1d9 +a28ee38acc41aed1e7c3fdc336d66b42 +0681ec02f4ad8626fa24dfa551a1674c +6f83096c77f059145f4dfc670e9898d7 +4c5a22b04162d20ad814c647d2111870 +53881a55d2f1f07d64ff01758df9ac3d +15a2db68e606b4d60bc311c589838ac9 +94a185178dd4db4a40a8217124083159 +990e76815908299bde59fba3fc85af7c +29aca2292b20a04b53f1f7b8fae80bec +514924bcb1e12718145aa252fbc3e2b3 +7be6743816de209ea222c6f2e8322bc3 +12666bef78cdf1e0936a15eee8bff903 +4b6dacadda979e4a2d97645699a4c941 +95e302ef317694f3d31aed2bbf535b7f +bb4e438cb3a0cf3b170140f2ad970c59 +a9bc1e4b660a604e3c26ca17b0878077 +f1c11aea505fae6bc41763503a418bd7 +cf0f541e0e70e08c8716edd7e2a43e3e +338fe9b1892c7c4824622d916d20567e +c2e5c6e10778f4017de93f407c29a83e 
+075e3c7748441053dd8bf126db878b17 +3407729f4b414dd495e4b0472d30d36e +1aafba7f47161739d4e497df92af6533 +eae1619ab9c1f34e613018ab21944a67 +eff1b169793cc6b0722a3a4415e9ae9b +20b237f31f13dd04e4ceb231a4f60785 +9e8fadd58781b46d1d5c283ae98e0744 +eaebebdc75e6c8fe838ae02391ee1c14 +3bb20d93f2065164105f2bb2c0600faa +4f3708ec557f0c9de4224e456bc6e3c4 +d41d8cd98f00b204e9800998ecf8427e +c8dd262235a5fded9c53811668dd09a3 +3f68d2efa31de20a87c99f09e5d5c77c +57978df8250316031b7f30155cdf27cf +c02993ae02a41eb69fc9c234568630ab +21600d5825b9b8f2c8a5fde0accbde64 +dfa4cafb31c9cd348cc3b0f756ba651b +240329779dbbd874cf5c9b3f7e17de8a +459bd157df50f5dc16f4085d90899e99 +d4146ad2efbe44f53934e125dc444462 +8cffb4363732f77b272580a481e95288 +9de04cf99f97059bbad688418eb20fdf +3b29a60008adae0dbc19cda23a05d298 +6d71a3b054ffd9d36251aa11dee8185d +63ed82be9898c736f5cf5696626ee8b0 +ed07b77d60c2306b64a16efb5b44d80b +d41d8cd98f00b204e9800998ecf8427e +dd174058d2c538970de1f464488275cb +e667da5de19504bd33ddc30e744df3e2 +9117473b3e475cdb56204d10250fdaa1 +d1acdce339b54a61b2bef6954fa1ccc9 +d268a53e9ad2757756d454116b2ff966 +b25654a78b4ec4a1af7073e8e9c1606c +6f9e0a830137a02ae31483aa2fedccb8 +6fd4e270876b7f8cc384247c7fbedc31 +677e7b3c2eac8c278143b1c86691532b +584dd72c8f9bab932d4b31a1ab5a0de1 +36357eba6fb541747f40aaabc21599c7 +79b527237a66f7b1b9810b28cd654fc8 +0287fdde969af2efc00582c383c2583b +96aecd77925d84466a14a4b8f9aa79e8 +0c626d973bcc0a6e464e15146f0053d5 +81b0579fb055ef4686376f8b990e6326 +22f1b16413d49155c401a03a3daa0a45 +58d6964689b6891437996be7c01afb10 +499311694f65d21082875efe4260054b +0ac72f255825c7b5aa67189b008861ff +94615a7833976f4091b4bd340f9980a9 +7b946a7ec5d5b622670e2b1a9527a1c0 +1bb489178ef51d9abb1cd75c8fe2b3f6 +933c5b6db3a8ee94de81cc184c8a8aea +f46a64079f92f41ed4e18d204ad3178c +9fd0287c4e6719b71cc1af309c67078d +a23c528e82b29839d5a049a7519e9c16 +538cbbc6787ca4ac7d93408d233783cb +375eb2949b6c21327b7d6a50aa7f8d84 +0b4c4a98a372b26b6ca7042dd0baae9d +05848d1f2555bc5eeb8973199a37253b +72d0b06d1f7dde2cfc2dfc7182c696ef +0680d8bffd0e5863dd5d2e627e356505 +f81ced71cb9e7f589ec2a334c2048fd1 +2787a2bf6a1cb2a1828577b3e7775928 +7d3a2f310f080c666482273103a67c8b +440214f8e8865faf70d69b7389d1fb86 +a4fb174747ffa1dd6ddd48d426ab3a97 +a1af72b11c8551476b6ec342f457e827 +06be6081cb8becd26d70bf32b3b11afd +c95b4e60054b1722086b1cddd92dbf4d +3f8ff9b3c331a8fe7978ab78f03a130c +88f7b8f92dcad947b78d3fa700e8de5d +cd9bf27fd995f16089b4ff76e85bebbc +a03ce7e41b3e112c04a7e595ac4f6455 +eb3efe2b03bd0763f90ee357c283c85d +1ded3665f1d019029814c9810e4f6d41 +91a9906744566ef74f68dacd24bb0bca +832909b8e642f3c1d3854d01f9c2b0e2 +d32f3ed9f04218f9cd9d052c518c57fc +0ff1fbb669834fdf4140a48556054e0d +6821e6d78f1614ea319c095263031945 +e33283e07b084c1a5085ac060562b1dc +09ec10917d631ff61666e65c23dd90ac +7691b9601bce81b819e9c19d3ffaa8e6 +6f8e89b37ce140296e811bc3a7b86b10 +8ab4c53fea94b45aab724fd72a938205 +7dd99b56e2cc40271a4c48096ed7465b +aae9bad569679b18309cd2d99eea07bf +eac48c6ef75a0725227aef29f9e50c6a +25cf842676746135d64203105c7ee59b +358e9d449379f34ea2595a89ee271eff +7500b22492db74eb6e702fe516bcdb6e +a96bfae810c7e04fb75f7ea6b8c257e7 +b01319e09fc5ff89dcd767ab36eadf25 +05e5d2f9373cc6ec54ed136f2fdaa721 +b05dfe1ed952201e62645b3347b9d680 +1a4d8f0c9977181aded1c9d988f3648c +2ccbd4b95d98cf1db48534832b3ac749 +889c1fbc548e3ec6ca700232460da3c6 +65793565a04cc1bb3cf29fefd2e3c6a2 +ca5f89e3ba2b2c2d52890aedd44db0f0 +1148c8155630561c3a095371d3b84e57 +e2d31fa9d773884bdb049c71c6c044c0 +f54eb5609ab4acb1664260d0c80e2b26 +ca3faec4211129136df525f89aaa6b36 +40790e02f4cf0039d5d1aa22ee1b69b5 
+38d60a40ed5de9c45b7ca23f469f44f4 +609c14a8e14ad1a7dffc4f067319154a +08e92049604c227c934ae45be8a2e26f +74df4cfaed38c35042238e928ec4f32d +e734affe6f992226d9a03b650fbd9bc2 +a3d3f47e46e4ddde2e0723baf2ed49ba +6da9bfa4892ac617442873d13ec5d0b5 +5ab56acad16673f90484516789bce8b9 +880ed1ad4de50a70e4fe5ce53c229edd +dfde8a13f7e6f50abd194bfe9b738b47 +ab807d8b4bfb214a1334e074dc647593 +a5a87c676402093a6e1b117f910ddcef +afc87b9fe13ff1a8e926a87c1bbb7b21 +d9ca73e6bb9e8424412910cbf9f9b79f +02a123e4ceb95ce4390220c2bdcbd247 +dddd5091e5dc9cb115c57e4d579b0f21 +353ef879f5f0817d8590ed537a2e44b4 +023c8b1cb4ccd96a62c65dd9068d6d79 +cf19245cc3aac204a78f15a38b5bfe3a +c140f6383f4c7baebc017332ed8013ed +a017597df0d69d83affb907a76c27a6e +0a5c2d760dda6e05910599307aa0b03a +5675cf1151be0c3aaecf1795a46b495f +e3c311b1579cfaf5964f1e1970bd0f55 +256033bb7aafa131ad8e9b6be921f339 +c4342cdd9d2758c785d414cebf4faf7e +6c300a3051b11ca05e986e0c4c594c42 +adc4ac4e2b85552285e4d4dd05cbce82 +708cee22f7e68462c14c8edbae36113d +61820b829aad5b9b000c753cf77ec1a0 +574ae0db872f9a1d07197c171869266d +607890048c5940d41a571139eb83693c +56ad988e585941ec857196efdc0c550e +56063d5ec3fdcc63f5015188d22976f3 +e03eec0a3e4d12da1fd5cb2539f5b584 +2c878388194b4c94212d93de4bb6eef1 +c1e3cad29d12816afe6fb6f038d88042 +82cedbdc67b024e83a05a171505573f1 +d6c1ccda768774de82e87a55414c5570 +735160c5e38171f547330026a7281821 +cda5615653ca15065a395e4194edd732 +f1126686a4279407b47f9b1b07988923 +046b0958897f878d302e986ab640058c +3f9fc8d29eefad9bb56a48dd1ba992ae +aa2be2098e1ff63d72d1a626806c1ddf +b1b33f2ef98c3369f81b31eaf253db50 +406650193cf0572ca1dda51669649aa2 +756a2af9fb9778cd41889e5c91850ddb +7904303c30e6d4460e45d0aebd561933 +bb94f946ce59c6611067a231c5438694 +152245c577e5b98f3025e4b57cc99106 +414cadf90a71921ad0286c62d4ab5725 +12c51a50cf805a4115607995e8253984 +f696039ce93351d3fa4fa96362a1c08d +338edd3c6231bc04cf574530fd671c0d +a64fcbce82760e3b5b36f721ea5158af +a964946720af93ad890a2ad224520acb +5a999ebd64ccaae833119919811b00a4 +9a8d46d34c1cacfb564dc66b963e0449 +8fc29e7ab726793d3a97879b9826434c +fe694678d3db879d508aa06b31d9d2d6 +e02717481fa370084f43eafc351e22d0 +86fc1b459c61c9f2febc2d5701a81a4b +59eb3119e7de9ccfd639a4061d100618 +c99726cfc09110037652d8d2b392b959 +b3a5adce97452209c80ad94babbb5496 +72c73b0c19b630bf45cb01f81e4bb945 +9e6d6cb8b94f6e7c1a20add5406bbb8d +cf9ad3f63bb3aee39e53548d4d7426ba +45f078320e8d7068dd36f05e6f1955d3 +a5a2c68bbc8054ea4ee19fca1412cbfd +b30134290348ee2f141010d39dec7aec +6d8274e0dda5c19cffc13b8a98afcd39 +94ab774fcb977525845497f55149b6c4 +1cb162e589a80f0183aba766438474c7 +47a824e01143bb65405d9a11c07fd553 +bc0d64598b0cbd385a67736080709a1c +8d2eedea91ceb521eb3bcb7684de8793 +41f355c378dcef03f0c1e39389981cfc +380aa215cac31dae74077433f1051cd5 +95db93fef3ee314592f472eb979b16ff +f24c9651891fe7e04863b5f8b7911274 +90e4249635cfc1fbf866f22a3ec386e5 +e2348824203ffc5bbde01afeea86e329 +3daead7ec26a777cedfe78976a8a2a49 +4e9d98a6710f23bcfc22cb39fd776dd1 +2f8557b74b3890879bde943e77ae45da +dbe46ac02048c73484c12ec46d4edd18 +90989e0c78345992c10583e2812fc2bd +83a49f4e2a7b66e0f2f1f76ec966b710 +903e7469e3341959963521c9825dd302 +6ac4f0fbb52f7b196f1e3e105a24400e +737fa0f52dc0b702e057b069a767a608 +185f8e8ac7be0959d6ef9158e643f06e +50ab4caed7d3fcde64df3c0cc343f0a8 +6fc19bf5cc7f39a80a14cda9037fbc73 +e19adcd0e844cdb78dbbbbd11c7e53a4 +456e84c02c602711b533be9da6bb8c2a +b5e42c2517791ea18e1497674376f0c0 +8aab67be3e61ada3205a9267f7e818b8 +4d0dbd07d699c1ff6d9f4c1b76a69c35 +ed90428aba89634a4463015a91018d69 +ce27ba87e9af039e872025305abc823a +d8ef3c33bab7b650f4f7422b83c79831 
+b6c358866b8fe4ee1fffc9cfe4510a2e +453db193d040bc53781ade23fb36501f +705c40110b2dd5f175aa5d324ffa5d38 +6e90916a1765971fd874bc9c9fbf112e +9da9675ca0c6bb2db8aa01076c27b262 +a6a494532762016be1052d5a14c790a8 +cacfe3e3abfddfb9545ac8c30d8b1800 +5eacfa195a9eb6bd7923ca7dc2928cc9 +3cf8e944f20da8bb1df90b0a0da9ed78 +c2fdfbb380e7b5fbae0cc0d2b77f38dc +cd950c45f1cbf0cfc603ea3a7b74b66f +d41d8cd98f00b204e9800998ecf8427e +ff33efbab120042dbe04689698e44ba7 +70f342b578f8cf78c5215057fea35dad +1e6e89d5a14d0053aed373c4a17163c5 +f22f65c85c292e7d72b23b98e671033a +3c8f6ed546f75d93a464c2d24896f690 +ff974458b61c5e85f27f4672f40e81d4 +dff193d36b212dae756cb8a46a47c97d +70904c08430f68bbc5b76616de6b9ac0 +63caa5b307e12d7a0de391091cc26c86 +44532409744739852699e849483f7b7b +7db257d129a57d06f4ed1509860fe45a +b5a54879e861a671d8a845b374dd72fd +ea1923844474e03207e072b2d4e74837 +e8b97c4f41cb27eecd3deedfbcfc357a +c42facc894ee9802d82c526e27f43ec6 +90c950012013e3c39301fc01aa7c3757 +7aeb9d8a609ff974371b67ad958db286 +cfb5970c43e854e1ab1e144799047d8c +2e0b3fbc7bed0929274ae29759f017dc +50d6ad630082e7234d972a463281987d +ca82bf1f35ca955a4ec59fcd056eac29 +7ac6cea0f03b7f1bc45466102664fcae +88b2df42fe4fec92255e38e0a8b0e6f3 +3bf240f0450f8d4f375dc7edcaf743a9 +4a2d11eb3dea7abd43c1bb7d82e8405a +4c2fa28acdd01769e654df3ebcfe9424 +0331e427ee466e4c2fb7020e9b54418f +7d004c386dacd8976143b299aa87d15a +0cc414df224c1d451f89a7f6288baa6d +032d476b549ba74f2ba4b004d61f1bd0 +0d0545f6ca2f327ae748318beb4f81ac +b80c0dac858d9cd9f83b9eb856e427b8 +18fa100c786a8a9704451c4231c29d85 +15c6260ebcc8368dfb7c0f610a361f39 +93c228f8f11f742266f4c2589d97e0d0 +e92348dced75306d81a67d5bfbc4d1df +b4756c86e3b41ab254a81db1d0ebdbdb +730acac3261563cf8ed6468f1e9a591d +b8ca05026e2ca7588076306bc6e3920e +a43577a34832be9effe6aa5cacaaa52d +9d5c63cb1a857bd17bd8157df4597452 +0d26bc126979bec59c521e25caeb32c1 +1a212f1c5b6a73487704c5c210f7ce12 +2b644c0b31161f8f068c2a3b8ee40f51 +07997e63fafdcc0292a1317c1b8b1a59 +1dc338feb4a0320b8e1958baa33aa99d +18a4426bf13db23b8ba8a17a54b1bcd2 +5adb788f366e74faf239f609edb2d32e +23a59a79260a8f73e7baf1e9d9b4f85c +534b2861f13faa00697d7ac04f49ce12 +15cae9425dca7f7b4585e3644264224c +6b72dcdf04c38bb6ca087a4fdbd72cc9 +f1079a5b29f0887835e41a924d6f5e8f +c327c9639534b82628718b30e3192666 +253c6390b575db045c39c503a44f91b8 +dd054329d45f0f59b1a26734fd110307 +0bdfaa829e3050a3b9935bda2bd49e4b +3725be0b61068ab2bc0b53b8b61dd7a9 +f91fba1534bdf761fab5e68238c94867 +9a6a4f620b0152d92a3807425a5884cc +b466e2eb79b57294306f567b26b9723a +033cd77a3e257030cdb15b3a7fd35818 +afe38aedae3bc9bcdda9d92d30f197e5 +6fd5223d5e351d2a79f639c1ebbf1733 +6f254fdbc8514a496c8cd5f6e9c380dd +a9595fe6d1fd75c2c960c6cf76aa3f74 +d8f50a13421e6502ab8059e307c4c596 +562ee743002470291ec1e25773bc8326 +e40073cea557d9eaa402aadf41dc3eaa +b3ec88915a4d7490e87f1c2054d3362d +430c79d3f5593198be390014f4bad9f5 +79c69cde5ef1ebec99b7f3f98b2910e2 +9e70b5d7325d01be5bc13f399f0b28ca +6a01a537c6271340be4387339ef2d976 +d57b4fdaa0488cd8970f19dd744b57ca +3c5e568f6ef5ba136cb50540fa7299cd +1c780a8491fd29e47a3ef511eb3eb4de +177e4af50aeef6eae988ab18e51cb9cf +7ca86dd32aa41d1098e6fbf27683292a +34a86206890a162015174e0a6b89d1d7 +d86033f36f161e20c3801ddbf1d19d58 +a1faad53f5de4431b4b7b1fd82bcddf4 +37cd3f95b1bdd711a8ea716d4a86c9b5 +de847de25ba15f7915b4969fd2a9b97b +c2867226af02706e502c8595f7381869 +029c8a296d220057e3ce9c9c92f93fde +17d2a23723cfe20e5d299871c177769c +dd14886af4ae1513c1e609067ee80829 +3774b5be9b1424d88514dba6c767887a +44f952fb9086cae3879aaaf0483e256a +3689fee84120ac6f1cb42f98c8b8dac2 +8cab110fee5c8c8ffefd2b6905e42f26 
+9b081bef83877486ae69266d9826c5b9 +36816c5224bf191e293811ded898987e +ddf258380e52ea1b5873f968cba55e67 +fa3480150eeb79485d5bc842bfd7b5a4 +d64bde523e19affe349029b731c6f647 +4bfb63ebaaf0246944288021ec50d582 +f079bb23f03a0bf5b61763cc5d1d1a8c +0b90597f59ee7c3653e92928c3bc190a +bcb98f7ba5211aa5a699036dd2ce2082 +fd22e104fdcf589c59992f440ba9a2f0 +9341a9e306d3f6bf3de590ba7bba37ee +a22fe2381331e020d7de65a83eba198a +7eac6624358c55c06cabee923d4c9f1c +c8cc9503c2667c917f7cd655bf79b436 +8db383d73c5600d2b57819e51f3f2c37 +451688b39c1c99fb045c204187e7e30f +3158bb5807fb7016eec25143301a258f +72869eb5a1397cf2f715b9dcbb1f7d41 +239f5023f135782e7f91dafb0e90be65 +109fa496a6737d9b680e9fd87c9c1d9a +4707ea3db99170bafddf50cea25553b6 +1ea18cba7c4132256a4b2f47ec107191 +078cd412e3406e0036b3e3d223ce9b1c +14772ffc2c6f0b3720d39551e1d96cb8 +8bd732f36881d8f70aba862fee14b29f +ec4c806760819870561925cb2660d86d +9066e7caf07cc2ed5403147c7526a499 +16c654797a2d85161ae8003eade31ee5 +caa1037d097faa45a84280d22aa5d655 +de0d6178639afcfd3d9bf5000f0822a8 +fa10ccaaa30639764300043f5feaf54b +6a7ee83b8cff29f8f9abb5a592da9f88 +c32132939f96b76dd10633bf12aa7a49 +57c2b32bd47eab3640346aed507c2491 +80166899b013c8be1cf36853b5bddb6b +3aac40327e85bc0519b83a5a17470915 +86fc417a5a18993a370febe70ea4a546 +f7215664a9650be943ec1bb9dd8fa350 +2d5167fa6b74fd6d497fa23815268888 +b310268aac98b05ed4e5a1a3a961ad24 +4fe88cf001b2506992952b793c77278f +3dc5fddea53f506e1ef0305bd2e87051 +e0967e7a5b2e4d163c56944424f88d61 +b1602baa6fcbb0800421c68d3a83498e +1c457fb348985b240dafab3dc4f72b9e +7d7c15112f744fe4a0d0258b20a60102 +16eced126508a202434b3c1c26622108 +a60fc676a2f1c6f85c95b8d87646dcfa +08aabd6ad52d0075efcb83d68aab576c +2e5678c141431ff32b4c91e8019f0a78 +dee19c289021b5842e25f3cc16d626c3 +7b8dcf0a866dc568cdce2ea79fc4505c +3caa24b82e6bf8f482296a4ace25f6cd +9866b0bdaadbec9d37631c45ec8311f8 +95aa113463c31d765a2b7c81fcf3d391 +c9f8cb4d6f95f0e5a218e4027ced4938 +5e3ff845ecc09f62f639610a74d4a54f +a82e9d74a83aff0e8943c378bc5c0ffe +2cd593435893ab72859de8da5781b51b +fc5985bc62cc3caf7c38a79b15cd15b5 +e790f80a1bc19816458e506bcd361656 +ea49686d4e8dbf0351f686154afc877e +54e26c7bf25175104fbb15101a7cb0f7 +0797d8a83a7947f73703973e8c8bd8c3 +e7cb258d99e52856c92451c9c1f81746 +5490c61da597144d1caad9f79e2c8551 +91e212691375a930a13c7277786f321f +8bd9f4a8d241502c519c829b95d73d4c +cf02dec5a756f5524c922b0fe6e036c8 +9c8fb32498660a6ceffd6e1e0fd018a6 +45b56b663ed8de4e27158d9506eadc3e +4ea11ced9a2957aaf160b2885d8a1f54 +ba6862d8394bfde90733a52d6ae9c17c +0ac88d608e266a3a2fd1138079738540 +fa747fc1b96dc352a1d71f60b42eb26e +90ceca992dd6b2a30ed04ae29802cb48 +de0348b48e0d0e7064a0bb334bcbb079 +c1d828ed40184235413abff0a06520b0 +d8a6eb65757a8e7aa19215675d8e2023 +e3580d315e22014224d8c0974bdf8860 +5f45f90b3bc0124153d65915f10aeb0e +4e055cd5fe24bf1b05f09b6df3751ad5 +ecfef0c79ada5e3fa77e8a8acded60de +95e7989346654aebbcf054f9755f3774 +feef6db82a960946647de533bdc7410d +1a60d3833bb940915588b55c03198c77 +2f628dc62617cf2954d714adacaa86b8 +93bf9f666635deefc1c493611d303fad +43b15b19d991621fc6c6a49af363db6f +8e12e31175338bc3cdf466da25d3de38 +dbb8ace8b7cbf921a93a436b96554dbf +3b5e0a2a0ec1525523126bd4e8e11a96 +b0ad29663676d1a59b8bfdd2dda1f6f0 +ec7238f1e8ed0c035036b66f8592f20e +db1efcdff9a32edc294af1eed58b03a3 +b6dd0c4c5acf9fb48d862fe8e7f3ce0a +ca43bf2e16d1840bede8c665ab463457 +56e482f4a9726601e1185898abaacaf9 +5272e9949a8249ebf5547037e2cb3e08 +20fd0266701791b8e11643f77041201f +c6a9195bf0595d6a67ae354c3e7d2aef +9f3611e891706148aaaa53d83ebfc178 +0c447ee35ef08fcd29e382149c5b9dd5 +a957fb89f83a802e68958561756dcf5d 
+8ef04dfda76ba9d088c4eca1fddcc117 +49d6debe18de88b085342c7ce49fac62 +2814ce5e159abcd509800a5b1196ae3b +08278dee2a41b3d0ee84d9c265f19586 +bf73d722a51cabb90f8a998d6ea4040a +f90c8e83c6963f4f51e82ed7f4a2cfbf +e5b65da75e0480c44770aad6d476f58e +05fd647846ee8a280243842587f39474 +0d1d8d5dbbd925a2aadfe683f6da3b18 +2e2e35d5e8e0131c0ac4d88fe454343c +9fa2beab761c282397d90672b1cbb398 +2da1f7145ee1bf8924ac4ac5bdb1f4a3 +419199ea74523b9ed17dbb1648ed28ed +e55eca59748a6a1bb898fb8ce1d4c15e +1b7da8335f49e49df9a2c52c40790327 +c425165d6ce2cbae488ab746a3e57de8 +1c23a5cf6975f35becad0eff673f85e6 +ea3953d27e5217846201dbbd5f2b2340 +3c97cf09d8ebc3c1c07beee81c1a6fa8 +499a8ded70659779616feff89793f565 +092683bbba1ec542c7cc72df5b4fa583 +86b08b93e56929adb5c652d275521b14 +d3f0a91a80c58420c48d58f2c25113ca +6c43413ae206aecda0596478b7a9f96a +0a96b526bf06d956761e9ea05c1b3f41 +7afc9b589e0d1786a82fd908bec733bf +b8941c56f829433b3cbdc4f7bc24e214 +ccac98329b27f64d37d90b104b28fb07 +d218f7ac8db91e1d0180682ff6cd4bc0 +7096c2742c852f88a5275881d5300440 +99de6704481850698939d216d10777af +15da68285b95c13d60216d6743a94360 +f760ef51bcd38235a394f9c548ef6df5 +6bf318e12e78f988eb388af3f1a84781 +69c7636383f008f4eb73e6eb90fd0fff +f1bb9b417dd88c9cce8a2e13a587f17d +616a9a32a97c2872d2f9623179b7732d +1c00692888cc52240b45bbeb02e42533 +bf125a8ef1613df1ff6fd7357151cea7 +4d29e012a286140624446c1f3e5d0a5e +5f2fed43a2972f7647e9a8cc22a63965 +f6cdf263080db734b9f6b3296cbdb96f +2f0aa1a95148a8cbb39d5962d3a7f956 +977444117bc451f2166e19f2eae87711 +5bec0a3d038c8759e6a3de9329c471b2 +6ebc21791f50481f53e6af1321af5aa4 +a2c6cebad1b1e89a81b19f61a16818c6 +4aaa4b61619bd3e49ded582bcb04f2cc +be26825009c6b219ea839f2d1e9079a4 +ed3245c98b75cd7b2f53b06bac891b59 +efdc39014880b90b6a66cd1385645821 +1a2246e167d868092caa4459e102b528 +12b1b8326f339170ace9c4e03435af5e +91959aa9becd644524ae790587e89b9a +16d33ac8c2d554528bf7e87231ab8f99 +61eef34a92ac7138799938baf78e772a +251a72c05ea753399b797383587dbdbd +6ba33fcdb83c4d8e8addfcf55158df28 +fa9d15520d1d873b5612bf86c8ed44d7 +7c4a6eb1d498807f9a1e6a71c2b6c492 +142f897437c3f033344101248bdcc22e +c898f59c9cd8d85aa7e9f39868a95fd0 +8f76186d9671e4da29a50583e59e9a34 +474a3c7294aa1239ff805a28fd00a54f +994536ca4d25fbc70bfa7c77c6b238b5 +0eef060beb21f691c8ccd5e85bd2957d +6f0f90a300dcbc0898b7db26e97ac16c +26ccf6c56c2545d40ff8ac2531d09302 +d9d1acc8726086e86145f149d1f6b855 +8dd5210d3a93ec264893c34b8d8934c1 +f012957a14f56c3d16ff9f0209adca86 +078a9688326b2cc4d2628c2c0b6e7e81 +80cbb15b16c0cb014afa11ca98b71db4 +80b28368e34505f5f4eac63d87bdeec7 +1c9b58467a42451eb108a989e6ef1349 +ce6100a876a846186a1d0004e773c370 +220fb70889c721450c5b3a35c68b88f3 +b4b6914c3159a44f251b58ca677f338a +b24d98d90403d1bb261f2476cd78b691 +851bc4076288ea6eebb24171bd2ca2ef +00b3a949b697b7ed9edaa02bd8d45b24 +b1a483ce3f0a4da59868172b777dc3ed +d7565e3b5fed1e20410e25a83c6efe1d +1f1b084091fe75ebc1d753c4221109ba +4fee38393a2f7423a4f89802e8b2319d +90a18a719b9193fd5a95dd4fec64e6ed +ca48993196c114f4299845e0df27be46 +46ee10ae47a0c99b415c2262e54fb838 +e8e583758eeaf475b755e035a2da9325 +c4919339b4d0b03d5ce371b60a4992c2 +885ac6fcadd795685737d59a4e6b4331 +2683ae1efe0e73f14995e4343f4534fd +feb7c9110ddbd883bf2f2b4f739d11be +94fea7f027d4fba8e07605526585db32 +db555499aec3a979f6caf215e89dcfc3 +f797d3e82678ee9f08dd2bf6e09cdbff +29e15eb19a38b9e15c6af46579cf32f2 +d11b92eebe043183bc8b938c5f4faf2d +7d313e3ee957133470af1f05eabb7826 +dad1f1ab077a220c724067183573f6b4 +ed17b3f8209a47e9bf233e3029f0cd21 +70dbb254daf4f6df9b013054580481d5 +14b1d4e18ffa098d9eef5f1c937506d8 +33e6a484a2d706de975556fec95d427e 
+2f82911c0ae426d8cea6e3c6b74f0081 +95d261052908e744b8d52d263c300b97 +9609f792a152de8256557e3406dbfafc +61118dbca9f3fe66c044aaa0224e67f2 +9e83fcbe240f89d82b28675f464cee48 +3dfd5e05da9c33044df768e6ceb98376 +0806d3290aff9149c54b8b103cfdad05 +dd2b60e8360257d5df4d45d5efc3d786 +bb533844ae531e7261b3625db0c3c8ce +0dd5e25bb7e7417166dc23c19ef26159 +bc698f2deef6e28a5d3cfe8e48871481 +3683531fbb9befcff8169e60758c6c91 +4e2a59075413f4fc46e785af2c838ce9 +9ce1272a778a9009399e026b47018ebf +cbafe6f7a49fc5aba7e4ad691b405b07 +42ead541d12b50f725a978da4bdaef2d +a9407459062830dfd0fe6e8eecef81ef +6382808f3faa777319896b528efb8bea +5c3e58a40390a8db605092a20ed53ca2 +4b2fb951045775f23316a279b0bbef47 +e63084f440452f727625a23d8f335e56 +2a555e24751df641068cda10a877c0a5 +ddf1ca1772f6cb244823225d7ed90c42 +5dcb2109f69f7a26d7b55078b173baac +4c6c9c2d627a40ad36c959beeed408e6 +187c9ff07638f7b665151a7e5e4e9dc2 +d41d8cd98f00b204e9800998ecf8427e +f3aa6c01ec5dbe2599b75725b271a79a +142449ebd18dff65f1aab8f8192a942a +c366b57c308b322f70447e6f88b3e33e +6b89ce59e1a3ba75368ecea9d1594f03 +d0989b54d641fc6b28bcc2a7378b4adf +ffff8918d5775c220659969b73924b16 +bcdbdd9fc2870bb2899d3a3c5173fbbc +04ff29f455142a4853e1adb275c7ebba +634410db3329b20e085e006256da16e7 +a0ef8c3a505b1ebb6cdc0fcd2ec62e36 +108231dafeb42a3bdba7d7dc9519445a +271b8188a4dac268219479804904cb5e +0bbb305c78b25e3f1d30163d4104588c +c2fd2c2a51854f4c693a9d4bf54c8776 +24fd3f06dd2b41757da251151d4b81b2 +9a81f4af298bdbe8234e7a35f37952f5 +2420b84744a62b7cf3eb9ea90418230d +b3de425243069a681393b77f8cc80985 +2d5181964ca0d5afe76f294f6f28e516 +48c6756554b84ba1a5490d7ae07b34fe +864185a7bb32c100dfbc84eaf21f4b68 +e62f11ee366fca4370d84b04e5564d9f +80f8df1397262bfff5e4dda5c5777a53 +6ba0d26132f1cc0917d69fa510627e2d +0d7eb6b4a6ce2761c281907bcefcfd36 +6f83c35ddb6fcafd870083b32a8debe6 +0c4db591139a3e68725db013feb807fd +2fa548e84b8224ebcd869d1bdd6c5a69 +c1907774bae947e0bb9db7a6f220122e +a9401b27d3f70ebd2aaeabdd35f7fcd8 +12f2b3c51391e96ab39df2b7dbe436a0 +34913642c51f8251c839f87143364832 +b13b3cf873a6d56d69c645736d712245 +60ee6e24b7d1a633f449d19a048c411d +f28ee5e510c7f79f139ed4083c8aeb7b +1f62d75294b8bdd1660e68b13214edf5 +e8efb2f3f7140d80f79835ae5b7fc514 +89071324382ed077e6ab43c4b880b8fc +1c12feb4ea1b5335785c89375e65a668 +831ae893fbaf63ee39c4455a6e042b23 +21506868399acfe01ba486d3b1bdee60 +47c56f2bad3a085f3f478dad63fdff1f +1dcc3cdd8ecfe29d896b428f1abbcbe7 +df3e255aa8cd9414801bea3a500862e1 +f50a0a6cf186d8349c2d88cda4544a3f +de8dad8af2e5aad23fc02f5a84904589 +bd044a5c4e247edf61dc9ccf4be8928a +e625230910b6f51ff2b3f642f17d25a8 +e37dbd5baf7455e23fdfa2ffa2a46720 +0ba54194a61a7318e891392d316f0b11 +e7b6a4752f1b1dae07f8833748876607 +2abff47bb28de35c0c471b29c44bf403 +1a7b18c0de0259a1c2d846f728d73d25 +5dab3e75f0872cfc585887c0611d1ff1 +c434557a11d8a4623ee8437567bd7f0e +87e3b118384b3f0e136d89ed781642e8 +ae9f8cc43da1855e0ccc345464e74f41 +357f3b6a2c5a357fae9cfebc82344948 +9724a901b21d49c924c2337febc10229 +f2345ece6a5e6d4c84a7f16c72ba454a +ba2e4a20634b2bce1b9a444a27b96c59 +fe404fcf27d2e3602a4d85e4398da909 +f5ad70812b2ca5c964a59ea725b275e7 +98083f8d321a41302d20a6e5a2507138 +90f86b8fd8c5e74c82a609275e63bbb2 +442f94169a67727df498317a11235cd8 +04dd558be675afbf5e02440ba3bbabfc +6d33e353ad14b5dbacb0d7a0b003da63 +7451a338956970d7ba509daceae045e2 +8119009a00c399deec1cc52328e2f96e +355e5dbbdfec594c2b7e76436318752e +93fea000a851f8a86f07deb591bfe8c8 +b34be72ecd855fff514c8c0f82a4b77e +3f18e77769a6811641cde1a49ceb0281 +fdf3014f93fa8d10fcc26fcb9b2a0ffa +ff23bf8b822e914cf728644673b8c973 +c37f610465f218cb6f067a49fc93e065 
+0a97e2f533aa1a79328974c5e6bb173c +ed1d9e759599308aa24b2bb999e9d72b +6d4190847c0a8c1b7fb9db734b4fbdf7 +86bbb8061194cf439cad467ab983245d +6ca3e7398bb35ff8acf98c042d9ca94b +a03734bb5ab50c4eef27ba8c1717952a +8da24c8f0efe99a0c31b59aa95f80aff +499ad0e659a5621ef66241553841abe4 +e12af779fffa25b09c4c4d1388141cd3 +64ead50a3212d45265bf37be74e9d5ea +f97b640efbc60141bee93d84ad68854e +d99eb73aeebd78e804f7ce1bec572b83 +740089b5fad41c6a239bc546022b05d7 +3acfe08b1e5c8bff5b427c1492633510 +3bb0fb861b170d969bc8bf7e14d1429a +1439b9579069b0636fe8bb17737d1716 +f7bdfb090aa901ba166742a237b05e83 +e938a3eff3d577429fc41a04844b6ecc +0aa6bb02e7927af3ac06a33c710490ef +697319e55790e19ec8438c57b24b5e7a +231972d00402faa8c7c40fb813ec921d +5a483fccf0ef07971481418d88ddaec1 +a7d3e9917d3c06a66a65fbabe838b4c5 +0492bbeed9be1a02a4733685f626873f +bfd61c4b658c68e7cc1aa7e107988ba5 +e5fa9fd7bcc0fa7dd0262ddd11de1317 +2ee228aba978a4123807f55a9ab1b414 +552c5ece015740f96a64135e1d6de33b +054037480937f6ebebedd5caf92abb80 +8854d22bd8cd934723b63d84c4452d29 +f2e15e9ba538de22ba3af4a29c1000ff +e9b01163e81ebf4015602e8f86b40ee1 +ab36aa890b88ed1881aa096873b2801b +b2815dfa95f1e7a6efd378fc7efa0720 +0309c8ebc4bc39e2fac2b037d4af354f +e8066a3d440ea82d9f3a5caef08e2cf7 +988247e7c621f92cbe771c6c8fbc2541 +e6196a68e9ee52bdbcc0ed2ff9704174 +8aa9b9b7c12787e2ae9441074875c86b +20fb0c2dd447a391531336f1cd97e6c8 +d9cdc756377b3b014b6adfbee06e63a4 +92b396e5c9de8d1483a8a2d09c389c6f +aa8f449cb93441c4718ac0ae48165a0d +bc09755812637f8faea35c9c0e93a52c +876d042eb3db284b6e8736f138501fc7 +18f947de72a1386caf3b1b5584339331 +f97a259d83c0da7a45e8d7b74c53a7d6 +6f25cdd694ec1da745dbbc4bf3db0845 +e1460c42f31f62a9a861f78b20dac1a1 +1ddf846bbe6c64ecae8f7ec2073965f6 +50b5017b627f5f3d9c40a5d45f61be31 +9fe05c10ae1b3919197a50b44f4e463c +305a466ab7289e9f02b831ec457beca1 +8a25495bc106bb5783185dfc5d4668c0 +7469dfe03b5c1112c75f1bac09dda561 +fa08b155cda03bd11fe222cdd799d70c +2af53039b7e4ec644fb444f6dcc4ec5d +2830492252e54ed3ad952826fc2c9f46 +f0a77e6a35c182efe30210b08df7c86b +0d65759e6840dd304c62ebd4f6a386c1 +723e949fa521a7ab5cdaec751d74ec91 +3db42f0b5c3a17754ac46199e186f2bb +812a5bdc4dded8bffb972264f4326174 +d15f732870d9ec18243a6f2914e6a723 +fe5d38397478635e85b551b09ca5641e +28c546f556225098eec3d7a007d1690e +01242d7f40dc88347a73de213cf91cbd +7768b3cb945856c20c2c5889ba562544 +779aab9ee36fee72883b7337a1814b1e +8378d43633087992ce374ff2cce175ab +e78b0bce6163799fd68cd94e98fad239 +d97b3951fe760bbf9ee10ca41cdbaa6b +5433c6080cf839e3e0b1341d60d294f7 +d8c13101c548265e4fb7bc250557925d +b5ef614945ea8a8ac7cbcd41e0968088 +be48f282ef2aa7a46fc604915239d81d +51bf70c10bcbdbd12c0e67d350607687 +98b05e123eeb54b0e20ae42ea95fea8b +394ae4d68c58f718128cddc8f0a6c843 +a009b82692c0f5468a4c08d0ed82db6e +7a64567996b02733f4c19a90b49d07a8 +e3331ef1e2e120cb8976182a31a7061f +d45d30b74df6cfa46ae3d08488f5020e +6ea7eb6bde5619e73ca6fefc00e17055 +afa954ce1e4757dda7a4e0670eb97b38 +3bb7be80c90592320370f675b5056c07 +1236f02c1623ac9fd48fe6da70a26c26 +5cf37849b217ef29f74dac3250c02e1a +937ae29582bcfa64a12c826066c9d812 +ff63ffa30c41805bef703a2543e071bc +bfabb4b0cae163cc1b768356605a156a +ee7196d769b75ef6461d4645c65a0da7 +d8a7f0d4809ab319fcd9193787587ca7 +0c9a200405cda487e3b138ae1c4effb1 +343b9a49138c4d2371e5b34d03807db8 +f8a6efc2ec9980a58a8afcececc6a1f6 +79f749bfacdbfd47d6c0bdd761b4c8cf +bfc943bb60dc391e4308e15b60e3a289 +ce1adb7ead0030b4ee8cb28fb1fbf3f2 +e3d585a95d96c6554b504634d3a5075d +910fa52238d5b9ae3692112eb6aa08f8 +02baed5a9e6dc1542ac46cb9f9bfd354 +7b0974849cb5986e3eb1f0f7b76e4771 +0cf1cdaabfd98e402dc780df7274b391 
+d85e0ed8f5758730983d917b27346bf3 +ee4d288e32e8871e4608b4556adc10c3 +09bdb5791e7ae9ea1f9479d65bb2184d +677ff5696d1a754e3377c9ba98118304 +b43b5243e017ad92d0ad0b1385aca7b2 +47c9617a00beb2c8f22aa35715dc0fa2 +1ce2451e8cf335f1009fb88959c04f48 +6c4320b48835411cf6490662475674bb +5a543cb17baee6fd030b4fb51b9f36d5 +e6af1da8d345e76a71cf9e8c42369ba4 +1ff81c6820278501b944fcccc0cb8b4a +c3c65d0c26e94db1442b077cb625b1b5 +c7bef846d9f77927441ab8256fdba598 +63edb66312d8f8544e44569aa200f37c +35857bd519269dfb93695feba6bfc1c5 +e706c5b7ea8aef20a215f8f20f2affe2 +a0d795e147853d603c7edd7bd8bc3f90 +f727c5719e207370442ea36ab4453d21 +6365e061a9913486641e6a2786dc79f7 +d3d9501fedca9496aaec1e7a92a48d67 +bad96ac8beb546c551aef2b447696b7d +47a0bed406192a09805b1af2eee52304 +d6e0c8b5810ad12fec48be3916948985 +a6b9fcb0f8b433982353165ae04392b4 +498ef691ac318bd33daefef9f88f0f72 +c13d3290c51e5ca605106239175cb0f0 +e104e6c23ce817787afb196336d484ea +ac6fcf2ebd909d344301e3ca1c7982ce +0653009326232267705ec203209fb35f +63b14c45c20289388b62b54a625ea2b5 +475c221cebe0ae0d5b4353818640982a +c2abf6bc7809cce21a757f4a07fa5e3c +ab9cbfa8083cdfc75d541cafd6cdb90d +76589bb5b62df0fad61132d0add75cd6 +72f5dc02241bc7155d9fddf1c4133975 +61258e77bc4c4fe7baa1e1c90a3f270a +f8fd56da4022e15c38485e13c0963999 +2a8a35a62e3a2924dd1af4aad10537c3 +90d1e8fbde4e21c60f726047a12e9e07 +6c2e30deebea01e69eadde1aa355be3e +c87415d9fb563c59e40f720069dc3e70 +037a2554560e09e5c327c813e5770f4b +26aa57ae97d01566da4d7c0b4c210ecb +cd47b5aca483e8320a4277a17127d506 +c51a30805c9b471a7b932ee825e05948 +1ddfdf8d9765948988721f01303c9085 +263f5ba791a278f05008147cfdf9bcfc +81c60ec3bcc1fa5030acdad46f4676f0 +e0a4de61e01e62e933b09962bf1fb4b3 +70088b7a7530cfe2a994fad4b595fbc8 +1f1c366f62fdf304c59ec1cd98024c6a +c395fd5c671c1f4ce2182a39c543ca1d +425cbdde578d9094e04488bcb72052a2 +75f68432aec28093ab0bd212ea3e8af7 +9107ee3a03ea5a390d843e7dd58322fe +af21e3c40f612d7c1ef67d5c07bd5103 +530045678c79d4bd35c9a7badec90706 +874224e4e2b58eec8067d5e3525387de +961958423e9da23bd01dd76fc1cd2ebb +247c8a7abd9b601c115b3211043c8d9b +2fb2dcc4d4098899b64ca6ad0f3fdccf +af3dc73289ce5d5cafbbaf482850f078 +34c119728aee61eeef935c973b8c08c6 +3434e8fe92fa40b9728838162cdc75f5 +10d6b3e7254230d2cf39aeea0f28c1a4 +860ba18fdb5d55eb45e57d99140a841c +66ecb5c57c3b80df6cff8c6febbe0b78 +a5084787e2e46dc302d431d97bc171e5 +80426bd8aa9430fc7a4857b6e016cbda +60440869b1f73ed4fd63e6a7e89ff3c3 +61a26fa42cf9d0e618dbfd67e1ccad38 +6139f6be729bcfb2842dab2dd38a575f +c3357c7d2f4aee6de2df65baa47edf1a +184104b3f11f0cd50d3350d26582bd83 +51506900260aed189f288363406c943f +ddf2c71201cf577471b9a989fb6f349b +204a81eaab768c4cfbc9ba1f4ab9d005 +2147057e7f10f7ef9a77bd217da618d5 +ecf6ce40ae41b75a0f2315cda25f5ec7 +8fc47cdb49d28177c61e3d119e324062 +10609af110a2cdc751a7293d207ec2a3 +a025665e13c7cc2fdad9042069ca258c +dace91051b753440878b7ff1e00614b3 +49e36e18bb5fd2b0651787ea2d32787c +dd38a50a3368269aa07ecf5a60a51975 +84ff9c3da68d2918de3af11ffa8b3f2a +4a4505f152655cabcde5c576b269923b +380584f79a40372cc6a91cf974ac1c06 +c0c9ff1ea0c48c7ff6facb4878deb5a2 +d39922084dbaf1797dd97422955d0aa0 +9973194b709ff18bb0d782c81d868284 +69c711c14e3114a04c62c4dbebce81ce +f7335b252871bd0b06e8f8bfc2bdff0a +196cb4809da1393619583cad25b4f0a6 +aaa91b2eae89f95db91dfebd392e46be +f85057d526477457611aec5b614449c7 +4699d11508edfb43f94294b84b47aa8f +10745948ddfc67c633a970e163bd87eb +c8523e2ea785d61665fab0839f6e6fa0 +d56c53a3fec7b297c1adc45cba19bdaf +8b5a2ce78dbaed037041bfd1fe84ebce +96270dd9d4035bb228971631dac5c654 +64d35e60a30f4bc3c9956141d4fb4f95 +e47b0ae4e6a9805e12adc77e8cd7acc9 
+9e0cb7ae9850e6b4e7de10d12526d9eb +2e177df463ade3a4cedc8a112d93cc44 +ba866a4b9ffd13fe90f6fab7f3c3383b +645b43c5a9b3817b4eea25a256d8abe5 +5331da7f94b709e6a6520cfe7913b6d5 +4017b5ea0564812733f702e4a73450db +d42608a15b80199c72ee7f1195c137bc +722f473848215d7db3eba5ea69694ac6 +e84a8297a2d5cac717c94dd69ec76a00 +f4af98a64ba3dbcdd6379ecb8f41b34a +3806746e0fe98048745b7799aa29321c +d3f2b0d166258300dd1736f960269ac6 +8ce7e20a10c5d73ab7e1dba1af1e5e10 +340a8d50aa3d3d190ed8a79b755e805e +670caefed28ca6856ce807cd6006c514 +b4162c75b9b7df1289ca083d622452bb +4dcab2b232f8335d9274e8330857dc60 +65d054e067e16a293f3059ed74d93028 +cddb2fbd11db453c42a3ca7e81c8171a +7d258ccb212d73500f11c3c3333943d4 +f8b4ff45d593aa75b9816c7d0d647897 +57387aa18744eca00618a32462f3822b +5f5a481a1f27aa4d1dbfe5f067825ef5 +8f9a9386847a80f7b59a314f60c81f66 +64da136621f72e837b3eb0885aa0dfb2 +b3f66b85be0839adfbc5660ef49a7e60 +f90bf569298feda92dc1aa65411dc49f +da71cb7dcf4cbe0ad85b42ffeeb435ca +4809ed20e935f75eced58590ed2d8d3f +3c19ee35f88b2b3cd0371b63092fdc68 +fdb144678082117ec0755faa6a34576e +690c1858f63e4f2bfdfe4860fe44e0b6 +328d386204447e2560c8faaa1639f5c7 +9e181288f5475414a9f9d1ae436d4600 +76f1c23887a6db49e12586726c78d52e +21f3867b090f31c723b6d4c9de019450 +2f9a1314286bf211c0d23093ee3a52a4 +06eb771bdb3759ce43b70c93925dddd0 +d58488eb47c71b5eff02df9aa7fb914e +d41d8cd98f00b204e9800998ecf8427e +0648086666c128d0089160f0a9dfdee0 +f994c57e1cb39aae906aea4f33975d8b +f098b741823e9081a291f19ebf473a9a +d87663f06fe10ef317862625b40c9c25 +b11d7073b44eab456c567dd6045e07be +808d38965df1021dd2f13eda982de83e +4557863ac3b09c047d2bbce430a170eb +619a42a557ec28cd532c61f1f29071e2 +7e126a78cf2d708a00a3a69f262273e2 +fc07660472c7558e1cad6cb5eb87144b +9b24803b9ce6333f86ae47be74542dfd +7ae471f697e88456f302a096479d9458 +c5cdc325e91695304e182b656806b9ed +827422569cf39e0bf2fe0bc8378518aa +f9a38b26c0a38e290dca574b73ef45d6 +27158336f3a466870b473261123748a7 +36a76d3276ce03b2f4ca11702f74cb20 +4b9dd37c3533948252b10d4e7025f812 +67caeafb507f7592554f26dbdac96e3d +df17bdaa2793778d433e18a6364e8750 +06f65a6554393074661a1c39aba37312 +13d5faabd775cba15ba5defae38a6140 +52c3b2bceb34461afe900a64aa629c61 +c39fcaea4c296b7d58c4547b0a8cbd24 +da0820e1d3e00b9a14b106c94f4fb306 +a24e3bc5bfb1160482199e964050351a +942dd6b77d4fbed1c718f5f7c6f03689 +c254ee3084a69aa5e4dcc4709512bb4c +a1a4c8d63f3947c83f2014974bc89148 +f2afdac92f7447985fa928e074833a59 +cf81b36a5cfc8cea63f97e257d15a41a +f31f7d4a230dfca399f0747bd38ff008 +9922f3cd25a6a3c1f0aa43596bb0f34d +b9c1917c284ddf752c71f80f66a4682c +6c0bf5979e433cd2d1a20c5f153eac7a +90e5d795a9163d67028db89c56b6c064 +6d38d12048f29fe3703b5ac12bbfe3cc +baf8189a5dd7463645eb9fcb198306f0 +41a120aaad47c576242eff2e32de8ccd +09c4ac395dca4f668f1f2186dc34ee6e +04d0f6ae23cdda1b25f032cb42f1978f +22678114f4ac361b9b05a4c8c901c283 +38cf81e64d1c2e7f949aa2c408a65447 +d3840dd78c0f7de9ffdd32bd22489780 +c9a6014199b15e3208ae6760890ceae7 +c009dd63399e3bc92443a2392b007b51 +c7a29b02e3a21f71723bc1c54ff59fcb +ec6d424a27c13964959e9d7032e22414 +dda42e61b1fd0cf78586b1c409aee77a +a75d05b902337226a48940c9a8b8bff5 +d1b916cfcb8d0781a16ad2ceb96c4468 +f352b04e348724bf64a9580c9b7e7e7f +621fd9c6bc3cb043c2519fe31ca43dd0 +95ab391351c3631c5c8744be221ead83 +a6992afe6821ae620f42ff6d717d0b00 +d41d8cd98f00b204e9800998ecf8427e +e0ae2bcdace27b6575268e260e6b221b +c1f22332c4e1ed83d9a664ecd76a7954 +4851e752e58b29fbbe81faedf58d13a3 +2a18fde1a7408e770d7465ede302d479 +a00e59854a078d7a307832bd8f39658a +e8b892e3da2bf70abd17e3a2b004dafa +aad791a54b81fb58cd08d89b48f624ac +52225cb5659539810e5755d5f207dfa1 
+bba58efca6741643ad87697652936a1a +d8e717c43e4fd964ad108dbfba8c06c5 +c8baab65fb934cb07de27afed9e36146 +0f7ac57e0bb5e88887af92808f511085 +957784b98ac468413c945325d6ac8967 +b1860c6cbe604179e3423c76204c59fa +86426a5c0b3e4383f50a23b7d30bcc3f +5c5933db0d54ef05c3f092a5d2c30d9d +822057e2762d7ab08188c27cfd0080f3 +e0869876e81c429f90ca485b60e110f5 +7d819ad6cda8b4a7f50be26d989085d8 +4cbc2add6be514464ba807d35fe48eda +3ac1ea6bd1e2dc0bfb6ae3d976a60646 +bd9e9d0318ead01ecadbb585cfa55117 +5e03fb9ccd9a7664154087cd22806a9f +fe785def91bc5812417e1f7f176fb3cf +8e1a62bec8ddfdb7704d2423eb260fa8 +af4114dbd1188593e5687063e599c5b4 +ad49980574d954f07adb422712f3e960 +6e80c44351f7f6cfc0405e38a1324c46 +4809950c7b571496eaaef977268cab79 +43f3f4eac6e6145972dc8bcff59d6c2f +58cc6e8565fbd77fbc400cf21d0a0b8f +55dfafbb4223d6c43d1e9bc06a9314c8 +0565e89e6300429517a6b093152ec0c1 +697346db8126570fb5393c9814386c9b +fa459da09ce33cd4a2811580dbb567b1 +150cd1492c27cc851f9077472847fb21 +6f68752846917ef00fdfe2dccb5c215f +584c527a7313d06759fab4251ab5c35a +00dc258b49e1df8aa47ce9293ff67a08 +6f86f4cc068f0f62a5fa0c6bf76d211e +9bb000b5801422b5ee608e72012f3989 +55729755e432b835b1089c835d055db2 +18f1ee79d76d3f0df366d85956307abe +53f3cda920d2dc2a210097ba5fe51532 +f322fce1f120ec5d98a9789bb6cd0568 +8456512ab83188c5f8fb905f868ede0b +bb8d034f6bdc7704c0ec536f065436d8 +f2bad4dfb588697ff901c73f669fc135 +9e5e9fc524bdf647d5a8d9812c4a5cda +8305b4ed9b95978f04ddee4752c623cb +30ad4501be0af75c712681f7a0f276e5 +b78e6fc2d41d11e5c70ca75d2f413bac +11cde0858e83cd1d2e215fe3a217e69f +6e385a8fc3fc2a02cc99db852b049795 +99eb60517df2affd3318bafd5119e971 +b393dd7a7bcdd6f0b6f6bc13aa0031d1 +07fe8bbde8ca09dbbf4a8dd00061f2f9 +0d0074ce7bad7516d7de1e2bd599a535 +ed4a715d8a861028f36cc9b6c4f6d8d2 +0b112b73337adb13b4e8aff9660d2404 +e59ba61b6f3b71bd95408efa96034b31 +3298f58b4c3f210ca428811b741ea2a4 +e27332504c5bb1793a7a37e75e1bd0c4 +05fdff6fffdb2e2f71bdf9799ee46954 +687de5a52069fb527f4ac60c46f37a0e +6731d2779eccce86e00a089e0ca9512c +a63bad44ded2139f3bd5f68f677dd3ad +6246ac9e0ab70bd4d042d7d31307f4af +5bc1f149011d12c168dd46cb1271343e +6590be2b17bcbf986ba5fcf9fbb3ce2d +d24d5717b8ab219545ced355c10c9110 +1153a4122dba906e68237047bf8ffb2f +29722104fd05de8b5baca230614364d2 +3242842e6dd27d1a5e44f46c2292cefe +ad8c9b5028881955c5f59a6f188a8f12 +b6e3565bd87901ff58e105991bace0b0 +6c304df067c237b2120af1a807e93e3f +24c748aa592ea6274faef3345aef444a +96e5c49d99115b038e253774ac1288b1 +99cbac6a88c72f35d3306107193108d7 +3d6b0dc27334d8be3361f08928df0ddf +df2c6de947db829da9a96c942be7c5f5 +08f312cb45f32072a07d84620bac049f +01099ee933d30cf873d56b4ad98951e0 +90ce3942ae4d2271a8cd0849035f4f7e +d5b99ea0a4aff8b848b2930e6bc0a4b0 +7dc5ffd2c96e0a928fde40e2f2c53a34 +792b702112acea32319249655efe9953 +d24f0ddd0eaf04748b33c82dc7e0f371 +1e4a5cc1c4cdbfb41acc942ba268e56d +a85a138263ae596d391683caa045fa6e +62569230e120fbe4682979dff204c97d +258a6a9d613ce3cf4697831d4b1e59f6 +f3b7c168ff003863a0d545fed2309631 +64d9c159753ec2142150bc5562c148b5 +ea5a9df3b49d03e19790e9771d9f3596 +e427908e9623bca1b845f65955e6f751 +3e463c89ecfcf1a7b29912a01423eedb +b18fc1390fa422b9e1f1d455170851f2 +2fb8b13db5c1df73a82dfd61fb999776 +0496cdfca5a08f8c55fdb2bc2c584725 +4db7cc055d763af502bcab130d20839e +7da1b2e33c47688a77ddae62d0900cfe +0dfe5022dcca4dc669077556a8236851 +0be009ce3e02cfae0d725bfa9f909a7d +023c4d8e4702c8b2e8d91f2c56690e6b +7e10439a7c58fb3ef12b4efb4cacc668 +70622f080fc409096c8e48af5827525d +70d5dd70a128c30dcda5043afb04af21 +ed95b0cf76b798557bd600cacdb784e9 +b6b7ee3e4e45a3fd5295ea728d9bc9b0 +f6e17d4b77d602934170647fc6fbe2ce 
+be73f29204d15d5f7be2bbaf51a221e0 +8e68702bc070737d0ee11322cba20b28 +188d6ed573bcf5878d140efe2c65b599 +efef5b7352283dacdc5e6803a59539fc +b782aa2ab9be2cb09784ff309b29163c +74af6d528c356073bcd0be75756e7b1f +dc13fb341fb30b97a4ee3407136baa71 +f82a311e7c206c4eeecf1f3962dd8c8b +1903dabbcc421c1017353b683d62af2b +71294cdc12e4ca69f6708908ca8fe967 +3c03b01eae772bc600a9d844d0bbf620 +f562f4043e323031b0530f3883120f46 +5a4680c71cecf91f6d03a5cdebd52f85 +6d177ad665d3a5ee1dfc3459fdba2fc3 +901de888c20235ae5eb903263a6f4e1c +46c971e9578defe37af0bd7c43c3a6c5 +e9765e85c8dc81abdca1277e865ee72b +518259abacda6ab2d4c6b517af594ea9 +856be42075bf84e9b4aa84f75c783f3d +012ebd67990b490970fd005cf197d97c +d40d6e45fe0c7b737af5a6187a87be9d +b842995634f03b69f23bee6cbc9f0014 +1479fc466d2e08cf52f6383151915d47 +fe572e951a84575e512bd1a85589fe32 +3124dd3e98f12f880f0b92bb2dfddaac +53eaf2e06c38009cfa5eb53ede6f9053 +10c26f1e4a32e116bdf988ffefd05db3 +4329af3a3240368613a4ecc483ab4571 +46d2794296c60aae934108063e594d3e +621e9ecad38d58e4784e77fc140e1e31 +252cb66467b3ec133db4d608cf4fa5b5 +4cb0b4bc7cc5d32516e518562a061991 +657228e1a66879fcb03d968c9986fb67 +bf31dade6846a821c47c4368d82dff49 +18bb8af4f6ab896b903a45c8a7c4b10b +ba536ea2d3f21f6aeea205d74cc19b37 +561e10be3de08e09fa6c96d9f6416a92 +fbd2e36adf63e73cf896eab557fa5329 +98e0d5fc93ab2a427671f276317f2ac4 +37bdd2a54f7ff30ef63385033b064729 +ef9c499e9607c6759ffd200a9551236c +375e17d88441935ec2c992ec24685f30 +fe6543296ab10d74972037e8f5552c72 +1023735ff6cf539f801bf37861d7b281 +4bc121334989d486a6c4c65ff474587b +18dd4d7648282fe524d31407a05970ac +e467af24877dab4dc441197c97121082 +1375024366ad6cffd9f6cd134054c4fc +441603f67bed159d80c5f734f38955c9 +36ee3be10cf4187cfcee15f37b727961 +bf01e8b79f6efa36506692db62138afa +b2d7b48b49190a32b4c0da066d7f0e61 +fe28ec5fea80793688578e83669b27c1 +6de7417e95f336a07545938011dc0d3a +f250b7eb09d86c7e6718dd7158d9f3e1 +5e666bdcdd2fbdd82e7522e1cd2ebbbd +375134fa90cd496160b1f1e99ddab294 +00ecdb7fa92e2aad0961f42060a56bd6 +f9ebb5024bade349ebcbb9af9d901f80 +01086f9c03295fa58108ab9a15a4a9b5 +0cc9731c4649d012f4bd6b79299425ca +5b007bd03a5ac9bb6f48e1fbf1f4d26f +7b1182864287604735456c490e39cf75 +819b6d207737c0ddef74228bfe661d30 +34214ad9c1435e64261e9981f927444e +a09dfc836d0ae1ca36213d5e161e1fe0 +7997cb49f3b860a83387bb5641c9a485 +f416044d6c789dd8d8f59431b4239b56 +311006f35d2f842ac7e4e02e5eca08fb +5039ab340c3071cab664c686207baa0a +be62724c897eb805e3a44c3476716b35 +da8759e4280bf53324c44cd3a306c67b +d6c02512932f82a0278ca6eafd7c5da1 +2c50437b2c69a5349675c2fa9ca3650e +44ee565d7976c17bb41a30e70823416b +d41d8cd98f00b204e9800998ecf8427e +7fcb96fa3a4ddbf2f0903f7d0ca0e67a +c81e3a12f6927955fd0ba11212df8a47 +0c2218d6c273ade477607e27943bc2e4 +5ed4ada231adc6dd76761d9aa512ea42 +d13fe67bd2ab8a393d22177689bfca61 +36e804166661ba41cae09fa48ea1ab24 +ae4258b5b32cabc9de69d9a8b0b7da57 +830d8300e4395f30a3abd06121e7d194 +62bb90903c6e2b3c3489ecd0b90fa922 +4ea087f6181c3150801abb7b5afa36ba +9080a7fa27a3286f7e930ea4625fd8d9 +36b9a5ea81e16b5eef363ee4879ff121 +912b18f088b87a16a011d354829337d9 +145924a50f6b3779be89a1e08d19116a +0e382863055fff8fb76b1746c2d9968b +6d14ae50659c31685214924bd3c84d2e +99876384e2c3d43a5f98b11502c7c88b +da35d5e7d2d83ab09e1fed1dc432882a +7549b188e04bd2aaa94d8840bebd8512 +65380b0f1ed4705bf0c3ead4de7b059c +1d282237f7b80ab26f69692429e18187 +7b4af3a3dfc25897d104ddfff926eaa8 +b9982edfc688ebc670e9f84f398835e7 +33a2f78abb9f31149acc8e5d787e3087 +9c92ec053c55d9080912256fbc22fa4c +f10d823345368a09ba42786a5663f1f3 +cc97674cb92cffad1dedf58d83c442c0 +e6a491ff4d8d7aec6c2ecd014141e903 
+9f7283fcd50bf37de0d2707a37e6528d +b005380273ec22780ddfa7f5958de197 +9b250dc81ef3b9301fa3aca93f890894 +4b56a12bc00724cd04be3f362ebe027a +54c7793809694e1cad814dcd317fd1ed +84ecfab19f806bcaa5940f340cfae423 +ccdc3d3146e3f4e24e9d69d9ad6a4f8c +65166ee35915b4c188843207d8fe246d +94f5749f2f4869aceb7b4b6ab0ab88f1 +a0b8cc3ccffef6d1f9da37449c0139c7 +ee66ee32b1a1fcbc85eedc6c030bb642 +658f5abb9a034ae42d6d55a86e931089 +48b1f6d934ed2cf16e5c881173355b23 +fb2c0227043867204e697968c41d66c2 +ad1a97d11d7612e694ed31b1b8f07bdb +95541b830e0186e7ae273e8ef7736085 +e7937f37441ab7fa0a064f1b5fc87baa +9291f8b38ffef4a785400add2707ad04 +5fa45d8b0b831b7381b59bb21734c700 +e624fe10ac95cd45d559dc7d4513615e +aa656886c1c57c8db9b244d57c7d8048 +47805a227b19ba2b0c0a5f50a909a88e +37d882b46fe7c4042bdd542be80d338c +a68f7354fbaadbf3d16fa7b9db5a63b8 +0229081ecbe1d01bdc3726f02628cec2 +33820f87647d03fceb48b3b3c9fc3915 +efdd161c1efdf0de05642ebf7f7532bb +a9bedf9ebe757c30da104ddf23501e1c +cbf1a89da71cb7880c06febc2347274f +8b16cb20bca31d20f97324538fe0fecb +ce205d4bf5296b0b403ea4c2498e910f +e73b63d4ec9cc2e312e1b81fe6c4a266 +de5fe53cd9b1fd30cf8ec18d3c82320d +f0049dbd6f353c41040a326ba93bade7 +9a9ed5b444e873107d78bbc73e887e1e +18a95e6eb2cfcf8f7977eddc91027aee +6b8e16aebaafebf7d45e188d7d989610 +d84ed0bea2bd61ed6ba35ca61649aca5 +4a050b8d5ffb976bdd2047fd692e630e +4c7dc0cc9fbff31c66ce161e6939fbd0 +f8a0a8dbc9608a52d3711760fee7bc76 +ebfd847c182866722aaf35d61101cc2b +978f4cbdc9607ed274cacc653202c8cc +b55b24dcd9f4561b3083eec75bb9db74 +c3e647e14d4ef2095a9e7b66f9511dd0 +8e7f3b821d3faa26f71cb0a326cc2d64 +8ad580cb20c92bae0f8a990029282569 +6c6cf9854d2f09f5a031ae7a561f7f74 +c8a55f59a78ea95ffcd1ece2b86b65bd +90fb55fde8feb5e85f7b3bbf4415273e +938f950b79327000d9d7c7eaf4aa1410 +d1ce18aa37e44df4dd0482a847fc7516 +33535f7a485dce1c0060998a1ffc2713 +f1bd4bc17090876c224a39d3a3b8210f +4843eb58b881b3cdf276a8747a99293b +424c1851c1d81f63e37acd50a10408fc +0fa9939cb79bff15fdbc2e3906955ec4 +28a47f1d705b1e2cc0aa36fa6b6d7320 +07ceffb59547b847ec7b8313f78fd778 +3ceb83ab30882327bfbe9597974789f3 +cc92a817da93276179dadfaeb12d4e14 +1c2817a3c6af15e410d967acbc8934f3 +059600303056298ec20912e6c27333a0 +b610a3a0a92fd551c5820a99e9da8c9b +53604ddb59a33ed8a088ad0a1640895c +5759a3c0e0536b0dc11f3397f889df42 +44934b69f7f0157600fad773ab52fa99 +26b1925b1e02617445156789ec975815 +04168d403caeaeebf96a097a0203fc6a +105fabc0d04b240c31ba7e1526d25225 +bc24fcb8b991212b46baab11cce91453 +fe7c52eb6accf7345d719735bc3d9d8d +c53afaad00ca4d7d0a95d028188b4b51 +747303e6c831e4a17357499fa33b442c +96ac20aa11a6988358a6bc94a69b5af3 +98581b95f9268898af4e6f6c5100c0b0 +3c5f6e841b6a0d423c36a42868d06753 +bbb6b4753fd0af817abc7983713c0249 +c11287d0071006de205190295f145863 +3ade3176b7f13d5ac95521eb8f500bbc +3ac7adbe3b6706960e26372f4f0527b2 +79ac1cbebc9a7535876c8db1ef712bc4 +97b3f1c941bfd9068b2a9ddd19cb998b +913b1f39ce0a5552c79aef5e2453e00e +2973478ca153a6399f7d6e72f4934f21 +f5d9fd06f45b3c723bf18205c55b0c4c +391295acba607f509f746aaf127b6e1b +94983a68723cdc21f10ca094a8198a3b +45dda606c4c127eb46ebdcfb21060325 +577f536d2bfc5c3fad2b2f543ef21e80 +d3b6f8e0dc3514bccd9f4d381990f433 +ca464ad380f06d8dd1aca58f1cc97c6d +fbefb47371c1e1b24e03753465fac476 +ad5517983a254cf345fea5dba33df81a +2a2ead9e3a0633514fb10007206f2ef2 +79ac2ad7c76e6253610061cac37a8b7b +f282bc5494885d0306c8131e6912543b +8640e9e88cb5ca1a094446e76d18d2bc +998f066e2af0905b91997957740a3460 +bf55fad7135c2d71df476616d494093e +7d7132882f89d70ed265bb6121895b8a +3782ec21c3bbf452c6f0c391a4c5dcda +55ef0cbbe5e9293087c7452becfff0e5 +20e1b0e127bbebded3507478ab66f8d1 
+38a0b0709593bc0407786dea95798135 +55e78657debebe132574095a1ea43a77 +0ab2db239d56bc5c5740cdbfea52aa7b +9ea25e442447a6af1ac5663c104834b8 +6bebe1f969b85423e4061627e1a3994b +0001d64b01c64c09a485a565459f9e05 +20b13e15ac3340e0cf4eae052304148d +3d70a6e3a7c6614bb8db84792219582c +f3b0182183c5b84dd96d444e135e644d +f1313b2f25aa73996922cadd020149b9 +2738c3e11b879cc0406130348e627480 +fed3287e3272de856152a7def8ba838c +538e76c4430a398153fdd7c82ef31803 +9bb4a18501b696a52efb0db82089f5c8 +ac21329ab93d6d81e1533483c2c60d60 +44281c5f432765987cb49501675f985a +6c5a6a4bf1d7575d8db1aae8094a331c +70d24c6f680b3eb5c85a0cdb5785e339 +3ec08b7da42bb2b569c71bf748a721c3 +d08aad65153fd528db34391d71d64115 +5edf80575dfe43dbb0fc026f983c841e +3c5f13121ccddf92e531d816508b2241 +28b36dac963ccba5264aedd122b0159b +c3161deb01fafd41bafb3db0699912b0 +7ba263d0dbe1a52f0a959be41921b80b +0d0b0d836709dcae88bf2e610a94d9fa +d3043df14fad640a6dc0fd56551cdf59 +f5dca735fa476c7d0e7a86e3cfe3b4d3 +1e542c4365374d4cc851275fcce0ad6c +800dae5f12ad7e6bfe11b2ff37bb009c +d2da2cc41cd485ff1eccea77642b4592 +95ff8c22683da82d20d92f61dbf4a69e +8f941f56a0694ba3383bd193e28485d7 +f27f7f45a86118bbafb05dcb89783fe8 +035c3bd0a2d5181f4c081b6de9217645 +b0054eca183edc438b343b05bda767cf +32524d320ea5883adba3b1ed96af244b +8c80308c5c64e5173877124a9a7addad +7aa2e3210b41b6fe02a6fbd3f5967e21 +33cdce6b9c149d291af15626b260ff3c +2cd1f926de66573c662bc96f2a7b5db9 +d389de790c5a2aef2526bcfb5702d303 +8869adfe2927843dd895966bda080438 +2c9b46c27f459e66b3cd1d1b63f5c22a +7815c2897b7a9bbfad7b75a7513cf804 +ad12ea02569c628acde9f2e8b2c8b20d +81a9b417fb0383f44fe22b267064d882 +7523c3017935ac00715baf59ac806962 +37042c40d4a014fce5ef8413a8d6c772 +a479e2d24ccf57a506a3333025835ca8 +ec81c5976e1524d36386a6170e6b398b +15e89f749fea660804d3b65dde66c8b9 +b332d451b6854d92f2ac8f547dbee3bc +5eed58bf1e068f9a2c92cb02d8d6cf34 +c8040ffcaf01cb1cf6a9c1b76a7b3582 +217504288bc08c9d4476b4a1f1ebd961 +b9926add46314944584419f48b143c3e +55b56133c06d396ee340ab9ed3ba1851 +b1f9ee9c9b514535f336eefe911935f1 +dfea344541146c4bfd34916889b1066d +a9183f2bfa8d270ae67bd3fbbd91efb2 +93b76123c80e1146db181287cb2bed2b +706813f498a7a1fa2d734c80a189540f +6e95ad91bbdc2af785dfecf10a9bb0f7 +5ebdda69cfa38a8e925f08c8bf815f41 +1f8e1923926cc3cfe1c11546ddf8e334 +b208cfc446ce23325409f2b0b7e0c984 +fb21a21ac14f63e4ad20b9ef30e8b0f8 +e89e5528b6c7c7906b18791c92e42935 +6fada3fda1f26d5938ff408a6d26e2d2 +344eb4e55761112111f4a3045216b57a +5e3eb17a0eff9dcf99d9ed99ae2b32fd +aadb5868c3a2ee7d366c45789ccd3e99 +002917588257691f6e85a640fbd2ec23 +49039a45498807130eafbcfc3c637bea +d3fd38f4ede8173d1d18a12f96a7d0bb +b4e1df1ad173a94b9e39ebfdaed4a1d4 +82672fd9b1421f3cde020435388a42ed +f500f754291d122457ea7763b76f1c95 +160812821c2e9a52c2dbdab719702b77 +f9343f918912c3d1d0c566e527df540c +7ec9e1895137b3d3676e03efbdd56492 +c2fc75d6ab01037f0bd85bd8d5a7230d +12f5004583e396ad6ff09af7a6c83893 +f20ccab668aba6c45453e5f4c83d0ee7 +78a8fd7ad3759637682464ba6eae00d9 +e2ad56cf1378620fe7bbcee065e09c32 +973a9c2d1b1011867b225eceb4c0c01f +53a7285f53b81e4dff61aa4808c2f939 +8fc8c8a0c05a8e8ea519e0f4e3032698 +5bf9753679c8245e211d051f12257578 +f4603cacd51bb303e91559e56279ee76 +cb70ee30babb54dcadcfdeb9b90ee3c3 +3bc9026c409e1eda182448b5f85ec33c +725b6fb38a145d82dca75e891ba1369e +a88e7c807e58d8b1c5ca4d71026528e4 +acff928e8389c92c8fd28c1b596081b2 +ded878cbed3a4560a00b997ce4b167a3 +b585df08c9c7860a23b130630c8d8d4b +ee258a23b017967b1d63d48f640cc994 +c49a4f3b4663cfc62d49a51a244da1d0 +2d9d151e31f132a6451fad74b3101e04 +952e911d7313ba7b2858ea7ddbc93c65 +6a3159254b8ca73f168342d9b7700288 
+c92bb829e69a834511c377662c2277be +047e7ad03143a415db65aba041bf5455 +d2b333a83bea88782bc4c7c3b0ff6036 +299b88c1a04c81ff5845c9275b5a3eb1 +9eb37b720be5025e5d876327c47a0fb0 +38f32abc5b377c7643bcfdd806eacd22 +e85ada34790cbc65b8d374544166e61e +0000c03ea84ac40ae7ab917674046c39 +de0991112082093632d0aca39d8b75e1 +279501a30b5a40197a212a592af891a5 +f234a5a9841ee09ac389877b653560c8 +7ed5ac12301b7f7fa07c8eb8ae0eb12a +a4a6a643687b8273ab2e782ff836a341 +fd1fce0bcf6e95e32cf625186d6ee16d +cf213544a5e2187666e11ca4c641fd5d +177c56251d80028d3faf1685c4b249d7 +80f34a424def6f22a1669138ce181c27 +4b2ed09baf596658a655d1eea21e4cf6 +eb45ffc3a1d6c99d25c81b7a79031431 +c307b3eaa2420f9382c9b736c16a3f97 +feaa82bc9c8f12a1566df292c88d6a6f +caf738f8f97326d9fd0fbb925f02b9a3 +9bdddc00bc3c1ca55ba5bacaf0e98c9e +ab230e8af7b380969e2f8e2fab0cf8cc +362c1ab8a2e006ffbfd333730c8ebfb0 +db410e04b7c2d90b49d0c0c2ba9cf83d +b3675c73dd314633322718ba69f75dcf +56fef0a52c313c9e6ac262527958f2b6 +514f9e477cdc5dfa49ff02d6b96f858f +663e8e8beb01a843068f7f223ccd954d +613902f10addf7c5b8664f30985722bd +b6f268154f8520c1d9705d12baaec3c1 +9cc7cae510a1c5892ddb907d4e37e4d1 +7b34105b90f2400eaaac342def39bbd7 +cace146916a4ec0ea74d379d7ce75f9c +49813d5dd3c517e3550b899df452eddd +afa1163bcedd3c8cee5b32b93a30e72e +82833e5373078a76ae1ba44aaab75544 +1699357ac8fb8315f217de86d90257ac +b7ac98cd625367024ac8e915fa585447 +52f2da441768144e5feeffa226fcb7e6 +5cabf2ee7bfc7cb9c4e0a1cd230d6723 +eb61dbb5ed79388081dccb0eb034f5a7 +1a67600051be4edce3bb07a8853e58f7 +2bb6bf03d31889201eb06d84eb19e656 +002acf1c93fe828d7c095edf49051b3d +233cf66cb14e590e9fe5c70779512e79 +01d537db75539a9cda25595583c5b2ec +a9037faaf87205a4932ce998cb5c30f5 +37b70a87a4d045330cd92c19fff2ca85 +351fac152074c1da43b3c9bfe1a83d9a +6c57fa09b5d87e2ba217fe0d5255c716 +2d9f7f87dacac7af0688eff1ac7ae38d +04e496335a2a7507534e113f8d0bbfbb +772be65e33e1e4277493e5125db57599 +e1cc23c8c3e1e603da9e57679ed37431 +01df37098924cb2130252bac60812546 +8f71819835268d361a8da7d2ef048f38 +535fdb87c93e8a62a6563dc5a2f22748 +c048affe34cd4948baf3a44b15fe4a4a +c772ab5c6abc2800d40b48e4c9b6f738 +6b28d261367fa5becde05d13dafa000c +6dc68a1f9e492ebf5185e9056d9fcf91 +84da9afb3dab9d5dd23c5a5027f248a2 +ffc2dd7b62e2603535f23b4548d481c1 +fa958b99635b0b2a290ad8daff06bcde +de3128801afb559601f1054dc641884c +748696ad381eb55719be69059c93a961 +77627c4eed773c9f38e414d7c7e6d108 +99074da32a3af3a9b3e9395fc3e17d2f +471edf53579495bf9db8d1919b7bb81e +07fbe4fd47b0b89d342fa45cd814b391 +ebbe7ba0eb1aedd267a8da72fff978fc +653b978e9d6f0c6ca8823bb8f439a33e +f96c3b388620d418a48255e2ef93199b +a309a07189018bcb866e17ae7f287540 +0c7412da572175b63831f906ad23a5b0 +fc8e3a8a55cd7c9c1f4cb554be5b7c90 +f75fe1dfed1533b31b2cb23148e72935 +b2b2c9fbeb6dd04cb1556b5ba304bfd9 +8cf56d513c8b14d595093c1d634c627a +a7082e5b7e98a4c8eaa4b8b34d205dc4 +872e5a5a162b60efb6eeafed3d397d6f +03f931230390994acd1dc8e2d6bee046 +dfc7f7e57e73cde0cf44228a7973d5ee +cb42207708df42afe7ae992e289954da +de4764ab014c0b3af781f9a31a18e357 +8c39e1d3ed452995a6e639558a53c53d +a8b4da1427565af6c9cb69eeada3d2af +90f7a9fb5b4f42483331ff7be4e79c30 +39aa8f470aa9e40a4bd584ff27e11059 +5786ae4593ddafe32deb24ba86cf7148 +8029df09229a9bbe60f239fb541e1356 +7ab51d2f7330d20afd84718924b54ae3 +0ddd33fb231de67bab50072f1863f715 +9039e7db1f38dc5fa8be1ea1bde9e7d8 +a946d90c25efa3ae9ee6f0ce0a63b18b +936edfb681ddce84c9a2dc8762f354fb +c62a2aa2569247c47a676b68eb3edfb5 +790d9dcae503b2d8a975f2a0b23c9d14 +88ce4a1991431f9e56456fedf3c73b70 +4cfcc62c4486cb214ad417740dc2d975 +3613356d3445786666d6ae64c33f9f16 +a7d0ef1378bd982651abcfce1137e390 
+cbd20a4abebb543e1e580576c08b5966 +8febe12d6d0be7aff7049469d253baec +337540c5e05a890156e959bd2cd712d7 +b4941039d2cc5e636780c0fd1a079e3e +be27f11e3bfd2ff43b01c517a4fbe182 +b521220c5bbf81ccb437be6746abea16 +bc69bd5b7ea3adffc80d6b60d2063de6 +9f109f254504a294cf47b8d0a391af06 +16bbd48c7a9221f06f5d4567fa31ea0a +739626d342b24c89c1caf808886ef06f +73ab15af2fa97d25c2b99c0e46478e01 +b2fc677381c9ad2de5755fcc570465d3 +b32bd6ed78691e3ad69a827fbe45dee0 +2d91fa749a11a7c84d088d26aab953d8 +d404c1f812e48d3476f1fd4388807be0 +4ba46db0de49cc898d332a4deb58af7a +b10fc668e36baa61120a08e64a011382 +dbadb48b6a939948452d38095958395b +07652321b755c62b50e493929fc72717 +e41b8d9aee2bf6cb2ec9e9acbeb3dc4c +f8204920318021de13af1adfef295196 +adee1a1d1dea55315c091a7eca38d927 +a50c1746189fc19161e9c0c8b8adc9d5 +2f4aa242518a62bff1793274936fd5be +6da5c9555ff0cb4541a67b03bde01744 +5039195b3ddbd8c31dd192d7a8d28289 +f5d4951d834308ccf50ee2be552ab63b +1aef178db49aaf50dd1dbdc73a365e74 +cfb5273746ac52336d0d43cdc175cd8e +014795df06c3797a8ecbb5d48ddb0348 +9e7e10d31239f79e5a32ade073b37f2b +a65f82a39e1ae3c6cc37acd7a1024e5e +1d73c3c495dd06d9d5394d50a1c1e091 +de9d801d670b0b0184bccedff73f2a10 +03a386a157e1cc49b84be04b56e0c6b3 +a1f0fef51074d35d1a7bc1203ba735a4 +c48dcaab4aea9d1ea4c21bd3636106a4 +7b74976f9bb90ae710173e7de01c8d33 +2f8af399a84fa92ca8578e5064f66796 +177f1bba4d56fecf932e6961cd9f391c +34e67fbecaf54a9b86471d345f7da433 +9df6bed8293c1e9b1a2623d8f4b8a13a +b58ce75242f99d206dd298b54dead6bc +6d84c4a5a79b444530dae5abf0cffb36 +a4755139ce4e31708aee465b6f54377a +8c3f9fa37bdd50746477fbb14a29d75c +747d50aaea724ce0c96059a348e4104f +496de6606c96823942fa504a59b0cf4d +a758b8efb0a7fc01c1869e9db27dfc1d +6a96edcb644cb5062ae261d5089bb08f +302139ef9067da27db1651cf91515b5a +9885033840bcb99a1a23b56b46e493b6 +ef251195204cbdcb0bf7f118d062fbb4 +c40081ff4f15acd850f06fa566c4b35e +37374a73f87072a486234cf9008e7884 +2d668533a8ced6aff635d774bb08b78f +380484816b1d5224732738ebd622d89a +0ea0c667e738acfaf9e1d44f16dabd3c +34265c82bc4d004dbe25939e31ab4894 +6e04a8cf22bbb911713c85a0c1ea9629 +ed2dfd1f0658de06faa8483c5351cdd8 +bd649782f0b418ddef2dafc6574315c8 +78b3a4f8c267387503a43712173e02c3 +d41d8cd98f00b204e9800998ecf8427e +b0f36bdffe8976950573512a90eb023f +14b91652fae7ef1aa6176487f07d1a73 +9e3408f31529096ac2add195dc5049f1 +347d0b374e5315496c985999e46d15e8 +60ba203f1077db34901d07ab3fff62c1 +9e3afa242300c636db43bb72e37c113d +d100e0677cf6eea0c38711ca1bd9d5a5 +a36163ab557dd21a796e0c5a1ac84233 +8dddb67a599408d9ef79b4f3d91222f9 +60a9d112476835957dee06533089896f +7a8991cbbbd6bfca8c117afed33c6be5 +8c5bec2980ff30f31d78dcddb1829cec +8612ecd6db17727ea7634a78e73e7d85 +5fd607be465ee70298b0cd5fdcfdccfa +786a47a3a6f257dd40ee3c7dc115116f +dd54eeb3187eb05c54476fd26eefdeb8 +50873baa4446b5576b7c7fbe0ec9d177 +4dec9bdc4587199303285c424b5e458d +ba5745c5a6438d64fe46d8f5d06cb343 +49f3dc27df5d21bb5568108ac571f341 +ea0f718e6fc6ff4ef0060bf38c3cbb4e +952b57506714088ba9fd77f0679d254d +734d2df8e36b998b610f265660d64733 +0439fe2993e0c5be486d48d1cc7cb0a2 +a334c15643152423fce17442def1eab9 +7de3de4b60f80721ef2b0bc4ac5bad39 +82937e2782997e5e74dd83e4b4be5dde +2cc9144fae68c0bd4d3d09a1f0998373 +b682050a5ecb01c3d73f056c2eb8053f +87574f655d6a134b3bc227b0fba3baaa +c5d2243184726a441bc77522035cba5c +ea78d2c6d50d0fab46a26f372357d6d2 +0121784e1eeb9fe82e7dbc89a45272ae +e8ce15c954c54fdb3214db9fc3b8b286 +0db1d61b3198bef64a6687b72e2f5718 +ae594cdd590827a809b380681fefbfd0 +1b9a7c0d4f3ba57a0d909de37200506a +72d85aa9b6aa6ee61f78ca1c3fb5e5c9 +d1395f0f012155fcb2122fa4ec4de531 +e8ebde4780c8703e370c41fcfc3b9e50 
+f44a5f2a42ce905249b78afceb91db51 +29c3d5788721e777277284220ebbc7df +41bd8acee87af8a82f9126e5076af06f +2b669dde5178b11033d233895b23a4d4 +a76d7120bc50ea07e6231309fea43e0f +34844814ca91f3f65cb245ef2a8fbc40 +40e3130c948c9784b62bc1d02c0a1cdc +d74dea2fddfbda377fb235777d9d8961 +1ebc1faff2ac71215d61475627234d0d +30156e5158de1cb0e85f6c5d972f49ca +56ca37f017cd3f07068340c85327dc27 +c081b0efa23f0cebcde8dfbd0544b863 +3c823b4ec0316b3114179dfce48780d5 +45d86d938d2f94eb90cb45725e379675 +f140c4d1f1eab9a58b0af2d0ff9ccc64 +d8eec57c4f85e54d8eac260cc522a421 +780a5cf78b73e82ac0cae7a44098ecb3 +37fe24f799090ea85e1fc1d9f4fbab03 +dd7ceb5ebeac354ceb42c26bda62e2dc +cd4170cc12ff95f54d282117bd184ee3 +c17cc10ea9c58c58512cffed7b45613f +6b53369ec6ba717cfd8b84c0569bfffb +e684288bd269ffd30ad8c70178ff4775 +8c33ac417828532321ba476e2b007bf5 +eb28414c8b755e8f2363ce1656a35432 +267426d67f616001c540eba5ddda05a1 +7d73c7f0cadbe134a4ca5fb843f8f390 +77838cd1810e53c5eb362638676d09ef +74bf690e5d8af9aa2ab811d38942b0bb +1fc34620d35b23b36bc7ee6d9e13d638 +89c7e184cbbf02f3312e48b3b06ce185 +a37524a4f8debc448bbdf5d9d421c55f +42de5b0d195c06ebc10d0f1f33b98158 +0623de721ea88fd10ba693712bf3b6f0 +544ac40de50bf392ac8819f94fd1da9e +d117875d7184426eaf193b424ce3786a +21f6755cf86f1a9bafc5d88da94b5954 +636c29e30cf3d052653aa399a70cecd8 +040b08cb5e47a59ab1d10254c1191c28 +19cce3d4047359277afd85a8b57a3b68 +65bd35c3f0f31b72e41668d0e6c365d9 +7c130704fefb2fed2b0c4787ccc45430 +42a7f66906f1589db729693237e4f479 +c71b710d46da001abb97898344389410 +d4665ffcf313d9cdea501b42f275f67c +b051badfd2654eb699191b59db8a453a +b0a74b74fe7282f0a5c9700c0224c71a +f9efe2042e48a77752daccb4ffffde3f +5aba533165785082ac262e4136bd0107 +31bde200fc65a577a231b6cdd0bb36d4 +2caf939d1359f298bf3ec69b8c871e14 +3d7e123a009137030ccb780c3a47fd0b +dc6ebf6088832cc3633057a9b11d19f5 +84f46a47581c921f488355c8880c34c1 +b34c4887ea999e6cd50e156459a67b3d +af879f023bac30838f3d5610994376bb +69321795c904409f02718dade740b428 +f70660ca03c695d3141055b6aa670385 +55240f2bf86d91d7b169fbb35053de3e +c1a28899abffb96af5dbd2a6b66ae7b2 +c91da32681630fd902cfcad122eb1e39 +e53e3dfbef1d00c7df8b266010ac97ed +b20a2c37ee8a01614be89ea5b1fef0b9 +8aaff9b32c34452335a20c1a83375875 +ff9c109f2484d1b4f90563d7250776f8 +41d7708948765461d3a83fc94858344f +2d0e59e3064483f09cf3a4838c710631 +66a3c244d3088f4f184b17eb9fcbc365 +1094ade8ad14e2f69d11400b949ff8eb +a5883b0e76a2527d04db574ad66f61f4 +8c7facb1812ad04e766ecdfe45dea4ad +9fd89389ed34b86bbadef3944676acc0 +2cd0b9a75dce5064642b4718a28299a1 +817995463de4ec70e328e51f5ad89e4e +352009faed0a0f25fee09c0052170874 +b9c74be94fe78c1ea0108ddd7a083833 +448b2d84c9103a07c04d295537df8c9f +311f29ed305605798342ac0725408339 +f103189ee7904ef8d37360dc37216d45 +50d0e9c58ca38fcc3de0fcf5f4147143 +12932c88840d97488bea18de62b89952 +ab07ecb291edecc1eb61a89008ca0c6c +abf3e66d007a5d76cd57d61a5a35f5c7 +cae289debe0fc049fda77ced1df41cdc +12375d654321e7f542cd866c9994c742 +5d1a4de24638befdee4e6cd916649e4e +652452b9bdae23af39a8f12b13f2e8f6 +d3f016a31ed9c38266f62a0405208422 +65b1871076b794b638dc71eed3f9b68d +70c4d686bf75a30ba39711b50a8f0542 +8ea4680a6b083e6249221dc6de5e0287 +896d32e1dbb50ab8d2d9cedae8915538 +bafa0e5272d90980406592949470e86b +c6bc242d9f633ae64d92ae10911e29fc +ea8dd44624d6efdcc019cba1b76eb141 +3d8c0e38b7feff9d5fc338e29b8647b0 +bf1af2bd1ff8d5528a69a74f16f8eab0 +7527f35a567c5143576f8a77f808d37c +0591585395aae178657a55329413225b +5975492b5f55edef36cd86de48e1e87b +6e1e29417d5a085764b0653e852ef7e0 +5489111e82a92d165bf8f69e29e09a0b +a77a82ded2d87aab5699c40cfa91050a +d8cc8c3a89bd442032984a2ca1640d51 
+bf747a6d073acb0ebf92f1e1e9d8d58c +dcacc28b4b68fbce65f36bbb9034b707 +8e1e0cfd7f3a43dcb107669b4e43124e +8f7703eadb590e7c8fc3995997605dc8 +94e772bcfab819b13f67e4033b991079 +de1840f7bc1bf7c4a8950ceacff291b3 +b59d681ff44ba0eab7b56ac86686ec4c +69e08ceaaa6f7d206ab8c30a99eb4598 +560e89fa1ff6a94df122aa37d544b1c4 +253f6c052136ea37b9e13714ed4da64b +83323cc32727ea67862fdb4ae0423009 +c3af726ba87c5e8cd3ee0669254ff0cb +34557dbb523875080aac36481c5fbeea +8746e24cbce45c1563ade20bb260d5bc +9d981b8f5f22650be444aa723336daa6 +020cf33f91c73a005d0e2cd71eaf8caf +ed4db4bc773e0056db8f380139b6cacc +0e3afe4e2f3ba01fd1c9c6dae7ce875d +056dd0f9364fc9befacd573cd146fc6e +d1d64d8a05f5c9c86aa03a7ff17d0c77 +31d912bb0ac42539577d335a73b41e8a +0ff32774fdca85d7d0f581b1656e9fd7 +c58c1edaab980e208ab3545c1687c889 +482905bb08f96e856161e141a8c16592 +0d00818e769391b57d9688f7e9d40040 +2474cd7c824bb6b594c948ee0e4e3a3d +bb36a15ca754d024487caea183d6ba41 +f139bfaad0f3030ddfc469696e180511 +3f51afdcfa43e5a761bc7e102be231b7 +4b1e93b43aabd117d72d6552bbf8f090 +589f1278bd3dacd0aac2b84c8e428739 +ab5d90986c28d6fe3cef83e4c14c4bf9 +3d9009e1a994c3a106a58c80ae4b5813 +6b54cc96c20baa7f77bf4ce1c05d5d1b +d01dbcf3870c0539754ef023896b1baf +024d4e95274b9527c47fbabdf5f05fc8 +187aca2ad4337b99a8388467c360bfcb +1c8696e714e17b43618f41e1cfc4cfa5 +d67d33111692ce5b277fad12dd96f34e +a1f60f958a232683d1947f1c442fb68c +5d400880fa56bff9170d5bca21c3d614 +fd95b75e5d432dd5ceb029437ae3f91a +91ed5c558923dedccce892e2fc45de65 +fa65f6581bf27efee4312029844d1cce +f0a38793a7582d8731aff565c5d18830 +e6939707b93fe1e0347247c244c3a717 +0dd118fef0d3624bfb4f96dd1be3e378 +29dc99f93fde523d62bdb4c11e2fe517 +5aaf5c712464e9e0165b75b5926c25e0 +09ef5c1e01ce482cf6619fea0ef5de80 +c20e6cb2a69fa683845706d2cc59d72f +1590ac46c1a22cfe9114158af94f66c2 +c6e0635fe28c1d6a4cff6b64453a4db7 +b56628bbf80f8f194ff702c1e4893e8b +d564cd7e7091c2ee807bd339705a4891 +07122117cb90f83d15e5421a4d77f487 +47a14d8df48e72bdbfb011037764bbef +b3eb84dadd3f98ad16ed03fa3d9734d1 +c802ac5b22c36ce7c2043386855f9cc2 +663b580c5b7d557c6dbbf943acfb4d76 +c212460c41e2926c28b8c0d4c0b91403 +863fb4801a4efc5470f2f9ec50260d0b +ebc27b825f5a1c76326ac1f5b408a953 +001c5029b96d79c123683268303f87a0 +2db3bfe3515f929436b365d0f49ed442 +8b8f7f16150a6f8fd63e561d0b229a5a +9a7a2b3190d6fa74922fd77185848a53 +cf704e27baf34f8aab3b3c481858da3f +fbf635ad168555985e065a483f972efa +4a38ff967224df9c8c2286813cd91c47 +78992865ad204e9babd04c0a3e1ec7bf +1b2fc9f3e0bf6066d752ff4503e0bc7a +1363c6c7e66d6ac40db7004faec90b6d +741992e4a5388e2ba915a9942eeed0fa +1f02b455ceead3c5ca43589510123e3d +ca80180771cbf91bd68cc2ef2cd15b68 +732489483ea59262386c44d544dc890e +71df60e1a833725349a548359323987b +8d63c3cbdb2f16e3738698c365be3e96 +920c0a312bfdd8479b19d7d12ea48285 +4bc65b4e9aad864e591da277954b5bba +cec868bc929a3b1fe722ca503c4569d5 +ccb716ef90b4c3e9ec9e7e45bc3ab9d8 +39b622c38144c3b37885f2f18296d911 +7e1d367b23d8514be167db44fc2709e0 +e72f96c262638d8c5b33e3be91178164 +e5321dee9902e1c41c161b6b5d6df12a +e3e0f9623efb26c21face9b4a20ec921 +635cb2a3846d0a800adf98d9155a3d63 +a39bd73a2bc6329ab5c86189faf232b9 +0c72b6cc7e9a6e9d9a257c43316fa93e +bcad28749b4142a32fe2f15784debf28 +8a481c744b4c8dd8ad7d39f13dbe5d3b +52e91b707066b40de5c8749f21727225 +74d48587632fa6910a32ea3b0932c5f6 +4dcbacac7b10a87548b5f9e0b25f51f5 +d55a75183b131b7cd196a866204d5fe1 +98e1e2919020a365fac271a90e2310fb +68236690ff0eaab4742181124ef97090 +182f817d8a508cb35ed6190c21fcacec +565ea6d07b2ca6cd5079d9f9df680888 +8995d2b9a605d96b72e35ce6701d2f08 +2aad87ea604c5969433f24f6ef56d08a +c6f8c1dd25a9a2aa88aa026e6e4008f2 
+818341abcd5a98755156af1f8f210a9a +8b7cf23672000f42b79eac1345b9ed84 +87507b8400f0bda2baa027d2a853070a +852938cbf52e52cef4728198922e1658 +aca663d141ba6f9f5851a599df2a7116 +026c62c53d747def7d0ffbf7dd4e4361 +7542d84c1a84bf050701b71f58167d4b +62a35aef5ed5d2c9455125a02ddacf36 +b5fc744e0360ba56b10cef5ad39e515a +34d2c3c94dcef7c55b4a0888669b5b76 +1b382dfb2509980bd20c2aa64d194451 +6e5389a7039bd4e286278d51da6347e8 +fcda24bbdc36a1c64af566fa43721d4c +601c5d5ef6c2c282819a76eaca787603 +cfe34865decc0a2ae3786ada6d7fa00b +d5ffecf5aeb4ecab1d2be0c760a57fd5 +d0156faee4be6c86cab9a3d30c5f5cd7 +a4c94b9da78f462b07766ca52e5eeb4f +58f4b0348039c8cbfb7d6a7233b867b2 +43bbbae1b21a9ee746edbfd722514e8b +e0f44119ed868bc53b65b33d56252fdd +11807f60720fc47a0e1e8d5836281a01 +b94c05cef0e33a816f860fc694f51b4f +ab4b1b283902be31a59b5109ed428219 +a198060ca220ff692437766a1cc39acb +b47f2e0a4dc59b56f63e35bf166512e4 +0df61059be0d952670d38dbef06a3da9 +ecf6468467386ea42825de4932333a2e +6617d6c83d178e2345d23c38da96581e +aac4ac442e6156dc0e0d245f818ed124 +ee258f77887e05ae9c239749e070d244 +7543f4ffdc0b26e281458b091a20fa9a +64cd1d9549b5ec3be35af5f654a89e52 +2db1a5be1df6341c107beea762569302 +8a0224ff7232d0473eff6811bf36ce53 +138c3aaebd1f295ee0f5dde0f7a74a83 +327c0bbb4beafe847824c229bf8a8006 +e32ce049b87e513a5444154762016536 +1f3c1745938f1135ac818eadb3cc3bb7 +3bd0c86ce40c7e05475cc08070ed8db6 +52be51b9eafb1cc0692fe8252f6e0d77 +9c877e356037c07e7042f469484ce4ad +58e7ece11304621cc08a1c2fe4f47c07 +82c8b86f2113b7446dbcbd59c816604c +8a50c2d99903bbcf37bf0aa40d4f5553 +3e132a3cecbee105f560c59f90fed6eb +50e9c504756b06836fdf7155c852c103 +3470e971674971a5da195a608695d0c2 +92388c23351d8d9599d6429e0717e610 +e04b7bba0d829a98b6458b8e2d192f19 +f1f54f1669032ed00b9fa16f4ef581d5 +52e920220e49c298150f0fa97f27d3b1 +de1ef4fcb19c417ab4ec0f3d6aaf79b4 +547a5a048f02548736568920c310a27f +6420c42302bb12d87e49df9e7e38e618 +36aa0c5d676bb66f758f79e9ca461274 +cfe045c435d347d32ef3831230f65f27 +b972ed7abc9b9c3ac00f1b7b54072446 +3068b513c85e765d4c93ae4dac6a6d18 +7f13ad85c4073d7750d0c355d2739df2 +8c2bf0fe30e19496b2122a8cd2cf60ad +861e9be89cdf7679af12f57d465c1e01 +a384d9e365eac1e308649fb94c38ac16 +8e52eca5941b343debfbaf298e9362c7 +0f1bc6370d6776926a062a788ec9637a +a2966fce9331045194ef4989cc8e1eaf +a1102f28bd4817f2599dbdcf281142ef +2ea7790f27d76cde013405edeac0a174 +fc26077a381047acd132bcfa269e76c0 +87f482715515c7383f9aba248639dbdd +be45e929505b30d3733fdfcd24a61844 +b6e47d3d971d7eb2cfd05cbf18821a3b +9aa7d0e31a118d160da2417a4d024802 +2907ed5337ca379835cda1b07dbbe366 +4d9f30dc2d440519173b4f31d526e5bf +54930844ecf9c01bbde1499e36b69064 +d75648194b611b0eb18a33e520cbd051 +acd3a886ff2956f0cfd5af200661b8dc +8e67c6673fed1fc02e571bb44c665e4c +11c7d855341fb968aff03b467a6eb62a +c6694fb36b1dd98f82ac2e084d8d4a06 +cb7d66c30de0d59be680b537cc642218 +b67c5be4321aa3667c84018f56cc9dcb +e406f82c0de803ff524af33184b091e1 +e742d5d1172b1f26698e7669c0ba7bb1 +60c1fddd294871f6a6a7b482512046ac +cf32a769662bfabad675c8a3aefd7364 +43a4181e3b428ed3d704e6738528a45d +f46e2a9ab5040ec0247dd510e8ff2c1c +4cceed49d4001115d720e658f9f7ab6b +30285bb4778fd5a1d1c5ffd0f8eae605 +252fcdbc8ab50663e4a4a3ad77807c51 +1699bcaec8fe7d785eca3e38559a11cb +0af04d3d0f4921448b5887eb3c75c162 +2ff4e0d96f11851182cd59015512baa6 +a09376b6a28ab752034187d9a2df2a36 +07078e3eacbf637c9b7ad7acffa86d4f +04d8967bd40091f91c3eff19594dd273 +f9a9369613834db725215956f8b27e6f +1d47b7eecff41cd729f2a9b36f9f80a2 +6034dc41f471dab359fad4f7dacf4deb +d621e8df4ab2288bbe7ef7a9317639b5 +873d440c7bc8bd84403400fda40e9ca9 +a5809d5fc69a58a03d6f3fa4ddd0bc4b 
+c0eccb2d3ab64e40337f524671d2a795 +93589557ccb4dd0350ad25f9b9d64389 +4107c48c1601fc467c2a37b5962bb5b2 +a7f49b553cebb7b440345dc865719b84 +f66147e6928a07ae05c8eeabfb87f720 +c38ef681b90d26ba2ee69d488aa1cd05 +c9528bc437768d9c98ee16457c621bd5 +51c0ef7499c25030d98e92179b923140 +3abce79b432977c53efdd59df1a54ef7 +84e1b0e5c6a6f3e1b5c5b566226bbff7 +b905df2f05cc6801b73702693febc1f2 +462c5fce58fc8029334151c5c0f26df5 +90bd8ebe33fed952f4888aea39b59988 +41e9e289d014e31c9313a9e63b281f8e +59a52851617ecdcb9ee6eea5e7cf0122 +3d2077c7b417881336b06092e482c4cd +ca7f01139977b4fa3449382538aec9c2 +00bf0286aa25f50f06259bbc6ffebf34 +98083c971f45c42e27431e5e80bb5bf2 +ac10879916ab496f8a7e8350e6cc8cba +1c76f0d247a217ac3ceb9d92a2648d47 +1d6afc8faa2b2a3e5c08660dbec774e9 +0a503caae1972be84a665b02942e5501 +30c3bba4f297c5254d1f59f703504531 +500e9486fedebc72cf542fbb49acf39f +d0dc69ae27df4fc6e6651bccb1c78222 +872739f1dd44248b4d07770a8b41a23f +a41de2d2375dc2564945d2451fe5dc71 +6f333878a13dd64077a71070170f640d +ec86ed884867711f33f045382a3877f3 +cb7cc69cccb167b026a99910c69119cc +6e927c4601f2b0abc08833cf036e66b1 +55b61deae06b840bbcfbe44b25dcdbfb +a1b8d38926f586564038a732f057006d +85190dfb710f6fbb1a0332595708be2b +b5096c438699405a2660fc98664b2fec +e0451775e7af067fee770ae6b25a39a3 +b3e925b7675c453f36eed97562bdb6bf +f84d5f84c110272fbf227af599a115b3 +36561b99dfdaa276e266bb23dfa7ee76 +64f7214ba3e2cf01ff381a94c38835c3 +9d251de6fc19665e4df287807cd9fe0a +57f5cca51527d5026302812cfed848b3 +6bfb7f4a859edf58a0d23832aea87059 +f300003e20e3cc451fae3ad3dc528e7b +dbf8f7a91e89a79867a2792d5c63e1c8 +932be88d398e681fa89eb610b5a9455f +aed0f4318d121b8d3bbca31b8495049d +265ce5a72b45e807084dc4202efeb5f9 +e6ae281c8a75c7d385f7d167052776fc +0f89a242d3d9a085a16d4eac917ccce4 +02d36429aee3a5b8639d58d152db0092 +7c8c4047bdf525a36d2893738a02a81a +fc7dd9fcd1e674f611e950f023dcc288 +72db6f9a499be5c5c3cc029e19b8c434 +9f6036e7479d38c9072c67931d7ca596 +2ca25f56810bb31234e173e220bfd225 +3ad70a3b560539b55f06539df3a4ed4e +0e817af00d21b8819bd24d14d4dd2a90 +c3e099471cedac9bf6f9aeddfda5d5d6 +ef0f97dd5cfcdd35fb23e09f34c20332 +176661670fe9d85f4d750aa96e56474b +c54f354e5a0f8a2797a397f79fd4d5d4 +55d86b4d373d84018e6e0a2b8b87719c +b275a8f2c1e221666fc2cddf7b2e168e +7436c52c89c5316e81957040653e2d4a +ee4a782a680f06839ae6d6d848f96768 +2f880d97f2b422796fb13339e7a7c890 +434b6549bb662dd288d924f6aff30800 +88b49c268bccc862cf215da9f20e1bde +1bd82c1c915fb3ab8993b14c32690658 +b1af4b20462144985b0e3c65b999a032 +2f403862fdb25c9aa3d33c9cc2e94779 +635794b97bb368531d6b26f6c7788e6c +80444bd3b4cde529eae62c15aa3473b9 +b913dbe40e0ffac06cba3cbb0b2848c4 +6ee1ab3c34488315315760db88fc792a +bd821d5b4f26f07f8e553587ab64ebb6 +dcd578b2b3ade94946583564d9d87684 +57c798dc00d0aaa1837e8804284cd7d1 +596b0debdd77a442a66f723e71103255 +f720d93f0d2e50236afec04b43571172 +c23b0c89fb51fb869b83dabaff9a9b02 +7dfb15a3546f6a0712e131e907139671 +605662e07c2c55a01a8e35757da965d6 +e0d4ddea3cf02563cd4277adb00ea493 +45b03633bd0884c19c7fb87cc5b701e4 +0bf4f08c0bea39a406fd1f6520b1ea3f +1248037200184a0e3e9c2c360a1ddec7 +7002c1c6b29072f86bab0567b608732d +ffbfd4c0949de36925b82592476b262f +649d39b096eac829ec4499dd33b08629 +ca4bd5733d1e7ef7708ca1b2e24b13af +dd5e1cade1f4bcc3684f0bc3f996f084 +31ca0a6f7bcc14340642620b96d90e53 +81df7e6fe8c682abfaf546ca2293110a +53a5eed0312f7625f0442d9852aff3fd +03238cb9e7773f103bcba5fdde92bf56 +cf9352aa2dc2541b4c15c6624aa3d37a +c0793d4854d9479e57c08796672ade23 +748c381528d7e8b5a48a34cc77e9babe +ab88d33e4a763461be154cfb4221b5e8 +4601d8fc56f8bea38eadafe024885cbb +081cea4187ddfb13d0741a2dd1a7e3df 
+1edc807dd7f2bf19adf2809afc161c16 +f761e815004a3faa1553463341e84c61 +57ac6fd87cb313df8918360e3f0fa308 +cef231fd0462ca643bdd4553c1abc673 +267aa35bf89e7a9b780608fae2d6e67a +3c8d411f27811610a4c09d6f5a87216c +f0b7600b2dae756c68f1e80b3be02972 +db92a8b4cc0b2264ef5d06238c6bedad +f4b1b5577707ee1e2619a8f7093bdf6a +62d098fb60e1dee42d1b65fda1728916 +c09c298d28c278ac33094f3e243084b7 +4b03d7e5cf22f9def09de8583d071950 +e731104a6563626453825555d5ebcfeb +98890dbe6bed546c00132c635d713dba +caaefb004517ffe15dee18c7a19513ef +3c57ba4bd744f4d69cc3e1253c40b74e +bac1bb04cf77fda2670efcf7a135b915 +25b4a836c84cd9e1000c953b061664c9 +fe7afb330ff124f4708338ac6a40f1c7 +9e3300cac7f7ad0b52a2fd758c27bd3a +acafbdd23923f77ccf5c0bb121faba16 +d86436487ba7ad3b87409847eb5ccac6 +c7bc9bf1b33c8556cded256396394b5c +c4c949aff0bbb0deb70ea5593a9ad271 +bde49ef3968ab90d3b38f055e0ec3aba +0cbd6ea60a5842196a9da45d943b5bc4 +84f538e78159d91ff24b6ae361ebe2b3 +8001d0c0a9e2bff543906aecb1deb392 +a9b609e403411252a653fd1088dca848 +6421d8cffb03950f39b51ae5827208d3 +afc91cf84c43f6a4f00573dedd25444d +d94bfc328a0cd73fb8c8872b598e4feb +698b91793c612a682bae6099530ce1f3 +4adab43bc374fe9c3a18866803745d97 +296538a55e5eeb47210cb5544e552f68 +47ebe14e44743e9fa7e30a1c14b4f3e5 +45ace18743fe4a8a4d89a3af44540da4 +892416cc9b12d2417f3978159fc10cf7 +5848418206a27e847c11512c12d31a2a +518cfb6863ed49ebb500f84c840e15d0 +6fd14b5ec039806f24da698085c85701 +1606137f3dccd7d3632ee8baa3c8fd4f +991e9c79935882eefced5b128dc509e4 +f37a53539945a187e89b05947f5a7ab6 +8c26023eb3e1b15075108749a474beda +c47babbb490dcdc33aee6647e06c15fa +086f1708e9699b6ecfa1ceecd22c1326 +d4d64bfc7bb6f24fd881b2e72fd563a2 +2f04492d2fc7d86a438f47d7e7619406 +f5f28c6080c42b6358de1243b89a1983 +e5bffdc84fa9edfcf370fcf780e0f29b +c3ea236482b8bb3e573e8692edd21630 +f2e06d1bdec7a15c93dabab3628d7e9c +264e45652f0594f0a94ec719f72ae68e +b23eaaf7c03eb28914eb4a82f91975d0 +6eaaf2fbc7fd6b6cf101fe64114e9dad +a76134468f94e8fcf479c1548dc5ebe0 +5aa8ef32e753691e7e19bf553d7b7cbe +38cec0ed06eb15c3b7a6773dde1e259d +c276d9ef63a9530a4740154761a12c89 +40092b2dc833a70bc2af5f599c270d63 +8b959a72479db02245b29034ab7c8dc8 +ad655f37d8edf2de14604473dab05105 +56a4dd2257971d9a6ca7c8689eabef5f +0fe0248cdea34f7485fe2581e88517cb +80c24e7794e910d2d5442b6679fd0181 +c21b6a44848512fcaa5517a02cd8ca99 +6d3c5e1f2704d72d073121632ef7d081 +f38f82294a0f801ad02e887e01948203 +70a7f3b13fd528ad34f3ad1befd7edf9 +342d746b877e11a94f0ae73262d2ae0c +ebafcc5c1ac767b53c399f61d44de10a +7487b3bbe50af6415570acccd90e2e33 +bd8d133188f6bfae72317ae810f1b06e +3754429213111c48eb258677928fb088 +d41d8cd98f00b204e9800998ecf8427e +d2281b347615e93d94a6a9ac77acbaed +cbceafd0b606a08a02e819c2f0073383 +d56c37613e73f314a6b2a002f4973b69 +57e39447e9d7ef7caa6c767469eb9468 +b23e97cc84a3120cfb447e42e1ca5e6f +c734125d339cad2b934aae25c5bf82d0 +2086afc54d6951629571457f759c61e1 +109fc7a174e5a615270d9bc103fa0df4 +0e14026302ec62189d2baa78dc6bfa2e +b079e624c3706ee3ba6e4693dce553ea +9bc97baba9bd017debcec5190ff0cf4c +d98c40b21b9aca9ee89dbc1693bd9c64 +812b47fef78f2b2caf831f8ebfcc7dcb +e244dabd2532cb90777fbf676df3739a +3cc8bf56cd7c5c3a9f2915ddeccb7d2e +b0f04f42c3302d4008d47295c7997642 +40462d7ad1ac6bb3c5594e3dc5ddf414 +00c06e0dddfa64432b5dd9c13e360e1e +7cc8bdfd9c81651c83582e29e44a0450 +a1af1e18132591113c2c4ce7c86b0884 +c1e0e70763d15c8d74c5d5b83759c68c +a17b5832ebb5f328ea7c25d9303539d4 +484201442c0081ac37a50e8ef4218b63 +07eb49e409860d21ee81c75f7f532cba +560f5fe64f143c2be4fdb7e33922bd58 +c997f75371525aba0c6c19bdf22c83dc +2e701d65fa0bb139a7a0a869bbd3c123 +005a3044ad4e526242f52a81e2735bfa 
+af9af5f45f7f38b7a179904a3ef4f06d +f3b33f3926120d22f7735d4510d49804 +6ce87c7c6812509b58b681cddd8a6e23 +5ead0b06978517d4ab7173b07fcda95f +69d4401391dee609360bd35fb5cf6f91 +198213f5e82bb42bfa0395371f6e8745 +738de2944226bcb8123d2ad3670e7d5c +63f8596254f72b59b3fd8822dbd75127 +6051dd00b2123b9b942f3d86cf9b4c8e +36492754103f5cea78e3701a3f60b23c +e7b4d6cc857f4e733c79df1d14317703 +4ce38f8e2af2cfb5bb8024a1c4933807 +29ef5cdd336af08f2bea81dbac36ac6b +d4ce0baf00864c3c2cf6d9224d581605 +49c1dce8208b45c3db41cf85bb6bc648 +297ed79cf98dc25874f7dffdaef06c85 +26ef0e912dedbbd38b4e1d410bce390d +a8ea1a19e7e9db40ac2eabaef38b6124 +355a1313734cf35dc8eac6ff12f9a9b4 +56cfe89f595b05b0706a30a92b0b5951 +3648194b25e6dbc55088dc5243c91682 +a1e813b395d5d6aecc733a45c72ace51 +8b0abfd89be736c99e83fdcaf3b2b09c +025c754306ce1344b58ebb0540a21fa0 +3645833a41fd1d08532ddd8173a0e82c +2e6b61717a650d2c36c1f463dfb93810 +99d1fc8c23b8074cc3df4153d88f4beb +b4afd3a3405657afe81081247f8c1727 +4005dcc5046efdf885bb6a1bdbaf7eba +7386a96794ca87d6076263d5abc6c44c +856855224047ef910bd48cb057894397 +8b95863ce7bb88882882e9f85747466e +da96c146024ef821b7d94dfe5ac93a81 +611c4612d44b8dd04514d226075114bf +e685170c3f2f3e886173c7d2c1cd4fd8 +1939b5dd3c305d9f5463c611cce9f046 +cd916a0216ca5f0f089e7034d5ec5c56 +f8abe8ed661c4906720e20a11b4e45f1 +d33c57b5173df8e5efc3756302e1e488 +6c82cb956fd8859987e2f69d8ef49900 +b8641309ae05c484bc9ad10a359b16dc +a9e3674e8d5fe7e0fc0ea6087e84ed41 +e426ca00a1f96b734c4f5f13b6107226 +e6e25a5913a60b8941502a6279f61458 +5294625f11eb0ae830dd4b1735c473f6 +c5d33539e73781de024d7de72d99370b +a083df72c880525313a699d7e1a71b3f +025956230158584d81f3008100420ea5 +50025661b52a10b1aac17536d4f993de +9be08454e12cfcfa801c0b94c987b122 +42eb833c611ba8796adede13a3d87ccd +4643d0059ef74b744043750247eeae56 +e04f3d0cc5c88841a7459efb25398e64 +903177ea93f1848b49ed69aee3fa636a +419d4384a73055f405a6f63964e3998d +52ab90856ec048ec535e8e680e5289dc +fc9cf5c1ac3be7880e3bfe4557ef0693 +9d3f6766fa89e53714927a85e053c3e7 +04553cb82ee5fa0fdaef025d2771c660 +5fce2938e71f75e3ca2349674fff6cec +079718201f431e2ddfb68579dde110f9 +013e4d43e498c1a0c365583c1ab95298 +04137ab2d64bf156c195745ef4e4ae6d +d41d8cd98f00b204e9800998ecf8427e +6e3367d63c82a0b79c687bd04ce62064 +5db9af4ed349ae3750f812bd0860f50a +b6ee0e6beb34ef63e89df88dc8226157 +3aa80fda38ed4166db55799d0f8a3119 +39b6fbc68a5241a0faa0868dd0f7c1a4 +81e898de552c8dbdf40260ebdeb71ca1 +dc3e4a28e05707340f9ee4024687485b +46a2fda480a60406cd1359eb97ae7231 +029ad55aafa2887108797fe1f1edcd34 +9478fce490174b8c05ba596ba69e6c5b +f8172a52f77216d9088d1ee5776fd81f +45fc9a361dbe58feadfc77a26448fb54 +7d50556e3d86b75122a9d843b1586fc3 +472a821ae83a28adfc108755b0820218 +48c660967752078f394c21dc46c001e4 +8379b1bac8a8cb274511416686a7599c +d41d8cd98f00b204e9800998ecf8427e +980c3569106188b41be79f12df8b4a8c +7748695ced739f2df90ccb1c4280a5d8 +6974f31d8593850c985146de421bf02f +c06f333302f9e3b4ecffdf5b8d7684cc +c85822fc00ddfa25ab2ba0fad2037b60 +07c3f86812711deb7cdc055ff97d48f2 +b2cbceacadec87a93f8678d61c8b28a9 +b5a2cea6268fbfd5374a35b397161adb +03b6aae2ef9652f8aa5aabe3db32683a +e36e18470b806f644d0514c0b2e9dc85 +94969ffb0ee54eefe98b6637ace1298c +e34cef426bbc279135703d4fa4ab301b +1e31bd8e473b39b3fbeaf9e73f05efd6 +e0e7cdd1984d7293eef8d83020215869 +50776f0067f7c349f88df6708b2bf401 +326d9a553de65644ba5c31b1d4725b69 +1d77beb17fd0ae7c011a20b50a253f9e +a589ed9ce6ff568dbb891974044fffe2 +a222a1e9d06a8223f9a0a6a34abe5163 +b46d401368cda7e5cf6dc08e7f9a2b19 +8dab09b74e3cc80bb5e18fd1d8d7228b +80339bf36aca19cce839bd6e6f09890c +e076909eda0a758b25749f28ba0afa89 
+545d7670a3882c7fd014fdd7899c9e96 +99ab348fc6f415e581217249c77073a4 +3fac64c281d063ed68f1162201193326 +c81dcbef02a3f7abd94df71f85acf7ac +72d364837f0604a07a4fe68083d95fde +7743c9b29c4b6b8f63796a9c276e5ca7 +2be055b1f90ebd206937be6568601ea6 +268e6c59b9e3509c69e574ae4d9ce23b +da80ea9f108c2368fa07dd29a8e1f620 +42530f39414d92bd4ad8327989b05b40 +3ccb371bbcef4afb05d3bf8902f13f6c +5581c92902fbd1bc1ec7a3474b950aa2 +640534fac72ca0c8f99cdcb215ddda95 +8039afc92430bb764872ed77067e60a7 +b28e8a3b93afd1e826567d1cccc1d373 +8b3416e73fbe405d2f3ba620abf5ddd1 +551e7d2f3cae4c489502171cfccf866a +debabc89ce1cb5459bcf13e9195141a9 +69f09d1344dcd4ca4c29c3732c8e0eb0 +6c314aaf77d840ec246c1badf9534b51 +bebdf116f95b13be22c85d7db490fe43 +ba05cbc85137587899a0bf0cf8727092 +6c50745bf30c58a5f791917dddee3741 +c8f16126bbab90d972fdabdf1f10e093 +b215de51777fa22a3b92c157d2c0feb7 +4ba8be63708d7f0303ebaaa86b313314 +53ffc498d04de26b7c03f820a773dd89 +2fa5760377991b07cefd6a3227fe6cc3 +723f77c8881f3e9b94841d5a3f235b9e +764546a1e466434b85ff6f9ddf82f3f0 +f2939e9b42670b364c18276f8c730b1c +749b23dc9d5179682c5705cbf2185a92 +1f6d96777d404c5ee697718bed3a69a3 +c110f5b1c4d9a1358d14f61db5de755f +3e5461c457ce5ee96e998d5d3a21e5e4 +6075caeb03877627dba92567fbdf9f99 +5321286cafa7d1be1cd60fca27596bc5 +097c1a6a8244c2637ae935129c56c331 +63e9a9f014ca00c851b08165ba21075f +eb414a5fc2bcb3fa0148eb57dbd2d565 +a00cf8e49c89081afa055dc358c88db3 +1bc6582cd823fb85bdbe8227591a20c1 +58ff1c022c3a25ffed98969a4e43743b +99704fd953b212c231e39b2048bcfbdc +63c5878a1739f01235b59f5e14cc7738 +29f9eb1e50d4dbdfb57dbd705168177e +37eeb111b958b228e24a4fa4343eca1e +6a2ac4927992c9e271f430a23a0cbecf +3cae007120af31d6ee54aca35c292d24 +beae552b2b24b1f6b7bc36fb4139aaa4 +3cb66a13e284bb7b32c507c4d89dded1 +65572fd4817cd03487bf57b8232b95a9 +140147961182264ec78d1829ae4145b7 +8ac992916414353588945ac29430852b +cb820a43c759b54b793b749cc803c902 +668170561b975223f54b896e968b3372 +660d0a05d65557b5ae0db2b736319bc0 +137824b1b95d70b28daf4c49db550341 +713a1df4752c6f8527bb40d0dee50579 +c234f4a53b689ad3c903df0bd5209b77 +f42d0805b982e8d2c237e5b86adc1fa2 +20dfb28f8fd055946f6b07246e4d9d10 +50e7ab222240c7b2b13bee9fb11597db +3e192267cdab8a6f3613379af26af299 +d03473ff1b45193d5e3c93baba54c6b6 +9ef8cceb20c391157192c45c1b9a5a4f +c468c0e37d6ef7ae635e69a5b7ff90cd +2b50c7fe6f9e3fefba1963b46a969c14 +98b5419f21c2c2dd50fdbd1a28332048 +8908352604a59c1a1fa7d50e51f61c7f +a66be97dbb29f0163469cdaa1ab485f9 +d41d8cd98f00b204e9800998ecf8427e +81e11730913986c5b9662863f4f00e64 +6ab2d6cc0843b8f6306c0f17ac64e668 +fb39b688e851cb332ff8f78189f20c4c +a42441b26c856385de051712ba5f4462 +b06740a5b9bec662c7a7949be2cd84ea +bcfa8175692e1810ce99bbff7d71a7b5 +5172729ac07bb808936c04998c9cf6a4 +d95a7226eec43aa7f2c25bb68c5a213f +3a1babace2667734065d55b762b530a7 +0467dc9574328abda2ed0db46f4ebe5c +9a88302eb2f41d59f6b47f151a5cf6f6 +f44ed8518ab85d7186db62342c14f56e +b8f5fe6d0ec1413960d5fc05df1424fd +fe1fbb81b1befe027ca92740fee619be +891344e9f9a829b766d1be5b5f1ee822 +6f989bf96cb8208fd0dca9ca964eb60f +914fc8f5583c6731e6f914faeb8ad042 +81fd9bb451911c897eda75d3c25354e6 +d327ea7eae162b2f125af0e35bc59272 +f2147a222921ed909b97c81939bc592f +fff4e067273fda1bb0c49fced92a55a8 +776d0bda22e871060965a4d2e97d853a +74be22f2f7ba204c0e4b8a98a22215b5 +85b612600ff8a9ece6674ae833197867 +5bd75a9d659812e10c14e04a35a89b58 +e28a221de79ab1336b5b440e80cc7bde +db0b82531f80de232c391fb647497304 +4348c09b8cfebf76ecdec92ed5a34160 +0664cd43711668ae8c5b1292cd42248b +9124ca16d971d6ba17255800faa10ea5 +57b10fc7bbf375372a11a2eb4032d5fa +5b7bd2e3b684f0f5a24292d8ddf64ec5 
+2cc3922c868376dfd268fe72a28f8557 +8c64bfed63856284ca3f31e9f9d9eb36 +796fee97ddfd7c3d49318dde490d3979 +d184fe437f4980001acae1effdfb6a3f +a90395e84f4f4a9680b176483bf7bcb2 +45f5a3f3764bdc5fe6142c165025dd0f +23e8bfeb52dfe8e708a82c89da487643 +ab9f20de51855d98cd301f41c50607d6 +50d6e0cd4429b8afb1511fc79dcf4f10 +85ed00d2e23114a1ece84150daaed7eb +2a2badb39688c204984b349aaac2628d +21278305cc0aef07da9d69a13892e7d7 +a2493e0f5c6175d71edd27b97d34c854 +fff022590d1982a9153757e80a0f8be3 +ce02fff1b9a2a7a8981fa67fec64d305 +75d85cdd648553a489cce37ad3193aa0 +78d61927a8991e4667f864eb31ead9e9 +2c980f1f0a9e959e186a8f1f594d2a84 +b92a20cea5d978aa22280bcab473470d +989ce492cadc63e356a5c5381c530e55 +1566e5ead0c94c2bfcc69fd2e3955603 +e1458d42ab475780485341598854aecd +b7a2095bf521dd5e855869543b490b71 +5a6111b967ec2c8bbb3da0a1da6e022d +f5f69c317ebe9554591d05149d726502 +38c185d894ea1d3b68030a26bae4f4dc +953b6d0c32bedff55f8e4f544a13bc28 +8b5b282994c87f81aabad17cb992b547 +c53561ca4c45be89679203b2ec12e9d9 +8d2b55b21a0c44a1cb4511bb5197db2d +a11e037ccb85882c6e57722da90479d8 +d3d22b265199bd11b158c679bb2d3e84 +26ff4c4041f9b5ae5ca67bfab24c7fd5 +8cfe82c53535aae647879557c6c679a6 +ec4e78462e1ae2903fe38a15c10270c0 +d3dcfab27b98beaa39a2b96d3b126210 +92f689c6f3face988b5ad117ed251d66 +c50926bea38293edc1ad61709f1a0131 +2a7e3fcc4b8eeff5a31eadcd4f848b27 +0b11bad6b9cd761709c69db0bc77c145 +3556e23db83a1f009d97e93b56f7ce4b +9183564bf4f9f4c2c7290e3eb7c42b80 +0d22201c0d472da358d33005db53913a +0c47296420dcfda90d89beb5a8ddc6ff +4159d0702c1d8bbc5765ba9c62e50293 +3a3033a7ff964b8fa8b95959319f92cf +962a46af55788879d64a9595bebf1607 +547755017f7fe735909d9c74c1af919a +4118af9ea2f30646ac3152999ffe98d4 +a0531131bb50c64d812bf2d97b2272cb +6624f9c07c94f0ac4e376be447f1c543 +361cd0957f51f6218d3641dc5d5fd6e2 +0d3bb2a36908db427b85c78a278ffaef +786bbf8a5911c459968ab41cc1271e1a +561321d6b9d0a93243c36d3ef7563886 +26a87a07b8279d720cacf61458252f0d +5613b79db7ba1d15a66d1546ee1e5596 +55da213f93a6956361228280f221272e +b8342810ae9c18dfd97880a72204b566 +ebf7be7373b467840552e061f110fbd4 +509c2d6c82b12138c7a40b691fe053e1 +f1a9ed1e89d1b4ada55545534fa6accd +5ed62a9d020ad3366f12b9188ddb21fd +2968fcd8385c1e8cd82e3918f3593961 +eb80bb702a7a194efb3b25d056b29738 +4def2af4a3757b4dd1be1ffdd00448ac +9de787ffa47f7b183443fc4510e21976 +7726641ffd265ac00b314b4625fc41c1 +94b83b3ab151672d70c4b6b44c1c7dd0 +0d65214c5af62c27ae22acc67330845c +15d9e07e8096faeff363d3dbb2cb9b3f +7f9e04f9258bbea1b8986356c0d2ff1f +3726802e552657e135e81b58a2685450 +d8022163dbc62d06e02beecfb0b998c2 +1242bcc5ee56676675db3114fcbff134 +bc6e8d59948ee99be2862dd20bc3a91a +04ea6709a80def4dbc5a18289262cceb +c01687ab6ecf8f4f8d177a644aa5cf57 +60007dc160364f50d9f242f5266bdedf +47e047cdb8fa877a546b5c43bc3d9f23 +aae63acc48965df7535995caff07bc04 +f3cd8e48ca7baf7ba1a442c8781ff621 +f2e68488447272a6f60e76b901281feb +877d1069497428d577ce99173fee3f76 +3de5b4ba96a58541cc81566f57e0f9c9 +2b142f22f9a312cc6aa915d56ceefd9c +370cfdbe472ee4c55c9d2e7ff6eef39f +8eca1fff9cc586a3359a1c3ecc8cde51 +90212308290412e6d3f2df2a6f747c12 +23a9a7ed489788d0437141499c68d6e0 +62b8ba3efe63378e5f76efed45d7f14e +a7d9b2c1b379a3d56405653d840a1978 +4535f99a4da65bf31476549f751c4512 +b2c238080ba0e4a78f2fd0968a39d4ac +1263721c5ce8311607c12b5d5004a212 +ca6954c45b22f3191b7f120f67196450 +c0f6780d1f7b996dcc7f1a58551fb047 +df4d54e054ea18aeda827ca75401b01e +795b4087d42878a3021511b0b95ce9a7 +a356227e71f724c55e234d2386b2d115 +04bdcdee9a726f321ef595e56470c2d6 +b5a38f4727fab17f24b3bfb7ecaceb2e +2d03c8a2780a328cea04af4cadbcf2e0 +f5287d6547dad7566f59f8642d569a32 
+f0f072ced91e0fd0c190a8384a1558f9 +647fa05b654564899d5b428da1884348 +39fe3a11457af57516d9d1d71554a3e3 +c8b27f8e20e6ed53f205839bd0d903aa +474f9077cd402dc522b779ab4decaccb +02a728dc0ea9535679d742493952796c +5e797f47962d11a9f0479182d1efb6c9 +c2b21386ec6b21e53d4d00e5501530cd +7a968caf4c26a9dee69d6259df32ff0a +cb04b7fe2ee5537ab4fd27d045ed8a9a +5640edd012e4bf2a03a467aa444de5d6 +202d78105cf5bbd3dd6177267740e7c8 +813f7a94c173bb533a87ecb6aff8973e +d948b97ebb3c1982487025e4bde464c5 +fa15c93f4de96973a685d4f2d5a21fd5 +b1a841de097ac66f3cef535564d0ff3c +5d5d92dd8564c0f3f4fe3e18845d25a0 +6c0a8c45d06f9ca7dea557274c89cf93 +2927561c2d9a60afcc5d29376fed91d5 +b43e1adf375dcc33b00a241c38476426 +cb09afec96b494a9ab947b7e04f1c9d1 +fdd351d6b13eeaaa06dd4616023557ee +2ea6c283edf265f9d776322c22f05de8 +41c66323956780942da3c6cfee7b8ec8 +d22f8c7b9dbfc8a5125e5e0ac5bf9dfc +0de54288da48657be506aeaa390508e9 +a67ed8e661e0d972670a9655ef182f8d +7d2f72a008dbd616d8f927e628bdf964 +1e2e02a94552b7c21e872f27624adb68 +f9129cde5d7e092a572be5af6a9a8c42 +7d129391ad87c13049397cd9139365ec +42e8fdd6408f95039b1e77699d21f5ee +c2dc63615dcfbfaa65b6945501fe5a72 +31ba68f5ba4ebba3e246bd0dc7f60d04 +4ed59773e8dfe96c0a09c4ab69901360 +0b2f44c830363823630bc672efc38edd +e29bc22e74ef6e9ba6564ef2833aab74 +0badef9494e372c0e2c0692a3c9e45e4 +168fcf3d452694be0bcc323d091b84ad +70872c0445f89bec5b6c24f9bff461fe +2da8f901962bc311ec59c2ac43d7e812 +b483a2f142ff4adde0b3bac95fc49f8f +ca51b206101ec09172c475ca80c4ee3a +8807172964ac4cbb0cadb3b276dd3538 +8f7a6564ef23b4bd8348613d5ce78386 +9df6e3da2bb1778fdb499a9a18633451 +84c740978129da8f5279f52716b92e9b +ffd5f1c91f82b370328f798a269c088d +37f72f06a72d472836b73508d8c56d38 +c5073404bac0d47ecacb95729b1deaca +3fc398e5320bde1fb6555d70f476db94 +dd16b5e167b7decf8a5e70374d976d96 +adcb793e6fbb4ca279b84d40a11697b3 +be947cc4b4643b63feab760b418487fb +2acf42be6232cbe394ff7680d1c1fef7 +41f4a2cc4407f13368d3ff95f770e1de +ac4835f918abcab3df1da4ab020bd0d3 +bccd2226c99abc2f4d018e2e4c460198 +c8f2c3addd4b58f175f209968a96c19c +31fbf3555e3ece40abbe5b8e6330be95 +23b4e6b235f9124f1a531402737aef91 +ad915b96e4dc63aed31b14037e8cb2fc +b2acd592523e71b6fce7d15ea4efb068 +1acdaf7e478a59dabd25ee807a8940f7 +3283a3c6ad1f2abcaebcbb685d4a31b3 +d286a1f0e3e1b6b27c48d295410525f7 +2540e5054b29935b45dad2ec0c5d9ee2 +b4399b0f57ae340b9155c7d085acbd8b +c3ffb43533892db1a97d34fdb4dd2b65 +a6e483ce02311c1542e289495e52c0f1 +a2a16bbb82d3b01384c1dca4a1fb750b +66c956827a9a868e4ee801d19d54a48f +2402c732d414ec9168f286f77a27acd9 +b365260b8e74c0881774a5da8896dcb5 +aa175eb41b42d90f0189ef2f10ec3b94 +076586464e4550839eaedb31adc8fca3 +614c0f0896ac65bdaf263578cc93de1b +6b840fe43f225e93ca8942f69d6b8408 +b0a73be97de827c887873a8a4a6df08c +b13c86c17aed5193b45f573ecbc6e9c3 +55d1d8b2e9c1ae0619ad51e380372b05 +47b01c84fa49f50fa2f7dad34b19760c +61072bd07cd1bf15580c30b7b4768c1e +31692a153f15feea5cbf9718d81ce198 +a466a38bfe7a6758d1fae379e8873b98 +632bb3f4f8f1125d9c752d9c5d4d3469 +edcd6d515d39ecd83529e45e9ccd7468 +3dc73d6f215cbded5fd15907660908eb +e07e594d82ff0a64840cebfb844327f3 +683bc20bf945b9bb43bb6e089c277fed +34a88cf663afd72669a87627b0bd660a +d9acce58a9a1d381c5fc62fc3b9805a8 +1db7e7aec781566577bb3805a9dc03df +80558ec676d0350078d505e14a410403 +66862ad17b06b5187d5bfd4fd6993707 +6b9e5568eb81e3dc511c2080901bcd1e +69ab90fffc8097fa39a191602e31c3ab +e083331c12f31e5801d3fd57942f4806 +9ebdfc2f0363ff9c9978a6aee1f63700 +4980956439126aa15f6eae86f7f76bd3 +f1b1c77872fedfa775e05951f1a568ef +b0ae44041b31df34be324726e81979cb +96002a64ee94308e5635ed362cadc555 +628d0050f0804ad4c566b2e5830c1a25 
+b7e9d54e7be4700bce28617e4480887b +dd7e60671e6ac505206d812aa494ee3a +abeb96a1b9ba44a48bed927d63f74702 +1fd2f8430df3065c0e5871962c934cc5 +080afdb57cfef8f8a37cbeefdb709d0e +9abe77061623c3888428d5fb5d107602 +e4bf4235454292afe56b50704c33288c +611de5c9efc16436e15c229098f9fad4 +570a13c30e5e5692f166c9f15d6894d9 +cd6f712ef3ae8bba04c9fd8c27a7aaaf +d7d50208288b165969dd54eb4caf2258 +9d6c9b598fc0b3d40283bbb934b11550 +f86ff496a3597f11699667c0aa37ee27 +c97408aaaa62fa361d08ac869436d5ca +afcaefef38138e356c1ddee9f747b6d4 +065df351b1ff35f037ef565607ee5d6b +70c46bc9ee8c9f7e9bb5806c02505db5 +59c9cdd0d70101df057ded1840f63575 +54d8d5d6b7fdac3b62cb62fa5770b2a0 +02e429f379c3597d8f0b684782f8825e +a4e239ac21a4a3b89b0c695fd924f834 +9ed299c09c50703231142886f955da2a +0b37cdd53e196f0efc2ea63f9d1f08ff +a1c32b2f5fd0de90294b127154659c88 +53052276609692804ce100a7d3276164 +2d96038480794afbc380be2939f89a43 +f9e465be10310febd5500c2109744367 +b90f4d3f2284b0731167729f2f4bd45b +1fd77414cea05a4dea9223e2ec1db3fe +bb19e8879b32a5b3a1d5a5a96765218b +2c1a281ae96f97690e30f0c235a57b29 +01477d6fa24280ae787451071c10da70 +6f7ecf9418ca9294f5da837fb4d44906 +c31fe5d3f6fedd465e8601a5262ebd22 +59ff97d25a3c2f335555c8d39dffb7d9 +d91714c68bc836587af075514a263b4f +6145ee067f4e6e01357196b3da872e1d +5dbb3839c0bfa9a10858e6684d3ba0e1 +af726b931f7b92fbda39af6993f7d0fd +735626f1b21461197a9d47bd597b208c +0b31450c4adf2ad3c5be20240d3edc6f +b7629fceafefb9dab4be2577505b0a50 +6605cc09e22bb347b8f24461c0add60f +78885002add0b3c2c7436c811886ea46 +bd221828159a42572875b985de13db2b +c1b64a367532e0007e915a0dbf906507 +cb51f8fb53397b3f3ab5c2b6579c988b +1dea94f263efd5e8081109d555553a29 +d282eb3f9ed374326edf41d87c4c64b9 +83cbce4ec91f6efe1f7c4c214de4e3d5 +6f6ccf2d1993c21362c93bde18a9db07 +543f3636030a6117837b2bf9abdfb60f +fff70aca8cfda3ec258fab291feddb05 +c9af4dedc2ba0f766ca9ddd119dfe12d +c20c4dc12f26f6bf6980d2062d834c66 +499e878331337533f8998d9db155ebb6 +f511700ade424da2528c16237dd7097a +7ac8d1a8ecf1dd210f8ad1929603fdb1 +1fd5b2dba90176eb472eb688a4517316 +9aeb4594beeeb31f7a381db4b2729f94 +b80f13a66904050de42854f4533925dd +c14e19a22aeb3c0b899cf62d5b8c09f4 +27548701f4c732ac9de4152223d0a9a4 +0a969382d3168c508a028ed05b2ab292 +640315b1a33bdde0cd9f416becda6f54 +33ede77829bc21433257cd5599d61cd4 +929513f0051a19916bd9efb1a7dd4ac3 +f89520b173f4590b0dc13dcc698b53eb +f5eef4cf96813741443a6f337ae360a3 +fbb0138e0675f3416d5464b738af42da +b0828c41fe53c2fd1b47fe94a876b2c7 +66b0fa1f722841fdae0e0039c23d3647 +34a7716fcbc2b58075e8d9636733a734 +c535a7145d59a3bbbec82f7b83a3ce17 +f92ec82f117f89cc5a2fa27cb99ddfc1 +c13202255c72f12c3c8e82a72bf40393 +bb77c22882471a62cc18dcb9543ac4a9 +17b85dbf96f7f2567ac719f07a6c60c9 +70e543d39a390f833415d68f19900bb2 +586d419d1c701038b5feb78e4bdfbb43 +adbbca0a35a0098b3288b2f68a9ff0f3 +d41d8cd98f00b204e9800998ecf8427e +de7f13af32e2690b7b67fe73c6dc5c7e +19109f838eeb3683d6d8aaff72480caa +0da22c3b40f459eac05a440af5554eb8 +91473fbce5854b2a49787df82b0453d7 +5bb8ff1ec352df99f33cbd691d3f0f55 +e239415a8aec7cc6e64462f06063b2c6 +ab5dc18e38b217411415ccf60d6ea32f +837bf1d70b345bd3ca40775aebd71df7 +8cf9a47e9c8985099b5c2ded32ecf12c +7966d22a81b4ff60d4875bf99d62c65a +d057c221170ebd47096229309a4d5976 +28aa3f729e5a40454585d2de35903cd0 +33f52dd293909f99d277bcefac5176e7 +932cbc729f2730b35689810c100295a1 +4cc6a836ed547e64ea4e3bff562b10da +169cde1a00b51d979b108ca6e734fbdd +d5316abf64a87170d042d374ebd90562 +1f79d9915e7bbfca627c31bc47c10dd2 +b6aff79837c8f882a3467e67769c1ce0 +a73169021aa9d2640bc2fba7777c58a0 +75dcc033546de7fe833665abf49166be +520d4bdea0fdb8e4dee12619c144e99e 
+171989c4dcb9aaa339af718b09ce0646 +0bda18eb62c3c7a74e26cb5d2c31ccc1 +1d5792cd01c31f45dd9951fbbd61ca4b +0056fd5839ae34d3a3f3beb73f26bf6e +1016372feba2841bd652135b58d263e5 +6f50132a860f154ce3915b7b1746b801 +9094e12b641d39bca88384ae20f430db +e811d824c460edaa85b6aaf0806e9a26 +a23ffbc77a00fe754f4527b22e019b02 +d552c609f7324816500feb9aaf407a3f +f2f2acbb375045ca79c81e5a792a09ac +03ae6f00ecbf598b5fb3fd40ac0ff7e6 +c6235c998ce93b21b8b397299763fcb4 +ffe37889a063ff4328477d3c7ede43cc +1cdb53a654aa705204af1ba01dc7759b +d5309ff716f712f2db69056a918b57a4 +c91a254e8796407cb96f81c7a2fa6af6 +2b0c9787eea66e9d289717db28f1210c +ecf9a6b95ae48a0068cdc45c1e1e0363 +5b43def35835e05e1734db8267a69fa1 +d52c5f97a442f628c494e54127a75e2d +a0fddd5b8f0a4de59eb799fdc554842b +713b1e35ee0326e4dc7255698c8e2dbd +d4964149bdea8a8ab5e43e2ed8851d72 +9399244f1695535af187c7a67010a6bb +05d31a308bea07f435db33d3f92b9c17 +62fb48a1f0f59c4c9b91da79bb493ccd +4593abab97282d908baff8a19af0c02f +9a2fb3f5a3e2d85cb636dd6af8f34a2a +68781174612dd3c26addec54747eae0e +b60d7b8aae2b4ff8a7676ec1a037376a +7fe07829a807a464af28169ecc6fa20f +bebdf996d365358ad958ea17d292d942 +5197da010878f604f97875e4c38b2c5f +7f98e68060c9ff75e495faddf95087f2 +9d9192e4389b197654a4dbf99699af0f +090eba97b8736606165f6518cf1fc223 +523de1b9e61ab51918057f3d606cf7ef +635cc1a932cbfe8042174c372c38102d +47b530e7b57dcce328b3766b3655680c +f1bdd25f4c535e9f4d0c15043c77f819 +a233a9504ab4597133e077a651d8fb05 +2579c58bfd6c4bb81ee502f7d9881cc8 +41a0951de8e4bb57314d554e20b9161f +357cbdd3b601d1bb7489405909449ba0 +524f5366a405b9d1fd163565e6fcea71 +e7fc40e6ef3c00a24112a6bc66f6ef30 +6cfa33cea0dec8e7c50d52caf3b2e55b +8c1255cafb61be5bd75d5307482a244d +c0ed3cced9ddaebd4038d364ef8ec9cb +fb3894eb0a6080831eee6df12a561bd6 +1b0836473430b9e3fd232691d15420b0 +b3732d05b2ced8d61ac6081c0e0efe3d +f4af72f8b90658ab279b2620db2ff932 +4625e4d604e7c54df826f1b1414cffee +de575e23a50dacdd2a2c5dec02e3202f +63a515a35d20236fde949ebfb2bf5e76 +43e0b1a6487239a3b641eef7a47b7ac9 +23fe7f8cb9b7fdbdb545f225abef9bc8 +9a0ffdb92ab58fc520e75286b6d66d0c +86975b03876695307ef7832288d50300 +5ecfebe9658ae9bc77570c8118074f95 +be57445eff4012c2e219d27f065dcbd0 +b21947dd12b6ddf9501fe5e1150c65f8 +e97d5fe74489ba3a78033814b6e8178f +42a51aab000c0c3dd65fc092e03c3c1e +f2595ccaac58d972ae68e0038de5b7a8 +b794eab370e7354864994b96c048f0e3 +68a6d74346aa831fe9cd41ea5def3460 +cdc923dbca899d12ea5f62080d61f24c +5a4942c5880639fbba4152b2da648a94 +8e8f373b69d05ee646f66c2f7c3a06d8 +9eced944d84a487b7fa0977a3358a756 +1b662bd1d02432c063f284f0c749d8ea +295b10f74b6428415846d2c6fa8eadff +d69406909a2f50eff6e70c6ad1b1c193 +0bc91b8ad9146f0a90f016801c030d6c +e1c1ac2d7ed1534840f671e8ed913c34 +7f89450df860ac554b3fa68025bf9ec4 +b297c9eace48931adfff6def4b884423 +c5c5e907c0fe5f9090153ff84c8952cc +0700f9660fc0d54e5b6b2e49c3c3f44d +9c2c2fa5889c26232a6fcdf7a3af63e4 +e4272197a54c383b5732c59f67b19de1 +55b24263fc334ac12ecb48f77babbd54 +d41d8cd98f00b204e9800998ecf8427e +d41d8cd98f00b204e9800998ecf8427e +9606f92d555f097007ef2eb357e10572 +b05995c0ce8abcbaa7e74d3438e7b034 +d965d7fceba729515f02b5d084c23be3 +785aad6b0b289f0a83785508def89392 +876694b2ec9b1fe9fe0df1be994c9c94 +83477d8ad8504c2418126a8955e67789 +db0e20047594fa70e957a9f6000d2189 +a748855b5692b1671007689e1783a86d +8714d36bf87cbd04e01748d44f5096d4 +ab99500ca86c654e32ea14ebdb714290 +7b8eaf4528e7b52f804017939f36c784 +754ac6a4480cb4a8209c68eda5dbfceb +b1a3f4b1dc0d850b39d7686b8b8bf3b6 +33e024aa7239b6f6d23dac2e688ce033 +7109236e43e80305e27e24e279500004 +ccf8a25b857c3bcc796a0aab129affa8 +247b4d873606ddf50b39c760b1cb0384 
+92fd4d6cf27624455085cead5e7a77dd +8964452ba992d1009ca199bbc4a6ad2e +46e9c25b7e9aca44f1037e65d59186a1 +da8eac5d3e9499544a85c42e114d33ce +be7dcb2b59ddb149950d3912671a55b5 +a0c2a8c84654fcb3ffe8ba857d60710f +c0c5e1e8de02746056ef7f4769ba08de +307ab2b247df7d078b4a8520e54fef82 +9ab45d7906ee32dcb27279b946ff408c +ae487f2632bae7d646f06c912bc31fd3 +fed631dc0f07721a29b9597275c21f79 +f41e053483b02d9744cec8b9be13b420 +5b0f549cc4201174a9cf20ea2daa1b06 +2652e741ba4a325e2b0d0dfb3f470396 +0296687b4eb6d939a9ea54fbd403d2c2 +25eda214a60f0eab8ba7d1f14c114352 +ccd86204cef43be7c80ae150ec61e9b9 +eb532d7babc98cfaa342f3ce8dfb247f +34ce60bef579bcc1cf932f4ef488a894 +a6ceb39d70ddf88aa865bc12976d836c +ef7a0c7ae427a631e52dcdb41ac7d875 +09a03ad2f4f13cd9dc8b36ae446f0789 +390a6a203ecab6596e291899f00690c6 +6ec1f9798ba44b8724bcb863d3f01651 +3da669d8e2e31986827906121b9cc6a0 +258a683e7b3589600f536ac06701a71a +90467697e3f7866efc007263a59b0411 +5792541c66234a80ba8819ef6c6987e3 +c959097f91c1d8f077777d070c0d294a +8d35c2cc9b159e3e5156af08ca8d05c3 +154c68478d2d9cdc6faf927a9be310bd +aaa7d5428eb16185f056b71652d5ad04 +fb6b162a6fc00727bd109b176f0dcca8 +92a09caf08b59fd29a7a6ecda6defe43 +e65caa35fa5ac24d371b82bd8169dd7a +58e41af015b40169f1325b8eea6d6286 +5d6c85096055c44e224b94d28a2f492d +179a2ed672d2aad6c6977e9278187d8f +f98414071fc97514f5e55b1cb0571508 +449d2a7413121c779f397c9ea6d4707c +f315ab6d241b62bf92839c9744dccc76 +b843a207664c36d3c7d512754921f406 +e62ad608f30531831e02a6aa7ff30772 +20e9769bfb62c705598c9e7cb3114fcf +110d76c9076277de61ac0d210e00ea99 +37363223d2947e22d512af1d73caadd7 +a92a3480f9f521705438771d3cf8fe8c +c871575298e5b7c4fa4c179108ff36b0 +b48c48d47a9d1dd0971f1111bfa67843 +c82d5d566bee4894b8253cc3dabddb41 +a30ef38e14ed0508d303217996734c9b +e507b8c66775124c38668714b90e0f43 +27123a2129ccfb6264fefd9cd9b2cbb6 +ccc852c070670f0143cf51c4a5541328 +cb162c2dd562f49af187c05f39e1d6a9 +a7432d7cb59af5ecb9abd7926f603211 +d2bc46c172ff3f81c2440bbcd38dda1f +5760aada9983ab894fdd8f61932b761a +a99c856fd7d804e6471e1a2c85c45c13 +da62441dc5d06c8891798b2b5cfcf665 +c3ea7bdd9e13f8eb4c6d0331f544631e +0fe4823b800e3ae17ba390c4485d53b5 +0fe9e2f0cf97e508c7f624beb6fd9f6d +6cd0f4231972ae0660582db52ef7cc23 +b73f04635f9ec675d3b1edd253209457 +802092a88b94afadbda2e0f59fc884e4 +4fb184c96271dc1bd96f483bf87038ec +4b9f1633d056a13ca0659fcc3654d36b +2285611b7122fc937464c7ef7bef03d8 +d1a31a1894fb21625468269daefed02f +238cdf6829a3f5b8b358b59456c240b2 +52a28cbcd0f0deca50b2a580f5582288 +1c2f8222ee57d66f888b809617701970 +9a7404134c52c3f0d328fdcfc775b730 +81097ec4cd4088bc7ff4199b04600ce7 +9e88cbd4f8648835c9c7642e53fdb0ae +3bacab0c4f7304c3c1fb51e0166e131a +3937c6ac4951900197bf9bc5a649b672 +d78b2820b38bc05700e643914668719a +3ace2800560783e999f3edb53ebd7051 +ef09468a586d6877b5092956860b32fe +f059e7c4752be9ca3e4c5dd2c62f90ec +71aab94a8367756efa53fb507bc92464 +d7cee17688d254862307c0c7ec56c306 +fa284eb1f7e8bf4220acbedbbb8f7889 +32da44c8d0cb96027dfc9ea398c61949 +2bd15d94b269eda71b9661eb9b662140 +52b130fdb1dccb4b2f00d104c8b5367a +97e26b59c73539e180695f03c8baadf5 +72949d382336780d7b0115485e6952e3 +2147b385add11a789dadb9256ee9fa79 +c478aa16e61a71f2b5020fa300065db5 +903b55bd2dbf885fd3f20a8f77599991 +a071e4c1958e6ece5379b432a1e8a8be +d3e95042631a609e44ed78b5e239399f +8e8f01a12abc718e60f1896d0135a674 +6c8e176465adb250c5b828881762deab +0f809d86b3bbf8d78a784963c939e233 +3a3625db0e36ff156eda206423342eaf +82b82639e3f28f261b5ec84f6e5e089d +d79519ab0b9363cf25fba50cde7e93c4 +d1946c27d3cfea2c22e85dd3dda0dd46 +199cbae81a1e67934490162aed0f75e6 +289fbf98706bc2a1e2db76c42729ec92 
+5b14e94bb38ab3b30fce6cb68aea4c46 +38b4318954acad766eb2f5b5a1388ba9 +5933a7c35678d0c854808dac5f979b35 +6c5e47b3b013414bfcfa12006902307a +96ffa27de2d0656927e667cd252a143b +2d9077324aac51a74602565b48c1a1e3 +944b8a413921a8cc4e8614f67f218dd4 +d04e38b42164623ee6f2b26455e75166 +d67d14f0deb11de4eebe929f0ef2029f +5bd835292124416ec83fbbd988d504a2 +301c6102f3587d87355c64634ed29031 +1ba1896770c276ee7bbd26a27f9ad8b0 +9cf204f68637ff25333e775058b36ebd +3a1215c548fdbf9b147298029d23d69d +d03d31533295d185cc034e8165c91b40 +17437041817c7ca25d292bcbbff4cd06 +bc03311409b1637609e56f193bb52437 +85b016c2b54a085536948bda1bcc852e +ffdfdf006894cc5badb0b4de82da2d10 +3ca05fec9da59dd32456ea7f16bb89bd +9da14305915fe6d94fa31530bd3ed70c +60d8259a21a1ad4df15dbe1f5fafb116 +a30a535fb327cb6da1aaf8c543ad5b01 +ccaab1ea2c790c900762f06a89da6d19 +2834ed765c6488eeca427a3bea3ebabc +1f86ec033fa171563e7043862b5d8274 +70462fbb753e21e4141312827bd7389b +daa6cc1798551facd3ea6f9455f4692f +3ffba6b103d9942f79318249bcd6703c +8b5d3515c8000ec0175324a504e5afb3 +1c9c25a6a2fc40ad2edb1b0bb7296d50 +71a660785f9d022f5fcb6a7850fb0d28 +d2c5eb8753d00f6eb517ab611260f664 +4b1d0bcbf89ecb56c6a4b0a58aef45ab +4a6c0c83f1797b2fc1b818904249d1ea +f362f9f590b63ae2dc3ac88991676f3f +38f9147e82ca91ea4eb63714319589f0 +9a18842564e162c2bab52f809e47c25b +5160149eade5fe401c309eed798044c3 +67e79ba11114f0b8b33af87cf9f0bafc +bb1e6288cb1d8e4849480b8dd60cfca2 +e6e4435d9bd81d29046a30634209638c +9de0becaf147dfa128adeadd30b03b1e +c2915e53f9ad61256173d207d8937ae5 +f4f97811a94f0d364233123facd4cfa7 +d41d8cd98f00b204e9800998ecf8427e +ac40d05aaca47f430b1d6a11381760e1 +901b02d538aced8d75f8d3fafb8e5643 +e2f42f48e8e63df523372ab682aba70b +0d58b5261336320997909b0541f3963e +4ed01f4da5f374132c3d0ea77b4e83f5 +d7e243c61857ba4c92e90b82f0915b98 +d860acce014bc17e74315000e1537c41 +57a50c74faac05fca307f713df0c8120 +4a585e0c1608d498e252063c4422aa0a +1529d36d3708791cc062b23f83231dfb +526c01896f73d6c7c9d9241f0f2bb1b9 +4eb315eb65968eaf1c1531e85ff9b9f7 +7644b2d857abbe282570c2d959763ad6 +eebf01c25c4ffcae57e223c9ca0e0c6f +ed409b3eed7affd7d0e85eb3569079cc +f0491e25a733f2010f2cd2fcda065046 +bf8a8516c435c27183384cb2437c9218 +b9a3ff562a8f06cdf1e0c4f1255001d4 +7708c2feb87de5b5640caf46b7c91751 +601a4aca0434d8f38fa474e2f61335a4 +5e67c2ec738e6bcba4b8d3365c93070d +81052e29cc1f1960e9c18e1915ef0880 +0e794504e33696cbee146c6bb6dcd6bd +8005555a7f0a11e7b5b2db60f1f3e8e6 +8767b88c7cb4ee0c1e0298367a94e1de +469ce859d4eac923b1565f3a65e13070 +757efbe7709856c0c6c4dd1841049d8a +cf784bd482a4900c4c370eb0e6ef8592 +44ddbd6a1845424b2990aff2ebdcff2e +69a1323c8e30c7a9ff1cf1b92b03a6c4 +090b76538b2fdfa5780af37f0dab018b +f3b37d61806db8bf05962025f42c16c0 +2a9f7b4300aebcc472077e5623cb1b39 +ec267be4d15989c8cf05318618ec860f +fcf751c8b8ccfaddd4bf4404e0fde6fb +eb55024084478f1c80f36836643e108f +67111b2bfba5d398d4d72e54886545ff +ef607b16a2752c3222b6512ccee8db5b +510a0cf7fcce4345c38ac204bd460556 +4a9c93de716d27657ae974b3bfb49e80 +297c227676bdc36c0d3839dc5dccca6c +697fe65d2cefb43a95495a8a1d1d4cef +fb61836a1f95bbc0056c7c9a59376b4a +bac8638095ac13a3f92fc3951031fbaf +af540ad083ecd6049687e7faf7d2d64b +9f0b7192955754ba7dc61296b12aa2a7 +39a01119ebeae58778c84ef3758fb9b2 +f325c5c5aedf65f683f7c47e03ded2a9 +90569b12e82e38adf8f400fb425a4199 +f952ce4eeff1bc17d9ae86010e21eb47 +5c543123387bcabecb5456a7128d8427 +f188698cdf956bad10162533e834fbbf +5471e2da26ec0521990e6d661261df11 +1c35e8c1b3bb6cd97708df65b7cf71f6 +0dc05b06dd3273b9edc09f1eae772288 +a19ac8b12f45f9971ece25df32e7f234 +f4e50395682d5039865626b1a36ef426 +a1c81776747bfedeebb591500055825b 
+d8ff984b4fe0d83043c3d7df72a73d9d +6bfc3320236a00cdd870cc3a154cf127 +0835ef87884c61b40c245b7d24a3c6c3 +8ed826d700f17d93e4752104c5c7c67f +9fa18db42b69d4f928666b04d7c1e46f +b21e3d7d6ce950303575ee8281f2e33f +609aa1579363fadbd9c997362a77a9af +b8cd846d6314a157941715f3418374db +9a8996869bfeaa41b5032a315624c399 +e1a1b884e57abacef28631dde6f9980c +f562ba071a3789c3a45bc8a67e229dae +724a46c9276049dab3d4308768a9d128 +8744e2d11e699a37360128ab106d23dd +c02191e1b99eb5993d7009ac89aed319 +191f97833ddfa2154865bd9a612ab858 +2665237823701a7a91f0b56b4f8d4eeb +ca1d149955260cf30375ec5b6fd0bb7d +1aa9f95d49a62b6f765f94f0a316ef8f +37b3af472dae58ee9fc810c6e9c6dd24 +1ce2bb1489f6f3d885af1dee6a0e6583 +9a358bb905499040cb2e472fa56ed99e +6bf2435d50bfb89abd29e9906d682bec +c0350541b80011142995712db7fce328 +f49cddd7221862aaa400af782c04d5e3 +08a0d0f322a6cf76ddb362bacd98b5a3 +d271843b455653e59dfa14e2d9cadfb6 +839c81f7b9d78941741aaf34d340bd3e +113c25f46e52c909f547cf932a49e2bf +d333223b24397e4ee08540ebedf8833f +de64f5c11c651eaf91005ea9926a0d96 +155add75bc9193afa2e9640914850da3 +3d5be6fda23de7922c927cd9fed6e5f9 +258c4898afc1031e814f87dd896c042f +5c813b2a04d0e7cd4f7a565e73079df0 +72e993ca6f3ed1d74c665f5115c242ff +d41d8cd98f00b204e9800998ecf8427e +8b9f265c101ef2add4059a342224f8c4 +6fdacf56f06aadfae0d15b6738d56a73 +79139f3d255378bc99e1582121f50997 +c5045450c8490f80146d515d54094776 +64be8f8c8780557f146e1dc2d2173876 +a6fc64959bd7c8ac36ce953539c44ff6 +9d31616df696a928ef6798029e982acb +82e4b351b0273e024c0863eca59fe1c9 +a7049d2f9156db6224bbe4b0bc0c28a6 +fad9fd5a171cb03ab5dc60aa3464bc76 +c9bf34ad489902f1174dd8d3d5f54e19 +7f7588c5f16f2567dbca798e8e46af6e +0017c2ecfcc9e03326bd56f6ae0db6c5 +3e56a9eb19203208405d2541cf5b200e +3882b53832d62d5097f13441e58cd68c +0fa91f4c38ea4b1ab2bbf748401ab7b4 +2ed1e1d4237f44ad3721b2643ec8f6fc +8e4d5c1025781c0c78be3f54d4b62fb9 +531c8adb5c469cb4ffd50e8e64af3255 +cae3fe00abdbcf66f3785ceae8d636b4 +fe2a5761460503efdcb123875052a19c +2354121b94eaf74da5b3507d20c7ab51 +90a62b98314aaa2d3f0ff52b2e7930c5 +cb509edc432a2c8596d7d708b6e9062d +5fad5f595d1cb63430ea7b76c1ebee4c +b656cba1999f7759b55d2ba926b4ffa8 +b5656fa0bd9f9d5d5bc110dab997e333 diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval_varlength.chk b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval_varlength.chk new file mode 100644 index 000000000..e9eb66348 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/eval_varlength.chk @@ -0,0 +1 @@ +part_eval_10k.hdf5 611d8bae26646145e1c33338a27ba124 diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/hdf5_md5.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/hdf5_md5.py new file mode 100644 index 000000000..66345add3 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/hdf5_md5.py @@ -0,0 +1,29 @@ +import argparse +import h5py +import numpy as np +import hashlib +import os + +# Exmaple usage: +# python3 tfrecord_md5sum.py --input_tfrecord=eval_10k --output_md5sum=eval_shard.md5 + +parser = argparse.ArgumentParser( + description="HDF5 variable length to MD5sums for BERT.") +parser.add_argument( + '--input_hdf5', + type=str, + required=True, + help='Input tfrecord path') +args = parser.parse_args() + + +if __name__ == '__main__': + + h = hashlib.md5 + +row_sums=[] +f = h5py.File(args.input_hdf5, 'r') +for i in range(f['input_ids'].shape[0]): + row_sums.append(h(str(f['input_ids'][i].tolist()).encode('utf-8')).hexdigest()) +f.close() 
+print("{}\t{}".format(os.path.basename(args.input_hdf5), h(str(row_sums).encode('utf-8')).hexdigest())) \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/parallel_create_hdf5.sh b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/parallel_create_hdf5.sh new file mode 100644 index 000000000..0a59c447c --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/parallel_create_hdf5.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CPUS=$( ls -d /sys/devices/system/cpu/cpu[[:digit:]]* | wc -w ) +CPUS=$((CPUS / 2)) +echo "Using ${CPUS} CPU cores" + +function usage() +{ + cat << HEREDOC + + Usage: $progname [-i|--inputdir PATH -o|--outputdir PATH -v|--vocab VOCAB-PATH] [-h|--help TIME_STR] + + optional arguments: + -h, --help show this help message and exit + -o, --outputdir PATH pass in a localization of resulting dataset + -i, --inputdir PATH pass in a localization of resulting hdf5 files + -v, --vocab PATH pass in exact path to vocabulary file + +HEREDOC +} + + +#parse passed arguments +while [[ $# -gt 0 ]]; do + key="$1" + + case $key in + -h|--help) + usage + exit 0 + ;; + -o|--outputdir) + OUTPUTDIR="$2" + shift # past argument + shift # past value + ;; + -i|--inputdir) + INPUTDIR="$2" + shift + shift + ;; + -v|--vocab) + VOCAB="$2" + shift + shift + ;; + *) # unknown option + usage + exit 1 + ;; + esac +done + +# get script reference directory +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + + +mkdir -p ${OUTPUTDIR} +find -L ${INPUTDIR} -name "part-00*" | xargs --max-args=1 --max-procs=${CPUS} -I{} bash ${SCRIPT_DIR}/create_pretraining_data_wrapper.sh {} ${OUTPUTDIR} ${VOCAB} + +### If continue, you can try instead of line above something like line below to pick only the files not yet computed +# comm -3 <(ls -1 ${INPUTDIR}/) <(ls -1 ${OUTPUTDIR} | sed 's/\.hdf5$//') | grep -e "^part" | xargs --max-args=1 --max-procs=${CPUS} -I{} ${SCRIPT_DIR}/create_pretraining_data_wrapper.sh ${INPUTDIR}/{} ${OUTPUTDIR} ${VOCAB} + diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples.py new file mode 100644 index 000000000..b4cab2ed2 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples.py @@ -0,0 +1,83 @@ +"""Script for picking certain number of samples. 
+""" + +import argparse +import time +import logging +import collections +import h5py +import numpy as np + +parser = argparse.ArgumentParser( + description="Eval sample picker for BERT.") +parser.add_argument( + '--input_hdf5_file', + type=str, + default='', + help='Input hdf5_file path') +parser.add_argument( + '--output_hdf5_file', + type=str, + default='', + help='Output hdf5_file path') +parser.add_argument( + '--num_examples_to_pick', + type=int, + default=10000, + help='Number of examples to pick') +parser.add_argument( + '--max_seq_length', + type=int, + default=512, + help='The maximum number of tokens within a sequence.') +parser.add_argument( + '--max_predictions_per_seq', + type=int, + default=76, + help='The maximum number of predictions within a sequence.') +args = parser.parse_args() + +max_seq_length = args.max_seq_length +max_predictions_per_seq = args.max_predictions_per_seq +logging.basicConfig(level=logging.INFO) + +if __name__ == '__main__': + tic = time.time() + h5_ifile = h5py.File(args.input_hdf5_file, 'r') + num_examples = h5_ifile.get('next_sentence_labels').shape[0] + + input_ids = np.zeros([args.num_examples_to_pick, max_seq_length], dtype="int16") + input_mask = np.zeros([args.num_examples_to_pick, max_seq_length], dtype="int8") + segment_ids = np.zeros([args.num_examples_to_pick, max_seq_length], dtype="int8") + masked_lm_positions = np.zeros([args.num_examples_to_pick, max_predictions_per_seq], dtype="int16") + masked_lm_ids = np.zeros([args.num_examples_to_pick, max_predictions_per_seq], dtype="int16") + next_sentence_labels = np.zeros(args.num_examples_to_pick, dtype="int8") + +# hdf5_compression_method = "gzip" + hdf5_compression_method = None + i = 0 + pick_ratio = num_examples / args.num_examples_to_pick + num_examples_picked = 0 + for i in range(args.num_examples_to_pick): + idx = int(i * pick_ratio) + input_ids[i,:] = h5_ifile['input_ids'][idx,:] + input_mask[i,:] = h5_ifile['input_mask'][idx,:] + segment_ids[i,:] = h5_ifile['segment_ids'][idx,:] + masked_lm_positions[i,:] = h5_ifile['masked_lm_positions'][idx,:] + masked_lm_ids[i,:] = h5_ifile['masked_lm_ids'][idx,:] + next_sentence_labels[i] = h5_ifile['next_sentence_labels'][idx] + num_examples_picked += 1 + + h5_writer = h5py.File(args.output_hdf5_file+".hdf5", 'w') + h5_writer.create_dataset('input_ids', data=input_ids, dtype='i2', compression=hdf5_compression_method) + h5_writer.create_dataset('input_mask', data=input_mask, dtype='i1', compression=hdf5_compression_method) + h5_writer.create_dataset('segment_ids', data=segment_ids, dtype='i1', compression=hdf5_compression_method) + h5_writer.create_dataset('masked_lm_positions', data=masked_lm_positions, dtype='i2', compression=hdf5_compression_method) + h5_writer.create_dataset('masked_lm_ids', data=masked_lm_ids, dtype='i2', compression=hdf5_compression_method) + h5_writer.create_dataset('next_sentence_labels', data=next_sentence_labels, dtype='i1', compression=hdf5_compression_method) + h5_writer.flush() + h5_writer.close() + + toc = time.time() + logging.info("Picked %d examples out of %d samples in %.2f sec", + args.num_examples_to_pick, num_examples, toc - tic) diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples_varlength.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples_varlength.py new file mode 100644 index 000000000..380f101b0 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/pick_eval_samples_varlength.py @@ -0,0 +1,76 @@ 
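+# NOTE (editorial addition, not part of the upstream reference): this variant
+# differs from pick_eval_samples.py above in that it writes variable-length
+# (h5py vlen) datasets instead of fixed 512-token rows. Each picked example is
+# truncated to its real length via input_mask, the masked_lm_* arrays are
+# truncated to their non-zero entries, and input_mask itself is dropped because
+# the sequence length becomes implicit. Both scripts pick every
+# int(i * num_examples / num_examples_to_pick)-th example from the input file.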
+"""Script for picking certain number of samples. +""" + +import argparse +import time +import logging +import collections +import h5py +import numpy as np + +parser = argparse.ArgumentParser( + description="Eval sample picker for BERT.") +parser.add_argument( + '--input_hdf5_file', + type=str, + default='', + help='Input hdf5_file path') +parser.add_argument( + '--output_hdf5_file', + type=str, + default='', + help='Output hdf5_file path') +parser.add_argument( + '--num_examples_to_pick', + type=int, + default=10000, + help='Number of examples to pick') +parser.add_argument( + '--max_seq_length', + type=int, + default=512, + help='The maximum number of tokens within a sequence.') +parser.add_argument( + '--max_predictions_per_seq', + type=int, + default=76, + help='The maximum number of predictions within a sequence.') +args = parser.parse_args() + +max_seq_length = args.max_seq_length +max_predictions_per_seq = args.max_predictions_per_seq +logging.basicConfig(level=logging.INFO) + +if __name__ == '__main__': + tic = time.time() + h5_ifile = h5py.File(args.input_hdf5_file, 'r') + num_examples = h5_ifile.get('next_sentence_labels').shape[0] + +# hdf5_compression_method = "gzip" + hdf5_compression_method = None + + h5_writer = h5py.File(args.output_hdf5_file+".hdf5", 'w') + input_ids = h5_writer.create_dataset('input_ids', (args.num_examples_to_pick,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + segment_ids = h5_writer.create_dataset('segment_ids', (args.num_examples_to_pick,), dtype=h5py.vlen_dtype(np.dtype('int8')), compression=hdf5_compression_method) + masked_lm_positions = h5_writer.create_dataset('masked_lm_positions', (args.num_examples_to_pick,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + masked_lm_ids = h5_writer.create_dataset('masked_lm_ids', (args.num_examples_to_pick,), dtype=h5py.vlen_dtype(np.dtype('int16')), compression=hdf5_compression_method) + next_sentence_labels = h5_writer.create_dataset('next_sentence_labels', data=np.zeros(args.num_examples_to_pick, dtype="int8"), dtype='i1', compression=hdf5_compression_method) + + i = 0 + pick_ratio = num_examples / args.num_examples_to_pick + num_examples_picked = 0 + for i in range(args.num_examples_to_pick): + idx = int(i * pick_ratio) + input_ids[i] = h5_ifile['input_ids'][idx, :sum(h5_ifile['input_mask'][idx])] + segment_ids[i] = h5_ifile['segment_ids'][idx, :sum(h5_ifile['input_mask'][idx])] + masked_lm_positions[i] = h5_ifile['masked_lm_positions'][idx, :sum(h5_ifile['masked_lm_positions'][idx]!=0)] + masked_lm_ids[i] = h5_ifile['masked_lm_ids'][idx, :sum(h5_ifile['masked_lm_positions'][idx]!=0)] + next_sentence_labels[i] = h5_ifile['next_sentence_labels'][idx] + num_examples_picked += 1 + + h5_writer.flush() + h5_writer.close() + + toc = time.time() + logging.info("Picked %d examples out of %d samples in %.2f sec", + args.num_examples_to_pick, num_examples, toc - tic) \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/prepare_data.sh b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/prepare_data.sh new file mode 100644 index 000000000..e88ad4d03 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/prepare_data.sh @@ -0,0 +1,167 @@ +#!/bin/bash + +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. 
+ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +function usage() +{ + cat << HEREDOC + + Usage: $progname [-o|--outputdir PATH] [-h|--help TIME_STR] + + optional arguments: + -h, --help show this help message and exit + -o, --outputdir PATH pass in a localization of resulting dataset + -s, --skip-download skip downloading raw files from GDrive (assuming it already has been done) + -p, --shards number of resulting shards. For small scales (less than 256 nodes) use 2048. For sacles >256 4320 is recommended (default 4320) + +HEREDOC +} + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +#if no arguments passed +DATADIR=/workspace/bert_data +SKIP=0 +SHARDS=4320 + +#parse passed arguments +while [[ $# -gt 0 ]]; do + key="$1" + + case $key in + -h|--help) + usage + exit 0 + ;; + -o|--outputdir) + DATADIR="$2" + shift # past argument + shift # past value + ;; + -p|--shards) + SHARDS="$2" + shift # past argument + shift # past value + ;; + -s|--skip-download) + SKIP=1 + shift + ;; + *) # unknown option + usage + exit 1 + ;; + esac +done + + +echo "Preparing Mlperf BERT dataset in ${DATADIR}" +mkdir -p ${DATADIR} + +if (( SKIP==0 )) ; then + + mkdir -p ${DATADIR}/phase1 && cd ${DATADIR}/phase1 + ### Download + # bert_config.json + gdown https://drive.google.com/uc?id=1fbGClQMi2CoMv7fwrwTC5YYPooQBdcFW + # vocab.txt + gdown https://drive.google.com/uc?id=1USK108J6hMM_d27xCHi738qBL8_BT1u1 + + ### Download dataset + mkdir -p ${DATADIR}/download && cd ${DATADIR}/download + # md5 sums + gdown https://drive.google.com/uc?id=1tmMgLwoBvbEJEHXh77sqrXYw5RpqT8R_ + # processed chunks + gdown https://drive.google.com/uc?id=14xV2OUGSQDG_yDBrmbSdcDC-QGeqpfs_ + # unpack results and verify md5sums + tar -xzf results_text.tar.gz && (cd results4 && md5sum --check ../bert_reference_results_text_md5.txt) + + + ### Download TF1 checkpoint + mkdir -p ${DATADIR}/phase1 && cd ${DATADIR}/phase1 + # model.ckpt-28252.data-00000-of-00001 + gdown https://drive.google.com/uc?id=1chiTBljF0Eh1U5pKs6ureVHgSbtU8OG_ + # model.ckpt-28252.index + gdown https://drive.google.com/uc?id=1Q47V3K3jFRkbJ2zGCrKkKk-n0fvMZsa0 + # model.ckpt-28252.meta + gdown https://drive.google.com/uc?id=1vAcVmXSLsLeQ1q7gvHnQUSth5W_f_pwv + + cd ${DATADIR} + +fi + +### Create HDF5 files for training +mkdir -p ${DATADIR}/hdf5/training +bash ${SCRIPT_DIR}/parallel_create_hdf5.sh -i ${DATADIR}/download/results4 -o ${DATADIR}/hdf5/training -v ${DATADIR}/phase1/vocab.txt + +### Chop HDF5 files into chunks +python3 ${SCRIPT_DIR}/chop_hdf5_files.py \ + --num_shards ${SHARDS} \ + --input_hdf5_dir ${DATADIR}/hdf5/training \ + --output_hdf5_dir ${DATADIR}/hdf5/training-${SHARDS} + +### Convert fixed length to variable length format +mkdir -p ${DATADIR}/hdf5/training-${SHARDS}/hdf5_${SHARDS}_shards_varlength +CPUS=$( ls -d /sys/devices/system/cpu/cpu[[:digit:]]* | wc -w ) +CPUS=$((CPUS / 2)) +ls -1 ${DATADIR}/hdf5/training-${SHARDS}/hdf5_${SHARDS}_shards_uncompressed | \ + xargs --max-args=1 --max-procs=${CPUS} -I{} python3 ${SCRIPT_DIR}/convert_fixed2variable.py \ + 
--input_hdf5_file ${DATADIR}/hdf5/training-${SHARDS}/hdf5_${SHARDS}_shards_uncompressed/{} \ + --output_hdf5_file ${DATADIR}/hdf5/training-${SHARDS}/hdf5_${SHARDS}_shards_varlength/{} + +### Create full HDF5 files for evaluation +mkdir -p ${DATADIR}/hdf5/eval +python3 ${SCRIPT_DIR}/create_pretraining_data.py \ + --input_file=${DATADIR}/download/results4/eval.txt \ + --output_file=${DATADIR}/hdf5/eval/eval_all \ + --vocab_file=${DATADIR}/phase1/vocab.txt \ + --do_lower_case=True \ + --max_seq_length=512 \ + --max_predictions_per_seq=76 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=10 + +### pick 10k samples for evaluation +python3 ${SCRIPT_DIR}/pick_eval_samples.py \ + --input_hdf5_file=${DATADIR}/hdf5/eval/eval_all.hdf5 \ + --output_hdf5_file=${DATADIR}/hdf5/eval/part_eval_10k \ + --num_examples_to_pick=10000 + +### Convert fixed length to variable length format +mkdir -p ${DATADIR}/hdf5/eval_varlength +python3 ${SCRIPT_DIR}/convert_fixed2variable.py --input_hdf5_file ${DATADIR}/hdf5/eval/part_eval_10k.hdf5 \ + --output_hdf5_file ${DATADIR}/hdf5/eval_varlength/part_eval_10k.hdf5 + +### Convert Tensorflow checkpoint to Pytorch one +python3 ${SCRIPT_DIR}/convert_tf_checkpoint.py \ + --tf_checkpoint ${DATADIR}/phase1/model.ckpt-28252 \ + --bert_config_path ${DATADIR}/phase1/bert_config.json \ + --output_checkpoint ${DATADIR}/phase1/model.ckpt-28252.pt + + +### +ln -s ${DATADIR}/phase1/model.ckpt-28252.pt ${DATADIR}/model.ckpt-28252.pt +ln -s ${DATADIR}/phase1/bert_config.json ${DATADIR}/bert_config.json +mkdir ${DATADIR}/eval_set_uncompressed +ln -s ${DATADIR}/hdf5/eval/part_eval_10k.hdf5 ${DATADIR}/eval_set_uncompressed/ +ln -s ${DATADIR}/hdf5/training-2048/hdf5_2048_shards_uncompressed ${DATADIR}/2048_shards_uncompressed + +### Example of how to generate checksums to verify correctness of the process +# for i in `seq -w 0000 04319`; do +# python ${SCRIPT_DIR}/hdf5_md5.py \ +# --input_hdf5 ${DATADIR}/hdf5/training-${SHARDS}/hdf5_${SHARDS}_shards_varlength/part_${i}_of_04320.hdf5 +# done | tee 4320_shards_varlength.chk \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/process_wiki.sh b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/process_wiki.sh new file mode 100644 index 000000000..c2b7da9c4 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/process_wiki.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# invocation script to cleanup the wiki dataset +# Usage: ./process_wiki.sh +# example: ./process_wiki.sh 'sample_data/wiki_??' 
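+# NOTE (editorial addition, not part of the upstream script): the doc-tag cleanup
+# and sentence-segmentation steps below are commented out and assumed to have been
+# run already, so the input files are expected to already carry the '.3' suffix
+# consumed by seperate_test_set.py.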
+# The resulting files will be placed in ./results
+
+inputs=$1
+
+pip install nltk
+
+# Remove doc tag and title
+# python ./cleanup_file.py --data=$inputs --output_suffix='.1'
+
+# Further clean up files
+# for f in ${inputs}; do
+#   ./clean.sh ${f}.1 ${f}.2
+# done
+
+# Sentence segmentation
+# python ./do_sentence_segmentation.py --data=$inputs --input_suffix='.2' --output_suffix='.3'
+
+mkdir -p ./results
+
+# Train/Eval separation
+python ./seperate_test_set.py --data=$inputs --input_suffix='.3' --output_suffix='.4' --num_test_articles=10000 --test_output='./results/eval'
+
+## Choose file size method or number of packages by uncommenting only one of the following do_gather options
+# Gather into fixed size packages
+python ./do_gather.py --data=$inputs --input_suffix='.4' --block_size=26.92 --out_dir='./results'
+
+# Gather into fixed number of packages
+#NUM_PACKAGES=512
+#python ./do_gather.py --data=$inputs --input_suffix='.3' --num_outputs=$NUM_PACKAGES --out_dir='./results'
diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/seperate_test_set.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/seperate_test_set.py
new file mode 100644
index 000000000..eb4192ee0
--- /dev/null
+++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/seperate_test_set.py
@@ -0,0 +1,120 @@
+"""Script for separating training and test sets.
+"""
+import argparse
+import glob
+import io
+import logging
+import multiprocessing
+import os
+import time
+import random
+import hashlib
+
+parser = argparse.ArgumentParser(
+    description='Training and test sets separator for BERT.')
+parser.add_argument(
+    '--data',
+    type=str,
+    default='./*/*.compact',
+    help='Input files. Default is "./*/*.compact"')
+parser.add_argument(
+    '--input_suffix',
+    type=str,
+    default='.3',
+    help='Suffix for input files. Default is ".3"')
+parser.add_argument(
+    '--output_suffix',
+    type=str,
+    default='.4',
+    help='Suffix for output training files. Default is ".4"')
+parser.add_argument(
+    '--nworker',
+    type=int,
+    default=72,
+    help='Number of workers for parallel processing.')
+parser.add_argument(
+    '--seed',
+    type=int,
+    default=12345,
+    help='Seed for randomization. Default is 12345.')
+parser.add_argument(
+    '--num_test_articles',
+    type=int,
+    default=10000,
+    help='Number of articles withheld in test set. Default is 10k.')
+parser.add_argument(
+    '--test_output',
+    type=str,
+    default='./results/eval',
+    help='Prefix for test set output. txt and md5 extensions will be added.')
+args = parser.parse_args()
+
+# arguments
+input_files = sorted(glob.glob(os.path.expanduser(args.data)))
+num_files = len(input_files)
+num_workers = args.nworker
+logging.basicConfig(level=logging.INFO)
+logging.info('Number of input files to process = %d', num_files)
+# test_articles_in_files = [[] for _ in range(num_files)]
+
+def process_one_file(file_id):
+    """Separating train and eval data, for one file."""
+    one_input = input_files[file_id]
+    input_filename = one_input + args.input_suffix
+    output_filename = one_input + args.output_suffix
+    num_articles = 0
+    num_tests = int((file_id+1) * args.num_test_articles * 1.0 / num_files) \
+        - int(file_id * args.num_test_articles * 1.0 / num_files)
+    file_seed = args.seed + file_id * 13
+    rng = random.Random(file_seed)
+    test_articles = []
+
+    with io.open(input_filename, 'r', encoding='utf-8', newline='\n') as fin:
+        with io.open(output_filename, 'w', encoding='utf-8', newline='\n') as fout:
+            lines = fin.read()
+            articles = lines.split('\n\n')
+            num_articles = len(articles)
+            test_article_ids = []
+            while len(test_article_ids) < num_tests:
+                new_id = int(rng.random() * num_articles)
+                if new_id in test_article_ids:
+                    continue
+                test_article_ids.append(new_id)
+
+            for i in range(num_articles):
+                article = articles[i]
+                if i in test_article_ids:
+                    # test_articles_in_files[file_id].append(article)
+                    test_articles.append(article)
+                else:
+                    fout.write(article)
+                    fout.write('\n\n')
+
+    logging.info('Processed %s => %s, %d of %d articles picked into test set. %s',
+                 input_filename, output_filename, num_tests, num_articles,
+                 test_article_ids)
+    return test_articles
+
+
+if __name__ == '__main__':
+    tic = time.time()
+    p = multiprocessing.Pool(num_workers)
+    file_ids = range(num_files)
+    test_articles_in_files = p.map(process_one_file, file_ids)
+    toc = time.time()
+    logging.info('Processed %s (%d files) in %.2f sec',
+                 args.data, num_files, toc - tic)
+
+    output_filename = args.test_output + '.txt'
+    hash_filename = args.test_output + '.md5'
+    with io.open(output_filename, 'w', encoding='utf-8', newline='\n') as fout:
+        with io.open(hash_filename, 'w', encoding='utf-8', newline='\n') as hashout:
+            for f in test_articles_in_files:
+                for article in f:
+                    fout.write(article)
+                    fout.write('\n\n')
+
+                    article_hash = hashlib.md5(article.rstrip().encode('utf-8')).hexdigest()
+                    hashout.write(article_hash)
+                    hashout.write('\n')
+
diff --git a/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/tokenization.py b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/tokenization.py
new file mode 100644
index 000000000..4beb5b35c
--- /dev/null
+++ b/nlp/language_model/bert_sample/pytorch/base/data_preprocessing/tokenization.py
@@ -0,0 +1,413 @@
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import unicodedata
+
+from absl import flags
+import six
+import tensorflow.compat.v1 as tf
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_bool(
+    "preserve_unused_tokens", False,
+    "If True, Wordpiece tokenization will not be applied to words in the vocab."
+) + +_UNUSED_TOKEN_RE = re.compile("^\\[unused\\d+\\]$") + + +def preserve_token(token, vocab): + """Returns True if the token should forgo tokenization and be preserved.""" + if not FLAGS.preserve_unused_tokens: + return False + if token not in vocab: + return False + return bool(_UNUSED_TOKEN_RE.search(token)) + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + if token not in vocab: + vocab[token] = len(vocab) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, vocab=self.vocab) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + if preserve_token(token, self.vocab): + split_tokens.append(token) + continue + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, vocab=tuple()): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + vocab: A container of tokens to not mutate during tokenization. + """ + self.do_lower_case = do_lower_case + self.vocab = vocab + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if preserve_token(token, self.vocab): + split_tokens.append(token) + continue + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/nlp/language_model/bert_sample/pytorch/base/dataloaders/__init__.py b/nlp/language_model/bert_sample/pytorch/base/dataloaders/__init__.py new file mode 100644 index 000000000..7c351dd26 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/dataloaders/__init__.py @@ -0,0 +1,3 @@ +from .dataset import PretrainingDataset +from .dataloader import create_train_dataloader, create_eval_dataloader +from .dataloader import WorkerInitializer \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/dataloaders/dataloader.py b/nlp/language_model/bert_sample/pytorch/base/dataloaders/dataloader.py new file mode 100644 index 000000000..0331b50d5 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/dataloaders/dataloader.py @@ -0,0 +1,235 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
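+# NOTE (editorial sketch, not part of the upstream reference): this module defines
+# create_train_dataloader / create_eval_dataloader and the PretrainingDataloaders
+# class below, which partitions the "part_*" HDF5 training shards across ranks and
+# can prefetch the next shard's DataLoader through a ProcessPoolExecutor.
+# Illustrative usage, with a hypothetical shard directory and arbitrary values:
+#
+#     from concurrent.futures import ProcessPoolExecutor
+#     loaders = PretrainingDataloaders(
+#         train_dir="/data/2048_shards_uncompressed",  # hypothetical path
+#         max_predictions_per_seq=76, batch_size=8, seed=42,
+#         pool=ProcessPoolExecutor(max_workers=1))
+#     loaders.set_epoch(0)  # shuffles and partitions the file list for this rank
+#     for shard_idx, batch_idx, batch in loaders.iter_batchs():
+#         input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch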
+ + +import math +import os +import random +from concurrent.futures import ProcessPoolExecutor, Future +from typing import Union, Tuple, Any, List + +import numpy as np +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, ConcatDataset +from torch.utils.data.distributed import DistributedSampler + +import utils +from .dataset import PretrainingDataset + + +def get_sampler(dataset, sampler_type): + return dict( + random=RandomSampler, + sequential=SequentialSampler, + distributed=DistributedSampler + )[sampler_type.lower()](dataset) + + +class WorkerInitializer(object): + + _instance = None + + def __init__(self, seed): + self.seed = seed + + def __call__(self, idx): + np.random.seed(seed=self.seed + idx) + random.seed(self.seed + idx) + + @classmethod + def default(cls, seed=0): + if cls._instance is None: + cls._instance = cls(seed) + return cls._instance + + +# sampler: Random | Sequential | Distributed +def create_train_dataloader( + dataset, + batch_size, + worker_init_fn: WorkerInitializer=None, + sampler_type='Random', + pin_memory=True +): + if worker_init_fn is None: + worker_init_fn = WorkerInitializer.default() + sampler = get_sampler(dataset, sampler_type) + dataloader = DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=0 if batch_size <= 8 else 4, + worker_init_fn=worker_init_fn, + pin_memory=pin_memory, + ) + + return dataloader + + +def create_eval_dataloader(eval_dir, eval_batch_size, max_predictions_per_seq, num_eval_examples, worker_init_fn): + eval_data = [] + for eval_file in sorted(os.listdir(eval_dir)): + eval_file_path = os.path.join(eval_dir, eval_file) + if os.path.isfile(eval_file_path) and 'part' in eval_file_path: + eval_data.extend(PretrainingDataset(eval_file_path, max_pred_length=max_predictions_per_seq)) + if len(eval_data) > num_eval_examples: + eval_data = eval_data[:num_eval_examples] + break + + if torch.distributed.is_initialized(): + chunk_size = num_eval_examples // torch.distributed.get_world_size() + eval_sampler = DistributedSampler(eval_data, shuffle=False) + else: + chunk_size = num_eval_examples + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size, + num_workers=0 if min(chunk_size, eval_batch_size) <= 10 else 4, + worker_init_fn=worker_init_fn, pin_memory=True) + return eval_dataloader + + +class PretrainingDataloaders: + + def __init__(self, train_dir: str, + max_predictions_per_seq: int, + batch_size: int=2, + shuffle: bool=True, + seed: Union[int, list]=0, + num_replicas: int=None, + rank: int=None, + num_files_per_iter: int=1, + worker_init: WorkerInitializer=None, + pin_memory: bool=True, + pool: ProcessPoolExecutor=None): + self.train_dir = train_dir + self.max_predictions_per_seq = max_predictions_per_seq + self.batch_size = batch_size + self.shuffle = shuffle + self.seed = seed + self.num_files_per_iter = num_files_per_iter + self.worker_init = worker_init + self.pin_memory = pin_memory + + self.files = self.get_files() + self.num_files = len(self.files) + + if num_replicas is None: + if dist.is_initialized(): + num_replicas = dist.get_world_size() + else: + num_replicas = 1 + self.num_replicas = num_replicas + + if rank is None: + rank = utils.get_rank() + self.rank = rank + + self.num_files_per_replica = int(math.ceil(self.num_files / self.num_replicas)) + self.total_files = self.num_files_per_replica * self.num_replicas + + self.files_per_replica: 
List[str] = None + + # Prefetch dataloader + self.pool = pool + self.prefetched_dataloader_future: Future = None + + self.sub_dataloader: DataLoader = None + + def get_seed(self, epoch=0): + if isinstance(self.seed, (tuple, list)): + # print("self.seed len: ", len(self.seed)) + # print("epoch: ", epoch) + # length = len(self.seed) + # return self.seed[epoch % length] + return self.seed[epoch] + + return self.seed + epoch + + def get_files(self): + join = os.path.join + files = [join(self.train_dir, f) for f in os.listdir(self.train_dir) if + os.path.isfile(join(self.train_dir, f)) and 'part' in f] + + files.sort() + + return files + + def set_epoch(self, epoch): + if self.shuffle: + random.Random(self.get_seed(epoch)).shuffle(self.files) + + files_per_replica = self.files[self.rank: self.total_files: self.num_replicas] + padding_size = self.num_files_per_replica - len(files_per_replica) + if padding_size > 0: + files_per_replica = files_per_replica + self.files[: padding_size] + self.files_per_replica = files_per_replica + + @staticmethod + def next_dataloader(idx: int, max_predictions_per_seq: int, + files_per_replica: List, num_files_per_iter: int, + batch_size: int, shuffle: bool, + worker_init: WorkerInitializer, pin_memory: bool): + # print("files_per_replica = ", files_per_replica) + # print(idx, num_files_per_iter) + files_per_iter = files_per_replica[idx * num_files_per_iter: (idx + 1) * num_files_per_iter] + datasets = [] + for file in files_per_iter: + datasets.append(PretrainingDataset(file, max_predictions_per_seq)) + + datasets = ConcatDataset(datasets) + sampler_type = "Random" if shuffle else "Sequential" + return create_train_dataloader( + datasets, batch_size, worker_init, + sampler_type=sampler_type, pin_memory=pin_memory + ) + + def iter_batchs(self) -> Tuple[int, int, Any]: + for dataloader_idx, sub_dataloader in enumerate(self): + for batch_idx, batch in enumerate(sub_dataloader): + yield dataloader_idx, batch_idx, batch + + def __iter__(self): + self._next_index = 0 + self._num_iters = int(math.ceil(self.num_files_per_replica / self.num_files_per_iter)) + return self + + def __next__(self) -> DataLoader: + if self._next_index < self._num_iters: + next_dataloader_args = dict( + max_predictions_per_seq=self.max_predictions_per_seq, + files_per_replica=self.files_per_replica, + num_files_per_iter=self.num_files_per_iter, + batch_size=self.batch_size, + shuffle=self.shuffle, + worker_init=self.worker_init, + pin_memory=self.pin_memory + ) + if (self._next_index + 1) < len(self.files_per_replica): + self.prefetch_dataloader(self._next_index + 1, **next_dataloader_args) + if self._next_index == 0 or self.pool is None: + data = self.next_dataloader( + idx=self._next_index, + **next_dataloader_args + ) + else: + data = self.prefetched_dataloader_future.result() + self._next_index += 1 + self.sub_dataloader = data + return data + else: + raise StopIteration() + + def prefetch_dataloader(self, idx, *args, **kwargs): + if self.pool is not None: + self.prefetched_dataloader_future = self.pool.submit( + self.next_dataloader, + idx = idx, *args, **kwargs + ) + diff --git a/nlp/language_model/bert_sample/pytorch/base/dataloaders/dataset.py b/nlp/language_model/bert_sample/pytorch/base/dataloaders/dataset.py new file mode 100644 index 000000000..a9b52f8e2 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/dataloaders/dataset.py @@ -0,0 +1,159 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. 
+ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import random + +import h5py +import numpy as np +import os + +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.utils.data import Dataset + + +class PretrainingDataset(Dataset): + def __init__(self, input_file, max_pred_length): + self.input_file = input_file + self.max_pred_length = max_pred_length + f = h5py.File(input_file, "r") + keys = ['input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', 'masked_lm_ids', + 'next_sentence_labels'] + self.inputs = [np.asarray(f[key][:]) for key in keys] + f.close() + + def __len__(self): + 'Denotes the total number of samples' + return len(self.inputs[0]) + + def __getitem__(self, index): + [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [ + torch.from_numpy(input[index].astype(np.int64)) if indice < 5 else torch.from_numpy( + np.asarray(input[index].astype(np.int64))) for indice, input in enumerate(self.inputs)] + + masked_lm_labels = torch.zeros(input_ids.shape, dtype=torch.long) + index = self.max_pred_length + masked_token_count = torch.count_nonzero(masked_lm_positions) + if masked_token_count != 0: + index = masked_token_count + masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index] + + return [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels] + + +def exchange_padding_fast(device, max_batch_size, input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels): + torch.cuda.nvtx.range_push('exchangepadding') + pad_size = max_batch_size - input_ids.shape[0] + if pad_size > 0: + input_ids = F.pad(input_ids, (0, 0, 0, pad_size)) + segment_ids = F.pad(segment_ids, (0, 0, 0, pad_size)) + input_mask = F.pad(input_mask, (0, 0, 0, pad_size)) + masked_lm_labels = F.pad(masked_lm_labels, (0, 0, 0, pad_size)) + next_sentence_labels = F.pad(next_sentence_labels, (0, pad_size)) + ngpus = torch.distributed.get_world_size() + nseqs = input_mask.shape[0] + ntokensperseq = input_mask.shape[1] + igpu = torch.distributed.get_rank() + + flattened_length_seq = nseqs * ntokensperseq + flattened_length_nsp = nseqs + + def get_local_packet_size(): + return 4 * flattened_length_seq + flattened_length_nsp + + # Storing tensors in same order as arguments + def encode_packet(input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels): + + packet = torch.zeros([get_local_packet_size()], device=device, dtype=torch.int16) + + curr_pos = 0 + + packet[curr_pos:curr_pos + flattened_length_seq] = input_ids.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos:curr_pos + flattened_length_seq] = segment_ids.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos:curr_pos + flattened_length_seq] = input_mask.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos:curr_pos + flattened_length_seq] = masked_lm_labels.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos:curr_pos + flattened_length_nsp] = 
next_sentence_labels.view(-1)[:] + + return packet + + def decode_packet(flat_packet): + packet = flat_packet.view(ngpus, get_local_packet_size()) + + curr_pos = 0 + + input_ids_ = packet[:, curr_pos:curr_pos + flattened_length_seq].contiguous().view(ngpus, nseqs, ntokensperseq) + curr_pos += flattened_length_seq + + segment_ids_ = packet[:, curr_pos:curr_pos + flattened_length_seq].contiguous().view(ngpus, nseqs, + ntokensperseq) + curr_pos += flattened_length_seq + + input_mask_ = packet[:, curr_pos:curr_pos + flattened_length_seq].contiguous().view(ngpus, nseqs, ntokensperseq) + curr_pos += flattened_length_seq + + masked_lm_labels_ = packet[:, curr_pos:curr_pos + flattened_length_seq].contiguous().view(ngpus, nseqs, + ntokensperseq) + curr_pos += flattened_length_seq + + next_sentence_labels_ = packet[:, curr_pos:curr_pos + flattened_length_nsp].contiguous().view(ngpus, nseqs) + + return input_ids_, segment_ids_, input_mask_, masked_lm_labels_, next_sentence_labels_ + + tensors = encode_packet(input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels) + + tensors_ = torch.zeros([ngpus, get_local_packet_size()], device=device, dtype=torch.float16) + tensors_ = list(torch.split(tensors_, 1)) + # Address valueError: ProcessGroupGloo::allgather: invalid tensor size at index 0 (expected (2049), got (1, 2049)) + torch.distributed.all_gather(tensors_, tensors.view(torch.float16).unsqueeze(0)) + + tensors_ = torch.stack(tensors_).view(torch.int16).long() + input_ids_, segment_ids_, input_mask_, masked_lm_labels_, next_sentence_labels_ = decode_packet(tensors_) + + seqlens_, indices = torch.sort(input_mask_.sum(dim=2).view(-1), descending=True) + + if pad_size > 0: + input_ids_sorted = input_ids_.view(ngpus * nseqs, ntokensperseq)[indices[:], :] + segment_ids_sorted = segment_ids_.view(ngpus * nseqs, ntokensperseq)[indices[:], :] + input_mask_sorted = input_mask_.view(ngpus * nseqs, ntokensperseq)[indices[:], :] + masked_lm_labels_sorted = masked_lm_labels_.view(ngpus * nseqs, ntokensperseq)[indices[:], :] + next_sentence_labels_sorted = next_sentence_labels_.view(ngpus * nseqs)[indices[:]] + # we need to remove the empty sequences we added to the batch + valid_idx = seqlens_.view(nseqs, ngpus)[:, igpu] > 0 + input_ids_sorted = input_ids_sorted.view(nseqs, ngpus, ntokensperseq)[valid_idx, igpu, :].contiguous() + segment_ids_sorted = segment_ids_sorted.view(nseqs, ngpus, ntokensperseq)[valid_idx, igpu, :].contiguous() + input_mask_sorted = input_mask_sorted.view(nseqs, ngpus, ntokensperseq)[valid_idx, igpu, :].contiguous() + masked_lm_labels_sorted = masked_lm_labels_sorted.view(nseqs, ngpus, ntokensperseq)[valid_idx, igpu, + :].contiguous() + next_sentence_labels_sorted = next_sentence_labels_sorted.view(nseqs, ngpus)[valid_idx, igpu].contiguous() + else: + indices_ = indices.view(nseqs, ngpus)[:, igpu] + input_ids_sorted = input_ids_.view(nseqs * ngpus, ntokensperseq)[indices_, :].contiguous() + segment_ids_sorted = segment_ids_.view(nseqs * ngpus, ntokensperseq)[indices_, :].contiguous() + input_mask_sorted = input_mask_.view(nseqs * ngpus, ntokensperseq)[indices_, :].contiguous() + masked_lm_labels_sorted = masked_lm_labels_.view(nseqs * ngpus, ntokensperseq)[indices_, :].contiguous() + next_sentence_labels_sorted = next_sentence_labels_.view(nseqs * ngpus)[indices_].contiguous() + + torch.cuda.nvtx.range_pop() + return input_ids_sorted, segment_ids_sorted, input_mask_sorted, masked_lm_labels_sorted, next_sentence_labels_sorted diff --git 
a/nlp/language_model/bert_sample/pytorch/base/model/__init__.py b/nlp/language_model/bert_sample/pytorch/base/model/__init__.py new file mode 100644 index 000000000..d1311dcd9 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/__init__.py @@ -0,0 +1,19 @@ +from model.models.modeling import BertForPretraining +from model.models.modeling import BertConfig, BertForPreTraining + + +def create_model(config): + config.resume_step = 0 + + bert_config = BertConfig.from_json_file(config.bert_config_path) + bert_config.fused_gelu_bias = config.fused_gelu_bias + bert_config.dense_seq_output = config.dense_seq_output + bert_config.fuse_dropout = config.enable_fuse_dropout + bert_config.fused_dropout_add = config.fused_dropout_add + + # Padding for divisibility by 8 + if bert_config.vocab_size % 8 != 0: + bert_config.vocab_size += 8 - (bert_config.vocab_size % 8) + + model = BertForPreTraining(bert_config) + return bert_config, model \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/model/layers/__init__.py b/nlp/language_model/bert_sample/pytorch/base/model/layers/__init__.py new file mode 100644 index 000000000..1a0702a49 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/layers/__init__.py @@ -0,0 +1,6 @@ +import torch + +from .activations import bias_gelu_impl + +__all__ = ["bias_gelu_impl"] + diff --git a/nlp/language_model/bert_sample/pytorch/base/model/layers/activations.py b/nlp/language_model/bert_sample/pytorch/base/model/layers/activations.py new file mode 100644 index 000000000..96b48ab66 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/layers/activations.py @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import torch +from torch import nn + +# Fused GeLU +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 + +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) +@torch.jit.script +def bias_gelu(bias, y): + x = bias + y + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. 
+ torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) +@torch.jit.script +def bias_gelu_back(g, bias, y): + x = bias + y + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff*g + +class GeLUFunction(torch.autograd.Function): + @staticmethod + # bias is an optional argument + def forward(ctx, input, bias): + ctx.save_for_backward(input, bias) + return bias_gelu(bias, input) + + @staticmethod + def backward(ctx, grad_output): + input, bias = ctx.saved_tensors + tmp = bias_gelu_back(grad_output, bias, input) + return tmp, tmp + +bias_gelu_impl = GeLUFunction.apply + +# Swish +def swish(x): + return x * torch.sigmoid(x) + +# Fast GeLU +def fast_gelu(x): + pi = 3.1415926535897932 + cdf = 0.5 * (1.0 + torch.tanh((math.sqrt(2 / pi) * (x + 0.044715 * torch.pow(x, 3))))) + return x * cdf + +ACT2FN = { + "gelu": fast_gelu, + "bias_gelu": bias_gelu_impl, + "relu": torch.nn.functional.relu, + "swish": swish +} + diff --git a/nlp/language_model/bert_sample/pytorch/base/model/layers/embeddings.py b/nlp/language_model/bert_sample/pytorch/base/model/layers/embeddings.py new file mode 100644 index 000000000..096f76c0f --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/layers/embeddings.py @@ -0,0 +1,59 @@ +# coding=utf-8 +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .layernorm import BertLayerNorm + +import torch +from torch import nn + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + position_ids = self.get_position_ids(input_ids) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def get_position_ids(self, input_ids): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + return position_ids diff --git a/nlp/language_model/bert_sample/pytorch/base/model/layers/layernorm.py b/nlp/language_model/bert_sample/pytorch/base/model/layers/layernorm.py new file mode 100644 index 000000000..936cc0f68 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/layers/layernorm.py @@ -0,0 +1,36 @@ +# coding=utf-8 +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +from torch import nn + +class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + diff --git a/nlp/language_model/bert_sample/pytorch/base/model/layers/padding.py b/nlp/language_model/bert_sample/pytorch/base/model/layers/padding.py new file mode 100644 index 000000000..a7aa84be0 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/layers/padding.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import math + +####################################################################################################################################################################### + +def unpad_input(out_, in_, indices): + out_[:,:] = in_[indices[:],:] + +def pad_input(out_, in_, indices): + out_[indices[:],:] = in_[:,:] + +def unpad_mask(out_, in_, indices): + out_[:] = in_.flatten()[indices[:]] + +####################################################################################################################################################################### + +def generate_mask(attention_mask, heads, pad=False, fuse_mask=True, unpad_fmha=False): + if unpad_fmha: + seqlen = attention_mask.sum(dim=1).to(dtype=torch.int32).flatten() + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + maxseqlen = seqlen.max().item() + b = attention_mask.shape[0] + cu_seqlens = torch.zeros(b+1, device=attention_mask.device, dtype=torch.int32) + cu_seqlens[1:] = torch.cumsum(seqlen, dim=0) + ntokens = cu_seqlens[-1].item() + return indices, attention_mask, seqlen, ntokens, cu_seqlens, seqlen, maxseqlen + + + seqlen = attention_mask.sum(dim=1).float().cpu() + if pad == False: + seqlen[:] = ((seqlen[:] + 16 - 1) / 16).floor()*16 + seqlen[seqlen < 16] = 16 + seqlen = seqlen.int() + ntokens = seqlen.sum().item() + else: + batch = attention_mask.shape[0] + maxseqlen = attention_mask.shape[1] + seqlen.fill_(maxseqlen) + seqlen = seqlen.int() + ntokens = batch * maxseqlen + + padded_mask = attention_mask.clone() + for i in range(len(seqlen)): + padded_mask[i,:seqlen[i]] = 1 + indices = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten() + + if pad==False and fuse_mask == True: + mask = torch.zeros([ntokens], device="cuda", dtype=torch.float16) + unpad_mask(mask, attention_mask, indices) + mask = (1 - mask) * -10000.0 + elif pad==False and fuse_mask == False: + padded_mask = (padded_mask.unsqueeze(1) * padded_mask.unsqueeze(2)).unsqueeze(1).half().repeat(1, heads, 1, 1) + indices_mask = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten() + mask = torch.zeros([len(indices_mask)], device="cuda", dtype=torch.float16) + unpad_mask(mask, padded_mask, indices_mask) + mask = (1 - mask) * -10000.0 + elif pad==True and fuse_mask == True: + mask = -10000.0 * (1 - attention_mask).half().view(-1) + elif pad==True and fuse_mask == False: + mask = -10000.0 * (1 - (attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2))).unsqueeze(1).half().repeat(1, heads, 1, 1).view(-1) + + return indices, mask, seqlen, ntokens, None, None, None + +####################################################################################################################################################################### + +class PadInput(torch.autograd.Function): + @staticmethod + def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens): + ctx.save_for_backward(indices) + ctx.hidden = hidden + ctx.ntokens = ntokens + ntokens = 
batch*maxseqlen + + output = torch.zeros([ntokens,hidden], device="cuda", dtype=torch.float16) + pad_input(output, input, indices) + + return output[:ntokens] + + @staticmethod + def backward(ctx, grad_output): + indices, = ctx.saved_tensors + + grad_input = torch.zeros([ctx.ntokens,ctx.hidden], device="cuda", dtype=torch.float16) + unpad_input(grad_input, grad_output, indices) + + return grad_input[:ctx.ntokens], None, None, None, None, None + +####################################################################################################################################################################### + +class UnpadInput(torch.autograd.Function): + @staticmethod + def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens): + ctx.save_for_backward(indices) + ctx.hidden = hidden + ctx.ntokens = batch*maxseqlen + + output = torch.zeros([ntokens, hidden], device="cuda", dtype=torch.float16) + unpad_input(output, input, indices) + + return output[:ntokens] + + @staticmethod + def backward(ctx, grad_output): + indices, = ctx.saved_tensors + + grad_input = torch.zeros([ctx.ntokens,ctx.hidden], device="cuda", dtype=torch.float16) + pad_input(grad_input, grad_output, indices) + + return grad_input[:ctx.ntokens], None, None, None, None, None + +####################################################################################################################################################################### diff --git a/nlp/language_model/bert_sample/pytorch/base/model/losses/__init__.py b/nlp/language_model/bert_sample/pytorch/base/model/losses/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nlp/language_model/bert_sample/pytorch/base/model/models/__init__.py b/nlp/language_model/bert_sample/pytorch/base/model/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nlp/language_model/bert_sample/pytorch/base/model/models/modeling.py b/nlp/language_model/bert_sample/pytorch/base/model/models/modeling.py new file mode 100644 index 000000000..d7c4e1886 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/model/models/modeling.py @@ -0,0 +1,1394 @@ +# coding=utf-8 +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
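
The `PadInput`/`UnpadInput` autograd functions in `padding.py` above hard-code CUDA fp16 buffers, but the bookkeeping they wrap is just a gather/scatter over the flattened positions of real (non-padding) tokens selected from the attention mask. Below is a minimal CPU sketch of that round trip under assumed toy shapes; it copies only the simple `unpad_input`/`pad_input` helpers from the file and does not exercise the fused-MHA branches of `generate_mask`:

```python
import torch

def unpad_input(out_, in_, indices):
    # Gather the rows of real tokens into a packed buffer (copied from padding.py).
    out_[:, :] = in_[indices[:], :]

def pad_input(out_, in_, indices):
    # Scatter packed rows back to their original padded positions.
    out_[indices[:], :] = in_[:, :]

batch, seqlen, hidden = 2, 4, 8
# 1 = real token, 0 = padding (the second sequence has two padded positions).
attention_mask = torch.tensor([[1, 1, 1, 1],
                               [1, 1, 0, 0]])
hidden_states = torch.randn(batch * seqlen, hidden)

# Flattened indices of the real tokens, as generate_mask() computes them.
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
ntokens = indices.numel()                      # 6 real tokens

packed = torch.zeros(ntokens, hidden)
unpad_input(packed, hidden_states, indices)    # drop the padding rows

restored = torch.zeros(batch * seqlen, hidden)
pad_input(restored, packed, indices)           # padding rows stay zero

assert torch.equal(packed, hidden_states[indices])
assert torch.equal(restored[indices], hidden_states[indices])
```
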
+"""PyTorch BERT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import copy +import json +import logging +import math +import os +import sys +from functools import reduce +from io import open +from operator import mul + +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.nn import LayerNorm as BertLayerNorm + +from model.layers.activations import ACT2FN +from model.layers.embeddings import BertEmbeddings +from utils import get_rank + + +logger = logging.getLogger(__name__) + +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) +torch._C._jit_override_can_fuse_on_cpu(True) +torch._C._jit_override_can_fuse_on_gpu(True) + + +def remap_attn_names_tf(name): + if 'attention' in name: + ind = name.index("attention") + if 'self' in name and 'query' in name and 'kernel' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'q_weight'] + if 'self' in name and 'query' in name and 'bias' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'q_bias'] + if 'self' in name and 'key' in name and 'kernel' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'k_weight'] + if 'self' in name and 'key' in name and 'bias' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'k_bias'] + if 'self' in name and 'value' in name and 'kernel' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'v_weight'] + if 'self' in name and 'value' in name and 'bias' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'v_bias'] + if 'output' in name and 'dense' in name and 'kernel' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'out_proj_weight'] + if 'output' in name and 'dense' in name and 'bias' in name: + name = name[:(ind + 1)] + ['multi_head_attention', 'out_proj_bias'] + if 'output' in name and 'LayerNorm' in name: + name = name[:(ind + 1)] + ['layer_norm'] + name[-1:] + return name + + +def load_tf_weights_in_bert(model, tf_checkpoint_path, use_fast_mha=False): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + if get_rank() == 0: + print("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + if get_rank() == 0: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + # MHA params need to be treated separately + if use_fast_mha: + mha_params = ['q_weight', 'q_bias', 'k_weight', 'k_bias', 'v_weight', 'v_bias', 'out_proj_weight', + 'out_proj_bias'] + else: + mha_params = [] + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step", "LAMB", "LAMB_1", "beta1_power", "beta2_power"] for n in name): + if get_rank() == 0: + print("Skipping {}".format("/".join(name))) + continue + + if use_fast_mha: + name = remap_attn_names_tf(name) + + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] in mha_params: + pointer = getattr(pointer, l[0]) + elif l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel' or (m_name in mha_params and 'bias' not in m_name): + array = np.ascontiguousarray(np.transpose(array)) + + try: + assert pointer.shape == array.shape + except AssertionError as e: + # If copying smaller into larger, assume padded and ok + if reduce(mul, pointer.shape) > reduce(mul, array.shape): + if get_rank() == 0: + print("Initialize padded PyTorch weight {}".format(name)) + pointer.data.zero_() + + def generate_slices(): + slices = [] + for i in range(array.ndim): + slices.append(slice(0, array.shape[i], 1)) + return slices + + # pointer.data[generate_slices()] = torch.from_numpy(array) + pointer.data[generate_slices()] = torch.from_numpy(array) + else: + e.args += (pointer.shape, array.shape) + raise + else: + if get_rank() == 0: + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +@torch.jit.script +def jit_dropout_add(x, residual, prob, is_training): + # type: (Tensor, Tensor, float, bool) -> Tensor + # out = F.dropout(x, p=prob, training=is_training) + out = torch.nn.functional.dropout(x, p=prob, training=is_training) + out = residual + out + return out + + +@torch.jit.script +def jit_bias_dropout_add(x, bias, residual, prob, is_training): + # type: (Tensor, Tensor, Tensor, float, bool) -> Tensor + # out = F.dropout(x, p=prob, training=is_training) + out = torch.nn.functional.dropout(x + bias, p=prob, training=is_training) + out = residual + out + return out + + +class LinearActivation(torch.nn.Linear): + r"""Fused Linear and activation Module. 
+ """ + __constants__ = ['bias'] + + def __init__(self, in_features, out_features, act='gelu', bias=True): + super(LinearActivation, self).__init__(in_features, out_features, bias) + self.act_fn = nn.Identity() # + self.biased_act_fn = None # + if isinstance(act, str) or (sys.version_info[0] == 2 and isinstance(act, unicode)): # For TorchScript + if bias and not 'bias' in act: # compatibility + act = 'bias_' + act # + self.biased_act_fn = ACT2FN[act] # + + else: + self.act_fn = ACT2FN[act] + else: + self.act_fn = act + + def forward(self, input): + if not self.bias is None: + return self.biased_act_fn(self.bias, nn.functional.linear(input, self.weight, None)) + else: + return self.act_fn(F.linear(input, self.weight, self.bias)) + + +class LinearDropoutAdd(torch.nn.Linear): + r"""Fused Linear and activation Module. + """ + __constants__ = ['bias'] + + def __init__(self, in_features, out_features, bias=True, p=0.1): + super(LinearDropoutAdd, self).__init__(in_features, out_features, bias) + self.p = p + + def forward(self, input, residual): + linear_out = nn.functional.linear(input, self.weight, None) + if self.bias is None: + result = jit_dropout_add(linear_out, residual, self.p, is_training=self.training) + else: + result = jit_bias_dropout_add(linear_out, self.bias.expand_as(residual), residual, self.p, + is_training=self.training) + return result + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.softmax = nn.Softmax(dim=-1) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def transpose_key_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 3, 1) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_key_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to 
get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(2) + # attention_scores = attention_scores - (1.0 - attention_mask.unsqueeze(1).unsqueeze(2).float()) * 10000.0 + + # Normalize the attention scores to probabilities. + attention_probs = self.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # context_layer = torch.einsum("bnft,btnh->bfnh", attention_probs, mixed_value_layer.view(1,512,16,64)) + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + # return context_layer.reshape(context_layer.shape[:2] + (self.all_head_size,)) + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask, *args, **kwargs): + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.fused_gelu_bias = config.fused_gelu_bias + if config.fused_gelu_bias: + self.dense = LinearActivation(config.hidden_size, config.intermediate_size, act=config.hidden_act) + else: + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str) or ( + sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + if not self.fused_gelu_bias: + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + if not config.fused_dropout_add: + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + else: + self.dense = LinearDropoutAdd(config.intermediate_size, config.hidden_size, bias=True, + p=config.hidden_dropout_prob) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.p = config.hidden_dropout_prob + self.fused_dropout_add = config.fused_dropout_add + + def forward(self, hidden_states, input_tensor): + if 
self.fused_dropout_add: + hidden_states = self.dense(hidden_states, input_tensor) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + else: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, seqlen=None, batch=None): + attention_output = self.attention(hidden_states, attention_mask, seqlen, batch) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + layer = BertLayer(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): + all_encoder_layers = [] + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
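+        # Note: `self.decoder.weight` below is assigned the embedding matrix itself, so the projection
+        # back to vocabulary logits shares storage with `bert.embeddings.word_embeddings.weight`;
+        # only `self.bias` adds new parameters.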
+ self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + self.dense_seq_output = config.dense_seq_output + + def forward(self, sequence_output, pooled_output, masked_lm_labels): + if self.dense_seq_output: + batch_size = sequence_output.shape[0] + seq_len = sequence_output.shape[1] + hidden_dim = sequence_output.shape[2] + sequence_flattened = torch.index_select(sequence_output.view(-1, sequence_output.shape[-1]), 0, + torch.nonzero(masked_lm_labels.view(-1) != 0, + as_tuple=False).squeeze()) + sequence_output = sequence_flattened + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(BertPreTrainedModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + # we want to make sure vocab size is padded to % 8 == 0 + if self.config.vocab_size % 8 != 0: + self.config.vocab_size += 8 - (self.config.vocab_size % 8) + if get_rank == 0: + print(f'Padded vocab_size to : {self.config.vocab_size}') + + def init_bert_weights(self, module): + """ Initialize the weights. 
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_checkpoint, state_dict=None, cache_dir=None, + from_tf=False, config=None, *inputs, **kwargs): + """ + Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + It is used preprocessing stage. + + Params: + pretrained_model_name_or_path: either: + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPretraining instance + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `model.chkpt` a TensorFlow checkpoint + from_tf: should we load the weights from a locally saved TensorFlow checkpoint + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + logger.info("loading archive file {}".format(pretrained_checkpoint)) + assert config, "BERT configuration file must be provided to from_pretraining()" + logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None and not from_tf: + state_dict = torch.load(pretrained_checkpoint, + map_location='cpu' if not torch.cuda.is_available() else None) + if from_tf: + # Directly load from a TensorFlow checkpoint + return load_tf_weights_in_bert(model, pretrained_checkpoint, use_fast_mha=config.fused_mha) + # Load from a PyTorch state_dict + old_keys = [] + new_keys = [] + # print(f'loading keys: {state_dict.keys()}') + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + start_prefix = '' + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + start_prefix = 'bert.' 
+ load(model, prefix=start_prefix) + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + return model + + +class BertModel(BertPreTrainedModel): + """BERT model ("Bidirectional Embedding Representations from a Transformer"). + + Params: + config: a BertConfig class instance with the configuration to build a new model + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`. + + Outputs: Tuple of (encoded_layers, pooled_output) + `encoded_layers`: controled by `output_all_encoded_layers` argument: + - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end + of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], + - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding + to the last attention block of shape [batch_size, sequence_length, hidden_size], + `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a + classifier pretrained on top of the hidden state associated to the first character of the + input (`CLS`) to train on the Next-Sentence task (see BERT's paper). 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = modeling.BertModel(config=config) + all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + self.unpad = False + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, + checkpoint_activations=False): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask # .unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + if self.unpad == False: + extended_attention_mask = extended_attention_mask.to( + dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers, + checkpoint_activations=checkpoint_activations) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class BertForPretraining(BertPreTrainedModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. 
It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `masked_lm_labels` and `next_sentence_label` are not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `masked_lm_labels` or `next_sentence_label` is `None`: + Outputs a tuple comprising + - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and + - the next sentence classification logits of shape [batch_size, 2]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForPretraining(config) + masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForPretraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + self.dense_seq_output = config.dense_seq_output + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None, checkpoint_activations=False): + sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + # if dense_seq_output, prediction scores returned by this function is already masked out with masked_lm_labels, and first dimension is flattened + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_lm_labels) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=0) + + masked_lm_labels_flat = masked_lm_labels.view(-1) + + if self.dense_seq_output: + masked_lm_labels_dense = masked_lm_labels_flat[masked_lm_labels_flat != 0] + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels_dense) + else: + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + nsp_loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = nsp_loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + # print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) + total_loss = masked_lm_loss + next_sentence_loss + total_loss = total_loss.float() + + # Masked Language Model Accuracy + mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != 0] + if not 
self.dense_seq_output: + prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1]) + mlm_predictions_scores = prediction_scores_flat[masked_lm_labels_flat != 0] + mlm_predictions = mlm_predictions_scores.argmax(dim=-1) + else: + mlm_predictions = prediction_scores.argmax(dim=-1) + + mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float) / mlm_labels.numel() + + return total_loss, mlm_acc, mlm_labels.numel() + else: # TODO: Handle this path for dense sequence output as well + return prediction_scores, seq_relationship_score + + +class BertForPreTrainingModelOnly(nn.Module): + def __init__(self, config): + super(BertForPreTrainingModelOnly, self).__init__() + self.bert = BertModel(config) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, checkpoint_activations=False): + sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False, + checkpoint_activations=checkpoint_activations) + return sequence_output, pooled_output + + +class BertForPreTrainingHeadsOnly(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertForPreTrainingHeadsOnly, self).__init__() + self.cls = BertPreTrainingHeads(config, bert_model_embedding_weights) + self.dense_seq_output = config.dense_seq_output + self.config = config + + def forward(self, sequence_output, pooled_output, masked_lm_labels=None, next_sentence_label=None): + # if dense_seq_output, prediction scores returned by this function is already masked out with masked_lm_labels, and first dimension is flattened + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_lm_labels) + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=0) + masked_lm_labels_flat = masked_lm_labels.view(-1) + if self.dense_seq_output: + masked_lm_labels_dense = masked_lm_labels_flat[masked_lm_labels_flat != 0] + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels_dense) + else: + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + nsp_loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = nsp_loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + if torch.any(torch.isnan(masked_lm_loss)) or torch.any(torch.isnan(next_sentence_loss)): + print("masked_lm_loss or next_sentence_loss exploded (corrupted by Inf or Nan)") + print(masked_lm_loss, next_sentence_loss) + # return None + sys.exit(1) + # print("loss is {} {}".format(masked_lm_loss, next_sentence_loss)) + total_loss = (masked_lm_loss + next_sentence_loss).float() + # print("masked",masked_lm_loss) + # print("nsp",next_sentence_loss) + # total_loss=total_loss + # Masked Language Model Accuracy + valid_mask = (masked_lm_labels_flat != 0).int() + num_valid = valid_mask.sum(dtype=torch.int64) + + # mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != 0] + + if not self.dense_seq_output: + mlm_labels = masked_lm_labels_flat + prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1]) + mlm_predictions_scores = prediction_scores_flat + mlm_predictions = mlm_predictions_scores.argmax(dim=-1) + mlm_acc = ((mlm_predictions == mlm_labels) * valid_mask).sum(dtype=torch.float) / num_valid + + + # prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1]) + # mlm_predictions_scores = 
prediction_scores_flat[masked_lm_labels_flat != 0] + # mlm_predictions = mlm_predictions_scores.argmax(dim=-1) + else: + # mlm_predictions = prediction_scores.argmax(dim=-1).view(-1) + mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != 0] + mlm_predictions = prediction_scores.argmax(dim=-1).view(-1) + mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float) / num_valid + return total_loss, mlm_acc, num_valid + # mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float) / mlm_labels.numel() + # print((mlm_predictions == mlm_labels).sum(dtype=torch.float),num_valid,mlm_acc) + # return total_loss, mlm_acc, mlm_labels.numel() + + else: # TODO: Handle this path for dense sequence output as well + return prediction_scores, seq_relationship_score + + +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config): + super(BertForPreTraining, self).__init__(config) + self.bert_model_segment = BertForPreTrainingModelOnly(config) + self.heads_only_segment = BertForPreTrainingHeadsOnly(config, + self.bert_model_segment.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + next_sentence_label=None, checkpoint_activations=False): + assert checkpoint_activations == False, "cuda-graphs: reattach passing of checkpoint_activations when function.py starts handling non-Tensor inputs" + sequence_output, pooled_output = self.bert_model_segment(input_ids, token_type_ids, attention_mask) + return self.heads_only_segment(sequence_output, pooled_output, masked_lm_labels, next_sentence_label) + + +class BertForMaskedLM(BertPreTrainedModel): + """BERT model with the masked language modeling head. + This module comprises the BERT model followed by the masked language modeling head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss + is only computed for the labels set in [0, ..., vocab_size] + + Outputs: + if `masked_lm_labels` is not `None`: + Outputs the masked language modeling loss. + if `masked_lm_labels` is `None`: + Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForMaskedLM(config) + masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, + checkpoint_activations=False): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + # loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss(ignore_index=0) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + return masked_lm_loss + else: + return prediction_scores + + +class BertForNextSentencePrediction(BertPreTrainedModel): + """BERT model with next sentence prediction head. + This module comprises the BERT model followed by the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] + with indices selected in [0, 1]. + 0 => next sentence is the continuation, 1 => next sentence is a random sentence. + + Outputs: + if `next_sentence_label` is not `None`: + Outputs the total_loss which is the sum of the masked language modeling loss and the next + sentence classification loss. + if `next_sentence_label` is `None`: + Outputs the next sentence classification logits of shape [batch_size, 2]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForNextSentencePrediction(config) + seq_relationship_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, + checkpoint_activations=False): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + # loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss(ignore_index=0) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(BertPreTrainedModel): + """BERT model for classification. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForSequenceClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForMultipleChoice(BertPreTrainedModel): + """BERT model for multiple choice tasks. + This module is composed of the BERT model with a linear layer on top of + the pooled output. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_choices`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] + with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` + and type 1 corresponds to a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) + input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) + token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_choices = 2 + + model = BertForMultipleChoice(config, num_choices) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_choices): + super(BertForMultipleChoice, self).__init__(config) + self.num_choices = num_choices + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) + _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, + output_all_encoded_layers=False) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, self.num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + else: + return reshaped_logits + + +class BertForTokenClassification(BertPreTrainedModel): + """BERT model for token-level classification. + This module is composed of the BERT model with a linear layer on top of + the full hidden state of the last layer. + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + `num_labels`: the number of classes for the classifier. Default = 2. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] + with indices selected in [0, ..., num_labels]. + + Outputs: + if `labels` is not `None`: + Outputs the CrossEntropy classification loss of the output with the labels. + if `labels` is `None`: + Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. 
+ + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + num_labels = 2 + + model = BertForTokenClassification(config, num_labels) + logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config, num_labels): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, checkpoint_activations=False): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForQuestionAnswering(BertPreTrainedModel): + """BERT model for Question Answering (span extraction). + This module is composed of the BERT model with a linear layer on top of + the sequence output that computes start_logits and end_logits + + Params: + `config`: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max + input sequence length in the current batch. It's the mask that we typically use for attention when + a batch has varying length sentences. + `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. + Positions are clamped to the length of the sequence and position outside of the sequence are not taken + into account for computing the loss. + + Outputs: + if `start_positions` and `end_positions` are not `None`: + Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. 
+ if `start_positions` or `end_positions` is `None`: + Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end + position tokens of shape [batch_size, sequence_length]. + + Example usage: + ```python + # Already been converted into WordPiece token ids + input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) + input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) + token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) + + config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, + num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + + model = BertForQuestionAnswering(config) + start_logits, end_logits = model(input_ids, token_type_ids, input_mask) + ``` + """ + + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None, + checkpoint_activations=False): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits diff --git a/nlp/language_model/bert_sample/pytorch/base/optimizers/__init__.py b/nlp/language_model/bert_sample/pytorch/base/optimizers/__init__.py new file mode 100644 index 000000000..8ff379afc --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/optimizers/__init__.py @@ -0,0 +1 @@ +from .factory import create_optimizer \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/optimizers/factory.py b/nlp/language_model/bert_sample/pytorch/base/optimizers/factory.py new file mode 100644 index 000000000..5d9a33fda --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/optimizers/factory.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. 
No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+
+
+from torch.optim import AdamW
+
+from .lamb import Lamb
+
+
+def create_optimizer(name: str, params, config):
+    name = name.lower()
+
+    if name == "lamb":
+        return Lamb(
+            params, lr=config.learning_rate,
+            betas=(config.opt_lamb_beta_1, config.opt_lamb_beta_2), eps=1e-6,
+            weight_decay=config.weight_decay_rate, adam=False
+        )
+
+    if name == "adamw":
+        return AdamW(
+            params, lr=config.learning_rate,
+            betas=(config.opt_lamb_beta_1, config.opt_lamb_beta_2),
+            weight_decay=config.weight_decay_rate
+        )
+
+    raise RuntimeError(f"Optimizer {name} not found.")
diff --git a/nlp/language_model/bert_sample/pytorch/base/optimizers/lamb.py b/nlp/language_model/bert_sample/pytorch/base/optimizers/lamb.py
new file mode 100644
index 000000000..287deb088
--- /dev/null
+++ b/nlp/language_model/bert_sample/pytorch/base/optimizers/lamb.py
@@ -0,0 +1,102 @@
+import torch
+from torch.optim import Optimizer
+
+
+class Lamb(Optimizer):
+    r"""Implements Lamb algorithm.
+    It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
+    Arguments:
+        params (iterable): iterable of parameters to optimize or dicts defining
+            parameter groups
+        lr (float, optional): learning rate (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability (default: 1e-6)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        adam (bool, optional): always use trust ratio = 1, which turns this into
+            Adam. Useful for comparison purposes.
+    .. _Large Batch Optimization for Deep Learning: Training BERT in 76 minutes:
+        https://arxiv.org/abs/1904.00962
+    """
+
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
+                 weight_decay=0, adam=False):
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay)
+        self.adam = adam
+        super(Lamb, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instead.')
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+
+                # Decay the first and second moment running average coefficient
+                # m_t
+                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
+                # v_t
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
+
+                # Paper v3 does not use debiasing.
+                # bias_correction1 = 1 - beta1 ** state['step']
+                # bias_correction2 = 1 - beta2 ** state['step']
+                # Apply bias to lr to avoid broadcast.
+                step_size = group['lr']  # * math.sqrt(bias_correction2) / bias_correction1
+
+                weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)
+
+                adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
+                if group['weight_decay'] != 0:
+                    adam_step.add_(p.data, alpha=group['weight_decay'])
+
+                adam_norm = adam_step.pow(2).sum().sqrt()
+                if weight_norm == 0 or adam_norm == 0:
+                    trust_ratio = 1
+                else:
+                    trust_ratio = weight_norm / adam_norm
+                state['weight_norm'] = weight_norm
+                state['adam_norm'] = adam_norm
+                state['trust_ratio'] = trust_ratio
+                if self.adam:
+                    trust_ratio = 1
+
+                p.data.add_(adam_step, alpha=-step_size * trust_ratio)
+
+        return loss
\ No newline at end of file
diff --git a/nlp/language_model/bert_sample/pytorch/base/prepare.py b/nlp/language_model/bert_sample/pytorch/base/prepare.py
new file mode 100644
index 000000000..28fa7d254
--- /dev/null
+++ b/nlp/language_model/bert_sample/pytorch/base/prepare.py
@@ -0,0 +1,288 @@
+# Copyright (c) 2022 Iluvatar CoreX. All rights reserved.
+# Copyright Declaration: This software, including all of its code and documentation,
+# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX
+# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright
+# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar
+# CoreX. No user of this software shall have any right, ownership or interest in this software and
+# any use of this software shall be in compliance with the terms and conditions of the End User
+# License Agreement.
+ + +import os +import subprocess +from typing import List, Optional, Union +import os.path as ospath +from argparse import ArgumentParser, REMAINDER +from functools import partial, wraps +from typing import NamedTuple + + +# ========================================================= +# Define arguments +# ========================================================= + +def parse_args(): + parser = ArgumentParser("Prepare") + parser.add_argument("--name", type=str, default=None, help="The name of submitter") + parser.add_argument("--data_dir", type=str, default=None, help="Data direction") + # WARN: Don't delete this argument + parser.add_argument('other_args', nargs=REMAINDER) + args = parser.parse_args() + return args + + +# ========================================================= +# Constants +# ========================================================= +args = parse_args() + +APT = "apt" +PIP_INSTALL = "pip3 install " +PYTHON = "python3" +DOWNLOAD = "wget" + +APT_PKGS = ["numactl"] +SUPPORTED_WHEELS = ["torch", "apex"] + + +MODEL_DIR = ospath.abspath( + ospath.join( + __file__, + "../../" + ) +) +CURRENT_MODEL_NAME = ospath.basename(MODEL_DIR) +PROJ_DIR = ospath.abspath( + ospath.join( + MODEL_DIR, + "../../" + ) +) + +PACKAGE_DIR_NAME = "packages" +SOURCE_DIR_NAME = "csrc" +SDK_DIR_NAME = "sdk_installers" +PACKAGE_LIST_NAME = "files.txt" +SDK_LIST_NAME = "files.txt" + +DATA_SHARDS = 2048 +TRAIN_DIR_NAME = f"{DATA_SHARDS}_shards_uncompressed" +CONVERTED_CHECKPOINT_NAME = "model.ckpt-28252.pt" + +SUBMITTER = args.name +DATA_DIR = args.data_dir +SUBMITTER_DIR = ospath.join(PROJ_DIR, SUBMITTER, CURRENT_MODEL_NAME) +TRAIN_DATA_DIR = ospath.join(DATA_DIR, TRAIN_DIR_NAME) + +EXTENSION_SOURCE_DIR_ENV = "EXTENSION_SOURCE_DIR" +SDK_ARGUMENTS_ENV = "SDK_ARGUMENTS" + + +# ========================================================= +# Helpers +# ========================================================= + +class ShellResult(NamedTuple): + + returncode: int + result: Union[subprocess.CompletedProcess, str] = None + + +def _exec_cmd(cmd: Union[str, List], *args, **kwargs): + args_str = " ".join(args) + args_str += " ".join([f"{name}={value}" for name, value in kwargs.items()]) + cmd_str = cmd + if isinstance(cmd, (tuple, list)): + cmd_str = " ".join(cmd) + print(f"Commands: {cmd_str}") + + result = subprocess.run(cmd, *args, **kwargs, stdout=None, stderr=subprocess.STDOUT) + + if result.returncode > 0: + msg = f"ERROR: {cmd} {args_str}" + return ShellResult(returncode=result.returncode, result=msg) + + return ShellResult(returncode=result.returncode, result=result) + + +def exec_shell_cmd(cmd: str, *args, **kwargs): + return _exec_cmd(cmd, shell=True, *args, **kwargs) + + +def exec_shell_cmds(cmds: List[str], *args, **kwargs): + cmds = "\n".join(cmds) + return exec_shell_cmd(cmds, *args, **kwargs) + + +def warning(*args, **kwargs): + print("WARN:", *args, **kwargs) + + +def find_file_by_match(dir, file_patterns): + if ospath.exists(dir): + dir_files = os.listdir(dir) + else: + return file_patterns + + for i, pattern in enumerate(file_patterns): + pattern = pattern.strip() + if len(pattern) <= 1 or not pattern.endswith("*"): + continue + + pattern = pattern[:-1] + + for dir_file in dir_files: + if dir_file.startswith(pattern): + file_patterns[i] = dir_file + break + return file_patterns + +# ========================================================= +# Pipelines +# ========================================================= + +def install_apt_packages(): + return exec_shell_cmd(f"{APT} install -y {' 
'.join(APT_PKGS)}") + + +def prepare_data(): + checked_files = ["2048_shards_uncompressed", "bert_config.json", "eval_set_uncompressed", "model.ckpt-28252.pt"] + path_join = ospath.join + exist_preprocessed_data = all([ospath.exists(path_join(DATA_DIR, name)) for name in checked_files]) + + if exist_preprocessed_data: + return + + # Check last download file + origin_data = path_join(DATA_DIR, "phase1", "model.ckpt-28252.meta") + need_download_dataset = "-s" if ospath.exists(origin_data) else "" + cmds = [ + f"cd {path_join(MODEL_DIR, 'pytorch/data_preprocessing')}", + f"bash prepare_data.sh -o {DATA_DIR} -p {DATA_SHARDS} {need_download_dataset}" + ] + return exec_shell_cmds(cmds) + + +def install_sdk(): + def get_sdk_args(): + sdk_args = dict() + if SDK_ARGUMENTS_ENV in os.environ: + sdk_args_str = os.environ[SDK_ARGUMENTS_ENV] + + sdk_args_segments = sdk_args_str.split(';') + for sdk_arg in sdk_args_segments: + sdk, arg = sdk_arg.split('=', maxsplit=1) + sdk_args[sdk] = arg + return sdk_args + + sdk_args_dict = get_sdk_args() + print("SDK Arguments:", sdk_args_dict) + + sdk_installer_dir = ospath.join(SUBMITTER_DIR, SDK_DIR_NAME) + if not ospath.exists(sdk_installer_dir): + sdk_installer_dir = ospath.join(PROJ_DIR, SUBMITTER, SDK_DIR_NAME) + if not ospath.exists(sdk_installer_dir): + warning("Not found sdk\'s dir, skip run installer") + return + + # Find sdk installers + sdk_list_file = ospath.join(sdk_installer_dir, SDK_LIST_NAME) + if ospath.exists(sdk_list_file): + with open(sdk_list_file) as f: + sdk_installers = f.readlines() + sdk_installers_pattern = [sdk.strip() for sdk in sdk_installers] + sdk_installers = find_file_by_match(sdk_installer_dir, sdk_installers_pattern) + else: + sdk_installers = os.listdir(sdk_installer_dir) + sdk_installers.sort() + + sdk_installers_cmds = [] + for sdk in sdk_installers: + if sdk.endswith(".run"): + sdk_arg = "" + for sdk_args_key in sdk_args_dict: + if sdk.startswith(sdk_args_key): + sdk_arg = sdk_args_dict[sdk_args_key] + sdk_installers_cmds.append("sh " + ospath.join(sdk_installer_dir, sdk) + f" {sdk_arg}") + + if len(sdk_installers_cmds) == 0: + warning("Not found installer in", sdk_installer_dir, ", skip run installer") + return + + return exec_shell_cmds(sdk_installers_cmds) + + +def install_requirements(): + return exec_shell_cmd( + f"{PIP_INSTALL} -r requirements.txt" + ) + + +def install_wheel_pkgs(filter_packages: bool=False): + wheel_dir = ospath.join(SUBMITTER_DIR, PACKAGE_DIR_NAME) + if not ospath.exists(wheel_dir): + warning("Not found package\'s dir, skip install wheel package") + return + + # Find packages + package_list_file = ospath.join(wheel_dir, PACKAGE_LIST_NAME) + if ospath.exists(package_list_file): + with open(package_list_file) as f: + packages = f.readlines() + packages_pattern = [pkg.strip() for pkg in packages] + packages = find_file_by_match(wheel_dir, packages_pattern) + else: + packages = os.listdir(wheel_dir) + packages.sort() + + def _filter_packages(name: str): + for support_pkg in SUPPORTED_WHEELS: + if name.startswith(support_pkg): + return True + return False + + if filter_packages: + packages = list(filter(_filter_packages, packages)) + + if len(packages) == 0: + warning("Not found wheel packages in", wheel_dir) + return + + install_packages_cmds = [f"{PIP_INSTALL} {ospath.join(wheel_dir, pkg)}" for pkg in packages] + return exec_shell_cmds(install_packages_cmds) + + +def install_extensions(): + source_dir = ospath.join(SUBMITTER_DIR, SOURCE_DIR_NAME) + if not ospath.exists(source_dir): + warning("Not found 
source dir:", source_dir) + return + return exec_shell_cmd( + f"{EXTENSION_SOURCE_DIR_ENV}={source_dir} {PYTHON} setup.py build && cp build/lib*/ext_ops* ./" + ) + + +def pipelines(): + return [ + # TODO: Uncomment + install_apt_packages, + install_requirements, + install_sdk, + partial(install_wheel_pkgs, filter_packages=True), + install_extensions, + # prepare_data, + ] + + +if __name__ == '__main__': + for pipeline in pipelines(): + result = pipeline() + if result is not None and result.returncode > 0: + print(result.result) + print("Fail:", pipeline) + exit(result.returncode) + + + + + diff --git a/nlp/language_model/bert_sample/pytorch/base/requirements.txt b/nlp/language_model/bert_sample/pytorch/base/requirements.txt new file mode 100644 index 000000000..1dd53e662 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/requirements.txt @@ -0,0 +1,3 @@ +# progress bars in model download and training scripts +h5py +psutil \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/run_pretraining.py b/nlp/language_model/bert_sample/pytorch/base/run_pretraining.py new file mode 100644 index 000000000..cb964a0ee --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/run_pretraining.py @@ -0,0 +1,173 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+ + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +from copy import copy +import os +import random +import time +from concurrent.futures import ProcessPoolExecutor + +import numpy as np +import torch +from torch.cuda.amp import GradScaler + +import utils +from dataloaders import WorkerInitializer +from dataloaders.dataloader import PretrainingDataloaders +from train.evaluator import Evaluator +from train.trainer import Trainer +from train.training_state import TrainingState +from train.event import TrainingEventCompose, TrainingLogger + + +logger = None + + +def main(): + import config + parser = argparse.ArgumentParser("Bert") + config.activate_config_env(parser=parser, with_config_env_name=True) + + if config.use_env and 'LOCAL_RANK' in os.environ: + config.local_rank = int(os.environ['LOCAL_RANK']) + + device, num_gpus = utils.init_dist_training_env(config) + config.device = device + config.n_gpu = num_gpus + + utils.check_config(config) + + try: + from dltest import show_training_arguments + show_training_arguments(config) + except: + pass + + interface = config.training_event(config) + events = [ + TrainingLogger(config, log_freq=config.log_freq) + ] + training_event = TrainingEventCompose(interface, events) + training_event.launch() + + global logger + logger = events[0].logger + + utils.barrier() + training_event.on_init_start() + init_start_time = logger.previous_log_time + worker_seeds, shuffling_seeds = utils.setup_seeds(config.seed, config.num_epochs_to_generate_seeds_for, device) + if torch.distributed.is_initialized(): + worker_seed = worker_seeds[torch.distributed.get_rank()] + else: + worker_seed = worker_seeds[0] + + random.seed(worker_seed) + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + worker_init = WorkerInitializer.default(worker_seed) + + pool = ProcessPoolExecutor(1) + evaluator = Evaluator( + config.eval_dir, + proc_pool=pool, + global_batch_size=utils.global_batch_size(config), + max_steps=config.max_steps, + worker_init=worker_init, + use_cache=config.cache_eval_data + ) + grad_scaler = GradScaler(init_scale=float(os.getenv("INIT_LOSS_SCALE", 2 ** 20)), growth_interval=2000) + training_state = TrainingState() + trainer = Trainer(training_event, evaluator, training_state, grad_scaler, device=device) + training_state._trainer = trainer + + utils.barrier() + trainer.init() + + utils.barrier() + init_evaluation_start = time.time() + eval_loss, eval_mlm_acc = evaluator.evaluate(trainer) + training_state.eval_loss = eval_loss + training_state.eval_mlm_accuracy = eval_mlm_acc + init_evaluation_end = time.time() + init_evaluation_info = dict( + eval_loss = eval_loss, + eval_mlm_accuracy = eval_mlm_acc, + time = init_evaluation_end - init_evaluation_start + ) + training_event.on_init_evaluate(init_evaluation_info) + + if not config.do_train: + return config, training_state, init_evaluation_info["time"] + + dataloader = PretrainingDataloaders( + config.train_dir, + max_predictions_per_seq=config.max_predictions_per_seq, + batch_size=config.train_batch_size, + seed=shuffling_seeds, num_files_per_iter=1, + worker_init=worker_init, pool=pool, + ) + + training_event.on_init_end() + init_end_time = logger.previous_log_time + training_state.init_time = (init_end_time - init_start_time) / 1e+3 + + utils.barrier() + + epoch = -1 + training_event.on_train_begin() + raw_train_start_time = logger.previous_log_time + if config.save_checkpoint: + trainer.save() + while 
training_state.global_steps < config.max_steps and not training_state.end_training: + epoch += 1 + training_state.epoch = epoch + dataloader.set_epoch(epoch) + trainer.train_one_epoch(dataloader) + if config.save_checkpoint: + trainer.save() + training_event.on_train_end() + raw_train_end_time = logger.previous_log_time + training_state.raw_train_time = (raw_train_end_time - raw_train_start_time) / 1e+3 + return config, training_state + + +if __name__ == "__main__": + now = time.time() + config, state = main() + + if not utils.is_main_process(): + exit(1) + + gpu_count = config.n_gpu + e2e_time = time.time() - now + training_perf = (utils.global_batch_size(config) * state.global_steps) / (state.raw_train_time + 1e-7) + if config.do_train: + finished_info = { + "e2e_time": e2e_time, + "training_sequences_per_second": training_perf, + "converged": state.converged, + "final_loss": state.eval_loss, + "final_mlm_accuracy": state.eval_mlm_accuracy, + "raw_train_time": state.raw_train_time, + "init_time": state.init_time, + } + else: + finished_info = {"e2e_time": e2e_time} + logger.log("FINISHED", finished_info, stacklevel=0) + if state.converged: + exit(0) + else: + exit(1) diff --git a/nlp/language_model/bert_sample/pytorch/base/run_training.sh b/nlp/language_model/bert_sample/pytorch/base/run_training.sh new file mode 100644 index 000000000..5a6d9edeb --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/run_training.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} +lscpu_out=$(lscpu) + +n_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}") +n_cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "Number of CPU sockets on a node: ${n_sockets}" +echo "Number of CPU cores per socket: ${n_cores_per_socket}" + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +export PYTHONPATH=../:$PYTHONPATH + +python3 -u -m bind_pyt \ + --nsockets_per_node ${n_sockets} \ + --ncores_per_socket ${n_cores_per_socket} \ + --no_hyperthreads \ + --no_membind "$@" --training_script ./run_pretraining.py --do_train; check_status + +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/run_with_docker.sh b/nlp/language_model/bert_sample/pytorch/base/run_with_docker.sh new file mode 100644 index 000000000..df3c47c26 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/run_with_docker.sh @@ -0,0 +1,208 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +# ================================================= +# Constants +# ================================================= + +MODEL="bert" +export MODEL +DOCKER_IMAGE="perf:${MODEL}" +NEXP=1 + +# TODO: Add to Dockerfile +WORK_DIR="/workspace/baai-perf" +MODEL_DIR="${WORK_DIR}/benchmarks/${MODEL}/pytorch" + +CURRENT_DIR=$(cd `dirname $0`; pwd) +PROJ_DIR="${CURRENT_DIR}/../../../" +BUILD_EXTENSION_DIR="${CURRENT_DIR}/build" +BUILD_EXTENSION_PACKAGE_NAME="ext_ops" + +BASE_DOCKERFILE_PATH="${CURRENT_DIR}/BaseDockerfile" +HOST_DOCKERFILE_PATH="${CURRENT_DIR}/Dockerfile" + +SOURCE_DATA_DIR="" +MAP_DATA_DIR="/mnt/dataset/perf/${MODEL}" +SUBMITTER="default" +CONFIG="" + +: "${CLEAR_CACHES:=1}" +SHM_SIZE="32g" + + +# ================================================= +# Parse arguments +# ================================================= + +i=2 +TRAINING_SCRIPT_ARGS="$@" +for arg in "$@" +do + if [[ $arg =~ "--data_dir" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SOURCE_DATA_DIR=${kv[1]} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/$arg/"--data_dir ${MAP_DATA_DIR}"} + else + SOURCE_DATA_DIR=${!i} + TRAINING_SCRIPT_ARGS=${TRAINING_SCRIPT_ARGS/"--data_dir ${!i}"/"--data_dir ${MAP_DATA_DIR}"} + fi + + elif [[ $arg =~ "--name" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + SUBMITTER=${kv[1]} + else + SUBMITTER=${!i} + fi + + elif [[ $arg =~ "--config" ]]; then + if [[ $arg =~ "=" ]]; then + kv=(${arg//=/ }) + CONFIG=${kv[1]} + else + CONFIG=${!i} + fi + fi + + let i++ +done + + +# ================================================= +# Check arguments +# ================================================= + +if [[ "${SOURCE_DATA_DIR}" == "" ]]; then + echo "ERROR: data_dir is not given, please set --data_dir " + exit 1 +fi + +if [[ "${CONFIG}" == "" ]]; then + echo "ERROR: config is not given, please set --config " + exit 1 +fi + +CONTAINER_SUBMITTER_DIR="${WORK_DIR}/${SUBMITTER}" +HOST_SUBMITTER_DIR="${PROJ_DIR}/${SUBMITTER}" + +CONTAINER_ENVIRONMENT_VARIABLES_PATH=${CONTAINER_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh +HOST_ENVIRONMENT_VARIABLES_PATH="${HOST_SUBMITTER_DIR}/${MODEL}/config/environment_variables.sh" + +HOST_SUBMITTER_DOCKERFILE="${PROJ_DIR}/${SUBMITTER}/${MODEL}/config/Dockerfile" +CONTAINER_NAME="perf-${SUBMITTER}-${MODEL}-container" + +if [ ! 
-f "${HOST_ENVIRONMENT_VARIABLES_PATH}" ]; then + touch "${HOST_ENVIRONMENT_VARIABLES_PATH}" +fi + +source ${HOST_ENVIRONMENT_VARIABLES_PATH} + +RESULTS_DIR="${PROJ_DIR}/${SUBMITTER}/${MODEL}/results" +LOG_FILE_BASE="${RESULTS_DIR}/config_${CONFIG}_experiment" + +echo "======================================" +echo "Arguments" +echo "---------" + +echo "MODEL = ${MODEL}" +echo "CONTAINER_NAME = ${CONTAINER_NAME}" +echo "DOCKER_IMAGE = ${DOCKER_IMAGE}" +echo "MODEL_DIR = ${MODEL_DIR}" +echo "SUBMITTER = ${SUBMITTER}" +echo "CONTAINER_SUBMITTER_DIR = ${CONTAINER_SUBMITTER_DIR}" +echo "HOST_SUBMITTER_DOCKERFILE = ${HOST_SUBMITTER_DOCKERFILE}" +echo "CONFIG = ${CONFIG}" +echo "CONTAINER_MOUNTS = ${CONTAINER_MOUNTS}" +echo "TRAINING_SCRIPT_ARGS = ${TRAINING_SCRIPT_ARGS[*]}" +echo "CURRENT_DIR = ${CURRENT_DIR}" +echo "CONTAINER_ENVIRONMENT_VARIABLES_PATH = ${CONTAINER_ENVIRONMENT_VARIABLES_PATH}" +echo "RESULTS_DIR = ${RESULTS_DIR}" +echo "LOG_FILE_BASE = ${LOG_FILE_BASE}" +echo "SHM_SIZE = ${SHM_SIZE}" +echo "======================================" + + +# ================================================= +# Training +# ================================================= + +# Cleanup container +cleanup_docker() { + docker container rm -f "${CONTAINER_NAME}" || true +} +cleanup_docker +trap 'set -eux; cleanup_docker' EXIT + +# Clean built extension +if [ -d "${BUILD_EXTENSION_DIR}" ]; then + echo "WARN: Delete built extension" + rm -rf "${BUILD_EXTENSION_DIR}" + rm -rf ${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so + echo "extension file: "${CURRENT_DIR}/${BUILD_EXTENSION_PACKAGE_NAME}.*.so"" +fi + + +# Build image +if [ -f "${HOST_DOCKERFILE_PATH}" ]; then + echo "WARN: Remove previous Dockerfile" + rm -f "${HOST_DOCKERFILE_PATH}" +fi + +echo "WARN: cp BaseDockerfile to Dockerfile" +cp "${BASE_DOCKERFILE_PATH}" "${HOST_DOCKERFILE_PATH}" + +if [ -f "${HOST_SUBMITTER_DOCKERFILE}" ]; then + echo "WARN: Found submitter's Dockerfile, merging submitter's Dockerfile to Dockerfile" + cat "${HOST_SUBMITTER_DOCKERFILE}" >> "${HOST_DOCKERFILE_PATH}" +fi + +docker build -t ${DOCKER_IMAGE} ./ + +# Setup container by Dockerfile +docker run --rm --init --detach \ + --net=host --uts=host --ipc=host --security-opt=seccomp=unconfined \ + --privileged=true \ + --ulimit=stack=67108864 --ulimit=memlock=-1 \ + -w ${MODEL_DIR} \ + --shm-size="${SHM_SIZE}" \ + --volume ${SOURCE_DATA_DIR}:${MAP_DATA_DIR} \ + --volume ${PROJ_DIR}:${WORK_DIR} \ + --name="${CONTAINER_NAME}" ${CONTAINER_MOUNTS} \ + "${DOCKER_IMAGE}" sleep infinity + +# make sure container has time to finish initialization +# TODO: Uncomment +#sleep 30 +docker exec -it "${CONTAINER_NAME}" true + +mkdir -p ${RESULTS_DIR} +docker exec -it "${CONTAINER_NAME}" sh -c "chmod 777 run_training.sh" + +# TODO: Remove pip source +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple" + +docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};python3 prepare.py --name ${SUBMITTER} --data_dir ${MAP_DATA_DIR}" + +# Run experiments +for _experiment_index in $(seq 1 "${NEXP}"); do + ( + echo "Beginning trial ${_experiment_index} of ${NEXP}" + echo "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};bash ./run_training.sh ${TRAINING_SCRIPT_ARGS[*]}" + + if [ "${CLEAR_CACHES}" -eq 1 ]; then + sync && sudo /sbin/sysctl vm.drop_caches=3 + fi + + # Run experiment + docker exec -it "${CONTAINER_NAME}" /bin/bash -c "source ${CONTAINER_ENVIRONMENT_VARIABLES_PATH};bash 
./run_training.sh ${TRAINING_SCRIPT_ARGS[*]}" + ) |& tee "${LOG_FILE_BASE}_${_experiment_index}.log" +done \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/schedulers/__init__.py b/nlp/language_model/bert_sample/pytorch/base/schedulers/__init__.py new file mode 100644 index 000000000..53a358824 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/schedulers/__init__.py @@ -0,0 +1 @@ +from .factory import create_scheduler \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/schedulers/base.py b/nlp/language_model/bert_sample/pytorch/base/schedulers/base.py new file mode 100644 index 000000000..ad93dea00 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/schedulers/base.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +from torch.optim.optimizer import Optimizer +from torch.optim.lr_scheduler import _LRScheduler + + +class LRScheduler(_LRScheduler): + def __init__(self, optimizer, last_epoch=-1): + # Check if using mixed precision training + self.mixed_training = False + base_optimizer = optimizer + + # Check that optimizer param is valid + if not isinstance(optimizer, Optimizer): + raise TypeError('{} is not an Optimizer'.format( + type(optimizer).__name__)) + + super(LRScheduler, self).__init__(base_optimizer, last_epoch) + + def step(self, epoch=None): + # Set the current training step + # ('epoch' is used to be consistent with _LRScheduler) + if self.mixed_training: + # The assumption is that the step will be constant + state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]] + if 'step' in state_dict: + self.last_epoch = state_dict['step'] + 1 + else: + self.last_epoch = 1 + else: + self.last_epoch = epoch if epoch is not None else self.last_epoch + 1 + + for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + param_group['lr'] = lr \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/schedulers/factory.py b/nlp/language_model/bert_sample/pytorch/base/schedulers/factory.py new file mode 100644 index 000000000..a74a242ba --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/schedulers/factory.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import config + +from .linear_warmup_poly_scheduler import LinearWarmupPolyDecayScheduler +from .linear_warmup_scheduler import LinearWarmUpScheduler + + +def create_scheduler(optimizer, scheduler="poly"): + if config.warmup_proportion == 0: + warmup_steps = config.warmup_steps + warmup_start = config.start_warmup_step + else: + warmup_steps = int(config.max_steps * config.warmup_proportion) + warmup_start = 0 + + if scheduler == "linear": + return LinearWarmUpScheduler(optimizer, warmup_steps, config.max_steps) + + if scheduler == "poly": + return LinearWarmupPolyDecayScheduler(optimizer, start_warmup_steps=warmup_start, + warmup_steps=warmup_steps, + total_steps=config.max_steps, end_learning_rate=0.0, degree=1.0) + + raise ValueError(f"Not found scheduler {scheduler}.") \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_poly_scheduler.py b/nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_poly_scheduler.py new file mode 100644 index 000000000..df9d495cf --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_poly_scheduler.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +from .base import LRScheduler + + +class LinearWarmupPolyDecayScheduler(LRScheduler): + """ + Applies a warm up period to the learning rate. 
+ """ + def __init__(self, optimizer, start_warmup_steps, warmup_steps, total_steps, end_learning_rate=0.0, degree=1.0, last_epoch=-1): + self.num_warmup_updates = warmup_steps + self.start_warmup_steps = start_warmup_steps + self.total_steps = total_steps + self.end_learning_rate = end_learning_rate + self.degree = degree + super(LinearWarmupPolyDecayScheduler, self).__init__(optimizer, last_epoch) + + param_group = self.optimizer.param_groups[0] + if 'step' in param_group and param_group['step']>0: + self.last_epoch = param_group['step'] + if self.last_epoch <= 0: + self.last_epoch = 0 + + def step(self, epoch=None): + param_group = self.optimizer.param_groups[0] + if 'step' in param_group: + self.last_epoch = param_group['step'] + 1 + else: + self.last_epoch += 1 + + for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + param_group['lr'] = lr + + def get_lr(self): + mod_step = self.last_epoch - self.start_warmup_steps + if mod_step < self.num_warmup_updates: + progress = mod_step / self.num_warmup_updates + return [(base_lr * progress) for base_lr in self.base_lrs] + else: + progress = min(self.last_epoch / self.total_steps, 1.0) + return [(base_lr - self.end_learning_rate) * (1-progress) ** self.degree + self.end_learning_rate + for base_lr in self.base_lrs] diff --git a/nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_scheduler.py b/nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_scheduler.py new file mode 100644 index 000000000..8e82bf661 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/schedulers/linear_warmup_scheduler.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import numpy as np + +from .base import LRScheduler + +class LinearWarmUpScheduler(LRScheduler): + + def __init__(self, optimizer, warmup, total_steps, last_epoch=-1): + self.warmup = warmup + self.total_steps = total_steps + super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + progress = self.last_epoch / self.total_steps + if progress < self.warmup: + return [base_lr * progress / self.warmup for base_lr in self.base_lrs] + else: + return [base_lr * max(( progress - 1.0)/(self.warmup - 1.0), 0.) 
for base_lr in self.base_lrs] \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/setup.py b/nlp/language_model/bert_sample/pytorch/base/setup.py new file mode 100644 index 000000000..867873cc2 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/setup.py @@ -0,0 +1,98 @@ +import glob +import os +import os.path as ospath + +from setuptools import setup, Extension +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +PACKAGE_NAME = "ext_ops" + +SOURCE_FILE_EXT = ["c", "cpp", "cu"] +HEADER_FILE_EXT = ["h", "hpp", "cuh"] + +SUPPORT_EXTENSIONS = SOURCE_FILE_EXT + HEADER_FILE_EXT + + +def get_value_from_environ(name: str, default=None): + if name in os.environ: + return os.environ[name] + if name.upper() in os.environ: + return os.environ[name.upper()] + + return default + + +def check_source_dir(): + cur_dir = os.path.dirname(os.path.abspath(__file__)) + source_dir = os.path.join(os.path.dirname(cur_dir), 'iluvatar/csrc') + return source_dir + + +def find_source_files() -> dict: + source_dir = check_source_dir() + + if not ospath.exists(source_dir): + return dict() + + # Search source files + sources = dict() + for ext in SOURCE_FILE_EXT: + sources[ext] = glob.glob(ospath.join(source_dir, "**", f"*.{ext}"), recursive=True) + + return sources + + +def find_include_dirs() -> list: + source_dir = check_source_dir() + if not ospath.exists(source_dir): + return [] + return glob.glob(ospath.join(source_dir, "**", "include"), recursive=True) + + +def get_nvcc_arguments() -> list: + return [ + '-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + "-ftemplate-depth=1024", + ] + + +source_files = find_source_files() +include_dirs = find_include_dirs() +c_sources = source_files.pop("c") +other_sources = [] +for _sources in source_files.values(): + other_sources.extend(_sources) + +nvcc_arguments = get_nvcc_arguments() + +ext_modules = [] + +if len(c_sources) != 0: + ext_modules.append(Extension( + name=PACKAGE_NAME, + sources=c_sources, + include_dirs=include_dirs, + extra_compile_args={ + 'c': ['-O3'] + } + )) + +if len(other_sources) != 0: + ext_modules.append(CUDAExtension( + name=PACKAGE_NAME, + sources=other_sources, + extra_compile_args={ + 'cxx': ['-O3', ], + 'nvcc': ['-O3'] + nvcc_arguments + } + )) + +setup( + name=PACKAGE_NAME, + version="0.1", + ext_modules=ext_modules, + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/nlp/language_model/bert_sample/pytorch/base/train/__init__.py b/nlp/language_model/bert_sample/pytorch/base/train/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nlp/language_model/bert_sample/pytorch/base/train/evaluator.py b/nlp/language_model/bert_sample/pytorch/base/train/evaluator.py new file mode 100644 index 000000000..f4da8d5f9 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/evaluator.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from concurrent.futures import ProcessPoolExecutor + +import torch +import torch.distributed as dist + +from dataloaders.dataloader import WorkerInitializer, create_eval_dataloader + +import config + +class Evaluator: + + def __init__(self, eval_dir: str, + proc_pool: ProcessPoolExecutor, + global_batch_size: int, + max_steps: int, + worker_init: WorkerInitializer=None, + use_cache: bool=False): + self.eval_dir = eval_dir + self.proc_pool = proc_pool + self.use_cache = use_cache + + if worker_init is None: + worker_init = WorkerInitializer.default() + self.worker_init = worker_init + + self.eval_count = 0 + self.cached_batches = [] + + self.need_next_training_shard = global_batch_size * max_steps > 10000 + self._dataloader = None + self.fetch_dataloader() + + def fetch_dataloader(self): + if self._dataloader is None: + if self.need_next_training_shard: + if self.eval_count == 0: + self.eval_dataset_future = self.proc_pool.submit( + create_eval_dataloader, + config.eval_dir, config.eval_batch_size, config.max_predictions_per_seq, + config.num_eval_examples, self.worker_init + ) + else: + self._dataloader = self.eval_dataset_future.result(timeout=None) + else: + self._dataloader = create_eval_dataloader( + config.eval_dir, config.eval_batch_size, config.max_predictions_per_seq, + config.num_eval_examples, self.worker_init + ) + + return self._dataloader + + def evaluate(self, trainer): + self.eval_count += 1 + + eval_dataloader = self.fetch_dataloader() + + trainer.model.eval() + + total_eval_loss, total_eval_mlm_acc = 0.0, 0.0 + total_masked = 0 + + # on first eval, load and cache data on GPU + if self.eval_count == 1 and self.use_cache: + for batch in eval_dataloader: + self.cached_batches.append([t.to(trainer.device) for t in batch]) + + with torch.no_grad(): + for batch in self.cached_batches if self.use_cache else eval_dataloader: + if not self.use_cache: + batch = [t.to(trainer.device) for t in batch] + loss, mlm_acc, num_masked = trainer.inference(batch) + total_eval_loss += loss * num_masked + total_eval_mlm_acc += mlm_acc * num_masked + total_masked += num_masked + torch.cuda.synchronize() + trainer.model.train() + + if torch.distributed.is_initialized(): + # Collect total scores from all ranks + torch.distributed.all_reduce(total_eval_mlm_acc, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(total_eval_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(total_masked, op=torch.distributed.ReduceOp.SUM) + + # Average by number of examples + total_eval_mlm_acc /= total_masked + total_eval_loss /= total_masked + + return total_eval_loss.item(), total_eval_mlm_acc.item() + diff --git a/nlp/language_model/bert_sample/pytorch/base/train/event/__init__.py b/nlp/language_model/bert_sample/pytorch/base/train/event/__init__.py new file mode 100644 index 000000000..2cd8c9e60 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/event/__init__.py @@ -0,0 +1,4 @@ +from .base import BaseTrainingEventInterface +from .base_adapter import BaseTrainingEventAdapter +from .compose import TrainingEventCompose +from .log import TrainingLogger \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/train/event/base.py b/nlp/language_model/bert_sample/pytorch/base/train/event/base.py new file mode 100644 index 
000000000..0e5075203 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/event/base.py @@ -0,0 +1,65 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from typing import Tuple, List + +import torch.nn +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.optim import Optimizer + +BERT_MODEL = torch.nn.Module +BatchType = Tuple[Tensor, Tensor, Tensor, Tensor, Tensor] + +class BaseTrainingEventInterface: + + def __init__(self, config): + self.config = config + + def convert_model(self, model: BERT_MODEL) -> BERT_MODEL: + return model + + def create_optimizer(self, model: BERT_MODEL) -> Optimizer: + raise NotImplementedError() + + def model_to_fp16(self, model: BERT_MODEL, optimizer: Optimizer) -> Tuple[BERT_MODEL, Optimizer]: + return model, optimizer + + def model_to_ddp(self, model: BERT_MODEL) -> BERT_MODEL: + return model + + def on_init_start(self): + pass + + def on_init_end(self): + pass + + def on_backward(self, step: int, loss: Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): + pass + + def on_train_begin(self): + pass + + def on_train_end(self): + pass + + def on_epoch_begin(self, epoch: int): + pass + + def on_epoch_end(self, epoch: int): + pass + + def on_step_begin(self, step: int): + pass + + def on_step_end(self, step: int): + pass + + diff --git a/nlp/language_model/bert_sample/pytorch/base/train/event/base_adapter.py b/nlp/language_model/bert_sample/pytorch/base/train/event/base_adapter.py new file mode 100644 index 000000000..3d549ff00 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/event/base_adapter.py @@ -0,0 +1,40 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
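+# Illustrative usage sketch (hedged; "PrintingEvents" is a hypothetical name, not part of
+# this codebase): subclasses of BaseTrainingEventAdapter, defined below, are passed to
+# TrainingEventCompose as "events" and only override the hooks they care about, e.g.:
+#
+#   class PrintingEvents(BaseTrainingEventAdapter):
+#       def on_evaluate(self, result: dict):
+#           print("eval result:", result)
+#
+#       def on_step_end(self, step: int, result: dict = None):
+#           if result is not None:
+#               print(f"step {step}: {result}")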
+ + +from torch.optim import Optimizer + +from .base import BaseTrainingEventInterface + + +class BaseTrainingEventMix: + + def launch(self): + pass + + def create_optimizer(self, optimizer: Optimizer): + pass + + def on_init_evaluate(self, result: dict): + pass + + def on_evaluate(self, result: dict): + pass + + def on_step_end(self, step: int, result: dict = None): + pass + + +class BaseTrainingEventAdapter(BaseTrainingEventMix, BaseTrainingEventInterface): + pass + + + + + diff --git a/nlp/language_model/bert_sample/pytorch/base/train/event/compose.py b/nlp/language_model/bert_sample/pytorch/base/train/event/compose.py new file mode 100644 index 000000000..0aceb4350 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/event/compose.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from typing import List, Union, Callable, Tuple + +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.optim import Optimizer + +from .base import BaseTrainingEventInterface as TrainingEventInterface, BERT_MODEL +from .base_adapter import BaseTrainingEventMix, BaseTrainingEventAdapter + + +class TrainingEventCompose(BaseTrainingEventAdapter): + + def __init__(self, interface: TrainingEventInterface, events: List[BaseTrainingEventAdapter]): + super(TrainingEventCompose, self).__init__(interface.config) + + self.interface = interface + self.events = events + + def launch(self): + self._call_events_func(self.launch, with_interface=False) + + def convert_model(self, model: BERT_MODEL) -> BERT_MODEL: + model = self.interface.convert_model(model) + self._call_events_func(self.convert_model, with_interface=False, model=model) + return model + + def create_optimizer(self, model: BERT_MODEL) -> Optimizer: + optimizer = self.interface.create_optimizer(model) + self._call_events_func(self.create_optimizer, with_interface=False, optimizer=optimizer) + return optimizer + + def model_to_fp16(self, model: BERT_MODEL, optimizer: Optimizer) -> Tuple[BERT_MODEL, Optimizer]: + model, optimizer = self.interface.model_to_fp16(model, optimizer) + self._call_events_func(self.model_to_fp16, with_interface=False, model=model, optimizer=optimizer) + return model, optimizer + + def model_to_ddp(self, model: BERT_MODEL) -> BERT_MODEL: + model = self.interface.model_to_ddp(model) + self._call_events_func(self.model_to_ddp, with_interface=False, model=model) + return model + + def on_init_evaluate(self, result: dict): + self._call_events_func(self.on_init_evaluate, with_interface=False, result=result) + + def on_evaluate(self, result: dict): + self._call_events_func(self.on_evaluate, with_interface=False, result=result) + + def on_init_start(self): + self._call_events_func(self.on_init_start, with_interface=True) + + def on_init_end(self): + self._call_events_func(self.on_init_end, with_interface=True) + + def on_backward(self, step: int, loss: Tensor, optimizer: 
Optimizer, grad_scaler: GradScaler = None): + return self.interface.on_backward(step, loss, optimizer, grad_scaler) + + def on_train_begin(self): + self._call_events_func(self.on_train_begin, with_interface=True) + + def on_train_end(self): + self._call_events_func(self.on_train_end, with_interface=True) + + def on_epoch_begin(self, epoch: int): + self._call_events_func(self.on_epoch_begin, with_interface=True, epoch=epoch) + + def on_epoch_end(self, epoch: int): + self._call_events_func(self.on_epoch_end, with_interface=True, epoch=epoch) + + def on_step_begin(self, step: int): + self._call_events_func(self.on_step_begin, with_interface=True, step=step) + + def on_step_end(self, step: int, result: dict = None): + self.interface.on_step_end(step) + self._call_events_func(self.on_step_end, with_interface=False, step=step, result=result) + + def _call_events_func(self, func: Union[str, Callable], with_interface=False, *args, **kwargs): + func_name = self._get_func_name(func) + events = self.events + if with_interface: + events = [self.interface] + events + + result = [] + for event in events: + ret = None + if hasattr(event, func_name): + ret = getattr(event, func_name)(*args, **kwargs) + result.append(ret) + return result + + def _get_func_name(self, func: Union[str, Callable]): + if isinstance(func, str): + return func + + if callable(func): + return func.__name__ + + return None + + + diff --git a/nlp/language_model/bert_sample/pytorch/base/train/event/log.py b/nlp/language_model/bert_sample/pytorch/base/train/event/log.py new file mode 100644 index 000000000..c9c107429 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/event/log.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
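+# Illustrative wiring sketch (hedged; "vendor_interface" stands for some concrete
+# BaseTrainingEventInterface implementation and the log_freq value is arbitrary -- both are
+# assumptions for this example): TrainingLogger, defined below, is normally registered as
+# one of the composed events, roughly:
+#
+#   logger_event = TrainingLogger(config, log_freq=10)
+#   training_event = TrainingEventCompose(vendor_interface, [logger_event])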
+ + +import copy +import inspect +import os +import os.path as ospath +from typing import Tuple, Union, Iterable + +from torch import Tensor +from torch.cuda.amp import GradScaler +from torch.optim import Optimizer + +from config.config_manager import get_properties_from_config +from utils.logging import PerfLogger, LogEvent, PerfLogLevel +from .base import BERT_MODEL +from .base_adapter import BaseTrainingEventAdapter + + +STACKLEVEL = 4 + + +class TrainingLogger(BaseTrainingEventAdapter): + + def __init__(self, config, logger: PerfLogger=None, log_freq: int = 0): + super(TrainingLogger, self).__init__(config) + self.config = config + self.log_freq = log_freq + level = PerfLogLevel.INFO if log_freq > 0 else PerfLogLevel.SUBMITTION + if logger is None: + logger = PerfLogger.get_default_logger(rank=config.local_rank, level=level) + self.logger = logger + + self.model = None + self.submitter = None + + def launch(self): + self.logger.log(LogEvent.launch_training, "Launch training", stacklevel=STACKLEVEL) + config_path: str = self.config.config + config_dict = get_properties_from_config(self.config) + for key, value in config_dict.items(): + if type(value) not in [int, float, str, bool] and not isinstance(value, Iterable): + config_dict[key] = str(value) + + # Extract definition of training event + try: + training_event_class = self.config.training_event + if not inspect.isclass(training_event_class): + training_event_class = training_event_class.__class__ + training_event_class_define = inspect.getabsfile(training_event_class) + training_event_class_define = training_event_class_define.rsplit(".py", maxsplit=1)[0] + training_event_class_define += ":" + training_event_class.__name__ + except: + training_event_class_define = str(self.config.training_event) + config_dict['training_event'] = training_event_class_define + + # Like /path/to/proj/submitter/model/config/config_xxx.py + if config_path.startswith("."): + config_path = ospath.abspath(config_path) + + config_path_nodes = config_path.rsplit(sep="/", maxsplit=4) + submitter = config_path_nodes[1] + model = config_path_nodes[2] + self.logger.init_logger(submitter=submitter, + model=model, + config_path=config_path, + config=config_dict, + stacklevel=STACKLEVEL) + + self.model = model + self.submitter = submitter + + def convert_model(self, model: BERT_MODEL): + model_class = type(model) + model_info = dict( + type = model_class.__name__, + module = model_class.__module__ if hasattr(model_class, "__module__") else "None" + ) + self._log_event(LogEvent.convert_model, model_info) + + def create_optimizer(self, optimizer: Optimizer): + optimizer_class = type(optimizer) + optimizer_info = dict( + type=optimizer_class.__name__, + module=optimizer_class.__module__ if hasattr(optimizer_class, "__module__") else "None" + ) + self._log_event(LogEvent.create_optimizer, optimizer_info) + + def model_to_fp16(self, model: BERT_MODEL, optimizer: Optimizer): + fp16_info = dict( + fp16 = self.config.fp16 if hasattr(self.config, "fp16") else False + ) + self._log_event(LogEvent.model_to_fp16, fp16_info) + + def model_to_ddp(self, model: BERT_MODEL): + model_class = type(model) + model_info = dict( + type=model_class.__name__, + module=model_class.__module__ if hasattr(model_class, "__module__") else None + ) + self._log_event(LogEvent.model_to_ddp, model_info) + + def on_init_evaluate(self, result: dict): + self._log_event(LogEvent.init_evaluation, result) + + def on_evaluate(self, result: dict): + self._log_event(LogEvent.evaluation, result) + + def 
on_init_start(self): + self._log_event(LogEvent.init_start) + + def on_init_end(self): + self._log_event(LogEvent.init_end, "Finish initialization") + + def on_backward(self, step: int, loss: Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): + pass + + def on_train_begin(self): + self._log_event(LogEvent.train_begin) + + def on_train_end(self): + self._log_event(LogEvent.train_end) + + def on_epoch_begin(self, epoch: int): + epoch_info = dict(epoch=epoch) + self._log_event(LogEvent.epoch_begin, epoch_info) + + def on_epoch_end(self, epoch: int): + epoch_info = dict(epoch=epoch) + self._log_event(LogEvent.epoch_end, epoch_info) + + def on_step_begin(self, step: int): + pass + + def on_step_end(self, step: int, result: dict=None): + if (self.log_freq <= 0 or step % self.log_freq != 0) and step != 1: + return + if result is None: + step_info = dict() + else: + step_info = copy.copy(result) + + step_info['step'] = step + self._log_event(LogEvent.step_end, step_info) + + def _log_event(self, event, *args, **kwargs): + self.logger.log(event, stacklevel=STACKLEVEL, *args, **kwargs) + diff --git a/nlp/language_model/bert_sample/pytorch/base/train/trainer.py b/nlp/language_model/bert_sample/pytorch/base/train/trainer.py new file mode 100644 index 000000000..4384c23c5 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/trainer.py @@ -0,0 +1,222 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
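+# Summary of the hook order driven by the Trainer below (descriptive comment only):
+#   on_epoch_begin(epoch)
+#     for each batch:
+#       on_step_begin(step)
+#       train_one_step: forward -> on_backward(step, loss, optimizer, grad_scaler) -> lr_scheduler.step()
+#       every config.eval_steps steps: evaluator.evaluate(trainer), plus save() if checkpointing is enabled
+#       on_step_end(step, result), then on_evaluate(eval_result) if an evaluation ran
+#   on_epoch_end(epoch)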
+ +import numpy as np +import math +import time +import os +import sys + +import torch +from torch.cuda.amp import GradScaler +from torch.types import Device + +import config +import utils +from dataloaders.dataset import exchange_padding_fast +from model import create_model +from schedulers import create_scheduler +from train.evaluator import Evaluator +from train.training_state import TrainingState +from train.event import TrainingEventCompose as TrainingEvent +from utils.checkpoint import remap_segmented_model_parameters + + +class Trainer(): + + def __init__(self, training_event: TrainingEvent, + evaluator: Evaluator, + training_state: TrainingState, + grad_scaler: GradScaler, + device: Device): + super(Trainer, self).__init__() + self.training_event = training_event + self.training_state = training_state + self.grad_scaler = grad_scaler + + self.device = device + self.optimizer = None + self.bert_config = None + self.model = None + self.evaluator = evaluator + self.lr_scheduler = None + self.init_epoch = 0 + self.init_dataloader_idx = 0 + + def init(self): + self.bert_config, self.model = create_model(config) + self.model = self._init_model(self.model, self.device) + self.model = self.training_event.convert_model(self.model) + self.optimizer = self.training_event.create_optimizer(self.model) + self.model, self.optimizer = self.training_event.model_to_fp16(self.model, self.optimizer) + self.model = self.training_event.model_to_ddp(self.model) + self.lr_scheduler = create_scheduler(self.optimizer) + self.load() + + def _init_model(self, model, device): + checkpoint = torch.load(config.init_checkpoint, map_location="cpu") + if self._is_resume_checkpoint(checkpoint): + if "global_steps" in checkpoint['state'] and checkpoint['state']['global_steps'] > 0: + config.learning_rate = checkpoint['state']['learning_rate'] + else: + if "model" in checkpoint: + checkpoint = checkpoint["model"] + checkpoint_remapped = remap_segmented_model_parameters(checkpoint) + model.load_state_dict(checkpoint_remapped, strict=True) + model = model.to(device) + return model + + def _is_resume_checkpoint(self, checkpoint): + return "optimizer" in checkpoint + + def load(self): + checkpoint = torch.load(config.init_checkpoint, map_location="cpu") + + if not self._is_resume_checkpoint(checkpoint): + return + + model_ckpt = checkpoint['model'] + if config.distributed_lamb: + self.model.load_state_dict(model_ckpt,strict=True) + else: + self.model.module.load_state_dict(model_ckpt,strict=True) + + if 'global_steps' in checkpoint['state'] and checkpoint['state']['global_steps'] > 0: + # restore optimizer + optimizer_ckpt = checkpoint['optimizer'] + self.optimizer.load_state_dict(optimizer_ckpt) + + # restore epoch, dataloader_idx + self.init_epoch = checkpoint['state']['epoch'] + self.init_dataloader_idx = checkpoint['state']['iter_dataloader_idx'] + + self.training_state.global_steps = checkpoint['state']['global_steps'] + + + def save(self): + model_dict = self.model.state_dict() + optimizer_dict = self.optimizer.state_dict() + state_dict = self.training_state.to_dict() + + save_dict = {'model':model_dict,'optimizer':optimizer_dict,'state': state_dict} + + save_file = os.path.join(config.output_dir,f'model.ckpt-{self.training_state.global_steps}.pt') + + utils.main_proc_print(f"save for steps:{self.training_state.global_steps}") + + if utils.get_rank() == 0 or utils.get_rank() == -1: + if not os.path.isdir(config.output_dir): + os.makedirs(config.output_dir) + torch.save(save_dict,save_file) + + def 
train_one_epoch(self, dataloader): + state = self.training_state + training_event = self.training_event + + # restore epoch + if state.epoch < self.init_epoch: + return + + training_event.on_epoch_begin(state.epoch) + + step_start_time = time.time() + for dataloader_idx, batch_idx, batch in dataloader.iter_batchs(): + + # restore dataloader + if state.epoch == self.init_epoch and dataloader_idx <= self.init_dataloader_idx: + continue + + state.num_trained_samples = state.global_steps * utils.global_batch_size(config) + + state.global_steps += 1 + state.iter_dataloader_idx = dataloader_idx + self.training_event.on_step_begin(state.global_steps) + self.train_one_step(batch_idx, batch) + + other_state = dict() + if state.global_steps % config.gradient_accumulation_steps == 0: + step_end_time = time.time() + step_total_time = step_end_time - step_start_time + step_start_time = step_end_time + sequences_per_second = (utils.global_batch_size(config) * config.gradient_accumulation_steps) / step_total_time + other_state["seq/s"] = sequences_per_second + + eval_result = None + # if self.can_do_eval(state): + if state.global_steps > 0 and state.global_steps % config.eval_steps == 0: + eval_start = time.time() + state.eval_loss, state.eval_mlm_accuracy = self.evaluator.evaluate(self) + eval_end = time.time() + eval_result = dict(global_steps=state.global_steps, + eval_loss=state.eval_loss, + eval_mlm_accuracy=state.eval_mlm_accuracy, + time=eval_end - eval_start) + if config.save_checkpoint: + self.save() + + end_training = self.detect_training_status(state) + + step_info = state.to_dict(**other_state) + + self.training_event.on_step_end(state.global_steps, result=step_info) + + if eval_result is not None: + self.training_event.on_evaluate(eval_result) + + if end_training: + break + + training_event.on_epoch_end(state.epoch) + + def train_one_step(self, batch_idx, batch): + if config.exchange_padding == True: + batch = [t.to(self.device, non_blocking=True, dtype=torch.int16) for t in batch] + batch = exchange_padding_fast(self.device, config.train_batch_size, *batch) + else: + batch = [t.to(self.device, non_blocking=True) for t in batch] + + state = self.training_state + + self.model.train() + state.loss, state.mlm_acc, _ = self.forward(batch) + if not np.isfinite(state.loss.item()): + print("Loss is {}, stopping training".format(state.loss.item())) + sys.exit(1) + self.training_event.on_backward(state.global_steps, state.loss, self.optimizer, self.grad_scaler) + self.lr_scheduler.step() + + def detect_training_status(self, state: TrainingState): + if state.eval_mlm_accuracy >= config.target_mlm_accuracy: + state.converged_success() + + if state.global_steps > config.max_steps or state.num_trained_samples > config.max_samples_termination: + state.end_training = True + + return state.end_training + + def can_do_eval(self, state: TrainingState): + do_eval = all([ + config.eval_dir is not None, + state.num_trained_samples >= config.eval_iter_start_samples, + state.global_steps % math.ceil(config.eval_interval_samples / utils.global_batch_size(config)) == 0, + config.eval_interval_samples > 0, + state.global_steps > 1, + ]) + + return do_eval or state.global_steps >= config.max_steps + + def forward(self, batch): + input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch + loss, mlm_acc, num_valid = self.model(input_ids, segment_ids, input_mask, + masked_lm_labels, next_sentence_labels) + return loss, mlm_acc, num_valid + + def inference(self, batch): + self.model.eval() + 
return self.forward(batch) diff --git a/nlp/language_model/bert_sample/pytorch/base/train/training_state.py b/nlp/language_model/bert_sample/pytorch/base/train/training_state.py new file mode 100644 index 000000000..ccfac960b --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/train/training_state.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +from dataclasses import dataclass + +import torch +import utils + + +@dataclass +class TrainingState: + _trainer = None + _status = 'aborted' # later set to 'success' if termination criteria met + + global_steps = 0 + skipped_steps = 0 + iter_dataloader_idx = 0 + + loss: float = 0.0 + mlm_acc: float = 0.0 + + epoch: int = 1 + num_trained_samples = 0 + end_training: bool = False + converged: bool = False + + eval_loss = 0 + eval_mlm_accuracy = 0 + + init_time = 0 + raw_train_time = 0 + + def status(self): + if self.converged: + self._status = "success" + return self._status + + def converged_success(self): + self.end_training = True + self.converged = True + + def to_dict(self, **kwargs): + state_dict = dict() + + for var_name, value in self.__dict__.items(): + if not var_name.startswith("_") and utils.is_property(value): + state_dict[var_name] = value + + lr = self._trainer.lr_scheduler.get_lr() + if isinstance(lr, (tuple, list)): + lr = lr[0] + state_dict["learning_rate"] = lr + + exclude = ["eval_loss", "eval_mlm_accuracy", "skipped_steps", + "converged", "init_time", "raw_train_time"] + for exkey in exclude: + if exkey in state_dict: + state_dict.pop(exkey) + + state_dict.update(kwargs) + + for k in state_dict.keys(): + if torch.is_tensor(state_dict[k]): + state_dict[k] = state_dict[k].item() + + return state_dict \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/__init__.py b/nlp/language_model/bert_sample/pytorch/base/utils/__init__.py new file mode 100644 index 000000000..9c0ed2c68 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/__init__.py @@ -0,0 +1,17 @@ +import inspect + +from .check import check_config +from .dist import * + +def is_property(value): + status = [ + not callable(value), + not inspect.isclass(value), + not inspect.ismodule(value), + not inspect.ismethod(value), + not inspect.isfunction(value), + not inspect.isbuiltin(value), + "classmethod object" not in str(value) + ] + + return all(status) \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/check.py b/nlp/language_model/bert_sample/pytorch/base/utils/check.py new file mode 100644 index 000000000..41cad658e --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/check.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. 
+# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. + + +import os +import os.path as ospath +from .dist import global_batch_size + + +def get_config_arg(config, name): + if hasattr(config, name): + value = getattr(config, name) + if value is not None: + return value + + if name in os.environ: + return os.environ[name] + + return None + + +def check_config(config): + print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( + config.device, config.n_gpu, config.local_rank != -1, config.fp16)) + + train_dir = get_config_arg(config, "train_dir") + + data_dir = get_config_arg(config, "data_dir") + config.data_dir = data_dir + if data_dir is None and train_dir is None: + raise ValueError("Invalid data_dir and train_dir, should be given a path.") + + if train_dir is None: + config.train_dir = ospath.join(data_dir, "2048_shards_uncompressed") + + init_checkpoint = get_config_arg(config, "init_checkpoint") + config.init_checkpoint = init_checkpoint + if init_checkpoint is None: + if data_dir is None: + raise ValueError("Invalid init_checkpoint and data_dir, should be given a path.") + config.init_checkpoint = ospath.join(data_dir, "model.ckpt-28252.pt") + + bert_config_path = get_config_arg(config, "bert_config_path") + config.bert_config_path = bert_config_path + if bert_config_path is None: + if data_dir is None: + raise ValueError("Invalid bert_config_path and data_dir, should be given a path.") + config.bert_config_path = ospath.join(data_dir, "bert_config.json") + + eval_dir = get_config_arg(config, "eval_dir") + config.eval_dir = eval_dir + if eval_dir is None: + if data_dir is None: + raise ValueError("Invalid eval_dir and data_dir, should be given a path.") + config.eval_dir = ospath.join(data_dir, "eval_set_uncompressed") + + if config.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + config.gradient_accumulation_steps)) + # if args.train_batch_size % args.gradient_accumulation_steps != 0: + # raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format( + # args.gradient_accumulation_steps, args.train_batch_size)) + # + # args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + if config.eval_interval_samples == 0: + eval_interval_samples = (0.05 * (230.23 * global_batch_size(config) + 3000000)) / 25000 + eval_interval_samples = int(eval_interval_samples) * 25000 + config.eval_interval_samples = eval_interval_samples + + # if not (config.do_train or (config.eval_dir and config.eval_iter_samples <= 0)): + # raise ValueError(" `do_train` or should be in offline eval mode") + + # if not config.resume_from_checkpoint or not os.path.exists(config.output_dir): + # os.makedirs(config.output_dir, exist_ok=True) + + + + + + + + + + + + + diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/checkpoint.py 
b/nlp/language_model/bert_sample/pytorch/base/utils/checkpoint.py new file mode 100644 index 000000000..7926f529d --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/checkpoint.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Returns true only if resuming from a checkpoint found in output_dir. +# init_checkpoint and init_tf_checkpoint are not considered +import glob +import os +from collections import OrderedDict + + +def found_resume_checkpoint(args): + if args.phase2: + checkpoint_str = "phase2_ckpt*.pt" + else: + checkpoint_str = "phase1_ckpt*.pt" + return args.resume_from_checkpoint and (args.resume_init_checkpoint is not None or len( + glob.glob(os.path.join(args.output_dir, checkpoint_str)))) > 0 + + +def remap_attn_parameters(model_dict): + res_dict = OrderedDict() + for k in model_dict: + if 'attention' in k: + if 'self.query.weight' in k: + new_k = k.replace('self.query.weight', 'multi_head_attention.q_weight') + elif 'self.key.weight' in k: + new_k = k.replace('self.key.weight', 'multi_head_attention.k_weight') + elif 'self.value.weight' in k: + new_k = k.replace('self.value.weight', 'multi_head_attention.v_weight') + elif 'self.query.bias' in k: + new_k = k.replace('self.query.bias', 'multi_head_attention.q_bias') + elif 'self.key.bias' in k: + new_k = k.replace('self.key.bias', 'multi_head_attention.k_bias') + elif 'self.value.bias' in k: + new_k = k.replace('self.value.bias', 'multi_head_attention.v_bias') + elif 'output.dense.weight' in k: + new_k = k.replace('output.dense.weight', 'multi_head_attention.out_proj_weight') + elif 'output.dense.bias' in k: + new_k = k.replace('output.dense.bias', 'multi_head_attention.out_proj_bias') + elif 'output.LayerNorm.weight' in k: + new_k = k.replace('output.LayerNorm.weight', 'layer_norm.weight') + elif 'output.LayerNorm.bias' in k: + new_k = k.replace('output.LayerNorm.bias', 'layer_norm.bias') + else: + new_k = k + else: + new_k = k + res_dict[new_k] = model_dict[k] + model_dict.clear() + return res_dict + + +def remap_segmented_model_parameters(model_dict): + res_dict = OrderedDict() + for k in model_dict: + if 'bert' in k: + new_k = 'bert_model_segment.' + k + elif 'cls' in k: + new_k = 'heads_only_segment.' + k + else: + assert False, "shouldn't happen" + res_dict[new_k] = model_dict[k] + model_dict.clear() + return res_dict diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/dist.py b/nlp/language_model/bert_sample/pytorch/base/utils/dist.py new file mode 100644 index 000000000..99744372e --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/dist.py @@ -0,0 +1,202 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import torch +import torch.distributed as dist + +from contextlib import contextmanager +import logging.config +import random + + +def generate_seeds(rng, size): + """ + Generate list of random seeds + + :param rng: random number generator + :param size: length of the returned list + """ + seeds = [rng.randint(0, 2**32 - 1) for _ in range(size)] + return seeds + + +def broadcast_seeds(seeds, device): + """ + Broadcasts random seeds to all distributed workers. + Returns list of random seeds (broadcasted from workers with rank 0). + + :param seeds: list of seeds (integers) + :param device: torch.device + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + seeds_tensor = torch.LongTensor(seeds).to(device) + torch.distributed.broadcast(seeds_tensor, 0) + seeds = seeds_tensor.tolist() + return seeds + + +def setup_seeds(master_seed, epochs, device): + """ + Generates seeds from one master_seed. + Function returns (worker_seeds, shuffling_seeds), worker_seeds are later + used to initialize per-worker random number generators (mostly for + dropouts), shuffling_seeds are for RNGs resposible for reshuffling the + dataset before each epoch. + Seeds are generated on worker with rank 0 and broadcasted to all other + workers. + + :param master_seed: master RNG seed used to initialize other generators + :param epochs: number of epochs + :param device: torch.device (used for distributed.broadcast) + """ + if master_seed is None: + # random master seed, random.SystemRandom() uses /dev/urandom on Unix + master_seed = random.SystemRandom().randint(0, 2**32 - 1) + if get_rank() == 0: + # master seed is reported only from rank=0 worker, it's to avoid + # confusion, seeds from rank=0 are later broadcasted to other + # workers + logging.info(f'Using random master seed: {master_seed}') + else: + # master seed was specified from command line + logging.info(f'Using master seed from command line: {master_seed}') + + # initialize seeding RNG + seeding_rng = random.Random(master_seed) + + # generate worker seeds, one seed for every distributed worker + worker_seeds = generate_seeds(seeding_rng, get_world_size()) + + # generate seeds for data shuffling, one seed for every epoch + shuffling_seeds = generate_seeds(seeding_rng, epochs) + + # broadcast seeds from rank=0 to other workers + worker_seeds = broadcast_seeds(worker_seeds, device) + shuffling_seeds = broadcast_seeds(shuffling_seeds, device) + return worker_seeds, shuffling_seeds + + +def barrier(): + """ + Works as a temporary distributed barrier, currently pytorch + doesn't implement barrier for NCCL backend. + Calls all_reduce on dummy tensor and synchronizes with GPU. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + + +def get_rank(default=0): + """ + Gets distributed rank or returns zero if distributed is not initialized. 
+ """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = default + return rank + + +def get_world_size(): + """ + Gets total number of distributed workers or returns one if distributed is + not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + return world_size + + +def main_proc_print(*args, **kwargs): + if is_main_process(): + print(*args, **kwargs) + + +def set_device(cuda, local_rank): + """ + Sets device based on local_rank and returns instance of torch.device. + + :param cuda: if True: use cuda + :param local_rank: local rank of the worker + """ + if cuda: + torch.cuda.set_device(local_rank) + device = torch.device('cuda') + else: + device = torch.device('cpu') + return device + + +def init_dist_training_env(config): + if config.local_rank == -1: + device = torch.device("cuda") + num_gpus = torch.cuda.device_count() + else: + torch.cuda.set_device(config.local_rank) + device = torch.device("cuda", config.local_rank) + host_addr_full = 'tcp://' + os.environ["MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"] + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + + dist_backend = config.dist_backend + DIST_BACKEND_ENV = "PT_DIST_BACKEND" + if DIST_BACKEND_ENV in os.environ: + print("WARN: Use the distributed backend of the environment.") + dist_backend = os.environ[DIST_BACKEND_ENV] + + torch.distributed.init_process_group(backend=dist_backend, init_method=host_addr_full, rank=rank, world_size=world_size) + num_gpus = torch.distributed.get_world_size() + + return device, num_gpus + + +def global_batch_size(config): + return config.train_batch_size * config.n_gpu + + +@contextmanager +def sync_workers(): + """ + Yields distributed rank and synchronizes all workers on exit. + """ + rank = get_rank() + yield rank + barrier() + + +def is_main_process(): + if dist.is_initialized(): + if "LOCAL_RANK" in os.environ: + return int(os.environ["LOCAL_RANK"]) == 0 + else: + return get_rank() == 0 + + return True + + +def format_step(step): + if isinstance(step, str): + return step + s = "" + if len(step) > 0: + s += "Training Epoch: {} ".format(step[0]) + if len(step) > 1: + s += "Training Iteration: {} ".format(step[1]) + if len(step) > 2: + s += "Validation Iteration: {} ".format(step[2]) + return s diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/logging.py b/nlp/language_model/bert_sample/pytorch/base/utils/logging.py new file mode 100644 index 000000000..d38d5c6cb --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/logging.py @@ -0,0 +1,245 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright Declaration: This software, including all of its code and documentation, +# except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# CoreX. No user of this software shall have any right, ownership or interest in this software and +# any use of this software shall be in compliance with the terms and conditions of the End User +# License Agreement. 
+ + +import os +import sys +import time +import logging +import json +from logging import currentframe +from typing import NamedTuple, Union, Tuple, Optional +from collections import OrderedDict + +from enum import IntEnum + + +_srcfile = os.path.normcase(logging.addLevelName.__code__.co_filename) + + +class LogKeys: + default_logger_name = "PerfLogger" + + # Log format + log_header = "PerfLog" + log_template = "[{header}] {message}" + + # Submitted info + submmiter: str = "submmiter" + model: str = "model" + optimizer_type: str = "optimizer_type" + config: str = "config" + config_path: str = "config_path" + + # Event + event: str = "event" + value: str = "value" + + # Metadata + metadata: str = "metadata" + called_log_file = "file" + called_log_file_lineno = "lineno" + time_ms = "time_ms" + rank = "rank" + + # Other message + other_message: str = "other" + + +class PerfLogLevel(IntEnum): + + INFO = 100 + SUBMITTION = 101 + + @staticmethod + def from_string(level: str): + return PerfLogLevel.__dict__[level.upper()] + + @classmethod + def register_to_logging(cls, logging): + for level_name, level in PerfLogLevel.__dict__.items(): + if isinstance(level, cls): + logging.addLevelName(level.value, level_name) + + +PerfLogLevel.register_to_logging(logging) + + +class LogEventField(NamedTuple): + + name: str + rank: Union[int, list] = -1 + level: PerfLogLevel = PerfLogLevel.SUBMITTION + + +class LogEvent: + + submitted_info = LogEventField("SUBMITTED_INFO", rank=0) + launch_training = LogEventField("LAUNCH_TRAINING") + convert_model = LogEventField("CONVERT_MODEL", rank=0) + create_optimizer = LogEventField("CREATE_OPTIMIZER", rank=0) + model_to_fp16 = LogEventField("MODEL_TO_FP16", rank=0) + model_to_ddp = LogEventField("MODEL_TO_DDP", rank=0) + init_start = LogEventField("INIT_START", rank=0) + init_end = LogEventField("INIT_END", rank=0) + train_begin = LogEventField("TRAIN_BEGIN", rank=0) + train_end = LogEventField("TRAIN_END", rank=0) + epoch_begin = LogEventField("EPOCH_BEGIN", rank=0, level=PerfLogLevel.INFO) + epoch_end = LogEventField("EPOCH_END", rank=0, level=PerfLogLevel.INFO) + step_begin = LogEventField("STEP_BEGIN", rank=0, level=PerfLogLevel.INFO) + step_end = LogEventField("STEP_END", rank=0, level=PerfLogLevel.INFO) + init_evaluation = LogEventField("INIT_EVALUATION", rank=0) + evaluation = LogEventField("EVALUATION", rank=0) + finished = LogEventField("FINISHED", rank=0) + + @staticmethod + def from_string(key: str): + return LogEvent.__dict__[key.lower()] + + +class PerfLogger: + + _singleton = None + + def __init__(self, rank: int, + level: Union[str, PerfLogLevel]=PerfLogLevel.SUBMITTION, + logger: logging.Logger=None): + self.rank = rank + + if isinstance(level, str): + level = PerfLogLevel.from_string(level) + self.level = level + + if logger is None: + logger = logging.Logger(LogKeys.default_logger_name) + + self.logger = logger + + self.previous_log_time = None + + @property + def _current_time_ms(self): + current = int(time.time() * 1e3) + self.previous_log_time = current + return current + + def init_logger(self, submitter: str, model: str, config_path: str, config: dict, *args, **kwargs): + message = { + LogKeys.submmiter: submitter, + LogKeys.model: model, + LogKeys.config_path: config_path, + LogKeys.config: config + } + + self.log(LogEvent.submitted_info, message, *args, **kwargs) + + + def log(self, event: Union[str, LogEventField], message: Optional[Union[str, dict]]=None, *args, **kwargs): + if isinstance(event, str): + event = LogEvent.from_string(event) + + 
show_log = any([ + event.rank == 0 and self.rank == 0, + event.rank == -1, + ]) and any([ + event.level == PerfLogLevel.SUBMITTION, + event.level == self.level + ]) + + if not show_log: + return + + stacklevel = 1 + if "stacklevel" in kwargs: + stacklevel = kwargs.pop("stacklevel") + + call_info = self.get_caller(stacklevel=stacklevel) + + message = self._encode_message(event, message, call_info) + self.logger.log(self.level.value, message, *args, **kwargs) + + def _encode_message(self, event: LogEventField, + message: Union[str, dict], + call_info: Tuple[str, int]) -> str: + if isinstance(message, str): + message ={LogKeys.other_message: message} + message = OrderedDict({ + LogKeys.event: event.name, + LogKeys.value: message + }) + called_file, lineno = call_info + metadata = { + LogKeys.called_log_file: called_file, + LogKeys.called_log_file_lineno: lineno, + LogKeys.time_ms: self._current_time_ms, + LogKeys.rank: self.rank + } + + message[LogKeys.metadata] = metadata + message = json.dumps(message) + + return self._log_template(message) + + def _log_template(self, message: str): + return LogKeys.log_template.format(header=LogKeys.log_header, message=message) + + def get_caller(self, stacklevel=1) -> Tuple[str, int]: + f = currentframe() + + if stacklevel == 0: + default_file_name = f.f_code.co_filename + default_lineno = f.f_lineno + return (default_file_name, default_lineno) + + # On some versions of IronPython, currentframe() returns None if + # IronPython isn't run with -X:Frames. + if f is not None: + f = f.f_back + orig_f = f + while f and stacklevel > 1: + f = f.f_back + stacklevel -= 1 + if not f: + f = orig_f + rv = ("(unknown file)", -1) + + while hasattr(f, "f_code"): + co = f.f_code + filename = os.path.normcase(co.co_filename) + if filename == _srcfile: + f = f.f_back + continue + rv = (co.co_filename, f.f_lineno) + break + return rv + + + @classmethod + def get_default_logger(cls, rank: int=-1, + level: Union[str, PerfLogLevel]=PerfLogLevel.SUBMITTION, + logger: logging.Logger=None): + if cls._singleton is None: + cls._singleton = cls(rank=rank, level=level, logger=logger) + + return cls._singleton + + + + + + + + + + + + + + + diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/paths.py b/nlp/language_model/bert_sample/pytorch/base/utils/paths.py new file mode 100644 index 000000000..e99480ff7 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/paths.py @@ -0,0 +1,18 @@ +import os.path as ospath + + +MODEL_DIR = ospath.abspath( + ospath.join( + __file__, + "../../../" + ) +) + +CURRENT_MODEL_NAME = ospath.basename(MODEL_DIR) + +PROJ_DIR = ospath.abspath( + ospath.join( + MODEL_DIR, + "../../" + ) +) diff --git a/nlp/language_model/bert_sample/pytorch/base/utils/tokenization.py b/nlp/language_model/bert_sample/pytorch/base/utils/tokenization.py new file mode 100644 index 000000000..f9f96f788 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/base/utils/tokenization.py @@ -0,0 +1,428 @@ +# coding=utf-8 +# Copyright 2020 MLBenchmark Group. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata + +from absl import flags +import six +import tensorflow.compat.v1 as tf + +FLAGS = flags.FLAGS + +flags.DEFINE_bool( + "preserve_unused_tokens", False, + "If True, Wordpiece tokenization will not be applied to words in the vocab." +) + +_UNUSED_TOKEN_RE = re.compile("^\\[unused\\d+\\]$") + + +def preserve_token(token, vocab): + """Returns True if the token should forgo tokenization and be preserved.""" + if not FLAGS.preserve_unused_tokens: + return False + if token not in vocab: + return False + return bool(_UNUSED_TOKEN_RE.search(token)) + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
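+  # Illustrative example (values chosen for this comment only): on Python 3,
+  # printable_text(b"caf\xc3\xa9") and printable_text("café") both return the str "café".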
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + if token not in vocab: + vocab[token] = len(vocab) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, vocab=self.vocab) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + if preserve_token(token, self.vocab): + split_tokens.append(token) + continue + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, vocab=tuple()): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + vocab: A container of tokens to not mutate during tokenization. + """ + self.do_lower_case = do_lower_case + self.vocab = vocab + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
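+    # Illustrative example: _tokenize_chinese_chars inserts spaces around each CJK
+    # character, so "ab高兴cd" becomes "ab 高  兴 cd" (extra whitespace is then collapsed
+    # by whitespace_tokenize), yielding ["ab", "高", "兴", "cd"].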
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if preserve_token(token, self.vocab): + split_tokens.append(token) + continue + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/tests/executables/bert/init_torch.sh b/tests/executables/bert/init_torch.sh new file mode 100644 index 000000000..bc93aa7fd --- /dev/null +++ b/tests/executables/bert/init_torch.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
+# **************************************************************************************************/ + +CURRENT_DIR=$(cd `dirname $0`; pwd) + +set -euox pipefail + +# determine whether the user is root mode to execute this script +prefix_sudo="" +current_user=$(whoami) +if [ "$current_user" != "root" ]; then + echo "User $current_user need to add sudo permission keywords" + prefix_sudo="sudo" +fi + +echo "prefix_sudo= $prefix_sudo" + +source $(cd `dirname $0`; pwd)/../_utils/which_install_tool.sh +if command_exists apt; then + $prefix_sudo apt install -y git numactl +elif command_exists dnf; then + $prefix_sudo dnf install -y git numactl +else + $prefix_sudo yum install -y git numactl +fi + +if [ "$(ulimit -n)" -lt "1048576" ]; then + ulimit -n 1048576 +fi + +# prepare data +cd ${CURRENT_DIR}/../../data/datasets/bert_mini + +if [[ ! -d "${CURRENT_DIR}/../../data/datasets/bert_mini/2048_shards_uncompressed" ]]; then + echo "Unarchive 2048_shards_uncompressed_mini" + tar -zxf 2048_shards_uncompressed_mini.tar.gz +fi +if [[ ! -d "${CURRENT_DIR}/../../data/datasets/bert_mini/eval_set_uncompressed" ]]; then + echo "Unarchive eval_set_uncompressed.tar.gz" + tar -zxf eval_set_uncompressed.tar.gz +fi + +if [[ "$(uname -m)" == "aarch64" ]]; then + set +euox pipefail + source /opt/rh/gcc-toolset-11/enable + set -euox pipefail +fi + + +# install sdk +cd ${CURRENT_DIR}/../../nlp/language_model/bert_sample/pytorch/base +pip3 install -r requirements.txt +$prefix_sudo python3 setup.py install + + + +if [ "$?" != "0" ]; then + echo "init torch : failed." + exit 1 +fi + +echo "init torch : completed." diff --git a/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh b/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh new file mode 100644 index 000000000..311994b05 --- /dev/null +++ b/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +set -euox pipefail + +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=10} + +cd ../../nlp/language_model/bert_sample/pytorch/base +if [ "$?" != "0" ]; then + echo "train status: fail." + exit 1 +fi + + +bash run_training.sh \ +--name default \ +--config V100x1x8 \ +--data_dir ../../../../../../data/datasets/bert_mini/ \ +--max_steps 500 \ +--train_batch_size ${BATCH_SIZE} \ +--target_mlm_accuracy 0.33 \ +--init_checkpoint "../../../../../../data/datasets/bert_mini/model.ckpt-28252.apex.pt" + +if [ "$?" != "0" ]; then + echo "train status: fail." + exit 1 +fi + +echo "train status: pass." 
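
A minimal usage sketch for the wrapper script above, assuming it is invoked from its own directory (tests/executables/bert), as its relative paths (../_utils, ../../nlp/...) require. The script only exposes the batch size through the ": ${BATCH_SIZE:=10}" default, so an environment override is the intended way to change it; the value 16 below is purely illustrative, and the run presumes bert/init_torch.sh has already prepared the bert_mini data.

    # illustrative invocation, not part of the patch
    cd tests/executables/bert
    BATCH_SIZE=16 bash train_bert_default_amp_dist_1x8_torch.sh
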
diff --git a/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh new file mode 100644 index 000000000..1a5a8e0e3 --- /dev/null +++ b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +set -euox pipefail + +: ${BATCH_SIZE:=27} + +cd ../../nlp/language_model/bert_sample/pytorch/base/ +if [ "$?" != "0" ]; then + echo "ERROR: ../../nlp/language_model/bert_sample/pytorch/base/ not exist." + exit 1 +fi + +master_port=22233 +bash run_training.sh --name iluvatar --config 03V100x1x8 --train_batch_size ${BATCH_SIZE} --data_dir ../../../../../../data/datasets/bert_mini/ --master_port $master_port +if [ "$?" != "0" ]; then + echo "eval result: fail." + exit 1 +fi + +echo "eval result: pass." -- Gitee From 513a6d7a94a8633855807de46e14d8556be5da60 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 15:31:58 +0800 Subject: [PATCH 17/20] sync yolov5 all --- tests/executables/yolov5/init_torch.sh | 78 +++++++++++++++++++ .../yolov5/train_yolov5s_coco_amp_torch.sh | 22 ++++++ 2 files changed, 100 insertions(+) create mode 100644 tests/executables/yolov5/init_torch.sh create mode 100644 tests/executables/yolov5/train_yolov5s_coco_amp_torch.sh diff --git a/tests/executables/yolov5/init_torch.sh b/tests/executables/yolov5/init_torch.sh new file mode 100644 index 000000000..8a26be22b --- /dev/null +++ b/tests/executables/yolov5/init_torch.sh @@ -0,0 +1,78 @@ +#!/bin/bash +#This script is to check if needed package is installed, also download dataset and pre-trained weights if not exist. +. ../_utils/install_pip_pkgs.sh + +CURRENT_MODEL_DIR=$(cd `dirname $0`; pwd) +PROJ_DIR="${CURRENT_MODEL_DIR}/../../" +PROJECT_DATA="${PROJ_DIR}/data/datasets" +MODEL_ZOO_DIR="${PROJ_DIR}/data/model_zoo" + +pkgs=('requests' 'matplotlib' 'numpy' 'Pillow' 'scipy' 'tqdm' 'seaborn' 'pandas' 'thop' 'opencv-python' 'pycocotools' '--ignore-installed PyYAML') +install_pip_pkgs "${pkgs[@]}" + +pip3 install tqdm==4.62.1 + +git clone https://gitee.com/deep-spark/deepsparkhub-gpl.git + +cd ${PROJ_DIR}/deepsparkhub-gpl/cv/detection/yolov5-sample/pytorch + +# Remove exist datas +if [[ -d "./datasets/coco128" ]]; then + rm -rf ./datasets/coco128 +fi + +if [[ -d "./datasets/coco" ]]; then + rm -rf ./datasets/coco +fi + +if [[ -d "./weights" ]]; then + rm -rf ./weights +fi +mkdir "weights" + +if [[ -d "./datasets" ]]; then + rm ./datasets +fi + +# Build datas +if [[ ! -d "datasets" ]]; then + echo "ln -s ${PROJECT_DATA} ./datasets" + ln -s ${PROJECT_DATA} ./datasets +fi + +if [[ ! 
-d "${PROJECT_DATA}/coco128" ]];then + if [ -f "${PROJECT_DATA}/coco128.tgz" ]; then + echo "Unarchive coco128.tgz" + tar zxf "${PROJECT_DATA}/coco128.tgz" -C ./datasets/ + else + echo "Error: Not found ${PROJECT_DATA}/coco128.tgz!" + fi +else + echo "Warning: coco128 exist!" +fi + +if [[ -d "${PROJECT_DATA}/coco2017" ]];then + if [[ -f "${PROJECT_DATA}/coco2017labels.zip" ]]; then + echo "Unarchive coco2017labels.zip" + unzip -q -d ./datasets/ "${PROJECT_DATA}/coco2017labels.zip" + + echo "ln -s ${PROJECT_DATA}/coco2017/train2017 ./datasets/coco/images/" + ln -s ${PROJECT_DATA}/coco2017/train2017 ./datasets/coco/images/ + + echo "ln -s ${PROJECT_DATA}/coco2017/val2017 ./datasets/coco/images/" + ln -s ${PROJECT_DATA}/coco2017/val2017 ./datasets/coco/images/ + else + echo "Error: Not found ${PROJECT_DATA}/coco2017labels.zip!" + fi +else + echo "Warning: Not found coco2017!" +fi + +if [[ -f "${MODEL_ZOO_DIR}/yolov5s.pt" ]];then + echo "ln -s ${MODEL_ZOO_DIR}/yolov5s.pt ./weights" + ln -s ${MODEL_ZOO_DIR}/yolov5s.pt ./weights +fi + +if [[ -f "/opt/rh/gcc-toolset-11/enable" ]];then + source /opt/rh/gcc-toolset-11/enable +fi \ No newline at end of file diff --git a/tests/executables/yolov5/train_yolov5s_coco_amp_torch.sh b/tests/executables/yolov5/train_yolov5s_coco_amp_torch.sh new file mode 100644 index 000000000..471532b6a --- /dev/null +++ b/tests/executables/yolov5/train_yolov5s_coco_amp_torch.sh @@ -0,0 +1,22 @@ +CURRENT_DIR=$(cd `dirname $0`; pwd) + +source ../_utils/global_environment_variables.sh + +: ${BATCH_SIZE:=8} + +ROOT_DIR=${CURRENT_DIR}/../.. + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +cd "${ROOT_DIR}/deepsparkhub-gpl/cv/detection/yolov5-sample/pytorch" +ixdltest-check --nonstrict_mode_args="--epoch 1" -b 0.2 --run_script \ +python3 train.py --img-size 640 --batch-size ${BATCH_SIZE} \ + --cfg ./models/yolov5s.yaml --weights ./weights/yolov5s.pt --data ./data/coco.yaml --amp ${nonstrict_mode_args} "$@"; check_status + +exit ${EXIT_STATUS} -- Gitee From c4b5c1b95baf3cd6e1832bab940289b7dd870aab Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 26 Sep 2025 15:47:39 +0800 Subject: [PATCH 18/20] sync _utils script --- .../_utils/fix_import_sklearn_error.sh | 23 +++++++++++++++ ...x_import_sklearn_error_libgomp_d22c30c5.sh | 13 +++++++++ .../_utils/init_classification_paddle.sh | 29 +++++++++++++++++++ .../_utils/init_detection_torch.sh | 14 +++++++++ .../_utils/init_segmentation_torch.sh | 14 +++++++++ .../_utils/init_tf_cnn_benckmark.sh | 26 +++++++++++++++++ .../set_paddle_environment_variables.sh | 2 ++ .../executables/_utils/which_install_tool.sh | 24 +++++++++++++++ 8 files changed, 145 insertions(+) create mode 100644 tests/executables/_utils/fix_import_sklearn_error.sh create mode 100644 tests/executables/_utils/fix_import_sklearn_error_libgomp_d22c30c5.sh create mode 100644 tests/executables/_utils/init_classification_paddle.sh create mode 100644 tests/executables/_utils/init_detection_torch.sh create mode 100644 tests/executables/_utils/init_segmentation_torch.sh create mode 100644 tests/executables/_utils/init_tf_cnn_benckmark.sh create mode 100644 tests/executables/_utils/set_paddle_environment_variables.sh create mode 100644 tests/executables/_utils/which_install_tool.sh diff --git a/tests/executables/_utils/fix_import_sklearn_error.sh b/tests/executables/_utils/fix_import_sklearn_error.sh new file mode 100644 index 000000000..a2662c753 --- /dev/null +++ b/tests/executables/_utils/fix_import_sklearn_error.sh 
@@ -0,0 +1,23 @@ +sys_name_str=`uname -a` +if [[ "${sys_name_str}" =~ "aarch64" ]]; then + if [ -z "$LD_PRELOAD" ]; then + ligo=`find /usr -iname "libgomp.so.1"` + for path in $ligo; do + if [[ "${path}" =~ "libgomp.so.1" ]]; then + export LD_PRELOAD="${path}" + echo "Set LD_PRELOAD="${path}"" + break + fi + done + + ligo=`find /usr -iname "libgomp-d22c30c5.so.1.0.0"` + for path in $ligo; do + if [[ "${path}" =~ "libgomp-d22c30c5.so.1.0.0" ]]; then + export LD_PRELOAD="${path}" + echo "Set LD_PRELOAD="${path}"" + break + fi + done + + fi +fi \ No newline at end of file diff --git a/tests/executables/_utils/fix_import_sklearn_error_libgomp_d22c30c5.sh b/tests/executables/_utils/fix_import_sklearn_error_libgomp_d22c30c5.sh new file mode 100644 index 000000000..3bf7013bf --- /dev/null +++ b/tests/executables/_utils/fix_import_sklearn_error_libgomp_d22c30c5.sh @@ -0,0 +1,13 @@ +sys_name_str=`uname -a` +if [[ "${sys_name_str}" =~ "aarch64" ]]; then + if [ -z "$LD_PRELOAD" ]; then + ligo=`find /usr -iname "libgomp-d22c30c5.so.1.0.0"` + for path in $ligo; do + if [[ "${path}" =~ "libgomp-d22c30c5.so.1.0.0" ]]; then + export LD_PRELOAD="${path}" + echo "Set LD_PRELOAD="${path}"" + break + fi + done + fi +fi \ No newline at end of file diff --git a/tests/executables/_utils/init_classification_paddle.sh b/tests/executables/_utils/init_classification_paddle.sh new file mode 100644 index 000000000..52edc3c7d --- /dev/null +++ b/tests/executables/_utils/init_classification_paddle.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +if [ -n "$1" ]; then + _UTILS_DIR=$1 +else + _UTILS_DIR='../_utils' +fi + +# Install packages +. $_UTILS_DIR/install_pip_pkgs.sh + +pkgs=('scipy' 'scikit-learn==0.23.2' 'opencv-python' 'tqdm' "visualdl==2.3.0") + + +PY_VERSION=$(python3 -V 2>&1|awk '{print $2}'|awk -F '.' '{print $2}') +if [ "$PY_VERSION" == "10" ]; +then + pkgs=('scipy' 'scikit-learn==1.1.0' 'opencv-python' 'tqdm' "visualdl==2.3.0") + echo "$pkgs" +elif [ "$PY_VERSION" == "11" ]; +then + pkgs=('scipy' 'scikit-learn==1.3.1' 'opencv-python' 'tqdm' "visualdl==2.3.0") + echo "$pkgs" +else + pkgs=('scipy' 'scikit-learn==0.24.0' 'opencv-python' 'tqdm' "visualdl==2.3.0") + echo "$pkgs" +fi + +install_pip_pkgs "${pkgs[@]}" diff --git a/tests/executables/_utils/init_detection_torch.sh b/tests/executables/_utils/init_detection_torch.sh new file mode 100644 index 000000000..8661bebc9 --- /dev/null +++ b/tests/executables/_utils/init_detection_torch.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ -n "$1" ]; then + _UTILS_DIR=$1 +else + _UTILS_DIR='../_utils' +fi + +# Install packages +. $_UTILS_DIR/install_pip_pkgs.sh + +pkgs=('scipy' 'matplotlib' 'pycocotools' 'opencv-python' 'easydict' 'tqdm') + +install_pip_pkgs "${pkgs[@]}" \ No newline at end of file diff --git a/tests/executables/_utils/init_segmentation_torch.sh b/tests/executables/_utils/init_segmentation_torch.sh new file mode 100644 index 000000000..d3bae05d0 --- /dev/null +++ b/tests/executables/_utils/init_segmentation_torch.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +if [ -n "$1" ]; then + _UTILS_DIR=$1 +else + _UTILS_DIR='../_utils' +fi + +# Install packages +. 
$_UTILS_DIR/install_pip_pkgs.sh + +pkgs=('scipy==1.12.0' 'matplotlib' 'pycocotools' 'opencv-python' 'easydict' 'tqdm') + +install_pip_pkgs "${pkgs[@]}" \ No newline at end of file diff --git a/tests/executables/_utils/init_tf_cnn_benckmark.sh b/tests/executables/_utils/init_tf_cnn_benckmark.sh new file mode 100644 index 000000000..105325e96 --- /dev/null +++ b/tests/executables/_utils/init_tf_cnn_benckmark.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +CURRENT_DIR=$(cd `dirname $0`; pwd) +ROOT_DIR=${CURRENT_DIR}/../.. +ROOT_DIR=${CURRENT_DIR}/../.. +DATA_DIR=${ROOT_DIR}/data/datasets/imagenette_tfrecord + +if [ -n "$1" ]; then + _UTILS_DIR=$1 +else + _UTILS_DIR='../_utils' +fi + +# Install packages +. $_UTILS_DIR/install_pip_pkgs.sh + +pkgs=('absl-py') + +install_pip_pkgs "${pkgs[@]}" +pip3 install ${_UTILS_DIR}/../../data/packages/DLLogger-1.0.0-py3-none-any.whl + + +if [ ! -d "${DATA_DIR}" ]; then + cd ${ROOT_DIR}/data/datasets/ + tar -xzvf imagenette_tfrecord.tgz +fi diff --git a/tests/executables/_utils/set_paddle_environment_variables.sh b/tests/executables/_utils/set_paddle_environment_variables.sh new file mode 100644 index 000000000..f01b471b2 --- /dev/null +++ b/tests/executables/_utils/set_paddle_environment_variables.sh @@ -0,0 +1,2 @@ +export FLAGS_cudnn_exhaustive_search=True +export FLAGS_cudnn_batchnorm_spatial_persistent=True diff --git a/tests/executables/_utils/which_install_tool.sh b/tests/executables/_utils/which_install_tool.sh new file mode 100644 index 000000000..1110638d2 --- /dev/null +++ b/tests/executables/_utils/which_install_tool.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash + +command_exists() { + command -v "$@" > /dev/null 2>&1 +} + +# install template +# determine whether the user is root mode to execute this script +# prefix_sudo="" +# current_user=$(whoami) +# if [ "$current_user" != "root" ]; then +# echo "User $current_user need to add sudo permission keywords" +# prefix_sudo="sudo" +# fi +# +# echo "prefix_sudo= $prefix_sudo" +# +# if command_exists apt; then +# $prefix_sudo apt install -y +# elif command_exists dnf; then +# $prefix_sudo dnf install -y +# else +# $prefix_sudo yum install -y +# fi -- Gitee From 119b5472fea7179ea2fd7aa500f1cdab9d69981b Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Thu, 9 Oct 2025 14:37:17 +0800 Subject: [PATCH 19/20] fix ci and update readme --- .../conformer/pytorch/README.md | 76 ++ .../resnet50/paddlepaddle/.gitignore | 1 - .../resnet50/paddlepaddle/README.md | 78 +- .../paddlepaddle/ppcls/data/__init__.py | 139 +++ .../DistributedRandomIdentitySampler.py | 90 ++ .../ppcls/data/dataloader}/__init__.py | 0 .../ppcls/data/dataloader/common_dataset.py | 83 ++ .../ppcls/data/dataloader/dali.py | 319 ++++++ .../ppcls/data/dataloader/imagenet_dataset.py | 38 + .../ppcls/data/dataloader/mix_dataset.py | 49 + .../ppcls/data/dataloader/mix_sampler.py | 79 ++ .../data/dataloader/multilabel_dataset.py | 59 ++ .../ppcls/data/dataloader/pk_sampler.py | 105 ++ .../ppcls/data/postprocess/__init__.py | 41 + .../ppcls/data/postprocess/topk.py | 85 ++ .../ppcls/data/preprocess/__init__.py | 100 ++ .../data/preprocess/batch_ops/__init__.py | 1 + .../preprocess/batch_ops/batch_operators.py | 231 +++++ .../ppcls/data/preprocess/ops/__init__.py | 1 + .../ppcls/data/preprocess/ops/autoaugment.py | 264 +++++ .../ppcls/data/preprocess/ops/cutout.py | 41 + .../ppcls/data/preprocess/ops/fmix.py | 217 +++++ .../ppcls/data/preprocess/ops/functional.py | 138 +++ .../ppcls/data/preprocess/ops/grid.py | 89 ++ .../data/preprocess/ops/hide_and_seek.py | 44 + 
.../ppcls/data/preprocess/ops/operators.py | 384 ++++++++ .../ppcls/data/preprocess/ops/randaugment.py | 106 ++ .../data/preprocess/ops/random_erasing.py | 90 ++ .../data/preprocess/ops/timm_autoaugment.py | 877 +++++++++++++++++ .../paddlepaddle/ppcls/data/utils/__init__.py | 13 + .../ppcls/data/utils/get_image_list.py | 49 + .../paddlepaddle/ppcls_2.6/data/__init__.py | 197 ++++ .../DistributedRandomIdentitySampler.py | 135 +++ .../ppcls_2.6/data/dataloader/__init__.py | 16 + .../ppcls_2.6/data/dataloader/cifar.py | 136 +++ .../data/dataloader/common_dataset.py | 88 ++ .../data/dataloader/custom_label_dataset.py | 88 ++ .../ppcls_2.6/data/dataloader/dali.py | 795 +++++++++++++++ .../ppcls_2.6/data/dataloader/face_dataset.py | 144 +++ .../data/dataloader/icartoon_dataset.py | 36 + .../data/dataloader/imagenet_dataset.py | 75 ++ .../ppcls_2.6/data/dataloader/logo_dataset.py | 46 + .../data/dataloader/metabin_sampler.py | 290 ++++++ .../ppcls_2.6/data/dataloader/mix_dataset.py | 49 + .../ppcls_2.6/data/dataloader/mix_sampler.py | 79 ++ .../data/dataloader/multi_scale_dataset.py | 118 +++ .../data/dataloader/multi_scale_sampler.py | 133 +++ .../data/dataloader/multilabel_dataset.py | 65 ++ .../data/dataloader/person_dataset.py | 269 +++++ .../ppcls_2.6/data/dataloader/pk_sampler.py | 134 +++ .../ppcls_2.6/data/dataloader/ra_sampler.py | 57 ++ .../data/dataloader/vehicle_dataset.py | 173 ++++ .../ppcls_2.6/data/postprocess/__init__.py | 44 + .../ppcls_2.6/data/postprocess/attr_rec.py | 284 ++++++ .../ppcls_2.6/data/postprocess/scoreoutput.py | 18 + .../data/postprocess/threshoutput.py | 90 ++ .../ppcls_2.6/data/postprocess/topk.py | 84 ++ .../ppcls_2.6/data/preprocess/__init__.py | 183 ++++ .../data/preprocess/batch_ops/__init__.py | 1 + .../preprocess/batch_ops/batch_operators.py | 501 ++++++++++ .../ppcls_2.6/data/preprocess/ops/__init__.py | 1 + .../data/preprocess/ops/autoaugment.py | 265 +++++ .../ppcls_2.6/data/preprocess/ops/cutout.py | 55 ++ .../data/preprocess/ops/dali_operators.py | 235 +++++ .../ppcls_2.6/data/preprocess/ops/fmix.py | 220 +++++ .../data/preprocess/ops/functional.py | 138 +++ .../ppcls_2.6/data/preprocess/ops/grid.py | 90 ++ .../data/preprocess/ops/hide_and_seek.py | 45 + .../data/preprocess/ops/operators.py | 920 ++++++++++++++++++ .../data/preprocess/ops/randaugment.py | 477 +++++++++ .../data/preprocess/ops/random_erasing.py | 113 +++ .../data/preprocess/ops/timm_autoaugment.py | 878 +++++++++++++++++ .../ppcls_2.6/data/utils/__init__.py | 13 + .../ppcls_2.6/data/utils/get_image_list.py | 59 ++ cv/detection/ssd/pytorch/README.md | 42 +- cv/detection/yolov5-sample/pytorch/README.md | 79 ++ .../fairmot/pytorch/src/gen_labels_17.py | 4 +- .../fairmot/pytorch/src/lib/cfg/mot17.json | 6 +- .../lib/models/networks/config/hrnet_w32.yaml | 2 +- .../pytorch/README.md | 45 - .../pytorch/.gitignore | 0 .../pytorch/LICENSE | 0 .../pytorch/MANIFEST.in | 0 .../stable-diffusion-2.1/pytorch/README.md | 68 +- .../pytorch/build_diffusers.sh | 0 .../pytorch/clean_diffusers.sh | 0 .../pytorch/examples/text_to_image/README.md | 0 .../examples/text_to_image/README_sdxl.md | 0 .../text_to_image/default_config.yaml | 0 .../examples/text_to_image/requirements.txt | 0 .../text_to_image/requirements_flax.txt | 0 .../text_to_image/requirements_sdxl.txt | 0 .../examples/text_to_image/single_config.yaml | 0 .../text_to_image/test_text_to_image.py | 0 .../text_to_image/test_text_to_image_lora.py | 0 .../text_to_image/train_text_to_image.py | 0 .../text_to_image/train_text_to_image_flax.py | 0 
.../text_to_image/train_text_to_image_lora.py | 0 .../train_text_to_image_lora_sdxl.py | 0 .../text_to_image/train_text_to_image_sdxl.py | 0 .../examples/text_to_image/zero2_config.yaml | 0 .../pytorch/install_diffusers.sh | 0 .../pytorch/run_sd_1.5.sh | 0 .../pytorch/run_sd_1.5_single.sh | 0 .../pytorch/run_sd_2.1.sh | 0 .../pytorch/run_sd_2.1_single.sh | 0 .../pytorch/run_sd_xl.sh | 0 .../pytorch/setup.py | 0 .../pytorch/src/diffusers/__init__.py | 0 .../src/diffusers/commands/__init__.py | 0 .../src/diffusers/commands/diffusers_cli.py | 0 .../pytorch/src/diffusers/commands/env.py | 0 .../diffusers/commands/fp16_safetensors.py | 0 .../src/diffusers/configuration_utils.py | 0 .../diffusers/dependency_versions_check.py | 0 .../diffusers/dependency_versions_table.py | 0 .../src/diffusers/experimental/README.md | 0 .../src/diffusers/experimental/__init__.py | 0 .../src/diffusers/experimental/rl/__init__.py | 0 .../experimental/rl/value_guided_sampling.py | 0 .../pytorch/src/diffusers/image_processor.py | 0 .../pytorch/src/diffusers/loaders/__init__.py | 0 .../src/diffusers/loaders/autoencoder.py | 0 .../src/diffusers/loaders/controlnet.py | 0 .../src/diffusers/loaders/ip_adapter.py | 0 .../pytorch/src/diffusers/loaders/lora.py | 0 .../loaders/lora_conversion_utils.py | 0 .../pytorch/src/diffusers/loaders/peft.py | 0 .../src/diffusers/loaders/single_file.py | 0 .../diffusers/loaders/single_file_utils.py | 0 .../diffusers/loaders/textual_inversion.py | 0 .../pytorch/src/diffusers/loaders/unet.py | 0 .../pytorch/src/diffusers/loaders/utils.py | 0 .../pytorch/src/diffusers/models/README.md | 0 .../pytorch/src/diffusers/models/__init__.py | 0 .../src/diffusers/models/activations.py | 0 .../pytorch/src/diffusers/models/adapter.py | 0 .../pytorch/src/diffusers/models/attention.py | 0 .../src/diffusers/models/attention_flax.py | 0 .../diffusers/models/attention_processor.py | 0 .../diffusers/models/autoencoders/__init__.py | 0 .../autoencoders/autoencoder_asym_kl.py | 0 .../models/autoencoders/autoencoder_kl.py | 0 .../autoencoder_kl_temporal_decoder.py | 0 .../models/autoencoders/autoencoder_tiny.py | 0 .../autoencoders/consistency_decoder_vae.py | 0 .../src/diffusers/models/autoencoders/vae.py | 0 .../src/diffusers/models/controlnet.py | 0 .../src/diffusers/models/controlnet_flax.py | 0 .../src/diffusers/models/downsampling.py | 0 .../diffusers/models/dual_transformer_2d.py | 0 .../src/diffusers/models/embeddings.py | 0 .../src/diffusers/models/embeddings_flax.py | 0 .../pytorch/src/diffusers/models/lora.py | 0 .../models/modeling_flax_pytorch_utils.py | 0 .../diffusers/models/modeling_flax_utils.py | 0 .../src/diffusers/models/modeling_outputs.py | 0 .../models/modeling_pytorch_flax_utils.py | 0 .../src/diffusers/models/modeling_utils.py | 0 .../diffusers/models/nhwc_groupnorm/Welford.h | 0 .../models/nhwc_groupnorm/__init__.py} | 0 .../models/nhwc_groupnorm/custom_gn.cpp | 0 .../models/nhwc_groupnorm/custom_gn.py | 0 .../models/nhwc_groupnorm/gn_kernel.cu | 0 .../models/nhwc_groupnorm/gn_kernel.h | 0 .../models/nhwc_groupnorm/nchw_kernel.cu | 0 .../diffusers/models/nhwc_groupnorm/vecs.h | 0 .../src/diffusers/models/normalization.py | 0 .../src/diffusers/models/prior_transformer.py | 0 .../pytorch/src/diffusers/models/resnet.py | 0 .../src/diffusers/models/resnet_flax.py | 0 .../diffusers/models/t5_film_transformer.py | 0 .../src/diffusers/models/transformer_2d.py | 0 .../diffusers/models/transformer_temporal.py | 0 .../diffusers/models/transformers/__init__.py | 0 
.../transformers/dual_transformer_2d.py | 0 .../models/transformers/prior_transformer.py | 0 .../transformers/t5_film_transformer.py | 0 .../models/transformers/transformer_2d.py | 0 .../transformers/transformer_temporal.py | 0 .../pytorch/src/diffusers/models/unet_1d.py | 0 .../src/diffusers/models/unet_1d_blocks.py | 0 .../pytorch/src/diffusers/models/unet_2d.py | 0 .../src/diffusers/models/unet_2d_blocks.py | 0 .../src/diffusers/models/unet_2d_condition.py | 0 .../src/diffusers/models/unets/__init__.py | 0 .../src/diffusers/models/unets/unet_1d.py | 0 .../diffusers/models/unets/unet_1d_blocks.py | 0 .../src/diffusers/models/unets/unet_2d.py | 0 .../diffusers/models/unets/unet_2d_blocks.py | 0 .../models/unets/unet_2d_blocks_flax.py | 0 .../models/unets/unet_2d_condition.py | 0 .../models/unets/unet_2d_condition_flax.py | 0 .../diffusers/models/unets/unet_3d_blocks.py | 0 .../models/unets/unet_3d_condition.py | 0 .../diffusers/models/unets/unet_i2vgen_xl.py | 0 .../diffusers/models/unets/unet_kandinsky3.py | 0 .../models/unets/unet_motion_model.py | 0 .../unets/unet_spatio_temporal_condition.py | 0 .../models/unets/unet_stable_cascade.py | 0 .../src/diffusers/models/unets/uvit_2d.py | 0 .../src/diffusers/models/upsampling.py | 0 .../pytorch/src/diffusers/models/vae_flax.py | 0 .../pytorch/src/diffusers/models/vq_model.py | 0 .../pytorch/src/diffusers/optimization.py | 0 .../pytorch/src/diffusers/pipelines/README.md | 0 .../src/diffusers/pipelines/__init__.py | 0 .../diffusers/pipelines/amused/__init__.py | 0 .../pipelines/amused/pipeline_amused.py | 0 .../amused/pipeline_amused_img2img.py | 0 .../amused/pipeline_amused_inpaint.py | 0 .../pipelines/animatediff/__init__.py | 0 .../animatediff/pipeline_animatediff.py | 0 .../pipeline_animatediff_video2video.py | 0 .../pipelines/animatediff/pipeline_output.py | 0 .../diffusers/pipelines/audioldm/__init__.py | 0 .../pipelines/audioldm/pipeline_audioldm.py | 0 .../diffusers/pipelines/audioldm2/__init__.py | 0 .../pipelines/audioldm2/modeling_audioldm2.py | 0 .../pipelines/audioldm2/pipeline_audioldm2.py | 0 .../src/diffusers/pipelines/auto_pipeline.py | 0 .../pipelines/blip_diffusion/__init__.py | 0 .../blip_diffusion/blip_image_processing.py | 0 .../blip_diffusion/modeling_blip2.py | 0 .../blip_diffusion/modeling_ctx_clip.py | 0 .../blip_diffusion/pipeline_blip_diffusion.py | 0 .../pipelines/consistency_models/__init__.py | 0 .../pipeline_consistency_models.py | 0 .../pipelines/controlnet/__init__.py | 0 .../pipelines/controlnet/multicontrolnet.py | 0 .../controlnet/pipeline_controlnet.py | 0 .../pipeline_controlnet_blip_diffusion.py | 0 .../controlnet/pipeline_controlnet_img2img.py | 0 .../controlnet/pipeline_controlnet_inpaint.py | 0 .../pipeline_controlnet_inpaint_sd_xl.py | 0 .../controlnet/pipeline_controlnet_sd_xl.py | 0 .../pipeline_controlnet_sd_xl_img2img.py | 0 .../controlnet/pipeline_flax_controlnet.py | 0 .../pipelines/dance_diffusion/__init__.py | 0 .../pipeline_dance_diffusion.py | 0 .../src/diffusers/pipelines/ddim/__init__.py | 0 .../diffusers/pipelines/ddim/pipeline_ddim.py | 0 .../src/diffusers/pipelines/ddpm/__init__.py | 0 .../diffusers/pipelines/ddpm/pipeline_ddpm.py | 0 .../pipelines/deepfloyd_if/__init__.py | 0 .../pipelines/deepfloyd_if/pipeline_if.py | 0 .../deepfloyd_if/pipeline_if_img2img.py | 0 .../pipeline_if_img2img_superresolution.py | 0 .../deepfloyd_if/pipeline_if_inpainting.py | 0 .../pipeline_if_inpainting_superresolution.py | 0 .../pipeline_if_superresolution.py | 0 
.../pipelines/deepfloyd_if/pipeline_output.py | 0 .../pipelines/deepfloyd_if/safety_checker.py | 0 .../pipelines/deepfloyd_if/timesteps.py | 0 .../pipelines/deepfloyd_if/watermark.py | 0 .../diffusers/pipelines/deprecated/README.md | 0 .../pipelines/deprecated/__init__.py | 0 .../deprecated/alt_diffusion/__init__.py | 0 .../alt_diffusion/modeling_roberta_series.py | 0 .../alt_diffusion/pipeline_alt_diffusion.py | 0 .../pipeline_alt_diffusion_img2img.py | 0 .../alt_diffusion/pipeline_output.py | 0 .../deprecated/audio_diffusion/__init__.py | 0 .../deprecated/audio_diffusion/mel.py | 0 .../pipeline_audio_diffusion.py | 0 .../latent_diffusion_uncond/__init__.py | 0 .../pipeline_latent_diffusion_uncond.py | 0 .../pipelines/deprecated/pndm/__init__.py | 0 .../deprecated/pndm/pipeline_pndm.py | 0 .../pipelines/deprecated/repaint/__init__.py | 0 .../deprecated/repaint/pipeline_repaint.py | 0 .../deprecated/score_sde_ve/__init__.py | 0 .../score_sde_ve/pipeline_score_sde_ve.py | 0 .../spectrogram_diffusion/__init__.py | 0 .../continuous_encoder.py | 0 .../spectrogram_diffusion/midi_utils.py | 0 .../spectrogram_diffusion/notes_encoder.py | 0 .../pipeline_spectrogram_diffusion.py | 0 .../stable_diffusion_variants/__init__.py | 0 .../pipeline_cycle_diffusion.py | 0 ...ne_onnx_stable_diffusion_inpaint_legacy.py | 0 ...ipeline_stable_diffusion_inpaint_legacy.py | 0 ...pipeline_stable_diffusion_model_editing.py | 0 .../pipeline_stable_diffusion_paradigms.py | 0 .../pipeline_stable_diffusion_pix2pix_zero.py | 0 .../stochastic_karras_ve/__init__.py | 0 .../pipeline_stochastic_karras_ve.py | 0 .../versatile_diffusion/__init__.py | 0 .../versatile_diffusion/modeling_text_unet.py | 0 .../pipeline_versatile_diffusion.py | 0 ...ipeline_versatile_diffusion_dual_guided.py | 0 ...ine_versatile_diffusion_image_variation.py | 0 ...eline_versatile_diffusion_text_to_image.py | 0 .../deprecated/vq_diffusion/__init__.py | 0 .../vq_diffusion/pipeline_vq_diffusion.py | 0 .../src/diffusers/pipelines/dit/__init__.py | 0 .../diffusers/pipelines/dit/pipeline_dit.py | 0 .../diffusers/pipelines/free_init_utils.py | 0 .../diffusers/pipelines/i2vgen_xl/__init__.py | 0 .../pipelines/i2vgen_xl/pipeline_i2vgen_xl.py | 0 .../diffusers/pipelines/kandinsky/__init__.py | 0 .../pipelines/kandinsky/pipeline_kandinsky.py | 0 .../kandinsky/pipeline_kandinsky_combined.py | 0 .../kandinsky/pipeline_kandinsky_img2img.py | 0 .../kandinsky/pipeline_kandinsky_inpaint.py | 0 .../kandinsky/pipeline_kandinsky_prior.py | 0 .../pipelines/kandinsky/text_encoder.py | 0 .../pipelines/kandinsky2_2/__init__.py | 0 .../kandinsky2_2/pipeline_kandinsky2_2.py | 0 .../pipeline_kandinsky2_2_combined.py | 0 .../pipeline_kandinsky2_2_controlnet.py | 0 ...ipeline_kandinsky2_2_controlnet_img2img.py | 0 .../pipeline_kandinsky2_2_img2img.py | 0 .../pipeline_kandinsky2_2_inpainting.py | 0 .../pipeline_kandinsky2_2_prior.py | 0 .../pipeline_kandinsky2_2_prior_emb2emb.py | 0 .../pipelines/kandinsky3/__init__.py | 0 .../kandinsky3/convert_kandinsky3_unet.py | 0 .../kandinsky3/pipeline_kandinsky3.py | 0 .../kandinsky3/pipeline_kandinsky3_img2img.py | 0 .../latent_consistency_models/__init__.py | 0 .../pipeline_latent_consistency_img2img.py | 0 .../pipeline_latent_consistency_text2img.py | 0 .../pipelines/latent_diffusion/__init__.py | 0 .../pipeline_latent_diffusion.py | 0 ...peline_latent_diffusion_superresolution.py | 0 .../diffusers/pipelines/ledits_pp/__init__.py | 0 .../pipeline_leditspp_stable_diffusion.py | 0 .../pipeline_leditspp_stable_diffusion_xl.py | 0 
.../pipelines/ledits_pp/pipeline_output.py | 0 .../diffusers/pipelines/musicldm/__init__.py | 0 .../pipelines/musicldm/pipeline_musicldm.py | 0 .../src/diffusers/pipelines/onnx_utils.py | 0 .../pipelines/paint_by_example/__init__.py | 0 .../paint_by_example/image_encoder.py | 0 .../pipeline_paint_by_example.py | 0 .../src/diffusers/pipelines/pia/__init__.py | 0 .../diffusers/pipelines/pia/pipeline_pia.py | 0 .../pipelines/pipeline_flax_utils.py | 0 .../pipelines/pipeline_loading_utils.py | 0 .../src/diffusers/pipelines/pipeline_utils.py | 0 .../pipelines/pixart_alpha/__init__.py | 0 .../pixart_alpha/pipeline_pixart_alpha.py | 0 .../semantic_stable_diffusion/__init__.py | 0 .../pipeline_output.py | 0 .../pipeline_semantic_stable_diffusion.py | 0 .../diffusers/pipelines/shap_e/__init__.py | 0 .../src/diffusers/pipelines/shap_e/camera.py | 0 .../pipelines/shap_e/pipeline_shap_e.py | 0 .../shap_e/pipeline_shap_e_img2img.py | 0 .../diffusers/pipelines/shap_e/renderer.py | 0 .../pipelines/stable_cascade/__init__.py | 0 .../stable_cascade/pipeline_stable_cascade.py | 0 .../pipeline_stable_cascade_combined.py | 0 .../pipeline_stable_cascade_prior.py | 0 .../pipelines/stable_diffusion/README.md | 0 .../pipelines/stable_diffusion/__init__.py | 0 .../clip_image_project_model.py | 0 .../stable_diffusion/convert_from_ckpt.py | 0 .../pipeline_flax_stable_diffusion.py | 0 .../pipeline_flax_stable_diffusion_img2img.py | 0 .../pipeline_flax_stable_diffusion_inpaint.py | 0 .../pipeline_onnx_stable_diffusion.py | 0 .../pipeline_onnx_stable_diffusion_img2img.py | 0 .../pipeline_onnx_stable_diffusion_inpaint.py | 0 .../pipeline_onnx_stable_diffusion_upscale.py | 0 .../stable_diffusion/pipeline_output.py | 0 .../pipeline_stable_diffusion.py | 0 .../pipeline_stable_diffusion_depth2img.py | 0 ...peline_stable_diffusion_image_variation.py | 0 .../pipeline_stable_diffusion_img2img.py | 0 .../pipeline_stable_diffusion_inpaint.py | 0 ...eline_stable_diffusion_instruct_pix2pix.py | 0 ...ipeline_stable_diffusion_latent_upscale.py | 0 .../pipeline_stable_diffusion_upscale.py | 0 .../pipeline_stable_unclip.py | 0 .../pipeline_stable_unclip_img2img.py | 0 .../stable_diffusion/safety_checker.py | 0 .../stable_diffusion/safety_checker_flax.py | 0 .../stable_unclip_image_normalizer.py | 0 .../__init__.py | 0 ...line_stable_diffusion_attend_and_excite.py | 0 .../stable_diffusion_diffedit/__init__.py | 0 .../pipeline_stable_diffusion_diffedit.py | 0 .../stable_diffusion_gligen/__init__.py | 0 .../pipeline_stable_diffusion_gligen.py | 0 ...line_stable_diffusion_gligen_text_image.py | 0 .../stable_diffusion_k_diffusion/__init__.py | 0 .../pipeline_stable_diffusion_k_diffusion.py | 0 ...ipeline_stable_diffusion_xl_k_diffusion.py | 0 .../stable_diffusion_ldm3d/__init__.py | 0 .../pipeline_stable_diffusion_ldm3d.py | 0 .../stable_diffusion_panorama/__init__.py | 0 .../pipeline_stable_diffusion_panorama.py | 0 .../stable_diffusion_safe/__init__.py | 0 .../stable_diffusion_safe/pipeline_output.py | 0 .../pipeline_stable_diffusion_safe.py | 0 .../stable_diffusion_safe/safety_checker.py | 0 .../stable_diffusion_sag/__init__.py | 0 .../pipeline_stable_diffusion_sag.py | 0 .../pipelines/stable_diffusion_xl/__init__.py | 0 .../pipeline_flax_stable_diffusion_xl.py | 0 .../stable_diffusion_xl/pipeline_output.py | 0 .../pipeline_stable_diffusion_xl.py | 0 .../pipeline_stable_diffusion_xl_img2img.py | 0 .../pipeline_stable_diffusion_xl_inpaint.py | 0 ...ne_stable_diffusion_xl_instruct_pix2pix.py | 0 .../stable_diffusion_xl/watermark.py | 0 
.../stable_video_diffusion/__init__.py | 0 .../pipeline_stable_video_diffusion.py | 0 .../pipelines/t2i_adapter/__init__.py | 0 .../pipeline_stable_diffusion_adapter.py | 0 .../pipeline_stable_diffusion_xl_adapter.py | 0 .../text_to_video_synthesis/__init__.py | 0 .../pipeline_output.py | 0 .../pipeline_text_to_video_synth.py | 0 .../pipeline_text_to_video_synth_img2img.py | 0 .../pipeline_text_to_video_zero.py | 0 .../pipeline_text_to_video_zero_sdxl.py | 0 .../diffusers/pipelines/unclip/__init__.py | 0 .../pipelines/unclip/pipeline_unclip.py | 0 .../unclip/pipeline_unclip_image_variation.py | 0 .../diffusers/pipelines/unclip/text_proj.py | 0 .../pipelines/unidiffuser/__init__.py | 0 .../unidiffuser/modeling_text_decoder.py | 0 .../pipelines/unidiffuser/modeling_uvit.py | 0 .../unidiffuser/pipeline_unidiffuser.py | 0 .../pipelines/wuerstchen/__init__.py | 0 .../wuerstchen/modeling_paella_vq_model.py | 0 .../wuerstchen/modeling_wuerstchen_common.py | 0 .../modeling_wuerstchen_diffnext.py | 0 .../wuerstchen/modeling_wuerstchen_prior.py | 0 .../wuerstchen/pipeline_wuerstchen.py | 0 .../pipeline_wuerstchen_combined.py | 0 .../wuerstchen/pipeline_wuerstchen_prior.py | 0 .../pytorch/src/diffusers/py.typed | 0 .../src/diffusers/schedulers/README.md | 0 .../src/diffusers/schedulers/__init__.py | 0 .../schedulers/deprecated/__init__.py | 0 .../deprecated/scheduling_karras_ve.py | 0 .../deprecated/scheduling_sde_vp.py | 0 .../diffusers/schedulers/scheduling_amused.py | 0 .../scheduling_consistency_decoder.py | 0 .../scheduling_consistency_models.py | 0 .../diffusers/schedulers/scheduling_ddim.py | 0 .../schedulers/scheduling_ddim_flax.py | 0 .../schedulers/scheduling_ddim_inverse.py | 0 .../schedulers/scheduling_ddim_parallel.py | 0 .../diffusers/schedulers/scheduling_ddpm.py | 0 .../schedulers/scheduling_ddpm_flax.py | 0 .../schedulers/scheduling_ddpm_parallel.py | 0 .../schedulers/scheduling_ddpm_wuerstchen.py | 0 .../schedulers/scheduling_deis_multistep.py | 0 .../scheduling_dpmsolver_multistep.py | 0 .../scheduling_dpmsolver_multistep_flax.py | 0 .../scheduling_dpmsolver_multistep_inverse.py | 0 .../schedulers/scheduling_dpmsolver_sde.py | 0 .../scheduling_dpmsolver_singlestep.py | 0 .../scheduling_edm_dpmsolver_multistep.py | 0 .../schedulers/scheduling_edm_euler.py | 0 .../scheduling_euler_ancestral_discrete.py | 0 .../schedulers/scheduling_euler_discrete.py | 0 .../scheduling_euler_discrete_flax.py | 0 .../schedulers/scheduling_heun_discrete.py | 0 .../diffusers/schedulers/scheduling_ipndm.py | 0 .../scheduling_k_dpm_2_ancestral_discrete.py | 0 .../schedulers/scheduling_k_dpm_2_discrete.py | 0 .../schedulers/scheduling_karras_ve_flax.py | 0 .../diffusers/schedulers/scheduling_lcm.py | 0 .../schedulers/scheduling_lms_discrete.py | 0 .../scheduling_lms_discrete_flax.py | 0 .../diffusers/schedulers/scheduling_pndm.py | 0 .../schedulers/scheduling_pndm_flax.py | 0 .../schedulers/scheduling_repaint.py | 0 .../schedulers/scheduling_sasolver.py | 0 .../diffusers/schedulers/scheduling_sde_ve.py | 0 .../schedulers/scheduling_sde_ve_flax.py | 0 .../diffusers/schedulers/scheduling_tcd.py | 0 .../diffusers/schedulers/scheduling_unclip.py | 0 .../schedulers/scheduling_unipc_multistep.py | 0 .../diffusers/schedulers/scheduling_utils.py | 0 .../schedulers/scheduling_utils_flax.py | 0 .../schedulers/scheduling_vq_diffusion.py | 0 .../pytorch/src/diffusers/training_utils.py | 0 .../pytorch/src/diffusers/utils/__init__.py | 0 .../src/diffusers/utils/accelerate_utils.py | 0 
.../pytorch/src/diffusers/utils/constants.py | 0 .../src/diffusers/utils/deprecation_utils.py | 0 .../pytorch/src/diffusers/utils/doc_utils.py | 0 .../dummy_flax_and_transformers_objects.py | 0 .../src/diffusers/utils/dummy_flax_objects.py | 0 .../diffusers/utils/dummy_note_seq_objects.py | 0 .../src/diffusers/utils/dummy_onnx_objects.py | 0 .../src/diffusers/utils/dummy_pt_objects.py | 0 .../utils/dummy_torch_and_librosa_objects.py | 0 .../utils/dummy_torch_and_scipy_objects.py | 0 .../utils/dummy_torch_and_torchsde_objects.py | 0 ...nd_transformers_and_k_diffusion_objects.py | 0 ...torch_and_transformers_and_onnx_objects.py | 0 .../dummy_torch_and_transformers_objects.py | 0 ...sformers_and_torch_and_note_seq_objects.py | 0 .../diffusers/utils/dynamic_modules_utils.py | 0 .../src/diffusers/utils/export_utils.py | 0 .../pytorch/src/diffusers/utils/hub_utils.py | 0 .../src/diffusers/utils/import_utils.py | 0 .../src/diffusers/utils/loading_utils.py | 0 .../pytorch/src/diffusers/utils/logging.py | 0 .../diffusers/utils/model_card_template.md | 0 .../pytorch/src/diffusers/utils/outputs.py | 0 .../pytorch/src/diffusers/utils/peft_utils.py | 0 .../pytorch/src/diffusers/utils/pil_utils.py | 0 .../src/diffusers/utils/state_dict_utils.py | 0 .../src/diffusers/utils/testing_utils.py | 0 .../src/diffusers/utils/torch_utils.py | 0 .../pytorch/src/diffusers/utils/versions.py | 0 .../bert_sample/pytorch/README.md | 76 ++ .../pytorch/default/config/config_V100x1x1.py | 38 + .../default/config/config_V100x1x16.py | 38 + .../pytorch/default/config/config_V100x1x4.py | 38 + .../pytorch/default/config/config_V100x1x8.py | 38 + .../pytorch/default/config/training_event.py | 85 ++ .../iluvatar/config/config_00V100x1x8.py | 56 ++ .../iluvatar/config/config_01V100x1x8.py | 57 ++ .../iluvatar/config/config_02V100x1x8.py | 57 ++ .../iluvatar/config/config_03V100x1x16.py | 59 ++ .../iluvatar/config/config_03V100x1x8.py | 58 ++ .../iluvatar/config/config_V100x1x1.py | 44 + .../iluvatar/config/config_V100x1x8.py | 57 ++ .../iluvatar/config/config_V100x2x8.py | 56 ++ .../iluvatar/config/config_V100x4x8.py | 57 ++ .../pytorch/iluvatar/config/config_common.py | 102 ++ .../pytorch/iluvatar/config/converter.py | 192 ++++ .../iluvatar/config/distributed_fused_lamb.py | 136 +++ .../iluvatar/config/environment_variables.sh | 69 ++ .../iluvatar/config/layers/__init__.py | 15 + .../iluvatar/config/layers/attention.py | 67 ++ .../pytorch/iluvatar/config/layers/bmm1.py | 150 +++ .../pytorch/iluvatar/config/layers/bmm2.py | 144 +++ .../pytorch/iluvatar/config/layers/fmha.py | 179 ++++ .../pytorch/iluvatar/config/layers/mha.py | 142 +++ .../pytorch/iluvatar/config/layers/softmax.py | 149 +++ .../pytorch/iluvatar/config/training_event.py | 243 +++++ .../pytorch/iluvatar/csrc/mha_funcs.cu | 405 ++++++++ .../iluvatar/results/iluvatar-1x8-stage-1.txt | 191 ++++ .../iluvatar/results/iluvatar-1x8-stage-2.txt | 189 ++++ .../iluvatar/results/iluvatar-1x8-stage-3.txt | 185 ++++ .../pytorch/iluvatar/results/iluvatar-1x8.txt | 226 +++++ .../train_bert_default_amp_dist_1x8_torch.sh | 2 +- .../train_bert_pretraining_amp_dist_1x8_tf.sh | 2 +- ...ain_bert_pretraining_amp_dist_1x8_torch.sh | 4 +- tests/executables/conformer/init_torch.sh | 2 +- ...in_conformer_librispeech_dist_1x8_torch.sh | 2 +- .../dali/train_resnet50_dali_torch.sh | 2 +- tests/executables/fairmot/init_torch.sh | 2 +- .../train_fairmot_hrnet32_dist_torch.sh | 2 +- .../train_maskrcnn_resnet50_amp_torch.sh | 2 +- tests/executables/mobilenetv3/init_torch.sh | 2 +- 
.../train_mobilenetv3_large_amp_torch.sh | 2 +- tests/executables/resnet/init_paddle.sh | 2 +- .../resnet/train_resnet50_amp_torch.sh | 2 +- .../resnet/train_resnet50_dist_paddle.sh | 2 +- .../resnet/train_resnet50_dist_tf.sh | 2 +- .../retinanet/train_retinanet_amp_torch.sh | 2 +- tests/executables/ssd/init_tf.sh | 2 +- tests/executables/ssd/init_torch.sh | 2 +- tests/executables/ssd/train_ssd_amp_tf.sh | 2 +- tests/executables/ssd/train_ssd_amp_torch.sh | 50 +- .../stable-diffusion/init_torch.sh | 2 +- .../train_sd2.1_pokemon_dist_1x8_torch.sh | 2 +- ...ain_unet3d_kits19_stage3_dist_1x8_torch.sh | 2 +- tests/executables/yolov5/init_torch.sh | 2 +- 571 files changed, 15479 insertions(+), 187 deletions(-) create mode 100644 audio/speech_recognition/conformer/pytorch/README.md create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/DistributedRandomIdentitySampler.py rename {multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm => cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader}/__init__.py (100%) create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/common_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/dali.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/imagenet_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/multilabel_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/pk_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/topk.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/batch_operators.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/autoaugment.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/cutout.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/fmix.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/functional.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/grid.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/hide_and_seek.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/operators.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/randaugment.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/random_erasing.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/timm_autoaugment.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls/data/utils/__init__.py create mode 100644 
cv/classification/resnet50/paddlepaddle/ppcls/data/utils/get_image_list.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/DistributedRandomIdentitySampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/cifar.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/common_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/custom_label_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/dali.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/face_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/icartoon_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/imagenet_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/logo_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/metabin_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multilabel_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/person_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/pk_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/ra_sampler.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/vehicle_dataset.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/attr_rec.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/scoreoutput.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/threshoutput.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/topk.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/batch_operators.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/autoaugment.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/cutout.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/dali_operators.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/fmix.py create mode 
100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/functional.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/grid.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/hide_and_seek.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/operators.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/randaugment.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/random_erasing.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/timm_autoaugment.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/__init__.py create mode 100644 cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/get_image_list.py create mode 100644 cv/detection/yolov5-sample/pytorch/README.md delete mode 100644 multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/.gitignore (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/LICENSE (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/MANIFEST.in (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/build_diffusers.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/clean_diffusers.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/README_sdxl.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/default_config.yaml (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/requirements.txt (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/requirements_flax.txt (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/requirements_sdxl.txt (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/single_config.yaml (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/test_text_to_image.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/test_text_to_image_lora.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/train_text_to_image.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/train_text_to_image_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/train_text_to_image_lora.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/train_text_to_image_sdxl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/examples/text_to_image/zero2_config.yaml (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/install_diffusers.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/run_sd_1.5.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/run_sd_1.5_single.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/run_sd_2.1.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/run_sd_2.1_single.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/run_sd_xl.sh (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/setup.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/commands/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/commands/diffusers_cli.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/commands/env.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/commands/fp16_safetensors.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/configuration_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/dependency_versions_check.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/dependency_versions_table.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/experimental/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/experimental/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/experimental/rl/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/image_processor.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/autoencoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/controlnet.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/ip_adapter.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/lora.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/lora_conversion_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/peft.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/single_file.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/single_file_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/textual_inversion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/unet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/loaders/utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/activations.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/adapter.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/attention.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/attention_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/attention_processor.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/autoencoders/vae.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/controlnet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/models/controlnet_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/downsampling.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/dual_transformer_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/embeddings.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/embeddings_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/lora.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/modeling_flax_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/modeling_outputs.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/modeling_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon/pytorch/src/diffusers/py.typed => stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py} (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/normalization.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/prior_transformer.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/resnet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/resnet_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/t5_film_transformer.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/models/transformer_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformer_temporal.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformers/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformers/prior_transformer.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformers/t5_film_transformer.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformers/transformer_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/transformers/transformer_temporal.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unet_1d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unet_1d_blocks.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unet_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unet_2d_blocks.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unet_2d_condition.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_1d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_1d_blocks.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_2d_blocks.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_2d_condition.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_3d_blocks.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_3d_condition.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_kandinsky3.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_motion_model.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/unet_stable_cascade.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/unets/uvit_2d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/upsampling.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/vae_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/models/vq_model.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/optimization.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/amused/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/animatediff/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/audioldm/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/audioldm2/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/auto_pipeline.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/consistency_models/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ddim/__init__.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ddpm/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/dit/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/free_init_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/musicldm/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/onnx_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pia/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pipeline_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/shap_e/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/shap_e/camera.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/shap_e/renderer.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon 
=> stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py 
(100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unclip/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unclip/text_proj.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => 
stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py (100%) create mode 100644 multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/py.typed rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/README.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/deprecated/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_amused.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddim.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddpm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_ipndm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_lcm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py (100%) rename 
multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_pndm.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_repaint.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_sasolver.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_tcd.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_unclip.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/training_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/__init__.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/accelerate_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/constants.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/deprecation_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/doc_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_flax_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_note_seq_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_onnx_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_pt_objects.py 
(100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/dynamic_modules_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/export_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/hub_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/import_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/loading_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/logging.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/model_card_template.md (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/outputs.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/peft_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/pil_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/state_dict_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/testing_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/torch_utils.py (100%) rename multimodal/diffusion_model/{stable-diffusion-2.1-pokemon => stable-diffusion-2.1}/pytorch/src/diffusers/utils/versions.py (100%) create mode 100644 nlp/language_model/bert_sample/pytorch/README.md create mode 100644 nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x1.py create mode 100644 nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x16.py create mode 100644 nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x4.py create mode 100644 
nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/default/config/training_event.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_00V100x1x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_01V100x1x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_02V100x1x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x16.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x1.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x2x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x4x8.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/config_common.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/converter.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/distributed_fused_lamb.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/environment_variables.sh create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/__init__.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/attention.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm1.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm2.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/fmha.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/mha.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/softmax.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/config/training_event.py create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/csrc/mha_funcs.cu create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-1.txt create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-2.txt create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-3.txt create mode 100644 nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8.txt diff --git a/audio/speech_recognition/conformer/pytorch/README.md b/audio/speech_recognition/conformer/pytorch/README.md new file mode 100644 index 000000000..a2acb0c3e --- /dev/null +++ b/audio/speech_recognition/conformer/pytorch/README.md @@ -0,0 +1,76 @@ +# Conformer + +## Model Description + +Recently Transformer and Convolution neural network (CNN) based models have shown promising results in Automatic Speech +Recognition (ASR), outperforming Recurrent neural networks (RNNs). Transformer models are good at capturing +content-based global interactions, while CNNs exploit local features effectively. In this work, we achieve the best of +both worlds by studying how to combine convolution neural networks and transformers to model both local and global +dependencies of an audio sequence in a parameter-efficient way. To this regard, we propose the convolution-augmented +transformer for speech recognition, named Conformer. 
Conformer significantly outperforms the previous Transformer and +CNN based models achieving state-of-the-art accuracies. On the widely used LibriSpeech benchmark, our model achieves WER +of 2.1%/4.3% without using a language model and 1.9%/3.9% with an external language model on test/testother. We also +observe competitive performance of 2.7%/6.3% with a small model of only 10M parameters. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| BI-V150 | 4.3.0 | 25.12 | + +## Model Preparation + +### Prepare Resources +```bash +mkdir -p data/datasets/LibriSpeech +cd data/datasets/LibriSpeech +wget http://files.deepspark.org.cn:880/deepspark/conformer/LibriSpeech/dev-clean.tar.gz +wget http://files.deepspark.org.cn:880/deepspark/conformer/LibriSpeech/train-clean-100.tar.gz +tar -xf dev-clean.tar.gz +tar -xf train-clean-100.tar.gz +mv LibriSpeech/* ./ + +└── data/datasets/LibriSpeech + ├── train-clean-100.tar.gz + ├── dev-clean.tar.gz + ├── dev-clean + └── train-clean-100 + └── ... + +mkdir -p data/model_zoo/sentencepieces +cd data/model_zoo/sentencepieces +wget http://files.deepspark.org.cn:880/deepspark/conformer/sentencepieces/sp.model +wget http://files.deepspark.org.cn:880/deepspark/conformer/sentencepieces/sp.vocab +``` + +### Install Dependencies + +```sh +apt install -y numactl libsndfile1 +pip3 install http://files.deepspark.org.cn:880/deepspark/conformer/IXPyLogger-1.0.0-py3-none-any.whl +pip3 install numpy==1.26.4 +pip3 install -r requirements.txt + +wget https://librosa.org/data/audio/admiralbob77_-_Choice_-_Drum-bass.ogg +mkdir -p ~/.cache/librosa/ +mv admiralbob77_-_Choice_-_Drum-bass.ogg ~/.cache/librosa/ +``` + +## Model Training + +```bash +bash run_training.sh --data_dir=./data \ + --max_steps=800 \ + --quality_target=1.6 \ + --batch_size=8 \ + --eval_freq=400 \ + --ddp \ + --max_steps=800 \ + --quality_target=1.6 \ + --eval_freq=400 +``` + +## Model Results +| GPU | tps | wer | +|-------------|------|-----------------------| +| BI-V150 x 8 | 127.7341 | 1.4652 | diff --git a/cv/classification/resnet50/paddlepaddle/.gitignore b/cv/classification/resnet50/paddlepaddle/.gitignore index dcf07c225..11bc9cbf4 100644 --- a/cv/classification/resnet50/paddlepaddle/.gitignore +++ b/cv/classification/resnet50/paddlepaddle/.gitignore @@ -1,4 +1,3 @@ -data/ dataset/ __pycache__/ *.pyc diff --git a/cv/classification/resnet50/paddlepaddle/README.md b/cv/classification/resnet50/paddlepaddle/README.md index d21b52a02..06d43cd45 100644 --- a/cv/classification/resnet50/paddlepaddle/README.md +++ b/cv/classification/resnet50/paddlepaddle/README.md @@ -12,76 +12,54 @@ computer vision applications, serving as a backbone for various tasks like objec | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | -| BI-V150 | 4.2.0 | 25.03 | -| BI-V100 | 2.3.0 | 22.12 | +| BI-V150 | 4.3.0 | 25.12 | ## Model Preparation ### Prepare Resources -Sign up and login in [ImageNet official website](https://www.image-net.org/index.php), then choose 'Download' to -download the whole ImageNet dataset. Specify `/path/to/imagenet` to your ImageNet path in later training process. - -The ImageNet dataset path structure should look like: - ```bash -imagenet -├── train -│ └── n01440764 -│ ├── n01440764_10026.JPEG -│ └── ... 
-├── train_list.txt -├── val -│ └── n01440764 -│ ├── ILSVRC2012_val_00000293.JPEG -│ └── ... +mkdir -p data/datasets/flowers102 +cd data/datasets/flowers102 +wget http://files.deepspark.org.cn:880/deepspark/data/datasets/flowers102.tgz +tar -xf flowers102.tgz + +data/datasets/flowers102 +├── jpg +│ └── image_00000.jpg +│ ├── image_00001.jpg +│ └── ... +├── flowers102_label_list.txt +├── train_extra_list.txt └── val_list.txt ``` ### Install Dependencies -```bash -# Install libGL -## CentOS -yum install -y mesa-libGL -## Ubuntu -apt install -y libgl1-mesa-glx - -git clone https://github.com/PaddlePaddle/PaddleClas.git -b release/2.6 --depth=1 -cd PaddleClas -pip3 install -r requirements.txt -python3 setup.py install -``` - -Tips: for `PaddleClas` training, the images path in train_list.txt and val_list.txt must contain `train/` and `val/` directories: - -- train_list.txt: train/n01440764/n01440764_10026.JPEG 0 -- val_list.txt: val/n01667114/ILSVRC2012_val_00000229.JPEG 35 +Contact the Iluvatar administrator to get the missing packages: + - paddlepaddle-3.0.0+corex.4.3.0-cp310-cp310-linux_x86_64.whl ```bash -# add "train/" and "val/" to head of lines -sed -i 's#^#train/#g' train_list.txt -sed -i 's#^#val/#g' val_list.txt +mkdir -p dataset +ln -s ${DATASET_DIR}/flowers102 dataset/flowers102 + +export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python +pip3 install protobuf==3.20.3 +pip3 install pyyaml +pip3 install -r requirements.txt +rm -rf ppcls && ln -s ppcls_2.6 ppcls ``` -## Step 3: Run ResNet50 +## Model Training ```bash -# Make sure your dataset path is the same as above -cd PaddleClas -# Link your dataset to default location -ln -s /path/to/imagenet ./dataset/ILSVRC2012 -export FLAGS_cudnn_exhaustive_search=True -export FLAGS_cudnn_batchnorm_spatial_persistent=True -export CUDA_VISIBLE_DEVICES=0,1,2,3 -python3 -u -m paddle.distributed.launch --gpus=0,1,2,3 tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50.yaml -o Arch.pretrained=False -o Global.device=gpu +bash run_resnet50_dist.sh ``` ## Model Results -| Model | GPU | FP32 | -|----------|------------|------------------------------------| -| ResNet50 | BI-V100 x4 | Acc@1=76.27,FPS=80.37,BatchSize=64 | +| Model | GPU | CELoss | loss | top1 | top5 | +|----------|------------|----------|--------|----------|----------| +| ResNet50 | BI-V150 x8 | 4.80621 | 4.80621 | 0.05000 | 0.18529| ## Reference -- [PaddleClas](https://github.com/PaddlePaddle/PaddleClas) \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/__init__.py new file mode 100644 index 000000000..939c945a0 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/__init__.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
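# A minimal usage sketch for the build_dataloader() defined in this module, assuming
# the PaddleClas-style nested config (mode -> dataset/sampler/loader). The paths,
# batch size, and transform values below are illustrative assumptions, not taken from
# a shipped config file, and the list files must exist for the dataset asserts to pass.
# (Also assumes ppcls.utils.logger has been initialized by the training entry point.)
import paddle
from ppcls.data import build_dataloader

example_cfg = {
    "Train": {
        "dataset": {
            "name": "ImageNetDataset",
            "image_root": "./dataset/flowers102",
            "cls_label_path": "./dataset/flowers102/train_extra_list.txt",
            "transform_ops": [
                {"DecodeImage": {"to_rgb": True, "channel_first": False}},
                {"RandCropImage": {"size": 224}},
                {"RandFlipImage": {"flip_code": 1}},
                {"NormalizeImage": {"scale": 1.0 / 255, "order": ""}},
            ],
        },
        "sampler": {"name": "DistributedBatchSampler", "batch_size": 64,
                    "shuffle": True, "drop_last": False},
        "loader": {"num_workers": 4, "use_shared_memory": True},
    }
}

device = paddle.set_device("gpu")  # or "cpu"; forwarded to paddle.io.DataLoader as places
train_loader = build_dataloader(example_cfg, "Train", device, use_dali=False)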
+ +import inspect +import copy +import paddle +import numpy as np +from paddle.io import DistributedBatchSampler, BatchSampler, DataLoader +from ppcls.utils import logger + +from ppcls.data import dataloader +# dataset +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.common_dataset import create_operators + +# sampler +from ppcls.data.dataloader.DistributedRandomIdentitySampler import DistributedRandomIdentitySampler +from ppcls.data.dataloader.pk_sampler import PKSampler +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data import preprocess +from ppcls.data.preprocess import transform + + +def create_operators(params, class_num=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(params, list), ('operator config should be a list') + ops = [] + for operator in params: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + op_func = getattr(preprocess, op_name) + if "class_num" in inspect.getfullargspec(op_func).args: + param.update({"class_num": class_num}) + op = op_func(**param) + ops.append(op) + + return ops + + +def build_dataloader(config, mode, device, use_dali=False, seed=None): + assert mode in [ + 'Train', 'Eval', 'Test', 'Gallery', 'Query' + ], "Dataset mode should be Train, Eval, Test, Gallery, Query" + # build dataset + if use_dali: + from ppcls.data.dataloader.dali import dali_dataloader + return dali_dataloader(config, mode, paddle.device.get_device(), seed) + + class_num = config.get("class_num", None) + config_dataset = config[mode]['dataset'] + config_dataset = copy.deepcopy(config_dataset) + dataset_name = config_dataset.pop('name') + if 'batch_transform_ops' in config_dataset: + batch_transform = config_dataset.pop('batch_transform_ops') + else: + batch_transform = None + + dataset = eval(dataset_name)(**config_dataset) + + logger.debug("build dataset({}) success...".format(dataset)) + + # build sampler + config_sampler = config[mode]['sampler'] + if "name" not in config_sampler: + batch_sampler = None + batch_size = config_sampler["batch_size"] + drop_last = config_sampler["drop_last"] + shuffle = config_sampler["shuffle"] + else: + sampler_name = config_sampler.pop("name") + batch_sampler = eval(sampler_name)(dataset, **config_sampler) + + logger.debug("build batch_sampler({}) success...".format(batch_sampler)) + + # build batch operator + def mix_collate_fn(batch): + batch = transform(batch, batch_ops) + # batch each field + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + if isinstance(batch_transform, list): + batch_ops = create_operators(batch_transform, class_num) + batch_collate_fn = mix_collate_fn + else: + batch_collate_fn = None + + # build dataloader + config_loader = config[mode]['loader'] + num_workers = config_loader["num_workers"] + use_shared_memory = config_loader["use_shared_memory"] + + if batch_sampler is None: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=batch_collate_fn) + else: + data_loader = DataLoader( + dataset=dataset, + 
places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_sampler=batch_sampler, + collate_fn=batch_collate_fn) + + logger.debug("build data_loader({}) success...".format(data_loader)) + return data_loader diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/DistributedRandomIdentitySampler.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/DistributedRandomIdentitySampler.py new file mode 100644 index 000000000..1203803f5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/DistributedRandomIdentitySampler.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from collections import defaultdict +import numpy as np +import copy +import random +from paddle.io import DistributedBatchSampler, Sampler + + +class DistributedRandomIdentitySampler(DistributedBatchSampler): + """ + Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size is N*K. + Args: + - data_source (list): list of (img_path, pid, camid). + - num_instances (int): number of instances per identity in a batch. + - batch_size (int): number of examples in a batch. 
+ """ + + def __init__(self, dataset, batch_size, num_instances, drop_last, **args): + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.drop_last = drop_last + self.num_pids_per_batch = self.batch_size // self.num_instances + self.index_dic = defaultdict(list) + for index, pid in enumerate(self.dataset.labels): + self.index_dic[pid].append(index) + self.pids = list(self.index_dic.keys()) + # estimate number of examples in an epoch + self.length = 0 + for pid in self.pids: + idxs = self.index_dic[pid] + num = len(idxs) + if num < self.num_instances: + num = self.num_instances + self.length += num - num % self.num_instances + + def __iter__(self): + batch_idxs_dict = defaultdict(list) + for pid in self.pids: + idxs = copy.deepcopy(self.index_dic[pid]) + if len(idxs) < self.num_instances: + idxs = np.random.choice( + idxs, size=self.num_instances, replace=True) + random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(idx) + if len(batch_idxs) == self.num_instances: + batch_idxs_dict[pid].append(batch_idxs) + batch_idxs = [] + avai_pids = copy.deepcopy(self.pids) + final_idxs = [] + while len(avai_pids) >= self.num_pids_per_batch: + selected_pids = random.sample(avai_pids, self.num_pids_per_batch) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.remove(pid) + _sample_iter = iter(final_idxs) + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + if self.drop_last: + return self.length // self.batch_size + else: + return (self.length + self.batch_size - 1) // self.batch_size diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py rename to cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/__init__.py diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/common_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/common_dataset.py new file mode 100644 index 000000000..278cc3b08 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/common_dataset.py @@ -0,0 +1,83 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np + +from paddle.io import Dataset + +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.utils import logger + + +def create_operators(params): + """ + create operators based on the config + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(params, list), ('operator config should be a list') + ops = [] + for operator in params: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + op = getattr(preprocess, op_name)(**param) + ops.append(op) + + return ops + + +class CommonDataset(Dataset): + def __init__( + self, + image_root, + cls_label_path, + transform_ops=None, ): + self._img_root = image_root + self._cls_path = cls_label_path + if transform_ops: + self._transform_ops = create_operators(transform_ops) + + self.images = [] + self.labels = [] + self._load_anno() + + def _load_anno(self): + pass + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx]) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/dali.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/dali.py new file mode 100644 index 000000000..a15c23156 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/dali.py @@ -0,0 +1,319 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
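# Sketch of the DALI code path implemented below: build_dataloader(..., use_dali=True)
# forwards the same per-mode config to dali_dataloader(), which accepts only the fixed
# transform sets asserted in that function (train: DecodeImage / RandCropImage /
# RandFlipImage / NormalizeImage). Requires an NVIDIA DALI install and a GPU device
# string such as "gpu:0"; all config values here are illustrative assumptions.
from ppcls.data import build_dataloader

dali_cfg = {
    "Train": {
        "dataset": {
            "image_root": "./dataset/flowers102",
            "cls_label_path": "./dataset/flowers102/train_extra_list.txt",
            "transform_ops": [
                {"DecodeImage": {}},
                {"RandCropImage": {"size": 224}},
                {"RandFlipImage": {}},
                {"NormalizeImage": {"scale": "1.0/255"}},
            ],
        },
        "sampler": {"name": "DistributedBatchSampler", "batch_size": 64},
    }
}
# train_iter = build_dataloader(dali_cfg, "Train", "gpu:0", use_dali=True)
# for batch in train_iter:
#     images, labels = batch[0]["data"], batch[0]["label"]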
+ +from __future__ import division + +import copy +import os + +import numpy as np +import nvidia.dali.ops as ops +import nvidia.dali.types as types +import paddle +from nvidia.dali import fn +from nvidia.dali.pipeline import Pipeline +from nvidia.dali.plugin.base_iterator import LastBatchPolicy +from nvidia.dali.plugin.paddle import DALIGenericIterator + + +class HybridTrainPipe(Pipeline): + def __init__(self, + file_root, + file_list, + batch_size, + resize_shorter, + crop, + min_area, + lower, + upper, + interp, + mean, + std, + device_id, + shard_id=0, + num_shards=1, + random_shuffle=True, + num_threads=4, + seed=42, + pad_output=False, + output_dtype=types.FLOAT, + dataset='Train'): + super(HybridTrainPipe, self).__init__( + batch_size, num_threads, device_id, seed=seed) + self.input = ops.readers.File( + file_root=file_root, + file_list=file_list, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=random_shuffle) + # set internal nvJPEG buffers size to handle full-sized ImageNet images + # without additional reallocations + device_memory_padding = 211025920 + host_memory_padding = 140544512 + self.decode = ops.decoders.ImageRandomCrop( + device='mixed', + output_type=types.DALIImageType.RGB, + device_memory_padding=device_memory_padding, + host_memory_padding=host_memory_padding, + random_aspect_ratio=[lower, upper], + random_area=[min_area, 1.0], + num_attempts=100) + self.res = ops.Resize( + device='gpu', resize_x=crop, resize_y=crop, interp_type=interp) + self.cmnp = ops.CropMirrorNormalize( + device="gpu", + dtype=output_dtype, + output_layout='CHW', + crop=(crop, crop), + mean=mean, + std=std, + pad_output=pad_output) + self.coin = ops.random.CoinFlip(probability=0.5) + self.to_int64 = ops.Cast(dtype=types.DALIDataType.INT64, device="gpu") + + def define_graph(self): + rng = self.coin() + jpegs, labels = self.input(name="Reader") + images = self.decode(jpegs) + images = self.res(images) + output = self.cmnp(images.gpu(), mirror=rng) + return [output, self.to_int64(labels.gpu())] + + def __len__(self): + return self.epoch_size("Reader") + + +class HybridValPipe(Pipeline): + def __init__(self, + file_root, + file_list, + batch_size, + resize_shorter, + crop, + interp, + mean, + std, + device_id, + shard_id=0, + num_shards=1, + random_shuffle=False, + num_threads=4, + seed=42, + pad_output=False, + output_dtype=types.FLOAT): + super(HybridValPipe, self).__init__( + batch_size, num_threads, device_id, seed=seed) + self.input = ops.readers.File( + file_root=file_root, + file_list=file_list, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=random_shuffle) + self.decode = ops.decoders.Image(device="mixed") + self.res = ops.Resize( + device="gpu", resize_shorter=resize_shorter, interp_type=interp) + self.cmnp = ops.CropMirrorNormalize( + device="gpu", + dtype=output_dtype, + output_layout='CHW', + crop=(crop, crop), + mean=mean, + std=std, + pad_output=pad_output) + self.to_int64 = ops.Cast(dtype=types.DALIDataType.INT64, device="gpu") + + def define_graph(self): + jpegs, labels = self.input(name="Reader") + images = self.decode(jpegs) + images = self.res(images) + output = self.cmnp(images) + return [output, self.to_int64(labels.gpu())] + + def __len__(self): + return self.epoch_size("Reader") + + +def dali_dataloader(config, mode, device, seed=None): + assert "gpu" in device, "gpu training is required for DALI" + device_id = int(device.split(':')[1]) + config_dataloader = config[mode] + seed = 42 if seed is None else seed + ops = [ + list(x.keys())[0] + for 
x in config_dataloader["dataset"]["transform_ops"] + ] + support_ops_train = [ + "DecodeImage", "NormalizeImage", "RandFlipImage", "RandCropImage" + ] + support_ops_eval = [ + "DecodeImage", "ResizeImage", "CropImage", "NormalizeImage" + ] + + if mode.lower() == 'train': + assert set(ops) == set( + support_ops_train + ), "The supported trasform_ops for train_dataset in dali is : {}".format( + ",".join(support_ops_train)) + else: + assert set(ops) == set( + support_ops_eval + ), "The supported trasform_ops for eval_dataset in dali is : {}".format( + ",".join(support_ops_eval)) + + normalize_ops = [ + op for op in config_dataloader["dataset"]["transform_ops"] + if "NormalizeImage" in op + ][0]["NormalizeImage"] + channel_num = normalize_ops.get("channel_num", 3) + output_dtype = types.FLOAT16 if normalize_ops.get("output_fp16", + False) else types.FLOAT + + env = os.environ + # assert float(env.get('FLAGS_fraction_of_gpu_memory_to_use', 0.92)) < 0.9, \ + # "Please leave enough GPU memory for DALI workspace, e.g., by setting" \ + # " `export FLAGS_fraction_of_gpu_memory_to_use=0.8`" + + gpu_num = paddle.distributed.get_world_size() + + batch_size = config_dataloader["sampler"]["batch_size"] + + file_root = config_dataloader["dataset"]["image_root"] + file_list = config_dataloader["dataset"]["cls_label_path"] + + interp = 1 # settings.interpolation or 1 # default to linear + interp_map = { + 0: types.DALIInterpType.INTERP_NN, # cv2.INTER_NEAREST + 1: types.DALIInterpType.INTERP_LINEAR, # cv2.INTER_LINEAR + 2: types.DALIInterpType.INTERP_CUBIC, # cv2.INTER_CUBIC + 3: types.DALIInterpType. + INTERP_LANCZOS3, # XXX use LANCZOS3 for cv2.INTER_LANCZOS4 + } + + assert interp in interp_map, "interpolation method not supported by DALI" + interp = interp_map[interp] + pad_output = channel_num == 4 + + transforms = { + k: v + for d in config_dataloader["dataset"]["transform_ops"] + for k, v in d.items() + } + + scale = transforms["NormalizeImage"].get("scale", 1.0 / 255) + scale = eval(scale) if isinstance(scale, str) else scale + mean = transforms["NormalizeImage"].get("mean", [0.485, 0.456, 0.406]) + std = transforms["NormalizeImage"].get("std", [0.229, 0.224, 0.225]) + mean = [v / scale for v in mean] + std = [v / scale for v in std] + + sampler_name = config_dataloader["sampler"].get("name", + "DistributedBatchSampler") + assert sampler_name in ["DistributedBatchSampler", "BatchSampler"] + + if mode.lower() == "train": + resize_shorter = 256 + crop = transforms["RandCropImage"]["size"] + scale = transforms["RandCropImage"].get("scale", [0.08, 1.]) + ratio = transforms["RandCropImage"].get("ratio", [3.0 / 4, 4.0 / 3]) + min_area = scale[0] + lower = ratio[0] + upper = ratio[1] + + if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env: + shard_id = int(env['PADDLE_TRAINER_ID']) + num_shards = int(env['PADDLE_TRAINERS_NUM']) + device_id = int(env['FLAGS_selected_gpus']) + pipe = HybridTrainPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + min_area, + lower, + upper, + interp, + mean, + std, + device_id, + shard_id, + num_shards, + seed=seed + shard_id, + pad_output=pad_output, + output_dtype=output_dtype) + pipe.build() + pipelines = [pipe] + # sample_per_shard = len(pipe) // num_shards + else: + pipe = HybridTrainPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + min_area, + lower, + upper, + interp, + mean, + std, + device_id=device_id, + shard_id=0, + num_shards=1, + seed=seed, + pad_output=pad_output, + output_dtype=output_dtype) + 
pipe.build() + pipelines = [pipe] + # sample_per_shard = len(pipelines[0]) + return DALIGenericIterator( + pipelines, ['data', 'label'], reader_name='Reader') + else: + resize_shorter = transforms["ResizeImage"].get("resize_short", 256) + crop = transforms["CropImage"]["size"] + if 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env and sampler_name == "DistributedBatchSampler": + shard_id = int(env['PADDLE_TRAINER_ID']) + num_shards = int(env['PADDLE_TRAINERS_NUM']) + device_id = int(env['FLAGS_selected_gpus']) + + pipe = HybridValPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + interp, + mean, + std, + device_id=device_id, + shard_id=shard_id, + num_shards=num_shards, + pad_output=pad_output, + output_dtype=output_dtype) + else: + pipe = HybridValPipe( + file_root, + file_list, + batch_size, + resize_shorter, + crop, + interp, + mean, + std, + device_id=device_id, + pad_output=pad_output, + output_dtype=output_dtype) + pipe.build() + return DALIGenericIterator( + [pipe], ['data', 'label'], reader_name="Reader") diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/imagenet_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/imagenet_dataset.py new file mode 100644 index 000000000..1166ab385 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/imagenet_dataset.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ImageNetDataset(CommonDataset): + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for l in lines: + l = l.strip().split(" ") + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_dataset.py new file mode 100644 index 000000000..cbf4b4028 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_dataset.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from paddle.io import Dataset +from .. import dataloader + + +class MixDataset(Dataset): + def __init__(self, datasets_config): + super().__init__() + self.dataset_list = [] + start_idx = 0 + end_idx = 0 + for config_i in datasets_config: + dataset_name = config_i.pop('name') + dataset = getattr(dataloader, dataset_name)(**config_i) + end_idx += len(dataset) + self.dataset_list.append([end_idx, start_idx, dataset]) + start_idx = end_idx + + self.length = end_idx + + def __getitem__(self, idx): + for dataset_i in self.dataset_list: + if dataset_i[0] > idx: + dataset_i_idx = idx - dataset_i[1] + return dataset_i[2][dataset_i_idx] + + def __len__(self): + return self.length + + def get_dataset_list(self): + return self.dataset_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_sampler.py new file mode 100644 index 000000000..2df3109ce --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/mix_sampler.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
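# How the MixDataset (above) and the MixSampler (below) fit together, with
# illustrative numbers: MixDataset concatenates sub-datasets and records
# [end_idx, start_idx, dataset] ranges, so global index i is served by the first
# range whose end_idx > i. MixSampler then fills each batch with a per-dataset
# share given by "ratio" and shifts every sampled local index by that dataset's
# start_idx so it addresses the concatenated MixDataset.
#
#   sub-dataset A: 10_000 samples -> global ids      0 ..  9_999
#   sub-dataset B:  2_000 samples -> global ids 10_000 .. 11_999
#   batch_size=64, ratios 0.75 / 0.25 -> 48 indices from A + 16 from B per batch,
#   with B's indices offset by 10_000.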
+ +from __future__ import absolute_import +from __future__ import division + +from paddle.io import DistributedBatchSampler, Sampler + +from ppcls.utils import logger +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data import dataloader + + +class MixSampler(DistributedBatchSampler): + def __init__(self, dataset, batch_size, sample_configs, iter_per_epoch): + super().__init__(dataset, batch_size) + assert isinstance(dataset, + MixDataset), "MixSampler only support MixDataset" + self.sampler_list = [] + self.batch_size = batch_size + self.start_list = [] + self.length = iter_per_epoch + dataset_list = dataset.get_dataset_list() + batch_size_left = self.batch_size + self.iter_list = [] + for i, config_i in enumerate(sample_configs): + self.start_list.append(dataset_list[i][1]) + sample_method = config_i.pop("name") + ratio_i = config_i.pop("ratio") + if i < len(sample_configs) - 1: + batch_size_i = int(self.batch_size * ratio_i) + batch_size_left -= batch_size_i + else: + batch_size_i = batch_size_left + assert batch_size_i <= len(dataset_list[i][2]) + config_i["batch_size"] = batch_size_i + if sample_method == "DistributedBatchSampler": + sampler_i = DistributedBatchSampler(dataset_list[i][2], + **config_i) + else: + sampler_i = getattr(dataloader, sample_method)( + dataset_list[i][2], **config_i) + self.sampler_list.append(sampler_i) + self.iter_list.append(iter(sampler_i)) + self.length += len(dataset_list[i][2]) * ratio_i + self.iter_counter = 0 + + def __iter__(self): + while self.iter_counter < self.length: + batch = [] + for i, iter_i in enumerate(self.iter_list): + batch_i = next(iter_i, None) + if batch_i is None: + iter_i = iter(self.sampler_list[i]) + self.iter_list[i] = iter_i + batch_i = next(iter_i, None) + assert batch_i is not None, "dataset {} return None".format( + i) + batch += [idx + self.start_list[i] for idx in batch_i] + if len(batch) == self.batch_size: + self.iter_counter += 1 + yield batch + else: + logger.info("Some dataset reaches end") + self.iter_counter = 0 + + def __len__(self): + return self.length diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/multilabel_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/multilabel_dataset.py new file mode 100644 index 000000000..2c1ed7703 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/multilabel_dataset.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
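# Annotation format expected by the MultiLabelDataset defined below: each line is
# "<image path>\t<comma-separated integers>", e.g. "images/0001.jpg\t0,1,0,0,1"
# (the path is an illustrative assumption). __getitem__ returns the integer list
# unchanged as a float32 vector, so in the usual multi-label setup the second
# field is already the per-class 0/1 indicator vector.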
+ +from __future__ import print_function + +import numpy as np +import os +import cv2 + +from ppcls.data.preprocess import transform +from ppcls.utils import logger + +from .common_dataset import CommonDataset + + +class MultiLabelDataset(CommonDataset): + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + self.images.append(os.path.join(self._img_root, l[0])) + + labels = l[1].split(',') + labels = [np.int64(i) for i in labels] + + self.labels.append(labels) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + label = np.array(self.labels[idx]).astype("float32") + return (img, label) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/pk_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/pk_sampler.py new file mode 100644 index 000000000..bf563a6c1 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/dataloader/pk_sampler.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from collections import defaultdict +import numpy as np +import random +from paddle.io import DistributedBatchSampler + +from ppcls.utils import logger + + +class PKSampler(DistributedBatchSampler): + """ + First, randomly sample P identities. + Then for each identity randomly sample K instances. + Therefore batch size is P*K, and the sampler called PKSampler. + Args: + dataset (paddle.io.Dataset): list of (img_path, pid, cam_id). + sample_per_id(int): number of instances per identity in a batch. + batch_size (int): number of examples in a batch. + shuffle(bool): whether to shuffle indices order before generating + batch indices. Default False. + """ + + def __init__(self, + dataset, + batch_size, + sample_per_id, + shuffle=True, + drop_last=True, + sample_method="sample_avg_prob"): + super().__init__( + dataset, batch_size, shuffle=shuffle, drop_last=drop_last) + assert batch_size % sample_per_id == 0, \ + "PKSampler configs error, Sample_per_id must be a divisor of batch_size." + assert hasattr(self.dataset, + "labels"), "Dataset must have labels attribute." 
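        # P*K batching: with batch_size=64 and sample_per_id=4 (illustrative numbers)
        # each batch draws 64 // 4 = 16 identities and 4 instances per identity.
        # sample_method controls how identities are drawn: "id_avg_prob" weights all
        # labels equally, "sample_avg_prob" weights them by their sample counts.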
+ self.sample_per_label = sample_per_id + self.label_dict = defaultdict(list) + self.sample_method = sample_method + for idx, label in enumerate(self.dataset.labels): + self.label_dict[label].append(idx) + self.label_list = list(self.label_dict) + assert len(self.label_list) * self.sample_per_label > self.batch_size, \ + "batch size should be smaller than " + if self.sample_method == "id_avg_prob": + self.prob_list = np.array([1 / len(self.label_list)] * + len(self.label_list)) + elif self.sample_method == "sample_avg_prob": + counter = [] + for label_i in self.label_list: + counter.append(len(self.label_dict[label_i])) + self.prob_list = np.array(counter) / sum(counter) + else: + logger.error( + "PKSampler only support id_avg_prob and sample_avg_prob sample method, " + "but receive {}.".format(self.sample_method)) + diff = np.abs(sum(self.prob_list) - 1) + if diff > 0.00000001: + self.prob_list[-1] = 1 - sum(self.prob_list[:-1]) + if self.prob_list[-1] > 1 or self.prob_list[-1] < 0: + logger.error("PKSampler prob list error") + else: + logger.info( + "PKSampler: sum of prob list not equal to 1, diff is {}, change the last prob".format(diff) + ) + + def __iter__(self): + label_per_batch = self.batch_size // self.sample_per_label + for _ in range(len(self)): + batch_index = [] + batch_label_list = np.random.choice( + self.label_list, + size=label_per_batch, + replace=False, + p=self.prob_list) + for label_i in batch_label_list: + label_i_indexes = self.label_dict[label_i] + if self.sample_per_label <= len(label_i_indexes): + batch_index.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_label, + replace=False)) + else: + batch_index.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_label, + replace=True)) + if not self.drop_last or len(batch_index) == self.batch_size: + yield batch_index diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/__init__.py new file mode 100644 index 000000000..831a4da00 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import importlib + +from . 
import topk + +from .topk import Topk, MultiLabelTopk + + +def build_postprocess(config): + config = copy.deepcopy(config) + model_name = config.pop("name") + mod = importlib.import_module(__name__) + postprocess_func = getattr(mod, model_name)(**config) + return postprocess_func + + +class DistillationPostProcess(object): + def __init__(self, model_name="Student", key=None, func="Topk", **kargs): + super().__init__() + self.func = eval(func)(**kargs) + self.model_name = model_name + self.key = key + + def __call__(self, x, file_names=None): + x = x[self.model_name] + if self.key is not None: + x = x[self.key] + return self.func(x, file_names=file_names) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/topk.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/topk.py new file mode 100644 index 000000000..9c1371bfd --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/postprocess/topk.py @@ -0,0 +1,85 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.nn.functional as F + + +class Topk(object): + def __init__(self, topk=1, class_id_map_file=None): + assert isinstance(topk, (int, )) + self.class_id_map = self.parse_class_id_map(class_id_map_file) + self.topk = topk + + def parse_class_id_map(self, class_id_map_file): + if class_id_map_file is None: + return None + if not os.path.exists(class_id_map_file): + print( + "Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!" 
+ ) + return None + + try: + class_id_map = {} + with open(class_id_map_file, "r") as fin: + lines = fin.readlines() + for line in lines: + partition = line.split("\n")[0].partition(" ") + class_id_map[int(partition[0])] = str(partition[-1]) + except Exception as ex: + print(ex) + class_id_map = None + return class_id_map + + def __call__(self, x, file_names=None, multilabel=False): + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.softmax(x, axis=-1) if not multilabel else F.sigmoid(x) + x = x.numpy() + y = [] + for idx, probs in enumerate(x): + index = probs.argsort(axis=0)[-self.topk:][::-1].astype( + "int32") if not multilabel else np.where( + probs >= 0.5)[0].astype("int32") + clas_id_list = [] + score_list = [] + label_name_list = [] + for i in index: + clas_id_list.append(i.item()) + score_list.append(probs[i].item()) + if self.class_id_map is not None: + label_name_list.append(self.class_id_map[i.item()]) + result = { + "class_ids": clas_id_list, + "scores": np.around( + score_list, decimals=5).tolist(), + } + if file_names is not None: + result["file_name"] = file_names[idx] + if label_name_list is not None: + result["label_names"] = label_name_list + y.append(result) + return y + + +class MultiLabelTopk(Topk): + def __init__(self, topk=1, class_id_map_file=None): + super().__init__() + + def __call__(self, x, file_names=None): + return super().__call__(x, file_names, multilabel=True) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/__init__.py new file mode 100644 index 000000000..075ee8927 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/__init__.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
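# A minimal sketch of the transform() helper defined in this module: it simply
# threads a sample through the configured ops in order, and the wrapper classes
# below (AutoAugment, RandAugment, TimmAutoAugment) only exist to bridge numpy
# arrays to these PIL-based augmenters. The random 224x224 image stands in for a
# decoded sample and is an illustrative assumption.
import numpy as np
from ppcls.data.preprocess import transform, RandFlipImage, NormalizeImage, ToCHWImage

img = (np.random.rand(224, 224, 3) * 255).astype("uint8")  # HWC uint8, as after decode/crop
ops = [RandFlipImage(flip_code=1),
       NormalizeImage(scale=1.0 / 255, order=""),
       ToCHWImage()]
out = transform(img, ops)  # applied in sequence; result is a CHW float32 array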
+ +from ppcls.data.preprocess.ops.autoaugment import ImageNetPolicy as RawImageNetPolicy +from ppcls.data.preprocess.ops.randaugment import RandAugment as RawRandAugment +from ppcls.data.preprocess.ops.timm_autoaugment import RawTimmAutoAugment +from ppcls.data.preprocess.ops.cutout import Cutout + +from ppcls.data.preprocess.ops.hide_and_seek import HideAndSeek +from ppcls.data.preprocess.ops.random_erasing import RandomErasing +from ppcls.data.preprocess.ops.grid import GridMask + +from ppcls.data.preprocess.ops.operators import DecodeImage +from ppcls.data.preprocess.ops.operators import ResizeImage +from ppcls.data.preprocess.ops.operators import CropImage +from ppcls.data.preprocess.ops.operators import RandCropImage +from ppcls.data.preprocess.ops.operators import RandFlipImage +from ppcls.data.preprocess.ops.operators import NormalizeImage +from ppcls.data.preprocess.ops.operators import ToCHWImage +from ppcls.data.preprocess.ops.operators import AugMix + +from ppcls.data.preprocess.batch_ops.batch_operators import MixupOperator, CutmixOperator, OpSampler, FmixOperator + +import numpy as np +from PIL import Image + + +def transform(data, ops=[]): + """ transform """ + for op in ops: + data = op(data) + return data + + +class AutoAugment(RawImageNetPolicy): + """ ImageNetPolicy wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugment(RawRandAugment): + """ RandAugment wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class TimmAutoAugment(RawTimmAutoAugment): + """ TimmAutoAugment wrapper to auto fit different img tyeps. """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/__init__.py @@ -0,0 +1 @@ + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/batch_operators.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/batch_operators.py new file mode 100644 index 000000000..6f0abb864 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/batch_ops/batch_operators.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import random + +import numpy as np + +from ppcls.utils import logger +from ppcls.data.preprocess.ops.fmix import sample_mask + + +class BatchOperator(object): + """ BatchOperator """ + + def __init__(self, *args, **kwargs): + pass + + def _unpack(self, batch): + """ _unpack """ + assert isinstance(batch, list), \ + 'batch should be a list filled with tuples (img, label)' + bs = len(batch) + assert bs > 0, 'size of the batch data should > 0' + #imgs, labels = list(zip(*batch)) + imgs = [] + labels = [] + for item in batch: + imgs.append(item[0]) + labels.append(item[1]) + return np.array(imgs), np.array(labels), bs + + def _one_hot(self, targets): + return np.eye(self.class_num, dtype="float32")[targets] + + def _mix_target(self, targets0, targets1, lam): + one_hots0 = self._one_hot(targets0) + one_hots1 = self._one_hot(targets1) + return one_hots0 * lam + one_hots1 * (1 - lam) + + def __call__(self, batch): + return batch + + +class MixupOperator(BatchOperator): + """ Mixup operator + reference: https://arxiv.org/abs/1710.09412 + + """ + + def __init__(self, class_num, alpha: float=1.): + """Build Mixup operator + + Args: + alpha (float, optional): The parameter alpha of mixup. Defaults to 1.. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Mixup should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"MixupOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + imgs = lam * imgs + (1 - lam) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class CutmixOperator(BatchOperator): + """ Cutmix operator + reference: https://arxiv.org/abs/1905.04899 + + """ + + def __init__(self, class_num, alpha=0.2): + """Build Cutmix operator + + Args: + alpha (float, optional): The parameter alpha of cutmix. Defaults to 0.2. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Cutmix should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"CutmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def _rand_bbox(self, size, lam): + """ _rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = int(w * cut_rat) + cut_h = int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + + bbx1, bby1, bbx2, bby2 = self._rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class FmixOperator(BatchOperator): + """ Fmix operator + reference: https://arxiv.org/abs/2002.12047 + + """ + + def __init__(self, + class_num, + alpha=1, + decay_power=3, + max_soft=0., + reformulate=False): + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"FmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self._decay_power = decay_power + self._max_soft = max_soft + self._reformulate = reformulate + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + size = (imgs.shape[2], imgs.shape[3]) + lam, mask = sample_mask(self._alpha, self._decay_power, \ + size, self._max_soft, self._reformulate) + imgs = mask * imgs + (1 - mask) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class OpSampler(object): + """ Sample a operator from """ + + def __init__(self, class_num, **op_dict): + """Build OpSampler + + Raises: + Exception: The parameter \"prob\" of operator(s) are be set error. + """ + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"OpSampler\"." + logger.error(Exception(msg)) + raise Exception(msg) + + if len(op_dict) < 1: + msg = f"ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped." + logger.warning(msg) + + self.ops = {} + total_prob = 0 + for op_name in op_dict: + param = op_dict[op_name] + if "prob" not in param: + msg = f"ConfigWarning: Parameter \"prob\" should be set when use operator in \"OpSampler\". The operator \"{op_name}\"'s prob has been set \"0\"." + logger.warning(msg) + prob = param.pop("prob", 0) + total_prob += prob + param.update({"class_num": class_num}) + op = eval(op_name)(**param) + self.ops.update({op: prob}) + + if total_prob > 1: + msg = f"ConfigError: The total prob of operators in \"OpSampler\" should be less 1." 
+ logger.error(Exception(msg)) + raise Exception(msg) + + # add "None Op" when total_prob < 1, "None Op" do nothing + self.ops[None] = 1 - total_prob + + def __call__(self, batch): + op = random.choices( + list(self.ops.keys()), weights=list(self.ops.values()), k=1)[0] + # return batch directly when None Op + return op(batch) if op else batch diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/__init__.py @@ -0,0 +1 @@ + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/autoaugment.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/autoaugment.py new file mode 100644 index 000000000..330220a93 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/autoaugment.py @@ -0,0 +1,264 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random + + +class ImageNetPolicy(object): + """ Randomly choose one of the best 24 Sub-policies on ImageNet. 
+ + Example: + >>> policy = ImageNetPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> ImageNetPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor), + SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor), + SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor), + SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor), + SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor), + SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor), + SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor), + SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor), + SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor), + SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor), + SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment ImageNet Policy" + + +class CIFAR10Policy(object): + """ Randomly choose one of the best 25 Sub-policies on CIFAR10. 
+ + Example: + >>> policy = CIFAR10Policy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> CIFAR10Policy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor), + SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor), + SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor), + SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor), + SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor), + SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor), + SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor), + SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor), + SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor), + SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor), + SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor), + SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor), + SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor), + SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor), + SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor), + SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor), + SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor), + SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor), + SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor), + SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment CIFAR10 Policy" + + +class SVHNPolicy(object): + """ Randomly choose one of the best 25 Sub-policies on SVHN. 
+ + Example: + >>> policy = SVHNPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> SVHNPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor), + SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor), + SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor), + SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor), + SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor), + SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor), + SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor), + SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor), + SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor), + SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor), + SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor), + SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor), + SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor), + SubPolicy( + 0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor), SubPolicy( + 0.1, "shearX", 6, 0.6, "invert", 5, fillcolor), SubPolicy( + 0.7, "solarize", 2, 0.6, "translateY", 7, + fillcolor), SubPolicy(0.8, "shearY", 4, 0.8, "invert", + 8, fillcolor), SubPolicy( + 0.7, "shearX", 9, 0.8, + "translateY", 3, + fillcolor), SubPolicy( + 0.8, "shearY", 5, 0.7, + "autocontrast", 3, + fillcolor), + SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment SVHN Policy" + + +class SubPolicy(object): + def __init__(self, + p1, + operation1, + magnitude_idx1, + p2, + operation2, + magnitude_idx2, + fillcolor=(128, 128, 128)): + ranges = { + "shearX": np.linspace(0, 0.3, 10), + "shearY": np.linspace(0, 0.3, 10), + "translateX": np.linspace(0, 150 / 331, 10), + "translateY": np.linspace(0, 150 / 331, 10), + "rotate": np.linspace(0, 30, 10), + "color": np.linspace(0.0, 0.9, 10), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int), + "solarize": np.linspace(256, 0, 10), + "contrast": np.linspace(0.0, 0.9, 10), + "sharpness": np.linspace(0.0, 0.9, 10), + "brightness": np.linspace(0.0, 0.9, 10), + "autocontrast": [0] * 10, + "equalize": [0] * 10, + "invert": [0] * 10 + } + + # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + func = { + "shearX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), 
+ Image.BICUBIC, fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])), + "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + self.p1 = p1 + self.operation1 = func[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + self.p2 = p2 + self.operation2 = func[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + if random.random() < self.p1: + img = self.operation1(img, self.magnitude1) + if random.random() < self.p2: + img = self.operation2(img, self.magnitude2) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/cutout.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/cutout.py new file mode 100644 index 000000000..b906e1452 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/cutout.py @@ -0,0 +1,41 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
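The batch operators added above in batch_operators.py do not receive tensors but the raw collate input: a list of (image, label) tuples. They return the same structure with soft, mixed one-hot targets, which is why the config must provide Arch.class_num. A minimal sketch of MixupOperator under those assumptions (a fake batch and a stand-in class_num; importing it needs the ppcls package from this patch plus SciPy, which fmix.py pulls in):

import numpy as np
from ppcls.data.preprocess.batch_ops.batch_operators import MixupOperator

# A fake batch in the format the batch ops expect: (image, integer label) pairs.
batch = [(np.random.rand(3, 224, 224).astype("float32"),
          np.random.randint(10)) for _ in range(8)]

mixup = MixupOperator(class_num=10, alpha=0.2)   # class_num=10 is a stand-in value
mixed = mixup(batch)

img0, soft_label0 = mixed[0]
print(img0.shape, soft_label0.shape)  # (3, 224, 224) (10,): a mixed one-hot target
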
+ +# This code is based on https://github.com/uoguelph-mlrg/Cutout + +import numpy as np +import random + + +class Cutout(object): + def __init__(self, n_holes=1, length=112): + self.n_holes = n_holes + self.length = length + + def __call__(self, img): + """ cutout_image """ + h, w = img.shape[:2] + mask = np.ones((h, w), np.float32) + + for n in range(self.n_holes): + y = np.random.randint(h) + x = np.random.randint(w) + + y1 = np.clip(y - self.length // 2, 0, h) + y2 = np.clip(y + self.length // 2, 0, h) + x1 = np.clip(x - self.length // 2, 0, w) + x2 = np.clip(x + self.length // 2, 0, w) + + img[y1:y2, x1:x2] = 0 + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/fmix.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/fmix.py new file mode 100644 index 000000000..dc2ef9120 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/fmix.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import random + +import numpy as np +from scipy.stats import beta + + +def fftfreqnd(h, w=None, z=None): + """ Get bin values for discrete fourier transform of size (h, w, z) + + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + fz = fx = 0 + fy = np.fft.fftfreq(h) + + if w is not None: + fy = np.expand_dims(fy, -1) + + if w % 2 == 1: + fx = np.fft.fftfreq(w)[:w // 2 + 2] + else: + fx = np.fft.fftfreq(w)[:w // 2 + 1] + + if z is not None: + fy = np.expand_dims(fy, -1) + if z % 2 == 1: + fz = np.fft.fftfreq(z)[:, None] + else: + fz = np.fft.fftfreq(z)[:, None] + + return np.sqrt(fx * fx + fy * fy + fz * fz) + + +def get_spectrum(freqs, decay_power, ch, h, w=0, z=0): + """ Samples a fourier image with given size and frequencies decayed by decay power + + :param freqs: Bin values for the discrete fourier transform + :param decay_power: Decay power for frequency decay prop 1/f**d + :param ch: Number of channels for the resulting mask + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + scale = np.ones(1) / (np.maximum(freqs, np.array([1. 
/ max(w, h, z)])) + **decay_power) + + param_size = [ch] + list(freqs.shape) + [2] + param = np.random.randn(*param_size) + + scale = np.expand_dims(scale, -1)[None, :] + + return scale * param + + +def make_low_freq_image(decay, shape, ch=1): + """ Sample a low frequency image from fourier space + + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param ch: Number of channels for desired mask + """ + freqs = fftfreqnd(*shape) + spectrum = get_spectrum(freqs, decay, ch, + *shape) #.reshape((1, *shape[:-1], -1)) + spectrum = spectrum[:, 0] + 1j * spectrum[:, 1] + mask = np.real(np.fft.irfftn(spectrum, shape)) + + if len(shape) == 1: + mask = mask[:1, :shape[0]] + if len(shape) == 2: + mask = mask[:1, :shape[0], :shape[1]] + if len(shape) == 3: + mask = mask[:1, :shape[0], :shape[1], :shape[2]] + + mask = mask + mask = (mask - mask.min()) + mask = mask / mask.max() + return mask + + +def sample_lam(alpha, reformulate=False): + """ Sample a lambda from symmetric beta distribution with given alpha + + :param alpha: Alpha value for beta distribution + :param reformulate: If True, uses the reformulation of [1]. + """ + if reformulate: + lam = beta.rvs(alpha + 1, alpha) + else: + lam = beta.rvs(alpha, alpha) + + return lam + + +def binarise_mask(mask, lam, in_shape, max_soft=0.0): + """ Binarises a given low frequency image such that it has mean lambda. + + :param mask: Low frequency image, usually the result of `make_low_freq_image` + :param lam: Mean value of final mask + :param in_shape: Shape of inputs + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :return: + """ + idx = mask.reshape(-1).argsort()[::-1] + mask = mask.reshape(-1) + num = math.ceil(lam * mask.size) if random.random() > 0.5 else math.floor( + lam * mask.size) + + eff_soft = max_soft + if max_soft > lam or max_soft > (1 - lam): + eff_soft = min(lam, 1 - lam) + + soft = int(mask.size * eff_soft) + num_low = int(num - soft) + num_high = int(num + soft) + + mask[idx[:num_high]] = 1 + mask[idx[num_low:]] = 0 + mask[idx[num_low:num_high]] = np.linspace(1, 0, (num_high - num_low)) + + mask = mask.reshape((1, 1, in_shape[0], in_shape[1])) + return mask + + +def sample_mask(alpha, decay_power, shape, max_soft=0.0, reformulate=False): + """ Samples a mean lambda from beta distribution parametrised by alpha, creates a low frequency image and binarises + it based on this lambda + + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. 
+ """ + if isinstance(shape, int): + shape = (shape, ) + + # Choose lambda + lam = sample_lam(alpha, reformulate) + + # Make mask, get mean / std + mask = make_low_freq_image(decay_power, shape) + mask = binarise_mask(mask, lam, shape, max_soft) + + return float(lam), mask + + +def sample_and_apply(x, + alpha, + decay_power, + shape, + max_soft=0.0, + reformulate=False): + """ + + :param x: Image batch on which to apply fmix of shape [b, c, shape*] + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + :return: mixed input, permutation indices, lambda value of mix, + """ + lam, mask = sample_mask(alpha, decay_power, shape, max_soft, reformulate) + index = np.random.permutation(x.shape[0]) + + x1, x2 = x * mask, x[index] * (1 - mask) + return x1 + x2, index, lam + + +class FMixBase: + """ FMix augmentation + + Args: + decay_power (float): Decay power for frequency decay prop 1/f**d + alpha (float): Alpha value for beta distribution from which to sample mean of mask + size ([int] | [int, int] | [int, int, int]): Shape of desired mask, list up to 3 dims + max_soft (float): Softening value between 0 and 0.5 which smooths hard edges in the mask. + reformulate (bool): If True, uses the reformulation of [1]. + """ + + def __init__(self, + decay_power=3, + alpha=1, + size=(32, 32), + max_soft=0.0, + reformulate=False): + super().__init__() + self.decay_power = decay_power + self.reformulate = reformulate + self.size = size + self.alpha = alpha + self.max_soft = max_soft + self.index = None + self.lam = None + + def __call__(self, x): + raise NotImplementedError + + def loss(self, *args, **kwargs): + raise NotImplementedError diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/functional.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/functional.py new file mode 100644 index 000000000..9f1369eef --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/functional.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# encoding: utf-8 + +import numpy as np +from PIL import Image, ImageOps, ImageEnhance + + + +def int_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + An int that results from scaling `maxval` according to `level`. + """ + return int(level * maxval / 10) + + +def float_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval. 
+ Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + A float that results from scaling `maxval` according to `level`. + """ + return float(level) * maxval / 10. + + +def sample_level(n): + return np.random.uniform(low=0.1, high=n) + + +def autocontrast(pil_img, *args): + return ImageOps.autocontrast(pil_img) + + +def equalize(pil_img, *args): + return ImageOps.equalize(pil_img) + + +def posterize(pil_img, level, *args): + level = int_parameter(sample_level(level), 4) + return ImageOps.posterize(pil_img, 4 - level) + + +def rotate(pil_img, level, *args): + degrees = int_parameter(sample_level(level), 30) + if np.random.uniform() > 0.5: + degrees = -degrees + return pil_img.rotate(degrees, resample=Image.BILINEAR) + + +def solarize(pil_img, level, *args): + level = int_parameter(sample_level(level), 256) + return ImageOps.solarize(pil_img, 256 - level) + + +def shear_x(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, level, 0, 0, 1, 0), + resample=Image.BILINEAR) + + +def shear_y(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, level, 1, 0), + resample=Image.BILINEAR) + + +def translate_x(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[0] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, level, 0, 1, 0), + resample=Image.BILINEAR) + + +def translate_y(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[1] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, 0, 1, level), + resample=Image.BILINEAR) + + +# operation that overlaps with ImageNet-C's test set +def color(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Color(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def contrast(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Contrast(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def brightness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Brightness(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def sharpness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Sharpness(pil_img).enhance(level) + + +augmentations = [ + autocontrast, equalize, posterize, rotate, solarize, shear_x, shear_y, + translate_x, translate_y +] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/grid.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/grid.py new file mode 100644 index 000000000..6f0b2dc8d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/grid.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/akuxcw/GridMask + +import numpy as np +from PIL import Image +import pdb + +# curr +CURR_EPOCH = 0 +# epoch for the prob to be the upper limit +NUM_EPOCHS = 240 + + +class GridMask(object): + def __init__(self, d1=96, d2=224, rotate=1, ratio=0.5, mode=0, prob=1.): + self.d1 = d1 + self.d2 = d2 + self.rotate = rotate + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.last_prob = -1 + + def set_prob(self): + global CURR_EPOCH + global NUM_EPOCHS + self.prob = self.st_prob * min(1, 1.0 * CURR_EPOCH / NUM_EPOCHS) + + def __call__(self, img): + self.set_prob() + if abs(self.last_prob - self.prob) > 1e-10: + global CURR_EPOCH + global NUM_EPOCHS + print( + "self.prob is updated, self.prob={}, CURR_EPOCH: {}, NUM_EPOCHS: {}". + format(self.prob, CURR_EPOCH, NUM_EPOCHS)) + self.last_prob = self.prob + # print("CURR_EPOCH: {}, NUM_EPOCHS: {}, self.prob is set as: {}".format(CURR_EPOCH, NUM_EPOCHS, self.prob) ) + if np.random.rand() > self.prob: + return img + _, h, w = img.shape + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(self.d1, self.d2) + #d = self.d + self.l = int(d * self.ratio + 0.5) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + for i in range(-1, hh // d + 1): + s = d * i + st_h + t = s + self.l + s = max(min(s, hh), 0) + t = max(min(t, hh), 0) + mask[s:t, :] *= 0 + for i in range(-1, ww // d + 1): + s = d * i + st_w + t = s + self.l + s = max(min(s, ww), 0) + t = max(min(t, ww), 0) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // + 2 + w] + + if self.mode == 1: + mask = 1 - mask + + mask = np.expand_dims(mask, axis=0) + img = (img * mask).astype(img.dtype) + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/hide_and_seek.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/hide_and_seek.py new file mode 100644 index 000000000..33f25f265 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/hide_and_seek.py @@ -0,0 +1,44 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
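GridMask above operates on CHW arrays (it unpacks _, h, w = img.shape), and its effective probability ramps from 0 toward prob as the module-level CURR_EPOCH counter approaches NUM_EPOCHS, so with the default CURR_EPOCH of 0 it passes images through unchanged. A minimal sketch under those assumptions, bumping the counter by hand only so the masking is visible:

import numpy as np
from ppcls.data.preprocess.ops import grid

# By default CURR_EPOCH is 0 and GridMask is a no-op; force full probability here.
grid.CURR_EPOCH = grid.NUM_EPOCHS

gridmask = grid.GridMask(d1=96, d2=224, rotate=1, ratio=0.5, mode=0, prob=1.0)

chw_img = np.random.rand(3, 224, 224).astype("float32")  # CHW, i.e. after ToCHWImage
out = gridmask(chw_img)                                   # grid strips are zeroed

print(out.shape, float((out == 0).mean()))  # shape unchanged; roughly the masked fraction
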
+ +# This code is based on https://github.com/kkanshul/Hide-and-Seek + +import numpy as np +import random + + +class HideAndSeek(object): + def __init__(self): + # possible grid size, 0 means no hiding + self.grid_sizes = [0, 16, 32, 44, 56] + # hiding probability + self.hide_prob = 0.5 + + def __call__(self, img): + # randomly choose one grid size + grid_size = np.random.choice(self.grid_sizes) + + _, h, w = img.shape + + # hide the patches + if grid_size == 0: + return img + for x in range(0, w, grid_size): + for y in range(0, h, grid_size): + x_end = min(w, x + grid_size) + y_end = min(h, y + grid_size) + if (random.random() <= self.hide_prob): + img[:, x:x_end, y:y_end] = 0 + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/operators.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/operators.py new file mode 100644 index 000000000..9cdc58b2b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/operators.py @@ -0,0 +1,384 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from functools import partial +import six +import math +import random +import cv2 +import numpy as np +from PIL import Image +from paddle.vision.transforms import ColorJitter as RawColorJitter + +from .autoaugment import ImageNetPolicy +from .functional import augmentations +from ppcls.utils import logger + + +class UnifiedResize(object): + def __init__(self, interpolation=None, backend="cv2"): + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + _pil_interp_from_str = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + def _pil_resize(src, size, resample): + pil_img = Image.fromarray(src) + pil_img = pil_img.resize(size, resample) + return np.asarray(pil_img) + + if backend.lower() == "cv2": + if isinstance(interpolation, str): + interpolation = _cv2_interp_from_str[interpolation.lower()] + # compatible with opencv < version 4.4.0 + elif interpolation is None: + interpolation = cv2.INTER_LINEAR + self.resize_func = partial(cv2.resize, interpolation=interpolation) + elif backend.lower() == "pil": + if isinstance(interpolation, str): + interpolation = _pil_interp_from_str[interpolation.lower()] + self.resize_func = partial(_pil_resize, resample=interpolation) + else: + logger.warning( + f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." 
+ ) + self.resize_func = cv2.resize + + def __call__(self, src, size): + return self.resize_func(src, size) + + +class OperatorParamError(ValueError): + """ OperatorParamError + """ + pass + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, to_rgb=True, to_np=False, channel_first=False): + self.to_rgb = to_rgb + self.to_np = to_np # to numpy + self.channel_first = channel_first # only enabled when to_np is True + + def __call__(self, img): + if six.PY2: + assert type(img) is str and len( + img) > 0, "invalid input 'img' in DecodeImage" + else: + assert type(img) is bytes and len( + img) > 0, "invalid input 'img' in DecodeImage" + data = np.frombuffer(img, dtype='uint8') + img = cv2.imdecode(data, 1) + if self.to_rgb: + assert img.shape[2] == 3, 'invalid shape of image[%s]' % ( + img.shape) + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + + return img + + +class ResizeImage(object): + """ resize image """ + + def __init__(self, + size=None, + resize_short=None, + interpolation=None, + backend="cv2"): + if resize_short is not None and resize_short > 0: + self.resize_short = resize_short + self.w = None + self.h = None + elif size is not None: + self.resize_short = None + self.w = size if type(size) is int else size[0] + self.h = size if type(size) is int else size[1] + else: + raise OperatorParamError("invalid params for ReisizeImage for '\ + 'both 'size' and 'resize_short' are None") + + self._resize_func = UnifiedResize( + interpolation=interpolation, backend=backend) + + def __call__(self, img): + img_h, img_w = img.shape[:2] + if self.resize_short is not None: + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + else: + w = self.w + h = self.h + return self._resize_func(img, (w, h)) + + +class CropImage(object): + """ crop image """ + + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size # (h, w) + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class RandCropImage(object): + """ random crop image """ + + def __init__(self, + size, + scale=None, + ratio=None, + interpolation=None, + backend="cv2"): + if type(size) is int: + self.size = (size, size) # (h, w) + else: + self.size = size + + self.scale = [0.08, 1.0] if scale is None else scale + self.ratio = [3. / 4., 4. / 3.] if ratio is None else ratio + + self._resize_func = UnifiedResize( + interpolation=interpolation, backend=backend) + + def __call__(self, img): + size = self.size + scale = self.scale + ratio = self.ratio + + aspect_ratio = math.sqrt(random.uniform(*ratio)) + w = 1. * aspect_ratio + h = 1. 
/ aspect_ratio + + img_h, img_w = img.shape[:2] + + bound = min((float(img_w) / img_h) / (w**2), + (float(img_h) / img_w) / (h**2)) + scale_max = min(scale[1], bound) + scale_min = min(scale[0], bound) + + target_area = img_w * img_h * random.uniform(scale_min, scale_max) + target_size = math.sqrt(target_area) + w = int(target_size * w) + h = int(target_size * h) + + i = random.randint(0, img_w - w) + j = random.randint(0, img_h - h) + + img = img[j:j + h, i:i + w, :] + + return self._resize_func(img, size) + + +class RandFlipImage(object): + """ random flip image + flip_code: + 1: Flipped Horizontally + 0: Flipped Vertically + -1: Flipped Horizontally & Vertically + """ + + def __init__(self, flip_code=1): + assert flip_code in [-1, 0, 1 + ], "flip_code should be a value in [-1, 0, 1]" + self.flip_code = flip_code + + def __call__(self, img): + if random.randint(0, 1) == 1: + return cv2.flip(img, self.flip_code) + else: + return img + + +class AutoAugment(object): + def __init__(self): + self.policy = ImageNetPolicy() + + def __call__(self, img): + from PIL import Image + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + img = self.policy(img) + img = np.asarray(img) + + +class NormalizeImage(object): + """ normalize image such as substract mean, divide std + """ + + def __init__(self, + scale=None, + mean=None, + std=None, + order='chw', + output_fp16=False, + channel_num=3): + if isinstance(scale, str): + scale = eval(scale) + assert channel_num in [ + 3, 4 + ], "channel number of input image should be set to 3 or 4." + self.channel_num = channel_num + self.output_dtype = 'float16' if output_fp16 else 'float32' + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + self.order = order + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + def __call__(self, img): + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + + img = (img.astype('float32') * self.scale - self.mean) / self.std + + if self.channel_num == 4: + img_h = img.shape[1] if self.order == 'chw' else img.shape[0] + img_w = img.shape[2] if self.order == 'chw' else img.shape[1] + pad_zeros = np.zeros( + (1, img_h, img_w)) if self.order == 'chw' else np.zeros( + (img_h, img_w, 1)) + img = (np.concatenate( + (img, pad_zeros), axis=0) + if self.order == 'chw' else np.concatenate( + (img, pad_zeros), axis=2)) + return img.astype(self.output_dtype) + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self): + pass + + def __call__(self, img): + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + return img.transpose((2, 0, 1)) + + +class AugMix(object): + """ Perform AugMix augmentation and compute mixture. + """ + + def __init__(self, + prob=0.5, + aug_prob_coeff=0.1, + mixture_width=3, + mixture_depth=1, + aug_severity=1): + """ + Args: + prob: Probability of taking augmix + aug_prob_coeff: Probability distribution coefficients. + mixture_width: Number of augmentation chains to mix per augmented example. + mixture_depth: Depth of augmentation chains. -1 denotes stochastic depth in [1, 3]' + aug_severity: Severity of underlying augmentation operators (between 1 to 10). 
+ """ + # fmt: off + self.prob = prob + self.aug_prob_coeff = aug_prob_coeff + self.mixture_width = mixture_width + self.mixture_depth = mixture_depth + self.aug_severity = aug_severity + self.augmentations = augmentations + # fmt: on + + def __call__(self, image): + """Perform AugMix augmentations and compute mixture. + Returns: + mixed: Augmented and mixed image. + """ + if random.random() > self.prob: + # Avoid the warning: the given NumPy array is not writeable + return np.asarray(image).copy() + + ws = np.float32( + np.random.dirichlet([self.aug_prob_coeff] * self.mixture_width)) + m = np.float32( + np.random.beta(self.aug_prob_coeff, self.aug_prob_coeff)) + + # image = Image.fromarray(image) + mix = np.zeros(image.shape) + for i in range(self.mixture_width): + image_aug = image.copy() + image_aug = Image.fromarray(image_aug) + depth = self.mixture_depth if self.mixture_depth > 0 else np.random.randint( + 1, 4) + for _ in range(depth): + op = np.random.choice(self.augmentations) + image_aug = op(image_aug, self.aug_severity) + mix += ws[i] * np.asarray(image_aug) + + mixed = (1 - m) * image + m * mix + return mixed.astype(np.uint8) + + +class ColorJitter(RawColorJitter): + """ColorJitter. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + img = super()._apply_image(img) + if isinstance(img, Image.Image): + img = np.asarray(img) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/randaugment.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/randaugment.py new file mode 100644 index 000000000..cca59da42 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/randaugment.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
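The per-sample operators in operators.py above are meant to be chained by the transform(data, ops) helper defined in ppcls/data/preprocess/__init__.py: DecodeImage turns raw JPEG bytes into an HWC RGB array, the crop and flip ops stay in HWC uint8, and NormalizeImage plus ToCHWImage produce the float32 CHW array the network consumes. A minimal sketch of that chain, assuming PaddlePaddle, OpenCV and SciPy are installed, the ppcls package from this patch is importable, and sample.jpg is any reasonably sized local JPEG:

from ppcls.data.preprocess import transform
from ppcls.data.preprocess.ops.operators import (
    DecodeImage, RandCropImage, RandFlipImage, NormalizeImage, ToCHWImage)

ops = [
    DecodeImage(to_rgb=True, channel_first=False),  # bytes -> HWC uint8 (RGB)
    RandCropImage(size=224),                        # random resized crop to 224x224
    RandFlipImage(flip_code=1),                     # random horizontal flip
    NormalizeImage(scale=1.0 / 255.0,
                   mean=[0.485, 0.456, 0.406],
                   std=[0.229, 0.224, 0.225],
                   order='hwc'),                    # float32, ImageNet statistics
    ToCHWImage(),                                   # HWC -> CHW for the network
]

with open("sample.jpg", "rb") as f:   # hypothetical local image
    img_bytes = f.read()

chw = transform(img_bytes, ops)
print(chw.shape, chw.dtype)           # (3, 224, 224) float32
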
+ +# This code is based on https://github.com/heartInsert/randaugment + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random + + +class RandAugment(object): + def __init__(self, num_layers=2, magnitude=5, fillcolor=(128, 128, 128)): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331 * abso_level, + "translateY": 150.0 / 331 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": int(4.0 * abso_level), + "solarize": 256.0 * abso_level, + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "autocontrast": lambda img, magnitude: + ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + def __call__(self, img): + avaiable_op_names = list(self.level_map.keys()) + for layer_num in range(self.num_layers): + op_name = np.random.choice(avaiable_op_names) + img = self.func[op_name](img, self.level_map[op_name]) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/random_erasing.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/random_erasing.py new file mode 100644 index 000000000..f234abbba --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/random_erasing.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm. + +from functools import partial + +import math +import random + +import numpy as np + + +class Pixels(object): + def __init__(self, mode="const", mean=[0., 0., 0.]): + self._mode = mode + self._mean = mean + + def __call__(self, h=224, w=224, c=3): + if self._mode == "rand": + return np.random.normal(size=(1, 1, 3)) + elif self._mode == "pixel": + return np.random.normal(size=(h, w, c)) + elif self._mode == "const": + return self._mean + else: + raise Exception( + "Invalid mode in RandomErasing, only support \"const\", \"rand\", \"pixel\"" + ) + + +class RandomErasing(object): + """RandomErasing. + """ + + def __init__(self, + EPSILON=0.5, + sl=0.02, + sh=0.4, + r1=0.3, + mean=[0., 0., 0.], + attempt=100, + use_log_aspect=False, + mode='const'): + self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON + self.sl = eval(sl) if isinstance(sl, str) else sl + self.sh = eval(sh) if isinstance(sh, str) else sh + r1 = eval(r1) if isinstance(r1, str) else r1 + self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else ( + r1, 1 / r1) + self.use_log_aspect = use_log_aspect + self.attempt = attempt + self.get_pixels = Pixels(mode, mean) + + def __call__(self, img): + if random.random() > self.EPSILON: + return img + + for _ in range(self.attempt): + area = img.shape[0] * img.shape[1] + + target_area = random.uniform(self.sl, self.sh) * area + aspect_ratio = random.uniform(*self.r1) + if self.use_log_aspect: + aspect_ratio = math.exp(aspect_ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < img.shape[1] and h < img.shape[0]: + pixels = self.get_pixels(h, w, img.shape[2]) + x1 = random.randint(0, img.shape[0] - h) + y1 = random.randint(0, img.shape[1] - w) + if img.shape[2] == 3: + img[x1:x1 + h, y1:y1 + w, :] = pixels + else: + img[x1:x1 + h, y1:y1 + w, 0] = pixels[0] + return img + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/timm_autoaugment.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/timm_autoaugment.py new file mode 100644 index 000000000..dd2994dac --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/preprocess/ops/timm_autoaugment.py @@ -0,0 +1,877 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
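RandomErasing above expects HWC input (axis 2 is treated as the channel axis) and accepts its thresholds either as numbers or as strings that are eval'ed from the config. A minimal sketch under those assumptions, with EPSILON forced to 1.0 only so the normally probabilistic erasing always fires:

import numpy as np
from ppcls.data.preprocess.ops.random_erasing import RandomErasing

img = np.random.rand(224, 224, 3).astype("float32")  # HWC, e.g. after NormalizeImage

eraser = RandomErasing(EPSILON=1.0, sl=0.02, sh=0.4, r1=0.3,
                       mean=[0.0, 0.0, 0.0], mode="const")

out = eraser(img)          # one random rectangle is overwritten with the constant fill
print((out == 0.0).any())  # expect True; note the input array is modified in place
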
+ +# Code was heavily based on https://github.com/rwightman/pytorch-image-models + +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, ) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _pil_interp(method): + if method == 'bicubic': + return Image.BICUBIC + elif method == 'lanczos': + return Image.LANCZOS + elif method == 'hamming': + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? + return Image.BILINEAR + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), + **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), + **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform(-rotn_center[0] - post_trans[0], + -rotn_center[1] - post_trans[1], + matrix) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 
256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 
'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} + + +class AugmentOp(object): + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] + if 'interpolation' in hparams else _RANDOM_INTERPOLATION, ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. + self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn( + magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
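+    # Each sub-policy below is a pair of (op_name, probability, magnitude) tuples;
+    # AutoAugment picks one sub-policy at random per image and applies its ops in order.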
+ policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10) + ], # This results in black image with Tpu posterize + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], 
+ [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name='v0', hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == 'original': + return auto_augment_policy_original(hparams) + elif name == 'originalr': + return auto_augment_policy_originalr(hparams) + elif name == 'v0': + return auto_augment_policy_v0(hparams) + elif name == 'v0r': + return auto_augment_policy_v0r(hparams) + else: + assert False, 'Unknown AA policy (%s)' % name + + +class AutoAugment(object): + def __init__(self, policy): + self.policy = policy + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + + :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). 
+ The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + + :return: A callable Transform Op + """ + config = config_str.split('-') + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + else: + assert False, 'Unknown AutoAugment config section' + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [ + AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) + for name in transforms + ] + + +class RandAugment(object): + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, + self.num_layers, + replace=self.choice_weights is None, + p=self.choice_weights) + for op in ops: + img = op(img) + return img + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). 
The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A callable Transform Op + """ + magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops( + magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights( + weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + + +_AUGMIX_TRANSFORMS = [ + 'AutoContrast', + 'ColorIncreasing', # not in paper + 'ContrastIncreasing', # not in paper + 'BrightnessIncreasing', # not in paper + 'SharpnessIncreasing', # not in paper + 'Equalize', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', +] + + +def augmix_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _AUGMIX_TRANSFORMS + return [ + AugmentOp( + name, prob=1.0, magnitude=magnitude, hparams=hparams) + for name in transforms + ] + + +class AugMixAugment(object): + """ AugMix Transform + Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py + From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - + https://arxiv.org/abs/1912.02781 + """ + + def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False): + self.ops = ops + self.alpha = alpha + self.width = width + self.depth = depth + self.blended = blended # blended mode is faster but not well tested + + def _calc_blended_weights(self, ws, m): + ws = ws * m + cump = 1. + rws = [] + for w in ws[::-1]: + alpha = w / cump + cump *= (1 - alpha) + rws.append(alpha) + return np.array(rws[::-1], dtype=np.float32) + + def _apply_blended(self, img, mixing_weights, m): + # This is my first crack and implementing a slightly faster mixed augmentation. Instead + # of accumulating the mix for each chain in a Numpy array and then blending with original, + # it recomputes the blending coefficients and applies one PIL image blend per chain. + # TODO the results appear in the right ballpark but they differ by more than rounding. 
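+        # _calc_blended_weights converts the m-scaled Dirichlet mixing weights into
+        # sequential Image.blend alphas, so after the loop below each augmented chain
+        # contributes approximately its scaled weight while the original image keeps
+        # a weight of roughly (1 - m), mirroring the final blend in _apply_basic.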
+ img_orig = img.copy() + ws = self._calc_blended_weights(mixing_weights, m) + for w in ws: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img_orig # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + img = Image.blend(img, img_aug, w) + return img + + def _apply_basic(self, img, mixing_weights, m): + # This is a literal adaptation of the paper/official implementation without normalizations and + # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the + # typical augmentation transforms, could use a GPU / Kornia implementation. + img_shape = img.size[0], img.size[1], len(img.getbands()) + mixed = np.zeros(img_shape, dtype=np.float32) + for mw in mixing_weights: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + mixed += mw * np.asarray(img_aug, dtype=np.float32) + np.clip(mixed, 0, 255., out=mixed) + mixed = Image.fromarray(mixed.astype(np.uint8)) + return Image.blend(img, mixed, m) + + def __call__(self, img): + mixing_weights = np.float32( + np.random.dirichlet([self.alpha] * self.width)) + m = np.float32(np.random.beta(self.alpha, self.alpha)) + if self.blended: + mixed = self._apply_blended(img, mixing_weights, m) + else: + mixed = self._apply_basic(img, mixing_weights, m) + return mixed + + +def augment_and_mix_transform(config_str, hparams): + """ Create AugMix transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining + sections, not order sepecific determine + 'm' - integer magnitude (severity) of augmentation mix (default: 3) + 'w' - integer width of augmentation chain (default: 3) + 'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1) + 'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0) + 'mstd' - float std deviation of magnitude noise applied (default: 0) + Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2 + + :param hparams: Other hparams (kwargs) for the Augmentation transforms + + :return: A callable Transform Op + """ + magnitude = 3 + width = 3 + depth = -1 + alpha = 1. 
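+    # these defaults (and `blended` below) are overridden by the 'm' / 'w' / 'd' / 'a' / 'b'
+    # sections of config_str; 'mstd' is injected into hparams instead of a local default.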
+ blended = False + config = config_str.split('-') + assert config[0] == 'augmix' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'm': + magnitude = int(val) + elif key == 'w': + width = int(val) + elif key == 'd': + depth = int(val) + elif key == 'a': + alpha = float(val) + elif key == 'b': + blended = bool(val) + else: + assert False, 'Unknown AugMix config section' + ops = augmix_ops(magnitude=magnitude, hparams=hparams) + return AugMixAugment( + ops, alpha=alpha, width=width, depth=depth, blended=blended) + + +class RawTimmAutoAugment(object): + """TimmAutoAugment API for PaddleClas.""" + + def __init__(self, + config_str="rand-m9-mstd0.5-inc1", + interpolation="bicubic", + img_size=224, + mean=IMAGENET_DEFAULT_MEAN): + if isinstance(img_size, (tuple, list)): + img_size_min = min(img_size) + else: + img_size_min = img_size + + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if config_str.startswith('rand'): + self.augment_func = rand_augment_transform(config_str, aa_params) + elif config_str.startswith('augmix'): + aa_params['translate_pct'] = 0.3 + self.augment_func = augment_and_mix_transform(config_str, + aa_params) + elif config_str.startswith('auto'): + self.augment_func = auto_augment_transform(config_str, aa_params) + else: + raise Exception( + "ConfigError: The TimmAutoAugment Op only support RandAugment, AutoAugment, AugMix, and the config_str only starts with \"rand\", \"augmix\", \"auto\"." + ) + + def __call__(self, img): + return self.augment_func(img) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/utils/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/utils/__init__.py new file mode 100644 index 000000000..61d5aa213 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls/data/utils/get_image_list.py b/cv/classification/resnet50/paddlepaddle/ppcls/data/utils/get_image_list.py new file mode 100644 index 000000000..6f10935ad --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls/data/utils/get_image_list.py @@ -0,0 +1,49 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import base64 +import numpy as np + + +def get_image_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + + img_end = ['jpg', 'png', 'jpeg', 'JPEG', 'JPG', 'bmp'] + if os.path.isfile(img_file) and img_file.split('.')[-1] in img_end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + if single_file.split('.')[-1] in img_end: + imgs_lists.append(os.path.join(img_file, single_file)) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def get_image_list_from_label_file(image_path, label_file_path): + imgs_lists = [] + gt_labels = [] + with open(label_file_path, "r") as fin: + lines = fin.readlines() + for line in lines: + image_name, label = line.strip("\n").split() + label = int(label) + imgs_lists.append(os.path.join(image_path, image_name)) + gt_labels.append(int(label)) + return imgs_lists, gt_labels diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/__init__.py new file mode 100644 index 000000000..ea70cc170 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/__init__.py @@ -0,0 +1,197 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
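+# This module glues the data pipeline together: create_operators builds preprocess ops
+# from the config list, worker_init_fn seeds each DataLoader worker deterministically,
+# and build_dataloader assembles the paddle.io.DataLoader (or DALI pipeline) for a mode.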
+ +import inspect +import copy +import random +import paddle +import numpy as np +import paddle.distributed as dist +from functools import partial +from paddle.io import DistributedBatchSampler, BatchSampler, DataLoader +from ppcls.utils import logger + +from ppcls.data import dataloader +# dataset +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data.dataloader.multi_scale_dataset import MultiScaleDataset +from ppcls.data.dataloader.person_dataset import Market1501, MSMT17, DukeMTMC +from ppcls.data.dataloader.face_dataset import FaceEvalDataset, FiveFaceEvalDataset +from ppcls.data.dataloader.custom_label_dataset import CustomLabelDataset +from ppcls.data.dataloader.cifar import Cifar10, Cifar100 +from ppcls.data.dataloader.metabin_sampler import DomainShuffleBatchSampler, NaiveIdentityBatchSampler + +# sampler +from ppcls.data.dataloader.DistributedRandomIdentitySampler import DistributedRandomIdentitySampler +from ppcls.data.dataloader.pk_sampler import PKSampler +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data.dataloader.multi_scale_sampler import MultiScaleSampler +from ppcls.data.dataloader.ra_sampler import RASampler +from ppcls.data import preprocess +from ppcls.data.preprocess import transform + + +def create_operators(params, class_num=None): + """ + create operators based on the config + + Args: + params(list): a dict list, used to create some operators + """ + assert isinstance(params, list), ('operator config should be a list') + ops = [] + for operator in params: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + op_func = getattr(preprocess, op_name) + if "class_num" in inspect.getfullargspec(op_func).args: + param.update({"class_num": class_num}) + op = op_func(**param) + ops.append(op) + + return ops + + +def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int): + """callback function on each worker subprocess after seeding and before data loading. + + Args: + worker_id (int): Worker id in [0, num_workers - 1] + num_workers (int): Number of subprocesses to use for data loading. + rank (int): Rank of process in distributed environment. If in non-distributed environment, it is a constant number `0`. 
+ seed (int): Random seed + """ + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + + +def build_dataloader(config, mode, device, use_dali=False, seed=None): + assert mode in [ + 'Train', 'Eval', 'Test', 'Gallery', 'Query', 'UnLabelTrain' + ], "Dataset mode should be Train, Eval, Test, Gallery, Query, UnLabelTrain" + assert mode in config.keys(), "{} config not in yaml".format(mode) + # build dataset + if use_dali and paddle.device.is_compiled_with_cuda(): + from ppcls.data.dataloader.dali import dali_dataloader + return dali_dataloader( + config, + mode, + paddle.device.get_device(), + num_threads=config[mode]['loader']["num_workers"], + seed=seed, + enable_fuse=True) + + class_num = config.get("class_num", None) + epochs = config.get("epochs", None) + config_dataset = config[mode]['dataset'] + config_dataset = copy.deepcopy(config_dataset) + dataset_name = config_dataset.pop('name') + if 'batch_transform_ops' in config_dataset: + batch_transform = config_dataset.pop('batch_transform_ops') + else: + batch_transform = None + + dataset = eval(dataset_name)(**config_dataset) + + logger.debug("build dataset({}) success...".format(dataset)) + + # build sampler + config_sampler = config[mode]['sampler'] + if config_sampler and "name" not in config_sampler: + batch_sampler = None + batch_size = config_sampler["batch_size"] + drop_last = config_sampler["drop_last"] + shuffle = config_sampler["shuffle"] + else: + sampler_name = config_sampler.pop("name") + sampler_argspec = inspect.getfullargspec(eval(sampler_name) + .__init__).args + if "total_epochs" in sampler_argspec: + config_sampler.update({"total_epochs": epochs}) + batch_sampler = eval(sampler_name)(dataset, **config_sampler) + + logger.debug("build batch_sampler({}) success...".format(batch_sampler)) + + # build batch operator + def mix_collate_fn(batch): + batch = transform(batch, batch_ops) + # batch each field + slots = [] + for items in batch: + for i, item in enumerate(items): + if len(slots) < len(items): + slots.append([item]) + else: + slots[i].append(item) + return [np.stack(slot, axis=0) for slot in slots] + + if isinstance(batch_transform, list): + batch_ops = create_operators(batch_transform, class_num) + batch_collate_fn = mix_collate_fn + else: + batch_collate_fn = None + + # build dataloader + config_loader = config[mode]['loader'] + num_workers = config_loader["num_workers"] + use_shared_memory = config_loader["use_shared_memory"] + + init_fn = partial( + worker_init_fn, + num_workers=num_workers, + rank=dist.get_rank(), + seed=seed) if seed is not None else None + + if batch_sampler is None: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=batch_collate_fn, + worker_init_fn=init_fn) + else: + data_loader = DataLoader( + dataset=dataset, + places=device, + num_workers=num_workers, + return_list=True, + use_shared_memory=use_shared_memory, + batch_sampler=batch_sampler, + collate_fn=batch_collate_fn, + worker_init_fn=init_fn) + + logger.debug("build data_loader({}) success...".format(data_loader)) + return data_loader + + +# for PaddleX +ClsDataset = ImageNetDataset +ShiTuRecDataset = ImageNetDataset +MLClsDataset = MultiLabelDataset diff --git 
a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/DistributedRandomIdentitySampler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/DistributedRandomIdentitySampler.py new file mode 100644 index 000000000..b4d77a4fc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/DistributedRandomIdentitySampler.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division + +import copy +import random +from collections import defaultdict + +import numpy as np +from paddle.io import DistributedBatchSampler + + +class DistributedRandomIdentitySampler(DistributedBatchSampler): + """Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size equals to N * K. + Args: + dataset(Dataset): Dataset which contains list of (img_path, pid, camid)) + batch_size (int): batch size + num_instances (int): number of instance(s) within an class + drop_last (bool): whether to discard the data at the end + max_iters (int): max iteration(s). Default to None. + """ + + def __init__(self, + dataset, + batch_size, + num_instances, + drop_last, + max_iters=None, + **args): + assert batch_size % num_instances == 0, \ + f"batch_size({batch_size}) must be divisible by num_instances({num_instances}) when using DistributedRandomIdentitySampler" + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.drop_last = drop_last + self.max_iters = max_iters + self.num_pids_per_batch = self.batch_size // self.num_instances + self.index_dic = defaultdict(list) + for index, pid in enumerate(self.dataset.labels): + self.index_dic[pid].append(index) + self.pids = list(self.index_dic.keys()) + # estimate number of examples in an epoch + self.length = 0 + for pid in self.pids: + idxs = self.index_dic[pid] + num = len(idxs) + if num < self.num_instances: + num = self.num_instances + self.length += num - num % self.num_instances + + def _prepare_batch(self): + batch_idxs_dict = defaultdict(list) + count = [] + for pid in self.pids: + idxs = copy.deepcopy(self.index_dic[pid]) + if len(idxs) < self.num_instances: + idxs = np.random.choice( + idxs, size=self.num_instances, replace=True) + random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(idx) + if len(batch_idxs) == self.num_instances: + batch_idxs_dict[pid].append(batch_idxs) + batch_idxs = [] + count = [len(batch_idxs_dict[pid]) for pid in self.pids] + count = np.array(count) + avai_pids = copy.deepcopy(self.pids) + return batch_idxs_dict, avai_pids, count + + def __iter__(self): + # prepare + batch_idxs_dict, avai_pids, count = self._prepare_batch() + + # sample + if self.max_iters is not None: + for _ in range(self.max_iters): + final_idxs = [] + if len(avai_pids) < self.num_pids_per_batch: + batch_idxs_dict, avai_pids, count = self._prepare_batch() + + selected_pids = 
np.random.choice(avai_pids, + self.num_pids_per_batch, + False, count / count.sum()) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + pid_idx = avai_pids.index(pid) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.pop(pid_idx) + count = np.delete(count, pid_idx) + else: + count[pid_idx] = len(batch_idxs_dict[pid]) + yield final_idxs + else: + final_idxs = [] + while len(avai_pids) >= self.num_pids_per_batch: + selected_pids = random.sample(avai_pids, + self.num_pids_per_batch) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.remove(pid) + _sample_iter = iter(final_idxs) + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + if self.max_iters is not None: + return self.max_iters + elif self.drop_last: + return self.length // self.batch_size + else: + return (self.length + self.batch_size - 1) // self.batch_size diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/__init__.py new file mode 100644 index 000000000..c919a52bc --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/__init__.py @@ -0,0 +1,16 @@ +from ppcls.data.dataloader.imagenet_dataset import ImageNetDataset +from ppcls.data.dataloader.multilabel_dataset import MultiLabelDataset +from ppcls.data.dataloader.common_dataset import create_operators +from ppcls.data.dataloader.vehicle_dataset import CompCars, VeriWild +from ppcls.data.dataloader.logo_dataset import LogoDataset +from ppcls.data.dataloader.icartoon_dataset import ICartoonDataset +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data.dataloader.multi_scale_dataset import MultiScaleDataset +from ppcls.data.dataloader.mix_sampler import MixSampler +from ppcls.data.dataloader.multi_scale_sampler import MultiScaleSampler +from ppcls.data.dataloader.pk_sampler import PKSampler +from ppcls.data.dataloader.person_dataset import Market1501, MSMT17, DukeMTMC +from ppcls.data.dataloader.face_dataset import FaceEvalDataset, FiveFaceEvalDataset +from ppcls.data.dataloader.custom_label_dataset import CustomLabelDataset +from ppcls.data.dataloader.cifar import Cifar10, Cifar100 +from ppcls.data.dataloader.metabin_sampler import DomainShuffleBatchSampler, NaiveIdentityBatchSampler diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/cifar.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/cifar.py new file mode 100644 index 000000000..614f4f3e9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/cifar.py @@ -0,0 +1,136 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import numpy as np +import cv2 +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.data.dataloader.common_dataset import create_operators +from paddle.vision.datasets import Cifar10 as Cifar10_paddle +from paddle.vision.datasets import Cifar100 as Cifar100_paddle + + +class Cifar10(Cifar10_paddle): + def __init__(self, + data_file=None, + mode='train', + download=True, + backend='cv2', + sample_per_label=None, + expand_labels=1, + transform_ops=None, + transform_ops_weak=None, + transform_ops_strong=None, + transform_ops_strong2=None): + super().__init__(data_file, mode, None, download, backend) + assert isinstance(expand_labels, int) + self._transform_ops = create_operators(transform_ops) + self._transform_ops_weak = create_operators(transform_ops_weak) + self._transform_ops_strong = create_operators(transform_ops_strong) + self._transform_ops_strong2 = create_operators(transform_ops_strong2) + self.class_num = 10 + labels = [] + for x in self.data: + labels.append(x[1]) + labels = np.array(labels) + if isinstance(sample_per_label, int): + index = [] + for i in range(self.class_num): + idx = np.where(labels == i)[0] + idx = np.random.choice(idx, sample_per_label, False) + index.extend(idx) + index = index * expand_labels + data = [self.data[x] for x in index] + self.data = data + + def __getitem__(self, idx): + (image, label) = super().__getitem__(idx) + if self._transform_ops: + image1 = transform(image, self._transform_ops) + image1 = image1.transpose((2, 0, 1)) + return (image1, np.int64(label)) + elif self._transform_ops_weak and self._transform_ops_strong and self._transform_ops_strong2: + image2 = transform(image, self._transform_ops_weak) + image2 = image2.transpose((2, 0, 1)) + image3 = transform(image, self._transform_ops_strong) + image3 = image3.transpose((2, 0, 1)) + image4 = transform(image, self._transform_ops_strong2) + image4 = image4.transpose((2, 0, 1)) + return (image2, image3, image4, np.int64(label)) + + elif self._transform_ops_weak and self._transform_ops_strong: + image2 = transform(image, self._transform_ops_weak) + image2 = image2.transpose((2, 0, 1)) + image3 = transform(image, self._transform_ops_strong) + image3 = image3.transpose((2, 0, 1)) + + return (image2, image3, np.int64(label)) + + +class Cifar100(Cifar100_paddle): + def __init__(self, + data_file=None, + mode='train', + download=True, + backend='pil', + sample_per_label=None, + expand_labels=1, + transform_ops=None, + transform_ops_weak=None, + transform_ops_strong=None, + transform_ops_strong2=None): + super().__init__(data_file, mode, None, download, backend) + assert isinstance(expand_labels, int) + self._transform_ops = create_operators(transform_ops) + self._transform_ops_weak = create_operators(transform_ops_weak) + self._transform_ops_strong = create_operators(transform_ops_strong) + self._transform_ops_strong2 = create_operators(transform_ops_strong2) + self.class_num = 100 + + labels = [] + for x in self.data: + labels.append(x[1]) + labels = np.array(labels) + if isinstance(sample_per_label, int): + index = [] + for i in range(self.class_num): + idx = np.where(labels == i)[0] + idx = np.random.choice(idx, sample_per_label, False) + index.extend(idx) + index = index * expand_labels + data = [self.data[x] for x in index] + self.data = data + + def __getitem__(self, idx): + (image, label) = 
super().__getitem__(idx) + if self._transform_ops: + image1 = transform(image, self._transform_ops) + image1 = image1.transpose((2, 0, 1)) + return (image1, np.int64(label)) + elif self._transform_ops_weak and self._transform_ops_strong and self._transform_ops_strong2: + image2 = transform(image, self._transform_ops_weak) + image2 = image2.transpose((2, 0, 1)) + image3 = transform(image, self._transform_ops_strong) + image3 = image3.transpose((2, 0, 1)) + image4 = transform(image, self._transform_ops_strong2) + image4 = image4.transpose((2, 0, 1)) + return (image2, image3, image4, np.int64(label)) + elif self._transform_ops_weak and self._transform_ops_strong: + image2 = transform(image, self._transform_ops_weak) + image2 = image2.transpose((2, 0, 1)) + image3 = transform(image, self._transform_ops_strong) + image3 = image3.transpose((2, 0, 1)) + + return (image2, image3, np.int64(label)) \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/common_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/common_dataset.py new file mode 100644 index 000000000..7530137eb --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/common_dataset.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np + +from paddle.io import Dataset +import cv2 + +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.utils import logger + + +def create_operators(params): + """ + create operators based on the config + Args: + params(list): a dict list, used to create some operators + """ + if params is None: + return None + assert isinstance(params, list), ('operator config should be a list') + ops = [] + for operator in params: + assert isinstance(operator, + dict) and len(operator) == 1, "yaml format error" + op_name = list(operator)[0] + param = {} if operator[op_name] is None else operator[op_name] + op = getattr(preprocess, op_name)(**param) + ops.append(op) + + return ops + + +class CommonDataset(Dataset): + def __init__(self, + image_root, + cls_label_path, + transform_ops=None, + label_ratio=False): + self._img_root = image_root + self._cls_path = cls_label_path + self._transform_ops = create_operators(transform_ops) + + self.images = [] + self.labels = [] + if label_ratio: + self.label_ratio = self._load_anno(label_ratio=label_ratio) + else: + self._load_anno() + + def _load_anno(self): + pass + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx]) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". 
+ format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/custom_label_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/custom_label_dataset.py new file mode 100644 index 000000000..eeae8ad09 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/custom_label_dataset.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import numpy as np + +from ppcls.data.preprocess import transform +from ppcls.utils import logger + +from .common_dataset import CommonDataset + + +class CustomLabelDataset(CommonDataset): + """CustomLabelDataset + + Args: + image_root (str): image root, path to `ILSVRC2012` + sample_list_path (str): path to the file with samples listed. + transform_ops (list, optional): list of transform op(s). Defaults to None. + label_key (str, optional): Defaults to None. + delimiter (str, optional): delimiter. Defaults to None. + """ + + def __init__(self, + image_root, + sample_list_path, + transform_ops=None, + label_key=None, + delimiter=None): + self.delimiter = delimiter + super().__init__(image_root, sample_list_path, transform_ops) + if self._transform_ops is None and label_key is not None: + label_key = None + msg = "Unable to get label by label_key when transform_ops is None. The label_key has been set to None." + logger.warning(msg) + self.label_key = label_key + + def _load_anno(self, seed=None): + assert os.path.exists( + self._cls_path), f"path {self._cls_path} does not exist." + assert os.path.exists( + self._img_root), f"path {self._img_root} does not exist." + self.images = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for line in lines: + line = line.strip() + if self.delimiter is not None: + line = line.split(self.delimiter)[0] + self.images.append(os.path.join(self._img_root, line)) + assert os.path.exists(self.images[ + -1]), f"path {self.images[-1]} does not exist." + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + processed_sample = transform({"img": img}, self._transform_ops) + img = processed_sample["img"].transpose((2, 0, 1)) + if self.label_key is not None: + label = processed_sample[self.label_key] + sample = (img, label) + return sample + return (img) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". 
+ format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/dali.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/dali.py new file mode 100644 index 000000000..9720262c6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/dali.py @@ -0,0 +1,795 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import copy +import os +from collections import defaultdict +from typing import Any, Callable, Dict, List, Tuple, Union, Optional + +import numpy as np +import nvidia.dali.fn as fn +import nvidia.dali.ops as ops +import nvidia.dali.pipeline as pipeline +import nvidia.dali.types as types +import paddle +from nvidia.dali.plugin.paddle import DALIGenericIterator +from nvidia.dali.plugin.base_iterator import LastBatchPolicy +from ppcls.data.preprocess.ops.dali_operators import ColorJitter +from ppcls.data.preprocess.ops.dali_operators import CropImage +from ppcls.data.preprocess.ops.dali_operators import CropMirrorNormalize +from ppcls.data.preprocess.ops.dali_operators import DecodeImage +from ppcls.data.preprocess.ops.dali_operators import DecodeRandomResizedCrop +from ppcls.data.preprocess.ops.dali_operators import NormalizeImage +from ppcls.data.preprocess.ops.dali_operators import Pad +from ppcls.data.preprocess.ops.dali_operators import RandCropImage +from ppcls.data.preprocess.ops.dali_operators import RandCropImageV2 +from ppcls.data.preprocess.ops.dali_operators import RandFlipImage +from ppcls.data.preprocess.ops.dali_operators import RandomCropImage +from ppcls.data.preprocess.ops.dali_operators import RandomRot90 +from ppcls.data.preprocess.ops.dali_operators import RandomRotation +from ppcls.data.preprocess.ops.dali_operators import ResizeImage +from ppcls.data.preprocess.ops.dali_operators import ToCHWImage +from ppcls.engine.train.utils import type_name +from ppcls.utils import logger + +INTERP_MAP = { + "nearest": types.DALIInterpType.INTERP_NN, # cv2.INTER_NEAREST + "bilinear": types.DALIInterpType.INTERP_LINEAR, # cv2.INTER_LINEAR + "bicubic": types.DALIInterpType.INTERP_CUBIC, # cv2.INTER_CUBIC + "lanczos": types.DALIInterpType.INTERP_LANCZOS3, # cv2.INTER_LANCZOS4 +} + + +def make_pair(x: Union[Any, Tuple[Any], List[Any]]) -> Tuple[Any]: + """repeat input x to be an tuple if x is an single element, else return x directly + + Args: + x (Union[Any, Tuple[Any], List[Any]]): input x + + Returns: + Tuple[Any]: tupled input + """ + return x if isinstance(x, (tuple, list)) else (x, x) + + +def parse_value_with_key(content: Union[Dict, List[Dict]], + key: str) -> Union[None, Any]: + """parse value according to given key recursively, return None if not found + + Args: + content (Union[Dict, List[Dict]]): content to be parsed + key (str): given key + + Returns: + Union[None, Any]: result + """ + if 
isinstance(content, dict): + if key in content: + return content[key] + for content_ in content.values(): + value = parse_value_with_key(content_, key) + if value is not None: + return value + elif isinstance(content, (tuple, list)): + for content_ in content: + value = parse_value_with_key(content_, key) + if value is not None: + return value + return None + + +def convert_cfg_to_dali(op_name: str, device: str, **op_cfg) -> Dict[str, Any]: + """convert original preprocess op params into DALI-based op params + + Args: + op_name (str): name of operator + device (str): device which operator applied on + + Returns: + Dict[str, Any]: converted arguments for DALI initialization + """ + assert device in ["cpu", "gpu" + ], f"device({device}) must in [\"cpu\", \"gpu\"]" + dali_op_cfg = {} + if op_name == "DecodeImage": + device = "cpu" if device == "cpu" else "mixed" + to_rgb = op_cfg.get("to_rgb", True) + channel_first = op_cfg.get("channel_first", False) + assert channel_first is False, \ + f"`channel_first` must set to False when using DALI, but got {channel_first}" + dali_op_cfg.update({"device": device}) + dali_op_cfg.update({ + "output_type": types.DALIImageType.RGB + if to_rgb else types.DALIImageType.BGR + }) + dali_op_cfg.update({ + "device_memory_padding": + op_cfg.get("device_memory_padding", 211025920) + }) + dali_op_cfg.update({ + "host_memory_padding": op_cfg.get("host_memory_padding", 140544512) + }) + elif op_name == "ResizeImage": + size = op_cfg.get("size", None) + resize_short = op_cfg.get("resize_short", None) + interpolation = op_cfg.get("interpolation", None) + if size is not None: + size = make_pair(size) + dali_op_cfg.update({"resize_y": size[0], "resize_x": size[1]}) + if resize_short is not None: + dali_op_cfg.update({"resize_shorter": resize_short}) + if interpolation is not None: + dali_op_cfg.update({"interp_type": INTERP_MAP[interpolation]}) + elif op_name == "CropImage": + size = op_cfg.get("size", 224) + size = make_pair(size) + dali_op_cfg.update({"crop_h": size[1], "crop_w": size[0]}) + dali_op_cfg.update({"crop_pos_x": 0.5, "crop_pos_y": 0.5}) + elif op_name == "RandomCropImage": + size = op_cfg.get("size", 224) + if size is not None: + size = make_pair(size) + dali_op_cfg.update({"crop_h": size[1], "crop_w": size[0]}) + elif op_name == "RandCropImage": + size = op_cfg.get("size", 224) + size = make_pair(size) + scale = op_cfg.get("scale", [0.08, 1.0]) + ratio = op_cfg.get("ratio", [3.0 / 4, 4.0 / 3]) + interpolation = op_cfg.get("interpolation", "bilinear") + dali_op_cfg.update({"size": size}) + if scale is not None: + dali_op_cfg.update({"random_area": scale}) + if ratio is not None: + dali_op_cfg.update({"random_aspect_ratio": ratio}) + if interpolation is not None: + dali_op_cfg.update({"interp_type": INTERP_MAP[interpolation]}) + elif op_name == "RandCropImageV2": + size = op_cfg.get("size", 224) + size = make_pair(size) + dali_op_cfg.update({"crop_h": size[1], "crop_w": size[0]}) + elif op_name == "RandFlipImage": + prob = op_cfg.get("prob", 0.5) + flip_code = op_cfg.get("flip_code", 1) + dali_op_cfg.update({"prob": prob}) + dali_op_cfg.update({"flip_code": flip_code}) + elif op_name == "NormalizeImage": + # scale * (in - mean) / stddev + shift + scale = op_cfg.get("scale", 1.0 / 255.0) + if isinstance(scale, str): + scale = eval(scale) + mean = op_cfg.get("mean", [0.485, 0.456, 0.406]) + std = op_cfg.get("std", [0.229, 0.224, 0.225]) + mean = [v / scale for v in mean] + std = [v / scale for v in std] + order = op_cfg.get("order", "chw") + channel_num = 
op_cfg.get("channel_num", 3) + output_fp16 = op_cfg.get("output_fp16", False) + dali_op_cfg.update({ + "mean": np.reshape( + np.array( + mean, dtype="float32"), [channel_num, 1, 1] + if order == "chw" else [1, 1, channel_num]) + }) + dali_op_cfg.update({ + "stddev": np.reshape( + np.array( + std, dtype="float32"), [channel_num, 1, 1] + if order == "chw" else [1, 1, channel_num]) + }) + if output_fp16: + dali_op_cfg.update({"dtype": types.FLOAT16}) + elif op_name == "ToCHWImage": + dali_op_cfg.update({"perm": [2, 0, 1]}) + elif op_name == "ColorJitter": + prob = op_cfg.get("prob", 1.0) + brightness = op_cfg.get("brightness", 0.0) + contrast = op_cfg.get("contrast", 0.0) + saturation = op_cfg.get("saturation", 0.0) + hue = op_cfg.get("hue", 0.0) + dali_op_cfg.update({"prob": prob}) + dali_op_cfg.update({"brightness_factor": brightness}) + dali_op_cfg.update({"contrast_factor": contrast}) + dali_op_cfg.update({"saturation_factor": saturation}) + dali_op_cfg.update({"hue_factor": hue}) + elif op_name == "RandomRotation": + prob = op_cfg.get("prob", 0.5) + degrees = op_cfg.get("degrees", 90) + interpolation = op_cfg.get("interpolation", "bilinear") + dali_op_cfg.update({"prob": prob}) + dali_op_cfg.update({"angle": degrees}) + dali_op_cfg.update({"interp_type": INTERP_MAP[interpolation]}) + elif op_name == "Pad": + size = op_cfg.get("size", 224) + size = make_pair(size) + padding = op_cfg.get("padding", 0) + fill = op_cfg.get("fill", 0) + dali_op_cfg.update({ + "crop_h": padding + size[1] + padding, + "crop_w": padding + size[0] + padding + }) + dali_op_cfg.update({"fill_values": fill}) + dali_op_cfg.update({"out_of_bounds_policy": "pad"}) + elif op_name == "RandomRot90": + interpolation = op_cfg.get("interpolation", "nearest") + elif op_name == "DecodeRandomResizedCrop": + device = "cpu" if device == "cpu" else "mixed" + output_type = op_cfg.get("output_type", types.DALIImageType.RGB) + device_memory_padding = op_cfg.get("device_memory_padding", 211025920) + host_memory_padding = op_cfg.get("host_memory_padding", 140544512) + scale = op_cfg.get("scale", [0.08, 1.0]) + ratio = op_cfg.get("ratio", [3.0 / 4, 4.0 / 3]) + num_attempts = op_cfg.get("num_attempts", 100) + size = op_cfg.get("size", 224) + dali_op_cfg.update({"device": device}) + if output_type is not None: + dali_op_cfg.update({"output_type": output_type}) + if device_memory_padding is not None: + dali_op_cfg.update({ + "device_memory_padding": device_memory_padding + }) + if host_memory_padding is not None: + dali_op_cfg.update({"host_memory_padding": host_memory_padding}) + if scale is not None: + dali_op_cfg.update({"random_area": scale}) + if ratio is not None: + dali_op_cfg.update({"random_aspect_ratio": ratio}) + if num_attempts is not None: + dali_op_cfg.update({"num_attempts": num_attempts}) + if size is not None: + dali_op_cfg.update({"resize_x": size, "resize_y": size}) + elif op_name == "CropMirrorNormalize": + dtype = types.FLOAT16 if op_cfg.get("output_fp16", + False) else types.FLOAT + output_layout = op_cfg.get("output_layout", "CHW") + size = op_cfg.get("size", None) + scale = op_cfg.get("scale", 1 / 255.0) + if isinstance(scale, str): + scale = eval(scale) + mean = op_cfg.get("mean", [0.485, 0.456, 0.406]) + mean = [v / scale for v in mean] + std = op_cfg.get("std", [0.229, 0.224, 0.225]) + std = [v / scale for v in std] + pad_output = op_cfg.get("channel_num", 3) == 4 + prob = op_cfg.get("prob", 0.5) + dali_op_cfg.update({"dtype": dtype}) + if output_layout is not None: + dali_op_cfg.update({"output_layout": 
output_layout}) + if size is not None: + dali_op_cfg.update({"crop": (size, size)}) + if mean is not None: + dali_op_cfg.update({"mean": mean}) + if std is not None: + dali_op_cfg.update({"std": std}) + if pad_output is not None: + dali_op_cfg.update({"pad_output": pad_output}) + if prob is not None: + dali_op_cfg.update({"prob": prob}) + else: + raise ValueError( + f"DALI operator \"{op_name}\" in PaddleClas is not implemented now. please refer to docs/zh_CN/training/config_description/develop_with_DALI.md" + ) + if "device" not in dali_op_cfg: + dali_op_cfg.update({"device": device}) + return dali_op_cfg + + +def build_dali_transforms(op_cfg_list: List[Dict[str, Any]], + mode: str, + device: str="gpu", + enable_fuse: bool=True) -> List[Callable]: + """create dali operators based on the config + Args: + op_cfg_list (List[Dict[str, Any]]): a dict list, used to create some operators, such as config below + -------------------------------- + - DecodeImage: + to_rgb: True + channel_first: False + - ResizeImage: + size: 224 + - NormalizeImage: + scale: 0.00392157 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: "" + -------------------------------- + mode (str): mode. + device (str): device which dali operator(s) applied in. Defaults to "gpu". + enable_fuse (bool): whether to use fused dali operators instead of single operators, such as DecodeRandomResizedCrop. Defaults to True. + Returns: + List[Callable]: Callable DALI operators in list. + """ + assert isinstance(op_cfg_list, list), "operator config should be a list" + # build dali transforms list + + dali_op_list = [] + idx = 0 + num_cfg_node = len(op_cfg_list) + while idx < num_cfg_node: + op_cfg = op_cfg_list[idx] + op_name = list(op_cfg)[0] + op_param = {} if op_cfg[op_name] is None else copy.deepcopy(op_cfg[ + op_name]) + fused_success = False + if enable_fuse: + # fuse operators if enabled + if idx + 1 < num_cfg_node: + op_name_nxt = list(op_cfg_list[idx + 1])[0] + if (op_name == "DecodeImage" and + op_name_nxt == "RandCropImage"): + fused_op_name = "DecodeRandomResizedCrop" + fused_op_param = convert_cfg_to_dali( + fused_op_name, device, **{ + ** op_param, ** (op_cfg_list[idx + 1][op_name_nxt]) + }) + fused_dali_op = eval(fused_op_name)(**fused_op_param) + idx += 2 + dali_op_list.append(fused_dali_op) + fused_success = True + logger.info( + f"DALI fused Operator conversion({mode}): [DecodeImage, RandCropImage] -> {type_name(dali_op_list[-1])}: {fused_op_param}" + ) + if not fused_success and 0 < idx and idx + 1 < num_cfg_node: + op_name_pre = list(op_cfg_list[idx - 1])[0] + op_name_nxt = list(op_cfg_list[idx + 1])[0] + if (op_name_pre == "RandCropImage" and + op_name == "RandFlipImage" and + op_name_nxt == "NormalizeImage"): + fused_op_name = "CropMirrorNormalize" + fused_op_param = convert_cfg_to_dali( + fused_op_name, device, **{ + ** op_param, ** + (op_cfg_list[idx - 1][op_name_pre]), ** + (op_cfg_list[idx + 1][op_name_nxt]) + }) + fused_dali_op = eval(fused_op_name)(**fused_op_param) + idx += 2 + dali_op_list.append(fused_dali_op) + fused_success = True + logger.info( + f"DALI fused Operator conversion({mode}): [RandCropImage, RandFlipImage, NormalizeImage] -> {type_name(dali_op_list[-1])}: {fused_op_param}" + ) + if not fused_success and idx + 1 < num_cfg_node: + op_name_nxt = list(op_cfg_list[idx + 1])[0] + if (op_name == "CropImage" and + op_name_nxt == "NormalizeImage"): + fused_op_name = "CropMirrorNormalize" + fused_op_param = convert_cfg_to_dali( + fused_op_name, device, **{ + ** + op_param, + ** + 
(op_cfg_list[idx + 1][op_name_nxt]), + "prob": 0.0 + }) + fused_dali_op = eval(fused_op_name)(**fused_op_param) + idx += 2 + dali_op_list.append(fused_dali_op) + fused_success = True + logger.info( + f"DALI fused Operator conversion({mode}): [CropImage, NormalizeImage] -> {type_name(dali_op_list[-1])}: {fused_op_param}" + ) + if not enable_fuse or not fused_success: + assert isinstance(op_cfg, + dict) and len(op_cfg) == 1, "yaml format error" + if op_name == "Pad": + # NOTE: Argument `size` must be provided for DALI operator + op_param.update({ + "size": parse_value_with_key(op_cfg_list[:idx], "size") + }) + dali_param = convert_cfg_to_dali(op_name, device, **op_param) + dali_op = eval(op_name)(**dali_param) + dali_op_list.append(dali_op) + idx += 1 + logger.info( + f"DALI Operator conversion({mode}): {op_name} -> {type_name(dali_op_list[-1])}: {dali_param}" + ) + return dali_op_list + + +class ExternalSource_RandomIdentity(object): + """PKsampler implemented with ExternalSource + + Args: + batch_size (int): batch size + sample_per_id (int): number of instance(s) within an class + device_id (int): device id + shard_id (int): shard id + num_gpus (int): number of gpus + image_root (str): image root directory + cls_label_path (str): path to annotation file, such as `train_list.txt` or `val_list.txt` + delimiter (Optional[str], optional): delimiter. Defaults to None. + relabel (bool, optional): whether do relabel when original label do not starts from 0 or are discontinuous. Defaults to False. + sample_method (str, optional): sample method when generating prob_list. Defaults to "sample_avg_prob". + id_list (List[int], optional): list of (start_id, end_id, start_id, end_id) for set of ids to duplicated. Defaults to None. + ratio (List[Union[int, float]], optional): list of (ratio1, ratio2..) the duplication number for ids in id_list. Defaults to None. + shuffle (bool): whether to shuffle label list. Defaults to True. + """ + + def __init__(self, + batch_size: int, + sample_per_id: int, + device_id: int, + shard_id: int, + num_gpus: int, + image_root: str, + cls_label_path: str, + delimiter: Optional[str]=None, + relabel: bool=False, + sample_method: str="sample_avg_prob", + id_list: List[int]=None, + ratio: List[Union[int, float]]=None, + shuffle: bool=True): + self.batch_size = batch_size + self.sample_per_id = sample_per_id + self.label_per_batch = self.batch_size // self.sample_per_id + self.device_id = device_id + self.shard_id = shard_id + self.num_gpus = num_gpus + self._img_root = image_root + self._cls_path = cls_label_path + self.delimiter = delimiter if delimiter is not None else " " + self.relabel = relabel + self.sample_method = sample_method + self.image_paths = [] + self.labels = [] + self.epoch = 0 + + # NOTE: code from ImageNetDataset below + with open(self._cls_path, "r") as fd: + lines = fd.readlines() + if self.relabel: + label_set = set() + for line in lines: + line = line.strip().split(self.delimiter) + label_set.add(np.int64(line[1])) + label_map = { + oldlabel: newlabel + for newlabel, oldlabel in enumerate(label_set) + } + + for line in lines: + line = line.strip().split(self.delimiter) + self.image_paths.append(os.path.join(self._img_root, line[0])) + if self.relabel: + self.labels.append(label_map[np.int64(line[1])]) + else: + self.labels.append(np.int64(line[1])) + assert os.path.exists(self.image_paths[ + -1]), f"path {self.image_paths[-1]} does not exist." 
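For orientation, a hedged sketch of what the fusion loop in build_dali_transforms above does to a typical training transform list; the config values below are hypothetical and the snippet assumes it runs inside this module, where the wrapped DALI operators (DecodeRandomResizedCrop, CropMirrorNormalize, ...) are defined and NVIDIA DALI is installed.

# Hypothetical transform list, as it would appear under
# DataLoader.Train.dataset.transform_ops in a PaddleClas-style config.
train_transform_ops = [
    {"DecodeImage": {"to_rgb": True, "channel_first": False}},
    {"RandCropImage": {"size": 224}},
    {"RandFlipImage": {"flip_code": 1}},
    {"NormalizeImage": {"scale": 1.0 / 255.0,
                        "mean": [0.485, 0.456, 0.406],
                        "std": [0.229, 0.224, 0.225],
                        "order": ""}},
]

# With enable_fuse=True the loop above folds
#   [DecodeImage, RandCropImage]                   -> DecodeRandomResizedCrop
#   [RandCropImage, RandFlipImage, NormalizeImage] -> CropMirrorNormalize
# so the four config nodes become two DALI operators.
dali_ops = build_dali_transforms(
    train_transform_ops, mode="Train", device="gpu", enable_fuse=True)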
+ + # NOTE: code from PKSampler below + # group sample indexes into their label bucket + self.label_dict = defaultdict(list) + for idx, label in enumerate(self.labels): + self.label_dict[label].append(idx) + # get all label + self.label_list = list(self.label_dict) + assert len(self.label_list) * self.sample_per_id >= self.batch_size, \ + f"batch size({self.batch_size}) should not be bigger than than #classes({len(self.label_list)})*sample_per_id({self.sample_per_id})" + + if self.sample_method == "id_avg_prob": + self.prob_list = np.array([1 / len(self.label_list)] * + len(self.label_list)) + elif self.sample_method == "sample_avg_prob": + counter = [] + for label_i in self.label_list: + counter.append(len(self.label_dict[label_i])) + self.prob_list = np.array(counter) / sum(counter) + + # reweight prob_list according to id_list and ratio if provided + if id_list and ratio: + assert len(id_list) % 2 == 0 and len(id_list) == len(ratio) * 2 + for i in range(len(self.prob_list)): + for j in range(len(ratio)): + if i >= id_list[j * 2] and i <= id_list[j * 2 + 1]: + self.prob_list[i] = self.prob_list[i] * ratio[j] + break + self.prob_list = self.prob_list / sum(self.prob_list) + + assert os.path.exists( + self._cls_path), f"path {self._cls_path} does not exist." + assert os.path.exists( + self._img_root), f"path {self._img_root} does not exist." + + diff = np.abs(sum(self.prob_list) - 1) + if diff > 0.00000001: + self.prob_list[-1] = 1 - sum(self.prob_list[:-1]) + if self.prob_list[-1] > 1 or self.prob_list[-1] < 0: + logger.error("PKSampler prob list error") + else: + logger.info( + "sum of prob list not equal to 1, diff is {}, change the last prob". + format(diff)) + + # whole dataset size + self.data_set_len = len(self.image_paths) + + # get sharded size + self.sharded_data_set_len = self.data_set_len // self.num_gpus + + # iteration log + self.shuffle = shuffle + self.total_iter = self.sharded_data_set_len // batch_size + self.iter_count = 0 + + def __iter__(self): + if self.shuffle: + seed = self.shard_id * 12345 + self.epoch + np.random.RandomState(seed).shuffle(self.label_list) + np.random.RandomState(seed).shuffle(self.prob_list) + self.epoch += 1 + return self + + def __next__(self): + if self.iter_count >= self.total_iter: + self.__iter__() + self.iter_count = 0 + + batch_indexes = [] + for _ in range(self.sharded_data_set_len): + batch_label_list = np.random.choice( + self.label_list, + size=self.label_per_batch, + replace=False, + p=self.prob_list) + for label_i in batch_label_list: + label_i_indexes = self.label_dict[label_i] + if self.sample_per_id <= len(label_i_indexes): + batch_indexes.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_id, + replace=False)) + else: + batch_indexes.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_id, + replace=True)) + if len(batch_indexes) == self.batch_size: + break + batch_indexes = [] + + batch_raw_images = [] + batch_labels = [] + for index in batch_indexes: + batch_raw_images.append( + np.fromfile( + self.image_paths[index], dtype="uint8")) + batch_labels.append(self.labels[index]) + + self.iter_count += 1 + return (batch_raw_images, np.array(batch_labels, dtype="int64")) + + def __len__(self): + return self.sharded_data_set_len + + +class HybridPipeline(pipeline.Pipeline): + """Hybrid Pipeline + + Args: + device (str): device + batch_size (int): batch size + py_num_workers (int): number of python worker(s) + num_threads (int): number of thread(s) + device_id (int): device id + seed (int): random 
seed + file_root (str): file root path + file_list (str): path to annotation file, such as `train_list.txt` or `val_list.txt` + transform_list (List[Callable]): List of DALI transform operator(s) + shard_id (int, optional): shard id. Defaults to 0. + num_shards (int, optional): number of shard(s). Defaults to 1. + random_shuffle (bool, optional): whether shuffle data during training. Defaults to True. + ext_src (optional): custom external source. Defaults to None. + """ + + def __init__(self, + device: str, + batch_size: int, + py_num_workers: int, + num_threads: int, + device_id: int, + seed: int, + file_root: str, + file_list: str, + transform_list: List[Callable], + shard_id: int=0, + num_shards: int=1, + random_shuffle: bool=True, + ext_src=None): + super(HybridPipeline, self).__init__( + batch_size=batch_size, + device_id=device_id, + seed=seed, + py_start_method="fork" if ext_src is None else "spawn", + py_num_workers=py_num_workers, + num_threads=num_threads) + self.device = device + self.ext_src = ext_src + if ext_src is None: + self.reader = ops.readers.File( + file_root=file_root, + file_list=file_list, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=random_shuffle) + self.transforms = ops.Compose(transform_list) + self.cast = ops.Cast(dtype=types.DALIDataType.INT64, device=device) + + def define_graph(self): + if self.ext_src: + raw_images, labels = fn.external_source( + source=self.ext_src, + num_outputs=2, + dtype=[types.DALIDataType.UINT8, types.DALIDataType.INT64], + batch=True, + parallel=True) + else: + raw_images, labels = self.reader(name="Reader") + images = self.transforms(raw_images) + return [ + images, self.cast(labels.gpu() if self.device == "gpu" else labels) + ] + + def __len__(self): + if self.ext_src is not None: + return len(self.ext_src) + return self.epoch_size(name="Reader") + + +class DALIImageNetIterator(DALIGenericIterator): + def __init__(self, *kargs, **kwargs): + super(DALIImageNetIterator, self).__init__(*kargs, **kwargs) + self.in_dynamic_mode = paddle.in_dynamic_mode() + + def __next__(self) -> List[paddle.Tensor]: + data_batch = super(DALIImageNetIterator, + self).__next__() # List[Dict[str, Tensor], ...] + # reformat to List[Tensor1, Tensor2, ...] + data_batch = [ + paddle.to_tensor(data_batch[0][key]) + if self.in_dynamic_mode else data_batch[0][key] + for key in self.output_map + ] + return data_batch + + +def dali_dataloader(config: Dict[str, Any], + mode: str, + device: str, + py_num_workers: int=1, + num_threads: int=4, + seed: Optional[int]=None, + enable_fuse: bool=True) -> DALIImageNetIterator: + """build and return HybridPipeline + + Args: + config (Dict[str, Any]): train/eval dataloader configuration + mode (str): mode + device (str): device string + py_num_workers (int, optional): number of python worker(s). Defaults to 1. + num_threads (int, optional): number of thread(s). Defaults to 4. + seed (Optional[int], optional): random seed. Defaults to None. + enable_fuse (bool, optional): enable fused operator(s). Defaults to True. 
+ + Returns: + DALIImageNetIterator: Iterable DALI dataloader + """ + assert "gpu" in device, f"device must be \"gpu\" when running with DALI, but got {device}" + config_dataloader = config[mode] + device_id = int(device.split(":")[1]) + device = "gpu" + seed = 42 if seed is None else seed + env = os.environ + num_gpus = paddle.distributed.get_world_size() + + batch_size = config_dataloader["sampler"]["batch_size"] + file_root = config_dataloader["dataset"]["image_root"] + file_list = config_dataloader["dataset"]["cls_label_path"] + sampler_name = config_dataloader["sampler"].get("name", + "DistributedBatchSampler") + transform_ops_cfg = config_dataloader["dataset"]["transform_ops"] + random_shuffle = config_dataloader["sampler"].get("shuffle", None) + dali_transforms = build_dali_transforms( + transform_ops_cfg, mode, device, enable_fuse=enable_fuse) + if "ToCHWImage" not in [type_name(op) for op in dali_transforms] and ( + "CropMirrorNormalize" not in + [type_name(op) for op in dali_transforms]): + dali_transforms.append(ToCHWImage(perm=[2, 0, 1], device=device)) + logger.info( + "Append DALI operator \"ToCHWImage\" at the end of dali_transforms for getting output in \"CHW\" shape" + ) + + if mode.lower() in ["train"]: + if "PADDLE_TRAINER_ID" in env and "PADDLE_TRAINERS_NUM" in env and "FLAGS_selected_gpus" in env: + shard_id = int(env["PADDLE_TRAINER_ID"]) + num_shards = int(env["PADDLE_TRAINERS_NUM"]) + device_id = int(env["FLAGS_selected_gpus"]) + else: + shard_id = 0 + num_shards = 1 + logger.info( + f"Building DALI {mode} pipeline with num_shards: {num_shards}, num_gpus: {num_gpus}" + ) + + random_shuffle = random_shuffle if random_shuffle is not None else True + if sampler_name in ["PKSampler", "DistributedRandomIdentitySampler"]: + ext_src = ExternalSource_RandomIdentity( + batch_size=batch_size, + sample_per_id=config_dataloader["sampler"][ + "sample_per_id" + if sampler_name == "PKSampler" else "num_instances"], + device_id=device_id, + shard_id=shard_id, + num_gpus=num_gpus, + image_root=file_root, + cls_label_path=file_list, + delimiter=None, + relabel=config_dataloader["dataset"].get("relabel", False), + sample_method=config_dataloader["sampler"].get( + "sample_method", "sample_avg_prob"), + id_list=config_dataloader["sampler"].get("id_list", None), + ratio=config_dataloader["sampler"].get("ratio", None), + shuffle=random_shuffle) + logger.info( + f"Building DALI {mode} pipeline with ext_src({type_name(ext_src)})" + ) + else: + ext_src = None + + pipe = HybridPipeline(device, batch_size, py_num_workers, num_threads, + device_id, seed + shard_id, file_root, file_list, + dali_transforms, shard_id, num_shards, + random_shuffle, ext_src) + pipe.build() + pipelines = [pipe] + if ext_src is None: + return DALIImageNetIterator( + pipelines, ["data", "label"], reader_name="Reader") + else: + return DALIImageNetIterator( + pipelines, + ["data", "label"], + size=len(ext_src), + last_batch_policy=LastBatchPolicy. 
+ DROP # make reset() successfully + ) + elif mode.lower() in ["eval", "gallery", "query"]: + assert sampler_name in ["DistributedBatchSampler"], \ + f"sampler_name({sampler_name}) must in [\"DistributedBatchSampler\"]" + if "PADDLE_TRAINER_ID" in env and "PADDLE_TRAINERS_NUM" in env and "FLAGS_selected_gpus" in env: + shard_id = int(env["PADDLE_TRAINER_ID"]) + num_shards = int(env["PADDLE_TRAINERS_NUM"]) + device_id = int(env["FLAGS_selected_gpus"]) + else: + shard_id = 0 + num_shards = 1 + logger.info( + f"Building DALI {mode} pipeline with num_shards: {num_shards}, num_gpus: {num_gpus}..." + ) + + random_shuffle = random_shuffle if random_shuffle is not None else False + pipe = HybridPipeline(device, batch_size, py_num_workers, num_threads, + device_id, seed + shard_id, file_root, file_list, + dali_transforms, shard_id, num_shards, + random_shuffle) + pipe.build() + pipelines = [pipe] + return DALIImageNetIterator( + pipelines, ["data", "label"], reader_name="Reader") + else: + raise ValueError(f"Invalid mode({mode}) when building DALI pipeline") diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/face_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/face_dataset.py new file mode 100644 index 000000000..7ec13df32 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/face_dataset.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import pickle +from paddle.io import Dataset +from .common_dataset import create_operators +from ppcls.data.preprocess import transform as transform_func + +class FaceEvalDataset(Dataset): + def __init__(self, + dataset_root, + pair_label_path, + transform_ops=None, + delimiter=None): + super().__init__() + self._dataset_root = dataset_root + self._pair_label_path = pair_label_path + self.delimiter = delimiter if delimiter is not None else " " + self._transform_ops = create_operators(transform_ops) if transform_ops \ + is not None else None + + self._load_anno() + + def _load_anno(self): + assert os.path.exists( + self._pair_label_path), f"pair label file {self._pair_label_path} does not exist" + assert os.path.exists( + self._dataset_root), f"path {self._dataset_root} does not exist." + self.image_pairs = [] + self.labels = [] + + with open(self._pair_label_path) as fd: + lines = fd.readlines() + for line in lines: + line = line.strip().split(self.delimiter) + + left_img_path = os.path.join(self._dataset_root, line[0]) + assert os.path.exists(left_img_path), \ + f"path {left_img_path} does not exist." + right_img_path = os.path.join(self._dataset_root, line[1]) + assert os.path.exists(right_img_path), \ + f"path {right_img_path} does not exist." 
+ self.image_pairs.append((left_img_path, right_img_path)) + + label = np.int64(line[2]) + assert label in [0, 1], f"label must be 0 or 1, but got {label}" + self.labels.append(label) + + def __getitem__(self, idx): + with open(self.image_pairs[idx][0], 'rb') as f: + img_left = f.read() + with open(self.image_pairs[idx][1], 'rb') as f: + img_right = f.read() + if self._transform_ops is not None: + img_left = transform_func(img_left, self._transform_ops) + img_right = transform_func(img_right, self._transform_ops) + + img_left = img_left.transpose((2, 0, 1)) + img_right = img_right.transpose((2, 0, 1)) + return img_left, img_right, self.labels[idx] + + def __len__(self): + return len(self.image_pairs) + + +class FiveFaceEvalDataset(Dataset): + dataname_to_idx = { + "agedb_30": 0, + "cfp_fp": 1, + "lfw": 2, + "cplfw": 3, + "calfw": 4 + } + def __init__(self, + val_data_path, + val_targets=['agedb_30','cfp_fp','lfw'], + transform_ops=None): + ''' + agedb_30: 0 + cfp_fp: 1 + lfw: 2 + cplfw: 3 + calfw: 4 + ''' + if isinstance(val_targets, str): + val_targets = [val_targets] + assert isinstance(val_targets, list) + assert all([x in self.dataname_to_idx.keys() for x in val_targets]), \ + f"val_targets must be in {self.dataname_to_idx.keys()}" + self._transform_ops = create_operators(transform_ops) + + # concat all dataset + all_img_buffs = [] + all_issame = [] + all_dataname_idxs = [] + for dataname in val_targets: + dataname_idx = self.dataname_to_idx[dataname] + assert os.path.exists( + os.path.join(val_data_path, dataname+".bin")), \ + f"{dataname}" f".bin not found in {val_data_path}" + with open(os.path.join(val_data_path, dataname+".bin"), 'rb') as f: + img_buffs, issame = pickle.load(f, encoding='bytes') + for i in range(0, len(img_buffs), 2): + left_buff, right_buff = img_buffs[i], img_buffs[i + 1] + if isinstance(left_buff, np.ndarray): + left_buff = left_buff.tobytes() + if isinstance(right_buff, np.ndarray): + right_buff = right_buff.tobytes() + all_img_buffs.append((left_buff, right_buff)) + all_issame.extend(list(issame)) + all_dataname_idxs.extend([dataname_idx] * len(issame)) + assert len(all_issame) == len(all_img_buffs) + + self.all_img_buffs = all_img_buffs + self.all_issame = all_issame + self.all_dataname_idxs = all_dataname_idxs + + def __getitem__(self, index): + left_buff, right_buff = self.all_img_buffs[index] + if self._transform_ops is not None: + img_left = transform_func(left_buff, self._transform_ops) + img_right = transform_func(right_buff, self._transform_ops) + img_left = img_left.transpose((2, 0, 1)) + img_right = img_right.transpose((2, 0, 1)) + + dataname_idx = self.all_dataname_idxs[index] + label = np.int64(self.all_issame[index]) + return img_left, img_right, label, dataname_idx + + def __len__(self): + return len(self.all_img_buffs) \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/icartoon_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/icartoon_dataset.py new file mode 100644 index 000000000..18e3b4b7f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/icartoon_dataset.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ICartoonDataset(CommonDataset): + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/imagenet_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/imagenet_dataset.py new file mode 100644 index 000000000..cc66007d9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/imagenet_dataset.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from .common_dataset import CommonDataset + + +class ImageNetDataset(CommonDataset): + """ImageNetDataset + + Args: + image_root (str): image root, path to `ILSVRC2012` + cls_label_path (str): path to annotation file `train_list.txt` or `val_list.txt` + transform_ops (list, optional): list of transform op(s). Defaults to None. + delimiter (str, optional): delimiter. Defaults to None. + relabel (bool, optional): whether do relabel when original label do not starts from 0 or are discontinuous. Defaults to False. + """ + + def __init__(self, + image_root, + cls_label_path, + transform_ops=None, + delimiter=None, + relabel=False): + self.delimiter = delimiter if delimiter is not None else " " + self.relabel = relabel + super(ImageNetDataset, self).__init__(image_root, cls_label_path, + transform_ops) + + def _load_anno(self, seed=None): + assert os.path.exists( + self._cls_path), f"path {self._cls_path} does not exist." + assert os.path.exists( + self._img_root), f"path {self._img_root} does not exist." 
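For reference while reading _load_anno here: a sketch of the space-delimited list file that ImageNetDataset consumes; the file names, labels, and paths below are made up.

# Hypothetical contents of cls_label_path (e.g. train_list.txt); each line is
# "<relative image path> <integer label>", one sample per line:
#
#   n01440764/n01440764_10026.JPEG 0
#   n01440764/n01440764_10027.JPEG 0
#   n01443537/n01443537_10007.JPEG 1
#
# Illustrative construction (placeholder paths):
train_dataset = ImageNetDataset(
    image_root="./dataset/ILSVRC2012/train/",
    cls_label_path="./dataset/ILSVRC2012/train_list.txt",
    transform_ops=None,   # usually filled from the DataLoader config
    delimiter=None,       # defaults to a single space
    relabel=False)        # True remaps discontinuous labels to 0..N-1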
+ self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + if self.relabel: + label_set = set() + for line in lines: + line = line.strip().split(self.delimiter) + label_set.add(np.int64(line[1])) + label_map = { + oldlabel: newlabel + for newlabel, oldlabel in enumerate(label_set) + } + + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for line in lines: + line = line.strip().split(self.delimiter) + self.images.append(os.path.join(self._img_root, line[0])) + if self.relabel: + self.labels.append(label_map[np.int64(line[1])]) + else: + self.labels.append(np.int64(line[1])) + assert os.path.exists(self.images[ + -1]), f"path {self.images[-1]} does not exist." diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/logo_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/logo_dataset.py new file mode 100644 index 000000000..132ead989 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/logo_dataset.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import io +import tarfile +import numpy as np +from PIL import Image #all use default backend + +import paddle +from paddle.io import Dataset +import pickle +import os +import cv2 +import random + +from .common_dataset import CommonDataset + + +class LogoDataset(CommonDataset): + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + if l[0] == 'image_id': + continue + self.images.append(os.path.join(self._img_root, l[3])) + self.labels.append(np.int64(l[1]) - 1) + assert os.path.exists(self.images[-1]) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/metabin_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/metabin_sampler.py new file mode 100644 index 000000000..f5cb29f75 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/metabin_sampler.py @@ -0,0 +1,290 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
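Before the MetaBIN samplers below, a hedged end-to-end sketch of the dali_dataloader entry point added earlier in this patch. It assumes NVIDIA DALI is installed and that the snippet runs where this module's symbols are importable; every literal value (paths, batch size, GPU index) is a placeholder, not a tested configuration.

dataloader_config = {
    "Train": {
        "dataset": {
            "name": "ImageNetDataset",
            "image_root": "./dataset/ILSVRC2012/",
            "cls_label_path": "./dataset/ILSVRC2012/train_list.txt",
            "transform_ops": [
                {"DecodeImage": {"to_rgb": True, "channel_first": False}},
                {"RandCropImage": {"size": 224}},
                {"RandFlipImage": {"flip_code": 1}},
                {"NormalizeImage": {"scale": 1.0 / 255.0,
                                    "mean": [0.485, 0.456, 0.406],
                                    "std": [0.229, 0.224, 0.225],
                                    "order": ""}},
            ],
        },
        "sampler": {
            "name": "DistributedBatchSampler",
            "batch_size": 64,
            "shuffle": True,
        },
    },
}

# Builds a HybridPipeline on GPU 0 and wraps it in a DALIImageNetIterator
# that yields [images, labels] batches.
train_iter = dali_dataloader(dataloader_config, mode="Train", device="gpu:0")
for images, labels in train_iter:
    break  # take one batch just to show the iteration protocol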
+ +import copy +import itertools +from collections import defaultdict +import numpy as np +from paddle.io import Sampler, BatchSampler + + +class DomainShuffleSampler(Sampler): + """ + Domain shuffle sampler + Args: + dataset(Dataset): Dataset for sampling + batch_size (int): Number of examples in a batch. + num_instances (int): Number of instances per identity in a batch. + camera_to_domain (bool): If True, consider each camera as an individual domain + + Code was heavily based on https://github.com/bismex/MetaBIN + reference: https://arxiv.org/abs/2011.14670v2 + """ + + def __init__(self, + dataset, + batch_size, + num_instances, + camera_to_domain=True): + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.num_pids_per_batch = batch_size // self.num_instances + + self.index_pid = defaultdict(list) + self.pid_domain = defaultdict(list) + self.pid_index = defaultdict(list) + # data_source: [(img_path, pid, camera, domain), ...] (camera_to_domain = True) + if camera_to_domain: + data_source = zip(dataset.images, dataset.labels, dataset.cameras, + dataset.cameras) + else: + data_source = zip(dataset.images, dataset.labels, dataset.cameras, + dataset.domains) + for index, info in enumerate(data_source): + domainid = info[3] + if camera_to_domain: + pid = 'p' + str(info[1]) + '_d' + str(domainid) + else: + pid = 'p' + str(info[1]) + self.index_pid[index] = pid + self.pid_domain[pid] = domainid + self.pid_index[pid].append(index) + + self.pids = list(self.pid_index.keys()) + self.domains = list(self.pid_domain.values()) + + self.num_identities = len(self.pids) + self.num_domains = len(set(self.domains)) + + self.batch_size //= self.num_domains + self.num_pids_per_batch //= self.num_domains + + val_pid_index = [len(x) for x in self.pid_index.values()] + + val_pid_index_upper = [] + for x in val_pid_index: + v_remain = x % self.num_instances + if v_remain == 0: + val_pid_index_upper.append(x) + else: + val_pid_index_upper.append(x - v_remain + self.num_instances) + + cnt_domains = [0 for x in range(self.num_domains)] + for val, index in zip(val_pid_index_upper, self.domains): + cnt_domains[index] += val + self.max_cnt_domains = max(cnt_domains) + self.total_images = self.num_domains * ( + self.max_cnt_domains - + (self.max_cnt_domains % self.batch_size) - self.batch_size) + + def _get_epoch_indices(self): + def _get_batch_idxs(pids, pid_index, num_instances): + batch_idxs_dict = defaultdict(list) + for pid in pids: + idxs = copy.deepcopy(pid_index[pid]) + if len( + idxs + ) < self.num_instances: # if idxs is smaller than num_instance, choice redundantly + idxs = np.random.choice( + idxs, size=self.num_instances, replace=True) + elif (len(idxs) % self.num_instances) != 0: + idxs.extend( + np.random.choice( + idxs, + size=self.num_instances - len(idxs) % + self.num_instances, + replace=False)) + + np.random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(int(idx)) + if len(batch_idxs) == num_instances: + batch_idxs_dict[pid].append(batch_idxs) + batch_idxs = [] + return batch_idxs_dict + + batch_idxs_dict = _get_batch_idxs(self.pids, self.pid_index, + self.num_instances) + + # batch_idxs_dict: dictionary, len(batch_idxs_dict) is len(pidx), each pidx, num_instance x k samples + avai_pids = copy.deepcopy(self.pids) + + local_avai_pids = \ + [[pids for pids, idx in zip(avai_pids, self.domains) if idx == i] + for i in list(set(self.domains))] + local_avai_pids_save = copy.deepcopy(local_avai_pids) + + revive_idx = [False for i in 
range(self.num_domains)] + final_idxs = [] + while len(avai_pids) >= self.num_pids_per_batch and not all( + revive_idx): + for i in range(self.num_domains): + selected_pids = np.random.choice( + local_avai_pids[i], self.num_pids_per_batch, replace=False) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: + avai_pids.remove(pid) + local_avai_pids[i].remove(pid) + for i in range(self.num_domains): + if len(local_avai_pids[i]) < self.num_pids_per_batch: + batch_idxs_dict_new = _get_batch_idxs( + self.pids, self.pid_index, self.num_instances) + + revive_idx[i] = True + cnt = 0 + for pid, val in batch_idxs_dict_new.items(): + if self.domains[cnt] == i: + batch_idxs_dict[pid] = copy.deepcopy( + batch_idxs_dict_new[pid]) + cnt += 1 + local_avai_pids[i] = copy.deepcopy(local_avai_pids_save[i]) + avai_pids.extend(local_avai_pids_save[i]) + avai_pids = list(set(avai_pids)) + return final_idxs + + def __iter__(self): + yield from itertools.islice(self._infinite_indices(), 0, None, 1) + + def _infinite_indices(self): + while True: + indices = self._get_epoch_indices() + yield from indices + + +class DomainShuffleBatchSampler(BatchSampler): + def __init__(self, dataset, batch_size, num_instances, camera_to_domain, + drop_last): + sampler = DomainShuffleSampler( + dataset=dataset, + batch_size=batch_size, + num_instances=num_instances, + camera_to_domain=camera_to_domain) + super().__init__( + sampler=sampler, batch_size=batch_size, drop_last=drop_last) + + +class NaiveIdentitySampler(Sampler): + """ + Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size is N*K. + Args: + dataset(Dataset): Dataset for sampling + batch_size (int): Number of examples in a batch. + num_instances (int): Number of instances per identity in a batch. + + Code was heavily based on https://github.com/bismex/MetaBIN + reference: https://arxiv.org/abs/2011.14670v2 + """ + + def __init__(self, dataset, batch_size, num_instances): + self.dataset = dataset + self.batch_size = batch_size + self.num_instances = num_instances + self.num_pids_per_batch = batch_size // self.num_instances + + self.index_pid = defaultdict(list) + self.pid_cam = defaultdict(list) + self.pid_index = defaultdict(list) + + # data_source: [(img_path, pid, camera, domain), ...] 
(camera_to_domain = True) + data_source = zip(dataset.images, dataset.labels, dataset.cameras, + dataset.cameras) + for index, info in enumerate(data_source): + pid = info[1] + camid = info[2] + self.index_pid[index] = pid + self.pid_cam[pid].append(camid) + self.pid_index[pid].append(index) + + self.pids = list(self.pid_index.keys()) + self.num_identities = len(self.pids) + + val_pid_index = [len(x) for x in self.pid_index.values()] + + val_pid_index_upper = [] + for x in val_pid_index: + v_remain = x % self.num_instances + if v_remain == 0: + val_pid_index_upper.append(x) + else: + val_pid_index_upper.append(x - v_remain + self.num_instances) + + total_images = sum(val_pid_index_upper) + total_images = total_images - (total_images % self.batch_size + ) - self.batch_size # approax + self.total_images = total_images + + def _get_epoch_indices(self): + batch_idxs_dict = defaultdict(list) + + for pid in self.pids: + idxs = copy.deepcopy( + self.pid_index[pid]) # whole index for each ID + if len( + idxs + ) < self.num_instances: # if idxs is smaller than num_instance, choice redundantly + idxs = np.random.choice( + idxs, size=self.num_instances, replace=True) + elif (len(idxs) % self.num_instances) != 0: + idxs.extend( + np.random.choice( + idxs, + size=self.num_instances - len(idxs) % + self.num_instances, + replace=False)) + + np.random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(int(idx)) + if len(batch_idxs) == self.num_instances: + batch_idxs_dict[pid].append(batch_idxs) + batch_idxs = [] + # batch_idxs_dict: dictionary, len(batch_idxs_dict) is len(pidx), each pidx, num_instance x k samples + avai_pids = copy.deepcopy(self.pids) + final_idxs = [] + + while len(avai_pids) >= self.num_pids_per_batch: + selected_pids = np.random.choice( + avai_pids, self.num_pids_per_batch, replace=False) + for pid in selected_pids: + batch_idxs = batch_idxs_dict[pid].pop(0) + final_idxs.extend(batch_idxs) + if len(batch_idxs_dict[pid]) == 0: avai_pids.remove(pid) + return final_idxs + + def __iter__(self): + yield from itertools.islice(self._infinite_indices(), 0, None, 1) + + def _infinite_indices(self): + while True: + indices = self._get_epoch_indices() + yield from indices + + def __len__(self): + return self.total_images + + +class NaiveIdentityBatchSampler(BatchSampler): + def __init__(self, dataset, batch_size, num_instances, drop_last): + sampler = NaiveIdentitySampler( + dataset=dataset, + batch_size=batch_size, + num_instances=num_instances) + super().__init__( + sampler=sampler, batch_size=batch_size, drop_last=drop_last) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_dataset.py new file mode 100644 index 000000000..cbf4b4028 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_dataset.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
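Before MixDataset below, a short sketch that makes the P*K arithmetic of NaiveIdentityBatchSampler above concrete; reid_train_dataset is a placeholder for any dataset exposing images/labels/cameras lists (such as Market1501 added later in this patch), and the numbers are illustrative.

from paddle.io import DataLoader

# batch_size=64 with num_instances=4 -> 64 // 4 = 16 identities per batch,
# with 4 samples drawn for each identity.
batch_sampler = NaiveIdentityBatchSampler(
    dataset=reid_train_dataset,  # placeholder: needs .images/.labels/.cameras
    batch_size=64,
    num_instances=4,
    drop_last=True)

train_loader = DataLoader(
    reid_train_dataset,
    batch_sampler=batch_sampler,
    num_workers=4)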
+ +from __future__ import print_function + +import numpy as np +import os + +from paddle.io import Dataset +from .. import dataloader + + +class MixDataset(Dataset): + def __init__(self, datasets_config): + super().__init__() + self.dataset_list = [] + start_idx = 0 + end_idx = 0 + for config_i in datasets_config: + dataset_name = config_i.pop('name') + dataset = getattr(dataloader, dataset_name)(**config_i) + end_idx += len(dataset) + self.dataset_list.append([end_idx, start_idx, dataset]) + start_idx = end_idx + + self.length = end_idx + + def __getitem__(self, idx): + for dataset_i in self.dataset_list: + if dataset_i[0] > idx: + dataset_i_idx = idx - dataset_i[1] + return dataset_i[2][dataset_i_idx] + + def __len__(self): + return self.length + + def get_dataset_list(self): + return self.dataset_list diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_sampler.py new file mode 100644 index 000000000..2df3109ce --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/mix_sampler.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division + +from paddle.io import DistributedBatchSampler, Sampler + +from ppcls.utils import logger +from ppcls.data.dataloader.mix_dataset import MixDataset +from ppcls.data import dataloader + + +class MixSampler(DistributedBatchSampler): + def __init__(self, dataset, batch_size, sample_configs, iter_per_epoch): + super().__init__(dataset, batch_size) + assert isinstance(dataset, + MixDataset), "MixSampler only support MixDataset" + self.sampler_list = [] + self.batch_size = batch_size + self.start_list = [] + self.length = iter_per_epoch + dataset_list = dataset.get_dataset_list() + batch_size_left = self.batch_size + self.iter_list = [] + for i, config_i in enumerate(sample_configs): + self.start_list.append(dataset_list[i][1]) + sample_method = config_i.pop("name") + ratio_i = config_i.pop("ratio") + if i < len(sample_configs) - 1: + batch_size_i = int(self.batch_size * ratio_i) + batch_size_left -= batch_size_i + else: + batch_size_i = batch_size_left + assert batch_size_i <= len(dataset_list[i][2]) + config_i["batch_size"] = batch_size_i + if sample_method == "DistributedBatchSampler": + sampler_i = DistributedBatchSampler(dataset_list[i][2], + **config_i) + else: + sampler_i = getattr(dataloader, sample_method)( + dataset_list[i][2], **config_i) + self.sampler_list.append(sampler_i) + self.iter_list.append(iter(sampler_i)) + self.length += len(dataset_list[i][2]) * ratio_i + self.iter_counter = 0 + + def __iter__(self): + while self.iter_counter < self.length: + batch = [] + for i, iter_i in enumerate(self.iter_list): + batch_i = next(iter_i, None) + if batch_i is None: + iter_i = iter(self.sampler_list[i]) + self.iter_list[i] = iter_i + batch_i = next(iter_i, None) + 
assert batch_i is not None, "dataset {} return None".format( + i) + batch += [idx + self.start_list[i] for idx in batch_i] + if len(batch) == self.batch_size: + self.iter_counter += 1 + yield batch + else: + logger.info("Some dataset reaches end") + self.iter_counter = 0 + + def __len__(self): + return self.length diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_dataset.py new file mode 100644 index 000000000..27e84ceb6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_dataset.py @@ -0,0 +1,118 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os + +from paddle.io import Dataset +from paddle.vision import transforms +import cv2 +import warnings + +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.data.preprocess.ops.operators import DecodeImage +from ppcls.utils import logger +from ppcls.data.dataloader.common_dataset import create_operators + + +class MultiScaleDataset(Dataset): + """MultiScaleDataset + + Args: + image_root (str): image root + cls_label_path (str): path to annotation file `train_list.txt` or `val_list.txt` + transform_ops (list, optional): list of transform op(s). Defaults to None. + delimiter (str, optional): delimiter. Defaults to None. 
+ """ + + def __init__( + self, + image_root, + cls_label_path, + transform_ops=None, + delimiter=None, ): + self._img_root = image_root + self._cls_path = cls_label_path + self.transform_ops = transform_ops + self.delimiter = delimiter if delimiter is not None else " " + self.images = [] + self.labels = [] + self._load_anno() + self.has_crop_flag = 1 + + def _load_anno(self, seed=None): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.images = [] + self.labels = [] + + with open(self._cls_path) as fd: + lines = fd.readlines() + if seed is not None: + np.random.RandomState(seed).shuffle(lines) + for l in lines: + l = l.strip().split(self.delimiter) + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(np.int64(l[1])) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, properties): + # properites is a tuple, contains (width, height, index) + img_width = properties[0] + img_height = properties[1] + index = properties[2] + has_crop = False + if self.transform_ops: + for i in range(len(self.transform_ops)): + op = self.transform_ops[i] + resize_op = ['RandCropImage', 'ResizeImage', 'CropImage'] + for resize in resize_op: + if resize in op: + if self.has_crop_flag: + logger.warning( + "Multi scale dataset will crop image according to the multi scale resolution" + ) + self.transform_ops[i][resize] = { + 'size': (img_width, img_height) + } + has_crop = True + self.has_crop_flag = 0 + if has_crop == False: + logger.error("Multi scale dateset requests RandCropImage") + raise RuntimeError("Multi scale dateset requests RandCropImage") + self._transform_ops = create_operators(self.transform_ops) + + try: + with open(self.images[index], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[index]) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[index], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_sampler.py new file mode 100644 index 000000000..9208ed828 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multi_scale_sampler.py @@ -0,0 +1,133 @@ +from paddle.io import Sampler +import paddle.distributed as dist + +import math +import random +import numpy as np + +from ppcls import data + + +class MultiScaleSampler(Sampler): + def __init__(self, + data_source, + scales, + first_bs, + divided_factor=32, + is_training=True, + shuffle=True, + seed=None): + """ + multi scale samper + Args: + data_source(dataset) + scales(list): several scales for image resolution + first_bs(int): batch size for the first scale in scales + divided_factor(int): ImageNet models down-sample images by a factor, ensure that width and height dimensions are multiples are multiple of devided_factor. + is_training(boolean): mode + """ + # min. and max. 
spatial dimensions + self.data_source = data_source + self.n_data_samples = len(self.data_source) + + if isinstance(scales[0], tuple): + width_dims = [i[0] for i in scales] + height_dims = [i[1] for i in scales] + elif isinstance(scales[0], int): + width_dims = scales + height_dims = scales + base_im_w = width_dims[0] + base_im_h = height_dims[0] + base_batch_size = first_bs + + # Get the GPU and node related information + num_replicas = dist.get_world_size() + rank = dist.get_rank() + # adjust the total samples to avoid batch dropping + num_samples_per_replica = int( + math.ceil(self.n_data_samples * 1.0 / num_replicas)) + img_indices = [idx for idx in range(self.n_data_samples)] + + self.shuffle = shuffle + if is_training: + # compute the spatial dimensions and corresponding batch size + # ImageNet models down-sample images by a factor of 32. + # Ensure that width and height dimensions are multiples are multiple of 32. + width_dims = [ + int((w // divided_factor) * divided_factor) for w in width_dims + ] + height_dims = [ + int((h // divided_factor) * divided_factor) + for h in height_dims + ] + + img_batch_pairs = list() + base_elements = base_im_w * base_im_h * base_batch_size + for (h, w) in zip(height_dims, width_dims): + batch_size = int(max(1, (base_elements / (h * w)))) + img_batch_pairs.append((w, h, batch_size)) + self.img_batch_pairs = img_batch_pairs + else: + self.img_batch_pairs = [(base_im_w, base_im_h, base_batch_size)] + + self.img_indices = img_indices + self.n_samples_per_replica = num_samples_per_replica + self.epoch = 0 + self.rank = rank + self.num_replicas = num_replicas + self.seed = seed + self.batch_list = [] + self.current = 0 + indices_rank_i = self.img_indices[self.rank:len(self.img_indices): + self.num_replicas] + while self.current < self.n_samples_per_replica: + curr_w, curr_h, curr_bsz = random.choice(self.img_batch_pairs) + + end_index = min(self.current + curr_bsz, + self.n_samples_per_replica) + + batch_ids = indices_rank_i[self.current:end_index] + n_batch_samples = len(batch_ids) + if n_batch_samples != curr_bsz: + batch_ids += indices_rank_i[:(curr_bsz - n_batch_samples)] + self.current += curr_bsz + + if len(batch_ids) > 0: + batch = [curr_w, curr_h, len(batch_ids)] + self.batch_list.append(batch) + self.length = len(self.batch_list) + + def __iter__(self): + if self.shuffle: + if self.seed is not None: + random.seed(self.seed) + else: + random.seed(self.epoch) + self.epoch += 1 + random.shuffle(self.img_indices) + random.shuffle(self.img_batch_pairs) + indices_rank_i = self.img_indices[self.rank:len(self.img_indices): + self.num_replicas] + else: + indices_rank_i = self.img_indices[self.rank:len(self.img_indices): + self.num_replicas] + + start_index = 0 + for batch_tuple in self.batch_list: + curr_w, curr_h, curr_bsz = batch_tuple + end_index = min(start_index + curr_bsz, self.n_samples_per_replica) + batch_ids = indices_rank_i[start_index:end_index] + n_batch_samples = len(batch_ids) + if n_batch_samples != curr_bsz: + batch_ids += indices_rank_i[:(curr_bsz - n_batch_samples)] + start_index += curr_bsz + + if len(batch_ids) > 0: + batch = [(curr_w, curr_h, b_id) for b_id in batch_ids] + yield batch + + def set_epoch(self, epoch: int): + self.epoch = epoch + + def __len__(self): + return self.length diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multilabel_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multilabel_dataset.py new file mode 100644 index 000000000..c67a5ae78 --- /dev/null 
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/multilabel_dataset.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import os +import cv2 + +from ppcls.data.preprocess import transform +from ppcls.utils import logger + +from .common_dataset import CommonDataset + + +class MultiLabelDataset(CommonDataset): + def _load_anno(self, label_ratio=False): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + self.label_ratio = label_ratio + self.images = [] + self.labels = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split("\t") + self.images.append(os.path.join(self._img_root, l[0])) + + labels = l[1].split(',') + labels = [np.int64(i) for i in labels] + + self.labels.append(labels) + assert os.path.exists(self.images[-1]) + if self.label_ratio is not False: + return np.array(self.labels).mean(0).astype("float32") + + def __getitem__(self, idx): + try: + with open(self.images[idx], 'rb') as f: + img = f.read() + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + label = np.array(self.labels[idx]).astype("float32") + if self.label_ratio is not False: + return (img, np.array([label, self.label_ratio])) + else: + return (img, label) + + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/person_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/person_dataset.py new file mode 100644 index 000000000..eac7bc782 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/person_dataset.py @@ -0,0 +1,269 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
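Before the ReID datasets below, a small worked example of the per-scale batch sizes computed in MultiScaleSampler above; the scales and first_bs values are hypothetical. The rule keeps the pixel count per batch roughly constant across resolutions.

# Assuming scales=[224, 192, 256] and first_bs=64 (all multiples of the
# default divided_factor=32, so the dimensions are left unchanged):
base_im_w = base_im_h = 224
base_batch_size = 64
base_elements = base_im_w * base_im_h * base_batch_size  # 3,211,264

for side in (224, 192, 256):
    per_scale_bs = int(max(1, base_elements / (side * side)))
    print(side, per_scale_bs)
# -> 224 64
#    192 87
#    256 49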
+ +from __future__ import print_function + +import numpy as np +import paddle +from paddle.io import Dataset +import os +import cv2 + +from ppcls.data import preprocess +from ppcls.data.preprocess import transform +from ppcls.utils import logger +from .common_dataset import create_operators +import os.path as osp +import glob +import re +from PIL import Image + + +class Market1501(Dataset): + """ + Market1501 + Reference: + Zheng et al. Scalable Person Re-identification: A Benchmark. ICCV 2015. + URL: http://www.liangzheng.org/Project/project_reid.html + + Dataset statistics: + # identities: 1501 (+1 for background) + # images: 12936 (train) + 3368 (query) + 15913 (gallery) + """ + _dataset_dir = 'market1501/Market-1501-v15.09.15' + + def __init__(self, + image_root, + cls_label_path, + transform_ops=None, + backend="cv2"): + self._img_root = image_root + self._cls_path = cls_label_path # the sub folder in the dataset + self._dataset_dir = osp.join(image_root, self._dataset_dir, + self._cls_path) + self._check_before_run() + if transform_ops: + self._transform_ops = create_operators(transform_ops) + self.backend = backend + self._dtype = paddle.get_default_dtype() + self._load_anno(relabel=True if 'train' in self._cls_path else False) + + def _check_before_run(self): + """Check if the file is available before going deeper""" + if not osp.exists(self._dataset_dir): + raise RuntimeError("'{}' is not available".format( + self._dataset_dir)) + + def _load_anno(self, relabel=False): + img_paths = glob.glob(osp.join(self._dataset_dir, '*.jpg')) + pattern = re.compile(r'([-\d]+)_c(\d)') + + self.images = [] + self.labels = [] + self.cameras = [] + pid_container = set() + + for img_path in sorted(img_paths): + pid, _ = map(int, pattern.search(img_path).groups()) + if pid == -1: continue # junk images are just ignored + pid_container.add(pid) + pid2label = {pid: label for label, pid in enumerate(pid_container)} + + for img_path in sorted(img_paths): + pid, camid = map(int, pattern.search(img_path).groups()) + if pid == -1: continue # junk images are just ignored + assert 0 <= pid <= 1501 # pid == 0 means background + assert 1 <= camid <= 6 + camid -= 1 # index starts from 0 + if relabel: pid = pid2label[pid] + self.images.append(img_path) + self.labels.append(pid) + self.cameras.append(camid) + + self.num_pids, self.num_imgs, self.num_cams = get_imagedata_info( + self.images, self.labels, self.cameras, subfolder=self._cls_path) + + def __getitem__(self, idx): + try: + img = Image.open(self.images[idx]).convert('RGB') + if self.backend == "cv2": + img = np.array(img, dtype="float32").astype(np.uint8) + if self._transform_ops: + img = transform(img, self._transform_ops) + if self.backend == "cv2": + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx], self.cameras[idx]) + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) + + +class MSMT17(Dataset): + """ + MSMT17 + + Reference: + Wei et al. Person Transfer GAN to Bridge Domain Gap for Person Re-Identification. CVPR 2018. 
+ + URL: http://www.pkuvmc.com/publications/msmt17.html + + Dataset statistics: + # identities: 4101 + # images: 32621 (train) + 11659 (query) + 82161 (gallery) + # cameras: 15 + """ + _dataset_dir = 'msmt17/MSMT17_V1' + + def __init__(self, image_root, cls_label_path, transform_ops=None): + self._img_root = image_root + self._cls_path = cls_label_path # the sub folder in the dataset + self._dataset_dir = osp.join(image_root, self._dataset_dir, + self._cls_path) + self._check_before_run() + if transform_ops: + self._transform_ops = create_operators(transform_ops) + self._dtype = paddle.get_default_dtype() + self._load_anno(relabel=True if 'train' in self._cls_path else False) + + def _check_before_run(self): + """Check if the file is available before going deeper""" + if not osp.exists(self._dataset_dir): + raise RuntimeError("'{}' is not available".format( + self._dataset_dir)) + + def _load_anno(self, relabel=False): + img_paths = glob.glob(osp.join(self._dataset_dir, '*.jpg')) + pattern = re.compile(r'([-\d]+)_c(\d+)') + + self.images = [] + self.labels = [] + self.cameras = [] + pid_container = set() + + for img_path in img_paths: + pid, _ = map(int, pattern.search(img_path).groups()) + if pid == -1: + continue # junk images are just ignored + pid_container.add(pid) + pid2label = {pid: label for label, pid in enumerate(pid_container)} + + for img_path in img_paths: + pid, camid = map(int, pattern.search(img_path).groups()) + if pid == -1: + continue # junk images are just ignored + assert 1 <= camid <= 15 + camid -= 1 # index starts from 0 + if relabel: + pid = pid2label[pid] + self.images.append(img_path) + self.labels.append(pid) + self.cameras.append(camid) + + self.num_pids, self.num_imgs, self.num_cams = get_imagedata_info( + self.images, self.labels, self.cameras, subfolder=self._cls_path) + + def __getitem__(self, idx): + try: + img = Image.open(self.images[idx]).convert('RGB') + img = np.array(img, dtype="float32").astype(np.uint8) + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx], self.cameras[idx]) + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) + + +class DukeMTMC(Market1501): + """ + DukeMTMC-reID. + + Reference: + Ristani et al. Performance Measures and a Data Set for Multi-Target, Multi-Camera Tracking. ECCVW 2016. + Zheng et al. Unlabeled Samples Generated by GAN Improve the Person Re-identification Baseline in vitro. ICCV 2017. 
+ + URL: https://github.com/layumi/DukeMTMC-reID_evaluation + + Dataset statistics: + # identities: 1404 (train + query) + # images: 16522 (train) + 2228 (query) + 17661 (gallery) + # cameras: 8 + """ + _dataset_dir = 'dukemtmc/DukeMTMC-reID' + + def _load_anno(self, relabel=False): + img_paths = glob.glob(osp.join(self._dataset_dir, '*.jpg')) + pattern = re.compile(r'([-\d]+)_c(\d+)') + + self.images = [] + self.labels = [] + self.cameras = [] + pid_container = set() + + for img_path in img_paths: + pid, _ = map(int, pattern.search(img_path).groups()) + pid_container.add(pid) + pid2label = {pid: label for label, pid in enumerate(pid_container)} + + for img_path in img_paths: + pid, camid = map(int, pattern.search(img_path).groups()) + assert 1 <= camid <= 8 + camid -= 1 # index starts from 0 + if relabel: + pid = pid2label[pid] + self.images.append(img_path) + self.labels.append(pid) + self.cameras.append(camid) + + self.num_pids, self.num_imgs, self.num_cams = get_imagedata_info( + self.images, self.labels, self.cameras, subfolder=self._cls_path) + + +def get_imagedata_info(data, labels, cameras, subfolder='train'): + pids, cams = [], [] + for _, pid, camid in zip(data, labels, cameras): + pids += [pid] + cams += [camid] + pids = set(pids) + cams = set(cams) + num_pids = len(pids) + num_cams = len(cams) + num_imgs = len(data) + print("Dataset statistics:") + print(" ----------------------------------------") + print(" subset | # ids | # images | # cameras") + print(" ----------------------------------------") + print(" {} | {:5d} | {:8d} | {:9d}".format(subfolder, num_pids, + num_imgs, num_cams)) + print(" ----------------------------------------") + return num_pids, num_imgs, num_cams diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/pk_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/pk_sampler.py new file mode 100644 index 000000000..11d1ac8e6 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/pk_sampler.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division + +from collections import defaultdict + +import numpy as np +import paddle.distributed as dist +from paddle.io import DistributedBatchSampler + +from ppcls.utils import logger + + +class PKSampler(DistributedBatchSampler): + """First, randomly sample P identities. + Then for each identity randomly sample K instances. + Therefore batch size equals to P * K, and the sampler called PKSampler. + + Args: + dataset (Dataset): Dataset which contains list of (img_path, pid, camid)) + batch_size (int): batch size + sample_per_id (int): number of instance(s) within an class + shuffle (bool, optional): _description_. Defaults to True. + id_list(list): list of (start_id, end_id, start_id, end_id) for set of ids to duplicated. + ratio(list): list of (ratio1, ratio2..) 
the duplication number for ids in id_list. + drop_last (bool, optional): whether to discard the data at the end. Defaults to True. + sample_method (str, optional): sample method when generating prob_list. Defaults to "sample_avg_prob". + total_epochs (int, optional): total epochs. Defaults to 0. + """ + + def __init__(self, + dataset, + batch_size, + sample_per_id, + shuffle=True, + drop_last=True, + id_list=None, + ratio=None, + sample_method="sample_avg_prob", + total_epochs=0): + super().__init__( + dataset, batch_size, shuffle=shuffle, drop_last=drop_last) + assert batch_size % sample_per_id == 0, \ + f"PKSampler configs error, sample_per_id({sample_per_id}) must be a divisor of batch_size({batch_size})." + assert hasattr(self.dataset, + "labels"), "Dataset must have labels attribute." + self.sample_per_id = sample_per_id + self.label_dict = defaultdict(list) + self.sample_method = sample_method + self.total_epochs = total_epochs + for idx, label in enumerate(self.dataset.labels): + self.label_dict[label].append(idx) + self.label_list = list(self.label_dict) + assert len(self.label_list) * self.sample_per_id >= self.batch_size, \ + f"batch size({self.batch_size}) should not be bigger than than #classes({len(self.label_list)})*sample_per_id({self.sample_per_id})" + if self.sample_method == "id_avg_prob": + self.prob_list = np.array([1 / len(self.label_list)] * + len(self.label_list)) + elif self.sample_method == "sample_avg_prob": + counter = [] + for label_i in self.label_list: + counter.append(len(self.label_dict[label_i])) + self.prob_list = np.array(counter) / sum(counter) + else: + logger.error( + "PKSampler only support id_avg_prob and sample_avg_prob sample method, " + "but receive {}.".format(self.sample_method)) + + if id_list and ratio: + assert len(id_list) % 2 == 0 and len(id_list) == len(ratio) * 2 + for i in range(len(self.prob_list)): + for j in range(len(ratio)): + if i >= id_list[j * 2] and i <= id_list[j * 2 + 1]: + self.prob_list[i] = self.prob_list[i] * ratio[j] + break + self.prob_list = self.prob_list / sum(self.prob_list) + + diff = np.abs(sum(self.prob_list) - 1) + if diff > 0.00000001: + self.prob_list[-1] = 1 - sum(self.prob_list[:-1]) + if self.prob_list[-1] > 1 or self.prob_list[-1] < 0: + logger.error("PKSampler prob list error") + else: + logger.info( + "PKSampler: sum of prob list not equal to 1, diff is {}, change the last prob". 
+ format(diff)) + + def __iter__(self): + # shuffle manually, same as DistributedBatchSampler.__iter__ + if self.shuffle: + rank = dist.get_rank() + np.random.RandomState(rank * self.total_epochs + + self.epoch).shuffle(self.label_list) + np.random.RandomState(rank * self.total_epochs + + self.epoch).shuffle(self.prob_list) + self.epoch += 1 + + label_per_batch = self.batch_size // self.sample_per_id + for _ in range(len(self)): + batch_index = [] + batch_label_list = np.random.choice( + self.label_list, + size=label_per_batch, + replace=False, + p=self.prob_list) + for label_i in batch_label_list: + label_i_indexes = self.label_dict[label_i] + if self.sample_per_id <= len(label_i_indexes): + batch_index.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_id, + replace=False)) + else: + batch_index.extend( + np.random.choice( + label_i_indexes, + size=self.sample_per_id, + replace=True)) + if not self.drop_last or len(batch_index) == self.batch_size: + yield batch_index diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/ra_sampler.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/ra_sampler.py new file mode 100644 index 000000000..cfba5492d --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/ra_sampler.py @@ -0,0 +1,57 @@ +import math +import numpy as np + +from paddle.io import DistributedBatchSampler + + +class RASampler(DistributedBatchSampler): + """ + based on https://github.com/facebookresearch/deit/blob/main/samplers.py + """ + + def __init__(self, + dataset, + batch_size, + num_replicas=None, + rank=None, + shuffle=False, + drop_last=False, + num_repeats: int=3): + super().__init__(dataset, batch_size, num_replicas, rank, shuffle, + drop_last) + self.num_repeats = num_repeats + self.num_samples = int( + math.ceil(len(self.dataset) * num_repeats / self.nranks)) + self.total_size = self.num_samples * self.nranks + self.num_selected_samples = int( + math.floor(len(self.dataset) // 256 * 256 / self.nranks)) + + def __iter__(self): + num_samples = len(self.dataset) + indices = np.arange(num_samples).tolist() + if self.shuffle: + np.random.RandomState(self.epoch).shuffle(indices) + self.epoch += 1 + + indices = [ele for ele in indices for i in range(self.num_repeats)] + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.local_rank:self.total_size:self.nranks] + assert len(indices) == self.num_samples + _sample_iter = iter(indices[:self.num_selected_samples]) + + batch_indices = [] + for idx in _sample_iter: + batch_indices.append(idx) + if len(batch_indices) == self.batch_size: + yield batch_indices + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + num_samples = self.num_selected_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/vehicle_dataset.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/vehicle_dataset.py new file mode 100644 index 000000000..b2dba4d49 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/dataloader/vehicle_dataset.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
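PKSampler above builds each batch from P = batch_size // sample_per_id identities and K = sample_per_id images per identity, drawing identities according to prob_list and falling back to sampling with replacement when an identity has fewer than K images. A standalone numpy sketch of that P×K selection with invented labels (no paddle or ppcls imports; "sample_avg_prob" weighting assumed):

from collections import defaultdict
import numpy as np

labels = [0, 0, 0, 1, 1, 2, 2, 2, 2]   # invented dataset labels
batch_size, sample_per_id = 4, 2       # P = 2 identities per batch, K = 2 images each

label_dict = defaultdict(list)
for idx, label in enumerate(labels):
    label_dict[label].append(idx)
label_list = list(label_dict)

# "sample_avg_prob": an identity's probability is proportional to its image count
counts = np.array([len(label_dict[l]) for l in label_list], dtype="float64")
prob_list = counts / counts.sum()

label_per_batch = batch_size // sample_per_id
batch_labels = np.random.choice(label_list, size=label_per_batch,
                                replace=False, p=prob_list)
batch_index = []
for l in batch_labels:
    idxs = label_dict[l]
    batch_index.extend(np.random.choice(idxs, size=sample_per_id,
                                        replace=len(idxs) < sample_per_id))
print(batch_index)  # e.g. [5, 8, 3, 4]: two images for each of two sampled identities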
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import paddle +from paddle.io import Dataset +import os +import cv2 +from PIL import Image +from ppcls.data.preprocess import transform +from ppcls.utils import logger +from .common_dataset import create_operators + + +class CompCars(Dataset): + def __init__(self, + image_root, + cls_label_path, + label_root=None, + transform_ops=None, + bbox_crop=False): + self._img_root = image_root + self._cls_path = cls_label_path + self._label_root = label_root + if transform_ops: + self._transform_ops = create_operators(transform_ops) + self._bbox_crop = bbox_crop + self._dtype = paddle.get_default_dtype() + self._load_anno() + + def _load_anno(self): + assert os.path.exists(self._cls_path) + assert os.path.exists(self._img_root) + if self._bbox_crop: + assert os.path.exists(self._label_root) + self.images = [] + self.labels = [] + self.bboxes = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + for l in lines: + l = l.strip().split() + if not self._bbox_crop: + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(int(l[1])) + else: + label_path = os.path.join(self._label_root, + l[0].split('.')[0] + '.txt') + assert os.path.exists(label_path) + with open(label_path) as f: + bbox = f.readlines()[-1].strip().split() + bbox = [int(x) for x in bbox] + self.images.append(os.path.join(self._img_root, l[0])) + self.labels.append(int(l[1])) + self.bboxes.append(bbox) + assert os.path.exists(self.images[-1]) + + def __getitem__(self, idx): + img = cv2.imread(self.images[idx]) + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if self._bbox_crop: + bbox = self.bboxes[idx] + img = img[bbox[1]:bbox[3], bbox[0]:bbox[2], :] + if self._transform_ops: + img = transform(img, self._transform_ops) + img = img.transpose((2, 0, 1)) + return (img, self.labels[idx]) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) + + +class VeriWild(Dataset): + """Dataset for Vehicle and other similar data structure, such as VeRI-Wild, SOP, Inshop... + Args: + image_root (str): image root + cls_label_path (str): path to annotation file + transform_ops (List[Callable], optional): list of transform op(s). Defaults to None. + backend (str, optional): pil or cv2. Defaults to "cv2". + relabel (bool, optional): whether do relabel when original label do not starts from 0 or are discontinuous. Defaults to False. + """ + + def __init__(self, + image_root, + cls_label_path, + transform_ops=None, + backend="cv2", + relabel=False): + self._img_root = image_root + self._cls_path = cls_label_path + if transform_ops: + self._transform_ops = create_operators(transform_ops) + self.backend = backend + self._dtype = paddle.get_default_dtype() + self._load_anno(relabel) + + def _load_anno(self, relabel): + assert os.path.exists( + self._cls_path), f"path {self._cls_path} does not exist." 
+ assert os.path.exists( + self._img_root), f"path {self._img_root} does not exist." + self.images = [] + self.labels = [] + self.cameras = [] + with open(self._cls_path) as fd: + lines = fd.readlines() + if relabel: + label_set = set() + for line in lines: + line = line.strip().split() + label_set.add(np.int64(line[1])) + label_map = { + oldlabel: newlabel + for newlabel, oldlabel in enumerate(label_set) + } + for line in lines: + line = line.strip().split() + self.images.append(os.path.join(self._img_root, line[0])) + if relabel: + self.labels.append(label_map[np.int64(line[1])]) + else: + self.labels.append(np.int64(line[1])) + if len(line) >= 3: + self.cameras.append(np.int64(line[2])) + assert os.path.exists(self.images[-1]), \ + f"path {self.images[-1]} does not exist." + + self.has_camera = len(self.cameras) > 0 + + def __getitem__(self, idx): + try: + if self.backend == "cv2": + with open(self.images[idx], 'rb') as f: + img = f.read() + else: + img = Image.open(self.images[idx]).convert("RGB") + if self._transform_ops: + img = transform(img, self._transform_ops) + if self.backend == "cv2": + img = img.transpose((2, 0, 1)) + if self.has_camera: + return (img, self.labels[idx], self.cameras[idx]) + else: + return (img, self.labels[idx]) + except Exception as ex: + logger.error("Exception occured when parse line: {} with msg: {}". + format(self.images[idx], ex)) + rnd_idx = np.random.randint(self.__len__()) + return self.__getitem__(rnd_idx) + + def __len__(self): + return len(self.images) + + @property + def class_num(self): + return len(set(self.labels)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/__init__.py new file mode 100644 index 000000000..202f5be8b --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/__init__.py @@ -0,0 +1,44 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import importlib + +from . 
import topk, threshoutput + +from .topk import Topk +from .threshoutput import ThreshOutput, MultiLabelThreshOutput +from .attr_rec import VehicleAttribute, PersonAttribute, TableAttribute +from .scoreoutput import ScoreOutput + + +def build_postprocess(config): + config = copy.deepcopy(config) + model_name = config.pop("name") + mod = importlib.import_module(__name__) + postprocess_func = getattr(mod, model_name)(**config) + return postprocess_func + + +class DistillationPostProcess(object): + def __init__(self, model_name="Student", key=None, func="Topk", **kargs): + super().__init__() + self.func = eval(func)(**kargs) + self.model_name = model_name + self.key = key + + def __call__(self, x, file_names=None): + x = x[self.model_name] + if self.key is not None: + x = x[self.key] + return self.func(x, file_names=file_names) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/attr_rec.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/attr_rec.py new file mode 100644 index 000000000..ff6dcee16 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/attr_rec.py @@ -0,0 +1,284 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
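build_postprocess above resolves the postprocessor class by the "name" key of its config and passes the remaining keys as constructor arguments. A hedged sketch of that dispatch pattern with a stand-in registry and a dummy Topk (nothing here imports ppcls; it only mirrors the pop-name-then-instantiate flow):

import copy

class Topk:                                              # stand-in for the class normally
    def __init__(self, topk=1, class_id_map_file=None):  # resolved via getattr(mod, name)
        self.topk = topk

def build(config, registry):
    config = copy.deepcopy(config)
    name = config.pop("name")            # e.g. "Topk"
    return registry[name](**config)      # remaining keys become constructor kwargs

post = build({"name": "Topk", "topk": 5}, {"Topk": Topk})
print(post.topk)  # 5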
+ +import os +import numpy as np +import paddle +import paddle.nn.functional as F + + +class VehicleAttribute(object): + def __init__(self, color_threshold=0.5, type_threshold=0.5): + self.color_threshold = color_threshold + self.type_threshold = type_threshold + self.color_list = [ + "yellow", "orange", "green", "gray", "red", "blue", "white", + "golden", "brown", "black" + ] + self.type_list = [ + "sedan", "suv", "van", "hatchback", "mpv", "pickup", "bus", + "truck", "estate" + ] + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + # postprocess output of predictor + batch_res = [] + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + color_idx = np.argmax(res[:10]) + type_idx = np.argmax(res[10:]) + print(color_idx, type_idx) + if res[color_idx] >= self.color_threshold: + color_info = f"Color: ({self.color_list[color_idx]}, prob: {res[color_idx]})" + else: + color_info = "Color unknown" + + if res[type_idx + 10] >= self.type_threshold: + type_info = f"Type: ({self.type_list[type_idx]}, prob: {res[type_idx + 10]})" + else: + type_info = "Type unknown" + + label_res = f"{color_info}, {type_info}" + + threshold_list = [self.color_threshold + ] * 10 + [self.type_threshold] * 9 + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + batch_res.append({ + "attr": label_res, + "pred": pred_res, + "file_name": file_names[idx] + }) + return batch_res + + +class PersonAttribute(object): + def __init__(self, + threshold=0.5, + glasses_threshold=0.3, + hold_threshold=0.6): + self.threshold = threshold + self.glasses_threshold = glasses_threshold + self.hold_threshold = hold_threshold + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + # postprocess output of predictor + age_list = ['AgeLess18', 'Age18-60', 'AgeOver60'] + direct_list = ['Front', 'Side', 'Back'] + bag_list = ['HandBag', 'ShoulderBag', 'Backpack'] + upper_list = ['UpperStride', 'UpperLogo', 'UpperPlaid', 'UpperSplice'] + lower_list = [ + 'LowerStripe', 'LowerPattern', 'LongCoat', 'Trousers', 'Shorts', + 'Skirt&Dress' + ] + batch_res = [] + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + # gender + gender = 'Female' if res[22] > self.threshold else 'Male' + label_res.append(gender) + # age + age = age_list[np.argmax(res[19:22])] + label_res.append(age) + # direction + direction = direct_list[np.argmax(res[23:])] + label_res.append(direction) + # glasses + glasses = 'Glasses: ' + if res[1] > self.glasses_threshold: + glasses += 'True' + else: + glasses += 'False' + label_res.append(glasses) + # hat + hat = 'Hat: ' + if res[0] > self.threshold: + hat += 'True' + else: + hat += 'False' + label_res.append(hat) + # hold obj + hold_obj = 'HoldObjectsInFront: ' + if res[18] > self.hold_threshold: + hold_obj += 'True' + else: + hold_obj += 'False' + label_res.append(hold_obj) + # bag + bag = bag_list[np.argmax(res[15:18])] + bag_score = res[15 + np.argmax(res[15:18])] + bag_label = bag if bag_score > self.threshold else 'No bag' + label_res.append(bag_label) + # upper + upper_res = res[4:8] + upper_label = 'Upper:' + sleeve = 'LongSleeve' if res[3] > res[2] else 'ShortSleeve' + upper_label += ' {}'.format(sleeve) + for i, r in 
enumerate(upper_res): + if r > self.threshold: + upper_label += ' {}'.format(upper_list[i]) + label_res.append(upper_label) + # lower + lower_res = res[8:14] + lower_label = 'Lower: ' + has_lower = False + for i, l in enumerate(lower_res): + if l > self.threshold: + lower_label += ' {}'.format(lower_list[i]) + has_lower = True + if not has_lower: + lower_label += ' {}'.format(lower_list[np.argmax(lower_res)]) + + label_res.append(lower_label) + # shoe + shoe = 'Boots' if res[14] > self.threshold else 'No boots' + label_res.append(shoe) + + threshold_list = [0.5] * len(res) + threshold_list[1] = self.glasses_threshold + threshold_list[18] = self.hold_threshold + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + + batch_res.append({"attributes": label_res, "output": pred_res}) + return batch_res + + +class FaceAttribute(object): + def __init__(self, threshold=0.65, convert_cn=False): + self.threshold = threshold + self.convert_cn = convert_cn + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + attribute_list = [ + ["CheekWhiskers", "刚长出的双颊胡须"], ["ArchedEyebrows", "柳叶眉"], + ["Attractive", "吸引人的"], ["BagsUnderEyes", "眼袋"], ["Bald", "秃头"], + ["Bangs", "刘海"], ["BigLips", "大嘴唇"], ["BigNose", "大鼻子"], + ["BlackHair", "黑发"], ["BlondHair", "金发"], ["Blurry", "模糊的"], + ["BrownHair", "棕发"], ["BushyEyebrows", "浓眉"], ["Chubby", "圆胖的"], + ["DoubleChin", "双下巴"], ["Eyeglasses", "带眼镜"], ["Goatee", "山羊胡子"], + ["GrayHair", "灰发或白发"], ["HeavyMakeup", "浓妆"], + ["HighCheekbones", "高颧骨"], ["Male", "男性"], + ["MouthSlightlyOpen", "微微张开嘴巴"], ["Mustache", "胡子"], + ["NarrowEyes", "细长的眼睛"], ["NoBeard", "无胡子"], + ["OvalFace", "椭圆形的脸"], ["PaleSkin", "苍白的皮肤"], + ["PointyNose", "尖鼻子"], ["RecedingHairline", "发际线后移"], + ["RosyCheeks", "红润的双颊"], ["Sideburns", "连鬓胡子"], ["Smiling", "微笑"], + ["StraightHair", "直发"], ["WavyHair", "卷发"], + ["WearingEarrings", "戴着耳环"], ["WearingHat", "戴着帽子"], + ["WearingLipstick", "涂了唇膏"], ["WearingNecklace", "戴着项链"], + ["WearingNecktie", "戴着领带"], ["Young", "年轻人"] + ] + gender_list = [["Male", "男性"], ["Female", "女性"]] + age_list = [["Young", "年轻人"], ["Old", "老年人"]] + batch_res = [] + index = 1 if self.convert_cn else 0 + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + threshold_list = [self.threshold] * len(res) + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + for i, value in enumerate(pred_res): + if i == 20: + label_res.append(gender_list[0][index] + if value == 1 else gender_list[1][index]) + elif i == 39: + label_res.append(age_list[0][index] + if value == 1 else age_list[1][index]) + else: + if value == 1: + label_res.append(attribute_list[i][index]) + batch_res.append({"attributes": label_res, "output": pred_res}) + return batch_res + + +class TableAttribute(object): + def __init__( + self, + source_threshold=0.5, + number_threshold=0.5, + color_threshold=0.5, + clarity_threshold=0.5, + obstruction_threshold=0.5, + angle_threshold=0.5, ): + self.source_threshold = source_threshold + self.number_threshold = number_threshold + self.color_threshold = color_threshold + self.clarity_threshold = clarity_threshold + self.obstruction_threshold = obstruction_threshold + self.angle_threshold = angle_threshold + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if 
file_names is not None: + assert x.shape[0] == len(file_names) + x = F.sigmoid(x).numpy() + + # postprocess output of predictor + batch_res = [] + for idx, res in enumerate(x): + res = res.tolist() + label_res = [] + source = 'Scanned' if res[0] > self.source_threshold else 'Photo' + number = 'Little' if res[1] > self.number_threshold else 'Numerous' + color = 'Black-and-White' if res[ + 2] > self.color_threshold else 'Multicolor' + clarity = 'Clear' if res[3] > self.clarity_threshold else 'Blurry' + obstruction = 'Without-Obstacles' if res[ + 4] > self.number_threshold else 'With-Obstacles' + angle = 'Horizontal' if res[ + 5] > self.number_threshold else 'Tilted' + + label_res = [source, number, color, clarity, obstruction, angle] + + threshold_list = [ + self.source_threshold, self.number_threshold, + self.color_threshold, self.clarity_threshold, + self.obstruction_threshold, self.angle_threshold + ] + pred_res = (np.array(res) > np.array(threshold_list) + ).astype(np.int8).tolist() + batch_res.append({ + "attributes": label_res, + "output": pred_res, + "file_name": file_names[idx] + }) + return batch_res diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/scoreoutput.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/scoreoutput.py new file mode 100644 index 000000000..d68dd54f7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/scoreoutput.py @@ -0,0 +1,18 @@ +import numpy +import numpy as np +import paddle + + +class ScoreOutput(object): + def __init__(self, decimal_places): + self.decimal_places = decimal_places + + def __call__(self, x, file_names=None): + y = [] + for idx, probs in enumerate(x): + score = np.around(x[idx].numpy(), self.decimal_places) + result = {"scores": score} + if file_names is not None: + result["file_name"] = file_names[idx] + y.append(result) + return y \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/threshoutput.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/threshoutput.py new file mode 100644 index 000000000..b329288d9 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/threshoutput.py @@ -0,0 +1,90 @@ +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
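VehicleAttribute, PersonAttribute, FaceAttribute and TableAttribute above all end with the same step: compare the sigmoid scores position-by-position against a threshold list and binarize. A numpy-only sketch of that step, with invented scores and thresholds:

import numpy as np

res = np.array([0.91, 0.12, 0.47, 0.88, 0.30, 0.76])        # sigmoid outputs (invented)
threshold_list = np.array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5])   # one threshold per attribute
pred = (res > threshold_list).astype(np.int8).tolist()
print(pred)  # [1, 0, 0, 1, 0, 1]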
+ +import os +import numpy as np +import paddle.nn.functional as F + + +class ThreshOutput(object): + def __init__(self, threshold, label_0="0", label_1="1"): + self.threshold = threshold + self.label_0 = label_0 + self.label_1 = label_1 + + def __call__(self, x, file_names=None): + y = [] + x = F.softmax(x, axis=-1).numpy() + for idx, probs in enumerate(x): + score = probs[1] + if score < self.threshold: + result = {"class_ids": [0], "scores": [1 - score], "label_names": [self.label_0]} + else: + result = {"class_ids": [1], "scores": [score], "label_names": [self.label_1]} + if file_names is not None: + result["file_name"] = file_names[idx] + y.append(result) + return y + + +class MultiLabelThreshOutput(object): + def __init__(self, threshold=0.5, class_id_map_file=None, delimiter=None): + self.threshold = threshold + self.delimiter = delimiter if delimiter is not None else " " + self.class_id_map = self.parse_class_id_map(class_id_map_file) + + def parse_class_id_map(self, class_id_map_file): + if class_id_map_file is None: + return None + if not os.path.exists(class_id_map_file): + print( + "Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!" + ) + return None + + try: + class_id_map = {} + with open(class_id_map_file, "r") as fin: + lines = fin.readlines() + for line in lines: + partition = line.split("\n")[0].partition(self.delimiter) + class_id_map[int(partition[0])] = str(partition[-1]) + except Exception as ex: + print(ex) + class_id_map = None + return class_id_map + + def __call__(self, x, file_names=None): + y = [] + x = F.sigmoid(x).numpy() + for idx, probs in enumerate(x): + index = np.where(probs >= self.threshold)[0].astype("int32") + clas_id_list = [] + score_list = [] + label_name_list = [] + for i in index: + clas_id_list.append(i.item()) + score_list.append(probs[i].item()) + if self.class_id_map is not None: + label_name_list.append(self.class_id_map[i.item()]) + result = { + "class_ids": clas_id_list, + "scores": np.around( + score_list, decimals=5).tolist(), + "label_names": label_name_list + } + if file_names is not None: + result["file_name"] = file_names[idx] + y.append(result) + return y diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/topk.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/topk.py new file mode 100644 index 000000000..50cc40d49 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/postprocess/topk.py @@ -0,0 +1,84 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
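threshoutput.py above implements two thresholding postprocessors: ThreshOutput keeps a single binary decision based on the softmax probability of class 1, while MultiLabelThreshOutput keeps every index whose sigmoid probability reaches the threshold. A numpy sketch of both decisions on invented scores:

import numpy as np

threshold = 0.5

# ThreshOutput: binary decision from the softmax prob of class 1
softmax_probs = np.array([0.35, 0.65])
class_id = 1 if softmax_probs[1] >= threshold else 0
score = softmax_probs[1] if class_id == 1 else 1 - softmax_probs[1]
print(class_id, score)  # 1 0.65

# MultiLabelThreshOutput: keep every index clearing the threshold
sigmoid_probs = np.array([0.8, 0.2, 0.55, 0.1])
kept = np.where(sigmoid_probs >= threshold)[0]
print(kept.tolist(), np.around(sigmoid_probs[kept], 5).tolist())  # [0, 2] [0.8, 0.55]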
+ +import os +import numpy as np +import paddle +import paddle.nn.functional as F + + +class Topk(object): + def __init__(self, topk=1, class_id_map_file=None, delimiter=None): + assert isinstance(topk, (int, )) + self.topk = topk + self.delimiter = delimiter if delimiter is not None else " " + self.class_id_map = self.parse_class_id_map(class_id_map_file) + + def parse_class_id_map(self, class_id_map_file): + if class_id_map_file is None: + return None + if not os.path.exists(class_id_map_file): + print( + "Warning: If want to use your own label_dict, please input legal path!\nOtherwise label_names will be empty!" + ) + return None + + try: + class_id_map = {} + try: + with open(class_id_map_file, "r", encoding='utf-8') as fin: + lines = fin.readlines() + except Exception as e: + with open(class_id_map_file, "r", encoding='gbk') as fin: + lines = fin.readlines() + for line in lines: + partition = line.split("\n")[0].partition(self.delimiter) + class_id_map[int(partition[0])] = str(partition[-1]) + except Exception as ex: + print(ex) + class_id_map = None + return class_id_map + + def __call__(self, x, file_names=None): + if isinstance(x, dict): + x = x['logits'] + assert isinstance(x, paddle.Tensor) + if file_names is not None: + assert x.shape[0] == len(file_names) + x = F.softmax(x, axis=-1) + x = x.numpy() + y = [] + for idx, probs in enumerate(x): + index = probs.argsort(axis=0)[-self.topk:][::-1].astype( + "int32") + clas_id_list = [] + score_list = [] + label_name_list = [] + for i in index: + clas_id_list.append(i.item()) + score_list.append(probs[i].item()) + if self.class_id_map is not None: + label_name_list.append(self.class_id_map[i.item()]) + result = { + "class_ids": clas_id_list, + "scores": np.around( + score_list, decimals=5).tolist(), + } + if file_names is not None: + result["file_name"] = file_names[idx] + if label_name_list is not None: + result["label_names"] = label_name_list + y.append(result) + return y + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/__init__.py new file mode 100644 index 000000000..20451d7bf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/__init__.py @@ -0,0 +1,183 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
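Topk.__call__ above selects the k largest softmax probabilities per sample via argsort and reports them highest-first, optionally mapping class ids to names through class_id_map. A numpy sketch of the index selection, with invented probabilities:

import numpy as np

probs = np.array([0.05, 0.60, 0.25, 0.10])
topk = 2
index = probs.argsort(axis=0)[-topk:][::-1].astype("int32")
print(index.tolist())                                # [1, 2]
print(np.around(probs[index], decimals=5).tolist())  # [0.6, 0.25]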
+ +from ppcls.data.preprocess.ops.autoaugment import ImageNetPolicy as RawImageNetPolicy +from ppcls.data.preprocess.ops.randaugment import RandAugment as RawRandAugment +from ppcls.data.preprocess.ops.randaugment import RandomApply +from ppcls.data.preprocess.ops.randaugment import RandAugmentV2 as RawRandAugmentV2 +from ppcls.data.preprocess.ops.randaugment import RandAugmentV3 as RawRandAugmentV3 +from ppcls.data.preprocess.ops.randaugment import RandAugmentV4 as RawRandAugmentV4 +from ppcls.data.preprocess.ops.timm_autoaugment import RawTimmAutoAugment +from ppcls.data.preprocess.ops.cutout import Cutout + +from ppcls.data.preprocess.ops.hide_and_seek import HideAndSeek +from ppcls.data.preprocess.ops.random_erasing import RandomErasing +from ppcls.data.preprocess.ops.grid import GridMask + +from ppcls.data.preprocess.ops.operators import DecodeImage +from ppcls.data.preprocess.ops.operators import ResizeImage +from ppcls.data.preprocess.ops.operators import CropImage +from ppcls.data.preprocess.ops.operators import CropImageAtRatio +from ppcls.data.preprocess.ops.operators import CenterCrop, Resize +from ppcls.data.preprocess.ops.operators import RandCropImage +from ppcls.data.preprocess.ops.operators import RandCropImageV2 +from ppcls.data.preprocess.ops.operators import RandFlipImage +from ppcls.data.preprocess.ops.operators import NormalizeImage +from ppcls.data.preprocess.ops.operators import ToCHWImage +from ppcls.data.preprocess.ops.operators import AugMix +from ppcls.data.preprocess.ops.operators import Pad +from ppcls.data.preprocess.ops.operators import ToTensor +from ppcls.data.preprocess.ops.operators import Normalize +from ppcls.data.preprocess.ops.operators import RandomHorizontalFlip +from ppcls.data.preprocess.ops.operators import RandomResizedCrop +from ppcls.data.preprocess.ops.operators import CropWithPadding +from ppcls.data.preprocess.ops.operators import RandomInterpolationAugment +from ppcls.data.preprocess.ops.operators import ColorJitter +from ppcls.data.preprocess.ops.operators import RandomGrayscale +from ppcls.data.preprocess.ops.operators import RandomCropImage +from ppcls.data.preprocess.ops.operators import RandomRotation +from ppcls.data.preprocess.ops.operators import Padv2 +from ppcls.data.preprocess.ops.operators import RandomRot90 +from ppcls.data.preprocess.ops.operators import PCALighting +from .ops.operators import format_data +from paddle.vision.transforms import Pad as Pad_paddle_vision + +from ppcls.data.preprocess.batch_ops.batch_operators import MixupOperator, CutmixOperator, OpSampler, FmixOperator +from ppcls.data.preprocess.batch_ops.batch_operators import MixupCutmixHybrid + +import numpy as np +from PIL import Image +import random + + +def transform(data, ops=[]): + """ transform """ + for op in ops: + data = op(data) + return data + + +class AutoAugment(RawImageNetPolicy): + """ ImageNetPolicy wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugment(RawRandAugment): + """ RandAugment wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = 
Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugmentV2(RawRandAugmentV2): + """ RandAugmentV2 wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugmentV3(RawRandAugmentV3): + """ RandAugmentV3 wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class RandAugmentV4(RawRandAugmentV4): + """ RandAugmentV4 wrapper to auto fit different img types """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + + img = super().__call__(img) + + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img + + +class TimmAutoAugment(RawTimmAutoAugment): + """ TimmAutoAugment wrapper to auto fit different img tyeps. """ + + def __init__(self, prob=1.0, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prob = prob + + @format_data + def __call__(self, img): + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + if random.random() < self.prob: + img = super().__call__(img) + if isinstance(img, Image.Image): + img = np.asarray(img) + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/__init__.py @@ -0,0 +1 @@ + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/batch_operators.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/batch_operators.py new file mode 100644 index 000000000..0040bda42 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/batch_ops/batch_operators.py @@ -0,0 +1,501 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
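The AutoAugment / RandAugment* / TimmAutoAugment wrappers above all follow one adapter pattern: convert an incoming ndarray to a PIL image, run the raw policy, and convert back to ndarray so downstream ops keep working. A generic sketch of that pattern around an arbitrary PIL-only op (the rotate call is only a placeholder, not part of ppcls):

import numpy as np
from PIL import Image

def pil_adapter(op):
    def wrapped(img):
        if not isinstance(img, Image.Image):             # ndarray in ...
            img = Image.fromarray(np.ascontiguousarray(img))
        img = op(img)                                     # ... raw PIL-only op ...
        if isinstance(img, Image.Image):                  # ... ndarray out
            img = np.asarray(img)
        return img
    return wrapped

rot = pil_adapter(lambda im: im.rotate(90))               # placeholder PIL-only op
out = rot(np.zeros((4, 4, 3), dtype=np.uint8))
print(type(out).__name__, out.shape)                      # ndarray (4, 4, 3)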
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import random + +import numpy as np + +from ppcls.utils import logger +from ppcls.data.preprocess.ops.fmix import sample_mask + +import paddle +import paddle.nn.functional as F + + +class BatchOperator(object): + """ BatchOperator """ + + def __init__(self, *args, **kwargs): + pass + + def _unpack(self, batch): + """ _unpack """ + assert isinstance(batch, list), \ + 'batch should be a list filled with tuples (img, label)' + bs = len(batch) + assert bs > 0, 'size of the batch data should > 0' + #imgs, labels = list(zip(*batch)) + imgs = [] + labels = [] + for item in batch: + imgs.append(item[0]) + labels.append(item[1]) + return np.array(imgs), np.array(labels), bs + + def _one_hot(self, targets): + return np.eye(self.class_num, dtype="float32")[targets] + + def _mix_target(self, targets0, targets1, lam): + one_hots0 = self._one_hot(targets0) + one_hots1 = self._one_hot(targets1) + return one_hots0 * lam + one_hots1 * (1 - lam) + + def __call__(self, batch): + return batch + + +class MixupOperator(BatchOperator): + """ Mixup operator + reference: https://arxiv.org/abs/1710.09412 + + """ + + def __init__(self, class_num, alpha: float=1.): + """Build Mixup operator + + Args: + alpha (float, optional): The parameter alpha of mixup. Defaults to 1.. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Mixup should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"MixupOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + imgs = lam * imgs + (1 - lam) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class CutmixOperator(BatchOperator): + """ Cutmix operator + reference: https://arxiv.org/abs/1905.04899 + + """ + + def __init__(self, class_num, alpha=0.2): + """Build Cutmix operator + + Args: + alpha (float, optional): The parameter alpha of cutmix. Defaults to 0.2. + + Raises: + Exception: The value of parameter is illegal. + """ + if alpha <= 0: + raise Exception( + f"Parameter \"alpha\" of Cutmix should be greater than 0. \"alpha\": {alpha}." + ) + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"CutmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self.class_num = class_num + + def _rand_bbox(self, size, lam): + """ _rand_bbox """ + w = size[2] + h = size[3] + cut_rat = np.sqrt(1. 
- lam) + cut_w = int(w * cut_rat) + cut_h = int(h * cut_rat) + + # uniform + cx = np.random.randint(w) + cy = np.random.randint(h) + + bbx1 = np.clip(cx - cut_w // 2, 0, w) + bby1 = np.clip(cy - cut_h // 2, 0, h) + bbx2 = np.clip(cx + cut_w // 2, 0, w) + bby2 = np.clip(cy + cut_h // 2, 0, h) + + return bbx1, bby1, bbx2, bby2 + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + lam = np.random.beta(self._alpha, self._alpha) + + bbx1, bby1, bbx2, bby2 = self._rand_bbox(imgs.shape, lam) + imgs[:, :, bbx1:bbx2, bby1:bby2] = imgs[idx, :, bbx1:bbx2, bby1:bby2] + lam = 1 - (float(bbx2 - bbx1) * (bby2 - bby1) / + (imgs.shape[-2] * imgs.shape[-1])) + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class FmixOperator(BatchOperator): + """ Fmix operator + reference: https://arxiv.org/abs/2002.12047 + + """ + + def __init__(self, + class_num, + alpha=1, + decay_power=3, + max_soft=0., + reformulate=False): + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"FmixOperator\"." + logger.error(Exception(msg)) + raise Exception(msg) + + self._alpha = alpha + self._decay_power = decay_power + self._max_soft = max_soft + self._reformulate = reformulate + self.class_num = class_num + + def __call__(self, batch): + imgs, labels, bs = self._unpack(batch) + idx = np.random.permutation(bs) + size = (imgs.shape[2], imgs.shape[3]) + lam, mask = sample_mask(self._alpha, self._decay_power, \ + size, self._max_soft, self._reformulate) + imgs = mask * imgs + (1 - mask) * imgs[idx] + targets = self._mix_target(labels, labels[idx], lam) + return list(zip(imgs, targets)) + + +class OpSampler(object): + """ Sample a operator from """ + + def __init__(self, class_num, **op_dict): + """Build OpSampler + + Raises: + Exception: The parameter \"prob\" of operator(s) are be set error. + """ + if not class_num: + msg = "Please set \"Arch.class_num\" in config if use \"OpSampler\"." + logger.error(Exception(msg)) + raise Exception(msg) + + if len(op_dict) < 1: + msg = f"ConfigWarning: No operator in \"OpSampler\". \"OpSampler\" has been skipped." + logger.warning(msg) + + self.ops = {} + total_prob = 0 + for op_name in op_dict: + param = op_dict[op_name] + if "prob" not in param: + msg = f"ConfigWarning: Parameter \"prob\" should be set when use operator in \"OpSampler\". The operator \"{op_name}\"'s prob has been set \"0\"." + logger.warning(msg) + prob = param.pop("prob", 0) + total_prob += prob + param.update({"class_num": class_num}) + op = eval(op_name)(**param) + self.ops.update({op: prob}) + + if total_prob > 1: + msg = f"ConfigError: The total prob of operators in \"OpSampler\" should be less 1." + logger.error(Exception(msg)) + raise Exception(msg) + + # add "None Op" when total_prob < 1, "None Op" do nothing + self.ops[None] = 1 - total_prob + + def __call__(self, batch): + op = random.choices( + list(self.ops.keys()), weights=list(self.ops.values()), k=1)[0] + # return batch directly when None Op + return op(batch) if op else batch + + +class MixupCutmixHybrid(object): + """ Mixup/Cutmix that applies different params to each element or whole batch + + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. 
+ prob (float): probability of applying mixup or cutmix per batch or element + switch_prob (float): probability of switching to cutmix instead of mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders + label_smoothing (float): apply label smoothing to the mixed target tensor + num_classes (int): number of classes for target + """ + + def __init__(self, + mixup_alpha=1., + cutmix_alpha=0., + cutmix_minmax=None, + prob=1.0, + switch_prob=0.5, + mode='batch', + correct_lam=True, + label_smoothing=0.1, + num_classes=4): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix + self.mixup_enabled = True # set to false to disable mixing (intended tp be set by train loop) + + def _one_hot(self, x, num_classes, on_value=1., off_value=0.): + x = paddle.cast(x, dtype='int64') + on_value = paddle.full([x.shape[0], num_classes], on_value) + off_value = paddle.full([x.shape[0], num_classes], off_value) + return paddle.where( + F.one_hot(x, num_classes) == 1, on_value, off_value) + + def _mixup_target(self, target, num_classes, lam=1., smoothing=0.0): + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = self._one_hot( + target, + num_classes, + on_value=on_value, + off_value=off_value, ) + y2 = self._one_hot( + target.flip(0), + num_classes, + on_value=on_value, + off_value=off_value) + return y1 * lam + y2 * (1. - lam) + + def _rand_bbox(self, img_shape, lam, margin=0., count=None): + """ Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + def _rand_bbox_minmax(self, img_shape, minmax, count=None): + """ Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. 
+ + Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint( + int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) + cut_w = np.random.randint( + int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + def _cutmix_bbox_and_lam(self, + img_shape, + lam, + ratio_minmax=None, + correct_lam=True, + count=None): + """ Generate bbox and apply lambda correction. + """ + if ratio_minmax is not None: + yl, yu, xl, xu = self._rand_bbox_minmax( + img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = self._rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta( + self.cutmix_alpha, self.cutmix_alpha, size=batch_size), + np.random.beta( + self.mixup_alpha, self.mixup_alpha, size=batch_size)) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta( + self.mixup_alpha, self.mixup_alpha, size=batch_size) + elif self.cutmix_alpha > 0.: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta( + self.cutmix_alpha, self.cutmix_alpha, size=batch_size) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = np.where( + np.random.rand(batch_size) < self.mix_prob, + lam_mix.astype(np.float32), lam) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1. + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ + np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." 
+ lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone( + ) # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = self._cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + if yl < yh and xl < xh: + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return paddle.to_tensor(lam_batch, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone( + ) # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = self._cutmix_bbox_and_lam( + x[i].shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + if yl < yh and xl < xh: + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return paddle.to_tensor(lam_batch, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = self._cutmix_bbox_and_lam( + x.shape, + lam, + ratio_minmax=self.cutmix_minmax, + correct_lam=self.correct_lam) + if yl < yh and xl < xh: + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + + else: + x_flipped = x.flip(0) * (1. 
- lam) + x[:] = x * lam + x_flipped + return lam + + def _unpack(self, batch): + """ _unpack """ + assert isinstance(batch, list), \ + 'batch should be a list filled with tuples (img, label)' + bs = len(batch) + assert bs > 0, 'size of the batch data should > 0' + #imgs, labels = list(zip(*batch)) + imgs = [] + labels = [] + for item in batch: + imgs.append(item[0]) + labels.append(item[1]) + return np.array(imgs), np.array(labels), bs + + def __call__(self, batch): + x, target, bs = self._unpack(batch) + x = paddle.to_tensor(x) + target = paddle.to_tensor(target) + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = self._mixup_target(target, self.num_classes, lam, + self.label_smoothing) + + return list(zip(x.numpy(), target.numpy())) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/__init__.py @@ -0,0 +1 @@ + diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/autoaugment.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/autoaugment.py new file mode 100644 index 000000000..43327950f --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/autoaugment.py @@ -0,0 +1,265 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/DeepVoltaire/AutoAugment/blob/master/autoaugment.py +# reference: https://arxiv.org/abs/1805.09501 + +from PIL import Image, ImageEnhance, ImageOps +import numpy as np +import random + + +class ImageNetPolicy(object): + """ Randomly choose one of the best 24 Sub-policies on ImageNet. 
+ + Example: + >>> policy = ImageNetPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> ImageNetPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.4, "posterize", 8, 0.6, "rotate", 9, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "posterize", 7, 0.6, "posterize", 6, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.4, "equalize", 4, 0.8, "rotate", 8, fillcolor), + SubPolicy(0.6, "solarize", 3, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.8, "posterize", 5, 1.0, "equalize", 2, fillcolor), + SubPolicy(0.2, "rotate", 3, 0.6, "solarize", 8, fillcolor), + SubPolicy(0.6, "equalize", 8, 0.4, "posterize", 6, fillcolor), + SubPolicy(0.8, "rotate", 8, 0.4, "color", 0, fillcolor), + SubPolicy(0.4, "rotate", 9, 0.6, "equalize", 2, fillcolor), + SubPolicy(0.0, "equalize", 7, 0.8, "equalize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "rotate", 8, 1.0, "color", 2, fillcolor), + SubPolicy(0.8, "color", 8, 0.8, "solarize", 7, fillcolor), + SubPolicy(0.4, "sharpness", 7, 0.6, "invert", 8, fillcolor), + SubPolicy(0.6, "shearX", 5, 1.0, "equalize", 9, fillcolor), + SubPolicy(0.4, "color", 0, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.4, "equalize", 7, 0.2, "solarize", 4, fillcolor), + SubPolicy(0.6, "solarize", 5, 0.6, "autocontrast", 5, fillcolor), + SubPolicy(0.6, "invert", 4, 1.0, "equalize", 8, fillcolor), + SubPolicy(0.6, "color", 4, 1.0, "contrast", 8, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.6, "equalize", 3, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment ImageNet Policy" + + +class CIFAR10Policy(object): + """ Randomly choose one of the best 25 Sub-policies on CIFAR10. 
+ + Example: + >>> policy = CIFAR10Policy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> CIFAR10Policy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.1, "invert", 7, 0.2, "contrast", 6, fillcolor), + SubPolicy(0.7, "rotate", 2, 0.3, "translateX", 9, fillcolor), + SubPolicy(0.8, "sharpness", 1, 0.9, "sharpness", 3, fillcolor), + SubPolicy(0.5, "shearY", 8, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.5, "autocontrast", 8, 0.9, "equalize", 2, fillcolor), + SubPolicy(0.2, "shearY", 7, 0.3, "posterize", 7, fillcolor), + SubPolicy(0.4, "color", 3, 0.6, "brightness", 7, fillcolor), + SubPolicy(0.3, "sharpness", 9, 0.7, "brightness", 9, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.5, "equalize", 1, fillcolor), + SubPolicy(0.6, "contrast", 7, 0.6, "sharpness", 5, fillcolor), + SubPolicy(0.7, "color", 7, 0.5, "translateX", 8, fillcolor), + SubPolicy(0.3, "equalize", 7, 0.4, "autocontrast", 8, fillcolor), + SubPolicy(0.4, "translateY", 3, 0.2, "sharpness", 6, fillcolor), + SubPolicy(0.9, "brightness", 6, 0.2, "color", 8, fillcolor), + SubPolicy(0.5, "solarize", 2, 0.0, "invert", 3, fillcolor), + SubPolicy(0.2, "equalize", 0, 0.6, "autocontrast", 0, fillcolor), + SubPolicy(0.2, "equalize", 8, 0.8, "equalize", 4, fillcolor), + SubPolicy(0.9, "color", 9, 0.6, "equalize", 6, fillcolor), + SubPolicy(0.8, "autocontrast", 4, 0.2, "solarize", 8, fillcolor), + SubPolicy(0.1, "brightness", 3, 0.7, "color", 0, fillcolor), + SubPolicy(0.4, "solarize", 5, 0.9, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "translateY", 9, 0.7, "translateY", 9, fillcolor), + SubPolicy(0.9, "autocontrast", 2, 0.8, "solarize", 3, fillcolor), + SubPolicy(0.8, "equalize", 8, 0.1, "invert", 3, fillcolor), + SubPolicy(0.7, "translateY", 9, 0.9, "autocontrast", 1, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment CIFAR10 Policy" + + +class SVHNPolicy(object): + """ Randomly choose one of the best 25 Sub-policies on SVHN. 
+ + Example: + >>> policy = SVHNPolicy() + >>> transformed = policy(image) + + Example as a PyTorch Transform: + >>> transform=transforms.Compose([ + >>> transforms.Resize(256), + >>> SVHNPolicy(), + >>> transforms.ToTensor()]) + """ + + def __init__(self, fillcolor=(128, 128, 128)): + self.policies = [ + SubPolicy(0.9, "shearX", 4, 0.2, "invert", 3, fillcolor), + SubPolicy(0.9, "shearY", 8, 0.7, "invert", 5, fillcolor), + SubPolicy(0.6, "equalize", 5, 0.6, "solarize", 6, fillcolor), + SubPolicy(0.9, "invert", 3, 0.6, "equalize", 3, fillcolor), + SubPolicy(0.6, "equalize", 1, 0.9, "rotate", 3, fillcolor), + SubPolicy(0.9, "shearX", 4, 0.8, "autocontrast", 3, fillcolor), + SubPolicy(0.9, "shearY", 8, 0.4, "invert", 5, fillcolor), + SubPolicy(0.9, "shearY", 5, 0.2, "solarize", 6, fillcolor), + SubPolicy(0.9, "invert", 6, 0.8, "autocontrast", 1, fillcolor), + SubPolicy(0.6, "equalize", 3, 0.9, "rotate", 3, fillcolor), + SubPolicy(0.9, "shearX", 4, 0.3, "solarize", 3, fillcolor), + SubPolicy(0.8, "shearY", 8, 0.7, "invert", 4, fillcolor), + SubPolicy(0.9, "equalize", 5, 0.6, "translateY", 6, fillcolor), + SubPolicy(0.9, "invert", 4, 0.6, "equalize", 7, fillcolor), + SubPolicy(0.3, "contrast", 3, 0.8, "rotate", 4, fillcolor), + SubPolicy(0.8, "invert", 5, 0.0, "translateY", 2, fillcolor), + SubPolicy(0.7, "shearY", 6, 0.4, "solarize", 8, fillcolor), + SubPolicy(0.6, "invert", 4, 0.8, "rotate", 4, fillcolor), + SubPolicy( + 0.3, "shearY", 7, 0.9, "translateX", 3, fillcolor), SubPolicy( + 0.1, "shearX", 6, 0.6, "invert", 5, fillcolor), SubPolicy( + 0.7, "solarize", 2, 0.6, "translateY", 7, + fillcolor), SubPolicy(0.8, "shearY", 4, 0.8, "invert", + 8, fillcolor), SubPolicy( + 0.7, "shearX", 9, 0.8, + "translateY", 3, + fillcolor), SubPolicy( + 0.8, "shearY", 5, 0.7, + "autocontrast", 3, + fillcolor), + SubPolicy(0.7, "shearX", 2, 0.1, "invert", 5, fillcolor) + ] + + def __call__(self, img, policy_idx=None): + if policy_idx is None or not isinstance(policy_idx, int): + policy_idx = random.randint(0, len(self.policies) - 1) + else: + policy_idx = policy_idx % len(self.policies) + return self.policies[policy_idx](img) + + def __repr__(self): + return "AutoAugment SVHN Policy" + + +class SubPolicy(object): + def __init__(self, + p1, + operation1, + magnitude_idx1, + p2, + operation2, + magnitude_idx2, + fillcolor=(128, 128, 128)): + ranges = { + "shearX": np.linspace(0, 0.3, 10), + "shearY": np.linspace(0, 0.3, 10), + "translateX": np.linspace(0, 150 / 331, 10), + "translateY": np.linspace(0, 150 / 331, 10), + "rotate": np.linspace(0, 30, 10), + "color": np.linspace(0.0, 0.9, 10), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int_), + "solarize": np.linspace(256, 0, 10), + "contrast": np.linspace(0.0, 0.9, 10), + "sharpness": np.linspace(0.0, 0.9, 10), + "brightness": np.linspace(0.0, 0.9, 10), + "autocontrast": [0] * 10, + "equalize": [0] * 10, + "invert": [0] * 10 + } + + # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + func = { + "shearX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), 
+ Image.BICUBIC, fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])), + "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img) + } + + self.p1 = p1 + self.operation1 = func[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + self.p2 = p2 + self.operation2 = func[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + if random.random() < self.p1: + img = self.operation1(img, self.magnitude1) + if random.random() < self.p2: + img = self.operation2(img, self.magnitude2) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/cutout.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/cutout.py new file mode 100644 index 000000000..7519ce844 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/cutout.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# This code is based on https://github.com/uoguelph-mlrg/Cutout
+# reference: https://arxiv.org/abs/1708.04552
+
+import random
+
+import numpy as np
+import cv2
+
+
+class Cutout(object):
+    def __init__(self, n_holes=1, length=112, fill_value=(0, 0, 0)):
+        self.n_holes = n_holes
+        self.length = length
+        if fill_value == 'none' or fill_value is None:
+            self.fill_value = None
+        else:
+            self.fill_value = fill_value
+
+    def __call__(self, img):
+        """ cutout_image """
+        h, w = img.shape[:2]
+
+        for n in range(self.n_holes):
+            y = np.random.randint(h)
+            x = np.random.randint(w)
+
+            y1 = np.clip(y - self.length // 2, 0, h)
+            y2 = np.clip(y + self.length // 2, 0, h)
+            x1 = np.clip(x - self.length // 2, 0, w)
+            x2 = np.clip(x + self.length // 2, 0, w)
+
+            fill_value = self.fill_value
+            if fill_value is None:
+                if img.ndim == 2:
+                    fill_value = random.randint(0, 255)
+                else:
+                    fill_value = [random.randint(0, 255),
+                                  random.randint(0, 255),
+                                  random.randint(0, 255)]
+
+            img = cv2.rectangle(np.array(img), (x1, y1), (x2, y2), fill_value, -1)
+
+        return img
diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/dali_operators.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/dali_operators.py
new file mode 100644
index 000000000..baf4b087c
--- /dev/null
+++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/dali_operators.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
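+# The classes below are thin wrappers around nvidia.dali ops so that the
+# preprocessing op names used elsewhere in ppcls (DecodeImage, ResizeImage,
+# RandFlipImage, ColorJitter, ...) have DALI-backed equivalents; random
+# decisions such as flips, color jitter and rotation are sampled with DALI's
+# own random ops, so they are evaluated inside the DALI pipeline rather than
+# in Python.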
+ +from __future__ import division + +import nvidia.dali.fn as fn +import nvidia.dali.ops as ops +import nvidia.dali.types as types + + +class DecodeImage(ops.decoders.Image): + def __init__(self, *kargs, device="cpu", **kwargs): + super(DecodeImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(DecodeImage, self).__call__(data, **kwargs) + + +class ToCHWImage(ops.Transpose): + def __init__(self, *kargs, device="cpu", **kwargs): + super(ToCHWImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(ToCHWImage, self).__call__(data, **kwargs) + + +class ColorJitter(ops.ColorTwist): + def __init__(self, + *kargs, + device="cpu", + prob=1.0, + brightness_factor=0.0, + contrast_factor=0.0, + saturation_factor=0.0, + hue_factor=0.0, + **kwargs): + super(ColorJitter, self).__init__(*kargs, device=device, **kwargs) + self.brightness_factor = brightness_factor + self.contrast_factor = contrast_factor + self.saturation_factor = saturation_factor + self.hue_factor = hue_factor + self.rng = ops.random.CoinFlip(probability=prob) + + def __call__(self, data, **kwargs): + do_jitter = self.rng() + brightness = fn.random.uniform( + range=(max(0, 1 - self.brightness_factor), + 1 + self.brightness_factor)) * do_jitter + contrast = fn.random.uniform( + range=(max(0, 1 - self.contrast_factor), + 1 + self.contrast_factor)) * do_jitter + saturation = fn.random.uniform( + range=(max(0, 1 - self.saturation_factor), + 1 + self.saturation_factor)) * do_jitter + hue = fn.random.uniform(range=(-self.hue_factor, + self.hue_factor)) * do_jitter + return super(ColorJitter, self).__call__( + data, + brightness=brightness, + contrast=contrast, + saturation=saturation, + hue=hue, + **kwargs) + + +class DecodeRandomResizedCrop(ops.decoders.ImageRandomCrop): + def __init__(self, + *kargs, + device="cpu", + resize_x=224, + resize_y=224, + resize_short=None, + interp_type=types.DALIInterpType.INTERP_LINEAR, + **kwargs): + super(DecodeRandomResizedCrop, self).__init__( + *kargs, device=device, **kwargs) + if resize_short is None: + self.resize = ops.Resize( + device="gpu" if device == "mixed" else "cpu", + resize_x=resize_x, + resize_y=resize_y, + interp_type=interp_type) + else: + self.resize = ops.Resize( + device="gpu" if device == "mixed" else "cpu", + resize_short=resize_short, + interp_type=interp_type) + + def __call__(self, data, **kwargs): + data = super(DecodeRandomResizedCrop, self).__call__(data, **kwargs) + data = self.resize(data) + return data + + +class CropMirrorNormalize(ops.CropMirrorNormalize): + def __init__(self, *kargs, device="cpu", prob=0.5, **kwargs): + super(CropMirrorNormalize, self).__init__( + *kargs, device=device, **kwargs) + self.rng = ops.random.CoinFlip(probability=prob) + + def __call__(self, data, **kwargs): + do_mirror = self.rng() + return super(CropMirrorNormalize, self).__call__( + data, mirror=do_mirror, **kwargs) + + +class RandCropImage(ops.RandomResizedCrop): + def __init__(self, *kargs, device="cpu", **kwargs): + super(RandCropImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(RandCropImage, self).__call__(data, **kwargs) + + +class CropImage(ops.Crop): + def __init__(self, *kargs, device="cpu", **kwargs): + super(CropImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(CropImage, self).__call__(data, **kwargs) + + +class ResizeImage(ops.Resize): + def 
__init__(self, *kargs, device="cpu", **kwargs): + super(ResizeImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(ResizeImage, self).__call__(data, **kwargs) + + +class RandFlipImage(ops.Flip): + def __init__(self, *kargs, device="cpu", prob=0.5, flip_code=1, **kwargs): + super(RandFlipImage, self).__init__(*kargs, device=device, **kwargs) + self.flip_code = flip_code + self.rng = ops.random.CoinFlip(probability=prob) + + def __call__(self, data, **kwargs): + do_flip = self.rng() + if self.flip_code == 1: + return super(RandFlipImage, self).__call__( + data, horizontal=do_flip, vertical=0, **kwargs) + elif self.flip_code == 0: + return super(RandFlipImage, self).__call__( + data, horizontal=0, vertical=do_flip, **kwargs) + else: + return super(RandFlipImage, self).__call__( + data, horizontal=do_flip, vertical=do_flip, **kwargs) + + +class Pad(ops.Crop): + """ + use ops.Crop to implement Pad operator, for ops.Pad alwayls only pad in right and bottom. + """ + + def __init__(self, *kargs, device="cpu", **kwargs): + super(Pad, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(Pad, self).__call__(data, **kwargs) + + +class RandCropImageV2(ops.Crop): + def __init__(self, *kargs, device="cpu", **kwargs): + super(RandCropImageV2, self).__init__(*kargs, device=device, **kwargs) + self.rng_x = ops.random.Uniform(range=(0.0, 1.0)) + self.rng_y = ops.random.Uniform(range=(0.0, 1.0)) + + def __call__(self, data, **kwargs): + pos_x = self.rng_x() + pos_y = self.rng_y() + return super(RandCropImageV2, self).__call__( + data, crop_pos_x=pos_x, crop_pos_y=pos_y, **kwargs) + + +class RandomCropImage(ops.Crop): + def __init__(self, *kargs, device="cpu", **kwargs): + super(RandomCropImage, self).__init__(*kargs, device=device, **kwargs) + self.rng_x = ops.random.Uniform(range=(0.0, 1.0)) + self.rng_y = ops.random.Uniform(range=(0.0, 1.0)) + + def __call__(self, data, **kwargs): + pos_x = self.rng_x() + pos_y = self.rng_y() + return super(RandomCropImage, self).__call__( + data, crop_pos_x=pos_x, crop_pos_y=pos_y, **kwargs) + + +class RandomRotation(ops.Rotate): + def __init__(self, *kargs, device="cpu", prob=0.5, angle=0, **kwargs): + super(RandomRotation, self).__init__(*kargs, device=device, **kwargs) + self.rng = ops.random.CoinFlip(probability=prob) + discrete_angle = list(range(-angle, angle + 1)) + self.rng_angle = ops.random.Uniform(values=discrete_angle) + + def __call__(self, data, **kwargs): + do_rotate = self.rng() + angle = self.rng_angle() + flip_data = super(RandomRotation, self).__call__( + data, + angle=do_rotate * angle, + keep_size=True, + fill_value=0, + **kwargs) + return flip_data + + +class RandomRot90(ops.Rotate): + def __init__(self, *kargs, device="cpu", **kwargs): + super(RandomRot90, self).__init__(*kargs, device=device, **kwargs) + self.rng_angle = ops.random.Uniform(values=[0.0, 1.0, 2.0, 3.0]) + + def __call__(self, data, **kwargs): + angle = self.rng_angle() * 90.0 + flip_data = super(RandomRot90, self).__call__( + data, angle=angle, keep_size=True, fill_value=0, **kwargs) + return flip_data + + +class NormalizeImage(ops.Normalize): + def __init__(self, *kargs, device="cpu", **kwargs): + super(NormalizeImage, self).__init__(*kargs, device=device, **kwargs) + + def __call__(self, data, **kwargs): + return super(NormalizeImage, self).__call__(data, **kwargs) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/fmix.py 
b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/fmix.py new file mode 100644 index 000000000..019f618c5 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/fmix.py @@ -0,0 +1,220 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/ecs-vlc/FMix +# reference: https://arxiv.org/abs/2002.12047 + +import math +import random + +import numpy as np +from scipy.stats import beta + + +def fftfreqnd(h, w=None, z=None): + """ Get bin values for discrete fourier transform of size (h, w, z) + + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + fz = fx = 0 + fy = np.fft.fftfreq(h) + + if w is not None: + fy = np.expand_dims(fy, -1) + + if w % 2 == 1: + fx = np.fft.fftfreq(w)[:w // 2 + 2] + else: + fx = np.fft.fftfreq(w)[:w // 2 + 1] + + if z is not None: + fy = np.expand_dims(fy, -1) + if z % 2 == 1: + fz = np.fft.fftfreq(z)[:, None] + else: + fz = np.fft.fftfreq(z)[:, None] + + return np.sqrt(fx * fx + fy * fy + fz * fz) + + +def get_spectrum(freqs, decay_power, ch, h, w=0, z=0): + """ Samples a fourier image with given size and frequencies decayed by decay power + + :param freqs: Bin values for the discrete fourier transform + :param decay_power: Decay power for frequency decay prop 1/f**d + :param ch: Number of channels for the resulting mask + :param h: Required, first dimension size + :param w: Optional, second dimension size + :param z: Optional, third dimension size + """ + scale = np.ones(1) / (np.maximum(freqs, np.array([1. / max(w, h, z)])) + **decay_power) + + param_size = [ch] + list(freqs.shape) + [2] + param = np.random.randn(*param_size) + + scale = np.expand_dims(scale, -1)[None, :] + + return scale * param + + +def make_low_freq_image(decay, shape, ch=1): + """ Sample a low frequency image from fourier space + + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param ch: Number of channels for desired mask + """ + freqs = fftfreqnd(*shape) + spectrum = get_spectrum(freqs, decay, ch, + *shape) #.reshape((1, *shape[:-1], -1)) + spectrum = spectrum[:, 0] + 1j * spectrum[:, 1] + mask = np.real(np.fft.irfftn(spectrum, shape)) + + if len(shape) == 1: + mask = mask[:1, :shape[0]] + if len(shape) == 2: + mask = mask[:1, :shape[0], :shape[1]] + if len(shape) == 3: + mask = mask[:1, :shape[0], :shape[1], :shape[2]] + + mask = mask + mask = (mask - mask.min()) + mask = mask / mask.max() + return mask + + +def sample_lam(alpha, reformulate=False): + """ Sample a lambda from symmetric beta distribution with given alpha + + :param alpha: Alpha value for beta distribution + :param reformulate: If True, uses the reformulation of [1]. 
+ """ + if reformulate: + lam = beta.rvs(alpha + 1, alpha) + else: + lam = beta.rvs(alpha, alpha) + + return lam + + +def binarise_mask(mask, lam, in_shape, max_soft=0.0): + """ Binarises a given low frequency image such that it has mean lambda. + + :param mask: Low frequency image, usually the result of `make_low_freq_image` + :param lam: Mean value of final mask + :param in_shape: Shape of inputs + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :return: + """ + idx = mask.reshape(-1).argsort()[::-1] + mask = mask.reshape(-1) + num = math.ceil(lam * mask.size) if random.random() > 0.5 else math.floor( + lam * mask.size) + + eff_soft = max_soft + if max_soft > lam or max_soft > (1 - lam): + eff_soft = min(lam, 1 - lam) + + soft = int(mask.size * eff_soft) + num_low = int(num - soft) + num_high = int(num + soft) + + mask[idx[:num_high]] = 1 + mask[idx[num_low:]] = 0 + mask[idx[num_low:num_high]] = np.linspace(1, 0, (num_high - num_low)) + + mask = mask.reshape((1, 1, in_shape[0], in_shape[1])) + return mask + + +def sample_mask(alpha, decay_power, shape, max_soft=0.0, reformulate=False): + """ Samples a mean lambda from beta distribution parametrised by alpha, creates a low frequency image and binarises + it based on this lambda + + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + """ + if isinstance(shape, int): + shape = (shape, ) + + # Choose lambda + lam = sample_lam(alpha, reformulate) + + # Make mask, get mean / std + mask = make_low_freq_image(decay_power, shape) + mask = binarise_mask(mask, lam, shape, max_soft) + + return float(lam), mask + + +def sample_and_apply(x, + alpha, + decay_power, + shape, + max_soft=0.0, + reformulate=False): + """ + + :param x: Image batch on which to apply fmix of shape [b, c, shape*] + :param alpha: Alpha value for beta distribution from which to sample mean of mask + :param decay_power: Decay power for frequency decay prop 1/f**d + :param shape: Shape of desired mask, list up to 3 dims + :param max_soft: Softening value between 0 and 0.5 which smooths hard edges in the mask. + :param reformulate: If True, uses the reformulation of [1]. + :return: mixed input, permutation indices, lambda value of mix, + """ + lam, mask = sample_mask(alpha, decay_power, shape, max_soft, reformulate) + index = np.random.permutation(x.shape[0]) + + x1, x2 = x * mask, x[index] * (1 - mask) + return x1 + x2, index, lam + + +class FMixBase: + """ FMix augmentation + + Args: + decay_power (float): Decay power for frequency decay prop 1/f**d + alpha (float): Alpha value for beta distribution from which to sample mean of mask + size ([int] | [int, int] | [int, int, int]): Shape of desired mask, list up to 3 dims + max_soft (float): Softening value between 0 and 0.5 which smooths hard edges in the mask. + reformulate (bool): If True, uses the reformulation of [1]. 
+ """ + + def __init__(self, + decay_power=3, + alpha=1, + size=(32, 32), + max_soft=0.0, + reformulate=False): + super().__init__() + self.decay_power = decay_power + self.reformulate = reformulate + self.size = size + self.alpha = alpha + self.max_soft = max_soft + self.index = None + self.lam = None + + def __call__(self, x): + raise NotImplementedError + + def loss(self, *args, **kwargs): + raise NotImplementedError diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/functional.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/functional.py new file mode 100644 index 000000000..9f1369eef --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/functional.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# encoding: utf-8 + +import numpy as np +from PIL import Image, ImageOps, ImageEnhance + + + +def int_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval . + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + An int that results from scaling `maxval` according to `level`. + """ + return int(level * maxval / 10) + + +def float_parameter(level, maxval): + """Helper function to scale `val` between 0 and maxval. + Args: + level: Level of the operation that will be between [0, `PARAMETER_MAX`]. + maxval: Maximum value that the operation can have. This will be scaled to + level/PARAMETER_MAX. + Returns: + A float that results from scaling `maxval` according to `level`. + """ + return float(level) * maxval / 10. 
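+
+# Example: both helpers map a level on the 0..10 scale linearly onto [0, maxval],
+# e.g. int_parameter(5, 30) == 15 (rotation degrees) and
+# float_parameter(5, 0.3) == 0.15 (shear factor); sample_level() below draws the
+# level uniformly from [0.1, n), so each op's magnitude is randomized per call.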
+ + +def sample_level(n): + return np.random.uniform(low=0.1, high=n) + + +def autocontrast(pil_img, *args): + return ImageOps.autocontrast(pil_img) + + +def equalize(pil_img, *args): + return ImageOps.equalize(pil_img) + + +def posterize(pil_img, level, *args): + level = int_parameter(sample_level(level), 4) + return ImageOps.posterize(pil_img, 4 - level) + + +def rotate(pil_img, level, *args): + degrees = int_parameter(sample_level(level), 30) + if np.random.uniform() > 0.5: + degrees = -degrees + return pil_img.rotate(degrees, resample=Image.BILINEAR) + + +def solarize(pil_img, level, *args): + level = int_parameter(sample_level(level), 256) + return ImageOps.solarize(pil_img, 256 - level) + + +def shear_x(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, level, 0, 0, 1, 0), + resample=Image.BILINEAR) + + +def shear_y(pil_img, level): + level = float_parameter(sample_level(level), 0.3) + if np.random.uniform() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, level, 1, 0), + resample=Image.BILINEAR) + + +def translate_x(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[0] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, level, 0, 1, 0), + resample=Image.BILINEAR) + + +def translate_y(pil_img, level): + level = int_parameter(sample_level(level), pil_img.size[1] / 3) + if np.random.random() > 0.5: + level = -level + return pil_img.transform(pil_img.size, + Image.AFFINE, (1, 0, 0, 0, 1, level), + resample=Image.BILINEAR) + + +# operation that overlaps with ImageNet-C's test set +def color(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Color(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def contrast(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Contrast(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def brightness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Brightness(pil_img).enhance(level) + + +# operation that overlaps with ImageNet-C's test set +def sharpness(pil_img, level, *args): + level = float_parameter(sample_level(level), 1.8) + 0.1 + return ImageEnhance.Sharpness(pil_img).enhance(level) + + +augmentations = [ + autocontrast, equalize, posterize, rotate, solarize, shear_x, shear_y, + translate_x, translate_y +] diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/grid.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/grid.py new file mode 100644 index 000000000..1a9a76d86 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/grid.py @@ -0,0 +1,90 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/akuxcw/GridMask +# reference: https://arxiv.org/abs/2001.04086. + +import numpy as np +from PIL import Image +import pdb + +# curr +CURR_EPOCH = 0 +# epoch for the prob to be the upper limit +NUM_EPOCHS = 240 + + +class GridMask(object): + def __init__(self, d1=96, d2=224, rotate=1, ratio=0.5, mode=0, prob=1.): + self.d1 = d1 + self.d2 = d2 + self.rotate = rotate + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + self.last_prob = -1 + + def set_prob(self): + global CURR_EPOCH + global NUM_EPOCHS + self.prob = self.st_prob * min(1, 1.0 * CURR_EPOCH / NUM_EPOCHS) + + def __call__(self, img): + self.set_prob() + if abs(self.last_prob - self.prob) > 1e-10: + global CURR_EPOCH + global NUM_EPOCHS + print( + "self.prob is updated, self.prob={}, CURR_EPOCH: {}, NUM_EPOCHS: {}". + format(self.prob, CURR_EPOCH, NUM_EPOCHS)) + self.last_prob = self.prob + # print("CURR_EPOCH: {}, NUM_EPOCHS: {}, self.prob is set as: {}".format(CURR_EPOCH, NUM_EPOCHS, self.prob) ) + if np.random.rand() > self.prob: + return img + _, h, w = img.shape + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(self.d1, self.d2) + #d = self.d + self.l = int(d * self.ratio + 0.5) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + for i in range(-1, hh // d + 1): + s = d * i + st_h + t = s + self.l + s = max(min(s, hh), 0) + t = max(min(t, hh), 0) + mask[s:t, :] *= 0 + for i in range(-1, ww // d + 1): + s = d * i + st_w + t = s + self.l + s = max(min(s, ww), 0) + t = max(min(t, ww), 0) + mask[:, s:t] *= 0 + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // + 2 + w] + + if self.mode == 1: + mask = 1 - mask + + mask = np.expand_dims(mask, axis=0) + img = (img * mask).astype(img.dtype) + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/hide_and_seek.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/hide_and_seek.py new file mode 100644 index 000000000..16fc671cf --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/hide_and_seek.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This code is based on https://github.com/kkanshul/Hide-and-Seek +# reference: http://krsingh.cs.ucdavis.edu/krishna_files/papers/hide_and_seek/my_files/iccv2017.pdf + +import numpy as np +import random + + +class HideAndSeek(object): + def __init__(self): + # possible grid size, 0 means no hiding + self.grid_sizes = [0, 16, 32, 44, 56] + # hiding probability + self.hide_prob = 0.5 + + def __call__(self, img): + # randomly choose one grid size + grid_size = np.random.choice(self.grid_sizes) + + _, h, w = img.shape + + # hide the patches + if grid_size == 0: + return img + for x in range(0, w, grid_size): + for y in range(0, h, grid_size): + x_end = min(w, x + grid_size) + y_end = min(h, y + grid_size) + if (random.random() <= self.hide_prob): + img[:, x:x_end, y:y_end] = 0 + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/operators.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/operators.py new file mode 100644 index 000000000..22f1e1899 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/operators.py @@ -0,0 +1,920 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from functools import partial +import io +import six +import math +import random +import cv2 +import numpy as np +from PIL import Image, ImageOps, __version__ as PILLOW_VERSION +from paddle.vision.transforms import ColorJitter as RawColorJitter +from paddle.vision.transforms import CenterCrop, Resize +from paddle.vision.transforms import RandomRotation as RawRandomRotation +from paddle.vision.transforms import ToTensor, Normalize, RandomHorizontalFlip, RandomResizedCrop +from paddle.vision.transforms import functional as F +from .autoaugment import ImageNetPolicy +from .functional import augmentations +from ppcls.utils import logger + + +def format_data(func): + def warpper(self, data): + if isinstance(data, dict): + img = data["img"] + result = func(self, img) + if not isinstance(result, dict): + result = {"img": result} + return { ** data, ** result} + else: + result = func(self, data) + if isinstance(result, dict): + result = result["img"] + return result + + return warpper + + +class UnifiedResize(object): + def __init__(self, interpolation=None, backend="cv2", return_numpy=True): + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4, + 'random': (cv2.INTER_LINEAR, cv2.INTER_CUBIC) + } + _pil_interp_from_str = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING, + 'random': (Image.BILINEAR, Image.BICUBIC) + } + + def _cv2_resize(src, size, resample): + if 
isinstance(resample, tuple): + resample = random.choice(resample) + return cv2.resize(src, size, interpolation=resample) + + def _pil_resize(src, size, resample, return_numpy=True): + if isinstance(resample, tuple): + resample = random.choice(resample) + if isinstance(src, np.ndarray): + pil_img = Image.fromarray(src) + else: + pil_img = src + pil_img = pil_img.resize(size, resample) + if return_numpy: + return np.asarray(pil_img) + return pil_img + + if backend.lower() == "cv2": + if isinstance(interpolation, str): + interpolation = _cv2_interp_from_str[interpolation.lower()] + # compatible with opencv < version 4.4.0 + elif interpolation is None: + interpolation = cv2.INTER_LINEAR + self.resize_func = partial(_cv2_resize, resample=interpolation) + elif backend.lower() == "pil": + if isinstance(interpolation, str): + interpolation = _pil_interp_from_str[interpolation.lower()] + elif interpolation is None: + interpolation = Image.BILINEAR + self.resize_func = partial( + _pil_resize, resample=interpolation, return_numpy=return_numpy) + else: + logger.warning( + f"The backend of Resize only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." + ) + self.resize_func = cv2.resize + + def __call__(self, src, size): + if isinstance(size, list): + size = tuple(size) + return self.resize_func(src, size) + + +class RandomInterpolationAugment(object): + def __init__(self, prob): + self.prob = prob + + def _aug(self, img): + img_shape = img.shape + side_ratio = np.random.uniform(0.2, 1.0) + small_side = int(side_ratio * img_shape[0]) + interpolation = np.random.choice([ + cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, + cv2.INTER_CUBIC, cv2.INTER_LANCZOS4 + ]) + small_img = cv2.resize( + img, (small_side, small_side), interpolation=interpolation) + interpolation = np.random.choice([ + cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_AREA, + cv2.INTER_CUBIC, cv2.INTER_LANCZOS4 + ]) + aug_img = cv2.resize( + small_img, (img_shape[1], img_shape[0]), + interpolation=interpolation) + return aug_img + + def __call__(self, img): + if np.random.random() < self.prob: + if isinstance(img, np.ndarray): + return self._aug(img) + else: + pil_img = np.array(img) + aug_img = self._aug(pil_img) + img = Image.fromarray(aug_img.astype(np.uint8)) + return img + else: + return img + + +class OperatorParamError(ValueError): + """ OperatorParamError + """ + pass + + +class DecodeImage(object): + """ decode image """ + + def __init__(self, + to_np=True, + to_rgb=True, + channel_first=False, + backend="cv2"): + self.to_np = to_np # to numpy + self.to_rgb = to_rgb # only enabled when to_np is True + self.channel_first = channel_first # only enabled when to_np is True + + if backend.lower() not in ["cv2", "pil"]: + logger.warning( + f"The backend of DecodeImage only support \"cv2\" or \"PIL\". \"f{backend}\" is unavailable. Use \"cv2\" instead." + ) + backend = "cv2" + self.backend = backend.lower() + + if not to_np: + logger.warning( + f"\"to_rgb\" and \"channel_first\" are only enabled when to_np is True. \"to_np\" is now {to_np}." 
+ ) + + @format_data + def __call__(self, img): + if isinstance(img, Image.Image): + assert self.backend == "pil", "invalid input 'img' in DecodeImage" + elif isinstance(img, np.ndarray): + assert self.backend == "cv2", "invalid input 'img' in DecodeImage" + elif isinstance(img, bytes): + if self.backend == "pil": + data = io.BytesIO(img) + img = Image.open(data).convert("RGB") + else: + data = np.frombuffer(img, dtype="uint8") + img = cv2.imdecode(data, 1) + else: + raise ValueError("invalid input 'img' in DecodeImage") + + if self.to_np: + if self.backend == "pil": + assert img.mode == "RGB", f"invalid mode of image[{img.mode}]" + img = np.asarray(img)[:, :, ::-1] # BRG + + if self.to_rgb: + assert img.shape[ + 2] == 3, f"invalid shape of image[{img.shape}]" + img = img[:, :, ::-1] + + if self.channel_first: + img = img.transpose((2, 0, 1)) + return img + + +class ResizeImage(object): + """ resize image """ + + def __init__(self, + size=None, + resize_short=None, + interpolation=None, + backend="cv2", + return_numpy=True): + if resize_short is not None and resize_short > 0: + self.resize_short = resize_short + self.w = None + self.h = None + elif size is not None: + self.resize_short = None + self.w = size if type(size) is int else size[0] + self.h = size if type(size) is int else size[1] + else: + raise OperatorParamError("invalid params for ReisizeImage for '\ + 'both 'size' and 'resize_short' are None") + + self._resize_func = UnifiedResize( + interpolation=interpolation, + backend=backend, + return_numpy=return_numpy) + + @format_data + def __call__(self, img): + if isinstance(img, np.ndarray): + img_h, img_w = img.shape[:2] + else: + img_w, img_h = img.size + + if self.resize_short is not None: + percent = float(self.resize_short) / min(img_w, img_h) + w = int(round(img_w * percent)) + h = int(round(img_h * percent)) + else: + w = self.w + h = self.h + return self._resize_func(img, (w, h)) + + +class CropWithPadding(RandomResizedCrop): + """ + crop image and padding to original size + """ + + def __init__(self, + prob=1, + padding_num=0, + size=224, + scale=(0.08, 1.0), + ratio=(3. / 4, 4. 
/ 3), + interpolation='bilinear', + key=None): + super().__init__(size, scale, ratio, interpolation, key) + self.prob = prob + self.padding_num = padding_num + + def __call__(self, img): + if np.random.random() < self.prob: + # RandomResizedCrop augmentation + new = np.zeros_like(np.array(img)) + self.padding_num + # orig_W, orig_H = F._get_image_size(sample) + i, j, h, w = self._dynamic_get_param(img) + cropped = F.crop(img, i, j, h, w) + new[i:i + h, j:j + w, :] = np.array(cropped) + return new + else: + return img + + def _get_image_size(self, img): + if F._is_pil_image(img): + return img.size + elif F._is_numpy_image(img): + return img.shape[:2][::-1] + elif F._is_tensor_image(img): + return img.shape[1:][::-1] # chw + else: + raise TypeError("Unexpected type {}".format(type(img))) + + +class CropImage(object): + """ crop image """ + + def __init__(self, size): + if type(size) is int: + self.size = (size, size) + else: + self.size = size # (h, w) + + def __call__(self, img): + w, h = self.size + img_h, img_w = img.shape[:2] + w_start = (img_w - w) // 2 + h_start = (img_h - h) // 2 + + w_end = w_start + w + h_end = h_start + h + return img[h_start:h_end, w_start:w_end, :] + + +class CropImageAtRatio(object): + """ crop image with specified size and padding""" + + def __init__(self, size: int, pad: int, interpolation="bilinear"): + self.size = size + self.ratio = size / (size + pad) + self.interpolation = interpolation + + def __call__(self, img): + height, width = img.shape[:2] + crop_size = int(self.ratio * min(height, width)) + + y = (height - crop_size) // 2 + x = (width - crop_size) // 2 + + crop_img = img[y:y + crop_size, x:x + crop_size, :] + return F.resize(crop_img, [self.size, self.size], self.interpolation) + + +class Padv2(object): + def __init__(self, + size=None, + size_divisor=32, + pad_mode=0, + offsets=None, + fill_value=(127.5, 127.5, 127.5)): + """ + Pad image to a specified size or multiple of size_divisor. + Args: + size (int, list): image target size, if None, pad to multiple of size_divisor, default None + size_divisor (int): size divisor, default 32 + pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets + if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top + offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1 + fill_value (bool): rgb value of pad area, default (127.5, 127.5, 127.5) + """ + + if not isinstance(size, (int, list)): + raise TypeError( + "Type of target_size is invalid when random_size is True. 
\ + Must be List, now is {}".format(type(size))) + + if isinstance(size, int): + size = [size, size] + + assert pad_mode in [ + -1, 0, 1, 2 + ], 'currently only supports four modes [-1, 0, 1, 2]' + if pad_mode == -1: + assert offsets, 'if pad_mode is -1, offsets should not be None' + + self.size = size + self.size_divisor = size_divisor + self.pad_mode = pad_mode + self.fill_value = fill_value + self.offsets = offsets + + def apply_image(self, image, offsets, im_size, size): + x, y = offsets + im_h, im_w = im_size + h, w = size + canvas = np.ones((h, w, 3), dtype=np.float32) + canvas *= np.array(self.fill_value, dtype=np.float32) + canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32) + return canvas + + def __call__(self, img): + im_h, im_w = img.shape[:2] + if self.size: + w, h = self.size + assert ( + im_h <= h and im_w <= w + ), '(h, w) of target size should be greater than (im_h, im_w)' + else: + h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor) + w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor) + + if h == im_h and w == im_w: + return img.astype(np.float32) + + if self.pad_mode == -1: + offset_x, offset_y = self.offsets + elif self.pad_mode == 0: + offset_y, offset_x = 0, 0 + elif self.pad_mode == 1: + offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 + else: + offset_y, offset_x = h - im_h, w - im_w + + offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] + + return self.apply_image(img, offsets, im_size, size) + + +class RandomCropImage(object): + """Random crop image only + """ + + def __init__(self, size): + super(RandomCropImage, self).__init__() + if isinstance(size, int): + size = [size, size] + self.size = size + + def __call__(self, img): + + h, w = img.shape[:2] + tw, th = self.size + i = random.randint(0, h - th) + j = random.randint(0, w - tw) + + img = img[i:i + th, j:j + tw, :] + return img + + +class RandCropImage(object): + """ random crop image """ + + def __init__(self, + size, + progress_size=None, + scale=None, + ratio=None, + interpolation=None, + use_log_aspect=False, + backend="cv2"): + if type(size) is int: + self.size = (size, size) # (h, w) + else: + self.size = size + + self.progress_size = progress_size + self.scale = [0.08, 1.0] if scale is None else scale + self.ratio = [3. / 4., 4. / 3.] 
if ratio is None else ratio
+        self.use_log_aspect = use_log_aspect
+
+        self._resize_func = UnifiedResize(
+            interpolation=interpolation, backend=backend)
+
+    @format_data
+    def __call__(self, img):
+        size = self.size
+        scale = self.scale
+        ratio = self.ratio
+
+        if self.use_log_aspect:
+            log_ratio = list(map(math.log, ratio))
+            aspect_ratio = math.exp(random.uniform(*log_ratio))
+        else:
+            aspect_ratio = random.uniform(*ratio)
+
+        img_h, img_w = img.shape[:2]
+        bound = min((float(img_w) / img_h) / aspect_ratio,
+                    (float(img_h) / img_w) * aspect_ratio)
+        scale_max = min(scale[1], bound)
+        scale_min = min(scale[0], bound)
+
+        target_area = img_w * img_h * random.uniform(scale_min, scale_max)
+        w = int(math.sqrt(target_area * aspect_ratio))
+        h = int(math.sqrt(target_area / aspect_ratio))
+
+        i = random.randint(0, img_w - w)
+        j = random.randint(0, img_h - h)
+
+        img = self._resize_func(img[j:j + h, i:i + w, :], size)
+        return img
+
+
+class RandCropImageV2(object):
+    """ RandCropImageV2 differs from RandCropImage: it selects the crop position
+    uniformly at random and crops to the given size without a final resize."""
+
+    def __init__(self, size):
+        if type(size) is int:
+            self.size = (size, size)  # (h, w)
+        else:
+            self.size = size
+
+    def __call__(self, img):
+        if isinstance(img, np.ndarray):
+            img_h, img_w = img.shape[0], img.shape[1]
+        else:
+            img_w, img_h = img.size
+        tw, th = self.size
+
+        if img_h + 1 < th or img_w + 1 < tw:
+            raise ValueError(
+                "Required crop size {} is larger than input image size {}".
+                format((th, tw), (img_h, img_w)))
+
+        if img_w == tw and img_h == th:
+            return img
+
+        top = random.randint(0, img_h - th + 1)
+        left = random.randint(0, img_w - tw + 1)
+        if isinstance(img, np.ndarray):
+            return img[top:top + th, left:left + tw, :]
+        else:
+            return img.crop((left, top, left + tw, top + th))
+
+
+class RandFlipImage(object):
+    """ random flip image
+    flip_code:
+        1: Flipped Horizontally
+        0: Flipped Vertically
+        -1: Flipped Horizontally & Vertically
+    """
+
+    def __init__(self, flip_code=1):
+        assert flip_code in [-1, 0, 1
+                             ], "flip_code should be a value in [-1, 0, 1]"
+        self.flip_code = flip_code
+
+    @format_data
+    def __call__(self, img):
+        if random.randint(0, 1) == 1:
+            if isinstance(img, np.ndarray):
+                return cv2.flip(img, self.flip_code)
+            else:
+                if self.flip_code == 1:
+                    return img.transpose(Image.FLIP_LEFT_RIGHT)
+                elif self.flip_code == 0:
+                    return img.transpose(Image.FLIP_TOP_BOTTOM)
+                else:
+                    # flip both axes for flip_code == -1
+                    return img.transpose(Image.FLIP_LEFT_RIGHT).transpose(
+                        Image.FLIP_TOP_BOTTOM)
+        else:
+            return img
+
+
+class AutoAugment(object):
+    def __init__(self):
+        self.policy = ImageNetPolicy()
+
+    def __call__(self, img):
+        from PIL import Image
+        img = np.ascontiguousarray(img)
+        img = Image.fromarray(img)
+        img = self.policy(img)
+        img = np.asarray(img)
+        return img
+
+
+class NormalizeImage(object):
+    """ normalize image such as subtract mean, divide std
+    """
+
+    def __init__(self,
+                 scale=None,
+                 mean=None,
+                 std=None,
+                 order='chw',
+                 output_fp16=False,
+                 channel_num=3):
+        if isinstance(scale, str):
+            scale = eval(scale)
+        assert channel_num in [
+            3, 4
+        ], "channel number of input image should be set to 3 or 4."
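+        # channel_num == 4 makes __call__ append one all-zero channel after
+        # normalization so the output matches 4-channel input layouts; with the
+        # default of 3 the image keeps its original channel count.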
+ self.channel_num = channel_num + self.output_dtype = 'float16' if output_fp16 else 'float32' + self.scale = np.float32(scale if scale is not None else 1.0 / 255.0) + self.order = order + mean = mean if mean is not None else [0.485, 0.456, 0.406] + std = std if std is not None else [0.229, 0.224, 0.225] + + shape = (3, 1, 1) if self.order == 'chw' else (1, 1, 3) + self.mean = np.array(mean).reshape(shape).astype('float32') + self.std = np.array(std).reshape(shape).astype('float32') + + @format_data + def __call__(self, img): + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + assert isinstance(img, + np.ndarray), "invalid input 'img' in NormalizeImage" + + img = (img.astype('float32') * self.scale - self.mean) / self.std + + if self.channel_num == 4: + img_h = img.shape[1] if self.order == 'chw' else img.shape[0] + img_w = img.shape[2] if self.order == 'chw' else img.shape[1] + pad_zeros = np.zeros( + (1, img_h, img_w)) if self.order == 'chw' else np.zeros( + (img_h, img_w, 1)) + img = (np.concatenate( + (img, pad_zeros), axis=0) + if self.order == 'chw' else np.concatenate( + (img, pad_zeros), axis=2)) + + img = img.astype(self.output_dtype) + return img + + +class ToCHWImage(object): + """ convert hwc image to chw image + """ + + def __init__(self): + pass + + def __call__(self, img): + from PIL import Image + if isinstance(img, Image.Image): + img = np.array(img) + + return img.transpose((2, 0, 1)) + + +class AugMix(object): + """ Perform AugMix augmentation and compute mixture. + """ + + def __init__(self, + prob=0.5, + aug_prob_coeff=0.1, + mixture_width=3, + mixture_depth=1, + aug_severity=1): + """ + Args: + prob: Probability of taking augmix + aug_prob_coeff: Probability distribution coefficients. + mixture_width: Number of augmentation chains to mix per augmented example. + mixture_depth: Depth of augmentation chains. -1 denotes stochastic depth in [1, 3]' + aug_severity: Severity of underlying augmentation operators (between 1 to 10). + """ + # fmt: off + self.prob = prob + self.aug_prob_coeff = aug_prob_coeff + self.mixture_width = mixture_width + self.mixture_depth = mixture_depth + self.aug_severity = aug_severity + self.augmentations = augmentations + # fmt: on + + def __call__(self, image): + """Perform AugMix augmentations and compute mixture. + Returns: + mixed: Augmented and mixed image. + """ + if random.random() > self.prob: + # Avoid the warning: the given NumPy array is not writeable + return np.asarray(image).copy() + + ws = np.float32( + np.random.dirichlet([self.aug_prob_coeff] * self.mixture_width)) + m = np.float32( + np.random.beta(self.aug_prob_coeff, self.aug_prob_coeff)) + + # image = Image.fromarray(image) + mix = np.zeros(image.shape) + for i in range(self.mixture_width): + image_aug = image.copy() + image_aug = Image.fromarray(image_aug) + depth = self.mixture_depth if self.mixture_depth > 0 else np.random.randint( + 1, 4) + for _ in range(depth): + op = np.random.choice(self.augmentations) + image_aug = op(image_aug, self.aug_severity) + mix += ws[i] * np.asarray(image_aug) + + mixed = (1 - m) * image + m * mix + return mixed.astype(np.uint8) + + +class ColorJitter(RawColorJitter): + """ColorJitter. 
+ """ + + def __init__(self, prob=2, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prob = prob + + def __call__(self, img): + if np.random.random() < self.prob: + if not isinstance(img, Image.Image): + img = np.ascontiguousarray(img) + img = Image.fromarray(img) + img = super()._apply_image(img) + if isinstance(img, Image.Image): + img = np.asarray(img) + return img + + +class RandomRotation(RawRandomRotation): + """RandomRotation. + """ + + def __init__(self, prob=0.5, *args, **kwargs): + super().__init__(*args, **kwargs) + self.prob = prob + + def __call__(self, img): + if np.random.random() < self.prob: + img = super()._apply_image(img) + return img + + +class Pad(object): + """ + Pads the given PIL.Image on all sides with specified padding mode and fill value. + adapted from: https://pytorch.org/vision/stable/_modules/torchvision/transforms/transforms.html#Pad + """ + + def __init__(self, + padding: int, + fill: int=0, + padding_mode: str="constant", + backend: str="pil"): + self.padding = padding + self.fill = fill + self.padding_mode = padding_mode + self.backend = backend + assert backend in [ + "pil", "cv2" + ], f"backend must in ['pil', 'cv2'], but got {backend}" + + def _parse_fill(self, fill, img, min_pil_version, name="fillcolor"): + # Process fill color for affine transforms + major_found, minor_found = (int(v) + for v in PILLOW_VERSION.split('.')[:2]) + major_required, minor_required = ( + int(v) for v in min_pil_version.split('.')[:2]) + if major_found < major_required or (major_found == major_required and + minor_found < minor_required): + if fill is None: + return {} + else: + msg = ( + "The option to fill background area of the transformed image, " + "requires pillow>={}") + raise RuntimeError(msg.format(min_pil_version)) + + num_bands = len(img.getbands()) + if fill is None: + fill = 0 + if isinstance(fill, (int, float)) and num_bands > 1: + fill = tuple([fill] * num_bands) + if isinstance(fill, (list, tuple)): + if len(fill) != num_bands: + msg = ( + "The number of elements in 'fill' does not match the number of " + "bands of the image ({} != {})") + raise ValueError(msg.format(len(fill), num_bands)) + + fill = tuple(fill) + + return {name: fill} + + def __call__(self, img): + if self.backend == "pil": + opts = self._parse_fill(self.fill, img, "2.3.0", name="fill") + if img.mode == "P": + palette = img.getpalette() + img = ImageOps.expand(img, border=self.padding, **opts) + img.putpalette(palette) + return img + return ImageOps.expand(img, border=self.padding, **opts) + else: + img = cv2.copyMakeBorder( + img, + self.padding, + self.padding, + self.padding, + self.padding, + cv2.BORDER_CONSTANT, + value=(self.fill, self.fill, self.fill)) + return img + + +# TODO(gaotingquan): integrate into RandomRotation +class RandomRot90(object): + """RandomRot90 + """ + + def __init__(self): + pass + + @format_data + def __call__(self, img): + orientation = random.choice([0, 1, 2, 3]) + if orientation: + img = np.rot90(img, orientation) + return {"img": img, "random_rot90_orientation": orientation} + + +class BlurImage(object): + """BlurImage + """ + + def __init__(self, + ratio=0.5, + motion_max_ksize=12, + motion_max_angle=45, + gaussian_max_ksize=12): + self.ratio = ratio + self.motion_max_ksize = motion_max_ksize + self.motion_max_angle = motion_max_angle + self.gaussian_max_ksize = gaussian_max_ksize + + def _gaussian_blur(self, img, max_ksize=12): + ksize = (np.random.choice(np.arange(5, max_ksize, 2)), + np.random.choice(np.arange(5, max_ksize, 2))) + img = 
cv2.GaussianBlur(img, ksize, 0) + return img + + def _motion_blur(self, img, max_ksize=12, max_angle=45): + degree = np.random.choice(np.arange(5, max_ksize, 2)) + angle = np.random.choice(np.arange(-1 * max_angle, max_angle)) + + M = cv2.getRotationMatrix2D((degree / 2, degree / 2), angle, 1) + motion_blur_kernel = np.diag(np.ones(degree)) + motion_blur_kernel = cv2.warpAffine(motion_blur_kernel, M, + (degree, degree)) + + motion_blur_kernel = motion_blur_kernel / degree + blurred = cv2.filter2D(img, -1, motion_blur_kernel) + + cv2.normalize(blurred, blurred, 0, 255, cv2.NORM_MINMAX) + img = np.array(blurred, dtype=np.uint8) + return img + + @format_data + def __call__(self, img): + if random.random() > self.ratio: + label = 0 + else: + method = random.choice(["gaussian", "motion"]) + if method == "gaussian": + img = self._gaussian_blur(img, self.gaussian_max_ksize) + else: + img = self._motion_blur(img, self.motion_max_ksize, + self.motion_max_angle) + label = 1 + return {"img": img, "blur_image": label} + + +class RandomGrayscale(object): + """Randomly convert image to grayscale with a probability of p (default 0.1). + + Args: + p (float): probability that image should be converted to grayscale. + + Returns: + PIL Image: Grayscale version of the input image with probability p and unchanged + with probability (1-p). + - If input image is 1 channel: grayscale version is 1 channel + - If input image is 3 channel: grayscale version is 3 channel with r == g == b + + """ + + def __init__(self, p=0.1): + self.p = p + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be converted to grayscale. + + Returns: + PIL Image: Randomly grayscaled image. + """ + num_output_channels = 1 if img.mode == 'L' else 3 + if random.random() < self.p: + return F.to_grayscale(img, num_output_channels=num_output_channels) + return img + + def __repr__(self): + return self.__class__.__name__ + '()' + + +class PCALighting(object): + """ + Lighting noise(AlexNet - style PCA - based noise) + reference: https://github.com/DingXiaoH/DiverseBranchBlock + """ + + def __init__(self): + self.alphastd = 0.1 + self.eigval = [0.2175, 0.0188, 0.0045] + self.eigvec = [ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203], + ] + self.eigval = np.array(self.eigval).astype(np.float32) + self.eigvec = np.array(self.eigvec).astype(np.float32) + + def __call__(self, img): + if self.alphastd == 0: + return img + + img = img.transpose((2, 0, 1)) + alpha = np.random.normal(0, self.alphastd, size=(3)).astype(np.float32) + rgb = self.eigvec * np.broadcast_to(alpha.reshape(1, 3), ( + 3, 3)) * np.broadcast_to(self.eigval.reshape(1, 3), (3, 3)) + rgb = rgb.sum(1).squeeze() + img = img + rgb.reshape(3, 1, 1) + return img.transpose((1, 2, 0)) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/randaugment.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/randaugment.py new file mode 100644 index 000000000..966f1412a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/randaugment.py @@ -0,0 +1,477 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is based on https://github.com/heartInsert/randaugment +# reference: https://arxiv.org/abs/1909.13719 + +import random +from .operators import RawColorJitter +from .timm_autoaugment import _pil_interp +from paddle.vision.transforms import transforms as T + +import numpy as np +from PIL import Image, ImageEnhance, ImageOps + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def cutout(image, pad_size, replace=0): + image_np = np.array(image) + image_height, image_width, _ = image_np.shape + + # Sample the center location in the image where the zero mask will be applied. + cutout_center_height = np.random.randint(0, image_height + 1) + cutout_center_width = np.random.randint(0, image_width + 1) + + lower_pad = np.maximum(0, cutout_center_height - pad_size) + upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) + left_pad = np.maximum(0, cutout_center_width - pad_size) + right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) + + cutout_shape = [ + image_height - (lower_pad + upper_pad), + image_width - (left_pad + right_pad) + ] + padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] + mask = np.pad(np.zeros( + cutout_shape, dtype=image_np.dtype), + padding_dims, + constant_values=1) + mask = np.expand_dims(mask, -1) + mask = np.tile(mask, [1, 1, 3]) + image_np = np.where( + np.equal(mask, 0), + np.full_like( + image_np, fill_value=replace, dtype=image_np.dtype), + image_np) + return Image.fromarray(image_np) + + +class RandAugment(object): + def __init__(self, num_layers=2, magnitude=5, fillcolor=(128, 128, 128)): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331 * abso_level, + "translateY": 150.0 / 331 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": int(4.0 * abso_level), + "solarize": 256.0 * abso_level, + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.BICUBIC, + 
fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "autocontrast": lambda img, _: + ImageOps.autocontrast(img), + "equalize": lambda img, _: ImageOps.equalize(img), + "invert": lambda img, _: ImageOps.invert(img) + } + + def __call__(self, img): + avaiable_op_names = list(self.level_map.keys()) + for layer_num in range(self.num_layers): + op_name = np.random.choice(avaiable_op_names) + img = self.func[op_name](img, self.level_map[op_name]) + return img + + +class RandomApply(object): + def __init__(self, p, transforms): + self.p = p + ts = [] + for t in transforms: + for key in t.keys(): + ts.append(eval(key)(**t[key])) + + self.trans = T.Compose(ts) + + def __call__(self, img): + if self.p < np.random.rand(1): + return img + timg = self.trans(img) + return timg + + +## RandAugment_EfficientNetV2 code below ## +class RandAugmentV2(RandAugment): + """Customed RandAugment for EfficientNetV2""" + + def __init__(self, + num_layers=2, + magnitude=5, + progress_magnitude=None, + fillcolor=(128, 128, 128)): + super().__init__(num_layers, magnitude, fillcolor) + self.progress_magnitude = progress_magnitude + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 100.0 * abso_level, + "translateY": 100.0 * abso_level, + "rotate": 30 * abso_level, + "color": 1.8 * abso_level + 0.1, + "posterize": int(4.0 * abso_level), + "solarize": int(256.0 * abso_level), + "solarize_add": int(110.0 * abso_level), + "contrast": 1.8 * abso_level + 0.1, + "sharpness": 1.8 * abso_level + 0.1, + "brightness": 1.8 * abso_level + 0.1, + "autocontrast": 0, + "equalize": 0, + "invert": 0, + "cutout": int(40 * abso_level) + } + + # from https://stackoverflow.com/questions/5252170/ + # specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128, ) * 4), + rot).convert(img.mode) + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + Image.NEAREST, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + Image.NEAREST, + fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * rnd_ch_op([-1, 1]), 0, 1, 0), + 
Image.NEAREST, + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * rnd_ch_op([-1, 1])), + Image.NEAREST, + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude * rnd_ch_op([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(magnitude), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "solarize_add": lambda img, magnitude: + solarize_add(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance(magnitude), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance(magnitude), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance(magnitude), + "autocontrast": lambda img, _: + ImageOps.autocontrast(img), + "equalize": lambda img, _: ImageOps.equalize(img), + "invert": lambda img, _: ImageOps.invert(img), + "cutout": lambda img, magnitude: cutout(img, magnitude, replace=fillcolor[0]) + } + + +class RandAugmentV3(RandAugment): + """Customed RandAugment for MobileViTV2""" + + def __init__(self, + num_layers=2, + magnitude=3, + fillcolor=(0, 0, 0), + interpolation="bicubic"): + self.num_layers = num_layers + self.magnitude = magnitude + self.max_level = 10 + interpolation = _pil_interp(interpolation) + + abso_level = self.magnitude / self.max_level + self.level_map = { + "shearX": 0.3 * abso_level, + "shearY": 0.3 * abso_level, + "translateX": 150.0 / 331.0 * abso_level, + "translateY": 150.0 / 331.0 * abso_level, + "rotate": 30 * abso_level, + "color": 0.9 * abso_level, + "posterize": 8 - int(4.0 * abso_level), + "solarize": 255.0 * (1 - abso_level), + "contrast": 0.9 * abso_level, + "sharpness": 0.9 * abso_level, + "brightness": 0.9 * abso_level, + "autocontrast": 0, + "equalize": 0, + "invert": 0 + } + + rnd_ch_op = random.choice + + self.func = { + "shearX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, magnitude * rnd_ch_op([-1, 1]), 0, 0, 1, 0), + interpolation, + fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, magnitude * rnd_ch_op([-1, 1]), 1, 0), + interpolation, + fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, magnitude * img.size[0] * rnd_ch_op([-1, 1]), 0, 1, 0), + interpolation, + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, + Image.AFFINE, + (1, 0, 0, 0, 1, magnitude * img.size[1] * rnd_ch_op([-1, 1])), + interpolation, + fillcolor=fillcolor), + "rotate": lambda img, magnitude: img.rotate( + magnitude * rnd_ch_op([-1, 1]), + interpolation, + fillcolor=fillcolor), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "posterize": lambda img, magnitude: + ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: + ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: + ImageEnhance.Contrast(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "sharpness": lambda img, magnitude: + ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "brightness": lambda img, magnitude: + ImageEnhance.Brightness(img).enhance( + 1 + magnitude * rnd_ch_op([-1, 1])), + "autocontrast": lambda img, _: + ImageOps.autocontrast(img), + "equalize": lambda img, _: ImageOps.equalize(img), + 
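+            # ops whose lambda takes "_" as its second argument ignore the
+            # magnitude value computed in self.level_map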
"invert": lambda img, _: ImageOps.invert(img) + } + + +class SubPolicyV2(object): + """Custom SubPolicy for ML-Decoder""" + + def __init__(self, + p1, + operation1, + magnitude_idx1, + p2, + operation2, + magnitude_idx2, + fillcolor=(128, 128, 128)): + ranges = { + "shearX": np.linspace(0, 0.3, 10), + "shearY": np.linspace(0, 0.3, 10), + "translateX": np.linspace(0, 150 / 331, 10), + "translateY": np.linspace(0, 150 / 331, 10), + "rotate": np.linspace(0, 30, 10), + "color": np.linspace(0.0, 0.9, 10), + "posterize": np.round(np.linspace(8, 4, 10), 0).astype(np.int_), + "solarize": np.linspace(256, 0, 10), + "contrast": np.linspace(0.0, 0.9, 10), + "sharpness": np.linspace(0.0, 0.9, 10), + "brightness": np.linspace(0.0, 0.9, 10), + "autocontrast": [0] * 10, + "equalize": [0] * 10, + "invert": [0] * 10, + "cutout": np.round(np.linspace(0, 20, 10), 0).astype(np.int_), + } + + # from https://stackoverflow.com/questions/5252170/specify-image-filling-color-when-rotating-in-python-with-pil-and-setting-expand + def rotate_with_fill(img, magnitude): + rot = img.convert("RGBA").rotate(magnitude) + return Image.composite(rot, + Image.new("RGBA", rot.size, (128,) * 4), + rot).convert(img.mode) + + func = { + "shearX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, magnitude * random.choice([-1, 1]), 0, 0, 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "shearY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, magnitude * random.choice([-1, 1]), 1, 0), + Image.BICUBIC, fillcolor=fillcolor), + "translateX": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, magnitude * img.size[0] * random.choice([-1, 1]), 0, 1, 0), + fillcolor=fillcolor), + "translateY": lambda img, magnitude: img.transform( + img.size, Image.AFFINE, (1, 0, 0, 0, 1, magnitude * img.size[1] * random.choice([-1, 1])), + fillcolor=fillcolor), + "rotate": lambda img, magnitude: rotate_with_fill(img, magnitude), + # "rotate": lambda img, magnitude: img.rotate(magnitude * random.choice([-1, 1])), + "color": lambda img, magnitude: ImageEnhance.Color(img).enhance(1 + magnitude * random.choice([-1, 1])), + "posterize": lambda img, magnitude: ImageOps.posterize(img, magnitude), + "solarize": lambda img, magnitude: ImageOps.solarize(img, magnitude), + "contrast": lambda img, magnitude: ImageEnhance.Contrast(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "sharpness": lambda img, magnitude: ImageEnhance.Sharpness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "brightness": lambda img, magnitude: ImageEnhance.Brightness(img).enhance( + 1 + magnitude * random.choice([-1, 1])), + "autocontrast": lambda img, magnitude: ImageOps.autocontrast(img), + "equalize": lambda img, magnitude: ImageOps.equalize(img), + "invert": lambda img, magnitude: ImageOps.invert(img), + "cutout": lambda img, magnitude: cutout(img, magnitude), + } + + self.p1 = p1 + self.operation1 = func[operation1] + self.magnitude1 = ranges[operation1][magnitude_idx1] + self.p2 = p2 + self.operation2 = func[operation2] + self.magnitude2 = ranges[operation2][magnitude_idx2] + + def __call__(self, img): + if random.random() < self.p1: + img = self.operation1(img, self.magnitude1) + if random.random() < self.p2: + img = self.operation2(img, self.magnitude2) + return img + + +class RandAugmentV4(object): + """Custom RandAugment for ML-Decoder""" + + def __init__(self) -> None: + super().__init__() + self._policies = self.get_rand_policies() + + @classmethod + def get_trans_list(cls): + trans_list = [ + 
"shearX", + "shearY", + "translateX", + "translateY", + "rotate", + "color", + "posterize", + "solarize", + "contrast", + "sharpness", + "brightness", + "autocontrast", + "equalize", + "invert", + "cutout", + ] + return trans_list + + @classmethod + def get_rand_policies(cls): + op_list = [] + for trans in cls.get_trans_list(): + for magnitude in range(1, 10): + op_list += [(0.5, trans, magnitude)] + policies = [] + for op_1 in op_list: + for op_2 in op_list: + policies += [[op_1, op_2]] + return policies + + def __call__(self, img): + randomly_chosen_policy = self._policies[ + random.randint(0, len(self._policies) - 1)] + policy = SubPolicyV2(*randomly_chosen_policy[0], + *randomly_chosen_policy[1]) + img = policy(img) + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/random_erasing.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/random_erasing.py new file mode 100644 index 000000000..e687283c7 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/random_erasing.py @@ -0,0 +1,113 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is adapted from https://github.com/zhunzhong07/Random-Erasing, and refer to Timm(https://github.com/rwightman/pytorch-image-models). +# reference: https://arxiv.org/abs/1708.04896 + +from functools import partial + +import math +import random + +import numpy as np + +from .operators import format_data + + +class Pixels(object): + def __init__(self, mode="const", mean=[0., 0., 0.]): + self._mode = mode + self._mean = np.array(mean) + + def __call__(self, h=224, w=224, c=3, channel_first=False): + if self._mode == "rand": + return np.random.normal(size=( + 1, 1, 3)) if not channel_first else np.random.normal(size=( + 3, 1, 1)) + elif self._mode == "pixel": + return np.random.normal(size=( + h, w, c)) if not channel_first else np.random.normal(size=( + c, h, w)) + elif self._mode == "const": + return np.reshape(self._mean, ( + 1, 1, c)) if not channel_first else np.reshape(self._mean, + (c, 1, 1)) + else: + raise Exception( + "Invalid mode in RandomErasing, only support \"const\", \"rand\", \"pixel\"" + ) + + +class RandomErasing(object): + """RandomErasing. 
+ """ + + def __init__(self, + EPSILON=0.5, + sl=0.02, + sh=0.4, + r1=0.3, + mean=[0., 0., 0.], + attempt=100, + use_log_aspect=False, + mode='const'): + self.EPSILON = eval(EPSILON) if isinstance(EPSILON, str) else EPSILON + self.sl = eval(sl) if isinstance(sl, str) else sl + self.sh = eval(sh) if isinstance(sh, str) else sh + r1 = eval(r1) if isinstance(r1, str) else r1 + self.r1 = (math.log(r1), math.log(1 / r1)) if use_log_aspect else ( + r1, 1 / r1) + self.use_log_aspect = use_log_aspect + self.attempt = attempt + self.get_pixels = Pixels(mode, mean) + + @format_data + def __call__(self, img): + if random.random() > self.EPSILON: + return img + + for _ in range(self.attempt): + if isinstance(img, np.ndarray): + img_h, img_w, img_c = img.shape + channel_first = False + else: + img_c, img_h, img_w = img.shape + channel_first = True + area = img_h * img_w + + target_area = random.uniform(self.sl, self.sh) * area + aspect_ratio = random.uniform(*self.r1) + if self.use_log_aspect: + aspect_ratio = math.exp(aspect_ratio) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < img_w and h < img_h: + pixels = self.get_pixels(h, w, img_c, channel_first) + x1 = random.randint(0, img_h - h) + y1 = random.randint(0, img_w - w) + if img_c == 3: + if channel_first: + img[:, x1:x1 + h, y1:y1 + w] = pixels + else: + img[x1:x1 + h, y1:y1 + w, :] = pixels + else: + if channel_first: + img[0, x1:x1 + h, y1:y1 + w] = pixels[0] + else: + img[x1:x1 + h, y1:y1 + w, 0] = pixels[:, :, 0] + return img + + return img diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/timm_autoaugment.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/timm_autoaugment.py new file mode 100644 index 000000000..30f1f505a --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/preprocess/ops/timm_autoaugment.py @@ -0,0 +1,878 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This code is heavily based on https://github.com/rwightman/pytorch-image-models +# reference: https://arxiv.org/abs/1805.09501 + +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, ) + +_RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + + +def _pil_interp(method): + if method == 'bicubic': + return Image.BICUBIC + elif method == 'lanczos': + return Image.LANCZOS + elif method == 'hamming': + return Image.HAMMING + else: + # default bilinear, do we want to allow nearest? 
+ return Image.BILINEAR + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', Image.BILINEAR) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), + **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), + **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), + **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), + **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform(-rotn_center[0] - post_trans[0], + -rotn_center[1] - post_trans[1], + matrix) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == "RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _MAX_LEVEL) * 30. 
+ level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _MAX_LEVEL) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend + # range [0.1, 1.9] + level = (level / _MAX_LEVEL) * .9 + level = 1.0 + _randomly_negate(level) + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _MAX_LEVEL) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _MAX_LEVEL) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _MAX_LEVEL) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _MAX_LEVEL) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _MAX_LEVEL) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 
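+    # the *Increasing variants reuse the same op implementations; only their
+    # level-to-arg mappings in LEVEL_TO_ARG differ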
'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} + + +class AugmentOp(object): + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] + if 'interpolation' in hparams else _RANDOM_INTERPOLATION, ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. + self.magnitude_std = self.hparams.get('magnitude_std', 0) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std and self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + magnitude = min(_MAX_LEVEL, max(0, magnitude)) # clip to valid range + level_args = self.level_fn( + magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. 
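+    # Each sub-policy is a pair of (op_name, probability, magnitude) tuples;
+    # AutoAugment below picks one sub-policy per image and applies its ops in
+    # order, each with its own probability. Illustrative usage only:
+    #   aa = AutoAugment(auto_augment_policy_v0(_HPARAMS_DEFAULT))
+    #   out = aa(pil_img)  # pil_img: an RGB PIL.Image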
+ policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10) + ], # This results in black image with Tpu posterize + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], 
+ [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name='v0', hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == 'original': + return auto_augment_policy_original(hparams) + elif name == 'originalr': + return auto_augment_policy_originalr(hparams) + elif name == 'v0': + return auto_augment_policy_v0(hparams) + elif name == 'v0r': + return auto_augment_policy_v0r(hparams) + else: + assert False, 'Unknown AA policy (%s)' % name + + +class AutoAugment(object): + def __init__(self, policy): + self.policy = policy + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + + :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). 
+ The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + + :return: A callable Transform Op + """ + config = config_str.split('-') + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + else: + assert False, 'Unknown AutoAugment config section' + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. +_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [ + AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) + for name in transforms + ] + + +class RandAugment(object): + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, + self.num_layers, + replace=self.choice_weights is None, + p=self.choice_weights) + for op in ops: + img = op(img) + return img + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). 
The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A callable Transform Op + """ + magnitude = _MAX_LEVEL # default to _MAX_LEVEL for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops( + magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights( + weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + + +_AUGMIX_TRANSFORMS = [ + 'AutoContrast', + 'ColorIncreasing', # not in paper + 'ContrastIncreasing', # not in paper + 'BrightnessIncreasing', # not in paper + 'SharpnessIncreasing', # not in paper + 'Equalize', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', +] + + +def augmix_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _AUGMIX_TRANSFORMS + return [ + AugmentOp( + name, prob=1.0, magnitude=magnitude, hparams=hparams) + for name in transforms + ] + + +class AugMixAugment(object): + """ AugMix Transform + Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py + From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - + https://arxiv.org/abs/1912.02781 + """ + + def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False): + self.ops = ops + self.alpha = alpha + self.width = width + self.depth = depth + self.blended = blended # blended mode is faster but not well tested + + def _calc_blended_weights(self, ws, m): + ws = ws * m + cump = 1. + rws = [] + for w in ws[::-1]: + alpha = w / cump + cump *= (1 - alpha) + rws.append(alpha) + return np.array(rws[::-1], dtype=np.float32) + + def _apply_blended(self, img, mixing_weights, m): + # This is my first crack and implementing a slightly faster mixed augmentation. Instead + # of accumulating the mix for each chain in a Numpy array and then blending with original, + # it recomputes the blending coefficients and applies one PIL image blend per chain. + # TODO the results appear in the right ballpark but they differ by more than rounding. 
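+        # _calc_blended_weights converts the Dirichlet mixture weights (scaled
+        # by m) into per-chain alphas so that the chain of Image.blend calls
+        # below approximates the weighted sum computed in _apply_basic.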
+ img_orig = img.copy() + ws = self._calc_blended_weights(mixing_weights, m) + for w in ws: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img_orig # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + img = Image.blend(img, img_aug, w) + return img + + def _apply_basic(self, img, mixing_weights, m): + # This is a literal adaptation of the paper/official implementation without normalizations and + # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the + # typical augmentation transforms, could use a GPU / Kornia implementation. + img_shape = img.size[0], img.size[1], len(img.getbands()) + mixed = np.zeros(img_shape, dtype=np.float32) + for mw in mixing_weights: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + mixed += mw * np.asarray(img_aug, dtype=np.float32) + np.clip(mixed, 0, 255., out=mixed) + mixed = Image.fromarray(mixed.astype(np.uint8)) + return Image.blend(img, mixed, m) + + def __call__(self, img): + mixing_weights = np.float32( + np.random.dirichlet([self.alpha] * self.width)) + m = np.float32(np.random.beta(self.alpha, self.alpha)) + if self.blended: + mixed = self._apply_blended(img, mixing_weights, m) + else: + mixed = self._apply_basic(img, mixing_weights, m) + return mixed + + +def augment_and_mix_transform(config_str, hparams): + """ Create AugMix transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining + sections, not order sepecific determine + 'm' - integer magnitude (severity) of augmentation mix (default: 3) + 'w' - integer width of augmentation chain (default: 3) + 'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1) + 'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0) + 'mstd' - float std deviation of magnitude noise applied (default: 0) + Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2 + + :param hparams: Other hparams (kwargs) for the Augmentation transforms + + :return: A callable Transform Op + """ + magnitude = 3 + width = 3 + depth = -1 + alpha = 1. 
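+    # defaults used when the corresponding key is absent from config_str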
+ blended = False + config = config_str.split('-') + assert config[0] == 'augmix' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'm': + magnitude = int(val) + elif key == 'w': + width = int(val) + elif key == 'd': + depth = int(val) + elif key == 'a': + alpha = float(val) + elif key == 'b': + blended = bool(val) + else: + assert False, 'Unknown AugMix config section' + ops = augmix_ops(magnitude=magnitude, hparams=hparams) + return AugMixAugment( + ops, alpha=alpha, width=width, depth=depth, blended=blended) + + +class RawTimmAutoAugment(object): + """TimmAutoAugment API for PaddleClas.""" + + def __init__(self, + config_str="rand-m9-mstd0.5-inc1", + interpolation="bicubic", + img_size=224, + mean=IMAGENET_DEFAULT_MEAN): + if isinstance(img_size, (tuple, list)): + img_size_min = min(img_size) + else: + img_size_min = img_size + + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = _pil_interp(interpolation) + if config_str.startswith('rand'): + self.augment_func = rand_augment_transform(config_str, aa_params) + elif config_str.startswith('augmix'): + aa_params['translate_pct'] = 0.3 + self.augment_func = augment_and_mix_transform(config_str, + aa_params) + elif config_str.startswith('auto'): + self.augment_func = auto_augment_transform(config_str, aa_params) + else: + raise Exception( + "ConfigError: The TimmAutoAugment Op only support RandAugment, AutoAugment, AugMix, and the config_str only starts with \"rand\", \"augmix\", \"auto\"." + ) + + def __call__(self, img): + return self.augment_func(img) diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/__init__.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/__init__.py new file mode 100644 index 000000000..61d5aa213 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. \ No newline at end of file diff --git a/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/get_image_list.py b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/get_image_list.py new file mode 100644 index 000000000..9b6de0690 --- /dev/null +++ b/cv/classification/resnet50/paddlepaddle/ppcls_2.6/data/utils/get_image_list.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import base64 +import numpy as np + + +def get_image_list(img_file, infer_list=None): + imgs_lists = [] + if infer_list and not os.path.exists(infer_list): + raise Exception("not found infer list {}".format(infer_list)) + if infer_list: + with open(infer_list, "r") as f: + lines = f.readlines() + for line in lines: + image_path = line.strip(" ").split()[0] + image_path = os.path.join(img_file, image_path) + imgs_lists.append(image_path) + else: + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + img_end = ['jpg', 'png', 'jpeg', 'JPEG', 'JPG', 'bmp'] + if os.path.isfile(img_file) and img_file.split('.')[-1] in img_end: + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for root, dirs, files in os.walk(img_file): + for single_file in files: + if single_file.split('.')[-1] in img_end: + imgs_lists.append(os.path.join(root, single_file)) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +def get_image_list_from_label_file(image_path, label_file_path): + imgs_lists = [] + gt_labels = [] + with open(label_file_path, "r") as fin: + lines = fin.readlines() + for line in lines: + image_name, label = line.strip("\n").split() + label = int(label) + imgs_lists.append(os.path.join(image_path, image_name)) + gt_labels.append(int(label)) + return imgs_lists, gt_labels diff --git a/cv/detection/ssd/pytorch/README.md b/cv/detection/ssd/pytorch/README.md index ab0f5eba9..9f5d3037c 100644 --- a/cv/detection/ssd/pytorch/README.md +++ b/cv/detection/ssd/pytorch/README.md @@ -11,7 +11,7 @@ objects at different resolutions, offering a good balance between speed and accu | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | -| BI-V100 | 2.2.0 | 22.09 | +| BI-V150 | 4.3.0 | 25.12 | ## Model Preparation @@ -24,7 +24,9 @@ Take coco2017 dataset as an example, specify `/path/to/coco2017` to your COCO pa unzipped dataset path structure sholud look like: ```bash -coco2017 +mkdir -p data/datasets/ + +data/datasets/coco2017 ├── annotations │   ├── instances_train2017.json │   ├── instances_val2017.json @@ -42,35 +44,35 @@ coco2017 └── ... ``` -```bash -mkdir -p /home/data/perf/ssd -cd /home/data/perf/ssd -ln -s /path/to/coco/ /home/data/perf/ssd -``` - -Download backbone. +### Install Dependencies +Contact the Iluvatar administrator to get the missing packages: + - dali-1.21.0+corex.4.3.0-cp310-cp310-linux_x86_64.whl + - apex-0.1+corex.4.3.0-cp310-cp310-linux_x86_64.whl ```bash -cd /home/data/perf/ssd -wget https://download.pytorch.org/models/resnet34-333f7ec4.pth +apt install -y git numactl +pip3 install "git+https://github.com/mlperf/logging.git@1.0-branch" pybind11==2.9.2 ujson==1.35 +pip3 install wheel numpy>=1.26.4 cython pycocotools==2.0.8 + +bash ./clean_ssd.sh && bash ./build_ssd.sh && bash ./install_ssd.sh "$@" +export DATA_PATH_BBOX=../../../.. 
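+# DATA_PATH is the COCO root prepared above; the prepare-json.py calls below
+# write bbox-only annotation JSONs under DATA_PATH_BBOX.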
+export DATA_PATH=data/datasets/coco2017 +python3 prepare-json.py --keep-keys ${DATA_PATH}/annotations/instances_val2017.json ${DATA_PATH_BBOX}/bbox_only_instances_val2017.json "$@" +python3 prepare-json.py ${DATA_PATH}/annotations/instances_train2017.json ${DATA_PATH_BBOX}/bbox_only_instances_train2017.json "$@" ``` ## Model Training ```bash -# Multiple GPUs on one machine -cd {deepsparkhub_root_path}/cv/detection/ssd/pytorch/base -source ../iluvatar/config/environment_variables.sh -python3 prepare.py --name iluvatar --data_dir /home/data/perf/ssd -bash run_training.sh --name iluvatar --config V100x1x8 --data_dir /home/data/perf/ssd --backbone_path /home/data/perf/ssd/resnet34-333f7ec4.pth +python3 train.py --dali --dali-cache 0 --data=${DATA_PATH} \ +--batch-size=160 --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \ +--wd=1.6e-4 --use-fp16 --jit --nhwc --pad-input --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163 "$@" ``` ## Model Results -| Model | GPU | Batch Size | FPS | Train Epochs | mAP | +| Model | GPU | Batch Size | IoU=0.50:0.95 | IoU=0.50 | IoU=0.75 | |-------|------------|------------|------|--------------|------| -| SSD | BI-V100 x8 | 192 | 2858 | 65 | 0.23 | +| SSD | BI-V150 x8 | 160 | 0.094 | 0.197 | 0.078 | ## References - -- [mlcommons](https://github.com/mlcommons/training_results_v0.7/tree/master/NVIDIA/benchmarks/ssd/implementations/pytorch) diff --git a/cv/detection/yolov5-sample/pytorch/README.md b/cv/detection/yolov5-sample/pytorch/README.md new file mode 100644 index 000000000..336f8471d --- /dev/null +++ b/cv/detection/yolov5-sample/pytorch/README.md @@ -0,0 +1,79 @@ +# YOLOv5 + +## Model Description + +YOLOv5 is a state-of-the-art object detection model that builds upon the YOLO architecture, offering improved speed and +accuracy. It features a streamlined design with enhanced data augmentation and anchor box strategies. YOLOv5 supports +multiple model sizes (n/s/m/l/x) for different performance needs. The model is known for its ease of use, fast training, +and efficient inference, making it popular for real-time detection tasks across various applications. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| BI-V150 | 4.3.0 | 25.12 | + +## Model Preparation + +### Prepare Resources + +Go to visit [COCO official website](https://cocodataset.org/#download), then select the COCO dataset you want to +download. + +Take coco2017 dataset as an example, specify `/path/to/coco2017` to your COCO path in later training process, the +unzipped dataset path structure should look like: + +```bash +coco2017 +├── annotations +│   ├── instances_train2017.json +│   ├── instances_val2017.json +│ └── ... +├── train2017 +│ ├── 000000000009.jpg +│ ├── 000000000025.jpg +│ └── ... +├── val2017 +│ ├── 000000000139.jpg +│ ├── 000000000285.jpg +│ └── ... +├── train2017.txt +├── val2017.txt +└── ... 
+``` + +### Install Dependencies + +```bash +pip3 install seaborn +git clone https://gitee.com/deep-spark/deepsparkhub-GPL.git +cd deepsparkhub-GPL/cv/detection/yolov5-sample/pytorch/ +mkdir -p weights +wget -O weights/yolov5s.pt http://files.deepspark.org.cn:880/deepspark/data/checkpoints/yolov5s.pt +mkdir -p datasets +cd datasets +wget http://files.deepspark.org.cn:880/deepspark/data/datasets/coco2017labels.zip +wget http://files.deepspark.org.cn:880/deepspark/data/datasets/coco128.tgz +tar xf coco128.tgz +unzip -q -d ./ coco2017labels.zip +ln -s coco2017/train2017 ./coco/images/ +ln -s coco2017/val2017 ./coco/images/ +``` + +## Model Training + +```bash +python3 train.py --img-size 640 --batch-size 8 \ + --cfg ./models/yolov5s.yaml --weights ./weights/yolov5s.pt --data ./data/coco.yaml --amp ${nonstrict_mode_args} "$@" +``` + +## Model Results + + +| GPU | Batch size | IoU=0.50:0.95 | IoU=0.50 | IoU=0.75 | +|------------|------------|---------------|----------|----------| +| BI-V150 x8 | 8 | 0.365 | 0.546 | 0.400 | + +## References + +- [YOLOv5](https://github.com/ultralytics/yolov5) diff --git a/cv/multi_object_tracking/fairmot/pytorch/src/gen_labels_17.py b/cv/multi_object_tracking/fairmot/pytorch/src/gen_labels_17.py index fc84f09ba..9d0012051 100644 --- a/cv/multi_object_tracking/fairmot/pytorch/src/gen_labels_17.py +++ b/cv/multi_object_tracking/fairmot/pytorch/src/gen_labels_17.py @@ -62,5 +62,5 @@ def get_mot_train(img_root, label_category): if __name__ == "__main__": for name in ['train']: - gen_labels_mot('../../../../../data/datasets/MOT17', name) - get_mot_train('../../../../../data/datasets/MOT17', name) \ No newline at end of file + gen_labels_mot('../../../../../tests/data/datasets/MOT17', name) + get_mot_train('../../../../../tests/data/datasets/MOT17', name) \ No newline at end of file diff --git a/cv/multi_object_tracking/fairmot/pytorch/src/lib/cfg/mot17.json b/cv/multi_object_tracking/fairmot/pytorch/src/lib/cfg/mot17.json index 3c3efc19a..4a71332cb 100644 --- a/cv/multi_object_tracking/fairmot/pytorch/src/lib/cfg/mot17.json +++ b/cv/multi_object_tracking/fairmot/pytorch/src/lib/cfg/mot17.json @@ -2,14 +2,14 @@ "root":"./", "train": { - "mot17":"../../../../../data/datasets/MOT17/mot17.train" + "mot17":"../../../../tests/data/datasets/MOT17/mot17.train" }, "test_emb": { - "mot17":"../../../../../data/datasets/MOT17/mot17.train" + "mot17":"../../../../tests/data/datasets/MOT17/mot17.train" }, "test": { - "mot17":"../../../../../data/datasets/MOT17/mot17.train" + "mot17":"../../../../tests/data/datasets/MOT17/mot17.train" } } diff --git a/cv/multi_object_tracking/fairmot/pytorch/src/lib/models/networks/config/hrnet_w32.yaml b/cv/multi_object_tracking/fairmot/pytorch/src/lib/models/networks/config/hrnet_w32.yaml index b141d5eca..cebd114af 100644 --- a/cv/multi_object_tracking/fairmot/pytorch/src/lib/models/networks/config/hrnet_w32.yaml +++ b/cv/multi_object_tracking/fairmot/pytorch/src/lib/models/networks/config/hrnet_w32.yaml @@ -26,7 +26,7 @@ MODEL: INIT_WEIGHTS: true NAME: pose_hrnet NUM_JOINTS: 17 - PRETRAINED: '../../../../../data/model_zoo/hrnetv2_w32_imagenet_pretrained.pth' + PRETRAINED: '../../../../tests/data/model_zoo/hrnetv2_w32_imagenet_pretrained.pth' TARGET_TYPE: gaussian IMAGE_SIZE: - 192 diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md b/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md deleted file mode 100644 index 3bfe91c9f..000000000 --- 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/README.md +++ /dev/null @@ -1,45 +0,0 @@ -## 安装环境 -### 安装 transformers -```bash -git clone ssh://git@bitbucket.iluvatar.ai:7999/apptp/transformers.git -cd transformers -python3 setup.py install -``` -### 安装diffusers -```bash -cd diffusers -pip3 install pip3 install -r examples/text_to_image/requirements.txt -bash build_diffusers.sh && bash install_diffusers.sh -``` -_默认已经安装的包有:torchvision,ixformer,flash-attn,deepspeed,apex_ -_上述包最好使用较新的daily,不然可能会有功能不支持_ - -## 下载数据 -```bash -mkdir -p pokemon-blip-captions -download here: http://10.150.9.95/swapp/datasets/multimodal/stable_diffusion/pokemon-blip-captions -wget http://10.150.9.95/swapp/datasets/multimodal/stable_diffusion/stabilityai.tar # sd2.1 权重 -tar -xvf stabilityai.tar -``` - -*sdxl权重链接:http://sw.iluvatar.ai/download/apps/datasets/aigc/xl/stable-diffusion-xl-base-1.0.tar.gz* - -*sd1.5权重链接:http://10.150.9.95/swapp/pretrained/multimodal/stable-diffusion/stable-diffusion-v1-5.zip* - - -## 训练 -*以下脚本中包含的数据和预训练权重位置需要根据实际存放位置调整* -### sd2.1 训练 -```bash -$ bash run_sd_2.1.sh # 多卡 -$ bash run_sd_2.1_single.sh # 单卡 -``` -### sd1.5 训练 -```bash -$ bash run_sd_1.5.sh # 多卡 -$ bash run_sd_1.5_single.sh # 单卡 -``` -### sdxl 训练 -```bash -$ bash run_sd_xl.sh # 多卡 -``` diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/.gitignore b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/.gitignore similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/.gitignore rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/.gitignore diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/LICENSE b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/LICENSE similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/LICENSE rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/LICENSE diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/MANIFEST.in b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/MANIFEST.in similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/MANIFEST.in rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/MANIFEST.in diff --git a/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/README.md index e566d1794..1c5c17de2 100644 --- a/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/README.md +++ b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/README.md @@ -13,7 +13,7 @@ powerful tool for creative professionals and AI enthusiasts alike. | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | -| BI-V150 | 4.1.1 | 24.09 | +| BI-V150 | 4.3.0 | 25.12 | ## Model Preparation @@ -27,18 +27,33 @@ download the weights and data locally. Download the stabilityai/stable-diffusion-2-1-base from [huggingface page](https://huggingface.co/stabilityai/stable-diffusion-2-1-base). +```bash +mkdir -p data/model_zoo/stabilityai/stable-diffusion-2-1-base +``` + #### Datasets Download the lambdalabs/pokemon-blip-captions from [huggingface page](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). 
+```bash +mkdir -p data/datasets/pokemon-blip-captions +``` + ### Install Dependencies +Contact the Iluvatar administrator to get the missing packages: + - deepspeed-0.16.4+corex.4.4.0.20250907-cp310-cp310-linux_x86_64.whl + - triton-3.1.0+corex.4.4.0.20250907-cp310-cp310-linux_x86_64.whl + - ixformer-0.6.0+corex.4.4.0.20250907-cp310-cp310-linux_x86_64.whl + - flash_attn-2.6.3+corex.4.4.0.20250907-cp310-cp310-linux_x86_64.whl ```bash -pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.29.0-py3-none-any.whl -pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/transformers-4.38.1-py3-none-any.whl -pip3 install -r requirements.txt -pip3 install pillow --upgrade +# install packages +pip3 install http://files.deepspark.org.cn:880/deepspark/conformer/IXPyLogger-1.0.0-py3-none-any.whl +pip3 install huggingface_hub==0.25.1 transformers==4.38.1 +pip3 install --upgrade pillow +pip3 install -r examples/text_to_image/requirements.txt +bash build_diffusers.sh && bash install_diffusers.sh ``` ## Model Training @@ -46,20 +61,41 @@ pip3 install pillow --upgrade If you have downloaded the weights and dataset, please export the environment variables like below. ```bash -export MODEL_PATH=/path/to/sd_weights -export DATASET_PATH=/path/to/data +export CLIP_FLASH_ATTN=1 +export USE_NHWC_GN=1 +export USE_IXFORMER_GEGLU=0 +export USE_APEX_LN=1 +export ENABLE_FLASH_ATTENTION_WITH_IXDNN=1 +echo $ENABLE_FLASH_ATTENTION_WITH_IXDNN +cd examples/text_to_image +accelerate launch --config_file default_config.yaml --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=../../data/model_zoo/stabilityai/stable-diffusion-2-1-base \ + --dataset_name=../../data/datasets/pokemon-blip-captions \ + --resolution=512 \ + --seed 42 \ + --center_crop \ + --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model-3" \ + --max_train_steps=100 \ + --NHWC \ + --dataloader_num_workers=32 \ + --apex_fused_adam "$@"; + + exit ${EXIT_STATUS} ``` -```bash -# Go to diffusers path -cd ${PROJ_ROOT}/toolbox/diffusers - -# Single GPU -bash run_sd_2.1_single.sh +## Model Results -# Multi GPUs -bash run_sd_2.1_multi.sh -``` +| Model | GPUs | ips_per_device | ips_per_gpu | +| ------ | ------- | -------------- | ----------- | +| SD 2.1 | BI-V150 x 16 | 6.65 | 13.3 | ## References diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/build_diffusers.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/build_diffusers.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/build_diffusers.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/build_diffusers.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/clean_diffusers.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/clean_diffusers.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/clean_diffusers.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/clean_diffusers.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/README.md similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README_sdxl.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/README_sdxl.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/README_sdxl.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/README_sdxl.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/default_config.yaml b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/default_config.yaml similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/default_config.yaml rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/default_config.yaml diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements.txt b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/requirements.txt similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements.txt rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/requirements.txt diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_flax.txt b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/requirements_flax.txt similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_flax.txt rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/requirements_flax.txt diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_sdxl.txt b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/requirements_sdxl.txt similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/requirements_sdxl.txt rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/requirements_sdxl.txt diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/single_config.yaml b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/single_config.yaml similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/single_config.yaml rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/single_config.yaml diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/test_text_to_image.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/test_text_to_image.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image_lora.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/test_text_to_image_lora.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/test_text_to_image_lora.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/test_text_to_image_lora.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_lora.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_lora.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_lora_sdxl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_sdxl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_sdxl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/train_text_to_image_sdxl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/train_text_to_image_sdxl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/zero2_config.yaml b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/zero2_config.yaml similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image/zero2_config.yaml rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/examples/text_to_image/zero2_config.yaml diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/install_diffusers.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/install_diffusers.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/install_diffusers.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/install_diffusers.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_1.5.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_1.5.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5_single.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_1.5_single.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_1.5_single.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_1.5_single.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_2.1.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_2.1.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1_single.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_2.1_single.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_2.1_single.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_2.1_single.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_xl.sh b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_xl.sh similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/run_sd_xl.sh rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/run_sd_xl.sh diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/setup.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/setup.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/setup.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/setup.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/diffusers_cli.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/diffusers_cli.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/diffusers_cli.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/diffusers_cli.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/env.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/env.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/env.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/env.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/fp16_safetensors.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/fp16_safetensors.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/commands/fp16_safetensors.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/commands/fp16_safetensors.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/configuration_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/configuration_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/configuration_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/configuration_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_check.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/dependency_versions_check.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_check.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/dependency_versions_check.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_table.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/dependency_versions_table.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/dependency_versions_table.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/dependency_versions_table.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/README.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/README.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/__init__.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/rl/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/rl/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/experimental/rl/value_guided_sampling.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/image_processor.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/image_processor.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/image_processor.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/image_processor.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/autoencoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/autoencoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/autoencoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/autoencoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/controlnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/controlnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/controlnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/ip_adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/ip_adapter.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/ip_adapter.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/ip_adapter.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/lora.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/lora.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora_conversion_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/lora_conversion_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/lora_conversion_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/lora_conversion_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/peft.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/peft.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/peft.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/peft.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/single_file.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/single_file.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/single_file_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/single_file_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/single_file_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/textual_inversion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/textual_inversion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/textual_inversion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/textual_inversion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/unet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/unet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/unet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/unet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/loaders/utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/loaders/utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/README.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/README.md 
rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/activations.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/activations.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/activations.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/activations.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/adapter.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/adapter.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/adapter.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/attention.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/attention.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/attention_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/attention_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_processor.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/attention_processor.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/attention_processor.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/attention_processor.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_asym_kl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_kl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_kl_temporal_decoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/autoencoder_tiny.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/consistency_decoder_vae.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/vae.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/vae.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/autoencoders/vae.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/autoencoders/vae.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/controlnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/controlnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/controlnet_flax.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/controlnet_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/controlnet_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/downsampling.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/downsampling.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/downsampling.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/downsampling.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/dual_transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/dual_transformer_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/dual_transformer_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/dual_transformer_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/embeddings.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/embeddings.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/embeddings_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/embeddings_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/embeddings_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/lora.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/lora.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/lora.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/lora.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_flax_pytorch_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_flax_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_flax_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_flax_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_outputs.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_outputs.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_outputs.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_outputs.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_pytorch_flax_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/modeling_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/modeling_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/Welford.h diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/py.typed b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/py.typed rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.cpp diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/custom_gn.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.cu diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/gn_kernel.h diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/nchw_kernel.cu diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/nhwc_groupnorm/vecs.h diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/normalization.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/normalization.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/normalization.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/normalization.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/prior_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/prior_transformer.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/prior_transformer.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/prior_transformer.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/resnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/resnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/resnet_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/resnet_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/resnet_flax.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/t5_film_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/t5_film_transformer.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/t5_film_transformer.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/t5_film_transformer.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformer_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformer_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_temporal.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformer_temporal.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformer_temporal.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformer_temporal.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/dual_transformer_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/prior_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/prior_transformer.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/prior_transformer.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/prior_transformer.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/t5_film_transformer.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/t5_film_transformer.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/t5_film_transformer.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/t5_film_transformer.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/transformer_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/transformer_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_temporal.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/transformer_temporal.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/transformers/transformer_temporal.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/transformers/transformer_temporal.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_1d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_1d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_1d_blocks.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_1d_blocks.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_1d_blocks.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_2d_blocks.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_blocks.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_2d_blocks.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_2d_condition.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unet_2d_condition.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unet_2d_condition.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/__init__.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_1d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_1d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_1d_blocks.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_1d_blocks.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_1d_blocks.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_blocks.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_blocks.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_blocks_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_condition.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_condition.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_2d_condition_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_blocks.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_3d_blocks.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_blocks.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_3d_blocks.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_3d_condition.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_3d_condition.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_3d_condition.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_i2vgen_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_kandinsky3.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_kandinsky3.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_kandinsky3.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_kandinsky3.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_motion_model.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_motion_model.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_motion_model.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_motion_model.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_spatio_temporal_condition.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_stable_cascade.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_stable_cascade.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/unet_stable_cascade.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/unet_stable_cascade.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/uvit_2d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/uvit_2d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/unets/uvit_2d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/unets/uvit_2d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/upsampling.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/upsampling.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/upsampling.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/upsampling.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vae_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/vae_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vae_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/vae_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vq_model.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/vq_model.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/models/vq_model.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/models/vq_model.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/optimization.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/optimization.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/optimization.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/optimization.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/README.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/README.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/__init__.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/pipeline_amused.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/pipeline_amused_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/amused/pipeline_amused_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py similarity index 100% rename 
from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/animatediff/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm/pipeline_audioldm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm2/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm2/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm2/modeling_audioldm2.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/audioldm2/pipeline_audioldm2.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/auto_pipeline.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/auto_pipeline.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/auto_pipeline.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/auto_pipeline.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/blip_image_processing.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_blip2.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/modeling_ctx_clip.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/consistency_models/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/consistency_models/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/multicontrolnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint_sd_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dance_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddim/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddim/__init__.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddim/pipeline_ddim.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddpm/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddpm/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ddpm/pipeline_ddpm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/safety_checker.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/timesteps.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deepfloyd_if/watermark.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/README.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/README.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/modeling_roberta_series.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_alt_diffusion_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/alt_diffusion/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/mel.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/audio_diffusion/pipeline_audio_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/pndm/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/pndm/pipeline_pndm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/repaint/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/repaint/pipeline_repaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/score_sde_ve/pipeline_score_sde_ve.py 
diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/continuous_encoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/midi_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/notes_encoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/spectrogram_diffusion/pipeline_spectrogram_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_cycle_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_onnx_stable_diffusion_inpaint_legacy.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_inpaint_legacy.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_model_editing.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_paradigms.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stable_diffusion_variants/pipeline_stable_diffusion_pix2pix_zero.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/stochastic_karras_ve/pipeline_stochastic_karras_ve.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/modeling_text_unet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/deprecated/vq_diffusion/pipeline_vq_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dit/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dit/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/dit/pipeline_dit.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/free_init_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/free_init_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/free_init_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/free_init_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/i2vgen_xl/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky/text_encoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/__init__.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_combined.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/convert_kandinsky3_unet.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py 
similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_consistency_models/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_consistency_models/pipeline_latent_consistency_text2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_leditspp_stable_diffusion_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/ledits_pp/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/musicldm/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/musicldm/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/musicldm/pipeline_musicldm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/onnx_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/onnx_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/onnx_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/onnx_utils.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/paint_by_example/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/paint_by_example/image_encoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pia/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pia/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pia/pipeline_pia.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pipeline_flax_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pipeline_loading_utils.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pipeline_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pipeline_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pipeline_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pixart_alpha/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/semantic_stable_diffusion/pipeline_semantic_stable_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/__init__.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/camera.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/camera.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/camera.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/camera.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/renderer.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/renderer.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/shap_e/renderer.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/shap_e/renderer.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py similarity index 100% 
rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_combined.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_cascade/pipeline_stable_cascade_prior.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/README.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/README.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py 
similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_upscale.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/safety_checker_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion/stable_unclip_image_normalizer.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_attend_and_excite/pipeline_stable_diffusion_attend_and_excite.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/__init__.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_diffedit/pipeline_stable_diffusion_diffedit.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_gligen/pipeline_stable_diffusion_gligen_text_image.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_k_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_k_diffusion/pipeline_stable_diffusion_xl_k_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_ldm3d/pipeline_stable_diffusion_ldm3d.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_panorama/pipeline_stable_diffusion_panorama.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_safe/safety_checker.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_sag/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_sag/pipeline_stable_diffusion_sag.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_flax_stable_diffusion_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_instruct_pix2pix.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_diffusion_xl/watermark.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_video_diffusion/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/t2i_adapter/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_xl_adapter.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_output.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero_sdxl.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/text_proj.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/text_proj.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unclip/text_proj.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unclip/text_proj.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/modeling_uvit.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_paella_vq_model.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_common.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_diffnext.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py 
b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/modeling_wuerstchen_prior.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/py.typed b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/README.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/README.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/README.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/README.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/deprecated/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/__init__.py 
rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/deprecated/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/deprecated/scheduling_karras_ve.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/deprecated/scheduling_sde_vp.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_amused.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_amused.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_amused.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_amused.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_consistency_decoder.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_consistency_models.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim_inverse.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddim_parallel.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm_parallel.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ddpm_wuerstchen.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_deis_multistep.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_sde.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_edm_dpmsolver_multistep.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_edm_euler.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_euler_discrete.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_euler_discrete_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_heun_discrete.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ipndm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ipndm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_ipndm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_ipndm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_karras_ve_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lcm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_lcm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lcm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_lcm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_lms_discrete.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_lms_discrete_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_pndm.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_pndm.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_pndm_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_repaint.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_repaint.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_repaint.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_repaint.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sasolver.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_sasolver.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sasolver.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_sasolver.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_sde_ve.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_sde_ve_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_tcd.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_tcd.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_tcd.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_tcd.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unclip.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_unclip.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unclip.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_unclip.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_unipc_multistep.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_utils_flax.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/schedulers/scheduling_vq_diffusion.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/training_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/training_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/training_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/training_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/__init__.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/__init__.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/__init__.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/__init__.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/accelerate_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/accelerate_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/accelerate_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/accelerate_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/constants.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/constants.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/constants.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/constants.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/deprecation_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/deprecation_utils.py similarity index 100% rename from 
multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/deprecation_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/deprecation_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/doc_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/doc_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/doc_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/doc_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_flax_and_transformers_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_flax_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_flax_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_flax_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_note_seq_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_note_seq_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_note_seq_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_note_seq_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_onnx_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_onnx_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_onnx_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_onnx_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_pt_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_pt_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_pt_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_pt_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py rename to 
multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_librosa_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_scipy_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_torchsde_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_k_diffusion_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_transformers_and_onnx_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_torch_and_transformers_objects.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dummy_transformers_and_torch_and_note_seq_objects.py diff --git 
a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dynamic_modules_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dynamic_modules_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/dynamic_modules_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/dynamic_modules_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/export_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/export_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/export_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/export_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/hub_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/hub_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/hub_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/hub_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/import_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/import_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/import_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/import_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/loading_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/loading_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/loading_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/loading_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/logging.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/logging.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/logging.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/logging.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/model_card_template.md b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/model_card_template.md similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/model_card_template.md rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/model_card_template.md diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/outputs.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/outputs.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/outputs.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/outputs.py diff 
--git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/peft_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/peft_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/peft_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/peft_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/pil_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/pil_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/pil_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/pil_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/state_dict_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/state_dict_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/state_dict_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/state_dict_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/testing_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/testing_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/testing_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/testing_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/torch_utils.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/torch_utils.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/torch_utils.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/torch_utils.py diff --git a/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/versions.py b/multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/versions.py similarity index 100% rename from multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/src/diffusers/utils/versions.py rename to multimodal/diffusion_model/stable-diffusion-2.1/pytorch/src/diffusers/utils/versions.py diff --git a/nlp/language_model/bert_sample/pytorch/README.md b/nlp/language_model/bert_sample/pytorch/README.md new file mode 100644 index 000000000..eef1c746b --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/README.md @@ -0,0 +1,76 @@ +# BERT Pretraining + +## Model Description + +BERT (Bidirectional Encoder Representations from Transformers) is a groundbreaking language model that revolutionized +natural language processing. It employs a transformer architecture with bidirectional attention, enabling it to capture +context from both directions in text. Pretrained using Masked Language Modeling (MLM) and Next Sentence Prediction (NSP) +tasks, BERT achieves state-of-the-art results across various NLP tasks through fine-tuning. Its ability to understand +deep contextual relationships in text has made it a fundamental model in modern NLP research and applications. 
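+
+As a rough illustration of the MLM objective described above, the sketch below shows the usual BERT-style masking rule. It is not part of this benchmark's code, and the helper name and its parameters are hypothetical: about 15% of the input tokens are selected as prediction targets, most of them are replaced by the `[MASK]` id, and the model is trained to recover the original ids.
+
+```python
+import torch
+
+def mask_tokens(input_ids: torch.Tensor, mask_token_id: int, vocab_size: int, mlm_prob: float = 0.15):
+    """Hypothetical sketch of BERT-style MLM masking (80/10/10 rule); not this benchmark's code."""
+    labels = input_ids.clone()
+    # Pick ~15% of positions as prediction targets.
+    target = torch.bernoulli(torch.full(input_ids.shape, mlm_prob)).bool()
+    labels[~target] = -100  # -100 is ignored by the cross-entropy loss
+    masked = input_ids.clone()
+    # 80% of the targets are replaced by the [MASK] id.
+    to_mask = torch.bernoulli(torch.full(input_ids.shape, 0.8)).bool() & target
+    masked[to_mask] = mask_token_id
+    # 10% get a random token; the remaining 10% keep their original id.
+    to_random = torch.bernoulli(torch.full(input_ids.shape, 0.5)).bool() & target & ~to_mask
+    masked[to_random] = torch.randint(vocab_size, input_ids.shape, dtype=input_ids.dtype)[to_random]
+    return masked, labels
+```
+
+In MLPerf-style BERT pretraining such as this benchmark, the masking is typically baked into the pre-processed training shards during data preparation rather than applied on the fly in the training loop.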
+ +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| BI-V150 | 4.3.0 | 25.12 | + +## Model Preparation + +### Prepare Resources + +Reference: [training_results_v1.0](https://github.com/mlcommons/training_results_v1.0/tree/master/NVIDIA/benchmarks/bert/implementations/pytorch) + +```bash +mkdir -p data/datasets/bert_mini +cd data/datasets/bert_mini +wget http://files.deepspark.org.cn:880/deepspark/bert_mini/2048_shards_uncompressed_mini.tar.gz +wget http://files.deepspark.org.cn:880/deepspark/bert_mini/eval_set_uncompressed.tar.gz +wget http://files.deepspark.org.cn:880/deepspark/bert_mini/model.ckpt-28252.apex.pt +wget http://files.deepspark.org.cn:880/deepspark/bert_mini/bert_config.json +tar -xf 2048_shards_uncompressed_mini.tar.gz +tar -xf eval_set_uncompressed.tar.gz + +└── data/datasets/bert_mini + ├── 2048_shards_uncompressed + ├── eval_set_uncompressed + └── model.ckpt-28252.apex.pt + └── bert_config.json +``` + +### Install Dependencies + +```shell +apt install -y numactl + +cd base +pip3 install -r requirements.txt +python3 setup.py install +``` + +## Model Training + +### Training with default +```shell +bash run_training.sh \ +--name default \ +--config V100x1x8 \ +--data_dir ../data/datasets/bert_mini/ \ +--max_steps 500 \ +--train_batch_size 10 \ +--target_mlm_accuracy 0.33 \ +--init_checkpoint "../data/datasets/bert_mini/model.ckpt-28252.apex.pt" +``` + +### Training with iluvatar +```shell +bash run_training.sh --name iluvatar --config 03V100x1x8 --train_batch_size 27 --data_dir ../data/datasets/bert_mini/ --master_port 22233 +``` + +## Model Results +| Model | GPUs | E2E | MLM Accuracy | training_sequences_per_second | final_loss | +|:------------:|:----------:|:-----:|:------------:|:-----------------------------:|:----------:| +| BERT default | BI-V150 x8 | 82.72s | 0.339 | 3.685 | 4.723 | +| BERT iluvatar | BI-V150 x8 | 509.79s | 0.720 | 10513.181 | 1.306 | + +## References + diff --git a/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x1.py b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x1.py new file mode 100644 index 000000000..99c2e1481 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x1.py @@ -0,0 +1,38 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
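# The README's `--config V100x1x8` flag maps onto the `config_<NAME>.py` modules added in
# this patch. A minimal sketch of how such a module could be resolved and read as a plain
# namespace; `load_config` and the example call are hypothetical helpers for illustration
# only -- the actual wiring lives in run_training.sh and the training driver, not here.
# (It also assumes the config directory is on sys.path so `from training_event import ...`
# inside the config module resolves.)
import importlib.util
from types import SimpleNamespace

def load_config(config_dir: str, name: str) -> SimpleNamespace:
    path = f"{config_dir}/config_{name}.py"
    spec = importlib.util.spec_from_file_location(f"config_{name}", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # Keep only plain module-level settings (fp16, train_batch_size, learning_rate, ...).
    fields = {k: v for k, v in vars(module).items() if not k.startswith("_")}
    return SimpleNamespace(**fields)

# Hypothetical usage: cfg = load_config("default/config", "V100x1x8")
# cfg.train_batch_size -> 10, cfg.learning_rate -> 3.5e-4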
+# **************************************************************************************************/ + + +from training_event import DefaultTrainingEvent + + +fp16 = True +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 10 +max_steps = 7100 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +learning_rate = 2.0e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True +exchange_padding = True + +seed = 9031 + +training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x16.py b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x16.py new file mode 100644 index 000000000..547d07495 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x16.py @@ -0,0 +1,38 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + + +from training_event import DefaultTrainingEvent + + +fp16 = True +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 10 +max_steps = 7100 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True +exchange_padding = True + +seed = 9031 + +training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x4.py b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x4.py new file mode 100644 index 000000000..99c2e1481 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x4.py @@ -0,0 +1,38 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
+# **************************************************************************************************/ + + +from training_event import DefaultTrainingEvent + + +fp16 = True +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 10 +max_steps = 7100 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +learning_rate = 2.0e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True +exchange_padding = True + +seed = 9031 + +training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x8.py b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x8.py new file mode 100644 index 000000000..547d07495 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/default/config/config_V100x1x8.py @@ -0,0 +1,38 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + + +from training_event import DefaultTrainingEvent + + +fp16 = True +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 10 +max_steps = 7100 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True +exchange_padding = True + +seed = 9031 + +training_event = DefaultTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/default/config/training_event.py b/nlp/language_model/bert_sample/pytorch/default/config/training_event.py new file mode 100644 index 000000000..a0d4b92b9 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/default/config/training_event.py @@ -0,0 +1,85 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
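# DefaultTrainingEvent (defined below in this file) wraps every step in
# torch.cuda.amp.autocast and funnels the backward pass through a GradScaler.
# A hedged sketch of how a trainer could drive these hooks; the loop is
# illustrative only -- the real driver ships with the benchmark harness and is
# not part of this patch.
from torch.cuda.amp import GradScaler

def run_steps(event, model, dataloader, config):
    optimizer = event.create_optimizer(model)           # LAMB with decay / no-decay groups
    model, optimizer = event.model_to_fp16(model, optimizer)
    model = event.model_to_ddp(model)                   # NativeDDP when dist is initialized
    grad_scaler = GradScaler(enabled=config.fp16)
    for step, batch in enumerate(dataloader, start=1):
        event.on_step_begin(step)                       # enters autocast(config.fp16)
        loss = model(**batch)                           # forward pass under autocast
        event.on_backward(step, loss, optimizer, grad_scaler)  # exits autocast, scales, steps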
+# **************************************************************************************************/ + + +from typing import Tuple + +import torch +import torch.distributed as dist +from torch.cuda.amp import GradScaler +from torch.cuda.amp import autocast +from torch.nn.parallel import DistributedDataParallel as NativeDDP +from torch.optim import Optimizer + +from optimizers import create_optimizer +from train.event.base import BaseTrainingEventInterface, BERT_MODEL + + +class DefaultTrainingEvent(BaseTrainingEventInterface): + + def __init__(self, config): + super(DefaultTrainingEvent, self).__init__(config) + self.model = None + self.optimizer = None + self.num_iters_per_dataloader = 1 + + self.autocast_ctx = None + + def create_optimizer(self, model: BERT_MODEL) -> Optimizer: + param_optimizer = list(model.named_parameters()) + + no_decay = ['bias', 'gamma', 'beta', 'LayerNorm'] + + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': self.config.weight_decay_rate}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] + + optimizer = create_optimizer('lamb', optimizer_grouped_parameters, self.config) + + self.model = model + self.optimizer = optimizer + return optimizer + + def model_to_fp16(self, model: BERT_MODEL, optimizer: Optimizer) -> Tuple[BERT_MODEL, Optimizer]: + return model, optimizer + + def model_to_ddp(self, model: BERT_MODEL) -> BERT_MODEL: + use_ddp = dist.is_initialized() + if use_ddp: + model = NativeDDP(model, + device_ids=[self.config.local_rank], + bucket_cap_mb=100, + gradient_as_bucket_view=self.config.use_gradient_as_bucket_view) + self.model = model + return model + + def on_step_begin(self, step: int): + self.autocast_ctx = autocast(self.config.fp16) + self.autocast_ctx.__enter__() + + def on_backward(self, step: int, loss: torch.Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): + self.autocast_ctx.__exit__(None, None, None) + + scaled_loss = grad_scaler.scale(loss) + scaled_loss.backward() + update_step = step % self.config.gradient_accumulation_steps == 0 + if update_step: + self.update_model_params(scaled_loss, optimizer, grad_scaler) + + def update_model_params(self, loss, optimizer: Optimizer, grad_scaler: GradScaler=None): + grad_scaler.step(optimizer) + grad_scaler.update() + + for param in self.model.parameters(): + param.grad = None + + diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_00V100x1x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_00V100x1x8.py new file mode 100644 index 000000000..c47818185 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_00V100x1x8.py @@ -0,0 +1,56 @@ + +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 14000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = True +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.45 +save_checkpoint = False +log_freq = 200 +eval_steps = 1000 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-0.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../out') + +training_event = ApexTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_01V100x1x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_01V100x1x8.py new file mode 100644 index 000000000..22f003f1d --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_01V100x1x8.py @@ -0,0 +1,57 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
+# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 14000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = True +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.489 +save_checkpoint = False +log_freq = 200 +eval_steps = 200 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-1000.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'out') + + + +training_event = ApexTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_02V100x1x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_02V100x1x8.py new file mode 100644 index 000000000..a03a7c64d --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_02V100x1x8.py @@ -0,0 +1,57 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
+# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 14000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = True +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.706 +save_checkpoint = False +log_freq = 200 +eval_steps = 200 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-3000.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'out') + + + +training_event = ApexTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x16.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x16.py new file mode 100644 index 000000000..3c1a121d3 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x16.py @@ -0,0 +1,59 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
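# Each iluvatar config begins with `from config_common import *` and then re-assigns only
# the names it changes; a later assignment in the variant module simply shadows the
# star-imported default. A minimal runnable illustration of that precedence (the two source
# strings are stand-ins for config_common and a per-topology config, not the real files):
base_src = "distributed_lamb = False\ndwu_group_size = 0\ntrain_batch_size = 27\n"
variant_src = "distributed_lamb = True\ndwu_group_size = 16\n"   # e.g. config_03V100x1x16

effective = {}
exec(base_src, effective)      # defaults, as if star-imported from config_common
exec(variant_src, effective)   # per-topology overrides win
assert effective["dwu_group_size"] == 16 and effective["train_batch_size"] == 27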
+# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 28000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = True +learning_rate = 3.5e-4 +# learning_rate = 7.0e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 18000000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 +dwu_group_size = 16 + +seed = 9031 + +target_mlm_accuracy = 0.72 +save_checkpoint = False +log_freq = 200 +eval_steps = 200 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-23000.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'out') + + + +training_event = ApexTrainingEvent diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x8.py new file mode 100644 index 000000000..042fb0d11 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_03V100x1x8.py @@ -0,0 +1,58 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
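# The target_mlm_accuracy / eval_steps pairs in these configs read like MLPerf-style
# convergence criteria: evaluate every `eval_steps` steps and stop once eval MLM accuracy
# reaches the target (or max_steps is hit). A hedged sketch of such a check;
# `evaluate_mlm_accuracy` is a hypothetical stand-in for the benchmark's evaluation
# routine, which is not part of this patch.
def should_stop(step: int, config, evaluate_mlm_accuracy) -> bool:
    if step >= config.max_steps:
        return True
    if config.eval_steps and step % config.eval_steps == 0:
        return evaluate_mlm_accuracy() >= config.target_mlm_accuracy
    return False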
+# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 28000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = True +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 9000000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.72 +save_checkpoint = False +log_freq = 200 +eval_steps = 200 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-23000.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'out') + + + +training_event = ApexTrainingEvent +dwu_group_size = 8 diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x1.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x1.py new file mode 100644 index 000000000..9290febe1 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x1.py @@ -0,0 +1,44 @@ +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 7100 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = False +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = False + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.40 +save_checkpoint = False +log_freq = 200 +eval_steps = 200 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-0.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../out') + +training_event = ApexTrainingEvent diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x8.py new file mode 100644 index 000000000..85913bd6b --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x1x8.py @@ -0,0 +1,57 @@ + +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 28000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +distributed_lamb = True +learning_rate = 3.5e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 9000000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.72 +save_checkpoint = True +log_freq = 200 +eval_steps = 1000 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-0.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../out') + +training_event = ApexTrainingEvent +dwu_group_size = 8 diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x2x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x2x8.py new file mode 100644 index 000000000..4e897c706 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x2x8.py @@ -0,0 +1,56 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
+# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "gloo" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 8000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +# WARN: DistributedLAMB is not compatible with Gloo backend +# distributed_lamb = True +learning_rate = 4e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.71 +save_checkpoint = True +log_freq = 200 +eval_steps = 1000 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-0.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../out') + +training_event = ApexTrainingEvent \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x4x8.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x4x8.py new file mode 100644 index 000000000..b275e935f --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_V100x4x8.py @@ -0,0 +1,57 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
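# config_V100x2x8 switches dist_backend to "gloo" and, per its own warning, leaves
# distributed_lamb disabled, while the single-node nccl configs turn it on. A hedged
# sketch of a defensive guard a launcher could apply before building the optimizer;
# this check is hypothetical and not part of the patch itself.
def sanitize_optimizer_choice(config):
    if getattr(config, "distributed_lamb", False) and config.dist_backend != "nccl":
        # Per the config comment, DistributedLAMB is not compatible with the Gloo
        # backend; fall back to the plain (non-distributed) LAMB path.
        config.distributed_lamb = False
    return config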
+# **************************************************************************************************/ + +from training_event import ApexTrainingEvent +from config_common import * +import os + +fp16 = True +ddp_type = "apex" +dist_backend = "nccl" + +gradient_accumulation_steps = 1 +train_batch_size = 27 +max_steps = 8000 +start_warmup_step = 0 +warmup_proportion = 0 +warmup_steps = 0 + +# WARN: DistributedLAMB is not compatible with Gloo backend +distributed_lamb = False +learning_rate = 4e-4 +weight_decay_rate = 0.01 +opt_lamb_beta_1 = 0.9 +opt_lamb_beta_2 = 0.999 + +eval_batch_size = train_batch_size +max_samples_termination = 4500000 +cache_eval_data = True + +fused_gelu_bias = True +fused_mha = True +dense_seq_output = True +exchange_padding = True + +dwu_num_rs_pg = 1 +dwu_num_ar_pg = 1 +dwu_num_blocks = 1 + +seed = 9031 + +target_mlm_accuracy = 0.4 +save_checkpoint = True +log_freq = 200 +eval_steps = 200 +init_checkpoint = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../../../../../../../data/model_zoo/lm_bert/model.ckpt-0.pt') +output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),'../out') + +training_event = ApexTrainingEvent +dwu_group_size = 8 diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_common.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_common.py new file mode 100644 index 000000000..9aff9e1cb --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/config_common.py @@ -0,0 +1,102 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +# 'segmented' or 'full_iteration' options for CUDA graph capture. +# 'segmented' option: Pytorch Autograd orchestrates execution of backward ops every iteration. +# 'full_iteration' option: CUDA graph orchestrates execution of bwd ops every iteration without Autograd involvement (has composability limitations but could be more performant allowing optimizer and collectives capture). +cuda_graph_mode: str = "segmented" + +# Maximum number of iterations to capture in a single graph. +# Requires 'full_iteration' option for '--cuda_graph_mode'. +max_iterations_per_graph: int = 4 + +# Whether to do allreduces during gradient accumulation steps. +allreduce_post_accumulation: bool = False + +# Whether to do fp16 allreduce post accumulation. +allreduce_post_accumulation_fp16: bool = False + +# Whether to run with unpadding. +unpad: bool = False + +# Whether to run with unpadding. +unpad_fmha: bool = False + +# Whether to pad tokens. +pad: bool = False + +# Whether to disable fusion of the scaling to BMM1. +disable_fuse_scale: bool = False + +# Whether to disable fusion of the QKV GEMMs. 
+disable_fuse_qkv: bool = False + +# Whether to disable apex softmax. +disable_apex_softmax: bool = False + +# Enable use of streams for pad case. +enable_stream: bool = False + +# Whether to run with optimizations. +fused_mha: bool = False + +# Enable CUDA graph execution. +use_cuda_graph: bool = False + +# DDP type: 'apex' or 'native'. +ddp_type: str = "apex" + +# Bypass AMP unscaling and inf/nan checks for SOL measurements. +bypass_amp: bool = False + +# Whether to use distributed lamb. +distributed_lamb: bool = False + +# distributed weight update group size. If arg is 0, defaults to one node +dwu_group_size: int = 0 + +# number of blocks in dwu scheme +dwu_num_blocks: int = 4 + +# number of chunks in dwu scheme +dwu_num_chunks: int = 1 + +# number of reduction-scatter streams in dwu scheme +dwu_num_rs_pg: int = 2 + +# number of all-reduce streams in dwu scheme +dwu_num_ar_pg: int = 4 + +# number of all-gather streams in dwu scheme +dwu_num_ag_pg: int = 2 + +# whether to overlap reductions with backprop +dwu_overlap_reductions: bool = False + +# do allgather with e5m2 floats +dwu_e5m2_allgather: bool = False + +# the apex optimization level, value: [O1, O2] +opt_level: str = "O2" + + + + + + + + + + + + + + diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/converter.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/converter.py new file mode 100644 index 000000000..ca11b5b60 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/converter.py @@ -0,0 +1,192 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. 
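# remap_attn_parameters (defined later in this file) renames stock attention keys such as
# "...attention.self.query.weight" to the fused SelfMultiheadAttn layout
# ("...attention.multi_head_attention.q_weight") so an existing checkpoint can be loaded
# into the converted model. A toy illustration of that renaming on a fake state_dict; the
# key prefix and tensor are made-up stand-ins, not real weights from this patch.
import torch
from collections import OrderedDict

fake_sd = OrderedDict(
    {"encoder.layer.0.attention.self.query.weight": torch.zeros(2, 2)}
)
remapped = OrderedDict(
    (k.replace("self.query.weight", "multi_head_attention.q_weight")
     if "attention" in k else k, v)
    for k, v in fake_sd.items()
)
assert "encoder.layer.0.attention.multi_head_attention.q_weight" in remapped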
+# **************************************************************************************************/ + +import copy +import math +from torch.utils import checkpoint + +from model.models.modeling import BertAttention, BertIntermediate, BertOutput, BertForPreTraining + +try: + from .layers import * +except: + from layers import * + + +class IluvatarBertLayer(nn.Module): + def __init__(self, config): + super(IluvatarBertLayer, self).__init__() + self.unpad = config.unpad + if config.fused_mha: + self.attention = FastBertAttention(config) + elif config.unpad: + self.attention = FastUnpadBertAttention(config) + else: + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask, seqlen=None, batch=None): + attention_output = self.attention(hidden_states, attention_mask, seqlen, batch) + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class NvidiaBertEncoder(nn.Module): + def __init__(self, config): + super(NvidiaBertEncoder, self).__init__() + layer = IluvatarBertLayer(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + self.num_attention_heads = config.num_attention_heads + self.fused_mha=config.fused_mha + self.unpad=config.unpad + self.unpad_fmha = config.unpad_fmha + self.pad = config.pad + self.fuse_mask = config.fuse_mask + self.enable_stream = config.enable_stream + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, checkpoint_activations=False): + # Unpad inputs and mask. It will remove tokens that are padded. Assume ntokens is total number of tokens (padded and non-padded) + # and ntokens_unpad is total number of non-padded tokens. 
Then unpadding performs the following compression of the inputs: + # hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden] + batch = None + seqlen = None + if self.unpad: + batch = hidden_states.shape[0] + maxseqlen = hidden_states.shape[1] + hidden_size = hidden_states.shape[2] + attention_indices, attention_mask, seqlen, ntokens, cu_seqlens, actual_seqlens, maxseqlen_in_batch = generate_mask(attention_mask, self.num_attention_heads, pad=self.pad, fuse_mask=self.fuse_mask, unpad_fmha=self.unpad_fmha) + if self.pad == True and self.enable_stream == False: + hidden_states = hidden_states.view(batch,maxseqlen,hidden_size).permute(1,0,2).contiguous().view(batch*maxseqlen,hidden_size).contiguous() + if self.pad == True and self.enable_stream == True: + hidden_states = hidden_states.view(batch*maxseqlen,hidden_size) + if self.pad == False: + hidden_states = UnpadInput.apply(hidden_states.view(batch*maxseqlen, hidden_size).contiguous(), attention_indices, batch, maxseqlen, hidden_size, ntokens) + + all_encoder_layers = [] + def custom(start, end): + def custom_forward(*inputs): + layers = self.layer[start:end] + x_ = inputs[0] + for layer in layers: + x_ = layer(x_, inputs[1]) + return x_ + return custom_forward + + if checkpoint_activations: + l = 0 + num_layers = len(self.layer) + chunk_length = math.ceil(math.sqrt(num_layers)) + while l < num_layers: + hidden_states = checkpoint.checkpoint(custom(l, l+chunk_length), hidden_states, attention_mask*1) + l += chunk_length + # decoder layers + else: + if self.fused_mha: + hidden_states = hidden_states.permute(1,0,2).contiguous() + for i,layer_module in enumerate(self.layer): + if seqlen is None and batch is None: + hidden_states = layer_module(hidden_states, attention_mask) + else: + assert seqlen is not None + assert batch is not None + if self.unpad_fmha: + hidden_states = layer_module(hidden_states, cu_seqlens, actual_seqlens, maxseqlen_in_batch) + else: + hidden_states = layer_module(hidden_states, attention_mask, seqlen, batch) + + if output_all_encoded_layers: + if self.fused_mha: + all_encoder_layers.append(hidden_states.permute(1,0,2).contiguous()) + else: + all_encoder_layers.append(hidden_states) + + # Pad inputs and mask. It will insert back zero-padded tokens. Assume ntokens is total number of tokens (padded and non-padded) + # and ntokens_unpad is total number of non-padded tokens. 
Then padding performs the following de-compression: + # hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden] + if self.unpad: + if self.pad == True and self.enable_stream == False: + hidden_states = hidden_states.view(maxseqlen, batch, hidden_size).permute(1,0,2).contiguous().view(batch,maxseqlen,hidden_size).contiguous() + if self.pad == True and self.enable_stream == True: + hidden_states = hidden_states.view(batch,maxseqlen,hidden_size) + if self.pad == False: + hidden_states = PadInput.apply(hidden_states, attention_indices, batch, maxseqlen, hidden_size, ntokens).view(batch, maxseqlen, hidden_size).contiguous() + + if not output_all_encoded_layers or checkpoint_activations: + if self.fused_mha: + all_encoder_layers.append(hidden_states.permute(1,0,2).contiguous()) + else: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +def convert_model(model: BertForPreTraining, config): + bert_config = copy.copy(model.config) + + bert_config.fused_mha = config.fused_mha + bert_config.fused_gelu_bias = config.fused_gelu_bias + bert_config.dense_seq_output = config.dense_seq_output + bert_config.unpad = config.unpad + bert_config.unpad_fmha = config.unpad_fmha + bert_config.pad = config.pad + bert_config.fuse_qkv = not config.disable_fuse_qkv + bert_config.fuse_scale = not config.disable_fuse_scale + bert_config.fuse_mask = not config.disable_fuse_mask + bert_config.fuse_dropout = config.enable_fuse_dropout + bert_config.fused_dropout_add = config.fused_dropout_add + bert_config.apex_softmax = not config.disable_apex_softmax + bert_config.enable_stream = config.enable_stream + if bert_config.fuse_mask == True: bert_config.apex_softmax = True + if bert_config.pad == False: bert_config.enable_stream = True + if bert_config.unpad == True: bert_config.fused_mha = False + + state_dict = model.state_dict() + if bert_config.fused_mha: + state_dict = remap_attn_parameters(state_dict) + model.bert_model_segment.bert.unpad = bert_config.unpad + model.bert_model_segment.bert.encoder = NvidiaBertEncoder(bert_config).to(torch.cuda.current_device()) + model.load_state_dict(state_dict, strict=True) + return model + + +def remap_attn_parameters(model_dict): + res_dict = OrderedDict() + for k in model_dict: + if 'attention' in k: + if 'self.query.weight' in k: + new_k = k.replace('self.query.weight', 'multi_head_attention.q_weight') + elif 'self.key.weight' in k: + new_k = k.replace('self.key.weight', 'multi_head_attention.k_weight') + elif 'self.value.weight' in k: + new_k = k.replace('self.value.weight', 'multi_head_attention.v_weight') + elif 'self.query.bias' in k: + new_k = k.replace('self.query.bias', 'multi_head_attention.q_bias') + elif 'self.key.bias' in k: + new_k = k.replace('self.key.bias', 'multi_head_attention.k_bias') + elif 'self.value.bias' in k: + new_k = k.replace('self.value.bias', 'multi_head_attention.v_bias') + elif 'output.dense.weight' in k: + new_k = k.replace('output.dense.weight', 'multi_head_attention.out_proj_weight') + elif 'output.dense.bias' in k: + new_k = k.replace('output.dense.bias', 'multi_head_attention.out_proj_bias') + elif 'output.LayerNorm.weight' in k: + new_k = k.replace('output.LayerNorm.weight', 'layer_norm.weight') + elif 'output.LayerNorm.bias' in k: + new_k = k.replace('output.LayerNorm.bias', 'layer_norm.bias') + else: + new_k = k + else: + new_k = k + res_dict[new_k] = model_dict[k] + model_dict.clear() + return res_dict diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/distributed_fused_lamb.py 
b/nlp/language_model/bert_sample/pytorch/iluvatar/config/distributed_fused_lamb.py new file mode 100644 index 000000000..3ff365948 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/distributed_fused_lamb.py @@ -0,0 +1,136 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +import math +import torch +import importlib +import amp_C +from apex.multi_tensor_apply import multi_tensor_applier +import torch.distributed.distributed_c10d as c10d + +#TODO: insert label, check data +## Update to APEX (https://github.com/NVIDIA/apex.git) +## changes incorporated in apex, compared to container APEX version 082f999 generated 4/13/2021 +## 1) function to support gradient clipping before all reduce (late rule change to MLPerf, now grad clipping before and after all reduce are both allowed) + +## Excerpted from PR # 1099 in apex library (https://github.com/NVIDIA/apex.git) +## for supporting gradient clipping before allreduce +## PR # 1099 adds the option to do either clip-before-allreduce or clip-after-allreduce +def _pipeline_block_reductions_patched(self, block_id): + # Copy model grads to flat grads buffer + self._flatten_grad_mt(1.0) + + # Compute L2 grad norm + self._l2_grad_norm_st.wait_stream(torch.cuda.current_stream()) + with torch.cuda.stream(self._l2_grad_norm_st): + self._L2_grad_norm = self._flat_grads.norm(dtype=torch.float16, p=2).float() + torch.cuda.current_stream().wait_stream(self._l2_grad_norm_st) + + # Apply clipping & pre-reduction scaling on grads + loss_scale = self.global_scale + max_grad_norm = loss_scale*self.defaults['max_grad_norm'] + coeff = max_grad_norm /(1e-6+self.L2_grad_norm) + coeff = (coeff>1) * self._one + (coeff<=1) * coeff + tmp = torch.cat(((self._one), (coeff))) + index = (coeff+1>coeff).int() + scale = tmp.index_select(0, index).half()/self._world_size + self._flat_grads.mul_(scale) + + # Reduction within each node + # Changes gradient format from [block * chunk * shard] to [shard * block * chunk] + # The output format is the same as the fp32 master parameters + works = [None]*self._num_chunks + for chunk_id in range(self._num_chunks): + glob_chunk_id = block_id * self._num_chunks + chunk_id + rs_stream = self._rs_st[glob_chunk_id%self._num_rs_pg] + rs_stream.wait_stream(torch.cuda.current_stream()) + rs_stream.wait_stream(self._l2_grad_norm_st) + with torch.cuda.stream(rs_stream): + works[chunk_id] = torch.distributed.reduce_scatter(self._fp16_g_chunks[block_id][chunk_id],self._flat_grads_shards[block_id][chunk_id],group=self._rs_pg[glob_chunk_id%self._num_rs_pg],async_op=True) + + # Reduction across nodes for each rank + if self._num_groups > 1: + for chunk_id in range(self._num_chunks): + glob_chunk_id = 
block_id * self._num_chunks + chunk_id + ar_stream = self._ar_st[glob_chunk_id%self._num_ar_pg] + with torch.cuda.stream(ar_stream): + works[chunk_id].wait() + works[chunk_id] = torch.distributed.all_reduce(self._fp16_g_chunks[block_id][chunk_id],group=self._ar_pg[glob_chunk_id%self._num_ar_pg],async_op=True) + self._reductions_works[block_id] = works + + if block_id == 0: + for block_id in range(self._num_blocks): + for chunk_id in range(self._num_chunks): + self._reductions_works[block_id][chunk_id].wait() + +## Excerpted from PR # 1099 in apex library (https://github.com/NVIDIA/apex.git) +## for supporting gradient clipping before allreduce +## PR # 1099 adds the option to do either clip-before-allreduce or clip-after-allreduce +def _pipeline_step_patched(self): + global_scale = self.global_scale + # if clip before ar, set max_grad_norm to 0 + max_grad_norm = 0.0 + self._completion_st.wait_stream(self._l2_grad_norm_st) + global_grad_norm = self.L2_grad_norm + + # check global_grad_norm and fill overflow_buf + is_finite = (global_grad_norm + 1 > global_grad_norm).int() + self._overflow_buf = self._one * (is_finite ^ self._one) # toggle between 0 and 1 + torch.distributed.all_reduce(is_finite, + op=torch.distributed.ReduceOp.MIN, + group=self._current_process_group) + torch.distributed.all_reduce(self._overflow_buf, + op=torch.distributed.ReduceOp.MAX, + group=self._current_process_group) + + # increment step counter if no overflow + self._step += is_finite + self._completion_st.wait_stream(torch.cuda.current_stream()) + self._completion_st.wait_stream(self._l2_grad_norm_st) + + # Call step kernel once per step + # Call all-gather once per step + with torch.cuda.stream(self._completion_st): + for block_id in range(self._num_blocks): + for chunk_id in range(self._num_chunks): + self._reductions_works[block_id][chunk_id].wait() + #param_norm = self.__compute_contrib_param_norm() + param_norm = self._DistributedFusedLAMB__compute_contrib_param_norm() + multi_tensor_applier(self.multi_tensor_lamb_compute_update_term, + self._overflow_buf, + self._contrib_compute_update_term_tensor_list, # g, p, m, v, u + self._contrib_beta1, + self._contrib_beta2, + self._contrib_beta3, + self._contrib_bias_correction, + self._step, + self._contrib_epsilon, + self._adam_w_mode, + self._contrib_weight_decay, + global_scale, + global_grad_norm, + max_grad_norm) + upd_norm = self._DistributedFusedLAMB__compute_contrib_update_norm() + multi_tensor_applier(self.multi_tensor_lamb_update_weights, + self._overflow_buf, + self._contrib_update_weights_tensor_list, # u, p, p_copy + param_norm, + upd_norm, + self._offsets, + self._lr, + self._contrib_weight_decay, + global_grad_norm, + self._use_nvlamb) + torch.distributed.all_gather(self._new_params_mega_shards, self._fp16_p, group=self._ag_pg[0]) + + + + diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/environment_variables.sh b/nlp/language_model/bert_sample/pytorch/iluvatar/config/environment_variables.sh new file mode 100644 index 000000000..72dabb94a --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/environment_variables.sh @@ -0,0 +1,69 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +# ================================================= +# Constants +# ================================================= + +CURRENT_DIR=$(cd `dirname $0`; pwd) # /path/to/proj/benchmarks/${MODEL}/pytorch +PROJ_DIR="${CURRENT_DIR}/../../.." +SUBMMIT_DIR="${PROJ_DIR}/iluvatar/${MODEL}" +SDK_DIR="${SUBMMIT_DIR}/sdk_installers" +if [ ! -d "${SDK_DIR}" ]; then + echo "WARN: Not found ${SDK_DIR}, set SDK_DIR to ${PROJ_DIR}/iluvatar/sdk_installers" + SDK_DIR="${PROJ_DIR}/iluvatar/sdk_installers" +fi +SDK_BAK_DIR="${SDK_DIR}.bak" + +DRIVER_FILE_PATH="" +CUDA_FILE_PATH="" + + +# ================================================= +# Check environment +# ================================================= + +if [ -d "${SDK_DIR}" ]; then + search_cuda_results=`find ${SDK_DIR} -name "*cuda*.run"` + for cuda_file_name in $search_cuda_results; do + CUDA_FILE_PATH="${cuda_file_name}" + done +fi + + +if [ -d "/usr/local/cuda" ]; then + # Found cuda + + # Mapping host cuda to container + cuda_dirs=`find /usr/local -maxdepth 1 -name "cuda*"` + for cuda_dir in $cuda_dirs; do + CONTAINER_MOUNTS="$CONTAINER_MOUNTS -v ${cuda_dir}:${cuda_dir}" + done + + # Blocking install cuda + mkdir -p "${SDK_BAK_DIR}" + if [ -n "${CUDA_FILE_PATH}" ] && [ -f "${CUDA_FILE_PATH}" ]; then + echo "WARN: Move ${CUDA_FILE_PATH} to ${SDK_BAK_DIR}" + mv "${CUDA_FILE_PATH}" "${SDK_BAK_DIR}" + fi +fi + + +# ================================================= +# Export variables +# ================================================= + +export CONTAINER_MOUNTS="${CONTAINER_MOUNTS} -v /dev:/dev -v /usr/src/:/usr/src -v /lib/modules/:/lib/modules --cap-add=ALL" +export SDK_ARGUMENTS="cuda_=-- --silent --toolkit;corex-installer=-- --silent --cudapath=/usr/local/cuda" +export LD_LIBRARY_PATH="/usr/local/corex/lib64" +SYS_ENVS="/root/miniconda/bin:/root/miniconda/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" +export PATH="/usr/local/corex/bin:${SYS_ENVS}" +export CONTAINER_MOUNTS diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/__init__.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/__init__.py new file mode 100644 index 000000000..85d34f0c8 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/__init__.py @@ -0,0 +1,15 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. 
No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +from .attention import * +from .fmha import * +from .mha import * +from .softmax import * \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/attention.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/attention.py new file mode 100644 index 000000000..77876bb1b --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/attention.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from torch import nn + +import apex +from apex.contrib.multihead_attn import SelfMultiheadAttn +from model.models.modeling import jit_dropout_add, BertSelfOutput +from .fmha import FMHA +from .mha import FastUnpadBertSelfAttention + + +#apex.amp.register_half_function(apex.normalization.fused_layer_norm, 'FusedLayerNorm') +import apex.normalization +#apex.amp.register_float_function(apex.normalization.FusedLayerNorm, 'forward') +from apex.contrib.layer_norm import FastLayerNorm as BertLayerNorm + + +# This module uses Apex C++ multihead attention implementation with fusions. 
+class FastBertAttention(nn.Module): + def __init__(self, config): + super(FastBertAttention, self).__init__() + self.multi_head_attention = SelfMultiheadAttn(config.hidden_size, config.num_attention_heads, dropout = config.attention_probs_dropout_prob, bias=True, include_norm_add=False, impl='fast', separate_qkv_params=True, mask_additive=True) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.p = config.hidden_dropout_prob + self.fused_dropout_add = config.fused_dropout_add + self.layer_norm = BertLayerNorm(config.hidden_size, eps=1e-12) + + def forward(self, input_tensor, attention_mask, *args, **kwargs): + residual=input_tensor + multi_head_attention_output,_ = self.multi_head_attention(query = input_tensor, key = input_tensor, value = input_tensor, key_padding_mask=attention_mask, need_weights=True,attn_mask = None, is_training = self.training) + if self.fused_dropout_add: + attention_output = jit_dropout_add(multi_head_attention_output, residual, self.p, self.training) + attention_output = self.layer_norm(attention_output) + return attention_output + else: + attention_output = self.dropout(multi_head_attention_output) + attention_output = self.layer_norm(attention_output + residual) + return attention_output + + +class FastUnpadBertAttention(nn.Module): + def __init__(self, config): + super(FastUnpadBertAttention, self).__init__() + if config.unpad_fmha: + self.self = FMHA(config) + else: + self.self = FastUnpadBertSelfAttention(config, enable_stream=config.enable_stream, enable_sync=False, fuse_mask=config.fuse_mask, fuse_scale=config.fuse_scale, fuse_qkv=config.fuse_qkv, fuse_dropout=config.fuse_dropout, apex_softmax=config.apex_softmax, pad=config.pad) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask, seqlen, batch): + self_output = self.self(input_tensor, attention_mask, seqlen, batch, is_training = self.training) + attention_output = self.output(self_output, input_tensor) + return attention_output \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm1.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm1.py new file mode 100644 index 000000000..bceb8c7a1 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm1.py @@ -0,0 +1,150 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
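+
+# Batched Q*K^T ("BMM1") over unpadded, variable-length sequences.
+# Bmm1/Bmm1Function take separate query/key tensors; Bmm1Strided/Bmm1StridedFunction read Q and K
+# directly from the packed QKV buffer produced by the fused QKV projection. Both dispatch to the
+# ext_ops.FastBmm1* launchers and return the per-sequence score blocks flattened into a single
+# fp16 tensor of length sum_i(seqlen_i^2) * heads.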
+ +import torch +import ext_ops + +########################################################################################### + +class Bmm1Function(torch.autograd.Function): + + @staticmethod + def forward(ctx, batch1, batch2, seqlen, batch, maxseqlen, heads, embed, scale, stream, sync): + ctx.save_for_backward(batch1, batch2, seqlen) + ctx.batch = batch + ctx.maxseqlen = maxseqlen + ctx.heads = heads + ctx.embed = embed + ctx.scale = scale + ctx.sync = sync + ctx.stream = stream + ntokens = seqlen.sum().item() + ctx.ntokens = ntokens + ntokens2 = 0 + for i in range(batch): + ntokens2 += seqlen[i]*seqlen[i] + + output = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16) + ext_ops.FastBmm1Fprop(batch2.flatten().contiguous(), batch1.flatten().contiguous(), output.flatten().contiguous(), batch, seqlen, heads, embed, scale, False, stream, sync) + + return output[:ntokens2*heads] + + @staticmethod + def backward(ctx, grad_output): + + batch1, batch2, seqlen = ctx.saved_tensors + batch = ctx.batch + maxseqlen = ctx.maxseqlen + heads = ctx.heads + embed = ctx.embed + ntokens = ctx.ntokens + + grad_batch1 = torch.empty(ntokens,heads*embed, device="cuda", dtype=torch.float16) + grad_batch2 = torch.empty(ntokens,heads*embed, device="cuda", dtype=torch.float16) + + ext_ops.FastBmm1Dgrad2(batch2.flatten().contiguous(), grad_output.flatten().contiguous(), grad_batch1.flatten().contiguous(), batch, seqlen, heads, embed, ctx.scale, False, ctx.stream, ctx.sync) + ext_ops.FastBmm1Dgrad1(batch1.flatten().contiguous(), grad_output.flatten().contiguous(), grad_batch2.flatten().contiguous(), batch, seqlen, heads, embed, ctx.scale, False, ctx.stream, ctx.sync) + + return grad_batch1[:ntokens], grad_batch2[:ntokens], None, None, None, None, None, None, None, None + +class Bmm1(torch.nn.Module): + def __init__(self, batch, seqlen, heads, embed, scale=False, stream=True, sync=True): + super(Bmm1, self).__init__() + + self.heads = heads + self.embed = embed + self.maxseqlen = seqlen + self.scale = scale + self.sync = sync + self.stream = stream + + def forward(self, batch1, batch2, batch, seqlen): + return Bmm1Function.apply(batch1, batch2, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.scale, self.stream, self.sync) + +########################################################################################## + +class Bmm1StridedFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, mixed, seqlen, batch, maxseqlen, heads, embed, scale, stream, sync, timers): + ctx.save_for_backward(mixed, seqlen) + ctx.batch = batch + ctx.maxseqlen = maxseqlen + ctx.heads = heads + ctx.embed = embed + ctx.scale = scale + ctx.sync = sync + ctx.stream = stream + ctx.timers = timers + ntokens = seqlen.sum().item() + ctx.ntokens = ntokens + ntokens2 = 0 + for i in range(batch): + ntokens2 += seqlen[i]*seqlen[i] + + output = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16) + + if timers: timers['start_fprop'].record() + ext_ops.FastBmm1Fprop(mixed, mixed, output, batch, seqlen, heads, embed, scale, True, stream, sync) + + if timers: timers['stop_fprop'].record() + + return output[:ntokens2*heads], mixed + + @staticmethod + #def backward(ctx, grad_output): + def backward(ctx, grad_output, grad_mixed): + + mixed, seqlen = ctx.saved_tensors + batch = ctx.batch + maxseqlen = ctx.maxseqlen + heads = ctx.heads + embed = ctx.embed + ntokens = ctx.ntokens + + #grad_mixed = torch.empty([ntokens,heads*3*embed], device="cuda", dtype=torch.float16) + + if ctx.timers: 
ctx.timers['start_dgrad'].record() + ext_ops.FastBmm1Dgrad2(mixed, grad_output, grad_mixed, batch, seqlen, heads, embed, ctx.scale, True, ctx.stream, ctx.sync) + if ctx.timers: ctx.timers['stop_dgrad'].record() + if ctx.timers: ctx.timers['start_wgrad'].record() + ext_ops.FastBmm1Dgrad1(mixed, grad_output, grad_mixed, batch, seqlen, heads, embed, ctx.scale, True, ctx.stream, ctx.sync) + if ctx.timers: ctx.timers['stop_wgrad'].record() + #return grad_mixed[:ntokens], None, None, None, None, None, None, None, None, None + return grad_mixed[:ntokens], grad_mixed, None, None, None, None, None, None, None, None, None + +class Bmm1Strided(torch.nn.Module): + def __init__(self, batch, seqlen, heads, embed, scale=True, stream=True, sync=True, timer=False): + super(Bmm1Strided, self).__init__() + + self.heads = heads + self.embed = embed + self.maxseqlen = seqlen + self.scale = scale + self.sync = sync + self.stream = stream + if timer: + self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True), + 'start_dgrad':torch.cuda.Event(enable_timing=True), + 'start_wgrad':torch.cuda.Event(enable_timing=True), + 'stop_fprop':torch.cuda.Event(enable_timing=True), + 'stop_dgrad':torch.cuda.Event(enable_timing=True), + 'stop_wgrad':torch.cuda.Event(enable_timing=True)} + else: + self.timers = None + + def forward(self, mixed, batch, seqlen): + return Bmm1StridedFunction.apply(mixed, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.scale, self.stream, self.sync, self.timers) + +########################################################################################### diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm2.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm2.py new file mode 100644 index 000000000..aa2bdb091 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/bmm2.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
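+
+# Batched probs*V ("BMM2") over unpadded, variable-length sequences.
+# Bmm2/Bmm2Function take a separate value tensor; Bmm2Strided/Bmm2StridedFunction slice V out of
+# the packed QKV buffer. Both dispatch to the ext_ops.FastBmm2* launchers and return the context
+# as a packed [ntokens, heads, embed] fp16 tensor.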
+ +import torch +import ext_ops + +########################################################################################### + +class Bmm2Function(torch.autograd.Function): + + @staticmethod + def forward(ctx, batch1, batch2, seqlen, batch, maxseqlen, heads, embed, sync, stream): + ctx.save_for_backward(batch1, batch2, seqlen) + ctx.batch = batch + ctx.maxseqlen = maxseqlen + ctx.heads = heads + ctx.embed = embed + ctx.stream = stream + ctx.sync = sync + ntokens = seqlen.sum().item() + ctx.ntokens = ntokens + + output = torch.empty([ntokens,heads,embed], device="cuda", dtype=torch.float16) + ext_ops.FastBmm2Fprop(batch2.flatten().contiguous(), batch1.flatten().contiguous(), output, batch, seqlen, heads, embed, False, False, stream, sync) + + return output[:ntokens] + + @staticmethod + def backward(ctx, grad_output): + + batch1, batch2, seqlen = ctx.saved_tensors + batch = ctx.batch + maxseqlen = ctx.maxseqlen + heads = ctx.heads + embed = ctx.embed + ntokens = ctx.ntokens + ntokens2 = 0 + for i in range(batch): + ntokens2 += seqlen[i]*seqlen[i] + + grad_batch1 = torch.empty([ntokens2*heads], device="cuda", dtype=torch.float16) + grad_batch2 = torch.empty([ntokens,heads*embed], device="cuda", dtype=torch.float16) + + ext_ops.FastBmm2Dgrad1(batch2.flatten().contiguous(), grad_output, grad_batch1, batch, seqlen, heads, embed, False, False, ctx.stream, ctx.sync) + ext_ops.FastBmm2Dgrad2(grad_output, batch1, grad_batch2, batch, seqlen, heads, embed, False, False, ctx.stream, ctx.sync) + + return grad_batch1[:ntokens2*heads], grad_batch2[:ntokens], None, None, None, None, None, None, None + +class Bmm2(torch.nn.Module): + def __init__(self, batch, seqlen, heads, embed, stream=True, sync=True): + super(Bmm2, self).__init__() + + self.heads = heads + self.embed = embed + self.maxseqlen = seqlen + self.stream = stream + self.sync = sync + + def forward(self, batch1, batch2, batch, seqlen): + return Bmm2Function.apply(batch1, batch2, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.stream, self.sync) + +########################################################################################### + +class Bmm2StridedFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, batch1, mixed, seqlen, batch, maxseqlen, heads, embed, stream, sync, timers): + ctx.save_for_backward(batch1, mixed, seqlen) + ctx.batch = batch + ctx.maxseqlen = maxseqlen + ctx.heads = heads + ctx.embed = embed + ctx.stream = stream + ctx.sync = sync + ctx.timers = timers + ntokens = seqlen.sum().item() + ctx.ntokens = ntokens + + output = torch.empty([ntokens,heads,embed], device="cuda", dtype=torch.float16) + + if timers: timers['start_fprop'].record() + ext_ops.FastBmm2Fprop(mixed, batch1, output, batch, seqlen, heads, embed, False, True, stream, sync) + if timers: timers['stop_fprop'].record() + + return output[:ntokens] + + @staticmethod + def backward(ctx, grad_output): + + batch1, mixed, seqlen = ctx.saved_tensors + batch = ctx.batch + maxseqlen = ctx.maxseqlen + heads = ctx.heads + embed = ctx.embed + ntokens = ctx.ntokens + ntokens2 = 0 + for i in range(batch): + ntokens2 += seqlen[i]*seqlen[i] + + grad_batch1 = torch.empty(ntokens2*heads, device="cuda", dtype=torch.float16) + grad_mixed = torch.empty([ntokens,heads*3*embed], device="cuda", dtype=torch.float16) + + if ctx.timers: ctx.timers['start_dgrad'].record() + ext_ops.FastBmm2Dgrad1(mixed, grad_output, grad_batch1, batch, seqlen, heads, embed, False, True, ctx.stream, ctx.sync) + if ctx.timers: ctx.timers['stop_dgrad'].record() + if 
ctx.timers: ctx.timers['start_wgrad'].record() + ext_ops.FastBmm2Dgrad2(grad_output, batch1, grad_mixed, batch, seqlen, heads, embed, False, True, ctx.stream, ctx.sync) + if ctx.timers: ctx.timers['stop_wgrad'].record() + return grad_batch1[:ntokens2*heads], grad_mixed[:ntokens], None, None, None, None, None, None, None, None + +class Bmm2Strided(torch.nn.Module): + def __init__(self, batch, seqlen, heads, embed, stream=True, sync=True, timer=False): + super(Bmm2Strided, self).__init__() + + self.heads = heads + self.embed = embed + self.maxseqlen = seqlen + self.stream = stream + self.sync = sync + if timer: + self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True), + 'start_dgrad':torch.cuda.Event(enable_timing=True), + 'start_wgrad':torch.cuda.Event(enable_timing=True), + 'stop_fprop':torch.cuda.Event(enable_timing=True), + 'stop_dgrad':torch.cuda.Event(enable_timing=True), + 'stop_wgrad':torch.cuda.Event(enable_timing=True)} + else: + self.timers = None + + def forward(self, batch1, mixed, batch, seqlen): + return Bmm2StridedFunction.apply(batch1, mixed, seqlen, batch, self.maxseqlen, self.heads, self.embed, self.stream, self.sync, self.timers) + +########################################################################################### diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/fmha.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/fmha.py new file mode 100644 index 000000000..2aeb6a1e6 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/fmha.py @@ -0,0 +1,179 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn.functional as F +from apex.contrib.fmha import FMHAFun +from collections import OrderedDict + +import numpy as np + +class TestParam(torch.nn.Parameter): + def __init__(self, data, requires_grad=True): + super(TestParam, self).__init__() + self.data = data + self.requires_grad = requires_grad + self.tag = 'qkv' + self.counter = 0 + +class NoopCat(torch.autograd.Function): + @staticmethod + def forward(ctx, Wq, Wk, Wv, Bq, Bk, Bv, Wqkv, Bqkv, hidden_size): + assert not Wqkv.requires_grad and not Bqkv.requires_grad, "hye!" 
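+        # Re-view the flat Wqkv/Bqkv buffers as the per-Q/K/V slices (no copy) and return
+        # gradient-tracking aliases of the packed tensors, so the fused projection consumes one
+        # contiguous QKV parameter while Wq/Wk/Wv and Bq/Bk/Bv remain addressable views.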
+ Wtmp = Wqkv.view(3, hidden_size, hidden_size) + Btmp = Bqkv.view(3, hidden_size) + Wq.data = Wtmp[0,:,:] + Wk.data = Wtmp[1,:,:] + Wv.data = Wtmp[2,:,:] + + Bq.data = Btmp[0,:] + Bk.data = Btmp[1,:] + Bv.data = Btmp[2,:] + + Wtmp = Wqkv.new() + Wtmp.set_(Wqkv.storage(), Wqkv.storage_offset(), Wqkv.size(), Wqkv.stride()) + Wtmp.requires_grad = True + + Btmp = Bqkv.new() + Btmp.set_(Bqkv.storage(), Bqkv.storage_offset(), Bqkv.size(), Bqkv.stride()) + Btmp.requires_grad = True + ctx.save_for_backward(Wqkv, Bqkv, Wq, Wk, Wv, Bq, Bk, Bv) + ctx.hidden_size = hidden_size + return Wtmp, Btmp + + @staticmethod + def backward(ctx, dWqkv, dBqkv): + Wqkv, Bqkv, Wq, Wk, Wv, Bq, Bk, Bv = ctx.saved_tensors + Wtmp = Wqkv.view(3, ctx.hidden_size, ctx.hidden_size) + Btmp = Bqkv.view(3, ctx.hidden_size) + Wq.data = Wtmp[0,:,:] + Wk.data = Wtmp[1,:,:] + Wv.data = Wtmp[2,:,:] + + Bq.data = Btmp[0,:] + Bk.data = Btmp[1,:] + Bv.data = Btmp[2,:] + + dWtmp = dWqkv.view(3, ctx.hidden_size, ctx.hidden_size) + dBtmp = dBqkv.view(3, ctx.hidden_size) + return dWtmp[0, :,:], dWtmp[1, :,:], dWtmp[2, :,:], dBtmp[0,:], dBtmp[1,:], dBtmp[2,:], None, None, None + +class FMHA(torch.nn.Module): + + def __init__(self, config): + + super(FMHA, self).__init__() + + self.p_dropout = config.attention_probs_dropout_prob + self.h = config.num_attention_heads + self.hidden_size = config.hidden_size + self.d = self.hidden_size // self.h + assert self.d * self.h == self.hidden_size, "Invalid hidden size/num_heads" + + self.register_buffer("Wqkv",torch.zeros(3 * config.hidden_size, config.hidden_size)) + self.register_buffer("Bqkv",torch.zeros(3 * config.hidden_size)) + self.Wqkv.requires_grad = False + self.Bqkv.requires_grad = False + self.Wqkv.detach() + self.Bqkv.detach() + with torch.no_grad(): + params = [] + Wtmp = self.Wqkv.view(3, self.hidden_size, self.hidden_size) + Btmp = self.Bqkv.view(3, self.hidden_size) + for tag, idx in zip('qkv', range(3)): + params.append(('W' + tag, torch.nn.Parameter(Wtmp[idx,:,:]))) + params.append(('B' + tag, torch.nn.Parameter(Btmp[idx,:]))) + + self.param_views = OrderedDict(params) + + self._reset_param_views() + + def prep_weights(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): + Wq = state_dict.pop(prefix + 'query.weight') + bq = state_dict.pop(prefix + 'query.bias') + + Wk = state_dict.pop(prefix + 'key.weight') + bk = state_dict.pop(prefix + 'key.bias') + + Wv = state_dict.pop(prefix + 'value.weight') + bv = state_dict.pop(prefix + 'value.bias') + + weight = torch.cat([Wq.view(self.h, self.d, self.hidden_size), + Wk.view(self.h, self.d, self.hidden_size), + Wv.view(self.h, self.d, self.hidden_size)], + dim=0).reshape(config.hidden_size*3,config.hidden_size).contiguous() + + bias = torch.cat([bq.view(self.h, self.d), + bk.view(self.h, self.d), + bv.view(self.h, self.d)], + dim=0).reshape(3*config.hidden_size).contiguous() + + state_dict[prefix + 'Wqkv'] = weight + state_dict[prefix + 'Bqkv'] = bias + state_dict[prefix + 'Wq'] = Wq + state_dict[prefix + 'Wk'] = Wk + state_dict[prefix + 'Wv'] = Wv + state_dict[prefix + 'Bq'] = bq + state_dict[prefix + 'Bk'] = bk + state_dict[prefix + 'Bv'] = bv + + self._register_load_state_dict_pre_hook(prep_weights) + + def _reset_param_views(self): + with torch.no_grad(): + Wtmp = self.Wqkv.view(3, self.hidden_size, self.hidden_size) + Btmp = self.Bqkv.view(3, self.hidden_size) + + for tag, idx in zip('qkv', range(3)): + self.param_views['W' + tag].data = Wtmp[idx, :, :] + self.param_views['B' + tag].data = Btmp[idx, 
:] + + def _apply(self, fn): + + with torch.no_grad(): + self.Wqkv = fn(self.Wqkv) + + if self.Wqkv.grad is not None: + self.Wqkv.grad = fn(self.Wqkv.grad) + + self.Bqkv = fn(self.Bqkv) + + if self.Bqkv.grad is not None: + self.Bqkv.grad = fn(self.Bqkv.grad) + + self._reset_param_views() + + @property + def _parameters(self): + self._reset_param_views() + return self.param_views + + @_parameters.setter + def _parameters(self, _): + if 'Wqkv' in self.__dict__ and self.Wqkv is not None and self.Wqkv.device == torch.device('cuda:0'): + import traceback + traceback.print_stack() + pass + + def forward(self, hidden_states, cu_seqlens, seqlens, max_s, is_training=True): + + Wqkv, Bqkv = NoopCat.apply(*[self.param_views[x + y] for x in 'WB' for y in 'qkv'], self.Wqkv, self.Bqkv, self.hidden_size) + qkv = F.linear(hidden_states, Wqkv, Bqkv) + p_dropout = self.p_dropout + + ctx = FMHAFun.apply(qkv.view(-1, 3, self.h, self.d), cu_seqlens, seqlens, p_dropout, max_s, is_training) + + return ctx.view(-1, self.hidden_size) diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/mha.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/mha.py new file mode 100644 index 000000000..0e3f4f5d2 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/mha.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
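+
+# FastUnpadBertSelfAttention: self-attention over unpadded (packed) token streams, assembled from
+# the Bmm1/Bmm2 launchers and the fused softmax/dropout kernels in this package. The constructor
+# flags (fuse_qkv, fuse_scale, fuse_mask, fuse_dropout, apex_softmax, pad) choose how much of the
+# QKV projection, masking, softmax and dropout runs in the custom kernels versus plain PyTorch ops.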
+ +import torch.nn as nn +import torch.nn.functional as F + +from .bmm1 import * +from .bmm2 import * +from model.layers.padding import * +from .softmax import * + +class FastUnpadBertSelfAttention(nn.Module): + def __init__(self, config, enable_stream=True, enable_sync=True, fuse_mask=True, fuse_scale=True, fuse_qkv=True, fuse_dropout=True, apex_softmax=True, pad=True): + super(FastUnpadBertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.hidden_size = config.hidden_size + + self.fuse_qkv = fuse_qkv + self.fuse_scale = fuse_scale + self.fuse_mask = fuse_mask + self.fuse_dropout = fuse_dropout + self.apex_softmax = apex_softmax + self.pad = pad + self.enable_stream = enable_stream + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + if self.fuse_qkv: + self.bmm1 = Bmm1Strided(None,None,self.num_attention_heads,self.attention_head_size, scale=self.fuse_scale, stream=enable_stream, sync=enable_sync, timer=False) + self.bmm2 = Bmm2Strided(None,None,self.num_attention_heads,self.attention_head_size, stream=enable_stream, sync=enable_sync, timer=False) + else: + self.bmm1 = Bmm1(None,None,self.num_attention_heads,self.attention_head_size, scale=self.fuse_scale, stream=enable_stream, sync=enable_sync) + self.bmm2 = Bmm2(None,None,self.num_attention_heads,self.attention_head_size, stream=enable_stream, sync=enable_sync) + + if self.fuse_dropout == False: + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + if self.fuse_mask == True and self.fuse_dropout == True: + self.softmax = FastMaskSoftmaxDropout(dim=-1, dropout_prob=config.attention_probs_dropout_prob,stream=enable_stream, sync=(not self.pad), timer=False) + elif self.fuse_mask == True: + self.softmax = FastMaskSoftmax(dim=-1, stream=enable_stream, sync=enable_sync, timer=False) + else: + self.softmax = FastSoftmax(dim=-1, stream=enable_stream, sync=enable_sync, timer=False) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = torch.reshape(x, new_x_shape) + return x.permute(0, 2, 1, 3) + + def transpose_key_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = torch.reshape(x, new_x_shape) + return x.permute(0, 2, 3, 1) + + def pytorch_softmax(self,attention_scores, batch, seqlen, heads): + ntokens2 = 0 + for i in range(batch): + ntokens2 += seqlen[i]*seqlen[i]*self.num_attention_heads + attention_probs = torch.zeros(ntokens2, device="cuda", dtype=torch.float16) + ntokens2 = 0 + for i in range(batch): + tokens2 = seqlen[i]*seqlen[i]*self.num_attention_heads + attention_probs[ntokens2:ntokens2+tokens2] = F.softmax(attention_scores[ntokens2:ntokens2+tokens2].view(1,self.num_attention_heads,seqlen[i],seqlen[i]), dim=-1).flatten().contiguous() + ntokens2 += tokens2 + return attention_probs + + def forward(self, hidden_states, attention_mask, seqlen, batch, is_training=True): + + self.batch = batch + + # QKV + if self.fuse_qkv: + weight = 
torch.cat([self.query.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size), self.key.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size), self.value.weight.view(self.num_attention_heads,self.attention_head_size,1,self.hidden_size)], dim=1).reshape(self.all_head_size*3,self.hidden_size).contiguous() + bias = torch.cat([self.query.bias.view(self.num_attention_heads,1,self.attention_head_size), self.key.bias.view(self.num_attention_heads,1,self.attention_head_size), self.value.bias.view(self.num_attention_heads,1,self.attention_head_size)],dim=1).reshape(3*self.hidden_size).contiguous() + mixed_x_layer = torch.addmm(bias, hidden_states, weight.t()) + else: + query_layer = self.query(hidden_states) + key_layer = self.key(hidden_states) + value_layer = self.value(hidden_states) + + # BMM1. + if self.enable_stream: torch.cuda.synchronize() + if self.fuse_qkv: + attention_scores, qkv_layer = self.bmm1(mixed_x_layer, self.batch, seqlen) + else: + attention_scores = self.bmm1(query_layer, key_layer, self.batch, seqlen) + + if self.enable_stream: torch.cuda.synchronize() + if self.fuse_scale == False: + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Softmax. + if self.enable_stream: torch.cuda.synchronize() + if self.fuse_mask ==True and self.fuse_dropout == True: + attention_probs = self.softmax(attention_scores, attention_mask, self.batch, seqlen, self.num_attention_heads, is_training) + elif self.fuse_mask == True: + attention_probs = self.softmax(attention_scores, attention_mask, self.batch, seqlen, self.num_attention_heads) + else: + attention_scores = attention_scores + attention_mask.view(-1) + if self.apex_softmax == True: + attention_probs = self.softmax(attention_scores, self.batch, seqlen, self.num_attention_heads) + else: + if self.pad == True: + attention_probs = F.softmax(attention_scores.view(batch,self.num_attention_heads,seqlen[0],seqlen[0]), dim=-1).flatten().contiguous() + else: + attention_probs = self.pytorch_softmax(attention_scores, self.batch, seqlen, self.num_attention_heads) + + # Dropout. + if self.enable_stream: torch.cuda.synchronize() + if self.fuse_dropout == False: + attention_probs = self.dropout(attention_probs) + + # BMM2. 
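+        # context = attention_probs @ V; in the fused-QKV path the strided BMM2 kernel slices V
+        # out of qkv_layer, otherwise the separately projected value_layer is used.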
+ if self.enable_stream: torch.cuda.synchronize() + if self.fuse_qkv: + context_layer = self.bmm2(attention_probs, qkv_layer, self.batch, seqlen) + else: + context_layer = self.bmm2(attention_probs, value_layer, self.batch, seqlen) + + if self.enable_stream: torch.cuda.synchronize() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = torch.reshape(context_layer, new_context_layer_shape) + return context_layer diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/softmax.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/softmax.py new file mode 100644 index 000000000..d378d71b5 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/layers/softmax.py @@ -0,0 +1,149 @@ +import torch +import ext_ops + +########################################################################################### + +class FastSoftmaxFunction(torch.autograd.Function): + @staticmethod + def forward(cxt, input, dim, batch, seqlen, heads, stream, sync, timers): + if timers: timers['start_fprop'].record() + ext_ops.FastSoftmaxFprop(input, batch, seqlen, heads, stream, sync) + if timers: timers['stop_fprop'].record() + + cxt.save_for_backward(input,seqlen) + cxt.dim = dim + cxt.batch = batch + cxt.heads = heads + cxt.stream = stream + cxt.sync = sync + cxt.timers = timers + return input + + @staticmethod + def backward(cxt, grad_output): + output, seqlen, = cxt.saved_tensors + dim = cxt.dim + batch = cxt.batch + heads = cxt.heads + + if cxt.timers: cxt.timers['start_dgrad'].record() + ext_ops.FastSoftmaxBprop(output, grad_output, batch, seqlen, heads, cxt.stream, cxt.sync) + if cxt.timers: cxt.timers['stop_dgrad'].record() + return grad_output, None, None, None, None, None, None, None + +class FastSoftmax(torch.nn.Module): + def __init__(self, dim=None, stream=True, sync=True, timer=False): + super(FastSoftmax, self).__init__() + self.dim = dim + self.stream = stream + self.sync = sync + if timer: + self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True), + 'start_dgrad':torch.cuda.Event(enable_timing=True), + 'stop_fprop':torch.cuda.Event(enable_timing=True), + 'stop_dgrad':torch.cuda.Event(enable_timing=True)} + else: + self.timers = None + + def forward(self, input, batch, seqlen, heads): + return FastSoftmaxFunction.apply(input, self.dim, batch, seqlen, heads, self.stream, self.sync, self.timers) + +########################################################################################### + +class FastMaskSoftmaxFunction(torch.autograd.Function): + @staticmethod + def forward(cxt, input, mask, dim, batch, seqlen, heads, stream, sync, timers): + if timers: timers['start_fprop'].record() + ext_ops.FastMaskSoftmaxFprop(input, mask, batch, seqlen, heads, stream, sync) + if timers: timers['stop_fprop'].record() + + cxt.save_for_backward(input,seqlen) + cxt.dim = dim + cxt.batch = batch + cxt.heads = heads + cxt.stream = stream + cxt.sync = sync + cxt.timers = timers + return input + + @staticmethod + def backward(cxt, grad_output): + output, seqlen, = cxt.saved_tensors + dim = cxt.dim + batch = cxt.batch + heads = cxt.heads + + if cxt.timers: cxt.timers['start_dgrad'].record() + ext_ops.FastSoftmaxBprop(output, grad_output, batch, seqlen, heads, cxt.stream, cxt.sync) + if cxt.timers: cxt.timers['stop_dgrad'].record() + return grad_output, None, None, None, None, None, None, None, None, None, None, None + +class FastMaskSoftmax(torch.nn.Module): + def __init__(self, dim=None, stream=True, sync=True, 
timer=False): + super(FastMaskSoftmax, self).__init__() + self.dim = dim + self.stream = stream + self.sync = sync + if timer: + self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True), + 'start_dgrad':torch.cuda.Event(enable_timing=True), + 'stop_fprop':torch.cuda.Event(enable_timing=True), + 'stop_dgrad':torch.cuda.Event(enable_timing=True)} + else: + self.timers = None + + def forward(self, input, mask, batch, seqlen, heads): + return FastMaskSoftmaxFunction.apply(input, mask, self.dim, batch, seqlen, heads, self.stream, self.sync, self.timers) + +########################################################################################### + +class FastMaskSoftmaxDropoutFunction(torch.autograd.Function): + @staticmethod + def forward(cxt, input, mask, dim, batch, seqlen, heads, dropout_prob, stream, sync, timers, is_training): + if timers: timers['start_fprop'].record() + output, dropout_mask, = ext_ops.FastMaskSoftmaxDropoutFprop(input, mask, batch, seqlen, heads, dropout_prob, stream, sync, is_training) + if timers: timers['stop_fprop'].record() + + cxt.save_for_backward(input,dropout_mask,seqlen) + cxt.dim = dim + cxt.batch = batch + cxt.heads = heads + cxt.dropout_prob = dropout_prob + cxt.stream = stream + cxt.sync = sync + cxt.timers = timers + return output + + @staticmethod + def backward(cxt, grad_output): + output, dropout_mask, seqlen, = cxt.saved_tensors + dim = cxt.dim + batch = cxt.batch + heads = cxt.heads + dropout_prob = cxt.dropout_prob + + if cxt.timers: cxt.timers['start_dgrad'].record() + ext_ops.FastMaskSoftmaxDropoutBprop(output, grad_output, dropout_mask, batch, seqlen, heads, dropout_prob, cxt.stream, cxt.sync) + if cxt.timers: cxt.timers['stop_dgrad'].record() + return grad_output, None, None, None, None, None, None, None, None, None, None, None, None, None + +class FastMaskSoftmaxDropout(torch.nn.Module): + def __init__(self, dim=None, dropout_prob=None, stream=True, sync=True, timer=False): + super(FastMaskSoftmaxDropout, self).__init__() + self.dim = dim + self.dropout_prob = dropout_prob + self.stream = stream + self.sync = sync + if timer: + self.timers = {'start_fprop':torch.cuda.Event(enable_timing=True), + 'start_dgrad':torch.cuda.Event(enable_timing=True), + 'stop_fprop':torch.cuda.Event(enable_timing=True), + 'stop_dgrad':torch.cuda.Event(enable_timing=True)} + else: + self.timers = None + + def forward(self, input, mask, batch, seqlen, heads, is_training): + return FastMaskSoftmaxDropoutFunction.apply(input, mask, self.dim, batch, seqlen, heads, self.dropout_prob, self.stream, self.sync, self.timers, is_training) + +########################################################################################### + diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/config/training_event.py b/nlp/language_model/bert_sample/pytorch/iluvatar/config/training_event.py new file mode 100644 index 000000000..d9001d859 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/config/training_event.py @@ -0,0 +1,243 @@ +# /*************************************************************************************************** +# * Copyright (c) 2022 Iluvatar CoreX. All rights reserved. +# * Copyright Declaration: This software, including all of its code and documentation, +# * except for the third-party software it contains, is a copyrighted work of Shanghai Iluvatar CoreX +# * Semiconductor Co., Ltd. 
and its affiliates ("Iluvatar CoreX") in accordance with the PRC Copyright +# * Law and relevant international treaties, and all rights contained therein are enjoyed by Iluvatar +# * CoreX. No user of this software shall have any right, ownership or interest in this software and +# * any use of this software shall be in compliance with the terms and conditions of the End User +# * License Agreement. +# **************************************************************************************************/ + +import os +from typing import Tuple + +import torch +import torch.distributed as dist + +import amp_C +import apex_C +from apex import amp +from apex.amp import _amp_state +from apex.contrib.optimizers.distributed_fused_lamb import DistributedFusedLAMB +from apex.optimizers import FusedLAMB +from apex.parallel import DistributedDataParallel as APEX_DDP +from apex.parallel.distributed import flat_dist_call +from torch.cuda.amp import GradScaler +from torch.nn.parallel import DistributedDataParallel as NativeDDP +from torch.optim import Optimizer + +import utils + +from train.event.base import BaseTrainingEventInterface +from train.event.base import BatchType, BERT_MODEL + +from converter import convert_model +from distributed_fused_lamb import _pipeline_block_reductions_patched, _pipeline_step_patched + + +class ApexTrainingEvent(BaseTrainingEventInterface): + + def __init__(self, config): + super(ApexTrainingEvent, self).__init__(config) + self.num_iters_per_dataloader = 1 + + self.optimizer = None + + self.overflow_buf = None + if config.allreduce_post_accumulation: + self.overflow_buf = torch.cuda.IntTensor([0]) + + def on_init_start(self): + if self.config.unpad: + import ext_ops + torch.cuda.synchronize() + ext_ops.InitMHACUDAExtension() + torch.cuda.synchronize() + print("Init unpad") + + def convert_model(self, model: BERT_MODEL) -> BERT_MODEL: + return convert_model(model, self.config) + + def create_optimizer(self, model: BERT_MODEL) -> Optimizer: + config = self.config + param_optimizer = list(model.named_parameters()) + + no_decay = ['bias', 'gamma', 'beta', 'LayerNorm'] + + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], + 'weight_decay': self.config.weight_decay_rate}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] + + if config.distributed_lamb: + DistributedFusedLAMB._pipeline_block_reductions = _pipeline_block_reductions_patched + DistributedFusedLAMB._pipeline_step = _pipeline_step_patched + optimizer = DistributedFusedLAMB(optimizer_grouped_parameters, lr=config.learning_rate, + betas=(config.opt_lamb_beta_1, config.opt_lamb_beta_2), + eps=1e-6, + max_grad_norm=1.0, + overlap_reductions=config.dwu_overlap_reductions, + dwu_group_size=config.dwu_group_size, + dwu_num_blocks=config.dwu_num_blocks, + dwu_num_chunks=config.dwu_num_chunks, + dwu_num_rs_pg=config.dwu_num_rs_pg, + dwu_num_ar_pg=config.dwu_num_ar_pg, + dwu_num_ag_pg=config.dwu_num_ag_pg, + use_nvlamb=False, + e5m2_allgather=config.dwu_e5m2_allgather) + optimizer.set_global_scale(float(os.getenv("INIT_LOSS_SCALE", 2 ** 20))) + else: + optimizer = FusedLAMB(optimizer_grouped_parameters, + lr=config.learning_rate, + betas=(config.opt_lamb_beta_1, config.opt_lamb_beta_2)) + + return optimizer + + def model_to_fp16(self, model: BERT_MODEL, optimizer: Optimizer) -> Tuple[BERT_MODEL, Optimizer]: + self.optimizer = optimizer + config = self.config + if config.fp16 and config.bypass_amp: + 
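+            # bypass_amp: run pure fp16 (weights cast to half) without apex loss scaling; the
+            # branches below instead use the GradScaler (distributed LAMB) or apex amp.initialize
+            # with an opt_level-dependent loss scale.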
model.half() + + if config.fp16 and not config.bypass_amp: + if config.distributed_lamb: + model.half() + elif config.fp16: + if config.loss_scale == 0: + if config.opt_level == 'O0': + loss_scale = '1.0' + master_weights = False + elif config.opt_level == 'O1': + loss_scale = 'dynamic' + master_weights = None + else: + loss_scale = 'dynamic' + master_weights = True + model, optimizer = amp.initialize(model, optimizer, opt_level=config.opt_level, + loss_scale=loss_scale, + master_weights=master_weights) + else: + # assert False, "code path not tested with cuda graphs" + model, optimizer = amp.initialize(model, optimizer, opt_level=config.opt_level, + loss_scale=config.loss_scale) + amp._amp_state.loss_scalers[0]._loss_scale = float(os.getenv("INIT_LOSS_SCALE", 2 ** 20)) + return model, optimizer + + def model_to_ddp(self, model: BERT_MODEL) -> BERT_MODEL: + config = self.config + use_ddp = dist.is_initialized() + if use_ddp and not config.distributed_lamb and not config.allreduce_post_accumulation: + if config.ddp_type == 'native': + model = NativeDDP(model, + device_ids=[config.local_rank], + bucket_cap_mb=100, + gradient_as_bucket_view=config.use_gradient_as_bucket_view) + elif config.ddp_type == 'apex': + model = APEX_DDP(model, + message_size=250000000, + delay_allreduce=True, + gradient_predivide_factor=torch.distributed.get_world_size()) + else: + assert False, "Invalid DDP type" + + if use_ddp and config.distributed_lamb: + flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,)) + + return model + + def on_step_begin(self, step: int): + update_step = step % self.config.gradient_accumulation_steps == 0 + if self.config.distributed_lamb: + # TODO: 仅适用于8卡,需针对该参数调整 + num_iters = 2835 + self.optimizer.set_last_step((step % num_iters) == 0) + self.optimizer.set_is_accumulation_step(not update_step) + + def on_step_end(self, step: int): + if self.config.distributed_lamb: + self.optimizer._lr = torch.tensor([self.optimizer.param_groups[0]['lr']]).to(self.config.device) + + def on_backward(self, step: int, loss: torch.Tensor, optimizer: Optimizer, grad_scaler: GradScaler=None): + + loss = loss / self.config.gradient_accumulation_steps + + if self.config.bypass_amp: + loss.backward() + elif self.config.distributed_lamb: + optimizer._lazy_init_stage1() + grad_scaler.scale(loss).backward() + optimizer._lazy_init_stage2() + else: + with amp.scale_loss(loss, optimizer, delay_overflow_check=self.config.allreduce_post_accumulation) as scaled_loss: + scaled_loss.backward() + + update_step = step % self.config.gradient_accumulation_steps == 0 + if update_step: + self.update_model_params(loss, optimizer, grad_scaler) + + def update_model_params(self, loss, optimizer: Optimizer, grad_scaler: GradScaler=None): + config = self.config + if config.allreduce_post_accumulation and config.use_cuda_graph: + assert False, "code path not tested with cuda graphs" + if config.distributed_lamb: + optimizer.set_global_scale(grad_scaler._get_scale_async()) + optimizer.complete_reductions() + grad_scaler.step(optimizer) + grad_scaler.update() + + found_inf = optimizer._overflow_buf # GPU tensor + + elif config.allreduce_post_accumulation: + # manually allreduce gradients after all accumulation steps + # check for Inf/NaN + # 1. 
allocate an uninitialized buffer for flattened gradient + # torch.nn.utils.clip_grad_norm_(parameters=amp.master_params(optimizer), max_norm=1.0, norm_type=2.0) + scaler = _amp_state.loss_scalers[0] + master_grads = [p.grad for p in amp.master_params(optimizer) if p.grad is not None] + flat_grad_size = sum(p.numel() for p in master_grads) + allreduce_dtype = torch.float16 if config.allreduce_post_accumulation_fp16 else torch.float32 + flat_raw = torch.empty(flat_grad_size, device='cuda', dtype=allreduce_dtype) + # 2. combine unflattening and predivision of unscaled 'raw' gradient + allreduced_views = apex_C.unflatten(flat_raw, master_grads) + self.overflow_buf.zero_() + amp_C.multi_tensor_scale(65536, + self.overflow_buf, + [master_grads, allreduced_views], + scaler.loss_scale() / ( + torch.distributed.get_world_size() * config.gradient_accumulation_steps)) + # 3. sum gradient across ranks. Because of the predivision, this averages the gradient + torch.distributed.all_reduce(flat_raw) + # 4. combine unscaling and unflattening of allreduced gradient + self.overflow_buf.zero_() + amp_C.multi_tensor_scale(65536, + self.overflow_buf, + [allreduced_views, master_grads], + 1. / scaler.loss_scale()) + # 5. update loss scale + scaler = _amp_state.loss_scalers[0] + old_overflow_buf = scaler._overflow_buf + scaler._overflow_buf = self.overflow_buf + had_overflow = scaler.update_scale() + scaler._overflow_buf = old_overflow_buf + # 6. call optimizer step function + if had_overflow == 0: + optimizer.step() + else: + # Overflow detected, print message and clear gradients + if utils.is_main_process(): + print("Overflow detected, reduced loss_scaler to %f" % (scaler.loss_scale())) + if _amp_state.opt_properties.master_weights: + for param in optimizer._amp_stash.all_fp32_from_fp16_params: + param.grad = None + else: + optimizer.step() + + optimizer.zero_grad() + + def _prepare_dist_lamb(self, model): + if self.config.local_rank != -1: + flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,)) + + + diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/csrc/mha_funcs.cu b/nlp/language_model/bert_sample/pytorch/iluvatar/csrc/mha_funcs.cu new file mode 100644 index 000000000..f0508ee19 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/csrc/mha_funcs.cu @@ -0,0 +1,405 @@ +/* + Copyright (c) 2022 Iluvatar CoreX. All rights reserved. + Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include +#include + +#include +#include +#include +#include +#include +//#include +#include +#include +#include + +#define nstreams 16 + +// global variables. 
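+// One shared cuBLAS handle plus a pool of `nstreams` CUDA streams. With enable_stream each
+// FastBmm* launcher below issues one cublasGemmStridedBatchedEx per sequence, round-robined over
+// the streams so variable-length sequences overlap (optionally synchronizing every stream before
+// returning when `sync` is set); otherwise a single batched call on the current CUDA stream covers
+// the whole batch. The `strided` flag makes the launchers read Q, K and V from one packed QKV
+// buffer instead of separate tensors.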
+cudaStream_t stream[nstreams]; +cublasHandle_t handle; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +void FastBmm1Fprop_(torch::Tensor &A, + torch::Tensor &B, + torch::Tensor &C, + int batch, + torch::Tensor &seq_len, + int heads, + int embed, + bool scale, + bool strided, + bool enable_stream, + bool sync) +{ + + float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast(embed)); + + int *seqlen = static_cast(seq_len.data_ptr()); + + void *ptrA = static_cast(static_cast(A.data_ptr()) + (strided ? embed : 0)); // key + void *ptrB = static_cast(static_cast(B.data_ptr())); // query + void *ptrC = static_cast(static_cast(C.data_ptr())); // output + + for(int i = 0; i < (enable_stream ? batch : 1); i++) { + cublasSetStream(handle, enable_stream ? stream[i%nstreams]: at::cuda::getCurrentCUDAStream()); + cublasGemmStridedBatchedEx(handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + seqlen[i], + seqlen[i], + embed, + static_cast(scale ? &alpha : &one), + ptrA, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + ptrB, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + static_cast(&zero), + ptrC, + CUDA_R_16F, + seqlen[i], + seqlen[i]*seqlen[i], + enable_stream ? heads : batch*heads, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + ptrA = static_cast(static_cast(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + ptrB = static_cast(static_cast(ptrB) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + ptrC = static_cast(static_cast(ptrC) + heads*seqlen[i]*seqlen[i]); + } + for(int i = 0; i < (enable_stream ? nstreams : 0); i++) { + if(sync) cudaStreamSynchronize(stream[i]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void FastBmm2Fprop_(torch::Tensor &A, + torch::Tensor &B, + torch::Tensor &C, + int batch, + torch::Tensor &seq_len, + int heads, + int embed, + bool scale, + bool strided, + bool enable_stream, + bool sync) +{ + + float one = 1.0, zero = 0.0; + + int *seqlen = static_cast(seq_len.data_ptr()); + + void *ptrA = static_cast(static_cast(A.data_ptr()) + (strided ? 2*embed : 0)); // value + void *ptrB = static_cast(static_cast(B.data_ptr())); // query*key + void *ptrC = static_cast(static_cast(C.data_ptr())); // output + + for(int i = 0; i < (enable_stream ? batch : 1); i++) { + cublasSetStream(handle, enable_stream ? stream[i%nstreams]: at::cuda::getCurrentCUDAStream()); + cublasGemmStridedBatchedEx(handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + embed, + seqlen[i], + seqlen[i], + static_cast(&one), + ptrA, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + ptrB, + CUDA_R_16F, + seqlen[i], + seqlen[i]*seqlen[i], + static_cast(&zero), + ptrC, + CUDA_R_16F, + enable_stream ? heads*embed : batch*heads*embed, + embed, + enable_stream ? heads : batch*heads, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + ptrA = static_cast(static_cast(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + ptrB = static_cast(static_cast(ptrB) + heads*seqlen[i]*seqlen[i]); + ptrC = static_cast(static_cast(ptrC) + seqlen[i]*heads*embed); + + } + for(int i = 0; i < (enable_stream ? 
nstreams : 0); i++) { + if(sync) cudaStreamSynchronize(stream[i]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void FastBmm1Dgrad1_(torch::Tensor &A, + torch::Tensor &B, + torch::Tensor &C, + int batch, + torch::Tensor &seq_len, + int heads, + int embed, + bool scale, + bool strided, + bool enable_stream, + bool sync) +{ + + float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast(embed)); + + int *seqlen = static_cast(seq_len.data_ptr()); + + void *ptrA = static_cast(static_cast(A.data_ptr())); // query + void *ptrB = static_cast(static_cast(B.data_ptr())); + void *ptrC = static_cast(static_cast(C.data_ptr()) + (strided ? embed : 0)); // grad_key + + for(int i = 0; i < (enable_stream ? batch : 1); i++) { + cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream()); + cublasGemmStridedBatchedEx(handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + embed, + seqlen[i], + seqlen[i], + static_cast(scale ? &alpha : &one), + ptrA, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + ptrB, + CUDA_R_16F, + seqlen[i], + seqlen[i]*seqlen[i], + static_cast(&zero), + ptrC, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + enable_stream ? heads : heads*batch, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + ptrA = static_cast(static_cast(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + ptrB = static_cast(static_cast(ptrB) + heads*seqlen[i]*seqlen[i]); + ptrC = static_cast(static_cast(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + + } + for(int i = 0; i < (enable_stream ? nstreams : 0); i++) { + if(sync) cudaStreamSynchronize(stream[i]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void FastBmm2Dgrad1_(torch::Tensor &A, + torch::Tensor &B, + torch::Tensor &C, + int batch, + torch::Tensor &seq_len, + int heads, + int embed, + bool scale, + bool strided, + bool enable_stream, + bool sync) +{ + + float one = 1.0, zero = 0.0; + + int *seqlen = static_cast(seq_len.data_ptr()); + + void *ptrA = static_cast(static_cast(A.data_ptr()) + (strided ? 2*embed : 0)); // value + void *ptrB = static_cast(static_cast(B.data_ptr())); + void *ptrC = static_cast(static_cast(C.data_ptr())); + + for(int i = 0; i < (enable_stream ? batch : 1); i++) { + cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream()); + cublasGemmStridedBatchedEx(handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + seqlen[i], + seqlen[i], + embed, + static_cast(&one), + ptrA, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + ptrB, + CUDA_R_16F, + enable_stream ? heads*embed : batch*heads*embed, + embed, + static_cast(&zero), + ptrC, + CUDA_R_16F, + seqlen[i], + seqlen[i]*seqlen[i], + enable_stream ? heads : batch*heads, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + ptrA = static_cast(static_cast(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + ptrB = static_cast(static_cast(ptrB) + seqlen[i]*heads*embed); + ptrC = static_cast(static_cast(ptrC) + heads*seqlen[i]*seqlen[i]); + + } + for(int i = 0; i < (enable_stream ? 
nstreams : 0); i++) { + if(sync) cudaStreamSynchronize(stream[i]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void FastBmm1Dgrad2_(torch::Tensor &A, + torch::Tensor &B, + torch::Tensor &C, + int batch, + torch::Tensor &seq_len, + int heads, + int embed, + bool scale, + bool strided, + bool enable_stream, + bool sync) +{ + + float one = 1.0, zero = 0.0, alpha = 1.0 / sqrt(static_cast(embed)); + + int *seqlen = static_cast(seq_len.data_ptr()); + + void *ptrA = static_cast(static_cast(A.data_ptr()) + (strided ? embed : 0)); // key + void *ptrB = static_cast(static_cast(B.data_ptr())); + void *ptrC = static_cast(static_cast(C.data_ptr())); // grad query + + for(int i = 0; i < (enable_stream ? batch : 1); i++) { + cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream()); + cublasGemmStridedBatchedEx(handle, + CUBLAS_OP_N, + CUBLAS_OP_N, + embed, + seqlen[i], + seqlen[i], + static_cast(scale ? &alpha : &one), + ptrA, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + ptrB, + CUDA_R_16F, + seqlen[i], + seqlen[i]*seqlen[i], + static_cast(&zero), + ptrC, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + enable_stream ? heads : batch*heads, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + ptrA = static_cast(static_cast(ptrA) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + ptrB = static_cast(static_cast(ptrB) + heads*seqlen[i]*seqlen[i]); + ptrC = static_cast(static_cast(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + + } + for(int i = 0; i < (enable_stream ? nstreams : 0); i++) { + if(sync) cudaStreamSynchronize(stream[i]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void FastBmm2Dgrad2_(torch::Tensor &A, + torch::Tensor &B, + torch::Tensor &C, + int batch, + torch::Tensor &seq_len, + int heads, + int embed, + bool scale, + bool strided, + bool enable_stream, + bool sync) +{ + + float one = 1.0, zero = 0.0; + + int *seqlen = static_cast(seq_len.data_ptr()); + + void *ptrA = static_cast(static_cast(A.data_ptr())); + void *ptrB = static_cast(static_cast(B.data_ptr())); + void *ptrC = static_cast(static_cast(C.data_ptr()) + (strided ? 2*embed : 0)); // grad-value + + for(int i = 0; i < (enable_stream ? batch : 1); i++) { + cublasSetStream(handle, enable_stream ? stream[i%nstreams] : at::cuda::getCurrentCUDAStream()); + cublasGemmStridedBatchedEx(handle, + CUBLAS_OP_N, + CUBLAS_OP_T, + embed, + seqlen[i], + seqlen[i], + static_cast(&one), + ptrA, + CUDA_R_16F, + enable_stream ? heads*embed : batch*heads*embed, + embed, + ptrB, + CUDA_R_16F, + seqlen[i], + seqlen[i]*seqlen[i], + static_cast(&zero), + ptrC, + CUDA_R_16F, + (enable_stream ? 1 : batch) * (strided ? heads*3*embed : heads*embed), + strided ? 3*embed : embed, + enable_stream ? heads : batch*heads, + CUDA_R_32F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP); + ptrA = static_cast(static_cast(ptrA) + seqlen[i]*heads*embed); + ptrB = static_cast(static_cast(ptrB) + heads*seqlen[i]*seqlen[i]); + ptrC = static_cast(static_cast(ptrC) + (strided ? seqlen[i]*heads*3*embed : seqlen[i]*heads*embed)); + + } + for(int i = 0; i < (enable_stream ? 
nstreams : 0); i++) { + if(sync) cudaStreamSynchronize(stream[i]); + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +void init_mha_cuda_extension() +{ + // CUDA Stream. + for(int i = 0; i < nstreams; i++) { + cudaStreamCreate(&stream[i]); + } + + // CuBlas Handle. + cublasCreate(&handle); + cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); +} + +///////////////////////////////////////////////////////////////////////////////////////////////////////// + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("InitMHACUDAExtension", &init_mha_cuda_extension, "InitMHACUDAExtension"); + m.def("FastBmm1Fprop", &FastBmm1Fprop_, "FastBmm1Fprop"); + m.def("FastBmm1Dgrad1", &FastBmm1Dgrad1_, "FastBmm1Dgrad1"); + m.def("FastBmm1Dgrad2", &FastBmm1Dgrad2_, "FastBmm1Dgrad2"); + m.def("FastBmm2Fprop", &FastBmm2Fprop_, "FastBmm2Fprop"); + m.def("FastBmm2Dgrad1", &FastBmm2Dgrad1_, "FastBmm2Dgrad1"); + m.def("FastBmm2Dgrad2", &FastBmm2Dgrad2_, "FastBmm2Dgrad2"); +} \ No newline at end of file diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-1.txt b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-1.txt new file mode 100644 index 000000000..f8b685336 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-1.txt @@ -0,0 +1,191 @@ ++ cd ../../benchmark/nlp/lm/bert/pytorch/base/ ++ '[' 0 '!=' 0 ']' ++ bash run_training.sh --name iluvatar --config 01V100x1x8 --data_dir ../../../../../../datasets/bert_mini/ +Number of CPU sockets on a node: 2 +Number of CPU cores per socket: 40 +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=0-9', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=0', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=10-19', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=1', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=20-29', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=2', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=30-39', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=3', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', 
'/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=40-49', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=4', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=50-59', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=5', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=60-69', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=6', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=70-79', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=7', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py'] +================================================================================ +device: cuda:4 n_gpu: 8, distributed training: True, 16-bits training: Truedevice: cuda:7 n_gpu: 8, distributed training: True, 16-bits training: True + +device: cuda:6 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:5 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788446, "rank": 7}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788446, "rank": 6}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788446, "rank": 4}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788446, "rank": 5}} +device: cuda:0 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:3 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:2 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:1 n_gpu: 8, distributed training: 
True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788450, "rank": 0}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788450, "rank": 3}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788450, "rank": 2}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646378788450, "rank": 1}} +[PerfLog] {"event": "SUBMITTED_INFO", "value": {"submmiter": "pytorch", "model": "iluvatar", "config_path": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py", "config": {"data_dir": "../../../../../../datasets/bert_mini/", "train_dir": "../../../../../../datasets/bert_mini/2048_shards_uncompressed", "bert_model": "bert-large-uncased", "output_dir": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/out", "eval_dir": "../../../../../../datasets/bert_mini/eval_set_uncompressed", "eval_iter_start_samples": 150000, "eval_iter_samples": 150000, "num_eval_examples": 10000, "cache_eval_data": true, "init_checkpoint": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/../../../../../../../model_zoo/lm_bert/model.ckpt-1000.pt", "init_tf_checkpoint": "None", "verify_checkpoint": true, "max_seq_length": 512, "max_predictions_per_seq": 76, "train_batch_size": 27, "eval_batch_size": 27, "learning_rate": 0.00035, "weight_decay_rate": 0.01, "opt_lamb_beta_1": 0.9, "opt_lamb_beta_2": 0.999, "max_steps": 14000, "max_samples_termination": 4500000, "warmup_proportion": 0, "warmup_steps": 0, "start_warmup_step": 0, "local_rank": 0, "dist_backend": "nccl", "seed": 9031, "gradient_accumulation_steps": 1, "fp16": true, "loss_scale": 0.0, "log_freq": 200, "checkpoint_activations": false, "resume_from_checkpoint": false, "resume_init_checkpoint": "None", "keep_n_most_recent_checkpoints": 20, "num_samples_per_checkpoint": 500000, "min_samples_to_start_checkpoints": 3000000, "save_checkpoint": false, "do_train": true, "exchange_padding": true, "enable_fuse_dropout": false, "disable_fuse_mask": false, "fused_gelu_bias": true, "fused_dropout_add": false, "dense_seq_output": true, "use_env": false, "bert_config_path": "../../../../../../datasets/bert_mini/bert_config.json", "target_mlm_accuracy": 0.489, "train_mlm_accuracy_window_size": 0, "num_epochs_to_generate_seeds_for": 2, "use_ddp": false, "use_gradient_as_bucket_view": false, "device": "cuda:0", "n_gpu": 8, "eval_interval_samples": 150000, "eval_steps": 200, "cuda_graph_mode": "segmented", "max_iterations_per_graph": 4, "allreduce_post_accumulation": false, "allreduce_post_accumulation_fp16": false, "unpad": false, "unpad_fmha": false, "pad": false, "disable_fuse_scale": false, "disable_fuse_qkv": false, "disable_apex_softmax": false, "enable_stream": false, "fused_mha": true, "use_cuda_graph": false, "ddp_type": "apex", "bypass_amp": false, "distributed_lamb": true, "dwu_group_size": 0, "dwu_num_blocks": 1, "dwu_num_chunks": 1, "dwu_num_rs_pg": 1, "dwu_num_ar_pg": 1, "dwu_num_ag_pg": 2, "dwu_overlap_reductions": false, 
"dwu_e5m2_allgather": false, "opt_level": "O2", "config": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_01V100x1x8.py", "training_event": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/training_event:ApexTrainingEvent"}}, "metadata": {"file": "./run_pretraining.py", "lineno": 49, "time_ms": 1646378788451, "rank": 0}} +[PerfLog] {"event": "INIT_START", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 55, "time_ms": 1646378788547, "rank": 0}} +[PerfLog] {"event": "CONVERT_MODEL", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 44, "time_ms": 1646378795132, "rank": 0}} +[PerfLog] {"event": "CREATE_OPTIMIZER", "value": {"type": "DistributedFusedLAMB", "module": "apex.contrib.optimizers.distributed_fused_lamb"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 45, "time_ms": 1646378795826, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_FP16", "value": {"fp16": true}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 46, "time_ms": 1646378795888, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_DDP", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 47, "time_ms": 1646378796027, "rank": 0}} +/tmp/filewXAuIt/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/fileSbZ7RC/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filekXNPzt/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +/tmp/fileOiH24C/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/fileyWSQvF/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileipKaXF/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. 
+/tmp/fileac3nGG/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/filemxiYvM/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +[PerfLog] {"event": "INIT_EVALUATION", "value": {"eval_loss": 3.9250423908233643, "eval_mlm_accuracy": 0.39407235383987427, "time": 26.064650058746338}, "metadata": {"file": "./run_pretraining.py", "lineno": 97, "time_ms": 1646378823081, "rank": 0}} +[PerfLog] {"event": "INIT_END", "value": {"other": "Finish initialization"}, "metadata": {"file": "./run_pretraining.py", "lineno": 110, "time_ms": 1646378823083, "rank": 0}} +[PerfLog] {"event": "TRAIN_BEGIN", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 117, "time_ms": 1646378823088, "rank": 0}} +[PerfLog] {"event": "EPOCH_BEGIN", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 110, "time_ms": 1646378823088, "rank": 0}} +/tmp/file7yuU92/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileDqVvpu/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileum3KOP/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filebsKBNh/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileY3KgRa/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/file81gula/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/file5N3f8X/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileoJ0l70/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. 
+1 warning generated when compiling for ivcore10. +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.3671875, "mlm_acc": 0.4545454680919647, "epoch": 0, "end_training": false, "global_steps": 1200, "num_trained_samples": 258984, "iter_dataloader_idx": 2, "learning_rate": 0.00032035714285714285, "seq/s": 181.2822897880427, "step": 1200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379104577, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 1200, "eval_loss": 3.8590714931488037, "eval_mlm_accuracy": 0.40260082483291626, "time": 16.11301875114441}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646379104578, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.703125, "mlm_acc": 0.4392956495285034, "epoch": 0, "end_training": false, "global_steps": 1400, "num_trained_samples": 302184, "iter_dataloader_idx": 2, "learning_rate": 0.0003157142857142857, "seq/s": 179.23294514714507, "step": 1400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379361418, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 1400, "eval_loss": 3.7005832195281982, "eval_mlm_accuracy": 0.41687530279159546, "time": 16.125940561294556}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646379361419, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.3203125, "mlm_acc": 0.47136563062667847, "epoch": 0, "end_training": false, "global_steps": 1600, "num_trained_samples": 345384, "iter_dataloader_idx": 2, "learning_rate": 0.00031107142857142857, "seq/s": 179.0870453463187, "step": 1600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379618315, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 1600, "eval_loss": 3.438451051712036, "eval_mlm_accuracy": 0.4469316601753235, "time": 16.113437175750732}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646379618315, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.18359375, "mlm_acc": 0.49414941668510437, "epoch": 0, "end_training": false, "global_steps": 1800, "num_trained_samples": 388584, "iter_dataloader_idx": 2, "learning_rate": 0.0003064285714285714, "seq/s": 179.775539746508, "step": 1800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379875209, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 1800, "eval_loss": 3.160597085952759, "eval_mlm_accuracy": 0.47976580262184143, "time": 16.113293886184692}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646379875210, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 2.896484375, "mlm_acc": 0.5089707374572754, "epoch": 0, 
"end_training": true, "global_steps": 2000, "num_trained_samples": 431784, "iter_dataloader_idx": 2, "learning_rate": 0.0003017857142857143, "seq/s": 179.8876248458941, "step": 2000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380132328, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 2000, "eval_loss": 2.764784336090088, "eval_mlm_accuracy": 0.5257572531700134, "time": 16.11146640777588}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646380132328, "rank": 0}} +[PerfLog] {"event": "EPOCH_END", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 159, "time_ms": 1646380132426, "rank": 0}} +[PerfLog] {"event": "TRAIN_END", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 128, "time_ms": 1646380132426, "rank": 0}} +[PerfLog] {"event": "FINISHED", "value": {"e2e_time": 1345.2956030368805, "training_sequences_per_second": 329.93772425454694, "converged": true, "final_loss": 2.764784336090088, "final_mlm_accuracy": 0.5257572531700134, "raw_train_time": 1309.338, "init_time": 34.536}, "metadata": {"file": "./run_pretraining.py", "lineno": 156, "time_ms": 1646380132458, "rank": 0}} ++ '[' 0 '!=' 0 ']' ++ echo 'eval result: pass.' +eval result: pass. diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-2.txt b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-2.txt new file mode 100644 index 000000000..e9127ab41 --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-2.txt @@ -0,0 +1,189 @@ ++ cd ../../benchmark/nlp/lm/bert/pytorch/base/ ++ '[' 0 '!=' 0 ']' ++ bash run_training.sh --name iluvatar --config 02V100x1x8 --data_dir ../../../../../../datasets/bert_mini/ +Number of CPU sockets on a node: 2 +Number of CPU cores per socket: 40 +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=0-9', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=0', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=10-19', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=1', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=20-29', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=2', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', 
'/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=30-39', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=3', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=40-49', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=4', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=50-59', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=5', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=60-69', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=6', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=70-79', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=7', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py'] +================================================================================ +device: cuda:7 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:4 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:0 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:6 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:5 n_gpu: 8, distributed training: True, 16-bits training: Truedevice: cuda:2 n_gpu: 8, distributed training: True, 16-bits training: True + +device: cuda:3 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:1 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, 
"rank": 7}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 4}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 0}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 6}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 2}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 5}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 3}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646380150281, "rank": 1}} +[PerfLog] {"event": "SUBMITTED_INFO", "value": {"submmiter": "pytorch", "model": "iluvatar", "config_path": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py", "config": {"data_dir": "../../../../../../datasets/bert_mini/", "train_dir": "../../../../../../datasets/bert_mini/2048_shards_uncompressed", "bert_model": "bert-large-uncased", "output_dir": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/out", "eval_dir": "../../../../../../datasets/bert_mini/eval_set_uncompressed", "eval_iter_start_samples": 150000, "eval_iter_samples": 150000, "num_eval_examples": 10000, "cache_eval_data": true, "init_checkpoint": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/../../../../../../../model_zoo/lm_bert/model.ckpt-3000.pt", "init_tf_checkpoint": "None", "verify_checkpoint": true, "max_seq_length": 512, "max_predictions_per_seq": 76, "train_batch_size": 27, "eval_batch_size": 27, "learning_rate": 0.00035, "weight_decay_rate": 0.01, "opt_lamb_beta_1": 0.9, "opt_lamb_beta_2": 0.999, "max_steps": 14000, "max_samples_termination": 4500000, "warmup_proportion": 0, "warmup_steps": 0, "start_warmup_step": 0, "local_rank": 0, "dist_backend": "nccl", "seed": 9031, "gradient_accumulation_steps": 1, "fp16": true, "loss_scale": 0.0, "log_freq": 200, "checkpoint_activations": false, "resume_from_checkpoint": false, "resume_init_checkpoint": "None", "keep_n_most_recent_checkpoints": 20, "num_samples_per_checkpoint": 500000, "min_samples_to_start_checkpoints": 3000000, "save_checkpoint": false, "do_train": true, "exchange_padding": true, "enable_fuse_dropout": false, "disable_fuse_mask": false, "fused_gelu_bias": true, "fused_dropout_add": false, "dense_seq_output": true, "use_env": false, "bert_config_path": "../../../../../../datasets/bert_mini/bert_config.json", "target_mlm_accuracy": 0.706, "train_mlm_accuracy_window_size": 0, "num_epochs_to_generate_seeds_for": 2, "use_ddp": false, "use_gradient_as_bucket_view": false, "device": "cuda:0", "n_gpu": 8, "eval_interval_samples": 150000, "eval_steps": 200, "cuda_graph_mode": "segmented", "max_iterations_per_graph": 4, 
"allreduce_post_accumulation": false, "allreduce_post_accumulation_fp16": false, "unpad": false, "unpad_fmha": false, "pad": false, "disable_fuse_scale": false, "disable_fuse_qkv": false, "disable_apex_softmax": false, "enable_stream": false, "fused_mha": true, "use_cuda_graph": false, "ddp_type": "apex", "bypass_amp": false, "distributed_lamb": true, "dwu_group_size": 0, "dwu_num_blocks": 1, "dwu_num_chunks": 1, "dwu_num_rs_pg": 1, "dwu_num_ar_pg": 1, "dwu_num_ag_pg": 2, "dwu_overlap_reductions": false, "dwu_e5m2_allgather": false, "opt_level": "O2", "config": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_02V100x1x8.py", "training_event": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/training_event:ApexTrainingEvent"}}, "metadata": {"file": "./run_pretraining.py", "lineno": 49, "time_ms": 1646380150282, "rank": 0}} +[PerfLog] {"event": "INIT_START", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 55, "time_ms": 1646380150385, "rank": 0}} +[PerfLog] {"event": "CONVERT_MODEL", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 44, "time_ms": 1646380156947, "rank": 0}} +[PerfLog] {"event": "CREATE_OPTIMIZER", "value": {"type": "DistributedFusedLAMB", "module": "apex.contrib.optimizers.distributed_fused_lamb"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 45, "time_ms": 1646380157580, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_FP16", "value": {"fp16": true}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 46, "time_ms": 1646380157642, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_DDP", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 47, "time_ms": 1646380157779, "rank": 0}} +/tmp/filec9V7AC/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/filewkKRpO/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/fileKN2UMJ/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/filecoRXgP/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. 
+/tmp/fileimlLkT/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filewtPVkS/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +/tmp/file892lbW/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileESygAW/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +[PerfLog] {"event": "INIT_EVALUATION", "value": {"eval_loss": 1.4461750984191895, "eval_mlm_accuracy": 0.696331262588501, "time": 26.021937608718872}, "metadata": {"file": "./run_pretraining.py", "lineno": 97, "time_ms": 1646380184816, "rank": 0}} +[PerfLog] {"event": "INIT_END", "value": {"other": "Finish initialization"}, "metadata": {"file": "./run_pretraining.py", "lineno": 110, "time_ms": 1646380184817, "rank": 0}} +[PerfLog] {"event": "TRAIN_BEGIN", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 117, "time_ms": 1646380184822, "rank": 0}} +[PerfLog] {"event": "EPOCH_BEGIN", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 110, "time_ms": 1646380184823, "rank": 0}} +/tmp/fileaPWoW1/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filexYfiDT/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileY5WSN5/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +/tmp/file3eQVN9/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^#define NAN __int_as_float(0x7fffffff) + + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileUkXx89/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filexatGDN/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileMcmACE/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileVpr85G/tmp.cu:53:9: 
warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +111 warning warning warning generated generated when compiling for when compiling for generatedivcore10 when compiling for ivcore10. +ivcore10. +. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.650390625, "mlm_acc": 0.6736947894096375, "epoch": 0, "end_training": false, "global_steps": 3200, "num_trained_samples": 690984, "iter_dataloader_idx": 3, "learning_rate": 0.0002710714285714286, "seq/s": 181.11228738084873, "step": 3200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380474335, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 3200, "eval_loss": 1.4227526187896729, "eval_mlm_accuracy": 0.7002610564231873, "time": 16.116336822509766}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646380474336, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.5634765625, "mlm_acc": 0.7014492750167847, "epoch": 0, "end_training": false, "global_steps": 3400, "num_trained_samples": 734184, "iter_dataloader_idx": 3, "learning_rate": 0.0002671428571428572, "seq/s": 179.7384824643963, "step": 3400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380730093, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 3400, "eval_loss": 1.4111361503601074, "eval_mlm_accuracy": 0.7026439905166626, "time": 16.11122703552246}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646380730094, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.5400390625, "mlm_acc": 0.6908212304115295, "epoch": 0, "end_training": false, "global_steps": 3600, "num_trained_samples": 777384, "iter_dataloader_idx": 3, "learning_rate": 0.00026321428571428573, "seq/s": 180.10713035401778, "step": 3600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380985841, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 3600, "eval_loss": 1.3965355157852173, "eval_mlm_accuracy": 0.7045647501945496, "time": 16.116958379745483}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646380985842, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.5625, "mlm_acc": 0.688141942024231, "epoch": 0, "end_training": true, "global_steps": 3800, "num_trained_samples": 820584, "iter_dataloader_idx": 3, "learning_rate": 0.0002592857142857143, "seq/s": 180.28647883741974, "step": 3800}, "metadata": {"file": 
"/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646381241839, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 3800, "eval_loss": 1.3891334533691406, "eval_mlm_accuracy": 0.7061394453048706, "time": 16.123926639556885}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646381241839, "rank": 0}} +[PerfLog] {"event": "EPOCH_END", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 159, "time_ms": 1646381241940, "rank": 0}} +[PerfLog] {"event": "TRAIN_END", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 128, "time_ms": 1646381241941, "rank": 0}} +[PerfLog] {"event": "FINISHED", "value": {"e2e_time": 1092.9787755012512, "training_sequences_per_second": 776.4499550192552, "converged": true, "final_loss": 1.3891334533691406, "final_mlm_accuracy": 0.7061394453048706, "raw_train_time": 1057.119, "init_time": 34.432}, "metadata": {"file": "./run_pretraining.py", "lineno": 156, "time_ms": 1646381241969, "rank": 0}} ++ '[' 0 '!=' 0 ']' ++ echo 'eval result: pass.' +eval result: pass. diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-3.txt b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-3.txt new file mode 100644 index 000000000..83f2c49ea --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8-stage-3.txt @@ -0,0 +1,185 @@ ++ cd ../../benchmark/nlp/lm/bert/pytorch/base/ ++ '[' 0 '!=' 0 ']' ++ bash run_training.sh --name iluvatar --config 03V100x1x8 --data_dir ../../../../../../datasets/bert_mini/ +Number of CPU sockets on a node: 2 +Number of CPU cores per socket: 40 +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=0-9', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=0', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=10-19', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=1', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=20-29', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=2', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ 
+================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=30-39', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=3', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=40-49', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=4', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=50-59', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=5', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=60-69', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=6', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=70-79', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=7', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py'] +================================================================================ +device: cuda:3 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:6 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:5 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:4 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:2 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056427, "rank": 3}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056427, "rank": 6}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056427, "rank": 5}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, 
"metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056427, "rank": 4}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056427, "rank": 2}} +device: cuda:1 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:0 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056429, "rank": 1}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056429, "rank": 0}} +[PerfLog] {"event": "SUBMITTED_INFO", "value": {"submmiter": "pytorch", "model": "iluvatar", "config_path": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py", "config": {"data_dir": "../../../../../../datasets/bert_mini/", "train_dir": "../../../../../../datasets/bert_mini/2048_shards_uncompressed", "bert_model": "bert-large-uncased", "output_dir": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/out", "eval_dir": "../../../../../../datasets/bert_mini/eval_set_uncompressed", "eval_iter_start_samples": 150000, "eval_iter_samples": 150000, "num_eval_examples": 10000, "cache_eval_data": true, "init_checkpoint": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/../../../../../../../model_zoo/lm_bert/model.ckpt-5000.pt", "init_tf_checkpoint": "None", "verify_checkpoint": true, "max_seq_length": 512, "max_predictions_per_seq": 76, "train_batch_size": 27, "eval_batch_size": 27, "learning_rate": 0.00035, "weight_decay_rate": 0.01, "opt_lamb_beta_1": 0.9, "opt_lamb_beta_2": 0.999, "max_steps": 14000, "max_samples_termination": 4500000, "warmup_proportion": 0, "warmup_steps": 0, "start_warmup_step": 0, "local_rank": 0, "dist_backend": "nccl", "seed": 9031, "gradient_accumulation_steps": 1, "fp16": true, "loss_scale": 0.0, "log_freq": 200, "checkpoint_activations": false, "resume_from_checkpoint": false, "resume_init_checkpoint": "None", "keep_n_most_recent_checkpoints": 20, "num_samples_per_checkpoint": 500000, "min_samples_to_start_checkpoints": 3000000, "save_checkpoint": false, "do_train": true, "exchange_padding": true, "enable_fuse_dropout": false, "disable_fuse_mask": false, "fused_gelu_bias": true, "fused_dropout_add": false, "dense_seq_output": true, "use_env": false, "bert_config_path": "../../../../../../datasets/bert_mini/bert_config.json", "target_mlm_accuracy": 0.71, "train_mlm_accuracy_window_size": 0, "num_epochs_to_generate_seeds_for": 2, "use_ddp": false, "use_gradient_as_bucket_view": false, "device": "cuda:0", "n_gpu": 8, "eval_interval_samples": 150000, "eval_steps": 200, "cuda_graph_mode": "segmented", "max_iterations_per_graph": 4, "allreduce_post_accumulation": false, "allreduce_post_accumulation_fp16": false, "unpad": false, "unpad_fmha": false, "pad": false, "disable_fuse_scale": false, "disable_fuse_qkv": false, "disable_apex_softmax": false, "enable_stream": false, "fused_mha": true, "use_cuda_graph": false, "ddp_type": "apex", "bypass_amp": false, "distributed_lamb": true, "dwu_group_size": 0, "dwu_num_blocks": 1, "dwu_num_chunks": 1, "dwu_num_rs_pg": 1, "dwu_num_ar_pg": 1, "dwu_num_ag_pg": 2, 
"dwu_overlap_reductions": false, "dwu_e5m2_allgather": false, "opt_level": "O2", "config": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_03V100x1x8.py", "training_event": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/training_event:ApexTrainingEvent"}}, "metadata": {"file": "./run_pretraining.py", "lineno": 49, "time_ms": 1646382056430, "rank": 0}} +device: cuda:7 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646382056437, "rank": 7}} +[PerfLog] {"event": "INIT_START", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 55, "time_ms": 1646382056531, "rank": 0}} +[PerfLog] {"event": "CONVERT_MODEL", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 44, "time_ms": 1646382063373, "rank": 0}} +[PerfLog] {"event": "CREATE_OPTIMIZER", "value": {"type": "DistributedFusedLAMB", "module": "apex.contrib.optimizers.distributed_fused_lamb"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 45, "time_ms": 1646382063945, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_FP16", "value": {"fp16": true}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 46, "time_ms": 1646382064007, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_DDP", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 47, "time_ms": 1646382064144, "rank": 0}} +/tmp/fileKKjzXX/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/fileEAKq06/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/file8KvAx7/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/fileA9BeFc/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileiv2Cdc/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. 
+/tmp/fileSjVyFk/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/file0RZfzn/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/filee6DpUh/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +[PerfLog] {"event": "INIT_EVALUATION", "value": {"eval_loss": 1.3648749589920044, "eval_mlm_accuracy": 0.7087059020996094, "time": 25.958272695541382}, "metadata": {"file": "./run_pretraining.py", "lineno": 97, "time_ms": 1646382091122, "rank": 0}} +[PerfLog] {"event": "INIT_END", "value": {"other": "Finish initialization"}, "metadata": {"file": "./run_pretraining.py", "lineno": 110, "time_ms": 1646382091124, "rank": 0}} +[PerfLog] {"event": "TRAIN_BEGIN", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 117, "time_ms": 1646382091127, "rank": 0}} +[PerfLog] {"event": "EPOCH_BEGIN", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 110, "time_ms": 1646382091128, "rank": 0}} +/tmp/fileu6KYFs/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileRGLFAd/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filew1vksb/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileuOfsFd/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filevB0sP3/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileH3SkXl/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filekhMfM6/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filel9iLnn/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. 
+1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.474609375, "mlm_acc": 0.6867470145225525, "epoch": 0, "end_training": false, "global_steps": 5200, "num_trained_samples": 1122984, "iter_dataloader_idx": 3, "learning_rate": 0.00022178571428571427, "seq/s": 181.2351094168595, "step": 5200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382381181, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 5200, "eval_loss": 1.364203691482544, "eval_mlm_accuracy": 0.7095025777816772, "time": 16.115475177764893}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646382381182, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.4326171875, "mlm_acc": 0.7285023927688599, "epoch": 0, "end_training": true, "global_steps": 5400, "num_trained_samples": 1166184, "iter_dataloader_idx": 3, "learning_rate": 0.00021857142857142854, "seq/s": 179.96466615138635, "step": 5400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382636983, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 5400, "eval_loss": 1.3631609678268433, "eval_mlm_accuracy": 0.7100414037704468, "time": 16.11443781852722}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646382636983, "rank": 0}} +[PerfLog] {"event": "EPOCH_END", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 159, "time_ms": 1646382637068, "rank": 0}} +[PerfLog] {"event": "TRAIN_END", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 128, "time_ms": 1646382637068, "rank": 0}} +[PerfLog] {"event": "FINISHED", "value": {"e2e_time": 581.9371633529663, "training_sequences_per_second": 2136.4946028966497, "converged": true, "final_loss": 1.3631609678268433, "final_mlm_accuracy": 0.7100414037704468, "raw_train_time": 545.941, "init_time": 34.593}, "metadata": {"file": "./run_pretraining.py", "lineno": 156, "time_ms": 1646382637095, "rank": 0}} ++ '[' 0 '!=' 0 ']' ++ echo 'eval result: pass.' +eval result: pass. 
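The stage result files above are mostly shell trace plus [PerfLog] lines, each carrying a JSON record; the FINISHED event holds the converged flag and final_mlm_accuracy that the "eval result: pass." check relies on. A minimal, hypothetical Python sketch for pulling those metrics back out of such a file (function and file names here are illustrative, not the repository's actual tooling):

import json

def iter_perflog_events(path):
    """Yield the JSON payload of every [PerfLog] line in a result file."""
    with open(path) as f:
        for raw in f:
            # Lines captured through the shell trace may carry a leading '+'.
            line = raw.lstrip("+").strip()
            if line.startswith("[PerfLog]"):
                yield json.loads(line[len("[PerfLog]"):].strip())

def finished_metrics(path):
    """Return the value dict of the FINISHED event, or None if training never finished."""
    for event in iter_perflog_events(path):
        if event.get("event") == "FINISHED":
            return event.get("value")
    return None

# Example (hypothetical filename, matching the stage files above):
# metrics = finished_metrics("iluvatar-1x8-stage-1.txt")
# print(metrics["converged"], metrics["final_mlm_accuracy"])

Each stage log reports "converged": true in its FINISHED event before the wrapper script prints the final pass line.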
diff --git a/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8.txt b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8.txt new file mode 100644 index 000000000..518bb038c --- /dev/null +++ b/nlp/language_model/bert_sample/pytorch/iluvatar/results/iluvatar-1x8.txt @@ -0,0 +1,226 @@ ++ cd ../../benchmark/nlp/lm/bert/pytorch/base/ ++ '[' 0 '!=' 0 ']' ++ bash run_training.sh --name iluvatar --config V100x1x8 --data_dir ../../../../../../datasets/bert_mini/ +Number of CPU sockets on a node: 2 +Number of CPU cores per socket: 40 +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=0-9', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=0', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=10-19', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=1', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=20-29', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=2', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=30-39', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=3', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=40-49', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=4', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=50-59', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=5', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] 
+================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=60-69', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=6', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +================================================================================ += numactlargs_flag +['/usr/bin/numactl', '--physcpubind=70-79', '/root/miniconda/bin/python3', '-u', './run_pretraining.py', '--local_rank=7', '--data_dir', '../../../../../../datasets/bert_mini/', '--do_train', '/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py'] +================================================================================ +device: cuda:3 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:7 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:5 n_gpu: 8, distributed training: True, 16-bits training: Truedevice: cuda:6 n_gpu: 8, distributed training: True, 16-bits training: True + +device: cuda:4 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877650, "rank": 3}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877650, "rank": 7}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877650, "rank": 5}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877650, "rank": 4}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877650, "rank": 6}} +device: cuda:2 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:1 n_gpu: 8, distributed training: True, 16-bits training: True +device: cuda:0 n_gpu: 8, distributed training: True, 16-bits training: True +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877654, "rank": 2}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877654, "rank": 1}} +[PerfLog] {"event": "LAUNCH_TRAINING", "value": {"other": "Launch training"}, "metadata": {"file": "./run_pretraining.py", "lineno": 136, "time_ms": 1646375877654, "rank": 0}} +[PerfLog] {"event": "SUBMITTED_INFO", "value": {"submmiter": "pytorch", "model": "iluvatar", "config_path": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py", "config": {"data_dir": "../../../../../../datasets/bert_mini/", "train_dir": "../../../../../../datasets/bert_mini/2048_shards_uncompressed", "bert_model": "bert-large-uncased", 
"output_dir": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/../out", "eval_dir": "../../../../../../datasets/bert_mini/eval_set_uncompressed", "eval_iter_start_samples": 150000, "eval_iter_samples": 150000, "num_eval_examples": 10000, "cache_eval_data": true, "init_checkpoint": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/../../../../../../../model_zoo/lm_bert/model.ckpt-0.pt", "init_tf_checkpoint": "None", "verify_checkpoint": true, "max_seq_length": 512, "max_predictions_per_seq": 76, "train_batch_size": 27, "eval_batch_size": 27, "learning_rate": 0.00035, "weight_decay_rate": 0.01, "opt_lamb_beta_1": 0.9, "opt_lamb_beta_2": 0.999, "max_steps": 14000, "max_samples_termination": 4500000, "warmup_proportion": 0, "warmup_steps": 0, "start_warmup_step": 0, "local_rank": 0, "dist_backend": "nccl", "seed": 9031, "gradient_accumulation_steps": 1, "fp16": true, "loss_scale": 0.0, "log_freq": 200, "checkpoint_activations": false, "resume_from_checkpoint": false, "resume_init_checkpoint": "None", "keep_n_most_recent_checkpoints": 20, "num_samples_per_checkpoint": 500000, "min_samples_to_start_checkpoints": 3000000, "save_checkpoint": true, "do_train": true, "exchange_padding": true, "enable_fuse_dropout": false, "disable_fuse_mask": false, "fused_gelu_bias": true, "fused_dropout_add": false, "dense_seq_output": true, "use_env": false, "bert_config_path": "../../../../../../datasets/bert_mini/bert_config.json", "target_mlm_accuracy": 0.71, "train_mlm_accuracy_window_size": 0, "num_epochs_to_generate_seeds_for": 2, "use_ddp": false, "use_gradient_as_bucket_view": false, "device": "cuda:0", "n_gpu": 8, "eval_interval_samples": 150000, "eval_steps": 1000, "cuda_graph_mode": "segmented", "max_iterations_per_graph": 4, "allreduce_post_accumulation": false, "allreduce_post_accumulation_fp16": false, "unpad": false, "unpad_fmha": false, "pad": false, "disable_fuse_scale": false, "disable_fuse_qkv": false, "disable_apex_softmax": false, "enable_stream": false, "fused_mha": true, "use_cuda_graph": false, "ddp_type": "apex", "bypass_amp": false, "distributed_lamb": true, "dwu_group_size": 0, "dwu_num_blocks": 1, "dwu_num_chunks": 1, "dwu_num_rs_pg": 1, "dwu_num_ar_pg": 1, "dwu_num_ag_pg": 2, "dwu_overlap_reductions": false, "dwu_e5m2_allgather": false, "opt_level": "O2", "config": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/config_V100x1x8.py", "training_event": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/iluvatar/config/training_event:ApexTrainingEvent"}}, "metadata": {"file": "./run_pretraining.py", "lineno": 49, "time_ms": 1646375877655, "rank": 0}} +[PerfLog] {"event": "INIT_START", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 55, "time_ms": 1646375877753, "rank": 0}} +[PerfLog] {"event": "CONVERT_MODEL", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 44, "time_ms": 1646375883994, "rank": 0}} +[PerfLog] {"event": "CREATE_OPTIMIZER", "value": {"type": "DistributedFusedLAMB", "module": "apex.contrib.optimizers.distributed_fused_lamb"}, "metadata": {"file": 
"/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 45, "time_ms": 1646375884492, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_FP16", "value": {"fp16": true}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 46, "time_ms": 1646375884554, "rank": 0}} +[PerfLog] {"event": "MODEL_TO_DDP", "value": {"type": "BertForPreTraining", "module": "model.models.modeling"}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 47, "time_ms": 1646375884700, "rank": 0}} +/tmp/fileBVcI20/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/filejtlyAa/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileDPIsJ8/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +/tmp/fileXuwut9/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/filenQXkFe/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/fileTtm6ti/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +/tmp/file5oTKPg/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileTp3TMg/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. 
+[PerfLog] {"event": "INIT_EVALUATION", "value": {"eval_loss": 4.720923900604248, "eval_mlm_accuracy": 0.340021550655365, "time": 26.123970985412598}, "metadata": {"file": "./run_pretraining.py", "lineno": 97, "time_ms": 1646375911297, "rank": 0}} +[PerfLog] {"event": "INIT_END", "value": {"other": "Finish initialization"}, "metadata": {"file": "./run_pretraining.py", "lineno": 110, "time_ms": 1646375911299, "rank": 0}} +[PerfLog] {"event": "TRAIN_BEGIN", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 117, "time_ms": 1646375911303, "rank": 0}} +save for steps:0 +[PerfLog] {"event": "EPOCH_BEGIN", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 110, "time_ms": 1646375912082, "rank": 0}} +/tmp/fileVwYkmU/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/file4pIVab/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filex01mKS/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filevHN8k3/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileFenQXL/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/fileotFHib/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filecUFVL0/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +/tmp/filekPslht/tmp.cu:53:9: warning: 'NAN' macro redefined [-Wmacro-redefined] +#define NAN __int_as_float(0x7fffffff) + ^ +/usr/include/math.h:98:11: note: previous definition is here +# define NAN (__builtin_nanf ("")) + ^ +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. +1 warning generated when compiling for ivcore10. 
+[PerfLog] {"event": "STEP_END", "value": {"loss": 4.9296875, "mlm_acc": 0.3141593039035797, "epoch": 0, "end_training": false, "num_trained_samples": 0, "global_steps": 1, "iter_dataloader_idx": 1, "learning_rate": 0.000349975, "seq/s": 13.940688815264746, "step": 1}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646375927639, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 4.18359375, "mlm_acc": 0.35693779587745667, "epoch": 0, "end_training": false, "num_trained_samples": 42984, "global_steps": 200, "iter_dataloader_idx": 1, "learning_rate": 0.00034500000000000004, "seq/s": 180.3605947082102, "step": 200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646376166553, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 4.08203125, "mlm_acc": 0.3757791519165039, "epoch": 0, "end_training": false, "num_trained_samples": 86184, "global_steps": 400, "iter_dataloader_idx": 1, "learning_rate": 0.00033999999999999997, "seq/s": 179.8760886243029, "step": 400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646376406756, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.884765625, "mlm_acc": 0.4088495671749115, "epoch": 0, "end_training": false, "num_trained_samples": 129384, "global_steps": 600, "iter_dataloader_idx": 1, "learning_rate": 0.000335, "seq/s": 179.58752241202444, "step": 600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646376647027, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.904296875, "mlm_acc": 0.4010600745677948, "epoch": 0, "end_training": false, "num_trained_samples": 172584, "global_steps": 800, "iter_dataloader_idx": 1, "learning_rate": 0.00033, "seq/s": 179.87758860657706, "step": 800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646376887149, "rank": 0}} +save for steps:1000 +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.75, "mlm_acc": 0.3989070951938629, "epoch": 0, "end_training": false, "num_trained_samples": 215784, "global_steps": 1000, "iter_dataloader_idx": 1, "learning_rate": 0.000325, "seq/s": 180.35082874577924, "step": 1000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646377145262, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 1000, "eval_loss": 3.9250423908233643, "eval_mlm_accuracy": 0.39407235383987427, "time": 15.965189218521118}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646377145262, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.6640625, "mlm_acc": 0.4171779155731201, "epoch": 0, "end_training": false, "num_trained_samples": 258984, "global_steps": 1200, "iter_dataloader_idx": 1, "learning_rate": 0.00031999999999999997, "seq/s": 177.3535196847465, "step": 1200}, "metadata": {"file": 
"/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646377385302, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.662109375, "mlm_acc": 0.42560866475105286, "epoch": 0, "end_training": false, "num_trained_samples": 302184, "global_steps": 1400, "iter_dataloader_idx": 1, "learning_rate": 0.000315, "seq/s": 179.46009701866387, "step": 1400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646377625507, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.626953125, "mlm_acc": 0.430402934551239, "epoch": 0, "end_training": false, "num_trained_samples": 345384, "global_steps": 1600, "iter_dataloader_idx": 1, "learning_rate": 0.00031, "seq/s": 180.38871365996465, "step": 1600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646377865961, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.10546875, "mlm_acc": 0.4819611608982086, "epoch": 0, "end_training": false, "num_trained_samples": 388584, "global_steps": 1800, "iter_dataloader_idx": 1, "learning_rate": 0.000305, "seq/s": 178.83276799757442, "step": 1800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646378106053, "rank": 0}} +save for steps:2000 +[PerfLog] {"event": "STEP_END", "value": {"loss": 3.265625, "mlm_acc": 0.4693877696990967, "epoch": 0, "end_training": false, "num_trained_samples": 431784, "global_steps": 2000, "iter_dataloader_idx": 1, "learning_rate": 0.00030000000000000003, "seq/s": 180.05515596061153, "step": 2000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646378363793, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 2000, "eval_loss": 3.0997543334960938, "eval_mlm_accuracy": 0.48964831233024597, "time": 15.965776205062866}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646378363793, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 2.60546875, "mlm_acc": 0.5470162630081177, "epoch": 0, "end_training": false, "num_trained_samples": 474984, "global_steps": 2200, "iter_dataloader_idx": 1, "learning_rate": 0.000295, "seq/s": 179.56609432847463, "step": 2200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646378604054, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 2.439453125, "mlm_acc": 0.5512934923171997, "epoch": 0, "end_training": false, "num_trained_samples": 518184, "global_steps": 2400, "iter_dataloader_idx": 1, "learning_rate": 0.00029, "seq/s": 179.75321081531524, "step": 2400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646378844308, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 2.021484375, "mlm_acc": 0.6172608137130737, "epoch": 0, "end_training": false, "num_trained_samples": 
561384, "global_steps": 2600, "iter_dataloader_idx": 1, "learning_rate": 0.000285, "seq/s": 180.33011548082376, "step": 2600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379084856, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.798828125, "mlm_acc": 0.6303191781044006, "epoch": 0, "end_training": false, "num_trained_samples": 604584, "global_steps": 2800, "iter_dataloader_idx": 1, "learning_rate": 0.00028000000000000003, "seq/s": 179.4828509883873, "step": 2800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379325044, "rank": 0}} +save for steps:3000 +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.48046875, "mlm_acc": 0.6902416944503784, "epoch": 0, "end_training": false, "num_trained_samples": 647784, "global_steps": 3000, "iter_dataloader_idx": 2, "learning_rate": 0.000275, "seq/s": 179.5248543243754, "step": 3000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379583448, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 3000, "eval_loss": 1.4461750984191895, "eval_mlm_accuracy": 0.696331262588501, "time": 15.956255197525024}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646379583448, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.484375, "mlm_acc": 0.690305233001709, "epoch": 0, "end_training": false, "num_trained_samples": 690984, "global_steps": 3200, "iter_dataloader_idx": 2, "learning_rate": 0.00027, "seq/s": 178.38522180562606, "step": 3200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646379823921, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.5458984375, "mlm_acc": 0.7045226097106934, "epoch": 0, "end_training": false, "num_trained_samples": 734184, "global_steps": 3400, "iter_dataloader_idx": 2, "learning_rate": 0.000265, "seq/s": 181.28849287788512, "step": 3400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380063980, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.62109375, "mlm_acc": 0.6920849680900574, "epoch": 0, "end_training": false, "num_trained_samples": 777384, "global_steps": 3600, "iter_dataloader_idx": 2, "learning_rate": 0.00026000000000000003, "seq/s": 181.8014769757576, "step": 3600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380304330, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.47265625, "mlm_acc": 0.7041166424751282, "epoch": 0, "end_training": false, "num_trained_samples": 820584, "global_steps": 3800, "iter_dataloader_idx": 2, "learning_rate": 0.000255, "seq/s": 178.9751421675208, "step": 3800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380544503, "rank": 
0}} +save for steps:4000 +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.517578125, "mlm_acc": 0.6998106241226196, "epoch": 0, "end_training": false, "num_trained_samples": 863784, "global_steps": 4000, "iter_dataloader_idx": 2, "learning_rate": 0.00025, "seq/s": 180.76786294703385, "step": 4000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646380802056, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 4000, "eval_loss": 1.3820699453353882, "eval_mlm_accuracy": 0.7064228057861328, "time": 15.965797185897827}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646380802056, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.43359375, "mlm_acc": 0.6917073130607605, "epoch": 0, "end_training": false, "num_trained_samples": 906984, "global_steps": 4200, "iter_dataloader_idx": 2, "learning_rate": 0.000245, "seq/s": 180.42259014262225, "step": 4200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646381042480, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.337890625, "mlm_acc": 0.7208854556083679, "epoch": 0, "end_training": false, "num_trained_samples": 950184, "global_steps": 4400, "iter_dataloader_idx": 2, "learning_rate": 0.00024, "seq/s": 179.73591505705716, "step": 4400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646381282373, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.3046875, "mlm_acc": 0.710185170173645, "epoch": 0, "end_training": false, "num_trained_samples": 993384, "global_steps": 4600, "iter_dataloader_idx": 2, "learning_rate": 0.00023500000000000002, "seq/s": 182.45285751686637, "step": 4600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646381522550, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.318359375, "mlm_acc": 0.7091237306594849, "epoch": 0, "end_training": false, "num_trained_samples": 1036584, "global_steps": 4800, "iter_dataloader_idx": 2, "learning_rate": 0.00023, "seq/s": 180.0546549761042, "step": 4800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646381763056, "rank": 0}} +save for steps:5000 +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.376953125, "mlm_acc": 0.7196819186210632, "epoch": 0, "end_training": false, "num_trained_samples": 1079784, "global_steps": 5000, "iter_dataloader_idx": 2, "learning_rate": 0.00022499999999999997, "seq/s": 179.69958697463633, "step": 5000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382020573, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 5000, "eval_loss": 1.3648749589920044, "eval_mlm_accuracy": 0.7087059020996094, "time": 15.969963550567627}, "metadata": {"file": 
"/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646382020573, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.4580078125, "mlm_acc": 0.6995708346366882, "epoch": 0, "end_training": false, "num_trained_samples": 1122984, "global_steps": 5200, "iter_dataloader_idx": 2, "learning_rate": 0.00021999999999999998, "seq/s": 179.39424968713863, "step": 5200}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382260784, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.625, "mlm_acc": 0.7126969695091248, "epoch": 0, "end_training": false, "num_trained_samples": 1166184, "global_steps": 5400, "iter_dataloader_idx": 2, "learning_rate": 0.00021500000000000002, "seq/s": 180.18020294220656, "step": 5400}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382500907, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.44921875, "mlm_acc": 0.6990825533866882, "epoch": 0, "end_training": false, "num_trained_samples": 1209384, "global_steps": 5600, "iter_dataloader_idx": 2, "learning_rate": 0.00020999999999999998, "seq/s": 179.93600022244487, "step": 5600}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382740975, "rank": 0}} +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.5244140625, "mlm_acc": 0.6917148232460022, "epoch": 0, "end_training": false, "num_trained_samples": 1252584, "global_steps": 5800, "iter_dataloader_idx": 3, "learning_rate": 0.00020499999999999997, "seq/s": 180.25835588130485, "step": 5800}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646382981582, "rank": 0}} +save for steps:6000 +[PerfLog] {"event": "STEP_END", "value": {"loss": 1.38671875, "mlm_acc": 0.7282296419143677, "epoch": 0, "end_training": true, "num_trained_samples": 1295784, "global_steps": 6000, "iter_dataloader_idx": 3, "learning_rate": 0.00019999999999999998, "seq/s": 180.2772948665217, "step": 6000}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 151, "time_ms": 1646383238299, "rank": 0}} +[PerfLog] {"event": "EVALUATION", "value": {"global_steps": 6000, "eval_loss": 1.3515790700912476, "eval_mlm_accuracy": 0.7111283540725708, "time": 15.958270788192749}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 154, "time_ms": 1646383238299, "rank": 0}} +[PerfLog] {"event": "EPOCH_END", "value": {"epoch": 0}, "metadata": {"file": "/home/jian.wang/Project/deeplearningsamples/deeplearningsamples/benchmark/nlp/lm/bert/pytorch/base/train/trainer.py", "lineno": 159, "time_ms": 1646383238376, "rank": 0}} +save for steps:6000 +[PerfLog] {"event": "TRAIN_END", "value": null, "metadata": {"file": "./run_pretraining.py", "lineno": 128, "time_ms": 1646383240101, "rank": 0}} +[PerfLog] {"event": "FINISHED", "value": {"e2e_time": 7363.76092171669, "training_sequences_per_second": 176.83663814993946, "converged": true, 
"final_loss": 1.3515790700912476, "final_mlm_accuracy": 0.7111283540725708, "raw_train_time": 7328.798, "init_time": 33.546}, "metadata": {"file": "./run_pretraining.py", "lineno": 156, "time_ms": 1646383240125, "rank": 0}} ++ '[' 0 '!=' 0 ']' ++ echo 'train status: pass.' +train status: pass. diff --git a/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh b/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh index 311994b05..1eaf33a94 100644 --- a/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh +++ b/tests/executables/bert/train_bert_default_amp_dist_1x8_torch.sh @@ -17,7 +17,7 @@ source ../_utils/global_environment_variables.sh : ${BATCH_SIZE:=10} -cd ../../nlp/language_model/bert_sample/pytorch/base +cd ../../../nlp/language_model/bert_sample/pytorch/base if [ "$?" != "0" ]; then echo "train status: fail." exit 1 diff --git a/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh index f909c31d7..6c6b28067 100644 --- a/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh +++ b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_tf.sh @@ -25,7 +25,7 @@ set -euox pipefail current_path=$(cd $(dirname "${BASH_SOURCE[0]}") && pwd) ROOT_DIR=${current_path}"/../../" -SRC_DIR=${ROOT_DIR}nlp/language_model/bert/tensorflow/base +SRC_DIR=${ROOT_DIR}../nlp/language_model/bert/tensorflow/base DATA_DIR=${ROOT_DIR}data/datasets MODEL_DIR=${ROOT_DIR}data/model_zoo diff --git a/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh index 1a5a8e0e3..6e59480a1 100644 --- a/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh +++ b/tests/executables/bert/train_bert_pretraining_amp_dist_1x8_torch.sh @@ -15,9 +15,9 @@ set -euox pipefail : ${BATCH_SIZE:=27} -cd ../../nlp/language_model/bert_sample/pytorch/base/ +cd ../../../nlp/language_model/bert_sample/pytorch/base/ if [ "$?" != "0" ]; then - echo "ERROR: ../../nlp/language_model/bert_sample/pytorch/base/ not exist." + echo "ERROR: ../../../nlp/language_model/bert_sample/pytorch/base/ not exist." 
exit 1 fi diff --git a/tests/executables/conformer/init_torch.sh b/tests/executables/conformer/init_torch.sh index 627d953a3..9635b0bd6 100644 --- a/tests/executables/conformer/init_torch.sh +++ b/tests/executables/conformer/init_torch.sh @@ -1,6 +1,6 @@ #!/bin/bash ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" -SRC_DIR=$ROOT_DIR/audio/speech_recognition/conformer/pytorch +SRC_DIR=$ROOT_DIR/../audio/speech_recognition/conformer/pytorch DATA_DIR=$ROOT_DIR/data # determine whether the user is root mode to execute this script diff --git a/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh b/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh index d9500486f..9b18a1345 100644 --- a/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh +++ b/tests/executables/conformer/train_conformer_librispeech_dist_1x8_torch.sh @@ -3,7 +3,7 @@ source ../_utils/global_environment_variables.sh : ${BATCH_SIZE:=8} ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" -SRC_DIR=$ROOT_DIR/audio/speech_recognition/conformer/pytorch +SRC_DIR=$ROOT_DIR/../audio/speech_recognition/conformer/pytorch DATA_DIR=$ROOT_DIR/data export DRT_MEMCPYUSEKERNEL=20000000000 diff --git a/tests/executables/dali/train_resnet50_dali_torch.sh b/tests/executables/dali/train_resnet50_dali_torch.sh index 6c814901c..8f0a5eb8f 100644 --- a/tests/executables/dali/train_resnet50_dali_torch.sh +++ b/tests/executables/dali/train_resnet50_dali_torch.sh @@ -10,7 +10,7 @@ fi ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \ -python3 ${PROJECT_DIR}/cv/classification/resnet50/pytorch/train.py \ +python3 ${PROJECT_DIR}../cv/classification/resnet50/pytorch/train.py \ --data-path ${PROJECT_DIR}/data/datasets/imagenette \ --batch-size ${BATCH_SIZE} \ --output-dir ${OUTPUT_DIR} \ diff --git a/tests/executables/fairmot/init_torch.sh b/tests/executables/fairmot/init_torch.sh index 6ec2c63c7..e01fe08f0 100644 --- a/tests/executables/fairmot/init_torch.sh +++ b/tests/executables/fairmot/init_torch.sh @@ -7,7 +7,7 @@ unzip -q MOT17.zip mkdir MOT17/images && mkdir MOT17/labels_with_ids mv ./MOT17/train ./MOT17/images/ && mv ./MOT17/test ./MOT17/images/ -cd ${ROOT_DIR}/cv/multi_object_tracking/fairmot/pytorch/ +cd ${ROOT_DIR}/../cv/multi_object_tracking/fairmot/pytorch/ pip3 install Cython pip3 install -r requirements.txt diff --git a/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh b/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh index 748020898..2fbf7ccb4 100644 --- a/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh +++ b/tests/executables/fairmot/train_fairmot_hrnet32_dist_torch.sh @@ -14,7 +14,7 @@ fi ROOT_DIR=${CURRENT_DIR}/../.. 
DADASAT_PATH=${ROOT_DIR}/data/datasets/MOT17 -cd ${ROOT_DIR}/cv/multi_object_tracking/fairmot/pytorch/ +cd ${ROOT_DIR}/../cv/multi_object_tracking/fairmot/pytorch/ bash train_hrnet32_mot17.sh --batch_size $((IX_NUM_CUDA_VISIBLE_DEVICES*BATCH_SIZE)) \ --lr 0.001 \ diff --git a/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh b/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh index 860d6a0a2..0ae3a14e5 100644 --- a/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh +++ b/tests/executables/maskrcnn/train_maskrcnn_resnet50_amp_torch.sh @@ -12,7 +12,7 @@ fi ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 10 --run_script \ -python3 ${PROJECT_DIR}/cv/detection/maskrcnn/pytorch/train.py \ +python3 ${PROJECT_DIR}../cv/detection/maskrcnn/pytorch/train.py \ --model maskrcnn_resnet50_fpn \ --data-path ${PROJECT_DIR}/data/datasets/VOC2012_sample \ --amp \ diff --git a/tests/executables/mobilenetv3/init_torch.sh b/tests/executables/mobilenetv3/init_torch.sh index e8d2163e7..e10f8246a 100644 --- a/tests/executables/mobilenetv3/init_torch.sh +++ b/tests/executables/mobilenetv3/init_torch.sh @@ -12,4 +12,4 @@ echo "prefix_sudo= $prefix_sudo" command -v yum >/dev/null && $prefix_sudo yum install -y numactl || $prefix_sudo apt install -y numactl -pip3 install -r ../../cv/classification/mobilenetv3/pytorch/requirements.txt \ No newline at end of file +pip3 install -r ../../../cv/classification/mobilenetv3/pytorch/requirements.txt \ No newline at end of file diff --git a/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh b/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh index 13a46dc79..f50b8e88e 100644 --- a/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh +++ b/tests/executables/mobilenetv3/train_mobilenetv3_large_amp_torch.sh @@ -16,7 +16,7 @@ check_status() fi } -cd $CURRENT_DIR/../../cv/classification/mobilenetv3/pytorch/ +cd $CURRENT_DIR/../../../cv/classification/mobilenetv3/pytorch/ ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 10 --run_script \ python3 train.py --model mobilenet_v3_large --data-path "${DATA_DIR}" \ --epochs 600 --batch-size ${BATCH_SIZE} --opt sgd --lr 0.1 \ diff --git a/tests/executables/resnet/init_paddle.sh b/tests/executables/resnet/init_paddle.sh index fdb475bd8..70773191f 100644 --- a/tests/executables/resnet/init_paddle.sh +++ b/tests/executables/resnet/init_paddle.sh @@ -13,7 +13,7 @@ if [ ! 
-d "${DATASET_DIR}/flowers102" ]; then tar zxf ${DATASET_DIR}/flowers102.tgz -C ${DATASET_DIR} fi -RESNET_PADDLE_DIR=${PRJ_DIR}/cv/classification/resnet50/paddlepaddle +RESNET_PADDLE_DIR=${PRJ_DIR}/../cv/classification/resnet50/paddlepaddle cd ${RESNET_PADDLE_DIR} pip3 install -r requirements.txt diff --git a/tests/executables/resnet/train_resnet50_amp_torch.sh b/tests/executables/resnet/train_resnet50_amp_torch.sh index 65132746b..46f15ff50 100644 --- a/tests/executables/resnet/train_resnet50_amp_torch.sh +++ b/tests/executables/resnet/train_resnet50_amp_torch.sh @@ -9,7 +9,7 @@ if [[ -d ${OUTPUT_DIR} ]]; then fi ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \ -python3 ${PROJECT_DIR}/cv/classification/resnet50/pytorch/train.py \ +python3 ${PROJECT_DIR}../cv/classification/resnet50/pytorch/train.py \ --data-path ${PROJECT_DIR}/data/datasets/imagenette \ --batch-size ${BATCH_SIZE} \ --lr 0.01 \ diff --git a/tests/executables/resnet/train_resnet50_dist_paddle.sh b/tests/executables/resnet/train_resnet50_dist_paddle.sh index b4dccfde7..777cf7c17 100644 --- a/tests/executables/resnet/train_resnet50_dist_paddle.sh +++ b/tests/executables/resnet/train_resnet50_dist_paddle.sh @@ -8,7 +8,7 @@ if [[ -d ${OUTPUT_DIR} ]]; then mkdir -p ${OUTPUT_DIR} fi -RESNET_PADDLE_DIR=${PROJECT_DIR}/cv/classification/resnet50/paddlepaddle/ +RESNET_PADDLE_DIR=${PROJECT_DIR}../cv/classification/resnet50/paddlepaddle/ cd ${RESNET_PADDLE_DIR} ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 8 --run_script \ diff --git a/tests/executables/resnet/train_resnet50_dist_tf.sh b/tests/executables/resnet/train_resnet50_dist_tf.sh index e6b121f0a..a71d21bb6 100644 --- a/tests/executables/resnet/train_resnet50_dist_tf.sh +++ b/tests/executables/resnet/train_resnet50_dist_tf.sh @@ -16,7 +16,7 @@ check_status() fi } -cd ${ROOT_DIR}/cv/classification/resnet50/tensorflow/ +cd ${ROOT_DIR}/../cv/classification/resnet50/tensorflow/ ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 0.01 --run_script \ bash run_train_resnet50_distributed_imagenette.sh "$@"; check_status diff --git a/tests/executables/retinanet/train_retinanet_amp_torch.sh b/tests/executables/retinanet/train_retinanet_amp_torch.sh index a3431f8f7..d64bfe4fb 100644 --- a/tests/executables/retinanet/train_retinanet_amp_torch.sh +++ b/tests/executables/retinanet/train_retinanet_amp_torch.sh @@ -16,7 +16,7 @@ check_status() } ixdltest-check --nonstrict_mode_args="--epoch ${NONSTRICT_EPOCH}" -b 0 --run_script \ -python3 ../../cv/detection/retinanet/pytorch/train.py \ +python3 ../../../cv/detection/retinanet/pytorch/train.py \ --model retinanet_resnet50_fpn \ --lr 0.01 \ --data-path ${DATA_DIR} \ diff --git a/tests/executables/ssd/init_tf.sh b/tests/executables/ssd/init_tf.sh index a82a9f8f9..51111a05b 100644 --- a/tests/executables/ssd/init_tf.sh +++ b/tests/executables/ssd/init_tf.sh @@ -4,7 +4,7 @@ PROJECT_ROOT="${CUR_DIR}/../.." 
DATASET_DIR="${PROJECT_ROOT}/data/datasets" MODEL_CPT_DIR="${PROJECT_ROOT}/data/model_zoo/ssd_tf" VOC_RECORD_DIR="${DATASET_DIR}/tf_ssd_voc_record" -SSD_ROOT="${PROJECT_ROOT}/cv/detection/ssd/tensorflow" +SSD_ROOT="${PROJECT_ROOT}/../cv/detection/ssd/tensorflow" # determine whether the user is root mode to execute this script prefix_sudo="" diff --git a/tests/executables/ssd/init_torch.sh b/tests/executables/ssd/init_torch.sh index 19211d91a..c4fd1c84e 100644 --- a/tests/executables/ssd/init_torch.sh +++ b/tests/executables/ssd/init_torch.sh @@ -51,7 +51,7 @@ if [[ "$(uname -m)" == "aarch64" ]]; then source /opt/rh/gcc-toolset-11/enable fi -cd ../../cv/detection/ssd/pytorch/ && bash ./clean_ssd.sh && bash ./build_ssd.sh && bash ./install_ssd.sh "$@"; check_status +cd ../../../cv/detection/ssd/pytorch/ && bash ./clean_ssd.sh && bash ./build_ssd.sh && bash ./install_ssd.sh "$@"; check_status DATA_PATH_BBOX=../../../.. python3 prepare-json.py --keep-keys ${DATA_PATH}/annotations/instances_val2017.json ${DATA_PATH_BBOX}/bbox_only_instances_val2017.json "$@"; check_status diff --git a/tests/executables/ssd/train_ssd_amp_tf.sh b/tests/executables/ssd/train_ssd_amp_tf.sh index e7aa985f3..d3b6050cc 100644 --- a/tests/executables/ssd/train_ssd_amp_tf.sh +++ b/tests/executables/ssd/train_ssd_amp_tf.sh @@ -10,7 +10,7 @@ ROOT_DIR="${CURRENT_DIR}/../.." DATASET_PATH="${ROOT_DIR}/data/datasets/imagenette" MODEL_ZOO="${ROOT_DIR}/data/model_zoo" WORKSPACE="${ROOT_DIR}/output/${MODEL_NAME}/$0/${CURTIME}" -SRC_DIR="${ROOT_DIR}/cv/detection/ssd/tensorflow" +SRC_DIR="${ROOT_DIR}/../cv/detection/ssd/tensorflow" EXIT_STATUS=0 check_status() diff --git a/tests/executables/ssd/train_ssd_amp_torch.sh b/tests/executables/ssd/train_ssd_amp_torch.sh index 719664e12..e6c3bea77 100644 --- a/tests/executables/ssd/train_ssd_amp_torch.sh +++ b/tests/executables/ssd/train_ssd_amp_torch.sh @@ -1,25 +1,25 @@ - -COCO_PATH="`pwd`/../../data/datasets/coco2017" - -: ${BATCH_SIZE:=160} - -EXIT_STATUS=0 -check_status() -{ - if ((${PIPESTATUS[0]} != 0)); then - EXIT_STATUS=1 - fi -} - -cd ../../cv/detection/ssd/pytorch/ - -echo "python3 train.py --no-dali --dali-cache 0 --data=${COCO_PATH} \ ---batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \ ---wd=1.6e-4 --use-fp16 --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163" - -python3 train.py --dali --dali-cache 0 --data=${COCO_PATH} \ ---batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \ ---wd=1.6e-4 --use-fp16 --jit --nhwc --pad-input --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163 "$@"; check_status - -cd - -exit ${EXIT_STATUS} + +COCO_PATH="`pwd`/../../data/datasets/coco2017" + +: ${BATCH_SIZE:=160} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +cd ../../../cv/detection/ssd/pytorch/ + +echo "python3 train.py --no-dali --dali-cache 0 --data=${COCO_PATH} \ +--batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \ +--wd=1.6e-4 --use-fp16 --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163" + +python3 train.py --dali --dali-cache 0 --data=${COCO_PATH} \ +--batch-size=${BATCH_SIZE} --warmup-factor=0 --warmup=650 --lr=2.92e-3 --threshold=0.08 --epochs 5 --eval-batch-size=160 \ +--wd=1.6e-4 
--use-fp16 --jit --nhwc --pad-input --delay-allreduce --lr-decay-factor=0.2 --lr-decay-epochs 34 45 --opt-level O2 --seed 1769250163 "$@"; check_status + +cd - +exit ${EXIT_STATUS} diff --git a/tests/executables/stable-diffusion/init_torch.sh b/tests/executables/stable-diffusion/init_torch.sh index 37ed859e4..ea0bb7a09 100644 --- a/tests/executables/stable-diffusion/init_torch.sh +++ b/tests/executables/stable-diffusion/init_torch.sh @@ -1,6 +1,6 @@ #!/bin/bash ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" -SRC_DIR=$ROOT_DIR/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch +SRC_DIR=$ROOT_DIR/../multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch DATA_DIR=$ROOT_DIR/data # install packages diff --git a/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh b/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh index 36ce85c69..d19bab854 100644 --- a/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh +++ b/tests/executables/stable-diffusion/train_sd2.1_pokemon_dist_1x8_torch.sh @@ -6,7 +6,7 @@ export ENABLE_FLASH_ATTENTION_WITH_IXATTNBKD=0 : ${BATCH_SIZE:=8} ROOT_DIR="$(cd "$(dirname "$0")/../.."; pwd)" -SRC_DIR=$ROOT_DIR/multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image +SRC_DIR=$ROOT_DIR/../multimodal/diffusion_model/stable-diffusion-2.1-pokemon/pytorch/examples/text_to_image DATASET_NAME=$ROOT_DIR/data/datasets/pokemon-blip-captions MODEL_NAME=$ROOT_DIR/data/model_zoo/stabilityai/stable-diffusion-2-1 export DRT_MEMCPYUSEKERNEL=20000000000 diff --git a/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh b/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh index 0172b5720..8c581e6e9 100644 --- a/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh +++ b/tests/executables/unet3d/train_unet3d_kits19_stage3_dist_1x8_torch.sh @@ -18,7 +18,7 @@ GRADIENT_ACCUMULATION_STEPS=1 SAVE_CKPT="./ckpt_stage3" LOG_NAME='train_log_stage3.json' -cd ../../cv/semantic_segmentation/unet3d/pytorch +cd ../../../cv/semantic_segmentation/unet3d/pytorch if [ ! 
-d ${SAVE_CKPT} ]; then mkdir ${SAVE_CKPT}; fi diff --git a/tests/executables/yolov5/init_torch.sh b/tests/executables/yolov5/init_torch.sh index 8a26be22b..24efe516a 100644 --- a/tests/executables/yolov5/init_torch.sh +++ b/tests/executables/yolov5/init_torch.sh @@ -12,7 +12,7 @@ install_pip_pkgs "${pkgs[@]}" pip3 install tqdm==4.62.1 -git clone https://gitee.com/deep-spark/deepsparkhub-gpl.git +git clone https://gitee.com/deep-spark/deepsparkhub-gpl.git ../../deepsparkhub-gpl cd ${PROJ_DIR}/deepsparkhub-gpl/cv/detection/yolov5-sample/pytorch -- Gitee From 514da1a4b2da76b6e8ae5595ad7c5956c186513c Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 17 Oct 2025 10:01:06 +0800 Subject: [PATCH 20/20] sync bert paddle and tf2 and resnet tf2 --- .../resnet50/tensorflow2.0/README.md | 182 + .../resnet50/tensorflow2.0/augment.py | 989 +++++ .../resnet50/tensorflow2.0/augment_test.py | 130 + .../resnet50/tensorflow2.0/callbacks.py | 257 ++ .../tensorflow2.0/classifier_trainer.py | 463 +++ .../tensorflow2.0/classifier_trainer_test.py | 240 ++ .../classifier_trainer_util_test.py | 166 + .../resnet50/tensorflow2.0/common/__init__.py | 15 + .../tensorflow2.0/common/dataset_fn.py | 42 + .../tensorflow2.0/common/distribute_utils.py | 233 ++ .../common/distribute_utils_test.py | 59 + .../resnet50/tensorflow2.0/common/flags.py | 110 + .../tensorflow2.0/common/registry_imports.py | 18 + .../tensorflow2.0/configs/__init__.py | 14 + .../tensorflow2.0/configs/base_configs.py | 227 ++ .../resnet50/tensorflow2.0/configs/configs.py | 113 + .../imagenet/efficientnet-b0-gpu.yaml | 52 + .../imagenet/efficientnet-b0-tpu.yaml | 52 + .../imagenet/efficientnet-b1-gpu.yaml | 47 + .../imagenet/efficientnet-b1-tpu.yaml | 51 + .../configs/examples/resnet/imagenet/gpu.yaml | 49 + .../resnet/imagenet/gpu_mirrored.yaml | 51 + .../imagenet/gpu_multi_worker_mirrored.yaml | 53 + .../configs/examples/resnet/imagenet/tpu.yaml | 55 + .../resnet50/tensorflow2.0/core/__init__.py | 14 + .../resnet50/tensorflow2.0/core/actions.py | 94 + .../tensorflow2.0/core/actions_test.py | 81 + .../resnet50/tensorflow2.0/core/base_task.py | 334 ++ .../tensorflow2.0/core/base_trainer.py | 481 +++ .../tensorflow2.0/core/base_trainer_test.py | 406 ++ .../tensorflow2.0/core/config_definitions.py | 252 ++ .../tensorflow2.0/core/exp_factory.py | 36 + .../tensorflow2.0/core/export_base.py | 109 + .../tensorflow2.0/core/export_base_test.py | 88 + .../tensorflow2.0/core/input_reader.py | 423 +++ .../resnet50/tensorflow2.0/core/registry.py | 97 + .../tensorflow2.0/core/registry_test.py | 88 + .../tensorflow2.0/core/task_factory.py | 67 + .../resnet50/tensorflow2.0/core/train_lib.py | 145 + .../tensorflow2.0/core/train_lib_test.py | 137 + .../tensorflow2.0/core/train_utils.py | 395 ++ .../tensorflow2.0/core/train_utils_test.py | 56 + .../resnet50/tensorflow2.0/dataset_factory.py | 544 +++ .../resnet50/tensorflow2.0/download_script.sh | 0 .../tensorflow2.0/efficientnet/__init__.py | 14 + .../efficientnet/common_modules.py | 119 + .../efficientnet/efficientnet_config.py | 78 + .../efficientnet/efficientnet_model.py | 500 +++ .../efficientnet/tfhub_export.py | 68 + .../resnet50/tensorflow2.0/learning_rate.py | 117 + .../tensorflow2.0/learning_rate_test.py | 60 + .../resnet50/tensorflow2.0/mnist_main.py | 176 + .../resnet50/tensorflow2.0/mnist_test.py | 89 + .../tensorflow2.0/modeling/__init__.py | 14 + .../modeling/activations/__init__.py | 21 + .../modeling/activations/gelu.py | 32 + .../modeling/activations/gelu_test.py | 34 + 
.../modeling/activations/relu.py | 31 + .../modeling/activations/relu_test.py | 35 + .../modeling/activations/sigmoid.py | 31 + .../modeling/activations/sigmoid_test.py | 40 + .../modeling/activations/swish.py | 72 + .../modeling/activations/swish_test.py | 44 + .../modeling/hyperparams/__init__.py | 20 + .../modeling/hyperparams/base_config.py | 270 ++ .../modeling/hyperparams/base_config_test.py | 360 ++ .../hyperparams/config_definitions.py | 57 + .../modeling/hyperparams/oneof.py | 57 + .../modeling/hyperparams/oneof_test.py | 71 + .../modeling/hyperparams/params_dict.py | 464 +++ .../modeling/hyperparams/params_dict_test.py | 429 +++ .../modeling/multitask/__init__.py | 14 + .../modeling/multitask/base_model.py | 60 + .../modeling/multitask/base_trainer.py | 176 + .../modeling/multitask/base_trainer_test.py | 90 + .../modeling/multitask/configs.py | 79 + .../modeling/multitask/evaluator.py | 172 + .../modeling/multitask/evaluator_test.py | 138 + .../multitask/interleaving_trainer.py | 92 + .../multitask/interleaving_trainer_test.py | 101 + .../modeling/multitask/multitask.py | 148 + .../modeling/multitask/task_sampler.py | 121 + .../modeling/multitask/task_sampler_test.py | 75 + .../modeling/multitask/test_utils.py | 125 + .../modeling/multitask/train_lib.py | 251 ++ .../modeling/multitask/train_lib_test.py | 120 + .../modeling/optimization/__init__.py | 23 + .../modeling/optimization/configs/__init__.py | 14 + .../configs/learning_rate_config.py | 250 ++ .../configs/optimization_config.py | 114 + .../configs/optimization_config_test.py | 59 + .../optimization/configs/optimizer_config.py | 249 ++ .../modeling/optimization/ema_optimizer.py | 255 ++ .../modeling/optimization/lars_optimizer.py | 186 + .../modeling/optimization/lr_schedule.py | 385 ++ .../modeling/optimization/lr_schedule_test.py | 109 + .../optimization/optimizer_factory.py | 175 + .../optimization/optimizer_factory_test.py | 398 ++ .../modeling/optimization/slide_optimizer.py | 20 + .../tensorflow2.0/modeling/performance.py | 55 + .../modeling/progressive/policies.py | 173 + .../modeling/progressive/train.py | 69 + .../modeling/progressive/train_lib.py | 126 + .../modeling/progressive/train_lib_test.py | 183 + .../modeling/progressive/trainer.py | 294 ++ .../modeling/progressive/trainer_test.py | 238 ++ .../modeling/progressive/utils.py | 56 + .../tensorflow2.0/modeling/tf_utils.py | 200 + .../tensorflow2.0/optimizer_factory.py | 183 + .../tensorflow2.0/optimizer_factory_test.py | 119 + .../resnet50/tensorflow2.0/preprocessing.py | 391 ++ .../resnet50/tensorflow2.0/resnet/README.md | 125 + .../resnet50/tensorflow2.0/resnet/__init__.py | 14 + .../resnet50/tensorflow2.0/resnet/common.py | 418 ++ .../resnet/imagenet_preprocessing.py | 574 +++ .../tensorflow2.0/resnet/resnet_config.py | 55 + .../resnet/resnet_ctl_imagenet_main.py | 195 + .../tensorflow2.0/resnet/resnet_model.py | 326 ++ .../tensorflow2.0/resnet/resnet_runnable.py | 216 ++ .../tensorflow2.0/resnet/tfhub_export.py | 67 + .../run_train_resnet50_mirrored_imagenette.sh | 45 + ...ain_resnet50_worker_mirrored_imagenette.sh | 56 + .../tensorflow2.0/staging/__init__.py | 14 + .../staging/training/__init__.py | 14 + .../staging/training/grad_utils.py | 151 + .../resnet50/tensorflow2.0/test_utils.py | 37 + .../resnet50/tensorflow2.0/utils/__init__.py | 14 + .../utils/docs/build_api_docs_lib.py | 54 + .../utils/docs/build_nlp_api_docs.py | 95 + .../utils/docs/build_vision_api_docs.py | 93 + .../tensorflow2.0/utils/flags/README.md | 102 + 
.../tensorflow2.0/utils/flags/__init__.py | 14 + .../tensorflow2.0/utils/flags/_base.py | 177 + .../tensorflow2.0/utils/flags/_benchmark.py | 117 + .../tensorflow2.0/utils/flags/_conventions.py | 50 + .../tensorflow2.0/utils/flags/_device.py | 90 + .../utils/flags/_distribution.py | 52 + .../tensorflow2.0/utils/flags/_misc.py | 48 + .../tensorflow2.0/utils/flags/_performance.py | 294 ++ .../tensorflow2.0/utils/flags/core.py | 130 + .../tensorflow2.0/utils/flags/flags_test.py | 162 + .../tensorflow2.0/utils/flags/guidelines.md | 65 + .../tensorflow2.0/utils/hyperparams_flags.py | 123 + .../tensorflow2.0/utils/misc/__init__.py | 14 + .../utils/misc/distribution_utils.py | 17 + .../tensorflow2.0/utils/misc/keras_utils.py | 211 ++ .../tensorflow2.0/utils/misc/model_helpers.py | 94 + .../utils/misc/model_helpers_test.py | 127 + .../tensorflow2.0/utils/testing/__init__.py | 14 + .../utils/testing/integration.py | 70 + .../tensorflow2.0/utils/testing/mock_task.py | 101 + .../tensorflow2.0/utils/testing/pylint.rcfile | 168 + .../utils/testing/scripts/builds_common.sh | 64 + .../utils/testing/scripts/ci_sanity.sh | 132 + .../utils/testing/scripts/presubmit.sh | 73 + .../bert/paddlepaddle/README.md | 248 ++ .../paddlepaddle/create_pretraining_data.py | 497 +++ .../bert/paddlepaddle/data/sample_text.txt | 100 + .../bert/paddlepaddle/export_model.py | 78 + .../paddlepaddle/paddlenlp_3.0.0/__init__.py | 62 + .../paddlenlp_3.0.0/cli/__init__.py | 14 + .../paddlenlp_3.0.0/cli/bos_community.py | 96 + .../paddlenlp_3.0.0/cli/download.py | 58 + .../paddlenlp_3.0.0/cli/install.py | 47 + .../paddlepaddle/paddlenlp_3.0.0/cli/main.py | 249 ++ .../paddlenlp_3.0.0/cli/server.py | 26 + .../paddlenlp_3.0.0/cli/utils/__init__.py | 13 + .../paddlenlp_3.0.0/cli/utils/tabulate.py | 84 + .../paddlenlp_3.0.0/data/__init__.py | 22 + .../paddlenlp_3.0.0/data/blendable_dataset.py | 173 + .../paddlenlp_3.0.0/data/causal_dataset.py | 711 ++++ .../paddlenlp_3.0.0/data/collate.py | 321 ++ .../paddlenlp_3.0.0/data/data_collator.py | 887 +++++ .../paddlenlp_3.0.0/data/dist_dataloader.py | 214 ++ .../paddlenlp_3.0.0/data/indexed_dataset.py | 972 +++++ .../paddlenlp_3.0.0/data/iterator.py | 15 + .../paddlenlp_3.0.0/data/sampler.py | 416 ++ .../paddlenlp_3.0.0/data/tokenizer.py | 131 + .../paddlenlp_3.0.0/data/vocab.py | 579 +++ .../paddlenlp_3.0.0/dataaug/__init__.py | 18 + .../paddlenlp_3.0.0/dataaug/base_augment.py | 241 ++ .../paddlenlp_3.0.0/dataaug/char.py | 570 +++ .../paddlenlp_3.0.0/dataaug/sentence.py | 552 +++ .../paddlenlp_3.0.0/dataaug/word.py | 635 ++++ .../paddlenlp_3.0.0/datasets/README.md | 0 .../paddlenlp_3.0.0/datasets/__init__.py | 46 + .../paddlenlp_3.0.0/datasets/advertisegen.py | 75 + .../paddlenlp_3.0.0/datasets/bellegroup.py | 108 + .../paddlenlp_3.0.0/datasets/bq_corpus.py | 73 + .../paddlenlp_3.0.0/datasets/bstc.py | 156 + .../paddlenlp_3.0.0/datasets/c3.py | 113 + .../datasets/cail2018_small.py | 487 +++ .../paddlenlp_3.0.0/datasets/cail2019_scm.py | 76 + .../paddlenlp_3.0.0/datasets/cblue.py | 456 +++ .../paddlenlp_3.0.0/datasets/chnsenticorp.py | 80 + .../datasets/chnsenticorp_v2.py | 76 + .../paddlenlp_3.0.0/datasets/clue.py | 271 ++ .../paddlenlp_3.0.0/datasets/cmrc2018.py | 83 + .../paddlenlp_3.0.0/datasets/cnn_dailymail.py | 231 ++ .../paddlenlp_3.0.0/datasets/conll2002.py | 159 + .../paddlenlp_3.0.0/datasets/cote.py | 106 + .../paddlenlp_3.0.0/datasets/couplet.py | 101 + .../paddlenlp_3.0.0/datasets/dataset.py | 781 ++++ .../paddlenlp_3.0.0/datasets/drcd.py | 84 + 
.../paddlenlp_3.0.0/datasets/drcd_cn.py | 76 + .../paddlenlp_3.0.0/datasets/duconv.py | 58 + .../datasets/dureader_checklist.py | 94 + .../paddlenlp_3.0.0/datasets/dureader_qg.py | 72 + .../datasets/dureader_robust.py | 73 + .../datasets/dureader_yesno.py | 67 + .../paddlenlp_3.0.0/datasets/fewclue.py | 336 ++ .../paddlenlp_3.0.0/datasets/glue.py | 288 ++ .../datasets/hf_datasets/__init__.py | 13 + .../datasets/hf_datasets/chnsenticorp.py | 120 + .../datasets/hf_datasets/clue.py | 552 +++ .../datasets/hf_datasets/cmrc2018.py | 135 + .../datasets/hf_datasets/cnn_dailymail.py | 276 ++ .../datasets/hf_datasets/cote.py | 143 + .../datasets/hf_datasets/docvqa_zh.py | 131 + .../datasets/hf_datasets/duconv.py | 126 + .../datasets/hf_datasets/dureader_robust.py | 129 + .../datasets/hf_datasets/funsd.py | 141 + .../datasets/hf_datasets/glue.py | 625 +++ .../datasets/hf_datasets/imdb.py | 109 + .../datasets/hf_datasets/language_pair.py | 189 + .../datasets/hf_datasets/msra_ner.py | 147 + .../datasets/hf_datasets/mt_eng_vietnamese.py | 124 + .../datasets/hf_datasets/ptb_text_only.py | 144 + .../datasets/hf_datasets/rvl_cdip_sampled.py | 144 + .../datasets/hf_datasets/seabsa16.py | 136 + .../datasets/hf_datasets/squad.py | 139 + .../datasets/hf_datasets/squad_v2.py | 144 + .../datasets/hf_datasets/xfund_zh.py | 153 + .../datasets/hf_datasets/xnli.py | 209 + .../paddlenlp_3.0.0/datasets/hyp.py | 68 + .../paddlenlp_3.0.0/datasets/imdb.py | 73 + .../paddlenlp_3.0.0/datasets/iwslt15.py | 123 + .../paddlenlp_3.0.0/datasets/lcqmc.py | 70 + .../paddlenlp_3.0.0/datasets/lcqmc_v2.py | 74 + .../paddlenlp_3.0.0/datasets/lcsts_new.py | 67 + .../paddlenlp_3.0.0/datasets/msra_ner.py | 67 + .../datasets/nlpcc13_evsam05_hit.py | 95 + .../datasets/nlpcc13_evsam05_thu.py | 91 + .../paddlenlp_3.0.0/datasets/nlpcc14_sc.py | 80 + .../paddlenlp_3.0.0/datasets/nlpcc_dbqa.py | 77 + .../paddlenlp_3.0.0/datasets/paws-x.py | 70 + .../datasets/peoples_daily_ner.py | 68 + .../paddlenlp_3.0.0/datasets/poetry.py | 59 + .../paddlenlp_3.0.0/datasets/ptb.py | 60 + .../paddlenlp_3.0.0/datasets/seabsa16.py | 100 + .../paddlenlp_3.0.0/datasets/sighan-cn.py | 51 + .../paddlenlp_3.0.0/datasets/squad.py | 97 + .../paddlenlp_3.0.0/datasets/thucnews.py | 69 + .../paddlenlp_3.0.0/datasets/triviaqa.py | 80 + .../paddlenlp_3.0.0/datasets/wmt14ende.py | 139 + .../paddlenlp_3.0.0/datasets/wos.py | 221 ++ .../paddlenlp_3.0.0/datasets/xnli.py | 174 + .../paddlenlp_3.0.0/datasets/xnli_cn.py | 87 + .../datasets/yahoo_answer_100k.py | 72 + .../datasets/zero_padding_dataset.py | 266 ++ .../paddlenlp_3.0.0/embeddings/__init__.py | 15 + .../paddlenlp_3.0.0/embeddings/constant.py | 98 + .../embeddings/token_embedding.py | 378 ++ .../paddlenlp_3.0.0/experimental/__init__.py | 17 + .../experimental/autonlp/README.md | 146 + .../experimental/autonlp/README_en.md | 147 + .../experimental/autonlp/__init__.py | 15 + .../experimental/autonlp/auto_trainer_base.py | 383 ++ .../experimental/autonlp/requirements.txt | 4 + .../autonlp/text_classification.py | 764 ++++ .../experimental/autonlp/utils.py | 34 + .../experimental/ernie_model.py | 304 ++ .../experimental/faster_tokenizer.py | 152 + .../experimental/model_utils.py | 427 +++ .../experimental/transformers/__init__.py | 23 + .../transformers/bloom/__init__.py | 15 + .../transformers/bloom/modeling.py | 768 ++++ .../transformers/chatglm/__init__.py | 15 + .../transformers/chatglm/modeling.py | 745 ++++ .../transformers/chatglm_v2/__init__.py | 15 + .../transformers/chatglm_v2/modeling.py | 487 +++ 
.../transformers/fused_transformer_layers.py | 1679 ++++++++ .../transformers/generation_utils.py | 1004 +++++ .../experimental/transformers/gpt/__init__.py | 15 + .../experimental/transformers/gpt/modeling.py | 578 +++ .../transformers/llama/__init__.py | 15 + .../transformers/llama/modeling.py | 1813 +++++++++ .../transformers/llama/ptq_scales_map.json | 21 + .../llama/ptq_scales_map_shift_smooth.json | 21 + .../experimental/transformers/opt/__init__.py | 15 + .../experimental/transformers/opt/modeling.py | 556 +++ .../transformers/qwen/__init__.py | 15 + .../transformers/qwen/modeling.py | 643 ++++ .../transformers/qwen2/__init__.py | 15 + .../transformers/qwen2/modeling.py | 1264 +++++++ .../transformers/qwen2/ptq_scales_map.json | 21 + .../qwen2/ptq_scales_map_shift_smooth.json | 21 + .../experimental/transformers/utils.py | 159 + .../paddlenlp_3.0.0/generation/__init__.py | 34 + .../generation/configuration_utils.py | 597 +++ .../generation/logits_process.py | 646 ++++ .../generation/stopping_criteria.py | 91 + .../paddlenlp_3.0.0/generation/streamers.py | 216 ++ .../paddlenlp_3.0.0/generation/utils.py | 1838 +++++++++ .../paddlenlp_3.0.0/layers/__init__.py | 23 + .../paddlenlp_3.0.0/layers/crf.py | 417 ++ .../paddlenlp_3.0.0/layers/globalpointer.py | 145 + .../paddlenlp_3.0.0/layers/linear.py | 59 + .../paddlenlp_3.0.0/layers/sequence.py | 33 + .../paddlenlp_3.0.0/layers/tcn.py | 154 + .../paddlenlp_3.0.0/losses/__init__.py | 15 + .../paddlenlp_3.0.0/losses/rdrop.py | 69 + .../paddlenlp_3.0.0/metrics/README.md | 14 + .../paddlenlp_3.0.0/metrics/__init__.py | 23 + .../paddlenlp_3.0.0/metrics/bleu.py | 276 ++ .../paddlenlp_3.0.0/metrics/chunk.py | 195 + .../paddlenlp_3.0.0/metrics/distinct.py | 164 + .../paddlenlp_3.0.0/metrics/dureader.py | 340 ++ .../paddlenlp_3.0.0/metrics/glue.py | 668 ++++ .../paddlenlp_3.0.0/metrics/mrr.py | 68 + .../paddlenlp_3.0.0/metrics/perplexity.py | 145 + .../paddlenlp_3.0.0/metrics/rouge.py | 284 ++ .../paddlenlp_3.0.0/metrics/sighan.py | 103 + .../paddlenlp_3.0.0/metrics/span.py | 103 + .../paddlenlp_3.0.0/metrics/squad.py | 436 +++ .../paddlenlp_3.0.0/metrics/utils.py | 40 + .../paddlenlp_3.0.0/ops/__init__.py | 19 + .../ops/distributed/__init__.py | 22 + .../ops/distributed/parallel.py | 311 ++ .../ops/distributed/utils/__init__.py | 21 + .../ops/distributed/utils/random.py | 59 + .../ops/distributed/utils/topo.py | 84 + .../paddlenlp_3.0.0/ops/einsum.py | 367 ++ .../paddlenlp_3.0.0/ops/optimizer/__init__.py | 19 + .../paddlenlp_3.0.0/ops/optimizer/adamwdl.py | 257 ++ .../paddlenlp_3.0.0/ops/optimizer/ema.py | 48 + .../paddlenlp_3.0.0/ops/optimizer/lr.py | 57 + .../paddlenlp_3.0.0/peft/__init__.py | 17 + .../paddlenlp_3.0.0/peft/lora/__init__.py | 17 + .../paddlenlp_3.0.0/peft/lora/lora_config.py | 176 + .../paddlenlp_3.0.0/peft/lora/lora_layers.py | 802 ++++ .../paddlenlp_3.0.0/peft/lora/lora_model.py | 861 +++++ .../peft/lora/lora_quant_layers.py | 272 ++ .../peft/lora/lora_quantization_layers.py | 504 +++ .../peft/lora/lora_quick_layers.py | 223 ++ .../paddlenlp_3.0.0/peft/prefix/__init__.py | 23 + .../peft/prefix/prefix_config.py | 102 + .../peft/prefix/prefix_model.py | 539 +++ .../paddlenlp_3.0.0/peft/prefix/utils.py | 52 + .../paddlenlp_3.0.0/peft/vera/__init__.py | 17 + .../paddlenlp_3.0.0/peft/vera/vera_config.py | 131 + .../paddlenlp_3.0.0/peft/vera/vera_layers.py | 149 + .../paddlenlp_3.0.0/peft/vera/vera_model.py | 284 ++ .../paddlenlp_3.0.0/prompt/__init__.py | 21 + .../paddlenlp_3.0.0/prompt/prompt_args.py | 83 + 
.../paddlenlp_3.0.0/prompt/prompt_model.py | 162 + .../prompt/prompt_tokenizer.py | 224 ++ .../paddlenlp_3.0.0/prompt/prompt_trainer.py | 316 ++ .../paddlenlp_3.0.0/prompt/prompt_utils.py | 208 + .../paddlenlp_3.0.0/prompt/template.py | 937 +++++ .../paddlenlp_3.0.0/prompt/verbalizer.py | 461 +++ .../paddlenlp_3.0.0/quantization/__init__.py | 15 + .../paddlenlp_3.0.0/quantization/qlora.py | 115 + .../quantization/quantization_config.py | 150 + .../quantization/quantization_linear.py | 401 ++ .../quantization/quantization_utils.py | 197 + .../paddlenlp_3.0.0/seq2vec/__init__.py | 15 + .../paddlenlp_3.0.0/seq2vec/encoder.py | 997 +++++ .../paddlenlp_3.0.0/server/__init__.py | 16 + .../paddlenlp_3.0.0/server/base_router.py | 32 + .../server/handlers/__init__.py | 23 + .../server/handlers/base_handler.py | 46 + .../server/handlers/cls_post_handler.py | 71 + .../server/handlers/custom_model_handler.py | 156 + .../server/handlers/qa_model_handler.py | 89 + .../server/handlers/taskflow_handler.py | 34 + .../server/handlers/token_model_handler.py | 114 + .../server/http_router/__init__.py | 15 + .../server/http_router/router.py | 119 + .../paddlenlp_3.0.0/server/model_manager.py | 96 + .../paddlenlp_3.0.0/server/predictor.py | 202 + .../paddlenlp_3.0.0/server/server.py | 83 + .../server/taskflow_manager.py | 40 + .../paddlenlp_3.0.0/server/utils.py | 25 + .../paddlenlp_3.0.0/taskflow/__init__.py | 15 + .../taskflow/code_generation.py | 167 + .../taskflow/dependency_parsing.py | 736 ++++ .../paddlenlp_3.0.0/taskflow/dialogue.py | 370 ++ .../taskflow/document_intelligence.py | 252 ++ .../paddlenlp_3.0.0/taskflow/fill_mask.py | 167 + .../taskflow/information_extraction.py | 1592 ++++++++ .../taskflow/knowledge_mining.py | 773 ++++ .../taskflow/lexical_analysis.py | 265 ++ .../taskflow/models/__init__.py | 18 + .../models/dependency_parsing_model.py | 229 ++ .../taskflow/models/lexical_analysis_model.py | 100 + .../models/sentiment_analysis_model.py | 151 + .../taskflow/models/text_correction_model.py | 127 + .../taskflow/multimodal_feature_extraction.py | 463 +++ .../taskflow/named_entity_recognition.py | 240 ++ .../taskflow/poetry_generation.py | 51 + .../paddlenlp_3.0.0/taskflow/pos_tagging.py | 81 + .../taskflow/question_answering.py | 52 + .../taskflow/question_generation.py | 454 +++ .../taskflow/sentiment_analysis.py | 881 +++++ .../paddlenlp_3.0.0/taskflow/task.py | 529 +++ .../paddlenlp_3.0.0/taskflow/taskflow.py | 869 +++++ .../taskflow/text2text_generation.py | 252 ++ .../taskflow/text_classification.py | 369 ++ .../taskflow/text_correction.py | 265 ++ .../taskflow/text_feature_extraction.py | 585 +++ .../taskflow/text_generation.py | 158 + .../taskflow/text_similarity.py | 353 ++ .../taskflow/text_summarization.py | 315 ++ .../paddlenlp_3.0.0/taskflow/utils.py | 2548 +++++++++++++ .../taskflow/word_segmentation.py | 173 + .../taskflow/zero_shot_text_classification.py | 427 +++ .../paddlenlp_3.0.0/trainer/__init__.py | 24 + .../paddlenlp_3.0.0/trainer/argparser.py | 296 ++ .../paddlenlp_3.0.0/trainer/auto_trainer.py | 745 ++++ .../trainer/compression_args.py | 225 ++ .../paddlenlp_3.0.0/trainer/integrations.py | 432 +++ .../trainer/plugins/__init__.py | 13 + .../trainer/plugins/npu_plugin.py | 127 + .../trainer/plugins/shared_memory_utils.py | 148 + .../paddlenlp_3.0.0/trainer/plugins/timer.py | 153 + .../trainer/plugins/unified_checkpoint.py | 2349 ++++++++++++ .../paddlenlp_3.0.0/trainer/trainer.py | 3287 ++++++++++++++++ .../trainer/trainer_callback.py | 596 +++ 
.../trainer/trainer_compress.py | 1035 +++++ .../trainer/trainer_seq2seq.py | 248 ++ .../paddlenlp_3.0.0/trainer/trainer_utils.py | 1101 ++++++ .../paddlenlp_3.0.0/trainer/training_args.py | 2053 ++++++++++ .../trainer/training_args_seq2seq.py | 68 + .../paddlenlp_3.0.0/trainer/utils/__init__.py | 21 + .../trainer/utils/async_save.py | 126 + .../paddlenlp_3.0.0/trainer/utils/doc.py | 54 + .../paddlenlp_3.0.0/trainer/utils/helper.py | 338 ++ .../trainer/utils/reshard/__init__.py | 23 + .../trainer/utils/reshard/common.py | 587 +++ .../trainer/utils/reshard/pp_reshard.py | 336 ++ .../trainer/utils/reshard/sharding_v1.py | 42 + .../trainer/utils/reshard/sharding_v2.py | 232 ++ .../trainer/utils/sharding_io.py | 605 +++ .../paddlenlp_3.0.0/transformers/__init__.py | 308 ++ .../transformers/activations.py | 174 + .../transformers/aistudio_utils.py | 67 + .../transformers/albert/__init__.py | 13 + .../transformers/albert/configuration.py | 448 +++ .../transformers/albert/modeling.py | 1554 ++++++++ .../transformers/albert/tokenizer.py | 801 ++++ .../transformers/artist/__init__.py | 13 + .../transformers/artist/configuration.py | 120 + .../transformers/artist/modeling.py | 64 + .../transformers/artist/tokenizer.py | 253 ++ .../transformers/attention_utils.py | 619 +++ .../transformers/audio_utils.py | 694 ++++ .../transformers/auto/__init__.py | 13 + .../transformers/auto/configuration.py | 207 + .../transformers/auto/image_processing.py | 183 + .../transformers/auto/modeling.py | 1024 +++++ .../transformers/auto/processing.py | 193 + .../transformers/auto/tokenizer.py | 269 ++ .../transformers/bart/__init__.py | 13 + .../transformers/bart/configuration.py | 197 + .../transformers/bart/modeling.py | 1407 +++++++ .../transformers/bart/tokenizer.py | 398 ++ .../transformers/bert/__init__.py | 13 + .../transformers/bert/configuration.py | 407 ++ .../transformers/bert/modeling.py | 1421 +++++++ .../transformers/bert/modeling.pyi | 345 ++ .../transformers/bert/tokenizer.py | 630 +++ .../transformers/bert_japanese/__init__.py | 13 + .../transformers/bert_japanese/tokenizer.py | 354 ++ .../transformers/bigbird/__init__.py | 13 + .../transformers/bigbird/configuration.py | 208 + .../transformers/bigbird/modeling.py | 1706 +++++++++ .../transformers/bigbird/tokenizer.py | 400 ++ .../transformers/bit/__init__.py | 13 + .../transformers/bit/configuration.py | 130 + .../transformers/bit/image_processing.py | 328 ++ .../transformers/bit/modeling.py | 915 +++++ .../transformers/blenderbot/__init__.py | 13 + .../transformers/blenderbot/configuration.py | 203 + .../transformers/blenderbot/modeling.py | 749 ++++ .../transformers/blenderbot/tokenizer.py | 161 + .../transformers/blenderbot_small/__init__.py | 13 + .../blenderbot_small/configuration.py | 161 + .../transformers/blenderbot_small/modeling.py | 752 ++++ .../blenderbot_small/tokenizer.py | 220 ++ .../transformers/blip/__init__.py | 13 + .../transformers/blip/configuration.py | 393 ++ .../transformers/blip/image_processing.py | 285 ++ .../transformers/blip/modeling.py | 1590 ++++++++ .../transformers/blip/modeling_text.py | 1101 ++++++ .../transformers/blip/processing.py | 119 + .../transformers/blip_2/__init__.py | 13 + .../transformers/blip_2/configuration.py | 366 ++ .../transformers/blip_2/modeling.py | 1679 ++++++++ .../transformers/blip_2/processing.py | 120 + .../transformers/bloom/__init__.py | 13 + .../transformers/bloom/configuration.py | 155 + .../transformers/bloom/modeling.py | 1907 ++++++++++ .../transformers/bloom/processor.py | 176 + 
.../transformers/bloom/tokenizer.py | 411 ++ .../transformers/chatglm/LICENSE | 65 + .../transformers/chatglm/__init__.py | 13 + .../transformers/chatglm/configuration.py | 137 + .../transformers/chatglm/modeling.py | 986 +++++ .../transformers/chatglm/tokenizer.py | 287 ++ .../transformers/chatglm_v2/LICENSE | 65 + .../transformers/chatglm_v2/__init__.py | 13 + .../chatglm-legacy-checkpoints-convert.py | 51 + .../transformers/chatglm_v2/configuration.py | 90 + .../transformers/chatglm_v2/modeling.py | 859 +++++ .../transformers/chatglm_v2/tokenizer.py | 322 ++ .../transformers/chinesebert/__init__.py | 13 + .../transformers/chinesebert/configuration.py | 181 + .../transformers/chinesebert/modeling.py | 822 ++++ .../transformers/chinesebert/tokenizer.py | 759 ++++ .../transformers/chineseclip/__init__.py | 13 + .../transformers/chineseclip/configuration.py | 380 ++ .../transformers/chineseclip/converter.py | 301 ++ .../chineseclip/feature_extraction.py | 32 + .../chineseclip/image_processing.py | 328 ++ .../transformers/chineseclip/modeling.py | 1036 +++++ .../transformers/chineseclip/processing.py | 153 + .../transformers/chineseclip/tokenizer.py | 29 + .../transformers/clap/__init__.py | 13 + .../transformers/clap/configuration.py | 450 +++ .../transformers/clap/feature_extraction.py | 358 ++ .../transformers/clap/modeling.py | 2285 +++++++++++ .../transformers/clap/processing.py | 120 + .../transformers/clip/__init__.py | 13 + .../transformers/clip/configuration.py | 509 +++ .../transformers/clip/feature_extraction.py | 32 + .../transformers/clip/image_processing.py | 327 ++ .../transformers/clip/modeling.py | 1705 +++++++++ .../transformers/clip/processing.py | 156 + .../transformers/clip/tokenizer.py | 553 +++ .../transformers/clipseg/__init__.py | 13 + .../transformers/clipseg/configuration.py | 413 ++ .../transformers/clipseg/image_processing.py | 263 ++ .../transformers/clipseg/modeling.py | 1364 +++++++ .../transformers/clipseg/processing.py | 157 + .../transformers/codegen/__init__.py | 13 + .../transformers/codegen/configuration.py | 120 + .../transformers/codegen/modeling.py | 688 ++++ .../transformers/codegen/tokenizer.py | 128 + .../transformers/configuration_utils.py | 1231 ++++++ .../transformers/context_parallel_utils.py | 64 + .../transformers/convbert/__init__.py | 13 + .../transformers/convbert/configuration.py | 313 ++ .../transformers/convbert/modeling.py | 1546 ++++++++ .../transformers/convbert/tokenizer.py | 44 + .../transformers/conversion_utils.py | 1544 ++++++++ .../transformers/convert_slow_tokenizer.py | 324 ++ .../transformers/ctrl/__init__.py | 13 + .../transformers/ctrl/configuration.py | 145 + .../transformers/ctrl/modeling.py | 748 ++++ .../transformers/ctrl/tokenizer.py | 357 ++ .../transformers/dallebart/__init__.py | 13 + .../transformers/dallebart/configuration.py | 254 ++ .../transformers/dallebart/modeling.py | 1350 +++++++ .../transformers/dallebart/tokenizer.py | 503 +++ .../transformers/deberta/__init__.py | 13 + .../transformers/deberta/configuration.py | 169 + .../transformers/deberta/modeling.py | 1378 +++++++ .../transformers/deberta/tokenizer.py | 413 ++ .../transformers/deberta_v2/__init__.py | 13 + .../transformers/deberta_v2/configuration.py | 260 ++ .../transformers/deberta_v2/modeling.py | 1482 ++++++++ .../transformers/deberta_v2/tokenizer.py | 587 +++ .../transformers/distilbert/__init__.py | 13 + .../transformers/distilbert/configuration.py | 169 + .../transformers/distilbert/modeling.py | 585 +++ 
.../transformers/distilbert/tokenizer.py | 73 + .../transformers/distill_utils.py | 397 ++ .../transformers/dpt/__init__.py | 13 + .../transformers/dpt/configuration.py | 226 ++ .../transformers/dpt/image_processing.py | 373 ++ .../transformers/dpt/modeling.py | 1336 +++++++ .../transformers/electra/__init__.py | 13 + .../transformers/electra/configuration.py | 293 ++ .../transformers/electra/converter.py | 109 + .../transformers/electra/modeling.py | 1813 +++++++++ .../transformers/electra/tokenizer.py | 309 ++ .../transformers/ernie/README.md | 1 + .../transformers/ernie/__init__.py | 13 + .../transformers/ernie/configuration.py | 1291 +++++++ .../transformers/ernie/modeling.py | 1381 +++++++ .../match_static_to_dygraph.py | 160 + .../transformers/ernie/tokenizer.py | 918 +++++ .../transformers/ernie_code/__init__.py | 13 + .../transformers/ernie_code/configuration.py | 198 + .../transformers/ernie_code/modeling.py | 1751 +++++++++ .../transformers/ernie_code/tokenizer.py | 200 + .../transformers/ernie_ctm/__init__.py | 13 + .../transformers/ernie_ctm/configuration.py | 150 + .../transformers/ernie_ctm/modeling.py | 830 ++++ .../transformers/ernie_ctm/tokenizer.py | 282 ++ .../transformers/ernie_doc/__init__.py | 13 + .../transformers/ernie_doc/configuration.py | 165 + .../transformers/ernie_doc/modeling.py | 808 ++++ .../transformers/ernie_doc/tokenizer.py | 193 + .../transformers/ernie_gen/__init__.py | 13 + .../transformers/ernie_gen/modeling.py | 633 ++++ .../transformers/ernie_gen/params_map.json | 1 + .../transformers/ernie_gram/__init__.py | 13 + .../transformers/ernie_gram/configuration.py | 160 + .../ernie_gram/matching_param_name.py | 110 + .../transformers/ernie_gram/modeling.py | 703 ++++ .../transformers/ernie_gram/tokenizer.py | 103 + .../transformers/ernie_layout/__init__.py | 13 + .../ernie_layout/configuration.py | 205 + .../transformers/ernie_layout/modeling.py | 1183 ++++++ .../transformers/ernie_layout/tokenizer.py | 299 ++ .../ernie_layout/visual_backbone.py | 214 ++ .../transformers/ernie_m/__init__.py | 13 + .../transformers/ernie_m/configuration.py | 177 + .../transformers/ernie_m/modeling.py | 834 ++++ .../transformers/ernie_m/tokenizer.py | 348 ++ .../transformers/ernie_vil/__init__.py | 13 + .../transformers/ernie_vil/configuration.py | 345 ++ .../ernie_vil/feature_extraction.py | 32 + .../ernie_vil/image_processing.py | 328 ++ .../transformers/ernie_vil/modeling.py | 672 ++++ .../transformers/ernie_vil/processing.py | 149 + .../transformers/ernie_vil/tokenizer.py | 36 + .../paddlenlp_3.0.0/transformers/export.py | 68 + .../feature_extraction_sequence_utils.py | 366 ++ .../transformers/feature_extraction_utils.py | 378 ++ .../transformers/fnet/__init__.py | 13 + .../transformers/fnet/configuration.py | 142 + .../transformers/fnet/modeling.py | 936 +++++ .../transformers/fnet/tokenizer.py | 208 + .../transformers/funnel/__init__.py | 16 + .../transformers/funnel/configuration.py | 206 + .../transformers/funnel/modeling.py | 1581 ++++++++ .../transformers/funnel/tokenizer.py | 134 + .../transformers/gau_alpha/__init__.py | 13 + .../transformers/gau_alpha/configuration.py | 161 + .../transformers/gau_alpha/modeling.py | 810 ++++ .../transformers/gau_alpha/tokenizer.py | 292 ++ .../transformers/gemma/__init__.py | 18 + .../transformers/gemma/configuration.py | 171 + .../transformers/gemma/modeling.py | 1547 ++++++++ .../transformers/gemma/modeling_pp.py | 313 ++ .../transformers/gemma/tokenizer.py | 360 ++ .../transformers/glm/__init__.py | 13 + 
.../transformers/glm/configuration.py | 252 ++ .../transformers/glm/modeling.py | 878 +++++ .../transformers/glm/tokenizer.py | 501 +++ .../transformers/gpt/__init__.py | 19 + .../transformers/gpt/configuration.py | 303 ++ .../transformers/gpt/modeling.py | 1913 ++++++++++ .../transformers/gpt/modeling_auto.py | 1333 +++++++ .../transformers/gpt/modeling_pp.py | 231 ++ .../transformers/gpt/tokenizer.py | 637 ++++ .../transformers/gptj/__init__.py | 13 + .../transformers/gptj/configuration.py | 145 + .../transformers/gptj/modeling.py | 799 ++++ .../transformers/gptj/tokenizer.py | 49 + .../transformers/image_processing_utils.py | 547 +++ .../transformers/image_transforms.py | 655 ++++ .../transformers/image_utils.py | 621 +++ .../transformers/jamba/__init__.py | 13 + .../transformers/jamba/configuration.py | 223 ++ .../transformers/jamba/modeling.py | 2010 ++++++++++ .../transformers/jamba/tokenizer.py | 26 + .../transformers/layoutlm/__init__.py | 13 + .../transformers/layoutlm/configuration.py | 158 + .../transformers/layoutlm/modeling.py | 662 ++++ .../transformers/layoutlm/tokenizer.py | 42 + .../transformers/layoutlmv2/__init__.py | 13 + .../transformers/layoutlmv2/configuration.py | 252 ++ .../transformers/layoutlmv2/modeling.py | 1203 ++++++ .../transformers/layoutlmv2/tokenizer.py | 49 + .../transformers/layoutxlm/__init__.py | 13 + .../transformers/layoutxlm/configuration.py | 246 ++ .../transformers/layoutxlm/modeling.py | 1411 +++++++ .../transformers/layoutxlm/tokenizer.py | 170 + .../transformers/layoutxlm/visual_backbone.py | 737 ++++ .../layoutxlm/visual_backbone.yaml | 323 ++ .../transformers/linear_utils.py | 84 + .../transformers/llama/LICENSE | 76 + .../transformers/llama/Llama2.LICENSE | 126 + .../transformers/llama/__init__.py | 21 + .../transformers/llama/configuration.py | 209 + .../transformers/llama/fusion_ops.py | 255 ++ .../transformers/llama/modeling.py | 2008 ++++++++++ .../transformers/llama/modeling_auto.py | 1308 +++++++ .../llama/modeling_auto_static.py | 1251 ++++++ .../transformers/llama/modeling_pp.py | 373 ++ .../transformers/llama/tokenizer.py | 562 +++ .../transformers/llama/tokenizer_fast.py | 171 + .../long_sequence_strategies/__init__.py | 18 + .../attention_strategies.py | 51 + .../embedding_strategies.py | 122 + .../long_sequence_strategies.py | 66 + .../transformers/luke/__init__.py | 16 + .../transformers/luke/configuration.py | 158 + .../transformers/luke/modeling.py | 1124 ++++++ .../transformers/luke/tokenizer.py | 752 ++++ .../transformers/mamba/__init__.py | 18 + .../transformers/mamba/configuration.py | 151 + .../transformers/mamba/modeling.py | 795 ++++ .../transformers/mamba/tokenizer.py | 365 ++ .../transformers/mbart/__init__.py | 13 + .../transformers/mbart/configuration.py | 272 ++ .../transformers/mbart/modeling.py | 1150 ++++++ .../transformers/mbart/tokenizer.py | 631 ++++ .../transformers/mc2_parallel_linear.py | 230 ++ .../transformers/megatronbert/__init__.py | 13 + .../megatronbert/configuration.py | 156 + .../transformers/megatronbert/modeling.py | 1006 +++++ .../transformers/megatronbert/tokenizer.py | 102 + .../transformers/minigpt4/__init__.py | 13 + .../transformers/minigpt4/configuration.py | 348 ++ .../transformers/minigpt4/image_processing.py | 284 ++ .../transformers/minigpt4/modeling.py | 1771 +++++++++ .../transformers/minigpt4/processing.py | 245 ++ .../transformers/mistral/__init__.py | 15 + .../transformers/mistral/configuration.py | 69 + .../transformers/mistral/modeling.py | 962 +++++ 
.../transformers/mixtral/__init__.py | 16 + .../transformers/mixtral/configuration.py | 172 + .../transformers/mixtral/modeling.py | 1535 ++++++++ .../transformers/mobilebert/__init__.py | 13 + .../transformers/mobilebert/configuration.py | 185 + .../transformers/mobilebert/modeling.py | 1194 ++++++ .../transformers/mobilebert/tokenizer.py | 329 ++ .../transformers/model_outputs.py | 1520 ++++++++ .../transformers/model_utils.py | 2803 ++++++++++++++ .../transformers/mpnet/__init__.py | 16 + .../transformers/mpnet/configuration.py | 117 + .../transformers/mpnet/modeling.py | 731 ++++ .../transformers/mpnet/tokenizer.py | 201 + .../transformers/mt5/__init__.py | 13 + .../transformers/mt5/configuration.py | 133 + .../transformers/mt5/converter.py | 68 + .../transformers/mt5/modeling.py | 1742 +++++++++ .../transformers/nezha/__init__.py | 16 + .../transformers/nezha/configuration.py | 190 + .../transformers/nezha/modeling.py | 1179 ++++++ .../transformers/nezha/tokenizer.py | 304 ++ .../transformers/nystromformer/__init__.py | 14 + .../nystromformer/configuration.py | 161 + .../transformers/nystromformer/modeling.py | 1331 +++++++ .../transformers/nystromformer/tokenizer.py | 316 ++ .../paddlenlp_3.0.0/transformers/ofa_utils.py | 326 ++ .../transformers/opt/__init__.py | 15 + .../transformers/opt/configuration.py | 172 + .../opt/convert_torch_to_paddle.py | 180 + .../transformers/opt/modeling.py | 1216 ++++++ .../transformers/optimization.py | 304 ++ .../transformers/pegasus/__init__.py | 13 + .../transformers/pegasus/configuration.py | 156 + .../transformers/pegasus/modeling.py | 663 ++++ .../transformers/pegasus/tokenizer.py | 376 ++ .../transformers/ppminilm/__init__.py | 13 + .../transformers/ppminilm/configuration.py | 151 + .../transformers/ppminilm/modeling.py | 442 +++ .../transformers/ppminilm/tokenizer.py | 308 ++ .../transformers/processing_utils.py | 136 + .../transformers/prophetnet/__init__.py | 17 + .../transformers/prophetnet/configuration.py | 124 + .../transformers/prophetnet/modeling.py | 1247 ++++++ .../transformers/prophetnet/tokenizer.py | 316 ++ .../transformers/qwen/__init__.py | 18 + .../transformers/qwen/configuration.py | 84 + .../transformers/qwen/modeling.py | 1192 ++++++ .../transformers/qwen/modeling_3D_auto.py | 962 +++++ .../transformers/qwen/modeling_pp.py | 207 + .../transformers/qwen/tokenizer.py | 308 ++ .../transformers/qwen2/__init__.py | 19 + .../transformers/qwen2/configuration.py | 158 + .../transformers/qwen2/modeling.py | 1555 ++++++++ .../transformers/qwen2/modeling_pp.py | 289 ++ .../transformers/qwen2/tokenizer.py | 340 ++ .../transformers/qwen2_moe/__init__.py | 17 + .../transformers/qwen2_moe/configuration.py | 186 + .../transformers/qwen2_moe/modeling.py | 1556 ++++++++ .../transformers/reformer/__init__.py | 12 + .../transformers/reformer/configuration.py | 310 ++ .../transformers/reformer/modeling.py | 2987 +++++++++++++++ .../transformers/reformer/tokenizer.py | 292 ++ .../transformers/rembert/__init__.py | 13 + .../transformers/rembert/configuration.py | 135 + .../transformers/rembert/modeling.py | 781 ++++ .../transformers/rembert/tokenizer.py | 240 ++ .../transformers/ring_flash_attention.py | 354 ++ .../transformers/roberta/README.md | 1 + .../transformers/roberta/__init__.py | 13 + .../transformers/roberta/configuration.py | 216 ++ .../transformers/roberta/converter.py | 109 + .../transformers/roberta/modeling.py | 1387 +++++++ .../transformers/roberta/tokenizer.py | 628 +++ .../transformers/roformer/__init__.py | 16 + 
.../transformers/roformer/configuration.py | 325 ++ .../transformers/roformer/modeling.py | 1380 +++++++ .../transformers/roformer/tokenizer.py | 381 ++ .../transformers/roformerv2/__init__.py | 16 + .../transformers/roformerv2/configuration.py | 122 + .../transformers/roformerv2/modeling.py | 802 ++++ .../transformers/roformerv2/tokenizer.py | 306 ++ .../transformers/rw/__init__.py | 13 + .../transformers/rw/configuration.py | 84 + .../transformers/rw/modeling.py | 894 +++++ .../transformers/rw/tokenizer.py | 96 + .../transformers/segment_parallel_utils.py | 137 + .../transformers/semantic_search/__init__.py | 13 + .../transformers/semantic_search/modeling.py | 311 ++ .../transformers/sentencepiece_model_pb2.py | 1534 ++++++++ .../transformers/skep/__init__.py | 13 + .../transformers/skep/configuration.py | 143 + .../transformers/skep/modeling.py | 760 ++++ .../transformers/skep/tokenizer.py | 588 +++ .../transformers/speecht5/__init__.py | 13 + .../transformers/speecht5/configuration.py | 419 ++ .../speecht5/feature_extraction.py | 394 ++ .../transformers/speecht5/modeling.py | 3112 +++++++++++++++ .../transformers/speecht5/processing.py | 192 + .../transformers/speecht5/tokenizer.py | 217 ++ .../transformers/squeezebert/__init__.py | 13 + .../transformers/squeezebert/configuration.py | 233 ++ .../transformers/squeezebert/modeling.py | 623 +++ .../transformers/squeezebert/tokenizer.py | 234 ++ .../transformers/tinybert/__init__.py | 13 + .../transformers/tinybert/configuration.py | 227 ++ .../transformers/tinybert/modeling.py | 754 ++++ .../transformers/tinybert/tokenizer.py | 53 + .../transformers/tokenizer_utils.py | 2132 +++++++++++ .../transformers/tokenizer_utils_base.py | 3363 +++++++++++++++++ .../transformers/tokenizer_utils_fast.py | 869 +++++ .../unified_transformer/__init__.py | 13 + .../unified_transformer/configuration.py | 222 ++ .../unified_transformer/convert.py | 120 + .../unified_transformer/modeling.py | 577 +++ .../unified_transformer/tokenizer.py | 711 ++++ .../transformers/unimo/__init__.py | 13 + .../transformers/unimo/configuration.py | 303 ++ .../transformers/unimo/modeling.py | 553 +++ .../transformers/unimo/tokenizer.py | 540 +++ .../paddlenlp_3.0.0/transformers/utils.py | 948 +++++ .../transformers/visualglm/__init__.py | 13 + .../transformers/visualglm/configuration.py | 338 ++ .../visualglm/image_processing.py | 284 ++ .../transformers/visualglm/modeling.py | 1550 ++++++++ .../transformers/visualglm/processing.py | 223 ++ .../transformers/xlm/__init__.py | 15 + .../transformers/xlm/configuration.py | 609 +++ .../transformers/xlm/modeling.py | 890 +++++ .../transformers/xlm/tokenizer.py | 1023 +++++ .../transformers/xlnet/__init__.py | 15 + .../transformers/xlnet/configuration.py | 337 ++ .../transformers/xlnet/converter.py | 64 + .../transformers/xlnet/modeling.py | 1931 ++++++++++ .../transformers/xlnet/tokenizer.py | 366 ++ .../transformers/yuan/__init__.py | 19 + .../transformers/yuan/configuration.py | 67 + .../transformers/yuan/modeling.py | 1296 +++++++ .../transformers/yuan/tokenizer.py | 262 ++ .../paddlenlp_3.0.0/trl/__init__.py | 17 + .../paddlenlp_3.0.0/trl/dpo_trainer.py | 676 ++++ .../paddlenlp_3.0.0/trl/trl_data.py | 235 ++ .../paddlenlp_3.0.0/trl/trl_utils.py | 49 + .../paddlenlp_3.0.0/utils/__init__.py | 40 + .../paddlenlp_3.0.0/utils/batch_sampler.py | 182 + .../paddlenlp_3.0.0/utils/converter.py | 18 + .../paddlenlp_3.0.0/utils/distributed.py | 222 ++ .../paddlenlp_3.0.0/utils/doc_parser.py | 432 +++ .../utils/download/__init__.py | 367 ++ 
.../utils/download/aistudio_hub_download.py | 728 ++++ .../utils/download/bos_download.py | 285 ++ .../paddlenlp_3.0.0/utils/download/common.py | 662 ++++ .../paddlenlp_3.0.0/utils/downloader.py | 471 +++ .../paddlepaddle/paddlenlp_3.0.0/utils/env.py | 113 + .../paddlenlp_3.0.0/utils/ie_utils.py | 142 + .../paddlenlp_3.0.0/utils/image_utils.py | 734 ++++ .../paddlenlp_3.0.0/utils/import_utils.py | 202 + .../paddlenlp_3.0.0/utils/initializer.py | 337 ++ .../paddlenlp_3.0.0/utils/llm_utils.py | 875 +++++ .../paddlepaddle/paddlenlp_3.0.0/utils/log.py | 133 + .../paddlenlp_3.0.0/utils/nested.py | 118 + .../paddlenlp_3.0.0/utils/profiler.py | 130 + .../paddlenlp_3.0.0/utils/safetensors.py | 312 ++ .../paddlenlp_3.0.0/utils/serialization.py | 253 ++ .../paddlenlp_3.0.0/utils/tools.py | 839 ++++ .../paddlenlp_3.0.0/version/__init__.py | 51 + .../paddlenlp_3.0.0/version/git.py | 48 + .../bert/paddlepaddle/predict.py | 156 + .../bert/paddlepaddle/predict_glue.py | 163 + .../bert/paddlepaddle/requirements.txt | 30 + .../bert/paddlepaddle/run_glue.py | 435 +++ .../bert/paddlepaddle/run_pretrain.py | 496 +++ .../bert/paddlepaddle/run_training.sh | 26 + .../bert/paddlepaddle/static/README.md | 153 + .../static/create_pretraining_data.py | 499 +++ .../paddlepaddle/static/data/sample_text.txt | 100 + .../bert/paddlepaddle/static/dataset.py | 151 + .../bert/paddlepaddle/static/predict_glue.py | 158 + .../bert/paddlepaddle/static/run_glue.py | 446 +++ .../static/run_glue_with_sparaity.py | 458 +++ .../bert/paddlepaddle/static/run_pretrain.py | 439 +++ .../bert/paddlepaddle/static_ipu/README.md | 223 ++ .../custom_ops/custom_checkpointoutput.cc | 41 + .../static_ipu/custom_ops/custom_detach.cc | 42 + .../static_ipu/custom_ops/custom_identity.cc | 41 + .../static_ipu/custom_ops/custom_nll_loss.cc | 55 + .../custom_ops/custom_shape_infer.cc | 37 + .../disable_attn_dropout_bwd_pattern.cc | 91 + .../static_ipu/custom_ops/tied_gather.cc | 181 + .../custom_ops/tied_gather_pattern.cc | 504 +++ .../static_ipu/custom_ops/utils.cc | 173 + .../prevent_const_expr_folding_op.cc | 137 + .../paddlepaddle/static_ipu/dataset_ipu.py | 283 ++ .../paddlepaddle/static_ipu/load_tf_ckpt.py | 201 + .../bert/paddlepaddle/static_ipu/modeling.py | 705 ++++ .../paddlepaddle/static_ipu/requirements.txt | 8 + .../paddlepaddle/static_ipu/run_pretrain.py | 410 ++ .../bert/paddlepaddle/static_ipu/run_squad.py | 516 +++ .../static_ipu/scripts/pod16/run_pretrain.sh | 36 + .../scripts/pod16/run_pretrain_phase2.sh | 38 + .../static_ipu/scripts/pod16/run_squad.sh | 41 + .../scripts/pod16/run_squad_infer.sh | 38 + .../static_ipu/scripts/pod4/run_pretrain.sh | 36 + .../scripts/pod4/run_pretrain_phase2.sh | 38 + .../static_ipu/scripts/pod4/run_squad.sh | 41 + .../scripts/pod4/run_squad_infer.sh | 38 + .../bert/paddlepaddle/static_ipu/utils.py | 282 ++ .../bert/tensorflow2.0/README.md | 395 ++ .../bert/tensorflow2.0/__init__.py | 15 + .../bert/tensorflow2.0/albert/configs.py | 50 + .../bert/tensorflow2.0/bert_cloud_tpu.md | 110 + .../bert/tensorflow2.0/bert_models.py | 366 ++ .../bert/tensorflow2.0/common/__init__.py | 15 + .../bert/tensorflow2.0/common/dataset_fn.py | 42 + .../tensorflow2.0/common/distribute_utils.py | 233 ++ .../bert/tensorflow2.0/common/flags.py | 110 + .../tensorflow2.0/common/registry_imports.py | 20 + .../bert/tensorflow2.0/common_flags.py | 129 + .../bert/tensorflow2.0/configs.py | 104 + .../tensorflow2.0/core/config_definitions.py | 252 ++ .../bert/tensorflow2.0/data/__init__.py | 14 + 
.../bert/tensorflow2.0/data/squad_lib.py | 975 +++++ .../bert/tensorflow2.0/data/squad_lib_sp.py | 976 +++++ .../bert/tensorflow2.0/download_glue_data.py | 150 + .../bert/tensorflow2.0/download_script.sh | 6 + .../bert/tensorflow2.0/export_tfhub.py | 139 + .../bert/tensorflow2.0/input_pipeline.py | 302 ++ .../bert/tensorflow2.0/keras_nlp/README.md | 37 + .../bert/tensorflow2.0/keras_nlp/__init__.py | 18 + .../tensorflow2.0/keras_nlp/contributing.md | 21 + .../keras_nlp/encoders/__init__.py | 16 + .../keras_nlp/encoders/bert_encoder.py | 262 ++ .../keras_nlp/encoders/bert_encoder_test.py | 232 ++ .../keras_nlp/layers/__init__.py | 20 + .../keras_nlp/layers/masked_lm.py | 123 + .../keras_nlp/layers/on_device_embedding.py | 106 + .../layers/on_device_embedding_test.py | 213 ++ .../keras_nlp/layers/position_embedding.py | 93 + .../layers/position_embedding_test.py | 132 + .../keras_nlp/layers/self_attention_mask.py | 55 + .../layers/transformer_encoder_block.py | 308 ++ .../layers/transformer_encoder_block_test.py | 324 ++ .../tensorflow2.0/keras_nlp/requirements.txt | 1 + .../bert/tensorflow2.0/keras_nlp/setup.py | 69 + .../bert/tensorflow2.0/model_saving_utils.py | 68 + .../tensorflow2.0/model_training_utils.py | 590 +++ .../bert/tensorflow2.0/modeling/__init__.py | 14 + .../modeling/activations/__init__.py | 21 + .../modeling/activations/gelu.py | 32 + .../modeling/activations/relu.py | 31 + .../modeling/activations/sigmoid.py | 31 + .../modeling/activations/swish.py | 72 + .../modeling/hyperparams/__init__.py | 20 + .../modeling/hyperparams/base_config.py | 270 ++ .../hyperparams/config_definitions.py | 57 + .../modeling/hyperparams/oneof.py | 57 + .../modeling/hyperparams/params_dict.py | 464 +++ .../modeling/multitask/__init__.py | 14 + .../modeling/multitask/base_model.py | 60 + .../modeling/multitask/base_trainer.py | 176 + .../modeling/multitask/configs.py | 79 + .../modeling/multitask/evaluator.py | 172 + .../multitask/interleaving_trainer.py | 92 + .../modeling/multitask/multitask.py | 148 + .../modeling/multitask/task_sampler.py | 121 + .../modeling/multitask/test_utils.py | 125 + .../modeling/multitask/train_lib.py | 251 ++ .../modeling/optimization/__init__.py | 23 + .../modeling/optimization/configs/__init__.py | 14 + .../configs/learning_rate_config.py | 250 ++ .../configs/optimization_config.py | 114 + .../optimization/configs/optimizer_config.py | 249 ++ .../modeling/optimization/ema_optimizer.py | 255 ++ .../modeling/optimization/lars_optimizer.py | 186 + .../modeling/optimization/lr_schedule.py | 385 ++ .../optimization/optimizer_factory.py | 177 + .../modeling/optimization/slide_optimizer.py | 20 + .../tensorflow2.0/modeling/performance.py | 55 + .../modeling/progressive/policies.py | 173 + .../modeling/progressive/train.py | 69 + .../modeling/progressive/train_lib.py | 126 + .../modeling/progressive/trainer.py | 294 ++ .../modeling/progressive/utils.py | 56 + .../bert/tensorflow2.0/modeling/tf_utils.py | 200 + .../tensorflow2.0/nlp_configs/__init__.py | 14 + .../bert/tensorflow2.0/nlp_configs/bert.py | 43 + .../bert/tensorflow2.0/nlp_configs/electra.py | 36 + .../tensorflow2.0/nlp_configs/encoders.py | 448 +++ .../nlp_configs/experiment_configs.py | 19 + .../experiments/glue_mnli_matched.yaml | 49 + .../nlp_configs/experiments/squad_v1.yaml | 50 + .../nlp_configs/finetuning_experiments.py | 139 + .../models/bert_en_uncased_base.yaml | 16 + .../nlp_configs/pretraining_experiments.py | 82 + .../wmt_transformer_experiments.py | 110 + 
.../bert/tensorflow2.0/nlp_modeling/README.md | 52 + .../tensorflow2.0/nlp_modeling/__init__.py | 24 + .../nlp_modeling/layers/README.md | 123 + .../nlp_modeling/layers/__init__.py | 52 + .../nlp_modeling/layers/attention.py | 107 + .../nlp_modeling/layers/bigbird_attention.py | 492 +++ .../nlp_modeling/layers/cls_head.py | 334 ++ .../nlp_modeling/layers/dense_einsum.py | 180 + .../nlp_modeling/layers/gated_feedforward.py | 210 + .../nlp_modeling/layers/gaussian_process.py | 495 +++ .../nlp_modeling/layers/kernel_attention.py | 396 ++ .../nlp_modeling/layers/masked_lm.py | 20 + .../nlp_modeling/layers/masked_softmax.py | 85 + .../layers/mat_mul_with_margin.py | 69 + .../nlp_modeling/layers/mobile_bert_layers.py | 554 +++ .../layers/multi_channel_attention.py | 173 + .../layers/on_device_embedding.py | 21 + .../nlp_modeling/layers/position_embedding.py | 237 ++ .../nlp_modeling/layers/relative_attention.py | 499 +++ .../nlp_modeling/layers/rezero_transformer.py | 233 ++ .../layers/self_attention_mask.py | 39 + .../layers/spectral_normalization.py | 295 ++ .../layers/talking_heads_attention.py | 155 + .../nlp_modeling/layers/text_layers.py | 704 ++++ .../nlp_modeling/layers/tn_expand_condense.py | 180 + .../layers/tn_transformer_expand_condense.py | 253 ++ .../nlp_modeling/layers/transformer.py | 431 +++ .../layers/transformer_scaffold.py | 305 ++ .../nlp_modeling/layers/transformer_xl.py | 559 +++ .../tensorflow2.0/nlp_modeling/layers/util.py | 46 + .../nlp_modeling/losses/README.md | 6 + .../nlp_modeling/losses/__init__.py | 16 + ...eighted_sparse_categorical_crossentropy.py | 71 + .../nlp_modeling/models/README.md | 25 + .../nlp_modeling/models/__init__.py | 29 + .../nlp_modeling/models/bert_classifier.py | 143 + .../nlp_modeling/models/bert_pretrainer.py | 274 ++ .../nlp_modeling/models/bert_span_labeler.py | 125 + .../models/bert_token_classifier.py | 133 + .../nlp_modeling/models/dual_encoder.py | 162 + .../nlp_modeling/models/electra_pretrainer.py | 333 ++ .../models/seq2seq_transformer.py | 591 +++ .../nlp_modeling/models/xlnet.py | 342 ++ .../nlp_modeling/networks/README.md | 39 + .../nlp_modeling/networks/__init__.py | 31 + .../nlp_modeling/networks/albert_encoder.py | 211 ++ .../nlp_modeling/networks/bert_encoder.py | 150 + .../nlp_modeling/networks/classification.py | 104 + .../nlp_modeling/networks/encoder_scaffold.py | 358 ++ .../networks/mobile_bert_encoder.py | 185 + .../networks/packed_sequence_embedding.py | 319 ++ .../nlp_modeling/networks/span_labeling.py | 338 ++ .../nlp_modeling/networks/xlnet_base.py | 709 ++++ .../nlp_modeling/ops/__init__.py | 18 + .../nlp_modeling/ops/beam_search.py | 704 ++++ .../nlp_modeling/ops/decoding_module.py | 282 ++ .../nlp_modeling/ops/sampling_module.py | 447 +++ .../nlp_modeling/ops/segment_extractor.py | 210 + .../bert/tensorflow2.0/optimization.py | 231 ++ .../bert/tensorflow2.0/orbit/LICENSE | 202 + .../bert/tensorflow2.0/orbit/README.md | 11 + .../bert/tensorflow2.0/orbit/__init__.py | 29 + .../tensorflow2.0/orbit/actions/__init__.py | 74 + .../orbit/actions/conditional_action.py | 60 + .../orbit/actions/conditional_action_test.py | 39 + .../orbit/actions/export_saved_model.py | 137 + .../orbit/actions/export_saved_model_test.py | 157 + .../orbit/actions/new_best_metric.py | 222 ++ .../orbit/actions/new_best_metric_test.py | 94 + .../bert/tensorflow2.0/orbit/controller.py | 515 +++ .../tensorflow2.0/orbit/controller_test.py | 775 ++++ .../tensorflow2.0/orbit/examples/__init__.py | 14 + .../orbit/examples/single_task/__init__.py | 14 
+ .../single_task/single_task_evaluator.py | 86 + .../single_task/single_task_evaluator_test.py | 65 + .../single_task/single_task_trainer.py | 140 + .../single_task/single_task_trainer_test.py | 60 + .../bert/tensorflow2.0/orbit/runner.py | 83 + .../tensorflow2.0/orbit/standard_runner.py | 447 +++ .../orbit/standard_runner_test.py | 152 + .../tensorflow2.0/orbit/utils/__init__.py | 29 + .../bert/tensorflow2.0/orbit/utils/common.py | 100 + .../tensorflow2.0/orbit/utils/common_test.py | 34 + .../tensorflow2.0/orbit/utils/epoch_helper.py | 65 + .../tensorflow2.0/orbit/utils/loop_fns.py | 192 + .../orbit/utils/summary_manager.py | 110 + .../orbit/utils/tpu_summaries.py | 145 + .../orbit/utils/tpu_summaries_test.py | 120 + .../bert/tensorflow2.0/process_data.sh | 16 + .../bert/tensorflow2.0/run_classifier.py | 561 +++ .../bert/tensorflow2.0/run_pretraining.py | 218 ++ .../bert/tensorflow2.0/run_squad.py | 148 + .../bert/tensorflow2.0/run_squad_helper.py | 472 +++ .../bert/tensorflow2.0/run_train_mirrored.sh | 74 + .../run_train_worker_mirrored.sh | 85 + .../bert/tensorflow2.0/serving.py | 133 + .../bert/tensorflow2.0/squad_evaluate_v1_1.py | 106 + .../bert/tensorflow2.0/squad_evaluate_v2_0.py | 249 ++ .../bert/tensorflow2.0/staging/__init__.py | 14 + .../staging/training/__init__.py | 14 + .../staging/training/grad_utils.py | 151 + .../bert/tensorflow2.0/tasks/__init__.py | 14 + .../bert/tensorflow2.0/tasks/electra_task.py | 242 ++ .../bert/tensorflow2.0/tasks/masked_lm.py | 200 + .../tensorflow2.0/tasks/question_answering.py | 498 +++ .../tasks/sentence_prediction.py | 299 ++ .../bert/tensorflow2.0/tasks/tagging.py | 265 ++ .../bert/tensorflow2.0/tasks/translation.py | 367 ++ .../bert/tensorflow2.0/tasks/utils.py | 76 + .../tf1_checkpoint_converter_lib.py | 201 + .../tf2_encoder_checkpoint_converter.py | 160 + .../bert/tensorflow2.0/tokenization.py | 541 +++ .../bert/tensorflow2.0/train_mirrored_nv.json | 8 + .../train_worker_mirrored_nv.json | 8 + .../bert/tensorflow2.0/utils/__init__.py | 14 + .../utils/docs/build_api_docs_lib.py | 54 + .../utils/docs/build_nlp_api_docs.py | 95 + .../utils/docs/build_vision_api_docs.py | 93 + .../bert/tensorflow2.0/utils/flags/README.md | 102 + .../tensorflow2.0/utils/flags/__init__.py | 14 + .../bert/tensorflow2.0/utils/flags/_base.py | 177 + .../tensorflow2.0/utils/flags/_benchmark.py | 117 + .../tensorflow2.0/utils/flags/_conventions.py | 50 + .../bert/tensorflow2.0/utils/flags/_device.py | 90 + .../utils/flags/_distribution.py | 52 + .../bert/tensorflow2.0/utils/flags/_misc.py | 48 + .../tensorflow2.0/utils/flags/_performance.py | 294 ++ .../bert/tensorflow2.0/utils/flags/core.py | 130 + .../tensorflow2.0/utils/flags/guidelines.md | 65 + .../tensorflow2.0/utils/hyperparams_flags.py | 123 + .../bert/tensorflow2.0/utils/misc/__init__.py | 14 + .../utils/misc/distribution_utils.py | 17 + .../tensorflow2.0/utils/misc/keras_utils.py | 211 ++ .../tensorflow2.0/utils/misc/model_helpers.py | 94 + .../tensorflow2.0/utils/testing/__init__.py | 14 + .../utils/testing/integration.py | 70 + .../tensorflow2.0/utils/testing/mock_task.py | 101 + .../tensorflow2.0/utils/testing/pylint.rcfile | 168 + .../utils/testing/scripts/builds_common.sh | 64 + .../utils/testing/scripts/ci_sanity.sh | 132 + .../utils/testing/scripts/presubmit.sh | 73 + 1160 files changed, 343829 insertions(+) create mode 100644 cv/classification/resnet50/tensorflow2.0/README.md create mode 100644 cv/classification/resnet50/tensorflow2.0/augment.py create mode 100644 
cv/classification/resnet50/tensorflow2.0/augment_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/callbacks.py create mode 100644 cv/classification/resnet50/tensorflow2.0/classifier_trainer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/classifier_trainer_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/classifier_trainer_util_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/common/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/common/dataset_fn.py create mode 100644 cv/classification/resnet50/tensorflow2.0/common/distribute_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/common/distribute_utils_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/common/flags.py create mode 100644 cv/classification/resnet50/tensorflow2.0/common/registry_imports.py create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/base_configs.py create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/configs.py create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-gpu.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-tpu.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_mirrored.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_multi_worker_mirrored.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/tpu.yaml create mode 100644 cv/classification/resnet50/tensorflow2.0/core/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/actions.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/actions_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/base_task.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/base_trainer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/base_trainer_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/config_definitions.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/exp_factory.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/export_base.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/export_base_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/input_reader.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/registry.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/registry_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/task_factory.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/train_lib.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/train_lib_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/train_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/core/train_utils_test.py create mode 100644 
cv/classification/resnet50/tensorflow2.0/dataset_factory.py create mode 100644 cv/classification/resnet50/tensorflow2.0/download_script.sh create mode 100644 cv/classification/resnet50/tensorflow2.0/efficientnet/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/efficientnet/common_modules.py create mode 100644 cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_config.py create mode 100644 cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_model.py create mode 100644 cv/classification/resnet50/tensorflow2.0/efficientnet/tfhub_export.py create mode 100644 cv/classification/resnet50/tensorflow2.0/learning_rate.py create mode 100644 cv/classification/resnet50/tensorflow2.0/learning_rate_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/mnist_main.py create mode 100644 cv/classification/resnet50/tensorflow2.0/mnist_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/relu.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/relu_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/swish.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/activations/swish_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/config_definitions.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/oneof.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/oneof_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/params_dict.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/params_dict_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_model.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/configs.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/multitask.py 
create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/test_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimizer_config.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/ema_optimizer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/lars_optimizer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/optimization/slide_optimizer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/performance.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/policies.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/train.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/progressive/utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/modeling/tf_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/optimizer_factory.py create mode 100644 cv/classification/resnet50/tensorflow2.0/optimizer_factory_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/preprocessing.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/README.md create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/common.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/imagenet_preprocessing.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/resnet_config.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/resnet_ctl_imagenet_main.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/resnet_model.py create mode 100644 
cv/classification/resnet50/tensorflow2.0/resnet/resnet_runnable.py create mode 100644 cv/classification/resnet50/tensorflow2.0/resnet/tfhub_export.py create mode 100644 cv/classification/resnet50/tensorflow2.0/run_train_resnet50_mirrored_imagenette.sh create mode 100644 cv/classification/resnet50/tensorflow2.0/run_train_resnet50_worker_mirrored_imagenette.sh create mode 100644 cv/classification/resnet50/tensorflow2.0/staging/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/staging/training/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/staging/training/grad_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/test_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/docs/build_api_docs_lib.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/docs/build_nlp_api_docs.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/docs/build_vision_api_docs.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/README.md create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_base.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_benchmark.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_conventions.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_device.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_distribution.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_misc.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/_performance.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/core.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/flags_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/flags/guidelines.md create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/hyperparams_flags.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/misc/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/misc/distribution_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/misc/keras_utils.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers_test.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/__init__.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/integration.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/mock_task.py create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/pylint.rcfile create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/builds_common.sh create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/ci_sanity.sh create mode 100644 cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/presubmit.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/create_pretraining_data.py create mode 100644 nlp/text_classification/bert/paddlepaddle/data/sample_text.txt create mode 100644 
nlp/text_classification/bert/paddlepaddle/export_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/bos_community.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/download.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/install.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/main.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/server.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/tabulate.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/blendable_dataset.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/causal_dataset.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/collate.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/data_collator.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/dist_dataloader.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/indexed_dataset.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/iterator.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/sampler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/vocab.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/base_augment.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/char.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/sentence.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/word.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/advertisegen.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bellegroup.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bq_corpus.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bstc.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/c3.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2018_small.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2019_scm.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cblue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp_v2.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/clue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cmrc2018.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cnn_dailymail.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/conll2002.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cote.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/couplet.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dataset.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd_cn.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/duconv.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_checklist.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_qg.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_robust.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_yesno.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/fewclue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/chnsenticorp.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/clue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cmrc2018.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cnn_dailymail.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cote.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/docvqa_zh.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/duconv.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/dureader_robust.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/funsd.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/imdb.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/language_pair.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/msra_ner.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/mt_eng_vietnamese.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/ptb_text_only.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/rvl_cdip_sampled.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/seabsa16.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad_v2.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xfund_zh.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xnli.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hyp.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/imdb.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/iwslt15.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc_v2.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcsts_new.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/msra_ner.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_hit.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_thu.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc14_sc.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc_dbqa.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/paws-x.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/peoples_daily_ner.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/poetry.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/ptb.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/seabsa16.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/sighan-cn.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/squad.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/thucnews.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/triviaqa.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wmt14ende.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wos.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli_cn.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/yahoo_answer_100k.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/zero_padding_dataset.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/constant.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/token_embedding.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README_en.md create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/auto_trainer_base.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/requirements.txt create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/text_classification.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/ernie_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/faster_tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/model_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/fused_transformer_layers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/generation_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map.json create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map_shift_smooth.json create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/modeling.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map.json create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map_shift_smooth.json create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/configuration_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/logits_process.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/stopping_criteria.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/streamers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/crf.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/globalpointer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/linear.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/sequence.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/tcn.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/rdrop.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/bleu.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/chunk.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/distinct.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/dureader.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/mrr.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/perplexity.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/rouge.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/sighan.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/span.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/squad.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/__init__.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/parallel.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/random.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/topo.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/einsum.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/adamwdl.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/ema.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/lr.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_config.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_layers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quant_layers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quantization_layers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quick_layers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_config.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_config.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_layers.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_args.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_trainer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/template.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/verbalizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/qlora.py create mode 
100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_config.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_linear.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/encoder.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/base_router.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/base_handler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/cls_post_handler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/custom_model_handler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/qa_model_handler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/taskflow_handler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/token_model_handler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/router.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/model_manager.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/predictor.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/server.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/taskflow_manager.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/code_generation.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dependency_parsing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dialogue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/document_intelligence.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/fill_mask.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/information_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/knowledge_mining.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/lexical_analysis.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/dependency_parsing_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/lexical_analysis_model.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/sentiment_analysis_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/text_correction_model.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/multimodal_feature_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/named_entity_recognition.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/poetry_generation.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/pos_tagging.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_answering.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_generation.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/sentiment_analysis.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/task.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/taskflow.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text2text_generation.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_classification.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_correction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_feature_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_generation.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_similarity.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_summarization.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/word_segmentation.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/zero_shot_text_classification.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/argparser.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/auto_trainer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/compression_args.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/integrations.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/npu_plugin.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/shared_memory_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/timer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/unified_checkpoint.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_callback.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_compress.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_seq2seq.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args_seq2seq.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/async_save.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/doc.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/helper.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/common.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/pp_reshard.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v1.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v2.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/sharding_io.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/activations.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/aistudio_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/attention_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/audio_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/processing.py create mode 
100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.pyi create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/image_processing.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling_text.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/processor.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/LICENSE create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/LICENSE create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/chatglm-legacy-checkpoints-convert.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/converter.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/feature_extraction.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/feature_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/feature_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/configuration_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/context_parallel_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/tokenizer.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/conversion_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convert_slow_tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distill_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/converter.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/modeling.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/static_to_dygraph_params/match_static_to_dygraph.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/params_map.json create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/matching_param_name.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/tokenizer.py create mode 
100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/visual_backbone.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/feature_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/export.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_sequence_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling_pp.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_auto.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_pp.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_processing_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_transforms.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/configuration.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.yaml create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/linear_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/LICENSE create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/Llama2.LICENSE create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/fusion_ops.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto_static.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_pp.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer_fast.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/attention_strategies.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/embedding_strategies.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/long_sequence_strategies.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/modeling.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mc2_parallel_linear.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_outputs.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/converter.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/__init__.py 
create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ofa_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/convert_torch_to_paddle.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/optimization.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/processing_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_3D_auto.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_pp.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling_pp.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ring_flash_attention.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/converter.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/modeling.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/segment_parallel_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/sentencepiece_model_pb2.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/feature_extraction.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_base.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_fast.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/__init__.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/convert.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/image_processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/processing.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/converter.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/configuration.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/tokenizer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/dpo_trainer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_data.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/__init__.py create mode 100644 
nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/batch_sampler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/converter.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/distributed.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/doc_parser.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/aistudio_hub_download.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/bos_download.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/common.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/downloader.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/env.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/ie_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/image_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/import_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/initializer.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/llm_utils.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/log.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/nested.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/profiler.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/safetensors.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/serialization.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/tools.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/__init__.py create mode 100644 nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/git.py create mode 100644 nlp/text_classification/bert/paddlepaddle/predict.py create mode 100644 nlp/text_classification/bert/paddlepaddle/predict_glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/requirements.txt create mode 100644 nlp/text_classification/bert/paddlepaddle/run_glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/run_pretrain.py create mode 100644 nlp/text_classification/bert/paddlepaddle/run_training.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static/README.md create mode 100644 nlp/text_classification/bert/paddlepaddle/static/create_pretraining_data.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static/data/sample_text.txt create mode 100644 nlp/text_classification/bert/paddlepaddle/static/dataset.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static/predict_glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static/run_glue.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static/run_glue_with_sparaity.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static/run_pretrain.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/README.md create mode 100644 
nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_checkpointoutput.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_detach.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_identity.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_nll_loss.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_shape_infer.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/disable_attn_dropout_bwd_pattern.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather_pattern.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/utils.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/workarounds/prevent_const_expr_folding_op.cc create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/dataset_ipu.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/load_tf_ckpt.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/modeling.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/requirements.txt create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/run_pretrain.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/run_squad.py create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain_phase2.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad_infer.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain_phase2.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad_infer.sh create mode 100644 nlp/text_classification/bert/paddlepaddle/static_ipu/utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/albert/configs.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/bert_cloud_tpu.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/bert_models.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/common/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/common/dataset_fn.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/common/distribute_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/common/flags.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/common/registry_imports.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/common_flags.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/configs.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/core/config_definitions.py create mode 100644 
nlp/text_classification/bert/tensorflow2.0/data/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/data/squad_lib.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/data/squad_lib_sp.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/download_glue_data.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/download_script.sh create mode 100644 nlp/text_classification/bert/tensorflow2.0/export_tfhub.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/input_pipeline.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/contributing.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/masked_lm.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/self_attention_mask.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/requirements.txt create mode 100644 nlp/text_classification/bert/tensorflow2.0/keras_nlp/setup.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/model_saving_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/model_training_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/activations/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/activations/gelu.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/activations/relu.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/activations/sigmoid.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/activations/swish.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/base_config.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/config_definitions.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/oneof.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/params_dict.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/__init__.py create mode 100644 
nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_model.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_trainer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/configs.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/evaluator.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/interleaving_trainer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/multitask.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/task_sampler.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/test_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/multitask/train_lib.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimization_config.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimizer_config.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/ema_optimizer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lars_optimizer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lr_schedule.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/optimizer_factory.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/optimization/slide_optimizer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/performance.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/progressive/policies.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train_lib.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/progressive/trainer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/progressive/utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/modeling/tf_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/bert.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/electra.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/encoders.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiment_configs.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/glue_mnli_matched.yaml create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/squad_v1.yaml create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/finetuning_experiments.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/models/bert_en_uncased_base.yaml create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_configs/pretraining_experiments.py create mode 100644 
nlp/text_classification/bert/tensorflow2.0/nlp_configs/wmt_transformer_experiments.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/attention.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/bigbird_attention.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/cls_head.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/dense_einsum.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gated_feedforward.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gaussian_process.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/kernel_attention.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_lm.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_softmax.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mat_mul_with_margin.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mobile_bert_layers.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/multi_channel_attention.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/on_device_embedding.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/position_embedding.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/relative_attention.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/rezero_transformer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/self_attention_mask.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/spectral_normalization.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/talking_heads_attention.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/text_layers.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_expand_condense.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_transformer_expand_condense.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_scaffold.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_xl.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/util.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/weighted_sparse_categorical_crossentropy.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/README.md create mode 100644 
nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_classifier.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_pretrainer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_span_labeler.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_token_classifier.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/dual_encoder.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/electra_pretrainer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/seq2seq_transformer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/xlnet.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/albert_encoder.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/bert_encoder.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/classification.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/encoder_scaffold.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/mobile_bert_encoder.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/packed_sequence_embedding.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/span_labeling.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/xlnet_base.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/beam_search.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/decoding_module.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/sampling_module.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/segment_extractor.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/optimization.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/LICENSE create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/controller.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/controller_test.py 
create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/examples/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/runner.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/common.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/common_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/epoch_helper.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/loop_fns.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/summary_manager.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries_test.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/process_data.sh create mode 100644 nlp/text_classification/bert/tensorflow2.0/run_classifier.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/run_pretraining.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/run_squad.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/run_squad_helper.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/run_train_mirrored.sh create mode 100644 nlp/text_classification/bert/tensorflow2.0/run_train_worker_mirrored.sh create mode 100644 nlp/text_classification/bert/tensorflow2.0/serving.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v1_1.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v2_0.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/staging/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/staging/training/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/staging/training/grad_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/electra_task.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/masked_lm.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/question_answering.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/sentence_prediction.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/tagging.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/translation.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tasks/utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tf1_checkpoint_converter_lib.py create mode 100644 
nlp/text_classification/bert/tensorflow2.0/tf2_encoder_checkpoint_converter.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/tokenization.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/train_mirrored_nv.json create mode 100644 nlp/text_classification/bert/tensorflow2.0/train_worker_mirrored_nv.json create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/docs/build_api_docs_lib.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/docs/build_nlp_api_docs.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/docs/build_vision_api_docs.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/README.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_base.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_benchmark.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_conventions.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_device.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_distribution.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_misc.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/_performance.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/core.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/flags/guidelines.md create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/hyperparams_flags.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/misc/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/misc/distribution_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/misc/keras_utils.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/misc/model_helpers.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/__init__.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/integration.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/mock_task.py create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/pylint.rcfile create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/builds_common.sh create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/ci_sanity.sh create mode 100644 nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/presubmit.sh diff --git a/cv/classification/resnet50/tensorflow2.0/README.md b/cv/classification/resnet50/tensorflow2.0/README.md new file mode 100644 index 000000000..1cb4428dd --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/README.md @@ -0,0 +1,182 @@ +# Image Classification + +This folder contains TF 2.0 model examples for image classification: + +* [MNIST](#mnist) +* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras +compile/fit methods for image classification models, including: + * ResNet + * EfficientNet[^1] + +[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet). 
+For more information about other types of models, please refer to this
+[README file](../../README.md).
+
+## Before you begin
+Please make sure that you have the latest version of TensorFlow
+installed and
+[add the models folder to your Python path](/official/#running-the-models).
+
+### ImageNet preparation
+
+#### Using TFDS
+`classifier_trainer.py` supports ImageNet with
+[TensorFlow Datasets (TFDS)](https://www.tensorflow.org/datasets/overview).
+
+Please see the following [example snippet](https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/scripts/download_and_prepare.py)
+for more information on how to use TFDS to download and prepare datasets, and
+specifically the [TFDS ImageNet readme](https://github.com/tensorflow/datasets/blob/master/docs/catalog/imagenet2012.md)
+for manual download instructions.
+
+#### Legacy TFRecords
+Download the ImageNet dataset and convert it to TFRecord format.
+The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
+and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
+provide a few options.
+
+Note that the legacy ResNet runners, e.g. [resnet/resnet_ctl_imagenet_main.py](resnet/resnet_ctl_imagenet_main.py),
+require TFRecords, whereas `classifier_trainer.py` can use both by setting the
+builder to 'records' or 'tfds' in the configurations.
+
+### Running on Cloud TPUs
+
+Note: These models will **not** work with TPUs on Colab.
+
+You can train image classification models on Cloud TPUs using
+[tf.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf.distribute.TPUStrategy?version=nightly).
+If you are not familiar with Cloud TPUs, it is strongly recommended that you go
+through the
+[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
+create a TPU and GCE VM.
+
+### Running on multiple GPU hosts
+
+You can also train these models on multiple hosts, each with GPUs, using
+[tf.distribute.experimental.MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy).
+
+The easiest way to run multi-host benchmarks is to set the
+[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
+appropriately at each host. For example, to run using `MultiWorkerMirroredStrategy` on
+2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
+host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
+"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
+available GPUs at each host.
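+
+For example, on the first of the two hosts the `TF_CONFIG` environment
+variable might be set as sketched below; the worker addresses and port
+numbers are placeholders and should be replaced with your own (use
+`"index": 1` on the second host):
+
+```bash
+# Sketch only: replace the worker addresses/ports with your real hosts.
+export TF_CONFIG='{
+  "cluster": {
+    "worker": ["10.0.0.1:2222", "10.0.0.2:2222"]
+  },
+  "task": {"type": "worker", "index": 0}
+}'
+```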
+
+## MNIST
+
+To download the data and run the MNIST sample model locally for the first time,
+run one of the following commands:
+
+```bash
+python3 mnist_main.py \
+ --model_dir=$MODEL_DIR \
+ --data_dir=$DATA_DIR \
+ --train_epochs=10 \
+ --distribution_strategy=one_device \
+ --num_gpus=$NUM_GPUS \
+ --download
+```
+
+To train the model on a Cloud TPU, run the following command:
+
+```bash
+python3 mnist_main.py \
+ --tpu=$TPU_NAME \
+ --model_dir=$MODEL_DIR \
+ --data_dir=$DATA_DIR \
+ --train_epochs=10 \
+ --distribution_strategy=tpu \
+ --download
+```
+
+Note: the `--download` flag is only required the first time you run the model.
+
+
+## Classifier Trainer
+The classifier trainer is a unified framework for running image classification
+models using Keras's compile/fit methods. Experiments should be provided in the
+form of YAML files; some examples are included within the configs/examples
+folder. Please see [configs/examples](./configs/examples) for more example
+configurations.
+
+The provided configuration files use a per-replica batch size that is scaled
+by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
+the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
+would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
+be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
+
+### ResNet50
+
+#### On GPU:
+```bash
+python3 classifier_trainer.py \
+ --mode=train_and_eval \
+ --model_type=resnet \
+ --dataset=imagenet \
+ --model_dir=$MODEL_DIR \
+ --data_dir=$DATA_DIR \
+ --config_file=configs/examples/resnet/imagenet/gpu.yaml \
+ --params_override='runtime.num_gpus=$NUM_GPUS'
+```
+
+To train on multiple hosts, each with GPUs attached, using
+[MultiWorkerMirroredStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy),
+please update the `runtime` section in gpu.yaml
+(or override it using `--params_override`) with:
+
+```YAML
+# gpu.yaml
+runtime:
+ distribution_strategy: 'multi_worker_mirrored'
+ worker_hosts: '$HOST1:port,$HOST2:port'
+ num_gpus: $NUM_GPUS
+ task_index: 0
+```
+Set `task_index: 0` on the first host, `task_index: 1` on the second,
+and so on. `$HOST1` and `$HOST2` are the IP addresses of the hosts, and `port`
+can be any free port on the hosts. Only the first host will write
+TensorBoard Summaries and save checkpoints.
+
+#### On TPU:
+```bash
+python3 classifier_trainer.py \
+ --mode=train_and_eval \
+ --model_type=resnet \
+ --dataset=imagenet \
+ --tpu=$TPU_NAME \
+ --model_dir=$MODEL_DIR \
+ --data_dir=$DATA_DIR \
+ --config_file=configs/examples/resnet/imagenet/tpu.yaml
+```
+
+### EfficientNet
+**Note: EfficientNet development is a work in progress.**
+#### On GPU:
+```bash
+python3 classifier_trainer.py \
+ --mode=train_and_eval \
+ --model_type=efficientnet \
+ --dataset=imagenet \
+ --model_dir=$MODEL_DIR \
+ --data_dir=$DATA_DIR \
+ --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
+ --params_override='runtime.num_gpus=$NUM_GPUS'
+```
+
+
+#### On TPU:
+```bash
+python3 classifier_trainer.py \
+ --mode=train_and_eval \
+ --model_type=efficientnet \
+ --dataset=imagenet \
+ --tpu=$TPU_NAME \
+ --model_dir=$MODEL_DIR \
+ --data_dir=$DATA_DIR \
+ --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
+```
+
+Note that the number of GPU devices can be overridden on the command line using
+`--params_override`. The TPU does not need this override, as the device is fixed
+by providing the TPU address or name with the `--tpu` flag.
+
diff --git a/cv/classification/resnet50/tensorflow2.0/augment.py b/cv/classification/resnet50/tensorflow2.0/augment.py
new file mode 100644
index 000000000..ad1193557
--- /dev/null
+++ b/cv/classification/resnet50/tensorflow2.0/augment.py
@@ -0,0 +1,989 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""AutoAugment and RandAugment policies for enhanced image preprocessing. + +AutoAugment Reference: https://arxiv.org/abs/1805.09501 +RandAugment Reference: https://arxiv.org/abs/1909.13719 +""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import math + +import tensorflow as tf +from typing import Any, Dict, List, Optional, Text, Tuple + +try: + from tensorflow.python.keras.layers.preprocessing import image_preprocessing as image_ops +except: + from keras.layers.preprocessing import image_preprocessing as image_ops + +# This signifies the max integer that the controller RNN could predict for the +# augmentation scheme. +_MAX_LEVEL = 10. + + +def to_4d(image: tf.Tensor) -> tf.Tensor: + """Converts an input Tensor to 4 dimensions. + + 4D image => [N, H, W, C] or [N, C, H, W] + 3D image => [1, H, W, C] or [1, C, H, W] + 2D image => [1, H, W, 1] + + Args: + image: The 2/3/4D input tensor. + + Returns: + A 4D image tensor. + + Raises: + `TypeError` if `image` is not a 2/3/4D tensor. + + """ + shape = tf.shape(image) + original_rank = tf.rank(image) + left_pad = tf.cast(tf.less_equal(original_rank, 3), dtype=tf.int32) + right_pad = tf.cast(tf.equal(original_rank, 2), dtype=tf.int32) + new_shape = tf.concat( + [ + tf.ones(shape=left_pad, dtype=tf.int32), + shape, + tf.ones(shape=right_pad, dtype=tf.int32), + ], + axis=0, + ) + return tf.reshape(image, new_shape) + + +def from_4d(image: tf.Tensor, ndims: tf.Tensor) -> tf.Tensor: + """Converts a 4D image back to `ndims` rank.""" + shape = tf.shape(image) + begin = tf.cast(tf.less_equal(ndims, 3), dtype=tf.int32) + end = 4 - tf.cast(tf.equal(ndims, 2), dtype=tf.int32) + new_shape = shape[begin:end] + return tf.reshape(image, new_shape) + + +def _convert_translation_to_transform(translations: tf.Tensor) -> tf.Tensor: + """Converts translations to a projective transform. + + The translation matrix looks like this: + [[1 0 -dx] + [0 1 -dy] + [0 0 1]] + + Args: + translations: The 2-element list representing [dx, dy], or a matrix of + 2-element lists representing [dx dy] to translate for each image. The + shape must be static. + + Returns: + The transformation matrix of shape (num_images, 8). + + Raises: + `TypeError` if + - the shape of `translations` is not known or + - the shape of `translations` is not rank 1 or 2. + + """ + translations = tf.convert_to_tensor(translations, dtype=tf.float32) + if translations.get_shape().ndims is None: + raise TypeError('translations rank must be statically known') + elif len(translations.get_shape()) == 1: + translations = translations[None] + elif len(translations.get_shape()) != 2: + raise TypeError('translations should have rank 1 or 2.') + num_translations = tf.shape(translations)[0] + + return tf.concat( + values=[ + tf.ones((num_translations, 1), tf.dtypes.float32), + tf.zeros((num_translations, 1), tf.dtypes.float32), + -translations[:, 0, None], + tf.zeros((num_translations, 1), tf.dtypes.float32), + tf.ones((num_translations, 1), tf.dtypes.float32), + -translations[:, 1, None], + tf.zeros((num_translations, 2), tf.dtypes.float32), + ], + axis=1, + ) + + +def _convert_angles_to_transform(angles: tf.Tensor, image_width: tf.Tensor, + image_height: tf.Tensor) -> tf.Tensor: + """Converts an angle or angles to a projective transform. 
+ + Args: + angles: A scalar to rotate all images, or a vector to rotate a batch of + images. This must be a scalar. + image_width: The width of the image(s) to be transformed. + image_height: The height of the image(s) to be transformed. + + Returns: + A tensor of shape (num_images, 8). + + Raises: + `TypeError` if `angles` is not rank 0 or 1. + + """ + angles = tf.convert_to_tensor(angles, dtype=tf.float32) + if len(angles.get_shape()) == 0: # pylint:disable=g-explicit-length-test + angles = angles[None] + elif len(angles.get_shape()) != 1: + raise TypeError('Angles should have a rank 0 or 1.') + x_offset = ((image_width - 1) - + (tf.math.cos(angles) * (image_width - 1) - tf.math.sin(angles) * + (image_height - 1))) / 2.0 + y_offset = ((image_height - 1) - + (tf.math.sin(angles) * (image_width - 1) + tf.math.cos(angles) * + (image_height - 1))) / 2.0 + num_angles = tf.shape(angles)[0] + return tf.concat( + values=[ + tf.math.cos(angles)[:, None], + -tf.math.sin(angles)[:, None], + x_offset[:, None], + tf.math.sin(angles)[:, None], + tf.math.cos(angles)[:, None], + y_offset[:, None], + tf.zeros((num_angles, 2), tf.dtypes.float32), + ], + axis=1, + ) + + +def transform(image: tf.Tensor, transforms) -> tf.Tensor: + """Prepares input data for `image_ops.transform`.""" + original_ndims = tf.rank(image) + transforms = tf.convert_to_tensor(transforms, dtype=tf.float32) + if transforms.shape.rank == 1: + transforms = transforms[None] + image = to_4d(image) + image = image_ops.transform( + images=image, transforms=transforms, interpolation='nearest') + return from_4d(image, original_ndims) + + +def translate(image: tf.Tensor, translations) -> tf.Tensor: + """Translates image(s) by provided vectors. + + Args: + image: An image Tensor of type uint8. + translations: A vector or matrix representing [dx dy]. + + Returns: + The translated version of the image. + + """ + transforms = _convert_translation_to_transform(translations) + return transform(image, transforms=transforms) + + +def rotate(image: tf.Tensor, degrees: float) -> tf.Tensor: + """Rotates the image by degrees either clockwise or counterclockwise. + + Args: + image: An image Tensor of type uint8. + degrees: Float, a scalar angle in degrees to rotate all images by. If + degrees is positive the image will be rotated clockwise otherwise it will + be rotated counterclockwise. + + Returns: + The rotated version of image. + + """ + # Convert from degrees to radians. + degrees_to_radians = math.pi / 180.0 + radians = tf.cast(degrees * degrees_to_radians, tf.float32) + + original_ndims = tf.rank(image) + image = to_4d(image) + + image_height = tf.cast(tf.shape(image)[1], tf.float32) + image_width = tf.cast(tf.shape(image)[2], tf.float32) + transforms = _convert_angles_to_transform( + angles=radians, image_width=image_width, image_height=image_height) + # In practice, we should randomize the rotation degrees by flipping + # it negatively half the time, but that's done on 'degrees' outside + # of the function. + image = transform(image, transforms=transforms) + return from_4d(image, original_ndims) + + +def blend(image1: tf.Tensor, image2: tf.Tensor, factor: float) -> tf.Tensor: + """Blend image1 and image2 using 'factor'. + + Factor can be above 0.0. A value of 0.0 means only image1 is used. + A value of 1.0 means only image2 is used. A value between 0.0 and + 1.0 means we linearly interpolate the pixel values between the two + images. 
A value greater than 1.0 "extrapolates" the difference + between the two pixel values, and we clip the results to values + between 0 and 255. + + Args: + image1: An image Tensor of type uint8. + image2: An image Tensor of type uint8. + factor: A floating point value above 0.0. + + Returns: + A blended image Tensor of type uint8. + """ + if factor == 0.0: + return tf.convert_to_tensor(image1) + if factor == 1.0: + return tf.convert_to_tensor(image2) + + image1 = tf.cast(image1, tf.float32) + image2 = tf.cast(image2, tf.float32) + + difference = image2 - image1 + scaled = factor * difference + + # Do addition in float. + temp = tf.cast(image1, tf.float32) + scaled + + # Interpolate + if factor > 0.0 and factor < 1.0: + # Interpolation means we always stay within 0 and 255. + return tf.cast(temp, tf.uint8) + + # Extrapolate: + # + # We need to clip and then cast. + return tf.cast(tf.clip_by_value(temp, 0.0, 255.0), tf.uint8) + + +def cutout(image: tf.Tensor, pad_size: int, replace: int = 0) -> tf.Tensor: + """Apply cutout (https://arxiv.org/abs/1708.04552) to image. + + This operation applies a (2*pad_size x 2*pad_size) mask of zeros to + a random location within `img`. The pixel values filled in will be of the + value `replace`. The located where the mask will be applied is randomly + chosen uniformly over the whole image. + + Args: + image: An image Tensor of type uint8. + pad_size: Specifies how big the zero mask that will be generated is that is + applied to the image. The mask will be of size (2*pad_size x 2*pad_size). + replace: What pixel value to fill in the image in the area that has the + cutout mask applied to it. + + Returns: + An image Tensor that is of type uint8. + """ + image_height = tf.shape(image)[0] + image_width = tf.shape(image)[1] + + # Sample the center location in the image where the zero mask will be applied. + cutout_center_height = tf.random.uniform( + shape=[], minval=0, maxval=image_height, dtype=tf.int32) + + cutout_center_width = tf.random.uniform( + shape=[], minval=0, maxval=image_width, dtype=tf.int32) + + lower_pad = tf.maximum(0, cutout_center_height - pad_size) + upper_pad = tf.maximum(0, image_height - cutout_center_height - pad_size) + left_pad = tf.maximum(0, cutout_center_width - pad_size) + right_pad = tf.maximum(0, image_width - cutout_center_width - pad_size) + + cutout_shape = [ + image_height - (lower_pad + upper_pad), + image_width - (left_pad + right_pad) + ] + padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] + mask = tf.pad( + tf.zeros(cutout_shape, dtype=image.dtype), + padding_dims, + constant_values=1) + mask = tf.expand_dims(mask, -1) + mask = tf.tile(mask, [1, 1, 3]) + image = tf.where( + tf.equal(mask, 0), + tf.ones_like(image, dtype=image.dtype) * replace, image) + return image + + +def solarize(image: tf.Tensor, threshold: int = 128) -> tf.Tensor: + # For each pixel in the image, select the pixel + # if the value is less than the threshold. + # Otherwise, subtract 255 from the pixel. + return tf.where(image < threshold, image, 255 - image) + + +def solarize_add(image: tf.Tensor, + addition: int = 0, + threshold: int = 128) -> tf.Tensor: + # For each pixel in the image less than threshold + # we add 'addition' amount to it and then clip the + # pixel value to be between 0 and 255. The value + # of 'addition' is between -128 and 128. 
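+  # Casting to int64 below avoids uint8 wrap-around when `addition` is
+  # negative or pushes values past 255; the result is clipped back to [0, 255].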
+ added_image = tf.cast(image, tf.int64) + addition + added_image = tf.cast(tf.clip_by_value(added_image, 0, 255), tf.uint8) + return tf.where(image < threshold, added_image, image) + + +def color(image: tf.Tensor, factor: float) -> tf.Tensor: + """Equivalent of PIL Color.""" + degenerate = tf.image.grayscale_to_rgb(tf.image.rgb_to_grayscale(image)) + return blend(degenerate, image, factor) + + +def contrast(image: tf.Tensor, factor: float) -> tf.Tensor: + """Equivalent of PIL Contrast.""" + degenerate = tf.image.rgb_to_grayscale(image) + # Cast before calling tf.histogram. + degenerate = tf.cast(degenerate, tf.int32) + + # Compute the grayscale histogram, then compute the mean pixel value, + # and create a constant image size of that value. Use that as the + # blending degenerate target of the original image. + hist = tf.histogram_fixed_width(degenerate, [0, 255], nbins=256) + mean = tf.reduce_sum(tf.cast(hist, tf.float32)) / 256.0 + degenerate = tf.ones_like(degenerate, dtype=tf.float32) * mean + degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) + degenerate = tf.image.grayscale_to_rgb(tf.cast(degenerate, tf.uint8)) + return blend(degenerate, image, factor) + + +def brightness(image: tf.Tensor, factor: float) -> tf.Tensor: + """Equivalent of PIL Brightness.""" + degenerate = tf.zeros_like(image) + return blend(degenerate, image, factor) + + +def posterize(image: tf.Tensor, bits: int) -> tf.Tensor: + """Equivalent of PIL Posterize.""" + shift = 8 - bits + return tf.bitwise.left_shift(tf.bitwise.right_shift(image, shift), shift) + + +def wrapped_rotate(image: tf.Tensor, degrees: float, replace: int) -> tf.Tensor: + """Applies rotation with wrap/unwrap.""" + image = rotate(wrap(image), degrees=degrees) + return unwrap(image, replace) + + +def translate_x(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor: + """Equivalent of PIL Translate in X dimension.""" + image = translate(wrap(image), [-pixels, 0]) + return unwrap(image, replace) + + +def translate_y(image: tf.Tensor, pixels: int, replace: int) -> tf.Tensor: + """Equivalent of PIL Translate in Y dimension.""" + image = translate(wrap(image), [0, -pixels]) + return unwrap(image, replace) + + +def shear_x(image: tf.Tensor, level: float, replace: int) -> tf.Tensor: + """Equivalent of PIL Shearing in X dimension.""" + # Shear parallel to x axis is a projective transform + # with a matrix form of: + # [1 level + # 0 1]. + image = transform( + image=wrap(image), transforms=[1., level, 0., 0., 1., 0., 0., 0.]) + return unwrap(image, replace) + + +def shear_y(image: tf.Tensor, level: float, replace: int) -> tf.Tensor: + """Equivalent of PIL Shearing in Y dimension.""" + # Shear parallel to y axis is a projective transform + # with a matrix form of: + # [1 0 + # level 1]. + image = transform( + image=wrap(image), transforms=[1., 0., 0., level, 1., 0., 0., 0.]) + return unwrap(image, replace) + + +def autocontrast(image: tf.Tensor) -> tf.Tensor: + """Implements Autocontrast function from PIL using TF ops. + + Args: + image: A 3D uint8 tensor. + + Returns: + The image after it has had autocontrast applied to it and will be of type + uint8. + """ + + def scale_channel(image: tf.Tensor) -> tf.Tensor: + """Scale the 2D image using the autocontrast rule.""" + # A possibly cheaper version can be done using cumsum/unique_with_counts + # over the histogram values, rather than iterating over the entire image. + # to compute mins and maxes. 
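+    # `lo` and `hi` are the channel's min and max; the rescale below maps
+    # `lo` to 0 and `hi` to 255.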
+ lo = tf.cast(tf.reduce_min(image), tf.float32) + hi = tf.cast(tf.reduce_max(image), tf.float32) + + # Scale the image, making the lowest value 0 and the highest value 255. + def scale_values(im): + scale = 255.0 / (hi - lo) + offset = -lo * scale + im = tf.cast(im, tf.float32) * scale + offset + im = tf.clip_by_value(im, 0.0, 255.0) + return tf.cast(im, tf.uint8) + + result = tf.cond(hi > lo, lambda: scale_values(image), lambda: image) + return result + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. + s1 = scale_channel(image[:, :, 0]) + s2 = scale_channel(image[:, :, 1]) + s3 = scale_channel(image[:, :, 2]) + image = tf.stack([s1, s2, s3], 2) + return image + + +def sharpness(image: tf.Tensor, factor: float) -> tf.Tensor: + """Implements Sharpness function from PIL using TF ops.""" + orig_image = image + image = tf.cast(image, tf.float32) + # Make image 4D for conv operation. + image = tf.expand_dims(image, 0) + # SMOOTH PIL Kernel. + kernel = tf.constant([[1, 1, 1], [1, 5, 1], [1, 1, 1]], + dtype=tf.float32, + shape=[3, 3, 1, 1]) / 13. + # Tile across channel dimension. + kernel = tf.tile(kernel, [1, 1, 3, 1]) + strides = [1, 1, 1, 1] + degenerate = tf.nn.depthwise_conv2d( + image, kernel, strides, padding='VALID', dilations=[1, 1]) + degenerate = tf.clip_by_value(degenerate, 0.0, 255.0) + degenerate = tf.squeeze(tf.cast(degenerate, tf.uint8), [0]) + + # For the borders of the resulting image, fill in the values of the + # original image. + mask = tf.ones_like(degenerate) + padded_mask = tf.pad(mask, [[1, 1], [1, 1], [0, 0]]) + padded_degenerate = tf.pad(degenerate, [[1, 1], [1, 1], [0, 0]]) + result = tf.where(tf.equal(padded_mask, 1), padded_degenerate, orig_image) + + # Blend the final result. + return blend(result, orig_image, factor) + + +def equalize(image: tf.Tensor) -> tf.Tensor: + """Implements Equalize function from PIL using TF ops.""" + + def scale_channel(im, c): + """Scale the data in the channel to implement equalize.""" + im = tf.cast(im[:, :, c], tf.int32) + # Compute the histogram of the image channel. + histo = tf.histogram_fixed_width(im, [0, 255], nbins=256) + + # For the purposes of computing the step, filter out the nonzeros. + nonzero = tf.where(tf.not_equal(histo, 0)) + nonzero_histo = tf.reshape(tf.gather(histo, nonzero), [-1]) + step = (tf.reduce_sum(nonzero_histo) - nonzero_histo[-1]) // 255 + + def build_lut(histo, step): + # Compute the cumulative sum, shifting by step // 2 + # and then normalization by step. + lut = (tf.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. + lut = tf.concat([[0], lut[:-1]], 0) + # Clip the counts to be in range. This is done + # in the C code for image.point. + return tf.clip_by_value(lut, 0, 255) + + # If step is zero, return the original image. Otherwise, build + # lut from the full histogram and step and then index from it. + result = tf.cond( + tf.equal(step, 0), lambda: im, + lambda: tf.gather(build_lut(histo, step), im)) + + return tf.cast(result, tf.uint8) + + # Assumes RGB for now. Scales each channel independently + # and then stacks the result. 
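+  # Each channel is equalized independently using the lookup table built from
+  # that channel's histogram in `scale_channel` above.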
+ s1 = scale_channel(image, 0) + s2 = scale_channel(image, 1) + s3 = scale_channel(image, 2) + image = tf.stack([s1, s2, s3], 2) + return image + + +def invert(image: tf.Tensor) -> tf.Tensor: + """Inverts the image pixels.""" + image = tf.convert_to_tensor(image) + return 255 - image + + +def wrap(image: tf.Tensor) -> tf.Tensor: + """Returns 'image' with an extra channel set to all 1s.""" + shape = tf.shape(image) + extended_channel = tf.ones([shape[0], shape[1], 1], image.dtype) + extended = tf.concat([image, extended_channel], axis=2) + return extended + + +def unwrap(image: tf.Tensor, replace: int) -> tf.Tensor: + """Unwraps an image produced by wrap. + + Where there is a 0 in the last channel for every spatial position, + the rest of the three channels in that spatial dimension are grayed + (set to 128). Operations like translate and shear on a wrapped + Tensor will leave 0s in empty locations. Some transformations look + at the intensity of values to do preprocessing, and we want these + empty pixels to assume the 'average' value, rather than pure black. + + + Args: + image: A 3D Image Tensor with 4 channels. + replace: A one or three value 1D tensor to fill empty pixels. + + Returns: + image: A 3D image Tensor with 3 channels. + """ + image_shape = tf.shape(image) + # Flatten the spatial dimensions. + flattened_image = tf.reshape(image, [-1, image_shape[2]]) + + # Find all pixels where the last channel is zero. + alpha_channel = tf.expand_dims(flattened_image[:, 3], axis=-1) + + replace = tf.concat([replace, tf.ones([1], image.dtype)], 0) + + # Where they are zero, fill them in with 'replace'. + flattened_image = tf.where( + tf.equal(alpha_channel, 0), + tf.ones_like(flattened_image, dtype=image.dtype) * replace, + flattened_image) + + image = tf.reshape(flattened_image, image_shape) + image = tf.slice(image, [0, 0, 0], [image_shape[0], image_shape[1], 3]) + return image + + +def _randomly_negate_tensor(tensor): + """With 50% prob turn the tensor negative.""" + should_flip = tf.cast(tf.floor(tf.random.uniform([]) + 0.5), tf.bool) + final_tensor = tf.cond(should_flip, lambda: tensor, lambda: -tensor) + return final_tensor + + +def _rotate_level_to_arg(level: float): + level = (level / _MAX_LEVEL) * 30. + level = _randomly_negate_tensor(level) + return (level,) + + +def _shrink_level_to_arg(level: float): + """Converts level to ratio by which we shrink the image content.""" + if level == 0: + return (1.0,) # if level is zero, do not shrink the image + # Maximum shrinking ratio is 2.9. + level = 2. / (_MAX_LEVEL / level) + 0.9 + return (level,) + + +def _enhance_level_to_arg(level: float): + return ((level / _MAX_LEVEL) * 1.8 + 0.1,) + + +def _shear_level_to_arg(level: float): + level = (level / _MAX_LEVEL) * 0.3 + # Flip level to negative with 50% chance. + level = _randomly_negate_tensor(level) + return (level,) + + +def _translate_level_to_arg(level: float, translate_const: float): + level = (level / _MAX_LEVEL) * float(translate_const) + # Flip level to negative with 50% chance. + level = _randomly_negate_tensor(level) + return (level,) + + +def _mult_to_arg(level: float, multiplier: float = 1.): + return (int((level / _MAX_LEVEL) * multiplier),) + + +def _apply_func_with_prob(func: Any, image: tf.Tensor, args: Any, prob: float): + """Apply `func` to image w/ `args` as input with probability `prob`.""" + assert isinstance(args, tuple) + + # Apply the function with probability `prob`. 
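+  # floor(U + prob) with U ~ Uniform[0, 1) equals 1 with probability `prob`,
+  # so the cond below applies `func` that fraction of the time.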
+ should_apply_op = tf.cast( + tf.floor(tf.random.uniform([], dtype=tf.float32) + prob), tf.bool) + augmented_image = tf.cond(should_apply_op, lambda: func(image, *args), + lambda: image) + return augmented_image + + +def select_and_apply_random_policy(policies: Any, image: tf.Tensor): + """Select a random policy from `policies` and apply it to `image`.""" + policy_to_select = tf.random.uniform([], maxval=len(policies), dtype=tf.int32) + # Note that using tf.case instead of tf.conds would result in significantly + # larger graphs and would even break export for some larger policies. + for (i, policy) in enumerate(policies): + image = tf.cond( + tf.equal(i, policy_to_select), + lambda selected_policy=policy: selected_policy(image), + lambda: image) + return image + + +NAME_TO_FUNC = { + 'AutoContrast': autocontrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': wrapped_rotate, + 'Posterize': posterize, + 'Solarize': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'Contrast': contrast, + 'Brightness': brightness, + 'Sharpness': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x, + 'TranslateY': translate_y, + 'Cutout': cutout, +} + +# Functions that have a 'replace' parameter +REPLACE_FUNCS = frozenset({ + 'Rotate', + 'TranslateX', + 'ShearX', + 'ShearY', + 'TranslateY', + 'Cutout', +}) + + +def level_to_arg(cutout_const: float, translate_const: float): + """Creates a dict mapping image operation names to their arguments.""" + + no_arg = lambda level: () + posterize_arg = lambda level: _mult_to_arg(level, 4) + solarize_arg = lambda level: _mult_to_arg(level, 256) + solarize_add_arg = lambda level: _mult_to_arg(level, 110) + cutout_arg = lambda level: _mult_to_arg(level, cutout_const) + translate_arg = lambda level: _translate_level_to_arg(level, translate_const) + + args = { + 'AutoContrast': no_arg, + 'Equalize': no_arg, + 'Invert': no_arg, + 'Rotate': _rotate_level_to_arg, + 'Posterize': posterize_arg, + 'Solarize': solarize_arg, + 'SolarizeAdd': solarize_add_arg, + 'Color': _enhance_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'Cutout': cutout_arg, + 'TranslateX': translate_arg, + 'TranslateY': translate_arg, + } + return args + + +def _parse_policy_info(name: Text, prob: float, level: float, + replace_value: List[int], cutout_const: float, + translate_const: float) -> Tuple[Any, float, Any]: + """Return the function that corresponds to `name` and update `level` param.""" + func = NAME_TO_FUNC[name] + args = level_to_arg(cutout_const, translate_const)[name](level) + + if name in REPLACE_FUNCS: + # Add in replace arg if it is required for the function that is called. + args = tuple(list(args) + [replace_value]) + + return func, prob, args + + +class ImageAugment(object): + """Image augmentation class for applying image distortions.""" + + def distort(self, image: tf.Tensor) -> tf.Tensor: + """Given an image tensor, returns a distorted image with the same shape. + + Args: + image: `Tensor` of shape [height, width, 3] representing an image. + + Returns: + The augmented version of `image`. + """ + raise NotImplementedError() + + +class AutoAugment(ImageAugment): + """Applies the AutoAugment policy to images. + + AutoAugment is from the paper: https://arxiv.org/abs/1805.09501. 
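+
+  A minimal usage sketch (illustrative; assumes a uint8 image tensor of shape
+  [height, width, 3]):
+
+    augmenter = AutoAugment(augmentation_name='v0')
+    image = augmenter.distort(image)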
+ """ + + def __init__(self, + augmentation_name: Text = 'v0', + policies: Optional[Dict[Text, Any]] = None, + cutout_const: float = 100, + translate_const: float = 250): + """Applies the AutoAugment policy to images. + + Args: + augmentation_name: The name of the AutoAugment policy to use. The + available options are `v0` and `test`. `v0` is the policy used for all + of the results in the paper and was found to achieve the best results on + the COCO dataset. `v1`, `v2` and `v3` are additional good policies found + on the COCO dataset that have slight variation in what operations were + used during the search procedure along with how many operations are + applied in parallel to a single image (2 vs 3). + policies: list of lists of tuples in the form `(func, prob, level)`, + `func` is a string name of the augmentation function, `prob` is the + probability of applying the `func` operation, `level` is the input + argument for `func`. + cutout_const: multiplier for applying cutout. + translate_const: multiplier for applying translation. + """ + super(AutoAugment, self).__init__() + + if policies is None: + self.available_policies = { + 'v0': self.policy_v0(), + 'test': self.policy_test(), + 'simple': self.policy_simple(), + } + + if augmentation_name not in self.available_policies: + raise ValueError( + 'Invalid augmentation_name: {}'.format(augmentation_name)) + + self.augmentation_name = augmentation_name + self.policies = self.available_policies[augmentation_name] + self.cutout_const = float(cutout_const) + self.translate_const = float(translate_const) + + def distort(self, image: tf.Tensor) -> tf.Tensor: + """Applies the AutoAugment policy to `image`. + + AutoAugment is from the paper: https://arxiv.org/abs/1805.09501. + + Args: + image: `Tensor` of shape [height, width, 3] representing an image. + + Returns: + A version of image that now has data augmentation applied to it based on + the `policies` pass into the function. + """ + input_image_type = image.dtype + + if input_image_type != tf.uint8: + image = tf.clip_by_value(image, 0.0, 255.0) + image = tf.cast(image, dtype=tf.uint8) + + replace_value = [128] * 3 + + # func is the string name of the augmentation function, prob is the + # probability of applying the operation and level is the parameter + # associated with the tf op. + + # tf_policies are functions that take in an image and return an augmented + # image. + tf_policies = [] + for policy in self.policies: + tf_policy = [] + # Link string name to the correct python function and make sure the + # correct argument is passed into that function. + for policy_info in policy: + policy_info = list(policy_info) + [ + replace_value, self.cutout_const, self.translate_const + ] + tf_policy.append(_parse_policy_info(*policy_info)) + # Now build the tf policy that will apply the augmentation procedue + # on image. + def make_final_policy(tf_policy_): + + def final_policy(image_): + for func, prob, args in tf_policy_: + image_ = _apply_func_with_prob(func, image_, args, prob) + return image_ + + return final_policy + + tf_policies.append(make_final_policy(tf_policy)) + + image = select_and_apply_random_policy(tf_policies, image) + image = tf.cast(image, dtype=input_image_type) + return image + + @staticmethod + def policy_v0(): + """Autoaugment policy that was used in AutoAugment Paper. + + Each tuple is an augmentation operation of the form + (operation, probability, magnitude). Each element in policy is a + sub-policy that will be applied sequentially on the image. 
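+    One sub-policy is selected uniformly at random for each image.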
+ + Returns: + the policy. + """ + + # TODO(dankondratyuk): tensorflow_addons defines custom ops, which + # for some reason are not included when building/linking + # This results in the error, "Op type not registered + # 'Addons>ImageProjectiveTransformV2' in binary" when running on borg TPUs + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateY', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + return policy + + @staticmethod + def policy_simple(): + """Same as `policy_v0`, except with custom ops removed.""" + + policy = [ + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + ] + return policy + + @staticmethod + def policy_test(): + """Autoaugment test policy for debugging.""" + policy = [ + [('TranslateX', 1.0, 4), ('Equalize', 1.0, 10)], + ] + return policy + + +class RandAugment(ImageAugment): + """Applies the RandAugment policy to images. + + RandAugment is from the paper https://arxiv.org/abs/1909.13719, + """ + + def __init__(self, + num_layers: int = 2, + magnitude: float = 10., + cutout_const: float = 40., + translate_const: float = 100.): + """Applies the RandAugment policy to images. + + Args: + num_layers: Integer, the number of augmentation transformations to apply + sequentially to an image. Represented as (N) in the paper. Usually best + values will be in the range [1, 3]. + magnitude: Integer, shared magnitude across all augmentation operations. + Represented as (M) in the paper. Usually best values are in the range + [5, 10]. + cutout_const: multiplier for applying cutout. + translate_const: multiplier for applying translation. 
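+
+    A minimal usage sketch (illustrative; assumes a uint8 image tensor of
+    shape [height, width, 3]):
+
+      augmenter = RandAugment(num_layers=2, magnitude=10)
+      image = augmenter.distort(image)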
+ """ + super(RandAugment, self).__init__() + + self.num_layers = num_layers + self.magnitude = float(magnitude) + self.cutout_const = float(cutout_const) + self.translate_const = float(translate_const) + self.available_ops = [ + 'AutoContrast', 'Equalize', 'Invert', 'Rotate', 'Posterize', 'Solarize', + 'Color', 'Contrast', 'Brightness', 'Sharpness', 'ShearX', 'ShearY', + 'TranslateX', 'TranslateY', 'Cutout', 'SolarizeAdd' + ] + + def distort(self, image: tf.Tensor) -> tf.Tensor: + """Applies the RandAugment policy to `image`. + + Args: + image: `Tensor` of shape [height, width, 3] representing an image. + + Returns: + The augmented version of `image`. + """ + input_image_type = image.dtype + + if input_image_type != tf.uint8: + image = tf.clip_by_value(image, 0.0, 255.0) + image = tf.cast(image, dtype=tf.uint8) + + replace_value = [128] * 3 + min_prob, max_prob = 0.2, 0.8 + + for _ in range(self.num_layers): + op_to_select = tf.random.uniform([], + maxval=len(self.available_ops) + 1, + dtype=tf.int32) + + branch_fns = [] + for (i, op_name) in enumerate(self.available_ops): + prob = tf.random.uniform([], + minval=min_prob, + maxval=max_prob, + dtype=tf.float32) + func, _, args = _parse_policy_info(op_name, prob, self.magnitude, + replace_value, self.cutout_const, + self.translate_const) + branch_fns.append(( + i, + # pylint:disable=g-long-lambda + lambda selected_func=func, selected_args=args: selected_func( + image, *selected_args))) + # pylint:enable=g-long-lambda + + image = tf.switch_case( + branch_index=op_to_select, + branch_fns=branch_fns, + default=lambda: tf.identity(image)) + + image = tf.cast(image, dtype=input_image_type) + return image diff --git a/cv/classification/resnet50/tensorflow2.0/augment_test.py b/cv/classification/resnet50/tensorflow2.0/augment_test.py new file mode 100644 index 000000000..dceb14eea --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/augment_test.py @@ -0,0 +1,130 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for autoaugment.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +from absl.testing import parameterized + +import tensorflow as tf + +from official.vision.image_classification import augment + + +def get_dtype_test_cases(): + return [ + ('uint8', tf.uint8), + ('int32', tf.int32), + ('float16', tf.float16), + ('float32', tf.float32), + ] + + +@parameterized.named_parameters(get_dtype_test_cases()) +class TransformsTest(parameterized.TestCase, tf.test.TestCase): + """Basic tests for fundamental transformations.""" + + def test_to_from_4d(self, dtype): + for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]: + original_ndims = len(shape) + image = tf.zeros(shape, dtype=dtype) + image_4d = augment.to_4d(image) + self.assertEqual(4, tf.rank(image_4d)) + self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims)) + + def test_transform(self, dtype): + image = tf.constant([[1, 2], [3, 4]], dtype=dtype) + self.assertAllEqual( + augment.transform(image, transforms=[1] * 8), [[4, 4], [4, 4]]) + + def test_translate(self, dtype): + image = tf.constant( + [[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]], dtype=dtype) + translations = [-1, -1] + translated = augment.translate(image=image, translations=translations) + expected = [[1, 0, 1, 1], [0, 1, 0, 0], [1, 0, 1, 1], [1, 0, 1, 1]] + self.assertAllEqual(translated, expected) + + def test_translate_shapes(self, dtype): + translation = [0, 0] + for shape in [(3, 3), (5, 5), (224, 224, 3)]: + image = tf.zeros(shape, dtype=dtype) + self.assertAllEqual(image, augment.translate(image, translation)) + + def test_translate_invalid_translation(self, dtype): + image = tf.zeros((1, 1), dtype=dtype) + invalid_translation = [[[1, 1]]] + with self.assertRaisesRegex(TypeError, 'rank 1 or 2'): + _ = augment.translate(image, invalid_translation) + + def test_rotate(self, dtype): + image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3)) + rotation = 90. + transformed = augment.rotate(image=image, degrees=rotation) + expected = [[2, 5, 8], [1, 4, 7], [0, 3, 6]] + self.assertAllEqual(transformed, expected) + + def test_rotate_shapes(self, dtype): + degrees = 0. 
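+    # A zero-degree rotation should leave an image of any shape unchanged.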
+ for shape in [(3, 3), (5, 5), (224, 224, 3)]: + image = tf.zeros(shape, dtype=dtype) + self.assertAllEqual(image, augment.rotate(image, degrees)) + + +class AutoaugmentTest(tf.test.TestCase): + + def test_autoaugment(self): + """Smoke test to be sure there are no syntax errors.""" + image = tf.zeros((224, 224, 3), dtype=tf.uint8) + + augmenter = augment.AutoAugment() + aug_image = augmenter.distort(image) + + self.assertEqual((224, 224, 3), aug_image.shape) + + def test_randaug(self): + """Smoke test to be sure there are no syntax errors.""" + image = tf.zeros((224, 224, 3), dtype=tf.uint8) + + augmenter = augment.RandAugment() + aug_image = augmenter.distort(image) + + self.assertEqual((224, 224, 3), aug_image.shape) + + def test_all_policy_ops(self): + """Smoke test to be sure all augmentation functions can execute.""" + + prob = 1 + magnitude = 10 + replace_value = [128] * 3 + cutout_const = 100 + translate_const = 250 + + image = tf.ones((224, 224, 3), dtype=tf.uint8) + + for op_name in augment.NAME_TO_FUNC: + func, _, args = augment._parse_policy_info(op_name, prob, magnitude, + replace_value, cutout_const, + translate_const) + image = func(image, *args) + + self.assertEqual((224, 224, 3), image.shape) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/callbacks.py b/cv/classification/resnet50/tensorflow2.0/callbacks.py new file mode 100644 index 000000000..a75f0911c --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/callbacks.py @@ -0,0 +1,257 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Common modules for callbacks.""" +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import os +from typing import Any, List, MutableMapping, Optional, Text + +from absl import logging +import tensorflow as tf + +from modeling import optimization +from utils.misc import keras_utils + + +def get_callbacks( + model_checkpoint: bool = True, + include_tensorboard: bool = True, + time_history: bool = True, + track_lr: bool = True, + write_model_weights: bool = True, + apply_moving_average: bool = False, + initial_step: int = 0, + batch_size: int = 0, + log_steps: int = 0, + model_dir: Optional[str] = None, + backup_and_restore: bool = False) -> List[tf.keras.callbacks.Callback]: + """Get all callbacks.""" + model_dir = model_dir or '' + callbacks = [] + if model_checkpoint: + ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}') + callbacks.append( + tf.keras.callbacks.ModelCheckpoint( + ckpt_full_path, save_weights_only=True, verbose=1)) + if backup_and_restore: + backup_dir = os.path.join(model_dir, 'tmp') + callbacks.append( + tf.keras.callbacks.experimental.BackupAndRestore(backup_dir)) + if include_tensorboard: + callbacks.append( + CustomTensorBoard( + log_dir=model_dir, + track_lr=track_lr, + initial_step=initial_step, + write_images=write_model_weights, + profile_batch=0)) + if time_history: + callbacks.append( + keras_utils.TimeHistory( + batch_size, + log_steps, + logdir=model_dir if include_tensorboard else None)) + if apply_moving_average: + # Save moving average model to a different file so that + # we can resume training from a checkpoint + ckpt_full_path = os.path.join(model_dir, 'average', + 'model.ckpt-{epoch:04d}') + callbacks.append( + AverageModelCheckpoint( + update_weights=False, + filepath=ckpt_full_path, + save_weights_only=True, + verbose=1)) + callbacks.append(MovingAverageCallback()) + return callbacks + + +def get_scalar_from_tensor(t: tf.Tensor) -> int: + """Utility function to convert a Tensor to a scalar.""" + t = tf.keras.backend.get_value(t) + if callable(t): + return t() + else: + return t + + +class CustomTensorBoard(tf.keras.callbacks.TensorBoard): + """A customized TensorBoard callback that tracks additional datapoints. + + Metrics tracked: + - Global learning rate + + Attributes: + log_dir: the path of the directory where to save the log files to be parsed + by TensorBoard. + track_lr: `bool`, whether or not to track the global learning rate. + initial_step: the initial step, used for preemption recovery. + **kwargs: Additional arguments for backwards compatibility. Possible key is + `period`. 
+ """ + + # TODO(b/146499062): track params, flops, log lr, l2 loss, + # classification loss + + def __init__(self, + log_dir: str, + track_lr: bool = False, + initial_step: int = 0, + **kwargs): + super(CustomTensorBoard, self).__init__(log_dir=log_dir, **kwargs) + self.step = initial_step + self._track_lr = track_lr + + def on_batch_begin(self, + epoch: int, + logs: Optional[MutableMapping[str, Any]] = None) -> None: + self.step += 1 + if logs is None: + logs = {} + logs.update(self._calculate_metrics()) + super(CustomTensorBoard, self).on_batch_begin(epoch, logs) + + def on_epoch_begin(self, + epoch: int, + logs: Optional[MutableMapping[str, Any]] = None) -> None: + if logs is None: + logs = {} + metrics = self._calculate_metrics() + logs.update(metrics) + for k, v in metrics.items(): + logging.info('Current %s: %f', k, v) + super(CustomTensorBoard, self).on_epoch_begin(epoch, logs) + + def on_epoch_end(self, + epoch: int, + logs: Optional[MutableMapping[str, Any]] = None) -> None: + if logs is None: + logs = {} + metrics = self._calculate_metrics() + logs.update(metrics) + super(CustomTensorBoard, self).on_epoch_end(epoch, logs) + + def _calculate_metrics(self) -> MutableMapping[str, Any]: + logs = {} + # TODO(b/149030439): disable LR reporting. + # if self._track_lr: + # logs['learning_rate'] = self._calculate_lr() + return logs + + def _calculate_lr(self) -> int: + """Calculates the learning rate given the current step.""" + return get_scalar_from_tensor( + self._get_base_optimizer()._decayed_lr(var_dtype=tf.float32)) # pylint:disable=protected-access + + def _get_base_optimizer(self) -> tf.keras.optimizers.Optimizer: + """Get the base optimizer used by the current model.""" + + optimizer = self.model.optimizer + + # The optimizer might be wrapped by another class, so unwrap it + while hasattr(optimizer, '_optimizer'): + optimizer = optimizer._optimizer # pylint:disable=protected-access + + return optimizer + + +class MovingAverageCallback(tf.keras.callbacks.Callback): + """A Callback to be used with a `ExponentialMovingAverage` optimizer. + + Applies moving average weights to the model during validation time to test + and predict on the averaged weights rather than the current model weights. + Once training is complete, the model weights will be overwritten with the + averaged weights (by default). + + Attributes: + overwrite_weights_on_train_end: Whether to overwrite the current model + weights with the averaged weights from the moving average optimizer. + **kwargs: Any additional callback arguments. + """ + + def __init__(self, overwrite_weights_on_train_end: bool = False, **kwargs): + super(MovingAverageCallback, self).__init__(**kwargs) + self.overwrite_weights_on_train_end = overwrite_weights_on_train_end + + def set_model(self, model: tf.keras.Model): + super(MovingAverageCallback, self).set_model(model) + assert isinstance(self.model.optimizer, + optimization.ExponentialMovingAverage) + self.model.optimizer.shadow_copy(self.model) + + def on_test_begin(self, logs: Optional[MutableMapping[Text, Any]] = None): + self.model.optimizer.swap_weights() + + def on_test_end(self, logs: Optional[MutableMapping[Text, Any]] = None): + self.model.optimizer.swap_weights() + + def on_train_end(self, logs: Optional[MutableMapping[Text, Any]] = None): + if self.overwrite_weights_on_train_end: + self.model.optimizer.assign_average_vars(self.model.variables) + + +class AverageModelCheckpoint(tf.keras.callbacks.ModelCheckpoint): + """Saves and, optionally, assigns the averaged weights. 
+ + Taken from tfa.callbacks.AverageModelCheckpoint. + + Attributes: + update_weights: If True, assign the moving average weights to the model, and + save them. If False, keep the old non-averaged weights, but the saved + model uses the average weights. See `tf.keras.callbacks.ModelCheckpoint` + for the other args. + """ + + def __init__(self, + update_weights: bool, + filepath: str, + monitor: str = 'val_loss', + verbose: int = 0, + save_best_only: bool = False, + save_weights_only: bool = False, + mode: str = 'auto', + save_freq: str = 'epoch', + **kwargs): + self.update_weights = update_weights + super().__init__(filepath, monitor, verbose, save_best_only, + save_weights_only, mode, save_freq, **kwargs) + + def set_model(self, model): + if not isinstance(model.optimizer, optimization.ExponentialMovingAverage): + raise TypeError('AverageModelCheckpoint is only used when training' + 'with MovingAverage') + return super().set_model(model) + + def _save_model(self, epoch, logs): + assert isinstance(self.model.optimizer, + optimization.ExponentialMovingAverage) + + if self.update_weights: + self.model.optimizer.assign_average_vars(self.model.variables) + return super()._save_model(epoch, logs) + else: + # Note: `model.get_weights()` gives us the weights (non-ref) + # whereas `model.variables` returns references to the variables. + non_avg_weights = self.model.get_weights() + self.model.optimizer.assign_average_vars(self.model.variables) + # result is currently None, since `super._save_model` doesn't + # return anything, but this may change in the future. + result = super()._save_model(epoch, logs) + self.model.set_weights(non_avg_weights) + return result diff --git a/cv/classification/resnet50/tensorflow2.0/classifier_trainer.py b/cv/classification/resnet50/tensorflow2.0/classifier_trainer.py new file mode 100644 index 000000000..37911b135 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/classifier_trainer.py @@ -0,0 +1,463 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Runs an Image Classification model.""" + +import os +import pprint +from typing import Any, Tuple, Text, Optional, Mapping + +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf +from common import distribute_utils +from modeling import hyperparams +from modeling import performance +from utils import hyperparams_flags +from utils.misc import keras_utils +import callbacks as custom_callbacks +import dataset_factory +import optimizer_factory +from configs import base_configs +from configs import configs +from efficientnet import efficientnet_model +from resnet import common +from resnet import resnet_model + + +def get_models() -> Mapping[str, tf.keras.Model]: + """Returns the mapping from model type name to Keras model.""" + return { + 'efficientnet': efficientnet_model.EfficientNet.from_name, + 'resnet': resnet_model.resnet50, + } + + +def get_dtype_map() -> Mapping[str, tf.dtypes.DType]: + """Returns the mapping from dtype string representations to TF dtypes.""" + return { + 'float32': tf.float32, + 'bfloat16': tf.bfloat16, + 'float16': tf.float16, + 'fp32': tf.float32, + 'bf16': tf.bfloat16, + } + + +def _get_metrics(one_hot: bool) -> Mapping[Text, Any]: + """Get a dict of available metrics to track.""" + if one_hot: + return { + # (name, metric_fn) + 'acc': + tf.keras.metrics.CategoricalAccuracy(name='accuracy'), + 'accuracy': + tf.keras.metrics.CategoricalAccuracy(name='accuracy'), + 'top_1': + tf.keras.metrics.CategoricalAccuracy(name='accuracy'), + 'top_5': + tf.keras.metrics.TopKCategoricalAccuracy( + k=5, name='top_5_accuracy'), + } + else: + return { + # (name, metric_fn) + 'acc': + tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'), + 'accuracy': + tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'), + 'top_1': + tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'), + 'top_5': + tf.keras.metrics.SparseTopKCategoricalAccuracy( + k=5, name='top_5_accuracy'), + } + + +def get_image_size_from_model( + params: base_configs.ExperimentConfig) -> Optional[int]: + """If the given model has a preferred image size, return it.""" + if params.model_name == 'efficientnet': + efficientnet_name = params.model.model_params.model_name + if efficientnet_name in efficientnet_model.MODEL_CONFIGS: + return efficientnet_model.MODEL_CONFIGS[efficientnet_name].resolution + return None + + +def _get_dataset_builders(params: base_configs.ExperimentConfig, + strategy: tf.distribute.Strategy, + one_hot: bool) -> Tuple[Any, Any]: + """Create and return train and validation dataset builders.""" + if one_hot: + logging.warning('label_smoothing > 0, so datasets will be one hot encoded.') + else: + logging.warning('label_smoothing not applied, so datasets will not be one ' + 'hot encoded.') + + num_devices = strategy.num_replicas_in_sync if strategy else 1 + + image_size = get_image_size_from_model(params) + + dataset_configs = [params.train_dataset, params.validation_dataset] + builders = [] + + for config in dataset_configs: + if config is not None and config.has_data: + builder = dataset_factory.DatasetBuilder( + config, + image_size=image_size or config.image_size, + num_devices=num_devices, + one_hot=one_hot) + else: + builder = None + builders.append(builder) + + return builders + + +def get_loss_scale(params: base_configs.ExperimentConfig, + fp16_default: float = 128.) 
-> float: + """Returns the loss scale for initializations.""" + loss_scale = params.runtime.loss_scale + if loss_scale == 'dynamic': + return loss_scale + elif loss_scale is not None: + return float(loss_scale) + elif (params.train_dataset.dtype == 'float32' or + params.train_dataset.dtype == 'bfloat16'): + return 1. + else: + assert params.train_dataset.dtype == 'float16' + return fp16_default + + +def _get_params_from_flags(flags_obj: flags.FlagValues): + """Get ParamsDict from flags.""" + model = flags_obj.model_type.lower() + dataset = flags_obj.dataset.lower() + params = configs.get_config(model=model, dataset=dataset) + + flags_overrides = { + 'model_dir': flags_obj.model_dir, + 'mode': flags_obj.mode, + 'model': { + 'name': model, + }, + 'runtime': { + 'run_eagerly': flags_obj.run_eagerly, + 'tpu': flags_obj.tpu, + }, + 'train_dataset': { + 'data_dir': flags_obj.data_dir, + }, + 'validation_dataset': { + 'data_dir': flags_obj.data_dir, + }, + 'train': { + 'time_history': { + 'log_steps': flags_obj.log_steps, + }, + }, + } + + overriding_configs = (flags_obj.config_file, flags_obj.params_override, + flags_overrides) + + pp = pprint.PrettyPrinter() + + logging.info('Base params: %s', pp.pformat(params.as_dict())) + + for param in overriding_configs: + logging.info('Overriding params: %s', param) + params = hyperparams.override_params_dict(params, param, is_strict=True) + + params.validate() + params.lock() + + logging.info('Final model parameters: %s', pp.pformat(params.as_dict())) + return params + + +def resume_from_checkpoint(model: tf.keras.Model, model_dir: str, + train_steps: int) -> int: + """Resumes from the latest checkpoint, if possible. + + Loads the model weights and optimizer settings from a checkpoint. + This function should be used in case of preemption recovery. + + Args: + model: The model whose weights should be restored. + model_dir: The directory where model weights were saved. + train_steps: The number of steps to train. + + Returns: + The epoch of the latest checkpoint, or 0 if not restoring. 
+ + """ + logging.info('Load from checkpoint is enabled.') + latest_checkpoint = tf.train.latest_checkpoint(model_dir) + logging.info('latest_checkpoint: %s', latest_checkpoint) + if not latest_checkpoint: + logging.info('No checkpoint detected.') + return 0 + + logging.info('Checkpoint file %s found and restoring from ' + 'checkpoint', latest_checkpoint) + model.load_weights(latest_checkpoint) + initial_epoch = model.optimizer.iterations // train_steps + logging.info('Completed loading from checkpoint.') + logging.info('Resuming from epoch %d', initial_epoch) + return int(initial_epoch) + + +def initialize(params: base_configs.ExperimentConfig, + dataset_builder: dataset_factory.DatasetBuilder): + """Initializes backend related initializations.""" + keras_utils.set_session_config(enable_xla=params.runtime.enable_xla) + performance.set_mixed_precision_policy(dataset_builder.dtype) + if tf.config.list_physical_devices('GPU'): + data_format = 'channels_first' + else: + data_format = 'channels_last' + tf.keras.backend.set_image_data_format(data_format) + if params.runtime.run_eagerly: + # Enable eager execution to allow step-by-step debugging + tf.config.experimental_run_functions_eagerly(True) + if tf.config.list_physical_devices('GPU'): + if params.runtime.gpu_thread_mode: + keras_utils.set_gpu_thread_mode_and_count( + per_gpu_thread_count=params.runtime.per_gpu_thread_count, + gpu_thread_mode=params.runtime.gpu_thread_mode, + num_gpus=params.runtime.num_gpus, + datasets_num_private_threads=params.runtime + .dataset_num_private_threads) # pylint:disable=line-too-long + if params.runtime.batchnorm_spatial_persistent: + os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' + + +def define_classifier_flags(): + """Defines common flags for image classification.""" + hyperparams_flags.initialize_common_flags() + flags.DEFINE_string( + 'data_dir', default=None, help='The location of the input data.') + flags.DEFINE_string( + 'mode', + default=None, + help='Mode to run: `train`, `eval`, `train_and_eval` or `export`.') + flags.DEFINE_bool( + 'run_eagerly', + default=None, + help='Use eager execution and disable autograph for debugging.') + flags.DEFINE_string( + 'model_type', + default=None, + help='The type of the model, e.g. EfficientNet, etc.') + flags.DEFINE_string( + 'dataset', + default=None, + help='The name of the dataset, e.g. 
ImageNet, etc.') + flags.DEFINE_integer( + 'log_steps', + default=100, + help='The interval of steps between logging of batch level stats.') + + +def serialize_config(params: base_configs.ExperimentConfig, model_dir: str): + """Serializes and saves the experiment config.""" + params_save_path = os.path.join(model_dir, 'params.yaml') + logging.info('Saving experiment configuration to %s', params_save_path) + tf.io.gfile.makedirs(model_dir) + hyperparams.save_params_dict_to_yaml(params, params_save_path) + + +def train_and_eval( + params: base_configs.ExperimentConfig, + strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]: + """Runs the train and eval path using compile/fit.""" + logging.info('Running train and eval.') + + distribute_utils.configure_cluster(params.runtime.worker_hosts, + params.runtime.task_index) + + # Note: for TPUs, strategy and scope should be created before the dataset + strategy = strategy_override or distribute_utils.get_distribution_strategy( + distribution_strategy=params.runtime.distribution_strategy, + all_reduce_alg=params.runtime.all_reduce_alg, + num_gpus=params.runtime.num_gpus, + tpu_address=params.runtime.tpu) + + strategy_scope = distribute_utils.get_strategy_scope(strategy) + + logging.info('Detected %d devices.', + strategy.num_replicas_in_sync if strategy else 1) + + label_smoothing = params.model.loss.label_smoothing + one_hot = label_smoothing and label_smoothing > 0 + + builders = _get_dataset_builders(params, strategy, one_hot) + datasets = [ + builder.build(strategy) if builder else None for builder in builders + ] + + # Unpack datasets and builders based on train/val/test splits + train_builder, validation_builder = builders # pylint: disable=unbalanced-tuple-unpacking + train_dataset, validation_dataset = datasets + + train_epochs = params.train.epochs + train_steps = params.train.steps or train_builder.num_steps + validation_steps = params.evaluation.steps or validation_builder.num_steps + + initialize(params, train_builder) + + logging.info('Global batch size: %d', train_builder.global_batch_size) + + with strategy_scope: + model_params = params.model.model_params.as_dict() + model = get_models()[params.model.name](**model_params) + learning_rate = optimizer_factory.build_learning_rate( + params=params.model.learning_rate, + batch_size=train_builder.global_batch_size, + train_epochs=train_epochs, + train_steps=train_steps) + optimizer = optimizer_factory.build_optimizer( + optimizer_name=params.model.optimizer.name, + base_learning_rate=learning_rate, + params=params.model.optimizer.as_dict(), + model=model) + optimizer = performance.configure_optimizer( + optimizer, + use_float16=train_builder.dtype == 'float16', + loss_scale=get_loss_scale(params)) + + metrics_map = _get_metrics(one_hot) + metrics = [metrics_map[metric] for metric in params.train.metrics] + steps_per_loop = train_steps if params.train.set_epoch_loop else 1 + + if one_hot: + loss_obj = tf.keras.losses.CategoricalCrossentropy( + label_smoothing=params.model.loss.label_smoothing) + else: + loss_obj = tf.keras.losses.SparseCategoricalCrossentropy() + model.compile( + optimizer=optimizer, + loss=loss_obj, + metrics=metrics, + steps_per_execution=steps_per_loop) + + initial_epoch = 0 + if params.train.resume_checkpoint: + initial_epoch = resume_from_checkpoint( + model=model, model_dir=params.model_dir, train_steps=train_steps) + + callbacks = custom_callbacks.get_callbacks( + model_checkpoint=params.train.callbacks.enable_checkpoint_and_export, + 
include_tensorboard=params.train.callbacks.enable_tensorboard, + time_history=params.train.callbacks.enable_time_history, + track_lr=params.train.tensorboard.track_lr, + write_model_weights=params.train.tensorboard.write_model_weights, + initial_step=initial_epoch * train_steps, + batch_size=train_builder.global_batch_size, + log_steps=params.train.time_history.log_steps, + model_dir=params.model_dir, + backup_and_restore=params.train.callbacks.enable_backup_and_restore) + + serialize_config(params=params, model_dir=params.model_dir) + + if params.evaluation.skip_eval: + validation_kwargs = {} + else: + validation_kwargs = { + 'validation_data': validation_dataset, + 'validation_steps': validation_steps, + 'validation_freq': params.evaluation.epochs_between_evals, + } + + history = model.fit( + train_dataset, + epochs=train_epochs, + steps_per_epoch=train_steps, + initial_epoch=initial_epoch, + callbacks=callbacks, + verbose=2, + **validation_kwargs) + + validation_output = None + if not params.evaluation.skip_eval: + validation_output = model.evaluate( + validation_dataset, steps=validation_steps, verbose=2) + + # TODO(dankondratyuk): eval and save final test accuracy + stats = common.build_stats(history, validation_output, callbacks) + return stats + + +def export(params: base_configs.ExperimentConfig): + """Runs the model export functionality.""" + logging.info('Exporting model.') + model_params = params.model.model_params.as_dict() + model = get_models()[params.model.name](**model_params) + checkpoint = params.export.checkpoint + if checkpoint is None: + logging.info('No export checkpoint was provided. Using the latest ' + 'checkpoint from model_dir.') + checkpoint = tf.train.latest_checkpoint(params.model_dir) + + model.load_weights(checkpoint) + model.save(params.export.destination) + + +def run(flags_obj: flags.FlagValues, + strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]: + """Runs Image Classification model using native Keras APIs. + + Args: + flags_obj: An object containing parsed flag values. + strategy_override: A `tf.distribute.Strategy` object to use for model. + + Returns: + Dictionary of training/eval stats + """ + params = _get_params_from_flags(flags_obj) + + try: + from dltest import show_training_arguments + show_training_arguments(flags_obj) + except: + pass + + if params.mode == 'train_and_eval': + return train_and_eval(params, strategy_override) + elif params.mode == 'export_only': + export(params) + else: + raise ValueError('{} is not a valid mode.'.format(params.mode)) + + +def main(_): + stats = run(flags.FLAGS) + if stats: + logging.info('Run stats:\n%s', stats) + + +if __name__ == '__main__': + logging.set_verbosity(logging.INFO) + define_classifier_flags() + flags.mark_flag_as_required('data_dir') + flags.mark_flag_as_required('mode') + flags.mark_flag_as_required('model_type') + flags.mark_flag_as_required('dataset') + + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/classifier_trainer_test.py b/cv/classification/resnet50/tensorflow2.0/classifier_trainer_test.py new file mode 100644 index 000000000..06227c154 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/classifier_trainer_test.py @@ -0,0 +1,240 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Unit tests for the classifier trainer models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import json + +import os +import sys + +from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional, Tuple + +from absl import flags +from absl.testing import flagsaver +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.utils.flags import core as flags_core +from official.vision.image_classification import classifier_trainer + + +classifier_trainer.define_classifier_flags() + + +def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]: + """Returns the combinations of end-to-end tests to run.""" + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + strategy_combinations.mirrored_strategy_with_two_gpus, + ], + model=[ + 'efficientnet', + 'resnet', + ], + dataset=[ + 'imagenet', + ], + ) + + +def get_params_override(params_override: Mapping[str, Any]) -> str: + """Converts params_override dict to string command.""" + return '--params_override=' + json.dumps(params_override) + + +def basic_params_override(dtype: str = 'float32') -> MutableMapping[str, Any]: + """Returns a basic parameter configuration for testing.""" + return { + 'train_dataset': { + 'builder': 'synthetic', + 'use_per_replica_batch_size': True, + 'batch_size': 1, + 'image_size': 224, + 'dtype': dtype, + }, + 'validation_dataset': { + 'builder': 'synthetic', + 'batch_size': 1, + 'use_per_replica_batch_size': True, + 'image_size': 224, + 'dtype': dtype, + }, + 'train': { + 'steps': 1, + 'epochs': 1, + 'callbacks': { + 'enable_checkpoint_and_export': True, + 'enable_tensorboard': False, + }, + }, + 'evaluation': { + 'steps': 1, + }, + } + + +@flagsaver.flagsaver +def run_end_to_end(main: Callable[[Any], None], + extra_flags: Optional[Iterable[str]] = None, + model_dir: Optional[str] = None): + """Runs the classifier trainer end-to-end.""" + extra_flags = [] if extra_flags is None else extra_flags + args = [sys.argv[0], '--model_dir', model_dir] + extra_flags + flags_core.parse_flags(argv=args) + main(flags.FLAGS) + + +class ClassifierTest(tf.test.TestCase, parameterized.TestCase): + """Unit tests for Keras models.""" + _tempdir = None + + @classmethod + def setUpClass(cls): # pylint: disable=invalid-name + super(ClassifierTest, cls).setUpClass() + + def tearDown(self): + super(ClassifierTest, self).tearDown() + tf.io.gfile.rmtree(self.get_temp_dir()) + + @combinations.generate(distribution_strategy_combinations()) + def test_end_to_end_train_and_eval(self, distribution, model, dataset): + """Test train_and_eval and export for Keras classifier models.""" + # Some parameters are not defined as flags (e.g. cannot run + # classifier_train.py --batch_size=...) by design, so use + # "--params_override=..." 
instead + model_dir = self.create_tempdir().full_path + base_flags = [ + '--data_dir=not_used', + '--model_type=' + model, + '--dataset=' + dataset, + ] + train_and_eval_flags = base_flags + [ + get_params_override(basic_params_override()), + '--mode=train_and_eval', + ] + + run = functools.partial( + classifier_trainer.run, strategy_override=distribution) + run_end_to_end( + main=run, extra_flags=train_and_eval_flags, model_dir=model_dir) + + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.one_device_strategy_gpu, + ], + model=[ + 'efficientnet', + 'resnet', + ], + dataset='imagenet', + dtype='float16', + )) + def test_gpu_train(self, distribution, model, dataset, dtype): + """Test train_and_eval and export for Keras classifier models.""" + # Some parameters are not defined as flags (e.g. cannot run + # classifier_train.py --batch_size=...) by design, so use + # "--params_override=..." instead + model_dir = self.create_tempdir().full_path + base_flags = [ + '--data_dir=not_used', + '--model_type=' + model, + '--dataset=' + dataset, + ] + train_and_eval_flags = base_flags + [ + get_params_override(basic_params_override(dtype)), + '--mode=train_and_eval', + ] + + export_params = basic_params_override() + export_path = os.path.join(model_dir, 'export') + export_params['export'] = {} + export_params['export']['destination'] = export_path + export_flags = base_flags + [ + '--mode=export_only', + get_params_override(export_params) + ] + + run = functools.partial( + classifier_trainer.run, strategy_override=distribution) + run_end_to_end( + main=run, extra_flags=train_and_eval_flags, model_dir=model_dir) + run_end_to_end(main=run, extra_flags=export_flags, model_dir=model_dir) + self.assertTrue(os.path.exists(export_path)) + + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.cloud_tpu_strategy, + ], + model=[ + 'efficientnet', + 'resnet', + ], + dataset='imagenet', + dtype='bfloat16', + )) + def test_tpu_train(self, distribution, model, dataset, dtype): + """Test train_and_eval and export for Keras classifier models.""" + # Some parameters are not defined as flags (e.g. cannot run + # classifier_train.py --batch_size=...) by design, so use + # "--params_override=..." 
instead + model_dir = self.create_tempdir().full_path + base_flags = [ + '--data_dir=not_used', + '--model_type=' + model, + '--dataset=' + dataset, + ] + train_and_eval_flags = base_flags + [ + get_params_override(basic_params_override(dtype)), + '--mode=train_and_eval', + ] + + run = functools.partial( + classifier_trainer.run, strategy_override=distribution) + run_end_to_end( + main=run, extra_flags=train_and_eval_flags, model_dir=model_dir) + + @combinations.generate(distribution_strategy_combinations()) + def test_end_to_end_invalid_mode(self, distribution, model, dataset): + """Test the Keras EfficientNet model with `strategy`.""" + model_dir = self.create_tempdir().full_path + extra_flags = [ + '--data_dir=not_used', + '--mode=invalid_mode', + '--model_type=' + model, + '--dataset=' + dataset, + get_params_override(basic_params_override()), + ] + + run = functools.partial( + classifier_trainer.run, strategy_override=distribution) + with self.assertRaises(ValueError): + run_end_to_end(main=run, extra_flags=extra_flags, model_dir=model_dir) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/classifier_trainer_util_test.py b/cv/classification/resnet50/tensorflow2.0/classifier_trainer_util_test.py new file mode 100644 index 000000000..d3624c286 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/classifier_trainer_util_test.py @@ -0,0 +1,166 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Unit tests for the classifier trainer models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import os + +from absl.testing import parameterized +import tensorflow as tf + +from official.vision.image_classification import classifier_trainer +from official.vision.image_classification import dataset_factory +from official.vision.image_classification import test_utils +from official.vision.image_classification.configs import base_configs + + +def get_trivial_model(num_classes: int) -> tf.keras.Model: + """Creates and compiles trivial model for ImageNet dataset.""" + model = test_utils.trivial_model(num_classes=num_classes) + lr = 0.01 + optimizer = tf.keras.optimizers.SGD(learning_rate=lr) + loss_obj = tf.keras.losses.SparseCategoricalCrossentropy() + model.compile(optimizer=optimizer, loss=loss_obj, run_eagerly=True) + return model + + +def get_trivial_data() -> tf.data.Dataset: + """Gets trivial data in the ImageNet size.""" + + def generate_data(_) -> tf.data.Dataset: + image = tf.zeros(shape=(224, 224, 3), dtype=tf.float32) + label = tf.zeros([1], dtype=tf.int32) + return image, label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.prefetch(buffer_size=1).batch(1) + return dataset + + +class UtilTests(parameterized.TestCase, tf.test.TestCase): + """Tests for individual utility functions within classifier_trainer.py.""" + + @parameterized.named_parameters( + ('efficientnet-b0', 'efficientnet', 'efficientnet-b0', 224), + ('efficientnet-b1', 'efficientnet', 'efficientnet-b1', 240), + ('efficientnet-b2', 'efficientnet', 'efficientnet-b2', 260), + ('efficientnet-b3', 'efficientnet', 'efficientnet-b3', 300), + ('efficientnet-b4', 'efficientnet', 'efficientnet-b4', 380), + ('efficientnet-b5', 'efficientnet', 'efficientnet-b5', 456), + ('efficientnet-b6', 'efficientnet', 'efficientnet-b6', 528), + ('efficientnet-b7', 'efficientnet', 'efficientnet-b7', 600), + ('resnet', 'resnet', '', None), + ) + def test_get_model_size(self, model, model_name, expected): + config = base_configs.ExperimentConfig( + model_name=model, + model=base_configs.ModelConfig( + model_params={ + 'model_name': model_name, + },)) + size = classifier_trainer.get_image_size_from_model(config) + self.assertEqual(size, expected) + + @parameterized.named_parameters( + ('dynamic', 'dynamic', None, 'dynamic'), + ('scalar', 128., None, 128.), + ('float32', None, 'float32', 1), + ('float16', None, 'float16', 128), + ) + def test_get_loss_scale(self, loss_scale, dtype, expected): + config = base_configs.ExperimentConfig( + runtime=base_configs.RuntimeConfig(loss_scale=loss_scale), + train_dataset=dataset_factory.DatasetConfig(dtype=dtype)) + ls = classifier_trainer.get_loss_scale(config, fp16_default=128) + self.assertEqual(ls, expected) + + @parameterized.named_parameters(('float16', 'float16'), + ('bfloat16', 'bfloat16')) + def test_initialize(self, dtype): + config = base_configs.ExperimentConfig( + runtime=base_configs.RuntimeConfig( + run_eagerly=False, + enable_xla=False, + per_gpu_thread_count=1, + gpu_thread_mode='gpu_private', + num_gpus=1, + dataset_num_private_threads=1, + ), + train_dataset=dataset_factory.DatasetConfig(dtype=dtype), + model=base_configs.ModelConfig(), + ) + + class EmptyClass: + pass + + fake_ds_builder = EmptyClass() + fake_ds_builder.dtype = dtype + 
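# A minimal stand-in builder: initialize() only reads the dtype and config
+ # attributes of the builder in this test (an assumption based on this
+ # test's usage), so no real dataset builder is constructed. +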
fake_ds_builder.config = EmptyClass() + classifier_trainer.initialize(config, fake_ds_builder) + + def test_resume_from_checkpoint(self): + """Tests functionality for resuming from checkpoint.""" + # Set the keras policy + tf.keras.mixed_precision.set_global_policy('mixed_bfloat16') + + # Get the model, datasets, and compile it. + model = get_trivial_model(10) + + # Create the checkpoint + model_dir = self.create_tempdir().full_path + train_epochs = 1 + train_steps = 10 + ds = get_trivial_data() + callbacks = [ + tf.keras.callbacks.ModelCheckpoint( + os.path.join(model_dir, 'model.ckpt-{epoch:04d}'), + save_weights_only=True) + ] + model.fit( + ds, + callbacks=callbacks, + epochs=train_epochs, + steps_per_epoch=train_steps) + + # Test load from checkpoint + clean_model = get_trivial_model(10) + weights_before_load = copy.deepcopy(clean_model.get_weights()) + initial_epoch = classifier_trainer.resume_from_checkpoint( + model=clean_model, model_dir=model_dir, train_steps=train_steps) + self.assertEqual(initial_epoch, 1) + self.assertNotAllClose(weights_before_load, clean_model.get_weights()) + + tf.io.gfile.rmtree(model_dir) + + def test_serialize_config(self): + """Tests functionality for serializing data.""" + config = base_configs.ExperimentConfig() + model_dir = self.create_tempdir().full_path + classifier_trainer.serialize_config(params=config, model_dir=model_dir) + saved_params_path = os.path.join(model_dir, 'params.yaml') + self.assertTrue(os.path.exists(saved_params_path)) + tf.io.gfile.rmtree(model_dir) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/common/__init__.py b/cv/classification/resnet50/tensorflow2.0/common/__init__.py new file mode 100644 index 000000000..a25710c22 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/common/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + diff --git a/cv/classification/resnet50/tensorflow2.0/common/dataset_fn.py b/cv/classification/resnet50/tensorflow2.0/common/dataset_fn.py new file mode 100644 index 000000000..4ac16a31b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/common/dataset_fn.py @@ -0,0 +1,42 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility library for picking an appropriate dataset function.""" + +from typing import Any, Callable, Union, Type + +import tensorflow as tf + +PossibleDatasetType = Union[Type[tf.data.Dataset], Callable[[tf.Tensor], Any]] + + +def pick_dataset_fn(file_type: str) -> PossibleDatasetType: + if file_type == 'tfrecord': + return tf.data.TFRecordDataset + + raise ValueError('Unrecognized file_type: {}'.format(file_type)) diff --git a/cv/classification/resnet50/tensorflow2.0/common/distribute_utils.py b/cv/classification/resnet50/tensorflow2.0/common/distribute_utils.py new file mode 100644 index 000000000..e2e05df9f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/common/distribute_utils.py @@ -0,0 +1,233 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper functions for running models in a distributed setting.""" + +import json +import os +import tensorflow as tf + + +def _collective_communication(all_reduce_alg): + """Return a CollectiveCommunication based on all_reduce_alg. + + Args: + all_reduce_alg: a string specifying which collective communication to pick, + or None. + + Returns: + tf.distribute.experimental.CollectiveCommunication object + + Raises: + ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"] + """ + collective_communication_options = { + None: tf.distribute.experimental.CollectiveCommunication.AUTO, + "ring": tf.distribute.experimental.CollectiveCommunication.RING, + "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL + } + if all_reduce_alg not in collective_communication_options: + raise ValueError( + "When used with `multi_worker_mirrored`, valid values for " + "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format( + all_reduce_alg)) + return collective_communication_options[all_reduce_alg] + + +def _mirrored_cross_device_ops(all_reduce_alg, num_packs): + """Return a CrossDeviceOps based on all_reduce_alg and num_packs. + + Args: + all_reduce_alg: a string specifying which cross device op to pick, or None. + num_packs: an integer specifying number of packs for the cross device op. + + Returns: + tf.distribute.CrossDeviceOps object or None. + + Raises: + ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"]. 
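+
+ For example, _mirrored_cross_device_ops("nccl", 2) returns
+ tf.distribute.NcclAllReduce(num_packs=2), per the mapping below.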
+ """ + if all_reduce_alg is None: + return None + mirrored_all_reduce_options = { + "nccl": tf.distribute.NcclAllReduce, + "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce + } + if all_reduce_alg not in mirrored_all_reduce_options: + raise ValueError( + "When used with `mirrored`, valid values for all_reduce_alg are " + "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format( + all_reduce_alg)) + cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg] + return cross_device_ops_class(num_packs=num_packs) + + +def tpu_initialize(tpu_address): + """Initializes TPU for TF 2.x training. + + Args: + tpu_address: string, bns address of master TPU worker. + + Returns: + A TPUClusterResolver. + """ + cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=tpu_address) + if tpu_address not in ("", "local"): + tf.config.experimental_connect_to_cluster(cluster_resolver) + tf.tpu.experimental.initialize_tpu_system(cluster_resolver) + return cluster_resolver + + +def get_distribution_strategy(distribution_strategy="mirrored", + num_gpus=0, + all_reduce_alg=None, + num_packs=1, + tpu_address=None, + **kwargs): + """Return a DistributionStrategy for running the model. + + Args: + distribution_strategy: a string specifying which distribution strategy to + use. Accepted values are "off", "one_device", "mirrored", + "parameter_server", "multi_worker_mirrored", and "tpu" -- case + insensitive. "tpu" means to use TPUStrategy using `tpu_address`. + "off" means to use the default strategy which is obtained from + tf.distribute.get_strategy (for details on the default strategy, see + https://www.tensorflow.org/guide/distributed_training#default_strategy). + num_gpus: Number of GPUs to run this model. + all_reduce_alg: Optional. Specifies which algorithm to use when performing + all-reduce. For `MirroredStrategy`, valid values are "nccl" and + "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are + "ring" and "nccl". If None, DistributionStrategy will choose based on + device topology. + num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce` + or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`. + tpu_address: Optional. String that represents TPU to connect to. Must not be + None if `distribution_strategy` is set to `tpu`. + **kwargs: Additional kwargs for internal usages. + + Returns: + tf.distribute.DistibutionStrategy object. + Raises: + ValueError: if `distribution_strategy` is "off" or "one_device" and + `num_gpus` is larger than 1; or `num_gpus` is negative or if + `distribution_strategy` is `tpu` but `tpu_address` is not specified. + """ + del kwargs + if num_gpus < 0: + raise ValueError("`num_gpus` can not be negative.") + + if not isinstance(distribution_strategy, str): + msg = ("distribution_strategy must be a string but got: %s." % + (distribution_strategy,)) + if distribution_strategy == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison + msg += (" If you meant to pass the string 'off', make sure you add " + "quotes around 'off' so that yaml interprets it as a string " + "instead of a bool.") + raise ValueError(msg) + + distribution_strategy = distribution_strategy.lower() + if distribution_strategy == "off": + if num_gpus > 1: + raise ValueError("When {} GPUs are specified, distribution_strategy " + "flag cannot be set to `off`.".format(num_gpus)) + # Return the default distribution strategy. 
+ return tf.distribute.get_strategy() + + if distribution_strategy == "tpu": + # When tpu_address is an empty string, we communicate with local TPUs. + cluster_resolver = tpu_initialize(tpu_address) + return tf.distribute.TPUStrategy(cluster_resolver) + + if distribution_strategy == "multi_worker_mirrored": + return tf.distribute.experimental.MultiWorkerMirroredStrategy( + communication=_collective_communication(all_reduce_alg)) + + if distribution_strategy == "one_device": + if num_gpus == 0: + return tf.distribute.OneDeviceStrategy("device:CPU:0") + if num_gpus > 1: + raise ValueError("`OneDeviceStrategy` can not be used for more than " + "one device.") + return tf.distribute.OneDeviceStrategy("device:GPU:0") + + if distribution_strategy == "mirrored": + if num_gpus == 0: + devices = ["device:CPU:0"] + else: + devices = ["device:GPU:%d" % i for i in range(num_gpus)] + return tf.distribute.MirroredStrategy( + devices=devices, + cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs)) + + if distribution_strategy == "parameter_server": + cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() + return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) + + raise ValueError("Unrecognized Distribution Strategy: %r" % + distribution_strategy) + + +def configure_cluster(worker_hosts=None, task_index=-1): + """Set multi-worker cluster spec in TF_CONFIG environment variable. + + Args: + worker_hosts: comma-separated list of worker ip:port pairs. + task_index: index of the worker. + + Returns: + Number of workers in the cluster. + """ + tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) + if tf_config: + num_workers = ( + len(tf_config["cluster"].get("chief", [])) + + len(tf_config["cluster"].get("worker", []))) + elif worker_hosts: + workers = worker_hosts.split(",") + num_workers = len(workers) + if num_workers > 1 and task_index < 0: + raise ValueError("Must specify task_index when number of workers > 1") + task_index = 0 if num_workers == 1 else task_index + os.environ["TF_CONFIG"] = json.dumps({ + "cluster": { + "worker": workers + }, + "task": { + "type": "worker", + "index": task_index + } + }) + else: + num_workers = 1 + return num_workers + + +def get_strategy_scope(strategy): + if strategy: + strategy_scope = strategy.scope() + else: + strategy_scope = DummyContextManager() + + return strategy_scope + + +class DummyContextManager(object): + + def __enter__(self): + pass + + def __exit__(self, *args): + pass diff --git a/cv/classification/resnet50/tensorflow2.0/common/distribute_utils_test.py b/cv/classification/resnet50/tensorflow2.0/common/distribute_utils_test.py new file mode 100644 index 000000000..ad717295e --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/common/distribute_utils_test.py @@ -0,0 +1,59 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Tests for distribution util functions.""" + +import tensorflow as tf + +from . 
import distribute_utils + + +class GetDistributionStrategyTest(tf.test.TestCase): + """Tests for get_distribution_strategy.""" + + def test_one_device_strategy_cpu(self): + ds = distribute_utils.get_distribution_strategy(num_gpus=0) + self.assertEquals(ds.num_replicas_in_sync, 1) + self.assertEquals(len(ds.extended.worker_devices), 1) + self.assertIn('CPU', ds.extended.worker_devices[0]) + + def test_one_device_strategy_gpu(self): + ds = distribute_utils.get_distribution_strategy(num_gpus=1) + self.assertEquals(ds.num_replicas_in_sync, 1) + self.assertEquals(len(ds.extended.worker_devices), 1) + self.assertIn('GPU', ds.extended.worker_devices[0]) + + def test_mirrored_strategy(self): + ds = distribute_utils.get_distribution_strategy(num_gpus=5) + self.assertEquals(ds.num_replicas_in_sync, 5) + self.assertEquals(len(ds.extended.worker_devices), 5) + for device in ds.extended.worker_devices: + self.assertIn('GPU', device) + + def test_no_strategy(self): + ds = distribute_utils.get_distribution_strategy('off') + self.assertIs(ds, tf.distribute.get_strategy()) + + def test_invalid_strategy(self): + with self.assertRaisesRegexp( + ValueError, + 'distribution_strategy must be a string but got: False. If'): + distribute_utils.get_distribution_strategy(False) + with self.assertRaisesRegexp( + ValueError, 'distribution_strategy must be a string but got: 1'): + distribute_utils.get_distribution_strategy(1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/common/flags.py b/cv/classification/resnet50/tensorflow2.0/common/flags.py new file mode 100644 index 000000000..01ddf57af --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/common/flags.py @@ -0,0 +1,110 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The central place to define flags.""" + +from absl import flags + + +def define_flags(): + """Defines flags. + + All flags are defined as optional, but in practice most models use some of + these flags and so mark_flags_as_required() should be called after calling + this function. Typically, 'experiment', 'mode', and 'model_dir' are required. + For example: + + ``` + from absl import flags + from official.common import flags as tfm_flags # pylint: disable=line-too-long + ... + tfm_flags.define_flags() + flags.mark_flags_as_required(['experiment', 'mode', 'model_dir']) + ``` + + The reason all flags are optional is because unit tests often do not set or + use any of the flags. 
+ """ + flags.DEFINE_string( + 'experiment', default=None, help= + 'The experiment type registered, specifying an ExperimentConfig.') + + flags.DEFINE_enum( + 'mode', + default=None, + enum_values=[ + 'train', 'eval', 'train_and_eval', 'continuous_eval', + 'continuous_train_and_eval', 'train_and_validate' + ], + help='Mode to run: `train`, `eval`, `train_and_eval`, ' + '`continuous_eval`, `continuous_train_and_eval` and ' + '`train_and_validate` (which is not implemented in ' + 'the open source version).') + + flags.DEFINE_string( + 'model_dir', + default=None, + help='The directory where the model and training/evaluation summaries' + 'are stored.') + + flags.DEFINE_multi_string( + 'config_file', + default=None, + help='YAML/JSON files which specifies overrides. The override order ' + 'follows the order of args. Note that each file ' + 'can be used as an override template to override the default parameters ' + 'specified in Python. If the same parameter is specified in both ' + '`--config_file` and `--params_override`, `config_file` will be used ' + 'first, followed by params_override.') + + flags.DEFINE_string( + 'params_override', + default=None, + help='a YAML/JSON string or a YAML file which specifies additional ' + 'overrides over the default parameters and those specified in ' + '`--config_file`. Note that this is supposed to be used only to override ' + 'the model parameters, but not the parameters like TPU specific flags. ' + 'One canonical use case of `--config_file` and `--params_override` is ' + 'users first define a template config file using `--config_file`, then ' + 'use `--params_override` to adjust the minimal set of tuning parameters, ' + 'for example setting up different `train_batch_size`. The final override ' + 'order of parameters: default_model_params --> params from config_file ' + '--> params in params_override. See also the help message of ' + '`--config_file`.') + + # The libraries rely on gin often make mistakes that include flags inside + # the library files which causes conflicts. + try: + flags.DEFINE_multi_string( + 'gin_file', default=None, help='List of paths to the config files.') + except flags.DuplicateFlagError: + pass + + try: + flags.DEFINE_multi_string( + 'gin_params', + default=None, + help='Newline separated list of Gin parameter bindings.') + except flags.DuplicateFlagError: + pass + + flags.DEFINE_string( + 'tpu', + default=None, + help='The Cloud TPU to use for training. This should be either the name ' + 'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 ' + 'url.') + + flags.DEFINE_string( + 'tf_data_service', default=None, help='The tf.data service address') diff --git a/cv/classification/resnet50/tensorflow2.0/common/registry_imports.py b/cv/classification/resnet50/tensorflow2.0/common/registry_imports.py new file mode 100644 index 000000000..dea4ee934 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/common/registry_imports.py @@ -0,0 +1,18 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""All necessary imports for registration.""" +# pylint: disable=unused-import +from utils.testing import mock_task +# from official.vision import beta diff --git a/cv/classification/resnet50/tensorflow2.0/configs/__init__.py b/cv/classification/resnet50/tensorflow2.0/configs/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/configs/base_configs.py b/cv/classification/resnet50/tensorflow2.0/configs/base_configs.py new file mode 100644 index 000000000..c31d21fd0 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/base_configs.py @@ -0,0 +1,227 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Definitions for high level configuration groups..""" +from typing import Any, List, Mapping, Optional + +import dataclasses + +from core import config_definitions +from modeling import hyperparams +from modeling.hyperparams import config_definitions as legacy_cfg + +CallbacksConfig = legacy_cfg.CallbacksConfig +TensorboardConfig = legacy_cfg.TensorboardConfig +RuntimeConfig = config_definitions.RuntimeConfig + + +@dataclasses.dataclass +class ExportConfig(hyperparams.Config): + """Configuration for exports. + + Attributes: + checkpoint: the path to the checkpoint to export. + destination: the path to where the checkpoint should be exported. + """ + checkpoint: str = None + destination: str = None + + +@dataclasses.dataclass +class MetricsConfig(hyperparams.Config): + """Configuration for Metrics. + + Attributes: + accuracy: Whether or not to track accuracy as a Callback. Defaults to None. + top_5: Whether or not to track top_5_accuracy as a Callback. Defaults to + None. + """ + accuracy: bool = None + top_5: bool = None + + +@dataclasses.dataclass +class TimeHistoryConfig(hyperparams.Config): + """Configuration for the TimeHistory callback. + + Attributes: + log_steps: Interval of steps between logging of batch level stats. + """ + log_steps: int = None + + +@dataclasses.dataclass +class TrainConfig(hyperparams.Config): + """Configuration for training. 
+ + Attributes: + resume_checkpoint: Whether or not to enable load checkpoint loading. + Defaults to None. + epochs: The number of training epochs to run. Defaults to None. + steps: The number of steps to run per epoch. If None, then this will be + inferred based on the number of images and batch size. Defaults to None. + callbacks: An instance of CallbacksConfig. + metrics: An instance of MetricsConfig. + tensorboard: An instance of TensorboardConfig. + set_epoch_loop: Whether or not to set `steps_per_execution` to + equal the number of training steps in `model.compile`. This reduces the + number of callbacks run per epoch which significantly improves end-to-end + TPU training time. + """ + resume_checkpoint: bool = None + epochs: int = None + steps: int = None + callbacks: CallbacksConfig = CallbacksConfig() + metrics: MetricsConfig = None + tensorboard: TensorboardConfig = TensorboardConfig() + time_history: TimeHistoryConfig = TimeHistoryConfig() + set_epoch_loop: bool = False + + +@dataclasses.dataclass +class EvalConfig(hyperparams.Config): + """Configuration for evaluation. + + Attributes: + epochs_between_evals: The number of train epochs to run between evaluations. + Defaults to None. + steps: The number of eval steps to run during evaluation. If None, this will + be inferred based on the number of images and batch size. Defaults to + None. + skip_eval: Whether or not to skip evaluation. + """ + epochs_between_evals: int = None + steps: int = None + skip_eval: bool = False + + +@dataclasses.dataclass +class LossConfig(hyperparams.Config): + """Configuration for Loss. + + Attributes: + name: The name of the loss. Defaults to None. + label_smoothing: Whether or not to apply label smoothing to the loss. This + only applies to 'categorical_cross_entropy'. + """ + name: str = None + label_smoothing: float = None + + +@dataclasses.dataclass +class OptimizerConfig(hyperparams.Config): + """Configuration for Optimizers. + + Attributes: + name: The name of the optimizer. Defaults to None. + decay: Decay or rho, discounting factor for gradient. Defaults to None. + epsilon: Small value used to avoid 0 denominator. Defaults to None. + momentum: Plain momentum constant. Defaults to None. + nesterov: Whether or not to apply Nesterov momentum. Defaults to None. + moving_average_decay: The amount of decay to apply. If 0 or None, then + exponential moving average is not used. Defaults to None. + lookahead: Whether or not to apply the lookahead optimizer. Defaults to + None. + beta_1: The exponential decay rate for the 1st moment estimates. Used in the + Adam optimizers. Defaults to None. + beta_2: The exponential decay rate for the 2nd moment estimates. Used in the + Adam optimizers. Defaults to None. + epsilon: Small value used to avoid 0 denominator. Defaults to 1e-7. + """ + name: str = None + decay: float = None + epsilon: float = None + momentum: float = None + nesterov: bool = None + moving_average_decay: Optional[float] = None + lookahead: Optional[bool] = None + beta_1: float = None + beta_2: float = None + epsilon: float = None + + +@dataclasses.dataclass +class LearningRateConfig(hyperparams.Config): + """Configuration for learning rates. + + Attributes: + name: The name of the learning rate. Defaults to None. + initial_lr: The initial learning rate. Defaults to None. + decay_epochs: The number of decay epochs. Defaults to None. + decay_rate: The rate of decay. Defaults to None. + warmup_epochs: The number of warmup epochs. Defaults to None. 
+ batch_lr_multiplier: The multiplier to apply to the base learning rate, if + necessary. Defaults to None. + examples_per_epoch: the number of examples in a single epoch. Defaults to + None. + boundaries: boundaries used in piecewise constant decay with warmup. + multipliers: multipliers used in piecewise constant decay with warmup. + scale_by_batch_size: Scale the learning rate by a fraction of the batch + size. Set to 0 for no scaling (default). + staircase: Apply exponential decay at discrete values instead of continuous. + """ + name: str = None + initial_lr: float = None + decay_epochs: float = None + decay_rate: float = None + warmup_epochs: int = None + examples_per_epoch: int = None + boundaries: List[int] = None + multipliers: List[float] = None + scale_by_batch_size: float = 0. + staircase: bool = None + + +@dataclasses.dataclass +class ModelConfig(hyperparams.Config): + """Configuration for Models. + + Attributes: + name: The name of the model. Defaults to None. + model_params: The parameters used to create the model. Defaults to None. + num_classes: The number of classes in the model. Defaults to None. + loss: A `LossConfig` instance. Defaults to None. + optimizer: An `OptimizerConfig` instance. Defaults to None. + """ + name: str = None + model_params: hyperparams.Config = None + num_classes: int = None + loss: LossConfig = None + optimizer: OptimizerConfig = None + + +@dataclasses.dataclass +class ExperimentConfig(hyperparams.Config): + """Base configuration for an image classification experiment. + + Attributes: + model_dir: The directory to use when running an experiment. + mode: e.g. 'train_and_eval', 'export' + runtime: A `RuntimeConfig` instance. + train: A `TrainConfig` instance. + evaluation: An `EvalConfig` instance. + model: A `ModelConfig` instance. + export: An `ExportConfig` instance. + """ + model_dir: str = None + model_name: str = None + mode: str = None + runtime: RuntimeConfig = None + train_dataset: Any = None + validation_dataset: Any = None + train: TrainConfig = None + evaluation: EvalConfig = None + model: ModelConfig = None + export: ExportConfig = None diff --git a/cv/classification/resnet50/tensorflow2.0/configs/configs.py b/cv/classification/resnet50/tensorflow2.0/configs/configs.py new file mode 100644 index 000000000..cafd9328f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/configs.py @@ -0,0 +1,113 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Configuration utils for image classification experiments.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import dataclasses + +import dataset_factory +from configs import base_configs +from efficientnet import efficientnet_config +from resnet import resnet_config + + +@dataclasses.dataclass +class EfficientNetImageNetConfig(base_configs.ExperimentConfig): + """Base configuration to train efficientnet-b0 on ImageNet. 
+ + Attributes: + export: An `ExportConfig` instance + runtime: A `RuntimeConfig` instance. + dataset: A `DatasetConfig` instance. + train: A `TrainConfig` instance. + evaluation: An `EvalConfig` instance. + model: A `ModelConfig` instance. + """ + export: base_configs.ExportConfig = base_configs.ExportConfig() + runtime: base_configs.RuntimeConfig = base_configs.RuntimeConfig() + train_dataset: dataset_factory.DatasetConfig = \ + dataset_factory.ImageNetConfig(split='train') + validation_dataset: dataset_factory.DatasetConfig = \ + dataset_factory.ImageNetConfig(split='validation') + train: base_configs.TrainConfig = base_configs.TrainConfig( + resume_checkpoint=True, + epochs=500, + steps=None, + callbacks=base_configs.CallbacksConfig( + enable_checkpoint_and_export=True, enable_tensorboard=True), + metrics=['accuracy', 'top_5'], + time_history=base_configs.TimeHistoryConfig(log_steps=100), + tensorboard=base_configs.TensorboardConfig( + track_lr=True, write_model_weights=False), + set_epoch_loop=False) + evaluation: base_configs.EvalConfig = base_configs.EvalConfig( + epochs_between_evals=1, steps=None) + model: base_configs.ModelConfig = \ + efficientnet_config.EfficientNetModelConfig() + + +@dataclasses.dataclass +class ResNetImagenetConfig(base_configs.ExperimentConfig): + """Base configuration to train resnet-50 on ImageNet.""" + export: base_configs.ExportConfig = base_configs.ExportConfig() + runtime: base_configs.RuntimeConfig = base_configs.RuntimeConfig() + train_dataset: dataset_factory.DatasetConfig = \ + dataset_factory.ImageNetConfig(split='train', + one_hot=False, + mean_subtract=True, + standardize=True) + validation_dataset: dataset_factory.DatasetConfig = \ + dataset_factory.ImageNetConfig(split='validation', + one_hot=False, + mean_subtract=True, + standardize=True) + train: base_configs.TrainConfig = base_configs.TrainConfig( + resume_checkpoint=True, + epochs=90, + steps=None, + callbacks=base_configs.CallbacksConfig( + enable_checkpoint_and_export=True, enable_tensorboard=True), + metrics=['accuracy', 'top_5'], + time_history=base_configs.TimeHistoryConfig(log_steps=100), + tensorboard=base_configs.TensorboardConfig( + track_lr=True, write_model_weights=False), + set_epoch_loop=False) + evaluation: base_configs.EvalConfig = base_configs.EvalConfig( + epochs_between_evals=1, steps=None) + model: base_configs.ModelConfig = resnet_config.ResNetModelConfig() + + +def get_config(model: str, dataset: str) -> base_configs.ExperimentConfig: + """Given model and dataset names, return the ExperimentConfig.""" + dataset_model_config_map = { + 'imagenet': { + 'efficientnet': EfficientNetImageNetConfig(), + 'resnet': ResNetImagenetConfig(), + } + } + try: + return dataset_model_config_map[dataset][model] + except KeyError: + if dataset not in dataset_model_config_map: + raise KeyError('Invalid dataset received. Received: {}. Supported ' + 'datasets include: {}'.format( + dataset, ', '.join(dataset_model_config_map.keys()))) + raise KeyError('Invalid model received. Received: {}. 
Supported models for' + '{} include: {}'.format( + model, dataset, + ', '.join(dataset_model_config_map[dataset].keys()))) diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml new file mode 100644 index 000000000..6f40ffb1e --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml @@ -0,0 +1,52 @@ +# Training configuration for EfficientNet-b0 trained on ImageNet on GPUs. +# Takes ~32 minutes per epoch for 8 V100s. +# Reaches ~76.1% within 350 epochs. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. +runtime: + distribution_strategy: 'mirrored' + num_gpus: 1 +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'train' + num_classes: 1000 + num_examples: 1281167 + batch_size: 32 + use_per_replica_batch_size: True + dtype: 'float32' + augmenter: + name: 'autoaugment' +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'validation' + num_classes: 1000 + num_examples: 50000 + batch_size: 32 + use_per_replica_batch_size: True + dtype: 'float32' +model: + model_params: + model_name: 'efficientnet-b0' + overrides: + num_classes: 1000 + batch_norm: 'default' + dtype: 'float32' + activation: 'swish' + optimizer: + name: 'rmsprop' + momentum: 0.9 + decay: 0.9 + moving_average_decay: 0.0 + lookahead: false + learning_rate: + name: 'exponential' + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 500 +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml new file mode 100644 index 000000000..c5be7e9ba --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml @@ -0,0 +1,52 @@ +# Training configuration for EfficientNet-b0 trained on ImageNet on TPUs. +# Takes ~2 minutes, 50 seconds per epoch for v3-32. +# Reaches ~76.1% within 350 epochs. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
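+# Illustration of the scaling above (not a config setting): with batch_size: 128
+# per replica on a v3-32 (32 replicas), the effective global batch size would be
+# 128 * 32 = 4096, assuming use_per_replica_batch_size: True multiplies the
+# per-replica value by the replica count.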
+runtime: + distribution_strategy: 'tpu' +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'train' + num_classes: 1000 + num_examples: 1281167 + batch_size: 128 + use_per_replica_batch_size: True + dtype: 'bfloat16' + augmenter: + name: 'autoaugment' +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'validation' + num_classes: 1000 + num_examples: 50000 + batch_size: 128 + use_per_replica_batch_size: True + dtype: 'bfloat16' +model: + model_params: + model_name: 'efficientnet-b0' + overrides: + num_classes: 1000 + batch_norm: 'tpu' + dtype: 'bfloat16' + activation: 'swish' + optimizer: + name: 'rmsprop' + momentum: 0.9 + decay: 0.9 + moving_average_decay: 0.0 + lookahead: false + learning_rate: + name: 'exponential' + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 500 + set_epoch_loop: True +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-gpu.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-gpu.yaml new file mode 100644 index 000000000..2f3dce01a --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-gpu.yaml @@ -0,0 +1,47 @@ +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. +runtime: + distribution_strategy: 'mirrored' + num_gpus: 1 +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'train' + num_classes: 1000 + num_examples: 1281167 + batch_size: 32 + use_per_replica_batch_size: True + dtype: 'float32' +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'validation' + num_classes: 1000 + num_examples: 50000 + batch_size: 32 + use_per_replica_batch_size: True + dtype: 'float32' +model: + model_params: + model_name: 'efficientnet-b1' + overrides: + num_classes: 1000 + batch_norm: 'default' + dtype: 'float32' + activation: 'swish' + optimizer: + name: 'rmsprop' + momentum: 0.9 + decay: 0.9 + moving_average_decay: 0.0 + lookahead: false + learning_rate: + name: 'exponential' + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 500 +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-tpu.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-tpu.yaml new file mode 100644 index 000000000..0bb6a9fe6 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/efficientnet/imagenet/efficientnet-b1-tpu.yaml @@ -0,0 +1,51 @@ +# Training configuration for EfficientNet-b1 trained on ImageNet on TPUs. +# Takes ~3 minutes, 15 seconds per epoch for v3-32. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
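+# Note: set_epoch_loop: True below maps to Keras steps_per_execution equal to the
+# per-epoch train steps (see TrainConfig and classifier_trainer), reducing
+# per-step callback overhead on TPU.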
+runtime: + distribution_strategy: 'tpu' +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'train' + num_classes: 1000 + num_examples: 1281167 + batch_size: 128 + use_per_replica_batch_size: True + dtype: 'bfloat16' + augmenter: + name: 'autoaugment' +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' + split: 'validation' + num_classes: 1000 + num_examples: 50000 + batch_size: 128 + use_per_replica_batch_size: True + dtype: 'bfloat16' +model: + model_params: + model_name: 'efficientnet-b1' + overrides: + num_classes: 1000 + batch_norm: 'tpu' + dtype: 'bfloat16' + activation: 'swish' + optimizer: + name: 'rmsprop' + momentum: 0.9 + decay: 0.9 + moving_average_decay: 0.0 + lookahead: false + learning_rate: + name: 'exponential' + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 500 + set_epoch_loop: True +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu.yaml new file mode 100644 index 000000000..2037d6b5d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu.yaml @@ -0,0 +1,49 @@ +# Training configuration for ResNet trained on ImageNet on GPUs. +# Reaches > 76.1% within 90 epochs. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. +runtime: + distribution_strategy: 'mirrored' + num_gpus: 1 + batchnorm_spatial_persistent: True +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'tfds' + split: 'train' + image_size: 224 + num_classes: 1000 + num_examples: 1281167 + batch_size: 256 + use_per_replica_batch_size: True + dtype: 'float16' + mean_subtract: True + standardize: True +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'tfds' + split: 'validation' + image_size: 224 + num_classes: 1000 + num_examples: 50000 + batch_size: 256 + use_per_replica_batch_size: True + dtype: 'float16' + mean_subtract: True + standardize: True +model: + name: 'resnet' + model_params: + rescale_inputs: False + optimizer: + name: 'momentum' + momentum: 0.9 + decay: 0.9 + epsilon: 0.001 + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 90 +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_mirrored.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_mirrored.yaml new file mode 100644 index 000000000..365b22672 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_mirrored.yaml @@ -0,0 +1,51 @@ +# Training configuration for ResNet trained on ImageNet on GPUs. +# Reaches > 76.1% within 90 epochs. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
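+# The inline '#...' values below (e.g. num_classes: 10 #1000, batch_size: 32 #256)
+# appear to preserve the full-ImageNet settings; the active values describe a
+# much smaller 10-class run.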
+runtime: + distribution_strategy: 'mirrored' + num_gpus: 1 + batchnorm_spatial_persistent: True +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' #'tfds' + split: 'train' + image_size: 224 + num_classes: 10 #1000 + num_examples: 9469 #1281167 + batch_size: 32 #256 + use_per_replica_batch_size: True + dtype: 'float32' #'float16' + mean_subtract: True + standardize: True +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' #'tfds' + split: 'validation' + image_size: 224 + num_classes: 10 #1000 + num_examples: 3925 #50000 + batch_size: 32 #256 + use_per_replica_batch_size: True + dtype: 'float32' #'float16' + mean_subtract: True + standardize: True +model: + name: 'resnet' + model_params: + rescale_inputs: False + optimizer: + name: 'momentum' + momentum: 0.9 + decay: 0.9 + epsilon: 0.001 + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 10 #90 + callbacks: + enable_checkpoint_and_export: False #True +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_multi_worker_mirrored.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_multi_worker_mirrored.yaml new file mode 100644 index 000000000..2e53c2941 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/gpu_multi_worker_mirrored.yaml @@ -0,0 +1,53 @@ +# Training configuration for ResNet trained on ImageNet on GPUs. +# Reaches > 76.1% within 90 epochs. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. +runtime: + distribution_strategy: 'multi_worker_mirrored' + worker_hosts: 'localhost:20002,localhost:20003' + all_reduce_alg: 'nccl' + num_gpus: 1 + batchnorm_spatial_persistent: True +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' #'tfds' + split: 'train' + image_size: 224 + num_classes: 10 #1000 + num_examples: 9469 #1281167 + batch_size: 32 #256 + use_per_replica_batch_size: True + dtype: 'float32' #'float16' + mean_subtract: True + standardize: True +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'records' #'tfds' + split: 'validation' + image_size: 224 + num_classes: 10 #1000 + num_examples: 3925 #50000 + batch_size: 32 #256 + use_per_replica_batch_size: True + dtype: 'float32' #'float16' + mean_subtract: True + standardize: True +model: + name: 'resnet' + model_params: + rescale_inputs: False + optimizer: + name: 'momentum' + momentum: 0.9 + decay: 0.9 + epsilon: 0.001 + loss: + label_smoothing: 0.1 +train: + resume_checkpoint: True + epochs: 10 #90 + callbacks: + enable_checkpoint_and_export: False #True +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/tpu.yaml b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/tpu.yaml new file mode 100644 index 000000000..0a3030333 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/configs/examples/resnet/imagenet/tpu.yaml @@ -0,0 +1,55 @@ +# Training configuration for ResNet trained on ImageNet on TPUs. +# Takes ~4 minutes, 30 seconds seconds per epoch for a v3-32. +# Reaches > 76.1% within 90 epochs. +# Note: This configuration uses a scaled per-replica batch size based on the number of devices. 
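+# Unlike the GPU configs above, this config sets mean_subtract/standardize to
+# False and rescale_inputs to True, i.e. input normalization is expected to be
+# handled inside the model rather than in the dataset pipeline (inferred from
+# the values below).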
+runtime: + distribution_strategy: 'tpu' +train_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'tfds' + split: 'train' + one_hot: False + image_size: 224 + num_classes: 1000 + num_examples: 1281167 + batch_size: 128 + use_per_replica_batch_size: True + mean_subtract: False + standardize: False + dtype: 'bfloat16' +validation_dataset: + name: 'imagenet2012' + data_dir: null + builder: 'tfds' + split: 'validation' + one_hot: False + image_size: 224 + num_classes: 1000 + num_examples: 50000 + batch_size: 128 + use_per_replica_batch_size: True + mean_subtract: False + standardize: False + dtype: 'bfloat16' +model: + name: 'resnet' + model_params: + rescale_inputs: True + optimizer: + name: 'momentum' + momentum: 0.9 + decay: 0.9 + epsilon: 0.001 + moving_average_decay: 0. + lookahead: False + loss: + label_smoothing: 0.1 +train: + callbacks: + enable_checkpoint_and_export: True + resume_checkpoint: True + epochs: 90 + set_epoch_loop: True +evaluation: + epochs_between_evals: 1 diff --git a/cv/classification/resnet50/tensorflow2.0/core/__init__.py b/cv/classification/resnet50/tensorflow2.0/core/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/core/actions.py b/cv/classification/resnet50/tensorflow2.0/core/actions.py new file mode 100644 index 000000000..c19f3a170 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/actions.py @@ -0,0 +1,94 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides TFM orbit actions and associated helper functions/classes.""" + +import os +from typing import List + +import gin +import orbit +import tensorflow as tf + +from . import base_trainer +from . import config_definitions +from modeling import optimization + + +class EMACheckpointing: + """Eval action to save checkpoint with average weights when EMA is used. + + This action swaps the weights of the model with the average weights, then it + saves the checkpoint under export_dir/ema_checkpoints. Checkpointing is + expensive for large models, so doing this action in eval is more efficient + than training. 
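+ Example (an illustrative sketch, not part of the upstream API docs; `export_dir`, `ema_optimizer`, `ckpt` and `eval_logs` are placeholder names, and `ema_optimizer` must be an `optimization.ExponentialMovingAverage` instance): + + action = EMACheckpointing(export_dir, ema_optimizer, ckpt) + action(eval_logs)  # Invoked by Orbit as an eval action with the eval output.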
+ """ + + def __init__(self, export_dir: str, optimizer: tf.keras.optimizers.Optimizer, + checkpoint: tf.train.Checkpoint, max_to_keep: int = 1): + """Initializes the instance. + + Args: + export_dir: `str` for the export directory of the EMA average weights. + optimizer: `tf.keras.optimizers.Optimizer` optimizer instance used for + training. This will be used to swap the model weights with the average + weigths. + checkpoint: `tf.train.Checkpoint` instance. + max_to_keep: `int` for max checkpoints to keep in ema_checkpoints subdir. + """ + if not isinstance(optimizer, optimization.ExponentialMovingAverage): + raise ValueError('Optimizer has to be instance of' + 'optimization.ExponentialMovingAverage for' + 'EMACheckpointing action') + + export_dir = os.path.join(export_dir, 'ema_checkpoints') + tf.io.gfile.makedirs( + os.path.dirname(export_dir)) + self._optimizer = optimizer + self._checkpoint = checkpoint + self._checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=export_dir, + max_to_keep=max_to_keep, + checkpoint_name='average_weights') + + def __call__(self, output: orbit.runner.Output): + """Swaps model weights, and saves the checkpoint. + + Args: + output: The train or eval output to test. + """ + self._optimizer.swap_weights() + self._checkpoint_manager.save(checkpoint_number=self._optimizer.iterations) + self._optimizer.swap_weights() + + +@gin.configurable +def get_eval_actions( + params: config_definitions.ExperimentConfig, + trainer: base_trainer.Trainer, + model_dir: str) -> List[orbit.Action]: + """Gets eval actions for TFM trainer.""" + eval_actions = [] + # Adds ema checkpointing action to save the average weights under + # ema_checkpoints subdir. + if isinstance(trainer.optimizer, optimization.ExponentialMovingAverage): + eval_actions.append( + EMACheckpointing( + export_dir=model_dir, + optimizer=trainer.optimizer, + checkpoint=trainer.checkpoint, + max_to_keep=params.trainer.max_to_keep)) + + return eval_actions diff --git a/cv/classification/resnet50/tensorflow2.0/core/actions_test.py b/cv/classification/resnet50/tensorflow2.0/core/actions_test.py new file mode 100644 index 000000000..5b05e99c6 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/actions_test.py @@ -0,0 +1,81 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for TFM actions.""" + +import os + +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.core import actions +from official.modeling import optimization + + +class TestModel(tf.Module): + + def __init__(self): + self.value = tf.Variable(0) + + @tf.function(input_signature=[]) + def __call__(self): + return self.value + + +def all_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ],) + + +class ActionsTest(tf.test.TestCase, parameterized.TestCase): + + @combinations.generate(all_strategy_combinations()) + def test_ema_checkpointing(self, distribution): + with distribution.scope(): + directory = self.create_tempdir() + model = TestModel() + optimizer = tf.keras.optimizers.SGD() + optimizer = optimization.ExponentialMovingAverage( + optimizer, trainable_weights_only=False) + + # Creats average weights for the model variables. Average weights are + # initialized to zero. + optimizer.shadow_copy(model) + checkpoint = tf.train.Checkpoint(model=model) + + # Changes model.value to 3, average value is still 0. + model.value.assign(3) + + # Checks model.value is 3 + self.assertEqual(model(), 3) + ema_action = actions.EMACheckpointing(directory, optimizer, checkpoint) + + ema_action({}) + self.assertNotEmpty( + tf.io.gfile.glob(os.path.join(directory, 'ema_checkpoints'))) + + checkpoint.read(tf.train.latest_checkpoint( + os.path.join(directory, 'ema_checkpoints'))) + + # Checks model.value is 0 after swapping. + self.assertEqual(model(), 0) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/core/base_task.py b/cv/classification/resnet50/tensorflow2.0/core/base_task.py new file mode 100644 index 000000000..c2b455d1f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/base_task.py @@ -0,0 +1,334 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines the base task abstraction.""" +import abc +from typing import Optional + +from absl import logging +import tensorflow as tf + +from . import config_definitions +from modeling import optimization +from modeling import performance + +OptimizationConfig = optimization.OptimizationConfig +RuntimeConfig = config_definitions.RuntimeConfig + + +class Task(tf.Module, metaclass=abc.ABCMeta): + """A single-replica view of training procedure. + + Tasks provide artifacts for training/validation procedures, including + loading/iterating over Datasets, training/validation steps, calculating the + loss and customized metrics with reduction. + """ + + # Special keys in train/validate step returned logs. + loss = "loss" + + def __init__(self, + params, + logging_dir: Optional[str] = None, + name: Optional[str] = None): + """Task initialization. 
+ + Args: + params: the task configuration instance, which can be any of dataclass, + ConfigDict, namedtuple, etc. + logging_dir: a string pointing to where the model, summaries etc. will be + saved. You can also write additional stuff in this directory. + name: the task name. + """ + super().__init__(name=name) + self._task_config = params + self._logging_dir = logging_dir + + @property + def task_config(self): + return self._task_config + + @property + def logging_dir(self) -> str: + return self._logging_dir + + @classmethod + def create_optimizer(cls, optimizer_config: OptimizationConfig, + runtime_config: Optional[RuntimeConfig] = None): + """Creates an TF optimizer from configurations. + + Args: + optimizer_config: the parameters of the Optimization settings. + runtime_config: the parameters of the runtime. + + Returns: + A tf.optimizers.Optimizer object. + """ + opt_factory = optimization.OptimizerFactory(optimizer_config) + optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate()) + # Configuring optimizer when loss_scale is set in runtime config. This helps + # avoiding overflow/underflow for float16 computations. + if runtime_config and runtime_config.loss_scale: + optimizer = performance.configure_optimizer( + optimizer, + use_float16=runtime_config.mixed_precision_dtype == "float16", + loss_scale=runtime_config.loss_scale) + + return optimizer + + def initialize(self, model: tf.keras.Model): + """[Optional] A callback function used as CheckpointManager's init_fn. + + This function will be called when no checkpoint is found for the model. + If there is a checkpoint, the checkpoint will be loaded and this function + will not be called. You can use this callback function to load a pretrained + checkpoint, saved under a directory other than the model_dir. + + Args: + model: The keras.Model built or used by this task. + """ + ckpt_dir_or_file = self.task_config.init_checkpoint + logging.info("Trying to load pretrained checkpoint from %s", + ckpt_dir_or_file) + if tf.io.gfile.isdir(ckpt_dir_or_file): + ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) + if not ckpt_dir_or_file: + return + + if hasattr(model, "checkpoint_items"): + checkpoint_items = model.checkpoint_items + else: + checkpoint_items = dict(model=model) + ckpt = tf.train.Checkpoint(**checkpoint_items) + status = ckpt.read(ckpt_dir_or_file) + status.expect_partial().assert_existing_objects_matched() + logging.info("Finished loading pretrained checkpoint from %s", + ckpt_dir_or_file) + + def build_model(self) -> tf.keras.Model: + """[Optional] Creates model architecture. + + Returns: + A model instance. + """ + + @abc.abstractmethod + def build_inputs(self, + params, + input_context: Optional[tf.distribute.InputContext] = None): + """Returns a dataset or a nested structure of dataset functions. + + Dataset functions define per-host datasets with the per-replica batch size. + With distributed training, this method runs on remote hosts. + + Args: + params: hyperparams to create input pipelines, which can be any of + dataclass, ConfigDict, namedtuple, etc. + input_context: optional distribution input pipeline context. + + Returns: + A nested structure of per-replica input functions. + """ + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + """Standard interface to compute losses. + + Args: + labels: optional label tensors. + model_outputs: a nested structure of output tensors. + aux_losses: auxiliary loss tensors, i.e. `losses` in keras.Model. 
+ + Returns: + The total loss tensor. + """ + del model_outputs, labels + + if aux_losses is None: + losses = [tf.constant(0.0, dtype=tf.float32)] + else: + losses = aux_losses + total_loss = tf.add_n(losses) + return total_loss + + def build_metrics(self, training: bool = True): + """Gets streaming metrics for training/validation.""" + del training + return [] + + def process_metrics(self, metrics, labels, model_outputs): + """Process and update metrics. + + Called when using custom training loop API. + + Args: + metrics: a nested structure of metrics objects. The return of function + self.build_metrics. + labels: a tensor or a nested structure of tensors. + model_outputs: a tensor or a nested structure of tensors. For example, + output of the keras model built by self.build_model. + """ + for metric in metrics: + metric.update_state(labels, model_outputs) + + def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): + """Process and update compiled_metrics. + + call when using compile/fit API. + + Args: + compiled_metrics: the compiled metrics (model.compiled_metrics). + labels: a tensor or a nested structure of tensors. + model_outputs: a tensor or a nested structure of tensors. For example, + output of the keras model built by self.build_model. + """ + compiled_metrics.update_state(labels, model_outputs) + + def train_step(self, + inputs, + model: tf.keras.Model, + optimizer: tf.keras.optimizers.Optimizer, + metrics=None): + """Does forward and backward. + + With distribution strategies, this method runs on devices. + + Args: + inputs: a dictionary of input tensors. + model: the model, forward pass definition. + optimizer: the optimizer for this training step. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + if isinstance(inputs, tuple) and len(inputs) == 2: + features, labels = inputs + else: + features, labels = inputs, inputs + with tf.GradientTape() as tape: + outputs = model(features, training=True) + # Computes per-replica loss. + if model.compiled_loss: + loss = model.compiled_loss( + labels, outputs, regularization_losses=model.losses) + loss += self.build_losses( + labels=labels, model_outputs=outputs, aux_losses=None) + else: + loss = self.build_losses( + labels=labels, model_outputs=outputs, aux_losses=model.losses) + # Scales loss as the default gradients allreduce performs sum inside the + # optimizer. + scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync + + # For mixed precision, when a LossScaleOptimizer is used, the loss is + # scaled to avoid numeric underflow. + if isinstance(optimizer, + tf.keras.mixed_precision.LossScaleOptimizer): + scaled_loss = optimizer.get_scaled_loss(scaled_loss) + + tvars = model.trainable_variables + grads = tape.gradient(scaled_loss, tvars) + + if isinstance(optimizer, + tf.keras.mixed_precision.LossScaleOptimizer): + grads = optimizer.get_unscaled_gradients(grads) + optimizer.apply_gradients(list(zip(grads, tvars))) + logs = {self.loss: loss} + if metrics: + self.process_metrics(metrics, labels, outputs) + if model.compiled_metrics: + self.process_compiled_metrics(model.compiled_metrics, labels, outputs) + logs.update({m.name: m.result() for m in metrics or []}) + logs.update({m.name: m.result() for m in model.metrics}) + return logs + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + """Validation step. + + With distribution strategies, this method runs on devices. + + Args: + inputs: a dictionary of input tensors. 
+ model: the keras.Model. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + if isinstance(inputs, tuple) and len(inputs) == 2: + features, labels = inputs + else: + features, labels = inputs, inputs + outputs = self.inference_step(features, model) + loss = self.build_losses( + labels=labels, model_outputs=outputs, aux_losses=model.losses) + logs = {self.loss: loss} + if metrics: + self.process_metrics(metrics, labels, outputs) + if model.compiled_metrics: + self.process_compiled_metrics(model.compiled_metrics, labels, outputs) + logs.update({m.name: m.result() for m in metrics or []}) + logs.update({m.name: m.result() for m in model.metrics}) + return logs + + def inference_step(self, inputs, model: tf.keras.Model): + """Performs the forward step. + + With distribution strategies, this method runs on devices. + + Args: + inputs: a dictionary of input tensors. + model: the keras.Model. + + Returns: + Model outputs. + """ + return model(inputs, training=False) + + def aggregate_logs(self, state, step_logs): + """Optional aggregation over logs returned from a validation step. + + Given step_logs from a validation step, this function aggregates the logs + after each eval_step() (see eval_reduce() function in + official/core/base_trainer.py). It runs on CPU and can be used to aggregate + metrics during validation, when there are too many metrics that cannot fit + into TPU memory. Note that this may increase latency due to data transfer + between TPU and CPU. Also, the step output from a validation step may be a + tuple with elements from replicas, and a concatenation of the elements is + needed in such case. + + Args: + state: The current state of training, for example, it can be a sequence of + metrics. + step_logs: Logs from a validation step. Can be a dictionary. + """ + pass + + def reduce_aggregated_logs(self, + aggregated_logs, + global_step: Optional[tf.Tensor] = None): + """Optional reduce of aggregated logs over validation steps. + + This function reduces aggregated logs at the end of validation, and can be + used to compute the final metrics. It runs on CPU and in each eval_end() in + base trainer (see eval_end() function in official/core/base_trainer.py). + + Args: + aggregated_logs: Aggregated logs over multiple validation steps. + global_step: An optional variable of global step. + + Returns: + A dictionary of reduced results. + """ + return {} diff --git a/cv/classification/resnet50/tensorflow2.0/core/base_trainer.py b/cv/classification/resnet50/tensorflow2.0/core/base_trainer.py new file mode 100644 index 000000000..5f2df3c75 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/base_trainer.py @@ -0,0 +1,481 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Standard Trainer implementation. + +The base trainer implements the Orbit `StandardTrainable` and +`StandardEvaluable` interfaces. 
Trainers inside this project should be +interchangable and independent on model architectures and tasks. +""" +import functools +from typing import Union, Optional +from absl import logging +import gin +import orbit +import tensorflow as tf + +from . import base_task +from . import config_definitions +from modeling import optimization + +ExperimentConfig = config_definitions.ExperimentConfig +TrainerConfig = config_definitions.TrainerConfig + + +class Recovery: + """Built-in model blowup recovery module. + + Checks the loss value by the given threshold. If applicable, recover the + model by reading the checkpoint on disk. + """ + + def __init__(self, + loss_upper_bound: float, + checkpoint_manager: tf.train.CheckpointManager, + recovery_begin_steps: int = 0, + recovery_max_trials: int = 3): + self.recover_counter = 0 + self.recovery_begin_steps = recovery_begin_steps + self.recovery_max_trials = recovery_max_trials + self.loss_upper_bound = loss_upper_bound + self.checkpoint_manager = checkpoint_manager + + def should_recover(self, loss_value, global_step): + if tf.math.is_nan(loss_value): + return True + if (global_step >= self.recovery_begin_steps and + loss_value > self.loss_upper_bound): + return True + return False + + def maybe_recover(self, loss_value, global_step): + """Conditionally recovers the training by triggering checkpoint restoration. + + Args: + loss_value: the loss value as a float. + global_step: the number of global training steps. + + Raises: + RuntimeError: when recovery happens more than the max number of trials, + the job should crash. + """ + if not self.should_recover(loss_value, global_step): + return + self.recover_counter += 1 + if self.recover_counter > self.recovery_max_trials: + raise RuntimeError( + "The loss value is NaN after training loop and it happens %d times." % + self.recover_counter) + # Loads the previous good checkpoint. + checkpoint_path = self.checkpoint_manager.restore_or_initialize() + logging.warning( + "Recovering the model from checkpoint: %s. The loss value becomes " + "%f at step %d.", checkpoint_path, loss_value, global_step) + + +class _AsyncTrainer(orbit.StandardTrainer, orbit.StandardEvaluator): + """Trainer class for both sync and async Strategy.""" + + def init_async(self): + """Initializes the Async Trainer base class.""" + assert isinstance(self._strategy, tf.distribute.Strategy) + self._is_async = isinstance( + self._strategy, tf.distribute.experimental.ParameterServerStrategy) + self._coordinator = None + if self._is_async: + self._coordinator = ( + tf.distribute.experimental.coordinator.ClusterCoordinator( + self._strategy)) + + def join(self): + """Join all async steps. 
+ Only useful in async training.""" + if getattr(self, "_is_async", False): + self._coordinator.join() + + def create_train_loop_fn(self): + """Creates a training loop from the given step function and options.""" + train_loop_fn = super().create_train_loop_fn() + if getattr(self, "_is_async", False): + + def _async_loop_fn(iterator, num_steps): + self._coordinator.schedule(train_loop_fn, args=(iterator, num_steps)) + + return _async_loop_fn + else: + return train_loop_fn + + def create_eval_loop_fn(self, has_state: bool): + """Creates an eval loop from the given step function and options.""" + eval_loop_fn = super().create_eval_loop_fn(has_state) + + if getattr(self, "_is_async", False): + if has_state: + raise ValueError( + "Stateful eval loop is not supported in async training.") + + def _async_loop_fn(iterator, num_steps, state=None, reduce_fn=None): + assert state is None + assert reduce_fn is None + self._coordinator.schedule(eval_loop_fn, args=(iterator, num_steps)) + + return _async_loop_fn + else: + return eval_loop_fn + + def distribute_dataset(self, dataset_or_fn, *args, **kwargs): + """A utility function to help create a `tf.distribute.DistributedDataset`. + + Args: + dataset_or_fn: An instance of `tf.data.Dataset`, or a "dataset function" + returning a `tf.data.Dataset`. If it is a function, it may optionally + have an argument named `input_context` which will be passed a + `tf.distribute.InputContext` instance. + *args: Any positional arguments to pass through to `dataset_or_fn`. + **kwargs: Any keyword arguments to pass through to `dataset_or_fn`. + + Returns: + A distributed Dataset. + """ + if getattr(self, "_is_async", False): + per_worker_dataset_fn = functools.partial( + orbit.utils.make_distributed_dataset, self._strategy, dataset_or_fn, + *args, **kwargs) + per_worker_dataset_fn = tf.function(per_worker_dataset_fn) + + return self._coordinator.create_per_worker_dataset(per_worker_dataset_fn) + else: + return orbit.utils.make_distributed_dataset(self._strategy, dataset_or_fn, + *args, **kwargs) + + +def get_runtime_options(config: ExperimentConfig): + """Get tf.distribute.RunOptions from config.""" + xla_options = {} + if config.runtime.tpu_enable_xla_dynamic_padder is not None: + xla_options["enable_xla_dynamic_padder"] = ( + config.runtime.tpu_enable_xla_dynamic_padder) + return tf.distribute.RunOptions( + experimental_xla_options=tf.tpu.XLAOptions(**xla_options)) + + +@gin.configurable +class Trainer(_AsyncTrainer): + """Implements the common trainer shared for TensorFlow models.""" + + # pylint: disable=super-init-not-called + def __init__( + self, + config: ExperimentConfig, + task: base_task.Task, + model: tf.keras.Model, + optimizer: tf.optimizers.Optimizer, + train: bool = True, + evaluate: bool = True, + train_dataset: Optional[Union[tf.data.Dataset, + tf.distribute.DistributedDataset]] = None, + validation_dataset: Optional[Union[ + tf.data.Dataset, tf.distribute.DistributedDataset]] = None, + checkpoint_exporter=None): + """Initializes the common trainer for TensorFlow models. + + Args: + config: An `ExperimentConfig` instance specifying experiment config. + task: A base_task.Task instance. + model: The model instance, e.g. a tf.keras.Model instance. + optimizer: tf.optimizers.Optimizer instance. + train: bool, whether or not this trainer will be used for training. + Defaults to True. + evaluate: bool, whether or not this trainer will be used for evaluation. + Defaults to True. + train_dataset: a dataset object created for training.
With tf.distribute, + it needs to be a `DistributedDataset`. + validation_dataset: a dataset object created for evaluation. With + tf.distribute, it needs to be a `DistributedDataset`. The evaluator will + create a dataset iterator for each eval round, so the dataset does not + need to repeat. + checkpoint_exporter: an object that has the `maybe_export_checkpoint` + interface. + """ + # Gets the current distribution strategy. If not inside any strategy scope, + # it gets a single-replica no-op strategy. + self._strategy = tf.distribute.get_strategy() + self._validate_params( + config, + check_train_data=train_dataset is None, + check_validation_data=validation_dataset is None) + self._config = config + self._task = task + self._model = model + self._optimizer = optimizer + self._checkpoint_exporter = checkpoint_exporter + self._recovery = None + # Runtime options are only applied to train_step. + # We use default for eval_step. + self._runtime_options = get_runtime_options(config) + + # Creates a shadow copy of the weights to store weights moving average. + if isinstance(self._optimizer, optimization.ExponentialMovingAverage + ) and not self._optimizer.has_shadow_copy: + self._optimizer.shadow_copy(self._model) + + # global_step increases by 1 after each training iteration. + # We should have global_step.numpy() == self.optimizer.iterations.numpy() + # when there is only 1 optimizer. + self._global_step = orbit.utils.create_global_step() + if hasattr(self.model, "checkpoint_items"): + checkpoint_items = self.model.checkpoint_items + else: + checkpoint_items = {} + self._checkpoint = tf.train.Checkpoint( + global_step=self.global_step, + model=self.model, + optimizer=self.optimizer, + **checkpoint_items) + + self._train_loss = tf.keras.metrics.Mean("training_loss", dtype=tf.float32) + self._validation_loss = tf.keras.metrics.Mean( + "validation_loss", dtype=tf.float32) + model_metrics = model.metrics if hasattr(model, "metrics") else [] + self._train_metrics = self.task.build_metrics( + training=True) + model_metrics + self._validation_metrics = self.task.build_metrics( + training=False) + model_metrics + + self.init_async() + + if train: + train_dataset = train_dataset or self.distribute_dataset( + self.task.build_inputs, self.config.task.train_data) + orbit.StandardTrainer.__init__( + self, + train_dataset, + options=orbit.StandardTrainerOptions( + use_tf_while_loop=config.trainer.train_tf_while_loop, + use_tf_function=config.trainer.train_tf_function, + use_tpu_summary_optimization=config.trainer.allow_tpu_summary)) + + if evaluate: + validation_dataset = validation_dataset or self.distribute_dataset( + self.task.build_inputs, self.config.task.validation_data) + orbit.StandardEvaluator.__init__( + self, + validation_dataset, + options=orbit.StandardEvaluatorOptions( + use_tf_function=config.trainer.eval_tf_function, + use_tf_while_loop=config.trainer.eval_tf_while_loop)) + + def _validate_params(self, + config, + check_train_data=True, + check_validation_data=True): + r"""Validates if the configuration object passed to the Trainer. + + The experiment configuration should be structured as: + \trainer + \task + \train_data + \validation_data + + Args: + config: a namedtuple, dataclass, ConfigDict, etc. + check_train_data: whether to check task.train_data field. + check_validation_data: whether to check task.validation_data field. 
+ """ + if not hasattr(config, "trainer"): + raise AttributeError("The trainer requires the configuration contains an" + " attribute `trainer`.") + + if not hasattr(config, "task"): + raise AttributeError("The trainer requires the configuration contains an" + " attribute `task`.") + + if check_train_data and not hasattr(config.task, "train_data"): + raise AttributeError("The trainer requires the configuration contains an" + " attribute `task.train_data`.") + + if check_validation_data and not hasattr(config.task, "validation_data"): + raise AttributeError("The trainer requires the configuration contains an" + " attribute `task.validation_data`.") + + @property + def strategy(self): + return self._strategy + + @property + def config(self): + return self._config + + @property + def task(self): + return self._task + + @property + def model(self): + return self._model + + @property + def optimizer(self): + if hasattr(self, "_optimizer"): + return self._optimizer + else: + return None + + @property + def global_step(self): + return self._global_step + + @property + def train_loss(self): + """Accesses the training loss metric object.""" + return self._train_loss + + @property + def validation_loss(self): + """Accesses the validation loss metric object.""" + return self._validation_loss + + @property + def train_metrics(self): + """Accesses all training metric objects.""" + return self._train_metrics + + @property + def validation_metrics(self): + """Accesses all validation metric metric objects.""" + return self._validation_metrics + + def initialize(self): + """A callback function. + + This function will be called when no checkpoint found for the model. + If there is a checkpoint, the checkpoint will be loaded and this function + will not be called. Tasks may use this callback function to load a + pretrained checkpoint, saved under a directory other than the model_dir. + """ + self.task.initialize(self.model) + + @property + def checkpoint(self): + """Accesses the training checkpoint.""" + return self._checkpoint + + def add_recovery(self, params: TrainerConfig, + checkpoint_manager: tf.train.CheckpointManager): + if params.recovery_max_trials >= 0: + self._recovery = Recovery( + loss_upper_bound=params.loss_upper_bound, + recovery_begin_steps=params.recovery_begin_steps, + recovery_max_trials=params.recovery_max_trials, + checkpoint_manager=checkpoint_manager) + + def train_loop_end(self): + """See base class.""" + self.join() + # Checks if the model numeric status is stable and conducts the checkpoint + # recovery accordingly. + if self._recovery: + self._recovery.maybe_recover(self.train_loss.result().numpy(), + self.global_step.numpy()) + logs = {} + for metric in self.train_metrics + [self.train_loss]: + logs[metric.name] = metric.result() + metric.reset_states() + if callable(self.optimizer.learning_rate): + # Maybe a self-implemented optimizer does not have `optimizer.iterations`. + # So just to be safe here. 
+ if hasattr(self.optimizer, "iterations"): + logs["learning_rate"] = self.optimizer.learning_rate( + self.optimizer.iterations) + else: + logs["learning_rate"] = self.optimizer.learning_rate(self.global_step) + else: + logs["learning_rate"] = self.optimizer.learning_rate + return logs + + def train_step(self, iterator): + """See base class.""" + + def step_fn(inputs): + if self.config.runtime.enable_xla and (self.config.runtime.num_gpus > 0): + task_train_step = tf.function(self.task.train_step, jit_compile=True) + else: + task_train_step = self.task.train_step + logs = task_train_step( + inputs, + model=self.model, + optimizer=self.optimizer, + metrics=self.train_metrics) + self._train_loss.update_state(logs[self.task.loss]) + self.global_step.assign_add(1) + + self.strategy.run( + step_fn, args=(next(iterator),), options=self._runtime_options) + + def eval_begin(self): + """Sets up metrics.""" + for metric in self.validation_metrics + [self.validation_loss]: + metric.reset_states() + # Swaps weights to test on weights moving average. + if self.optimizer and isinstance(self.optimizer, + optimization.ExponentialMovingAverage): + self.optimizer.swap_weights() + + def eval_step(self, iterator): + """See base class.""" + + def step_fn(inputs): + logs = self.task.validation_step( + inputs, model=self.model, metrics=self.validation_metrics) + if self.task.loss in logs: + self._validation_loss.update_state(logs[self.task.loss]) + return logs + + distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),)) + return tf.nest.map_structure(self.strategy.experimental_local_results, + distributed_outputs) + + def eval_end(self, aggregated_logs=None): + """Processes evaluation results.""" + self.join() + logs = {} + for metric in self.validation_metrics: + logs[metric.name] = metric.result() + if self.validation_loss.count.numpy() != 0: + logs[self.validation_loss.name] = self.validation_loss.result() + else: + # `self.validation_loss` metric was not updated, because the validation + # loss was not returned from the task's `validation_step` method. + logging.info("The task did not report validation loss.") + if aggregated_logs: + metrics = self.task.reduce_aggregated_logs( + aggregated_logs, global_step=self.global_step) + logs.update(metrics) + + if self._checkpoint_exporter: + self._checkpoint_exporter.maybe_export_checkpoint( + self.checkpoint, logs, self.global_step.numpy()) + metric_name = self.config.trainer.best_checkpoint_eval_metric + logs["best_" + + metric_name] = self._checkpoint_exporter.best_ckpt_logs[metric_name] + + # Swaps back weights after testing when EMA is used. + # This happens after best checkpoint export so that average weights used for + # eval are exported instead of regular weights. + if self.optimizer and isinstance(self.optimizer, + optimization.ExponentialMovingAverage): + self.optimizer.swap_weights() + return logs + + def eval_reduce(self, state=None, step_outputs=None): + return self.task.aggregate_logs(state, step_outputs) diff --git a/cv/classification/resnet50/tensorflow2.0/core/base_trainer_test.py b/cv/classification/resnet50/tensorflow2.0/core/base_trainer_test.py new file mode 100644 index 000000000..258d46c97 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/base_trainer_test.py @@ -0,0 +1,406 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for tensorflow_models.core.trainers.trainer.""" +# pylint: disable=g-direct-tensorflow-import +import multiprocessing +import os +import sys + +from absl.testing import parameterized +import numpy as np +import orbit +import portpicker +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.core import base_trainer as trainer_lib +from official.core import config_definitions as cfg +from official.core import train_lib +from official.utils.testing import mock_task + +TPU_TEST = 'test_tpu' in sys.argv[0] +GPU_TEST = 'test_gpu' in sys.argv[0] + + +def all_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ],) + + +def create_in_process_cluster(num_workers, num_ps): + """Creates and starts local servers and returns the cluster_resolver.""" + worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] + ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] + + cluster_dict = {} + cluster_dict['worker'] = ['localhost:%s' % port for port in worker_ports] + if num_ps > 0: + cluster_dict['ps'] = ['localhost:%s' % port for port in ps_ports] + + cluster_spec = tf.train.ClusterSpec(cluster_dict) + + # Workers need some inter_ops threads to work properly. 
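+ # If the host has fewer CPU cores than workers, raise inter_op_parallelism_threads so every in-process server can still make progress.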
+ worker_config = tf.compat.v1.ConfigProto() + if multiprocessing.cpu_count() < num_workers + 1: + worker_config.inter_op_parallelism_threads = num_workers + 1 + + for i in range(num_workers): + tf.distribute.Server( + cluster_spec, + job_name='worker', + task_index=i, + config=worker_config, + protocol='grpc') + + for i in range(num_ps): + tf.distribute.Server( + cluster_spec, job_name='ps', task_index=i, protocol='grpc') + + cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver( + cluster_spec, rpc_layer='grpc') + return cluster_resolver + + +def dataset_fn(input_context=None): + del input_context + + def dummy_data(_): + return tf.zeros((1, 1), dtype=tf.float32) + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + +class MockAsyncTrainer(trainer_lib._AsyncTrainer): + """Mock AsyncTrainer to test the _AsyncTrainer class.""" + + def __init__(self): + self._strategy = tf.distribute.get_strategy() + self.init_async() + + self.global_step = tf.Variable( + 0, + dtype=tf.int64, + name='global_step', + trainable=False, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) + self.eval_global_step = tf.Variable( + 0, + dtype=tf.int64, + name='eval_global_step', + trainable=False, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) + + train_dataset = self.distribute_dataset(dataset_fn) + orbit.StandardTrainer.__init__( + self, train_dataset, options=orbit.StandardTrainerOptions()) + + validation_dataset = self.distribute_dataset(dataset_fn) + orbit.StandardEvaluator.__init__( + self, + validation_dataset, + options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True)) + + def train_loop_begin(self): + self.global_step.assign(0) + + def train_step(self, iterator): + + def replica_step(_): + self.global_step.assign_add(1) + + self._strategy.run(replica_step, args=(next(iterator),)) + + def train_loop_end(self): + self.join() + return self.global_step.numpy() + + def eval_begin(self): + self.eval_global_step.assign(0) + + def eval_step(self, iterator): + + def replica_step(_): + self.eval_global_step.assign_add(1) + + self._strategy.run(replica_step, args=(next(iterator),)) + + def eval_end(self): + self.join() + return self.eval_global_step.numpy() + + +class TrainerTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super().setUp() + self._config = cfg.ExperimentConfig( + trainer=cfg.TrainerConfig( + optimizer_config=cfg.OptimizationConfig({ + 'optimizer': { + 'type': 'sgd' + }, + 'learning_rate': { + 'type': 'constant' + } + }))) + + def create_test_trainer(self, config, model_dir=None, task=None): + task = task or mock_task.MockTask(config.task, logging_dir=model_dir) + ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir) + trainer = trainer_lib.Trainer( + config, + task, + model=task.build_model(), + optimizer=task.create_optimizer(config.trainer.optimizer_config, + config.runtime), + checkpoint_exporter=ckpt_exporter) + return trainer + + @combinations.generate(all_strategy_combinations()) + def test_trainer_train(self, distribution): + with distribution.scope(): + trainer = self.create_test_trainer(self._config) + logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', logs) + self.assertIn('learning_rate', logs) + + @combinations.generate(all_strategy_combinations()) + def test_trainer_passing_datasets(self, distribution): + with distribution.scope(): + task = 
mock_task.MockTask(self._config) + train_dataset = orbit.utils.make_distributed_dataset( + distribution, task.build_inputs, self._config.task.train_data) + validation_dataset = orbit.utils.make_distributed_dataset( + distribution, task.build_inputs, self._config.task.validation_data) + self._config.task.train_data = None + self._config.task.validation_data = None + trainer = trainer_lib.Trainer( + self._config, + task, + model=task.build_model(), + optimizer=task.create_optimizer(self._config.trainer.optimizer_config, + self._config.runtime), + train_dataset=train_dataset, + validation_dataset=validation_dataset) + logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', logs) + self.assertIn('learning_rate', logs) + logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('validation_loss', logs) + + def test_base_async_trainer(self): + if TPU_TEST or GPU_TEST: + self.skipTest('Aysnc training is not available on GPU/GPU.') + num_workers = 3 + num_ps = 2 + cluster_resolver = create_in_process_cluster(num_workers, num_ps) + distribution = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver) + with distribution.scope(): + trainer = MockAsyncTrainer() + trainer.init_async() + self.assertIsInstance( + trainer._coordinator, + tf.distribute.experimental.coordinator.ClusterCoordinator) + self.assertEqual(trainer.train(tf.constant(10)), 10) + self.assertEqual(trainer.evaluate(tf.constant(11)), 11) + + def test_async_trainer_train(self): + if TPU_TEST or GPU_TEST: + self.skipTest('Aysnc training is not available on GPU/TPU.') + num_workers = 3 + num_ps = 2 + cluster_resolver = create_in_process_cluster(num_workers, num_ps) + distribution = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver) + with distribution.scope(): + config = cfg.ExperimentConfig(**self._config.as_dict()) + config.trainer.eval_tf_while_loop = True + trainer = self.create_test_trainer(config) + logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', logs) + self.assertIn('learning_rate', logs) + + def test_async_trainer_validate(self): + if TPU_TEST or GPU_TEST: + self.skipTest('Aysnc training is not available on GPU/GPU.') + num_workers = 3 + num_ps = 2 + cluster_resolver = create_in_process_cluster(num_workers, num_ps) + distribution = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver) + with distribution.scope(): + config = cfg.ExperimentConfig(**self._config.as_dict()) + config.trainer.eval_tf_while_loop = True + trainer = self.create_test_trainer(config) + logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('acc', logs) + self.assertIn('validation_loss', logs) + + @combinations.generate(all_strategy_combinations()) + def test_trainer_validate(self, distribution): + with distribution.scope(): + trainer = self.create_test_trainer(self._config) + logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) + self.assertIn('validation_loss', logs) + + @combinations.generate(all_strategy_combinations()) + def test_trainer_validate_without_loss(self, distribution): + + class MockTaskWithoutValidationLoss(mock_task.MockTask): + + def validation_step(self, inputs, model, metrics=None): + # Disable validation loss. 
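+ # Dropping `self.loss` from the returned logs emulates a task whose validation_step reports no loss.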
+ logs = super().validation_step(inputs, model) + del logs[self.loss] + return logs + + with distribution.scope(): + task = MockTaskWithoutValidationLoss() + trainer = self.create_test_trainer(self._config, task=task) + logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) + self.assertNotIn('validation_loss', logs) + + @combinations.generate( + combinations.combine( + mixed_precision_dtype=['float32', 'bfloat16', 'float16'], + loss_scale=[None, 'dynamic', 128, 256], + )) + def test_configure_optimizer(self, mixed_precision_dtype, loss_scale): + config = cfg.ExperimentConfig( + runtime=cfg.RuntimeConfig( + mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale), + trainer=cfg.TrainerConfig( + optimizer_config=cfg.OptimizationConfig({ + 'optimizer': { + 'type': 'sgd' + }, + 'learning_rate': { + 'type': 'constant' + }, + }))) + trainer = self.create_test_trainer(config) + if mixed_precision_dtype != 'float16': + self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) + elif mixed_precision_dtype == 'float16' and loss_scale is None: + self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) + else: + self.assertIsInstance(trainer.optimizer, + tf.keras.mixed_precision.LossScaleOptimizer) + + metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', metrics) + + def test_export_best_ckpt(self): + config = cfg.ExperimentConfig( + trainer=cfg.TrainerConfig( + best_checkpoint_export_subdir='best_ckpt', + best_checkpoint_eval_metric='acc', + optimizer_config=cfg.OptimizationConfig({ + 'optimizer': { + 'type': 'sgd' + }, + 'learning_rate': { + 'type': 'constant' + } + }))) + model_dir = self.get_temp_dir() + trainer = self.create_test_trainer(config, model_dir=model_dir) + trainer.train(tf.convert_to_tensor(1, dtype=tf.int32)) + trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32)) + self.assertTrue( + tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json'))) + + def test_recovery(self): + config = cfg.ExperimentConfig( + trainer=cfg.TrainerConfig( + loss_upper_bound=0.5, + recovery_max_trials=2, + optimizer_config=cfg.OptimizationConfig({ + 'optimizer': { + 'type': 'sgd' + }, + 'learning_rate': { + 'type': 'constant' + } + }))) + model_dir = self.get_temp_dir() + trainer = self.create_test_trainer(config, model_dir=model_dir) + checkpoint_manager = tf.train.CheckpointManager( + trainer.checkpoint, self.get_temp_dir(), max_to_keep=2) + checkpoint_manager.save() + trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager) + before_weights = trainer.model.get_weights() + _ = trainer.train(tf.convert_to_tensor(1, dtype=tf.int32)) + # The training loss is 1.0 and upper_bound is 0.5, so the recover happens. + after_weights = trainer.model.get_weights() + for left, right in zip(before_weights, after_weights): + self.assertAllEqual(left, right) + + # Let's the loss be NaN and max_trials = 0 to see RuntimeError. 
+ config = cfg.ExperimentConfig( + trainer=cfg.TrainerConfig( + recovery_max_trials=0, + optimizer_config=cfg.OptimizationConfig({ + 'optimizer': { + 'type': 'sgd' + }, + 'learning_rate': { + 'type': 'constant' + } + }))) + task = mock_task.MockTask(config.task, logging_dir=model_dir) + + def build_losses(labels, model_outputs, aux_losses=None): + del labels, model_outputs + return tf.constant([np.nan], tf.float32) + aux_losses + + task.build_losses = build_losses + trainer = trainer_lib.Trainer( + config, + task, + model=task.build_model(), + optimizer=task.create_optimizer(config.trainer.optimizer_config, + config.runtime)) + trainer.add_recovery(config.trainer, checkpoint_manager=checkpoint_manager) + with self.assertRaises(RuntimeError): + _ = trainer.train(tf.convert_to_tensor(2, dtype=tf.int32)) + + def test_model_with_compiled_loss(self): + task = mock_task.MockTask() + model = task.build_model() + model.compile(loss=tf.keras.losses.CategoricalCrossentropy()) + trainer = trainer_lib.Trainer( + self._config, + task, + model=model, + optimizer=task.create_optimizer(self._config.trainer.optimizer_config)) + logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', logs) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/core/config_definitions.py b/cv/classification/resnet50/tensorflow2.0/core/config_definitions.py new file mode 100644 index 000000000..434058edd --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/config_definitions.py @@ -0,0 +1,252 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common configuration settings.""" + +from typing import Optional, Sequence, Union + +import dataclasses + +from modeling.hyperparams import base_config +from modeling.optimization.configs import optimization_config + +OptimizationConfig = optimization_config.OptimizationConfig + + +@dataclasses.dataclass +class DataConfig(base_config.Config): + """The base configuration for building datasets. + + Attributes: + input_path: The path to the input. It can be either (1) a str indicating a + file path/pattern, or (2) a str indicating multiple file paths/patterns + separated by comma (e.g "a, b, c" or no spaces "a,b,c"), or (3) a list of + str, each of which is a file path/pattern or multiple file paths/patterns + separated by comma, or (4) a dictionary of the previous three approaches + for more advanced data mixing using named access. It should not be + specified when the following `tfds_name` is specified. + tfds_name: The name of the tensorflow dataset (TFDS). It should not be + specified when the above `input_path` is specified. + tfds_split: A str indicating which split of the data to load from TFDS. It + is required when above `tfds_name` is specified. + global_batch_size: The global batch size across all replicas. + is_training: Whether this data is used for training or not. 
+ drop_remainder: Whether the last batch should be dropped in the case it has + fewer than `global_batch_size` elements. + shuffle_buffer_size: The buffer size used for shuffling training data. + cache: Whether to cache dataset examples. If `True`, we will cache the + dataset after applying the decode_fn and parse_fn. It can be used to avoid + re-reading from disk, re-decoding and re-parsing the example on the second + epoch, but it requires significant memory overhead. + cycle_length: The number of files that will be processed concurrently when + interleaving files. + block_length: The number of consecutive elements to produce from each input + element before cycling to another input element when interleaving files. + deterministic: A boolean controlling whether determinism should be enforced. + sharding: Whether sharding is used in the input pipeline. + enable_tf_data_service: A boolean indicating whether to enable tf.data + service for the input pipeline. + tf_data_service_address: The URI of a tf.data service to offload + preprocessing onto during training. The URI should be in the format + "protocol://address", e.g. "grpc://tf-data-service:5050". It can be + overridden by `FLAGS.tf_data_service` flag in the binary. + tf_data_service_job_name: The name of the tf.data service job. This argument + makes it possible for multiple datasets to share the same job. The default + behavior is that the dataset creates anonymous, exclusively owned jobs. + tfds_data_dir: A str specifying the directory to read/write TFDS data. + tfds_as_supervised: A bool. When loading dataset from TFDS, if True, the + returned tf.data.Dataset will have a 2-tuple structure (input, label) + according to builder.info.supervised_keys; if False, the default, the + returned tf.data.Dataset will have a dictionary with all the features. + tfds_skip_decoding_feature: A str to indicate which features are skipped for + decoding when loading dataset from TFDS. Use comma to separate multiple + features. The main use case is to skip the image/video decoding for better + performance. + seed: An optional seed to use for deterministic shuffling/preprocessing. + """ + input_path: Union[Sequence[str], str, base_config.Config] = "" + tfds_name: str = "" + tfds_split: str = "" + global_batch_size: int = 0 + is_training: bool = None + drop_remainder: bool = True + shuffle_buffer_size: int = 100 + cache: bool = False + cycle_length: Optional[int] = None + block_length: int = 1 + deterministic: Optional[bool] = None + sharding: bool = True + enable_tf_data_service: bool = False + tf_data_service_address: Optional[str] = None + tf_data_service_job_name: Optional[str] = None + tfds_data_dir: str = "" + tfds_as_supervised: bool = False + tfds_skip_decoding_feature: str = "" + seed: Optional[int] = None + + +@dataclasses.dataclass +class RuntimeConfig(base_config.Config): + """High-level configurations for Runtime. + + These include parameters that are not directly related to the experiment, + e.g. directories, accelerator type, etc. + + Attributes: + distribution_strategy: e.g. 'mirrored', 'tpu', etc. + enable_xla: Whether or not to enable XLA. + per_gpu_thread_count: thread count per GPU. + gpu_thread_mode: Whether and how the GPU device uses its own threadpool. + dataset_num_private_threads: Number of threads for a private threadpool + created for all datasets computation. + tpu: The address of the TPU to use, if any. + num_gpus: The number of GPUs to use, if any. 
+ worker_hosts: comma-separated list of worker ip:port pairs for running + multi-worker models with DistributionStrategy. + task_index: If multi-worker training, the task index of this worker. + all_reduce_alg: Defines the algorithm for performing all-reduce. + num_packs: Sets `num_packs` in the cross device ops used in + MirroredStrategy. For details, see tf.distribute.NcclAllReduce. + mixed_precision_dtype: dtype of mixed precision policy. It can be 'float32', + 'float16', or 'bfloat16'. + loss_scale: The type of loss scale, or 'float' value. This is used when + setting the mixed precision policy. + run_eagerly: Whether or not to run the experiment eagerly. + batchnorm_spatial_persistent: Whether or not to enable the spatial + persistent mode for CuDNN batch norm kernel for improved GPU performance. + """ + distribution_strategy: str = "mirrored" + enable_xla: bool = False + gpu_thread_mode: Optional[str] = None + dataset_num_private_threads: Optional[int] = None + per_gpu_thread_count: int = 0 + tpu: Optional[str] = None + num_gpus: int = 0 + worker_hosts: Optional[str] = None + task_index: int = -1 + all_reduce_alg: Optional[str] = None + num_packs: int = 1 + mixed_precision_dtype: Optional[str] = None + loss_scale: Optional[Union[str, float]] = None + run_eagerly: bool = False + batchnorm_spatial_persistent: bool = False + + # XLA runtime params. + # XLA params are only applied to the train_step. + # These augments can improve training speed. They can also improve eval, but + # may reduce usability and users would need to make changes to code. + + # Whether to enable XLA dynamic padder + # infrastructure to handle dynamic shapes inputs inside XLA. True by + # default. Disabling this may cause correctness issues with dynamic shapes + # inputs, as XLA will just assume the inputs are with padded shapes. However + # users can optionally set it to False to improve device time if masking is + # already handled in the user side. + # If None, will respect XLA default. + tpu_enable_xla_dynamic_padder: Optional[bool] = None + + # Global model parallelism configurations. + num_cores_per_replica: int = 1 + default_shard_dim: int = -1 + + def model_parallelism(self): + return dict( + num_cores_per_replica=self.num_cores_per_replica, + default_shard_dim=self.default_shard_dim) + + +@dataclasses.dataclass +class TrainerConfig(base_config.Config): + """Configuration for trainer. + + Attributes: + optimizer_config: optimizer config, it includes optimizer, learning rate, + and warmup schedule configs. + train_tf_while_loop: whether or not to use tf while loop. + train_tf_function: whether or not to use tf_function for training loop. + eval_tf_function: whether or not to use tf_function for eval. + allow_tpu_summary: Whether to allow summary happen inside the XLA program + runs on TPU through automatic outside compilation. + steps_per_loop: number of steps per loop. + summary_interval: number of steps between each summary. + checkpoint_interval: number of steps between checkpoints. + max_to_keep: max checkpoints to keep. + continuous_eval_timeout: maximum number of seconds to wait between + checkpoints, if set to None, continuous eval will wait indefinitely. This + is only used continuous_train_and_eval and continuous_eval modes. Default + value is 1 hrs. + train_steps: number of train steps. + validation_steps: number of eval steps. If `None`, the entire eval dataset + is used. + validation_interval: number of training steps to run between evaluations. 
+ best_checkpoint_export_subdir: if set, the trainer will keep track of the + best evaluation metric, and export the corresponding best checkpoint under + `model_dir/best_checkpoint_export_subdir`. Note that this only works if + mode contains eval (such as `train_and_eval`, `continuous_eval`, and + `continuous_train_and_eval`). + best_checkpoint_eval_metric: for exporting the best checkpoint, which + evaluation metric the trainer should monitor. This can be any evaluation + metric appears on tensorboard. + best_checkpoint_metric_comp: for exporting the best checkpoint, how the + trainer should compare the evaluation metrics. This can be either `higher` + (higher the better) or `lower` (lower the better). + validation_summary_subdir: A 'str', sub directory for saving eval summary. + """ + optimizer_config: OptimizationConfig = OptimizationConfig() + # Orbit settings. + train_tf_while_loop: bool = True + train_tf_function: bool = True + eval_tf_function: bool = True + eval_tf_while_loop: bool = False + allow_tpu_summary: bool = False + # Trainer intervals. + steps_per_loop: int = 1000 + summary_interval: int = 1000 + checkpoint_interval: int = 1000 + # Checkpoint manager. + max_to_keep: int = 5 + continuous_eval_timeout: int = 60 * 60 + # Train/Eval routines. + train_steps: int = 0 + # Sets validation steps to be -1 to evaluate the entire dataset. + validation_steps: int = -1 + validation_interval: int = 1000 + # Best checkpoint export. + best_checkpoint_export_subdir: str = "" + best_checkpoint_eval_metric: str = "" + best_checkpoint_metric_comp: str = "higher" + # Blowup recovery. + loss_upper_bound: float = 1e6 + recovery_begin_steps: int = 0 # Enforcing the loss bound after these steps. + # When max trials < 0, no recovery module; max trials = 0, we will check + # the condition and fail the job if the condition happens; max trials > 0, + # we will retore the model states. + recovery_max_trials: int = 0 + validation_summary_subdir: str = "validation" + + +@dataclasses.dataclass +class TaskConfig(base_config.Config): + init_checkpoint: str = "" + model: base_config.Config = None + train_data: DataConfig = DataConfig() + validation_data: DataConfig = DataConfig() + + +@dataclasses.dataclass +class ExperimentConfig(base_config.Config): + """Top-level configuration.""" + task: TaskConfig = TaskConfig() + trainer: TrainerConfig = TrainerConfig() + runtime: RuntimeConfig = RuntimeConfig() diff --git a/cv/classification/resnet50/tensorflow2.0/core/exp_factory.py b/cv/classification/resnet50/tensorflow2.0/core/exp_factory.py new file mode 100644 index 000000000..0cf90f84c --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/exp_factory.py @@ -0,0 +1,36 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
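The factory module that follows only registers and looks up experiment-config constructors by name. As a hedged illustration of how it is meant to be used (the experiment name resnet_imagenet_example and the field values are assumptions for this sketch, not part of this patch):

    from core import config_definitions as cfg
    from core import exp_factory


    @exp_factory.register_config_factory('resnet_imagenet_example')
    def resnet_imagenet_example() -> cfg.ExperimentConfig:
      # A real factory would also populate the task/model fields; defaults are
      # used here to keep the illustration short.
      return cfg.ExperimentConfig(
          trainer=cfg.TrainerConfig(train_steps=1000, validation_interval=500))


    # Typically invoked from the training driver via the --experiment flag:
    params = exp_factory.get_exp_config('resnet_imagenet_example')

Registering the factory under a plain string keeps the lookup decoupled from the module that defines the config.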
+ +"""Experiment factory methods.""" + +from core import config_definitions as cfg +from core import registry + + +_REGISTERED_CONFIGS = {} + + +def register_config_factory(name): + """Register ExperimentConfig factory method.""" + return registry.register(_REGISTERED_CONFIGS, name) + + +def get_exp_config_creater(exp_name: str): + """Looks up ExperimentConfig factory methods.""" + exp_creater = registry.lookup(_REGISTERED_CONFIGS, exp_name) + return exp_creater + + +def get_exp_config(exp_name: str) -> cfg.ExperimentConfig: + return get_exp_config_creater(exp_name)() diff --git a/cv/classification/resnet50/tensorflow2.0/core/export_base.py b/cv/classification/resnet50/tensorflow2.0/core/export_base.py new file mode 100644 index 000000000..0937db0c6 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/export_base.py @@ -0,0 +1,109 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base class for model export.""" + +import abc +import functools +from typing import Any, Callable, Dict, Mapping, List, Optional, Text, Union + +import tensorflow as tf +from tensorflow.python.saved_model.model_utils import export_utils + + +class ExportModule(tf.Module, metaclass=abc.ABCMeta): + """Base Export Module.""" + + def __init__(self, + params, + model: Union[tf.Module, tf.keras.Model], + inference_step: Optional[Callable[..., Any]] = None): + """Instantiates an ExportModel. + + Args: + params: A dataclass for parameters to the module. + model: A model instance which contains weights and forward computation. + inference_step: An optional callable to define how the model is called. + """ + super().__init__(name=None) + self.model = model + self.params = params + + if inference_step is not None: + self.inference_step = functools.partial(inference_step, model=self.model) + else: + self.inference_step = functools.partial( + self.model.__call__, training=False) + + @abc.abstractmethod + def serve(self) -> Mapping[Text, tf.Tensor]: + """The bare inference function which should run on all devices. + + Expecting tensors are passed in through keyword arguments. Returns a + dictionary of tensors, when the keys will be used inside the SignatureDef. + """ + + @abc.abstractmethod + def get_inference_signatures( + self, function_keys: Dict[Text, Text]) -> Mapping[Text, Any]: + """Get defined function signatures.""" + + +def export(export_module: ExportModule, + function_keys: Union[List[Text], Dict[Text, Text]], + export_savedmodel_dir: Text, + checkpoint_path: Optional[Text] = None, + timestamped: bool = True, + save_options: Optional[tf.saved_model.SaveOptions] = None) -> Text: + """Exports to SavedModel format. + + Args: + export_module: a ExportModule with the keras Model and serving tf.functions. + function_keys: a list of string keys to retrieve pre-defined serving + signatures. The signaute keys will be set with defaults. If a dictionary + is provided, the values will be used as signature keys. 
+ export_savedmodel_dir: Output saved model directory. + checkpoint_path: Object-based checkpoint path or directory. + timestamped: Whether to export the savedmodel to a timestamped directory. + save_options: `SaveOptions` for `tf.saved_model.save`. + + Returns: + The savedmodel directory path. + """ + ckpt_dir_or_file = checkpoint_path + if tf.io.gfile.isdir(ckpt_dir_or_file): + ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) + if ckpt_dir_or_file: + checkpoint = tf.train.Checkpoint(model=export_module.model) + checkpoint.read( + ckpt_dir_or_file).assert_existing_objects_matched().expect_partial() + if isinstance(function_keys, list): + if len(function_keys) == 1: + function_keys = { + function_keys[0]: tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY + } + else: + raise ValueError( + "If the function_keys is a list, it must contain a single element. %s" + % function_keys) + + signatures = export_module.get_inference_signatures(function_keys) + if timestamped: + export_dir = export_utils.get_timestamped_export_dir( + export_savedmodel_dir).decode("utf-8") + else: + export_dir = export_savedmodel_dir + tf.saved_model.save( + export_module, export_dir, signatures=signatures, options=save_options) + return export_dir diff --git a/cv/classification/resnet50/tensorflow2.0/core/export_base_test.py b/cv/classification/resnet50/tensorflow2.0/core/export_base_test.py new file mode 100644 index 000000000..3358a4d4f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/export_base_test.py @@ -0,0 +1,88 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
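Because export() above only accepts a single-element list for function_keys, custom signature names go through the dictionary form, where the values become the signature keys. A minimal sketch, assuming module is an instance of a concrete ExportModule subclass whose get_inference_signatures() honors the mapping (the names and paths below are illustrative, not part of this patch):

    import tensorflow as tf
    from core import export_base

    # `module` is a concrete ExportModule instance; the test module that follows
    # shows one way to define such a subclass.
    saved_dir = export_base.export(
        module,
        function_keys={'serve': 'image_classifier'},  # value is used as signature key
        export_savedmodel_dir='/tmp/resnet50_export',  # illustrative path
        checkpoint_path='/tmp/resnet50_ckpt-1',        # illustrative checkpoint
        timestamped=False)

    imported = tf.saved_model.load(saved_dir)
    serve_fn = imported.signatures['image_classifier']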
+ +"""Tests for official.core.export_base.""" +import os +from typing import Any, Dict, Mapping, Text + +import tensorflow as tf + +from official.core import export_base + + +class TestModule(export_base.ExportModule): + + @tf.function + def serve(self, inputs: tf.Tensor) -> Mapping[Text, tf.Tensor]: + return {'outputs': self.inference_step(inputs)} + + def get_inference_signatures( + self, function_keys: Dict[Text, Text]) -> Mapping[Text, Any]: + input_signature = tf.TensorSpec(shape=[None, None], dtype=tf.float32) + return {'foo': self.serve.get_concrete_function(input_signature)} + + +class ExportBaseTest(tf.test.TestCase): + + def test_export_module(self): + tmp_dir = self.get_temp_dir() + model = tf.keras.layers.Dense(2) + inputs = tf.ones([2, 4], tf.float32) + expected_output = model(inputs, training=False) + module = TestModule(params=None, model=model) + ckpt_path = tf.train.Checkpoint(model=model).save( + os.path.join(tmp_dir, 'ckpt')) + export_dir = export_base.export( + module, ['foo'], + export_savedmodel_dir=tmp_dir, + checkpoint_path=ckpt_path, + timestamped=True) + self.assertTrue(os.path.exists(os.path.join(export_dir, 'saved_model.pb'))) + self.assertTrue( + os.path.exists( + os.path.join(export_dir, 'variables', 'variables.index'))) + self.assertTrue( + os.path.exists( + os.path.join(export_dir, 'variables', + 'variables.data-00000-of-00001'))) + + imported = tf.saved_model.load(export_dir) + output = imported.signatures['foo'](inputs) + self.assertAllClose(output['outputs'].numpy(), expected_output.numpy()) + + def test_custom_inference_step(self): + tmp_dir = self.get_temp_dir() + model = tf.keras.layers.Dense(2) + inputs = tf.ones([2, 4], tf.float32) + + def _inference_step(inputs, model): + return tf.nn.softmax(model(inputs, training=False)) + + module = TestModule( + params=None, model=model, inference_step=_inference_step) + expected_output = _inference_step(inputs, model) + ckpt_path = tf.train.Checkpoint(model=model).save( + os.path.join(tmp_dir, 'ckpt')) + export_dir = export_base.export( + module, ['foo'], + export_savedmodel_dir=tmp_dir, + checkpoint_path=ckpt_path, + timestamped=False) + imported = tf.saved_model.load(export_dir) + output = imported.signatures['foo'](inputs) + self.assertAllClose(output['outputs'].numpy(), expected_output.numpy()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/core/input_reader.py b/cv/classification/resnet50/tensorflow2.0/core/input_reader.py new file mode 100644 index 000000000..5769bcb42 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/input_reader.py @@ -0,0 +1,423 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
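The InputReader defined in this file is driven entirely by a DataConfig plus a few callables. A minimal sketch of wiring the two together for a TFRecord classification dataset; the file pattern, feature keys, image size and batch size are assumptions for illustration only:

    import tensorflow as tf
    from core import config_definitions as cfg
    from core import input_reader


    def decode_fn(serialized_example):
      # Decode a serialized tf.Example into raw tensors; the feature keys are
      # assumptions about the TFRecord layout, not part of this patch.
      return tf.io.parse_single_example(
          serialized_example, {
              'image/encoded': tf.io.FixedLenFeature([], tf.string),
              'image/class/label': tf.io.FixedLenFeature([], tf.int64),
          })


    def parser_fn(decoded):
      # Turn the decoded tensors into (image, label) pairs for the model.
      image = tf.image.resize(
          tf.io.decode_jpeg(decoded['image/encoded'], channels=3), [224, 224])
      label = tf.cast(decoded['image/class/label'], tf.int32)
      return image, label


    params = cfg.DataConfig(
        input_path='/data/imagenet/train-*',
        global_batch_size=256,
        is_training=True,
        shuffle_buffer_size=10000)

    dataset = input_reader.InputReader(
        params,
        dataset_fn=tf.data.TFRecordDataset,
        decoder_fn=decode_fn,
        parser_fn=parser_fn).read()  # pass a tf.distribute.InputContext when sharding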
+ +"""A common dataset reader.""" +import random +from typing import Any, Callable, List, Optional, Union, Dict, Sequence + +from absl import logging +import tensorflow as tf +import tensorflow_datasets as tfds + +from core import config_definitions as cfg + + +def _get_random_integer(): + return random.randint(0, (1 << 31) - 1) + + +def _maybe_map_fn(dataset: tf.data.Dataset, + fn: Optional[Callable[..., Any]] = None) -> tf.data.Dataset: + """Calls dataset.map if a valid function is passed in.""" + return dataset if fn is None else dataset.map( + fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + +class InputReader: + """Input reader that returns a tf.data.Dataset instance.""" + + # A static random number which is the same across different InputReader + # instances. + static_randnum = _get_random_integer() + + def __init__(self, + params: cfg.DataConfig, + dataset_fn=tf.data.TFRecordDataset, + decoder_fn: Optional[Callable[..., Any]] = None, + combine_fn: Optional[Callable[..., Any]] = None, + sample_fn: Optional[Callable[..., Any]] = None, + parser_fn: Optional[Callable[..., Any]] = None, + transform_and_batch_fn: Optional[Callable[ + [tf.data.Dataset, Optional[tf.distribute.InputContext]], + tf.data.Dataset]] = None, + postprocess_fn: Optional[Callable[..., Any]] = None): + """Initializes an InputReader instance. + + Args: + params: A config_definitions.DataConfig object. + dataset_fn: A `tf.data.Dataset` that consumes the input files. For + example, it can be `tf.data.TFRecordDataset`. + decoder_fn: An optional `callable` that takes the serialized data string + and decodes them into the raw tensor dictionary. + combine_fn: An optional `callable` that takes a dictionarty of + `tf.data.Dataset` objects as input and outputs a combined dataset. It + will be executed after the decoder_fn and before the sample_fn. + sample_fn: An optional `callable` that takes a `tf.data.Dataset` object as + input and outputs the transformed dataset. It performs sampling on the + decoded raw tensors dict before the parser_fn. + parser_fn: An optional `callable` that takes the decoded raw tensors dict + and parse them into a dictionary of tensors that can be consumed by the + model. It will be executed after decoder_fn. + transform_and_batch_fn: An optional `callable` that takes a + `tf.data.Dataset` object and an optional `tf.distribute.InputContext` as + input, and returns a `tf.data.Dataset` object. It will be executed after + `parser_fn` to transform and batch the dataset; if None, after + `parser_fn` is executed, the dataset will be batched into per-replica + batch size. + postprocess_fn: A optional `callable` that processes batched tensors. It + will be executed after batching. + """ + if params.input_path and params.tfds_name: + raise ValueError('At most one of `input_path` and `tfds_name` can be ' + 'specified, but got %s and %s.' % + (params.input_path, params.tfds_name)) + + if isinstance(params.input_path, + cfg.base_config.Config) and combine_fn is None: + raise ValueError( + 'A `combine_fn` is required if the `input_path` is a dictionary.') + + self._tfds_builder = None + self._matched_files = None + if params.input_path: + # we want to combine / mix datasets + if isinstance(params.input_path, cfg.base_config.Config): + self._matched_files = {} + for k, v in params.input_path.as_dict().items(): + self._matched_files[k] = self._match_files(v) + # single dataset + else: + self._matched_files = self._match_files(params.input_path) + else: + # Read dataset from TFDS. 
+ if not params.tfds_split: + raise ValueError( + '`tfds_name` is %s, but `tfds_split` is not specified.' % + params.tfds_name) + self._tfds_builder = tfds.builder( + params.tfds_name, data_dir=params.tfds_data_dir) + + self._global_batch_size = params.global_batch_size + self._is_training = params.is_training + self._drop_remainder = params.drop_remainder + self._shuffle_buffer_size = params.shuffle_buffer_size + self._cache = params.cache + self._cycle_length = params.cycle_length + self._block_length = params.block_length + self._deterministic = params.deterministic + self._sharding = params.sharding + self._tfds_split = params.tfds_split + self._tfds_as_supervised = params.tfds_as_supervised + self._tfds_skip_decoding_feature = params.tfds_skip_decoding_feature + + self._dataset_fn = dataset_fn + self._decoder_fn = decoder_fn + self._combine_fn = combine_fn + self._sample_fn = sample_fn + self._parser_fn = parser_fn + self._transform_and_batch_fn = transform_and_batch_fn + self._postprocess_fn = postprocess_fn + self._seed = params.seed + + # When tf.data service is enabled, each data service worker should get + # different random seeds. Thus, we set `seed` to None. + # Sharding should also be disabled because tf data service handles how + # each worker shard data with `processing_mode` in distribute method. + if params.enable_tf_data_service: + self._seed = None + self._sharding = False + + self._enable_tf_data_service = ( + params.enable_tf_data_service and params.tf_data_service_address) + self._tf_data_service_address = params.tf_data_service_address + if self._enable_tf_data_service: + # Add a random seed as the tf.data service job name suffix, so tf.data + # service doesn't reuse the previous state if TPU worker gets preempted. + self._tf_data_service_job_name = ( + params.tf_data_service_job_name + str(self.static_randnum)) + self._enable_round_robin_tf_data_service = params.get( + 'enable_round_robin_tf_data_service', False) + + def _match_files(self, input_path: Union[Sequence[str], str]) -> List[str]: + """Matches files from an input_path.""" + matched_files = [] + # Read dataset from files. + usage = ('`input_path` should be either (1) a str indicating a file ' + 'path/pattern, or (2) a str indicating multiple file ' + 'paths/patterns separated by comma (e.g "a, b, c" or no spaces ' + '"a,b,c", or (3) a list of str, each of which is a file ' + 'path/pattern or multiple file paths/patterns separated by ' + 'comma, but got: %s') + if isinstance(input_path, str): + input_path_list = [input_path] + elif isinstance(input_path, (list, tuple)): + if any(not isinstance(x, str) for x in input_path): + raise ValueError(usage % input_path) + input_path_list = input_path + else: + raise ValueError(usage % input_path) + + for input_path in input_path_list: + input_patterns = input_path.strip().split(',') + for input_pattern in input_patterns: + input_pattern = input_pattern.strip() + if not input_pattern: + continue + if '*' in input_pattern or '?' in input_pattern: + tmp_matched_files = tf.io.gfile.glob(input_pattern) + if not tmp_matched_files: + raise ValueError('%s does not match any files.' % input_pattern) + matched_files.extend(tmp_matched_files) + else: + matched_files.append(input_pattern) + + if not matched_files: + raise ValueError('%s does not match any files.' 
% input_path) + + return matched_files + + def _shard_files_then_read( + self, + matched_files: List[str], + dataset_fn, + input_context: Optional[tf.distribute.InputContext] = None + ) -> tf.data.Dataset: + """Shards the data files and then sent a split to every worker to read.""" + dataset = tf.data.Dataset.from_tensor_slices(matched_files) + + # Shuffle and repeat at file level. + # If cache is enabled, `reshuffle_each_iteration` is set to False, + # because we will read the same cached data in every iteration anyway. + if self._is_training: + # We need a seed to shuffle the files so that when each TPU workers gets + # its own shard the files do not overlap. + if self._sharding and self._seed is None: + seed = _get_random_integer() + else: + seed = self._seed + dataset = dataset.shuffle( + len(matched_files), + seed=seed, + reshuffle_each_iteration=True if not self._cache else False) + + # Do not enable sharding if tf.data service is enabled, as sharding will be + # handled inside tf.data service. + if self._sharding and input_context and (input_context.num_input_pipelines > + 1): + dataset = dataset.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + + # If cache is enabled, we will call `repeat()` later after `cache()`. + if self._is_training and not self._cache: + dataset = dataset.repeat() + + dataset = dataset.interleave( + map_func=dataset_fn, + cycle_length=self._cycle_length, + block_length=self._block_length, + num_parallel_calls=(self._cycle_length if self._cycle_length else + tf.data.experimental.AUTOTUNE), + deterministic=self._deterministic) + return dataset + + def _read_files_then_shard( + self, + matched_files: List[str], + dataset_fn, + input_context: Optional[tf.distribute.InputContext] = None + ) -> tf.data.Dataset: + """Sends all data files to every worker and then shard by data.""" + dataset = dataset_fn(matched_files) + + # When `input_file` is a path to a single file or the number of files is + # less than the number of input pipelines, disable auto sharding + # so that same input file is sent to all workers. + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.OFF) + dataset = dataset.with_options(options) + # Do not enable sharding if tf.data service is enabled, as sharding will be + # handled inside tf.data service. + if self._sharding and input_context and (input_context.num_input_pipelines > + 1): + dataset = dataset.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + + # If cache is enabled, we will call `repeat()` later after `cache()`. + if self._is_training and not self._cache: + dataset = dataset.repeat() + return dataset + + def _read_tfds( + self, + input_context: Optional[tf.distribute.InputContext] = None + ) -> tf.data.Dataset: + """Reads a dataset from tfds.""" + # No op if exist. 
+ self._tfds_builder.download_and_prepare() + + read_config = tfds.ReadConfig( + interleave_cycle_length=self._cycle_length, + interleave_block_length=self._block_length, + input_context=input_context, + shuffle_seed=self._seed) + decoders = {} + if self._tfds_skip_decoding_feature: + for skip_feature in self._tfds_skip_decoding_feature.split(','): + decoders[skip_feature.strip()] = tfds.decode.SkipDecoding() + dataset = self._tfds_builder.as_dataset( + split=self._tfds_split, + shuffle_files=self._is_training, + as_supervised=self._tfds_as_supervised, + decoders=decoders, + read_config=read_config) + + # If cache is enabled, we will call `repeat()` later after `cache()`. + if self._is_training and not self._cache: + dataset = dataset.repeat() + return dataset + + @property + def tfds_info(self) -> tfds.core.DatasetInfo: + """Returns TFDS dataset info, if available.""" + if self._tfds_builder: + return self._tfds_builder.info + else: + raise ValueError('tfds_info is not available, because the dataset ' + 'is not loaded from tfds.') + + def _read_decode_and_parse_dataset( + self, + matched_files: Union[Dict[str, List[str]], List[str]], + dataset_fn, + batch_size: int, + input_context: Optional[tf.distribute.InputContext] = None, + tfds_builder: bool = False) -> tf.data.Dataset: + """Returns a tf.data.Dataset object after reading, decoding, and parsing.""" + + def _files_to_dataset(files: List[str]) -> tf.data.Dataset: + if len(files) > 1: + if input_context and (len(files) < input_context.num_input_pipelines): + logging.warn( + 'The number of files %d is less than the number of input pipelines ' + '%d. We will send all input files to every worker. ' + 'Please consider sharding your data into more files.', len(files), + input_context.num_input_pipelines) + return self._read_files_then_shard(files, dataset_fn, input_context) + else: + return self._shard_files_then_read(files, dataset_fn, input_context) + elif len(files) == 1: + return self._read_files_then_shard(files, dataset_fn, input_context) + else: + raise ValueError('It is unexpected that `tfds_builder` is None and ' + 'there is also no `files`.') + + def _shuffle_and_decode(ds): + # If cache is enabled, we will call `shuffle()` later after `cache()`. 
+ if self._is_training and not self._cache: + ds = ds.shuffle(self._shuffle_buffer_size, seed=self._seed) + # Decode + ds = _maybe_map_fn(ds, self._decoder_fn) + return ds + + if tfds_builder: + dataset = self._read_tfds(input_context) + dataset = _shuffle_and_decode(dataset) + elif isinstance(matched_files, (list, tuple)): + dataset = _files_to_dataset(matched_files) + dataset = _shuffle_and_decode(dataset) + elif isinstance(matched_files, dict): + datasets = {} + for k, fs in matched_files.items(): + datasets[k] = _files_to_dataset(fs) + datasets[k] = _shuffle_and_decode(datasets[k]) + dataset = self._combine_fn(datasets) + else: + raise ValueError('`matched_files` should be a list or dict.') + + if self._sample_fn is not None: + dataset = dataset.apply(self._sample_fn) + dataset = _maybe_map_fn(dataset, self._parser_fn) + + if self._cache: + dataset = dataset.cache() + if self._is_training: + dataset = dataset.repeat() + dataset = dataset.shuffle(self._shuffle_buffer_size, seed=self._seed) + + if self._transform_and_batch_fn is not None: + dataset = self._transform_and_batch_fn(dataset, input_context) + else: + per_replica_batch_size = input_context.get_per_replica_batch_size( + batch_size) if input_context else batch_size + dataset = dataset.batch( + per_replica_batch_size, drop_remainder=self._drop_remainder) + + return dataset + + def _maybe_apply_data_service( + self, + dataset: tf.data.Dataset, + input_context: Optional[tf.distribute.InputContext] = None + ) -> tf.data.Dataset: + """Potentially distributes a dataset.""" + if self._enable_tf_data_service and input_context: + if self._enable_round_robin_tf_data_service: + replicas_per_input_pipeline = input_context.num_replicas_in_sync // ( + input_context.num_input_pipelines) + base_consumer_index = input_context.input_pipeline_id * ( + replicas_per_input_pipeline) + num_consumers = input_context.num_input_pipelines * ( + replicas_per_input_pipeline) + range_dataset = tf.data.Dataset.range(replicas_per_input_pipeline) + dataset = range_dataset.map(lambda i: dataset.apply( # pylint: disable=g-long-lambda + tf.data.experimental.service.distribute( + processing_mode='parallel_epochs', + service=self._tf_data_service_address, + job_name=self._tf_data_service_job_name, + consumer_index=base_consumer_index + i, + num_consumers=num_consumers))) + # Use parallel interleave to read multiple batches from a tf.data + # service worker in parallel. 
+ dataset = dataset.interleave( + lambda x: x, + cycle_length=replicas_per_input_pipeline, + num_parallel_calls=replicas_per_input_pipeline, + deterministic=True) + else: + dataset = dataset.apply( + tf.data.experimental.service.distribute( + processing_mode='parallel_epochs', + service=self._tf_data_service_address, + job_name=self._tf_data_service_job_name)) + return dataset + + def read( + self, + input_context: Optional[tf.distribute.InputContext] = None + ) -> tf.data.Dataset: + """Generates a tf.data.Dataset object.""" + dataset = self._read_decode_and_parse_dataset(self._matched_files, + self._dataset_fn, + self._global_batch_size, + input_context, + self._tfds_builder) + dataset = _maybe_map_fn(dataset, self._postprocess_fn) + dataset = self._maybe_apply_data_service(dataset, input_context) + + if self._deterministic is not None: + options = tf.data.Options() + options.experimental_deterministic = self._deterministic + dataset = dataset.with_options(options) + return dataset.prefetch(tf.data.experimental.AUTOTUNE) diff --git a/cv/classification/resnet50/tensorflow2.0/core/registry.py b/cv/classification/resnet50/tensorflow2.0/core/registry.py new file mode 100644 index 000000000..0ea96b062 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/registry.py @@ -0,0 +1,97 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Registry utility.""" + + +def register(registered_collection, reg_key): + """Register decorated function or class to collection. + + Register decorated function or class into registered_collection, in a + hierarchical order. For example, when reg_key="my_model/my_exp/my_config_0" + the decorated function or class is stored under + registered_collection["my_model"]["my_exp"]["my_config_0"]. + This decorator is supposed to be used together with the lookup() function in + this file. + + Args: + registered_collection: a dictionary. The decorated function or class will be + put into this collection. + reg_key: The key for retrieving the registered function or class. If reg_key + is a string, it can be hierarchical like my_model/my_exp/my_config_0 + Returns: + A decorator function + Raises: + KeyError: when function or class to register already exists. 
+ """ + def decorator(fn_or_cls): + """Put fn_or_cls in the dictionary.""" + if isinstance(reg_key, str): + hierarchy = reg_key.split("/") + collection = registered_collection + for h_idx, entry_name in enumerate(hierarchy[:-1]): + if entry_name not in collection: + collection[entry_name] = {} + collection = collection[entry_name] + if not isinstance(collection, dict): + raise KeyError( + "Collection path {} at position {} already registered as " + "a function or class.".format(entry_name, h_idx)) + leaf_reg_key = hierarchy[-1] + else: + collection = registered_collection + leaf_reg_key = reg_key + + if leaf_reg_key in collection: + raise KeyError("Function or class {} registered multiple times.".format( + leaf_reg_key)) + + collection[leaf_reg_key] = fn_or_cls + return fn_or_cls + return decorator + + +def lookup(registered_collection, reg_key): + """Lookup and return decorated function or class in the collection. + + Lookup decorated function or class in registered_collection, in a + hierarchical order. For example, when + reg_key="my_model/my_exp/my_config_0", + this function will return + registered_collection["my_model"]["my_exp"]["my_config_0"]. + + Args: + registered_collection: a dictionary. The decorated function or class will be + retrieved from this collection. + reg_key: The key for retrieving the registered function or class. If reg_key + is a string, it can be hierarchical like my_model/my_exp/my_config_0 + Returns: + The registered function or class. + Raises: + LookupError: when reg_key cannot be found. + """ + if isinstance(reg_key, str): + hierarchy = reg_key.split("/") + collection = registered_collection + for h_idx, entry_name in enumerate(hierarchy): + if entry_name not in collection: + raise LookupError( + "collection path {} at position {} never registered.".format( + entry_name, h_idx)) + collection = collection[entry_name] + return collection + else: + if reg_key not in registered_collection: + raise LookupError("registration key {} never registered.".format(reg_key)) + return registered_collection[reg_key] diff --git a/cv/classification/resnet50/tensorflow2.0/core/registry_test.py b/cv/classification/resnet50/tensorflow2.0/core/registry_test.py new file mode 100644 index 000000000..0d0639c6b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/registry_test.py @@ -0,0 +1,88 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
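registry.register() and registry.lookup() are the building blocks behind the task and experiment factories added later in this patch. As a hedged sketch of the same pattern applied to any small plugin-style factory (the _BACKBONES collection and the names below are illustrative, not defined anywhere in this patch):

    from core import registry

    _BACKBONES = {}  # illustrative collection owned by the hypothetical factory


    def register_backbone(name):
      return registry.register(_BACKBONES, name)


    @register_backbone('resnet/resnet50')
    def build_resnet50():
      return 'resnet50-model-placeholder'  # a real builder would return a tf.keras.Model


    builder = registry.lookup(_BACKBONES, 'resnet/resnet50')
    model = builder()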
+ +"""Tests for registry.""" + +import tensorflow as tf +from official.core import registry + + +class RegistryTest(tf.test.TestCase): + + def test_register(self): + collection = {} + + @registry.register(collection, 'functions/func_0') + def func_test(): + pass + + self.assertEqual(registry.lookup(collection, 'functions/func_0'), func_test) + + @registry.register(collection, 'classes/cls_0') + class ClassRegistryKey: + pass + + self.assertEqual( + registry.lookup(collection, 'classes/cls_0'), ClassRegistryKey) + + @registry.register(collection, ClassRegistryKey) + class ClassRegistryValue: + pass + + self.assertEqual( + registry.lookup(collection, ClassRegistryKey), ClassRegistryValue) + + def test_register_hierarchy(self): + collection = {} + + @registry.register(collection, 'functions/func_0') + def func_test0(): + pass + + @registry.register(collection, 'func_1') + def func_test1(): + pass + + @registry.register(collection, func_test1) + def func_test2(): + pass + + expected_collection = { + 'functions': { + 'func_0': func_test0, + }, + 'func_1': func_test1, + func_test1: func_test2, + } + self.assertEqual(collection, expected_collection) + + def test_register_error(self): + collection = {} + + @registry.register(collection, 'functions/func_0') + def func_test0(): # pylint: disable=unused-variable + pass + + with self.assertRaises(KeyError): + + @registry.register(collection, 'functions/func_0/sub_func') + def func_test1(): # pylint: disable=unused-variable + pass + + with self.assertRaises(LookupError): + registry.lookup(collection, 'non-exist') + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/core/task_factory.py b/cv/classification/resnet50/tensorflow2.0/core/task_factory.py new file mode 100644 index 000000000..f5a8fcd7e --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/task_factory.py @@ -0,0 +1,67 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A global factory to register and access all registered tasks.""" + +from . import registry + +_REGISTERED_TASK_CLS = {} + + +# TODO(b/158741360): Add type annotations once pytype checks across modules. +def register_task_cls(task_config_cls): + """Decorates a factory of Tasks for lookup by a subclass of TaskConfig. + + This decorator supports registration of tasks as follows: + + ``` + @dataclasses.dataclass + class MyTaskConfig(TaskConfig): + # Add fields here. + pass + + @register_task_cls(MyTaskConfig) + class MyTask(Task): + # Inherits def __init__(self, task_config). + pass + + my_task_config = MyTaskConfig() + my_task = get_task(my_task_config) # Returns MyTask(my_task_config). + ``` + + Besisdes a class itself, other callables that create a Task from a TaskConfig + can be decorated by the result of this function, as long as there is at most + one registration for each config class. + + Args: + task_config_cls: a subclass of TaskConfig (*not* an instance of TaskConfig). 
+ Each task_config_cls can only be used for a single registration. + + Returns: + A callable for use as class decorator that registers the decorated class + for creation from an instance of task_config_cls. + """ + return registry.register(_REGISTERED_TASK_CLS, task_config_cls) + + +def get_task(task_config, **kwargs): + """Creates a Task (of suitable subclass type) from task_config.""" + return get_task_cls(task_config.__class__)(task_config, **kwargs) + + +# The user-visible get_task() is defined after classes have been registered. +# TODO(b/158741360): Add type annotations once pytype checks across modules. +def get_task_cls(task_config_cls): + task_cls = registry.lookup(_REGISTERED_TASK_CLS, task_config_cls) + return task_cls diff --git a/cv/classification/resnet50/tensorflow2.0/core/train_lib.py b/cv/classification/resnet50/tensorflow2.0/core/train_lib.py new file mode 100644 index 000000000..97f9a0092 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/train_lib.py @@ -0,0 +1,145 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TFM common training driver library.""" +# pytype: disable=attribute-error +import os +from typing import Any, Mapping, Optional, Tuple + +# Import libraries + +from absl import logging +import orbit +import tensorflow as tf + +from . import actions +from . import base_task +from . import base_trainer +from . import config_definitions +from . import train_utils + +maybe_create_best_ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter + + +def run_experiment( + distribution_strategy: tf.distribute.Strategy, + task: base_task.Task, + mode: str, + params: config_definitions.ExperimentConfig, + model_dir: str, + run_post_eval: bool = False, + save_summary: bool = True, + trainer: Optional[base_trainer.Trainer] = None, + controller_cls=orbit.Controller +) -> Tuple[tf.keras.Model, Mapping[str, Any]]: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + task: A Task instance. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: ExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + run_post_eval: Whether to run post eval once after training, metrics logs + are returned. + save_summary: Whether to save train and validation summary. + trainer: the base_trainer.Trainer instance. It should be created within the + strategy.scope(). + controller_cls: The controller class to manage the train and eval process. + Must be a orbit.Controller subclass. + + Returns: + A 2-tuple of (model, eval_logs). + model: `tf.keras.Model` instance. + eval_logs: returns eval metrics logs when run_post_eval is set to True, + otherwise, returns {}. 
+ """ + + with distribution_strategy.scope(): + if not trainer: + trainer = train_utils.create_trainer( + params, + task, + train='train' in mode, + evaluate=('eval' in mode) or run_post_eval, + checkpoint_exporter=maybe_create_best_ckpt_exporter( + params, model_dir)) + + if trainer.checkpoint: + if model_dir is None: + raise ValueError('model_dir must be specified, but got None') + checkpoint_manager = tf.train.CheckpointManager( + trainer.checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=trainer.global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=trainer.initialize) + # Adds recovery handling. + trainer.add_recovery(params.trainer, checkpoint_manager=checkpoint_manager) + else: + checkpoint_manager = None + + controller = controller_cls( + strategy=distribution_strategy, + trainer=trainer if 'train' in mode else None, + evaluator=trainer, + global_step=trainer.global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train') if (save_summary) else None, + eval_summary_dir=os.path.join(model_dir, + params.trainer.validation_summary_subdir) if + (save_summary) else None, + summary_interval=params.trainer.summary_interval if + (save_summary) else None, + eval_actions=actions.get_eval_actions(params, trainer, model_dir)) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if trainer.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + num_params = train_utils.try_count_params(trainer.model) + if num_params is not None: + logging.info('Number of trainable params in model: %f Millions.', + num_params / 10.**6) + + if run_post_eval: + with distribution_strategy.scope(): + return trainer.model, trainer.evaluate( + tf.convert_to_tensor(params.trainer.validation_steps)) + else: + return trainer.model, {} diff --git a/cv/classification/resnet50/tensorflow2.0/core/train_lib_test.py b/cv/classification/resnet50/tensorflow2.0/core/train_lib_test.py new file mode 100644 index 000000000..cd3e05965 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/train_lib_test.py @@ -0,0 +1,137 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
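For orientation before the unit tests, a hedged sketch of how run_experiment() is typically driven from a training binary. The experiment name carries over from the earlier illustrative registration, and model_dir is an assumed path; neither is defined by this patch:

    import tensorflow as tf
    from core import exp_factory
    from core import task_factory
    from core import train_lib

    params = exp_factory.get_exp_config('resnet_imagenet_example')  # illustrative name
    model_dir = '/tmp/resnet50_run'                                 # illustrative path
    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
      # Requires the task config class to have been registered via
      # task_factory.register_task_cls.
      task = task_factory.get_task(params.task, logging_dir=model_dir)

    model, eval_logs = train_lib.run_experiment(
        distribution_strategy=strategy,
        task=task,
        mode='train_and_eval',
        params=params,
        model_dir=model_dir,
        run_post_eval=True)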
+ +"""Tests for train_ctl_lib.""" +import json +import os + +from absl import flags +from absl.testing import flagsaver +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from common import flags as tfm_flags +# pylint: disable=unused-import +from official.common import registry_imports +# pylint: enable=unused-import +from official.core import task_factory +from official.core import train_lib +from official.core import train_utils + +FLAGS = flags.FLAGS + +tfm_flags.define_flags() + + +class TrainTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super(TrainTest, self).setUp() + self._test_config = { + 'trainer': { + 'checkpoint_interval': 10, + 'steps_per_loop': 10, + 'summary_interval': 10, + 'train_steps': 10, + 'validation_steps': 5, + 'validation_interval': 10, + 'continuous_eval_timeout': 1, + 'validation_summary_subdir': 'validation', + 'optimizer_config': { + 'optimizer': { + 'type': 'sgd', + }, + 'learning_rate': { + 'type': 'constant' + } + } + }, + } + + @combinations.generate( + combinations.combine( + distribution_strategy=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + flag_mode=['train', 'eval', 'train_and_eval'], + run_post_eval=[True, False])) + def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval): + model_dir = self.get_temp_dir() + flags_dict = dict( + experiment='mock', + mode=flag_mode, + model_dir=model_dir, + params_override=json.dumps(self._test_config)) + with flagsaver.flagsaver(**flags_dict): + params = train_utils.parse_configuration(flags.FLAGS) + train_utils.serialize_config(params, model_dir) + with distribution_strategy.scope(): + task = task_factory.get_task(params.task, logging_dir=model_dir) + + _, logs = train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode=flag_mode, + params=params, + model_dir=model_dir, + run_post_eval=run_post_eval) + + if 'eval' in flag_mode: + self.assertTrue( + tf.io.gfile.exists( + os.path.join(model_dir, + params.trainer.validation_summary_subdir))) + if run_post_eval: + self.assertNotEmpty(logs) + else: + self.assertEmpty(logs) + self.assertNotEmpty( + tf.io.gfile.glob(os.path.join(model_dir, 'params.yaml'))) + if flag_mode == 'eval': + return + self.assertNotEmpty( + tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint'))) + # Tests continuous evaluation. 
+ _, logs = train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode='continuous_eval', + params=params, + model_dir=model_dir, + run_post_eval=run_post_eval) + print(logs) + + def test_parse_configuration(self): + model_dir = self.get_temp_dir() + flags_dict = dict( + experiment='mock', + mode='train', + model_dir=model_dir, + params_override=json.dumps(self._test_config)) + with flagsaver.flagsaver(**flags_dict): + params = train_utils.parse_configuration(flags.FLAGS, lock_return=True) + with self.assertRaises(ValueError): + params.override({'task': {'init_checkpoint': 'Foo'}}) + + params = train_utils.parse_configuration(flags.FLAGS, lock_return=False) + params.override({'task': {'init_checkpoint': 'Bar'}}) + self.assertEqual(params.task.init_checkpoint, 'Bar') + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/core/train_utils.py b/cv/classification/resnet50/tensorflow2.0/core/train_utils.py new file mode 100644 index 000000000..299a0c48b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/train_utils.py @@ -0,0 +1,395 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Training utils.""" +import copy +import json +import os +import pprint +from typing import Any, Callable, Dict, List, Optional + +from absl import logging +import dataclasses +import gin +import orbit +import tensorflow as tf + +from . import base_task +from . import base_trainer +from . import config_definitions +from . import exp_factory +from modeling import hyperparams + + +def get_leaf_nested_dict(d: Dict[str, Any], keys: List[str]) -> Dict[str, Any]: + """Get leaf from a dictionary with arbitrary depth with a list of keys. + + Args: + d: The dictionary to extract value from. + keys: The list of keys to extract values recursively. + + Returns: + The value of the leaf. + + Raises: + KeyError: If the value of keys extracted is a dictionary. + """ + leaf = d + for k in keys: + if not isinstance(leaf, dict) or k not in leaf: + raise KeyError( + 'Path not exist while traversing the dictionary: d with keys' + ': %s.' % keys) + leaf = leaf[k] + + if isinstance(leaf, dict): + raise KeyError('The value extracted with keys: %s is not a leaf of the ' + 'dictionary: %s.' % (keys, d)) + return leaf + + +def cast_leaf_nested_dict(d: Dict[str, Any], + cast_fn: Callable[[Any], Any]) -> Dict[str, Any]: + """Cast the leaves of a dictionary with arbitrary depth in place. + + Args: + d: The dictionary to extract value from. + cast_fn: The casting function. + + Returns: + A dictionray with the same structure as d. 
+ """ + for key, value in d.items(): + if isinstance(value, dict): + d[key] = cast_leaf_nested_dict(value, cast_fn) + else: + d[key] = cast_fn(value) + return d + + +def maybe_create_best_ckpt_exporter(params: config_definitions.ExperimentConfig, + data_dir: str) -> Any: + """Maybe create a BestCheckpointExporter object, according to the config.""" + export_subdir = params.trainer.best_checkpoint_export_subdir + metric_name = params.trainer.best_checkpoint_eval_metric + metric_comp = params.trainer.best_checkpoint_metric_comp + if data_dir and export_subdir and metric_name: + best_ckpt_dir = os.path.join(data_dir, export_subdir) + best_ckpt_exporter = BestCheckpointExporter(best_ckpt_dir, metric_name, + metric_comp) + logging.info( + 'Created the best checkpoint exporter. ' + 'data_dir: %s, export_subdir: %s, metric_name: %s', data_dir, + export_subdir, metric_name) + else: + best_ckpt_exporter = None + + return best_ckpt_exporter + + +# TODO(b/180147589): Add tests for this module. +class BestCheckpointExporter: + """Keeps track of the best result, and saves its checkpoint. + + Orbit will support an API for checkpoint exporter. This class will be used + together with orbit once this functionality is ready. + """ + + def __init__(self, export_dir: str, metric_name: str, metric_comp: str): + """Initialization. + + Args: + export_dir: The directory that will contain exported checkpoints. + metric_name: Indicates which metric to look at, when determining which + result is better. If eval_logs being passed to maybe_export_checkpoint + is a nested dictionary, use `|` as a seperator for different layers. + metric_comp: Indicates how to compare results. Either `lower` or `higher`. + """ + self._export_dir = export_dir + self._metric_name = metric_name.split('|') + self._metric_comp = metric_comp + if self._metric_comp not in ('lower', 'higher'): + raise ValueError('best checkpoint metric comp must be one of ' + 'higher, lower. 
Got: {}'.format(self._metric_comp)) + tf.io.gfile.makedirs(os.path.dirname(self.best_ckpt_logs_path)) + self._best_ckpt_logs = self._maybe_load_best_eval_metric() + self._checkpoint_manager = None + + def _get_checkpoint_manager(self, checkpoint): + """Gets an existing checkpoint manager or creates a new one.""" + if self._checkpoint_manager is None or (self._checkpoint_manager.checkpoint + != checkpoint): + logging.info('Creates a new checkpoint manager.') + self._checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=self._export_dir, + max_to_keep=1, + checkpoint_name='best_ckpt') + + return self._checkpoint_manager + + def maybe_export_checkpoint(self, checkpoint, eval_logs, global_step): + logging.info('[BestCheckpointExporter] received eval_logs: %s, at step: %d', + eval_logs, global_step) + if self._best_ckpt_logs is None or self._new_metric_is_better( + self._best_ckpt_logs, eval_logs): + self._best_ckpt_logs = eval_logs + self._export_best_eval_metric(checkpoint, self._best_ckpt_logs, + global_step) + + def _maybe_load_best_eval_metric(self): + if not tf.io.gfile.exists(self.best_ckpt_logs_path): + return None + with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'r') as reader: + return json.loads(reader.read()) + + def _new_metric_is_better(self, old_logs, new_logs): + """Check if the metric in new_logs is better than the metric in old_logs.""" + old_value = float( + orbit.utils.get_value( + get_leaf_nested_dict(old_logs, self._metric_name))) + new_value = float( + orbit.utils.get_value( + get_leaf_nested_dict(new_logs, self._metric_name))) + + logging.info('[BestCheckpointExporter] comparing results. old: %f, new: %f', + old_value, new_value) + if self._metric_comp == 'higher': + if new_value > old_value: + logging.info('[BestCheckpointExporter] ' + 'the new number is better since it is higher.') + return True + else: # self._metric_comp == 'lower': + if new_value < old_value: + logging.info('[BestCheckpointExporter] ' + 'the new number is better since it is lower.') + return True + return False + + def _export_best_eval_metric(self, checkpoint, eval_logs, global_step): + """Export evaluation results of the best checkpoint into a json file.""" + eval_logs_ext = copy.copy(eval_logs) + eval_logs_ext['best_ckpt_global_step'] = global_step + eval_logs_ext = cast_leaf_nested_dict( + eval_logs_ext, lambda x: float(orbit.utils.get_value(x))) + # Saving json file is very fast. 
+ with tf.io.gfile.GFile(self.best_ckpt_logs_path, 'w') as writer: + writer.write(json.dumps(eval_logs_ext, indent=4) + '\n') + + self._get_checkpoint_manager(checkpoint).save() + + @property + def best_ckpt_logs(self): + return self._best_ckpt_logs + + @property + def best_ckpt_logs_path(self): + return os.path.join(self._export_dir, 'info.json') + + @property + def best_ckpt_path(self): + """Returns the best ckpt path or None if there is no ckpt yet.""" + return tf.train.latest_checkpoint(self._export_dir) + + +@gin.configurable +def create_trainer(params: config_definitions.ExperimentConfig, + task: base_task.Task, + train: bool, + evaluate: bool, + checkpoint_exporter: Optional[BestCheckpointExporter] = None, + trainer_cls=base_trainer.Trainer) -> base_trainer.Trainer: + """Create trainer.""" + logging.info('Running default trainer.') + model = task.build_model() + optimizer = task.create_optimizer(params.trainer.optimizer_config, + params.runtime) + return trainer_cls( + params, + task, + model=model, + optimizer=optimizer, + train=train, + evaluate=evaluate, + checkpoint_exporter=checkpoint_exporter) + + +@dataclasses.dataclass +class ParseConfigOptions: + """Use this dataclass instead of FLAGS to customize parse_configuration().""" + experiment: str + config_file: List[str] + tpu: str = '' + tf_data_service: str = '' + params_override: str = '' + + def __contains__(self, name): + return name in dataclasses.asdict(self) + + +def parse_configuration(flags_obj, lock_return=True, print_return=True): + """Parses ExperimentConfig from flags.""" + + if flags_obj.experiment is None: + raise ValueError('The flag --experiment must be specified.') + + # 1. Get the default config from the registered experiment. + params = exp_factory.get_exp_config(flags_obj.experiment) + + # 2. Get the first level of override from `--config_file`. + # `--config_file` is typically used as a template that specifies the common + # override for a particular experiment. + for config_file in flags_obj.config_file or []: + params = hyperparams.override_params_dict( + params, config_file, is_strict=True) + + # 3. Override the TPU address and tf.data service address. + params.override({ + 'runtime': { + 'tpu': flags_obj.tpu, + }, + }) + if ('tf_data_service' in flags_obj and flags_obj.tf_data_service and + isinstance(params.task, config_definitions.TaskConfig)): + params.override({ + 'task': { + 'train_data': { + 'tf_data_service_address': flags_obj.tf_data_service, + }, + 'validation_data': { + 'tf_data_service_address': flags_obj.tf_data_service, + } + } + }) + + # 4. Get the second level of override from `--params_override`. + # `--params_override` is typically used as a further override over the + # template. For example, one may define a particular template for training + # ResNet50 on ImageNet in a config file and pass it via `--config_file`, + # then define different learning rates and pass it via `--params_override`. 
+ if flags_obj.params_override: + params = hyperparams.override_params_dict( + params, flags_obj.params_override, is_strict=True) + + params.validate() + if lock_return: + params.lock() + + if print_return: + pp = pprint.PrettyPrinter() + logging.info('Final experiment parameters:\n%s', + pp.pformat(params.as_dict())) + + return params + + +def serialize_config(params: config_definitions.ExperimentConfig, + model_dir: str): + """Serializes and saves the experiment config.""" + if model_dir is None: + raise ValueError('model_dir must be specified, but got None') + params_save_path = os.path.join(model_dir, 'params.yaml') + logging.info('Saving experiment configuration to %s', params_save_path) + tf.io.gfile.makedirs(model_dir) + hyperparams.save_params_dict_to_yaml(params, params_save_path) + + +def save_gin_config(filename_surfix: str, model_dir: str): + """Serializes and saves the experiment config.""" + gin_save_path = os.path.join( + model_dir, 'operative_config.{}.gin'.format(filename_surfix)) + logging.info('Saving gin configurations to %s', gin_save_path) + tf.io.gfile.makedirs(model_dir) + with tf.io.gfile.GFile(gin_save_path, 'w') as f: + f.write(gin.operative_config_str()) + + +def read_global_step_from_checkpoint(ckpt_file_path): + """Read global step from checkpoint, or get global step from its filename.""" + global_step = tf.Variable(-1, dtype=tf.int64) + ckpt = tf.train.Checkpoint(global_step=global_step) + try: + ckpt.restore(ckpt_file_path).expect_partial() + global_step_maybe_restored = global_step.numpy() + except tf.errors.InvalidArgumentError: + global_step_maybe_restored = -1 + + if global_step_maybe_restored == -1: + raise ValueError('global_step not found in checkpoint {}. ' + 'If you want to run finetune eval jobs, you need to ' + 'make sure that your pretrain model writes ' + 'global_step in its checkpoints.'.format(ckpt_file_path)) + global_step_restored = global_step.numpy() + logging.info('get global_step %d from checkpoint %s', global_step_restored, + ckpt_file_path) + return global_step_restored + + +def write_json_summary(log_dir, global_step, eval_metrics): + """Dump evaluation metrics to json file.""" + serializable_dict = {} + for name, value in eval_metrics.items(): + if hasattr(value, 'numpy'): + serializable_dict[name] = str(value.numpy()) + else: + serializable_dict[name] = str(value) + output_json = os.path.join(log_dir, 'metrics-{}.json'.format(global_step)) + logging.info('Evaluation results at pretrain step %d: %s', global_step, + serializable_dict) + with tf.io.gfile.GFile(output_json, 'w') as writer: + writer.write(json.dumps(serializable_dict, indent=4) + '\n') + + +def write_summary(summary_writer, global_step, eval_metrics): + """Write evaluation metrics to TF summary.""" + numeric_dict = {} + for name, value in eval_metrics.items(): + numeric_dict[name] = float(orbit.utils.get_value(value)) + with summary_writer.as_default(): + for name, value in numeric_dict.items(): + tf.summary.scalar(name, value, step=global_step) + summary_writer.flush() + + +def remove_ckpts(model_dir): + """Remove model checkpoints, so we can restart.""" + ckpts = os.path.join(model_dir, 'ckpt-*') + logging.info('removing checkpoint files %s', ckpts) + for file_to_remove in tf.io.gfile.glob(ckpts): + tf.io.gfile.rmtree(file_to_remove) + + file_to_remove = os.path.join(model_dir, 'checkpoint') + if tf.io.gfile.exists(file_to_remove): + tf.io.gfile.remove(file_to_remove) + + +def try_count_params(model: tf.keras.Model): + """Count the number of parameters if model is 
possible. + + Args: + model: Try to count the number of params in this model. + + Returns: + The number of parameters or None. + """ + if hasattr(model, 'count_params'): + try: + return model.count_params() + except ValueError: + logging.info('Number of trainable params unknown, because the build() ' + 'methods in keras layers were not called. This is probably ' + 'because the model was not fed any input, e.g., the max ' + 'train step was already reached before this run.') + return None + return None diff --git a/cv/classification/resnet50/tensorflow2.0/core/train_utils_test.py b/cv/classification/resnet50/tensorflow2.0/core/train_utils_test.py new file mode 100644 index 000000000..134950c2b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/core/train_utils_test.py @@ -0,0 +1,56 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for official.core.train_utils.""" + +import tensorflow as tf + +from official.core import train_utils + + +class TrainUtilsTest(tf.test.TestCase): + + def test_get_leaf_nested_dict(self): + d = {'a': {'i': {'x': 5}}} + self.assertEqual(train_utils.get_leaf_nested_dict(d, ['a', 'i', 'x']), 5) + + def test_get_leaf_nested_dict_not_leaf(self): + with self.assertRaisesRegex(KeyError, 'The value extracted with keys.*'): + d = {'a': {'i': {'x': 5}}} + train_utils.get_leaf_nested_dict(d, ['a', 'i']) + + def test_get_leaf_nested_dict_path_not_exist_missing_key(self): + with self.assertRaisesRegex(KeyError, 'Path not exist while traversing .*'): + d = {'a': {'i': {'x': 5}}} + train_utils.get_leaf_nested_dict(d, ['a', 'i', 'y']) + + def test_get_leaf_nested_dict_path_not_exist_out_of_range(self): + with self.assertRaisesRegex(KeyError, 'Path not exist while traversing .*'): + d = {'a': {'i': {'x': 5}}} + train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z']) + + def test_get_leaf_nested_dict_path_not_exist_meets_leaf(self): + with self.assertRaisesRegex(KeyError, 'Path not exist while traversing .*'): + d = {'a': {'i': 5}} + train_utils.get_leaf_nested_dict(d, ['a', 'i', 'z']) + + def test_cast_leaf_nested_dict(self): + d = {'a': {'i': {'x': '123'}}, 'b': 456.5} + d = train_utils.cast_leaf_nested_dict(d, int) + self.assertEqual(d['a']['i']['x'], 123) + self.assertEqual(d['b'], 456) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/dataset_factory.py b/cv/classification/resnet50/tensorflow2.0/dataset_factory.py new file mode 100644 index 000000000..ccc61732c --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/dataset_factory.py @@ -0,0 +1,544 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Dataset utilities for vision tasks using TFDS and tf.data.Dataset.""" +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import os +from typing import Any, List, Optional, Tuple, Mapping, Union + +from absl import logging +from dataclasses import dataclass +import tensorflow as tf +import tensorflow_datasets as tfds + +from modeling.hyperparams import base_config +import augment +import preprocessing + +AUGMENTERS = { + 'autoaugment': augment.AutoAugment, + 'randaugment': augment.RandAugment, +} + + +@dataclass +class AugmentConfig(base_config.Config): + """Configuration for image augmenters. + + Attributes: + name: The name of the image augmentation to use. Possible options are None + (default), 'autoaugment', or 'randaugment'. + params: Any parameters used to initialize the augmenter. + """ + name: Optional[str] = None + params: Optional[Mapping[str, Any]] = None + + def build(self) -> augment.ImageAugment: + """Build the augmenter using this config.""" + params = self.params or {} + augmenter = AUGMENTERS.get(self.name, None) + return augmenter(**params) if augmenter is not None else None + + +@dataclass +class DatasetConfig(base_config.Config): + """The base configuration for building datasets. + + Attributes: + name: The name of the Dataset. Usually should correspond to a TFDS dataset. + data_dir: The path where the dataset files are stored, if available. + filenames: Optional list of strings representing the TFRecord names. + builder: The builder type used to load the dataset. Value should be one of + 'tfds' (load using TFDS), 'records' (load from TFRecords), or 'synthetic' + (generate dummy synthetic data without reading from files). + split: The split of the dataset. Usually 'train', 'validation', or 'test'. + image_size: The size of the image in the dataset. This assumes that `width` + == `height`. Set to 'infer' to infer the image size from TFDS info. This + requires `name` to be a registered dataset in TFDS. + num_classes: The number of classes given by the dataset. Set to 'infer' to + infer the number of classes from TFDS info. This requires `name` to be a + registered dataset in TFDS. + num_channels: The number of channels given by the dataset. Set to 'infer' to + infer the number of channels from TFDS info. This requires `name` to be a + registered dataset in TFDS. + num_examples: The number of examples given by the dataset. Set to 'infer' to + infer the number of examples from TFDS info. This requires `name` to be a + registered dataset in TFDS. + batch_size: The base batch size for the dataset. + use_per_replica_batch_size: Whether to scale the batch size based on + available resources. If set to `True`, the dataset builder will return + batch_size multiplied by `num_devices`, the number of device replicas + (e.g., the number of GPUs or TPU cores). This setting should be `True` if + the strategy argument is passed to `build()` and `num_devices > 1`. + num_devices: The number of replica devices to use.
This should be set by + `strategy.num_replicas_in_sync` when using a distribution strategy. + dtype: The desired dtype of the dataset. This will be set during + preprocessing. + one_hot: Whether to apply one hot encoding. Set to `True` to be able to use + label smoothing. + augmenter: The augmenter config to use. No augmentation is used by default. + download: Whether to download data using TFDS. + shuffle_buffer_size: The buffer size used for shuffling training data. + file_shuffle_buffer_size: The buffer size used for shuffling raw training + files. + skip_decoding: Whether to skip image decoding when loading from TFDS. + cache: Whether to cache the dataset examples. Can be used to avoid re-reading + from disk on the second epoch. Requires significant memory overhead. + tf_data_service: The URI of a tf.data service to offload preprocessing onto + during training. The URI should be in the format "protocol://address", + e.g. "grpc://tf-data-service:5050". + mean_subtract: Whether or not to apply mean subtraction to the dataset. + standardize: Whether or not to apply standardization to the dataset. + """ + name: Optional[str] = None + data_dir: Optional[str] = None + filenames: Optional[List[str]] = None + builder: str = 'tfds' + split: str = 'train' + image_size: Union[int, str] = 'infer' + num_classes: Union[int, str] = 'infer' + num_channels: Union[int, str] = 'infer' + num_examples: Union[int, str] = 'infer' + batch_size: int = 128 + use_per_replica_batch_size: bool = True + num_devices: int = 1 + dtype: str = 'float32' + one_hot: bool = True + augmenter: AugmentConfig = AugmentConfig() + download: bool = False + shuffle_buffer_size: int = 10000 + file_shuffle_buffer_size: int = 1024 + skip_decoding: bool = True + cache: bool = False + tf_data_service: Optional[str] = None + mean_subtract: bool = False + standardize: bool = False + + @property + def has_data(self): + """Whether this dataset has any data associated with it.""" + return self.name or self.data_dir or self.filenames + + +@dataclass +class ImageNetConfig(DatasetConfig): + """The base ImageNet dataset config.""" + name: str = 'imagenet2012' + # Note: for large datasets like ImageNet, using records is faster than tfds + builder: str = 'records' + image_size: int = 224 + num_channels: int = 3 + num_examples: int = 9469 #1281167 + num_classes: int = 10 #1000 + batch_size: int = 128 + + +@dataclass +class Cifar10Config(DatasetConfig): + """The base CIFAR-10 dataset config.""" + name: str = 'cifar10' + image_size: int = 224 + batch_size: int = 128 + download: bool = True + cache: bool = True + + +class DatasetBuilder: + """An object for building datasets. + + Allows building various pipelines fetching examples, preprocessing, etc. + Maintains additional state information calculated from the dataset, i.e., + training set split, batch size, and number of steps (batches).
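+
+  A minimal usage sketch (the data path is a placeholder):
+    builder = DatasetBuilder(ImageNetConfig(data_dir='/path/to/tfrecords',
+                                            split='train'))
+    dataset = builder.build()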
+ """ + + def __init__(self, config: DatasetConfig, **overrides: Any): + """Initialize the builder from the config.""" + self.config = config.replace(**overrides) + self.builder_info = None + + if self.config.augmenter is not None: + logging.info('Using augmentation: %s', self.config.augmenter.name) + self.augmenter = self.config.augmenter.build() + else: + self.augmenter = None + + @property + def is_training(self) -> bool: + """Whether this is the training set.""" + return self.config.split == 'train' + + @property + def batch_size(self) -> int: + """The batch size, multiplied by the number of replicas (if configured).""" + if self.config.use_per_replica_batch_size: + return self.config.batch_size * self.config.num_devices + else: + return self.config.batch_size + + @property + def global_batch_size(self): + """The global batch size across all replicas.""" + return self.batch_size + + @property + def local_batch_size(self): + """The base unscaled batch size.""" + if self.config.use_per_replica_batch_size: + return self.config.batch_size + else: + return self.config.batch_size // self.config.num_devices + + @property + def num_steps(self) -> int: + """The number of steps (batches) to exhaust this dataset.""" + # Always divide by the global batch size to get the correct # of steps + return self.num_examples // self.global_batch_size + + @property + def dtype(self) -> tf.dtypes.DType: + """Converts the config's dtype string to a tf dtype. + + Returns: + A mapping from string representation of a dtype to the `tf.dtypes.DType`. + + Raises: + ValueError if the config's dtype is not supported. + + """ + dtype_map = { + 'float32': tf.float32, + 'bfloat16': tf.bfloat16, + 'float16': tf.float16, + 'fp32': tf.float32, + 'bf16': tf.bfloat16, + } + try: + return dtype_map[self.config.dtype] + except: + raise ValueError('Invalid DType provided. Supported types: {}'.format( + dtype_map.keys())) + + @property + def image_size(self) -> int: + """The size of each image (can be inferred from the dataset).""" + + if self.config.image_size == 'infer': + return self.info.features['image'].shape[0] + else: + return int(self.config.image_size) + + @property + def num_channels(self) -> int: + """The number of image channels (can be inferred from the dataset).""" + if self.config.num_channels == 'infer': + return self.info.features['image'].shape[-1] + else: + return int(self.config.num_channels) + + @property + def num_examples(self) -> int: + """The number of examples (can be inferred from the dataset).""" + if self.config.num_examples == 'infer': + return self.info.splits[self.config.split].num_examples + else: + return int(self.config.num_examples) + + @property + def num_classes(self) -> int: + """The number of classes (can be inferred from the dataset).""" + if self.config.num_classes == 'infer': + return self.info.features['label'].num_classes + else: + return int(self.config.num_classes) + + @property + def info(self) -> tfds.core.DatasetInfo: + """The TFDS dataset info, if available.""" + try: + if self.builder_info is None: + self.builder_info = tfds.builder(self.config.name).info + except ConnectionError as e: + logging.error('Failed to use TFDS to load info. Please set dataset info ' + '(image_size, num_channels, num_examples, num_classes) in ' + 'the dataset config.') + raise e + return self.builder_info + + def build( + self, + strategy: Optional[tf.distribute.Strategy] = None) -> tf.data.Dataset: + """Construct a dataset end-to-end and return it using an optional strategy. 
+ + Args: + strategy: a strategy that, if passed, will distribute the dataset + according to that strategy. If passed and `num_devices > 1`, + `use_per_replica_batch_size` must be set to `True`. + + Returns: + A TensorFlow dataset outputting batched images and labels. + """ + if strategy: + if strategy.num_replicas_in_sync != self.config.num_devices: + logging.warn( + 'Passed a strategy with %d devices, but expected' + '%d devices.', strategy.num_replicas_in_sync, + self.config.num_devices) + # dataset = strategy.distribute_datasets_from_function(self._build) + dataset = self._build() + else: + dataset = self._build() + + return dataset + + def _build( + self, + input_context: Optional[tf.distribute.InputContext] = None + ) -> tf.data.Dataset: + """Construct a dataset end-to-end and return it. + + Args: + input_context: An optional context provided by `tf.distribute` for + cross-replica training. + + Returns: + A TensorFlow dataset outputting batched images and labels. + """ + builders = { + 'tfds': self.load_tfds, + 'records': self.load_records, + 'synthetic': self.load_synthetic, + } + + builder = builders.get(self.config.builder, None) + + if builder is None: + raise ValueError('Unknown builder type {}'.format(self.config.builder)) + + self.input_context = input_context + dataset = builder() + dataset = self.pipeline(dataset) + + return dataset + + def load_tfds(self) -> tf.data.Dataset: + """Return a dataset loading files from TFDS.""" + + logging.info('Using TFDS to load data.') + + builder = tfds.builder(self.config.name, data_dir=self.config.data_dir) + + if self.config.download: + builder.download_and_prepare() + + decoders = {} + + if self.config.skip_decoding: + decoders['image'] = tfds.decode.SkipDecoding() + + read_config = tfds.ReadConfig( + interleave_cycle_length=10, + interleave_block_length=1, + input_context=self.input_context) + + dataset = builder.as_dataset( + split=self.config.split, + as_supervised=True, + shuffle_files=True, + decoders=decoders, + read_config=read_config) + + return dataset + + def load_records(self) -> tf.data.Dataset: + """Return a dataset loading files with TFRecords.""" + logging.info('Using TFRecords to load data.') + if self.config.filenames is None: + if self.config.data_dir is None: + raise ValueError('Dataset must specify a path for the data files.') + + file_pattern = os.path.join(self.config.data_dir, + '{}-*'.format(self.config.split)) + dataset = tf.data.Dataset.list_files(file_pattern, shuffle=False) + files = tf.io.gfile.glob(file_pattern) + if len(files) == 1: + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = (tf.data.experimental.AutoShardPolicy.OFF) + dataset = dataset.with_options(options) + else: + dataset = tf.data.Dataset.from_tensor_slices(self.config.filenames) + + return dataset + + def load_synthetic(self) -> tf.data.Dataset: + """Return a dataset generating dummy synthetic data.""" + logging.info('Generating a synthetic dataset.') + + def generate_data(_): + image = tf.zeros([self.image_size, self.image_size, self.num_channels], + dtype=self.dtype) + label = tf.zeros([1], dtype=tf.int32) + return image, label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset: + """Build a pipeline fetching, shuffling, and preprocessing the dataset. + + Args: + dataset: A `tf.data.Dataset` that loads raw files. 
+ + Returns: + A TensorFlow dataset outputting batched images and labels. + """ + if (self.config.builder != 'tfds' and self.input_context and + self.input_context.num_input_pipelines > 1): + dataset = dataset.shard(self.input_context.num_input_pipelines, + self.input_context.input_pipeline_id) + logging.info( + 'Sharding the dataset: input_pipeline_id=%d ' + 'num_input_pipelines=%d', self.input_context.input_pipeline_id, + self.input_context.num_input_pipelines) + + if self.is_training and self.config.builder == 'records': + # Shuffle the input files. + dataset = dataset.shuffle(buffer_size=self.config.file_shuffle_buffer_size) + + if self.is_training and not self.config.cache: + dataset = dataset.repeat() + + if self.config.builder == 'records': + # Read the data from disk in parallel + dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=10, + block_length=1, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if self.config.cache: + dataset = dataset.cache() + + if self.is_training: + dataset = dataset.shuffle(self.config.shuffle_buffer_size) + dataset = dataset.repeat() + + # Parse, pre-process, and batch the data in parallel + if self.config.builder == 'records': + preprocess = self.parse_record + else: + preprocess = self.preprocess + dataset = dataset.map( + preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if self.input_context and self.config.num_devices > 1: + if not self.config.use_per_replica_batch_size: + raise ValueError( + 'The builder does not support a global batch size with more than ' + 'one replica. Got {} replicas. Please set a ' + '`per_replica_batch_size` and enable ' + '`use_per_replica_batch_size=True`.'.format( + self.config.num_devices)) + + # The batch size of the dataset will be multiplied by the number of + # replicas automatically when strategy.distribute_datasets_from_function + # is called, so we use local batch size here.
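+ # For example (illustrative numbers): with use_per_replica_batch_size=True,
+ # batch_size=128 and num_devices=8, each replica receives batches of 128 and
+ # the effective global batch size after distribution is 1024.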
+ dataset = dataset.batch( + self.local_batch_size, drop_remainder=self.is_training) + else: + dataset = dataset.batch( + self.global_batch_size, drop_remainder=self.is_training) + + # Prefetch overlaps in-feed with training + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + + if self.config.tf_data_service: + if not hasattr(tf.data.experimental, 'service'): + raise ValueError('The tf_data_service flag requires Tensorflow version ' + '>= 2.3.0, but the version is {}'.format( + tf.__version__)) + dataset = dataset.apply( + tf.data.experimental.service.distribute( + processing_mode='parallel_epochs', + service=self.config.tf_data_service, + job_name='resnet_train')) + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + return dataset + + def parse_record(self, record: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: + """Parse an ImageNet record from a serialized string Tensor.""" + keys_to_features = { + 'image/encoded': tf.io.FixedLenFeature((), tf.string, ''), + 'image/format': tf.io.FixedLenFeature((), tf.string, 'jpeg'), + 'image/class/label': tf.io.FixedLenFeature([], tf.int64, -1), + 'image/class/text': tf.io.FixedLenFeature([], tf.string, ''), + 'image/object/bbox/xmin': tf.io.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymin': tf.io.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/xmax': tf.io.VarLenFeature(dtype=tf.float32), + 'image/object/bbox/ymax': tf.io.VarLenFeature(dtype=tf.float32), + 'image/object/class/label': tf.io.VarLenFeature(dtype=tf.int64), + } + + parsed = tf.io.parse_single_example(record, keys_to_features) + + label = tf.reshape(parsed['image/class/label'], shape=[1]) + + # Subtract one so that labels are in [0, 1000) + # label -= 1 + + image_bytes = tf.reshape(parsed['image/encoded'], shape=[]) + image, label = self.preprocess(image_bytes, label) + + return image, label + + def preprocess(self, image: tf.Tensor, + label: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: + """Apply image preprocessing and augmentation to the image and label.""" + if self.is_training: + image = preprocessing.preprocess_for_train( + image, + image_size=self.image_size, + mean_subtract=self.config.mean_subtract, + standardize=self.config.standardize, + dtype=self.dtype, + augmenter=self.augmenter) + else: + image = preprocessing.preprocess_for_eval( + image, + image_size=self.image_size, + num_channels=self.num_channels, + mean_subtract=self.config.mean_subtract, + standardize=self.config.standardize, + dtype=self.dtype) + + label = tf.cast(label, tf.int32) + if self.config.one_hot: + label = tf.one_hot(label, self.num_classes) + label = tf.reshape(label, [self.num_classes]) + + return image, label + + @classmethod + def from_params(cls, *args, **kwargs): + """Construct a dataset builder from a default config and any overrides.""" + config = DatasetConfig.from_args(*args, **kwargs) + return cls(config) diff --git a/cv/classification/resnet50/tensorflow2.0/download_script.sh b/cv/classification/resnet50/tensorflow2.0/download_script.sh new file mode 100644 index 000000000..e69de29bb diff --git a/cv/classification/resnet50/tensorflow2.0/efficientnet/__init__.py b/cv/classification/resnet50/tensorflow2.0/efficientnet/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/efficientnet/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/efficientnet/common_modules.py b/cv/classification/resnet50/tensorflow2.0/efficientnet/common_modules.py new file mode 100644 index 000000000..e3657bd86 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/efficientnet/common_modules.py @@ -0,0 +1,119 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common modeling utilities.""" +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import numpy as np +import tensorflow as tf +import tensorflow.compat.v1 as tf1 +from typing import Text, Optional + +from tensorflow.python.tpu import tpu_function + + +@tf.keras.utils.register_keras_serializable(package='Vision') +class TpuBatchNormalization(tf.keras.layers.BatchNormalization): + """Cross replica batch normalization.""" + + def __init__(self, fused: Optional[bool] = False, **kwargs): + if fused in (True, None): + raise ValueError('TpuBatchNormalization does not support fused=True.') + super(TpuBatchNormalization, self).__init__(fused=fused, **kwargs) + + def _cross_replica_average(self, t: tf.Tensor, num_shards_per_group: int): + """Calculates the average value of input tensor across TPU replicas.""" + num_shards = tpu_function.get_tpu_context().number_of_shards + group_assignment = None + if num_shards_per_group > 1: + if num_shards % num_shards_per_group != 0: + raise ValueError( + 'num_shards: %d mod shards_per_group: %d, should be 0' % + (num_shards, num_shards_per_group)) + num_groups = num_shards // num_shards_per_group + group_assignment = [[ + x for x in range(num_shards) if x // num_shards_per_group == y + ] for y in range(num_groups)] + return tf1.tpu.cross_replica_sum(t, group_assignment) / tf.cast( + num_shards_per_group, t.dtype) + + def _moments(self, inputs: tf.Tensor, reduction_axes: int, keep_dims: int): + """Compute the mean and variance: it overrides the original _moments.""" + shard_mean, shard_variance = super(TpuBatchNormalization, self)._moments( + inputs, reduction_axes, keep_dims=keep_dims) + + num_shards = tpu_function.get_tpu_context().number_of_shards or 1 + if num_shards <= 8: # Skip cross_replica for 2x2 or smaller slices. 
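+ # (A 2x2 TPU slice has 8 cores, so with at most 8 shards the per-replica
+ # moments are kept instead of forming cross-replica groups.)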
+ num_shards_per_group = 1 + else: + num_shards_per_group = max(8, num_shards // 8) + if num_shards_per_group > 1: + # Compute variance using: Var[X]= E[X^2] - E[X]^2. + shard_square_of_mean = tf.math.square(shard_mean) + shard_mean_of_square = shard_variance + shard_square_of_mean + group_mean = self._cross_replica_average(shard_mean, num_shards_per_group) + group_mean_of_square = self._cross_replica_average( + shard_mean_of_square, num_shards_per_group) + group_variance = group_mean_of_square - tf.math.square(group_mean) + return (group_mean, group_variance) + else: + return (shard_mean, shard_variance) + + +def get_batch_norm(batch_norm_type: Text) -> tf.keras.layers.BatchNormalization: + """A helper to create a batch normalization getter. + + Args: + batch_norm_type: The type of batch normalization layer implementation. `tpu` + will use `TpuBatchNormalization`. + + Returns: + An instance of `tf.keras.layers.BatchNormalization`. + """ + if batch_norm_type == 'tpu': + return TpuBatchNormalization + + return tf.keras.layers.BatchNormalization + + +def count_params(model, trainable_only=True): + """Returns the count of all model parameters, or just trainable ones.""" + if not trainable_only: + return model.count_params() + else: + return int( + np.sum([ + tf.keras.backend.count_params(p) for p in model.trainable_weights + ])) + + +def load_weights(model: tf.keras.Model, + model_weights_path: Text, + weights_format: Text = 'saved_model'): + """Load model weights from the given file path. + + Args: + model: the model to load weights into + model_weights_path: the path of the model weights + weights_format: the model weights format. One of 'saved_model', 'h5', or + 'checkpoint'. + """ + if weights_format == 'saved_model': + loaded_model = tf.keras.models.load_model(model_weights_path) + model.set_weights(loaded_model.get_weights()) + else: + model.load_weights(model_weights_path) diff --git a/cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_config.py b/cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_config.py new file mode 100644 index 000000000..793279d08 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_config.py @@ -0,0 +1,78 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Configuration definitions for EfficientNet losses, learning rates, and optimizers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Any, Mapping + +import dataclasses + +from modeling.hyperparams import base_config +from configs import base_configs + + +@dataclasses.dataclass +class EfficientNetModelConfig(base_configs.ModelConfig): + """Configuration for the EfficientNet model. + + This configuration will default to settings used for training efficientnet-b0 + on a v3-8 TPU on ImageNet. + + Attributes: + name: The name of the model. Defaults to 'EfficientNet'. 
+ num_classes: The number of classes in the model. + model_params: A dictionary that represents the parameters of the + EfficientNet model. These will be passed in to the "from_name" function. + loss: The configuration for loss. Defaults to a categorical cross entropy + implementation. + optimizer: The configuration for optimizations. Defaults to an RMSProp + configuration. + learning_rate: The configuration for learning rate. Defaults to an + exponential configuration. + """ + name: str = 'EfficientNet' + num_classes: int = 1000 + model_params: base_config.Config = dataclasses.field( + default_factory=lambda: { + 'model_name': 'efficientnet-b0', + 'model_weights_path': '', + 'weights_format': 'saved_model', + 'overrides': { + 'batch_norm': 'default', + 'rescale_input': True, + 'num_classes': 1000, + 'activation': 'swish', + 'dtype': 'float32', + } + }) + loss: base_configs.LossConfig = base_configs.LossConfig( + name='categorical_crossentropy', label_smoothing=0.1) + optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig( + name='rmsprop', + decay=0.9, + epsilon=0.001, + momentum=0.9, + moving_average_decay=None) + learning_rate: base_configs.LearningRateConfig = base_configs.LearningRateConfig( # pylint: disable=line-too-long + name='exponential', + initial_lr=0.008, + decay_epochs=2.4, + decay_rate=0.97, + warmup_epochs=5, + scale_by_batch_size=1. / 128., + staircase=True) diff --git a/cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_model.py b/cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_model.py new file mode 100644 index 000000000..4f0f1c647 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/efficientnet/efficientnet_model.py @@ -0,0 +1,500 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Contains definitions for EfficientNet model. + +[1] Mingxing Tan, Quoc V. Le + EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. 
+ ICML'19, https://arxiv.org/abs/1905.11946 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math +import os +from typing import Any, Dict, Optional, Text, Tuple + +from absl import logging +from dataclasses import dataclass +import tensorflow as tf + +from modeling import tf_utils +from modeling.hyperparams import base_config +import preprocessing +from efficientnet import common_modules + + +@dataclass +class BlockConfig(base_config.Config): + """Config for a single MB Conv Block.""" + input_filters: int = 0 + output_filters: int = 0 + kernel_size: int = 3 + num_repeat: int = 1 + expand_ratio: int = 1 + strides: Tuple[int, int] = (1, 1) + se_ratio: Optional[float] = None + id_skip: bool = True + fused_conv: bool = False + conv_type: str = 'depthwise' + + +@dataclass +class ModelConfig(base_config.Config): + """Default Config for Efficientnet-B0.""" + width_coefficient: float = 1.0 + depth_coefficient: float = 1.0 + resolution: int = 224 + dropout_rate: float = 0.2 + blocks: Tuple[BlockConfig, ...] = ( + # (input_filters, output_filters, kernel_size, num_repeat, + # expand_ratio, strides, se_ratio) + # pylint: disable=bad-whitespace + BlockConfig.from_args(32, 16, 3, 1, 1, (1, 1), 0.25), + BlockConfig.from_args(16, 24, 3, 2, 6, (2, 2), 0.25), + BlockConfig.from_args(24, 40, 5, 2, 6, (2, 2), 0.25), + BlockConfig.from_args(40, 80, 3, 3, 6, (2, 2), 0.25), + BlockConfig.from_args(80, 112, 5, 3, 6, (1, 1), 0.25), + BlockConfig.from_args(112, 192, 5, 4, 6, (2, 2), 0.25), + BlockConfig.from_args(192, 320, 3, 1, 6, (1, 1), 0.25), + # pylint: enable=bad-whitespace + ) + stem_base_filters: int = 32 + top_base_filters: int = 1280 + activation: str = 'simple_swish' + batch_norm: str = 'default' + bn_momentum: float = 0.99 + bn_epsilon: float = 1e-3 + # While the original implementation used a weight decay of 1e-5, + # tf.nn.l2_loss divides it by 2, so we halve this to compensate in Keras + weight_decay: float = 5e-6 + drop_connect_rate: float = 0.2 + depth_divisor: int = 8 + min_depth: Optional[int] = None + use_se: bool = True + input_channels: int = 3 + num_classes: int = 1000 + model_name: str = 'efficientnet' + rescale_input: bool = True + data_format: str = 'channels_last' + dtype: str = 'float32' + + +MODEL_CONFIGS = { + # (width, depth, resolution, dropout) + 'efficientnet-b0': ModelConfig.from_args(1.0, 1.0, 224, 0.2), + 'efficientnet-b1': ModelConfig.from_args(1.0, 1.1, 240, 0.2), + 'efficientnet-b2': ModelConfig.from_args(1.1, 1.2, 260, 0.3), + 'efficientnet-b3': ModelConfig.from_args(1.2, 1.4, 300, 0.3), + 'efficientnet-b4': ModelConfig.from_args(1.4, 1.8, 380, 0.4), + 'efficientnet-b5': ModelConfig.from_args(1.6, 2.2, 456, 0.4), + 'efficientnet-b6': ModelConfig.from_args(1.8, 2.6, 528, 0.5), + 'efficientnet-b7': ModelConfig.from_args(2.0, 3.1, 600, 0.5), + 'efficientnet-b8': ModelConfig.from_args(2.2, 3.6, 672, 0.5), + 'efficientnet-l2': ModelConfig.from_args(4.3, 5.3, 800, 0.5), +} + +CONV_KERNEL_INITIALIZER = { + 'class_name': 'VarianceScaling', + 'config': { + 'scale': 2.0, + 'mode': 'fan_out', + # Note: this is a truncated normal distribution + 'distribution': 'normal' + } +} + +DENSE_KERNEL_INITIALIZER = { + 'class_name': 'VarianceScaling', + 'config': { + 'scale': 1 / 3.0, + 'mode': 'fan_out', + 'distribution': 'uniform' + } +} + + +def round_filters(filters: int, config: ModelConfig) -> int: + """Round number of filters based on width coefficient.""" + width_coefficient = config.width_coefficient + 
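# A worked example (illustrative): for 'efficientnet-b4' with
+ # width_coefficient=1.4 and depth_divisor=8, the 32 stem filters scale to
+ # 44.8 and round up to 48.
+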
min_depth = config.min_depth + divisor = config.depth_divisor + orig_filters = filters + + if not width_coefficient: + return filters + + filters *= width_coefficient + min_depth = min_depth or divisor + new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_filters < 0.9 * filters: + new_filters += divisor + logging.info('round_filter input=%s output=%s', orig_filters, new_filters) + return int(new_filters) + + +def round_repeats(repeats: int, depth_coefficient: float) -> int: + """Round number of repeats based on depth coefficient.""" + return int(math.ceil(depth_coefficient * repeats)) + + +def conv2d_block(inputs: tf.Tensor, + conv_filters: Optional[int], + config: ModelConfig, + kernel_size: Any = (1, 1), + strides: Any = (1, 1), + use_batch_norm: bool = True, + use_bias: bool = False, + activation: Optional[Any] = None, + depthwise: bool = False, + name: Optional[Text] = None): + """A conv2d followed by batch norm and an activation.""" + batch_norm = common_modules.get_batch_norm(config.batch_norm) + bn_momentum = config.bn_momentum + bn_epsilon = config.bn_epsilon + data_format = tf.keras.backend.image_data_format() + weight_decay = config.weight_decay + + name = name or '' + + # Collect args based on what kind of conv2d block is desired + init_kwargs = { + 'kernel_size': kernel_size, + 'strides': strides, + 'use_bias': use_bias, + 'padding': 'same', + 'name': name + '_conv2d', + 'kernel_regularizer': tf.keras.regularizers.l2(weight_decay), + 'bias_regularizer': tf.keras.regularizers.l2(weight_decay), + } + + if depthwise: + conv2d = tf.keras.layers.DepthwiseConv2D + init_kwargs.update({'depthwise_initializer': CONV_KERNEL_INITIALIZER}) + else: + conv2d = tf.keras.layers.Conv2D + init_kwargs.update({ + 'filters': conv_filters, + 'kernel_initializer': CONV_KERNEL_INITIALIZER + }) + + x = conv2d(**init_kwargs)(inputs) + + if use_batch_norm: + bn_axis = 1 if data_format == 'channels_first' else -1 + x = batch_norm( + axis=bn_axis, + momentum=bn_momentum, + epsilon=bn_epsilon, + name=name + '_bn')( + x) + + if activation is not None: + x = tf.keras.layers.Activation(activation, name=name + '_activation')(x) + return x + + +def mb_conv_block(inputs: tf.Tensor, + block: BlockConfig, + config: ModelConfig, + prefix: Optional[Text] = None): + """Mobile Inverted Residual Bottleneck. + + Args: + inputs: the Keras input to the block + block: BlockConfig, arguments to create a Block + config: ModelConfig, a set of model parameters + prefix: prefix for naming all layers + + Returns: + the output of the block + """ + use_se = config.use_se + activation = tf_utils.get_activation(config.activation) + drop_connect_rate = config.drop_connect_rate + data_format = tf.keras.backend.image_data_format() + use_depthwise = block.conv_type != 'no_depthwise' + prefix = prefix or '' + + filters = block.input_filters * block.expand_ratio + + x = inputs + + if block.fused_conv: + # If we use fused mbconv, skip expansion and use regular conv. 
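+ # (A "fused" MBConv collapses the 1x1 expansion and the k x k depthwise
+ # convolution into a single regular k x k convolution, which can be more
+ # efficient on some accelerators.)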
+ x = conv2d_block( + x, + filters, + config, + kernel_size=block.kernel_size, + strides=block.strides, + activation=activation, + name=prefix + 'fused') + else: + if block.expand_ratio != 1: + # Expansion phase + kernel_size = (1, 1) if use_depthwise else (3, 3) + x = conv2d_block( + x, + filters, + config, + kernel_size=kernel_size, + activation=activation, + name=prefix + 'expand') + + # Depthwise Convolution + if use_depthwise: + x = conv2d_block( + x, + conv_filters=None, + config=config, + kernel_size=block.kernel_size, + strides=block.strides, + activation=activation, + depthwise=True, + name=prefix + 'depthwise') + + # Squeeze and Excitation phase + if use_se: + assert block.se_ratio is not None + assert 0 < block.se_ratio <= 1 + num_reduced_filters = max(1, int(block.input_filters * block.se_ratio)) + + if data_format == 'channels_first': + se_shape = (filters, 1, 1) + else: + se_shape = (1, 1, filters) + + se = tf.keras.layers.GlobalAveragePooling2D(name=prefix + 'se_squeeze')(x) + se = tf.keras.layers.Reshape(se_shape, name=prefix + 'se_reshape')(se) + + se = conv2d_block( + se, + num_reduced_filters, + config, + use_bias=True, + use_batch_norm=False, + activation=activation, + name=prefix + 'se_reduce') + se = conv2d_block( + se, + filters, + config, + use_bias=True, + use_batch_norm=False, + activation='sigmoid', + name=prefix + 'se_expand') + x = tf.keras.layers.multiply([x, se], name=prefix + 'se_excite') + + # Output phase + x = conv2d_block( + x, block.output_filters, config, activation=None, name=prefix + 'project') + + # Add identity so that quantization-aware training can insert quantization + # ops correctly. + x = tf.keras.layers.Activation( + tf_utils.get_activation('identity'), name=prefix + 'id')( + x) + + if (block.id_skip and all(s == 1 for s in block.strides) and + block.input_filters == block.output_filters): + if drop_connect_rate and drop_connect_rate > 0: + # Apply dropconnect + # The only difference between dropout and dropconnect in TF is scaling by + # drop_connect_rate during training. See: + # https://github.com/keras-team/keras/pull/9898#issuecomment-380577612 + x = tf.keras.layers.Dropout( + drop_connect_rate, noise_shape=(None, 1, 1, 1), name=prefix + 'drop')( + x) + + x = tf.keras.layers.add([x, inputs], name=prefix + 'add') + + return x + + +def efficientnet(image_input: tf.keras.layers.Input, config: ModelConfig): + """Creates an EfficientNet graph given the model parameters. + + This function is wrapped by the `EfficientNet` class to make a tf.keras.Model. + + Args: + image_input: the input batch of images + config: the model config + + Returns: + the output of efficientnet + """ + depth_coefficient = config.depth_coefficient + blocks = config.blocks + stem_base_filters = config.stem_base_filters + top_base_filters = config.top_base_filters + activation = tf_utils.get_activation(config.activation) + dropout_rate = config.dropout_rate + drop_connect_rate = config.drop_connect_rate + num_classes = config.num_classes + input_channels = config.input_channels + rescale_input = config.rescale_input + data_format = tf.keras.backend.image_data_format() + dtype = config.dtype + weight_decay = config.weight_decay + + x = image_input + if data_format == 'channels_first': + # Happens on GPU/TPU if available. 
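+ # Permute((3, 1, 2)) moves the input from NHWC to NCHW so the rest of the
+ # network runs in channels-first layout.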
+ x = tf.keras.layers.Permute((3, 1, 2))(x) + if rescale_input: + x = preprocessing.normalize_images( + x, num_channels=input_channels, dtype=dtype, data_format=data_format) + + # Build stem + x = conv2d_block( + x, + round_filters(stem_base_filters, config), + config, + kernel_size=[3, 3], + strides=[2, 2], + activation=activation, + name='stem') + + # Build blocks + num_blocks_total = sum( + round_repeats(block.num_repeat, depth_coefficient) for block in blocks) + block_num = 0 + + for stack_idx, block in enumerate(blocks): + assert block.num_repeat > 0 + # Update block input and output filters based on depth multiplier + block = block.replace( + input_filters=round_filters(block.input_filters, config), + output_filters=round_filters(block.output_filters, config), + num_repeat=round_repeats(block.num_repeat, depth_coefficient)) + + # The first block needs to take care of stride and filter size increase + drop_rate = drop_connect_rate * float(block_num) / num_blocks_total + config = config.replace(drop_connect_rate=drop_rate) + block_prefix = 'stack_{}/block_0/'.format(stack_idx) + x = mb_conv_block(x, block, config, block_prefix) + block_num += 1 + if block.num_repeat > 1: + block = block.replace(input_filters=block.output_filters, strides=[1, 1]) + + for block_idx in range(block.num_repeat - 1): + drop_rate = drop_connect_rate * float(block_num) / num_blocks_total + config = config.replace(drop_connect_rate=drop_rate) + block_prefix = 'stack_{}/block_{}/'.format(stack_idx, block_idx + 1) + x = mb_conv_block(x, block, config, prefix=block_prefix) + block_num += 1 + + # Build top + x = conv2d_block( + x, + round_filters(top_base_filters, config), + config, + activation=activation, + name='top') + + # Build classifier + x = tf.keras.layers.GlobalAveragePooling2D(name='top_pool')(x) + if dropout_rate and dropout_rate > 0: + x = tf.keras.layers.Dropout(dropout_rate, name='top_dropout')(x) + x = tf.keras.layers.Dense( + num_classes, + kernel_initializer=DENSE_KERNEL_INITIALIZER, + kernel_regularizer=tf.keras.regularizers.l2(weight_decay), + bias_regularizer=tf.keras.regularizers.l2(weight_decay), + name='logits')( + x) + x = tf.keras.layers.Activation('softmax', name='probs')(x) + + return x + + +@tf.keras.utils.register_keras_serializable(package='Vision') +class EfficientNet(tf.keras.Model): + """Wrapper class for an EfficientNet Keras model. + + Contains helper methods to build, manage, and save metadata about the model. + """ + + def __init__(self, + config: Optional[ModelConfig] = None, + overrides: Optional[Dict[Text, Any]] = None): + """Create an EfficientNet model. 
+ + Args: + config: (optional) the main model parameters to create the model + overrides: (optional) a dict containing keys that can override config + """ + overrides = overrides or {} + config = config or ModelConfig() + + self.config = config.replace(**overrides) + + input_channels = self.config.input_channels + model_name = self.config.model_name + input_shape = (None, None, input_channels) # Should handle any size image + image_input = tf.keras.layers.Input(shape=input_shape) + + output = efficientnet(image_input, self.config) + + # Cast to float32 in case we have a different model dtype + output = tf.cast(output, tf.float32) + + logging.info('Building model %s with params %s', model_name, self.config) + + super(EfficientNet, self).__init__( + inputs=image_input, outputs=output, name=model_name) + + @classmethod + def from_name(cls, + model_name: Text, + model_weights_path: Optional[Text] = None, + weights_format: Text = 'saved_model', + overrides: Optional[Dict[Text, Any]] = None): + """Construct an EfficientNet model from a predefined model name. + + E.g., `EfficientNet.from_name('efficientnet-b0')`. + + Args: + model_name: the predefined model name + model_weights_path: the path to the weights (h5 file or saved model dir) + weights_format: the model weights format. One of 'saved_model', 'h5', or + 'checkpoint'. + overrides: (optional) a dict containing keys that can override config + + Returns: + A constructed EfficientNet instance. + """ + model_configs = dict(MODEL_CONFIGS) + overrides = dict(overrides) if overrides else {} + + # One can define their own custom models if necessary + model_configs.update(overrides.pop('model_config', {})) + + if model_name not in model_configs: + raise ValueError('Unknown model name {}'.format(model_name)) + + config = model_configs[model_name] + + model = cls(config=config, overrides=overrides) + + if model_weights_path: + common_modules.load_weights( + model, model_weights_path, weights_format=weights_format) + + return model diff --git a/cv/classification/resnet50/tensorflow2.0/efficientnet/tfhub_export.py b/cv/classification/resnet50/tensorflow2.0/efficientnet/tfhub_export.py new file mode 100644 index 000000000..691e568fa --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/efficientnet/tfhub_export.py @@ -0,0 +1,68 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""A script to export TF-Hub SavedModel.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import os + +from absl import app +from absl import flags + +import tensorflow as tf + +from official.vision.image_classification.efficientnet import efficientnet_model + +FLAGS = flags.FLAGS + +flags.DEFINE_string("model_name", None, "EfficientNet model name.") +flags.DEFINE_string("model_path", None, "File path to TF model checkpoint.") +flags.DEFINE_string("export_path", None, + "TF-Hub SavedModel destination path to export.") + + +def export_tfhub(model_path, hub_destination, model_name): + """Restores a tf.keras.Model and saves for TF-Hub.""" + model_configs = dict(efficientnet_model.MODEL_CONFIGS) + config = model_configs[model_name] + + image_input = tf.keras.layers.Input( + shape=(None, None, 3), name="image_input", dtype=tf.float32) + x = image_input * 255.0 + ouputs = efficientnet_model.efficientnet(x, config) + hub_model = tf.keras.Model(image_input, ouputs) + ckpt = tf.train.Checkpoint(model=hub_model) + ckpt.restore(model_path).assert_existing_objects_matched() + hub_model.save( + os.path.join(hub_destination, "classification"), include_optimizer=False) + + feature_vector_output = hub_model.get_layer(name="top_pool").get_output_at(0) + hub_model2 = tf.keras.Model(image_input, feature_vector_output) + hub_model2.save( + os.path.join(hub_destination, "feature-vector"), include_optimizer=False) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + export_tfhub(FLAGS.model_path, FLAGS.export_path, FLAGS.model_name) + + +if __name__ == "__main__": + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/learning_rate.py b/cv/classification/resnet50/tensorflow2.0/learning_rate.py new file mode 100644 index 000000000..72f7e9518 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/learning_rate.py @@ -0,0 +1,117 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Learning rate utilities for vision tasks.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from typing import Any, Mapping, Optional + +import numpy as np +import tensorflow as tf + +BASE_LEARNING_RATE = 0.1 + + +class WarmupDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule): + """A wrapper for LearningRateSchedule that includes warmup steps.""" + + def __init__(self, + lr_schedule: tf.keras.optimizers.schedules.LearningRateSchedule, + warmup_steps: int, + warmup_lr: Optional[float] = None): + """Add warmup decay to a learning rate schedule. + + Args: + lr_schedule: base learning rate scheduler + warmup_steps: number of warmup steps + warmup_lr: an optional field for the final warmup learning rate. This + should be provided if the base `lr_schedule` does not contain this + field. 
+ """ + super(WarmupDecaySchedule, self).__init__() + self._lr_schedule = lr_schedule + self._warmup_steps = warmup_steps + self._warmup_lr = warmup_lr + + def __call__(self, step: int): + lr = self._lr_schedule(step) + if self._warmup_steps: + if self._warmup_lr is not None: + initial_learning_rate = tf.convert_to_tensor( + self._warmup_lr, name="initial_learning_rate") + else: + initial_learning_rate = tf.convert_to_tensor( + self._lr_schedule.initial_learning_rate, + name="initial_learning_rate") + dtype = initial_learning_rate.dtype + global_step_recomp = tf.cast(step, dtype) + warmup_steps = tf.cast(self._warmup_steps, dtype) + warmup_lr = initial_learning_rate * global_step_recomp / warmup_steps + lr = tf.cond(global_step_recomp < warmup_steps, lambda: warmup_lr, + lambda: lr) + return lr + + def get_config(self) -> Mapping[str, Any]: + config = self._lr_schedule.get_config() + config.update({ + "warmup_steps": self._warmup_steps, + "warmup_lr": self._warmup_lr, + }) + return config + + +class CosineDecayWithWarmup(tf.keras.optimizers.schedules.LearningRateSchedule): + """Class to generate learning rate tensor.""" + + def __init__(self, batch_size: int, total_steps: int, warmup_steps: int): + """Creates the consine learning rate tensor with linear warmup. + + Args: + batch_size: The training batch size used in the experiment. + total_steps: Total training steps. + warmup_steps: Steps for the warm up period. + """ + super(CosineDecayWithWarmup, self).__init__() + base_lr_batch_size = 256 + self._total_steps = total_steps + self._init_learning_rate = BASE_LEARNING_RATE * batch_size / base_lr_batch_size + self._warmup_steps = warmup_steps + + def __call__(self, global_step: int): + global_step = tf.cast(global_step, dtype=tf.float32) + warmup_steps = self._warmup_steps + init_lr = self._init_learning_rate + total_steps = self._total_steps + + linear_warmup = global_step / warmup_steps * init_lr + + cosine_learning_rate = init_lr * (tf.cos(np.pi * + (global_step - warmup_steps) / + (total_steps - warmup_steps)) + + 1.0) / 2.0 + + learning_rate = tf.where(global_step < warmup_steps, linear_warmup, + cosine_learning_rate) + return learning_rate + + def get_config(self): + return { + "total_steps": self._total_steps, + "warmup_learning_rate": self._warmup_learning_rate, + "warmup_steps": self._warmup_steps, + "init_learning_rate": self._init_learning_rate, + } diff --git a/cv/classification/resnet50/tensorflow2.0/learning_rate_test.py b/cv/classification/resnet50/tensorflow2.0/learning_rate_test.py new file mode 100644 index 000000000..6c33ed24b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/learning_rate_test.py @@ -0,0 +1,60 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for learning_rate.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from official.vision.image_classification import learning_rate + + +class LearningRateTests(tf.test.TestCase): + + def test_warmup_decay(self): + """Basic computational test for warmup decay.""" + initial_lr = 0.01 + decay_steps = 100 + decay_rate = 0.01 + warmup_steps = 10 + + base_lr = tf.keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=initial_lr, + decay_steps=decay_steps, + decay_rate=decay_rate) + lr = learning_rate.WarmupDecaySchedule( + lr_schedule=base_lr, warmup_steps=warmup_steps) + + for step in range(warmup_steps - 1): + config = lr.get_config() + self.assertEqual(config['warmup_steps'], warmup_steps) + self.assertAllClose( + self.evaluate(lr(step)), step / warmup_steps * initial_lr) + + def test_cosine_decay_with_warmup(self): + """Basic computational test for cosine decay with warmup.""" + expected_lrs = [0.0, 0.1, 0.05, 0.0] + + lr = learning_rate.CosineDecayWithWarmup( + batch_size=256, total_steps=3, warmup_steps=1) + + for step in [0, 1, 2, 3]: + self.assertAllClose(lr(step), expected_lrs[step]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/mnist_main.py b/cv/classification/resnet50/tensorflow2.0/mnist_main.py new file mode 100644 index 000000000..3eba80b06 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/mnist_main.py @@ -0,0 +1,176 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Runs a simple model on the MNIST dataset.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf +import tensorflow_datasets as tfds +from official.common import distribute_utils +from official.utils.flags import core as flags_core +from official.utils.misc import model_helpers +from official.vision.image_classification.resnet import common + +FLAGS = flags.FLAGS + + +def build_model(): + """Constructs the ML model used to predict handwritten digits.""" + + image = tf.keras.layers.Input(shape=(28, 28, 1)) + + y = tf.keras.layers.Conv2D(filters=32, + kernel_size=5, + padding='same', + activation='relu')(image) + y = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), + strides=(2, 2), + padding='same')(y) + y = tf.keras.layers.Conv2D(filters=32, + kernel_size=5, + padding='same', + activation='relu')(y) + y = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), + strides=(2, 2), + padding='same')(y) + y = tf.keras.layers.Flatten()(y) + y = tf.keras.layers.Dense(1024, activation='relu')(y) + y = tf.keras.layers.Dropout(0.4)(y) + + probs = tf.keras.layers.Dense(10, activation='softmax')(y) + + model = tf.keras.models.Model(image, probs, name='mnist') + + return model + + +@tfds.decode.make_decoder(output_dtype=tf.float32) +def decode_image(example, feature): + """Convert image to float32 and normalize from [0, 255] to [0.0, 1.0].""" + return tf.cast(feature.decode_example(example), dtype=tf.float32) / 255 + + +def run(flags_obj, datasets_override=None, strategy_override=None): + """Run MNIST model training and eval loop using native Keras APIs. + + Args: + flags_obj: An object containing parsed flag values. + datasets_override: A pair of `tf.data.Dataset` objects to train the model, + representing the train and test sets. + strategy_override: A `tf.distribute.Strategy` object to use for model. + + Returns: + Dictionary of training and eval stats. + """ + # Start TF profiler server. 
+ tf.profiler.experimental.server.start(flags_obj.profiler_port) + + strategy = strategy_override or distribute_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + tpu_address=flags_obj.tpu) + + strategy_scope = distribute_utils.get_strategy_scope(strategy) + + mnist = tfds.builder('mnist', data_dir=flags_obj.data_dir) + if flags_obj.download: + mnist.download_and_prepare() + + mnist_train, mnist_test = datasets_override or mnist.as_dataset( + split=['train', 'test'], + decoders={'image': decode_image()}, # pylint: disable=no-value-for-parameter + as_supervised=True) + train_input_dataset = mnist_train.cache().repeat().shuffle( + buffer_size=50000).batch(flags_obj.batch_size) + eval_input_dataset = mnist_test.cache().repeat().batch(flags_obj.batch_size) + + with strategy_scope: + lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( + 0.05, decay_steps=100000, decay_rate=0.96) + optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule) + + model = build_model() + model.compile( + optimizer=optimizer, + loss='sparse_categorical_crossentropy', + metrics=['sparse_categorical_accuracy']) + + num_train_examples = mnist.info.splits['train'].num_examples + train_steps = num_train_examples // flags_obj.batch_size + train_epochs = flags_obj.train_epochs + + ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}') + callbacks = [ + tf.keras.callbacks.ModelCheckpoint( + ckpt_full_path, save_weights_only=True), + tf.keras.callbacks.TensorBoard(log_dir=flags_obj.model_dir), + ] + + num_eval_examples = mnist.info.splits['test'].num_examples + num_eval_steps = num_eval_examples // flags_obj.batch_size + + history = model.fit( + train_input_dataset, + epochs=train_epochs, + steps_per_epoch=train_steps, + callbacks=callbacks, + validation_steps=num_eval_steps, + validation_data=eval_input_dataset, + validation_freq=flags_obj.epochs_between_evals) + + export_path = os.path.join(flags_obj.model_dir, 'saved_model') + model.save(export_path, include_optimizer=False) + + eval_output = model.evaluate( + eval_input_dataset, steps=num_eval_steps, verbose=2) + + stats = common.build_stats(history, eval_output, callbacks) + return stats + + +def define_mnist_flags(): + """Define command line flags for MNIST model.""" + flags_core.define_base( + clean=True, + num_gpu=True, + train_epochs=True, + epochs_between_evals=True, + distribution_strategy=True) + flags_core.define_device() + flags_core.define_distribution() + flags.DEFINE_bool('download', True, + 'Whether to download data to `--data_dir`.') + flags.DEFINE_integer('profiler_port', 9012, + 'Port to start profiler server on.') + FLAGS.set_default('batch_size', 1024) + + +def main(_): + model_helpers.apply_clean(FLAGS) + stats = run(flags.FLAGS) + logging.info('Run stats:\n%s', stats) + + +if __name__ == '__main__': + logging.set_verbosity(logging.INFO) + define_mnist_flags() + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/mnist_test.py b/cv/classification/resnet50/tensorflow2.0/mnist_test.py new file mode 100644 index 000000000..c94396a44 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/mnist_test.py @@ -0,0 +1,89 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test the Keras MNIST model on GPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools + +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.utils.testing import integration +from official.vision.image_classification import mnist_main + + +mnist_main.define_mnist_flags() + + +def eager_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ],) + + +class KerasMnistTest(tf.test.TestCase, parameterized.TestCase): + """Unit tests for sample Keras MNIST model.""" + _tempdir = None + + @classmethod + def setUpClass(cls): # pylint: disable=invalid-name + super(KerasMnistTest, cls).setUpClass() + + def tearDown(self): + super(KerasMnistTest, self).tearDown() + tf.io.gfile.rmtree(self.get_temp_dir()) + + @combinations.generate(eager_strategy_combinations()) + def test_end_to_end(self, distribution): + """Test Keras MNIST model with `strategy`.""" + + extra_flags = [ + "-train_epochs", + "1", + # Let TFDS find the metadata folder automatically + "--data_dir=" + ] + + dummy_data = ( + tf.ones(shape=(10, 28, 28, 1), dtype=tf.int32), + tf.range(10), + ) + datasets = ( + tf.data.Dataset.from_tensor_slices(dummy_data), + tf.data.Dataset.from_tensor_slices(dummy_data), + ) + + run = functools.partial( + mnist_main.run, + datasets_override=datasets, + strategy_override=distribution) + + integration.run_synthetic( + main=run, + synth=False, + tmp_root=self.create_tempdir().full_path, + extra_flags=extra_flags) + + +if __name__ == "__main__": + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/__init__.py b/cv/classification/resnet50/tensorflow2.0/modeling/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/__init__.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/__init__.py new file mode 100644 index 000000000..3237bbe6f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2021 The TensorFlow Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Activations package definition.""" +from modeling.activations.gelu import gelu +from modeling.activations.relu import relu6 +from modeling.activations.sigmoid import hard_sigmoid +from modeling.activations.swish import hard_swish +from modeling.activations.swish import identity +from modeling.activations.swish import simple_swish diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu.py new file mode 100644 index 000000000..a73294aa5 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu.py @@ -0,0 +1,32 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gaussian error linear unit.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + return tf.keras.activations.gelu(x, approximate=True) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu_test.py new file mode 100644 index 000000000..cfe1950d9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/gelu_test.py @@ -0,0 +1,34 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
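For reference, `approximate=True` above selects the tanh approximation of GELU. A standalone NumPy sketch of that formula (the values agree with the expectations asserted in gelu_test.py below):

import numpy as np

def gelu_approx(x):
    # Tanh approximation: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
    x = np.asarray(x, dtype=np.float32)
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

print(gelu_approx([[0.25, 0.0, -0.25], [-1.0, -2.0, 3.0]]))
# approximately [[0.1497, 0.0, -0.1003], [-0.1588, -0.0454, 2.9964]]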
+ +"""Tests for the Gaussian error linear unit.""" + +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.modeling import activations + + +@keras_parameterized.run_all_keras_modes +class GeluTest(keras_parameterized.TestCase): + + def test_gelu(self): + expected_data = [[0.14967535, 0., -0.10032465], + [-0.15880796, -0.04540223, 2.9963627]] + gelu_data = activations.gelu([[.25, 0, -.25], [-1, -2, 3]]) + self.assertAllClose(expected_data, gelu_data) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/relu.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/relu.py new file mode 100644 index 000000000..b3941b2f3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/relu.py @@ -0,0 +1,31 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Customized Relu activation.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def relu6(features): + """Computes the Relu6 activation function. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return tf.nn.relu6(features) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/relu_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/relu_test.py new file mode 100644 index 000000000..215f189ea --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/relu_test.py @@ -0,0 +1,35 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for the customized Relu activation.""" + +import tensorflow as tf + +from tensorflow.python.keras import \ + keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.modeling import activations + + +@keras_parameterized.run_all_keras_modes +class CustomizedReluTest(keras_parameterized.TestCase): + + def test_relu6(self): + features = [[.25, 0, -.25], [-1, -2, 3]] + customized_relu6_data = activations.relu6(features) + relu6_data = tf.nn.relu6(features) + self.assertAllClose(customized_relu6_data, relu6_data) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid.py new file mode 100644 index 000000000..277463040 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid.py @@ -0,0 +1,31 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Customized Sigmoid activation.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def hard_sigmoid(features): + """Computes the hard sigmoid activation function. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return tf.nn.relu6(features + tf.cast(3., features.dtype)) * 0.16667 diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid_test.py new file mode 100644 index 000000000..6aad90ef3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/sigmoid_test.py @@ -0,0 +1,40 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the customized Sigmoid activation.""" + +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import \ + keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.modeling import activations + + +@keras_parameterized.run_all_keras_modes +class CustomizedSigmoidTest(keras_parameterized.TestCase): + + def _hard_sigmoid_nn(self, x): + x = np.float32(x) + return tf.nn.relu6(x + 3.) 
* 0.16667 + + def test_hard_sigmoid(self): + features = [[.25, 0, -.25], [-1, -2, 3]] + customized_hard_sigmoid_data = activations.hard_sigmoid(features) + sigmoid_data = self._hard_sigmoid_nn(features) + self.assertAllClose(customized_hard_sigmoid_data, sigmoid_data) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/swish.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/swish.py new file mode 100644 index 000000000..ea79985e3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/swish.py @@ -0,0 +1,72 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Customized Swish activation.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def simple_swish(features): + """Computes the Swish activation function. + + The tf.nn.swish operation uses a custom gradient to reduce memory usage. + Since saving custom gradients in SavedModel is currently not supported, and + one would not be able to use an exported TF-Hub module for fine-tuning, we + provide this wrapper that can allow to select whether to use the native + TensorFlow swish operation, or whether to use a customized operation that + has uses default TensorFlow gradient computation. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return features * tf.nn.sigmoid(features) + + +@tf.keras.utils.register_keras_serializable(package='Text') +def hard_swish(features): + """Computes a hard version of the swish function. + + This operation can be used to reduce computational cost and improve + quantization for edge devices. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + fdtype = features.dtype + return features * tf.nn.relu6(features + tf.cast(3., fdtype)) * (1. / 6.) + + +@tf.keras.utils.register_keras_serializable(package='Text') +def identity(features): + """Computes the identity function. + + Useful for helping in quantization. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return tf.identity(features) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/activations/swish_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/activations/swish_test.py new file mode 100644 index 000000000..3cb9495d8 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/activations/swish_test.py @@ -0,0 +1,44 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the customized Swish activation.""" +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.modeling import activations + + +@keras_parameterized.run_all_keras_modes +class CustomizedSwishTest(keras_parameterized.TestCase): + + def _hard_swish_np(self, x): + x = np.float32(x) + return x * np.clip(x + 3, 0, 6) / 6 + + def test_simple_swish(self): + features = [[.25, 0, -.25], [-1, -2, 3]] + customized_swish_data = activations.simple_swish(features) + swish_data = tf.nn.swish(features) + self.assertAllClose(customized_swish_data, swish_data) + + def test_hard_swish(self): + features = [[.25, 0, -.25], [-1, -2, 3]] + customized_swish_data = activations.hard_swish(features) + swish_data = self._hard_swish_np(features) + self.assertAllClose(customized_swish_data, swish_data) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/__init__.py b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/__init__.py new file mode 100644 index 000000000..e47d28b1d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Hyperparams package definition.""" +# pylint: disable=g-multiple-import +from modeling.hyperparams.base_config import * +from modeling.hyperparams.oneof import * +from modeling.hyperparams.params_dict import * + diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config.py b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config.py new file mode 100644 index 000000000..07dcf4d0c --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config.py @@ -0,0 +1,270 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
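As the swish tests above spell out, `hard_swish` is the input multiplied by a piecewise-linear gate, and that gate is (up to the rounded 0.16667 constant used in sigmoid.py) exactly `hard_sigmoid`. A standalone NumPy sketch of the relation:

import numpy as np

def hard_sigmoid_np(x):
    # relu6(x + 3) / 6; sigmoid.py multiplies by 0.16667, a rounded 1/6.
    return np.clip(np.float32(x) + 3.0, 0.0, 6.0) / 6.0

def hard_swish_np(x):
    # x * relu6(x + 3) / 6, the same reference used by _hard_swish_np in swish_test.py.
    x = np.float32(x)
    return x * np.clip(x + 3.0, 0.0, 6.0) / 6.0

features = np.array([[0.25, 0.0, -0.25], [-1.0, -2.0, 3.0]], dtype=np.float32)
print(hard_swish_np(features))               # e.g. hard_swish(3.0) = 3.0 * 6/6 = 3.0
print(features * hard_sigmoid_np(features))  # identical by construction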
+ +"""Base configurations to standardize experiments.""" + +import copy +import functools +from typing import Any, List, Mapping, Optional, Type +from absl import logging + +import dataclasses +import tensorflow as tf +import yaml + +from modeling.hyperparams import params_dict + + +@dataclasses.dataclass +class Config(params_dict.ParamsDict): + """The base configuration class that supports YAML/JSON based overrides. + + Because of YAML/JSON serialization limitations, some semantics of dataclass + are not supported: + * It recursively enforces a allowlist of basic types and container types, so + it avoids surprises with copy and reuse caused by unanticipated types. + * Warning: it converts Dict to `Config` even within sequences, + e.g. for config = Config({'key': [([{'a': 42}],)]), + type(config.key[0][0][0]) is Config rather than dict. + If you define/annotate some field as Dict, the field will convert to a + `Config` instance and lose the dictionary type. + """ + + # It's safe to add bytes and other immutable types here. + IMMUTABLE_TYPES = (str, int, float, bool, type(None)) + # It's safe to add set, frozenset and other collections here. + SEQUENCE_TYPES = (list, tuple) + + default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None + restrictions: dataclasses.InitVar[Optional[List[str]]] = None + + @classmethod + def _isvalidsequence(cls, v): + """Check if the input values are valid sequences. + + Args: + v: Input sequence. + + Returns: + True if the sequence is valid. Valid sequence includes the sequence + type in cls.SEQUENCE_TYPES and element type is in cls.IMMUTABLE_TYPES or + is dict or ParamsDict. + """ + if not isinstance(v, cls.SEQUENCE_TYPES): + return False + return (all(isinstance(e, cls.IMMUTABLE_TYPES) for e in v) or + all(isinstance(e, dict) for e in v) or + all(isinstance(e, params_dict.ParamsDict) for e in v)) + + @classmethod + def _import_config(cls, v, subconfig_type): + """Returns v with dicts converted to Configs, recursively.""" + if not issubclass(subconfig_type, params_dict.ParamsDict): + raise TypeError( + 'Subconfig_type should be subclass of ParamsDict, found {!r}'.format( + subconfig_type)) + if isinstance(v, cls.IMMUTABLE_TYPES): + return v + elif isinstance(v, cls.SEQUENCE_TYPES): + # Only support one layer of sequence. + if not cls._isvalidsequence(v): + raise TypeError( + 'Invalid sequence: only supports single level {!r} of {!r} or ' + 'dict or ParamsDict found: {!r}'.format(cls.SEQUENCE_TYPES, + cls.IMMUTABLE_TYPES, v)) + import_fn = functools.partial( + cls._import_config, subconfig_type=subconfig_type) + return type(v)(map(import_fn, v)) + elif isinstance(v, params_dict.ParamsDict): + # Deepcopy here is a temporary solution for preserving type in nested + # Config object. + return copy.deepcopy(v) + elif isinstance(v, dict): + return subconfig_type(v) + else: + raise TypeError('Unknown type: {!r}'.format(type(v))) + + @classmethod + def _export_config(cls, v): + """Returns v with Configs converted to dicts, recursively.""" + if isinstance(v, cls.IMMUTABLE_TYPES): + return v + elif isinstance(v, cls.SEQUENCE_TYPES): + return type(v)(map(cls._export_config, v)) + elif isinstance(v, params_dict.ParamsDict): + return v.as_dict() + elif isinstance(v, dict): + raise TypeError('dict value not supported in converting.') + else: + raise TypeError('Unknown type: {!r}'.format(type(v))) + + @classmethod + def _get_subconfig_type(cls, k) -> Type[params_dict.ParamsDict]: + """Get element type by the field name. + + Args: + k: the key/name of the field. 
+ + Returns: + Config as default. If a type annotation is found for `k`, + 1) returns the type of the annotation if it is subtype of ParamsDict; + 2) returns the element type if the annotation of `k` is List[SubType] + or Tuple[SubType]. + """ + subconfig_type = Config + if k in cls.__annotations__: + # Directly Config subtype. + type_annotation = cls.__annotations__[k] # pytype: disable=invalid-annotation + if (isinstance(type_annotation, type) and + issubclass(type_annotation, Config)): + subconfig_type = cls.__annotations__[k] # pytype: disable=invalid-annotation + else: + # Check if the field is a sequence of subtypes. + field_type = getattr(type_annotation, '__origin__', type(None)) + if (isinstance(field_type, type) and + issubclass(field_type, cls.SEQUENCE_TYPES)): + element_type = getattr(type_annotation, '__args__', [type(None)])[0] + subconfig_type = ( + element_type if issubclass(element_type, params_dict.ParamsDict) + else subconfig_type) + return subconfig_type + + def __post_init__(self, default_params, restrictions, *args, **kwargs): + super().__init__( + default_params=default_params, + restrictions=restrictions, + *args, + **kwargs) + + def _set(self, k, v): + """Overrides same method in ParamsDict. + + Also called by ParamsDict methods. + + Args: + k: key to set. + v: value. + + Raises: + RuntimeError + """ + subconfig_type = self._get_subconfig_type(k) + + def is_null(k): + if k not in self.__dict__ or not self.__dict__[k]: + return True + return False + + if isinstance(v, dict): + if is_null(k): + # If the key not exist or the value is None, a new Config-family object + # sould be created for the key. + self.__dict__[k] = subconfig_type(v) + else: + self.__dict__[k].override(v) + elif not is_null(k) and isinstance(v, self.SEQUENCE_TYPES) and all( + [not isinstance(e, self.IMMUTABLE_TYPES) for e in v]): + if len(self.__dict__[k]) == len(v): + for i in range(len(v)): + self.__dict__[k][i].override(v[i]) + elif not all([isinstance(e, self.IMMUTABLE_TYPES) for e in v]): + logging.warning( + "The list/tuple don't match the value dictionaries provided. Thus, " + 'the list/tuple is determined by the type annotation and ' + 'values provided. This is error-prone.') + self.__dict__[k] = self._import_config(v, subconfig_type) + else: + self.__dict__[k] = self._import_config(v, subconfig_type) + else: + self.__dict__[k] = self._import_config(v, subconfig_type) + + def __setattr__(self, k, v): + if k not in self.RESERVED_ATTR: + if getattr(self, '_locked', False): + raise ValueError('The Config has been locked. ' 'No change is allowed.') + self._set(k, v) + + def _override(self, override_dict, is_strict=True): + """Overrides same method in ParamsDict. + + Also called by ParamsDict methods. + + Args: + override_dict: dictionary to write to . + is_strict: If True, not allows to add new keys. + + Raises: + KeyError: overriding reserved keys or keys not exist (is_strict=True). + """ + for k, v in sorted(override_dict.items()): + if k in self.RESERVED_ATTR: + raise KeyError('The key {!r} is internally reserved. ' + 'Can not be overridden.'.format(k)) + if k not in self.__dict__: + if is_strict: + raise KeyError('The key {!r} does not exist in {!r}. 
' + 'To extend the existing keys, use ' + '`override` with `is_strict` = False.'.format( + k, type(self))) + else: + self._set(k, v) + else: + if isinstance(v, dict) and self.__dict__[k]: + self.__dict__[k]._override(v, is_strict) # pylint: disable=protected-access + elif isinstance(v, params_dict.ParamsDict) and self.__dict__[k]: + self.__dict__[k]._override(v.as_dict(), is_strict) # pylint: disable=protected-access + else: + self._set(k, v) + + def as_dict(self): + """Returns a dict representation of params_dict.ParamsDict. + + For the nested params_dict.ParamsDict, a nested dict will be returned. + """ + return { + k: self._export_config(v) + for k, v in self.__dict__.items() + if k not in self.RESERVED_ATTR + } + + def replace(self, **kwargs): + """Overrides/returns a unlocked copy with the current config unchanged.""" + # pylint: disable=protected-access + params = copy.deepcopy(self) + params._locked = False + params._override(kwargs, is_strict=True) + # pylint: enable=protected-access + return params + + @classmethod + def from_yaml(cls, file_path: str): + # Note: This only works if the Config has all default values. + with tf.io.gfile.GFile(file_path, 'r') as f: + loaded = yaml.load(f, Loader=yaml.FullLoader) + config = cls() + config.override(loaded) + return config + + @classmethod + def from_json(cls, file_path: str): + """Wrapper for `from_yaml`.""" + return cls.from_yaml(file_path) + + @classmethod + def from_args(cls, *args, **kwargs): + """Builds a config from the given list of arguments.""" + attributes = list(cls.__annotations__.keys()) + default_params = {a: p for a, p in zip(attributes, args)} + default_params.update(kwargs) + return cls(default_params) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config_test.py new file mode 100644 index 000000000..3e64ec532 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/base_config_test.py @@ -0,0 +1,360 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pprint +from typing import List, Tuple + +from absl.testing import parameterized +import dataclasses +import tensorflow as tf +from official.modeling.hyperparams import base_config + + +@dataclasses.dataclass +class DumpConfig1(base_config.Config): + a: int = 1 + b: str = 'text' + + +@dataclasses.dataclass +class DumpConfig2(base_config.Config): + c: int = 2 + d: str = 'text' + e: DumpConfig1 = DumpConfig1() + + +@dataclasses.dataclass +class DumpConfig3(DumpConfig2): + f: int = 2 + g: str = 'text' + h: List[DumpConfig1] = dataclasses.field( + default_factory=lambda: [DumpConfig1(), DumpConfig1()]) + g: Tuple[DumpConfig1, ...] = (DumpConfig1(),) + + +@dataclasses.dataclass +class DumpConfig4(DumpConfig2): + x: int = 3 + + +@dataclasses.dataclass +class DummyConfig5(base_config.Config): + y: Tuple[DumpConfig2, ...] 
= (DumpConfig2(), DumpConfig4()) + z: Tuple[str] = ('a',) + + +class BaseConfigTest(parameterized.TestCase, tf.test.TestCase): + + def assertHasSameTypes(self, c, d, msg=''): + """Checks if a Config has the same structure as a given dict. + + Args: + c: the Config object to be check. + d: the reference dict object. + msg: The error message to show when type mismatched. + """ + # Make sure d is not a Config. Assume d is either + # dictionary or primitive type and c is the Config or primitive types. + self.assertNotIsInstance(d, base_config.Config) + if isinstance(d, base_config.Config.IMMUTABLE_TYPES): + self.assertEqual(pprint.pformat(c), pprint.pformat(d), msg=msg) + elif isinstance(d, base_config.Config.SEQUENCE_TYPES): + self.assertEqual(type(c), type(d), msg=msg) + for i, v in enumerate(d): + self.assertHasSameTypes(c[i], v, msg='{}[{!r}]'.format(msg, i)) + elif isinstance(d, dict): + self.assertIsInstance(c, base_config.Config, msg=msg) + for k, v in sorted(d.items()): + self.assertHasSameTypes(getattr(c, k), v, msg='{}[{!r}]'.format(msg, k)) + else: + raise TypeError('Unknown type: %r' % type(d)) + + def assertImportExport(self, v): + config = base_config.Config({'key': v}) + back = config.as_dict()['key'] + self.assertEqual(pprint.pformat(back), pprint.pformat(v)) + self.assertHasSameTypes(config.key, v, msg='=%s v' % pprint.pformat(v)) + + def test_invalid_keys(self): + params = base_config.Config() + with self.assertRaises(AttributeError): + _ = params.a + + def test_nested_config_types(self): + config = DumpConfig3() + self.assertIsInstance(config.e, DumpConfig1) + self.assertIsInstance(config.h[0], DumpConfig1) + self.assertIsInstance(config.h[1], DumpConfig1) + self.assertIsInstance(config.g[0], DumpConfig1) + + config.override({'e': {'a': 2, 'b': 'new text'}}) + self.assertIsInstance(config.e, DumpConfig1) + self.assertEqual(config.e.a, 2) + self.assertEqual(config.e.b, 'new text') + + config.override({'h': [{'a': 3, 'b': 'new text 2'}]}) + self.assertIsInstance(config.h[0], DumpConfig1) + self.assertLen(config.h, 1) + self.assertEqual(config.h[0].a, 3) + self.assertEqual(config.h[0].b, 'new text 2') + + config.override({'g': [{'a': 4, 'b': 'new text 3'}]}) + self.assertIsInstance(config.g[0], DumpConfig1) + self.assertLen(config.g, 1) + self.assertEqual(config.g[0].a, 4) + self.assertEqual(config.g[0].b, 'new text 3') + + def test_replace(self): + config = DumpConfig2() + new_config = config.replace(e={'a': 2}) + self.assertEqual(new_config.e.a, 2) + self.assertIsInstance(new_config.e, DumpConfig1) + + config = DumpConfig2(e=DumpConfig2()) + new_config = config.replace(e={'c': 4}) + self.assertEqual(new_config.e.c, 4) + self.assertIsInstance(new_config.e, DumpConfig2) + + config = DumpConfig3() + new_config = config.replace(g=[{'a': 4, 'b': 'new text 3'}]) + self.assertIsInstance(new_config.g[0], DumpConfig1) + self.assertEqual(new_config.g[0].a, 4) + + @parameterized.parameters( + ('_locked', "The key '_locked' is internally reserved."), + ('_restrictions', "The key '_restrictions' is internally reserved."), + ('aa', "The key 'aa' does not exist."), + ) + def test_key_error(self, key, msg): + params = base_config.Config() + with self.assertRaisesRegex(KeyError, msg): + params.override({key: True}) + + @parameterized.parameters( + ('str data',), + (123,), + (1.23,), + (None,), + (['str', 1, 2.3, None],), + (('str', 1, 2.3, None),), + ) + def test_import_export_immutable_types(self, v): + self.assertImportExport(v) + out = base_config.Config({'key': v}) + 
self.assertEqual(pprint.pformat(v), pprint.pformat(out.key)) + + def test_override_is_strict_true(self): + params = base_config.Config({ + 'a': 'aa', + 'b': 2, + 'c': { + 'c1': 'cc', + 'c2': 20 + } + }) + params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True) + self.assertEqual(params.a, 2) + self.assertEqual(params.c.c1, 'ccc') + with self.assertRaises(KeyError): + params.override({'d': 'ddd'}, is_strict=True) + with self.assertRaises(KeyError): + params.override({'c': {'c3': 30}}, is_strict=True) + + config = base_config.Config({'key': [{'a': 42}]}) + with self.assertRaisesRegex(KeyError, "The key 'b' does not exist"): + config.override({'key': [{'b': 43}]}) + + @parameterized.parameters( + (lambda x: x, 'Unknown type'), + (object(), 'Unknown type'), + (set(), 'Unknown type'), + (frozenset(), 'Unknown type'), + ) + def test_import_unsupport_types(self, v, msg): + with self.assertRaisesRegex(TypeError, msg): + _ = base_config.Config({'key': v}) + + @parameterized.parameters( + ({ + 'a': [{ + 'b': 2, + }, { + 'c': 3, + }] + },), + ({ + 'c': [{ + 'f': 1.1, + }, { + 'h': [1, 2], + }] + },), + (({ + 'a': 'aa', + 'b': 2, + 'c': { + 'c1': 10, + 'c2': 20, + } + },),), + ) + def test_import_export_nested_structure(self, d): + self.assertImportExport(d) + + @parameterized.parameters( + ([{ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + }],), + (({ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + },),), + ) + def test_import_export_nested_sequences(self, v): + self.assertImportExport(v) + + @parameterized.parameters( + ([([{}],)],), + ([['str', 1, 2.3, None]],), + ((('str', 1, 2.3, None),),), + ([ + ('str', 1, 2.3, None), + ],), + ([ + ('str', 1, 2.3, None), + ],), + ([[{ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + }]],), + ([[[{ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + }]]],), + ((({ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + },),),), + (((({ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + },),),),), + ([({ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + },)],), + (([{ + 'a': 42, + 'b': 'hello', + 'c': 1.2 + }],),), + ) + def test_import_export_unsupport_sequence(self, v): + with self.assertRaisesRegex(TypeError, + 'Invalid sequence: only supports single level'): + _ = base_config.Config({'key': v}) + + def test_construct_subtype(self): + pass + + def test_import_config(self): + params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]}) + self.assertLen(params.a, 2) + self.assertEqual(params.a[0].b, 2) + self.assertEqual(type(params.a[0]), base_config.Config) + self.assertEqual(pprint.pformat(params.a[0].b), '2') + self.assertEqual(type(params.a[1]), base_config.Config) + self.assertEqual(type(params.a[1].c), base_config.Config) + self.assertEqual(pprint.pformat(params.a[1].c.d), '3') + + def test_override(self): + params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]}) + params.override({'a': [{'b': 4}, {'c': {'d': 5}}]}, is_strict=False) + self.assertEqual(type(params.a), list) + self.assertEqual(type(params.a[0]), base_config.Config) + self.assertEqual(pprint.pformat(params.a[0].b), '4') + self.assertEqual(type(params.a[1]), base_config.Config) + self.assertEqual(type(params.a[1].c), base_config.Config) + self.assertEqual(pprint.pformat(params.a[1].c.d), '5') + + @parameterized.parameters( + ([{}],), + (({},),), + ) + def test_config_vs_params_dict(self, v): + d = {'key': v} + self.assertEqual(type(base_config.Config(d).key[0]), base_config.Config) + self.assertEqual(type(base_config.params_dict.ParamsDict(d).key[0]), dict) + + def test_ppformat(self): + self.assertEqual( + pprint.pformat([ + 's', 1, 1.0, True, 
None, {}, [], (), { + (2,): (3, [4], { + 6: 7, + }), + 8: 9, + } + ]), + "['s', 1, 1.0, True, None, {}, [], (), {8: 9, (2,): (3, [4], {6: 7})}]") + + def test_with_restrictions(self): + restrictions = ['e.a[a-zA-Z][\w\.]*) # variable name: "var" or "x" + \s*=\s* + ((?P\'(.*?)\' # single quote + | + \"(.*?)\" # double quote + | + [^,\[]* # single value + | + \[[^\]]*\])) # list of values + ($|,\s*)""", re.VERBOSE) + +_CONST_VALUE_RE = re.compile(r'(\d.*|-\d.*|None)') + +# Yaml loader with an implicit resolver to parse float decimal and exponential +# format. The regular experission parse the following cases: +# 1- Decimal number with an optional exponential term. +# 2- Integer number with an exponential term. +# 3- Decimal number with an optional exponential term. +# 4- Decimal number. + +LOADER = yaml.SafeLoader +LOADER.add_implicit_resolver( + 'tag:yaml.org,2002:float', + re.compile(r''' + ^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)? + | + [-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+) + | + \\.[0-9_]+(?:[eE][-+][0-9]+)? + | + [-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*)$''', re.X), + list('-+0123456789.')) + + +class ParamsDict(object): + """A hyperparameter container class.""" + + RESERVED_ATTR = ['_locked', '_restrictions'] + + def __init__(self, default_params=None, restrictions=None): + """Instantiate a ParamsDict. + + Instantiate a ParamsDict given a set of default parameters and a list of + restrictions. Upon initialization, it validates itself by checking all the + defined restrictions, and raise error if it finds inconsistency. + + Args: + default_params: a Python dict or another ParamsDict object including the + default parameters to initialize. + restrictions: a list of strings, which define a list of restrictions to + ensure the consistency of different parameters internally. Each + restriction string is defined as a binary relation with a set of + operators, including {'==', '!=', '<', '<=', '>', '>='}. + """ + self._locked = False + self._restrictions = [] + if restrictions: + self._restrictions = restrictions + if default_params is None: + default_params = {} + self.override(default_params, is_strict=False) + + def _set(self, k, v): + if isinstance(v, dict): + self.__dict__[k] = ParamsDict(v) + else: + self.__dict__[k] = copy.deepcopy(v) + + def __setattr__(self, k, v): + """Sets the value of the existing key. + + Note that this does not allow directly defining a new key. Use the + `override` method with `is_strict=False` instead. + + Args: + k: the key string. + v: the value to be used to set the key `k`. + + Raises: + KeyError: if k is not defined in the ParamsDict. + """ + if k not in ParamsDict.RESERVED_ATTR: + if k not in self.__dict__.keys(): + raise KeyError('The key `%{}` does not exist. ' + 'To extend the existing keys, use ' + '`override` with `is_strict` = True.'.format(k)) + if self._locked: + raise ValueError('The ParamsDict has been locked. ' + 'No change is allowed.') + self._set(k, v) + + def __getattr__(self, k): + """Gets the value of the existing key. + + Args: + k: the key string. + + Returns: + the value of the key. + + Raises: + AttributeError: if k is not defined in the ParamsDict. + """ + if k not in self.__dict__.keys(): + raise AttributeError('The key `{}` does not exist. 
'.format(k)) + return self.__dict__[k] + + def __contains__(self, key): + """Implements the membership test operator.""" + return key in self.__dict__ + + def get(self, key, value=None): + """Accesses through built-in dictionary get method.""" + return self.__dict__.get(key, value) + + def __delattr__(self, k): + """Deletes the key and removes its values. + + Args: + k: the key string. + + Raises: + AttributeError: if k is reserverd or not defined in the ParamsDict. + ValueError: if the ParamsDict instance has been locked. + """ + if k in ParamsDict.RESERVED_ATTR: + raise AttributeError( + 'The key `{}` is reserved. No change is allowes. '.format(k)) + if k not in self.__dict__.keys(): + raise AttributeError('The key `{}` does not exist. '.format(k)) + if self._locked: + raise ValueError('The ParamsDict has been locked. No change is allowed.') + del self.__dict__[k] + + def override(self, override_params, is_strict=True): + """Override the ParamsDict with a set of given params. + + Args: + override_params: a dict or a ParamsDict specifying the parameters to be + overridden. + is_strict: a boolean specifying whether override is strict or not. If + True, keys in `override_params` must be present in the ParamsDict. If + False, keys in `override_params` can be different from what is currently + defined in the ParamsDict. In this case, the ParamsDict will be extended + to include the new keys. + """ + if self._locked: + raise ValueError('The ParamsDict has been locked. No change is allowed.') + if isinstance(override_params, ParamsDict): + override_params = override_params.as_dict() + self._override(override_params, is_strict) # pylint: disable=protected-access + + def _override(self, override_dict, is_strict=True): + """The implementation of `override`.""" + for k, v in six.iteritems(override_dict): + if k in ParamsDict.RESERVED_ATTR: + raise KeyError('The key `%{}` is internally reserved. ' + 'Can not be overridden.') + if k not in self.__dict__.keys(): + if is_strict: + raise KeyError('The key `{}` does not exist. ' + 'To extend the existing keys, use ' + '`override` with `is_strict` = False.'.format(k)) + else: + self._set(k, v) + else: + if isinstance(v, dict): + self.__dict__[k]._override(v, is_strict) # pylint: disable=protected-access + elif isinstance(v, ParamsDict): + self.__dict__[k]._override(v.as_dict(), is_strict) # pylint: disable=protected-access + else: + self.__dict__[k] = copy.deepcopy(v) + + def lock(self): + """Makes the ParamsDict immutable.""" + self._locked = True + + def as_dict(self): + """Returns a dict representation of ParamsDict. + + For the nested ParamsDict, a nested dict will be returned. + """ + params_dict = {} + for k, v in six.iteritems(self.__dict__): + if k not in ParamsDict.RESERVED_ATTR: + if isinstance(v, ParamsDict): + params_dict[k] = v.as_dict() + else: + params_dict[k] = copy.deepcopy(v) + return params_dict + + def validate(self): + """Validate the parameters consistency based on the restrictions. + + This method validates the internal consistency using the pre-defined list of + restrictions. A restriction is defined as a string which specfiies a binary + operation. The supported binary operations are {'==', '!=', '<', '<=', '>', + '>='}. Note that the meaning of these operators are consistent with the + underlying Python immplementation. Users should make sure the define + restrictions on their type make sense. 
+ + For example, for a ParamsDict like the following + ``` + a: + a1: 1 + a2: 2 + b: + bb: + bb1: 10 + bb2: 20 + ccc: + a1: 1 + a3: 3 + ``` + one can define two restrictions like this + ['a.a1 == b.ccc.a1', 'a.a2 <= b.bb.bb2'] + + What it enforces are: + - a.a1 = 1 == b.ccc.a1 = 1 + - a.a2 = 2 <= b.bb.bb2 = 20 + + Raises: + KeyError: if any of the following happens + (1) any of parameters in any of restrictions is not defined in + ParamsDict, + (2) any inconsistency violating the restriction is found. + ValueError: if the restriction defined in the string is not supported. + """ + + def _get_kv(dotted_string, params_dict): + """Get keys and values indicated by dotted_string.""" + if _CONST_VALUE_RE.match(dotted_string) is not None: + const_str = dotted_string + if const_str == 'None': + constant = None + else: + constant = float(const_str) + return None, constant + else: + tokenized_params = dotted_string.split('.') + v = params_dict + for t in tokenized_params: + v = v[t] + return tokenized_params[-1], v + + def _get_kvs(tokens, params_dict): + if len(tokens) != 2: + raise ValueError('Only support binary relation in restriction.') + stripped_tokens = [t.strip() for t in tokens] + left_k, left_v = _get_kv(stripped_tokens[0], params_dict) + right_k, right_v = _get_kv(stripped_tokens[1], params_dict) + return left_k, left_v, right_k, right_v + + params_dict = self.as_dict() + for restriction in self._restrictions: + if '==' in restriction: + tokens = restriction.split('==') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v != right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '!=' in restriction: + tokens = restriction.split('!=') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v == right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '<' in restriction: + tokens = restriction.split('<') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v >= right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '<=' in restriction: + tokens = restriction.split('<=') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v > right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '>' in restriction: + tokens = restriction.split('>') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v <= right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '>=' in restriction: + tokens = restriction.split('>=') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v < right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + else: + raise ValueError('Unsupported relation in restriction.') + + +def read_yaml_to_params_dict(file_path: str): + """Reads a YAML file to a ParamsDict.""" + with tf.io.gfile.GFile(file_path, 'r') as f: + params_dict = yaml.load(f, Loader=LOADER) + return ParamsDict(params_dict) + + +def save_params_dict_to_yaml(params, file_path): + """Saves the input ParamsDict to a YAML file.""" + with tf.io.gfile.GFile(file_path, 'w') as f: + + def _my_list_rep(dumper, data): + # u'tag:yaml.org,2002:seq' is the YAML internal tag for sequence. 
+ return dumper.represent_sequence( + u'tag:yaml.org,2002:seq', data, flow_style=True) + + yaml.add_representer(list, _my_list_rep) + yaml.dump(params.as_dict(), f, default_flow_style=False) + + +def nested_csv_str_to_json_str(csv_str): + """Converts a nested (using '.') comma-separated k=v string to a JSON string. + + Converts a comma-separated string of key/value pairs that supports + nesting of keys to a JSON string. Nesting is implemented using + '.' between levels for a given key. + + Spacing between commas and = is supported (e.g. there is no difference between + "a=1,b=2", "a = 1, b = 2", or "a=1, b=2") but there should be no spaces before + keys or after values (e.g. " a=1,b=2" and "a=1,b=2 " are not supported). + + Note that this will only support values supported by CSV, meaning + values such as nested lists (e.g. "a=[[1,2,3],[4,5,6]]") are not + supported. Strings are supported as well, e.g. "a='hello'". + + An example conversion would be: + + "a=1, b=2, c.a=2, c.b=3, d.a.a=5" + + to + + "{ a: 1, b : 2, c: {a : 2, b : 3}, d: {a: {a : 5}}}" + + Args: + csv_str: the comma separated string. + + Returns: + the converted JSON string. + + Raises: + ValueError: If csv_str is not in a comma separated string or + if the string is formatted incorrectly. + """ + if not csv_str: + return '' + + formatted_entries = [] + nested_map = collections.defaultdict(list) + pos = 0 + while pos < len(csv_str): + m = _PARAM_RE.match(csv_str, pos) + if not m: + raise ValueError('Malformed hyperparameter value while parsing ' + 'CSV string: %s' % csv_str[pos:]) + pos = m.end() + # Parse the values. + m_dict = m.groupdict() + name = m_dict['name'] + v = m_dict['val'] + + # If a GCS path (e.g. gs://...) is provided, wrap this in quotes + # as yaml.load would otherwise throw an exception + if re.match(r'(?=[^\"\'])(?=[gs://])', v): + v = '\'{}\''.format(v) + + name_nested = name.split('.') + if len(name_nested) > 1: + grouping = name_nested[0] + value = '.'.join(name_nested[1:]) + '=' + v + nested_map[grouping].append(value) + else: + formatted_entries.append('%s : %s' % (name, v)) + + for grouping, value in nested_map.items(): + value = ','.join(value) + value = nested_csv_str_to_json_str(value) + formatted_entries.append('%s : %s' % (grouping, value)) + return '{' + ', '.join(formatted_entries) + '}' + + +def override_params_dict(params, dict_or_string_or_yaml_file, is_strict): + """Override a given ParamsDict using a dict, JSON/YAML/CSV string or YAML file. + + The logic of the function is outlined below: + 1. Test that the input is a dict. If not, proceed to 2. + 2. Tests that the input is a string. If not, raise unknown ValueError + 2.1. Test if the string is in a CSV format. If so, parse. + If not, proceed to 2.2. + 2.2. Try loading the string as a YAML/JSON. If successful, parse to + dict and use it to override. If not, proceed to 2.3. + 2.3. Try using the string as a file path and load the YAML file. + + Args: + params: a ParamsDict object to be overridden. + dict_or_string_or_yaml_file: a Python dict, JSON/YAML/CSV string or path to + a YAML file specifying the parameters to be overridden. + is_strict: a boolean specifying whether override is strict or not. + + Returns: + params: the overridden ParamsDict object. + + Raises: + ValueError: if failed to override the parameters. 
+ """ + if not dict_or_string_or_yaml_file: + return params + if isinstance(dict_or_string_or_yaml_file, dict): + params.override(dict_or_string_or_yaml_file, is_strict) + elif isinstance(dict_or_string_or_yaml_file, six.string_types): + try: + dict_or_string_or_yaml_file = ( + nested_csv_str_to_json_str(dict_or_string_or_yaml_file)) + except ValueError: + pass + params_dict = yaml.load(dict_or_string_or_yaml_file, Loader=LOADER) + if isinstance(params_dict, dict): + params.override(params_dict, is_strict) + else: + with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f: + params.override(yaml.load(f, Loader=yaml.FullLoader), is_strict) + else: + raise ValueError('Unknown input type to parse.') + return params diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/params_dict_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/params_dict_test.py new file mode 100644 index 000000000..248a81652 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/hyperparams/params_dict_test.py @@ -0,0 +1,429 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for params_dict.py.""" + +import os + +import tensorflow as tf +import yaml + +from official.modeling.hyperparams import params_dict + + +class ParamsDictTest(tf.test.TestCase): + + def test_init_from_an_empty_dict(self): + params = params_dict.ParamsDict() + with self.assertRaises(AttributeError): + _ = params.a + + with self.assertRaises(KeyError): + params.a = 'aa' + + def test_init_from_a_dict(self): + params = params_dict.ParamsDict({'a': 'aa', 'b': 2}) + self.assertEqual(params.a, 'aa') + self.assertEqual(params.b, 2) + + def test_init_from_a_param_dict(self): + params_init = params_dict.ParamsDict({'a': 'aa', 'b': 2}) + params = params_dict.ParamsDict(params_init) + self.assertEqual(params.a, 'aa') + self.assertEqual(params.b, 2) + + def test_lock(self): + params = params_dict.ParamsDict({'a': 1, 'b': 2, 'c': 3}) + params.lock() + with self.assertRaises(ValueError): + params.a = 10 + with self.assertRaises(ValueError): + params.override({'b': 20}) + with self.assertRaises(ValueError): + del params.c + + def test_setattr(self): + params = params_dict.ParamsDict() + params.override({'a': 'aa', 'b': 2, 'c': None}, is_strict=False) + params.c = 'ccc' + self.assertEqual(params.a, 'aa') + self.assertEqual(params.b, 2) + self.assertEqual(params.c, 'ccc') + + def test_getattr(self): + params = params_dict.ParamsDict() + params.override({'a': 'aa', 'b': 2, 'c': None}, is_strict=False) + self.assertEqual(params.a, 'aa') + self.assertEqual(params.b, 2) + self.assertEqual(params.c, None) + + def test_delattr(self): + params = params_dict.ParamsDict() + params.override({ + 'a': 'aa', + 'b': 2, + 'c': None, + 'd': { + 'd1': 1, + 'd2': 10 + } + }, + is_strict=False) + del params.c + self.assertEqual(params.a, 'aa') + self.assertEqual(params.b, 2) + with self.assertRaises(AttributeError): + _ = params.c + del params.d + with 
self.assertRaises(AttributeError): + _ = params.d.d1 + + def test_contains(self): + params = params_dict.ParamsDict() + params.override({'a': 'aa'}, is_strict=False) + self.assertIn('a', params) + self.assertNotIn('b', params) + + def test_get(self): + params = params_dict.ParamsDict() + params.override({'a': 'aa'}, is_strict=False) + self.assertEqual(params.get('a'), 'aa') + self.assertEqual(params.get('b', 2), 2) + self.assertEqual(params.get('b'), None) + + def test_override_is_strict_true(self): + params = params_dict.ParamsDict({ + 'a': 'aa', + 'b': 2, + 'c': { + 'c1': 'cc', + 'c2': 20 + } + }) + params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True) + self.assertEqual(params.a, 2) + self.assertEqual(params.c.c1, 'ccc') + with self.assertRaises(KeyError): + params.override({'d': 'ddd'}, is_strict=True) + with self.assertRaises(KeyError): + params.override({'c': {'c3': 30}}, is_strict=True) + + def test_override_is_strict_false(self): + params = params_dict.ParamsDict({ + 'a': 'aa', + 'b': 2, + 'c': { + 'c1': 10, + 'c2': 20 + } + }) + params.override({'a': 2, 'c': {'c3': 3000}}, is_strict=False) + self.assertEqual(params.a, 2) + self.assertEqual(params.c.c3, 3000) + params.override({'d': 'ddd'}, is_strict=False) + self.assertEqual(params.d, 'ddd') + params.override({'c': {'c4': 4444}}, is_strict=False) + self.assertEqual(params.c.c4, 4444) + + def test_as_dict(self): + params = params_dict.ParamsDict({ + 'a': 'aa', + 'b': 2, + 'c': { + 'c1': 10, + 'c2': 20 + } + }) + params_d = params.as_dict() + self.assertEqual(params_d['a'], 'aa') + self.assertEqual(params_d['b'], 2) + self.assertEqual(params_d['c']['c1'], 10) + self.assertEqual(params_d['c']['c2'], 20) + + def test_validate(self): + # Raise error due to the unknown parameter. + with self.assertRaises(KeyError): + params = params_dict.ParamsDict({'a': 1, 'b': {'a': 11}}, ['a == c']) + params.validate() + + # OK to check equality of two nested dicts. + params = params_dict.ParamsDict({ + 'a': 1, + 'b': { + 'a': 10 + }, + 'c': { + 'a': 10 + } + }, ['b == c']) + + # Raise error due to inconsistency + with self.assertRaises(KeyError): + params = params_dict.ParamsDict({'a': 1, 'c': {'a': 10}}, ['a == c.a']) + params.validate() + + # Valid rule. + params = params_dict.ParamsDict({'a': 1, 'c': {'a': 1}}, ['a == c.a']) + + # Overridding violates the existing rule, raise error upon validate. + params.override({'a': 11}) + with self.assertRaises(KeyError): + params.validate() + + # Valid restrictions with constant. 
+ params = params_dict.ParamsDict({ + 'a': None, + 'c': { + 'a': 1 + } + }, ['a == None', 'c.a == 1']) + params.validate() + with self.assertRaises(KeyError): + params = params_dict.ParamsDict({ + 'a': 4, + 'c': { + 'a': 1 + } + }, ['a == None', 'c.a == 1']) + params.validate() + + +class ParamsDictIOTest(tf.test.TestCase): + + def write_temp_file(self, filename, text): + temp_file = os.path.join(self.get_temp_dir(), filename) + with tf.io.gfile.GFile(temp_file, 'w') as writer: + writer.write(text) + return temp_file + + def test_save_params_dict_to_yaml(self): + params = params_dict.ParamsDict({ + 'a': 'aa', + 'b': 2, + 'c': { + 'c1': 10, + 'c2': 20 + } + }) + output_yaml_file = os.path.join(self.get_temp_dir(), 'params.yaml') + params_dict.save_params_dict_to_yaml(params, output_yaml_file) + + with tf.io.gfile.GFile(output_yaml_file, 'r') as f: + params_d = yaml.load(f) + self.assertEqual(params.a, params_d['a']) + self.assertEqual(params.b, params_d['b']) + self.assertEqual(params.c.c1, params_d['c']['c1']) + self.assertEqual(params.c.c2, params_d['c']['c2']) + + def test_read_yaml_to_params_dict(self): + input_yaml_file = self.write_temp_file( + 'params.yaml', r""" + a: 'aa' + b: 2 + c: + c1: 10 + c2: 20 + """) + params = params_dict.read_yaml_to_params_dict(input_yaml_file) + + self.assertEqual(params.a, 'aa') + self.assertEqual(params.b, 2) + self.assertEqual(params.c.c1, 10) + self.assertEqual(params.c.c2, 20) + + def test_override_params_dict_using_dict(self): + params = params_dict.ParamsDict({ + 'a': 1, + 'b': 2.5, + 'c': [3, 4], + 'd': 'hello', + 'e': False + }) + override_dict = {'b': 5.2, 'c': [30, 40]} + params = params_dict.override_params_dict( + params, override_dict, is_strict=True) + self.assertEqual(1, params.a) + self.assertEqual(5.2, params.b) + self.assertEqual([30, 40], params.c) + self.assertEqual('hello', params.d) + self.assertEqual(False, params.e) + + def test_override_params_dict_using_yaml_string(self): + params = params_dict.ParamsDict({ + 'a': 1, + 'b': 2.5, + 'c': [3, 4], + 'd': 'hello', + 'e': False + }) + override_yaml_string = "'b': 5.2\n'c': [30, 40]" + params = params_dict.override_params_dict( + params, override_yaml_string, is_strict=True) + self.assertEqual(1, params.a) + self.assertEqual(5.2, params.b) + self.assertEqual([30, 40], params.c) + self.assertEqual('hello', params.d) + self.assertEqual(False, params.e) + + def test_override_params_dict_using_json_string(self): + params = params_dict.ParamsDict({ + 'a': 1, + 'b': { + 'b1': 2, + 'b2': [2, 3], + }, + 'd': { + 'd1': { + 'd2': 'hello' + } + }, + 'e': False + }) + override_json_string = "{ b: { b2: [3, 4] }, d: { d1: { d2: 'hi' } } }" + params = params_dict.override_params_dict( + params, override_json_string, is_strict=True) + self.assertEqual(1, params.a) + self.assertEqual(2, params.b.b1) + self.assertEqual([3, 4], params.b.b2) + self.assertEqual('hi', params.d.d1.d2) + self.assertEqual(False, params.e) + + def test_override_params_dict_using_csv_string(self): + params = params_dict.ParamsDict({ + 'a': 1, + 'b': { + 'b1': 2, + 'b2': [2, 3], + }, + 'd': { + 'd1': { + 'd2': 'hello' + } + }, + 'e': False + }) + override_csv_string = "b.b2=[3,4], d.d1.d2='hi, world', e=gs://test" + params = params_dict.override_params_dict( + params, override_csv_string, is_strict=True) + self.assertEqual(1, params.a) + self.assertEqual(2, params.b.b1) + self.assertEqual([3, 4], params.b.b2) + self.assertEqual('hi, world', params.d.d1.d2) + self.assertEqual('gs://test', params.e) + # Test different float 
formats + override_csv_string = 'b.b2=-1.e-3, d.d1.d2=+0.001, e=1e+3, a=-1.5E-3' + params = params_dict.override_params_dict( + params, override_csv_string, is_strict=True) + self.assertEqual(-1e-3, params.b.b2) + self.assertEqual(0.001, params.d.d1.d2) + self.assertEqual(1e3, params.e) + self.assertEqual(-1.5e-3, params.a) + + def test_override_params_dict_using_yaml_file(self): + params = params_dict.ParamsDict({ + 'a': 1, + 'b': 2.5, + 'c': [3, 4], + 'd': 'hello', + 'e': False + }) + override_yaml_file = self.write_temp_file( + 'params.yaml', r""" + b: 5.2 + c: [30, 40] + """) + params = params_dict.override_params_dict( + params, override_yaml_file, is_strict=True) + self.assertEqual(1, params.a) + self.assertEqual(5.2, params.b) + self.assertEqual([30, 40], params.c) + self.assertEqual('hello', params.d) + self.assertEqual(False, params.e) + + +class IOTest(tf.test.TestCase): + + def test_basic_csv_str_to_json_str(self): + csv_str = 'a=1,b=2,c=3' + json_str = '{a : 1, b : 2, c : 3}' + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + self.assertEqual(converted_csv_str, json_str) + + def test_basic_csv_str_load(self): + csv_str = 'a=1,b=2,c=3' + expected_output = {'a': 1, 'b': 2, 'c': 3} + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + converted_dict = yaml.load(converted_csv_str) + self.assertDictEqual(converted_dict, expected_output) + + def test_basic_nested_csv_str_to_json_str(self): + csv_str = 'a=1,b.b1=2' + json_str = '{a : 1, b : {b1 : 2}}' + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + self.assertEqual(converted_csv_str, json_str) + + def test_basic_nested_csv_str_load(self): + csv_str = 'a=1,b.b1=2,c.c1=3' + expected_output = {'a': 1, 'b': {'b1': 2}, 'c': {'c1': 3}} + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + converted_dict = yaml.load(converted_csv_str) + self.assertDictEqual(converted_dict, expected_output) + + def test_complex_nested_csv_str_to_json_str(self): + csv_str = 'a.aa.aaa.aaaaa.a=1' + json_str = '{a : {aa : {aaa : {aaaaa : {a : 1}}}}}' + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + self.assertEqual(converted_csv_str, json_str) + + def test_complex_nested_csv_str_load(self): + csv_str = 'a.aa.aaa.aaaaa.a=1,a.a=2' + expected_output = {'a': {'aa': {'aaa': {'aaaaa': {'a': 1}}}, 'a': 2}} + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + converted_dict = yaml.load(converted_csv_str) + self.assertDictEqual(converted_dict, expected_output) + + def test_csv_str_load_supported_datatypes(self): + csv_str = 'a=1,b=2.,c=[1,2,3],d=\'hello, there\',e=\"Hi.\"' + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + converted_dict = yaml.load(converted_csv_str) + self.assertEqual(converted_dict['a'], 1) + self.assertEqual(converted_dict['b'], 2.) 
+ self.assertEqual(converted_dict['c'], [1, 2, 3]) + self.assertEqual(converted_dict['d'], 'hello, there') + self.assertEqual(converted_dict['e'], 'Hi.') + + def test_csv_str_load_unsupported_datatypes(self): + csv_str = 'a=[[1,2,3],[4,5,6]]' + self.assertRaises(ValueError, params_dict.nested_csv_str_to_json_str, + csv_str) + + def test_csv_str_to_json_str_spacing(self): + csv_str1 = 'a=1,b=2,c=3' + csv_str2 = 'a = 1, b = 2, c = 3' + json_str = '{a : 1, b : 2, c : 3}' + converted_csv_str1 = params_dict.nested_csv_str_to_json_str(csv_str1) + converted_csv_str2 = params_dict.nested_csv_str_to_json_str(csv_str2) + self.assertEqual(converted_csv_str1, converted_csv_str2) + self.assertEqual(converted_csv_str1, json_str) + self.assertEqual(converted_csv_str2, json_str) + + def test_gcs_added_quotes(self): + csv_str = 'a=gs://abc, b=gs://def' + expected_output = '{a : \'gs://abc\', b : \'gs://def\'}' + converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str) + self.assertEqual(converted_csv_str, expected_output) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/__init__.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_model.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_model.py new file mode 100644 index 000000000..976b0d8e3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_model.py @@ -0,0 +1,60 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Abstraction of multi-task model.""" +from typing import Text, Dict + +import tensorflow as tf + + +class MultiTaskBaseModel(tf.Module): + """Base class that holds multi-task model computation.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._sub_tasks = self._instantiate_sub_tasks() + + def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]: + """Abstract function that sets up the computation for each sub-task. + + Returns: + A map from task name (as string) to a tf.keras.Model object that + represents the sub-task in the multi-task pool. + """ + raise NotImplementedError( + "_instantiate_sub_task_models() is not implemented.") + + @property + def sub_tasks(self): + """Fetch a map of task name (string) to task model (tf.keras.Model).""" + return self._sub_tasks + + def initialize(self): + """Optional function that loads a pre-train checkpoint.""" + return diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer.py new file mode 100644 index 000000000..7f975be84 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer.py @@ -0,0 +1,176 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Multitask base trainer implementation. + +The trainer derives from the Orbit `StandardTrainer` class. 
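+It builds one distributed dataset per task and, on each train step, runs the
+joint backward pass defined by `MultiTask.joint_train_step`, summing the
+per-task losses weighted by the configured task weights.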
+""" +from typing import Union +import gin +import orbit +import tensorflow as tf + +from modeling.multitask import base_model +from modeling.multitask import multitask + + +@gin.configurable +class MultiTaskBaseTrainer(orbit.StandardTrainer): + """Multitask base trainer.""" + + def __init__(self, + multi_task: multitask.MultiTask, + multi_task_model: Union[tf.keras.Model, + base_model.MultiTaskBaseModel], + optimizer: tf.optimizers.Optimizer, + trainer_options=None): + self._strategy = tf.distribute.get_strategy() + self._multi_task = multi_task + self._multi_task_model = multi_task_model + self._optimizer = optimizer + + self._training_losses = None + self._training_metrics = None + self._global_step = orbit.utils.create_global_step() + + if hasattr(self.multi_task_model, "checkpoint_items"): + checkpoint_items = self.multi_task_model.checkpoint_items + else: + checkpoint_items = {} + + self._checkpoint = tf.train.Checkpoint( + model=self.multi_task_model, + optimizer=self.optimizer, + global_step=self.global_step, + **checkpoint_items) + + train_datasets = {} + for name, task in self.multi_task.tasks.items(): + train_datasets[name] = orbit.utils.make_distributed_dataset( + self.strategy, task.build_inputs, task.task_config.train_data) + + super().__init__( + train_dataset=train_datasets, + options=trainer_options or orbit.StandardTrainerOptions()) + + def train_loop_begin(self): + """Clean up states that hold losses and metrics.""" + for _, train_loss_metric in self.training_losses.items(): + train_loss_metric.reset_states() + + for _, metrics in self.training_metrics.items(): + for metric in metrics: + metric.reset_states() + + def train_loop_end(self): + """Record loss and metric values per task.""" + result = {} + for task_name, loss in self.training_losses.items(): + result[task_name] = {loss.name: loss.result()} + for task_name, task_metrics in self.training_metrics.items(): + result[task_name].update( + {metric.name: metric.result() for metric in task_metrics}) + # Note that, the learning rate schedule is managed by the keras optimizer + # internally, which respects the number of backward pass as `iterations`. + # The learning rate schedule does not follow the trainer logical global + # step of multiple tasks. + if callable(self.optimizer.learning_rate): + result["learning_rate"] = self.optimizer.learning_rate( + self.optimizer.iterations) + else: + result["learning_rate"] = self.optimizer.learning_rate + return result + + @property + def checkpoint(self): + """Accesses the training checkpoint.""" + return self._checkpoint + + @property + def training_losses(self): + """Access training loss metric objects for all tasks.""" + if self._training_losses is None: + # Builds the per-task metrics and losses. + # This the total summed training loss of tasks in the joint training. + self._training_losses = dict( + total_loss=tf.keras.metrics.Mean("training_loss", dtype=tf.float32)) + for name in self.multi_task.tasks: + self._training_losses[name] = tf.keras.metrics.Mean( + "training_loss", dtype=tf.float32) + return self._training_losses + + @property + def training_metrics(self): + """Access training metric metric objects for all tasks.""" + if self._training_metrics is None: + # Builds the per-task metrics and losses. 
+ self._training_metrics = {} + for name, task in self.multi_task.tasks.items(): + self._training_metrics[name] = task.build_metrics(training=True) + return self._training_metrics + + @property + def strategy(self): + return self._strategy + + @property + def multi_task(self): + return self._multi_task + + @property + def multi_task_model(self): + return self._multi_task_model + + @property + def optimizer(self): + return self._optimizer + + @property + def global_step(self): + return self._global_step + + def train_step(self, iterator_map): + """The default train step calling the multi-task train step. + + Args: + iterator_map: a dictionary of task names and per-task dataset iterators. + """ + + def step_fn(inputs): + losses = self.multi_task.joint_train_step( + inputs, + multi_task_model=self.multi_task_model, + optimizer=self.optimizer, + task_metrics=self.training_metrics) + for key, loss in losses.items(): + self.training_losses[key].update_state(loss) + + self.strategy.run( + step_fn, args=(tf.nest.map_structure(next, iterator_map),)) + self.global_step.assign_add(1) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer_test.py new file mode 100644 index 000000000..2427ff85f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/base_trainer_test.py @@ -0,0 +1,90 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for multitask.base_trainer.""" +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.modeling.multitask import base_trainer +from official.modeling.multitask import configs +from official.modeling.multitask import multitask +from official.modeling.multitask import test_utils + + +def all_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + mode="eager", + ) + + +class BaseTrainerTest(tf.test.TestCase, parameterized.TestCase): + + @combinations.generate(all_strategy_combinations()) + def test_multitask_joint_trainer(self, distribution): + with distribution.scope(): + tasks = [ + test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"), + test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar") + ] + task_weights = {"foo": 1.0, "bar": 1.0} + test_multitask = multitask.MultiTask( + tasks=tasks, task_weights=task_weights) + test_optimizer = tf.keras.optimizers.SGD(0.1) + model = test_utils.MockMultiTaskModel() + test_trainer = base_trainer.MultiTaskBaseTrainer( + multi_task=test_multitask, + multi_task_model=model, + optimizer=test_optimizer) + results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertContainsSubset(["training_loss", "bar_acc"], + results["bar"].keys()) + self.assertContainsSubset(["training_loss", "foo_acc"], + results["foo"].keys()) + + def test_trainer_with_configs(self): + config = configs.MultiTaskConfig( + task_routines=(configs.TaskRoutine( + task_name="foo", + task_config=test_utils.FooConfig(), + task_weight=0.5), + configs.TaskRoutine( + task_name="bar", + task_config=test_utils.BarConfig(), + task_weight=0.5))) + test_multitask = multitask.MultiTask.from_config(config) + test_optimizer = tf.keras.optimizers.SGD(0.1) + model = test_utils.MockMultiTaskModel() + test_trainer = base_trainer.MultiTaskBaseTrainer( + multi_task=test_multitask, + multi_task_model=model, + optimizer=test_optimizer) + results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertContainsSubset(["training_loss", "bar_acc"], + results["bar"].keys()) + self.assertContainsSubset(["training_loss", "foo_acc"], + results["foo"].keys()) + self.assertEqual(test_multitask.task_weight("foo"), 0.5) + self.assertEqual(test_trainer.global_step.numpy(), 5) + self.assertIn("learning_rate", results) + + +if __name__ == "__main__": + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/configs.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/configs.py new file mode 100644 index 000000000..70e98682a --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/configs.py @@ -0,0 +1,79 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration definitions for multi-task training.""" +from typing import Optional, Tuple + +import dataclasses + +from core import config_definitions as cfg +from modeling import hyperparams + + +@dataclasses.dataclass +class TaskRoutine(hyperparams.Config): + task_name: str = "" + task_config: cfg.TaskConfig = None + eval_steps: Optional[int] = None + task_weight: Optional[float] = 1.0 + + +@dataclasses.dataclass +class MultiTaskConfig(hyperparams.Config): + init_checkpoint: str = "" + model: hyperparams.Config = None + task_routines: Tuple[TaskRoutine, ...] = () + + +@dataclasses.dataclass +class ProportionalSampleConfig(hyperparams.Config): + alpha: float = 1.0 + + +@dataclasses.dataclass +class AnnealingSampleConfig(hyperparams.Config): + steps_per_epoch: int = 5 + total_steps: int = 20 + + +@dataclasses.dataclass +class TaskSamplingConfig(hyperparams.OneOfConfig): + type: str = "" + uniform: hyperparams.Config = hyperparams.Config() + proportional: ProportionalSampleConfig = ProportionalSampleConfig() + annealing: AnnealingSampleConfig = AnnealingSampleConfig() + + +@dataclasses.dataclass +class MultiTaskTrainerConfig(cfg.TrainerConfig): + trainer_type: str = "interleaving" + task_sampler: TaskSamplingConfig = TaskSamplingConfig(type="proportional") + + +@dataclasses.dataclass +class MultiTaskExperimentConfig(hyperparams.Config): + """An experiment config for multi-task training and multi-task evaluation.""" + task: MultiTaskConfig = MultiTaskConfig() + trainer: MultiTaskTrainerConfig = MultiTaskTrainerConfig() + runtime: cfg.RuntimeConfig = cfg.RuntimeConfig() + + +@dataclasses.dataclass +class MultiEvalExperimentConfig(cfg.ExperimentConfig): + """An experiment config for single-task training and multi-task evaluation. + + Attributes: + eval_tasks: individual evaluation tasks. + """ + eval_tasks: MultiTaskConfig = MultiTaskConfig() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator.py new file mode 100644 index 000000000..cb28f784b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator.py @@ -0,0 +1,172 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multitask Evaluator implementation. + +The evaluator implements the Orbit `AbstractEvaluator` interface. 
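+It builds one validation dataset and one evaluation loop per task;
+`evaluate()` returns a per-task dictionary of losses and metric results.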
+""" +from typing import Optional, Union +import gin +import orbit +import tensorflow as tf + +from core import train_utils +from modeling.multitask import base_model +from modeling.multitask import multitask + + +@gin.configurable +class MultiTaskEvaluator(orbit.AbstractEvaluator): + """Implements the common trainer shared for TensorFlow models.""" + + def __init__( + self, + task: multitask.MultiTask, + model: Union[tf.keras.Model, base_model.MultiTaskBaseModel], + global_step: Optional[tf.Variable] = None, + checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None): + """Initialize common trainer for TensorFlow models. + + Args: + task: A multitask.MultiTask instance. + model: tf.keras.Model instance. + global_step: the global step variable. + checkpoint_exporter: an object that has the `maybe_export_checkpoint` + interface. + """ + # Gets the current distribution strategy. If not inside any strategy scope, + # it gets a single-replica no-op strategy. + self._strategy = tf.distribute.get_strategy() + self._task = task + self._model = model + self._global_step = global_step or orbit.utils.create_global_step() + self._checkpoint_exporter = checkpoint_exporter + self._checkpoint = tf.train.Checkpoint( + global_step=self.global_step, + model=self.model) + + self._validation_losses = None + self._validation_metrics = None + + # Builds per-task datasets. + self.eval_datasets = {} + for name, task in self.task.tasks.items(): + self.eval_datasets[name] = orbit.utils.make_distributed_dataset( + self.strategy, task.build_inputs, task.task_config.validation_data) + + # Builds per-task validation loops. + def get_function(task_name, task): + + task_metrics = self.validation_metrics[task_name] + task_loss = self.validation_losses[task_name] + if isinstance(self.model, base_model.MultiTaskBaseModel): + model = self.model.sub_tasks[task_name] + else: + model = self.model + + def step_fn(inputs): + logs = task.validation_step(inputs, model=model, metrics=task_metrics) + task_loss.update_state(logs[task.loss]) + return logs + + @tf.function + def eval_step_fn(iterator): + distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),)) + return tf.nest.map_structure(self.strategy.experimental_local_results, + distributed_outputs) + + return orbit.utils.create_loop_fn(eval_step_fn) + + self.task_fns = { + name: get_function(name, task) + for name, task in self.task.tasks.items() + } + + @property + def strategy(self): + return self._strategy + + @property + def task(self): + return self._task + + @property + def model(self): + return self._model + + @property + def global_step(self): + return self._global_step + + @property + def validation_losses(self): + """Accesses the validation loss metric object.""" + if self._validation_losses is None: + # Builds the per-task metrics and losses. + self._validation_losses = {} + for name in self.task.tasks: + self._validation_losses[name] = tf.keras.metrics.Mean( + "validation_loss", dtype=tf.float32) + return self._validation_losses + + @property + def validation_metrics(self): + """Accesses all validation metric metric objects.""" + if self._validation_metrics is None: + # Builds the per-task metrics and losses. 
+ self._validation_metrics = {} + for name, task in self.task.tasks.items(): + self._validation_metrics[name] = task.build_metrics(training=False) + return self._validation_metrics + + @property + def checkpoint(self): + """Accesses the training checkpoint.""" + return self._checkpoint + + def evaluate(self, num_steps: tf.Tensor): + """Performs evaluation for each `EvalTask`.""" + for metric in self.validation_losses.values(): + metric.reset_states() + for metrics in self.validation_metrics.values(): + for metric in metrics: + metric.reset_states() + results = {} + eval_iters = tf.nest.map_structure(iter, self.eval_datasets) + + for name, task_eval_loop in self.task_fns.items(): + outputs = None + eval_iter = eval_iters[name] + task = self.task.tasks[name] + task_eval_steps = self.task.task_eval_steps(name) or num_steps + outputs = task_eval_loop( + eval_iter, + task_eval_steps, + state=outputs, + reduce_fn=task.aggregate_logs) + task_metrics = self.validation_metrics[name] + task_loss = self.validation_losses[name] + logs = {} + for metric in task_metrics + [task_loss]: + logs[metric.name] = metric.result() + if outputs: + metrics = task.reduce_aggregated_logs( + outputs, global_step=self.global_step) + logs.update(metrics) + results[name] = logs + + if self._checkpoint_exporter: + self._checkpoint_exporter.maybe_export_checkpoint( + self.checkpoint, results, self.global_step.numpy()) + return results diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator_test.py new file mode 100644 index 000000000..6bee73eac --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/evaluator_test.py @@ -0,0 +1,138 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for multitask.evaluator.""" +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.core import base_task +from official.core import config_definitions as cfg +from official.modeling.multitask import evaluator +from official.modeling.multitask import multitask + + +def all_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + mode="eager", + ) + + +class MockModel(tf.keras.Model): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dense = tf.keras.layers.Dense(1) + + def call(self, inputs): + print(inputs, type(inputs)) + if "y" in inputs: + self.add_loss(tf.zeros((1,), dtype=tf.float32)) + else: + self.add_loss(tf.ones((1,), dtype=tf.float32)) + return self.dense(inputs["x"]) + + +class MockTask(base_task.Task): + """Mock task object for testing.""" + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="acc")] + + def build_inputs(self, params): + + def generate_data(_): + x = tf.zeros(shape=(2,), dtype=tf.float32) + label = tf.zeros([1], dtype=tf.int32) + if self.name == "bar": + return dict(x=x, y=x), label + else: + return dict(x=x), label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + logs = super().validation_step(inputs, model, metrics) + logs["counter"] = tf.ones((1,), dtype=tf.float32) + return logs + + def aggregate_logs(self, state, step_outputs): + if state is None: + state = {} + for key, value in step_outputs.items(): + if key not in state: + state[key] = [] + state[key].append( + np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value])) + return state + + def reduce_aggregated_logs(self, + aggregated_logs, + global_step=None): + for k, v in aggregated_logs.items(): + aggregated_logs[k] = np.sum(np.stack(v, axis=0)) + return aggregated_logs + + +class EvaluatorTest(tf.test.TestCase, parameterized.TestCase): + + @combinations.generate(all_strategy_combinations()) + def test_multitask_evaluator(self, distribution): + with distribution.scope(): + tasks = [ + MockTask(params=cfg.TaskConfig(), name="bar"), + MockTask(params=cfg.TaskConfig(), name="foo") + ] + test_multitask = multitask.MultiTask(tasks=tasks) + model = MockModel() + test_evaluator = evaluator.MultiTaskEvaluator( + task=test_multitask, model=model) + results = test_evaluator.evaluate(tf.convert_to_tensor(1, dtype=tf.int32)) + self.assertContainsSubset(["validation_loss", "acc"], results["bar"].keys()) + self.assertContainsSubset(["validation_loss", "acc"], results["foo"].keys()) + self.assertEqual(results["bar"]["validation_loss"], 0.0) + self.assertEqual(results["foo"]["validation_loss"], 1.0) + + @combinations.generate(all_strategy_combinations()) + def test_multitask_evaluator_numpy_metrics(self, distribution): + with distribution.scope(): + tasks = [ + MockTask(params=cfg.TaskConfig(), name="bar"), + MockTask(params=cfg.TaskConfig(), name="foo") + ] + test_multitask = multitask.MultiTask(tasks=tasks) + model = MockModel() + 
test_evaluator = evaluator.MultiTaskEvaluator( + task=test_multitask, model=model) + results = test_evaluator.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertEqual(results["bar"]["counter"], + 5. * distribution.num_replicas_in_sync) + self.assertEqual(results["foo"]["counter"], + 5. * distribution.num_replicas_in_sync) + + +if __name__ == "__main__": + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer.py new file mode 100644 index 000000000..935351af5 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer.py @@ -0,0 +1,92 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multitask trainer that interleaves each task's train step.""" +from typing import Union +import gin +import orbit +import tensorflow as tf +from modeling.multitask import base_model +from modeling.multitask import base_trainer +from modeling.multitask import multitask +from modeling.multitask import task_sampler as sampler + + +@gin.configurable +class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer): + """MultiTask trainer that interleaves task update.""" + + def __init__(self, + multi_task: multitask.MultiTask, + multi_task_model: Union[tf.keras.Model, + base_model.MultiTaskBaseModel], + optimizer: tf.optimizers.Optimizer, + task_sampler: sampler.TaskSampler, + trainer_options=None): + super(MultiTaskInterleavingTrainer, self).__init__( + multi_task=multi_task, + multi_task_model=multi_task_model, + optimizer=optimizer, + trainer_options=trainer_options) + self._task_sampler = task_sampler + + # Build per task train step. + def _get_task_step(task_name, task): + + def step_fn(inputs): + if isinstance(self.multi_task_model, base_model.MultiTaskBaseModel): + task_model = self.multi_task_model.sub_tasks[task_name] + else: + task_model = self.multi_task_model + task_logs = task.train_step( + inputs, + model=task_model, + optimizer=self.optimizer, + metrics=self.training_metrics[task_name]) + self.training_losses[task_name].update_state(task_logs[task.loss]) + + return step_fn + + self._task_train_step_map = { + name: _get_task_step(name, task) + for name, task in self.multi_task.tasks.items() + } + + # TODO(haozhangthu): Add taskwise step counter to train_loop_end for logging + # on TensorBoard. + self._task_step_counters = { + name: orbit.utils.create_global_step() for name in self.multi_task.tasks + } + + def task_step_counter(self, name): + return self._task_step_counters[name] + + def train_step(self, iterator_map): + # Sample one task to train according to a multinomial distribution + rn = tf.random.stateless_uniform(shape=[], seed=(0, self.global_step)) + cumulative_sample_distribution = self._task_sampler.task_cumulative_distribution( + self.global_step) + # Prepend a [0.0] for indexing convenience. 
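+    # After prepending 0.0, task i owns the half-open interval
+    # [cumulative[i], cumulative[i + 1]); the uniform draw `rn` falls into
+    # one such interval, selecting a single task to update in this step.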
+ cumulative_sample_distribution = tf.concat( + [tf.constant([0.0], dtype=tf.float32), cumulative_sample_distribution], + axis=0) + + for idx, (name, _) in enumerate(self.multi_task.tasks.items()): + begin = cumulative_sample_distribution[idx] + end = cumulative_sample_distribution[idx + 1] + if rn >= begin and rn < end: + self._strategy.run( + self._task_train_step_map[name], args=(next(iterator_map[name]),)) + self.global_step.assign_add(1) + self.task_step_counter(name).assign_add(1) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer_test.py new file mode 100644 index 000000000..0ccc2670d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/interleaving_trainer_test.py @@ -0,0 +1,101 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for multitask.interleaving_trainer.""" +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.modeling.multitask import configs +from official.modeling.multitask import interleaving_trainer +from official.modeling.multitask import multitask +from official.modeling.multitask import task_sampler +from official.modeling.multitask import test_utils + + +def all_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + mode="eager", + ) + + +class InterleavingTrainerTest(tf.test.TestCase, parameterized.TestCase): + + @combinations.generate(all_strategy_combinations()) + def test_multitask_interleaving_trainer(self, distribution): + with distribution.scope(): + tasks = [ + test_utils.MockFooTask(params=test_utils.FooConfig(), name="foo"), + test_utils.MockBarTask(params=test_utils.BarConfig(), name="bar") + ] + test_multitask = multitask.MultiTask(tasks=tasks) + test_optimizer = tf.keras.optimizers.SGD(0.1) + model = test_utils.MockMultiTaskModel() + sampler = task_sampler.UniformTaskSampler( + task_weights=test_multitask.task_weights) + test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer( + multi_task=test_multitask, + multi_task_model=model, + optimizer=test_optimizer, + task_sampler=sampler) + results = test_trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertContainsSubset(["training_loss", "bar_acc"], + results["bar"].keys()) + self.assertContainsSubset(["training_loss", "foo_acc"], + results["foo"].keys()) + + @combinations.generate(all_strategy_combinations()) + def test_trainer_with_configs(self, distribution): + config = configs.MultiTaskConfig( + task_routines=(configs.TaskRoutine( + task_name="foo", + task_config=test_utils.FooConfig(), + task_weight=3.0), + configs.TaskRoutine( + task_name="bar", + 
task_config=test_utils.BarConfig(), + task_weight=1.0))) + with distribution.scope(): + test_multitask = multitask.MultiTask.from_config(config) + test_optimizer = tf.keras.optimizers.SGD(0.1) + model = test_utils.MockMultiTaskModel() + num_step = 1000 + sampler = task_sampler.AnnealingTaskSampler( + task_weights=test_multitask.task_weights, + steps_per_epoch=num_step/5, + total_steps=num_step) + test_trainer = interleaving_trainer.MultiTaskInterleavingTrainer( + multi_task=test_multitask, + multi_task_model=model, + optimizer=test_optimizer, + task_sampler=sampler) + results = test_trainer.train(tf.convert_to_tensor(num_step, dtype=tf.int32)) + self.assertContainsSubset(["training_loss", "bar_acc"], + results["bar"].keys()) + self.assertContainsSubset(["training_loss", "foo_acc"], + results["foo"].keys()) + self.assertEqual(test_trainer.global_step.numpy(), num_step) + bar_sampled_step = test_trainer.task_step_counter("bar").numpy() + foo_sampled_step = test_trainer.task_step_counter("foo").numpy() + self.assertEqual(bar_sampled_step + foo_sampled_step, num_step) + + +if __name__ == "__main__": + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/multitask.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/multitask.py new file mode 100644 index 000000000..d32b897e9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/multitask.py @@ -0,0 +1,148 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experimental MultiTask base class for multi-task training/evaluation.""" +import abc +from typing import Dict, List, Optional, Text, Union + +import tensorflow as tf +from core import base_task +from core import config_definitions +from core import task_factory +from modeling import optimization +from modeling.multitask import base_model +from modeling.multitask import configs + +OptimizationConfig = optimization.OptimizationConfig +RuntimeConfig = config_definitions.RuntimeConfig + + +class MultiTask(tf.Module, metaclass=abc.ABCMeta): + """A multi-task class to manage multiple tasks.""" + + def __init__(self, + tasks: Union[Dict[Text, base_task.Task], List[base_task.Task]], + task_weights: Optional[Dict[str, Union[float, int]]] = None, + task_eval_steps: Optional[Dict[str, int]] = None, + name: Optional[str] = None): + """MultiTask initialization. + + Args: + tasks: a list or a flat dict of Task. + task_weights: a dict of (task, task weight), task weight can be applied + directly during loss summation in a joint backward step, or it can be + used to sample task among interleaved backward step. + task_eval_steps: a dict of (task, eval steps). + name: the instance name of a MultiTask object. 
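+
+    Example (illustrative; `foo_task` and `bar_task` are assumed
+    `base_task.Task` instances named "foo" and "bar"):
+
+      multi_task = MultiTask(tasks=[foo_task, bar_task],
+                             task_weights={"foo": 1.0, "bar": 2.0})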
+ """ + super().__init__(name=name) + if isinstance(tasks, list): + self._tasks = {} + for task in tasks: + if task.name in self._tasks: + raise ValueError("Duplicated tasks found, task.name is %s" % + task.name) + self._tasks[task.name] = task + elif isinstance(tasks, dict): + self._tasks = tasks + else: + raise ValueError("The tasks argument has an invalid type: %s" % + type(tasks)) + self._task_eval_steps = task_eval_steps or {} + self._task_eval_steps = dict([ + (name, self._task_eval_steps.get(name, None)) for name in self.tasks + ]) + self._task_weights = task_weights or {} + self._task_weights = dict([ + (name, self._task_weights.get(name, 1.0)) for name in self.tasks + ]) + + @classmethod + def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None): + tasks = {} + task_eval_steps = {} + task_weights = {} + for task_routine in config.task_routines: + task_name = task_routine.task_name + tasks[task_name] = task_factory.get_task( + task_routine.task_config, logging_dir=logging_dir) + task_eval_steps[task_name] = task_routine.eval_steps + task_weights[task_name] = task_routine.task_weight + return cls( + tasks, task_eval_steps=task_eval_steps, task_weights=task_weights) + + @property + def tasks(self): + return self._tasks + + def task_eval_steps(self, task_name): + return self._task_eval_steps[task_name] + + def task_weight(self, task_name): + return self._task_weights[task_name] + + @property + def task_weights(self): + return self._task_weights + + @classmethod + def create_optimizer(cls, + optimizer_config: OptimizationConfig, + runtime_config: Optional[RuntimeConfig] = None): + return base_task.Task.create_optimizer( + optimizer_config=optimizer_config, runtime_config=runtime_config) + + def joint_train_step(self, task_inputs, + multi_task_model: base_model.MultiTaskBaseModel, + optimizer: tf.keras.optimizers.Optimizer, task_metrics): + """The joint train step. + + Args: + task_inputs: a dictionary of task names and per-task features. + multi_task_model: a MultiTaskBaseModel instance. + optimizer: a tf.optimizers.Optimizer. + task_metrics: a dictionary of task names and per-task metrics. + + Returns: + A dictionary of losses, inculding per-task losses and their weighted sum. + """ + losses = {} + with tf.GradientTape() as tape: + total_loss = 0.0 + for name, model in multi_task_model.sub_tasks.items(): + inputs = task_inputs[name] + if isinstance(inputs, tuple) and len(inputs) == 2: + features, labels = inputs + elif isinstance(inputs, dict): + features, labels = inputs, inputs + else: + raise ValueError("The iterator output is neither a tuple nor a " + "dictionary. It is not implemented to support " + "such outputs.") + outputs = model(features, training=True) + task_loss = self.tasks[name].build_losses(labels, outputs) + task_weight = self.task_weight(name) + total_loss += task_weight * task_loss + losses[name] = task_loss + self.tasks[name].process_metrics(task_metrics[name], labels, outputs) + + # Scales loss as the default gradients allreduce performs sum inside + # the optimizer. 
+ scaled_loss = total_loss / tf.distribute.get_strategy( + ).num_replicas_in_sync + tvars = multi_task_model.trainable_variables + grads = tape.gradient(scaled_loss, tvars) + optimizer.apply_gradients(list(zip(grads, tvars))) + losses["total_loss"] = total_loss + return losses diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler.py new file mode 100644 index 000000000..78ba84341 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler.py @@ -0,0 +1,121 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utils to sample tasks for interleaved optimization.""" +import abc +from typing import Union, Dict, Text +import tensorflow as tf + +from modeling.multitask import configs + + +class TaskSampler(tf.Module, metaclass=abc.ABCMeta): + """An abstract class defining task sampling API for interleaving trainer.""" + + def __init__(self, task_weights: Dict[Text, Union[float, int]]): + self._task_weights = task_weights + + @abc.abstractmethod + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + """Compute cumulative distribution to sample tasks. + + It calculates the cumulative distribution of the multinomial task + distribution with respect to which to be sampled against. + + Args: + global_step: A tensor indicating current progess of training. + + Returns: + A float tensor with shape (#(task), 1) that represents the cumulative + sampling distribution. 
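+
+    For example, the uniform sampler over three tasks returns
+    [1/3, 2/3, 1.0]; the interleaving trainer draws a uniform number in
+    [0, 1) and picks the task whose cumulative bin contains the draw.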
+ """ + pass + + +class UniformTaskSampler(TaskSampler): + """Sample all tasks uniformly.""" + + def __init__(self, task_weights: Dict[Text, Union[float, int]]): + super(UniformTaskSampler, self).__init__(task_weights=task_weights) + self._uniform_cumulative = tf.math.cumsum( + tf.constant( + [1.0 / len(self._task_weights)] * len(self._task_weights), + dtype=tf.float32)) + + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + del global_step + return self._uniform_cumulative + + +class ProportionalTaskSampler(TaskSampler): + """Sample tasks proportional to task weights.""" + + def __init__(self, + task_weights: Dict[Text, Union[float, int]], + alpha: float = 1.0): + super(ProportionalTaskSampler, self).__init__(task_weights=task_weights) + self._alpha = tf.cast(alpha, dtype=tf.float32) + task_weight_dict_ordered_list = tf.constant( + [weight for _, weight in self._task_weights.items()], dtype=tf.float32) + task_sizes = tf.math.pow(task_weight_dict_ordered_list, self._alpha) + task_distribution = task_sizes / tf.reduce_sum(task_sizes) + self._porportional_cumulative = tf.math.cumsum(task_distribution) + + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + del global_step + return self._porportional_cumulative + + +class AnnealingTaskSampler(TaskSampler): + """Sample tasks according to task weights as well as training progress.""" + + def __init__(self, + task_weights: Dict[Text, Union[float, int]], + steps_per_epoch: int, + total_steps: int): + super(AnnealingTaskSampler, self).__init__(task_weights=task_weights) + self._steps_per_epoch = tf.cast(steps_per_epoch, dtype=tf.float32) + self._total_epochs = tf.cast( + total_steps / self._steps_per_epoch, dtype=tf.float32) + + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + cur_epoch = tf.math.floor( + tf.cast(global_step, dtype=tf.float32) / self._steps_per_epoch) + alpha = 1.0 - 0.8 * (cur_epoch - 1) / (self._total_epochs - 1 + 1e-10) + task_weight_dict_ordered_list = [ + weight for _, weight in self._task_weights.items() + ] + task_sizes = tf.math.pow( + tf.constant(task_weight_dict_ordered_list, dtype=tf.float32), + tf.cast(alpha, dtype=tf.float32)) + dynamic_task_distribution = task_sizes / tf.reduce_sum(task_sizes) + return tf.math.cumsum(dynamic_task_distribution) + + +def get_task_sampler(config: configs.TaskSamplingConfig, + task_weights: Dict[Text, float]) -> TaskSampler: + """Utils to create task sampler with configuration and task weights.""" + oneof_config = config.get() + if config.type == 'uniform': + return UniformTaskSampler(task_weights=task_weights) + elif config.type == 'proportional': + return ProportionalTaskSampler( + task_weights=task_weights, alpha=oneof_config.alpha) + elif config.type == 'annealing': + return AnnealingTaskSampler( + task_weights=task_weights, + steps_per_epoch=oneof_config.steps_per_epoch, + total_steps=oneof_config.total_steps) + else: + raise RuntimeError('Task sampler type not supported') diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler_test.py new file mode 100644 index 000000000..db0fddbea --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/task_sampler_test.py @@ -0,0 +1,75 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for multitask.task_sampler.""" +import tensorflow as tf + +from modeling.multitask import configs +from modeling.multitask import task_sampler as sampler + + +class TaskSamplerTest(tf.test.TestCase): + + def setUp(self): + super(TaskSamplerTest, self).setUp() + self._task_weights = {'A': 1.0, 'B': 2.0, 'C': 3.0} + + def test_uniform_sample_distribution(self): + uniform_sampler = sampler.get_task_sampler( + configs.TaskSamplingConfig(type='uniform'), self._task_weights) + for step in range(5): + cumulative_distribution = uniform_sampler.task_cumulative_distribution( + tf.constant(step, dtype=tf.int64)) + self.assertAllClose([0.333333, 0.666666, 1.0], + cumulative_distribution.numpy()) + + def test_proportional_sample_distribution(self): + prop_sampler = sampler.get_task_sampler( + configs.TaskSamplingConfig( + type='proportional', + proportional=configs.ProportionalSampleConfig(alpha=2.0)), + self._task_weights) + # CucmulativeOf(Normalize([1.0^2, 2.0^2, 3.0^2])) + for step in range(5): + cumulative_distribution = prop_sampler.task_cumulative_distribution( + tf.constant(step, dtype=tf.int64)) + self.assertAllClose([0.07142857, 0.35714286, 1.0], + cumulative_distribution.numpy()) + + def test_annealing_sample_distribution(self): + num_epoch = 3 + step_per_epoch = 6 + annel_sampler = sampler.get_task_sampler( + configs.TaskSamplingConfig( + type='annealing', + annealing=configs.AnnealingSampleConfig( + steps_per_epoch=step_per_epoch, + total_steps=step_per_epoch * num_epoch)), self._task_weights) + + global_step = tf.Variable( + 0, dtype=tf.int64, name='global_step', trainable=False) + expected_cumulative_epochs = [[0.12056106, 0.4387236, 1.0], + [0.16666667, 0.5, 1.0], + [0.22477472, 0.5654695, 1.0]] + for epoch in range(num_epoch): + for _ in range(step_per_epoch): + cumulative_distribution = annel_sampler.task_cumulative_distribution( + tf.constant(global_step, dtype=tf.int64)) + global_step.assign_add(1) + self.assertAllClose(expected_cumulative_epochs[epoch], + cumulative_distribution.numpy()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/test_utils.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/test_utils.py new file mode 100644 index 000000000..9ef43ee01 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/test_utils.py @@ -0,0 +1,125 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Testing utils for mock models and tasks.""" +from typing import Dict, Text +import tensorflow as tf +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling.multitask import base_model + + +class MockFooModel(tf.keras.Model): + """A mock model can consume 'foo' and 'bar' inputs.""" + + def __init__(self, shared_layer, *args, **kwargs): + super().__init__(*args, **kwargs) + self._share_layer = shared_layer + self._foo_specific_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + self.add_loss(tf.zeros((1,), dtype=tf.float32)) + if "foo" in inputs: + input_tensor = inputs["foo"] + else: + input_tensor = inputs["bar"] + return self._foo_specific_layer(self._share_layer(input_tensor)) + + +class MockBarModel(tf.keras.Model): + + def __init__(self, shared_layer, *args, **kwargs): + super().__init__(*args, **kwargs) + self._share_layer = shared_layer + self._bar_specific_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + self.add_loss(tf.zeros((2,), dtype=tf.float32)) + return self._bar_specific_layer(self._share_layer(inputs["bar"])) + + +class MockMultiTaskModel(base_model.MultiTaskBaseModel): + + def __init__(self, *args, **kwargs): + self._shared_dense = tf.keras.layers.Dense(1) + super().__init__(*args, **kwargs) + + def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]: + return { + "foo": MockFooModel(self._shared_dense), + "bar": MockBarModel(self._shared_dense) + } + + +def mock_data(feature_name): + """Mock dataset function.""" + + def _generate_data(_): + x = tf.zeros(shape=(2,), dtype=tf.float32) + label = tf.zeros([1], dtype=tf.int32) + return {feature_name: x}, label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + _generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) + + +class FooConfig(cfg.TaskConfig): + pass + + +class BarConfig(cfg.TaskConfig): + pass + + +@task_factory.register_task_cls(FooConfig) +class MockFooTask(base_task.Task): + """Mock foo task object for testing.""" + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="foo_acc")] + + def build_inputs(self, params): + return mock_data("foo") + + def build_model(self) -> tf.keras.Model: + return MockFooModel(shared_layer=tf.keras.layers.Dense(1)) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + loss = tf.keras.losses.mean_squared_error(labels, model_outputs) + if aux_losses: + loss += tf.add_n(aux_losses) + return tf.reduce_mean(loss) + + +@task_factory.register_task_cls(BarConfig) +class MockBarTask(base_task.Task): + """Mock bar task object for testing.""" + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="bar_acc")] + + def build_inputs(self, params): + return mock_data("bar") + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + loss = tf.keras.losses.mean_squared_error(labels, model_outputs) + if aux_losses: + loss += tf.add_n(aux_losses) + return tf.reduce_mean(loss) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib.py new file mode 100644 index 000000000..bc8f508b5 --- /dev/null +++ 
b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib.py @@ -0,0 +1,251 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multitask training driver library.""" +# pytype: disable=attribute-error +import os +from typing import Optional +from absl import logging +import orbit +import tensorflow as tf +from core import base_task +from core import base_trainer as core_lib +from core import train_utils +from modeling.multitask import base_model +from modeling.multitask import base_trainer +from modeling.multitask import configs +from modeling.multitask import evaluator as evaluator_lib +from modeling.multitask import interleaving_trainer +from modeling.multitask import multitask +from modeling.multitask import task_sampler + +TRAINERS = { + 'interleaving': interleaving_trainer.MultiTaskInterleavingTrainer, + 'joint': base_trainer.MultiTaskBaseTrainer +} + + +def run_experiment(*, distribution_strategy: tf.distribute.Strategy, + task: multitask.MultiTask, + model: base_model.MultiTaskBaseModel, mode: str, + params: configs.MultiTaskExperimentConfig, + model_dir: str) -> base_model.MultiTaskBaseModel: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + task: A MultiTaskTask instance. + model: A MultiTaskBaseModel instance. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: ExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + + Returns: + model: `base_model.MultiTaskBaseModel` instance. + """ + + is_training = 'train' in mode + is_eval = 'eval' in mode + with distribution_strategy.scope(): + optimizer = task.create_optimizer(params.trainer.optimizer_config, + params.runtime) + kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer) + if params.trainer.trainer_type == 'interleaving': + sampler = task_sampler.get_task_sampler(params.trainer.task_sampler, + task.task_weights) + kwargs.update(dict(task_sampler=sampler)) + trainer = TRAINERS[params.trainer.trainer_type]( + **kwargs) if is_training else None + if is_eval: + evaluator = evaluator_lib.MultiTaskEvaluator( + task=task, + model=model, + global_step=trainer.global_step if is_training else None, + checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter( + params, model_dir)) + else: + evaluator = None + + if trainer: + checkpoint = trainer.checkpoint + global_step = trainer.global_step + else: + checkpoint = evaluator.checkpoint + global_step = evaluator.global_step + + # TODO(hongkuny,haozhangthu): Revisit initialization method. 
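# Note on the checkpoint setup below: tf.train.CheckpointManager only invokes
# the `init_fn` passed here (model.initialize) when restore_or_initialize()
# finds no existing checkpoint under `model_dir`, so a fresh run is initialized
# once and a resumed run restores the latest checkpoint instead.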
+ checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=model.initialize) + + controller = orbit.Controller( + strategy=distribution_strategy, + trainer=trainer, + evaluator=evaluator, + global_step=global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train'), + eval_summary_dir=os.path.join(model_dir, 'validation'), + summary_interval=params.trainer.summary_interval) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if evaluator.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + return model + + +def run_experiment_with_multitask_eval( + *, + distribution_strategy: tf.distribute.Strategy, + train_task: base_task.Task, + eval_tasks: multitask.MultiTask, + mode: str, + params: configs.MultiEvalExperimentConfig, + model_dir: str, + run_post_eval: bool = False, + save_summary: bool = True, + trainer: Optional[core_lib.Trainer] = None) -> tf.keras.Model: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + train_task: A base_task.Task instance. + eval_tasks: A multitask.MultiTask with evaluation tasks. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: MultiEvalExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + run_post_eval: Whether to run post eval once after training, metrics logs + are returned. + save_summary: Whether to save train and validation summary. + trainer: the core_lib.Trainer instance. It should be created within the + strategy.scope(). If not provided, an instance will be created by default + if `mode` contains 'train'. + + Returns: + model: `tf.keras.Model` instance. 
+ """ + + is_training = 'train' in mode + is_eval = 'eval' in mode + with distribution_strategy.scope(): + if is_training: + trainer = trainer or core_lib.Trainer( + config=params, + task=train_task, + model=train_task.build_model(), + optimizer=train_task.create_optimizer( + params.trainer.optimizer_config, params.runtime), + train=True, + evaluate=False) + else: + trainer = None + model = trainer.model if trainer else train_task.build_model() + + if is_eval: + evaluator = evaluator_lib.MultiTaskEvaluator( + task=eval_tasks, + model=model, + global_step=trainer.global_step if is_training else None, + checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter( + params, model_dir)) + else: + evaluator = None + + if trainer: + checkpoint = trainer.checkpoint + global_step = trainer.global_step + else: + checkpoint = evaluator.checkpoint + global_step = evaluator.global_step + + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=trainer.initialize if trainer else None) + + controller = orbit.Controller( + strategy=distribution_strategy, + trainer=trainer, + evaluator=evaluator, + global_step=global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train') if save_summary else None, + eval_summary_dir=os.path.join(model_dir, 'validation') if + (save_summary) else None, + summary_interval=params.trainer.summary_interval if + (save_summary) else None) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if evaluator.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + if run_post_eval: + return model, evaluator.evaluate( + tf.convert_to_tensor(params.trainer.validation_steps)) + else: + return model, {} diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib_test.py new file mode 100644 index 000000000..7d8a78d7e --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/multitask/train_lib_test.py @@ -0,0 +1,120 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for multitask.train_lib.""" +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.core import task_factory +from official.modeling.hyperparams import params_dict +from official.modeling.multitask import configs +from official.modeling.multitask import multitask +from official.modeling.multitask import test_utils +from official.modeling.multitask import train_lib + + +class TrainLibTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super().setUp() + self._test_config = { + 'trainer': { + 'checkpoint_interval': 10, + 'steps_per_loop': 10, + 'summary_interval': 10, + 'train_steps': 10, + 'validation_steps': 5, + 'validation_interval': 10, + 'continuous_eval_timeout': 1, + 'optimizer_config': { + 'optimizer': { + 'type': 'sgd', + }, + 'learning_rate': { + 'type': 'constant' + } + } + }, + } + + @combinations.generate( + combinations.combine( + distribution_strategy=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + mode='eager', + flag_mode=['train', 'eval', 'train_and_eval'])) + def test_end_to_end(self, distribution_strategy, flag_mode): + model_dir = self.get_temp_dir() + experiment_config = configs.MultiTaskExperimentConfig( + task=configs.MultiTaskConfig( + task_routines=( + configs.TaskRoutine( + task_name='foo', + task_config=test_utils.FooConfig()), + configs.TaskRoutine( + task_name='bar', task_config=test_utils.BarConfig())))) + experiment_config = params_dict.override_params_dict( + experiment_config, self._test_config, is_strict=False) + with distribution_strategy.scope(): + test_multitask = multitask.MultiTask.from_config(experiment_config.task) + model = test_utils.MockMultiTaskModel() + train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=test_multitask, + model=model, + mode=flag_mode, + params=experiment_config, + model_dir=model_dir) + + @combinations.generate( + combinations.combine( + distribution_strategy=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + mode='eager', + flag_mode=['train', 'eval', 'train_and_eval'])) + def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode): + model_dir = self.get_temp_dir() + experiment_config = configs.MultiEvalExperimentConfig( + task=test_utils.FooConfig(), + eval_tasks=configs.MultiTaskConfig( + task_routines=( + configs.TaskRoutine( + task_name='foo', + task_config=test_utils.FooConfig()), + configs.TaskRoutine( + task_name='bar', task_config=test_utils.BarConfig())))) + experiment_config = params_dict.override_params_dict( + experiment_config, self._test_config, is_strict=False) + with distribution_strategy.scope(): + train_task = task_factory.get_task(experiment_config.task) + eval_tasks = multitask.MultiTask.from_config(experiment_config.eval_tasks) + train_lib.run_experiment_with_multitask_eval( + distribution_strategy=distribution_strategy, + train_task=train_task, + eval_tasks=eval_tasks, + mode=flag_mode, + params=experiment_config, + model_dir=model_dir) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/__init__.py 
b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/__init__.py new file mode 100644 index 000000000..e6f22c6a8 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimization package definition.""" + +# pylint: disable=wildcard-import +from modeling.optimization.configs.learning_rate_config import * +from modeling.optimization.configs.optimization_config import * +from modeling.optimization.configs.optimizer_config import * +from modeling.optimization.ema_optimizer import ExponentialMovingAverage +from modeling.optimization.lr_schedule import * +from modeling.optimization.optimizer_factory import OptimizerFactory diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/__init__.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py new file mode 100644 index 000000000..72b3da508 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py @@ -0,0 +1,250 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
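The wildcard imports above define the package's public surface, so downstream code can reach the config dataclasses, the EMA wrapper, and the optimizer factory from the package root. A minimal sketch, assuming the patched `modeling` tree is on `PYTHONPATH`:

```python
# The optimization package re-exports its configs and optimizers at the root.
from modeling import optimization

opt_config = optimization.OptimizationConfig()    # oneof fields default to unset
ema_cls = optimization.ExponentialMovingAverage   # re-exported from ema_optimizer
factory_cls = optimization.OptimizerFactory       # re-exported from optimizer_factory
print(type(opt_config).__name__, ema_cls.__name__, factory_cls.__name__)
```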
+ +"""Dataclasses for learning rate schedule config.""" +from typing import List, Optional + +import dataclasses +from modeling.hyperparams import base_config + + +@dataclasses.dataclass +class ConstantLrConfig(base_config.Config): + """Configuration for constant learning rate. + + This class is a containers for the constant learning rate decay configs. + + Attributes: + name: The name of the learning rate schedule. Defaults to Constant. + learning_rate: A float. The learning rate. Defaults to 0.1. + """ + name: str = 'Constant' + learning_rate: float = 0.1 + + +@dataclasses.dataclass +class StepwiseLrConfig(base_config.Config): + """Configuration for stepwise learning rate decay. + + This class is a container for the piecewise constant learning rate scheduling + configs. It will configure an instance of PiecewiseConstantDecay keras + learning rate schedule. + + An example (from keras docs): use a learning rate that's 1.0 for the first + 100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps. + ```python + boundaries: [100000, 110000] + values: [1.0, 0.5, 0.1] + + Attributes: + name: The name of the learning rate schedule. Defaults to PiecewiseConstant. + boundaries: A list of ints of strictly increasing entries. Defaults to None. + values: A list of floats that specifies the values for the intervals defined + by `boundaries`. It should have one more element than `boundaries`. + The learning rate is computed as follows: [0, boundaries[0]] -> + values[0] [boundaries[0], boundaries[1]] -> values[1] + [boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n], + end] -> values[n+1] Defaults to None. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'PiecewiseConstantDecay' + boundaries: Optional[List[int]] = None + values: Optional[List[float]] = None + offset: int = 0 + + +@dataclasses.dataclass +class ExponentialLrConfig(base_config.Config): + """Configuration for exponential learning rate decay. + + This class is a containers for the exponential learning rate decay configs. + + Attributes: + name: The name of the learning rate schedule. Defaults to ExponentialDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + decay_steps: A positive integer that is used for decay computation. Defaults + to None. + decay_rate: A float. Defaults to None. + staircase: A boolean, if true, learning rate is decreased at discreate + intervals. Defaults to False. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'ExponentialDecay' + initial_learning_rate: Optional[float] = None + decay_steps: Optional[int] = None + decay_rate: Optional[float] = None + staircase: Optional[bool] = None + offset: int = 0 + + +@dataclasses.dataclass +class PolynomialLrConfig(base_config.Config): + """Configuration for polynomial learning rate decay. + + This class is a containers for the polynomial learning rate decay configs. + + Attributes: + name: The name of the learning rate schedule. Defaults to PolynomialDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + decay_steps: A positive integer that is used for decay computation. Defaults + to None. + end_learning_rate: A float. The minimal end learning rate. + power: A float. The power of the polynomial. Defaults to linear, 1.0. + cycle: A boolean, whether or not it should cycle beyond decay_steps. + Defaults to False. + offset: An int. The offset applied to steps. Defaults to 0. 
+ """ + name: str = 'PolynomialDecay' + initial_learning_rate: Optional[float] = None + decay_steps: Optional[int] = None + end_learning_rate: float = 0.0001 + power: float = 1.0 + cycle: bool = False + offset: int = 0 + + +@dataclasses.dataclass +class CosineLrConfig(base_config.Config): + """Configuration for Cosine learning rate decay. + + This class is a containers for the cosine learning rate decay configs, + tf.keras.experimental.CosineDecay. + + Attributes: + name: The name of the learning rate schedule. Defaults to CosineDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + decay_steps: A positive integer that is used for decay computation. Defaults + to None. + alpha: A float. Minimum learning rate value as a fraction of + initial_learning_rate. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'CosineDecay' + initial_learning_rate: Optional[float] = None + decay_steps: Optional[int] = None + alpha: float = 0.0 + offset: int = 0 + + +@dataclasses.dataclass +class DirectPowerLrConfig(base_config.Config): + """Configuration for DirectPower learning rate decay. + + This class configures a schedule following follows lr * (step)^power. + + Attributes: + name: The name of the learning rate schedule. Defaults to DirectPowerDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + power: A float. Defaults to -0.5, for sqrt decay. + """ + name: str = 'DirectPowerDecay' + initial_learning_rate: Optional[float] = None + power: float = -0.5 + + +@dataclasses.dataclass +class PowerAndLinearDecayLrConfig(base_config.Config): + """Configuration for DirectPower learning rate decay. + + The schedule has the following behavoir. + Let offset_step = step - offset. + 1) offset_step < 0, the actual learning rate equals initial_learning_rate. + 2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the + actual learning rate equals lr * offset_step^power. + 3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step < + total_decay_steps, the actual learning rate equals lr * offset_step^power * + (total_decay_steps - offset_step) / (total_decay_steps * + linear_decay_fraction). + 4) offset_step >= total_decay_steps, the actual learning rate equals zero. + + Attributes: + name: The name of the learning rate schedule. Defaults to + PowerAndLinearDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + total_decay_steps: An int. The total number of steps for power + linear + decay. Defaults to None. + power: A float. The order of the polynomial. Defaults to -0.5, for sqrt + decay. + linear_decay_fraction: A float. In the last `linear_decay_fraction` steps, + the learning rate will be multiplied by a linear decay. Defaults to 0.1. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'PowerAndLinearDecay' + initial_learning_rate: Optional[float] = None + total_decay_steps: Optional[int] = None + power: float = -0.5 + linear_decay_fraction: float = 0.1 + offset: int = 0 + + +@dataclasses.dataclass +class PowerDecayWithOffsetLrConfig(base_config.Config): + """Configuration for power learning rate decay with step offset. + + Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`. + Otherwise, learning rate equals to lr * (step - offset)^power. + + Attributes: + name: The name of the learning rate schedule. Defaults to + PowerDecayWithOffset. + initial_learning_rate: A float. The initial learning rate. Defaults to None. 
+ power: A float. Defaults to -0.5, for sqrt decay. + offset: An integer. Power decay happens after `offset` steps. + pre_offset_learning_rate: A float. The constant learning rate before + `offset` steps. + """ + name: str = 'PowerDecayWithOffset' + initial_learning_rate: Optional[float] = None + power: float = -0.5 + offset: int = 0 + pre_offset_learning_rate: float = 1.0e6 + + +@dataclasses.dataclass +class LinearWarmupConfig(base_config.Config): + """Configuration for linear warmup schedule config. + + This class is a container for the linear warmup schedule configs. + Warmup_learning_rate is the initial learning rate, the final learning rate of + the warmup period is the learning_rate of the optimizer in use. The learning + rate at each step linearly increased according to the following formula: + warmup_learning_rate = warmup_learning_rate + + step / warmup_steps * (final_learning_rate - warmup_learning_rate). + Using warmup overrides the learning rate schedule by the number of warmup + steps. + + Attributes: + name: The name of warmup schedule. Defaults to linear. + warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0. + warmup_steps: Warmup steps. Defaults to None. + """ + name: str = 'linear' + warmup_learning_rate: float = 0 + warmup_steps: Optional[int] = None + + +@dataclasses.dataclass +class PolynomialWarmupConfig(base_config.Config): + """Configuration for linear warmup schedule config. + + This class is a container for the polynomial warmup schedule configs. + + Attributes: + name: The name of warmup schedule. Defaults to Polynomial. + power: Polynomial power. Defaults to 1. + warmup_steps: Warmup steps. Defaults to None. + """ + name: str = 'polynomial' + power: float = 1 + warmup_steps: Optional[int] = None diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config.py new file mode 100644 index 000000000..e1809f67f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config.py @@ -0,0 +1,114 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataclasses for optimization configs. + +This file define the dataclass for optimization configs (OptimizationConfig). +It also has two helper functions get_optimizer_config, and get_lr_config from +an OptimizationConfig class. +""" +from typing import Optional + +import dataclasses + +from modeling.hyperparams import base_config +from modeling.hyperparams import oneof +from modeling.optimization.configs import learning_rate_config as lr_cfg +from modeling.optimization.configs import optimizer_config as opt_cfg + + +@dataclasses.dataclass +class OptimizerConfig(oneof.OneOfConfig): + """Configuration for optimizer. + + Attributes: + type: 'str', type of optimizer to be used, on the of fields below. + sgd: sgd optimizer config. + adam: adam optimizer config. 
+ adamw: adam with weight decay. + lamb: lamb optimizer. + rmsprop: rmsprop optimizer. + lars: lars optimizer. + adagrad: adagrad optimizer. + slide: slide optimizer. + """ + type: Optional[str] = None + sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig() + adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig() + adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig() + lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig() + rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig() + lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig() + adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig() + slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig() + + +@dataclasses.dataclass +class LrConfig(oneof.OneOfConfig): + """Configuration for lr schedule. + + Attributes: + type: 'str', type of lr schedule to be used, one of the fields below. + constant: constant learning rate config. + stepwise: stepwise learning rate config. + exponential: exponential learning rate config. + polynomial: polynomial learning rate config. + cosine: cosine learning rate config. + power: step^power learning rate config. + power_linear: learning rate config of step^power followed by + step^power*linear. + power_with_offset: power decay with a step offset. + """ + type: Optional[str] = None + constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig() + stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig() + exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig() + polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig() + cosine: lr_cfg.CosineLrConfig = lr_cfg.CosineLrConfig() + power: lr_cfg.DirectPowerLrConfig = lr_cfg.DirectPowerLrConfig() + power_linear: lr_cfg.PowerAndLinearDecayLrConfig = ( + lr_cfg.PowerAndLinearDecayLrConfig()) + power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = ( + lr_cfg.PowerDecayWithOffsetLrConfig()) + + +@dataclasses.dataclass +class WarmupConfig(oneof.OneOfConfig): + """Configuration for lr schedule. + + Attributes: + type: 'str', type of warmup schedule to be used, one of the fields below. + linear: linear warmup config. + polynomial: polynomial warmup config. + """ + type: Optional[str] = None + linear: lr_cfg.LinearWarmupConfig = lr_cfg.LinearWarmupConfig() + polynomial: lr_cfg.PolynomialWarmupConfig = lr_cfg.PolynomialWarmupConfig() + + +@dataclasses.dataclass +class OptimizationConfig(base_config.Config): + """Configuration for optimizer and learning rate schedule. + + Attributes: + optimizer: optimizer oneof config. + ema: optional exponential moving average optimizer config, if specified, ema + optimizer will be used. + learning_rate: learning rate oneof config. + warmup: warmup oneof config. + """ + optimizer: OptimizerConfig = OptimizerConfig() + ema: Optional[opt_cfg.EMAConfig] = None + learning_rate: LrConfig = LrConfig() + warmup: WarmupConfig = WarmupConfig() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config_test.py new file mode 100644 index 000000000..02b99f592 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimization_config_test.py @@ -0,0 +1,59 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
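Because these are `oneof` configs, a complete optimization setup can be written as a nested dict whose `type` keys select the active branch, as the test that follows also exercises. A small sketch (SGD, the piecewise-constant boundaries from the `StepwiseLrConfig` docstring example, and linear warmup), assuming the patched `modeling.optimization.configs` package is importable:

```python
from modeling.optimization.configs import optimization_config

# Build an OptimizationConfig from a plain nested dict; each 'type' key picks
# which oneof branch .get() will return.
opt_config = optimization_config.OptimizationConfig({
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {
        'type': 'stepwise',
        'stepwise': {'boundaries': [100000, 110000],
                     'values': [1.0, 0.5, 0.1]},
    },
    'warmup': {'type': 'linear', 'linear': {'warmup_steps': 500}},
})
print(opt_config.optimizer.get())      # SGDConfig with momentum=0.9
print(opt_config.learning_rate.get())  # StepwiseLrConfig with the boundaries above
print(opt_config.warmup.get())         # LinearWarmupConfig with warmup_steps=500
```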
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for optimization_config.py.""" + +import tensorflow as tf + +from official.modeling.optimization.configs import learning_rate_config as lr_cfg +from official.modeling.optimization.configs import optimization_config +from official.modeling.optimization.configs import optimizer_config as opt_cfg + + +class OptimizerConfigTest(tf.test.TestCase): + + def test_no_optimizer(self): + optimizer = optimization_config.OptimizationConfig({}).optimizer.get() + self.assertIsNone(optimizer) + + def test_no_lr_schedule(self): + lr = optimization_config.OptimizationConfig({}).learning_rate.get() + self.assertIsNone(lr) + + def test_no_warmup_schedule(self): + warmup = optimization_config.OptimizationConfig({}).warmup.get() + self.assertIsNone(warmup) + + def test_config(self): + opt_config = optimization_config.OptimizationConfig({ + 'optimizer': { + 'type': 'sgd', + 'sgd': {} # default config + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': {} + }, + 'warmup': { + 'type': 'linear' + } + }) + self.assertEqual(opt_config.optimizer.get(), opt_cfg.SGDConfig()) + self.assertEqual(opt_config.learning_rate.get(), + lr_cfg.PolynomialLrConfig()) + self.assertEqual(opt_config.warmup.get(), lr_cfg.LinearWarmupConfig()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimizer_config.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimizer_config.py new file mode 100644 index 000000000..b267fde43 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/configs/optimizer_config.py @@ -0,0 +1,249 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataclasses for optimizer configs.""" +from typing import List, Optional + +import dataclasses +from modeling.hyperparams import base_config + + +@dataclasses.dataclass +class BaseOptimizerConfig(base_config.Config): + """Base optimizer config. + + Attributes: + clipnorm: float >= 0 or None. If not None, Gradients will be clipped when + their L2 norm exceeds this value. + clipvalue: float >= 0 or None. If not None, Gradients will be clipped when + their absolute value exceeds this value. + global_clipnorm: float >= 0 or None. 
If not None, gradient of all weights is + clipped so that their global norm is no higher than this value + """ + clipnorm: Optional[float] = None + clipvalue: Optional[float] = None + global_clipnorm: Optional[float] = None + + +@dataclasses.dataclass +class SGDConfig(BaseOptimizerConfig): + """Configuration for SGD optimizer. + + The attributes for this class matches the arguments of tf.keras.optimizer.SGD. + + Attributes: + name: name of the optimizer. + decay: decay rate for SGD optimizer. + nesterov: nesterov for SGD optimizer. + momentum: momentum for SGD optimizer. + """ + name: str = "SGD" + decay: float = 0.0 + nesterov: bool = False + momentum: float = 0.0 + + +@dataclasses.dataclass +class RMSPropConfig(BaseOptimizerConfig): + """Configuration for RMSProp optimizer. + + The attributes for this class matches the arguments of + tf.keras.optimizers.RMSprop. + + Attributes: + name: name of the optimizer. + rho: discounting factor for RMSprop optimizer. + momentum: momentum for RMSprop optimizer. + epsilon: epsilon value for RMSprop optimizer, help with numerical stability. + centered: Whether to normalize gradients or not. + """ + name: str = "RMSprop" + rho: float = 0.9 + momentum: float = 0.0 + epsilon: float = 1e-7 + centered: bool = False + + +@dataclasses.dataclass +class AdagradConfig(BaseOptimizerConfig): + """Configuration for Adagrad optimizer. + + The attributes of this class match the arguments of + tf.keras.optimizer.Adagrad. + + Attributes: + name: name of the optimizer. + initial_accumulator_value: A floating point value. Starting value for the + accumulators, must be non-negative. + epsilon: A small floating point value to avoid zero denominator. + """ + name: str = "Adagrad" + initial_accumulator_value: float = 0.1 + epsilon: float = 1e-07 + + +@dataclasses.dataclass +class AdamConfig(BaseOptimizerConfig): + """Configuration for Adam optimizer. + + The attributes for this class matches the arguments of + tf.keras.optimizer.Adam. + + Attributes: + name: name of the optimizer. + beta_1: decay rate for 1st order moments. + beta_2: decay rate for 2st order moments. + epsilon: epsilon value used for numerical stability in Adam optimizer. + amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from + the paper "On the Convergence of Adam and beyond". + """ + name: str = "Adam" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-07 + amsgrad: bool = False + + +@dataclasses.dataclass +class AdamWeightDecayConfig(BaseOptimizerConfig): + """Configuration for Adam optimizer with weight decay. + + Attributes: + name: name of the optimizer. + beta_1: decay rate for 1st order moments. + beta_2: decay rate for 2st order moments. + epsilon: epsilon value used for numerical stability in the optimizer. + amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from + the paper "On the Convergence of Adam and beyond". + weight_decay_rate: float. Weight decay rate. Default to 0. + include_in_weight_decay: list[str], or None. List of weight names to include + in weight decay. + exclude_from_weight_decay: list[str], or None. List of weight names to not + include in weight decay. + gradient_clip_norm: A positive float. Clips the gradients to this maximum + L2-norm. Default to 1.0. 
+ """ + name: str = "AdamWeightDecay" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-07 + amsgrad: bool = False + weight_decay_rate: float = 0.0 + include_in_weight_decay: Optional[List[str]] = None + exclude_from_weight_decay: Optional[List[str]] = None + gradient_clip_norm: float = 1.0 + + +@dataclasses.dataclass +class LAMBConfig(BaseOptimizerConfig): + """Configuration for LAMB optimizer. + + The attributes for this class matches the arguments of + tensorflow_addons.optimizers.LAMB. + + Attributes: + name: name of the optimizer. + beta_1: decay rate for 1st order moments. + beta_2: decay rate for 2st order moments. + epsilon: epsilon value used for numerical stability in LAMB optimizer. + weight_decay_rate: float. Weight decay rate. Default to 0. + exclude_from_weight_decay: List of regex patterns of variables excluded from + weight decay. Variables whose name contain a substring matching the + pattern will be excluded. + exclude_from_layer_adaptation: List of regex patterns of variables excluded + from layer adaptation. Variables whose name contain a substring matching + the pattern will be excluded. + """ + name: str = "LAMB" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-6 + weight_decay_rate: float = 0.0 + exclude_from_weight_decay: Optional[List[str]] = None + exclude_from_layer_adaptation: Optional[List[str]] = None + + +@dataclasses.dataclass +class EMAConfig(BaseOptimizerConfig): + """Exponential moving average optimizer config. + + Attributes: + name: 'str', name of the optimizer. + trainable_weights_only: 'bool', if True, only model trainable weights will + be updated. Otherwise, all model weights will be updated. This mainly + affects batch normalization parameters. + average_decay: 'float', average decay value. + start_step: 'int', start step to apply moving average. + dynamic_decay: 'bool', whether to apply dynamic decay or not. + """ + name: str = "ExponentialMovingAverage" + trainable_weights_only: bool = True + average_decay: float = 0.99 + start_step: int = 0 + dynamic_decay: bool = True + + +@dataclasses.dataclass +class LARSConfig(BaseOptimizerConfig): + """Layer-wise adaptive rate scaling config. + + Attributes: + name: 'str', name of the optimizer. + momentum: `float` hyperparameter >= 0 that accelerates gradient descent in + the relevant direction and dampens oscillations. Defaults to 0.9. + eeta: `float` LARS coefficient as used in the paper. Default set to LARS + coefficient from the paper. (eeta / weight_decay) determines the highest + scaling factor in LARS.. + weight_decay_rate: `float` for weight decay. + nesterov: 'boolean' for whether to use nesterov momentum. + classic_momentum: `boolean` for whether to use classic (or popular) + momentum. The learning rate is applied during momentum update in classic + momentum, but after momentum for popular momentum. + exclude_from_weight_decay: A list of `string` for variable screening, if any + of the string appears in a variable's name, the variable will be excluded + for computing weight decay. For example, one could specify the list like + ['batch_normalization', 'bias'] to exclude BN and bias from weight decay. + exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but for + layer adaptation. If it is None, it will be defaulted the same as + exclude_from_weight_decay. 
+ """ + name: str = "LARS" + momentum: float = 0.9 + eeta: float = 0.001 + weight_decay_rate: float = 0.0 + nesterov: bool = False + classic_momentum: bool = True + exclude_from_weight_decay: Optional[List[str]] = None + exclude_from_layer_adaptation: Optional[List[str]] = None + + +@dataclasses.dataclass +class SLIDEConfig(BaseOptimizerConfig): + """Configuration for SLIDE optimizer. + + Details coming soon. + """ + name: str = "SLIDE" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-6 + weight_decay_rate: float = 0.0 + weight_decay_type: str = "inner" + exclude_from_weight_decay: Optional[List[str]] = None + exclude_from_layer_adaptation: Optional[List[str]] = None + include_in_sparse_layer_adaptation: Optional[List[str]] = None + sparse_layer_learning_rate: float = 0.1 + do_gradient_rescaling: bool = True + norm_type: str = "layer" + ratio_clip_norm: float = 1e5 diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/ema_optimizer.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/ema_optimizer.py new file mode 100644 index 000000000..c4f44d712 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/ema_optimizer.py @@ -0,0 +1,255 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Exponential moving average optimizer.""" + +from typing import List, Optional, Text + +import tensorflow as tf + +# pylint: disable=protected-access + + +class ExponentialMovingAverage(tf.keras.optimizers.Optimizer): + """Optimizer that computes an exponential moving average of the variables. + + Empirically it has been found that using the moving average of the trained + parameters of a deep network is better than using its trained parameters + directly. This optimizer allows you to compute this moving average and swap + the variables at save time so that any code outside of the training loop + will use by default the average values instead of the original ones. + + Example of usage for training: + ```python + opt = tf.keras.optimizers.SGD(learning_rate) + opt = ExponentialMovingAverage(opt) + + opt.shadow_copy(model) + ``` + + At test time, swap the shadow variables to evaluate on the averaged weights: + ```python + opt.swap_weights() + # Test eval the model here + opt.swap_weights() + ``` + """ + + def __init__(self, + optimizer: tf.keras.optimizers.Optimizer, + trainable_weights_only: bool = True, + average_decay: float = 0.99, + start_step: int = 0, + dynamic_decay: bool = True, + name: Text = 'ExponentialMovingAverage', + **kwargs): + """Construct a new ExponentialMovingAverage optimizer. + + Args: + optimizer: `tf.keras.optimizers.Optimizer` that will be + used to compute and apply gradients. + trainable_weights_only: 'bool', if True, only model trainable weights will + be updated. Otherwise, all model weights will be updated. This mainly + affects batch normalization parameters. + average_decay: float. 
Decay to use to maintain the moving averages + of trained variables. + start_step: int. What step to start the moving average. + dynamic_decay: bool. Whether to change the decay based on the number + of optimizer updates. Decay will start at 0.1 and gradually increase + up to `average_decay` after each optimizer update. This behavior is + similar to `tf.train.ExponentialMovingAverage` in TF 1.x. + name: Optional name for the operations created when applying + gradients. Defaults to "moving_average". + **kwargs: keyword arguments. Allowed to be {`clipnorm`, + `clipvalue`, `lr`, `decay`}. + """ + super().__init__(name, **kwargs) + self._average_decay = average_decay + self._trainable_weights_only = trainable_weights_only + self._start_step = tf.constant(start_step, tf.float32) + self._dynamic_decay = dynamic_decay + self._optimizer = optimizer + self._track_trackable(self._optimizer, 'base_optimizer') + self._average_weights = None + self._model_weights = None + + def shadow_copy(self, model: tf.keras.Model): + """Creates shadow variables for the given model weights.""" + + if self._trainable_weights_only: + self._model_weights = model.trainable_variables + else: + self._model_weights = model.variables + for var in self._model_weights: + self.add_slot(var, 'average', initializer='zeros') + + self._average_weights = [ + self.get_slot(var, 'average') for var in self._model_weights + ] + + @property + def has_shadow_copy(self): + """Whether this optimizer has created shadow variables.""" + return self._model_weights is not None and self._average_weights is not None + + def _create_slots(self, var_list): + self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access + + def apply_gradients(self, grads_and_vars, name: Optional[Text] = None): + result = self._optimizer.apply_gradients(grads_and_vars, name) + self.update_average(self.iterations) + return result + + @tf.function + def update_average(self, step: tf.Tensor): + step = tf.cast(step, tf.float32) + if step < self._start_step: + decay = tf.constant(0., tf.float32) + elif self._dynamic_decay: + decay = step - self._start_step + decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay)) + else: + decay = self._average_decay + + def _apply_moving(v_moving, v_normal): + diff = v_moving - v_normal + v_moving.assign_sub(tf.cast(1. - decay, v_moving.dtype) * diff) + return v_moving + + def _update(strategy, v_moving_and_v_normal): + for v_moving, v_normal in v_moving_and_v_normal: + strategy.extended.update(v_moving, _apply_moving, args=(v_normal,)) + + ctx = tf.distribute.get_replica_context() + return ctx.merge_call(_update, args=(zip(self._average_weights, + self._model_weights),)) + + def swap_weights(self): + """Swap the average and moving weights. + + This is a convenience method to allow one to evaluate the averaged weights + at test time. Loads the weights stored in `self._average` into the model, + keeping a copy of the original model weights. Swapping twice will return + the original weights. 
+ """ + if tf.distribute.in_cross_replica_context(): + strategy = tf.distribute.get_strategy() + strategy.run(self._swap_weights, args=()) + else: + raise ValueError('Swapping weights must occur under a ' + 'tf.distribute.Strategy') + + @tf.function + def _swap_weights(self): + def fn_0(a, b): + a.assign_add(b) + return a + def fn_1(b, a): + b.assign(a - b) + return b + def fn_2(a, b): + a.assign_sub(b) + return a + + def swap(strategy, a_and_b): + """Swap `a` and `b` and mirror to all devices.""" + for a, b in a_and_b: + strategy.extended.update(a, fn_0, args=(b,)) # a = a + b + strategy.extended.update(b, fn_1, args=(a,)) # b = a - b + strategy.extended.update(a, fn_2, args=(b,)) # a = a - b + + ctx = tf.distribute.get_replica_context() + return ctx.merge_call( + swap, args=(zip(self._average_weights, self._model_weights),)) + + def assign_average_vars(self, var_list: List[tf.Variable]): + """Assign variables in var_list with their respective averages. + + Args: + var_list: List of model variables to be assigned to their average. + Returns: + assign_op: The op corresponding to the assignment operation of + variables to their average. + """ + assign_op = tf.group([ + var.assign(self.get_slot(var, 'average')) for var in var_list + if var.trainable + ]) + return assign_op + + def _create_hypers(self): + self._optimizer._create_hypers() # pylint: disable=protected-access + + def _prepare(self, var_list): + return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access + + @property + def iterations(self): + return self._optimizer.iterations + + @iterations.setter + def iterations(self, variable): + self._optimizer.iterations = variable + + @property + def weights(self): + # return self._weights + self._optimizer.weights + return self._optimizer.weights + + def variables(self): + return self._weights + [self.iterations] + + @property + def lr(self): + return self._optimizer._get_hyper('learning_rate') + + @lr.setter + def lr(self, lr): + self._optimizer._set_hyper('learning_rate', lr) + + @property + def learning_rate(self): + return self._optimizer._get_hyper('learning_rate') + + @learning_rate.setter + def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name + self._optimizer._set_hyper('learning_rate', learning_rate) + + def _resource_apply_dense(self, grad, var): + return self._optimizer._resource_apply_dense(grad, var) + + def _resource_apply_sparse(self, grad, var, indices): + return self._optimizer._resource_apply_sparse(grad, var, indices) + + def _resource_apply_sparse_duplicate_indices(self, grad, var, indices): + return self._optimizer._resource_apply_sparse_duplicate_indices( + grad, var, indices) + + def get_config(self): + config = { + 'optimizer': tf.keras.optimizers.serialize(self._optimizer), + 'average_decay': self._average_decay, + 'start_step': self._start_step, + 'dynamic_decay': self._dynamic_decay, + } + base_config = super(ExponentialMovingAverage, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + optimizer = tf.keras.optimizers.deserialize( + config.pop('optimizer'), + custom_objects=custom_objects, + ) + return cls(optimizer, **config) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lars_optimizer.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lars_optimizer.py new file mode 100644 index 000000000..ac1504275 --- /dev/null +++ 
b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lars_optimizer.py @@ -0,0 +1,186 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Layer-wise adaptive rate scaling optimizer.""" +import re +from typing import Text, List, Optional + +import tensorflow as tf + + +# pylint: disable=protected-access + + +class LARS(tf.keras.optimizers.Optimizer): + """Layer-wise Adaptive Rate Scaling for large batch training. + + Introduced by "Large Batch Training of Convolutional Networks" by Y. You, + I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888) + """ + + def __init__(self, + learning_rate: float = 0.01, + momentum: float = 0.9, + weight_decay_rate: float = 0.0, + eeta: float = 0.001, + nesterov: bool = False, + classic_momentum: bool = True, + exclude_from_weight_decay: Optional[List[Text]] = None, + exclude_from_layer_adaptation: Optional[List[Text]] = None, + name: Text = "LARS", + **kwargs): + """Constructs a LARSOptimizer. + + Args: + learning_rate: `float` for learning rate. Defaults to 0.01. + momentum: `float` hyperparameter >= 0 that accelerates gradient descent + in the relevant direction and dampens oscillations. Defaults to 0.9. + weight_decay_rate: `float` for weight decay. + eeta: `float` LARS coefficient as used in the paper. Default set to LARS + coefficient from the paper. (eeta / weight_decay) determines the + highest scaling factor in LARS.. + nesterov: 'boolean' for whether to use nesterov momentum. + classic_momentum: `boolean` for whether to use classic (or popular) + momentum. The learning rate is applied during momentum update in + classic momentum, but after momentum for popular momentum. + exclude_from_weight_decay: A list of `string` for variable screening, if + any of the string appears in a variable's name, the variable will be + excluded for computing weight decay. For example, one could specify + the list like ['batch_normalization', 'bias'] to exclude BN and bias + from weight decay. + exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but + for layer adaptation. If it is None, it will be defaulted the same as + exclude_from_weight_decay. + name: `Text` as optional name for the operations created when applying + gradients. Defaults to "LARS". + **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, + `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip + gradients by value, `decay` is included for backward compatibility to + allow time inverse decay of learning rate. `lr` is included for + backward compatibility, recommended to use `learning_rate` instead. 
+ """ + super(LARS, self).__init__(name, **kwargs) + + self._set_hyper("learning_rate", learning_rate) + self._set_hyper("decay", self._initial_decay) + self.momentum = momentum + self.weight_decay_rate = weight_decay_rate + self.eeta = eeta + self.nesterov = nesterov + self.classic_momentum = classic_momentum + self.exclude_from_weight_decay = exclude_from_weight_decay + # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the + # arg is None. + if exclude_from_layer_adaptation: + self.exclude_from_layer_adaptation = exclude_from_layer_adaptation + else: + self.exclude_from_layer_adaptation = exclude_from_weight_decay + + def _create_slots(self, var_list): + for v in var_list: + self.add_slot(v, "momentum") + + def _resource_apply_dense(self, grad, param, apply_state=None): + if grad is None or param is None: + return tf.no_op() + + var_device, var_dtype = param.device, param.dtype.base_dtype + coefficients = ((apply_state or {}).get((var_device, var_dtype)) or + self._fallback_apply_state(var_device, var_dtype)) + learning_rate = coefficients["lr_t"] + + param_name = param.name + + v = self.get_slot(param, "momentum") + + if self._use_weight_decay(param_name): + grad += self.weight_decay_rate * param + + if self.classic_momentum: + trust_ratio = 1.0 + if self._do_layer_adaptation(param_name): + w_norm = tf.norm(param, ord=2) + g_norm = tf.norm(grad, ord=2) + trust_ratio = tf.where( + tf.greater(w_norm, 0), + tf.where(tf.greater(g_norm, 0), (self.eeta * w_norm / g_norm), 1.0), + 1.0) + scaled_lr = learning_rate * trust_ratio + + next_v = tf.multiply(self.momentum, v) + scaled_lr * grad + if self.nesterov: + update = tf.multiply(self.momentum, next_v) + scaled_lr * grad + else: + update = next_v + next_param = param - update + else: + next_v = tf.multiply(self.momentum, v) + grad + if self.nesterov: + update = tf.multiply(self.momentum, next_v) + grad + else: + update = next_v + + trust_ratio = 1.0 + if self._do_layer_adaptation(param_name): + w_norm = tf.norm(param, ord=2) + v_norm = tf.norm(update, ord=2) + trust_ratio = tf.where( + tf.greater(w_norm, 0), + tf.where(tf.greater(v_norm, 0), (self.eeta * w_norm / v_norm), 1.0), + 1.0) + scaled_lr = trust_ratio * learning_rate + next_param = param - scaled_lr * update + + return tf.group(*[ + param.assign(next_param, use_locking=False), + v.assign(next_v, use_locking=False) + ]) + + def _resource_apply_sparse(self, grad, handle, indices, apply_state): + raise NotImplementedError("Applying sparse gradients is not implemented.") + + def _use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _do_layer_adaptation(self, param_name): + """Whether to do layer-wise learning rate adaptation for `param_name`.""" + if self.exclude_from_layer_adaptation: + for r in self.exclude_from_layer_adaptation: + if re.search(r, param_name) is not None: + return False + return True + + def get_config(self): + config = super(LARS, self).get_config() + config.update({ + "learning_rate": self._serialize_hyperparameter("learning_rate"), + "decay": self._serialize_hyperparameter("decay"), + "momentum": self.momentum, + "classic_momentum": self.classic_momentum, + "weight_decay_rate": self.weight_decay_rate, + "eeta": self.eeta, + "nesterov": self.nesterov, + }) + return config + + @classmethod + def 
from_config(cls, config, custom_objects=None):
+    return cls(**config)
diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule.py
new file mode 100644
index 000000000..09f082bbb
--- /dev/null
+++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule.py
@@ -0,0 +1,385 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Learning rate schedule classes."""
+
+from typing import Mapping, Any, Union, Optional
+
+import tensorflow as tf
+
+
+def _make_offset_wrapper(new_class_name: str, base_lr_class):
+  """Generates an offset wrapper of a learning rate schedule.
+
+  It returns a subclass of the `base_lr_class`; the subclass takes an
+  `offset` argument in the constructor. When the new class instance is called,
+  the behavior is:
+    new_class_object(step) = base_lr_class_object(step - offset)
+
+  Example:
+  CosineDecayWithOffset = _make_offset_wrapper(
+      'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
+  # Use the lr:
+  lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
+                             decay_steps=1000)
+  lr(101)  # equals to tf.keras.experimental.CosineDecay(...)(101-100)
+
+  Args:
+    new_class_name: the name of the new class.
+    base_lr_class: the base learning rate schedule class. Should be subclass of
+      tf.keras.optimizers.schedules.LearningRateSchedule
+
+  Returns:
+    A new class (subclass of the base_lr_class) that can take an offset.
+  """
+  assert issubclass(base_lr_class,
+                    tf.keras.optimizers.schedules.LearningRateSchedule), (
+                        "base_lr_class should be subclass of keras "
+                        f"LearningRateSchedule, got {base_lr_class}")
+
+  # pylint: disable=protected-access,pointless-statement
+  def offset_learning_rate_init(self, offset=0, **kwargs):
+    """Construct learning rate schedule object.
+
+    When this object is called, its behavior is
+      self.__call__(step) == base_lr_class.__call__(step - offset)
+    Args:
+      self: this object.
+      offset: The offset when computing the learning rate schedule.
+      **kwargs: Pass through to base learning rate class constructor.
+ """ + base_lr_class.__init__(self, **kwargs) + self._offset = offset + + def offset_learning_rate_call(self, step): + step = tf.cast(step - self._offset, tf.float32) + return base_lr_class.__call__(self, step) + + # pylint: enable=protected-access,pointless-statement + + return type( + new_class_name, (base_lr_class,), { + "base_lr_class": base_lr_class, + "__init__": offset_learning_rate_init, + "__call__": offset_learning_rate_call + }) + + +PiecewiseConstantDecayWithOffset = _make_offset_wrapper( + "PiecewiseConstantDecayWithOffset", + tf.keras.optimizers.schedules.PiecewiseConstantDecay) +PolynomialDecayWithOffset = _make_offset_wrapper( + "PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay) +ExponentialDecayWithOffset = _make_offset_wrapper( + "ExponentialDecayWithOffset", + tf.keras.optimizers.schedules.ExponentialDecay) +CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset", + tf.keras.experimental.CosineDecay) + + +class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule): + """Linear warmup schedule.""" + + def __init__(self, + after_warmup_lr_sched: Union[ + tf.keras.optimizers.schedules.LearningRateSchedule, float], + warmup_steps: int, + warmup_learning_rate: float, + name: Optional[str] = None): + """Add linear warmup schedule to a learning rate schedule. + + warmup_lr is the initial learning rate, the final learning rate of the + init_warmup period is the initial learning rate of lr_schedule in use. + The learning rate at each step linearly increased according to the following + formula: + learning_rate = warmup_lr + step / warmup_steps + * (final_warmup_lr - warmup_lr). + Using warmup overrides the learning rate schedule by the number of warmup + steps. + + Args: + after_warmup_lr_sched: tf.keras.optimizers.schedules .LearningRateSchedule + or a constant. + warmup_steps: Number of the warmup steps. + warmup_learning_rate: Initial learning rate for the warmup. + name: Optional, name of warmup schedule. 
+ """ + super().__init__() + self._name = name + self._after_warmup_lr_sched = after_warmup_lr_sched + self._warmup_steps = warmup_steps + self._init_warmup_lr = warmup_learning_rate + if isinstance(after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + self._final_warmup_lr = after_warmup_lr_sched(warmup_steps) + else: + self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32) + + def __call__(self, step: int): + + global_step = tf.cast(step, dtype=tf.float32) + + linear_warmup_lr = ( + self._init_warmup_lr + global_step / self._warmup_steps * + (self._final_warmup_lr - self._init_warmup_lr)) + + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + after_warmup_lr = self._after_warmup_lr_sched(step) + else: + after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32) + + lr = tf.cond(global_step < self._warmup_steps, + lambda: linear_warmup_lr, + lambda: after_warmup_lr) + return lr + + def get_config(self) -> Mapping[str, Any]: + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + config = { + "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()} # pytype: disable=attribute-error + else: + config = {"after_warmup_lr_sched": self._after_warmup_lr_sched} # pytype: disable=attribute-error + + config.update({ + "warmup_steps": self._warmup_steps, + "warmup_learning_rate": self._init_warmup_lr, + "name": self._name + }) + return config + + +class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): + """Applies polynomial warmup schedule on a given learning rate decay schedule.""" + + def __init__(self, + after_warmup_lr_sched: Union[ + tf.keras.optimizers.schedules.LearningRateSchedule, float], + warmup_steps: int, + power: float = 1.0, + name: str = "PolynomialWarmup"): + super().__init__() + if isinstance(after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + self._initial_learning_rate = after_warmup_lr_sched(warmup_steps) + else: + self._initial_learning_rate = tf.cast( + after_warmup_lr_sched, dtype=tf.float32) + + self._warmup_steps = warmup_steps + self._power = power + self._after_warmup_lr_sched = after_warmup_lr_sched + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "PolynomialWarmUp") as name: + # Implements polynomial warmup. i.e., if global_step < warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self._warmup_steps, tf.float32) + + if self._warmup_steps <= 0: + warmup_percent_done = 1.0 + else: + # A zero `step` may cause Inf. So make `step` positive. 
+ step_non_zero = tf.math.maximum(global_step_float, 1.0) + warmup_percent_done = step_non_zero / warmup_steps_float + + warmup_learning_rate = ( + self._initial_learning_rate * + tf.math.pow(warmup_percent_done, self._power)) + + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + after_warmup_lr = self._after_warmup_lr_sched(step) + else: + after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32) + + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: after_warmup_lr, + name=name) + + def get_config(self) -> Mapping[str, Any]: + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + config = { + "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()} # pytype: disable=attribute-error + else: + config = {"after_warmup_lr_sched": self._after_warmup_lr_sched} # pytype: disable=attribute-error + + config.update({ + "warmup_steps": self._warmup_steps, + "power": self._power, + "name": self._name + }) + return config + + +class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule): + """Learning rate schedule follows lr * (step)^power.""" + + def __init__(self, + initial_learning_rate: float, + power: float = 1.0, + name: str = "DirectPowerDecay"): + """Initialize configuration of the learning rate schedule. + + Args: + initial_learning_rate: The initial learning rate. + power: The order of the polynomial. + name: Optional, name of learning rate schedule. + """ + super().__init__() + self._initial_learning_rate = initial_learning_rate + self._power = power + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "DirectPowerDecay"): + step = tf.cast(step, tf.float32) + learning_rate = self._initial_learning_rate + # A zero `step` may cause Inf. So make `step` positive. + step_non_zero = tf.math.maximum(step, 1.0) + learning_rate *= tf.math.pow(step_non_zero, self._power) + return learning_rate + + def get_config(self): + """Get the configuration of the learning rate schedule.""" + return { + "initial_learning_rate": self._initial_learning_rate, + "power": self._power, + "name": self._name, + } + + +class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule): + """Learning rate schedule with multiplied by linear decay at the end. + + The schedule has the following behavoir. + Let offset_step = step - offset. + 1) offset_step < 0, the actual learning rate equals initial_learning_rate. + 2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the + actual learning rate equals lr * offset_step^power. + 3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step < + total_decay_steps, the actual learning rate equals lr * offset_step^power * + (total_decay_steps - offset_step) / (total_decay_steps * + linear_decay_fraction). + 4) offset_step >= total_decay_steps, the actual learning rate equals zero. + """ + + def __init__(self, + initial_learning_rate: float, + total_decay_steps: int, + power: float = 1.0, + linear_decay_fraction: float = 0.1, + offset: int = 0, + name: str = "PowerAndLinearDecay"): + """Initialize configuration of the learning rate schedule. + + Args: + initial_learning_rate: The initial learning rate. + total_decay_steps: The total number of steps for power + linear decay. + power: The order of the polynomial. + linear_decay_fraction: In the last `linear_decay_fraction` steps, the + learning rate will be multiplied by a linear decay. 
+ offset: The offset applied to steps. + name: Optional, name of learning rate schedule. + """ + super().__init__() + self._initial_learning_rate = initial_learning_rate + self._total_decay_steps = total_decay_steps + self._power = power + self._linear_decay_fraction = linear_decay_fraction + self._offset = offset + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "PowerAndLinearDecay"): + step = tf.cast(step - self._offset, tf.float32) + learning_rate = self._initial_learning_rate + # A zero `step` may cause Inf. So make `step` positive. + step_non_zero = tf.math.maximum(step, 1.0) + learning_rate *= tf.math.pow(step_non_zero, self._power) + if self._total_decay_steps * self._linear_decay_fraction > 0: + learning_rate *= tf.minimum( + 1.0, (self._total_decay_steps - step) / + (self._total_decay_steps * self._linear_decay_fraction)) + learning_rate = tf.maximum(0.0, learning_rate) + return learning_rate + + def get_config(self): + """Get the configuration of the learning rate schedule.""" + return { + "initial_learning_rate": self._initial_learning_rate, + "total_decay_steps": self._total_decay_steps, + "power": self._power, + "linear_decay_fraction": self._linear_decay_fraction, + "offset": self._offset, + "name": self._name, + } + + +class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule): + """Power learning rate decay with offset. + + Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`. + Otherwise, learning rate equals to lr * (step - offset)^power. + """ + + def __init__(self, + initial_learning_rate: float, + power: float = 1.0, + offset: int = 0, + pre_offset_learning_rate: float = 1.0e6, + name: str = "PowerDecayWithOffset"): + """Initialize configuration of the learning rate schedule. + + Args: + initial_learning_rate: The initial learning rate. + power: The order of the polynomial. + offset: The offset when computing the power decay. + pre_offset_learning_rate: The maximum learning rate we'll use. + name: Optional, name of learning rate schedule. + """ + super().__init__() + self._initial_learning_rate = initial_learning_rate + self._power = power + self._offset = offset + self._pre_offset_lr = pre_offset_learning_rate + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "PowerDecayWithOffset"): + step = tf.cast(step, tf.float32) + lr_after_offset = tf.math.pow( + tf.math.maximum(step - self._offset, 1.0), self._power) * ( + self._initial_learning_rate) + + sign = tf.cast(step > self._offset, tf.float32) + lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset + # Power may give infinitely large LR. So cap it with pre_offset_lr. + return tf.math.minimum(lr_combined, self._pre_offset_lr) + + def get_config(self): + """Get the configuration of the learning rate schedule.""" + return { + "initial_learning_rate": self._initial_learning_rate, + "power": self._power, + "offset": self._offset, + "pre_offset_learning_rate": self._pre_offset_lr, + "name": self._name, + } diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule_test.py new file mode 100644 index 000000000..bafd8be1f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/lr_schedule_test.py @@ -0,0 +1,109 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for lr_schedule.""" +from absl.testing import parameterized +import tensorflow as tf + +from official.modeling.optimization import lr_schedule + + +class PowerAndLinearDecayTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + dict( + testcase_name='power_only', + init_lr=1.0, + power=-1.0, + linear_decay_fraction=0.0, + total_decay_steps=100, + offset=0, + expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], [60, 1. / 60], + [100, 1. / 100]]), + dict( + testcase_name='linear_only', + init_lr=1.0, + power=0.0, + linear_decay_fraction=1.0, + total_decay_steps=100, + offset=0, + expected=[[0, 1.0], [1, 0.99], [40, 0.6], [60, 0.4], [100, 0.0]]), + dict( + testcase_name='general', + init_lr=1.0, + power=-1.0, + linear_decay_fraction=0.5, + total_decay_steps=100, + offset=0, + expected=[[0, 1.0], [1, 1.0], [40, 1. / 40.], + [60, 1. / 60. * 0.8], [100, 0.0]]), + dict( + testcase_name='offset', + init_lr=1.0, + power=-1.0, + linear_decay_fraction=0.5, + total_decay_steps=100, + offset=90, + expected=[[0, 1.0], [90, 1.0], [91, 1.0], [130, 1. / 40.], + [150, 1. / 60. * 0.8], [190, 0.0], [200, 0.0]]), + ) + def test_power_linear_lr_schedule(self, init_lr, power, linear_decay_fraction, + total_decay_steps, offset, expected): + lr = lr_schedule.PowerAndLinearDecay( + initial_learning_rate=init_lr, + power=power, + linear_decay_fraction=linear_decay_fraction, + total_decay_steps=total_decay_steps, + offset=offset) + for step, value in expected: + self.assertAlmostEqual(lr(step).numpy(), value) + + +class OffsetLearningRateTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters( + dict(class_name=lr_schedule.PiecewiseConstantDecayWithOffset), + dict(class_name=lr_schedule.PolynomialDecayWithOffset), + dict(class_name=lr_schedule.ExponentialDecayWithOffset), + dict(class_name=lr_schedule.CosineDecayWithOffset), + ) + def test_generated_docstring(self, class_name): + self.assertNotEmpty(class_name.__init__.__doc__) + + @parameterized.parameters( + dict( + class_name=lr_schedule.PiecewiseConstantDecayWithOffset, + kwarg=dict(boundaries=[50, 80], values=[1.0, 0.5, 0.1])), + dict( + class_name=lr_schedule.PolynomialDecayWithOffset, + kwarg=dict(initial_learning_rate=1.0, decay_steps=100)), + dict( + class_name=lr_schedule.ExponentialDecayWithOffset, + kwarg=dict( + initial_learning_rate=1.0, decay_steps=100, decay_rate=0.5)), + dict( + class_name=lr_schedule.CosineDecayWithOffset, + kwarg=dict(initial_learning_rate=1.0, decay_steps=100)), + ) + def test_offset(self, class_name, kwarg): + offset = 10 + offset_lr = class_name(offset=offset, **kwarg) + base_lr = class_name.base_lr_class(**kwarg) + self.assertIsInstance(offset_lr, class_name) + for step in range(10, 101, 10): + self.assertEqual(offset_lr(step), base_lr(step - offset)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory.py 
b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory.py new file mode 100644 index 000000000..12c3ae0ae --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory.py @@ -0,0 +1,175 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimizer factory class.""" +from typing import Callable, Optional, Union + +import gin +import tensorflow as tf +# import tensorflow_addons.optimizers as tfa_optimizers + +from modeling.optimization import slide_optimizer +from modeling.optimization import ema_optimizer +from modeling.optimization import lars_optimizer +from modeling.optimization import lr_schedule +from modeling.optimization.configs import optimization_config as opt_cfg + +OPTIMIZERS_CLS = { + 'sgd': tf.keras.optimizers.SGD, + 'adam': tf.keras.optimizers.Adam, + # 'lamb': tfa_optimizers.LAMB, + 'rmsprop': tf.keras.optimizers.RMSprop, + 'lars': lars_optimizer.LARS, + 'adagrad': tf.keras.optimizers.Adagrad, + 'slide': slide_optimizer.SLIDE +} + +LR_CLS = { + 'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset, + 'polynomial': lr_schedule.PolynomialDecayWithOffset, + 'exponential': lr_schedule.ExponentialDecayWithOffset, + 'cosine': lr_schedule.CosineDecayWithOffset, + 'power': lr_schedule.DirectPowerDecay, + 'power_linear': lr_schedule.PowerAndLinearDecay, + 'power_with_offset': lr_schedule.PowerDecayWithOffset, +} + +WARMUP_CLS = { + 'linear': lr_schedule.LinearWarmup, + 'polynomial': lr_schedule.PolynomialWarmUp +} + + +class OptimizerFactory: + """Optimizer factory class. + + This class builds learning rate and optimizer based on an optimization config. + To use this class, you need to do the following: + (1) Define optimization config, this includes optimizer, and learning rate + schedule. + (2) Initialize the class using the optimization config. + (3) Build learning rate. + (4) Build optimizer. + + This is a typical example for using this class: + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': {'momentum': 0.9} + }, + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': {'boundaries': [10000, 20000], + 'values': [0.1, 0.01, 0.001]} + }, + 'warmup': { + 'type': 'linear', + 'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01} + } + } + opt_config = OptimizationConfig(params) + opt_factory = OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + optimizer = opt_factory.build_optimizer(lr) + """ + + def __init__(self, config: opt_cfg.OptimizationConfig): + """Initializing OptimizerFactory. + + Args: + config: OptimizationConfig instance contain optimization config. 
+ """ + self._config = config + self._optimizer_config = config.optimizer.get() + self._optimizer_type = config.optimizer.type + + self._use_ema = config.ema is not None + self._ema_config = config.ema + + if self._optimizer_config is None: + raise ValueError('Optimizer type must be specified') + + self._lr_config = config.learning_rate.get() + self._lr_type = config.learning_rate.type + + if self._lr_type is None: + raise ValueError('Learning rate type must be specified') + + self._warmup_config = config.warmup.get() + self._warmup_type = config.warmup.type + + def build_learning_rate(self): + """Build learning rate. + + Builds learning rate from config. Learning rate schedule is built according + to the learning rate config. If learning rate type is consant, + lr_config.learning_rate is returned. + + Returns: + tf.keras.optimizers.schedules.LearningRateSchedule instance. If + learning rate type is consant, lr_config.learning_rate is returned. + """ + if self._lr_type == 'constant': + lr = self._lr_config.learning_rate + else: + lr = LR_CLS[self._lr_type](**self._lr_config.as_dict()) + + if self._warmup_config: + lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict()) + + return lr + + @gin.configurable + def build_optimizer( + self, + lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float], + postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer], + tf.keras.optimizers.Optimizer]] = None): + """Build optimizer. + + Builds optimizer from config. It takes learning rate as input, and builds + the optimizer according to the optimizer config. Typically, the learning + rate built using self.build_lr() is passed as an argument to this method. + + Args: + lr: A floating point value, or a + tf.keras.optimizers.schedules.LearningRateSchedule instance. + postprocessor: An optional function for postprocessing the optimizer. It + takes an optimizer and returns an optimizer. + + Returns: + tf.keras.optimizers.Optimizer instance. + """ + + optimizer_dict = self._optimizer_config.as_dict() + ## Delete clipnorm and clipvalue if None + if optimizer_dict['clipnorm'] is None: + del optimizer_dict['clipnorm'] + if optimizer_dict['clipvalue'] is None: + del optimizer_dict['clipvalue'] + + optimizer_dict['learning_rate'] = lr + + optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict) + + if self._use_ema: + optimizer = ema_optimizer.ExponentialMovingAverage( + optimizer, **self._ema_config.as_dict()) + if postprocessor: + optimizer = postprocessor(optimizer) + assert isinstance(optimizer, tf.keras.optimizers.Optimizer), ( + 'OptimizerFactory.build_optimizer returning a non-optimizer object: ' + '{}'.format(optimizer)) + + return optimizer diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory_test.py new file mode 100644 index 000000000..0c8dec447 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/optimizer_factory_test.py @@ -0,0 +1,398 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for optimizer_factory.py.""" +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from official.modeling.optimization import optimizer_factory +from official.modeling.optimization.configs import optimization_config + + +class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.parameters(('sgd'), ('rmsprop'), ('adam'), ('adamw'), ('lamb'), + ('lars'), ('adagrad')) + def test_optimizers(self, optimizer_type): + params = { + 'optimizer': { + 'type': optimizer_type + }, + 'learning_rate': { + 'type': 'constant', + 'constant': { + 'learning_rate': 0.1 + } + } + } + optimizer_cls = optimizer_factory.OPTIMIZERS_CLS[optimizer_type] + expected_optimizer_config = optimizer_cls().get_config() + expected_optimizer_config['learning_rate'] = 0.1 + + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + optimizer = opt_factory.build_optimizer(lr, postprocessor=lambda x: x) + + self.assertIsInstance(optimizer, optimizer_cls) + self.assertEqual(expected_optimizer_config, optimizer.get_config()) + + @parameterized.parameters((None, None), (1.0, None), (None, 1.0)) + def test_gradient_clipping(self, clipnorm, clipvalue): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'clipnorm': clipnorm, + 'clipvalue': clipvalue + } + }, + 'learning_rate': { + 'type': 'constant', + 'constant': { + 'learning_rate': 1.0 + } + } + } + + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + optimizer = opt_factory.build_optimizer(lr) + + var0 = tf.Variable([1.0, 2.0]) + var1 = tf.Variable([3.0, 4.0]) + + grads0 = tf.constant([0.1, 0.1]) + grads1 = tf.constant([2.0, 3.0]) + + grads_and_vars = list(zip([grads0, grads1], [var0, var1])) + optimizer.apply_gradients(grads_and_vars) + + self.assertAllClose(np.array([0.9, 1.9]), var0.numpy()) + if clipvalue is not None: + self.assertAllClose(np.array([2.0, 3.0]), var1.numpy()) + elif clipnorm is not None: + self.assertAllClose(np.array([2.4452999, 3.1679497]), var1.numpy()) + else: + self.assertAllClose(np.array([1.0, 1.0]), var1.numpy()) + + def test_missing_types(self): + params = {'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}}} + with self.assertRaises(ValueError): + optimizer_factory.OptimizerFactory( + optimization_config.OptimizationConfig(params)) + params = { + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': { + 'boundaries': [10000, 20000], + 'values': [0.1, 0.01, 0.001] + } + } + } + with self.assertRaises(ValueError): + optimizer_factory.OptimizerFactory( + optimization_config.OptimizationConfig(params)) + + +# TODO(b/187559334) refactor lr_schedule tests into `lr_schedule_test.py`. 
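+# The schedule tests below all follow the same pattern: build an
+# OptimizationConfig from a params dict, create an OptimizerFactory from it,
+# call build_learning_rate(), and compare lr(step) with hand-computed values,
+# for example:
+#   opt_factory = optimizer_factory.OptimizerFactory(
+#       optimization_config.OptimizationConfig(params))
+#   lr = opt_factory.build_learning_rate()
+#   self.assertAlmostEqual(lr(step).numpy(), expected_value)
+# Only the params dict differs from test to test.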
+ + def test_stepwise_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': { + 'boundaries': [10000, 20000], + 'values': [0.1, 0.01, 0.001] + } + } + } + expected_lr_step_values = [[0, 0.1], [5000, 0.1], [10000, 0.1], + [10001, 0.01], [20000, 0.01], [20001, 0.001]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_stepwise_lr_with_warmup_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': { + 'boundaries': [10000, 20000], + 'values': [0.1, 0.01, 0.001] + } + }, + 'warmup': { + 'type': 'linear', + 'linear': { + 'warmup_steps': 500, + 'warmup_learning_rate': 0.01 + } + } + } + expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1], [5500, 0.1], + [10000, 0.1], [10001, 0.01], [20000, 0.01], + [20001, 0.001]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_exponential_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'exponential', + 'exponential': { + 'initial_learning_rate': 0.1, + 'decay_steps': 1000, + 'decay_rate': 0.96, + 'staircase': True + } + } + } + expected_lr_step_values = [ + [0, 0.1], + [999, 0.1], + [1000, 0.096], + [1999, 0.096], + [2000, 0.09216], + ] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_polynomial_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 0.1, + 'decay_steps': 1000, + 'end_learning_rate': 0.001 + } + } + } + + expected_lr_step_values = [[0, 0.1], [500, 0.0505], [1000, 0.001]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_cosine_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'cosine', + 'cosine': { + 'initial_learning_rate': 0.1, + 'decay_steps': 1000 + } + } + } + expected_lr_step_values = [[0, 0.1], [250, 0.08535534], [500, 0.04999999], + [750, 0.01464466], [1000, 0]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_constant_lr_with_warmup_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'constant', + 'constant': { + 
'learning_rate': 0.1 + } + }, + 'warmup': { + 'type': 'linear', + 'linear': { + 'warmup_steps': 500, + 'warmup_learning_rate': 0.01 + } + } + } + + expected_lr_step_values = [[0, 0.01], [250, 0.055], [500, 0.1], [5000, 0.1], + [10000, 0.1], [20000, 0.1]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_stepwise_lr_with_polynomial_warmup_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': { + 'boundaries': [10000, 20000], + 'values': [0.1, 0.01, 0.001] + } + }, + 'warmup': { + 'type': 'polynomial', + 'polynomial': { + 'warmup_steps': 500, + 'power': 2. + } + } + } + expected_lr_step_values = [[0, 0.0], [250, 0.025], [500, 0.1], [5500, 0.1], + [10000, 0.1], [10001, 0.01], [20000, 0.01], + [20001, 0.001]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value, places=6) + + def test_power_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'power', + 'power': { + 'initial_learning_rate': 1.0, + 'power': -1.0 + } + } + } + expected_lr_step_values = [[0, 1.0], [1, 1.0], [250, 1. / 250.]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_power_linear_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'power_linear', + 'power_linear': { + 'initial_learning_rate': 1.0, + 'power': -1.0, + 'linear_decay_fraction': 0.5, + 'total_decay_steps': 100, + 'offset': 0, + } + } + } + expected_lr_step_values = [[0, 1.0], [1, 1.0], [40, 1. / 40.], + [60, 1. / 60. * 0.8]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + + def test_power_with_offset_lr_schedule(self): + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': { + 'momentum': 0.9 + } + }, + 'learning_rate': { + 'type': 'power_with_offset', + 'power_with_offset': { + 'initial_learning_rate': 1.0, + 'power': -1.0, + 'offset': 10, + 'pre_offset_learning_rate': 3.0, + } + } + } + expected_lr_step_values = [[1, 3.0], [10, 3.0], [20, 1. 
/ 10.]] + opt_config = optimization_config.OptimizationConfig(params) + opt_factory = optimizer_factory.OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + + for step, value in expected_lr_step_values: + self.assertAlmostEqual(lr(step).numpy(), value) + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/optimization/slide_optimizer.py b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/slide_optimizer.py new file mode 100644 index 000000000..c1975a311 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/optimization/slide_optimizer.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""SLIDE optimizer. + +A new optimizer that will be open sourced soon. +""" + +SLIDE = "Unimplemented" diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/performance.py b/cv/classification/resnet50/tensorflow2.0/modeling/performance.py new file mode 100644 index 000000000..9dd2438f4 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/performance.py @@ -0,0 +1,55 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions and classes related to training performance.""" + +import tensorflow as tf + + +def configure_optimizer(optimizer, + use_float16=False, + use_graph_rewrite=False, + loss_scale='dynamic'): + """Configures optimizer object with performance options.""" + if use_float16: + if loss_scale == 'dynamic': + optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer) + else: + # loss_scale is a number. We interpret that as a fixed loss scale. + optimizer = tf.keras.mixed_precision.LossScaleOptimizer( + optimizer, dynamic=False, initial_scale=loss_scale) + if use_graph_rewrite: + # Note: the model dtype must be 'float32', which will ensure + # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not + # double up. + optimizer = ( + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + optimizer)) + return optimizer + + +def set_mixed_precision_policy(dtype, loss_scale=None): + """Sets the global `tf.keras.mixed_precision.Policy`.""" + # TODO(b/191894773): Remove loss_scale argument + assert loss_scale is None, ( + 'The loss_scale argument must be None. 
The argument exists for ' + 'historical reasons and will be removed soon.') + if dtype == tf.float16: + tf.keras.mixed_precision.set_global_policy('mixed_float16') + elif dtype == tf.bfloat16: + tf.keras.mixed_precision.set_global_policy('mixed_bfloat16') + elif dtype == tf.float32: + tf.keras.mixed_precision.set_global_policy('float32') + else: + raise ValueError('Unexpected dtype: %s' % dtype) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/policies.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/policies.py new file mode 100644 index 000000000..14155214d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/policies.py @@ -0,0 +1,173 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base ProgressivePolicy definition for progressive training. + +To write a progressive model, subclass ProgressivePolicy and implement its +abstract methods to handle each training stage. +""" + +import abc +from typing import Any, Mapping +from absl import logging +import dataclasses +import six +import tensorflow as tf +from modeling.hyperparams import base_config +from modeling.progressive import utils + + +@dataclasses.dataclass +class ProgressiveConfig(base_config.Config): + pass + + +@six.add_metaclass(abc.ABCMeta) +class ProgressivePolicy: + """The APIs for handling progressive training stages. + + Attributes: + cur_model: The model for the current progressive training stage. + cur_train_dataset: The train dataset function for the current stage. + cur_eval_dataset: The eval dataset function for the current stage. + cur_optimizer: The optimizer for the current stage. + cur_checkpoint_items: Items to be saved in and restored from checkpoints, + for the progressive trainer. + is_last_stage: Whether it is currently in the last stage. + + Interfaces: + is_stage_advancing: Returns if progressive training is advancing to the + next stage. + update_pt_stage: Update progressive training stage. + """ + + def __init__(self): + """Initialize stage policy.""" + self._cur_train_dataset = None + self._cur_eval_dataset = None + self._volatiles = utils.VolatileTrackable(optimizer=None, model=None) + + stage_id = 0 + self._stage_id = tf.Variable( + stage_id, + trainable=False, + dtype=tf.int64, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + shape=[]) + self._volatiles.reassign_trackable( + optimizer=self.get_optimizer(stage_id), + model=self.get_model(stage_id, old_model=None)) + + def compute_stage_id(self, global_step: int) -> int: + for stage_id in range(self.num_stages()): + global_step -= self.num_steps(stage_id) + if global_step < 0: + return stage_id + logging.error('Global step %d found no matching progressive stages. 
' + 'Default to the last stage.', global_step) + return self.num_stages() - 1 + + @abc.abstractmethod + def num_stages(self) -> int: + """Return the total number of progressive stages.""" + pass + + @abc.abstractmethod + def num_steps(self, stage_id: int) -> int: + """Return the total number of steps in this stage.""" + pass + + @abc.abstractmethod + def get_model(self, + stage_id: int, + old_model: tf.keras.Model = None) -> tf.keras.Model: + """Return model for this stage. For initialization, `old_model` = None.""" + pass + + @abc.abstractmethod + def get_optimizer(self, stage_id: int) -> tf.keras.optimizers.Optimizer: + """Return optimizer for this stage.""" + pass + + @abc.abstractmethod + def get_train_dataset(self, stage_id: int) -> tf.data.Dataset: + """Return training Dataset for this stage.""" + pass + + @abc.abstractmethod + def get_eval_dataset(self, stage_id: int) -> tf.data.Dataset: + """Return evaluation Dataset for this stage.""" + pass + + @property + def cur_model(self) -> tf.keras.Model: + return self._volatiles.model + + @property + def cur_train_dataset(self) -> tf.data.Dataset: + if self._cur_train_dataset is None: + self._cur_train_dataset = self.get_train_dataset(self._stage_id.numpy()) + return self._cur_train_dataset + + @property + def cur_eval_dataset(self) -> tf.data.Dataset: + if self._cur_eval_dataset is None: + self._cur_eval_dataset = self.get_eval_dataset(self._stage_id.numpy()) + return self._cur_eval_dataset + + @property + def cur_optimizer(self) -> tf.keras.optimizers.Optimizer: + return self._volatiles.optimizer + + @property + def is_last_stage(self) -> bool: + stage_id = self._stage_id.numpy() + return stage_id >= self.num_stages() - 1 + + @property + def cur_checkpoint_items(self) -> Mapping[str, Any]: + return dict(stage_id=self._stage_id, volatiles=self._volatiles) + + def is_stage_advancing(self, global_step: int) -> bool: + old_stage_id = self._stage_id.numpy() + new_stage_id = self.compute_stage_id(global_step) + return old_stage_id != new_stage_id + + def update_pt_stage(self, global_step: int, pass_old_model=True) -> None: + """Update progressive training internal status. + + Call this after a training loop ends. + + Args: + global_step: an integer scalar of the current global step. + pass_old_model: whether to pass the old_model to get_model() function. + This is set to False if the old_model is irrelevant (e.g, just a default + model from stage 0). + """ + old_stage_id = self._stage_id.numpy() + new_stage_id = self.compute_stage_id(global_step) + logging.info('Switching stage from %d to %d', old_stage_id, new_stage_id) + + # Update stage id. + self._stage_id.assign(new_stage_id) + # Update dataset function. + self._cur_train_dataset = None + self._cur_eval_dataset = None + + # Update optimizer and model. + new_optimizer = self.get_optimizer(new_stage_id) + self._volatiles.reassign_trackable(optimizer=new_optimizer) + new_model = self.get_model( + new_stage_id, old_model=self.cur_model if pass_old_model else None) + self._volatiles.reassign_trackable(model=new_model) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train.py new file mode 100644 index 000000000..5c106687d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train.py @@ -0,0 +1,69 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TFM binary for the progressive trainer.""" + +from absl import app +from absl import flags +import gin + +from common import distribute_utils +# pylint: disable=unused-import +from common import registry_imports +# pylint: enable=unused-import +from common import flags as tfm_flags +from core import task_factory +from core import train_utils +from modeling import performance +from modeling.progressive import train_lib + +FLAGS = flags.FLAGS + + +def main(_): + gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) + params = train_utils.parse_configuration(FLAGS) + model_dir = FLAGS.model_dir + if 'train' in FLAGS.mode: + # Pure eval modes do not output yaml files. Otherwise continuous eval job + # may race against the train job for writing the same file. + train_utils.serialize_config(params, model_dir) + + # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' + # can have significant impact on model speeds by utilizing float16 in case of + # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when + # dtype is float16 + if params.runtime.mixed_precision_dtype: + performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) + distribution_strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=params.runtime.distribution_strategy, + all_reduce_alg=params.runtime.all_reduce_alg, + num_gpus=params.runtime.num_gpus, + tpu_address=params.runtime.tpu, + **params.runtime.model_parallelism()) + with distribution_strategy.scope(): + task = task_factory.get_task(params.task, logging_dir=model_dir) + + train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode=FLAGS.mode, + params=params, + model_dir=model_dir) + + train_utils.save_gin_config(FLAGS.mode, model_dir) + +if __name__ == '__main__': + tfm_flags.define_flags() + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib.py new file mode 100644 index 000000000..409c2108f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib.py @@ -0,0 +1,126 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TFM progressive training driver library. 
+ +Compared to the common training driver, the only difference is that we use +prog_trainer_lib.ProgressiveTrainer instead of the base trainer. +""" + +# pytype: disable=attribute-error +import os +from typing import Any, Mapping, Tuple + +# Import libraries +from absl import logging +import orbit +import tensorflow as tf +from core import base_task +from core import config_definitions +from core import train_lib as base_train_lib +from modeling.progressive import trainer as prog_trainer_lib + + +def run_experiment(distribution_strategy: tf.distribute.Strategy, + task: base_task.Task, + mode: str, + params: config_definitions.ExperimentConfig, + model_dir: str, + run_post_eval: bool = False, + save_summary: bool = True) \ +-> Tuple[tf.keras.Model, Mapping[str, Any]]: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + task: A Task instance. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: ExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + run_post_eval: Whether to run post eval once after training, metrics logs + are returned. + save_summary: Whether to save train and validation summary. + + Returns: + A 2-tuple of (model, eval_logs). + model: `tf.keras.Model` instance. + eval_logs: returns eval metrics logs when run_post_eval is set to True, + otherwise, returns {}. + """ + + with distribution_strategy.scope(): + logging.info('Running progressive trainer.') + trainer = prog_trainer_lib.ProgressiveTrainer( + params, task, ckpt_dir=model_dir, + train='train' in mode, + evaluate=('eval' in mode) or run_post_eval, + checkpoint_exporter=base_train_lib.maybe_create_best_ckpt_exporter( + params, model_dir)) + + if trainer.checkpoint: + checkpoint_manager = tf.train.CheckpointManager( + trainer.checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=trainer.global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=trainer.initialize) + else: + checkpoint_manager = None + + controller = orbit.Controller( + strategy=distribution_strategy, + trainer=trainer if 'train' in mode else None, + evaluator=trainer, + global_step=trainer.global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train') if (save_summary) else None, + eval_summary_dir=os.path.join(model_dir, 'validation') if + (save_summary) else None, + summary_interval=params.trainer.summary_interval if + (save_summary) else None) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if trainer.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + if run_post_eval: + with distribution_strategy.scope(): + 
return trainer.model, trainer.evaluate( + tf.convert_to_tensor(params.trainer.validation_steps)) + else: + return trainer.model, {} diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib_test.py new file mode 100644 index 000000000..178fc6df7 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/train_lib_test.py @@ -0,0 +1,183 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the progressive train_lib.""" +import os + +from absl import flags +from absl.testing import parameterized +import dataclasses +import orbit +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.common import flags as tfm_flags +# pylint: disable=unused-import +from official.common import registry_imports +# pylint: enable=unused-import +from official.core import config_definitions as cfg +from official.core import task_factory +from official.modeling import optimization +from official.modeling.hyperparams import params_dict +from official.modeling.progressive import policies +from official.modeling.progressive import train_lib +from official.modeling.progressive import trainer as prog_trainer_lib +from official.utils.testing import mock_task + +FLAGS = flags.FLAGS + +tfm_flags.define_flags() + + +@dataclasses.dataclass +class ProgTaskConfig(cfg.TaskConfig): + pass + + +@task_factory.register_task_cls(ProgTaskConfig) +class ProgMockTask(policies.ProgressivePolicy, mock_task.MockTask): + """Progressive task for testing.""" + + def __init__(self, params: cfg.TaskConfig, logging_dir: str = None): + mock_task.MockTask.__init__( + self, params=params, logging_dir=logging_dir) + policies.ProgressivePolicy.__init__(self) + + def num_stages(self): + return 2 + + def num_steps(self, stage_id): + return 2 if stage_id == 0 else 4 + + def get_model(self, stage_id, old_model=None): + del stage_id, old_model + return self.build_model() + + def get_optimizer(self, stage_id): + """Build optimizer for each stage.""" + params = optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 0.01, + 'end_learning_rate': 0.0, + 'power': 1.0, + 'decay_steps': 10, + }, + }, + 'warmup': { + 'polynomial': { + 'power': 1, + 'warmup_steps': 2, + }, + 'type': 'polynomial', + } + }) + opt_factory = optimization.OptimizerFactory(params) + optimizer = opt_factory.build_optimizer(opt_factory.build_learning_rate()) + + return optimizer + + def get_train_dataset(self, stage_id): + del stage_id + strategy = tf.distribute.get_strategy() + return orbit.utils.make_distributed_dataset( + strategy, self.build_inputs, None) + + def get_eval_dataset(self, stage_id): + del stage_id + strategy = tf.distribute.get_strategy() + return 
orbit.utils.make_distributed_dataset( + strategy, self.build_inputs, None) + + +class TrainTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super(TrainTest, self).setUp() + self._test_config = { + 'trainer': { + 'checkpoint_interval': 10, + 'steps_per_loop': 10, + 'summary_interval': 10, + 'train_steps': 10, + 'validation_steps': 5, + 'validation_interval': 10, + 'continuous_eval_timeout': 1, + 'optimizer_config': { + 'optimizer': { + 'type': 'sgd', + }, + 'learning_rate': { + 'type': 'constant' + } + } + }, + } + + @combinations.generate( + combinations.combine( + distribution_strategy=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + flag_mode=['train', 'eval', 'train_and_eval'], + run_post_eval=[True, False])) + def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval): + model_dir = self.get_temp_dir() + experiment_config = cfg.ExperimentConfig( + trainer=prog_trainer_lib.ProgressiveTrainerConfig(), + task=ProgTaskConfig()) + experiment_config = params_dict.override_params_dict( + experiment_config, self._test_config, is_strict=False) + + with distribution_strategy.scope(): + task = task_factory.get_task(experiment_config.task, + logging_dir=model_dir) + + _, logs = train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode=flag_mode, + params=experiment_config, + model_dir=model_dir, + run_post_eval=run_post_eval) + + if run_post_eval: + self.assertNotEmpty(logs) + else: + self.assertEmpty(logs) + + if flag_mode == 'eval': + return + self.assertNotEmpty( + tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint'))) + # Tests continuous evaluation. + _, logs = train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode='continuous_eval', + params=experiment_config, + model_dir=model_dir, + run_post_eval=run_post_eval) + print(logs) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer.py new file mode 100644 index 000000000..bc94c1632 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer.py @@ -0,0 +1,294 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Progressive Trainer implementation. + +The trainer implements the Orbit `StandardTrainable` and +`StandardEvaluable` interfaces. Trainers inside this project should be +interchangable and independent on model architectures and tasks. 
+""" +import os +from typing import Any, Optional + +# Import libraries +from absl import logging + +import dataclasses +import gin +import orbit +import tensorflow as tf +from core import base_task +from core import base_trainer as trainer_lib +from core import config_definitions +from modeling.progressive import policies +from modeling.progressive import utils + +ExperimentConfig = config_definitions.ExperimentConfig + + +@dataclasses.dataclass +class ProgressiveTrainerConfig(config_definitions.TrainerConfig): + """Configuration for progressive trainer. + + Attributes: + progressive: A task-specific config. Users can subclass ProgressiveConfig + and define any task-specific settings in their subclass. + export_checkpoint: A bool. Whether to export checkpoints in non-progressive + manner (without the volatiles wrapper) such that your down-stream tasks + can load checkpoints from a progressive trainer as if it is a regular + checkpoint. + export_checkpoint_interval: A bool. The number of steps between exporting + checkpoints. If None (by default), will use the same value as + TrainerConfig.checkpoint_interval. + export_max_to_keep: The maximum number of exported checkpoints to keep. + If None (by default), will use the same value as + TrainerConfig.max_to_keep. + export_only_final_stage_ckpt: A bool. Whether to just export checkpoints + during the final progressive training stage. In other words, whether to + not export small, partial models. In many cases, it is not meaningful to + finetune a small, partial model in down-stream tasks. + """ + progressive: Optional[policies.ProgressiveConfig] = None + export_checkpoint: bool = True + export_checkpoint_interval: Optional[int] = None + export_max_to_keep: Optional[int] = None + export_only_final_stage_ckpt: bool = True + + +@gin.configurable +class ProgressiveTrainer(trainer_lib.Trainer): + """Implements the progressive trainer shared for TensorFlow models.""" + + def __init__( + self, + config: ExperimentConfig, + prog_task: base_task.Task, # also implemented ProgressivePolicy. + ckpt_dir: str = '', + train: bool = True, + evaluate: bool = True, + checkpoint_exporter: Any = None): + """Initialize common trainer for TensorFlow models. + + Args: + config: An `ExperimentConfig` instance specifying experiment config. + prog_task: An instance both implemented policies.ProgressivePolicy and + base_task.Task. + ckpt_dir: Checkpoint directory. + train: bool, whether or not this trainer will be used for training. + default to True. + evaluate: bool, whether or not this trainer will be used for evaluation. + default to True. + checkpoint_exporter: an object that has the `maybe_export_checkpoint` + interface. + """ + # Gets the current distribution strategy. If not inside any strategy scope, + # it gets a single-replica no-op strategy. + self._strategy = tf.distribute.get_strategy() + self._config = config + self._runtime_options = trainer_lib.get_runtime_options(config) + self._task = prog_task + + # Directory for non-progressive checkpoint + self._export_ckpt_dir = os.path.join(ckpt_dir, 'exported_ckpts') + tf.io.gfile.makedirs(self._export_ckpt_dir) + self._export_ckpt_manager = None + + # Receive other checkpoint export, e.g, best checkpoint exporter. + # TODO(lehou): unify the checkpoint exporting logic, although the default + # setting does not use checkpoint_exporter. 
+ self._checkpoint_exporter = checkpoint_exporter + + self._global_step = orbit.utils.create_global_step() + + self._checkpoint = utils.CheckpointWithHooks( + before_load_hook=self._update_pt_stage_from_ckpt, + global_step=self.global_step, + **self._task.cur_checkpoint_items) + + self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32) + self._validation_loss = tf.keras.metrics.Mean( + 'validation_loss', dtype=tf.float32) + self._train_metrics = self.task.build_metrics( + training=True) + self.model.metrics + self._validation_metrics = self.task.build_metrics( + training=False) + self.model.metrics + + if train: + orbit.StandardTrainer.__init__( + self, + None, # Manage train_dataset by ourselves, not by StandardTrainer. + options=orbit.StandardTrainerOptions( + use_tf_while_loop=config.trainer.train_tf_while_loop, + use_tf_function=config.trainer.train_tf_function)) + + if evaluate: + orbit.StandardEvaluator.__init__( + self, + None, # Manage eval_dataset by ourselves, not by StandardEvaluator. + options=orbit.StandardEvaluatorOptions( + use_tf_function=config.trainer.eval_tf_function)) + + @property + def model(self): + return self._task.cur_model + + @property + def optimizer(self): + return self._task.cur_optimizer + + # override + @property + def train_dataset(self): + """Overriding StandardTrainer.train_dataset.""" + return self._task.cur_train_dataset + + # override + @train_dataset.setter + def train_dataset(self, _): + raise SyntaxError('Please do not set train_dataset. Progressive training ' + 'relies on the progressive policy to manage the train dataset.') + + # override + @property + def eval_dataset(self): + """Overriding StandardEvaluator.eval_dataset.""" + return self._task.cur_eval_dataset + + # override + @eval_dataset.setter + def eval_dataset(self, _): + raise SyntaxError('Please do not set eval_dataset. Progressive training ' + 'relies on the progressive policy to manage the eval dataset.') + + def train_loop_end(self): + """See base class.""" + logs = {} + for metric in self.train_metrics + [self.train_loss]: + logs[metric.name] = metric.result() + metric.reset_states() + if callable(self.optimizer.learning_rate): + logs['learning_rate'] = self.optimizer.learning_rate( + self.optimizer.iterations) + else: + logs['learning_rate'] = self.optimizer.learning_rate + + self._maybe_export_non_progressive_checkpoint(self._export_ckpt_dir) + if self._task.is_stage_advancing(self.global_step.numpy()): + old_train_dataset = self.train_dataset + + # Update progressive properties + self._task.update_pt_stage(self.global_step.numpy()) + + # Setting `self._train_loop_fn` and `self._eval_loop_fn` to None will + # rebuild the train and eval functions with the updated model. + self._train_loop_fn = None + self._eval_loop_fn = None + + if self.train_dataset != old_train_dataset: + # Setting `self._train_iter` to None will rebuild the dataset iterator. + self._train_iter = None + + # Setting `self._export_ckpt_manager` to None will rebuild the checkpoint + # for exporting. + self._export_ckpt_manager = None + + return logs + + def _update_pt_stage_from_ckpt(self, ckpt_file): + """Update stage properties based on the global_step variable in a ckpt file. + + Before loading variables from a checkpoint file, we need to go to the + correct stage and build the corresponding model and optimizer, to make sure + that we restore variables of the right model and optimizer. + + Args: + ckpt_file: Checkpoint file that will be restored/read from.
+ """ + if not ckpt_file: + return + ckpt = tf.train.Checkpoint(global_step=self.global_step) + ckpt.read(ckpt_file).expect_partial().assert_existing_objects_matched() + + if self._task.is_stage_advancing(self.global_step.numpy()): + old_train_dataset = self.train_dataset + + # Update progressive properties + self._task.update_pt_stage(self.global_step.numpy(), pass_old_model=False) + + # Setting `self._train_loop_fn` and `self._eval_loop_fn` to None will + # rebuild the train and eval functions with the updated model. + self._train_loop_fn = None + self._eval_loop_fn = None + + if self.train_dataset != old_train_dataset: + # Setting `self._train_iter` to None will rebuild the dataset iterator. + self._train_iter = None + + # Setting `self._export_ckpt_manager` to None will rebuild the checkpoint + # for exporting. + self._export_ckpt_manager = None + + def _maybe_export_non_progressive_checkpoint(self, export_ckpt_dir): + """Export checkpoints in non-progressive format. + + This basically removes the wrapping of self._task.cur_checkpoint_items + -- just save the model, optimizer, etc., directly. + The purpose is to let your down-stream tasks to use these checkpoints. + + Args: + export_ckpt_dir: A str. folder of exported checkpoints. + """ + if not self.config.trainer.export_checkpoint: + logging.info('Not exporting checkpoints.') + return + if not self._task.is_last_stage and ( + self.config.trainer.export_only_final_stage_ckpt): + logging.info('Not exporting checkpoints until the last stage.') + return + + if self._export_ckpt_manager is None: + # Create a checkpoint object just now, to make sure we use + # progressive_policy.cur_model and progressive_policy.cur_optimizer of the + # current stage. + if hasattr(self.model, 'checkpoint_items'): + checkpoint_items = self.model.checkpoint_items + else: + checkpoint_items = {} + checkpoint = tf.train.Checkpoint( + global_step=self.global_step, + model=self.model, + optimizer=self.optimizer, + **checkpoint_items) + + max_to_keep = self.config.trainer.export_max_to_keep or ( + self.config.trainer.max_to_keep) + checkpoint_interval = self.config.trainer.export_checkpoint_interval or ( + self.config.trainer.checkpoint_interval) + self._export_ckpt_manager = tf.train.CheckpointManager( + checkpoint, + directory=export_ckpt_dir, + checkpoint_name='ckpt', + step_counter=self.global_step, + max_to_keep=max_to_keep, + checkpoint_interval=checkpoint_interval, + ) + + # Make sure we export the last checkpoint. + last_checkpoint = ( + self.global_step.numpy() == self._config.trainer.train_steps) + checkpoint_path = self._export_ckpt_manager.save( + checkpoint_number=self.global_step.numpy(), + check_interval=not last_checkpoint) + if checkpoint_path: + logging.info('Checkpoints exported: %s.', checkpoint_path) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer_test.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer_test.py new file mode 100644 index 000000000..7d4ab3e22 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/trainer_test.py @@ -0,0 +1,238 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the progressive trainer.""" +# pylint: disable=g-direct-tensorflow-import +import os + +from absl.testing import parameterized +import orbit +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.core import config_definitions as cfg +from official.modeling import optimization +from official.modeling.progressive import policies +from official.modeling.progressive import trainer as trainer_lib +from official.nlp.configs import bert +from official.utils.testing import mock_task + + +def all_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.default_strategy, + strategy_combinations.cloud_tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ],) + + +def get_exp_config(): + return cfg.ExperimentConfig( + task=cfg.TaskConfig( + model=bert.PretrainerConfig()), + trainer=trainer_lib.ProgressiveTrainerConfig( + export_checkpoint=True, + export_checkpoint_interval=1, + export_only_final_stage_ckpt=False)) + + +class TestPolicy(policies.ProgressivePolicy, mock_task.MockTask): + """Just for testing purposes.""" + + def __init__(self, strategy, task_config, change_train_dataset=True): + self._strategy = strategy + self._change_train_dataset = change_train_dataset + self._my_train_dataset = None + mock_task.MockTask.__init__(self, params=task_config, logging_dir=None) + policies.ProgressivePolicy.__init__(self) + + def num_stages(self) -> int: + return 2 + + def num_steps(self, stage_id: int) -> int: + return 2 if stage_id == 0 else 4 + + def get_model(self, + stage_id: int, + old_model: tf.keras.Model) -> tf.keras.Model: + del stage_id, old_model + return self.build_model() + + def get_optimizer(self, stage_id: int) -> tf.keras.optimizers.Optimizer: + optimizer_type = 'sgd' if stage_id == 0 else 'adamw' + optimizer_config = cfg.OptimizationConfig({ + 'optimizer': {'type': optimizer_type}, + 'learning_rate': {'type': 'constant'}}) + opt_factory = optimization.OptimizerFactory(optimizer_config) + return opt_factory.build_optimizer(opt_factory.build_learning_rate()) + + def get_train_dataset(self, stage_id: int) -> tf.data.Dataset: + if not self._change_train_dataset and self._my_train_dataset: + return self._my_train_dataset + if self._strategy: + self._my_train_dataset = orbit.utils.make_distributed_dataset( + self._strategy, + self._build_inputs, + stage_id) + else: + self._my_train_dataset = self._build_inputs(stage_id) + return self._my_train_dataset + + def get_eval_dataset(self, stage_id: int) -> tf.data.Dataset: + if self._strategy: + return orbit.utils.make_distributed_dataset( + self._strategy, + self._build_inputs, + stage_id) + return self._build_inputs(stage_id) + + def _build_inputs(self, stage_id): + def dummy_data(_): + batch_size = 2 if stage_id == 0 else 1 + x = tf.zeros(shape=(batch_size, 2), dtype=tf.float32) + label = tf.zeros(shape=(batch_size, 1), dtype=tf.float32) + return x, label + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + return dataset.map( + dummy_data, 
num_parallel_calls=tf.data.experimental.AUTOTUNE) + + +class TrainerTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super(TrainerTest, self).setUp() + self._config = get_exp_config() + + def create_test_trainer(self, distribution, model_dir, change_train_dataset): + trainer = trainer_lib.ProgressiveTrainer( + self._config, + prog_task=TestPolicy( + distribution, self._config.task, change_train_dataset), + ckpt_dir=model_dir) + return trainer + + @combinations.generate(all_strategy_combinations()) + def test_checkpointing(self, distribution): + model_dir = self.get_temp_dir() + ckpt_file = os.path.join(model_dir, 'ckpt') + with distribution.scope(): + trainer = self.create_test_trainer(distribution, model_dir, True) + self.assertFalse(trainer._task.is_last_stage) + trainer.train(tf.convert_to_tensor(4, dtype=tf.int32)) + self.assertTrue(trainer._task.is_last_stage) + trainer.checkpoint.save(ckpt_file) + + trainer = self.create_test_trainer(distribution, model_dir, True) + self.assertFalse(trainer._task.is_last_stage) + trainer.checkpoint.restore(ckpt_file + '-1') + self.assertTrue(trainer._task.is_last_stage) + + @combinations.generate(all_strategy_combinations()) + def test_train_dataset(self, distribution): + model_dir = self.get_temp_dir() + with distribution.scope(): + trainer = self.create_test_trainer(distribution, model_dir, True) + # Using dataset of stage == 0 + train_iter = tf.nest.map_structure(iter, trainer.train_dataset) + train_data = train_iter.next()[0] + if distribution.num_replicas_in_sync > 1: + train_data = train_data.values[0] + self.assertEqual(train_data.shape[0], 2) + + trainer.train(tf.convert_to_tensor(4, dtype=tf.int32)) + # Using dataset of stage == 1 + train_iter = tf.nest.map_structure(iter, trainer.train_dataset) + train_data = train_iter.next()[0] + if distribution.num_replicas_in_sync > 1: + train_data = train_data.values[0] + self.assertEqual(train_data.shape[0], 1) + + with self.assertRaises(SyntaxError): + trainer.train_dataset = None + + @combinations.generate(all_strategy_combinations()) + def test_train_dataset_no_switch(self, distribution): + model_dir = self.get_temp_dir() + with distribution.scope(): + trainer = self.create_test_trainer(distribution, model_dir, False) + trainer.train(tf.convert_to_tensor(2, dtype=tf.int32)) + # _train_iter is not reset since the dataset is not changed. + self.assertIsNotNone(trainer._train_iter) + with distribution.scope(): + trainer = self.create_test_trainer(distribution, model_dir, True) + trainer.train(tf.convert_to_tensor(2, dtype=tf.int32)) + # _train_iter is reset since the dataset changed. 
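+      # (With change_train_dataset=True, TestPolicy builds a new dataset
+      # object when the stage advances from 0 to 1 after 2 steps, so
+      # train_loop_end drops the cached iterator.)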
+ self.assertIsNone(trainer._train_iter) + + +class TrainerWithMaskedLMTaskTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super(TrainerWithMaskedLMTaskTest, self).setUp() + self._config = get_exp_config() + + def create_test_trainer(self, distribution): + trainer = trainer_lib.ProgressiveTrainer( + self._config, + prog_task=TestPolicy(distribution, self._config.task), + ckpt_dir=self.get_temp_dir()) + return trainer + + @combinations.generate(all_strategy_combinations()) + def test_trainer_train(self, distribution): + with distribution.scope(): + trainer = self.create_test_trainer(distribution) + logs = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', logs) + self.assertIn('learning_rate', logs) + + @combinations.generate(all_strategy_combinations()) + def test_trainer_validate(self, distribution): + with distribution.scope(): + trainer = self.create_test_trainer(distribution) + logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('validation_loss', logs) + self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) + + @combinations.generate( + combinations.combine( + mixed_precision_dtype=['float32', 'bfloat16', 'float16'], + loss_scale=[None, 'dynamic', 128, 256], + )) + def test_configure_optimizer(self, mixed_precision_dtype, loss_scale): + config = cfg.ExperimentConfig( + task=cfg.TaskConfig( + model=bert.PretrainerConfig()), + runtime=cfg.RuntimeConfig( + mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale), + trainer=trainer_lib.ProgressiveTrainerConfig( + export_checkpoint=True, + export_checkpoint_interval=1, + export_only_final_stage_ckpt=False)) + task = TestPolicy(None, config.task) + trainer = trainer_lib.ProgressiveTrainer(config, task, self.get_temp_dir()) + if mixed_precision_dtype != 'float16': + self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) + elif mixed_precision_dtype == 'float16' and loss_scale is None: + self.assertIsInstance(trainer.optimizer, tf.keras.optimizers.SGD) + + metrics = trainer.train(tf.convert_to_tensor(5, dtype=tf.int32)) + self.assertIn('training_loss', metrics) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/progressive/utils.py b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/utils.py new file mode 100644 index 000000000..192170cb8 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/progressive/utils.py @@ -0,0 +1,56 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Util classes and functions.""" + +from absl import logging +import tensorflow as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.training.tracking import tracking + + +class VolatileTrackable(tracking.AutoTrackable): + """A util class to keep Trackables that might change instances.""" + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + def reassign_trackable(self, **kwargs): + for k, v in kwargs.items(): + delattr(self, k) # untrack this object + setattr(self, k, v) # track the new object + + +class CheckpointWithHooks(tf.train.Checkpoint): + """Same as tf.train.Checkpoint but supports hooks. + + In progressive training, use this class instead of tf.train.Checkpoint. + + Since the network architecture changes during progressive training, we need to + prepare something (like switch to the correct architecture) before loading the + checkpoint. This class supports a hook that will be executed before checkpoint + loading. + """ + + def __init__(self, before_load_hook, **kwargs): + self._before_load_hook = before_load_hook + super(CheckpointWithHooks, self).__init__(**kwargs) + + # override + def read(self, save_path, options=None): + self._before_load_hook(save_path) + logging.info('Ran before_load_hook.') + super(CheckpointWithHooks, self).read(save_path=save_path, options=options) diff --git a/cv/classification/resnet50/tensorflow2.0/modeling/tf_utils.py b/cv/classification/resnet50/tensorflow2.0/modeling/tf_utils.py new file mode 100644 index 000000000..199662f74 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/modeling/tf_utils.py @@ -0,0 +1,200 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common TF utilities.""" + +import six +import tensorflow as tf + +from tensorflow.python.util import deprecation +from modeling import activations + + +@deprecation.deprecated( + None, + "tf.keras.layers.Layer supports multiple positional args and kwargs as " + "input tensors. pack/unpack inputs to override __call__ is no longer " + "needed.") +def pack_inputs(inputs): + """Pack a list of `inputs` tensors to a tuple. + + Args: + inputs: a list of tensors. + + Returns: + a tuple of tensors. if any input is None, replace it with a special constant + tensor. + """ + inputs = tf.nest.flatten(inputs) + outputs = [] + for x in inputs: + if x is None: + outputs.append(tf.constant(0, shape=[], dtype=tf.int32)) + else: + outputs.append(x) + return tuple(outputs) + + +@deprecation.deprecated( + None, + "tf.keras.layers.Layer supports multiple positional args and kwargs as " + "input tensors. pack/unpack inputs to override __call__ is no longer " + "needed.") +def unpack_inputs(inputs): + """unpack a tuple of `inputs` tensors to a tuple. + + Args: + inputs: a list of tensors. + + Returns: + a tuple of tensors. if any input is a special constant tensor, replace it + with None. 
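+
+  Example (a minimal sketch of the round trip with the functions above):
+    packed = pack_inputs([tf.constant(1.0), None])
+    first, second = unpack_inputs(packed)
+    # first == tf.constant(1.0); second is None again.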
+ """ + inputs = tf.nest.flatten(inputs) + outputs = [] + for x in inputs: + if is_special_none_tensor(x): + outputs.append(None) + else: + outputs.append(x) + x = tuple(outputs) + + # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check + # from triggering. + if len(x) == 1: + return x[0] + return tuple(outputs) + + +def is_special_none_tensor(tensor): + """Checks if a tensor is a special None Tensor.""" + return tensor.shape.ndims == 0 and tensor.dtype == tf.int32 + + +def get_activation(identifier, use_keras_layer=False): + """Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`. + + It checks string first and if it is one of customized activation not in TF, + the corresponding activation will be returned. For non-customized activation + names and callable identifiers, always fallback to tf.keras.activations.get. + + Prefers using keras layers when use_keras_layer=True. Now it only supports + 'relu', 'linear', 'identity', 'swish'. + + Args: + identifier: String name of the activation function or callable. + use_keras_layer: If True, use keras layer if identifier is allow-listed. + + Returns: + A Python function corresponding to the activation function or a keras + activation layer when use_keras_layer=True. + """ + if isinstance(identifier, six.string_types): + identifier = str(identifier).lower() + if use_keras_layer: + keras_layer_allowlist = { + "relu": "relu", + "linear": "linear", + "identity": "linear", + "swish": "swish", + "relu6": tf.nn.relu6, + } + if identifier in keras_layer_allowlist: + return tf.keras.layers.Activation(keras_layer_allowlist[identifier]) + name_to_fn = { + "gelu": activations.gelu, + "simple_swish": activations.simple_swish, + "hard_swish": activations.hard_swish, + "relu6": activations.relu6, + "hard_sigmoid": activations.hard_sigmoid, + "identity": activations.identity, + } + if identifier in name_to_fn: + return tf.keras.activations.get(name_to_fn[identifier]) + return tf.keras.activations.get(identifier) + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. 
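+
+  Example (illustrative):
+    assert_rank(tf.zeros([8, 16]), expected_rank=[2, 3], name='inputs')
+    # Passes, because rank 2 is in the allowed set {2, 3}.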
+ """ + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + raise ValueError( + "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not " + "equal to the expected tensor rank `%s`" % + (name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def safe_mean(losses): + """Computes a safe mean of the losses. + + Args: + losses: `Tensor` whose elements contain individual loss measurements. + + Returns: + A scalar representing the mean of `losses`. If `num_present` is zero, + then zero is returned. + """ + total = tf.reduce_sum(losses) + num_elements = tf.cast(tf.size(losses), dtype=losses.dtype) + return tf.math.divide_no_nan(total, num_elements) diff --git a/cv/classification/resnet50/tensorflow2.0/optimizer_factory.py b/cv/classification/resnet50/tensorflow2.0/optimizer_factory.py new file mode 100644 index 000000000..5febd265a --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/optimizer_factory.py @@ -0,0 +1,183 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimizer factory for vision tasks.""" +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +from typing import Any, Dict, Optional, Text + +from absl import logging +import tensorflow as tf +# import tensorflow_addons as tfa + +from modeling import optimization +import learning_rate +from configs import base_configs + +# pylint: disable=protected-access + + +def build_optimizer( + optimizer_name: Text, + base_learning_rate: tf.keras.optimizers.schedules.LearningRateSchedule, + params: Dict[Text, Any], + model: Optional[tf.keras.Model] = None): + """Build the optimizer based on name. + + Args: + optimizer_name: String representation of the optimizer name. Examples: sgd, + momentum, rmsprop. + base_learning_rate: `tf.keras.optimizers.schedules.LearningRateSchedule` + base learning rate. + params: String -> Any dictionary representing the optimizer params. This + should contain optimizer specific parameters such as `base_learning_rate`, + `decay`, etc. + model: The `tf.keras.Model`. This is used for the shadow copy if using + `ExponentialMovingAverage`. + + Returns: + A tf.keras.Optimizer. + + Raises: + ValueError if the provided optimizer_name is not supported. 
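+
+  Example (illustrative values):
+    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(0.1, 1000, 0.9)
+    optimizer = build_optimizer(
+        'momentum', base_learning_rate=lr_schedule,
+        params={'momentum': 0.9, 'nesterov': True})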
+ + """ + optimizer_name = optimizer_name.lower() + logging.info('Building %s optimizer with params %s', optimizer_name, params) + + if optimizer_name == 'sgd': + logging.info('Using SGD optimizer') + nesterov = params.get('nesterov', False) + optimizer = tf.keras.optimizers.SGD( + learning_rate=base_learning_rate, nesterov=nesterov) + elif optimizer_name == 'momentum': + logging.info('Using momentum optimizer') + nesterov = params.get('nesterov', False) + optimizer = tf.keras.optimizers.SGD( + learning_rate=base_learning_rate, + momentum=params['momentum'], + nesterov=nesterov, + jit_compile=False) + elif optimizer_name == 'rmsprop': + logging.info('Using RMSProp') + rho = params.get('decay', None) or params.get('rho', 0.9) + momentum = params.get('momentum', 0.9) + epsilon = params.get('epsilon', 1e-07) + optimizer = tf.keras.optimizers.RMSprop( + learning_rate=base_learning_rate, + rho=rho, + momentum=momentum, + epsilon=epsilon) + elif optimizer_name == 'adam': + logging.info('Using Adam') + beta_1 = params.get('beta_1', 0.9) + beta_2 = params.get('beta_2', 0.999) + epsilon = params.get('epsilon', 1e-07) + optimizer = tf.keras.optimizers.Adam( + learning_rate=base_learning_rate, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon) +# elif optimizer_name == 'adamw': +# logging.info('Using AdamW') +# weight_decay = params.get('weight_decay', 0.01) +# beta_1 = params.get('beta_1', 0.9) +# beta_2 = params.get('beta_2', 0.999) +# epsilon = params.get('epsilon', 1e-07) +# optimizer = tfa.optimizers.AdamW( +# weight_decay=weight_decay, +# learning_rate=base_learning_rate, +# beta_1=beta_1, +# beta_2=beta_2, +# epsilon=epsilon) + else: + raise ValueError('Unknown optimizer %s' % optimizer_name) + +# if params.get('lookahead', None): +# logging.info('Using lookahead optimizer.') +# optimizer = tfa.optimizers.Lookahead(optimizer) + + # Moving average should be applied last, as it's applied at test time + moving_average_decay = params.get('moving_average_decay', 0.) + if moving_average_decay is not None and moving_average_decay > 0.: + if model is None: + raise ValueError( + '`model` must be provided if using `ExponentialMovingAverage`.') + logging.info('Including moving average decay.') + optimizer = optimization.ExponentialMovingAverage( + optimizer=optimizer, average_decay=moving_average_decay) + optimizer.shadow_copy(model) + return optimizer + + +def build_learning_rate(params: base_configs.LearningRateConfig, + batch_size: Optional[int] = None, + train_epochs: Optional[int] = None, + train_steps: Optional[int] = None): + """Build the learning rate given the provided configuration.""" + decay_type = params.name + base_lr = params.initial_lr + decay_rate = params.decay_rate + if params.decay_epochs is not None: + decay_steps = params.decay_epochs * train_steps + else: + decay_steps = 0 + if params.warmup_epochs is not None: + warmup_steps = params.warmup_epochs * train_steps + else: + warmup_steps = 0 + + lr_multiplier = params.scale_by_batch_size + + if lr_multiplier and lr_multiplier > 0: + # Scale the learning rate based on the batch size and a multiplier + base_lr *= lr_multiplier * batch_size + logging.info( + 'Scaling the learning rate based on the batch size ' + 'multiplier. 
New base_lr: %f', base_lr) + + if decay_type == 'exponential': + logging.info( + 'Using exponential learning rate with: ' + 'initial_learning_rate: %f, decay_steps: %d, ' + 'decay_rate: %f', base_lr, decay_steps, decay_rate) + lr = tf.keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=base_lr, + decay_steps=decay_steps, + decay_rate=decay_rate, + staircase=params.staircase) + elif decay_type == 'stepwise': + steps_per_epoch = params.examples_per_epoch // batch_size + boundaries = [boundary * steps_per_epoch for boundary in params.boundaries] + multipliers = [batch_size * multiplier for multiplier in params.multipliers] + logging.info( + 'Using stepwise learning rate. Parameters: ' + 'boundaries: %s, values: %s', boundaries, multipliers) + lr = tf.keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries=boundaries, values=multipliers) + elif decay_type == 'cosine_with_warmup': + lr = learning_rate.CosineDecayWithWarmup( + batch_size=batch_size, + total_steps=train_epochs * train_steps, + warmup_steps=warmup_steps) + if warmup_steps > 0: + if decay_type not in ['cosine_with_warmup']: + logging.info('Applying %d warmup steps to the learning rate', + warmup_steps) + lr = learning_rate.WarmupDecaySchedule( + lr, warmup_steps, warmup_lr=base_lr) + return lr diff --git a/cv/classification/resnet50/tensorflow2.0/optimizer_factory_test.py b/cv/classification/resnet50/tensorflow2.0/optimizer_factory_test.py new file mode 100644 index 000000000..a98d23d9b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/optimizer_factory_test.py @@ -0,0 +1,119 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for optimizer_factory.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +from absl.testing import parameterized + +import tensorflow as tf +from official.vision.image_classification import optimizer_factory +from official.vision.image_classification.configs import base_configs + + +class OptimizerFactoryTest(tf.test.TestCase, parameterized.TestCase): + + def build_toy_model(self) -> tf.keras.Model: + """Creates a toy `tf.Keras.Model`.""" + model = tf.keras.Sequential() + model.add(tf.keras.layers.Dense(1, input_shape=(1,))) + return model + + @parameterized.named_parameters( + ('sgd', 'sgd', 0., False), ('momentum', 'momentum', 0., False), + ('rmsprop', 'rmsprop', 0., False), ('adam', 'adam', 0., False), + ('adamw', 'adamw', 0., False), + ('momentum_lookahead', 'momentum', 0., True), + ('sgd_ema', 'sgd', 0.999, False), + ('momentum_ema', 'momentum', 0.999, False), + ('rmsprop_ema', 'rmsprop', 0.999, False)) + def test_optimizer(self, optimizer_name, moving_average_decay, lookahead): + """Smoke test to be sure no syntax errors.""" + model = self.build_toy_model() + params = { + 'learning_rate': 0.001, + 'rho': 0.09, + 'momentum': 0., + 'epsilon': 1e-07, + 'moving_average_decay': moving_average_decay, + 'lookahead': lookahead, + } + optimizer = optimizer_factory.build_optimizer( + optimizer_name=optimizer_name, + base_learning_rate=params['learning_rate'], + params=params, + model=model) + self.assertTrue(issubclass(type(optimizer), tf.keras.optimizers.Optimizer)) + + def test_unknown_optimizer(self): + with self.assertRaises(ValueError): + optimizer_factory.build_optimizer( + optimizer_name='this_optimizer_does_not_exist', + base_learning_rate=None, + params=None) + + def test_learning_rate_without_decay_or_warmups(self): + params = base_configs.LearningRateConfig( + name='exponential', + initial_lr=0.01, + decay_rate=0.01, + decay_epochs=None, + warmup_epochs=None, + scale_by_batch_size=0.01, + examples_per_epoch=1, + boundaries=[0], + multipliers=[0, 1]) + batch_size = 1 + train_steps = 1 + + lr = optimizer_factory.build_learning_rate( + params=params, batch_size=batch_size, train_steps=train_steps) + self.assertTrue( + issubclass( + type(lr), tf.keras.optimizers.schedules.LearningRateSchedule)) + + @parameterized.named_parameters(('exponential', 'exponential'), + ('cosine_with_warmup', 'cosine_with_warmup')) + def test_learning_rate_with_decay_and_warmup(self, lr_decay_type): + """Basic smoke test for syntax.""" + params = base_configs.LearningRateConfig( + name=lr_decay_type, + initial_lr=0.01, + decay_rate=0.01, + decay_epochs=1, + warmup_epochs=1, + scale_by_batch_size=0.01, + examples_per_epoch=1, + boundaries=[0], + multipliers=[0, 1]) + batch_size = 1 + train_epochs = 1 + train_steps = 1 + + lr = optimizer_factory.build_learning_rate( + params=params, + batch_size=batch_size, + train_epochs=train_epochs, + train_steps=train_steps) + self.assertTrue( + issubclass( + type(lr), tf.keras.optimizers.schedules.LearningRateSchedule)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/preprocessing.py b/cv/classification/resnet50/tensorflow2.0/preprocessing.py new file mode 100644 index 000000000..f8ca2428f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/preprocessing.py @@ -0,0 +1,391 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Preprocessing functions for images.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import tensorflow as tf +from typing import List, Optional, Text, Tuple + +import augment + + +# Calculated from the ImageNet training set +MEAN_RGB = (0.485 * 255, 0.456 * 255, 0.406 * 255) +STDDEV_RGB = (0.229 * 255, 0.224 * 255, 0.225 * 255) + +IMAGE_SIZE = 224 +CROP_PADDING = 32 + + +def mean_image_subtraction( + image_bytes: tf.Tensor, + means: Tuple[float, ...], + num_channels: int = 3, + dtype: tf.dtypes.DType = tf.float32, +) -> tf.Tensor: + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image_bytes = mean_image_subtraction(image_bytes, means) + + Note that the rank of `image` must be known. + + Args: + image_bytes: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + num_channels: number of color channels in the image that will be distorted. + dtype: the dtype to convert the images to. Set to `None` to skip conversion. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `means`. + """ + if image_bytes.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + # We have a 1-D tensor of means; convert to 3-D. + # Note(b/130245863): we explicitly call `broadcast` instead of simply + # expanding dimensions for better performance. + means = tf.broadcast_to(means, tf.shape(image_bytes)) + if dtype is not None: + means = tf.cast(means, dtype=dtype) + + return image_bytes - means + + +def standardize_image( + image_bytes: tf.Tensor, + stddev: Tuple[float, ...], + num_channels: int = 3, + dtype: tf.dtypes.DType = tf.float32, +) -> tf.Tensor: + """Divides the given stddev from each image channel. + + For example: + stddev = [123.68, 116.779, 103.939] + image_bytes = standardize_image(image_bytes, stddev) + + Note that the rank of `image` must be known. + + Args: + image_bytes: a tensor of size [height, width, C]. + stddev: a C-vector of values to divide from each channel. + num_channels: number of color channels in the image that will be distorted. + dtype: the dtype to convert the images to. Set to `None` to skip conversion. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `stddev`. 
+ """ + if image_bytes.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + + if len(stddev) != num_channels: + raise ValueError('len(stddev) must match the number of channels') + + # We have a 1-D tensor of stddev; convert to 3-D. + # Note(b/130245863): we explicitly call `broadcast` instead of simply + # expanding dimensions for better performance. + stddev = tf.broadcast_to(stddev, tf.shape(image_bytes)) + if dtype is not None: + stddev = tf.cast(stddev, dtype=dtype) + + return image_bytes / stddev + + +def normalize_images(features: tf.Tensor, + mean_rgb: Tuple[float, ...] = MEAN_RGB, + stddev_rgb: Tuple[float, ...] = STDDEV_RGB, + num_channels: int = 3, + dtype: tf.dtypes.DType = tf.float32, + data_format: Text = 'channels_last') -> tf.Tensor: + """Normalizes the input image channels with the given mean and stddev. + + Args: + features: `Tensor` representing decoded images in float format. + mean_rgb: the mean of the channels to subtract. + stddev_rgb: the stddev of the channels to divide. + num_channels: the number of channels in the input image tensor. + dtype: the dtype to convert the images to. Set to `None` to skip conversion. + data_format: the format of the input image tensor + ['channels_first', 'channels_last']. + + Returns: + A normalized image `Tensor`. + """ + # TODO(allencwang) - figure out how to use mean_image_subtraction and + # standardize_image on batches of images and replace the following. + if data_format == 'channels_first': + stats_shape = [num_channels, 1, 1] + else: + stats_shape = [1, 1, num_channels] + + if dtype is not None: + features = tf.image.convert_image_dtype(features, dtype=dtype) + + if mean_rgb is not None: + mean_rgb = tf.constant(mean_rgb, + shape=stats_shape, + dtype=features.dtype) + mean_rgb = tf.broadcast_to(mean_rgb, tf.shape(features)) + features = features - mean_rgb + + if stddev_rgb is not None: + stddev_rgb = tf.constant(stddev_rgb, + shape=stats_shape, + dtype=features.dtype) + stddev_rgb = tf.broadcast_to(stddev_rgb, tf.shape(features)) + features = features / stddev_rgb + + return features + + +def decode_and_center_crop(image_bytes: tf.Tensor, + image_size: int = IMAGE_SIZE, + crop_padding: int = CROP_PADDING) -> tf.Tensor: + """Crops to center of image with padding then scales image_size. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + image_size: image height/width dimension. + crop_padding: the padding size to use when centering the crop. + + Returns: + A decoded and cropped image `Tensor`. 
+ """ + decoded = image_bytes.dtype != tf.string + shape = (tf.shape(image_bytes) if decoded + else tf.image.extract_jpeg_shape(image_bytes)) + image_height = shape[0] + image_width = shape[1] + + padded_center_crop_size = tf.cast( + ((image_size / (image_size + crop_padding)) * + tf.cast(tf.minimum(image_height, image_width), tf.float32)), + tf.int32) + + offset_height = ((image_height - padded_center_crop_size) + 1) // 2 + offset_width = ((image_width - padded_center_crop_size) + 1) // 2 + crop_window = tf.stack([offset_height, offset_width, + padded_center_crop_size, padded_center_crop_size]) + if decoded: + image = tf.image.crop_to_bounding_box( + image_bytes, + offset_height=offset_height, + offset_width=offset_width, + target_height=padded_center_crop_size, + target_width=padded_center_crop_size) + else: + image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3) + + image = resize_image(image_bytes=image, + height=image_size, + width=image_size) + + return image + + +def decode_crop_and_flip(image_bytes: tf.Tensor) -> tf.Tensor: + """Crops an image to a random part of the image, then randomly flips. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + + Returns: + A decoded and cropped image `Tensor`. + + """ + decoded = image_bytes.dtype != tf.string + bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) + shape = (tf.shape(image_bytes) if decoded + else tf.image.extract_jpeg_shape(image_bytes)) + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + shape, + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.75, 1.33], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + + # Reassemble the bounding box in the format the crop op requires. + offset_height, offset_width, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_height, offset_width, + target_height, target_width]) + if decoded: + cropped = tf.image.crop_to_bounding_box( + image_bytes, + offset_height=offset_height, + offset_width=offset_width, + target_height=target_height, + target_width=target_width) + else: + cropped = tf.image.decode_and_crop_jpeg(image_bytes, + crop_window, + channels=3) + + # Flip to add a little more random distortion in. + cropped = tf.image.random_flip_left_right(cropped) + return cropped + + +def resize_image(image_bytes: tf.Tensor, + height: int = IMAGE_SIZE, + width: int = IMAGE_SIZE) -> tf.Tensor: + """Resizes an image to a given height and width. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + height: image height dimension. + width: image width dimension. + + Returns: + A tensor containing the resized image. + + """ + return tf.compat.v1.image.resize( + image_bytes, [height, width], method=tf.image.ResizeMethod.BILINEAR, + align_corners=False) + + +def preprocess_for_eval( + image_bytes: tf.Tensor, + image_size: int = IMAGE_SIZE, + num_channels: int = 3, + mean_subtract: bool = False, + standardize: bool = False, + dtype: tf.dtypes.DType = tf.float32 +) -> tf.Tensor: + """Preprocesses the given image for evaluation. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + image_size: image height/width dimension. + num_channels: number of image input channels. + mean_subtract: whether or not to apply mean subtraction. 
+ standardize: whether or not to apply standardization. + dtype: the dtype to convert the images to. Set to `None` to skip conversion. + + Returns: + A preprocessed and normalized image `Tensor`. + """ + images = decode_and_center_crop(image_bytes, image_size) + images = tf.reshape(images, [image_size, image_size, num_channels]) + + if mean_subtract: + images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB) + if standardize: + images = standardize_image(image_bytes=images, stddev=STDDEV_RGB) + if dtype is not None: + images = tf.image.convert_image_dtype(images, dtype=dtype) + + return images + + +def load_eval_image(filename: Text, image_size: int = IMAGE_SIZE) -> tf.Tensor: + """Reads an image from the filesystem and applies image preprocessing. + + Args: + filename: a filename path of an image. + image_size: image height/width dimension. + + Returns: + A preprocessed and normalized image `Tensor`. + """ + image_bytes = tf.io.read_file(filename) + image = preprocess_for_eval(image_bytes, image_size) + + return image + + +def build_eval_dataset(filenames: List[Text], + labels: Optional[List[int]] = None, + image_size: int = IMAGE_SIZE, + batch_size: int = 1) -> tf.Tensor: + """Builds a tf.data.Dataset from a list of filenames and labels. + + Args: + filenames: a list of filename paths of images. + labels: a list of labels corresponding to each image. + image_size: image height/width dimension. + batch_size: the batch size used by the dataset + + Returns: + A preprocessed and normalized image `Tensor`. + """ + if labels is None: + labels = [0] * len(filenames) + + filenames = tf.constant(filenames) + labels = tf.constant(labels) + dataset = tf.data.Dataset.from_tensor_slices((filenames, labels)) + + dataset = dataset.map( + lambda filename, label: (load_eval_image(filename, image_size), label)) + dataset = dataset.batch(batch_size) + + return dataset + + +def preprocess_for_train(image_bytes: tf.Tensor, + image_size: int = IMAGE_SIZE, + augmenter: Optional[augment.ImageAugment] = None, + mean_subtract: bool = False, + standardize: bool = False, + dtype: tf.dtypes.DType = tf.float32) -> tf.Tensor: + """Preprocesses the given image for training. + + Args: + image_bytes: `Tensor` representing an image binary of + arbitrary size of dtype tf.uint8. + image_size: image height/width dimension. + augmenter: the image augmenter to apply. + mean_subtract: whether or not to apply mean subtraction. + standardize: whether or not to apply standardization. + dtype: the dtype to convert the images to. Set to `None` to skip conversion. + + Returns: + A preprocessed and normalized image `Tensor`. + """ + images = decode_crop_and_flip(image_bytes=image_bytes) + images = resize_image(images, height=image_size, width=image_size) + if augmenter is not None: + images = augmenter.distort(images) + if mean_subtract: + images = mean_image_subtraction(image_bytes=images, means=MEAN_RGB) + if standardize: + images = standardize_image(image_bytes=images, stddev=STDDEV_RGB) + if dtype is not None: + images = tf.image.convert_image_dtype(images, dtype) + + return images diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/README.md b/cv/classification/resnet50/tensorflow2.0/resnet/README.md new file mode 100644 index 000000000..5064523fb --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/README.md @@ -0,0 +1,125 @@ +This folder contains a +[custom training loop (CTL)](#resnet-custom-training-loop) implementation for +ResNet50. 
+ +## Before you begin +Please refer to the [README](../README.md) in the parent directory for +information on setup and preparing the data. + +## ResNet (custom training loop) + +Similar to the [estimator implementation](../../../r1/resnet), the Keras +implementation has code for the ImageNet dataset. The ImageNet +version uses a ResNet50 model implemented in +[`resnet_model.py`](./resnet_model.py). + + +### Pretrained Models + +* [ResNet50 Checkpoints](https://storage.googleapis.com/cloud-tpu-checkpoints/resnet/resnet50.tar.gz) + +* ResNet50 TFHub: [feature vector](https://tfhub.dev/tensorflow/resnet_50/feature_vector/1) +and [classification](https://tfhub.dev/tensorflow/resnet_50/classification/1) + +Again, if you did not download the data to the default directory, specify the +location with the `--data_dir` flag: + +```bash +python3 resnet_ctl_imagenet_main.py --data_dir=/path/to/imagenet +``` + +There are more flag options you can specify. Here are some examples: + +- `--use_synthetic_data`: when set to true, synthetic data, rather than real +data, are used; +- `--batch_size`: the batch size used for the model; +- `--model_dir`: the directory to save the model checkpoint; +- `--train_epochs`: number of epochs to run for training the model; +- `--train_steps`: number of steps to run for training the model. We now only +support a number that is smaller than the number of batches in an epoch. +- `--skip_eval`: when set to true, evaluation as well as validation during +training is skipped. + +For example, this is a typical command line to run with ImageNet data with +batch size 128 per GPU: + +```bash +python3 resnet_ctl_imagenet_main.py \ + --model_dir=/tmp/model_dir/something \ + --num_gpus=2 \ + --batch_size=128 \ + --train_epochs=90 \ + --train_steps=10 \ + --use_synthetic_data=false +``` + +See [`common.py`](common.py) for a full list of options. + +### Using multiple GPUs + +You can train these models on multiple GPUs using the `tf.distribute.Strategy` +API. You can read more about it in this +[guide](https://www.tensorflow.org/guide/distribute_strategy). + +In this example, we have made it easier to use with just a command line flag +`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA, +and 0 otherwise. + +- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device. +- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device. +- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous +distributed training across the GPUs. + +If you wish to run without `tf.distribute.Strategy`, you can do so by setting +`--distribution_strategy=off`. + +### Running on multiple GPU hosts + +You can also train these models on multiple hosts, each with GPUs, using +`tf.distribute.Strategy`. + +The easiest way to run multi-host benchmarks is to set the +[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG) +appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on +2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and +host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker", +"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the +available GPUs at each host. + +### Running on Cloud TPUs + +Note: This model will **not** work with TPUs on Colab. + +You can train the ResNet CTL model on Cloud TPUs using +`tf.distribute.TPUStrategy`.
If you are not familiar with Cloud TPUs, it is +strongly recommended that you go through the +[quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to +create a TPU and GCE VM. + +To run ResNet model on a TPU, you must set `--distribution_strategy=tpu` and +`--tpu=$TPU_NAME`, where `$TPU_NAME` the name of your TPU in the Cloud Console. +From a GCE VM, you can run the following command to train ResNet for one epoch +on a v2-8 or v3-8 TPU by setting `TRAIN_EPOCHS` to 1: + +```bash +python3 resnet_ctl_imagenet_main.py \ + --tpu=$TPU_NAME \ + --model_dir=$MODEL_DIR \ + --data_dir=$DATA_DIR \ + --batch_size=1024 \ + --steps_per_loop=500 \ + --train_epochs=$TRAIN_EPOCHS \ + --use_synthetic_data=false \ + --dtype=fp32 \ + --enable_eager=true \ + --enable_tensorboard=true \ + --distribution_strategy=tpu \ + --log_steps=50 \ + --single_l2_loss_op=true \ + --use_tf_function=true +``` + +To train the ResNet to convergence, run it for 90 epochs by setting +`TRAIN_EPOCHS` to 90. + +Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths. diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/__init__.py b/cv/classification/resnet50/tensorflow2.0/resnet/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/common.py b/cv/classification/resnet50/tensorflow2.0/resnet/common.py new file mode 100644 index 000000000..dd0b88d81 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/common.py @@ -0,0 +1,418 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common util functions and classes used by both keras cifar and imagenet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from absl import flags +import tensorflow as tf + +import tensorflow_model_optimization as tfmot +from utils.flags import core as flags_core +from utils.misc import keras_utils + +FLAGS = flags.FLAGS +BASE_LEARNING_RATE = 0.1 # This matches Jing's version. 
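+# A rough illustration of the scaling rule implemented by
+# PiecewiseConstantDecayWithWarmup below: the base LR is scaled linearly with
+# the global batch size, e.g. batch_size=1024 gives 0.1 * 1024 / 256 = 0.4
+# before the per-boundary multipliers are applied.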
+TRAIN_TOP_1 = 'training_accuracy_top_1' +LR_SCHEDULE = [ # (multiplier, epoch to start) tuples + (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80) +] + + +class PiecewiseConstantDecayWithWarmup( + tf.keras.optimizers.schedules.LearningRateSchedule): + """Piecewise constant decay with warmup schedule.""" + + def __init__(self, + batch_size, + epoch_size, + warmup_epochs, + boundaries, + multipliers, + compute_lr_on_cpu=True, + name=None): + super(PiecewiseConstantDecayWithWarmup, self).__init__() + if len(boundaries) != len(multipliers) - 1: + raise ValueError('The length of boundaries must be 1 less than the ' + 'length of multipliers') + + base_lr_batch_size = 256 + steps_per_epoch = epoch_size // batch_size + + self.rescaled_lr = BASE_LEARNING_RATE * batch_size / base_lr_batch_size + self.step_boundaries = [float(steps_per_epoch) * x for x in boundaries] + self.lr_values = [self.rescaled_lr * m for m in multipliers] + self.warmup_steps = warmup_epochs * steps_per_epoch + self.compute_lr_on_cpu = compute_lr_on_cpu + self.name = name + + self.learning_rate_ops_cache = {} + + def __call__(self, step): + if tf.executing_eagerly(): + return self._get_learning_rate(step) + + # In an eager function or graph, the current implementation of optimizer + # repeatedly call and thus create ops for the learning rate schedule. To + # avoid this, we cache the ops if not executing eagerly. + graph = tf.compat.v1.get_default_graph() + if graph not in self.learning_rate_ops_cache: + if self.compute_lr_on_cpu: + with tf.device('/device:CPU:0'): + self.learning_rate_ops_cache[graph] = self._get_learning_rate(step) + else: + self.learning_rate_ops_cache[graph] = self._get_learning_rate(step) + return self.learning_rate_ops_cache[graph] + + def _get_learning_rate(self, step): + """Compute learning rate at given step.""" + with tf.name_scope('PiecewiseConstantDecayWithWarmup'): + + def warmup_lr(step): + return self.rescaled_lr * ( + tf.cast(step, tf.float32) / tf.cast(self.warmup_steps, tf.float32)) + + def piecewise_lr(step): + return tf.compat.v1.train.piecewise_constant(step, self.step_boundaries, + self.lr_values) + + return tf.cond(step < self.warmup_steps, lambda: warmup_lr(step), + lambda: piecewise_lr(step)) + + def get_config(self): + return { + 'rescaled_lr': self.rescaled_lr, + 'step_boundaries': self.step_boundaries, + 'lr_values': self.lr_values, + 'warmup_steps': self.warmup_steps, + 'compute_lr_on_cpu': self.compute_lr_on_cpu, + 'name': self.name + } + + +def get_optimizer(learning_rate=0.1): + """Returns optimizer to use.""" + # The learning_rate is overwritten at the beginning of each step by callback. 
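+  # Note: with the custom training loop (resnet_runnable.py), a
+  # LearningRateSchedule such as PiecewiseConstantDecayWithWarmup is passed
+  # directly as `learning_rate`, so that path does not rely on a callback.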
+ return tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9) + + +def get_callbacks(pruning_method=None, + enable_checkpoint_and_export=False, + model_dir=None): + """Returns common callbacks.""" + time_callback = keras_utils.TimeHistory( + FLAGS.batch_size, + FLAGS.log_steps, + logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None) + callbacks = [time_callback] + + if FLAGS.enable_tensorboard: + tensorboard_callback = tf.keras.callbacks.TensorBoard( + log_dir=FLAGS.model_dir, profile_batch=FLAGS.profile_steps) + callbacks.append(tensorboard_callback) + + is_pruning_enabled = pruning_method is not None + if is_pruning_enabled: + callbacks.append(tfmot.sparsity.keras.UpdatePruningStep()) + if model_dir is not None: + callbacks.append( + tfmot.sparsity.keras.PruningSummaries( + log_dir=model_dir, profile_batch=0)) + + if enable_checkpoint_and_export: + if model_dir is not None: + ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}') + callbacks.append( + tf.keras.callbacks.ModelCheckpoint( + ckpt_full_path, save_weights_only=True)) + return callbacks + + +def build_stats(history, eval_output, callbacks): + """Normalizes and returns dictionary of stats. + + Args: + history: Results of the training step. Supports both categorical_accuracy + and sparse_categorical_accuracy. + eval_output: Output of the eval step. Assumes first value is eval_loss and + second value is accuracy_top_1. + callbacks: a list of callbacks which might include a time history callback + used during keras.fit. + + Returns: + Dictionary of normalized results. + """ + stats = {} + if eval_output: + stats['accuracy_top_1'] = float(eval_output[1]) + stats['eval_loss'] = float(eval_output[0]) + if history and history.history: + train_hist = history.history + # Gets final loss from training. + stats['loss'] = float(train_hist['loss'][-1]) + # Gets top_1 training accuracy. 
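+    # Which key is present depends on the accuracy metric the model was
+    # compiled with: 'categorical_accuracy' for one-hot labels,
+    # 'sparse_categorical_accuracy' for integer labels, or plain 'accuracy'.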
+ if 'categorical_accuracy' in train_hist: + stats[TRAIN_TOP_1] = float(train_hist['categorical_accuracy'][-1]) + elif 'sparse_categorical_accuracy' in train_hist: + stats[TRAIN_TOP_1] = float(train_hist['sparse_categorical_accuracy'][-1]) + elif 'accuracy' in train_hist: + stats[TRAIN_TOP_1] = float(train_hist['accuracy'][-1]) + + if not callbacks: + return stats + + # Look for the time history callback which was used during keras.fit + for callback in callbacks: + if isinstance(callback, keras_utils.TimeHistory): + timestamp_log = callback.timestamp_log + stats['step_timestamp_log'] = timestamp_log + stats['train_finish_time'] = callback.train_finish_time + if callback.epoch_runtime_log: + stats['avg_exp_per_second'] = callback.average_examples_per_second + + return stats + + +def define_keras_flags(model=False, + optimizer=False, + pretrained_filepath=False): + """Define flags for Keras models.""" + flags_core.define_base( + clean=True, + num_gpu=True, + run_eagerly=True, + train_epochs=True, + epochs_between_evals=True, + distribution_strategy=True) + flags_core.define_performance( + num_parallel_calls=False, + synthetic_data=True, + dtype=True, + all_reduce_alg=True, + num_packs=True, + tf_gpu_thread_mode=True, + datasets_num_private_threads=True, + loss_scale=True, + fp16_implementation=True, + tf_data_experimental_slack=True, + enable_xla=True, + training_dataset_cache=True) + flags_core.define_image() + flags_core.define_benchmark() + flags_core.define_distribution() + flags.adopt_module_key_flags(flags_core) + + flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?') + flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?') + # TODO(b/135607288): Remove this flag once we understand the root cause of + # slowdown when setting the learning phase in Keras backend. + flags.DEFINE_boolean( + name='set_learning_phase_to_train', + default=True, + help='If skip eval, also set Keras learning phase to 1 (training).') + flags.DEFINE_boolean( + name='explicit_gpu_placement', + default=False, + help='If not using distribution strategy, explicitly set device scope ' + 'for the Keras training loop.') + flags.DEFINE_boolean( + name='use_trivial_model', + default=False, + help='Whether to use a trivial Keras model.') + flags.DEFINE_boolean( + name='report_accuracy_metrics', + default=True, + help='Report metrics during training and evaluation.') + flags.DEFINE_boolean( + name='use_tensor_lr', + default=True, + help='Use learning rate tensor instead of a callback.') + flags.DEFINE_boolean( + name='enable_tensorboard', + default=False, + help='Whether to enable Tensorboard callback.') + flags.DEFINE_string( + name='profile_steps', + default=None, + help='Save profiling data to model dir at given range of global steps. The ' + 'value must be a comma separated pair of positive integers, specifying ' + 'the first and last step to profile. For example, "--profile_steps=2,4" ' + 'triggers the profiler to process 3 steps, starting from the 2nd step. ' + 'Note that profiler has a non-trivial performance overhead, and the ' + 'output file can be gigantic if profiling many steps.') + flags.DEFINE_integer( + name='train_steps', + default=None, + help='The number of steps to run for training. If it is larger than ' + '# batches per epoch, then use # batches per epoch. This flag will be ' + 'ignored if train_epochs is set to be larger than 1. 
') + flags.DEFINE_boolean( + name='batchnorm_spatial_persistent', + default=True, + help='Enable the spacial persistent mode for CuDNN batch norm kernel.') + flags.DEFINE_boolean( + name='enable_get_next_as_optional', + default=False, + help='Enable get_next_as_optional behavior in DistributedIterator.') + flags.DEFINE_boolean( + name='enable_checkpoint_and_export', + default=False, + help='Whether to enable a checkpoint callback and export the savedmodel.') + flags.DEFINE_string(name='tpu', default='', help='TPU address to connect to.') + flags.DEFINE_integer( + name='steps_per_loop', + default=None, + help='Number of steps per training loop. Only training step happens ' + 'inside the loop. Callbacks will not be called inside. Will be capped at ' + 'steps per epoch.') + flags.DEFINE_boolean( + name='use_tf_while_loop', + default=True, + help='Whether to build a tf.while_loop inside the training loop on the ' + 'host. Setting it to True is critical to have peak performance on ' + 'TPU.') + + if model: + flags.DEFINE_string('model', 'resnet50_v1.5', + 'Name of model preset. (mobilenet, resnet50_v1.5)') + if optimizer: + flags.DEFINE_string( + 'optimizer', 'resnet50_default', 'Name of optimizer preset. ' + '(mobilenet_default, resnet50_default)') + # TODO(kimjaehong): Replace as general hyper-params not only for mobilenet. + flags.DEFINE_float( + 'initial_learning_rate_per_sample', 0.00007, + 'Initial value of learning rate per sample for ' + 'mobilenet_default.') + flags.DEFINE_float('lr_decay_factor', 0.94, + 'Learning rate decay factor for mobilenet_default.') + flags.DEFINE_float('num_epochs_per_decay', 2.5, + 'Number of epochs per decay for mobilenet_default.') + if pretrained_filepath: + flags.DEFINE_string('pretrained_filepath', '', 'Pretrained file path.') + + +def get_synth_data(height, width, num_channels, num_classes, dtype): + """Creates a set of synthetic random data. + + Args: + height: Integer height that will be used to create a fake image tensor. + width: Integer width that will be used to create a fake image tensor. + num_channels: Integer depth that will be used to create a fake image tensor. + num_classes: Number of classes that should be represented in the fake labels + tensor + dtype: Data type for features/images. + + Returns: + A tuple of tensors representing the inputs and labels. + + """ + # Synthetic input should be within [0, 255]. + inputs = tf.random.truncated_normal([height, width, num_channels], + dtype=dtype, + mean=127, + stddev=60, + name='synthetic_inputs') + labels = tf.random.uniform([1], + minval=0, + maxval=num_classes - 1, + dtype=tf.int32, + name='synthetic_labels') + return inputs, labels + + +def define_pruning_flags(): + """Define flags for pruning methods.""" + flags.DEFINE_string( + 'pruning_method', None, 'Pruning method.' 
+ 'None (no pruning) or polynomial_decay.') + flags.DEFINE_float('pruning_initial_sparsity', 0.0, + 'Initial sparsity for pruning.') + flags.DEFINE_float('pruning_final_sparsity', 0.5, + 'Final sparsity for pruning.') + flags.DEFINE_integer('pruning_begin_step', 0, 'Begin step for pruning.') + flags.DEFINE_integer('pruning_end_step', 100000, 'End step for pruning.') + flags.DEFINE_integer('pruning_frequency', 100, 'Frequency for pruning.') + + +def define_clustering_flags(): + """Define flags for clustering methods.""" + flags.DEFINE_string('clustering_method', None, + 'None (no clustering) or selective_clustering ' + '(cluster last three Conv2D layers of the model).') + + +def get_synth_input_fn(height, + width, + num_channels, + num_classes, + dtype=tf.float32, + drop_remainder=True): + """Returns an input function that returns a dataset with random data. + + This input_fn returns a data set that iterates over a set of random data and + bypasses all preprocessing, e.g. jpeg decode and copy. The host to device + copy is still included. This used to find the upper throughput bound when + tuning the full input pipeline. + + Args: + height: Integer height that will be used to create a fake image tensor. + width: Integer width that will be used to create a fake image tensor. + num_channels: Integer depth that will be used to create a fake image tensor. + num_classes: Number of classes that should be represented in the fake labels + tensor + dtype: Data type for features/images. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + + Returns: + An input_fn that can be used in place of a real one to return a dataset + that can be used for iteration. + """ + + # pylint: disable=unused-argument + def input_fn(is_training, data_dir, batch_size, *args, **kwargs): + """Returns dataset filled with random data.""" + inputs, labels = get_synth_data( + height=height, + width=width, + num_channels=num_channels, + num_classes=num_classes, + dtype=dtype) + # Cast to float32 for Keras model. + labels = tf.cast(labels, dtype=tf.float32) + data = tf.data.Dataset.from_tensors((inputs, labels)).repeat() + + # `drop_remainder` will make dataset produce outputs with known shapes. + data = data.batch(batch_size, drop_remainder=drop_remainder) + data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + return data + + return input_fn + + +def set_cudnn_batchnorm_mode(): + """Set CuDNN batchnorm mode for better performance. + + Note: Spatial Persistent mode may lead to accuracy losses for certain + models. + """ + if FLAGS.batchnorm_spatial_persistent: + os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' + else: + os.environ.pop('TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT', None) diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/imagenet_preprocessing.py b/cv/classification/resnet50/tensorflow2.0/resnet/imagenet_preprocessing.py new file mode 100644 index 000000000..f27e3ca45 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/imagenet_preprocessing.py @@ -0,0 +1,574 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides utilities to preprocess images. + +Training images are sampled using the provided bounding boxes, and subsequently +cropped to the sampled bounding box. Images are additionally flipped randomly, +then resized to the target output size (without aspect-ratio preservation). + +Images used during evaluation are resized (with aspect-ratio preservation) and +centrally cropped. + +All images undergo mean color subtraction. + +Note that these steps are colloquially referred to as "ResNet preprocessing," +and they differ from "VGG preprocessing," which does not use bounding boxes +and instead does an aspect-preserving resize followed by random crop during +training. (These both differ from "Inception preprocessing," which introduces +color distortion steps.) + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from absl import logging +import tensorflow as tf + +DEFAULT_IMAGE_SIZE = 224 +NUM_CHANNELS = 3 +NUM_CLASSES = 10 #1001 + +NUM_IMAGES = { + 'train': 9469, #1281167, + 'validation': 3925, #50000, +} + +_NUM_TRAIN_FILES = 1024 +_SHUFFLE_BUFFER = 10000 + +_R_MEAN = 123.68 +_G_MEAN = 116.78 +_B_MEAN = 103.94 +CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN] + +# The lower bound for the smallest side of the image for aspect-preserving +# resizing. For example, if an image is 500 x 1000, it will be resized to +# _RESIZE_MIN x (_RESIZE_MIN * 2). +_RESIZE_MIN = 256 + + +def process_record_dataset(dataset, + is_training, + batch_size, + shuffle_buffer, + parse_record_fn, + dtype=tf.float32, + datasets_num_private_threads=None, + drop_remainder=False, + tf_data_experimental_slack=False): + """Given a Dataset with raw records, return an iterator over the records. + + Args: + dataset: A Dataset representing raw records + is_training: A boolean denoting whether the input is for training. + batch_size: The number of samples per batch. + shuffle_buffer: The buffer size to use when shuffling records. A larger + value results in better randomness, but smaller values reduce startup time + and use less memory. + parse_record_fn: A function that takes a raw record and returns the + corresponding (image, label) pair. + dtype: Data type to use for images/features. + datasets_num_private_threads: Number of threads for a private threadpool + created for all datasets computation. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack` + option. + + Returns: + Dataset of (image, label) pairs ready for iteration. + """ + # Defines a specific size thread pool for tf.data operations. + if datasets_num_private_threads: + options = tf.data.Options() + options.experimental_threading.private_threadpool_size = ( + datasets_num_private_threads) + dataset = dataset.with_options(options) + logging.info('datasets_num_private_threads: %s', + datasets_num_private_threads) + + if is_training: + # Shuffles records before repeating to respect epoch boundaries. 
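+    # Shuffling before repeat() ensures every record is seen once per epoch
+    # before any record is repeated; reversing the order would blur epoch
+    # boundaries.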
+ dataset = dataset.shuffle(buffer_size=shuffle_buffer) + # Repeats the dataset for the number of epochs to train. + dataset = dataset.repeat() + + # Parses the raw records into images and labels. + dataset = dataset.map( + lambda value: parse_record_fn(value, is_training, dtype), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + + # Operations between the final prefetch and the get_next call to the iterator + # will happen synchronously during run time. We prefetch here again to + # background all of the above processing work and keep it out of the + # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE + # allows DistributionStrategies to adjust how many batches to fetch based + # on how many devices are present. + dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + options = tf.data.Options() + options.experimental_slack = tf_data_experimental_slack + dataset = dataset.with_options(options) + + return dataset + + +def get_filenames(is_training, data_dir): + """Return filenames for dataset.""" + if is_training: + return [ + os.path.join(data_dir, 'train-%05d-of-01024' % i) + for i in range(_NUM_TRAIN_FILES) + ] + else: + return [ + os.path.join(data_dir, 'validation-%05d-of-00128' % i) + for i in range(128) + ] + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + + The output of the build_image_data.py image preprocessing script is a dataset + containing serialized Example protocol buffers. Each Example proto contains + the following fields (values are included as examples): + + image/height: 462 + image/width: 581 + image/colorspace: 'RGB' + image/channels: 3 + image/class/label: 615 + image/class/synset: 'n03623198' + image/class/text: 'knee pad' + image/object/bbox/xmin: 0.1 + image/object/bbox/xmax: 0.9 + image/object/bbox/ymin: 0.2 + image/object/bbox/ymax: 0.6 + image/object/bbox/label: 615 + image/format: 'JPEG' + image/filename: 'ILSVRC2012_val_00041207.JPEG' + image/encoded: + + Args: + example_serialized: scalar Tensor tf.string containing a serialized Example + protocol buffer. + + Returns: + image_buffer: Tensor tf.string containing the contents of a JPEG file. + label: Tensor tf.int32 containing the label. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as + [ymin, xmin, ymax, xmax]. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': + tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), + 'image/class/label': + tf.io.FixedLenFeature([], dtype=tf.int64, default_value=-1), + 'image/class/text': + tf.io.FixedLenFeature([], dtype=tf.string, default_value=''), + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. 
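+  # Each image may have zero or more bounding boxes, so the coordinates are
+  # declared as VarLenFeature rather than FixedLenFeature.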
+ feature_map.update({ + k: sparse_float32 for k in [ + 'image/object/bbox/xmin', 'image/object/bbox/ymin', + 'image/object/bbox/xmax', 'image/object/bbox/ymax' + ] + }) + + features = tf.io.parse_single_example( + serialized=example_serialized, features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + + xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) + ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) + xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) + ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) + + # Note that we impose an ordering of (y, x) just to make life difficult. + bbox = tf.concat([ymin, xmin, ymax, xmax], 0) + + # Force the variable number of bounding boxes into the shape + # [1, num_boxes, coords]. + bbox = tf.expand_dims(bbox, 0) + bbox = tf.transpose(a=bbox, perm=[0, 2, 1]) + + return features['image/encoded'], label, bbox + + +def parse_record(raw_record, is_training, dtype): + """Parses a record containing a training example of an image. + + The input record is parsed into a label and image, and the image is passed + through preprocessing steps (cropping, flipping, and so on). + + Args: + raw_record: scalar Tensor tf.string containing a serialized Example protocol + buffer. + is_training: A boolean denoting whether the input is for training. + dtype: data type to use for images/features. + + Returns: + Tuple with processed image tensor in a channel-last format and + one-hot-encoded label tensor. + """ + image_buffer, label, bbox = parse_example_proto(raw_record) + + image = preprocess_image( + image_buffer=image_buffer, + bbox=bbox, + output_height=DEFAULT_IMAGE_SIZE, + output_width=DEFAULT_IMAGE_SIZE, + num_channels=NUM_CHANNELS, + is_training=is_training) + image = tf.cast(image, dtype) + + # Subtract one so that labels are in [0, 1000), and cast to float32 for + # Keras model. + label = tf.cast( + tf.cast(tf.reshape(label, shape=[1]), dtype=tf.int32) - 1, + dtype=tf.float32) + return image, label + + +def get_parse_record_fn(use_keras_image_data_format=False): + """Get a function for parsing the records, accounting for image format. + + This is useful by handling different types of Keras models. For instance, + the current resnet_model.resnet50 input format is always channel-last, + whereas the keras_applications mobilenet input format depends on + tf.keras.backend.image_data_format(). We should set + use_keras_image_data_format=False for the former and True for the latter. + + Args: + use_keras_image_data_format: A boolean denoting whether data format is keras + backend image data format. If False, the image format is channel-last. If + True, the image format matches tf.keras.backend.image_data_format(). + + Returns: + Function to use for parsing the records. + """ + + def parse_record_fn(raw_record, is_training, dtype): + image, label = parse_record(raw_record, is_training, dtype) + if use_keras_image_data_format: + if tf.keras.backend.image_data_format() == 'channels_first': + image = tf.transpose(image, perm=[2, 0, 1]) + return image, label + + return parse_record_fn + + +def input_fn(is_training, + data_dir, + batch_size, + dtype=tf.float32, + datasets_num_private_threads=None, + parse_record_fn=parse_record, + input_context=None, + drop_remainder=False, + tf_data_experimental_slack=False, + training_dataset_cache=False, + filenames=None): + """Input function which provides batches for train or eval. 
+ + Args: + is_training: A boolean denoting whether the input is for training. + data_dir: The directory containing the input data. + batch_size: The number of samples per batch. + dtype: Data type to use for images/features + datasets_num_private_threads: Number of private threads for tf.data. + parse_record_fn: Function to use for parsing the records. + input_context: A `tf.distribute.InputContext` object passed in by + `tf.distribute.Strategy`. + drop_remainder: A boolean indicates whether to drop the remainder of the + batches. If True, the batch dimension will be static. + tf_data_experimental_slack: Whether to enable tf.data's `experimental_slack` + option. + training_dataset_cache: Whether to cache the training dataset on workers. + Typically used to improve training performance when training data is in + remote storage and can fit into worker memory. + filenames: Optional field for providing the file names of the TFRecords. + + Returns: + A dataset that can be used for iteration. + """ + if filenames is None: + filenames = get_filenames(is_training, data_dir) + dataset = tf.data.Dataset.from_tensor_slices(filenames) + + if input_context: + logging.info( + 'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d', + input_context.input_pipeline_id, input_context.num_input_pipelines) + dataset = dataset.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + + if is_training: + # Shuffle the input files + dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES) + + # Convert to individual records. + # cycle_length = 10 means that up to 10 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. + dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=10, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training and training_dataset_cache: + # Improve training performance when training data is in remote storage and + # can fit into worker memory. + dataset = dataset.cache() + + return process_record_dataset( + dataset=dataset, + is_training=is_training, + batch_size=batch_size, + shuffle_buffer=_SHUFFLE_BUFFER, + parse_record_fn=parse_record_fn, + dtype=dtype, + datasets_num_private_threads=datasets_num_private_threads, + drop_remainder=drop_remainder, + tf_data_experimental_slack=tf_data_experimental_slack, + ) + + +def _decode_crop_and_flip(image_buffer, bbox, num_channels): + """Crops the given image to a random part of the image, and randomly flips. + + We use the fused decode_and_crop op, which performs better than the two ops + used separately in series, but note that this requires that the image be + passed in as an un-decoded string Tensor. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as [ymin, + xmin, ymax, xmax]. + num_channels: Integer depth of the image buffer for decoding. + + Returns: + 3-D tensor with cropped image. + + """ + # A large fraction of image datasets contain a human-annotated bounding box + # delineating the region of the image containing the object of interest. We + # choose to create a new bounding box for the object which is a randomly + # distorted version of the human-annotated bounding box that obeys an + # allowed range of aspect ratios, sizes and overlap with the human-annotated + # bounding box. 
If no box is supplied, then we assume the bounding box is + # the entire image. + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + tf.image.extract_jpeg_shape(image_buffer), + bounding_boxes=bbox, + min_object_covered=0.1, + aspect_ratio_range=[0.75, 1.33], + area_range=[0.05, 1.0], + max_attempts=100, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + + # Reassemble the bounding box in the format the crop op requires. + offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + + # Use the fused decode and crop op here, which is faster than each in series. + cropped = tf.image.decode_and_crop_jpeg( + image_buffer, crop_window, channels=num_channels) + + # Flip to add a little more random distortion in. + cropped = tf.image.random_flip_left_right(cropped) + return cropped + + +def _central_crop(image, crop_height, crop_width): + """Performs central crops of the given image list. + + Args: + image: a 3-D image tensor + crop_height: the height of the image following the crop. + crop_width: the width of the image following the crop. + + Returns: + 3-D tensor with cropped image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + amount_to_be_cropped_h = (height - crop_height) + crop_top = amount_to_be_cropped_h // 2 + amount_to_be_cropped_w = (width - crop_width) + crop_left = amount_to_be_cropped_w // 2 + return tf.slice(image, [crop_top, crop_left, 0], + [crop_height, crop_width, -1]) + + +def _mean_image_subtraction(image, means, num_channels): + """Subtracts the given means from each image channel. + + For example: + means = [123.68, 116.779, 103.939] + image = _mean_image_subtraction(image, means) + + Note that the rank of `image` must be known. + + Args: + image: a tensor of size [height, width, C]. + means: a C-vector of values to subtract from each channel. + num_channels: number of color channels in the image that will be distorted. + + Returns: + the centered image. + + Raises: + ValueError: If the rank of `image` is unknown, if `image` has a rank other + than three or if the number of channels in `image` doesn't match the + number of values in `means`. + """ + if image.get_shape().ndims != 3: + raise ValueError('Input must be of size [height, width, C>0]') + + if len(means) != num_channels: + raise ValueError('len(means) must match the number of channels') + + # We have a 1-D tensor of means; convert to 3-D. + # Note(b/130245863): we explicitly call `broadcast` instead of simply + # expanding dimensions for better performance. + means = tf.broadcast_to(means, tf.shape(image)) + + return image - means + + +def _smallest_size_at_least(height, width, resize_min): + """Computes new shape with the smallest side equal to `smallest_side`. + + Computes new shape with the smallest side equal to `smallest_side` while + preserving the original aspect ratio. + + Args: + height: an int32 scalar tensor indicating the current height. + width: an int32 scalar tensor indicating the current width. + resize_min: A python integer or scalar `Tensor` indicating the size of the + smallest side after resize. + + Returns: + new_height: an int32 scalar tensor indicating the new height. + new_width: an int32 scalar tensor indicating the new width. + """ + resize_min = tf.cast(resize_min, tf.float32) + + # Convert to floats to make subsequent calculations go smoothly. 
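+  # Example: a 500 x 1000 image with resize_min=256 gives
+  # scale_ratio = 256 / 500, so the new size is 256 x 512 and the aspect
+  # ratio is preserved.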
+ height, width = tf.cast(height, tf.float32), tf.cast(width, tf.float32) + + smaller_dim = tf.minimum(height, width) + scale_ratio = resize_min / smaller_dim + + # Convert back to ints to make heights and widths that TF ops will accept. + new_height = tf.cast(height * scale_ratio, tf.int32) + new_width = tf.cast(width * scale_ratio, tf.int32) + + return new_height, new_width + + +def _aspect_preserving_resize(image, resize_min): + """Resize images preserving the original aspect ratio. + + Args: + image: A 3-D image `Tensor`. + resize_min: A python integer or scalar `Tensor` indicating the size of the + smallest side after resize. + + Returns: + resized_image: A 3-D tensor containing the resized image. + """ + shape = tf.shape(input=image) + height, width = shape[0], shape[1] + + new_height, new_width = _smallest_size_at_least(height, width, resize_min) + + return _resize_image(image, new_height, new_width) + + +def _resize_image(image, height, width): + """Simple wrapper around tf.resize_images. + + This is primarily to make sure we use the same `ResizeMethod` and other + details each time. + + Args: + image: A 3-D image `Tensor`. + height: The target height for the resized image. + width: The target width for the resized image. + + Returns: + resized_image: A 3-D tensor containing the resized image. The first two + dimensions have the shape [height, width]. + """ + return tf.compat.v1.image.resize( + image, [height, width], + method=tf.image.ResizeMethod.BILINEAR, + align_corners=False) + + +def preprocess_image(image_buffer, + bbox, + output_height, + output_width, + num_channels, + is_training=False): + """Preprocesses the given image. + + Preprocessing includes decoding, cropping, and resizing for both training + and eval images. Training preprocessing, however, introduces some random + distortion of the image to improve accuracy. + + Args: + image_buffer: scalar string Tensor representing the raw JPEG image buffer. + bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] + where each coordinate is [0, 1) and the coordinates are arranged as [ymin, + xmin, ymax, xmax]. + output_height: The height of the image after preprocessing. + output_width: The width of the image after preprocessing. + num_channels: Integer depth of the image buffer for decoding. + is_training: `True` if we're preprocessing the image for training and + `False` otherwise. + + Returns: + A preprocessed image. + """ + if is_training: + # For training, we want to randomize some of the distortions. + image = _decode_crop_and_flip(image_buffer, bbox, num_channels) + image = _resize_image(image, output_height, output_width) + else: + # For validation, we want to decode, resize, then just crop the middle. + image = tf.image.decode_jpeg(image_buffer, channels=num_channels) + image = _aspect_preserving_resize(image, _RESIZE_MIN) + image = _central_crop(image, output_height, output_width) + + image.set_shape([output_height, output_width, num_channels]) + + return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels) diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/resnet_config.py b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_config.py new file mode 100644 index 000000000..4d04dc6d2 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_config.py @@ -0,0 +1,55 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Configuration definitions for ResNet losses, learning rates, and optimizers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import dataclasses + +from modeling.hyperparams import base_config +from configs import base_configs + + +@dataclasses.dataclass +class ResNetModelConfig(base_configs.ModelConfig): + """Configuration for the ResNet model.""" + name: str = 'ResNet' + num_classes: int = 10 #1000 + model_params: base_config.Config = dataclasses.field( + default_factory=lambda: { + 'num_classes': 10, #1000, + 'batch_size': None, + 'use_l2_regularizer': True, + 'rescale_inputs': False, + }) + loss: base_configs.LossConfig = base_configs.LossConfig( + name='sparse_categorical_crossentropy') + optimizer: base_configs.OptimizerConfig = base_configs.OptimizerConfig( + name='momentum', + decay=0.9, + epsilon=0.001, + momentum=0.9, + moving_average_decay=None) + learning_rate: base_configs.LearningRateConfig = ( + base_configs.LearningRateConfig( + name='stepwise', + initial_lr=0.1, #0.1, + examples_per_epoch=9469, #1281167, + boundaries=[30, 60, 80], + warmup_epochs=5, + scale_by_batch_size=1. / 256., + multipliers=[0.1 / 256, 0.01 / 256, 0.001 / 256, 0.0001 / 256])) diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/resnet_ctl_imagenet_main.py b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_ctl_imagenet_main.py new file mode 100644 index 000000000..055d97ff2 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_ctl_imagenet_main.py @@ -0,0 +1,195 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Runs a ResNet model on the ImageNet dataset using custom training loops.""" + +import math +import os + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import orbit +import tensorflow as tf +from common import distribute_utils +from modeling import performance +from utils.flags import core as flags_core +from utils.misc import keras_utils +from utils.misc import model_helpers +from resnet import common +from resnet import imagenet_preprocessing +from resnet import resnet_runnable + +flags.DEFINE_boolean(name='use_tf_function', default=True, + help='Wrap the train and test step inside a ' + 'tf.function.') +flags.DEFINE_boolean(name='single_l2_loss_op', default=False, + help='Calculate L2_loss on concatenated weights, ' + 'instead of using Keras per-layer L2 loss.') + + +def build_stats(runnable, time_callback): + """Normalizes and returns dictionary of stats. + + Args: + runnable: The module containing all the training and evaluation metrics. + time_callback: Time tracking callback instance. + + Returns: + Dictionary of normalized results. + """ + stats = {} + + if not runnable.flags_obj.skip_eval: + stats['eval_loss'] = runnable.test_loss.result().numpy() + stats['eval_acc'] = runnable.test_accuracy.result().numpy() + + stats['train_loss'] = runnable.train_loss.result().numpy() + stats['train_acc'] = runnable.train_accuracy.result().numpy() + + if time_callback: + timestamp_log = time_callback.timestamp_log + stats['step_timestamp_log'] = timestamp_log + stats['train_finish_time'] = time_callback.train_finish_time + if time_callback.epoch_runtime_log: + stats['avg_exp_per_second'] = time_callback.average_examples_per_second + + return stats + + +def get_num_train_iterations(flags_obj): + """Returns the number of training steps, train and test epochs.""" + train_steps = ( + imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) + train_epochs = flags_obj.train_epochs + + if flags_obj.train_steps: + train_steps = min(flags_obj.train_steps, train_steps) + train_epochs = 1 + + eval_steps = math.ceil(1.0 * imagenet_preprocessing.NUM_IMAGES['validation'] / + flags_obj.batch_size) + + return train_steps, train_epochs, eval_steps + + +def run(flags_obj): + """Run ResNet ImageNet training and eval loop using custom training loops. + + Args: + flags_obj: An object containing parsed flag values. + + Raises: + ValueError: If fp16 is passed as it is not currently supported. + + Returns: + Dictionary of training and eval stats. 
+ """ + keras_utils.set_session_config() + performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj)) + + if tf.config.list_physical_devices('GPU'): + if flags_obj.tf_gpu_thread_mode: + keras_utils.set_gpu_thread_mode_and_count( + per_gpu_thread_count=flags_obj.per_gpu_thread_count, + gpu_thread_mode=flags_obj.tf_gpu_thread_mode, + num_gpus=flags_obj.num_gpus, + datasets_num_private_threads=flags_obj.datasets_num_private_threads) + common.set_cudnn_batchnorm_mode() + + data_format = flags_obj.data_format + if data_format is None: + data_format = ('channels_first' if tf.config.list_physical_devices('GPU') + else 'channels_last') + tf.keras.backend.set_image_data_format(data_format) + + strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=flags_obj.distribution_strategy, + num_gpus=flags_obj.num_gpus, + all_reduce_alg=flags_obj.all_reduce_alg, + num_packs=flags_obj.num_packs, + tpu_address=flags_obj.tpu) + + per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations( + flags_obj) + if flags_obj.steps_per_loop is None: + steps_per_loop = per_epoch_steps + elif flags_obj.steps_per_loop > per_epoch_steps: + steps_per_loop = per_epoch_steps + logging.warn('Setting steps_per_loop to %d to respect epoch boundary.', + steps_per_loop) + else: + steps_per_loop = flags_obj.steps_per_loop + + logging.info( + 'Training %d epochs, each epoch has %d steps, ' + 'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps, + train_epochs * per_epoch_steps, eval_steps) + + time_callback = keras_utils.TimeHistory( + flags_obj.batch_size, + flags_obj.log_steps, + logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None) + with distribute_utils.get_strategy_scope(strategy): + runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback, + per_epoch_steps) + + eval_interval = flags_obj.epochs_between_evals * per_epoch_steps + checkpoint_interval = ( + steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None) + summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None + + checkpoint_manager = tf.train.CheckpointManager( + runnable.checkpoint, + directory=flags_obj.model_dir, + max_to_keep=10, + step_counter=runnable.global_step, + checkpoint_interval=checkpoint_interval) + + resnet_controller = orbit.Controller( + strategy=strategy, + trainer=runnable, + evaluator=runnable if not flags_obj.skip_eval else None, + global_step=runnable.global_step, + steps_per_loop=steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_interval=summary_interval, + summary_dir=flags_obj.model_dir, + eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval')) + + time_callback.on_train_begin() + if not flags_obj.skip_eval: + resnet_controller.train_and_evaluate( + train_steps=per_epoch_steps * train_epochs, + eval_steps=eval_steps, + eval_interval=eval_interval) + else: + resnet_controller.train(steps=per_epoch_steps * train_epochs) + time_callback.on_train_end() + + stats = build_stats(runnable, time_callback) + return stats + + +def main(_): + model_helpers.apply_clean(flags.FLAGS) + stats = run(flags.FLAGS) + logging.info('Run stats:\n%s', stats) + + +if __name__ == '__main__': + logging.set_verbosity(logging.INFO) + common.define_keras_flags() + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/resnet_model.py b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_model.py new file mode 100644 index 000000000..eb998316d --- /dev/null +++ 
b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_model.py @@ -0,0 +1,326 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ResNet50 model for Keras. + +Adapted from tf.keras.applications.resnet50.ResNet50(). +This is ResNet model version 1.5. + +Related papers/blogs: +- https://arxiv.org/abs/1512.03385 +- https://arxiv.org/pdf/1603.05027v2.pdf +- http://torch.ch/blog/2016/02/04/resnets.html + +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from resnet import imagenet_preprocessing + +layers = tf.keras.layers + + +def _gen_l2_regularizer(use_l2_regularizer=True, l2_weight_decay=1e-4): + return tf.keras.regularizers.L2( + l2_weight_decay) if use_l2_regularizer else None + + +def identity_block(input_tensor, + kernel_size, + filters, + stage, + block, + use_l2_regularizer=True, + batch_norm_decay=0.9, + batch_norm_epsilon=1e-5): + """The identity block is the block that has no conv layer at shortcut. + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + batch_norm_decay: Moment of batch norm layers. + batch_norm_epsilon: Epsilon of batch borm layers. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if tf.keras.backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '2c')( + x) + + x = layers.add([x, input_tensor]) + x = layers.Activation('relu')(x) + return x + + +def conv_block(input_tensor, + kernel_size, + filters, + stage, + block, + strides=(2, 2), + use_l2_regularizer=True, + batch_norm_decay=0.9, + batch_norm_epsilon=1e-5): + """A block that has a conv layer at shortcut. + + Note that from stage 3, + the second conv layer at main path is with strides=(2, 2) + And the shortcut should have strides=(2, 2) as well + + Args: + input_tensor: input tensor + kernel_size: default 3, the kernel size of middle conv layer at main path + filters: list of integers, the filters of 3 conv layer at main path + stage: integer, current stage label, used for generating layer names + block: 'a','b'..., current block label, used for generating layer names + strides: Strides for the second conv layer in the block. + use_l2_regularizer: whether to use L2 regularizer on Conv layer. + batch_norm_decay: Moment of batch norm layers. + batch_norm_epsilon: Epsilon of batch borm layers. + + Returns: + Output tensor for the block. 
+ """ + filters1, filters2, filters3 = filters + if tf.keras.backend.image_data_format() == 'channels_last': + bn_axis = 3 + else: + bn_axis = 1 + conv_name_base = 'res' + str(stage) + block + '_branch' + bn_name_base = 'bn' + str(stage) + block + '_branch' + + x = layers.Conv2D( + filters1, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2a')( + input_tensor) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '2a')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters2, + kernel_size, + strides=strides, + padding='same', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2b')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '2b')( + x) + x = layers.Activation('relu')(x) + + x = layers.Conv2D( + filters3, (1, 1), + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '2c')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '2c')( + x) + + shortcut = layers.Conv2D( + filters3, (1, 1), + strides=strides, + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name=conv_name_base + '1')( + input_tensor) + shortcut = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name=bn_name_base + '1')( + shortcut) + + x = layers.add([x, shortcut]) + x = layers.Activation('relu')(x) + return x + + +def resnet50(num_classes, + batch_size=None, + use_l2_regularizer=True, + rescale_inputs=False, + batch_norm_decay=0.9, + batch_norm_epsilon=1e-5): + """Instantiates the ResNet50 architecture. + + Args: + num_classes: `int` number of classes for image classification. + batch_size: Size of the batches for each step. + use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer. + rescale_inputs: whether to rescale inputs from 0 to 1. + batch_norm_decay: Moment of batch norm layers. + batch_norm_epsilon: Epsilon of batch borm layers. + + Returns: + A Keras model instance. + """ + input_shape = (224, 224, 3) + img_input = layers.Input(shape=input_shape, batch_size=batch_size) + if rescale_inputs: + # Hub image modules expect inputs in the range [0, 1]. This rescales these + # inputs to the range expected by the trained model. 
+ x = layers.Lambda( + lambda x: x * 255.0 - tf.keras.backend.constant( # pylint: disable=g-long-lambda + imagenet_preprocessing.CHANNEL_MEANS, + shape=[1, 1, 3], + dtype=x.dtype), + name='rescale')( + img_input) + else: + x = img_input + + if tf.keras.backend.image_data_format() == 'channels_first': + x = layers.Permute((3, 1, 2))(x) + bn_axis = 1 + else: # channels_last + bn_axis = 3 + + block_config = dict( + use_l2_regularizer=use_l2_regularizer, + batch_norm_decay=batch_norm_decay, + batch_norm_epsilon=batch_norm_epsilon) + x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x) + x = layers.Conv2D( + 64, (7, 7), + strides=(2, 2), + padding='valid', + use_bias=False, + kernel_initializer='he_normal', + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='conv1')( + x) + x = layers.BatchNormalization( + axis=bn_axis, + momentum=batch_norm_decay, + epsilon=batch_norm_epsilon, + name='bn_conv1')( + x) + x = layers.Activation('relu')(x) + x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x) + + x = conv_block( + x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), **block_config) + x = identity_block(x, 3, [64, 64, 256], stage=2, block='b', **block_config) + x = identity_block(x, 3, [64, 64, 256], stage=2, block='c', **block_config) + + x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', **block_config) + x = identity_block(x, 3, [128, 128, 512], stage=3, block='b', **block_config) + x = identity_block(x, 3, [128, 128, 512], stage=3, block='c', **block_config) + x = identity_block(x, 3, [128, 128, 512], stage=3, block='d', **block_config) + + x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', **block_config) + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b', **block_config) + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='c', **block_config) + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='d', **block_config) + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='e', **block_config) + x = identity_block(x, 3, [256, 256, 1024], stage=4, block='f', **block_config) + + x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', **block_config) + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b', **block_config) + x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c', **block_config) + + x = layers.GlobalAveragePooling2D()(x) + x = layers.Dense( + num_classes, + kernel_initializer=tf.compat.v1.keras.initializers.random_normal( + stddev=0.01), + kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer), + bias_regularizer=_gen_l2_regularizer(use_l2_regularizer), + name='fc1000')( + x) + + # A softmax that is followed by the model loss must be done cannot be done + # in float16 due to numeric issues. So we pass dtype=float32. + x = layers.Activation('softmax', dtype='float32')(x) + + # Create model. + return tf.keras.Model(img_input, x, name='resnet50') diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/resnet_runnable.py b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_runnable.py new file mode 100644 index 000000000..6aff91199 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/resnet_runnable.py @@ -0,0 +1,216 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Runs a ResNet model on the ImageNet dataset using custom training loops.""" + +import orbit +import tensorflow as tf + +from modeling import performance +from staging.training import grad_utils +from utils.flags import core as flags_core +from resnet import common +from resnet import imagenet_preprocessing +from resnet import resnet_model + + +class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator): + """Implements the training and evaluation APIs for Resnet model.""" + + def __init__(self, flags_obj, time_callback, epoch_steps): + self.strategy = tf.distribute.get_strategy() + self.flags_obj = flags_obj + self.dtype = flags_core.get_tf_dtype(flags_obj) + self.time_callback = time_callback + + # Input pipeline related + batch_size = flags_obj.batch_size + if batch_size % self.strategy.num_replicas_in_sync != 0: + raise ValueError( + 'Batch size must be divisible by number of replicas : {}'.format( + self.strategy.num_replicas_in_sync)) + + # As auto rebatching is not supported in + # `distribute_datasets_from_function()` API, which is + # required when cloning dataset to multiple workers in eager mode, + # we use per-replica batch size. + self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync) + + if self.flags_obj.use_synthetic_data: + self.input_fn = common.get_synth_input_fn( + height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, + width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, + num_channels=imagenet_preprocessing.NUM_CHANNELS, + num_classes=imagenet_preprocessing.NUM_CLASSES, + dtype=self.dtype, + drop_remainder=True) + else: + self.input_fn = imagenet_preprocessing.input_fn + + self.model = resnet_model.resnet50( + num_classes=imagenet_preprocessing.NUM_CLASSES, + use_l2_regularizer=not flags_obj.single_l2_loss_op) + + lr_schedule = common.PiecewiseConstantDecayWithWarmup( + batch_size=flags_obj.batch_size, + epoch_size=imagenet_preprocessing.NUM_IMAGES['train'], + warmup_epochs=common.LR_SCHEDULE[0][1], + boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), + multipliers=list(p[0] for p in common.LR_SCHEDULE), + compute_lr_on_cpu=True) + self.optimizer = common.get_optimizer(lr_schedule) + # Make sure iterations variable is created inside scope. 
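+    # optimizer.iterations doubles as the global step: it is incremented on
+    # every apply_gradients() call and is the counter tracked by the
+    # orbit.Controller and the CheckpointManager in resnet_ctl_imagenet_main.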
+ self.global_step = self.optimizer.iterations + + use_graph_rewrite = flags_obj.fp16_implementation == 'graph_rewrite' + if use_graph_rewrite and not flags_obj.use_tf_function: + raise ValueError('--fp16_implementation=graph_rewrite requires ' + '--use_tf_function to be true') + self.optimizer = performance.configure_optimizer( + self.optimizer, + use_float16=self.dtype == tf.float16, + use_graph_rewrite=use_graph_rewrite, + loss_scale=flags_core.get_loss_scale(flags_obj, default_for_fp16=128)) + + self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32) + self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + 'train_accuracy', dtype=tf.float32) + self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32) + self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy( + 'test_accuracy', dtype=tf.float32) + + self.checkpoint = tf.train.Checkpoint( + model=self.model, optimizer=self.optimizer) + + # Handling epochs. + self.epoch_steps = epoch_steps + self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step) + train_dataset = orbit.utils.make_distributed_dataset( + self.strategy, + self.input_fn, + is_training=True, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + parse_record_fn=imagenet_preprocessing.parse_record, + datasets_num_private_threads=self.flags_obj + .datasets_num_private_threads, + dtype=self.dtype, + drop_remainder=True) + orbit.StandardTrainer.__init__( + self, + train_dataset, + options=orbit.StandardTrainerOptions( + use_tf_while_loop=flags_obj.use_tf_while_loop, + use_tf_function=flags_obj.use_tf_function)) + if not flags_obj.skip_eval: + eval_dataset = orbit.utils.make_distributed_dataset( + self.strategy, + self.input_fn, + is_training=False, + data_dir=self.flags_obj.data_dir, + batch_size=self.batch_size, + parse_record_fn=imagenet_preprocessing.parse_record, + dtype=self.dtype) + orbit.StandardEvaluator.__init__( + self, + eval_dataset, + options=orbit.StandardEvaluatorOptions( + use_tf_function=flags_obj.use_tf_function)) + + def train_loop_begin(self): + """See base class.""" + # Reset all metrics + self.train_loss.reset_states() + self.train_accuracy.reset_states() + + self._epoch_begin() + self.time_callback.on_batch_begin(self.epoch_helper.batch_index) + + def train_step(self, iterator): + """See base class.""" + + def step_fn(inputs): + """Function to run on the device.""" + images, labels = inputs + with tf.GradientTape() as tape: + logits = self.model(images, training=True) + + prediction_loss = tf.keras.losses.sparse_categorical_crossentropy( + labels, logits) + loss = tf.reduce_sum(prediction_loss) * (1.0 / + self.flags_obj.batch_size) + num_replicas = self.strategy.num_replicas_in_sync + l2_weight_decay = 1e-4 + if self.flags_obj.single_l2_loss_op: + l2_loss = l2_weight_decay * 2 * tf.add_n([ + tf.nn.l2_loss(v) + for v in self.model.trainable_variables + if 'bn' not in v.name + ]) + + loss += (l2_loss / num_replicas) + else: + loss += (tf.reduce_sum(self.model.losses) / num_replicas) + + grad_utils.minimize_using_explicit_allreduce( + tape, self.optimizer, loss, self.model.trainable_variables) + self.train_loss.update_state(loss) + self.train_accuracy.update_state(labels, logits) + if self.flags_obj.enable_xla: + step_fn = tf.function(step_fn, jit_compile=True) + self.strategy.run(step_fn, args=(next(iterator),)) + + def train_loop_end(self): + """See base class.""" + metrics = { + 'train_loss': self.train_loss.result(), + 'train_accuracy': self.train_accuracy.result(), + } + 
self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1) + self._epoch_end() + return metrics + + def eval_begin(self): + """See base class.""" + self.test_loss.reset_states() + self.test_accuracy.reset_states() + + def eval_step(self, iterator): + """See base class.""" + + def step_fn(inputs): + """Function to run on the device.""" + images, labels = inputs + logits = self.model(images, training=False) + loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits) + loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size) + self.test_loss.update_state(loss) + self.test_accuracy.update_state(labels, logits) + + self.strategy.run(step_fn, args=(next(iterator),)) + + def eval_end(self): + """See base class.""" + return { + 'test_loss': self.test_loss.result(), + 'test_accuracy': self.test_accuracy.result() + } + + def _epoch_begin(self): + if self.epoch_helper.epoch_begin(): + self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch) + + def _epoch_end(self): + if self.epoch_helper.epoch_end(): + self.time_callback.on_epoch_end(self.epoch_helper.current_epoch) diff --git a/cv/classification/resnet50/tensorflow2.0/resnet/tfhub_export.py b/cv/classification/resnet50/tensorflow2.0/resnet/tfhub_export.py new file mode 100644 index 000000000..d65179895 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/resnet/tfhub_export.py @@ -0,0 +1,67 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A script to export TF-Hub SavedModel.""" + +from __future__ import absolute_import +from __future__ import division +# from __future__ import google_type_annotations +from __future__ import print_function + +import os + +# Import libraries +from absl import app +from absl import flags + +import tensorflow as tf + +from resnet import imagenet_preprocessing +from resnet import resnet_model + +FLAGS = flags.FLAGS + +flags.DEFINE_string("model_path", None, + "File path to TF model checkpoint or H5 file.") +flags.DEFINE_string("export_path", None, + "TF-Hub SavedModel destination path to export.") + + +def export_tfhub(model_path, hub_destination): + """Restores a tf.keras.Model and saves for TF-Hub.""" + model = resnet_model.resnet50( + num_classes=imagenet_preprocessing.NUM_CLASSES, rescale_inputs=True) + model.load_weights(model_path) + model.save( + os.path.join(hub_destination, "classification"), include_optimizer=False) + + # Extracts a sub-model to use pooling feature vector as model output. + image_input = model.get_layer(index=0).get_output_at(0) + feature_vector_output = model.get_layer(name="reduce_mean").get_output_at(0) + hub_model = tf.keras.Model(image_input, feature_vector_output) + + # Exports a SavedModel. 
+ hub_model.save( + os.path.join(hub_destination, "feature-vector"), include_optimizer=False) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + export_tfhub(FLAGS.model_path, FLAGS.export_path) + + +if __name__ == "__main__": + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/run_train_resnet50_mirrored_imagenette.sh b/cv/classification/resnet50/tensorflow2.0/run_train_resnet50_mirrored_imagenette.sh new file mode 100644 index 000000000..6f043c592 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/run_train_resnet50_mirrored_imagenette.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +: ${IX_NUM_CUDA_VISIBLE_DEVICES:=2} +: ${DATA_DIR:="imagenette"} + +: ${BATCH_SIZE:=32} +LOG_DIR="logs/resnet50" +BASE_DIR="./out_model" +MODEL_DIR=${BASE_DIR}/resnet50 +WORK_PATH=$(dirname $(readlink -f $0)) +OFFICALPATH=$WORK_PATH/../../../ + +export PYTHONPATH=$OFFICALPATH:$PYTHONPATH + + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +if [ ! -d "${LOG_DIR}" ]; then + mkdir -p ${LOG_DIR} +fi + +if [ ! -d "${BASE_DIR}" ]; then + mkdir -p ${BASE_DIR} +fi + +rm -rf ${MODEL_DIR} + + +python3 classifier_trainer.py \ + --mode=train_and_eval \ + --model_type=resnet \ + --dataset=imagenet \ + --model_dir=${MODEL_DIR} \ + --data_dir=${DATA_DIR} \ + --config_file=configs/examples/resnet/imagenet/gpu_mirrored.yaml \ + --params_override="train_dataset.batch_size=${BATCH_SIZE}" \ + --params_override="runtime.num_gpus=${IX_NUM_CUDA_VISIBLE_DEVICES}$*"; check_status + +exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow2.0/run_train_resnet50_worker_mirrored_imagenette.sh b/cv/classification/resnet50/tensorflow2.0/run_train_resnet50_worker_mirrored_imagenette.sh new file mode 100644 index 000000000..9c6b1e91d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/run_train_resnet50_worker_mirrored_imagenette.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +LOG_DIR="logs/resnet50" +DATA_DIR="imagenette" +BASE_DIR="./out_model" +MODEL_DIR=${BASE_DIR}/resnet50 +DATE=`date +%Y%m%d%H%M%S` +WORK_PATH=$(dirname $(readlink -f $0)) +OFFICALPATH=$WORK_PATH/../../../ + +export PYTHONPATH=$OFFICALPATH:$PYTHONPATH + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +if [ ! -d "${LOG_DIR}" ]; then + mkdir -p ${LOG_DIR} +fi + +if [ ! -d "${BASE_DIR}" ]; then + mkdir -p ${BASE_DIR} +fi +rm -rf ${MODEL_DIR} + + + +for index in 0 1 +do + export CUDA_VISIBLE_DEVICES=${index} + time python3 classifier_trainer.py \ + --mode=train_and_eval \ + --model_type=resnet \ + --dataset=imagenet \ + --model_dir=${MODEL_DIR} \ + --data_dir=${DATA_DIR} \ + --config_file=configs/examples/resnet/imagenet/gpu_multi_worker_mirrored.yaml \ + --params_override='runtime.num_gpus=2, runtime.task_index='${index}'' 2>&1 | tee ${LOG_DIR}/${DATE}_${index}.log [[ ${PIPESTATUS[0]} == 0 ]] || exit & +done + +wait +if [ ! -f "compare_kv.py" -o ! -f "get_key_value.py" ]; then + ./download_script.sh + if [[ $? 
!= 0 ]]; then + echo "ERROR: download scripts failed" + exit 1 + fi +fi +echo ${DATE} +python3 get_key_value.py -i ${LOG_DIR}/${DATE}_0.log -k 'val_accuracy: ' 'val_top_5_accuracy: ' -o train_resnet50_worker_mirrored_bi.json +python3 compare_kv.py -b train_resnet50_worker_mirrored_bi.json -n train_resnet50_worker_mirrored_nv.json -i 'val_accuracy: ' 'val_top_5_accuracy: '; check_status +exit ${EXIT_STATUS} diff --git a/cv/classification/resnet50/tensorflow2.0/staging/__init__.py b/cv/classification/resnet50/tensorflow2.0/staging/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/staging/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/staging/training/__init__.py b/cv/classification/resnet50/tensorflow2.0/staging/training/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/staging/training/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/staging/training/grad_utils.py b/cv/classification/resnet50/tensorflow2.0/staging/training/grad_utils.py new file mode 100644 index 000000000..1113d39d5 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/staging/training/grad_utils.py @@ -0,0 +1,151 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Some gradient util functions to help users writing custom training loop.""" + +from absl import logging + +import tensorflow as tf + + +def _filter_grads(grads_and_vars): + """Filter out iterable with grad equal to None.""" + grads_and_vars = tuple(grads_and_vars) + if not grads_and_vars: + return grads_and_vars + filtered = [] + vars_with_empty_grads = [] + for grad, var in grads_and_vars: + if grad is None: + vars_with_empty_grads.append(var) + else: + filtered.append((grad, var)) + filtered = tuple(filtered) + if not filtered: + raise ValueError("No gradients provided for any variable: %s." % + ([v.name for _, v in grads_and_vars],)) + if vars_with_empty_grads: + logging.warning( + ("Gradients do not exist for variables %s when minimizing the loss."), + ([v.name for v in vars_with_empty_grads])) + return filtered + + +def _filter_and_allreduce_gradients(grads_and_vars, + allreduce_precision="float32", + bytes_per_pack=0): + """Filter None grads and then allreduce gradients in specified precision. + + This utils function is used when users intent to explicitly allreduce + gradients and customize gradients operations before and after allreduce. + The allreduced gradients are then passed to optimizer.apply_gradients( + experimental_aggregate_gradients=False). + + Args: + grads_and_vars: gradients and variables pairs. + allreduce_precision: Whether to allreduce gradients in float32 or float16. + bytes_per_pack: A non-negative integer. Breaks collective operations into + packs of certain size. If it's zero, all gradients are in one pack. + + Returns: + pairs of allreduced non-None gradients and variables. + """ + filtered_grads_and_vars = _filter_grads(grads_and_vars) + (grads, variables) = zip(*filtered_grads_and_vars) + if allreduce_precision == "float16": + grads = [tf.cast(grad, "float16") for grad in grads] + hints = tf.distribute.experimental.CommunicationOptions( + bytes_per_pack=bytes_per_pack) + allreduced_grads = tf.distribute.get_strategy( # pylint: disable=protected-access + ).extended._replica_ctx_all_reduce(tf.distribute.ReduceOp.SUM, grads, hints) + if allreduce_precision == "float16": + allreduced_grads = [tf.cast(grad, "float32") for grad in allreduced_grads] + return allreduced_grads, variables + + +def _run_callbacks(callbacks, grads_and_vars): + for callback in callbacks: + grads_and_vars = callback(grads_and_vars) + return grads_and_vars + + +def minimize_using_explicit_allreduce(tape, + optimizer, + loss, + trainable_variables, + pre_allreduce_callbacks=None, + post_allreduce_callbacks=None, + allreduce_bytes_per_pack=0): + """Minimizes loss for one step by updating `trainable_variables`. + + Minimizes loss for one step by updating `trainable_variables`. + This explicitly performs gradient allreduce, instead of relying on implicit + allreduce in optimizer.apply_gradients(). If training using FP16 mixed + precision, explicit allreduce will aggregate gradients in FP16 format. + For TPU and GPU training using FP32, explicit allreduce will aggregate + gradients in FP32 format. + + Args: + tape: An instance of `tf.GradientTape`. + optimizer: An instance of `tf.keras.optimizers.Optimizer`. + loss: the loss tensor. + trainable_variables: A list of model Variables. + pre_allreduce_callbacks: A list of callback functions that takes gradients + and model variables pairs as input, manipulate them, and returns a new + gradients and model variables pairs. The callback functions will be + invoked in the list order and before gradients are allreduced. 
With + mixed precision training, the pre_allreduce_callbacks will be applied on + scaled_gradients. Default is no callbacks. + post_allreduce_callbacks: A list of callback functions that takes + gradients and model variables pairs as input, manipulate them, and + returns a new gradients and model variables pairs. The callback + functions will be invoked in the list order and right before gradients + are applied to variables for updates. Default is no callbacks. + allreduce_bytes_per_pack: A non-negative integer. Breaks collective + operations into packs of certain size. If it's zero, all gradients are + in one pack. + """ + if isinstance(optimizer, + tf.keras.mixed_precision.LossScaleOptimizer): + # FP16 GPU code path + with tape: + scaled_loss = optimizer.get_scaled_loss(loss) + scaled_grads = tape.gradient(scaled_loss, trainable_variables) + grads_and_vars = zip(scaled_grads, trainable_variables) + if pre_allreduce_callbacks: + grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars) + (allreduced_scaled_grads, + filtered_training_vars) = _filter_and_allreduce_gradients( + grads_and_vars, + allreduce_precision="float16", + bytes_per_pack=allreduce_bytes_per_pack) + allreduced_unscaled_grads = optimizer.get_unscaled_gradients( + allreduced_scaled_grads) + grads_and_vars = zip(allreduced_unscaled_grads, filtered_training_vars) + else: + # TPU or FP32 GPU code path + grads = tape.gradient(loss, trainable_variables) + grads_and_vars = zip(grads, trainable_variables) + if pre_allreduce_callbacks: + grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars) + (allreduced_grads, + filtered_training_vars) = _filter_and_allreduce_gradients( + grads_and_vars, + allreduce_precision="float32", + bytes_per_pack=allreduce_bytes_per_pack) + grads_and_vars = zip(allreduced_grads, filtered_training_vars) + if post_allreduce_callbacks: + grads_and_vars = _run_callbacks(post_allreduce_callbacks, grads_and_vars) + optimizer.apply_gradients( + grads_and_vars, experimental_aggregate_gradients=False) diff --git a/cv/classification/resnet50/tensorflow2.0/test_utils.py b/cv/classification/resnet50/tensorflow2.0/test_utils.py new file mode 100644 index 000000000..8d7180c9d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/test_utils.py @@ -0,0 +1,37 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
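The `minimize_using_explicit_allreduce` helper documented above is the routine that `ResnetRunnable.train_step` delegates to. For orientation only, here is a minimal sketch of how it is typically driven from a custom, strategy-distributed step function; the `model`, `optimizer`, iterator and batch-size names below are illustrative placeholders, not part of this patch.

```
import tensorflow as tf

from staging.training import grad_utils


def distributed_train_step(strategy, model, optimizer, iterator,
                           global_batch_size):
  """One training step with explicit gradient allreduce (illustrative only)."""

  def step_fn(inputs):
    images, labels = inputs
    with tf.GradientTape() as tape:
      logits = model(images, training=True)
      per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
          labels, logits)
      # Scale by the global batch size so the allreduced sum matches the
      # single-worker loss.
      loss = tf.reduce_sum(per_example_loss) * (1.0 / global_batch_size)
    # The helper allreduces the gradients itself and then calls
    # apply_gradients(..., experimental_aggregate_gradients=False), so the
    # optimizer does not aggregate them a second time.
    grad_utils.minimize_using_explicit_allreduce(
        tape, optimizer, loss, model.trainable_variables)

  strategy.run(step_fn, args=(next(iterator),))
```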
+ +"""Test utilities for image classification tasks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +def trivial_model(num_classes): + """Trivial model for ImageNet dataset.""" + + input_shape = (224, 224, 3) + img_input = tf.keras.layers.Input(shape=input_shape) + + x = tf.keras.layers.Lambda( + lambda x: tf.keras.backend.reshape(x, [-1, 224 * 224 * 3]), + name='reshape')(img_input) + x = tf.keras.layers.Dense(1, name='fc1')(x) + x = tf.keras.layers.Dense(num_classes, name='fc1000')(x) + x = tf.keras.layers.Activation('softmax', dtype='float32')(x) + + return tf.keras.models.Model(img_input, x, name='trivial') diff --git a/cv/classification/resnet50/tensorflow2.0/utils/__init__.py b/cv/classification/resnet50/tensorflow2.0/utils/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/utils/docs/build_api_docs_lib.py b/cv/classification/resnet50/tensorflow2.0/utils/docs/build_api_docs_lib.py new file mode 100644 index 000000000..0bff8b011 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/docs/build_api_docs_lib.py @@ -0,0 +1,54 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Common library for API docs builder.""" + +import tensorflow as tf +from tensorflow_docs.api_generator import doc_controls + + +def hide_module_model_and_layer_methods(): + """Hide methods and properties defined in the base classes of Keras layers. + + We hide all methods and properties of the base classes, except: + - `__init__` is always documented. + - `call` is always documented, as it can carry important information for + complex layers. + """ + module_contents = list(tf.Module.__dict__.items()) + model_contents = list(tf.keras.Model.__dict__.items()) + layer_contents = list(tf.keras.layers.Layer.__dict__.items()) + + for name, obj in module_contents + layer_contents + model_contents: + if name == '__init__': + # Always document __init__. + continue + + if name == 'call': + # Always document `call`. 
+ if hasattr(obj, doc_controls._FOR_SUBCLASS_IMPLEMENTERS): # pylint: disable=protected-access + delattr(obj, doc_controls._FOR_SUBCLASS_IMPLEMENTERS) # pylint: disable=protected-access + continue + + # Otherwise, exclude from documentation. + if isinstance(obj, property): + obj = obj.fget + + if isinstance(obj, (staticmethod, classmethod)): + obj = obj.__func__ + + try: + doc_controls.do_not_doc_in_subclasses(obj) + except AttributeError: + pass diff --git a/cv/classification/resnet50/tensorflow2.0/utils/docs/build_nlp_api_docs.py b/cv/classification/resnet50/tensorflow2.0/utils/docs/build_nlp_api_docs.py new file mode 100644 index 000000000..25e3dda02 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/docs/build_nlp_api_docs.py @@ -0,0 +1,95 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Tool to generate api_docs for tensorflow_models/official library. + +Example: + +$> pip install -U git+https://github.com/tensorflow/docs +$> python build_nlp_api_docs \ + --output_dir=/tmp/api_docs +""" + +import os + +from absl import app +from absl import flags +from absl import logging +from tensorflow_docs.api_generator import generate_lib +from tensorflow_docs.api_generator import public_api + +from official.nlp import modeling as tfnlp +import build_api_docs_lib + +FLAGS = flags.FLAGS + +flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.') +flags.DEFINE_string( + 'code_url_prefix', + 'https://github.com/tensorflow/models/blob/master/official/nlp/modeling/', + 'The url prefix for links to code.') + +flags.DEFINE_bool('search_hints', True, + 'Include metadata search hints in the generated files') + +flags.DEFINE_string('site_path', '/api_docs/python', + 'Path prefix in the _toc.yaml') + +flags.DEFINE_bool('gen_report', False, + 'Generate an API report containing the health of the ' + 'docstrings of the public API.') + +PROJECT_SHORT_NAME = 'tfnlp' +PROJECT_FULL_NAME = 'TensorFlow Official Models - NLP Modeling Library' + + +def gen_api_docs(code_url_prefix, site_path, output_dir, gen_report, + project_short_name, project_full_name, search_hints): + """Generates api docs for the tensorflow docs package.""" + build_api_docs_lib.hide_module_model_and_layer_methods() + del tfnlp.layers.MultiHeadAttention + del tfnlp.layers.EinsumDense + + doc_generator = generate_lib.DocGenerator( + root_title=project_full_name, + py_modules=[(project_short_name, tfnlp)], + base_dir=os.path.dirname(tfnlp.__file__), + code_url_prefix=code_url_prefix, + search_hints=search_hints, + site_path=site_path, + gen_report=gen_report, + callbacks=[public_api.explicit_package_contents_filter], + ) + + doc_generator.build(output_dir) + logging.info('Output docs to: %s', output_dir) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + gen_api_docs( + code_url_prefix=FLAGS.code_url_prefix, + site_path=FLAGS.site_path, + output_dir=FLAGS.output_dir, + 
gen_report=FLAGS.gen_report, + project_short_name=PROJECT_SHORT_NAME, + project_full_name=PROJECT_FULL_NAME, + search_hints=FLAGS.search_hints) + + +if __name__ == '__main__': + flags.mark_flag_as_required('output_dir') + app.run(main) diff --git a/cv/classification/resnet50/tensorflow2.0/utils/docs/build_vision_api_docs.py b/cv/classification/resnet50/tensorflow2.0/utils/docs/build_vision_api_docs.py new file mode 100644 index 000000000..095e04c69 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/docs/build_vision_api_docs.py @@ -0,0 +1,93 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Tool to generate api_docs for tensorflow_models/official library. + +Example: + +$> pip install -U git+https://github.com/tensorflow/docs +$> python build_vision_api_docs \ + --output_dir=/tmp/api_docs +""" + +import os + +from absl import app +from absl import flags +from absl import logging +from tensorflow_docs.api_generator import generate_lib +from tensorflow_docs.api_generator import public_api + +import build_api_docs_lib +from official.vision.beta import modeling as tfvision + +FLAGS = flags.FLAGS + +flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.') +flags.DEFINE_string( + 'code_url_prefix', + 'https://github.com/tensorflow/models/blob/master/official/vision/beta/modeling/', + 'The url prefix for links to code.') + +flags.DEFINE_bool('search_hints', True, + 'Include metadata search hints in the generated files') + +flags.DEFINE_string('site_path', 'tfvision/api_docs/python', + 'Path prefix in the _toc.yaml') + +flags.DEFINE_bool('gen_report', False, + 'Generate an API report containing the health of the ' + 'docstrings of the public API.') + +PROJECT_SHORT_NAME = 'tfvision' +PROJECT_FULL_NAME = 'TensorFlow Official Models - Vision Modeling Library' + + +def gen_api_docs(code_url_prefix, site_path, output_dir, gen_report, + project_short_name, project_full_name, search_hints): + """Generates api docs for the tensorflow docs package.""" + build_api_docs_lib.hide_module_model_and_layer_methods() + + doc_generator = generate_lib.DocGenerator( + root_title=project_full_name, + py_modules=[(project_short_name, tfvision)], + base_dir=os.path.dirname(tfvision.__file__), + code_url_prefix=code_url_prefix, + search_hints=search_hints, + site_path=site_path, + gen_report=gen_report, + callbacks=[public_api.explicit_package_contents_filter], + ) + + doc_generator.build(output_dir) + logging.info('Output docs to: %s', output_dir) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + gen_api_docs( + code_url_prefix=FLAGS.code_url_prefix, + site_path=FLAGS.site_path, + output_dir=FLAGS.output_dir, + gen_report=FLAGS.gen_report, + project_short_name=PROJECT_SHORT_NAME, + project_full_name=PROJECT_FULL_NAME, + search_hints=FLAGS.search_hints) + + +if __name__ == '__main__': + flags.mark_flag_as_required('output_dir') + app.run(main) diff --git 
a/cv/classification/resnet50/tensorflow2.0/utils/flags/README.md b/cv/classification/resnet50/tensorflow2.0/utils/flags/README.md new file mode 100644 index 000000000..beb3b2a1e --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/README.md @@ -0,0 +1,102 @@ +# Adding Abseil (absl) flags quickstart + +**WARNING** This module is deprecated. We no longer use it in new models and +your projects should not depend on it. We will remove this module when +all models using it are deprecated, which may take time. + +## Defining a flag +absl flag definitions are similar to argparse, although they are defined on a global namespace. + +For instance, defining a string flag looks like: +```$xslt +from absl import flags +flags.DEFINE_string( + name="my_flag", + default="a_sensible_default", + help="Here is what this flag does." +) +``` + +All three arguments are required, but default may be `None`. A common optional argument is +short_name for defining abbreviations. Certain `DEFINE_*` methods will have other required arguments. +For instance `DEFINE_enum` requires the `enum_values` argument to be specified. + +## Key Flags +absl has the concept of a key flag. Any flag defined in `__main__` is considered a key flag by +default. Key flags are displayed in `--help`, others only appear in `--helpfull`. In order to +handle key flags that are defined outside the module in question, absl provides the +`flags.adopt_module_key_flags()` method. This adds the key flags of a different module to one's own +key flags. For example: +```$xslt +File: flag_source.py +--------------------------------------- + +from absl import flags +flags.DEFINE_string(name="my_flag", default="abc", help="a flag.") +``` + +```$xslt +File: my_module.py +--------------------------------------- + +from absl import app as absl_app +from absl import flags + +import flag_source + +flags.adopt_module_key_flags(flag_source) + +def main(_): + pass + +absl_app.run(main, [__file__, "-h"]) +``` + +When `my_module.py` is run, it will show the help text for `my_flag`. Because not all flags defined +in a file are equally important, `official/utils/flags/core.py` (generally imported as flags_core) +provides an abstraction for handling key flag declaration in an easy way through the +`register_key_flags_in_core()` function, which allows a module to make a single +`adopt_module_key_flags(flags_core)` call when using the util flag declaration functions. + +## Validators +Often the constraints on a flag are complicated. absl provides the validator decorator to allow +one to mark a function as a flag validation function. Suppose we want users to provide a flag +which is a palindrome. + +```$xslt +from absl import flags + +flags.DEFINE_string(name="pal_flag", short_name="pf", default="", help="Give me a palindrome") + +@flags.validator("pal_flag") +def _check_pal(provided_pal_flag): + return provided_pal_flag == provided_pal_flag[::-1] + +``` + +Validators take the form that returning True (truthy) passes, and all others +(False, None, exception) fail. + +## Testing +To test using absl, simply declare flags in the setUpClass method of TensorFlow's TestCase.
+ +```$xslt +from absl import flags +import tensorflow as tf + +def define_flags(): + flags.DEFINE_string(name="test_flag", default="abc", help="an example flag") + + +class BaseTester(tf.test.TestCase): + + @classmethod + def setUpClass(cls): + super(BaseTester, cls).setUpClass() + define_flags() + + def test_trivial(self): + flags_core.parse_flags([__file__, "--test_flag=def"]) + self.assertEqual(flags.FLAGS.test_flag, "def") + +``` diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/__init__.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_base.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_base.py new file mode 100644 index 000000000..491300e42 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_base.py @@ -0,0 +1,177 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags which will be nearly universal across models.""" + +from absl import flags +import tensorflow as tf +from utils.flags._conventions import help_wrap + + +def define_base(data_dir=True, + model_dir=True, + clean=False, + train_epochs=False, + epochs_between_evals=False, + stop_threshold=False, + batch_size=True, + num_gpu=False, + hooks=False, + export_dir=False, + distribution_strategy=False, + run_eagerly=False): + """Register base flags. + + Args: + data_dir: Create a flag for specifying the input data directory. + model_dir: Create a flag for specifying the model file directory. + clean: Create a flag for removing the model_dir. + train_epochs: Create a flag to specify the number of training epochs. + epochs_between_evals: Create a flag to specify the frequency of testing. + stop_threshold: Create a flag to specify a threshold accuracy or other eval + metric which should trigger the end of training. + batch_size: Create a flag to specify the batch size. + num_gpu: Create a flag to specify the number of GPUs used. + hooks: Create a flag to specify hooks for logging. + export_dir: Create a flag to specify where a SavedModel should be exported. + distribution_strategy: Create a flag to specify which Distribution Strategy + to use.
+ run_eagerly: Create a flag to specify to run eagerly op by op. + + Returns: + A list of flags for core.py to marks as key flags. + """ + key_flags = [] + + if data_dir: + flags.DEFINE_string( + name="data_dir", + short_name="dd", + default="/tmp", + help=help_wrap("The location of the input data.")) + key_flags.append("data_dir") + + if model_dir: + flags.DEFINE_string( + name="model_dir", + short_name="md", + default="/tmp", + help=help_wrap("The location of the model checkpoint files.")) + key_flags.append("model_dir") + + if clean: + flags.DEFINE_boolean( + name="clean", + default=False, + help=help_wrap("If set, model_dir will be removed if it exists.")) + key_flags.append("clean") + + if train_epochs: + flags.DEFINE_integer( + name="train_epochs", + short_name="te", + default=1, + help=help_wrap("The number of epochs used to train.")) + key_flags.append("train_epochs") + + if epochs_between_evals: + flags.DEFINE_integer( + name="epochs_between_evals", + short_name="ebe", + default=1, + help=help_wrap("The number of training epochs to run between " + "evaluations.")) + key_flags.append("epochs_between_evals") + + if stop_threshold: + flags.DEFINE_float( + name="stop_threshold", + short_name="st", + default=None, + help=help_wrap("If passed, training will stop at the earlier of " + "train_epochs and when the evaluation metric is " + "greater than or equal to stop_threshold.")) + + if batch_size: + flags.DEFINE_integer( + name="batch_size", + short_name="bs", + default=32, + help=help_wrap("Batch size for training and evaluation. When using " + "multiple gpus, this is the global batch size for " + "all devices. For example, if the batch size is 32 " + "and there are 4 GPUs, each GPU will get 8 examples on " + "each step.")) + key_flags.append("batch_size") + + if num_gpu: + flags.DEFINE_integer( + name="num_gpus", + short_name="ng", + default=1, + help=help_wrap("How many GPUs to use at each worker with the " + "DistributionStrategies API. The default is 1.")) + + if run_eagerly: + flags.DEFINE_boolean( + name="run_eagerly", + default=False, + help="Run the model op by op without building a model function.") + + if hooks: + flags.DEFINE_list( + name="hooks", + short_name="hk", + default="LoggingTensorHook", + help=help_wrap( + u"A list of (case insensitive) strings to specify the names of " + u"training hooks. Example: `--hooks ProfilerHook," + u"ExamplesPerSecondHook`\n See hooks_helper " + u"for details.")) + key_flags.append("hooks") + + if export_dir: + flags.DEFINE_string( + name="export_dir", + short_name="ed", + default=None, + help=help_wrap("If set, a SavedModel serialization of the model will " + "be exported to this directory at the end of training. " + "See the README for more details and relevant links.")) + key_flags.append("export_dir") + + if distribution_strategy: + flags.DEFINE_string( + name="distribution_strategy", + short_name="ds", + default="mirrored", + help=help_wrap("The Distribution Strategy to use for training. " + "Accepted values are 'off', 'one_device', " + "'mirrored', 'parameter_server', 'collective', " + "case insensitive. 
'off' means not to use " + "Distribution Strategy; 'default' means to choose " + "from `MirroredStrategy` or `OneDeviceStrategy` " + "according to the number of GPUs.")) + + return key_flags + + +def get_num_gpus(flags_obj): + """Treat num_gpus=-1 as 'use all'.""" + if flags_obj.num_gpus != -1: + return flags_obj.num_gpus + + from tensorflow.python.client import device_lib # pylint: disable=g-import-not-at-top + local_device_protos = device_lib.list_local_devices() + return sum([1 for d in local_device_protos if d.device_type == "GPU"]) diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_benchmark.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_benchmark.py new file mode 100644 index 000000000..66ddefc05 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_benchmark.py @@ -0,0 +1,117 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags for benchmarking models.""" + +from absl import flags + +from utils.flags._conventions import help_wrap + + +def define_log_steps(): + flags.DEFINE_integer( + name="log_steps", + default=100, + help="Frequency with which to log timing information with TimeHistory.") + + return [] + + +def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True): + """Register benchmarking flags. + + Args: + benchmark_log_dir: Create a flag to specify location for benchmark logging. + bigquery_uploader: Create flags for uploading results to BigQuery. + + Returns: + A list of flags for core.py to marks as key flags. + """ + + key_flags = [] + + flags.DEFINE_enum( + name="benchmark_logger_type", + default="BaseBenchmarkLogger", + enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger"], + help=help_wrap("The type of benchmark logger to use. Defaults to using " + "BaseBenchmarkLogger which logs to STDOUT. Different " + "loggers will require other flags to be able to work.")) + flags.DEFINE_string( + name="benchmark_test_id", + short_name="bti", + default=None, + help=help_wrap("The unique test ID of the benchmark run. It could be the " + "combination of key parameters. It is hardware " + "independent and could be used compare the performance " + "between different test runs. 
This flag is designed for " + "human consumption, and does not have any impact within " + "the system.")) + + define_log_steps() + + if benchmark_log_dir: + flags.DEFINE_string( + name="benchmark_log_dir", + short_name="bld", + default=None, + help=help_wrap("The location of the benchmark logging.")) + + if bigquery_uploader: + flags.DEFINE_string( + name="gcp_project", + short_name="gp", + default=None, + help=help_wrap( + "The GCP project name where the benchmark will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_data_set", + short_name="bds", + default="test_benchmark", + help=help_wrap( + "The Bigquery dataset name where the benchmark will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_run_table", + short_name="brt", + default="benchmark_run", + help=help_wrap("The Bigquery table name where the benchmark run " + "information will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_run_status_table", + short_name="brst", + default="benchmark_run_status", + help=help_wrap("The Bigquery table name where the benchmark run " + "status information will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_metric_table", + short_name="bmt", + default="benchmark_metric", + help=help_wrap("The Bigquery table name where the benchmark metric " + "information will be uploaded.")) + + @flags.multi_flags_validator( + ["benchmark_logger_type", "benchmark_log_dir"], + message="--benchmark_logger_type=BenchmarkFileLogger will require " + "--benchmark_log_dir being set") + def _check_benchmark_log_dir(flags_dict): + benchmark_logger_type = flags_dict["benchmark_logger_type"] + if benchmark_logger_type == "BenchmarkFileLogger": + return flags_dict["benchmark_log_dir"] + return True + + return key_flags diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_conventions.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_conventions.py new file mode 100644 index 000000000..a42ff42a2 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_conventions.py @@ -0,0 +1,50 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Central location for shared argparse convention definitions.""" + +import sys +import codecs +import functools + +from absl import app as absl_app +from absl import flags + +# This codifies help string conventions and makes it easy to update them if +# necessary. Currently the only major effect is that help bodies start on the +# line after flags are listed. All flag definitions should wrap the text bodies +# with help wrap when calling DEFINE_*. +_help_wrap = functools.partial( + flags.text_wrap, length=80, indent="", firstline_indent="\n") + + +# Pretty formatting causes issues when utf-8 is not installed on a system. 
+def _stdout_utf8(): + try: + codecs.lookup("utf-8") + except LookupError: + return False + return getattr(sys.stdout, "encoding", "") == "UTF-8" + + +if _stdout_utf8(): + help_wrap = _help_wrap +else: + + def help_wrap(text, *args, **kwargs): + return _help_wrap(text, *args, **kwargs).replace(u"\ufeff", u"") + + +# Replace None with h to also allow -h +absl_app.HelpshortFlag.SHORT_NAME = "h" diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_device.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_device.py new file mode 100644 index 000000000..1c9a3ad7d --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_device.py @@ -0,0 +1,90 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags for managing compute devices. Currently only contains TPU flags.""" + +from absl import flags +from absl import logging + +from utils.flags._conventions import help_wrap + + +def require_cloud_storage(flag_names): + """Register a validator to check directory flags. + + Args: + flag_names: An iterable of strings containing the names of flags to be + checked. + """ + msg = "TPU requires GCS path for {}".format(", ".join(flag_names)) + + @flags.multi_flags_validator(["tpu"] + flag_names, message=msg) + def _path_check(flag_values): # pylint: disable=missing-docstring + if flag_values["tpu"] is None: + return True + + valid_flags = True + for key in flag_names: + if not flag_values[key].startswith("gs://"): + logging.error("%s must be a GCS path.", key) + valid_flags = False + + return valid_flags + + +def define_device(tpu=True): + """Register device specific flags. + + Args: + tpu: Create flags to specify TPU operation. + + Returns: + A list of flags for core.py to marks as key flags. + """ + + key_flags = [] + + if tpu: + flags.DEFINE_string( + name="tpu", + default=None, + help=help_wrap( + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a " + "grpc://ip.address.of.tpu:8470 url. Passing `local` will use the" + "CPU of the local instance instead. (Good for debugging.)")) + key_flags.append("tpu") + + flags.DEFINE_string( + name="tpu_zone", + default=None, + help=help_wrap( + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE " + "project from metadata.")) + + flags.DEFINE_string( + name="tpu_gcp_project", + default=None, + help=help_wrap( + "[Optional] Project name for the Cloud TPU-enabled project. 
If not " + "specified, we will attempt to automatically detect the GCE " + "project from metadata.")) + + flags.DEFINE_integer( + name="num_tpu_shards", + default=8, + help=help_wrap("Number of shards (TPU chips).")) + + return key_flags diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_distribution.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_distribution.py new file mode 100644 index 000000000..e7cfcdf69 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_distribution.py @@ -0,0 +1,52 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags related to distributed execution.""" + +from absl import flags +import tensorflow as tf + +from utils.flags._conventions import help_wrap + + +def define_distribution(worker_hosts=True, task_index=True): + """Register distributed execution flags. + + Args: + worker_hosts: Create a flag for specifying comma-separated list of workers. + task_index: Create a flag for specifying index of task. + + Returns: + A list of flags for core.py to marks as key flags. + """ + key_flags = [] + + if worker_hosts: + flags.DEFINE_string( + name='worker_hosts', + default=None, + help=help_wrap( + 'Comma-separated list of worker ip:port pairs for running ' + 'multi-worker models with DistributionStrategy. The user would ' + 'start the program on each host with identical value for this ' + 'flag.')) + + if task_index: + flags.DEFINE_integer( + name='task_index', + default=-1, + help=help_wrap('If multi-worker training, the task_index of this ' + 'worker.')) + + return key_flags diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_misc.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_misc.py new file mode 100644 index 000000000..eb248fe47 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_misc.py @@ -0,0 +1,48 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Misc flags.""" + +from absl import flags + +from utils.flags._conventions import help_wrap + + +def define_image(data_format=True): + """Register image specific flags. + + Args: + data_format: Create a flag to specify image axis convention. + + Returns: + A list of flags for core.py to marks as key flags. 
+ """ + + key_flags = [] + + if data_format: + flags.DEFINE_enum( + name="data_format", + short_name="df", + default=None, + enum_values=["channels_first", "channels_last"], + help=help_wrap( + "A flag to override the data format used in the model. " + "channels_first provides a performance boost on GPU but is not " + "always compatible with CPU. If left unspecified, the data format " + "will be chosen automatically based on whether TensorFlow was " + "built for CPU or GPU.")) + key_flags.append("data_format") + + return key_flags diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/_performance.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/_performance.py new file mode 100644 index 000000000..56ccbe460 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/_performance.py @@ -0,0 +1,294 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Register flags for optimizing performance.""" + +import multiprocessing + +from absl import flags # pylint: disable=g-bad-import-order +import tensorflow as tf # pylint: disable=g-bad-import-order + +from utils.flags._conventions import help_wrap + +# Map string to TensorFlow dtype +DTYPE_MAP = { + "fp16": tf.float16, + "bf16": tf.bfloat16, + "fp32": tf.float32, +} + + +def get_tf_dtype(flags_obj): + if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite": + # If the graph_rewrite is used, we build the graph with fp32, and let the + # graph rewrite change ops to fp16. + return tf.float32 + return DTYPE_MAP[flags_obj.dtype] + + +def get_loss_scale(flags_obj, default_for_fp16): + dtype = get_tf_dtype(flags_obj) + if flags_obj.loss_scale == "dynamic": + return flags_obj.loss_scale + elif flags_obj.loss_scale is not None: + return float(flags_obj.loss_scale) + elif dtype == tf.float32 or dtype == tf.bfloat16: + return 1 # No loss scaling is needed for fp32 + else: + assert dtype == tf.float16 + return default_for_fp16 + + +def define_performance(num_parallel_calls=False, + inter_op=False, + intra_op=False, + synthetic_data=False, + max_train_steps=False, + dtype=False, + all_reduce_alg=False, + num_packs=False, + tf_gpu_thread_mode=False, + datasets_num_private_threads=False, + datasets_num_parallel_batches=False, + fp16_implementation=False, + loss_scale=False, + tf_data_experimental_slack=False, + enable_xla=False, + training_dataset_cache=False): + """Register flags for specifying performance tuning arguments. + + Args: + num_parallel_calls: Create a flag to specify parallelism of data loading. + inter_op: Create a flag to allow specification of inter op threads. + intra_op: Create a flag to allow specification of intra op threads. + synthetic_data: Create a flag to allow the use of synthetic data. + max_train_steps: Create a flags to allow specification of maximum number of + training steps + dtype: Create flags for specifying dtype. + all_reduce_alg: If set forces a specific algorithm for multi-gpu. 
+ num_packs: If set provides number of packs for MirroredStrategy's cross + device ops. + tf_gpu_thread_mode: gpu_private triggers us of private thread pool. + datasets_num_private_threads: Number of private threads for datasets. + datasets_num_parallel_batches: Determines how many batches to process in + parallel when using map and batch from tf.data. + fp16_implementation: Create fp16_implementation flag. + loss_scale: Controls the loss scaling, normally for mixed-precision + training. Can only be turned on if dtype is also True. + tf_data_experimental_slack: Determines whether to enable tf.data's + `experimental_slack` option. + enable_xla: Determines if XLA (auto clustering) is turned on. + training_dataset_cache: Whether to cache the training dataset on workers. + Typically used to improve training performance when training data is in + remote storage and can fit into worker memory. + + Returns: + A list of flags for core.py to marks as key flags. + """ + + key_flags = [] + if num_parallel_calls: + flags.DEFINE_integer( + name="num_parallel_calls", + short_name="npc", + default=multiprocessing.cpu_count(), + help=help_wrap("The number of records that are processed in parallel " + "during input processing. This can be optimized per " + "data set but for generally homogeneous data sets, " + "should be approximately the number of available CPU " + "cores. (default behavior)")) + + if inter_op: + flags.DEFINE_integer( + name="inter_op_parallelism_threads", + short_name="inter", + default=0, + help=help_wrap("Number of inter_op_parallelism_threads to use for CPU. " + "See TensorFlow config.proto for details.")) + + if intra_op: + flags.DEFINE_integer( + name="intra_op_parallelism_threads", + short_name="intra", + default=0, + help=help_wrap("Number of intra_op_parallelism_threads to use for CPU. " + "See TensorFlow config.proto for details.")) + + if synthetic_data: + flags.DEFINE_bool( + name="use_synthetic_data", + short_name="synth", + default=False, + help=help_wrap( + "If set, use fake data (zeroes) instead of a real dataset. " + "This mode is useful for performance debugging, as it removes " + "input processing steps, but will not learn anything.")) + + if max_train_steps: + flags.DEFINE_integer( + name="max_train_steps", + short_name="mts", + default=None, + help=help_wrap( + "The model will stop training if the global_step reaches this " + "value. If not set, training will run until the specified number " + "of epochs have run as usual. It is generally recommended to set " + "--train_epochs=1 when using this flag.")) + + if dtype: + flags.DEFINE_enum( + name="dtype", + short_name="dt", + default="fp32", + enum_values=DTYPE_MAP.keys(), + help=help_wrap("The TensorFlow datatype used for calculations. " + "For 16-bit dtypes, variables and certain ops will " + "still be float32 for numeric stability.")) + + if loss_scale: + flags.DEFINE_string( + name="loss_scale", + short_name="ls", + default=None, + help=help_wrap( + "The amount to scale the loss by when --dtype=fp16. This can be " + "an int/float or the string 'dynamic'. Before gradients are " + "computed, the loss is multiplied by the loss scale, making all " + "gradients loss_scale times larger. To adjust for this, " + "gradients are divided by the loss scale before being applied to " + "variables. This is mathematically equivalent to training " + "without a loss scale, but the loss scale helps avoid some " + "intermediate gradients from underflowing to zero. 
The default " + "is 'dynamic', which dynamic determines the optimal loss scale " + "during training.")) + + # pylint: disable=unused-variable + @flags.validator( + flag_name="loss_scale", + message="loss_scale should be a positive int/float or the string " + "'dynamic'.") + def _check_loss_scale(loss_scale): + """Validator to check the loss scale flag is valid.""" + if loss_scale is None: + return True # null case is handled in get_loss_scale() + + if loss_scale == "dynamic": + return True + + try: + loss_scale = float(loss_scale) + except ValueError: + return False + + return loss_scale > 0 + # pylint: enable=unused-variable + + if fp16_implementation: + flags.DEFINE_enum( + name="fp16_implementation", + default="keras", + enum_values=("keras', 'graph_rewrite"), + help=help_wrap( + "When --dtype=fp16, how fp16 should be implemented. This has no " + "impact on correctness. 'keras' uses the " + "tf.keras.mixed_precision API. 'graph_rewrite' uses the " + "tf.compat.v1.mixed_precision." + "enable_mixed_precision_graph_rewrite API.")) + + @flags.multi_flags_validator( + ["fp16_implementation", "dtype", "loss_scale"]) + def _check_fp16_implementation(flags_dict): + """Validator to check fp16_implementation flag is valid.""" + if (flags_dict["fp16_implementation"] == "graph_rewrite" and + flags_dict["dtype"] != "fp16"): + raise flags.ValidationError("--fp16_implementation should not be " + "specified unless --dtype=fp16") + return True + + if all_reduce_alg: + flags.DEFINE_string( + name="all_reduce_alg", + short_name="ara", + default=None, + help=help_wrap("Defines the algorithm to use for performing all-reduce." + "When specified with MirroredStrategy for single " + "worker, this controls " + "tf.contrib.distribute.AllReduceCrossTowerOps. When " + "specified with MultiWorkerMirroredStrategy, this " + "controls " + "tf.distribute.experimental.CollectiveCommunication; " + "valid options are `ring` and `nccl`.")) + + if num_packs: + flags.DEFINE_integer( + name="num_packs", + default=1, + help=help_wrap("Sets `num_packs` in the cross device ops used in " + "MirroredStrategy. For details, see " + "tf.distribute.NcclAllReduce.")) + + if tf_gpu_thread_mode: + flags.DEFINE_string( + name="tf_gpu_thread_mode", + short_name="gt_mode", + default=None, + help=help_wrap( + "Whether and how the GPU device uses its own threadpool.")) + + flags.DEFINE_integer( + name="per_gpu_thread_count", + short_name="pgtc", + default=0, + help=help_wrap("The number of threads to use for GPU. Only valid when " + "tf_gpu_thread_mode is not global.")) + + if datasets_num_private_threads: + flags.DEFINE_integer( + name="datasets_num_private_threads", + default=None, + help=help_wrap( + "Number of threads for a private threadpool created for all" + "datasets computation..")) + + if datasets_num_parallel_batches: + flags.DEFINE_integer( + name="datasets_num_parallel_batches", + default=None, + help=help_wrap( + "Determines how many batches to process in parallel when using " + "map and batch from tf.data.")) + + if training_dataset_cache: + flags.DEFINE_boolean( + name="training_dataset_cache", + default=False, + help=help_wrap( + "Determines whether to cache the training dataset on workers. 
" + "Typically used to improve training performance when training " + "data is in remote storage and can fit into worker memory.")) + + if tf_data_experimental_slack: + flags.DEFINE_boolean( + name="tf_data_experimental_slack", + default=False, + help=help_wrap( + "Whether to enable tf.data's `experimental_slack` option.")) + + if enable_xla: + flags.DEFINE_boolean( + name="enable_xla", + default=False, + help="Whether to enable XLA auto jit compilation") + + return key_flags diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/core.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/core.py new file mode 100644 index 000000000..3d894f9cb --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/core.py @@ -0,0 +1,130 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Public interface for flag definition. + +See _example.py for detailed instructions on defining flags. +""" + +import sys + +from six.moves import shlex_quote + +from absl import app as absl_app +from absl import flags + +from utils.flags import _base +from utils.flags import _benchmark +from utils.flags import _conventions +from utils.flags import _device +from utils.flags import _distribution +from utils.flags import _misc +from utils.flags import _performance + + +def set_defaults(**kwargs): + for key, value in kwargs.items(): + flags.FLAGS.set_default(name=key, value=value) + + +def parse_flags(argv=None): + """Reset flags and reparse. Currently only used in testing.""" + flags.FLAGS.unparse_flags() + absl_app.parse_flags_with_usage(argv or sys.argv) + + +def register_key_flags_in_core(f): + """Defines a function in core.py, and registers its key flags. + + absl uses the location of a flags.declare_key_flag() to determine the context + in which a flag is key. By making all declares in core, this allows model + main functions to call flags.adopt_module_key_flags() on core and correctly + chain key flags. + + Args: + f: The function to be wrapped + + Returns: + The "core-defined" version of the input function. + """ + + def core_fn(*args, **kwargs): + key_flags = f(*args, **kwargs) + [flags.declare_key_flag(fl) for fl in key_flags] # pylint: disable=expression-not-assigned + + return core_fn + + +define_base = register_key_flags_in_core(_base.define_base) +# We have define_base_eager for compatibility, since it used to be a separate +# function from define_base. 
+define_base_eager = define_base +define_log_steps = register_key_flags_in_core(_benchmark.define_log_steps) +define_benchmark = register_key_flags_in_core(_benchmark.define_benchmark) +define_device = register_key_flags_in_core(_device.define_device) +define_image = register_key_flags_in_core(_misc.define_image) +define_performance = register_key_flags_in_core(_performance.define_performance) +define_distribution = register_key_flags_in_core( + _distribution.define_distribution) + +help_wrap = _conventions.help_wrap + +get_num_gpus = _base.get_num_gpus +get_tf_dtype = _performance.get_tf_dtype +get_loss_scale = _performance.get_loss_scale +DTYPE_MAP = _performance.DTYPE_MAP +require_cloud_storage = _device.require_cloud_storage + + +def _get_nondefault_flags_as_dict(): + """Returns the nondefault flags as a dict from flag name to value.""" + nondefault_flags = {} + for flag_name in flags.FLAGS: + flag_value = getattr(flags.FLAGS, flag_name) + if (flag_name != flags.FLAGS[flag_name].short_name and + flag_value != flags.FLAGS[flag_name].default): + nondefault_flags[flag_name] = flag_value + return nondefault_flags + + +def get_nondefault_flags_as_str(): + """Returns flags as a string that can be passed as command line arguments. + + E.g., returns: "--batch_size=256 --use_synthetic_data" for the following code + block: + + ``` + flags.FLAGS.batch_size = 256 + flags.FLAGS.use_synthetic_data = True + print(get_nondefault_flags_as_str()) + ``` + + Only flags with nondefault values are returned, as passing default flags as + command line arguments has no effect. + + Returns: + A string with the flags, that can be passed as command line arguments to a + program to use the flags. + """ + nondefault_flags = _get_nondefault_flags_as_dict() + flag_strings = [] + for name, value in sorted(nondefault_flags.items()): + if isinstance(value, bool): + flag_str = '--{}'.format(name) if value else '--no{}'.format(name) + elif isinstance(value, list): + flag_str = '--{}={}'.format(name, ','.join(value)) + else: + flag_str = '--{}={}'.format(name, value) + flag_strings.append(flag_str) + return ' '.join(shlex_quote(flag_str) for flag_str in flag_strings) diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/flags_test.py b/cv/classification/resnet50/tensorflow2.0/utils/flags/flags_test.py new file mode 100644 index 000000000..a9df4c99f --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/flags_test.py @@ -0,0 +1,162 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
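A minimal usage sketch of the `core.py` helpers added above, before the unit tests that exercise them. This is illustrative only: the flag selection mirrors `define_flags()` in `flags_test.py` below, the `run_model` name is invented, and the `utils.*` import path assumes the repo root is on `PYTHONPATH` (as the CI scripts later in this patch arrange).

```python
# Hedged sketch: defining and reading the flags declared via utils/flags/core.py
# from a model entry point. Keyword arguments mirror flags_test.py.
from absl import app as absl_app
from absl import flags

from utils.flags import core as flags_core


def define_flags():
  flags_core.define_base(clean=True, num_gpu=False, stop_threshold=True,
                         hooks=True, train_epochs=True,
                         epochs_between_evals=True)
  flags_core.define_performance(num_parallel_calls=True, inter_op=True,
                                intra_op=True, loss_scale=True,
                                synthetic_data=True, dtype=True)
  flags_core.define_image()
  # Key flags were declared in core.py, so adopting its module key flags
  # surfaces them in --help (see register_key_flags_in_core above).
  flags.adopt_module_key_flags(flags_core)


def run_model(flags_obj):
  dtype = flags_core.get_tf_dtype(flags_obj)  # e.g. tf.float16 for --dtype=fp16
  loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
  print("dtype:", dtype, "loss_scale:", loss_scale)
  # Handy for logging a reproducible command line:
  print("non-default flags:", flags_core.get_nondefault_flags_as_str())


def main(_):
  run_model(flags.FLAGS)


if __name__ == "__main__":
  define_flags()
  absl_app.run(main)
```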
+ +import unittest + +from absl import flags +import tensorflow as tf + +from utils.flags import core as flags_core # pylint: disable=g-bad-import-order + + +def define_flags(): + flags_core.define_base( + clean=True, + num_gpu=False, + stop_threshold=True, + hooks=True, + train_epochs=True, + epochs_between_evals=True) + flags_core.define_performance( + num_parallel_calls=True, + inter_op=True, + intra_op=True, + loss_scale=True, + synthetic_data=True, + dtype=True) + flags_core.define_image() + flags_core.define_benchmark() + + +class BaseTester(unittest.TestCase): + + @classmethod + def setUpClass(cls): + super(BaseTester, cls).setUpClass() + define_flags() + + def test_default_setting(self): + """Test to ensure fields exist and defaults can be set.""" + + defaults = dict( + data_dir="dfgasf", + model_dir="dfsdkjgbs", + train_epochs=534, + epochs_between_evals=15, + batch_size=256, + hooks=["LoggingTensorHook"], + num_parallel_calls=18, + inter_op_parallelism_threads=5, + intra_op_parallelism_threads=10, + data_format="channels_first") + + flags_core.set_defaults(**defaults) + flags_core.parse_flags() + + for key, value in defaults.items(): + assert flags.FLAGS.get_flag_value(name=key, default=None) == value + + def test_benchmark_setting(self): + defaults = dict( + hooks=["LoggingMetricHook"], + benchmark_log_dir="/tmp/12345", + gcp_project="project_abc", + ) + + flags_core.set_defaults(**defaults) + flags_core.parse_flags() + + for key, value in defaults.items(): + assert flags.FLAGS.get_flag_value(name=key, default=None) == value + + def test_booleans(self): + """Test to ensure boolean flags trigger as expected.""" + + flags_core.parse_flags([__file__, "--use_synthetic_data"]) + + assert flags.FLAGS.use_synthetic_data + + def test_parse_dtype_info(self): + flags_core.parse_flags([__file__, "--dtype", "fp16"]) + self.assertEqual(flags_core.get_tf_dtype(flags.FLAGS), tf.float16) + self.assertEqual( + flags_core.get_loss_scale(flags.FLAGS, default_for_fp16=2), 2) + + flags_core.parse_flags([__file__, "--dtype", "fp16", "--loss_scale", "5"]) + self.assertEqual( + flags_core.get_loss_scale(flags.FLAGS, default_for_fp16=2), 5) + + flags_core.parse_flags( + [__file__, "--dtype", "fp16", "--loss_scale", "dynamic"]) + self.assertEqual( + flags_core.get_loss_scale(flags.FLAGS, default_for_fp16=2), "dynamic") + + flags_core.parse_flags([__file__, "--dtype", "fp32"]) + self.assertEqual(flags_core.get_tf_dtype(flags.FLAGS), tf.float32) + self.assertEqual( + flags_core.get_loss_scale(flags.FLAGS, default_for_fp16=2), 1) + + flags_core.parse_flags([__file__, "--dtype", "fp32", "--loss_scale", "5"]) + self.assertEqual( + flags_core.get_loss_scale(flags.FLAGS, default_for_fp16=2), 5) + + with self.assertRaises(SystemExit): + flags_core.parse_flags([__file__, "--dtype", "int8"]) + + with self.assertRaises(SystemExit): + flags_core.parse_flags( + [__file__, "--dtype", "fp16", "--loss_scale", "abc"]) + + def test_get_nondefault_flags_as_str(self): + defaults = dict( + clean=True, + data_dir="abc", + hooks=["LoggingTensorHook"], + stop_threshold=1.5, + use_synthetic_data=False) + flags_core.set_defaults(**defaults) + flags_core.parse_flags() + + expected_flags = "" + self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + flags.FLAGS.clean = False + expected_flags += "--noclean" + self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + flags.FLAGS.data_dir = "xyz" + expected_flags += " --data_dir=xyz" + 
self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + flags.FLAGS.hooks = ["aaa", "bbb", "ccc"] + expected_flags += " --hooks=aaa,bbb,ccc" + self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + flags.FLAGS.stop_threshold = 3. + expected_flags += " --stop_threshold=3.0" + self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + flags.FLAGS.use_synthetic_data = True + expected_flags += " --use_synthetic_data" + self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + # Assert that explicit setting a flag to its default value does not cause it + # to appear in the string + flags.FLAGS.use_synthetic_data = False + expected_flags = expected_flags[:-len(" --use_synthetic_data")] + self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags) + + +if __name__ == "__main__": + unittest.main() diff --git a/cv/classification/resnet50/tensorflow2.0/utils/flags/guidelines.md b/cv/classification/resnet50/tensorflow2.0/utils/flags/guidelines.md new file mode 100644 index 000000000..db963aabe --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/flags/guidelines.md @@ -0,0 +1,65 @@ +# Using flags in official models + +1. **All common flags must be incorporated in the models.** + + Common flags (i.e. batch_size, model_dir, etc.) are provided by various flag definition functions, + and channeled through `official.utils.flags.core`. For instance to define common supervised + learning parameters one could use the following code: + + ```$xslt + from absl import app as absl_app + from absl import flags + + from official.utils.flags import core as flags_core + + + def define_flags(): + flags_core.define_base() + flags.adopt_key_flags(flags_core) + + + def main(_): + flags_obj = flags.FLAGS + print(flags_obj) + + + if __name__ == "__main__" + absl_app.run(main) + ``` +2. **Validate flag values.** + + See the [Validators](#validators) section for implementation details. + + Validators in the official model repo should not access the file system, such as verifying + that files exist, due to the strict ordering requirements. + +3. **Flag values should not be mutated.** + + Instead of mutating flag values, use getter functions to return the desired values. An example + getter function is `get_tf_dtype` function below: + + ``` + # Map string to TensorFlow dtype + DTYPE_MAP = { + "fp16": tf.float16, + "fp32": tf.float32, + } + + def get_tf_dtype(flags_obj): + if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite": + # If the graph_rewrite is used, we build the graph with fp32, and let the + # graph rewrite change ops to fp16. + return tf.float32 + return DTYPE_MAP[flags_obj.dtype] + + + def main(_): + flags_obj = flags.FLAGS() + + # Do not mutate flags_obj + # if flags_obj.fp16_implementation == "graph_rewrite": + # flags_obj.dtype = "float32" # Don't do this + + print(get_tf_dtype(flags_obj)) + ... + ``` \ No newline at end of file diff --git a/cv/classification/resnet50/tensorflow2.0/utils/hyperparams_flags.py b/cv/classification/resnet50/tensorflow2.0/utils/hyperparams_flags.py new file mode 100644 index 000000000..08799f7d2 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/hyperparams_flags.py @@ -0,0 +1,123 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
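The flag-validation guideline above points to absl validators without showing one. A minimal, value-only validator in that style might look like the following; the `batch_size` flag name is purely illustrative, and the pattern matches the `@flags.validator` usage for `loss_scale` earlier in this patch.

```python
# Sketch of a value-only flag validator: it inspects the parsed value and
# nothing else (no file-system access, per the guideline above).
from absl import flags

flags.DEFINE_integer("batch_size", 32, "Per-device batch size (illustrative).")


@flags.validator("batch_size",
                 message="batch_size must be a positive integer.")
def _check_batch_size(batch_size):
  return batch_size > 0
```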
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common flags for importing hyperparameters.""" + +from absl import flags +from utils.flags import core as flags_core + +FLAGS = flags.FLAGS + + +def define_gin_flags(): + """Define common gin configurable flags.""" + flags.DEFINE_multi_string('gin_file', None, + 'List of paths to the config files.') + flags.DEFINE_multi_string( + 'gin_param', None, 'Newline separated list of Gin parameter bindings.') + + +def define_common_hparams_flags(): + """Define the common flags across models.""" + + flags.DEFINE_string( + 'model_dir', + default=None, + help=('The directory where the model and training/evaluation summaries' + 'are stored.')) + + flags.DEFINE_integer( + 'train_batch_size', default=None, help='Batch size for training.') + + flags.DEFINE_integer( + 'eval_batch_size', default=None, help='Batch size for evaluation.') + + flags.DEFINE_string( + 'precision', + default=None, + help=('Precision to use; one of: {bfloat16, float32}')) + + flags.DEFINE_string( + 'config_file', + default=None, + help=('A YAML file which specifies overrides. Note that this file can be ' + 'used as an override template to override the default parameters ' + 'specified in Python. If the same parameter is specified in both ' + '`--config_file` and `--params_override`, the one in ' + '`--params_override` will be used finally.')) + + flags.DEFINE_string( + 'params_override', + default=None, + help=('a YAML/JSON string or a YAML file which specifies additional ' + 'overrides over the default parameters and those specified in ' + '`--config_file`. Note that this is supposed to be used only to ' + 'override the model parameters, but not the parameters like TPU ' + 'specific flags. One canonical use case of `--config_file` and ' + '`--params_override` is users first define a template config file ' + 'using `--config_file`, then use `--params_override` to adjust the ' + 'minimal set of tuning parameters, for example setting up different' + ' `train_batch_size`. ' + 'The final override order of parameters: default_model_params --> ' + 'params from config_file --> params in params_override.' + 'See also the help message of `--config_file`.')) + flags.DEFINE_integer('save_checkpoint_freq', None, + 'Number of steps to save checkpoint.') + + +def initialize_common_flags(): + """Define the common flags across models.""" + define_common_hparams_flags() + + flags_core.define_device(tpu=True) + flags_core.define_base( + num_gpu=True, model_dir=False, data_dir=False, batch_size=False) + flags_core.define_distribution(worker_hosts=True, task_index=True) + flags_core.define_performance(all_reduce_alg=True, num_packs=True) + + # Reset the default value of num_gpus to zero. + FLAGS.num_gpus = 0 + + flags.DEFINE_string( + 'strategy_type', 'mirrored', 'Type of distribute strategy.' + 'One of mirrored, tpu and multiworker.') + + +def strategy_flags_dict(): + """Returns TPU and/or GPU related flags in a dictionary.""" + return { + 'distribution_strategy': FLAGS.strategy_type, + # TPUStrategy related flags. + 'tpu': FLAGS.tpu, + # MultiWorkerMirroredStrategy related flags. 
+ 'all_reduce_alg': FLAGS.all_reduce_alg, + 'worker_hosts': FLAGS.worker_hosts, + 'task_index': FLAGS.task_index, + # MirroredStrategy and OneDeviceStrategy + 'num_gpus': FLAGS.num_gpus, + 'num_packs': FLAGS.num_packs, + } + + +def hparam_flags_dict(): + """Returns model params related flags in a dictionary.""" + return { + 'data_dir': FLAGS.data_dir, + 'model_dir': FLAGS.model_dir, + 'train_batch_size': FLAGS.train_batch_size, + 'eval_batch_size': FLAGS.eval_batch_size, + 'precision': FLAGS.precision, + 'config_file': FLAGS.config_file, + 'params_override': FLAGS.params_override, + } diff --git a/cv/classification/resnet50/tensorflow2.0/utils/misc/__init__.py b/cv/classification/resnet50/tensorflow2.0/utils/misc/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/misc/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/cv/classification/resnet50/tensorflow2.0/utils/misc/distribution_utils.py b/cv/classification/resnet50/tensorflow2.0/utils/misc/distribution_utils.py new file mode 100644 index 000000000..3065c0e54 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/misc/distribution_utils.py @@ -0,0 +1,17 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper functions for running models in a distributed setting.""" +# pylint: disable=wildcard-import +from common.distribute_utils import * diff --git a/cv/classification/resnet50/tensorflow2.0/utils/misc/keras_utils.py b/cv/classification/resnet50/tensorflow2.0/utils/misc/keras_utils.py new file mode 100644 index 000000000..a5b20c8a3 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/misc/keras_utils.py @@ -0,0 +1,211 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
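As a rough sketch of how `initialize_common_flags()` and `strategy_flags_dict()` are meant to be consumed: the dictionary keys line up with the arguments of a `get_distribution_strategy` helper, which is assumed here to be re-exported by `utils.misc.distribution_utils` through its wildcard import of `common.distribute_utils` (that helper is not defined in this patch, so treat the call below as an assumption).

```python
# Hedged sketch: turn the strategy-related flags into a tf.distribute strategy.
from absl import app
from absl import flags

from utils import hyperparams_flags
from utils.misc import distribution_utils  # re-exports common.distribute_utils

FLAGS = flags.FLAGS


def main(_):
  strategy_args = hyperparams_flags.strategy_flags_dict()
  # Assumed helper from common.distribute_utils; only a subset of the
  # dictionary is forwarded here for illustration.
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=strategy_args["distribution_strategy"],
      num_gpus=strategy_args["num_gpus"],
      all_reduce_alg=strategy_args["all_reduce_alg"],
      num_packs=strategy_args["num_packs"])
  with strategy.scope():
    pass  # build and compile the model under the strategy scope


if __name__ == "__main__":
  hyperparams_flags.initialize_common_flags()
  app.run(main)
```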
+ +"""Helper functions for the Keras implementations of models.""" + +import multiprocessing +import os +import time + +from absl import logging +import tensorflow as tf + +from tensorflow.python.eager import monitoring + +global_batch_size_gauge = monitoring.IntGauge( + '/tensorflow/training/global_batch_size', 'TF training global batch size') + +first_batch_time_gauge = monitoring.IntGauge( + '/tensorflow/training/first_batch', + 'TF training start/end time for first batch (unix epoch time in us.', + 'type') + +first_batch_start_time = first_batch_time_gauge.get_cell('start') +first_batch_end_time = first_batch_time_gauge.get_cell('end') + + +class BatchTimestamp(object): + """A structure to store batch time stamp.""" + + def __init__(self, batch_index, timestamp): + self.batch_index = batch_index + self.timestamp = timestamp + + def __repr__(self): + return "'BatchTimestamp'".format( + self.batch_index, self.timestamp) + + +class TimeHistory(tf.keras.callbacks.Callback): + """Callback for Keras models.""" + + def __init__(self, batch_size, log_steps, initial_step=0, logdir=None): + """Callback for logging performance. + + Args: + batch_size: Total batch size. + log_steps: Interval of steps between logging of batch level stats. + initial_step: Optional, initial step. + logdir: Optional directory to write TensorBoard summaries. + """ + # TODO(wcromar): remove this parameter and rely on `logs` parameter of + # on_train_batch_end() + self.batch_size = batch_size + super(TimeHistory, self).__init__() + self.log_steps = log_steps + self.last_log_step = initial_step + self.steps_before_epoch = initial_step + self.steps_in_epoch = 0 + self.start_time = None + + global_batch_size_gauge.get_cell().set(batch_size) + + if logdir: + self.summary_writer = tf.summary.create_file_writer(logdir) + else: + self.summary_writer = None + + # Logs start of step 1 then end of each step based on log_steps interval. + self.timestamp_log = [] + + # Records the time each epoch takes to run from start to finish of epoch. + self.epoch_runtime_log = [] + + @property + def global_steps(self): + """The current 1-indexed global step.""" + return self.steps_before_epoch + self.steps_in_epoch + + @property + def average_steps_per_second(self): + """The average training steps per second across all epochs.""" + return self.global_steps / sum(self.epoch_runtime_log) + + @property + def average_examples_per_second(self): + """The average number of training examples per second across all epochs.""" + return self.average_steps_per_second * self.batch_size + + def get_examples_per_sec(self, warmup=1): + """Calculates examples/sec through timestamp_log and skip warmup period.""" + # First entry in timestamp_log is the start of the step 1. The rest of the + # entries are the end of each step recorded. 
+ time_log = self.timestamp_log + seconds = time_log[-1].timestamp - time_log[warmup].timestamp + steps = time_log[-1].batch_index - time_log[warmup].batch_index + return self.batch_size * steps / seconds + + def get_startup_time(self, start_time_sec): + return self.timestamp_log[0].timestamp - start_time_sec + + def on_train_end(self, logs=None): + self.train_finish_time = time.time() + + if self.summary_writer: + self.summary_writer.flush() + + def on_epoch_begin(self, epoch, logs=None): + self.epoch_start = time.time() + + def on_batch_begin(self, batch, logs=None): + if not self.start_time: + self.start_time = time.time() + if not first_batch_start_time.value(): + first_batch_start_time.set(int(self.start_time * 1000000)) + + # Record the timestamp of the first global step + if not self.timestamp_log: + self.timestamp_log.append( + BatchTimestamp(self.global_steps, self.start_time)) + + def on_batch_end(self, batch, logs=None): + """Records elapse time of the batch and calculates examples per second.""" + if not first_batch_end_time.value(): + first_batch_end_time.set(int(time.time() * 1000000)) + self.steps_in_epoch = batch + 1 + steps_since_last_log = self.global_steps - self.last_log_step + if steps_since_last_log >= self.log_steps: + now = time.time() + elapsed_time = now - self.start_time + steps_per_second = steps_since_last_log / elapsed_time + examples_per_second = steps_per_second * self.batch_size + + self.timestamp_log.append(BatchTimestamp(self.global_steps, now)) + logging.info( + 'TimeHistory: %.2f seconds, %.2f examples/second between steps %d ' + 'and %d', elapsed_time, examples_per_second, self.last_log_step, + self.global_steps) + + if self.summary_writer: + with self.summary_writer.as_default(): + tf.summary.scalar('steps_per_second', steps_per_second, + self.global_steps) + tf.summary.scalar('examples_per_second', examples_per_second, + self.global_steps) + + self.last_log_step = self.global_steps + self.start_time = None + + def on_epoch_end(self, epoch, logs=None): + epoch_run_time = time.time() - self.epoch_start + self.epoch_runtime_log.append(epoch_run_time) + + self.steps_before_epoch += self.steps_in_epoch + self.steps_in_epoch = 0 + + +class SimpleCheckpoint(tf.keras.callbacks.Callback): + """Keras callback to save tf.train.Checkpoints.""" + + def __init__(self, checkpoint_manager): + super(SimpleCheckpoint, self).__init__() + self.checkpoint_manager = checkpoint_manager + + def on_epoch_end(self, epoch, logs=None): + step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access + self.checkpoint_manager.save(checkpoint_number=step_counter) + + +def set_session_config(enable_xla=False): + """Sets the session config.""" + if enable_xla: + tf.config.optimizer.set_jit(True) + + +# TODO(hongkuny): remove set_config_v2 globally. 
+set_config_v2 = set_session_config + + +def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, + num_gpus, per_gpu_thread_count): + """Set GPU thread mode and count, and adjust dataset threads count.""" + cpu_count = multiprocessing.cpu_count() + logging.info('Logical CPU cores: %s', cpu_count) + + # Allocate private thread pool for each GPU to schedule and launch kernels + per_gpu_thread_count = per_gpu_thread_count or 2 + os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) + logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) + + # Limit data preprocessing threadpool to CPU cores minus number of total GPU + # private threads and memory copy threads. + total_gpu_thread_count = per_gpu_thread_count * num_gpus + num_runtime_threads = num_gpus + if not datasets_num_private_threads: + datasets_num_private_threads = min( + cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) + logging.info('Set datasets_num_private_threads to %s', + datasets_num_private_threads) diff --git a/cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers.py b/cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers.py new file mode 100644 index 000000000..4c310588b --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers.py @@ -0,0 +1,94 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Miscellaneous functions that can be called by models.""" + +import numbers + +from absl import logging +import tensorflow as tf + +from tensorflow.python.util import nest +# pylint:disable=logging-format-interpolation + + +def past_stop_threshold(stop_threshold, eval_metric): + """Return a boolean representing whether a model should be stopped. + + Args: + stop_threshold: float, the threshold above which a model should stop + training. + eval_metric: float, the current value of the relevant metric to check. + + Returns: + True if training should stop, False otherwise. + + Raises: + ValueError: if either stop_threshold or eval_metric is not a number + """ + if stop_threshold is None: + return False + + if not isinstance(stop_threshold, numbers.Number): + raise ValueError("Threshold for checking stop conditions must be a number.") + if not isinstance(eval_metric, numbers.Number): + raise ValueError("Eval metric being checked against stop conditions " + "must be a number.") + + if eval_metric >= stop_threshold: + logging.info("Stop threshold of {} was passed with metric value {}.".format( + stop_threshold, eval_metric)) + return True + + return False + + +def generate_synthetic_data(input_shape, + input_value=0, + input_dtype=None, + label_shape=None, + label_value=0, + label_dtype=None): + """Create a repeating dataset with constant values. + + Args: + input_shape: a tf.TensorShape object or nested tf.TensorShapes. 
The shape of + the input data. + input_value: Value of each input element. + input_dtype: Input dtype. If None, will be inferred by the input value. + label_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of + the label data. + label_value: Value of each input element. + label_dtype: Input dtype. If None, will be inferred by the target value. + + Returns: + Dataset of tensors or tuples of tensors (if label_shape is set). + """ + # TODO(kathywu): Replace with SyntheticDataset once it is in contrib. + element = input_element = nest.map_structure( + lambda s: tf.constant(input_value, input_dtype, s), input_shape) + + if label_shape: + label_element = nest.map_structure( + lambda s: tf.constant(label_value, label_dtype, s), label_shape) + element = (input_element, label_element) + + return tf.data.Dataset.from_tensors(element).repeat() + + +def apply_clean(flags_obj): + if flags_obj.clean and tf.io.gfile.exists(flags_obj.model_dir): + logging.info("--clean flag set. Removing existing model dir:" + " {}".format(flags_obj.model_dir)) + tf.io.gfile.rmtree(flags_obj.model_dir) diff --git a/cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers_test.py b/cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers_test.py new file mode 100644 index 000000000..dd01c3431 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/misc/model_helpers_test.py @@ -0,0 +1,127 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Model Helper functions.""" + +import tensorflow as tf # pylint: disable=g-bad-import-order + +from official.utils.misc import model_helpers + + +class PastStopThresholdTest(tf.test.TestCase): + """Tests for past_stop_threshold.""" + + def setUp(self): + super(PastStopThresholdTest, self).setUp() + tf.compat.v1.disable_eager_execution() + + def test_past_stop_threshold(self): + """Tests for normal operating conditions.""" + self.assertTrue(model_helpers.past_stop_threshold(0.54, 1)) + self.assertTrue(model_helpers.past_stop_threshold(54, 100)) + self.assertFalse(model_helpers.past_stop_threshold(0.54, 0.1)) + self.assertFalse(model_helpers.past_stop_threshold(-0.54, -1.5)) + self.assertTrue(model_helpers.past_stop_threshold(-0.54, 0)) + self.assertTrue(model_helpers.past_stop_threshold(0, 0)) + self.assertTrue(model_helpers.past_stop_threshold(0.54, 0.54)) + + def test_past_stop_threshold_none_false(self): + """Tests that check None returns false.""" + self.assertFalse(model_helpers.past_stop_threshold(None, -1.5)) + self.assertFalse(model_helpers.past_stop_threshold(None, None)) + self.assertFalse(model_helpers.past_stop_threshold(None, 1.5)) + # Zero should be okay, though. 
+ self.assertTrue(model_helpers.past_stop_threshold(0, 1.5)) + + def test_past_stop_threshold_not_number(self): + """Tests for error conditions.""" + with self.assertRaises(ValueError): + model_helpers.past_stop_threshold('str', 1) + + with self.assertRaises(ValueError): + model_helpers.past_stop_threshold('str', tf.constant(5)) + + with self.assertRaises(ValueError): + model_helpers.past_stop_threshold('str', 'another') + + with self.assertRaises(ValueError): + model_helpers.past_stop_threshold(0, None) + + with self.assertRaises(ValueError): + model_helpers.past_stop_threshold(0.7, 'str') + + with self.assertRaises(ValueError): + model_helpers.past_stop_threshold(tf.constant(4), None) + + +class SyntheticDataTest(tf.test.TestCase): + """Tests for generate_synthetic_data.""" + + def test_generate_synethetic_data(self): + input_element, label_element = tf.compat.v1.data.make_one_shot_iterator( + model_helpers.generate_synthetic_data( + input_shape=tf.TensorShape([5]), + input_value=123, + input_dtype=tf.float32, + label_shape=tf.TensorShape([]), + label_value=456, + label_dtype=tf.int32)).get_next() + + with self.session() as sess: + for n in range(5): + inp, lab = sess.run((input_element, label_element)) + self.assertAllClose(inp, [123., 123., 123., 123., 123.]) + self.assertEquals(lab, 456) + + def test_generate_only_input_data(self): + d = model_helpers.generate_synthetic_data( + input_shape=tf.TensorShape([4]), + input_value=43.5, + input_dtype=tf.float32) + + element = tf.compat.v1.data.make_one_shot_iterator(d).get_next() + self.assertFalse(isinstance(element, tuple)) + + with self.session() as sess: + inp = sess.run(element) + self.assertAllClose(inp, [43.5, 43.5, 43.5, 43.5]) + + def test_generate_nested_data(self): + d = model_helpers.generate_synthetic_data( + input_shape={ + 'a': tf.TensorShape([2]), + 'b': { + 'c': tf.TensorShape([3]), + 'd': tf.TensorShape([]) + } + }, + input_value=1.1) + + element = tf.compat.v1.data.make_one_shot_iterator(d).get_next() + self.assertIn('a', element) + self.assertIn('b', element) + self.assertEquals(len(element['b']), 2) + self.assertIn('c', element['b']) + self.assertIn('d', element['b']) + self.assertNotIn('c', element) + + with self.session() as sess: + inp = sess.run(element) + self.assertAllClose(inp['a'], [1.1, 1.1]) + self.assertAllClose(inp['b']['c'], [1.1, 1.1, 1.1]) + self.assertAllClose(inp['b']['d'], 1.1) + + +if __name__ == '__main__': + tf.test.main() diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/__init__.py b/cv/classification/resnet50/tensorflow2.0/utils/testing/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
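The pieces added above are designed to be combined in a Keras training loop: `TimeHistory` from `keras_utils.py` for throughput logging, and `generate_synthetic_data` plus `past_stop_threshold` from `model_helpers.py` for smoke-test style runs. A hedged sketch, with a throwaway two-layer model and an illustrative stop threshold, could look like this (again assuming the repo root is on `PYTHONPATH`):

```python
# Hedged sketch combining TimeHistory, generate_synthetic_data and
# past_stop_threshold. The tiny model and the threshold are placeholders.
import tensorflow as tf

from utils.misc import keras_utils
from utils.misc import model_helpers

BATCH_SIZE = 32

# Infinite constant-valued dataset of (features, labels) pairs.
dataset = model_helpers.generate_synthetic_data(
    input_shape=tf.TensorShape([8]), input_value=0.5, input_dtype=tf.float32,
    label_shape=tf.TensorShape([]), label_value=1, label_dtype=tf.int32,
).batch(BATCH_SIZE)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(8,)),
    tf.keras.layers.Dense(2, activation="softmax"),
])
model.compile(optimizer="sgd",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

time_callback = keras_utils.TimeHistory(batch_size=BATCH_SIZE, log_steps=10)

stop_threshold = 0.95  # illustrative
for epoch in range(5):
  history = model.fit(dataset, steps_per_epoch=20, epochs=1,
                      callbacks=[time_callback], verbose=0)
  accuracy = history.history["accuracy"][-1]
  if model_helpers.past_stop_threshold(stop_threshold, accuracy):
    break

print("avg examples/sec:", time_callback.average_examples_per_second)
```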
+ diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/integration.py b/cv/classification/resnet50/tensorflow2.0/utils/testing/integration.py new file mode 100644 index 000000000..ceee7f920 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/integration.py @@ -0,0 +1,70 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper code to run complete models from within python.""" + +import os +import shutil +import sys +import tempfile + +from absl import flags +from absl.testing import flagsaver + +from utils.flags import core as flags_core + + +@flagsaver.flagsaver +def run_synthetic(main, + tmp_root, + extra_flags=None, + synth=True, + train_epochs=1, + epochs_between_evals=1): + """Performs a minimal run of a model. + + This function is intended to test for syntax errors throughout a model. A + very limited run is performed using synthetic data. + + Args: + main: The primary function used to exercise a code path. Generally this + function is ".main(argv)". + tmp_root: Root path for the temp directory created by the test class. + extra_flags: Additional flags passed by the caller of this function. + synth: Use synthetic data. + train_epochs: Value of the --train_epochs flag. + epochs_between_evals: Value of the --epochs_between_evals flag. + """ + + extra_flags = [] if extra_flags is None else extra_flags + + model_dir = tempfile.mkdtemp(dir=tmp_root) + + args = [sys.argv[0], "--model_dir", model_dir] + extra_flags + + if synth: + args.append("--use_synthetic_data") + + if train_epochs is not None: + args.extend(["--train_epochs", str(train_epochs)]) + + if epochs_between_evals is not None: + args.extend(["--epochs_between_evals", str(epochs_between_evals)]) + + try: + flags_core.parse_flags(argv=args) + main(flags.FLAGS) + finally: + if os.path.exists(model_dir): + shutil.rmtree(model_dir) diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/mock_task.py b/cv/classification/resnet50/tensorflow2.0/utils/testing/mock_task.py new file mode 100644 index 000000000..fdf7da4d0 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/mock_task.py @@ -0,0 +1,101 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
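`run_synthetic` above is intended to be driven from a model's unit test. The sketch below is illustrative, not part of this patch: the stub `main` stands in for a real model module, and it assumes `define_base`/`define_performance` register the `--model_dir`, `--train_epochs`, `--epochs_between_evals` and `--use_synthetic_data` flags that `run_synthetic` re-parses (the same flags `flags_test.py` relies on).

```python
# Hedged sketch: smoke-testing a model entry point on synthetic data.
import tensorflow as tf

from utils.flags import core as flags_core
from utils.testing import integration


def _define_stub_flags():
  # run_synthetic passes --model_dir, --use_synthetic_data, --train_epochs and
  # --epochs_between_evals, so these must be registered before it re-parses.
  flags_core.define_base(clean=False, num_gpu=False, stop_threshold=False,
                         hooks=False, train_epochs=True,
                         epochs_between_evals=True)
  flags_core.define_performance(synthetic_data=True)


def _stub_main(flags_obj):
  # Stand-in for a real model's main(flags_obj).
  print("would train for", flags_obj.train_epochs, "epoch(s) in",
        flags_obj.model_dir)


class SyntheticSmokeTest(tf.test.TestCase):

  def test_runs_with_synthetic_flags(self):
    integration.run_synthetic(main=_stub_main, tmp_root=self.get_temp_dir())


if __name__ == "__main__":
  _define_stub_flags()
  tf.test.main()
```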
+ +"""Mock task for testing.""" + +import dataclasses +import numpy as np +import tensorflow as tf + +from core import base_task +from core import config_definitions as cfg +from core import exp_factory +from core import task_factory + + +class MockModel(tf.keras.Model): + + def __init__(self, network): + super().__init__() + self.network = network + + def call(self, inputs): + outputs = self.network(inputs) + self.add_loss(tf.reduce_mean(outputs)) + return outputs + + +@dataclasses.dataclass +class MockTaskConfig(cfg.TaskConfig): + pass + + +@task_factory.register_task_cls(MockTaskConfig) +class MockTask(base_task.Task): + """Mock task object for testing.""" + + def __init__(self, params=None, logging_dir=None, name=None): + super().__init__(params=params, logging_dir=logging_dir, name=name) + + def build_model(self, *arg, **kwargs): + inputs = tf.keras.layers.Input(shape=(2,), name="random", dtype=tf.float32) + outputs = tf.keras.layers.Dense( + 1, bias_initializer=tf.keras.initializers.Ones(), name="dense_0")( + inputs) + network = tf.keras.Model(inputs=inputs, outputs=outputs) + return MockModel(network) + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="acc")] + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + logs = super().validation_step(inputs, model, metrics) + logs["counter"] = tf.constant(1, dtype=tf.float32) + return logs + + def build_inputs(self, params): + + def generate_data(_): + x = tf.zeros(shape=(2,), dtype=tf.float32) + label = tf.zeros([1], dtype=tf.int32) + return x, label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) + + def aggregate_logs(self, state, step_outputs): + if state is None: + state = {} + for key, value in step_outputs.items(): + if key not in state: + state[key] = [] + state[key].append( + np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value])) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + for k, v in aggregated_logs.items(): + aggregated_logs[k] = np.sum(np.stack(v, axis=0)) + return aggregated_logs + + +@exp_factory.register_config_factory("mock") +def mock_experiment() -> cfg.ExperimentConfig: + config = cfg.ExperimentConfig( + task=MockTaskConfig(), trainer=cfg.TrainerConfig()) + return config diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/pylint.rcfile b/cv/classification/resnet50/tensorflow2.0/utils/testing/pylint.rcfile new file mode 100644 index 000000000..b872802a8 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/pylint.rcfile @@ -0,0 +1,168 @@ +[MESSAGES CONTROL] +disable=R,W,bad-option-value,trailing-newlines,no-name-in-module + +[REPORTS] +# Tells whether to display a full report or only the messages +reports=no + +# Activate the evaluation score. 
+score=no + +[BASIC] + +# Regular expression matching correct argument names +argument-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct attribute names +attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct class names +class-rgx=^_?[A-Z][a-zA-Z0-9]*$ + +# Regular expression matching correct constant names +const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=10 + +# Regular expression matching correct function names +function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ + +# Good variable names which should always be accepted, separated by a comma +good-names=main,_ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct method names +method-rgx=^(?:(?P__[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9]*)|(?P_{0,2}[a-z][a-z0-9_]*)|(setUp|tearDown))$ + +# Regular expression matching correct module names +module-rgx=^(_?[a-z][a-z0-9_]*)|__init__|PRESUBMIT|PRESUBMIT_unittest$ + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=(__.*__|main|.*ArgParser) + +# Naming hint for variable names +variable-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct variable names +variable-rgx=^[a-z][a-z0-9_]*$ + +[TYPECHECK] + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules=absl, absl.*, official, official.*, tensorflow, tensorflow.*, LazyLoader, google, google.cloud.* + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__,__new__,setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make + +# This is deprecated, because it is not used anymore. +#ignore-iface-methods= + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls,class_ + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore +ignored-argument-names=_.* + +# Maximum number of arguments for function / method +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of branch for function / method body +max-branches=12 + +# Maximum number of locals for function / method body +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body +max-returns=6 + +# Maximum number of statements in function / method body +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. 
Defaults to +# "Exception" +overgeneral-exceptions=StandardError,Exception,BaseException + + +[FORMAT] + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=80 + +# Maximum number of lines in a module +max-module-lines=99999 + +# List of optional constructs for which whitespace checking is disabled +no-space-check= + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=yes + +# Allow URLs and comment type annotations to exceed the max line length as neither can be easily +# split across lines. +ignore-long-lines=^\s*(?:(# )??$|# type:) + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) + +# Tells whether we should check for unused import in __init__ files. +init-import=no diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/builds_common.sh b/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/builds_common.sh new file mode 100644 index 000000000..3cf08bb51 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/builds_common.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Common Bash functions used by build scripts + +COLOR_NC='\033[0m' +COLOR_BOLD='\033[1m' +COLOR_LIGHT_GRAY='\033[0;37m' +COLOR_GREEN='\033[0;32m' +COLOR_RED='\033[0;31m' + +die() { + # Print a message and exit with code 1. + # + # Usage: die + # e.g., die "Something bad happened." + + echo $@ + exit 1 +} + +num_cpus() { + # Get the number of CPUs + N_CPUS=$(grep -c ^processor /proc/cpuinfo) + if [[ -z ${N_CPUS} ]]; then + die "ERROR: Unable to determine the number of CPUs" + fi + + echo ${N_CPUS} +} + +# List files changed (i.e., added, or revised) from +# the common ancestor of HEAD and the latest master branch. +# Usage: get_changed_files_from_master_branch +get_changed_files_from_master_branch() { + ANCESTOR=$(git merge-base HEAD master origin/master) + git diff ${ANCESTOR} --diff-filter=d --name-only "$@" +} + +# List python files changed that still exist, +# i.e., not removed. 
+# Usage: get_py_files_to_check [--incremental] +get_py_files_to_check() { + if [[ "$1" == "--incremental" ]]; then + get_changed_files_from_master_branch -- '*.py' + elif [[ -z "$1" ]]; then + find official/ -name '*.py' + else + die "Found unsupported args: $@ for get_py_files_to_check." + fi +} diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/ci_sanity.sh b/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/ci_sanity.sh new file mode 100644 index 000000000..0646c87a9 --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/ci_sanity.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Sanity check script that runs tests and lint under local environment. +# Make sure that tensorflow and pylint is installed. +# usage: models >: ./official/utils/testing/scripts/ci_sanity.sh do_pylint --incremental +set +x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/builds_common.sh" +cd "$SCRIPT_DIR/../../../.." +MODEL_ROOT="$(pwd)" + +export PYTHONPATH="$PYTHONPATH:${MODEL_ROOT}" + +# Run pylint +do_pylint() { + # Usage: do_pylint [--incremental] + # + # Options: + # --incremental Performs check on only the python files changed in the + # last non-merge git commit. + + # Use this list to ALLOWLIST pylint errors + ERROR_ALLOWLIST="" + + echo "ERROR_ALLOWLIST=\"${ERROR_ALLOWLIST}\"" + + PYLINT_BIN="python3 -m pylint" + + PYTHON_SRC_FILES=$(get_py_files_to_check $1) + if [[ -z ${PYTHON_SRC_FILES} ]]; then + echo "do_pylint found no Python files to check. Returning." + return 0 + fi + + PYLINTRC_FILE="official/utils/testing/pylint.rcfile" + + if [[ ! -f "${PYLINTRC_FILE}" ]]; then + die "ERROR: Cannot find pylint rc file at ${PYLINTRC_FILE}" + fi + + NUM_SRC_FILES=$(echo ${PYTHON_SRC_FILES} | wc -w) + NUM_CPUS=$(num_cpus) + + echo "Running pylint on ${NUM_SRC_FILES} files with ${NUM_CPUS} "\ + "parallel jobs..." 
+ echo "" + + PYLINT_START_TIME=$(date +'%s') + OUTPUT_FILE="$(mktemp)_pylint_output.log" + ERRORS_FILE="$(mktemp)_pylint_errors.log" + NONWL_ERRORS_FILE="$(mktemp)_pylint_nonwl_errors.log" + + rm -rf ${OUTPUT_FILE} + rm -rf ${ERRORS_FILE} + rm -rf ${NONWL_ERRORS_FILE} + touch ${NONWL_ERRORS_FILE} + + ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \ + --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} > ${OUTPUT_FILE} 2>&1 + PYLINT_END_TIME=$(date +'%s') + + echo "" + echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s" + echo "" + + # Report only what we care about + # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html + # E: all errors + # W0311 bad-indentation + # W0312 mixed-indentation + # C0330 bad-continuation + # C0301 line-too-long + # C0326 bad-whitespace + # W0611 unused-import + # W0622 redefined-builtin + grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611|\[W0622)' ${OUTPUT_FILE} > ${ERRORS_FILE} + + N_ERRORS=0 + while read -r LINE; do + IS_ALLOWLISTED=0 + for WL_REGEX in ${ERROR_ALLOWLIST}; do + if echo ${LINE} | grep -q "${WL_REGEX}"; then + echo "Found a ALLOWLISTed error:" + echo " ${LINE}" + IS_ALLOWLISTED=1 + fi + done + + if [[ ${IS_ALLOWLISTED} == "0" ]]; then + echo "${LINE}" >> ${NONWL_ERRORS_FILE} + echo "" >> ${NONWL_ERRORS_FILE} + ((N_ERRORS++)) + fi + done <${ERRORS_FILE} + + echo "Raw lint output file: ${OUTPUT_FILE}" + + echo "" + if [[ ${N_ERRORS} != 0 ]]; then + echo "FAIL: Found ${N_ERRORS} non-whitelited pylint errors:" + cat "${NONWL_ERRORS_FILE}" + return 1 + else + echo "PASS: No non-ALLOWLISTed pylint errors were found." + return 0 + fi +} + +test_result=0 + +TESTS="$@" + +for t in "${TESTS}"; do + ${t} || test_result=$? +done + +exit "${test_result}" diff --git a/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/presubmit.sh b/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/presubmit.sh new file mode 100644 index 000000000..33eca3cbb --- /dev/null +++ b/cv/classification/resnet50/tensorflow2.0/utils/testing/scripts/presubmit.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Presubmit script that runs tests and lint under local environment. +# Make sure that tensorflow and pylint is installed. +# usage: models >: ./official/utils/testing/scripts/presubmit.sh +# usage: models >: ./official/utils/testing/scripts/presubmit.sh lint py2_test py3_test +set +x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR/../../../.." +MODEL_ROOT="$(pwd)" + +export PYTHONPATH="$PYTHONPATH:${MODEL_ROOT}" + +py_test() { + local PY_BINARY="$1" + local exit_code=0 + + echo "===========Running Python test============" + # Skipping Ranking tests, TODO(b/189265753) remove it once the issue is fixed. 
+ for test_file in `find official/ -name '*test.py' -print | grep -v 'official/recommendation/ranking'` + do + echo "####=======Testing ${test_file}=======####" + ${PY_BINARY} "${test_file}" + _exit_code=$? + if [[ $_exit_code != 0 ]]; then + exit_code=$_exit_code + echo "FAIL: ${test_file}" + fi + done + + return "${exit_code}" +} + +py2_test() { + local PY_BINARY=$(which python2) + py_test "$PY_BINARY" + return $? +} + +py3_test() { + local PY_BINARY=$(which python3) + py_test "$PY_BINARY" + return $? +} + +test_result=0 + +if [ "$#" -eq 0 ]; then + TESTS="lint py2_test py3_test" +else + TESTS="$@" +fi + +for t in "${TESTS}"; do + ${t} || test_result=$? +done + +exit "${test_result}" diff --git a/nlp/text_classification/bert/paddlepaddle/README.md b/nlp/text_classification/bert/paddlepaddle/README.md new file mode 100644 index 000000000..8fdedc3b5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/README.md @@ -0,0 +1,248 @@ +# BERT + +## 模型简介 + +[BERT](https://arxiv.org/abs/1810.04805) (Bidirectional Encoder Representations from Transformers)以[Transformer](https://arxiv.org/abs/1706.03762) 编码器为网络基本组件,使用掩码语言模型(Masked Language Model)和邻接句子预测(Next Sentence Prediction)两个任务在大规模无标注文本语料上进行预训练(pre-train),得到融合了双向内容的通用语义表示模型。以预训练产生的通用语义表示模型为基础,结合任务适配的简单输出层,微调(fine-tune)后即可应用到下游的NLP任务,效果通常也较直接在下游的任务上训练的模型更优。此前BERT即在[GLUE评测任务](https://gluebenchmark.com/tasks)上取得了SOTA的结果。 + +本项目是BERT在 Paddle 2.0上的开源实现,包含了预训练和[GLUE评测任务](https://gluebenchmark.com/tasks)上的微调代码。 + +## 快速开始 + +### 环境依赖 + +本教程除了需要安装PaddleNLP库,还需以下依赖 + +```text +h5py +``` + +### 数据准备 + +#### Pre-training数据准备 + +`create_pretraining_data.py` 是创建预训练程序所需数据的脚本。其以文本文件(使用换行符换行和空白符分隔,data目录下提供了部分示例数据)为输入,经由BERT tokenizer进行tokenize后再做生成sentence pair正负样本、掩码token等处理,最后输出hdf5格式的数据文件。使用方式如下: + +```shell +python create_pretraining_data.py \ + --input_file=data/sample_text.txt \ + --output_file=data/training_data.hdf5 \ + --bert_model=bert-base-uncased \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=5 +``` + +其中参数释义如下: +- `input_file` 指定输入文件,可以使用目录,指定目录时将包括目录中的所有`.txt`文件。 +- `output_file` 指定输出文件。 +- `bert_model` 指定使用特定BERT模型对应的tokenizer进行tokenize处理。 +- `max_seq_length` 指定最大句子长度,超过该长度将被截断,不足该长度的将会进行padding。 +- `max_predictions_per_seq` 表示每个句子中会被mask的token的最大数目。 +- `masked_lm_prob` 表示每个token被mask的概率。 +- `random_seed` 指定随机种子。 +- `dupe_factor` 指定输入数据被重复处理的次数,每次处理将重新产生随机mask。 + +使用以上预训练数据生成程序可以用于处理领域垂类数据后进行二次预训练。若需要使用BERT论文中预训练使用的英文Wiki和BookCorpus数据,可以参考[这里](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)进行处理,得到的数据可以直接接入本项目中的预训练程序使用。 + +#### Fine-tunning数据准备 + +##### GLUE评测任务数据 + +GLUE评测任务所含数据集已在paddlenlp中以API形式提供,无需预先准备,使用`run_glue.py`执行微调时将会自动下载。 + +### 执行Pre-training + +#### GPU训练 +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0" run_pretrain.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --max_predictions_per_seq 20 \ + --batch_size 32 \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --adam_epsilon 1e-6 \ + --warmup_steps 10000 \ + --num_train_epochs 3 \ + --input_dir data/ \ + --output_dir pretrained_models/ \ + --logging_steps 1 \ + --save_steps 20000 \ + --max_steps 1000000 \ + --device gpu \ + --use_amp False +``` + +#### XPU训练 +```shell +unset FLAGS_selected_xpus +python -m paddle.distributed.launch --xpus "0" run_pretrain.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --max_predictions_per_seq 20 \ + --batch_size 32 \ + 
--learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --adam_epsilon 1e-6 \ + --warmup_steps 10000 \ + --num_train_epochs 3 \ + --input_dir data/ \ + --output_dir pretrained_models/ \ + --logging_steps 1 \ + --save_steps 20000 \ + --max_steps 1000000 \ + --device xpu \ + --use_amp False +``` +其中参数释义如下: +- `model_type` 指示了模型类型,使用BERT模型时设置为bert即可。 +- `model_name_or_path` 指示了某种特定配置的模型,对应有其预训练模型和预训练时使用的 tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。 +- `max_predictions_per_seq` 表示每个句子中会被mask的token的最大数目,与创建预训练数据时的设置一致。 +- `batch_size` 表示每次迭代**每张卡**上的样本数目。 +- `learning_rate` 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。 +- `weight_decay` 表示AdamW优化器中使用的weight_decay的系数。 +- `adam_epsilon` 表示AdamW优化器中使用的epsilon值。 +- `warmup_steps` 表示动态学习率热启的step数。 +- `num_train_epochs` 表示训练轮数。 +- `input_dir` 表示输入数据的目录,该目录下所有文件名中包含training的文件将被作为训练数据。 +- `output_dir` 表示模型的保存目录。 +- `logging_steps` 表示日志打印间隔。 +- `save_steps` 表示模型保存及评估间隔。 +- `max_steps` 表示最大训练步数。若训练`num_train_epochs`轮包含的训练步数大于该值,则达到`max_steps`后就提前结束。 +- `device` 表示训练使用的设备, 'gpu'表示使用GPU, 'xpu'表示使用百度昆仑卡, 'cpu'表示使用CPU。 +- `use_amp` 指示是否启用自动混合精度训练。 + +**NOTICE**: 预训练时data目录存放的是经过 `create_pretraining_data.py` 处理后的数据,因此需要通过该数据处理脚本预先处理,否则预训练将会出现报错。 + +### 执行Fine-tunning + +以GLUE中的SST-2任务为例,启动Fine-tuning的方式如下: + +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0" run_glue.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --task_name SST2 \ + --max_seq_length 128 \ + --batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --logging_steps 1 \ + --save_steps 500 \ + --output_dir ./tmp/ \ + --device gpu \ + --use_amp False +``` + +其中参数释义如下: +- `model_type` 指示了模型类型,使用BERT模型时设置为bert即可。 +- `model_name_or_path` 指示了某种特定配置的模型,对应有其预训练模型和预训练时使用的 tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。注:`bert-base-uncased`等对应使用的预训练模型转自[huggingface/transformers](https://github.com/huggingface/transformers),具体可参考当前目录下converter中的内容。 +- `task_name` 表示Fine-tuning的任务。 +- `max_seq_length` 表示最大句子长度,超过该长度将被截断。 +- `batch_size` 表示每次迭代**每张卡**上的样本数目。 +- `learning_rate` 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。 +- `num_train_epochs` 表示训练轮数。 +- `logging_steps` 表示日志打印间隔。 +- `save_steps` 表示模型保存及评估间隔。 +- `output_dir` 表示模型保存路径。 +- `device` 表示训练使用的设备, 'gpu'表示使用GPU, 'xpu'表示使用百度昆仑卡, 'cpu'表示使用CPU。 +- `use_amp` 指示是否启用自动混合精度训练。 + +基于`bert-base-uncased`在GLUE各评测任务上Fine-tuning后,在验证集上有如下结果: + +| Task | Metric | Result | +|:-----:|:----------------------------:|:-----------------:| +| SST2 | Accuracy | 0.92660 | +| QNLI | Accuracy | 0.91707 | +| CoLA | Mattehew's corr | 0.59557 | +| MRPC | F1/Accuracy | 0.91667/0.88235 | +| STSB | Person/Spearman corr | 0.88847/0.88350 | +| QQP | Accuracy/F1 | 0.90581/0.87347 | +| MNLI | Matched acc/MisMatched acc | 0.84422/0.84825 | +| RTE | Accuracy | 0.711191 | + + +### 预测 + +在Fine-tuning完成后,我们可以使用如下方式导出希望用来预测的模型: + +```shell +python -u ./export_model.py \ + --model_type bert \ + --model_path bert-base-uncased \ + --output_path ./infer_model/model +``` + +其中参数释义如下: +- `model_type` 指示了模型类型,使用BERT模型时设置为bert即可。 +- `model_path` 表示训练模型的保存路径,与训练时的`output_dir`一致。 +- `output_path` 表示导出预测模型文件的前缀。保存时会添加后缀(`pdiparams`,`pdiparams.info`,`pdmodel`);除此之外,还会在`output_path`包含的目录下保存tokenizer相关内容。 + +然后按照如下的方式进行GLUE中的评测任务进行预测(基于Paddle的[Python预测API](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/python_infer_cn.html)): + +```shell +python -u ./predict_glue.py \ + --task_name SST2 \ + --model_type bert \ + --model_path ./infer_model/model \ + --batch_size 32 \ + --max_seq_length 128 +``` + 
+其中参数释义如下: +- `task_name` 表示Fine-tuning的任务。 +- `model_type` 指示了模型类型,使用BERT模型时设置为bert即可。 +- `model_path` 表示预测模型文件的前缀,和上一步导出预测模型中的`output_path`一致。 +- `batch_size` 表示每个预测批次的样本数目。 +- `max_seq_length` 表示最大句子长度,超过该长度将被截断。 + +同时支持使用输入样例数据的方式进行预测任务,这里仅以文本情感分类数据[SST-2](https://nlp.stanford.edu/sentiment/index.html)为例,输出样例数据的分类预测结果: + +```shell +python -u ./predict.py \ + --model_path ./infer_model/model \ + --device gpu \ + --max_seq_length 128 +``` + +其中参数释义如下: +- `model_path` 表示预测模型文件的前缀,和上一步导出预测模型中的`output_path`一致。 +- `device` 表示训练使用的设备, 'gpu'表示使用GPU, 'xpu'表示使用百度昆仑卡, 'cpu'表示使用CPU。 +- `max_seq_length` 表示最大句子长度,超过该长度将被截断。 + +样例中的待预测数据返回输出的预测结果如下: + +```text +Data: against shimmering cinematography that lends the setting the ethereal beauty of an asian landscape painting + Label: positive + Negative prob: 0.0004963805549778044 + Positive prob: 0.9995037317276001 + +Data: the situation in a well-balanced fashion + Label: positive + Negative prob: 0.000471479695988819 + Positive prob: 0.9995285272598267 + +Data: at achieving the modest , crowd-pleasing goals it sets for itself + Label: positive + Negative prob: 0.0019163173856213689 + Positive prob: 0.998083770275116 + +Data: so pat it makes your teeth hurt + Label: negative + Negative prob: 0.9988648295402527 + Positive prob: 0.0011351780267432332 + +Data: this new jangle of noise , mayhem and stupidity must be a serious contender for the title . + Label: negative + Negative prob: 0.9884825348854065 + Positive prob: 0.011517543345689774 +``` + +## 扩展 + +上述的介绍是基于动态图的BERT的预训练任务和微调任务以及预测任务的实践过程,同时在我们也提供了基于PaddlePaddle Fleet API的静态图的BERT相关实践,在组网代码层面保持动静统一,在计算速度以及多机联合训练方面有着更优的性能,具体的细节可以参考 [BERT静态图](./static) 。 diff --git a/nlp/text_classification/bert/paddlepaddle/create_pretraining_data.py b/nlp/text_classification/bert/paddlepaddle/create_pretraining_data.py new file mode 100644 index 000000000..acd79615d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/create_pretraining_data.py @@ -0,0 +1,497 @@ +# coding=utf-8 +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Create masked LM/next sentence masked_lm examples for BERT.""" +import argparse +import logging +import os +import random +from io import open +import h5py +import numpy as np +from tqdm import tqdm + +from paddlenlp.transformers import BertTokenizer +from paddlenlp.transformers.tokenizer_utils import convert_to_unicode + +import random +import collections + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, + masked_lm_labels, is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + +def write_instance_to_example_file(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_file): + """Create example files from `TrainingInstance`s.""" + + total_written = 0 + features = collections.OrderedDict() + + num_instances = len(instances) + features["input_ids"] = np.zeros( + [num_instances, max_seq_length], dtype="int32") + features["input_mask"] = np.zeros( + [num_instances, max_seq_length], dtype="int32") + features["segment_ids"] = np.zeros( + [num_instances, max_seq_length], dtype="int32") + features["masked_lm_positions"] = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + features["masked_lm_ids"] = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32") + + for inst_index, instance in enumerate(tqdm(instances)): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids( + instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features["input_ids"][inst_index] = input_ids + features["input_mask"][inst_index] = input_mask + features["segment_ids"][inst_index] = segment_ids + features["masked_lm_positions"][inst_index] = masked_lm_positions + features["masked_lm_ids"][inst_index] = masked_lm_ids + features["next_sentence_labels"][inst_index] = next_sentence_label + + total_written += 1 + + print("saving data") + f = h5py.File(output_file, 'w') + f.create_dataset( + "input_ids", data=features["input_ids"], dtype='i4', compression='gzip') + f.create_dataset( + "input_mask", + data=features["input_mask"], + dtype='i1', + compression='gzip') + f.create_dataset( + "segment_ids", + data=features["segment_ids"], + dtype='i1', + compression='gzip') + f.create_dataset( + "masked_lm_positions", + data=features["masked_lm_positions"], + dtype='i4', + compression='gzip') + f.create_dataset( + "masked_lm_ids", + data=features["masked_lm_ids"], + dtype='i4', + compression='gzip') + f.create_dataset( + "next_sentence_labels", + data=features["next_sentence_labels"], + dtype='i1', + compression='gzip') + f.flush() + f.close() + + 
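+# Illustrative sketch of how the HDF5 file written above can be inspected afterwards.
+# The path is only the example output name used in the README; any --output_file
+# passed on the command line works the same way:
+#
+#   import h5py
+#   with h5py.File("data/training_data.hdf5", "r") as f:
+#       print(f["input_ids"].shape)            # (num_instances, max_seq_length)
+#       print(f["masked_lm_positions"].shape)  # (num_instances, max_predictions_per_seq)
+#       print(f["next_sentence_labels"][:10])  # 1 = random next sentence, 0 = actual next
+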
+def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + print("creating instance from {}".format(input_file)) + with open(input_file, "r", encoding="UTF-8") as reader: + while True: + line = convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + # vocab_words = list(tokenizer.vocab.keys()) + vocab_words = list(tokenizer.vocab.token_to_idx.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, + short_seq_prob, masked_lm_prob, max_predictions_per_seq, + vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. 
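+                # For example, if current_chunk holds three sentences and a_end
+                # ends up as 2, the first two sentences become segment "A" and the
+                # remaining one (or a random document, handled below) becomes "B".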
+ a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint( + 0, len(all_documents) - 1) + if random_document_index != document_index: + break + + #If picked random document is the same as the current document + if random_document_index == document_index: + is_random_next = False + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, + vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + cand_indexes.append(i) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + if index in covered_indexes: + continue + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + + masked_lms = sorted(masked_lms, 
key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(): + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--input_file", + default=None, + type=str, + required=True, + help="The input train corpus. can be directory with .txt files or a path to a single file" + ) + parser.add_argument( + "--output_file", + default=None, + type=str, + required=True, + help="The output file where created hdf5 formatted data will be written." + ) + parser.add_argument( + "--vocab_file", + default=None, + type=str, + required=False, + help="The vocabulary the BERT model will train on. " + "Use bert_model argument would ignore this. " + "The bert_model argument is recommended.") + parser.add_argument( + "--do_lower_case", + action='store_true', + default=True, + help="Whether to lower case the input text. True for uncased models, False for cased models. " + "Use bert_model argument would ignore this. The bert_model argument is recommended." + ) + parser.add_argument( + "--bert_model", + default="bert-base-uncased", + type=str, + required=False, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." + "If provided, use the pre-trained model used tokenizer to create data " + "and ignore vocab_file and do_lower_case.") + + ## Other parameters + #int + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument( + "--dupe_factor", + default=10, + type=int, + help="Number of times to duplicate the input data (with different masks)." 
+ ) + parser.add_argument( + "--max_predictions_per_seq", + default=20, + type=int, + help="Maximum number of masked LM predictions per sequence.") + + # floats + parser.add_argument( + "--masked_lm_prob", + default=0.15, + type=float, + help="Masked LM probability.") + parser.add_argument( + "--short_seq_prob", + default=0.1, + type=float, + help="Probability to create a sequence shorter than maximum sequence length" + ) + + parser.add_argument( + '--random_seed', + type=int, + default=12345, + help="random seed for initialization") + + args = parser.parse_args() + print(args) + + if args.bert_model: + tokenizer = BertTokenizer.from_pretrained(args.bert_model) + else: + assert args.vocab_file, ( + "vocab_file must be set If bert_model is not provided.") + tokenizer = BertTokenizer( + args.vocab_file, do_lower_case=args.do_lower_case) + + input_files = [] + if os.path.isfile(args.input_file): + input_files.append(args.input_file) + elif os.path.isdir(args.input_file): + input_files = [ + os.path.join(args.input_file, f) + for f in os.listdir(args.input_file) + if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith( + '.txt')) + ] + else: + raise ValueError("{} is not a valid path".format(args.input_file)) + + rng = random.Random(args.random_seed) + instances = create_training_instances( + input_files, tokenizer, args.max_seq_length, args.dupe_factor, + args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq, + rng) + + output_file = args.output_file + + write_instance_to_example_file(instances, tokenizer, args.max_seq_length, + args.max_predictions_per_seq, output_file) + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/data/sample_text.txt b/nlp/text_classification/bert/paddlepaddle/data/sample_text.txt new file mode 100644 index 000000000..75ec60cdb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/data/sample_text.txt @@ -0,0 +1,100 @@ +Zulfiqar A. Bhutta trained as a physician in Pakistan in the early stages of his career. +He holds titles across various organizations in diverse geographies. +Professor Bhutta is the Founding Director of the Center of Excellence in Women and Child Health & Institute for Global Child Health & Development, at the Aga Khan University South-Central Asia, East Africa & United Kingdom. +He is currently the Co-Director at the Centre for Global Child Health, at the Hospital for Sick Children and leads many projects as a Senior Scientist at the Research Institute in the Centre for Global Child Health at Sick Kids. +He holds a Professorship at the University of Toronto in the Department of Nutritional Sciences and the Division of Epidemiology, Dalla Lana School of Public Health. +Additionally, he holds concurrent professorship at the Department of Paediatrics, Aga Khan University in Karachi, Pakistan and at the Schools of Public Health of Johns Hopkins University, Tufts University, Boston University, University of Alberta and the London School of Hygiene & Tropical Medicine. +He is a designated Distinguished National Professor of the Government of Pakistan and was the Founding Chair of the National Research Ethics Committee of the Government of Pakistan from 2003-2014. +Dr. Bhutta received his MBBS from Khyber Medical College in Peshawar, Pakistan in 1977 at which time he was names "Best Graduate of the Year" and awarded the University Gold Medal for overall distinction. +His PhD work was completed at Karolinska Institute in Stockholm, Sweden in 1996. 
+He is a Fellow of the Royal College of Physicians (Edinburgh & London), the Royal College of Paediatrics and Child Health (London), American Academy of Paediatrics and the Pakistan Academy of Sciences. +Following the completion of his PhD Dr. Bhutta began working as House Surgeon in Obstetrics & Gynecology at the Khyber Teaching Hospital, Peshawar (April-November 1978). +He began work in paediatrics as a physician in November of 1978 in the Professorial Unit at the Institute of Child Health, Jinnah Postgraduate Medical Centre, Karachi (Pakistan). +Through 1980's he continued his work as a surgeon and paediatrician. +He undertook his first professor position in the Department of Paediatrics, The Aga Khan University Hospital, Karachi (Pakistan), from November 1987 to June 1992. +In 2005, Dr. Bhutta became the Chairman of the Department of Paediatrics & Child Health at the Aga Khan University & Medical Center, a position held until 2008. +Following his term as Chairman he became The Noordin Noormahomed Sheriff Professor & Founding Chair, Division of Women & Child Health, The Aga Khan University, a position he held for four years. +Dr. Bhutta currently holds the titles of co-director of the Centre for Global Child Health at the Hospital for Sick Children in Toronto, and founding director of the Centre of Excellence in Women and Child Health at the Aga Khan University. +In 2020, he was appointed founding director of the Institute for Global child Health & Development at the Aga Khan University and elected Fellow to the Royal Society, United Kingdom. +Outside of his professional responsibilities Dr. Bhutta serves on various local and international boards and committees, including a series of editorial boards. +In his various capacities Dr. Bhutta has produced a large collection of publications working with his teams at Sick Kids, AKU and international partners. +These include book reviews, chapters, 1. +"Haematological disorders" "Neonatal Jaundice" in Neonatal Vade‑Mecum, Fleming PJ, Speidel BD, Dunn PM Eds, Lloyd‑Luke Publishers, UK, 1986. +Revised 2nd Edition 1991. +2. +"Nutritional management of acute and persistent diarrhoea". +A M Molla, Bhutta Z A and  A Molla. +In McNeish A S, Mittal S K and Walker-Smith J A (eds). +Recent trends in diarrhoea and malnutrition, MAMC, Delhi, 1991, pp 37-51. +3. +"Paediatric Prescribing” in "Text book of Paediatrics for developing countries"            Arif MA, Hanif SM, Wasti SMK Eds, 1989, 2nd Edition 1996,  PPA, Karachi. +& Lahore 4. +"Innovations in neonatal care : Impact on neonatal survival in the developing world:. +Bhutta Z A  Zaidi S (Editor) 1992. +TWEL Publisher. +Karachi pp 121-131 5. +"Short course therapy in Pediatrics" Bhutta Z A& Teele D.  In Tice A D, Waldvogel F (Eds), Contemporary issues in Infectious Disease Epidemiology and Management, 1993 Gardiner Caldwell, Cheshire, pp 52 - 60. +6. +"Dietary management of persistent diarrhoea". +Bhutta Z A, Molla A M, Issani Z. +In Reflections on  Diarrhoeal Disease & Nutrition  of Children". +1993 Karachi, pp 97 - 103. +7. +"Prescribing practices amongst general practitioners (GPs) and consultant paediatricians in childhood diarrhoea.”  S.Q. +Nizami, I.A. +Khan, Bhutta Z A. +In "Reflections on Diarrhoeal Disease and Nutrition of Children". +1993 Karachi, pp  88-90. +8. +"The challenge of multidrug-resistant typhoid". +Bhutta Z A. +In Puri R K, Sachdev H P S, Choudhry P, Verma I C (Eds), Current concepts in Paediatrics, 1994. +Jaypee Publishers, New Delhi, pp 403.8. +9. 
+"Perinatal Care in Pakistan: Current status and trends". +In Proceedings of the Workshop in Reproductive Health. +College of Physicians and Surgeons, Pakistan, Karachi, 1995, pp 95-103. +10. +“A study of whole body protein kinetics in malnourished children with persistent diarrhoea” Bhutta Z A, Nizami SQ, Isani Z, Hardy S, Hendricks K, Young V.   Report of the second RCM coordinated Research Programme for application of stable isotope tracer methods to studies of energy metabolism in malnourished populations of developing countries. +NAHRES-30 1996 IAEA Vienna. +11. +"Pneumococcal infections in Pakistan: a country report". +In Adult Immunization in Asia, Fondation Mercel Merieux, Lyon, 1998. pp 79-82. +12. +“Factors affecting protein and aminoacid metabolism in childhood from developing countries". +In Child Nutrition: an international perspective. +Editors Solomons NW, Caballero B, Brown KH. +CRC Press 1998. +13. +"Protein Digestion and Bioavailability". +In Encyclopedia of Human Nutrition. +Editors: Sadler M, Strain JJ, Caballero B. +Academic Press (London), 1998 pp.1646-54. +14. +"Perinatal Care in Pakistan. +Reproductive Health: A manual for family practice and primary health care. +Bhutta Z A, Maqbool S.  College of Physicians and Surgeons, Pakistan, Karachi, 1999, pp 69-78. +15. +“Effective interventions to reduce neonatal mortality and morbidity from perinatal infection. +Bhutta ZA. +In Costello A, Manandhar D (eds). +"Improving Newborn Infant Health in Developing Countries’ 1999. +Imperial College Press, London pp.289-308. +16. +“Ambulatory management of typhoid fever”            “Risk factors and management of micronutrient deficiencies”            “Management of persistent diarrhoea in developing countries”. +In Manual of International Child Health, British Medical Journal, 2000 (in press). +17. +“The role of Cefixime in typhoid fever during childhood” in Cefixime, Adam D, Quintiliani R (Eds), Torre-Lazur-McCann, Tokyo, 2000; pp.107-112. +18. +"Micronutrients and Child Health in the Commonwealth”, Commonwealth Foundation" (UK) (2001). +19. +"Isotopic evaluation of breast milk intake, energy metabolism growth and body composition of exclusively breastfed infants in Pakistan". +Bhutta ZA, Nizami SQ, Weaver LT, Preston T. In Application of Stable Isotopes to evaluate Growth and Body Composition of Exclusively Breastfed infants, IAEA and WHO, NAHRES Report. +2000. +20. +“Typhoid Fever in Childhood: the south Asian experience”. +Ahmad K &Bhutta ZA. +In "Recent Advances in Paediatrics", Gupte S (Ed), 2000, India . +21. +“Neonatal Infections in developing countries” in  Carrera JM, Cabero L, Baraibar R (Eds). +The Perinatal Medicine of the new Millennium. \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/export_model.py b/nlp/text_classification/bert/paddlepaddle/export_model.py new file mode 100644 index 000000000..7e487a2c5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/export_model.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +import paddle + +from run_glue import MODEL_CLASSES + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_path", + default=None, + type=str, + required=True, + help="Path of the trained model to be exported.", ) + parser.add_argument( + "--output_path", + default=None, + type=str, + required=True, + help="The output file prefix used to save the exported inference model.", + ) + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + + # build model and load trained parameters + model = model_class.from_pretrained(args.model_path) + # switch to eval model + model.eval() + # convert to static graph with specific input description + model = paddle.jit.to_static( + model, + input_spec=[ + paddle.static.InputSpec( + shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec( + shape=[None, None], dtype="int64") # segment_ids + ]) + # save converted static graph model + paddle.jit.save(model, args.output_path) + # also save tokenizer for inference usage + tokenizer = tokenizer_class.from_pretrained(args.model_path) + tokenizer.save_pretrained(os.path.dirname(args.output_path)) + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/__init__.py new file mode 100644 index 000000000..02fae6fae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/__init__.py @@ -0,0 +1,62 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +from datetime import datetime + +PADDLENLP_STABLE_VERSION = "PADDLENLP_STABLE_VERSION" + + +__version__ = "3.0.0b1.post" +if os.getenv(PADDLENLP_STABLE_VERSION): + __version__ = __version__.replace(".post", "") +else: + formatted_date = datetime.now().date().strftime("%Y%m%d") + __version__ = __version__.replace(".post", ".post{}".format(formatted_date)) + +if "datasets" in sys.modules.keys(): + from paddlenlp.utils.log import logger + + logger.warning( + "Detected that datasets module was imported before paddlenlp. " + "This may cause PaddleNLP datasets to be unavalible in intranet. " + "Please import paddlenlp before datasets module to avoid download issues" + ) +import paddle + +from . 
import ( + data, + dataaug, + datasets, + embeddings, + experimental, + layers, + losses, + metrics, + ops, + peft, + prompt, + quantization, + seq2vec, + trainer, + transformers, + trl, + utils, + version, +) +from .server import SimpleServer +from .taskflow import Taskflow + +paddle.disable_signal_handler() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/__init__.py new file mode 100644 index 000000000..412a19101 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .main import main diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/bos_community.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/bos_community.py new file mode 100644 index 000000000..2b8d238d4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/bos_community.py @@ -0,0 +1,96 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys + +from baidubce.auth.bce_credentials import BceCredentials +from baidubce.bce_client_configuration import BceClientConfiguration +from baidubce.services.bos.bos_client import BosClient + +bos_config = { + "bucket": "models", + "bos_host": "paddlenlp.bj.bcebos.com", +} + + +bos_host = str(bos_config["bos_host"]) +bos_bucket = str(bos_config["bucket"]) + +access_key_id = os.getenv("bos_access_key_id", None) +secret_access_key = os.getenv("bos_secret_access_key", None) +if access_key_id is None or secret_access_key is None: + raise ValueError( + "Please set environment variables of bos_access_key_id, bos_secret_access_key, before uploading !!!" 
+ ) + + +def upload_to_bos_from_raw(raw, name, category="test"): + b_config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint=bos_host) + bos_client = BosClient(b_config) + bos_client.put_object_from_string(bos_bucket, "%s/%s" % (category, name), raw) + url = "https://paddlenlp.bj.bcebos.com/%s/%s/%s" % (bos_bucket, category, name) + return url + + +def multi_upload_to_bos(filename, name, category): + b_config = BceClientConfiguration(credentials=BceCredentials(access_key_id, secret_access_key), endpoint=bos_host) + bos_client = BosClient(b_config) + # init multi-upload + key = "%s/%s" % (category, name) + bucket_name = bos_bucket + upload_id = bos_client.initiate_multipart_upload(bucket_name, key).upload_id + + left_size = os.path.getsize(filename) + offset = 0 + part_number = 1 + part_list = [] + while left_size > 0: + part_size = 3 * 1024 * 1024 * 1024 + if left_size < part_size: + part_size = left_size + response = bos_client.upload_part_from_file( + bucket_name, key, upload_id, part_number, part_size, filename, offset + ) + left_size -= part_size + offset += part_size + # your should store every part number and etag to invoke complete multi-upload + part_list.append({"partNumber": part_number, "eTag": response.metadata.etag}) + part_number += 1 + bos_client.complete_multipart_upload(bucket_name, key, upload_id, part_list) + url = "https://paddlenlp.bj.bcebos.com/%s/%s/%s" % (bos_bucket, category, name) + return url + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python bos_community.py organization/model local_dir") + sys.exit(1) + + organization = sys.argv[1] + local_dir = sys.argv[2] + + for filename in os.listdir(local_dir): + name = os.path.split(filename)[-1] + if name == "bos.log": + continue + filename = os.path.join(local_dir, filename) + left_size = os.path.getsize(filename) + print(f"Uploading to {organization}/{name}, size: {left_size}") + if left_size >= 5 * 1024 * 1024 * 1024: + url = multi_upload_to_bos(filename, name, category=f"community/{organization}") + else: + with open(filename, "rb") as fp: + url = upload_to_bos_from_raw(raw=fp.read(), name=name, category=f"community/{organization}") + print(f"Done: {url}") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/download.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/download.py new file mode 100644 index 000000000..80b6c8d3f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/download.py @@ -0,0 +1,58 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +from typing import List, Tuple + +from paddlenlp.utils.downloader import COMMUNITY_MODEL_PREFIX, get_path_from_url +from paddlenlp.utils.env import MODEL_HOME +from paddlenlp.utils.log import logger + +COMMUNITY_MODEL_CONFIG_FILE_NAME = "community_models.json" + + +def load_community_models() -> List[Tuple[str, str]]: + """load community models based on remote models.json + + Returns: + List[Tuple[str, str]]: the name tuples of community models + """ + # 1. check & download community models.json + local_community_model_config_path = os.path.join(MODEL_HOME, "community_models.json") + + if not os.path.exists(local_community_model_config_path): + logger.info("download community model configuration from server ...") + remote_community_model_path = "/".join([COMMUNITY_MODEL_PREFIX, COMMUNITY_MODEL_CONFIG_FILE_NAME]) + cache_dir = os.path.join(MODEL_HOME) + local_community_model_config_path = get_path_from_url(remote_community_model_path, root_dir=cache_dir) + + # 2. load configuration + # + # config = { + # "model_name": { + # "type": "", + # "files": ["", ""] + # } + # } + # + + with open(local_community_model_config_path, "r", encoding="utf-8") as f: + config = json.load(f) + + model_names = set() + for model_name, obj in config.items(): + model_names.add((model_name, obj.get("model_type", ""))) + logger.info(f"find {len(model_names)} community models ...") + return model_names diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/install.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/install.py new file mode 100644 index 000000000..7d49e1288 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/install.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
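+
+# Illustrative usage (assuming the `paddlenlp` console entry point is installed):
+#
+#   paddlenlp install paddlenlp==latest   # fetch the latest wheel from the BOS server
+#   paddlenlp install paddlenlp==3099     # the tag may also be a PR number, per the CLI help
+#
+# Both forms end up calling install_package_from_bos() defined below.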
+ +import os.path +import subprocess + +from paddlenlp.utils.downloader import _download, url_file_exists +from paddlenlp.utils.env import PACKAGE_HOME +from paddlenlp.utils.log import logger + +PACKAGE_SERVER_HOME = "https://paddlenlp.bj.bcebos.com/wheels" + + +def install_package_from_bos(package_name: str, tag: str): + """ + install package from bos server based on package_name and tag + Args: + package_name (str): the name of package, eg: paddlenlp, ppdiffusers, paddle-pipelines + tag (str): pr number、 version of paddlenlp, or latest + """ + # eg: https://paddlenlp.bj.bcebos.com/wheels/paddlenlp-latest-py3-none-any.whl + file_name = f"{package_name}-{tag}-py3-none-any.whl" + logger.info(f"start to downloading package<{file_name}>") + + package_url = f"{PACKAGE_SERVER_HOME}/{file_name}" + if not url_file_exists(package_url): + raise ValueError(f"there is not valid package<{package_name}_py3_{tag}.whl> " f"from the url<{package_url}>") + + file_path = os.path.join(PACKAGE_HOME, file_name) + + # force download + file_path = _download(package_url, PACKAGE_HOME) + + # force reinstall the local package but ignore the dependencies + command = f"python -m pip install --force-reinstall --no-dependencies {file_path}".split() + subprocess.Popen(command) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/main.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/main.py new file mode 100644 index 000000000..d79904c16 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/main.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from pathlib import Path +from typing import List, Tuple, Type + +from uvicorn.config import LOGGING_CONFIG + +from paddlenlp.utils.import_utils import is_package_available + +# check whether the package is avaliable and give friendly description. +if not is_package_available("typer"): + raise ModuleNotFoundError( + "paddlenlp-cli tools is not installed correctly, you can use the following command" + " to install paddlenlp cli tool: >>> pip install paddlenlp[cli]" + ) + +import importlib +import inspect +import shutil + +import typer + +from paddlenlp.cli.download import load_community_models +from paddlenlp.cli.install import install_package_from_bos +from paddlenlp.cli.server import start_backend +from paddlenlp.cli.utils.tabulate import print_example_code, tabulate +from paddlenlp.transformers import ( + AutoModel, + AutoTokenizer, + PretrainedModel, + PretrainedTokenizer, +) +from paddlenlp.transformers.utils import find_transformer_model_type +from paddlenlp.utils.downloader import is_url +from paddlenlp.utils.log import logger + + +def load_all_models(include_community: bool = False) -> List[Tuple[str, str]]: + """load all model_name infos + + Returns: + List[Tuple[str, str]]: [model_type, model_name] + """ + # 1. 
load official models + module = importlib.import_module("paddlenlp.transformers") + model_names = [] + model_names_dict = {} + for attr_name in dir(module): + if attr_name.startswith("_"): + continue + obj = getattr(module, attr_name) + if not inspect.isclass(obj): + continue + if not issubclass(obj, PretrainedModel): + continue + + obj: Type[PretrainedModel] = obj + if not obj.__name__.endswith("PretrainedModel"): + continue + configurations = obj.pretrained_init_configuration + model_type = find_transformer_model_type(obj) + for model_name in configurations.keys(): + # get model type with refactoring + model_names.append((model_type, model_name)) + model_names_dict[model_name] = True + + logger.info(f"find {len(model_names)} official models ...") + + # 2. load & extend community models + if include_community: + community_model_names = load_community_models() + for model_name in community_model_names: + # there are some same model-names between codebase and community models + if model_name in model_names_dict: + continue + + model_names.append(model_name) + # 3. sort result + model_names.sort(key=lambda item: item[0] + item[1]) + return model_names + + +app = typer.Typer() + + +@app.command() +def download( + model_name: str, + cache_dir: str = typer.Option( + "./pretrained_models", "--cache-dir", "-c", help="cache_dir for download pretrained model" + ), + force_download: bool = typer.Option(False, "--force-download", "-f", help="force download pretrained model"), +): + """download the paddlenlp models with command, you can specific `model_name` + + >>> paddlenlp download bert \n + >>> paddlenlp download -c ./my-models -f bert \n + + Args:\n + model_name (str): pretarined model name, you can checkout all of model from source code. \n + cache_dir (str, optional): the cache_dir. Defaults to "./models". 
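+        force_download (bool, optional): if set, delete the locally cached copy of this model and download it again. Defaults to False.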
+ """ + if not os.path.isabs(cache_dir): + cache_dir = os.path.join(os.getcwd(), cache_dir) + + if is_url(model_name): + logger.error(" can not be url") + return + + cache_dir = os.path.join(cache_dir, model_name) + if force_download: + shutil.rmtree(cache_dir, ignore_errors=True) + + model: PretrainedModel = AutoModel.from_pretrained(model_name) + model.save_pretrained(cache_dir) + + tokenizer: PretrainedTokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.save_pretrained(cache_dir) + + logger.info(f"successfully saved model into <{cache_dir}>") + + +@app.command() +def search( + query=typer.Argument(..., help="the query of searching model"), + include_community: bool = typer.Option( + False, "--include-community", "-i", help="whether searching community models" + ), +): + """search the model with query, eg: paddlenlp search bert + + >>> paddlenlp search bert \n + >>> paddlenlp search -i bert \n + + Args: \n + query (Optional[str]): the str fragment of bert-name \n + include_community (Optional[bool]): whether searching community models + """ + logger.info("start to search models ...") + model_names = load_all_models(include_community) + + tables = [] + for model_type, model_name in model_names: + # TODO(wj-Mcat): ignore the model_category info + if not query or query in model_name: + tables.append([model_type, model_name]) + tabulate(tables, headers=["model type", "model name"], highlight_word=query) + print_example_code() + + logger.info(f"the retrieved number of models results is {len(tables)} ...") + + +@app.command(help="Start the PaddleNLP SimpleServer.") +def server( + app: str, + host: str = typer.Option("127.0.0.1", "--host", help="Bind socket to this host."), + port: int = typer.Option("8000", "--port", help="Bind socket to this port."), + app_dir: str = typer.Option(None, "--app_dir", help="The application directory path."), + workers: int = typer.Option( + None, + "--workers", + help="Number of worker processes. Defaults to the $WEB_CONCURRENCY environment" + " variable if available, or 1. Not valid with --reload.", + ), + log_level: int = typer.Option(None, "--log_level", help="Log level. [default: info]"), + limit_concurrency: int = typer.Option( + None, "--limit-concurrency", help="Maximum number of concurrent connections or tasks to allow, before issuing" + ), + limit_max_requests: int = typer.Option( + None, "--limit-max-requests", help="Maximum number of requests to service before terminating the process." + ), + timeout_keep_alive: int = typer.Option( + 15, "--timeout-keep-alive", help="Close Keep-Alive connections if no new data is received within this timeout." 
+ ), + reload: bool = typer.Option(False, "--reload", help="Reload the server when the app_dir is changed."), +): + """The main function for the staring the SimpleServer""" + logger.info("starting to PaddleNLP SimpleServer...") + if app_dir is None: + app_dir = str(Path(os.getcwd())) + # Flags of uvicorn + backend_kwargs = { + "host": host, + "port": port, + "log_config": LOGGING_CONFIG, + "log_level": log_level, + "workers": workers, + "limit_concurrency": limit_concurrency, + "limit_max_requests": limit_max_requests, + "timeout_keep_alive": timeout_keep_alive, + "app_dir": app_dir, + "reload": reload, + } + start_backend(app, **backend_kwargs) + + +@app.command( + help="install the target version of paddlenlp, eg: paddlenlp install / paddlenlp install paddlepaddle==latest" +) +def install( + package: str = typer.Argument(default="paddlenlp==latest", help="install the target version of paddlenlp") +): + """The main function for the staring the SimpleServer""" + package = package.replace(" ", "").strip() + + if not package: + raise ValueError("please assign the package name") + + # 1. parse the version of paddlenlp + splits = [item for item in package.split("==")] + if len(splits) == 0 or len(splits) > 2: + raise ValueError( + "please set the valid package: ==, eg: paddlenlp==latest, paddlenlp==3099, " + f"but received: {package}" + ) + + tag = "latest" + package_name = splits[0] + + # TODO(wj-Mcat): will support `pipelines`, `ppdiffusers` later. + assert package_name in ["paddlenlp"], "we only support paddlenlp" + + if len(splits) == 2: + tag = splits[1] + + # 2. download & install package from bos server + install_package_from_bos(package_name=package_name, tag=tag) + + +def main(): + """the PaddleNLPCLI entry""" + app() + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/server.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/server.py new file mode 100644 index 000000000..68805d287 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/server.py @@ -0,0 +1,26 @@ +# coding:utf-8 +# copyright (c) 2022 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +import uvicorn + +from ..utils.log import logger + + +def start_backend(app, **kwargs): + logger.info("The PaddleNLP SimpleServer is starting, backend component uvicorn arguments as follows:") + for key, value in kwargs.items(): + if key != "log_config": + logger.info(" the starting argument [{}]={}".format(key, value)) + uvicorn.run(app, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/tabulate.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/tabulate.py new file mode 100644 index 000000000..01e0e28cc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/cli/utils/tabulate.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Dict, Union, Optional, Type +from rich.console import Console +from rich.theme import Theme +from rich.markdown import Markdown +from rich.table import Table +from rich.highlighter import RegexHighlighter + + +def _get_highlighter(word: str) -> Type[RegexHighlighter]: + """construct Regex Highlighter class based on the word + + Args: + word (str): the query word + + Returns: + Type[RegexHighlighter]: the sub-class of RegexHighlighter + """ + + class KeywordHighlighter(RegexHighlighter): + base_style = "paddlenlp." + highlights = [f"(?P{word})"] + + return KeywordHighlighter() + + +def print_example_code(): + # 1. define the console + console = Console() + markdown = """ +## you can download the above model with the following command: + +### ***paddlenlp download --cache-dir ./paddle_pretrained_models *** + +### ***the is copied from above table*** + """ + console.print(Markdown(markdown)) + + +def tabulate( + tables: List[Union[List[str], Dict[str, str]]], + headers: Optional[List[str]] = None, + highlight_word: Optional[str] = None, +): + """print tabulate data into console + + Args: + tables (List[Union[List[str], Dict[str, str]]]): the table instance data + headers (Optional[List[str]], optional): the header configuration. Defaults to None. + highlight_word (Optional[str], optional): the highlight word. Defaults to None. + """ + # 1. define the console + theme = Theme({"paddlenlp.keyword": "bold magenta"}) + console = Console(highlighter=_get_highlighter(highlight_word), theme=theme) + table_instance = Table( + title="PaddleNLP 模型检索结果", show_header=headers is not None, header_style="bold magenta", highlight=True + ) + + # 2. add column + headers = headers or [] + for header in headers: + if isinstance(header, str): + table_instance.add_column(header) + else: + table_instance.add_column(**header) + + # 3. 
add row data + for row_data in tables: + table_instance.add_row(*row_data) + + console.print(table_instance, justify="center") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/__init__.py new file mode 100644 index 000000000..cdd868fe4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .blendable_dataset import * +from .causal_dataset import * +from .collate import * +from .data_collator import * +from .dist_dataloader import * +from .sampler import * +from .tokenizer import * +from .vocab import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/blendable_dataset.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/blendable_dataset.py new file mode 100644 index 000000000..c84eb4038 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/blendable_dataset.py @@ -0,0 +1,173 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import os +import time + +import numpy as np +import paddle + +local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + + +def print_rank_0(*args, **kwargs): + if paddle.distributed.get_rank() == 0: + print(*args, **kwargs) + + +class BlendableDataset(paddle.io.Dataset): + def __init__(self, datasets, weights, size, share_folder, *, data_cache_path=None): + + self.datasets = datasets + num_datasets = len(datasets) + assert num_datasets == len(weights) + + self.size = size + + # Normalize weights. + weights = np.array(weights, dtype=np.float64) + sum_weights = np.sum(weights) + assert sum_weights > 0.0 + weights /= sum_weights + + # Build indicies. 
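+        # The two arrays produced below describe the blend sample by sample:
+        # dataset_index[i] records which source dataset global sample i comes from,
+        # and dataset_sample_index[i] is the position inside that dataset (this is
+        # exactly how __getitem__ resolves an index). For example, with two datasets
+        # and normalized weights [0.75, 0.25], roughly three quarters of the `size`
+        # samples end up with dataset_index == 0.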
+ def _build_indices(): + start_time = time.time() + assert num_datasets < 255 + dataset_index = np.zeros(self.size, dtype=np.uint8) + dataset_sample_index = np.zeros(self.size, dtype=np.int64) + + from tool_helpers import helpers + + helpers.build_blending_indices( + dataset_index, + dataset_sample_index, + weights, + num_datasets, + self.size, + local_rank == 0, + # paddle.distributed.get_rank() == 0, + ) + print_rank_0( + "> elapsed time for building blendable dataset indices: " + "{:.2f} (sec)".format(time.time() - start_time) + ) + return dataset_index, dataset_sample_index + + desc = "Blendable dataset\n\n" + desc += "Datasets:\n" + for dataset in datasets: + desc += dataset.desc + "\n\n" + desc += f"Weights: {weights}\n" + desc += f"Size: {size}\n" + self.desc = desc + + if data_cache_path: + desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest() + desc_path = os.path.join(data_cache_path, desc_hash + ".dsc") + index_path = os.path.join(data_cache_path, desc_hash + "_index.npy") + sample_index_path = os.path.join(data_cache_path, desc_hash + "_sample_index.npy") + cache_hit = os.path.isfile(index_path) and os.path.isfile(sample_index_path) + # cache_success = True + # if paddle.distributed.get_rank() == 0 and not cache_hit: + check_rank_flag = not cache_hit and local_rank == 0 + if share_folder: + check_rank_flag = not cache_hit and paddle.distributed.get_rank() == 0 + + print( + f"searching for blendable dataset, cache_hit={cache_hit}, share_folder {share_folder}, check_rank_flag {check_rank_flag}", + flush=True, + ) + if check_rank_flag: + print( + " > WARNING: could not find index map files for blendable" + " dataset, building indices on rank 0 ...", + flush=True, + ) + dataset_index, dataset_sample_index = _build_indices() + try: + os.makedirs(os.path.dirname(index_path), exist_ok=True) + with open(desc_path, "wt") as fd: + fd.write(desc) + np.save(index_path, dataset_index, allow_pickle=True) + np.save(sample_index_path, dataset_sample_index, allow_pickle=True) + except OSError: + print(f"There was an error trying to create the data cache directory ({data_cache_path})") + print("or a file in it. This is set with the --data-cache-path argument. Please") + print("ensure you have write access to this directory or specify one that you do have") + print("write access to.") + # cache_success = False + + # hcg = paddle.distributed.fleet.get_hybrid_communicate_group() + + # counts = paddle.to_tensor([cache_success], dtype="int64") + # paddle.distributed.all_reduce(counts, group=hcg.get_data_parallel_group()) + # paddle.distributed.all_reduce(counts, group=hcg.get_pipeline_model_parallel_group()) + # if counts[0].item() != ( + # paddle.distributed.get_world_size() + # // paddle.distributed.get_world_size(group=hcg.get_tensor_model_parallel_group()) + # ): + # print_rank_0("Data index creation unsuccessful, exiting.") + # exit() + + else: + while True: + if (not os.path.isfile(index_path)) or (not os.path.isfile(sample_index_path)): + print("building indices on rank 0 ...", flush=True) + time.sleep(3) + else: + try: + np.load(index_path, allow_pickle=True, mmap_mode="r") + print("build success", flush=True) + break + except Exception: + print("%s file is still writing or damaged, please wait for a moment." % index_path) + time.sleep(3) + + # paddle.distributed.barrier() + # Load on all ranks. 
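            # Cache files are keyed by the md5 of the description string built
            # above: <hash>.dsc, <hash>_index.npy and <hash>_sample_index.npy,
            # so changing the datasets, weights or size produces a fresh cache
            # entry. mmap_mode="r" below memory-maps the arrays read-only
            # instead of loading them fully into RAM.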
+ print_rank_0(f"> loading blendable dataset index: {index_path}") + self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode="r") + assert self.dataset_index.size == self.size + + print_rank_0(f"> loading blendable dataset sample index: {sample_index_path}") + self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode="r") + assert self.dataset_sample_index.size == self.size + else: + print_rank_0( + "building indices for the blendable dataset, Since --data_cache is not specified, the index file will not be stored.", + flush=True, + ) + self.dataset_index, self.dataset_sample_index = _build_indices() + + # Check size + _ = self.__getitem__(self.size - 1) + try: + _ = self.__getitem__(self.size) + raise RuntimeError("BlendedDataset size is improperly bounded") + except IndexError: + pass + print_rank_0("> size of blendable dataset: " "{} samples".format(self.size)) + + def __len__(self): + return self.size + + def __getitem__(self, idx): + dataset_idx = self.dataset_index[idx] + sample_idx = self.dataset_sample_index[idx] + return { + "dataset_idx": dataset_idx, + **self.datasets[dataset_idx][sample_idx], + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/causal_dataset.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/causal_dataset.py new file mode 100644 index 000000000..9d7050620 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/causal_dataset.py @@ -0,0 +1,711 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + +"""GPT style dataset.""" +import hashlib +import math +import os +import time + +import numpy as np +import paddle + +from paddlenlp.data.blendable_dataset import BlendableDataset +from paddlenlp.data.indexed_dataset import make_dataset as make_indexed_dataset + +local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + + +# class FakeHCG: +# def get_data_parallel_group(self): +# return None + +# def get_pipe_parallel_group(self): +# return None + +# def get_model_parallel_group(self): +# return None + + +def check_data_split(splits_string, do_train, do_eval, do_predict): + splits = [] + if splits_string.find(",") != -1: + splits = [float(s) for s in splits_string.split(",")] + elif splits_string.find("/") != -1: + splits = [float(s) for s in splits_string.split("/")] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.0) + splits = splits[:3] + splits_sum = sum(splits) + data_flag = True + assert splits_sum > 0.0, "sum of splits should larger than 0.0!" 
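    # Illustrative parses: "949,50,1" and "949/50/1" both yield
    # [949.0, 50.0, 1.0], while a single value such as "100" becomes
    # [100.0, 0.0, 0.0]. A zero split for an enabled phase is rejected below.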
+ if (do_train and splits[0] == 0) or (do_eval and splits[1] == 0) or (do_predict and splits[2] == 0): + data_flag = False + if not data_flag: + raise ValueError("If do_train/do_eval/do_predict is True, the corresponding dataset split should not be 0!") + + +def get_train_valid_test_split_(splits_string, size): + """Get dataset splits from comma or '/' separated string list.""" + + splits = [] + if splits_string.find(",") != -1: + splits = [float(s) for s in splits_string.split(",")] + elif splits_string.find("/") != -1: + splits = [float(s) for s in splits_string.split("/")] + else: + splits = [float(splits_string)] + while len(splits) < 3: + splits.append(0.0) + splits = splits[:3] + splits_sum = sum(splits) + assert splits_sum > 0.0 + splits = [split / splits_sum for split in splits] + splits_index = [0] + for index, split in enumerate(splits): + splits_index.append(splits_index[index] + int(round(split * float(size)))) + diff = splits_index[-1] - size + for index in range(1, len(splits_index)): + splits_index[index] -= diff + assert len(splits_index) == 4 + assert splits_index[-1] == size + return splits_index + + +def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples): + + # The data prefix should be in the format of: + # weight-1, data-prefix-1, weight-2, data-prefix-2, .. + assert len(data_prefix) % 2 == 0 + num_datasets = len(data_prefix) // 2 + weights = [0] * num_datasets + prefixes = [0] * num_datasets + for i in range(num_datasets): + weights[i] = float(data_prefix[2 * i]) + prefixes[i] = (data_prefix[2 * i + 1]).strip() + # Normalize weights + weight_sum = 0.0 + for weight in weights: + weight_sum += weight + assert weight_sum > 0.0 + weights = [weight / weight_sum for weight in weights] + + # Add 0.5% (the 1.005 factor) so in case the bleding dataset does + # not uniformly distribute the number of samples, we still have + # samples left to feed to the network. + # (NOTE, yujun06): This is a workaround to avoid issues with indexing in the blending dataset. Therefore, we need to add 20 samples to each dataset. + datasets_train_valid_test_num_samples = [] + for weight in weights: + datasets_train_valid_test_num_samples.append( + [int(math.ceil(val * weight * 1.005)) + 20 for val in train_val_test_num_samples] + ) + + return prefixes, weights, datasets_train_valid_test_num_samples + + +def print_rank_0(*args, **kwargs): + if paddle.distributed.get_rank() == 0: + print(*args, **kwargs) + + +def build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_val_test_num_samples, + seq_length, + seed, + skip_warmup, + train_data_prefix=None, + valid_data_prefix=None, + test_data_prefix=None, + return_doc_ids=False, + share_folder=False, + *, + data_cache_path=None, + need_data=True, +): + """Build train, valid, and test datasets.""" + + # Single dataset. + if len(data_prefix) == 1: + return _build_train_valid_test_datasets( + data_prefix[0], + data_impl, + splits_string, + train_val_test_num_samples, + seq_length, + seed, + skip_warmup, + share_folder=share_folder, + data_cache_path=data_cache_path, + need_data=need_data, + ) + + # Blending dataset. + # Parse the values. + output = get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples) + prefixes, weights, datasets_train_valid_test_num_samples = output + # NOTE: megatron/gpt_dataset.py has been updated. When creating BlendableDataset, we will use the raw train_val_test_num_samples instead of the expanded ones. 
+ # Please refer to https://github.com/NVIDIA/NeMo/blob/72f630d087d45655b1a069dc72debf01dfdbdb2d/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py#L74-L80 for more information + train_num_samples, valid_num_samples, test_num_samples = train_val_test_num_samples + + # Build individual datasets. + train_datasets = [] + valid_datasets = [] + test_datasets = [] + for i in range(len(prefixes)): + train_ds, valid_ds, test_ds = _build_train_valid_test_datasets( + prefixes[i], + data_impl, + splits_string, + datasets_train_valid_test_num_samples[i], + seq_length, + seed, + skip_warmup, + return_doc_ids, + share_folder=share_folder, + data_cache_path=data_cache_path, + need_data=need_data, + ) + if train_ds: + train_datasets.append(train_ds) + if valid_ds: + valid_datasets.append(valid_ds) + if test_ds: + test_datasets.append(test_ds) + + blending_train_dataset = None + if train_datasets: + blending_train_dataset = BlendableDataset( + train_datasets, weights, train_num_samples, share_folder, data_cache_path=data_cache_path + ) + blending_valid_dataset = None + if valid_datasets: + blending_valid_dataset = BlendableDataset( + valid_datasets, weights, valid_num_samples, share_folder, data_cache_path=data_cache_path + ) + blending_test_dataset = None + if test_datasets: + blending_test_dataset = BlendableDataset( + test_datasets, + weights, + test_num_samples, + share_folder, + data_cache_path=data_cache_path, + ) + + return (blending_train_dataset, blending_valid_dataset, blending_test_dataset) + + +def _build_train_valid_test_datasets( + data_prefix, + data_impl, + splits_string, + train_val_test_num_samples, + seq_length, + seed, + skip_warmup, + return_doc_ids=False, + share_folder=False, + *, + data_cache_path=None, + need_data=True, +): + """Build train, valid, and test datasets.""" + + # Indexed dataset. + if need_data: + indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup) + + total_num_of_documents = indexed_dataset.sizes.shape[0] + splits = get_train_valid_test_split_(splits_string, total_num_of_documents) + + # Print stats about the splits. 
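        # Illustrative result: splits_string "949,50,1" with 1000 documents
        # gives splits = [0, 949, 999, 1000], i.e. 949 train, 50 validation
        # and 1 test document.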
+ print_rank_0(" > dataset split:") + + def print_split_stats(name, index): + print_rank_0(" {}:".format(name)) + print_rank_0( + " document indices in [{}, {}) total of {} " + "documents".format(splits[index], splits[index + 1], splits[index + 1] - splits[index]) + ) + + print_split_stats("train", 0) + print_split_stats("validation", 1) + print_split_stats("test", 2) + + if paddle.distributed.get_world_size() > 1: + paddle.distributed.barrier() + + def build_dataset(index, name): + documents = np.arange(splits[index], splits[index + 1], 1, np.int32) if need_data else None + dataset = GPTDataset( + name, + data_prefix, + documents, + indexed_dataset if need_data else None, + splits_string, + train_val_test_num_samples[index], + seq_length, + seed, + return_doc_ids, + share_folder, + data_cache_path=data_cache_path, + need_data=need_data, + ) + if need_data: + return dataset if splits[index + 1] > splits[index] else None + else: + return None + + train_dataset = build_dataset(0, "train") + valid_dataset = build_dataset(1, "valid") + test_dataset = build_dataset(2, "test") + + return (train_dataset, valid_dataset, test_dataset) + + +def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): + """Build indexed dataset.""" + print_rank_0(" > building dataset index ...") + + start_time = time.time() + indexed_dataset = make_indexed_dataset(data_prefix, data_impl, skip_warmup) + print_rank_0(" > finished creating indexed dataset in {:4f} " "seconds".format(time.time() - start_time)) + print_rank_0(" number of documents: {}".format(indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +class GPTDataset(paddle.io.Dataset): + def __init__( + self, + name, + data_prefix, + documents, + indexed_dataset, + splits_string, + num_samples, + seq_length, + seed, + return_doc_ids=False, + share_folder=False, + *, + data_cache_path=None, + need_data=True, + ): + + self.name = name + self.indexed_dataset = indexed_dataset + self.return_doc_ids = return_doc_ids + + # Build index mappings. + if need_data and len(documents) > 0: + assert np.min(documents) >= 0 + assert np.max(documents) < indexed_dataset.sizes.shape[0] + + ( + doc_idx_filename, + sample_idx_filename, + shuffle_idx_filename, + self.desc, + self.desc_hash, + num_epochs, + ) = _build_index_mappings( + self.name, + data_prefix, + documents, + self.indexed_dataset.sizes, + splits_string, + num_samples, + seq_length, + seed, + share_folder, + data_cache_path=data_cache_path, + ) + + if paddle.distributed.get_world_size() > 1: + paddle.distributed.barrier() + + # Load mappings. 
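        # The three .npy files below were written (or waited for) by
        # _build_index_mappings on the index-building rank; every rank now
        # memory-maps them read-only.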
+ if need_data and len(documents) > 0: + start_time = time.time() + print_rank_0(f" > loading doc-idx mapping from {doc_idx_filename}") + self.doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode="r") + + print_rank_0(f" > loading sample-idx mapping from {sample_idx_filename}") + self.sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode="r") + + print_rank_0(f" > loading shuffle-idx mapping from {shuffle_idx_filename}") + self.shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode="r") + + print_rank_0(" loaded indexed file in {:3.3f} seconds".format(time.time() - start_time)) + print_rank_0(" total number of samples: {}".format(self.sample_idx.shape[0])) + print_rank_0(" total number of epochs: {}".format(num_epochs)) + + if paddle.distributed.get_world_size() > 1: + paddle.distributed.barrier() + + def __len__(self): + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + return self.sample_idx.shape[0] - 1 + + def __getitem__(self, idx): + # Get the shuffled index. + idx = self.shuffle_idx[idx] + # Start and end documents and offsets. + doc_index_f = self.sample_idx[idx][0] + doc_index_l = self.sample_idx[idx + 1][0] + offset_f = self.sample_idx[idx][1] + offset_l = self.sample_idx[idx + 1][1] + # If we are within the same document, just extract the chunk. + doc_ids = [] + if doc_index_f == doc_index_l: + doc_ids.append(self.doc_idx[doc_index_f]) + + sample, mask = self.indexed_dataset.get( + self.doc_idx[doc_index_f], offset=offset_f, length=offset_l - offset_f + 1 + ) + else: + # Otherwise, get the rest of the initial document. + doc_ids.append(self.doc_idx[doc_index_f]) + sample, mask = self.indexed_dataset.get(self.doc_idx[doc_index_f], offset=offset_f) + append_mask = True + if mask is None: + append_mask = False + + sample_list = [sample] + mask_list = [] + mask_list = [mask] + # Loop over all in between documents and add the entire document. + for i in range(doc_index_f + 1, doc_index_l): + doc_ids.append(self.doc_idx[i]) + sample, mask = self.indexed_dataset.get(self.doc_idx[i]) + sample_list.append(sample) + if append_mask: + mask_list.append(mask) + + # And finally add the relevant portion of last document. + doc_ids.append(self.doc_idx[doc_index_l]) + sample, mask = self.indexed_dataset.get(self.doc_idx[doc_index_l], length=offset_l + 1) + sample_list.append(sample) + if append_mask: + mask_list.append(mask) + sample = np.concatenate(sample_list) + if append_mask: + mask = np.concatenate(mask_list) + # print(sample) + if self.return_doc_ids: # for retro preprocessing + if mask is None: + return {"text": np.array(sample, dtype=np.int64), "doc_ids": np.array(doc_ids, dtype=np.int64)} + else: + return { + "text": np.array(sample, dtype=np.int64), + "doc_ids": np.array(doc_ids, dtype=np.int64), + "mask": np.array(mask, dtype=np.int64), + } + else: + if mask is None: + return {"text": np.array(sample, dtype=np.int64)} + else: + return {"text": np.array(sample, dtype=np.int64), "mask": np.array(mask, dtype=np.int64)} + + +def _build_index_mappings( + name, data_prefix, documents, sizes, splits_string, num_samples, seq_length, seed, share_folder, *, data_cache_path +): + """Build doc-idx, sample-idx, and shuffle-idx. + doc-idx: is an array (ordered) of documents to be used in training. + sample-idx: is the start document index and document offset for each + training sample. + shuffle-idx: maps the sample index into a random index into sample-idx. 
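    An illustrative case: with seq_length = 4 every sample consumes 5 tokens,
    so for doc-idx = [0, 1, 2] and document sizes [4, 5, 3], sample 0 is
    recorded as running from (doc-idx position 0, offset 0) up to
    (doc-idx position 1, offset 0): all 4 tokens of the first document plus
    the first token of the second.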
+ """ + + # Number of tokens in each epoch and number of required epochs. + tokens_per_epoch = _num_tokens(documents, sizes) + num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples) + + # rng state + np_rng = np.random.RandomState(seed=seed) + # Filename of the index mappings. + desc = "GPT Dataset\n\n" + desc += f"Data prefix {data_prefix}\n" + desc += f"Dataset name {name}\n" + desc += f"Number of samples {num_samples}\n" + desc += f"Sequence length {seq_length}\n" + desc += f"Random seed {seed}\n" + desc += f"Split {splits_string}\n" + desc_hash = hashlib.md5(desc.encode("utf-8")).hexdigest() + desc_filename = desc_hash + ".dsc" + doc_idx_filename = desc_hash + "_doc_idx.npy" + sample_idx_filename = desc_hash + "_sample_idx.npy" + shuffle_idx_filename = desc_hash + "_shuffle_idx.npy" + + # Look for cache in main data dir first to avoid unnecessary + # duplication, then look in data-cache-path if specified, + # If nothing is found, use the last path looked in + build_indices = True + prefixes = [os.path.join(os.path.dirname(data_prefix), "index-cache")] + if data_cache_path is not None: + prefixes.append(data_cache_path) + for prefix in prefixes: + idx_path = { + "desc": os.path.join(prefix, desc_filename), + "doc": os.path.join(prefix, doc_idx_filename), + "sample": os.path.join(prefix, sample_idx_filename), + "shuffle": os.path.join(prefix, shuffle_idx_filename), + } + for f in idx_path.values(): + if not os.path.isfile(f): + break + else: + # Found our files! + build_indices = False + break + data_cache_dir = os.path.dirname(idx_path["desc"]) + # data_cache_success = True + # Build the indexed mapping if not exist. + check_rank_flag = build_indices and local_rank == 0 + if share_folder: + check_rank_flag = build_indices and paddle.distributed.get_rank() == 0 + + # if build_indices and paddle.distributed.get_rank() == 0: + + print( + f"searching for causual dataset, build_indices={build_indices}, share_folder {share_folder}, check_rank_flag {check_rank_flag}", + flush=True, + ) + if check_rank_flag: + print_rank_0(" > WARNING: could not find index map files, building " "the indices on rank 0 ...") + + # For the last epoch, decide whether include the entire epoch + # in the global shuffle or not. + + # If we need only one epoch, then separating last epoch does + # not mean anything. + if num_epochs == 1: + separate_last_epoch = False + print(" > only one epoch required, setting " "separate_last_epoch to False", flush=True) + + else: + # Get the number of samples for the last epoch + num_samples_from_epochs_minus_one = ((num_epochs - 1) * tokens_per_epoch - 1) // seq_length + last_epoch_num_samples = num_samples - num_samples_from_epochs_minus_one + assert last_epoch_num_samples >= 0, "last epoch number of samples should be non-negative." + num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length + assert last_epoch_num_samples <= ( + num_samples_per_epoch + 1 + ), "last epoch number of samples exceeded max value." + # If we have less than 80% of the samples for the last epoch, + # seperate out the epoch and treat it differently. + # Note: the 80% number is just based on common sense and can + # be adjusted if needed. 
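            # Worked example (illustrative numbers): tokens_per_epoch = 102_401
            # and seq_length = 1024 give num_samples_per_epoch = 100; with
            # num_epochs = 3 and num_samples = 230 the first two epochs cover
            # 200 samples, so the last epoch only needs 30 < 80 samples and
            # separate_last_epoch ends up True.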
+ separate_last_epoch = last_epoch_num_samples < int(0.80 * num_samples_per_epoch) + if separate_last_epoch: + string = ( + " > last epoch number of samples ({}) is smaller " + "than 80% of number of samples per epoch ({}), " + "setting separate_last_epoch to True" + ) + else: + string = ( + " > last epoch number of samples ({}) is larger " + "than 80% of number of samples per epoch ({}), " + "setting separate_last_epoch to False" + ) + print(string.format(last_epoch_num_samples, num_samples_per_epoch), flush=True) + + try: + os.makedirs(data_cache_dir, exist_ok=True) + + # description + with open(idx_path["desc"], "wt") as fd: + fd.write(desc) + + # doc-idx. + start_time = time.time() + doc_idx = _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch) + np.save(idx_path["doc"], doc_idx, allow_pickle=True) + print_rank_0( + " > elasped time to build and save doc-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) + # sample-idx. + start_time = time.time() + # Use C++ implementation for speed. + # First compile and then import. + # from megatron.data import helpers + from tool_helpers import helpers + + assert doc_idx.dtype == np.int32 + assert sizes.dtype == np.int32 + sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch) + np.save(idx_path["sample"], sample_idx, allow_pickle=True) + print_rank_0( + " > elasped time to build and save sample-idx mapping " + "(seconds): {:4f}".format(time.time() - start_time) + ) + # shuffle-idx. + start_time = time.time() + # -1 is due to data structure used to retieve the index: + # sample i --> [sample_idx[i], sample_idx[i+1]) + if separate_last_epoch: + num_samples_ = num_samples_from_epochs_minus_one + else: + num_samples_ = sample_idx.shape[0] - 1 + shuffle_idx = _build_shuffle_idx(num_samples_, sample_idx.shape[0] - 1, np_rng) + np.save(idx_path["shuffle"], shuffle_idx, allow_pickle=True) + print_rank_0( + " > elasped time to build and save shuffle-idx mapping" + " (seconds): {:4f}".format(time.time() - start_time) + ) + except OSError: + print(f"There was an error trying to create the data cache directory ({data_cache_dir})") + print('or a file in it. This defaults to a directory "index-cache" within the directory') + print("the data files are in and can be set with the --data-cache-path argument. Please") + print("ensure you have write access to this directory or specify one that you do have") + print("write access to.") + # data_cache_success = False + else: + while True: + if ( + (not os.path.isfile(idx_path["doc"])) + or (not os.path.isfile(idx_path["sample"])) + or (not os.path.isfile(idx_path["shuffle"])) + ): + print("building indices on rank 0 ...", flush=True) + time.sleep(3) + else: + try: + np.load(idx_path["shuffle"], allow_pickle=True, mmap_mode="r") + print("build success", flush=True) + break + except Exception: + print("%s file is still writing or damaged, please wait for a moment." 
% idx_path["shuffle"]) + time.sleep(3) + # try: + # hcg = paddle.distributed.fleet.get_hybrid_communicate_group() + # except: + # hcg = FakeHCG() + + # counts = paddle.to_tensor([data_cache_success], dtype="int64") + # paddle.distributed.all_reduce(counts, group=hcg.get_data_parallel_group()) + # paddle.distributed.all_reduce(counts, group=hcg.get_pipe_parallel_group()) + # if counts[0].item() != ( + # paddle.distributed.get_world_size() // paddle.distributed.get_world_size(group=hcg.get_model_parallel_group()) + # ): + # print_rank_0("Data index creation unsuccessful, exiting.") + # exit() + # paddle.distributed.barrier() + + return idx_path["doc"], idx_path["sample"], idx_path["shuffle"], desc, desc_hash, num_epochs + + +def _num_tokens(documents, sizes): + """Total number of tokens in the dataset.""" + return np.sum(sizes[documents]) + + +def _num_epochs(tokens_per_epoch, seq_length, num_samples): + """Based on number of samples and sequence lenght, calculate how many + epochs will be needed.""" + num_epochs = 0 + total_tokens = 0 + while True: + num_epochs += 1 + total_tokens += tokens_per_epoch + # -1 is because we need to retrieve seq_length + 1 token each time + # but the last token will overlap with the first token of the next + # sample except for the last sample. + if ((total_tokens - 1) // seq_length) >= num_samples: + return num_epochs + + +def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch): + """Build an array with length = number-of-epochs * number-of-dcuments. + Each index is mapped to a corresponding document.""" + if not separate_last_epoch or num_epochs == 1: + doc_idx = np.mgrid[0:num_epochs, 0 : len(documents)][1] + doc_idx[:] = documents + doc_idx = doc_idx.reshape(-1) + doc_idx = doc_idx.astype(np.int32) + np_rng.shuffle(doc_idx) + return doc_idx + + doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False) + doc_idx_last = _build_doc_idx(documents, 1, np_rng, False) + return np.concatenate((doc_idx_first, doc_idx_last)) + + +def _build_sample_idx(sizes, doc_idx, seq_length, num_epochs, tokens_per_epoch): + """Sample index mapping is a 2D array with sizes + [number-of-samples + 1, 2] where [..., 0] contains + the index into `doc_idx` and [..., 1] is the + starting offset in that document.""" + + # Total number of samples. For -1 see comments in `_num_epochs`. + num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length + sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32) + + # Index into sample_idx. + sample_index = 0 + # Index into doc_idx. + doc_idx_index = 0 + # Begining offset for each document. + doc_offset = 0 + # Start with first document and no offset. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + while sample_index <= num_samples: + # Start with a fresh sequence. + remaining_seq_length = seq_length + 1 + while remaining_seq_length != 0: + # Get the document length. + doc_id = doc_idx[doc_idx_index] + doc_length = sizes[doc_id] - doc_offset + # And add it to the current sequence. + remaining_seq_length -= doc_length + # If we have more than a full sequence, adjust offset and set + # remaining length to zero so we return from the while loop. + # Note that -1 here is for the same reason we have -1 in + # `_num_epochs` calculations. + if remaining_seq_length <= 0: + doc_offset += remaining_seq_length + doc_length - 1 + remaining_seq_length = 0 + else: + # Otherwise, start from the begining of the next document. 
+ doc_idx_index += 1 + doc_offset = 0 + # Record the sequence. + sample_idx[sample_index][0] = doc_idx_index + sample_idx[sample_index][1] = doc_offset + sample_index += 1 + + return sample_idx + + +def _build_shuffle_idx(num_samples, total_size, np_rng): + """Build the range [0, size) and shuffle.""" + print( + " > building shuffle index with split [0, {}) and [{}, {}) " + "...".format(num_samples, num_samples, total_size), + flush=True, + ) + + dtype_ = np.uint32 + if total_size >= (np.iinfo(np.uint32).max - 1): + dtype_ = np.int64 + + shuffle_idx_first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_first) + if num_samples == total_size: + return shuffle_idx_first + + shuffle_idx_last = np.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_) + np_rng.shuffle(shuffle_idx_last) + + return np.concatenate((shuffle_idx_first, shuffle_idx_last)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/collate.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/collate.py new file mode 100644 index 000000000..c4305404a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/collate.py @@ -0,0 +1,321 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +__all__ = [ + "Stack", + "Pad", + "Tuple", + "Dict", +] + + +class Stack(object): + """ + Stacks the input data samples to construct the batch. The N input samples + must have the same shape/length and will be stacked to construct a batch. + + Args: + axis (int, optional): The axis in the result data along which the input + data are stacked. Default: 0. + dtype (str|numpy.dtype, optional): The value type of the output. If it + is set to None, the type of input data is used. Default: None. + """ + + def __init__(self, axis=0, dtype=None): + self._axis = axis + self._dtype = dtype + + def __call__(self, data): + """ + Batchifies the input data by stacking. + + Args: + data (list[numpy.ndarray]): The input data samples. It is a list. + Each element is a numpy.ndarray or list. + + Returns: + numpy.ndarray: Stacked batch data. + + + Example: + .. code-block:: python + + from paddlenlp.data import Stack + a = [1, 2, 3, 4] + b = [3, 4, 5, 6] + c = [5, 6, 7, 8] + result = Stack()([a, b, c]) + ''' + [[1, 2, 3, 4], + [3, 4, 5, 6], + [5, 6, 7, 8]] + ''' + """ + data = np.stack(data, axis=self._axis).astype(self._dtype) if self._dtype else np.stack(data, axis=self._axis) + return data + + +class Pad(object): + """ + Pads the input data samples to the largest length at `axis`. + + Args: + pad_val (float|int, optional): The padding value. Default: 0. + axis (int, optional): The axis to pad the arrays. The arrays will be + padded to the largest length at `axis`. For example, assume the + input arrays have shape (10, 8, 5), (6, 8, 5), (3, 8, 5) and the + axis is 0. 
Each input will be padded into (10, 8, 5) and then + stacked to form the final output, which has shape (3, 10, 8, 5). + Default: 0. + ret_length (bool|numpy.dtype, optional): If it is bool, indicate whether + to return the valid length in the output, and the data type of + returned length is int32 if True. If it is numpy.dtype, indicate the + data type of returned length. Default: None. + dtype (numpy.dtype, optional): The value type of the output. If it is + set to None, the input data type is used. Default: None. + pad_right (bool, optional): Whether the padding direction is right-side. + If True, it indicates we pad to the right side, while False indicates + we pad to the left side. Default: True. + """ + + def __init__(self, pad_val=0, axis=0, ret_length=None, dtype=None, pad_right=True): + self._pad_val = pad_val + self._axis = axis + self._ret_length = ret_length + self._dtype = dtype + self._pad_right = pad_right + + def __call__(self, data): + """ + Batchifies the input data by padding. The input will be padded to the + largest dimension at `axis` and then stacked to form the final output. + In addition, the function will output the original dimensions at the + `axis` if `ret_length` is not None or False. + + Args: + data (list[numpy.ndarray|list]): The input data samples. It is a + list. Each element is a numpy.ndarray or list. + + Returns: + numpy.ndarray|tuple[numpy.ndarray]: If `ret_length` is False, it + is a numpy.ndarray representing the padded batch data and the + shape is (N, …). Otherwise, it is a tuple, besides the padded batch + data, the tuple also includes a numpy.ndarray representing original + length at `axis` of all input samples, which shaped `(N,)`. + + Example: + .. code-block:: python + + from paddlenlp.data import Pad + a = [1, 2, 3, 4] + b = [5, 6, 7] + c = [8, 9] + result = Pad(pad_val=0)([a, b, c]) + ''' + [[1, 2, 3, 4], + [5, 6, 7, 0], + [8, 9, 0, 0]] + ''' + """ + + # return data itself for rare unexpected cases when 1-D array is passed to Pad + if not isinstance(data[0], list) and not isinstance(data[0], np.ndarray): + return np.asarray(data, dtype=self._dtype if self._dtype is not None else np.int64) + + arrs = [np.asarray(ele) for ele in data] + original_length = [ele.shape[self._axis] for ele in arrs] + max_size = max(original_length) + ret_shape = list(arrs[0].shape) + ret_shape[self._axis] = max_size + ret_shape = (len(arrs),) + tuple(ret_shape) + ret = np.full( + shape=ret_shape, fill_value=self._pad_val, dtype=arrs[0].dtype if self._dtype is None else self._dtype + ) + for i, arr in enumerate(arrs): + if arr.shape[self._axis] == max_size: + ret[i] = arr + else: + slices = [slice(None) for _ in range(arr.ndim)] + if self._pad_right: + slices[self._axis] = slice(0, arr.shape[self._axis]) + else: + slices[self._axis] = slice(max_size - arr.shape[self._axis], max_size) + + if slices[self._axis].start != slices[self._axis].stop: + slices = [slice(i, i + 1)] + slices + ret[tuple(slices)] = arr + if self._ret_length: + return ret, np.asarray(original_length, dtype="int32") if self._ret_length else np.asarray( + original_length, self._ret_length + ) + else: + return ret + + +class Tuple(object): + """ + Wraps multiple batchify functions together. The input functions will be applied + to the corresponding input fields. + + Each sample should be a list or tuple containing multiple fields. The i'th + batchify function stored in Tuple will be applied on the i'th field. 
+ + For example, when data sample is (nd_data, label), you can wrap two batchify + functions using `Tuple(DataBatchify, LabelBatchify)` to batchify nd_data and + label correspondingly. + + Args: + fn (callable|list[callable]|tuple[callable]): The batchify functions to + wrap. It is a callable function or a list/tuple of callable functions. + args (tuple[callable]): The additional batchify functions to wrap. + """ + + def __init__(self, fn, *args): + if isinstance(fn, (list, tuple)): + assert len(args) == 0, ( + "Input pattern not understood. The input of Tuple can be " + "Tuple(A, B, C) or Tuple([A, B, C]) or Tuple((A, B, C)). " + "Received fn=%s, args=%s" % (str(fn), str(args)) + ) + self._fn = fn + else: + self._fn = (fn,) + args + for i, ele_fn in enumerate(self._fn): + assert callable(ele_fn), "Batchify functions must be callable! type(fn[%d]) = %s" % (i, str(type(ele_fn))) + + def __call__(self, data): + """ + Batchifies data samples by applying each function on the corresponding + data field, and each data field is produced by stacking the field data + of samples. + + Args: + data (list|tuple): The samples to batchfy. Each sample in list/tuple + should contain `N` fields. + + Returns: + tuple: A tuple composed of results from all including batchifying + functions. + + Example: + .. code-block:: python + + from paddlenlp.data import Stack, Pad, Tuple + data = [ + [[1, 2, 3, 4], [1]], + [[5, 6, 7], [0]], + [[8, 9], [1]], + ] + batchify_fn = Tuple(Pad(pad_val=0), Stack()) + ids, label = batchify_fn(data) + ''' + ids: + [[1, 2, 3, 4], + [5, 6, 7, 0], + [8, 9, 0, 0]] + label: [[1], [0], [1]] + ''' + """ + + assert len(data[0]) == len( + self._fn + ), "The number of attributes in each data sample should contain" " {} elements".format(len(self._fn)) + ret = [] + for i, ele_fn in enumerate(self._fn): + result = ele_fn([ele[i] for ele in data]) + if isinstance(result, (tuple, list)): + ret.extend(result) + else: + ret.append(result) + return tuple(ret) + + +class Dict(object): + """ + Wraps multiple batchify functions together. The input functions will be + applied to the corresponding input fields. + + Each sample should be a dict containing multiple fields. Each batchify + function with key stored in `Dict` will be applied on the field which has + the same key. + + For example, when data sample is {'tokens': tokens, 'labels': labels}, you + can wrap two batchify functions using + `Dict({'tokens': DataBatchify, 'labels': LabelBatchify})` to batchify tokens + and labels correspondingly. + + Args: + fn (dict): The batchify functions to wrap. It is a dict, which values is + callable functions. + """ + + def __init__(self, fn): + assert isinstance(fn, (dict)), ( + "Input pattern not understood. The input of Dict must be a dict with key of input column name and value of collate_fn " + "Received fn=%s" % (str(fn)) + ) + + self._fn = fn + + for col_name, ele_fn in self._fn.items(): + assert callable(ele_fn), "Batchify functions must be callable! type(fn[%d]) = %s" % ( + col_name, + str(type(ele_fn)), + ) + + def __call__(self, data): + """ + Batchifies data samples by applying each function on the corresponding + data field, and each data field is produced by stacking the field data + with the same key as batchify functions of all samples. + + Args: + data (list[dict]|tuple[dict]): The samples to batchfy. Each sample + in list/tuple is a dict with `N` key-values. + + Returns: + tuple: A tuple composed of results from all including batchifying + functions. + + Example: + .. 
code-block:: python + + from paddlenlp.data import Stack, Pad, Dict + data = [ + {'labels':[1], 'token_ids':[1, 2, 3, 4]}, + {'labels':[0], 'token_ids':[5, 6, 7]}, + {'labels':[1], 'token_ids':[8, 9]}, + ] + batchify_fn = Dict({'token_ids':Pad(pad_val=0), 'labels':Stack()}) + ids, label = batchify_fn(data) + ''' + ids: + [[1, 2, 3, 4], + [5, 6, 7, 0], + [8, 9, 0, 0]] + label: [[1], [0], [1]] + ''' + """ + + ret = [] + for col_name, ele_fn in self._fn.items(): + result = ele_fn([ele[col_name] for ele in data]) + if isinstance(result, (tuple, list)): + ret.extend(result) + else: + ret.append(result) + return tuple(ret) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/data_collator.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/data_collator.py new file mode 100644 index 000000000..351c44867 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/data_collator.py @@ -0,0 +1,887 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import random +import warnings +from collections.abc import Mapping +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union + +import numpy as np +import paddle + +from ..transformers import BertTokenizer +from ..transformers.tokenizer_utils_base import ( + BatchEncoding, + PaddingStrategy, + PretrainedTokenizerBase, +) + +__all__ = [ + "DataCollatorWithPadding", + "default_data_collator", + "DataCollator", + "DefaultDataCollator", + "DataCollatorForTokenClassification", + "DataCollatorForSeq2Seq", + "DataCollatorForLanguageModeling", + "DataCollatorForWholeWordMask", +] + +InputDataClass = NewType("InputDataClass", Any) +""" +A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary +of PaddlePaddle tensors or NumPy arrays. +""" +DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]]) + + +class DataCollatorMixin: + def __call__(self, features, return_tensors=None): + if return_tensors is None: + return_tensors = self.return_tensors + if return_tensors == "pd": + return self.paddle_call(features) + elif return_tensors == "np": + return self.numpy_call(features) + else: + raise ValueError(f"Framework '{return_tensors}' not recognized!") + + +def default_data_collator(features: List[InputDataClass], return_tensors="pd") -> Dict[str, Any]: + """ + Very simple data collator that simply collates batches of dict-like objects and performs special handling for + potential keys named: + + - `label`: handles a single value (int or float) per object + - `label_ids`: handles a list of values per object + + Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs + to the model. See glue and ner for example of how it's useful. 
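    A minimal illustrative call (this collator does no padding, so the
    features are assumed to already have equal length):

        features = [
            {"input_ids": [1, 2, 3], "label": 0},
            {"input_ids": [4, 5, 6], "label": 1},
        ]
        batch = default_data_collator(features, return_tensors="np")
        # batch["input_ids"] -> [[1, 2, 3], [4, 5, 6]]
        # batch["labels"]    -> [0, 1]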
+ """ + + # In this function we'll make the assumption that all `features` in the batch + # have the same attributes. + # So we will look at the first element as a proxy for what attributes exist + # on the whole batch. + + if return_tensors == "pd": + return paddle_default_data_collator(features) + elif return_tensors == "np": + return numpy_default_data_collator(features) + + +def paddle_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + if not isinstance(features[0], (dict, BatchEncoding)): + features = [vars(f) for f in features] + first = features[0] + batch = {} + + # Special handling for labels. + # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if "label" in first and first["label"] is not None: + label = first["label"].item() if isinstance(first["label"], paddle.Tensor) else first["label"] + dtype = "int64" if isinstance(label, int) else "float32" + batch["labels"] = paddle.to_tensor([f["label"] for f in features], dtype=dtype) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], paddle.Tensor): + batch["labels"] = paddle.stack([f["label_ids"] for f in features]) + else: + dtype = "int64" if type(first["label_ids"][0]) is int or np.int32 or np.int64 else "float32" + batch["labels"] = paddle.to_tensor([f["label_ids"] for f in features], dtype=dtype) + + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. + for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + if isinstance(v, paddle.Tensor): + batch[k] = paddle.stack([f[k] for f in features]) + else: + batch[k] = paddle.to_tensor([f[k] for f in features]) + + return batch + + +def numpy_default_data_collator(features: List[InputDataClass]) -> Dict[str, Any]: + + if not isinstance(features[0], (dict, BatchEncoding)): + features = [vars(f) for f in features] + first = features[0] + batch = {} + + # Special handling for labels. + # Ensure that tensor is created with the correct type + # (it should be automatically the case, but let's make sure of it.) + if "label" in first and first["label"] is not None: + label = first["label"].item() if isinstance(first["label"], np.ndarray) else first["label"] + dtype = np.int64 if isinstance(label, int) else np.float32 + batch["labels"] = np.array([f["label"] for f in features], dtype=dtype) + elif "label_ids" in first and first["label_ids"] is not None: + if isinstance(first["label_ids"], np.ndarray): + batch["labels"] = np.stack([f["label_ids"] for f in features]) + else: + dtype = np.int64 if type(first["label_ids"][0]) is int or np.int32 or np.int64 else np.float32 + batch["labels"] = np.array([f["label_ids"] for f in features], dtype=dtype) + + # Handling of all other possible keys. + # Again, we will use the first element to figure out which key/values are not None for this model. 
+ for k, v in first.items(): + if k not in ("label", "label_ids") and v is not None and not isinstance(v, str): + if isinstance(v, np.ndarray): + batch[k] = np.stack([f[k] for f in features]) + else: + batch[k] = np.array([f[k] for f in features]) + + return batch + + +@dataclass +class DefaultDataCollator(DataCollatorMixin): + """ + Very simple data collator that simply collates batches of dict-like objects and performs special handling for + potential keys named: + - `label`: handles a single value (int or float) per object + - `label_ids`: handles a list of values per object + Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs + to the model. See glue and ner for example of how it's useful. + This is an object (like other data collators) rather than a pure function like default_data_collator. This can be + helpful if you need to set a return_tensors value at initialization. + Args: + return_tensors (`bool`): + Return Tensor or numpy array. + """ + + return_tensors: str = "pd" + + def __call__(self, features: List[Dict[str, Any]], return_tensors=None) -> Dict[str, Any]: + if return_tensors is None: + return_tensors = self.return_tensors + return default_data_collator(features, return_tensors) + + +@dataclass +class DataCollatorWithPadding: + """ + Data collator that will dynamically pad the inputs to the longest sequence in the batch. + + Args: + tokenizer (`paddlenlp.transformers.PretrainedTokenizer`): + The tokenizer used for encoding the data. + """ + + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pd" + return_attention_mask: Optional[bool] = None + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + batch = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + return_attention_mask=self.return_attention_mask, + ) + if "label" in batch: + batch["labels"] = batch["label"] + del batch["label"] + if "label_ids" in batch: + batch["labels"] = batch["label_ids"] + del batch["label_ids"] + # To fix windows bug for paddle inference dtype error + # InvalidArgumentError: The type of data we are trying to retrieve does not match the type of data currently contained in the container + if self.return_tensors == "np": + batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()} + return batch + + +@dataclass +class DataCollatorForTokenClassification(DataCollatorMixin): + """ + Data collator that will dynamically pad the inputs received, as well as the labels. + + Args: + tokenizer ([`PretrainedTokenizer`] or [`PretrainedFasterTokenizer`]): + The tokenizer used for encoding the data. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence + is provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). 
+ max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (`int`, *optional*, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignore by PaddlePaddle loss functions). + return_tensors (`str`): + The type of Tensor to return. Allowable values are "np", "pt" and "tf". + """ + + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + label_pad_token_id: int = -100 + return_tensors: str = "pd" + + def paddle_call(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + no_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features] + + batch = self.tokenizer.pad( + no_labels_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + # Conversion to tensors will fail if we have labels as they are not of the same length yet. + return_tensors="pd" if labels is None else None, + ) + + if labels is None: + return batch + + sequence_length = paddle.to_tensor(batch["input_ids"]).shape[1] + padding_side = self.tokenizer.padding_side + + def to_list(tensor_or_iterable): + if isinstance(tensor_or_iterable, paddle.Tensor): + return tensor_or_iterable.tolist() + return list(tensor_or_iterable) + + if padding_side == "right": + batch[label_name] = [ + to_list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels + ] + else: + batch[label_name] = [ + [self.label_pad_token_id] * (sequence_length - len(label)) + to_list(label) for label in labels + ] + + batch = {k: paddle.to_tensor(v, dtype="int64") for k, v in batch.items()} + return batch + + def numpy_call(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None + batch = self.tokenizer.pad( + features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + # Conversion to tensors will fail if we have labels as they are not of the same length yet. + return_tensors="np" if labels is None else None, + ) + + if labels is None: + return batch + + sequence_length = np.array(batch["input_ids"]).shape[1] + padding_side = self.tokenizer.padding_side + if padding_side == "right": + batch["labels"] = [ + list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels + ] + else: + batch["labels"] = [ + [self.label_pad_token_id] * (sequence_length - len(label)) + list(label) for label in labels + ] + + batch = {k: np.array(v, dtype=np.int64) for k, v in batch.items()} + return batch + + +@dataclass +class DataCollatorForSeq2Seq: + """ + Data collator that will dynamically pad the inputs received, as well as the labels. + + Args: + tokenizer ([`PretrainedTokenizer`] or [`PretrainedFasterTokenizer`]): + The tokenizer used for encoding the data. + model ([`PreTrainedModel`]): + The model that is being trained. 
If set and has the *prepare_decoder_input_ids_from_labels*, use it to + prepare the *decoder_input_ids* + + This is useful when using *label_smoothing* to avoid calculating loss twice. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence + is provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (`int`, *optional*, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignored by PaddlePaddle loss functions). + return_tensors (`str`): + The type of Tensor to return. Allowable values are "np", "pt" and "tf". + max_label_length (`int`, *optional*, Pad label to max_label_length. defaults to `None`): + """ + + tokenizer: PretrainedTokenizerBase + model: Optional[Any] = None + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + label_pad_token_id: int = -100 + return_tensors: str = "pd" + return_attention_mask: Optional[bool] = None + max_label_length: Optional[int] = None + + def __call__(self, features, return_tensors=None): + # Deep copy to avoid modifying features in-place + batch = copy.deepcopy(features) + if return_tensors is None: + return_tensors = self.return_tensors + labels = [feature["labels"] for feature in batch] if "labels" in batch[0].keys() else None + use_attn_mask_startend_row_indices = ( + [feature["attn_mask_startend_row_indices"] for feature in batch] + if "attn_mask_startend_row_indices" in batch[0].keys() + else None + ) + # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the + # same length to return tensors. 
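        # Illustrative: with label_pad_token_id = -100, padding_side = "right"
        # and an effective max label length of 4, labels [[5, 6, 7], [8]]
        # become [[5, 6, 7, -100], [8, -100, -100, -100]].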
+ if labels is not None: + # Note(gongenlei): In pipeline, max_label_length = self.max_length + if self.max_label_length is not None: + max_label_length = self.max_label_length + else: + max_label_length = max(len(l) for l in labels) + if self.pad_to_multiple_of is not None: + max_label_length = ( + (max_label_length + self.pad_to_multiple_of - 1) + // self.pad_to_multiple_of + * self.pad_to_multiple_of + ) + + padding_side = self.tokenizer.padding_side + for feature in batch: + remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"])) + if isinstance(feature["labels"], list): + feature["labels"] = ( + feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"] + ) + elif padding_side == "right": + feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64) + else: + feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64) + if use_attn_mask_startend_row_indices is not None: + if self.max_length is not None: + max_length = self.max_length + else: + max_length = max(len(l) for l in use_attn_mask_startend_row_indices) + if self.pad_to_multiple_of is not None: + max_length = ( + (max_length + self.pad_to_multiple_of - 1) // self.pad_to_multiple_of * self.pad_to_multiple_of + ) + + for feature in batch: + pad_len = max_length - len(feature["attn_mask_startend_row_indices"]) + remainder = np.zeros([1, pad_len], dtype=np.int32) + feature["attn_mask_startend_row_indices"] = ( + np.concatenate( + [remainder, np.array([feature["attn_mask_startend_row_indices"]], dtype=np.int32) + pad_len], + axis=-1, + ) + if padding_side == "left" + else np.concatenate( + [np.array([feature["attn_mask_startend_row_indices"]], dtype=np.int32), remainder], axis=-1 + ) + ) + + batch = self.tokenizer.pad( + batch, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=return_tensors, + return_attention_mask=self.return_attention_mask, + ) + # prepare decoder_input_ids + if ( + labels is not None + and self.model is not None + and hasattr(self.model, "prepare_decoder_input_ids_from_labels") + ): + decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=batch["labels"]) + batch["decoder_input_ids"] = decoder_input_ids + return batch + + +def _paddle_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + import paddle + + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple, np.ndarray)): + examples = [paddle.to_tensor(e, dtype="int64") for e in examples] + + length_of_first = examples[0].shape[0] + + # Check if padding is necessary. + + are_tensors_same_length = all(x.shape[0] == length_of_first for x in examples) + if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): + return paddle.stack(examples, axis=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. 
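    # Illustrative: a longest example of 13 tokens with pad_to_multiple_of=8
    # is padded out to ((13 // 8) + 1) * 8 = 16 positions.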
+ max_length = max(x.shape[0] for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + # result = examples[0].new_full([len(examples), max_length], tokenizer.pad_token_id) + result = paddle.full([len(examples), max_length], tokenizer.pad_token_id, dtype=examples[0].dtype) + + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0] :] = example + return result + + +def _numpy_collate_batch(examples, tokenizer, pad_to_multiple_of: Optional[int] = None): + import numpy as np + + """Collate `examples` into a batch, using the information in `tokenizer` for padding if necessary.""" + # Tensorize if necessary. + if isinstance(examples[0], (list, tuple)): + examples = [np.array(e, dtype=np.int64) for e in examples] + + # Check if padding is necessary. + length_of_first = len(examples[0]) + are_tensors_same_length = all(len(x) == length_of_first for x in examples) + if are_tensors_same_length and (pad_to_multiple_of is None or length_of_first % pad_to_multiple_of == 0): + return np.stack(examples, axis=0) + + # If yes, check if we have a `pad_token`. + if tokenizer._pad_token is None: + raise ValueError( + "You are attempting to pad samples but the tokenizer you are using" + f" ({tokenizer.__class__.__name__}) does not have a pad token." + ) + + # Creating the full tensor and filling it with our data. + max_length = max(len(x) for x in examples) + if pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + result = np.full(shape=(len(examples), max_length), fill_value=tokenizer.pad_token_id, dtype=examples[0].dtype) + for i, example in enumerate(examples): + if tokenizer.padding_side == "right": + result[i, : example.shape[0]] = example + else: + result[i, -example.shape[0] :] = example + return result + + +def tolist(x): + if isinstance(x, list): + return x + elif hasattr(x, "numpy"): # Checks for TF tensors without needing the import + x = x.cpu().numpy() + return x.tolist() + + +@dataclass +class DataCollatorForLanguageModeling(DataCollatorMixin): + """ + Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they + are not all of the same length. + Args: + tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]): + The tokenizer used for encoding the data. + mlm (`bool`, *optional*, defaults to `True`): + Whether or not to use masked language modeling. If set to `False`, the labels are the same as the inputs + with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for non-masked + tokens and the value to predict for the masked token. + mlm_probability (`float`, *optional*, defaults to 0.15): + The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + return_tensors (`str`): + The type of Tensor to return. Allowable values are "np", "pt" and "tf". 
+ + For best performance, this data collator should be used with a dataset having items that are dictionaries or + BatchEncoding, with the `"special_tokens_mask"` key, as returned by a [`PreTrainedTokenizer`] or a + [`PreTrainedTokenizerFast`] with the argument `return_special_tokens_mask=True`. + """ + + tokenizer: PretrainedTokenizerBase + mlm: bool = True + mlm_probability: float = 0.15 + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pd" + + def paddle_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + # Handle dict or lists with proper padding and conversion to tensor. + if isinstance(examples[0], Mapping): + batch = self.tokenizer.pad(examples, return_tensors="pd", pad_to_multiple_of=self.pad_to_multiple_of) + else: + batch = { + "input_ids": _paddle_collate_batch( + examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of + ) + } + + # If special token mask has been preprocessed, pop it from the dict. + special_tokens_mask = batch.pop("special_tokens_mask", None) + if self.mlm: + batch["input_ids"], batch["labels"] = self.paddle_mask_tokens( + batch["input_ids"], special_tokens_mask=special_tokens_mask + ) + else: + labels = batch["input_ids"].clone() + if self.tokenizer.pad_token_id is not None: + labels[labels == self.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + return batch + + def paddle_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + import paddle + + labels = inputs.clone() + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = paddle.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + + special_tokens_mask = paddle.to_tensor(special_tokens_mask, dtype="bool") + else: + special_tokens_mask = special_tokens_mask.cast("bool") + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + # probability_matrix.masked_fill_(special_tokens_mask, value=0.0) + probability_matrix = masked_fill(probability_matrix, special_tokens_mask, value=0.0) + masked_indices = paddle.bernoulli(probability_matrix).cast("bool") + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = paddle.bernoulli(paddle.full(labels.shape, 0.8)).cast("bool") & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = ( + paddle.bernoulli(paddle.full(labels.shape, 0.5)).cast("bool") & masked_indices & ~indices_replaced + ) + random_words = paddle.randint(len(self.tokenizer), shape=labels.shape, dtype="int64") + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + # Handle dict or lists with proper padding and conversion to tensor. 
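+        # Mirrors paddle_call, but keeps the whole batch in NumPy arrays (no paddle import needed).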
+ if isinstance(examples[0], Mapping): + batch = self.tokenizer.pad(examples, return_tensors="np", pad_to_multiple_of=self.pad_to_multiple_of) + else: + batch = { + "input_ids": _numpy_collate_batch(examples, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + } + + # If special token mask has been preprocessed, pop it from the dict. + special_tokens_mask = batch.pop("special_tokens_mask", None) + if self.mlm: + batch["input_ids"], batch["labels"] = self.numpy_mask_tokens( + batch["input_ids"], special_tokens_mask=special_tokens_mask + ) + else: + labels = np.copy(batch["input_ids"]) + if self.tokenizer.pad_token_id is not None: + labels[labels == self.tokenizer.pad_token_id] = -100 + batch["labels"] = labels + return batch + + def numpy_mask_tokens(self, inputs: Any, special_tokens_mask: Optional[Any] = None) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. + """ + labels = np.copy(inputs) + # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) + probability_matrix = np.full(labels.shape, self.mlm_probability) + if special_tokens_mask is None: + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + special_tokens_mask = np.array(special_tokens_mask, dtype=bool) + else: + special_tokens_mask = special_tokens_mask.astype(bool) + + probability_matrix[special_tokens_mask] = 0 + # Numpy doesn't have bernoulli, so we use a binomial with 1 trial + masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(bool) + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices + inputs[indices_replaced] = self.tokenizer.mask_token_id + + # 10% of the time, we replace masked input tokens with random word + # indices_random = paddle.bernoulli(paddle.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + indices_random = ( + np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced + ) + random_words = np.random.randint( + low=0, high=len(self.tokenizer), size=np.count_nonzero(indices_random), dtype=np.int64 + ) + inputs[indices_random] = random_words + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + +@dataclass +class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling): + """ + Data collator used for language modeling that masks entire words. + - collates batches of tensors, honoring their tokenizer's pad_token + - preprocesses batches for masked language modeling + + This collator relies on details of the implementation of subword tokenization by [`BertTokenizer`], specifically + that subword tokens are prefixed with *##*. For tokenizers that do not adhere to this scheme, this collator will + produce an output that is roughly equivalent to [`.DataCollatorForLanguageModeling`]. 
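+
+    A minimal usage sketch (assumes a locally available `bert-base-uncased` checkpoint; the input
+    sentence is only illustrative):
+
+        from paddlenlp.transformers import BertTokenizer
+
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=0.15)
+        batch = collator([{"input_ids": tokenizer("this is unbelievable")["input_ids"]}])
+        # All sub-tokens of a masked word (e.g. "un", "##bel", ...) are masked together;
+        # `batch["labels"]` is -100 everywhere except at the masked positions.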
+ """ + + def paddle_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + if isinstance(examples[0], Mapping): + input_ids = [e["input_ids"] for e in examples] + else: + input_ids = examples + examples = [{"input_ids": e} for e in examples] + + batch_input = _paddle_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + + mask_labels = [] + for e in examples: + ref_tokens = [] + for id in tolist(e["input_ids"]): + token = self.tokenizer._convert_id_to_token(id) + ref_tokens.append(token) + + # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢] + if "chinese_ref" in e: + ref_pos = tolist(e["chinese_ref"]) + len_seq = len(e["input_ids"]) + for i in range(len_seq): + if i in ref_pos: + ref_tokens[i] = "##" + ref_tokens[i] + mask_labels.append(self._whole_word_mask(ref_tokens)) + batch_mask = _paddle_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + inputs, labels = self.paddle_mask_tokens(batch_input, batch_mask) + return {"input_ids": inputs, "labels": labels} + + def numpy_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: + if isinstance(examples[0], Mapping): + input_ids = [e["input_ids"] for e in examples] + else: + input_ids = examples + examples = [{"input_ids": e} for e in examples] + + batch_input = _numpy_collate_batch(input_ids, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + + mask_labels = [] + for e in examples: + ref_tokens = [] + for id in tolist(e["input_ids"]): + token = self.tokenizer._convert_id_to_token(id) + ref_tokens.append(token) + + # For Chinese tokens, we need extra inf to mark sub-word, e.g [喜,欢]-> [喜,##欢] + if "chinese_ref" in e: + ref_pos = tolist(e["chinese_ref"]) + len_seq = len(e["input_ids"]) + for i in range(len_seq): + if i in ref_pos: + ref_tokens[i] = "##" + ref_tokens[i] + mask_labels.append(self._whole_word_mask(ref_tokens)) + batch_mask = _numpy_collate_batch(mask_labels, self.tokenizer, pad_to_multiple_of=self.pad_to_multiple_of) + inputs, labels = self.numpy_mask_tokens(batch_input, batch_mask) + return {"input_ids": inputs, "labels": labels} + + def _whole_word_mask(self, input_tokens: List[str], max_predictions=512): + """ + Get 0/1 labels for masked tokens with whole word mask proxy + """ + if not isinstance(self.tokenizer, (BertTokenizer)): + warnings.warn( + "DataCollatorForWholeWordMask is only suitable for BertTokenizer-like tokenizers. " + "Please refer to the documentation for more information." + ) + + cand_indexes = [] + for i, token in enumerate(input_tokens): + if token == "[CLS]" or token == "[SEP]": + continue + + if len(cand_indexes) >= 1 and token.startswith("##"): + cand_indexes[-1].append(i) + else: + cand_indexes.append([i]) + + random.shuffle(cand_indexes) + num_to_predict = min(max_predictions, max(1, int(round(len(input_tokens) * self.mlm_probability)))) + masked_lms = [] + covered_indexes = set() + for index_set in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + # If adding a whole-word mask would exceed the maximum number of + # predictions, then just skip this candidate. 
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + masked_lms.append(index) + + if len(covered_indexes) != len(masked_lms): + raise ValueError("Length of covered_indexes is not equal to length of masked_lms.") + mask_labels = [1 if i in covered_indexes else 0 for i in range(len(input_tokens))] + return mask_labels + + def paddle_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set + 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. + """ + import paddle + + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the" + " --mlm flag if you want to use this tokenizer." + ) + labels = inputs.clone() + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + + probability_matrix = mask_labels + + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + # probability_matrix.masked_fill_(paddle.tensor(special_tokens_mask, dtype=paddle.bool), value=0.0) + probability_matrix = masked_fill( + probability_matrix, paddle.to_tensor(special_tokens_mask, dtype="bool"), value=0.0 + ) + if self.tokenizer._pad_token is not None: + padding_mask = labels.equal(self.tokenizer.pad_token_id) + # probability_matrix.masked_fill_(padding_mask, value=0.0) + probability_matrix = masked_fill(probability_matrix, padding_mask, value=0.0) + + masked_indices = probability_matrix.cast("bool") + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = paddle.bernoulli(paddle.full(labels.shape, 0.8)).cast("bool") & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + indices_random = ( + paddle.bernoulli(paddle.full(labels.shape, 0.5)).cast("bool") & masked_indices & ~indices_replaced + ) + + random_words = paddle.randint(0, len(self.tokenizer), labels.shape, dtype="int64") + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels + + def numpy_mask_tokens(self, inputs: Any, mask_labels: Any) -> Tuple[Any, Any]: + """ + Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. Set + 'mask_labels' means we use whole word mask (wwm), we directly mask idxs according to it's ref. + """ + if self.tokenizer.mask_token is None: + raise ValueError( + "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the" + " --mlm flag if you want to use this tokenizer." 
+ ) + labels = np.copy(inputs) + # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) + + masked_indices = mask_labels.astype(bool) + + special_tokens_mask = [ + self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() + ] + masked_indices[np.array(special_tokens_mask, dtype=bool)] = 0 + if self.tokenizer._pad_token is not None: + padding_mask = labels == self.tokenizer.pad_token_id + masked_indices[padding_mask] = 0 + + labels[~masked_indices] = -100 # We only compute loss on masked tokens + + # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) + indices_replaced = np.random.binomial(1, 0.8, size=labels.shape).astype(bool) & masked_indices + inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) + + # 10% of the time, we replace masked input tokens with random word + # indices_random = paddle.bernoulli(paddle.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced + indices_random = ( + np.random.binomial(1, 0.5, size=labels.shape).astype(bool) & masked_indices & ~indices_replaced + ) + random_words = np.random.randint(low=0, high=len(self.tokenizer), size=labels.shape, dtype=np.int64) + inputs[indices_random] = random_words[indices_random] + + # The rest of the time (10% of the time) we keep the masked input tokens unchanged + return inputs, labels diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/dist_dataloader.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/dist_dataloader.py new file mode 100644 index 000000000..a6330ce1f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/dist_dataloader.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed import fleet + +from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import ( + nested_broadcast_tensor, + nested_copy_place, + nested_empty_tensor, + nested_reduce_tensor, +) + + +class DummyDataset(paddle.io.Dataset): + """ + A dummy dataset. + """ + + def __len__(self): + return 0 + + +class IterableDummyDataset(paddle.io.IterableDataset): + def __iter__(self): + return None + + +class DistDataLoader(paddle.io.DataLoader): + """ + DistDataLoader is a wrapper of paddle.io.DataLoader. 
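+
+    Only the ranks with `mp_rank == 0` and `pp_rank == 0` actually pull batches from the wrapped
+    dataloader; every other rank receives the batch through broadcasts over the model-parallel and
+    pipeline-parallel communication groups (see `_broadcast_data`).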
+ """ + + def __init__( + self, + dataset, + feed_list=None, + places=None, + return_list=True, + batch_sampler=None, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + prefetch_factor=2, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + persistent_workers=False, + **kwargs, + ): + + eval = kwargs.pop("eval", False) + is_iterable_dataset = kwargs.pop("is_iterable_dataset", False) + + if dataset is None: + dataset = DummyDataset() if not is_iterable_dataset else IterableDummyDataset() + logger.info("rank has no data, use Dummpy dataset") + + super().__init__(dataset=dataset, batch_sampler=batch_sampler, collate_fn=collate_fn, num_workers=num_workers) + + self._hcg = fleet.get_hybrid_communicate_group() + self.eval = eval + + # Init pp data comm group. + if self._hcg.get_pipe_parallel_world_size() > 1: + self._pp_data_group = self._init_dataloader_comm_group() + self._pp_group = self._hcg.get_pipe_parallel_group() + else: + self._pp_data_group = None + self._pp_group = None + + self.mp_group = self._hcg.get_model_parallel_group() + self.mp_rank = self._hcg.get_model_parallel_rank() + self.mp_src_rank = self._hcg.get_model_parallel_group_src_rank() + + self.pp_rank = self._hcg.get_stage_id() + self.dp_rank = self._hcg.get_data_parallel_rank() + sharding_rank = self._hcg.get_sharding_parallel_rank() + self._need_data = (self.mp_rank == 0) and (self.pp_rank == 0) + + if self._need_data: + self._dataloader = paddle.io.DataLoader( + dataset, + feed_list, + places, + return_list, + batch_sampler, + batch_size, + shuffle, + drop_last, + collate_fn, + num_workers, + use_buffer_reader, + prefetch_factor, + use_shared_memory, + timeout, + worker_init_fn, + persistent_workers, + ) + + self._lazy_dataloader_iter = None + else: + logger.info( + "mp{}_pp{}_sharding{}_dp{} no data needed, " + "skip init dataloader.".format(self.mp_rank, self.pp_rank, sharding_rank, self.dp_rank) + ) + + @property + def _dataloader_iter(self): + if self._lazy_dataloader_iter is None: + self._lazy_dataloader_iter = iter(self._dataloader) + return self._lazy_dataloader_iter + + def __len__(self): + if self._need_data: + return super().__len__() + else: + raise ValueError("raise error for `paddlenlp.trainer.trainer_utils.has_length`") + + def _init_dataloader_comm_group(self): + topo = self._hcg._topo + parallel_comm_group = None + parallel_groups = topo.get_comm_list("pipe") + + for group in parallel_groups: + ranks = [group[0], group[-1]] + comm_group = paddle.distributed.new_group(ranks=ranks) + if paddle.distributed.get_rank() in ranks: + parallel_comm_group = comm_group + return parallel_comm_group + + def __iter__(self): + return self + + def _broadcast_data(self, data): + process_rank = paddle.distributed.get_rank() + if self.mp_group.nranks > 1: + if process_rank == self.mp_src_rank: + fake_data = [nested_reduce_tensor(data)] + else: + if data is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict." + ) + fake_data = [None] + if self._pp_group is not None: + if process_rank == self._pp_group.ranks[0]: + fake_data = [nested_reduce_tensor(data)] + else: + if data is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict." 
+ ) + fake_data = [None] + if self.mp_group.nranks > 1 and self.pp_rank == 0: + paddle.distributed.broadcast_object_list( + fake_data, + src=self.mp_src_rank, + group=self.mp_group, + ) + if self._pp_group is not None: + paddle.distributed.broadcast_object_list( + fake_data, + src=self._pp_group.ranks[0], + group=self._pp_group, + ) + + fake_data = fake_data[0] + if fake_data is None: + raise StopIteration + + dst_pp_group = self._pp_group if self.eval else self._pp_data_group + if self.mp_group.nranks > 1: + if process_rank != self.mp_src_rank: + data = nested_empty_tensor(fake_data) + if dst_pp_group is not None: + if process_rank != dst_pp_group.ranks[0]: + data = nested_empty_tensor(fake_data) + + if self.mp_group.nranks > 1 and self.pp_rank == 0: + data = nested_broadcast_tensor(data, src=self.mp_src_rank, group=self.mp_group) + if dst_pp_group is not None: + data = nested_broadcast_tensor(data, src=dst_pp_group.ranks[0], group=dst_pp_group) + # for pp1 - pp_{n-1}, Paddle need to recevie empty dict for pipeline parallel. + if data is None: + data = {} + + return data + + def __next__(self): + data = None + if self._need_data: + try: + data = next(self._dataloader_iter) + data = nested_copy_place(data, place=paddle.framework._current_expected_place()) + except Exception as e: + logger.debug(e) + data = self._broadcast_data(data) + return data diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/indexed_dataset.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/indexed_dataset.py new file mode 100644 index 000000000..513033977 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/indexed_dataset.py @@ -0,0 +1,972 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# https://github.com/NVIDIA/Megatron-LM/blob/060415572f4365a2e895f8036c4e37dad0efbdf5/megatron/data/indexed_dataset.py +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +# copied from fairseq/fairseq/data/indexed_dataset.py +# Removed IndexedRawTextDataset since it relied on Fairseq dictionary +# other slight modifications to remove fairseq dependencies +# Added document index to index file and made it accessible. +# An empty sentence no longer separates documents. 
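+#
+# On-disk layout used below: `<prefix>.idx` stores the index (dtype code, sizes, offsets and
+# document boundaries) while `<prefix>.bin` stores the raw token data; the legacy
+# `<prefix>_ids.npy` / `<prefix>_idx.npz` format is served by CompatibleIndexedDataset.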
+ +import os +import shutil +import struct +import time +from dataclasses import fields +from functools import lru_cache +from itertools import accumulate + +import numpy as np +import paddle + + +def print_rank_0(*args, **kwargs): + if paddle.distributed.get_rank() == 0: + print(*args, **kwargs) + + +def __best_fitting_dtype(vocab_size=None): + if vocab_size is not None and vocab_size < 65500: + return np.uint16 + else: + return np.int32 + + +def get_available_dataset_impl(): + return ["lazy", "mmap"] + + +def make_dataset(path, impl, skip_warmup=False): + if CompatibleIndexedDataset.exists(path): + print("Using old dataet (.npy & .npz)") + return CompatibleIndexedDataset(path) + elif not IndexedDataset.exists(path): + print(f"Dataset does not exist: {path}") + print("Path should be a basename that both .idx and .bin can be appended to get full filenames.") + return None + elif impl == "lazy" and IndexedDataset.exists(path): + return IndexedDataset(path) + elif impl == "mmap" and MMapIndexedDataset.exists(path): + return MMapIndexedDataset(path, skip_warmup) + print(f"Unknown dataset implementation: {impl}") + return None + + +def make_sft_dataset(path, dataclass, skip_warmup=False, impl="mmap"): + if impl != "mmap": + raise ValueError("SFT Indexed Dataset only support mmap memory-mapped method temporarily") + + print_rank_0(" > building dataset index ...") + start_time = time.time() + sft_indexed_dataset = SFTMMapIndexedDataset(path, dataclass, skip_warmup) + print_rank_0(" > finished creating SFT indexed dataset in {:4f} " "seconds".format(time.time() - start_time)) + print_rank_0(" number of samples: {}".format(len(sft_indexed_dataset.doc_idx) - 1)) + + return sft_indexed_dataset + + +def dataset_exists(path, impl): + if impl == "mmap": + return MMapIndexedDataset.exists(path) + else: + return IndexedDataset.exists(path) + + +def read_longs(f, n): + a = np.empty(n, dtype=np.int64) + f.readinto(a) + return a + + +def write_longs(f, a): + f.write(np.array(a, dtype=np.int64)) + + +def read_shorts(f, n): + a = np.empty(n, dtype=np.int32) + f.readinto(a) + return a + + +def write_shorts(f, a): + f.write(np.array(a, dtype=np.int32)) + + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float64, + 7: np.float32, + 8: np.uint16, + 9: np.uint32, + 10: np.uint64, +} + + +def code(dtype): + for k in dtypes.keys(): + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +def index_file_path(prefix_path): + return prefix_path + ".idx" + + +def sft_index_file_path(prefix_path): + return os.path.join(prefix_path, "index.idx") + + +def sft_data_file_path(prefix_path, dataclass): + file_path_list = [] + for field in fields(dataclass): + file_path = os.path.join(prefix_path, f"{field.name}.bin") + file_path_list.append(file_path) + return file_path_list + + +def data_file_path(prefix_path): + return prefix_path + ".bin" + + +def loss_mask_file_path(prefix_path): + return prefix_path + ".lsm" + + +def create_doc_idx(sizes): + doc_idx = [0] + for i, s in enumerate(sizes): + if s == 0: + doc_idx.append(i + 1) + return doc_idx + + +class IndexedDataset(paddle.io.Dataset): + """Loader for IndexedDataset""" + + _HDR_MAGIC = b"TNTIDX\x00\x00" + + def __init__(self, path): + super().__init__() + self.path = path + self.data_file = None + self.read_index(path) + + def read_index(self, path): + with open(index_file_path(path), "rb") as f: + magic = f.read(8) + assert magic == self._HDR_MAGIC, ( + "Index file doesn't match expected format. 
" "Make sure that --dataset-impl is configured properly." + ) + version = f.read(8) + assert struct.unpack("= self._len: + raise IndexError("index out of range") + + def __del__(self): + if self.data_file: + self.data_file.close() + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if not self.data_file: + self.read_data(self.path) + if isinstance(idx, int): + i = idx + self.check_index(i) + tensor_size = self.sizes[self.dim_offsets[i] : self.dim_offsets[i + 1]] + a = np.empty(tensor_size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[i] * self.element_size) + self.data_file.readinto(a) + return a + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + sizes = self.sizes[self.dim_offsets[start] : self.dim_offsets[stop]] + size = sum(sizes) + a = np.empty(size, dtype=self.dtype) + self.data_file.seek(self.data_offsets[start] * self.element_size) + self.data_file.readinto(a) + offsets = list(accumulate(sizes)) + sents = np.split(a, offsets[:-1]) + return sents + + def get(self, idx, offset=0, length=None): + """Retrieves a single item from the dataset with the option to only + return a portion of the item. + + get(idx) is the same as [idx] but get() does not support slicing. + """ + if not self.data_file: + self.read_data(self.path) + size = self.sizes[idx] + ptr = self.data_offsets[idx] + if length is None: + length = size - offset + ptr += offset + a = np.empty(length, dtype=self.dtype) + self.data_file.seek(ptr * self.element_size) + self.data_file.readinto(a) + return a + + def __len__(self): + return self._len + + def num_tokens(self, index): + return self.sizes[index] + + def size(self, index): + return self.sizes[index] + + @staticmethod + def exists(path): + return os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path)) + + @property + def supports_prefetch(self): + return False # avoid prefetching to save memory + + @property + def doc_idx(self): + return self._doc_idx + + def get_doc_idx(self): + return self._doc_idx + + def set_doc_idx(self, doc_idx_): + self._doc_idx = doc_idx_ + + +class IndexedDatasetBuilder(object): + element_sizes = { + np.uint8: 1, + np.int8: 1, + np.int16: 2, + np.uint16: 2, + np.int32: 4, + np.int64: 8, + np.float32: 4, + np.float64: 8, + } + + def __init__(self, out_file, dtype=np.int32): + self.out_file = open(out_file, "wb") + self.dtype = dtype + self.data_offsets = [0] + self.dim_offsets = [0] + self.sizes = [] + self.element_size = self.element_sizes[self.dtype] + self.doc_idx = [0] + + def add_item(self, tensor): + tensor = np.array(tensor, dtype=self.dtype) + bytes = self.out_file.write(tensor) + self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size) + for s in tensor.shape: + self.sizes.append(s) + self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.shape)) + del bytes + + def end_document(self): + self.doc_idx.append(len(self.sizes)) + + def merge_file_(self, another_file): + index = IndexedDataset(another_file) + assert index.dtype == self.dtype + + doc_offset = len(self.sizes) + + begin = self.data_offsets[-1] + for data_offset in index.data_offsets[1:]: + self.data_offsets.append(begin + data_offset) + self.sizes.extend(index.sizes) + + begin = self.dim_offsets[-1] + for dim_offset in index.dim_offsets[1:]: + self.dim_offsets.append(begin + dim_offset) + + self.doc_idx.extend((doc_offset + index.doc_idx)[1:]) + + with open(data_file_path(another_file), "rb") as f: + 
while True: + data = f.read(1024) + if data: + self.out_file.write(data) + else: + break + + def finalize(self, index_file): + self.out_file.close() + index = open(index_file, "wb") + index.write(b"TNTIDX\x00\x00") + index.write(struct.pack(" 1 and not add_sequence_len: + self._sizes.append(tensor.size) + add_sequence_len = True + self._data_file_dict[key].write(tensor.tobytes(order="C")) + + def end_document(self): + self._doc_idx.append(len(self._sizes)) + + def finalize(self, index_file): + for key, filename in self._data_file_dict.items(): + filename.close() + with SFTMMapIndexedDataset.Index.writer(index_file, self._dtype) as index: + index.write(self._sizes, self._doc_idx) + + +class MMapIndexedDatasetBuilder(object): + def __init__(self, out_file, dtype, loss_mask_file=None): + self._data_file = open(out_file, "wb") + self._loss_mask_file = None + if loss_mask_file is not None: + self._loss_mask_file = open(loss_mask_file, "wb") + self._dtype = dtype + self._sizes = [] + self._doc_idx = [0] + + def flush_loss_mask_item(self, loss_mask_lst): + for loss_mask in loss_mask_lst: + tensor = np.array(loss_mask, dtype=np.uint8) + self._loss_mask_file.write(tensor.tobytes(order="C")) + + def add_item(self, tensor): + tensor = np.array(tensor, dtype=self._dtype) + self._data_file.write(tensor.tobytes(order="C")) + self._sizes.append(tensor.size) + + def add_doc(self, tensor, sizes): + np_array = np.array(tensor, dtype=self._dtype) + self._data_file.write(np_array.tobytes(order="C")) + self._sizes.extend(sizes) + self._doc_idx.append(len(self._sizes)) + + def end_document(self): + self._doc_idx.append(len(self._sizes)) + + def merge_file_(self, another_file): + # Concatenate index + index = MMapIndexedDataset.Index(index_file_path(another_file)) + assert index.dtype == self._dtype + + offset = len(self._sizes) + self._sizes.extend(index.sizes) + self._doc_idx.extend((offset + index.doc_idx)[1:]) + + # Concatenate data + with open(data_file_path(another_file), "rb") as f: + shutil.copyfileobj(f, self._data_file) + + def finalize(self, index_file): + self._data_file.close() + + with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index: + index.write(self._sizes, self._doc_idx) + print("Total sentences num: %d" % len(self._sizes)) + print("Total documents num: %d" % (len(self._doc_idx) - 1)) + print("Total tokens num: %d" % sum(self._sizes)) + print("Average tokens per sentence: %.2f" % (sum(self._sizes) / len(self._sizes))) + print("Average tokens per document: %.2f" % (sum(self._sizes) / (len(self._doc_idx) - 1))) + + +def get_indexed_dataset_(data_prefix, data_impl, skip_warmup): + + print_rank_0(" > building dataset index ...") + + start_time = time.time() + indexed_dataset = make_dataset(data_prefix, data_impl, skip_warmup) + assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1] + print_rank_0(" > finished creating indexed dataset in {:4f} " "seconds".format(time.time() - start_time)) + + print_rank_0(" > indexed dataset stats:") + print_rank_0(" number of documents: {}".format(indexed_dataset.doc_idx.shape[0] - 1)) + print_rank_0(" number of sentences: {}".format(indexed_dataset.sizes.shape[0])) + + return indexed_dataset + + +class CompatibleIndexedDataset(paddle.io.Dataset): + def __init__(self, path): + super().__init__() + + self._path = path + + # All documment ids, extend as 1-D array. 
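+        # `_ids.npy` is the flattened token stream; `_idx.npz` provides per-sentence lengths ("lens")
+        # and document boundaries ("docs"), from which absolute pointers are rebuilt via cumsum.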
+ self._token_ids = np.load(path + "_ids.npy", mmap_mode="r", allow_pickle=True) + process_data = np.load(path + "_idx.npz") + self._sizes = process_data["lens"] + self._pointers = np.empty(len(self._sizes) + 1, dtype=np.int64) + self._pointers[0] = 0 + np.cumsum(self._sizes, out=self._pointers[1:]) + self._doc_idx = process_data["docs"] + + def __getstate__(self): + return self._path + + def __len__(self): + return len(self._sizes) + + # @lru_cache(maxsize=8) + def __getitem__(self, idx): + if isinstance(idx, int): + size = self._sizes[idx] + ptr = self._pointers[idx] + np_array = self._token_ids[ptr : ptr + size] + return np_array + + elif isinstance(idx, slice): + start, stop, step = idx.indices(len(self)) + if step != 1: + raise ValueError("Slices into indexed_dataset must be contiguous") + ptr = self._pointers[start] + sizes = self._sizes[idx] + offsets = list(accumulate(sizes)) + total_size = sum(sizes) + np_array = self._token_ids[ptr : ptr + total_size] + sents = np.split(np_array, offsets[:-1]) + return sents + + def get(self, idx, offset=0, length=None): + """Retrieves a single item from the dataset with the option to only + return a portion of the item. + + get(idx) is the same as [idx] but get() does not support slicing. + """ + size = self._sizes[idx] + ptr = self._pointers[idx] + + if length is None: + length = size - offset + ptr += offset + np_array = self._token_ids[ptr : ptr + length] + return np_array, None + + @property + def sizes(self): + return self._sizes + + @property + def doc_idx(self): + return self._doc_idx + + def get_doc_idx(self): + return self._doc_idx + + def set_doc_idx(self, doc_idx_): + self._doc_idx = doc_idx_ + + @staticmethod + def exists(path): + return os.path.isfile(path + "_ids.npy") and os.path.isfile(path + "_idx.npz") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/iterator.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/iterator.py new file mode 100644 index 000000000..ee969734d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/iterator.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Iterator for NLP Dataset diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/sampler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/sampler.py new file mode 100644 index 000000000..7fe68d6bd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/sampler.py @@ -0,0 +1,416 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import functools +import math + +import numpy as np + + +class SamplerHelper(object): + """ + The class is to help construct iterable sampler used for + :class:`paddle.io.DataLoader`. It wraps a dataset and uses its + :meth:`__getitem__` method. Every subclass of :class:`SamplerHelper` has + to provide an :meth:`__iter__` method, providing a way to iterate over + indices of dataset elements, and a :meth:`__len__` method that returns the + length of the returned iterators. + + The class also can be used as batch iterator instead of indices iterator + when `iterator` yield samples rather than indices by initializing `iterator` + with a iterable dataset. + + .. note:: + The :meth:`__len__` method isn't strictly required by + :class:`paddle.io.DataLoader`, but is expected in any calculation + involving the length of a :class:`paddle.io.DataLoader`. + + Args: + dataset (Dataset): Input dataset for :class:`SamplerHelper`. + iterable (Iterable, optional): Iterator of dataset. Default: None. + """ + + # chain sampler + def __init__(self, dataset, iterable=None): + self.data_source = dataset + self.iterable = iterable + if isinstance(dataset, collections.abc.Iterable) and iterable is None: + # iterable-style datasets + self.iterable = dataset + + def __iter__(self): + if self.iterable is None: + return iter(range(len(self.data_source))) + elif isinstance(self.iterable, collections.abc.Iterable): + return iter(self.iterable) + elif callable(self.iterable): + return self.iterable() + else: + raise ValueError("`iterable` should be None, instance of Iterable or callable " "producing generator.") + + def __len__(self): + # Allow some samplers have different length with `len(data_source)`, + # such as batch sampler. + if hasattr(self, "_length"): + return self._length + else: + return len(self.data_source) + + @property + def length(self): + """ + Returns the length. + """ + + # since `len()` only produce integer, use length property to get None + # for uncertain length. samplers can set length if necessary. + try: + length = len(self) + except Exception: + length = None + return length + + @length.setter + def length(self, length): + self._length = length + + def apply(self, fn): + # Transformation functions would be performed. It includes + # :meth:`shuffle`, :meth:`sort`, :meth:`fit` and :meth:`shard`. + # Args: + # fn (callable): Transformation functions to be performed. + # Returns: + # SamplerHelper: A new transformed :class:`SamplerHelper` object. + + rs = fn(self) + if isinstance(rs, (list, tuple)): + iterable, data_source = rs + else: + iterable, data_source = rs, self.data_source + sampler = type(self)(data_source, iterable) + return sampler + + def shuffle(self, buffer_size=-1, seed=None): + """ + Shuffles the dataset according to the given buffer size and random seed. + + Args: + buffer_size (int, optional): Buffer size for shuffle. If + `buffer_size < 0` or more than the length of the dataset, + `buffer_size` is the length of the dataset. Default: -1. + seed (int, optional): Seed for the random. Default: None. 
+ + Returns: + SamplerHelper: A new shuffled :class:`SamplerHelper` object. + + Example: + .. code-block:: python + + from paddlenlp.data import SamplerHelper + from paddle.io import Dataset + + class MyDataset(Dataset): + def __init__(self): + super(MyDataset, self).__init__() + self.data = [ + [[1, 2, 3, 4], [1]], + [[5, 6, 7], [0]], + [[8, 9], [1]], + ] + + def __getitem__(self, index): + data = self.data[index][0] + label = self.data[index][1] + return data, label + + def __len__(self): + return len(self.data) + + dataset = MyDataset() + sampler = SamplerHelper(dataset) + print(list(sampler)) # indices of dataset elements + # [0, 1, 2] + + sampler = sampler.shuffle(seed=2) + print(list(sampler)) # indices of dataset elements + # [2, 1, 0] + """ + if seed is not None: + random_generator = np.random.RandomState(seed) + else: # use the global random generator + random_generator = np.random + + def _impl(): + buf = [] + for idx in iter(self): + buf.append(idx) + if buffer_size > 0 and len(buf) >= buffer_size: + random_generator.shuffle(buf) + for b in buf: + yield b + buf = [] + if len(buf) > 0: + random_generator.shuffle(buf) + for b in buf: + yield b + + return type(self)(self.data_source, _impl) + + def sort(self, cmp=None, key=None, reverse=False, buffer_size=-1): + """ + Sorts the dataset according to given callable :meth:`cmp` or :meth:`key`. + + Args: + cmp (callable, optional): The function of comparison. Default: None. + key (callable, optional): The function of key. Default: None. + reverse (bool, optional): Whether to reverse when sorting the data + samples. If True, it means in descending order, and False means + in ascending order. Default: False. + buffer_size (int, optional): Buffer size for sort. If + `buffer_size < 0` or `buffer_size` is more than the length + of the data, `buffer_size` will be set to the length of the data. + Default: -1. + + Returns: + SamplerHelper: A new sorted :class:`SamplerHelper` object. + + Example: + .. code-block:: python + + from paddlenlp.data import SamplerHelper + from paddle.io import Dataset + + class MyDataset(Dataset): + def __init__(self): + super(MyDataset, self).__init__() + self.data = [ + [[1, 2, 3, 4], [1]], + [[5, 6, 7], [0]], + [[8, 9], [1]], + ] + + def __getitem__(self, index): + data = self.data[index][0] + label = self.data[index][1] + return data, label + + def __len__(self): + return len(self.data) + + dataset = MyDataset() + sampler = SamplerHelper(dataset) + print(list(sampler)) # indices of dataset elements + # [0, 1, 2] + + # Sorted in ascending order by the length of the first field + # of the sample + key = (lambda x, data_source: len(data_source[x][0])) + sampler = sampler.sort(key=key) + print(list(sampler)) # indices of dataset elements + # [2, 1, 0] + """ + if key: + key_wrapper = lambda x: key(x, self.data_source) + elif cmp: + key_wrapper = functools.cmp_to_key(lambda x, y: cmp(x, y, self.data_source)) + else: + key_wrapper = lambda x: len(self.data_source[x]) + + def _impl(): + buf = [] + for idx in iter(self): + buf.append(idx) + if buffer_size > 0 and len(buf) >= buffer_size: + buf = sorted(buf, key=key_wrapper, reverse=reverse) + for b in buf: + yield b + buf = [] + if len(buf) > 0: + buf = sorted(buf, key=key_wrapper, reverse=reverse) + for b in buf: + yield b + + return type(self)(self.data_source, _impl) + + def batch(self, batch_size, drop_last=False, batch_size_fn=None, key=None): + """ + Batches the dataset according to given `batch_size`. + + Args: + batch_size (int): The batch size. 
+ drop_last (bool, optional): Whether to drop the last mini batch. + Default: False. + batch_size_fn (callable, optional): It accepts four arguments: + index of data source, the length of minibatch, the size of + minibatch so far and data source, and it returns the size of + mini batch so far. Actually, the returned value can be anything + and would used as argument `size_so_far` in `key`. If None, it + would return the length of mini match. Default: None. + key (callable, optional): The function of key. It accepts the size of minibatch so far + and the length of minibatch, and returns what to be compared + with `batch_size`. If None, only the size of mini batch so far + would be compared with `batch_size`. Default: None. + + Returns: + SamplerHelper: A new batched :class:`SamplerHelper` object. + + Example: + .. code-block:: python + + from paddlenlp.data import SamplerHelper + from paddle.io import Dataset + + class MyDataset(Dataset): + def __init__(self): + super(MyDataset, self).__init__() + self.data = [ + [[1, 2, 3, 4], [1]], + [[5, 6, 7], [0]], + [[8, 9], [1]], + ] + + def __getitem__(self, index): + data = self.data[index][0] + label = self.data[index][1] + return data, label + + def __len__(self): + return len(self.data) + + dataset = MyDataset() + sampler = SamplerHelper(dataset) + print(list(sampler)) # indices of dataset elements + # [0, 1, 2] + + sampler = sampler.batch(batch_size=2) + print(list(sampler)) # indices of dataset elements + # [[0, 1], [2]] + """ + _key = lambda size_so_far, minibatch_len: size_so_far + + ori_batch_size_fn = batch_size_fn + if batch_size_fn is None: + batch_size_fn = lambda new, count, sofar, data_source: count + key = _key if key is None else key + + def _impl(): + data_source = self.data_source + minibatch, size_so_far = [], 0 + for idx in iter(self): + minibatch.append(idx) + size_so_far = batch_size_fn(idx, len(minibatch), size_so_far, data_source) + if key(size_so_far, len(minibatch)) == batch_size: + yield minibatch + minibatch, size_so_far = [], 0 + elif key(size_so_far, len(minibatch)) > batch_size: + if len(minibatch) == 1: + raise ValueError( + "Please increase the value of `batch_size`, or limit the max length of batch." + ) + yield minibatch[:-1] + minibatch, size_so_far = minibatch[-1:], batch_size_fn(idx, 1, 0, data_source) + if minibatch and not drop_last: + yield minibatch + + sampler = type(self)(self.data_source, _impl) + if ori_batch_size_fn is None and self.length is not None: + sampler.length = (self.length + int(not drop_last) * (batch_size - 1)) // batch_size + else: + sampler.length = None + + return sampler + + def shard(self, num_replicas=None, rank=None): + """ + Slices the dataset for multi GPU training. + + Args: + num_replicas (int, optional): The number of training process, and + is also the number of GPU cards used in training. If None, it + will be set by :meth:`paddle.distributed.get_world_size` method. + Default: None. + rank (int, optional): The id of current training process. Equal + to the value of the environment variable PADDLE_TRAINER_ID. If + None, it will be initialized by :meth:`paddle.distributed.get_rank` + method. Default: None. + + Returns: + SamplerHelper: A new sliced :class:`SamplerHelper` object. + + Example: + .. 
code-block:: python + + from paddlenlp.data import SamplerHelper + from paddle.io import Dataset + + class MyDataset(Dataset): + def __init__(self): + super(MyDataset, self).__init__() + self.data = [ + [[1, 2, 3, 4], [1]], + [[5, 6, 7], [0]], + [[8, 9], [1]], + ] + + def __getitem__(self, index): + data = self.data[index][0] + label = self.data[index][1] + return data, label + + def __len__(self): + return len(self.data) + + dataset = MyDataset() + sampler = SamplerHelper(dataset) + print(list(sampler)) # indices of dataset elements + # [0, 1, 2] + + sampler = sampler.shard(num_replicas=2) + print(list(sampler)) # indices of dataset elements + # [0, 2] + """ + import paddle.distributed as dist + + if num_replicas is None: + num_replicas = dist.get_world_size() + if rank is None: + rank = dist.get_rank() + + def _impl(): + for i, idx in enumerate(self): + if i % num_replicas == rank: + yield idx + if i % num_replicas != num_replicas - 1 and rank > i % num_replicas: + # use last samples to make it evenly divisible + yield idx + + sampler = type(self)(self.data_source, _impl) + if self.length is not None: + sampler.length = int(math.ceil(self.length * 1.0 / num_replicas)) + else: + sampler.length = None + return sampler + + def list(self): + # Produce a sampler with a `listiterator` when calling `iter`. Since + # `list` would fetch all contents at time, thus it can get accurate + # length. + + def _impl(): + indices = list(iter(self)) + self.length = len(indices) + return iter(indices) + + return type(self)(self.data_source, _impl) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/tokenizer.py new file mode 100644 index 000000000..def498354 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/tokenizer.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import jieba + + +def get_idx_from_word(word, word_to_idx, unk_word): + if word in word_to_idx: + return word_to_idx[word] + return word_to_idx[unk_word] + + +class BaseTokenizer(object): + def __init__(self, vocab): + self.vocab = vocab + + def get_tokenizer(self): + return self.tokenizer + + def cut(self, sentence): + pass + + def encode(self, sentence): + pass + + +class JiebaTokenizer(BaseTokenizer): + """ + Constructs a tokenizer based on `jieba `__. + It supports :meth:`cut` method to split the text to tokens, and :meth:`encode` + method to covert text to token ids. + + Args: + vocab(paddlenlp.data.Vocab): An instance of :class:`paddlenlp.data.Vocab`. 
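+
+    Example (mirrors the per-method examples below):
+        .. code-block:: python
+
+            from paddlenlp.data import Vocab, JiebaTokenizer
+            # The sample vocab file can be downloaded first:
+            # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt
+            vocab = Vocab.load_vocabulary(
+                './senta_word_dict.txt',
+                unk_token='[UNK]',
+                pad_token='[PAD]')
+            tokenizer = JiebaTokenizer(vocab)
+            print(tokenizer.cut('我爱你中国'))     # ['我爱你', '中国']
+            print(tokenizer.encode('我爱你中国'))  # [1170578, 575565]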
+ """ + + def __init__(self, vocab): + super(JiebaTokenizer, self).__init__(vocab) + self.tokenizer = jieba.Tokenizer() + # initialize tokenizer + self.tokenizer.FREQ = {key: 1 for key in self.vocab.token_to_idx.keys()} + self.tokenizer.total = len(self.tokenizer.FREQ) + self.tokenizer.initialized = True + + def cut(self, sentence, cut_all=False, use_hmm=True): + """ + The method used to cut the text to tokens. + + Args: + sentence(str): The text that needs to be cuted. + cut_all(bool, optional): Whether to use the full mode. If True, + using full mode that gets all the possible words from the + sentence, which is fast but not accurate. If False, using + accurate mode that attempts to cut the sentence into the most + accurate segmentations, which is suitable for text analysis. + Default: False. + use_hmm(bool, optional): Whether to use the HMM model. Default: True. + + Returns: + list[str]: A list of tokens. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab, JiebaTokenizer + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + tokenizer = JiebaTokenizer(vocab) + + tokens = tokenizer.cut('我爱你中国') + print(tokens) + # ['我爱你', '中国'] + """ + return self.tokenizer.lcut(sentence, cut_all, use_hmm) + + def encode(self, sentence, cut_all=False, use_hmm=True): + """ + The method used to convert the text to ids. It will firstly call + :meth:`cut` method to cut the text to tokens. Then, convert tokens to + ids using `vocab`. + + Args: + sentence(str): The text that needs to be cuted. + cut_all(bool, optional): Whether to use the full mode. If True, + using full mode that gets all the possible words from the + sentence, which is fast but not accurate. If False, using + accurate mode that attempts to cut the sentence into the most + accurate segmentations, which is suitable for text analysis. + Default: False. + use_hmm(bool, optional): Whether to use the HMM model. Default: True. + + Returns: + list[int]: A list of ids. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab, JiebaTokenizer + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + tokenizer = JiebaTokenizer(vocab) + + ids = tokenizer.encode('我爱你中国') + print(ids) + # [1170578, 575565] + """ + words = self.cut(sentence, cut_all, use_hmm) + return [get_idx_from_word(word, self.vocab.token_to_idx, self.vocab.unk_token) for word in words] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/vocab.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/vocab.py new file mode 100644 index 000000000..a17810f6c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/data/vocab.py @@ -0,0 +1,579 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import io +import json +import os +import warnings + +import numpy as np + + +class Vocab(object): + """ + The class used to convert between tokens and ids. It also includes some + store/load functions. + + Args: + counter (collections.Counter, optional): A Counter intance describes + the tokens and their frequencies. Its keys will be indexed accroding + to the order of frequency sorting to construct mapping relationship. + If None, `token_to_idx` must be provided as the mapping relationship. + Default: None. + max_size (int, optional): Max size of vocab, not including special tokens. + Default: None. + min_freq (int, optional): Ignore tokens whose frequencies are less than + `min_freq`. Default: 1. + token_to_idx (dict, optional): A dict specifies the mapping relationship + between tokens and indices to be used. If provided, adjust the tokens + and indices mapping according to it. If None, counter must be provided. + Default: None. + unk_token (str, optional): Special token for unknow token. If no need, + it also could be None. Default: None. + pad_token (str, optional): Special token for padding token. If no need, + it also could be None. Default: None. + bos_token (str, optional): Special token for bos token. If no need, it + also could be None. Default: None. + eos_token (str, optional): Special token for eos token. If no need, it + lso could be None. Default: None. + + kwargs (dict): Keyword arguments ending with `_token`. It can be used + to specify further special tokens that will be exposed as attribute + of the vocabulary and associated with an index. + """ + + def __init__( + self, + counter=None, + max_size=None, + min_freq=1, + token_to_idx=None, + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, + **kwargs + ): + # Handle special tokens + combs = ( + ("unk_token", unk_token), + ("pad_token", pad_token), + ("bos_token", bos_token), + ("eos_token", eos_token), + ) + for name, value in combs: + kwargs[name] = value + special_tokens = [] + special_iter = kwargs.keys() + # sort alphabetically + special_iter = sorted(special_iter) + for special_token_name in special_iter: + # Test if kwarg specifies a special token + if not special_token_name.endswith("_token"): + raise ValueError( + "{} is invalid. 
Only keyword arguments " + "that end in '_token' are supported " + "to declare special tokens.".format(special_token_name) + ) + + special_token = kwargs[special_token_name] + if special_token is not None and special_token not in special_tokens: + special_tokens.append(special_token) + + if counter is None: + # use token_to_idx as dict to import pretrained vocabulary + assert token_to_idx, "token_to_idx should not be None when counter is None" + for special_token in special_tokens: + assert special_token in token_to_idx, "{} is not in token_to_idx".format(special_token) + self._token_to_idx = token_to_idx + self._idx_to_token = {idx: token for token, idx in token_to_idx.items()} + if unk_token: + unk_index = self._token_to_idx[unk_token] + self._token_to_idx = collections.defaultdict(lambda: unk_index) + self._token_to_idx.update(token_to_idx) + else: + self._idx_to_token = {idx: special_token for idx, special_token in enumerate(special_tokens)} + self._token_to_idx = collections.defaultdict() + self._token_to_idx.update((token, idx) for idx, token in self._idx_to_token.items()) + self._index_counter_keys(counter, special_tokens, max_size, min_freq) + if token_to_idx: + self._sort_index_according_to_user_specification(token_to_idx) + if unk_token: + self._token_to_idx.default_factory = lambda: self._token_to_idx[unk_token] + + # _expose_tokens_as_attributes + self._identifiers_to_tokens = kwargs + for identifier, token in kwargs.items(): + if identifier.startswith("_"): + raise ValueError( + "It is not allowed to use identifiers starting with " + "underscore. In Python identifier names beginning with " + "underscore are internal." + ) + if hasattr(self, identifier): + raise ValueError( + "vocab.{} already exists. " + "Please choose a different identifier for token {}".format(identifier, token) + ) + setattr(self, identifier, token) + + def _index_counter_keys(self, counter, special_tokens, max_size, min_freq): + # sort by frequency, then alphabetically + token_freqs = sorted(counter.items(), key=lambda x: x[0]) + token_freqs.sort(key=lambda x: x[1], reverse=True) + # frequencies of special tokens are not counted when building vocabulary + # in frequency order + special_tokens = set(special_tokens) + max_size = None if max_size is None else max_size + len(special_tokens) + for token, freq in token_freqs: + if freq < min_freq or len(self._idx_to_token) == max_size: + break + if token not in special_tokens: + self._idx_to_token[max(list(self._idx_to_token.keys()) + [-1]) + 1] = token + self._token_to_idx[token] = max(self._idx_to_token.keys()) + + def _sort_index_according_to_user_specification(self, token_to_idx): + # Sanity checks + if not set(token_to_idx.keys()).issubset(self.token_to_idx.keys()): + raise ValueError( + "User-specified token_to_idx mapping can only contain " "tokens that will be part of the vocabulary." + ) + if len(set(token_to_idx.values())) != len(token_to_idx): + raise ValueError("User-specified indices must not contain duplicates.") + if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(self.token_to_idx): + raise ValueError( + "User-specified indices must not be < 0 or >= the number of tokens " + "that will be in the vocabulary. 
The current vocab contains {}" + "tokens.".format(len(self.token_to_idx)) + ) + + # Update index ordering + for token, new_idx in token_to_idx.items(): + old_idx = self.token_to_idx[token] + ousted_token = self.idx_to_token[new_idx] + + self.token_to_idx[token] = new_idx + self.token_to_idx[ousted_token] = old_idx + self.idx_to_token[old_idx] = ousted_token + self.idx_to_token[new_idx] = token + + def to_tokens(self, indices): + """ + Maps the input indices to token list. + + Args: + indices (int|list[int]|tuple[int]|numpy.ndarray): The input indice(s) for mapping. + Must be an `int` or 1D `list[int]`|`tuple[int]`|`numpy.ndarray`. + + Returns: + str|list[str]: Obtained token(s). If `indices` is an integer, it + will return a str. If `indices` is a list/tuple of integers, it will + return a list of str. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + tokens = vocab.to_tokens([0, 1, 2, 3]) + print(tokens) + # ['[PAD]', '[UNK]', '一斤三', '意面屋'] + """ + to_reduce = False + if not isinstance(indices, (list, tuple, np.ndarray)): + indices = [indices] + to_reduce = True + if isinstance(indices, (list, tuple)): + indices = np.asarray(indices) + + if isinstance(indices, (np.ndarray)) and len(indices.shape) > 1: + raise ValueError( + "Token indices is invalid. Expected 1D array, but received {}D array. ".format(len(indices.shape)) + ) + + tokens = [] + for idx in indices: + if not isinstance(idx, (int, np.integer)): + warnings.warn( + "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. " + ) + idx = int(idx) + + try: + tokens.append(self._idx_to_token[idx]) + except KeyError: + raise ValueError("Token index {} in the provided `indices` is invalid.".format(idx)) + + return tokens[0] if to_reduce else tokens + + def to_indices(self, tokens): + """ + Maps the input tokens into indices. + + Args: + tokens (str|list[str]|tuple[str], optional): The input token(s) for + mapping. + + Returns: + int|list[int]: Obationed indice(s). If `tokens` is a str, it will + return an integer. If `tokens` is a list/tuple of str, it will + return a list of integers. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + tokens = vocab.to_indices(['[PAD]', '[UNK]', '一斤三', '意面屋']) + print(tokens) + # [0, 1, 2, 3] + """ + return self[tokens] + + def __getitem__(self, tokens): + if not isinstance(tokens, (list, tuple)): + return self._token_to_idx[tokens] if tokens in self._token_to_idx else self._token_to_idx[self.unk_token] + else: + return [ + self._token_to_idx[token] if token in self._token_to_idx else self._token_to_idx[self.unk_token] + for token in tokens + ] + + def __len__(self): + return len(self._idx_to_token) + + def __contains__(self, token): + return token in self._token_to_idx + + def __call__(self, tokens): + """ + Maps the input tokens into indices. Its function is the same as the + :meth:`to_indices` method. 
+ + See detail at `to_indices`. + """ + return self[tokens] + + @property + def idx_to_token(self): + # Returns index-token dict + return self._idx_to_token + + @property + def token_to_idx(self): + # Return token-index dict + return self._token_to_idx + + def to_json(self, path=None): + """ + Summarizes some information of vocab as JSON string. If path is gaven, + the JSON string will be saved into files. The JSON string and the saved + file all can be used to reconstruct the :class:`Vocab` by calling + :meth:`from_json` method. + + Args: + path (str, optional): The path to save JSON string. If None, the + JSON will not be saved. Default: None. + + Returns: + str: The JSON string including information of vocab. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + json_str = vocab.to_json(path='./vocab.json') + """ + vocab_dict = {} + vocab_dict["idx_to_token"] = dict(self.idx_to_token) + vocab_dict["token_to_idx"] = dict(self.token_to_idx) + vocab_dict["unk_token"] = self.unk_token + vocab_dict["identifiers_to_tokens"] = self._identifiers_to_tokens + json_str = json.dumps(vocab_dict) + if path: + with io.open(path, "w", encoding="utf-8") as f: + f.write(json_str) + return json_str + + @classmethod + def from_json(cls, json_str): + """ + Loads :class:`Vocab` from JSON string or JSON file, which is gotten by + calling :meth:`to_json` method. + + Args: + json_str (str): JSON string or file path of JSON string. + + Returns: + Vocab: An instance of :class:`Vocab` generated from information + contained in JSON string. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + json_str = vocab.to_json(path='./vocab.json') + + vocab1 = Vocab.from_json(json_str) + vocab2 = Vocab.from_json('./vocab.json') + print(len(vocab), len(vocab1), len(vocab2)) + # 1256608 1256608 1256608 + """ + if os.path.isfile(json_str): + with io.open(json_str, "r", encoding="utf-8") as f: + vocab_dict = json.load(f) + else: + vocab_dict = json.loads(json_str) + token_to_idx = vocab_dict.get("token_to_idx") + unk_token = vocab_dict.get("unk_token") + identifiers_to_tokens = vocab_dict.get("identifiers_to_tokens", dict()) + if "unk_token" in identifiers_to_tokens: + del identifiers_to_tokens["unk_token"] + vocab = cls(counter=None, token_to_idx=token_to_idx, unk_token=unk_token, **identifiers_to_tokens) + return vocab + + @classmethod + def from_dict(cls, token_to_idx, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs): + """ + Builds the :class:`Vocab` from a dict. + + Args: + token_to_idx (dict): A dict describes the mapping relationship between + tokens and indices. + unk_token (str, optional): The special token for unknow token. If + no need, it also could be None. Default: None. + pad_token (str, optional): The special token for padding token. If + no need, it also could be None. Default: None. + bos_token (str, optional): The special token for bos token. 
If no + need, it also could be None. Default: None. + eos_token (str, optional): The special token for eos token. If no + need, it also could be None. Default: None. + + kwargs (dict): Keyword arguments ending with `_token`. It can be + used to specify further special tokens that will be exposed as + attribute of the vocabulary and associated with an index. + + Returns: + Vocab: An instance of :class:`Vocab` generated from the given dict + and special tokens. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + + vocab1 = Vocab.from_dict(vocab.token_to_idx) + print(len(vocab), len(vocab.token_to_idx), len(vocab1)) + # 1256608 1256608 1256608 + """ + vocab = cls( + counter=None, + token_to_idx=token_to_idx, + unk_token=unk_token, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + **kwargs, + ) + return vocab + + @staticmethod + def build_vocab( + iterator, + max_size=None, + min_freq=1, + token_to_idx=None, + unk_token=None, + pad_token=None, + bos_token=None, + eos_token=None, + **kwargs + ): + """ + Builds the :class:`Vocab` accoring to given iterator and other + information. Firstly, iterate over the `iterator` to construct a + :class:`collections.Counter` and used to init the as :class:`Vocab`. + + Args: + iterator (collections.Iterable): Iterator of tokens. Each element + should be a list of tokens if wordlevel vocab is needed. + max_size (int, optional): The max size of vocab, not including + special tokens. Default: None. + min_freq (int, optional): Ignore tokens whose frequencies are less + than `min_freq`. Default: 1. + token_to_idx (dict, optional): A dict specifies the mapping + relationship between tokens and indices to be used. If provided, + adjust the tokens and indices mapping according to it. If None, + counter must be provided. Default: None. + unk_token (str, optional): The special token for unknow token + ''. If no need, it also could be None. Default: None. + pad_token (str, optional): The special token for padding token + ''. If no need, it also could be None. Default: None. + bos_token (str, optional): The special token for bos token ''. + If no need, it also could be None. Default: None. + eos_token (str, optional): The special token for eos token ''. + If no need, it also could be None. Default: None. + + kwargs (dict): Keyword arguments ending with `_token`. It can be + used to specify further special tokens that will be exposed as + attribute of the vocabulary and associated with an index. + + Returns: + Vocab: An instance of :class:`Vocab` generated from given iterator + and other informations. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. 
+ # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + + vocab1 = Vocab.build_vocab([list(vocab.token_to_idx.keys())]) + print(len(vocab), len(vocab1)) + # 1256608 1256608 + """ + counter = collections.Counter() + for tokens in iterator: + counter.update(tokens) + vocab = Vocab( + counter, + max_size=max_size, + min_freq=min_freq, + token_to_idx=token_to_idx, + unk_token=unk_token, + pad_token=pad_token, + bos_token=bos_token, + eos_token=eos_token, + **kwargs, + ) + return vocab + + @staticmethod + def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs): + """ + Builds the :class:`Vocab` from a file reserving all tokens by calling + :meth:`Vocab.from_dict` method. The file contains a token per line, and + the line index would be the index of corresponding token. + + Args: + filepath (str): the path of file to construct vocabulary. + unk_token (str, optional): special token for unknown token. If no + need, it also could be None. Default: None. + pad_token (str, optional): special token for padding token. If no + need, it also could be None. Default: None. + bos_token (str, optional): special token for bos token. If no need, + it also could be None. Default: None. + eos_token (str, optional): special token for eos token. If no need, + it also could be None. Default: None. + + kwargs (dict): Keyword arguments ending with `_token`. It can be + used to specify further special tokens that will be exposed as + attribute of the vocabulary and associated with an index. + + Returns: + Vocab: An instance of :class:`Vocab` generated from the given file. + + Example: + .. code-block:: python + + from paddlenlp.data import Vocab + # The vocab file. The sample file can be downloaded firstly. + # wget https://bj.bcebos.com/paddlenlp/data/senta_word_dict.txt + vocab_file_path = './senta_word_dict.txt' + # Initialize the Vocab + vocab = Vocab.load_vocabulary( + vocab_file_path, + unk_token='[UNK]', + pad_token='[PAD]') + print(len(vocab)) + # 1256608 + """ + token_to_idx = {} + with io.open(filepath, "r", encoding="utf-8") as f: + for index, line in enumerate(f): + token = line.rstrip("\n") + token_to_idx[token] = int(index) + vocab = Vocab.from_dict( + token_to_idx, unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs + ) + return vocab + + def save_vocabulary(self, filepath): + """ + Save the :class:`Vocab` to a specific file. Can be reloaded by calling `load_vocabulary`. + + Args: + filepath (str): the path of file to save vocabulary. 
+ """ + with open(filepath, "w") as f: + for idx in range(len(self._idx_to_token)): + f.write(self._idx_to_token[idx] + "\n") + + def get_unk_token_id(self): + return self._token_to_idx[self.unk_token] if self.unk_token is not None else self.unk_token + + def get_bos_token_id(self): + return self._token_to_idx[self.bos_token] if self.bos_token is not None else self.bos_token + + def get_eos_token_id(self): + return self._token_to_idx[self.eos_token] if self.eos_token is not None else self.eos_token + + def get_pad_token_id(self): + return self._token_to_idx[self.pad_token] if self.pad_token is not None else self.pad_token diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/__init__.py new file mode 100644 index 000000000..9d92b8040 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base_augment import FileAugment +from .char import * +from .sentence import * +from .word import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/base_augment.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/base_augment.py new file mode 100644 index 000000000..e00878d03 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/base_augment.py @@ -0,0 +1,241 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os +import re +from typing import Iterable + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..data import JiebaTokenizer, Vocab +from ..utils.env import DATA_HOME + + +class BaseAugment(object): + """ + A base class for data augmentation + + Args: + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented words in sequences. + aug_percent (int): + Percentage of augmented words in sequences. + aug_min (int): + Minimum number of augmented words in sequences. + aug_max (int): + Maximum number of augmented words in sequences. 
+ """ + + def __init__(self, create_n=1, aug_n=None, aug_percent=0.1, aug_min=1, aug_max=10, vocab="vocab"): + self._DATA = { + "stop_words": ( + "stopwords.txt", + "a4a76df756194777ca18cd788231b474", + "https://bj.bcebos.com/paddlenlp/data/stopwords.txt", + ), + "vocab": ( + "baidu_encyclopedia_w2v_vocab.json", + "25c2d41aec5a6d328a65c1995d4e4c2e", + "https://bj.bcebos.com/paddlenlp/data/baidu_encyclopedia_w2v_vocab.json", + ), + "test_vocab": ( + "test_vocab.json", + "1d2fce1c80a4a0ec2e90a136f339ab88", + "https://bj.bcebos.com/paddlenlp/data/test_vocab.json", + ), + "word_synonym": ( + "word_synonym.json", + "aaa9f864b4af4123bce4bf138a5bfa0d", + "https://bj.bcebos.com/paddlenlp/data/word_synonym.json", + ), + "word_embedding": ( + "word_embedding.json", + "534aa4ad274def4deff585cefd8ead32", + "https://bj.bcebos.com/paddlenlp/data/word_embedding.json", + ), + "word_homonym": ( + "word_homonym.json", + "a578c04201a697e738f6a1ad555787d5", + "https://bj.bcebos.com/paddlenlp/data/word_homonym.json", + ), + "char_homonym": ( + "char_homonym.json", + "dd98d5d5d32a3d3dd45c8f7ca503c7df", + "https://bj.bcebos.com/paddlenlp/data/char_homonym.json", + ), + "char_antonym": ( + "char_antonym.json", + "f892f5dce06f17d19949ebcbe0ed52b7", + "https://bj.bcebos.com/paddlenlp/data/char_antonym.json", + ), + "word_antonym": ( + "word_antonym.json", + "cbea11fa99fbe9d07e8185750b37e84a", + "https://bj.bcebos.com/paddlenlp/data/word_antonym.json", + ), + } + self.stop_words = self._get_data("stop_words") + self.aug_n = aug_n + self.aug_percent = aug_percent + self.aug_min = aug_min + self.aug_max = aug_max + self.create_n = create_n + self.vocab = Vocab.from_json(self._load_file(vocab)) + self.tokenizer = JiebaTokenizer(self.vocab) + self.loop = 5 + + @classmethod + def clean(cls, sequences): + """Clean input sequences""" + if isinstance(sequences, str): + return sequences.strip() + if isinstance(sequences, Iterable): + return [str(s).strip() if s else s for s in sequences] + return str(sequences).strip() + + def _load_file(self, mode): + """Check and download data""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, url = self._DATA[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(url, default_root, data_hash) + + return fullname + + def _get_data(self, mode): + """Read data as list""" + fullname = self._load_file(mode) + data = [] + if os.path.exists(fullname): + with open(fullname, "r", encoding="utf-8") as f: + for line in f: + data.append(line.strip()) + f.close() + else: + raise ValueError("The {} should exist.".format(fullname)) + + return data + + def _get_aug_n(self, size, size_a=None): + """Calculate number of words for data augmentation""" + if size == 0: + return 0 + aug_n = self.aug_n or int(math.ceil(self.aug_percent * size)) + if self.aug_min and aug_n < self.aug_min: + aug_n = self.aug_min + elif self.aug_max and aug_n > self.aug_max: + aug_n = self.aug_max + if size_a is not None: + aug_n = min(aug_n, int(math.floor(size_a * 0.3))) + return aug_n + + def _skip_stop_word_tokens(self, seq_tokens): + """Skip words. 
We can rewrite function to skip specify words.""" + indexes = [] + for i, seq_token in enumerate(seq_tokens): + if ( + seq_token not in self.stop_words + and not seq_token.isdigit() + and not bool(re.search(r"\d", seq_token)) + and not seq_token.encode("UTF-8").isalpha() + ): + indexes.append(i) + return indexes + + def augment(self, sequences, num_thread=1): + """ + Apply augmentation strategy on input sequences. + + Args: + sequences (str or list(str)): + Input sequence or list of input sequences. + num_thread (int): + Number of threads + """ + sequences = self.clean(sequences) + # Single Thread + if num_thread == 1: + if isinstance(sequences, str): + return [self._augment(sequences)] + else: + output = [] + for sequence in sequences: + output.append(self._augment(sequence)) + return output + else: + raise NotImplementedError + + def _augment(self, sequence): + raise NotImplementedError + + +class FileAugment(object): + """ + File data augmentation + + Args: + strategies (List): + List of augmentation strategies. + """ + + def __init__(self, strategies): + self.strategies = strategies + + def augment(self, input_file, output_file="aug.txt", separator=None, separator_id=0): + output_sequences = [] + sequences = [] + + input_sequences = self.file_read(input_file) + + if separator: + for input_sequence in input_sequences: + sequences.append(input_sequence.split(separator)[separator_id]) + else: + sequences = input_sequences + + for strategy in self.strategies: + aug_sequences = strategy.augment(sequences) + if separator: + for aug_sequence, input_sequence in zip(aug_sequences, input_sequences): + input_items = input_sequence.split(separator) + for s in aug_sequence: + input_items[separator_id] = s + output_sequences.append(separator.join(input_items)) + else: + for aug_sequence in aug_sequences: + output_sequences += aug_sequence + + if output_file: + self.file_write(output_sequences, output_file) + + return output_sequences + + def file_read(self, input_file): + input_sequences = [] + with open(input_file, "r", encoding="utf-8") as f: + for line in f: + input_sequences.append(line.strip()) + f.close() + return input_sequences + + def file_write(self, output_sequences, output_file): + with open(output_file, "w", encoding="utf-8") as f: + for output_sequence in output_sequences: + f.write(output_sequence + "\n") + f.close() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/char.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/char.py new file mode 100644 index 000000000..dbfc3b61c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/char.py @@ -0,0 +1,570 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
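# --- Illustrative sketch: FileAugment over a labeled TSV file ---
# A hedged usage example of the FileAugment/strategy API defined in base_augment.py
# above. It assumes the canonical `paddlenlp.dataaug` import path and a hypothetical
# input file `train.txt` whose lines look like "<text>\t<label>"; only column 0 is
# augmented and the label is carried over unchanged. By default roughly 10% of the
# eligible positions are changed (aug_percent=0.1), clamped to [aug_min, aug_max]
# and to at most 30% of the non-stop-word positions (see _get_aug_n above).
from paddlenlp.dataaug import CharDelete, CharSwap, FileAugment

strategies = [CharSwap(create_n=1), CharDelete(create_n=1, aug_percent=0.1)]
aug = FileAugment(strategies)

# For each input line, augment the text field and re-join it with its label.
out = aug.augment("train.txt", output_file="train_aug.txt", separator="\t", separator_id=0)
print(len(out), "augmented lines written to train_aug.txt")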
+import json +import os +import random +from typing import Iterable + +import numpy as np +import paddle + +from ..transformers import AutoModelForMaskedLM, AutoTokenizer +from .base_augment import BaseAugment + +__all__ = ["CharSubstitute", "CharInsert", "CharSwap", "CharDelete"] + + +class CharSubstitute(BaseAugment): + """ + CharSubstitute is a char-level substitution data augmentation strategy + that supports replacing characters in the input sequence based on existing + dictionaries or custom dictionaries. + + Args: + aug_type (str or list(str)): + Substitution dictionary type + custom_file_path (str, optional): + Custom substitution dictionary file path + delete_file_path (str, optional): + Dictionary file path for deleting characters in substitution dictionary + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented characters in sequences. + aug_percent (int): + Percentage of augmented characters in sequences. + aug_min (int): + Minimum number of augmented characters in sequences. + aug_max (int): + Maximum number of augmented characters in sequences. + model_name (str): + Model parameter name for MLM prediction task. + """ + + def __init__( + self, + aug_type, + custom_file_path=None, + delete_file_path=None, + create_n=1, + aug_n=None, + aug_percent=0.1, + aug_min=1, + aug_max=10, + model_name="ernie-1.0-large-zh-cw", + vocab="vocab", + ): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=aug_percent, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + self.custom_file_path = custom_file_path + self.delete_file_path = delete_file_path + self.model_name = model_name + + if isinstance(aug_type, str): + self.type = aug_type + if aug_type in ["antonym", "homonym", "custom"]: + self.dict = self._load_substitue_dict(aug_type) + elif aug_type in ["mlm"]: + self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) + self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + elif isinstance(aug_type, Iterable): + if len(aug_type) == 1: + self.type = aug_type[0] + else: + self.type = "combination" + if self.type in ["mlm"]: + self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) + self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.dict = {} + # Merge dictionaries from different sources + for t in aug_type: + if t in ["antonym", "homonym", "custom"]: + t_dict = self._load_substitue_dict(t) + for k in t_dict: + if k in self.dict: + self.dict[k] = list(set(self.dict[k] + t_dict[k])) + else: + self.dict[k] = t_dict[k] + else: + self.type = aug_type + + def _load_substitue_dict(self, source_type): + """Load substitution dictionary""" + if source_type in ["antonym", "homonym"]: + fullname = self._load_file("char_" + source_type) + elif source_type in ["custom"]: + fullname = self.custom_file_path + elif source_type in ["delete"]: + fullname = self.delete_file_path + + if os.path.exists(fullname): + with open(fullname, "r", encoding="utf-8") as f: + substitue_dict = json.load(f) + f.close() + else: + raise ValueError("The {} should exist.".format(fullname)) + + return substitue_dict + + def _generate_sequence(self, output_seq_tokens, aug_tokens): + """Genearte the sequences according to the mapping list""" + for aug_token in aug_tokens: + idx, token = aug_token + output_seq_tokens[int(idx)] = token + return "".join(output_seq_tokens) + + def _augment(self, sequence): + seq_tokens = [s for s in sequence] + aug_indexes = self._skip_stop_word_tokens(seq_tokens) + aug_n = 
self._get_aug_n(len(seq_tokens), len(aug_indexes)) + p = None + + if aug_n == 0: + return [] + elif self.type == "mlm": + return self._augment_mlm(sequence, seq_tokens, aug_indexes, p) + elif aug_n == 1: + return self._augment_single(seq_tokens, aug_indexes, p) + else: + return self._augment_multi(seq_tokens, aug_n, aug_indexes, p) + + @paddle.no_grad() + def _augment_mlm(self, sequence, seq_tokens, aug_indexes, p): + t = 0 + sentences = [] + while t < self.create_n * self.loop * 2 and len(sentences) < self.create_n: + skip = False + t += 1 + idx = np.random.choice(aug_indexes, replace=False, p=p) + + aug_tokens = [[idx, "[MASK]" * len(seq_tokens[idx])]] + sequence_mask = self._generate_sequence(seq_tokens.copy(), aug_tokens) + tokenized = self.mlm_tokenizer(sequence_mask) + masked_positions = [ + i for i, idx in enumerate(tokenized["input_ids"]) if idx == self.mlm_tokenizer.mask_token_id + ] + + output = self.mlm_model( + paddle.to_tensor([tokenized["input_ids"]]), paddle.to_tensor([tokenized["token_type_ids"]]) + ) + predicted = "".join( + self.mlm_tokenizer.convert_ids_to_tokens(paddle.argmax(output[0][masked_positions], axis=-1)) + ) + for ppp in predicted: + if ppp in self.stop_words: + skip = True + break + if skip: + continue + aug_tokens = [[idx, predicted]] + sequence_generate = self._generate_sequence(seq_tokens.copy(), aug_tokens) + if sequence_generate != sequence and sequence_generate not in sentences: + sentences.append(sequence_generate) + return sentences + + def _augment_multi(self, seq_tokens, aug_n, aug_indexes, p): + sentences = [] + aug_n = min(aug_n, len(aug_indexes)) + if self.type in ["antonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + pp = [] + for i, aug_index in enumerate(aug_indexes): + if seq_tokens[aug_index] in self.dict: + candidate_tokens.append([aug_index, self.dict[seq_tokens[aug_index]]]) + pp = np.array(pp) + pp /= sum(pp) + aug_n = min(aug_n, len(candidate_tokens)) + if aug_n != 0: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(list(range(len(candidate_tokens))), aug_n) + aug_tokens = [] + for idx in idxes: + aug_index, aug_dict = candidate_tokens[idx] + aug_tokens.append([aug_index, random.sample(aug_dict, 1)[0]]) + + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens) + if sentence not in sentences: + sentences.append(sentence) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + aug_tokens = [] + aug_choice_indexes = np.random.choice(aug_indexes, size=aug_n, replace=False, p=p) + for aug_index in aug_choice_indexes: + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2))[0] + aug_tokens.append([aug_index, token]) + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens) + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _augment_single(self, seq_tokens, aug_indexes, p): + sentences = [] + aug_tokens = [] + if self.type in ["antonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + pp = [] + for i, aug_index in enumerate(aug_indexes): + if seq_tokens[aug_index] in self.dict: + for token in self.dict[seq_tokens[aug_index]]: + candidate_tokens.append([aug_index, token]) + pp.append(p[i] / len(self.dict[seq_tokens[aug_index]])) + create_n = min(self.create_n, len(candidate_tokens)) + pp = np.array(pp) + pp /= sum(pp) + aug_tokens = random.sample(candidate_tokens, create_n) + elif self.type in ["random"]: 
+ t = 0 + while t < self.create_n * self.loop and len(aug_tokens) < self.create_n: + t += 1 + aug_index = np.random.choice(aug_indexes, replace=False, p=p) + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2))[0] + if [aug_index, token] not in aug_tokens: + aug_tokens.append([aug_index, token]) + for aug_token in aug_tokens: + sequence_generate = self._generate_sequence(seq_tokens.copy(), [aug_token]) + sentences.append(sequence_generate) + + return sentences + + +class CharInsert(BaseAugment): + """ + CharInsert is a character-level insert data augmentation strategy. + + Args: + aug_type (str or list(str)): + Insert dictionary type + custom_file_path (str, optional): + Custom insert dictionary file path + delete_file_path (str, optional): + Dictionary file path for deleting characters in insert dictionary + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented characters in sequences. + aug_percent (int): + Percentage of augmented characters in sequences. + aug_min (int): + Minimum number of augmented characters in sequences. + aug_max (int): + Maximum number of augmented characters in sequences. + """ + + def __init__( + self, + aug_type, + custom_file_path=None, + delete_file_path=None, + create_n=1, + aug_n=None, + aug_percent=0.1, + aug_min=1, + aug_max=10, + model_name="ernie-1.0-large-zh-cw", + vocab="vocab", + ): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=aug_percent, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + self.custom_file_path = custom_file_path + self.delete_file_path = delete_file_path + self.model_name = model_name + if isinstance(aug_type, str): + self.type = aug_type + if aug_type in ["antonym", "homonym", "custom"]: + self.dict = self._load_insert_dict(aug_type) + elif aug_type in ["mlm"]: + self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) + self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + elif isinstance(aug_type, Iterable): + self.type = "combination" + self.dict = {} + # Merge dictionaries from different sources + for t in aug_type: + if t in ["antonym", "homonym", "custom"]: + t_dict = self._load_insert_dict(t) + for k in t_dict: + if k in self.dict: + self.dict[k] = list(set(self.dict[k] + t_dict[k])) + else: + self.dict[k] = t_dict[k] + else: + self.type = aug_type + + def _load_insert_dict(self, source_type): + """Load insert dictionary""" + if source_type in ["antonym", "homonym"]: + fullname = self._load_file("char_" + source_type) + elif source_type in ["custom"]: + fullname = self.custom_file_path + elif source_type in ["delete"]: + fullname = self.delete_file_path + if os.path.exists(fullname): + with open(fullname, "r", encoding="utf-8") as f: + insert_dict = json.load(f) + f.close() + else: + raise ValueError("The {} should exist.".format(fullname)) + return insert_dict + + def _augment(self, sequence): + seq_tokens = [s for s in sequence] + aug_indexes = self._skip_stop_word_tokens(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + if aug_n == 0: + return [] + elif self.type == "mlm": + return self._augment_mlm(sequence, seq_tokens, aug_indexes) + elif aug_n == 1: + return self._augment_single(seq_tokens, aug_indexes) + else: + return self._augment_multi(seq_tokens, aug_n, aug_indexes) + + @paddle.no_grad() + def _augment_mlm(self, sequence, seq_tokens, aug_indexes): + + t = 0 + sentences = [] + while t < self.create_n * self.loop and len(sentences) < self.create_n: + skip = False + t += 1 + p = 
random.randint(0, 1) + idx = random.sample(aug_indexes, 1)[0] + aug_tokens = [[idx, "[MASK]" * len(seq_tokens[idx])]] + sequence_mask = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + tokenized = self.mlm_tokenizer(sequence_mask) + masked_positions = [ + i for i, idx in enumerate(tokenized["input_ids"]) if idx == self.mlm_tokenizer.mask_token_id + ] + output = self.mlm_model( + paddle.to_tensor([tokenized["input_ids"]]), paddle.to_tensor([tokenized["token_type_ids"]]) + ) + predicted = "".join( + self.mlm_tokenizer.convert_ids_to_tokens(paddle.argmax(output[0][masked_positions], axis=-1)) + ) + for p in predicted: + if p in self.stop_words: + skip = True + break + if skip: + continue + + aug_tokens = [[idx, predicted]] + + sequence_generate = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + if sequence_generate != sequence and sequence_generate not in sentences: + sentences.append(sequence_generate) + return sentences + + def _augment_multi(self, seq_tokens, aug_n, aug_indexes): + sentences = [] + if self.type in ["antonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + for aug_index in aug_indexes: + if seq_tokens[aug_index] in self.dict: + candidate_tokens.append([aug_index, self.dict[seq_tokens[aug_index]]]) + aug_n = min(aug_n, len(candidate_tokens)) + if aug_n != 0: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(list(range(len(candidate_tokens))), aug_n) + aug_tokens = [] + for idx in idxes: + aug_index, aug_dict = candidate_tokens[idx] + aug_tokens.append([aug_index, random.sample(aug_dict, 1)[0]]) + p = random.randint(0, 1) + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + if sentence not in sentences: + sentences.append(sentence) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + aug_tokens = [] + aug_indexes = random.sample(aug_indexes, aug_n) + for aug_index in aug_indexes: + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2))[0] + aug_tokens.append([aug_index, token]) + p = random.randint(0, 1) + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _augment_single(self, seq_tokens, aug_indexes): + + sentences = [] + aug_tokens = [] + if self.type in ["antonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + for aug_index in aug_indexes: + if seq_tokens[aug_index] in self.dict: + for token in self.dict[seq_tokens[aug_index]]: + candidate_tokens.append([aug_index, token]) + create_n = min(self.create_n, len(candidate_tokens)) + aug_tokens = random.sample(candidate_tokens, create_n) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(aug_tokens) < self.create_n: + t += 1 + aug_index = random.sample(aug_indexes, 1)[0] + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2))[0] + if [aug_index, token] not in aug_tokens: + aug_tokens.append([aug_index, token]) + for aug_token in aug_tokens: + p = random.randint(0, 1) + sentences.append(self._generate_sequence(seq_tokens.copy(), [aug_token], p)) + return sentences + + def _generate_sequence(self, output_seq_tokens, aug_tokens, p): + """Genearte the sequences according to the mapping list""" + for aug_token in aug_tokens: + idx, token = aug_token + if p == 0: + output_seq_tokens[idx] = token + output_seq_tokens[idx] + else: + output_seq_tokens[idx] += 
token + return "".join(output_seq_tokens) + + +class CharSwap(BaseAugment): + """ + CharSwap is a character-level swap data augmentation strategy. + + Args: + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented characters in sequences. + aug_percent (int): + Percentage of augmented characters in sequences. + aug_min (int): + Minimum number of augmented characters in sequences. + aug_max (int): + Maximum number of augmented characters in sequences. + """ + + def __init__(self, create_n=1, aug_n=None, aug_percent=None, aug_min=1, aug_max=10, vocab="vocab"): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=0.1, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + def _augment(self, sequence): + + seq_tokens = [s for s in sequence] + aug_indexes = self._skip_chars(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + + t = 0 + sentences = [] + + if aug_n == 0: + return [] + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(aug_indexes, aug_n) + output_seq_tokens = seq_tokens.copy() + for idx in range(len(seq_tokens)): + if idx in idxes: + output_seq_tokens[idx], output_seq_tokens[idx + 1] = ( + output_seq_tokens[idx + 1], + output_seq_tokens[idx], + ) + sentence = "".join(output_seq_tokens) + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _skip_chars(self, seq_tokens): + """Skip specific characters.""" + indexes = [] + for i, seq_token in enumerate(seq_tokens[:-1]): + if ( + seq_token not in self.stop_words + and not seq_token.isdigit() + and not seq_token.encode("UTF-8").isalpha() + ): + if ( + seq_tokens[i + 1] not in self.stop_words + and not seq_tokens[i + 1].isdigit() + and not seq_tokens[i + 1].encode("UTF-8").isalpha() + ): + indexes.append(i) + return indexes + + +class CharDelete(BaseAugment): + """ + CharDelete is a character-level deletion data augmentation strategy. + + Args: + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented characters in sequences. + aug_percent (int): + Percentage of augmented characters in sequences. + aug_min (int): + Minimum number of augmented characters in sequences. + aug_max (int): + Maximum number of augmented characters in sequences. 
+ """ + + def __init__(self, create_n=1, aug_n=None, aug_percent=0.1, aug_min=1, aug_max=10, vocab="vocab"): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=aug_percent, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + def _augment(self, sequence): + + seq_tokens = [s for s in sequence] + aug_indexes = self._skip_chars(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + + t = 0 + sentences = [] + if aug_n == 0: + return sentences + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(aug_indexes, aug_n) + sentence = "" + for idx in range(len(seq_tokens)): + if idx not in idxes: + sentence += seq_tokens[idx] + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _skip_chars(self, seq_tokens): + """Skip specific characters.""" + indexes = [] + for i, seq_token in enumerate(seq_tokens): + if seq_token in self.stop_words or seq_token.isdigit() or seq_token.encode("UTF-8").isalpha(): + continue + elif i != 0 and seq_tokens[i - 1].isdigit(): + continue + elif i != len(seq_tokens) - 1 and seq_tokens[i + 1].isdigit(): + continue + else: + indexes.append(i) + return indexes diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/sentence.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/sentence.py new file mode 100644 index 000000000..e41bf98b1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/sentence.py @@ -0,0 +1,552 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle + +from ..taskflow import Taskflow +from ..transformers import ( + AutoModelForCausalLM, + AutoModelForConditionalGeneration, + AutoTokenizer, +) + +__all__ = [ + "SentenceGenerate", + "SentenceSummarize", + "SentenceBackTranslate", + "SentenceBackTranslateAPI", + "SentenceContinue", +] + + +class SentenceGenerate: + """ + SentenceGenerate is a sentence-level data augmentation strategy + that generates simialr sentences according to the input sequence. + The strattegy first generates several sentences, and then chooses + the top n simialr sentences by the model. + + Args: + model_name (str): + Model parameter name for generation task. + create_n (int): + Number of augmented sequences. + generate_n (int): + Number of generated sequences. + max_length (int): + The max length of the prediction. + top_p (float): The cumulative probability for + top-p-filtering in the "sampling" strategy. The value should + satisfy 0 <= top_p < 1. Default to 0.95. 
+ """ + + def __init__( + self, model_name="roformer-chinese-sim-char-base", create_n=1, generate_n=5, max_length=128, top_p=0.95 + ): + self.model_name = model_name + self.create_n = create_n + self.generate_n = generate_n + self.max_length = max_length + self.top_p = top_p + + self.model = AutoModelForCausalLM.from_pretrained(self.model_name) + self.model.eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + + def augment(self, sequences): + """ + Apply augmentation strategy on input sequences. + + Args: + sequences (str or list(str)): + Input sequence or list of input sequences. + + """ + if isinstance(sequences, str): + sequences = [sequences] + augmented_sequences = [] + for sequence in sequences: + augmented_sequences.append(self._generate_similar_sentence(sequence, self.model, self.tokenizer)) + return augmented_sequences + + @paddle.no_grad() + def _generate_similar_sentence(self, sequence, model, tokenizer): + """Generates generate_n similar sentences from the provided sequence, and chooose the best create_n similar sentences.""" + + # Generate generate_n similar sentences + generated_sequences = [sequence] + tokenized_input = tokenizer(sequence, return_tensors="pd", padding=True) + decoded_outputs = tokenizer.batch_decode( + model.generate( + **tokenized_input, + num_return_sequences=self.generate_n, + top_p=self.top_p, + decode_strategy="sampling", + max_length=self.max_length, + )[0], + skip_special_tokens=True, + ) + for decoded_output in decoded_outputs: + s = decoded_output.replace(" ", "").replace(sequence, "") + if s not in generated_sequences and len(s) > 0: + generated_sequences.append(s) + tokenized_output = tokenizer(generated_sequences, return_tensors="pd", padding=True) + + # Choose best create_n similar sentences + tokenized_output = tokenizer(generated_sequences, return_tensors="pd", padding=True) + Z = model.roformer(**tokenized_output)[1].cpu().numpy() + Z /= (Z**2).sum(axis=1, keepdims=True) ** 0.5 + + return [generated_sequences[i + 1] for i in np.dot(Z[1:], -Z[0]).argsort()[: self.create_n]] + + +class SentenceSummarize: + """ + SentenceSummarize is a sentence-level data augmentation strategy + that summarizes the input sequence. + + Args: + create_n (int): + Number of augmented sequences. + max_length (int): + The max length of the summarization. + batch_size(int): + The sample number of a mini-batch. + top_k (int): The number of highest probability tokens to + keep for top-k-filtering in the "sampling" strategy. Default to + 0, which means no effect. + top_p (float): The cumulative probability for + top-p-filtering in the "sampling" strategy. The value should + satisfy 0 <= top_p < 1. Default to 1.0, which means no + effect. + temperature (float): The value used to module the next + token probabilities in the "sampling" strategy. Default to 1.0, + which means no effect. + use_fp16_decoding: (bool): Whether to use fp16 for decoding. + Only works when faster entry is avalible. Default to False. 
+ kwargs (dict): Additional keyword arguments refer to ..taskflow.text_summarization.TextSummarization + """ + + def __init__( + self, + create_n=1, + max_length=128, + batch_size=1, + top_k=5, + top_p=1.0, + temperature=1.0, + use_fp16_decoding=False, + **kwargs + ): + + kwargs.setdefault("num_return_sequences", create_n) + kwargs.setdefault("num_beams", create_n * 4) + kwargs.setdefault("max_length", max_length) + kwargs.setdefault("batch_size", batch_size) + kwargs.setdefault("top_k", top_k) + kwargs.setdefault("top_p", top_p) + kwargs.setdefault("temperature", temperature) + kwargs.setdefault("use_fp16_decoding", use_fp16_decoding) + + self.create_n = kwargs["num_return_sequences"] + self.summarization = Taskflow("text_summarization", **kwargs) + + def augment(self, sequences): + """ + Apply augmentation strategy on input sequences. + + Args: + sequences (str or list(str)): + Input sequence or list of input sequences. + + """ + if isinstance(sequences, str): + sequences = [sequences] + augmented_sequences = self.summarization(sequences) + return [augmented_sequences[i * self.create_n : (i + 1) * self.create_n] for i in range(len(sequences))] + + +class SentenceBackTranslate: + """ + SentenceBackTranslate is a sentence-level data augmentation strategy + that translates the input sequence into one langugage, and backtranslate + back into the sourche language by the language models. + + Args: + src_lang (str): + The source language of the input sequences. + tgt_lang (str): + The target language of the translated sequences. + max_length (int): + The max length of the translation. + batch_size(int): + The sample number of a mini-batch. + num_beams (int): The number of beams in the "beam_search" + strategy. Default to 4. + use_faster: (bool): Whether to use faster entry of model + for FasterGeneration. Default to False (already deprecated). + decode_strategy (str, optional): The decoding strategy in generation. + Currently, there are three decoding strategies supported: + "greedy_search", "sampling" and "beam_search". Default to + "beam_search". 
+ """ + + def __init__( + self, + src_lang="zh", + tgt_lang="en", + max_length=128, + batch_size=1, + num_beams=4, + use_faster=False, + decode_strategy="beam_search", + from_model_name=None, + to_model_name=None, + ): + self.src_lang = src_lang + self.tgt_lang = tgt_lang + self.max_length = max_length + self.batch_size = batch_size + self.num_beams = num_beams + self.decode_strategy = decode_strategy + self.from_model_name = from_model_name + self.to_model_name = to_model_name + self.MBART_MAP = { + "ar": "ar_AR", + "cs": "cs_CZ", + "de": "de_DE", + "en": "en_XX", + "es": "es_XX", + "et": "et_EE", + "fi": "fi_FI", + "fr": "fr_XX", + "gu": "gu_IN", + "hi": "hi_IN", + "it": "it_IT", + "ja": "ja_XX", + "kk": "kk_KZ", + "ko": "ko_KR", + "lt": "lt_LT", + "lv": "lv_LV", + "my": "my_MM", + "ne": "ne_NP", + "nl": "nl_XX", + "ro": "ro_RO", + "ru": "ru_RU", + "si": "si_LK", + "tr": "tr_TR", + "vi": "vi_VN", + "zh": "zh_CN", + "af": "af_ZA", + "az": "az_AZ", + "bn": "bn_IN", + "fa": "fa_IR", + "he": "he_IL", + "hr": "hr_HR", + "id": "id_ID", + "ka": "ka_GE", + "km": "km_KH", + "mk": "mk_MK", + "ml": "ml_IN", + "mn": "mn_MN", + "mr": "mr_IN", + "pl": "pl_PL", + "ps": "ps_AF", + "pt": "pt_XX", + "sv": "sv_SE", + "sw": "sw_KE", + "ta": "ta_IN", + "te": "te_IN", + "th": "th_TH", + "tl": "tl_XX", + "uk": "uk_UA", + "ur": "ur_PK", + "xh": "xh_ZA", + "gl": "gl_ES", + "sl": "sl_SI", + } + if self.from_model_name is None: + if tgt_lang == "en": + self.from_model_name = "mbart-large-50-many-to-one-mmt" + else: + self.from_model_name = "mbart-large-50-many-to-many-mmt" + + if to_model_name is None: + if tgt_lang == "en": + self.to_model_name = "mbart-large-50-one-to-many-mmt" + else: + self.to_model_name = "mbart-large-50-many-to-many-mmt" + + self.from_model = AutoModelForConditionalGeneration.from_pretrained(self.from_model_name) + self.to_model = AutoModelForConditionalGeneration.from_pretrained(self.to_model_name) + self.from_tokenizer = AutoTokenizer.from_pretrained(self.from_model_name, src_lang=self.MBART_MAP[src_lang]) + self.to_tokenizer = AutoTokenizer.from_pretrained(self.to_model_name, src_lang=self.MBART_MAP[tgt_lang]) + self.from_model.eval() + self.to_model.eval() + + def augment(self, sequences): + """ + Apply augmentation strategy on input sequences. + + Args: + sequences (str or list(str)): + Input sequence or list of input sequences. 
+ + """ + if isinstance(sequences, str): + sequences = [sequences] + sequences = self._translate(self.from_model, self.from_tokenizer, sequences, self.tgt_lang) + sequences = self._translate(self.to_model, self.to_tokenizer, sequences, self.src_lang) + return [[sequence] for sequence in sequences] + + @paddle.no_grad() + def _translate(self, model, tokenizer, sequences, lang): + batched_inputs = [sequences[idx : idx + self.batch_size] for idx in range(0, len(sequences), self.batch_size)] + translated_texts = [] + eos_id = model.mbart.config["eos_token_id"] + for batched_input in batched_inputs: + tokenized_input = tokenizer(batched_input, return_tensors="pd", padding=True)["input_ids"] + outputs = model.generate( + input_ids=tokenized_input, + forced_bos_token_id=tokenizer.lang_code_to_id[self.MBART_MAP[lang]], + decode_strategy=self.decode_strategy, + num_beams=self.num_beams, + max_length=self.max_length, + )[0] + for output in outputs: + eos = np.where(output.cpu().numpy() == eos_id)[0] + if len(eos) == 0: + eos_pos = len(output) - 1 + else: + eos_pos = eos[0] + translated_texts.append(tokenizer.convert_ids_to_string(output[1:eos_pos])) + return translated_texts + + +class SentenceBackTranslateAPI: + """ + SentenceBackTranslateAPI is a sentence-level data augmentation strategy + that translates the input sequence into one langugage, and backtranslate + back into the sourche language by baidu translate api. + + Args: + src_lang (str): + The source language of the input sequences. + tgt_lang (str): + The target language of the translated sequences. + appid (str): + Appid for requesting Baidu translation service. (if use your own appid/appkey) + secretKey (str): + Secret key for requesting Baidu translation service. (if use your own appid/appkey) + qps (int): + Queries per second. (if use your own appid/appkey) + """ + + def __init__(self, src_lang="zh", tgt_lang="en", appid=None, secretKey=None, qps=1): + + self.src_lang = src_lang + self.tgt_lang = tgt_lang + self.appid = appid + self.secretKey = secretKey + self.qps = qps + self.url = "http://api.fanyi.baidu.com/api/trans/vip/translate" + + def augment(self, sequences): + """ + Apply augmentation strategy on input sequences. + + Args: + sequences (str or list(str)): + Input sequence or list of input sequences. 
+ + """ + if isinstance(sequences, str): + sequences = [sequences] + if self.appid is None or self.secretKey is None: + return self._back_translate_hub(sequences) + else: + return self._back_translate_api(sequences) + + def _back_translate_hub(self, sequences): + try: + import paddlehub as hub + except ImportError: + print(" PaddleHub not installed!") + import os + + os.system("pip install paddlehub==2.3.1") + import paddlehub as hub + + module = hub.Module(name="baidu_translate") + translated_texts = [] + for sequence in sequences: + sequence = module.translate(sequence, self.src_lang, self.tgt_lang) + sequence = module.translate(sequence, self.tgt_lang, self.src_lang) + translated_texts.append([sequence]) + return translated_texts + + def _back_translate_api(self, sequences): + + translated_texts = [] + for sequence in sequences: + sequence = self._translate_api(sequence, self.src_lang, self.tgt_lang) + sequence = self._translate_api(sequence, self.tgt_lang, self.src_lang) + translated_texts.append(sequence) + return translated_texts + + def _translate_api(self, query, from_lang, to_lang): + + import hashlib + import random + import time + + import requests + + # Generate salt and sign + salt = str(random.randint(32768, 65536)) + sign = self.appid + query + salt + self.secretKey + sign = hashlib.md5(sign.encode("utf-8")).hexdigest() + + # Build request + headers = {"Content-Type": "application/x-www-form-urlencoded"} + payload = { + "appid": f"{self.appid}", + "q": f"{query}", + "from": from_lang, + "to": to_lang, + "salt": f"{salt}", + "sign": f"{sign}", + } + + # Send request + time.sleep(1 / self.qps) + try: + r = requests.post(self.url, params=payload, headers=headers) + result = r.json() + except Exception as e: + error_msg = str(e) + raise RuntimeError(error_msg) + if "error_code" in result: + raise RuntimeError(result) + return result["trans_result"][0]["dst"] + + +class SentenceContinue: + """ + SentenceContinue is a sentence-level data augmentation strategy + that generates continuation for the input sequence. + + Args: + model_name (str): + Model parameter name for summarization task. + max_length (int): + The max length of the summarization. + decode_strategy (str, optional): The decoding strategy in generation. + Currently, there are three decoding strategies supported: + "greedy_search", "sampling" and "beam_search". Default to + "beam_search". + use_faster: (bool): Whether to use faster entry of model + for FasterGeneration. Default to False (already deprecated). + create_n (int): + Number of augmented sequences. + batch_size(int): + The sample number of a mini-batch. + top_k (int): The number of highest probability tokens to + keep for top-k-filtering in the "sampling" strategy. Default to + 0, which means no effect. + top_p (float): The cumulative probability for + top-p-filtering in the "sampling" strategy. The value should + satisfy 0 <= top_p < 1. Default to 1.0, which means no + effect. + temperature (float): The value used to module the next + token probabilities in the "sampling" strategy. Default to 1.0, + which means no effect. 
+ """ + + def __init__( + self, + model_name="gpt-cpm-small-cn-distill", + max_length=64, + decode_strategy="sampling", + use_faster=False, + create_n=1, + top_k=50, + temperature=1.0, + top_p=0.9, + batch_size=1, + ): + self.model_name = model_name + self.max_length = max_length + self.decode_strategy = decode_strategy + self.create_n = create_n + self.top_k = top_k + self.temperature = temperature + self.top_p = top_p + self.batch_size = batch_size + + self.model = AutoModelForCausalLM.from_pretrained(self.model_name) + self.model.eval() + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.tokenizer.add_special_tokens( + {"pad_token": self.tokenizer.convert_ids_to_tokens(self.model.config.pad_token_id)} + ) + + def augment(self, sequences): + """ + Apply augmentation strategy on input sequences. + + Args: + sequences (str or list(str)): + Input sequence or list of input sequences. + + """ + if isinstance(sequences, str): + sequences = [sequences] + return self._generate_continue(sequences, self.model, self.tokenizer) + + @paddle.no_grad() + def _generate_continue(self, sequences, model, tokenizer): + batched_inputs = [sequences[idx : idx + self.batch_size] for idx in range(0, len(sequences), self.batch_size)] + generated_sequences = [] + for batched_input in batched_inputs: + tokenized_inputs = tokenizer( + batched_input, return_tensors="pd", padding=True, return_attention_mask=True, return_position_ids=True + ) + outputs = model.generate( + **tokenized_inputs, + max_length=self.max_length, + decode_strategy=self.decode_strategy, + num_return_sequences=self.create_n, + top_k=self.top_k, + temperature=self.temperature, + top_p=self.top_p, + )[0] + for i in range(outputs.shape[0]): + output = outputs[i].cpu().numpy() + eos = np.where(output == model.config.eos_token_id)[0] + if len(eos) == 0: + eos_pos = len(output) - 1 + else: + eos_pos = eos[0] + generated_sequences.append(tokenizer.convert_ids_to_string(output[:eos_pos].tolist())) + augmented_sequences = [] + for i, sequence in enumerate(sequences): + augmented_sequence = [] + for ii in range(self.create_n): + continue_sequence = ( + generated_sequences[i * self.create_n + ii].replace(" ", "").replace("\n", "").replace("\t", "") + ) + augmented_sequence.append(sequence + continue_sequence) + augmented_sequences.append(augmented_sequence) + return augmented_sequences diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/word.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/word.py new file mode 100644 index 000000000..438935e54 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/dataaug/word.py @@ -0,0 +1,635 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import math +import os +import random +from typing import Iterable + +import numpy as np +import paddle + +from ..transformers import AutoModelForMaskedLM, AutoTokenizer +from .base_augment import BaseAugment + +__all__ = ["WordSubstitute", "WordInsert", "WordSwap", "WordDelete"] + + +class WordSubstitute(BaseAugment): + """ + WordSubstitute is a word-level substitution data augmentation strategy + that supports replacing words in the input sequence based on existing + dictionaries or custom dictionaries. + + Args: + aug_type (str or list(str)): + Substitution dictionary type + custom_file_path (str, optional): + Custom substitution dictionary file path + delete_file_path (str, optional): + Dictionary file path for deleting words in substitution dictionary + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented words in sequences. + aug_percent (int): + Percentage of augmented words in sequences. + aug_min (int): + Minimum number of augmented words in sequences. + aug_max (int): + Maximum number of augmented words in sequences. + tf_idf (bool): + Use tf-idf to select the most unimportant word for substitution. + tf_idf (str): + File for calculating TF-IDF score. + model_name (str): + Model parameter name for MLM prediction task. + """ + + def __init__( + self, + aug_type, + custom_file_path=None, + delete_file_path=None, + create_n=1, + aug_n=None, + aug_percent=0.1, + aug_min=1, + aug_max=10, + tf_idf=False, + tf_idf_file=None, + model_name="ernie-1.0-large-zh-cw", + vocab="vocab", + ): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=aug_percent, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + self.custom_file_path = custom_file_path + self.delete_file_path = delete_file_path + self.tf_idf = tf_idf + self.model_name = model_name + if self.tf_idf: + self._count_idf(tf_idf_file) + + if isinstance(aug_type, str): + self.type = aug_type + if aug_type in ["antonym", "embedding", "synonym", "homonym", "custom"]: + self.dict = self._load_substitue_dict(aug_type) + elif aug_type in ["mlm"]: + self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) + self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + elif isinstance(aug_type, Iterable): + if len(aug_type) == 1: + self.type = aug_type[0] + else: + self.type = "combination" + if self.type in ["mlm"]: + self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) + self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + self.dict = {} + # Merge dictionaries from different sources + for t in aug_type: + if t in ["antonym", "embedding", "synonym", "homonym", "custom"]: + t_dict = self._load_substitue_dict(t) + for k in t_dict: + if k in self.dict: + self.dict[k] = list(set(self.dict[k] + t_dict[k])) + else: + self.dict[k] = t_dict[k] + # Todo: delete some words in the dictionary + else: + self.type = aug_type + + def _count_idf(self, tf_idf_file): + if os.path.exists(tf_idf_file): + with open(tf_idf_file, "r", encoding="utf-8") as f: + self.word_count_dict = {} + self.text_tf_idf = [] + self.num = 0 + for line in f: + self.num += 1 + self.text_tf_idf.append(line.strip()) + for word in set(self.tokenizer.cut(line.strip())): + if word not in self.word_count_dict: + self.word_count_dict[word] = 0 + self.word_count_dict[word] += 1 + f.close() + else: + raise ValueError("The tf_idf_file should exist.") + return + + def _calculate_tfidf(self, sequence, seq_tokens, aug_indexes): + if sequence not in self.text_tf_idf: + self.num += 1 
+ self.text_tf_idf.append(sequence) + for word in set(seq_tokens): + if word not in self.word_count_dict: + self.word_count_dict[word] = 0 + self.word_count_dict[word] += 1 + sequence_count = {} + for index in aug_indexes: + if seq_tokens[index] in sequence_count: + sequence_count[seq_tokens[index]] += 1 + else: + sequence_count[seq_tokens[index]] = 1 + tfidf = [] + for index in aug_indexes: + tf = sequence_count[seq_tokens[index]] / len(aug_indexes) + idf = math.log(self.num / self.word_count_dict[seq_tokens[index]]) + tfidf.append(tf * idf) + return np.array(tfidf) + + def _load_substitue_dict(self, source_type): + """Load substitution dictionary""" + if source_type in ["antonym", "embedding", "synonym", "homonym"]: + fullname = self._load_file("word_" + source_type) + elif source_type in ["custom"]: + fullname = self.custom_file_path + elif source_type in ["delete"]: + fullname = self.delete_file_path + + if os.path.exists(fullname): + with open(fullname, "r", encoding="utf-8") as f: + substitue_dict = json.load(f) + f.close() + else: + raise ValueError("The {} should exist.".format(fullname)) + + return substitue_dict + + def _generate_sequence(self, output_seq_tokens, aug_tokens): + """Genearte the sequences according to the mapping list""" + for aug_token in aug_tokens: + idx, token = aug_token + output_seq_tokens[int(idx)] = token + return "".join(output_seq_tokens) + + def _augment(self, sequence): + seq_tokens = self.tokenizer.cut(sequence) + aug_indexes = self._skip_stop_word_tokens(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + + if self.tf_idf: + tfidf = self._calculate_tfidf(sequence, seq_tokens, aug_indexes) + p = (max(tfidf) + 0.01 - tfidf) / sum(max(tfidf) + 0.01 - tfidf) + else: + p = None + + if aug_n == 0: + return [] + elif self.type == "mlm": + return self._augment_mlm(sequence, seq_tokens, aug_indexes, p) + elif aug_n == 1: + return self._augment_single(seq_tokens, aug_indexes, p) + else: + return self._augment_multi(seq_tokens, aug_n, aug_indexes, p) + + @paddle.no_grad() + def _augment_mlm(self, sequence, seq_tokens, aug_indexes, p): + t = 0 + sentences = [] + while t < self.create_n * self.loop * 2 and len(sentences) < self.create_n: + skip = False + t += 1 + idx = np.random.choice(aug_indexes, replace=False, p=p) + + aug_tokens = [[idx, "[MASK]" * len(seq_tokens[idx])]] + sequence_mask = self._generate_sequence(seq_tokens.copy(), aug_tokens) + tokenized = self.mlm_tokenizer(sequence_mask) + masked_positions = [ + i for i, idx in enumerate(tokenized["input_ids"]) if idx == self.mlm_tokenizer.mask_token_id + ] + + output = self.mlm_model( + paddle.to_tensor([tokenized["input_ids"]]), paddle.to_tensor([tokenized["token_type_ids"]]) + ) + predicted = "".join( + self.mlm_tokenizer.convert_ids_to_tokens(paddle.argmax(output[0][masked_positions], axis=-1)) + ) + for ppp in predicted: + if ppp in self.stop_words: + skip = True + break + if skip: + continue + aug_tokens = [[idx, predicted]] + sequence_generate = self._generate_sequence(seq_tokens.copy(), aug_tokens) + if sequence_generate != sequence and sequence_generate not in sentences: + sentences.append(sequence_generate) + return sentences + + def _augment_multi(self, seq_tokens, aug_n, aug_indexes, p): + sentences = [] + aug_n = min(aug_n, len(aug_indexes)) + if self.type in ["antonym", "embedding", "synonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + pp = [] + for i, aug_index in enumerate(aug_indexes): + if seq_tokens[aug_index] in self.dict: + 
candidate_tokens.append([aug_index, self.dict[seq_tokens[aug_index]]]) + if self.tf_idf: + pp.append(p[i]) + pp = np.array(pp) + pp /= sum(pp) + aug_n = min(aug_n, len(candidate_tokens)) + if aug_n != 0: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + if self.tf_idf: + idxes = np.random.choice(list(range(len(candidate_tokens))), size=aug_n, replace=False, p=pp) + else: + idxes = random.sample(list(range(len(candidate_tokens))), aug_n) + aug_tokens = [] + for idx in idxes: + aug_index, aug_dict = candidate_tokens[idx] + aug_tokens.append([aug_index, random.sample(aug_dict, 1)[0]]) + + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens) + if sentence not in sentences: + sentences.append(sentence) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + aug_tokens = [] + aug_choice_indexes = np.random.choice(aug_indexes, size=aug_n, replace=False, p=p) + for aug_index in aug_choice_indexes: + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2)) + aug_tokens.append([aug_index, token]) + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens) + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _augment_single(self, seq_tokens, aug_indexes, p): + sentences = [] + aug_tokens = [] + if self.type in ["antonym", "embedding", "synonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + pp = [] + for i, aug_index in enumerate(aug_indexes): + if seq_tokens[aug_index] in self.dict: + for token in self.dict[seq_tokens[aug_index]]: + candidate_tokens.append([aug_index, token]) + if self.tf_idf: + pp.append(p[i] / len(self.dict[seq_tokens[aug_index]])) + create_n = min(self.create_n, len(candidate_tokens)) + pp = np.array(pp) + pp /= sum(pp) + if self.tf_idf: + candidate_indexes = np.random.choice(range(len(candidate_tokens)), size=create_n, replace=False, p=pp) + candidate_tokens = np.array(candidate_tokens) + aug_tokens = candidate_tokens[candidate_indexes] + else: + aug_tokens = random.sample(candidate_tokens, create_n) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(aug_tokens) < self.create_n: + t += 1 + aug_index = np.random.choice(aug_indexes, replace=False, p=p) + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2)) + if [aug_index, token] not in aug_tokens: + aug_tokens.append([aug_index, token]) + for aug_token in aug_tokens: + sequence_generate = self._generate_sequence(seq_tokens.copy(), [aug_token]) + sentences.append(sequence_generate) + + return sentences + + +class WordInsert(BaseAugment): + """ + WordInsert is a word-level insert data augmentation strategy. + + Args: + aug_type (str or list(str)): + Insert dictionary type + custom_file_path (str, optional): + Custom insert dictionary file path + delete_file_path (str, optional): + Dictionary file path for deleting words in insert dictionary + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented words in sequences. + aug_percent (int): + Percentage of augmented words in sequences. + aug_min (int): + Minimum number of augmented words in sequences. + aug_max (int): + Maximum number of augmented words in sequences. 
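+
+    Example (a minimal sketch; the sample sentence and the "synonym"
+    dictionary choice are illustrative assumptions):
+
+        aug = WordInsert("synonym", create_n=1, aug_percent=0.1)
+        print(aug.augment("云计算是人工智能的重要基础"))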
+ """ + + def __init__( + self, + aug_type, + custom_file_path=None, + delete_file_path=None, + create_n=1, + aug_n=None, + aug_percent=0.1, + aug_min=1, + aug_max=10, + model_name="ernie-1.0-large-zh-cw", + vocab="vocab", + ): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=aug_percent, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + self.custom_file_path = custom_file_path + self.delete_file_path = delete_file_path + self.model_name = model_name + if isinstance(aug_type, str): + self.type = aug_type + if aug_type in ["antonym", "embedding", "synonym", "homonym", "custom"]: + self.dict = self._load_insert_dict(aug_type) + elif aug_type in ["mlm"]: + self.mlm_model = AutoModelForMaskedLM.from_pretrained(self.model_name) + self.mlm_tokenizer = AutoTokenizer.from_pretrained(self.model_name) + elif isinstance(aug_type, Iterable): + self.type = "combination" + self.dict = {} + # Merge dictionaries from different sources + for t in aug_type: + if t in ["antonym", "embedding", "synonym", "homonym", "custom"]: + t_dict = self._load_insert_dict(t) + for k in t_dict: + if k in self.dict: + self.dict[k] = list(set(self.dict[k] + t_dict[k])) + else: + self.dict[k] = t_dict[k] + # Todo: delete some words in the dictionary + else: + self.type = aug_type + + def _load_insert_dict(self, source_type): + """Load insert dictionary""" + if source_type in ["antonym", "embedding", "synonym", "homonym"]: + fullname = self._load_file("word_" + source_type) + elif source_type in ["custom"]: + fullname = self.custom_file_path + elif source_type in ["delete"]: + fullname = self.delete_file_path + if os.path.exists(fullname): + with open(fullname, "r", encoding="utf-8") as f: + insert_dict = json.load(f) + f.close() + else: + raise ValueError("The {} should exist.".format(fullname)) + return insert_dict + + def _augment(self, sequence): + seq_tokens = self.tokenizer.cut(sequence) + aug_indexes = self._skip_stop_word_tokens(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + if aug_n == 0: + return [] + elif self.type == "mlm": + return self._augment_mlm(sequence, seq_tokens, aug_indexes) + elif aug_n == 1: + return self._augment_single(seq_tokens, aug_indexes) + else: + return self._augment_multi(seq_tokens, aug_n, aug_indexes) + + @paddle.no_grad() + def _augment_mlm(self, sequence, seq_tokens, aug_indexes): + + t = 0 + sentences = [] + while t < self.create_n * self.loop and len(sentences) < self.create_n: + skip = False + t += 1 + p = random.randint(0, 1) + idx = random.sample(aug_indexes, 1)[0] + aug_tokens = [[idx, "[MASK]" * len(seq_tokens[idx])]] + sequence_mask = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + tokenized = self.mlm_tokenizer(sequence_mask) + masked_positions = [ + i for i, idx in enumerate(tokenized["input_ids"]) if idx == self.mlm_tokenizer.mask_token_id + ] + output = self.mlm_model( + paddle.to_tensor([tokenized["input_ids"]]), paddle.to_tensor([tokenized["token_type_ids"]]) + ) + predicted = "".join( + self.mlm_tokenizer.convert_ids_to_tokens(paddle.argmax(output[0][masked_positions], axis=-1)) + ) + for p in predicted: + if p in self.stop_words: + skip = True + break + if skip: + continue + + aug_tokens = [[idx, predicted]] + + sequence_generate = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + if sequence_generate != sequence and sequence_generate not in sentences: + sentences.append(sequence_generate) + return sentences + + def _augment_multi(self, seq_tokens, aug_n, aug_indexes): + sentences = [] + if self.type in 
["antonym", "embedding", "synonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + for aug_index in aug_indexes: + if seq_tokens[aug_index] in self.dict: + candidate_tokens.append([aug_index, self.dict[seq_tokens[aug_index]]]) + aug_n = min(aug_n, len(candidate_tokens)) + if aug_n != 0: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(list(range(len(candidate_tokens))), aug_n) + aug_tokens = [] + for idx in idxes: + aug_index, aug_dict = candidate_tokens[idx] + aug_tokens.append([aug_index, random.sample(aug_dict, 1)[0]]) + p = random.randint(0, 1) + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + if sentence not in sentences: + sentences.append(sentence) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + aug_tokens = [] + aug_indexes = random.sample(aug_indexes, aug_n) + for aug_index in aug_indexes: + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2)) + aug_tokens.append([aug_index, token]) + p = random.randint(0, 1) + sentence = self._generate_sequence(seq_tokens.copy(), aug_tokens, p) + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _augment_single(self, seq_tokens, aug_indexes): + + sentences = [] + aug_tokens = [] + if self.type in ["antonym", "embedding", "synonym", "homonym", "combination", "custom"]: + candidate_tokens = [] + for aug_index in aug_indexes: + if seq_tokens[aug_index] in self.dict: + for token in self.dict[seq_tokens[aug_index]]: + candidate_tokens.append([aug_index, token]) + create_n = min(self.create_n, len(candidate_tokens)) + aug_tokens = random.sample(candidate_tokens, create_n) + elif self.type in ["random"]: + t = 0 + while t < self.create_n * self.loop and len(aug_tokens) < self.create_n: + t += 1 + aug_index = random.sample(aug_indexes, 1)[0] + token = self.vocab.to_tokens(random.randint(0, len(self.vocab) - 2)) + if [aug_index, token] not in aug_tokens: + aug_tokens.append([aug_index, token]) + for aug_token in aug_tokens: + p = random.randint(0, 1) + sentences.append(self._generate_sequence(seq_tokens.copy(), [aug_token], p)) + return sentences + + def _generate_sequence(self, output_seq_tokens, aug_tokens, p): + """Genearte the sequences according to the mapping list""" + for aug_token in aug_tokens: + idx, token = aug_token + if p == 0: + output_seq_tokens[idx] = token + output_seq_tokens[idx] + else: + output_seq_tokens[idx] += token + return "".join(output_seq_tokens) + + +class WordSwap(BaseAugment): + """ + WordSwap is a word-level swap data augmentation strategy. + + Args: + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented words in sequences. + aug_percent (int): + Percentage of augmented words in sequences. + aug_min (int): + Minimum number of augmented words in sequences. + aug_max (int): + Maximum number of augmented words in sequences. 
+ """ + + def __init__(self, create_n=1, aug_n=None, aug_percent=None, aug_min=1, aug_max=10, vocab="vocab"): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=0.1, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + def _augment(self, sequence): + + seq_tokens = self.tokenizer.cut(sequence) + aug_indexes = self._skip_words(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + + t = 0 + sentences = [] + + if aug_n == 0: + return [] + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(aug_indexes, aug_n) + output_seq_tokens = seq_tokens.copy() + for idx in range(len(seq_tokens)): + if idx in idxes: + output_seq_tokens[idx], output_seq_tokens[idx + 1] = ( + output_seq_tokens[idx + 1], + output_seq_tokens[idx], + ) + sentence = "".join(output_seq_tokens) + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _skip_words(self, seq_tokens): + """Skip specific words.""" + indexes = [] + for i, seq_token in enumerate(seq_tokens[:-1]): + if ( + seq_token not in self.stop_words + and not seq_token.isdigit() + and not seq_token.encode("UTF-8").isalpha() + ): + if ( + seq_tokens[i + 1] not in self.stop_words + and not seq_tokens[i + 1].isdigit() + and not seq_tokens[i + 1].encode("UTF-8").isalpha() + ): + indexes.append(i) + return indexes + + +class WordDelete(BaseAugment): + """ + WordDelete is a word-level deletion data augmentation strategy. + + Args: + create_n (int): + Number of augmented sequences. + aug_n (int): + Number of augmented words in sequences. + aug_percent (int): + Percentage of augmented words in sequences. + aug_min (int): + Minimum number of augmented words in sequences. + aug_max (int): + Maximum number of augmented words in sequences. + """ + + def __init__(self, create_n=1, aug_n=None, aug_percent=0.1, aug_min=1, aug_max=10, vocab="vocab"): + super().__init__( + create_n=create_n, aug_n=aug_n, aug_percent=aug_percent, aug_min=aug_min, aug_max=aug_max, vocab=vocab + ) + + def _augment(self, sequence): + + seq_tokens = self.tokenizer.cut(sequence) + aug_indexes = self._skip_words(seq_tokens) + aug_n = self._get_aug_n(len(seq_tokens), len(aug_indexes)) + + t = 0 + sentences = [] + if aug_n == 0: + return sentences + while t < self.create_n * self.loop and len(sentences) < self.create_n: + t += 1 + idxes = random.sample(aug_indexes, aug_n) + sentence = "" + for idx in range(len(seq_tokens)): + if idx not in idxes: + sentence += seq_tokens[idx] + if sentence not in sentences: + sentences.append(sentence) + return sentences + + def _skip_words(self, seq_tokens): + """Skip specific words.""" + indexes = [] + for i, seq_token in enumerate(seq_tokens): + if ( + seq_token not in self.stop_words + and not seq_token.isdigit() + and not seq_token.encode("UTF-8").isalpha() + ): + indexes.append(i) + return indexes diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/README.md b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/__init__.py new file mode 100644 index 000000000..fda1d6586 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/__init__.py @@ -0,0 +1,46 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bellegroup import * +from .cail2018_small import * +from .cblue import * +from .chnsenticorp import * +from .clue import * +from .cmrc2018 import * +from .conll2002 import * +from .cote import * +from .couplet import * +from .dataset import * +from .drcd import * +from .drcd_cn import * +from .dureader_robust import * +from .glue import * +from .imdb import * +from .lcqmc import * +from .msra_ner import * +from .nlpcc13_evsam05_hit import * +from .nlpcc13_evsam05_thu import * +from .nlpcc14_sc import * +from .nlpcc_dbqa import * +from .peoples_daily_ner import * +from .poetry import * +from .ptb import * +from .seabsa16 import * +from .squad import * +from .wmt14ende import * +from .wos import * +from .xnli import * +from .xnli_cn import * +from .yahoo_answer_100k import * +from .zero_padding_dataset import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/advertisegen.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/advertisegen.py new file mode 100644 index 000000000..de5ca20e3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/advertisegen.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["AdvertiseGen"] + + +class AdvertiseGen(DatasetBuilder): + """ + This dataset contains 119K pairs of product specifications and the + corresponding advertising text. For more information, please refer + to `https://arxiv.org/abs/1908.06605v2`. 
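+
+    Example (a minimal sketch; the `load_dataset` import path is an
+    assumption for this vendored copy):
+
+        from paddlenlp.datasets import load_dataset  # path assumed
+        train_ds, dev_ds = load_dataset("advertisegen", splits=("train", "dev"))
+        print(train_ds[0]["source"], train_ds[0]["target"])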
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("train.json"), + "c0cc79f912099faa6175d28d3ddafafe", + "https://bj.bcebos.com/paddlenlp/datasets/AdvertiseGen/train.json", + ), + "dev": META_INFO( + os.path.join("dev.json"), + "5fda84828628a9722da5436485601df3", + "https://bj.bcebos.com/paddlenlp/datasets/AdvertiseGen/dev.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + data_id = 0 + for line in f: + line = line.strip() + if not line: + continue + json_data = json.loads(line) + + yield { + "source": json_data["content"], + "src": json_data["content"], + "target": json_data.get("summary", ""), + "tgt": json_data.get("summary", ""), + "id": data_id, + } + data_id += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bellegroup.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bellegroup.py new file mode 100644 index 000000000..a369cbffc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bellegroup.py @@ -0,0 +1,108 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["BelleGroup"] + + +class BelleGroup(DatasetBuilder): + """ + From https://github.com/LianjiaTech/BELLE/tree/main + + """ + + BUILDER_CONFIGS = { + "generated_chat_0.4M": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/BelleGroup/generated_chat_0.4M.zip", + "md5": "9bb71d4f2aa99acede2a0c3a8e761905", + "splits": { + "train": [os.path.join("generated_chat_0.4M", "train.json"), "47ea511025fbda9ffd6e5178677bb027"], + "dev": [os.path.join("generated_chat_0.4M", "dev.json"), "d7bd4b71cdb006b9de90ebb634ca1179"], + }, + }, + "school_math_0.25M": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/BelleGroup/school_math_0.25M.zip", + "md5": "10076cbdc0a7436d55481f0234db8609", + "splits": { + "train": [os.path.join("school_math_0.25M", "train.json"), "e5a36fc9deb015254686c51e21528683"], + "dev": [os.path.join("school_math_0.25M", "dev.json"), "99e967c38e39ed919327c011d9f6288f"], + }, + }, + "train_2M_CN": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/BelleGroup/train_2M_CN.zip", + "md5": "da88aca71eb9f454fab39db6a7e851e6", + "splits": { + "train": [os.path.join("train_2M_CN", "train.json"), "83e2917701a31ecf5152e4e9f234fcd0"], + "dev": [os.path.join("train_2M_CN", "dev.json"), "74f67f04e30896aeccc10930a7dc1f40"], + }, + }, + "train_1M_CN": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/BelleGroup/train_1M_CN.zip", + "md5": "65380b542e8ddb4db8f8d2be0f28795c", + "splits": { + "train": [os.path.join("train_1M_CN.zip", "train.json"), "489886aba320c74a1fdfad43c652635b"], + "dev": [os.path.join("train_1M_CN.zip", "dev.json"), "7bbf382aeab89f4398b2beca984e20e8"], + }, + }, + "train_0.5M_CN": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/BelleGroup/train_0.5M_CN.zip", + "md5": "45be55109ca9595efa36eaaed7c475d3", + "splits": { + "train": [os.path.join("train_0.5M_CN.zip", "train.json"), "61dc155956622c8389265de33b439757"], + "dev": [os.path.join("train_0.5M_CN.zip", "dev.json"), "72617388fbc4897cb2952df3e5303c2b"], + }, + }, + "multiturn_chat_0.8M": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/BelleGroup/multiturn_chat_0.8M.zip", + "md5": "974bc42c5920e5722146a89dce2b10cc", + "splits": { + "train": [os.path.join("multiturn_chat_0.8M", "train.json"), "27e3a7ecff0f4a199f6e7119909988e9"], + "dev": [os.path.join("multiturn_chat_0.8M", "dev.json"), "8fec175ea5e71cc78498d8ca3c1d5e66"], + }, + }, + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + for line in f: + line = line.strip() + if not line: + continue + + json_data = json.loads(line) + + yield { + "instruction": json_data["instruction"], + "input": json_data["input"], + "output": json_data["output"], + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bq_corpus.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bq_corpus.py new file mode 100644 index 000000000..7a15115fc --- 
/dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bq_corpus.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["BQCorpus"] + + +class BQCorpus(DatasetBuilder): + """ + BQCorpus: A Large-scale Domain-specific Chinese Corpus For Sentence + Semantic Equivalence Identification. More information please refer + to `https://www.aclweb.org/anthology/D18-1536.pdf` + + Contributed by frozenfish123@Wuhan University + + """ + + lazy = False + URL = "https://bj.bcebos.com/paddlenlp/datasets/bq_corpus.zip" + MD5 = "abe6c480b96cb705b4d24bd522848009" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("bq_corpus", "bq_corpus", "train.tsv"), "d37683e9ee778ee2f4326033b654adb9"), + "dev": META_INFO(os.path.join("bq_corpus", "bq_corpus", "dev.tsv"), "8a71f2a69453646921e9ee1aa457d1e4"), + "test": META_INFO(os.path.join("bq_corpus", "bq_corpus", "test.tsv"), "c797995baa248b144ceaa4018b191e52"), + } + + def _get_data(self, mode, **kwargs): + """Check and download Dataset""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data = line.strip().split("\t") + if len(data) == 3: + sentence1, sentence2, label = data + elif len(data) == 2: + sentence1, sentence2 = data + label = "" + yield {"sentence1": sentence1, "sentence2": sentence2, "label": label} + + def get_labels(self): + """ + Return labels of the BQCorpus object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bstc.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bstc.py new file mode 100644 index 000000000..ee1a1ab99 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/bstc.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class BSTC(DatasetBuilder): + """ + BSTC (Baidu Speech Translation Corpus), a large-scale Chinese-English + speech translation dataset. This dataset is constructed based on a + collection of licensed videos of talks or lectures, including about + 68 hours of Mandarin data, their manual transcripts and translations + into English, as well as automated transcripts by an automatic speech + recognition (ASR) model. + Details: https://arxiv.org/pdf/2104.03575.pdf + """ + + lazy = False + BUILDER_CONFIGS = { + "transcription_translation": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/bstc_transcription_translation.tar.gz", + "md5": "236800188e397c42a3251982aeee48ee", + "splits": { + "train": [os.path.join("bstc_transcription_translation", "train")], + "dev": [ + os.path.join("bstc_transcription_translation", "dev", "streaming_transcription"), + os.path.join("bstc_transcription_translation", "dev", "ref_text"), + ], + }, + }, + "asr": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/bstc_asr.tar.gz", + "md5": "3a0cc5039f45e62e29485e27d3a5f5a7", + "splits": { + "train": [os.path.join("bstc_asr", "train", "asr_sentences")], + "dev": [os.path.join("bstc_asr", "dev", "streaming_asr"), os.path.join("bstc_asr", "dev", "ref_text")], + }, + }, + } + + def _get_data(self, mode, **kwargs): + """Check and download Dataset""" + builder_config = self.BUILDER_CONFIGS[self.name] + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + source_file_dir = builder_config["splits"][mode][0] + source_full_dir = os.path.join(default_root, source_file_dir) + if not os.path.exists(source_full_dir): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + if mode == "train": + return source_full_dir + elif mode == "dev": + target_file_dir = builder_config["splits"][mode][1] + target_full_dir = os.path.join(default_root, target_file_dir) + if not os.path.exists(target_full_dir): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + return source_full_dir, target_full_dir + + def _read(self, data_dir, split): + """Reads data.""" + if split == "train": + if self.name == "transcription_translation": + source_full_dir = data_dir + filenames = [f for f in os.listdir(source_full_dir) if not f.startswith(".")] + filenames.sort(key=lambda x: int(x[:-5])) + for filename in filenames: + with open(os.path.join(source_full_dir, filename), "r", encoding="utf-8") as f: + for line in f.readlines(): + line = line.strip() + if not line: + continue + yield json.loads(line) + elif self.name == "asr": + source_full_dir = data_dir + dir_list = [f for f in os.listdir(source_full_dir) if not f.startswith(".")] + dir_list.sort(key=lambda x: int(x)) + for dir_name in dir_list: + filenames = [ + f for f in os.listdir(os.path.join(source_full_dir, dir_name)) if not f.startswith(".") + ] + filenames.sort(key=lambda x: int(x[x.find("-") + 1 : -5])) + for filename in filenames: + with open(os.path.join(source_full_dir, dir_name, filename), "r", encoding="utf-8") as f: + for line in f.readlines(): + line = line.strip() + if not line: + continue + yield json.loads(line) + else: + raise ValueError("Argument name should be one of [transcription_translation, asr].") + elif split == "dev": + source_full_dir, target_full_dir 
= data_dir + source_filenames = [f for f in os.listdir(source_full_dir) if f.endswith("txt")] + target_filenames = [f for f in os.listdir(target_full_dir) if f.endswith("txt")] + assert len(source_filenames) == len(target_filenames) + source_filenames.sort( + key=lambda x: int(x[:-4]) if self.name == "transcription_translation" else int(x[:-8]) + ) + target_filenames.sort(key=lambda x: int(x[:-4])) + for src_file, tgt_file in zip(source_filenames, target_filenames): + if self.name == "transcription_translation": + src_list = [] + with open(os.path.join(source_full_dir, src_file), "r", encoding="utf-8") as src_f: + src_part = [] + for src_line in src_f.readlines(): + src_line = src_line.strip() + if not src_line: + continue + if len(src_part) != 0 and not src_line.startswith(src_part[-1]): + src_list.append(src_part) + src_part = [src_line] + else: + src_part.append(src_line) + if len(src_part) > 0: + src_list.append(src_part) + elif self.name == "asr": + src_list = [] + with open(os.path.join(source_full_dir, src_file), "r", encoding="utf-8") as src_f: + src_part = [] + for src_line in src_f.readlines(): + src_line = src_line.strip() + if not src_line: + continue + line = src_line.split(", ") + final = line[2].split(": ")[1] == "final" + src_part.append(src_line) + if final: + src_list.append(src_part) + src_part = [] + else: + raise ValueError("Argument name should be one of [transcription_translation, asr].") + tgt_list = [] + with open(os.path.join(target_full_dir, tgt_file), "r", encoding="utf-8") as tgt_f: + lines = tgt_f.readlines() + for idx, tgt_line in enumerate(lines): + tgt_line = tgt_line.strip() + if not tgt_line: + continue + tgt_list.append(tgt_line) + yield {"src": src_list, "tgt": tgt_list} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/c3.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/c3.py new file mode 100644 index 000000000..896ab03e2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/c3.py @@ -0,0 +1,113 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["C3"] + + +class C3(DatasetBuilder): + """ + C3 is the first free-form multiple-Choice Chinese machine reading Comprehension dataset, + containing 13,369 documents (dialogues or more formally written mixed-genre texts) + and their associated 19,577 multiple-choice free-form questions collected from + Chinese-as-a-second-language examinations. + See more details on https://arxiv.org/abs/1904.09679. 
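+
+    Example (a minimal sketch; the `load_dataset` import path is an
+    assumption for this vendored copy):
+
+        from paddlenlp.datasets import load_dataset  # path assumed
+        train_ds = load_dataset("c3", splits="train")
+        print(train_ds[0]["question"], train_ds[0]["choice"], train_ds[0]["label"])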
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": [ + META_INFO( + os.path.join("c3-d-train.json"), + "291b07679bef785aa66bb5343f1b49b2", + "https://bj.bcebos.com/paddlenlp/datasets/c3/c3-d-train.json", + ), + META_INFO( + os.path.join("c3-m-train.json"), + "db321e631eb3e6f508e438992652618f", + "https://bj.bcebos.com/paddlenlp/datasets/c3/c3-m-train.json", + ), + ], + "dev": [ + META_INFO( + os.path.join("c3-d-dev.json"), + "446e75358789d3fbe8730089cadf5fb0", + "https://bj.bcebos.com/paddlenlp/datasets/c3/c3-d-dev.json", + ), + META_INFO( + os.path.join("c3-m-dev.json"), + "beb2f2e08c18cd8e9429c6a55de6b8db", + "https://bj.bcebos.com/paddlenlp/datasets/c3/c3-m-dev.json", + ), + ], + "test": [ + META_INFO( + os.path.join("c3-d-test.json"), + "002561f15f4942328761c50c90ced36c", + "https://bj.bcebos.com/paddlenlp/datasets/c3/c3-d-test.json", + ), + META_INFO( + os.path.join("c3-m-test.json"), + "f5f14c517926d22047b7bfd369dab724", + "https://bj.bcebos.com/paddlenlp/datasets/c3/c3-m-test.json", + ), + ], + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__, mode) + meta_info_list = self.SPLITS[mode] + fullnames = [] + for meta_info in meta_info_list: + filename, data_hash, URL = meta_info + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + fullnames.append(fullname) + return fullnames + + def _read(self, data_files, *args): + for fullname in data_files: + with open(fullname, "r", encoding="utf8") as fr: + samples = json.load(fr) + for sample in samples: + context = sample[0] + qas = sample[1] + for qa in qas: + question = qa["question"] + choice = qa["choice"] + answer = qa["answer"] + label = str(choice.index(answer)) + yield { + "context": context, + "question": question, + "choice": choice, + "answer": answer, + "label": label, + } + + def get_labels(self): + """ + Return labels of the C3 object. + """ + return ["0", "1", "2", "3"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2018_small.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2018_small.py new file mode 100644 index 000000000..3238dface --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2018_small.py @@ -0,0 +1,487 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["CAIL2018Small"] + + +class CAIL2018Small(DatasetBuilder): + """ + CAIL2018-Small 196,000 criminal cases,which are collected from http://wenshu.court.gov.cn/ + published by the Supreme People’s Court of China. 
Each case in CAIL2018 consists of two parts, + i.e., fact description and corresponding judgment result. The judgment result of each case is + refined into 3 representative ones, including relevant law articles, charges, and prison terms. + + charges: predict the charges from referee result with regular expressions. + + law_articles: predict the relevant law articles from referee result with regular expressions. + + prison_term: predict the prison terms from referee result with regular expressions. + + Find more dataset dertails in https://github.com/thunlp/CAIL + """ + + lazy = False + URL = "https://paddlenlp.bj.bcebos.com/datasets/cail2018_small.tar.gz" + MD5 = "963401d107150e250580d115dd2d43fc" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("cail2018_small", "train.json"), "e11fc099cc7709a8d128e9fe9f029621"), + "dev": META_INFO(os.path.join("cail2018_small", "dev.json"), "ee13108aee6a08a94490fadeb400debb"), + "test": META_INFO(os.path.join("cail2018_small", "test.json"), "27cea977fff2f85b5c32a8e0f708b093"), + } + + def _get_data(self, mode, **kwargs): + """Check and download Dataset""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + + with open(filename, "r", encoding="utf-8") as f: + for line in f.readlines(): + line = json.loads(line) + sentence = line["fact"] + if self.name == "charges": + label = line["meta"]["accusation"] + yield {"sentence": sentence, "label": label} + elif self.name == "law_articles": + label = line["meta"]["relevant_articles"] + yield {"sentence": sentence, "label": label} + elif self.name == "prison_term": + if line["meta"]["term_of_imprisonment"]["life_imprisonment"]: + lp = -1 + elif line["meta"]["term_of_imprisonment"]["death_penalty"]: + lp = -2 + else: + lp = line["meta"]["term_of_imprisonment"]["imprisonment"] + yield {"sentence": sentence, "label": lp} + else: + assert "Dataset name {} does not exist".format(self.name) + f.close() + + def get_labels(self): + """ + Return labels of the CAIL2018-Small. 
+ """ + if self.name == "charges": + return [ + "故意伤害", + "盗窃", + "危险驾驶", + "非法[持有、私藏][枪支、弹药]", + "交通肇事", + "寻衅滋事", + "[窝藏、包庇]", + "放火", + "故意毁坏财物", + "绑架", + "赌博", + "妨害公务", + "合同诈骗", + "[走私、贩卖、运输、制造]毒品", + "抢劫", + "非法拘禁", + "诬告陷害", + "非法采矿", + "容留他人吸毒", + "强奸", + "[伪造、变造、买卖]国家机关[公文、证件、印章]", + "故意杀人", + "诈骗", + "聚众斗殴", + "[掩饰、隐瞒][犯罪所得、犯罪所得收益]", + "敲诈勒索", + "[组织、强迫、引诱、容留、介绍]卖淫", + "[引诱、容留、介绍]卖淫", + "开设赌场", + "重大责任事故", + "抢夺", + "破坏电力设备", + "[制造、贩卖、传播]淫秽物品", + "传播淫秽物品", + "虐待", + "非法[采伐、毁坏]国家重点保护植物", + "非法[制造、买卖、运输、邮寄、储存][枪支、弹药、爆炸物]", + "受贿", + "脱逃", + "行贿", + "破坏[广播电视设施、公用电信设施]", + "[伪造、变造]居民身份证", + "拐卖[妇女、儿童]", + "强迫交易", + "拒不支付劳动报酬", + "帮助[毁灭、伪造]证据", + "爆炸", + "污染环境", + "非法持有毒品", + "破坏易燃易爆设备", + "妨害信用卡管理", + "[引诱、教唆、欺骗]他人吸毒", + "非法处置[查封、扣押、冻结]的财产", + "贪污", + "职务侵占", + "帮助犯罪分子逃避处罚", + "盗伐林木", + "挪用资金", + "重婚", + "侵占", + "[窝藏、转移、收购、销售]赃物", + "妨害作证", + "挪用公款", + "伪造[公司、企业、事业单位、人民团体]印章", + "[窝藏、转移、隐瞒][毒品、毒赃]", + "[虚开增值税专用发票、用于骗取出口退税、抵扣税款发票]", + "非法侵入住宅", + "信用卡诈骗", + "非法获取公民个人信息", + "滥伐林木", + "非法经营", + "招摇撞骗", + "以危险方法危害公共安全", + "[盗窃、侮辱]尸体", + "过失致人死亡", + "[持有、使用]假币", + "传授犯罪方法", + "猥亵儿童", + "逃税", + "非法吸收公众存款", + "非法[转让、倒卖]土地使用权", + "骗取[贷款、票据承兑、金融票证]", + "破坏生产经营", + "高利转贷", + "[盗窃、抢夺][枪支、弹药、爆炸物]", + "[盗窃、抢夺][枪支、弹药、爆炸物、危险物质]", + "假冒注册商标", + "[伪造、变造]金融票证", + "强迫卖淫", + "扰乱无线电通讯管理秩序", + "虚开发票", + "非法占用农用地", + "[组织、领导、参加]黑社会性质组织", + "[隐匿、故意销毁][会计凭证、会计帐簿、财务会计报告]", + "保险诈骗", + "强制[猥亵、侮辱]妇女", + "非国家工作人员受贿", + "伪造货币", + "拒不执行[判决、裁定]", + "[生产、销售]伪劣产品", + "非法[收购、运输][盗伐、滥伐]的林木", + "冒充军人招摇撞骗", + "组织卖淫", + "持有伪造的发票", + "[生产、销售][有毒、有害]食品", + "非法[制造、出售]非法制造的发票", + "[伪造、变造、买卖]武装部队[公文、证件、印章]", + "[组织、领导]传销活动", + "强迫劳动", + "走私", + "贷款诈骗", + "串通投标", + "虚报注册资本", + "侮辱", + "伪证", + "聚众扰乱社会秩序", + "聚众扰乱[公共场所秩序、交通秩序]", + "劫持[船只、汽车]", + "集资诈骗", + "盗掘[古文化遗址、古墓葬]", + "失火", + "票据诈骗", + "经济犯", + "单位行贿", + "投放危险物质", + "过失致人重伤", + "破坏交通设施", + "聚众哄抢", + "走私普通[货物、物品]", + "收买被拐卖的[妇女、儿童]", + "非法狩猎", + "销售假冒注册商标的商品", + "破坏监管秩序", + "拐骗儿童", + "非法行医", + "协助组织卖淫", + "打击报复证人", + "强迫他人吸毒", + "非法[收购、运输、加工、出售][国家重点保护植物、国家重点保护植物制品]", + "[生产、销售]不符合安全标准的食品", + "非法买卖制毒物品", + "滥用职权", + "聚众冲击国家机关", + "[出售、购买、运输]假币", + "对非国家工作人员行贿", + "[编造、故意传播]虚假恐怖信息", + "玩忽职守", + "私分国有资产", + "非法携带[枪支、弹药、管制刀具、危险物品]危及公共安全", + "过失以危险方法危害公共安全", + "走私国家禁止进出口的[货物、物品]", + "违法发放贷款", + "徇私枉法", + "非法[买卖、运输、携带、持有]毒品原植物[种子、幼苗]", + "动植物检疫徇私舞弊", + "重大劳动安全事故", + "走私[武器、弹药]", + "破坏计算机信息系统", + "[制作、复制、出版、贩卖、传播]淫秽物品牟利", + "单位受贿", + "[生产、销售]伪劣[农药、兽药、化肥、种子]", + "过失损坏[武器装备、军事设施、军事通信]", + "破坏交通工具", + "包庇毒品犯罪分子", + "[生产、销售]假药", + "非法种植毒品原植物", + "诽谤", + "传播性病", + "介绍贿赂", + "金融凭证诈骗", + "非法[猎捕、杀害][珍贵、濒危]野生动物", + "徇私舞弊不移交刑事案件", + "巨额财产来源不明", + "过失损坏[广播电视设施、公用电信设施]", + "挪用特定款物", + "[窃取、收买、非法提供]信用卡信息", + "非法组织卖血", + "利用影响力受贿", + "非法捕捞水产品", + "对单位行贿", + "遗弃", + "徇私舞弊[不征、少征]税款", + "提供[侵入、非法控制计算机信息系统][程序、工具]", + "非法进行节育手术", + "危险物品肇事", + "非法[制造、买卖、运输、储存]危险物质", + "非法[制造、销售]非法制造的注册商标标识", + "侵犯著作权", + "倒卖[车票、船票]", + "过失投放危险物质", + "走私废物", + "非法出售发票", + "走私[珍贵动物、珍贵动物制品]", + "[伪造、倒卖]伪造的有价票证", + "招收[公务员、学生]徇私舞弊", + "非法[生产、销售]间谍专用器材", + "倒卖文物", + "虐待被监管人", + "洗钱", + "非法[生产、买卖]警用装备", + "非法获取国家秘密", + "非法[收购、运输、出售][珍贵、濒危野生动物、珍贵、濒危野生动物]制品", + ] + elif self.name == "law_articles": + return [ + 114, + 115, + 116, + 117, + 118, + 119, + 122, + 124, + 125, + 127, + 128, + 130, + 132, + 133, + 134, + 135, + 136, + 140, + 141, + 143, + 144, + 147, + 149, + 150, + 151, + 152, + 153, + 155, + 156, + 158, + 159, + 161, + 162, + 163, + 164, + 168, + 170, + 171, + 172, + 175, + 176, + 177, + 184, + 185, + 186, + 191, + 192, + 193, + 194, + 196, + 198, + 199, + 200, + 201, + 205, + 209, + 210, + 211, + 212, + 
213, + 214, + 215, + 217, + 220, + 223, + 224, + 225, + 226, + 227, + 228, + 231, + 232, + 233, + 234, + 235, + 236, + 237, + 238, + 239, + 240, + 241, + 243, + 244, + 245, + 246, + 248, + 253, + 258, + 260, + 261, + 262, + 263, + 264, + 266, + 267, + 268, + 269, + 270, + 271, + 272, + 273, + 274, + 275, + 276, + 277, + 279, + 280, + 281, + 282, + 283, + 285, + 286, + 288, + 290, + 291, + 292, + 293, + 294, + 295, + 302, + 303, + 305, + 307, + 308, + 310, + 312, + 313, + 314, + 315, + 316, + 326, + 328, + 333, + 336, + 338, + 340, + 341, + 342, + 343, + 344, + 345, + 346, + 347, + 348, + 349, + 350, + 351, + 352, + 353, + 354, + 356, + 357, + 358, + 359, + 360, + 361, + 363, + 364, + 367, + 369, + 372, + 375, + 382, + 383, + 384, + 385, + 386, + 387, + 388, + 389, + 390, + 391, + 392, + 393, + 395, + 396, + 397, + 399, + 402, + 404, + 413, + 417, + 418, + ] + elif self.name == "prison_term": + return None + else: + assert "Dataset name {} does not exist".format(self.name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2019_scm.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2019_scm.py new file mode 100644 index 000000000..a7f261c05 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cail2019_scm.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["CAIL2019_SCM"] + + +class CAIL2019_SCM(DatasetBuilder): + """ + CAIL2019-SCM contains 8,964 triplets of cases published by the Supreme People's + Court of China. The input of CAIL2019-SCM is a triplet (A, B, C), where A, B, C + are fact descriptions of three cases. The task of CAIL2019-SCM is to predict + whether sim(A, B) > sim(A, C) or sim(A, C) > sim(A, B). + + See more details on https://arxiv.org/abs/1911.08962. 
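+
+    Example (a minimal sketch; the `load_dataset` import path is an
+    assumption for this vendored copy):
+
+        from paddlenlp.datasets import load_dataset  # path assumed
+        train_ds = load_dataset("cail2019_scm", splits="train")
+        print(train_ds[0]["text_a"][:32], train_ds[0]["label"])  # label is "B" or "C"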
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("cail2019_scm_train.json"), + "d50a105f9689e72be7d79adbba0ae224", + "https://bj.bcebos.com/paddlenlp/datasets/cail2019/scm/cail2019_scm_train.json", + ), + "dev": META_INFO( + os.path.join("cail2019_scm_dev.json"), + "e36a295c1cb8c6b9fb28015907a42d9e", + "https://bj.bcebos.com/paddlenlp/datasets/cail2019/scm/cail2019_scm_dev.json", + ), + "test": META_INFO( + os.path.join("cail2019_scm_test.json"), + "91a6cf060e1283f05fcc6a2027238379", + "https://bj.bcebos.com/paddlenlp/datasets/cail2019/scm/cail2019_scm_test.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + for line in f.readlines(): + dic = json.loads(line) + yield {"text_a": dic["A"], "text_b": dic["B"], "text_c": dic["C"], "label": dic["label"]} + + def get_labels(self): + """ + Return labels of the CAIL2019_SCM object. + """ + return ["B", "C"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cblue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cblue.py new file mode 100644 index 000000000..c6969ba3c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cblue.py @@ -0,0 +1,456 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +import pandas as pd +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class CBLUE(DatasetBuilder): + """ + The Chinese Biomedical Language Understanding Evaluation (CBLUE) benchmark + is a collection of natural language understanding tasks including named + entity recognition, information extraction, clinical diagnosis normalization + and single-sentence/sentence-pair classification. + From https://github.com/CBLUEbenchmark/CBLUE + + CMeEE: + The Chinese Medical Named Entity Recognition is first released in CHIP20204. + Given a pre-defined schema, the task is to identify and extract entities + from the given sentence and classify them into nine categories: disease, + clinical manifestations, drugs, medical equipment, medical procedures, + body, medical examinations, microorganisms, and department. + + CMeIE: + The Chinese Medical Information Extraction is also released in CHIP2020. + The task is aimed at identifying both entities and relations in a sentence + following the schema constraints. 
There are 53 relations defined in the dataset, + including 10 synonymous sub-relationships and 43 other sub-relationships. + + CHIP-CDN: + The CHIP Clinical Diagnosis Normalization dataset aims to standardize + the terms from the final diagnoses of Chinese electronic medical records. + + CHIP-CDN-2C: + The CHIP Clinical Diagnosis Normalization dataset is reformalized as a task of + pairwise classification to judge if a normalized term matches the original term + or not. For each original term from the whole ICD-10 vocabulary, 100 candidates + normalized terms are retrieved using Elasticsearch. + + CHIP-CTC: + The CHIP Clinical Trial Classification dataset aimed at classifying + clinical trials eligibility criteria. + + CHIP-STS: + The CHIP Semantic Textual Similarity dataset consists of question pairs + related to 5 different diseases and aims to determine sentence similarity. + + KUAKE-QIC: + The KUAKE Query Intent Classification dataset is used to classify queries + of search engines into one of 11 medical intent categories, including + diagnosis, etiology analysis, treatment plan, medical advice, test result + analysis, disease description, consequence prediction, precautions, intended + effects, treatment fees, and others. + + KUAKE-QTR: + The KUAKE Query Title Relevance dataset is used to estimate the + relevance of the title of a query document. + + KUAKE-QQR: + The KUAKE Query-Query Relevance dataset is used to evaluate the + relevance of the content expressed in two queries. + """ + + BUILDER_CONFIGS = { + "CMeEE": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/CMeEE.zip", + "md5": "2f21afc5d95918346b673f84eecd06b1", + "splits": { + "train": [os.path.join("CMeEE", "CMeEE_train.json"), "725b34819dd49a0ce028c37e4ad0a73b", ["text"]], + "dev": [os.path.join("CMeEE", "CMeEE_dev.json"), "42778760dcce7b9ada6e290f7b2a59c2", ["text"]], + "test": [os.path.join("CMeEE", "CMeEE_test.json"), "c45b3b3d79ca29776e3d9f009b7d6ee5", ["text"]], + }, + "labels": [ + [ + "B-bod", + "I-bod", + "E-bod", + "S-bod", + "B-dis", + "I-dis", + "E-dis", + "S-dis", + "B-pro", + "I-pro", + "E-pro", + "S-pro", + "B-dru", + "I-dru", + "E-dru", + "S-dru", + "B-ite", + "I-ite", + "E-ite", + "S-ite", + "B-mic", + "I-mic", + "E-mic", + "S-mic", + "B-equ", + "I-equ", + "E-equ", + "S-equ", + "B-dep", + "I-dep", + "E-dep", + "S-dep", + "O", + ], + ["B-sym", "I-sym", "E-sym", "S-sym", "O"], + ], + }, + "CMeIE": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/CMeIE.zip", + "md5": "444569dfc31580c8cfa18843d0a1bd59", + "splits": { + "train": [os.path.join("CMeIE", "CMeIE_train.json"), "d27a7d4f0f5326018db66f64ac63780c", ["text"]], + "dev": [os.path.join("CMeIE", "CMeIE_dev.json"), "54203d1e775a2f07aaea30b61b93ca2f", ["text"]], + "test": [os.path.join("CMeIE", "CMeIE_test.json"), "8ac74722e9448fdc76132206582b9a06", ["text"]], + }, + "labels": [ + "预防", + "阶段", + "就诊科室", + "辅助治疗", + "化疗", + "放射治疗", + "手术治疗", + "实验室检查", + "影像学检查", + "辅助检查", + "组织学检查", + "内窥镜检查", + "筛查", + "多发群体", + "发病率", + "发病年龄", + "多发地区", + "发病性别倾向", + "死亡率", + "多发季节", + "传播途径", + "并发症", + "病理分型", + "相关(导致)", + "鉴别诊断", + "相关(转化)", + "相关(症状)", + "临床表现", + "治疗后症状", + "侵及周围组织转移的症状", + "病因", + "高危因素", + "风险评估因素", + "病史", + "遗传因素", + "发病机制", + "病理生理", + "药物治疗", + "发病部位", + "转移部位", + "外侵部位", + "预后状况", + "预后生存率", + "同义词", + ], + }, + "CHIP-CDN": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CDN.zip", + "md5": "e378d6bfe6740aadfb197ca352db3427", + "splits": { + "train": [ + os.path.join("CHIP-CDN", "CHIP-CDN_train.json"), + 
"2940ff04e91f52722f10010e5cbc1f18", + ["text"], + ], + "dev": [os.path.join("CHIP-CDN", "CHIP-CDN_dev.json"), "c718cdd36f913deb11a1a0b46de51015", ["text"]], + "test": [os.path.join("CHIP-CDN", "CHIP-CDN_test.json"), "8dbe229a23af30bd7c3c5bdcdf156314", ["text"]], + }, + "labels": "国际疾病分类 ICD-10北京临床版v601.xlsx", + }, + "CHIP-CDN-2C": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CDN-2C.zip", + "md5": "6dce903ff95713947d349b4a4e61a486", + "splits": { + "train": [ + os.path.join("CHIP-CDN-2C", "train.tsv"), + "28e38f631b77b33bff0fd018d84c670f", + ["text_a", "text_b"], + ], + "dev": [ + os.path.join("CHIP-CDN-2C", "dev.tsv"), + "801a0e12101a7ed2261b5984350cd238", + ["text_a", "text_b"], + ], + "test": [ + os.path.join("CHIP-CDN-2C", "test.tsv"), + "0ff464a3c34b095f4d4c22753a119164", + ["text_a", "text_b"], + ], + }, + "labels": ["0", "1"], + }, + "CHIP-CTC": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-CTC.zip", + "md5": "43d804211d46f9374c18ab13d6984f29", + "splits": { + "train": [ + os.path.join("CHIP-CTC", "CHIP-CTC_train.json"), + "098ac22cafe7446393d941612f906531", + ["text"], + ], + "dev": [os.path.join("CHIP-CTC", "CHIP-CTC_dev.json"), "b48d52fd686bea286de1a3b123398483", ["text"]], + "test": [os.path.join("CHIP-CTC", "CHIP-CTC_test.json"), "6a5f0f20f8f85f727d9ef1ea09f939d9", ["text"]], + }, + "labels": "category.xlsx", + }, + "CHIP-STS": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/CHIP-STS.zip", + "md5": "4d4db5ef14336e3179e4e1f3c1cc2621", + "splits": { + "train": [ + os.path.join("CHIP-STS", "CHIP-STS_train.json"), + "c6150e2628f107cf2657feb4ed2ba65b", + ["text1", "text2"], + ], + "dev": [ + os.path.join("CHIP-STS", "CHIP-STS_dev.json"), + "2813ecc0222ef8e4612296776e54639d", + ["text1", "text2"], + ], + "test": [ + os.path.join("CHIP-STS", "CHIP-STS_test.json"), + "44394681097024aa922e4e33fa651360", + ["text1", "text2"], + ], + }, + "labels": ["0", "1"], + }, + "KUAKE-QIC": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QIC.zip", + "md5": "7661e3a6b5daf4ee025ba407669788d8", + "splits": { + "train": [ + os.path.join("KUAKE-QIC", "KUAKE-QIC_train.json"), + "fc7e359decfcf7b1316e7833acc97b8a", + ["query"], + ], + "dev": [ + os.path.join("KUAKE-QIC", "KUAKE-QIC_dev.json"), + "2fd1f4131916239d89b213cc9860c1c6", + ["query"], + ], + "test": [ + os.path.join("KUAKE-QIC", "KUAKE-QIC_test.json"), + "337dc7f3cdc77b1a21b534ecb3142a6b", + ["query"], + ], + }, + "labels": ["病情诊断", "治疗方案", "病因分析", "指标解读", "就医建议", "疾病表述", "后果表述", "注意事项", "功效作用", "医疗费用", "其他"], + }, + "KUAKE-QTR": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QTR.zip", + "md5": "a59686c2b489ac64ff6f0f029c1df068", + "splits": { + "train": [ + os.path.join("KUAKE-QTR", "KUAKE-QTR_train.json"), + "7197f9ca963f337fc81ce6c8a1c97dc4", + ["query", "title"], + ], + "dev": [ + os.path.join("KUAKE-QTR", "KUAKE-QTR_dev.json"), + "e6c480aa46ef2dd04290afe165cdfa9a", + ["query", "title"], + ], + "test": [ + os.path.join("KUAKE-QTR", "KUAKE-QTR_test.json"), + "4ccfcf83eef0563b16914d5455d225a5", + ["query", "title"], + ], + }, + "labels": ["0", "1", "2", "3"], + }, + "KUAKE-QQR": { + "url": "https://paddlenlp.bj.bcebos.com/datasets/cblue/KUAKE-QQR.zip", + "md5": "b7fdeed0ae56e450d7cf3aa7c0b19e20", + "splits": { + "train": [ + os.path.join("KUAKE-QQR", "KUAKE-QQR_train.json"), + "f667e31610acf3f107369310b78d56a9", + ("query1", "query2"), + ], + "dev": [ + os.path.join("KUAKE-QQR", "KUAKE-QQR_dev.json"), + "597354382a806b8168a705584f4f6887", + ("query1", "query2"), + 
], + "test": [ + os.path.join("KUAKE-QQR", "KUAKE-QQR_test.json"), + "2d257135c6e1651d24a84496dd50c658", + ("query1", "query2"), + ], + }, + "labels": ["0", "1", "2"], + }, + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, _ = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + return fullname + + def _search_entity_index(self, tokens, entity_tokens, skip_idx=None): + ent_len = len(entity_tokens) + for idx in range(len(tokens) - ent_len + 1): + if tokens[idx : idx + ent_len] == entity_tokens: + if skip_idx is None: + return idx + elif idx < skip_idx[0] or idx > skip_idx[1]: + return idx + return None + + def _search_spo_index(self, tokens, subjects, objects): + tokens = [x.lower() for x in tokens] + subjects = [x.lower() for x in subjects] + objects = [x.lower() for x in objects] + if len(subjects) > len(objects): + sub_idx = self._search_entity_index(tokens, subjects) + obj_idx = self._search_entity_index(tokens, objects, (sub_idx, sub_idx + len(subjects) - 1)) + else: + obj_idx = self._search_entity_index(tokens, objects) + sub_idx = self._search_entity_index(tokens, subjects, (obj_idx, obj_idx + len(objects) - 1)) + return sub_idx, obj_idx + + def _read(self, filename, split): + _, _, input_keys = self.BUILDER_CONFIGS[self.name]["splits"][split] + with open(filename, "r", encoding="utf-8") as f: + if self.name == "CMeIE": + for line in f.readlines(): + data = json.loads(line) + labels = self.get_labels() + label_map = dict([(x, i) for i, x in enumerate(labels)]) + data_list = data.get("spo_list", []) + ent_list, spo_list = [], [] + ent_label, spo_label = [], [] + for spo in data_list: + sub, obj = spo["subject"], spo["object"]["@value"] + rel = spo["predicate"] + ent_list.append(sub) + ent_list.append(obj) + spo_list.append((sub, rel, obj)) + + sub_idx, obj_idx = self._search_spo_index(data["text"], sub, obj) + if sub_idx is not None and obj_idx is not None: + sub = tuple((sub_idx, sub_idx + len(sub) - 1)) + obj = tuple((obj_idx, obj_idx + len(obj) - 1)) + ent_label.append(sub) + ent_label.append(obj) + spo_label.append((sub, label_map[rel], obj)) + + # The samples where subjects and objects have overlap + # will be discarded during training. 
+ # + # if sub_idx is None or obj_idx is None: + # print('Error: Can not find entities in tokens.') + # print('Tokens:', data['text']) + # print('Entities":', sub, obj) + + data["ent_list"] = ent_list + data["spo_list"] = spo_list + data["ent_label"] = ent_label + data["spo_label"] = spo_label + + yield data + elif self.name == "CMeEE": + data_list = json.load(f) + for data in data_list: + text_len = len(data[input_keys[0]]) + if data.get("entities", None): + labels = [["O" for _ in range(text_len)], ["O" for _ in range(text_len)]] + idx_dict = [{}, {}] + for entity in data["entities"]: + start_idx = entity["start_idx"] + end_idx = entity["end_idx"] + etype = entity["type"] + ltype = int(etype == "sym") + if start_idx in idx_dict[ltype]: + if idx_dict[ltype][start_idx] >= end_idx: + continue + idx_dict[ltype][start_idx] = end_idx + if start_idx == end_idx: + labels[ltype][start_idx] = "S-" + etype + else: + labels[ltype][start_idx] = "B-" + etype + labels[ltype][end_idx] = "E-" + etype + for x in range(start_idx + 1, end_idx): + labels[ltype][x] = "I-" + etype + data.pop("entities") + data["labels"] = labels + yield data + elif self.name == "CHIP-CDN-2C": + data_keys = f.readline().strip().split("\t") + for data in f: + data = data.strip().split("\t") + data = dict([(k, v) for k, v in zip(data_keys, data)]) + yield data + else: + data_list = json.load(f) + for data in data_list: + if data.get("normalized_result", None): + data["labels"] = [x.strip('"') for x in data["normalized_result"].split("##")] + data.pop("normalized_result") + data["text_a"] = data[input_keys[0]] + data.pop(input_keys[0]) + if len(input_keys) > 1: + data["text_b"] = data[input_keys[1]] + data.pop(input_keys[1]) + yield data + + def get_labels(self): + """ + Returns labels of the CBLUE task. + """ + labels = self.BUILDER_CONFIGS[self.name]["labels"] + if isinstance(labels, str): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + label_dir = os.path.join(default_root, self.name) + if self.name == "CHIP-CDN": + name = [x for x in os.listdir(label_dir) if x.endswith(".xlsx")][0] + labels = pd.read_excel(os.path.join(label_dir, name), header=None) + return sorted(labels[1].values) + elif self.name == "CHIP-CTC": + labels = pd.read_excel(os.path.join(label_dir, labels)) + return sorted(labels["Label Name"].values) + else: + return self.BUILDER_CONFIGS[self.name]["labels"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp.py new file mode 100644 index 000000000..2dc48837a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
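+
+# Usage sketch (illustrative only; it assumes this builder is importable as
+# ``paddlenlp.datasets.chnsenticorp``, as in upstream PaddleNLP):
+#
+#     from paddlenlp.datasets import load_dataset
+#
+#     train_ds, dev_ds, test_ds = load_dataset("chnsenticorp", splits=("train", "dev", "test"))
+#
+# Every example is a dict with keys "text", "label" ("0"/"1"; empty for the test
+# split) and "qid", as produced by ``_read`` below.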
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["ChnSentiCorp"] + + +class ChnSentiCorp(DatasetBuilder): + """ + ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for + opinion mining) + + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/ChnSentiCorp.zip" + MD5 = "7ef61b08ad10fbddf2ba97613f071561" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO( + os.path.join("ChnSentiCorp", "ChnSentiCorp", "train.tsv"), "689360c4a4a9ce8d8719ed500ae80907" + ), + "dev": META_INFO(os.path.join("ChnSentiCorp", "ChnSentiCorp", "dev.tsv"), "20c77cc2371634731a367996b097ec0a"), + "test": META_INFO( + os.path.join("ChnSentiCorp", "ChnSentiCorp", "test.tsv"), "9b4dc7d1e4ada48c645b7e938592f49c" + ), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + head = None + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + if split == "train": + label, text = data + yield {"text": text, "label": label, "qid": ""} + elif split == "dev": + qid, label, text = data + yield {"text": text, "label": label, "qid": qid} + elif split == "test": + qid, text = data + yield {"text": text, "label": "", "qid": qid} + + def get_labels(self): + """ + Return labels of the ChnSentiCorp object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp_v2.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp_v2.py new file mode 100644 index 000000000..af7cfc8c0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/chnsenticorp_v2.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["ChnSentiCorpV2"] + + +class ChnSentiCorpV2(DatasetBuilder): + """ + ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for + opinion mining) + + """ + + URL = "https://paddlenlp.bj.bcebos.com/datasets/data-chnsenticorp.tar.gz" + MD5 = "e336e76d7be4ecd5479083d5b8f771e4" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("chnsenticorp", "train", "part.0"), "3fac2659547f1ddf90d223b8ed31f22f"), + "dev": META_INFO(os.path.join("chnsenticorp", "dev", "part.0"), "a3a853bfb3af4a592fc4df24b56c88a7"), + "test": META_INFO(os.path.join("chnsenticorp", "test", "part.0"), "6bfc8f35f523d2fdf12648d9d02778ff"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + head = True + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + if split == "train": + text, label = data + yield {"text": text, "label": label} + elif split == "dev": + text, label = data + yield {"text": text, "label": label} + elif split == "test": + text, label = data + yield {"text": text, "label": label} + + def get_labels(self): + """ + Return labels of the ChnSentiCorp object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/clue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/clue.py new file mode 100644 index 000000000..f862abb14 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/clue.py @@ -0,0 +1,271 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class Clue(DatasetBuilder): + """ + `ClUE `_ is the first large-scale Chinese + Language Understanding Evaluation(CLUE) benchmark. CLUE is an open-ended, + community-driven project that brings together 9 tasks spanning several + well-established single-sentence/sentence-pair classification tasks, as + well as machine reading comprehension, all on original Chinese text. 
+ + From https://github.com/CLUEbenchmark/CLUE + + AFQMC: + AFQMC: The Ant Financial Question Matching Corpus3 comes from Ant + Technology Exploration Conference (ATEC) Developer competition. It is + a binary classification task that aims to predict whether two sentences + are semantically similar. + + TNEWS: + TouTiao Text Classification for News Titles2 consists of Chinese news + published by TouTiao before May 2018, with a total of 73,360 titles. + Each title is labeled with one of 15 news categories (finance, + technology, sports, etc.) and the task is to predict which category the + title belongs to. + + IFLYTEK: + IFLYTEK contains 17,332 app descriptions. The task is to assign each + description into one of 119 categories, such as food, car rental, + education, etc. + + OCNLI: + Original Chinese Natural Language Inference is collected closely + following procedures of MNLI. OCNLI is composed of 56k inference pairs + from five genres: news, government, fiction, TV transcripts and + Telephone transcripts, where the premises are collected from Chinese + sources, and universities students in language majors are hired to + write the hypotheses. + + CMNLI: + Chinese Multi-Genre NLI. + + CLUEWSC2020: + The Chinese Winograd Schema Challenge dataset is an anaphora/ + coreference resolution task where the model is asked to decide whether + a pronoun and a noun (phrase) in a sentence co-refer (binary + classification), built following similar datasets in English. + + CSL: + Chinese Scientific Literature dataset contains Chinese paper abstracts + and their keywords from core journals of China, covering multiple + fields of natural sciences and social sciences. + + """ + + BUILDER_CONFIGS = { + "afqmc": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/afqmc_public.zip", + "md5": "3377b559bb4e61d03a35282550902ca0", + "splits": { + "train": [ + os.path.join("afqmc_public", "train.json"), + "319cf775353af9473140abca4052b89a", + ], + "dev": [ + os.path.join("afqmc_public", "dev.json"), + "307154b59cb6c3e68a0f39c310bbd364", + ], + "test": [ + os.path.join("afqmc_public", "test.json"), + "94b925f23a9615dd08199c4013f761f4", + ], + }, + "labels": ["0", "1"], + }, + "tnews": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/tnews_public.zip", + "md5": "38186ed0a751bc33e3ae0c1b59319777", + "splits": { + "train": [ + os.path.join("tnews_public", "train.json"), + "25c021725309a3330736380a230850fd", + ], + "dev": [ + os.path.join("tnews_public", "dev.json"), + "f0660a3339a32e764075c801b42ece3c", + ], + "test": [ + os.path.join("tnews_public", "test.json"), + "045a6c4f59bf1a066c4a0d7afe6cd2b4", + ], + "test1.0": [ + os.path.join("tnews_public", "test1.0.json"), + "2d1557c7548c72d5a84c47bbbd3a4e85", + ], + "labels": [ + os.path.join("tnews_public", "labels.json"), + "a1a7595e596b202556dedd2a20617769", + ], + }, + "labels": [ + "100", + "101", + "102", + "103", + "104", + "106", + "107", + "108", + "109", + "110", + "112", + "113", + "114", + "115", + "116", + ], + }, + "iflytek": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/iflytek_public.zip", + "md5": "19e4b19947db126f69aae18db0da2b87", + "splits": { + "train": [ + os.path.join("iflytek_public", "train.json"), + "fc9a21700c32ee3efee3fc283e9ac560", + ], + "dev": [ + os.path.join("iflytek_public", "dev.json"), + "79b7d95bddeb11cd54198fd077992704", + ], + "test": [ + os.path.join("iflytek_public", "test.json"), + "ea764519ddb4369767d07664afde3325", + ], + "labels": [ + os.path.join("iflytek_public", "labels.json"), + 
"7f9e794688ffb37fbd42b58325579fdf", + ], + }, + "labels": [str(i) for i in range(119)], + }, + "ocnli": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/ocnli_public.zip", + "md5": "acb426f6f3345076c6ce79239e7bc307", + "splits": { + "train": [ + os.path.join("ocnli_public", "train.50k.json"), + "d38ec492ef086a894211590a18ab7596", + ], + "dev": [ + os.path.join("ocnli_public", "dev.json"), + "3481b456bee57a3c9ded500fcff6834c", + ], + "test": [ + os.path.join("ocnli_public", "test.json"), + "680ff24e6b3419ff8823859bc17936aa", + ], + }, + "labels": ["entailment", "contradiction", "neutral"], + }, + "cmnli": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/cmnli_public.zip", + "md5": "e0e8caefd9b3491220c18b466233f2ff", + "splits": { + "train": [ + os.path.join("cmnli_public", "train.json"), + "7d02308650cd2a0e183bf599ca9bb263", + ], + "dev": [ + os.path.join("cmnli_public", "dev.json"), + "0b16a50a297a9afb1ce5385ee4dd3d9c", + ], + "test": [ + os.path.join("cmnli_public", "test.json"), + "804cb0bb67266983d59d1c855e6b03b0", + ], + }, + "labels": ["contradiction", "entailment", "neutral"], + }, + "cluewsc2020": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/cluewsc2020_public.zip", + "md5": "2e387e20e93eeab0ffaded5b0d2dfd3d", + "splits": { + "train": [ + os.path.join("cluewsc2020_public", "train.json"), + "afd235dcf8cdb89ee1a21d0a4823eecc", + ], + "dev": [ + os.path.join("cluewsc2020_public", "dev.json"), + "bad8cd6fa0916fc37ac96b8ce316714a", + ], + "test": [ + os.path.join("cluewsc2020_public", "test.json"), + "27614454cc26be6fcab5bbd9a45967ff", + ], + "test1.0": [ + os.path.join("cluewsc2020_public", "test1.0.json"), + "0e9e8ffd8ee90ddf1f58d6dc2e02de7b", + ], + }, + "labels": ["true", "false"], + }, + "csl": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/csl_public.zip", + "md5": "394a2ccbf6ddd7e331be4d5d7798f0f6", + "splits": { + "train": [ + os.path.join("csl_public", "train.json"), + "e927948b4e0eb4992fe9f45a77446bf5", + ], + "dev": [ + os.path.join("csl_public", "dev.json"), + "6c2ab8dd3b4785829ead94b05a1cb957", + ], + "test": [ + os.path.join("csl_public", "test.json"), + "ebfb89575355f00dcd9b18f8353547cd", + ], + }, + "labels": ["0", "1"], + }, + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + return fullname + + def _read(self, filename, split): + if self.name == "cmnli" and split == "dev" or self.name == "ocnli" and split in ["train", "dev"]: + with open(filename, "r", encoding="utf-8") as f: + for line in f: + example_dict = json.loads(line.rstrip()) + if example_dict["label"] == "-": + continue + yield example_dict + else: + with open(filename, "r", encoding="utf-8") as f: + for line in f: + yield json.loads(line.rstrip()) + + def get_labels(self): + """ + Returns labels of the Clue task. 
+ """ + return self.BUILDER_CONFIGS[self.name]["labels"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cmrc2018.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cmrc2018.py new file mode 100644 index 000000000..c5dfbbd2a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cmrc2018.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["CMRC2018"] + + +class CMRC2018(DatasetBuilder): + """ + This dataset is a Span-Extraction dataset for Chinese machine reading + comprehension. The dataset is composed by near 20,000 real questions + annotated on Wikipedia paragraphs by human experts. + """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("cmrc2018_train.json"), + "7fb714b479c7f40fbb16acabd7af0ede", + "https://bj.bcebos.com/paddlenlp/datasets/cmrc/cmrc2018_train.json", + ), + "dev": META_INFO( + os.path.join("cmrc2018_dev.json"), + "853b80709ff2d071f9fce196521b843c", + "https://bj.bcebos.com/paddlenlp/datasets/cmrc/cmrc2018_dev.json", + ), + "trial": META_INFO( + os.path.join("cmrc2018_trial.json"), + "070f8ade5b15cfdb095c1fcef9cf43c1", + "https://bj.bcebos.com/paddlenlp/datasets/cmrc/cmrc2018_trial.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"].strip() for answer in qa.get("answers", [])] + + yield { + "id": qas_id, + "title": title, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cnn_dailymail.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cnn_dailymail.py new file mode 100644 index 000000000..46ef30ea0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cnn_dailymail.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import hashlib +import os +import shutil + +from paddle.dataset.common import md5file +from paddle.utils.download import _decompress, _get_unique_endpoints, get_path_from_url + +try: + from paddle.distributed import ParallelEnv +except Exception: + import warnings + + warnings.warn("paddle.distributed is not contains in you paddle!") + +from ..utils.env import DATA_HOME +from ..utils.log import logger +from .dataset import DatasetBuilder + + +class CnnDailymail(DatasetBuilder): + """ + CNN/DailyMail non-anonymized summarization dataset. + The CNN / DailyMail Dataset is an English-language dataset containing + just over 300k unique news articles as written by journalists at CNN + nd the Daily Mail. The current version supports both extractive and + abstractive summarization, though the original version was created + for machine reading and comprehension and abstractive question answering. + + Version 1.0.0 aimed to support supervised neural methodologies for machine + reading and question answering with a large amount of real natural language + training data and released about 313k unique articles and nearly 1M Cloze + style questions to go with the articles. + Versions 2.0.0 and 3.0.0 changed the structure of the dataset to support + summarization rather than question answering. Version 3.0.0 provided a + non-anonymized version of the data, whereas both the previous versions were + preprocessed to replace named entities with unique identifier labels. + + An updated version of the code that does not anonymize the data is available + at https://github.com/abisee/cnn-dailymail. 
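+
+    A minimal loading sketch (illustrative only; the dataset name "cnn_dailymail"
+    and passing the version string through ``name`` follow upstream PaddleNLP
+    conventions)::
+
+        from paddlenlp.datasets import load_dataset
+
+        train_ds = load_dataset("cnn_dailymail", name="3.0.0", splits="train")
+        # every example is {"article": ..., "highlights": ..., "id": ...}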
+ """ + + lazy = False + META_INFO = collections.namedtuple("META_INFO", ("file", "url", "md5")) + SPLITS = { + "train": META_INFO( + "all_train.txt", + "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_train.txt", + "c8ca98cfcb6cf3f99a404552568490bc", + ), + "dev": META_INFO( + "all_val.txt", + "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_val.txt", + "83a3c483b3ed38b1392285bed668bfee", + ), + "test": META_INFO( + "all_test.txt", + "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_test.txt", + "4f3ac04669934dbc746b7061e68a0258", + ), + } + cnn_dailymail = { + "cnn": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/cnn_stories.tgz", + "md5": "85ac23a1926a831e8f46a6b8eaf57263", + "file_num": 92579, + }, + "dailymail": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/dailymail_stories.tgz", + "md5": "f9c5f565e8abe86c38bfa4ae8f96fd72", + "file_num": 219506, + }, + } + + def _read_text_file(self, text_file): + lines = [] + with open(text_file, "r", encoding="utf8") as f: + for line in f: + lines.append(line.strip()) + return lines + + def _get_url_hashes(self, path): + """Get hashes of urls in file.""" + urls = self._read_text_file(path) + + def url_hash(u): + h = hashlib.sha1() + try: + u = u.encode("utf-8") + except UnicodeDecodeError: + logger.error("Cannot hash url: %s", u) + h.update(u) + return h.hexdigest() + + return {url_hash(u): True for u in urls} + + def _get_hash_from_path(self, p): + """Extract hash from path.""" + basename = os.path.basename(p) + return basename[0 : basename.find(".story")] + + def _find_files(self, dl_paths, publisher, url_dict): + """Find files corresponding to urls.""" + if publisher == "cnn": + top_dir = os.path.join(dl_paths["cnn"], "stories") + elif publisher == "dailymail": + top_dir = os.path.join(dl_paths["dailymail"], "stories") + else: + logger.error("Unsupported publisher: %s", publisher) + files = sorted(os.listdir(top_dir)) + + ret_files = [] + for p in files: + if self._get_hash_from_path(p) in url_dict: + ret_files.append(os.path.join(top_dir, p)) + return ret_files + + def _subset_filenames(self, dl_paths, split): + """Get filenames for a particular split.""" + # Get filenames for a split. + urls = self._get_url_hashes(dl_paths[split]) + cnn = self._find_files(dl_paths, "cnn", urls) + dm = self._find_files(dl_paths, "dailymail", urls) + return cnn + dm + + def _get_art_abs(self, story_file, version): + """Get abstract (highlights) and article from a story file path.""" + # Based on https://github.com/abisee/cnn-dailymail/blob/master/ + # make_datafiles.py + + lines = self._read_text_file(story_file) + + # The github code lowercase the text and we removed it in 3.0.0. + + # Put periods on the ends of lines that are missing them + # (this is a problem in the dataset because many image captions don't end in + # periods; consequently they end up in the body of the article as run-on + # sentences) + def fix_missing_period(line): + """Adds a period to a line that is missing a period.""" + if "@highlight" in line: + return line + if not line: + return line + if line[-1] in [".", "!", "?", "...", "'", "`", '"', "\u2019", "\u201d", ")"]: + return line + return line + " ." 
+ + lines = [fix_missing_period(line) for line in lines] + + # Separate out article and abstract sentences + article_lines = [] + highlights = [] + next_is_highlight = False + for line in lines: + if not line: + continue # empty line + elif line.startswith("@highlight"): + next_is_highlight = True + elif next_is_highlight: + highlights.append(line) + else: + article_lines.append(line) + + # Make article into a single string + article = " ".join(article_lines) + + if version >= "2.0.0": + abstract = "\n".join(highlights) + else: + abstract = " ".join(highlights) + + return article, abstract + + def _get_data(self, mode): + """Check and download Dataset""" + dl_paths = {} + version = self.name + if version is None: + version = "3.0.0" + if version not in ["1.0.0", "2.0.0", "3.0.0"]: + raise ValueError("Unsupported version: %s" % version) + dl_paths["version"] = version + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + for k, v in self.cnn_dailymail.items(): + dir_path = os.path.join(default_root, k) + if not os.path.exists(dir_path): + get_path_from_url(v["url"], default_root, v["md5"]) + unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:]) + if ParallelEnv().current_endpoint in unique_endpoints: + file_num = len(os.listdir(os.path.join(dir_path, "stories"))) + if file_num != v["file_num"]: + logger.warning( + "Number of %s stories is %d != %d, decompress again." % (k, file_num, v["file_num"]) + ) + shutil.rmtree(os.path.join(dir_path, "stories")) + _decompress(os.path.join(default_root, os.path.basename(v["url"]))) + dl_paths[k] = dir_path + filename, url, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(url, default_root, data_hash) + dl_paths[mode] = fullname + return dl_paths + + def _read(self, dl_paths, split): + files = self._subset_filenames(dl_paths, split) + for p in files: + article, highlights = self._get_art_abs(p, dl_paths["version"]) + if not article or not highlights: + continue + yield { + "article": article, + "highlights": highlights, + "id": self._get_hash_from_path(p), + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/conll2002.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/conll2002.py new file mode 100644 index 000000000..333708cd0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/conll2002.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class Conll2002(DatasetBuilder): + """ + Named entities are phrases that contain the names of persons, organizations, + locations, times and quantities. 
Example: [PER Wolff] , currently a journalist + in [LOC Argentina] , played with [PER Del Bosque] in the final years of the seventies in [ORG Real Madrid] . + The shared task of CoNLL-2002 concerns language-independent named entity recognition. + We will concentrate on four types of named entities: persons, locations, organizations and names of + miscellaneous entities that do not belong to the previous three groups. The participants of the + shared task will be offered training and test data for at least two languages. + They will use the data for developing a named-entity recognition system that includes a machine learning component. + Information sources other than the training data may be used in this shared task. We are especially interested + in methods that can use additional unannotated data for improving their performance (for example co-training). + For more details see https://www.clips.uantwerpen.be/conll2002/ner/ + and https://www.aclweb.org/anthology/W02-2024/ + """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "url", "md5")) + BASE_URL = "https://bj.bcebos.com/paddlenlp/datasets/conll2002/" + BUILDER_CONFIGS = { + "es": { + "splits": { + "train": META_INFO("esp.train", BASE_URL + "esp.train", "c8c6b342371b9de2f83a93767d352c17"), + "dev": META_INFO("esp.testa", BASE_URL + "esp.testa", "de0578160dde26ec68cc580595587dde"), + "test": META_INFO("esp.testb", BASE_URL + "esp.testb", "c8d35f340685a2ce6559ee90d78f9e37"), + }, + "pos_tags": [ + "AO", + "AQ", + "CC", + "CS", + "DA", + "DE", + "DD", + "DI", + "DN", + "DP", + "DT", + "Faa", + "Fat", + "Fc", + "Fd", + "Fe", + "Fg", + "Fh", + "Fia", + "Fit", + "Fp", + "Fpa", + "Fpt", + "Fs", + "Ft", + "Fx", + "Fz", + "I", + "NC", + "NP", + "P0", + "PD", + "PI", + "PN", + "PP", + "PR", + "PT", + "PX", + "RG", + "RN", + "SP", + "VAI", + "VAM", + "VAN", + "VAP", + "VAS", + "VMG", + "VMI", + "VMM", + "VMN", + "VMP", + "VMS", + "VSG", + "VSI", + "VSM", + "VSN", + "VSP", + "VSS", + "Y", + "Z", + ], + }, + "nl": { + "splits": { + "train": META_INFO("ned.train", BASE_URL + "ned.train", "b6189d04eb34597d2a98ca5cec477605"), + "dev": META_INFO("ned.testa", BASE_URL + "ned.testa", "626900497823fdbc4f84335518cb85ce"), + "test": META_INFO("ned.testb", BASE_URL + "ned.testb", "c37de92da20c68c6418a73dd42e322dc"), + }, + "pos_tags": ["Adj", "Adv", "Art", "Conj", "Int", "Misc", "N", "Num", "Prep", "Pron", "Punc", "V"], + }, + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, url, data_hash = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(url, default_root, data_hash) + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as f: + tokens = [] + ner_tags = [] + pos_tags = [] + for line in f.readlines(): + if line.startswith("-DOCSTART-") or line == "" or line == "\n": + if tokens: + yield {"tokens": tokens, "ner_tags": ner_tags, "pos_tags": pos_tags} + tokens = [] + ner_tags = [] + pos_tags = [] + else: + # conll2002 tokens are space separated + splits = line.split(" ") + tokens.append(splits[0]) + pos_tags.append(splits[1]) + ner_tags.append(splits[2].rstrip()) + # last example + yield {"tokens": tokens, "ner_tags": ner_tags, "pos_tags": pos_tags} + + def get_labels(self): + """ + Returns labels of ner tags and pos tags. 
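+
+        The first element is the BIO named-entity tag set shared by both language
+        configs; the second is the POS tag set of the selected config (``"es"`` or
+        ``"nl"``) taken from ``BUILDER_CONFIGS`` above.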
+ """ + return ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"], self.BUILDER_CONFIGS[ + self.name + ]["pos_tags"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cote.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cote.py new file mode 100644 index 000000000..18f6e9444 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/cote.py @@ -0,0 +1,106 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["Cote"] + + +class Cote(DatasetBuilder): + """ + COTE_DP/COTE-BD/COTE-MFW dataset for Opinion Role Labeling task. + More information please refer to https://aistudio.baidu.com/aistudio/competition/detail/50/?isFromLuge=1. + + """ + + BUILDER_CONFIGS = { + "dp": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/COTE-DP.zip", + "md5": "a73d4170a283a2264a41c3ee9eb4d262", + "splits": { + "train": [os.path.join("COTE-DP", "train.tsv"), "17d11ca91b7979f2c2023757650096e5"], + "test": [os.path.join("COTE-DP", "test.tsv"), "5bb9b9ccaaee6bcc1ac7a6c852b46f66"], + }, + "labels": ["B", "I", "O"], + }, + "bd": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/COTE-BD.zip", + "md5": "8d87ff9bb6f5e5d46269d72632a1b01f", + "splits": { + "train": [os.path.join("COTE-BD", "train.tsv"), "4c08ccbcc373cb3bf05c3429d435f608"], + "test": [os.path.join("COTE-BD", "test.tsv"), "aeb5c9af61488dadb12cbcc1d2180667"], + }, + "labels": ["B", "I", "O"], + }, + "mfw": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/COTE-MFW.zip", + "md5": "c85326bf2be4424d03373ea70cb32c3f", + "splits": { + "train": [os.path.join("COTE-MFW", "train.tsv"), "01fc90b9098d35615df6b8d257eb46ca"], + "test": [os.path.join("COTE-MFW", "test.tsv"), "c61a475917a461089db141c59c688343"], + }, + "labels": ["B", "I", "O"], + }, + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + builder_config = self.BUILDER_CONFIGS[self.name] + default_root = os.path.join(DATA_HOME, f"COTE-{self.name.upper()}") + filename, data_hash = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + url = builder_config["url"] + md5 = builder_config["md5"] + get_path_from_url(url, DATA_HOME, md5) + + return fullname + + def _read(self, filename, split): + """Reads data""" + with open(filename, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if idx == 0: + # ignore first line about title + continue + line_stripped = line.strip().split("\t") + if not line_stripped: + continue + if split == "test": + yield {"tokens": list(line_stripped[1])} + else: + try: + entity, text = line_stripped[0], line_stripped[1] + start_idx = text.index(entity) + except Exception: 
+ # drop the dirty data + continue + + labels = ["O"] * len(text) + labels[start_idx] = "B" + for idx in range(start_idx + 1, start_idx + len(entity)): + labels[idx] = "I" + yield {"tokens": list(text), "labels": labels, "entity": entity} + + def get_labels(self): + """ + Return labels of the COTE. + """ + return self.BUILDER_CONFIGS[self.name]["labels"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/couplet.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/couplet.py new file mode 100644 index 000000000..390584d7e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/couplet.py @@ -0,0 +1,101 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["Couplet"] + + +class Couplet(DatasetBuilder): + """ + Couplet dataset. The couplet data is from this github repository: + https://github.com/v-zich/couplet-clean-dataset, which filters dirty data + from the original repository https://github.com/wb14123/couplet-dataset. 
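+
+    A minimal loading sketch (illustrative only; it assumes this builder is
+    importable as ``paddlenlp.datasets.couplet``, as in upstream PaddleNLP)::
+
+        from paddlenlp.datasets import load_dataset
+
+        train_ds, dev_ds, test_ds = load_dataset("couplet", splits=("train", "dev", "test"))
+        # every example is a dict with keys "first" (source line) and "second" (target line)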
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/couplet.tar.gz" + META_INFO = collections.namedtuple("META_INFO", ("src_file", "tgt_file", "src_md5", "tgt_md5")) + MD5 = "5c0dcde8eec6a517492227041c2e2d54" + SPLITS = { + "train": META_INFO( + os.path.join("couplet", "train_src.tsv"), + os.path.join("couplet", "train_tgt.tsv"), + "ad137385ad5e264ac4a54fe8c95d1583", + "daf4dd79dbf26040696eee0d645ef5ad", + ), + "dev": META_INFO( + os.path.join("couplet", "dev_src.tsv"), + os.path.join("couplet", "dev_tgt.tsv"), + "65bf9e72fa8fdf0482751c1fd6b6833c", + "3bc3b300b19d170923edfa8491352951", + ), + "test": META_INFO( + os.path.join("couplet", "test_src.tsv"), + os.path.join("couplet", "test_tgt.tsv"), + "f0a7366dfa0acac884b9f4901aac2cc1", + "56664bff3f2edfd7a751a55a689f90c2", + ), + } + VOCAB_INFO = (os.path.join("couplet", "vocab.txt"), "0bea1445c7c7fb659b856bb07e54a604") + UNK_TOKEN = "" + BOS_TOKEN = "" + EOS_TOKEN = "" + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + src_filename, tgt_filename, src_data_hash, tgt_data_hash = self.SPLITS[mode] + src_fullname = os.path.join(default_root, src_filename) + tgt_fullname = os.path.join(default_root, tgt_filename) + + vocab_filename, vocab_hash = self.VOCAB_INFO + vocab_fullname = os.path.join(default_root, vocab_filename) + + if ( + (not os.path.exists(src_fullname) or (src_data_hash and not md5file(src_fullname) == src_data_hash)) + or (not os.path.exists(tgt_fullname) or (tgt_data_hash and not md5file(tgt_fullname) == tgt_data_hash)) + or (not os.path.exists(vocab_fullname) or (vocab_hash and not md5file(vocab_fullname) == vocab_hash)) + ): + get_path_from_url(self.URL, default_root, self.MD5) + + return src_fullname, tgt_fullname + + def _read(self, filename, *args): + src_filename, tgt_filename = filename + with open(src_filename, "r", encoding="utf-8") as src_f: + with open(tgt_filename, "r", encoding="utf-8") as tgt_f: + for src_line, tgt_line in zip(src_f, tgt_f): + src_line = src_line.strip() + tgt_line = tgt_line.strip() + if not src_line and not tgt_line: + continue + yield {"first": src_line, "second": tgt_line} + + def get_vocab(self): + vocab_fullname = os.path.join(DATA_HOME, self.__class__.__name__, self.VOCAB_INFO[0]) + + # Construct vocab_info to match the form of the input of `Vocab.load_vocabulary()` function + vocab_info = { + "filepath": vocab_fullname, + "unk_token": self.UNK_TOKEN, + "bos_token": self.BOS_TOKEN, + "eos_token": self.EOS_TOKEN, + } + return vocab_info diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dataset.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dataset.py new file mode 100644 index 000000000..7bf1b8368 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dataset.py @@ -0,0 +1,781 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import atexit +import inspect +import os +import time +import warnings +from collections import namedtuple +from itertools import islice + +import datasets +from multiprocess import Pool, RLock + +import paddlenlp + +try: + import paddle.distributed as dist +except Exception: + warnings.warn("paddle.distributed is not contains in you paddle!") + +import importlib +from functools import partial + +from paddle.io import Dataset, IterableDataset +from paddle.utils.download import _get_unique_endpoints + +from paddlenlp.utils.env import DATA_HOME + +__all__ = ["MapDataset", "DatasetBuilder", "IterDataset", "load_dataset"] + +DATASETS_MODULE_PATH = "paddlenlp.datasets." + +# Patch for intranet +from datasets import load_dataset as origin_load_dataset # noqa: E402 + + +def load_from_ppnlp(path, *args, **kwargs): + ppnlp_path = paddlenlp.datasets.__path__[0] + new_path = os.path.split(path)[-1] + new_path = os.path.join(ppnlp_path, "hf_datasets", new_path + ".py") + if os.path.exists(new_path): + return origin_load_dataset(new_path, trust_remote_code=True, *args, **kwargs) + else: + return origin_load_dataset(path, trust_remote_code=True, *args, **kwargs) + + +datasets.load_dataset = load_from_ppnlp + + +class DatasetTuple: + def __init__(self, splits): + self.identifier_map, identifiers = self._gen_identifier_map(splits) + self.tuple_cls = namedtuple("datasets", identifiers) + self.tuple = self.tuple_cls(*[None for _ in splits]) + + def __getitem__(self, key): + if isinstance(key, (int, slice)): + return self.tuple[key] + if isinstance(key, str): + return getattr(self.tuple, self.identifier_map[key]) + + def __setitem__(self, key, value): + self.tuple = self.tuple._replace(**{self.identifier_map[key]: value}) + + def _gen_identifier_map(self, splits): + identifier_map = {} + identifiers = [] + for i in range(len(splits)): + identifiers.append("splits_" + str(i)) + identifier_map[splits[i]] = "splits_" + str(i) + return identifier_map, identifiers + + def __len__(self): + return len(self.tuple) + + +def import_main_class(module_path): + """ + Import a module at module_path and return its DatasetBuilder class. 
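+
+    For example, ``import_main_class("clue")`` imports ``paddlenlp.datasets.clue``
+    and returns the ``Clue`` builder class defined in that module.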
+ + """ + module_path = DATASETS_MODULE_PATH + module_path + module = importlib.import_module(module_path) + main_cls_type = DatasetBuilder + + # Find the main class in our imported module + module_main_cls = None + for name, obj in module.__dict__.items(): + if isinstance(obj, type) and issubclass(obj, main_cls_type): + if name == "DatasetBuilder": + continue + module_main_cls = obj + break + + return module_main_cls + + +def load_from_hf(path, name=None, splits=None, **kwargs): + from datasets import DatasetDict, IterableDatasetDict + from datasets import load_dataset as load_hf_dataset + from datasets.features import ClassLabel + + try: + hf_datasets = load_hf_dataset(path, name=name, split=splits, **kwargs) + except FileNotFoundError: + raise FileNotFoundError("Couldn't find the dataset script for '" + path + "' on PaddleNLP or HuggingFace") + else: + label_list = [] + if isinstance(hf_datasets, DatasetDict): + datasets = DatasetTuple(list(hf_datasets.keys())) + for split, ds in hf_datasets.items(): + for feature in ds.features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets[split] = MapDataset(ds, label_list=label_list) + elif isinstance(hf_datasets, IterableDatasetDict): + datasets = DatasetTuple(list(hf_datasets.keys())) + for split, ds in hf_datasets.items(): + datasets[split] = IterDataset(ds) + elif isinstance(hf_datasets, list): + datasets = DatasetTuple(splits) + for i, split in enumerate(splits): + for feature in hf_datasets[i].features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets[split] = MapDataset(hf_datasets[i], label_list=label_list) + else: + for feature in hf_datasets.features.values(): + if isinstance(feature, ClassLabel): + label_list = feature.names + datasets = MapDataset(hf_datasets, label_list=label_list) + return datasets + + +def load_dataset(path_or_read_func, name=None, data_files=None, splits=None, lazy=None, **kwargs): + """ + This method will load a dataset, either form PaddleNLP library or from a + self-defined data loading script, by calling functions in `DatasetBuilder`. + + For all the names of datasets in PaddleNLP library, see here: `dataset_list + `__. + + Either `splits` or `data_files` must be specified. + + Args: + path_or_read_func (str|callable): Name of the dataset processing script + in PaddleNLP library or a custom data reading function. + name (str, optional): Additional name to select a more specific dataset. + Defaults to None. + data_files (str|list|tuple|dict, optional): Defining the path of dataset + files. If None. `splits` must be specified. Defaults to None. + splits (str|list|tuple, optional): Which split of the data to load. If None. + `data_files` must be specified. Defaults to None. + lazy (bool, optional): Weather to return `MapDataset` or an `IterDataset`. + True for `IterDataset`. False for `MapDataset`. If None, return the + default type of this dataset. Defaults to None. + kwargs (dict): Other keyword arguments to be passed to the `DatasetBuilder`. + + Returns: + A `MapDataset` or `IterDataset` or a tuple of those. + + For how to use this function, please see `dataset_load + `__ + and `dataset_self_defined + `__ + + """ + if inspect.isfunction(path_or_read_func): + assert lazy is not None, "lazy can not be None in custom mode." 
+ kwargs["name"] = name + kwargs["data_files"] = data_files + kwargs["splits"] = splits + custom_kwargs = {} + for name in inspect.signature(path_or_read_func).parameters.keys(): + if name in kwargs.keys(): + custom_kwargs[name] = kwargs[name] + + reader_instance = SimpleBuilder(lazy=lazy, read_func=path_or_read_func) + return reader_instance.read(**custom_kwargs) + else: + try: + reader_cls = import_main_class(path_or_read_func) + except ModuleNotFoundError: + datasets = load_from_hf( + path_or_read_func, name=name, splits=splits, data_files=data_files, streaming=lazy, **kwargs + ) + else: + reader_instance = reader_cls(lazy=lazy, name=name, **kwargs) + + # Check if selected name and split is valid in this DatasetBuilder + if hasattr(reader_instance, "BUILDER_CONFIGS"): + if name in reader_cls.BUILDER_CONFIGS.keys(): + split_names = reader_cls.BUILDER_CONFIGS[name]["splits"].keys() + else: + raise ValueError( + 'Invalid name "{}". Should be one of {}.'.format(name, list(reader_cls.BUILDER_CONFIGS.keys())) + ) + elif hasattr(reader_instance, "SPLITS"): + split_names = reader_instance.SPLITS.keys() + else: + raise AttributeError("Either 'SPLITS' or 'BUILDER_CONFIGS' must be implemented for DatasetBuilder.") + + selected_splits = [] + if isinstance(splits, list) or isinstance(splits, tuple): + selected_splits.extend(splits) + else: + selected_splits += [splits] + + for split_name in selected_splits: + if split_name not in split_names and split_name is not None: + raise ValueError('Invalid split "{}". Should be one of {}.'.format(split_name, list(split_names))) + + datasets = reader_instance.read_datasets(data_files=data_files, splits=splits) + return datasets + + +class MapDataset(Dataset): + """ + Wraps a map-style dataset-like object as an instance of `MapDataset`, and equips it + with `map` and other utility methods. All non-magic methods of the raw object + are also accessible. + + Args: + data (list|Dataset): An object with `__getitem__` and `__len__` methods. It could + be a list or a subclass of `paddle.io.Dataset`. + kwargs (dict, optional): Other information to be passed to the dataset. + + For examples of this class, please see `dataset_self_defined + `__. + + """ + + def __init__(self, data, **kwargs): + self.data = data + self._transform_pipline = [] + self.new_data = self.data + self.info = kwargs + self.label_list = self.info.pop("label_list", None) + self.vocab_info = self.info.pop("vocab_info", None) + + def _transform(self, data): + for fn in self._transform_pipline: + data = fn(data) + return data + + def __getitem__(self, idx): + """ + Basic function of `MapDataset` to get sample from dataset with a given + index. + """ + return self._transform(self.new_data[idx]) if self._transform_pipline else self.new_data[idx] + + def __len__(self): + """ + Returns the number of samples in dataset. + """ + return len(self.new_data) + + def filter(self, fn, num_workers=0): + """ + Filters samples by the filter function and uses the filtered data to + update this dataset. + + Args: + fn (callable): A filter function that takes a sample as input and + returns a boolean. Samples that return False would be discarded. + num_workers(int, optional): Number of processes for multiprocessing. If + set to 0, it doesn't use multiprocessing. Defaults to `0`. 
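+
+        Example (a small self-contained sketch)::
+
+            ds = MapDataset([{"label": 0}, {"label": 1}, {"label": 1}])
+            ds.filter(lambda example: example["label"] == 1)
+            assert len(ds) == 2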
+ """ + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 1: + shards = [ + self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers) + ] + kwds_per_shard = [dict(self=shards[rank], fn=fn) for rank in range(num_workers)] + pool = Pool(num_workers, initargs=(RLock(),)) + + results = [pool.apply_async(self.__class__._filter, kwds=kwds) for kwds in kwds_per_shard] + transformed_shards = [r.get() for r in results] + + pool.close() + pool.join() + self.new_data = [] + for i in range(num_workers): + self.new_data += transformed_shards[i].new_data + return self + else: + return self._filter(fn) + + def _filter(self, fn): + self.new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if fn(self.new_data[idx])] + return self + + def shard(self, num_shards=None, index=None, contiguous=False): + self.new_data = self._shard(num_shards=num_shards, index=index, contiguous=contiguous).data + return self + + def _shard(self, num_shards=None, index=None, contiguous=False): + """ + Split the dataset into `num_shards` pieces. Note that the size of each + shard might be different because the original dataset may not be evenly + divisible. + + Args: + num_shards (int, optional): An integer representing the number of + data shards. If None, `num_shards` would be number of trainers. + Defaults to `None`. + index (int, optional): An integer representing the index of the + current shard. If None, `index` would be the current trainer rank + id. Defaults to `None`. + contiguous: (bool, optional): If true, contiguous chunks of data + will be select for sharding. And total number of examples will + be the same. Otherwise each shard will contain all examples of + dataset whose index mod `num_shards` = `index`. Defaults to `False`. + """ + if num_shards is None: + num_shards = dist.get_world_size() + if index is None: + index = dist.get_rank() + + if contiguous: + div = len(self) // num_shards + mod = len(self) % num_shards + start = div * index + min(index, mod) + end = start + div + (1 if index < mod else 0) + new_data = [self.new_data[idx] for idx in range(start, end)] + else: + new_data = [self.new_data[idx] for idx in range(len(self.new_data)) if idx % num_shards == index] + + return MapDataset(new_data) + + def map(self, fn, lazy=True, batched=False, num_workers=0): + """ + Performs specific function on the dataset to transform and update every sample. + + Args: + fn (callable): Transformations to be performed. It receives single + sample as argument if batched is False. Else it receives all examples. + lazy (bool, optional): If True, transformations would be delayed and + performed on demand. Otherwise, transforms all samples at once. Note that + if `fn` is stochastic, `lazy` should be True or you will get the same + result on all epochs. Defaults to False. + batched(bool, optional): If True, transformations would take all examples as + input and return a collection of transformed examples. Note that if set + True, `lazy` option would be ignored. Defaults to False. + num_workers(int, optional): Number of processes for multiprocessing. If + set to 0, it doesn't use multiprocessing. Note that if set to positive + value, `lazy` option would be ignored. Defaults to 0. 
+ """ + + assert num_workers >= 0, "num_workers should be a non-negative value" + if num_workers > 1: + shards = [ + self._shard(num_shards=num_workers, index=index, contiguous=True) for index in range(num_workers) + ] + kwds_per_shard = [ + dict(self=shards[rank], fn=fn, lazy=False, batched=batched) for rank in range(num_workers) + ] + pool = Pool(num_workers, initargs=(RLock(),)) + results = [pool.apply_async(self.__class__._map, kwds=kwds) for kwds in kwds_per_shard] + transformed_shards = [r.get() for r in results] + pool.close() + pool.join() + self.new_data = [] + for i in range(num_workers): + self.new_data += transformed_shards[i].new_data + return self + else: + return self._map(fn, lazy=lazy, batched=batched) + + def _map(self, fn, lazy=True, batched=False): + if batched: + self.new_data = fn(self.new_data) + elif lazy: + self._transform_pipline.append(fn) + else: + self.new_data = [fn(self.new_data[idx]) for idx in range(len(self.new_data))] + return self + + +class IterDataset(IterableDataset): + """ + Wraps a dataset-like object as an instance of `IterDataset`, and equips it with + `map` and other utility methods. All non-magic methods of the raw object + also accessible. + + Args: + data (Iterable): An object with `__iter__` function. It can be a Iterable or a + subclass of `paddle.io.IterableDataset`. + kwargs (dict, optional): Other information to be passed to the dataset. + + For examples of this class, please see `dataset_self_defined + `__. + """ + + def __init__(self, data, **kwargs): + self.data = data + self._transform_pipline = [] + self._filter_pipline = [] + + self.label_list = kwargs.pop("label_list", None) + self.vocab_info = kwargs.pop("vocab_info", None) + + def _transform(self, data): + for fn in self._transform_pipline: + data = fn(data) + return data + + def _shard_filter(self, num_samples): + return True + + def _filter(self, data): + for fn in self._filter_pipline: + if not fn(data): + return False + return True + + def __iter__(self): + """ + yields sample sequentially. + """ + num_samples = 0 + if inspect.isfunction(self.data): + for example in self.data(): + if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter( + num_samples=num_samples + ): + yield self._transform(example) if self._transform_pipline else example + num_samples += 1 + else: + if inspect.isgenerator(self.data): + warnings.warn("Reciving generator as data source, data can only be iterated once") + for example in self.data: + if (not self._filter_pipline or self._filter(self._filter_pipline)) and self._shard_filter( + num_samples=num_samples + ): + yield self._transform(example) if self._transform_pipline else example + num_samples += 1 + + def skip(self, n): + if inspect.isfunction(self.data): + raise NotImplementedError("Function-based IterDataset does not support `.skip()`") + self.data = islice(self.data, n, None) + return self + + def filter(self, fn): + """ + Filters samples by the filter function and uses the filtered data to + update this dataset. + + Args: + fn (callable): A filter function that takes a sample as input and + returns a boolean. Samples that return False are discarded. + """ + + self._filter_pipline.append(fn) + + return self + + def shard(self, num_shards=None, index=None): + """ + Split the dataset into `num_shards` pieces. + + Args: + num_shards (int, optional): An integer representing the number of + data shards. If None, `num_shards` would be number of trainers. + Defaults to None. 
+ index (int, optional): An integer representing the index of the + current shard. If None, `index` would be the current trainer rank + id. Defaults to None. + """ + if num_shards is None: + num_shards = dist.get_world_size() + if index is None: + index = dist.get_rank() + + def sharder(num_shards, index, num_samples): + if num_samples % num_shards == index: + return True + else: + return False + + fn = partial(sharder, num_shards=num_shards, index=index) + self._shard_filter = fn + return self + + def map(self, fn): + """ + Performs specific function on the dataset to transform and update every sample. + + Args: + fn (callable): Transformations to be performed. It receives single + sample as argument. + """ + + self._transform_pipline.append(fn) + + return self + + +class DatasetBuilder: + """ + A base class for all DatasetBuilder. It provides a `read()` function to turn + a data file into a MapDataset or IterDataset. + + `_get_data()` function and `_read()` function should be implemented to download + data file and read data file into a `Iterable` of the examples. + + For how to define a custom `DatasetBuilder`, please see `contribute_dataset + `__. + """ + + lazy = False + + def __init__(self, lazy=None, name=None, **config): + if lazy is not None: + self.lazy = lazy + self.name = name + self.config = config + + def read_datasets(self, splits=None, data_files=None): + def remove_if_exit(filepath): + if isinstance(filepath, (list, tuple)): + for file in filepath: + try: + os.remove(file) + except OSError: + pass + else: + try: + os.remove(filepath) + except OSError: + pass + + if data_files is None: + if splits is None: + splits = ( + list(self.BUILDER_CONFIGS[self.name]["splits"].keys()) + if hasattr(self, "BUILDER_CONFIGS") + else list(self.SPLITS.keys()) + ) + + assert ( + isinstance(splits, str) + or (isinstance(splits, list) and isinstance(splits[0], str)) + or (isinstance(splits, tuple) and isinstance(splits[0], str)) + ), "`splits` should be a string or list of string or a tuple of string." + + if isinstance(splits, str): + splits = [splits] + datasets = DatasetTuple(splits) + parallel_env = dist.ParallelEnv() + unique_endpoints = _get_unique_endpoints(parallel_env.trainer_endpoints[:]) + # move register hook to first and register togather + lock_files = [] + for split in splits: + lock_file = os.path.join(DATA_HOME, self.__class__.__name__) + if self.name is not None: + lock_file = lock_file + "." + self.name + lock_file += "." + split + ".done" + "." + str(os.getppid()) + lock_files.append(lock_file) + # Must register to all procs to make the lock file can be removed + # when any proc breaks. Otherwise, the single registered proc may + # not receive proper singal send by the parent proc to exit. + atexit.register(lambda: remove_if_exit(lock_files)) + for split in splits: + filename = self._get_data(split) + lock_file = os.path.join(DATA_HOME, self.__class__.__name__) + if self.name is not None: + lock_file = lock_file + "." + self.name + lock_file += "." + split + ".done" + "." + str(os.getppid()) + # `lock_file` indicates the finished status of`_get_data`. + # `_get_data` only works in the `unique_endpoints` specified + # proc since `get_path_from_url` only work for it. The other + # procs wait `_get_data` to be finished. 
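The streaming counterpart can be sketched with a callable data source, which keeps the dataset re-iterable (a bare generator is consumed once, as the warning above notes). One caveat: `__iter__` above appears to pass `self._filter_pipline` rather than the example into `self._filter`, so this sketch sticks to `map` and `shard`. Import path assumed.

from paddlenlp.datasets import IterDataset  # import path assumed

def stream():
    # Callable source: IterDataset calls it again on every fresh iteration.
    for i in range(6):
        yield {"idx": i, "text": "sample-%d" % i}

ds = IterDataset(stream)
ds.map(lambda ex: {**ex, "text": ex["text"].upper()})
ds.shard(num_shards=2, index=0)      # keep roughly every other yielded example

for example in ds:
    print(example["idx"], example["text"])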
+ if parallel_env.current_endpoint in unique_endpoints: + f = open(lock_file, "w") + f.close() + else: + while not os.path.exists(lock_file): + time.sleep(1) + datasets[split] = self.read(filename=filename, split=split) + else: + assert ( + isinstance(data_files, str) or isinstance(data_files, tuple) or isinstance(data_files, list) + ), "`data_files` should be a string or tuple or list of strings." + if isinstance(data_files, str): + data_files = [data_files] + default_split = "train" + if splits: + if isinstance(splits, str): + splits = [splits] + datasets = DatasetTuple(splits) + assert len(splits) == len( + data_files + ), "Number of `splits` and number of `data_files` should be the same if you want to specify the split of loacl data file." + for i in range(len(data_files)): + datasets[splits[i]] = self.read(filename=data_files[i], split=splits[i]) + else: + datasets = DatasetTuple(["split" + str(i) for i in range(len(data_files))]) + for i in range(len(data_files)): + datasets["split" + str(i)] = self.read(filename=data_files[i], split=default_split) + + return datasets if len(datasets) > 1 else datasets[0] + + def read(self, filename, split="train"): + """ + Returns a dataset containing all the examples that can be read from the file path. + + If `self.lazy` is False, this eagerly reads all instances from `self._read()` + and returns a `MapDataset`. + + If `self.lazy` is True, this returns an `IterDataset`, which internally + relies on the generator created from `self._read()` to lazily produce examples. + In this case your implementation of `_read()` must also be lazy + (that is, not load all examples into memory at once). + + Args: + filename (str): Path of data file to read, usually provided by `_get_data` + function. + split (str, optional): The split name of selected dataset. This only makes + a different when data files of different splits have different structures. + + Returns: + A `MapDataset|IterDataset`. + """ + + label_list = self.get_labels() + vocab_info = self.get_vocab() + + def _create_dict(labels): + # For multiple labels in the form of list. + if isinstance(labels[0], list) or isinstance(labels[0], tuple): + label_dict = [] + for sub_labels in labels: + sub_dict = {} + for i, label in enumerate(sub_labels): + sub_dict[label] = i + label_dict.append(sub_dict) + else: + label_dict = {} + for i, label in enumerate(labels): + label_dict[label] = i + return label_dict + + def _convert_label_to_id(labels, label_dict): + if isinstance(labels, list) or isinstance(labels, tuple): + for label_idx in range(len(labels)): + labels[label_idx] = label_dict[labels[label_idx]] + else: + labels = label_dict[labels] + return labels + + if self.lazy: + + def generate_examples(): + generator = ( + self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename) + ) + for example in generator: + # We need to check if the example contains label column and confirm its name. + # For now we only allow `label` or `labels` to be the name of label column. + if "labels" in example.keys(): + label_col = "labels" + elif "label" in example.keys(): + label_col = "label" + else: + label_col = None + + # Convert class label to label ids. + if label_list is not None and example.get(label_col, None): + label_dict = _create_dict(label_list) + # For multiple labels in the form of list. 
+ if isinstance(label_dict, list): + for idx, sub_dict in enumerate(label_dict): + example[label_col][idx] = _convert_label_to_id(example[label_col][idx], sub_dict) + else: + example[label_col] = _convert_label_to_id(example[label_col], label_dict) + + yield example + else: + yield example + + return IterDataset(generate_examples(), label_list=label_list, vocab_info=vocab_info) + else: + examples = self._read(filename, split) if self._read.__code__.co_argcount > 2 else self._read(filename) + + # Then some validation. + if not isinstance(examples, list): + examples = list(examples) + + if not examples: + raise ValueError( + "No instances were read from the given filepath {}. " "Is the path correct?".format(filename) + ) + + # We need to check if the example contains label column and confirm its name. + # For now we only allow `label` or `labels` to be the name of label column. + if "labels" in examples[0].keys(): + label_col = "labels" + elif "label" in examples[0].keys(): + label_col = "label" + else: + label_col = None + + # Convert class label to label ids. + if label_list is not None and examples[0].get(label_col, None): + label_dict = _create_dict(label_list) + for idx in range(len(examples)): + # For multiple labels in the form of list. + if isinstance(label_dict, list): + for i, sub_dict in enumerate(label_dict): + examples[idx][label_col][i] = _convert_label_to_id(examples[idx][label_col][i], sub_dict) + else: + examples[idx][label_col] = _convert_label_to_id(examples[idx][label_col], label_dict) + + return MapDataset(examples, label_list=label_list, vocab_info=vocab_info) + + def _read(self, filename: str, *args): + """ + Reads examples from the given file_path and returns them as an + `Iterable` (which could be a list or a generator). + + This method must be implemented in self-defined `DatasetBuilder`. + """ + raise NotImplementedError + + def _get_data(self, mode: str): + """ + Downloads examples from the given URL and customized split + informations and returns a filepath. + + This method must be implemented in self-defined `DatasetBuilder`. + """ + raise NotImplementedError + + def get_labels(self): + """ + Returns list of class labels of the dataset if specified. + """ + return None + + def get_vocab(self): + """ + Returns vocab file path of the dataset if specified. + """ + return None + + +class SimpleBuilder(DatasetBuilder): + def __init__(self, lazy, read_func): + self._read = read_func + self.lazy = lazy + + def read(self, **kwargs): + if self.lazy: + + def generate_examples(): + generator = self._read(**kwargs) + for example in generator: + yield example + + return IterDataset(generate_examples) + else: + examples = self._read(**kwargs) + if hasattr(examples, "__len__") and hasattr(examples, "__getitem__"): + return MapDataset(examples) + else: + return MapDataset(list(examples)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd.py new file mode 100644 index 000000000..4c785078f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
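To make the `DatasetBuilder` contract above concrete (`_read`/`_get_data` mandatory, `get_labels`/`get_vocab` optional), a minimal self-defined builder might look like the sketch below. The class name, file paths and label set are invented for illustration; calling `read()` directly sidesteps the module registration that `load_dataset` relies on.

import os
from paddlenlp.datasets import DatasetBuilder  # import path assumed

class ToySentiment(DatasetBuilder):              # hypothetical builder
    SPLITS = {"train": "toy_train.tsv", "dev": "toy_dev.tsv"}

    def _get_data(self, mode):
        # No download step in this toy; just resolve a local path per split.
        return os.path.join("data", self.SPLITS[mode])

    def _read(self, filename, split):
        with open(filename, encoding="utf-8") as f:
            for line in f:
                text, label = line.rstrip("\n").split("\t")
                yield {"text": text, "label": label}

    def get_labels(self):
        # Because labels are declared, read() maps each "label" to its index.
        return ["negative", "positive"]

builder = ToySentiment(lazy=False)
train_ds = builder.read(filename=os.path.join("data", "toy_train.tsv"), split="train")
print(train_ds.label_list, train_ds[0])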
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DRCD"] + + +class DRCD(DatasetBuilder): + """ + Delta Reading Comprehension Dataset is an open domain traditional Chinese + machine reading comprehension (MRC) dataset. The dataset contains 10,014 + paragraphs from 2,108 Wikipedia articles and 30,000+ questions generated + by annotators. + """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("DRCD_training.json"), + "bbeefc8ad7585ea3e4fef8c677e7643e", + "https://bj.bcebos.com/paddlenlp/datasets/DRCD/DRCD_training.json", + ), + "dev": META_INFO( + os.path.join("DRCD_dev.json"), + "42c2f2bca84fc36cf65a86563b0540e6", + "https://bj.bcebos.com/paddlenlp/datasets/DRCD/DRCD_dev.json", + ), + "test": META_INFO( + os.path.join("DRCD_test.json"), + "e36a295c1cb8c6b9fb28015907a42d9e", + "https://bj.bcebos.com/paddlenlp/datasets/DRCD/DRCD_test.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"].strip() for answer in qa.get("answers", [])] + + yield { + "id": qas_id, + "title": title, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd_cn.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd_cn.py new file mode 100644 index 000000000..799d22333 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/drcd_cn.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
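Assuming the `drcd` module above ends up registered on the datasets search path, loading it would presumably flatten the nested SQuAD-style JSON into one record per question, matching the dict yielded by `DRCD._read()`:

from paddlenlp.datasets import load_dataset  # import path and registration assumed

train_ds, dev_ds = load_dataset("drcd", splits=("train", "dev"))

sample = train_ds[0]
# Keys produced by DRCD._read(): id, title, context, question, answers, answer_starts
print(sample["question"])
print(sample["answers"], sample["answer_starts"])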
+ +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DRCD_CN"] + + +class DRCD_CN(DatasetBuilder): + """ + Delta Reading Comprehension Dataset is an open domain traditional Chinese + machine reading comprehension (MRC) dataset. The dataset contains 10,014 + paragraphs from 2,108 Wikipedia articles and 30,000+ questions generated + by annotators. + + This dataset translate origin Traditional Chinese to Simplified Chinese. + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/drcd_cn.tar.gz" + MD5 = "8ceed5076c4f59d7a3666b13851e41fa" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("drcd_cn", "train.json"), "5a51ee5a106e16965c85fce364d316d7"), + "dev": META_INFO(os.path.join("drcd_cn", "dev.json"), "f352b17cddeed69877ff94d4321817ce"), + "test": META_INFO(os.path.join("drcd_cn", "test.json"), "e674a667033c4e8c9ae6d05d95073d02"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"].strip() for answer in qa.get("answers", [])] + + yield { + "id": qas_id, + "title": title, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/duconv.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/duconv.py new file mode 100644 index 000000000..315bd29ea --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/duconv.py @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DuConv"] + + +class DuConv(DatasetBuilder): + """ + Duconv is an dialogue dataset based on knowledge map released by Baidu. + Duconv contains two test sets, test_1 and test_2. And the test_1 contains + the response of the conversation but test_2 not. 
More information please + refer to `https://arxiv.org/abs/1503.02364`. + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/DuConv.tar.gz" + MD5 = "ef496871787f66718e567d62bd8f3546" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("DuConv", "train.txt"), "26192809b8740f620b95c9e18c65edf4"), + "dev": META_INFO(os.path.join("DuConv", "dev.txt"), "2e5ee6396b0467309cad75d37d6460b1"), + "test_1": META_INFO(os.path.join("DuConv", "test_1.txt"), "8ec83a72318d004691962647905cc345"), + "test_2": META_INFO(os.path.join("DuConv", "test_2.txt"), "e8d5f04a5d0a03ab110b1605d0a632ad"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as fin: + for line in fin: + example = json.loads(line.strip()) + yield example diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_checklist.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_checklist.py new file mode 100644 index 000000000..1867ae0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_checklist.py @@ -0,0 +1,94 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DuReaderChecklist"] + + +class DuReaderChecklist(DatasetBuilder): + """ + A high-quality Chinese machine reading comprehension dataset for real + application scenarios. It that focus on challenging the MRC models + from multiple aspects, including understanding of vocabulary, phrase, + semantic role, reasoning and so on. 
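Besides downloaded splits, `read_datasets()` above also accepts local files through `data_files`; with the DuConv builder that could look roughly like this (the builder name and the local file, which would need the same JSON-lines format, are assumptions):

from paddlenlp.datasets import load_dataset  # import path assumed

# One local file paired with one split name; read() is called on it directly.
dev_ds = load_dataset("duconv", data_files="local_duconv_dev.txt", splits="dev")
print(len(dev_ds), dev_ds[0].keys())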
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("dataset", "train.json"), + "28881033c067c690826a841d2d72a18a", + "https://bj.bcebos.com/paddlenlp/datasets/lic2021/dureader_checklist.dataset.tar.gz", + ), + "dev": META_INFO( + os.path.join("dataset", "dev.json"), + "28881033c067c690826a841d2d72a18a", + "https://bj.bcebos.com/paddlenlp/datasets/lic2021/dureader_checklist.dataset.tar.gz", + ), + "test1": META_INFO( + os.path.join("test1", "test1.json"), + "d7047ada5fb6734b4e58bfa198d47f6e", + "https://bj.bcebos.com/paddlenlp/datasets/lic2021/dureader_checklist.test1.tar.gz", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root, data_hash) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [] + answers = [] + is_impossible = False + qa_type = qa.get("type", "") + + if "is_impossible" in qa.keys(): + is_impossible = qa["is_impossible"] + + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"].strip() for answer in qa.get("answers", [])] + + yield { + "id": qas_id, + "title": title, + "type": qa_type, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + "is_impossible": is_impossible, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_qg.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_qg.py new file mode 100644 index 000000000..1eb451f82 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_qg.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DuReaderQG"] + + +class DuReaderQG(DatasetBuilder): + """ + This dataset is made form the machine reading comprehension dataset + (i.e. DuReader robust) for question generation task. 
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("train.json"), + "a6d96bda4662e657ce644ed0e178fe70", + "https://bj.bcebos.com/paddlenlp/datasets/DuReaderQG/train.json", + ), + "dev": META_INFO( + os.path.join("dev.json"), + "a6bd22b0da0ed8e20784398f507d4acc", + "https://bj.bcebos.com/paddlenlp/datasets/DuReaderQG/dev.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + for line in f: + line = line.strip() + if not line: + continue + + json_data = json.loads(line) + title = json_data.get("answer", None) + + yield { + "source": json_data["context"], + "target": json_data.get("question", ""), + "title": title, + "id": json_data["id"], + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_robust.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_robust.py new file mode 100644 index 000000000..ca13ef835 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_robust.py @@ -0,0 +1,73 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DuReaderRobust"] + + +class DuReaderRobust(DatasetBuilder): + """ + The machine reading comprehension dataset (i.e. DuReader robust) is designed + to measure the robustness of a reading comprehension model, including the + over-sensitivity, over-stability and generalization ability of the model. 
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/dureader_robust-data.tar.gz" + MD5 = "82f3d191a115ec17808856866787606e" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("dureader_robust-data", "train.json"), "800a3dcb742f9fdf9b11e0a83433d4be"), + "dev": META_INFO(os.path.join("dureader_robust-data", "dev.json"), "ae73cec081eaa28a735204c4898a2222"), + "test": META_INFO(os.path.join("dureader_robust-data", "test.json"), "e0e8aa5c7b6d11b6fc3935e29fc7746f"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"].strip() for answer in qa.get("answers", [])] + + yield { + "id": qas_id, + "title": title, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_yesno.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_yesno.py new file mode 100644 index 000000000..45172f708 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/dureader_yesno.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["DuReaderYesNo"] + + +class DuReaderYesNo(DatasetBuilder): + """ + DuReaderYesNo is a dataset with the judgment of opinion polarity as the + target task. Polarity of opinion is divided into three categories + {Yes, No, Depends}. 
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/dureader_yesno-data.tar.gz" + MD5 = "30c744d65e87fdce00cdc707fd008138" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("dureader_yesno-data", "train.json"), "c469a0ef3f975cfd705e3553ddb27cc1"), + "dev": META_INFO(os.path.join("dureader_yesno-data", "dev.json"), "c38544f8b5a7b567492314e3232057b5"), + "test": META_INFO(os.path.join("dureader_yesno-data", "test.json"), "1c7a1a3ea5b8992eeaeea017fdc2d55f"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + for entry in f: + source = json.loads(entry.strip()) + yield { + "id": source["id"], + "question": source["question"], + "answer": source["answer"], + "labels": source["yesno_answer"], + } + + def get_labels(self): + + return ["Yes", "No", "Depends"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/fewclue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/fewclue.py new file mode 100644 index 000000000..35f40a215 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/fewclue.py @@ -0,0 +1,336 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class FewCLUE(DatasetBuilder): + """ + FewCLUE: Few-shot learning for Chinese Language Understanding Evaluation + From: https://github.com/CLUEbenchmark/FewCLUE + + bustum: + XiaoBu Dialogue Short Text Matching + + chid: + Chinese IDiom Dataset for Cloze Test + + iflytek: + The Microsoft Research Paraphrase Corpus dataset. 
+ + tnews: + Toutiao Short Text Classificaiton for News + + eprstmt: + E-commerce Product Review Dataset for Sentiment Analysis + + ocnli: + Original Chinese Natural Language Inference + + csldcp: + The classification data set of Chinese science and Literature Discipline + + cluewsc: + WSC Winograd + csl: + Paper Keyword Recognition + """ + + BUILDER_CONFIGS = { + "bustm": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_bustm.tar.gz", + "md5": "206e037a88a57a8ca1ea157fdb756b14", + "splits": { + "train_0": [os.path.join("fewclue_bustm", "train_0.json"), "7d90d65c5305df064cbe0ea5f55be1eb"], + "train_1": [os.path.join("fewclue_bustm", "train_1.json"), "5e2ae6ce0129a39f14676d0b24090927"], + "train_2": [os.path.join("fewclue_bustm", "train_2.json"), "8c94f08f6f2cc93eaeb3f0cbc58aee2d"], + "train_3": [os.path.join("fewclue_bustm", "train_3.json"), "6bd32b4a15959ca037f7043e06a7663d"], + "train_4": [os.path.join("fewclue_bustm", "train_4.json"), "99a92cd924e1e6b4bd7c47d561fcbfee"], + "train_few_all": [ + os.path.join("fewclue_bustm", "train_few_all.json"), + "7415f826a59eea3e4b319c70f6182f21", + ], + "dev_0": [os.path.join("fewclue_bustm", "dev_0.json"), "703c85a4595304a707f7b7caa85974f4"], + "dev_1": [os.path.join("fewclue_bustm", "dev_1.json"), "b16aa8ef45c51956be768e8e2810db4e"], + "dev_2": [os.path.join("fewclue_bustm", "dev_2.json"), "c5483c83c882090314e76bb7dc1e7d5a"], + "dev_3": [os.path.join("fewclue_bustm", "dev_3.json"), "bfcfdf318f72ac40095a4b671c8b8ec5"], + "dev_4": [os.path.join("fewclue_bustm", "dev_4.json"), "ac061fedac0c360d08090a2e19addcae"], + "dev_few_all": [os.path.join("fewclue_bustm", "dev_few_all.json"), "678159abbff4a9704001190541a45000"], + "unlabeled": [os.path.join("fewclue_bustm", "unlabeled.json"), "8ebf2b2178ca6e9ad3aab09b86dfaafb"], + "test": [os.path.join("fewclue_bustm", "test.json"), "28363457614d6fbfdd0487c3451eb9d1"], + "test_public": [os.path.join("fewclue_bustm", "test_public.json"), "b805ad47d511d819bd723b1c63a1a2dc"], + }, + "labels": None, + }, + "chid": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_chid.tar.gz", + "md5": "31d209e1bda2703708f2a53da66ca6ef", + "splits": { + "train_0": [os.path.join("fewclue_chid", "train_0.json"), "9fe1b1e9c2174c34bf2470b2b27e0d12"], + "train_1": [os.path.join("fewclue_chid", "train_1.json"), "3a3971f28707250a65a3cbdeb7c40711"], + "train_2": [os.path.join("fewclue_chid", "train_2.json"), "ab65bd8ca1ad1a4d464f0fd50adb5e24"], + "train_3": [os.path.join("fewclue_chid", "train_3.json"), "5ac78bc3bf2dbfff754a997298abae54"], + "train_4": [os.path.join("fewclue_chid", "train_4.json"), "9c3ad59e850bc2133d45d3d57353ba2c"], + "train_few_all": [ + os.path.join("fewclue_chid", "train_few_all.json"), + "5d14b6e6aa7cbc77f0ea21d9bf36e740", + ], + "dev_0": [os.path.join("fewclue_chid", "dev_0.json"), "d50b501c0d80da404b09a3899feae907"], + "dev_1": [os.path.join("fewclue_chid", "dev_1.json"), "e00c8c98dd9d79f47fd38f012c80c23b"], + "dev_2": [os.path.join("fewclue_chid", "dev_2.json"), "283a68c62042f99740fc16d77d9df749"], + "dev_3": [os.path.join("fewclue_chid", "dev_3.json"), "09ddb889c668368ee5842ff1f6611817"], + "dev_4": [os.path.join("fewclue_chid", "dev_4.json"), "c4162fe8593fd91623c17abc7b0a0532"], + "dev_few_all": [os.path.join("fewclue_chid", "dev_few_all.json"), "6e0d456dc6d103f0db677cda3b607e20"], + "unlabeled": [os.path.join("fewclue_chid", "unlabeled.json"), "e4772b7600b348e9ff2245cef6a00812"], + "test": [os.path.join("fewclue_chid", "test.json"), "bf46b7a643b51f64dd890e3fcae8802a"], + 
"test_public": [os.path.join("fewclue_chid", "test_public.json"), "c8c3765c4319e370f752b601b9f2fb80"], + }, + "labels": None, + }, + "iflytek": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_iflytek.tar.gz", + "md5": "6f60fd6e0ab35c934732e41b7b7489b7", + "splits": { + "train_0": [os.path.join("fewclue_iflytek", "train_0.json"), "43e5f8ab327ae5f446fc0cfd97b6341d"], + "train_1": [os.path.join("fewclue_iflytek", "train_1.json"), "b3c04b6eec6f82e53f2a913b2487974a"], + "train_2": [os.path.join("fewclue_iflytek", "train_2.json"), "a4fdb0055ef1cb5543fef932a88092d0"], + "train_3": [os.path.join("fewclue_iflytek", "train_3.json"), "b8626c171555afb8e25d78b32cc2cfb1"], + "train_4": [os.path.join("fewclue_iflytek", "train_4.json"), "91dde0c9c939a3bc7768b105427cb3ef"], + "train_few_all": [ + os.path.join("fewclue_iflytek", "train_few_all.json"), + "db4ceaf7e6682be02f4a9e9138fcda8c", + ], + "dev_0": [os.path.join("fewclue_iflytek", "dev_0.json"), "0703cb79c0c4fcb120c2cdeea2c56a6c"], + "dev_1": [os.path.join("fewclue_iflytek", "dev_1.json"), "a4b975f7ee524e1479d2067118fe15f5"], + "dev_2": [os.path.join("fewclue_iflytek", "dev_2.json"), "c0280a2675012bea323a36eb28ba2ecc"], + "dev_3": [os.path.join("fewclue_iflytek", "dev_3.json"), "ffdd7073ae25e40a8fa2c95f50f71c1f"], + "dev_4": [os.path.join("fewclue_iflytek", "dev_4.json"), "9e9a93fe76653ab7ee587b67061930ac"], + "dev_few_all": [ + os.path.join("fewclue_iflytek", "dev_few_all.json"), + "86ec5c85c126e8e91efc274e79c39752", + ], + "unlabeled": [os.path.join("fewclue_iflytek", "unlabeled.json"), "431e0c787373b25f877e2c7b2fc91f91"], + "test": [os.path.join("fewclue_iflytek", "test.json"), "ea764519ddb4369767d07664afde3325"], + "test_public": [ + os.path.join("fewclue_iflytek", "test_public.json"), + "b8ec7c77457baa842666f6e6620ab8fd", + ], + }, + "labels": None, + }, + "tnews": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_tnews.tar.gz", + "md5": "c1682c753e504fdba28328c0c9298e84", + "splits": { + "train_0": [os.path.join("fewclue_tnews", "train_0.json"), "e540cbcbf224e9c2e8c1297abab37d1d"], + "train_1": [os.path.join("fewclue_tnews", "train_1.json"), "019bb64e35371f6093451a8e7c720d02"], + "train_2": [os.path.join("fewclue_tnews", "train_2.json"), "9403d45f1b65fdbea38503e842e0e915"], + "train_3": [os.path.join("fewclue_tnews", "train_3.json"), "2f05be9b4f4c3b4fb468864f092005ac"], + "train_4": [os.path.join("fewclue_tnews", "train_4.json"), "ced405a502292f84f305214191cbd8d0"], + "train_few_all": [ + os.path.join("fewclue_tnews", "train_few_all.json"), + "274340c49822c9cf06286bd74744cad4", + ], + "dev_0": [os.path.join("fewclue_tnews", "dev_0.json"), "ee20628d0d544869f9cc5442658602e4"], + "dev_1": [os.path.join("fewclue_tnews", "dev_1.json"), "15bd699553c8742f5d15909bf0aecddb"], + "dev_2": [os.path.join("fewclue_tnews", "dev_2.json"), "f8493a1e89d9a1e915700f0a46dda861"], + "dev_3": [os.path.join("fewclue_tnews", "dev_3.json"), "8948af6083f5d69ccbd1c6a9f2cc9ea6"], + "dev_4": [os.path.join("fewclue_tnews", "dev_4.json"), "508790da261bfd83beffcc64fef3aa66"], + "dev_few_all": [os.path.join("fewclue_tnews", "dev_few_all.json"), "9b079af311d8ccfb9938eb3f11b27ea7"], + "unlabeled": [os.path.join("fewclue_tnews", "unlabeled.json"), "6ce9e45f56521fd80e32980ef73fa7b7"], + "test": [os.path.join("fewclue_tnews", "test.json"), "d21791d746cd0035eaeeef9b3b9f9487"], + "test_public": [os.path.join("fewclue_tnews", "test_public.json"), "5539e4a3f0abc2aa4f84da04bf02ca0d"], + }, + "labels": None, + }, + "eprstmt": { + "url": 
"https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_eprstmt.tar.gz", + "md5": "016091564b689fd36f52eab5e1e5407c", + "splits": { + "train_0": [os.path.join("fewclue_eprstmt", "train_0.json"), "d027ef9d3a19b4939c6bab3013397f16"], + "train_1": [os.path.join("fewclue_eprstmt", "train_1.json"), "aa70803b42143c648e127f5091c89512"], + "train_2": [os.path.join("fewclue_eprstmt", "train_2.json"), "acafc32e7c241300b943fd2557c6aacf"], + "train_3": [os.path.join("fewclue_eprstmt", "train_3.json"), "1cabd524e83259037f2192d978a7a32b"], + "train_4": [os.path.join("fewclue_eprstmt", "train_4.json"), "8648c607f00da8f2235e744a86f44c8f"], + "train_few_all": [ + os.path.join("fewclue_eprstmt", "train_few_all.json"), + "72e4f19448bfb3b01229c3cd94d4e3e7", + ], + "dev_0": [os.path.join("fewclue_eprstmt", "dev_0.json"), "b6aab58bc487ad6174118d8ccf87a9e1"], + "dev_1": [os.path.join("fewclue_eprstmt", "dev_1.json"), "41a18a4b4d0c567c6568ff4577dbec0a"], + "dev_2": [os.path.join("fewclue_eprstmt", "dev_2.json"), "618590661a58ea660cabff917cc41044"], + "dev_3": [os.path.join("fewclue_eprstmt", "dev_3.json"), "18274080ad1d6612582f89065c1f19af"], + "dev_4": [os.path.join("fewclue_eprstmt", "dev_4.json"), "d5d8017e3838b6184e648696fe65fbb3"], + "dev_few_all": [ + os.path.join("fewclue_eprstmt", "dev_few_all.json"), + "9cbda31b17f3adcb32ea89b020209806", + ], + "unlabeled": [os.path.join("fewclue_eprstmt", "unlabeled.json"), "e8802dad5889d7cc8f085f7d39aeb33b"], + "test": [os.path.join("fewclue_eprstmt", "test.json"), "05282edba3283a791167d0ce0343d182"], + "test_public": [ + os.path.join("fewclue_eprstmt", "test_public.json"), + "704c551bc35d7fb2e4548637b11dabec", + ], + }, + "labels": None, + }, + "ocnli": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_ocnli.tar.gz", + "md5": "a49a160987d67d26e217b98edeee44a9", + "splits": { + "train_0": [os.path.join("fewclue_ocnli", "train_0.json"), "45a9a144919efde95aa53dc8b8ba9748"], + "train_1": [os.path.join("fewclue_ocnli", "train_1.json"), "a63b358e1b9e3ecf833a174d65713e11"], + "train_2": [os.path.join("fewclue_ocnli", "train_2.json"), "7882feb198022fe3cb6338f3652a5216"], + "train_3": [os.path.join("fewclue_ocnli", "train_3.json"), "0c6321202ca1fca9843259e6b1e83f5b"], + "train_4": [os.path.join("fewclue_ocnli", "train_4.json"), "f0c272e4a846b9f2483d70314a2fdff4"], + "train_few_all": [ + os.path.join("fewclue_ocnli", "train_few_all.json"), + "f6d9b9198884d3a27249b346933661b6", + ], + "dev_0": [os.path.join("fewclue_ocnli", "dev_0.json"), "99f4dff1afabe4eb6808cc3e5bc5f422"], + "dev_1": [os.path.join("fewclue_ocnli", "dev_1.json"), "4f3b1d87ebf082ef71d29e76d9aaf909"], + "dev_2": [os.path.join("fewclue_ocnli", "dev_2.json"), "4c3c103f663a84f5c4fc04ee6aef98fb"], + "dev_3": [os.path.join("fewclue_ocnli", "dev_3.json"), "73687b04ae00f8750981ed3f86ef0baa"], + "dev_4": [os.path.join("fewclue_ocnli", "dev_4.json"), "b029f7b3f6d4681f4416fa2bc146e227"], + "dev_few_all": [os.path.join("fewclue_ocnli", "dev_few_all.json"), "f0235528abf52543c0fdec7f27dd70ae"], + "unlabeled": [os.path.join("fewclue_ocnli", "unlabeled.json"), "3db8319afb94780d04bfc7dff57efe81"], + "test": [os.path.join("fewclue_ocnli", "test.json"), "a82e69d8372ef99537c64aacba10dd4b"], + "test_public": [os.path.join("fewclue_ocnli", "test_public.json"), "ce8229a27a6948a63a3492d6acd6ee1f"], + }, + "labels": None, + }, + "csldcp": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_csldcp.tar.gz", + "md5": "5ce33afe9b4b8104e028e04a97e70d5c", + "splits": { + "train_0": 
[os.path.join("fewclue_csldcp", "train_0.json"), "ca5fc102bcbd5820743ef08ef415acfb"], + "train_1": [os.path.join("fewclue_csldcp", "train_1.json"), "ddfeab5c1c0b7051f3d8863b5145c0b6"], + "train_2": [os.path.join("fewclue_csldcp", "train_2.json"), "67fefbbabb063247108623ed9cb8bb90"], + "train_3": [os.path.join("fewclue_csldcp", "train_3.json"), "eebc7bc760422dd8ff8eefd5de39995b"], + "train_4": [os.path.join("fewclue_csldcp", "train_4.json"), "82ad233a803fd0e6ec4d9245299c3389"], + "train_few_all": [ + os.path.join("fewclue_csldcp", "train_few_all.json"), + "3576c8413a9c77e20360296996f1217c", + ], + "dev_0": [os.path.join("fewclue_csldcp", "dev_0.json"), "24e6b62a23dda83ab2aa4d63b64d9306"], + "dev_1": [os.path.join("fewclue_csldcp", "dev_1.json"), "73f4439696f1c447c04ad2ea873fb603"], + "dev_2": [os.path.join("fewclue_csldcp", "dev_2.json"), "7f12d47d173c4beb77c4995a1409ad61"], + "dev_3": [os.path.join("fewclue_csldcp", "dev_3.json"), "35936d8347dd3d727050004cb871e686"], + "dev_4": [os.path.join("fewclue_csldcp", "dev_4.json"), "2fe45b969c8c33298c53c7415be9fc40"], + "dev_few_all": [ + os.path.join("fewclue_csldcp", "dev_few_all.json"), + "17078e738790997cf0fe50ebe0568b8e", + ], + "unlabeled": [os.path.join("fewclue_csldcp", "unlabeled.json"), "e8802dad5889d7cc8f085f7d39aeb33b"], + "test": [os.path.join("fewclue_csldcp", "test.json"), "8e4c1680a30da48979f684edd4d175f2"], + "test_public": [ + os.path.join("fewclue_csldcp", "test_public.json"), + "695058c4e6dc5e823be772963974c965", + ], + }, + "labels": None, + }, + "cluewsc": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_cluewsc.tar.gz", + "md5": "328e60d2ac14aaa6ecf255a9546e538d", + "splits": { + "train_0": [os.path.join("fewclue_cluewsc", "train_0.json"), "623085e169c6515a05cae6b52f2c5a2c"], + "train_1": [os.path.join("fewclue_cluewsc", "train_1.json"), "b30acf58e613ee21cd2d6fb4833e2763"], + "train_2": [os.path.join("fewclue_cluewsc", "train_2.json"), "ef0840acb8b61d22f1da7e94ecd7a309"], + "train_3": [os.path.join("fewclue_cluewsc", "train_3.json"), "7e6e15afab20ae488256278fa84468b5"], + "train_4": [os.path.join("fewclue_cluewsc", "train_4.json"), "5f21307270e83d3ea7d1e833db3dc514"], + "train_few_all": [ + os.path.join("fewclue_cluewsc", "train_few_all.json"), + "0f875905c77747007e6e722e27e069f9", + ], + "dev_0": [os.path.join("fewclue_cluewsc", "dev_0.json"), "d52f7e97197af8782319be2946226b0f"], + "dev_1": [os.path.join("fewclue_cluewsc", "dev_1.json"), "d602d73dd7cc4f5e421fa0fd1deccc00"], + "dev_2": [os.path.join("fewclue_cluewsc", "dev_2.json"), "405bab04b2fdd00f4e23492ae24233ac"], + "dev_3": [os.path.join("fewclue_cluewsc", "dev_3.json"), "6896cee55db9539687ac788430319c53"], + "dev_4": [os.path.join("fewclue_cluewsc", "dev_4.json"), "a171a69d92408ce19449ddc4d629534e"], + "dev_few_all": [ + os.path.join("fewclue_cluewsc", "dev_few_all.json"), + "9d5e5066758ac6ff24534b13dd2ed1ba", + ], + # Note: FewCLUE cluewsc unlabeled.json() is an empty file. 
+ # https://github.com/CLUEbenchmark/FewCLUE/blob/main/datasets/cluewsc/unlabeled.json + # 'unlabeled': [ + # os.path.join('fewclue_cluewsc', 'unlabeled.json'), + # 'd41d8cd98f00b204e9800998ecf8427e' + # ], + "test": [os.path.join("fewclue_cluewsc", "test.json"), "0e9e8ffd8ee90ddf1f58d6dc2e02de7b"], + "test_public": [ + os.path.join("fewclue_cluewsc", "test_public.json"), + "027bc101f000b632ef45ed6d86907527", + ], + }, + "labels": None, + }, + "csl": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/FewCLUE/fewclue_csl.tar.gz", + "md5": "434f3bad2958bba763506e9af8bf0419", + "splits": { + "train_0": [os.path.join("fewclue_csl", "train_0.json"), "d93bf9fcef2d5839819a7c1a695d38cb"], + "train_1": [os.path.join("fewclue_csl", "train_1.json"), "c5d1ce67e0c9081a160e0a0c790bc6af"], + "train_2": [os.path.join("fewclue_csl", "train_2.json"), "9fe5568b97e990e68770f00ce1ecd9bf"], + "train_3": [os.path.join("fewclue_csl", "train_3.json"), "e45acff15ae461bdf4001dd9f87ac413"], + "train_4": [os.path.join("fewclue_csl", "train_4.json"), "db87a6229793584e3ae1cbdb173de9db"], + "train_few_all": [ + os.path.join("fewclue_csl", "train_few_all.json"), + "4b8882f1cfbdb0556b990b378ae7671e", + ], + "dev_0": [os.path.join("fewclue_csl", "dev_0.json"), "5ef6c4cce5cd8b313bd21dd2232bbdf2"], + "dev_1": [os.path.join("fewclue_csl", "dev_1.json"), "cbc3dbc4ed06bfe8bc9a9c25fdf98693"], + "dev_2": [os.path.join("fewclue_csl", "dev_2.json"), "581f5db0e79beb0e8a5f43db52fc1ff3"], + "dev_3": [os.path.join("fewclue_csl", "dev_3.json"), "cd3d99f6edf1ae20624b0b7aea1eeeba"], + "dev_4": [os.path.join("fewclue_csl", "dev_4.json"), "765bd899bad409812e6090330fc1be13"], + "dev_few_all": [os.path.join("fewclue_csl", "dev_few_all.json"), "2d4c44445a25bb61a48261cabea97e51"], + "unlabeled": [os.path.join("fewclue_csl", "unlabeled.json"), "2582af170971ab780d5650e75842e40c"], + "test": [os.path.join("fewclue_csl", "test.json"), "d34119b97113000988f1e03f92eb2dfe"], + "test_public": [os.path.join("fewclue_csl", "test_public.json"), "45a97013acfe94c887cf85e6ff540456"], + }, + "labels": None, + }, + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + + filename, data_hash = builder_config["splits"][mode] + + fullname = os.path.join(default_root, filename) + + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + return fullname + + def _read(self, filename, split): + with open(filename, "r", encoding="utf-8") as f: + for line in f: + yield json.loads(line.rstrip()) + + def get_labels(self): + """ + Return labels of the FewCLUE task. + """ + return self.BUILDER_CONFIGS[self.name]["labels"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/glue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/glue.py new file mode 100644 index 000000000..1ec8b0b8a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/glue.py @@ -0,0 +1,288 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
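FewCLUE above is a multi-config builder, so `name` has to match a key of `BUILDER_CONFIGS` and `splits` must come from that entry's split table (the k-fold names like train_0/dev_0); otherwise `load_dataset` raises the ValueError shown earlier. A sketch, with module registration assumed:

from paddlenlp.datasets import load_dataset  # import path assumed

train_0, dev_0, test_public = load_dataset(
    "fewclue", name="tnews", splits=("train_0", "dev_0", "test_public")
)
print(len(train_0), train_0[0])
# BUILDER_CONFIGS["tnews"]["labels"] is None, so read() performs no label-to-id mapping.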
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class Glue(DatasetBuilder): + """ + The General Language Understanding Evaluation (GLUE) benchmark is a collection + of resources for training, evaluating, and analyzing natural language + understanding systems. + From https://gluebenchmark.com/tasks + + CoLA: + The Corpus of Linguistic Acceptability (Warstadt et al., 2018) consists of + English acceptability judgments drawn from books and journal articles on + linguistic theory. + Each example is a sequence of words annotated with whether it is a + grammatical English sentence. + + SST2: + The Stanford Sentiment Treebank (Socher et al., 2013) consists of sentences + from movie reviews and human annotations of their sentiment. + + MRPC: + The Microsoft Research Paraphrase Corpus dataset. + + STSB: + The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a + collection of sentence pairs drawn from news headlines, video and image + captions, and natural language inference data. Each pair is human-annotated + with a similarity score from 1 to 5. + + QQP: + The Quora Question Pairs dataset is a collection of question pairs from the + community question-answering website Quora. + + MNLI: + The Multi-Genre Natural Language Inference Corpus (Williams et al., 2018) + is a crowdsourced collection of sentence pairs with textual entailment + annotations. + + QNLI: + The Question-answering NLI dataset converted from Stanford Question + Answering Dataset (Rajpurkar et al. 2016). + + RTE: + The Recognizing Textual Entailment (RTE) datasets come from a series of + annual textual entailment challenges (RTE1, RTE2, RTE3, and RTE5). + + WNLI: + The Winograd NLI dataset converted from the dataset in Winograd Schema + Challenge (Levesque et al., 2011). 
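The GLUE builder works the same way, with `name` selecting one of the sub-task configs listed below; whether `label_list` is populated depends on that config's `labels` entry (the reading logic itself falls outside this hunk, so treat the sketch as an assumption):

from paddlenlp.datasets import load_dataset  # import path assumed

train_ds, dev_ds = load_dataset("glue", name="sst-2", splits=("train", "dev"))
print(train_ds.label_list)   # presumably ["0", "1"] from BUILDER_CONFIGS["sst-2"]["labels"]
print(train_ds[0])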
+ """ + + BUILDER_CONFIGS = { + "cola": { + "url": "https://bj.bcebos.com/dataset/glue/CoLA.zip", + "md5": "b178a7c2f397b0433c39c7caf50a3543", + "splits": { + "train": [os.path.join("CoLA", "train.tsv"), "c79d4693b8681800338aa044bf9e797b", (3, 1), 0], + "dev": [os.path.join("CoLA", "dev.tsv"), "c5475ccefc9e7ca0917294b8bbda783c", (3, 1), 0], + "test": [os.path.join("CoLA", "test.tsv"), "d8721b7dedda0dcca73cebb2a9f4259f", (1,), 1], + }, + "labels": ["0", "1"], + }, + "sst-2": { + "url": "https://bj.bcebos.com/dataset/glue/SST.zip", + "md5": "9f81648d4199384278b86e315dac217c", + "splits": { + "train": [os.path.join("SST-2", "train.tsv"), "da409a0a939379ed32a470bc0f7fe99a", (0, 1), 1], + "dev": [os.path.join("SST-2", "dev.tsv"), "268856b487b2a31a28c0a93daaff7288", (0, 1), 1], + "test": [os.path.join("SST-2", "test.tsv"), "3230e4efec76488b87877a56ae49675a", (1,), 1], + }, + "labels": ["0", "1"], + }, + "sts-b": { + "url": "https://bj.bcebos.com/dataset/glue/STS.zip", + "md5": "d573676be38f1a075a5702b90ceab3de", + "splits": { + "train": [os.path.join("STS-B", "train.tsv"), "4f7a86dde15fe4832c18e5b970998672", (7, 8, 9), 1], + "dev": [os.path.join("STS-B", "dev.tsv"), "5f4d6b0d2a5f268b1b56db773ab2f1fe", (7, 8, 9), 1], + "test": [os.path.join("STS-B", "test.tsv"), "339b5817e414d19d9bb5f593dd94249c", (7, 8), 1], + }, + "labels": None, + }, + "qqp": { + "url": "https://dataset.bj.bcebos.com/glue/QQP.zip", + "md5": "884bf26e39c783d757acc510a2a516ef", + "splits": { + "train": [os.path.join("QQP", "train.tsv"), "e003db73d277d38bbd83a2ef15beb442", (3, 4, 5), 1], + "dev": [os.path.join("QQP", "dev.tsv"), "cff6a448d1580132367c22fc449ec214", (3, 4, 5), 1], + "test": [os.path.join("QQP", "test.tsv"), "73de726db186b1b08f071364b2bb96d0", (1, 2), 1], + }, + "labels": ["0", "1"], + }, + "mnli": { + "url": "https://bj.bcebos.com/dataset/glue/MNLI.zip", + "md5": "e343b4bdf53f927436d0792203b9b9ff", + "splits": { + "train": [os.path.join("MNLI", "train.tsv"), "220192295e23b6705f3545168272c740", (8, 9, 11), 1], + "dev_matched": [ + os.path.join("MNLI", "dev_matched.tsv"), + "c3fa2817007f4cdf1a03663611a8ad23", + (8, 9, 15), + 1, + ], + "dev_mismatched": [ + os.path.join("MNLI", "dev_mismatched.tsv"), + "b219e6fe74e4aa779e2f417ffe713053", + (8, 9, 15), + 1, + ], + "test_matched": [ + os.path.join("MNLI", "test_matched.tsv"), + "33ea0389aedda8a43dabc9b3579684d9", + (8, 9), + 1, + ], + "test_mismatched": [ + os.path.join("MNLI", "test_mismatched.tsv"), + "7d2f60a73d54f30d8a65e474b615aeb6", + (8, 9), + 1, + ], + }, + "labels": ["contradiction", "entailment", "neutral"], + }, + "qnli": { + "url": "https://bj.bcebos.com/dataset/glue/QNLI.zip", + "md5": "b4efd6554440de1712e9b54e14760e82", + "splits": { + "train": [os.path.join("QNLI", "train.tsv"), "5e6063f407b08d1f7c7074d049ace94a", (1, 2, 3), 1], + "dev": [os.path.join("QNLI", "dev.tsv"), "1e81e211959605f144ba6c0ad7dc948b", (1, 2, 3), 1], + "test": [os.path.join("QNLI", "test.tsv"), "f2a29f83f3fe1a9c049777822b7fa8b0", (1, 2), 1], + }, + "labels": ["entailment", "not_entailment"], + }, + "rte": { + "url": "https://bj.bcebos.com/dataset/glue/RTE.zip", + "md5": "bef554d0cafd4ab6743488101c638539", + "splits": { + "train": [os.path.join("RTE", "train.tsv"), "d2844f558d111a16503144bb37a8165f", (1, 2, 3), 1], + "dev": [os.path.join("RTE", "dev.tsv"), "973cb4178d4534cf745a01c309d4a66c", (1, 2, 3), 1], + "test": [os.path.join("RTE", "test.tsv"), "6041008f3f3e48704f57ce1b88ad2e74", (1, 2), 1], + }, + "labels": ["entailment", "not_entailment"], + }, + "wnli": { + "url": 
"https://bj.bcebos.com/dataset/glue/WNLI.zip", + "md5": "a1b4bd2861017d302d29e42139657a42", + "splits": { + "train": [os.path.join("WNLI", "train.tsv"), "5cdc5a87b7be0c87a6363fa6a5481fc1", (1, 2, 3), 1], + "dev": [os.path.join("WNLI", "dev.tsv"), "a79a6dd5d71287bcad6824c892e517ee", (1, 2, 3), 1], + "test": [os.path.join("WNLI", "test.tsv"), "a18789ba4f60f6fdc8cb4237e4ba24b5", (1, 2), 1], + }, + "labels": ["0", "1"], + }, + "mrpc": { + "url": { + "train_data": "https://bj.bcebos.com/dataset/glue/mrpc/msr_paraphrase_train.txt", + "dev_id": "https://bj.bcebos.com/dataset/glue/mrpc/dev_ids.tsv", + "test_data": "https://bj.bcebos.com/dataset/glue/mrpc/msr_paraphrase_test.txt", + }, + "md5": { + "train_data": "793daf7b6224281e75fe61c1f80afe35", + "dev_id": "7ab59a1b04bd7cb773f98a0717106c9b", + "test_data": "e437fdddb92535b820fe8852e2df8a49", + }, + "splits": { + "train": [os.path.join("MRPC", "train.tsv"), "dc2dac669a113866a6480a0b10cd50bf", (3, 4, 0), 1], + "dev": [os.path.join("MRPC", "dev.tsv"), "185958e46ba556b38c6a7cc63f3a2135", (3, 4, 0), 1], + "test": [os.path.join("MRPC", "test.tsv"), "4825dab4b4832f81455719660b608de5", (3, 4), 1], + }, + "labels": ["0", "1"], + }, + } + + def _get_data(self, mode, **kwargs): + builder_config = self.BUILDER_CONFIGS[self.name] + if self.name != "mrpc": + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, _, _ = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(builder_config["url"], default_root, builder_config["md5"]) + + else: + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, _, _ = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + if mode in ("train", "dev"): + dev_id_path = get_path_from_url( + builder_config["url"]["dev_id"], + os.path.join(default_root, "MRPC"), + builder_config["md5"]["dev_id"], + ) + train_data_path = get_path_from_url( + builder_config["url"]["train_data"], + os.path.join(default_root, "MRPC"), + builder_config["md5"]["train_data"], + ) + # read dev data ids + dev_ids = [] + print(dev_id_path) + with open(dev_id_path, encoding="utf-8") as ids_fh: + for row in ids_fh: + dev_ids.append(row.strip().split("\t")) + + # generate train and dev set + train_path = os.path.join(default_root, "MRPC", "train.tsv") + dev_path = os.path.join(default_root, "MRPC", "dev.tsv") + with open(train_data_path, encoding="utf-8") as data_fh: + with open(train_path, "w", encoding="utf-8") as train_fh: + with open(dev_path, "w", encoding="utf8") as dev_fh: + header = data_fh.readline() + train_fh.write(header) + dev_fh.write(header) + for row in data_fh: + label, id1, id2, s1, s2 = row.strip().split("\t") + example = "%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2) + if [id1, id2] in dev_ids: + dev_fh.write(example) + else: + train_fh.write(example) + + else: + test_data_path = get_path_from_url( + builder_config["url"]["test_data"], + os.path.join(default_root, "MRPC"), + builder_config["md5"]["test_data"], + ) + test_path = os.path.join(default_root, "MRPC", "test.tsv") + with open(test_data_path, encoding="utf-8") as data_fh: + with open(test_path, "w", encoding="utf-8") as test_fh: + header = data_fh.readline() + test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") + for idx, row in enumerate(data_fh): + label, 
id1, id2, s1, s2 = row.strip().split("\t") + test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) + + return fullname + + def _read(self, filename, split): + _, _, field_indices, num_discard_samples = self.BUILDER_CONFIGS[self.name]["splits"][split] + with open(filename, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if idx < num_discard_samples: + continue + line_stripped = line.strip().split("\t") + if not line_stripped: + continue + example = [line_stripped[indice] for indice in field_indices] + if self.name in ["cola", "sst-2"]: + yield {"sentence": example[0]} if "test" in split else { + "sentence": example[0], + "labels": example[-1], + } + else: + yield {"sentence1": example[0], "sentence2": example[1]} if "test" in split else { + "sentence1": example[0], + "sentence2": example[1], + "labels": example[-1], + } + + def get_labels(self): + """ + Returns labels of the Glue task. + """ + return self.BUILDER_CONFIGS[self.name]["labels"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/chnsenticorp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/chnsenticorp.py new file mode 100644 index 000000000..519e2f579 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/chnsenticorp.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
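+
+# Usage sketch (illustrative; assumes a `datasets` release that still supports
+# loading local dataset scripts, and a hypothetical local path):
+#
+#     from datasets import load_dataset
+#     ds = load_dataset("path/to/chnsenticorp.py", "chnsenticorp", split="train")
+#     print(ds[0])  # {"id": 0, "text": "...", "label": 0 or 1}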
+
+# Lint as: python3
+"""ChnSentiCorp: Chinese Corpus for sentence-level sentiment classification."""
+
+import csv
+import os
+
+import datasets
+
+logger = datasets.logging.get_logger(__name__)
+
+_CITATION = """\
+@article{tan2008empirical,
+  title={An empirical study of sentiment analysis for chinese documents},
+  author={Tan, Songbo and Zhang, Jin},
+  journal={Expert Systems with applications},
+  volume={34},
+  number={4},
+  pages={2622--2629},
+  year={2008},
+  publisher={Elsevier}
+}
+"""
+
+_DESCRIPTION = """\
+ChnSentiCorp: A classic sentence-level sentiment classification dataset, which includes hotel, laptop and data-related online review data, including positive and negative categories.
+For more information, refer to https://www.luge.ai/#/luge/dataDetail?id=25.
+"""
+
+_URL = "https://bj.bcebos.com/paddlenlp/datasets/ChnSentiCorp.zip"
+
+
+class ChnSentiCorpConfig(datasets.BuilderConfig):
+    """BuilderConfig for ChnSentiCorp."""
+
+    def __init__(self, **kwargs):
+        """BuilderConfig for ChnSentiCorp.
+
+        Args:
+            **kwargs: keyword arguments forwarded to super.
+        """
+        super(ChnSentiCorpConfig, self).__init__(**kwargs)
+
+
+class ChnSentiCorp(datasets.GeneratorBasedBuilder):
+    """ChnSentiCorp: Chinese Corpus for sentence-level sentiment classification."""
+
+    BUILDER_CONFIGS = [
+        ChnSentiCorpConfig(
+            name="chnsenticorp",
+            version=datasets.Version("1.0.0", ""),
+            description="ChnSentiCorp sentence-level sentiment classification dataset.",
+        )
+    ]
+
+    def _info(self):
+        features = {"id": datasets.Value("int32"), "text": datasets.Value("string"), "label": datasets.Value("int32")}
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(features),
+            homepage="https://www.luge.ai/#/luge/dataDetail?id=25",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        downloaded_dir = dl_manager.download_and_extract(_URL)
+        data_dir = os.path.join(downloaded_dir, "ChnSentiCorp")
+
+        train_split = datasets.SplitGenerator(
+            name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "train.tsv"), "split": "train"}
+        )
+
+        dev_split = datasets.SplitGenerator(
+            name=datasets.Split.VALIDATION, gen_kwargs={"filepath": os.path.join(data_dir, "dev.tsv"), "split": "dev"}
+        )
+
+        test_split = datasets.SplitGenerator(
+            name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "test.tsv"), "split": "test"}
+        )
+
+        return [train_split, dev_split, test_split]
+
+    def _generate_examples(self, filepath, split):
+        """This function returns the examples in the raw (text) form."""
+        logger.info("generating examples from = %s", filepath)
+
+        with open(filepath, encoding="utf8") as f:
+            reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
+
+            for idx, row in enumerate(reader):
+                example = {}
+                example["id"] = idx
+                example["text"] = row["text_a"]
+
+                if split != "test":
+                    example["label"] = int(row["label"])
+                else:
+                    example["label"] = -1
+
+                # Filter out corrupted rows.
+                for value in example.values():
+                    if value is None:
+                        break
+                else:
+                    yield idx, example
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/clue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/clue.py
new file mode 100644
index 000000000..87f60392a
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/clue.py
@@ -0,0 +1,552 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""A Chinese Language Understanding Evaluation Benchmark (CLUE) benchmark.""" + +import json +import os +import re +import textwrap + +import datasets + +_CLUE_CITATION = """\ +@misc{xu2020clue, + title={CLUE: A Chinese Language Understanding Evaluation Benchmark}, + author={Liang Xu and Xuanwei Zhang and Lu Li and Hai Hu and Chenjie Cao and Weitang Liu and Junyi Li and Yudong Li and Kai Sun and Yechen Xu and Yiming Cui and Cong Yu and Qianqian Dong and Yin Tian and Dian Yu and Bo Shi and Jun Zeng and Rongzhao Wang and Weijian Xie and Yanting Li and Yina Patterson and Zuoyu Tian and Yiwen Zhang and He Zhou and Shaoweihua Liu and Qipeng Zhao and Cong Yue and Xinrui Zhang and Zhengliang Yang and Zhenzhong Lan}, + year={2020}, + eprint={2004.05986}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + +_CLUE_DESCRIPTION = """\ +CLUE, A Chinese Language Understanding Evaluation Benchmark +(https://www.cluebenchmarks.com/) is a collection of resources for training, +evaluating, and analyzing Chinese language understanding systems. + +""" + + +class ClueConfig(datasets.BuilderConfig): + """BuilderConfig for CLUE.""" + + def __init__( + self, + data_url, + text_features=None, + label_column=None, + data_dir="", + citation="", + url="", + label_classes=None, + process_label=lambda x: x, + **kwargs, + ): + """BuilderConfig for CLUE. + + Args: + text_features: `dict[string, string]`, map from the name of the feature + dict for each text field to the name of the column in the tsv file + label_column: `string`, name of the column in the tsv file corresponding + to the label + data_url: `string`, url to download the zip file from + data_dir: `string`, the path to the folder containing the tsv files in the + downloaded zip + citation: `string`, citation for the data set + url: `string`, url for information about the data set + label_classes: `list[string]`, the list of classes if the label is + categorical. If not provided, then the label will be of type + `datasets.Value('float32')`. + process_label: `Function[string, any]`, function taking in the raw value + of the label and processing it to the form required by the label feature + **kwargs: keyword arguments forwarded to super. + """ + super(ClueConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs) + self.text_features = text_features + self.label_column = label_column + self.label_classes = label_classes + self.data_url = data_url + self.data_dir = data_dir + self.citation = citation + self.url = url + self.process_label = process_label + + +class Clue(datasets.GeneratorBasedBuilder): + """A Chinese Language Understanding Evaluation Benchmark (CLUE) benchmark.""" + + BUILDER_CONFIGS = [ + ClueConfig( + name="afqmc", + description=textwrap.dedent( + """\ + Ant Financial Question Matching Corpus is a dataset for Chinese + question matching (similar to QQP). 
+ """ + ), + text_features={"sentence1": "sentence1", "sentence2": "sentence2"}, + label_classes=["0", "1"], + label_column="label", + data_url="https://bj.bcebos.com/paddlenlp/datasets/afqmc_public.zip", + url="https://dc.cloud.alipay.com/index#/topic/data?id=8", + ), + ClueConfig( + name="tnews", + description=textwrap.dedent( + """\ + Toutiao Short Text Classification for News is a dataset for Chinese + short news classification. + """ + ), + text_features={"sentence": "sentence"}, + label_classes=[ + "100", + "101", + "102", + "103", + "104", + "106", + "107", + "108", + "109", + "110", + "112", + "113", + "114", + "115", + "116", + ], + label_column="label", + data_url="https://bj.bcebos.com/paddlenlp/datasets/tnews_public.zip", + url="https://github.com/skdjfla/toutiao-text-classfication-dataset", + ), + ClueConfig( + name="iflytek", + description=textwrap.dedent( + """\ + IFLYTEK Long Text Classification for News is a dataset for Chinese + long text classification. The text is crawled from an app market. + """ + ), + text_features={"sentence": "sentence"}, + label_classes=[str(label) for label in range(119)], + label_column="label", + data_url="https://bj.bcebos.com/paddlenlp/datasets/iflytek_public.zip", + ), + ClueConfig( + name="cmnli", + description=textwrap.dedent( + """\ + Chinese Multi-Genre NLI is a dataset for Chinese Natural Language + Inference. It consists of XNLI (Chinese subset) and translated MNLI. + """ + ), + text_features={"sentence1": "sentence1", "sentence2": "sentence2"}, + label_classes=["neutral", "entailment", "contradiction"], + label_column="label", + data_url="https://bj.bcebos.com/paddlenlp/datasets/cmnli_public.zip", + data_dir="cmnli_public", + ), + ClueConfig( + name="cluewsc2020", + description=textwrap.dedent( + """\ + CLUE Winograd Scheme Challenge (CLUEWSC 2020) is a Chinese WSC dataset. + The text is from contemporary literature and annotated by human experts. + The task is to determine which noun the pronoun in the sentence refers to. + The question appears in the form of true and false discrimination. + """ + ), + text_features={"text": "text", "target": "target"}, + label_classes=["false", "true"], + label_column="label", + data_url="https://bj.bcebos.com/paddlenlp/datasets/cluewsc2020_public.zip", + ), + ClueConfig( + name="csl", + description=textwrap.dedent( + """\ + Chinese Scientific Literature Dataset (CSL) is taken from the abstracts of + Chinese papers and their keywords. The papers are selected from some core + journals of Chinese social sciences and natural sciences. TF-IDF is used to + generate a mixture of fake keywords and real keywords in the paper to construct + abstract-keyword pairs. The task goal is to judge whether the keywords are + all real keywords based on the abstract. + """ + ), + text_features={"abst": "abst", "keyword": "keyword", "corpus_id": "id"}, + label_classes=["0", "1"], + label_column="label", + data_url="https://bj.bcebos.com/paddlenlp/datasets/csl_public.zip", + url="https://github.com/P01son6415/CSL", + ), + ClueConfig( + name="cmrc2018", + description=textwrap.dedent( + """\ + CMRC2018 is the first Chinese Span-Extraction Machine Reading Comprehension + Dataset. The task requires to set up a system that reads context, + question and extract the answer from the context (the answer is a continuous + span in the context). 
+ """ + ), + data_url="https://paddlenlp.bj.bcebos.com/datasets/cmrc2018_public.zip", + url="https://hfl-rc.github.io/cmrc2018/", + citation=textwrap.dedent( + """\ + @article{cmrc2018-dataset, + title={A Span-Extraction Dataset for Chinese Machine Reading Comprehension}, + author={Cui, Yiming and Liu, Ting and Xiao, Li and Chen, Zhipeng and Ma, Wentao and Che, Wanxiang and Wang, Shijin and Hu, Guoping}, + journal={arXiv preprint arXiv:1810.07366}, + year={2018} + }""" + ), + ), + ClueConfig( + name="drcd", + description=textwrap.dedent( + """\ + Delta Reading Comprehension Dataset (DRCD) belongs to the general field of traditional + Chinese machine reading comprehension data set. This data set is expected to become a + standard Chinese reading comprehension data set suitable for transfer learning. + """ + ), + data_url="https://paddlenlp.bj.bcebos.com/datasets/drcd_public.zip", + url="https://github.com/DRCKnowledgeTeam/DRCD", + ), + ClueConfig( + name="chid", + description=textwrap.dedent( + """\ + Chinese IDiom Dataset for Cloze Test (CHID) contains many masked idioms in the text. + The candidates contain similar idioms to the real ones. + """ + ), + text_features={"candidates": "candidates", "content": "content"}, + data_url="https://paddlenlp.bj.bcebos.com/datasets/chid_public.zip", + url="https://arxiv.org/abs/1906.01265", + citation=textwrap.dedent( + """\ + @article{Zheng_2019, + title={ChID: A Large-scale Chinese IDiom Dataset for Cloze Test}, + url={http://dx.doi.org/10.18653/v1/P19-1075}, + DOI={10.18653/v1/p19-1075}, + journal={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, + publisher={Association for Computational Linguistics}, + author={Zheng, Chujie and Huang, Minlie and Sun, Aixin}, + year={2019} + }""" + ), + ), + ClueConfig( + name="c3", + description=textwrap.dedent( + """\ + Multiple-Choice Chinese Machine Reading Comprehension (C3, or C^3) is a Chinese + multi-choice reading comprehension data set, including mixed type data sets + such as dialogue and long text. Both the training and validation sets are + the concatenation of the dialogue and long-text subsets. + """ + ), + text_features={"candidates": "candidates", "content": "content"}, + data_url="https://paddlenlp.bj.bcebos.com/datasets/c3_public.zip", + url="https://arxiv.org/abs/1904.09679", + citation=textwrap.dedent( + """\ + @article{sun2020investigating, + author = {Kai Sun and + Dian Yu and + Dong Yu and + Claire Cardie}, + title = {Investigating Prior Knowledge for Challenging Chinese Machine Reading + Comprehension}, + journal = {Trans. Assoc. Comput. Linguistics}, + volume = {8}, + pages = {141--155}, + year = {2020}, + url = {https://transacl.org/ojs/index.php/tacl/article/view/1882} + }""" + ), + ), + ClueConfig( + name="ocnli", + description=textwrap.dedent( + """\ + OCNLI stands for Original Chinese Natural Language Inference. It is a corpus for + Chinese Natural Language Inference, collected following closely the procedures of MNLI, + but with enhanced strategies aiming for more challenging inference pairs. We want to + emphasize we did not use human/machine translation in creating the dataset, and thus + our Chinese texts are original and not translated. 
+ """ + ), + text_features={"sentence1": "sentence1", "sentence2": "sentence2"}, + label_classes=["neutral", "entailment", "contradiction"], + label_column="label", + data_url="https://paddlenlp.bj.bcebos.com/datasets/OCNLI-02d55cb3c7dc984682677b8dd81db6a1e4710720.zip", + data_dir="OCNLI-02d55cb3c7dc984682677b8dd81db6a1e4710720/data/ocnli", + url="https://arxiv.org/abs/2010.05444", + citation=textwrap.dedent( + """\ + @inproceedings{ocnli, + title={OCNLI: Original Chinese Natural Language Inference}, + author={Hai Hu and Kyle Richardson and Liang Xu and Lu Li and Sandra Kuebler and Larry Moss}, + booktitle={Findings of EMNLP}, + year={2020}, + url={https://arxiv.org/abs/2010.05444} + }""" + ), + ), + ClueConfig( + name="diagnostics", + description=textwrap.dedent( + """\ + Diagnostic set, used to evaluate the performance of different models on 9 Chinese language + phenomena summarized by linguists. + + Use the model trained on CMNLI to directly predict the result on this diagnostic set. + """ + ), + text_features={"sentence1": "premise", "sentence2": "hypothesis"}, + label_classes=["neutral", "entailment", "contradiction"], + label_column="label", + data_url="https://paddlenlp.bj.bcebos.com/datasets/clue_diagnostics_public.zip", + ), + ] + + def _info(self): + if self.config.name in ["afqmc", "tnews", "iflytek", "cmnli", "diagnostics", "ocnli"]: + features = {text_feature: datasets.Value("string") for text_feature in self.config.text_features.keys()} + if self.config.label_classes: + features["label"] = datasets.features.ClassLabel(names=self.config.label_classes) + else: + features["label"] = datasets.Value("float32") + features["idx"] = datasets.Value("int32") + elif self.config.name == "cluewsc2020": + features = { + "idx": datasets.Value("int32"), + "text": datasets.Value("string"), + "label": datasets.features.ClassLabel(names=["true", "false"]), + "target": { + "span1_text": datasets.Value("string"), + "span2_text": datasets.Value("string"), + "span1_index": datasets.Value("int32"), + "span2_index": datasets.Value("int32"), + }, + } + elif self.config.name == "csl": + features = { + "idx": datasets.Value("int32"), + "corpus_id": datasets.Value("int32"), + "abst": datasets.Value("string"), + "label": datasets.features.ClassLabel(names=self.config.label_classes), + "keyword": datasets.Sequence(datasets.Value("string")), + } + elif self.config.name in ["cmrc2018", "drcd"]: + features = { + "id": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + } + elif self.config.name == "chid": + features = { + "idx": datasets.Value("int32"), + "candidates": datasets.Sequence(datasets.Value("string")), + "content": datasets.Sequence(datasets.Value("string")), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "candidate_id": datasets.Value("int32"), + } + ), + } + elif self.config.name == "c3": + features = { + "id": datasets.Value("int32"), + "context": datasets.Sequence(datasets.Value("string")), + "question": datasets.Value("string"), + "choice": datasets.Sequence(datasets.Value("string")), + "answer": datasets.Value("string"), + } + else: + raise NotImplementedError( + "This task is not implemented. If you believe" + " this task was recently added to the CLUE benchmark, " + "please open a GitHub issue and we will add it." 
+ ) + + return datasets.DatasetInfo( + description=_CLUE_DESCRIPTION, + features=datasets.Features(features), + homepage=self.config.url, + citation=self.config.citation + "\n" + _CLUE_CITATION, + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(self.config.data_url) + data_dir = os.path.join(dl_dir, self.config.data_dir) + + if self.config.name in {"chid", "c3"}: + test_file = "test1.1.json" + elif self.config.name == "diagnostics": + test_file = "diagnostics_test.json" + else: + test_file = "test.json" + + test_split = datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": os.path.join(data_dir, test_file), + "split": "test", + }, + ) + + split_list = [test_split] + + if self.config.name != "diagnostics": + train_split = datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_file": os.path.join( + data_dir or "", "train.json" if self.config.name != "c3" else "d-train.json" + ), + "split": "train", + }, + ) + val_split = datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_file": os.path.join( + data_dir or "", "dev.json" if self.config.name != "c3" else "d-dev.json" + ), + "split": "dev", + }, + ) + split_list += [train_split, val_split] + + if self.config.name == "cmrc2018": + split_list.append( + datasets.SplitGenerator( + name=datasets.Split("trial"), + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "trial.json"), + "split": "trial", + }, + ) + ) + + return split_list + + def _generate_examples(self, data_file, split): + process_label = self.config.process_label + label_classes = self.config.label_classes + + if self.config.name == "chid" and split != "test": + answer_file = os.path.join(os.path.dirname(data_file), f"{split}_answer.json") + answer_dict = json.load(open(answer_file, encoding="utf8")) + + if self.config.name == "c3": + if split == "test": + files = [data_file] + else: + data_dir = os.path.dirname(data_file) + files = [os.path.join(data_dir, f"{typ}-{split}.json") for typ in ["d", "m"]] + data = [] + for f in files: + data_subset = json.load(open(f, encoding="utf8")) + data += data_subset + for idx, entry in enumerate(data): + for qidx, question in enumerate(entry[1]): + example = { + "id": idx if split != "test" else int(question["id"]), + "context": entry[0], + "question": question["question"], + "choice": question["choice"], + "answer": question["answer"] if split != "test" else "", + } + yield f"{idx}_{qidx}", example + + else: + with open(data_file, encoding="utf8") as f: + if self.config.name in ["cmrc2018", "drcd"]: + data = json.load(f) + for example in data["data"]: + for paragraph in example["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + question = qa["question"].strip() + id_ = qa["id"] + + answer_starts = [answer["answer_start"] for answer in qa["answers"]] + answers = [answer["text"].strip() for answer in qa["answers"]] + + yield id_, { + "context": context, + "question": question, + "id": id_, + "answers": { + "answer_start": answer_starts, + "text": answers, + }, + } + + else: + for n, line in enumerate(f): + row = json.loads(line) + example = {feat: row[col] for feat, col in self.config.text_features.items()} + example["idx"] = n if self.config.name != "diagnostics" else int(row["index"]) + if self.config.name == "chid": # CHID has a separate gold label file + contents = example["content"] + candidates = example["candidates"] + idiom_list = [] + if split != "test": + for content in 
contents: + idioms = re.findall(r"#idiom\d+#", content) + for idiom in idioms: + idiom_list.append( + { + "candidate_id": answer_dict[idiom], + "text": candidates[answer_dict[idiom]], + } + ) + example["answers"] = idiom_list + + elif self.config.label_column in row: + label = row[self.config.label_column] + # Notice: some labels in CMNLI and OCNLI are invalid. We drop these data. + if self.config.name in ["cmnli", "ocnli"] and label == "-": + continue + # For some tasks, the label is represented as 0 and 1 in the tsv + # files and needs to be cast to integer to work with the feature. + if label_classes and label not in label_classes: + label = int(label) if label else None + example["label"] = process_label(label) + else: + example["label"] = process_label(-1) + + # Filter out corrupted rows. + for value in example.values(): + if value is None: + break + else: + yield example["idx"], example diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cmrc2018.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cmrc2018.py new file mode 100644 index 000000000..0af9d936c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cmrc2018.py @@ -0,0 +1,135 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TODO(cmrc2018): Add a description here.""" + +import json + +import datasets +from datasets.tasks import QuestionAnsweringExtractive + +# TODO(cmrc2018): BibTeX citation +_CITATION = """\ +@inproceedings{cui-emnlp2019-cmrc2018, + title = {A Span-Extraction Dataset for {C}hinese Machine Reading Comprehension}, + author = {Cui, Yiming and + Liu, Ting and + Che, Wanxiang and + Xiao, Li and + Chen, Zhipeng and + Ma, Wentao and + Wang, Shijin and + Hu, Guoping}, + booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, + month = {nov}, + year = {2019}, + address = {Hong Kong, China}, + publisher = {Association for Computational Linguistics}, + url = {https://www.aclweb.org/anthology/D19-1600}, + doi = {10.18653/v1/D19-1600}, + pages = {5886--5891}} +""" + +# TODO(cmrc2018): +_DESCRIPTION = """\ +A Span-Extraction dataset for Chinese machine reading comprehension to add language +diversities in this area. The dataset is composed by near 20,000 real questions annotated +on Wikipedia paragraphs by human experts. We also annotated a challenge set which +contains the questions that need comprehensive understanding and multi-sentence +inference throughout the context. 
+""" +_URL = "https://github.com/ymcui/cmrc2018" +_TRAIN_FILE = "https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_train.json" +_DEV_FILE = "https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_dev.json" +_TEST_FILE = "https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_trial.json" + + +class Cmrc2018(datasets.GeneratorBasedBuilder): + """TODO(cmrc2018): Short description of my dataset.""" + + # TODO(cmrc2018): Set up version. + VERSION = datasets.Version("0.1.0") + + def _info(self): + # TODO(cmrc2018): Specifies the datasets.DatasetInfo object + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # datasets.features.FeatureConnectors + features=datasets.Features( + { + "id": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + # These are the features of your dataset like images, labels ... + } + ), + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage=_URL, + citation=_CITATION, + task_templates=[ + QuestionAnsweringExtractive( + question_column="question", context_column="context", answers_column="answers" + ) + ], + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + # TODO(cmrc2018): Downloads the data and defines the splits + # dl_manager is a datasets.download.DownloadManager that can be used to + # download and extract URLs + urls_to_download = {"train": _TRAIN_FILE, "dev": _DEV_FILE, "test": _TEST_FILE} + downloaded_files = dl_manager.download_and_extract(urls_to_download) + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}), + ] + + def _generate_examples(self, filepath): + """Yields examples.""" + # TODO(cmrc2018): Yields (key, example) tuples from the dataset + with open(filepath, encoding="utf-8") as f: + data = json.load(f) + for example in data["data"]: + for paragraph in example["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + question = qa["question"].strip() + id_ = qa["id"] + + answer_starts = [answer["answer_start"] for answer in qa["answers"]] + answers = [answer["text"].strip() for answer in qa["answers"]] + + yield id_, { + "context": context, + "question": question, + "id": id_, + "answers": { + "answer_start": answer_starts, + "text": answers, + }, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cnn_dailymail.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cnn_dailymail.py new file mode 100644 index 000000000..605f94769 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cnn_dailymail.py @@ -0,0 +1,276 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""CNN/DailyMail Summarization dataset, non-anonymized version.""" + +import hashlib +import os + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_DESCRIPTION = """\ +CNN/DailyMail non-anonymized summarization dataset. + +There are two features: + - article: text of news article, used as the document to be summarized + - highlights: joined text of highlights with and around each + highlight, which is the target summary +""" + +# The second citation introduces the source data, while the first +# introduces the specific form (non-anonymized) we use here. +_CITATION = """\ +@article{DBLP:journals/corr/SeeLM17, + author = {Abigail See and + Peter J. Liu and + Christopher D. Manning}, + title = {Get To The Point: Summarization with Pointer-Generator Networks}, + journal = {CoRR}, + volume = {abs/1704.04368}, + year = {2017}, + url = {http://arxiv.org/abs/1704.04368}, + archivePrefix = {arXiv}, + eprint = {1704.04368}, + timestamp = {Mon, 13 Aug 2018 16:46:08 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/SeeLM17}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@inproceedings{hermann2015teaching, + title={Teaching machines to read and comprehend}, + author={Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil}, + booktitle={Advances in neural information processing systems}, + pages={1693--1701}, + year={2015} +} +""" + +_DL_URLS = { + # pylint: disable=line-too-long + "cnn_stories": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/cnn_stories.tgz", + "dm_stories": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/dailymail_stories.tgz", + "test_urls": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_test.txt", + "train_urls": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_train.txt", + "val_urls": "https://bj.bcebos.com/paddlenlp/datasets/cnn_dailymail/all_val.txt", + # pylint: enable=line-too-long +} + +_HIGHLIGHTS = "highlights" +_ARTICLE = "article" + +_SUPPORTED_VERSIONS = [ + # Using cased version. + datasets.Version("3.0.0", "Using cased version."), + # Same data as 0.0.2 + datasets.Version("1.0.0", ""), + # Having the model predict newline separators makes it easier to evaluate + # using summary-level ROUGE. + datasets.Version("2.0.0", "Separate target sentences with newline."), +] + +_DEFAULT_VERSION = datasets.Version("3.0.0", "Using cased version.") + + +class CnnDailymailConfig(datasets.BuilderConfig): + """BuilderConfig for CnnDailymail.""" + + def __init__(self, **kwargs): + """BuilderConfig for CnnDailymail. + + Args: + + **kwargs: keyword arguments forwarded to super. 
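+
+        Example (an illustrative sketch, not upstream documentation; it assumes
+        a `datasets` release that can still load local dataset scripts, a
+        hypothetical local path, and that the version-named configs defined
+        below are unchanged):
+
+            from datasets import load_dataset
+            ds = load_dataset("path/to/cnn_dailymail.py", "3.0.0", split="validation")
+            print(ds[0]["highlights"])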
+ """ + super(CnnDailymailConfig, self).__init__(**kwargs) + + +def _get_url_hashes(path): + """Get hashes of urls in file.""" + urls = _read_text_file(path) + + def url_hash(u): + h = hashlib.sha1() + try: + u = u.encode("utf-8") + except UnicodeDecodeError: + logger.error("Cannot hash url: %s", u) + h.update(u) + return h.hexdigest() + + return {url_hash(u): True for u in urls} + + +def _get_hash_from_path(p): + """Extract hash from path.""" + basename = os.path.basename(p) + return basename[0 : basename.find(".story")] + + +def _find_files(dl_paths, publisher, url_dict): + """Find files corresponding to urls.""" + if publisher == "cnn": + top_dir = os.path.join(dl_paths["cnn_stories"], "cnn", "stories") + elif publisher == "dm": + top_dir = os.path.join(dl_paths["dm_stories"], "dailymail", "stories") + else: + logger.fatal("Unsupported publisher: %s", publisher) + files = sorted(os.listdir(top_dir)) + + ret_files = [] + for p in files: + if _get_hash_from_path(p) in url_dict: + ret_files.append(os.path.join(top_dir, p)) + return ret_files + + +def _subset_filenames(dl_paths, split): + """Get filenames for a particular split.""" + assert isinstance(dl_paths, dict), dl_paths + # Get filenames for a split. + if split == datasets.Split.TRAIN: + urls = _get_url_hashes(dl_paths["train_urls"]) + elif split == datasets.Split.VALIDATION: + urls = _get_url_hashes(dl_paths["val_urls"]) + elif split == datasets.Split.TEST: + urls = _get_url_hashes(dl_paths["test_urls"]) + else: + logger.fatal("Unsupported split: %s", split) + cnn = _find_files(dl_paths, "cnn", urls) + dm = _find_files(dl_paths, "dm", urls) + return cnn + dm + + +DM_SINGLE_CLOSE_QUOTE = "\u2019" # unicode +DM_DOUBLE_CLOSE_QUOTE = "\u201d" +# acceptable ways to end a sentence +END_TOKENS = [".", "!", "?", "...", "'", "`", '"', DM_SINGLE_CLOSE_QUOTE, DM_DOUBLE_CLOSE_QUOTE, ")"] + + +def _read_text_file(text_file): + lines = [] + with open(text_file, "r", encoding="utf-8") as f: + for line in f: + lines.append(line.strip()) + return lines + + +def _get_art_abs(story_file, tfds_version): + """Get abstract (highlights) and article from a story file path.""" + # Based on https://github.com/abisee/cnn-dailymail/blob/master/ + # make_datafiles.py + + lines = _read_text_file(story_file) + + # The github code lowercase the text and we removed it in 3.0.0. + + # Put periods on the ends of lines that are missing them + # (this is a problem in the dataset because many image captions don't end in + # periods; consequently they end up in the body of the article as run-on + # sentences) + def fix_missing_period(line): + """Adds a period to a line that is missing a period.""" + if "@highlight" in line: + return line + if not line: + return line + if line[-1] in END_TOKENS: + return line + return line + " ." 
+ + lines = [fix_missing_period(line) for line in lines] + + # Separate out article and abstract sentences + article_lines = [] + highlights = [] + next_is_highlight = False + for line in lines: + if not line: + continue # empty line + elif line.startswith("@highlight"): + next_is_highlight = True + elif next_is_highlight: + highlights.append(line) + else: + article_lines.append(line) + + # Make article into a single string + article = " ".join(article_lines) + + if tfds_version >= "2.0.0": + abstract = "\n".join(highlights) + else: + abstract = " ".join(highlights) + + return article, abstract + + +class CnnDailymail(datasets.GeneratorBasedBuilder): + """CNN/DailyMail non-anonymized summarization dataset.""" + + BUILDER_CONFIGS = [ + CnnDailymailConfig(name=str(version), description="Plain text", version=version) + for version in _SUPPORTED_VERSIONS + ] + + def _info(self): + # Should return a datasets.DatasetInfo object + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + _ARTICLE: datasets.Value("string"), + _HIGHLIGHTS: datasets.Value("string"), + "id": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage="https://github.com/abisee/cnn-dailymail", + citation=_CITATION, + ) + + def _vocab_text_gen(self, paths): + for _, ex in self._generate_examples(paths): + yield " ".join([ex[_ARTICLE], ex[_HIGHLIGHTS]]) + + def _split_generators(self, dl_manager): + dl_paths = dl_manager.download_and_extract(_DL_URLS) + train_files = _subset_filenames(dl_paths, datasets.Split.TRAIN) + # Generate shared vocabulary + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": train_files}), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"files": _subset_filenames(dl_paths, datasets.Split.VALIDATION)}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"files": _subset_filenames(dl_paths, datasets.Split.TEST)} + ), + ] + + def _generate_examples(self, files): + for p in files: + article, highlights = _get_art_abs(p, self.config.version) + if not article or not highlights: + continue + fname = os.path.basename(p) + yield fname, { + _ARTICLE: article, + _HIGHLIGHTS: highlights, + "id": _get_hash_from_path(fname), + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cote.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cote.py new file mode 100644 index 000000000..c60421a27 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/cote.py @@ -0,0 +1,143 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""COTE: Chinese Opinion Target Extraction.""" + +import csv +import os + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@inproceedings{li2018character, + title={Character-based bilstm-crf incorporating pos and dictionaries for chinese opinion target extraction}, + author={Li, Yanzeng and Liu, Tingwen and Li, Diying and Li, Quangang and Shi, Jinqiao and Wang, Yanqiu}, + booktitle={Asian Conference on Machine Learning}, + pages={518--533}, + year={2018}, + organization={PMLR} +} +""" + +_DESCRIPTION = """\ +COTE, a dataset for Opinion target extraction (OTE) for sentiment analysis, which aims to extract target of a given text. This dataset covers data crawled on Baidu, Dianping, and Mafengwo. +More information refer to https://www.luge.ai/#/luge/dataDetail?id=19. +""" + +_COTE_URLs = { + # pylint: disable=line-too-long + "bd": "https://paddlenlp.bj.bcebos.com/datasets/COTE-BD.zip", + "mfw": "https://paddlenlp.bj.bcebos.com/datasets/COTE-MFW.zip", + "dp": "https://paddlenlp.bj.bcebos.com/datasets/COTE-DP.zip", + # pylint: enable=line-too-long +} + + +class COTEConfig(datasets.BuilderConfig): + """BuilderConfig for COTE.""" + + def __init__(self, data_url=None, data_dir=None, **kwargs): + """BuilderConfig for COTE. + + Args: + data_url: `string`, url to download the zip file. + data_dir: `string`, the path to the folder containing the tsv files in the downloaded zip. + **kwargs: keyword arguments forwarded to super. + """ + super(COTEConfig, self).__init__(**kwargs) + self.data_url = data_url + self.data_dir = data_dir + + +class COTE(datasets.GeneratorBasedBuilder): + """COTE: Chinese Opinion Target Extraction.""" + + BUILDER_CONFIGS = [ + COTEConfig( + name="bd", + data_url=_COTE_URLs["bd"], + data_dir="COTE-BD", + version=datasets.Version("1.0.0", ""), + description="COTE-BD crawled on baidu.", + ), + COTEConfig( + name="mfw", + data_url=_COTE_URLs["mfw"], + data_dir="COTE-MFW", + version=datasets.Version("1.0.0", ""), + description="COTE-MFW crawled on Mafengwo.", + ), + COTEConfig( + name="dp", + data_url=_COTE_URLs["dp"], + data_dir="COTE-DP", + version=datasets.Version("1.0.0", ""), + description="COTE-DP crawled on Dianping.", + ), + ] + + def _info(self): + features = { + "id": datasets.Value("int32"), + "text_a": datasets.Value("string"), + "label": datasets.Value("string"), + } + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features(features), + homepage="https://www.luge.ai/#/luge/dataDetail?id=19", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + downloaded_dir = dl_manager.download_and_extract(self.config.data_url) + data_dir = os.path.join(downloaded_dir, self.config.data_dir) + + train_split = datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "train.tsv"), "split": "train"} + ) + test_split = datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "test.tsv"), "split": "test"} + ) + + return [train_split, test_split] + + def _generate_examples(self, filepath, split): + """This function returns the examples in the raw (text) form.""" + logger.info("generating examples from = %s", filepath) + + with open(filepath, encoding="utf8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + + for idx, row in enumerate(reader): + example = {} + example["id"] = idx + example["text_a"] = row["text_a"] + + if split == "train": + example["label"] = 
row["label"] + else: + example["label"] = "" + + # Filter out corrupted rows. + for value in example.values(): + if value is None: + break + else: + yield idx, example diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/docvqa_zh.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/docvqa_zh.py new file mode 100644 index 000000000..43f877d0b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/docvqa_zh.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 + +import os +import json +import hashlib + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_DESCRIPTION = """\ +The training set from the competition of Insurance DocVQA organized by China Pacific Insurance. \ +The submission is now closed so we split original dataset into three parts for model evluation. \ +There are 4,187 training images, 500 validation images, and 500 test images. +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/docvqa_zh.tar.gz" + + +def _get_md5(string): + """Get md5 value for string""" + hl = hashlib.md5() + hl.update(string.encode(encoding="utf-8")) + return hl.hexdigest() + + +class DocVQAZhConfig(datasets.BuilderConfig): + """funsd dataset config""" + + target_size: int = 1000 + max_size: int = 1000 + + def __init__(self, **kwargs): + + super(DocVQAZhConfig, self).__init__(**kwargs) + + +class DocVQAZh(datasets.GeneratorBasedBuilder): + """funsd dataset builder""" + + BUILDER_CONFIGS = [ + DocVQAZhConfig( + name="docvqa_zh", + version=datasets.Version("1.0.0", ""), + description="Plain text", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "name": datasets.Value("string"), + "page_no": datasets.Value("int32"), + "text": datasets.features.Sequence(datasets.Value("string")), + "bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_id": datasets.features.Sequence(datasets.Value("int32")), + "image": datasets.Value("string"), + "width": datasets.Value("int32"), + "height": datasets.Value("int32"), + "md5sum": datasets.Value("string"), + "qas": datasets.features.Sequence( + { + "question_id": datasets.Value("int32"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + "answer_end": datasets.Value("int32"), + } + ), + } + ), + } + ), + supervised_keys=None, + homepage="http://ailab.aiwin.org.cn/competitions/49", + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(_URL) + + return [ + 
datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(dl_dir, "docvqa_zh", "train.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(dl_dir, "docvqa_zh", "dev.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(dl_dir, "docvqa_zh", "test.json")}, + ), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("Generating examples from = {}".format(filepath)) + idx = 0 + with open(filepath, "r") as fin: + for line in fin: + data = json.loads(line) + if "page_no" not in data: + data["page_no"] = 0 + for item in data["qas"]: + if "question_id" not in item: + item["question_id"] = -1 + data["md5sum"] = _get_md5(data["image"]) + yield idx, data + idx += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/duconv.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/duconv.py new file mode 100644 index 000000000..be8f7db60 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/duconv.py @@ -0,0 +1,126 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 + +import json +import os + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_DESCRIPTION = """\ +Duconv is a chinese conversation \ +dataset, designed to evaluate the dialogue models. +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/DuConv.zip" + + +class DuconvConfig(datasets.BuilderConfig): + """BuilderConfig for Duconv.""" + + def __init__(self, **kwargs): + """BuilderConfig for Duconv. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(DuconvConfig, self).__init__(**kwargs) + + +class Duconv(datasets.GeneratorBasedBuilder): + BUILDER_CONFIGS = [ + DuconvConfig( + name="DuConv", + version=datasets.Version("1.0.0", ""), + description=_DESCRIPTION, + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "goal": datasets.Sequence(datasets.Sequence(datasets.Value("string"))), + "knowledge": datasets.Sequence(datasets.Sequence(datasets.Value("string"))), + "conversation": datasets.Sequence(datasets.Value("string")), + "history": datasets.Sequence(datasets.Value("string")), + "response": datasets.Value("string"), + } + ), + # No default supervised_keys (as we have to pass both question + # and context as input). 
+ supervised_keys=None, + homepage="https://arxiv.org/pdf/1906.05572.pdf", + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name="train", + gen_kwargs={ + "filepath": os.path.join(dl_dir, "DuConv", "train.txt"), + }, + ), + datasets.SplitGenerator( + name="dev", + gen_kwargs={ + "filepath": os.path.join(dl_dir, "DuConv", "dev.txt"), + }, + ), + datasets.SplitGenerator( + name="test_1", + gen_kwargs={ + "filepath": os.path.join(dl_dir, "DuConv", "test_1.txt"), + }, + ), + datasets.SplitGenerator( + name="test_2", + gen_kwargs={ + "filepath": os.path.join(dl_dir, "DuConv", "test_2.txt"), + }, + ), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("generating examples from = %s", filepath) + key = 0 + with open(filepath, "r", encoding="utf-8") as fin: + for line in fin: + duconv = json.loads(line) + + goal = duconv["goal"] if "goal" in duconv.keys() else [[]] + knowledge = duconv["knowledge"] if "knowledge" in duconv.keys() else [[]] + conversation = duconv["conversation"] if "conversation" in duconv.keys() else [] + history = duconv["history"] if "history" in duconv.keys() else [] + response = duconv["response"] if "response" in duconv.keys() else "" + + yield key, { + "id": str(key), + "goal": goal, + "knowledge": knowledge, + "conversation": conversation, + "history": history, + "response": response, + } + key += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/dureader_robust.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/dureader_robust.py new file mode 100644 index 000000000..a1f8ee350 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/dureader_robust.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 + +import json +import os + +import datasets +from datasets.tasks import QuestionAnsweringExtractive + +logger = datasets.logging.get_logger(__name__) + +_DESCRIPTION = """\ +DureaderRobust is a chinese reading comprehension \ +dataset, designed to evaluate the MRC models from \ +three aspects: over-sensitivity, over-stability \ +and generalization. +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/dureader_robust-data.tar.gz" + + +class DureaderRobustConfig(datasets.BuilderConfig): + """BuilderConfig for DureaderRobust.""" + + def __init__(self, **kwargs): + """BuilderConfig for DureaderRobust. + + Args: + **kwargs: keyword arguments forwarded to super. 
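The repeated `duconv["goal"] if "goal" in duconv.keys() else [[]]` pattern in `Duconv._generate_examples` above is the long form of `dict.get`. A standalone sketch on a made-up record, shown only to illustrate the defaulting behaviour:

import json

line = '{"goal": [["START", "topic_a", "topic_b"]], "response": "hello"}'  # made-up record
duconv = json.loads(line)
example = {
    "goal": duconv.get("goal", [[]]),
    "knowledge": duconv.get("knowledge", [[]]),
    "conversation": duconv.get("conversation", []),
    "history": duconv.get("history", []),
    "response": duconv.get("response", ""),
}
print(example)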
+ """ + super(DureaderRobustConfig, self).__init__(**kwargs) + + +class DureaderRobust(datasets.GeneratorBasedBuilder): + BUILDER_CONFIGS = [ + DureaderRobustConfig( + name="plain_text", + version=datasets.Version("1.0.0", ""), + description="Plain text", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "title": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + } + ), + # No default supervised_keys (as we have to pass both question + # and context as input). + supervised_keys=None, + homepage="https://arxiv.org/abs/2004.11142", + task_templates=[ + QuestionAnsweringExtractive( + question_column="question", context_column="context", answers_column="answers" + ) + ], + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(dl_dir, "dureader_robust-data", "train.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(dl_dir, "dureader_robust-data", "dev.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(dl_dir, "dureader_robust-data", "test.json")}, + ), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("generating examples from = %s", filepath) + key = 0 + with open(filepath, encoding="utf-8") as f: + durobust = json.load(f) + for article in durobust["data"]: + title = article.get("title", "") + for paragraph in article["paragraphs"]: + context = paragraph["context"] # do not strip leading blank spaces GH-2585 + for qa in paragraph["qas"]: + answer_starts = [answer["answer_start"] for answer in qa.get("answers", "")] + answers = [answer["text"] for answer in qa.get("answers", "")] + # Features currently used are "context", "question", and "answers". + # Others are extracted here for the ease of future expansions. + yield key, { + "title": title, + "context": context, + "question": qa["question"], + "id": qa["id"], + "answers": { + "answer_start": answer_starts, + "text": answers, + }, + } + key += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/funsd.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/funsd.py new file mode 100644 index 000000000..f9f21552f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/funsd.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 + +import os +import json +import hashlib + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@article{Jaume2019FUNSDAD, + title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, + author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/funsd.tar.gz" + + +def _get_md5(string): + """Get md5 value for string""" + hl = hashlib.md5() + hl.update(string.encode(encoding="utf-8")) + return hl.hexdigest() + + +class FUNSDConfig(datasets.BuilderConfig): + """funsd dataset config""" + + target_size: int = 1000 + max_size: int = 1000 + + def __init__(self, **kwargs): + + super(FUNSDConfig, self).__init__(**kwargs) + + +class FUNSD(datasets.GeneratorBasedBuilder): + """funsd dataset builder""" + + BUILDER_CONFIGS = [ + FUNSDConfig( + name="funsd", + version=datasets.Version("1.0.0", ""), + description="Plain text", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "name": datasets.Value("string"), + "page_no": datasets.Value("int32"), + "text": datasets.features.Sequence(datasets.Value("string")), + "bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_id": datasets.features.Sequence(datasets.Value("int32")), + "image": datasets.Value("string"), + "width": datasets.Value("int32"), + "height": datasets.Value("int32"), + "md5sum": datasets.Value("string"), + "qas": datasets.features.Sequence( + { + "question_id": datasets.Value("int32"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + "answer_end": datasets.Value("int32"), + } + ), + } + ), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(dl_dir, "funsd", "train.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(dl_dir, "funsd", "dev.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(dl_dir, "funsd", "test.json")}, + ), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("Generating examples from = {}".format(filepath)) + idx = 0 + with open(filepath, "r") as fin: + for line in fin: + data = json.loads(line) + if "page_no" not in data: + data["page_no"] = 0 + for item in data["qas"]: + if "question_id" not in item: + item["question_id"] = -1 + data["md5sum"] = _get_md5(data["image"]) + yield idx, data + idx += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/glue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/glue.py new file mode 100644 
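The `_get_md5` helper duplicated in docvqa_zh.py, funsd.py, and rvl_cdip_sampled.py boils down to a single expression; a standalone equivalent (the sample path is made up):

import hashlib

def _get_md5(string):
    """Get md5 value for string (same behaviour as the helper above)."""
    return hashlib.md5(string.encode("utf-8")).hexdigest()

print(_get_md5("imgs/train_0001.png"))  # stable fingerprint stored in the "md5sum" field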
index 000000000..4b7106963 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/glue.py @@ -0,0 +1,625 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""The General Language Understanding Evaluation (GLUE) benchmark.""" + +import csv +import os +import textwrap + +import datasets +import numpy as np + +_GLUE_CITATION = """\ +@inproceedings{wang2019glue, + title={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding}, + author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R.}, + note={In the Proceedings of ICLR.}, + year={2019} +} +""" + +_GLUE_DESCRIPTION = """\ +GLUE, the General Language Understanding Evaluation benchmark +(https://gluebenchmark.com/) is a collection of resources for training, +evaluating, and analyzing natural language understanding systems. + +""" + +_MRPC_DEV_IDS = "https://bj.bcebos.com/dataset/glue/mrpc/dev_ids.tsv" +_MRPC_TRAIN = "https://bj.bcebos.com/dataset/glue/mrpc/msr_paraphrase_train.txt" +_MRPC_TEST = "https://bj.bcebos.com/dataset/glue/mrpc/msr_paraphrase_test.txt" + +_MNLI_BASE_KWARGS = dict( + text_features={ + "premise": "sentence1", + "hypothesis": "sentence2", + }, + label_classes=["entailment", "neutral", "contradiction"], + label_column="gold_label", + data_url="https://bj.bcebos.com/dataset/glue/MNLI.zip", + data_dir="MNLI", + citation=textwrap.dedent( + """\ + @InProceedings{N18-1101, + author = "Williams, Adina + and Nangia, Nikita + and Bowman, Samuel", + title = "A Broad-Coverage Challenge Corpus for + Sentence Understanding through Inference", + booktitle = "Proceedings of the 2018 Conference of + the North American Chapter of the + Association for Computational Linguistics: + Human Language Technologies, Volume 1 (Long + Papers)", + year = "2018", + publisher = "Association for Computational Linguistics", + pages = "1112--1122", + location = "New Orleans, Louisiana", + url = "http://aclweb.org/anthology/N18-1101" + } + @article{bowman2015large, + title={A large annotated corpus for learning natural language inference}, + author={Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and Manning, Christopher D}, + journal={arXiv preprint arXiv:1508.05326}, + year={2015} + }""" + ), + url="http://www.nyu.edu/projects/bowman/multinli/", +) + + +class GlueConfig(datasets.BuilderConfig): + """BuilderConfig for GLUE.""" + + def __init__( + self, + text_features, + label_column, + data_url, + data_dir, + citation, + url, + label_classes=None, + process_label=lambda x: x, + **kwargs, + ): + """BuilderConfig for GLUE. 
+ + Args: + text_features: `dict[string, string]`, map from the name of the feature + dict for each text field to the name of the column in the tsv file + label_column: `string`, name of the column in the tsv file corresponding + to the label + data_url: `string`, url to download the zip file from + data_dir: `string`, the path to the folder containing the tsv files in the + downloaded zip + citation: `string`, citation for the data set + url: `string`, url for information about the data set + label_classes: `list[string]`, the list of classes if the label is + categorical. If not provided, then the label will be of type + `datasets.Value('float32')`. + process_label: `Function[string, any]`, function taking in the raw value + of the label and processing it to the form required by the label feature + **kwargs: keyword arguments forwarded to super. + """ + super(GlueConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs) + self.text_features = text_features + self.label_column = label_column + self.label_classes = label_classes + self.data_url = data_url + self.data_dir = data_dir + self.citation = citation + self.url = url + self.process_label = process_label + + +class Glue(datasets.GeneratorBasedBuilder): + """The General Language Understanding Evaluation (GLUE) benchmark.""" + + BUILDER_CONFIGS = [ + GlueConfig( + name="cola", + description=textwrap.dedent( + """\ + The Corpus of Linguistic Acceptability consists of English + acceptability judgments drawn from books and journal articles on + linguistic theory. Each example is a sequence of words annotated + with whether it is a grammatical English sentence.""" + ), + text_features={"sentence": "sentence"}, + label_classes=["unacceptable", "acceptable"], + label_column="is_acceptable", + data_url="https://bj.bcebos.com/dataset/glue/CoLA.zip", + data_dir="CoLA", + citation=textwrap.dedent( + """\ + @article{warstadt2018neural, + title={Neural Network Acceptability Judgments}, + author={Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R}, + journal={arXiv preprint arXiv:1805.12471}, + year={2018} + }""" + ), + url="https://nyu-mll.github.io/CoLA/", + ), + GlueConfig( + name="sst2", + description=textwrap.dedent( + """\ + The Stanford Sentiment Treebank consists of sentences from movie reviews and + human annotations of their sentiment. The task is to predict the sentiment of a + given sentence. 
We use the two-way (positive/negative) class split, and use only + sentence-level labels.""" + ), + text_features={"sentence": "sentence"}, + label_classes=["negative", "positive"], + label_column="label", + data_url="https://bj.bcebos.com/dataset/glue/SST.zip", + data_dir="SST-2", + citation=textwrap.dedent( + """\ + @inproceedings{socher2013recursive, + title={Recursive deep models for semantic compositionality over a sentiment treebank}, + author={Socher, Richard and Perelygin, Alex and Wu, Jean and Chuang, Jason and Manning, Christopher D and Ng, Andrew and Potts, Christopher}, + booktitle={Proceedings of the 2013 conference on empirical methods in natural language processing}, + pages={1631--1642}, + year={2013} + }""" + ), + url="https://datasets.stanford.edu/sentiment/index.html", + ), + GlueConfig( + name="mrpc", + description=textwrap.dedent( + """\ + The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of + sentence pairs automatically extracted from online news sources, with human annotations + for whether the sentences in the pair are semantically equivalent.""" + ), # pylint: disable=line-too-long + text_features={"sentence1": "", "sentence2": ""}, + label_classes=["not_equivalent", "equivalent"], + label_column="Quality", + data_url="", # MRPC isn't hosted by GLUE. + data_dir="MRPC", + citation=textwrap.dedent( + """\ + @inproceedings{dolan2005automatically, + title={Automatically constructing a corpus of sentential paraphrases}, + author={Dolan, William B and Brockett, Chris}, + booktitle={Proceedings of the Third International Workshop on Paraphrasing (IWP2005)}, + year={2005} + }""" + ), + url="https://www.microsoft.com/en-us/download/details.aspx?id=52398", + ), + GlueConfig( + name="qqp", + description=textwrap.dedent( + """\ + The Quora Question Pairs2 dataset is a collection of question pairs from the + community question-answering website Quora. The task is to determine whether a + pair of questions are semantically equivalent.""" + ), + text_features={ + "question1": "question1", + "question2": "question2", + }, + label_classes=["not_duplicate", "duplicate"], + label_column="is_duplicate", + data_url="https://dataset.bj.bcebos.com/glue/QQP.zip", + data_dir="QQP", + citation=textwrap.dedent( + """\ + @online{WinNT, + author = {Iyer, Shankar and Dandekar, Nikhil and Csernai, Kornel}, + title = {First Quora Dataset Release: Question Pairs}, + year = {2017}, + url = {https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs}, + urldate = {2019-04-03} + }""" + ), + url="https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs", + ), + GlueConfig( + name="stsb", + description=textwrap.dedent( + """\ + The Semantic Textual Similarity Benchmark (Cer et al., 2017) is a collection of + sentence pairs drawn from news headlines, video and image captions, and natural + language inference data. 
Each pair is human-annotated with a similarity score + from 1 to 5.""" + ), + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_column="score", + data_url="https://bj.bcebos.com/dataset/glue/STS.zip", + data_dir="STS-B", + citation=textwrap.dedent( + """\ + @article{cer2017semeval, + title={Semeval-2017 task 1: Semantic textual similarity-multilingual and cross-lingual focused evaluation}, + author={Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, Inigo and Specia, Lucia}, + journal={arXiv preprint arXiv:1708.00055}, + year={2017} + }""" + ), + url="https://huggingface.co/datasets/mteb/stsbenchmark-sts", + process_label=np.float32, + ), + GlueConfig( + name="mnli", + description=textwrap.dedent( + """\ + The Multi-Genre Natural Language Inference Corpus is a crowdsourced + collection of sentence pairs with textual entailment annotations. Given a premise sentence + and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis + (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The premise sentences are + gathered from ten different sources, including transcribed speech, fiction, and government reports. + We use the standard test set, for which we obtained private labels from the authors, and evaluate + on both the matched (in-domain) and mismatched (cross-domain) section. We also use and recommend + the SNLI corpus as 550k examples of auxiliary training data.""" + ), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="mnli_mismatched", + description=textwrap.dedent( + """\ + The mismatched validation and test splits from MNLI. + See the "mnli" BuilderConfig for additional information.""" + ), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="mnli_matched", + description=textwrap.dedent( + """\ + The matched validation and test splits from MNLI. + See the "mnli" BuilderConfig for additional information.""" + ), + **_MNLI_BASE_KWARGS, + ), + GlueConfig( + name="qnli", + description=textwrap.dedent( + """\ + The Stanford Question Answering Dataset is a question-answering + dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn + from Wikipedia) contains the answer to the corresponding question (written by an annotator). We + convert the task into sentence pair classification by forming a pair between each question and each + sentence in the corresponding context, and filtering out pairs with low lexical overlap between the + question and the context sentence. The task is to determine whether the context sentence contains + the answer to the question. 
This modified version of the original task removes the requirement that + the model select the exact answer, but also removes the simplifying assumptions that the answer + is always present in the input and that lexical overlap is a reliable cue.""" + ), # pylint: disable=line-too-long + text_features={ + "question": "question", + "sentence": "sentence", + }, + label_classes=["entailment", "not_entailment"], + label_column="label", + data_url="https://bj.bcebos.com/dataset/glue/QNLI.zip", + data_dir="QNLI", + citation=textwrap.dedent( + """\ + @article{rajpurkar2016squad, + title={Squad: 100,000+ questions for machine comprehension of text}, + author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy}, + journal={arXiv preprint arXiv:1606.05250}, + year={2016} + }""" + ), + url="https://rajpurkar.github.io/SQuAD-explorer/", + ), + GlueConfig( + name="rte", + description=textwrap.dedent( + """\ + The Recognizing Textual Entailment (RTE) datasets come from a series of annual textual + entailment challenges. We combine the data from RTE1 (Dagan et al., 2006), RTE2 (Bar Haim + et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5 (Bentivogli et al., 2009).4 Examples are + constructed based on news and Wikipedia text. We convert all datasets to a two-class split, where + for three-class datasets we collapse neutral and contradiction into not entailment, for consistency.""" + ), # pylint: disable=line-too-long + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_classes=["entailment", "not_entailment"], + label_column="label", + data_url="https://bj.bcebos.com/dataset/glue/RTE.zip", + data_dir="RTE", + citation=textwrap.dedent( + """\ + @inproceedings{dagan2005pascal, + title={The PASCAL recognising textual entailment challenge}, + author={Dagan, Ido and Glickman, Oren and Magnini, Bernardo}, + booktitle={Machine Learning Challenges Workshop}, + pages={177--190}, + year={2005}, + organization={Springer} + } + @inproceedings{bar2006second, + title={The second pascal recognising textual entailment challenge}, + author={Bar-Haim, Roy and Dagan, Ido and Dolan, Bill and Ferro, Lisa and Giampiccolo, Danilo and Magnini, Bernardo and Szpektor, Idan}, + booktitle={Proceedings of the second PASCAL challenges workshop on recognising textual entailment}, + volume={6}, + number={1}, + pages={6--4}, + year={2006}, + organization={Venice} + } + @inproceedings{giampiccolo2007third, + title={The third pascal recognizing textual entailment challenge}, + author={Giampiccolo, Danilo and Magnini, Bernardo and Dagan, Ido and Dolan, Bill}, + booktitle={Proceedings of the ACL-PASCAL workshop on textual entailment and paraphrasing}, + pages={1--9}, + year={2007}, + organization={Association for Computational Linguistics} + } + @inproceedings{bentivogli2009fifth, + title={The Fifth PASCAL Recognizing Textual Entailment Challenge.}, + author={Bentivogli, Luisa and Clark, Peter and Dagan, Ido and Giampiccolo, Danilo}, + booktitle={TAC}, + year={2009} + }""" + ), + url="https://aclweb.org/aclwiki/Recognizing_Textual_Entailment", + ), + GlueConfig( + name="wnli", + description=textwrap.dedent( + """\ + The Winograd Schema Challenge (Levesque et al., 2011) is a reading comprehension task + in which a system must read a sentence with a pronoun and select the referent of that pronoun from + a list of choices. 
The examples are manually constructed to foil simple statistical methods: Each + one is contingent on contextual information provided by a single word or phrase in the sentence. + To convert the problem into sentence pair classification, we construct sentence pairs by replacing + the ambiguous pronoun with each possible referent. The task is to predict if the sentence with the + pronoun substituted is entailed by the original sentence. We use a small evaluation set consisting of + new examples derived from fiction books that was shared privately by the authors of the original + corpus. While the included training set is balanced between two classes, the test set is imbalanced + between them (65% not entailment). Also, due to a data quirk, the development set is adversarial: + hypotheses are sometimes shared between training and development examples, so if a model memorizes the + training examples, they will predict the wrong label on corresponding development set + example. As with QNLI, each example is evaluated separately, so there is not a systematic correspondence + between a model's score on this task and its score on the unconverted original task. We + call converted dataset WNLI (Winograd NLI).""" + ), + text_features={ + "sentence1": "sentence1", + "sentence2": "sentence2", + }, + label_classes=["not_entailment", "entailment"], + label_column="label", + data_url="https://bj.bcebos.com/dataset/glue/WNLI.zip", + data_dir="WNLI", + citation=textwrap.dedent( + """\ + @inproceedings{levesque2012winograd, + title={The winograd schema challenge}, + author={Levesque, Hector and Davis, Ernest and Morgenstern, Leora}, + booktitle={Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning}, + year={2012} + }""" + ), + url="https://cs.nyu.edu/faculty/davise/papers/WinogradSchemas/WS.html", + ), + GlueConfig( + name="ax", + description=textwrap.dedent( + """\ + A manually-curated evaluation dataset for fine-grained analysis of + system performance on a broad range of linguistic phenomena. This + dataset evaluates sentence understanding through Natural Language + Inference (NLI) problems. Use a model trained on MulitNLI to produce + predictions for this dataset.""" + ), + text_features={ + "premise": "sentence1", + "hypothesis": "sentence2", + }, + label_classes=["entailment", "neutral", "contradiction"], + label_column="", # No label since we only have test set. + # We must use a URL shortener since the URL from GLUE is very long and + # causes issues in TFDS. + data_url="https://dl.fbaipublicfiles.com/glue/data/AX.tsv", + data_dir="", # We are downloading a tsv. + citation="", # The GLUE citation is sufficient. 
+ url="https://gluebenchmark.com/diagnostics", + ), + ] + + def _info(self): + features = {text_feature: datasets.Value("string") for text_feature in self.config.text_features.keys()} + if self.config.label_classes: + features["label"] = datasets.features.ClassLabel(names=self.config.label_classes) + else: + features["label"] = datasets.Value("float32") + features["idx"] = datasets.Value("int32") + return datasets.DatasetInfo( + description=_GLUE_DESCRIPTION, + features=datasets.Features(features), + homepage=self.config.url, + citation=self.config.citation + "\n" + _GLUE_CITATION, + ) + + def _split_generators(self, dl_manager): + if self.config.name == "ax": + data_file = dl_manager.download(self.config.data_url) + return [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": data_file, + "split": "test", + }, + ) + ] + + if self.config.name == "mrpc": + data_dir = None + mrpc_files = dl_manager.download( + { + "dev_ids": _MRPC_DEV_IDS, + "train": _MRPC_TRAIN, + "test": _MRPC_TEST, + } + ) + else: + dl_dir = dl_manager.download_and_extract(self.config.data_url) + data_dir = os.path.join(dl_dir, self.config.data_dir) + mrpc_files = None + train_split = datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "train.tsv"), + "split": "train", + "mrpc_files": mrpc_files, + }, + ) + if self.config.name == "mnli": + return [ + train_split, + _mnli_split_generator("validation_matched", data_dir, "dev", matched=True), + _mnli_split_generator("validation_mismatched", data_dir, "dev", matched=False), + _mnli_split_generator("test_matched", data_dir, "test", matched=True), + _mnli_split_generator("test_mismatched", data_dir, "test", matched=False), + ] + elif self.config.name == "mnli_matched": + return [ + _mnli_split_generator("validation", data_dir, "dev", matched=True), + _mnli_split_generator("test", data_dir, "test", matched=True), + ] + elif self.config.name == "mnli_mismatched": + return [ + _mnli_split_generator("validation", data_dir, "dev", matched=False), + _mnli_split_generator("test", data_dir, "test", matched=False), + ] + else: + return [ + train_split, + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "dev.tsv"), + "split": "dev", + "mrpc_files": mrpc_files, + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "data_file": os.path.join(data_dir or "", "test.tsv"), + "split": "test", + "mrpc_files": mrpc_files, + }, + ), + ] + + def _generate_examples(self, data_file, split, mrpc_files=None): + if self.config.name == "mrpc": + # We have to prepare the MRPC dataset from the original sources ourselves. + examples = self._generate_example_mrpc_files(mrpc_files=mrpc_files, split=split) + for example in examples: + yield example["idx"], example + else: + process_label = self.config.process_label + label_classes = self.config.label_classes + + # The train and dev files for CoLA are the only tsv files without a + # header. 
+ is_cola_non_test = self.config.name == "cola" and split != "test" + + with open(data_file, encoding="utf8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + if is_cola_non_test: + reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + + for n, row in enumerate(reader): + if is_cola_non_test: + row = { + "sentence": row[3], + "is_acceptable": row[1], + } + + example = {feat: row[col] for feat, col in self.config.text_features.items()} + example["idx"] = n + + if self.config.label_column in row: + label = row[self.config.label_column] + # For some tasks, the label is represented as 0 and 1 in the tsv + # files and needs to be cast to integer to work with the feature. + if label_classes and label not in label_classes: + label = int(label) if label else None + example["label"] = process_label(label) + else: + example["label"] = process_label(-1) + + # Filter out corrupted rows. + for value in example.values(): + if value is None: + break + else: + yield example["idx"], example + + def _generate_example_mrpc_files(self, mrpc_files, split): + if split == "test": + with open(mrpc_files["test"], encoding="utf8") as f: + # The first 3 bytes are the utf-8 BOM \xef\xbb\xbf, which messes with + # the Quality key. + f.seek(3) + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for n, row in enumerate(reader): + yield { + "sentence1": row["#1 String"], + "sentence2": row["#2 String"], + "label": int(row["Quality"]), + "idx": n, + } + else: + with open(mrpc_files["dev_ids"], encoding="utf8") as f: + reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + dev_ids = [[row[0], row[1]] for row in reader] + with open(mrpc_files["train"], encoding="utf8") as f: + # The first 3 bytes are the utf-8 BOM \xef\xbb\xbf, which messes with + # the Quality key. + f.seek(3) + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for n, row in enumerate(reader): + is_row_in_dev = [row["#1 ID"], row["#2 ID"]] in dev_ids + if is_row_in_dev == (split == "dev"): + yield { + "sentence1": row["#1 String"], + "sentence2": row["#2 String"], + "label": int(row["Quality"]), + "idx": n, + } + + +def _mnli_split_generator(name, data_dir, split, matched): + return datasets.SplitGenerator( + name=name, + gen_kwargs={ + "data_file": os.path.join(data_dir, "%s_%s.tsv" % (split, "matched" if matched else "mismatched")), + "split": split, + "mrpc_files": None, + }, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/imdb.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/imdb.py new file mode 100644 index 000000000..e4e335331 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/imdb.py @@ -0,0 +1,109 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
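In the MRPC branch above, train/dev are rebuilt from the raw Microsoft files: a row goes to dev exactly when its ID pair appears in dev_ids, and to train exactly when it does not. A standalone sketch of that membership test with made-up IDs:

dev_ids = [["702876", "702977"], ["264589", "264502"]]  # made-up ID pairs

def keep(row_ids, split):
    is_row_in_dev = row_ids in dev_ids
    return is_row_in_dev == (split == "dev")

print(keep(["702876", "702977"], "dev"))    # True  -> emitted for dev
print(keep(["702876", "702977"], "train"))  # False -> skipped for train
print(keep(["999999", "888888"], "train"))  # True  -> emitted for train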
+ +# Lint as: python3 +"""IMDB movie reviews dataset.""" + +import datasets +from datasets.tasks import TextClassification + +_DESCRIPTION = """\ +Large Movie Review Dataset. +This is a dataset for binary sentiment classification containing substantially \ +more data than previous benchmark datasets. We provide a set of 25,000 highly \ +polar movie reviews for training, and 25,000 for testing. There is additional \ +unlabeled data for use as well.\ +""" + +_CITATION = """\ +@InProceedings{maas-EtAl:2011:ACL-HLT2011, + author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, + title = {Learning Word Vectors for Sentiment Analysis}, + booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, + month = {June}, + year = {2011}, + address = {Portland, Oregon, USA}, + publisher = {Association for Computational Linguistics}, + pages = {142--150}, + url = {http://www.aclweb.org/anthology/P11-1015} +} +""" + +_DOWNLOAD_URL = "https://bj.bcebos.com/dataset/imdb%2FaclImdb_v1.tar.gz" + + +class IMDBReviewsConfig(datasets.BuilderConfig): + """BuilderConfig for IMDBReviews.""" + + def __init__(self, **kwargs): + """BuilderConfig for IMDBReviews. + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs) + + +class Imdb(datasets.GeneratorBasedBuilder): + """IMDB movie reviews dataset.""" + + BUILDER_CONFIGS = [ + IMDBReviewsConfig( + name="plain_text", + description="Plain text", + ) + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])} + ), + supervised_keys=None, + homepage="http://ai.stanford.edu/~amaas/data/sentiment/", + citation=_CITATION, + task_templates=[TextClassification(text_column="text", label_column="label")], + ) + + def _split_generators(self, dl_manager): + archive = dl_manager.download(_DOWNLOAD_URL) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"} + ), + datasets.SplitGenerator( + name=datasets.Split("unsupervised"), + gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False}, + ), + ] + + def _generate_examples(self, files, split, labeled=True): + """Generate aclImdb examples.""" + # For labeled examples, extract the label from the path. 
+ if labeled: + label_mapping = {"pos": 1, "neg": 0} + for path, f in files: + if path.startswith(f"aclImdb/{split}"): + label = label_mapping.get(path.split("/")[2]) + if label is not None: + yield path, {"text": f.read().decode("utf-8"), "label": label} + else: + for path, f in files: + if path.startswith(f"aclImdb/{split}"): + if path.split("/")[2] == "unsup": + yield path, {"text": f.read().decode("utf-8"), "label": -1} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/language_pair.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/language_pair.py new file mode 100644 index 000000000..85643f709 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/language_pair.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_DESCRIPTION = """ +LanguagePairDataset used for machine translation between any pair of languages. """ + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/WMT14.en-de.tar.gz" + + +class LanguagePairConfig(datasets.BuilderConfig): + """BuilderConfig for a general LanguagePairDataset.""" + + def __init__(self, **kwargs): + """BuilderConfig for LanguagePairDataset. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(LanguagePairConfig, self).__init__(**kwargs) + + +class LanguagePairDataset(datasets.GeneratorBasedBuilder): + BUILDER_CONFIGS = [ + LanguagePairConfig( + name="LanguagePair", + version=datasets.Version("1.0.0", ""), + description=_DESCRIPTION, + ), + ] + + def _info(self): + logger.warning( + "LanguagePairDataset is an experimental API which we will continue to optimize and may be changed." + ) + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "source": datasets.Value("string"), + "target": datasets.Value("string"), + } + ), + supervised_keys=None, + ) + + def _split_generators(self, dl_manager): + is_downloaded = False + + # Train files. + if hasattr(self.config, "data_files") and "train" in self.config.data_files: + train_split = datasets.SplitGenerator( + name="train", + gen_kwargs={ + "source_filepath": os.path.abspath(self.config.data_files["train"][0]), + "target_filepath": os.path.abspath(self.config.data_files["train"][1]), + }, + ) + + else: + if not is_downloaded: + dl_dir = dl_manager.download_and_extract(_URL) + is_downloaded = True + train_split = datasets.SplitGenerator( + name="train", + gen_kwargs={ + "source_filepath": os.path.join( + dl_dir, "WMT14.en-de", "wmt14_ende_data_bpe", "train.tok.clean.bpe.33708.en" + ), + "target_filepath": os.path.join( + dl_dir, "WMT14.en-de", "wmt14_ende_data_bpe", "train.tok.clean.bpe.33708.de" + ), + }, + ) + + # Dev files. 
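In `Imdb._generate_examples` above, the label is recovered purely from the archive member path (`aclImdb/<split>/<pos|neg|unsup>/...`). A standalone sketch with made-up paths:

label_mapping = {"pos": 1, "neg": 0}
paths = [  # made-up archive member names
    "aclImdb/train/pos/0_9.txt",
    "aclImdb/train/neg/1_2.txt",
    "aclImdb/train/unsup/3_0.txt",
]
for path in paths:
    label = label_mapping.get(path.split("/")[2])
    print(path, "->", label)  # "unsup" maps to None and is skipped in the labeled branch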
+ if hasattr(self.config, "data_files") and "dev" in self.config.data_files: + dev_split = datasets.SplitGenerator( + name="dev", + gen_kwargs={ + "source_filepath": os.path.abspath(self.config.data_files["dev"][0]), + "target_filepath": os.path.abspath(self.config.data_files["dev"][1]), + }, + ) + + else: + if not is_downloaded: + dl_dir = dl_manager.download_and_extract(_URL) + is_downloaded = True + dev_split = datasets.SplitGenerator( + name="dev", + gen_kwargs={ + "source_filepath": os.path.join( + dl_dir, "WMT14.en-de", "wmt14_ende_data_bpe", "newstest2013.tok.bpe.33708.en" + ), + "target_filepath": os.path.join( + dl_dir, "WMT14.en-de", "wmt14_ende_data_bpe", "newstest2013.tok.bpe.33708.de" + ), + }, + ) + + # Test files. + if hasattr(self.config, "data_files") and "test" in self.config.data_files: + # test may not contain target languages. + if isinstance(self.config.data_files["test"], str): + self.config.data_files["test"] = [self.config.data_files["test"], None] + elif ( + isinstance(self.config.data_files["test"], (list, tuple)) and len(self.config.data_files["test"]) == 1 + ): + self.config.data_files["test"].append(None) + + test_split = datasets.SplitGenerator( + name="test", + gen_kwargs={ + "source_filepath": os.path.abspath(self.config.data_files["test"][0]), + "target_filepath": os.path.abspath(self.config.data_files["test"][1]), + }, + ) + + else: + if not is_downloaded: + dl_dir = dl_manager.download_and_extract(_URL) + is_downloaded = True + test_split = datasets.SplitGenerator( + name="test", + gen_kwargs={ + "source_filepath": os.path.join( + dl_dir, "WMT14.en-de", "wmt14_ende_data_bpe", "newstest2014.tok.bpe.33708.en" + ), + "target_filepath": os.path.join( + dl_dir, "WMT14.en-de", "wmt14_ende_data_bpe", "newstest2014.tok.bpe.33708.de" + ), + }, + ) + + return [train_split, dev_split, test_split] + + def _generate_examples(self, source_filepath, target_filepath): + """This function returns the examples in the raw (text) form.""" + + logger.info("generating examples from = source: {} & target: {}".format(source_filepath, target_filepath)) + key = 0 + + with open(source_filepath, "r", encoding="utf-8") as src_fin: + if target_filepath is not None: + with open(target_filepath, "r", encoding="utf-8") as tgt_fin: + src_seq = src_fin.readlines() + tgt_seq = tgt_fin.readlines() + + for i, src in enumerate(src_seq): + source = src.strip() + target = tgt_seq[i].strip() + + yield key, { + "id": str(key), + "source": source, + "target": target, + } + key += 1 + else: + src_seq = src_fin.readlines() + for i, src in enumerate(src_seq): + source = src.strip() + + yield key, { + "id": str(key), + "source": source, + # None is not allowed. + "target": "", + } + key += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/msra_ner.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/msra_ner.py new file mode 100644 index 000000000..c550a6fe8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/msra_ner.py @@ -0,0 +1,147 @@ +# coding=utf-8 +# Copyright 2020 HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
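`LanguagePairDataset._generate_examples` above assumes the source and target files are line-aligned, and substitutes an empty string when no target file is supplied because the `target` feature cannot be `None`. A standalone sketch of the pairing:

src_lines = ["Hello world .\n", "Thank you .\n"]   # stand-ins for the source side
tgt_lines = ["Hallo Welt .\n", "Danke schön .\n"]  # stand-ins for the target side
for key, (src, tgt) in enumerate(zip(src_lines, tgt_lines)):
    print(key, {"id": str(key), "source": src.strip(), "target": tgt.strip()})
# Target-less test files would instead yield {"source": ..., "target": ""}.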
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Introduction to MSRA NER Dataset""" + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@inproceedings{levow2006third, + author = {Gina{-}Anne Levow}, + title = {The Third International Chinese Language Processing Bakeoff: Word + Segmentation and Named Entity Recognition}, + booktitle = {SIGHAN@COLING/ACL}, + pages = {108--117}, + publisher = {Association for Computational Linguistics}, + year = {2006} +} +""" + +_DESCRIPTION = """\ +The Third International Chinese Language +Processing Bakeoff was held in Spring +2006 to assess the state of the art in two +important tasks: word segmentation and +named entity recognition. Twenty-nine +groups submitted result sets in the two +tasks across two tracks and a total of five +corpora. We found strong results in both +tasks as well as continuing challenges. + +MSRA NER is one of the provided dataset. +There are three types of NE, PER (person), +ORG (organization) and LOC (location). +The dataset is in the BIO scheme. + +For more details see https://faculty.washington.edu/levow/papers/sighan06.pdf +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/msra/" +_TRAINING_FILE = "msra_train_bio.txt" +_TEST_FILE = "msra_test_bio.txt" + + +class MsraNerConfig(datasets.BuilderConfig): + """BuilderConfig for MsraNer""" + + def __init__(self, **kwargs): + """BuilderConfig for MSRA NER. + + Args: + **kwargs: keyword arguments forwarded to super. 
+ """ + super(MsraNerConfig, self).__init__(**kwargs) + + +class MsraNer(datasets.GeneratorBasedBuilder): + """MSRA NER dataset.""" + + BUILDER_CONFIGS = [ + MsraNerConfig(name="msra_ner", version=datasets.Version("1.0.0"), description="MSRA NER dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=[ + "O", + "B-PER", + "I-PER", + "B-ORG", + "I-ORG", + "B-LOC", + "I-LOC", + ] + ) + ), + } + ), + supervised_keys=None, + homepage="https://www.microsoft.com/en-us/download/details.aspx?id=52531", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + urls_to_download = { + "train": f"{_URL}{_TRAINING_FILE}", + "test": f"{_URL}{_TEST_FILE}", + } + downloaded_files = dl_manager.download_and_extract(urls_to_download) + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}), + ] + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + with open(filepath, encoding="utf-8") as f: + guid = 0 + tokens = [] + ner_tags = [] + for line in f: + line_stripped = line.strip() + if line_stripped == "": + if tokens: + yield guid, { + "id": str(guid), + "tokens": tokens, + "ner_tags": ner_tags, + } + guid += 1 + tokens = [] + ner_tags = [] + else: + splits = line_stripped.split("\t") + if len(splits) == 1: + splits.append("O") + tokens.append(splits[0]) + ner_tags.append(splits[1]) + # last example + yield guid, { + "id": str(guid), + "tokens": tokens, + "ner_tags": ner_tags, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/mt_eng_vietnamese.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/mt_eng_vietnamese.py new file mode 100644 index 000000000..10f4f2af8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/mt_eng_vietnamese.py @@ -0,0 +1,124 @@ +# coding=utf-8 +# Copyright 2020 HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections + +import datasets + +_DESCRIPTION = """\ +Preprocessed Dataset from IWSLT'15 English-Vietnamese machine translation: English-Vietnamese. +""" + +_CITATION = """\ +@inproceedings{Luong-Manning:iwslt15, + Address = {Da Nang, Vietnam} + Author = {Luong, Minh-Thang and Manning, Christopher D.}, + Booktitle = {International Workshop on Spoken Language Translation}, + Title = {Stanford Neural Machine Translation Systems for Spoken Language Domain}, + Year = {2015}} +""" + +_DATA_URL = "https://paddlenlp.bj.bcebos.com/datasets/iwslt15.en-vi/{}.{}" + +# Tuple that describes a single pair of files with matching translations. 
+# language_to_file is the map from language (2 letter string: example 'en')
+# to the file path in the extracted directory.
+TranslateData = collections.namedtuple("TranslateData", ["url", "language_to_file"])
+
+
+class MT_Eng_ViConfig(datasets.BuilderConfig):
+    """BuilderConfig for MT_Eng_Vietnamese."""
+
+    def __init__(self, language_pair=(None, None), **kwargs):
+        """BuilderConfig for MT_Eng_Vi.
+        Args:
+            language_pair: pair of languages that will be used for translation. Should
+                contain 2-letter coded strings. First will be used as source and second
+                as target in supervised mode. For example: ("vi", "en").
+            **kwargs: keyword arguments forwarded to super.
+        """
+
+        description = ("Translation dataset from %s to %s") % (language_pair[0], language_pair[1])
+        super(MT_Eng_ViConfig, self).__init__(
+            description=description,
+            version=datasets.Version("1.0.0"),
+            **kwargs,
+        )
+        self.language_pair = language_pair
+
+
+class MTEngVietnamese(datasets.GeneratorBasedBuilder):
+    """English Vietnamese machine translation dataset from IWSLT2015."""
+
+    BUILDER_CONFIGS = [
+        MT_Eng_ViConfig(
+            name="iwslt2015-vi-en",
+            language_pair=("vi", "en"),
+        ),
+        MT_Eng_ViConfig(
+            name="iwslt2015-en-vi",
+            language_pair=("en", "vi"),
+        ),
+    ]
+    BUILDER_CONFIG_CLASS = MT_Eng_ViConfig
+
+    def _info(self):
+        source, target = self.config.language_pair
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {"translation": datasets.features.Translation(languages=self.config.language_pair)}
+            ),
+            supervised_keys=(source, target),
+            homepage="https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        source, target = self.config.language_pair
+
+        files = {}
+        for split in ("train", "dev", "test"):
+            if split == "dev":
+                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2012", source))
+                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2012", target))
+            if split == "test":
+                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format("tst2013", source))
+                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format("tst2013", target))
+            if split == "train":
+                dl_dir_src = dl_manager.download_and_extract(_DATA_URL.format(split, source))
+                dl_dir_tar = dl_manager.download_and_extract(_DATA_URL.format(split, target))
+
+            files[split] = {"source_file": dl_dir_src, "target_file": dl_dir_tar}
+
+        return [
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs=files["train"]),
+            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs=files["dev"]),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs=files["test"]),
+        ]
+
+    def _generate_examples(self, source_file, target_file):
+        """This function returns the examples in the raw (text) form."""
+        with open(source_file, encoding="utf-8") as f:
+            source_sentences = f.read().split("\n")
+        with open(target_file, encoding="utf-8") as f:
+            target_sentences = f.read().split("\n")
+
+        source, target = self.config.language_pair
+        for idx, (l1, l2) in enumerate(zip(source_sentences, target_sentences)):
+            result = {"translation": {source: l1, target: l2}}
+            # Make sure that both translations are non-empty.
+ yield idx, result diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/ptb_text_only.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/ptb_text_only.py new file mode 100644 index 000000000..dbb56d423 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/ptb_text_only.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Load the Penn Treebank dataset. + + This is the Penn Treebank Project: Release 2 CDROM, featuring a million words of 1989 Wall + Street Journal material. +""" + +import datasets + +# TODO: Add BibTeX citation +# Find for instance the citation on arxiv or on the dataset repo/website +_CITATION = """\ +@article{marcus-etal-1993-building, + title = "Building a Large Annotated Corpus of {E}nglish: The {P}enn {T}reebank", + author = "Marcus, Mitchell P. and + Santorini, Beatrice and + Marcinkiewicz, Mary Ann", + journal = "Computational Linguistics", + volume = "19", + number = "2", + year = "1993", + url = "https://www.aclweb.org/anthology/J93-2004", + pages = "313--330", +} +""" + +# TODO: Add description of the dataset here +# You can copy an official description +_DESCRIPTION = """\ +This is the Penn Treebank Project: Release 2 CDROM, featuring a million words of 1989 Wall Street Journal material. This corpus has been annotated for part-of-speech (POS) information. In addition, over half of it has been annotated for skeletal syntactic structure. +""" + +# TODO: Add a link to an official homepage for the dataset here +_HOMEPAGE = "https://catalog.ldc.upenn.edu/LDC99T42" + +# TODO: Add the licence for the dataset here if you can find it +_LICENSE = "LDC User Agreement for Non-Members" + +# TODO: Add link to the official dataset URLs here +# The HuggingFace dataset library don't host the datasets but only point to the original files +# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method) +_URL = "https://paddlenlp.bj.bcebos.com/datasets/ptb/" +_TRAINING_FILE = "ptb.train.txt" +_DEV_FILE = "ptb.valid.txt" +_TEST_FILE = "ptb.test.txt" + + +class PtbTextOnlyConfig(datasets.BuilderConfig): + """BuilderConfig for PtbTextOnly""" + + def __init__(self, **kwargs): + """BuilderConfig PtbTextOnly. + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(PtbTextOnlyConfig, self).__init__(**kwargs) + + +class PtbTextOnly(datasets.GeneratorBasedBuilder): + """Load the Penn Treebank dataset.""" + + VERSION = datasets.Version("1.1.0") + + # This is an example of a dataset with multiple configurations. + # If you don't want/need to define several sub-sets in your dataset, + # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes. 
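Since `MTEngVietnamese` above registers two configurations, the translation direction is picked by config name at load time. A hedged sketch, again assuming a `datasets` release that still accepts local loading scripts and a hypothetical script path:

from datasets import load_dataset

# "iwslt2015-en-vi" and "iwslt2015-vi-en" are the BUILDER_CONFIGS defined above.
ds = load_dataset("datasets/hf_datasets/mt_eng_vietnamese.py", "iwslt2015-en-vi", split="validation")
print(ds[0]["translation"]["en"], "->", ds[0]["translation"]["vi"])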
+ + # If you need to make complex sub-parts in the datasets with configurable options + # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig + # BUILDER_CONFIG_CLASS = MyBuilderConfig + + # You will be able to load one or the other configurations in the following list with + # data = datasets.load_dataset('my_dataset', 'first_domain') + # data = datasets.load_dataset('my_dataset', 'second_domain') + BUILDER_CONFIGS = [ + PtbTextOnlyConfig( + name="penn_treebank", + version=VERSION, + description="Load the Penn Treebank dataset", + ), + ] + + def _info(self): + features = datasets.Features({"sentence": datasets.Value("string")}) + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # This defines the different columns of the dataset and their types + features=features, # Here we define them above because they are different between the two configurations + # If there's a common (input, target) tuple from the features, + # specify them here. They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage=_HOMEPAGE, + # License for the dataset if available + license=_LICENSE, + # Citation for the dataset + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration + # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name + + # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs + # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files. + # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive + my_urls = { + "train": f"{_URL}{_TRAINING_FILE}", + "dev": f"{_URL}{_DEV_FILE}", + "test": f"{_URL}{_TEST_FILE}", + } + data_dir = dl_manager.download_and_extract(my_urls) + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": data_dir["train"]}), + datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": data_dir["test"]}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": data_dir["dev"]}), + ] + + def _generate_examples(self, filepath): + """Yields examples.""" + # TODO: This method will receive as arguments the `gen_kwargs` defined in the previous `_split_generators` method. + # It is in charge of opening the given file and yielding (key, example) tuples from the dataset + # The key is not important, it's more here for legacy reason (legacy from tfds) + with open(filepath, encoding="utf-8") as f: + for id_, line in enumerate(f): + line = line.strip() + yield id_, {"sentence": line} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/rvl_cdip_sampled.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/rvl_cdip_sampled.py new file mode 100644 index 000000000..276b1ddd4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/rvl_cdip_sampled.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 + +import os +import json +import hashlib + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@inproceedings{harley2015icdar, + title = {Evaluation of Deep Convolutional Nets for Document Image Classification and Retrieval}, + author = {Adam W Harley and Alex Ufkes and Konstantinos G Derpanis}, + booktitle = {International Conference on Document Analysis and Recognition ({ICDAR})}}, + year = {2015} +} +""" + +_DESCRIPTION = """\ +The RVL-CDIP (Ryerson Vision Lab Complex Document Information Processing) dataset consists of 400,000 grayscale images in 16 classes, with 25,000 images per class. \ +Because of the original dataset is large and slow for training, so we downsampling from it. \ +The sampled dataset consist of 6,400 training images, 800 validation images, and 800 test images. +""" + +_LICENSE = "https://www.industrydocuments.ucsf.edu/help/copyright/" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/rvl_cdip_sampled.tar.gz" + + +def _get_md5(string): + """Get md5 value for string""" + hl = hashlib.md5() + hl.update(string.encode(encoding="utf-8")) + return hl.hexdigest() + + +class RVLCDIPSampledConfig(datasets.BuilderConfig): + """funsd dataset config""" + + target_size: int = 1000 + max_size: int = 1000 + + def __init__(self, **kwargs): + + super(RVLCDIPSampledConfig, self).__init__(**kwargs) + + +class RVLCDIPSampled(datasets.GeneratorBasedBuilder): + """funsd dataset builder""" + + BUILDER_CONFIGS = [ + RVLCDIPSampledConfig( + name="rvl_cdip_sampled", + version=datasets.Version("1.0.0", ""), + description="Plain text", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "name": datasets.Value("string"), + "page_no": datasets.Value("int32"), + "text": datasets.features.Sequence(datasets.Value("string")), + "bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_id": datasets.features.Sequence(datasets.Value("int32")), + "image": datasets.Value("string"), + "width": datasets.Value("int32"), + "height": datasets.Value("int32"), + "md5sum": datasets.Value("string"), + "qas": datasets.features.Sequence( + { + "question_id": datasets.Value("int32"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + "answer_end": datasets.Value("int32"), + } + ), + } + ), + } + ), + supervised_keys=None, + homepage="https://adamharley.com/rvl-cdip/", + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": 
os.path.join(dl_dir, "rvl_cdip_sampled", "train.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(dl_dir, "rvl_cdip_sampled", "dev.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(dl_dir, "rvl_cdip_sampled", "test.json")}, + ), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("Generating examples from = {}".format(filepath)) + idx = 0 + with open(filepath, "r") as fin: + for line in fin: + data = json.loads(line) + if "page_no" not in data: + data["page_no"] = 0 + for item in data["qas"]: + if "question_id" not in item: + item["question_id"] = -1 + data["md5sum"] = _get_md5(data["image"]) + yield idx, data + idx += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/seabsa16.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/seabsa16.py new file mode 100644 index 000000000..c3b97dfc0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/seabsa16.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""SE-ABSA16: SemEval-2016 Task 5: Aspect Based Sentiment Analysis.""" + +import csv +import os + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@inproceedings{pontiki2016semeval, + title={Semeval-2016 task 5: Aspect based sentiment analysis}, + author={Pontiki, Maria and Galanis, Dimitrios and Papageorgiou, Haris and Androutsopoulos, Ion and Manandhar, Suresh and Al-Smadi, Mohammad and Al-Ayyoub, Mahmoud and Zhao, Yanyan and Qin, Bing and De Clercq, Orph{\'e}e and others}, + booktitle={International workshop on semantic evaluation}, + pages={19--30}, + year={2016} +} +""" + +_DESCRIPTION = """\ +SE-ABSA16, a dataset for aspect based sentiment analysis, which aims to perform fine-grained sentiment classification for aspect in text. The dataset contains both positive and negative categories. It covers the data of mobile phone and camera. +More information refer to https://www.luge.ai/#/luge/dataDetail?id=18. +""" + +_SEABSA16_URLs = { + # pylint: disable=line-too-long + "came": "https://paddlenlp.bj.bcebos.com/datasets/SE-ABSA16_CAME.zip", + "phns": "https://paddlenlp.bj.bcebos.com/datasets/SE-ABSA16_PHNS.zip", + # pylint: enable=line-too-long +} + + +class SEABSA16Config(datasets.BuilderConfig): + """BuilderConfig for SEABSA16.""" + + def __init__(self, data_url=None, data_dir=None, **kwargs): + """BuilderConfig for SEABSA16. + + Args: + data_url: `string`, url to download the zip file. + data_dir: `string`, the path to the folder containing the tsv files in the downloaded zip. + **kwargs: keyword arguments forwarded to super. 
+ """ + super(SEABSA16Config, self).__init__(**kwargs) + self.data_url = data_url + self.data_dir = data_dir + + +class SEABSA16(datasets.GeneratorBasedBuilder): + """SE-ABSA16: SemEval-2016 Task 5: Aspect Based Sentiment Analysis.""" + + BUILDER_CONFIGS = [ + SEABSA16Config( + name="came", + data_url=_SEABSA16_URLs["came"], + data_dir="SE-ABSA16_CAME", + version=datasets.Version("1.0.0", ""), + description="SE-ABSA16-CAME data about camera.", + ), + SEABSA16Config( + name="phns", + data_url=_SEABSA16_URLs["phns"], + data_dir="SE-ABSA16_PHNS", + version=datasets.Version("1.0.0", ""), + description="SE-ABSA16-PHNS data about phone.", + ), + ] + + def _info(self): + features = { + "id": datasets.Value("int32"), + "text_a": datasets.Value("string"), + "text_b": datasets.Value("string"), + "label": datasets.Value("int32"), + } + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features(features), + homepage="https://www.luge.ai/#/luge/dataDetail?id=18", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + downloaded_dir = dl_manager.download_and_extract(self.config.data_url) + data_dir = os.path.join(downloaded_dir, self.config.data_dir) + + train_split = datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": os.path.join(data_dir, "train.tsv"), "split": "train"} + ) + test_split = datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": os.path.join(data_dir, "test.tsv"), "split": "test"} + ) + + return [train_split, test_split] + + def _generate_examples(self, filepath, split): + """This function returns the examples in the raw (text) form.""" + logger.info("generating examples from = %s", filepath) + + with open(filepath, encoding="utf8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + + for idx, row in enumerate(reader): + example = {} + example["id"] = idx + example["text_a"] = row["text_a"] + example["text_b"] = row["text_b"] + + if split == "train": + example["label"] = int(row["label"]) + else: + example["label"] = -1 + + # Filter out corrupted rows. + for value in example.values(): + if value is None: + break + else: + yield idx, example diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad.py new file mode 100644 index 000000000..cbf050655 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""SQUAD: The Stanford Question Answering Dataset.""" + +import json + +import datasets +from datasets.tasks import QuestionAnsweringExtractive + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@article{2016arXiv160605250R, + author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev}, + Konstantin and {Liang}, Percy}, + title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}", + journal = {arXiv e-prints}, + year = 2016, + eid = {arXiv:1606.05250}, + pages = {arXiv:1606.05250}, +archivePrefix = {arXiv}, + eprint = {1606.05250}, +} +""" + +_DESCRIPTION = """\ +Stanford Question Answering Dataset (SQuAD) is a reading comprehension \ +dataset, consisting of questions posed by crowdworkers on a set of Wikipedia \ +articles, where the answer to every question is a segment of text, or span, \ +from the corresponding reading passage, or the question might be unanswerable. +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/squad/" +_URLS = { + "train": _URL + "train-v1.1.json", + "dev": _URL + "dev-v1.1.json", +} + + +class SquadConfig(datasets.BuilderConfig): + """BuilderConfig for SQUAD.""" + + def __init__(self, **kwargs): + """BuilderConfig for SQUAD. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(SquadConfig, self).__init__(**kwargs) + + +class Squad(datasets.GeneratorBasedBuilder): + """SQUAD: The Stanford Question Answering Dataset. Version 1.1.""" + + BUILDER_CONFIGS = [ + SquadConfig( + name="plain_text", + version=datasets.Version("1.0.0", ""), + description="Plain text", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "title": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + } + ), + # No default supervised_keys (as we have to pass both question + # and context as input). + supervised_keys=None, + homepage="https://rajpurkar.github.io/SQuAD-explorer/", + citation=_CITATION, + task_templates=[ + QuestionAnsweringExtractive( + question_column="question", context_column="context", answers_column="answers" + ) + ], + ) + + def _split_generators(self, dl_manager): + downloaded_files = dl_manager.download_and_extract(_URLS) + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("generating examples from = %s", filepath) + key = 0 + with open(filepath, encoding="utf-8") as f: + squad = json.load(f) + for article in squad["data"]: + title = article.get("title", "") + for paragraph in article["paragraphs"]: + context = paragraph["context"] # do not strip leading blank spaces GH-2585 + for qa in paragraph["qas"]: + answer_starts = [answer["answer_start"] for answer in qa["answers"]] + answers = [answer["text"] for answer in qa["answers"]] + # Features currently used are "context", "question", and "answers". + # Others are extracted here for the ease of future expansions. 
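+                        # Illustrative shape of one yielded (key, example) pair; the
+                        # values below are placeholders, not real dataset content:
+                        #   (0, {"id": "...", "title": "...", "context": "...",
+                        #        "question": "...",
+                        #        "answers": {"answer_start": [0], "text": ["..."]}})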
+ yield key, { + "title": title, + "context": context, + "question": qa["question"], + "id": qa["id"], + "answers": { + "answer_start": answer_starts, + "text": answers, + }, + } + key += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad_v2.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad_v2.py new file mode 100644 index 000000000..3995a7875 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/squad_v2.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TODO(squad_v2): Add a description here.""" + +import json + +import datasets +from datasets.tasks import QuestionAnsweringExtractive + +# TODO(squad_v2): BibTeX citation +_CITATION = """\ +@article{2016arXiv160605250R, + author = {{Rajpurkar}, Pranav and {Zhang}, Jian and {Lopyrev}, + Konstantin and {Liang}, Percy}, + title = "{SQuAD: 100,000+ Questions for Machine Comprehension of Text}", + journal = {arXiv e-prints}, + year = 2016, + eid = {arXiv:1606.05250}, + pages = {arXiv:1606.05250}, +archivePrefix = {arXiv}, + eprint = {1606.05250}, +} +""" + +_DESCRIPTION = """\ +combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers + to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but + also determine when no answer is supported by the paragraph and abstain from answering. +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/squad/" +_URLS = { + "train": _URL + "train-v2.0.json", + "dev": _URL + "dev-v2.0.json", +} + + +class SquadV2Config(datasets.BuilderConfig): + """BuilderConfig for SQUAD.""" + + def __init__(self, **kwargs): + """BuilderConfig for SQUADV2. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(SquadV2Config, self).__init__(**kwargs) + + +class SquadV2(datasets.GeneratorBasedBuilder): + """TODO(squad_v2): Short description of my dataset.""" + + # TODO(squad_v2): Set up version. + BUILDER_CONFIGS = [ + SquadV2Config(name="squad_v2", version=datasets.Version("2.0.0"), description="SQuAD plaint text version 2"), + ] + + def _info(self): + # TODO(squad_v2): Specifies the datasets.DatasetInfo object + return datasets.DatasetInfo( + # This is the description that will appear on the datasets page. + description=_DESCRIPTION, + # datasets.features.FeatureConnectors + features=datasets.Features( + { + "id": datasets.Value("string"), + "title": datasets.Value("string"), + "context": datasets.Value("string"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + } + ), + # These are the features of your dataset like images, labels ... + } + ), + # If there's a common (input, target) tuple from the features, + # specify them here. 
They'll be used if as_supervised=True in + # builder.as_dataset. + supervised_keys=None, + # Homepage of the dataset for documentation + homepage="https://rajpurkar.github.io/SQuAD-explorer/", + citation=_CITATION, + task_templates=[ + QuestionAnsweringExtractive( + question_column="question", context_column="context", answers_column="answers" + ) + ], + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + # TODO(squad_v2): Downloads the data and defines the splits + # dl_manager is a datasets.download.DownloadManager that can be used to + # download and extract URLs + urls_to_download = _URLS + downloaded_files = dl_manager.download_and_extract(urls_to_download) + + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}), + datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}), + ] + + def _generate_examples(self, filepath): + """Yields examples.""" + # TODO(squad_v2): Yields (key, example) tuples from the dataset + with open(filepath, encoding="utf-8") as f: + squad = json.load(f) + for example in squad["data"]: + title = example.get("title", "") + for paragraph in example["paragraphs"]: + context = paragraph["context"] # do not strip leading blank spaces GH-2585 + for qa in paragraph["qas"]: + question = qa["question"] + id_ = qa["id"] + + answer_starts = [answer["answer_start"] for answer in qa["answers"]] + answers = [answer["text"] for answer in qa["answers"]] + + # Features currently used are "context", "question", and "answers". + # Others are extracted here for the ease of future expansions. + yield id_, { + "title": title, + "context": context, + "question": question, + "id": id_, + "answers": { + "answer_start": answer_starts, + "text": answers, + }, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xfund_zh.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xfund_zh.py new file mode 100644 index 000000000..5af7d7151 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xfund_zh.py @@ -0,0 +1,153 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
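+
+# Note on the input format this builder expects (as read by `_generate_examples` below):
+# each split file (train/dev/test.json) is JSON lines, one object per line, carrying the
+# fields declared in `_info()` ("name", "text", "bbox", "qas", ...). Missing "page_no" and
+# "question_id" values are filled with defaults, and "md5sum" is derived from the "image"
+# string at generation time.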
+ +# Lint as: python3 + +import os +import json +import hashlib + +import datasets + +logger = datasets.logging.get_logger(__name__) + +_CITATION = """\ +@inproceedings{xu-etal-2022-xfund, + title = "{XFUND}: A Benchmark Dataset for Multilingual Visually Rich Form Understanding", + author = "Xu, Yiheng and + Lv, Tengchao and + Cui, Lei and + Wang, Guoxin and + Lu, Yijuan and + Florencio, Dinei and + Zhang, Cha and + Wei, Furu", + booktitle = "Findings of the Association for Computational Linguistics: ACL 2022", + month = may, + year = "2022", + address = "Dublin, Ireland", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2022.findings-acl.253", + doi = "10.18653/v1/2022.findings-acl.253", + pages = "3214--3224", + abstract = "Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually rich document understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. However, the existed research work has focused only on the English domain while neglecting the importance of multilingual generalization. In this paper, we introduce a human-annotated multilingual form understanding benchmark dataset named XFUND, which includes form understanding samples in 7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese). Meanwhile, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to bridge the language barriers for visually rich document understanding. Experimental results show that the LayoutXLM model has significantly outperformed the existing SOTA cross-lingual pre-trained models on the XFUND dataset. The XFUND dataset and the pre-trained LayoutXLM model have been publicly available at https://aka.ms/layoutxlm.", +} +""" + +_DESCRIPTION = """\ +https://github.com/doc-analysis/XFUND +""" + +_URL = "https://bj.bcebos.com/paddlenlp/datasets/xfund_zh.tar.gz" + + +def _get_md5(string): + """Get md5 value for string""" + hl = hashlib.md5() + hl.update(string.encode(encoding="utf-8")) + return hl.hexdigest() + + +class XFUNDZhConfig(datasets.BuilderConfig): + """xfund_zh dataset config""" + + target_size: int = 1000 + max_size: int = 1000 + + def __init__(self, **kwargs): + + super(XFUNDZhConfig, self).__init__(**kwargs) + + +class XFUNDZh(datasets.GeneratorBasedBuilder): + """xfund_zh dataset builder""" + + BUILDER_CONFIGS = [ + XFUNDZhConfig( + name="xfund_zh", + version=datasets.Version("1.0.0", ""), + description="Plain text", + ), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "name": datasets.Value("string"), + "page_no": datasets.Value("int32"), + "text": datasets.features.Sequence(datasets.Value("string")), + "bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_bbox": datasets.features.Sequence(datasets.features.Sequence(datasets.Value("int32"))), + "segment_id": datasets.features.Sequence(datasets.Value("int32")), + "image": datasets.Value("string"), + "width": datasets.Value("int32"), + "height": datasets.Value("int32"), + "md5sum": datasets.Value("string"), + "qas": datasets.features.Sequence( + { + "question_id": datasets.Value("int32"), + "question": datasets.Value("string"), + "answers": datasets.features.Sequence( + { + "text": datasets.Value("string"), + "answer_start": datasets.Value("int32"), + "answer_end": datasets.Value("int32"), + } + ), + } + ), + } + ), + 
supervised_keys=None, + homepage="https://github.com/doc-analysis/XFUND", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + dl_dir = dl_manager.download_and_extract(_URL) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={"filepath": os.path.join(dl_dir, "xfund_zh", "train.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepath": os.path.join(dl_dir, "xfund_zh", "dev.json")}, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepath": os.path.join(dl_dir, "xfund_zh", "test.json")}, + ), + ] + + def _generate_examples(self, filepath): + """This function returns the examples in the raw (text) form.""" + logger.info("Generating examples from = {}".format(filepath)) + idx = 0 + with open(filepath, "r") as fin: + for line in fin: + data = json.loads(line) + if "page_no" not in data: + data["page_no"] = 0 + for item in data["qas"]: + if "question_id" not in item: + item["question_id"] = -1 + data["md5sum"] = _get_md5(data["image"]) + yield idx, data + idx += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xnli.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xnli.py new file mode 100644 index 000000000..cae74ffed --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hf_datasets/xnli.py @@ -0,0 +1,209 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""XNLI: The Cross-Lingual NLI Corpus.""" + +import collections +import csv +import os +from contextlib import ExitStack + +import datasets + +_CITATION = """\ +@InProceedings{conneau2018xnli, + author = {Conneau, Alexis + and Rinott, Ruty + and Lample, Guillaume + and Williams, Adina + and Bowman, Samuel R. + and Schwenk, Holger + and Stoyanov, Veselin}, + title = {XNLI: Evaluating Cross-lingual Sentence Representations}, + booktitle = {Proceedings of the 2018 Conference on Empirical Methods + in Natural Language Processing}, + year = {2018}, + publisher = {Association for Computational Linguistics}, + location = {Brussels, Belgium}, +}""" + +_DESCRIPTION = """\ +XNLI is a subset of a few thousand examples from MNLI which has been translated +into a 14 different languages (some low-ish resource). As with MNLI, the goal is +to predict textual entailment (does sentence A imply/contradict/neither sentence +B) and is a classification task (given two sentences, predict one of three +labels). 
+""" + +_TRAIN_DATA_URL = "https://bj.bcebos.com/paddlenlp/datasets/XNLI-MT-1.0.zip" +_TESTVAL_DATA_URL = "https://bj.bcebos.com/paddlenlp/datasets/XNLI-1.0.zip" + +_LANGUAGES = ("ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", "ur", "vi", "zh") + + +class XnliConfig(datasets.BuilderConfig): + """BuilderConfig for XNLI.""" + + def __init__(self, language: str, languages=None, **kwargs): + """BuilderConfig for XNLI. + + Args: + language: One of ar,bg,de,el,en,es,fr,hi,ru,sw,th,tr,ur,vi,zh, or all_languages + **kwargs: keyword arguments forwarded to super. + """ + super(XnliConfig, self).__init__(**kwargs) + self.language = language + if language != "all_languages": + self.languages = [language] + else: + self.languages = languages if languages is not None else _LANGUAGES + + +class Xnli(datasets.GeneratorBasedBuilder): + """XNLI: The Cross-Lingual NLI Corpus. Version 1.0.""" + + VERSION = datasets.Version("1.1.0", "") + BUILDER_CONFIG_CLASS = XnliConfig + BUILDER_CONFIGS = [ + XnliConfig( + name=lang, + language=lang, + version=datasets.Version("1.1.0", ""), + description=f"Plain text import of XNLI for the {lang} language", + ) + for lang in _LANGUAGES + ] + [ + XnliConfig( + name="all_languages", + language="all_languages", + version=datasets.Version("1.1.0", ""), + description="Plain text import of XNLI for all languages", + ) + ] + + def _info(self): + if self.config.language == "all_languages": + features = datasets.Features( + { + "premise": datasets.Translation( + languages=_LANGUAGES, + ), + "hypothesis": datasets.TranslationVariableLanguages( + languages=_LANGUAGES, + ), + "label": datasets.ClassLabel(names=["entailment", "neutral", "contradiction"]), + } + ) + else: + features = datasets.Features( + { + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.ClassLabel(names=["entailment", "neutral", "contradiction"]), + } + ) + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + # No default supervised_keys (as we have to pass both premise + # and hypothesis as input). 
+ supervised_keys=None, + homepage="https://www.nyu.edu/projects/bowman/xnli/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + dl_dirs = dl_manager.download_and_extract( + { + "train_data": _TRAIN_DATA_URL, + "testval_data": _TESTVAL_DATA_URL, + } + ) + train_dir = os.path.join(dl_dirs["train_data"], "XNLI-MT-1.0", "multinli") + testval_dir = os.path.join(dl_dirs["testval_data"], "XNLI-1.0") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepaths": [ + os.path.join(train_dir, f"multinli.train.{lang}.tsv") for lang in self.config.languages + ], + "data_format": "XNLI-MT", + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"filepaths": [os.path.join(testval_dir, "xnli.test.tsv")], "data_format": "XNLI"}, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"filepaths": [os.path.join(testval_dir, "xnli.dev.tsv")], "data_format": "XNLI"}, + ), + ] + + def _generate_examples(self, data_format, filepaths): + """This function returns the examples in the raw (text) form.""" + + if self.config.language == "all_languages": + if data_format == "XNLI-MT": + with ExitStack() as stack: + files = [stack.enter_context(open(filepath, encoding="utf-8")) for filepath in filepaths] + readers = [csv.DictReader(file, delimiter="\t", quoting=csv.QUOTE_NONE) for file in files] + for row_idx, rows in enumerate(zip(*readers)): + yield row_idx, { + "premise": {lang: row["premise"] for lang, row in zip(self.config.languages, rows)}, + "hypothesis": {lang: row["hypo"] for lang, row in zip(self.config.languages, rows)}, + "label": rows[0]["label"].replace("contradictory", "contradiction"), + } + else: + rows_per_pair_id = collections.defaultdict(list) + for filepath in filepaths: + with open(filepath, encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for row in reader: + rows_per_pair_id[row["pairID"]].append(row) + + for rows in rows_per_pair_id.values(): + premise = {row["language"]: row["sentence1"] for row in rows} + hypothesis = {row["language"]: row["sentence2"] for row in rows} + yield rows[0]["pairID"], { + "premise": premise, + "hypothesis": hypothesis, + "label": rows[0]["gold_label"], + } + else: + if data_format == "XNLI-MT": + for file_idx, filepath in enumerate(filepaths): + with open(filepath, encoding="utf-8") as file: + reader = csv.DictReader(file, delimiter="\t", quoting=csv.QUOTE_NONE) + for row_idx, row in enumerate(reader): + key = str(file_idx) + "_" + str(row_idx) + yield key, { + "premise": row["premise"], + "hypothesis": row["hypo"], + "label": row["label"].replace("contradictory", "contradiction"), + } + else: + for filepath in filepaths: + with open(filepath, encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for row in reader: + if row["language"] == self.config.language: + yield row["pairID"], { + "premise": row["sentence1"], + "hypothesis": row["sentence2"], + "label": row["gold_label"], + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hyp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hyp.py new file mode 100644 index 000000000..bf2dfb63f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/hyp.py @@ -0,0 +1,68 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os +import xml.dom.minidom + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class HYP(DatasetBuilder): + """ + Hyperpartisan News Detection + Task: Given a news article text, decide whether it follows a hyperpartisan + argumentation, i.e., whether it exhibits blind, prejudiced, or unreasoning + allegiance to one party, faction, cause, or person. + + More detail at https://pan.webis.de/semeval19/semeval19-web/ + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/hyp.zip" + MD5 = "125c504b4da6882c2d163ae9962b6220" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("hyp", "train.xml"), "f9dc8cb583db4c061a5abfb556d8c164"), + "dev": META_INFO(os.path.join("hyp", "eval.xml"), "20a7a7e82ae695a7fac4b8c48d0e4932"), + "test": META_INFO(os.path.join("hyp", "test.xml"), "5b1a166e7966fa744b402b033b9ed3ae"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + dom = xml.dom.minidom.parse(filename) + example_nodes = dom.documentElement.getElementsByTagName("article") + for example in example_nodes: + text = "".join([nodes.toprettyxml(indent="", newl="") for nodes in example.childNodes]) + label = example.getAttribute("hyperpartisan") + yield {"text": text, "label": label} + + def get_labels(self): + """ + Return labels of the HYP object. + """ + return ["false", "true"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/imdb.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/imdb.py new file mode 100644 index 000000000..b0144bf73 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/imdb.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
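+
+# A minimal usage sketch, assuming this builder is exposed through paddlenlp's
+# dataset registry under the name "imdb" (the names below are illustrative only):
+#   from paddlenlp.datasets import load_dataset
+#   train_ds, test_ds = load_dataset("imdb", splits=("train", "test"))
+#   print(train_ds[0])  # one example with "text" and "label" fields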
+ +import collections +import io +import os + +from ..utils.downloader import get_path_from_url +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["Imdb"] + + +class Imdb(DatasetBuilder): + """ + Subsets of IMDb data are available for access to customers for personal and non-commercial use. + Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. + The first line in each file contains headers that describe what is in each column. + Implementation of `IMDB `_ dataset. + + """ + + URL = "https://bj.bcebos.com/dataset/imdb%2FaclImdb_v1.tar.gz" + MD5 = "7c2ac02c03563afcf9b574c7e56c153a" + META_INFO = collections.namedtuple("META_INFO", ("data_dir", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("aclImdb", "train"), None), + "test": META_INFO(os.path.join("aclImdb", "test"), None), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, _ = self.SPLITS[mode] + data_dir = os.path.join(default_root, filename) + if not os.path.exists(data_dir): + get_path_from_url(self.URL, default_root, self.MD5) + return data_dir + + def _read(self, data_dir, *args): + for label in ["pos", "neg"]: + root = os.path.join(data_dir, label) + data_files = os.listdir(root) + data_files.sort() + + if label == "pos": + label_id = "1" + elif label == "neg": + label_id = "0" + for f in data_files: + f = os.path.join(root, f) + with io.open(f, "r", encoding="utf8") as fr: + data = fr.readlines() + data = data[0] + yield {"text": data, "label": label_id} + + def get_labels(self): + """ + Return labels of the Imdb object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/iwslt15.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/iwslt15.py new file mode 100644 index 000000000..a6ef2831f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/iwslt15.py @@ -0,0 +1,123 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["IWSLT15"] + + +class IWSLT15(DatasetBuilder): + """ + Created by Stanford at 2015, the IWSLT 15 English-Vietnamese Sentence + pairs for translation., in Multi-Lingual language. Containing 133 in Text + file format. 
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/iwslt15.en-vi.tar.gz" + META_INFO = collections.namedtuple("META_INFO", ("src_file", "tgt_file", "src_md5", "tgt_md5")) + MD5 = "aca22dc3f90962e42916dbb36d8f3e8e" + SPLITS = { + "train": META_INFO( + os.path.join("iwslt15.en-vi", "train.en"), + os.path.join("iwslt15.en-vi", "train.vi"), + "5b6300f46160ab5a7a995546d2eeb9e6", + "858e884484885af5775068140ae85dab", + ), + "dev": META_INFO( + os.path.join("iwslt15.en-vi", "tst2012.en"), + os.path.join("iwslt15.en-vi", "tst2012.vi"), + "c14a0955ed8b8d6929fdabf4606e3875", + "dddf990faa149e980b11a36fca4a8898", + ), + "test": META_INFO( + os.path.join("iwslt15.en-vi", "tst2013.en"), + os.path.join("iwslt15.en-vi", "tst2013.vi"), + "c41c43cb6d3b122c093ee89608ba62bd", + "a3185b00264620297901b647a4cacf38", + ), + } + VOCAB_INFO = ( + os.path.join("iwslt15.en-vi", "vocab.en"), + os.path.join("iwslt15.en-vi", "vocab.vi"), + "98b5011e1f579936277a273fd7f4e9b4", + "e8b05f8c26008a798073c619236712b4", + ) + UNK_TOKEN = "" + BOS_TOKEN = "" + EOS_TOKEN = "" + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + src_filename, tgt_filename, src_data_hash, tgt_data_hash = self.SPLITS[mode] + src_fullname = os.path.join(default_root, src_filename) + tgt_fullname = os.path.join(default_root, tgt_filename) + + src_vocab_filename, src_vocab_hash, tgt_vocab_filename, tgt_vocab_hash = self.VOCAB_INFO + src_vocab_fullname = os.path.join(default_root, src_vocab_filename) + tgt_vocab_fullname = os.path.join(default_root, tgt_vocab_filename) + + if ( + (not os.path.exists(src_fullname) or (src_data_hash and not md5file(src_fullname) == src_data_hash)) + or (not os.path.exists(tgt_fullname) or (tgt_data_hash and not md5file(tgt_fullname) == tgt_data_hash)) + or ( + not os.path.exists(src_vocab_fullname) + or (src_vocab_hash and not md5file(src_vocab_fullname) == src_vocab_hash) + ) + or ( + not os.path.exists(tgt_vocab_fullname) + or (tgt_vocab_hash and not md5file(tgt_vocab_fullname) == tgt_vocab_hash) + ) + ): + get_path_from_url(self.URL, default_root, self.MD5) + + return src_fullname, tgt_fullname + + def _read(self, filename, *args): + src_filename, tgt_filename = filename + with open(src_filename, "r", encoding="utf-8") as src_f: + with open(tgt_filename, "r", encoding="utf-8") as tgt_f: + for src_line, tgt_line in zip(src_f, tgt_f): + src_line = src_line.strip() + tgt_line = tgt_line.strip() + if not src_line and not tgt_line: + continue + yield {"en": src_line, "vi": tgt_line} + + def get_vocab(self): + en_vocab_fullname = os.path.join(DATA_HOME, self.__class__.__name__, self.VOCAB_INFO[0]) + vi_vocab_fullname = os.path.join(DATA_HOME, self.__class__.__name__, self.VOCAB_INFO[1]) + + # Construct vocab_info to match the form of the input of `Vocab.load_vocabulary()` function + vocab_info = { + "en": { + "filepath": en_vocab_fullname, + "unk_token": self.UNK_TOKEN, + "bos_token": self.BOS_TOKEN, + "eos_token": self.EOS_TOKEN, + }, + "vi": { + "filepath": vi_vocab_fullname, + "unk_token": self.UNK_TOKEN, + "bos_token": self.BOS_TOKEN, + "eos_token": self.EOS_TOKEN, + }, + } + return vocab_info diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc.py new file mode 100644 index 000000000..e85924b36 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["LCQMC"] + + +class LCQMC(DatasetBuilder): + """ + LCQMC:A Large-scale Chinese Question Matching Corpus + More information please refer to `https://www.aclweb.org/anthology/C18-1166/` + + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/lcqmc.zip" + MD5 = "7069fa0cffbd2110845869c61f83814a" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("lcqmc", "lcqmc", "train.tsv"), "479d94fe575981f236319f2a5b8b3c03"), + "dev": META_INFO(os.path.join("lcqmc", "lcqmc", "dev.tsv"), "089329fb44ef26155baef9c9c8c823ba"), + "test": META_INFO(os.path.join("lcqmc", "lcqmc", "test.tsv"), "a4a483f2f871d57e0f3894fca0d0f8f0"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data = line.strip().split("\t") + if len(data) == 3: + query, title, label = data + yield {"query": query, "title": title, "label": label} + elif len(data) == 2: + query, title = data + yield {"query": query, "title": title, "label": ""} + else: + continue + + def get_labels(self): + """ + Return labels of the LCQMC object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc_v2.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc_v2.py new file mode 100644 index 000000000..d3fcd53f3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcqmc_v2.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
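+
+# Format note for the TSV splits handled by `_read` below: the first row is a header and
+# is skipped; remaining rows are "query<TAB>title<TAB>label", while rows with only two
+# columns (e.g. an unlabeled test split) are emitted with an empty label.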
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["LCQMC_V2"] + + +class LCQMC_V2(DatasetBuilder): + """ + LCQMC:A Large-scale Chinese Question Matching Corpus + More information please refer to `https://www.aclweb.org/anthology/C18-1166/` + + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/lcqmc_v2.tar.gz" + MD5 = "e44825d8e6d5117bc04caf3982cf934f" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("lcqmc", "train.tsv"), "2193c022439b038ac12c0ae918b211a1"), + "dev": META_INFO(os.path.join("lcqmc", "dev.tsv"), "c5dcba253cb4105d914964fd8b3c0e94"), + "test": META_INFO(os.path.join("lcqmc", "test.tsv"), "8f4b71e15e67696cc9e112a459ec42bd"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + head = True + for line in f: + data = line.strip().split("\t") + if head: + head = False + else: + if len(data) == 3: + query, title, label = data + yield {"query": query, "title": title, "label": label} + elif len(data) == 2: + query, title = data + yield {"query": query, "title": title, "label": ""} + else: + continue + + def get_labels(self): + """ + Return labels of the LCQMC object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcsts_new.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcsts_new.py new file mode 100644 index 000000000..a3a94968f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/lcsts_new.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["LCSTSNew"] + + +class LCSTSNew(DatasetBuilder): + """ + Large-scale Chinese Short Text Summarization(LCSTS) dataset is + constructed by utilizing the naturally annotated web resources + on Sina Weibo. For more information, please refer + to `https://aclanthology.org/D15-1229.pdf`. 
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("train.json"), + "4e06fd1cfd5e7f0380499df8cbe17237", + "https://bj.bcebos.com/paddlenlp/datasets/LCSTS_new/train.json", + ), + "dev": META_INFO( + os.path.join("dev.json"), + "9c39d49d25d5296bdc537409208ddc85", + "https://bj.bcebos.com/paddlenlp/datasets/LCSTS_new/dev.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + for line in f: + line = line.strip() + if not line: + continue + json_data = json.loads(line) + + yield {"source": json_data["content"], "target": json_data.get("summary", ""), "id": json_data["id"]} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/msra_ner.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/msra_ner.py new file mode 100644 index 000000000..5cffc4e8c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/msra_ner.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["MsraNer"] + + +class MsraNer(DatasetBuilder): + """ + Chinese Named Entity Recognition dataset published by Microsoft Research Asia + in 2006. The dataset is in the BIO scheme. 
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/msra_ner.tar.gz" + MD5 = "f1aadbbf328ea2fa50c9c2b56db0d31e" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("msra_ner", "train.tsv"), "e5b4b734ef91861384f441456ad995dd"), + "test": META_INFO(os.path.join("msra_ner", "test.tsv"), "40b26ae09b63af78ea3a91ac8b8ae303"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line_stripped = line.strip().split("\t") + if not line_stripped: + break + if len(line_stripped) == 2: + tokens = line_stripped[0].split("\002") + tags = line_stripped[1].split("\002") + else: + tokens = line_stripped.split("\002") + tags = [] + yield {"tokens": tokens, "labels": tags} + + def get_labels(self): + + return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_hit.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_hit.py new file mode 100644 index 000000000..2158037c6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_hit.py @@ -0,0 +1,95 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["NLPCC13EVSAM05HIT"] + + +class NLPCC13EVSAM05HIT(DatasetBuilder): + """ + NLPCC13_EVSAM05_HIT is the dataset for dependency parsing. + The format of this dataset is based on the CoNLL-X style: + + ''' + raw name definition + + ID Token counter, starting at 1 for each new sentence. + FORM Word form or punctuation symbol. + LEMMA Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available. + CPOSTAG Coarse-grained part-of-speech tag, where the tagset depends on the treebank. + POSTAG Fine-grained part-of-speech tag, where the tagset depends on the treebank. + FEATS Unordered set of syntactic and/or morphological features (depending on the particular treebank), or an underscore if not available. + HEAD Head of the current token, which is either a value of ID, or zero (’0’) if the token links to the virtual root node of the sentence. + DEPREL Dependency relation to the HEAD. + PHEAD Projective head of current token, which is either a value of ID or zero (’0’), or an underscore if not available. 
+ PDEPREL Dependency relation to the PHEAD, or an underscore if not available. + ''' + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/nlpcc13_evsam05_hit.tar.gz" + MD5 = "5988ede79690dc87aa6e4343b5299944" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("nlpcc13_evsam05_hit", "train.conll"), "d82e667950a5e22b18baf595b9feb30f"), + "dev": META_INFO(os.path.join("nlpcc13_evsam05_hit", "dev.conll"), "b71b08dc85e652769bfbda30b1e352a9"), + "test": META_INFO(os.path.join("nlpcc13_evsam05_hit", "test.conll"), "784fb9d966a286df5370f7eee4013cf0"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + start = 0 + with open(filename, "r", encoding="utf-8") as f: + lines = [] + for line in f.readlines(): + if not line.startswith(" "): + if not line.startswith("#") and (len(line) == 1 or line.split()[0].isdigit()): + lines.append(line.strip()) + else: + lines.append("") + + for i, line in enumerate(lines): + if not line: + values = list(zip(*[j.split("\t") for j in lines[start:i]])) + if split == "test": + ID, FORM, LEMMA, CPOS, POS, FEATS, HEAD, DEPREL = values + else: + ID, FORM, LEMMA, CPOS, POS, FEATS, HEAD, DEPREL, _, _ = values + if values: + yield { + "ID": ID, + "FORM": FORM, + "LEMMA": LEMMA, + "CPOS": CPOS, + "POS": POS, + "FEATS": FEATS, + "HEAD": HEAD, + "DEPREL": DEPREL, + } + start = i + 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_thu.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_thu.py new file mode 100644 index 000000000..618f776b2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc13_evsam05_thu.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["NLPCC13EVSAM05THU"] + + +class NLPCC13EVSAM05THU(DatasetBuilder): + """ + NLPCC13_EVSAM05_THU is the dataset for dependency parsing. + The format of this dataset is based on the CoNLL-X style: + + ''' + raw name definition + + ID Token counter, starting at 1 for each new sentence. + FORM Word form or punctuation symbol. + LEMMA Lemma or stem (depending on the particular treebank) of word form, or an underscore if not available. + CPOSTAG Coarse-grained part-of-speech tag, where the tagset depends on the treebank. 
+ POSTAG Fine-grained part-of-speech tag, where the tagset depends on the treebank. + FEATS Unordered set of syntactic and/or morphological features (depending on the particular treebank), or an underscore if not available. + HEAD Head of the current token, which is either a value of ID, or zero (’0’) if the token links to the virtual root node of the sentence. + DEPREL Dependency relation to the HEAD. + ''' + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/nlpcc13_evsam05_thu.tar.gz" + MD5 = "297ad22217ba4668d49580009810446e" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("nlpcc13_evsam05_thu", "train.conll"), "c7779f981203b4ecbe5b04c65aaaffce"), + "dev": META_INFO(os.path.join("nlpcc13_evsam05_thu", "dev.conll"), "59c2de72c7be39977f766e8290336dac"), + "test": META_INFO(os.path.join("nlpcc13_evsam05_thu", "test.conll"), "873223b42060ce16a7e24545e43a933f"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + start = 0 + with open(filename, "r", encoding="utf-8") as f: + lines = [] + for line in f.readlines(): + if not line.startswith(" "): + if not line.startswith("#") and (len(line) == 1 or line.split()[0].isdigit()): + lines.append(line.strip()) + else: + lines.append("") + + for i, line in enumerate(lines): + if not line: + values = list(zip(*[j.split("\t") for j in lines[start:i]])) + + ID, FORM, LEMMA, CPOS, POS, FEATS, HEAD, DEPREL = values + if values: + yield { + "ID": ID, + "FORM": FORM, + "LEMMA": LEMMA, + "CPOS": CPOS, + "POS": POS, + "FEATS": FEATS, + "HEAD": HEAD, + "DEPREL": DEPREL, + } + start = i + 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc14_sc.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc14_sc.py new file mode 100644 index 000000000..8eb766942 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc14_sc.py @@ -0,0 +1,80 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["NLPCC14SC"] + + +class NLPCC14SC(DatasetBuilder): + """ + NLPCC14-SC is the dataset for sentiment classification. There are 2 classes + in the datasets: Negative (0) and Positive (1). The following is a part of + the train data: + ''' + label text_a + 1 超级值得看的一个电影 + 0 我感觉卓越的东西现在好垃圾,还贵,关键贵。 + ''' + Please note that the test data contains no corresponding labels. 
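As a concrete illustration of the format sketched above, this is how one tab-separated line of the train split maps onto the record shape yielded by the `_read()` method below (the sample line itself is illustrative only):

    # Illustrative only: one "<label>\t<text_a>" line from train.tsv.
    line = "1\t超级值得看的一个电影\n"
    label, text = line.rstrip("\n").split("\t")
    example = {"text": text, "label": label, "qid": ""}  # what _read() yields for the train split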
+ + NLPCC14-SC datasets only contain train and test data, so we remove the dev + data in META_INFO. By Fiyen at Beijing Jiaotong University. + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/NLPCC14-SC.zip" + MD5 = "4792a0982bc64b83d9a76dcce8bc00ad" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("NLPCC14-SC", "NLPCC14-SC", "train.tsv"), "b0c6f74bb8d41020067c8f103c6e08c0"), + "test": META_INFO(os.path.join("NLPCC14-SC", "NLPCC14-SC", "test.tsv"), "57526ba07510fdc901777e7602a26774"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + head = None + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + if split == "train": + label, text = data + yield {"text": text, "label": label, "qid": ""} + elif split == "test": + qid, text = data + yield {"text": text, "label": "", "qid": qid} + + def get_labels(self): + """ + Return labels of the NLPCC14-SC object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc_dbqa.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc_dbqa.py new file mode 100644 index 000000000..d8ffc1114 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/nlpcc_dbqa.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["NLPCC_DBQA"] + + +class NLPCC_DBQA(DatasetBuilder): + """ + NLPCC2016 DBQA dataset. + + Document-based QA (or DBQA) task + When predicting answers to each question, a DBQA system built by each + participating team IS LIMITED TO select sentences as answers from the + question’s given document. 
+ + For more information: http://tcci.ccf.org.cn/conference/2016/dldoc/evagline2.pdf + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/nlpcc-dbqa.zip" + MD5 = "a5f69c2462136ef4d1707e4e2551a57b" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("nlpcc-dbqa", "nlpcc-dbqa", "train.tsv"), "4f84fefce1a8f52c8d9248d1ff5ab9bd"), + "dev": META_INFO(os.path.join("nlpcc-dbqa", "nlpcc-dbqa", "dev.tsv"), "3831beb0d42c29615d06343538538f53"), + "test": META_INFO(os.path.join("nlpcc-dbqa", "nlpcc-dbqa", "test.tsv"), "e224351353b1f6a15837008b5d0da703"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + head = None + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + qid, text_a, text_b, label = data + yield {"qid": qid, "text_a": text_a, "text_b": text_b, "label": label} + + def get_labels(self): + """ + Return labels of XNLI dataset. + + Note: + Contradictory and contradiction are the same label + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/paws-x.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/paws-x.py new file mode 100644 index 000000000..c15ec89cd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/paws-x.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["PAWSX"] + + +class PAWSX(DatasetBuilder): + """ + PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification + More information please refer to `https://arxiv.org/abs/1908.11828` + Here we only store simplified Chinese(zh) version. 
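For orientation, a minimal loading sketch; the `load_dataset` call and the registered name "paws-x" are assumptions here, while the field names follow the `_read()` method below:

    from paddlenlp.datasets import load_dataset

    # Assumed registration under the file name "paws-x".
    train_ds, dev_ds = load_dataset("paws-x", splits=("train", "dev"))
    for example in train_ds:
        pair = (example["sentence1"], example["sentence2"])
        label = example["label"]  # "1" for paraphrase, "0" for non-paraphrase
        break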
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/paws-x-zh.zip" + MD5 = "f1c6f2ab8afb1f29fe04a0c929e3ab1c" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("paws-x-zh", "paws-x-zh", "train.tsv"), "3422ba98e5151c91bbb0a785c4873a4c"), + "dev": META_INFO(os.path.join("paws-x-zh", "paws-x-zh", "dev.tsv"), "dc163453e728cf118e17b4065d6602c8"), + "test": META_INFO(os.path.join("paws-x-zh", "paws-x-zh", "test.tsv"), "5b7320760e70559591092cb01b6f5955"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + for line in f: + data = line.strip().split("\t") + if len(data) == 3: + sentence1, sentence2, label = data + yield {"sentence1": sentence1, "sentence2": sentence2, "label": label} + elif len(data) == 2: + sentence1, sentence2 = data + yield {"sentence1": sentence1, "sentence2": sentence2, "label": ""} + else: + continue + + def get_labels(self): + """ + Return labels of the PAWS-X object. + """ + return ["0", "1"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/peoples_daily_ner.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/peoples_daily_ner.py new file mode 100644 index 000000000..5a99f543e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/peoples_daily_ner.py @@ -0,0 +1,68 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["PeoplesDailyNER"] + + +class PeoplesDailyNER(DatasetBuilder): + """ + Chinese Named Entity Recognition dataset published by People's Daily. + The dataset is in the BIO scheme with tags: LOC, ORG and PER. 
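To make the BIO scheme concrete, here is a small illustrative helper (not part of the dataset code itself) that turns the tag sequence of one yielded record into entity spans:

    def bio_to_spans(labels):
        # Collect (entity_type, start, end) spans from a BIO tag sequence.
        spans, start, ent_type = [], None, None
        for i, tag in enumerate(labels):
            if tag.startswith("B-"):
                if start is not None:
                    spans.append((ent_type, start, i))
                start, ent_type = i, tag[2:]
            elif tag.startswith("I-") and start is not None and tag[2:] == ent_type:
                continue
            else:  # "O", or an I- tag that does not continue the open entity
                if start is not None:
                    spans.append((ent_type, start, i))
                start, ent_type = None, None
        if start is not None:
            spans.append((ent_type, start, len(labels)))
        return spans

    # bio_to_spans(["B-ORG", "I-ORG", "I-ORG", "I-ORG"]) -> [("ORG", 0, 4)]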
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/peoples_daily_ner.tar.gz" + MD5 = "a44ff9c4b37b48add9ddc17994d5620c" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("peoples_daily_ner", "train.tsv"), "67d3c93a37daba60ef43c03271f119d7"), + "dev": META_INFO(os.path.join("peoples_daily_ner", "dev.tsv"), "ec772f3ba914bca5269f6e785bb3375d"), + "test": META_INFO(os.path.join("peoples_daily_ner", "test.tsv"), "2f27ae68b5f61d6553ffa28bb577c8a7"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as f: + next(f) + for line in f: + line_stripped = line.strip().split("\t") + if not line_stripped: + break + if len(line_stripped) == 2: + tokens = line_stripped[0].split("\002") + tags = line_stripped[1].split("\002") + else: + tokens = line_stripped.split("\002") + tags = [] + yield {"tokens": tokens, "labels": tags} + + def get_labels(self): + + return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/poetry.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/poetry.py new file mode 100644 index 000000000..b323eb27f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/poetry.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["Poetry"] + + +class Poetry(DatasetBuilder): + URL = "https://bj.bcebos.com/paddlenlp/datasets/poetry.tar.gz" + MD5 = "8edd7eda1b273145b70ef29c82cd622b" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("poetry", "train.tsv"), "176c6202b5e71656ae7e7848eec4c54f"), + "dev": META_INFO(os.path.join("poetry", "dev.tsv"), "737e4b6da5facdc0ac33fe688df19931"), + "test": META_INFO(os.path.join("poetry", "test.tsv"), "1dca907b2d712730c7c828f8acee7431"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line_stripped = line.strip().split("\t") + if not line_stripped: + break + if len(line_stripped) == 2: + tokens = line_stripped[0] + labels = line_stripped[1] + else: + tokens = line_stripped + labels = [] + yield {"tokens": tokens, "labels": labels} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/ptb.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/ptb.py new file mode 100644 index 000000000..ba953976d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/ptb.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["PTB"] + + +class PTB(DatasetBuilder): + """ + This is the Penn Treebank Project: Release 2 CDROM, featuring a million + words of 1989 Wall Street Journal material. 
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/rnnlm/simple-examples.tgz" + MD5 = "30177ea32e27c525793142b6bf2c8e2d" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO( + os.path.join("simple-examples", "data", "ptb.train.txt"), "f26c4b92c5fdc7b3f8c7cdcb991d8420" + ), + "valid": META_INFO( + os.path.join("simple-examples", "data", "ptb.valid.txt"), "aa0affc06ff7c36e977d7cd49e3839bf" + ), + "test": META_INFO(os.path.join("simple-examples", "data", "ptb.test.txt"), "8b80168b89c18661a38ef683c0dc3721"), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line_stripped = line.strip() + yield {"sentence": line_stripped} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/seabsa16.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/seabsa16.py new file mode 100644 index 000000000..21001793b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/seabsa16.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["SeAbsa16"] + + +class SeAbsa16(DatasetBuilder): + """ + SE-ABSA16_PHNS dataset for Aspect-level Sentiment Classification task. + More information please refer to + https://aistudio.baidu.com/aistudio/competition/detail/50/?isFromLuge=1. + + """ + + BUILDER_CONFIGS = { + # phns is short for phones. + "phns": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/SE-ABSA16_PHNS.zip", + "md5": "f5a62548f2fcf73892cacf2cdf159671", + "splits": { + "train": [ + os.path.join("SE-ABSA16_PHNS", "train.tsv"), + "cb4f65aaee59fa76526a0c79b7c12689", + (0, 1, 2), + 1, + ], + "test": [os.path.join("SE-ABSA16_PHNS", "test.tsv"), "7ad80f284e0eccc059ece3ce3d3a173f", (1, 2), 1], + }, + "labels": ["0", "1"], + }, + # came is short for cameras. 
+ "came": { + "url": "https://bj.bcebos.com/paddlenlp/datasets/SE-ABSA16_CAME.zip", + "md5": "3104e92217bbff80a1ed834230f1df51", + "splits": { + "train": [ + os.path.join("SE-ABSA16_CAME", "train.tsv"), + "8c661c0e83bb34b66c6fbf039c7fae80", + (0, 1, 2), + 1, + ], + "test": [os.path.join("SE-ABSA16_CAME", "test.tsv"), "8b80f77960be55adca1184d7a20501df", (1, 2), 1], + }, + "labels": ["0", "1"], + }, + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + builder_config = self.BUILDER_CONFIGS[self.name] + default_root = os.path.join(DATA_HOME, f"SE-ABSA16_{self.name.upper()}") + filename, data_hash, _, _ = builder_config["splits"][mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + url = builder_config["url"] + md5 = builder_config["md5"] + get_path_from_url(url, DATA_HOME, md5) + + return fullname + + def _read(self, filename, split): + """Reads data""" + _, _, field_indices, num_discard_samples = self.BUILDER_CONFIGS[self.name]["splits"][split] + with open(filename, "r", encoding="utf-8") as f: + for idx, line in enumerate(f): + if idx < num_discard_samples: + continue + line_stripped = line.strip().split("\t") + if not line_stripped: + continue + example = [line_stripped[indice] for indice in field_indices] + if split == "test": + yield {"text": example[0], "text_pair": example[1]} + else: + yield {"text": example[1], "text_pair": example[2], "label": example[0]} + + def get_labels(self): + """ + Return labels of the SE_ABSA16. + """ + return self.BUILDER_CONFIGS[self.name]["labels"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/sighan-cn.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/sighan-cn.py new file mode 100644 index 000000000..2fca3743a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/sighan-cn.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["SIGHAN_CN"] + + +class SIGHAN_CN(DatasetBuilder): + URL = "https://bj.bcebos.com/paddlenlp/datasets/sighan-cn.zip" + MD5 = "cd67b9b36a5908f848cbf04b5d83c005" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("sighan-cn", "train.txt"), "5eb7b7847722f3bf69bf978d1a5f99cc"), + "dev": META_INFO(os.path.join("sighan-cn", "dev.txt"), "bc34d119aeb7ca022aa66e2f448ded95"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + """Reads data.""" + with open(filename, "r", encoding="utf8") as fr: + for line in fr: + source, target = line.strip("\n").split("\t")[0:2] + yield {"source": source, "target": target} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/squad.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/squad.py new file mode 100644 index 000000000..b5d603d41 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/squad.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["SQuAD"] + + +class SQuAD(DatasetBuilder): + """ + Stanford Question Answering Dataset (SQuAD) is a reading comprehension + dataset, consisting of questions posed by crowdworkers on a set of Wikipedia + articles, where the answer to every question is a segment of text, or span, + from the corresponding reading passage, or the question might be unanswerable. 
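Because answers are stored as raw texts plus character offsets, a short sketch of recovering an answer span from one yielded record may help; the record layout follows the `_read()` method below, and the helper itself is illustrative only:

    def first_answer_span(example):
        # Return (start, end) character offsets of the first answer, or None if unanswerable.
        if example["is_impossible"] or not example["answers"]:
            return None
        start = example["answer_starts"][0]
        end = start + len(example["answers"][0])
        # example["context"][start:end] should give back example["answers"][0]
        return start, end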
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train_v1": META_INFO( + os.path.join("train-v1.1.json"), + "981b29407e0affa3b1b156f72073b945", + "https://bj.bcebos.com/paddlenlp/datasets/squad/train-v1.1.json", + ), + "dev_v1": META_INFO( + os.path.join("dev-v1.1.json"), + "3e85deb501d4e538b6bc56f786231552", + "https://bj.bcebos.com/paddlenlp/datasets/squad/dev-v1.1.json", + ), + "train_v2": META_INFO( + os.path.join("train-v2.0.json"), + "62108c273c268d70893182d5cf8df740", + "https://bj.bcebos.com/paddlenlp/datasets/squad/train-v2.0.json", + ), + "dev_v2": META_INFO( + os.path.join("dev-v2.0.json"), + "246adae8b7002f8679c027697b0b7cf8", + "https://bj.bcebos.com/paddlenlp/datasets/squad/dev-v2.0.json", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"].strip() + for qa in paragraph["qas"]: + qas_id = qa["id"] + question = qa["question"].strip() + answer_starts = [] + answers = [] + is_impossible = False + + if "is_impossible" in qa.keys(): + is_impossible = qa["is_impossible"] + + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"].strip() for answer in qa.get("answers", [])] + + yield { + "id": qas_id, + "title": title, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + "is_impossible": is_impossible, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/thucnews.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/thucnews.py new file mode 100644 index 000000000..7b8a92ebc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/thucnews.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + + +class THUCNews(DatasetBuilder): + """ + A subset of THUCNews dataset. THUCNews is a text classification dataset. 
+ See descrition about this subset version at https://github.com/gaussic/text-classification-cnn-rnn#%E6%95%B0%E6%8D%AE%E9%9B%86 + The whole dataset can be downloaded at https://thunlp.oss-cn-qingdao.aliyuncs.com/THUCNews.zip + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/thucnews.zip" + MD5 = "97626b2268f902662a29aadf222f22cc" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + LABEL_PATH = os.path.join("thucnews", "label.txt") + SPLITS = { + "train": META_INFO(os.path.join("thucnews", "train.txt"), "beda43dfb4f7bd9bd3d465edb35fbb7f"), + "dev": META_INFO(os.path.join("thucnews", "val.txt"), "1abe8fe2c75dde701407a9161dcd223a"), + "test": META_INFO(os.path.join("thucnews", "test.txt"), "201f558b7d0b3419ddebcd695f3070f0"), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + with open(filename, "r", encoding="utf8") as f: + examples = f.readlines() + for example in examples: + split_idx = example.find("\t") + label = example[:split_idx] + text = example[split_idx + 1 :].strip() + yield {"text": text, "label": label} + + def get_labels(self): + labels = [] + filename = os.path.join(DATA_HOME, self.__class__.__name__, self.LABEL_PATH) + with open(filename, "r", encoding="utf8") as f: + while True: + label = f.readline().strip() + if label == "": + break + labels.append(label) + return labels diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/triviaqa.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/triviaqa.py new file mode 100644 index 000000000..b966362ea --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/triviaqa.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import json +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["TriviaQA"] + + +class TriviaQA(DatasetBuilder): + """ + TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence + triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts and + independently gathered evidence documents, six per question on average, that provide high + quality distant supervision for answering the questions. The details can be found ACL + 17 paper: https://arxiv.org/abs/1705.03551. 
+ """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "md5", "URL")) + SPLITS = { + "train": META_INFO( + os.path.join("wikipedia-train.json"), + "e4b3c74e781472d92e68da9c4b7418fe", + "https://bj.bcebos.com/paddlenlp/datasets/triviaqa/wikipedia-train.zip", + ), + "dev": META_INFO( + os.path.join("wikipedia-dev.json"), + "20d23a2f668a46fe5c590d126f4d2b95", + "https://bj.bcebos.com/paddlenlp/datasets/triviaqa/wikipedia-dev.zip", + ), + } + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, URL = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(URL, default_root) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf8") as f: + input_data = json.load(f)["data"] + for entry in input_data: + title = entry.get("title", "").strip() + for paragraph in entry["paragraphs"]: + context = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["qid"] + question = qa["question"] + answer_starts = [answer["answer_start"] for answer in qa.get("answers", [])] + answers = [answer["text"] for answer in qa.get("answers", [])] + if len(answers) == 1: + yield { + "id": qas_id, + "title": title, + "context": context, + "question": question, + "answers": answers, + "answer_starts": answer_starts, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wmt14ende.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wmt14ende.py new file mode 100644 index 000000000..b5455e448 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wmt14ende.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["WMT14ende"] + + +class WMT14ende(DatasetBuilder): + """ + This dataset is a translation dataset for machine translation task. More + specifically, this dataset is a WMT14 English to German translation dataset + which uses commoncrawl, europarl and news-commentary as train dataset and + uses newstest2014 as test dataset. 
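The `vocab_info` dict returned by `get_vocab()` further down is shaped for `paddlenlp.data.Vocab`; a minimal sketch of turning its BPE entry into a `Vocab` object, assuming the usual `Vocab.load_vocabulary` signature (everything else is illustrative):

    from paddlenlp.data import Vocab

    def load_bpe_vocab(vocab_info):
        # vocab_info: the dict returned by WMT14ende.get_vocab() below.
        bpe_info = vocab_info["bpe"]
        return Vocab.load_vocabulary(
            bpe_info["filepath"],
            unk_token=bpe_info["unk_token"],
            bos_token=bpe_info["bos_token"],
            eos_token=bpe_info["eos_token"],
        )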
+ """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/WMT14.en-de.tar.gz" + META_INFO = collections.namedtuple("META_INFO", ("src_file", "tgt_file", "src_md5", "tgt_md5")) + SPLITS = { + "train": META_INFO( + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "train.tok.clean.bpe.33708.en"), + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "train.tok.clean.bpe.33708.de"), + "c7c0b77e672fc69f20be182ae37ff62c", + "1865ece46948fda1209d3b7794770a0a", + ), + "dev": META_INFO( + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "newstest2013.tok.bpe.33708.en"), + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "newstest2013.tok.bpe.33708.de"), + "aa4228a4bedb6c45d67525fbfbcee75e", + "9b1eeaff43a6d5e78a381a9b03170501", + ), + "test": META_INFO( + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "newstest2014.tok.bpe.33708.en"), + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "newstest2014.tok.bpe.33708.de"), + "c9403eacf623c6e2d9e5a1155bdff0b5", + "0058855b55e37c4acfcb8cffecba1050", + ), + "dev-eval": META_INFO( + os.path.join("WMT14.en-de", "wmt14_ende_data", "newstest2013.tok.en"), + os.path.join("WMT14.en-de", "wmt14_ende_data", "newstest2013.tok.de"), + "d74712eb35578aec022265c439831b0e", + "6ff76ced35b70e63a61ecec77a1c418f", + ), + "test-eval": META_INFO( + os.path.join("WMT14.en-de", "wmt14_ende_data", "newstest2014.tok.en"), + os.path.join("WMT14.en-de", "wmt14_ende_data", "newstest2014.tok.de"), + "8cce2028e4ca3d4cc039dfd33adbfb43", + "a1b1f4c47f487253e1ac88947b68b3b8", + ), + } + VOCAB_INFO = [ + ( + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "vocab_all.bpe.33708"), + "2fc775b7df37368e936a8e1f63846bb0", + ), + ( + os.path.join("WMT14.en-de", "wmt14_ende_data_bpe", "vocab_all.bpe.33712"), + "de485e3c2e17e23acf4b4b70b54682dd", + ), + ] + UNK_TOKEN = "" + BOS_TOKEN = "" + EOS_TOKEN = "" + + MD5 = "a2b8410709ff760a3b40b84bd62dfbd8" + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + src_filename, tgt_filename, src_data_hash, tgt_data_hash = self.SPLITS[mode] + src_fullname = os.path.join(default_root, src_filename) + tgt_fullname = os.path.join(default_root, tgt_filename) + + (bpe_vocab_filename, bpe_vocab_hash), (sub_vocab_filename, sub_vocab_hash) = self.VOCAB_INFO + bpe_vocab_fullname = os.path.join(default_root, bpe_vocab_filename) + sub_vocab_fullname = os.path.join(default_root, sub_vocab_filename) + + if ( + (not os.path.exists(src_fullname) or (src_data_hash and not md5file(src_fullname) == src_data_hash)) + or (not os.path.exists(tgt_fullname) or (tgt_data_hash and not md5file(tgt_fullname) == tgt_data_hash)) + or ( + not os.path.exists(bpe_vocab_fullname) + or (bpe_vocab_hash and not md5file(bpe_vocab_fullname) == bpe_vocab_hash) + ) + or ( + not os.path.exists(sub_vocab_fullname) + or (sub_vocab_hash and not md5file(sub_vocab_fullname) == sub_vocab_hash) + ) + ): + get_path_from_url(self.URL, default_root, self.MD5) + + return src_fullname, tgt_fullname + + def _read(self, filename, *args): + src_filename, tgt_filename = filename + with open(src_filename, "r", encoding="utf-8") as src_f: + with open(tgt_filename, "r", encoding="utf-8") as tgt_f: + for src_line, tgt_line in zip(src_f, tgt_f): + src_line = src_line.strip() + tgt_line = tgt_line.strip() + if not src_line and not tgt_line: + continue + yield {"source": src_line, "target": tgt_line} + + def get_vocab(self): + bpe_vocab_fullname = os.path.join(DATA_HOME, self.__class__.__name__, self.VOCAB_INFO[0][0]) + sub_vocab_fullname = 
os.path.join(DATA_HOME, self.__class__.__name__, self.VOCAB_INFO[1][0]) + vocab_info = { + "bpe": { + "filepath": bpe_vocab_fullname, + "unk_token": self.UNK_TOKEN, + "bos_token": self.BOS_TOKEN, + "eos_token": self.EOS_TOKEN, + }, + "benchmark": { + "filepath": sub_vocab_fullname, + "unk_token": self.UNK_TOKEN, + "bos_token": self.BOS_TOKEN, + "eos_token": self.EOS_TOKEN, + }, + } + return vocab_info diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wos.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wos.py new file mode 100644 index 000000000..e0cc437ac --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/wos.py @@ -0,0 +1,221 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright (c) 2017 Kamran Kowsari +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this dataset and associated documentation files (the "Dataset"), to deal +# in the dataset without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Dataset, and to permit persons to whom the dataset is +# furnished to do so, subject to the following conditions: + +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from paddlenlp.datasets import DatasetBuilder +from paddlenlp.utils.env import DATA_HOME + +__all__ = ["WOS"] + + +class WOS(DatasetBuilder): + """ + Web of Science(WOS) dataset contains abstracts of published papers from Web of Science. + More information please refer to 'https://data.mendeley.com/datasets/9rw3vkcfy4/2'. 
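The label set returned by `get_labels()` below mixes top-level categories ("CS", "Medical", ...) with second-level names joined by "##"; a tiny illustrative helper for splitting them back into a hierarchy:

    def split_hierarchical_label(label):
        # "ECE##Microcontroller" -> ["ECE", "Microcontroller"]; top-level labels stay as one element.
        return label.split("##")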
+ """ + + lazy = False + URL = "https://bj.bcebos.com/paddlenlp/datasets/wos.tar.gz" + MD5 = "15c8631ed6a474f471f480c31a6bbcda" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("wos", "train.tsv"), "e0153a1ef502235edf2bb138afcfef99"), + "dev": META_INFO(os.path.join("wos", "dev.tsv"), "fcfc283349b353c3e1123fdd20429de9 "), + "test": META_INFO(os.path.join("wos", "test.tsv"), "6fe2068aada7f17220d521dd11c73aee"), + } + + def _get_data(self, mode, **kwargs): + """Check and download Dataset""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line_stripped = line.split("\t") + + example = {"sentence": line_stripped[0].strip()} + for i in range(len(line_stripped) - 1): + example["level {}".format(i + 1)] = line_stripped[i + 1].strip().split(",") + + yield example + + def get_labels(self): + """ + Return labels of the WOS. + """ + return [ + "CS", + "ECE", + "Psychology", + "MAE", + "Civil", + "Medical", + "biochemistry", + "CS##Computer vision", + "CS##Machine learning", + "CS##network security", + "CS##Cryptography", + "CS##Operating systems", + "CS##Computer graphics", + "CS##Image processing", + "CS##Parallel computing", + "CS##Relational databases", + "CS##Software engineering", + "CS##Distributed computing", + "CS##Structured Storage", + "CS##Symbolic computation", + "CS##Algorithm design", + "CS##Computer programming", + "CS##Data structures", + "CS##Bioinformatics", + "ECE##Electricity", + "ECE##Lorentz force law", + "ECE##Electrical circuits", + "ECE##Voltage law", + "ECE##Digital control", + "ECE##System identification", + "ECE##Electrical network", + "ECE##Microcontroller", + "ECE##Electrical generator/Analog signal processing", + "ECE##Electric motor", + "ECE##Satellite radio", + "ECE##Control engineering", + "ECE##Signal-flow graph", + "ECE##State space representation", + "ECE##PID controller", + "ECE##Operational amplifier", + "Psychology##Prejudice", + "Psychology##Social cognition", + "Psychology##Person perception", + "Psychology##Nonverbal communication", + "Psychology##Prosocial behavior", + "Psychology##Leadership", + "Psychology##Eating disorders", + "Psychology##Depression", + "Psychology##Borderline personality disorder", + "Psychology##Seasonal affective disorder", + "Medical##Schizophrenia", + "Psychology##Antisocial personality disorder", + "Psychology##Media violence", + "Psychology##Prenatal development", + "Psychology##Child abuse", + "Psychology##Gender roles", + "Psychology##False memories", + "Psychology##Attention", + "Psychology##Problem-solving", + "MAE##computer-aided design", + "MAE##Hydraulics", + "MAE##Manufacturing engineering", + "MAE##Machine design", + "MAE##Fluid mechanics", + "MAE##Internal combustion engine", + "MAE##Thermodynamics", + "MAE##Materials Engineering", + "MAE##Strength of materials", + "Civil##Ambient Intelligence", + "Civil##Geotextile", + "Civil##Remote Sensing", + "Civil##Rainwater Harvesting", + "Civil##Water Pollution", + "Civil##Suspension Bridge", + "Civil##Stealth Technology", + "Civil##Green Building", + "Civil##Solar Energy", + "Civil##Construction Management", + "Civil##Smart Material", + 
"Medical##Addiction", + "Medical##Allergies", + "Medical##Alzheimer's Disease", + "Medical##Ankylosing Spondylitis", + "Medical##Anxiety", + "Medical##Asthma", + "Medical##Atopic Dermatitis", + "Medical##Atrial Fibrillation", + "Medical##Autism", + "Medical##Skin Care", + "Medical##Bipolar Disorder", + "Medical##Birth Control", + "Medical##Children's Health", + "Medical##Crohn's Disease", + "Medical##Dementia", + "Medical##Diabetes", + "Medical##Weight Loss", + "Medical##Digestive Health", + "Medical##Emergency Contraception", + "Medical##Mental Health", + "Medical##Fungal Infection", + "Medical##Headache", + "Medical##Healthy Sleep", + "Medical##Heart Disease", + "Medical##Hepatitis C", + "Medical##Hereditary Angioedema", + "Medical##HIV/AIDS", + "Medical##Hypothyroidism", + "Medical##Idiopathic Pulmonary Fibrosis", + "Medical##Irritable Bowel Syndrome", + "Medical##Kidney Health", + "Medical##Low Testosterone", + "Medical##Lymphoma", + "Medical##Medicare", + "Medical##Menopause", + "Medical##Migraine", + "Medical##Multiple Sclerosis", + "Medical##Myelofibrosis", + "Medical##Cancer", + "Medical##Osteoarthritis", + "Medical##Osteoporosis", + "Medical##Overactive Bladder", + "Medical##Parenting", + "Medical##Parkinson's Disease", + "Medical##Polycythemia Vera", + "Medical##Psoriasis", + "Medical##Psoriatic Arthritis", + "Medical##Rheumatoid Arthritis", + "Medical##Senior Health", + "Medical##Smoking Cessation", + "Medical##Sports Injuries", + "Medical##Sprains and Strains", + "Medical##Stress Management", + "biochemistry##Molecular biology", + "biochemistry##Cell biology", + "biochemistry##Human Metabolism", + "biochemistry##Immunology", + "biochemistry##Genetics", + "biochemistry##Enzymology", + "biochemistry##Polymerase chain reaction", + "biochemistry##Northern blotting", + "biochemistry##Southern blotting", + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli.py new file mode 100644 index 000000000..2f1fed622 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import csv +import os +import shutil +from contextlib import ExitStack + +from paddle.dataset.common import md5file +from paddle.utils.download import _decompress, _get_unique_endpoints, get_path_from_url + +try: + from paddle.distributed import ParallelEnv +except Exception: + import warnings + + warnings.warn("paddle.distributed is not contains in you paddle!") + +from ..utils.env import DATA_HOME +from ..utils.log import logger +from .dataset import DatasetBuilder + +__all__ = ["XNLI"] +ALL_LANGUAGES = ["ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr", "ur", "vi", "zh"] + + +class XNLI(DatasetBuilder): + """ + XNLI is a subset of a few thousand examples from MNLI which has been translated into + a 14 different languages (some low-ish resource). As with MNLI, the goal is to predict + textual entailment (does sentence A imply/contradict/neither sentence B) and is a + classification task (given two sentences, predict one of three labels). + + For more information, please visit https://github.com/facebookresearch/XNLI + """ + + META_INFO = collections.namedtuple("META_INFO", ("file", "data_md5", "url", "zipfile_md5")) + SPLITS = { + "train": META_INFO( + os.path.join("XNLI-MT-1.0", "XNLI-MT-1.0", "multinli"), + "", + "https://bj.bcebos.com/paddlenlp/datasets/XNLI-MT-1.0.zip", + "fa3d8d6c3d1866cedc45680ba93c296e", + ), + "dev": META_INFO( + os.path.join("XNLI-1.0", "XNLI-1.0", "xnli.dev.tsv"), + "4c23601abba3e3e222e19d1c6851649e", + "https://bj.bcebos.com/paddlenlp/datasets/XNLI-1.0.zip", + "53393158739ec671c34f205efc7d1666", + ), + "test": META_INFO( + os.path.join("XNLI-1.0", "XNLI-1.0", "xnli.test.tsv"), + "fbc26e90f7e892e24dde978a2bd8ece6", + "https://bj.bcebos.com/paddlenlp/datasets/XNLI-1.0.zip", + "53393158739ec671c34f205efc7d1666", + ), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash, url, zipfile_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if mode == "train": + if not os.path.exists(fullname): + get_path_from_url(url, default_root, zipfile_hash) + unique_endpoints = _get_unique_endpoints(ParallelEnv().trainer_endpoints[:]) + if ParallelEnv().current_endpoint in unique_endpoints: + file_num = len(os.listdir(fullname)) + if file_num != len(ALL_LANGUAGES): + logger.warning( + "Number of train files is %d != %d, decompress again." % (file_num, len(ALL_LANGUAGES)) + ) + shutil.rmtree(fullname) + _decompress(os.path.join(default_root, os.path.basename(url))) + else: + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(url, default_root, zipfile_hash) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + language = self.name + if language is None: + language = "all_languages" + if language not in ALL_LANGUAGES + ["all_languages"]: + raise ValueError( + f"Name parameter should be specified. Can be one of {ALL_LANGUAGES + ['all_languages']}. 
" + ) + if language == "all_languages": + languages = ALL_LANGUAGES + else: + languages = [language] + if split == "train": + files = [os.path.join(filename, f"multinli.train.{lang}.tsv") for lang in languages] + if language == "all_languages": + with ExitStack() as stack: + files = [stack.enter_context(open(file, "r", encoding="utf-8")) for file in files] + readers = [csv.DictReader(file, delimiter="\t", quoting=csv.QUOTE_NONE) for file in files] + for row_idx, rows in enumerate(zip(*readers)): + if not rows[0]["label"]: + continue + data = { + "premise": {}, + "hypothesis": {}, + "label": rows[0]["label"].replace("contradictory", "contradiction"), + } + for lang, row in zip(languages, rows): + if not row["premise"] or not row["hypo"]: + continue + data["premise"][lang] = row["premise"] + data["hypothesis"][lang] = row["hypo"] + yield data + else: + for idx, file in enumerate(files): + with open(file, "r", encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for row_idx, row in enumerate(reader): + if not row["premise"] or not row["hypo"] or not row["label"]: + continue + yield { + "premise": row["premise"], + "hypothesis": row["hypo"], + "label": row["label"].replace("contradictory", "contradiction"), + } + else: + if language == "all_languages": + rows_per_pair_id = collections.defaultdict(list) + with open(filename, encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for row in reader: + rows_per_pair_id[row["pairID"]].append(row) + + for rows in rows_per_pair_id.values(): + if not rows[0]["gold_label"]: + continue + data = {"premise": {}, "hypothesis": {}, "label": rows[0]["gold_label"]} + for row in rows: + if not row["sentence1"] or not row["sentence2"]: + continue + data["premise"][row["language"]] = row["sentence1"] + data["hypothesis"][row["language"]] = row["sentence2"] + yield data + else: + with open(filename, encoding="utf-8") as f: + reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE) + for row in reader: + if row["language"] == language: + if not row["sentence1"] or not row["sentence2"] or not row["gold_label"]: + continue + yield { + "premise": row["sentence1"], + "hypothesis": row["sentence2"], + "label": row["gold_label"], + } + + def get_labels(self): + """ + Return labels of XNLI dataset. + """ + return ["entailment", "neutral", "contradiction"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli_cn.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli_cn.py new file mode 100644 index 000000000..a766e038e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/xnli_cn.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["XNLI_CN"] + + +class XNLI_CN(DatasetBuilder): + """ + XNLI dataset for chinese. + + XNLI is an evaluation corpus for language transfer and cross-lingual + sentence classification in 15 languages. Here, XNLI only contrains + chinese corpus. + + For more information, please visit https://github.com/facebookresearch/XNLI + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/xnli_cn.tar.gz" + MD5 = "aaf6de381a2553d61d8e6fad4ba96499" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO( + os.path.join("xnli_cn.tar", "xnli_cn", "train", "part-0"), "b0e4df29af8413eb935a2204de8958b7" + ), + "dev": META_INFO(os.path.join("xnli_cn.tar", "xnli_cn", "dev", "part-0"), "401a2178e15f4b0c35812ab4a322bd94"), + "test": META_INFO( + os.path.join("xnli_cn.tar", "xnli_cn", "test", "part-0"), "71b043be8207e54185e761fca00ba3d7" + ), + } + + def _get_data(self, mode, **kwargs): + """Downloads dataset.""" + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + if not os.path.exists(fullname) or (data_hash and not md5file(fullname) == data_hash): + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, split): + """Reads data.""" + with open(filename, "r", encoding="utf-8") as f: + head = None + for line in f: + data = line.strip().split("\t") + if not head: + head = data + else: + if split == "train": + text_a, text_b, label = data + yield {"text_a": text_a, "text_b": text_b, "label": label} + elif split == "dev": + text_a, text_b, label = data + yield {"text_a": text_a, "text_b": text_b, "label": label} + elif split == "test": + text_a, text_b, label = data + yield {"text_a": text_a, "text_b": text_b, "label": label} + + def get_labels(self): + """ + Return labels of XNLI dataset. + + Note: + Contradictory and contradiction are the same label + """ + return ["contradictory", "entailment", "neutral"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/yahoo_answer_100k.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/yahoo_answer_100k.py new file mode 100644 index 000000000..457170453 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/yahoo_answer_100k.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os + +from paddle.dataset.common import md5file +from paddle.utils.download import get_path_from_url + +from ..utils.env import DATA_HOME +from .dataset import DatasetBuilder + +__all__ = ["YahooAnswer100K"] + + +class YahooAnswer100K(DatasetBuilder): + """ + The data is from https://arxiv.org/pdf/1702.08139.pdf, which samples 100k + documents from original Yahoo Answer data, and vocabulary size is 200k. + """ + + URL = "https://bj.bcebos.com/paddlenlp/datasets/yahoo-answer-100k.tar.gz" + MD5 = "68b88fd3f2cc9918a78047d99bcc6532" + META_INFO = collections.namedtuple("META_INFO", ("file", "md5")) + SPLITS = { + "train": META_INFO(os.path.join("yahoo-answer-100k", "yahoo.train.txt"), "3fb31bad56bae7c65fa084f702398c3b"), + "valid": META_INFO(os.path.join("yahoo-answer-100k", "yahoo.valid.txt"), "2680dd89b4fe882359846b5accfb7647"), + "test": META_INFO(os.path.join("yahoo-answer-100k", "yahoo.test.txt"), "3e6dcb643282e3543303980f1e21bb9d"), + } + VOCAB_INFO = (os.path.join("yahoo-answer-100k", "vocab.txt"), "2c17c7120e6240d34d19490404b5133d") + UNK_TOKEN = "_UNK" + + def _get_data(self, mode, **kwargs): + default_root = os.path.join(DATA_HOME, self.__class__.__name__) + filename, data_hash = self.SPLITS[mode] + fullname = os.path.join(default_root, filename) + vocab_filename, vocab_hash = self.VOCAB_INFO + vocab_fullname = os.path.join(default_root, vocab_filename) + + if ( + (not os.path.exists(fullname)) + or (data_hash and not md5file(fullname) == data_hash) + or (not os.path.exists(vocab_fullname) or (vocab_hash and not md5file(vocab_fullname) == vocab_hash)) + ): + + get_path_from_url(self.URL, default_root, self.MD5) + + return fullname + + def _read(self, filename, *args): + with open(filename, "r", encoding="utf-8") as f: + for line in f: + line_stripped = line.strip() + yield {"sentence": line_stripped} + + def get_vocab(self): + vocab_fullname = os.path.join(DATA_HOME, self.__class__.__name__, self.VOCAB_INFO[0]) + + # Construct vocab_info to match the form of the input of `Vocab.load_vocabulary()` function + vocab_info = {"filepath": vocab_fullname, "unk_token": self.UNK_TOKEN} + return vocab_info diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/zero_padding_dataset.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/zero_padding_dataset.py new file mode 100644 index 000000000..870394aac --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/datasets/zero_padding_dataset.py @@ -0,0 +1,266 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle.io import Dataset, IterableDataset +from scipy.linalg import block_diag + + +def generate_greedy_packs(examples, max_length): + left_len = np.zeros([len(examples)]) - 1 + left_len[0] = max_length # At the beginning, only the first pack is valid. 
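Stepping back briefly: the `_get_data` methods of the dataset builders above (XNLI_CN, YahooAnswer100K, and the XNLI reader before them) all follow the same download-and-verify convention: resolve the split file under `DATA_HOME`, compare its md5 with the recorded hash, and re-download the archive only if the file is missing or stale. A minimal standalone sketch of that pattern follows; the function name, paths, URL and hashes are placeholders, not values from this patch.

```python
import os

from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url


def ensure_split(default_root, rel_path, file_md5, archive_url, archive_md5):
    """Return the local path of a dataset split, downloading the archive only when needed."""
    fullname = os.path.join(default_root, rel_path)
    # Re-download if the file is absent or its checksum no longer matches.
    if not os.path.exists(fullname) or (file_md5 and md5file(fullname) != file_md5):
        # get_path_from_url downloads to default_root, verifies archive_md5 and unpacks.
        get_path_from_url(archive_url, default_root, archive_md5)
    return fullname


# Placeholder call for illustration only:
# ensure_split("/tmp/data/XNLI_CN", "xnli_cn/train/part-0", None,
#              "https://example.com/xnli_cn.tar.gz", None)
```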
+ generate_packs = [[] for i in range(len(examples))] + index, left_index = 0, 0 + + while index < len(examples): + record = examples[index] + max_left_index = left_len.argmax() + # Put the current sequence into the largest left space valid pack. + if len(record["input_ids"]) <= left_len[max_left_index]: + generate_packs[max_left_index].append(record) + left_len[max_left_index] -= len(record["input_ids"]) + index += 1 + else: + left_index += 1 + left_len[left_index] = max_length + + return generate_packs + + +class ZeroPadding: + required_output_keys = ["input_ids", "labels", "attention_mask"] + # Only supported the following keys for ZeroPadding. Keys outside of the set will be ignored. + supported_input_keys = [ + "input_ids", + "labels", + "attention_mask", + "position_ids", + "chosen_labels", + "rejected_labels", + "response_indexs", + "attn_mask_startend_row_indices", + ] + + @classmethod + def _pad_batch_records_to_max_length(cls, batch_records, max_length, pad_token=0): + # confirm the at least one item in the pack + if len(batch_records) == 0: + return batch_records + # count all records total length + total_length = sum([len(record["input_ids"]) for record in batch_records]) + reserved_length = max_length - total_length + + # append padding to the max_length + if "attn_mask_startend_row_indices" in batch_records[0]: + # attn_mask_startend_row_indices is a list of row indices `0`, + # which indicates that all tokens are masked. + batch_records.append( + { + "input_ids": [pad_token] * reserved_length, + "labels": [-100] * reserved_length, + "attn_mask_startend_row_indices": [0] * reserved_length, + } + ) + elif "attention_mask" in batch_records[0]: + # attention_mask is a fullly masked attention matrix (all False) + # which indicates that all tokens are masked. 
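To make the greedy packing above concrete, here is a small toy run. The import path is an assumption based on where this module sits in the upstream paddlenlp package and may differ for this vendored copy.

```python
# Import path is assumed; adjust it to wherever this module is packaged.
from paddlenlp.datasets.zero_padding_dataset import generate_greedy_packs

# Toy records containing only the field the packer inspects: len(record["input_ids"]).
examples = [{"input_ids": [0] * n} for n in (6, 5, 4, 7)]

packs = generate_greedy_packs(examples, max_length=10)
print([[len(r["input_ids"]) for r in pack] for pack in packs if pack])
# -> [[6], [5, 4], [7]]
# The 6-token record opens pack 0 (4 slots left); 5 does not fit there, so pack 1
# is opened (5 left); 4 then goes into pack 1 because it has the most remaining
# space; 7 fits nowhere, so pack 2 is opened.
```

`ZeroPaddingMapDataset`, defined later in this file, only calls this helper when `greedy_zero_padding=True` (buffering 500 examples at a time); otherwise records are packed in their original order.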
+ batch_records.append( + { + "input_ids": [pad_token] * reserved_length, + "labels": [-100] * reserved_length, + "attention_mask": np.zeros((reserved_length, reserved_length), dtype=bool), + } + ) + + return batch_records + + @classmethod + def _pad_batch_records(cls, batch_records, max_length): + batch_records = cls._pad_batch_records_to_max_length(batch_records, max_length) + + # Only consider supported input keys + input_keys = [key for key in batch_records[0].keys() if key in cls.supported_input_keys] + if "attn_mask_startend_row_indices" not in input_keys and "attention_mask" not in input_keys: + input_keys.append("attention_mask") + batched_features = {key: [] for key in input_keys} + sequence_sum = 0 + for record in batch_records: + batched_features["input_ids"].extend(record["input_ids"]) + if "labels" in record: + batched_features["labels"].extend(record["labels"]) + elif "rejected_labels" in input_keys and "chosen_labels" in input_keys: + batched_features["rejected_labels"].extend(record["rejected_labels"]) + batched_features["chosen_labels"].extend(record["chosen_labels"]) + response_indexs = [ + record["response_indexs"][0] + sequence_sum, # chosen_response_start_index + record["response_indexs"][1] + sequence_sum, # rejeted_response_start_index + record["response_indexs"][2] + sequence_sum, # rejeted_response_end_index + 1 + ] + batched_features["response_indexs"].append(response_indexs) + else: + raise ValueError("labels is required for ZeroPadding Dataset") + + seq_length = len(record["input_ids"]) + # If attention_mask is not given, assume it's causal mask + if "attn_mask_startend_row_indices" in record: + attn_mask_startend_row_indices = [i + sequence_sum for i in record["attn_mask_startend_row_indices"]] + batched_features["attn_mask_startend_row_indices"].extend(attn_mask_startend_row_indices) + else: + attention_mask = record.get("attention_mask", np.tril(np.ones([seq_length, seq_length], dtype=bool))) + batched_features["attention_mask"].append(attention_mask) + # NOTE: position_ids is optional and not required by every model + # We append instead of extend here to accomodate 2D position ids + if "position_ids" in record: + batched_features["position_ids"].append(record["position_ids"]) + sequence_sum += seq_length + + if "attention_mask" in batched_features: + block_attention_mask = block_diag(*batched_features["attention_mask"]) + # convert to 3-D [batch_size(1), seq_length, seq_length] + batched_features["attention_mask"] = np.expand_dims(block_attention_mask, axis=0) + if "position_ids" in batched_features: + # Accomodate both 1D and 2D position ids + batched_features["position_ids"] = np.concatenate(batched_features["position_ids"], axis=-1).tolist() + return batched_features + + +class ZeroPaddingMapDataset(ZeroPadding, Dataset): + def __init__(self, data, tokenizer, max_length, greedy_zero_padding=False): + self.tokenizer = tokenizer + self.max_length = max_length + self.greedy_zero_padding = greedy_zero_padding + self.new_data = self._create_zero_padding_data(data) + + def _create_zero_padding_data(self, data): + total_data = [] + if not self.greedy_zero_padding: + batch_records = [] + cur_len_so_far = 0 + for i in range(len(data)): + record = data[i] + if len(record["input_ids"]) > self.max_length: + continue + to_append = (cur_len_so_far + len(record["input_ids"])) <= self.max_length + if to_append: + batch_records.append(record) + cur_len_so_far += len(record["input_ids"]) + else: + # exceed max length + padded_list = self._pad_batch_records(batch_records, 
self.max_length) + total_data.append(padded_list) + # reset + batch_records = [] + cur_len_so_far = 0 + # append current data + batch_records.append(record) + cur_len_so_far += len(record["input_ids"]) + + # remaining data + if batch_records: + padded_list = self._pad_batch_records(batch_records, self.max_length) + total_data.append(padded_list) + else: + examples = [] + buffer_size = 500 + i = 0 + for record in data: + if len(record["input_ids"]) > self.max_length: + continue + if i < buffer_size: + examples.append(record) + i += 1 + else: + # Running greedy strategy in examples. + generate_packs = generate_greedy_packs(examples, self.max_length) + for batch_records in generate_packs: + if len(batch_records) > 0: + padded_list = self._pad_batch_records(batch_records, self.max_length) + total_data.append(padded_list) + examples = [record] + i = 1 + if len(examples) > 0: + generate_packs = generate_greedy_packs(examples, self.max_length) + for batch_records in generate_packs: + if len(batch_records) > 0: + padded_list = self._pad_batch_records(batch_records, self.max_length) + total_data.append(padded_list) + + return total_data + + def __getitem__(self, idx): + return self.new_data[idx] + + def __len__(self): + return len(self.new_data) + + +class ZeroPaddingIterableDataset(ZeroPadding, IterableDataset): + def __init__(self, data, tokenizer, max_length, greedy_zero_padding=False): + self.data = data + self.tokenizer = tokenizer + self.max_length = max_length + self.zero_padding_global_step = 0 + self.greedy_zero_padding = greedy_zero_padding + + def __iter__(self): + if not self.greedy_zero_padding: + batch_records = [] + cur_len_so_far = 0 + for record in self.data: + to_append = (cur_len_so_far + len(record["input_ids"])) <= self.max_length + if to_append: + batch_records.append(record) + self.zero_padding_global_step += 1 + cur_len_so_far += len(record["input_ids"]) + else: + # exceed max length + padded_list = self._pad_batch_records(batch_records, self.max_length) + yield padded_list + # reset + batch_records = [] + cur_len_so_far = 0 + # append current data + batch_records.append(record) + self.zero_padding_global_step += 1 + cur_len_so_far += len(record["input_ids"]) + if batch_records: + padded_list = self._pad_batch_records(batch_records, self.max_length) + yield padded_list + else: + examples = [] + buffer_size = 500 + i = 0 + for record in self.data: + if len(record["input_ids"]) > self.max_length: + continue + if i < buffer_size: + examples.append(record) + self.zero_padding_global_step += 1 + i += 1 + else: + # Running greedy strategy in examples. + generate_packs = generate_greedy_packs(examples, self.max_length) + for batch_records in generate_packs: + if len(batch_records) > 0: + padded_list = self._pad_batch_records(batch_records, self.max_length) + yield padded_list + examples = [record] + self.zero_padding_global_step += 1 + i = 1 + if len(examples) > 0: + generate_packs = generate_greedy_packs(examples, self.max_length) + for batch_records in generate_packs: + if len(batch_records) > 0: + padded_list = self._pad_batch_records(batch_records, self.max_length) + yield padded_list diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/__init__.py new file mode 100644 index 000000000..d0daf27cf --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .token_embedding import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/constant.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/constant.py new file mode 100644 index 000000000..600078eb2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/constant.py @@ -0,0 +1,98 @@ +# Copyright (c) 2020 PaddlePaddle Authors and Chinese-Word-Vectors Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +URL_ROOT = "https://bj.bcebos.com/paddlenlp" +EMBEDDING_URL_ROOT = URL_ROOT + "/models/embeddings" + +PAD_TOKEN = "[PAD]" +UNK_TOKEN = "[UNK]" + +EMBEDDING_NAME_LIST = [ + # Word2Vec + # baidu_encyclopedia + "w2v.baidu_encyclopedia.target.word-word.dim300", + "w2v.baidu_encyclopedia.target.word-character.char1-1.dim300", + "w2v.baidu_encyclopedia.target.word-character.char1-2.dim300", + "w2v.baidu_encyclopedia.target.word-character.char1-4.dim300", + "w2v.baidu_encyclopedia.target.word-ngram.1-2.dim300", + "w2v.baidu_encyclopedia.target.word-ngram.1-3.dim300", + "w2v.baidu_encyclopedia.target.word-ngram.2-2.dim300", + "w2v.baidu_encyclopedia.target.word-wordLR.dim300", + "w2v.baidu_encyclopedia.target.word-wordPosition.dim300", + "w2v.baidu_encyclopedia.target.bigram-char.dim300", + "w2v.baidu_encyclopedia.context.word-word.dim300", + "w2v.baidu_encyclopedia.context.word-character.char1-1.dim300", + "w2v.baidu_encyclopedia.context.word-character.char1-2.dim300", + "w2v.baidu_encyclopedia.context.word-character.char1-4.dim300", + "w2v.baidu_encyclopedia.context.word-ngram.1-2.dim300", + "w2v.baidu_encyclopedia.context.word-ngram.1-3.dim300", + "w2v.baidu_encyclopedia.context.word-ngram.2-2.dim300", + "w2v.baidu_encyclopedia.context.word-wordLR.dim300", + "w2v.baidu_encyclopedia.context.word-wordPosition.dim300", + # wikipedia + "w2v.wiki.target.bigram-char.dim300", + "w2v.wiki.target.word-char.dim300", + "w2v.wiki.target.word-word.dim300", + "w2v.wiki.target.word-bigram.dim300", + # people_daily + "w2v.people_daily.target.bigram-char.dim300", + "w2v.people_daily.target.word-char.dim300", + "w2v.people_daily.target.word-word.dim300", + "w2v.people_daily.target.word-bigram.dim300", + # weibo + "w2v.weibo.target.bigram-char.dim300", + "w2v.weibo.target.word-char.dim300", + "w2v.weibo.target.word-word.dim300", + "w2v.weibo.target.word-bigram.dim300", + # sogou + "w2v.sogou.target.bigram-char.dim300", + 
"w2v.sogou.target.word-char.dim300", + "w2v.sogou.target.word-word.dim300", + "w2v.sogou.target.word-bigram.dim300", + # zhihu + "w2v.zhihu.target.bigram-char.dim300", + "w2v.zhihu.target.word-char.dim300", + "w2v.zhihu.target.word-word.dim300", + "w2v.zhihu.target.word-bigram.dim300", + # finacial + "w2v.financial.target.bigram-char.dim300", + "w2v.financial.target.word-char.dim300", + "w2v.financial.target.word-word.dim300", + "w2v.financial.target.word-bigram.dim300", + # literature + "w2v.literature.target.bigram-char.dim300", + "w2v.literature.target.word-char.dim300", + "w2v.literature.target.word-word.dim300", + "w2v.literature.target.word-bigram.dim300", + # siku + "w2v.sikuquanshu.target.word-word.dim300", + "w2v.sikuquanshu.target.word-bigram.dim300", + # Mix-large + "w2v.mixed-large.target.word-char.dim300", + "w2v.mixed-large.target.word-word.dim300", + # GOOGLE NEWS + "w2v.google_news.target.word-word.dim300.en", + # GloVe + "glove.wiki2014-gigaword.target.word-word.dim50.en", + "glove.wiki2014-gigaword.target.word-word.dim100.en", + "glove.wiki2014-gigaword.target.word-word.dim200.en", + "glove.wiki2014-gigaword.target.word-word.dim300.en", + "glove.twitter.target.word-word.dim25.en", + "glove.twitter.target.word-word.dim50.en", + "glove.twitter.target.word-word.dim100.en", + "glove.twitter.target.word-word.dim200.en", + # FastText + "fasttext.wiki-news.target.word-word.dim300.en", + "fasttext.crawl.target.word-word.dim300.en", +] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/token_embedding.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/token_embedding.py new file mode 100644 index 000000000..b915f668c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/embeddings/token_embedding.py @@ -0,0 +1,378 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os.path as osp + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.utils.download import get_path_from_url + +from paddlenlp.data import Vocab, get_idx_from_word +from paddlenlp.utils.env import MODEL_HOME, _get_sub_home +from paddlenlp.utils.log import logger + +from .constant import EMBEDDING_NAME_LIST, EMBEDDING_URL_ROOT, PAD_TOKEN, UNK_TOKEN + +EMBEDDING_HOME = _get_sub_home("embeddings", parent_home=MODEL_HOME) + +__all__ = ["list_embedding_name", "TokenEmbedding"] + + +def list_embedding_name(): + """ + Lists all names of pretrained embedding models paddlenlp provides. + """ + return list(EMBEDDING_NAME_LIST) + + +class TokenEmbedding(nn.Embedding): + """ + A `TokenEmbedding` can load pre-trained embedding model which paddlenlp provides by + specifying embedding name. Furthermore, a `TokenEmbedding` can load extended vocabulary + by specifying extended_vocab_path. + + Args: + embedding_name (`str`, optional): + The pre-trained embedding model name. 
Use `paddlenlp.embeddings.list_embedding_name()` to + list the names of all embedding models that we provide. + Defaults to `w2v.baidu_encyclopedia.target.word-word.dim300`. + unknown_token (`str`, optional): + Specifies unknown token. + Defaults to `[UNK]`. + unknown_token_vector (`list`, optional): + To initialize the vector of unknown token. If it's none, use normal distribution to + initialize the vector of unknown token. + Defaults to `None`. + extended_vocab_path (`str`, optional): + The file path of extended vocabulary. + Defaults to `None`. + trainable (`bool`, optional): + Whether the weight of embedding can be trained. + Defaults to True. + keep_extended_vocab_only (`bool`, optional): + Whether to keep the extended vocabulary only, will be effective only if provides extended_vocab_path. + Defaults to False. + """ + + def __init__( + self, + embedding_name=EMBEDDING_NAME_LIST[0], + unknown_token=UNK_TOKEN, + unknown_token_vector=None, + extended_vocab_path=None, + trainable=True, + keep_extended_vocab_only=False, + ): + vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz") + if not osp.exists(vector_path): + # download + url = EMBEDDING_URL_ROOT + "/" + embedding_name + ".tar.gz" + get_path_from_url(url, EMBEDDING_HOME) + + logger.info("Loading token embedding...") + vector_np = np.load(vector_path) + self.embedding_dim = vector_np["embedding"].shape[1] + self.unknown_token = unknown_token + if unknown_token_vector is not None: + unk_vector = np.array(unknown_token_vector).astype(paddle.get_default_dtype()) + else: + unk_vector = np.random.normal(scale=0.02, size=self.embedding_dim).astype(paddle.get_default_dtype()) + pad_vector = np.array([0] * self.embedding_dim).astype(paddle.get_default_dtype()) + if extended_vocab_path is not None: + embedding_table = self._extend_vocab( + extended_vocab_path, vector_np, pad_vector, unk_vector, keep_extended_vocab_only + ) + trainable = True + else: + embedding_table = self._init_without_extend_vocab(vector_np, pad_vector, unk_vector) + + self.vocab = Vocab.from_dict(self._word_to_idx, unk_token=unknown_token, pad_token=PAD_TOKEN) + self.num_embeddings = embedding_table.shape[0] + # import embedding + super(TokenEmbedding, self).__init__( + self.num_embeddings, self.embedding_dim, padding_idx=self._word_to_idx[PAD_TOKEN] + ) + self.weight.set_value(embedding_table) + self.set_trainable(trainable) + logger.info("Finish loading embedding vector.") + s = "Token Embedding info:\ + \nUnknown index: {}\ + \nUnknown token: {}\ + \nPadding index: {}\ + \nPadding token: {}\ + \nShape :{}".format( + self._word_to_idx[self.unknown_token], + self.unknown_token, + self._word_to_idx[PAD_TOKEN], + PAD_TOKEN, + self.weight.shape, + ) + logger.info(s) + + def _init_without_extend_vocab(self, vector_np, pad_vector, unk_vector): + """ + Constructs index to word list, word to index dict and embedding weight. 
+ """ + self._idx_to_word = list(vector_np["vocab"]) + self._idx_to_word.append(self.unknown_token) + self._idx_to_word.append(PAD_TOKEN) + self._word_to_idx = self._construct_word_to_idx(self._idx_to_word) + # insert unk, pad embedding + embedding_table = np.append(vector_np["embedding"], [unk_vector, pad_vector], axis=0) + + return embedding_table + + def _read_vocab_list_from_file(self, extended_vocab_path): + # load new vocab table from file + vocab_list = [] + with open(extended_vocab_path, "r", encoding="utf-8") as f: + for line in f.readlines(): + vocab = line.rstrip("\n").split("\t")[0] + vocab_list.append(vocab) + return vocab_list + + def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector, unk_vector, keep_extended_vocab_only): + """ + Constructs index to word list, word to index dict and embedding weight using + extended vocab. + """ + logger.info("Start extending vocab.") + extend_vocab_list = self._read_vocab_list_from_file(extended_vocab_path) + extend_vocab_set = set(extend_vocab_list) + # update idx_to_word + self._idx_to_word = extend_vocab_list + self._word_to_idx = self._construct_word_to_idx(self._idx_to_word) + + # use the Xavier init the embedding + xavier_scale = np.sqrt(6.0 / float(len(self._idx_to_word) + self.embedding_dim)) + embedding_table = np.random.uniform( + low=-1.0 * xavier_scale, high=xavier_scale, size=(len(self._idx_to_word), self.embedding_dim) + ).astype(paddle.get_default_dtype()) + + pretrained_idx_to_word = list(vector_np["vocab"]) + pretrained_word_to_idx = self._construct_word_to_idx(pretrained_idx_to_word) + pretrained_embedding_table = np.array(vector_np["embedding"]) + + pretrained_vocab_set = set(pretrained_idx_to_word) + extend_vocab_set = set(self._idx_to_word) + vocab_intersection = pretrained_vocab_set & extend_vocab_set + vocab_subtraction = pretrained_vocab_set - extend_vocab_set + + # assignment from pretrained_vocab_embedding to extend_vocab_embedding + pretrained_vocab_intersect_index = [pretrained_word_to_idx[word] for word in vocab_intersection] + pretrained_vocab_subtract_index = [pretrained_word_to_idx[word] for word in vocab_subtraction] + extend_vocab_intersect_index = [self._word_to_idx[word] for word in vocab_intersection] + embedding_table[extend_vocab_intersect_index] = pretrained_embedding_table[pretrained_vocab_intersect_index] + if not keep_extended_vocab_only: + for idx in pretrained_vocab_subtract_index: + word = pretrained_idx_to_word[idx] + self._idx_to_word.append(word) + self._word_to_idx[word] = len(self._idx_to_word) - 1 + + embedding_table = np.append( + embedding_table, pretrained_embedding_table[pretrained_vocab_subtract_index], axis=0 + ) + + if self.unknown_token not in extend_vocab_set: + self._idx_to_word.append(self.unknown_token) + self._word_to_idx[self.unknown_token] = len(self._idx_to_word) - 1 + embedding_table = np.append(embedding_table, [unk_vector], axis=0) + else: + unk_idx = self._word_to_idx[self.unknown_token] + embedding_table[unk_idx] = unk_vector + + if PAD_TOKEN not in extend_vocab_set: + self._idx_to_word.append(PAD_TOKEN) + self._word_to_idx[PAD_TOKEN] = len(self._idx_to_word) - 1 + embedding_table = np.append(embedding_table, [pad_vector], axis=0) + else: + embedding_table[self._word_to_idx[PAD_TOKEN]] = pad_vector + + logger.info("Finish extending vocab.") + return embedding_table + + def set_trainable(self, trainable): + """ + Whether or not to set the weights of token embedding to be trainable. 
+ + Args: + trainable (`bool`): + The weights can be trained if trainable is set to True, or the weights are fixed if trainable is False. + + """ + self.weight.stop_gradient = not trainable + + def search(self, words): + """ + Gets the vectors of specifying words. + + Args: + words (`list` or `str` or `int`): The words which need to be searched. + + Returns: + `numpy.array`: The vectors of specifying words. + + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + vector = embed.search('Welcome to use PaddlePaddle and PaddleNLP!') + + """ + idx_list = self.get_idx_list_from_words(words) + idx_tensor = paddle.to_tensor(idx_list) + return self(idx_tensor).cpu().numpy() + + def get_idx_from_word(self, word): + """ + Gets the index of specifying word by searching word_to_idx dict. + + Args: + word (`list` or `str` or `int`): The input token word which we want to get the token index converted from. + + Returns: + `int`: The index of specifying word. + + """ + return get_idx_from_word(word, self.vocab.token_to_idx, self.unknown_token) + + def get_idx_list_from_words(self, words): + """ + Gets the index list of specifying words by searching word_to_idx dict. + + Args: + words (`list` or `str` or `int`): The input token words which we want to get the token indices converted from. + + Returns: + `list`: The indexes list of specifying words. + + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + index = embed.get_idx_from_word('Welcome to use PaddlePaddle and PaddleNLP!') + #635963 + + """ + if isinstance(words, str): + idx_list = [self.get_idx_from_word(words)] + elif isinstance(words, int): + idx_list = [words] + elif isinstance(words, list) or isinstance(words, tuple): + idx_list = [self.get_idx_from_word(word) if isinstance(word, str) else word for word in words] + else: + raise TypeError + return idx_list + + def _dot_np(self, array_a, array_b): + return np.sum(array_a * array_b) + + def _calc_word(self, word_a, word_b, calc_kernel): + embeddings = self.search([word_a, word_b]) + embedding_a = embeddings[0] + embedding_b = embeddings[1] + return calc_kernel(embedding_a, embedding_b) + + def dot(self, word_a, word_b): + """ + Calculates the dot product of 2 words. Dot product or scalar product is an + algebraic operation that takes two equal-length sequences of numbers (usually + coordinate vectors), and returns a single number. + + Args: + word_a (`str`): The first word string. + word_b (`str`): The second word string. + + Returns: + float: The dot product of 2 words. + + Examples: + .. code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + dot_product = embed.dot('PaddlePaddle', 'PaddleNLP!') + #0.11827179 + + """ + dot = self._dot_np + return self._calc_word(word_a, word_b, lambda x, y: dot(x, y)) + + def cosine_sim(self, word_a, word_b): + """ + Calculates the cosine similarity of 2 word vectors. Cosine similarity is the + cosine of the angle between two n-dimensional vectors in an n-dimensional space. + + Args: + word_a (`str`): The first word string. + word_b (`str`): The second word string. + + Returns: + float: The cosine similarity of 2 words. + + Examples: + .. 
code-block:: + + from paddlenlp.embeddings import TokenEmbedding + + embed = TokenEmbedding() + cosine_simi = embed.cosine_sim('PaddlePaddle', 'PaddleNLP!') + #0.99999994 + + """ + dot = self._dot_np + return self._calc_word(word_a, word_b, lambda x, y: dot(x, y) / (np.sqrt(dot(x, x)) * np.sqrt(dot(y, y)))) + + def _construct_word_to_idx(self, idx_to_word): + """ + Constructs word to index dict. + + Args: + idx_to_word ('list'): + + Returns: + `Dict`: The word to index dict constructed by idx_to_word. + + """ + word_to_idx = {} + for i, word in enumerate(idx_to_word): + word_to_idx[word] = i + return word_to_idx + + def __repr__(self): + """ + Returns: + `Str`: The token embedding infomation. + + """ + info = "Object type: {}\ + \nUnknown index: {}\ + \nUnknown token: {}\ + \nPadding index: {}\ + \nPadding token: {}\ + \n{}".format( + super(TokenEmbedding, self).__repr__(), + self._word_to_idx[self.unknown_token], + self.unknown_token, + self._word_to_idx[PAD_TOKEN], + PAD_TOKEN, + self.weight, + ) + return info diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/__init__.py new file mode 100644 index 000000000..b40bb0346 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .faster_tokenizer import * +from .model_utils import * +from .ernie_model import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README.md b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README.md new file mode 100644 index 000000000..555f28039 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README.md @@ -0,0 +1,146 @@ +# AutoNLP + +**简体中文**🀄 | [English🌎](./README_en.md) + +## 简介 + +**AutoNLP目前在实验阶段。在正式发布之前,AutoNLP API有可能会变动** + +**AutoNLP** 是 PaddleNLP 的一个早期的实验性质的项目,旨在让NLP技术赋能百业。交付一个成功的 NLP 项目并不容易,因为它需要深入的NLP领域知识,而我们经常看到开发者在应用NLP技术的过程中遇到困难。这就是我们开发 **AutoNLP** 项目的原因。与为获得最先进的模型精度而使用大规模计算资源的传统 AutoML 方法相比,我们有不同的理念: + +1. 我们的目标不是在大型集群,大型数据集上训练最先进的模型,而是**在有限计算资源下的训练出不错模型**。我们假设我们的用户最多只有几个 GPU,并且希望在8小时内训练出不错的模型。您可以在 [Baidu AI Studio](https://aistudio.baidu.com/aistudio) 免费获得此级别的计算资源。 +2. AutoNLP的目标是提供**低代码**的解决方案,使您能够用几行代码训练出不错的模型,但它不是无代码的模型训练服务。 +3. 我们将尽可能地**自动化和抽象化** PaddleNLP已有的**全流程能力**(例如 预处理,分词,微调,提示学习,模型压缩,一键部署等等),助力开发者对于自己的使用场景进行快速适配与落地。 +4. 
我们的工作是**免费和开源**的。 + +## 安装 + +安装 **AutoNLP** 与安装 PaddleNLP 非常相似,唯一的区别是 需要添加`[autonlp]`的标签。 + +``` +pip install -U paddlenlp[autonlp] +``` + +您还可以从我们的 [GitHub](https://github.com/PaddlePaddle/PaddleNLP) clone并通过“pip install .[autonlp]”从源代码安装来获取develop分支中的最新成果。 + +## 基础使用 + +由于目前AutoNLP唯一支持的任务是文本分类,因此以下文档是关于 **AutoTrainerForTextClassification** 的使用用法。您也可以参考我们的 AiStudio notebook (To be added) + +### 创建AutoTrainerForTextClassification对象 + +`AutoTrainerForTextClassification` 是您用来运行模型实验并与经过训练的模型交互的主要类,您可以像下面这样构造它: + +```python +auto_trainer = AutoTrainerForTextClassification( + train_dataset=train_ds, + eval_dataset=dev_ds, + label_column="labels", + text_column="sentence", + language="Chinese", + output_dir="temp" +) +``` + +Args: + +- train_dataset (Dataset, required): `paddle.io.Dataset` 格式的训练数据集,必须包含下面指定的 `text_column` 和 `label_column` +- eval_dataset (Dataset, required): `paddle.io.Dataset`格式的评估数据集,必须包含下面指定的`text_column`和`label_column` +- text_column (string, required): 数据集中的文本字段,为模型的主要输入。 +- label_column (string, required): 数据集中的标签字段 +- language (string, required): 文本语言 +- metric_for_best_model (string, optional): 用来选择最优模型的评估指标 +- greater_is_better (bool, optional): 更好的模型是否应该有更大的指标。与`metric_for_best_model`结合使用 +- problem_type (str, optional): 根据问题的性质在 [`multi_class`, `multi_label`] 中选择 +- output_dir (str, optional): 输出目录,默认为`autpnlp_results` +- verbosity: (int, optional): 控制日志的详细程度。默认为“1”,可在driver中看见worker的日志。如果需要减少日志量,请使用 `verbosity > 0` 。 + +### 训练 + +您可以使用以下命令开始训练模型: + +```python +auto_trainer.train( + num_cpus=2, + num_gpus=1, + max_concurrent_trials=1, + num_models=10, + time_budget_s=60 * 10, + verbosity=1 +) +``` +Args: + +- num_models (int, required): 模型试验数量 +- num_gpus (str, optional): 实验使用的 GPU 数量。默认情况下,这是根据检测到的 GPU 设置的。 +- num_cpus (str, optional): 实验使用的 CPU 数量。默认情况下,这是根据检测到的 vCPU 设置的。 +- max_concurrent_trials (int, optional): 同时运行的最大试验数。必须是非负数。如果为 None 或 0,则不应用任何限制。默认为None。 +- time_budget_s: (int|float|datetime.timedelta, optional) 以秒为单位的全局时间预算,超过时间后停止所有模型试验。 +- experiment_name: (str, optional): 实验的名称。实验日志将存储在"/"下。默认为 UNIX 时间戳。 +- hp_overrides: (dict[str, Any], optional): (仅限高级用户)。覆盖每个候选模型的超参数。例如,`{"TrainingArguments.max_steps":5}`。 +- custom_model_candiates: (dict[str, Any], optional): (仅限高级用户)。运行用户提供的候选模型而不 PaddleNLP 的默认候选模型。可以参考 `._model_candidates` 属性 + + +### 评估和检查实验结果 + +#### 检查实验结果 + +实验结束后,您可以像下面这样检查实验结果,它会打印一个 pandas DataFrame: + +``` +auto_trainer.show_training_results() +``` + +您还可以在`/experiment_results.csv`下找到实验结果。不同实验产生的模型的标识符是`trial_id`,您可以在 DataFrame 或 csv 文件中找到这个字段。 + +#### 加载以前的实验结果 + +您可以从之前的运行(包括未完成的运行)中恢复实验结果,如下所示: + +```python +auto_trainer.load("path/to/previous/results") +``` + +这使您能够使用 `show_training_results` API 来检查结果。再次调用 train() 将覆盖之前的结果。 + +#### 使用不同的评估数据集 + +除了使用构建 AutoTrainerForTextClassification 的时候提供的评估数据集以外,您也可以使用其他的数据集进行评估: + +``` +auto_trainer.evaluate( + trial_id="trial_123456", + eval_dataset=new_eval_dataset +) +``` + +Args: +- trial_id (str, optional): 通过 `trial_id` 指定要评估的模型。默认为由`metric_for_best_model`决定的最佳模型 +- eval_dataset (Dataset, optional): 自定义评估数据集,并且必须包含`text_column`和`label_column`字段。如果未提供,则默认为构建时使用的评估数据集 + + + +### 模型输出与部署 + +如果需要导出模型供以后使用,可以使用以下的API: + +``` +auto_trainer.export( + trial_id="trial_123456", + export_path="different/path/to/store/the/model" +) +``` + +Args: +- export_path (str, required): 输出路径 +- trial_id (int, required): 通过 `trial_id` 指定要评估的模型。默认为由`metric_for_best_model`决定的最佳模型 + +同时我们还提供了`to_taskflow()`的API,可以直接将模型转换为 `Taskflow` 进行推理: + +``` +taskflow = auto_trainer.to_taskflow() +taskflow("this is a test input") 
+``` + +Args: +- trial_id (int, required): 通过 `trial_id` 指定要评估的模型。默认为由`metric_for_best_model`决定的最佳模型 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README_en.md b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README_en.md new file mode 100644 index 000000000..8db14f079 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/README_en.md @@ -0,0 +1,147 @@ +# AutoNLP + +[简体中文🀄](./README.md) | **English**🌎 + +# Introduction + +**The AutoNLP APIs are subject to significant changes until formal release** + +**AutoNLP** is an experimental project by PaddleNLP to democratize NLP for everyone. Delivering a successful NLP project is not easy, as it requires deep domain knowledge. Time after time, we have seen people struggle to make NLP work on their dataset, for their projects, which is why we are building **AutoNLP**. Compared with the traditional AutoML approach of massive paid compute for State-of-the-Art model performance, we have a different philosophy: + + +1. Instead of training State-of-the-Art models on huge datasets running on huge clusters, our goal is to deliver **decent models under limited compute**. We assume our users have a few GPUs at most and want to get decent models under 8 hours on their own in-house datasets. Note that you can get this level of compute for FREE on [Baidu AI Studio](https://aistudio.baidu.com/aistudio). +2. Our solution is **low-code** and enables you to train good models with a few lines of code, but it is not a no-code / drag-and-drop service. +3. Leveraging the **full-cycle capability** of PaddleNLP, we intend to **automate and abstract away** as much of NLP as possible, ranging from preprocessing to tokenizing, from finetuning to prompt tuning, from model compression to deployment, etc. +4. Our work is and always will be **free and open-sourced**. + +## Installation + +Installing **AutoNLP** is very similar to installing PaddleNLP, with the only difference being the `[autonlp]` tag. + +``` +pip install -U paddlenlp[autonlp] +``` + +You can also get our latest work in the develop branch by cloning from our [GitHub](https://github.com/PaddlePaddle/PaddleNLP) and installing from source via `pip install .[autonlp]`. + +## Basic Usage + +Since the only supported task is Text Classification for now, the following documentation is about the usage of **AutoTrainerForTextClassification**. You can also follow our AI Studio notebook for an example. + +### Constructor + +`AutoTrainerForTextClassification` is the main class which you use to run model experiments and interact with the trained models. You can construct it like the following: + +```python +auto_trainer = AutoTrainerForTextClassification( + train_dataset=train_ds, + eval_dataset=dev_ds, + label_column="labels", + text_column="sentence", + language="Chinese", + output_dir="temp" +) +``` + +Args: + +- train_dataset (Dataset, required): Training dataset in the format of `paddle.io.Dataset`, must contain the 'text_column' and 'label_column' specified below +- eval_dataset (Dataset, required): Evaluation dataset in the format of `paddle.io.Dataset`, must contain the 'text_column' and 'label_column' specified below +- text_column (string, required): Name of the column that contains the input text. +- label_column (string, required): Name of the column that contains the target variable to predict.
+- language (string, required): language of the text +- metric_for_best_model (string, optional): the name of the metric used to select the best model. +- greater_is_better (bool, optional): Whether better models should have a greater metric or not. Use in conjunction with `metric_for_best_model`. +- problem_type (str, optional): Select among ["multi_class", "multi_label"] based on the nature of your problem. +- output_dir (str, optional): Output directory for the experiments, defaults to "autonlp_results" +- verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which lets the workers log to the driver. To reduce the amount of logs, set verbosity to 0 to stop the workers from logging to the driver. + + +### Train + +You can start training with the following command: + +```python +auto_trainer.train( + num_cpus=2, + num_gpus=1, + max_concurrent_trials=1, + num_models=10, + time_budget_s=60 * 10, + verbosity=1 +) +``` +Args: + +- num_models (int, required): number of model trials to run +- num_gpus (int, optional): number of GPUs to use for the job. By default, this is set based on detected GPUs. +- num_cpus (int, optional): number of CPUs to use for the job. By default, this is set based on virtual cores. +- max_concurrent_trials (int, optional): maximum number of trials to run concurrently. Must be non-negative. If None or 0, no limit will be applied. Defaults to None. +- time_budget_s: (int|float|datetime.timedelta, optional) global time budget in seconds after which all model trials are stopped. +- experiment_name: (str, optional): name of the experiment. Experiment logs will be stored under `<output_dir>/<experiment_name>`. Defaults to the UNIX timestamp. +- hp_overrides: (dict[str, Any], optional): Advanced users only. Override the hyperparameters of every model candidate. For example, {"TrainingArguments.max_steps": 5}. +- custom_model_candidates: (dict[str, Any], optional): Advanced users only. Run the user-provided model candidates instead of the default model candidates from PaddleNLP. See the `._model_candidates` property for an example. + +### Evaluate and Examine Results + +#### Examine Results + +Once the experiments conclude, you can examine the experiment results like the following, which prints a pandas DataFrame: + +``` +auto_trainer.show_training_results() +``` + +You can also find the experiment results under `<output_dir>/<experiment_name>/experiment_results.csv`. The identifier for the models produced by different experiments is `trial_id`, which you can find in the `DataFrame` or the csv file. + +#### Load Previous Results + +You can recover the experiment results from a previous run (including unfinished runs) like the following: + +```python +auto_trainer.load("path/to/previous/results") +``` + +This enables you to use the `show_training_results` API to examine the results. Calling `train()` again will override the previous results. + +#### Custom Evaluations + +To evaluate on datasets other than the evaluation dataset provided to `AutoTrainerForTextClassification` at construction, you can use the following: + +``` +auto_trainer.evaluate( + trial_id="trial_123456", + eval_dataset=new_eval_dataset +) +``` + +Args: +- trial_id (str, optional): specify the model to be evaluated through the `trial_id`. Defaults to the best model, ranked by `metric_for_best_model` +- eval_dataset (Dataset, optional): custom evaluation dataset, which must contain the 'text_column' and 'label_column' fields.
If not provided, defaults to the evaluation dataset used at construction + + + +### Export and Inference + +To export a model for later use, do: + +``` +auto_trainer.export( + trial_id="trial_123456", + export_path="different/path/to/store/the/model" +) +``` + +Args: +- export_path (str, required): the filepath for export +- trial_id (int, required): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model` + +We also provide a convenience method to directly convert a model to a Taskflow for inference: + +``` +taskflow = auto_trainer.to_taskflow() +taskflow("this is a test input") +``` + +Args: +- trial_id (int, required): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model` diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/__init__.py new file mode 100644 index 000000000..6b2e409e9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# flake8: noqa +from .text_classification import AutoTrainerForTextClassification diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/auto_trainer_base.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/auto_trainer_base.py new file mode 100644 index 000000000..d705450d5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/auto_trainer_base.py @@ -0,0 +1,383 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
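Putting the README snippets above together, a hypothetical end-to-end run of the documented API could look like the sketch below. The dataset name, column names and inference text are illustrative placeholders, and the import paths follow upstream PaddleNLP; none of this is prescribed by the patch itself.

```python
# Hypothetical end-to-end AutoNLP text-classification flow; names are placeholders.
from paddlenlp.datasets import load_dataset
from paddlenlp.experimental.autonlp import AutoTrainerForTextClassification

train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"])

auto_trainer = AutoTrainerForTextClassification(
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    text_column="text",
    label_column="label",
    language="Chinese",
    output_dir="autonlp_results",
)

# Keep the budget small: at most two candidate models, ten minutes, one GPU.
auto_trainer.train(num_models=2, num_gpus=1, num_cpus=2, time_budget_s=60 * 10)

print(auto_trainer.show_training_results())        # pandas DataFrame of all trials
print(auto_trainer.evaluate())                     # best model by metric_for_best_model
auto_trainer.export(export_path="exported_model")  # static model for deployment
taskflow = auto_trainer.to_taskflow()              # or run inference directly
print(taskflow("这家酒店的服务很好"))
```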
+import copy +import datetime +import logging +import os +import shutil +import sys +from abc import ABCMeta, abstractmethod +from typing import Any, Callable, Dict, List, Optional, Union + +import ray +from hyperopt import hp +from paddle.io import Dataset +from ray import tune +from ray.air import RunConfig +from ray.tune.result_grid import ResultGrid +from ray.tune.search import ConcurrencyLimiter +from ray.tune.search.hyperopt import HyperOptSearch + +from paddlenlp.trainer import TrainingArguments +from paddlenlp.trainer.trainer_utils import EvalPrediction +from paddlenlp.transformers import PretrainedTokenizer +from paddlenlp.utils.log import logger + + +class AutoTrainerBase(metaclass=ABCMeta): + """ + The meta classs of AutoTrainer, which contains the common properies and methods of AutoNLP. + Task-specific AutoTrainers need to inherit from the meta class. + + Args: + train_dataset (Dataset, required): Training dataset, must contains the 'text_column' and 'label_column' specified below + eval_dataset (Dataset, required): Evaluation dataset, must contains the 'text_column' and 'label_column' specified below + language (string, required): language of the text + metric_for_best_model (string, optional): the name of the metrc for selecting the best model. + greater_is_better (bool, required): Whether better models should have a greater metric or not. Use in conjuction with `metric_for_best_model`. + output_dir (str, optional): Output directory for the experiments, defaults to "autpnlp_results" + verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which let the workers log to the driver.To reduce the amount of logs, + use verbosity > 0 to set stop the workers from logging to the driver. + """ + + training_path = "training_checkpoints" # filepath for Trainer's training checkpoints + save_path = "trained_model" # filepath for the trained dygraph model + export_path = "exported_model" # filepath for the exported static model + compress_path = "compressed_model" # filepath for the compressed static model + results_filename = "experiment_results.csv" # filepath for storing experiment results + experiment_path = None # filepath for the experiment results + visualdl_path = "visualdl" # filepath for the visualdl + + def __init__( + self, + train_dataset: Dataset, + eval_dataset: Dataset, + metric_for_best_model: str, + greater_is_better: bool, + language: str = "Chinese", + output_dir: str = "autonlp_results", + verbosity: int = 1, + **kwargs, + ): + if metric_for_best_model is not None and not metric_for_best_model.startswith("eval_"): + self.metric_for_best_model = f"eval_{metric_for_best_model}" + else: + self.metric_for_best_model = metric_for_best_model + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.greater_is_better = greater_is_better + if language not in self.supported_languages: + raise ValueError( + f"'{language}' is not supported. 
Please choose among the following: {self.supported_languages}" + ) + + self.language = language + self.output_dir = output_dir + self.kwargs = kwargs + # Per default, Ray Tune creates JSON, CSV and TensorBoardX logger callbacks, turning it off + os.environ["TUNE_DISABLE_AUTO_CALLBACK_LOGGERS"] = "1" + # use log_to_driver to control verbosity + ray.init(ignore_reinit_error=True, log_to_driver=True if verbosity >= 1 else False) + + @property + @abstractmethod + def supported_languages(self) -> List[str]: + """ + Override to store the supported languages for each auto trainer class + """ + + @property + @abstractmethod + def _default_training_argument(self) -> TrainingArguments: + """ + Default TrainingArguments for the Trainer + """ + return TrainingArguments( + output_dir=self.training_path, + disable_tqdm=True, + load_best_model_at_end=True, + save_total_limit=1, + report_to=["visualdl", "autonlp"], + logging_dir=self.visualdl_path, # if logging_dir is redefined, the function visualdl() should be redefined as well. + ) + + @property + @abstractmethod + def _model_candidates(self) -> List[Dict[str, Any]]: + """ + Model Candidates stored as Ray hyperparameter search space, organized by + self.language and preset + """ + + @abstractmethod + def _data_checks_and_inference(self, dataset_list: List[Dataset]): + """ + Performs different data checks and inferences on the datasets + """ + + def _construct_trainable(self) -> Callable: + """ + Returns the Trainable functions that contains the main preprocessing and training logic + """ + + def trainable(model_config): + # import is required for proper pickling + from paddlenlp.utils.log import logger + + stdout_handler = logging.StreamHandler(sys.stdout) + stdout_handler.setFormatter(logger.format) + logger.logger.addHandler(stdout_handler) + + # construct trainer + model_config = model_config["candidates"] + trainer = self._construct_trainer(model_config) + # train + trainer.train() + # evaluate + eval_metrics = trainer.evaluate() + # save dygraph model + trainer.save_model(self.save_path) + + if os.path.exists(self.training_path): + logger.info("Removing training checkpoints to conserve disk space") + shutil.rmtree(self.training_path) + return eval_metrics + + return trainable + + @abstractmethod + def _compute_metrics(self, eval_preds: EvalPrediction) -> Dict[str, float]: + """ + function used by the Trainer to compute metrics during training + See :class:`~paddlenlp.trainer.trainer_base.Trainer` for more details. + """ + + @abstractmethod + def _preprocess_fn( + self, + example: Dict[str, Any], + tokenizer: PretrainedTokenizer, + max_seq_length: int, + is_test: bool = False, + ) -> Dict[str, Any]: + """ + preprocess an example from raw features to input features that Transformers models expect (e.g. input_ids, attention_mask, labels, etc) + """ + + @abstractmethod + def export(self, export_path: str, trial_id: Optional[str] = None): + """ + Export the model from a certain `trial_id` to the given file path. + + Args: + export_path (str, required): the filepath to export to + trial_id (int, optional): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model` + """ + + raise NotImplementedError + + @abstractmethod + def to_taskflow(self, trial_id: Optional[str] = None): + """ + Convert the model from a certain `trial_id` to a Taskflow for model inference + + Args: + trial_id (int, optional): use the `trial_id` to select the model to export. 
Defaults to the best model selected by `metric_for_best_model` + """ + raise NotImplementedError + + @abstractmethod + def evaluate(self, eval_dataset: Optional[Dataset] = None, trial_id: Optional[str] = None) -> Dict[str, float]: + """ + Run evaluation and returns metrics from a certain `trial_id` on the given dataset. + + Args: + trial_id (str, optional): specify the model to be evaluated through the `trial_id`. Defaults to the best model selected by `metric_for_best_model` + eval_dataset (Dataset, optional): custom evaluation dataset and must contains the 'text_column' and 'label_column' fields. + If not provided, defaults to the evaluation dataset used at construction. + """ + raise NotImplementedError + + @abstractmethod + def predict(self, test_dataset: Dataset, trial_id: Optional[str] = None): + """ + Run prediction and returns predictions and potential metrics from a certain `trial_id` on the given dataset + Args: + test_dataset (Dataset, required): Custom test dataset and must contains the 'text_column' and 'label_column' fields. + trial_id (str, optional): Specify the model to be evaluated through the `trial_id`. Defaults to the best model selected by `metric_for_best_model`. + """ + raise NotImplementedError + + def _override_hp(self, config: Dict[str, Any], default_hp: Any) -> Any: + """ + Overrides the arguments with the provided hyperparameter config + """ + new_hp = copy.deepcopy(default_hp) + for key, value in config.items(): + if key in new_hp.to_dict(): + if key in ["output_dir", "logging_dir"]: + logger.warning(f"{key} cannot be overridden") + else: + setattr(new_hp, key, value) + return new_hp + + def _filter_model_candidates( + self, language=None, preset=None, custom_model_candidates=None + ) -> List[Dict[str, Any]]: + """ + Model Candidates stored as Ray hyperparameter search space, organized by + override, language and preset + """ + model_candidates = custom_model_candidates if custom_model_candidates is not None else self._model_candidates + if language is not None: + model_candidates = filter( + lambda x: x["language"] == language if "language" in x else True, model_candidates + ) + if preset is not None: + model_candidates = filter(lambda x: x["preset"] == preset if "preset" in x else True, model_candidates) + return list(model_candidates) + + def _get_model_result(self, trial_id=None): + if hasattr(self, "training_results"): + if trial_id is not None: + for result in self.training_results: + if result.metrics["trial_id"] == trial_id: + return result + raise LookupError( + f"Trial_id '{trial_id}' is not found in 'training_results'. Did you enter the correct 'trial_id'?" + ) + else: + result = self.training_results.get_best_result( + metric=self.metric_for_best_model, + mode="max" if self.greater_is_better else "min", + ) + return result + else: + raise AttributeError( + "'AutoTrainer' has no attribute 'training_results'. Have you called the 'train' method?" + ) + + def show_training_results(self): + if hasattr(self, "training_results"): + return self.training_results.get_dataframe() + else: + raise AttributeError( + "'AutoTrainer' has no attribute 'training_results'. Have you called the 'train' method?" 
+ ) + + def load(self, path: str): + """ + Restores the AutoTrainer from a given experiment directory produced by a previous run + + Args: + path (str, required): The filepath to load the previous experiments + """ + logger.info(f"Restoring from {path}") + self.tuner = tune.Tuner.restore(path) + self.training_results = self.tuner.get_results() + logger.info("Found existing training results.") + + def train( + self, + num_models: int = 1, + preset: Optional[str] = None, + num_gpus: Optional[int] = None, + num_cpus: Optional[int] = None, + max_concurrent_trials: Optional[int] = None, + time_budget_s: Optional[Union[int, float, datetime.timedelta]] = None, + experiment_name: str = None, + hp_overrides: Dict[str, Any] = None, + custom_model_candidates: List[Dict[str, Any]] = None, + ) -> ResultGrid: + """ + Main logic of training models + + Args: + num_models (int, required): number of model trials to run + preset (str, optional): preset configuration for the trained models, can significantly impact accuracy, size, and inference latency of trained models. + If not set, this will be inferred from data. + num_gpus (str, optional): number of GPUs to use for the job. By default, this is set based on detected GPUs. + num_cpus (str, optional): number of CPUs to use for the job. By default, this is set based on virtual cores. + max_concurrent_trials (int, optional): maximum number of trials to run concurrently. Must be non-negative. If None or 0, no limit will be applied. + time_budget_s: (int|float|datetime.timedelta, optional) global time budget in seconds after which all model trials are stopped. + experiment_name: (str, optional): name of the experiment. Experiment log will be stored under /. + Defaults to UNIX timestamp. + hp_overrides: (dict[str, Any], optional): Advanced users only. + override the hyperparameters of every model candidate. For example, {"max_steps": 5}. + custom_model_candiates: (dict[str, Any], optional): Advanced users only. + Run the user-provided model candidates instead of the default model candidated from PaddleNLP. See `._model_candidates` property as an example + + Returns: + A set of objects for interacting with Ray Tune results. You can use it to inspect the trials and obtain the best result. 
+ """ + if hasattr(self, "tuner") and self.tuner is not None: + logger.info("Overwriting the existing Tuner and any previous training results") + + trainable = self._construct_trainable() + model_candidates = self._filter_model_candidates( + language=self.language, preset=preset, custom_model_candidates=custom_model_candidates + ) + if hp_overrides is not None: + for model_candidate in model_candidates: + model_candidate.update(hp_overrides) + search_space = {"candidates": hp.choice("candidates", model_candidates)} + mode = "max" if self.greater_is_better else "min" + algo = HyperOptSearch(space=search_space, metric=self.metric_for_best_model, mode=mode) + algo = ConcurrencyLimiter(algo, max_concurrent=max_concurrent_trials) + if num_gpus or num_cpus: + hardware_resources = {} + if num_gpus: + hardware_resources["gpu"] = num_gpus + if num_cpus: + hardware_resources["cpu"] = num_cpus + trainable = tune.with_resources(trainable, hardware_resources) + + def trial_creator(trial): + return "{}".format(trial.trial_id) + + tune_config = tune.TuneConfig( + num_samples=num_models, + time_budget_s=time_budget_s, + search_alg=algo, + trial_name_creator=trial_creator, + trial_dirname_creator=trial_creator, + ) + + if experiment_name is None: + experiment_name = datetime.datetime.now().strftime("%s") + self.experiment_path = os.path.join(self.output_dir, experiment_name) + + self.tuner = tune.Tuner( + trainable, + tune_config=tune_config, + run_config=RunConfig( + name=experiment_name, + log_to_file="train.log", + local_dir=self.output_dir if self.output_dir else None, + callbacks=[tune.logger.CSVLoggerCallback()], + ), + ) + self.training_results = self.tuner.fit() + self.show_training_results().to_csv( + path_or_buf=os.path.join(self.output_dir, experiment_name, self.results_filename), index=False + ) + + return self.training_results + + def visualdl(self, trial_id: Optional[str] = None): + """ + Return visualdl path to represent the results of the taskflow training. + """ + model_result = self._get_model_result(trial_id=trial_id) + return os.path.join(model_result.log_dir, self.visualdl_path) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/requirements.txt b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/requirements.txt new file mode 100644 index 000000000..c369392f2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/requirements.txt @@ -0,0 +1,4 @@ +protobuf==3.20.2 +pydantic==1.10.11 +ray[tune]==2.5.1 +hyperopt>=0.2.5 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/text_classification.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/text_classification.py new file mode 100644 index 000000000..5df473387 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/text_classification.py @@ -0,0 +1,764 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import copy +import functools +import json +import os +import shutil +from typing import Any, Dict, List, Optional + +import numpy as np +import paddle +from hyperopt import hp +from paddle.io import Dataset +from scipy.special import expit as sigmoid +from sklearn.metrics import accuracy_score, precision_recall_fscore_support + +from ...data import DataCollatorWithPadding +from ...prompt import ( + PromptDataCollatorWithPadding, + PromptModelForSequenceClassification, + PromptTrainer, + PromptTuningArguments, + UTCTemplate, +) +from ...taskflow import Taskflow +from ...trainer import EarlyStoppingCallback, Trainer, TrainingArguments +from ...trainer.trainer_utils import EvalPrediction +from ...transformers import ( + UTC, + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + PretrainedTokenizer, + export_model, +) +from ...utils.log import logger +from .auto_trainer_base import AutoTrainerBase +from .utils import UTCLoss + + +class AutoTrainerForTextClassification(AutoTrainerBase): + """ + AutoTrainer for Text Classification problems + + Args: + train_dataset (Dataset, required): Training dataset, must contains the 'text_column' and 'label_column' specified below + eval_dataset (Dataset, required): Evaluation dataset, must contains the 'text_column' and 'label_column' specified below + text_column (string, required): Name of the column that contains the input text. + label_column (string, required): Name of the column that contains the target variable to predict. + metric_for_best_model (string, optional): the name of the metrc for selecting the best model. Defaut to 'eval_accuracy'. + greater_is_better (bool, optional): Whether better models should have a greater metric or not. Use in conjuction with `metric_for_best_model`. + problem_type (str, optional): Select among ["multi_class", "multi_label"] based on the nature of your problem + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + language (string, required): language of the text. + output_dir (str, optional): Output directory for the experiments, defaults to "autpnlp_results". + id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names. + multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5. + verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which let the workers log to the driver.To reduce the amount of logs, use verbosity > 0 to set stop the workers from logging to the driver. 
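    Example:
        .. code-block::

            # A minimal, illustrative sketch; `train_ds` and `dev_ds` are placeholder
            # datasets whose examples contain "text" and "label" fields.
            auto_trainer = AutoTrainerForTextClassification(
                text_column="text",
                label_column="label",
                train_dataset=train_ds,
                eval_dataset=dev_ds,
                language="Chinese",
                output_dir="autonlp_results",
            )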
+ """ + + def __init__( + self, + text_column: str, + label_column: str, + train_dataset: Dataset, + eval_dataset: Dataset, + metric_for_best_model: Optional[str] = None, + greater_is_better: bool = True, + problem_type: str = "multi_class", + **kwargs + ): + + super(AutoTrainerForTextClassification, self).__init__( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + metric_for_best_model=metric_for_best_model, + greater_is_better=greater_is_better, + **kwargs, + ) + self.text_column = text_column + self.label_column = label_column + self.id2label = self.kwargs.get("id2label", None) + self.multilabel_threshold = self.kwargs.get("multilabel_threshold", 0.5) + if problem_type in ["multi_label", "multi_class"]: + self.problem_type = problem_type + else: + raise NotImplementedError( + f"'{problem_type}' is not a supported problem_type. Please select among ['multi_label', 'multi_class']" + ) + if self.metric_for_best_model is None: + if self.problem_type == "multi_class": + self.metric_for_best_model = "eval_accuracy" + else: + self.metric_for_best_model = "eval_macro_f1" + + self._data_checks_and_inference([self.train_dataset, self.eval_dataset]) + + @property + def supported_languages(self) -> List[str]: + return ["Chinese", "English"] + + @property + def _default_training_argument(self) -> TrainingArguments: + """ + Default TrainingArguments for the Trainer + """ + return TrainingArguments( + output_dir=self.training_path, + disable_tqdm=True, + metric_for_best_model=self.metric_for_best_model, + greater_is_better=True, + load_best_model_at_end=True, + evaluation_strategy="epoch", + save_strategy="epoch", + save_total_limit=1, + report_to=["visualdl", "autonlp"], + logging_dir=self.visualdl_path, + ) + + @property + def _default_prompt_tuning_arguments(self) -> PromptTuningArguments: + return PromptTuningArguments( + output_dir=self.training_path, + disable_tqdm=True, + metric_for_best_model=self.metric_for_best_model, + greater_is_better=True, + load_best_model_at_end=True, + evaluation_strategy="epoch", + save_strategy="epoch", + save_total_limit=1, + report_to=["visualdl", "autonlp"], + logging_dir=self.visualdl_path, + ) + + @property + def _model_candidates(self) -> List[Dict[str, Any]]: + train_batch_size = hp.choice("batch_size", [2, 4, 8, 16, 32]) + chinese_finetune_models = hp.choice( + "finetune_models", + [ + "ernie-1.0-large-zh-cw", # 24-layer, 1024-hidden, 16-heads, 272M parameters. + "ernie-3.0-xbase-zh", # 20-layer, 1024-hidden, 16-heads, 296M parameters. + "ernie-3.0-tiny-base-v2-zh", # 12-layer, 768-hidden, 12-heads, 118M parameters. + "ernie-3.0-tiny-medium-v2-zh", # 6-layer, 768-hidden, 12-heads, 75M parameters. + "ernie-3.0-tiny-mini-v2-zh", # 6-layer, 384-hidden, 12-heads, 27M parameters + "ernie-3.0-tiny-micro-v2-zh", # 4-layer, 384-hidden, 12-heads, 23M parameters + "ernie-3.0-tiny-nano-v2-zh", # 4-layer, 312-hidden, 12-heads, 18M parameters. + "ernie-3.0-tiny-pico-v2-zh", # 3-layer, 128-hidden, 2-heads, 5.9M parameters. + ], + ) + english_finetune_models = hp.choice( + "finetune_models", + [ + # add deberta-v3 when we have it + "roberta-large", # 24-layer, 1024-hidden, 16-heads, 334M parameters. Case-sensitive + "roberta-base", # 12-layer, 768-hidden, 12-heads, 110M parameters. Case-sensitive + "distilroberta-base", # 6-layer, 768-hidden, 12-heads, 66M parameters. Case-sensitive + "ernie-2.0-base-en", # 12-layer, 768-hidden, 12-heads, 103M parameters. Trained on lower-cased English text. 
+ "ernie-2.0-large-en", # 24-layer, 1024-hidden, 16-heads, 336M parameters. Trained on lower-cased English text. + ], + ) + chinese_utc_models = hp.choice( + "utc_models", + [ + "utc-xbase", # 20-layer, 1024-hidden, 16-heads, 296M parameters. + "utc-base", # 12-layer, 768-hidden, 12-heads, 118M parameters. + "utc-medium", # 6-layer, 768-hidden, 12-heads, 75M parameters. + "utc-mini", # 6-layer, 384-hidden, 12-heads, 27M parameters + "utc-micro", # 4-layer, 384-hidden, 12-heads, 23M parameters + "utc-nano", # 4-layer, 312-hidden, 12-heads, 18M parameters. + ], + ) + return [ + # fast learning: high LR, small early stop patience + { + "preset": "finetune", + "language": "Chinese", + "early_stopping_patience": 5, + "per_device_train_batch_size": train_batch_size, + "per_device_eval_batch_size": train_batch_size * 2, + "num_train_epochs": 100, + "model_name_or_path": chinese_finetune_models, + "learning_rate": 3e-5, + }, + { + "preset": "finetune", + "language": "English", + "early_stopping_patience": 5, + "per_device_train_batch_size": train_batch_size, + "per_device_eval_batch_size": train_batch_size * 2, + "num_train_epochs": 100, + "model_name_or_path": english_finetune_models, + "learning_rate": 3e-5, + }, + # slow learning: small LR, large early stop patience + { + "preset": "finetune", + "language": "Chinese", + "early_stopping_patience": 5, + "per_device_train_batch_size": train_batch_size, + "per_device_eval_batch_size": train_batch_size * 2, + "num_train_epochs": 100, + "model_name_or_path": chinese_finetune_models, + "learning_rate": 5e-6, + }, + { + "preset": "finetune", + "language": "English", + "early_stopping_patience": 5, + "per_device_train_batch_size": train_batch_size, + "per_device_eval_batch_size": train_batch_size * 2, + "num_train_epochs": 100, + "model_name_or_path": english_finetune_models, + "learning_rate": 5e-6, + }, + # utc tuning candidates + { + "preset": "utc", + "language": "Chinese", + "early_stopping_patience": 5, + "per_device_train_batch_size": train_batch_size, + "per_device_eval_batch_size": train_batch_size * 2, + "num_train_epochs": 100, + "model_name_or_path": chinese_utc_models, + "learning_rate": 1e-5, + }, + ] + + def _data_checks_and_inference(self, dataset_list: List[Dataset]): + """ + Performs different data checks and generate id to label mapping on the datasets. 
+ """ + generate_id2label = True + if self.id2label is None: + self.id2label, self.label2id = {}, {} + else: + generate_id2label = False + self.label2id = {} + for i in self.id2label: + self.label2id[self.id2label[i]] = i + + for dataset in dataset_list: + for example in dataset: + if self.text_column not in example or self.label_column not in example: + raise ValueError( + f"Text column: {self.text_column} and label columns:{self.label_column} must exist for example: {example}" + ) + if self.problem_type == "multi_class": + label = example[self.label_column] + if label not in self.label2id: + if generate_id2label: + self.label2id[label] = len(self.label2id) + self.id2label[len(self.id2label)] = label + else: + raise ValueError( + f"Label {label} is not found in the user-provided id2label argument: {self.id2label}" + ) + else: + labels = example[self.label_column] + for label in labels: + if label not in self.label2id: + if generate_id2label: + self.label2id[label] = len(self.label2id) + self.id2label[len(self.id2label)] = label + else: + raise ValueError( + f"Label {label} is not found in the user-provided id2label argument: {self.id2label}" + ) + + def _construct_trainer(self, model_config) -> Trainer: + + if "early_stopping_patience" in model_config: + callbacks = [EarlyStoppingCallback(early_stopping_patience=model_config["early_stopping_patience"])] + else: + callbacks = None + + if self.problem_type == "multi_class": + criterion = paddle.nn.CrossEntropyLoss() + else: + criterion = paddle.nn.BCEWithLogitsLoss() + + if "utc" in model_config["model_name_or_path"]: + model_path = model_config["model_name_or_path"] + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = UTC.from_pretrained(model_path) + max_length = model_config.get("max_length", model.config.max_position_embeddings) + + training_args = self._override_hp(model_config, self._default_prompt_tuning_arguments) + processed_train_dataset = self._preprocess_dataset(self.train_dataset, max_length, tokenizer, is_utc=True) + processed_eval_dataset = self._preprocess_dataset(self.eval_dataset, max_length, tokenizer, is_utc=True) + + template = UTCTemplate(tokenizer=tokenizer, max_length=max_length) + criterion = UTCLoss() + prompt_model = PromptModelForSequenceClassification( + model, template, None, freeze_plm=training_args.freeze_plm, freeze_dropout=training_args.freeze_dropout + ) + + trainer = PromptTrainer( + model=prompt_model, + tokenizer=tokenizer, + args=training_args, + criterion=criterion, + train_dataset=processed_train_dataset, + eval_dataset=processed_eval_dataset, + callbacks=callbacks, + compute_metrics=self._compute_metrics, + ) + else: + model_path = model_config["model_name_or_path"] + tokenizer = AutoTokenizer.from_pretrained(model_path) + model = AutoModelForSequenceClassification.from_pretrained( + model_path, num_labels=len(self.id2label), id2label=self.id2label, label2id=self.label2id + ) + max_length = model_config.get("max_length", model.config.max_position_embeddings) + + training_args = self._override_hp(model_config, self._default_training_argument) + processed_train_dataset = self._preprocess_dataset(self.train_dataset, max_length, tokenizer) + processed_eval_dataset = self._preprocess_dataset(self.eval_dataset, max_length, tokenizer) + + trainer = Trainer( + model=model, + tokenizer=tokenizer, + args=training_args, + criterion=criterion, + train_dataset=processed_train_dataset, + eval_dataset=processed_eval_dataset, + data_collator=DataCollatorWithPadding(tokenizer), + 
compute_metrics=self._compute_metrics, + callbacks=callbacks, + ) + return trainer + + def evaluate(self, eval_dataset: Optional[Dataset] = None, trial_id: Optional[str] = None): + """ + Run evaluation and returns metrics from a certain `trial_id` on the given dataset. + Args: + eval_dataset (Dataset, optional): custom evaluation dataset and must contains the 'text_column' and 'label_column' fields. If not provided, defaults to the evaluation dataset used at construction. + trial_id (str, optional): specify the model to be evaluated through the `trial_id`. Defaults to the best model selected by `metric_for_best_model` + """ + model_result = self._get_model_result(trial_id=trial_id) + model_config = model_result.metrics["config"]["candidates"] + trainer = self._construct_trainer(model_config) + trainer._load_from_checkpoint(resume_from_checkpoint=os.path.join(model_result.log_dir, self.save_path)) + + if eval_dataset is not None: + self._data_checks_and_inference([eval_dataset]) + is_utc = "utc" in model_config["model_name_or_path"] + if is_utc: + max_length = model_config.get("max_length", trainer.pretrained_model.config.max_position_embeddings) + else: + max_length = model_config.get("max_length", trainer.model.config.max_position_embeddings) + processed_eval_dataset = self._preprocess_dataset( + eval_dataset, max_length, trainer.tokenizer, is_utc=is_utc + ) + eval_metrics = trainer.evaluate(eval_dataset=processed_eval_dataset) + else: + eval_metrics = trainer.evaluate() + trainer.log_metrics("eval", eval_metrics) + + if os.path.exists(self.training_path): + logger.info(f"Removing {self.training_path} to conserve disk space") + shutil.rmtree(self.training_path) + + return eval_metrics + + def predict(self, test_dataset: Dataset, trial_id: Optional[str] = None): + """ + Run prediction and returns predictions and potential metrics from a certain `trial_id` on the given dataset + Args: + test_dataset (Dataset): Custom test dataset and must contains the 'text_column' and 'label_column' fields. + trial_id (str, optional): Specify the model to be evaluated through the `trial_id`. Defaults to the best model selected by `metric_for_best_model`. 
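            Example:
                .. code-block::

                    # Illustrative sketch; `test_ds` is a placeholder dataset with a "text" field.
                    output = auto_trainer.predict(test_dataset=test_ds)
                    predictions, metrics = output.predictions, output.metrics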
+ """ + is_test = False + if self.label_column in test_dataset[0]: + self._data_checks_and_inference([test_dataset]) + else: + is_test = True + for example in test_dataset: + if self.text_column not in example: + raise ValueError(f"Text column: {self.text_column} must exist for example: {example}") + + model_result = self._get_model_result(trial_id=trial_id) + model_config = model_result.metrics["config"]["candidates"] + + trainer = self._construct_trainer(model_config) + trainer._load_from_checkpoint(resume_from_checkpoint=os.path.join(model_result.log_dir, self.save_path)) + + is_utc = False + if "utc" in model_config["model_name_or_path"]: + is_utc = True + max_length = model_config.get("max_length", trainer.pretrained_model.config.max_position_embeddings) + else: + max_length = model_config.get("max_length", trainer.model.config.max_position_embeddings) + + processed_test_dataset = self._preprocess_dataset( + test_dataset, max_length, trainer.tokenizer, is_test=is_test, is_utc=is_utc + ) + test_output = trainer.predict(test_dataset=processed_test_dataset) + trainer.log_metrics("test", test_output.metrics) + + if os.path.exists(self.training_path): + logger.info(f"Removing {self.training_path} to conserve disk space") + shutil.rmtree(self.training_path) + + return test_output + + def _compute_metrics(self, eval_preds: EvalPrediction) -> Dict[str, float]: + """ + function used by the Trainer to compute metrics during training + See :class:`~paddlenlp.trainer.trainer_base.Trainer` for more details. + """ + if self.problem_type == "multi_class": + return self._compute_multi_class_metrics(eval_preds=eval_preds) + else: # multi_label + return self._compute_multi_label_metrics(eval_preds=eval_preds) + + def _compute_multi_class_metrics(self, eval_preds: EvalPrediction) -> Dict[str, float]: + # utc labels is one-hot encoded + if len(eval_preds.label_ids[0]) > 1: + label_ids = np.argmax(eval_preds.label_ids, axis=-1) + else: + label_ids = eval_preds.label_ids + + pred_ids = np.argmax(eval_preds.predictions, axis=-1) + metrics = {} + metrics["accuracy"] = accuracy_score(y_true=label_ids, y_pred=pred_ids) + for average in ["micro", "macro"]: + precision, recall, f1, _ = precision_recall_fscore_support( + y_true=label_ids, y_pred=pred_ids, average=average + ) + metrics[f"{average}_precision"] = precision + metrics[f"{average}_recall"] = recall + metrics[f"{average}_f1"] = f1 + return metrics + + def _compute_multi_label_metrics(self, eval_preds: EvalPrediction) -> Dict[str, float]: + pred_probs = sigmoid(eval_preds.predictions) + pred_ids = pred_probs > self.multilabel_threshold + metrics = {} + # In multilabel classification, this function computes subset accuracy: + # the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true. 
+ metrics["accuracy"] = accuracy_score(y_true=eval_preds.label_ids, y_pred=pred_ids) + for average in ["micro", "macro"]: + precision, recall, f1, _ = precision_recall_fscore_support( + y_true=eval_preds.label_ids, y_pred=pred_ids, average=average + ) + metrics[f"{average}_precision"] = precision + metrics[f"{average}_recall"] = recall + metrics[f"{average}_f1"] = f1 + return metrics + + def _preprocess_labels(self, example, is_test=False, is_utc=False): + if is_utc: + example["choices"] = list(self.label2id.keys()) + example["text_a"] = example[self.text_column] + example["text_b"] = "" + if not is_test: + if is_utc or self.problem_type == "multi_label": + labels = [1.0 if i in example[self.label_column] else 0.0 for i in self.label2id] + example["labels"] = paddle.to_tensor(labels, dtype="float32") + elif self.problem_type == "multi_class": + example["labels"] = paddle.to_tensor([self.label2id[example[self.label_column]]], dtype="int64") + return example + + def _preprocess_fn( + self, + example: Dict[str, Any], + tokenizer: PretrainedTokenizer, + max_length: int, + is_test: bool = False, + ): + """ + Preprocess an example from raw features to input features that Transformers models expect (e.g. input_ids, attention_mask, labels, etc) + """ + result = tokenizer(text=example[self.text_column], max_length=max_length, truncation=True) + if not is_test: + result["labels"] = self._preprocess_labels(example)["labels"] + return result + + def _preprocess_dataset( + self, + dataset: Dataset, + max_length: int, + tokenizer: PretrainedTokenizer, + is_test: bool = False, + is_utc: bool = False, + ): + """ + Preprocess dataset from raw features to input features used by the Trainer or PromptTrainer. + """ + if is_utc: + trans_func = functools.partial(self._preprocess_labels, is_utc=is_utc, is_test=is_test) + else: + trans_func = functools.partial( + self._preprocess_fn, + tokenizer=tokenizer, + max_length=max_length, # truncate to the max length allowed by the model + is_test=is_test, + ) + processed_dataset = copy.deepcopy(dataset).map(trans_func, lazy=False) + return processed_dataset + + def to_taskflow( + self, trial_id: Optional[str] = None, batch_size: int = 1, precision: str = "fp32", compress: bool = False + ): + """ + Convert the model from a certain `trial_id` to a Taskflow for model inference. + + Args: + trial_id (int): use the `trial_id` to select the model to export. Defaults to the best model selected by `metric_for_best_model` + batch_size(int): The sample number of a mini-batch. Defaults to 1. + precision (str): Select among ["fp32", "fp16"]. Default to "fp32". + """ + model_result = self._get_model_result(trial_id=trial_id) + trial_id = model_result.metrics["trial_id"] + if compress: + export_path = os.path.join(model_result.log_dir, self.compress_path) + else: + export_path = os.path.join(model_result.log_dir, self.export_path) + self.export(export_path=export_path, trial_id=trial_id, compress=compress) + + with open(os.path.join(export_path, "taskflow_config.json"), "r") as f: + taskflow_config = json.load(f) + + taskflow_config["batch_size"] = batch_size + taskflow_config["precision"] = precision + + return Taskflow(**taskflow_config) + + def export(self, export_path: str, trial_id: Optional[str] = None, compress: bool = False): + """ + Export the model from a certain `trial_id` to the given file path. + + Args: + export_path (str, required): the filepath to export to + trial_id (int, required): use the `trial_id` to select the model to export. 
Defaults to the best model selected by `metric_for_best_model` + """ + + model_result = self._get_model_result(trial_id=trial_id) + model_config = model_result.metrics["config"]["candidates"] + trial_id = model_result.metrics["trial_id"] + if compress: + default_export_path = os.path.join(model_result.log_dir, self.compress_path) + else: + default_export_path = os.path.join(model_result.log_dir, self.export_path) + + # Check whether it has been exported before + is_exported = False + if os.path.exists(default_export_path): + if "utc" in model_config["model_name_or_path"]: + files = [ + "model.pdiparams", + "model.pdmodel", + "tokenizer_config.json", + "vocab.txt", + "taskflow_config.json", + ] + else: + files = [ + "model.pdiparams", + "model.pdmodel", + "tokenizer_config.json", + "vocab.txt", + "taskflow_config.json", + ] + + if all([os.path.exists(os.path.join(default_export_path, file)) for file in files]): + is_exported = True + if os.path.exists(export_path) and os.path.samefile(export_path, default_export_path): + logger.info(f"Export_path: {export_path} already exists, skipping...") + return + + # Clear export path + if os.path.exists(export_path): + logger.info(f"Export path: {export_path} is not empty. The directory will be deleted.") + shutil.rmtree(export_path) + + # Copy directly if it has been exported before + if is_exported: + logger.info(f"{default_export_path} already exists, copy {default_export_path} into {export_path}") + shutil.copytree(default_export_path, export_path) + return + + # Construct trainer + trainer = self._construct_trainer(model_config) + trainer._load_from_checkpoint(resume_from_checkpoint=os.path.join(model_result.log_dir, self.save_path)) + + # Save static model + input_spec = self._get_input_spec(model_config=model_config) + if compress: + self.compress(trial_id=trial_id, compress_path=export_path) + elif "utc" in model_config["model_name_or_path"]: + export_model(model=trainer.pretrained_model, input_spec=input_spec, path=export_path) + else: + export_model(model=trainer.model, input_spec=input_spec, path=export_path) + + # save tokenizer + trainer.tokenizer.save_pretrained(export_path) + + # save taskflow config file + if "utc" in model_config["model_name_or_path"]: + taskflow_config = { + "task": "zero_shot_text_classification", + "model": model_config["model_name_or_path"], + "schema": list(self.label2id.keys()), + "single_label": True if self.problem_type == "multi_class" else False, + "is_static_model": True, + "pred_threshold": self.multilabel_threshold, + "max_seq_len": model_config.get("max_length", trainer.pretrained_model.config.max_position_embeddings), + "task_path": export_path, + } + else: + taskflow_config = { + "task": "text_classification", + "mode": "finetune", + "is_static_model": True, + "problem_type": self.problem_type, + "multilabel_threshold": self.multilabel_threshold, + "max_length": model_config.get("max_length", trainer.model.config.max_position_embeddings), + "id2label": self.id2label, + "task_path": export_path, + } + + with open(os.path.join(export_path, "taskflow_config.json"), "w", encoding="utf-8") as f: + json.dump(taskflow_config, f, ensure_ascii=False) + logger.info( + f"Taskflow config saved to {export_path}. 
You can use the Taskflow config to create a Taskflow instance for inference" + ) + + logger.info(f"Exported trial_id: {trial_id} to export_path: {export_path} sucessfully!") + + if os.path.exists(self.training_path): + logger.info("Removing training checkpoints to conserve disk space") + shutil.rmtree(self.training_path) + + def _get_input_spec(self, model_config): + + if "utc" in model_config["model_name_or_path"]: + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), + paddle.static.InputSpec(shape=[None, None, None, None], dtype="float32", name="attention_mask"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="omask_positions"), + paddle.static.InputSpec(shape=[None], dtype="int64", name="cls_positions"), + ] + elif "ernie-m" in model_config["model_name_or_path"]: + input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")] + else: + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + ] + return input_spec + + def compress(self, compress_path: str, trial_id: Optional[str] = None): + """ + Evaluate the models from a certain `trial_id` on the given dataset + Args: + compress_path(str): Path to the save compressed static model. + trial_id (str, optional): specify the model to be evaluated through the `trial_id`. Defaults to the best model selected by `metric_for_best_model` + """ + logger.info("Currently Post Training Quantization is the only supported compression strategy.") + self._ptq_strategy(compress_path=compress_path, trial_id=trial_id) + + def _ptq_strategy( + self, + compress_path: str, + trial_id: Optional[str] = None, + algo: str = "KL", + batch_size: int = 4, + batch_nums: int = 1, + ): + from paddle.static.quantization import PostTrainingQuantization + + model_result = self._get_model_result(trial_id=trial_id) + model_config = model_result.metrics["config"]["candidates"] + trial_id = model_result.metrics["trial_id"] + export_path = os.path.join(model_result.log_dir, self.export_path) + self.export(export_path=export_path, trial_id=trial_id) + input_spec = self._get_input_spec(model_config=model_config) + if "utc" in model_config["model_name_or_path"]: + tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_result.log_dir, self.save_path)) + config = AutoConfig.from_pretrained(model_config["model_name_or_path"]) + max_length = model_config.get("max_length", config.max_position_embeddings) + template = UTCTemplate(tokenizer, max_length) + inputs = [ + template({"text_a": eval_ds[self.text_column], "text_b": "", "choices": list(self.label2id.keys())}) + for eval_ds in self.eval_dataset + ] + collator = PromptDataCollatorWithPadding( + tokenizer, padding=True, return_tensors="np", return_attention_mask=True + ) + else: + tokenizer = AutoTokenizer.from_pretrained(os.path.join(model_result.log_dir, self.save_path)) + config = AutoConfig.from_pretrained(model_config["model_name_or_path"]) + max_length = model_config.get("max_length", config.max_position_embeddings) + inputs = [ + tokenizer(eval_ds[self.text_column], max_length=max_length, truncation=True) + for eval_ds in self.eval_dataset + ] + collator = DataCollatorWithPadding(tokenizer, return_tensors="np") + batches = 
[collator(inputs[idx : idx + batch_size]) for idx in range(0, len(inputs), batch_size)] + + def _batch_generator_func(): + for batch in batches: + batch_data = [] + for spec in input_spec: + if spec.name == "attention_mask": + if batch[spec.name].ndim == 2: + batch[spec.name] = (1 - batch[spec.name][:, np.newaxis, np.newaxis, :]) * -1e4 + elif batch[spec.name].ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}".format(batch[spec.name].ndim) + ) + batch_data.append(batch[spec.name].astype(str(spec.dtype).split(".")[1])) + yield batch_data + + paddle.enable_static() + place = paddle.framework._current_expected_place() + exe = paddle.static.Executor(place) + + post_training_quantization = PostTrainingQuantization( + executor=exe, + batch_generator=_batch_generator_func, + model_dir=export_path, + model_filename="model.pdmodel", + params_filename="model.pdiparams", + batch_size=batch_size, + batch_nums=batch_nums, + scope=None, + algo=algo, + hist_percent=0.9999, + round_type="round", + bias_correction=False, + quantizable_op_type=["matmul", "matmul_v2"], + is_full_quantize=False, + weight_bits=8, + activation_bits=8, + activation_quantize_type="range_abs_max", + weight_quantize_type="channel_wise_abs_max", + onnx_format=False, + optimize_model=False, + ) + + post_training_quantization.quantize() + post_training_quantization.save_quantized_model( + save_model_path=compress_path, + model_filename="model.pdmodel", + params_filename="model.pdiparams", + ) + + paddle.disable_static() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/utils.py new file mode 100644 index 000000000..9e87586c6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/autonlp/utils.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
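# Note on UTCLoss defined below: logits for positive and negative labels are aggregated
# separately with logsumexp (with an extra all-zero logit appended to each), and label
# positions equal to -100 are masked out, i.e. ignored by the loss.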
+import paddle + + +class UTCLoss(object): + def __call__(self, logit, label): + return self.forward(logit, label) + + def forward(self, logit, label): + logit = (1.0 - 2.0 * label) * logit + logit_neg = logit - label * 1e12 + logit_pos = logit - (1.0 - label) * 1e12 + zeros = paddle.zeros_like(logit[..., :1]) + logit_neg = paddle.concat([logit_neg, zeros], axis=-1) + logit_pos = paddle.concat([logit_pos, zeros], axis=-1) + label = paddle.concat([label, zeros], axis=-1) + logit_neg[label == -100] = -1e12 + logit_pos[label == -100] = -1e12 + neg_loss = paddle.logsumexp(logit_neg, axis=-1) + pos_loss = paddle.logsumexp(logit_pos, axis=-1) + loss = (neg_loss + pos_loss).mean() + return loss diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/ernie_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/ernie_model.py new file mode 100644 index 000000000..d9ba566ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/ernie_model.py @@ -0,0 +1,304 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn as nn + +from paddlenlp.experimental import FasterPretrainedModel, FasterTokenizer +from paddlenlp.transformers.ernie.modeling import ErnieEmbeddings, ErniePooler +from paddlenlp.transformers.model_utils import register_base_model + +__all__ = ["FasterErnieModel", "FasterErnieForSequenceClassification", "FasterErnieForTokenClassification"] + + +class FasterErniePretrainedModel(FasterPretrainedModel): + r""" + An abstract class for pretrained ERNIE models. It provides ERNIE related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ + """ + + model_config_file = "model_config.json" + pretrained_init_configuration = { + "ernie-1.0": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + "do_lower_case": True, + }, + "ernie-2.0-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + "do_lower_case": True, + }, + "ernie-2.0-en-finetuned-squad": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + "do_lower_case": True, + }, + "ernie-2.0-large-en": { + "attention_probs_dropout_prob": 0.1, + "intermediate_size": 4096, # special for ernie-2.0-large-en + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + "do_lower_case": True, + }, + } + resource_files_names = {"model_state": "model_state.pdparams", "vocab_file": "vocab.txt"} + pretrained_resource_files_map = { + "model_state": { + "ernie-1.0": "https://bj.bcebos.com/paddlenlp/models/transformers/faster_ernie/faster_ernie_v1_chn_base.pdparams", + "ernie-2.0-en": "https://bj.bcebos.com/paddlenlp/models/transformers/faster_ernie_v2_base/faster_ernie_v2_eng_base.pdparams", + "ernie-2.0-en-finetuned-squad": "https://bj.bcebos.com/paddlenlp/models/transformers/faster_ernie_v2_base/faster_ernie_v2_eng_base_finetuned_squad.pdparams", + "ernie-2.0-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/faster_ernie_v2_large/faster_ernie_v2_eng_large.pdparams", + }, + "vocab_file": { + "ernie-1.0": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt", + "ernie-2.0-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "ernie-2.0-en-finetuned-squad": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "ernie-2.0-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_large/vocab.txt", + }, + } + base_model_prefix = "ernie" + + def init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.ernie.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class FasterErnieModel(FasterErniePretrainedModel): + r""" + The bare ERNIE Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. 
+ Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `ErnieModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer for initializing all weight matrices. + Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieModel`. + + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + + """ + + def __init__( + self, + vocab_size, + vocab_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + pad_token_id=0, + do_lower_case=True, + is_split_into_words=False, + max_seq_len=512, + ): + super(FasterErnieModel, self).__init__() + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the " + "vocabulary from a pretrained model please use " + "`model = FasterErnieModel.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file) + self.max_seq_len = max_seq_len + + self.tokenizer = FasterTokenizer( + self.vocab, do_lower_case=self.do_lower_case, is_split_into_words=is_split_into_words + ) + self.pad_token_id = pad_token_id + self.initializer_range = initializer_range + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0.0, std=self.initializer_range)) + self.embeddings = ErnieEmbeddings( + vocab_size, + hidden_size, + hidden_dropout_prob, + max_position_embeddings, + type_vocab_size, + pad_token_id, + weight_attr, + ) + # Avoid import error in global scope when using paddle <= 2.2.0, therefore + # import FusedTransformerEncoderLayer in local scope. + # FusedTransformerEncoderLayer is supported by paddlepaddle since 2.2.0, please + # ensure the version >= 2.2.0 + from paddle.incubate.nn import FusedTransformerEncoderLayer + + encoder_layer = FusedTransformerEncoderLayer( + hidden_size, + num_attention_heads, + intermediate_size, + dropout_rate=hidden_dropout_prob, + activation=hidden_act, + attn_dropout_rate=attention_probs_dropout_prob, + act_dropout_rate=0, + weight_attr=weight_attr, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, num_hidden_layers) + self.pooler = ErniePooler(hidden_size, weight_attr) + self.apply(self.init_weights) + + def forward(self, text, text_pair=None): + input_ids, token_type_ids = self.tokenizer(text=text, text_pair=text_pair, max_seq_len=self.max_seq_len) + + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + embedding_output = self.embeddings(input_ids=input_ids, token_type_ids=token_type_ids) + encoder_outputs = self.encoder(embedding_output, attention_mask) + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return sequence_output, pooled_output + + +class FasterErnieForSequenceClassification(FasterErniePretrainedModel): + def __init__(self, ernie, num_classes=2, dropout=None): + super(FasterErnieForSequenceClassification, self).__init__() + self.num_classes = num_classes + self.ernie = ernie # allow ernie to be config + self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.config["hidden_dropout_prob"]) + self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_classes) + self.apply(self.init_weights) + + def forward(self, text, text_pair=None): + + _, pooled_output = self.ernie(text, text_pair) + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + predictions = paddle.argmax(logits, axis=-1) + return logits, predictions + + +class FasterErnieForTokenClassification(FasterErniePretrainedModel): + def __init__(self, ernie, num_classes=2, dropout=None): + super(FasterErnieForTokenClassification, self).__init__() + self.num_classes = num_classes + self.ernie = ernie # allow ernie to be config + self.dropout = nn.Dropout(dropout if dropout is not None else self.ernie.config["hidden_dropout_prob"]) + self.classifier = nn.Linear(self.ernie.config["hidden_size"], num_classes) + self.apply(self.init_weights) + + def forward(self, text, text_pair=None): + + sequence_output, _ = self.ernie(text, text_pair) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + predictions = paddle.argmax(logits, 
axis=-1) + return logits, predictions diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/faster_tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/faster_tokenizer.py new file mode 100644 index 000000000..051cf218f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/faster_tokenizer.py @@ -0,0 +1,152 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib + +import paddle +import paddle.nn as nn +from paddle.common_ops_import import LayerHelper +from paddle.framework import core + +from paddlenlp.transformers import BertTokenizer, ErnieTokenizer, RobertaTokenizer +from paddlenlp.transformers.ppminilm.tokenizer import PPMiniLMTokenizer +from paddlenlp.utils.log import logger + +__all__ = ["to_tensor", "to_vocab_buffer", "FasterTokenizer"] + + +def to_tensor(string_values, name="text"): + """ + Create the tensor that the value holds the list of string. + NOTICE: The value will be holded in the cpu place. + + Args: + string_values(list[string]): The value will be setted to the tensor. + name(string): The name of the tensor. + """ + tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name, core.VarDesc.VarType.STRINGS, False) + tensor.value().set_string_list(string_values) + return tensor + + +def to_vocab_buffer(vocab_dict, name): + """ + Create the tensor that the value holds the map, the type of key is the string. + NOTICE: The value will be holded in the cpu place. + + Args: + vocab_dict(dict): The value will be setted to the tensor. + The key is token and the value is the token index. + name(string): The name of the tensor. + """ + tensor = paddle.Tensor(core.VarDesc.VarType.RAW, [], name, core.VarDesc.VarType.VOCAB, True) + tensor.value().set_vocab(vocab_dict) + return tensor + + +class FasterTokenizer(nn.Layer): + name_map = { + "bert-base-uncased": BertTokenizer, + "bert-large-uncased": BertTokenizer, + "bert-base-cased": BertTokenizer, + "bert-large-cased": BertTokenizer, + "bert-base-multilingual-uncased": BertTokenizer, + "bert-base-multilingual-cased": BertTokenizer, + "bert-base-chinese": BertTokenizer, + "bert-wwm-chinese": BertTokenizer, + "bert-wwm-ext-chinese": BertTokenizer, + "ernie-1.0": ErnieTokenizer, + "ernie-2.0-en": ErnieTokenizer, + "ernie-2.0-large-en": ErnieTokenizer, + "roberta-wwm-ext": RobertaTokenizer, + "roberta-wwm-ext-large": RobertaTokenizer, + "rbt3": RobertaTokenizer, + "rbtl3": RobertaTokenizer, + "ppminilm-6l-768h": PPMiniLMTokenizer, + } + + def __init__(self, vocab, do_lower_case=False, is_split_into_words=False): + super(FasterTokenizer, self).__init__() + + try: + self.mod = importlib.import_module("paddle._C_ops") + except Exception: + logger.warning( + "The paddlepaddle version is {paddle.__version__}, not the latest. Please upgrade the paddlepaddle package (>= 2.2.1)." 
+ ) + self.mod = importlib.import_module("paddle.framework.core.ops") + + vocab_buffer = to_vocab_buffer(vocab, "vocab") + self.register_buffer("vocab", vocab_buffer, persistable=True) + + self.do_lower_case = do_lower_case + self.is_split_into_words = is_split_into_words + + def forward(self, text, text_pair=None, max_seq_len=0, pad_to_max_seq_len=False): + if paddle.in_dynamic_mode(): + if isinstance(text, list) or isinstance(text, tuple): + text = to_tensor(list(text)) + if text_pair is not None: + if isinstance(text_pair, list) or isinstance(text_pair, tuple): + text_pair = to_tensor(list(text_pair)) + input_ids, seg_ids = self.mod.faster_tokenizer( + self.vocab, + text, + text_pair, + "do_lower_case", + self.do_lower_case, + "max_seq_len", + max_seq_len, + "pad_to_max_seq_len", + pad_to_max_seq_len, + "is_split_into_words", + self.is_split_into_words, + ) + + return input_ids, seg_ids + + attrs = { + "do_lower_case": self.do_lower_case, + "max_seq_len": max_seq_len, + "pad_to_max_seq_len": pad_to_max_seq_len, + "is_split_into_words": self.is_split_into_words, + } + helper = LayerHelper("faster_tokenizer") + input_ids = helper.create_variable_for_type_inference(dtype="int64") + seg_ids = helper.create_variable_for_type_inference(dtype="int64") + if text_pair is None: + helper.append_op( + type="faster_tokenizer", + inputs={"Vocab": self.vocab, "Text": text}, + outputs={"InputIds": input_ids, "SegmentIds": seg_ids}, + attrs=attrs, + ) + else: + helper.append_op( + type="faster_tokenizer", + inputs={"Vocab": self.vocab, "Text": text, "TextPair": text_pair}, + outputs={"InputIds": input_ids, "SegmentIds": seg_ids}, + attrs=attrs, + ) + return input_ids, seg_ids + + @classmethod + def from_pretrained(cls, name): + if name in cls.name_map: + tokenizer_cls = cls.name_map[name] + tokenizer = tokenizer_cls.from_pretrained(name) + faster_tokenizer = cls(tokenizer.vocab.token_to_idx, tokenizer.do_lower_case) + return faster_tokenizer + else: + raise ValueError("Unknown name %s. Now %s surports %s" % (name, cls.__name__, list(cls.name_map.keys()))) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/model_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/model_utils.py new file mode 100644 index 000000000..b187bb370 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/model_utils.py @@ -0,0 +1,427 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
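# A minimal, illustrative usage sketch of the experimental FasterErnie stack added in this
# patch (the model name comes from `pretrained_init_configuration`; the import path assumes
# the classes are re-exported from `paddlenlp.experimental`, everything else is a placeholder):
#
#     from paddlenlp.experimental import FasterErnieForSequenceClassification
#
#     model = FasterErnieForSequenceClassification.from_pretrained(
#         "ernie-2.0-en", num_classes=2, max_seq_len=128
#     )
#     # tokenization is fused into the forward pass, so raw strings go straight in
#     logits, predictions = model(["an example sentence to classify"])
#     model.to_static("./export/inference")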
+ +import copy +import inspect +import io +import json +import os +from shutil import copyfile + +import numpy as np +import paddle +from paddle.framework import core + +from paddlenlp.transformers import PretrainedModel +from paddlenlp.utils.download import resolve_file_path + +# TODO(fangzeyang) Temporary fix and replace by paddle framework downloader later +from paddlenlp.utils.log import logger + +__all__ = ["FasterPretrainedModel", "ActScalesLoader", "WeightScalesLoader"] + + +def load_vocabulary(filepath): + token_to_idx = {} + with io.open(filepath, "r", encoding="utf-8") as f: + for index, line in enumerate(f): + token = line.rstrip("\n") + token_to_idx[token] = int(index) + return token_to_idx + + +class FasterPretrainedModel(PretrainedModel): + def to_static(self, output_path): + self.eval() + + # Convert to static graph with specific input description + model = paddle.jit.to_static( + self, input_spec=[paddle.static.InputSpec(shape=[None, None], dtype=core.VarDesc.VarType.STRINGS)] + ) + paddle.jit.save(model, output_path) + logger.info("Already save the static model to the path %s" % output_path) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + Creates an instance of `PretrainedModel`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of a built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains model weights file("model_state.pdparams") + and model config file ("model_config.json"). + *args (tuple): Position arguments for model `__init__`. If provided, + use these as position argument values for model initialization. + **kwargs (dict): Keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for model + initialization. If the keyword is in `__init__` argument names of + base model, update argument values of the base model; else update + argument values of derived model. + + Returns: + PretrainedModel: An instance of `PretrainedModel`. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import BertForSequenceClassification + + # Name of built-in pretrained model + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + + # Name of community-contributed pretrained model + model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned') + + # Load from local directory path + model = BertForSequenceClassification.from_pretrained('./my_bert/') + """ + pretrained_models = list(cls.pretrained_init_configuration.keys()) + resource_files = {} + init_configuration = {} + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + + # From built-in pretrained models + if pretrained_model_name_or_path in pretrained_models: + for file_id, map_list in cls.pretrained_resource_files_map.items(): + resource_files[file_id] = map_list[pretrained_model_name_or_path] + init_configuration = copy.deepcopy(cls.pretrained_init_configuration[pretrained_model_name_or_path]) + # From local dir path + elif os.path.isdir(pretrained_model_name_or_path): + for file_id, file_name in cls.resource_files_names.items(): + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if os.path.isfile(full_file_name): + resource_files[file_id] = full_file_name + resource_files["model_config_file"] = os.path.join(pretrained_model_name_or_path, cls.model_config_file) + else: + for file_id, file_name in cls.resource_files_names.items(): + resource_files[file_id] = file_name + + # default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path) + resolved_resource_files = {} + for file_id, file_path in resource_files.items(): + if file_path is None or os.path.isfile(file_path): + resolved_resource_files[file_id] = file_path + continue + resolved_resource_files[file_id] = resolve_file_path( + pretrained_model_name_or_path, + [file_path], + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + + # Prepare model initialization kwargs + # Did we saved some inputs and kwargs to reload ? + model_config_file = resolved_resource_files.pop("model_config_file", None) + if model_config_file is not None: + with io.open(model_config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + else: + init_kwargs = init_configuration + + # position args are stored in kwargs, maybe better not include + init_args = init_kwargs.pop("init_args", ()) + # class name corresponds to this configuration + init_class = init_kwargs.pop("init_class", cls.base_model_class.__name__) + # Check if the loaded config matches the current model class's __init__ + # arguments. If not match, the loaded config is for the base model class. 
+ if init_class == cls.base_model_class.__name__: + base_args = init_args + base_kwargs = init_kwargs + derived_args = () + derived_kwargs = {} + base_arg_index = None + else: # extract config for base model + derived_args = list(init_args) + derived_kwargs = init_kwargs + base_arg = None + for i, arg in enumerate(init_args): + if isinstance(arg, dict) and "init_class" in arg: + assert arg.pop("init_class") == cls.base_model_class.__name__, ( + "pretrained base model should be {}" + ).format(cls.base_model_class.__name__) + base_arg_index = i + base_arg = arg + break + for arg_name, arg in init_kwargs.items(): + if isinstance(arg, dict) and "init_class" in arg: + assert arg.pop("init_class") == cls.base_model_class.__name__, ( + "pretrained base model should be {}" + ).format(cls.base_model_class.__name__) + base_arg_index = arg_name + base_arg = arg + break + + base_args = base_arg.pop("init_args", ()) + base_kwargs = base_arg + if cls == cls.base_model_class: + # Update with newly provided args and kwargs for base model + base_args = base_args if not args else args + base_kwargs.update(kwargs) + vocab_file = resolved_resource_files.pop("vocab_file", None) + if vocab_file and base_kwargs.get("vocab_file", None) is None: + base_kwargs["vocab_file"] = vocab_file + assert base_kwargs.get("vocab_file", None) is not None, "The vocab " + f"file is None. Please reload the class {cls.__name__} with pretrained_name." + + model = cls(*base_args, **base_kwargs) + else: + # Update with newly provided args and kwargs for derived model + base_parameters_dict = inspect.signature(cls.base_model_class.__init__).parameters + for k, v in kwargs.items(): + if k in base_parameters_dict: + base_kwargs[k] = v + + vocab_file = resolved_resource_files.pop("vocab_file", None) + if vocab_file and base_kwargs.get("vocab_file", None) is None: + base_kwargs["vocab_file"] = vocab_file + assert base_kwargs.get("vocab_file", None) is not None, "The vocab " + f"file is None. Please reload the class {cls.__name__} with pretrained_name." + + base_model = cls.base_model_class(*base_args, **base_kwargs) + if base_arg_index is not None: + derived_args[base_arg_index] = base_model + else: + derived_args = (base_model,) # assume at the first position + derived_args = derived_args if not args else args + derived_parameters_dict = inspect.signature(cls.__init__).parameters + for k, v in kwargs.items(): + if k in derived_parameters_dict: + derived_kwargs[k] = v + model = cls(*derived_args, **derived_kwargs) + + # Maybe need more ways to load resources. + weight_path = resolved_resource_files["model_state"] + assert weight_path.endswith(".pdparams"), "suffix of weight must be .pdparams" + + state_dict = paddle.load(weight_path) + logger.info("Loaded parameters from %s" % weight_path) + + # Make sure we are able to load base models as well as derived models + # (with heads) + start_prefix = "" + model_to_load = model + state_to_load = state_dict + unexpected_keys = [] + missing_keys = [] + if not hasattr(model, cls.base_model_prefix) and any( + s.startswith(cls.base_model_prefix) for s in state_dict.keys() + ): + # base model + state_to_load = {} + start_prefix = cls.base_model_prefix + "." 
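+ # Drop the base-model prefix so the remaining keys match the base model's own parameter
+ # names, e.g. (illustrative) "bert.pooler.dense.weight" -> "pooler.dense.weight";
+ # keys that do not carry the prefix are collected as unexpected.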
+ for k, v in state_dict.items(): + if k.startswith(cls.base_model_prefix): + state_to_load[k[len(start_prefix) :]] = v + else: + unexpected_keys.append(k) + if hasattr(model, cls.base_model_prefix) and not any( + s.startswith(cls.base_model_prefix) for s in state_dict.keys() + ): + # derived model (base model with heads) + model_to_load = getattr(model, cls.base_model_prefix) + for k in model.state_dict().keys(): + if not k.startswith(cls.base_model_prefix): + missing_keys.append(k) + if len(missing_keys) > 0: + logger.info( + "Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys + ) + ) + if len(unexpected_keys) > 0: + logger.info( + "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys) + ) + if paddle.in_dynamic_mode(): + model_to_load.set_state_dict(state_to_load) + return model + return model, state_to_load + + @staticmethod + def load_vocabulary(filepath): + token_to_idx = {} + with io.open(filepath, "r", encoding="utf-8") as f: + for index, line in enumerate(f): + token = line.rstrip("\n") + token_to_idx[token] = int(index) + return token_to_idx + + def save_pretrained(self, save_dir): + """ + Saves model configuration and related resources (model state) as files + under `save_dir`. The model configuration would be saved into a file named + "model_config.json", and model state would be saved into a file + named "model_state.pdparams". + + The `save_dir` can be used in `from_pretrained` as argument value + of `pretrained_model_name_or_path` to re-load the trained model. + + Args: + save_dir (str): Directory to save files into. + + Example: + .. code-block:: + + from paddlenlp.transformers import BertForSequenceClassification + + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + model.save_pretrained('./trained_model/') + # reload from save_directory + model = BertForSequenceClassification.from_pretrained('./trained_model/') + """ + assert not os.path.isfile(save_dir), "Saving directory ({}) should be a directory, not a file".format(save_dir) + os.makedirs(save_dir, exist_ok=True) + # Save model config + self.save_model_config(save_dir) + # Save model + if paddle.in_dynamic_mode(): + file_name = os.path.join(save_dir, list(self.resource_files_names.values())[0]) + paddle.save(self.state_dict(), file_name) + else: + logger.warning("Save pretrained model only supported dygraph mode for now!") + # Save resources file + self.save_resources(save_dir) + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to `resource_files_names` indicating + files under `save_directory` by copying directly. Override it if necessary. + + Args: + save_directory (str): Directory to save files into. 
+ """ + for name, file_name in self.resource_files_names.items(): + src_path = self.init_config["init_args"][0].get(name, None) + dst_path = os.path.join(save_directory, file_name) + if src_path and os.path.abspath(src_path) != os.path.abspath(dst_path): + copyfile(src_path, dst_path) + + +class ActScalesLoader: + def __init__( + self, + scale_json_file_path="act_scales.json", + key_map_dict=None, + num_of_layers=None, + ): + with open(scale_json_file_path) as json_file: + self.scale_dict = json.load(json_file) + self.key_map = key_map_dict + self.scale = {} + for scale_type, key_template in self.key_map.items(): + self.scale[scale_type] = np.full([num_of_layers], fill_value=-1.0) + for i in range(num_of_layers): + if key_template.replace("#", str(i)) in self.scale_dict.keys(): + self.scale[scale_type][i] = 1 / self.scale_dict[key_template.replace("#", str(i))] + + +class WeightScalesLoader: + def __init__( + self, + scale_json_file_path="weight_scales.json", + key_map_dict=None, + num_of_layers=None, + concat_qkv=False, + concat_ffn1=False, + ): + with open(scale_json_file_path) as json_file: + self.scale_dict = json.load(json_file) + self.key_map = key_map_dict + self.scale = {} + for scale_type, key_template in self.key_map.items(): + no_skip_layer_list = [] + n = 1 + for i in range(num_of_layers): + if key_template.replace("#", str(i)) in self.scale_dict.keys(): + no_skip_layer_list.append(key_template.replace("#", str(i))) + if len(no_skip_layer_list) > 0: + n = len(self.scale_dict[no_skip_layer_list[0]]) + self.scale[scale_type] = np.full([num_of_layers, n], fill_value=-1.0, dtype="float32") + for i in range(num_of_layers): + if key_template.replace("#", str(i)) in self.scale_dict.keys(): + self.scale[scale_type][i, :] = self.scale_dict[key_template.replace("#", str(i))] + + # concat qkv and ffn1 + if concat_qkv: + self.scale["qkv_weight_scale"] = [] + + if concat_ffn1: + self.scale["ffn1_weight_scale"] = [] + + for i in range(num_of_layers): + if concat_qkv: + self.scale["qkv_weight_scale"].append( + np.concatenate( + [ + self.scale["q_weight_scale"][i, :], + self.scale["k_weight_scale"][i, :], + self.scale["v_weight_scale"][i, :], + ] + ) + ) + + if concat_ffn1: + self.scale["ffn1_weight_scale"].append( + np.concatenate([self.scale["ffn1_1_weight_scale"][i, :], self.scale["ffn1_2_weight_scale"][i, :]]) + ) + + +class CacheScaleLoader: + def __init__( + self, + scale_json_file_path="cache_scales.json", + key_map_dict=None, + num_of_layers=None, + num_heads=None, + num_key_value_heads=None, + ): + with open(scale_json_file_path) as json_file: + self.scale_dict = json.load(json_file) + self.key_map = key_map_dict + self.scale = {} + for scale_type, key_template in self.key_map.items(): + if "cache_k" in scale_type: + scale_type_out = "cache_k_out_scale" + else: + scale_type_out = "cache_v_out_scale" + self.scale[scale_type] = np.full([num_of_layers, num_key_value_heads], fill_value=-1.0) + self.scale[scale_type_out] = np.full([num_of_layers, num_key_value_heads], fill_value=-1.0) + + for i in range(num_of_layers): + if key_template.replace("#", str(i)) in self.scale_dict.keys(): + if num_heads != num_key_value_heads: + self.scale[scale_type][i, :] = [ + 127.0 / self.scale_dict[key_template.replace("#", str(i))][j] + for j in range(0, num_heads, num_heads // num_key_value_heads) + ] + else: + self.scale[scale_type][i, :] = [ + 127.0 / self.scale_dict[key_template.replace("#", str(i))][j] + for j in range(0, num_key_value_heads) + ] + self.scale[scale_type_out][i, :] = [ + 1.0 / 
self.scale[scale_type][i, j] for j in range(0, num_key_value_heads) + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/__init__.py new file mode 100644 index 000000000..cb5e927e9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bloom import * +from .chatglm import * +from .chatglm_v2 import * +from .fused_transformer_layers import * +from .gpt import * +from .llama import * +from .opt import * +from .qwen import * +from .qwen2 import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/modeling.py new file mode 100644 index 000000000..ba3a1950c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/bloom/modeling.py @@ -0,0 +1,768 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
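+# Inference-oriented Bloom implementation: the decoder stack is built from the fused
+# transformer layers imported below, with optional weight-only int8/int4 quantization
+# and a block-attention variant (BloomBlockInferenceModel).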
+from __future__ import annotations + +from typing import Tuple, Union + +import paddle +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedBlockMultiTransformer, + FusedBlockMultiTransformerWeightOnly, + FusedMultiTransformerBase, + FusedMultiTransformerConfig, + FusedMultiTransformerWeightOnly, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationBlockInferenceModel, + GenerationInferenceModel, +) +from paddlenlp.transformers.bloom.modeling import BloomPreTrainedModel +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) + +__all__ = [ + "BloomModelInferenceModel", + "BloomForCausalLMInferenceModel", + "BloomBlockInferenceModel", + "BlommForCausalBlockLMInferenceModel", +] + + +def parallel_matmul(x: Tensor, y: Tensor, parallel_output=True): + is_fleet_init = True + world_size = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + world_size = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + if is_fleet_init and world_size > 1: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=True) + if parallel_output: + return logits + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + else: + logits = paddle.matmul(x, y, transpose_y=True) + return logits + + +@register_base_model +class BloomModelInferenceModel(BloomPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.padding_idx = 0 + + self.embed_dim = config.hidden_size + self.n_head = config.n_head + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + + if self.use_weight_only: + assert ( + self.quant_algo == "weight_only_int8" or self.quant_algo == "weight_only_int4" + ), "Expected quant_algo equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format( + self.quant_algo + ) + + # Embedding + LN Embedding + if config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) + else: + self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + + self.word_embeddings_layernorm = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + + # get ring_id + ring_id = -1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + ring_id = model_parallel_group.id + except: + pass + + # Transformer blocks + ln_scale_attrs = [paddle.ParamAttr(name="fusemt.{}.ln_scale".format(i)) for i in range(config.n_layer)] + ln_bias_attrs = 
[paddle.ParamAttr(name="fusemt.{}.ln_bias".format(i)) for i in range(config.n_layer)] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.n_layer) + ] + qkv_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.qkv_bias".format(i)) for i in range(config.n_layer)] + linear_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.linear_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.n_layer) + ] + linear_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.linear_bias".format(i)) for i in range(config.n_layer)] + ffn_ln_scale_attrs = [paddle.ParamAttr(name="fusemt.{}.ffn_ln_scale".format(i)) for i in range(config.n_layer)] + ffn_ln_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.ffn_ln_bias".format(i)) for i in range(config.n_layer)] + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.ffn1_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.n_layer) + ] + ffn1_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.ffn1_bias".format(i)) for i in range(config.n_layer)] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.ffn2_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.n_layer) + ] + ffn2_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.ffn2_bias".format(i)) for i in range(config.n_layer)] + qkv_weight_scale_attrs = None + linear_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.qkv_weight_scale".format(i)) for i in range(config.n_layer) + ] + linear_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.linear_weight_scale".format(i)) for i in range(config.n_layer) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn1_weight_scale".format(i)) for i in range(config.n_layer) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn2_weight_scale".format(i)) for i in range(config.n_layer) + ] + + transformer_config = FusedMultiTransformerConfig( + self.embed_dim, + self.n_head, + 4 * self.embed_dim, + quant_type=config.quant_type, + activation="gelu", + num_layers=config.n_layer, + nranks=config.tensor_parallel_degree, + ring_id=ring_id, + ln_scale_attrs=ln_scale_attrs, + ln_bias_attrs=ln_bias_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + qkv_bias_attrs=qkv_bias_attrs, + linear_weight_attrs=linear_weight_attrs, + linear_weight_scale_attrs=linear_weight_scale_attrs, + linear_bias_attrs=linear_bias_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn_ln_bias_attrs=ffn_ln_bias_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn1_bias_attrs=ffn1_bias_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + ffn2_bias_attrs=ffn2_bias_attrs, + ) + + self.set_transformer_block(transformer_config) + + self.cache_kvs = [] + + # Final Layer Norm + self.ln_f = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + + def set_transformer_block(self, transformer_config): + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnly(transformer_config) + else: + self.transformer_block = FusedMultiTransformerBase(transformer_config) + + def get_input_embeddings(self): + return 
self.word_embeddings + + def set_input_embeddings(self, new_embeddings: Tensor): + self.word_embeddings = new_embeddings + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset + + ids_remove_padding, cum_offsets, padding_offset = get_padding_offset( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + return_dict=None, + **kwargs, + ) -> Union[Tuple[Tensor], BaseModelOutputWithPastAndCrossAttentions]: + # past_key_values = kwargs.get("cache", past_key_values) + # is_decoder = past_key_values is not None + is_decoder = cache is not None + seq_len = seq_len_decoder if is_decoder else seq_len_encoder + if not is_decoder: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len) + else: + ids_remove_padding = input_ids + padding_offset = None + cum_offsets = None + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(ids_remove_padding) + + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + position_offset = 0 + if not is_decoder and pre_caches is not None: + position_offset = 128 + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + src=hidden_states, + input_ids=input_ids, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + pre_caches=pre_caches, + pre_caches_length=position_offset, + seq_lens=seq_len, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states) + + @paddle.no_grad() + def set_state_dict(self, state_dict, use_structured_name=True): + for k, v in state_dict.items(): + if k.find("word_embeddings.weight") >= 0: + self.word_embeddings.weight.set_value(paddle.to_tensor(v)) + elif k.find("word_embeddings_layernorm.weight") >= 0: + self.word_embeddings_layernorm.weight.set_value(paddle.to_tensor(v)) + elif k.find("word_embeddings_layernorm.bias") >= 0: + self.word_embeddings_layernorm.bias.set_value(paddle.to_tensor(v)) + elif k.find("ln_f.weight") >= 0: + self.ln_f.weight.set_value(paddle.to_tensor(v)) + elif k.find("ln_f.bias") >= 0: + self.ln_f.bias.set_value(paddle.to_tensor(v)) + else: + # transformer block weights + splits = k.split(".") + idx = int(splits[1]) if splits[1].isdigit() else int(splits[2]) + + if k.endswith("input_layernorm.weight"): + self.transformer_block.ln_scales[idx].set_value(paddle.to_tensor(v).astype("float32")) + elif 
k.endswith("input_layernorm.bias"): + self.transformer_block.ln_biases[idx].set_value(paddle.to_tensor(v).astype("float32")) + elif k.endswith("self_attention.query_key_value.weight"): + qkv_weight_tensor = ( + v.reshape( + [ + self.embed_dim, + self.n_head // self.config.tensor_parallel_degree, + 3, + self.embed_dim // self.n_head, + ] + ) + .transpose([2, 1, 3, 0]) + .reshape([-1, self.embed_dim]) + ) + + if self.use_weight_only: + qkv_weight_tensor = paddle.transpose(qkv_weight_tensor, perm=[1, 0]) + qkv_quanted_weight_tensor, qkv_weight_scale_tensor = weight_quantize( + qkv_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight_tensor) + self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale_tensor) + else: + self.transformer_block.qkv_weights[idx].set_value(qkv_weight_tensor) + elif k.endswith("self_attention.query_key_value.bias"): + v = ( + v.reshape( + [ + self.n_head // self.config.tensor_parallel_degree, + 3, + self.embed_dim // self.n_head, + ] + ) + .transpose([1, 0, 2]) + .reshape([-1]) + ) + self.transformer_block.qkv_biases[idx].set_value(paddle.to_tensor(v)) + elif k.endswith("self_attention.dense.weight"): + linear_weight_tensor = paddle.to_tensor(v) + if self.use_weight_only: + linear_quanted_weight_tensor, linear_weight_scale_tensor = weight_quantize( + linear_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight_tensor) + self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale_tensor) + else: + self.transformer_block.linear_weights[idx].set_value(linear_weight_tensor) + elif k.endswith("self_attention.dense.bias"): + self.transformer_block.linear_biases[idx].set_value(paddle.to_tensor(v)) + elif k.endswith("post_attention_layernorm.weight"): + self.transformer_block.ffn_ln_scales[idx].set_value(paddle.to_tensor(v).astype("float32")) + elif k.endswith("post_attention_layernorm.bias"): + self.transformer_block.ffn_ln_biases[idx].set_value(paddle.to_tensor(v).astype("float32")) + elif k.endswith("mlp.dense_h_to_4h.weight"): + ffn1_weight_tensor = paddle.to_tensor(v) + if self.use_weight_only: + ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize( + ffn1_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight_tensor) + self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale_tensor) + else: + self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight_tensor) + elif k.endswith("mlp.dense_h_to_4h.bias"): + self.transformer_block.ffn1_biases[idx].set_value(paddle.to_tensor(v)) + elif k.endswith("mlp.dense_4h_to_h.weight"): + ffn2_weight_tensor = paddle.to_tensor(v) + if self.use_weight_only: + ffn2_quanted_weight_tensor, ffn2_weight_scale_tensor = weight_quantize( + ffn2_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight_tensor) + self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale_tensor) + else: + self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight_tensor) + + elif k.endswith("mlp.dense_4h_to_h.bias"): + self.transformer_block.ffn2_biases[idx].set_value(paddle.to_tensor(v)) + else: + raise ValueError("Unknow weight {}".format(k)) + + +class BloomLMHead(nn.Layer): + def __init__(self, config, embedding_weights=None): + super(BloomLMHead, self).__init__() + self.decoder_weight = ( + self.create_parameter( + 
shape=[config.vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + is_bias=True, + ) + if embedding_weights is None + else embedding_weights + ) + self.config = config + + def forward(self, hidden_states): + logits = parallel_matmul(hidden_states, self.decoder_weight, parallel_output=False) + return logits + + +class BloomPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for GPT. + It calculates the final loss. + """ + + def __init__(self, pad_token_id=None, tensor_parallel_degree=1, tensor_parallel_output=False): + super(BloomPretrainingCriterion, self).__init__() + if tensor_parallel_degree > 1 and tensor_parallel_output: + self.loss_func = fleet.meta_parallel.ParallelCrossEntropy() + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") + self.pad_token_id = pad_token_id + + def forward(self, prediction_scores, masked_lm_labels, loss_mask=None): + masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) + with paddle.amp.auto_cast(False): + masked_lm_loss = masked_lm_loss.astype("float32") + if loss_mask is not None: + loss_mask = loss_mask.reshape([-1]) + masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) + loss = masked_lm_loss / loss_mask.sum() + else: + assert self.pad_token_id is not None + masked_lm_loss = masked_lm_loss[masked_lm_labels != self.pad_token_id] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class BloomForCausalLMInferenceModel(GenerationInferenceModel, BloomPreTrainedModel): + _keys_to_ignore_on_load_missing = [ + r"h.*.self_attention.scale_mask_softmax.causal_mask", + r"lm_head.weight", + ] + + def __init__(self, config): + super().__init__(config) + self.bloom = BloomModelInferenceModel(config) + self.lm_head = BloomLMHead(config, self.bloom.word_embeddings.weight) + self.criterion = BloomPretrainingCriterion( + pad_token_id=config.pad_token_id, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_output=True, + ) + + @classmethod + def get_cache_kvs_shape(cls, config, max_batch_size=None, max_length=None) -> list[list[int]]: + """get cache_kvs tensor for llama model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. 
+ + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = 2048 + + cache_kvs = [] + for _ in range(config.n_layer): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_attention_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, cache_kvs, tgt_ids, tgt_generation_mask, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + pre_caches = kwargs.get("pre_caches", None) + seq_len_encoder = kwargs.get("seq_len_encoder", None) + seq_len_decoder = kwargs.get("seq_len_decoder", None) + cache = kwargs.get("cache", None) + if cache is not None: + input_ids = tgt_ids + attention_mask = tgt_generation_mask + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "cache_kvs": cache_kvs, + "cache": cache, + "pre_caches": pre_caches, + "use_cache": True, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + } + + def forward( + self, + input_ids=None, + cache=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + cache_kvs=None, + pre_caches=None, + output_attentions=None, + output_hidden_states=None, + seq_len_encoder=None, + seq_len_decoder=None, + return_dict=None, + ) -> Union[Tuple[Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + transformer_outputs = self.bloom( + input_ids, + cache=cache, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_kvs=cache_kvs, + pre_caches=pre_caches, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return output + + return CausalLMOutputWithCrossAttentions(logits=lm_logits) + + @paddle.no_grad() + def set_state_dict(self, state_dict, use_structured_name=True): + self.lm_head.set_state_dict( + {k: state_dict[k] for k in state_dict.keys() if "lm_head" in k}, + use_structured_name, + ) + self.bloom.set_state_dict({k: state_dict[k] for k in state_dict.keys() if "bloom" in k}) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple[Tensor]], beam_idx: Tensor) -> Tuple[Tuple[Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. 
+ """ + return tuple(tuple(past_state.index_select(0, beam_idx) for past_state in layer_past) for layer_past in past) + + +@register_base_model +class BloomBlockInferenceModel(BloomModelInferenceModel): + def __init__(self, config): + super().__init__(config) + self.max_seq_len = config.max_seq_len + self.block_size = config.block_size + + def set_transformer_block(self, transformer_config): + if self.use_weight_only: + self.transformer_block = FusedBlockMultiTransformerWeightOnly(transformer_config) + else: + self.transformer_block = FusedBlockMultiTransformer(transformer_config) + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(self.max_seq_len - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset_v2 + + ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + caches=None, + pre_caches=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + + seq_lens_this_time = kwargs.get("seq_lens_this_time", None) + ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k = self.remove_padding( + input_ids, seq_lens_this_time + ) + kwargs["cu_seqlens_q"] = cu_seqlens_q + kwargs["cu_seqlens_k"] = cu_seqlens_k + kwargs["padding_offsets"] = padding_offset + kwargs["max_input_length"] = self.max_seq_len + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(ids_remove_padding) + + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids=input_ids, + src=hidden_states, + cum_offsets=cum_offsets, + attn_mask=attention_mask, + caches=caches, + pre_caches=pre_caches, + rotary_embs=None, + **kwargs, + ) + + hidden_states = self.ln_f(hidden_states) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=None, + hidden_states=None, + attentions=None, + ) + + +class BlommForCausalBlockLMInferenceModel(GenerationBlockInferenceModel, BloomPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.bloom = BloomBlockInferenceModel(config) + self.lm_head = BloomLMHead(config, self.bloom.word_embeddings.weight) + + @classmethod + def get_cache_kvs_shape(cls, config, max_batch_size: int = None, max_length: int = None): + + max_block_per_seq = (config.max_seq_len + config.block_size - 1) // config.block_size + if max_batch_size == -1: + max_block_nums = None + else: + max_block_nums = max_batch_size * max_block_per_seq + + cache_kvs = [] + for _ in range(config.n_layer): + cache_kv_shape = [ + max_block_nums, + config.n_head // max(config.tensor_parallel_degree, 1), + config.block_size, + config.hidden_size // config.n_head, + ] + cache_kvs.append(cache_kv_shape) + cache_kvs.append(cache_kv_shape) + return cache_kvs + + def prepare_inputs_for_generation(self, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + input_ids = kwargs["input_ids"] + src_mask = kwargs.get("src_mask", None) + + tgt_mask = kwargs.get("tgt_mask", None) + + block_tables = kwargs.get("block_tables", None) + + pre_caches = kwargs.get("pre_caches", None) + caches = kwargs.get("caches", None) + 
+ seq_lens_this_time = kwargs["seq_lens_this_time"] + + seq_lens_encoder = kwargs["seq_lens_encoder"] + seq_lens_decoder = kwargs["seq_lens_decoder"] + k_quant_scales = kwargs.get("k_quant_scales", None) + v_quant_scales = kwargs.get("v_quant_scales", None) + k_dequant_scales = kwargs.get("k_dequant_scales", None) + v_dequant_scales = kwargs.get("v_dequant_scales", None) + + # only slice a part of src_mask, because of phi::FlashAttnUnpaddedKernel. + valid_max_encoder_len = paddle.max(seq_lens_encoder) + src_mask = src_mask[:, :, :valid_max_encoder_len, :valid_max_encoder_len] + + model_inputs = { + "input_ids": input_ids, + "src_mask": src_mask, + "tgt_mask": tgt_mask, + "rope_emb": None, + "pre_caches": pre_caches, + "caches": caches, + "seq_lens_this_time": seq_lens_this_time, + "seq_lens_encoder": seq_lens_encoder, + "seq_lens_decoder": seq_lens_decoder, + "block_tables": block_tables, + "k_quant_scales": k_quant_scales, + "v_quant_scales": v_quant_scales, + "k_dequant_scales": k_dequant_scales, + "v_dequant_scales": v_dequant_scales, + } + return model_inputs + + def forward( + self, + input_ids, + src_mask=None, + tgt_mask=None, + pre_caches=None, + caches=None, + seq_lens_this_time=None, + seq_lens_encoder=None, + seq_lens_decoder=None, + rope_emb=None, + block_tables=None, + k_quant_scales=None, + v_quant_scales=None, + k_dequant_scales=None, + v_dequant_scales=None, + ): + outputs = self.bloom( + input_ids, + attention_mask=src_mask, + tgt_mask=tgt_mask, + caches=caches, + # bloom does not have rope_emb! + rope_emb=None, + block_tables=block_tables, + pre_caches=pre_caches, + seq_lens_this_time=seq_lens_this_time, + seq_lens_encoder=seq_lens_encoder, + seq_lens_decoder=seq_lens_decoder, + k_quant_scales=k_quant_scales, + v_quant_scales=v_quant_scales, + k_dequant_scales=k_dequant_scales, + v_dequant_scales=v_dequant_scales, + ) + + hidden_states = outputs[0] + + output = self.lm_head(hidden_states) + + return output + + @paddle.no_grad() + def set_state_dict(self, state_dict): + self.bloom.set_state_dict(state_dict) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/modeling.py new file mode 100644 index 000000000..dc46aa602 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm/modeling.py @@ -0,0 +1,745 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed import fleet +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedMultiTransformerConfig, + FusedMultiTransformerPostLayernorm, + FusedMultiTransformerWeightOnlyPostLayernorm, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationInferenceModel, +) +from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained +from paddlenlp.transformers import ChatGLMConfig, ChatGLMPretrainedModel +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithPast, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) + +__all__ = ["ChatGLMForCausalLMInferenceModel"] + + +def parallel_matmul(lm_output, logit_weights, parallel_output): + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + world_size = hcg.get_model_parallel_world_size() + + if world_size > 1: + # _c_identity is backwards is reduce + input_parallel = paddle.distributed.collective._c_identity(lm_output, group=model_parallel_group) + + logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True) + + if parallel_output: + return logits + + # _c_concat has not grad backwards + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + else: + logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) + return logits + + +class RotaryEmbeddingsDybatch(nn.Layer): + def __init__(self, hidden_size, base=10000.0, learnable=False): + super().__init__() + self.dtype = paddle.get_default_dtype() + inv_freq = 1.0 / (base ** (paddle.arange(0, hidden_size, 2).astype("float32") / hidden_size)) + inv_freq = inv_freq.astype(self.dtype) + self.learnable = learnable + if learnable: + self.inv_freq = nn.Parameter(inv_freq) + self.max_seq_len_cached = None + else: + self.register_buffer("inv_freq", inv_freq) + self.max_seq_len_cached = None + self.cos_cached = None + self.sin_cached = None + + def forward(self, seq_dim=1, seq_len=128): + # TODO: Remove the condition for converting to static graph. 
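+ # (Illustrative note) The commented-out check below is the original sequence-length cache
+ # guard, disabled per the TODO above so the layer can be converted to a static graph; the
+ # cos/sin tables are therefore rebuilt for the requested seq_len on every call and cached at the end.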
+ # if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached: + # self.max_seq_len_cached = None if self.learnable else seq_len + + t = paddle.arange(seq_len).astype(self.dtype) + # [s, h/n/2] + # TODO: Failed for fp16 when converting to static graph. + freqs = paddle.einsum("i,j->ij", t.astype("float32"), self.inv_freq.astype("float32")) + + freqs = freqs.astype(self.dtype) + # [s, h/n] + emb = paddle.concat([freqs, freqs], axis=-1) + + if self.dtype == paddle.bfloat16: + emb = emb.astype("float32") + # [s, 1, h/n] + cos_cached = emb.cos().unsqueeze(1) + sin_cached = emb.sin().unsqueeze(1) + + if self.dtype == paddle.bfloat16: + cos_cached = cos_cached.astype(self.dtype) + sin_cached = sin_cached.astype(self.dtype) + + if self.learnable: + return cos_cached, sin_cached + + self.cos_cached, self.sin_cached = cos_cached, sin_cached + + return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] + + +class ChatGLMStackDyBatch(nn.Layer): + """ + GLM Transformer + """ + + def __init__(self, config: ChatGLMConfig): + super(ChatGLMStackDyBatch, self).__init__() + self.config = config + self.position_encoding_2d = config.position_encoding_2d + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + + self.config = config + self.current_rank = 0 + self.world_size = 1 + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + + try: + self.current_rank = paddle.distributed.get_rank() + self.world_size = paddle.distributed.get_world_size() + except Exception: + pass + + if self.config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + self.rotary_embeddings = RotaryEmbeddingsDybatch( + self.hidden_size // (self.num_attention_heads * 2) + if self.position_encoding_2d + else self.hidden_size // self.num_attention_heads, + base=10000.0, + ) + + # get ring_id + ring_id = -1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + ring_id = model_parallel_group.id + except: + pass + + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + ln_scale_attrs = [paddle.ParamAttr(name="fusemt.{}.ln_scale".format(i)) for i in range(config.num_layers)] + ln_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.ln_bias".format(i)) for i in range(config.num_layers)] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.num_layers) + ] + qkv_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.qkv_bias".format(i)) for i in range(config.num_layers)] + linear_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.linear_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.num_layers) + ] + linear_bias_attrs = [ + paddle.ParamAttr(name="fusemt.{}.linear_bias".format(i)) for i in range(config.num_layers) + ] + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn_ln_scale".format(i)) 
for i in range(config.num_layers) + ] + ffn_ln_bias_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn_ln_bias".format(i)) for i in range(config.num_layers) + ] + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.ffn1_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.num_layers) + ] + ffn1_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.ffn1_bias".format(i)) for i in range(config.num_layers)] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="fusemt.{}.ffn2_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.num_layers) + ] + ffn2_bias_attrs = [paddle.ParamAttr(name="fusemt.{}.ffn2_bias".format(i)) for i in range(config.num_layers)] + + qkv_weight_scale_attrs = None + linear_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.qkv_weight_scale".format(i)) for i in range(config.num_layers) + ] + linear_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.linear_weight_scale".format(i)) for i in range(config.num_layers) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn1_weight_scale".format(i)) for i in range(config.num_layers) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn2_weight_scale".format(i)) for i in range(config.num_layers) + ] + + alpha = (2 * self.config.num_hidden_layers) ** 0.5 + + transformer_config = FusedMultiTransformerConfig( + config.hidden_size, + config.num_attention_heads, + 4 * config.hidden_size, + quant_type=config.quant_type, + activation="gelu", + num_layers=config.num_layers, + nranks=config.tensor_parallel_degree, + ring_id=ring_id, + ln_scale_attrs=ln_scale_attrs, + ln_bias_attrs=ln_bias_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + qkv_bias_attrs=qkv_bias_attrs, + linear_weight_attrs=linear_weight_attrs, + linear_weight_scale_attrs=linear_weight_scale_attrs, + linear_bias_attrs=linear_bias_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn_ln_bias_attrs=ffn_ln_bias_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn1_bias_attrs=ffn1_bias_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + ffn2_bias_attrs=ffn2_bias_attrs, + trans_qkvw=True, + normalize_before=False, + residual_alpha=alpha, + norm_type="layernorm", + use_neox_rotary_style=True, + ) + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnlyPostLayernorm(transformer_config) + else: + self.transformer_block = FusedMultiTransformerPostLayernorm(transformer_config) + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset + + ids_remove_padding, cum_offsets, padding_offset = get_padding_offset( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + time_step=None, + **kwargs, + 
): + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + encode_seq_length = input_ids.shape[1] + seq_lens = seq_len_decoder if encode_seq_length == 1 else seq_len_encoder + + if encode_seq_length > 1: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + ids_remove_padding = input_ids + padding_offset = None + cum_offsets = None + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(ids_remove_padding) + + if cache is None: + cache = tuple([None] * self.config.num_layers) + + hidden_states = inputs_embeds + if attention_mask is None: + attention_mask = paddle.zeros([1, 1]).astype("int64") + + cos, sin = self.rotary_embeddings(seq_len=self.config.max_sequence_length + 1) + coses = [] + sines = [] + if self.position_encoding_2d: + block_position_ids = position_ids[:batch_size, 1, :].transpose([1, 0]) + position_ids = position_ids[:batch_size, 0, :].transpose([1, 0]) + coses.append(cos.squeeze(1)[position_ids].unsqueeze(2)) + sines.append(sin.squeeze(1)[position_ids].unsqueeze(2)) + + coses.append(cos.squeeze(1)[block_position_ids].unsqueeze(2)) + sines.append(sin.squeeze(1)[block_position_ids].unsqueeze(2)) + else: + position_ids = position_ids.transpose([1, 0]) + coses.append(cos.squeeze(1)[position_ids].unsqueeze(2)) + sines.append(sin.squeeze(1)[position_ids].unsqueeze(2)) + + position_cos = coses[0].transpose([1, 2, 0, 3]) + block_position_cos = coses[1].transpose([1, 2, 0, 3]) + + coses = paddle.concat([position_cos, block_position_cos], axis=-1).unsqueeze(0) + position_sin = sines[0].transpose([1, 2, 0, 3]) + + block_position_sin = sines[1].transpose([1, 2, 0, 3]) + sines = paddle.concat([position_sin, block_position_sin], axis=-1).unsqueeze(0) + + rotary_embeds = paddle.concat([coses, sines]) + + new_cache = [None] + hidden_states = self.input_layernorm(hidden_states) + + position_offset = 0 + if encode_seq_length > 1 and pre_caches is not None: + position_offset = 128 + + with dy2st_nocheck_guard_context(): + hidden_states, new_cache = self.transformer_block( + input_ids, + hidden_states, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + pre_caches=pre_caches, + pre_caches_length=position_offset, + rotary_embs=paddle.cast(rotary_embeds, "float32"), + rotary_emb_dims=2 if self.config.position_encoding_2d else 1, + seq_lens=seq_lens, + time_step=time_step, + ) + + return (hidden_states, new_cache) + + @paddle.no_grad() + def set_state_dict(self, state_dict, use_structured_name=True): + dtype = paddle.get_default_dtype() + config = self.config + embed_dim = config.hidden_size + num_attention_heads = config.num_attention_heads // config.tensor_parallel_degree + head_dim = embed_dim // config.num_attention_heads + + for k, v in state_dict.items(): + if k.startswith("chatglm.transformer.word_embeddings.weight"): + self.word_embeddings.weight.set_value(v.astype(dtype)) + continue + elif k.startswith("chatglm.transformer.final_layernorm.weight"): + self.transformer_block.ffn_ln_scales[config.num_hidden_layers - 1].set_value(v.astype("float32")) + continue + elif 
k.startswith("chatglm.transformer.final_layernorm.bias"): + self.transformer_block.ffn_ln_biases[config.num_hidden_layers - 1].set_value(v.astype("float32")) + continue + elif k.startswith("lm_head.weight"): + continue + elif k.endswith("rotary_embeddings.inv_freq") or k.endswith("rotary_emb.inv_freq"): + continue + idx = int(k.split(".")[3]) + if k.endswith("input_layernorm.weight"): + if idx == 0: + self.input_layernorm.weight.set_value(v.astype(dtype)) + else: + self.transformer_block.ffn_ln_scales[idx - 1].set_value(v.astype("float32")) + elif k.endswith("input_layernorm.bias"): + if idx == 0: + self.input_layernorm.bias.set_value(v.astype(dtype)) + else: + self.transformer_block.ffn_ln_biases[idx - 1].set_value(v.astype("float32")) + elif k.endswith("post_attention_layernorm.weight"): + self.transformer_block.ln_scales[idx].set_value(v.astype("float32")) + elif k.endswith("post_attention_layernorm.bias"): + self.transformer_block.ln_biases[idx].set_value(v.astype("float32")) + elif k.endswith("attention.query_key_value.weight"): + # [embed_dim, num_heads, 3, head_dim] -> [embed_dim, 3, num_heads, head_dim] + qkv_weight_tensor = ( + v.reshape([embed_dim, num_attention_heads, 3, head_dim]) + .transpose([2, 1, 3, 0]) + .reshape([head_dim * num_attention_heads * 3, embed_dim]) + ) + + if self.use_weight_only: + qkv_weight_tensor = paddle.transpose(qkv_weight_tensor, perm=[1, 0]) + qkv_quanted_weight_tensor, qkv_weight_scale_tensor = weight_quantize( + qkv_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight_tensor) + self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale_tensor) + else: + self.transformer_block.qkv_weights[idx].set_value(qkv_weight_tensor.astype(dtype)) + + elif k.endswith("attention.query_key_value.bias"): + v = ( + v.reshape([num_attention_heads, 3, head_dim]) + .transpose([1, 0, 2]) + .reshape([head_dim * num_attention_heads * 3]) + ) + self.transformer_block.qkv_biases[idx].set_value(v.astype(dtype)) + elif k.endswith("attention.dense.weight"): + linear_weight_tensor = v.astype(dtype) + if self.use_weight_only: + linear_quanted_weight_tensor, linear_weight_scale_tensor = weight_quantize( + linear_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight_tensor) + self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale_tensor) + else: + self.transformer_block.linear_weights[idx].set_value(linear_weight_tensor) + + elif k.endswith("attention.dense.bias"): + self.transformer_block.linear_biases[idx].set_value(v.astype(dtype)) + elif k.endswith("mlp.dense_h_to_4h.weight"): + ffn1_weight_tensor = v.astype(dtype) + if self.use_weight_only: + ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize( + ffn1_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight_tensor) + self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale_tensor) + else: + self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight_tensor) + + elif k.endswith("mlp.dense_h_to_4h.bias"): + self.transformer_block.ffn1_biases[idx].set_value(v.astype(dtype)) + elif k.endswith("mlp.dense_4h_to_h.weight"): + ffn2_weight_tensor = v.astype(dtype) + if self.use_weight_only: + ffn2_quanted_weight_tensor, ffn2_weight_scale_tensor = weight_quantize( + ffn2_weight_tensor, algo=self.quant_algo + ) + 
self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight_tensor) + self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale_tensor) + else: + self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight_tensor) + + elif k.endswith("mlp.dense_4h_to_h.bias"): + self.transformer_block.ffn2_biases[idx].set_value(v.astype(dtype)) + else: + print("Unknow weight {}".format(k)) + + +@register_base_model +class ChatGLMModelDyBatch(ChatGLMPretrainedModel): + r""" + The GLM Model transformer can behave as an encoder (with only self-attention) as well as a decoder, where + a layer of cross-attention is added between the self-attention layers, following the architecture + described in [Attention is all you need](https://arxiv.org/abs/1706.03762). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + """ + + def __init__(self, config: ChatGLMConfig): + super(ChatGLMModelDyBatch, self).__init__(config) + self.config = config + self.transformer = ChatGLMStackDyBatch(config) + self.apply(self.init_weights) + + def get_input_embeddings(self): + return self.transformer.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.transformer.word_embeddings = new_embeddings + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + cache=None, + inputs_embeds=None, + use_cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + time_step=None, + **kwargs, + ): + if attention_mask is None: + attention_mask = self.get_masks(input_ids) + + if position_ids is None: + MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id + + use_gmasks = [] + mask_positions = [] + for seq in input_ids: + mask_token = gMASK if gMASK in seq else MASK + use_gmask = mask_token == gMASK + use_gmasks.append(use_gmask) + mask_positions.append(paddle.where(seq == mask_token)[0][0]) + position_ids = self.get_position_ids(input_ids, mask_positions=mask_positions, use_gmasks=use_gmasks) + + use_cache = use_cache if use_cache is not None else self.config.use_cache + logits, new_caches = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + cache_kvs=cache_kvs, + pre_caches=pre_caches, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + time_step=time_step, + ) + + if not return_dict: + return (logits, new_caches) + + return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=logits, past_key_values=new_caches) + + +class ChatGLMForCausalLMInferenceModel(GenerationInferenceModel, ChatGLMPretrainedModel): + def __init__(self, config: ChatGLMConfig): + super(ChatGLMForCausalLMInferenceModel, self).__init__(config) + + self.config = config + self.max_sequence_length = config.max_sequence_length + self.position_encoding_2d = config.position_encoding_2d + self.time_step = paddle.to_tensor([1], dtype="int32", place=paddle.CPUPlace()) + 
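# The position-id setup in ChatGLMModelDyBatch.forward above first locates the
# [MASK]/[gMASK] slot in every sequence before building the 2D position ids. A toy
# illustration (the token ids are made up, not real ChatGLM vocabulary ids, and
# paddle.any is used here for the membership test):
import paddle

MASK, gMASK = 150000, 150001  # placeholder ids for this sketch only
input_ids = paddle.to_tensor([[11, 22, gMASK, 33, 44]])

mask_positions, use_gmasks = [], []
for seq in input_ids:
    use_gmask = bool(paddle.any(seq == gMASK))
    mask_token = gMASK if use_gmask else MASK
    use_gmasks.append(use_gmask)
    # Same lookup as above: index of the first occurrence of the mask token.
    mask_positions.append(paddle.where(seq == mask_token)[0][0])
# mask_positions[0] -> 2, the slot that anchors get_position_ids for this sequence.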
self.model = ChatGLMModelDyBatch(config) + + self.lm_head = self.model.get_input_embeddings() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs, return_numpy=False) + + @classmethod + def get_cache_kvs_shape( + cls, config: ChatGLMConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for llama model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = config.max_sequence_length + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_attention_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + # only last token for inputs_ids if cache is defined in kwargs + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + pre_caches = kwargs.get("pre_caches", None) + + time_step = None + if cache is not None: + time_step = self.time_step + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (1 - tgt_generation_mask) * paddle.finfo(tgt_generation_mask.dtype).min + else: + self.time_step = paddle.to_tensor(input_ids.shape[1], dtype="int32", place=paddle.CPUPlace()) + attention_mask = (1 - attention_mask) * paddle.finfo(tgt_generation_mask.dtype).min + paddle.increment(self.time_step, -1) + + model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "cache_kvs": cache_kvs, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + "cache": cache, + "time_step": time_step, + "pre_caches": pre_caches, + } + return model_inputs + + def forward( + self, + input_ids, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + time_step=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + cache_kvs=cache_kvs, + pre_caches=pre_caches, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + time_step=time_step, + ) + hidden_states = transformer_outputs.last_hidden_state if return_dict else transformer_outputs[0] + if self.config.tensor_parallel_degree > 1: + lm_logits = 
parallel_matmul(hidden_states, self.lm_head.weight, self.config.tensor_parallel_output) + else: + lm_logits = F.linear(hidden_states, self.lm_head.weight.T) + + loss = None + if labels is not None: + """ + for p, l in zip(lm_logits[..., :-1, :].argmax(axis=-1), labels[..., 1:]): + print("prediction") + print(self.tokenizer.decode(p[l != -100].tolist())) + print("labels") + print(self.tokenizer.decode(l[l != -100].tolist())) + """ + + shift_logits = lm_logits[..., :-1, :] + shift_logits = shift_logits.reshape([-1, shift_logits.shape[-1]]) + shift_logits = shift_logits.astype("float32") + shift_labels = labels[..., 1:].reshape([-1]) + + if self.config.tensor_parallel_degree > 1 and self.config.tensor_parallel_output: + self.parallel_loss_func = fleet.meta_parallel.ParallelCrossEntropy() + shift_logits = shift_logits[shift_labels != -100] + shift_labels = shift_labels[shift_labels != -100] + loss = self.parallel_loss_func(shift_logits, shift_labels).mean() + else: + loss = nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100) + loss = loss.astype(lm_logits.dtype) + if time_step: + paddle.increment(self.time_step) + + if not return_dict: + if loss is not None: + return (loss, lm_logits, transformer_outputs[1:]) + else: + return (lm_logits, transformer_outputs[1:]) + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + self.lm_head.weight.set_value( + state_dict["chatglm.transformer.word_embeddings.weight"].astype(self.lm_head.weight.dtype) + ) + self.model.transformer.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/modeling.py new file mode 100644 index 000000000..c7e9762fb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/chatglm_v2/modeling.py @@ -0,0 +1,487 @@ +# Copyright (c) 2023 ChatGLM2-6B Model Team and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
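# get_cache_kvs_shape above only reports per-layer shapes of the form
# [2, batch, kv_heads, max_len, head_dim]; the caller still has to materialize the
# tensors. A minimal sketch of that step, assuming toy sizes rather than a real
# ChatGLM configuration:
import paddle

num_hidden_layers, num_attention_heads, head_dim = 2, 4, 8
max_batch_size, max_length = 1, 16

shapes = [
    [2, max_batch_size, num_attention_heads, max_length, head_dim]
    for _ in range(num_hidden_layers)
]
# Index 0 of each buffer holds the keys, index 1 the values; the fused attention
# kernels write into them in place during generation.
cache_kvs = [paddle.zeros(shape, dtype="float16") for shape in shapes]
print(cache_kvs[0].shape)  # [2, 1, 4, 16, 8]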
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional + +import paddle +import paddle.distributed.fleet as fleet +import paddle.nn as nn +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedMultiTransformerBase, + FusedMultiTransformerConfig, + FusedMultiTransformerWeightOnly, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationInferenceModel, +) +from paddlenlp.transformers import ChatGLMv2Config, ChatGLMv2PretrainedModel +from paddlenlp.transformers.chatglm_v2.modeling import ( + Embedding, + RMSNorm, + RotaryEmbedding, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) + +__all__ = [ + "ChatGLMv2ForCausalLMInferenceModel", +] + + +@register_base_model +class ChatGLMv2InferenceModel(ChatGLMv2PretrainedModel): + def __init__(self, config: ChatGLMv2Config, empty_init=True): + super().__init__(config) + self.embedding = Embedding(config) + + # Rotary positional embeddings + self.max_sequence_length = config.max_sequence_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2) + + if config.tensor_parallel_degree > 1: + if config.tensor_parallel_degree > 2: + raise ValueError( + "ChatGLM2 does not support `tensor_parallel_degree` > 2. 
Consider using Sharding stage 3" + ) + self.output_layer = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, + config.padded_vocab_size, + has_bias=False, + gather_output=not config.tensor_parallel_output, + ) + else: + self.output_layer = nn.Linear(config.hidden_size, config.padded_vocab_size, bias_attr=False) + + self.num_layers = config.num_hidden_layers + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_size = self.hidden_size // self.num_heads + self.multi_query_group_num = config.multi_query_group_num + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + + ln_scale_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.input_layernorm.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + + qkv_weight_attrs = [ + paddle.ParamAttr( + name="encoder.layers.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(config.num_hidden_layers) + ] + qkv_bias_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.qkv_bias".format(i)) for i in range(config.num_hidden_layers) + ] + + out_proj_weight_attrs = [ + paddle.ParamAttr( + name="encoder.layers.{}.self_attention.dense.weight".format(i), + initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(config.num_hidden_layers) + ] + + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.post_attention_layernorm.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="encoder.layers.{}.mlp.dense_h_to_4h.weight".format(i), + initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(config.num_hidden_layers) + ] + + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="encoder.layers.{}.mlp.dense_4h_to_h.weight".format(i), + initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(config.num_hidden_layers) + ] + + qkv_weight_scale_attrs = None + out_proj_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.qkv_weight_scale".format(i)) for i in range(self.num_layers) + ] + out_proj_weight_scale_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.self_attention.dense.weight_scale".format(i)) + for i in range(self.num_layers) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.mlp.dense_h_to_4h.weight_scale".format(i)) + for i in range(self.num_layers) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="encoder.layers.{}.mlp.dense_4h_to_h.weight_scale".format(i)) + for i in range(self.num_layers) + ] + transformer_config = FusedMultiTransformerConfig( + config.hidden_size, + config.num_attention_heads, + config.ffn_hidden_size, + dropout_rate=0.0, + quant_type=config.quant_type, + activation="swiglu", + normalize_before=True, + num_layers=config.num_hidden_layers, + nranks=1, + ring_id=-1, + ln_scale_attrs=ln_scale_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + qkv_bias_attrs=qkv_bias_attrs, + linear_weight_attrs=out_proj_weight_attrs, + linear_weight_scale_attrs=out_proj_weight_scale_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + 
ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + epsilon=config.layernorm_epsilon, + norm_type="rmsnorm", + kv_num_heads=config.multi_query_group_num, + ) + + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnly(transformer_config) + else: + self.transformer_block = FusedMultiTransformerBase(transformer_config) + + self.post_layer_norm = config.post_layer_norm + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else nn.LayerNorm + # Final layer norm before output. + self.final_layernorm = LayerNormFunc(config.hidden_size, epsilon=config.layernorm_epsilon) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def set_input_embeddings(self, value): + self.embedding.word_embeddings = value + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset + + ids_remove_padding, cum_offsets, padding_offset = get_padding_offset( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets + + def forward( + self, + input_ids=None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds=None, + use_cache=None, + cache_kvs=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + + # kwargs["cache"] is used used to distinguish between encoder and decoder phase. + past_key_values = kwargs.get("cache", None) + is_decoder = past_key_values is not None + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if not is_decoder: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + ids_remove_padding = input_ids + padding_offset = None + cum_offsets = None + + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding.word_embeddings(ids_remove_padding) + hidden_states = inputs_embeds + + # Rotary positional embeddings + rotary_pos_emb = self.rotary_pos_emb(self.max_sequence_length) + + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + rotary_pos_emb = rotary_pos_emb[:, :seq_length, :, :] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + + ones = paddle.ones([batch_size, seq_length, self.head_size // 4], dtype=paddle.get_default_dtype()) + zeros = paddle.zeros([batch_size, seq_length, self.head_size // 4], dtype=paddle.get_default_dtype()) + # make it to be [2, batch, seq_len, rotary_dim] + rotary_pos_emb = rotary_pos_emb.transpose([3, 0, 1, 2]) + # The following code is for consistency with PaddleNLP/csrc/generation/encode_rotary_qk.cu, so boring. 
+ cos = rotary_pos_emb[0] + sin = rotary_pos_emb[1] + cos = paddle.concat([cos, ones], axis=-1) + sin = paddle.concat([sin, zeros], axis=-1) + rotary_pos_emb = paddle.stack([cos, sin], axis=0) + rotary_pos_emb = ( + rotary_pos_emb.unsqueeze(-1).tile([1, 1, 1, 1, 2]).reshape([2, batch_size, seq_length, self.head_size]) + ) + + # Run encoder. + seq_lens = seq_len_decoder if is_decoder else seq_len_encoder + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids, + hidden_states, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + pre_caches=None, + pre_caches_length=0, + seq_lens=seq_lens, + rotary_embs=paddle.cast(rotary_pos_emb, "float32"), + rotary_emb_dims=1, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + + hidden_states = self.final_layernorm(hidden_states) + + return tuple(v for v in [hidden_states, None, None, None] if v is not None) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + # find the real name. + def key(name): + result_list = [] + for i in state_dict.keys(): + if i.find(name) >= 0: + result_list.append(i) + assert len(result_list) == 1, name + " must be only one in state_dict" + return result_list[0] + + self.embedding.word_embeddings.weight.set_value(state_dict.pop(key("embedding.word_embeddings.weight"))) + self.final_layernorm.weight.set_value(state_dict.pop(key("encoder.final_layernorm.weight"))) + self.output_layer.weight.set_value(state_dict.pop(key("output_layer.weight"))) + + for i in range(self.num_layers): + ln_scale = state_dict.pop(key("encoder.layers.{}.input_layernorm.weight".format(i))) + + concated_qkv_weight = state_dict.pop( + key("encoder.layers.{}.self_attention.query_key_value.weight".format(i)) + ) + concated_qkv_weight = concated_qkv_weight.transpose([1, 0]) + concated_qkv_weight = paddle.to_tensor(concated_qkv_weight) + + concated_qkv_bias = state_dict.pop(key("encoder.layers.{}.self_attention.query_key_value.bias".format(i))) + concated_qkv_bias = paddle.to_tensor(concated_qkv_bias) + + out_proj_weight = state_dict.pop(key("encoder.layers.{}.self_attention.dense.weight".format(i))) + + ffn_ln_scale = state_dict.pop(key("encoder.layers.{}.post_attention_layernorm.weight".format(i))) + + ffn1_weight = state_dict.pop(key("encoder.layers.{}.mlp.dense_h_to_4h.weight".format(i))) + ffn2_weight = state_dict.pop(key("encoder.layers.{}.mlp.dense_4h_to_h.weight".format(i))) + + self.transformer_block.ln_scales[i].set_value(ln_scale) + + if self.use_weight_only: + qkv_weight_tensor = paddle.to_tensor(concated_qkv_weight) + qkv_weight_tensor = paddle.transpose(qkv_weight_tensor, perm=[1, 0]) + qkv_quanted_weight_tensor, qkv_weight_scale_tensor = weight_quantize( + qkv_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.qkv_weights[i].set_value(qkv_quanted_weight_tensor) + self.transformer_block.qkv_weights_scale[i].set_value(qkv_weight_scale_tensor) + else: + self.transformer_block.qkv_weights[i].set_value(concated_qkv_weight) + + self.transformer_block.qkv_biases[i].set_value(concated_qkv_bias) + + if self.use_weight_only: + linear_quanted_weight_tensor, linear_weight_scale_tensor = weight_quantize( + out_proj_weight, algo=self.quant_algo + ) + self.transformer_block.linear_weights[i].set_value(linear_quanted_weight_tensor) + self.transformer_block.linear_weights_scale[i].set_value(linear_weight_scale_tensor) + else: + 
self.transformer_block.linear_weights[i].set_value(out_proj_weight) + + self.transformer_block.ffn_ln_scales[i].set_value(ffn_ln_scale) + + if self.use_weight_only: + ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize( + ffn1_weight, algo=self.quant_algo + ) + self.transformer_block.ffn1_weights[i].set_value(ffn1_quanted_weight_tensor) + self.transformer_block.ffn1_weights_scale[i].set_value(ffn1_weight_scale_tensor) + else: + self.transformer_block.ffn1_weights[i].set_value(ffn1_weight) + + if self.use_weight_only: + ffn2_quanted_weight_tensor, ffn2_weight_scale_tensor = weight_quantize( + ffn2_weight, algo=self.quant_algo + ) + self.transformer_block.ffn2_weights[i].set_value(ffn2_quanted_weight_tensor) + self.transformer_block.ffn2_weights_scale[i].set_value(ffn2_weight_scale_tensor) + else: + self.transformer_block.ffn2_weights[i].set_value(ffn2_weight) + + +class ChatGLMv2ForCausalLMInferenceModel(GenerationInferenceModel, ChatGLMv2PretrainedModel): + def __init__(self, config: ChatGLMv2Config): + super().__init__(config) + self.max_sequence_length = config.max_sequence_length + self.chatglm_v2 = ChatGLMv2InferenceModel(config) + + @classmethod + def get_cache_kvs_shape(cls, config: ChatGLMv2Config, max_batch_size: int = None, max_length: int = None): + """get cache_kvs tensor for opt model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + + if max_length is None: + max_length = config.max_sequence_length + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.multi_query_group_num, + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + pre_caches = kwargs.get("pre_caches", None) + inputs_embeds = kwargs.get("inputs_embeds", None) + if cache is not None: + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (tgt_generation_mask - 1) * 1e4 + # make inputs_embeds be none in decoder phase. + # in forward function, it will be assigned according to input_ids. 
+ inputs_embeds = None + else: + attention_mask = (attention_mask - 1) * 1e4 + model_inputs = { + "input_ids": input_ids, + "inputs_embeds": inputs_embeds, + "position_ids": position_ids, + "attention_mask": attention_mask, + "cache_kvs": cache_kvs, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + "cache": cache, + "pre_caches": pre_caches, + } + return model_inputs + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.chatglm_v2( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + cache_kvs=cache_kvs, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + lm_logits = self.chatglm_v2.output_layer(hidden_states) + output = (lm_logits,) + transformer_outputs[1:] + return output + + @paddle.no_grad() + def set_state_dict(self, state_dict): + self.chatglm_v2.set_state_dict(state_dict) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/fused_transformer_layers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/fused_transformer_layers.py new file mode 100644 index 000000000..923d29dda --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/fused_transformer_layers.py @@ -0,0 +1,1679 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import paddle +import paddle.distributed as dist +from paddle.framework import LayerHelper, core, in_dynamic_mode, in_dynamic_or_pir_mode +from paddle.incubate.nn.functional import ( + fused_layer_norm, + fused_rms_norm, + masked_multihead_attention, + variable_length_memory_efficient_attention, +) +from paddle.nn import Layer +from paddle.nn.initializer import Constant +from paddle.nn.quant import weight_only_linear + +from paddlenlp.utils.import_utils import is_paddlenlp_ops_available +from paddlenlp.utils.log import logger + +if not is_paddlenlp_ops_available(): + logger.warning( + "The paddlenlp_ops package is not installed. 
you can read the docs and install it by hand, " + "you can refer to: https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md" + ) +if core.is_compiled_with_xpu() or core.is_compiled_with_cuda(): + from paddlenlp_ops import rebuild_padding_v2 + +if core.is_compiled_with_cuda(): + from paddlenlp_ops import ( + dequant_int8, + encode_rotary_qk, + qkv_transpose_split, + quant_int8, + rebuild_padding, + transpose_remove_padding, + write_cache_kv, + ) + +__all__ = [ + "FusedMultiTransformerConfig", + "FusedMultiTransformerBase", + "FusedMultiTransformerPostLayernorm", + "FusedMultiTransformerWeightOnly", + "FusedMultiTransformerWeightOnlyPostLayernorm", + "FusedBlockMultiTransformer", + "FusedBlockMultiTransformerWeightOnly", + "FusedBlockMultiTransformerA8W8", +] + + +# for distributed tensor model parallel +def _set_var_distributed(var): + if var is None: + return + + var.is_distributed = True + + if not in_dynamic_mode(): + # NOTE: use current_block and find_var_recursive to support while_loop + startup_block = paddle.static.default_startup_program().current_block() + main_block = paddle.static.default_main_program().current_block() + startup_block._find_var_recursive(var.name).is_distributed = True + main_block._find_var_recursive(var.name).is_distributed = True + + +def fused_act_bias_wrapper( + x, + bias=None, + dequant_scales=None, + shift=None, + smooth=None, + act_method="gelu", + compute_dtype="default", + quant_scale=-1, + quant_round_type=0, + quant_max_bound=0, + quant_min_bound=0, +): + if in_dynamic_or_pir_mode(): + + return paddle._C_ops.fused_bias_act( + x, + bias, + dequant_scales, + shift, + smooth, + act_method, + compute_dtype, + quant_scale, + quant_round_type, + quant_max_bound, + quant_min_bound, + ) + helper = LayerHelper("fused_bias_act") + if x.dtype == "int32": + if compute_dtype == "bf16": + dtype = "uint16" + elif compute_dtype == "fp16": + dtype = "float16" + elif compute_dtype == "fp32": + dtype = "float32" + out = helper.create_variable_for_type_inference(dtype=dtype) + else: + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + inputs = {} + inputs["x"] = x + if bias is not None: + inputs["bias"] = bias + if dequant_scales is not None: + inputs["dequant_scales"] = dequant_scales + + if shift is not None: + inputs["shift"] = shift + + if smooth is not None: + inputs["smooth"] = smooth + + attrs = { + "act_method": act_method, + "compute_dtype": compute_dtype, + "quant_scale": quant_scale, + "quant_round_type": quant_round_type, + "quant_max_bound": quant_max_bound, + "quant_min_bound": quant_min_bound, + } + + helper.append_op( + type="fused_bias_act", + inputs=inputs, + outputs={"out": out}, + attrs=attrs, + ) + return out + + +class FusedMultiTransformerConfig: + def __init__( + self, + embed_dim, + num_heads, + dim_feedforward, + quant_type="", + dropout_rate=0.0, + activation="gelu", + norm_type="layernorm", + use_neox_rotary_style=False, + normalize_before=True, + ln_scale_attrs=None, + ln_bias_attrs=None, + qkv_weight_attrs=None, + qkv_weight_scale_attrs=None, + qkv_bias_attrs=None, + linear_weight_attrs=None, + linear_weight_scale_attrs=None, + linear_bias_attrs=None, + ffn_ln_scale_attrs=None, + ffn_ln_bias_attrs=None, + ffn1_weight_attrs=None, + ffn1_weight_scale_attrs=None, + ffn1_bias_attrs=None, + ffn2_weight_attrs=None, + ffn2_weight_scale_attrs=None, + ffn2_bias_attrs=None, + qkv_out_scale_attrs=None, + linear_out_scale_attrs=None, + ffn1_out_scale_attrs=None, + ffn2_out_scale_attrs=None, + linear_shift_attrs=None, + 
linear_smooth_attrs=None, + ffn2_shift_attrs=None, + ffn2_smooth_attrs=None, + cache_k_scale_attrs=None, + cache_v_scale_attrs=None, + cache_k_out_scale_attrs=None, + cache_v_out_scale_attrs=None, + quant_round_type=0, + quant_max_bound=127.0, + quant_min_bound=-127.0, + epsilon=1e-5, + residual_alpha=1.0, + num_layers=-1, + nranks=1, + trans_qkvw=True, + ring_id=-1, + kv_num_heads=-1, + cachekv_int8_type=None, + rank_id=-1, + ): + self.embed_dim = embed_dim + self.num_heads = num_heads + if kv_num_heads > 0: + self.kv_num_heads = kv_num_heads + else: + self.kv_num_heads = num_heads + self.dim_feedforward = dim_feedforward + self.dropout_rate = dropout_rate + self.activation = activation + self.norm_type = norm_type + + self.use_neox_rotary_style = use_neox_rotary_style + self.normalize_before = normalize_before + self.ln_scale_attrs = ln_scale_attrs + self.ln_bias_attrs = ln_bias_attrs + self.qkv_weight_attrs = qkv_weight_attrs + self.qkv_weight_scale_attrs = qkv_weight_scale_attrs + self.qkv_bias_attrs = qkv_bias_attrs + self.linear_weight_attrs = linear_weight_attrs + self.linear_weight_scale_attrs = linear_weight_scale_attrs + self.linear_bias_attrs = linear_bias_attrs + self.ffn_ln_scale_attrs = ffn_ln_scale_attrs + self.ffn_ln_bias_attrs = ffn_ln_bias_attrs + self.ffn1_weight_attrs = ffn1_weight_attrs + self.ffn1_weight_scale_attrs = ffn1_weight_scale_attrs + self.ffn1_bias_attrs = ffn1_bias_attrs + self.ffn2_weight_attrs = ffn2_weight_attrs + self.ffn2_weight_scale_attrs = ffn2_weight_scale_attrs + self.ffn2_bias_attrs = ffn2_bias_attrs + + self.qkv_out_scale_attrs = qkv_out_scale_attrs + self.linear_out_scale_attrs = linear_out_scale_attrs + self.ffn1_out_scale_attrs = ffn1_out_scale_attrs + self.ffn2_out_scale_attrs = ffn2_out_scale_attrs + self.linear_shift_attrs = linear_shift_attrs + self.linear_smooth_attrs = linear_smooth_attrs + self.ffn2_shift_attrs = ffn2_shift_attrs + self.ffn2_smooth_attrs = ffn2_smooth_attrs + self.cache_k_scale_attrs = cache_k_scale_attrs + self.cache_v_scale_attrs = cache_v_scale_attrs + self.cache_k_out_scale_attrs = cache_k_out_scale_attrs + self.cache_v_out_scale_attrs = cache_v_out_scale_attrs + + self.quant_type = quant_type + self.quant_round_type = quant_round_type + self.quant_max_bound = quant_max_bound + self.quant_min_bound = quant_min_bound + self.cachekv_int8_type = cachekv_int8_type + + self.epsilon = epsilon + self.residual_alpha = residual_alpha + self.num_layers = num_layers + self.nranks = nranks + self.rank_id = rank_id + self.trans_qkvw = trans_qkvw + self.ring_id = ring_id + + +class FusedMultiTransformerBase(Layer): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__() + + self.config = config + + assert config.embed_dim > 0, "Expected embed_dim to be greater than 0, " "but received {}".format( + config.embed_dim + ) + assert config.num_heads > 0, "Expected nhead to be greater than 0, " "but received {}".format(config.num_heads) + assert config.dim_feedforward > 0, "Expected dim_feedforward to be greater than 0, but received {}".format( + config.dim_feedforward + ) + + # self.normalize_before = normalize_before + self._dtype = self._helper.get_default_dtype() + self._epsilon = config.epsilon + self._residual_alpha = config.residual_alpha + self._trans_qkvw = config.trans_qkvw + self._ring_id = config.ring_id + self.nranks = config.nranks + self.norm_type = config.norm_type + if self.norm_type == "layernorm": + self.norm_func = fused_layer_norm + elif self.norm_type == "rmsnorm": + self.norm_func = 
fused_rms_norm + else: + raise NotImplementedError("Only support norm type of [layernorm, rmsnorm]") + self.use_neox_rotary_style = config.use_neox_rotary_style + self._norm_weight_dtype = "float32" if self.norm_type == "layernorm" else self._dtype + + self.activation = config.activation + + self.embed_dim = config.embed_dim + self.head_dim = config.embed_dim // config.num_heads + assert self.head_dim * config.num_heads == config.embed_dim, "embed_dim must be divisible by num_heads" + + # tensor model parallel + if config.nranks > 1: + assert config.ring_id != -1 + assert config.num_heads % config.nranks == 0 + assert config.dim_feedforward % config.nranks == 0 + self.num_heads = config.num_heads // config.nranks + self.kv_num_heads = config.kv_num_heads // config.nranks + dim_feedforward = config.dim_feedforward // config.nranks + self.dim_feedforward = dim_feedforward + + self.num_layers = config.num_layers + assert self.num_layers > 0 + if isinstance(config.qkv_weight_attrs, (list, tuple)): + assert self.num_layers == len(config.qkv_weight_attrs) + + self.weight_dtype = self._dtype + self.create_params_type = self.get_weight_create_dype() + + self.ln_scales, self.ln_biases = [], [] + self.qkv_weights, self.qkv_biases = [], [] + self.linear_weights, self.linear_biases = [], [] + self.ffn_ln_scales, self.ffn_ln_biases = [], [] + self.ffn1_weights, self.ffn1_biases = [], [] + self.ffn2_weights, self.ffn2_biases = [], [] + self.cache_k_scales, self.cache_v_scales = [], [] + self.cache_k_out_scales, self.cache_v_out_scales = [], [] + + for i in range(self.num_layers): + ln_scale_attr = self.get_attr(config.ln_scale_attrs, i) + ln_bias_attr = self.get_attr(config.ln_bias_attrs, i) + qkv_weight_attr = self.get_attr(config.qkv_weight_attrs, i) + + qkv_bias_attr = self.get_attr(config.qkv_bias_attrs, i) + linear_weight_attr = self.get_attr(config.linear_weight_attrs, i) + linear_bias_attr = self.get_attr(config.linear_bias_attrs, i) + + ffn_ln_scale_attr = self.get_attr(config.ffn_ln_scale_attrs, i) + ffn_ln_bias_attr = self.get_attr(config.ffn_ln_bias_attrs, i) + ffn1_weight_attr = self.get_attr(config.ffn1_weight_attrs, i) + ffn1_bias_attr = self.get_attr(config.ffn1_bias_attrs, i) + ffn2_weight_attr = self.get_attr(config.ffn2_weight_attrs, i) + ffn2_bias_attr = self.get_attr(config.ffn2_bias_attrs, i) + + cache_k_scale_attr = self.get_attr(config.cache_k_scale_attrs, i) + cache_v_scale_attr = self.get_attr(config.cache_v_scale_attrs, i) + cache_k_out_scale_attr = self.get_attr(config.cache_k_out_scale_attrs, i) + cache_v_out_scale_attr = self.get_attr(config.cache_v_out_scale_attrs, i) + + ln_scale = self.create_parameter( + attr=ln_scale_attr, + shape=[config.embed_dim], + default_initializer=Constant(value=1.0), + dtype=self._norm_weight_dtype, + ) + ln_bias = None + if ln_bias_attr: + ln_bias = self.create_parameter( + attr=ln_bias_attr, + shape=[config.embed_dim], + is_bias=True, + dtype=self._norm_weight_dtype, + ) + self.init_weight_shape(config) + + qkv_weight = self.create_parameter( + shape=self.qkv_weight_shape, + attr=qkv_weight_attr, + dtype=self.create_params_type, + is_bias=False, + ) + + qkv_bias = None + if qkv_bias_attr: + qkv_bias = self.create_parameter( + shape=[(self.num_heads + 2 * self.kv_num_heads) * self.head_dim], + attr=qkv_bias_attr, + dtype=self._dtype, + is_bias=True, + ) + + linear_weight = self.create_parameter( + shape=self.linear_weight_shape, + attr=linear_weight_attr, + dtype=self.create_params_type, + is_bias=False, + ) + + linear_bias = None + if 
linear_bias_attr: + linear_bias = self.create_parameter( + shape=[config.embed_dim], + attr=linear_bias_attr, + dtype=self._dtype, + is_bias=True, + ) + + ffn_ln_scale = self.create_parameter( + shape=[config.embed_dim], + attr=ffn_ln_scale_attr, + is_bias=False, + default_initializer=Constant(1.0), + dtype=self._norm_weight_dtype, + ) + + ffn_ln_bias = None + if ffn_ln_bias_attr: + ffn_ln_bias = self.create_parameter( + shape=[config.embed_dim], + attr=ffn_ln_bias_attr, + is_bias=True, + dtype=self._norm_weight_dtype, + ) + + ffn1_weight = self.create_parameter( + shape=self.ffn1_weight_shape, + attr=ffn1_weight_attr, + dtype=self.create_params_type, + is_bias=False, + ) + + ffn1_bias = None + if ffn1_bias_attr: + ffn1_bias = self.create_parameter( + shape=[dim_feedforward * 2] if config.activation.endswith("glu") else [dim_feedforward], + attr=ffn1_bias_attr, + dtype=self._dtype, + is_bias=True, + ) + + ffn2_weight = self.create_parameter( + shape=self.ffn2_weight_shape, + attr=ffn2_weight_attr, + dtype=self.create_params_type, + is_bias=False, + ) + + ffn2_bias = None + if ffn2_bias_attr: + ffn2_bias = self.create_parameter( + shape=[config.embed_dim], + attr=ffn2_bias_attr, + dtype=self._dtype, + is_bias=True, + ) + + cache_k_scale = None + if cache_k_scale_attr: + cache_k_scale = self.create_parameter( + shape=[self.kv_num_heads], + attr=cache_k_scale_attr, + dtype="float32", + is_bias=False, + ) + + cache_v_scale = None + if cache_v_scale_attr: + cache_v_scale = self.create_parameter( + shape=[self.kv_num_heads], + attr=cache_v_scale_attr, + dtype="float32", + is_bias=False, + ) + + cache_k_out_scale = None + if cache_k_out_scale_attr: + cache_k_out_scale = self.create_parameter( + shape=[self.kv_num_heads], + attr=cache_k_out_scale_attr, + dtype="float32", + is_bias=False, + ) + + cache_v_out_scale = None + if cache_v_out_scale_attr: + cache_v_out_scale = self.create_parameter( + shape=[self.kv_num_heads], + attr=cache_v_out_scale_attr, + dtype="float32", + is_bias=False, + ) + + # tensor model parallel + if config.nranks > 1: + # column parallel + _set_var_distributed(qkv_weight) + _set_var_distributed(qkv_bias) + _set_var_distributed(ffn1_weight) + _set_var_distributed(ffn1_bias) + # row parallel + _set_var_distributed(linear_weight) + _set_var_distributed(ffn2_weight) + + self.ln_scales.append(ln_scale) + self.ln_biases.append(ln_bias) + self.qkv_weights.append(qkv_weight) + self.qkv_biases.append(qkv_bias) + self.linear_weights.append(linear_weight) + self.linear_biases.append(linear_bias) + + self.ffn_ln_scales.append(ffn_ln_scale) + self.ffn_ln_biases.append(ffn_ln_bias) + self.ffn1_weights.append(ffn1_weight) + self.ffn1_biases.append(ffn1_bias) + self.ffn2_weights.append(ffn2_weight) + self.ffn2_biases.append(ffn2_bias) + + self.cache_k_scales.append(cache_k_scale) + self.cache_v_scales.append(cache_v_scale) + self.cache_k_out_scales.append(cache_k_out_scale) + self.cache_v_out_scales.append(cache_v_out_scale) + + self._add_parameter(ln_scale) + self._add_parameter(ln_bias) + self._add_parameter(qkv_weight) + self._add_parameter(qkv_bias) + self._add_parameter(linear_weight) + self._add_parameter(linear_bias) + + self._add_parameter(ffn_ln_scale) + self._add_parameter(ffn_ln_bias) + self._add_parameter(ffn1_weight) + self._add_parameter(ffn1_bias) + self._add_parameter(ffn2_weight) + self._add_parameter(ffn2_bias) + + self._add_parameter(cache_k_scale) + self._add_parameter(cache_v_scale) + self._add_parameter(cache_k_out_scale) + self._add_parameter(cache_v_out_scale) + + 
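# For reference: the tensor-parallel sizing used when creating the per-layer
# parameters above splits the column-parallel weights (qkv, ffn1) across ranks,
# while the row-parallel ones (the attention out-projection and ffn2) keep full
# output width and have their partial results summed by dist.all_reduce in forward.
# A small arithmetic sketch with assumed sizes (num_heads=32, dim_feedforward=11008, nranks=2):
num_heads, dim_feedforward, nranks = 32, 11008, 2  # assumed, not taken from this patch

heads_per_rank = num_heads // nranks      # 16 qkv heads per rank (column parallel)
ffn_per_rank = dim_feedforward // nranks  # 5504 ffn1 outputs per rank (column parallel)
print(heads_per_rank, ffn_per_rank)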
self.dropout_rate = config.dropout_rate + + from paddle.incubate.nn.functional import fused_linear + + self.linear = fused_linear + + def get_attr(self, attrs, idx): + if isinstance(attrs, (list, tuple)): + assert ( + len(attrs) == self.num_layers + ), f"length of attrs is {len(attrs)} is not equal to self.num_layers {self.num_layers}" + return attrs[idx] + return attrs + + def _add_parameter(self, param): + if param is None: + return + assert param.name not in self._parameters + self._parameters[param.name] = param + + def init_weight_shape(self, config): + self.qkv_weight_shape = ( + [(self.num_heads + 2 * self.kv_num_heads) * self.head_dim, self.embed_dim] + if config.trans_qkvw + else [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim] + ) + self.linear_weight_shape = [self.num_heads * self.head_dim, self.embed_dim] + self.ffn1_weight_shape = ( + [self.embed_dim, self.dim_feedforward * 2] + if self.activation.endswith("glu") + else [self.embed_dim, self.dim_feedforward] + ) + self.ffn2_weight_shape = [self.dim_feedforward, self.embed_dim] + + def get_weight_create_dype(self): + return self._dtype + + def compute_layernorm_before_qkv(self, src, i): + if i == 0: + ln_out = self.norm_func(src, self.ln_scales[i], self.ln_biases[i], self._epsilon, begin_norm_axis=1) + else: + ln_out = src + + return ln_out + + def compute_qkv_linear(self, ln_out, i): + if paddle.version.cuda() == "False" or float(paddle.version.cuda()) < 11.6: + qkv_out = paddle.matmul(ln_out, self.qkv_weights[i], False, True) + if self.qkv_biases[i] is not None: + qkv_out = paddle.add(qkv_out, self.qkv_biases[i]) + return qkv_out + else: + # This method requires CUDA version >= 11.6. + return self.linear(ln_out, self.qkv_weights[i], self.qkv_biases[i], transpose_weight=True) + + def compute_qkv(self, src, residual_input, i): + ln_out = self.compute_layernorm_before_qkv(src, i) + qkv_out = self.compute_qkv_linear(ln_out, i) + return qkv_out, residual_input + + def compute_max_len(self, seq_lens_encoder, seq_lens_decoder, cum_offsets): + if seq_lens_encoder is None or seq_lens_decoder is None or cum_offsets is None: + return None, None + return paddle.incubate.nn.functional.blha_get_max_len( + seq_lens_encoder, seq_lens_decoder, cum_offsets # cum_offsets.shape[0] used as bsz + ) + + def compute_fmha( + self, + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + pre_caches_length, + attn_mask, + i, + ): + bsz = input_ids.shape[0] + """ + qkv: bsz, seq_len, 3, numhead, headsize -> + q_out: bsz, numhead, seq_len, headsize + kv_out: 2, bsz, numhead, seq_len, headsize + """ + q_out, k_out, v_out = qkv_transpose_split( + qkv_out, padding_offset, seq_lens, input_ids, self.num_heads, self.head_dim + ) + + # rotary emb (inplace) + if rotary_embs is not None: + encode_rotary_qk( + q_out, + k_out, + rotary_embs, + seq_lens, + rotary_emb_dims=rotary_emb_dims, + use_neox=self.use_neox_rotary_style, + ) + + if pre_caches is not None: + k_out = paddle.concat([pre_caches[i][0, :bsz], k_out], axis=2) + v_out = paddle.concat([pre_caches[i][1, :bsz], v_out], axis=2) + + # write cache kv (inplace) + write_cache_kv(k_out, v_out, caches[i], seq_lens + pre_caches_length) + + # cutlass fmha + qktv_out = variable_length_memory_efficient_attention( + q_out, + k_out, + v_out, + seq_lens, + seq_lens + pre_caches_length, + mask=attn_mask, + scale=float(self.head_dim**-0.5), + ) + + return transpose_remove_padding(qktv_out, seq_lens, padding_offset) + + def 
compute_mmha(self, qkv_out, caches, attn_mask, seq_lens, rotary_embs, rotary_emb_dims, i): + return masked_multihead_attention( + x=qkv_out, + cache_kv=caches[i], + src_mask=attn_mask, + sequence_lengths=seq_lens, + rotary_tensor=rotary_embs, + rotary_emb_dims=rotary_emb_dims, + use_neox_rotary_style=self.use_neox_rotary_style, + )[0] + + def compute_out_linear(self, fmha_out, i): + return paddle.matmul(fmha_out, self.linear_weights[i]) + + def compute_attn( + self, + time_step, + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + pre_caches_length, + attn_mask, + i, + **kwargs, + ): + # fmha compute + if time_step is None: # context + fmha_out = self.compute_fmha( + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + pre_caches_length, + attn_mask, + i, + ) + + else: + fmha_out = self.compute_mmha(qkv_out, caches, attn_mask, seq_lens, rotary_embs, rotary_emb_dims, i) + + out_linear_out = self.compute_out_linear(fmha_out, i) + + return out_linear_out + + def compute_ffn_layernorm(self, out_linear_out, residual_input, i): + norm_out = self.norm_func( + out_linear_out, + norm_weight=self.ffn_ln_scales[i], + norm_bias=self.ffn_ln_biases[i], + epsilon=self._epsilon, + begin_norm_axis=1, + bias=self.linear_biases[i], + residual=residual_input, + ) + tmp_out, residual_input = norm_out[0], norm_out[1] + + return tmp_out, residual_input + + def compute_activation(self, ffn1_out, i): + return fused_act_bias_wrapper(ffn1_out, self.ffn1_biases[i], act_method=self.activation) + + def compute_ffn1(self, tmp_out, i): + return paddle.matmul(tmp_out, self.ffn1_weights[i]) + + def compute_ffn2(self, ffn1_out, i): + return paddle.matmul(ffn1_out, self.ffn2_weights[i]) + + def compute_bias_residual_layernorm(self, ffn2_out, residual_input, i, num_layers): + + if i != num_layers - 1: + norm_out = self.norm_func( + ffn2_out, + norm_weight=self.ln_scales[i + 1], + norm_bias=self.ln_biases[i + 1], + epsilon=self._epsilon, + begin_norm_axis=1, + bias=self.ffn2_biases[i], + residual=residual_input, + ) + tmp_out, residual_input = norm_out[0], norm_out[1] + else: + tmp_out = fused_layer_norm( + ffn2_out, + norm_weight=None, + norm_bias=None, + epsilon=self._epsilon, + begin_norm_axis=1, + bias=self.ffn2_biases[i], + residual=residual_input, + )[0] + return tmp_out, residual_input + + def pre_process(self, **kwargs): + pass + + def post_process(self, **kwargs): + time_step = kwargs.get("time_step", None) + multi_block_output = kwargs.get("multi_block_output", None) + cum_offsets = kwargs.get("cum_offsets", None) + seq_lens = kwargs.get("seq_lens", None) + input_ids = kwargs.get("input_ids", None) + + if time_step is None: + out = rebuild_padding(multi_block_output, cum_offsets, seq_lens, input_ids) + else: + out = multi_block_output + + return out + + def forward( + self, + input_ids, + src, + cum_offsets=None, + padding_offset=None, + attn_mask=None, + caches=None, + pre_caches=None, + pre_caches_length=0, + rotary_embs=None, + rotary_emb_dims=0, + seq_lens=None, + time_step=None, + **kwargs, + ): + r""" + Applies multi transformer layers on the input. + + Parameters: + src (Tensor): The input of Transformer layers. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float16 or float32. 
+            attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevent attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + `[batch_size, 1, sequence_length, sequence_length]`. It can be + None when no positions need to be masked. + Default None. + caches (list(Tensor)|tuple(Tensor), optional): The cache structure + tensors for the inference generation model. It is only used for + inference and should be None for training. The shape is + `[2, batch_size, num_head, max_seq_len, head_dim]`. Default None. + pre_caches (list(Tensor)|tuple(Tensor), optional): The prefix caches + for the generation model. The shape is `[2, bsz, num\_head, cache\_len, head\_dim]`. Default None. + rotary_embs (Tensor, optional): The RoPE embeddings for the rotary computation. The shape is `[2, bsz, 1, seq\_len, head\_dim]`. Default None. + rotary_emb_dims (int, optional): The number of rotary dimensions: 0 when rotary_embs is None, + 1 when rotary_embs is not None and pos_extra_ids is None, and 2 when both rotary_embs and pos_extra_ids are not None. Default 0. + seq_lens (Tensor, optional): The sequence lengths of this batch. The shape is `[bsz]`. Default None. + time_step (Tensor, optional): The time step tensor for the generation + model. It is used in the decode stage to represent the current step, + that is, the real sequence length of the cache KV. The shape is `[1]` and it must be + placed in CPUPlace. Default None. + + Returns: + Tensor|tuple: If `caches` is None, return a tensor with + the same shape and data type as `src`, representing the output + of the Transformer layers. If `caches` is not None, return the + tuple (output, caches), where `output` is the output of the + Transformer layers and `caches` is updated in place from the input `caches`.
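        A shape-level usage sketch of the two phases selected by `time_step` (the
        variable names are illustrative; the real inputs are prepared by the
        surrounding inference models):

            # prefill: time_step is None, the variable-length FMHA path runs and fills `caches`
            out, caches = transformer_block(
                input_ids, hidden_states,
                cum_offsets=cum_offsets, padding_offset=padding_offset,
                attn_mask=attn_mask, caches=cache_kvs,
                rotary_embs=rotary_embs, rotary_emb_dims=1,
                seq_lens=seq_len_encoder, time_step=None,
            )
            # decode: time_step carries the current cache length and
            # masked_multihead_attention reads from `caches`
            out, caches = transformer_block(
                tgt_ids, hidden_states,
                attn_mask=decode_mask, caches=cache_kvs,
                rotary_embs=rotary_embs, rotary_emb_dims=1,
                seq_lens=seq_len_decoder, time_step=time_step,
            )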
+ """ + self.pre_process(**kwargs) + kwargs["cum_offsets"] = cum_offsets + + if caches is not None: + assert len(caches) == len(self.qkv_weights) or len(caches) == 2 * len(self.qkv_weights) + + assert self.num_layers == len(self.qkv_weights) + + max_enc_len_this_time, max_dec_len_this_time = self.compute_max_len( + kwargs.get("seq_lens_encoder", None), kwargs.get("seq_lens_decoder", None), cum_offsets + ) + kwargs["max_enc_len_this_time"] = max_enc_len_this_time + kwargs["max_dec_len_this_time"] = max_dec_len_this_time + + residual_input = src + for i in range(self.num_layers): + qkv_out, residual_input = self.compute_qkv(src, residual_input, i) + out_linear_out = self.compute_attn( + time_step, + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + pre_caches_length, + attn_mask, + i, + **kwargs, + ) + # all_reduce + if self.nranks > 1: + dist.all_reduce(out_linear_out) + + # ffn layernorm + tmp_out, residual_input = self.compute_ffn_layernorm(out_linear_out, residual_input, i) + + # ffn1 matmul + ffn1_out = self.compute_ffn1(tmp_out, i) + ffn1_out = self.compute_activation(ffn1_out, i) + + # ffn2 matmul + ffn2_out = self.compute_ffn2(ffn1_out, i) + + # all_reduce + if self.nranks > 1: + dist.all_reduce(ffn2_out) + + # norm + residual_add_bias + tmp_out, residual_input = self.compute_bias_residual_layernorm( + ffn2_out, residual_input, i, self.num_layers + ) + src = tmp_out + + kwargs["time_step"] = time_step + kwargs["multi_block_output"] = tmp_out + kwargs["seq_lens"] = seq_lens + kwargs["input_ids"] = input_ids + + out = self.post_process(**kwargs) + return out, caches + + +class FusedMultiTransformerPostLayernorm(FusedMultiTransformerBase): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + + def compute_qkv(self, src, residual_input, i): + qkv_out = self.compute_qkv_linear(src, i) + return qkv_out, src + + def compute_ffn_layernorm(self, out_linear_out, residual_input, i): + tmp_out = self.norm_func( + out_linear_out, + norm_weight=self.ln_scales[i], + norm_bias=self.ln_biases[i], + epsilon=self._epsilon, + residual_alpha=self._residual_alpha, + begin_norm_axis=1, + bias=self.linear_biases[i], + residual=residual_input, + )[0] + + return tmp_out, tmp_out + + def compute_bias_residual_layernorm(self, ffn2_out, residual_input, i, num_layers): + tmp_out = self.norm_func( + ffn2_out, + norm_weight=self.ffn_ln_scales[i], + norm_bias=self.ffn_ln_biases[i], + epsilon=self._epsilon, + residual_alpha=self._residual_alpha, + begin_norm_axis=1, + bias=self.ffn2_biases[i], + residual=residual_input, + )[0] + return tmp_out, tmp_out + + +class FusedMultiTransformerWeightOnly(FusedMultiTransformerBase): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + self.quant_type = config.quant_type + if self.quant_type == "weight_only_int8": + self.weight_dtype = "int8" + elif self.quant_type == "weight_only_int4": + self.weight_dtype = "int4" + else: + assert ( + self.quant_type == "weight_only_int8" or self.quant_type == "weight_only_int4" + ), "Expected quant_type equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format( + self.quant_type + ) + + self.weight_scale_dtype = self._dtype + self.qkv_weights_scale = [] + self.linear_weights_scale = [] + self.ffn1_weights_scale = [] + self.ffn2_weights_scale = [] + + for i in range(self.num_layers): + + qkv_weight_scale_attr = self.get_attr(config.qkv_weight_scale_attrs, i) + linear_weight_scale_attr = 
self.get_attr(config.linear_weight_scale_attrs, i) + ffn1_weight_scale_attr = self.get_attr(config.ffn1_weight_scale_attrs, i) + ffn2_weight_scale_attr = self.get_attr(config.ffn2_weight_scale_attrs, i) + + qkv_weight_scale = self.create_parameter( + shape=[(config.num_heads + 2 * config.kv_num_heads) * self.head_dim], + attr=qkv_weight_scale_attr, + dtype=self.weight_scale_dtype, + is_bias=False, + ) + + linear_weight_scale = self.create_parameter( + shape=[config.embed_dim], + attr=linear_weight_scale_attr, + dtype=self.weight_scale_dtype, + is_bias=False, + ) + + ffn1_weight_scale = self.create_parameter( + shape=[config.dim_feedforward * 2] if config.activation.endswith("glu") else [config.dim_feedforward], + attr=ffn1_weight_scale_attr, + dtype=self.weight_scale_dtype, + is_bias=False, + ) + + ffn2_weight_scale = self.create_parameter( + shape=[config.embed_dim], + attr=ffn2_weight_scale_attr, + dtype=self.weight_scale_dtype, + is_bias=False, + ) + + self.qkv_weights_scale.append(qkv_weight_scale) + self.linear_weights_scale.append(linear_weight_scale) + self.ffn1_weights_scale.append(ffn1_weight_scale) + self.ffn2_weights_scale.append(ffn2_weight_scale) + + self._add_parameter(qkv_weight_scale) + self._add_parameter(linear_weight_scale) + self._add_parameter(ffn1_weight_scale) + self._add_parameter(ffn2_weight_scale) + + def get_weight_create_dype(self): + return "int8" # If use weightonly int4, params dtype is int8, and one of the dimension will be half. + + def init_weight_shape(self, config): + super().init_weight_shape(config) + + self.linear_weight_shape = [self.embed_dim, self.num_heads * self.head_dim] + self.ffn1_weight_shape = ( + [self.dim_feedforward * 2, self.embed_dim] + if self.activation.endswith("glu") + else [self.dim_feedforward, self.embed_dim] + ) + self.ffn2_weight_shape = [self.embed_dim, self.dim_feedforward] + + if config.quant_type == "weight_only_int4": + self.qkv_weight_shape[0] //= 2 + self.linear_weight_shape[0] //= 2 + self.ffn1_weight_shape[0] //= 2 + self.ffn2_weight_shape[0] //= 2 + + def compute_qkv_linear(self, ln_out, i): + return weight_only_linear( + ln_out, + weight=self.qkv_weights[i], + bias=self.qkv_biases[i], + weight_scale=self.qkv_weights_scale[i], + weight_dtype=self.weight_dtype, + ) + + def compute_out_linear(self, fmha_out, i): + return weight_only_linear( + fmha_out, + weight=self.linear_weights[i], + weight_scale=self.linear_weights_scale[i], + weight_dtype=self.weight_dtype, + ) + + def compute_ffn1(self, tmp_out, i): + return weight_only_linear( + tmp_out, + weight=self.ffn1_weights[i], + weight_scale=self.ffn1_weights_scale[i], + weight_dtype=self.weight_dtype, + ) + + def compute_ffn2(self, ffn1_out, i): + return weight_only_linear( + ffn1_out, + weight=self.ffn2_weights[i], + weight_scale=self.ffn2_weights_scale[i], + weight_dtype=self.weight_dtype, + ) + + +class FusedMultiTransformerWeightOnlyPostLayernorm( + FusedMultiTransformerWeightOnly, FusedMultiTransformerPostLayernorm +): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + + +class FusedMultiTransformerAvx(FusedMultiTransformerBase): + def __init__(self, config: FusedMultiTransformerConfig, max_position_embeddings, compute_type): + super().__init__(config) + self._dtype = self._helper.get_default_dtype() + self.embed_dim = config.embed_dim + self.head_dim = config.embed_dim // config.num_heads + self.num_heads = config.num_heads // config.nranks + self.kv_num_heads = config.kv_num_heads // config.nranks + self.num_layers = 
config.num_layers + self.create_params_type = self.get_weight_create_dype() + self.activation = config.activation + self.norm_type = config.norm_type + self.intermediate_size = config.dim_feedforward + self.max_positions = max_position_embeddings + self.max_pos_embed = max_position_embeddings + self.hiddensize = self.num_heads * self.head_dim + self._compute_type = compute_type + + self.gate_weights = [] + self.up_weights = [] + + gate_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.gate_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + up_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.up_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + + for i in range(self.num_layers): + gate_weight_attr = self.get_attr(gate_weight_attrs, i) + up_weight_attr = self.get_attr(up_weight_attrs, i) + gate_weight = self.create_parameter( + shape=self.gate_weight_shape, + attr=gate_weight_attr, + dtype=self.create_params_type, + is_bias=False, + ) + up_weight = self.create_parameter( + shape=self.up_weight_shape, + attr=up_weight_attr, + dtype=self.create_params_type, + is_bias=False, + ) + self.gate_weights.append(gate_weight) + self.up_weights.append(up_weight) + self._add_parameter(gate_weight) + self._add_parameter(up_weight) + + def init_weight_shape(self, config): + self.gate_weight_shape = [self.embed_dim, self.dim_feedforward] + self.up_weight_shape = [self.embed_dim, self.dim_feedforward] + self.down_weight_shape = [self.dim_feedforward, self.embed_dim] + self.qkv_weight_shape = [self.embed_dim, (self.num_heads + 2 * self.kv_num_heads) * self.head_dim] + self.linear_weight_shape = [self.num_heads * self.head_dim, self.embed_dim] + self.ffn1_weight_shape = ( + [self.embed_dim, self.dim_feedforward * 2] + if self.activation.endswith("glu") + else [self.embed_dim, self.dim_feedforward] + ) + self.ffn2_weight_shape = [self.dim_feedforward, self.embed_dim] + + def forward( + self, + input_ids, + src, + past_seq_len=None, + cur_seq_len=None, + step_idx=None, + **kwargs, + ): + for i in range(self.num_layers): + from paddlenlp_ops import xft_llama_layer + + xft_out = xft_llama_layer( + src, + self.ln_scales[i], + self.qkv_weights[i], + self.linear_weights[i], + self.ffn_ln_scales[i], + self.gate_weights[i], + self.up_weights[i], + self.ffn2_weights[i], + past_seq_len, + cur_seq_len, + step_idx, + self.hiddensize, + self.num_layers, + self._compute_type, + self.activation, + self.norm_type, + i, + self.head_dim, + self.num_heads, + self.kv_num_heads, + self.max_positions, + self.max_pos_embed, + self.intermediate_size, + ) + src = xft_out + + src = src[:, -1, :] + + return src + + +class FusedMultiTransformerA8W8(FusedMultiTransformerBase): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + self.quant_round_type = config.quant_round_type + self.quant_max_bound = config.quant_max_bound + self.quant_min_bound = config.quant_min_bound + + if self._dtype == "bfloat16": + self._fuse_kernel_compute_dtype = "bf16" + elif self._dtype == "float16": + self._fuse_kernel_compute_dtype = "fp16" + elif self._dtype == "float32": + self._fuse_kernel_compute_dtype = "fp32" + else: + raise ValueError( + "FusedMultiTransformer just support float32, float16 and bfloat16 as default dtype, but received {}".format( + self._dtype + ) + ) + + self.qkv_out_scales = [] + self.linear_out_scales = [] + self.ffn1_out_scales = [] + self.ffn2_out_scales = 
[] + + self.linear_shifts, self.linear_smooths, self.ffn2_shifts, self.ffn2_smooths = [], [], [], [] + + for i in range(self.num_layers): + qkv_out_scale_attr = self.get_attr(config.qkv_out_scale_attrs, i) + linear_out_scale_attr = self.get_attr(config.linear_out_scale_attrs, i) + ffn1_out_scale_attr = self.get_attr(config.ffn1_out_scale_attrs, i) + ffn2_out_scale_attr = self.get_attr(config.ffn2_out_scale_attrs, i) + + linear_shift_attr = self.get_attr(config.linear_shift_attrs, i) + linear_smooth_attr = self.get_attr(config.linear_smooth_attrs, i) + ffn2_shift_attr = self.get_attr(config.ffn2_shift_attrs, i) + ffn2_smooth_attr = self.get_attr(config.ffn2_smooth_attrs, i) + + qkv_out_scale = self.create_parameter( + shape=[self.head_dim * (2 * self.kv_num_heads + self.num_heads)], + attr=qkv_out_scale_attr, + dtype="float32", + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + linear_out_scale = self.create_parameter( + shape=[self.embed_dim], + attr=linear_out_scale_attr, + dtype="float32", + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + ffn1_out_scale = self.create_parameter( + shape=[self.dim_feedforward * 2] if self.activation.endswith("glu") else [self.dim_feedforward], + attr=ffn1_out_scale_attr, + dtype="float32", + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + ffn2_out_scale = self.create_parameter( + shape=[self.embed_dim], + attr=ffn2_out_scale_attr, + dtype="float32", + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + + linear_shift = None + if linear_shift_attr: + linear_shift = self.create_parameter( + shape=[self.num_heads * self.head_dim], attr=linear_shift_attr, dtype=self._dtype, is_bias=False + ) + + linear_smooth = None + if linear_smooth_attr: + linear_smooth = self.create_parameter( + shape=[self.num_heads * self.head_dim], attr=linear_smooth_attr, dtype=self._dtype, is_bias=False + ) + + ffn2_shift = None + if ffn2_shift_attr: + ffn2_shift = self.create_parameter( + shape=[self.dim_feedforward], attr=ffn2_shift_attr, dtype=self._dtype, is_bias=False + ) + + ffn2_smooth = None + if ffn2_smooth_attr: + ffn2_smooth = self.create_parameter( + shape=[self.dim_feedforward], attr=ffn2_smooth_attr, dtype=self._dtype, is_bias=False + ) + + self.qkv_out_scales.append(qkv_out_scale) + self.linear_out_scales.append(linear_out_scale) + self.ffn1_out_scales.append(ffn1_out_scale) + self.ffn2_out_scales.append(ffn2_out_scale) + + if linear_shift is not None: + self.linear_shifts.append(linear_shift) + self.linear_smooths.append(linear_smooth) + self.ffn2_shifts.append(ffn2_shift) + self.ffn2_smooths.append(ffn2_smooth) + + self._add_parameter(qkv_out_scale) + self._add_parameter(linear_out_scale) + self._add_parameter(ffn1_out_scale) + self._add_parameter(ffn2_out_scale) + + self._add_parameter(linear_shift) + self._add_parameter(linear_smooth) + self._add_parameter(ffn2_shift) + self._add_parameter(ffn2_smooth) + + def get_weight_create_dype(self): + return "int8" + + def init_weight_shape(self, config): + super().init_weight_shape(config) + + if not paddle.is_compiled_with_rocm(): + self.linear_weight_shape = [self.embed_dim, self.num_heads * self.head_dim] + self.ffn1_weight_shape = ( + [self.dim_feedforward * 2, self.embed_dim] + if self.activation.endswith("glu") + else [self.dim_feedforward, self.embed_dim] + ) + self.ffn2_weight_shape = [self.embed_dim, self.dim_feedforward] + + def compute_layernorm_before_qkv(self, src, i): + if i == 0: + ln_out = 
self.norm_func( + src, + self.ln_scales[i], + self.ln_biases[i], + self._epsilon, + begin_norm_axis=1, + quant_scale=self.act_scales["qkv_in_scale"][i], # quant_in_scale + quant_round_type=self.quant_round_type, + quant_max_bound=self.quant_max_bound, + quant_min_bound=self.quant_min_bound, + ) + else: + ln_out = src + + return ln_out + + def compute_qkv_linear(self, ln_out, i): + if paddle.is_compiled_with_rocm(): + qkv_out = paddle.matmul(ln_out, self.qkv_weights[i]) + else: + qkv_out = paddle.matmul(ln_out, self.qkv_weights[i], False, True) + return qkv_out + + def compute_fmha( + self, + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + pre_caches_length, + attn_mask, + i, + ): + qkv_out = dequant_int8(qkv_out, self.qkv_out_scales[i], self._dtype) + if self.qkv_biases[i] is not None: + qkv_out = paddle.add(qkv_out, self.qkv_biases[i]) + + bsz = input_ids.shape[0] + """ + qkv: bsz, seq_len, 3, numhead, headsize -> + q_out: bsz, numhead, seq_len, headsize + kv_out: 2, bsz, numhead, seq_len, headsize + """ + q_out, k_out, v_out = qkv_transpose_split( + qkv_out, padding_offset, seq_lens, input_ids, self.num_heads, self.head_dim + ) + + # rotary emb (inplace) + if rotary_embs is not None: + encode_rotary_qk( + q_out, + k_out, + rotary_embs, + seq_lens, + rotary_emb_dims=rotary_emb_dims, + use_neox=self.use_neox_rotary_style, + ) + + if pre_caches is not None: + k_out = paddle.concat([pre_caches[i][0, :bsz], k_out], axis=2) + v_out = paddle.concat([pre_caches[i][1, :bsz], v_out], axis=2) + + # write cache kv (inplace) + write_cache_kv(k_out, v_out, caches[i], seq_lens + pre_caches_length) + + # cutlass fmha + qktv_out = variable_length_memory_efficient_attention( + q_out, + k_out, + v_out, + seq_lens, + seq_lens + pre_caches_length, + mask=attn_mask, + scale=float(self.head_dim**-0.5), + ) + + fmha_out = transpose_remove_padding(qktv_out, seq_lens, padding_offset) + fmha_out = quant_int8( + fmha_out, + self.linear_shifts[i] if len(self.linear_shifts) > 0 else None, + self.linear_smooths[i] if len(self.linear_smooths) > 0 else None, + self.act_scales["out_linear_in_scale"][i], + self.quant_round_type, + self.quant_max_bound, + self.quant_min_bound, + ) + return fmha_out + + def compute_mmha(self, qkv_out, caches, attn_mask, seq_lens, rotary_embs, rotary_emb_dims, i): + return masked_multihead_attention( + x=qkv_out, + bias=self.qkv_biases[i], + cache_kv=caches[i], + src_mask=attn_mask, + sequence_lengths=seq_lens, + rotary_tensor=rotary_embs, + rotary_emb_dims=rotary_emb_dims, + use_neox_rotary_style=self.use_neox_rotary_style, + qkv_out_scale=self.qkv_out_scales[i], + out_shift=self.linear_shifts[i] if len(self.linear_shifts) > 0 else None, + out_smooth=self.linear_smooths[i] if len(self.linear_smooths) > 0 else None, + out_scale=self.act_scales["out_linear_in_scale"][i], + quant_round_type=self.quant_round_type, + quant_max_bound=self.quant_max_bound, + quant_min_bound=self.quant_min_bound, + compute_dtype=self._fuse_kernel_compute_dtype, + )[0] + + def compute_out_linear(self, fmha_out, i): + if paddle.is_compiled_with_rocm(): + out_linear_out = paddle.matmul(fmha_out, self.linear_weights[i]) + else: + out_linear_out = paddle.matmul(fmha_out, self.linear_weights[i], False, True) + return dequant_int8(out_linear_out, self.linear_out_scales[i], self._dtype) + + def compute_ffn_layernorm(self, out_linear_out, residual_input, i): + norm_out = self.norm_func( + out_linear_out, + self.ffn_ln_scales[i], + self.ffn_ln_biases[i], + 
self._epsilon, + bias=self.linear_biases[i], + residual=residual_input, + begin_norm_axis=1, + quant_scale=self.act_scales["ffn1_in_scale"][i], # quant_in_scale + quant_round_type=self.quant_round_type, + quant_max_bound=self.quant_max_bound, + quant_min_bound=self.quant_min_bound, + ) + tmp_out, residual_input = norm_out[0], norm_out[1] + + return tmp_out, residual_input + + def compute_activation(self, ffn1_out, i): + return fused_act_bias_wrapper( + ffn1_out, + self.ffn1_biases[i], + act_method=self.activation, + compute_dtype=self._fuse_kernel_compute_dtype, + dequant_scales=self.ffn1_out_scales[i], + shift=self.ffn2_shifts[i] if len(self.ffn2_shifts) > 0 else None, + smooth=self.ffn2_smooths[i] if len(self.ffn2_smooths) > 0 else None, + quant_scale=self.act_scales["ffn2_in_scale"][i], + quant_round_type=self.quant_round_type, + quant_max_bound=self.quant_max_bound, + quant_min_bound=self.quant_min_bound, + ) + + def compute_ffn1(self, tmp_out, i): + if paddle.device.is_compiled_with_rocm(): + return paddle.matmul(tmp_out, self.ffn1_weights[i]) + else: + return paddle.matmul(tmp_out, self.ffn1_weights[i], False, True) + + def compute_ffn2(self, ffn1_out, i): + if paddle.device.is_compiled_with_rocm(): + ffn2_out = paddle.matmul(ffn1_out, self.ffn2_weights[i]) + else: + ffn2_out = paddle.matmul(ffn1_out, self.ffn2_weights[i], False, True) + ffn2_out = dequant_int8(ffn2_out, self.ffn2_out_scales[i], self._dtype) + return ffn2_out + + def compute_bias_residual_layernorm(self, ffn2_out, residual_input, i, num_layers): + if i != num_layers - 1: + norm_out = self.norm_func( + ffn2_out, + self.ln_scales[i + 1], + self.ln_biases[i + 1], + self._epsilon, + residual=residual_input, + begin_norm_axis=1, + quant_scale=self.act_scales["qkv_in_scale"][i + 1], + quant_round_type=self.quant_round_type, + quant_max_bound=self.quant_max_bound, + quant_min_bound=self.quant_min_bound, + ) + tmp_out, residual_input = norm_out[0], norm_out[1] + else: + tmp_out = fused_layer_norm( + ffn2_out, + norm_weight=None, + norm_bias=None, + epsilon=self._epsilon, + begin_norm_axis=1, + bias=self.ffn2_biases[i], + residual=residual_input, + )[0] + return tmp_out, residual_input + + +class FusedBlockMultiTransformer(FusedMultiTransformerBase): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + if core.is_compiled_with_xpu(): + self.cache_k_per_batch_maxs = paddle.full(shape=[10, 6], fill_value=0, dtype="float32") + self.cache_v_per_batch_maxs = paddle.full(shape=[10, 6], fill_value=0, dtype="float32") + + def compute_attn( + self, + time_step, + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + pre_caches_length, + attn_mask, + i, + **kwargs, + ): + k_quant_scales = kwargs.get("k_quant_scales", None) + v_quant_scales = kwargs.get("v_quant_scales", None) + k_dequant_scales = kwargs.get("k_dequant_scales", None) + v_dequant_scales = kwargs.get("v_dequant_scales", None) + + if self.config.cachekv_int8_type == "static": + k_quant_scales = self.cache_k_scales + v_quant_scales = self.cache_v_scales + k_dequant_scales = self.cache_k_out_scales + v_dequant_scales = self.cache_v_out_scales + if core.is_compiled_with_xpu(): + fmha_out = paddle.incubate.nn.functional.block_multihead_attention_xpu( + qkv_out, + caches[2 * i], + caches[2 * i + 1], + kwargs.get("seq_lens_encoder", None), + kwargs.get("seq_lens_decoder", None), + kwargs.get("seq_lens_this_time", None), + kwargs.get("padding_offsets", None), + kwargs.get("cum_offsets", 
None), + kwargs.get("cu_seqlens_q", None), + kwargs.get("cu_seqlens_k", None), + kwargs.get("block_tables", None), + self.cache_k_per_batch_maxs, + self.cache_v_per_batch_maxs, + pre_caches[2 * i] if pre_caches is not None else None, # pre_key_cache + pre_caches[2 * i + 1] if pre_caches is not None else None, # pre_value_cache + k_quant_scales[i] if k_quant_scales is not None else None, + v_quant_scales[i] if v_quant_scales is not None else None, + k_dequant_scales[i] if k_dequant_scales is not None else None, + v_dequant_scales[i] if v_dequant_scales is not None else None, + None, # qkv_out_scales + None, # qkv_bias + None, # out_shifts + None, # out_smooths + kwargs.get("max_enc_len_this_time", None), + kwargs.get("max_dec_len_this_time", None), + rotary_embs, + attn_mask, + kwargs.get("tgt_mask", None), + kwargs.get("max_input_length", -1), + kwargs.get("block_size", 64), + self.use_neox_rotary_style, + self.config.cachekv_int8_type == "dynamic", + quant_round_type=self.config.quant_round_type, + quant_max_bound=self.config.quant_max_bound, + quant_min_bound=self.config.quant_min_bound, + )[0] + else: + fmha_out = paddle.incubate.nn.functional.block_multihead_attention( + qkv_out, + caches[2 * i], + caches[2 * i + 1], + kwargs.get("seq_lens_encoder", None), + kwargs.get("seq_lens_decoder", None), + kwargs.get("seq_lens_this_time", None), + kwargs.get("padding_offsets", None), + kwargs.get("cum_offsets", None), + kwargs.get("cu_seqlens_q", None), + kwargs.get("cu_seqlens_k", None), + kwargs.get("block_tables", None), + pre_caches[2 * i] if pre_caches is not None else None, # pre_key_cache + pre_caches[2 * i + 1] if pre_caches is not None else None, # pre_value_cache + k_quant_scales[i] if k_quant_scales is not None else None, + v_quant_scales[i] if v_quant_scales is not None else None, + k_dequant_scales[i] if k_dequant_scales is not None else None, + v_dequant_scales[i] if v_dequant_scales is not None else None, + None, # qkv_out_scales + None, # qkv_bias + None, # out_shifts + None, # out_smooths + kwargs.get("max_enc_len_this_time", None), + kwargs.get("max_dec_len_this_time", None), + rotary_embs, + attn_mask, + kwargs.get("tgt_mask", None), + kwargs.get("max_input_length", -1), + kwargs.get("block_size", 64), + self.use_neox_rotary_style, + self.config.cachekv_int8_type == "dynamic", + quant_round_type=self.config.quant_round_type, + quant_max_bound=self.config.quant_max_bound, + quant_min_bound=self.config.quant_min_bound, + )[0] + out_linear_out = self.compute_out_linear(fmha_out, i) + + return out_linear_out + + def post_process(self, **kwargs): + multi_block_output = kwargs.get("multi_block_output", None) + cum_offsets = kwargs.get("cum_offsets", None) + seq_lens_encoder = kwargs.get("seq_lens_encoder", None) + seq_lens_decoder = kwargs.get("seq_lens_decoder", None) + max_input_length = kwargs.get("max_input_length", -1) + + out = rebuild_padding_v2(multi_block_output, cum_offsets, seq_lens_decoder, seq_lens_encoder, max_input_length) + + return out + + +class FusedBlockMultiTransformerWeightOnly(FusedBlockMultiTransformer, FusedMultiTransformerWeightOnly): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + + +class FusedBlockMultiTransformerA8W8(FusedBlockMultiTransformer, FusedMultiTransformerA8W8): + def __init__(self, config: FusedMultiTransformerConfig): + super().__init__(config) + + def compute_attn( + self, + time_step, + qkv_out, + padding_offset, + seq_lens, + input_ids, + rotary_embs, + rotary_emb_dims, + caches, + pre_caches, + 
pre_caches_length, + attn_mask, + i, + **kwargs, + ): + k_quant_scales = kwargs.get("k_quant_scales", None) + v_quant_scales = kwargs.get("v_quant_scales", None) + k_dequant_scales = kwargs.get("k_dequant_scales", None) + v_dequant_scales = kwargs.get("v_dequant_scales", None) + + if self.config.cachekv_int8_type == "static": + k_quant_scales = self.cache_k_scales + v_quant_scales = self.cache_v_scales + k_dequant_scales = self.cache_k_out_scales + v_dequant_scales = self.cache_v_out_scales + + fmha_out = paddle.incubate.nn.functional.block_multihead_attention( + qkv_out, + caches[2 * i], + caches[2 * i + 1], + kwargs.get("seq_lens_encoder", None), + kwargs.get("seq_lens_decoder", None), + kwargs.get("seq_lens_this_time", None), + kwargs.get("padding_offsets", None), + kwargs.get("cum_offsets", None), + kwargs.get("cu_seqlens_q", None), + kwargs.get("cu_seqlens_k", None), + kwargs.get("block_tables", None), + pre_caches[2 * i] if pre_caches is not None else None, # pre_key_cache + pre_caches[2 * i + 1] if pre_caches is not None else None, # pre_value_cache + k_quant_scales[i] if k_quant_scales is not None else None, + v_quant_scales[i] if v_quant_scales is not None else None, + k_dequant_scales[i] if k_dequant_scales is not None else None, + v_dequant_scales[i] if v_dequant_scales is not None else None, + self.qkv_out_scales[i], + self.qkv_biases[i] if len(self.qkv_biases) > 0 else None, + self.linear_shifts[i] if len(self.linear_shifts) > 0 else None, + self.linear_smooths[i] if len(self.linear_smooths) > 0 else None, + kwargs.get("max_enc_len_this_time", None), + kwargs.get("max_dec_len_this_time", None), + rotary_embs, + attn_mask, + kwargs.get("tgt_mask", None), + kwargs.get("max_input_length", -1), + kwargs.get("block_size", 64), + self.use_neox_rotary_style, + self.config.cachekv_int8_type == "dynamic", + quant_round_type=self.quant_round_type, + quant_max_bound=self.quant_max_bound, + quant_min_bound=self.quant_min_bound, + out_scale=self.act_scales["out_linear_in_scale"][i], + compute_dtype=self._fuse_kernel_compute_dtype, + )[0] + + out_linear_out = self.compute_out_linear(fmha_out, i) + + return out_linear_out diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/generation_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/generation_utils.py new file mode 100644 index 000000000..b95653bbd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/generation_utils.py @@ -0,0 +1,1004 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
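The weight-only path above (FusedMultiTransformerWeightOnly and FusedBlockMultiTransformerWeightOnly) assumes every projection weight arrives pre-quantized together with a per-channel scale. The following is a minimal, stand-alone sketch of how such a weight/scale pair can be produced and consumed with Paddle's public quantization helpers; it is not part of the patch, the shapes are toy values, and a CUDA build of Paddle is assumed since the quantize and weight-only GEMM kernels are GPU ops.

import paddle
from paddle.nn.quant import weight_only_linear, weight_quantize

paddle.set_default_dtype("float16")
x = paddle.randn([4, 64], dtype="float16")    # [num_tokens, embed_dim]
w = paddle.randn([64, 192], dtype="float16")  # toy dense weight, e.g. a fused QKV projection

# Quantize offline: returns the packed int8 weight plus a per-output-channel scale,
# the same pair the layer keeps in qkv_weights[i] / qkv_weights_scale[i].
qweight, scale = weight_quantize(w, algo="weight_only_int8")

# At run time the fused layer feeds that pair to weight_only_linear, as in
# FusedMultiTransformerWeightOnly.compute_qkv_linear above.
y = weight_only_linear(x, qweight, weight_scale=scale, weight_dtype="int8")
print(y.shape)  # [4, 192]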
+from __future__ import annotations + +from typing import List, Union + +import paddle +import paddle.nn.functional as F + +from paddlenlp.generation import ( + GenerationMixin, + LogitsProcessor, + LogitsProcessorList, + TopPProcess, +) + +__all__ = ["GenerationInferenceModel", "GenerationBlockInferenceModel", "GenerationAvxInferenceModel"] + + +class ForcedDecodingEOSTokenLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces the last generated token to be the selected `forced_eos_token`. + + Args: + max_length (int): The maximum length of the sequence to be generated. + forced_eos_token_id (int): The id of the token to be generated as the last token. + """ + + def __init__(self, max_decoding_step: int, forced_eos_token_id: Union[int, List[int]]): + self.max_decoding_step = max_decoding_step + self.forced_eos_token_id = forced_eos_token_id + + def __call__(self, input_ids, scores, decoding_step): + if decoding_step == self.max_decoding_step: + scores[:] = paddle.finfo(scores.dtype).min + scores[:, self.forced_eos_token_id] = 0 + return scores + + +class GenerationInferenceModel(GenerationMixin): + @classmethod + def get_cache_kvs_shape(cls, max_batch_size: int = None, max_length: int = None) -> list[list[int]]: + raise NotImplementedError + + def to_static(self, output_path: str, config: dict): + dtype = config.get("dtype", paddle.get_default_dtype()) + + cache_kvs_shapes = self.get_cache_kvs_shape(self.config, max_length=config.get("max_length", None)) + export_precache = config.get("export_precache", False) + if export_precache: + precache_input_spec = [ + paddle.static.InputSpec(shape=[2, None, None, None, None], dtype=dtype, name=f"pre_caches_{i}") + for i in range(len(cache_kvs_shapes)) + ] + else: + precache_input_spec = None + + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids + paddle.static.InputSpec(shape=[None, 1, None, None], dtype=dtype, name="attention_mask"), # attention_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), # position_ids + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p + paddle.static.InputSpec(shape=[None], dtype="int64", name="eos_token_id"), # eos_token_id + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_encoder"), # seq_len_encoder + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_decoder"), # seq_len_decoder + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_ids"), # tgt_ids + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_pos"), # tgt_pos + paddle.static.InputSpec( + shape=[None, 1, 1, None], dtype=dtype, name="tgt_generation_mask" + ), # tgt_generation_mask + 
paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids + paddle.static.InputSpec(shape=[1], dtype="int64", name="stop_nums"), # stop_nums + [ + paddle.static.InputSpec( + shape=shape, + dtype=dtype, + name="cache_kvs_{}".format(i), + ) + for i, shape in enumerate(cache_kvs_shapes) + ], # cache_kvs + None, # inputs_embeds + config.get("logits_processors", None), + precache_input_spec, + ] + # use "==" to distingusih between chatglm and chatglm_v2. + if self.config["model_type"] and "chatglm" == self.config.model_type.lower(): + input_spec[2] = paddle.static.InputSpec( + shape=[None, None, None], dtype="int64", name="position_ids" + ) # position_ids + input_spec[16] = paddle.static.InputSpec(shape=[None, 2, 1], dtype="int64", name="tgt_pos") # tgt_pos + elif self.config["model_type"] and "gpt" in self.config.model_type: + input_spec[2] = paddle.static.InputSpec(shape=[None], dtype="int64", name="position_ids") # position_ids + model = paddle.jit.to_static(self.generate, input_spec=input_spec) + paddle.jit.save( + model, output_path, skip_prune_program=True + ) # Note(Zhengzekang): If we prune program it may cause some inference error. + + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + seq_len = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + seq_len = encoder_output.shape[1] + return paddle.ones([batch_size, seq_len], dtype="int64") * bos_token_id + + @paddle.no_grad() + def generate( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + penalty_score=None, + frequency_score=None, + presence_score=None, + min_length=None, + max_length=None, + temperature=None, + top_p=None, + eos_token_id=None, + seq_len_encoder=None, + seq_len_decoder=None, + step_idx=None, + stop_flags=None, + tgt_ids=None, + tgt_pos=None, + tgt_generation_mask=None, + pre_ids=None, + stop_nums=None, + cache_kvs=[], + inputs_embeds=None, + logits_processors=None, + pre_caches=None, + **model_kwargs, + ): + model_kwargs["position_ids"] = position_ids + model_kwargs["attention_mask"] = attention_mask + + model_kwargs["seq_len_encoder"] = seq_len_encoder + model_kwargs["seq_len_decoder"] = seq_len_decoder + model_kwargs["tgt_ids"] = tgt_ids + model_kwargs["tgt_generation_mask"] = tgt_generation_mask + model_kwargs["tgt_pos"] = tgt_pos + model_kwargs["step_idx"] = step_idx + model_kwargs["stop_flags"] = stop_flags + model_kwargs["pre_ids"] = pre_ids + model_kwargs["min_dec_len"] = min_length + model_kwargs["max_dec_len"] = max_length + model_kwargs["stop_nums"] = stop_nums + model_kwargs["penalty_score"] = penalty_score + model_kwargs["frequency_score"] = frequency_score + model_kwargs["presence_score"] = presence_score + model_kwargs["logits_processors"] = logits_processors or LogitsProcessorList() + if pre_caches is not None: + model_kwargs["pre_caches"] = pre_caches + + ret = self.sample( + input_ids, + eos_token_id, + top_p=top_p, + cache_kvs=cache_kvs, + temperature=temperature, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) + + return ret + + def update_model_kwargs_for_generation(self, cache, just_decoder, next_tokens, eos_token_id, model_kwargs): + if cache is None: + model_kwargs["step_idx"] = paddle.where( + model_kwargs["seq_len_encoder"] == 0, + model_kwargs["step_idx"], + model_kwargs["step_idx"] + 1, + ) + else: + model_kwargs["step_idx"] = paddle.where( 
+ model_kwargs["stop_flags"], + model_kwargs["step_idx"], + model_kwargs["step_idx"] + 1, + ) + length_cond = paddle.greater_equal(model_kwargs["step_idx"], model_kwargs["max_dec_len"]) + model_kwargs["stop_flags"] = paddle.logical_or(model_kwargs["stop_flags"], length_cond) + if cache is None: + next_tokens = paddle.where(just_decoder, paddle.full_like(next_tokens, -1), next_tokens) + from paddlenlp_ops import set_stop_value_multi_ends + + next_tokens, model_kwargs["stop_flags"] = set_stop_value_multi_ends( + next_tokens, model_kwargs["stop_flags"], eos_token_id, 2 + ) # multi ends + + if cache is None: + # encoder's generation + model_kwargs["tgt_ids"] = paddle.where(just_decoder, model_kwargs["tgt_ids"], next_tokens) + if self.config["position_encoding_2d"] and self.config.position_encoding_2d is True: + tgt_pos = model_kwargs["tgt_pos"] + new_position_id = tgt_pos[:, 0, :].clone() + new_block_id = tgt_pos[:, 1, :].clone() + new_block_id = new_block_id + 1 + + model_kwargs["tgt_pos"] = paddle.concat( + [new_position_id.unsqueeze(1), new_block_id.unsqueeze(1)], axis=1 + ) + else: + model_kwargs["tgt_pos"] = paddle.where( + just_decoder, model_kwargs["tgt_pos"], model_kwargs["tgt_pos"] + 1 + ) + model_kwargs["seq_len_decoder"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["seq_len_decoder"] - model_kwargs["seq_len_decoder"], + model_kwargs["seq_len_decoder"], + ) + else: + model_kwargs["tgt_ids"] = next_tokens + if self.config["position_encoding_2d"] and self.config.position_encoding_2d is True: + tgt_pos = model_kwargs["tgt_pos"] + new_position_id = tgt_pos[:, 0, :].clone() + new_block_id = tgt_pos[:, 1, :].clone() + new_block_id = new_block_id + 1 + + model_kwargs["tgt_pos"] = paddle.concat( + [new_position_id.unsqueeze(1), new_block_id.unsqueeze(1)], axis=1 + ) + else: + model_kwargs["tgt_pos"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["tgt_pos"], + model_kwargs["tgt_pos"] + 1, + ) + + model_kwargs["seq_len_decoder"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["seq_len_decoder"], + model_kwargs["seq_len_decoder"] + 1, + ) + + model_kwargs["seq_len_decoder"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["seq_len_decoder"] - model_kwargs["seq_len_decoder"], + model_kwargs["seq_len_decoder"], + ) + + model_kwargs["next_tokens"] = next_tokens + return model_kwargs + + def sample( + self, + input_ids=None, + eos_token_id=None, + cache_kvs=[], + top_p=None, + temperature=None, + inputs_embeds=None, + **model_kwargs, + ): + step_idx_ori = paddle.full(shape=[1], dtype="int64", fill_value=1) + batch_idx = paddle.full(shape=[1], dtype="int32", fill_value=-1) + + # fake temp next_tokens + batch = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] + next_tokens = paddle.full(shape=[batch, 1], dtype="int32", fill_value=0) + + # let inputs_embeds enter into model_kwargs. + # because the code below directly use the model_kwargs as a parameter without using inputs_embeds. + if inputs_embeds is not None: + model_kwargs["inputs_embeds"] = inputs_embeds + model_kwargs["all_input_ids"] = input_ids + logits_processors = model_kwargs.pop("logits_processors") + + def _forward_(**args): + # cache_kvs is never empty because it is passed as a parameter in def sample. 
+ model_inputs = self.prepare_inputs_for_generation(input_ids, cache_kvs, **args) + return self(**model_inputs) + + def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs): + cache = model_kwargs.get("cache", None) + just_decoder = model_kwargs["seq_len_encoder"] == 0 + if cache is None: # first decoder + step_idx = paddle.where( + just_decoder, + paddle.full_like(model_kwargs["step_idx"], -1), + model_kwargs["step_idx"], + ) # not update when continue decode + else: + step_idx = model_kwargs["step_idx"] + from paddlenlp_ops import set_value_by_flags_and_idx + + model_kwargs["stop_flags"] = set_value_by_flags_and_idx( + model_kwargs["pre_ids"], + model_kwargs["tgt_ids"], + step_idx, + model_kwargs["stop_flags"], + ) + logits = outputs[0] if isinstance(outputs, tuple) else outputs + + logits = paddle.cast(logits, paddle.float32) + logits = logits_processors(model_kwargs["all_input_ids"], logits, decoding_step=step_idx_ori) + + from paddlenlp_ops import get_token_penalty_multi_scores + + logits = get_token_penalty_multi_scores( + model_kwargs["pre_ids"], + logits, + model_kwargs["penalty_score"], + model_kwargs["frequency_score"], + model_kwargs["presence_score"], + step_idx, + model_kwargs["min_dec_len"], + eos_token_id, + ) + logits = logits / temperature + + # sample + probs = F.softmax(logits) + + # compute next_tokens, use paddle.tensor.top_p_sampling + _, next_tokens = paddle.tensor.top_p_sampling(probs, top_p) + + if self.config.tensor_parallel_degree > 1: + paddle.distributed.broadcast(next_tokens, 0) + + model_kwargs = self.update_model_kwargs_for_generation( + cache, just_decoder, next_tokens, eos_token_id, model_kwargs + ) + next_tokens = model_kwargs["next_tokens"] + + if model_kwargs["all_input_ids"] is None: + model_kwargs["all_input_ids"] = next_tokens + else: + model_kwargs["all_input_ids"] = paddle.concat([model_kwargs["all_input_ids"], next_tokens], axis=1) + + from paddlenlp_ops import save_with_output + + save_with_output( + next_tokens, + batch_idx, + step_idx_ori, + "real_time_save.temp_ids", + self.config.tensor_parallel_rank, + ) + + return next_tokens, model_kwargs + + # encoder + outputs = _forward_(**model_kwargs) + # first decoder + next_tokens, model_kwargs = _post_process_( + outputs, + top_p, + temperature, + step_idx_ori, + model_kwargs, + ) + step_idx_ori += 1 + + # gives it a value, means we will entered into decoder phase. 
+ model_kwargs["cache"] = 0 + + # decoder + while paddle.less_than( + paddle.sum(paddle.cast(model_kwargs["stop_flags"], "int64")), + model_kwargs["stop_nums"], + ): + next_tokens, model_kwargs = _post_process_( + _forward_(**model_kwargs), + top_p, + temperature, + step_idx_ori, + model_kwargs, + ) + step_idx_ori += 1 + + return ( + next_tokens, + model_kwargs["step_idx"], + paddle.cast(model_kwargs["stop_flags"], "int32"), + model_kwargs["seq_len_decoder"], + model_kwargs["tgt_pos"], + ) + + +class GenerationBlockInferenceModel(GenerationMixin): + @classmethod + def get_cache_kvs_shape(cls, max_batch_size: int = None, max_length: int = None) -> list[list[int]]: + raise NotImplementedError + + def to_static(self, output_path: str, config: dict): + dtype = config.get("dtype", paddle.get_default_dtype()) + cachekv_dtype = dtype + + cache_kvs_shapes = self.get_cache_kvs_shape( + self.config, max_batch_size=config.get("max_batch_size", -1), max_length=config.get("max_length", None) + ) + export_precache = config.get("export_precache", False) + if export_precache: + precache_kv_spec = [ + paddle.static.InputSpec(shape=[None, None, None, None], dtype=dtype, name=f"pre_caches_{i}") + for i in range(len(cache_kvs_shapes)) + ] + else: + precache_kv_spec = None + cachekv_int8_type = config.get("cachekv_int8_type", "None") + + if cachekv_int8_type is not None: + cachekv_dtype = "uint8" + + if cachekv_int8_type == "dynamic": + cache_k_quant_scales = [ + paddle.static.InputSpec( + shape=[None, self.config.num_attention_heads], + dtype="float32", + name="k_quant_scales_{}".format(i), + ) + for i in range(int(len(cache_kvs_shapes) / 2)) + ] + + cache_v_quant_scales = [ + paddle.static.InputSpec( + shape=[None, self.config.num_attention_heads], + dtype="float32", + name="v_quant_scales_{}".format(i), + ) + for i in range(int(len(cache_kvs_shapes) / 2)) + ] + + cache_k_dequant_scales = [ + paddle.static.InputSpec( + shape=[None, self.config.num_attention_heads], + dtype="float32", + name="k_dequant_scales_{}".format(i), + ) + for i in range(int(len(cache_kvs_shapes) / 2)) + ] + cache_v_dequant_scales = [ + paddle.static.InputSpec( + shape=[None, self.config.num_attention_heads], + dtype="float32", + name="v_dequant_scales_{}".format(i), + ) + for i in range(int(len(cache_kvs_shapes) / 2)) + ] + else: + cache_k_quant_scales = None + cache_v_quant_scales = None + cache_k_dequant_scales = None + cache_v_dequant_scales = None + + caches = [] + for i in range(len(cache_kvs_shapes) // 2): + caches.append( + paddle.static.InputSpec( + shape=cache_kvs_shapes[2 * i], dtype=cachekv_dtype, name="key_caches_{}".format(i) + ) + ) + caches.append( + paddle.static.InputSpec( + shape=cache_kvs_shapes[2 * i + 1], dtype=cachekv_dtype, name="value_caches_{}".format(i) + ) + ) + if export_precache: + src_mask_spec = paddle.static.InputSpec(shape=[None, 1, None, None], dtype=dtype, name="src_mask") + else: + src_mask_spec = None + + # bloom model needs src_mask and tgt_mask! 
+ if "bloom" in self.config.architectures[0].lower(): + src_mask_spec = paddle.static.InputSpec(shape=[None, None, None, None], dtype=dtype, name="src_mask") + tgt_mask_spec = paddle.static.InputSpec(shape=[None, None, 1, None], dtype=dtype, name="tgt_mask") + else: + tgt_mask_spec = None + + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p + paddle.static.InputSpec(shape=[None], dtype="int64", name="eos_token_id"), # eos_token_id + src_mask_spec, # src_mask + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="next_tokens"), # next_tokens + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="is_block_step"), # is_block_step + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_lens_this_time"), # seq_lens_this_time + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_lens_encoder"), # seq_lens_encoder + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_lens_decoder"), # seq_lens_decoder + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags + paddle.static.InputSpec( + shape=[2, None, self.config.max_seq_len, None, None], dtype="float32", name="rope_emb" + ), # rope_emb + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_dec_len + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_dec_len + paddle.static.InputSpec(shape=[1, 1], dtype="int64", name="stop_nums"), # stop_nums + paddle.static.InputSpec(shape=[None], dtype="int64", name="bad_tokens"), # bad_tokens + paddle.static.InputSpec(shape=[1, 1], dtype="bool", name="not_need_stop"), # not_need_stop + paddle.static.InputSpec(shape=[None, None], dtype="int32", name="block_tables"), # block_tables + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids + precache_kv_spec, + caches, # cache_kvs + cache_k_quant_scales, + cache_v_quant_scales, + cache_k_dequant_scales, + cache_v_dequant_scales, + tgt_mask_spec, + ] + model = paddle.jit.to_static(self.generate, input_spec=input_spec) + paddle.jit.save( + model, output_path, skip_prune_program=True + ) # Note(Zhengzekang): If we prune program it may cause some inference error. 
+ + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + seq_len = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + seq_len = encoder_output.shape[1] + return paddle.ones([batch_size, seq_len], dtype="int64") * bos_token_id + + @paddle.no_grad() + def generate( + self, + input_ids=None, + temperature=None, + top_p=None, + eos_token_id=None, + src_mask=None, + penalty_score=None, + frequency_score=None, + presence_score=None, + next_tokens=None, + is_block_step=None, + seq_lens_this_time=None, # update + seq_lens_encoder=None, # update + seq_lens_decoder=None, # update + step_idx=None, + stop_flags=None, + rope_emb=None, + min_length=None, + max_length=None, + stop_nums=None, + bad_tokens=None, + not_need_stop=None, + block_tables=None, + pre_ids=None, + pre_caches=None, + cache_kvs=[], + k_quant_scales=None, + v_quant_scales=None, + k_dequant_scales=None, + v_dequant_scales=None, + tgt_mask=None, + **model_kwargs, + ): + + model_kwargs["input_ids"] = input_ids + model_kwargs["penalty_score"] = penalty_score + model_kwargs["frequency_score"] = frequency_score + model_kwargs["presence_score"] = presence_score + model_kwargs["seq_lens_this_time"] = seq_lens_this_time + model_kwargs["seq_lens_encoder"] = seq_lens_encoder + model_kwargs["seq_lens_decoder"] = seq_lens_decoder + model_kwargs["step_idx"] = step_idx + model_kwargs["stop_flags"] = stop_flags + model_kwargs["min_dec_len"] = min_length + model_kwargs["max_dec_len"] = max_length + model_kwargs["stop_nums"] = stop_nums + model_kwargs["rope_emb"] = rope_emb + model_kwargs["bad_tokens"] = bad_tokens + model_kwargs["block_tables"] = block_tables + model_kwargs["pre_ids"] = pre_ids + model_kwargs["not_need_stop"] = not_need_stop + model_kwargs["caches"] = cache_kvs + model_kwargs["k_quant_scales"] = k_quant_scales + model_kwargs["v_quant_scales"] = v_quant_scales + model_kwargs["k_dequant_scales"] = k_dequant_scales + model_kwargs["v_dequant_scales"] = v_dequant_scales + model_kwargs["pre_caches"] = pre_caches + model_kwargs["next_tokens"] = next_tokens + model_kwargs["is_block_step"] = is_block_step + model_kwargs["src_mask"] = src_mask + model_kwargs["tgt_mask"] = tgt_mask + + ret = self.sample( + eos_token_id, + top_k=0, + top_p=top_p, + temperature=temperature, + **model_kwargs, + ) + return ret + + def sample( + self, + eos_token_id, + top_k, + top_p, + penalty_score, + frequency_score, + presence_score, + temperature=None, + min_tokens_to_keep=1, + **model_kwargs + ): + def _forward_(**args): + model_inputs = self.prepare_inputs_for_generation(**args) + return self(**model_inputs) + + def _post_process_( + outputs, + top_k, + top_p, + penalty_score, + frequency_score, + presence_score, + temperature, + model_kwargs, + ): + step_idx = model_kwargs["step_idx"] + from paddlenlp_ops import set_value_by_flags_and_idx_v2 + + set_value_by_flags_and_idx_v2( + model_kwargs["pre_ids"], + model_kwargs["input_ids"], + model_kwargs["seq_lens_this_time"], + model_kwargs["seq_lens_encoder"], + model_kwargs["seq_lens_decoder"], + step_idx, + model_kwargs["stop_flags"], + ) + + logits = paddle.cast(outputs, paddle.float32) + + # pre-process distribution + from paddlenlp_ops import get_token_penalty_multi_scores_v2 + + logits = get_token_penalty_multi_scores_v2( + model_kwargs["pre_ids"], + logits, + penalty_score, + frequency_score, + presence_score, + 
temperature, + model_kwargs["bad_tokens"], + step_idx, + model_kwargs["min_dec_len"], + eos_token_id, + ) + + # sample + probs = F.softmax(logits) + _, next_tokens = paddle.tensor.top_p_sampling(probs, top_p) + + if self.config.tensor_parallel_degree > 1: + paddle.distributed.broadcast(next_tokens, 0) + + step_idx = paddle.where(model_kwargs["stop_flags"], model_kwargs["step_idx"], model_kwargs["step_idx"] + 1) + paddle.assign(step_idx, model_kwargs["step_idx"]) + length_cond = paddle.greater_equal(step_idx, model_kwargs["max_dec_len"]) + stop_flags = paddle.logical_or(model_kwargs["stop_flags"], length_cond) + from paddlenlp_ops import set_stop_value_multi_ends_v2 + + set_stop_value_multi_ends_v2( + next_tokens, stop_flags, model_kwargs["seq_lens_this_time"], eos_token_id, model_kwargs["next_tokens"] + ) # multi ends + paddle.assign(stop_flags, model_kwargs["stop_flags"]) + # update inputs + from paddlenlp_ops import update_inputs + + update_inputs( + stop_flags, + model_kwargs["not_need_stop"], + model_kwargs["seq_lens_this_time"], + model_kwargs["seq_lens_encoder"], + model_kwargs["seq_lens_decoder"], + model_kwargs["input_ids"], + model_kwargs["stop_nums"], + next_tokens, + model_kwargs["is_block_step"], + ) + from paddlenlp_ops import save_output + + save_output(next_tokens, model_kwargs["not_need_stop"], self.config.tensor_parallel_rank) + return next_tokens + + # encoder + outputs = _forward_(**model_kwargs) # [bs, 1, dim_embed] + # first decoder + next_tokens = _post_process_( + outputs, + top_k, + top_p, + penalty_score, + frequency_score, + presence_score, + temperature, + model_kwargs, + ) + + return next_tokens + + +class GenerationAvxInferenceModel(GenerationMixin): + @classmethod + def get_cache_kvs_shape(cls, max_batch_size: int = None, max_length: int = None) -> list[list[int]]: + raise NotImplementedError + + def to_static(self, output_path: str, config: dict): + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids + None, # attention_mask + None, # position_ids + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p + paddle.static.InputSpec(shape=[None], dtype="int64", name="eos_token_id"), # eos_token_id + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_encoder"), # seq_len_encoder + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_decoder"), # seq_len_decoder + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_ids"), # tgt_ids + None, # tgt_pos + None, # tgt_generation_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids + paddle.static.InputSpec(shape=[1], dtype="int64", name="stop_nums"), # stop_nums + None, # cache_kvs + None, # 
inputs_embeds + config.get("logits_processors", None), + None, + ] + model = paddle.jit.to_static(self.generate, input_spec=input_spec) + paddle.jit.save( + model, output_path, skip_prune_program=True + ) # Note(Zhengzekang): If we prune program it may cause some inference error. + + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + seq_len = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + seq_len = encoder_output.shape[1] + return paddle.ones([batch_size, seq_len], dtype="int64") * bos_token_id + + @paddle.no_grad() + def generate( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + penalty_score=None, + frequency_score=None, + presence_score=None, + min_length=None, + max_length=None, + temperature=None, + top_p=None, + eos_token_id=None, + seq_len_encoder=None, + seq_len_decoder=None, + step_idx=None, + stop_flags=None, + tgt_ids=None, + tgt_pos=None, + tgt_generation_mask=None, + pre_ids=None, + stop_nums=None, + cache_kvs=[], + inputs_embeds=None, + logits_processors=None, + pre_caches=None, + **model_kwargs, + ): + model_kwargs["seq_len_encoder"] = seq_len_encoder + model_kwargs["seq_len_decoder"] = seq_len_decoder + model_kwargs["tgt_ids"] = tgt_ids + model_kwargs["step_idx"] = step_idx + model_kwargs["stop_flags"] = stop_flags + model_kwargs["pre_ids"] = pre_ids + model_kwargs["min_dec_len"] = min_length + model_kwargs["max_dec_len"] = max_length + model_kwargs["stop_nums"] = stop_nums + model_kwargs["penalty_score"] = penalty_score + model_kwargs["frequency_score"] = frequency_score + model_kwargs["presence_score"] = presence_score + model_kwargs["logits_processors"] = logits_processors or LogitsProcessorList() + + ret = self.sample( + input_ids, + eos_token_id, + top_p=top_p, + cache_kvs=cache_kvs, + temperature=temperature, + inputs_embeds=inputs_embeds, + **model_kwargs, + ) + return ret + + def update_model_kwargs_for_generation(self, cache, just_decoder, next_tokens, eos_token_id, model_kwargs): + if cache is None: + # llama step_idx ++ + model_kwargs["step_idx"] = paddle.where( + model_kwargs["seq_len_encoder"] == 0, + model_kwargs["step_idx"], + model_kwargs["step_idx"] + 1, + ) + else: + model_kwargs["step_idx"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["step_idx"], + model_kwargs["step_idx"] + 1, + ) + + length_cond = paddle.greater_equal(model_kwargs["step_idx"], model_kwargs["max_dec_len"]) + model_kwargs["stop_flags"] = paddle.logical_or(model_kwargs["stop_flags"], length_cond) + if cache is None: + next_tokens = paddle.where(just_decoder, paddle.full_like(next_tokens, -1), next_tokens) + from paddlenlp_ops import set_stop_value_multi_ends + + next_tokens, model_kwargs["stop_flags"] = set_stop_value_multi_ends( + next_tokens, model_kwargs["stop_flags"], eos_token_id + ) # multi ends + + if cache is None: + # encoder's generation + model_kwargs["tgt_ids"] = paddle.where(just_decoder, model_kwargs["tgt_ids"], next_tokens) + model_kwargs["seq_len_decoder"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["seq_len_decoder"] - model_kwargs["seq_len_decoder"], + model_kwargs["seq_len_decoder"], + ) + else: + model_kwargs["tgt_ids"] = next_tokens + model_kwargs["seq_len_decoder"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["seq_len_decoder"], + model_kwargs["seq_len_decoder"] + 1, + ) + + 
model_kwargs["seq_len_decoder"] = paddle.where( + model_kwargs["stop_flags"], + model_kwargs["seq_len_decoder"] - model_kwargs["seq_len_decoder"], + model_kwargs["seq_len_decoder"], + ) + + model_kwargs["next_tokens"] = next_tokens + return model_kwargs + + def sample( + self, + input_ids=None, + eos_token_id=None, + cache_kvs=[], + top_p=None, + temperature=None, + inputs_embeds=None, + **model_kwargs, + ): + step_idx_ori = paddle.full(shape=[1], dtype="int64", fill_value=1) + batch_idx = paddle.full(shape=[1], dtype="int32", fill_value=-1) + + # fake temp next_tokens + batch = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] + next_tokens = paddle.full(shape=[batch, 1], dtype="int32", fill_value=0) + + # let inputs_embeds enter into model_kwargs. + # because the code below directly use the model_kwargs as a parameter without using inputs_embeds. + model_kwargs["inputs_embeds"] = inputs_embeds + model_kwargs["all_input_ids"] = input_ids + logits_processors = model_kwargs.pop("logits_processors") + + def _forward_(**args): + # cache_kvs is never empty because it is passed as a parameter in def sample. + model_inputs = self.prepare_inputs_for_generation(input_ids, **args) + return self(**model_inputs) + + def _post_process_(outputs, top_p, temperature, step_idx_ori, model_kwargs): + cache = model_kwargs.get("cache", None) + just_decoder = model_kwargs["seq_len_encoder"] == 0 + if cache is None: # first decoder + step_idx = paddle.where( + just_decoder, + paddle.full_like(model_kwargs["step_idx"], -1), + model_kwargs["step_idx"], + ) # not update when continue decode + else: + step_idx = model_kwargs["step_idx"] + + from paddlenlp_ops import set_value_by_flags_and_idx + + model_kwargs["stop_flags"] = set_value_by_flags_and_idx( + model_kwargs["pre_ids"], + model_kwargs["tgt_ids"], + step_idx, + model_kwargs["stop_flags"], + ) + logits = outputs[0] if isinstance(outputs, tuple) else outputs + logits = paddle.cast(logits, paddle.float32) + logits = logits_processors(model_kwargs["all_input_ids"], logits, decoding_step=step_idx_ori) + + from paddlenlp_ops import get_token_penalty_multi_scores + + logits = get_token_penalty_multi_scores( + model_kwargs["pre_ids"], + logits, + model_kwargs["penalty_score"], + model_kwargs["frequency_score"], + model_kwargs["presence_score"], + step_idx, + model_kwargs["min_dec_len"], + eos_token_id, + ) + logits = logits / temperature + probs = F.softmax(logits) + min_tokens_to_keep = 1 + if top_p is not None and top_p < 1.0: + probs = TopPProcess(probs, top_p, min_tokens_to_keep) + next_tokens = paddle.multinomial(probs) + + model_kwargs = self.update_model_kwargs_for_generation( + cache, just_decoder, next_tokens, eos_token_id, model_kwargs + ) + next_tokens = model_kwargs["next_tokens"] + + if model_kwargs["all_input_ids"] is None: + model_kwargs["all_input_ids"] = next_tokens + else: + model_kwargs["all_input_ids"] = paddle.concat([model_kwargs["all_input_ids"], next_tokens], axis=1) + + from paddlenlp_ops import save_with_output + + save_with_output( + next_tokens, + batch_idx, + step_idx_ori, + "real_time_save.temp_ids", + self.config.tensor_parallel_rank, + ) + + return next_tokens, model_kwargs + + # encoder + outputs = _forward_(**model_kwargs) + # first decoder + next_tokens, model_kwargs = _post_process_( + outputs, + top_p, + temperature, + step_idx_ori, + model_kwargs, + ) + step_idx_ori += 1 + + # gives it a value, means we will entered into decoder phase. 
+ model_kwargs["cache"] = 0 + + while paddle.less_than( + paddle.sum(paddle.cast(model_kwargs["stop_flags"], "int64")), + model_kwargs["stop_nums"], + ): + next_tokens, model_kwargs = _post_process_( + _forward_(**model_kwargs), + top_p, + temperature, + step_idx_ori, + model_kwargs, + ) + step_idx_ori += 1 + return ( + next_tokens, + model_kwargs["step_idx"], + paddle.cast(model_kwargs["stop_flags"], "int32"), + model_kwargs["seq_len_decoder"], + None, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/modeling.py new file mode 100644 index 000000000..4371e9b3f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/gpt/modeling.py @@ -0,0 +1,578 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import paddle +from paddle import nn +from paddle.distributed import fleet +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedMultiTransformerBase, + FusedMultiTransformerConfig, + FusedMultiTransformerWeightOnly, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationInferenceModel, +) +from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained +from paddlenlp.transformers import GPTConfig, GPTPretrainedModel +from paddlenlp.transformers.gpt.modeling import GPTEmbeddings, parallel_matmul +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) + +__all__ = ["GPTInferenceModel", "GPTForCausalLMInferenceModel"] + + +@register_base_model +class GPTInferenceModel(GPTPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GPTDecoderLayer`] + Args: + config: GPTConfig + """ + + def __init__(self, config: GPTConfig): + super().__init__(config) + self.pad_token_id = config.pad_token_id + self.eos_token_id = config.eos_token_id + self.bos_token_id = config.bos_token_id + self.eol_token_id = config.eol_token_id + + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.num_layers = config.num_hidden_layers + + self.max_position_embeddings = config.max_position_embeddings + + self.embeddings = GPTEmbeddings(config) + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + + # get ring_id + ring_id = -1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + ring_id = model_parallel_group.id + except: + pass + + ln_scale_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.norm1.weight".format(i)) for i in range(self.num_layers) + ] + ln_bias_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.norm1.bias".format(i)) for i in range(self.num_layers) + ] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="gpt.decoder.layers.{}.self_attn.qkv_proj.weight".format(i), + initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(self.num_layers) + ] + qkv_bias_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.self_attn.qkv_proj.bias".format(i)) + for i in range(self.num_layers) + ] + linear_weight_attrs = [ + paddle.ParamAttr( + name="gpt.decoder.layers.{}.self_attn.out_proj.weight".format(i), + initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(self.num_layers) + ] + linear_bias_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.self_attn.out_proj.bias".format(i)) + for i in range(self.num_layers) + ] + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.norm2.weight".format(i)) for i in range(self.num_layers) + ] + ffn_ln_bias_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.norm2.bias".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="gpt.decoder.layers.{}.linear1.weight".format(i), + 
initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(self.num_layers) + ] + ffn1_bias_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.linear1.bias".format(i)) for i in range(self.num_layers) + ] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="gpt.decoder.layers.{}.linear2.weight".format(i), + initializer=paddle.nn.initializer.Constant(value=0), + ) + for i in range(self.num_layers) + ] + ffn2_bias_attrs = [ + paddle.ParamAttr(name="gpt.decoder.layers.{}.linear2.bias".format(i)) for i in range(self.num_layers) + ] + + qkv_weight_scale_attrs = None + linear_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.qkv_weight_scale".format(i)) for i in range(config.n_layer) + ] + linear_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.linear_weight_scale".format(i)) for i in range(config.n_layer) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn1_weight_scale".format(i)) for i in range(config.n_layer) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="fusemt.{}.ffn2_weight_scale".format(i)) for i in range(config.n_layer) + ] + + transformer_config = FusedMultiTransformerConfig( + config.hidden_size, + config.num_attention_heads, + 4 * config.hidden_size, + quant_type=config.quant_type, + activation="gelu", + num_layers=self.num_layers, + nranks=config.tensor_parallel_degree, + ring_id=ring_id, + ln_scale_attrs=ln_scale_attrs, + ln_bias_attrs=ln_bias_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + qkv_bias_attrs=qkv_bias_attrs, + linear_weight_attrs=linear_weight_attrs, + linear_weight_scale_attrs=linear_weight_scale_attrs, + linear_bias_attrs=linear_bias_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn_ln_bias_attrs=ffn_ln_bias_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn1_bias_attrs=ffn1_bias_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + ffn2_bias_attrs=ffn2_bias_attrs, + epsilon=1e-5, + norm_type="layernorm", + ) + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnly(transformer_config) + else: + self.transformer_block = FusedMultiTransformerBase(transformer_config) + self.norm = nn.LayerNorm(config.hidden_size, epsilon=1e-5) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset + + ids_remove_padding, cum_offsets, padding_offset = get_padding_offset( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + cache=None, + cache_kvs=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + **kwargs, + ): + cache = kwargs.get("cache", cache) + is_decoder = cache is not None + + output_attentions = output_attentions if output_attentions is not None else 
self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if not is_decoder: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + ids_remove_padding = input_ids + padding_offset = None + cum_offsets = None + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids=ids_remove_padding, position_ids=position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + seq_lens = seq_len_decoder if is_decoder else seq_len_encoder + + hidden_states = inputs_embeds + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids, + hidden_states, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + seq_lens=seq_lens, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, None, all_hidden_states, all_self_attns] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + dtype = paddle.get_default_dtype() + if "gpt.decoder.layers.0.self_attn.q_proj.weight" in state_dict.keys(): + for i in range(self.num_layers): + q_proj_weight = state_dict.pop(f"gpt.decoder.layers.{i}.self_attn.q_proj.weight") + k_proj_weight = state_dict.pop(f"gpt.decoder.layers.{i}.self_attn.k_proj.weight") + v_proj_weight = state_dict.pop(f"gpt.decoder.layers.{i}.self_attn.v_proj.weight") + + q_proj_weight = q_proj_weight.transpose([1, 0]).reshape( + [self.num_attention_heads, self.hidden_size // self.num_attention_heads, self.hidden_size] + ) + k_proj_weight = k_proj_weight.transpose([1, 0]).reshape( + [self.num_attention_heads, self.hidden_size // self.num_attention_heads, self.hidden_size] + ) + v_proj_weight = v_proj_weight.transpose([1, 0]).reshape( + [self.num_attention_heads, self.hidden_size // self.num_attention_heads, self.hidden_size] + ) + + concated_qkv_weight = ( + paddle.concat([q_proj_weight, k_proj_weight, v_proj_weight], axis=1) + .reshape([3 * self.hidden_size, self.hidden_size]) + .transpose([1, 0]) + ) + state_dict[f"gpt.decoder.layers.{i}.self_attn.qkv_proj.weight"] = concated_qkv_weight + + q_proj_bias = state_dict.pop(f"gpt.decoder.layers.{i}.self_attn.q_proj.bias") + k_proj_bias = state_dict.pop(f"gpt.decoder.layers.{i}.self_attn.k_proj.bias") + v_proj_bias = state_dict.pop(f"gpt.decoder.layers.{i}.self_attn.v_proj.bias") + + q_proj_bias = q_proj_bias.reshape( + [self.num_attention_heads, self.hidden_size // self.num_attention_heads] + ) + k_proj_bias = k_proj_bias.reshape( + [self.num_attention_heads, self.hidden_size // 
self.num_attention_heads] + ) + v_proj_bias = v_proj_bias.reshape( + [self.num_attention_heads, self.hidden_size // self.num_attention_heads] + ) + + concated_qkv_bias = paddle.concat([q_proj_bias, k_proj_bias, v_proj_bias], axis=-1).reshape([-1]) + state_dict[f"gpt.decoder.layers.{i}.self_attn.qkv_proj.bias"] = concated_qkv_bias + + for k, v in state_dict.items(): + if k.startswith("gpt."): + k = str(k.split("gpt.")[1]) + if k.find("embeddings.word_embeddings.weight") >= 0: + self.embeddings.word_embeddings.weight.set_value(v.astype(dtype)) + elif k.find("embeddings.position_embeddings.weight") >= 0: + self.embeddings.position_embeddings.weight.set_value(v.astype(dtype)) + elif k.find("decoder.norm.weight") >= 0: + self.norm.weight.set_value(v.astype(dtype)) + elif k.find("decoder.norm.bias") >= 0: + self.norm.bias.set_value(v.astype(dtype)) + else: + if not k.startswith("decoder.layers."): + continue + idx = int(k.split(".")[2]) + if k.endswith("norm1.weight"): + self.transformer_block.ln_scales[idx].set_value(v.astype("float32")) + elif k.endswith("norm1.bias"): + self.transformer_block.ln_biases[idx].set_value(v.astype("float32")) + elif k.endswith("self_attn.qkv_proj.weight"): + qkv_weight_tensor = ( + v.reshape( + [ + self.hidden_size, + self.num_attention_heads // self.config.tensor_parallel_degree, + 3, + self.hidden_size // self.num_attention_heads, + ] + ) + .transpose([2, 1, 3, 0]) + .reshape( + [ + -1, + self.hidden_size, + ] + ) + .astype(dtype) + ) + + if self.use_weight_only: + qkv_weight_tensor = paddle.transpose(qkv_weight_tensor, perm=[1, 0]) + qkv_quanted_weight_tensor, qkv_weight_scale_tensor = weight_quantize( + qkv_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight_tensor) + self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale_tensor) + else: + self.transformer_block.qkv_weights[idx].set_value(qkv_weight_tensor) + + elif k.endswith("self_attn.qkv_proj.bias"): + self.transformer_block.qkv_biases[idx].set_value( + v.reshape( + [ + self.num_attention_heads // self.config.tensor_parallel_degree, + 3, + self.hidden_size // self.num_attention_heads, + ] + ) + .transpose([1, 0, 2]) + .reshape([-1]) + .astype(dtype) + ) + elif k.endswith("self_attn.out_proj.weight"): + linear_weight_tensor = paddle.to_tensor(v.astype(dtype)) + if self.use_weight_only: + linear_quanted_weight_tensor, linear_weight_scale_tensor = weight_quantize( + linear_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight_tensor) + self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale_tensor) + else: + self.transformer_block.linear_weights[idx].set_value(linear_weight_tensor) + + elif k.endswith("self_attn.out_proj.bias"): + self.transformer_block.linear_biases[idx].set_value(v.astype(dtype)) + elif k.endswith("norm2.weight"): + self.transformer_block.ffn_ln_scales[idx].set_value(v.astype("float32")) + elif k.endswith("norm2.bias"): + self.transformer_block.ffn_ln_biases[idx].set_value(v.astype("float32")) + elif k.endswith("linear1.weight"): + ffn1_weight_tensor = paddle.to_tensor(v.astype(dtype)) + if self.use_weight_only: + ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize( + ffn1_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight_tensor) + self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale_tensor) + else: + 
self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight_tensor) + elif k.endswith("linear1.bias"): + self.transformer_block.ffn1_biases[idx].set_value(v.astype(dtype)) + elif k.endswith("linear2.weight"): + ffn2_weight_tensor = paddle.to_tensor(v.astype(dtype)) + if self.use_weight_only: + ffn2_quanted_weight_tensor, ffn2_weight_scale_tensor = weight_quantize( + ffn2_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight_tensor) + self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale_tensor) + else: + self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight_tensor) + elif k.endswith("linear2.bias"): + self.transformer_block.ffn2_biases[idx].set_value(v.astype(dtype)) + else: + raise ValueError("Unknow weight {}".format(k)) + + +class GPTForCausalLMInferenceModel(GenerationInferenceModel, GPTPretrainedModel): + """ + Dynamic Batching for GPT Model with pretraining tasks on top. + """ + + def __init__(self, config): + super().__init__(config) + self.gpt = GPTInferenceModel(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: GPTConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for gpt model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = config.max_position_embeddings + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_attention_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + if cache is not None: + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (tgt_generation_mask - 1) * 1e4 + else: + attention_mask = (attention_mask - 1) * 1e4 + + model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "cache_kvs": cache_kvs, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + "cache": cache, + } + return model_inputs + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( + input_ids == pad_token_id + ).numpy().item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + else: + attention_mask = paddle.ones_like(input_ids, dtype="int64") + return paddle.unsqueeze(attention_mask, axis=[1, 2]) + + def forward( + self, + input_ids, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + 
cache_kvs=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.gpt( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache=cache, + cache_kvs=cache_kvs, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = parallel_matmul( + hidden_states, self.gpt.embeddings.word_embeddings.weight, tensor_parallel_output=False + ) + + if not return_dict: + return (logits, outputs[1:]) + + return CausalLMOutputWithCrossAttentions( + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + self.gpt.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/modeling.py new file mode 100644 index 000000000..b2f33381f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/modeling.py @@ -0,0 +1,1813 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
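# Editorial sketch (not part of the patch): GPTForCausalLMInferenceModel.
# get_cache_kvs_shape above reserves one [2, batch, heads/tp, max_len, head_dim]
# cache entry per decoder layer (the leading 2 holds the fused key and value
# caches). A small standalone helper, with hypothetical argument names, that
# mirrors the same shape computation:
def cache_kv_shapes(num_layers, num_heads, hidden_size, max_batch_size, max_length, tp_degree=1):
    """Per-layer [K/V, batch, local_heads, seq_len, head_dim] cache shapes."""
    head_dim = hidden_size // num_heads
    local_heads = num_heads // max(tp_degree, 1)
    return [[2, max_batch_size, local_heads, max_length, head_dim] for _ in range(num_layers)]

# Example: cache_kv_shapes(24, 16, 1024, max_batch_size=8, max_length=1024)
# returns 24 entries of [2, 8, 16, 1024, 64].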
+from __future__ import annotations + +import json +import os +from functools import partial + +import numpy as np +import paddle +from paddle import nn +from paddle.distributed import fleet +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.model_utils import ( + ActScalesLoader, + CacheScaleLoader, + WeightScalesLoader, +) +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedBlockMultiTransformer, + FusedBlockMultiTransformerA8W8, + FusedBlockMultiTransformerWeightOnly, + FusedMultiTransformerA8W8, + FusedMultiTransformerAvx, + FusedMultiTransformerBase, + FusedMultiTransformerConfig, + FusedMultiTransformerWeightOnly, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationAvxInferenceModel, + GenerationBlockInferenceModel, + GenerationInferenceModel, +) +from paddlenlp.experimental.transformers.utils import ( + EmptyActScale, + EmptyCacheScale, + EmptyWeightScale, + infererence_model_from_pretrained, +) +from paddlenlp.transformers import LlamaConfig, LlamaPretrainedModel +from paddlenlp.transformers.conversion_utils import split_param_func +from paddlenlp.transformers.llama.modeling import LlamaLMHead +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) +from paddlenlp.utils.log import logger + +__all__ = [ + "LlamaInferenceModel", + "LlamaForCausalLMInferenceModel", + "LlamaForCausalLMAvxInferenceModel", + "LlamaForCausalLMBlockInferenceModel", + "LlamaForMiniGPT4InferenceModel", +] + + +class FusedLlamaRMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + def forward(self, hidden_states): + result = paddle.incubate.nn.functional.fused_rms_norm( + hidden_states, self.weight, None, self.variance_epsilon, begin_norm_axis=1 + ) + if isinstance(result, tuple): + return result[0] + return result + + +@register_base_model +class LlamaAvxInferenceModel(LlamaPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.intermediate_size = config.intermediate_size + self.num_layers = config.num_hidden_layers + self.epsilon = config.rms_norm_eps + self.max_position_embeddings = config.max_position_embeddings + self.quant_type = config.quant_type + self.dtype = config.dtype + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + self.compute_type = config.avx_type + ln_scale_attrs = [paddle.ParamAttr(name="fusellama.{}.ln_scale".format(i)) for i in range(self.num_layers)] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + out_proj_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.out_proj_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn_ln_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.ffn1_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.ffn2_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + + transformer_config = FusedMultiTransformerConfig( + self.hidden_size, + self.num_attention_heads, + self.intermediate_size, + activation="silu", + num_layers=config.num_hidden_layers, + ln_scale_attrs=ln_scale_attrs, + qkv_weight_attrs=qkv_weight_attrs, + linear_weight_attrs=out_proj_weight_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + epsilon=self.epsilon, + norm_type="rmsnorm", + ) + + self.set_transformer_block(transformer_config, config.max_position_embeddings, self.compute_type) + self.norm = FusedLlamaRMSNorm(config) + + def set_transformer_block(self, transformer_config, max_position_embeddings, compute_type): + self.transformer_block = FusedMultiTransformerAvx(transformer_config, max_position_embeddings, compute_type) + + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + seq_len = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + seq_len = encoder_output.shape[1] + return paddle.ones([batch_size, seq_len], dtype="int64") * bos_token_id + + def forward( + self, + input_ids=None, + inputs_embeds=None, + past_seq_len=None, + cur_seq_len=None, + step_idx=None, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + # genereate a fake input_ids according to inputs_embeds + if input_ids is None and inputs_embeds is not None: + input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds) 
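        # Editorial note: when only inputs_embeds is provided (for example a
        # multimodal pipeline feeding image features directly), a placeholder
        # input_ids filled with bos_token_id is synthesized above so that the
        # fused transformer block, which also consumes input_ids, still
        # receives a tensor with the expected batch/sequence shape.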
+ if inputs_embeds is not None: + batch, seq_len, hidden_dim = inputs_embeds.shape + # merge batch and seq_len dimension. + inputs_embeds = inputs_embeds.reshape([batch * seq_len, hidden_dim]) + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + with dy2st_nocheck_guard_context(): + hidden_states = self.transformer_block( + input_ids, + hidden_states, + past_seq_len=past_seq_len, + cur_seq_len=cur_seq_len, + step_idx=step_idx, + ) + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, None, all_hidden_states, None] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=None, + hidden_states=all_hidden_states, + attentions=None, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + unfused_state_dict = {} + head_size = self.hidden_size // self.num_attention_heads + split_fn = split_param_func() + + self.embed_tokens.weight.set_value( + paddle.to_tensor(state_dict["llama.embed_tokens.weight"]).cast(self.embed_tokens.weight.dtype) + ) + self.norm.weight.set_value(paddle.to_tensor(state_dict["llama.norm.weight"]).cast(self.norm.weight.dtype)) + + for idx in range(self.config.num_hidden_layers): + logger.info(f"set state for layer {idx}") + + if "llama.layers.{}.self_attn.qkv_proj.weight".format(idx) in state_dict.keys(): + concated_qkv_weight = np.concatenate( + split_fn( + state_dict["llama.layers.{}.self_attn.qkv_proj.weight".format(idx)], + is_qkv=True, + num_heads=self.num_attention_heads // self.config.tensor_parallel_degree, + num_key_value_heads=self.num_attention_heads // self.config.tensor_parallel_degree, + ), + axis=-1, + ) + else: + unfused_state_dict = {} + unfused_state_dict["self_attn.q_proj.weight"] = state_dict[ + "llama.layers.{}.self_attn.q_proj.weight".format(idx) + ] + unfused_state_dict["self_attn.k_proj.weight"] = state_dict[ + "llama.layers.{}.self_attn.k_proj.weight".format(idx) + ] + unfused_state_dict["self_attn.v_proj.weight"] = state_dict[ + "llama.layers.{}.self_attn.v_proj.weight".format(idx) + ] + concated_qkv_weight = np.concatenate( + [ + unfused_state_dict["self_attn.q_proj.weight"], + unfused_state_dict["self_attn.k_proj.weight"], + unfused_state_dict["self_attn.v_proj.weight"], + ], + axis=-1, + ).reshape( + self.hidden_size, + 3 * (self.num_attention_heads // self.config.tensor_parallel_degree) * (head_size), + ) # reshape(3, self.num_attention_heself.hidden_sizeads // self.config.tensor_parallel_degree, head_size, ) + if "llama.layers.{}.mlp.gate_up_fused_proj.weight".format(idx) in state_dict.keys(): + concated_ffn1_weight = np.concatenate( + split_fn(state_dict["llama.layers.{}.mlp.gate_up_fused_proj.weight".format(idx)]), axis=-1 + ) + else: + unfused_state_dict["mlp.gate_proj.weight"] = state_dict[ + "llama.layers.{}.mlp.gate_proj.weight".format(idx) + ] + unfused_state_dict["mlp.up_proj.weight"] = state_dict["llama.layers.{}.mlp.up_proj.weight".format(idx)] + concated_ffn1_weight = np.concatenate( + [unfused_state_dict["mlp.gate_proj.weight"], 
unfused_state_dict["mlp.up_proj.weight"]], axis=-1 + ) + gate_up_list = split_fn(concated_ffn1_weight) + gate_weight_tensor = paddle.to_tensor(gate_up_list[0]) + up_weight_tensor = paddle.to_tensor(gate_up_list[1]) + + qkv_weight_tensor = paddle.to_tensor(concated_qkv_weight) + self.transformer_block.qkv_weights[idx].set_value( + qkv_weight_tensor.cast(self.transformer_block.qkv_weights[idx].dtype) + ) + + linear_weight_tensor = paddle.to_tensor(state_dict["llama.layers.{}.self_attn.o_proj.weight".format(idx)]) + self.transformer_block.linear_weights[idx].set_value( + linear_weight_tensor.cast(self.transformer_block.linear_weights[idx].dtype) + ) + self.transformer_block.gate_weights[idx].set_value( + gate_weight_tensor.cast(self.transformer_block.gate_weights[idx].dtype) + ) + self.transformer_block.up_weights[idx].set_value( + up_weight_tensor.cast(self.transformer_block.up_weights[idx].dtype) + ) + + ffn2_weight_tensor = paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.weight".format(idx)]) + self.transformer_block.ffn2_weights[idx].set_value( + ffn2_weight_tensor.cast(self.transformer_block.ffn2_weights[idx].dtype) + ) + self.transformer_block.ln_scales[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.input_layernorm.weight".format(idx)]).cast( + self.transformer_block.ln_scales[idx].dtype + ) + ) + + self.transformer_block.ffn_ln_scales[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.post_attention_layernorm.weight".format(idx)]).cast( + self.transformer_block.ffn_ln_scales[idx].dtype + ) + ) + + +@register_base_model +class LlamaInferenceModel(LlamaPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`] + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.intermediate_size = config.intermediate_size + self.num_layers = config.num_hidden_layers + self.epsilon = config.rms_norm_eps + self.max_position_embeddings = config.max_position_embeddings + self.quant_type = config.get("quant_type", "") + + self.rope_theta = config.rope_theta + self.use_neox = True + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + elif "a8w8" in config.quant_type: + self.quant_model_path = config.model_name_or_path + self.shift = config.quantization_config.shift + self.smooth = config.quantization_config.smooth + self.shift_smooth_all_linears = config.quantization_config.shift_smooth_all_linears + + self.use_fake_parameter = config.get("use_fake_parameter", False) + + if self.use_weight_only: + assert ( + self.quant_type == "weight_only_int8" or self.quant_type == "weight_only_int4" + ), "Expected quant_type equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format( + self.quant_type + ) + + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + 
self.vocab_size, + self.hidden_size, + ) + + # get ring_id + ring_id = -1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + ring_id = model_parallel_group.id + except: + pass + + ln_scale_attrs = [paddle.ParamAttr(name="fusellama.{}.ln_scale".format(i)) for i in range(self.num_layers)] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + out_proj_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.out_proj_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn_ln_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.ffn1_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="fusellama.{}.ffn2_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + + qkv_out_scale_attrs = None + linear_out_scale_attrs = None + ffn1_out_scale_attrs = None + ffn2_out_scale_attrs = None + linear_shift_attrs = None + linear_smooth_attrs = None + ffn2_shift_attrs = None + ffn2_smooth_attrs = None + ln_bias_attrs = None + qkv_bias_attrs = None + out_proj_bias_attrs = None + ffn_ln_bias_attrs = None + ffn1_bias_attrs = None + ffn2_bias_attrs = None + + if "a8w8" in self.quant_type: + qkv_out_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.qkv_out_scale".format(i)) for i in range(self.num_layers) + ] + linear_out_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.linear_out_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_out_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn1_out_scale".format(i)) for i in range(self.num_layers) + ] + ffn2_out_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn2_out_scale".format(i)) for i in range(self.num_layers) + ] + + if self.shift_smooth_all_linears: + linear_shift_attrs = [ + paddle.ParamAttr(name="fusellama.{}.linear_shift".format(i)) for i in range(self.num_layers) + ] + linear_smooth_attrs = [ + paddle.ParamAttr(name="fusellama.{}.linear_smooth".format(i)) for i in range(self.num_layers) + ] + ffn2_shift_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn2_shift".format(i)) for i in range(self.num_layers) + ] + ffn2_smooth_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn2_smooth".format(i)) for i in range(self.num_layers) + ] + + if self.shift: + ln_bias_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ln_bias".format(i)) for i in range(self.num_layers) + ] + ffn_ln_bias_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn_ln_bias".format(i)) for i in range(self.num_layers) + ] + qkv_bias_attrs = [ + paddle.ParamAttr(name="fusellama.{}.qkv_bias".format(i)) for i in range(self.num_layers) + ] + ffn1_bias_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn1_bias".format(i)) for i in range(self.num_layers) + ] + if self.shift_smooth_all_linears: + out_proj_bias_attrs = [ + paddle.ParamAttr(name="fusellama.{}.out_proj_bias".format(i)) for i in range(self.num_layers) + ] + ffn2_bias_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn2_bias".format(i)) for i in range(self.num_layers) + ] + + qkv_weight_scale_attrs = None + out_proj_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + 
ffn2_weight_scale_attrs = None + + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.qkv_weight_scale".format(i)) for i in range(self.num_layers) + ] + out_proj_weight_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.out_proj_weight_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn1_weight_scale".format(i)) for i in range(self.num_layers) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.ffn2_weight_scale".format(i)) for i in range(self.num_layers) + ] + + cache_k_scale_attrs = None + cache_v_scale_attrs = None + cache_k_out_scale_attrs = None + cache_v_out_scale_attrs = None + + if config.cachekv_int8_type == "static": + cache_k_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.cache_k_scale".format(i)) for i in range(self.num_layers) + ] + cache_v_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.cache_v_scale".format(i)) for i in range(self.num_layers) + ] + cache_k_out_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.cache_k_out_scale".format(i)) for i in range(self.num_layers) + ] + cache_v_out_scale_attrs = [ + paddle.ParamAttr(name="fusellama.{}.cache_v_out_scale".format(i)) for i in range(self.num_layers) + ] + + transformer_config = FusedMultiTransformerConfig( + embed_dim=self.hidden_size, + num_heads=self.num_attention_heads, + kv_num_heads=self.num_key_value_heads, + dim_feedforward=self.intermediate_size, + quant_type=self.quant_type, + activation="swiglu", + num_layers=config.num_hidden_layers, + nranks=config.tensor_parallel_degree, + ring_id=ring_id, + ln_scale_attrs=ln_scale_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + linear_weight_attrs=out_proj_weight_attrs, + linear_weight_scale_attrs=out_proj_weight_scale_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + qkv_out_scale_attrs=qkv_out_scale_attrs, + linear_out_scale_attrs=linear_out_scale_attrs, + ffn1_out_scale_attrs=ffn1_out_scale_attrs, + ffn2_out_scale_attrs=ffn2_out_scale_attrs, + linear_shift_attrs=linear_shift_attrs, + linear_smooth_attrs=linear_smooth_attrs, + ffn2_shift_attrs=ffn2_shift_attrs, + ffn2_smooth_attrs=ffn2_smooth_attrs, + ln_bias_attrs=ln_bias_attrs, + qkv_bias_attrs=qkv_bias_attrs, + linear_bias_attrs=out_proj_bias_attrs, + ffn_ln_bias_attrs=ffn_ln_bias_attrs, + ffn1_bias_attrs=ffn1_bias_attrs, + ffn2_bias_attrs=ffn2_bias_attrs, + cache_k_scale_attrs=cache_k_scale_attrs, + cache_v_scale_attrs=cache_v_scale_attrs, + cache_k_out_scale_attrs=cache_k_out_scale_attrs, + cache_v_out_scale_attrs=cache_v_out_scale_attrs, + epsilon=self.epsilon, + norm_type="rmsnorm", + use_neox_rotary_style=self.use_neox, + cachekv_int8_type=config.cachekv_int8_type, + rank_id=config.tensor_parallel_rank, + trans_qkvw=(False if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8" else True), + ) + + self.set_transformer_block(transformer_config) + self.norm = FusedLlamaRMSNorm(config) + + self.cache_kvs = None + self.head_dim_shape_tensor = paddle.ones((self.hidden_size // self.num_attention_heads), dtype="int8") + + self.gradient_checkpointing = False + + def set_transformer_block(self, transformer_config): + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnly(transformer_config) + elif "a8w8" in 
self.quant_type: + self.transformer_block = FusedMultiTransformerA8W8(transformer_config) + else: + self.transformer_block = FusedMultiTransformerBase(transformer_config) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset + + ids_remove_padding, cum_offsets, padding_offset = get_padding_offset( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets + + # This function is a little different from prepare_input_ids_for_generation in paddlenlp/transformers/generation/utils.py + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + seq_len = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + seq_len = encoder_output.shape[1] + return paddle.ones([batch_size, seq_len], dtype="int64") * bos_token_id + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + # kwargs["cache"] is used used to distinguish between encoder and decoder phase. + past_key_values = kwargs.get("cache", None) + is_decoder = past_key_values is not None + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + # genereate a fake input_ids according to inputs_embeds + # this is usually occurred in img2txt multimodal model when first enter into this forward function. + if input_ids is None and inputs_embeds is not None: + input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds) + if inputs_embeds is not None: + batch, seq_len, hidden_dim = inputs_embeds.shape + # merge batch and seq_len dimension. 
+ inputs_embeds = inputs_embeds.reshape([batch * seq_len, hidden_dim]) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + cache_kvs = cache_kvs if cache_kvs is not None else self.cache_kvs + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if past_key_values is None: + past_key_values = tuple([None] * self.config.num_hidden_layers) + + if not is_decoder: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + ids_remove_padding = input_ids.squeeze(axis=1) + padding_offset = None + cum_offsets = None + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(ids_remove_padding) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + seq_lens = seq_len_decoder if is_decoder else seq_len_encoder + + position_offset = 0 + if not is_decoder and pre_caches is not None: + position_offset = 128 + from paddlenlp_ops import fused_get_rotary_embedding + + new_rope = fused_get_rotary_embedding( + input_ids, position_ids, self.head_dim_shape_tensor, position_offset, self.rope_theta, self.use_neox + ) + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids, + hidden_states, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + pre_caches=pre_caches, + pre_caches_length=position_offset, + seq_lens=seq_lens, + rotary_embs=new_rope, + rotary_emb_dims=1, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, None, all_hidden_states, all_self_attns] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + unfused_state_dict = {} + head_size = self.hidden_size // self.num_attention_heads + split_fn = split_param_func() + + self.embed_tokens.weight.set_value( + paddle.to_tensor(state_dict["llama.embed_tokens.weight"]).cast(self.embed_tokens.weight.dtype) + ) + self.norm.weight.set_value(paddle.to_tensor(state_dict["llama.norm.weight"]).cast(self.norm.weight.dtype)) + if self.use_weight_only: + logger.info("weight only is enabled") + for idx in range(self.config.num_hidden_layers): + logger.info(f"set state for layer {idx}") + + if "llama.layers.{}.self_attn.qkv_proj.weight".format(idx) in state_dict.keys(): + concated_qkv_weight = np.concatenate( + split_fn( + state_dict["llama.layers.{}.self_attn.qkv_proj.weight".format(idx)], + is_qkv=True, + num_heads=self.num_attention_heads // self.config.tensor_parallel_degree, + num_key_value_heads=self.num_key_value_heads // self.config.tensor_parallel_degree, + ), + axis=-1, + ).transpose(1, 0) + else: + unfused_state_dict = {} + unfused_state_dict["self_attn.q_proj.weight"] = state_dict[ + 
"llama.layers.{}.self_attn.q_proj.weight".format(idx) + ] + unfused_state_dict["self_attn.k_proj.weight"] = state_dict[ + "llama.layers.{}.self_attn.k_proj.weight".format(idx) + ] + unfused_state_dict["self_attn.v_proj.weight"] = state_dict[ + "llama.layers.{}.self_attn.v_proj.weight".format(idx) + ] + if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8": + concated_qkv_weight = np.concatenate( + [ + unfused_state_dict["self_attn.q_proj.weight"], + unfused_state_dict["self_attn.k_proj.weight"], + unfused_state_dict["self_attn.v_proj.weight"], + ], + axis=-1, + ).reshape( + self.hidden_size, + ( + self.num_attention_heads // self.config.tensor_parallel_degree + + 2 * self.num_key_value_heads // self.config.tensor_parallel_degree + ) + * (head_size), + ) + else: + concated_qkv_weight = ( + np.concatenate( + [ + unfused_state_dict["self_attn.q_proj.weight"], + unfused_state_dict["self_attn.k_proj.weight"], + unfused_state_dict["self_attn.v_proj.weight"], + ], + axis=-1, + ) + .transpose(1, 0) + .reshape( + ( + self.num_attention_heads // self.config.tensor_parallel_degree + + 2 * self.num_key_value_heads // self.config.tensor_parallel_degree + ) + * (head_size), + self.hidden_size, + ) + ) + if "llama.layers.{}.mlp.gate_up_fused_proj.weight".format(idx) in state_dict.keys(): + concated_ffn1_weight = np.concatenate( + split_fn(state_dict["llama.layers.{}.mlp.gate_up_fused_proj.weight".format(idx)]), axis=-1 + ) + else: + unfused_state_dict["mlp.gate_proj.weight"] = state_dict[ + "llama.layers.{}.mlp.gate_proj.weight".format(idx) + ] + unfused_state_dict["mlp.up_proj.weight"] = state_dict["llama.layers.{}.mlp.up_proj.weight".format(idx)] + concated_ffn1_weight = np.concatenate( + [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1 + ) + + qkv_weight_tensor = paddle.to_tensor(concated_qkv_weight).cast(paddle.get_default_dtype()) + if self.use_weight_only: + qkv_weight_tensor = paddle.transpose(qkv_weight_tensor, perm=[1, 0]) + qkv_quanted_weight_tensor, qkv_weight_scale_tensor = weight_quantize( + qkv_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight_tensor) + self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale_tensor) + elif "a8w8" in self.quant_type: + self.transformer_block.qkv_weights[idx].set_value( + paddle.cast(paddle.to_tensor(concated_qkv_weight), "int8") + ) + else: + self.transformer_block.qkv_weights[idx].set_value(qkv_weight_tensor) + + linear_weight_tensor = paddle.to_tensor( + state_dict["llama.layers.{}.self_attn.o_proj.weight".format(idx)] + ).cast(paddle.get_default_dtype()) + if self.use_weight_only: + linear_quanted_weight_tensor, linear_weight_scale_tensor = weight_quantize( + linear_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight_tensor) + self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale_tensor) + elif "a8w8" in self.quant_type: + if paddle.is_compiled_with_rocm(): + self.transformer_block.linear_weights[idx].set_value( + paddle.cast( + paddle.to_tensor(state_dict["llama.layers.{}.self_attn.o_proj.weight".format(idx)]), "int8" + ) + ) + else: + self.transformer_block.linear_weights[idx].set_value( + paddle.cast( + paddle.to_tensor( + state_dict["llama.layers.{}.self_attn.o_proj.weight".format(idx)] + ).transpose((1, 0)), + "int8", + ) + ) + else: + self.transformer_block.linear_weights[idx].set_value(linear_weight_tensor) + + 
ffn1_weight_tensor = paddle.to_tensor(concated_ffn1_weight).cast(paddle.get_default_dtype()) + if self.use_weight_only: + ffn1_quanted_weight_tensor, ffn1_weight_scale_tensor = weight_quantize( + ffn1_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight_tensor) + self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale_tensor) + elif "a8w8" in self.quant_type: + if paddle.is_compiled_with_rocm(): + self.transformer_block.ffn1_weights[idx].set_value( + paddle.cast(paddle.to_tensor(concated_ffn1_weight), "int8") + ) + else: + self.transformer_block.ffn1_weights[idx].set_value( + paddle.cast(paddle.to_tensor(concated_ffn1_weight).transpose((1, 0)), "int8") + ) + else: + self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight_tensor) + + ffn2_weight_tensor = paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.weight".format(idx)]).cast( + paddle.get_default_dtype() + ) + if self.use_weight_only: + ffn2_quanted_weight_tensor, ffn2_weight_scale_tensor = weight_quantize( + ffn2_weight_tensor, algo=self.quant_algo + ) + self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight_tensor) + self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale_tensor) + elif "a8w8" in self.quant_type: + if paddle.is_compiled_with_rocm(): + self.transformer_block.ffn2_weights[idx].set_value( + paddle.cast( + paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.weight".format(idx)]), "int8" + ) + ) + else: + self.transformer_block.ffn2_weights[idx].set_value( + paddle.cast( + paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.weight".format(idx)]).transpose( + (1, 0) + ), + "int8", + ) + ) + else: + self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight_tensor) + + if "a8w8" in self.quant_type: + if self.shift_smooth_all_linears: + if self.use_fake_parameter: + if "llama.layers.{}.self_attn.o_proj.shift_bias".format(idx) not in state_dict: + state_dict["llama.layers.{}.self_attn.o_proj.shift_bias".format(idx)] = paddle.zeros( + shape=[ + (self.num_attention_heads // self.config.tensor_parallel_degree) + * (self.hidden_size // self.num_attention_heads) + ], + dtype=paddle.get_default_dtype(), + ) + state_dict["llama.layers.{}.self_attn.o_proj.smooth_weight".format(idx)] = paddle.ones( + shape=[ + (self.num_attention_heads // self.config.tensor_parallel_degree) + * (self.hidden_size // self.num_attention_heads) + ], + dtype=paddle.get_default_dtype(), + ) + state_dict["llama.layers.{}.mlp.down_proj.shift_bias".format(idx)] = paddle.zeros( + shape=[self.intermediate_size // self.config.tensor_parallel_degree], + dtype=paddle.get_default_dtype(), + ) + state_dict["llama.layers.{}.mlp.down_proj.smooth_weight".format(idx)] = paddle.ones( + shape=[self.intermediate_size // self.config.tensor_parallel_degree], + dtype=paddle.get_default_dtype(), + ) + self.transformer_block.linear_shifts[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.self_attn.o_proj.shift_bias".format(idx)]) + ) + self.transformer_block.linear_smooths[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.self_attn.o_proj.smooth_weight".format(idx)]) + ) + self.transformer_block.ffn2_shifts[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.shift_bias".format(idx)]) + ) + self.transformer_block.ffn2_smooths[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.smooth_weight".format(idx)]) + ) + + if self.shift: + if self.use_fake_parameter: + 
if "llama.layers.{}.input_layernorm.bias".format(idx) not in state_dict: + state_dict["llama.layers.{}.input_layernorm.bias".format(idx)] = paddle.zeros( + shape=[self.hidden_size], dtype=paddle.get_default_dtype() + ) + state_dict["llama.layers.{}.post_attention_layernorm.bias".format(idx)] = paddle.zeros( + [self.hidden_size], dtype=paddle.get_default_dtype() + ) + unfused_state_dict["self_attn.q_proj.bias"] = paddle.zeros( + shape=[self.num_attention_heads * (self.hidden_size // self.num_attention_heads)], + dtype=paddle.get_default_dtype(), + ) + unfused_state_dict["self_attn.k_proj.bias"] = paddle.zeros( + shape=[self.num_key_value_heads * (self.hidden_size // self.num_attention_heads)], + dtype=paddle.get_default_dtype(), + ) + unfused_state_dict["self_attn.v_proj.bias"] = paddle.zeros( + shape=[self.num_key_value_heads * (self.hidden_size // self.num_attention_heads)], + dtype=paddle.get_default_dtype(), + ) + unfused_state_dict["mlp.gate_proj.bias"] = paddle.zeros( + shape=[self.intermediate_size], dtype=paddle.get_default_dtype() + ) + unfused_state_dict["mlp.up_proj.bias"] = paddle.zeros( + shape=[self.intermediate_size], dtype=paddle.get_default_dtype() + ) + + self.transformer_block.ln_biases[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.input_layernorm.bias".format(idx)]) + ) + self.transformer_block.ffn_ln_biases[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.post_attention_layernorm.bias".format(idx)]) + ) + + unfused_state_dict["self_attn.q_proj.bias"] = state_dict[ + "llama.layers.{}.self_attn.q_proj.bias".format(idx) + ] + unfused_state_dict["self_attn.k_proj.bias"] = state_dict[ + "llama.layers.{}.self_attn.k_proj.bias".format(idx) + ] + unfused_state_dict["self_attn.v_proj.bias"] = state_dict[ + "llama.layers.{}.self_attn.v_proj.bias".format(idx) + ] + + concated_qkv_biases = np.concatenate( + [ + unfused_state_dict["self_attn.q_proj.bias"], + unfused_state_dict["self_attn.k_proj.bias"], + unfused_state_dict["self_attn.v_proj.bias"], + ], + axis=-1, + ) + + self.transformer_block.qkv_biases[idx].set_value(paddle.to_tensor(concated_qkv_biases)) + + unfused_state_dict["mlp.gate_proj.bias"] = state_dict[ + "llama.layers.{}.mlp.gate_proj.bias".format(idx) + ] + unfused_state_dict["mlp.up_proj.bias"] = state_dict["llama.layers.{}.mlp.up_proj.bias".format(idx)] + + concated_ffn1_bias = np.concatenate( + [unfused_state_dict["mlp.gate_proj.bias"], unfused_state_dict["mlp.up_proj.bias"]], axis=-1 + ) + + self.transformer_block.ffn1_biases[idx].set_value(paddle.to_tensor(concated_ffn1_bias)) + + if self.shift_smooth_all_linears: + if self.use_fake_parameter: + if "llama.layers.{}.self_attn.o_proj.bias".format(idx) not in state_dict: + state_dict["llama.layers.{}.self_attn.o_proj.bias".format(idx)] = paddle.zeros( + [self.hidden_size], dtype=paddle.get_default_dtype() + ) + state_dict["llama.layers.{}.mlp.down_proj.layer.bias".format(idx)] = paddle.zeros( + [self.hidden_size], dtype=paddle.get_default_dtype() + ) + self.transformer_block.linear_biases[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.self_attn.o_proj.bias".format(idx)]) + ) + self.transformer_block.ffn2_biases[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.mlp.down_proj.layer.bias".format(idx)]) + ) + + self.transformer_block.ln_scales[idx].set_value( + paddle.to_tensor(state_dict["llama.layers.{}.input_layernorm.weight".format(idx)]).cast( + self.transformer_block.ln_scales[idx].dtype + ) + ) + + self.transformer_block.ffn_ln_scales[idx].set_value( + 
paddle.to_tensor(state_dict["llama.layers.{}.post_attention_layernorm.weight".format(idx)]).cast( + self.transformer_block.ffn_ln_scales[idx].dtype + ) + ) + + if "a8w8" in self.quant_type: + current_work_dir = os.path.dirname(__file__) + scale_map_file = ( + f"{current_work_dir}/ptq_scales_map.json" + if not self.shift_smooth_all_linears + else f"{current_work_dir}/ptq_scales_map_shift_smooth.json" + ) + + with open(scale_map_file) as json_file: + scale_map_dict = json.load(json_file) + act_scale_map_dict = scale_map_dict["act_scale"] + weight_scale_map_dict = scale_map_dict["weight_scale"] + cache_scale_map_dict = scale_map_dict["cachekv_scale"] + + if not self.use_fake_parameter: + act_scale_json_path = os.path.join(self.quant_model_path, "act_scales.json") + weight_scale_json_path = os.path.join(self.quant_model_path, "weight_scales.json") + if self.config.tensor_parallel_degree > 1 and not self.config.single_card_ptq: + act_scale_json_path = os.path.join( + self.quant_model_path, f"act_scales_{self.config.tensor_parallel_rank}.json" + ) + weight_scale_json_path = os.path.join( + self.quant_model_path, f"weight_scales_{self.config.tensor_parallel_rank}.json" + ) + act_scale_loader = ActScalesLoader( + act_scale_json_path, act_scale_map_dict, num_of_layers=self.config.num_hidden_layers + ) + weight_scales_loader = WeightScalesLoader( + weight_scale_json_path, + weight_scale_map_dict, + num_of_layers=self.config.num_hidden_layers, + concat_qkv=True, + concat_ffn1=True, + ) + else: + act_scale_loader = EmptyActScale(act_scale_map_dict, num_of_layers=self.config.num_hidden_layers) + weight_scales_loader = EmptyWeightScale( + weight_scale_map_dict, + num_of_layers=self.config.num_hidden_layers, + num_head=self.num_attention_heads, + dim_head=self.hidden_size // self.num_attention_heads, + ffn_hidden_size=self.intermediate_size, + num_key_value_heads=self.num_key_value_heads, + mp_size=self.config.tensor_parallel_degree, + ) + self.transformer_block.act_scales = act_scale_loader.scale + + if self.config.cachekv_int8_type == "static": + if not self.use_fake_parameter: + cache_scale_json_path = os.path.join(self.quant_model_path, "cachekv_scales.json") + if self.config.tensor_parallel_degree > 1 and not self.config.single_card_ptq: + cache_scale_json_path = os.path.join( + self.quant_model_path, f"cachekv_scales_{self.config.tensor_parallel_rank}.json" + ) + cache_scales_loader = CacheScaleLoader( + cache_scale_json_path, + cache_scale_map_dict, + num_of_layers=self.config.num_hidden_layers, + num_heads=self.num_attention_heads // self.config.tensor_parallel_degree, + num_key_value_heads=self.num_key_value_heads // self.config.tensor_parallel_degree, + ) + else: + cache_scales_loader = EmptyCacheScale( + cache_scale_map_dict, + num_of_layers=self.config.num_hidden_layers, + num_heads=self.num_attention_heads, + dim_heads=self.hidden_size // self.num_attention_heads, + is_channel_wise=False, + num_key_value_heads=self.num_key_value_heads, + mp_size=self.config.tensor_parallel_degree, + ) + + for k, v in cache_scales_loader.scale.items(): + for i_layer, weight_scale in enumerate(v): + weight_scale = weight_scale.astype("float32") + if k == "cache_k_scale": + self.transformer_block.cache_k_scales[i_layer].set_value(weight_scale) + elif k == "cache_v_scale": + self.transformer_block.cache_v_scales[i_layer].set_value(weight_scale) + elif k == "cache_k_out_scale": + self.transformer_block.cache_k_out_scales[i_layer].set_value(weight_scale) + else: + 
self.transformer_block.cache_v_out_scales[i_layer].set_value(weight_scale) + + for k, v in weight_scales_loader.scale.items(): + if "qkv_" in k: + for i_layer, weight_scale in enumerate(v): + tmp = paddle.to_tensor( + weight_scale + / ( + 127.0 * 127.0 * act_scale_loader.scale["qkv_in_scale"][i_layer] + ) # [3 * num_head * dim_head] + ).reshape([-1]) + + if self.config.tensor_parallel_degree > 1 and self.config.single_card_ptq: + tmp = ( + tmp.reshape([3, self.num_attention_heads, head_size]) + .split(self.config.tensor_parallel_degree, axis=1)[ + self.config.tensor_parallel_rank + ] + .reshape([-1]) + ) + self.transformer_block.qkv_out_scales[i_layer].set_value(tmp) + pass + elif "out_linear_" in k: + for i_layer, weight_scale in enumerate(v): + tmp = paddle.to_tensor( + weight_scale / (127.0 * 127.0 * act_scale_loader.scale["out_linear_in_scale"][i_layer]) + ) + self.transformer_block.linear_out_scales[i_layer].set_value(tmp) + elif "ffn1_weight_scale" in k: + for i_layer, weight_scale in enumerate(v): + tmp = paddle.to_tensor( + weight_scale / (127.0 * 127.0 * act_scale_loader.scale["ffn1_in_scale"][i_layer]) + ) + if self.config.tensor_parallel_degree > 1 and self.config.single_card_ptq: + tmp = paddle.split(tmp, self.config.tensor_parallel_degree * 2) + tmp = paddle.concat( + [ + tmp[self.config.tensor_parallel_rank], + tmp[self.config.tensor_parallel_rank + self.config.tensor_parallel_degree], + ], + axis=0, + ) + self.transformer_block.ffn1_out_scales[i_layer].set_value(tmp) + elif "ffn2" in k: + for i_layer, weight_scale in enumerate(v): + self.transformer_block.ffn2_out_scales[i_layer].set_value( + paddle.to_tensor( + weight_scale / (127.0 * 127.0 * act_scale_loader.scale["ffn2_in_scale"][i_layer]) + ) + ) + + +@register_base_model +class LlamaBlockInferenceModel(LlamaInferenceModel): + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.max_seq_len = config.max_seq_len + self.block_size = config.block_size + + def set_transformer_block(self, transformer_config): + if self.use_weight_only: + self.transformer_block = FusedBlockMultiTransformerWeightOnly(transformer_config) + elif "a8w8" in self.quant_type: + self.transformer_block = FusedBlockMultiTransformerA8W8(transformer_config) + else: + self.transformer_block = FusedBlockMultiTransformer(transformer_config) + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(self.max_seq_len - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset_v2 + + ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + caches=None, + pre_caches=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + + seq_lens_this_time = kwargs.get("seq_lens_this_time", None) + rope_emb = kwargs.get("rope_emb", None) + ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k = self.remove_padding( + input_ids, seq_lens_this_time + ) + kwargs["cu_seqlens_q"] = cu_seqlens_q + kwargs["cu_seqlens_k"] = cu_seqlens_k + kwargs["padding_offsets"] = padding_offset + kwargs["max_input_length"] = self.max_seq_len + + inputs_embeds = self.embed_tokens(ids_remove_padding) + + with dy2st_nocheck_guard_context(): + 
hidden_states, _ = self.transformer_block( + input_ids=input_ids, + src=inputs_embeds, + cum_offsets=cum_offsets, + attn_mask=attention_mask, + caches=caches, + pre_caches=pre_caches, + rotary_embs=rope_emb, + **kwargs, + ) + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=None, + hidden_states=None, + attentions=None, + ) + + +class LlamaForCausalLMAvxInferenceModel(GenerationAvxInferenceModel, LlamaPretrainedModel): + + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.llama = LlamaAvxInferenceModel(config) + self.lm_head = LlamaLMHead(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: LlamaConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + return [] + + def prepare_inputs_for_generation( + self, + input_ids, + **kwargs, + ): + seq_len_encoder = kwargs.get("seq_len_encoder", None) + seq_len_decoder = kwargs.get("seq_len_decoder", None) + tgt_ids = kwargs.get("tgt_ids", None) + cache = kwargs.get("cache", None) + inputs_embeds = kwargs.get("inputs_embeds", None) + step_idx = kwargs.get("step_idx", None) + if cache is None: + # encoder + past_seq_len = paddle.zeros_like(seq_len_decoder - seq_len_encoder, dtype="int64") + else: + # decoer + past_seq_len = paddle.cast(seq_len_decoder, "int64") + input_ids = tgt_ids + inputs_embeds = None + + model_inputs = { + "input_ids": input_ids, + "inputs_embeds": inputs_embeds, + "past_seq_len": past_seq_len, + "step_idx": step_idx, + } + return model_inputs + + def forward( + self, + input_ids, + inputs_embeds=None, + past_seq_len=None, + step_idx=None, + output_hidden_states=None, + return_dict=None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.llama( + input_ids, + inputs_embeds=inputs_embeds, + past_seq_len=past_seq_len, + cur_seq_len=paddle.to_tensor(input_ids.shape[1], dtype="int64"), + step_idx=step_idx, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + logits = self.lm_head( + hidden_states, + tensor_parallel_output=False, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return CausalLMOutputWithCrossAttentions( + loss=None, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + if "lm_head.weight" in state_dict: + self.lm_head.weight.set_value(state_dict["lm_head.weight"]) + self.llama.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) + + +class LlamaForCausalLMInferenceModel(GenerationInferenceModel, LlamaPretrainedModel): + """ + Dynamic Batching for LLaMA Model with pretraining tasks on top. 
+ """ + + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.llama = LlamaInferenceModel(config) + self.lm_head = LlamaLMHead(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: LlamaConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for llama model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = config.max_position_embeddings + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_key_value_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + pre_caches = kwargs.get("pre_caches", None) + inputs_embeds = kwargs.get("inputs_embeds", None) + if cache is not None: + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (tgt_generation_mask - 1) * 1e4 + # make inputs_embeds be none in decoder phase. + # in forward function, it will be assigned according to input_ids. 
+ inputs_embeds = None + else: + attention_mask = (attention_mask - 1) * 1e4 + model_inputs = { + "input_ids": input_ids, + "inputs_embeds": inputs_embeds, + "position_ids": position_ids, + "attention_mask": attention_mask, + "cache_kvs": cache_kvs, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + "cache": cache, + "pre_caches": pre_caches, + } + return model_inputs + + def forward( + self, + input_ids, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.llama( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + cache_kvs=cache_kvs, + pre_caches=pre_caches, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head( + hidden_states, + tensor_parallel_output=False, + ) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + # Flatten the tokens + loss = self.criterion(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + if "lm_head.weight" in state_dict: + self.lm_head.weight.set_value( + paddle.to_tensor(state_dict["lm_head.weight"]).cast(self.lm_head.weight.dtype) + ) + self.llama.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) + + +class LlamaForCausalLMBlockInferenceModel(GenerationBlockInferenceModel, LlamaPretrainedModel): + """ + Dynamic Batching for LLaMA Model with pretraining tasks on top. 
+ """ + + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.llama = LlamaBlockInferenceModel(config) + self.lm_head = LlamaLMHead(config) + + @classmethod + def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True): + + logger.info("llama inference model _get_tensor_parallel_mappings") + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + if "a8w8" in config.quant_type: + if config.quantization_config.shift_smooth_all_linears: + base_actions["layers.0.self_attn.o_proj.shift_bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.o_proj.smooth_weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.down_proj.shift_bias"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.down_proj.smooth_weight"] = partial(fn, is_column=True) + + if config.quantization_config.shift: + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.bias"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.bias"] = partial(fn, is_column=True) + + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: LlamaConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for llama model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + max_block_per_seq = (config.max_seq_len + config.block_size - 1) // config.block_size + if max_batch_size == -1: + max_block_nums = None + else: + max_block_nums = max_batch_size * max_block_per_seq + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kv_shape = [ + max_block_nums, + config.num_key_value_heads // max(config.tensor_parallel_degree, 1), + config.block_size, + config.hidden_size // config.num_attention_heads, + ] + cache_kvs.append(cache_kv_shape) + cache_kvs.append(cache_kv_shape) + return cache_kvs + + def prepare_inputs_for_generation(self, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + input_ids = kwargs["input_ids"] + src_mask = kwargs.get("src_mask", None) + block_tables = kwargs.get("block_tables", None) + + pre_caches = kwargs.get("pre_caches", None) + caches = kwargs.get("caches", None) + + rope_emb = kwargs["rope_emb"] + seq_lens_this_time = kwargs["seq_lens_this_time"] + seq_lens_encoder = kwargs["seq_lens_encoder"] + seq_lens_decoder = kwargs["seq_lens_decoder"] + k_quant_scales = kwargs.get("k_quant_scales", None) + v_quant_scales = kwargs.get("v_quant_scales", None) + k_dequant_scales = kwargs.get("k_dequant_scales", None) + v_dequant_scales = kwargs.get("v_dequant_scales", None) + model_inputs = { + "input_ids": input_ids, + "src_mask": src_mask, + "rope_emb": rope_emb, + "pre_caches": pre_caches, + "caches": caches, + "seq_lens_this_time": seq_lens_this_time, + "seq_lens_encoder": seq_lens_encoder, + "seq_lens_decoder": seq_lens_decoder, + "block_tables": block_tables, + "k_quant_scales": k_quant_scales, + "v_quant_scales": v_quant_scales, + "k_dequant_scales": k_dequant_scales, + "v_dequant_scales": v_dequant_scales, + } + return model_inputs + + def forward( + self, + input_ids, + src_mask=None, + pre_caches=None, + caches=None, + seq_lens_this_time=None, + seq_lens_encoder=None, + seq_lens_decoder=None, + rope_emb=None, + block_tables=None, + k_quant_scales=None, + v_quant_scales=None, + k_dequant_scales=None, + v_dequant_scales=None, + ): + outputs = self.llama( + input_ids, + src_mask=src_mask, + caches=caches, + rope_emb=rope_emb, + block_tables=block_tables, + pre_caches=pre_caches, + seq_lens_this_time=seq_lens_this_time, + seq_lens_encoder=seq_lens_encoder, + seq_lens_decoder=seq_lens_decoder, + k_quant_scales=k_quant_scales, + v_quant_scales=v_quant_scales, + k_dequant_scales=k_dequant_scales, + v_dequant_scales=v_dequant_scales, + ) + + hidden_states = outputs[0] + logits = self.lm_head( + hidden_states, + tensor_parallel_output=False, + ) + + return logits + + @paddle.no_grad() + def set_state_dict(self, state_dict): + if "lm_head.weight" in state_dict: + 
self.lm_head.weight.set_value( + paddle.to_tensor(state_dict["lm_head.weight"]).cast(self.lm_head.weight.dtype) + ) + self.llama.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) + + +class LlamaForMiniGPT4InferenceModel(LlamaForCausalLMInferenceModel): + """ + This class is 99% like LlamaForCausalLMInferenceModel. + Used only for miniGPT4's second part. + """ + + # This function corresponds to miniGPT4's second part, only used in miniGPT4. + @paddle.no_grad() + def generate_text_with_image_features( + self, + image_features: paddle.Tensor, + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + attention_mask: paddle.Tensor, + position_ids=None, + penalty_score=None, + frequency_score=None, + presence_score=None, + min_length=None, + max_length=None, + temperature=None, + top_p=None, + eos_token_id=None, + seq_len_encoder=None, + seq_len_decoder=None, + step_idx=None, + stop_flags=None, + tgt_ids=None, + tgt_pos=None, + tgt_generation_mask=None, + pre_ids=None, + stop_nums=None, + cache_kvs=[], + inputs_embeds=None, + **generate_kwargs + ) -> paddle.Tensor: + + first_embeds = self.llama.embed_tokens(first_input_ids) + second_embeds = self.llama.embed_tokens(second_input_ids) + image_features = paddle.cast(image_features, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, image_features, second_embeds], axis=1) + + outputs = self.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + penalty_score=penalty_score, + frequency_score=frequency_score, + presence_score=presence_score, + min_length=min_length, + max_length=max_length, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + step_idx=step_idx, + stop_flags=stop_flags, + tgt_ids=tgt_ids, + tgt_pos=tgt_pos, + tgt_generation_mask=tgt_generation_mask, + pre_ids=pre_ids, + stop_nums=stop_nums, + cache_kvs=cache_kvs, + ) + return outputs + + # rewrite to_static function in generation_utils.py + def to_static(self, output_path: str, config: dict): + dtype = config.get("dtype", paddle.get_default_dtype()) + cache_kvs_shapes = self.get_cache_kvs_shape(self.config, max_length=config.get("max_length", None)) + input_spec = [ + paddle.static.InputSpec( + shape=[None, None, None], dtype="float32", name="image_features" + ), # image_features + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="first_input_ids"), # first_input_ids + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="second_input_ids"), # second_input_ids + paddle.static.InputSpec(shape=[None, None], dtype=dtype, name="attention_mask"), # attention_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), # position_ids + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p + paddle.static.InputSpec(shape=[None], dtype="int64", 
name="eos_token_id"), # eos_token_id + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_encoder"), # seq_len_encoder + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_decoder"), # seq_len_decoder + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_ids"), # tgt_ids + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_pos"), # tgt_pos + paddle.static.InputSpec( + shape=[None, 1, 1, None], dtype=dtype, name="tgt_generation_mask" + ), # tgt_generation_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids + paddle.static.InputSpec(shape=[1], dtype="int64", name="stop_nums"), # stop_nums + [ + paddle.static.InputSpec( + shape=shape, + dtype=dtype, + name="cache_kvs_{}".format(i), + ) + for i, shape in enumerate(cache_kvs_shapes) + ], # cache_kvs + ] + + model = paddle.jit.to_static(self.generate_text_with_image_features, input_spec=input_spec) + paddle.jit.save(model, output_path) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map.json b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map.json new file mode 100644 index 000000000..409db47f2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map.json @@ -0,0 +1,21 @@ +{ + "act_scale":{ + "qkv_in_scale": "llama.layers.#.self_attn.q_proj.activation_quanter", + "out_linear_in_scale": "llama.layers.#.self_attn.o_proj.activation_quanter", + "ffn1_in_scale": "llama.layers.#.mlp.gate_proj.activation_quanter", + "ffn2_in_scale": "llama.layers.#.mlp.down_proj.activation_quanter" + }, + "weight_scale":{ + "q_weight_scale":"llama.layers.#.self_attn.q_proj.weight_quanter", + "k_weight_scale":"llama.layers.#.self_attn.k_proj.weight_quanter", + "v_weight_scale":"llama.layers.#.self_attn.v_proj.weight_quanter", + "out_linear_weight_scale":"llama.layers.#.self_attn.o_proj.weight_quanter", + "ffn1_1_weight_scale":"llama.layers.#.mlp.gate_proj.weight_quanter", + "ffn1_2_weight_scale":"llama.layers.#.mlp.up_proj.weight_quanter", + "ffn2_weight_scale":"llama.layers.#.mlp.down_proj.weight_quanter" + }, + "cachekv_scale":{ + "cache_k_scale": "llama.layers.#.self_attn.cachek_matmul.activation_quanter", + "cache_v_scale": "llama.layers.#.self_attn.cachev_matmul.activation_quanter" + } + } \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map_shift_smooth.json b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map_shift_smooth.json new file mode 100644 index 000000000..4aa512235 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/llama/ptq_scales_map_shift_smooth.json @@ -0,0 +1,21 @@ +{ + "act_scale":{ + "qkv_in_scale": "llama.layers.#.self_attn.q_proj.activation_quanter", + "out_linear_in_scale": "llama.layers.#.self_attn.o_proj.layer.activation_quanter", + "ffn1_in_scale": "llama.layers.#.mlp.gate_proj.activation_quanter", + "ffn2_in_scale": "llama.layers.#.mlp.down_proj.layer.activation_quanter" + }, + "weight_scale":{ + "q_weight_scale":"llama.layers.#.self_attn.q_proj.weight_quanter", + 
"k_weight_scale":"llama.layers.#.self_attn.k_proj.weight_quanter", + "v_weight_scale":"llama.layers.#.self_attn.v_proj.weight_quanter", + "out_linear_weight_scale":"llama.layers.#.self_attn.o_proj.layer.weight_quanter", + "ffn1_1_weight_scale":"llama.layers.#.mlp.gate_proj.weight_quanter", + "ffn1_2_weight_scale":"llama.layers.#.mlp.up_proj.weight_quanter", + "ffn2_weight_scale":"llama.layers.#.mlp.down_proj.layer.weight_quanter" + }, + "cachekv_scale":{ + "cache_k_scale": "llama.layers.#.self_attn.cachek_matmul.activation_quanter", + "cache_v_scale": "llama.layers.#.self_attn.cachev_matmul.activation_quanter" + } +} \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/modeling.py new file mode 100644 index 000000000..64d041453 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/opt/modeling.py @@ -0,0 +1,556 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import numpy as np +import paddle +import paddle.nn as nn + +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedMultiTransformerBase, + FusedMultiTransformerConfig, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationInferenceModel, +) +from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained +from paddlenlp.transformers import OPTPretrainedModel +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) +from paddlenlp.transformers.opt.configuration import OPTConfig +from paddlenlp.transformers.opt.modeling import OPTEmbeddings, OPTLMHead + +__all__ = ["OPTForCausalLMInferenceModel", "OPTForBlip2InferenceModel"] + + +@register_base_model +class OPTInferenceModel(OPTPretrainedModel): + def __init__(self, config: OPTConfig): + super(OPTInferenceModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.vocab_size = config.vocab_size + self.embeddings = OPTEmbeddings(config) + + if config.normalize_before: + self.final_layer_norm = nn.LayerNorm(config.hidden_size) + else: + self.final_layer_norm = None + + self.num_layers = config.num_hidden_layers + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_size = self.hidden_size // self.num_heads + + self.epsilon = 1e-5 + + ln_scale_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.norm1.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + ln_bias_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.norm1.bias".format(i)) + for i in range(config.num_hidden_layers) + ] + + qkv_weight_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.qkv_weight".format(i)) + for i in range(config.num_hidden_layers) + ] + qkv_bias_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.qkv_bias".format(i)) for i in range(config.num_hidden_layers) + ] + + out_proj_weight_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.self_attn.out_proj.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + out_proj_bias_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.self_attn.out_proj.bias".format(i)) + for i in range(config.num_hidden_layers) + ] + + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.norm2.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + ffn_ln_bias_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.norm2.bias".format(i)) + for i in range(config.num_hidden_layers) + ] + + ffn1_weight_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.linear1.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + ffn1_bias_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.linear1.bias".format(i)) + for i in range(config.num_hidden_layers) + ] + ffn2_weight_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.linear2.weight".format(i)) + for i in range(config.num_hidden_layers) + ] + ffn2_bias_attrs = [ + paddle.ParamAttr(name="opt.decoder.layers.{}.linear2.bias".format(i)) + for i in range(config.num_hidden_layers) + ] + + transformer_config = FusedMultiTransformerConfig( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout_rate=0.0, + activation="relu", + normalize_before=True, + num_layers=config.num_hidden_layers, + nranks=1, + ring_id=-1, + ln_scale_attrs=ln_scale_attrs, + ln_bias_attrs=ln_bias_attrs, + 
qkv_weight_attrs=qkv_weight_attrs,
+            qkv_bias_attrs=qkv_bias_attrs,
+            linear_weight_attrs=out_proj_weight_attrs,
+            linear_bias_attrs=out_proj_bias_attrs,
+            ffn_ln_scale_attrs=ffn_ln_scale_attrs,
+            ffn_ln_bias_attrs=ffn_ln_bias_attrs,
+            ffn1_weight_attrs=ffn1_weight_attrs,
+            ffn1_bias_attrs=ffn1_bias_attrs,
+            ffn2_weight_attrs=ffn2_weight_attrs,
+            ffn2_bias_attrs=ffn2_bias_attrs,
+            epsilon=self.epsilon,
+        )
+
+        self.transformer_block = FusedMultiTransformerBase(transformer_config)
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def remove_padding(self, input_ids, seq_lens_this_time):
+        cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
+        token_num = paddle.sum(seq_lens_this_time)
+        from paddlenlp_ops import get_padding_offset
+
+        ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
+            input_ids, cum_offsets_now, token_num, seq_lens_this_time
+        )
+        return ids_remove_padding, padding_offset, cum_offsets
+
+    # This function is a little different from prepare_input_ids_for_generation in paddlenlp/transformers/generation/utils.py
+    @staticmethod
+    def prepare_input_ids_for_generation(bos_token_id, encoder_output=None):
+        batch_size = 1
+        seq_len = 1
+        if bos_token_id is None:
+            raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.")
+        if encoder_output is not None:
+            batch_size = encoder_output.shape[0]
+            seq_len = encoder_output.shape[1]
+        return paddle.ones([batch_size, seq_len], dtype="int64") * bos_token_id
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        cache_kvs=None,
+        seq_len_encoder=None,
+        seq_len_decoder=None,
+        past_key_values=None,
+        output_attentions=False,
+        output_hidden_states=None,
+        return_dict=False,
+        **kwargs,
+    ):
+        # kwargs["cache"] is used to distinguish between the encoder and decoder phase.
+        past_key_values = kwargs.get("cache", None)
+        is_decoder = past_key_values is not None
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        # generate a fake input_ids according to inputs_embeds
+        # this usually occurs in img2txt multimodal models when this forward function is first entered.
+ if input_ids is None and inputs_embeds is not None: + input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds) + + batch, seq_len = input_ids.shape + + past_kv_length = paddle.max(seq_len_decoder) if is_decoder else 0 + now_len = past_kv_length + seq_len + embedding_output = self.embeddings( + input_ids=input_ids, + attention_mask=paddle.ones([batch, now_len], dtype="int64"), + input_embeddings=inputs_embeds, + past_key_values_length=past_kv_length, + ) + + var_embedding_output = None + if not is_decoder: + # support variable seqence length embeddings + var_embedding_output = embedding_output[0, 0 : seq_len_encoder[0][0], :] + for b in range(1, batch): + var_embedding_output = paddle.concat( + [var_embedding_output, embedding_output[b, 0 : seq_len_encoder[b][0], :]], axis=0 + ) + else: + # merge batch and seq_len dimension. + var_embedding_output = embedding_output.reshape([batch * seq_len, self.hidden_size]) + embedding_output = var_embedding_output + + if not is_decoder: + # ids_remove_padding + _, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + _ = input_ids + padding_offset = None + cum_offsets = None + + seq_lens = seq_len_decoder if is_decoder else seq_len_encoder + with dy2st_nocheck_guard_context(): + + hidden_states, _ = self.transformer_block( + input_ids, + embedding_output, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=embedding_output.dtype), + caches=cache_kvs, + seq_lens=seq_lens, + rotary_embs=None, + rotary_emb_dims=0, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + + output = hidden_states + + if self.final_layer_norm: + output = self.final_layer_norm(output) + return output + + @paddle.no_grad() + def set_state_dict(self, state_dict): + + self.embeddings.position_embeddings.weight.set_value( + state_dict.pop("opt.embeddings.position_embeddings.weight") + ) + self.embeddings.word_embeddings.weight.set_value(state_dict.pop("opt.embeddings.word_embeddings.weight")) + self.final_layer_norm.weight.set_value(state_dict.pop("opt.decoder.final_layer_norm.weight")) + self.final_layer_norm.bias.set_value(state_dict.pop("opt.decoder.final_layer_norm.bias")) + + for i in range(self.num_layers): + ln_scale = state_dict.pop("opt.decoder.layers.{}.norm1.weight".format(i)) + ln_bias = state_dict.pop("opt.decoder.layers.{}.norm1.bias".format(i)) + ln_scale = paddle.cast(ln_scale, "float32") + ln_bias = paddle.cast(ln_bias, "float32") + + q_weight = state_dict.pop("opt.decoder.layers.{}.self_attn.q_proj.weight".format(i)) + k_weight = state_dict.pop("opt.decoder.layers.{}.self_attn.k_proj.weight".format(i)) + v_weight = state_dict.pop("opt.decoder.layers.{}.self_attn.v_proj.weight".format(i)) + q_bias = state_dict["opt.decoder.layers.{}.self_attn.q_proj.bias".format(i)] + k_bias = state_dict["opt.decoder.layers.{}.self_attn.k_proj.bias".format(i)] + v_bias = state_dict["opt.decoder.layers.{}.self_attn.v_proj.bias".format(i)] + + concated_qkv_weight = np.concatenate([q_weight, k_weight, v_weight], axis=-1) + concated_qkv_weight = concated_qkv_weight.transpose(1, 0) + concated_qkv_weight = concated_qkv_weight.reshape(3 * self.num_heads * self.head_size, self.hidden_size) + concated_qkv_weight = paddle.to_tensor(concated_qkv_weight) + + concated_qkv_bias = np.concatenate([q_bias, k_bias, v_bias], axis=-1) + concated_qkv_bias = concated_qkv_bias.reshape(3 * self.num_heads * self.head_size) + concated_qkv_bias 
= paddle.to_tensor(concated_qkv_bias) + + out_proj_weight = state_dict.pop("opt.decoder.layers.{}.self_attn.out_proj.weight".format(i)) + out_proj_bias = state_dict.pop("opt.decoder.layers.{}.self_attn.out_proj.bias".format(i)) + + ffn_ln_scale = state_dict.pop("opt.decoder.layers.{}.norm2.weight".format(i)) + ffn_ln_bias = state_dict.pop("opt.decoder.layers.{}.norm2.bias".format(i)) + ffn_ln_scale = paddle.cast(ffn_ln_scale, "float32") + ffn_ln_bias = paddle.cast(ffn_ln_bias, "float32") + + ffn1_weight = state_dict.pop("opt.decoder.layers.{}.linear1.weight".format(i)) + ffn1_bias = state_dict.pop("opt.decoder.layers.{}.linear1.bias".format(i)) + ffn2_weight = state_dict.pop("opt.decoder.layers.{}.linear2.weight".format(i)) + ffn2_bias = state_dict.pop("opt.decoder.layers.{}.linear2.bias".format(i)) + + self.transformer_block.ln_scales[i].set_value(ln_scale) + self.transformer_block.ln_biases[i].set_value(ln_bias) + + self.transformer_block.qkv_weights[i].set_value(concated_qkv_weight) + self.transformer_block.qkv_biases[i].set_value(concated_qkv_bias) + + self.transformer_block.linear_weights[i].set_value(out_proj_weight) + self.transformer_block.linear_biases[i].set_value(out_proj_bias) + + self.transformer_block.ffn_ln_scales[i].set_value(ffn_ln_scale) + self.transformer_block.ffn_ln_biases[i].set_value(ffn_ln_bias) + + self.transformer_block.ffn1_weights[i].set_value(ffn1_weight) + self.transformer_block.ffn1_biases[i].set_value(ffn1_bias) + + self.transformer_block.ffn2_weights[i].set_value(ffn2_weight) + self.transformer_block.ffn2_biases[i].set_value(ffn2_bias) + + +class OPTForCausalLMInferenceModel(GenerationInferenceModel, OPTPretrainedModel): + def __init__(self, config: OPTConfig, **kwargs): + super(OPTForCausalLMInferenceModel, self).__init__(config) + self.opt = OPTInferenceModel(config) + self.lm_head = OPTLMHead(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: OPTConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for opt model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = config.max_position_embeddings + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_attention_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + inputs_embeds = kwargs.get("inputs_embeds", None) + if cache is not None: + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (tgt_generation_mask - 1) * 1e4 + # make inputs_embeds be none in decoder phase. + # in forward function, it will be assigned according to input_ids. 
+            inputs_embeds = None
+        else:
+            attention_mask = (attention_mask - 1) * 1e4
+        model_inputs = {
+            "input_ids": input_ids,
+            "inputs_embeds": inputs_embeds,
+            "position_ids": position_ids,
+            "attention_mask": attention_mask,
+            "cache_kvs": cache_kvs,
+            "seq_len_encoder": seq_len_encoder,
+            "seq_len_decoder": seq_len_decoder,
+            "cache": cache,
+        }
+        return model_inputs
+
+    def forward(
+        self,
+        input_ids,
+        position_ids=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        use_cache=False,
+        cache=None,
+        cache_kvs=None,
+        seq_len_encoder=None,
+        seq_len_decoder=None,
+        past_key_values=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.opt(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache=cache,
+            cache_kvs=cache_kvs,
+            seq_len_encoder=seq_len_encoder,
+            seq_len_decoder=seq_len_decoder,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs
+        logits = self.lm_head(hidden_states)
+        return logits
+
+    @paddle.no_grad()
+    def set_state_dict(self, state_dict):
+        if "lm_head.decoder_weight" in state_dict:
+            self.lm_head.decoder_weight.set_value(state_dict["lm_head.decoder_weight"])
+        self.opt.set_state_dict({k: state_dict[k] for k in state_dict.keys()})
+
+
+class OPTForBlip2InferenceModel(OPTForCausalLMInferenceModel):
+    """
+    This class is 99% like OPTForCausalLMInferenceModel.
+    Used only for blip2's second part.
+    """
+
+    # This function corresponds to blip2's second part, only used in blip2.
+ @paddle.no_grad() + def generate_text_with_image_features( + self, + image_features: paddle.Tensor, + second_input_ids: paddle.Tensor, + attention_mask: paddle.Tensor, + position_ids=None, + penalty_score=None, + frequency_score=None, + presence_score=None, + min_length=None, + max_length=None, + temperature=None, + top_p=None, + eos_token_id=None, + seq_len_encoder=None, + seq_len_decoder=None, + step_idx=None, + stop_flags=None, + tgt_ids=None, + tgt_pos=None, + tgt_generation_mask=None, + pre_ids=None, + stop_nums=None, + cache_kvs=[], + inputs_embeds=None, + **generate_kwargs + ) -> paddle.Tensor: + + second_embeds = self.opt.get_input_embeddings()(second_input_ids) + image_features = paddle.cast(image_features, dtype=second_embeds.dtype) + inputs_embeds = paddle.concat([image_features, second_embeds], axis=1) + + outputs = self.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + penalty_score=penalty_score, + frequency_score=frequency_score, + presence_score=presence_score, + min_length=min_length, + max_length=max_length, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + step_idx=step_idx, + stop_flags=stop_flags, + tgt_ids=tgt_ids, + tgt_pos=tgt_pos, + tgt_generation_mask=tgt_generation_mask, + pre_ids=pre_ids, + stop_nums=stop_nums, + cache_kvs=cache_kvs, + ) + return outputs + + # rewrite to_static function in generation_utils.py + def to_static(self, output_path: str, config: dict): + dtype = config.get("dtype", paddle.get_default_dtype()) + cache_kvs_shapes = self.get_cache_kvs_shape(self.config, max_length=config.get("max_length", None)) + input_spec = [ + paddle.static.InputSpec( + shape=[None, None, None], dtype="float32", name="image_features" + ), # image_features + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="second_input_ids"), # second_input_ids + paddle.static.InputSpec(shape=[None, None], dtype=dtype, name="attention_mask"), # attention_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), # position_ids + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p + paddle.static.InputSpec(shape=[None], dtype="int64", name="eos_token_id"), # eos_token_id + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_encoder"), # seq_len_encoder + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_decoder"), # seq_len_decoder + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_ids"), # tgt_ids + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_pos"), # tgt_pos + paddle.static.InputSpec( + shape=[None, 1, 1, None], dtype=dtype, 
name="tgt_generation_mask" + ), # tgt_generation_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids + paddle.static.InputSpec(shape=[1], dtype="int64", name="stop_nums"), # stop_nums + [ + paddle.static.InputSpec( + shape=shape, + dtype=dtype, + name="cache_kvs_{}".format(i), + ) + for i, shape in enumerate(cache_kvs_shapes) + ], # cache_kvs + ] + + model = paddle.jit.to_static(self.generate_text_with_image_features, input_spec=input_spec) + paddle.jit.save(model, output_path, skip_prune_program=True) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/__init__.py new file mode 100644 index 000000000..c2a7f656c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/modeling.py new file mode 100644 index 000000000..abadb2467 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen/modeling.py @@ -0,0 +1,643 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import paddle +from paddle import nn +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedMultiTransformerBase, + FusedMultiTransformerConfig, + FusedMultiTransformerWeightOnly, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationInferenceModel, +) +from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained +from paddlenlp.transformers import QWenConfig, QWenPretrainedModel +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) +from paddlenlp.transformers.qwen.modeling import QWenLMHead, QWenPretrainingCriterion + +__all__ = ["QWenForCausalLMInferenceModel", "QWenForQWenVLInferenceModel"] + + +class FusedQWenRMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.eps = config.layer_norm_epsilon + self.weight = paddle.create_parameter( + shape=[config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + + def forward(self, x): + result = paddle.incubate.nn.functional.fused_rms_norm(x, self.weight, None, self.eps, begin_norm_axis=1) + if isinstance(result, tuple): + return result[0] + return result + + +@register_base_model +class QWenInferenceModel(QWenPretrainedModel): + def __init__(self, config: QWenConfig): + super(QWenPretrainedModel, self).__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.intermediate_size = config.intermediate_size + self.num_layers = config.num_hidden_layers + self.layer_norm_epsilon = config.layer_norm_epsilon + self.max_position_embeddings = config.max_position_embeddings + self.quant_type = config.quant_type + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + + if self.use_weight_only: + assert ( + self.quant_algo == "weight_only_int8" or self.quant_algo == "weight_only_int4" + ), "Expected quant_type equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format( + self.quant_algo + ) + + self.wte = nn.Embedding(self.vocab_size, self.hidden_size) + + ln_scale_attrs = [paddle.ParamAttr(name="fuseqwen.{}.ln_scale".format(i)) for i in range(self.num_layers)] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + qkv_bias_attrs = [paddle.ParamAttr(name="fuseqwen.{}.qkv_bias".format(i)) for i in range(self.num_layers)] + out_proj_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen.{}.out_proj_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen.{}.ffn_ln_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen.{}.ffn1_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen.{}.ffn2_weight".format(i), 
initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + + qkv_weight_scale_attrs = None + out_proj_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen.{}.qkv_weight_scale".format(i)) for i in range(self.num_layers) + ] + out_proj_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen.{}.out_proj_weight_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen.{}.ffn1_weight_scale".format(i)) for i in range(self.num_layers) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen.{}.ffn2_weight_scale".format(i)) for i in range(self.num_layers) + ] + + transformer_config = FusedMultiTransformerConfig( + self.hidden_size, + self.num_attention_heads, + self.intermediate_size // 2, + quant_type=self.quant_type, + activation="swiglu", + num_layers=config.num_hidden_layers, + nranks=1, + ring_id=-1, + ln_scale_attrs=ln_scale_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + linear_weight_attrs=out_proj_weight_attrs, + linear_weight_scale_attrs=out_proj_weight_scale_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + qkv_bias_attrs=qkv_bias_attrs, + epsilon=self.layer_norm_epsilon, + norm_type="rmsnorm", + use_neox_rotary_style=True, + ) + + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnly(transformer_config) + else: + self.transformer_block = FusedMultiTransformerBase(transformer_config) + + self.ln_f = FusedQWenRMSNorm(config) + + self.cache_kvs = None + self.head_dim_shape_tensor = paddle.ones((self.hidden_size // self.num_attention_heads), dtype="int8") + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, value): + self.wte = value + + @paddle.no_grad() + def set_state_dict(self, state_dict): + dtype = paddle.get_default_dtype() + wte_weight = paddle.to_tensor(state_dict["qwen.wte.weight"], dtype=dtype) + ln_f_weight = paddle.to_tensor(state_dict["qwen.ln_f.weight"], dtype=self.ln_f.weight.dtype) + self.wte.weight.set_value(wte_weight) + self.ln_f.weight.set_value(ln_f_weight) + + for idx in range(self.num_layers): + ln_scale = paddle.to_tensor( + state_dict["qwen.h.{}.ln_1.weight".format(idx)], dtype=self.transformer_block.ln_scales[idx].dtype + ) + self.transformer_block.ln_scales[idx].set_value(ln_scale) + + qkv_weight = paddle.to_tensor( + state_dict["qwen.h.{}.attn.c_attn.weight".format(idx)].transpose([1, 0]), dtype=dtype + ) + if self.use_weight_only: + qkv_weight = paddle.transpose(qkv_weight, perm=[1, 0]) + qkv_quanted_weight, qkv_weight_scale = weight_quantize(qkv_weight, algo=self.quant_algo) + self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight) + self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale) + else: + self.transformer_block.qkv_weights[idx].set_value(qkv_weight) + + qkv_bias = paddle.to_tensor(state_dict["qwen.h.{}.attn.c_attn.bias".format(idx)], dtype=dtype) + self.transformer_block.qkv_biases[idx].set_value(qkv_bias) + + linear_weight = paddle.to_tensor(state_dict["qwen.h.{}.attn.c_proj.weight".format(idx)], dtype=dtype) + if self.use_weight_only: + linear_quanted_weight, linear_weight_scale = 
weight_quantize(linear_weight, algo=self.quant_algo)
+                self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight)
+                self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale)
+            else:
+                self.transformer_block.linear_weights[idx].set_value(linear_weight)
+
+            ffn_ln_scale = paddle.to_tensor(
+                state_dict["qwen.h.{}.ln_2.weight".format(idx)], dtype=self.transformer_block.ffn_ln_scales[idx].dtype
+            )
+            self.transformer_block.ffn_ln_scales[idx].set_value(ffn_ln_scale)
+
+            up_weight = paddle.to_tensor(state_dict["qwen.h.{}.mlp.w1.weight".format(idx)], dtype=dtype)
+            gate_weight = paddle.to_tensor(state_dict["qwen.h.{}.mlp.w2.weight".format(idx)], dtype=dtype)
+            ffn1_weight = paddle.concat(x=[gate_weight, up_weight], axis=-1)
+            if self.use_weight_only:
+                ffn1_quanted_weight, ffn1_weight_scale = weight_quantize(ffn1_weight, algo=self.quant_algo)
+                self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight)
+                self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale)
+            else:
+                self.transformer_block.ffn1_weights[idx].set_value(ffn1_weight)
+
+            ffn2_weight = paddle.to_tensor(state_dict["qwen.h.{}.mlp.c_proj.weight".format(idx)], dtype=dtype)
+            if self.use_weight_only:
+                ffn2_quanted_weight, ffn2_weight_scale = weight_quantize(ffn2_weight, algo=self.quant_algo)
+                self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight)
+                self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale)
+            else:
+                self.transformer_block.ffn2_weights[idx].set_value(ffn2_weight)
+
+    def remove_padding(self, input_ids, seq_lens_this_time):
+        cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time)
+        token_num = paddle.sum(seq_lens_this_time)
+        from paddlenlp_ops import get_padding_offset
+
+        ids_remove_padding, cum_offsets, padding_offset = get_padding_offset(
+            input_ids, cum_offsets_now, token_num, seq_lens_this_time
+        )
+        return ids_remove_padding, padding_offset, cum_offsets
+
+    # This function is a little different from prepare_input_ids_for_generation in paddlenlp/transformers/generation/utils.py,
+    # it is used to generate fake input_ids according to inputs_embeds length.
+    @staticmethod
+    def prepare_input_ids_for_generation(bos_token_id, encoder_output=None):
+        batch_size = 1
+        seq_len = 1
+        if bos_token_id is None:
+            raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.")
+        if encoder_output is not None:
+            batch_size = encoder_output.shape[0]
+            seq_len = encoder_output.shape[1]
+        return paddle.full([batch_size, seq_len], bos_token_id, dtype="int64")
+
+    def forward(
+        self,
+        input_ids=None,
+        position_ids=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        use_cache=None,
+        cache_kvs=None,
+        pre_caches=None,
+        seq_len_encoder=None,
+        seq_len_decoder=None,
+        past_key_values=None,
+        output_attentions=False,
+        output_hidden_states=None,
+        return_dict=False,
+        **kwargs,
+    ):
+        # kwargs["cache"] is used to distinguish between the encoder and decoder phase.
+ past_key_values = kwargs.get("cache", None) + is_decoder = past_key_values is not None + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + # generate a fake input_ids according to inputs_embeds + # this is usually occurred in img2txt multimodal model when first enter into this forward function. + if input_ids is None and inputs_embeds is not None: + input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds) + if inputs_embeds is not None: + batch, seq_len, hidden_dim = inputs_embeds.shape + inputs_embeds = inputs_embeds.reshape([batch * seq_len, hidden_dim]) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if past_key_values is None: + past_key_values = tuple([None] * self.config.num_hidden_layers) + + if not is_decoder: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + ids_remove_padding = input_ids + padding_offset = None + cum_offsets = None + + if inputs_embeds is None: + inputs_embeds = self.wte(ids_remove_padding) + hidden_states = inputs_embeds + + # decoder layers + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + seq_lens = seq_len_decoder if is_decoder else seq_len_encoder + + position_offset = 0 + theta = 10000.0 + if not is_decoder and pre_caches is not None: + position_offset = 128 + + from paddlenlp_ops import fused_get_rotary_embedding + + new_rope = fused_get_rotary_embedding( + input_ids, position_ids, self.head_dim_shape_tensor, position_offset, theta, True + ) + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids, + hidden_states, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + pre_caches=pre_caches, + pre_caches_length=position_offset, + seq_lens=seq_lens, + rotary_embs=new_rope, + rotary_emb_dims=1, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class QWenForCausalLMInferenceModel(GenerationInferenceModel, QWenPretrainedModel): + def __init__(self, config: QWenConfig, **kwargs): + super(QWenForCausalLMInferenceModel, self).__init__(config) + self.qwen = QWenInferenceModel(config) + self.lm_head = QWenLMHead(config) + self.criterion = QWenPretrainingCriterion(config) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, 
new_embeddings): + self.lm_head = new_embeddings + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: QWenConfig, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for qwen model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = config.max_position_embeddings + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_attention_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + pre_caches = kwargs.get("pre_caches", None) + inputs_embeds = kwargs.get("inputs_embeds", None) + if cache is not None: + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (tgt_generation_mask - 1) * 1e4 + # make inputs_embeds be none in decoder phase. + # in forward function, it will be assigned according to input_ids. + inputs_embeds = None + else: + attention_mask = (attention_mask - 1) * 1e4 + model_inputs = { + "input_ids": input_ids, + "inputs_embeds": inputs_embeds, + "position_ids": position_ids, + "attention_mask": attention_mask, + "cache_kvs": cache_kvs, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + "cache": cache, + "pre_caches": pre_caches, + } + return model_inputs + + def forward( + self, + input_ids, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.qwen( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + cache_kvs=cache_kvs, + pre_caches=pre_caches, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + lm_logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + 
if labels is not None: + loss = self.criterion(lm_logits, labels) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + if "lm_head.weight" in state_dict: + lm_head_weight = paddle.to_tensor(state_dict["lm_head.weight"], dtype=self.lm_head.weight.dtype) + self.lm_head.weight.set_value(lm_head_weight) + self.qwen.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) + + +class QWenForQWenVLInferenceModel(QWenForCausalLMInferenceModel): + """ + This class is 99% like QWenForCausalLMInferenceModel. + Used only for QWenVL's second part. + """ + + # This function corresponds to QWenVL's second part, only used for QWenVL. + @paddle.no_grad() + def generate_text_with_image_features( + self, + input_ids: paddle.Tensor, + image_features: paddle.Tensor, + img_pos: paddle.Tensor, + attention_mask: paddle.Tensor, + position_ids=None, + penalty_score=None, + frequency_score=None, + presence_score=None, + min_length=None, + max_length=None, + temperature=None, + top_p=None, + eos_token_id=None, + seq_len_encoder=None, + seq_len_decoder=None, + step_idx=None, + stop_flags=None, + tgt_ids=None, + tgt_pos=None, + tgt_generation_mask=None, + pre_ids=None, + stop_nums=None, + cache_kvs=[], + inputs_embeds=None, + **generate_kwargs + ) -> paddle.Tensor: + inputs_embeds = self.qwen.wte(input_ids) + inputs_embeds_dtype = inputs_embeds.dtype + if inputs_embeds_dtype != paddle.float32: + inputs_embeds = paddle.cast(inputs_embeds, paddle.float32) + image_features = paddle.cast(image_features, paddle.float32) + + for idx, (i, image_start_idx, image_end_idx) in enumerate(img_pos): + index = paddle.arange(image_start_idx + 1, image_end_idx).unsqueeze(-1) + inputs_embeds[i] = paddle.scatter(inputs_embeds[i], index, image_features[idx]) + + if inputs_embeds_dtype != paddle.float32: + inputs_embeds = paddle.cast(inputs_embeds, inputs_embeds_dtype) + + outputs = self.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + penalty_score=penalty_score, + frequency_score=frequency_score, + presence_score=presence_score, + min_length=min_length, + max_length=max_length, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + step_idx=step_idx, + stop_flags=stop_flags, + tgt_ids=tgt_ids, + tgt_pos=tgt_pos, + tgt_generation_mask=tgt_generation_mask, + pre_ids=pre_ids, + stop_nums=stop_nums, + cache_kvs=cache_kvs, + ) + return outputs + + # rewrite to_static function in generation_utils.py + def to_static(self, output_path: str, config: dict): + dtype = config.get("dtype", paddle.get_default_dtype()) + cache_kvs_shapes = self.get_cache_kvs_shape(self.config, max_length=config.get("max_length", None)) + input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids + paddle.static.InputSpec( + shape=[None, None, None], dtype="float32", name="image_features" + ), # image_features + paddle.static.InputSpec(shape=[None, 3], dtype="int64", name="img_pos"), # img_pos + paddle.static.InputSpec(shape=[None, None], dtype=dtype, name="attention_mask"), # attention_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", 
name="position_ids"), # position_ids + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="penalty_score"), # penalty_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="frequency_score"), # frequency_score + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="presence_score"), # presence_score + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="min_length"), # min_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="max_length"), # max_decode_length + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="temperature"), # temperature + paddle.static.InputSpec(shape=[None, 1], dtype="float32", name="top_p"), # top_p + paddle.static.InputSpec(shape=[None], dtype="int64", name="eos_token_id"), # eos_token_id + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_encoder"), # seq_len_encoder + paddle.static.InputSpec(shape=[None, 1], dtype="int32", name="seq_len_decoder"), # seq_len_decoder + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="step_idx"), # step_idx + paddle.static.InputSpec(shape=[None, 1], dtype="bool", name="stop_flags"), # stop_flags + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_ids"), # tgt_ids + paddle.static.InputSpec(shape=[None, 1], dtype="int64", name="tgt_pos"), # tgt_pos + paddle.static.InputSpec( + shape=[None, 1, 1, None], dtype=dtype, name="tgt_generation_mask" + ), # tgt_generation_mask + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pre_ids"), # pre_ids + paddle.static.InputSpec(shape=[1], dtype="int64", name="stop_nums"), # stop_nums + [ + paddle.static.InputSpec( + shape=shape, + dtype=dtype, + name="cache_kvs_{}".format(i), + ) + for i, shape in enumerate(cache_kvs_shapes) + ], # cache_kvs + ] + + model = paddle.jit.to_static(self.generate_text_with_image_features, input_spec=input_spec) + paddle.jit.save(model, output_path, skip_prune_program=True) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/__init__.py new file mode 100644 index 000000000..0f0d00141 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/modeling.py new file mode 100644 index 000000000..2d785341f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/modeling.py @@ -0,0 +1,1264 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import json +import os +from functools import partial + +import numpy as np +import paddle +from paddle import nn +from paddle.distributed import fleet +from paddle.nn.quant import weight_quantize + +from paddlenlp.experimental.model_utils import ( + ActScalesLoader, + CacheScaleLoader, + WeightScalesLoader, +) +from paddlenlp.experimental.transformers.fused_transformer_layers import ( + FusedBlockMultiTransformer, + FusedBlockMultiTransformerA8W8, + FusedBlockMultiTransformerWeightOnly, + FusedMultiTransformerA8W8, + FusedMultiTransformerBase, + FusedMultiTransformerConfig, + FusedMultiTransformerWeightOnly, +) +from paddlenlp.experimental.transformers.generation_utils import ( + GenerationBlockInferenceModel, + GenerationInferenceModel, +) +from paddlenlp.experimental.transformers.utils import infererence_model_from_pretrained +from paddlenlp.transformers import Qwen2Config, Qwen2PretrainedModel +from paddlenlp.transformers.conversion_utils import split_param_func +from paddlenlp.transformers.model_outputs import ( # CausalLMOutputWithCrossAttentions, + BaseModelOutputWithPast, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithPast, +) +from paddlenlp.transformers.model_utils import ( + dy2st_nocheck_guard_context, + register_base_model, +) +from paddlenlp.transformers.qwen2.modeling import Qwen2LMHead, Qwen2PretrainingCriterion +from paddlenlp.utils.log import logger + +__all__ = ["Qwen2ForCausalLMInferenceModel", "Qwen2ForCausalLMBlockInferenceModel"] + + +class FusedQwen2RMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.eps = config.rms_norm_eps + self.weight = paddle.create_parameter( + shape=[config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + + def forward(self, x): + result = paddle.incubate.nn.functional.fused_rms_norm(x, self.weight, None, self.eps, begin_norm_axis=1) + if isinstance(result, tuple): + return result[0] + return result + + +@register_base_model +class Qwen2InferenceModel(Qwen2PretrainedModel): + def __init__(self, config: Qwen2Config): + super(Qwen2PretrainedModel, self).__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + self.intermediate_size = config.intermediate_size + self.num_layers = config.num_hidden_layers + self.rms_norm_eps = config.rms_norm_eps + self.quant_type = config.quant_type + self.rope_theta = config.rope_theta + + self.use_neox = True + + self.use_weight_only = False + if config.quant_type == "weight_only_int8": + self.use_weight_only = True + self.quant_algo = "weight_only_int8" + elif config.quant_type == "weight_only_int4": + self.use_weight_only = True + self.quant_algo = "weight_only_int4" + elif "a8w8" in config.quant_type: + self.quant_model_path = config.model_name_or_path + self.shift = 
config.quantization_config.shift + self.smooth = config.quantization_config.smooth + self.shift_smooth_all_linears = config.quantization_config.shift_smooth_all_linears + + if self.use_weight_only: + assert ( + self.quant_type == "weight_only_int8" or self.quant_type == "weight_only_int4" + ), "Expected quant_type equal to 'weight_only_int8' or 'weight_only_int4', but received {}".format( + self.quant_type + ) + + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + # get ring_id + ring_id = -1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + ring_id = model_parallel_group.id + except: + pass + + ln_scale_attrs = [paddle.ParamAttr(name="fuseqwen2.{}.ln_scale".format(i)) for i in range(self.num_layers)] + qkv_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen2.{}.qkv_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + qkv_bias_attrs = [paddle.ParamAttr(name="fuseqwen2.{}.qkv_bias".format(i)) for i in range(self.num_layers)] + out_proj_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen2.{}.out_proj_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn_ln_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn_ln_scale".format(i)) for i in range(self.num_layers) + ] + + ffn1_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen2.{}.ffn1_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + ffn2_weight_attrs = [ + paddle.ParamAttr( + name="fuseqwen2.{}.ffn2_weight".format(i), initializer=paddle.nn.initializer.Constant(value=0) + ) + for i in range(self.num_layers) + ] + + qkv_weight_scale_attrs = None + out_proj_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + + qkv_out_scale_attrs = None + linear_out_scale_attrs = None + ffn1_out_scale_attrs = None + ffn2_out_scale_attrs = None + linear_shift_attrs = None + linear_smooth_attrs = None + ffn2_shift_attrs = None + ffn2_smooth_attrs = None + + ln_bias_attrs = None + out_proj_bias_attrs = None + ffn_ln_bias_attrs = None + ffn1_bias_attrs = None + ffn2_bias_attrs = None + + if "a8w8" in self.quant_type: + qkv_out_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.qkv_out_scale".format(i)) for i in range(self.num_layers) + ] + linear_out_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.linear_out_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_out_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn1_out_scale".format(i)) for i in range(self.num_layers) + ] + ffn2_out_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn2_out_scale".format(i)) for i in range(self.num_layers) + ] + + if self.shift_smooth_all_linears: + linear_shift_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.linear_shift".format(i)) for i in range(self.num_layers) + ] + linear_smooth_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.linear_smooth".format(i)) for i in range(self.num_layers) + ] + ffn2_shift_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn2_shift".format(i)) for i in range(self.num_layers) + ] + ffn2_smooth_attrs = [ + 
paddle.ParamAttr(name="fuseqwen2.{}.ffn2_smooth".format(i)) for i in range(self.num_layers) + ] + + if self.shift: + ln_bias_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ln_bias".format(i)) for i in range(self.num_layers) + ] + ffn_ln_bias_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn_ln_bias".format(i)) for i in range(self.num_layers) + ] + qkv_bias_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.qkv_bias".format(i)) for i in range(self.num_layers) + ] + ffn1_bias_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn1_bias".format(i)) for i in range(self.num_layers) + ] + if self.shift_smooth_all_linears: + out_proj_bias_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.out_proj_bias".format(i)) for i in range(self.num_layers) + ] + ffn2_bias_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn2_bias".format(i)) for i in range(self.num_layers) + ] + + qkv_weight_scale_attrs = None + out_proj_weight_scale_attrs = None + ffn1_weight_scale_attrs = None + ffn2_weight_scale_attrs = None + + if self.use_weight_only: + qkv_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.qkv_weight_scale".format(i)) for i in range(self.num_layers) + ] + out_proj_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.out_proj_weight_scale".format(i)) for i in range(self.num_layers) + ] + ffn1_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn1_weight_scale".format(i)) for i in range(self.num_layers) + ] + ffn2_weight_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.ffn2_weight_scale".format(i)) for i in range(self.num_layers) + ] + + cache_k_scale_attrs = None + cache_v_scale_attrs = None + cache_k_out_scale_attrs = None + cache_v_out_scale_attrs = None + if config.cachekv_int8_type == "static": + cache_k_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.cache_k_scale".format(i)) for i in range(self.num_layers) + ] + cache_v_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.cache_v_scale".format(i)) for i in range(self.num_layers) + ] + cache_k_out_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.cache_k_out_scale".format(i)) for i in range(self.num_layers) + ] + cache_v_out_scale_attrs = [ + paddle.ParamAttr(name="fuseqwen2.{}.cache_v_out_scale".format(i)) for i in range(self.num_layers) + ] + + transformer_config = FusedMultiTransformerConfig( + embed_dim=self.hidden_size, + num_heads=self.num_attention_heads, + kv_num_heads=self.num_key_value_heads, + dim_feedforward=self.intermediate_size, + quant_type=self.quant_type, + activation="swiglu", + num_layers=config.num_hidden_layers, + nranks=config.tensor_parallel_degree, + ring_id=ring_id, + ln_scale_attrs=ln_scale_attrs, + qkv_weight_attrs=qkv_weight_attrs, + qkv_weight_scale_attrs=qkv_weight_scale_attrs, + linear_weight_attrs=out_proj_weight_attrs, + linear_weight_scale_attrs=out_proj_weight_scale_attrs, + ffn_ln_scale_attrs=ffn_ln_scale_attrs, + ffn1_weight_attrs=ffn1_weight_attrs, + ffn1_weight_scale_attrs=ffn1_weight_scale_attrs, + ffn2_weight_attrs=ffn2_weight_attrs, + ffn2_weight_scale_attrs=ffn2_weight_scale_attrs, + qkv_out_scale_attrs=qkv_out_scale_attrs, + linear_out_scale_attrs=linear_out_scale_attrs, + ffn1_out_scale_attrs=ffn1_out_scale_attrs, + ffn2_out_scale_attrs=ffn2_out_scale_attrs, + linear_shift_attrs=linear_shift_attrs, + linear_smooth_attrs=linear_smooth_attrs, + ffn2_shift_attrs=ffn2_shift_attrs, + ffn2_smooth_attrs=ffn2_smooth_attrs, + ln_bias_attrs=ln_bias_attrs, + qkv_bias_attrs=qkv_bias_attrs, + linear_bias_attrs=out_proj_bias_attrs, + ffn_ln_bias_attrs=ffn_ln_bias_attrs, + 
ffn1_bias_attrs=ffn1_bias_attrs, + ffn2_bias_attrs=ffn2_bias_attrs, + cache_k_scale_attrs=cache_k_scale_attrs, + cache_v_scale_attrs=cache_v_scale_attrs, + cache_k_out_scale_attrs=cache_k_out_scale_attrs, + cache_v_out_scale_attrs=cache_v_out_scale_attrs, + epsilon=self.rms_norm_eps, + norm_type="rmsnorm", + use_neox_rotary_style=self.use_neox, + cachekv_int8_type=config.cachekv_int8_type, + rank_id=config.tensor_parallel_rank, + trans_qkvw=(False if paddle.is_compiled_with_rocm() and self.quant_type == "a8w8" else True), + ) + + self.set_transformer_block(transformer_config) + + self.norm = FusedQwen2RMSNorm(config) + + self.cache_kvs = None + self.head_dim_shape_tensor = paddle.ones((self.hidden_size // self.num_attention_heads), dtype="int8") + + def set_transformer_block(self, transformer_config): + if self.use_weight_only: + self.transformer_block = FusedMultiTransformerWeightOnly(transformer_config) + elif self.quant_type == "a8w8" or self.quant_type == "a8w8c8": + self.transformer_block = FusedMultiTransformerA8W8(transformer_config) + else: + self.transformer_block = FusedMultiTransformerBase(transformer_config) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @paddle.no_grad() + def set_state_dict(self, state_dict): + head_size = self.hidden_size // self.num_attention_heads + split_fn = split_param_func() + self.embed_tokens.weight.set_value( + paddle.to_tensor(state_dict["qwen2.embed_tokens.weight"]).cast(self.embed_tokens.weight.dtype) + ) + self.norm.weight.set_value(paddle.to_tensor(state_dict["qwen2.norm.weight"]).cast(self.norm.weight.dtype)) + + for idx in range(self.num_layers): + unfused_state_dict = {} + ln_scale = paddle.to_tensor(state_dict["qwen2.layers.{}.input_layernorm.weight".format(idx)]).cast( + self.transformer_block.ln_scales[idx].dtype + ) + self.transformer_block.ln_scales[idx].set_value(ln_scale) + + if "qwen2.layers.{}.self_attn.qkv_proj.weight".format(idx) in state_dict.keys(): + concated_qkv_weight = np.concatenate( + split_fn( + state_dict["qwen2.layers.{}.self_attn.qkv_proj.weight".format(idx)], + is_qkv=True, + num_heads=self.num_attention_heads // self.config.tensor_parallel_degree, + num_key_value_heads=self.num_key_value_heads // self.config.tensor_parallel_degree, + ), + axis=-1, + ).transpose(1, 0) + else: + unfused_state_dict = {} + unfused_state_dict["qwen2.self_attn.q_proj.weight"] = state_dict[ + "qwen2.layers.{}.self_attn.q_proj.weight".format(idx) + ] + unfused_state_dict["qwen2.self_attn.k_proj.weight"] = state_dict[ + "qwen2.layers.{}.self_attn.k_proj.weight".format(idx) + ] + unfused_state_dict["qwen2.self_attn.v_proj.weight"] = state_dict[ + "qwen2.layers.{}.self_attn.v_proj.weight".format(idx) + ] + if paddle.is_compiled_with_rocm() and (self.quant_type == "a8w8" or self.quant_type == "a8w8c8"): + concated_qkv_weight = np.concatenate( + [ + unfused_state_dict["self_attn.q_proj.weight"], + unfused_state_dict["self_attn.k_proj.weight"], + unfused_state_dict["self_attn.v_proj.weight"], + ], + axis=-1, + ).reshape( + self.hidden_size, + ( + self.num_attention_heads // self.config.tensor_parallel_degree + + 2 * self.num_key_value_heads // self.config.tensor_parallel_degree + ) + * (head_size), + ) + else: + concated_qkv_weight = ( + np.concatenate( + [ + unfused_state_dict["qwen2.self_attn.q_proj.weight"], + unfused_state_dict["qwen2.self_attn.k_proj.weight"], + unfused_state_dict["qwen2.self_attn.v_proj.weight"], + ], + axis=-1, + ) + .transpose(1, 0) 
+ .reshape( + ( + self.num_attention_heads // self.config.tensor_parallel_degree + + 2 * self.num_key_value_heads // self.config.tensor_parallel_degree + ) + * (head_size), + self.hidden_size, + ) + ) + + qkv_weight = paddle.to_tensor(concated_qkv_weight).cast(paddle.get_default_dtype()) + + if self.use_weight_only: + qkv_weight = paddle.transpose(qkv_weight, perm=[1, 0]) + qkv_quanted_weight, qkv_weight_scale = weight_quantize(qkv_weight, algo=self.quant_algo) + self.transformer_block.qkv_weights[idx].set_value(qkv_quanted_weight) + self.transformer_block.qkv_weights_scale[idx].set_value(qkv_weight_scale) + elif "a8w8" in self.quant_type: + self.transformer_block.qkv_weights[idx].set_value( + paddle.cast(paddle.to_tensor(concated_qkv_weight), "int8") + ) + else: + self.transformer_block.qkv_weights[idx].set_value(qkv_weight) + + unfused_state_dict["qwen2.self_attn.q_proj.bias"] = state_dict[ + "qwen2.layers.{}.self_attn.q_proj.bias".format(idx) + ] + unfused_state_dict["qwen2.self_attn.k_proj.bias"] = state_dict[ + "qwen2.layers.{}.self_attn.k_proj.bias".format(idx) + ] + unfused_state_dict["qwen2.self_attn.v_proj.bias"] = state_dict[ + "qwen2.layers.{}.self_attn.v_proj.bias".format(idx) + ] + + concated_qkv_biases = np.concatenate( + [ + unfused_state_dict["qwen2.self_attn.q_proj.bias"], + unfused_state_dict["qwen2.self_attn.k_proj.bias"], + unfused_state_dict["qwen2.self_attn.v_proj.bias"], + ], + axis=-1, + ) + qkv_bias = paddle.to_tensor(concated_qkv_biases) + self.transformer_block.qkv_biases[idx].set_value( + qkv_bias.cast(self.transformer_block.qkv_biases[idx].dtype) + ) + + linear_weight = paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)]).cast( + paddle.get_default_dtype() + ) + if self.use_weight_only: + linear_quanted_weight, linear_weight_scale = weight_quantize(linear_weight, algo=self.quant_algo) + self.transformer_block.linear_weights[idx].set_value(linear_quanted_weight) + self.transformer_block.linear_weights_scale[idx].set_value(linear_weight_scale) + elif "a8w8" in self.quant_type: + if paddle.is_compiled_with_rocm(): + self.transformer_block.linear_weights[idx].set_value( + paddle.cast( + paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)]), "int8" + ) + ) + else: + self.transformer_block.linear_weights[idx].set_value( + paddle.cast( + paddle.to_tensor( + state_dict["qwen2.layers.{}.self_attn.o_proj.weight".format(idx)] + ).transpose((1, 0)), + "int8", + ) + ) + else: + self.transformer_block.linear_weights[idx].set_value( + linear_weight.cast(self.transformer_block.linear_weights[idx].dtype) + ) + + ffn_ln_scale = paddle.to_tensor( + state_dict["qwen2.layers.{}.post_attention_layernorm.weight".format(idx)], + ) + + self.transformer_block.ffn_ln_scales[idx].set_value( + ffn_ln_scale.cast(self.transformer_block.ffn_ln_scales[idx].dtype) + ) + + if "qwen2.layers.{}.mlp.gate_up_fused_proj.weight".format(idx) in state_dict.keys(): + concated_ffn1_weight = np.concatenate( + split_fn(state_dict["qwen2.layers.{}.mlp.gate_up_fused_proj.weight".format(idx)]), axis=-1 + ) + else: + unfused_state_dict["mlp.gate_proj.weight"] = state_dict[ + "qwen2.layers.{}.mlp.gate_proj.weight".format(idx) + ] + unfused_state_dict["mlp.up_proj.weight"] = state_dict["qwen2.layers.{}.mlp.up_proj.weight".format(idx)] + concated_ffn1_weight = np.concatenate( + [unfused_state_dict["mlp.gate_proj.weight"], unfused_state_dict["mlp.up_proj.weight"]], axis=-1 + ) + ffn1_weight = 
paddle.to_tensor(concated_ffn1_weight).cast(paddle.get_default_dtype()) + + if self.use_weight_only: + ffn1_quanted_weight, ffn1_weight_scale = weight_quantize(ffn1_weight, algo=self.quant_algo) + self.transformer_block.ffn1_weights[idx].set_value(ffn1_quanted_weight) + self.transformer_block.ffn1_weights_scale[idx].set_value(ffn1_weight_scale) + elif "a8w8" in self.quant_type: + if paddle.is_compiled_with_rocm(): + self.transformer_block.ffn1_weights[idx].set_value( + paddle.cast(paddle.to_tensor(ffn1_weight), "int8") + ) + else: + self.transformer_block.ffn1_weights[idx].set_value( + paddle.cast(paddle.to_tensor(ffn1_weight).transpose((1, 0)), "int8") + ) + else: + self.transformer_block.ffn1_weights[idx].set_value( + ffn1_weight.cast(self.transformer_block.ffn1_weights[idx].dtype) + ) + + ffn2_weight = paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.weight".format(idx)]) + if self.use_weight_only: + ffn2_quanted_weight, ffn2_weight_scale = weight_quantize(ffn2_weight, algo=self.quant_algo) + self.transformer_block.ffn2_weights[idx].set_value(ffn2_quanted_weight) + self.transformer_block.ffn2_weights_scale[idx].set_value(ffn2_weight_scale) + elif "a8w8" in self.quant_type: + if paddle.is_compiled_with_rocm(): + self.transformer_block.ffn2_weights[idx].set_value( + paddle.cast( + paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.weight".format(idx)]), "int8" + ) + ) + else: + self.transformer_block.ffn2_weights[idx].set_value( + paddle.cast( + paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.weight".format(idx)]).transpose( + (1, 0) + ), + "int8", + ) + ) + else: + self.transformer_block.ffn2_weights[idx].set_value( + ffn2_weight.cast(self.transformer_block.ffn2_weights[idx].dtype) + ) + + if "a8w8" in self.quant_type: + if self.shift_smooth_all_linears: + self.transformer_block.linear_shifts[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.shift_bias".format(idx)]) + ) + self.transformer_block.linear_smooths[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.smooth_weight".format(idx)]) + ) + self.transformer_block.ffn2_shifts[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.shift_bias".format(idx)]) + ) + self.transformer_block.ffn2_smooths[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.smooth_weight".format(idx)]) + ) + + if self.shift: + self.transformer_block.ln_biases[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.input_layernorm.bias".format(idx)]) + ) + self.transformer_block.ffn_ln_biases[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.post_attention_layernorm.bias".format(idx)]) + ) + + unfused_state_dict["self_attn.q_proj.bias"] = state_dict[ + "qwen2.layers.{}.self_attn.q_proj.bias".format(idx) + ] + unfused_state_dict["self_attn.k_proj.bias"] = state_dict[ + "qwen2.layers.{}.self_attn.k_proj.bias".format(idx) + ] + unfused_state_dict["self_attn.v_proj.bias"] = state_dict[ + "qwen2.layers.{}.self_attn.v_proj.bias".format(idx) + ] + + concated_qkv_biases = np.concatenate( + [ + unfused_state_dict["self_attn.q_proj.bias"], + unfused_state_dict["self_attn.k_proj.bias"], + unfused_state_dict["self_attn.v_proj.bias"], + ], + axis=-1, + ) + + self.transformer_block.qkv_biases[idx].set_value(paddle.to_tensor(concated_qkv_biases)) + + unfused_state_dict["mlp.gate_proj.bias"] = state_dict[ + "qwen2.layers.{}.mlp.gate_proj.bias".format(idx) + ] + unfused_state_dict["mlp.up_proj.bias"] = 
state_dict["qwen2.layers.{}.mlp.up_proj.bias".format(idx)] + + concated_ffn1_bias = np.concatenate( + [unfused_state_dict["mlp.gate_proj.bias"], unfused_state_dict["mlp.up_proj.bias"]], axis=-1 + ) + + self.transformer_block.ffn1_biases[idx].set_value(paddle.to_tensor(concated_ffn1_bias)) + + if self.shift_smooth_all_linears: + self.transformer_block.linear_biases[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.self_attn.o_proj.bias".format(idx)]) + ) + self.transformer_block.ffn2_biases[idx].set_value( + paddle.to_tensor(state_dict["qwen2.layers.{}.mlp.down_proj.layer.bias".format(idx)]) + ) + + if "a8w8" in self.quant_type: + current_work_dir = os.path.dirname(__file__) + scale_map_file = ( + f"{current_work_dir}/ptq_scales_map.json" + if not self.shift_smooth_all_linears + else f"{current_work_dir}/ptq_scales_map_shift_smooth.json" + ) + with open(scale_map_file) as json_file: + scale_map_dict = json.load(json_file) + act_scale_map_dict = scale_map_dict["act_scale"] + weight_scale_map_dict = scale_map_dict["weight_scale"] + cache_scale_map_dict = scale_map_dict["cachekv_scale"] + # TODO(RichardWooSJTU): support multi-cards + + act_scale_json_path = os.path.join(self.quant_model_path, "act_scales.json") + weight_scale_json_path = os.path.join(self.quant_model_path, "weight_scales.json") + if self.config.tensor_parallel_degree > 1 and not self.config.single_card_ptq: + act_scale_json_path = os.path.join( + self.quant_model_path, f"act_scales_{self.config.tensor_parallel_rank}.json" + ) + weight_scale_json_path = os.path.join( + self.quant_model_path, f"weight_scales_{self.config.tensor_parallel_rank}.json" + ) + act_scale_loader = ActScalesLoader( + act_scale_json_path, act_scale_map_dict, num_of_layers=self.config.num_hidden_layers + ) + self.transformer_block.act_scales = act_scale_loader.scale + weight_scales_loader = WeightScalesLoader( + weight_scale_json_path, + weight_scale_map_dict, + num_of_layers=self.config.num_hidden_layers, + concat_qkv=True, + concat_ffn1=True, + ) + + if self.config.cachekv_int8_type == "static": + cache_scale_json_path = os.path.join(self.quant_model_path, "cachekv_scales.json") + if self.config.tensor_parallel_degree > 1 and not self.config.single_card_ptq: + cache_scale_json_path = os.path.join( + self.quant_model_path, f"cachekv_scales_{self.config.tensor_parallel_rank}.json" + ) + cache_scales_loader = CacheScaleLoader( + cache_scale_json_path, + cache_scale_map_dict, + num_of_layers=self.config.num_hidden_layers, + num_heads=self.num_attention_heads // self.config.tensor_parallel_degree, + num_key_value_heads=self.num_key_value_heads // self.config.tensor_parallel_degree, + ) + + for k, v in cache_scales_loader.scale.items(): + for i_layer, weight_scale in enumerate(v): + weight_scale = weight_scale.astype("float32") + if k == "cache_k_scale": + self.transformer_block.cache_k_scales[i_layer].set_value(weight_scale) + elif k == "cache_v_scale": + self.transformer_block.cache_v_scales[i_layer].set_value(weight_scale) + elif k == "cache_k_out_scale": + self.transformer_block.cache_k_out_scales[i_layer].set_value(weight_scale) + else: + self.transformer_block.cache_v_out_scales[i_layer].set_value(weight_scale) + + for k, v in weight_scales_loader.scale.items(): + if "qkv_" in k: + for i_layer, weight_scale in enumerate(v): + tmp = paddle.to_tensor( + weight_scale + / ( + 127.0 * 127.0 * act_scale_loader.scale["qkv_in_scale"][i_layer] + ) # [3 * num_head * dim_head] + ).reshape([-1]) + if self.config.tensor_parallel_degree > 1 and 
self.config.single_card_ptq: + tmp = ( + tmp.reshape([3, self.num_attention_heads, head_size]) + .split(self.config.tensor_parallel_degree, axis=1)[ + self.config.tensor_parallel_rank + ] + .reshape([-1]) + ) + self.transformer_block.qkv_out_scales[i_layer].set_value(tmp) + pass + elif "out_linear_" in k: + for i_layer, weight_scale in enumerate(v): + tmp = paddle.to_tensor( + weight_scale / (127.0 * 127.0 * act_scale_loader.scale["out_linear_in_scale"][i_layer]) + ) + self.transformer_block.linear_out_scales[i_layer].set_value(tmp) + elif "ffn1_weight_scale" in k: + for i_layer, weight_scale in enumerate(v): + tmp = paddle.to_tensor( + weight_scale / (127.0 * 127.0 * act_scale_loader.scale["ffn1_in_scale"][i_layer]) + ) + if self.config.tensor_parallel_degree > 1 and self.config.single_card_ptq: + tmp = paddle.split(tmp, self.config.tensor_parallel_degree * 2) + tmp = paddle.concat( + [ + tmp[self.config.tensor_parallel_rank], + tmp[self.config.tensor_parallel_rank + self.config.tensor_parallel_degree], + ], + axis=0, + ) + self.transformer_block.ffn1_out_scales[i_layer].set_value(tmp) + elif "ffn2" in k: + for i_layer, weight_scale in enumerate(v): + self.transformer_block.ffn2_out_scales[i_layer].set_value( + paddle.to_tensor( + weight_scale / (127.0 * 127.0 * act_scale_loader.scale["ffn2_in_scale"][i_layer]) + ) + ) + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(paddle.max(seq_lens_this_time) - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset + + ids_remove_padding, cum_offsets, padding_offset = get_padding_offset( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets + + # This function is a little different from prepare_input_ids_for_generation in paddlenlp/transformers/generation/utils.py, + # it is used to generate fake input_ids according to inputs_embeds length. + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + seq_len = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + seq_len = encoder_output.shape[1] + return paddle.full([batch_size, seq_len], bos_token_id, dtype="int64") + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + # kwargs["cache"] is used used to distinguish between encoder and decoder phase. + past_key_values = kwargs.get("cache", None) + is_decoder = past_key_values is not None + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + # generate a fake input_ids according to inputs_embeds + # this is usually occurred in img2txt multimodal model when first enter into this forward function. 
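# (Editor's note, illustrative only.) For hypothetical shapes inputs_embeds.shape == [2, 16, hidden_size],
# prepare_input_ids_for_generation below returns paddle.full([2, 16], bos_token_id, dtype="int64"), so the
# length/offset bookkeeping that expects token ids keeps working even though the real inputs are embeddings.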
+ if input_ids is None and inputs_embeds is not None: + input_ids = self.prepare_input_ids_for_generation(self.config.bos_token_id, inputs_embeds) + if inputs_embeds is not None: + batch, seq_len, hidden_dim = inputs_embeds.shape + inputs_embeds = inputs_embeds.reshape([batch * seq_len, hidden_dim]) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if past_key_values is None: + past_key_values = tuple([None] * self.config.num_hidden_layers) + + if not is_decoder: + ids_remove_padding, padding_offset, cum_offsets = self.remove_padding(input_ids, seq_len_encoder) + else: + ids_remove_padding = input_ids + padding_offset = None + cum_offsets = None + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(ids_remove_padding) + + hidden_states = inputs_embeds + + # decoder layers + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + seq_lens = seq_len_decoder if is_decoder else seq_len_encoder + + position_offset = 0 + if not is_decoder and pre_caches is not None: + position_offset = 128 + + from paddlenlp_ops import fused_get_rotary_embedding + + new_rope = fused_get_rotary_embedding( + input_ids, position_ids, self.head_dim_shape_tensor, position_offset, self.rope_theta, self.use_neox + ) + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids, + hidden_states, + cum_offsets=cum_offsets, + padding_offset=padding_offset, + attn_mask=paddle.cast(attention_mask, dtype=hidden_states.dtype), + caches=cache_kvs, + pre_caches=pre_caches, + pre_caches_length=position_offset, + seq_lens=seq_lens, + rotary_embs=new_rope, + rotary_emb_dims=1, + time_step=paddle.increment(paddle.shape(attention_mask)[-1], -1) if is_decoder else None, + ) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Qwen2ForCausalLMInferenceModel(GenerationInferenceModel, Qwen2PretrainedModel): + def __init__(self, config: Qwen2Config, **kwargs): + super(Qwen2ForCausalLMInferenceModel, self).__init__(config) + self.qwen2 = Qwen2InferenceModel(config) + if config.tie_word_embeddings: + self.lm_head = Qwen2LMHead(config, embedding_weights=self.qwen2.embed_tokens.weight, transpose_y=True) + self.tie_weights() + else: + self.lm_head = Qwen2LMHead(config) + self.criterion = Qwen2PretrainingCriterion(config) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: Qwen2Config, max_batch_size: int = None, max_length: int 
= None + ) -> list[list[int]]: + """get cache_kvs tensor for qwen model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. + + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + if max_length is None: + max_length = config.max_position_embeddings + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kvs.append( + [ + 2, + max_batch_size, + config.num_key_value_heads // max(config.tensor_parallel_degree, 1), + max_length, + config.hidden_size // config.num_attention_heads, + ] + ) + return cache_kvs + + def prepare_inputs_for_generation( + self, + input_ids, + cache_kvs, + seq_len_encoder, + seq_len_decoder, + tgt_ids, + tgt_pos, + tgt_generation_mask, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + attention_mask = kwargs.get("attention_mask", None) + cache = kwargs.get("cache", None) + pre_caches = kwargs.get("pre_caches", None) + inputs_embeds = kwargs.get("inputs_embeds", None) + if cache is not None: + input_ids = tgt_ids + position_ids = tgt_pos + attention_mask = (tgt_generation_mask - 1) * 1e4 + # make inputs_embeds be none in decoder phase. + # in forward function, it will be assigned according to input_ids. + inputs_embeds = None + else: + attention_mask = (attention_mask - 1) * 1e4 + model_inputs = { + "input_ids": input_ids, + "inputs_embeds": inputs_embeds, + "position_ids": position_ids, + "attention_mask": attention_mask, + "cache_kvs": cache_kvs, + "seq_len_encoder": seq_len_encoder, + "seq_len_decoder": seq_len_decoder, + "cache": cache, + "pre_caches": pre_caches, + } + return model_inputs + + def forward( + self, + input_ids, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + cache=None, + cache_kvs=None, + pre_caches=None, + seq_len_encoder=None, + seq_len_decoder=None, + past_key_values=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.qwen2( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + cache_kvs=cache_kvs, + pre_caches=pre_caches, + seq_len_encoder=seq_len_encoder, + seq_len_decoder=seq_len_decoder, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + lm_logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(lm_logits, labels) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + 
attentions=outputs.attentions, + ) + + @paddle.no_grad() + def set_state_dict(self, state_dict): + if "lm_head.weight" in state_dict: + lm_head_weight = paddle.to_tensor(state_dict["lm_head.weight"]).cast(self.lm_head.weight.dtype) + self.lm_head.weight.set_value(lm_head_weight) + self.qwen2.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) + + +@register_base_model +class Qwen2BlockInferenceModel(Qwen2InferenceModel): + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.max_seq_len = config.max_seq_len + self.block_size = config.block_size + + def set_transformer_block(self, transformer_config): + if self.use_weight_only: + self.transformer_block = FusedBlockMultiTransformerWeightOnly(transformer_config) + elif self.quant_type == "a8w8" or self.quant_type == "a8w8c8": + self.transformer_block = FusedBlockMultiTransformerA8W8(transformer_config) + else: + self.transformer_block = FusedBlockMultiTransformer(transformer_config) + + def remove_padding(self, input_ids, seq_lens_this_time): + cum_offsets_now = paddle.cumsum(self.max_seq_len - seq_lens_this_time) + token_num = paddle.sum(seq_lens_this_time) + from paddlenlp_ops import get_padding_offset_v2 + + ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset_v2( + input_ids, cum_offsets_now, token_num, seq_lens_this_time + ) + return ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + caches=None, + pre_caches=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + + seq_lens_this_time = kwargs.get("seq_lens_this_time", None) + rope_emb = kwargs.get("rope_emb", None) + ids_remove_padding, padding_offset, cum_offsets, cu_seqlens_q, cu_seqlens_k = self.remove_padding( + input_ids, seq_lens_this_time + ) + kwargs["cu_seqlens_q"] = cu_seqlens_q + kwargs["cu_seqlens_k"] = cu_seqlens_k + kwargs["padding_offsets"] = padding_offset + kwargs["max_input_length"] = self.max_seq_len + + inputs_embeds = self.embed_tokens(ids_remove_padding) + + with dy2st_nocheck_guard_context(): + hidden_states, _ = self.transformer_block( + input_ids=input_ids, + src=inputs_embeds, + cum_offsets=cum_offsets, + attn_mask=attention_mask, + caches=caches, + pre_caches=pre_caches, + rotary_embs=rope_emb, + **kwargs, + ) + hidden_states = self.norm(hidden_states) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=None, + hidden_states=None, + attentions=None, + ) + + +class Qwen2ForCausalLMBlockInferenceModel(GenerationBlockInferenceModel, Qwen2PretrainedModel): + """ + Dynamic Batching for Qwen2 Model with pretraining tasks on top. 
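    The KV cache is laid out in fixed-size blocks of config.block_size tokens and addressed through the
    block_tables tensor passed to forward, so requests with very different lengths can share cache memory
    instead of each reserving max_seq_len entries.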
+ """ + + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.qwen2 = Qwen2BlockInferenceModel(config) + if config.tie_word_embeddings: + self.lm_head = Qwen2LMHead(config, embedding_weights=self.qwen2.embed_tokens.weight, transpose_y=True) + self.tie_weights() + else: + self.lm_head = Qwen2LMHead(config) + + @classmethod + def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): + + logger.info("Qwen2 inference model _get_tensor_parallel_mappings") + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + return infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs) + + @classmethod + def get_cache_kvs_shape( + cls, config: Qwen2Config, max_batch_size: int = None, max_length: int = None + ) -> list[list[int]]: + """get cache_kvs tensor for Qwen2 model + + Args: + max_batch_size (int): the max batch size + max_length (int | None, optional): the max_length of cache_kvs. Defaults to None. 
+ + Returns: + list[paddle.Tensor]: the list tensor shape for cache + """ + max_block_per_seq = (config.max_seq_len + config.block_size - 1) // config.block_size + if max_batch_size == -1: + max_block_nums = None + else: + max_block_nums = max_batch_size * max_block_per_seq + + cache_kvs = [] + for _ in range(config.num_hidden_layers): + cache_kv_shape = [ + max_block_nums, + config.num_key_value_heads // max(config.tensor_parallel_degree, 1), + config.block_size, + config.hidden_size // config.num_attention_heads, + ] + cache_kvs.append(cache_kv_shape) + cache_kvs.append(cache_kv_shape) + return cache_kvs + + def prepare_inputs_for_generation(self, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + input_ids = kwargs["input_ids"] + src_mask = kwargs.get("src_mask", None) + block_tables = kwargs.get("block_tables", None) + + pre_caches = kwargs.get("pre_caches", None) + caches = kwargs.get("caches", None) + + rope_emb = kwargs["rope_emb"] + seq_lens_this_time = kwargs["seq_lens_this_time"] + seq_lens_encoder = kwargs["seq_lens_encoder"] + seq_lens_decoder = kwargs["seq_lens_decoder"] + k_quant_scales = kwargs.get("k_quant_scales", None) + v_quant_scales = kwargs.get("v_quant_scales", None) + k_dequant_scales = kwargs.get("k_dequant_scales", None) + v_dequant_scales = kwargs.get("v_dequant_scales", None) + model_inputs = { + "input_ids": input_ids, + "src_mask": src_mask, + "rope_emb": rope_emb, + "pre_caches": pre_caches, + "caches": caches, + "seq_lens_this_time": seq_lens_this_time, + "seq_lens_encoder": seq_lens_encoder, + "seq_lens_decoder": seq_lens_decoder, + "block_tables": block_tables, + "k_quant_scales": k_quant_scales, + "v_quant_scales": v_quant_scales, + "k_dequant_scales": k_dequant_scales, + "v_dequant_scales": v_dequant_scales, + } + return model_inputs + + def forward( + self, + input_ids, + src_mask=None, + pre_caches=None, + caches=None, + seq_lens_this_time=None, + seq_lens_encoder=None, + seq_lens_decoder=None, + rope_emb=None, + block_tables=None, + k_quant_scales=None, + v_quant_scales=None, + k_dequant_scales=None, + v_dequant_scales=None, + ): + outputs = self.qwen2( + input_ids, + src_mask=src_mask, + caches=caches, + rope_emb=rope_emb, + block_tables=block_tables, + pre_caches=pre_caches, + seq_lens_this_time=seq_lens_this_time, + seq_lens_encoder=seq_lens_encoder, + seq_lens_decoder=seq_lens_decoder, + k_quant_scales=k_quant_scales, + v_quant_scales=v_quant_scales, + k_dequant_scales=k_dequant_scales, + v_dequant_scales=v_dequant_scales, + ) + + hidden_states = outputs[0] + logits = self.lm_head( + hidden_states, + tensor_parallel_output=False, + ) + + return logits + + @paddle.no_grad() + def set_state_dict(self, state_dict): + if "lm_head.weight" in state_dict: + self.lm_head.weight.set_value( + paddle.to_tensor(state_dict["lm_head.weight"]).cast(self.lm_head.weight.dtype) + ) + self.qwen2.set_state_dict({k: state_dict[k] for k in state_dict.keys()}) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map.json b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map.json new file mode 100644 index 000000000..a069eddb3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map.json @@ -0,0 +1,21 @@ +{ + "act_scale":{ + "qkv_in_scale": "qwen2.layers.#.self_attn.q_proj.activation_quanter", + "out_linear_in_scale": 
"qwen2.layers.#.self_attn.o_proj.activation_quanter", + "ffn1_in_scale": "qwen2.layers.#.mlp.gate_proj.activation_quanter", + "ffn2_in_scale": "qwen2.layers.#.mlp.down_proj.activation_quanter" + }, + "weight_scale":{ + "q_weight_scale":"qwen2.layers.#.self_attn.q_proj.weight_quanter", + "k_weight_scale":"qwen2.layers.#.self_attn.k_proj.weight_quanter", + "v_weight_scale":"qwen2.layers.#.self_attn.v_proj.weight_quanter", + "out_linear_weight_scale":"qwen2.layers.#.self_attn.o_proj.weight_quanter", + "ffn1_1_weight_scale":"qwen2.layers.#.mlp.gate_proj.weight_quanter", + "ffn1_2_weight_scale":"qwen2.layers.#.mlp.up_proj.weight_quanter", + "ffn2_weight_scale":"qwen2.layers.#.mlp.down_proj.weight_quanter" + }, + "cachekv_scale":{ + "cache_k_scale": "qwen2.layers.#.self_attn.cachek_matmul.activation_quanter", + "cache_v_scale": "qwen2.layers.#.self_attn.cachev_matmul.activation_quanter" + } + } \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map_shift_smooth.json b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map_shift_smooth.json new file mode 100644 index 000000000..af6a04229 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/qwen2/ptq_scales_map_shift_smooth.json @@ -0,0 +1,21 @@ +{ + "act_scale":{ + "qkv_in_scale": "qwen2.layers.#.self_attn.q_proj.activation_quanter", + "out_linear_in_scale": "qwen2.layers.#.self_attn.o_proj.layer.activation_quanter", + "ffn1_in_scale": "qwen2.layers.#.mlp.gate_proj.activation_quanter", + "ffn2_in_scale": "qwen2.layers.#.mlp.down_proj.layer.activation_quanter" + }, + "weight_scale":{ + "q_weight_scale":"qwen2.layers.#.self_attn.q_proj.weight_quanter", + "k_weight_scale":"qwen2.layers.#.self_attn.k_proj.weight_quanter", + "v_weight_scale":"qwen2.layers.#.self_attn.v_proj.weight_quanter", + "out_linear_weight_scale":"qwen2.layers.#.self_attn.o_proj.layer.weight_quanter", + "ffn1_1_weight_scale":"qwen2.layers.#.mlp.gate_proj.weight_quanter", + "ffn1_2_weight_scale":"qwen2.layers.#.mlp.up_proj.weight_quanter", + "ffn2_weight_scale":"qwen2.layers.#.mlp.down_proj.layer.weight_quanter" + }, + "cachekv_scale":{ + "cache_k_scale": "qwen2.layers.#.self_attn.cachek_matmul.activation_quanter", + "cache_v_scale": "qwen2.layers.#.self_attn.cachev_matmul.activation_quanter" + } +} \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/utils.py new file mode 100644 index 000000000..34a85684a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/experimental/transformers/utils.py @@ -0,0 +1,159 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import os + +import numpy as np +import paddle + +from paddlenlp.transformers.model_utils import ( + dtype_guard, + load_tp_checkpoint, + no_init_weights, +) +from paddlenlp.transformers.utils import ( + ContextManagers, + is_paddle_support_lazy_init, + is_safetensors_available, +) + + +def infererence_model_from_pretrained(cls, pretrained_model_name_or_path, args, kwargs, return_numpy=True): + r""" + Instantiate a pretrained model configuration from a pre-trained model name or path. + """ + config = kwargs.pop("config", None) + cache_dir = kwargs.pop("cache_dir", None) + dtype = kwargs.pop("dtype", None) + if dtype is None: + dtype = config.dtype + subfolder = kwargs.pop("subfolder", None) + if subfolder is None: + subfolder = "" + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) + + init_contexts = [] + if low_cpu_mem_usage or config.quantization_config.is_weight_quantize(): + # Instantiate model. + init_contexts.append(no_init_weights(_enable=True)) + if is_paddle_support_lazy_init(): + init_contexts.append(paddle.LazyGuard()) + if dtype: + init_contexts.append(dtype_guard(dtype)) + + # init the model + with ContextManagers(init_contexts): + model = cls(config) + + resolved_archive_file, _, _, _ = cls._resolve_model_file_path( + pretrained_model_name_or_path, + cache_dir=cache_dir, + subfolder=subfolder, + from_hf_hub=False, + from_aistudio=False, + config=config, + convert_from_torch=False, + use_safetensors=use_safetensors, + variant=variant, + ) + + model_path = os.path.dirname(resolved_archive_file) + state_dict = load_tp_checkpoint(model_path, cls, config, return_numpy=return_numpy) + model.set_state_dict(state_dict) + + return model + + +class EmptyActScale: + """ + For fake parameter + """ + + def __init__( + self, + key_map_dict=None, + num_of_layers=None, + ): + self.key_map = key_map_dict + self.scale = {} + for scale_type, key_template in self.key_map.items(): + self.scale[scale_type] = np.full([num_of_layers], fill_value=0.1) + + +class EmptyWeightScale: + """ + For fake parameter + """ + + def __init__( + self, + key_map_dict, + num_of_layers, + num_head, + dim_head, + ffn_hidden_size, + num_key_value_heads=-1, + mp_size=1, + ): + self.key_map = key_map_dict + self.scale = {} + + num_key_value_heads = num_key_value_heads + qkv_out_size = ( + 3 * num_head * dim_head if num_key_value_heads <= 0 else (num_head + 2 * num_key_value_heads) * dim_head + ) + + for scale_type, key_template in self.key_map.items(): + if "qkv" in scale_type: + n = qkv_out_size // mp_size + elif "ffn1" in scale_type: + n = ffn_hidden_size * 2 // mp_size + else: + n = num_head * dim_head + self.scale[scale_type] = np.full([num_of_layers, n], fill_value=0.1, dtype="float32") + + +class EmptyCacheScale: + """ + For fake parameter + """ + + def __init__( + self, + key_map_dict=None, + num_of_layers=None, + num_heads=None, + dim_heads=None, + is_channel_wise=False, + mp_size=1, + num_key_value_heads=-1, + ): + self.key_map = key_map_dict + self.scale = {} + + num_heads = num_heads // mp_size + num_key_value_heads = num_key_value_heads // mp_size + kv_num_head = num_heads if num_key_value_heads <= 0 else num_key_value_heads + for scale_type, key_template in self.key_map.items(): + if "cache_k" in scale_type: + scale_type_out = "cache_k_out_scale" + else: + scale_type_out = "cache_v_out_scale" + + col_dim = kv_num_head * 
dim_heads if is_channel_wise else kv_num_head + self.scale[scale_type] = np.full([num_of_layers, col_dim], fill_value=1.0) + self.scale[scale_type_out] = np.full([num_of_layers, col_dim], fill_value=1.0) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/__init__.py new file mode 100644 index 000000000..021a24732 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/__init__.py @@ -0,0 +1,34 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .configuration_utils import GenerationConfig +from .logits_process import ( + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + LogitsProcessor, + LogitsProcessorList, + MinLengthLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + TopKProcess, + TopPProcess, +) +from .stopping_criteria import ( + MaxLengthCriteria, + MaxTimeCriteria, + StoppingCriteria, + StoppingCriteriaList, + validate_stopping_criteria, +) +from .streamers import BaseStreamer, TextIteratorStreamer, TextStreamer +from .utils import BeamSearchScorer, GenerationMixin, get_unfinished_flag diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/configuration_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/configuration_utils.py new file mode 100644 index 000000000..4bd7e51aa --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/configuration_utils.py @@ -0,0 +1,597 @@ +# copyright (c) 2023 paddlepaddle authors. all rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
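A short arithmetic sketch of the fused-QKV width computed by `EmptyWeightScale` in `experimental/transformers/utils.py` above; the numbers are hypothetical and only show how grouped-query attention (a positive `num_key_value_heads`) and the tensor-parallel degree `mp_size` determine the per-rank fake-scale width:

```python
num_head, dim_head, num_key_value_heads, mp_size = 32, 128, 8, 2

qkv_out_size = (
    3 * num_head * dim_head
    if num_key_value_heads <= 0
    else (num_head + 2 * num_key_value_heads) * dim_head
)
assert qkv_out_size == (32 + 2 * 8) * 128 == 6144
assert qkv_out_size // mp_size == 3072  # width of each "qkv" fake-scale row per rank
```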
+""" Generation configuration class and utilities.""" + +import copy +import json +import os +import warnings +from typing import Any, Dict, Optional, Union + +from huggingface_hub import hf_hub_download +from paddle.common_ops_import import convert_dtype + +from paddlenlp import __version__ +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.utils.download import resolve_file_path +from paddlenlp.utils.log import logger + +from ..utils import GENERATION_CONFIG_NAME +from ..utils.downloader import hf_file_exists + +DEFAULT_MAX_NEW_TOKENS = 20 + + +def resolve_hf_generation_config_path(repo_id: str, cache_dir: str, subfolder=None) -> str: + """resolve config file from hf hub + + Args: + repo_id (str): the repo name from huggingface hub + cache_dir (str): the cachedir + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + Returns: + str: the downloaded config file + """ + if hf_file_exists(repo_id=repo_id, filename=GENERATION_CONFIG_NAME, subfolder=subfolder): + file_name = GENERATION_CONFIG_NAME + else: + raise ValueError(f"can not find the paddle/pytorch config file from: https://huggingface.co/{repo_id}") + + return hf_hub_download( + repo_id=repo_id, + filename=file_name, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + + +class GenerationConfig: + r""" + Arg: + > Parameters that control the length of the output + max_length (int, optional): The maximum length of the sequence to + be generated. Default to 20. + min_length (int, optional): The minimum length of the sequence to + be generated. Default to 0. + decode_strategy (str, optional): The decoding strategy in generation. + Currently, there are three decoding strategies supported: + "greedy_search", "sampling" and "beam_search". Default to + "greedy_search". + temperature (float, optional): The value used to module the next + token probabilities in the "sampling" strategy. Default to 1.0, + which means no effect. + top_k (int, optional): The number of highest probability tokens to + keep for top-k-filtering in the "sampling" strategy. Default to + 0, which means no effect. + top_p (float, optional): The cumulative probability for + top-p-filtering in the "sampling" strategy. The value should + satisfy :math:`0 <= top\_p < 1`. Default to 1.0, which means no + effect. + repetition_penalty (float, optional): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. Defaults to 1.0. + num_beams (int, optional): The number of beams in the "beam_search" + strategy. Default to 1. + num_beam_groups (int, optional): + Number of groups to divide `num_beams` into in order to use DIVERSE + BEAM SEARCH. See `this paper `__ + for more details. Default to 1. + length_penalty (float, optional): The exponential penalty to the + sequence length in the "beam_search" strategy. The larger this + param is, the more that the model would generate shorter + sequences. Default to 0.0, which means no penalty. + early_stopping (bool, optional): Whether to stop searching in the + "beam_search" strategy when at least `num_beams` sentences are + finished per batch or not. Default to False. + bos_token_id (int, optional): The id of the `bos_token`. Default to + None. + eos_token_id (int, optional): The id of the `eos_token`. Default to + None. + pad_token_id (int, optional): The id of the `pad_token`. Default to + None. 
+ decoder_start_token_id (int, optional): The start token id for + encoder-decoder models. Default to None. + forced_bos_token_id (int, optional): The id of the token to force as + the first generated token. Usually use for multilingual models. + Default to None. + forced_eos_token_id (int, optional): The id of the token to force as + the last generated token. Default to None. + num_return_sequences (int, optional): The number of returned + sequences for each sequence in the batch. Default to 1. + diversity_rate (float, optional): If num_beam_groups is 1, this is the + diversity_rate for Diverse Siblings Search. See + `this paper https://arxiv.org/abs/1611.08562`__ for more details. + If not, this is the diversity_rate for DIVERSE BEAM SEARCH. + use_cache: (bool, optional): Whether to use the model cache to + speed up decoding. Default to True. + use_fast: (bool, optional): Whether to use fast entry of model + for FastGeneration. Default to False. + use_fp16_decoding: (bool, optional): Whether to use fp16 for decoding. + Only works when fast entry is avalible. Default to False. + trunc_input: (bool, optional): Whether to truncate the inputs from + output sequences . Default to True. + model_kwargs (dict): It can be used to specify additional kwargs + passed to the model. + """ + + def _get_generation_mode(self): + if hasattr(self, "num_beams") and self.num_beams == 1: + if hasattr(self, "do_sample") and self.do_sample is True: + generation_mode = "sampling" + else: + generation_mode = "greedy_search" + else: + generation_mode = "beam_search" + + return generation_mode + + def __init__(self, **kwargs): + # Parameters that control the length of the output + self.max_new_tokens = kwargs.get("max_new_tokens", DEFAULT_MAX_NEW_TOKENS) + + if "min_new_token" in kwargs: + logger.warning(" field is deprecated. 
Please use instead.") + kwargs["min_new_tokens"] = kwargs.pop("min_new_token") + + self.min_new_tokens = kwargs.pop("min_new_tokens", 0) + self.max_length = kwargs.pop("max_length", 0) + self.min_length = kwargs.pop("min_length", 0) + self.early_stopping = kwargs.pop("early_stopping", False) + self.trunc_input = kwargs.pop("trunc_input", True) + + # Parameters for manipulation of the model output logits + self.diversity_rate = kwargs.pop("diversity_rate", 0.0) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", None) + self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None) + self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None) + self.num_beams = kwargs.pop("num_beams", 1) + self.num_beam_groups = kwargs.pop("num_beam_groups", 1) + self.use_cache = kwargs.pop("use_cache", True) + + # Parameters that define the output variables of `generate` + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) + + # Special tokens that can be used at generation time + self.pad_token_id = kwargs.pop("pad_token_id", None) + self.bos_token_id = kwargs.pop("bos_token_id", None) + self.eos_token_id = kwargs.pop("eos_token_id", None) + + # Generation parameters exclusive to encoder-decoder models + self.use_fast = kwargs.pop("use_fast", False) + self.use_fp16_decoding = kwargs.pop("use_fp16_decoding", False) + self.fast_ptq_sampling = kwargs.pop("fast_ptq_sampling", False) + self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) + self._from_model_config = kwargs.pop("_from_model_config", False) + self.paddlenlp_version = kwargs.pop("paddlenlp_version", __version__) + + # Additional attributes without default values + if not self._from_model_config: + # we don't want to copy values from the model config if we're initializing a `GenerationConfig` from a + # model's default configuration file + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + # Parameters that control the generation strategy used + if "decode_strategy" in kwargs: + self.decode_strategy = kwargs.pop("decode_strategy") + else: + self.decode_strategy = self._get_generation_mode() + + # Validate the values of the attributes + self.validate(is_init=True) + + def __eq__(self, other): + if not isinstance(other, GenerationConfig): + return False + + self_dict = self.__dict__.copy() + other_dict = other.__dict__.copy() + # ignore metadata + for metadata_field in ["_from_model_config", "paddlenlp_version"]: + self_dict.pop(metadata_field, None) + other_dict.pop(metadata_field, None) + return self_dict == other_dict + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def validate(self, is_init=False): + """ + Validates the values of the attributes of the [`GenerationConfig`] instance. Raises exceptions in the presence + of parameterization that can be detected as incorrect from the configuration instance alone. + + Note that some parameters are best validated at generate runtime, as they may depend on other inputs and/or the + model, such as parameters related to the generation length. 
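        For instance, an invalid `early_stopping` value is rejected as soon as the configuration
        is built (a minimal doctest-style sketch):

        ```python
        >>> from paddlenlp.transformers import GenerationConfig
        >>> GenerationConfig(early_stopping="always")
        Traceback (most recent call last):
            ...
        ValueError: `early_stopping` must be a boolean or 'never', but is always.
        ```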
+ """ + + # Validation of individual attributes + if self.early_stopping not in {True, False, "never"}: + raise ValueError(f"`early_stopping` must be a boolean or 'never', but is {self.early_stopping}.") + + # Validation of attribute relations: + fix_location = "" + if is_init: + fix_location = ( + " This was detected when initializing the generation config instance, which means the corresponding " + "file may hold incorrect parameterization and should be fixed." + ) + + # 1. detect sampling-only parameterization when not in sampling mode + if self.decode_strategy == "greedy_search": + greedy_wrong_parameter_msg = ( + "using greedy search strategy. However, `{flag_name}` is set to `{flag_value}` -- this flag is only " + 'used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `{flag_name}`.' + + fix_location + ) + if self.temperature != 1.0: + warnings.warn( + greedy_wrong_parameter_msg.format(flag_name="temperature", flag_value=self.temperature), + UserWarning, + ) + if self.top_p != 1.0: + warnings.warn( + greedy_wrong_parameter_msg.format(flag_name="top_p", flag_value=self.top_p), + UserWarning, + ) + + # 2. detect beam-only parameterization when not in beam mode + if self.decode_strategy != "beam_search": + single_beam_wrong_parameter_msg = ( + "`num_beams` is set to 1. However, `{flag_name}` is set to `{flag_value}` -- this flag is only used " + "in beam-based generation modes. You should set `num_beams>1` or unset `{flag_name}`." + fix_location + ) + if self.early_stopping is not False: + warnings.warn( + single_beam_wrong_parameter_msg.format(flag_name="early_stopping", flag_value=self.early_stopping), + UserWarning, + ) + if self.num_beam_groups != 1: + warnings.warn( + single_beam_wrong_parameter_msg.format( + flag_name="num_beam_groups", flag_value=self.num_beam_groups + ), + UserWarning, + ) + if self.length_penalty != 1.0: + warnings.warn( + single_beam_wrong_parameter_msg.format(flag_name="length_penalty", flag_value=self.length_penalty), + UserWarning, + ) + + # 4. check `num_return_sequences` + if self.num_return_sequences != 1: + if self.decode_strategy == "greedy_search": + raise ValueError( + "Greedy methods without beam search do not support `num_return_sequences` different than 1 " + f"(got {self.num_return_sequences})." + ) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + config_file_name: Optional[Union[str, os.PathLike]] = None, + **kwargs, + ): + r""" + Save a generation configuration object to the directory `save_directory`, so that it can be re-loaded using the + [`~GenerationConfig.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + config_file_name (`str` or `os.PathLike`, *optional*, defaults to `"generation_config.json"`): + Name of the generation configuration JSON file to be saved in `save_directory`. + """ + + # At save time, validate the instance -- if any warning/exception is thrown, we refuse to save the instance + try: + with warnings.catch_warnings(record=True) as caught_warnings: + self.validate() + for w in caught_warnings: + raise ValueError(w.message) + except ValueError as exc: + warnings.warn( + "The generation config instance is invalid -- `.validate()` throws warnings and/or exceptions. " + "Fix these issues to save the configuration. This warning will be raised to an exception." 
+ "\n\nThrown during validation:\n" + str(exc), + UserWarning, + ) + return + + config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME + + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + output_config_file = os.path.join(save_directory, config_file_name) + + self.to_json_file(output_config_file, use_diff=True) + logger.info(f"Configuration saved in {output_config_file}") + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + from_hf_hub: bool = False, + from_aistudio: bool = False, + config_file_name: Optional[Union[str, os.PathLike]] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + **kwargs, + ) -> "GenerationConfig": + r""" + Instantiate a [`GenerationConfig`] from a generation configuration file. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + paddlenlp bos server. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PretrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved configuration JSON *file*, e.g., `./my_model_directory/configuration.json`. + from_hf_hub (bool, *optional*): + load config from huggingface hub: https://huggingface.co/models + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if + they exist. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final configuration object. + + If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the + part of `kwargs` which has not been used to update `config` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the `return_unused_kwargs` keyword parameter. + + Returns: + [`GenerationConfig`]: The configuration object instantiated from this pretrained model. + + Examples: + + ```python + >>> from paddlenlp.transformers import GenerationConfig + + >>> generation_config = GenerationConfig.from_pretrained("gpt2") + + >>> # E.g. 
config was saved using *save_pretrained('./test/saved_model/')* + >>> generation_config.save_pretrained("./test/saved_model/") + >>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/") + + >>> # You can also specify configuration names to your generation configuration file + >>> generation_config.save_pretrained("./test/saved_model/", config_file_name="my_configuration.json") + >>> generation_config = GenerationConfig.from_pretrained("./test/saved_model/", "my_configuration.json") + + >>> # If you'd like to try a minor variation to an existing configuration, you can also pass generation + >>> # arguments to `.from_pretrained()`. Be mindful that typos and unused arguments will be ignored + >>> generation_config, unused_kwargs = GenerationConfig.from_pretrained( + ... "gpt2", top_k=1, foo=False, do_sample=True, return_unused_kwargs=True + ... ) + >>> generation_config.top_k + 1 + + >>> unused_kwargs + {'foo': False} + ```""" + config_file_name = config_file_name if config_file_name is not None else GENERATION_CONFIG_NAME + + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + + resolved_config_file = resolve_file_path( + pretrained_model_name_or_path, + [config_file_name], + subfolder, + cache_dir=cache_dir, + force_download=force_download, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + assert ( + resolved_config_file is not None + ), f"please make sure {config_file_name} under {pretrained_model_name_or_path}" + try: + logger.info(f"Loading configuration file {resolved_config_file}") + # Load config dict + config_dict = cls._dict_from_json_file(resolved_config_file) + except (json.JSONDecodeError, UnicodeDecodeError): + raise EnvironmentError(f"Config file<'{resolved_config_file}'> is not a valid JSON file.") + + return cls.from_dict(config_dict, **kwargs) + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def dict_paddle_dtype_to_str(self, d: Dict[str, Any]) -> None: + """ + Checks whether the passed dictionary and its nested dicts have a *paddle_dtype* key and if it's not None, + converts paddle.dtype to a string of just the type. For example, `paddle.float32` get converted into *"float32"* + string, which can then be stored in the json format. + """ + if d.get("dtype", None) is not None and not isinstance(d["dtype"], str): + d["dtype"] = convert_dtype(d["dtype"]) + for value in d.values(): + if isinstance(value, dict): + self.dict_paddle_dtype_to_str(value) + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "GenerationConfig": + """ + Instantiates a [`GenerationConfig`] from a Python dictionary of parameters. + + Args: + config_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + [`GenerationConfig`]: The configuration object instantiated from those parameters. 
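        Example (a minimal doctest-style sketch):

        ```python
        >>> from paddlenlp.transformers import GenerationConfig
        >>> config = GenerationConfig.from_dict({"max_new_tokens": 64, "top_k": 5})
        >>> config.top_k
        5
        >>> config.decode_strategy
        'greedy_search'
        ```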
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + config = cls(**{**config_dict, **kwargs}) + unused_kwargs = config.update(**kwargs) + + # logger.info(f"Generate config {config}") + if return_unused_kwargs: + return config, unused_kwargs + else: + return config + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = GenerationConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if key not in default_config_dict or key == "transformers_version" or value != default_config_dict[key]: + serializable_config_dict[key] = value + + self.dict_paddle_dtype_to_str(serializable_config_dict) + return serializable_config_dict + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + + # PaddleNLP version when serializing this file + output["paddlenlp_version"] = __version__ + + self.dict_paddle_dtype_to_str(output) + return output + + def to_json_string(self, use_diff: bool = True) -> str: + """ + Serializes this instance to a JSON string. + + Args: + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `GenerationConfig()` + is serialized to JSON string. + + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `GenerationConfig()` + is serialized to JSON file. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + @classmethod + def from_model_config(cls, model_config: PretrainedConfig) -> "GenerationConfig": + """ + Instantiates a [`GenerationConfig`] from a [`PretrainedConfig`]. This function is useful to convert legacy + [`PretrainedConfig`] objects, which may contain generation parameters, into a stand-alone [`GenerationConfig`]. + + Args: + model_config (`PretrainedConfig`): + The model config that will be used to instantiate the generation config. + + Returns: + [`GenerationConfig`]: The configuration object instantiated from those parameters. + """ + config_dict = model_config.to_dict() + config_dict.pop("_from_model_config", None) + config = cls.from_dict(config_dict, return_unused_kwargs=False, _from_model_config=True) + + # Special case: some models have generation attributes set in the decoder. 
Use them if still unset in the + # generation config. + for decoder_name in ("decoder", "generator", "text_config"): + if decoder_name in config_dict: + default_generation_config = GenerationConfig() + decoder_config = config_dict[decoder_name] + for attr in config.to_dict().keys(): + if attr in decoder_config and getattr(config, attr) == getattr(default_generation_config, attr): + setattr(config, attr, decoder_config[attr]) + + return config + + def update(self, **kwargs): + """ + Updates attributes of this class instance with attributes from `kwargs` if they match existing atributtes, + returning all the unused kwargs. + + Args: + kwargs (`Dict[str, Any]`): + Dictionary of attributes to tentatively update this class. + + Returns: + `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. + """ + to_remove = [] + for key, value in kwargs.items(): + if hasattr(self, key): + setattr(self, key, value) + to_remove.append(key) + + # remove all the attributes that were updated, without modifying the input dict + unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove} + return unused_kwargs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/logits_process.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/logits_process.py new file mode 100644 index 000000000..914b15207 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/logits_process.py @@ -0,0 +1,646 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import inspect +from abc import ABC +from collections import OrderedDict +from typing import Callable, Dict, List, Tuple, Union + +import numpy as np +import paddle +from paddle.nn.layer.layers import in_declarative_mode + + +class LogitsProcessor(ABC): + """ + Abstract base class for all logit processors that can be applied during + generation. + """ + + def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor): + raise NotImplementedError( + f"{self.__class__} is an abstract class. " "Only classes inheriting this class can be called." 
+ ) + + +class LogitsProcessorList: + """use ordered dict to store processors""" + + def __init__(self, processors: List[LogitsProcessor] = None) -> None: + self._processors = OrderedDict() + processors = processors or [] + for processor in processors: + self.append(processor) + + def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor, **kwargs): + for processor in self._processors.values(): + processor_args = inspect.signature(processor.__call__).parameters + if len(processor_args) > 2: + assert all( + arg in kwargs for arg in list(processor_args.keys())[2:] + ), f"The parameters don't match for {processor.__class__}" + logits = processor(input_ids, logits, **kwargs) + else: + logits = processor(input_ids, logits) + return logits + + def append(self, processor: LogitsProcessor): + self._processors[len(self._processors)] = processor + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + Enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (int): The minimum length of generation sequence. + eos_token_id (int): The id of the `end-of-sequence` token. + """ + + def __init__(self, min_length: int, eos_token_id: Union[int, List[int]]): + if min_length < 0 and not in_declarative_mode(): + raise ValueError("`min_length` should be a positive integer, but get {}".format(min_length)) + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError("`eos_token_id` should be a positive integer, but get {}".format(eos_token_id)) + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor): + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + logits[:, self.eos_token_id] = paddle.finfo(logits.dtype).min + return logits + + +class RepetitionPenaltyLogitsProcessor(LogitsProcessor): + r""" + Enforcing an exponential penalty on repeated sequences. + + Args: + repetition_penalty (float): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. + """ + + def __init__(self, penalty: float): + if not (penalty > 0) and not in_declarative_mode(): + raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") + + self.penalty = penalty + + def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor): + score = paddle.index_sample(logits, input_ids) + score = paddle.where(score < 0, score * self.penalty, score / self.penalty) + input_ids = input_ids + paddle.arange(logits.shape[0], dtype="int64").unsqueeze(-1) * logits.shape[-1] + outputs = paddle.scatter(logits.flatten(), input_ids.flatten(), score.flatten()).reshape(logits.shape) + return outputs + + +def _get_ngrams(ngram_size: int, prev_input_ids: paddle.Tensor, num_hypos: int): + """ + Assume ngram_size=2 and prev_input_ids=tensor([[40, 2883, 2712, 4346]]). The output of generated ngrams look like + this {(40,): [2883], (2883,): [2712], (2712,): [4346]}. + + Args: + ngram_size (`int`): + The number sequential tokens taken as a group which may only occur once before being banned. + prev_input_ids (`paddle.Tensor`): + Generated token ids for the current hypothesis. + num_hypos (`int`): + The number of hypotheses for which n-grams need to be generated. + + Returns: + generated_ngrams (`dict`): + Dictionary of generated ngrams. 
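    A doctest-style rendering of the example described above (the token ids are arbitrary):

    ```python
    >>> import paddle
    >>> _get_ngrams(2, paddle.to_tensor([[40, 2883, 2712, 4346]]), num_hypos=1)
    [{(40,): [2883], (2883,): [2712], (2712,): [4346]}]
    ```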
+ """ + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + return generated_ngrams + + +def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len): + """ + Determines the banned tokens for the current hypothesis based on previously generated n-grams. + + Args: + banned_ngrams (`dict`): + A dictionary containing previously generated n-grams for each hypothesis. + prev_input_ids (`paddle.Tensor`): + Generated token ids for the current hypothesis. + ngram_size (`int`): + The number sequential tokens taken as a group which may only occur once before being banned. + cur_len (`int`): + The current length of the token sequences for which the n-grams are being checked. + + Returns: + List of tokens that are banned. + """ + start_idx = cur_len + 1 - ngram_size + ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist()) + return banned_ngrams.get(ngram_idx, []) + + +def _calc_banned_ngram_tokens(ngram_size: int, prev_input_ids: paddle.Tensor, num_hypos: int, cur_len: int): + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + + generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos) + + banned_tokens = [ + _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len) + for hypo_idx in range(num_hypos) + ] + return banned_tokens + + +class NoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + [`LogitsProcessor`] that enforces no repetition of n-grams. See + [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345). + Args: + ngram_size (`int`): + All ngrams of size `ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") + self.ngram_size = ngram_size + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + if len(banned_tokens) == 0: + continue + scores[i, banned_tokens] = paddle.finfo(scores.dtype).min + + return scores + + +class HammingDiversityLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces diverse beam search. Note that this logits + processor is only effective for `group_beam_search`. See + `this paper `__ for more details. + + Args: + diversity_rate (float): This value is subtracted from a beam's score if + it generates a token same as any beam from other group at a particular + time. + num_beams (int): Number of beams used for group beam search. + num_beam_groups (int): Number of groups to divide `num_beams` into in order + to ensure diversity among different groups of beams. 
+ """ + + def __init__(self, diversity_rate: float, num_beams: int, num_beam_groups: int): + if not isinstance(diversity_rate, float) or (not diversity_rate > 0.0): + raise ValueError("`diversity_rate` should be a float strictly larger than 0.") + self._diversity_rate = diversity_rate + if not isinstance(num_beams, int) or num_beams < 2: + raise ValueError("`num_beams` should be an integer strictly larger than 1.") + self._num_beams = num_beams + if not isinstance(num_beam_groups, int) or num_beam_groups < 2: + raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") + self._num_sub_beams = num_beams // num_beam_groups + + def __call__( + self, input_ids: paddle.Tensor, scores: paddle.Tensor, current_tokens: paddle.Tensor, beam_group_idx: int + ): + batch_size = current_tokens.shape[0] // self._num_beams + group_start_idx = beam_group_idx * self._num_sub_beams + group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) + group_size = group_end_idx - group_start_idx + vocab_size = scores.shape[-1] + + if group_start_idx == 0: + return scores + + for batch_idx in range(batch_size): + previous_group_tokens = current_tokens[ + batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx + ] + token_frequency = paddle.bincount(previous_group_tokens, minlength=vocab_size) + scores[batch_idx * group_size : (batch_idx + 1) * group_size] -= self._diversity_rate * token_frequency + + return scores + + +class ForcedBOSTokenLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces the first generated token to be the selected `forced_bos_token`. + + Args: + forced_bos_token_id (:obj:`int`): + The id of the token to be generated as the first token. + """ + + def __init__(self, forced_bos_token_id: int): + self.forced_bos_token_id = forced_bos_token_id + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): + cur_len = input_ids.shape[-1] + if cur_len == 1: + scores[:] = paddle.finfo(scores.dtype).min + scores[:, self.forced_bos_token_id] = 0 + return scores + + +class ForcedEOSTokenLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces the last generated token to be the selected `forced_eos_token`. + + Args: + max_length (int): The maximum length of the sequence to be generated. + forced_eos_token_id (int): The id of the token to be generated as the last token. 
+ """ + + def __init__(self, max_length: int, forced_eos_token_id: Union[int, List[int]]): + self.max_length = max_length + self.forced_eos_token_id = forced_eos_token_id + + def __call__(self, input_ids, scores): + cur_len = input_ids.shape[-1] + if cur_len == self.max_length - 1: + scores[:] = paddle.finfo(scores.dtype).min + scores[:, self.forced_eos_token_id] = 0 + return scores + + +def TopKProcess(probs: paddle.Tensor, top_k: int, min_tokens_to_keep: int): + top_k = paddle.minimum( + paddle.maximum(paddle.to_tensor(top_k), paddle.to_tensor(min_tokens_to_keep)), + paddle.to_tensor(probs.shape[-1]), + ) + # Remove all tokens with a probability less than the last token of the top-k + # cast to float16 to support generation & d2s + if probs.dtype == paddle.bfloat16: + probs = paddle.cast(probs, paddle.float32) + topk_probs, _ = paddle.topk(probs, k=top_k) + topk_probs = paddle.cast(topk_probs, paddle.bfloat16) + else: + topk_probs, _ = paddle.topk(probs, k=top_k) + + probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) + return probs + + +def TopPProcess(probs: paddle.Tensor, top_p: float, min_tokens_to_keep: int): + if probs.dtype == paddle.bfloat16: + probs = paddle.cast(probs, paddle.float32) + + sorted_indices = paddle.argsort(probs, descending=True) + sorted_probs = paddle.sort(probs, descending=True) + + sorted_probs = paddle.cast(sorted_probs, paddle.bfloat16) + + else: + sorted_indices = paddle.argsort(probs, descending=True) + sorted_probs = paddle.sort(probs, descending=True) + + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Set 'min_tokens_to_keep - 1' because the first token is kept + sorted_indices_to_remove[:, : min_tokens_to_keep - 1] = 0 + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() + sorted_indices_to_remove[:, 0] = 0 + + # Scatter sorted tensors to original indexing + sorted_indices = sorted_indices + paddle.arange(probs.shape[0], dtype="int64").unsqueeze(-1) * probs.shape[-1] + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten() + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + return probs + + +class LogitsWarper: + """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." + ) + + +class TemperatureLogitsWarper(LogitsWarper): + r""" + [`LogitsWarper`] for temperature (exponential scaling output probability distribution). + Args: + temperature (`float`): + The value used to module the logits distribution. 
+ """ + + def __init__(self, temperature: float): + if not isinstance(temperature, float) or not (temperature > 0): + raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}") + + self.temperature = temperature + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor): + scores = scores / self.temperature + return scores + + +class SequenceBiasLogitsProcessor(LogitsProcessor): + """ + [`LogitsProcessor`] that applies an additive bias on sequences. The bias is applied to the last token of a sequence + when the next generated token can complete it. Consequently, to take the most of biasing sequences with more than + one token, consider using beam methods (to gracefully work around partially completed sequences that have a + negative bias) and applying the bias to their prefixes (to ensure the bias is applied earlier). + + + + In order to get the token ids of the sequences that you want to bias, make sure to set `add_prefix_space=True` when + initializing the tokenizer, and use `tokenizer(bad_words, add_special_tokens=False).input_ids`. The + `add_prefix_space` argument is only supported for some slow tokenizers, as fast tokenizers' prefixing behaviours + come from `pre tokenizers`. + + + + Args: + sequence_bias (`Dict[Tuple[int], float]`): + Dictionary that maps a sequence of tokens to its bias term. Positive biases increase the odds of the + sequence being selected, while negative biases do the opposite. If a sequence has a length of 1, its bias + will always be applied. Otherwise, the bias will only be applied if the sequence in question is about to be + completed (in the token selection step after this processor is applied). + + Examples: + + ```python + >>> from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM + + >>> model = AutoModelForCausalLM.from_pretrained("gpt2-en") + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2-en") + >>> inputs = tokenizer(["The full name of Donald is Donald"], return_tensors="pt") + + >>> summary_ids = model.generate(inputs["input_ids"], max_new_tokens=4) + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0]) + The full name of Donald is Donald J. Trump Jr + + >>> # Now let's control generation through a bias. Please note that the tokenizer is initialized differently! + >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("gpt2-en") + + + >>> def get_tokens_as_tuple(word): + ... return tuple(tokenizer_with_prefix_space([word], add_special_tokens=False).input_ids[0]) + + + >>> # If we add a negative bias without beam search, it may become "stuck" in a prefix without good continuations + >>> sequence_bias = {get_tokens_as_tuple("Trump"): -10.0} + >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, sequence_bias=sequence_bias) + >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0]) + The full name of Donald is Donald J. 
Donald, + + >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, num_beams=4, sequence_bias=sequence_bias) + >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0]) + The full name of Donald is Donald Rumsfeld, + + >>> # We can also add a positive bias to nudge the model towards specific tokens or continuations + >>> sequence_bias = {get_tokens_as_tuple("Donald Duck"): 10.0} + >>> biased_ids = model.generate(inputs["input_ids"], max_new_tokens=4, num_beams=4, sequence_bias=sequence_bias) + >>> print(tokenizer.batch_decode(biased_ids, skip_special_tokens=True)[0]) + The full name of Donald is Donald Duck. + ``` + """ + + def __init__(self, sequence_bias: Dict[Tuple[int], float]): + self.sequence_bias = sequence_bias + self._validate_arguments() + + # Bias variables that will be populated on the first call (for retrocompatibility purposes, the vocabulary size + # is infered in the first usage, which inhibits initializing here) + self.length_1_bias = None + self.prepared_bias_variables = False + + def __call__(self, input_ids, scores): + # 1 - Prepares the bias tensors. This is only needed the first time the logit processor is called. + if not self.prepared_bias_variables: + self._prepare_bias_variables(scores) + + # 2 - prepares an empty bias to add + bias = paddle.zeros_like(scores) + + # 3 - include the bias from length = 1 + if self.length_1_bias is not None: + bias += self.length_1_bias + + # 4 - include the bias from length > 1, after determining which biased sequences may be completed. + for sequence_ids, sequence_bias in self.sequence_bias.items(): + if len(sequence_ids) == 1: # the sequence is of length 1, already applied + continue + if len(sequence_ids) > input_ids.shape[1]: # the sequence is longer than the context, ignore + continue + prefix_length = len(sequence_ids) - 1 + last_token = sequence_ids[-1] + matching_rows = ( + paddle.equal( + input_ids[:, -prefix_length:], + paddle.to_tensor(sequence_ids[:-1], dtype=input_ids.dtype), + ) + .astype(paddle.int64) + .prod(axis=1) + ) + bias[:, last_token] += paddle.where( + matching_rows == 1, + paddle.to_tensor(sequence_bias), + paddle.to_tensor(0.0), + ) + + # 5 - apply the bias to the scores + scores = scores + bias + return scores + + def _prepare_bias_variables(self, scores): + vocabulary_size = scores.shape[-1] + + # Check biased tokens out of bounds + invalid_biases = [] + for sequence_ids in self.sequence_bias: + for token_id in sequence_ids: + if token_id >= vocabulary_size: + invalid_biases.append(token_id) + if len(invalid_biases) > 0: + raise ValueError( + f"The model vocabulary size is {vocabulary_size}, but the following tokens were being biased: " + f"{invalid_biases}" + ) + + # Precompute the bias tensors to be applied. Sequences of length 1 are kept separately, as they can be applied + # with simpler logic. 
+ self.length_1_bias = paddle.zeros((vocabulary_size,)) + for sequence_ids, bias in self.sequence_bias.items(): + if len(sequence_ids) == 1: + self.length_1_bias[sequence_ids[-1]] = bias + + self.prepared_bias_variables = True + + def _validate_arguments(self): + sequence_bias = self.sequence_bias + if not isinstance(sequence_bias, dict) or len(sequence_bias) == 0: + raise ValueError(f"`sequence_bias` has to be a non-empty dictionary, but is {sequence_bias}.") + if any(not isinstance(sequence_ids, tuple) for sequence_ids in sequence_bias.keys()): + raise ValueError(f"`sequence_bias` has to be a dict with tuples as keys, but is {sequence_bias}.") + if any( + any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in sequence_ids) + or len(sequence_ids) == 0 + for sequence_ids in sequence_bias.keys() + ): + raise ValueError( + f"Each key in `sequence_bias` has to be a non-empty tuple of positive integers, but is " + f"{sequence_bias}." + ) + if any(not isinstance(bias, float) for bias in sequence_bias.values()): + raise ValueError(f"`sequence_bias` has to be a dict with floats as values, but is {sequence_bias}.") + + +class NoBadWordsLogitsProcessor(SequenceBiasLogitsProcessor): + """ + [`LogitsProcessor`] that enforces that specified sequences will never be selected. + + + + In order to get the token ids of the words that should not appear in the generated text, make sure to set + `add_prefix_space=True` when initializing the tokenizer, and use `tokenizer(bad_words, + add_special_tokens=False).input_ids`. The `add_prefix_space` argument is only supported for some slow tokenizers, + as fast tokenizers' prefixing behaviours come from `pre tokenizers`. Read more + [here](https://huggingface.co/docs/tokenizers/api/pre-tokenizers). + + + + Args: + bad_words_ids (`List[List[int]]`): + List of list of token ids that are not allowed to be generated. + eos_token_id (`Union[int, List[int]]`): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + + Examples: + + ```python + >>> from paddlenlp.transformers import AutoTokenizer, AutoModelForCausalLM + + >>> model = AutoModelForCausalLM.from_pretrained("gpt2-en") + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2-en") + >>> inputs = tokenizer(["In a word, the cake is a"], return_tensors="pt") + + >>> output_ids = model.generate(inputs["input_ids"], max_new_tokens=5, pad_token_id=tokenizer.eos_token_id) + >>> print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]) + In a word, the cake is a bit of a mess. + + >>> # Now let's take the bad words out. Please note that the tokenizer is initialized differently + >>> tokenizer_with_prefix_space = AutoTokenizer.from_pretrained("gpt2-en", add_prefix_space=True) + + + >>> def get_tokens_as_list(word_list): + ... "Converts a sequence of words into a list of tokens" + ... tokens_list = [] + ... for word in word_list: + ... tokenized_word = tokenizer_with_prefix_space([word], add_special_tokens=False).input_ids[0] + ... tokens_list.append(tokenized_word) + ... return tokens_list + + + >>> bad_words_ids = get_tokens_as_list(word_list=["mess"]) + >>> output_ids = model.generate( + ... inputs["input_ids"], max_new_tokens=5, bad_words_ids=bad_words_ids, pad_token_id=tokenizer.eos_token_id + ... ) + >>> print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]) + In a word, the cake is a bit of a surprise. 
+ ``` + + >>> from paddlenlp.transformers.generation import NoBadWordsLogitsProcessor, LogitsProcessorList + >>> logits_processors = LogitsProcessorList([NoBadWordsLogitsProcessor([[5,6]], eos_token_id=tokenizer.eos_token_id)]) + >>> output_ids = model.generate( + ... inputs["input_ids"], max_new_tokens=5, logits_processors=logits_processors, pad_token_id=tokenizer.eos_token_id + ... ) + >>> print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]) + In a word, the cake is a bit of a surprise. + ``` + """ + + def __init__(self, bad_words_ids: List[List[int]], eos_token_id: Union[int, List[int]]): + self.bad_word_ids = bad_words_ids + self._validate_arguments() + + # Filter EOS token from bad_words_ids + if eos_token_id is None: + eos_token_id = [] + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + bad_words_ids = list( + filter(lambda bad_token_seq: all(bad_token_seq != [i] for i in eos_token_id), bad_words_ids) + ) + + # Forbidding a sequence is equivalent to setting its bias to -inf + sequence_bias = {tuple(sequence): float("-inf") for sequence in bad_words_ids} + super().__init__(sequence_bias=sequence_bias) + + def _validate_arguments(self): + bad_words_ids = self.bad_word_ids + if not isinstance(bad_words_ids, list) or len(bad_words_ids) == 0: + raise ValueError(f"`bad_words_ids` has to be a non-empty list, but is {bad_words_ids}.") + if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): + raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") + if any( + any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids) + for bad_word_ids in bad_words_ids + ): + raise ValueError( + f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}." + ) + + +class PrefixConstrainedLogitsProcessor(LogitsProcessor): + r""" + [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned constrained + generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information. + + Args: + prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`): + This function constraints the beam search to allowed tokens only at each step. This function takes 2 + arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the + next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID + `batch_id`. 
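    A minimal sketch with a constraint function that always allows a single token
    (token id 7 and the vocabulary size are arbitrary choices):

    ```python
    >>> import paddle
    >>> def allow_only_seven(batch_id, input_ids):
    ...     return [7]
    >>> processor = PrefixConstrainedLogitsProcessor(allow_only_seven, num_beams=1)
    >>> scores = paddle.zeros([1, 10])
    >>> input_ids = paddle.to_tensor([[1, 2, 3]])
    >>> processor(input_ids, scores).argmax(axis=-1).item()
    7
    ```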
+ """ + + def __init__(self, prefix_allowed_tokens_fn: Callable[[int, paddle.Tensor], List[int]], num_beams: int): + self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn + self._num_beams = num_beams + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor) -> paddle.Tensor: + mask = paddle.full_like(scores, paddle.finfo(scores.dtype).min) + for batch_id, beam_sent in enumerate(input_ids.reshape([-1, self._num_beams, input_ids.shape[-1]])): + for beam_id, sent in enumerate(beam_sent): + mask[batch_id * self._num_beams + beam_id, self._prefix_allowed_tokens_fn(batch_id, sent)] = 0 + + return scores + mask diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/stopping_criteria.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/stopping_criteria.py new file mode 100644 index 000000000..32447b637 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/stopping_criteria.py @@ -0,0 +1,91 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import warnings +from abc import ABC +from copy import deepcopy +from typing import Optional + +import paddle + + +class StoppingCriteria(ABC): + """ + Abstract base class for all stopping criteria that can be applied during + generation. + """ + + def __call__(self, input_ids: paddle.Tensor, logits: paddle.Tensor, **kwargs): + raise NotImplementedError(f"{self.__class__} is an abstract class. " "StoppingCriteria needs to be subclassed") + + +class MaxTimeCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the + time will start being counted when you initialize this function. You can override this by passing an + `initial_time`. + + Args: + max_time (`float`): + The maximum allowed time in seconds for the generation. + initial_time (`float`, *optional*, defaults to `time.time()`): + The start of the generation allowed time. + """ + + def __init__(self, max_time: float, initial_timestamp: Optional[float] = None): + self.max_time = max_time + self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **kwargs) -> bool: + return time.time() - self.initial_timestamp > self.max_time + + +class MaxLengthCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`. Keep + in mind for decoder-only type of transformers, [this will include the initial prompted tokens]. + + Args: + max_length (`int`): + The maximum length that the output sequence can have in number of tokens. 
+ """ + + def __init__(self, max_length: int): + self.max_length = max_length + + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **kwargs) -> bool: + return input_ids.shape[-1] >= self.max_length + + +class StoppingCriteriaList(list): + def __call__(self, input_ids: paddle.Tensor, scores: paddle.Tensor, **kwargs): + return any(criteria(input_ids, scores) for criteria in self) + + @property + def max_length(self): + for stopping_criterium in self: + if isinstance(stopping_criterium, MaxLengthCriteria): + return stopping_criterium.max_length + return None + + +def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList: + stopping_max_length = stopping_criteria.max_length + new_stopping_criteria = deepcopy(stopping_criteria) + if stopping_max_length is not None and stopping_max_length != max_length: + warnings.warn("You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning) + elif stopping_max_length is None: + new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length)) + return new_stopping_criteria diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/streamers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/streamers.py new file mode 100644 index 000000000..67b97b0cf --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/streamers.py @@ -0,0 +1,216 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from queue import Queue +from typing import Optional + +from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer + + +class BaseStreamer: + """ + Base class from which `.generate()` streamers should inherit. + """ + + def put(self, value): + """Function that is called by `.generate()` to push new tokens""" + raise NotImplementedError() + + def end(self): + """Function that is called by `.generate()` to signal the end of generation""" + raise NotImplementedError() + + +class TextStreamer(BaseStreamer): + """ + Parameters: + tokenizer (`AutoTokenizer`): + The tokenized used to decode the tokens. + skip_prompt (`bool`, *optional*, defaults to `False`): + Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots. + decode_kwargs (`dict`, *optional*): + Additional keyword arguments to pass to the tokenizer's `decode` method. + + Examples: + + ```python + >>> from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer + >>> from paddlenlp.generation import TextStreamer + + >>> tok = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pd") + >>> streamer = TextStreamer(tok) + + >>> # Despite returning the usual output, the streamer will also print the generated text to stdout. 
+ >>> _ = model.generate(**inputs, streamer=streamer, max_length=20) + An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven, + ``` + """ + + def __init__(self, tokenizer: PretrainedTokenizer, skip_prompt: bool = False, **decode_kwargs): + self.tokenizer = tokenizer + self.skip_prompt = skip_prompt + self.decode_kwargs = decode_kwargs + + # variables used in the streaming process + self.token_cache = [] + self.print_len = 0 + self.next_tokens_are_prompt = True + + def put(self, value): + """ + Receives tokens, decodes them, and prints them to stdout as soon as they form entire words. + """ + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError("TextStreamer only supports batch size 1") + elif len(value.shape) > 1: + value = value[0] + + if self.skip_prompt and self.next_tokens_are_prompt: + self.next_tokens_are_prompt = False + return + + # Add the new token to the cache and decodes the entire thing. + self.token_cache.extend(value.tolist()) + text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs) + + # After the symbol for a new line, we flush the cache. + if text.endswith("\n"): + printable_text = text[self.print_len :] + self.token_cache = [] + self.print_len = 0 + # If the last token is a CJK character, we print the characters. + elif len(text) > 0 and self._is_chinese_char(ord(text[-1])): + printable_text = text[self.print_len :] + self.print_len += len(printable_text) + # Otherwise, prints until the last space char (simple heuristic to avoid printing incomplete words, + # which may change with the subsequent token -- there are probably smarter ways to do this!) + else: + printable_text = text[self.print_len : text.rfind(" ") + 1] + self.print_len += len(printable_text) + + self.on_finalized_text(printable_text) + + def end(self): + """Flushes any remaining cache and prints a newline to stdout.""" + # Flush the cache, if it exists + if len(self.token_cache) > 0: + text = self.tokenizer.decode(self.token_cache, **self.decode_kwargs) + printable_text = text[self.print_len :] + self.token_cache = [] + self.print_len = 0 + else: + printable_text = "" + + self.next_tokens_are_prompt = True + self.on_finalized_text(printable_text, stream_end=True) + + def on_finalized_text(self, text: str, stream_end: bool = False): + """Prints the new text to stdout. If the stream is ending, also prints a newline.""" + print(text, flush=True, end="" if not stream_end else None) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + +class TextIteratorStreamer(TextStreamer): + """ + Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is + useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive + Gradio demo). + + Parameters: + tokenizer (`AutoTokenizer`): + The tokenized used to decode the tokens. + skip_prompt (`bool`, *optional*, defaults to `False`): + Whether to skip the prompt to `.generate()` or not. Useful e.g. for chatbots. + timeout (`float`, *optional*): + The timeout for the text queue. If `None`, the queue will block indefinitely. Useful to handle exceptions + in `.generate()`, when it is called in a separate thread. + decode_kwargs (`dict`, *optional*): + Additional keyword arguments to pass to the tokenizer's `decode` method. + + Examples: + + ```python + >>> from paddlenlp.transformers import AutoModelForCausalLM, AutoTokenizer + >>> from paddlenlp.generation import TextIteratorStreamer + >>> from threading import Thread + + >>> tok = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> inputs = tok(["An increasing sequence: one,"], return_tensors="pd") + >>> streamer = TextIteratorStreamer(tok) + + >>> # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way. + >>> generation_kwargs = dict(inputs, streamer=streamer, max_length=20) + >>> thread = Thread(target=model.generate, kwargs=generation_kwargs) + >>> thread.start() + >>> generated_text = "" + >>> for new_text in streamer: + ... generated_text += new_text + >>> generated_text + 'An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven,' + ``` + """ + + def __init__( + self, + tokenizer: PretrainedTokenizer, + skip_prompt: bool = False, + timeout: Optional[float] = None, + **decode_kwargs + ): + super().__init__(tokenizer, skip_prompt, **decode_kwargs) + self.text_queue = Queue() + self.stop_signal = None + self.timeout = timeout + + def on_finalized_text(self, text: str, stream_end: bool = False): + """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue.""" + self.text_queue.put(text, timeout=self.timeout) + if stream_end: + self.text_queue.put(self.stop_signal, timeout=self.timeout) + + def __iter__(self): + return self + + def __next__(self): + value = self.text_queue.get(timeout=self.timeout) + if value == self.stop_signal: + raise StopIteration() + else: + return value diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/utils.py new file mode 100644 index 000000000..e7c3dd162 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/generation/utils.py @@ -0,0 +1,1838 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import copy +import inspect +from typing import Optional, Union + +import paddle +import paddle.distributed as dist +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.common_ops_import import convert_dtype +from paddle.utils import map_structure + +from paddlenlp.transformers.model_outputs import ModelOutput +from paddlenlp.transformers.utils import get_scale_by_dtype +from paddlenlp.utils.log import logger + +from .configuration_utils import DEFAULT_MAX_NEW_TOKENS, GenerationConfig +from .logits_process import ( + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + LogitsProcessor, + LogitsProcessorList, + MinLengthLogitsProcessor, + NoRepeatNGramLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + TopKProcess, + TopPProcess, +) +from .stopping_criteria import ( + StoppingCriteria, + StoppingCriteriaList, + validate_stopping_criteria, +) +from .streamers import BaseStreamer + +__all__ = [ + "GenerationMixin", + "BeamSearchScorer", + "BeamHypotheses", + "LogitsProcessorList", + "LogitsProcessor", + "MinLengthLogitsProcessor", + "RepetitionPenaltyLogitsProcessor", + "TopKProcess", + "TopPProcess", + "get_unfinished_flag", +] + + +def get_unfinished_flag( + input_ids: Tensor, unfinished_flag: Tensor, eos_token_id: Union[int, list[int], list[list[int]]] +) -> Tensor: + """get unfinished flag for generation step + + Args: + input_ids (Tensor): the input_ids + eos_token_id (Union[int, list[int], list[list[int]]]): the end os sentence flag, which can be: + * single token id, eg: 10 + * multiple token ids to stop generation, eg: [10, 10] + * some more tokens to stop generations, eg: [[10], [20, 20], [30, 30, 30]] + + Returns: + Tensor: the unfinished flag tensor + """ + if isinstance(eos_token_id, int): + unfinished_flag = paddle.logical_and(unfinished_flag, input_ids[:, -1:] != eos_token_id) + else: + batch_unfinish_flag = None + for batch_eos_token_id in eos_token_id: + if batch_unfinish_flag is None: + batch_unfinish_flag = ~get_unfinished_flag(input_ids, unfinished_flag, batch_eos_token_id) + else: + batch_unfinish_flag = paddle.logical_or( + batch_unfinish_flag, ~get_unfinished_flag(input_ids, unfinished_flag, batch_eos_token_id) + ) + + unfinished_flag = ~batch_unfinish_flag + return unfinished_flag + + +class BeamHypotheses: + def __init__(self, num_beams, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. + """ + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = get_scale_by_dtype() + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs, origin_len=0): + """ + Add a new hypothesis to the list. 
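The `add` method below normalizes the cumulative log-probability with a GNMT-style length penalty before comparing hypotheses; here is a small self-contained sketch of that computation (the numbers are made up for illustration).

```python
# Mirrors the scoring in BeamHypotheses.add: divide the summed log-probability by
# ((new_tokens + 5) / 6) ** length_penalty, so length_penalty > 1.0 favors longer
# hypotheses and length_penalty < 1.0 favors shorter ones.
def length_normalized_score(sum_logprobs: float, hyp_len: int, origin_len: int, length_penalty: float) -> float:
    return sum_logprobs / (((hyp_len - origin_len + 5) / 6) ** length_penalty)


# e.g. a 10-token continuation with total log-probability -12.0 and length_penalty=1.0:
print(length_normalized_score(-12.0, hyp_len=10, origin_len=0, length_penalty=1.0))  # -4.8
```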
+ """ + score = sum_logprobs / (((hyp.shape[-1] - origin_len + 5) / 6) ** self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len, origin_len=0): + """ + If there are enough hypotheses and that none of the hypotheses being + generated can become better than the worst one in the heap, then we + are done with this sentence. + """ + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / ((cur_len - origin_len + 5) / 6) ** self.length_penalty + ret = self.worst_score >= cur_score + return ret + + +class BeamSearchScorer(object): + """ + implementing standard beam search decoding. + """ + + def __init__( + self, + batch_size, + max_length, + num_beams, + length_penalty=1.0, + do_early_stopping=False, + num_beam_hyps_to_keep=1, + num_beam_groups=1, + ): + self.max_length = max_length + self.num_beams = num_beams + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + self.num_beam_groups = num_beam_groups + self.group_size = self.num_beams // self.num_beam_groups + + self._is_init = False + self._beam_hyps = [ + BeamHypotheses( + num_beams=self.num_beams, length_penalty=self.length_penalty, early_stopping=self.do_early_stopping + ) + for _ in range(batch_size) + ] + self._done = paddle.to_tensor([0 for _ in range(batch_size)], dtype="int64") + + if not isinstance(num_beams, int) or num_beams <= 1: + raise ValueError( + "`num_beams` has to be an integer strictly greater than 1, but " + "received {}. 
For `num_beams` == 1, one should make use of " + "`greedy_search` instead.".format(num_beams) + ) + + if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): + raise ValueError( + "`num_beam_groups` has to be an integer smaller or equal than " + "`num_beams` and `num_beams` has to be divisible by " + "`num_beam_groups`, but received num_beam_groups={}, num_beams=" + "{}.".format(num_beam_groups, num_beams) + ) + + @property + def is_done(self): + return paddle.min(self._done) == 1 + + def process( + self, input_ids, next_scores, next_tokens, next_indices, origin_len=0, pad_token_id=None, eos_token_id=None + ): + cur_len = input_ids.shape[-1] + batch_size = len(self._beam_hyps) + assert batch_size == (input_ids.shape[0] // self.group_size) + + next_beam_scores = paddle.zeros([batch_size, self.group_size], dtype=next_scores.dtype) + next_beam_tokens = paddle.zeros([batch_size, self.group_size], dtype=next_tokens.dtype) + next_beam_indices = paddle.zeros([batch_size, self.group_size], dtype=next_indices.dtype) + + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx] == 1: + assert ( + len(beam_hyp) >= self.num_beams + ), "Batch can only be done if at least {} beams have been generated".format(self.num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + # pad the batch + next_beam_scores[batch_idx, :] = 0 + next_beam_tokens[batch_idx, :] = pad_token_id + next_beam_indices[batch_idx, :] = 0 + continue + + # next tokens for this sentence + beam_idx = 0 + for beam_token_rank, (next_token, next_score, next_index) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) + ): + batch_beam_idx = batch_idx * self.group_size + next_index + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (next_token.item() == eos_token_id): + # If beam_token does not belong to top num_beams tokens, + # it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add(input_ids[batch_beam_idx.item()].clone(), next_score.item(), origin_len) + + else: + # add next predicted token since it is not eos_token + next_beam_scores[batch_idx, beam_idx] = next_score + next_beam_tokens[batch_idx, beam_idx] = next_token.item() + next_beam_indices[batch_idx, beam_idx] = batch_beam_idx.item() + beam_idx += 1 + + # once the beam for next step is full, don't add more tokens to it. + if beam_idx == self.group_size: + break + + if beam_idx < self.group_size: + raise ValueError( + "At most {} tokens in `next_tokens[batch_idx]` can be equal " + "to `eos_token_id: {}`. 
Make sure `next_tokens[batch_idx]` " + "are corrected.".format(self.group_size, eos_token_id) + ) + + # Check if we are done so that we can save a pad step if all(done) + if beam_hyp.is_done(next_scores[batch_idx].max().item(), cur_len, origin_len): + self._done[batch_idx] = 1 + + return { + "next_beam_scores": next_beam_scores.reshape([-1]), + "next_beam_tokens": next_beam_tokens.reshape([-1]), + "next_beam_indices": next_beam_indices.reshape([-1]), + } + + def finalize( + self, + input_ids, + final_beam_scores, + final_beam_tokens, + final_beam_indices, + origin_len=0, + pad_token_id=None, + eos_token_id=None, + ): + batch_size = len(self._beam_hyps) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx] == 1: + continue + + # all open beam hypotheses are added to the beam hypothesis + # beam hypothesis class automatically keeps the best beams + for beam_id in range(self.num_beams): + batch_beam_idx = batch_idx * self.num_beams + beam_id + final_score = final_beam_scores[batch_beam_idx].item() + final_tokens = input_ids[batch_beam_idx] + beam_hyp.add(final_tokens, final_score, origin_len=origin_len) + + # select the best hypotheses + sent_lengths = paddle.zeros([batch_size * self.num_beam_hyps_to_keep], dtype=input_ids.dtype) + best = [] + + # retrieve best hypotheses + for i, beam_hyp in enumerate(self._beam_hyps): + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) + for j in range(self.num_beam_hyps_to_keep): + best_score, best_hyp = sorted_hyps.pop() + sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) + best.append([best_hyp, best_score]) + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item() + 1, self.max_length) + decoded = paddle.zeros([batch_size * self.num_beam_hyps_to_keep, sent_max_len], dtype=input_ids.dtype) + # shorter batches are padded if needed + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, "`pad_token_id` has to be defined" + decoded[:, :] = pad_token_id + decoded_score = paddle.zeros([batch_size * self.num_beam_hyps_to_keep, 1]) + + # fill with hypotheses and eos_token_id if the latter fits in + for i, (hypo, score) in enumerate(best): + decoded[i, : sent_lengths[i].item()] = hypo.cpu().numpy() + decoded_score[i] = score + if sent_lengths[i] < self.max_length: + decoded[i, sent_lengths[i].item()] = eos_token_id + return decoded, decoded_score + + +class GenerationMixin(object): + r""" + This class implements the interface for generation task. + + It's used as the base class of `paddlenlp.transformers.PretrainedModel + `__. 
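Since `process` and `finalize` carry no docstrings, the following self-contained sketch drives one `process` step of the `BeamSearchScorer` defined above on synthetic candidate scores. The shapes, token ids, and the import path are assumptions made for illustration (the class itself is exported via this module's `__all__`).

```python
# Synthetic one-step example of how a beam-search loop feeds the scorer.
# Import path is assumed; BeamSearchScorer is exported in this module's __all__.
import paddle

from paddlenlp.generation import BeamSearchScorer

batch_size, num_beams, cur_len = 1, 2, 3
scorer = BeamSearchScorer(batch_size=batch_size, max_length=8, num_beams=num_beams)

# One row per beam; the decoding loop proposes 2 * num_beams candidates per example.
input_ids = paddle.zeros([batch_size * num_beams, cur_len], dtype="int64")
next_scores = paddle.to_tensor([[-0.1, -0.5, -1.0, -2.0]])     # candidate log-probs
next_tokens = paddle.to_tensor([[4, 7, 5, 9]], dtype="int64")  # candidate token ids
next_indices = paddle.to_tensor([[0, 1, 0, 1]], dtype="int64")  # which beam each candidate extends

out = scorer.process(input_ids, next_scores, next_tokens, next_indices, pad_token_id=0, eos_token_id=2)
# Each entry has shape [batch_size * num_beams]: the scores to carry forward, the tokens to
# append, and the beam indices to reorder input_ids with before the next step. Once
# scorer.is_done is True, finalize(...) assembles the best finished hypotheses.
print(out["next_beam_tokens"].tolist(), out["next_beam_indices"].tolist())
```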
+ """ + # enable `to_static` method for CausalLM Model + enable_to_static_method = False + + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids == pad_token_id).astype(paddle.get_default_dtype()) * get_scale_by_dtype( + return_positive=False + ) + else: + attention_mask = paddle.zeros_like(input_ids, dtype=paddle.get_default_dtype()) + return paddle.unsqueeze(attention_mask, axis=[1, 2]) + + @staticmethod + def prepare_seq_len_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + seq_len = paddle.sum(input_ids != pad_token_id, axis=1).unsqueeze(-1) + else: + seq_len = paddle.full((input_ids.shape[0], 1), input_ids.shape[1], dtype="int64") + return seq_len + + def get_logits_processor( + self, + min_length=None, + max_length=None, + eos_token_id=None, + forced_bos_token_id=None, + forced_eos_token_id=None, + num_beams=1, + num_beam_groups=1, + diversity_rate=0.0, + repetition_penalty=None, + no_repeat_ngram_size=None, + logits_processors=None, + ): + processors = LogitsProcessorList() + + if min_length is not None and eos_token_id is not None and min_length > -1: + processors.append(MinLengthLogitsProcessor(min_length, eos_token_id)) + if num_beam_groups > 1 and diversity_rate > 0.0: + processors.append( + HammingDiversityLogitsProcessor( + diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups + ) + ) + if repetition_penalty is not None and repetition_penalty != 1.0: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: + processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) + if forced_bos_token_id is not None: + processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) + if forced_eos_token_id is not None: + processors.append(ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) + # TODO + # Add more pre_processing for distribution + + if logits_processors is not None: + custom_processors = LogitsProcessorList() + custom_processors_type = [type(lp) for lp in logits_processors] + + for processor in processors: + if type(processor) not in custom_processors_type: + custom_processors.append(processor) + custom_processors.extend(logits_processors) + + return custom_processors + else: + return processors + + @staticmethod + def expand_inputs_for_generation(input_ids, expand_size, attention_mask=None, **model_kwargs): + + index = 
paddle.tile(paddle.arange(input_ids.shape[0], dtype="int64").unsqueeze(-1), [1, expand_size]).reshape( + [-1] + ) + + input_ids = paddle.gather(input_ids, index) + + if attention_mask is not None: + model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) + + if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) + + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.gather(position_ids, index) + + if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: + seq_len = model_kwargs["seq_len"] + model_kwargs["seq_len"] = paddle.gather(seq_len, index) + + if "encoder_output" in model_kwargs and model_kwargs["encoder_output"] is not None: + encoder_output = model_kwargs["encoder_output"] + model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) + + if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: + role_ids = model_kwargs["role_ids"] + model_kwargs["role_ids"] = paddle.gather(role_ids, index) + + return input_ids, model_kwargs + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # Update the model inputs during generation. + # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` + # and they contain pad value, the result vectors updated by this method + # may be different from expected. In this case, you need to rewrite the + # method. + + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["cache"] = outputs[1] + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, ModelOutput) and "past_key_values" in outputs: + model_kwargs["cache"] = outputs.past_key_values + model_kwargs["past_key_values"] = outputs.past_key_values + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.concat([token_type_ids, token_type_ids[:, -1:]], axis=-1) + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + # update attention_mask + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + # nn.Pad2D don't support the data type `bool` + if convert_dtype(attention_mask.dtype) == "bool": + attention_mask = paddle.cast(attention_mask, "int64") + if len(attention_mask.shape) == 4: + cur_device = paddle.get_device() + if cur_device.split(":")[0] == "npu": + attention_mask = nn.Pad2D([0, 0, 0, 1], mode="constant")(attention_mask) + attention_mask = nn.Pad2D([0, 1, 0, 0], value=0)(attention_mask) + else: + attention_mask = nn.Pad2D([0, 0, 0, 1], mode="replicate")(attention_mask) + attention_mask = nn.Pad2D([0, 1, 0, 0], value=get_scale_by_dtype(return_positive=False))( + attention_mask + ) + + dtype = convert_dtype(attention_mask.dtype) + if "int" in dtype: + attention_mask[:, :, -1, -1] = 1 + elif "float" in dtype: + attention_mask[:, :, -1, -1] = 0.0 + else: + raise ValueError("The data type of input 
`attention_mask` must " "be bool, int or float") + else: + attention_mask = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype="int64")], axis=-1 + ) + model_kwargs["attention_mask"] = attention_mask + + # update role_ids + if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: + role_ids = model_kwargs["role_ids"] + model_kwargs["role_ids"] = paddle.concat([role_ids, role_ids[:, -1:]], axis=-1) + + return model_kwargs + + @staticmethod + def update_scores_for_generation(scores, next_scores, length, unfinished_flag): + # update scores + + unfinished_scores = (scores * paddle.to_tensor(length, dtype=scores.dtype) + next_scores) / ( + paddle.to_tensor(length, dtype=scores.dtype) + 1 + ) + scores = paddle.where(unfinished_flag, unfinished_scores, scores) + return scores + + def prepare_encoder_decoder_kwargs_for_generation(self, input_ids, model_kwargs): + if "encoder_output" not in model_kwargs: + # retrieve encoder hidden states + encoder = self.get_encoder() + encoder_kwargs = { + argument: value + for argument, value in model_kwargs.items() + if not ( + argument.startswith("decoder_") or argument.startswith("cross_attn") or argument == "use_cache" + ) + } + # Use inputs_embeds as the priority if inputs_embeds exists + if "inputs_embeds" in encoder_kwargs: + model_kwargs["encoder_output"] = encoder(**encoder_kwargs) + else: + model_kwargs["encoder_output"] = encoder(input_ids=input_ids, **encoder_kwargs) + return model_kwargs + + def prepare_decoder_input_ids_for_generation(self, input_ids, decoder_start_token_id=None, bos_token_id=None): + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + decoder_start_token_id = decoder_start_token_id if decoder_start_token_id is not None else bos_token_id + + decoder_input_ids = paddle.ones([input_ids.shape[0], 1], dtype="int64") * decoder_start_token_id + + return decoder_input_ids + + def get_decoder_start_token_id(self, decoder_start_token_id=None, bos_token_id=None): + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + + if decoder_start_token_id is not None: + return decoder_start_token_id + elif self.config.decoder_start_token_id is not None: + return self.config.decoder_start_token_id + elif bos_token_id is not None: + return bos_token_id + elif self.config.bos_token_id is not None: + return self.config.bos_token_id + raise ValueError( + "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation." + ) + + def prepare_inputs_for_generation(self, input_ids, **kwargs): + # Implement in subclasses for custom behavior to prepare inputs in the + # generate method. + + return {"input_ids": input_ids} + + def adjust_logits_during_generation(self, logits): + # Implement in subclasses for custom behavior to adjust the logits in + # the generate method. 
+ + return logits + + def prepare_fast_entry(self, kwargs): + return False + + def _convert_to_fast(self, kwargs): + # try general convert + pass + + def _build_fast(self, kwargs): + self._fast_entry = False + if kwargs["num_beam_groups"] != 1: + # not support for group_beam_search yet in the fast version + raise AttributeError("'num_beam_groups != 1' is not supported yet in the fast version") + if paddle.get_default_dtype() == "float16" and kwargs["use_fp16_decoding"] is False: + logger.info( + "Since the default dtype is float16, float16 would be used " "though 'use_fp16_decoding=False'." + ) + kwargs["use_fp16_decoding"] = True + self.prepare_fast_entry(kwargs) + + def set_pad_token_id(self, pad_token_id, eos_token_id): + if pad_token_id is None and eos_token_id is not None: + logger.warning( + "Setting `pad_token_id` to `eos_token_id`:{} for " "open-end generation.".format(eos_token_id) + ) + if isinstance(eos_token_id, list): + pad_token_id = eos_token_id[0] + else: + pad_token_id = eos_token_id + return pad_token_id + + @paddle.no_grad() + def generate( + self, + input_ids: paddle.Tensor = None, + generation_config: GenerationConfig = None, + stopping_criteria: StoppingCriteria = None, + streamer: BaseStreamer = None, + synced_gpus: Optional[bool] = None, + **kwargs, + ): + r""" + The interface for generation task. This method can generate sequences + by using decoding strategy. Currently, there are three decoding + strategies supported: "greedy_search", "sampling" and "beam_search". + + Args: + input_ids (Tensor, optional): The input sequence ids for the + generation. It is a Tensor with shape [batch_size, sequence_length]. + The data type should be int32 or int64. Default to None, which + we will initialize it as a Tensor with shape [1, 1], filled + with the value `bos_token_id`. + generation_config (`~generation.GenerationConfig`, *optional*): + The generation configuration to be used as base parametrization for the generation call. `**kwargs` + passed to generate matching the attributes of `generation_config` will override them. If + `generation_config` is not provided, the default will be used, which had the following loading + priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model + configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s + default values, whose documentation should be checked to parameterize generation. + stopping_criteria (`StoppingCriteriaList`, *optional*): + Custom stopping criteria that complement the default stopping criteria built from arguments and a + generation config. If a stopping criteria is passed that is already created with the arguments or a + generation config an error is thrown. This feature is intended for advanced users. + streamer (`~streamer.BaseStreamer`, *optional*): + Streamer object that will be used to stream the generated sequences. Generated tokens are passed + through `streamer.put(token_ids)` and the streamer is responsible for any further processing. + synced_gpus (`bool`, *optional*): + Whether to continue running the while loop until max_length. Unless overridden this flag will be set to + `True` under DeepSpeed ZeRO Stage 3 multiple GPUs environment to avoid hanging if one GPU finished + generating before other GPUs. Otherwise it'll be set to `False`. + kwargs (dict): It can be used to specify additional kwargs + passed to the model. + + Returns: + tuple[Tensor]: It is a tuple contains two elements: ids and scores. + Each element is a Tensor. 
+ + With the fields: + + - ids (Tensor): + The ids of the generated sequences. It is a Tensor with shape + [batch_size * num_return_sequences, sequence_length]. The data + type is same as the input `input_ids`. + - scores (Tensor): + The scores of the generated sequences. It is a Tensor with shape + [batch_size * num_return_sequences, 1]. The data type is float32 + or float64, which is the same as the parameters in the model. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ( + UnifiedTransformerLMHeadModel, + UnifiedTransformerTokenizer + ) + + paddle.seed(2) + + # Initialize the model and tokenizer + model_name_or_path = 'unified_transformer-12L-cn-luge' + model = UnifiedTransformerLMHeadModel.from_pretrained(model_name_or_path) + tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name_or_path) + + # Prepare the model inputs. + history = "早上好,今天空气质量不错。" + inputs = tokenizer.dialogue_encode(history, task_type='chitchat', + add_start_token_as_response=True, return_tensors=True) + + .. code-block:: + + # Generate the sequence by using "greedy_search" strategy + ids, scores = model.generate( + **inputs, + decode_strategy="greedy_search") + print(ids.shape, scores.shape) + # [1, 3] [1, 1] + sequence_ids = ids.cpu().numpy().tolist()[0] + sequence_ids = sequence_ids[:sequence_ids.index(tokenizer.sep_token_id)] + response = tokenizer.convert_ids_to_string(sequence_ids, keep_space=False) + print(response) + # 是的 + + .. code-block:: + + # Generate 2 sequences by using "sampling" strategy (top_k=5) + generation_config = GenerationConfig( + decode_strategy="sampling", + top_k=5, + num_return_sequences=2 + ) + ids, scores = model.generate( + **inputs, + generation_config=generation_config, + ) + print(ids.shape, scores.shape) + # [2, 7] [2, 1] + response = [] + for sequence_ids in ids.cpu().numpy().tolist(): + sequence_ids = sequence_ids[:sequence_ids.index(tokenizer.sep_token_id)] + text = tokenizer.convert_ids_to_string(sequence_ids, keep_space=False) + response.append(text) + print(response) + # ['天气好,心情也好', '你也是'] + + .. code-block:: + + # Generate 2 sequences by using "beam_search" strategy (num_beams=5) + generation_config = GenerationConfig( + decode_strategy="beam_search", + num_beams=5, + num_return_sequences=2 + ) + ids, scores = model.generate( + **inputs, + generation_config=generation_config, + ) + print(ids.shape, scores.shape) + # [2, 3] [2, 1] + response = [] + for sequence_ids in ids.cpu().numpy().tolist(): + sequence_ids = sequence_ids[:sequence_ids.index(tokenizer.sep_token_id)] + text = tokenizer.convert_ids_to_string(sequence_ids, keep_space=False) + response.append(text) + print(response) + # ['是的', '嗯嗯'] + """ + if generation_config is None: + if self.generation_config._from_model_config: + new_generation_config = GenerationConfig.from_model_config(self.config) + if new_generation_config != self.generation_config: + logger.warning( + "model.generation_config is in conflict with model.config, " "model.config is used." 
+ ) + self.generation_config = new_generation_config + generation_config = self.generation_config + + # without update model.generation_config + generation_config = copy.deepcopy(generation_config) + model_kwargs = generation_config.update(**kwargs) + + assert generation_config.decode_strategy in [ + "greedy_search", + "sampling", + "beam_search", + ], "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( + generation_config.decode_strategy + ) + + if getattr(self, "deprecated_warnings", None) is None: + self.deprecated_warnings = {} + + use_fast = False + if "use_faster" in model_kwargs: + raise ValueError("`use_faster` is deprecated now.") + + if "use_fast" in model_kwargs: + raise ValueError("`use_fast` is deprecated now.") + + bos_token_id = ( + generation_config.bos_token_id if generation_config.bos_token_id is not None else self.config.bos_token_id + ) + eos_token_id = ( + generation_config.eos_token_id if generation_config.eos_token_id is not None else self.config.eos_token_id + ) + pad_token_id = ( + generation_config.pad_token_id if generation_config.pad_token_id is not None else self.config.pad_token_id + ) + forced_bos_token_id = ( + generation_config.forced_bos_token_id + if generation_config.forced_bos_token_id is not None + else self.config.forced_bos_token_id + ) + forced_eos_token_id = ( + generation_config.forced_eos_token_id + if generation_config.forced_eos_token_id is not None + else self.config.forced_eos_token_id + ) + decoder_start_token_id = ( + generation_config.decoder_start_token_id + if generation_config.decoder_start_token_id is not None + else self.config.decoder_start_token_id + ) + no_repeat_ngram_size = ( + generation_config.no_repeat_ngram_size + if generation_config.no_repeat_ngram_size is not None + else self.config.no_repeat_ngram_size + ) + + if getattr(self, "_fast_entry", None) is not False and use_fast: + fg_args = locals() + fg_args.pop("self") + fg_args.pop("__class__", None) + model_kwargs = fg_args.pop("model_kwargs") + fg_args.update(model_kwargs) + try: + if getattr(self, "_fast_entry", None) is None: + self._build_fast(fg_args) + if self._fast_entry: + output = self._fast_entry(**fg_args) + if isinstance(output, tuple): + output_ids, dummy_srore = output + else: + output_ids = output + # make result and fast result oneconsistent + dummy_srore = None + if generation_config.decode_strategy == "beam_search": + output_ids = output_ids.transpose([1, 2, 0]) + output_ids = output_ids[:, : generation_config.num_return_sequences, :].reshape( + [-1, output_ids.shape[-1]] + ) + if dummy_srore is not None: + dummy_srore = dummy_srore[:, : generation_config.num_return_sequences].flatten() + else: + output_ids = output_ids.transpose([1, 0]) + return output_ids, dummy_srore + + except Exception as e: + fg_args["model_kwargs"] = model_kwargs + # TODO + # Prevent self._convert_to_fast to throw Exception + self._convert_to_fast(fg_args) + logger.warning(e) + logger.warning("FastGeneration is not available, " "and the original version would be used instead.") + + # input_ids in model_kwargs is supported + if "input_ids" in model_kwargs: + _input_ids = model_kwargs.pop("input_ids") + if input_ids is None: + input_ids = _input_ids + + # params check + if input_ids is None and "inputs_embeds" not in model_kwargs: + # Init `input_ids` with bos_token_id + input_ids = self.prepare_input_ids_for_generation(bos_token_id) + elif "inputs_embeds" in model_kwargs: + # Add input embeds support + input_ids = 
self.prepare_input_ids_for_generation( + bos_token_id, encoder_output=model_kwargs["inputs_embeds"] + ) + + if model_kwargs.get("attention_mask", None) is None: + # TODO + # Init `attention_mask` depending on `pad_token_id` + model_kwargs["attention_mask"] = self.prepare_attention_mask_for_generation( + input_ids, pad_token_id, eos_token_id + ) + self.is_encoder_decoder = self.config.is_encoder_decoder + + if self.is_encoder_decoder: + model_kwargs = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs) + # set input_ids as decoder_input_ids + if "decoder_input_ids" in model_kwargs: + input_ids = model_kwargs.pop("decoder_input_ids") + else: + input_ids = self.prepare_decoder_input_ids_for_generation( + input_ids, decoder_start_token_id, bos_token_id + ) + # streamer + if streamer is not None: + # streamer couldn't support beam_search strategy + if generation_config.decode_strategy == "beam_search" or generation_config.num_beams > 1: + raise ValueError( + "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." + ) + + pad_token_id = self.set_pad_token_id(pad_token_id, eos_token_id) + + if generation_config.max_length != 0 and generation_config.max_new_tokens == DEFAULT_MAX_NEW_TOKENS: + logger.warning("`max_length` will be deprecated in future releases, use `max_new_tokens` instead.") + generation_config.max_new_tokens = generation_config.max_length + + if generation_config.min_length != 0 and generation_config.min_new_tokens == 0: + logger.warning("`min_length` will be deprecated in future releases, use `min_new_tokens` instead.") + generation_config.min_new_tokens = generation_config.min_length + + max_length = generation_config.max_new_tokens + min_length = generation_config.min_new_tokens + + input_len = input_ids.shape[-1] + min_len = input_len + min_length + max_len = input_len + max_length + + logits_processors = self.get_logits_processor( + min_length=min_len if min_length > 0 else None, + max_length=max_len, + eos_token_id=eos_token_id, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + num_beams=generation_config.num_beams, + num_beam_groups=generation_config.num_beam_groups, + diversity_rate=generation_config.diversity_rate, + repetition_penalty=generation_config.repetition_penalty, + no_repeat_ngram_size=generation_config.no_repeat_ngram_size, + logits_processors=model_kwargs["logits_processors"] + if "logits_processors" in model_kwargs + and isinstance(model_kwargs["logits_processors"], LogitsProcessorList) + else None, + ) + if "logits_processors" in model_kwargs: + model_kwargs.pop("logits_processors") + + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + + if generation_config.decode_strategy == "greedy_search": + if generation_config.num_return_sequences > 1: + raise ValueError( + "`num_return_sequences` has to be 1, but is {} " + "when doing greedy search.".format(generation_config.num_return_sequences) + ) + return self.greedy_search( + input_ids, + logits_processors, + max_len, + pad_token_id, + eos_token_id, + stopping_criteria=stopping_criteria, + streamer=streamer, + fast_ptq_sampling=generation_config.fast_ptq_sampling, + trunc_input=generation_config.trunc_input, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif generation_config.decode_strategy == "sampling": + if generation_config.num_return_sequences > 1: + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, 
expand_size=generation_config.num_return_sequences, **model_kwargs + ) + + return self.sample( + input_ids, + logits_processors, + max_len, + pad_token_id, + eos_token_id, + generation_config.top_k, + generation_config.top_p, + generation_config.temperature, + stopping_criteria=stopping_criteria, + streamer=streamer, + fast_ptq_sampling=generation_config.fast_ptq_sampling, + trunc_input=generation_config.trunc_input, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif generation_config.decode_strategy == "beam_search": + batch_size = input_ids.shape[0] + if generation_config.num_return_sequences > generation_config.num_beams: + raise ValueError( + "`num_return_sequences` has to be smaller or equal to " + "`num_beams`. But received `num_return_sequences` is {}, " + "`num_beams` is {}".format(generation_config.num_return_sequences, generation_config.num_beams) + ) + if generation_config.num_beams <= 1: + raise ValueError( + "`num_beams` has to be bigger than 1. But received " + "`num_beams` is {}. If `num_beams` is 1, `decode_strategy` " + "should be 'greedy_search'".format(generation_config.num_beams) + ) + if generation_config.num_beam_groups > 1: + diverse_beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=max_len, + num_beams=generation_config.num_beams, + length_penalty=generation_config.length_penalty, + do_early_stopping=generation_config.early_stopping, + num_beam_hyps_to_keep=generation_config.num_return_sequences, + num_beam_groups=generation_config.num_beam_groups, + ) + + # interleave with `num_beams` + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, expand_size=generation_config.num_beams, **model_kwargs + ) + + return self.group_beam_search( + input_ids, + diverse_beam_scorer, + logits_processors, + max_len, + pad_token_id, + eos_token_id, + stopping_criteria=stopping_criteria, + fast_ptq_sampling=generation_config.fast_ptq_sampling, + trunc_input=generation_config.trunc_input, + synced_gpus=synced_gpus, + **model_kwargs, + ) + else: + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=max_len, + num_beams=generation_config.num_beams, + length_penalty=generation_config.length_penalty, + do_early_stopping=generation_config.early_stopping, + num_beam_hyps_to_keep=generation_config.num_return_sequences, + ) + + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, expand_size=generation_config.num_beams, **model_kwargs + ) + + return self.beam_search( + input_ids, + beam_scorer, + logits_processors, + max_len, + generation_config.diversity_rate, + pad_token_id, + eos_token_id, + stopping_criteria=stopping_criteria, + fast_ptq_sampling=generation_config.fast_ptq_sampling, + trunc_input=generation_config.trunc_input, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + def greedy_search( + self, + input_ids, + logits_processors, + max_length, + pad_token_id, + eos_token_id, + stopping_criteria=None, + streamer=None, + fast_ptq_sampling=False, + trunc_input=True, + synced_gpus=False, + **model_kwargs + ): + model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) + logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + + # max_length will be convert to MaxLengthCriteria + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + # logger.warning( + # "`max_length` is deprecated in this function, use" + # " 
`stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." + # ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + + batch_size, cur_len = input_ids.shape + origin_len = cur_len + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) + generate_end = False + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = paddle.to_tensor(0.0 if generate_end else 1.0) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs & get model output + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self(**model_inputs) + + if synced_gpus and generate_end: + continue # don't waste resources running the code we don't need + + if isinstance(outputs, tuple): + logits = outputs[0] + elif isinstance(outputs, ModelOutput): + logits = outputs.logits + else: + logits = outputs + + # [batch_size, vocab_size] + next_token_logits = logits[:, -1, :] + + # pre-process distribution + next_token_logits = self.adjust_logits_during_generation(next_token_logits) + probs = logits_processors(input_ids, next_token_logits) + # greedy + next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1) + next_scores = paddle.index_sample(probs, next_tokens) + + if eos_token_id is not None: + next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) + + scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) + cur_len += 1 + + input_ids = paddle.concat([input_ids, next_tokens], axis=1) + if streamer is not None: + if self.config.tensor_parallel_rank == 0: + streamer.put(next_tokens.cpu()) + + if stopping_criteria(input_ids, scores): + generate_end = True + + if eos_token_id is not None: + unfinished_flag = get_unfinished_flag(input_ids, unfinished_flag, eos_token_id) + if not paddle.any(unfinished_flag): + generate_end = True + + # Stop when there is a in all sentences + if generate_end and not synced_gpus: + break + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if fast_ptq_sampling: + break + + if streamer is not None: + streamer.end() + + return input_ids[:, origin_len:] if trunc_input else input_ids, scores + + def sample( + self, + input_ids, + logits_processors, + max_length, + pad_token_id, + eos_token_id, + top_k=None, + top_p=None, + temperature=None, + min_tokens_to_keep=1, + stopping_criteria=None, + streamer=None, + fast_ptq_sampling=False, + trunc_input=True, + synced_gpus=False, + **model_kwargs + ): + model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) + + logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + + # max_length will be convert to MaxLengthCriteria + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + # logger.warning( + # "`max_length` is deprecated in this function, use" + # " 
`stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." + # ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + + batch_size, cur_len = input_ids.shape + origin_len = cur_len + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) + + generate_end = False + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = paddle.to_tensor(0.0 if generate_end else 1.0) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + # prepare model inputs & get model output + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + # NOTE: to decrease ref-count and clear outdate cache in-time + model_kwargs["cache"] = None + model_kwargs["past_key_values"] = None + outputs = self(**model_inputs) + if synced_gpus and generate_end: + continue # don't waste resources running the code we don't need + + if isinstance(outputs, tuple): + logits = outputs[0] + elif isinstance(outputs, ModelOutput): + logits = outputs.logits + else: + logits = outputs + + # [batch_size, vocab_size] + logits = logits[:, -1, :] + + # pre-process distribution + logits = self.adjust_logits_during_generation(logits) + logits = logits_processors(input_ids, logits) + + # sample + origin_probs = F.softmax(logits) + origin_probs = paddle.log(origin_probs) + if temperature is not None and temperature != 1.0: + logits = logits / temperature + probs = F.softmax(logits) + if top_k is not None and top_k != 0: + probs = TopKProcess(probs, top_k, min_tokens_to_keep) + if top_p is not None and top_p < 1.0: + probs = TopPProcess(probs, top_p, min_tokens_to_keep) + if paddle.device.is_compiled_with_custom_device("gcu"): + probs = paddle.cast(probs, "float32") + if paddle.device.is_compiled_with_xpu(): + probs = paddle.cast(probs, "float32") + + # multinomial already support fp16 and bf16 currently, fix issue: https://github.com/PaddlePaddle/Paddle/issues/51852 + next_tokens = paddle.multinomial(probs) + + if self.config.tensor_parallel_degree > 1: + # Maybe no need to broadcast if seed is set correclty. + from paddle.distributed import fleet + + try: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + src = hcg.get_model_parallel_group_src_rank() + except: + group, src = None, 0 + paddle.distributed.broadcast(next_tokens, src=src, group=group) + # config does not include pipeline_parallel_degree, and pipeline parallel + # uses trainer.model_wrapped to run in both train and predict mode + # which has pp_group as a attribute + # TODO(guosheng): only let the last stage of pipeline to do softmax + # and sampling, and then broadcast to avoid broadcast logits. 
+ if getattr(self, "pp_group", None) is not None: + paddle.distributed.broadcast( + next_tokens, src=self.pp_group.ranks[0], group=self.pp_group # use rank 0 for same seed to check + ) + + next_scores = paddle.index_sample(origin_probs, next_tokens) + if eos_token_id is not None: + next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) + + scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) + + cur_len += 1 + input_ids = paddle.concat([input_ids, next_tokens], axis=1) + if streamer is not None: + if self.config.tensor_parallel_rank == 0: + streamer.put(next_tokens.cpu()) + + if stopping_criteria(input_ids, scores): + generate_end = True + + if eos_token_id is not None: + unfinished_flag = get_unfinished_flag(input_ids, unfinished_flag, eos_token_id) + if not paddle.any(unfinished_flag): + generate_end = True + + # Stop when there is a in all sentences + if generate_end and not synced_gpus: + break + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder + ) + if fast_ptq_sampling: + break + + if streamer is not None: + streamer.end() + + return input_ids[:, origin_len:] if trunc_input else input_ids, scores + + def _get_model_inputs_spec(self, dtype: str): + spec = { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + if "position_ids" in inspect.getfullargspec(self.forward).args: + spec["position_ids"] = paddle.static.InputSpec(shape=[None, None], dtype="int64") + return spec + + def to_static(self, path: str, config: dict): + """export generation model to static + + Args: + path (str): path of saved inference model + config (dict): configuration for generation + bos_token_id (int): token id of begin-of-sentence + eos_token_id (int): token id of end-of-sentence + pad_token_id (int): token id of pad token + use_top_p (bool): whether use top_p decoding strategy + """ + + use_top_p = config.get("use_top_p", True) + + top_k_spec = paddle.static.InputSpec(shape=[1], dtype="int64") if not use_top_p else 0 + + top_p_spec = paddle.static.InputSpec(shape=[1], dtype="float32") if use_top_p else 1.0 + temperature = paddle.static.InputSpec(shape=[1], dtype="float32") if use_top_p else 1.0 + dtype = config.get("dtype", None) + + logits_processors = config.get("logits_processors", None) + model_inputs_spec = self._get_model_inputs_spec(dtype) + + input_spec = [ + model_inputs_spec["input_ids"], # input_ids + model_inputs_spec["attention_mask"], # attention_mask + model_inputs_spec.get("position_ids", None), # attention_mask + logits_processors, + paddle.static.InputSpec(shape=[1], dtype="int64"), # max_length + self.generation_config.pad_token_id or config.get("pad_token_id", None), + self.generation_config.eos_token_id or config.get("eos_token_id", None), + top_k_spec, # top_k + top_p_spec, # top_p + temperature, # temperature + 1, + ] + + model = paddle.jit.to_static(self.sample_d2s, input_spec=input_spec) + + paddle.jit.save(model, path) + + def sample_d2s( + self, + input_ids, + attention_mask, + position_ids, + logits_processors, + max_new_tokens, + pad_token_id, + eos_token_id, + top_k=None, + top_p=None, + temperature=None, + min_tokens_to_keep=1, + ): + + pad_token_id = self.set_pad_token_id(pad_token_id, eos_token_id) + logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + + 
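# NOTE (editorial annotation, not part of the original patch): when sample_d2s is traced by
# paddle.jit.to_static (see to_static above), exactly one of top_k / top_p arrives as a static
# InputSpec tensor and the other as a plain Python constant; for example, use_top_p=True passes
# top_k=0 and a [1]-shaped float32 InputSpec for top_p. The paddle.is_tensor checks below
# therefore resolve the decoding branch, top-p versus top-k, at trace time.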
if paddle.is_tensor(top_k) and not paddle.is_tensor(top_p): + use_top_p = False + elif not paddle.is_tensor(top_k) and paddle.is_tensor(top_p): + use_top_p = True + + # top_k and top_p are the const value + elif isinstance(top_p, float) or isinstance(top_k, int): + use_top_p = True + else: + if top_p is None and top_k is None: + raise ValueError("top_k and top_p should not be None") + raise ValueError( + "you should not specify InputSpec for top_k and top_p parameters, one of InputSpec is expected" + ) + + batch_size, cur_len = input_ids.shape + # used for compute on gpu, avoid memcpy D2H + cur_len_gpu = paddle.full([1], cur_len, dtype="int64") + + origin_len = input_ids.shape[1] + # used for compute on gpu, avoid memcpy D2H + origin_len_gpu = paddle.full([1], origin_len, dtype="int64") + + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + + scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) + + # use_cache is immutable, we split it off other mutable kwargs. + immutable = {"use_cache": True} + model_kwargs = {"attention_mask": attention_mask, "position_ids": position_ids} + + def _forward_(**args): + model_inputs = self.prepare_inputs_for_generation(input_ids, **args, **immutable) + assert "use_cache" in model_inputs + del model_inputs["use_cache"] + return self(**model_inputs, **immutable) + + def _post_process_( + outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs, pad_token_id + ): + if isinstance(outputs, tuple): + logits = outputs[0] + elif isinstance(outputs, ModelOutput): + logits = outputs.logits + else: + logits = outputs + + # [batch_size, vocab_size] + logits = logits[:, -1, :] + + # pre-process distribution + logits = self.adjust_logits_during_generation(logits) + + logits = logits_processors(input_ids, logits) + probs = F.softmax(logits) + + # sample + origin_probs = F.log_softmax(logits) + # compute next_tokens + if use_top_p: + logits = logits / temperature + top_ps_tensor = paddle.full(shape=[probs.shape[0], 1], fill_value=top_p, dtype=probs.dtype) + _, next_tokens = paddle.tensor.top_p_sampling(probs, top_ps_tensor) + else: + probs = TopKProcess(probs, top_k, min_tokens_to_keep) + if top_k == 1: + next_tokens = paddle.unsqueeze_(paddle.argmax(probs, axis=-1), -1) + else: + next_tokens = paddle.multinomial(probs) + + next_scores = paddle.index_sample(origin_probs, next_tokens) + scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) + if eos_token_id is not None: + next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) + + input_ids = paddle.concat([input_ids, next_tokens], axis=1) + + if eos_token_id is not None: + unfinished_flag = get_unfinished_flag(input_ids, unfinished_flag, eos_token_id) + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + + return input_ids, scores, unfinished_flag, model_kwargs + + outputs = _forward_(**model_kwargs) + input_ids, scores, unfinished_flag, model_kwargs = _post_process_( + outputs, input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs, pad_token_id + ) + + if hasattr(paddle.framework, "_no_check_dy2st_diff"): + # TODO(daisiming): _no_check_dy2st_diff is used to turn off the checking of behavior + # inconsistency between dynamic graph and static graph. _no_check_dy2st_diff should be + # removed after static graphs support inplace and stride. 
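# NOTE (editorial annotation, not part of the original patch): the single _forward_/_post_process_
# call above acts as a warm-up step, decoding the first token eagerly so the cache exists before
# the while-loop further down. The length counter is deliberately kept twice: cur_len for loop
# control on the host and cur_len_gpu as a device tensor used inside _post_process_, so updating
# the scores does not force a device-to-host copy on every step; both are incremented together below.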
+ with paddle.framework._no_check_dy2st_diff(): + paddle.increment(cur_len) + paddle.increment(cur_len_gpu) + else: + paddle.increment(cur_len) + paddle.increment(cur_len_gpu) + + attn_mask = model_kwargs["attention_mask"] + # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. + model_kwargs["attention_mask"] = paddle.reshape(attn_mask, attn_mask.shape) + model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else None + max_new_tokens = paddle.full([1], max_new_tokens + cur_len - 1, dtype="int64") + + if hasattr(paddle.framework, "_no_check_dy2st_diff"): + # TODO(daisiming): _no_check_dy2st_diff is used to turn off the checking of behavior + # inconsistency between dynamic graph and static graph. _no_check_dy2st_diff should be + # removed after static graphs support inplace and stride. + with paddle.framework._no_check_dy2st_diff(): + while cur_len < max_new_tokens and paddle.any(unfinished_flag): + input_ids, scores, unfinished_flag, model_kwargs = _post_process_( + _forward_(**model_kwargs), + input_ids, + cur_len_gpu, + origin_len_gpu, + scores, + unfinished_flag, + model_kwargs, + pad_token_id, + ) + paddle.increment(cur_len) + paddle.increment(cur_len_gpu) + else: + while cur_len < max_new_tokens and paddle.any(unfinished_flag): + input_ids, scores, unfinished_flag, model_kwargs = _post_process_( + _forward_(**model_kwargs), + input_ids, + cur_len_gpu, + origin_len_gpu, + scores, + unfinished_flag, + model_kwargs, + pad_token_id, + ) + paddle.increment(cur_len) + paddle.increment(cur_len_gpu) + + return input_ids[:, origin_len:], scores + + def reorder_cache(self, cache, beam_idx): + cache = map_structure(lambda x: paddle.index_select(x, beam_idx), cache) + return cache + + def beam_search( + self, + input_ids, + beam_scorer, + logits_processors, + max_length, + diversity_rate, + pad_token_id, + eos_token_id, + stopping_criteria=None, + fast_ptq_sampling=False, + trunc_input=True, + synced_gpus=False, + **model_kwargs + ): + model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) + + logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + + # max_length will be convert to MaxLengthCriteria + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + # logger.warning( + # "`max_length` is deprecated in this function, use" + # " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." + # ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + batch_beam_size, cur_len = input_ids.shape + origin_len = cur_len + + assert ( + num_beams * batch_size == batch_beam_size + ), "Batch dimension of `input_ids` should be {}, but received {}.".format( + num_beams * batch_size, batch_beam_size + ) + + beam_scores = paddle.zeros((batch_size, num_beams), dtype=paddle.get_default_dtype()) + + beam_scores[:, 1:] = get_scale_by_dtype(return_positive=False) + beam_scores = paddle.reshape(beam_scores, [-1]) + + generate_end = False + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. 
+ # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = paddle.to_tensor(0.0 if generate_end else 1.0) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + # prepare model inputs & get model output + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self(**model_inputs) + if synced_gpus and generate_end: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + if isinstance(outputs, tuple): + logits = outputs[0] + elif isinstance(outputs, ModelOutput): + logits = outputs.logits + else: + logits = outputs + + # [batch_size, vocab_size] + logits = logits[:, -1, :] + + # pre-process distribution + logits = self.adjust_logits_during_generation(logits) + # beam search + # [batch_size * num_beams, vocab_size] + next_scores = F.softmax(logits) + next_scores = paddle.log(next_scores) + next_scores = logits_processors(input_ids, next_scores) + next_scores = next_scores + beam_scores.unsqueeze(-1) + + vocab_size = next_scores.shape[-1] + if diversity_rate == 0.0: + # reshape for beam search + next_scores = next_scores.reshape([batch_size, num_beams * vocab_size]) + + next_scores, next_tokens = paddle.topk(next_scores, 2 * num_beams, axis=1) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + + else: + next_scores, next_tokens = paddle.topk(next_scores, 2 * num_beams, axis=1) + + sibling_score = paddle.arange(1, 2 * num_beams + 1, dtype="int64").unsqueeze(0) * diversity_rate + + diversed_score = next_scores - sibling_score + + next_scores = next_scores.reshape([batch_size, 2 * num_beams * num_beams]) + next_tokens = next_tokens.reshape([batch_size, 2 * num_beams * num_beams]) + + diversed_score = diversed_score.reshape([batch_size, 2 * num_beams * num_beams]) + diversed_score, diversed_tokens = paddle.topk(diversed_score, 2 * num_beams, axis=1) + + # TODO + # Use gather_nd() to select origan token and score + next_scores = paddle.stack( + [paddle.index_select(next_scores[i], diversed_tokens[i]) for i in range(next_scores.shape[0])] + ) + next_tokens = paddle.stack( + [paddle.index_select(next_tokens[i], diversed_tokens[i]) for i in range(next_tokens.shape[0])] + ) + + next_indices = diversed_tokens // (2 * num_beams) + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_scores, + next_tokens, + next_indices, + origin_len=origin_len, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + # beam_idx may contain element -1 and cause error + # PR: https://github.com/PaddlePaddle/Paddle/issues/57366 + beam_idx = paddle.maximum(beam_idx, paddle.full_like(beam_idx, 0)) + + cur_len += 1 + input_ids = paddle.concat( + [paddle.index_select(input_ids, beam_idx), beam_next_tokens.unsqueeze(-1)], axis=-1 + ) + + if beam_scorer.is_done or stopping_criteria(input_ids, beam_scores): + if not synced_gpus: + break + else: + generate_end = True + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder + ) + if "cache" in model_kwargs: + # reorder the cache + model_kwargs["cache"] = self.reorder_cache(model_kwargs["cache"], beam_idx) 
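# NOTE (editorial annotation, not part of the original patch): next_beam_indices (beam_idx)
# records, for each surviving beam slot, which old beam it continues from; e.g. with
# num_beams = 3, a beam_idx of [1, 1, 0] means the first two slots both extend old beam 1.
# The same indices must be applied to everything carried between steps: input_ids above via
# index_select, and the cached key/value tensors here via reorder_cache, so every beam keeps
# a cache that matches its own prefix.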
+ if "past_key_values" in model_kwargs: + # reorder the cache + model_kwargs["past_key_values"] = self.reorder_cache(model_kwargs["past_key_values"], beam_idx) + if fast_ptq_sampling: + break + + pred_ids, scores = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + origin_len=origin_len, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + return pred_ids[:, origin_len:] if trunc_input else input_ids, scores + + def group_beam_search( + self, + input_ids, + beam_scorer, + logits_processors, + max_length, + pad_token_id, + eos_token_id, + stopping_criteria=None, + fast_ptq_sampling=False, + trunc_input=True, + synced_gpus=False, + **model_kwargs + ): + model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) + logits_processors = logits_processors if logits_processors is not None else LogitsProcessorList() + + # max_length will be convert to MaxLengthCriteria + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + # logger.warning( + # "`max_length` is deprecated in this function, use" + # " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead." + # ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + num_beam_groups = beam_scorer.num_beam_groups + num_sub_beams = num_beams // num_beam_groups + + batch_beam_size, cur_len = input_ids.shape + origin_len = cur_len + + assert ( + num_beams * batch_size == batch_beam_size + ), "Batch dimension of `input_ids` should be {}, but received {}.".format( + num_beams * batch_size, batch_beam_size + ) + + beam_scores = paddle.full((batch_size, num_beams), get_scale_by_dtype(return_positive=False), dtype="float32") + # initialise score of first beam of each group with 0 and the rest with 1e-9. This ensures that the beams in + # the same group don't produce same tokens everytime. + beam_scores[:, ::num_sub_beams] = 0 + beam_scores = paddle.reshape(beam_scores, [-1]) + + generate_end = False + while True: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = paddle.to_tensor(0.0 if generate_end else 1.0) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + # predicted tokens in cur_len step + current_tokens = paddle.zeros(shape=[batch_size * num_beams], dtype=input_ids.dtype) + + # indices which will form the beams in the next time step + reordering_indices = paddle.zeros(shape=[batch_size * num_beams], dtype="int64") + # prepare model inputs & get model output + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + outputs = self(**model_inputs) + if synced_gpus and generate_end: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + for beam_group_idx in range(num_beam_groups): + group_start_idx = beam_group_idx * num_sub_beams + group_end_idx = min(group_start_idx + num_sub_beams, num_beams) + group_size = group_end_idx - group_start_idx + + # indices of beams of current group among all sentences in batch + batch_group_indices = [] + + for batch_idx in range(batch_size): + batch_group_indices.extend( + [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)] + ) + + group_input_ids = input_ids[batch_group_indices] + + if isinstance(outputs, tuple): + logits = outputs[0] + elif isinstance(outputs, ModelOutput): + logits = outputs.logits + else: + logits = outputs + + logits = logits[:, -1, :] + logits = paddle.index_select(logits, paddle.to_tensor(batch_group_indices)) + logits = self.adjust_logits_during_generation(logits) + + next_scores = F.softmax(logits) + next_scores = paddle.log(next_scores) + vocab_size = next_scores.shape[-1] + + next_scores = logits_processors( + group_input_ids, next_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx + ) + + next_scores = next_scores + beam_scores[batch_group_indices].unsqueeze(-1) + + # reshape for beam search + next_scores = next_scores.reshape([batch_size, group_size * vocab_size]) + + next_scores, next_tokens = paddle.topk(next_scores, 2 * group_size, axis=1) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + + beam_outputs = beam_scorer.process( + group_input_ids, + next_scores, + next_tokens, + next_indices, + origin_len=origin_len, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + + beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + # beam_idx may contain element -1 and cause error + # PR: https://github.com/PaddlePaddle/Paddle/issues/57366 + beam_idx = paddle.maximum(beam_idx, paddle.full_like(beam_idx, 0)) + + input_ids[batch_group_indices] = group_input_ids[beam_idx] + group_input_ids = paddle.concat( + [paddle.index_select(group_input_ids, index=beam_idx), beam_next_tokens.unsqueeze(-1)], axis=-1 + ) + current_tokens[batch_group_indices] = beam_next_tokens + + reordering_indices[batch_group_indices] = ( + num_beams * (beam_idx // group_size) + group_start_idx + (beam_idx % group_size) + ) + + input_ids = paddle.concat([input_ids, current_tokens.unsqueeze(-1)], axis=-1) + + cur_len += 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, beam_scores): + if not synced_gpus: + break + else: + generate_end = True + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder + ) + + if "cache" in model_kwargs: + # reorder the cache + model_kwargs["cache"] = self.reorder_cache(model_kwargs["cache"], reordering_indices) + if "past_key_values" in model_kwargs: + # 
reorder the cache + model_kwargs["past_key_values"] = self.reorder_cache( + model_kwargs["past_key_values"], reordering_indices + ) + + if fast_ptq_sampling: + break + + pred_ids, scores = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + origin_len=origin_len, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + return pred_ids[:, origin_len:] if trunc_input else input_ids, scores diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/__init__.py new file mode 100644 index 000000000..13aec34a9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .crf import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder +from .globalpointer import ( + GlobalPointerForEntityExtraction, + GPLinkerForEventExtraction, + GPLinkerForRelationExtraction, +) +from .linear import Linear +from .sequence import sequence_mask +from .tcn import TCN, TemporalBlock diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/crf.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/crf.py new file mode 100644 index 000000000..a07c0e2f7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/crf.py @@ -0,0 +1,417 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +from ..utils.log import logger +from .sequence import sequence_mask + +__all__ = ["LinearChainCrf", "LinearChainCrfLoss", "ViterbiDecoder"] + + +def log_sum_exp(vec, dim=0): + # Avoid underflow and overflow + max_num = paddle.max(vec, dim) + max_exp = max_num.unsqueeze(-1) + return max_num + paddle.log(paddle.sum(paddle.exp(vec - max_exp), dim)) + + +class LinearChainCrf(nn.Layer): + """ + LinearChainCrf is a linear chain Conditional Random Field layer, it can implement sequential dependencies in the predictions. + Therefore, it can take context into account whereas a classifier predicts a label for a single sample without considering "neighboring" samples. + See https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers for reference. + + Args: + num_labels (int): + The label number. + crf_lr (float, optional): + The crf layer learning rate. 
Defaults to ``0.1``. + with_start_stop_tag (bool, optional): + If set to True, the start tag and stop tag will be considered, the transitions params will be a tensor with a shape of `[num_labels+2, num_labels+2]`. + Else, the transitions params will be a tensor with a shape of `[num_labels, num_labels]`. + """ + + def __init__(self, num_labels, crf_lr=0.1, with_start_stop_tag=True): + super(LinearChainCrf, self).__init__() + if with_start_stop_tag: + self.num_tags = num_labels + 2 # Additional [START] and [STOP] + self.start_idx = int(self.num_tags - 1) + self.stop_idx = int(self.num_tags - 2) + else: + self.num_tags = num_labels + + self.transitions = self.create_parameter( + attr=paddle.ParamAttr(learning_rate=crf_lr), shape=[self.num_tags, self.num_tags], dtype="float32" + ) + self.with_start_stop_tag = with_start_stop_tag + + self._initial_alpha = None + self._start_tensor = None + self._stop_tensor = None + self._batch_index = None + self._seq_index = None + self._batch_seq_index = None + + def _initialize_alpha(self, batch_size): + # alpha accumulate the path value to get the different next tag + if self._initial_alpha is None or batch_size > self._initial_alpha.shape[0]: + # Initialized by a small value. + initial_alpha = paddle.full((batch_size, self.num_tags - 1), dtype="float32", fill_value=-10000.0) + # alpha_start fill_value = 0. > -10000., means the first one step START gets the most score. + alpha_start = paddle.full((batch_size, 1), dtype="float32", fill_value=0.0) + self._initial_alpha = paddle.concat([initial_alpha, alpha_start], axis=1) + return self._initial_alpha[:batch_size, :] + + def forward(self, inputs, lengths): + """ + Computes the normalization in a linear-chain CRF. See http://www.cs.columbia.edu/~mcollins/fb.pdf for reference. + + .. math:: + F & = logZ(x) = log\\sum_y exp(score(x,y)) + + score(x,y) & = \\sum_i Emit(x_i,y_i) + Trans(y_{i-1}, y_i) + + p(y_i) & = Emit(x_i,y_i), T(y_{i-1}, y_i) = Trans(y_{i-1}, y_i) + + then we can get: + + .. math:: + F(1) = log\\sum_{y1} exp(p(y_1) + T([START], y1)) + + .. math:: + F(2) & = log\\sum_{y1}\\sum_{y2} exp(p(y_1) + T([START], y1) + p(y_2) + T(y_1,y_2)) \\\\ + & = log\\sum_{y2} exp(F(1) + p(y_2) + T(y_1,y_2)) + + Further, We can get F(n) is a recursive formula with F(n-1). + + Args: + inputs (Tensor): + The input predicted tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`. + lengths (Tensor): + The input length. Its dtype is int64 and has a shape of `[batch_size]`. + + Returns: + Tensor: Returns the normalizers tensor `norm_score`. Its dtype is float32 and has a shape of `[batch_size]`. 
+ """ + batch_size, seq_len, n_labels = inputs.shape + inputs_t_exp = inputs.transpose([1, 0, 2]).unsqueeze(-1) + # trans_exp: batch_size, num_tags, num_tags + trans_exp = self.transitions.unsqueeze(0) + + all_alpha = [] + if self.with_start_stop_tag: + alpha = self._initialize_alpha(batch_size) + + for i, input_exp in enumerate(inputs_t_exp): + # input_exp: batch_size, num_tags, num_tags + # alpha_exp: batch_size, num_tags, num_tags + if i == 0 and not self.with_start_stop_tag: + alpha = inputs[:, 0] + else: + alpha_exp = alpha.unsqueeze(1) + # F(n) = logsumexp(F(n-1) + p(y_n) + T(y_{n-1}, y_n)) + mat = input_exp + trans_exp + alpha_exp + alpha = log_sum_exp(mat, 2).squeeze(-1) + all_alpha.append(alpha) + + # Get the valid alpha + all_alpha = paddle.stack(all_alpha).transpose([1, 0, 2]) + batch_index = self._get_batch_index(batch_size) + last_index = lengths - 1 + idxs = paddle.stack([batch_index, last_index], axis=1) + alpha = paddle.gather_nd(all_alpha, idxs) + + if self.with_start_stop_tag: + # The last one step + alpha += self.transitions[self.stop_idx].unsqueeze(0) + norm_score = log_sum_exp(alpha, 1) # .squeeze(-1) + return norm_score + + def gold_score(self, inputs, labels, lengths): + """ + Computes the unnormalized score for a tag sequence. + $$ score(x,y) = \\sum_i Emit(x_i,y_i) + Trans(y_{i-1}, y_i) $$ + + Args: + inputs (Tensor): + The input predicted tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`. + labels (Tensor): + The input label tensor. Its dtype is int64 and has a shape of `[batch_size, sequence_length]` + lengths (Tensor): + The input length. Its dtype is int64 and has a shape of `[batch_size]`. + + Returns: + Tensor: Returns the unnormalized sequence scores tensor `unnorm_score`. Its dtype is float32 and has a shape of `[batch_size]`. + """ + unnorm_score = self._point_score(inputs, labels, lengths) + self._trans_score(labels, lengths) + return unnorm_score + + def _point_score(self, inputs, labels, lengths): + batch_size, seq_len, n_labels = inputs.shape + # Get the true label logit value + flattened_inputs = inputs.reshape([-1]) + offsets = paddle.unsqueeze(self._get_batch_index(batch_size) * seq_len * n_labels, 1) + offsets += paddle.unsqueeze(self._get_seq_index(seq_len) * n_labels, 0) + flattened_tag_indices = paddle.reshape(offsets + labels.astype(offsets.dtype), [-1]) + + scores = paddle.gather(flattened_inputs, flattened_tag_indices).reshape([batch_size, seq_len]) + + mask = paddle.cast(sequence_mask(self._get_batch_seq_index(batch_size, seq_len), lengths), "float32") + mask = mask[:, :seq_len] + + mask_scores = scores * mask + score = paddle.sum(mask_scores, 1) + return score + + def _trans_score(self, labels, lengths): + batch_size, seq_len = labels.shape + + if self.with_start_stop_tag: + # Add START and STOP on either side of the labels + start_tensor, stop_tensor = self._get_start_stop_tensor(batch_size) + labels_ext = paddle.concat([start_tensor, labels, stop_tensor], axis=1) + mask = paddle.cast(sequence_mask(self._get_batch_seq_index(batch_size, seq_len), lengths + 1), "int64") + pad_stop = paddle.full((batch_size, seq_len + 2), dtype="int64", fill_value=self.stop_idx) + labels_ext = (1 - mask) * pad_stop + mask * labels_ext + else: + mask = paddle.cast(sequence_mask(self._get_batch_seq_index(batch_size, seq_len), lengths), "int64") + labels_ext = labels + + start_tag_indices = labels_ext[:, :-1] + stop_tag_indices = labels_ext[:, 1:] + + # Encode the indices in a flattened representation. 
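# NOTE (editorial worked example, not part of the original patch): each
# (previous_tag, current_tag) pair is mapped to a single offset into the flattened
# transition matrix:
#     flat_index = previous_tag * num_tags + current_tag
# e.g. with num_tags = 4, the transition 2 -> 3 maps to 2 * 4 + 3 = 11, so the gather on
# paddle.flatten(self.transitions) below picks out transitions[2][3] for that step.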
+ transition_indices = start_tag_indices * self.num_tags + stop_tag_indices + flattened_transition_indices = transition_indices.reshape([-1]) + flattened_transition_params = paddle.flatten(self.transitions) + scores = paddle.gather(flattened_transition_params, flattened_transition_indices).reshape([batch_size, -1]) + mask_scores = scores * mask[:, 1:].astype(scores.dtype) + + # Accumulate the transition score + score = paddle.sum(mask_scores, 1) + + return score + + def _get_start_stop_tensor(self, batch_size): + if self._start_tensor is None or self._stop_tensor is None or batch_size != self._start_tensor.shape[0]: + self._start_tensor = paddle.full((batch_size, 1), dtype="int64", fill_value=self.start_idx) + self._stop_tensor = paddle.full((batch_size, 1), dtype="int64", fill_value=self.stop_idx) + return self._start_tensor, self._stop_tensor + + def _get_batch_index(self, batch_size): + if self._batch_index is None or batch_size != self._batch_index.shape[0]: + self._batch_index = paddle.arange(end=batch_size, dtype="int64") + return self._batch_index + + def _get_seq_index(self, length): + if self._seq_index is None or length > self._seq_index.shape[0]: + self._seq_index = paddle.arange(end=length, dtype="int64") + return self._seq_index[:length] + + def _get_batch_seq_index(self, batch_size, length): + if ( + self._batch_seq_index is None + or length + 2 > self._batch_seq_index.shape[1] + or batch_size > self._batch_seq_index.shape[0] + ): + self._batch_seq_index = paddle.cumsum(paddle.ones([batch_size, length + 2], "int64"), axis=1) - 1 + if self.with_start_stop_tag: + return self._batch_seq_index[:batch_size, : length + 2] + else: + return self._batch_seq_index[:batch_size, :length] + + +class LinearChainCrfLoss(nn.Layer): + """ + The negative log-likelihood for linear chain Conditional Random Field (CRF). + + Args: + crf (LinearChainCrf): + The `LinearChainCrf` network object. Its parameter will be used to calculate the loss. + """ + + def __init__(self, crf): + super(LinearChainCrfLoss, self).__init__() + self.crf = crf + if isinstance(crf, paddle.Tensor): + raise ValueError( + "From paddlenlp >= 2.0.0b4, the first param of LinearChainCrfLoss shoule be a LinearChainCrf object. For input parameter 'crf.transitions', you can remove '.transitions' to 'crf'" + ) + + def forward(self, inputs, lengths, labels, old_version_labels=None): + """ + Calculate the crf loss. Let $$ Z(x) = \\sum_{y'}exp(score(x,y')) $$, means the sum of all path scores, + then we have $$ loss = -logp(y|x) = -log(exp(score(x,y))/Z(x)) = -score(x,y) + logZ(x) $$ + + Args: + inputs (Tensor): + The input predicted tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`. + lengths (Tensor): + The input length. Its dtype is int64 and has a shape of `[batch_size]`. + labels (Tensor) : + The input label tensor. Its dtype is int64 and has a shape of `[batch_size, sequence_length]` + old_version_labels (Tensor, optional): Unnecessary parameter for compatibility with older versions. Defaults to ``None``. + + Returns: + Tensor: The crf loss. Its dtype is float32 and has a shape of `[batch_size]`. + """ + # Note: When closing to convergence, the loss could be a small negative number. This may caused by underflow when calculating exp in logsumexp. + # We add relu here to avoid negative loss. In theory, the crf loss must be greater than or equal to 0, relu will not impact on it. + if old_version_labels is not None: + # TODO(qiujinxuan): rm compatibility support after lic. 
+ labels = old_version_labels + if not getattr(self, "has_warn", False): + logger.warning( + "Compatibility Warning: The params of LinearChainCrfLoss.forward has been modified. The third param is `labels`, and the fourth is not necessary. Please update the usage." + ) + self.has_warn = True + loss = nn.functional.relu(self.crf.forward(inputs, lengths) - self.crf.gold_score(inputs, labels, lengths)) + return loss + + +class ViterbiDecoder(nn.Layer): + """ + ViterbiDecoder can decode the highest scoring sequence of tags, it should only be used at test time. + + Args: + transitions (Tensor): + The transition matrix. Its dtype is float32 and has a shape of `[num_tags, num_tags]`. + with_start_stop_tag (bool, optional): + If set to True, the last row and the last column of transitions will be considered as start tag, + the penultimate row and the penultimate column of transitions will be considered as stop tag. + Else, all the rows and columns will be considered as the real tag. Defaults to ``None``. + """ + + def __init__(self, transitions, with_start_stop_tag=True): + super(ViterbiDecoder, self).__init__() + self.transitions = transitions + self.with_start_stop_tag = with_start_stop_tag + # If consider start and stop, -1 should be START and -2 should be STOP. + if with_start_stop_tag: + self.start_idx = -1 + self.stop_idx = -2 + self.num_tags = transitions.shape[0] + + self._initial_alpha = None + self._index = None + self._batch_index = None + self._batch_seq_index = None + + def _initialize_alpha(self, batch_size): + # alpha accumulate the path value to get the different next tag + if self._initial_alpha is None or batch_size > self._initial_alpha.shape[0]: + # Initialized by a small value. + initial_alpha = paddle.full([batch_size, self.num_tags - 1], dtype="float32", fill_value=-10000.0) + # alpha_start fill_value = 0. > -10000., means the first one step START gets the most score. + alpha_start = paddle.full([batch_size, 1], dtype="float32", fill_value=0.0) + self._initial_alpha = paddle.concat([initial_alpha, alpha_start], axis=1) + return paddle.slice(self._initial_alpha, axes=[0], starts=[0], ends=[batch_size]) + + def forward(self, inputs, lengths): + """ + Decode the highest scoring sequence of tags. + + Args: + inputs (Tensor): + The unary emission tensor. Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`. + length (Tensor): + The input length tensor storing real length of each sequence for correctness. Its dtype is int64 and has a shape of `[batch_size]`. + + Returns: + tuple: Returns tuple (scores, paths). The `scores` tensor containing the score for the Viterbi sequence. + Its dtype is float32 and has a shape of `[batch_size]`. + The `paths` tensor containing the highest scoring tag indices. + Its dtype is int64 and has a shape of `[batch_size, sequence_length]`. 
+ """ + input_shape = inputs.shape + batch_size = input_shape[0] + n_label = input_shape[2] + + inputs_t = inputs.transpose([1, 0, 2]) + trans_exp = self.transitions.unsqueeze(0).expand([batch_size, n_label, n_label]) + + historys = [] + left_length = lengths.clone() + max_seq_len = left_length.max() + # no need to expand the 'mask' in the following iteration + left_length = left_length.unsqueeze(-1).expand([batch_size, n_label]) + + if self.with_start_stop_tag: + alpha = self._initialize_alpha(batch_size) + else: + alpha = paddle.zeros((batch_size, self.num_tags), dtype="float32") + for i, logit in enumerate(inputs_t[:max_seq_len]): + # if not with_start_stop_tag, the first label has not antecedent tag. + if i == 0 and not self.with_start_stop_tag: + alpha = logit + left_length = left_length - 1 + continue + alpha_exp = alpha.unsqueeze(2) + # alpha_trn_sum: batch_size, n_labels, n_labels + alpha_trn_sum = alpha_exp + trans_exp + + # alpha_max: batch_size, n_labels + # We don't include the emission scores here because the max does not depend on them (we add them in below) + alpha_max = alpha_trn_sum.max(1) + # If with_start_stop_tag, the first antecedent tag must be START, else the first label has not antecedent tag. + # So we can record the path from i=1. + if i >= 1: + alpha_argmax = alpha_trn_sum.argmax(1) + historys.append(alpha_argmax) + # Now add the emission scores + alpha_nxt = alpha_max + logit + + mask = paddle.cast((left_length > 0), dtype="float32") + alpha = mask * alpha_nxt + (1 - mask) * alpha + + if self.with_start_stop_tag: + mask = paddle.cast((left_length == 1), dtype="float32") + alpha += mask * trans_exp[:, self.stop_idx] + + left_length = left_length - 1 + + # last_ids: batch_size + scores, last_ids = alpha.max(1), alpha.argmax(1) + if max_seq_len == 1: + return scores, last_ids.unsqueeze(1) + # Trace back the best path + # historys: seq_len, batch_size, n_labels + historys = paddle.stack(historys) + left_length = left_length[:, 0] + tag_mask = paddle.cast((left_length >= 0), "int64") + last_ids_update = last_ids * tag_mask + + batch_path = [last_ids_update] + batch_offset = self._get_batch_index(batch_size) * n_label + historys = paddle.reverse(historys, [0]) + for hist in historys: + # hist: batch_size, n_labels + left_length = left_length + 1 + gather_idx = batch_offset + last_ids + tag_mask = paddle.cast((left_length > 0), "int64") + last_ids_update = paddle.gather(hist.flatten(), gather_idx) * tag_mask + zero_len_mask = paddle.cast((left_length == 0), "int64") + last_ids_update = last_ids_update * (1 - zero_len_mask) + last_ids * zero_len_mask + batch_path.append(last_ids_update) + tag_mask = paddle.cast((left_length >= 0), "int64") + last_ids = last_ids_update + last_ids * (1 - tag_mask) + batch_path = paddle.reverse(paddle.stack(batch_path, 1), [1]) + return scores, batch_path + + def _get_batch_index(self, batch_size): + if self._batch_index is None or batch_size != self._batch_index.shape[0]: + self._batch_index = paddle.arange(end=batch_size, dtype="int64") + return self._batch_index diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/globalpointer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/globalpointer.py new file mode 100644 index 000000000..a76c60609 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/globalpointer.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+
+class RotaryPositionEmbedding(nn.Layer):
+ def __init__(self, dim, max_seq_len=512):
+ super().__init__()
+ inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2, dtype="float32") / dim))
+ t = paddle.arange(max_seq_len, dtype=inv_freq.dtype)
+ freqs = paddle.matmul(t.unsqueeze(1), inv_freq.unsqueeze(0))
+ self.register_buffer("sin", freqs.sin(), persistable=False)
+ self.register_buffer("cos", freqs.cos(), persistable=False)
+
+ def forward(self, x, offset=0):
+ seqlen = x.shape[-2]
+ sin, cos = (
+ self.sin[offset : offset + seqlen, :],
+ self.cos[offset : offset + seqlen, :],
+ )
+ x1, x2 = x[..., 0::2], x[..., 1::2]
+ # interleave the even and odd dimensions
+ return paddle.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1).flatten(-2, -1)
+
+
+class GlobalPointer(nn.Layer):
+ def __init__(self, hidden_size, heads, head_size=64, RoPE=True, tril_mask=True, max_length=512):
+ super().__init__()
+ self.heads = heads
+ self.head_size = head_size
+ self.RoPE = RoPE
+ self.tril_mask = tril_mask
+ self.dense1 = nn.Linear(hidden_size, head_size * 2)
+ self.dense2 = nn.Linear(head_size * 2, heads * 2)
+ if RoPE:
+ self.rotary = RotaryPositionEmbedding(head_size, max_length)
+
+ def forward(self, inputs, attention_mask=None):
+ inputs = self.dense1(inputs)
+ qw, kw = inputs[..., ::2], inputs[..., 1::2]
+ # apply RoPE position encoding
+ if self.RoPE:
+ qw, kw = self.rotary(qw), self.rotary(kw)
+
+ # compute pairwise inner products
+ logits = paddle.einsum("bmd,bnd->bmn", qw, kw) / self.head_size**0.5
+ bias = paddle.transpose(self.dense2(inputs), [0, 2, 1]) / 2
+ logits = logits[:, None] + bias[:, ::2, None] + bias[:, 1::2, :, None]
+
+ # mask out padding positions
+ attn_mask = 1 - attention_mask[:, None, None, :] * attention_mask[:, None, :, None]
+ logits = logits - attn_mask * 1e12
+
+ # mask out the lower triangle
+ if self.tril_mask:
+ mask = paddle.tril(paddle.ones_like(logits), diagonal=-1)
+
+ logits = logits - mask * 1e12
+
+ return logits
+
+
+class GlobalPointerForEntityExtraction(nn.Layer):
+ def __init__(self, encoder, label_maps, head_size=64):
+ super().__init__()
+ self.encoder = encoder
+ hidden_size = encoder.config["hidden_size"]
+ gpcls = GlobalPointer
+ self.entity_output = gpcls(hidden_size, len(label_maps["entity2id"]), head_size=head_size)
+
+ def forward(self, input_ids, attention_mask):
+ # input_ids, attention_mask, token_type_ids: (batch_size, seq_len)
+ context_outputs = self.encoder(input_ids, attention_mask=attention_mask)
+ # last_hidden_state: (batch_size, seq_len, hidden_size)
+ last_hidden_state = context_outputs[0]
+
+ entity_output = self.entity_output(last_hidden_state, attention_mask)
+ return [entity_output]
+
+
+class GPLinkerForRelationExtraction(nn.Layer):
+ def __init__(self, encoder, label_maps, head_size=64):
+ super().__init__()
+ self.encoder = encoder
+ hidden_size = encoder.config["hidden_size"]
+ num_ents = len(label_maps["entity2id"])
+ if "relation2id" in label_maps.keys():
+ num_rels = len(label_maps["relation2id"])
+ else:
+ num_rels =
len(label_maps["sentiment2id"]) + gpcls = GlobalPointer + + self.entity_output = gpcls(hidden_size, num_ents, head_size=head_size) + self.head_output = gpcls(hidden_size, num_rels, head_size=head_size, RoPE=False, tril_mask=False) + self.tail_output = gpcls(hidden_size, num_rels, head_size=head_size, RoPE=False, tril_mask=False) + + def forward(self, input_ids, attention_mask): + # input_ids, attention_mask, token_type_ids: (batch_size, seq_len) + context_outputs = self.encoder(input_ids, attention_mask=attention_mask) + # last_hidden_state: (batch_size, seq_len, hidden_size) + last_hidden_state = context_outputs[0] + + entity_output = self.entity_output(last_hidden_state, attention_mask) + head_output = self.head_output(last_hidden_state, attention_mask) + tail_output = self.tail_output(last_hidden_state, attention_mask) + spo_output = [entity_output, head_output, tail_output] + return spo_output + + +class GPLinkerForEventExtraction(nn.Layer): + def __init__(self, encoder, label_maps, head_size=64): + super().__init__() + self.encoder = encoder + hidden_size = encoder.config["hidden_size"] + num_labels = len(label_maps["label2id"]) + gpcls = GlobalPointer + + self.argu_output = gpcls(hidden_size, num_labels, head_size=head_size) + self.head_output = gpcls(hidden_size, 1, head_size=head_size, RoPE=False) + self.tail_output = gpcls(hidden_size, 1, head_size=head_size, RoPE=False) + + def forward(self, input_ids, attention_mask): + # input_ids, attention_mask, token_type_ids: (batch_size, seq_len) + context_outputs = self.encoder(input_ids, attention_mask=attention_mask) + # last_hidden_state: (batch_size, seq_len, hidden_size) + last_hidden_state = context_outputs[0] + + argu_output = self.argu_output(last_hidden_state, attention_mask) + head_output = self.head_output(last_hidden_state, attention_mask) + tail_output = self.tail_output(last_hidden_state, attention_mask) + aht_output = (argu_output, head_output, tail_output) + return aht_output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/linear.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/linear.py new file mode 100644 index 000000000..cf18b97e7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/linear.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
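# NOTE (editorial annotation, not part of the original patch): the Linear layer defined in the
# file below stores its weight in the [out_features, in_features] layout (the torch convention)
# rather than paddle.nn.Linear's [in_features, out_features]; its forward() therefore passes
# self.weight.T to F.linear, which still expects an [in_features, out_features] weight,
# presumably so checkpoints saved in the transposed layout can be loaded without an extra transpose.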
+ +from paddle import nn +from paddle.nn import functional as F + + +class Linear(nn.Layer): + """ + Same as paddle.layer.Linear, except weight matrix is stored as [out_features, in_features] (same as torch), + instead of [in_features, out_features] + """ + + def __init__( + self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None, + ): + super(Linear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter( + shape=[out_features, in_features], # regular linear has shape [in_features, out_features] + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + self.bias = self.create_parameter( + shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True, + ) + self.name = name + + def forward(self, input): + out = F.linear(x=input, weight=self.weight.T, bias=self.bias, name=self.name) + return out + + def extra_repr(self): + name_str = ", name={}".format(self.name) if self.name else "" + return "in_features={}, out_features={}, dtype={}{}".format( + self.weight.shape[1], self.weight.shape[0], self._dtype, name_str + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/sequence.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/sequence.py new file mode 100644 index 000000000..3ae485dc7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/sequence.py @@ -0,0 +1,33 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def sequence_mask(seq_ids, valid_lengths): + """ + To boost the performance, this sequence_mask is different with paddle.nn.functional.sequence_mask + + Args: + seq_ids (Tensor): + The whole sequence index, a tensor with a shape of [batch_size, sequence_length]. + valid_lengths (Tensor): + The valid length of every sequence, a tensor with a shape of [batch_size]. + + Returns: + Tensor: Returns the output sequence mask `mask`. + Its dtype is `bool` and has a shape of [batch_size, sequence_length]. + """ + lengths_exp = valid_lengths.unsqueeze(1) + mask = seq_ids < lengths_exp + + return mask diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/tcn.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/tcn.py new file mode 100644 index 000000000..0c7367965 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/layers/tcn.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +import paddle.nn as nn +from paddle.nn.utils import weight_norm + +__all__ = ["TemporalBlock", "TCN"] + + +class Chomp1d(nn.Layer): + """ + Remove the elements on the right. + + Args: + chomp_size (int): + The number of elements removed. + """ + + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + return x[:, :, : -self.chomp_size] + + +class TemporalBlock(nn.Layer): + """ + The TCN block, consists of dilated causal conv, relu and residual block. + See the Figure 1(b) in https://arxiv.org/pdf/1803.01271.pdf for more details. + + Args: + n_inputs (int): + The number of channels in the input tensor. + n_outputs (int): + The number of filters. + kernel_size (int): + The filter size. + stride (int): + The stride size. + dilation (int): + The dilation size. + padding (int): + The size of zeros to be padded. + dropout (float, optional): + Probability of dropout the units. Defaults to 0.2. + """ + + def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2): + + super(TemporalBlock, self).__init__() + self.conv1 = weight_norm( + nn.Conv1D(n_inputs, n_outputs, kernel_size, stride=stride, padding=padding, dilation=dilation) + ) + # Chomp1d is used to make sure the network is causal. + # We pad by (k-1)*d on the two sides of the input for convolution, + # and then use Chomp1d to remove the (k-1)*d output elements on the right. + self.chomp1 = Chomp1d(padding) + self.relu1 = nn.ReLU() + self.dropout1 = nn.Dropout(dropout) + + self.conv2 = weight_norm( + nn.Conv1D(n_outputs, n_outputs, kernel_size, stride=stride, padding=padding, dilation=dilation) + ) + self.chomp2 = Chomp1d(padding) + self.relu2 = nn.ReLU() + self.dropout2 = nn.Dropout(dropout) + + self.net = nn.Sequential( + self.conv1, self.chomp1, self.relu1, self.dropout1, self.conv2, self.chomp2, self.relu2, self.dropout2 + ) + self.downsample = nn.Conv1D(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None + self.relu = nn.ReLU() + self.init_weights() + + def init_weights(self): + self.conv1.weight.set_value(paddle.tensor.normal(0.0, 0.01, self.conv1.weight.shape)) + self.conv2.weight.set_value(paddle.tensor.normal(0.0, 0.01, self.conv2.weight.shape)) + if self.downsample is not None: + self.downsample.weight.set_value(paddle.tensor.normal(0.0, 0.01, self.downsample.weight.shape)) + + def forward(self, x): + """ + Args: + x (Tensor): + The input tensor with a shape of [batch_size, input_channel, sequence_length]. + + """ + out = self.net(x) + res = x if self.downsample is None else self.downsample(x) + return self.relu(out + res) + + +class TCN(nn.Layer): + def __init__(self, input_channel, num_channels, kernel_size=2, dropout=0.2): + """ + Temporal Convolutional Networks is a simple convolutional architecture. It outperforms canonical recurrent networks + such as LSTMs in many tasks. See https://arxiv.org/pdf/1803.01271.pdf for more details. + + Args: + input_channel (int): + The number of channels in the input tensor. + num_channels (list | tuple): + The number of channels in different layer. 
+ kernel_size (int, optional): + The filter size.. Defaults to 2. + dropout (float, optional): + Probability of dropout the units.. Defaults to 0.2. + """ + super(TCN, self).__init__() + layers = nn.LayerList() + num_levels = len(num_channels) + for i in range(num_levels): + dilation_size = 2**i + in_channels = input_channel if i == 0 else num_channels[i - 1] + out_channels = num_channels[i] + layers.append( + TemporalBlock( + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=dilation_size, + padding=(kernel_size - 1) * dilation_size, + dropout=dropout, + ) + ) + + self.network = nn.Sequential(*layers) + + def forward(self, x): + """ + Apply temporal convolutional networks to the input tensor. + + Args: + x (Tensor): + The input tensor with a shape of [batch_size, input_channel, sequence_length]. + + Returns: + Tensor: The `output` tensor with a shape of [batch_size, num_channels[-1], sequence_length]. + """ + output = self.network(x) + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/__init__.py new file mode 100644 index 000000000..e65d62cbe --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .rdrop import RDropLoss diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/rdrop.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/rdrop.py new file mode 100644 index 000000000..a5f0a27a5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/losses/rdrop.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = ["RDropLoss"] + + +class RDropLoss(nn.Layer): + """ + R-Drop Loss implementation + For more information about R-drop please refer to this paper: https://arxiv.org/abs/2106.14448 + Original implementation please refer to this code: https://github.com/dropreg/R-Drop + + Args: + reduction(str, optional): + Indicate how to average the loss, the candicates are ``'none'``,``'batchmean'``,``'mean'``,``'sum'``. 
+ If `reduction` is ``'mean'``, the reduced mean loss is returned;
+ If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
+ If `reduction` is ``'sum'``, the reduced sum loss is returned;
+ If `reduction` is ``'none'``, no reduction will be applied.
+ Defaults to ``'none'``.
+ """
+
+ def __init__(self, reduction="none"):
+ super(RDropLoss, self).__init__()
+ if reduction not in ["sum", "mean", "none", "batchmean"]:
+ raise ValueError(
+ "'reduction' in 'RDropLoss' should be 'sum', 'mean', 'batchmean', or 'none', "
+ "but received {}.".format(reduction)
+ )
+ self.reduction = reduction
+
+ def forward(self, p, q, pad_mask=None):
+ """
+ Args:
+ p(Tensor): the first forward logits of training examples.
+ q(Tensor): the second forward logits of training examples.
+ pad_mask(Tensor, optional): the Tensor containing the binary mask to index with; its data type is bool.
+
+ Returns:
+ Tensor: Returns tensor `loss`, the rdrop loss of p and q.
+ """
+ p_loss = F.kl_div(F.log_softmax(p, axis=-1), F.softmax(q, axis=-1), reduction=self.reduction)
+ q_loss = F.kl_div(F.log_softmax(q, axis=-1), F.softmax(p, axis=-1), reduction=self.reduction)
+
+ # pad_mask is for seq-level tasks
+ if pad_mask is not None:
+ p_loss = paddle.masked_select(p_loss, pad_mask)
+ q_loss = paddle.masked_select(q_loss, pad_mask)
+
+ # Choose between the "sum" and "mean" reductions here, depending on your task
+ p_loss = p_loss.sum()
+ q_loss = q_loss.sum()
+ loss = (p_loss + q_loss) / 2
+ return loss
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/README.md b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/README.md
new file mode 100644
index 000000000..eac5ba169
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/README.md
@@ -0,0 +1,14 @@
+# paddlenlp.metrics
+
+PaddleNLP currently provides the following evaluation metrics:
+
+| Metric | Description | API |
+| -------------------------------------------------------- | :----------------------------------------------------------- | ------------------------------------------------------------ |
+| Perplexity | Perplexity, commonly used to evaluate language models; also applicable to machine translation, text generation, and similar tasks. | `paddlenlp.metrics.Perplexity` |
+| BLEU(bilingual evaluation understudy) | A standard evaluation metric for machine translation. | `paddlenlp.metrics.BLEU` |
+| Rouge(Recall-Oriented Understudy for Gisting Evaluation) | A metric for evaluating automatic summarization and machine translation. | `paddlenlp.metrics.RougeL`, `paddlenlp.metrics.RougeN` |
+| AccuracyAndF1 | Accuracy and F1-score, applicable to the MRPC and QQP tasks in GLUE. | `paddlenlp.metrics.AccuracyAndF1` |
+| PearsonAndSpearman | Pearson correlation coefficient and Spearman correlation coefficient, applicable to the STS-B task in GLUE. | `paddlenlp.metrics.PearsonAndSpearman` |
+| Mcc(Matthews correlation coefficient) | Matthews correlation coefficient, a measure of binary classification performance, applicable to the CoLA task in GLUE. | `paddlenlp.metrics.Mcc` |
+| ChunkEvaluator | Computes precision, recall, and F1-score for chunk detection; commonly used for sequence labeling tasks such as named entity recognition (NER). | `paddlenlp.metrics.ChunkEvaluator` |
+| Squad | Evaluation metrics for SQuAD and DuReader-robust. | `paddlenlp.metrics.compute_predictions`, `paddlenlp.metrics.squad_evaluate` |
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/__init__.py
new file mode 100644
index 000000000..fe8c83222
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .bleu import BLEU, BLEUForDuReader +from .chunk import ChunkEvaluator +from .distinct import Distinct +from .glue import AccuracyAndF1, Mcc, MultiLabelsMetric, PearsonAndSpearman +from .mrr import MRR +from .perplexity import Perplexity +from .rouge import Rouge1, Rouge2, RougeL, RougeLForDuReader, RougeN +from .sighan import CorrectionF1, DetectionF1 +from .span import SpanEvaluator diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/bleu.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/bleu.py new file mode 100644 index 000000000..6bb893393 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/bleu.py @@ -0,0 +1,276 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import sys +from collections import defaultdict + +import paddle + +from .utils import default_trans_func + +__all__ = ["BLEU", "BLEUForDuReader"] + + +def get_match_size(cand_ngram, refs_ngram): + ref_set = defaultdict(int) + for ref_ngram in refs_ngram: + tmp_ref_set = defaultdict(int) + for ngram in ref_ngram: + tmp_ref_set[tuple(ngram)] += 1 + for ngram, count in tmp_ref_set.items(): + ref_set[tuple(ngram)] = max(ref_set[tuple(ngram)], count) + cand_set = defaultdict(int) + for ngram in cand_ngram: + cand_set[tuple(ngram)] += 1 + match_size = 0 + for ngram, count in cand_set.items(): + match_size += min(count, ref_set.get(tuple(ngram), 0)) + cand_size = len(cand_ngram) + return match_size, cand_size + + +def get_ngram(sent, n_size, label=None): + def _ngram(sent, n_size): + ngram_list = [] + for left in range(len(sent) - n_size): + ngram_list.append(sent[left : left + n_size + 1]) + return ngram_list + + ngram_list = _ngram(sent, n_size) + if label is not None: + ngram_list = [ngram + "_" + label for ngram in ngram_list] + return ngram_list + + +class BLEU(paddle.metric.Metric): + r""" + BLEU (bilingual evaluation understudy) is an algorithm for evaluating the + quality of text which has been machine-translated from one natural language + to another. This metric uses a modified form of precision to compare a + candidate translation against multiple reference translations. + + BLEU could be used as `paddle.metric.Metric` class, or an ordinary + class. When BLEU is used as `paddle.metric.Metric` class. A function is + needed that transforms the network output to reference string list, and + transforms the label to candidate string. 
By default, a default function + `default_trans_func` is provided, which gets target sequence id by + calculating the maximum probability of each step. In this case, user must + provide `vocab`. It should be noted that the BLEU here is different from + the BLEU calculated in prediction, and it is only for observation during + training and evaluation. + + .. math:: + + BP & = + \begin{cases} + 1, & \text{if }c>r \\ + e_{1-r/c}, & \text{if }c\leq r + \end{cases} + + BLEU & = BP\exp(\sum_{n=1}^N w_{n} \log{p_{n}}) + + where `c` is the length of candidate sentence, and `r` is the length of reference sentence. + + Args: + trans_func (callable, optional): `trans_func` transforms the network + output to string to calculate. + vocab (dict|paddlenlp.data.vocab, optional): Vocab for target language. + If `trans_func` is None and BLEU is used as `paddle.metric.Metric` + instance, `default_trans_func` will be performed and `vocab` must + be provided. + n_size (int, optional): Number of gram for BLEU metric. Defaults to 4. + weights (list, optional): The weights of precision of each gram. + Defaults to None. + name (str, optional): Name of `paddle.metric.Metric` instance. + Defaults to "bleu". + + Examples: + 1. Using as a general evaluation object. + + .. code-block:: python + + from paddlenlp.metrics import BLEU + bleu = BLEU() + cand = ["The","cat","The","cat","on","the","mat"] + ref_list = [["The","cat","is","on","the","mat"], ["There","is","a","cat","on","the","mat"]] + bleu.add_inst(cand, ref_list) + print(bleu.score()) # 0.4671379777282001 + + 2. Using as an instance of `paddle.metric.Metric`. + + .. code-block:: python + + # You could add the code below to Seq2Seq example in this repo to + # use BLEU as `paddlenlp.metric.Metric' class. If you run the + # following code alone, you may get an error. + # log example: + # Epoch 1/12 + # step 100/507 - loss: 308.7948 - Perplexity: 541.5600 - bleu: 2.2089e-79 - 923ms/step + # step 200/507 - loss: 264.2914 - Perplexity: 334.5099 - bleu: 0.0093 - 865ms/step + # step 300/507 - loss: 236.3913 - Perplexity: 213.2553 - bleu: 0.0244 - 849ms/step + + from paddlenlp.data import Vocab + from paddlenlp.metrics import BLEU + + bleu_metric = BLEU(vocab=src_vocab.idx_to_token) + model.prepare(optimizer, CrossEntropyCriterion(), [ppl_metric, bleu_metric]) + + """ + + def __init__(self, trans_func=None, vocab=None, n_size=4, weights=None, name="bleu"): + super(BLEU, self).__init__() + if not weights: + weights = [1 / n_size for _ in range(n_size)] + assert ( + len(weights) == n_size + ), "Number of weights and n-gram should be the same, got Number of weights: '%d' and n-gram: '%d'" % ( + len(weights), + n_size, + ) + self._name = name + self.match_ngram = {} + self.candi_ngram = {} + self.weights = weights + self.bp_r = 0 + self.bp_c = 0 + self.n_size = n_size + self.vocab = vocab + self.trans_func = trans_func + + def update(self, output, label, seq_mask=None): + if self.trans_func is None: + if self.vocab is None: + raise AttributeError( + "The `update` method requires users to provide `trans_func` or `vocab` when initializing BLEU." + ) + cand_list, ref_list = default_trans_func(output, label, seq_mask=seq_mask, vocab=self.vocab) + else: + cand_list, ref_list = self.trans_func(output, label, seq_mask) + if len(cand_list) != len(ref_list): + raise ValueError("Length error! 
Please check the output of network.") + for i in range(len(cand_list)): + self.add_inst(cand_list[i], ref_list[i]) + + def add_inst(self, cand, ref_list): + """ + Update the states based on a pair of candidate and references. + + Args: + cand (list): Tokenized candidate sentence. + ref_list (list of list): List of tokenized ground truth sentences. + """ + for n_size in range(self.n_size): + self.count_ngram(cand, ref_list, n_size) + self.count_bp(cand, ref_list) + + def count_ngram(self, cand, ref_list, n_size): + cand_ngram = get_ngram(cand, n_size) + refs_ngram = [] + for ref in ref_list: + refs_ngram.append(get_ngram(ref, n_size)) + if n_size not in self.match_ngram: + self.match_ngram[n_size] = 0 + self.candi_ngram[n_size] = 0 + match_size, cand_size = get_match_size(cand_ngram, refs_ngram) + + self.match_ngram[n_size] += match_size + self.candi_ngram[n_size] += cand_size + + def count_bp(self, cand, ref_list): + self.bp_c += len(cand) + self.bp_r += min([(abs(len(cand) - len(ref)), len(ref)) for ref in ref_list])[1] + + def reset(self): + self.match_ngram = {} + self.candi_ngram = {} + self.bp_r = 0 + self.bp_c = 0 + + def accumulate(self): + """ + Calculates and returns the final bleu metric. + + Returns: + Tensor: Returns the accumulated metric `bleu` and its data type is float64. + """ + prob_list = [] + for n_size in range(self.n_size): + try: + if self.candi_ngram[n_size] == 0: + _score = 0.0 + else: + _score = self.match_ngram[n_size] / float(self.candi_ngram[n_size]) + except Exception: + _score = 0 + if _score == 0: + _score = sys.float_info.min + prob_list.append(_score) + + logs = math.fsum(w_i * math.log(p_i) for w_i, p_i in zip(self.weights, prob_list)) + bp = math.exp(min(1 - self.bp_r / float(self.bp_c), 0)) + bleu = bp * math.exp(logs) + return bleu + + def score(self): + return self.accumulate() + + def name(self): + return self._name + + +class BLEUForDuReader(BLEU): + """ + BLEU metric with bonus for DuReader contest. + + Please refer to `DuReader Homepage`_ for more details. + + Args: + n_size (int, optional): Number of gram for BLEU metric. Defaults to 4. + alpha (float, optional): Weight of YesNo dataset when adding bonus for DuReader contest. Defaults to 1.0. + beta (float, optional): Weight of Entity dataset when adding bonus for DuReader contest. Defaults to 1.0. 
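+
+    Example (illustrative usage sketch, not from the upstream docs; the tokens
+    below are made up and the DuReader bonus arguments are omitted, so the
+    score falls back to the plain BLEU computation):
+
+        .. code-block:: python
+
+            from paddlenlp.metrics import BLEUForDuReader
+
+            bleu = BLEUForDuReader()
+            cand = ["The", "cat", "sat", "on", "the", "mat"]
+            ref_list = [["The", "cat", "is", "on", "the", "mat"]]
+            # no yn_label/yn_ref/entity_ref given, so no bonus terms are added
+            bleu.add_inst(cand, ref_list)
+            print(bleu.score())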
+ + """ + + def __init__(self, n_size=4, alpha=1.0, beta=1.0): + super(BLEUForDuReader, self).__init__(n_size) + self.alpha = alpha + self.beta = beta + + def add_inst(self, cand, ref_list, yn_label=None, yn_ref=None, entity_ref=None): + BLEU.add_inst(self, cand, ref_list) + if yn_label is not None and yn_ref is not None: + self.add_yn_bonus(cand, ref_list, yn_label, yn_ref) + elif entity_ref is not None: + self.add_entity_bonus(cand, entity_ref) + + def add_yn_bonus(self, cand, ref_list, yn_label, yn_ref): + for n_size in range(self.n_size): + cand_ngram = get_ngram(cand, n_size, label=yn_label) + ref_ngram = [] + for ref_id, r in enumerate(yn_ref): + ref_ngram.append(get_ngram(ref_list[ref_id], n_size, label=r)) + match_size, cand_size = get_match_size(cand_ngram, ref_ngram) + self.match_ngram[n_size] += self.alpha * match_size + self.candi_ngram[n_size] += self.alpha * match_size + + def add_entity_bonus(self, cand, entity_ref): + for n_size in range(self.n_size): + cand_ngram = get_ngram(cand, n_size, label="ENTITY") + ref_ngram = [] + for reff_id, r in enumerate(entity_ref): + ref_ngram.append(get_ngram(r, n_size, label="ENTITY")) + match_size, cand_size = get_match_size(cand_ngram, ref_ngram) + self.match_ngram[n_size] += self.beta * match_size + self.candi_ngram[n_size] += self.beta * match_size diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/chunk.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/chunk.py new file mode 100644 index 000000000..ef24919db --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/chunk.py @@ -0,0 +1,195 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict + +import numpy as np +import paddle +from paddlenlp.utils.log import logger +from seqeval.metrics.sequence_labeling import get_entities + + +def extract_tp_actual_correct(y_true, y_pred, suffix, *args): + entities_true = defaultdict(set) + entities_pred = defaultdict(set) + for type_name, start, end in get_entities(y_true, suffix): + entities_true[type_name].add((start, end)) + for type_name, start, end in get_entities(y_pred, suffix): + entities_pred[type_name].add((start, end)) + + target_names = sorted(set(entities_true.keys()) | set(entities_pred.keys())) + + tp_sum = np.array([], dtype=np.int32) + pred_sum = np.array([], dtype=np.int32) + true_sum = np.array([], dtype=np.int32) + for type_name in target_names: + entities_true_type = entities_true.get(type_name, set()) + entities_pred_type = entities_pred.get(type_name, set()) + tp_sum = np.append(tp_sum, len(entities_true_type & entities_pred_type)) + pred_sum = np.append(pred_sum, len(entities_pred_type)) + true_sum = np.append(true_sum, len(entities_true_type)) + + return pred_sum, tp_sum, true_sum + + +class ChunkEvaluator(paddle.metric.Metric): + """ + ChunkEvaluator computes the precision, recall and F1-score for chunk detection. 
+ It is often used in sequence tagging tasks, such as Named Entity Recognition(NER). + + Args: + label_list (list): + The label list. + suffix (bool): + If set True, the label ends with '-B', '-I', '-E' or '-S', else the label starts with them. + Defaults to `False`. + + Example: + .. code-block:: + + from paddlenlp.metrics import ChunkEvaluator + + num_infer_chunks = 10 + num_label_chunks = 9 + num_correct_chunks = 8 + + label_list = [1,1,0,0,1,0,1] + evaluator = ChunkEvaluator(label_list) + evaluator.update(num_infer_chunks, num_label_chunks, num_correct_chunks) + precision, recall, f1 = evaluator.accumulate() + print(precision, recall, f1) + # 0.8 0.8888888888888888 0.8421052631578948 + + """ + + def __init__(self, label_list, suffix=False): + super(ChunkEvaluator, self).__init__() + self.id2label_dict = dict(enumerate(label_list)) + self.suffix = suffix + self.num_infer_chunks = 0 + self.num_label_chunks = 0 + self.num_correct_chunks = 0 + + def compute(self, lengths, predictions, labels, dummy=None): + """ + Computes the precision, recall and F1-score for chunk detection. + + Args: + lengths (Tensor): The valid length of every sequence, a tensor with shape `[batch_size]` + predictions (Tensor): The predictions index, a tensor with shape `[batch_size, sequence_length]`. + labels (Tensor): The labels index, a tensor with shape `[batch_size, sequence_length]`. + dummy (Tensor, optional): Unnecessary parameter for compatibility with older versions with parameters list `inputs`, `lengths`, `predictions`, `labels`. Defaults to None. + + Returns: + tuple: Returns tuple (`num_infer_chunks, num_label_chunks, num_correct_chunks`). + + With the fields: + + - `num_infer_chunks` (Tensor): + The number of the inference chunks. + + - `num_label_chunks` (Tensor): + The number of the label chunks. + + - `num_correct_chunks` (Tensor): + The number of the correct chunks. + """ + if dummy is not None: + # TODO(qiujinxuan): rm compatibility support after lic. + dummy, lengths, predictions, labels = lengths, predictions, labels, dummy + if not getattr(self, "has_warn", False): + logger.warning( + "Compatibility Warning: The params of ChunkEvaluator.compute has been modified. The old version is `inputs`, `lengths`, `predictions`, `labels` while the current version is `lengths`, `predictions`, `labels`. Please update the usage." 
+ ) + self.has_warn = True + labels = labels.numpy() + predictions = predictions.numpy() + unpad_labels = [ + [self.id2label_dict[index] for index in labels[sent_index][: lengths[sent_index]]] + for sent_index in range(len(lengths)) + ] + unpad_predictions = [ + [self.id2label_dict.get(index, "O") for index in predictions[sent_index][: lengths[sent_index]]] + for sent_index in range(len(lengths)) + ] + + pred_sum, tp_sum, true_sum = extract_tp_actual_correct(unpad_labels, unpad_predictions, self.suffix) + num_correct_chunks = paddle.to_tensor([tp_sum.sum()]) + num_infer_chunks = paddle.to_tensor([pred_sum.sum()]) + num_label_chunks = paddle.to_tensor([true_sum.sum()]) + + return num_infer_chunks, num_label_chunks, num_correct_chunks + + def _is_number_or_matrix(self, var): + def _is_number_(var): + return ( + isinstance(var, int) + or isinstance(var, np.int64) + or isinstance(var, float) + or (isinstance(var, np.ndarray) and var.shape == (1,)) + ) + + return _is_number_(var) or isinstance(var, np.ndarray) + + def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks): + """ + This function takes (num_infer_chunks, num_label_chunks, num_correct_chunks) as input, + to accumulate and update the corresponding status of the ChunkEvaluator object. The update method is as follows: + + .. math:: + \\\\ \\begin{array}{l}{\\text { self. num_infer_chunks }+=\\text { num_infer_chunks }} \\\\ {\\text { self. num_Label_chunks }+=\\text { num_label_chunks }} \\\\ {\\text { self. num_correct_chunks }+=\\text { num_correct_chunks }}\\end{array} \\\\ + + Args: + num_infer_chunks(int|numpy.array): + The number of chunks in Inference on the given minibatch. + num_label_chunks(int|numpy.array): + The number of chunks in Label on the given mini-batch. + num_correct_chunks(int|float|numpy.array): + The number of chunks both in Inference and Label on the given mini-batch. + """ + if not self._is_number_or_matrix(num_infer_chunks): + raise ValueError("The 'num_infer_chunks' must be a number(int) or a numpy ndarray.") + if not self._is_number_or_matrix(num_label_chunks): + raise ValueError("The 'num_label_chunks' must be a number(int, float) or a numpy ndarray.") + if not self._is_number_or_matrix(num_correct_chunks): + raise ValueError("The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray.") + self.num_infer_chunks += num_infer_chunks + self.num_label_chunks += num_label_chunks + self.num_correct_chunks += num_correct_chunks + + def accumulate(self): + """ + This function returns the mean precision, recall and f1 score for all accumulated minibatches. + + Returns: + tuple: Returns tuple (`precision, recall, f1 score`). + """ + precision = float(self.num_correct_chunks / self.num_infer_chunks) if self.num_infer_chunks else 0.0 + recall = float(self.num_correct_chunks / self.num_label_chunks) if self.num_label_chunks else 0.0 + f1_score = float(2 * precision * recall / (precision + recall)) if self.num_correct_chunks else 0.0 + return precision, recall, f1_score + + def reset(self): + """ + Reset function empties the evaluation memory for previous mini-batches. + """ + self.num_infer_chunks = 0 + self.num_label_chunks = 0 + self.num_correct_chunks = 0 + + def name(self): + """ + Return name of metric instance. 
+ """ + return "precision", "recall", "f1" diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/distinct.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/distinct.py new file mode 100644 index 000000000..abcbfe95a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/distinct.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +__all__ = ["Distinct"] + + +class Distinct(paddle.metric.Metric): + """ + `Distinct` is an algorithm for evaluating the textual diversity of the + generated text by calculating the number of distinct n-grams. The larger + the number of distinct n-grams, the higher the diversity of the text. See + details at https://arxiv.org/abs/1510.03055. + + :class:`Distinct` could be used as a :class:`paddle.metric.Metric` class, + or an ordinary class. When :class:`Distinct` is used as a + :class:`paddle.metric.Metric` class, a function is needed to transform + the network output to a string list. + + Args: + n_size (int, optional): + Number of gram for :class:`Distinct` metric. Defaults to 2. + trans_func (callable, optional): + `trans_func` transforms the network output to a string list. Defaults to None. + + .. note:: + When :class:`Distinct` is used as a :class:`paddle.metric.Metric` + class, `trans_func` must be provided. Please note that the + input of `trans_func` is numpy array. + + name (str, optional): Name of :class:`paddle.metric.Metric` instance. + Defaults to "distinct". + + Examples: + 1. Using as a general evaluation object. + + .. code-block:: python + + from paddlenlp.metrics import Distinct + distinct = Distinct() + cand = ["The","cat","The","cat","on","the","mat"] + #update the states + distinct.add_inst(cand) + print(distinct.score()) + # 0.8333333333333334 + + 2. Using as an instance of `paddle.metric.Metric`. + + .. 
code-block:: python + + import numpy as np + from functools import partial + import paddle + from paddlenlp.transformers import BertTokenizer + from paddlenlp.metrics import Distinct + + def trans_func(logits, tokenizer): + '''Transform the network output `logits` to string list.''' + # [batch_size, seq_len] + token_ids = np.argmax(logits, axis=-1).tolist() + cand_list = [] + for ids in token_ids: + tokens = tokenizer.convert_ids_to_tokens(ids) + strings = tokenizer.convert_tokens_to_string(tokens) + cand_list.append(strings.split()) + return cand_list + + paddle.seed(2021) + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + distinct = Distinct(trans_func=partial(trans_func, tokenizer=tokenizer)) + batch_size, seq_len, vocab_size = 4, 16, tokenizer.vocab_size + logits = paddle.rand([batch_size, seq_len, vocab_size]) + + distinct.update(logits.numpy()) + print(distinct.accumulate()) # 1.0 + """ + + def __init__(self, n_size=2, trans_func=None, name="distinct"): + super(Distinct, self).__init__() + self._name = name + self.diff_ngram = set() + self.count = 0.0 + self.n_size = n_size + self.trans_func = trans_func + + def update(self, output, *args): + """ + Updates the metrics states. This method firstly will use + :meth:`trans_func` method to process the `output` to get the tokenized + candidate sentence list. Then call :meth:`add_inst` method to process + the candidate list one by one. + + Args: + output (numpy.ndarray|Tensor): + The outputs of model. + args (tuple): The additional inputs. + """ + if isinstance(output, paddle.Tensor): + output = output.numpy() + + assert self.trans_func is not None, ( + "The `update` method requires user " "to provide `trans_func` when initializing `Distinct`." + ) + cand_list = self.trans_func(output) + + for cand in cand_list: + self.add_inst(cand) + + def add_inst(self, cand): + """ + Updates the states based on the candidate. + + Args: + cand (list): Tokenized candidate sentence generated by model. + """ + for i in range(0, len(cand) - self.n_size + 1): + ngram = " ".join(cand[i : (i + self.n_size)]) + self.count += 1 + self.diff_ngram.add(ngram) + + def reset(self): + """Resets states and result.""" + self.diff_ngram = set() + self.count = 0.0 + + def accumulate(self): + """ + Calculates the final distinct score. + + Returns: + float: The final distinct score. + """ + distinct = len(self.diff_ngram) / self.count + return distinct + + def score(self): + """ + The function is the same as :meth:`accumulate` method. + + Returns: + float: The final distinct score. + """ + return self.accumulate() + + def name(self): + """ + Returns the metric name. + + Returns: + str: The metric name. + """ + return self._name diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/dureader.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/dureader.py new file mode 100644 index 000000000..003d554a4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/dureader.py @@ -0,0 +1,340 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Official evaluation script for SQuAD version 2.0. + +In addition to basic functionality, we also compute additional statistics and +plot precision-recall curves if an additional na_prob.json file is provided. +This file is expected to map question ID's to the model's predicted probability +that a question is unanswerable. +""" +import collections +import json +import math + +from paddlenlp.metrics.bleu import BLEU +from paddlenlp.metrics.rouge import RougeL + + +def compute_predictions( + all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, verbose, tokenizer +): + """Write final predictions to the json file and log-odds of null if needed.""" + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] + ) + + preds_for_eval = collections.OrderedDict() + preds_for_test = [] + + print(len(unique_id_to_result)) + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
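+                    # Note (added for readability, not in the upstream source): the checks
+                    # below drop candidate spans that point past the tokenized feature, that
+                    # start or end on tokens with no mapping back to the original document
+                    # (e.g. question or special tokens), that start on a token whose "max
+                    # context" window belongs to another feature, or that are reversed or
+                    # longer than `max_answer_length`.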
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index], + ) + ) + + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"] + ) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index : (pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start : (orig_doc_end + 1)] + tok_text = "".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + + tok_text = "".join(tok_text.split()) + orig_text = "".join(orig_tokens) + final_text = get_final_text(tok_text, orig_text, tokenizer, verbose) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) + + # if we didn't inlude the empty option in the n-best, inlcude it + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + else: + best_non_null_entry = _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0) + + preds_for_eval[example.qas_id] = best_non_null_entry.text + + preds_for_test.append( + { + "yesno_answers": [], + "question": example.question_text, + "question_type": example.question_type, + "answers": [best_non_null_entry.text], + "question_id": example.qas_id, + } + ) + + return preds_for_eval, preds_for_test + + +def get_final_text(pred_text, orig_text, tokenizer, verbose): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. 
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+
+    tok_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_text))
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose:
+            print("Unable to find text: '%s' in '%s'" % (pred_text, tok_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose:
+            print("Length not equal after stripping spaces: '%s' vs '%s'" % (orig_ns_text, tok_ns_text))
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
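+    # Illustrative note (added): for orig_text = "John  Smith", _strip_spaces gives
+    # orig_ns_text = "JohnSmith" and orig_ns_to_s_map = {0: 0, 1: 1, 2: 2, 3: 3, 4: 6, ...},
+    # i.e. the 'S' at no-space position 4 came from position 6 of the original string.
+    # Inverting the tok-side map below lets us walk pred_text's span boundaries from
+    # tok_text positions back to positions in orig_text.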
+ tok_s_to_ns_map = {} + for i, tok_index in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose: + print("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose: + print("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position : (orig_end_position + 1)] + return output_text + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def normalize(s): + """ + Normalize strings to space joined chars. + Args: + s: a list of strings. + Returns: + A list of normalized strings. + """ + if not s: + return s + normalized = [] + for ss in s: + tokens = [c for c in list(ss) if len(c.strip()) != 0] + norm_s = "".join(tokens) + norm_s = norm_s.replace(",", ",") + norm_s = norm_s.replace("。", ".") + norm_s = norm_s.replace("!", "!") + norm_s = norm_s.replace("?", "?") + norm_s = norm_s.replace(";", ";") + norm_s = norm_s.replace("(", "(").replace(")", ")") + norm_s = norm_s.replace("【", "[").replace("】", "]") + norm_s = norm_s.replace("“", '"').replace("“", '"') + normalized.append(norm_s) + return normalized + + +def dureader_evaluate(examples, preds): + bleu_eval = BLEU(4) + rouge_eval = RougeL() + + for example in examples: + qid = example.qas_id + if qid not in preds: + print("Missing prediction for %s" % qid) + continue + pred_answers = preds[qid] + pred_answers = normalize([pred_answers])[0] + ref_answers = example.orig_answer_text + if not ref_answers: + continue + ref_answers = normalize(ref_answers) + + bleu_eval.add_inst(pred_answers, ref_answers) + rouge_eval.add_inst(pred_answers, ref_answers) + + bleu4 = bleu_eval.score() + rouge_l = rouge_eval.score() + metrics = {"ROUGE-L": round(rouge_l * 100, 2), "BLEU-4": round(bleu4 * 100, 2)} + + print(json.dumps(metrics).encode("utf8")) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/glue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/glue.py new file mode 100644 index 000000000..8cfa40264 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/glue.py @@ -0,0 +1,668 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings + +import numpy as np +import paddle +from paddle.metric import Accuracy, Metric, Precision, Recall + +__all__ = ["AccuracyAndF1", "Mcc", "PearsonAndSpearman", "MultiLabelsMetric"] + + +class AccuracyAndF1(Metric): + """ + This class encapsulates Accuracy, Precision, Recall and F1 metric logic, + and `accumulate` function returns accuracy, precision, recall and f1. + The overview of all metrics could be seen at the document of `paddle.metric + `_ + for details. + + Args: + topk (int or tuple(int), optional): + Number of top elements to look at for computing accuracy. + Defaults to (1,). + pos_label (int, optional): The positive label for calculating precision + and recall. + Defaults to 1. + name (str, optional): + String name of the metric instance. Defaults to 'acc_and_f1'. + + Example: + + .. code-block:: + + import paddle + from paddlenlp.metrics import AccuracyAndF1 + + x = paddle.to_tensor([[0.1, 0.9], [0.5, 0.5], [0.6, 0.4], [0.7, 0.3]]) + y = paddle.to_tensor([[1], [0], [1], [1]]) + + m = AccuracyAndF1() + correct = m.compute(x, y) + m.update(correct) + res = m.accumulate() + print(res) # (0.5, 0.5, 0.3333333333333333, 0.4, 0.45) + + """ + + def __init__(self, topk=(1,), pos_label=1, name="acc_and_f1", *args, **kwargs): + super(AccuracyAndF1, self).__init__(*args, **kwargs) + self.topk = topk + self.pos_label = pos_label + self._name = name + self.acc = Accuracy(self.topk, *args, **kwargs) + self.precision = Precision(*args, **kwargs) + self.recall = Recall(*args, **kwargs) + self.reset() + + def compute(self, pred, label, *args): + """ + Accepts network's output and the labels, and calculates the top-k + (maximum value in topk) indices for accuracy. + + Args: + pred (Tensor): + Predicted tensor, and its dtype is float32 or float64, and + has a shape of [batch_size, num_classes]. + label (Tensor): + The ground truth tensor, and its dtype is int64, and has a + shape of [batch_size, 1] or [batch_size, num_classes] in one + hot representation. + + Returns: + Tensor: Correct mask, each element indicates whether the prediction + equals to the label. Its' a tensor with a data type of float32 and + has a shape of [batch_size, topk]. + + """ + self.label = label + self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label] + return self.acc.compute(pred, label) + + def update(self, correct, *args): + """ + Updates the metrics states (accuracy, precision and recall), in order to + calculate accumulated accuracy, precision and recall of all instances. + + Args: + correct (Tensor): + Correct mask for calculating accuracy, and it's a tensor with + shape [batch_size, topk] and has a dtype of + float32. + + """ + self.acc.update(correct) + self.precision.update(self.preds_pos, self.label) + self.recall.update(self.preds_pos, self.label) + + def accumulate(self): + """ + Calculates and returns the accumulated metric. + + Returns: + tuple: The accumulated metric. A tuple of shape (acc, precision, + recall, f1, average_of_acc_and_f1) + + With the fields: + + - `acc` (numpy.float64): + The accumulated accuracy. 
+ - `precision` (numpy.float64): + The accumulated precision. + - `recall` (numpy.float64): + The accumulated recall. + - `f1` (numpy.float64): + The accumulated f1. + - `average_of_acc_and_f1` (numpy.float64): + The average of accumulated accuracy and f1. + + """ + acc = self.acc.accumulate() + precision = self.precision.accumulate() + recall = self.recall.accumulate() + if precision == 0.0 or recall == 0.0: + f1 = 0.0 + else: + # 1/f1 = 1/2 * (1/precision + 1/recall) + f1 = (2 * precision * recall) / (precision + recall) + return ( + acc, + precision, + recall, + f1, + (acc + f1) / 2, + ) + + def reset(self): + """ + Resets all metric states. + """ + self.acc.reset() + self.precision.reset() + self.recall.reset() + self.label = None + self.preds_pos = None + + def name(self): + """ + Returns name of the metric instance. + + Returns: + str: The name of the metric instance. + + """ + return self._name + + +class Mcc(Metric): + """ + This class calculates `Matthews correlation coefficient `_ . + + Args: + name (str, optional): + String name of the metric instance. Defaults to 'mcc'. + + Example: + + .. code-block:: + + import paddle + from paddlenlp.metrics import Mcc + + x = paddle.to_tensor([[-0.1, 0.12], [-0.23, 0.23], [-0.32, 0.21], [-0.13, 0.23]]) + y = paddle.to_tensor([[1], [0], [1], [1]]) + + m = Mcc() + (preds, label) = m.compute(x, y) + m.update((preds, label)) + res = m.accumulate() + print(res) # (0.0,) + + """ + + def __init__(self, name="mcc", *args, **kwargs): + super(Mcc, self).__init__(*args, **kwargs) + self._name = name + self.tp = 0 # true positive + self.fp = 0 # false positive + self.tn = 0 # true negative + self.fn = 0 # false negative + + def compute(self, pred, label, *args): + """ + Processes the pred tensor, and returns the indices of the maximum of each + sample. + + Args: + pred (Tensor): + The predicted value is a Tensor with dtype float32 or float64. + Shape is [batch_size, 1]. + label (Tensor): + The ground truth value is Tensor with dtype int64, and its + shape is [batch_size, 1]. + + Returns: + tuple: A tuple of preds and label. Each shape is + [batch_size, 1], with dtype float32 or float64. + + """ + preds = paddle.argsort(pred, descending=True)[:, :1] + return (preds, label) + + def update(self, preds_and_labels): + """ + Calculates states, i.e. the number of true positive, false positive, + true negative and false negative samples. + + Args: + preds_and_labels (tuple[Tensor]): + Tuple of predicted value and the ground truth label, with dtype + float32 or float64. Each shape is [batch_size, 1]. + + """ + preds = preds_and_labels[0] + labels = preds_and_labels[1] + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + if isinstance(labels, paddle.Tensor): + labels = labels.numpy().reshape(-1, 1) + sample_num = labels.shape[0] + for i in range(sample_num): + pred = preds[i] + label = labels[i] + if pred == 1: + if pred == label: + self.tp += 1 + else: + self.fp += 1 + else: + if pred == label: + self.tn += 1 + else: + self.fn += 1 + + def accumulate(self): + """ + Calculates and returns the accumulated metric. + + Returns: + tuple: Returns the accumulated metric, a tuple of shape (mcc,), `mcc` is the accumulated mcc and its data + type is float64. 
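+
+            For instance (illustrative numbers only), with ``tp=2``, ``fp=1``, ``tn=3``
+            and ``fn=1`` the accumulated value is
+            ``(2*3 - 1*1) / sqrt((2+1)*(2+1)*(3+1)*(3+1)) = 5/12 ≈ 0.4167``.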
+ + """ + if self.tp == 0 or self.fp == 0 or self.tn == 0 or self.fn == 0: + mcc = 0.0 + else: + # mcc = (tp*tn-fp*fn)/ sqrt(tp+fp)(tp+fn)(tn+fp)(tn+fn)) + mcc = (self.tp * self.tn - self.fp * self.fn) / math.sqrt( + (self.tp + self.fp) * (self.tp + self.fn) * (self.tn + self.fp) * (self.tn + self.fn) + ) + return (mcc,) + + def reset(self): + """ + Resets all metric states. + """ + self.tp = 0 # true positive + self.fp = 0 # false positive + self.tn = 0 # true negative + self.fn = 0 # false negative + + def name(self): + """ + Returns name of the metric instance. + + Returns: + str: The name of the metric instance. + + """ + return self._name + + +class PearsonAndSpearman(Metric): + """ + The class calculates `Pearson correlation coefficient `_ + and `Spearman's rank correlation coefficient `_ . + + + Args: + name (str, optional): + String name of the metric instance. Defaults to 'pearson_and_spearman'. + + Example: + + .. code-block:: + + import paddle + from paddlenlp.metrics import PearsonAndSpearman + + x = paddle.to_tensor([[0.1], [1.0], [2.4], [0.9]]) + y = paddle.to_tensor([[0.0], [1.0], [2.9], [1.0]]) + + m = PearsonAndSpearman() + m.update((x, y)) + res = m.accumulate() + print(res) # (0.9985229081857804, 1.0, 0.9992614540928901) + + """ + + def __init__(self, name="pearson_and_spearman", *args, **kwargs): + super(PearsonAndSpearman, self).__init__(*args, **kwargs) + self._name = name + self.preds = [] + self.labels = [] + + def update(self, preds_and_labels): + """ + Ensures the type of preds and labels is numpy.ndarray and reshapes them + into [-1, 1]. + + Args: + preds_and_labels (tuple[Tensor] or list[Tensor]): + Tuple or list of predicted value and the ground truth label. + Its data type should be float32 or float64 and its shape is [batch_size, d0, ..., dN]. + + """ + preds = preds_and_labels[0] + labels = preds_and_labels[1] + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + if isinstance(labels, paddle.Tensor): + labels = labels.numpy() + preds = np.squeeze(preds.reshape(-1, 1)).tolist() + labels = np.squeeze(labels.reshape(-1, 1)).tolist() + self.preds.append(preds) + self.labels.append(labels) + + def accumulate(self): + """ + Calculates and returns the accumulated metric. + + Returns: + tuple: Returns the accumulated metric, a tuple of (pearson, spearman, + the_average_of_pearson_and_spearman). + + With the fields: + + - `pearson` (numpy.float64): + The accumulated pearson. + + - `spearman` (numpy.float64): + The accumulated spearman. + + - `the_average_of_pearson_and_spearman` (numpy.float64): + The average of accumulated pearson and spearman correlation + coefficient. 
+ + """ + preds = [item for sublist in self.preds for item in sublist] + labels = [item for sublist in self.labels for item in sublist] + pearson = self.pearson(preds, labels) + spearman = self.spearman(preds, labels) + return ( + pearson, + spearman, + (pearson + spearman) / 2, + ) + + def pearson(self, preds, labels): + n = len(preds) + # simple sums + sum1 = sum(float(preds[i]) for i in range(n)) + sum2 = sum(float(labels[i]) for i in range(n)) + # sum up the squares + sum1_pow = sum([pow(v, 2.0) for v in preds]) + sum2_pow = sum([pow(v, 2.0) for v in labels]) + # sum up the products + p_sum = sum([preds[i] * labels[i] for i in range(n)]) + + numerator = p_sum - (sum1 * sum2 / n) + denominator = math.sqrt((sum1_pow - pow(sum1, 2) / n) * (sum2_pow - pow(sum2, 2) / n)) + if denominator == 0: + return 0.0 + return numerator / denominator + + def spearman(self, preds, labels): + preds_rank = self.get_rank(preds) + labels_rank = self.get_rank(labels) + + total = 0 + n = len(preds) + for i in range(n): + total += pow((preds_rank[i] - labels_rank[i]), 2) + spearman = 1 - float(6 * total) / (n * (pow(n, 2) - 1)) + return spearman + + def get_rank(self, raw_list): + x = np.array(raw_list) + r_x = np.empty(x.shape, dtype=int) + y = np.argsort(-x) + for i, k in enumerate(y): + r_x[k] = i + 1 + return r_x + + def reset(self): + """ + Resets all metric states. + """ + self.preds = [] + self.labels = [] + + def name(self): + """ + Returns name of the metric instance. + + Returns: + str: The name of the metric instance. + + """ + return self._name + + +class MultiLabelsMetric(Metric): + """ + This class encapsulates Accuracy, Precision, Recall and F1 metric logic in + multi-labels setting (also the binary setting). + Some codes are taken and modified from sklearn.metrics . + + Args: + num_labels (int) + The total number of labels which is usually the number of classes + name (str, optional): + String name of the metric instance. Defaults to 'multi_labels_metric'. + + Example: + + .. 
code-block:: + + import paddle + from paddlenlp.metrics import MultiLabelsMetric + + x = paddle.to_tensor([[0.1, 0.2, 0.9], [0.5, 0.8, 0.5], [0.6, 1.5, 0.4], [2.8, 0.7, 0.3]]) + y = paddle.to_tensor([[2], [1], [2], [1]]) + + m = MultiLabelsMetric(num_labels=3) + args = m.compute(x, y) + m.update(args) + + result1 = m.accumulate(average=None) + # (array([0.0, 0.5, 1.0]), array([0.0, 0.5, 0.5]), array([0.0, 0.5, 0.66666667])) + result2 = m.accumulate(average='binary', pos_label=0) + # (0.0, 0.0, 0.0) + result3 = m.accumulate(average='binary', pos_label=1) + # (0.5, 0.5, 0.5) + result4 = m.accumulate(average='binary', pos_label=2) + # (1.0, 0.5, 0.6666666666666666) + result5 = m.accumulate(average='micro') + # (0.5, 0.5, 0.5) + result6 = m.accumulate(average='macro') + # (0.5, 0.3333333333333333, 0.38888888888888884) + result7 = m.accumulate(average='weighted') + # (0.75, 0.5, 0.5833333333333333) + + Note: When zero_division is encountered (details as followed), the corresponding metrics will be set to 0.0 + precision is zero_division if there are no positive predictions + recall is zero_division if there are no positive labels + fscore is zero_division if all labels AND predictions are negative + """ + + def __init__(self, num_labels, name="multi_labels_metric"): + super(MultiLabelsMetric, self).__init__() + if num_labels <= 1: + raise ValueError(f"The num_labels is {num_labels}, which must be greater than 1.") + self.num_labels = num_labels + self._name = name + self._confusion_matrix = np.zeros((num_labels, 2, 2), dtype=int) + + def update(self, args): + """ + Updates the metrics states (accuracy, precision and recall), in order to + calculate accumulated accuracy, precision and recall of all instances. + + Args: + args (tuple of Tensor): + the tuple returned from `compute` function + """ + pred = args[0].numpy() + label = args[1].numpy() + tmp_confusion_matrix = self._multi_labels_confusion_matrix(pred, label) + self._confusion_matrix += tmp_confusion_matrix + + def accumulate(self, average=None, pos_label=1): + """ + Calculates and returns the accumulated metric. + + Args: + average (str in {‘binary’, ‘micro’, ‘macro’, ’weighted’} or None, optional): + Defaults to `None`. If `None`, the scores for each class are returned. + Otherwise, this determines the type of averaging performed on the data: + + - `binary` : + Only report results for the class specified by pos_label. + + - `micro` : + Calculate metrics globally by counting the total true positives, + false negatives and false positives. + + - `macro` : + Calculate metrics for each label, and find their unweighted mean. + This does not take label imbalance into account. + + - `weighted` : + Calculate metrics for each label, and find their average weighted + by support (the number of true instances for each label). This + alters `macro` to account for label imbalance; it can result in + an F-score that is not between precision and recall. + + pos_label (int, optional): + The positive label for calculating precision and recall in binary settings. + Noted: Only when `average='binary'`, this arguments will be used. Otherwise, + it will be ignored. + Defaults to 1. + + Returns: + tuple: The accumulated metric. A tuple of shape (precision, recall, f1) + With the fields: + + - `precision` (numpy.float64 or numpy.ndarray if average=None): + The accumulated precision. + - `recall` (numpy.float64 or numpy.ndarray if average=None): + The accumulated recall. + - `f1` (numpy.float64 or numpy.ndarray if average=None): + The accumulated f1. 
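+
+            Note that for single-label inputs (one predicted class per example), the
+            ``'micro'`` average reduces precision, recall and f1 to plain accuracy,
+            because every false positive for one class is counted as a false negative
+            for another.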
+ + """ + if average not in {"binary", "micro", "macro", "weighted", None}: + raise ValueError(f"The average is {average}, which is unknown.") + if average == "binary": + if pos_label >= self.num_labels: + raise ValueError( + f"The pos_label is {pos_label}, num_labels is {self.num_labels}. " + f"The num_labels must be greater than pos_label." + ) + + confusion_matrix = None # [*, 2, 2] + if average == "binary": + confusion_matrix = np.expand_dims(self._confusion_matrix[pos_label], axis=0) + elif average == "micro": + confusion_matrix = self._confusion_matrix.sum(axis=0, keepdims=True) + # if average is 'macro' or 'weighted' or None + else: + confusion_matrix = self._confusion_matrix + + tp = confusion_matrix[:, 1, 1] # [*,] + pred = tp + confusion_matrix[:, 0, 1] # [*,] + true = tp + confusion_matrix[:, 1, 0] # [*,] + + def _robust_divide(numerator, denominator, metric_name): + mask = denominator == 0.0 + denominator = denominator.copy() + denominator[mask] = 1 # avoid zero division + result = numerator / denominator + + if not np.any(mask): + return result + + # precision is zero_division if there are no positive predictions + # recall is zero_division if there are no positive labels + # fscore is zero_division if all labels AND predictions are negative + warnings.warn(f"Zero division when calculating {metric_name}.", UserWarning) + result[mask] = 0.0 + return result + + precision = _robust_divide(tp, pred, "precision") + recall = _robust_divide(tp, true, "recall") + f1 = _robust_divide(2 * (precision * recall), (precision + recall), "f1") + + weights = None # [num_labels] + if average == "weighted": + weights = true + if weights.sum() == 0: + zero_division_value = np.float64(0.0) + if pred.sum() == 0: + return (zero_division_value, zero_division_value, zero_division_value) + else: + return (np.float64(0.0), zero_division_value, np.float64(0.0)) + elif average == "macro": + weights = np.ones((self.num_labels), dtype=float) + if average is not None: + precision = np.average(precision, weights=weights) + recall = np.average(recall, weights=weights) + f1 = np.average(f1, weights=weights) + + return precision, recall, f1 + + def compute(self, pred, label): + """ + Accepts network's output and the labels, and calculates the top-k + (maximum value in topk) indices for accuracy. + + Args: + pred (Tensor): + Predicted tensor, and its dtype is float32 or float64, and + has a shape of [batch_size, *, num_labels]. + label (Tensor): + The ground truth tensor, and its dtype is int64, and has a + shape of [batch_size, *] or [batch_size, *, num_labels] in one + hot representation. + + Returns: + tuple of Tensor: it contains two Tensor of shape [*, 1]. + The tuple should be passed to `update` function. 
+ """ + if not (paddle.is_tensor(pred) and paddle.is_tensor(label)): + raise ValueError("pred and label must be paddle tensor") + + if pred.shape[-1] != self.num_labels: + raise ValueError(f"The last dim of pred is {pred.shape[-1]}, " f"which should be num_labels") + pred = paddle.reshape(pred, [-1, self.num_labels]) + pred = paddle.argmax(pred, axis=-1) + + if label.shape[-1] == self.num_labels: + label = paddle.reshape(label, [-1, self.num_labels]) + label = paddle.argmax(label, axis=-1) + else: + label = paddle.reshape(label, [-1]) + if paddle.max(label) >= self.num_labels: + raise ValueError(f"Tensor label has value {paddle.max(label)}, " f"which is no less than num_labels") + + if pred.shape[0] != label.shape[0]: + raise ValueError("The length of pred is not equal to the length of label") + + return pred, label + + def _multi_labels_confusion_matrix(self, pred, label): + tp_bins = label[pred == label] + tp = np.bincount(tp_bins, minlength=self.num_labels) # [num_labels,] + tp_plus_fp = np.bincount(pred, minlength=self.num_labels) # [num_labels,] + tp_plus_fn = np.bincount(label, minlength=self.num_labels) # [num_labels,] + fp = tp_plus_fp - tp # [num_labels,] + fn = tp_plus_fn - tp # [num_labels,] + tn = pred.shape[0] - tp - fp - fn # [num_labels,] + return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2) # [num_labels, 2, 2] + + def reset(self): + self._confusion_matrix = np.zeros((self.num_labels, 2, 2), dtype=int) + + def name(self): + """ + Returns name of the metric instance. + + Returns: + str: The name of the metric instance. + + """ + return self._name diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/mrr.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/mrr.py new file mode 100644 index 000000000..128ee9670 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/mrr.py @@ -0,0 +1,68 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from sklearn.metrics import pairwise_distances + +__all__ = ["MRR"] + + +class MRR: + """ + MRR - Mean Reciprocal Rank, is a popular metric for recommend system + and other retrival task. The higher mrr is, the better performance of + model in retrival task. + + Args: + distance: which algorithm to use to get distance of embeddings, for example: "cosine", "euclidean" + + """ + + def __init__(self, distance="cosine"): + super().__init__() + self.distance = distance + + def reset_distance(self, distance): + """ + change the algorithm of calculating distance, need to be supported of sklearn.metrics.pairwise_distance + """ + self.distance = distance + + def compute_matrix_mrr(self, labels, embeddings): + """ + A function which can calculate the distance of one embedding to other embeddings + in the matrix, and then it can find the most similar embedding's index to calculate + the mrr metric for this one embedding. 
After getting all the embeddings' mrr metric, + a mean pool is used to get the final mrr metric for input matrix. + + Param: + - labels(np.array): label matrix, shape=[size, ] + - embeddings(np.array): embedding matrix, shape=[size, emb_dim] + + Return: + mrr metric for input embedding matrix. + """ + matrix_size = labels.shape[0] + if labels.shape[0] != embeddings.shape[0]: + raise Exception("label and embedding matrix must have same size at dim=0 !") + row_mrr = [] # mrr metric for each embedding of matrix + for i in range(0, matrix_size): + emb, label = embeddings[i, :], labels[i] + dists = pairwise_distances(emb.reshape(1, -1), embeddings, metric=self.distance).reshape(-1) + ranks_ids = np.argsort(dists)[1:] + ranks = (labels[ranks_ids] == label).astype(int) + ranks_nonzero_ids = ranks.nonzero()[0] + row_mrr.append(1.0 / (1 + ranks_nonzero_ids[0]) if ranks_nonzero_ids.size else 0.0) + mrr = np.mean(row_mrr) # user mean value as final mrr metric for the matrix. + return mrr diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/perplexity.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/perplexity.py new file mode 100644 index 000000000..a785d3780 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/perplexity.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn.functional as F + + +class Perplexity(paddle.metric.Metric): + """ + Perplexity is a metric used to judge how good a language model is. + We can define perplexity as the inverse probability of the test set, + normalised by the number of the words in the test set. + Perplexity is calculated using cross entropy. It supports both padding data + and no padding data. + + If data is not padded, users should provide `seq_len` for `Metric` + initialization. If data is padded, your label should contain `seq_mask`, + which indicates the actual length of samples. + + This Perplexity requires that the output of your network is prediction, + label and sequence length (optional). If the Perplexity here doesn't meet + your needs, you could override the `compute` or `update` method for + calculating Perplexity. + + Args: + seq_len(int): Sequence length of each sample, it must be provided while + data is not padded. Defaults to 20. + name(str): Name of `Metric` instance. Defaults to 'Perplexity'. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BertTokenizer + from paddlenlp.metrics import Perplexity + + paddle.seed(2021) + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + batch_size, seq_len, vocab_size = 1, 4, tokenizer.vocab_size + logits = paddle.rand([batch_size, seq_len, vocab_size]) + labels= paddle.to_tensor([[1,0,1,1]]) + + perplexity = Perplexity() + correct = perplexity.compute(logits,labels) + perplexity.update(correct.numpy()) + res = perplexity.accumulate() + print(res) + # 48263.528820122105 + """ + + def __init__(self, name="Perplexity", *args, **kwargs): + super(Perplexity, self).__init__(*args, **kwargs) + self._name = name + self.total_ce = 0 + self.total_word_num = 0 + + def compute(self, pred, label, seq_mask=None): + """ + Computes cross entropy loss. + + Args: + pred (Tensor): + Predictor tensor, and its dtype is float32 or float64, and has + a shape of [batch_size, sequence_length, vocab_size]. + label(Tensor): + Label tensor, and its dtype is int64, and has a shape of + [batch_size, sequence_length, 1] or [batch_size, sequence_length]. + seq_mask(Tensor, optional): + Sequence mask tensor, and its type could be float32, float64, + int32 or int64, and has a shape of [batch_size, sequence_length]. + It's used to calculate loss. Defaults to None. + + Returns: + tuple or Tensor: Returns tuple (`ce, word_num`) if `seq_mask` is not None. Otherwise, returns tensor `ce`. + `ce` it the cross entropy loss, its shape is [batch_size, sequence_length] and its data type should be float32. + + """ + if label.dim() == 2: + label = paddle.unsqueeze(label, axis=2) + ce = F.cross_entropy(input=pred, label=label, reduction="none", soft_label=False) + ce = paddle.squeeze(ce, axis=[2]) + if seq_mask is not None: + ce = ce * seq_mask.astype(ce.dtype) + word_num = paddle.sum(seq_mask) + return ce, word_num + return ce + + def update(self, ce, word_num=None): + """ + Updates metric states. + + Args: + ce (numpy.ndarray): + Cross entropy loss, it's calculated by `compute` and converted + to `numpy.ndarray`. + word_num (numpy.ndarray): + The number of words of sequence, it's calculated by `compute` + and converted to `numpy.ndarray`. Defaults to None. + + """ + batch_ce = np.sum(ce) + if word_num is None: + word_num = ce.shape[0] * ce.shape[1] + else: + word_num = word_num.item() + self.total_ce += batch_ce + self.total_word_num += word_num + + def reset(self): + """ + Resets all metric states. + """ + self.total_ce = 0 + self.total_word_num = 0 + + def accumulate(self): + """ + Calculates and returns the value of perplexity. + + Returns: + float: Returns `perplexity`, the calculation results. + """ + return np.exp(self.total_ce / self.total_word_num) + + def name(self): + """ + Returns name of the metric instance. + + Returns: + str: The name of the metric instance. + + """ + return self._name diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/rouge.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/rouge.py new file mode 100644 index 000000000..0991c1586 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/rouge.py @@ -0,0 +1,284 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
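As a quick sanity check for the MRR metric added above, here is a minimal sketch. It is illustrative only: the toy labels and embeddings are made up, and the import path is assumed to mirror the file layout of metrics/mrr.py above (adjust it to however this tree is installed). It also needs numpy and scikit-learn.

import numpy as np
from paddlenlp.metrics.mrr import MRR   # assumed import path mirroring metrics/mrr.py above

# Two classes, two items each; same-class embeddings are close in cosine distance.
labels = np.array([0, 0, 1, 1])
embeddings = np.array([
    [1.0, 0.0],
    [0.9, 0.1],
    [0.0, 1.0],
    [0.1, 0.9],
])

metric = MRR(distance="cosine")
print(metric.compute_matrix_mrr(labels, embeddings))
# 1.0 for this toy set: the closest other embedding always shares the query's label.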
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +import paddle +from .utils import default_trans_func + +__all__ = ["RougeL", "RougeLForDuReader"] + + +class RougeN: + def __init__(self, n): + self.n = n + + def _get_ngrams(self, words): + """Calculates word n-grams for multiple sentences.""" + ngram_set = set() + max_index_ngram_start = len(words) - self.n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(words[i : i + self.n])) + return ngram_set + + def score(self, evaluated_sentences_ids, reference_sentences_ids): + overlapping_count, reference_count = self.compute(evaluated_sentences_ids, reference_sentences_ids) + return overlapping_count / reference_count + + def compute(self, evaluated_sentences_ids, reference_sentences_ids): + """ + Args: + evaluated_sentences (list): the sentences ids predicted by the model. + reference_sentences (list): the referenced sentences ids. Its size should be same as evaluated_sentences. + + Returns: + overlapping_count (int): the overlapping n-gram count. + reference_count (int): the reference sentences n-gram count. + """ + if len(evaluated_sentences_ids) <= 0 or len(reference_sentences_ids) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + reference_count = 0 + overlapping_count = 0 + + for evaluated_sentence_ids, reference_sentence_ids in zip(evaluated_sentences_ids, reference_sentences_ids): + evaluated_ngrams = self._get_ngrams(evaluated_sentence_ids) + reference_ngrams = self._get_ngrams(reference_sentence_ids) + reference_count += len(reference_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count += len(overlapping_ngrams) + + return overlapping_count, reference_count + + def accumulate(self): + """ + This function returns the mean precision, recall and f1 score for all accumulated minibatches. + + Returns: + float: mean precision, recall and f1 score. + """ + rouge_score = self.overlapping_count / self.reference_count + return rouge_score + + def reset(self): + """ + Reset function empties the evaluation memory for previous mini-batches. + """ + self.overlapping_count = 0 + self.reference_count = 0 + + def name(self): + """ + Return name of metric instance. + """ + return "Rouge-%s" % self.n + + def update(self, overlapping_count, reference_count): + """ + Args: + """ + self.overlapping_count += overlapping_count + self.reference_count += reference_count + + +class Rouge1(RougeN): + def __init__(self): + super(Rouge1, self).__init__(n=1) + + +class Rouge2(RougeN): + def __init__(self): + super(Rouge2, self).__init__(n=2) + + +class RougeL(paddle.metric.Metric): + r""" + Rouge-L is Recall-Oriented Understudy for Gisting Evaluation based on Longest Common Subsequence (LCS). + Longest common subsequence problem takes into account sentence level structure + similarity naturally and identifies longest co-occurring + in sequence n-grams automatically. + + .. 
math:: + + R_{LCS} & = \frac{LCS(C,S)}{len(S)} + + P_{LCS} & = \frac{LCS(C,S)}{len(C)} + + F_{LCS} & = \frac{(1 + \gamma^2)R_{LCS}P_{LCS}}{R_{LCS}} + \gamma^2{R_{LCS}} + + where `C` is the candidate sentence, and `S` is the reference sentence. + + Args: + trans_func (callable, optional): `trans_func` transforms the network + output to string to calculate. + vocab (dict|paddlenlp.data.vocab, optional): Vocab for target language. + If `trans_func` is None and RougeL is used as `paddle.metric.Metric` + instance, `default_trans_func` will be performed and `vocab` must + be provided. + gamma (float): A hyperparameter to decide the weight of recall. Defaults to 1.2. + name (str, optional): Name of `paddle.metric.Metric` instance. Defaults to "rouge-l". + + Examples: + .. code-block:: python + + from paddlenlp.metrics import RougeL + rougel = RougeL() + cand = ["The","cat","The","cat","on","the","mat"] + ref_list = [["The","cat","is","on","the","mat"], ["There","is","a","cat","on","the","mat"]] + rougel.add_inst(cand, ref_list) + print(rougel.score()) # 0.7800511508951408 + + """ + + def __init__(self, trans_func=None, vocab=None, gamma=1.2, name="rouge-l", *args, **kwargs): + super(RougeL, self).__init__(*args, **kwargs) + self.gamma = gamma + self.inst_scores = [] + self._name = name + self.vocab = vocab + self.trans_func = trans_func + + def lcs(self, string, sub): + """ + Calculate the length of longest common subsequence of string and sub. + + Args: + string (str): + The string to be calculated, usually longer the sub string. + sub (str): + The sub string to be calculated. + + Returns: + float: Returns the length of the longest common subsequence of string and sub. + """ + if len(string) < len(sub): + sub, string = string, sub + lengths = np.zeros((len(string) + 1, len(sub) + 1)) + for j in range(1, len(sub) + 1): + for i in range(1, len(string) + 1): + if string[i - 1] == sub[j - 1]: + lengths[i][j] = lengths[i - 1][j - 1] + 1 + else: + lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) + return lengths[len(string)][len(sub)] + + def add_inst(self, cand, ref_list): + """ + Update the states based on the a pair of candidate and references. + + Args: + cand (str): The candidate sentence generated by model. + ref_list (list): List of ground truth sentences. + """ + precs, recalls = [], [] + for ref in ref_list: + basic_lcs = self.lcs(cand, ref) + prec = basic_lcs / len(cand) if len(cand) > 0.0 else 0.0 + rec = basic_lcs / len(ref) if len(ref) > 0.0 else 0.0 + precs.append(prec) + recalls.append(rec) + + prec_max = max(precs) + rec_max = max(recalls) + + if prec_max != 0 and rec_max != 0: + score = ((1 + self.gamma**2) * prec_max * rec_max) / float(rec_max + self.gamma**2 * prec_max) + else: + score = 0.0 + self.inst_scores.append(score) + + def update(self, output, label, seq_mask=None): + if self.trans_func is None: + if self.vocab is None: + raise AttributeError( + "The `update` method requires users to provide `trans_func` or `vocab` when initializing RougeL." + ) + cand_list, ref_list = default_trans_func(output, label, seq_mask, self.vocab) + else: + cand_list, ref_list = self.trans_func(output, label, seq_mask) + if len(cand_list) != len(ref_list): + raise ValueError("Length error! Please check the output of network.") + for i in range(len(cand_list)): + self.add_inst(cand_list[i], ref_list[i]) + + def accumulate(self): + """ + Calculate the final rouge-l metric. 
+ """ + return 1.0 * sum(self.inst_scores) / len(self.inst_scores) + + def score(self): + return self.accumulate() + + def reset(self): + self.inst_scores = [] + + def name(self): + return self._name + + +class RougeLForDuReader(RougeL): + """ + Rouge-L metric with bonus for DuReader contest. + + Please refer to `DuReader Homepage`_ for more details. + + Args: + alpha (float, optional): Weight of YesNo dataset when adding bonus for DuReader contest. Defaults to 1.0. + beta (float, optional): Weight of Entity dataset when adding bonus for DuReader contest. Defaults to 1.0. + """ + + def __init__(self, alpha=1.0, beta=1.0, gamma=1.2): + super(RougeLForDuReader, self).__init__(gamma) + self.alpha = alpha + self.beta = beta + + def add_inst(self, cand, ref_list, yn_label=None, yn_ref=None, entity_ref=None): + precs, recalls = [], [] + for i, ref in enumerate(ref_list): + basic_lcs = self.lcs(cand, ref) + yn_bonus, entity_bonus = 0.0, 0.0 + if yn_ref is not None and yn_label is not None: + yn_bonus = self.add_yn_bonus(cand, ref, yn_label, yn_ref[i]) + elif entity_ref is not None: + entity_bonus = self.add_entity_bonus(cand, entity_ref) + p_denom = len(cand) + self.alpha * yn_bonus + self.beta * entity_bonus + r_denom = len(ref) + self.alpha * yn_bonus + self.beta * entity_bonus + prec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) / p_denom if p_denom > 0.0 else 0.0 + rec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) / r_denom if r_denom > 0.0 else 0.0 + precs.append(prec) + recalls.append(rec) + + prec_max = max(precs) + rec_max = max(recalls) + if prec_max != 0 and rec_max != 0: + score = ((1 + self.gamma**2) * prec_max * rec_max) / float(rec_max + self.gamma**2 * prec_max) + else: + score = 0.0 + self.inst_scores.append(score) + + def add_yn_bonus(self, cand, ref, yn_label, yn_ref): + if yn_label != yn_ref: + return 0.0 + lcs_ = self.lcs(cand, ref) + return lcs_ + + def add_entity_bonus(self, cand, entity_ref): + lcs_ = 0.0 + for ent in entity_ref: + if ent in cand: + lcs_ += len(ent) + return lcs_ diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/sighan.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/sighan.py new file mode 100644 index 000000000..e9226ab3f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/sighan.py @@ -0,0 +1,103 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import numpy as np +from paddle.metric import Metric + +__all__ = ["DetectionF1", "CorrectionF1"] + + +class DetectionF1(Metric): + def __init__(self, pos_label=1, name="DetectionF1", *args, **kwargs): + super(DetectionF1, self).__init__(*args, **kwargs) + self.pos_label = pos_label + self._name = name + self.reset() + + def update(self, preds, labels, length, *args): + # [B, T, 2] + pred_labels = preds.argmax(axis=-1) + for i, label_length in enumerate(length): + pred_label = pred_labels[i][1 : 1 + label_length] + label = labels[i][1 : 1 + label_length] + # the sequence has errors + if (label == self.pos_label).any(): + if (pred_label == label).all(): + self.tp += 1 + else: + self.fn += 1 + else: + if (label != pred_label).any(): + self.fp += 1 + + def reset(self): + """ + Resets all of the metric state. + """ + self.tp = 0 + self.fp = 0 + self.fn = 0 + + def accumulate(self): + precision = np.nan + if self.tp + self.fp > 0: + precision = self.tp / (self.tp + self.fp) + recall = np.nan + if self.tp + self.fn > 0: + recall = self.tp / (self.tp + self.fn) + if self.tp == 0: + f1 = 0.0 + else: + f1 = 2 * precision * recall / (precision + recall) + return f1, precision, recall + + def name(self): + """ + Returns name of the metric instance. + + Returns: + str: The name of the metric instance. + + """ + return self._name + + +class CorrectionF1(DetectionF1): + def __init__(self, pos_label=1, name="CorrectionF1", *args, **kwargs): + super(CorrectionF1, self).__init__(pos_label, name, *args, **kwargs) + + def update(self, det_preds, det_labels, corr_preds, corr_labels, length, *args): + # [B, T, 2] + det_preds_labels = det_preds.argmax(axis=-1) + corr_preds_labels = corr_preds.argmax(axis=-1) + + for i, label_length in enumerate(length): + # Ignore [CLS] token, so calculate from position 1. + det_preds_label = det_preds_labels[i][1 : 1 + label_length] + det_label = det_labels[i][1 : 1 + label_length] + corr_preds_label = corr_preds_labels[i][1 : 1 + label_length] + corr_label = corr_labels[i][1 : 1 + label_length] + + # The sequence has any errors. + if (det_label == self.pos_label).any(): + corr_pred_label = corr_preds_label * det_preds_label + corr_label = det_label * corr_label + if (corr_pred_label == corr_label).all(): + self.tp += 1 + else: + self.fn += 1 + else: + if (det_label != det_preds_label).any(): + self.fp += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/span.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/span.py new file mode 100644 index 000000000..a432fd0cd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/span.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.metric import Metric + +from ..utils.tools import get_bool_ids_greater_than, get_span + + +class SpanEvaluator(Metric): + """ + SpanEvaluator computes the precision, recall and F1-score for span detection. 
+ """ + + def __init__(self, limit=0.5): + super(SpanEvaluator, self).__init__() + self.num_infer_spans = 0 + self.num_label_spans = 0 + self.num_correct_spans = 0 + self.limit = limit + + def compute(self, start_probs, end_probs, gold_start_ids, gold_end_ids): + """ + Computes the precision, recall and F1-score for span detection. + """ + pred_start_ids = get_bool_ids_greater_than(start_probs, self.limit) + pred_end_ids = get_bool_ids_greater_than(end_probs, self.limit) + gold_start_ids = get_bool_ids_greater_than(gold_start_ids.tolist(), self.limit) + gold_end_ids = get_bool_ids_greater_than(gold_end_ids.tolist(), self.limit) + num_correct_spans = 0 + num_infer_spans = 0 + num_label_spans = 0 + for predict_start_ids, predict_end_ids, label_start_ids, label_end_ids in zip( + pred_start_ids, pred_end_ids, gold_start_ids, gold_end_ids + ): + [_correct, _infer, _label] = self.eval_span( + predict_start_ids, predict_end_ids, label_start_ids, label_end_ids + ) + num_correct_spans += _correct + num_infer_spans += _infer + num_label_spans += _label + return num_correct_spans, num_infer_spans, num_label_spans + + def update(self, num_correct_spans, num_infer_spans, num_label_spans): + """ + This function takes (num_infer_spans, num_label_spans, num_correct_spans) as input, + to accumulate and update the corresponding status of the SpanEvaluator object. + """ + self.num_infer_spans += num_infer_spans + self.num_label_spans += num_label_spans + self.num_correct_spans += num_correct_spans + + def eval_span(self, predict_start_ids, predict_end_ids, label_start_ids, label_end_ids): + """ + evaluate position extraction (start, end) + return num_correct, num_infer, num_label + input: [1, 2, 10] [4, 12] [2, 10] [4, 11] + output: (1, 2, 2) + """ + pred_set = get_span(predict_start_ids, predict_end_ids) + label_set = get_span(label_start_ids, label_end_ids) + num_correct = len(pred_set & label_set) + num_infer = len(pred_set) + # For the case of overlapping in the same category, + # length of label_start_ids and label_end_ids is not equal + num_label = max(len(label_start_ids), len(label_end_ids)) + return (num_correct, num_infer, num_label) + + def accumulate(self): + """ + This function returns the mean precision, recall and f1 score for all accumulated minibatches. + + Returns: + tuple: Returns tuple (`precision, recall, f1 score`). + """ + precision = float(self.num_correct_spans / self.num_infer_spans) if self.num_infer_spans else 0.0 + recall = float(self.num_correct_spans / self.num_label_spans) if self.num_label_spans else 0.0 + f1_score = float(2 * precision * recall / (precision + recall)) if self.num_correct_spans else 0.0 + return precision, recall, f1_score + + def reset(self): + """ + Reset function empties the evaluation memory for previous mini-batches. + """ + self.num_infer_spans = 0 + self.num_label_spans = 0 + self.num_correct_spans = 0 + + def name(self): + """ + Return name of metric instance. + """ + return "precision", "recall", "f1" diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/squad.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/squad.py new file mode 100644 index 000000000..37b0f1ca7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/squad.py @@ -0,0 +1,436 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import re +import string +import json +import numpy as np + +from ..utils.log import logger + + +def compute_prediction( + examples, + features, + predictions, + version_2_with_negative=False, + n_best_size=20, + max_answer_length=30, + null_score_diff_threshold=0.0, +): + """ + Post-processes the predictions of a question-answering model to convert + them to answers that are substrings of the original contexts. This is + the base postprocessing functions for models that only return start and + end logits. + + Args: + examples (list): List of raw squad-style data (see `run_squad.py + `__ for more + information). + features (list): List of processed squad-style features (see + `run_squad.py `__ + for more information). + predictions (tuple): The predictions of the model. Should be a tuple + of two list containing the start logits and the end logits. + version_2_with_negative (bool, optional): Whether the dataset contains + examples with no answers. Defaults to False. + n_best_size (int, optional): The total number of candidate predictions + to generate. Defaults to 20. + max_answer_length (int, optional): The maximum length of predicted answer. + Defaults to 20. + null_score_diff_threshold (float, optional): The threshold used to select + the null answer. Only useful when `version_2_with_negative` is True. + Defaults to 0.0. + + Returns: + A tuple of three dictionaries containing final selected answer, all n_best + answers along with their probability and scores, and the score_diff of each + example. + """ + assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)." + all_start_logits, all_end_logits = predictions + + assert len(predictions[0]) == len(features), "Number of predictions should be equal to number of features." + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + + scores_diff_json = collections.OrderedDict() + + # Let's loop over all the examples! + for example_index, example in enumerate(examples): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. 
+ offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + or len(offset_mapping[start_index]) == 0 + or len(offset_mapping[end_index]) == 0 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. + for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. 
+ if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. + i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + return all_predictions, all_nbest_json, scores_diff_json + + +def make_qid_to_has_ans(examples): + qid_to_has_ans = {} + for example in examples: + if "is_impossible" in example: + has_ans = example["is_impossible"] + else: + has_ans = not len(example["answers"]["answer_start"]) == 0 + qid_to_has_ans[example["id"]] = has_ans + return qid_to_has_ans + + +def remove_punctuation(in_str): + in_str = str(in_str).lower().strip() + sp_char = [ + "-", + ":", + "_", + "*", + "^", + "/", + "\\", + "~", + "`", + "+", + "=", + ",", + "。", + ":", + "?", + "!", + "“", + "”", + ";", + "’", + "《", + "》", + "……", + "·", + "、", + "「", + "」", + "(", + ")", + "-", + "~", + "『", + "』", + ] + out_segs = [] + for char in in_str: + if char in sp_char: + continue + else: + out_segs.append(char) + return "".join(out_segs) + + +def normalize_answer(s): + # Lower text and remove punctuation, articles and extra whitespace. 
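# Illustrative only: the helpers below implement the usual SQuAD normalization, e.g.
#   normalize_answer("The  Eiffel Tower!")  ->  "eiffel tower"
# (lower-case, strip punctuation, drop the articles a/an/the, collapse whitespace).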
+ def remove_articles(text): + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return remove_punctuation("".join(ch for ch in text if ch not in exclude)) + + def lower(text): + return text.lower() + + if not s: + return "" + else: + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred, is_whitespace_splited=True): + gold_toks = normalize_answer(a_gold).split() + pred_toks = normalize_answer(a_pred).split() + + if not is_whitespace_splited: + gold_toks = gold_toks[0] if gold_toks else "" + pred_toks = pred_toks[0] if pred_toks else "" + + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def get_raw_scores(examples, preds, is_whitespace_splited=True): + exact_scores = {} + f1_scores = {} + for example in examples: + qid = example["id"] + gold_answers = [text for text in example["answers"]["text"] if normalize_answer(text)] + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [""] + if qid not in preds: + logger.info("Missing prediction for %s" % qid) + continue + a_pred = preds[qid] + # Take max over all gold answers + exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) + f1_scores[qid] = max(compute_f1(a, a_pred, is_whitespace_splited) for a in gold_answers) + + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores.values()) / total), + ("f1", 100.0 * sum(f1_scores.values()) / total), + ("total", total), + ] + ) + else: + total = len(qid_list) + return collections.OrderedDict( + [ + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ("total", total), + ] + ) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval["%s_%s" % (prefix, k)] = new_eval[k] + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def 
find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval["best_exact"] = best_exact + main_eval["best_exact_thresh"] = exact_thresh + main_eval["best_f1"] = best_f1 + main_eval["best_f1_thresh"] = f1_thresh + + +def squad_evaluate(examples, preds, na_probs=None, na_prob_thresh=1.0, is_whitespace_splited=True): + """ + Computes and prints the f1 score and em score of input prediction. + Args: + examples (list): List of raw squad-style data (see `run_squad.py + `__ for more + information). + preds (dict): Dictionary of final predictions. Usually generated by + `compute_prediction`. + na_probs (dict, optional): Dictionary of score_diffs of each example. + Used to decide if answer exits and compute best score_diff + threshold of null. Defaults to None. + na_prob_thresh (float, optional): The threshold used to select the + null answer. Defaults to 1.0. + is_whitespace_splited (bool, optional): Whether the predictions and references + can be tokenized by whitespace. Usually set True for English and + False for Chinese. Defaults to True. + """ + + if not na_probs: + na_probs = {k: 0.0 for k in preds} + + qid_to_has_ans = make_qid_to_has_ans(examples) # maps qid to True/False + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = get_raw_scores(examples, preds, is_whitespace_splited) + exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, na_prob_thresh) + f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, na_prob_thresh) + out_eval = make_eval_dict(exact_thresh, f1_thresh) + if has_ans_qids: + has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) + merge_eval(out_eval, has_ans_eval, "HasAns") + if no_ans_qids: + no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) + merge_eval(out_eval, no_ans_eval, "NoAns") + find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) + logger.info(json.dumps(out_eval, indent=2)) + + return out_eval diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/utils.py new file mode 100644 index 000000000..8d4d61368 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/metrics/utils.py @@ -0,0 +1,40 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
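As a quick illustration of the SQuAD-style scoring helpers in metrics/squad.py above (the strings are invented and the import path is assumed to mirror the file layout):

from paddlenlp.metrics.squad import compute_exact, compute_f1   # assumed import path

print(compute_exact("The Eiffel Tower", "eiffel  tower!"))
# 1: normalization removes case, articles, punctuation and extra whitespace.

print(round(compute_f1("in the city of Paris", "Paris, France"), 2))
# 0.33: one shared token ("paris") out of 4 gold tokens and 2 predicted tokens.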
+ +import numpy as np + + +def default_trans_func(output, label, seq_mask, vocab): + seq_mask = np.expand_dims(seq_mask, axis=2).repeat(output.shape[2], axis=2) + output = output * seq_mask + idx = np.argmax(output, axis=2) + cand, ref_list = [], [] + for i in range(idx.shape[0]): + token_list = [] + for j in range(idx.shape[1]): + if seq_mask[i][j][0] == 0: + break + token_list.append(vocab[idx[i][j]]) + cand.append(token_list) + + label = np.squeeze(label, axis=2) + for i in range(label.shape[0]): + token_list = [] + for j in range(label.shape[1]): + if seq_mask[i][j][0] == 0: + break + token_list.append(vocab[label[i][j]]) + + ref_list.append([token_list]) + return cand, ref_list diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/__init__.py new file mode 100644 index 000000000..ea980d24c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from . import optimizer +from .distributed import * +from .einsum import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/__init__.py new file mode 100644 index 000000000..67f3869d7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import parallel +from . import utils +from .parallel import * +from .utils import * + +__all__ = [] +__all__ += parallel.__all__ +__all__ += utils.__all__ diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/parallel.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/parallel.py new file mode 100644 index 000000000..a0d93359e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/parallel.py @@ -0,0 +1,311 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
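A small, made-up example of the trans_func contract that RougeL.update() relies on: default_trans_func above turns logits, padded labels and a sequence mask into token lists via the vocab (the vocab, logits and labels below are invented; the import path is assumed):

import numpy as np
from paddlenlp.metrics.utils import default_trans_func   # assumed import path mirroring metrics/utils.py above

vocab = {0: "<pad>", 1: "hello", 2: "world"}
output = np.zeros((1, 3, 3))
output[0, 0, 1] = 1.0              # step 0 predicts "hello"
output[0, 1, 2] = 1.0              # step 1 predicts "world"
label = np.array([[[1], [2], [0]]])
seq_mask = np.array([[1, 1, 0]])   # third step is padding

cand, ref_list = default_trans_func(output, label, seq_mask, vocab)
print(cand)       # [['hello', 'world']]
print(ref_list)   # [[['hello', 'world']]]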
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + +try: + from paddle.distributed.fleet import fleet +except Exception: + import warnings + + warnings.warn("paddle.distributed is not contains in you paddle!") + +__all__ = [ + "guard", + "ParallelEmbedding", + "ColumnParallelLiner", + "RowParallelLiner", +] + + +def guard(device): + def decorator(Layer): + class WrapperClass(Layer): + def __init__(self, *args, **kw): + with paddle.static.device_guard(device): + print("Init {} on {}".format(Layer.__name__, device)) + super().__init__(*args, **kw) + + def forward(self, *args, **kw): + with paddle.static.device_guard(device): + print("Forward {} on {}".format(Layer.__name__, device)) + return super().forward(*args, **kw) + + return WrapperClass + + return decorator + + +class ParallelEmbedding(nn.Layer): + """ + Parallel Embedding. + + Args: + num_embeddings (int): + The size of embedding dictionary which dictates the maximum value of the input id. + embedding_dim (int): + The dimensions of each embedding vector. + rank (int): + The rank of the current part, which determines the start index of the vocab. + world_size (int): + The number of trainers. + weight_attr (Tensor, optional): + Specify the weight parameter property, including the initialization method. + Defaults to None which means the default weight parameter property will be used. + name (str, optional): + Normally there is no need for user to set this property. + Defaults to None. + """ + + def __init__(self, num_embeddings, embedding_dim, rank, world_size, weight_attr=None, name=None): + super(ParallelEmbedding, self).__init__() + self.rank = rank + self.world_size = world_size + self.num_embeddings = num_embeddings + self.is_mp = self.world_size > 1 + + assert ( + num_embeddings % self.world_size == 0 + ), "The length of the vocabulary must be divisible by the parallelism degree of MP" + + per_part_size = num_embeddings // self.world_size + + self.vocab_start_index = self.rank * per_part_size + self._dtype = self._helper.get_default_dtype() + self._size = [per_part_size, embedding_dim] + self._weight_attr = weight_attr + self._name = name + + self.weight = self.create_parameter(attr=self._weight_attr, shape=self._size, dtype=self._dtype, is_bias=False) + self.weight.is_distributed = True + + startup_block = paddle.static.default_startup_program().global_block() + main_block = paddle.static.default_main_program().global_block() + startup_block.vars[self.weight.name].is_distributed = True + main_block.vars[self.weight.name].is_distributed = True + + def forward(self, x): + """ + Args: + x (Tensor): + A Tensor contains the id information. + Its data type should be int32 or int64, and the value of the input id should be in [0, weight.shape[0]] . + + Returns: + Tensor: Returns the embedding Tensor mapped by x. 
+ """ + if self.is_mp: + output_parallel = paddle.distributed.collective._c_lookup_table( + self.weight, x, start_index=self.vocab_start_index, name=self._name + ) + output = paddle.distributed.collective._mp_allreduce( + output_parallel, group=None, use_calc_stream=True, use_model_parallel=True + ) + else: + output = paddle.nn.functional.embedding( + x, weight=self.weight, padding_idx=None, sparse=False, name=self._name + ) + return output + + +class ColumnParallelLiner(nn.Layer): + """ + Parallel Linear, axis=1. + + Args: + size (int): + The size of embedding vector. + num_partitions (int, optional): + The number of parts within a model parallel group. Defaults to 1. + gather_out (bool, optional): + Whether to gather the output tensor. Defaults to True. + param_attr (Tensor, optional): + Specify the parameter property, including the initialization method. + Defaults to None which means the default parameter property will be used. + bias_attr (Tensor, optional): + Specify the bias property. + Defaults to None which means the default parameter property will be used. + name (str, optional): + Normally there is no need for user to set this property. + Defaults to None. + + """ + + def __init__(self, size, num_partitions=1, gather_out=True, param_attr=None, bias_attr=None, name=None): + super().__init__() + + if paddle.in_dynamic_mode(): + rank = paddle.distributed.get_rank() + else: + assert fleet._role_maker, "To use paddle.distributed.split, " "you must call fleet.init() firstly." + rank = fleet.worker_index() + + # rank within a model parallel group + inner_rank = rank % num_partitions + self.gather_out = gather_out + + assert ( + size[1] % num_partitions == 0 + ), "Number of column of the weight for linear ({}) must be" " divisible by num_partitions ({})".format( + size[1], num_partitions + ) + self.per_part_size = size[1] // num_partitions + linear_size = (size[0], self.per_part_size) + + num_rows, num_cols = linear_size + + if not name: + name = "fc_by_col_rank_%d" % inner_rank + else: + name = name + "_by_col_rank_%d" % inner_rank + + self.linear = paddle.nn.Linear(num_rows, num_cols, weight_attr=param_attr, bias_attr=bias_attr, name=name) + + weight = self.linear.weight + weight.is_distributed = True + # alias for weight tensor + self.weight = self.linear.weight + + startup_block = paddle.static.default_startup_program().global_block() + main_block = paddle.static.default_main_program().global_block() + startup_block.vars[weight.name].is_distributed = True + main_block.vars[weight.name].is_distributed = True + # set is_distributed for splited bias + # if a linear layer is splited by col, the bias would also be split into each rank as its weight + if self.linear._bias_attr: + startup_block.vars[self.linear.bias.name].is_distributed = True + main_block.vars[self.linear.bias.name].is_distributed = True + self.bias = self.linear.bias + + def forward(self, x): + """ + Args: + x (Tensor): + The input tensor. Its data type can be int or float. + + Returns: + Tensor: Returns the embedding Tensor mapped by x. + """ + group = None + x = paddle.distributed.collective._c_identity(x, group=group) + output_parallel = self.linear(x) + if self.gather_out is False: + return output_parallel + + return paddle.distributed.collective._c_concat(output_parallel, group=group) + + +class RowParallelLiner(nn.Layer): + """ + Parallel Linear, axis=0. + + Args: + size (int): + The size of embedding vector. + num_partitions (int, optional): + The number of parts within a model parallel group. Defaults to 1. 
+ input_is_parallel (bool, optional): + Whether the input is parallel. Defaults to `False`. + param_attr (Tensor, optional): + Specify the parameter property, including the initialization method. + Defaults to None which means the default parameter property will be used. + bias_attr (Tensor, optional): + Specify the bias property. + Defaults to None which means the default parameter property will be used. + name (str, optional): + Normally there is no need for user to set this property. + Defaults to None. + + """ + + def __init__(self, size, num_partitions=1, input_is_parallel=False, param_attr=None, bias_attr=None, name=None): + super().__init__() + + if paddle.in_dynamic_mode(): + rank = paddle.distributed.get_rank() + else: + assert fleet._role_maker, "To use paddle.distributed.split, " "you must call fleet.init() firstly." + rank = fleet.worker_index() + + # rank within a model parallel group + inner_rank = rank % num_partitions + self.input_is_parallel = input_is_parallel + + assert ( + size[0] % num_partitions == 0 + ), "Number of rows of the weight for linear ({}) must be" " divisible by num_partitions ({})".format( + size[0], num_partitions + ) + self.per_part_size = size[0] // num_partitions + linear_size = (self.per_part_size, size[1]) + + num_rows, num_cols = linear_size + + if not name: + name = "fc_by_row_rank_%d" % inner_rank + else: + name = name + "_by_row_rank_%d" % inner_rank + self.linear = paddle.nn.Linear( + num_rows, + num_cols, + weight_attr=param_attr, + # NOTE(wangxi): row split, bias need add after allreduce + bias_attr=False, + name=name, + ) + + weight = self.linear.weight + weight.is_distributed = True + # alias for weight tensor + self.weight = self.linear.weight + self.bias = self.linear.bias + + startup_block = paddle.static.default_startup_program().global_block() + main_block = paddle.static.default_main_program().global_block() + startup_block.vars[weight.name].is_distributed = True + main_block.vars[weight.name].is_distributed = True + # set is_distributed for splited bias + # if a linear layer is splited by row, each rank would hold a complete bias + + if bias_attr is not False: + self.bias = self.create_parameter(shape=[num_cols], attr=bias_attr, dtype=self._dtype, is_bias=True) + else: + self.bias = None + + def forward(self, x): + """ + Args: + x (Tensor): + The input tensor. Its data type can be int or float. + + Returns: + Tensor: Returns the embedding Tensor mapped by x. + """ + group = None + if self.input_is_parallel: + assert x.shape[-1] == self.per_part_size, ( + "The width ({}) of the input " + "x must be equal to the height ({}) of the weight. Maybe you " + "should split the input x using paddle.split.".format(x.shape[-1], self.per_part_size) + ) + else: + # split last dim + x = paddle.distributed.collective._c_split(x, group=group) + output_parallel = self.linear(x) + output = paddle.distributed.collective._mp_allreduce( + output_parallel, group=group, use_calc_stream=True, use_model_parallel=True + ) + output = output + self.bias if self.bias is not None else output + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/__init__.py new file mode 100644 index 000000000..429c4f5a6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
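The partitioning arithmetic used by ParallelEmbedding above is easy to check on paper. A tiny sketch (the vocabulary size and parallel degree are made up; forward() then all-reduces the partial lookups across ranks, and ColumnParallelLiner / RowParallelLiner split their weight matrices along columns / rows in the same spirit):

num_embeddings, world_size = 50304, 4             # vocab size must be divisible by the MP degree
per_part_size = num_embeddings // world_size      # 12576 embedding rows stored on each rank
vocab_start_index = [rank * per_part_size for rank in range(world_size)]
print(vocab_start_index)                          # [0, 12576, 25152, 37728]
# Rank r looks up ids in [vocab_start_index[r], vocab_start_index[r] + per_part_size).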
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .topo import Topology +from .random import get_rng_state_tracker + +__all__ = [ + "Topology", + "get_rng_state_tracker", +] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/random.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/random.py new file mode 100644 index 000000000..3fb406c6b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/random.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import paddle + +MODEL_PARALLEL_RNG = "model_parallel_rng" + + +class RNGStatesTracker: + """ + Tracker the RNG states. + """ + + def __init__(self): + # Map from name to the rng state. + self.states_ = {} + self.seeds_ = set() + + def add(self, name, seed): + if seed in self.seeds_: + raise ValueError("seed {} already exists".format(seed)) + self.seeds_.add(seed) + if name in self.states_: + raise ValueError("state {} already exists".format(name)) + orig_rng_state = paddle.get_cuda_rng_state() + paddle.seed(seed) + self.states_[name] = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(orig_rng_state) + + @contextlib.contextmanager + def rng_state(self, name=MODEL_PARALLEL_RNG): + if name not in self.states_: + raise ValueError("state {} does not exist".format(name)) + orig_cuda_rng_state = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(self.states_[name]) + try: + yield + finally: + self.states_[name] = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(orig_cuda_rng_state) + + +RNG_STATE_TRACKER = RNGStatesTracker() + + +def get_rng_state_tracker(): + return RNG_STATE_TRACKER diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/topo.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/topo.py new file mode 100644 index 000000000..9d1a3312a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/distributed/utils/topo.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
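A sketch of how the RNG state tracker above is typically used. It requires a GPU build of Paddle, since it snapshots the CUDA RNG state; the stream name and seed below are arbitrary, and the import path is assumed to mirror the file layout:

import paddle
from paddlenlp.ops.distributed.utils import get_rng_state_tracker   # assumed import path

tracker = get_rng_state_tracker()
tracker.add("model_parallel_rng", seed=1234)   # register a dedicated RNG stream once per process

with tracker.rng_state("model_parallel_rng"):
    # Random ops here draw from the tracked stream; the global CUDA RNG state is
    # restored afterwards, so other random ops are unaffected.
    mask = paddle.rand([2, 3])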
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import namedtuple + +import numpy as np + +GroupInfo = namedtuple("GroupInfo", ["size", "rank", "world"]) + + +class Topology: + def __init__( + self, + device_rank, + world_size, + dp_degree=None, + pp_degree=1, + sharding_degree=1, + mp_degree=1, + sep_degree=1, + order=["dp", "pp", "sharding", "mp", "sep"], + ): + assert set(order) == {"dp", "pp", "sharding", "mp", "sep"}, f"Illegal order : {order}" + self.order = order + + degree_map = { + "dp": dp_degree, + "pp": pp_degree, + "sharding": sharding_degree, + "mp": mp_degree, + "sep": sep_degree, + } + shape = [degree_map[key] for key in self.order] + + arr = np.arange(0, dp_degree * pp_degree * sharding_degree * mp_degree * sep_degree).reshape(shape) + ranks = [rank[0] for rank in np.where(arr == device_rank)] + + self.world = GroupInfo(size=world_size, rank=device_rank, world=list(range(0, world_size))) + worlds = [] + for i in range(len(ranks)): + indexs = tuple(ranks[:i] + [slice(None)] + ranks[(i + 1) :]) + worlds.append(arr[indexs]) + + for i, key in enumerate(self.order): + if key == "dp": + self.dp_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "pp": + self.pp_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "sharding": + self.sharding_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "mp": + self.mp_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + elif key == "sep": + self.sep_info = GroupInfo(size=len(worlds[i]), rank=ranks[i], world=worlds[i].tolist()) + + self.is_last = self.pp_info.rank == self.pp_info.size - 1 + + data_arr = np.arange(0, dp_degree * sharding_degree).reshape([dp_degree, sharding_degree]) + for i, key in enumerate(self.order): + if key != "dp" and key != "sharding": + data_arr = np.expand_dims(data_arr, axis=i).repeat(degree_map[key], axis=i) + + self.data_info = GroupInfo( + size=int(self.dp_info.size * self.sharding_info.size), + rank=int(self.dp_info.rank * self.sharding_info.size + self.sharding_info.rank), + world=data_arr.reshape(-1).tolist(), + ) + + assert self.data_info.world[device_rank] == self.data_info.rank, "Data rank caculate error!" + self.data_inner_times = self.world.size // self.data_info.size + + def __repr__(self): + return f"dp_info:\n\t {self.dp_info}, \npp_info:\n\t {self.pp_info}, \nsharding_info:\n\t {self.sharding_info}, \nmp_info:\n\t {self.mp_info}, \nsep_info:\n\t {self.sep_info}, \ndata_info:\n\t {self.data_info}, \norder:\n\t {self.order}" diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/einsum.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/einsum.py new file mode 100644 index 000000000..7e0e22850 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/einsum.py @@ -0,0 +1,367 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
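To see what the Topology helper above computes, a small sketch for 8 devices split 2-way data parallel, 2-way pipeline parallel and 2-way model parallel (the degrees are made up; only numpy is needed, and the import path is assumed to mirror the file layout):

from paddlenlp.ops.distributed.utils.topo import Topology   # assumed import path

topo = Topology(device_rank=5, world_size=8,
                dp_degree=2, pp_degree=2, sharding_degree=1, mp_degree=2)
print(topo.dp_info)    # GroupInfo(size=2, rank=1, world=[1, 5])
print(topo.pp_info)    # GroupInfo(size=2, rank=0, world=[5, 7])
print(topo.mp_info)    # GroupInfo(size=2, rank=1, world=[4, 5])
print(topo.data_info)  # combined data-parallel view over the dp and sharding ranks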
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle + +__all__ = ["einsum", "transfer_param"] + + +def einsum(equation, *operands): + r""" + Executes the sum of product of provided operands based on the Einstein summation convention. + Einsum can be used to complete a variety of operations, such as sum, transpose, + batch matrix multiplication. + + Args: + equation (`str`): + Uses uncased letters to specify the dimension of the operands and result. The input + equation is on the left hand before `->` while the output equation is on the right side. + Einsum can infer the result shape so that the `->` and the result label letters can be omitted. + Operands in the input equation are splitted by commas (','), e.g. 'abc,cde' describes two 3D + operands. The dimensions labeled with same letter should be same or be 1. Ellipsis ('...') can + be used to specify the broadcast dimensions. + + operands (`Tensor`): + The operands to compute the Einstein sum of. The number of operands should be the same as the + the operands described in input equation. + + Returns: + `Tensor`: The result of Einstein sum product. + + Example: + .. code-block:: + + import numpy as np + import paddle + import paddlenlp + + np.random.seed(102) + + x = paddle.to_tensor(np.random.rand(4)) + y = paddle.to_tensor(np.random.rand(5)) + # sum + print(paddlenlp.ops.einsum('i->', x)) + # Tensor(shape=[], dtype=float64, place=CUDAPlace(0), stop_gradient=True, 2.30369050) + + # dot + print(paddlenlp.ops.einsum('i,i->', x, x)) + # Tensor(shape=[], dtype=float64, place=CUDAPlace(0), stop_gradient=True, 1.43773247) + + # outer + print(paddlenlp.ops.einsum("i,j->ij", x, y)), + # Tensor(shape=[4, 5], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.34590188, 0.48353496, 0.09996135, 0.18656330, 0.21392910], + # [0.39122025, 0.54688535, 0.11305780, 0.21100591, 0.24195704], + # [0.17320613, 0.24212422, 0.05005442, 0.09341929, 0.10712238], + # [0.42290818, 0.59118179, 0.12221522, 0.22809690, 0.26155500]]) + + A = paddle.to_tensor(np.random.rand(2, 3, 2)) + B = paddle.to_tensor(np.random.rand(2, 2, 3)) + # transpose + print(paddlenlp.ops.einsum('ijk->kji', A)) + # Tensor(shape=[2, 3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[[0.49174730, 0.33344683], + # [0.89440989, 0.26162022], + # [0.36116209, 0.12241719]], + + # [[0.49019824, 0.51895050], + # [0.18241053, 0.13092809], + # [0.81059146, 0.55165734]]]) + + # batch matrix multiplication + print(paddlenlp.ops.einsum('ijk, ikl->ijl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[[0.13654339, 0.39331432, 0.65059661], + # [0.07171420, 0.57518653, 0.77629221], + # [0.21250688, 0.37793541, 0.73643411]], + + # [[0.56925339, 0.65859030, 0.57509818], + # [0.30368265, 0.25778348, 0.21630400], + # [0.39587265, 0.58031243, 0.51824755]]]) + + # Ellipsis transpose + print(paddlenlp.ops.einsum('...jk->...kj', A)) + # Tensor(shape=[2, 2, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[[0.49174730, 0.89440989, 0.36116209], + # [0.49019824, 0.18241053, 0.81059146]], + + # [[0.33344683, 0.26162022, 0.12241719], + # 
[0.51895050, 0.13092809, 0.55165734]]]) + + # Ellipsis batch matrix multiplication + print(paddlenlp.ops.einsum('...jk, ...kl->...jl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[[0.13654339, 0.39331432, 0.65059661], + # [0.07171420, 0.57518653, 0.77629221], + # [0.21250688, 0.37793541, 0.73643411]], + + # [[0.56925339, 0.65859030, 0.57509818], + # [0.30368265, 0.25778348, 0.21630400], + # [0.39587265, 0.58031243, 0.51824755]]]) + """ + # paddle.einsum can be used in paddle 2.3.0+ + if hasattr(paddle, "einsum"): + return paddle.einsum(equation, *operands) + + def _mul_sum(left, right, sum_dims): + assert left.rank() == right.rank(), "number of rank should be equal." + if len(sum_dims) == 0: + return left * right + sum_dims_set = set(sum_dims) + batch_dims = [] + left_out_dims = [] + right_out_dims = [] + batch_size = summed_size = left_size = right_size = 1 + dim = len(left.shape) + for i in range(dim): + is_left_summed_dim = left.shape[i] > 1 # not broadcast dim + is_right_summed_dim = right.shape[i] > 1 + if i in sum_dims_set: + if is_left_summed_dim and is_right_summed_dim: + assert left.shape[i] == right.shape[i], "Non-broadcast dim should be equal." + summed_size *= left.shape[i] + elif is_left_summed_dim: + left = left.sum(axis=i, keepdim=True) + elif is_right_summed_dim: + right = right.sum(axis=i, keepdim=True) + elif is_left_summed_dim and is_right_summed_dim: + assert left.shape[i] == right.shape[i], "Non-broadcast dim should be equal." + batch_dims.append(i) + batch_size *= left.shape[i] + elif is_left_summed_dim: + left_out_dims.append(i) + left_size *= left.shape[i] + else: + right_out_dims.append(i) + right_size *= right.shape[i] + out_shape = [left.shape[i] for i in batch_dims + left_out_dims] + out_shape.extend([1] * len(sum_dims)) + out_shape.extend([right.shape[i] for i in right_out_dims]) + + left_perm = list(batch_dims) + left_perm.extend(left_out_dims) + left_perm.extend(sum_dims) + left_perm.extend(right_out_dims) + + right_perm = list(batch_dims) + right_perm.extend(sum_dims) + right_perm.extend(right_out_dims) + right_perm.extend(left_out_dims) + + output_perm = [-1] * (len(batch_dims) + len(left_out_dims) + len(sum_dims) + len(right_out_dims)) + for i, j in enumerate(batch_dims + left_out_dims + sum_dims + right_out_dims): + output_perm[j] = i + + left = paddle.reshape(paddle.transpose(left, perm=left_perm), (batch_size, left_size, summed_size)) + right = paddle.reshape(paddle.transpose(right, perm=right_perm), (batch_size, summed_size, right_size)) + result = paddle.matmul(left, right) + result = paddle.reshape(result, out_shape) + result = paddle.transpose(result, output_perm) + return result + + if len(operands) == 1 and isinstance(operands[0], (list, tuple)): + operands = operands[0] + # Equation is case insensitive + num_letters = 26 + letters_to_idx = [-1] * num_letters + equation = equation.lower().replace(" ", "") + # 1. Parse the equation + eqns = equation.split("->") + num_eqns_size = len(eqns) + assert num_eqns_size <= 2, "The '->' should exist at most only once" + + input_eqn = eqns[0] + output_eqn = None if num_eqns_size <= 1 else eqns[1] + operand_eqns = input_eqn.split(",") + assert len(operand_eqns) == len( + operands + ), "Number of operands in equation and the tensors provided should be equal." 
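For readers of the fallback path: `_mul_sum` above turns each pairwise contraction into one batched matmul by permuting the operands into `(batch, left, summed)` and `(batch, summed, right)` layouts. A NumPy sketch of that layout trick for a plain `'ij,jk->ik'` contraction (hypothetical shapes, illustration only):

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.random((4, 3))  # i=4, j=3 (j is the summed index)
y = rng.random((3, 5))  # j=3, k=5

# Reshape to (batch, left, summed) and (batch, summed, right), then one matmul,
# which is the reduction _mul_sum performs after its transposes.
lhs = x.reshape(1, 4, 3)
rhs = y.reshape(1, 3, 5)
out = np.matmul(lhs, rhs).reshape(4, 5)

assert np.allclose(out, np.einsum("ij,jk->ik", x, y))
```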
+ + # Parse input equation + num_total_idxes = 0 + input_operand_idxes = [] + letter_frequence = [0] * num_letters + idxes_last_operand = [] + num_ell_idxes = -1 + first_ell_idx = 0 + for i, term in enumerate(operand_eqns): + ell_char_count = 0 + operand_rank = int(operands[i].rank().cpu().numpy()) + curr_num_ell_idxes = operand_rank - len(term) + 3 + dims_in_terms = 0 + curr_operand_idxes = [] + for ch in term: + if ch == ".": + ell_char_count += 1 + assert ell_char_count <= 3, "The '.' should only exist in one ellipsis '...' in term {}".format(term) + if ell_char_count == 3: + if num_ell_idxes == -1: + num_ell_idxes = curr_num_ell_idxes + first_ell_idx = num_total_idxes + num_total_idxes += num_ell_idxes + else: + assert ( + curr_num_ell_idxes == num_ell_idxes + ), "Ellipsis in all terms should represent same dimensions ({}).".format(num_ell_idxes) + + for j in range(num_ell_idxes): + curr_operand_idxes.append(j + first_ell_idx) + idxes_last_operand.append(i) + dims_in_terms += num_ell_idxes + else: + assert (ell_char_count == 0) or ( + ell_char_count == 3 + ), "'.' must only occur in ellipsis, operand {}".format(term) + assert ord("a") <= ord(ch) and ord(ch) <= ord("z"), "only accept alphabet (a-zA-Z)" + letter_num = ord(ch) - ord("a") + if letters_to_idx[letter_num] == -1: + letters_to_idx[letter_num] = num_total_idxes + num_total_idxes += 1 + idxes_last_operand.append(i) + else: + idxes_last_operand[letters_to_idx[letter_num]] = i + letter_frequence[letter_num] += 1 + curr_operand_idxes.append(letters_to_idx[letter_num]) + dims_in_terms += 1 + + assert dims_in_terms == operand_rank, "Dimension dismatch for operand {}: equation {}, tensor {}".format( + i, dims_in_terms, operand_rank + ) + input_operand_idxes.append(curr_operand_idxes) + # Parse output equation + idxes_to_output_dims = [-1] * num_total_idxes + num_output_dims = 0 + if num_eqns_size == 2: + ell_char_count = 0 + for ch in output_eqn: + if ch == ".": + ell_char_count += 1 + assert ell_char_count <= 3, "The '.' should only exist in one ellipsis '...' in term {}".format( + output_eqn + ) + if ell_char_count == 3: + assert num_ell_idxes > -1, "Input equation '{}' don't have ellipsis.".format(input_eqn) + for j in range(num_ell_idxes): + idxes_to_output_dims[first_ell_idx + j] = num_output_dims + num_output_dims += 1 + + else: + assert (ell_char_count == 0) or ( + ell_char_count == 3 + ), "'.' 
must only occur in ellipsis, operand {}".format(output_eqn) + assert ord("a") <= ord(ch) and ord(ch) <= ord("z"), "only accept alphabet (a-zA-Z)" + letter_num = ord(ch) - ord("a") + assert letters_to_idx[letter_num] != -1, "character {} doesn't exist in input".format(ch) + assert ( + idxes_to_output_dims[letters_to_idx[letter_num]] == -1 + ), "character {} occurs twice in output".format(ch) + + idxes_to_output_dims[letters_to_idx[letter_num]] = num_output_dims + num_output_dims += 1 + else: # num_eqns_size == 1 + # Infer the output dims + if num_ell_idxes >= 0: + for j in range(num_ell_idxes): + idxes_to_output_dims[first_ell_idx + j] = num_output_dims + num_output_dims += 1 + for j in range(num_letters): + if letter_frequence[j] == 1: + idxes_to_output_dims[letters_to_idx[j]] = num_output_dims + num_output_dims += 1 + + # Mark sum index + sum_dim = num_output_dims + for i in range(num_total_idxes): + if idxes_to_output_dims[i] == -1: + idxes_to_output_dims[i] = sum_dim + sum_dim += 1 + + preprocessed_operands = [] + size_dims = [-1] * num_total_idxes + for i, preprocessed_operand in enumerate(operands): + idx_to_dims = [-1] * num_total_idxes + curr_operand_idxes = input_operand_idxes[i] + dim = 0 + for j, idx in enumerate(curr_operand_idxes): + output_dim = idxes_to_output_dims[idx] + if idx_to_dims[output_dim] == -1: + idx_to_dims[output_dim] = dim + if size_dims[idx] == -1: + size_dims[idx] = preprocessed_operand.shape[dim] + else: + assert ( + size_dims[idx] == preprocessed_operand.shape[dim] + ), "Dimension size does not match previous size. " + dim += 1 + else: + # Diagonal repeated index + # TODO(zhoushunjie): Need to develop a paddle.diagonal api + raise NotImplementedError("Can't support diagonal.") + perm = [] + for input_dim in idx_to_dims: + if input_dim > -1: + perm.append(input_dim) + # Transpose the tensor by perm + preprocessed_operand = paddle.transpose(preprocessed_operand, perm=perm) + + for dim, input_dim in enumerate(idx_to_dims): + if input_dim == -1: + preprocessed_operand = paddle.unsqueeze(preprocessed_operand, dim) + + preprocessed_operands.append(preprocessed_operand) + + # 2. Execute the mul_sum + sum_dims = [] + result = preprocessed_operands[0] + for i in range(num_total_idxes): + if idxes_last_operand[i] == 0 and idxes_to_output_dims[i] >= num_output_dims: + result = result.sum(axis=idxes_to_output_dims[i], keepdim=True) + for i in range(1, len(preprocessed_operands)): + for j in range(num_total_idxes): + if idxes_last_operand[j] == i and idxes_to_output_dims[j] >= num_output_dims: + sum_dims.append(idxes_to_output_dims[j]) + result = _mul_sum(result, preprocessed_operands[i], sum_dims) + + squeeze_dims = [i for i in range(len(result.shape) - 1, num_output_dims - 1, -1)] + if len(squeeze_dims) != 0: + result = paddle.squeeze(result, squeeze_dims) + return result + + +# copy from fast transformers +def transfer_param(p, is_bias=False, dtype="float16", restore_data=False): + param_shape = p.shape + # Allow CPU/GPU and float16/float32 transfer + # NOTE: str(p.place) differs between paddle develop and 2.2 + if str(p.dtype)[-len(dtype) :] == dtype and ("gpu" in str(p.place).lower() or "cuda" in str(p.place).lower()): + return p + if restore_data: + if paddle.in_dynamic_mode(): + param_data = p.numpy() + # Creating parameters with Assign initializer is too slow. Maybe we + # can cast to fp16 directly and get a tensor, while we do it more + # elaborately to get a ParamBase. Also note `VarBase.set_value` + # enforce the same dtype and can not be used directly. 
+ new_p = type(p)(shape=param_shape, dtype=dtype, is_bias=is_bias) + new_p.value().get_tensor().set(param_data.astype(dtype), paddle.framework._current_expected_place()) + return new_p + else: + param_data = np.array(paddle.static.global_scope().find_var(p.name).get_tensor()) + return paddle.create_parameter( + shape=param_shape, + dtype=dtype, + is_bias=is_bias, + default_initializer=paddle.nn.initializer.Assign(param_data) if restore_data else None, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/__init__.py new file mode 100644 index 000000000..dd46359ca --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .adamwdl import AdamWDL, layerwise_lr_decay +from .ema import ExponentialMovingAverage +from .lr import InverseSquareRootSchedule + +__all__ = ["layerwise_lr_decay", "AdamWDL", "ExponentialMovingAverage", "InverseSquareRootSchedule"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/adamwdl.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/adamwdl.py new file mode 100644 index 000000000..a1dd97684 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/adamwdl.py @@ -0,0 +1,257 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import partial + +import paddle +from paddle.optimizer import AdamW + +__all__ = ["AdamWDL", "layerwise_lr_decay"] + + +# Layerwise decay +def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): + """ + Args: + decay_rate (float): + The layer-wise decay ratio. + name_dict (dict): + The keys of name_dict is dynamic name of model while the value + of name_dict is static name. + Use model.named_parameters() to get name_dict. + n_layers (int): + Total number of layers in the transformer encoder. 
+ """ + ratio = 1.0 + static_name = name_dict[param.name] + if "encoder.layers" in static_name: + idx = static_name.find("encoder.layers.") + layer = int(static_name[idx:].split(".")[2]) + ratio = decay_rate ** (n_layers - layer) + elif "embedding" in static_name: + ratio = decay_rate ** (n_layers + 1) + return ratio + + +class AdamWDL(AdamW): + r""" + The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. + Generally it's used for transformer model. + We use "layerwise_lr_decay" as default dynamic lr setting method of AdamWDL. + “Layer-wise decay” means exponentially decaying the learning rates of individual + layers in a top-down manner. For example, suppose the 24-th layer uses a learning + rate l, and the Layer-wise decay rate is α, then the learning rate of layer m + is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. + .. math:: + & t = t + 1 + + & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad + & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad + & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} + & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + Args: + learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. + It can be a float value or a LRScheduler. The default value is 0.001. + beta1 (float, optional): The exponential decay rate for the 1st moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.9. + beta2 (float, optional): The exponential decay rate for the 2nd moment estimates. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 0.999. + epsilon (float, optional): A small float value for numerical stability. + It should be a float number or a Tensor with shape [1] and data type as float32. + The default value is 1e-08. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. + apply_decay_param_fun (function|None, optional): If it is not None, + only tensors that makes apply_decay_param_fun(Tensor.name)==True + will be updated. It only works when we want to specify tensors. + Default: None. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_paddle_nn_GradientClipByGlobalNorm` , :ref:`api_paddle_nn_GradientClipByNorm` , + :ref:`api_paddle_nn_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. + The accumulators are updated at every step. Every element of the two moving-average + is updated in both dense mode and sparse mode. If the size of parameter is very large, + then the update may be very slow. The lazy mode only update the element that has + gradient in current mini-batch, so it will be much more faster. But this mode has + different semantics with the original Adam algorithm and may lead to different result. + The default value is False. 
+ multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is false. + layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0. + n_layers (int, optional): The total number of encoder layers. Defaults to 12. + set_param_lr_fun (function|None, optional): If it's not None, set_param_lr_fun() will set the parameter + learning rate before it executes Adam Operator. Defaults to :ref:`layerwise_lr_decay`. + name_dict (dict, optional): The keys of name_dict is dynamic name of model while the value + of name_dict is static name. Use model.named_parameters() to get name_dict. + name (str, optional): Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name`. + The default value is None. + Examples: + .. code-block:: python + import paddle + from paddlenlp.ops.optimizer import AdamWDL + def simple_lr_setting(decay_rate, name_dict, n_layers, param): + ratio = 1.0 + static_name = name_dict[param.name] + if "weight" in static_name: + ratio = decay_rate**0.5 + param.optimize_attr["learning_rate"] *= ratio + + linear = paddle.nn.Linear(10, 10) + name_dict = dict() + for n, p in linear.named_parameters(): + name_dict[p.name] = n + inp = paddle.rand([10,10], dtype="float32") + out = linear(inp) + loss = paddle.mean(out) + adamwdl = AdamWDL( + learning_rate=1e-4, + parameters=linear.parameters(), + set_param_lr_fun=simple_lr_setting, + layerwise_decay=0.8, + name_dict=name_dict) + + loss.backward() + adamwdl.step() + adamwdl.clear_grad() + """ + + def __init__( + self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=0.01, + apply_decay_param_fun=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + layerwise_decay=1.0, + n_layers=12, + set_param_lr_fun=layerwise_lr_decay, + name_dict=None, + name=None, + ): + if not isinstance(layerwise_decay, float) and not isinstance(layerwise_decay, paddle.framework.Variable): + raise TypeError("coeff should be float or Tensor.") + self.layerwise_decay = layerwise_decay + self.n_layers = n_layers + self.set_param_lr_fun = partial(set_param_lr_fun, layerwise_decay, name_dict, n_layers) + coeff = weight_decay + self._coeff = coeff + self._lr_to_coeff = dict() + super(AdamWDL, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=grad_clip, + name=name, + apply_decay_param_fun=apply_decay_param_fun, + weight_decay=weight_decay, + lazy_mode=lazy_mode, + multi_precision=multi_precision, + ) + + def _set_auxiliary_var(self, key, val): + self._auxiliary_vars[key] = val + + def _get_auxiliary_var(self, key): + if key in self._auxiliary_vars: + return self._auxiliary_vars[key] + else: + return None + + def _append_optimize_op(self, block, param_and_grad): + if self.set_param_lr_fun is None: + return super(AdamWDL, self)._append_optimize_op(block, param_and_grad) + + self._append_decoupled_weight_decay(block, param_and_grad) + prev_lr = param_and_grad[0].optimize_attr["learning_rate"] + ratio = self.set_param_lr_fun(param_and_grad[0]) + param_and_grad[0].optimize_attr["learning_rate"] *= ratio + + # excute Adam op + res = super(AdamWDL, self)._append_optimize_op(block, param_and_grad) + param_and_grad[0].optimize_attr["learning_rate"] = prev_lr + return res + + def _append_decoupled_weight_decay(self, block, param_and_grad): + """ + Add decoupled weight decay op. 
+ parameter = parameter - parameter * coeff * lr + Args: + block: block in which variable is to be created + param_and_grad: (parameters, gradients) pairs, + the parameters need to decay. + Raises: + Exception: The type of coeff and parameter is not consistent. + """ + if isinstance(param_and_grad, dict): + param_and_grad = self._update_param_group(param_and_grad) + param, grad = param_and_grad + + if self._apply_decay_param_fun is not None and not self._apply_decay_param_fun(param.name): + return + + if isinstance(self._learning_rate, float): + learning_rate = self._learning_rate + else: + # NOTE. We add this function to the _append_optimize_op(), + # for we must make sure _create_param_lr() be called after + # optimizer._create_global_learning_rate(). + learning_rate = self._create_param_lr(param_and_grad) + + with block.program._optimized_guard([param, grad]), paddle.static.name_scope("weight decay"): + self._params_name.add(param.name) + + # If it has been calculated, the result will be reused. + # NOTE(wangxi): In dygraph mode, apply_gradient will be executed + # every step, so need clear _lr_to_coeff every step, + # we do this in _create_optimization_pass + decay_coeff = self._lr_to_coeff.get(learning_rate, None) + if decay_coeff is None: + # NOTE(wangxi): for pipeline to set device:all + with paddle.static.device_guard(None): + decay_coeff = 1.0 - learning_rate * self._coeff + self._lr_to_coeff[learning_rate] = decay_coeff + + find_master = self._multi_precision and param.dtype == paddle.float16 + if find_master: + master_weight = self._master_weights[param.name] + scaled_param = master_weight * decay_coeff + paddle.assign(scaled_param, output=master_weight) + else: + scaled_param = param * decay_coeff + paddle.assign(scaled_param, output=param) + + def _create_optimization_pass(self, parameters_and_grads): + optimize_ops = super(AdamWDL, self)._create_optimization_pass(parameters_and_grads) + # In dygraph mode, clear _lr_to_coeff after applied gradient + self._lr_to_coeff = dict() + return optimize_ops + + def __str__(self): + return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) + + def _update_param_group(self, parameters): + self._coeff = parameters.get("coeff", self._default_dict["coeff"]) + parameters = parameters.get("params") + return parameters diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/ema.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/ema.py new file mode 100644 index 000000000..6a9e64ac1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/ema.py @@ -0,0 +1,48 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
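`_append_decoupled_weight_decay` above implements `param = param - param * coeff * lr` as a single scale by `decay_coeff = 1 - lr * coeff`, which is the value cached in `_lr_to_coeff`. A small arithmetic check of that equivalence with hypothetical values:

```python
import numpy as np

lr, coeff = 0.01, 0.01                 # hypothetical learning rate and weight-decay coefficient
param = np.array([1.0, -2.0, 0.5])

explicit = param - param * coeff * lr  # the update as stated in the docstring
decay_coeff = 1.0 - lr * coeff         # the cached coefficient
scaled = param * decay_coeff

assert np.allclose(explicit, scaled)
```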
+ + +class ExponentialMovingAverage(object): + def __init__(self, model, decay=0.999): + self.model = model + self.decay = decay + self.shadow = {} + self.backup = {} + + def register(self): + for name, param in self.model.named_parameters(): + if not param.stop_gradient: + self.shadow[name] = param.clone() + + def update(self): + for name, param in self.model.named_parameters(): + if not param.stop_gradient: + assert name in self.shadow + new_average = (1.0 - self.decay) * param + self.decay + self.shadow[name] + self.shadow[name] = new_average.clone() + + def apply_shadow(self): + for name, param in self.model.named_parameters(): + if not param.stop_gradient: + assert name in self.shadow + self.backup[name] = param + # TODO(huijuan): paddle中parameters赋值方式不是param.data,这样改不了模型参数 + param.data = self.shadow[name] + + def restore(self): + for name, param in self.model.named_parameters(): + if not param.stop_gradient: + assert name in self.backup + param = self.backup[name] + self.backup = {} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/lr.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/lr.py new file mode 100644 index 000000000..b685cc4fa --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/ops/optimizer/lr.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from paddle.optimizer.lr import LRScheduler + + +class InverseSquareRootSchedule(LRScheduler): + """ + Decay the LR based on the inverse square root of the update number. + + We also support a warmup phase where we linearly increase the learning rate + from some initial learning rate until the configured learning rate. Thereafter + we decay proportional to the number of updates, with a decay factor set to + align with the configured learning rate. + + Args: + warmup_steps(int): + The number of warmup steps. A super parameter. + learning_rate(float, optional): + The learning rate. It is a python float number. Defaults to 1.0. + last_epoch(int, optional): + The index of last epoch. Can be set to restart training. Default: -1, + means initial learning rate. + verbose(bool, optional): + If ``True``, prints a message to stdout for each + update. Defaults to ``False``. 
+ """ + + def __init__(self, warmup_steps, learning_rate=1.0, last_epoch=-1, verbose=False): + self.warmup_steps = warmup_steps + warmup_end_lr = learning_rate + self.warmup_init_lr = 0.0 + self.lr_step = (warmup_end_lr - self.warmup_init_lr) / self.warmup_steps + self.decay_factor = warmup_end_lr * (self.warmup_steps**0.5) + + super(InverseSquareRootSchedule, self).__init__(learning_rate, last_epoch, verbose) + + def get_lr(self): + if self.last_epoch < self.warmup_steps: + return self.warmup_init_lr + self.last_epoch * self.lr_step + else: + return self.decay_factor * (self.last_epoch**-0.5) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/__init__.py new file mode 100644 index 000000000..bf290397e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .lora import LoRAConfig, LoRAModel +from .prefix import PrefixConfig, PrefixModelForCausalLM +from .vera import VeRAConfig, VeRAModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/__init__.py new file mode 100644 index 000000000..f1f83b9cd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .lora_config import LoRAConfig +from .lora_layers import ColumnParallelLoRALinear, LoRALinear, RowParallelLoRALinear +from .lora_model import LoRAModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_config.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_config.py new file mode 100644 index 000000000..40b59e5c1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_config.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import math +import os +from dataclasses import asdict, dataclass, field +from typing import List, Optional, Union + +from ...utils.env import LORA_CONFIG_NAME +from ...utils.log import logger + + +@dataclass +class LoRAConfig: + """ + This is the configuration class to store the configuration of a [`LoRAModel`]. + Args: + r (`int`): Lora attention dimension + target_modules (`Union[List[str],str]`): The names of the modules to apply Lora to. + trainable_modules (`List[str]`): The names of the modules to train when applying Lora. + lora_alpha (`float`): The alpha parameter for Lora scaling. + lora_dropout (`float`): The dropout probability for Lora layers. + merge_weights (`bool`): + Whether to merge the weights of the Lora layers with the base transformer model in `eval` mode. + """ + + r: int = field(default=8, metadata={"help": "Lora attention dimension"}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with Lora." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + trainable_modules: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to train when applying with Lora." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) + lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) + merge_weights: bool = field( + default=False, metadata={"help": "Merge weights of the original model and the Lora model"} + ) + trainable_bias: Optional[str] = field( + default=None, metadata={"help": "Define trainable bias parameters for the Lora model."} + ) + enable_lora_list: Optional[Union[List[bool], List[Optional[List[bool]]]]] = field( + default=None, + metadata={ + "help": "Provides fine-grained control over `MergedLoRALinear`. If None, `LoRALinear` is used instead." + }, + ) + tensor_parallel_degree: int = field(default=-1, metadata={"help": "1 for not use tensor parallel"}) + dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"}) + head_dim: Optional[int] = field( + default=None, + metadata={ + "help": "The model multi head dimension.Only for LoRAMergedLinear and ColumnParallelLoRAMergedLinear." 
+ }, + ) + do_qat: bool = field(default=False, metadata={"help": "Whether the lora model would do quant-aware training"}) + rslora: bool = field(default=False, metadata={"help": "Whether to use RsLoRA"}) + pissa: bool = field(default=False, metadata={"help": "Whether to use Pissa: https://arxiv.org/pdf/2404.02948.pdf"}) + lora_plus_scale: float = field(default=1.0, metadata={"help": "Lora B scale in LoRA+"}) + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name of the base model to use."} + ) + use_quick_lora: bool = field( + default=False, + metadata={ + "help": "Whether to use quick lora, The use of Quick LoRa will only take effect when lora_dropout is set to 0." + }, + ) + + def __post_init__(self): + if self.use_quick_lora and self.lora_dropout > 0: + logger.warning( + "Quick LoRa is enabled, but lora_dropout is set to a non-zero value. " + "We will automatically set `use_quick_lora` to `False` to avoid potential inconsistencies." + ) + self.use_quick_lora = False + if self.merge_weights: + logger.error( + "'merge_weights' is deprecated and will be removed in a future version. " + "Please apply model.merge() or model.unmerge() to merge/unmerge LoRA weight to base model." + ) + + @property + def scaling(self): + if not self.rslora and not self.pissa: + return self.lora_alpha / self.r + elif self.pissa: + return 1.0 + else: + return self.lora_alpha / math.sqrt(self.r) + + @property + def __dict__(self): + return asdict(self) + + def to_dict(self): + return self.__dict__ + + def save_pretrained(self, save_directory): + r""" + This method saves the configuration of your adapter model in a directory. + Args: + save_directory (`str`): + The directory where the configuration will be saved. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + output_dict = self.__dict__ + output_dict["scaling"] = self.scaling + output_path = os.path.join(save_directory, LORA_CONFIG_NAME) + + # save it + with open(output_path, "w") as writer: + writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + This method loads the configuration of your adapter model from a directory. + Args: + pretrained_model_name_or_path (`str`): + The directory or the hub-id where the configuration is saved. + **kwargs: + Additional keyword arguments passed along to the child class initialization. + """ + if os.path.isfile(os.path.join(pretrained_model_name_or_path, LORA_CONFIG_NAME)): + config_file = os.path.join(pretrained_model_name_or_path, LORA_CONFIG_NAME) + else: + raise ValueError(f"Can't find lora_config.json at '{pretrained_model_name_or_path}'") + + loaded_attributes = cls.from_json_file(config_file) + loaded_attributes.pop("scaling", None) + + config = cls(**kwargs) + + for key, value in loaded_attributes.items(): + if hasattr(config, key): + setattr(config, key, value) + + return config + + @classmethod + def from_json_file(cls, path_json_file): + r""" + Loads a configuration file from a json file. + Args: + path_json_file (`str`): + The path to the json file. 
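The `scaling` property above selects between classic LoRA (`lora_alpha / r`), rsLoRA (`lora_alpha / sqrt(r)`), and PiSSA (fixed `1.0`, since PiSSA folds the scale into the initialised weights). A short sketch of the three values for a hypothetical rank/alpha pair; the same factor multiplies the low-rank update `x @ lora_A @ lora_B` in the layers that follow:

```python
import math

r, lora_alpha = 8, 16  # hypothetical rank and alpha

def lora_scaling(rslora=False, pissa=False):
    # Mirrors LoRAConfig.scaling: pissa takes precedence over rslora,
    # matching the branch order of the property above.
    if pissa:
        return 1.0
    if rslora:
        return lora_alpha / math.sqrt(r)
    return lora_alpha / r

print(lora_scaling())             # 2.0    (classic LoRA)
print(lora_scaling(rslora=True))  # ~5.657 (rsLoRA)
print(lora_scaling(pissa=True))   # 1.0    (PiSSA)
```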
+ """ + with open(path_json_file, "r") as file: + json_object = json.load(file) + + return json_object diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_layers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_layers.py new file mode 100644 index 000000000..a40f3fff6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_layers.py @@ -0,0 +1,802 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.layers.mpu import mp_ops +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, +) + +from ...transformers import linear_utils + +ColumnSequenceParallelLinear = linear_utils.ColumnSequenceParallelLinear +RowSequenceParallelLinear = linear_utils.RowSequenceParallelLinear + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + AllGatherOp, + ReduceScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + AllGatherOp = None + ReduceScatterOp = None + mark_as_sequence_parallel_parameter = None + + +from ...transformers.mc2_parallel_linear import ( + MC2ColumnParallelCoreLinear, + MC2ColumnSeqParallelCoreLinear, + MC2RowParallelCoreLinear, + MC2RowSeqParallelCoreLinear, +) +from .lora_quick_layers import quick_lora + + +class LoRALinear(nn.Linear): + # LoRA implemented in a dense layer + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + use_quick_lora: bool = False, + rslora: bool = False, + lora_plus_scale: float = 1.0, + pissa: bool = False, + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + self.pissa = pissa + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), + ) + self.lora_B = self.create_parameter( + shape=[r, out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + self.apply_pissa = False + + if not rslora and not pissa: + self.scaling = self.lora_alpha / self.r + elif pissa: + self.scaling = 1.0 + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = 
use_quick_lora and lora_dropout == 0.0 + self.disable_lora = False + + @property + def use_quick_lora(self): + return self._use_quick_lora and self.training and not self.merged + + def pissa_init(self, rank): + weight = self.weight + dtype = weight.dtype + if dtype != paddle.float32: + weight = weight.astype(paddle.float32) + + U, S, Vh = paddle.linalg.svd(weight.data, full_matrices=False) + Ur = U[:, :rank] + Sr = S[:rank] + Vhr = Vh[:rank] + + lora_A = Ur @ paddle.diag(paddle.sqrt(Sr)) + lora_B = paddle.diag(paddle.sqrt(Sr)) @ Vhr + self.lora_A.set_value(lora_A.astype(dtype)) + self.lora_B.set_value(lora_B.astype(dtype)) + res = weight.data - lora_A @ lora_B + weight = res.astype(dtype) + self.weight.set_value(weight) + + def merge(self): + if not self.merged: + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def unmerge(self): + if self.merged: + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def forward(self, input: paddle.Tensor, *args, **kwargs): + if not self.apply_pissa and self.pissa: + self.pissa_init(self.r) + self.apply_pissa = True + if self.disable_lora or self.merged: + result = F.linear(x=input, weight=self.weight, bias=self.bias, name=self.name) + elif self.use_quick_lora: + # Use the quick lora implementation + result = quick_lora(input, self.lora_A, self.lora_B, self.weight, self.bias, self.scaling) + else: + result = F.linear(x=input, weight=self.weight, bias=self.bias, name=self.name) + result += (self.lora_dropout(input) @ self.lora_A @ self.lora_B) * self.scaling + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + +class RowParallelLoRALinear(RowParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + use_quick_lora: bool = False, + pissa: bool = False, + **kwargs + ): + RowParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + + if pissa: + raise ValueError("Pissa is not supported in model parallel by now") + + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + 
self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + self.disable_lora = False + + @property + def use_quick_lora(self): + return self._use_quick_lora and self.training and not self.merged + + def unmerge(self): + if self.merged: + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + if self.disable_lora or self.merged: + # x @ W : [bz, in_f / ws] ===> [bz, out_f] + if MC2RowParallelCoreLinear is None: + result_mp = F.linear(x=input_mp, weight=self.weight, name=self.name) + output = mp_ops._mp_allreduce( + result_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + else: + output = MC2RowParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + elif self.use_quick_lora: + # Use the quick lora implementation + result_mp = quick_lora( + input_mp, + self.lora_A, + self.lora_B, + self.weight, + self.bias, + self.scaling, + is_row=True, + group=self.model_parallel_group, + world_size=self.world_size, + ) + output = mp_ops._mp_allreduce( + result_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + else: + # x @ W : [bz, in_f / ws] ===> [bz, out_f] + if MC2RowParallelCoreLinear is None: + result_mp = F.linear(x=input_mp, weight=self.weight, name=self.name) + output = mp_ops._mp_allreduce( + result_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + else: + output = MC2RowParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) + + # x @ A: [bz, in_f/ ws] ===> [bz, r] + input_mp = self.lora_dropout(input_mp) @ self.lora_A + # all reduce to keep Lora B's gradient on different gpu consistent + input_dup = mp_ops._mp_allreduce( + input_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + # @ B: [bz, r] ===> [bz, out_f] + delta_mp = (input_dup @ self.lora_B) * self.scaling + output += delta_mp + output = output + self.bias if self.bias is not None else output + return output + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + +class RowSequenceParallelLoRALinear(RowSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + use_quick_lora: bool = False, + **kwargs + ): + RowSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + 
dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_B) + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + self.disable_lora = False + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def unmerge(self): + if self.merged: + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + + if MC2RowSeqParallelCoreLinear is None: + output_parallel = self.linear(input_mp, self.weight, name=self._name) + output_ = ReduceScatterOp.apply(output_parallel) + result_mp = output_ + self.bias if self.bias is not None else output_ + else: + output_ = MC2RowSeqParallelCoreLinear.apply(input_mp, self.weight, self.model_parallel_group) + result_mp = output_ + self.bias if self.bias is not None else output_ + + if not self.merged and not self.disable_lora: + input_mp = self.lora_dropout(input_mp) + # TODO(@gexiao): temporary workaround for deterministic calculation + if True or MC2RowSeqParallelCoreLinear is None: + input_mp = input_mp @ self.lora_A + input_mp = ReduceScatterOp.apply(input_mp) + else: + input_mp = MC2RowSeqParallelCoreLinear.apply(input_mp, self.lora_A, self.model_parallel_group) + delta_mp = (input_mp @ self.lora_B) * self.scaling + result_mp += delta_mp + return result_mp + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + +class ColumnParallelLoRALinear(ColumnParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + lora_A_weight_attr: Optional[paddle.ParamAttr] = None, + use_quick_lora: bool = False, + pissa: bool = False, + **kwargs + ): + ColumnParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + + if pissa: + raise ValueError("Pissa is not supported in model parallel by now") + + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A 
= self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + self.disable_lora = False + + @property + def use_quick_lora(self): + return self._use_quick_lora and self.training and not self.merged + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, input: paddle.Tensor): + if self.disable_lora or self.merged: + if MC2ColumnParallelCoreLinear is None: + input_mp = mp_ops._c_identity(input, group=self.model_parallel_group) + result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name) + else: + res_mp = MC2ColumnParallelCoreLinear.apply(input, self.weight, self.model_parallel_group) + result_mp = (res_mp + self.bias) if self.bias is not None else res_mp + + elif self.use_quick_lora: + # Use the quick lora implementation + input_mp = mp_ops._c_identity(input, group=self.model_parallel_group) if self.is_mp else input + result_mp = quick_lora( + input_mp, + self.lora_A, + self.lora_B, + self.weight, + self.bias, + self.scaling, + is_column=True, + group=self.model_parallel_group, + world_size=self.world_size, + ) + else: + if MC2ColumnParallelCoreLinear is None: + input_mp = mp_ops._c_identity(input, group=self.model_parallel_group) + result_mp = F.linear(x=input_mp, weight=self.weight, bias=self.bias, name=self.name) + else: + res_mp = MC2ColumnParallelCoreLinear.apply(input, self.weight, self.model_parallel_group) + result_mp = (res_mp + self.bias) if self.bias is not None else res_mp + + input_a = self.lora_dropout(input) @ self.lora_A + if MC2ColumnParallelCoreLinear is None: + input_a_mp = mp_ops._c_identity(input_a, group=self.model_parallel_group) + delta_mp = (input_a_mp @ self.lora_B) * self.scaling + else: + tmp = MC2ColumnParallelCoreLinear.apply(input_a, self.lora_B, self.model_parallel_group) + delta_mp = tmp * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + +class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear): + def __init__( + self, + in_features: int, + out_features: int, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + rslora: bool = False, + lora_plus_scale: float = 1.0, + lora_A_weight_attr: Optional[paddle.ParamAttr] = None, + use_quick_lora: 
bool = False, + **kwargs + ): + ColumnSequenceParallelLinear.__init__(self, in_features, out_features, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + # compatible + self.name = self._name + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + mark_as_sequence_parallel_parameter(self.lora_A) + + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.Constant(value=0.0), + learning_rate=lora_plus_scale, + ), + ) + + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + if not rslora: + self.scaling = self.lora_alpha / self.r + else: + self.scaling = self.lora_alpha / math.sqrt(self.r) + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + self._use_quick_lora = use_quick_lora and lora_dropout == 0.0 + self.disable_lora = False + + @property + def use_quick_lora(self): + # TODO(@gexiao): support qlora + return False # self._use_quick_lora and self.training and not self.merged + + def unmerge(self): + if self.merged: + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + if MC2ColumnSeqParallelCoreLinear is None: + if self.is_mp: + input_parallel = AllGatherOp.apply(x) + else: + input_parallel = x + result_mp = self.linear(input_parallel, self.weight, self.bias, name=self._name) + else: + result_mp = MC2ColumnSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + if self.bias is not None: + result_mp += self.bias + + if not self.merged and not self.disable_lora: + input_a = self.lora_dropout(x) @ self.lora_A + # TODO(@gexiao): temporary workaround for deterministic calculation + if True or MC2ColumnSeqParallelCoreLinear is None: + input_a = AllGatherOp.apply(input_a) + delta_mp = (input_a @ self.lora_B) * self.scaling + else: + input_a = MC2ColumnSeqParallelCoreLinear.apply(input_a, self.lora_B, self.model_parallel_group) + delta_mp = input_a * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" + + +class LoRAConv2D(nn.Conv2D): + # LoRA implemented in a dense layer + def __init__( + self, + in_channels, + out_channels, + kernel_size, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + **kwargs + ): + nn.Conv2D.__init__(self, in_channels, out_channels, kernel_size, **kwargs) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if 
lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + # Actual trainable parameters + lora_A = nn.Conv2D( + in_channels, + r, + kernel_size=self._kernel_size, + stride=self._stride, + padding=self._padding, + weight_attr=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), + bias_attr=False, + ) + self.lora_A = lora_A.weight + self.lora_A_forward = lambda x: nn.Conv2D.__call__(lora_A, x) + lora_B = nn.Conv2D( + r, + out_channels, + kernel_size=(1, 1), + stride=(1, 1), + weight_attr=nn.initializer.Constant(value=0.0), + bias_attr=False, + ) + self.lora_B_forward = lambda x: nn.Conv2D.__call__(lora_B, x) + self.lora_B = lora_B.weight + self.scaling = lora_alpha / r + + # Freezing the pre-trained weight matrix + self.weight.stop_gradient = True + if self.bias is not None: + self.bias.stop_gradient = True + self.disable_lora = False + + def unmerge(self): + if self.merged: + weight_A = self.lora_A.cast(dtype=self.weight.dtype) + weight_B = self.lora_B.cast(dtype=self.weight.dtype) + if self.weight.shape[2:4] == [1, 1]: + # conv2d 1x1 + delta_weight = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze( + 2 + ).unsqueeze(3) * self.scaling + else: + # conv2d 3x3 + delta_weight = ( + F.conv2d( + weight_A.transpose([1, 0, 2, 3]), + weight_B, + ).transpose([1, 0, 2, 3]) + * self.scaling + ) + # Make sure that the weights are not merged + new_weight = self.weight - delta_weight + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + weight_A = self.lora_A.cast(dtype=self.weight.dtype) + weight_B = self.lora_B.cast(dtype=self.weight.dtype) + if self.weight.shape[2:4] == [1, 1]: + # conv2d 1x1 + delta_weight = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze( + 2 + ).unsqueeze(3) * self.scaling + else: + # conv2d 3x3 + delta_weight = ( + F.conv2d( + weight_A.transpose([1, 0, 2, 3]), + weight_B, + ).transpose([1, 0, 2, 3]) + * self.scaling + ) + # Merge the weights and mark it + new_weight = self.weight + delta_weight + self.weight.set_value(new_weight) + self.merged = True + + def forward(self, input: paddle.Tensor, *args, **kwargs): + previous_dtype = input.dtype + result = super().forward(input) + if not self.merged and not self.disable_lora: + result += ( + self.lora_B_forward(self.lora_A_forward(self.lora_dropout(input.cast(dtype=self.lora_A.dtype)))) + * self.scaling + ) + result = result.cast(dtype=previous_dtype) + return result + + def extra_repr(self): + main_str = "{_in_channels}, {_out_channels}, kernel_size={_kernel_size}" + if self._stride != [1] * len(self._stride): + main_str += ", stride={_stride}" + if self._padding != 0: + main_str += ", padding={_padding}" + if self._padding_mode != "zeros": + main_str += ", padding_mode={_padding_mode}" + if self.output_padding != 0: + main_str += ", output_padding={output_padding}" + if self._dilation != [1] * len(self._dilation): + main_str += ", dilation={_dilation}" + if self._groups != 1: + main_str += ", groups={_groups}" + main_str += ", data_format={_data_format}, rank={r}, alpha={lora_alpha}" + return main_str.format(**self.__dict__) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_model.py new file mode 100644 index 000000000..4f619307f --- /dev/null +++ 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_model.py @@ -0,0 +1,861 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import gc +import math +import os +import re +import tempfile +from collections import OrderedDict +from functools import partial +from typing import Dict, List, Union + +import aistudio_sdk +import numpy as np +import paddle +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + PipelineLayer, + RowParallelLinear, +) + +from ...transformers import linear_utils +from ...transformers.conversion_utils import ConversionMixin +from ...transformers.model_utils import ( + PretrainedModel, + _add_variant, + _load_state_dict_into_model, + dtype_guard, + load_state_dict, +) +from ...transformers.utils import get_checkpoint_shard_files, weight_name_suffix +from ...utils.distributed import distributed_allgather, distributed_gather +from ...utils.env import LORA_WEIGHTS_NAME, SAFE_PEFT_WEIGHTS_INDEX_NAME +from ...utils.log import logger +from ...utils.tools import get_env_device +from .lora_config import LoRAConfig + + +def get_lora_layers(): + try: + if get_env_device() == "xpu": + # If paddle_xpu is not installed, just use PaddleNLP's native lora layers + from paddle_xpu.layers.nn.lora_layers import ( + XPUColumnParallelLoRALinear as ColumnParallelLoRALinear, + ) + from paddle_xpu.layers.nn.lora_layers import ( + XPUColumnSequenceParallelLoRALinear as ColumnSequenceParallelLoRALinear, + ) + from paddle_xpu.layers.nn.lora_layers import XPULoRALinear as LoRALinear + from paddle_xpu.layers.nn.lora_layers import ( + XPURowParallelLoRALinear as RowParallelLoRALinear, + ) + from paddle_xpu.layers.nn.lora_layers import ( + XPURowSequenceParallelLoRALinear as RowSequenceParallelLoRALinear, + ) + + from .lora_layers import LoRAConv2D + else: + raise ImportError # Force to use the fallback if not XPU + except ImportError: + from .lora_layers import ( + ColumnParallelLoRALinear, + ColumnSequenceParallelLoRALinear, + LoRAConv2D, + LoRALinear, + RowParallelLoRALinear, + RowSequenceParallelLoRALinear, + ) + + return { + "ColumnParallelLoRALinear": ColumnParallelLoRALinear, + "ColumnSequenceParallelLoRALinear": ColumnSequenceParallelLoRALinear, + "LoRAConv2D": LoRAConv2D, + "LoRALinear": LoRALinear, + "RowParallelLoRALinear": RowParallelLoRALinear, + "RowSequenceParallelLoRALinear": RowSequenceParallelLoRALinear, + } + + +lora_layers = get_lora_layers() +ColumnParallelLoRALinear = lora_layers["ColumnParallelLoRALinear"] +ColumnSequenceParallelLoRALinear = lora_layers["ColumnSequenceParallelLoRALinear"] +LoRAConv2D = lora_layers["LoRAConv2D"] +LoRALinear = lora_layers["LoRALinear"] +RowParallelLoRALinear = lora_layers["RowParallelLoRALinear"] +RowSequenceParallelLoRALinear = lora_layers["RowSequenceParallelLoRALinear"] +AVAILABLE_LAYERS = [ + ColumnParallelLoRALinear, + ColumnSequenceParallelLoRALinear, + LoRAConv2D, + LoRALinear, + 
RowParallelLoRALinear, + RowSequenceParallelLoRALinear, +] +try: + from ...quantization.quantization_linear import ( + ColumnParallelQuantizationLinear, + QuantizationLinear, + RowParallelQuantizationLinear, + ) + from .lora_quantization_layers import ( + ColumnParallelQuantizationLoRALinear, + QuantizationLoRALinear, + RowParallelQuantizationLoRALinear, + ) + + AVAILABLE_LAYERS += [ + ColumnParallelQuantizationLoRALinear, + QuantizationLoRALinear, + RowParallelQuantizationLoRALinear, + ] +except: + QuantizationLinear = None + ColumnParallelQuantizationLinear = None + RowParallelQuantizationLinear = None + QuantizationLoRALinear = None + ColumnParallelQuantizationLoRALinear = None + RowParallelQuantizationLoRALinear = None + + +class LoRAModel(nn.Layer): + # TODO:lugimzzz support restore in following PR + restore_layer_map: Dict[nn.Layer, nn.Layer] = { + LoRALinear: nn.Linear, + LoRAConv2D: nn.Conv2D, + # ColumnParallelLoRALinear: ColumnParallelLinear, + # RowParallelLoRALinear: RowParallelLinear, + # QuantizationLoRALinear: QuantizationLinear, + } + + def __init__(self, model, lora_config: LoRAConfig) -> None: + super().__init__() + self.quantized = False + self.lora_config = lora_config + self.lora_split_mapping = {} + if self.lora_config.dtype is None: + self.lora_config.dtype = paddle.get_default_dtype() + with dtype_guard(self.lora_config.dtype): + self.model = self.get_lora_model(model, lora_config) + self.is_pipelinemodel = False + if issubclass(type(self.model), PipelineLayer): + self.is_pipelinemodel = True + self.model._single_to_pp_mapping = None + if self.lora_config.tensor_parallel_degree != self.model.config.tensor_parallel_degree: + self.lora_config.tensor_parallel_degree = self.model.config.tensor_parallel_degree + logger.warning( + f"Reset tensor_parallel_degree of lora_config to {self.model.config.tensor_parallel_degree}." + ) + self.forward = self.model.forward + + logger.info("Mark only lora and trainable_module as trainable.") + self.mark_only_lora_as_trainable() + + def add_lora_split_mapping(self, module_name, is_column=False): + self.lora_split_mapping[module_name] = is_column + + def _get_tensor_parallel_mappings(self, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + rename_lora_split_mapping = {} + if issubclass(type(self.model), PipelineLayer): + # rename lora_split_mapping + prefixes = self.model.get_sequential_name_prefixes() + keys = self.lora_split_mapping.keys() + first_key = "" + for k in keys: + first_key = k + break + first_key = first_key.split(".") + use_virtual_pp_degree = first_key[0].isdigit() and first_key[1].isdigit() + + for k in keys: + name_splited = k.split(".") + if use_virtual_pp_degree: + if name_splited[0].isdigit(): + if name_splited[1].isdigit(): + idx = str(int(name_splited[0]) + int(name_splited[1])) + single_name = [prefixes[idx]] + single_name.extend(name_splited[2:]) + else: + single_name = [prefixes[str(len(prefixes) - 1)]] + single_name.extend(name_splited[2:]) + logger.warning( + f"Please check! we treat this key as last layer, get {k}, set origin name as {'.'.join(single_name)}" + ) + else: + raise ValueError(f"Please check! 
{k} is not a valid key.") + else: + idx = name_splited[0] + # for normal pp layer name + if idx.isdigit(): + single_name = [prefixes[idx]] + single_name.extend(name_splited[1:]) + else: + raise ValueError(f"Unexpected key: {k} for pp lora layer.") + rename_lora_split_mapping[".".join(single_name)] = self.lora_split_mapping[k] + + lora_split_mapping = ( + rename_lora_split_mapping if issubclass(type(self.model), PipelineLayer) else self.lora_split_mapping + ) + + def get_tensor_parallel_split_mappings(): + final_actions = {} + for key, is_col in lora_split_mapping.items(): + final_actions[key] = partial(fn, is_column=is_col) + + return final_actions + + mappings = get_tensor_parallel_split_mappings() + + return mappings + + @classmethod + def from_pretrained(cls, model, lora_path, **kwargs): + lora_config = kwargs.pop("lora_config", None) + # init lora config & lora model + if not isinstance(lora_config, LoRAConfig): + lora_config = LoRAConfig.from_pretrained(lora_path) + # define a new variable to conserve original lora_config.tensor_parallel_degree value which will update while initializing lora model + lora_config_tensor_parallel_degree = lora_config.tensor_parallel_degree + lora_model = cls(model, lora_config) + + lora_model_index_file = os.path.join(lora_path, SAFE_PEFT_WEIGHTS_INDEX_NAME) + if os.path.exists(lora_model_index_file): + # load safetensors format file. + resolved_archieve_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path=lora_path, + index_filename=lora_model_index_file, + ) + loaded_keys = sharded_metadata["all_checkpoint_keys"] + expected_keys = set(lora_model.get_trainable_state_dict().keys()) + + missing_keys = expected_keys - set(loaded_keys) + if len(missing_keys) > 0: + raise ValueError(f"missing_keys: {missing_keys}") + + error_msgs = [] + for shard_file in resolved_archieve_file: + pre_tensor_parallel_split = False + if model.config.tensor_parallel_degree > 1: + pre_tensor_parallel_split = True + tp_actions = lora_model._get_tensor_parallel_convert_actions(loaded_keys, is_split=True) + state_dict = load_state_dict( + shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys + ) + error_msgs += _load_state_dict_into_model(lora_model.model, state_dict, "") + del state_dict + gc.collect() + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + raise RuntimeError( + f"Error(s) in loading state_dict for {lora_model.__class__.__name__}:\n\t{error_msg}" + ) + + return lora_model + + # define lora weight name + if lora_config_tensor_parallel_degree > 1: + lora_weight_name = _add_variant(LORA_WEIGHTS_NAME, f"tp{model.config.tensor_parallel_rank:0>2d}") + else: + lora_weight_name = LORA_WEIGHTS_NAME + + # load and set lora weight parameter + lora_weight_path = os.path.join(lora_path, lora_weight_name) + if os.path.exists(lora_weight_path): + # load lora weight parameter + lora_state_dict = paddle.load(lora_weight_path, return_numpy=True) + logger.info(f"Loading the LoRA weights from {lora_weight_path}") + + if ( + lora_config_tensor_parallel_degree > 1 + and lora_config_tensor_parallel_degree != model.config.tensor_parallel_degree + ): + raise NotImplementedError( + f"{lora_config_tensor_parallel_degree} is not equal to {model.config.tensor_parallel_degree}. Please merge LoRA weights first." 
+ ) + + # convert parameters to tensor parallel for mp model + if lora_config_tensor_parallel_degree <= 1 and model.config.tensor_parallel_degree > 1: + lora_state_dict = lora_model._convert_tensor_parallel(lora_state_dict=lora_state_dict) + + # set lora state dict + lora_model.set_state_dict(lora_state_dict) + else: + logger.error(f"LoRA weights not found under {lora_path}, creating LoRA weights from scratch") + + return lora_model + + def set_state_dict(self, state_dict): + import warnings + + warnings.filterwarnings( + action="ignore", message=".*Skip loading for.*", category=Warning, lineno=0, append=False + ) + self.model.set_state_dict(state_dict) + logger.info("Load lora weight successfully") + + def _merge_trainable_tensor_parallel(self, trainable_state_dict): + trainable_name_action_mappings = self._get_tensor_parallel_convert_actions( + trainable_state_dict.keys(), is_split=False + ) + + hcg = paddle.distributed.fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + is_dst = paddle.distributed.get_rank(mp_group) == 0 + + for key in trainable_state_dict: + tensor = trainable_state_dict[key] + if key in trainable_name_action_mappings: + if get_env_device() == "xpu": + ret = distributed_allgather(tensor, group=mp_group, offload=True) + else: + ret = distributed_gather(tensor, group=mp_group, offload=True) + action = trainable_name_action_mappings[key] + if key in self.lora_split_mapping and not self.lora_split_mapping[key] and "_scale" in key and is_dst: + ret = paddle.to_tensor(ret) + tensor = paddle.max(ret, axis=0) + else: + tensor = action(ret) if is_dst else None + trainable_state_dict[key] = tensor + else: + trainable_state_dict[key] = tensor.cpu().numpy() if is_dst else None + + return trainable_state_dict + + def _get_tensor_parallel_convert_actions(self, loaded_keys, is_split=True, ignore_error=False, config=None): + if config is None: + config = self.model.config + specific_name_action_mappings = self._get_tensor_parallel_mappings(config, is_split=is_split) + name_action_mappings = self.model._get_tensor_parallel_mappings(config, is_split=is_split) + state_keys_map = ConversionMixin._resolve_prefix_keys( + name_action_mappings.keys(), self.model.state_dict().keys(), ignore_error=ignore_error + ) + for k, v in state_keys_map.items(): + if v in loaded_keys: + specific_name_action_mappings[v] = name_action_mappings[k] + return specific_name_action_mappings + + def _convert_tensor_parallel(self, lora_state_dict): + lora_name_action_mappings = self._get_tensor_parallel_convert_actions(lora_state_dict.keys(), is_split=True) + + for name, action in lora_name_action_mappings.items(): + if name in lora_state_dict: + tensor = lora_state_dict.pop(name) + lora_state_dict[name] = action(tensor) + else: + logger.warning(f"{name} not found in lora_state_dict!") + return lora_state_dict + + def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = False, **kwargs): + save_model_config = kwargs.get("save_model_config", True) + + if self.is_pipelinemodel: + self.model._single_to_pp_mapping = None + if self.quantized and merge_tensor_parallel and self.lora_config.tensor_parallel_degree > 1: + merge_tensor_parallel = False + logger.warning( + "Quantized strategy does not support merge_tensor_parallel. Set merge_tensor_parallel to False." 
+ ) + if self.is_pipelinemodel and merge_tensor_parallel and self.lora_config.tensor_parallel_degree > 1: + merge_tensor_parallel = False + logger.warning( + "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False." + ) + + variant = kwargs.get("variant", None) + is_main_process = kwargs.get("is_main_process", paddle.distributed.get_rank() == 0) + + assert not os.path.isfile( + save_directory + ), f"Saving directory ({save_directory}) should be a directory, not a file" + os.makedirs(save_directory, exist_ok=True) + + lora_config_to_save = LoRAConfig(**self.lora_config.to_dict()) + + if merge_tensor_parallel and lora_config_to_save.tensor_parallel_degree > 1: + trainable_state_dict = self.get_trainable_state_dict() + trainable_state_dict = self._merge_trainable_tensor_parallel(trainable_state_dict) + if not is_main_process: + logger.info("Saving with merge_tensor_parallel, tensor_parallel_rank > 0 don't need save") + return + if variant is not None and "tp" in variant: + variant = "_".join([x for x in variant.split("_") if "tp" not in x]) + lora_config_to_save.tensor_parallel_degree = -1 + else: + trainable_state_dict = self.get_trainable_state_dict() + if lora_config_to_save.tensor_parallel_degree > 1: + if variant is None: + variant = weight_name_suffix() + + # save lora weight + lora_weight_name = _add_variant(LORA_WEIGHTS_NAME, variant) + weight_filename = os.path.join(save_directory, lora_weight_name) + paddle.save(trainable_state_dict, weight_filename) + + # save lora config + if is_main_process: + lora_config_to_save.save_pretrained(save_directory) + if save_model_config: + model_config_to_save = copy.deepcopy(self.model.config) + if merge_tensor_parallel: + model_config_to_save.tensor_parallel_degree = -1 + model_config_to_save.save_pretrained(save_directory) + + def _find_and_replace_module(self, model, module_name, lora_config, enable_lora): + parent_module = model + attribute_chain = module_name.split(".") + for name in attribute_chain[:-1]: + parent_module = getattr(parent_module, name) + module = getattr(parent_module, attribute_chain[-1]) + lora_module = None + if isinstance(module, nn.Linear): + lora_module = LoRALinear( + in_features=module.weight.shape[0], + out_features=module.weight.shape[1], + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + bias_attr=False if module.bias is None else None, + use_quick_lora=lora_config.use_quick_lora, + ) + if isinstance(module, nn.Conv2D): + lora_module = LoRAConv2D( + in_channels=module._in_channels, + out_channels=module._out_channels, + kernel_size=module._kernel_size, + stride=module._stride, + padding=module._padding, + dilation=module._dilation, + groups=module._groups, + padding_mode=module._padding_mode, + data_format=module._data_format, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + bias_attr=module._bias_attr, + ) + elif isinstance(module, ColumnParallelLinear): + # recover the original output_features + output_features = module.weight.shape[1] * module.world_size + lora_module = ColumnParallelLoRALinear( + in_features=module.weight.shape[0], + out_features=output_features, + gather_output=module.gather_output, + has_bias=module.bias is not None, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + 
lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora B matrix + self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, RowParallelLinear): + # recover the original output_features + lora_module = RowParallelLoRALinear( + in_features=module.weight.shape[0] * module.world_size, + out_features=module.weight.shape[1], + has_bias=module.bias is not None, + input_is_parallel=module.input_is_parallel, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + pissa=lora_config.pissa, + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora A matrix + self.add_lora_split_mapping(module_name + ".lora_A", is_column=False) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, linear_utils.ColumnSequenceParallelLinear): + # recover the original output_features + output_features = module.weight.shape[1] * module.world_size + lora_module = ColumnSequenceParallelLoRALinear( + in_features=module.weight.shape[0], + out_features=output_features, + gather_output=module.gather_output, + has_bias=module.bias is not None, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora B matrix + self.add_lora_split_mapping(module_name + ".lora_B", is_column=True) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif isinstance(module, linear_utils.RowSequenceParallelLinear): + # recover the original output_features + lora_module = RowSequenceParallelLoRALinear( + in_features=module.weight.shape[0] * module.world_size, + out_features=module.weight.shape[1], + has_bias=module.bias is not None, + input_is_parallel=module.input_is_parallel, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + rslora=lora_config.rslora, + lora_plus_scale=lora_config.lora_plus_scale, + use_quick_lora=lora_config.use_quick_lora, + ) + # Lora column parallel will spilt lora A matrix + self.add_lora_split_mapping(module_name + ".lora_A", 
is_column=False) + + # for lora qat + if self.lora_config.do_qat: + self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False) + self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False) + elif QuantizationLinear is not None and isinstance(module, QuantizationLinear): + lora_module = QuantizationLoRALinear( + in_features=module.in_features, + out_features=module.out_features, + quant_algo=module.quant_algo, + dtype=module._dtype, + bias_attr=False if module.bias is None else None, + block_size=module.block_size, + double_quant_block_size=module.double_quant_block_size, + double_quant=module.double_quant, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + ) + self.quantized = True + elif ColumnParallelQuantizationLinear is not None and isinstance(module, ColumnParallelQuantizationLinear): + lora_module = ColumnParallelQuantizationLoRALinear( + in_features=module.in_features, + out_features=module.out_features, + quant_algo=module.quant_algo, + dtype=module._dtype, + bias_attr=False if module.bias is None else None, + gather_output=module.gather_output, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + lora_A_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.quantized = True + elif RowParallelQuantizationLinear is not None and isinstance(module, RowParallelQuantizationLinear): + lora_module = RowParallelQuantizationLoRALinear( + in_features=module.in_features, + out_features=module.out_features, + quant_algo=module.quant_algo, + dtype=module._dtype, + bias_attr=False if module.bias is None else None, + input_is_parallel=module.input_is_parallel, + r=lora_config.r, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + ) + self.quantized = True + if lora_module is None: + raise ValueError( + f"LoRA strategy only supports paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear or paddlenlp.transformers.sequence_utils. 
{module}({module_name} {type(module).__name__}) is not supported。" + ) + if getattr(lora_module, "quant_weight", None) is not None: + lora_module.quant_weight = module.quant_weight + if getattr(lora_module, "quant_scale", None) is not None: + lora_module.quant_scale = module.quant_scale + if getattr(lora_module, "qquant_scale", None) is not None: + lora_module.qquant_scale = module.qquant_scale + if getattr(lora_module, "double_quant_scale", None) is not None: + lora_module.double_quant_scale = module.double_quant_scale + if getattr(lora_module, "quant_sacle_offset", None) is not None: + lora_module.quant_sacle_offset = module.quant_sacle_offset + else: + lora_module.weight = module.weight + if module.bias is not None: + lora_module.bias = module.bias + setattr(parent_module, attribute_chain[-1], lora_module) + + def _find_and_restore_module(self, module_name): + parent_module = self.model + attribute_chain = module_name.split(".") + for name in attribute_chain[:-1]: + parent_module = getattr(parent_module, name) + module = getattr(parent_module, attribute_chain[-1]) + original_model_class = self.restore_layer_map[module.__class__] + original_module = original_model_class(in_features=module.weight.shape[0], out_features=module.weight.shape[1]) + original_module.weight = module.weight + if module.bias is not None: + original_module.bias = module.bias + setattr(parent_module, attribute_chain[-1], original_module) + + def get_trainable_state_dict(self): + trainable_state_dict = OrderedDict() + for name, weight in self.model.state_dict().items(): + # get lora parameter & QAT scale parameter + if not weight.stop_gradient or "activation_quanter" in name or "weight_quanter" in name: + trainable_state_dict[name] = weight + return trainable_state_dict + + def print_trainable_parameters(self) -> None: + freeze_numel = 0 + trainable_numel = 0 + for _, weight in self.model.state_dict().items(): + if weight.stop_gradient: + freeze_numel += np.prod(weight.shape) + else: + trainable_numel += np.prod(weight.shape) + logger.debug( + f"Frozen parameters: {freeze_numel:.2e} || Trainable parameters:{trainable_numel:.2e} || Total parameters:{freeze_numel+trainable_numel:.2e}|| Trainable:{trainable_numel / (freeze_numel+trainable_numel):.2%}" + ) + + def mark_only_lora_as_trainable(self) -> None: + for _, layer in self.model.named_sublayers(): + if ( + isinstance(layer, LoRALinear) + or isinstance(layer, LoRAConv2D) + or isinstance(layer, ColumnParallelLoRALinear) + or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) + or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) + or ( + ColumnParallelQuantizationLoRALinear is not None + and isinstance(layer, ColumnParallelQuantizationLoRALinear) + ) + or ( + RowParallelQuantizationLoRALinear is not None + and isinstance(layer, RowParallelQuantizationLoRALinear) + ) + ): + for name, weight in layer.state_dict().items(): + if self.lora_config.trainable_bias in ["lora", "all"] and "bias" in name: + weight.stop_gradient = False + elif "lora" in name: + weight.stop_gradient = False + else: + weight.stop_gradient = True + else: + for name, weight in layer.state_dict().items(): + if self.lora_config.trainable_bias == "all" and "bias" in name: + weight.stop_gradient = False + else: + weight.stop_gradient = True + if self.lora_config.trainable_modules is not None: + for name, weight in self.model.state_dict().items(): + if any( + 
re.fullmatch(trainable_module, name) for trainable_module in self.lora_config.trainable_modules + ): + weight.stop_gradient = False + + def get_lora_model(self, model: Union[PretrainedModel, nn.Layer], lora_config: LoRAConfig): + + if lora_config.target_modules is None: + return model + elif isinstance(lora_config.target_modules, str): + target_modules = [lora_config.target_modules] + if lora_config.enable_lora_list is None or ( + isinstance(lora_config.enable_lora_list, List) + and all(isinstance(item, bool) for item in lora_config.enable_lora_list) + ): + enable_lora_list = [lora_config.enable_lora_list] + else: + raise TypeError( + f"Invalid `enable_lora_list` value: {lora_config.enable_lora_list}. Since `target_modules` is `str`, `enable_lora_list` must be `None` or `List[bool]`" + ) + else: + target_modules = lora_config.target_modules + if lora_config.enable_lora_list is None: + enable_lora_list = [None for _ in range(len(target_modules))] + elif isinstance(lora_config.enable_lora_list, List): + enable_lora_list = lora_config.enable_lora_list + if len(enable_lora_list) != len(target_modules): + raise TypeError( + f"Invalid lora_config.enable_lora_list value: {lora_config.enable_lora_list}. Since lora_config.target_modules is `List[str]`, `enable_lora_list` should have the same length as `target_modules`" + ) + for enable_lora in enable_lora_list: + if not ( + enable_lora is None + or (isinstance(enable_lora, List) and all(isinstance(item, bool) for item in enable_lora)) + ): + raise TypeError( + f"Invalid `enable_lora_list` value: {lora_config.enable_lora_list}. Since `target_modules` is `List[str]`, `enable_lora_list` must be `None` or `List[Optional[List[bool]]]`" + ) + else: + raise TypeError( + f"Invalid `enable_lora_list` value: {lora_config.enable_lora_list}. 
Since `target_modules` is `List[str]`, `enable_lora_list` must be `None` or `List[Optional[List[bool]]]`" + ) + + for target_module, enable_lora in zip(target_modules, enable_lora_list): + for i in model.named_sublayers(): + module_name = i[0] + if re.fullmatch(target_module, module_name): + self._find_and_replace_module(model, module_name, lora_config, enable_lora) + return model + + def restore_original_model(self): + # make sure W and lora weights are not merged before we restore the original model + + for layer_name, layer in self.model.named_sublayers(): + if isinstance(layer, LoRALinear): + self._find_and_restore_module(layer_name) + elif ( + isinstance(layer, ColumnParallelLoRALinear) + or isinstance(layer, ColumnSequenceParallelLoRALinear) + or isinstance(layer, LoRAConv2D) + or isinstance(layer, RowParallelLoRALinear) + or isinstance(layer, RowSequenceParallelLoRALinear) + or (QuantizationLoRALinear is not None and isinstance(layer, QuantizationLoRALinear)) + or ( + ColumnParallelQuantizationLoRALinear is not None + and isinstance(layer, ColumnParallelQuantizationLoRALinear) + ) + or ( + RowParallelQuantizationLoRALinear is not None + and isinstance(layer, RowParallelQuantizationLoRALinear) + ) + ): + raise NotImplementedError(f"{layer} restoration is not supported yet.") + return self.model + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Layer's logic + except AttributeError: + return getattr(self.model, name) + + def train(self): + self.training = True + self.model.training = True + for layer in self.model.sublayers(): + layer.training = True + layer.train() + + def eval(self): + self.training = False + self.model.training = False + for layer in self.model.sublayers(): + layer.training = False + layer.eval() + + def save_to_aistudio( + self, + repo_id, + private=True, + license="Apache License 2.0", + exist_ok=True, + subfolder=None, + merge_tensor_parallel=False, + **kwargs + ): + """ + Uploads all elements of this model to a new AiStudio Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + token (str): Your token for the Hub. + private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True. + license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0". + exist_ok (bool, optional): Whether to override existing repository. Defaults to: True. + subfolder (str, optional): Push to a subfolder of the repo instead of the root + merge_tensor_parallel (bool): Whether to merge the tensor parallel weights. Defaults to False. + """ + res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs) + if "error_code" in res: + if res["error_code"] == 10003 and exist_ok: + logger.info( + f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False" + ) + else: + logger.error( + f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"Successfully created repo {repo_id}") + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir, merge_tensor_parallel=merge_tensor_parallel) + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. 
This might take a while") + for filename in os.listdir(save_dir): + res = aistudio_sdk.hub.upload( + repo_id=repo_id, path_or_fileobj=os.path.join(save_dir, filename), path_in_repo=filename, **kwargs + ) + if "error_code" in res: + logger.error( + f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"{filename}: {res['message']}") + + def disable_lora(self): + for _, layer in self.model.named_sublayers(): + if any(isinstance(layer, lora_layer) for lora_layer in AVAILABLE_LAYERS): + layer.disable_lora = True + + def enable_lora(self): + for _, layer in self.model.named_sublayers(): + if any(isinstance(layer, lora_layer) for lora_layer in AVAILABLE_LAYERS): + layer.disable_lora = False + + def merge(self): + for _, layer in self.model.named_sublayers(): + if any(isinstance(layer, lora_layer) for lora_layer in AVAILABLE_LAYERS): + layer.merge() + + def unmerge(self): + for _, layer in self.model.named_sublayers(): + if any(isinstance(layer, lora_layer) for lora_layer in AVAILABLE_LAYERS): + layer.unmerge() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quant_layers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quant_layers.py new file mode 100644 index 000000000..6f4e7b2b7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quant_layers.py @@ -0,0 +1,272 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn +from paddle.distributed.fleet.layers.mpu import mp_ops +from paddle.nn import functional as F +from paddle.nn.quant.format import ConvertibleQuantedLayer + + +class QuantedLoRALinear(ConvertibleQuantedLayer): + """ + The computational logic of QuantizedLoRALinear is the same as LoRALinear. + The only difference is that its inputs are all fake quantized. + + Note: + In order for proper quantization of this layer, we do (W + AB)x instead of Wx + ABx as in LoRALinear. 
+ The quanted logic is quant(W + AB)x + """ + + def __init__(self, layer: nn.Layer, q_config): + super().__init__() + if isinstance(layer.lora_dropout, nn.Dropout): + raise ValueError("lora_dropout is not supported for QuantedLoRALinear") + + self.weight = layer.weight + self.lora_A = layer.lora_A + self.lora_B = layer.lora_B + self.scaling = layer.scaling + self.bias = layer.bias + self.name = layer.name + + # Mark the weight as unmerged + self.merged = False + + # For FakeQuant + + self.weight_quanter = None + self.activation_quanter = None + if q_config.weight is not None: + self.weight_quanter = q_config.weight._instance(layer) + if q_config.activation is not None: + self.activation_quanter = q_config.activation._instance(layer) + self.disable_lora = False + + def forward(self, input): + + if self.merged or self.disable_lora: + weight = self.weight + else: + weight = self.weight + self.lora_A @ self.lora_B * self.scaling + + quant_input = self.activation_quanter(input) if self.activation_quanter is not None else input + quant_weight = self.weight_quanter(weight) if self.weight_quanter is not None else weight + + return self._linear_forward(quant_input, quant_weight) + + def _linear_forward(self, input, weight): + weight = paddle.cast(weight, input.dtype) + out = F.linear(x=input, weight=weight, bias=self.bias, name=self.name) + return out + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def weights_to_quanters(self): + return [("weight", "weight_quanter")] + + def activation_quanters(self): + return ["activation_quanter"] + + +class ColumnParallelQuantedLoRALinear(ConvertibleQuantedLayer): + """ + The computational logic of ColumnParallelQuantedLoRALinear is the same as ColumnParallelLoRALinear. + The only difference is that its inputs are all fake quantized. + + Note: + In order for proper quantization of this layer, we do (W + AB)x instead of Wx + ABx as in LoRALinear. 
+ The quanted logic is quant(W + AB)x + """ + + def __init__(self, layer: nn.Layer, q_config): + super().__init__() + if isinstance(layer.lora_dropout, nn.Dropout): + raise ValueError("lora_dropout is not supported for QuantedLoRALinear") + + self.weight = layer.weight + self.lora_A = layer.lora_A + self.lora_B = layer.lora_B + self.scaling = layer.scaling + self.bias = layer.bias + self.name = layer.name + self.is_mp = layer.is_mp + self.model_parallel_group = layer.model_parallel_group + self.gather_output = layer.gather_output + self.is_mp = layer.is_mp + + # Mark the weight as unmerged + self.merged = False + + # For FakeQuant + self.weight_quanter = None + self.activation_quanter = None + if q_config.weight is not None: + self.weight_quanter = q_config.weight._instance(layer) + if q_config.activation is not None: + self.activation_quanter = q_config.activation._instance(layer) + self.disable_lora = False + + def forward(self, input): + + if self.merged or self.disable_lora: + weight = self.weight + else: + weight = ( + self.weight + + mp_ops._c_identity(self.lora_A, group=self.model_parallel_group) @ self.lora_B * self.scaling + ) + quant_input = self.activation_quanter(input) if self.activation_quanter is not None else input + quant_weight = self.weight_quanter(weight) if self.weight_quanter is not None else weight + + return self._linear_forward(quant_input, quant_weight) + + def _linear_forward(self, input, weight): + if self.is_mp: + input_mp = mp_ops._c_identity(input, group=self.model_parallel_group) + else: + input_mp = input + + result_mp = F.linear(x=input_mp, weight=weight, bias=self.bias, name=self.name) + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def weights_to_quanters(self): + return [("weight", "weight_quanter")] + + def activation_quanters(self): + return ["activation_quanter"] + + +class RowParallelQuantedLoRALinear(ConvertibleQuantedLayer): + """ + The computational logic of RowParallelQuantedLoRALinear is the same as RowParallelLoRALinear. + The only difference is that its inputs are all fake quantized. + + Note: + In order for proper quantization of this layer, we do (W + AB)x instead of Wx + ABx as in LoRALinear. 
+ The quanted logic is quant(W + AB)x + """ + + def __init__(self, layer: nn.Layer, q_config): + super().__init__() + if isinstance(layer.lora_dropout, nn.Dropout): + raise ValueError("lora_dropout is not supported for QuantedLoRALinear") + + self.weight = layer.weight + self.lora_A = layer.lora_A + self.lora_B = layer.lora_B + self.scaling = layer.scaling + self.bias = layer.bias + self.name = layer.name + self.is_mp = layer.is_mp + self.model_parallel_group = layer.model_parallel_group + self.input_is_parallel = layer.input_is_parallel + self.is_mp = layer.is_mp + + # Mark the weight as unmerged + self.merged = False + + # For FakeQuant + self.weight_quanter = None + self.activation_quanter = None + if q_config.weight is not None: + self.weight_quanter = q_config.weight._instance(layer) + if q_config.activation is not None: + self.activation_quanter = q_config.activation._instance(layer) + self.disable_lora = False + + def forward(self, input): + + if self.merged or self.disable_lora: + weight = self.weight + else: + weight = ( + self.weight + + self.lora_A @ mp_ops._c_identity(self.lora_B, group=self.model_parallel_group) * self.scaling + ) + + quant_input = self.activation_quanter(input) if self.activation_quanter is not None else input + quant_weight = self.weight_quanter(weight) if self.weight_quanter is not None else weight + + return self._linear_forward(quant_input, quant_weight) + + def _linear_forward(self, input, weight): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(input, group=self.model_parallel_group) + else: + input_mp = input + + # x @ W : [bz, in_f / ws] ===> [bz, out_f] + result_mp = F.linear(x=input_mp, weight=weight, name=self.name) + + output = mp_ops._mp_allreduce( + result_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + + output = output + self.bias if self.bias is not None else output + return output + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.weight - self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.weight + self.lora_A @ self.lora_B * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def weights_to_quanters(self): + return [("weight", "weight_quanter")] + + def activation_quanters(self): + return ["activation_quanter"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quantization_layers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quantization_layers.py new file mode 100644 index 000000000..8ff597633 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quantization_layers.py @@ -0,0 +1,504 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
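# Illustrative sketch, not part of the patched sources: the Quanted*LoRALinear layers in the
# preceding lora_quant_layers.py hunk compute quant(W + scaling * lora_A @ lora_B) @ x rather
# than quant(W) @ x + scaling * (x @ lora_A) @ lora_B. In exact arithmetic the two orderings
# agree, but once W is fake-quantized they diverge, which is why the LoRA delta is folded into
# the weight before quantization. `fake_quant` below is a simple symmetric-int8 stand-in used
# only for this illustration, not PaddleNLP's quanter.
import numpy as np

def fake_quant(w, bits=8):
    # round-trip through a symmetric integer grid (quantize, then dequantize)
    scale = np.abs(w).max() / (2 ** (bits - 1) - 1)
    return np.round(w / scale) * scale

rng = np.random.default_rng(0)
x = rng.normal(size=(2, 16))
W = rng.normal(size=(16, 8))          # weight: [in_features, out_features], as in the LoRA layers
A = rng.normal(size=(16, 4))          # lora_A: [in_features, r]
B = rng.normal(size=(4, 8))           # lora_B: [r, out_features]
scaling = 8 / 4                       # lora_alpha / r

merged_then_quant = x @ fake_quant(W + scaling * A @ B)        # what the quanted layers do
quant_then_delta = x @ fake_quant(W) + scaling * (x @ A) @ B   # naive ordering
print(np.abs(merged_then_quant - quant_then_delta).max())      # small but non-zero difference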
+ +import math + +import paddle +from paddle import nn +from paddle.distributed.fleet.layers.mpu import mp_ops +from paddle.nn.quant import weight_dequantize, weight_only_linear, weight_quantize + +from ...quantization.qlora import qlora_weight_dequantize, qlora_weight_quantize +from ...quantization.quantization_linear import ( + ColumnParallelQuantizationLinear, + QuantizationLinear, + RowParallelQuantizationLinear, +) + + +class QuantizationLoRALinear(QuantizationLinear): + """ + Quantization lora Linear layer. + The code implementation refers to paddlenlp.peft.lora.lora_layers.LoRALinear. + https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/peft/lora/lora_layers.py + Compare to LoRALinear, this class keeps weight in INT8/INT4 with quant scale, and supports + weight_only_linear for input tensor and origin weight(LoRA part still uses fp16/bf16). + """ + + def __init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr=None, + scale_attr=None, + bias_attr=None, + block_size=64, + double_quant_block_size=256, + double_quant=False, + qquant_scale_attr=None, + double_quant_scale_attr=None, + quant_sacle_offset_attr=None, + quant_scale_attr=None, + llm_int8_threshold=6.0, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + ): + super().__init__( + in_features, + out_features, + quant_algo, + dtype, + weight_attr, + scale_attr, + bias_attr, + block_size, + double_quant_block_size, + double_quant, + qquant_scale_attr, + double_quant_scale_attr, + quant_sacle_offset_attr, + quant_scale_attr, + llm_int8_threshold, + ) + + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if self.quant_algo == "llm.int8": + raise NotImplementedError("llm.int8 not yet support lora strategy.") + self.in_features = in_features + self.out_features = out_features + self.r = r + self.lora_alpha = lora_alpha + # Mark the weight as unmerged + self.merged = False + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu"), + ) + self.lora_B = self.create_parameter( + shape=[r, out_features], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.Constant(value=0.0), + ) + self.weight = None + self.scaling = self.lora_alpha / self.r + self.disable_lora = False + + def dequantize_weight(self): + if self.quant_algo in ["fp4", "nf4"]: + new_weight = ( + qlora_weight_dequantize( + quant_weight=self.quant_weight, + quant_algo=self.quant_algo, + state=(self.qquant_scale, self.double_quant_scale, self.quant_scale_offset) + if self.double_quant + else self.quant_scale, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + ) + .cast(self._dtype) + .reshape([self.in_features, self.out_features]) + ) + elif self.quant_algo in ["weight_only_int8"]: + new_weight = weight_dequantize(self.quant_weight, self.quant_scale, self.quant_algo, self._dtype) + else: + raise NotImplementedError(f"{self.quant_algo} not yet support lora merge strategy.") + return new_weight + + def quantize_weight(self, new_weight): + if self.quant_algo in ["fp4", "nf4"]: + print("self.quant_weight", self.quant_weight) + quant_weight, quant_state = 
qlora_weight_quantize( + weight=new_weight, + quant_algo=self.quant_algo, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + return_dict=False, + ) + print("quant_weight", quant_weight) + self.quant_weight.set_value(quant_weight) + if self.double_quant: + qquant_scale, double_quant_scale, quant_sacle_offset = quant_state + self.qquant_scale.set_value(qquant_scale) + self.double_quant_scale.set_value(double_quant_scale) + self.quant_sacle_offset.set_value(quant_sacle_offset) + else: + quant_scale = quant_state + self.quant_scale.set_value(quant_scale) + elif self.quant_algo in ["weight_only_int8"]: + quant_weight, quant_scale = weight_quantize(new_weight, self.quant_algo) + self.quant_weight.set_value(quant_weight) + self.quant_scale.set_value(quant_scale) + else: + raise NotImplementedError(f"{self.quant_algo} not yet support lora merge strategy.") + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.dequantize_weight() + new_weight -= self.lora_A @ self.lora_B * self.scaling + self.quantize_weight(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.dequantize_weight() + new_weight += self.lora_A @ self.lora_B * self.scaling + self.quantize_weight(new_weight) + self.merged = True + + def forward(self, x: paddle.Tensor): + result = super().forward(x) + if not self.merged and not self.disable_lora: + result += (self.lora_dropout(x) @ self.lora_A @ self.lora_B) * self.scaling + return result + + +class ColumnParallelQuantizationLoRALinear(ColumnParallelQuantizationLinear): + """ + Quantization lora Linear layer with mp parallelized(column). + The code implementation refers to paddlenlp.peft.lora.lora_layers.ColumnParallelLoRALinear. + https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/peft/lora/lora_layers.py#L203 + Compare to ColumnParallelLoRALinear, this class keeps weight in INT8/INT4 with quant scale, and supports + weight_only_linear for input tensor and origin weight(LoRA part still uses fp16/bf16). 
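+    merge() and unmerge() dequantize the stored quant_weight, add or remove the
+    scaling * lora_A @ lora_B delta, and requantize the result in place.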
+ """ + + def __init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr=None, + scale_attr=None, + bias_attr=None, + gather_output=True, + mp_group=None, + r=0, + lora_alpha=1, + lora_dropout=0.0, + lora_A_weight_attr=None, + ): + ColumnParallelQuantizationLinear.__init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr, + scale_attr, + bias_attr, + gather_output, + mp_group, + ) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if self.quant_algo == "llm.int8": + raise NotImplementedError("llm.int8 not yet support lora strategy.") + if self.quant_algo in ["fp4", "nf4"]: + raise NotImplementedError(f"{self.quant_algo} not yet support tensor parallelism.") + + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + attr=lora_A_weight_attr, + ) + self.lora_A.is_distributed = False + self.lora_B = self.create_parameter( + shape=[r, self.output_size_per_partition], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.Constant(value=0.0), + ) + self.lora_B.is_distributed = True + self.lora_B.split_axis = 1 + self.scaling = self.lora_alpha / self.r + self.disable_lora = False + # Mark the weight as unmerged + self.merged = False + + def forward(self, x): + + result_mp = super().forward(x) + + if not self.disable_lora or not self.merged: + input_a = self.lora_dropout(x) @ self.lora_A + input_a_mp = mp_ops._c_identity(input_a, group=self.model_parallel_group) + delta_mp = (input_a_mp @ self.lora_B) * self.scaling + result_mp += delta_mp + + if self.gather_output and self.is_mp: + result = mp_ops._c_concat(result_mp, group=self.model_parallel_group) + else: + result = result_mp + return result + + def dequantize_weight(self): + if self.quant_algo in ["fp4", "nf4"]: + new_weight = ( + qlora_weight_dequantize( + quant_weight=self.quant_weight, + quant_algo=self.quant_algo, + state=(self.qquant_scale, self.double_quant_scale, self.quant_scale_offset) + if self.double_quant + else self.quant_scale, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + ) + .cast(self._dtype) + .reshape([self.in_features, self.out_features]) + ) + elif self.quant_algo in ["weight_only_int8"]: + new_weight = weight_dequantize(self.quant_weight, self.quant_scale, self.quant_algo, self._dtype) + else: + raise NotImplementedError(f"{self.quant_algo} not yet support lora merge strategy.") + return new_weight + + def quantize_weight(self, new_weight): + if self.quant_algo in ["fp4", "nf4"]: + quant_weight, quant_state = qlora_weight_quantize( + weight=new_weight, + quant_algo=self.quant_algo, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + return_dict=False, + ) + self.quant_weight.set_value(quant_weight) + if self.double_quant: + qquant_scale, double_quant_scale, quant_sacle_offset = quant_state + self.qquant_scale.set_value(qquant_scale) + self.double_quant_scale.set_value(double_quant_scale) + self.quant_sacle_offset.set_value(quant_sacle_offset) + else: + quant_scale = quant_state + self.quant_scale.set_value(quant_scale) + elif self.quant_algo in ["weight_only_int8"]: + quant_weight, quant_scale 
= weight_quantize(new_weight, self.quant_algo) + self.quant_weight.set_value(quant_weight) + self.quant_scale.set_value(quant_scale) + else: + raise NotImplementedError(f"{self.quant_algo} not yet support lora merge strategy.") + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.dequantize_weight() + new_weight -= self.lora_A @ self.lora_B * self.scaling + self.quantize_weight(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.dequantize_weight() + new_weight += self.lora_A @ self.lora_B * self.scaling + self.quantize_weight(new_weight) + self.merged = True + + +class RowParallelQuantizationLoRALinear(RowParallelQuantizationLinear): + """ + Quantization lora Linear layer with mp parallelized(row). + The code implementation refers to paddlenlp.peft.lora.lora_layers.RowParallelLoRALinear. + https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/peft/lora/lora_layers.py#L99 + Compare to RowParallelLoRALinear, this class keeps weight in INT8/INT4 with quant scale, and supports + weight_only_linear for input tensor and origin weight(LoRA part still uses fp16/bf16). + """ + + def __init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr=None, + scale_attr=None, + bias_attr=None, + input_is_parallel=False, + mp_group=None, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + ): + RowParallelQuantizationLinear.__init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr, + scale_attr, + bias_attr, + input_is_parallel, + mp_group, + ) + if not isinstance(r, int) or r <= 0: + raise ValueError("Lora rank r should be a positive integer") + if self.quant_algo == "llm.int8": + raise NotImplementedError("llm.int8 not yet support lora strategy.") + if self.quant_algo in ["fp4", "nf4"]: + raise NotImplementedError(f"{self.quant_algo} not yet support tensor parallelism.") + self.r = r + self.lora_alpha = lora_alpha + # Optional dropout + if lora_dropout > 0.0: + self.lora_dropout = nn.Dropout(p=lora_dropout) + else: + self.lora_dropout = lambda x: x + + # Actual trainable parameters + self.lora_A = self.create_parameter( + shape=[self.input_size_per_partition, r], + dtype=self._dtype, + is_bias=False, + attr=paddle.ParamAttr( + initializer=nn.initializer.KaimingUniform(negative_slope=math.sqrt(5), nonlinearity="leaky_relu") + ), + ) + self.lora_B = self.create_parameter( + shape=[r, self.out_features], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.Constant(value=0.0), + ) + self.lora_A.is_distributed = True + self.lora_A.split_axis = 0 + self.lora_B.is_distributed = False + self.scaling = self.lora_alpha / self.r + self.disable_lora = False + self.merged = False + + def forward(self, x: paddle.Tensor): + if not self.input_is_parallel: + input_mp = mp_ops._c_split(x, group=self.model_parallel_group) + else: + input_mp = x + + # x @ W : [bz, in_f / ws] ===> [bz, out_f] + with paddle.amp.auto_cast(enable=False): + result_mp = weight_only_linear(input_mp, self.quant_weight, None, self.quant_scale, self.quant_dtype) + + output = mp_ops._mp_allreduce( + result_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + if not self.disable_lora or not self.merged: + # x @ A: [bz, in_f/ ws] ===> [bz, r] + input_mp = self.lora_dropout(input_mp) @ self.lora_A + # all reduce to keep Lora B's gradient on different gpu consistent + input_dup = 
mp_ops._mp_allreduce( + input_mp, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + # @ B: [bz, r] ===> [bz, out_f] + delta_mp = (input_dup @ self.lora_B) * self.scaling + output += delta_mp + output = output + self.bias if self.bias is not None else output + return output + + def dequantize_weight(self): + if self.quant_algo in ["fp4", "nf4"]: + new_weight = ( + qlora_weight_dequantize( + quant_weight=self.quant_weight, + quant_algo=self.quant_algo, + state=(self.qquant_scale, self.double_quant_scale, self.quant_scale_offset) + if self.double_quant + else self.quant_scale, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + ) + .cast(self._dtype) + .reshape([self.in_features, self.out_features]) + ) + elif self.quant_algo in ["weight_only_int8"]: + new_weight = weight_dequantize(self.quant_weight, self.quant_scale, self.quant_algo, self._dtype) + else: + raise NotImplementedError(f"{self.quant_algo} not yet support lora merge strategy.") + return new_weight + + def quantize_weight(self, new_weight): + if self.quant_algo in ["fp4", "nf4"]: + quant_weight, quant_state = qlora_weight_quantize( + weight=new_weight, + quant_algo=self.quant_algo, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + return_dict=False, + ) + self.quant_weight.set_value(quant_weight) + if self.double_quant: + qquant_scale, double_quant_scale, quant_sacle_offset = quant_state + self.qquant_scale.set_value(qquant_scale) + self.double_quant_scale.set_value(double_quant_scale) + self.quant_sacle_offset.set_value(quant_sacle_offset) + else: + quant_scale = quant_state + self.quant_scale.set_value(quant_scale) + elif self.quant_algo in ["weight_only_int8"]: + quant_weight, quant_scale = weight_quantize(new_weight, self.quant_algo) + self.quant_weight.set_value(quant_weight) + self.quant_scale.set_value(quant_scale) + else: + raise NotImplementedError(f"{self.quant_algo} not yet support lora merge strategy.") + + def unmerge(self): + if self.merged: + # Make sure that the weights are not merged + new_weight = self.dequantize_weight() + new_weight -= self.lora_A @ self.lora_B * self.scaling + self.quantize_weight(new_weight) + self.merged = False + + def merge(self): + if not self.merged: + # Merge the weights and mark it + new_weight = self.dequantize_weight() + new_weight += self.lora_A @ self.lora_B * self.scaling + self.quantize_weight(new_weight) + self.merged = True diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quick_layers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quick_layers.py new file mode 100644 index 000000000..ab48069b7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/lora/lora_quick_layers.py @@ -0,0 +1,223 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
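# Illustrative sketch, not part of the patched sources: the Quantization*LoRALinear layers in the
# preceding lora_quantization_layers.py hunk implement merge()/unmerge() as dequantize the stored
# weight -> add (or remove) scaling * lora_A @ lora_B -> requantize. Because requantization is
# lossy, a merge() followed by unmerge() only approximately restores the original stored weight.
# The quantize/dequantize helpers here are simple int8 stand-ins for illustration, not paddle's
# weight_quantize / weight_dequantize.
import numpy as np

def quantize(w):
    scale = np.abs(w).max() / 127.0
    return np.round(w / scale).astype(np.int8), scale

def dequantize(q, scale):
    return q.astype(np.float32) * scale

rng = np.random.default_rng(1)
W = rng.normal(size=(16, 8)).astype(np.float32)
A = rng.normal(size=(16, 4)).astype(np.float32)
B = rng.normal(size=(4, 8)).astype(np.float32)  # lora_B is zero-initialized in the patch; use
scaling = 8 / 4                                 # non-zero values here to mimic a trained adapter

q, s = quantize(W)                                              # stored quant_weight + scale
merged = quantize(dequantize(q, s) + A @ B * scaling)           # merge()
restored = quantize(dequantize(*merged) - A @ B * scaling)      # unmerge()
print(np.abs(dequantize(*restored) - dequantize(q, s)).max())   # non-zero: the round trip is lossy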
+ +import paddle +from paddle.autograd import PyLayer +from paddle.distributed.communication.reduce import ReduceOp, _get_reduce_op +from paddle.distributed.fleet.layers.mpu import mp_ops +from paddle.framework import core + +__all__ = ["quick_lora"] + + +def is_fused_matmul_bias_supported(): + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm() or paddle.is_compiled_with_xpu(): + return hasattr(core.eager.ops.legacy, "fused_gemm_epilogue") + return False + + +if is_fused_matmul_bias_supported(): + linear_func = paddle.incubate.nn.functional.fused_linear +else: + linear_func = paddle.nn.functional.linear + + +def quick_lora( + input: paddle.Tensor, + lora_A: paddle.Tensor, + lora_B: paddle.Tensor, + weight: paddle.Tensor, + bias: paddle.Tensor = None, + scaling: float = 1.0, + is_column: bool = False, + is_row: bool = False, + group=None, + world_size: int = 1, +): + r""" + Definition of the quick_lora function for efficient low-rank adaptation (LORA) operations + + Parameters: + input: The input data for the LORA operation + lora_A: The LORA matrix A + lora_B: The LORA matrix B + weight: The weight matrix + bias: The bias vector (optional, defaults to None) + scaling: The scaling factor (optional, defaults to 1.0) + is_column: Flag indicating whether to perform LORA operation by column (optional, defaults to False) + is_row: Flag indicating whether to perform LORA operation by row (optional, defaults to False) + group: Group information (optional, defaults to None) + world_size: World size for distributed operations (optional, defaults to 1) + + Returns: + The result of the LORA operation based on the specified parameters + + """ + assert weight.stop_gradient, "When using Quick LoRA, it is necessary that weight.stop_gradient is set to True." + if bias is not None: + assert bias.stop_gradient, "When using Quick LoRA, it is necessary that bias.stop_gradient is set to True." 
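+    # All three PyLayer branches below share the same idea: the LoRA update is folded
+    # into the frozen weight once per forward pass (W + scaling * A @ B via paddle.addmm),
+    # so the forward needs only a single matmul, while the gradients for lora_A / lora_B
+    # are reconstructed analytically in backward. The column/row variants additionally
+    # all-reduce the partial terms that are sharded across the model-parallel group when
+    # computing those LoRA gradients.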
+ + input_stop_gradient = input.stop_gradient + if is_column: + # If is_column is True, apply the LORA operation by column using the ColumnQuickLora class + return ColumnQuickLora.apply( + input, lora_A, lora_B, weight, bias, scaling, group, input_stop_gradient=input_stop_gradient + ) + elif is_row: + # If is_row is True, apply the LORA operation by row using the RowQuickLora class + return RowQuickLora.apply( + input, lora_A, lora_B, weight, bias, scaling, group, world_size, input_stop_gradient=input_stop_gradient + ) + else: + # If neither is_column nor is_row is True, apply the regular LORA operation using the QuickLora class + return QuickLora.apply(input, lora_A, lora_B, weight, bias, scaling, input_stop_gradient=input_stop_gradient) + + +class QuickLora(PyLayer): + @staticmethod + def forward( + ctx, + input, + lora_A, + lora_B, + weight, + bias: paddle.Tensor = None, + scaling: float = 1.0, + input_stop_gradient: bool = False, + ): + merged_weight = paddle.addmm(weight, lora_A, lora_B, beta=1.0, alpha=scaling) + ctx.input_stop_gradient = input_stop_gradient + ctx.scaling = scaling + ctx.save_for_backward(input, weight, lora_A, lora_B) + result = linear_func(input, merged_weight, bias) + return result + + @staticmethod + def backward(ctx, grad_output): + input, weight, lora_A, lora_B = ctx.saved_tensor() + grad_output = grad_output.flatten(0, 1) + input_fused = input.flatten(0, 1) + lora_B_input_grad = paddle.matmul(grad_output, lora_B, transpose_y=True) + input_grad = None + + if not ctx.input_stop_gradient: + input_grad = paddle.addmm( + paddle.matmul(grad_output, weight, transpose_y=True), + lora_B_input_grad, + lora_A.T, + beta=1.0, + alpha=ctx.scaling, + ).reshape(input.shape) + + lora_A_grad = paddle.matmul(input_fused, lora_B_input_grad, transpose_x=True) * ctx.scaling + + lora_B_grad = paddle.matmul(paddle.matmul(input_fused, lora_A), grad_output, transpose_x=True) * ctx.scaling + + return input_grad, lora_A_grad, lora_B_grad + + +class ColumnQuickLora(PyLayer): + @staticmethod + def forward( + ctx, input, lora_A, lora_B, weight, bias=None, scaling=1.0, group=None, input_stop_gradient: bool = False + ): + merged_weight = paddle.addmm(weight, lora_A, lora_B, beta=1.0, alpha=scaling) + ctx.group = group + ctx.op_type = _get_reduce_op(ReduceOp.SUM, "_c_identity") + ctx.input_stop_gradient = input_stop_gradient + ctx.scaling = scaling + ctx.save_for_backward(input, weight, lora_A, lora_B) + result = linear_func(input, merged_weight, bias) + return result + + @staticmethod + def backward(ctx, grad_output): + input, weight, lora_A, lora_B = ctx.saved_tensor() + grad_output = grad_output.flatten(0, 1) + input_fused = input.flatten(0, 1) + lora_B_input_grad = paddle.matmul(grad_output, lora_B, transpose_y=True) + input_grad = None + if not ctx.input_stop_gradient: + input_grad = paddle.addmm( + paddle.matmul(grad_output, weight, transpose_y=True), + lora_B_input_grad, + lora_A.T, + beta=1.0, + alpha=ctx.scaling, + ).reshape(input.shape) + + if ctx.group is not None: + ctx.group.process_group.all_reduce_on_calc_stream(lora_B_input_grad, ctx.op_type) + lora_A_grad = paddle.matmul(input_fused, lora_B_input_grad, transpose_x=True) * ctx.scaling + + lora_B_grad = paddle.matmul(paddle.matmul(input_fused, lora_A), grad_output, transpose_x=True) * ctx.scaling + + return input_grad, lora_A_grad, lora_B_grad + + +class RowQuickLora(PyLayer): + @staticmethod + def forward( + ctx, + input, + lora_A, + lora_B, + weight, + bias=None, + scaling: float = 1.0, + group=None, + world_size: int = 1, + 
input_stop_gradient: bool = False, + ): + if world_size > 1 and bias is not None: + bias = paddle.scale(bias, 1.0 / world_size) + merged_weight = paddle.addmm(weight, lora_A, lora_B, beta=1.0, alpha=scaling) + ctx.input_stop_gradient = input_stop_gradient + ctx.group = group + ctx.scaling = scaling + ctx.save_for_backward(input, weight, lora_A, lora_B) + result = linear_func(input, merged_weight, bias) + return result + + @staticmethod + def backward(ctx, grad_output): + input, weight, lora_A, lora_B = ctx.saved_tensor() + + grad_output = grad_output.flatten(0, 1) + input_fused = input.flatten(0, 1) + + lora_B_input_grad = paddle.matmul(grad_output, lora_B, transpose_y=True) + + input_grad = None + if not ctx.input_stop_gradient: + input_grad = paddle.addmm( + paddle.matmul(grad_output, weight, transpose_y=True), + lora_B_input_grad, + lora_A.T, + beta=1.0, + alpha=ctx.scaling, + ).reshape(input.shape) + + lora_A_grad = paddle.matmul(input_fused, lora_B_input_grad, transpose_x=True) * ctx.scaling + + x_lora_A = paddle.matmul(input_fused, lora_A) + if ctx.group is not None: + x_lora_A = mp_ops._mp_allreduce( + x_lora_A, + group=ctx.group, + use_calc_stream=True, + use_model_parallel=True, + ) + lora_B_grad = paddle.matmul(x_lora_A, grad_output, transpose_x=True) * ctx.scaling + return input_grad, lora_A_grad, lora_B_grad diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/__init__.py new file mode 100644 index 000000000..c8bd6e6f0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .prefix_config import PrefixConfig +from .prefix_model import PrefixModelForCausalLM +from .utils import ( + bloom_postprocess_past_key_value, + chatglm_postprocess_past_key_value, + llama_postprocess_past_key_value, + mistral_postprocess_past_key_value, + qwen_postprocess_past_key_value, +) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_config.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_config.py new file mode 100644 index 000000000..ba9135c6a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_config.py @@ -0,0 +1,102 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from dataclasses import asdict, dataclass, field +from typing import Optional + +from ...utils.env import PREFIX_CONFIG_NAME + + +@dataclass +class PrefixConfig: + prefix_dropout: float = field(default=0.0, metadata={"help": "Prefix projection dropout"}) + num_prefix_tokens: Optional[int] = field(default=None, metadata={"help": "Number of prefix tokens"}) + num_attention_heads: Optional[int] = field(default=None, metadata={"help": "Number of attention heads"}) + multi_query_group_num: Optional[int] = field(default=None, metadata={"help": "Number of Multi-Query Groups."}) + num_hidden_layers: Optional[int] = field(default=None, metadata={"help": "Number of transformer hidden layers"}) + hidden_size: Optional[int] = field( + default=None, metadata={"help": "The hidden embedding dimension of the transformer model"} + ) + prefix_projection: bool = field(default=False, metadata={"help": "Whether to project the prefix tokens"}) + prefix_projection_hidden_size: Optional[int] = field( + default=None, metadata={"help": "The hidden embedding dimension of the transformer model"} + ) + tensor_parallel_degree: int = field(default=-1, metadata={"help": ("1 for not use tensor parallel")}) + dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"}) + + @property + def __dict__(self): + return asdict(self) + + def to_dict(self): + return self.__dict__ + + def save_pretrained(self, save_directory): + r""" + This method saves the configuration of your adapter model in a directory. + Args: + save_directory (`str`): + The directory where the configuration will be saved. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + output_dict = self.__dict__ + output_path = os.path.join(save_directory, PREFIX_CONFIG_NAME) + + # save it + with open(output_path, "w") as writer: + writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + This method loads the configuration of your adapter model from a directory. + Args: + pretrained_model_name_or_path (`str`): + The directory or the hub-id where the configuration is saved. + **kwargs: + Additional keyword arguments passed along to the child class initialization. + """ + if os.path.isfile(os.path.join(pretrained_model_name_or_path, PREFIX_CONFIG_NAME)): + config_file = os.path.join(pretrained_model_name_or_path, PREFIX_CONFIG_NAME) + else: + raise ValueError(f"Can't find prefix_config.json at '{pretrained_model_name_or_path}'") + + loaded_attributes = cls.from_json_file(config_file) + + config = cls(**kwargs) + + for key, value in loaded_attributes.items(): + if hasattr(config, key): + setattr(config, key, value) + + return config + + @classmethod + def from_json_file(cls, path_json_file): + r""" + Loads a configuration file from a json file. + Args: + path_json_file (`str`): + The path to the json file. 
+ """ + with open(path_json_file, "r") as file: + json_object = json.load(file) + + return json_object diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_model.py new file mode 100644 index 000000000..29a344422 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/prefix_model.py @@ -0,0 +1,539 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import os +import tempfile +from functools import partial +from typing import Callable, Optional + +import aistudio_sdk +import numpy as np +import paddle +import paddle.nn as nn +from paddle.distributed import fleet + +from ...prompt.prompt_utils import signature +from ...transformers.model_utils import ( + _add_variant, + _load_state_dict_into_model, + dtype_guard, + load_state_dict, +) +from ...transformers.utils import get_checkpoint_shard_files +from ...utils.distributed import distributed_gather +from ...utils.env import ( + PAST_KEY_VALUES_FILE_NAME, + PREFIX_WEIGHTS_NAME, + SAFE_PEFT_WEIGHTS_INDEX_NAME, +) +from ...utils.log import logger +from .prefix_config import PrefixConfig + + +class PrefixModelForCausalLM(paddle.nn.Layer): + """ + PrefixModel for causal language modeling. + """ + + def __init__( + self, + model, + prefix_config: PrefixConfig, + postprocess_past_key_value: Optional[Callable] = None, + pad_attention_mask: Optional[Callable] = None, + ) -> None: + super().__init__() + if isinstance(model, fleet.meta_parallel.PipelineLayer): + raise NotImplementedError("Prefix tuning is not implemented for pipeline parallelism.") + self.prefix_config = prefix_config + self.model = model + self.forward_keys = signature(self.model.forward) + self.config = model.config + if self.prefix_config.dtype is None: + self.prefix_config.dtype = paddle.get_default_dtype() + with dtype_guard(self.prefix_config.dtype): + self.prefix_encoder = self._create_prefix_encoder() + self.prefix_dropout = nn.Dropout(p=prefix_config.prefix_dropout) + self.prefix_tokens = paddle.arange(self.prefix_config.num_prefix_tokens, dtype="int64") + self.model_prepare_inputs_for_generation = self.model.prepare_inputs_for_generation + self.inference = False + self.postprocess_past_key_value = postprocess_past_key_value + self.pad_attention_mask = pad_attention_mask + if self.model.base_model_prefix == "chatglm_v2": + self.prefix_config.tensor_parallel_degree = -1 + else: + if self.prefix_config.tensor_parallel_degree != self.model.config.tensor_parallel_degree: + self.prefix_config.tensor_parallel_degree = self.model.config.tensor_parallel_degree + logger.warning( + f"Reset tensor_parallel_degree of prefix_config to {self.model.config.tensor_parallel_degree}." 
+ ) + logger.info("Mark only prefix and trainable_module as trainable.") + self.mark_only_prefix_as_trainable() + + def forward( + self, + input_ids, + attention_mask=None, + **kwargs, + ): + + batch_size = input_ids.shape[0] + past_key_values = self._get_past_key_values(batch_size) + + if attention_mask is not None: + if self.pad_attention_mask is not None: + attention_mask = self.pad_attention_mask( + input_ids.shape, self.prefix_config.num_prefix_tokens, attention_mask + ) + else: + if len(attention_mask.shape) == 2: + prefix_attention_mask = paddle.ones( + [batch_size, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype + ) + elif len(attention_mask.shape) == 3: + batch_size, src_seq_len, tgt_seq_len = attention_mask.shape + prefix_attention_mask = paddle.ones( + [batch_size, src_seq_len, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype + ) + elif len(attention_mask.shape) == 4: + batch_size, num_heads, src_seq_len, tgt_seq_len = attention_mask.shape + prefix_attention_mask = paddle.ones( + [batch_size, num_heads, src_seq_len, self.prefix_config.num_prefix_tokens], + dtype=attention_mask.dtype, + ) + else: + raise ValueError(f"Unexpected attention_mask shape: {attention_mask.shape}") + attention_mask = paddle.concat((prefix_attention_mask, attention_mask), axis=-1) + kwargs["attention_mask"] = attention_mask + + if "past_key_values" in self.forward_keys: + output = self.model(input_ids=input_ids, past_key_values=past_key_values, **kwargs) + elif "cache" in self.forward_keys: + output = self.model(input_ids=input_ids, cache=past_key_values, **kwargs) + else: + raise NotImplementedError("Model does not support past_key_values either cache") + return output + + def generate(self, **kwargs): + if "input_ids" not in kwargs: + raise ValueError("input_ids must be provided for Peft model generation") + + self.model.prepare_inputs_for_generation = self._prepare_inputs_for_generation + outputs = self.model.generate(**kwargs) + self.model.prepare_inputs_for_generation = self.model_prepare_inputs_for_generation + return outputs + + def _prepare_inputs_for_generation(self, *args, **kwargs): + model_kwargs = self.model_prepare_inputs_for_generation(*args, **kwargs) + attention_mask = model_kwargs["attention_mask"] + batch_size = model_kwargs["input_ids"].shape[0] + if self.pad_attention_mask is not None: + attention_mask = self.pad_attention_mask( + model_kwargs["input_ids"].shape, self.prefix_config.num_prefix_tokens, attention_mask + ) + else: + if len(attention_mask.shape) == 2: + prefix_attention_mask = paddle.ones( + [batch_size, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype + ) + elif len(attention_mask.shape) == 3: + batch_size, src_seq_len, tgt_seq_len = attention_mask.shape + prefix_attention_mask = paddle.ones( + [batch_size, src_seq_len, self.prefix_config.num_prefix_tokens], dtype=attention_mask.dtype + ) + elif len(attention_mask.shape) == 4: + batch_size, num_heads, src_seq_len, tgt_seq_len = attention_mask.shape + prefix_attention_mask = paddle.ones( + [batch_size, num_heads, src_seq_len, self.prefix_config.num_prefix_tokens], + dtype=attention_mask.dtype, + ) + else: + raise ValueError(f"Unexpected attention_mask shape: {attention_mask.shape}") + attention_mask = paddle.concat((prefix_attention_mask, attention_mask), axis=-1) + model_kwargs["attention_mask"] = attention_mask + + if "past_key_values" in self.forward_keys: + key = "past_key_values" + elif "cache" in self.forward_keys: + key = "cache" + else: + raise 
NotImplementedError("Model does not support past_key_values either cache") + if model_kwargs[key] is None: + past_key_values = self._get_past_key_values(batch_size) + model_kwargs[key] = past_key_values + return model_kwargs + + def mark_only_prefix_as_trainable(self) -> None: + # freeze pretrained model + for _, weight in self.model.state_dict().items(): + weight.stop_gradient = True + # train prefix encoder only + for _, weight in self.prefix_encoder.state_dict().items(): + weight.stop_gradient = False + + def _create_prefix_encoder(self): + prefix_dropout = nn.Dropout(p=self.prefix_config.prefix_dropout) + self.head_dim = self.prefix_config.hidden_size // self.prefix_config.num_attention_heads + if self.prefix_config.multi_query_group_num is not None: + self.num_heads = self.prefix_config.multi_query_group_num + else: + self.num_heads = self.prefix_config.num_attention_heads + if self.prefix_config.prefix_projection: + activation = nn.Tanh() + if self.prefix_config.tensor_parallel_degree > 1: + prefix_embedding = fleet.meta_parallel.VocabParallelEmbedding( + self.prefix_config.num_prefix_tokens, + self.head_dim * self.num_heads, + ) + prefix_proj_0 = fleet.meta_parallel.ColumnParallelLinear( + self.head_dim * self.num_heads, + self.prefix_config.prefix_projection_hidden_size, + has_bias=True, + gather_output=False, + ) + prefix_proj_1 = fleet.meta_parallel.RowParallelLinear( + self.prefix_config.prefix_projection_hidden_size, + self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, + has_bias=True, + input_is_parallel=True, + ) + else: + prefix_embedding = nn.Embedding( + self.prefix_config.num_prefix_tokens, + self.head_dim * self.num_heads, + ) + prefix_proj_0 = nn.Linear( + self.head_dim * self.num_heads, + self.prefix_config.prefix_projection_hidden_size, + ) + prefix_proj_1 = nn.Linear( + self.prefix_config.prefix_projection_hidden_size, + self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, + ) + prefix_encoder = nn.Sequential(prefix_embedding, prefix_proj_0, activation, prefix_proj_1, prefix_dropout) + else: + if self.prefix_config.tensor_parallel_degree > 1: + prefix_embedding = fleet.meta_parallel.VocabParallelEmbedding( + self.prefix_config.num_prefix_tokens, + self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, + ) + else: + prefix_embedding = nn.Embedding( + self.prefix_config.num_prefix_tokens, + self.head_dim * self.num_heads * self.prefix_config.num_hidden_layers * 2, + ) + prefix_encoder = nn.Sequential(prefix_embedding, prefix_dropout) + return prefix_encoder + + def _get_past_key_values(self, batch_size): + + # (bs, prefixlen, hidden_dim*layer_num*2) + past_key_values = self.prefix_encoder(self.prefix_tokens.unsqueeze(0).expand([batch_size, -1])) + + # (bs, prefixlen, hidden_dim*layer_num*2/tensor_parallel_degree) + if self.prefix_config.tensor_parallel_degree > 1: + split_past_key_values = past_key_values.split( + num_or_sections=self.prefix_config.tensor_parallel_degree, axis=2 + ) + past_key_values = split_past_key_values[self.model.config.tensor_parallel_rank] + num_heads_per_partition = self.num_heads // self.prefix_config.tensor_parallel_degree + else: + num_heads_per_partition = self.num_heads + + # (bs, prefixlen, layer_num*2, head_num/tensor_parallel_degree, head_dim) + past_key_values = past_key_values.reshape( + [ + batch_size, + self.prefix_config.num_prefix_tokens, + self.prefix_config.num_hidden_layers * 2, + num_heads_per_partition, + self.head_dim, + ] + ) + + if 
self.postprocess_past_key_value is not None: + past_key_values = self.postprocess_past_key_value(past_key_values) + + return past_key_values + + def train(self): + self.training = True + self.model.training = True + self.prefix_encoder.training = True + self.model.train() + self.prefix_encoder.train() + + def eval(self): + self.training = False + self.model.training = False + self.prefix_encoder.training = False + self.model.eval() + self.prefix_encoder.eval() + + def print_trainable_parameters(self) -> None: + trainable_numel = 0 + freeze_numel = 0 + for _, weight in self.model.state_dict().items(): + if weight.stop_gradient: + freeze_numel += np.prod(weight.shape) + else: + trainable_numel += np.prod(weight.shape) + for _, weight in self.prefix_encoder.state_dict().items(): + if weight.stop_gradient: + freeze_numel += np.prod(weight.shape) + else: + trainable_numel += np.prod(weight.shape) + logger.debug( + f"Frozen parameters: {freeze_numel:.2e} || Trainable parameters:{trainable_numel:.2e} || Total parameters:{freeze_numel+trainable_numel:.2e}|| Trainable:{trainable_numel / (freeze_numel+trainable_numel):.2%}" + ) + + @classmethod + def from_pretrained( + cls, + model, + prefix_path, + postprocess_past_key_value=None, + pad_attention_mask=None, + ): + # init prefix config & prefix model + prefix_config = PrefixConfig.from_pretrained(prefix_path) + # define a new variable to conserve original prefix_config.tensor_parallel_degree value which will update while initializing prefix model + prefix_config_tensor_parallel_degree = prefix_config.tensor_parallel_degree + prefix_model = cls(model, prefix_config, postprocess_past_key_value, pad_attention_mask) + + prefix_model_index_file = os.path.join(prefix_path, SAFE_PEFT_WEIGHTS_INDEX_NAME) + if os.path.exists(prefix_model_index_file): + # load safetensors format file. 
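+            # The index file lists every shard and the keys it contains; each shard is
+            # loaded separately, optionally re-split with the tensor-parallel actions,
+            # and merged into the prefix encoder shard by shard, freeing each shard's
+            # state dict afterwards to keep peak memory low.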
+ resolved_archieve_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path=prefix_path, + index_filename=prefix_model_index_file, + ) + loaded_keys = sharded_metadata["all_checkpoint_keys"] + expected_keys = set(prefix_model.prefix_encoder.state_dict().keys()) + missing_keys = expected_keys - set(loaded_keys) + if len(missing_keys) > 0: + raise ValueError(f"missing_keys: {missing_keys}") + + error_msgs = [] + for shard_file in resolved_archieve_file: + pre_tensor_parallel_split = False + if model.config.tensor_parallel_degree > 1: + pre_tensor_parallel_split = True + tp_actions = prefix_model._get_tensor_parallel_convert_actions(is_split=True) + state_dict = load_state_dict( + shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys + ) + error_msgs += _load_state_dict_into_model(prefix_model.prefix_encoder, state_dict, "") + del state_dict + gc.collect() + + if len(error_msgs) > 0: + error_msgs = "\n\t".join(error_msgs) + raise RuntimeError( + f"Error(s) in loading state_dict for {prefix_model.__class__.__name__}:\n\t{error_msgs}" + ) + return prefix_model + + # define prefix weight name + if prefix_config_tensor_parallel_degree > 1: + prefix_weight_name = _add_variant(PREFIX_WEIGHTS_NAME, f"tp{model.config.tensor_parallel_rank:0>2d}") + else: + prefix_weight_name = PREFIX_WEIGHTS_NAME + + # load and set prefix weight parameter + prefix_weight_path = os.path.join(prefix_path, prefix_weight_name) + if os.path.exists(prefix_weight_path): + # load prefix weight parameter + prefix_state_dict = paddle.load(prefix_weight_path, return_numpy=True) + logger.info(f"Loading the prefix weights from {prefix_weight_path}") + + if ( + prefix_config_tensor_parallel_degree > 1 + and prefix_config_tensor_parallel_degree != model.config.tensor_parallel_degree + ): + raise NotImplementedError( + f"{prefix_config_tensor_parallel_degree} is not equal to {model.config.tensor_parallel_degree}. Please merge prefix weights first." 
+ ) + + # convert parameters to tensor parallel for mp model + if prefix_config_tensor_parallel_degree <= 1 and model.config.tensor_parallel_degree > 1: + prefix_state_dict = prefix_model._convert_tensor_parallel(prefix_state_dict=prefix_state_dict) + + # set prefix state dict + prefix_model.set_state_dict(prefix_state_dict) + else: + logger.error(f"prefix weights not found under {prefix_path}, creating prefix weights from scratch") + + return prefix_model + + def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = True, **kwargs): + variant = kwargs.get("variant", None) + is_main_process = kwargs.get("is_main_process", paddle.distributed.get_rank() == 0) + + assert not os.path.isfile( + save_directory + ), f"Saving directory ({save_directory}) should be a directory, not a file" + os.makedirs(save_directory, exist_ok=True) + + # past_key_values: (prefixlen, hidden_dim*layer_num*2) + past_key_values = self.prefix_encoder(self.prefix_tokens.unsqueeze(0).expand([1, -1])) + # (prefixlen, 2, layer_num, num_heads, head_dim) + past_key_values = past_key_values.reshape( + [ + self.prefix_config.num_prefix_tokens, + 2, + self.prefix_config.num_hidden_layers, + self.num_heads, + self.head_dim, + ] + ) + # (num_layers, 2, num_heads, prefixlen, head_dim) + past_key_values = paddle.transpose(past_key_values, perm=[2, 1, 3, 0, 4]).cpu().numpy() + + if merge_tensor_parallel and self.prefix_config.tensor_parallel_degree > 1: + trainable_state_dict = self.prefix_encoder.state_dict() + trainable_state_dict = self._merge_trainable_tensor_parallel(trainable_state_dict) + if not is_main_process: + logger.info("Saving with merge_tensor_parallel, tensor_parallel_rank > 0 don't need save") + return + variant = None + self.prefix_config.tensor_parallel_degree = -1 + else: + trainable_state_dict = self.prefix_encoder.state_dict() + if self.prefix_config.tensor_parallel_degree > 1: + if variant is None: + variant = f"tp{self.model.config.tensor_parallel_rank:0>2d}" + + # save prefix tuning weight + prefix_weight_name = _add_variant(PREFIX_WEIGHTS_NAME, variant) + weight_filename = os.path.join(save_directory, prefix_weight_name) + paddle.save(trainable_state_dict, weight_filename) + + # save prefix config & past key values + if is_main_process: + self.prefix_config.save_pretrained(save_directory) + np.save(os.path.join(save_directory, PAST_KEY_VALUES_FILE_NAME), past_key_values) + + if self.model.base_model_prefix == "chatglm_v2": + self.prefix_config.tensor_parallel_degree = -1 + else: + self.prefix_config.tensor_parallel_degree = self.model.config.tensor_parallel_degree + + def set_state_dict(self, state_dict): + self.prefix_encoder.set_state_dict(state_dict) + logger.info("Load prefix weight successfully") + + def _get_tensor_parallel_convert_actions(self, loaded_keys=None, is_split=False, ignore_error=False): + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=self.prefix_config.tensor_parallel_degree, + tensor_parallel_rank=self.model.config.tensor_parallel_rank, + num_attention_heads=self.model.config.num_attention_heads, + ) + + if self.prefix_config.prefix_projection: + name_action_mappings = { + "0.weight": partial(fn, is_column=False), + "1.weight": partial(fn, is_column=True), + "1.bias": partial(fn, is_column=True), + "3.weight": partial(fn, is_column=False), + } + else: + name_action_mappings = { + "0.weight": partial(fn, is_column=False), + } + return name_action_mappings + + def 
_merge_trainable_tensor_parallel(self, trainable_state_dict): + name_action_mappings = self._get_tensor_parallel_convert_actions(is_split=False) + hcg = paddle.distributed.fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + is_dst = paddle.distributed.get_rank(mp_group) == 0 + + for key in trainable_state_dict: + tensor = trainable_state_dict[key] + if key in name_action_mappings: + ret = distributed_gather(tensor, group=mp_group, offload=True) + action = name_action_mappings[key] + tensor = action(ret) if is_dst else None + trainable_state_dict[key] = tensor + else: + trainable_state_dict[key] = tensor.cpu().numpy() if is_dst else None + + return trainable_state_dict + + def _convert_tensor_parallel(self, prefix_state_dict): + name_action_mappings = self._get_tensor_parallel_convert_actions(is_split=True) + for name, action in name_action_mappings.items(): + tensor = prefix_state_dict.pop(name) + prefix_state_dict[name] = action(tensor) + return prefix_state_dict + + def save_to_aistudio( + self, + repo_id, + private=True, + license="Apache License 2.0", + exist_ok=True, + subfolder=None, + merge_tensor_parallel=False, + **kwargs + ): + """ + Uploads all elements of this model to a new AiStudio Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + token (str): Your token for the Hub. + private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True. + license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0". + exist_ok (bool, optional): Whether to override existing repository. Defaults to: True. + subfolder (str, optional): Push to a subfolder of the repo instead of the root + merge_tensor_parallel (bool): Whether to merge the tensor parallel weights. Defaults to False. + """ + res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs) + if "error_code" in res: + if res["error_code"] == 10003 and exist_ok: + logger.info( + f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False" + ) + else: + logger.error( + f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"Successfully created repo {repo_id}") + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir, merge_tensor_parallel=merge_tensor_parallel) + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + for filename in os.listdir(save_dir): + res = aistudio_sdk.hub.upload( + repo_id=repo_id, path_or_fileobj=os.path.join(save_dir, filename), path_in_repo=filename, **kwargs + ) + if "error_code" in res: + logger.error( + f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"{filename}: {res['message']}") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/utils.py new file mode 100644 index 000000000..505846845 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/prefix/utils.py @@ -0,0 +1,52 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +def bloom_postprocess_past_key_value(past_key_values): + # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 + keys, values = paddle.transpose(past_key_values, perm=[2, 0, 1, 3, 4]).split(2) + # keys: [layer_num, bs, head_num/tensor_parallel_degree, head_dim, prefixlen] + # value: [layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim] + # keys, values = past_key_values[0].transpose([0, 1, 2, 4, 3]), past_key_values[1] + return tuple(zip(keys, values)) + + +def chatglm_postprocess_past_key_value(past_key_values): + # (layer_num, prefixlen, bs, head_num/tensor_parallel_degree, head_dim)*2 + keys, values = paddle.transpose(past_key_values, perm=[2, 1, 0, 3, 4]).split(2) + + return tuple(zip(keys, values)) + + +def llama_postprocess_past_key_value(past_key_values): + # (layer_num, bs, prefixlen, head_num/tensor_parallel_degree, head_dim)*2 + keys, values = paddle.transpose(past_key_values, perm=[2, 0, 1, 3, 4]).split(2) + + return tuple(zip(keys, values)) + + +def mistral_postprocess_past_key_value(past_key_values): + # (layer_num, bs, head_num/tensor_parallel_degree, prefixlen, head_dim)*2 + keys, values = paddle.transpose(past_key_values, perm=[2, 0, 3, 1, 4]).split(2) + + return tuple(zip(keys, values)) + + +def qwen_postprocess_past_key_value(past_key_values): + # (layer_num, bs, prefixlen, head_num/tensor_parallel_degree, head_dim)*2 + keys, values = paddle.transpose(past_key_values, perm=[2, 0, 1, 3, 4]).split(2) + + return tuple(zip(keys, values)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/__init__.py new file mode 100644 index 000000000..2ba6e86f9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .vera_config import VeRAConfig +from .vera_layers import VeRALinear +from .vera_model import VeRAModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_config.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_config.py new file mode 100644 index 000000000..76f0d3a73 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_config.py @@ -0,0 +1,131 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from dataclasses import asdict, dataclass, field +from typing import List, Optional, Union + +from ...utils.env import VERA_CONFIG_NAME + + +@dataclass +class VeRAConfig: + """ + This is the configuration class to store the configuration of a [`VeRAModel`]. + Args: + r (`int`): vera attention dimension + target_modules (`Union[List[str],str]`): The names of the modules to apply vera to. + trainable_modules (`List[str]`): The names of the modules to train when applying vera. + vera_alpha (`float`): The alpha parameter for vera scaling. + vera_dropout (`float`): The dropout probability for vera layers. + """ + + r: int = field(default=8, metadata={"help": "vera attention dimension"}) + target_modules: Optional[Union[List[str], str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to replace with vera." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + trainable_modules: Optional[List[str]] = field( + default=None, + metadata={ + "help": "List of module names or regex expression of the module names to train when applying with vera." + "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' " + }, + ) + vera_alpha: int = field(default=8, metadata={"help": "vera alpha"}) + vera_dropout: float = field(default=0.0, metadata={"help": "vera dropout"}) + trainable_bias: Optional[str] = field( + default=None, metadata={"help": "Define trainable bias parameters for the vera model."} + ) + tensor_parallel_degree: int = field(default=-1, metadata={"help": "1 for not use tensor parallel"}) + dtype: Optional[str] = field(default=None, metadata={"help": "The data type of tensor"}) + head_dim: Optional[int] = field( + default=None, + metadata={ + "help": "The model multi head dimension.Only for veraMergedLinear and ColumnParallelveraMergedLinear." + }, + ) + do_qat: bool = field(default=False, metadata={"help": "Whether the vera model would do quant-aware training"}) + base_model_name_or_path: Optional[str] = field( + default=None, metadata={"help": "The name of the base model to use."} + ) + pissa_init: bool = field(default=False, metadata={"help": "Whether the vera weight initialized by pissa"}) + + @property + def __dict__(self): + return asdict(self) + + def to_dict(self): + return self.__dict__ + + def save_pretrained(self, save_directory): + r""" + This method saves the configuration of your adapter model in a directory. + Args: + save_directory (`str`): + The directory where the configuration will be saved. 
+ """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + output_dict = self.__dict__ + output_path = os.path.join(save_directory, VERA_CONFIG_NAME) + + # save it + with open(output_path, "w") as writer: + writer.write(json.dumps(output_dict, indent=2, sort_keys=True)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + This method loads the configuration of your adapter model from a directory. + Args: + pretrained_model_name_or_path (`str`): + The directory or the hub-id where the configuration is saved. + **kwargs: + Additional keyword arguments passed along to the child class initialization. + """ + if os.path.isfile(os.path.join(pretrained_model_name_or_path, VERA_CONFIG_NAME)): + config_file = os.path.join(pretrained_model_name_or_path, VERA_CONFIG_NAME) + else: + raise ValueError(f"Can't find vera_config.json at '{pretrained_model_name_or_path}'") + + loaded_attributes = cls.from_json_file(config_file) + + config = cls(**kwargs) + + for key, value in loaded_attributes.items(): + if hasattr(config, key): + setattr(config, key, value) + + return config + + @classmethod + def from_json_file(cls, path_json_file): + r""" + Loads a configuration file from a json file. + Args: + path_json_file (`str`): + The path to the json file. + """ + with open(path_json_file, "r") as file: + json_object = json.load(file) + + return json_object diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_layers.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_layers.py new file mode 100644 index 000000000..8bf478503 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_layers.py @@ -0,0 +1,149 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class VeRALinear(nn.Linear): + # VeRA implemented in a dense layer + def __init__( + self, + base_linear_module: paddle.nn.layer.common.Linear, + in_features: int, + out_features: int, + r: int = 0, + vera_alpha: int = 1, + vera_dropout: float = 0.0, + pissa_init: bool = False, + **kwargs + ): + nn.Linear.__init__(self, in_features, out_features, **kwargs) + self.weight.set_value(base_linear_module.weight) + + if not isinstance(r, int) or r <= 0: + raise ValueError("Vora rank r should be a positive integer") + self.r = r + self.vera_alpha = vera_alpha + # Optional dropout + if vera_dropout > 0.0: + self.vera_dropout = nn.Dropout(p=vera_dropout) + else: + self.vera_dropout = lambda x: x + # Mark the weight as unmerged + self.merged = False + + if pissa_init: + assert self.vera_alpha == self.r, "pissa method requires vera_alpha=r, scaling=1" + self.scaling = 1.0 + self.vera_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + ) + self.vera_B = self.create_parameter( + shape=[r, out_features], + dtype=self._dtype, + is_bias=False, + ) + self.pissa_init(r) + + else: + # Actual trainable parameters + self.vera_A = self.create_parameter( + shape=[in_features, r], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.KaimingUniform( + negative_slope=math.sqrt(5), nonlinearity="leaky_relu" + ), + ) + self.vera_B = self.create_parameter( + shape=[r, out_features], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.Constant(value=0.0), + ) + self.scaling = self.vera_alpha / self.r + + self.vera_b = self.create_parameter( + shape=[out_features], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.Constant(value=1.0), + ) + + self.vera_d = self.create_parameter( + shape=[r], + dtype=self._dtype, + is_bias=False, + default_initializer=nn.initializer.Constant(value=1.0), + ) + + # Freezing the pre-trained weight matrix and bias vector + self.weight.stop_gradient = True + + def pissa_init(self, r): + weight = self.weight + dtype = weight.dtype + + if dtype != paddle.float32: + weight = weight.astype(paddle.float32) + + U, S, Vh = paddle.linalg.svd(weight.data, full_matrices=False) + + Ur = U[:, :r] + Sr = S[:r] + Vhr = Vh[:r] + + vera_A = Ur @ paddle.diag(paddle.sqrt(Sr)) + vera_B = paddle.diag(paddle.sqrt(Sr)) @ Vhr + + self.vera_A.set_value(vera_A.astype(dtype)) + self.vera_B.set_value(vera_B.astype(dtype)) + res = weight.data - vera_A @ vera_B + weight = res.astype(dtype) + self.weight.set_value(weight) + + def merge(self): + if not self.merged: + diag_b = paddle.diag(self.vera_b) + diag_d = paddle.diag(self.vera_d) + new_weight = self.weight + self.vera_A @ diag_d @ self.vera_B @ diag_b * self.scaling + self.weight.set_value(new_weight) + self.merged = True + + def unmerge(self): + if self.merged: + diag_b = paddle.diag(self.vera_b) + diag_d = paddle.diag(self.vera_d) + new_weight = self.weight - self.vera_A @ diag_d @ self.vera_B @ diag_b * self.scaling + self.weight.set_value(new_weight) + self.merged = False + + def forward(self, input: paddle.Tensor, *args, **kwargs): + result = F.linear(x=input, weight=self.weight, bias=self.bias, name=self.name) + if not self.merged: + # result += (self.vera_dropout(input) @ self.vera_A @ self.vera_B) * self.scaling + diag_b = paddle.diag(self.vera_b) + diag_d = paddle.diag(self.vera_d) + result += (self.vera_dropout(input) @ self.vera_A @ diag_d @ self.vera_B @ 
diag_b) * self.scaling + return result + + def extra_repr(self): + name = f", name={self.name}" if self.name else "" + return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}" diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_model.py new file mode 100644 index 000000000..bfd00f07b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/peft/vera/vera_model.py @@ -0,0 +1,284 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import re +from collections import OrderedDict +from typing import Dict, Union + +import numpy as np +import paddle +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import PipelineLayer + +from ...transformers.model_utils import PretrainedModel, _add_variant, dtype_guard +from ...utils.env import VERA_WEIGHTS_NAME +from ...utils.log import logger +from .vera_config import VeRAConfig +from .vera_layers import VeRALinear + + +class VeRAModel(nn.Layer): + restore_layer_map: Dict[nn.Layer, nn.Layer] = { + VeRALinear: nn.Linear, + } + + def __init__(self, model, vera_config: VeRAConfig) -> None: + super().__init__() + self.quantized = False + self.vera_config = vera_config + if self.vera_config.dtype is None: + self.vera_config.dtype = paddle.get_default_dtype() + with dtype_guard(self.vera_config.dtype): + self.model = self.get_vera_model(model, vera_config) + self.is_pipelinemodel = False + if issubclass(type(self.model), PipelineLayer): + raise NotImplementedError("vera don't support pipeline parallel now") + if vera_config.tensor_parallel_degree > 1: + raise NotImplementedError("vera don't support tensor parallel now") + self.forward = self.model.forward + + @classmethod + def from_pretrained(cls, model, vera_path, **kwargs): + vera_config = kwargs.pop("vera_config", None) + # init vera config & vera model + if not isinstance(vera_config, VeRAConfig): + vera_config = VeRAConfig.from_pretrained(vera_path) + # define a new variable to conserve original vera_config.tensor_parallel_degree value which will update while initializing vera model + vera_config_tensor_parallel_degree = vera_config.tensor_parallel_degree + vera_model = cls(model, vera_config) + + vera_weight_name = VERA_WEIGHTS_NAME + + # load and set vera weight parameter + vera_weight_path = os.path.join(vera_path, vera_weight_name) + logger.info(f"vera weight path is {vera_weight_path}") + if os.path.exists(vera_weight_path): + # load vera weight parameter + logger.info("vera_weight_path existed, loading vera weight parameter") + + vera_state_dict = paddle.load(vera_weight_path, return_numpy=True) + logger.info(f"Loading the VeRA weights from {vera_weight_path}") + + if ( + vera_config_tensor_parallel_degree > 1 + and vera_config_tensor_parallel_degree != model.config.tensor_parallel_degree + ): + raise 
NotImplementedError( + f"{vera_config_tensor_parallel_degree} is not equal to {model.config.tensor_parallel_degree}. Please merge VeRA weights first." + ) + + # set vera state dict + vera_model.set_state_dict(vera_state_dict) + else: + logger.error(f"VeRA weights not found under {vera_path}, creating VeRA weights from scratch") + + return vera_model + + def set_state_dict(self, state_dict): + import warnings + + warnings.filterwarnings( + action="ignore", message=".*Skip loading for.*", category=Warning, lineno=0, append=False + ) + self.model.set_state_dict(state_dict) + logger.info("Load vera weight successfully") + + def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = False, **kwargs): + + logger.info("save vera pretrained") + save_model_config = kwargs.get("save_model_config", True) + + if self.is_pipelinemodel: + self.model._single_to_pp_mapping = None + if self.quantized and merge_tensor_parallel and self.vera_config.tensor_parallel_degree > 1: + merge_tensor_parallel = False + logger.warning( + "Quantized strategy does not support merge_tensor_parallel. Set merge_tensor_parallel to False." + ) + if self.is_pipelinemodel and merge_tensor_parallel and self.vera_config.tensor_parallel_degree > 1: + merge_tensor_parallel = False + logger.warning( + "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False." + ) + + variant = kwargs.get("variant", None) + is_main_process = kwargs.get("is_main_process", paddle.distributed.get_rank() == 0) + + assert not os.path.isfile( + save_directory + ), f"Saving directory ({save_directory}) should be a directory, not a file" + os.makedirs(save_directory, exist_ok=True) + + vera_config_to_save = VeRAConfig(**self.vera_config.to_dict()) + + logger.info(f"vera config to save is {vera_config_to_save}") + + trainable_state_dict = self.get_trainable_state_dict() + + # save vera weight + vera_weight_name = _add_variant(VERA_WEIGHTS_NAME, variant) + weight_filename = os.path.join(save_directory, vera_weight_name) + paddle.save(trainable_state_dict, weight_filename) + + # save vera config + if is_main_process: + vera_config_to_save.save_pretrained(save_directory) + if save_model_config: + model_config_to_save = copy.deepcopy(self.model.config) + if merge_tensor_parallel: + model_config_to_save.tensor_parallel_degree = -1 + model_config_to_save.save_pretrained(save_directory) + + def _find_and_replace_module(self, model, module_name, vera_config, enable_vera): + parent_module = model + attribute_chain = module_name.split(".") + for name in attribute_chain[:-1]: + parent_module = getattr(parent_module, name) + module = getattr(parent_module, attribute_chain[-1]) + vera_module = None + if enable_vera is None: + if isinstance(module, nn.Linear): + vera_module = VeRALinear( + # pass the base linear module + base_linear_module=module, + in_features=module.weight.shape[0], + out_features=module.weight.shape[1], + r=vera_config.r, + vera_alpha=vera_config.vera_alpha, + vera_dropout=vera_config.vera_dropout, + bias_attr=False if module.bias is None else None, + pissa_init=vera_config.pissa_init, + ) + + if vera_module is None: + raise ValueError( + f"VeRA strategy only supports paddle.nn.Linear or paddle.distributed.fleet.meta_parallel.ColumnParallelLinear. 
{module}({module_name}) is not supported。" + ) + + if module.bias is not None: + vera_module.bias = module.bias + + setattr(parent_module, attribute_chain[-1], vera_module) + + def _find_and_restore_module(self, module_name): + parent_module = self.model + attribute_chain = module_name.split(".") + for name in attribute_chain[:-1]: + parent_module = getattr(parent_module, name) + module = getattr(parent_module, attribute_chain[-1]) + original_model_class = self.restore_layer_map[module.__class__] + original_module = original_model_class(in_features=module.weight.shape[0], out_features=module.weight.shape[1]) + original_module.weight = module.weight + if module.bias is not None: + original_module.bias = module.bias + setattr(parent_module, attribute_chain[-1], original_module) + + def get_trainable_state_dict(self): + trainable_state_dict = OrderedDict() + for name, weight in self.model.state_dict().items(): + # get vera parameter + if not weight.stop_gradient: + trainable_state_dict[name] = weight + return trainable_state_dict + + def print_trainable_parameters(self) -> None: + freeze_numel = 0 + trainable_numel = 0 + for _, weight in self.model.state_dict().items(): + if weight.stop_gradient: + freeze_numel += np.prod(weight.shape) + else: + trainable_numel += np.prod(weight.shape) + logger.debug( + f"Frozen parameters: {freeze_numel:.2e} || Trainable parameters:{trainable_numel:.2e} || Total parameters:{freeze_numel+trainable_numel:.2e}|| Trainable:{trainable_numel / (freeze_numel+trainable_numel):.2%}" + ) + + def mark_only_vera_as_trainable(self, notfreezeB=False) -> None: + for _, layer in self.model.named_sublayers(): + if isinstance(layer, VeRALinear): + for name, weight in layer.state_dict().items(): + if self.vera_config.trainable_bias in ["vera", "all"] and "bias" in name: + weight.stop_gradient = False + elif "vera" in name: + # notfreezeB=True, vera_b, vera_d, vera_B is trainable + # notfreezeB=False, vera_b, vera_d is trainable + if "vera_b" in name or "vera_d" in name: + weight.stop_gradient = False + elif "vera_B" in name and notfreezeB: + weight.stop_gradient = False + else: + weight.stop_gradient = True + else: + weight.stop_gradient = True + else: + for name, weight in layer.state_dict().items(): + if self.vera_config.trainable_bias == "all" and "bias" in name: + weight.stop_gradient = False + else: + weight.stop_gradient = True + if self.vera_config.trainable_modules is not None: + for name, weight in self.model.state_dict().items(): + if any( + re.fullmatch(trainable_module, name) for trainable_module in self.vera_config.trainable_modules + ): + weight.stop_gradient = False + + def get_vera_model(self, model: Union[PretrainedModel, nn.Layer], vera_config: VeRAConfig): + + if vera_config.target_modules is None: + return model + elif isinstance(vera_config.target_modules, str): + target_modules = [vera_config.target_modules] + enable_vera_list = [None] + else: + target_modules = vera_config.target_modules + enable_vera_list = [None for _ in range(len(target_modules))] + + for target_module, enable_vera in zip(target_modules, enable_vera_list): + for i in model.named_sublayers(): + module_name = i[0] + if re.fullmatch(target_module, module_name): + self._find_and_replace_module(model, module_name, vera_config, enable_vera) + return model + + def restore_original_model(self): + for layer_name, layer in self.model.named_sublayers(): + if isinstance(layer, VeRALinear): + self._find_and_restore_module(layer_name) + else: + raise NotImplementedError(f"{layer} restoration is not 
supported yet.") + return self.model + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Layer's logic + except AttributeError: + return getattr(self.model, name) + + def train(self): + self.training = True + self.model.training = True + for layer in self.model.sublayers(): + layer.training = True + layer.train() + + def eval(self): + self.training = False + self.model.training = False + for layer in self.model.sublayers(): + layer.training = False + layer.eval() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/__init__.py new file mode 100644 index 000000000..9df588e4b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .prompt_args import * +from .prompt_model import * +from .prompt_tokenizer import * +from .prompt_trainer import * +from .prompt_utils import * +from .template import * +from .verbalizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_args.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_args.py new file mode 100644 index 000000000..b1158e107 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_args.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +from ..trainer import TrainingArguments +from ..utils.log import logger + +__all__ = ["PromptTuningArguments"] + + +@dataclass +class PromptTuningArguments(TrainingArguments): + """ + The arguments' subset for training loop during prompt tuning. + """ + + max_seq_length: int = field(default=512, metadata={"help": "The maximum length of all input text."}) + freeze_plm: bool = field( + default=False, metadata={"help": "If True, the pretrained parameters won't be " "updated during tuning."} + ) + freeze_dropout: bool = field( + default=False, + metadata={ + "help": "If True, pretrained parameters won't be updated " "during tuning and the dropout is disabled." 
+ }, + ) + save_plm: bool = field(default=False, metadata={"help": "Whether to save pretrained model."}) + use_rdrop: bool = field( + default=False, + metadata={ + "help": "Use R-Drop regularization strategy." + "Please refer to the paper for more details: " + "https://arxiv.org/abs/2106.14448." + }, + ) + alpha_rdrop: float = field(default=5.0, metadata={"help": "The KL-divergence loss weight alpha in R-Drop."}) + use_rgl: bool = field( + default=False, + metadata={ + "help": "Use label consistency to boost tuning performance." + "Please refer to the paper for more details: " + "https://aclanthology.org/2022.findings-naacl.81/." + }, + ) + alpha_rgl: float = field(default=0.5, metadata={"help": "The weight of label consistency loss in RGL."}) + + ppt_learning_rate: float = field( + default=1e-4, metadata={"help": "The initial learning rate of prompt parameters."} + ) + ppt_weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for the AdamW optimizer of prompt."}) + ppt_adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for the AdamW optimizer of prompt."}) + ppt_adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for the AdamW optimizer of prompt."}) + ppt_adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for the AdamW optimizer of prompt."}) + + def __post_init__(self): + super(PromptTuningArguments, self).__post_init__() + if self.use_rgl and self.alpha_rgl == 0.0: + logger.warning( + "Ignore `use_rgl` because `alpha_rgl` = 0. Please " "set `alpha_rgl` a positive float to use RGL loss." + ) + self.use_rgl = False + + if self.use_rdrop and self.alpha_rdrop == 0.0: + logger.warning( + "Ignore `use_rdrop` because `alpha_rdrop` = 0. Please " + "set `alpha_rdrop` a positive float to use R-Drop." + ) + self.use_rdrop = False + + if self.freeze_dropout: + self.freeze_plm = True diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_model.py new file mode 100644 index 000000000..496662d5c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_model.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Dict, Optional + +import paddle +from paddle.static import InputSpec + +from ..transformers.model_outputs import ( + MaskedLMOutput, + MultipleChoiceModelOutput, + SequenceClassifierOutput, +) +from .prompt_utils import signature +from .template import PrefixTemplate, Template +from .verbalizer import Verbalizer + + +class PromptModelForSequenceClassification(paddle.nn.Layer): + """ + PromptModel for classification tasks. 
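    It combines a pretrained model, a template and an optional verbalizer so that
    prompt-formatted batches can be fed to the underlying model in a single forward pass.

    Example (an illustrative sketch; the checkpoint name and the prompt string are
    placeholders rather than values required by this class):

    ```python
    from paddlenlp.prompt import AutoTemplate, PromptModelForSequenceClassification
    from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

    model = AutoModelForSequenceClassification.from_pretrained("ernie-3.0-base-zh")
    tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
    # Wrap the `text_a` field of each example with a hard prompt and a mask slot.
    template = AutoTemplate.create_from(
        "{'text': 'text_a'}这句话表达的情感是{'mask'}", tokenizer, max_length=128, model=model
    )
    # With a sequence-classification head no verbalizer is needed; a verbalizer is only
    # required when the wrapped model uses a MaskedLM head.
    prompt_model = PromptModelForSequenceClassification(
        model, template, verbalizer=None, freeze_plm=False
    )
    ```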
+ """ + + def __init__( + self, + model: paddle.nn.Layer, + template: Template, + verbalizer: Optional[Verbalizer] = None, + freeze_plm: bool = False, + freeze_dropout: bool = False, + ): + super(PromptModelForSequenceClassification, self).__init__() + self.plm = model + self.template = template + self.verbalizer = verbalizer + self.freeze_plm = freeze_plm + self.freeze_dropout = freeze_dropout + if self.freeze_plm: + for param in self.plm.parameters(): + param.stop_gradient = True + if self.freeze_dropout: + self.plm.eval() + self.forward_keys = signature(self.plm.forward) + self._mask_token_id = self.template.tokenizer.mask_token_id + self._pad_token_id = self.template.tokenizer.pad_token_id + if isinstance(self.template, PrefixTemplate): + self.plm = self.template.process_model(self.plm) + self.forward_keys.append("past_key_values") + + def forward( + self, + input_ids: paddle.Tensor, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + masked_positions: Optional[paddle.Tensor] = None, + soft_token_ids: Optional[paddle.Tensor] = None, + encoder_ids: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + **kwargs: Dict[str, Any] + ): + return_dict = return_dict if return_dict is not None else False + return_hidden_states = kwargs.get("return_hidden_states", False) + input_dict = { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "masked_positions": masked_positions, + "soft_token_ids": soft_token_ids, + "attention_mask": attention_mask, + "encoder_ids": encoder_ids, + **kwargs, + } + input_dict = self.template.process_batch(input_dict) + input_dict = {**input_dict, **kwargs} + model_inputs = {k: input_dict[k] for k in input_dict if k in self.forward_keys} + if "masked_positions" in model_inputs: + model_inputs.pop("masked_positions") + model_outputs = self.plm(**model_inputs, return_dict=True) + if isinstance(model_outputs, MaskedLMOutput): + if self.verbalizer is not None: + logits = self.verbalizer.process_outputs(model_outputs.logits, input_dict["masked_positions"]) + num_labels = len(self.verbalizer.label_words) + else: + raise Exception("Verbalizer is required when model uses the MaskedLM head") + elif isinstance(model_outputs, SequenceClassifierOutput): + logits = model_outputs.logits + num_labels = self.plm.num_labels if self.plm.num_labels is not None else self.plm.num_labels + elif isinstance(model_outputs, MultipleChoiceModelOutput): + logits = model_outputs.logits + num_labels = -1 + else: + raise Exception(f"Model type not support yet: {type(model_outputs)}") + + loss = None + if labels is not None: + if num_labels == 1: + loss_fct = paddle.nn.MSELoss() + loss = loss_fct(logits, labels) + elif num_labels > 0 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, num_labels)), labels.reshape((-1,))) + else: + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + if return_hidden_states: + output = output + (model_outputs.logits,) + if loss is not None: + return (loss,) + output + if isinstance(output, (list, tuple)) and len(output) == 1: + output = output[0] + return output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.logits, + ) + + def prompt_parameters(self): + """ 
+ Get the parameters of template and verbalizer. + """ + params = [p for p in self.template.parameters()] + if self.verbalizer is not None: + params += [p for p in self.verbalizer.parameters()] + return params + + def get_input_spec(self): + template_keywords = self.template.extract_template_keywords(self.template.prompt) + input_spec = [ + InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + InputSpec(shape=[None, None], dtype="int64", name="position_ids"), + InputSpec(shape=[None, None, None, None], dtype="float32", name="attention_mask"), + ] + if "mask" in template_keywords: + input_spec.append(InputSpec(shape=[None], dtype="int64", name="masked_positions")) + if "soft" in template_keywords: + # Add placeholder for argument `masked_positions` if not exists. + if "mask" not in template_keywords: + input_spec.append(None) + input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="soft_token_ids")) + if "encoder" in template_keywords: + input_spec.append(InputSpec(shape=[None, None], dtype="int64", name="encoder_ids")) + return input_spec diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_tokenizer.py new file mode 100644 index 000000000..8e41162c5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_tokenizer.py @@ -0,0 +1,224 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from collections import defaultdict +from typing import Any, Dict, List, Union + +import numpy as np + +from paddlenlp.utils.log import logger + +__all__ = ["MLMPromptTokenizer"] + + +class MLMPromptTokenizer(object): + + omask_token = "[O-MASK]" + + def __init__(self, tokenizer, max_length): + self.tokenizer = tokenizer + self.max_length = max_length + + def __call__(self, inputs: List[Dict[str, Any]]): + part_do_truncate = [part["do_truncate"] for part in inputs] + + encoded_inputs = defaultdict(list) + option_length = None + last_position = 1 # Id 0 denotes special token '[CLS]'. + last_token_type = 0 + orig_input_ids = [] + for index, part in enumerate(inputs): + # Create input_ids. + soft_token_ids = part.get("soft_tokens", None) + if soft_token_ids is None or len(soft_token_ids) == 1 and soft_token_ids[0] == 0: + orig_input_ids.append( + self.tokenizer.encode(part["text"], add_special_tokens=False, return_token_type_ids=False)[ + "input_ids" + ] + ) + else: + orig_input_ids.append(soft_token_ids) + max_lengths = self._create_max_lengths_from_do_truncate(orig_input_ids, part_do_truncate) + + for index, part in enumerate(inputs): + # Create input_ids. 
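            # (Second pass: the loop above only collected the raw token ids so that
            # `_create_max_lengths_from_do_truncate` could assign a truncation budget to
            # each part; here every text part is cut to its budget before the remaining
            # features are built.)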
+ soft_token_ids = part.get("soft_tokens", None) + if soft_token_ids is None or len(soft_token_ids) == 1 and soft_token_ids[0] == 0: + if self.tokenizer.truncation_side == "left": + input_ids = orig_input_ids[index][-max_lengths[index] :] + else: + input_ids = orig_input_ids[index][: max_lengths[index]] + encoded_inputs["soft_token_ids"].append([0] * len(input_ids)) + else: + input_ids = soft_token_ids + encoded_inputs["soft_token_ids"].append(soft_token_ids) + encoded_inputs["input_ids"].append(input_ids) + part_length = len(input_ids) + + # Create position_ids. + position_ids, last_position = self._create_position_ids_from_part(input_ids, part, last_position) + encoded_inputs["position_ids"].append(position_ids) + + # Create token_type_ids. + if "token_types" in part: + last_token_type = part["token_types"] + encoded_inputs["token_type_ids"].append([last_token_type] * part_length) + + # Create other features like encoder_ids. + for name in part: + if name not in ["text", "soft_tokens", "positions", "token_types"]: + encoded_inputs[name].append([part[name]] * part_length) + + # Record the length of options if exists. + if self.omask_token in part["text"]: + option_length = len(input_ids) + + encoded_inputs.pop("do_truncate") + encoded_inputs = self.join(encoded_inputs) + encoded_inputs = self.add_special_tokens(encoded_inputs) + attention_mask = self._create_attention_mask(encoded_inputs["input_ids"], option_length) + if attention_mask is not None: + encoded_inputs["attention_mask"] = attention_mask + masked_positions = self._create_masked_positions(encoded_inputs["input_ids"], encoded_inputs["soft_token_ids"]) + if masked_positions is not None: + encoded_inputs["masked_positions"] = masked_positions + return encoded_inputs + + def _create_position_ids_from_part(self, input_ids: List[int], part: Dict[str, Any], last_position: int): + """ + Create position ids from prompt for each part. + """ + part_length = len(input_ids) + if "positions" in part and part["positions"] >= 0: + last_position = part["positions"] + if self.omask_token in part["text"]: + omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token) + omask_index = [x for x in range(part_length) if input_ids[x] == omask_id] + omask_index = [0] + omask_index + position_ids = [] + max_index = 0 + for start_id, end_id in zip(omask_index[:-1], omask_index[1:]): + position_ids.extend(list(range(last_position, last_position + end_id - start_id))) + max_index = max(end_id - start_id, max_index) + if len(position_ids) < part_length: + difference = part_length - len(position_ids) + position_ids.extend(range(last_position, last_position + difference)) + max_index = max(difference, max_index) + last_position += max_index + else: + position_ids = list(range(last_position, last_position + part_length)) + last_position += part_length + return position_ids, last_position + + def _create_max_lengths_from_do_truncate(self, part_text: List[str], part_do_truncate: List[bool]): + """ + Create the max sequence length of each part, where the longest part is truncated first. + """ + text_length = sum([len(x) for x in part_text]) + num_special_token = self.tokenizer.num_special_tokens_to_add() + max_length = self.max_length - num_special_token + if text_length <= max_length: + return [None] * len(part_text) + max_lengths = [None for _ in range(len(part_text))] + do_truncate = [int(x) for x in part_do_truncate] + + # Remove parts that can not be truncated. 
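        # Budget summary: parts that must stay intact are paid for first, then any
        # truncatable part already shorter than the running average keeps its full
        # length, and the leftover budget is split among the parts that are still too
        # long. E.g. with a budget of 10 tokens and truncatable parts of lengths
        # [8, 3, 2], the parts of length 3 and 2 are kept whole and the first part is
        # cut to the remaining 5 tokens, giving [5, 3, 2].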
+ for index, part in enumerate(part_text): + if not part_do_truncate[index]: + max_length -= len(part) + else: + max_lengths[index] = len(part) + if sum(do_truncate) == 0: + logger.warning( + f"Can not truncate the sequence with length {text_length}. Set more `truncate` attributes as True." + ) + return max_lengths + + # Remove parts whose length is less than average maximum length of parts to truncate. + has_short = True + while has_short: + has_short = False + avg_max_length = max_length // sum(do_truncate) + for index, part in enumerate(part_text): + if do_truncate[index] == 1 and len(part) <= avg_max_length: + do_truncate[index] = 0 + max_lengths[index] = len(part) + max_length -= len(part) + has_short = True + if max_length < 0: + raise AssertionError("Actual length has exceeded the maximum length. Check the implementation.") + avg_max_length = max_length // sum(do_truncate) + for index in range(len(part_text)): + if do_truncate[index] == 1: + max_lengths[index] = min(avg_max_length, max_length) + max_length -= max_lengths[index] + if max_length < 0: + raise AssertionError("Actual length has exceeded the maximum length. Check the implementation.") + return max_lengths + + def _create_attention_mask(self, input_ids: List[int], option_length: Union[int, None]): + if option_length is None: + return None + omask_id = self.tokenizer.convert_tokens_to_ids(self.omask_token) + input_ids = np.array(input_ids) + attention_mask = np.ones([len(input_ids), len(input_ids)]) + omask_index = np.where(input_ids == omask_id)[0].tolist() + cls_indices = np.where(input_ids == self.tokenizer.cls_token_id)[0] + sep_indices = np.where(input_ids == self.tokenizer.sep_token_id)[0] + cls_index = len(input_ids) + for idx in cls_indices: + if idx > omask_index[-1]: + cls_index = idx + break + sep_index = len(input_ids) + for idx in sep_indices: + if idx > omask_index[-1]: + sep_index = idx + break + opt_begin = omask_index[0] + opt_end = min(cls_index, sep_index) + attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 0 + omask_index.append(opt_end) + for opt_begin, opt_end in zip(omask_index[:-1], omask_index[1:]): + attention_mask[opt_begin:opt_end, opt_begin:opt_end] = 1 + attention_mask = (attention_mask - 1) * 1e4 + return attention_mask + + def _create_masked_positions(self, input_ids: List[int], soft_token_ids: List[int]): + non_soft_ids = np.array(input_ids) * (np.array(soft_token_ids) == 0) + mask_id = self.tokenizer.mask_token_id + + masked_positions = np.where(non_soft_ids == mask_id)[0] + if masked_positions.shape[0] == 0: + return None + return masked_positions.tolist() + + def add_special_tokens(self, input_dict: Dict[str, Any]): + for key in input_dict: + new_inputs = self.tokenizer.build_inputs_with_special_tokens(input_dict[key]) + if key != "input_ids": + special_mask = np.array(self.tokenizer.get_special_tokens_mask(input_dict[key])) + new_inputs = np.array(new_inputs) + # TODO (Huijuan): Use different ids according to specific keyword. 
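                # For now, every feature other than `input_ids` is filled with 0 at the
                # positions where `build_inputs_with_special_tokens` inserted special
                # tokens such as [CLS] and [SEP].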
+ new_inputs[special_mask == 1] = 0 + new_inputs = new_inputs.tolist() + input_dict[key] = new_inputs + return input_dict + + @staticmethod + def join(input_dict): + for key in input_dict: + input_dict[key] = list(itertools.chain(*input_dict[key])) + return input_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_trainer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_trainer.py new file mode 100644 index 000000000..2e8c97023 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_trainer.py @@ -0,0 +1,316 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Any, Callable, Dict, List, Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.io import DataLoader, Dataset + +from ..data import DataCollator +from ..datasets import MapDataset +from ..losses import RDropLoss +from ..trainer import Trainer, TrainerCallback +from ..trainer.trainer_utils import EvalPrediction, get_scheduler +from ..transformers import PretrainedTokenizer, export_model +from ..utils.log import logger +from .prompt_args import PromptTuningArguments +from .prompt_utils import PromptDataCollatorWithPadding +from .template import AutoTemplate +from .verbalizer import SoftVerbalizer + + +class PromptTrainer(Trainer): + """ + PromptTrainer is a feature-complete training and eval loop for PaddleNLP + on prompt-tuning. 
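    Compared with `Trainer`, it maps raw examples through the template, pads the extra
    prompt features, and optionally applies the R-Drop / RGL auxiliary losses.

    Example (an illustrative sketch; the checkpoint name, the prompt string and the toy
    dataset are placeholders):

    ```python
    from paddlenlp.datasets import MapDataset
    from paddlenlp.prompt import (
        AutoTemplate,
        PromptModelForSequenceClassification,
        PromptTrainer,
        PromptTuningArguments,
    )
    from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer

    model = AutoModelForSequenceClassification.from_pretrained("ernie-3.0-base-zh")
    tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh")
    template = AutoTemplate.create_from(
        "{'text': 'text_a'}这句话的情感是{'mask'}", tokenizer, max_length=128, model=model
    )
    prompt_model = PromptModelForSequenceClassification(model, template, verbalizer=None)

    # A tiny in-memory dataset; examples must provide the template field and `labels`.
    train_ds = MapDataset(
        [{"text_a": "这家酒店环境很好", "labels": 1}, {"text_a": "物流太慢了", "labels": 0}]
    )
    args = PromptTuningArguments(output_dir="./prompt_ckpt", max_steps=10, ppt_learning_rate=1e-4)
    trainer = PromptTrainer(
        model=prompt_model, tokenizer=tokenizer, args=args, train_dataset=train_ds
    )
    trainer.train()
    ```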
+ """ + + def __init__( + self, + model: nn.Layer, + tokenizer: PretrainedTokenizer, + criterion: Optional[nn.Layer] = None, + args: Optional[PromptTuningArguments] = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[MapDataset] = None, + eval_dataset: Optional[MapDataset] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler] = (None, None), + ): + if args is None: + output_dir = "tmp_trainer" + logger.info( + "No `TrainingArguments` passed, initialized with " "output_dir={} by default.".format(output_dir) + ) + args = PromptTuningArguments(output_dir=output_dir) + + if data_collator is None: + data_collator = PromptDataCollatorWithPadding(tokenizer, padding=True, return_tensors="pd") + + if criterion is None and (args.use_rgl or args.use_rdrop): + raise Exception("'To use 'use_rgl', 'use_rdrop', 'criterion' must be specified") + + super(PromptTrainer, self).__init__( + model=model, + criterion=criterion, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + ) + + self._load_from_checkpoint(args.resume_from_checkpoint) + + self.train_dataset = self._map_dataset(self.train_dataset) + self.eval_dataset = self._map_dataset(self.eval_dataset) + + if self.args.use_rdrop: + self.rdrop_criterion = RDropLoss() + + def _get_model(self): + model = self.model + if isinstance(model, paddle.DataParallel): + model = model._layers + return model + + @property + def template(self): + return self._get_model().template + + @template.setter + def template(self, template): + self._get_model().template = template + + @property + def verbalizer(self): + return self._get_model().verbalizer + + @verbalizer.setter + def verbalizer(self, verbalizer): + self._get_model().verbalizer = verbalizer + + @property + def pretrained_model(self): + return self._get_model().plm + + @pretrained_model.setter + def pretrained_model(self, model): + setattr(self._get_model(), "plm", model) + + def _map_dataset(self, dataset: MapDataset): + if dataset is None: + return None + if not isinstance(dataset, MapDataset): + raise ValueError("Expected `MapDataset` but received {}.".format(type(dataset))) + + def encode_with_template(example): + return self.template(example) + + return dataset.map(encode_with_template) + + def _prepare_input(self, inputs: Dict): + return inputs + + def _save( + self, + output_dir: Optional[str] = None, + state_dict: Dict[str, Any] = None, + merge_tensor_parallel: Optional[bool] = True, + ): + super(PromptTrainer, self)._save(output_dir, state_dict, merge_tensor_parallel) + output_dir = output_dir if output_dir is not None else self.args.output_dir + if self.template: + self.template.save(output_dir) + if self.verbalizer is not None: + self.verbalizer.save(output_dir) + if self.args.save_plm: + plm_output_dir = os.path.join(output_dir, "plm") + os.makedirs(plm_output_dir, exist_ok=True) + self.pretrained_model.save_pretrained(plm_output_dir) + + def _load_from_checkpoint(self, resume_from_checkpoint: os.PathLike = None): + if resume_from_checkpoint is not None: + self.template = AutoTemplate.load_from( + resume_from_checkpoint, self.tokenizer, self.args.max_seq_length, self._get_model().plm + ) + super(PromptTrainer, self)._load_from_checkpoint(resume_from_checkpoint) 
+ + def get_test_dataloader(self, test_dataset): + test_dataset = self._map_dataset(test_dataset) + return super(PromptTrainer, self).get_test_dataloader(test_dataset) + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + if eval_dataset is not None: + eval_dataset = self._map_dataset(eval_dataset) + return super(PromptTrainer, self).get_eval_dataloader(eval_dataset) + + def create_optimizer(self, lr_scheduler=None): + """ + Setup the optimizer for both model and prompt parameters. + """ + if self.optimizer is None: + optim_cls, optim_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) + + plm_parameters = [] + if not self.args.freeze_plm: + plm_parameters.extend([p for p in self._get_model().plm.parameters() if not p.stop_gradient]) + + ppt_parameters = [] + if self.template is not None: + ppt_parameters.extend([x for n, x in self.template.named_parameters() if not x.stop_gradient]) + if self.verbalizer is not None: + if isinstance(self.verbalizer, SoftVerbalizer): + if not self.args.freeze_plm: + plm_parameters.extend( + [p for n, p in self.verbalizer.non_head_parameters() if not p.stop_gradient] + ) + ppt_parameters.extend([p for n, p in self.verbalizer.head_parameters()]) + else: + ppt_parameters.extend([p for n, p in self.verbalizer.parameters()]) + + decay_parameters = [ + p.name for n, p in self._get_model().named_parameters() if not any(nd in n for nd in ["bias", "norm"]) + ] + + if len(plm_parameters) > 0: + ppt_lr = self.args.ppt_learning_rate / self.args.learning_rate + lr = self.lr_scheduler if lr_scheduler is None else lr_scheduler + if len(ppt_parameters) > 0: + params = [ + {"params": plm_parameters}, + { + "params": ppt_parameters, + "learning_rate": ppt_lr, + "weight_decay": self.args.ppt_weight_decay, + "beta1": self.args.ppt_adam_beta1, + "beta2": self.args.ppt_adam_beta2, + "epsilon": self.args.ppt_adam_epsilon, + }, + ] + else: + params = plm_parameters + else: + if self.args.max_steps > 0: + max_steps = self.args.max_steps + else: + raise ValueError("Please use `max_steps` to set the maximum training steps.") + warmup = ( + self.args.warmup_steps if self.args.warmup_steps > 0 else int(self.args.warmup_ratio * max_steps) + ) + self.lr_scheduler = get_scheduler( + self.args.lr_scheduler_type, + learning_rate=self.args.ppt_learning_rate, + num_warmup_steps=warmup, + num_training_steps=max_steps, + ) + lr = self.lr_scheduler + params = ppt_parameters + + self.optimizer = optim_cls( + learning_rate=lr, + apply_decay_param_fun=lambda x: x in decay_parameters, + parameters=params, + weight_decay=self.args.weight_decay, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm), + **optim_kwargs, + ) + + return self.optimizer + + def compute_loss(self, model, inputs, return_outputs=False): + """ + Compute the total loss for every batch. 
+ """ + if "labels" not in inputs: + raise ValueError("Fail to compute loss as `labels` not in {}.".format(inputs)) + labels = inputs["labels"] + + input_dict = inputs.copy() + + if self.criterion is not None: + # pop labels to move loss computation out of the model + input_dict.pop("labels") + input_dict["return_hidden_states"] = True + logits, hidden_states = model(**input_dict) + loss = self.criterion(logits, labels) + + if self.args.use_rdrop: + loss = self._compute_rdrop_loss(model, input_dict, labels, logits, loss) + + if self.args.use_rgl: + loss += self._compute_rgl_loss(hidden_states, labels) + else: + loss, logits = model(**input_dict) + + outputs = (loss, logits) + + return (loss, outputs) if return_outputs else loss + + def _compute_rdrop_loss(self, model, input_dict, labels, outputs, loss): + re_outputs, _ = model(**input_dict) + ce_loss = (self.criterion(re_outputs, labels) + loss) * 0.5 + kl_loss = self.rdrop_criterion(outputs, re_outputs) + loss = ce_loss + self.args.alpha_rdrop * kl_loss + return loss + + def _compute_rgl_loss(self, embeddings, labels, equal_type="raw"): + """ + Compute the label consistency loss of sentence embeddings per batch. + Please refer to https://aclanthology.org/2022.findings-naacl.81/ + for more details. + """ + + def _max_equal(x, y): + return int(paddle.argmax(x, axis=0) == paddle.argmax(y, axis=0)) + + def _raw_equal(x, y): + return int(x == y) + + if equal_type == "raw": + equals = _raw_equal + elif equal_type == "max": + equals = _max_equal + else: + raise ValueError("Unsupported equal type {}.".format(equal_type)) + batch_size = embeddings.shape[0] + loss = 0 + for i in range(batch_size): + for j in range(batch_size): + score = F.cosine_similarity(embeddings[i], embeddings[j], axis=0) + score = score.unsqueeze(0) + logits = paddle.concat([(1 - score) * 50, (1 + score) * 50], axis=-1) + label = paddle.to_tensor([equals(labels[i], labels[j])]) + logits = logits.reshape([-1, logits.shape[-1]]) + loss += F.cross_entropy(logits, label.unsqueeze(0)) + loss = loss / (batch_size * (batch_size - 1)) + loss = loss / 100 * self.args.alpha_rgl + + return loss + + def export_model(self, export_path, input_spec=None, export_type="paddle"): + os.makedirs(export_path, exist_ok=True) + self.template.save(export_path) + if self.verbalizer is not None: + self.verbalizer.save(export_path) + if input_spec is None: + input_spec = self.model.get_input_spec() + export_model(self.model, input_spec, export_path, export_type) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_utils.py new file mode 100644 index 000000000..f446154aa --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/prompt_utils.py @@ -0,0 +1,208 @@ +""" +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This module defines the itermediate data structure of inputs. 
+""" + +import inspect +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import paddle +from paddle import Tensor + +from ..transformers.model_outputs import MaskedLMOutput, SequenceClassifierOutput +from ..transformers.tokenizer_utils_base import PaddingStrategy, PretrainedTokenizerBase + + +def signature(function): + """ + Obtain the input arguments of the given function. + """ + sig = inspect.signature(function) + args = [p.name for p in sig.parameters.values() if p.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD] + return args + + +@dataclass +class PromptDataCollatorWithPadding: + """ + Data collator that will group inputs by keywords and dynamically + pad the inputs to the longest sequence in the batch. + + Args: + tokenizer (`paddlenlp.transformers.PretrainedTokenizer`): + The tokenizer used for encoding the data from PromptTokenizer. + """ + + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + return_tensors: str = "pd" + return_attention_mask: Optional[bool] = None + default_model_input_names: List = ( + "input_ids", + "token_type_ids", + "special_tokens_mask", + "offset_mapping", + "position_ids", + ) + + def _convert_to_tensors(self, data): + if self.return_tensors == "np": + return np.array(data) + else: + return paddle.to_tensor(data) + + def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + batch = {} + for key in features[0]: + if key in self.default_model_input_names: + batch[key] = [b[key] for b in features] + + batch = self.tokenizer.pad( + batch, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors=self.return_tensors, + return_attention_mask=self.return_attention_mask, + ) + max_length = batch["input_ids"].shape[1] + for key in features[0]: + if key not in self.default_model_input_names: + values = [b[key] for b in features if key in b] + if len(values) < len(features): + continue + if key == "masked_positions": + new_values = [] + for index, value in enumerate(values): + value = np.array(value) + index * max_length + new_values.extend(value.tolist()) + values = new_values + elif key == "attention_mask": + new_values = np.ones([len(values), 1, max_length, max_length]) * -1e4 + for index, value in enumerate(values): + length = len(value) + new_values[index][0, :length, :length] = value + values = new_values + elif key in ("soft_token_ids", "encoder_ids"): + for index, value in enumerate(values): + values[index] = value + [0] * (max_length - len(value)) + elif key in ("omask_positions"): + max_num_option = max([len(x) for x in values]) + for index, value in enumerate(values): + values[index] = value + [0] * (max_num_option - len(value)) + elif key == "labels": + if isinstance(values[0], list): + max_num_label = max([len(x) for x in values]) + for index, value in enumerate(values): + values[index] = value + [-100] * (max_num_label - len(value)) + elif key != "cls_positions": + continue + batch[key] = self._convert_to_tensors(values) + return batch + + +def sequence_classification_forward_with_past_key_values( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + 
output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, +): + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + loss_fct = paddle.nn.MSELoss() + loss = loss_fct(logits, labels) + elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + else: + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def masked_lm_forward_with_past_key_values( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, +): + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions=masked_positions) + + masked_lm_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,))) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/template.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/template.py new file mode 100644 index 000000000..9d4207013 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/template.py @@ -0,0 +1,937 @@ +""" +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +This module provide prompt definition methods. 
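A template is a single string that mixes literal text with `{...}` keyword blocks.
For example (illustrative):

```python
from paddlenlp.prompt import Template

parts = Template.parse_template_string("{'text': 'text_a'}这句话表达的情感是{'mask'}")
# parts == [{'text': 'text_a'}, {'hard': '这句话表达的情感是'}, {'mask': None}]
```

`AutoTemplate.create_from` then picks a concrete `Template` subclass based on the
keywords that appear in the parsed parts.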
+""" + +import json +import os +import re +import traceback +from abc import abstractmethod +from functools import partial +from typing import Any, Dict, List, Optional + +import numpy as np +import paddle +import paddle.nn as nn +from paddle import Tensor + +from paddlenlp.transformers import PretrainedModel, PretrainedTokenizer +from paddlenlp.utils.log import logger + +from .prompt_tokenizer import MLMPromptTokenizer +from .prompt_utils import ( + masked_lm_forward_with_past_key_values, + sequence_classification_forward_with_past_key_values, +) + +__all__ = ["Template", "ManualTemplate", "SoftTemplate", "PrefixTemplate", "AutoTemplate", "UTCTemplate"] + +# Template used to be saved in a file. +TEMPLATE_CONFIG_FILE = "template_config.json" +TEMPLATE_PARAMETER_FILE = "template_state.pdparams" + +# Default values for some template attributes. +DEFAULT_MAX_OPTIONS = 10 + + +class Template(nn.Layer): + """ + Base class for [`Template`]. + + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + """ + + template_special_tokens = ["text", "hard", "soft", "soft_id", "prefix", "sep", "mask", "options"] + template_attributes = [ + "length", + "encoder", + "position", + "token_type", + "hidden_size", + "add_omask", + "add_prompt", + "add_space", + "truncate", + ] + input_feature_names = ["do_truncate", "token_types", "positions"] + opt_token = "[OPT]" + omask_token = "[O-MASK]" + + def __init__(self, prompt: str, tokenizer: PretrainedTokenizer, max_length: int, **kwargs): + super(Template, self).__init__() + for key, value in kwargs.items(): + setattr(self, key, value) + self.tokenizer = tokenizer + self.prompt_tokenizer = MLMPromptTokenizer(tokenizer, max_length) + self.set_prompt(prompt) + + @property + def prompt(self): + return self._prompt + + @prompt.setter + def prompt(self, prompt): + logger.warning("Prompt can not be modified once set.") + + def set_prompt(self, prompt: str): + if prompt is not None: + if isinstance(prompt, str): + self._prompt = self.parse_template_string(prompt) + else: + self._prompt = prompt + self.do_truncate = self.create_truncation_sequence_from_prompt() + self._check_template_special_tokens() + self.example_keys = self.create_example_keys_from_prompt() + self.token_types = self.create_token_type_sequence_from_prompt() + self.positions = self.create_position_sequence_from_prompt() + self.create_prompt_parameters() + + @abstractmethod + def create_prompt_parameters(self): + raise NotImplementedError + + def _check_template_special_tokens(self): + valid_attr = self.template_special_tokens + self.template_attributes + prompt_attr = [] + for part in self._prompt: + prompt_attr.extend(list(part.keys())) + if "add_prompt" in part: + opt_prompt = part["add_prompt"] + if self.opt_token not in opt_prompt: + raise ValueError("'{}' not found in option prompt.".format(self.opt_token)) + if "add_omask" in part: + self._check_omask_token() + diff_attr = set(prompt_attr) - set(valid_attr) + if len(diff_attr) > 0: + raise ValueError("Invalid attributes found in template: {}.".format(diff_attr)) + return True + + def _check_example_name(self, name: str, example: Dict[str, Any]): + if name not in example: + raise ValueError( + "Unexpected value in template. 
Can not find keyword {} in example: {}".format(name, example) + ) + return True + + def _check_omask_token(self): + omask_example = """ + Add '[O-MASK]' to tokenizer to use `add_omask`. + + Examples: + + ```python + omask_dict = {"additional_special_tokens": ["[O-MASK]"]} + tokenizer.add_special_tokens(omask_dict) + model.resize_token_embeddings(len(tokenizer)) + ```""" + if self.omask_token not in self.tokenizer.additional_special_tokens: + self.tokenizer.add_special_tokens({"additional_special_tokens": [self.omask_token]}) + return True + raise ValueError("'{}' not found in tokenizer.".format(self.omask_token) + omask_example) + return True + + def build_inputs_with_prompt( + self, example: Dict[str, Any], prompt: Optional[List[Dict[str, Any]]] = None + ) -> List[str]: + """ + Build input text sequences according to both prompt and example. + + Args: + example (`Dict[str, Any]`): + A data sample with corresponding keys as `prompt`. + prompt (`Optional[List[Dict[str, Any]]]`): + A sequence of dictionary which defines positions of prompt, + input text and special tokens. + """ + inputs = self._prompt.copy() if prompt is None else prompt.copy() + + for index, part in enumerate(inputs): + if "text" in part: + self._check_example_name(part["text"], example) + inputs[index] = str(example[part["text"]]) + elif "mask" in part: + if "length" not in part: + part["length"] = 1 + inputs[index] = self.tokenizer.mask_token * part["length"] + elif "sep" in part: + inputs[index] = self.tokenizer.sep_token + elif "hard" in part: + inputs[index] = part["hard"] + elif "options" in part: + if not isinstance(part["options"], list): + self._check_example_name(part["options"], example) + labels = example[part["options"]] + labels = [labels] if isinstance(labels, str) else labels + else: + labels = part["options"] + if "add_prompt" in part: + opt_prompt = part["add_prompt"] + labels = [opt_prompt.replace(self.opt_token, x) for x in labels] + if "add_omask" in part: + labels = [self.omask_token + x for x in labels] + inputs[index] = "".join(labels) + else: + inputs[index] = part + + if "add_space" in part: + inputs[index] = " " + inputs[index] + return inputs + + def create_token_type_sequence_from_prompt(self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt if prompt is None else prompt + last_token_type = 0 + token_type_ids = [] + for part in prompt: + if "token_type" in part: + last_token_type = part["token_type"] + token_type_ids.append(last_token_type) + return token_type_ids + + def create_position_sequence_from_prompt(self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt if prompt is None else prompt + position_ids = [] + for part in prompt: + if "position" in part: + position_ids.append(part["position"]) + else: + position_ids.append(-1) + return position_ids + + def create_truncation_sequence_from_prompt(self, prompt: Optional[List[Dict[str, Any]]] = None) -> List[int]: + prompt = self._prompt.copy() if prompt is None else prompt.copy() + do_truncate = [] + for part in prompt: + if "truncate" in part: + do_truncate.append(part["truncate"]) + elif "text" in part: + do_truncate.append(True) + else: + do_truncate.append(False) + return do_truncate + + def create_example_keys_from_prompt(self): + example_keys = set() + for part in self.prompt: + if "text" in part: + example_keys.add(part["text"]) + if "options" in part and isinstance(part["options"], list): + example_keys.update(set(part["options"])) + if len(example_keys) == 0: + 
raise ValueError('No `text` keyword in template: "{}", please check it again.'.format(self.prompt)) + return example_keys + + def encode(self, example: Dict[str, Any]): + input_text = self.build_inputs_with_prompt(example) + input_names, input_values = ["text"], [input_text] + for name in self.input_feature_names: + input_names.append(name) + input_values.append(getattr(self, name, None)) + + inputs = [] + for value in list(zip(*input_values)): + inputs.append(dict(zip(input_names, value))) + + input_dict = self.prompt_tokenizer(inputs) + unused_example = {k: v for k, v in example.items() if k not in self.example_keys} + + return {**input_dict, **unused_example} + + def __call__(self, example: Dict[str, Any]): + return self.encode(example=example) + + @abstractmethod + def process_batch(self, input_dict): + raise NotImplementedError + + def save(self, save_path): + if not os.path.exists(save_path): + os.makedirs(save_path, exist_ok=True) + template_config_file = os.path.join(save_path, TEMPLATE_CONFIG_FILE) + template_class = self.__class__.__name__ + with open(template_config_file, "w", encoding="utf-8") as fp: + fp.write(json.dumps(self._prompt, ensure_ascii=False) + "\n") + fp.write(json.dumps({"class": template_class}, ensure_ascii=False) + "\n") + template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE) + template_state_dict = self.state_dict() + if len(template_state_dict) > 0: + paddle.save(template_state_dict, template_param_file) + + @staticmethod + def extract_template_keywords(prompt: List[Dict[str, Any]]): + keywords = set() + for part in prompt: + keywords.update(part.keys()) + return keywords + + @staticmethod + def parse_template_string(prompt: str, left_token: Optional[str] = "{", right_token: Optional[str] = "}"): + """ + Parse the defined string as a sequence of dictionaries. + + Args: + prompt: A string comprised of nestable {}, [], integers and strings. + + Returns: + A list of dictionaries corresponding to the input string. + + For example, if we define `prompt` as + + "{'text': 'hypothesis'}基于这一假设{'mask'}推断出{'options': 'label.txt'}", + + then this function returns + + [{"text": "hypothesis"}, {"hard": "基于这一假设"}, {"mask": null}, + {"hard": "推断出"}, {"options": ["正确", "错误"]}]. + + Raises: + ValueError: A error occurred parsing an string with unmatched punctuations. + """ + left_stack = [] + parsed = [] + index = 0 + while index < len(prompt): + # Delete extra spaces. + part = {"add_space": " "} if prompt[index] == " " else {} + while index < len(prompt) and prompt[index] == " ": + index += 1 + if index == len(prompt): + break + # Parse blocks with paired tokens like "{ }". + if prompt[index] == left_token: + left_index = index + while index < len(prompt): + if prompt[index] == left_token: + left_stack.append(index) + elif prompt[index] == right_token: + left_stack.pop() + if len(left_stack) == 0: + break + index += 1 + if index == len(prompt) and len(left_stack) > 0: + raise ValueError( + "{} at position {} has no corresponding {}".format(left_token, left_index, right_token) + ) + try: + part_dict = eval(prompt[left_index : index + 1]) + if isinstance(part_dict, set): + part_dict = {k: None for k in part_dict} + part.update(part_dict) + except SyntaxError: + logger.error(traceback.format_exc()) + exit() + index += 1 + # Parse simplified discrete prompts. 
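            # i.e. literal text outside "{...}" blocks (such as "基于这一假设" in the
            # docstring example above) is collected verbatim and stored as {"hard": ...}.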
+ else: + left_index = index + while index < len(prompt) and prompt[index] != left_token: + index += 1 + part["hard"] = prompt[left_index:index].rstrip(" ") + + if "options" in part: + if os.path.isfile(part["options"]): + with open(part["options"], "r") as fp: + labels = [x.strip() for x in fp] + part["options"] = labels + part["length"] = len(labels) + elif "length" not in "options": + part["length"] = DEFAULT_MAX_OPTIONS + if "length" in part: + assert part["length"] > 0 + if "hard" in part: + logger.warning("Ignore `length` attribute for keyword `hard`.") + if "position" in part: + assert part["position"] >= 0 + if "token_type" in part: + assert part["token_type"] in (0, 1) + parsed.append(part) + return parsed + + +class ManualTemplate(Template): + """ + ManualTemplate for discrete prompt methods, such as PET, EFL. + + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + """ + + template_special_tokens = ["text", "hard", "sep", "mask", "options"] + template_attributes = ["length", "position", "token_type", "add_prompt", "add_space", "add_omask", "truncate"] + + def __init__(self, prompt: str, tokenizer: PretrainedTokenizer, max_length: int): + super(ManualTemplate, self).__init__(prompt, tokenizer, max_length) + + def create_prompt_parameters(self): + return None + + def process_batch(self, input_dict): + return input_dict + + +class SoftLSTM(nn.Layer): + """ + LSTM encoder for soft token embeddings. + """ + + def __init__(self, input_size, hidden_size, output_size, activation): + super(SoftLSTM, self).__init__() + self.lstm = nn.LSTM( + input_size=input_size, hidden_size=hidden_size, num_layers=2, direction="bidirect", time_major=False + ) + self.mlp = nn.Sequential( + nn.Linear(2 * hidden_size, hidden_size), activation, nn.Linear(hidden_size, output_size) + ) + + def forward(self, embeds): + hidden_states, _ = self.lstm(embeds) + return self.mlp(hidden_states) + + +class SoftTemplate(Template): + """ + SoftTemplate for continuous prompt methods on the input layer. + + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. + max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + word_embeddings (`Tensor`): + The word embeddings of pretrained models, which can be obtained by + calling `model.get_input_embeddings().weight`. + soft_embeddings (`Tensor`): + The embeddings of soft tokens, which overwrites `word_embeddings` + as initial weights when defined. 
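
    A soft token can be declared in several ways in the template string: "{'soft': '很难说'}"
    creates soft tokens initialized from the word embeddings of the given text,
    "{'soft': None, 'length': 4}" inserts four randomly initialized soft tokens, and
    "{'soft_id': 1}" reuses a group of soft tokens defined earlier with the same `soft_id`.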
+ """ + + template_special_tokens = ["text", "hard", "soft", "soft_id", "sep", "mask", "options"] + input_feature_names = ["do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids"] + + def __init__( + self, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int, + word_embeddings: Tensor, + soft_embeddings: Tensor = None, + ): + super(SoftTemplate, self).__init__( + prompt, tokenizer, max_length, word_embeddings=word_embeddings, soft_embeddings=soft_embeddings + ) + + def named_parameters(self): + named_params = [(n, p) for n, p in self.soft_embeddings.named_parameters()] + named_params.extend([(n, p) for n, p in self.encoder_list.named_parameters()]) + return named_params + + def parameters(self): + return [p for n, p in self.named_parameters()] + + def create_prompt_parameters(self): + self._prompt, soft_token_config = self.parse_soft_prompt() + self.embed_size = self.word_embeddings.weight.shape[1] + soft2word, self.soft_tokens, self.num_soft_token = soft_token_config + self._init_soft_parameters(soft2word) + self.encoder_ids, self.encoder_list = self._create_soft_encoders() + + def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + """ + Convert input_ids to inputs_embeds. + + Soft tokens are encoded soft_embeddings with predefined encoders. + For other tokens, use word embeddings in pretrained model. + """ + word_embeds = self.word_embeddings(input_dict["input_ids"]) + if "attention_mask" not in input_dict or input_dict["attention_mask"] is None: + pad_token_id = self.tokenizer.pad_token_id + attention_mask = paddle.unsqueeze( + (input_dict["input_ids"] == pad_token_id).astype("float32") * -1e4, axis=[1, 2] + ) + input_dict["attention_mask"] = attention_mask + input_dict["input_ids"] = None + soft_embeds = self.soft_embeddings(input_dict["soft_token_ids"]) + soft_shape = soft_embeds.shape + soft_embeds = soft_embeds.reshape([-1, soft_shape[-1]]) + for encoder_id in range(1, len(self.encoder_list)): + to_encode = paddle.where(input_dict["encoder_ids"] == encoder_id) + to_encode = to_encode[0] * soft_shape[1] + to_encode[1] + to_encode = to_encode.squeeze(1) + to_encode_embeds = soft_embeds[to_encode] + to_encode_embeds = to_encode_embeds.reshape([soft_shape[0], -1, soft_shape[-1]]) + encoder = self.encoder_list[encoder_id] + encoded = encoder(to_encode_embeds) + encoded = encoded.reshape([-1, soft_shape[-1]]) + soft_embeds = paddle.scatter(soft_embeds, to_encode, encoded) + soft_embeds = soft_embeds.reshape([soft_shape[0], -1, soft_shape[-1]]) + soft_token_ids = input_dict["soft_token_ids"].unsqueeze(2) + input_dict["inputs_embeds"] = paddle.where(soft_token_ids > 0, soft_embeds, word_embeds) + return input_dict + + def parse_soft_prompt(self): + """ + Unify the form of continuous prompts as {"soft": "xxx"} and create + continuous token id sequence for each part in template. + + Returns: + `List[Dict[str, str]]`: Template with continuous prompt formated as {"soft": "xxx"}. + `Tuple[Dict[int, int], List[List[int]], int]`: + - Mapping from continuous ids to word ids for initialization. + - Continuous ids for each part. Id 0 denotes none-continuous part. + - Number of unique continuous tokens. + """ + prompt = self._prompt.copy() + num_soft_token = 1 + soft_prompt = [] + soft_token_ids = [] + soft2word = {} + soft_id_reindex = {} + + for part in prompt: + part_prompt = None + # Copy non-continuous prompt part. 
+ if "soft" not in part and "soft_id" not in part: + soft_prompt.append(part) + soft_token_ids.append(None) + + # Deal with continuous prompt with specific initialization. + elif "soft" in part and part["soft"] is not None: + + # Get word tokens for initialization. + if "add_space" in part: + part["soft"] = part["add_space"] + part["soft"] + word_token_ids = self.tokenizer(part["soft"], add_special_tokens=False, return_token_type_ids=False)[ + "input_ids" + ] + + # Create continuous token ids. + soft_id_list = list(range(num_soft_token, num_soft_token + len(word_token_ids))) + num_soft_token += len(word_token_ids) + + for soft_id, word_id in zip(soft_id_list, word_token_ids): + soft2word[soft_id] = word_id + + # Check `length` if exists. + if "length" in part: + if part["length"] < len(word_token_ids): + logger.warning("Ignore `length` because it is less than the length of defined word sequence.") + elif part["length"] > len(word_token_ids): + length = part["length"] - len(word_token_ids) + soft_id_list += list(range(num_soft_token, num_soft_token + length)) + num_soft_token += length + part["soft"] += self.tokenizer.unk_token * length + + soft_token_ids.append(soft_id_list) + part_prompt = {"soft": part["soft"]} + + # Check or record `soft_id` if exists. + if "soft_id" in part: + if part["soft_id"] in soft_id_reindex: + assert soft_id_list == soft_id_reindex[part["soft_id"]] + else: + soft_id_reindex[part["soft_id"]] = soft_id_list + + # Deal with continuous prompt defined by `soft_id`. + elif "soft_id" in part and part["soft_id"] in soft_id_reindex: + soft_id_list = soft_id_reindex[part["soft_id"]] + if "length" in part: + logger.warning("Ignore `length` because it is incompatible with existing `soft_id`.") + soft_token_ids.append(soft_id_list) + part_prompt = {"soft": [self.tokenizer.unk_token] * len(soft_id_list)} + + # Deal with continuous prompt with random initialization. 
+ else: + if "length" not in part: + part["length"] = 1 + soft_id_list = list(range(num_soft_token, num_soft_token + part["length"])) + num_soft_token += part["length"] + soft_token_ids.append(soft_id_list) + if "soft_id" in part: + soft_id_reindex[part["soft_id"]] = soft_id_list + part_prompt = {"soft": [self.tokenizer.unk_token] * len(soft_id_list)} + if part_prompt is not None: + for key in part: + if key not in ["soft", "soft_id", "length", "add_space"]: + part_prompt[key] = part[key] + soft_prompt.append(part_prompt) + + if num_soft_token == 1: + raise ValueError("Soft prompt expected for SoftTemplate, but get {}.".format(self._prompt)) + + soft_token_config = (soft2word, soft_token_ids, num_soft_token) + + return soft_prompt, soft_token_config + + def _init_soft_parameters(self, soft2word: Dict[int, int]): + if self.soft_embeddings is not None: + if self.soft_embeddings.weight.shape[0] != self.num_soft_token: + raise ValueError( + "Given soft embeddings are incompatible with those " + 'defined in template "{}"'.format(self._prompt) + ) + else: + self.soft_embeddings = nn.Embedding(self.num_soft_token, self.embed_size) + weight = self.soft_embeddings.weight.clone().detach() + for soft_id, word_id in soft2word.items(): + # squeeze() is used here to be backward compatible with 0-D tensor introduced in paddle 2.5 + word_id = paddle.to_tensor(word_id).squeeze() + weight[soft_id] = self.word_embeddings(word_id) + self.soft_embeddings.weight.set_value(weight) + + def _create_soft_encoders(self, output_size: int = None, activation: nn.Layer = None): + encoder_list = [nn.Identity()] + encoder2id = {} + encoder_ids = [] + output_size = self.embed_size if output_size is None else output_size + activation = nn.ReLU() if activation is None else activation + for part in self._prompt: + if "encoder" not in part or part["encoder"] is None: + encoder_ids.append(0) + else: + if part["encoder"] not in encoder2id: + encoder2id[part["encoder"]] = len(encoder_list) + encoder_ids.append(len(encoder_list)) + if "hidden_size" in part: + hidden_size = part["hidden_size"] + else: + hidden_size = self.embed_size + if part["encoder"] == "lstm": + encoder_list.append(SoftLSTM(self.embed_size, hidden_size, output_size, activation)) + elif part["encoder"] == "mlp": + encoder_list.append( + nn.Sequential( + nn.Linear(self.embed_size, hidden_size), + activation, + nn.Linear(hidden_size, output_size), + ) + ) + else: + raise ValueError("Encoder {} not supported.".format(part["encoder"])) + else: + encoder_ids.append(encoder2id[part["encoder"]]) + encoder_list = nn.LayerList(encoder_list) + return encoder_ids, encoder_list + + def build_inputs_with_prompt( + self, example: Dict[str, Any], prompt: Optional[List[Dict[str, Any]]] = None + ) -> List[str]: + inputs = super(SoftTemplate, self).build_inputs_with_prompt(example, prompt) + for index, part in enumerate(inputs): + if isinstance(part, dict) and "soft" in part: + inputs[index] = part["soft"] + return inputs + + def save(self, save_path): + super(SoftTemplate, self).save(save_path) + template_param_file = os.path.join(save_path, TEMPLATE_PARAMETER_FILE) + paddle.save(self.state_dict(), template_param_file) + + +class PrefixTemplate(SoftTemplate): + """ + PrefixTemplate for continuous prompt methods on every layer. + + Args: + prompt (`str`): + A template string which defines how to combine text and prompt. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer used for tokenization. 
+ max_length (`int`): + If set to a number, it will limit the total sequence returned so + that it has a maximum length, including prompts. + model (`PretrainedModel`): + An instance of PretrainedModel. + """ + + template_special_tokens = ["text", "hard", "prefix", "soft", "sep", "mask", "options"] + input_feature_names = ["do_truncate", "token_types", "positions", "soft_tokens", "encoder_ids"] + + def __init__( + self, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int, + model: PretrainedModel, + prefix_dropout: float = 0.1, + ): + self.n_layer, self.n_heads = self._get_config(model) + super(PrefixTemplate, self).__init__(prompt, tokenizer, max_length, model.get_input_embeddings()) + self.dropout = nn.Dropout(p=prefix_dropout) + + @staticmethod + def _get_config(model): + names = [n for n, p in model.named_parameters() if "layers" in n] + pattern = re.compile(r".*?\.(\d+)\..*?") + indices = [] + for name in names: + result = pattern.match(name) + if result is not None: + indices.append(int(result.group(1))) + num_layer = max(indices) + 1 + layer_names = names[0].split(".")[:-2] + layer = model + for name in layer_names: + layer = getattr(layer, name) + num_heads = layer.num_heads + + return num_layer, num_heads + + def parse_soft_prompt(self): + prompt = self._prompt.copy() + + for index, part in enumerate(prompt): + if "soft" in part: + raise ValueError("Keyward `soft` should not be used in PrefixTemplate.") + if "prefix" not in part: + continue + if index != 0: + raise ValueError("Keyword `prefix` should locate at the beginning of template.") + part["soft"] = part["prefix"] + part.pop("prefix") + if "encoder" not in part: + part["encoder"] = "mlp" + prompt[index] = part + + self._prompt = prompt + return super(PrefixTemplate, self).parse_soft_prompt() + + def process_model(self, model): + if model.__class__.__name__.endswith("ForSequenceClassification"): + model.forward = partial(sequence_classification_forward_with_past_key_values, self=model) + elif model.__class__.__name__.endswith("ForMaskedLM"): + model.forward = partial(masked_lm_forward_with_past_key_values, self=model) + return model + + def process_batch(self, input_dict: Dict[str, Tensor]) -> Dict[str, Tensor]: + word_embeds = self.word_embeddings(input_dict["input_ids"]) + batch_size, _ = input_dict["soft_token_ids"].shape + + soft_token_ids = paddle.masked_select(input_dict["soft_token_ids"], input_dict["soft_token_ids"] > 0) + soft_token_ids = soft_token_ids.reshape([batch_size, -1]) + _, soft_len = soft_token_ids.shape + + token_type_ids = paddle.masked_select(input_dict["token_type_ids"], input_dict["soft_token_ids"] == 0) + input_dict["token_type_ids"] = token_type_ids.reshape([batch_size, -1]) + position_ids = paddle.masked_select(input_dict["position_ids"], input_dict["soft_token_ids"] == 0) + input_dict["position_ids"] = position_ids.reshape([batch_size, -1]) + if "masked_position" in input_dict and input_dict["masked_positions"] is not None: + input_dict["masked_positions"] = input_dict["masked_positions"] - soft_len + input_dict["inputs_embeds"] = paddle.concat( + [word_embeds[:, 0, :].unsqueeze(1), word_embeds[:, soft_len + 1 :, :]], axis=1 + ) + + if "attention_mask" not in input_dict or input_dict["attention_mask"] is None: + pad_token_id = self.tokenizer.pad_token_id + attention_mask = paddle.unsqueeze( + (input_dict["input_ids"] == pad_token_id).astype("float32") * -1e4, axis=[1, 2] + ) + input_dict["attention_mask"] = attention_mask + input_dict["input_ids"] = None + 
input_dict.pop("soft_token_ids") + input_dict.pop("encoder_ids") + + soft_embeds = self.soft_embeddings(soft_token_ids) + soft_embeds = self.encoder_list[1](soft_embeds) + soft_embeds = soft_embeds.reshape( + [batch_size, soft_len, self.n_layer * 2, self.n_heads, self.embed_size // self.n_heads] + ) + + soft_embeds = self.dropout(soft_embeds) + soft_embeds = paddle.transpose(soft_embeds, perm=[2, 0, 3, 1, 4]) + soft_embeds = paddle.split(soft_embeds, num_or_sections=self.n_layer) + soft_embeds = [paddle.split(emb, 2) for emb in soft_embeds] + soft_embeds = [[x.squeeze(0) for x in emb] for emb in soft_embeds] + input_dict["past_key_values"] = tuple([tuple(emb) for emb in soft_embeds]) + return input_dict + + def _create_soft_encoders(self): + output_size = self.embed_size * self.n_layer * 2 + activation = nn.Tanh() + return super(PrefixTemplate, self)._create_soft_encoders(output_size, activation) + + +class AutoTemplate(object): + """ + AutoTemplate can help you automatically create the relevant Template + given the provided prompt. + """ + + default_text_keyword = "text_a" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + "{} is designed to be instantiated using {}.create_from(" + "prompt, tokenizer, max_length, ...)".format(self.__class__.__name__, self.__class__.__name__) + ) + + @classmethod + def create_from( + cls, + prompt: str, + tokenizer: PretrainedTokenizer, + max_length: int = 512, + model: PretrainedModel = None, + soft_embeddings: Tensor = None, + prefix_dropout: float = 0.1, + template_class: str = None, + ): + # Default template if not defined. + if prompt is None: + prompt = "{'soft'}{'text': 'text_a'}{'mask'}" + + if isinstance(prompt, str): + prompt = Template.parse_template_string(prompt) + template_keywords = Template.extract_template_keywords(prompt) + + # Complement simplified template as ManualTemplate-style in form. + if "text" not in template_keywords: + prompt = [{"text": cls.default_text_keyword}] + prompt + if "mask" not in template_keywords: + prompt = prompt + [{"mask": None}] + + if template_class is None: + if "prefix" in template_keywords: + template_class = "PrefixTemplate" + elif "soft" in template_keywords or "soft_id" in template_keywords: + template_class = "SoftTemplate" + else: + template_class = "ManualTemplate" + + # Choose Template according to template keywords. 
+ if template_class == "PrefixTemplate": + return PrefixTemplate( + prompt=prompt, tokenizer=tokenizer, max_length=max_length, model=model, prefix_dropout=prefix_dropout + ) + elif template_class == "SoftTemplate": + word_embeddings = model.get_input_embeddings() + return SoftTemplate( + prompt=prompt, + tokenizer=tokenizer, + max_length=max_length, + word_embeddings=word_embeddings, + soft_embeddings=soft_embeddings, + ) + elif template_class == "UTCTemplate": + return UTCTemplate(tokenizer=tokenizer, max_length=max_length) + elif template_class == "ManualTemplate": + return ManualTemplate(prompt=prompt, tokenizer=tokenizer, max_length=max_length) + else: + raise ValueError(f"Unknown template: {template_class}.") + + @classmethod + def load_from( + cls, data_path: os.PathLike, tokenizer: PretrainedTokenizer, max_length: int, model: PretrainedModel = None + ): + template_config_file = os.path.join(data_path, TEMPLATE_CONFIG_FILE) + if not os.path.isfile(template_config_file): + raise ValueError("{} not found under {}".format(TEMPLATE_CONFIG_FILE, data_path)) + with open(template_config_file, "r", encoding="utf-8") as fp: + config = [x.strip() for x in fp] + prompt = json.loads(config[0]) + if len(config) > 1: + template_class = json.loads(config[1])["class"] + else: + template_class = None # Compatible with previous versions + template = cls.create_from( + prompt=prompt, tokenizer=tokenizer, max_length=max_length, model=model, template_class=template_class + ) + template_param_file = os.path.join(data_path, TEMPLATE_PARAMETER_FILE) + if os.path.isfile(template_param_file): + template.set_state_dict(paddle.load(template_param_file)) + return template + + +class UTCTemplate(Template): + """ + Template for Unified Tag Classification. + """ + + template_special_tokens = ["text", "hard", "sep", "cls", "options"] + + def __init__(self, tokenizer: PretrainedTokenizer, max_length: int, prompt: str = None): + prompt = ( + ( + "{'options': 'choices', 'add_omask': True, 'position': 0, 'token_type': 1}" + "{'sep': None, 'token_type': 0, 'position': 0}{'text': 'text_a'}{'sep': None, 'token_type': 1}{'text': 'text_b'}" + ) + if prompt is None + else prompt + ) + super(UTCTemplate, self).__init__(prompt, tokenizer, max_length) + self.max_position_id = self.tokenizer.model_max_length - 1 + self.max_length = max_length + if not self._has_options(): + raise ValueError( + "Expected `options` and `add_omask` are in defined prompt, but got {}".format(self.prompt) + ) + + def _has_options(self): + for part in self.prompt: + if "options" in part and "add_omask" in part: + return True + return False + + def build_inputs_with_prompt( + self, example: Dict[str, Any], prompt: Optional[List[Dict[str, Any]]] = None + ) -> List[str]: + inputs = super(UTCTemplate, self).build_inputs_with_prompt(example, prompt) + for index, part in enumerate(inputs): + if "cls" in part: + inputs[index] = self.tokenizer.cls_token + return inputs + + def encode(self, example: Dict[str, Any], use_mask: bool = False): + input_dict = super(UTCTemplate, self).encode(example) + + # Set OMASK and MASK positions and labels for options. + omask_token_id = self.tokenizer.convert_tokens_to_ids("[O-MASK]") + input_dict["omask_positions"] = ( + np.where(np.array(input_dict["input_ids"]) == omask_token_id)[0].squeeze().tolist() + ) + + sep_positions = ( + np.where(np.array(input_dict["input_ids"]) == self.tokenizer.sep_token_id)[0].squeeze().tolist() + ) + input_dict["cls_positions"] = sep_positions[0] + + # Limit the maximum position ids. 
+ position_ids = np.array(input_dict["position_ids"]) + position_ids[position_ids > self.max_position_id] = self.max_position_id + input_dict["position_ids"] = position_ids.tolist() + + return input_dict + + def create_prompt_parameters(self): + return None + + def process_batch(self, input_dict): + return input_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/verbalizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/verbalizer.py new file mode 100644 index 000000000..174a86380 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/prompt/verbalizer.py @@ -0,0 +1,461 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +from abc import abstractmethod +from typing import Dict + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor + +from paddlenlp.layers import Linear as TransposedLinear +from paddlenlp.transformers import PretrainedModel, PretrainedTokenizer +from paddlenlp.utils.log import logger + +__all__ = ["Verbalizer", "ManualVerbalizer", "SoftVerbalizer", "MaskedLMVerbalizer"] + +# Verbalizer used to be saved in a file. +VERBALIZER_CONFIG_FILE = "verbalizer_config.json" +VERBALIZER_PARAMETER_FILE = "verbalizer_state.pdparams" + + +class Verbalizer(nn.Layer): + """ + Base class for [`Verbalizer`]. + + Args: + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. 
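+
+    Example (illustrative): a binary sentiment task could define
+    ``label_words={"negative": "bad", "positive": ["good", "great"]}``, mapping
+    each label to a single word or to a list of words.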
+ """ + + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, **kwargs): + super(Verbalizer, self).__init__() + for key, value in kwargs.items(): + setattr(self, key, value) + self.tokenizer = tokenizer + self.token_aggregate_type = kwargs.get("token_aggregate_type", "mean") + self.word_aggregate_type = kwargs.get("word_aggregate_type", "mean") + self.mask_aggregate_type = kwargs.get("mask_aggregate_type", "product") + self.post_log_softmax = kwargs.get("post_log_softmax", True) + self.label_token_weight = kwargs.get("label_token_weight", None) + self.label_words = label_words + if self.label_token_weight is not None: + self.label_token_weight = self.normalize(self.project(self.label_token_weight.unsqueeze(0))) + + @property + def labels(self): + if not hasattr(self, "_labels"): + raise RuntimeError("Attribute `labels` is not set yet.") + return self._labels + + @labels.setter + def labels(self, labels): + raise NotImplementedError("Please use `label_words` to change `labels`.") + + @property + def label_words(self): + if not hasattr(self, "_label_words"): + raise RuntimeError("Mapping from labels to words is not set yet.") + return self._label_words + + @label_words.setter + def label_words(self, label_words: Dict): + if label_words is None: + return None + self._labels = sorted(list(label_words.keys())) + self.labels_to_ids = {label: idx for idx, label in enumerate(self._labels)} + self._words = [] + for label in self._labels: + words = label_words[label] + if isinstance(words, str): + words = [words] + self._words.append(words) + self._label_words = {label: word for label, word in zip(self._labels, self._words)} + self.preprocess_label_words() + self.create_parameters() + + @abstractmethod + def create_parameters(self): + """ + A hook to create parameters for mapping from labels to words. + """ + raise NotImplementedError + + def preprocess_label_words(self): + label_token_ids = [] + for label_word in self._words: + word_token_ids = [] + for word in label_word: + token_ids = self.tokenizer.encode(word, add_special_tokens=False, return_token_type_ids=False) + word_token_ids.append(token_ids["input_ids"]) + label_token_ids.append(word_token_ids) + + max_num_words = max([len(words) for words in self._words]) + max_num_tokens = max( + [max([len(token_ids) for token_ids in word_token_ids]) for word_token_ids in label_token_ids] + ) + token_ids_shape = [len(self.labels), max_num_words, max_num_tokens] + token_ids = np.zeros(token_ids_shape) + word_mask = np.zeros(token_ids_shape[:-1]) + token_mask = np.zeros(token_ids_shape) + for label_id, word_token_ids in enumerate(label_token_ids): + word_mask[label_id][: len(word_token_ids)] = 1 + for word_id, tokens in enumerate(word_token_ids): + token_ids[label_id][word_id][: len(tokens)] = tokens + token_mask[label_id][word_id][: len(tokens)] = 1 + self.token_ids = paddle.to_tensor(token_ids, dtype="int64", stop_gradient=True) + self.word_mask = paddle.to_tensor(word_mask, dtype="int64", stop_gradient=True) + self.token_mask = paddle.to_tensor(token_mask, dtype="int64", stop_gradient=True) + + def convert_labels_to_ids(self, label: str): + assert isinstance(label, str) + return self.labels_to_ids[label] + + def convert_ids_to_labels(self, index: int): + assert isinstance(index, int) + return self.labels[index] + + def project(self, outputs: Tensor): + """ + Fetch label word predictions from outputs over vocabulary. 
+ """ + token_ids = self.token_ids.reshape([-1]) + label_token_outputs = outputs.index_select(index=token_ids, axis=-1) + label_shape = [*outputs.shape[:-1], *self.token_ids.shape] + label_token_outputs = label_token_outputs.reshape(label_shape) + label_word_outputs = self.aggregate(label_token_outputs, self.token_mask, self.token_aggregate_type) + label_word_outputs -= 1e4 * (1 - self.word_mask) + return label_word_outputs + + def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None): + """ + Process outputs of `PretrainedModelForMaskedLM` over vocabulary. + """ + if masked_positions is None: + return outputs + batch_size, _, num_pred = outputs.shape + outputs = outputs.reshape([-1, num_pred]) + outputs = paddle.gather(outputs, masked_positions) + outputs = outputs.reshape([batch_size, -1, num_pred]) + return outputs + + def aggregate(self, outputs: Tensor, mask: Tensor, atype: str): + """ + Aggregate multiple tokens/words for each word/label. + """ + if atype == "mean": + outputs = outputs * mask.astype(outputs.dtype) + outputs = outputs.sum(axis=-1) / (mask.sum(axis=-1) + 1e-15) + elif atype == "max": + outputs = (outputs - 1e4 * (1 - mask)).max(axis=-1) + elif atype == "first": + index = paddle.to_tensor([0]) + outputs = paddle.index_select(outputs, index, axis=-1).squeeze(axis=-1) + else: + raise ValueError("Strategy {} is not supported to aggregate multiple " "tokens.".format(atype)) + return outputs + + def normalize(self, outputs: Tensor): + """ + Normalize the outputs over the whole vocabulary. + """ + batch_size = outputs.shape[0] + outputs = F.softmax(outputs.reshape([batch_size, -1]), axis=-1).reshape(outputs.shape) + return outputs + + def calibrate(self, label_word_outputs: Tensor): + """ + Calibrate predictions with pre-defined weights over the whole vocabulary. 
+ """ + if self.label_token_weight.dim() != 1: + raise ValueError("Weights of label tokens should be a 1-D tensor.") + weight_shape = self.label_token_weight.shape + output_shape = label_word_outputs.shape + if weight_shape[1:] != output_shape[1:] or weight_shape[0] != 1: + raise ValueError( + "Shapes of label token weights and predictions do not match, " + "got {} and {}.".format(weight_shape, output_shape) + ) + label_word_outputs /= self.label_token_weight + 1e-15 + batch_size = label_word_outputs.shape0[0] + label_word_outputs = paddle.mean(label_word_outputs.reshape([batch_size, -1])).reshape(output_shape) + + return label_word_outputs + + def save(self, save_path: str): + if not os.path.exists(save_path): + os.makedirs(save_path, exist_ok=True) + verb_config_file = os.path.join(save_path, VERBALIZER_CONFIG_FILE) + with open(verb_config_file, "w", encoding="utf-8") as fp: + json.dump(self.label_words, fp, ensure_ascii=False) + verb_params_file = os.path.join(save_path, VERBALIZER_PARAMETER_FILE) + verb_state_dict = self.state_dict() + if len(verb_state_dict) > 0: + paddle.save(self.state_dict(), verb_params_file) + + @classmethod + def load_from(cls, data_path: os.PathLike, tokenizer: PretrainedTokenizer): + verb_config_file = os.path.join(data_path, VERBALIZER_CONFIG_FILE) + if not os.path.isfile(verb_config_file): + raise ValueError("{} not found under {}".format(VERBALIZER_CONFIG_FILE, data_path)) + with open(verb_config_file, "r") as fp: + label_words = json.load(fp) + + verbalizer = cls(label_words, tokenizer) + verb_state_file = os.path.join(data_path, VERBALIZER_PARAMETER_FILE) + if os.path.isfile(verb_state_file): + verbalizer.set_state_dict(paddle.load(verb_state_file)) + logger.info("Loading verbalizer state dict from {}".format(verb_state_file)) + return verbalizer + + +class ManualVerbalizer(Verbalizer): + """ + ManualVerbalizer defines mapping from labels to words manually. + + Args: + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. + """ + + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, **kwargs): + super(ManualVerbalizer, self).__init__(label_words=label_words, tokenizer=tokenizer, **kwargs) + + def create_parameters(self): + return None + + def aggregate_multiple_mask(self, outputs: Tensor, atype: str = None): + if atype is None: + return outputs + assert outputs.ndim == 3 + if atype == "mean": + outputs = outputs.mean(axis=1) + elif atype == "max": + outputs = outputs.max(axis=1) + elif atype == "first": + index = paddle.to_tensor([0]) + outputs = paddle.index_select(outputs, index, axis=1).squeeze(1) + elif atype == "product": + new_outputs = outputs[:, 0, :] + for index in range(1, outputs.shape[1]): + new_outputs *= outputs[:, index, :] + outputs = new_outputs + else: + raise ValueError("Strategy {} is not supported to aggregate multiple " "tokens.".format(atype)) + return outputs + + def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None): + """ + Process outputs over the vocabulary, including the following steps: + + (1) Project outputs into the outputs of corresponding word. + + If self.post_log_softmax is True: + + (2) Normalize over all label words. + + (3) Calibrate (optional) + + (4) Aggregate multiple words for each label. + + Args: + outputs (`Tensor`): + The outputs of `PretrainedModel` which class name ends with + `ForMaskedLM`. 
+ Returns: + The prediction outputs over labels (`Tensor`). + """ + outputs = super(ManualVerbalizer, self).process_outputs(outputs, masked_positions) + label_word_outputs = self.project(outputs) + + if self.post_log_softmax: + label_word_outputs = self.normalize(label_word_outputs) + + if self.label_token_weight is not None: + label_word_outputs = self.calibrate(label_word_outputs) + + label_word_outputs = paddle.log(label_word_outputs + 1e-15) + + label_outputs = self.aggregate(label_word_outputs, self.word_mask, self.word_aggregate_type) + label_outputs = self.aggregate_multiple_mask(label_outputs, self.mask_aggregate_type) + return label_outputs + + +class MaskedLMIdentity(nn.Layer): + """ + Identity layer with the same arguments as the last linear layer in + `PretrainedModel` whose name ends with `ForMaskedLM`. + """ + + def __init__(self): + super(MaskedLMIdentity, self).__init__() + + def forward(self, sequence_output, masked_positions=None): + return sequence_output + + +class SoftVerbalizer(Verbalizer): + """ + SoftVerbalizer for the WARP method. + + Args: + label_words (`dict`): + Define the mapping from labels to a single or multiple words. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. + model (`PretrainedModel`): + An instance of PretrainedModel with class name ends with `ForMaskedLM` + """ + + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, model: PretrainedModel, **kwargs): + super(SoftVerbalizer, self).__init__(label_words=label_words, tokenizer=tokenizer, model=model, **kwargs) + del self.model + setattr(model, self.head_name[0], MaskedLMIdentity()) + + def create_parameters(self): + # Only the first word used for initialization. + if self.token_ids.shape[1] != 1: + logger.warning("Only the first word for each label is used for" " initialization.") + index = paddle.to_tensor([0]) + self.token_ids = paddle.index_select(self.token_ids, index, axis=1) + self.token_mask = paddle.index_select(self.token_mask, index, axis=1) + self.word_mask = paddle.ones([len(self.labels), 1]) + self._extract_head(self.model) + + def process_outputs(self, outputs: Tensor, masked_positions: Tensor = None): + outputs = super(SoftVerbalizer, self).process_outputs(outputs, masked_positions) + return self.head(outputs).squeeze(1) + + def head_parameters(self): + # possible head parameters: decoder.weight, decoder_bias, bias + return [(n, p) for n, p in self.head.named_parameters() if self.head_name[-1] in n or n == "bias"] + + def non_head_parameters(self): + return [(n, p) for n, p in self.head.named_parameters() if self.head_name[-1] not in n and n != "bias"] + + def _extract_head(self, model: PretrainedModel): + # Find the nn.Linear layer with in_features = vocab_size + module_name = None + for i in model.named_sublayers(): + if isinstance(i[1], TransposedLinear): + module_name = i[0] + break + if module_name is None: + raise ValueError("Can not find output layer, make sure type of the input model is AutoModelForMaskedLM.") + + # recursively get the parent module to the decoder linear layer + parent_module = model + attribute_chain = module_name.split(".") + for name in attribute_chain[:-1]: + parent_module = getattr(parent_module, name) + self.head = copy.deepcopy(parent_module) + + # replace the decoder linear layer with a linear linear with the trimmed vocab size + # we create a new decoder linear here instead of `resize_token_embeddings` because we only want to change the output embeddings + # this also invalidates 
any previous tie_weights + self.head_name = attribute_chain + module_name = attribute_chain[-1] + module = getattr(self.head, module_name) + # modify weight + module_weight = module.weight + module_bias = module.bias + selected_weight = self._create_init_weight(module_weight) + selected_bias = self._create_init_weight(module_bias, is_bias=True) + setattr( + self.head, module_name, TransposedLinear(in_features=module.weight.shape[1], out_features=len(self.labels)) + ) + getattr(self.head, module_name).weight.set_value(selected_weight.T) + getattr(self.head, module_name).bias.set_value(selected_bias) + + def _create_init_weight(self, weight: Tensor, is_bias: bool = False): + token_ids = self.token_ids.squeeze(1) + token_mask = self.token_mask.squeeze(1) + aggr_type = self.token_aggregate_type + if is_bias: + bias = paddle.index_select(weight, token_ids.reshape([-1]), axis=0).reshape(token_ids.shape) + bias = self.aggregate(bias, token_mask, aggr_type) + return bias + else: + word_shape = [weight.shape[1], *token_ids.shape] + weight = paddle.index_select(weight, token_ids.reshape([-1]), axis=0).reshape(word_shape) + weight = self.aggregate(weight, token_mask, aggr_type) + return weight + + +class MaskedLMVerbalizer(Verbalizer): + """ + MaskedLMVerbalizer defines mapping from labels to words manually and supports + multiple masks corresponding to multiple tokens in words. + + Args: + label_words (`dict`): + Define the mapping from labels to a single word. Only the first word + is used if multiple words are defined. + tokenizer (`PretrainedTokenizer`): + An instance of PretrainedTokenizer for label word tokenization. + """ + + def __init__(self, label_words: Dict, tokenizer: PretrainedTokenizer, **kwargs): + label_words = self.check_label_words_constraint(label_words) + super(MaskedLMVerbalizer, self).__init__(label_words=label_words, tokenizer=tokenizer, **kwargs) + + def create_parameters(self): + return None + + def check_label_words_constraint(self, label_words: Dict): + assert isinstance(label_words, dict), "`label_words` mapping should be a dictionary." 
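+        # Note: each label word is later matched against the same number of [MASK]
+        # predictions, one per token, so the (string) lengths checked below must be
+        # identical across labels; only the first word per label is kept.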
+ std_label_words = {} + for label, word in label_words.items(): + if isinstance(word, str): + word = [word] + if len(word) > 1: + word = word[:1] + logger.info(f"More than one word for label `{label}`, only `{word[0]}` used.") + std_label_words[label] = word + word_length = [len(w[0]) for l, w in std_label_words.items()] + if len(set(word_length)) > 1: + raise ValueError(f"Length of all words for labels should be equal, but received {std_label_words}.") + return std_label_words + + def aggregate_multiple_mask(self, outputs: Tensor, atype: str = "product"): + assert outputs.ndim == 3 + token_ids = self.token_ids[:, 0, :].T + batch_size, num_token, num_pred = outputs.shape + results = paddle.index_select(outputs[:, 0, :], token_ids[0], axis=1) + if atype == "first": + return results + + for index in range(1, num_token): + sub_results = paddle.index_select(outputs[:, index, :], token_ids[index], axis=1) + if atype in ("mean", "sum"): + results += sub_results + elif atype == "product": + results *= sub_results + elif atype == "max": + results = paddle.stack([results, sub_results], axis=-1) + results = results.max(axis=-1) + else: + raise ValueError("Strategy {} is not supported to aggregate multiple tokens.".format(atype)) + if atype == "mean": + results = results / num_token + return results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/__init__.py new file mode 100644 index 000000000..ffc995a94 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .quantization_config import QuantizationConfig diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/qlora.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/qlora.py new file mode 100644 index 000000000..6795b13d5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/qlora.py @@ -0,0 +1,115 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
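+
+# Overview: this module wraps the blockwise quantization kernels from PaddleSlim.
+# `qlora_weight_quantize` packs a floating-point weight into NF4/FP4 blocks of
+# `block_size` values with one scale per block; with `double_quant=True` the
+# per-block scales are themselves quantized to dynamic FP8 in blocks of
+# `double_quant_block_size`, keeping only an offset and a second-level scale.
+# Illustrative use (the bf16 weight `w` and the name "linear_0" are assumed for
+# the example):
+#     state = qlora_weight_quantize(w, quant_algo="nf4", double_quant=True,
+#                                   linear_name="linear_0")
+#     # state holds "linear_0.quant_weight", "linear_0.qquant_scale",
+#     # "linear_0.double_quant_scale" and "linear_0.quant_sacle_offset".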
+ +import paddle +from paddleslim.lc.quantizers.quant_func import dequantize_8bit, quantize_8bit +from paddleslim_ops import dequant_blockwise, quant_blockwise + + +def qlora_weight_quantize( + weight, + quant_algo="nf4", + double_quant=False, + block_size=64, + double_quant_block_size=256, + linear_name=None, + return_dict=True, +): + quant_weight, quant_scale = quant_blockwise(weight, None, blocksize=block_size, quant_type=quant_algo) + if double_quant: + quant_sacle_offset = quant_scale.mean() + quant_scale -= quant_sacle_offset + qquant_scale, double_quant_scale = quantize_8bit( + quant_scale, None, double_quant_block_size, quant_type="dynamic_fp8" + ) + if not return_dict: + return quant_weight, (qquant_scale, double_quant_scale, quant_sacle_offset) + qquant_scale_name = f"{linear_name}.qquant_scale" if linear_name else "qquant_scale" + double_quant_scale_name = f"{linear_name}.double_quant_scale" if linear_name else "double_quant_scale" + quant_sacle_offset_name = f"{linear_name}.quant_sacle_offset" if linear_name else "quant_sacle_offset" + qlora_state_dict = { + qquant_scale_name: qquant_scale, + double_quant_scale_name: double_quant_scale, + quant_sacle_offset_name: quant_sacle_offset, + } + else: + quant_scale_name = f"{linear_name}.quant_scale" if linear_name else "quant_scale" + qlora_state_dict = {quant_scale_name: quant_scale} + if not return_dict: + return quant_weight, (quant_scale) + quant_weight_name = f"{linear_name}.quant_weight" if linear_name else "quant_weight" + qlora_state_dict[quant_weight_name] = quant_weight + return qlora_state_dict + + +def qlora_weight_dequantize( + quant_weight, quant_algo, state, double_quant=False, block_size=64, double_quant_block_size=256 +): + if double_quant: + qquant_scale, double_quant_scale, quant_sacle_offset = state + quant_scale = dequantize_8bit( + qquant_scale, None, double_quant_scale, double_quant_block_size, quant_type="dynamic_fp8" + ) + quant_scale += quant_sacle_offset + else: + quant_scale = state + out = dequant_blockwise(quant_weight, None, quant_scale, blocksize=block_size, quant_type=quant_algo) + return out + + +def qlora_weight_quantize_dequantize( + weight, quant_algo="nf4", double_quant=False, block_size=64, double_quant_block_size=256 +): + dtype = weight.dtype + quant_weight, state = qlora_weight_quantize( + weight=weight, + quant_algo=quant_algo, + double_quant=double_quant, + block_size=block_size, + double_quant_block_size=double_quant_block_size, + return_dict=False, + ) + quant_dequant_weight = ( + qlora_weight_dequantize( + quant_weight=quant_weight, + quant_algo=quant_algo, + state=state, + double_quant=double_quant, + block_size=block_size, + double_quant_block_size=double_quant_block_size, + ) + .reshape(weight.shape) + .cast(dtype) + ) + return quant_dequant_weight + + +def qlora_weight_linear( + x, + quant_weight, + dtype, + state, + quant_algo="nf4", + double_quant=False, + block_size=64, + double_quant_block_size=256, + bias=None, +): + weight = ( + qlora_weight_dequantize(quant_weight, quant_algo, state, double_quant, block_size, double_quant_block_size) + .cast(dtype) + .reshape([x.shape[-1], -1]) + ) + out = paddle.nn.functional.linear(x, weight, bias) + return out diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_config.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_config.py new file mode 100644 index 000000000..f5b04e188 --- /dev/null +++ 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_config.py @@ -0,0 +1,150 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import json
+from dataclasses import dataclass
+
+quant_inference_mapping = {"avg": "abs_max", "abs_max_channel_wise": "abs_max_channel_wise", "abs_max": "abs_max"}
+
+
+@dataclass
+class QuantizationConfig:
+    """
+    This is the configuration class to store quantization configuration.
+    Args:
+        weight_quantize_algo: Weight quantization algorithm.
+        quant_type: Quantization type applied to weight and activation; the weight may still be kept as a float tensor.
+        shift: Whether the model applies the shift strategy.
+        smooth: Whether the model applies the smooth strategy.
+        shift_smooth_all_linears: Whether the model applies the shift or smooth strategy to all linear layers.
+        quant_round_type: The rounding type, 0: round to nearest, ties to even; 1: round to nearest, ties away from zero.
+        llm_int8_threshold: The outlier threshold for llm.int8 quantization.
+        weight_double_quant: Whether to also quantize the weight scales (double quantization).
+        weight_blocksize: Block size for weight quantization.
+        weight_double_quant_block_size: Block size for quantizing the weight quant_scale.
+        weight_quant_method: The method for weight quantization.
+        act_quant_method: The method for activation quantization.
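+
+    Example (illustrative):
+        ``QuantizationConfig(weight_quantize_algo="weight_only_int8")`` selects 8-bit
+        weight-only quantization; ``QuantizationConfig(weight_quantize_algo="nf4",
+        weight_double_quant=True)`` selects QLoRA-style NF4 weights with
+        double-quantized block scales.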
+ """ + + def __init__( + self, + weight_quantize_algo=None, + quant_type=None, + shift=False, + smooth=False, + shift_smooth_all_linears=False, + quant_round_type=0, + llm_int8_threshold=6.0, + weight_double_quant=False, + weight_blocksize=64, + weight_double_quant_block_size=256, + weight_quant_method="abs_max_channel_wise", + act_quant_method="abs_max", + ): + if weight_quantize_algo is not None and weight_quantize_algo not in [ + "weight_only_int8", + "weight_only_int4", + "llm.int8", + "a8w8", + "nf4", + "fp4", + ]: + raise ValueError( + f"weight_quantize_algo:{weight_quantize_algo} not in supported list ['weight_only_int8', 'weight_only_int4', 'llm.int8', 'a8w8', 'nf4', 'fp4']" + ) + if quant_type is not None and quant_type not in ["weight_only_int8", "weight_only_int4", "a8w8", "a8w8c8"]: + raise ValueError( + f"quant_type:{quant_type} not in supported list ['weight_only_int8', 'weight_only_int4', 'a8w8', 'a8w8c8']" + ) + self.weight_quantize_algo = weight_quantize_algo + self.quant_type = quant_type + self.shift = shift + self.smooth = smooth + self.shift = shift + self.shift_smooth_all_linears = shift_smooth_all_linears + self.quant_round_type = quant_round_type + self.llm_int8_threshold = llm_int8_threshold + self.weight_double_quant = weight_double_quant + self.weight_blocksize = weight_blocksize + self.weight_quant_method = weight_quant_method + self.act_quant_method = quant_inference_mapping[act_quant_method] + self.weight_double_quant_block_size = weight_double_quant_block_size + + def is_weight_quantize(self): + if self.weight_quantize_algo in ["weight_only_int8", "weight_only_int4", "llm.int8", "nf4", "fp4", "a8w8"]: + return True + else: + return False + + def is_support_merge_tensor_parallel(self): + if self.weight_quantize_algo in ["weight_only_int8", "weight_only_int4", "llm.int8", "a8w8"]: + return False + else: + return True + + @classmethod + def from_dict(cls, config_dict, return_unused_kwargs=False, **kwargs): + """ + Instantiates QuantizationConfig from dict + """ + config = cls(**config_dict) + + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + if return_unused_kwargs: + return config, kwargs + else: + return config + + def to_json_file(self, json_file_path): + """ + Save this instance to a JSON file. 
+ """ + with open(json_file_path, "w", encoding="utf-8") as f: + f.write(json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n") + + def to_dict(self): + return copy.deepcopy(self.__dict__) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_json_string(self, use_diff=True): + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_diff_dict(self): + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = QuantizationConfig().to_dict() + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if value != default_config_dict[key]: + serializable_config_dict[key] = value + + return serializable_config_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_linear.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_linear.py new file mode 100644 index 000000000..46a9a733b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_linear.py @@ -0,0 +1,401 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddle.distributed.fleet.base import topology as tp +from paddle.distributed.fleet.layers.mpu import mp_ops +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker + +try: + from paddle.nn.quant import llm_int8_linear, weight_only_linear +except: + llm_int8_linear = None + weight_only_linear = None +try: + from .qlora import qlora_weight_linear +except: + qlora_weight_linear = None + + +QuantMapping = { + # (quant_dtype, quant_weight_dtype, quant_weight_bit) + "weight_only_int8": ("int8", "int8", 8), + "weight_only_int4": ("int4", "int8", 4), + "llm.int8": ("int8", "int8", 8), + "fp4": ("fp4", "uint8", 4), + "nf4": ("nf4", "uint8", 4), +} + + +class QuantizationLinear(nn.Layer): + """Quantization Linear layer.""" + + def __init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr=None, + scale_attr=None, + bias_attr=None, + block_size=64, + double_quant_block_size=256, + double_quant=False, + qquant_scale_attr=None, + double_quant_scale_attr=None, + quant_scale_offset_attr=None, + quant_scale_attr=None, + llm_int8_threshold=6.0, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.quant_algo = quant_algo + self.quant_dtype, self.quant_weight_dtype, self.quant_weight_bit = QuantMapping[self.quant_algo] + self._dtype = dtype + self.llm_int8_threshold = llm_int8_threshold + self.block_size = block_size + self.double_quant_block_size = double_quant_block_size + self.double_quant = double_quant + + # PaddlePaddle dosen't support 4bit data type, one 8bit data represents two 4bit data. 
+ # paddle.nn.quant.weight_quantize will transpose in_features and out_features. + if self.quant_algo in ["weight_only_int8", "weight_only_int4", "llm.int8"]: + self.quant_weight = self.create_parameter( + shape=[out_features // 2, in_features] if self.quant_weight_bit == 4 else [out_features, in_features], + attr=weight_attr if weight_attr else paddle.nn.initializer.Constant(value=0), + dtype=self.quant_weight_dtype, + is_bias=False, + ) + self.quant_scale = self.create_parameter( + shape=[out_features], + attr=scale_attr, + dtype=self._dtype, + is_bias=False, + ) + if self.quant_algo in ["fp4", "nf4"]: + if qlora_weight_linear is None: + raise ImportError( + "Please run the following commands to install: qlora related package first\n" + "1) git clone https://github.com/PaddlePaddle/PaddleSlim \n" + "2) cd PaddleSlim && pip install -e .\n" + "3) cd csrc && python ./setup_cuda.py install" + ) + self.quant_weight = self.create_parameter( + shape=[out_features * in_features // 2, 1], + attr=weight_attr if weight_attr else paddle.nn.initializer.Constant(value=0), + dtype=self.quant_weight_dtype, + is_bias=False, + ) + if self.double_quant: + # quantized quant_scale + self.qquant_scale = self.create_parameter( + shape=[in_features * out_features // self.block_size], + attr=qquant_scale_attr if qquant_scale_attr else paddle.nn.initializer.Constant(value=0), + dtype="uint8", + is_bias=False, + ) + # double quant_scale: quant_scale of quantized quant_scale + self.double_quant_scale = self.create_parameter( + shape=[in_features * out_features // self.block_size // self.double_quant_block_size], + attr=double_quant_scale_attr, + dtype="float32", + is_bias=False, + ) + self.quant_scale_offset = self.create_parameter( + shape=[], + attr=quant_scale_offset_attr, + dtype="float32", + is_bias=False, + ) + else: + self.quant_scale = self.create_parameter( + shape=[in_features * out_features // self.block_size], + attr=quant_scale_attr if quant_scale_attr else paddle.nn.initializer.Constant(value=0), + dtype="float32", + is_bias=False, + ) + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + shape=[out_features], + attr=bias_attr, + dtype=self._dtype, + is_bias=True, + ) + + def forward(self, x): + with paddle.amp.auto_cast(enable=False): + if self.quant_algo in ["weight_only_int8", "weight_only_int4"]: + out = weight_only_linear(x, self.quant_weight, self.bias, self.quant_scale, self.quant_dtype) + elif self.quant_algo in ["llm.int8"]: + out = llm_int8_linear(x, self.quant_weight, self.bias, self.quant_scale, self.llm_int8_threshold) + elif self.quant_algo in ["fp4", "nf4"]: + out = qlora_weight_linear( + x=x, + quant_weight=self.quant_weight, + dtype=self._dtype, + state=(self.qquant_scale, self.double_quant_scale, self.quant_scale_offset) + if self.double_quant + else self.quant_scale, + quant_algo=self.quant_algo, + double_quant=self.double_quant, + block_size=self.block_size, + double_quant_block_size=self.double_quant_block_size, + bias=self.bias, + ) + return out + + +class ColumnParallelQuantizationLinear(nn.Layer): + """Quantization Linear layer with mp parallelized(column). + The code implementation refers to paddle.distributed.fleet.meta_parallel.ColumnParallelLinear. 
+ https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/distributed/fleet/layers/mpu/mp_layers.py#L310 + Different from ColumnParallelLinear, this class keeps weight in INT8/INT4 with quant scale, and supports matrix + multiplication(weight_only_linear/llm_int8_linear) for input tensor(fp16/bf16) and quantized weight(INT8/INT4) + and bias addition if provided. + Notice: quantized weight shape is transposed of weight shape in ColumnParallelLinear. + """ + + def __init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr=None, + scale_attr=None, + bias_attr=None, + gather_output=True, + mp_group=None, + llm_int8_threshold=6.0, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.quant_algo = quant_algo + self.quant_dtype, self.quant_weight_dtype, self.quant_weight_bit = QuantMapping[self.quant_algo] + self._dtype = dtype + self.llm_int8_threshold = llm_int8_threshold + + self.model_parallel_group = ( + tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group() if mp_group is None else mp_group + ) + self.world_size = ( + tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size() if mp_group is None else mp_group.nranks + ) + self.is_mp = self.world_size > 1 + self.gather_output = gather_output + self.output_size_per_partition = out_features // self.world_size + + # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data. + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.quant_weight = self.create_parameter( + shape=[self.output_size_per_partition // 2, in_features] + if self.quant_dtype == "int4" + else [self.output_size_per_partition, in_features], + attr=weight_attr if weight_attr else paddle.nn.initializer.Constant(value=0), + dtype="int8", + is_bias=False, + ) + else: + self.quant_weight = self.create_parameter( + shape=[self.output_size_per_partition // 2, in_features] + if self.quant_dtype == "int4" + else [self.output_size_per_partition, in_features], + attr=weight_attr if weight_attr else paddle.nn.initializer.Constant(value=0), + dtype="int8", + is_bias=False, + ) + + self.quant_weight.is_distributed = True if self.is_mp else False + if self.quant_weight.is_distributed: + self.quant_weight.split_axis = 0 + + self.quant_scale = self.create_parameter( + shape=[self.output_size_per_partition], + attr=scale_attr, + dtype=self._dtype, + is_bias=False, + ) + self.quant_scale.is_distributed = True if self.is_mp else False + if self.quant_scale.is_distributed: + self.quant_scale.split_axis = 0 + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + shape=[self.output_size_per_partition], + attr=bias_attr if bias_attr else paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True, + ) + self.bias.is_distributed = True if self.is_mp else False + if self.bias.is_distributed: + self.bias.split_axis = 0 + + def forward(self, x): + if self.is_mp: + input_parallel = mp_ops._c_identity(x, group=self.model_parallel_group) + else: + input_parallel = x + + with paddle.amp.auto_cast(enable=False): + if "weight_only" in self.quant_algo: + output_parallel = weight_only_linear( + input_parallel, self.quant_weight, self.bias, self.quant_scale, self.quant_dtype + ) + else: + output_parallel = llm_int8_linear( + input_parallel, self.quant_weight, self.bias, self.quant_scale, self.llm_int8_threshold + ) + + if self.gather_output and self.is_mp: + output = mp_ops._c_concat(output_parallel, 
group=self.model_parallel_group) + else: + output = output_parallel + return output + + +class RowParallelQuantizationLinear(nn.Layer): + """Quantization Linear layer with mp parallelized(row). + The code implementation refers to paddle.distributed.fleet.meta_parallel.RowParallelLinear. + https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/distributed/fleet/layers/mpu/mp_layers.py#L517 + Different from RowParallelLinear, this class keeps weight in INT8/INT4 with quant scale, and supports matrix + multiplication(weight_only_linear/llm_int8_linear) for input tensor(fp16/bf16) and quantized weight(INT8/INT4) + and bias addition if provided. + Notice: quantized weight shape is transposed of weight shape in RowParallelLinear. + """ + + def __init__( + self, + in_features, + out_features, + quant_algo, + dtype, + weight_attr=None, + scale_attr=None, + bias_attr=None, + input_is_parallel=False, + mp_group=None, + llm_int8_threshold=6.0, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.quant_algo = quant_algo + self.quant_dtype, self.quant_weight_dtype, self.quant_weight_bit = QuantMapping[self.quant_algo] + self._dtype = dtype + self.llm_int8_threshold = llm_int8_threshold + + self.model_parallel_group = ( + tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group() if mp_group is None else mp_group + ) + self.world_size = ( + tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size() if mp_group is None else mp_group.nranks + ) + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() if mp_group is None else mp_group.rank + self.is_mp = self.world_size > 1 + self.input_is_parallel = input_is_parallel + self.input_size_per_partition = in_features // self.world_size + + # PaddlePaddle dosen't support Int4 data type, one Int8 data represents two Int4 data. + # paddle.nn.quant.weight_quantize will transpose in_features and out_features. 
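+        # Note: each rank holds a weight shard of shape
+        # [out_features // 2 if int4 else out_features, in_features // world_size];
+        # unless input_is_parallel is set, forward() splits the input along its last
+        # axis and the partial results are summed with a model-parallel all-reduce.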
+ if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.quant_weight = self.create_parameter( + shape=[out_features // 2, self.input_size_per_partition] + if self.quant_dtype == "int4" + else [out_features, self.input_size_per_partition], + attr=weight_attr if weight_attr else paddle.nn.initializer.Constant(value=0), + dtype="int8", + is_bias=False, + ) + else: + self.quant_weight = self.create_parameter( + shape=[out_features // 2, self.input_size_per_partition] + if self.quant_dtype == "int4" + else [out_features, self.input_size_per_partition], + attr=weight_attr if weight_attr else paddle.nn.initializer.Constant(value=0), + dtype="int8", + is_bias=False, + ) + + self.quant_weight.is_distributed = True if self.is_mp else False + if self.quant_weight.is_distributed: + self.quant_weight.split_axis = 1 + + self.quant_scale = self.create_parameter( + shape=[out_features], + attr=scale_attr, + dtype=self._dtype, + is_bias=False, + ) + self.quant_scale.is_distributed = True if self.is_mp else False + if self.quant_scale.is_distributed: + self.quant_scale.split_axis = 0 + + if bias_attr is False: + self.bias = None + else: + self.bias = self.create_parameter( + shape=[out_features], + attr=bias_attr if bias_attr else paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True, + ) + + def forward(self, x): + if self.input_is_parallel or (not self.is_mp): + input_parallel = x + else: + # split last dim + input_parallel = mp_ops._c_split(x, group=self.model_parallel_group) + + if self.is_mp: + with paddle.amp.auto_cast(enable=False): + if "weight_only" in self.quant_algo: + output_parallel = weight_only_linear( + input_parallel, self.quant_weight, None, self.quant_scale, self.quant_dtype + ) + else: + output_parallel = llm_int8_linear( + input_parallel, self.quant_weight, None, self.quant_scale, self.llm_int8_threshold + ) + + output_ = mp_ops._mp_allreduce( + output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True, + ) + output = output_ + self.bias if self.bias is not None else output_ + + else: + with paddle.amp.auto_cast(enable=False): + if "weight_only" in self.quant_algo: + output = weight_only_linear( + input_parallel, self.quant_weight, self.bias, self.quant_scale, self.quant_dtype + ) + else: + output = llm_int8_linear( + input_parallel, self.quant_weight, self.bias, self.quant_scale, self.llm_int8_threshold + ) + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_utils.py new file mode 100644 index 000000000..fe46efd2a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/quantization/quantization_utils.py @@ -0,0 +1,197 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
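+
+# Overview: `replace_with_quantization_linear` recursively walks a model and swaps
+# nn.Linear / ColumnParallelLinear / RowParallelLinear sublayers for their quantized
+# counterparts, returning the dotted names of the replaced layers;
+# `convert_to_quantize_state_dict` then rewrites a float state dict into quantized
+# weights and scales for those layers. Illustrative use (assuming `model`,
+# `state_dict` and a `QuantizationConfig` named `qcfg` already exist):
+#     names = replace_with_quantization_linear(model, qcfg)
+#     state_dict = convert_to_quantize_state_dict(state_dict, names, qcfg, "float16")
+#     model.set_state_dict(state_dict)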
+ +import gc + +import paddle +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, +) +from paddle.nn.quant import weight_quantize + +from ..utils.log import logger +from .quantization_linear import ( + ColumnParallelQuantizationLinear, + QuantizationLinear, + RowParallelQuantizationLinear, +) + +try: + from .qlora import qlora_weight_quantize +except: + qlora_weight_quantize = None + + +def replace_with_quantization_linear(model, quantization_config, name_prefix="", llm_int8_threshold=6.0): + quantization_linear_list = [] + for name, child in model.named_children(): + if isinstance(child, nn.Linear): + if child.bias is None: + bias_attr = False + else: + bias_attr = None + + model._sub_layers[name] = QuantizationLinear( + child.weight.shape[0], + child.weight.shape[1], + quantization_config.weight_quantize_algo, + child._dtype, + bias_attr=bias_attr, + llm_int8_threshold=llm_int8_threshold, + block_size=quantization_config.weight_blocksize, + double_quant_block_size=quantization_config.weight_double_quant_block_size, + double_quant=quantization_config.weight_double_quant, + ) + del child + quantization_linear_list.append(name_prefix + name) + elif isinstance(child, ColumnParallelLinear): + if child.bias is None: + bias_attr = False + else: + bias_attr = None + model._sub_layers[name] = ColumnParallelQuantizationLinear( + child.weight.shape[0], + child.weight.shape[1] * child.world_size, + quantization_config.weight_quantize_algo, + child._dtype, + bias_attr=bias_attr, + gather_output=child.gather_output, + llm_int8_threshold=llm_int8_threshold, + ) + del child + quantization_linear_list.append(name_prefix + name) + elif isinstance(child, RowParallelLinear): + if child.bias is None: + bias_attr = False + else: + bias_attr = None + model._sub_layers[name] = RowParallelQuantizationLinear( + child.weight.shape[0] * child.world_size, + child.weight.shape[1], + quantization_config.weight_quantize_algo, + child._dtype, + bias_attr=bias_attr, + input_is_parallel=child.input_is_parallel, + llm_int8_threshold=llm_int8_threshold, + ) + del child + quantization_linear_list.append(name_prefix + name) + else: + quantization_linear_list += replace_with_quantization_linear( + child, quantization_config, name_prefix + name + ".", llm_int8_threshold + ) + + gc.collect() + return quantization_linear_list + + +def convert_to_quantize_state_dict_with_check(state_dict, quantization_linear_list, quant_algo, dtype): + for name in quantization_linear_list: + weight_name = name + ".weight" + quant_weight_name = name + ".quant_weight" + quant_scale_name = name + ".quant_scale" + + if quant_weight_name in state_dict and quant_scale_name in state_dict: + if state_dict[quant_weight_name].dtype != paddle.int8: + raise ValueError( + f"{quant_weight_name} should be {paddle.int8} in state_dict but received dtype {state_dict[quant_weight_name].dtype}." + ) + if ( + state_dict[quant_scale_name].dtype != paddle.float16 + and state_dict[quant_scale_name].dtype != paddle.bfloat16 + ): + raise ValueError( + f"{quant_scale_name} should be {paddle.float16} or {paddle.bfloat16} in state_dict but received dtype {state_dict[quant_scale_name].dtype}." 
+ ) + elif weight_name in state_dict: + target_weight = state_dict.pop(weight_name).cast(dtype) + quant_weight, quant_scale = weight_quantize(target_weight, quant_algo) + state_dict[quant_weight_name] = quant_weight + state_dict[quant_scale_name] = quant_scale + del target_weight + gc.collect() + return state_dict + + +def convert_to_quantize_state_dict_without_check(state_dict, quantization_linear_list, quantization_config, dtype): + if qlora_weight_quantize is None: + raise ImportError( + "Please run the following commands to install qlora related package first: \n" + "1) git clone https://github.com/PaddlePaddle/PaddleSlim \n" + "2) cd PaddleSlim \n" + "3) python ./csrc/setup_cuda.py install" + ) + for name in quantization_linear_list: + weight_name = name + ".weight" + if weight_name in state_dict: + target_weight = state_dict.pop(weight_name).cast(dtype).cuda() + qlora_state_dict = qlora_weight_quantize( + weight=target_weight, + quant_algo=quantization_config.weight_quantize_algo, + double_quant=quantization_config.weight_double_quant, + block_size=quantization_config.weight_blocksize, + double_quant_block_size=quantization_config.weight_double_quant_block_size, + linear_name=name, + return_dict=True, + ) + state_dict.update(qlora_state_dict) + del target_weight + gc.collect() + paddle.device.cuda.empty_cache() + return state_dict + + +def convert_to_quantize_state_dict(state_dict, quantization_linear_list, quantization_config, dtype): + if quantization_config.weight_quantize_algo in ["weight_only_int8", "weight_only_int4", "llm.int8"]: + return convert_to_quantize_state_dict_with_check( + state_dict, quantization_linear_list, quantization_config.weight_quantize_algo, dtype + ) + elif quantization_config.weight_quantize_algo in ["fp4", "nf4"]: + return convert_to_quantize_state_dict_without_check( + state_dict, quantization_linear_list, quantization_config, dtype + ) + else: + raise NotImplementedError( + f"Please check the quantization_config.weight_quantize_algo: {quantization_config.weight_quantize_algo}" + ) + + +def update_loaded_state_dict_keys(state_dict, quantization_linear_list, quantization_config): + for name in quantization_linear_list: + weight_name = name + ".weight" + quant_weight_name = name + ".quant_weight" + quant_scale_name = name + ".quant_scale" + qquant_scale_name = name + ".qquant_scale" + double_quant_scale_name = name + ".double_quant_scale" + quant_sacle_offset_name = name + ".quant_sacle_offset" + + if quant_weight_name in state_dict and quant_scale_name in state_dict: + continue + elif weight_name in state_dict: + state_dict.remove(weight_name) + state_dict.append(quant_weight_name) + if quantization_config.weight_double_quant: + state_dict.append(qquant_scale_name) + state_dict.append(double_quant_scale_name) + state_dict.append(quant_sacle_offset_name) + else: + state_dict.append(quant_scale_name) + else: + logger.warning( + f"Cannot find {weight_name} in state_dict or {quant_weight_name} and {quant_scale_name} in state_dict" + ) + + return state_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/__init__.py new file mode 100644 index 000000000..9645655ac --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .encoder import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/encoder.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/encoder.py new file mode 100644 index 000000000..0e5a7c1af --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/seq2vec/encoder.py @@ -0,0 +1,997 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.utils import weight_norm + +__all__ = ["BoWEncoder", "CNNEncoder", "GRUEncoder", "LSTMEncoder", "RNNEncoder", "TCNEncoder"] + + +class BoWEncoder(nn.Layer): + r""" + A `BoWEncoder` takes as input a sequence of vectors and returns a + single vector, which simply sums the embeddings of a sequence across the time dimension. + The input to this encoder is of shape `(batch_size, num_tokens, emb_dim)`, + and the output is of shape `(batch_size, emb_dim)`. + + Args: + emb_dim(int): + The dimension of each vector in the input sequence. + + Example: + .. 
code-block:: + + import paddle + import paddle.nn as nn + import paddlenlp as nlp + + class BoWModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + hidden_size=128, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + vocab_size, emb_dim, padding_idx=padding_idx) + self.bow_encoder = nlp.seq2vec.BoWEncoder(emb_dim) + self.fc1 = nn.Linear(self.bow_encoder.get_output_dim(), hidden_size) + self.fc2 = nn.Linear(hidden_size, fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + + # Shape: (batch_size, embedding_dim) + summed = self.bow_encoder(embedded_text) + encoded_text = paddle.tanh(summed) + + # Shape: (batch_size, hidden_size) + fc1_out = paddle.tanh(self.fc1(encoded_text)) + # Shape: (batch_size, fc_hidden_size) + fc2_out = paddle.tanh(self.fc2(fc1_out)) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc2_out) + return logits + + model = BoWModel(vocab_size=100, num_classes=2) + + text = paddle.randint(low=1, high=10, shape=[1,10], dtype='int32') + logits = model(text) + """ + + def __init__(self, emb_dim): + super().__init__() + self._emb_dim = emb_dim + + def get_input_dim(self): + r""" + Returns the dimension of the vector input for each element in the sequence input + to a `BoWEncoder`. This is not the shape of the input tensor, but the + last element of that shape. + """ + return self._emb_dim + + def get_output_dim(self): + r""" + Returns the dimension of the final vector output by this `BoWEncoder`. This is not + the shape of the returned tensor, but the last element of that shape. + """ + return self._emb_dim + + def forward(self, inputs, mask=None): + r""" + It simply sums the embeddings of a sequence across the time dimension. + + Args: + inputs (Tensor): + Shape as `(batch_size, num_tokens, emb_dim)` and dtype as `float32` or `float64`. + The sequence length of the input sequence. + mask (Tensor, optional): + Shape same as `inputs`. + Its each elements identify whether the corresponding input token is padding or not. + If True, not padding token. If False, padding token. + Defaults to `None`. + + Returns: + Tensor: + Returns tensor `summed`, the result vector of BagOfEmbedding. + Its data type is same as `inputs` and its shape is `[batch_size, emb_dim]`. + """ + if mask is not None: + inputs = inputs * mask + + # Shape: (batch_size, embedding_dim) + summed = inputs.sum(axis=1) + return summed + + +class CNNEncoder(nn.Layer): + r""" + A `CNNEncoder` takes as input a sequence of vectors and returns a + single vector, a combination of multiple convolution layers and max pooling layers. + The input to this encoder is of shape `(batch_size, num_tokens, emb_dim)`, + and the output is of shape `(batch_size, output_dim)` or `(batch_size, len(ngram_filter_sizes) * num_filter)`. + + The CNN has one convolution layer for each ngram filter size. Each convolution operation gives + out a vector of size num_filter. The number of times a convolution layer will be used + is `num_tokens - ngram_size + 1`. The corresponding maxpooling layer aggregates all these + outputs from the convolution layer and outputs the max. + + This operation is repeated for every ngram size passed, and consequently the dimensionality of + the output after maxpooling is `len(ngram_filter_sizes) * num_filter`. 
This then gets + (optionally) projected down to a lower dimensional output, specified by `output_dim`. + + We then use a fully connected layer to project in back to the desired output_dim. For more + details, refer to `A Sensitivity Analysis of (and Practitioners’ Guide to) Convolutional Neural + Networks for Sentence Classification `__ , + Zhang and Wallace 2016, particularly Figure 1. + + Args: + emb_dim(int): + The dimension of each vector in the input sequence. + num_filter(int): + This is the output dim for each convolutional layer, which is the number of "filters" + learned by that layer. + ngram_filter_sizes(Tuple[int], optional): + This specifies both the number of convolutional layers we will create and their sizes. The + default of `(2, 3, 4, 5)` will have four convolutional layers, corresponding to encoding + ngrams of size 2 to 5 with some number of filters. + conv_layer_activation(Layer, optional): + Activation to use after the convolution layers. + Defaults to `paddle.nn.Tanh()`. + output_dim(int, optional): + After doing convolutions and pooling, we'll project the collected features into a vector of + this size. If this value is `None`, we will just return the result of the max pooling, + giving an output of shape `len(ngram_filter_sizes) * num_filter`. + Defaults to `None`. + + Example: + .. code-block:: + + import paddle + import paddle.nn as nn + import paddlenlp as nlp + + class CNNModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + num_filter=128, + ngram_filter_sizes=(3, ), + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + vocab_size, emb_dim, padding_idx=padding_idx) + self.encoder = nlp.seq2vec.CNNEncoder( + emb_dim=emb_dim, + num_filter=num_filter, + ngram_filter_sizes=ngram_filter_sizes) + self.fc = nn.Linear(self.encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + # Shape: (batch_size, len(ngram_filter_sizes)*num_filter) + encoder_out = self.encoder(embedded_text) + encoder_out = paddle.tanh(encoder_out) + # Shape: (batch_size, fc_hidden_size) + fc_out = self.fc(encoder_out) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc_out) + return logits + + model = CNNModel(vocab_size=100, num_classes=2) + + text = paddle.randint(low=1, high=10, shape=[1,10], dtype='int32') + logits = model(text) + """ + + def __init__( + self, + emb_dim, + num_filter, + ngram_filter_sizes=(2, 3, 4, 5), + conv_layer_activation=nn.Tanh(), + output_dim=None, + **kwargs + ): + super().__init__() + self._emb_dim = emb_dim + self._num_filter = num_filter + self._ngram_filter_sizes = ngram_filter_sizes + self._activation = conv_layer_activation + self._output_dim = output_dim + + self.convs = paddle.nn.LayerList( + [ + nn.Conv2D(in_channels=1, out_channels=self._num_filter, kernel_size=(i, self._emb_dim), **kwargs) + for i in self._ngram_filter_sizes + ] + ) + + maxpool_output_dim = self._num_filter * len(self._ngram_filter_sizes) + if self._output_dim: + self.projection_layer = nn.Linear(maxpool_output_dim, self._output_dim) + else: + self.projection_layer = None + self._output_dim = maxpool_output_dim + + def get_input_dim(self): + r""" + Returns the dimension of the vector input for each element in the sequence input + to a `CNNEncoder`. This is not the shape of the input tensor, but the + last element of that shape. 
+ """ + return self._emb_dim + + def get_output_dim(self): + r""" + Returns the dimension of the final vector output by this `CNNEncoder`. This is not + the shape of the returned tensor, but the last element of that shape. + """ + return self._output_dim + + def forward(self, inputs, mask=None): + r""" + The combination of multiple convolution layers and max pooling layers. + + Args: + inputs (Tensor): + Shape as `(batch_size, num_tokens, emb_dim)` and dtype as `float32` or `float64`. + Tensor containing the features of the input sequence. + mask (Tensor, optional): + Shape should be same as `inputs` and dtype as `int32`, `int64`, `float32` or `float64`. + Its each elements identify whether the corresponding input token is padding or not. + If True, not padding token. If False, padding token. + Defaults to `None`. + + Returns: + Tensor: + Returns tensor `result`. + If output_dim is None, the result shape is of `(batch_size, output_dim)` and + dtype is `float`; If not, the result shape is of `(batch_size, len(ngram_filter_sizes) * num_filter)`. + + """ + if mask is not None: + inputs = inputs * mask + + # Shape: (batch_size, 1, num_tokens, emb_dim) = (N, C, H, W) + inputs = inputs.unsqueeze(1) + + # If output_dim is None, result shape of (batch_size, len(ngram_filter_sizes) * num_filter)); + # else, result shape of (batch_size, output_dim). + convs_out = [self._activation(conv(inputs)).squeeze(3) for conv in self.convs] + maxpool_out = [F.adaptive_max_pool1d(t, output_size=1).squeeze(2) for t in convs_out] + result = paddle.concat(maxpool_out, axis=1) + + if self.projection_layer is not None: + result = self.projection_layer(result) + return result + + +class GRUEncoder(nn.Layer): + r""" + A GRUEncoder takes as input a sequence of vectors and returns a + single vector, which is a combination of multiple `paddle.nn.GRU + `__ subclass. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`, + The output is of shape `(batch_size, hidden_size * 2)` if GRU is bidirection; + If not, output is of shape `(batch_size, hidden_size)`. + + Paddle's GRU have two outputs: the hidden state for every time step at last layer, + and the hidden state at the last time step for every layer. + If `pooling_type` is not None, we perform the pooling on the hidden state of every time + step at last layer to create a single vector. If None, we use the hidden state + of the last time step at last layer as a single output (shape of `(batch_size, hidden_size)`); + And if direction is bidirection, the we concat the hidden state of the last forward + gru and backward gru layer to create a single vector (shape of `(batch_size, hidden_size * 2)`). + + Args: + input_size (int): + The number of expected features in the input (the last dimension). + hidden_size (int): + The number of features in the hidden state. + num_layers (int, optional): + Number of recurrent layers. + E.g., setting num_layers=2 would mean stacking two GRUs together to form a stacked GRU, + with the second GRU taking in outputs of the first GRU and computing the final results. + Defaults to 1. + direction (str, optional): + The direction of the network. It can be "forward" and "bidirect" + (it means bidirection network). If "bidirect", it is a bidirectional GRU, + and returns the concat output from both directions. + Defaults to "forward". + dropout (float, optional): + If non-zero, introduces a Dropout layer on the outputs of each GRU layer + except the last layer, with dropout probability equal to dropout. + Defaults to 0.0. 
+ pooling_type (str, optional): + If `pooling_type` is None, then the GRUEncoder will return the hidden state of + the last time step at last layer as a single vector. + If pooling_type is not None, it must be one of "sum", "max" and "mean". + Then it will be pooled on the GRU output (the hidden state of every time + step at last layer) to create a single vector. + Defaults to `None` + + Example: + .. code-block:: + + import paddle + import paddle.nn as nn + import paddlenlp as nlp + + class GRUModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + gru_hidden_size=198, + direction='forward', + gru_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=emb_dim, + padding_idx=padding_idx) + self.gru_encoder = nlp.seq2vec.GRUEncoder( + emb_dim, + gru_hidden_size, + num_layers=gru_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type) + self.fc = nn.Linear(self.gru_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + # Shape: (batch_size, num_tokens, num_directions*gru_hidden_size) + # num_directions = 2 if direction is 'bidirect' + # if not, num_directions = 1 + text_repr = self.gru_encoder(embedded_text, sequence_length=seq_len) + # Shape: (batch_size, fc_hidden_size) + fc_out = paddle.tanh(self.fc(text_repr)) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc_out) + return logits + + model = GRUModel(vocab_size=100, num_classes=2) + + text = paddle.randint(low=1, high=10, shape=[1,10], dtype='int32') + seq_len = paddle.to_tensor([10]) + logits = model(text, seq_len) + """ + + def __init__( + self, input_size, hidden_size, num_layers=1, direction="forward", dropout=0.0, pooling_type=None, **kwargs + ): + super().__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._direction = direction + self._pooling_type = pooling_type + + self.gru_layer = nn.GRU( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + direction=direction, + dropout=dropout, + **kwargs, + ) + + def get_input_dim(self): + r""" + Returns the dimension of the vector input for each element in the sequence input + to a `GRUEncoder`. This is not the shape of the input tensor, but the + last element of that shape. + """ + return self._input_size + + def get_output_dim(self): + r""" + Returns the dimension of the final vector output by this `GRUEncoder`. This is not + the shape of the returned tensor, but the last element of that shape. + """ + if self._direction == "bidirect": + return self._hidden_size * 2 + else: + return self._hidden_size + + def forward(self, inputs, sequence_length): + r""" + GRUEncoder takes the a sequence of vectors and returns a single vector, + which is a combination of multiple GRU layers. The input to this + encoder is of shape `(batch_size, num_tokens, input_size)`, + The output is of shape `(batch_size, hidden_size * 2)` if GRU is bidirection; + If not, output is of shape `(batch_size, hidden_size)`. + + Args: + inputs (Tensor): Shape as `(batch_size, num_tokens, input_size)`. + Tensor containing the features of the input sequence. + sequence_length (Tensor): Shape as `(batch_size)`. + The sequence length of the input sequence. 
+ + Returns: + Tensor: Returns tensor `output`, the hidden state at the last time step for every layer. + Its data type is `float` and its shape is `[batch_size, hidden_size]`. + + """ + encoded_text, last_hidden = self.gru_layer(inputs, sequence_length=sequence_length) + if not self._pooling_type: + # We exploit the `last_hidden` (the hidden state at the last time step for every layer) + # to create a single vector. + # If gru is not bidirection, then output is the hidden state of the last time step + # at last layer. Output is shape of `(batch_size, hidden_size)`. + # If gru is bidirection, then output is concatenation of the forward and backward hidden state + # of the last time step at last layer. Output is shape of `(batch_size, hidden_size * 2)`. + if self._direction != "bidirect": + output = last_hidden[-1, :, :] + else: + output = paddle.concat((last_hidden[-2, :, :], last_hidden[-1, :, :]), axis=1) + else: + # We exploit the `encoded_text` (the hidden state at the every time step for last layer) + # to create a single vector. We perform pooling on the encoded text. + # The output shape is `(batch_size, hidden_size * 2)` if use bidirectional GRU, + # otherwise the output shape is `(batch_size, hidden_size * 2)`. + if self._pooling_type == "sum": + output = paddle.sum(encoded_text, axis=1) + elif self._pooling_type == "max": + output = paddle.max(encoded_text, axis=1) + elif self._pooling_type == "mean": + output = paddle.mean(encoded_text, axis=1) + else: + raise RuntimeError( + "Unexpected pooling type %s ." + "Pooling type must be one of sum, max and mean." % self._pooling_type + ) + return output + + +class LSTMEncoder(nn.Layer): + r""" + An LSTMEncoder takes as input a sequence of vectors and returns a + single vector, which is a combination of multiple `paddle.nn.LSTM + `__ subclass. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`. + The output is of shape `(batch_size, hidden_size * 2)` if LSTM is bidirection; + If not, output is of shape `(batch_size, hidden_size)`. + + Paddle's LSTM have two outputs: the hidden state for every time step at last layer, + and the hidden state and cell at the last time step for every layer. + If `pooling_type` is not None, we perform the pooling on the hidden state of every time + step at last layer to create a single vector. If None, we use the hidden state + of the last time step at last layer as a single output (shape of `(batch_size, hidden_size)`); + And if direction is bidirection, the we concat the hidden state of the last forward + lstm and backward lstm layer to create a single vector (shape of `(batch_size, hidden_size * 2)`). + + Args: + input_size (int): + The number of expected features in the input (the last dimension). + hidden_size (int): + The number of features in the hidden state. + num_layers (int, optional): + Number of recurrent layers. + E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, + with the second LSTM taking in outputs of the first LSTM and computing the final results. + Defaults to 1. + direction (str, optional): + The direction of the network. It can be "forward" or "bidirect" (it means bidirection network). + If "bidirect", it is a bidirectional LSTM, and returns the concat output from both directions. + Defaults to "forward". + dropout (float, optional): + If non-zero, introduces a Dropout layer on the outputs of each LSTM layer + except the last layer, with dropout probability equal to dropout. + Defaults to 0.0 . 
+ pooling_type (str, optional): + If `pooling_type` is None, then the LSTMEncoder will return + the hidden state of the last time step at last layer as a single vector. + If pooling_type is not None, it must be one of "sum", "max" and "mean". + Then it will be pooled on the LSTM output (the hidden state of every + time step at last layer) to create a single vector. + Defaults to `None`. + + Example: + .. code-block:: + + import paddle + import paddle.nn as nn + import paddlenlp as nlp + + class LSTMModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + lstm_hidden_size=198, + direction='forward', + lstm_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=emb_dim, + padding_idx=padding_idx) + self.lstm_encoder = nlp.seq2vec.LSTMEncoder( + emb_dim, + lstm_hidden_size, + num_layers=lstm_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type) + self.fc = nn.Linear(self.lstm_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + # Shape: (batch_size, num_tokens, num_directions*lstm_hidden_size) + # num_directions = 2 if direction is 'bidirect' + # if not, num_directions = 1 + text_repr = self.lstm_encoder(embedded_text, sequence_length=seq_len) + # Shape: (batch_size, fc_hidden_size) + fc_out = paddle.tanh(self.fc(text_repr)) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc_out) + return logits + + model = LSTMModel(vocab_size=100, num_classes=2) + + text = paddle.randint(low=1, high=10, shape=[1,10], dtype='int32') + seq_len = paddle.to_tensor([10]) + logits = model(text, seq_len) + """ + + def __init__( + self, input_size, hidden_size, num_layers=1, direction="forward", dropout=0.0, pooling_type=None, **kwargs + ): + super().__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._direction = direction + self._pooling_type = pooling_type + + self.lstm_layer = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + direction=direction, + dropout=dropout, + **kwargs, + ) + + def get_input_dim(self): + r""" + Returns the dimension of the vector input for each element in the sequence input + to a `LSTMEncoder`. This is not the shape of the input tensor, but the + last element of that shape. + """ + return self._input_size + + def get_output_dim(self): + r""" + Returns the dimension of the final vector output by this `LSTMEncoder`. This is not + the shape of the returned tensor, but the last element of that shape. + """ + if self._direction == "bidirect": + return self._hidden_size * 2 + else: + return self._hidden_size + + def forward(self, inputs, sequence_length): + r""" + LSTMEncoder takes the a sequence of vectors and returns a + single vector, which is a combination of multiple LSTM layers. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`, + The output is of shape `(batch_size, hidden_size * 2)` if LSTM is bidirection; + If not, output is of shape `(batch_size, hidden_size)`. + + Args: + inputs (Tensor): Shape as `(batch_size, num_tokens, input_size)`. + Tensor containing the features of the input sequence. + sequence_length (Tensor): Shape as `(batch_size)`. + The sequence length of the input sequence. 
+ + Returns: + Tensor: Returns tensor `output`, the hidden state at the last time step for every layer. + Its data type is `float` and its shape is `[batch_size, hidden_size]`. + + """ + encoded_text, (last_hidden, last_cell) = self.lstm_layer(inputs, sequence_length=sequence_length) + if not self._pooling_type: + # We exploit the `last_hidden` (the hidden state at the last time step for every layer) + # to create a single vector. + # If lstm is not bidirection, then output is the hidden state of the last time step + # at last layer. Output is shape of `(batch_size, hidden_size)`. + # If lstm is bidirection, then output is concatenation of the forward and backward hidden state + # of the last time step at last layer. Output is shape of `(batch_size, hidden_size * 2)`. + if self._direction != "bidirect": + output = last_hidden[-1, :, :] + else: + output = paddle.concat((last_hidden[-2, :, :], last_hidden[-1, :, :]), axis=1) + else: + # We exploit the `encoded_text` (the hidden state at the every time step for last layer) + # to create a single vector. We perform pooling on the encoded text. + # The output shape is `(batch_size, hidden_size * 2)` if use bidirectional LSTM, + # otherwise the output shape is `(batch_size, hidden_size * 2)`. + if self._pooling_type == "sum": + output = paddle.sum(encoded_text, axis=1) + elif self._pooling_type == "max": + output = paddle.max(encoded_text, axis=1) + elif self._pooling_type == "mean": + output = paddle.mean(encoded_text, axis=1) + else: + raise RuntimeError( + "Unexpected pooling type %s ." + "Pooling type must be one of sum, max and mean." % self._pooling_type + ) + return output + + +class RNNEncoder(nn.Layer): + r""" + A RNNEncoder takes as input a sequence of vectors and returns a + single vector, which is a combination of multiple `paddle.nn.RNN + `__ subclass. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`, + The output is of shape `(batch_size, hidden_size * 2)` if RNN is bidirection; + If not, output is of shape `(batch_size, hidden_size)`. + + Paddle's RNN have two outputs: the hidden state for every time step at last layer, + and the hidden state at the last time step for every layer. + If `pooling_type` is not None, we perform the pooling on the hidden state of every time + step at last layer to create a single vector. If None, we use the hidden state + of the last time step at last layer as a single output (shape of `(batch_size, hidden_size)`); + And if direction is bidirection, the we concat the hidden state of the last forward + rnn and backward rnn layer to create a single vector (shape of `(batch_size, hidden_size * 2)`). + + Args: + input_size (int): + The number of expected features in the input (the last dimension). + hidden_size (int): + The number of features in the hidden state. + num_layers (int, optional): + Number of recurrent layers. + E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN, + with the second RNN taking in outputs of the first RNN and computing the final results. + Defaults to 1. + direction (str, optional): + The direction of the network. It can be "forward" and "bidirect" + (it means bidirection network). If "bidirect", it is a bidirectional RNN, + and returns the concat output from both directions. Defaults to "forward" + dropout (float, optional): + If non-zero, introduces a Dropout layer on the outputs of each RNN layer + except the last layer, with dropout probability equal to dropout. + Defaults to 0.0. 
+ pooling_type (str, optional): + If `pooling_type` is None, then the RNNEncoder will return the hidden state + of the last time step at last layer as a single vector. + If pooling_type is not None, it must be one of "sum", "max" and "mean". + Then it will be pooled on the RNN output (the hidden state of every time + step at last layer) to create a single vector. + Defaults to `None`. + + Example: + .. code-block:: + + import paddle + import paddle.nn as nn + import paddlenlp as nlp + + class RNNModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + rnn_hidden_size=198, + direction='forward', + rnn_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=emb_dim, + padding_idx=padding_idx) + self.rnn_encoder = nlp.seq2vec.RNNEncoder( + emb_dim, + rnn_hidden_size, + num_layers=rnn_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type) + self.fc = nn.Linear(self.rnn_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + # Shape: (batch_size, num_tokens, num_directions*rnn_hidden_size) + # num_directions = 2 if direction is 'bidirect' + # if not, num_directions = 1 + text_repr = self.rnn_encoder(embedded_text, sequence_length=seq_len) + # Shape: (batch_size, fc_hidden_size) + fc_out = paddle.tanh(self.fc(text_repr)) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc_out) + return logits + + model = RNNModel(vocab_size=100, num_classes=2) + + text = paddle.randint(low=1, high=10, shape=[1,10], dtype='int32') + seq_len = paddle.to_tensor([10]) + logits = model(text, seq_len) + """ + + def __init__( + self, input_size, hidden_size, num_layers=1, direction="forward", dropout=0.0, pooling_type=None, **kwargs + ): + super().__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._direction = direction + self._pooling_type = pooling_type + + self.rnn_layer = nn.SimpleRNN( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + direction=direction, + dropout=dropout, + **kwargs, + ) + + def get_input_dim(self): + r""" + Returns the dimension of the vector input for each element in the sequence input + to a `RNNEncoder`. This is not the shape of the input tensor, but the + last element of that shape. + """ + return self._input_size + + def get_output_dim(self): + r""" + Returns the dimension of the final vector output by this `RNNEncoder`. This is not + the shape of the returned tensor, but the last element of that shape. + """ + if self._direction == "bidirect": + return self._hidden_size * 2 + else: + return self._hidden_size + + def forward(self, inputs, sequence_length): + r""" + RNNEncoder takes the a sequence of vectors and returns a + single vector, which is a combination of multiple RNN layers. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`. + The output is of shape `(batch_size, hidden_size * 2)` if RNN is bidirection; + If not, output is of shape `(batch_size, hidden_size)`. + + Args: + inputs (Tensor): Shape as `(batch_size, num_tokens, input_size)`. + Tensor containing the features of the input sequence. + sequence_length (Tensor): Shape as `(batch_size)`. + The sequence length of the input sequence. 
+ + Returns: + Tensor: Returns tensor `output`, the hidden state at the last time step for every layer. + Its data type is `float` and its shape is `[batch_size, hidden_size]`. + + """ + encoded_text, last_hidden = self.rnn_layer(inputs, sequence_length=sequence_length) + if not self._pooling_type: + # We exploit the `last_hidden` (the hidden state at the last time step for every layer) + # to create a single vector. + # If rnn is not bidirection, then output is the hidden state of the last time step + # at last layer. Output is shape of `(batch_size, hidden_size)`. + # If rnn is bidirection, then output is concatenation of the forward and backward hidden state + # of the last time step at last layer. Output is shape of `(batch_size, hidden_size * 2)`. + if self._direction != "bidirect": + output = last_hidden[-1, :, :] + else: + output = paddle.concat((last_hidden[-2, :, :], last_hidden[-1, :, :]), axis=1) + else: + # We exploit the `encoded_text` (the hidden state at the every time step for last layer) + # to create a single vector. We perform pooling on the encoded text. + # The output shape is `(batch_size, hidden_size * 2)` if use bidirectional RNN, + # otherwise the output shape is `(batch_size, hidden_size * 2)`. + if self._pooling_type == "sum": + output = paddle.sum(encoded_text, axis=1) + elif self._pooling_type == "max": + output = paddle.max(encoded_text, axis=1) + elif self._pooling_type == "mean": + output = paddle.mean(encoded_text, axis=1) + else: + raise RuntimeError( + "Unexpected pooling type %s ." + "Pooling type must be one of sum, max and mean." % self._pooling_type + ) + return output + + +class Chomp1d(nn.Layer): + """ + Remove the elements on the right. + + Args: + chomp_size (int): The number of elements removed. + """ + + def __init__(self, chomp_size): + super(Chomp1d, self).__init__() + self.chomp_size = chomp_size + + def forward(self, x): + return x[:, :, : -self.chomp_size] + + +class TemporalBlock(nn.Layer): + """ + The TCN block, consists of dilated causal conv, relu and residual block. + See the Figure 1(b) in https://arxiv.org/pdf/1803.01271.pdf for more details. + + Args: + n_inputs ([int]): The number of channels in the input tensor. + n_outputs ([int]): The number of filters. + kernel_size ([int]): The filter size. + stride ([int]): The stride size. + dilation ([int]): The dilation size. + padding ([int]): The size of zeros to be padded. + dropout (float, optional): Probability of dropout the units. Defaults to 0.2. + """ + + def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2): + + super(TemporalBlock, self).__init__() + self.conv1 = weight_norm( + nn.Conv1D(n_inputs, n_outputs, kernel_size, stride=stride, padding=padding, dilation=dilation) + ) + # Chomp1d is used to make sure the network is causal. + # We pad by (k-1)*d on the two sides of the input for convolution, + # and then use Chomp1d to remove the (k-1)*d output elements on the right. 
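+        # Illustrative numbers (assumed, not taken from any config): with kernel_size=3 and
+        # dilation=2, padding = (3 - 1) * 2 = 4, so four zeros are padded on each side and
+        # Chomp1d(4) trims the four trailing time steps, which keeps the convolution causal.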
+ self.chomp1 = Chomp1d(padding) + self.relu1 = nn.ReLU() + self.dropout1 = nn.Dropout(dropout) + + self.conv2 = weight_norm( + nn.Conv1D(n_outputs, n_outputs, kernel_size, stride=stride, padding=padding, dilation=dilation) + ) + self.chomp2 = Chomp1d(padding) + self.relu2 = nn.ReLU() + self.dropout2 = nn.Dropout(dropout) + + self.net = nn.Sequential( + self.conv1, self.chomp1, self.relu1, self.dropout1, self.conv2, self.chomp2, self.relu2, self.dropout2 + ) + self.downsample = nn.Conv1D(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None + self.relu = nn.ReLU() + self.init_weights() + + def init_weights(self): + self.conv1.weight.set_value(paddle.tensor.normal(0.0, 0.01, self.conv1.weight.shape)) + self.conv2.weight.set_value(paddle.tensor.normal(0.0, 0.01, self.conv2.weight.shape)) + if self.downsample is not None: + self.downsample.weight.set_value(paddle.tensor.normal(0.0, 0.01, self.downsample.weight.shape)) + + def forward(self, x): + out = self.net(x) + res = x if self.downsample is None else self.downsample(x) + return self.relu(out + res) + + +class TCNEncoder(nn.Layer): + r""" + A `TCNEncoder` takes as input a sequence of vectors and returns a + single vector, which is the last one time step in the feature map. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`, + and the output is of shape `(batch_size, num_channels[-1])` with a receptive + filed: + + .. math:: + + receptive filed = 2 * \sum_{i=0}^{len(num\_channels)-1}2^i(kernel\_size-1). + + Temporal Convolutional Networks is a simple convolutional architecture. It outperforms canonical recurrent networks + such as LSTMs in many tasks. See https://arxiv.org/pdf/1803.01271.pdf for more details. + + Args: + input_size (int): The number of expected features in the input (the last dimension). + num_channels (list): The number of channels in different layer. + kernel_size (int): The kernel size. Defaults to 2. + dropout (float): The dropout probability. Defaults to 0.2. + """ + + def __init__(self, input_size, num_channels, kernel_size=2, dropout=0.2): + super(TCNEncoder, self).__init__() + self._input_size = input_size + self._output_dim = num_channels[-1] + + layers = nn.LayerList() + num_levels = len(num_channels) + for i in range(num_levels): + dilation_size = 2**i + in_channels = input_size if i == 0 else num_channels[i - 1] + out_channels = num_channels[i] + layers.append( + TemporalBlock( + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=dilation_size, + padding=(kernel_size - 1) * dilation_size, + dropout=dropout, + ) + ) + + self.network = nn.Sequential(*layers) + + def get_input_dim(self): + """ + Returns the dimension of the vector input for each element in the sequence input + to a `TCNEncoder`. This is not the shape of the input tensor, but the + last element of that shape. + """ + return self._input_size + + def get_output_dim(self): + """ + Returns the dimension of the final vector output by this `TCNEncoder`. This is not + the shape of the returned tensor, but the last element of that shape. + """ + return self._output_dim + + def forward(self, inputs): + r""" + TCNEncoder takes as input a sequence of vectors and returns a + single vector, which is the last one time step in the feature map. + The input to this encoder is of shape `(batch_size, num_tokens, input_size)`, + and the output is of shape `(batch_size, num_channels[-1])` with a receptive + filed: + + .. math:: + + receptive filed = 2 * \sum_{i=0}^{len(num\_channels)-1}2^i(kernel\_size-1). 
+ + Args: + inputs (Tensor): The input tensor with shape `[batch_size, num_tokens, input_size]`. + + Returns: + Tensor: Returns tensor `output` with shape `[batch_size, num_channels[-1]]`. + """ + inputs_t = inputs.transpose([0, 2, 1]) + output = self.network(inputs_t).transpose([2, 0, 1])[-1] + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/__init__.py new file mode 100644 index 000000000..25faacc17 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .server import SimpleServer +from .handlers import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/base_router.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/base_router.py new file mode 100644 index 000000000..83f884e56 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/base_router.py @@ -0,0 +1,32 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc + + +class BaseRouterManager(abc.ABC): + _app = None + + def __init__(self, app): + super().__init__() + self._app = app + + @abc.abstractmethod + def register_models_router(self): + return NotImplemented + + @abc.abstractmethod + def register_taskflow_router(self): + return NotImplemented diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/__init__.py new file mode 100644 index 000000000..397234ceb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .base_handler import BaseModelHandler, BasePostHandler, BaseTaskflowHandler +from .cls_post_handler import ( + MultiClassificationPostHandler, + MultiLabelClassificationPostHandler, +) +from .custom_model_handler import CustomModelHandler, ERNIEMHandler +from .qa_model_handler import QAModelHandler +from .taskflow_handler import TaskflowHandler +from .token_model_handler import TokenClsModelHandler diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/base_handler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/base_handler.py new file mode 100644 index 000000000..5206f28cb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/base_handler.py @@ -0,0 +1,46 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABCMeta, abstractmethod + + +class BaseModelHandler(metaclass=ABCMeta): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def process(cls, predictor, tokenizer, data, parameters): + pass + + +class BasePostHandler(metaclass=ABCMeta): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def process(cls, data, parameters): + pass + + +class BaseTaskflowHandler(metaclass=ABCMeta): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def process(cls, data, parameters): + pass diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/cls_post_handler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/cls_post_handler.py new file mode 100644 index 000000000..cd019f8b3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/cls_post_handler.py @@ -0,0 +1,71 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from .base_handler import BasePostHandler + + +class MultiClassificationPostHandler(BasePostHandler): + def __init__(self): + super().__init__() + + @classmethod + def process(cls, data, parameters): + if "logits" not in data: + raise ValueError( + "The output of model handler do not include the 'logits', " + " please check the model handler output. 
The model handler output:\n{}".format(data)
+            )
+
+        logits = data["logits"]
+        logits = np.array(logits)
+        max_value = np.max(logits, axis=1, keepdims=True)
+        exp_data = np.exp(logits - max_value)
+        probs = exp_data / np.sum(exp_data, axis=1, keepdims=True)
+        out_dict = {"label": logits.argmax(axis=-1).tolist(), "confidence": probs.max(axis=-1).tolist()}
+        return out_dict
+
+
+class MultiLabelClassificationPostHandler(BasePostHandler):
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def process(cls, data, parameters):
+        if "logits" not in data:
+            raise ValueError(
+                "The output of model handler do not include the 'logits', "
+                " please check the model handler output. The model handler output:\n{}".format(data)
+            )
+
+        prob_limit = 0.5
+        if "prob_limit" in parameters:
+            prob_limit = parameters["prob_limit"]
+        logits = data["logits"]
+        logits = np.array(logits)
+        logits = 1 / (1.0 + np.exp(-logits))
+        labels = []
+        probs = []
+        for logit in logits:
+            label = []
+            prob = []
+            for i, p in enumerate(logit):
+                if p > prob_limit:
+                    label.append(i)
+                    prob.append(p)
+            labels.append(label)
+            probs.append(prob)
+        out_dict = {"label": labels, "confidence": probs}
+        return out_dict
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/custom_model_handler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/custom_model_handler.py
new file mode 100644
index 000000000..c5a133b8e
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/custom_model_handler.py
@@ -0,0 +1,156 @@
+# coding:utf-8
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from ...data import Pad, Tuple
+from .base_handler import BaseModelHandler
+
+
+class CustomModelHandler(BaseModelHandler):
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def process(cls, predictor, tokenizer, data, parameters):
+        max_seq_len = 128
+        batch_size = 1
+        if "max_seq_len" in parameters:
+            max_seq_len = parameters["max_seq_len"]
+        if "batch_size" in parameters:
+            batch_size = parameters["batch_size"]
+        text = None
+        if "text" in data:
+            text = data["text"]
+        if text is None:
+            return {}
+        if isinstance(text, str):
+            text = [text]
+        has_pair = False
+        if "text_pair" in data and data["text_pair"] is not None:
+            text_pair = data["text_pair"]
+            if isinstance(text_pair, str):
+                text_pair = [text_pair]
+            if len(text) != len(text_pair):
+                raise ValueError("The length of text and text_pair must be same.")
+            has_pair = True
+
+        # Get the result of tokenizer
+        examples = []
+        for idx in range(len(text)):
+            if has_pair:
+                result = tokenizer(text=text[idx], text_pair=text_pair[idx], max_length=max_seq_len)
+            else:
+                result = tokenizer(text=text[idx], max_length=max_seq_len)
+            examples.append((result["input_ids"], result["token_type_ids"]))
+
+        # Separates data into some batches.
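+        # For example (hypothetical sizes), 5 tokenized examples with batch_size=2 yield
+        # batches of length 2, 2 and 1.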
+        batches = [examples[i : i + batch_size] for i in range(0, len(examples), batch_size)]
+
+        def batchify_fn(samples):
+            return Tuple(
+                Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),
+                Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),
+            )(samples)
+
+        results = [[] for i in range(0, predictor._output_num)]
+        for batch in batches:
+            input_ids, token_type_ids = batchify_fn(batch)
+            if predictor._predictor_type == "paddle_inference":
+                predictor._input_handles[0].copy_from_cpu(input_ids)
+                predictor._input_handles[1].copy_from_cpu(token_type_ids)
+                predictor._predictor.run()
+                output = [output_handle.copy_to_cpu() for output_handle in predictor._output_handles]
+                for i, out in enumerate(output):
+                    results[i].append(out)
+            else:
+                output = predictor._predictor.run(None, {"input_ids": input_ids, "token_type_ids": token_type_ids})
+                for i, out in enumerate(output):
+                    results[i].append(out)
+
+        # Resolve the logits result and get the predict label and confidence
+        results_concat = []
+        for i in range(0, len(results)):
+            results_concat.append(np.concatenate(results[i], axis=0))
+        out_dict = {"logits": results_concat[0].tolist(), "data": data}
+        for i in range(1, len(results_concat)):
+            out_dict[f"logits_{i}"] = results_concat[i].tolist()
+        return out_dict
+
+
+class ERNIEMHandler(BaseModelHandler):
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def process(cls, predictor, tokenizer, data, parameters):
+        max_seq_len = 128
+        batch_size = 1
+        if "max_seq_len" in parameters:
+            max_seq_len = parameters["max_seq_len"]
+        if "batch_size" in parameters:
+            batch_size = parameters["batch_size"]
+        text = None
+        if "text" in data:
+            text = data["text"]
+        if text is None:
+            return {}
+        if isinstance(text, str):
+            text = [text]
+        has_pair = False
+        if "text_pair" in data and data["text_pair"] is not None:
+            text_pair = data["text_pair"]
+            if isinstance(text_pair, str):
+                text_pair = [text_pair]
+            if len(text) != len(text_pair):
+                raise ValueError("The length of text and text_pair must be same.")
+            has_pair = True
+
+        # Get the result of tokenizer
+        examples = []
+        for idx in range(len(text)):
+            if has_pair:
+                result = tokenizer(text=text[idx], text_pair=text_pair[idx], max_length=max_seq_len)
+            else:
+                result = tokenizer(text=text[idx], max_length=max_seq_len)
+            examples.append(result["input_ids"])
+
+        # Separates data into some batches.
+        batches = [examples[i : i + batch_size] for i in range(0, len(examples), batch_size)]
+
+        def batchify_fn(samples):
+            return Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64")(samples)
+
+        results = [[] for i in range(0, predictor._output_num)]
+        for batch in batches:
+            input_ids = batchify_fn(batch)
+            if predictor._predictor_type == "paddle_inference":
+                predictor._input_handles[0].copy_from_cpu(input_ids)
+                predictor._predictor.run()
+                output = [output_handle.copy_to_cpu() for output_handle in predictor._output_handles]
+                for i, out in enumerate(output):
+                    results[i].append(out)
+            else:
+                output = predictor._predictor.run(None, {"input_ids": input_ids})
+                for i, out in enumerate(output):
+                    results[i].append(out)
+
+        # Resolve the logits result and get the predict label and confidence
+        results_concat = []
+        for i in range(0, len(results)):
+            results_concat.append(np.concatenate(results[i], axis=0))
+        out_dict = {"logits": results_concat[0].tolist(), "data": data}
+        for i in range(1, len(results_concat)):
+            out_dict[f"logits_{i}"] = results_concat[i].tolist()
+        return out_dict
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/qa_model_handler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/qa_model_handler.py
new file mode 100644
index 000000000..e67232267
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/qa_model_handler.py
@@ -0,0 +1,89 @@
+# coding:utf-8
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from .base_handler import BaseModelHandler
+
+
+class QAModelHandler(BaseModelHandler):
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def process(cls, predictor, tokenizer, data, parameters):
+
+        max_seq_len = 128
+        doc_stride = 128
+        batch_size = 1
+        if "max_seq_len" in parameters:
+            max_seq_len = parameters["max_seq_len"]
+        if "batch_size" in parameters:
+            batch_size = parameters["batch_size"]
+        if "doc_stride" in parameters:
+            doc_stride = parameters["doc_stride"]
+        context = None
+        question = None
+
+        # Get the context in qa task
+        if "context" in data:
+            context = data["context"]
+        if context is None:
+            return {}
+        if isinstance(context, str):
+            context = [context]
+
+        # Get the question in qa task
+        if "question" in data:
+            question = data["question"]
+        if question is None:
+            return {}
+        if isinstance(question, str):
+            question = [question]
+
+        tokenizer_results = tokenizer(
+            question,
+            context,
+            stride=doc_stride,
+            max_length=max_seq_len,
+            return_offsets_mapping=True,
+            pad_to_max_seq_len=True,
+        )
+        input_ids = tokenizer_results["input_ids"]
+        token_type_ids = tokenizer_results["token_type_ids"]
+        # Separates data into some batches.
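+        # Unlike the other handlers, each batch here is a [start, end) index pair, e.g.
+        # (hypothetical sizes) 5 features with batch_size=2 give [[0, 2], [2, 4], [4, 6]];
+        # the overshooting end index is harmless because Python slicing clamps it.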
+        batches = [[i, i + batch_size] for i in range(0, len(input_ids), batch_size)]
+
+        results = [[] for i in range(0, predictor._output_num)]
+        for start, end in batches:
+            input_id = np.array(input_ids[start:end]).astype("int64")
+            token_type_id = np.array(token_type_ids[start:end]).astype("int64")
+            if predictor._predictor_type == "paddle_inference":
+                predictor._input_handles[0].copy_from_cpu(input_id)
+                predictor._input_handles[1].copy_from_cpu(token_type_id)
+
+                predictor._predictor.run()
+                output = [output_handle.copy_to_cpu() for output_handle in predictor._output_handles]
+                for i, out in enumerate(output):
+                    results[i].extend(out.tolist())
+            else:
+                output = predictor._predictor.run(None, {"input_ids": input_id, "token_type_ids": token_type_id})
+                for i, out in enumerate(output):
+                    results[i].extend(out.tolist())
+        data["offset_mapping"] = tokenizer_results["offset_mapping"]
+        out_dict = {"logits": results[0], "data": data}
+        for i in range(1, len(results)):
+            out_dict[f"logits_{i}"] = results[i]
+        return out_dict
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/taskflow_handler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/taskflow_handler.py
new file mode 100644
index 000000000..fce31ccc8
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/taskflow_handler.py
@@ -0,0 +1,34 @@
+# coding:utf-8
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .base_handler import BaseTaskflowHandler
+
+
+class TaskflowHandler(BaseTaskflowHandler):
+    def __init__(self):
+        self._name = "taskflow_handler"
+
+    @classmethod
+    def process(cls, predictor, data, parameters):
+        if data is None:
+            return {}
+        text = None
+        if "text" in data:
+            text = data["text"]
+        else:
+            return {}
+        if "schema" in parameters:
+            schema = parameters["schema"]
+            predictor.set_schema(schema)
+        return predictor(text)
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/token_model_handler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/token_model_handler.py
new file mode 100644
index 000000000..b7a3baf4e
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/handlers/token_model_handler.py
@@ -0,0 +1,114 @@
+# coding:utf-8
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np + +from ...data import Pad, Tuple +from .base_handler import BaseModelHandler + + +class TokenClsModelHandler(BaseModelHandler): + def __init__(self): + super().__init__() + + @classmethod + def process(cls, predictor, tokenizer, data, parameters): + max_seq_len = 128 + batch_size = 1 + return_attention_mask = False + is_split_into_words = False + if "max_seq_len" in parameters: + max_seq_len = parameters["max_seq_len"] + if "batch_size" in parameters: + batch_size = parameters["batch_size"] + if "return_attention_mask" in parameters: + return_attention_mask = parameters["return_attention_mask"] + if "is_split_into_words" in parameters: + is_split_into_words = parameters["is_split_into_words"] + text = None + if "text" in data: + text = data["text"] + if text is None: + return {} + if isinstance(text, str): + text = [text] + has_pair = False + if "text_pair" in data and data["text_pair"] is not None: + text_pair = data["text_pair"] + if isinstance(text_pair, str): + text_pair = [text_pair] + if len(text) != len(text_pair): + raise ValueError("The length of text and text_pair must be same.") + has_pair = True + + # Get the result of tokenizer + pad = True + if len(text) == 1: + pad = False + examples = [] + if has_pair: + tokenizer_result = tokenizer( + text=text, + text_pair=text_pair, + max_length=max_seq_len, + truncation=True, + return_attention_mask=return_attention_mask, + is_split_into_words=is_split_into_words, + padding=pad, + ) + else: + tokenizer_result = tokenizer( + text=text, + max_length=max_seq_len, + truncation=True, + return_attention_mask=return_attention_mask, + is_split_into_words=is_split_into_words, + padding=pad, + ) + + examples = [] + for input_ids, token_type_ids in zip(tokenizer_result["input_ids"], tokenizer_result["token_type_ids"]): + examples.append((input_ids, token_type_ids)) + # Separates data into some batches. 
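+        # Each batch is later padded by batchify_fn to the longest sequence in that batch,
+        # using the tokenizer's pad ids for input_ids and token_type_ids respectively.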
+ batches = [examples[i : i + batch_size] for i in range(0, len(examples), batch_size)] + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # segment + ): fn(samples) + results = [[] for i in range(0, predictor._output_num)] + for batch in batches: + input_ids, token_type_ids = batchify_fn(batch) + if predictor._predictor_type == "paddle_inference": + predictor._input_handles[0].copy_from_cpu(input_ids) + predictor._input_handles[1].copy_from_cpu(token_type_ids) + + predictor._predictor.run() + output = [output_handle.copy_to_cpu() for output_handle in predictor._output_handles] + for i, out in enumerate(output): + results[i].append(out) + else: + output = predictor._predictor.run(None, {"input_ids": input_ids, "token_type_ids": token_type_ids}) + for i, out in enumerate(output): + results[i].append(out) + + results_concat = [] + for i in range(0, len(results)): + results_concat.append(np.concatenate(results[i], axis=0)) + out_dict = {"logits": results_concat[0].tolist(), "data": data} + for i in range(1, len(results_concat)): + out_dict[f"logits_{i}"] = results_concat[i].tolist() + if return_attention_mask: + out_dict["attention_mask"] = tokenizer_result["attention_mask"] + return out_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/__init__.py new file mode 100644 index 000000000..24ce449a2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .router import HttpRouterManager diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/router.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/router.py new file mode 100644 index 000000000..0b5668ded --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/http_router/router.py @@ -0,0 +1,119 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
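+# Illustrative sketch: each route generated below accepts a JSON body of the form
+# {"data": ..., "parameters": {...}} and replies with {"result": ...}. A hypothetical client
+# call (the task name, host and port are deployment-specific assumptions):
+#
+#     import requests
+#
+#     resp = requests.post(
+#         "http://localhost:8189/cls_task",
+#         json={"data": {"text": ["sample sentence"]}, "parameters": {"max_seq_len": 128}},
+#     )
+#     print(resp.json()["result"])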
+import hashlib +import typing +from typing import Optional + +from fastapi import APIRouter, Request +from pydantic import BaseModel, Extra, create_model + +from ...utils.log import logger +from ..base_router import BaseRouterManager + + +class ResponseBase(BaseModel): + text: Optional[str] = None + + +class RequestBase(BaseModel, extra=Extra.forbid): + parameters: Optional[dict] = {} + + +class HttpRouterManager(BaseRouterManager): + def register_models_router(self, task_name): + + # Url path to register the model + paths = [f"/{task_name}"] + for path in paths: + logger.info(" Transformer model request [path]={} is genereated.".format(path)) + + # Unique name to create the pydantic model + unique_name = hashlib.md5(task_name.encode()).hexdigest() + + # Create request model + req_model = create_model( + "RequestModel" + unique_name, + data=(typing.Any, ...), + __base__=RequestBase, + ) + + # Create response model + resp_model = create_model( + "ResponseModel" + unique_name, + result=(typing.Any, ...), + __base__=ResponseBase, + ) + + # Template predict endpoint function to dynamically serve different models + def predict(request: Request, inference_request: req_model): + result = self._app._model_manager.predict(inference_request.data, inference_request.parameters) + return {"result": result} + + # Register the route and add to the app + router = APIRouter() + for path in paths: + router.add_api_route( + path, + predict, + methods=["post"], + summary=f"{task_name.title()}", + response_model=resp_model, + response_model_exclude_unset=True, + response_model_exclude_none=True, + ) + self._app.include_router(router) + + def register_taskflow_router(self, task_name): + + # Url path to register the model + paths = [f"/{task_name}"] + for path in paths: + logger.info(" Taskflow request [path]={} is genereated.".format(path)) + + # Unique name to create the pydantic model + unique_name = hashlib.md5(task_name.encode()).hexdigest() + + # Create request model + req_model = create_model( + "RequestModel" + unique_name, + data=(typing.Any, ...), + __base__=RequestBase, + ) + + # Create response model + resp_model = create_model( + "ResponseModel" + unique_name, + result=(typing.Any, ...), + __base__=ResponseBase, + ) + + # Template predict endpoint function to dynamically serve different models + def predict(request: Request, inference_request: req_model): + result = self._app._taskflow_manager.predict(inference_request.data, inference_request.parameters) + return {"result": result} + + # Register the route and add to the app + router = APIRouter() + for path in paths: + router.add_api_route( + path, + predict, + methods=["post"], + summary=f"{task_name.title()}", + response_model=resp_model, + response_model_exclude_unset=True, + response_model_exclude_none=True, + ) + self._app.include_router(router) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/model_manager.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/model_manager.py new file mode 100644 index 000000000..8739d55ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/model_manager.py @@ -0,0 +1,96 @@ +# coding:utf-8 +# copyright (c) 2022 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. 
+# you may obtain a copy of the license at
+#
+# http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import time
+
+from ..transformers import AutoTokenizer
+from ..utils.log import logger
+from ..utils.tools import get_env_device
+from .handlers import BaseModelHandler, BasePostHandler
+from .predictor import Predictor
+from .utils import lock_predictor
+
+
+class ModelManager:
+    def __init__(self, task_name, model_path, tokenizer_name, model_handler, post_handler, precision, device_id):
+        self._task_name = task_name
+        self._model_path = model_path
+        self._tokenizer_name = tokenizer_name
+        self._model_handler = model_handler
+        self._post_handler = post_handler
+        self._precision = precision
+        self._device_id = device_id
+        self._tokenizer = None
+        self._register()
+
+    def _register(self):
+        # Get the model handler
+        if not issubclass(self._model_handler, BaseModelHandler):
+            raise TypeError(
+                "The model_handler must be a subclass of paddlenlp.server.handlers.BaseModelHandler, please check the type."
+            )
+        self._model_handler = self._model_handler.process
+
+        if not issubclass(self._post_handler, BasePostHandler):
+            raise TypeError(
+                "The post_handler must be a subclass of paddlenlp.server.handlers.BasePostHandler, please check the type."
+            )
+        self._post_handler = self._post_handler.process
+
+        # Create the model predictor
+        device = get_env_device()
+        predictor_list = []
+        if device == "cpu" or self._device_id == -1:
+            predictor = Predictor(self._model_path, self._precision, "cpu")
+            predictor_list.append(predictor)
+        elif isinstance(self._device_id, int):
+            predictor = Predictor(self._model_path, self._precision, "gpu:" + str(self._device_id))
+            predictor_list.append(predictor)
+        elif isinstance(self._device_id, list):
+            for device_id in self._device_id:
+                predictor = Predictor(self._model_path, self._precision, "gpu:" + str(device_id))
+                predictor_list.append(predictor)
+        self._predictor_list = predictor_list
+
+        # Get the tokenizer of the model
+        self._get_tokenizer()
+
+    def _get_tokenizer(self):
+        if self._tokenizer_name is not None:
+            if isinstance(self._tokenizer_name, str):
+                self._tokenizer = AutoTokenizer.from_pretrained(self._tokenizer_name)
+            else:
+                logger.error("The argument of `tokenizer_name` must be the name of a tokenizer.")
+        assert self._tokenizer is not None, "The tokenizer is not registered; please provide a valid tokenizer name."
+
+    def _get_predict_id(self):
+        t = time.time()
+        t = int(round(t * 1000))
+        predictor_id = t % len(self._predictor_list)
+        logger.info("The predictor id: {} is selected by running the model.".format(predictor_id))
+        return predictor_id
+
+    def predict(self, data, parameters):
+        predictor_id = self._get_predict_id()
+        with lock_predictor(self._predictor_list[predictor_id]._lock):
+            model_output = self._model_handler(self._predictor_list[predictor_id], self._tokenizer, data, parameters)
+            final_output = self._post_handler(model_output, parameters)
+            return final_output
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/predictor.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/predictor.py
new file mode 100644
index 000000000..45d803e4b
--- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/predictor.py @@ -0,0 +1,202 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os +import sys +import threading +from multiprocessing import cpu_count + +import paddle + +from ..utils.log import logger + + +class Predictor: + def __init__(self, model_path, precision, device): + self._model_path = model_path + self._default_static_model_path = "auto_static" + self._precision = precision + self._cpu_thread = 8 + self._config = None + self._device = device + self._num_threads = math.ceil(cpu_count() / 2) + self._output_num = 1 + paddle.set_device(device) + self._create_predictor() + self._lock = threading.Lock() + + def _get_default_static_model_path(self): + # The model path had the static_model_path + static_model_path = os.path.join(self._model_path, self._default_static_model_path, "inference.pdmodel") + if os.path.exists(static_model_path): + return os.path.join(self._model_path, self._default_static_model_path, "inference") + for file_name in os.listdir(self._model_path): + # FIXME(wawltor) The path maybe not correct + if file_name.count(".pdmodel"): + return os.path.join(self._model_path, file_name[:-8]) + return None + + def _is_int8_model(self, model_path): + paddle.set_device("cpu") + model = paddle.jit.load(model_path) + program = model.program() + for block in program.blocks: + for i, op in enumerate(block.ops): + if op.type.count("quantize"): + paddle.set_device(self._device) + return True + paddle.set_device(self._device) + return False + + def _create_predictor(self): + # Get the model parameter path and model config path + static_model_path = self._get_default_static_model_path() + + # Convert the Draph Model to Static Model + if static_model_path is None: + raise RuntimeError("The model path do not include the inference model, please check!") + is_int8_model = self._is_int8_model(static_model_path) + # Load the inference model and maybe we will convert the onnx model + # Judge the predictor type for the inference + if self._precision == "int8" and not is_int8_model: + self._precision = "fp32" + + if is_int8_model: + self._precision = "int8" + + self._predictor_type = self._check_predictor_type() + if self._predictor_type == "paddle_inference": + self._prepare_paddle_mode(static_model_path) + else: + self._prepare_onnx_mode(static_model_path) + + def _check_predictor_type(self): + predictor_type = "paddle_inference" + device = paddle.get_device() + if self._precision == "int8" or device == "xpu" or device == "cpu": + predictor_type = "paddle_inference" + else: + if device.count("gpu") and self._precision == "fp16": + try: + import onnx # noqa F401 + import onnxruntime as ort # noqa F401 + import paddle2onnx # noqa F401 + from onnxconverter_common import float16 # noqa F401 + + predictor_type = "onnxruntime" + except Exception: + logger.error( + "The inference precision is change to 'fp32', please 
install the dependencies that required for 'fp16' inference, you could use the commands as fololws:\n" + " ****** pip uninstall onnxruntime ******\n" + " ****** pip install onnxruntime-gpu onnx onnxconverter-common ******" + ) + sys.exit(-1) + return predictor_type + + def _prepare_paddle_mode(self, static_model_path): + """ + Construct the input data and predictor in the PaddlePaddele static mode. + """ + self._config = paddle.inference.Config(static_model_path + ".pdmodel", static_model_path + ".pdiparams") + self._config.disable_glog_info() + if paddle.get_device() == "cpu": + self._config.disable_gpu() + self._config.enable_mkldnn() + self._config.enable_memory_optim() + if self._precision == "int8": + self._config.enable_mkldnn_bfloat16() + elif self._precision == "fp16": + self._config.enable_mkldnn_int8() + else: + self._config.enable_use_gpu(100, int(self._device.split(":")[-1])) + if self._precision == "int8": + # FIXME(wawltor) The paddlenlp serving support the int8 model + logger.warning("The PaddleNLP serving do not support the INT8 model, we will support later!") + sys.exit(-1) + + self._config.switch_use_feed_fetch_ops(False) + self._config.set_cpu_math_library_num_threads(self._num_threads) + self._config.delete_pass("embedding_eltwise_layernorm_fuse_pass") + self._predictor = paddle.inference.create_predictor(self._config) + self._input_handles = [self._predictor.get_input_handle(name) for name in self._predictor.get_input_names()] + self._output_handles = [self._predictor.get_output_handle(name) for name in self._predictor.get_output_names()] + self._output_num = len(self._output_handles) + + def _prepare_onnx_mode(self, static_model_path): + import onnx + import onnxruntime as ort + import paddle2onnx + from onnxconverter_common import float16 + + onnx_dir = os.path.join(self._model_path, "onnx") + if not os.path.exists(onnx_dir): + os.mkdir(onnx_dir) + float_onnx_file = os.path.join(onnx_dir, "model.onnx") + if not os.path.exists(float_onnx_file): + model_path = static_model_path + ".pdmodel" + params_file = static_model_path + ".pdiparams" + onnx_model = paddle2onnx.command.c_paddle_to_onnx( + model_file=model_path, params_file=params_file, opset_version=13, enable_onnx_checker=True + ) + with open(float_onnx_file, "wb") as f: + f.write(onnx_model) + fp16_model_file = os.path.join(onnx_dir, "fp16_model.onnx") + if not os.path.exists(fp16_model_file): + onnx_model = onnx.load_model(float_onnx_file) + trans_model = float16.convert_float_to_float16(onnx_model, keep_io_types=True) + onnx.save_model(trans_model, fp16_model_file) + providers = ["CUDAExecutionProvider"] + sess_options = ort.SessionOptions() + sess_options.inter_op_num_threads = self._num_threads + device_id = int(self._device.split(":")[-1]) + self._predictor = ort.InferenceSession( + fp16_model_file, + sess_options=sess_options, + providers=providers, + provider_options=[{"device_id": device_id}], + ) + self._output_num = len(self._predictor.get_outputs()) + assert "CUDAExecutionProvider" in self._predictor.get_providers(), ( + "The environment for GPU inference is not set properly. " + "A possible cause is that you had installed both onnxruntime and onnxruntime-gpu. " + "Please run the following commands to reinstall: \n " + "1) pip uninstall -y onnxruntime onnxruntime-gpu \n 2) pip install onnxruntime-gpu" + ) + + def _convert_dygraph_to_static(self, model_instance, input_spec): + """ + Convert the dygraph model to static model. 
+ """ + assert ( + model_instance is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + input_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." + logger.info( + "Converting to the static inference model will cost a little time, please do not break this process." + ) + try: + static_model = paddle.jit.to_static(model_instance, input_spec=input_spec) + save_path = os.path.join(self._model_path, self._default_static_model_path, "inference") + paddle.jit.save(static_model, save_path) + logger.info("The static inference model save in the path:{}".format(save_path)) + except Exception: + logger.warning( + "Fail convert to inference model, please create the issue for the developers," + "the issue link: https://github.com/PaddlePaddle/PaddleNLP/issues" + ) + sys.exit(-1) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/server.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/server.py new file mode 100644 index 000000000..15e455858 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/server.py @@ -0,0 +1,83 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from fastapi import FastAPI +from .http_router import HttpRouterManager +from .model_manager import ModelManager +from .taskflow_manager import TaskflowManager +from ..taskflow import Taskflow + + +class SimpleServer(FastAPI): + def __init__(self, **kwargs): + """ + Initial function for the PaddleNLP SimpleServer. + """ + super().__init__(**kwargs) + self._router_manager = HttpRouterManager(self) + self._taskflow_manager = None + self._model_manager = None + self._service_name = "paddlenlp" + self._service_type = None + + def register( + self, task_name, model_path, tokenizer_name, model_handler, post_handler, precision="fp32", device_id=0 + ): + """ + The register function for the SimpleServer, the main register argrument as follows: + + Args: + name(str): The server name for the route. + model_path (str): + handler(str): + device (int|list|str, optional): + """ + self._server_type = "models" + model_manager = ModelManager( + task_name, model_path, tokenizer_name, model_handler, post_handler, precision, device_id + ) + self._model_manager = model_manager + # Register transformers model server router + self._router_manager.register_models_router(task_name) + + def register_taskflow(self, task_name, task, taskflow_handler=None): + """ + The register function for the SimpleServer, the main register argrument as follows: + + Args: + name(str): The server name for the route. 
+ model_or_path (str): + handler(str): + device (int|list|str, optional): + """ + self._server_type = "server" + check_flag = True + + # Check the task type, it must be the instance of Taskflow or List[Taskflow] + if isinstance(task, Taskflow): + task = [task] + for t in task: + if not isinstance(t, Taskflow): + check_flag = False + break + if not check_flag: + raise TypeError( + "Unsupport task type {}, it must be instance of Taskflow or List[Taskflow]".format(type(task)) + ) + + # Register Taskflow server router + taskflow_manager = TaskflowManager(task, taskflow_handler) + self._taskflow_manager = taskflow_manager + self._router_manager.register_taskflow_router(task_name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/taskflow_manager.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/taskflow_manager.py new file mode 100644 index 000000000..ba96c33ed --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/taskflow_manager.py @@ -0,0 +1,40 @@ +# coding:utf-8 +# copyright (c) 2022 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license" +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +import time +from .handlers import TaskflowHandler +from .utils import lock_predictor +from ..utils.log import logger + + +class TaskflowManager: + """ + The TaskflowManager could predict the raw text. + """ + + def __init__(self, task, taskflow_handler=None): + self._task = task + if taskflow_handler is None: + self._handler_func = TaskflowHandler.process + else: + self._handler_func = taskflow_handler.process + + def predict(self, data, parameters): + t = time.time() + t = int(round(t * 1000)) + task_index = t % len(self._task) + logger.info("The predictor id: {} is selected by running the taskflow.".format(task_index)) + with lock_predictor(self._task[task_index]._lock): + return self._handler_func(self._task[task_index], data, parameters) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/utils.py new file mode 100644 index 000000000..34d480560 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/server/utils.py @@ -0,0 +1,25 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
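+
+# Illustrative usage sketch: `lock_predictor` below serializes access to a shared predictor
+# across request threads. Typical use, as in ModelManager.predict and TaskflowManager.predict:
+#
+#     with lock_predictor(predictor._lock):   # `predictor` holds a threading.Lock in `_lock`
+#         output = model_handler(predictor, tokenizer, data, parameters)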
+ +import contextlib + + +@contextlib.contextmanager +def lock_predictor(lock): + lock.acquire() + try: + yield + finally: + lock.release() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/__init__.py new file mode 100644 index 000000000..d39fea274 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .taskflow import Taskflow diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/code_generation.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/code_generation.py new file mode 100644 index 000000000..af4f531a5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/code_generation.py @@ -0,0 +1,167 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import numpy as np +import paddle + +from ..data import Pad +from ..transformers import CodeGenForCausalLM, CodeGenTokenizer +from .task import Task + +usage = r""" + from paddlenlp import Taskflow + + codegen = Taskflow("code_generation") + codegen("def hello_world():") + ''' + ['\n print("Hello world")'] + ''' + """ + + +class CodeGenerationTask(Task): + """ + The text generation model to predict the code. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._batch_size = kwargs.get("batch_size", 1) + self._max_length = kwargs.get("max_length", 128) + self._min_length = kwargs.get("min_length", 0) + self._decode_strategy = kwargs.get("decode_strategy", "sampling") + self._temperature = kwargs.get("temperature", 0.6) + self._top_k = kwargs.get("top_k", 5) + self._top_p = kwargs.get("top_p", 1.0) + self._num_beams = kwargs.get("num_beams", 4) + self._length_penalty = kwargs.get("length_penalty", 1.0) + self._repetition_penalty = kwargs.get("repetition_penalty", 1.1) + self._output_scores = kwargs.get("output_scores", False) + self._use_faster = kwargs.get("use_faster", False) + self._construct_tokenizer(model) + self._construct_model(model) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + self._model = CodeGenForCausalLM.from_pretrained(model) + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = CodeGenTokenizer.from_pretrained(model) + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + padding = False if batch_size == 1 else True + pad_func = Pad(pad_val=self._model.pad_token_id, pad_right=False, dtype=np.int64) + + def _parse_batch(batch_examples): + if padding: + input_ids = pad_func([example for example in batch_examples]) + else: + input_ids = np.asarray([example for example in batch_examples], dtype=np.int64) + return input_ids + + examples = self._convert_text_to_input(data)["input_ids"] + + # Separates data into some batches. + one_batch = [] + for example in examples: + one_batch.append(example) + if len(one_batch) == batch_size: + yield _parse_batch(one_batch) + one_batch = [] + if one_batch: + yield _parse_batch(one_batch) + + def _convert_text_to_input(self, texts): + """ + Convert input strings to ids. + """ + return self._tokenizer(texts) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + batches = self._batchify(inputs, self._batch_size) + outputs = {} + outputs["batches"] = batches + outputs["text"] = inputs + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + all_ids = [] + all_scores = [] + + for batch in inputs["batches"]: + input_ids = paddle.to_tensor(batch) + ids, scores = self._model.generate( + input_ids=input_ids, + max_length=self._max_length, + min_length=self._min_length, + decode_strategy=self._decode_strategy, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p, + num_beams=self._num_beams, + length_penalty=self._length_penalty, + repetition_penalty=self._repetition_penalty, + use_fast=self._use_faster, + ) + all_ids.extend(ids.numpy().tolist()) + all_scores.extend(scores.numpy().tolist()) + inputs["ids"] = all_ids + inputs["scores"] = all_scores + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. 
+ """ + batch_out = [] + generated_ids = inputs["ids"] + for generated_id in generated_ids: + text = self._tokenizer.decode(generated_id, skip_special_tokens=True, spaces_between_special_tokens=False) + text = re.split("\nclass|\ndef|\n#|\n@|\nprint|\nif", text)[0].rstrip() + batch_out.append(text) + if self._output_scores: + return batch_out, inputs["scores"] + return batch_out + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dependency_parsing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dependency_parsing.py new file mode 100644 index 000000000..e72c470ad --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dependency_parsing.py @@ -0,0 +1,736 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os + +import numpy as np +import paddle + +from ..data import Pad, Vocab +from .models import BiAffineParser +from .task import Task +from .utils import download_file + +usage = r""" + from paddlenlp import Taskflow + + ddp = Taskflow("dependency_parsing") + ddp("三亚是一座美丽的城市") + ''' + [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}] + ''' + ddp(["三亚是一座美丽的城市", "他送了一本书"]) + ''' + [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}] + ''' + + ddp = Taskflow("dependency_parsing", prob=True, use_pos=True) + ddp("三亚是一座美丽的城市") + ''' + [{'word': ['三亚', '是', '一座', '美丽的城市'], 'head': [2, 0, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'VOB'], 'postag': ['LOC', 'v', 'm', 'n'], 'prob': [1.0, 1.0, 1.0, 1.0]}] + ''' + + ddp = Taskflow("dependency_parsing", model="ddparser-ernie-1.0") + ddp("三亚是一座美丽的城市") + ''' + [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}] + ''' + + ddp = Taskflow("dependency_parsing", model="ddparser-ernie-gram-zh") + ddp("三亚是一座美丽的城市") + ''' + [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}] + ''' + + # 已分词输入 + ddp = Taskflow("dependency_parsing", segmented=True) + ddp.from_segments([["三亚", "是", "一座", "美丽", "的", "城市"]]) + ''' + [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 'VOB']}] + ''' + ddp.from_segments([['三亚', '是', '一座', '美丽', '的', '城市'], ['他', '送', '了', '一本', '书']]) + ''' + [{'word': ['三亚', '是', '一座', '美丽', '的', '城市'], 'head': [2, 0, 6, 6, 4, 2], 'deprel': ['SBV', 'HED', 'ATT', 'ATT', 'MT', 
'VOB']}, {'word': ['他', '送', '了', '一本', '书'], 'head': [2, 0, 2, 5, 2], 'deprel': ['SBV', 'HED', 'MT', 'ATT', 'VOB']}] + ''' + """ + + +class DDParserTask(Task): + """ + DDParser task to analyze the dependency relationship between words in a sentence + Args: + task(string): The name of task. + model(string): The model name in the task. + tree(bool): Ensure the output conforms to the tree structure. + prob(bool): Whether to return the probability of predicted heads. + use_pos(bool): Whether to return the postag. + batch_size(int): Numbers of examples a batch. + return_visual(bool): If True, the result will contain the dependency visualization. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "word_vocab": "word_vocab.json", + "rel_vocab": "rel_vocab.json", + } + resource_files_urls = { + "ddparser": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser/model_state.pdparams", + "f388c91e85b5b4d0db40157a4ee28c08", + ], + "word_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser/word_vocab.json", + "594694033b149cbb724cac0975df07e4", + ], + "rel_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser/rel_vocab.json", + "0decf1363278705f885184ff8681f4cd", + ], + }, + "ddparser-ernie-1.0": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser-ernie-1.0/model_state.pdparams", + "78a4d5c2add642a88f6fdbee3574f617", + ], + "word_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser-ernie-1.0/word_vocab.json", + "17ed37b5b7ebb8475d4bff1ff8dac4b7", + ], + "rel_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser-ernie-1.0/rel_vocab.json", + "0decf1363278705f885184ff8681f4cd", + ], + }, + "ddparser-ernie-gram-zh": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser-ernie-gram-zh/model_state.pdparams", + "9d0a49026feb97fac22c8eec3e88f5c3", + ], + "word_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser-ernie-gram-zh/word_vocab.json", + "38120123d39876337975cc616901c8b9", + ], + "rel_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/ddparser-ernie-gram-zh/rel_vocab.json", + "0decf1363278705f885184ff8681f4cd", + ], + }, + "font_file": { + "font_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dependency_parsing/SourceHanSansCN-Regular.ttf", + "cecb7328bc0b9412b897fb3fc61edcdb", + ] + }, + } + + def __init__( + self, + task, + model, + tree=True, + prob=False, + use_pos=False, + use_cuda=False, + batch_size=1, + return_visual=False, + **kwargs + ): + super().__init__(task=task, model=model, **kwargs) + self._usage = usage + self.model = model + + if self.model == "ddparser": + self.encoding_model = "lstm-pe" + elif self.model == "ddparser-ernie-1.0": + self.encoding_model = "ernie-1.0" + elif self.model == "ddparser-ernie-gram-zh": + self.encoding_model = "ernie-gram-zh" + else: + raise ValueError( + "The encoding model should be one of \ + ddparser, ddparser-ernie-1.0 and ddparser-ernie-gram-zh" + ) + self._check_task_files() + self._construct_vocabs() + self.font_file_path = download_file( + self._task_path, + "SourceHanSansCN-Regular.ttf", + self.resource_files_urls["font_file"]["font_file"][0], + self.resource_files_urls["font_file"]["font_file"][1], + ) + self.tree = tree + self.prob = prob + 
self.use_pos = use_pos + self.batch_size = batch_size + self.return_visual = return_visual + + try: + from LAC import LAC + except Exception: + raise ImportError("Please install the dependencies first, pip install LAC --upgrade") + + self.use_cuda = use_cuda + self.lac = LAC(mode="lac" if self.use_pos else "seg", use_cuda=self.use_cuda) + self._get_inference_model() + + def _check_segmented_words(self, inputs): + inputs = inputs[0] + if not all([isinstance(i, list) and i and all(i) for i in inputs]): + raise TypeError("Invalid input format.") + return inputs + + def from_segments(self, segmented_words): + # pos tag is not available for segmented inputs + self.use_pos = False + segmented_words = self._check_segmented_words(segmented_words) + inputs = {} + inputs["words"] = segmented_words + inputs = self._preprocess_words(inputs) + outputs = self._run_model(inputs) + results = self._postprocess(outputs) + return results + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64"), + paddle.static.InputSpec(shape=[None, None], dtype="int64"), + ] + + def _construct_vocabs(self): + word_vocab_path = os.path.join(self._task_path, "word_vocab.json") + rel_vocab_path = os.path.join(self._task_path, "rel_vocab.json") + self.word_vocab = Vocab.from_json(word_vocab_path) + self.rel_vocab = Vocab.from_json(rel_vocab_path) + self.word_pad_index = self.word_vocab.to_indices("[PAD]") + self.word_bos_index = self.word_vocab.to_indices("[CLS]") + self.word_eos_index = self.word_vocab.to_indices("[SEP]") + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = BiAffineParser( + encoding_model=self.encoding_model, + n_rels=len(self.rel_vocab), + n_words=len(self.word_vocab), + pad_index=self.word_pad_index, + bos_index=self.word_bos_index, + eos_index=self.word_eos_index, + ) + model_path = os.path.join(self._task_path, "model_state.pdparams") + # Load the model parameter for the predict + state_dict = paddle.load(model_path) + model_instance.set_dict(state_dict) + model_instance.eval() + self._model = model_instance + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + return None + + def _preprocess_words(self, inputs): + examples = [] + for text in inputs["words"]: + example = {"FORM": text} + example = convert_example(example, vocabs=[self.word_vocab, self.rel_vocab]) + examples.append(example) + + batches = [examples[idx : idx + self.batch_size] for idx in range(0, len(examples), self.batch_size)] + + def batchify_fn(batch): + raw_batch = [raw for raw in zip(*batch)] + batch = [pad_sequence(data) for data in raw_batch] + return batch + + batches = [flat_words(batchify_fn(batch)[0]) for batch in batches] + + inputs["data_loader"] = batches + return inputs + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. 
+ """ + + outputs = {} + + lac_results = [] + position = 0 + + inputs = self._check_input_text(inputs) + while position < len(inputs): + lac_results += self.lac.run(inputs[position : position + self.batch_size]) + position += self.batch_size + + if not self.use_pos: + outputs["words"] = lac_results + else: + outputs["words"], outputs["postags"] = [raw for raw in zip(*lac_results)] + + outputs = self._preprocess_words(outputs) + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + + arcs, rels, probs = [], [], [] + for batch in inputs["data_loader"]: + words, wp = batch + self.input_handles[0].copy_from_cpu(words) + self.input_handles[1].copy_from_cpu(wp) + self.predictor.run() + arc_preds = self.output_handle[0].copy_to_cpu() + rel_preds = self.output_handle[1].copy_to_cpu() + s_arc = self.output_handle[2].copy_to_cpu() + mask = self.output_handle[3].copy_to_cpu().astype("bool") + + arc_preds, rel_preds = decode(arc_preds, rel_preds, s_arc, mask, self.tree) + + arcs.extend([arc_pred[m] for arc_pred, m in zip(arc_preds, mask)]) + rels.extend([rel_pred[m] for rel_pred, m in zip(rel_preds, mask)]) + if self.prob: + arc_probs = probability(s_arc, arc_preds) + probs.extend([arc_prob[m] for arc_prob, m in zip(arc_probs, mask)]) + inputs["arcs"] = arcs + inputs["rels"] = rels + inputs["probs"] = probs + return inputs + + def _postprocess(self, inputs): + + arcs = inputs["arcs"] + rels = inputs["rels"] + words = inputs["words"] + arcs = [[s.item() for s in seq] for seq in arcs] + rels = [self.rel_vocab.to_tokens(seq) for seq in rels] + + results = [] + + for word, arc, rel in zip(words, arcs, rels): + result = { + "word": word, + "head": arc, + "deprel": rel, + } + results.append(result) + + if self.use_pos: + postags = inputs["postags"] + for result, postag in zip(results, postags): + result["postag"] = postag + + if self.prob: + probs = inputs["probs"] + probs = [[round(p, 2) for p in seq.tolist()] for seq in probs] + for result, prob in zip(results, probs): + result["prob"] = prob + + if self.return_visual: + for result in results: + result["visual"] = self._visualize(result) + + return results + + def _visualize(self, data): + """ + Visualize the dependency. + Args: + data(dict): A dict contains the word, head and dep + Returns: + data: a numpy array, use cv2.imshow to show it or cv2.imwrite to save it. 
+ """ + try: + import matplotlib.font_manager as font_manager + import matplotlib.pyplot as plt + except Exception: + raise ImportError("Please install the dependencies first, pip install matplotlib --upgrade") + + self.plt = plt + self.font = font_manager.FontProperties(fname=self.font_file_path) + word, head, deprel = data["word"], data["head"], data["deprel"] + + nodes = ["ROOT"] + word + x = list(range(len(nodes))) + y = [0] * (len(nodes)) + fig, ax = self.plt.subplots() + # Control the picture size + max_span = max([abs(i + 1 - j) for i, j in enumerate(head)]) + fig.set_size_inches((len(nodes), max_span / 2)) + # Set the points + self.plt.scatter(x, y, c="w") + + for i in range(len(nodes)): + txt = nodes[i] + xytext = (i, 0) + if i == 0: + # Set 'ROOT' + ax.annotate( + txt, + xy=xytext, + xycoords="data", + xytext=xytext, + textcoords="data", + ) + else: + xy = (head[i - 1], 0) + rad = 0.5 if head[i - 1] < i else -0.5 + # Set the word + ax.annotate( + txt, + xy=xy, + xycoords="data", + xytext=(xytext[0] - 0.1, xytext[1]), + textcoords="data", + fontproperties=self.font, + ) + # Draw the curve + ax.annotate( + "", + xy=xy, + xycoords="data", + xytext=xytext, + textcoords="data", + arrowprops=dict( + arrowstyle="<-", + shrinkA=12, + shrinkB=12, + color="blue", + connectionstyle="arc3,rad=%s" % rad, + ), + ) + # Set the deprel label. Calculate its position by the radius + text_x = min(i, head[i - 1]) + abs((i - head[i - 1])) / 2 - 0.2 + text_y = abs((i - head[i - 1])) / 4 + ax.annotate(deprel[i - 1], xy=xy, xycoords="data", xytext=[text_x, text_y], textcoords="data") + + # Control the axis + self.plt.axis("equal") + self.plt.axis("off") + + # Save to numpy array + fig.canvas.draw() + data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8) + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))[:, :, ::-1] + return data + + +def pad_sequence(sequences, padding_value=0, fix_len=None): + """Fill sequences(np.ndarray) into a fixed-length matrix.""" + max_size = sequences[0].shape + trailing_dims = max_size[1:] + max_len = max([s.shape[0] for s in sequences]) + if fix_len is not None: + assert fix_len >= max_len, "fix_len is too small." + max_len = fix_len + out_dims = (len(sequences), max_len) + trailing_dims + out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype) + for i, tensor in enumerate(sequences): + length = tensor.shape[0] + out_tensor[i, :length, ...] 
= tensor + return out_tensor + + +def convert_example(example, vocabs, fix_len=20): + word_vocab, rel_vocab = vocabs + + word_bos_index = word_vocab.to_indices("[CLS]") + word_eos_index = word_vocab.to_indices("[SEP]") + + words = [[word_vocab.to_indices(char) for char in word] for word in example["FORM"]] + words = [[word_bos_index]] + words + [[word_eos_index]] + return [pad_sequence([np.array(ids[:fix_len], dtype=np.int64) for ids in words], fix_len=fix_len)] + + +def flat_words(words, pad_index=0): + mask = words != pad_index + lens = np.sum(mask.astype(np.int64), axis=-1) + position = np.cumsum(lens + (lens == 0).astype(np.int64), axis=1) - 1 + lens = np.sum(lens, -1) + words = words.ravel()[np.flatnonzero(words)] + + sequences = [] + idx = 0 + for l in lens: + sequences.append(words[idx : idx + l]) + idx += l + words = Pad(pad_val=pad_index)(sequences) + + max_len = words.shape[1] + + mask = (position >= max_len).astype(np.int64) + position = position * np.logical_not(mask) + mask * (max_len - 1) + return words, position + + +def probability(s_arc, arc_preds): + s_arc = s_arc - s_arc.max(axis=-1).reshape(list(s_arc.shape)[:-1] + [1]) + s_arc = np.exp(s_arc) / np.exp(s_arc).sum(axis=-1).reshape(list(s_arc.shape)[:-1] + [1]) + + arc_probs = [s[np.arange(len(arc_pred)), arc_pred] for s, arc_pred in zip(s_arc, arc_preds)] + return arc_probs + + +def decode(arc_preds, rel_preds, s_arc, mask, tree): + """decode""" + lens = np.sum(mask, -1) + + bad = [not istree(seq[: i + 1]) for i, seq in zip(lens, arc_preds)] + if tree and any(bad): + arc_preds[bad] = eisner(s_arc[bad], mask[bad]) + rel_preds = [rel_pred[np.arange(len(arc_pred)), arc_pred] for arc_pred, rel_pred in zip(arc_preds, rel_preds)] + return arc_preds, rel_preds + + +def eisner(scores, mask): + """ + Eisner algorithm is a general dynamic programming decoding algorithm for bilexical grammar. 
+ + Args: + scores: Adjacency matrix,shape=(batch, seq_len, seq_len) + mask: mask matrix,shape=(batch, sql_len) + + Returns: + output,shape=(batch, seq_len),the index of the parent node corresponding to the token in the query + + """ + lens = mask.sum(1) + batch_size, seq_len, _ = scores.shape + scores = scores.transpose(2, 1, 0) + # Score for incomplete span + s_i = np.full_like(scores, float("-inf")) + # Score for complete span + s_c = np.full_like(scores, float("-inf")) + # Incomplete span position for backtrack + p_i = np.zeros((seq_len, seq_len, batch_size), dtype=np.int64) + # Complete span position for backtrack + p_c = np.zeros((seq_len, seq_len, batch_size), dtype=np.int64) + # Set 0 to s_c.diagonal + s_c = fill_diagonal(s_c, 0) + # Contiguous + s_c = np.ascontiguousarray(s_c) + s_i = np.ascontiguousarray(s_i) + for w in range(1, seq_len): + n = seq_len - w + starts = np.arange(n, dtype=np.int64)[np.newaxis, :] + # ilr = C(i->r) + C(j->r+1) + ilr = stripe(s_c, n, w) + stripe(s_c, n, w, (w, 1)) + # Shape: [batch_size, n, w] + ilr = ilr.transpose(2, 0, 1) + # scores.diagonal(-w).shape:[batch, n] + il = ilr + scores.diagonal(-w)[..., np.newaxis] + # I(j->i) = max(C(i->r) + C(j->r+1) + s(j->i)), i <= r < j + il_span, il_path = il.max(-1), il.argmax(-1) + s_i = fill_diagonal(s_i, il_span, offset=-w) + p_i = fill_diagonal(p_i, il_path + starts, offset=-w) + + ir = ilr + scores.diagonal(w)[..., np.newaxis] + # I(i->j) = max(C(i->r) + C(j->r+1) + s(i->j)), i <= r < j + ir_span, ir_path = ir.max(-1), ir.argmax(-1) + s_i = fill_diagonal(s_i, ir_span, offset=w) + p_i = fill_diagonal(p_i, ir_path + starts, offset=w) + + # C(j->i) = max(C(r->i) + I(j->r)), i <= r < j + cl = stripe(s_c, n, w, (0, 0), 0) + stripe(s_i, n, w, (w, 0)) + cl = cl.transpose(2, 0, 1) + cl_span, cl_path = cl.max(-1), cl.argmax(-1) + s_c = fill_diagonal(s_c, cl_span, offset=-w) + p_c = fill_diagonal(p_c, cl_path + starts, offset=-w) + + # C(i->j) = max(I(i->r) + C(r->j)), i < r <= j + cr = stripe(s_i, n, w, (0, 1)) + stripe(s_c, n, w, (1, w), 0) + cr = cr.transpose(2, 0, 1) + cr_span, cr_path = cr.max(-1), cr.argmax(-1) + s_c = fill_diagonal(s_c, cr_span, offset=w) + s_c[0, w][np.not_equal(lens, w)] = float("-inf") + p_c = fill_diagonal(p_c, cr_path + starts + 1, offset=w) + + predicts = [] + p_c = p_c.transpose(2, 0, 1) + p_i = p_i.transpose(2, 0, 1) + for i, length in enumerate(lens.tolist()): + heads = np.ones(length + 1, dtype=np.int64) + backtrack(p_i[i], p_c[i], heads, 0, length, True) + predicts.append(heads) + + return pad_sequence(predicts, fix_len=seq_len) + + +def fill_diagonal(x, value, offset=0, dim1=0, dim2=1): + """ + Fill value into the diagoanl of x that offset is ${offset} + and the coordinate system is (dim1, dim2). + """ + strides = x.strides + shape = x.shape + if dim1 > dim2: + dim1, dim2 = dim2, dim1 + assert 0 <= dim1 < dim2 <= 2 + assert len(x.shape) == 3 + assert shape[dim1] == shape[dim2] + + dim_sum = dim1 + dim2 + dim3 = 3 - dim_sum + if offset >= 0: + diagonal = np.lib.stride_tricks.as_strided( + x[:, offset:] if dim_sum == 1 else x[:, :, offset:], + shape=(shape[dim3], shape[dim1] - offset), + strides=(strides[dim3], strides[dim1] + strides[dim2]), + ) + else: + diagonal = np.lib.stride_tricks.as_strided( + x[-offset:, :] if dim_sum in [1, 2] else x[:, -offset:], + shape=(shape[dim3], shape[dim1] + offset), + strides=(strides[dim3], strides[dim1] + strides[dim2]), + ) + + diagonal[...] 
= value + return x + + +def backtrack(p_i, p_c, heads, i, j, complete): + """ + Backtrack the position matrix of eisner to generate the tree + """ + if i == j: + return + if complete: + r = p_c[i, j] + backtrack(p_i, p_c, heads, i, r, False) + backtrack(p_i, p_c, heads, r, j, True) + else: + r, heads[j] = p_i[i, j], i + i, j = sorted((i, j)) + backtrack(p_i, p_c, heads, i, r, True) + backtrack(p_i, p_c, heads, j, r + 1, True) + + +def stripe(x, n, w, offset=(0, 0), dim=1): + """ + Returns a diagonal stripe of the tensor. + + Args: + x (Tensor): the input tensor with 2 or more dims. + n (int): the length of the stripe. + w (int): the width of the stripe. + offset (tuple): the offset of the first two dims. + dim (int): 0 if returns a horizontal stripe; 1 else. + + Example: + >>> x = np.arange(25).reshape(5, 5) + >>> x + tensor([[ 0, 1, 2, 3, 4], + [ 5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24]]) + >>> stripe(x, 2, 3, (1, 1)) + tensor([[ 6, 7, 8], + [12, 13, 14]]) + >>> stripe(x, 2, 3, dim=0) + tensor([[ 0, 5, 10], + [ 6, 11, 16]]) + """ + if not x.flags["C_CONTIGUOUS"]: + x = np.ascontiguousarray(x) + strides = x.strides + m = strides[0] + strides[1] + k = strides[1] if dim == 1 else strides[0] + return np.lib.stride_tricks.as_strided( + x[offset[0] :, offset[1] :], shape=[n, w] + list(x.shape[2:]), strides=[m, k] + list(strides[2:]) + ) + + +class Node: + """Node class""" + + def __init__(self, id=None, parent=None): + self.lefts = [] + self.rights = [] + self.id = int(id) + self.parent = parent if parent is None else int(parent) + + +class DepTree: + """ + DepTree class, used to check whether the prediction result is a project Tree. + A projective tree means that you can project the tree without crossing arcs. 
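+
+    Illustrative example: `istree` below wraps this check. With the head list indexed from
+    ROOT at position 0, istree([0, 2, 0, 2]) is True (token 2 heads tokens 1 and 3 and
+    attaches to ROOT), while istree([0, 3, 4, 0, 3]) is False because the arcs 3->1 and
+    4->2 cross, so the parse is not projective.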
+ """ + + def __init__(self, sentence): + # set root head to -1 + sentence = copy.deepcopy(sentence) + sentence[0] = -1 + self.sentence = sentence + self.build_tree() + self.visit = [False] * len(sentence) + + def build_tree(self): + """Build the tree""" + self.nodes = [Node(index, p_index) for index, p_index in enumerate(self.sentence)] + # set root + self.root = self.nodes[0] + for node in self.nodes[1:]: + self.add(self.nodes[node.parent], node) + + def add(self, parent, child): + """Add a child node""" + if parent.id is None or child.id is None: + raise Exception("id is None") + if parent.id < child.id: + parent.rights = sorted(parent.rights + [child.id]) + else: + parent.lefts = sorted(parent.lefts + [child.id]) + + def judge_legal(self): + """Determine whether it is a project tree""" + target_seq = list(range(len(self.nodes))) + if len(self.root.lefts + self.root.rights) != 1: + return False + cur_seq = self.inorder_traversal(self.root) + if target_seq != cur_seq: + return False + else: + return True + + def inorder_traversal(self, node): + """Inorder traversal""" + if self.visit[node.id]: + return [] + self.visit[node.id] = True + lf_list = [] + rf_list = [] + for ln in node.lefts: + lf_list += self.inorder_traversal(self.nodes[ln]) + for rn in node.rights: + rf_list += self.inorder_traversal(self.nodes[rn]) + + return lf_list + [node.id] + rf_list + + +def istree(sequence): + """Is the sequence a project tree""" + return DepTree(sequence).judge_legal() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dialogue.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dialogue.py new file mode 100644 index 000000000..78e0984e0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/dialogue.py @@ -0,0 +1,370 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +from collections import deque + +import numpy as np +import paddle + +from ..data import Pad +from ..transformers import UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer +from .task import Task + +usage = r""" + from paddlenlp import Taskflow + + # 非交互模式 + dialogue = Taskflow("dialogue") + dialogue(["吃饭了吗"]) + ''' + ['刚吃完饭,你在干什么呢?'] + ''' + dialogue(["你好", "吃饭了吗"], ["你是谁?"]) + ''' + ['吃过了,你呢', '我是李明啊'] + ''' + + dialogue = Taskflow("dialogue") + # 进入交互模式 (输入exit退出) + dialogue.interactive_mode(max_turn=3) + + ''' + [Human]:你好 + [Bot]:你好,很高兴认识你,我想问你一下,你喜欢运动吗? + [Human]:喜欢 + [Bot]:那你喜欢什么运动啊? + [Human]:篮球,你喜欢篮球吗 + [Bot]:当然了,我很喜欢打篮球的。 + ''' + """ + + +class DialogueTask(Task): + """ + Task of Chinese open domain dialogue. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + } + resource_files_urls = { + "plato-mini": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dialogue/plato-mini/model_state.pdparams", + "450be85b9b7f0bc03b12252a75af04f3", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/dialogue/plato-mini/model_config.json", + "5e853fda9a9b573815ad112e494a65af", + ], + }, + "__internal_testing__/tiny-random-plato": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-plato/model_state.pdparams", + "fda5d068908505cf0c3a46125eb4d39e", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-plato/config.json", + "3664e658d5273a132f2e7345a8cafa53", + ], + }, + } + + def __init__(self, task, model, batch_size=1, max_seq_len=512, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._static_mode = False + self._usage = usage + if not self._custom_model: + self._check_task_files() + self._construct_tokenizer(self._task_path if self._custom_model else model) + self._batch_size = batch_size + self._max_seq_len = max_seq_len + self._interactive_mode = False + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(self._task_path if self._custom_model else model) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None], dtype="int64", name="token_type_ids"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = UnifiedTransformerLMHeadModel.from_pretrained(model, from_hf_hub=self.from_hf_hub) + model_instance.eval() + self._model = model_instance + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = UnifiedTransformerTokenizer.from_pretrained(model, from_hf_hub=self.from_hf_hub) + + def _batchify_fn(self, batch_examples): + # padding = False if self._batch_size == 1 else True + pad_func = Pad(pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype="int64") + + def pad_mask(batch_attention_mask): + batch_size = len(batch_attention_mask) + max_len = max(map(len, batch_attention_mask)) + attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e4 + for i, mask_data in enumerate(attention_mask): + seq_len = len(batch_attention_mask[i]) + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32") + # In order to ensure the correct broadcasting mechanism, expand one + # dimension to the second dimension (n_head of Transformer). 
+ attention_mask = np.expand_dims(attention_mask, axis=1) + return attention_mask + + input_ids = pad_func([example["input_ids"] for example in batch_examples]) + token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples]) + position_ids = pad_func([example["position_ids"] for example in batch_examples]) + attention_mask = pad_mask([example["attention_mask"] for example in batch_examples]) + + return input_ids, token_type_ids, position_ids, attention_mask + + def _check_input_text(self, inputs): + if self._interactive_mode: + if isinstance(inputs, str): + self.context.append(inputs.strip()) + inputs = [list(self.context)] + return inputs + else: + raise ValueError("In the interactive mode, the input data shold be a string") + elif not isinstance(inputs[0], list): + raise ValueError("If not in the interactive mode, the input data should be a list.") + return inputs + + def _batchify(self, data, max_seq_len, batch_size): + """ + Generate input batches. + """ + padding = False if batch_size == 1 else True + pad_func = Pad(pad_val=self._tokenizer.pad_token_id, pad_right=False, dtype=np.int64) + + def pad_mask(batch_attention_mask): + batch_size = len(batch_attention_mask) + max_len = max(map(len, batch_attention_mask)) + attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e4 + for i, mask_data in enumerate(attention_mask): + seq_len = len(batch_attention_mask[i]) + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32") + # In order to ensure the correct broadcasting mechanism, expand one + # dimension to the second dimension (n_head of Transformer). + attention_mask = np.expand_dims(attention_mask, axis=1) + return attention_mask + + def _parse_batch(batch_examples): + if padding: + input_ids = pad_func([example["input_ids"] for example in batch_examples]) + token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples]) + position_ids = pad_func([example["position_ids"] for example in batch_examples]) + attention_mask = pad_mask([example["attention_mask"] for example in batch_examples]) + else: + input_ids = np.asarray([example["input_ids"] for example in batch_examples], dtype=np.int64) + token_type_ids = np.asarray([example["token_type_ids"] for example in batch_examples], dtype=np.int64) + position_ids = np.asarray([example["position_ids"] for example in batch_examples], dtype=np.int64) + attention_mask = np.asarray([example["attention_mask"] for example in batch_examples]) + attention_mask = np.expand_dims(attention_mask, 0) + + return input_ids, token_type_ids, position_ids, attention_mask + + examples = [] + for texts in data: + examples.append(self._convert_text_to_input(texts, max_seq_len)) + + # Separates data into some batches. + one_batch = [] + for example in examples: + one_batch.append(example) + if len(one_batch) == batch_size: + yield _parse_batch(one_batch) + one_batch = [] + if one_batch: + yield _parse_batch(one_batch) + + def _convert_text_to_input(self, texts, max_seq_len): + """ + Convert input strings to tokens. + """ + return self._tokenizer.dialogue_encode( + texts, max_seq_len=max_seq_len, add_start_token_as_response=True, is_split_into_words=False + ) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. 
+ """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 # noqa: F841 + lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False # noqa: F841 + + batches = self._batchify(inputs, self._max_seq_len, self._batch_size) + + outputs = {} + outputs["batches"] = batches + outputs["text"] = inputs + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + all_ids = [] + all_scores = [] + + for batch in inputs["batches"]: + input_ids, token_type_ids, position_ids, attention_mask = map(paddle.to_tensor, batch) + ids, scores = self._model.generate( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + max_length=64, + min_length=1, + decode_strategy="sampling", + temperature=1.0, + top_k=5, + top_p=1.0, + num_beams=0, + length_penalty=1.0, + early_stopping=False, + use_fast=False, + num_return_sequences=1, + ) + all_ids.extend([ids]) + all_scores.extend([scores]) + inputs["ids"] = all_ids + inputs["scores"] = all_scores + return inputs + + def _post_process_response(self, token_ids, tokenizer): + """ + Post-process the decoded sequence. Truncate from the first . + """ + eos_pos = len(token_ids) + for i, tok_id in enumerate(token_ids): + if tok_id == tokenizer.sep_token_id: + eos_pos = i + break + token_ids = token_ids[:eos_pos] + tokens = tokenizer.convert_ids_to_tokens(token_ids) + tokens = tokenizer.merge_subword(tokens) + return token_ids, tokens + + @contextlib.contextmanager + def interactive_mode(self, max_turn=3): + """ + Enter the interactive mode. + """ + self._interactive_mode = True + self.max_turn = max_turn + self.context = deque(maxlen=self.max_turn) + yield + self.context.clear() + self._interactive_mode = False + + def _get_in_turn_repetition(self, pred, is_cn=False): + """ + Get in-turn repetition. + """ + if len(pred) == 0: + return 1.0 + if isinstance(pred[0], str): + pred = [tok.lower() for tok in pred] + if is_cn: + pred = "".join(pred) + tri_grams = set() + for i in range(len(pred) - 2): + tri_gram = tuple(pred[i : i + 3]) + if tri_gram in tri_grams: + return True + tri_grams.add(tri_gram) + return False + + def _select_response(self, ids, scores, tokenizer, max_dec_len=None, num_return_sequences=1, keep_space=True): + """ + Select response with the highest score. 
+ """ + ids = ids.numpy().tolist() + scores = scores.numpy() + + if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0: + raise ValueError( + "the length of `ids` is {}, but the `num_return_sequences` is {}".format( + len(ids), num_return_sequences + ) + ) + + group = [] + tmp = [] + for pred, score in zip(ids, scores): + pred_token_ids, pred_tokens = self._post_process_response(pred, tokenizer) + num_token = len(pred_token_ids) + if keep_space: + response = " ".join(pred_tokens) + else: + response = "".join(pred_tokens) + + in_turn_repetition = self._get_in_turn_repetition(pred_tokens, True) or self._get_in_turn_repetition( + pred_token_ids + ) + # not ending + if max_dec_len is not None and num_token >= max_dec_len: + score -= 1e3 + elif in_turn_repetition: + score -= 1e3 + + tmp.append([response, score]) + if len(tmp) == num_return_sequences: + group.append(tmp) + tmp = [] + + results = [] + for preds in group: + preds = sorted(preds, key=lambda x: -x[1]) + results.append(preds[0][0]) + return results + + def _postprocess(self, inputs): + all_ids = inputs["ids"] + all_scores = inputs["scores"] + texts = inputs["text"] + + results = [] + for ids, scores, text in zip(all_ids, all_scores, texts): + results.extend( + self._select_response(ids, scores, self._tokenizer, num_return_sequences=1, keep_space=False) + ) + + if self._interactive_mode: + self.context.append(results[0].strip()) + return results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/document_intelligence.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/document_intelligence.py new file mode 100644 index 000000000..771de056c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/document_intelligence.py @@ -0,0 +1,252 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections + +from ..transformers import AutoTokenizer +from .task import Task +from .utils import ImageReader, download_file, find_answer_pos, get_doc_pred, sort_res + +usage = r""" + from paddlenlp import Taskflow + docprompt = Taskflow("document_intelligence") + # Types of doc: A string containing a local path to an image + docprompt({"doc": "./invoice.jpg", "prompt": ["发票号码是多少?", "校验码是多少?"]}) + # Types of doc: A string containing a http link pointing to an image + docprompt({"doc": "https://bj.bcebos.com/paddlenlp/taskflow/document_intelligence/images/invoice.jpg", "prompt": ["发票号码是多少?", "校验码是多少?"]}) + ''' + [{'prompt': '发票号码是多少?', 'result': [{'value': 'No44527206', 'prob': 0.74, 'start': 2, 'end': 2}]}, {'prompt': '校验码是多少?', 'result': [{'value': '01107 555427109891646', 'prob': 1.0, 'start': 231, 'end': 233}]}] + ''' + + # Batch input + batch_input = [ + {"doc": "./invoice.jpg", "prompt": ["发票号码是多少?", "校验码是多少?"]}, + {"doc": "./resume.png", "prompt": ["五百丁本次想要担任的是什么职位?", "五百丁是在哪里上的大学?", "大学学的是什么专业?"]} + ] + docprompt(batch_input) + ''' + [[{'prompt': '发票号码是多少?', 'result': [{'value': 'No44527206', 'prob': 0.74, 'start': 2, 'end': 2}]}, {'prompt': '校验码是多少?', 'result': [{'value': '01107 555427109891646', 'prob': 1.0, 'start': 231, 'end': 233}]}], [{'prompt': '五百丁本次想要担任的是什么职位?', 'result': [{'value': '客户经理', 'prob': 1.0, 'start': 4, 'end': 7}]}, {'prompt': '五百丁是在哪里上的大学?', 'result': [{'value': '广州五百丁学院', 'prob': 1.0, 'start': 31, 'end': 37}]}, {'prompt': '大学学的是什么专业?', 'result': [{'value': '金融学(本科)', 'prob': 0.82, 'start': 38, 'end': 44}]}]] + ''' + """ + +URLS = { + "docprompt": [ + "https://bj.bcebos.com/paddlenlp/taskflow/document_intelligence/docprompt/docprompt_params.tar", + "8eae8148981731f230b328076c5a08bf", + ], +} + + +class DocPromptTask(Task): + """ + The document intelligence model, give the querys and predict the answers. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._batch_size = kwargs.get("batch_size", 1) + self._topn = kwargs.get("topn", 1) + self._lang = kwargs.get("lang", "ch") + self._construct_ocr_engine(lang=self._lang) + self._usage = usage + download_file(self._task_path, "docprompt_params.tar", URLS[self.model][0], URLS[self.model][1]) + self._get_inference_model() + self._construct_tokenizer() + self._reader = ImageReader(super_rel_pos=False, tokenizer=self._tokenizer) + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = AutoTokenizer.from_pretrained("ernie-layoutx-base-uncased") + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. 
+ """ + preprocess_results = self._check_input_text(inputs) + for example in preprocess_results: + if "word_boxes" in example.keys(): + ocr_result = example["word_boxes"] + example["ocr_type"] = "word_boxes" + else: + ocr_result = self._ocr.ocr(example["doc"], cls=True) + example["ocr_type"] = "ppocr" + # Compatible with paddleocr>=2.6.0.2 + ocr_result = ocr_result[0] if len(ocr_result) == 1 else ocr_result + example["ocr_result"] = ocr_result + return preprocess_results + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + all_predictions_list = [] + for example in inputs: + ocr_result = example["ocr_result"] + doc_path = example["doc"] + prompt = example["prompt"] + ocr_type = example["ocr_type"] + + if not ocr_result: + all_predictions = [ + {"prompt": p, "result": [{"value": "", "prob": 0.0, "start": -1, "end": -1}]} for p in prompt + ] + all_boxes = {} + else: + data_loader = self._reader.data_generator(ocr_result, doc_path, prompt, self._batch_size, ocr_type) + + RawResult = collections.namedtuple("RawResult", ["unique_id", "seq_logits"]) + + all_results = [] + for data in data_loader: + for idx in range(len(self.input_names)): + self.input_handles[idx].copy_from_cpu(data[idx]) + self.predictor.run() + outputs = [output_handle.copy_to_cpu() for output_handle in self.output_handle] + unique_ids, seq_logits = outputs + + for idx in range(len(unique_ids)): + all_results.append( + RawResult( + unique_id=int(unique_ids[idx]), + seq_logits=seq_logits[idx], + ) + ) + + all_examples = self._reader.examples["infer"] + all_features = self._reader.features["infer"] + all_key_probs = [1 for _ in all_examples] + + example_index_to_features = collections.defaultdict(list) + + for feature in all_features: + example_index_to_features[feature.qas_id].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = [] + all_boxes = {} + for (example_index, example) in enumerate(all_examples): + example_doc_tokens = example.doc_tokens + example_qas_id = example.qas_id + page_id = example_qas_id.split("_")[0] + if page_id not in all_boxes: + all_boxes[page_id] = example.ori_boxes + example_query = example.keys[0] + features = example_index_to_features[example_qas_id] + + preds = [] + # keep track of the minimum score of null start+end of position 0 + for feature in features: + if feature.unique_id not in unique_id_to_result: + continue + result = unique_id_to_result[feature.unique_id] + + # find preds + ans_pos = find_answer_pos(result.seq_logits, feature) + preds.extend( + get_doc_pred( + result, ans_pos, example, self._tokenizer, feature, True, all_key_probs, example_index + ) + ) + + if not preds: + preds.append({"value": "", "prob": 0.0, "start": -1, "end": -1}) + else: + preds = sort_res(example_query, preds, example_doc_tokens, all_boxes[page_id], self._lang)[ + : self._topn + ] + all_predictions.append({"prompt": example_query, "result": preds}) + all_predictions_list.append(all_predictions) + return all_predictions_list + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. 
+ """ + results = inputs + results = results[0] if len(results) == 1 else results + return results + + def _check_input_text(self, inputs): + inputs = inputs[0] + if isinstance(inputs, dict): + inputs = [inputs] + if isinstance(inputs, list): + input_list = [] + for example in inputs: + data = {} + if isinstance(example, dict): + if "doc" not in example.keys(): + raise ValueError( + "Invalid inputs, the inputs should contain an url to an image or a local path." + ) + else: + if isinstance(example["doc"], str): + if example["doc"].startswith("http://") or example["doc"].startswith("https://"): + download_file("./", example["doc"].rsplit("/", 1)[-1], example["doc"]) + doc_path = example["doc"].rsplit("/", 1)[-1] + else: + doc_path = example["doc"] + data["doc"] = doc_path + else: + raise ValueError("Incorrect path or url, URLs must start with `http://` or `https://`") + if "prompt" not in example.keys(): + raise ValueError("Invalid inputs, the inputs should contain the prompt.") + else: + if isinstance(example["prompt"], str): + data["prompt"] = [example["prompt"]] + elif isinstance(example["prompt"], list) and all( + isinstance(s, str) for s in example["prompt"] + ): + data["prompt"] = example["prompt"] + else: + raise TypeError("Incorrect prompt, prompt should be string or list of string.") + if "word_boxes" in example.keys(): + data["word_boxes"] = example["word_boxes"] + input_list.append(data) + else: + raise TypeError( + "Invalid inputs, input for document intelligence task should be dict or list of dict, but type of {} found!".format( + type(example) + ) + ) + else: + raise TypeError( + "Invalid inputs, input for document intelligence task should be dict or list of dict, but type of {} found!".format( + type(inputs) + ) + ) + return input_list + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + pass + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + pass diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/fill_mask.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/fill_mask.py new file mode 100644 index 000000000..585d23b0a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/fill_mask.py @@ -0,0 +1,167 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Any, Dict, List, Optional, Union + +import paddle +import paddle.nn.functional as F + +from paddlenlp.data import DataCollatorWithPadding +from paddlenlp.transformers import AutoModelForMaskedLM, AutoTokenizer + +from .task import Task +from .utils import dygraph_mode_guard + +usage = r""" + from paddlenlp import Taskflow + text_cls = Taskflow( + "fill_mask", + task_path=, + top_k=1 + ) + text_cls('飞桨[MASK]度学习架') + ''' + [ + { + 'token': , + 'token_str': '深', + 'sequence': 飞桨深度学习框架, + 'score': 0.65 + } + ] + ''' + text_cls(['飞桨[MASK]度学习架', '生活的真谛是[MASK]']) + ''' + [ + { + 'token': , + 'token_str': '深', + 'sequence': 飞桨深度学习框架, + 'score': 0.65 + }, + { + 'token': , + 'token_str': '爱', + 'sequence': 生活的真谛是爱, + 'score': 0.65 + } + ] + """ + + +class FillMaskTask(Task): + """ + Perform cloze-style mask filling with Masked Language Modeling (MLM) + NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities. + Instead, it's used as a simple inference pipeline. + Args: + task (string): The name of task. + task_path (string): The local file path to the model path or a pre-trained model + top_k (string, optional): The number of predictions to return.. Defaults to 5. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task: str, model: Optional[str] = None, top_k: Optional[str] = 5, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self.top_k = top_k + self._construct_tokenizer(self._task_path) + self._construct_model(self._task_path) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + raise NotImplementedError(f"Conversion from dygraph to static graph is not supported in {self.__name__}") + + def _construct_model(self, model: str): + """ + Construct the inference model for the predictor. + """ + model_instance = AutoModelForMaskedLM.from_pretrained(model, from_hf_hub=self.from_hf_hub) + model_instance.eval() + self._model = model_instance + + def _construct_tokenizer(self, model: str): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = AutoTokenizer.from_pretrained(model, from_hf_hub=self.from_hf_hub) + + def get_masked_index(self, input_ids): + return paddle.nonzero(input_ids == self._tokenizer.mask_token_id) + + def ensure_exactly_one_mask_token(self, input_ids: List[int]): + num_mask_token = input_ids.count(self._tokenizer.mask_token_id) + if num_mask_token != 1: + raise ValueError(f"FillMaskTask expects 1 mask token for each input but found {num_mask_token}") + + def _preprocess(self, inputs: Union[str, List[str]]) -> Dict[str, Any]: + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. 
+ """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + + max_length = self.kwargs["max_length"] if "max_length" in self.kwargs else 512 + collator = DataCollatorWithPadding(self._tokenizer, return_tensors="pd") + tokenized_inputs = [] + for i in inputs: + tokenized_input = self._tokenizer(i, max_length=max_length) + self.ensure_exactly_one_mask_token(tokenized_input["input_ids"]) + tokenized_inputs.append(tokenized_input) + + batches = [tokenized_inputs[idx : idx + batch_size] for idx in range(0, len(tokenized_inputs), batch_size)] + outputs = {} + outputs["text"] = inputs + outputs["batches"] = [collator(batch) for batch in batches] + + return outputs + + def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """ + Run the task model from the outputs of the `_tokenize` function. + """ + model_outputs = [] + with dygraph_mode_guard(): + for batch in inputs["batches"]: + logits = self._model(**batch) + masked_index = self.get_masked_index(batch["input_ids"]) + mask_token_logits = paddle.gather_nd(logits, masked_index) + mask_token_probs = F.softmax(mask_token_logits, axis=-1) + top_probs, top_pred_indices = paddle.topk(mask_token_probs, k=self.top_k, axis=-1) + for probs, pred_indices in zip(top_probs.tolist(), top_pred_indices.tolist()): + model_output = [] + for prob, pred in zip(probs, pred_indices): + model_output.append({"token": pred, "score": prob}) + model_outputs.append(model_output) + outputs = {} + outputs["text"] = inputs["text"] + outputs["model_outputs"] = model_outputs + return outputs + + def _postprocess(self, inputs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + The model output is tag ids, this function will convert the model output to raw text. + """ + for i, model_output in enumerate(inputs["model_outputs"]): + # Same API with https://huggingface.co/tasks/fill-mask + for token_output in model_output: + token_output["token_str"] = self._tokenizer.decode(token_output["token"]) + # Since we limit to 1 MASK per input, we can directly use .replace here + token_output["sequence"] = inputs["text"][i].replace("[MASK]", token_output["token_str"]) + return inputs["model_outputs"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/information_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/information_extraction.py new file mode 100644 index 000000000..fac8d7231 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/information_extraction.py @@ -0,0 +1,1592 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import base64 +import json +import os +import re +from typing import List + +import numpy as np +import paddle +from huggingface_hub import hf_hub_download + +from ..datasets import load_dataset +from ..layers import GlobalPointerForEntityExtraction, GPLinkerForRelationExtraction +from ..transformers import UIE, UIEM, UIEX, AutoModel, AutoTokenizer +from ..utils.doc_parser import DocParser +from ..utils.env import CONFIG_NAME, LEGACY_CONFIG_NAME +from ..utils.ie_utils import map_offset, pad_image_data +from ..utils.log import logger +from ..utils.tools import get_bool_ids_greater_than, get_span +from .task import Task +from .utils import DataCollatorGP, SchemaTree, dbc2sbc, get_id_and_prob, gp_decode + +usage = r""" + from paddlenlp import Taskflow + + # Entity Extraction + schema = ['时间', '选手', '赛事名称'] # Define the schema for entity extraction + ie = Taskflow('information_extraction', schema=schema) + ie("2月8日上午北京冬奥会自由式滑雪女子大跳台决赛中中国选手谷爱凌以188.25分获得金牌!") + ''' + [{'时间': [{'text': '2月8日上午', 'start': 0, 'end': 6, 'probability': 0.9857378532924486}], '选手': [{'text': '谷爱凌', 'start': 28, 'end': 31, 'probability': 0.8981548639781138}], '赛事名称': [{'text': '北京冬奥会自由式滑雪女子大跳台决赛', 'start': 6, 'end': 23, 'probability': 0.8503089953268272}]}] + ''' + + # Relation Extraction + schema = [{"歌曲名称":["歌手", "所属专辑"]}] # Define the schema for relation extraction + ie.set_schema(schema) # Reset schema + ie("《告别了》是孙耀威在专辑爱的故事里面的歌曲") + ''' + [{'歌曲名称': [{'text': '告别了', 'start': 1, 'end': 4, 'probability': 0.6296155977145546, 'relations': {'歌手': [{'text': '孙耀威', 'start': 6, 'end': 9, 'probability': 0.9988381005599081}], '所属专辑': [{'text': '爱的故事', 'start': 12, 'end': 16, 'probability': 0.9968462078543183}]}}, {'text': '爱的故事', 'start': 12, 'end': 16, 'probability': 0.2816869478191606, 'relations': {'歌手': [{'text': '孙耀威', 'start': 6, 'end': 9, 'probability': 0.9951415104192272}]}}]}] + ''' + + # Event Extraction + schema = [{'地震触发词': ['地震强度', '时间', '震中位置', '震源深度']}] # Define the schema for event extraction + ie.set_schema(schema) # Reset schema + ie('中国地震台网正式测定:5月16日06时08分在云南临沧市凤庆县(北纬24.34度,东经99.98度)发生3.5级地震,震源深度10千米。') + ''' + [{'地震触发词': [{'text': '地震', 'start': 56, 'end': 58, 'probability': 0.9977425555988333, 'relations': {'地震强度': [{'text': '3.5级', 'start': 52, 'end': 56, 'probability': 0.998080217831891}], '时间': [{'text': '5月16日06时08分', 'start': 11, 'end': 22, 'probability': 0.9853299772936026}], '震中位置': [{'text': '云南临沧市凤庆县(北纬24.34度,东经99.98度)', 'start': 23, 'end': 50, 'probability': 0.7874012889740385}], '震源深度': [{'text': '10千米', 'start': 63, 'end': 67, 'probability': 0.9937974422968665}]}}]}] + ''' + + # Opinion Extraction + schema = [{'评价维度': ['观点词', '情感倾向[正向,负向]']}] # Define the schema for opinion extraction + ie.set_schema(schema) # Reset schema + ie("地址不错,服务一般,设施陈旧") + ''' + [{'评价维度': [{'text': '地址', 'start': 0, 'end': 2, 'probability': 0.9888139270606509, 'relations': {'观点词': [{'text': '不错', 'start': 2, 'end': 4, 'probability': 0.9927847072459528}], '情感倾向[正向,负向]': [{'text': '正向', 'probability': 0.998228967796706}]}}, {'text': '设施', 'start': 10, 'end': 12, 'probability': 0.9588297379365116, 'relations': {'观点词': [{'text': '陈旧', 'start': 12, 'end': 14, 'probability': 0.9286753967902683}], '情感倾向[正向,负向]': [{'text': '负向', 'probability': 0.9949389795770394}]}}, {'text': '服务', 'start': 5, 'end': 7, 'probability': 0.9592857070501211, 'relations': {'观点词': [{'text': '一般', 'start': 7, 'end': 9, 'probability': 0.9949359182521675}], '情感倾向[正向,负向]': [{'text': '负向', 'probability': 0.9952498258302498}]}}]}] + ''' + + # 
Sentence-level Sentiment Classification + schema = ['情感倾向[正向,负向]'] # Define the schema for sentence-level sentiment classification + ie.set_schema(schema) # Reset schema + ie('这个产品用起来真的很流畅,我非常喜欢') + ''' + [{'情感倾向[正向,负向]': [{'text': '正向', 'probability': 0.9990024058203417}]}] + ''' + + # English Model + schema = [{'Person': ['Company', 'Position']}] + ie_en = Taskflow('information_extraction', schema=schema, model='uie-base-en') + ie_en('In 1997, Steve was excited to become the CEO of Apple.') + ''' + [{'Person': [{'text': 'Steve', 'start': 9, 'end': 14, 'probability': 0.999631971804547, 'relations': {'Company': [{'text': 'Apple', 'start': 48, 'end': 53, 'probability': 0.9960158209451642}], 'Position': [{'text': 'CEO', 'start': 41, 'end': 44, 'probability': 0.8871063806420736}]}}]}] + ''' + + schema = ['Sentiment classification [negative, positive]'] + ie_en.set_schema(schema) + ie_en('I am sorry but this is the worst film I have ever seen in my life.') + ''' + [{'Sentiment classification [negative, positive]': [{'text': 'negative', 'probability': 0.9998415771287057}]}] + ''' + + # Multilingual Model + schema = [{'Person': ['Company', 'Position']}] + ie_m = Taskflow('information_extraction', schema=schema, model='uie-m-base', schema_lang="en") + ie_m('In 1997, Steve was excited to become the CEO of Apple.') + ''' + [{'Person': [{'text': 'Steve', 'start': 9, 'end': 14, 'probability': 0.9998436034905893, 'relations': {'Company': [{'text': 'Apple', 'start': 48, 'end': 53, 'probability': 0.9842775467359672}], 'Position': [{'text': 'CEO', 'start': 41, 'end': 44, 'probability': 0.9628799853543271}]}}]}] + ''' + """ + +MODEL_MAP = {"UIE": UIE, "UIEM": UIEM, "UIEX": UIEX} + + +def get_dynamic_max_length(examples, default_max_length: int, dynamic_max_length: List[int]) -> int: + """get max_length by examples which you can change it by examples in batch""" + cur_length = len(examples[0]["input_ids"]) + max_length = default_max_length + for max_length_option in sorted(dynamic_max_length): + if cur_length <= max_length_option: + max_length = max_length_option + break + return max_length + + +class UIETask(Task): + """ + Universal Information Extraction Task. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "config": "config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + # vocab.txt/special_tokens_map.json/tokenizer_config.json are common to the default model. 
+ resource_files_urls = { + "uie-base": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_v1.1/model_state.pdparams", + "47b93cf6a85688791699548210048085", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/config.json", + "ad8b5442c758fb2dc18ea53b61e867f7", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-medium": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium_v1.1/model_state.pdparams", + "c34475665eb05e25f3c9cd9b020b331a", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium/config.json", + "7fb22b3e07c5af76371c25ab814f06b8", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-mini": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini_v1.1/model_state.pdparams", + "9a0805762c41b104d590c15fbe9b19fd", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_mini/config.json", + "8ddebbf64c3f32a49e6f9e1c220e7322", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-micro": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro_v1.1/model_state.pdparams", + "da67287bca2906864929e16493f748e4", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_micro/config.json", + "544ddc65c758536cd3ba122f55b8709c", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-nano": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano_v1.1/model_state.pdparams", + "48db5206232e89ef16b66467562d90e5", + ], + "config": [ + 
"https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_nano/config.json", + "e0e0a2c0d9651ed1a8492be5507590a9", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + # Rename to `uie-medium` and the name of `uie-tiny` will be deprecated in future. + "uie-tiny": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium_v1.1/model_state.pdparams", + "c34475665eb05e25f3c9cd9b020b331a", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medium/config.json", + "7fb22b3e07c5af76371c25ab814f06b8", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-medical-base": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_medical_base_v0.2/model_state.pdparams", + "7582d3b01f6faf00b7000111ea853796", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/config.json", + "ad8b5442c758fb2dc18ea53b61e867f7", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-base-en": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en_v1.2/model_state.pdparams", + "8c5d5c8faa76681a0aad58f982cd6141", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/config.json", + "257b80ea8b7889fd8b83a9ace7a8a220", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/vocab.txt", + "64800d5d8528ce344256daf115d4965e", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_base_en/tokenizer_config.json", + "59acb0ce78e79180a2491dfd8382b28c", + ], + }, + "uie-m-base": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base_v1.1/model_state.pdparams", + "eb00c06bd7144e76343d750f5bf36ff6", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/config.json", + "f03de3ce1b83c13e7bee18e6f323d33f", + ], + 
"vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/vocab.txt", + "e6e1091c984592e72c4460e8eb25045e", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/tokenizer_config.json", + "f144bd065ea90cc26eaa91197124bdcc", + ], + "sentencepiece_model_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/sentencepiece.bpe.model", + "bf25eb5120ad92ef5c7d8596b5dc4046", + ], + }, + "uie-m-large": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large_v1.1/model_state.pdparams", + "9db83a67f34a9c2483dbe57d2510b4c2", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/config.json", + "8f540de05de57ecc66336b41f3a7ffdb", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/vocab.txt", + "e6e1091c984592e72c4460e8eb25045e", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/tokenizer_config.json", + "f144bd065ea90cc26eaa91197124bdcc", + ], + "sentencepiece_model_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_large/sentencepiece.bpe.model", + "bf25eb5120ad92ef5c7d8596b5dc4046", + ], + }, + "uie-x-base": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base_v1.0/model_state.pdparams", + "a953b55f7639ae73d1df6c2c5f7667dd", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base/config.json", + "6bcd7d4b119717121fa0276c20bd9224", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base/vocab.txt", + "e6e1091c984592e72c4460e8eb25045e", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base/special_tokens_map.json", + "ba000b17745bb5b5b40236789318847f", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base/tokenizer_config.json", + "09456ba644dac6f9d0b367353a36abe7", + ], + "sentencepiece_model_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base/sentencepiece.bpe.model", + "bf25eb5120ad92ef5c7d8596b5dc4046", + ], + }, + "__internal_testing__/tiny-random-uie": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie/model_state.pdparams", + "9e89a3bf94081b2d9ed89118419a3061", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie/config.json", + "113667d59b84133a99b4f1f1ec5784d7", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + 
"https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie/tokenizer_config.json", + "dcb0f3257830c0eb1f2de47f2d86f89a", + ], + }, + "__internal_testing__/tiny-random-uie-m": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-m/model_state.pdparams", + "9fd51b19ba96ab634185744e0a214378", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-m/config.json", + "7fc6b1503db1e68bec4e6035cc7705c5", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-m/vocab.txt", + "e6e1091c984592e72c4460e8eb25045e", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-m/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-m/tokenizer_config.json", + "66651e1427b0936da3f964f640303d16", + ], + "sentencepiece_model_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_m_base/sentencepiece.bpe.model", + "bf25eb5120ad92ef5c7d8596b5dc4046", + ], + }, + "__internal_testing__/tiny-random-uie-x": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-x_v1.0/model_state.pdparams", + "d9b573b31a82b860b6e5a3005d7b879e", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-x_v1.0/config.json", + "27d715e680596a69d882056a400d97db", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-x/vocab.txt", + "e6e1091c984592e72c4460e8eb25045e", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-x/special_tokens_map.json", + "ba000b17745bb5b5b40236789318847f", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-uie-x/tokenizer_config.json", + "c19bdbcec62476176d268e4dc7f1e506", + ], + "sentencepiece_model_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/information_extraction/uie_x_base/sentencepiece.bpe.model", + "bf25eb5120ad92ef5c7d8596b5dc4046", + ], + }, + } + + def __init__(self, task, model, schema=None, **kwargs): + super().__init__(task=task, model=model, **kwargs) + + self._convert_from_torch = kwargs.get("convert_from_torch", None) + self._max_seq_len = kwargs.get("max_seq_len", 512) + self._dynamic_max_length = kwargs.get("dynamic_max_length", None) + self._batch_size = kwargs.get("batch_size", 16) + self._split_sentence = kwargs.get("split_sentence", False) + self._position_prob = kwargs.get("position_prob", 0.5) + self._lazy_load = kwargs.get("lazy_load", False) + self._num_workers = kwargs.get("num_workers", 0) + self._use_fast = kwargs.get("use_fast", False) + self._layout_analysis = kwargs.get("layout_analysis", False) + self._ocr_lang = kwargs.get("ocr_lang", "ch") + self._schema_lang = kwargs.get("schema_lang", "ch") + self._expand_to_a4_size = False if self._custom_model else True + + if self.model in [ + "uie-m-base", + "uie-m-large", + "uie-x-base", + "__internal_testing__/tiny-random-uie-m", + "__internal_testing__/tiny-random-uie-x", + ]: + self.resource_files_names["sentencepiece_model_file"] = "sentencepiece.bpe.model" + elif "sentencepiece_model_file" in self.resource_files_names.keys(): + del 
self.resource_files_names["sentencepiece_model_file"] + + # TODO: temporary solution to support HF Hub due to lack of AutoModel + # change this logic to use AutoConfig when available + if self.from_hf_hub: + config_file_path = hf_hub_download(repo_id=self._task_path, filename=CONFIG_NAME) + with open(config_file_path) as f: + self._init_class = json.load(f)["architectures"].pop() + else: + # Compatible with the model fine-tuned without PretrainedConfig + if os.path.exists(os.path.join(self._task_path, LEGACY_CONFIG_NAME)): + if "config" in self.resource_files_names.keys(): + del self.resource_files_names["config"] + with open(os.path.join(self._task_path, LEGACY_CONFIG_NAME)) as f: + self._init_class = json.load(f)["init_class"] + self._check_task_files() + else: + self._check_task_files() + with open(os.path.join(self._task_path, CONFIG_NAME)) as f: + self._init_class = json.load(f)["architectures"].pop() + + self._is_en = True if model in ["uie-base-en"] or self._schema_lang == "en" else False + + if self._init_class in ["UIEX"]: + self._summary_token_num = 4 # [CLS] prompt [SEP] [SEP] text [SEP] for UIE-X + else: + self._summary_token_num = 3 # [CLS] prompt [SEP] text [SEP] + + self._parser_map = { + "ch": None, # OCR-CH + "en": None, # OCR-EN + "ch-layout": None, # Layout-CH + "en-layout": None, # Layout-EN + } + if not schema: + logger.warning( + "The schema has not been set yet, please set a schema via set_schema(). " + "More details about the setting of schema please refer to https://github.com/PaddlePaddle/PaddleNLP/blob/develop/applications/information_extraction/taskflow_text.md" + ) + self._schema_tree = None + else: + self.set_schema(schema) + self._check_predictor_type() + self._get_inference_model() + self._usage = usage + self._construct_tokenizer() + + def set_argument(self, argument: dict): + for k, v in argument.items(): + if k == "input": + continue + setattr(self, f"_{k}", v) + + def set_schema(self, schema): + if isinstance(schema, dict) or isinstance(schema, str): + schema = [schema] + self._schema_tree = self._build_tree(schema) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. 
+ """ + if paddle.get_device().split(":", 1)[0] == "npu": + input_spec_dtype = "int32" + else: + input_spec_dtype = "int64" + if self._init_class in ["UIEX"]: + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="attention_mask"), + paddle.static.InputSpec(shape=[None, None, 4], dtype="int64", name="bbox"), + paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype="float32", name="image"), + ] + elif self._init_class in ["UIEM"]: + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="position_ids"), + ] + else: + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="token_type_ids"), + paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="position_ids"), + paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="attention_mask"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = MODEL_MAP[self._init_class].from_pretrained( + self._task_path, from_hf_hub=self.from_hf_hub, convert_from_torch=self._convert_from_torch + ) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = AutoTokenizer.from_pretrained(self._task_path, from_hf_hub=self.from_hf_hub) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + outputs = {} + outputs["text"] = inputs + return outputs + + def _check_input_text(self, inputs): + """ + Check whether the input meet the requirement. + """ + self._ocr_lang_choice = (self._ocr_lang + "-layout") if self._layout_analysis else self._ocr_lang + inputs = inputs[0] + if isinstance(inputs, dict) or isinstance(inputs, str): + inputs = [inputs] + if isinstance(inputs, list): + input_list = [] + for example in inputs: + data = {} + if isinstance(example, dict): + if "doc" in example.keys(): + if not self._parser_map[self._ocr_lang_choice]: + self._parser_map[self._ocr_lang_choice] = DocParser( + ocr_lang=self._ocr_lang, layout_analysis=self._layout_analysis + ) + if "layout" in example.keys(): + data = self._parser_map[self._ocr_lang_choice].parse( + {"doc": example["doc"]}, do_ocr=False, expand_to_a4_size=self._expand_to_a4_size + ) + data["layout"] = example["layout"] + else: + data = self._parser_map[self._ocr_lang_choice].parse( + {"doc": example["doc"]}, expand_to_a4_size=self._expand_to_a4_size + ) + elif "text" in example.keys(): + if not isinstance(example["text"], str): + raise TypeError( + "Invalid inputs, the input text should be string. 
but type of {} found!".format( + type(example["text"]) + ) + ) + data["text"] = example["text"] + else: + raise ValueError("Invalid inputs, the input should contain a doc or a text.") + input_list.append(data) + elif isinstance(example, str): + input_list.append(example) + else: + raise TypeError( + "Invalid inputs, the input should be dict or list of dict, but type of {} found!".format( + type(example) + ) + ) + else: + raise TypeError("Invalid input format!") + return input_list + + def _single_stage_predict(self, inputs): + input_texts = [d["text"] for d in inputs] + prompts = [d["prompt"] for d in inputs] + + # max predict length should exclude the length of prompt and summary tokens + max_predict_len = self._max_seq_len - len(max(prompts)) - self._summary_token_num + + if self._init_class in ["UIEX"]: + bbox_list = [d["bbox"] for d in inputs] + short_input_texts, short_bbox_list, input_mapping = self._auto_splitter( + input_texts, max_predict_len, bbox_list=bbox_list, split_sentence=self._split_sentence + ) + else: + short_input_texts, input_mapping = self._auto_splitter( + input_texts, max_predict_len, split_sentence=self._split_sentence + ) + + short_texts_prompts = [] + for k, v in input_mapping.items(): + short_texts_prompts.extend([prompts[k] for _ in range(len(v))]) + if self._init_class in ["UIEX"]: + image_list = [] + for k, v in input_mapping.items(): + image_list.extend([inputs[k]["image"] for _ in range(len(v))]) + short_inputs = [ + { + "text": short_input_texts[i], + "prompt": short_texts_prompts[i], + "bbox": short_bbox_list[i], + "image": image_list[i], + } + for i in range(len(short_input_texts)) + ] + else: + short_inputs = [ + {"text": short_input_texts[i], "prompt": short_texts_prompts[i]} for i in range(len(short_input_texts)) + ] + + def text_reader(inputs): + for example in inputs: + if self._dynamic_max_length is not None: + temp_encoded_inputs = self._tokenizer( + text=[example["prompt"]], + text_pair=[example["text"]], + truncation=True, + max_seq_len=self._max_seq_len, + return_attention_mask=True, + return_position_ids=True, + return_dict=False, + return_offsets_mapping=True, + ) + max_length = get_dynamic_max_length( + examples=temp_encoded_inputs, + default_max_length=self._max_seq_len, + dynamic_max_length=self._dynamic_max_length, + ) + encoded_inputs = self._tokenizer( + text=[example["prompt"]], + text_pair=[example["text"]], + truncation=True, + max_seq_len=max_length, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_position_ids=True, + return_offsets_mapping=True, + ) + logger.info("Inference with dynamic max length in {}".format(max_length)) + else: + encoded_inputs = self._tokenizer( + text=[example["prompt"]], + text_pair=[example["text"]], + truncation=True, + max_seq_len=self._max_seq_len, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_position_ids=True, + return_offsets_mapping=True, + ) + if self._init_class in ["UIEM"]: + tokenized_output = [ + encoded_inputs["input_ids"][0], + encoded_inputs["position_ids"][0], + encoded_inputs["offset_mapping"][0], + ] + else: + tokenized_output = [ + encoded_inputs["input_ids"][0], + encoded_inputs["token_type_ids"][0], + encoded_inputs["position_ids"][0], + encoded_inputs["attention_mask"][0], + encoded_inputs["offset_mapping"][0], + ] + tokenized_output = [np.array(x, dtype="int64") for x in tokenized_output] + yield tuple(tokenized_output) + + def doc_reader(inputs, pad_id=1, c_sep_id=2): + def _process_bbox(tokens, bbox_lines, offset_mapping, offset_bias): + 
bbox_list = [[0, 0, 0, 0] for x in range(len(tokens))] + + for index, bbox in enumerate(bbox_lines): + index_token = map_offset(index + offset_bias, offset_mapping) + if 0 <= index_token < len(bbox_list): + bbox_list[index_token] = bbox + return bbox_list + + def _encode_doc( + tokenizer, offset_mapping, last_offset, prompt, this_text_line, inputs_ids, q_sep_index, max_seq_len + ): + if len(offset_mapping) == 0: + content_encoded_inputs = tokenizer( + text=[prompt], + text_pair=[this_text_line], + max_seq_len=max_seq_len, + return_dict=False, + return_offsets_mapping=True, + ) + + content_encoded_inputs = content_encoded_inputs[0] + inputs_ids = content_encoded_inputs["input_ids"][:-1] + sub_offset_mapping = [list(x) for x in content_encoded_inputs["offset_mapping"]] + q_sep_index = content_encoded_inputs["input_ids"].index(2, 1) + + bias = 0 + for i in range(len(sub_offset_mapping)): + if i == 0: + continue + mapping = sub_offset_mapping[i] + if mapping[0] == 0 and mapping[1] == 0 and bias == 0: + bias = sub_offset_mapping[i - 1][-1] + 1 + if mapping[0] == 0 and mapping[1] == 0: + continue + if mapping == sub_offset_mapping[i - 1]: + continue + sub_offset_mapping[i][0] += bias + sub_offset_mapping[i][1] += bias + + offset_mapping = sub_offset_mapping[:-1] + last_offset = offset_mapping[-1][-1] + else: + content_encoded_inputs = tokenizer( + text=this_text_line, max_seq_len=max_seq_len, return_dict=False, return_offsets_mapping=True + ) + inputs_ids += content_encoded_inputs["input_ids"][1:-1] + sub_offset_mapping = [list(x) for x in content_encoded_inputs["offset_mapping"]] + for i, sub_list in enumerate(sub_offset_mapping[1:-1]): + if i == 0: + org_offset = sub_list[1] + else: + if sub_list[0] != org_offset and sub_offset_mapping[1:-1][i - 1] != sub_list: + last_offset += 1 + org_offset = sub_list[1] + offset_mapping += [[last_offset, sub_list[1] - sub_list[0] + last_offset]] + last_offset = offset_mapping[-1][-1] + return offset_mapping, last_offset, q_sep_index, inputs_ids + + for example in inputs: + content = example["text"] + prompt = example["prompt"] + bbox_lines = example.get("bbox", None) + image_buff_string = example.get("image", None) + # Text + if bbox_lines is None: + encoded_inputs = self._tokenizer( + text=[example["prompt"]], + text_pair=[example["text"]], + truncation=True, + max_seq_len=self._max_seq_len, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_position_ids=True, + return_offsets_mapping=True, + return_dict=False, + ) + + encoded_inputs = encoded_inputs[0] + + inputs_ids = encoded_inputs["input_ids"] + position_ids = encoded_inputs["position_ids"] + attention_mask = encoded_inputs["attention_mask"] + + q_sep_index = inputs_ids.index(2, 1) + c_sep_index = attention_mask.index(0) + + offset_mapping = [list(x) for x in encoded_inputs["offset_mapping"]] + + bbox_list = [[0, 0, 0, 0] for x in range(len(inputs_ids))] + token_type_ids = [ + 1 if token_index <= q_sep_index or token_index > c_sep_index else 0 + for token_index in range(self._max_seq_len) + ] + padded_image = np.zeros([3, 224, 224]) + # Doc + else: + inputs_ids = [] + prev_bbox = [-1, -1, -1, -1] + this_text_line = "" + q_sep_index = -1 + offset_mapping = [] + last_offset = 0 + for char_index, (char, bbox) in enumerate(zip(content, bbox_lines)): + if char_index == 0: + prev_bbox = bbox + this_text_line = char + continue + + if all([bbox[x] == prev_bbox[x] for x in range(4)]): + this_text_line += char + else: + offset_mapping, last_offset, q_sep_index, inputs_ids = _encode_doc( + 
self._tokenizer, + offset_mapping, + last_offset, + prompt, + this_text_line, + inputs_ids, + q_sep_index, + self._max_seq_len, + ) + this_text_line = char + prev_bbox = bbox + if len(this_text_line) > 0: + offset_mapping, last_offset, q_sep_index, inputs_ids = _encode_doc( + self._tokenizer, + offset_mapping, + last_offset, + prompt, + this_text_line, + inputs_ids, + q_sep_index, + self._max_seq_len, + ) + if len(inputs_ids) > self._max_seq_len: + inputs_ids = inputs_ids[: (self._max_seq_len - 1)] + [c_sep_id] + offset_mapping = offset_mapping[: (self._max_seq_len - 1)] + [[0, 0]] + else: + inputs_ids += [c_sep_id] + offset_mapping += [[0, 0]] + + if len(offset_mapping) > 1: + offset_bias = offset_mapping[q_sep_index - 1][-1] + 1 + else: + offset_bias = 0 + + seq_len = len(inputs_ids) + inputs_ids += [pad_id] * (self._max_seq_len - seq_len) + token_type_ids = [1] * (q_sep_index + 1) + [0] * (seq_len - q_sep_index - 1) + token_type_ids += [pad_id] * (self._max_seq_len - seq_len) + + bbox_list = _process_bbox(inputs_ids, bbox_lines, offset_mapping, offset_bias) + + offset_mapping += [[0, 0]] * (self._max_seq_len - seq_len) + + # Reindex the text + text_start_idx = offset_mapping[1:].index([0, 0]) + self._summary_token_num - 1 + for idx in range(text_start_idx, self._max_seq_len): + offset_mapping[idx][0] -= offset_bias + offset_mapping[idx][1] -= offset_bias + + position_ids = list(range(seq_len)) + + position_ids = position_ids + [0] * (self._max_seq_len - seq_len) + attention_mask = [1] * seq_len + [0] * (self._max_seq_len - seq_len) + + image_data = base64.b64decode(image_buff_string.encode("utf8")) + padded_image = pad_image_data(image_data) + + input_list = [ + inputs_ids, + token_type_ids, + position_ids, + attention_mask, + bbox_list, + padded_image, + offset_mapping, + ] + input_list = [inputs_ids, token_type_ids, position_ids, attention_mask, bbox_list] + return_list = [np.array(x, dtype="int64") for x in input_list] + return_list.append(np.array(padded_image, dtype="float32")) + return_list.append(np.array(offset_mapping, dtype="int64")) + assert len(inputs_ids) == self._max_seq_len + assert len(token_type_ids) == self._max_seq_len + assert len(position_ids) == self._max_seq_len + assert len(attention_mask) == self._max_seq_len + assert len(bbox_list) == self._max_seq_len + yield tuple(return_list) + + reader = doc_reader if self._init_class in ["UIEX"] else text_reader + infer_ds = load_dataset(reader, inputs=short_inputs, lazy=self._lazy_load) + batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) + + infer_data_loader = paddle.io.DataLoader( + dataset=infer_ds, batch_sampler=batch_sampler, num_workers=self._num_workers, return_list=True + ) + + sentence_ids = [] + probs = [] + for batch in infer_data_loader: + if self._init_class in ["UIEX"]: + input_ids, token_type_ids, pos_ids, att_mask, bbox, image, offset_maps = batch + elif self._init_class in ["UIEM"]: + input_ids, pos_ids, offset_maps = batch + else: + input_ids, token_type_ids, pos_ids, att_mask, offset_maps = batch + if self._predictor_type == "paddle-inference": + if self._init_class in ["UIEX"]: + self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) + self.input_handles[2].copy_from_cpu(pos_ids.numpy()) + self.input_handles[3].copy_from_cpu(att_mask.numpy()) + self.input_handles[4].copy_from_cpu(bbox.numpy()) + self.input_handles[5].copy_from_cpu(image.numpy()) + elif self._init_class in ["UIEM"]: + 
self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(pos_ids.numpy()) + else: + self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) + self.input_handles[2].copy_from_cpu(pos_ids.numpy()) + self.input_handles[3].copy_from_cpu(att_mask.numpy()) + self.predictor.run() + start_prob = self.output_handle[0].copy_to_cpu().tolist() + end_prob = self.output_handle[1].copy_to_cpu().tolist() + else: + if self._init_class in ["UIEX"]: + input_dict = { + "input_ids": input_ids.numpy(), + "token_type_ids": token_type_ids.numpy(), + "position_ids": pos_ids.numpy(), + "attention_mask": att_mask.numpy(), + "bbox": bbox.numpy(), + "image": image.numpy(), + } + elif self._init_class in ["UIEM"]: + input_dict = { + "input_ids": input_ids.numpy(), + "position_ids": pos_ids.numpy(), + } + else: + input_dict = { + "input_ids": input_ids.numpy(), + "token_type_ids": token_type_ids.numpy(), + "position_ids": pos_ids.numpy(), + "attention_mask": att_mask.numpy(), + } + start_prob, end_prob = self.predictor.run(None, input_dict) + start_prob = start_prob.tolist() + end_prob = end_prob.tolist() + + start_ids_list = get_bool_ids_greater_than(start_prob, limit=self._position_prob, return_prob=True) + end_ids_list = get_bool_ids_greater_than(end_prob, limit=self._position_prob, return_prob=True) + for start_ids, end_ids, offset_map in zip(start_ids_list, end_ids_list, offset_maps.tolist()): + span_set = get_span(start_ids, end_ids, with_prob=True) + sentence_id, prob = get_id_and_prob(span_set, offset_map) + sentence_ids.append(sentence_id) + probs.append(prob) + results = self._convert_ids_to_results(short_inputs, sentence_ids, probs) + results = self._auto_joiner(results, short_input_texts, input_mapping) + return results + + def _auto_joiner(self, short_results, short_inputs, input_mapping): + concat_results = [] + is_cls_task = False + for short_result in short_results: + if short_result == []: + continue + elif "start" not in short_result[0].keys() and "end" not in short_result[0].keys(): + is_cls_task = True + break + else: + break + for k, vs in input_mapping.items(): + if is_cls_task: + cls_options = {} + single_results = [] + for v in vs: + if len(short_results[v]) == 0: + continue + if short_results[v][0]["text"] not in cls_options.keys(): + cls_options[short_results[v][0]["text"]] = [1, short_results[v][0]["probability"]] + else: + cls_options[short_results[v][0]["text"]][0] += 1 + cls_options[short_results[v][0]["text"]][1] += short_results[v][0]["probability"] + if len(cls_options) != 0: + cls_res, cls_info = max(cls_options.items(), key=lambda x: x[1]) + concat_results.append([{"text": cls_res, "probability": cls_info[1] / cls_info[0]}]) + else: + concat_results.append([]) + else: + offset = 0 + single_results = [] + for v in vs: + if v == 0: + single_results = short_results[v] + offset += len(short_inputs[v]) + else: + for i in range(len(short_results[v])): + if "start" not in short_results[v][i] or "end" not in short_results[v][i]: + continue + short_results[v][i]["start"] += offset + short_results[v][i]["end"] += offset + offset += len(short_inputs[v]) + single_results.extend(short_results[v]) + concat_results.append(single_results) + return concat_results + + def _run_model(self, inputs): + raw_inputs = inputs["text"] + _inputs = self._parse_inputs(raw_inputs) + results = self._multi_stage_predict(_inputs) + inputs["result"] = results + return inputs + + def _parse_inputs(self, inputs): + _inputs = 
[] + for d in inputs: + if isinstance(d, dict): + if "doc" in d.keys(): + text = "" + bbox = [] + img_w, img_h = d["img_w"], d["img_h"] + offset_x, offset_y = d["offset_x"], d["offset_x"] + for segment in d["layout"]: + org_box = segment[0] # bbox before expand to A4 size + box = [ + org_box[0] + offset_x, + org_box[1] + offset_y, + org_box[2] + offset_x, + org_box[3] + offset_y, + ] + box = self._parser_map[self._ocr_lang_choice]._normalize_box(box, [img_w, img_h], [1000, 1000]) + text += segment[1] + bbox.extend([box] * len(segment[1])) + _inputs.append({"text": text, "bbox": bbox, "image": d["image"], "layout": d["layout"]}) + else: + _inputs.append({"text": d["text"], "bbox": None, "image": None}) + else: + _inputs.append({"text": d, "bbox": None, "image": None}) + return _inputs + + def _multi_stage_predict(self, data): + """ + Traversal the schema tree and do multi-stage prediction. + + Args: + data (list): a list of strings + + Returns: + list: a list of predictions, where the list's length + equals to the length of `data` + """ + results = [{} for _ in range(len(data))] + # Input check to early return + if len(data) < 1 or self._schema_tree is None: + return results + + # Copy to stay `self._schema_tree` unchanged + schema_list = self._schema_tree.children[:] + while len(schema_list) > 0: + node = schema_list.pop(0) + examples = [] + input_map = {} + cnt = 0 + idx = 0 + if not node.prefix: + for one_data in data: + examples.append( + { + "text": one_data["text"], + "bbox": one_data["bbox"], + "image": one_data["image"], + "prompt": dbc2sbc(node.name), + } + ) + input_map[cnt] = [idx] + idx += 1 + cnt += 1 + else: + for pre, one_data in zip(node.prefix, data): + if len(pre) == 0: + input_map[cnt] = [] + else: + for p in pre: + if self._is_en: + if re.search(r"\[.*?\]$", node.name): + prompt_prefix = node.name[: node.name.find("[", 1)].strip() + cls_options = re.search(r"\[.*?\]$", node.name).group() + # Sentiment classification of xxx [positive, negative] + prompt = prompt_prefix + p + " " + cls_options + else: + prompt = node.name + p + else: + prompt = p + node.name + examples.append( + { + "text": one_data["text"], + "bbox": one_data["bbox"], + "image": one_data["image"], + "prompt": dbc2sbc(prompt), + } + ) + input_map[cnt] = [i + idx for i in range(len(pre))] + idx += len(pre) + cnt += 1 + if len(examples) == 0: + result_list = [] + else: + result_list = self._single_stage_predict(examples) + + if not node.parent_relations: + relations = [[] for i in range(len(data))] + for k, v in input_map.items(): + for idx in v: + if len(result_list[idx]) == 0: + continue + if node.name not in results[k].keys(): + results[k][node.name] = result_list[idx] + else: + results[k][node.name].extend(result_list[idx]) + if node.name in results[k].keys(): + relations[k].extend(results[k][node.name]) + else: + relations = node.parent_relations + for k, v in input_map.items(): + for i in range(len(v)): + if len(result_list[v[i]]) == 0: + continue + if "relations" not in relations[k][i].keys(): + relations[k][i]["relations"] = {node.name: result_list[v[i]]} + elif node.name not in relations[k][i]["relations"].keys(): + relations[k][i]["relations"][node.name] = result_list[v[i]] + else: + relations[k][i]["relations"][node.name].extend(result_list[v[i]]) + new_relations = [[] for i in range(len(data))] + for i in range(len(relations)): + for j in range(len(relations[i])): + if "relations" in relations[i][j].keys() and node.name in relations[i][j]["relations"].keys(): + for k in 
range(len(relations[i][j]["relations"][node.name])): + new_relations[i].append(relations[i][j]["relations"][node.name][k]) + relations = new_relations + + prefix = [[] for _ in range(len(data))] + for k, v in input_map.items(): + for idx in v: + for i in range(len(result_list[idx])): + if self._is_en: + prefix[k].append(" of " + result_list[idx][i]["text"]) + else: + prefix[k].append(result_list[idx][i]["text"] + "的") + + for child in node.children: + child.prefix = prefix + child.parent_relations = relations + schema_list.append(child) + results = self._add_bbox_info(results, data) + return results + + def _add_bbox_info(self, results, data): + def _add_bbox(result, char_boxes): + for vs in result.values(): + for v in vs: + if "start" in v.keys() and "end" in v.keys(): + boxes = [] + for i in range(v["start"], v["end"]): + cur_box = char_boxes[i][1] + if i == v["start"]: + box = cur_box + continue + _, cur_y1, cur_x2, cur_y2 = cur_box + if cur_y1 == box[1] and cur_y2 == box[3]: + box[2] = cur_x2 + else: + boxes.append(box) + box = cur_box + if box: + boxes.append(box) + boxes = [[int(b) for b in box] for box in boxes] + v["bbox"] = boxes + if v.get("relations"): + _add_bbox(v["relations"], char_boxes) + return result + + new_results = [] + for result, one_data in zip(results, data): + if "layout" in one_data.keys(): + layout = one_data["layout"] + char_boxes = [] + for segment in layout: + sbox = segment[0] + text_len = len(segment[1]) + if text_len == 0: + continue + if len(segment) == 2 or (len(segment) == 3 and segment[2] != "table"): + char_w = (sbox[2] - sbox[0]) * 1.0 / text_len + for i in range(text_len): + cbox = [sbox[0] + i * char_w, sbox[1], sbox[0] + (i + 1) * char_w, sbox[3]] + char_boxes.append((segment[1][i], cbox)) + else: + cell_bbox = [(segment[1][i], sbox) for i in range(text_len)] + char_boxes.extend(cell_bbox) + + result = _add_bbox(result, char_boxes) + new_results.append(result) + return new_results + + def _convert_ids_to_results(self, examples, sentence_ids, probs): + """ + Convert ids to raw text in a single stage. + """ + results = [] + for example, sentence_id, prob in zip(examples, sentence_ids, probs): + if len(sentence_id) == 0: + results.append([]) + continue + result_list = [] + text = example["text"] + prompt = example["prompt"] + for i in range(len(sentence_id)): + start, end = sentence_id[i] + if start < 0 and end >= 0: + continue + if end < 0: + start += len(prompt) + 1 + end += len(prompt) + 1 + result = {"text": prompt[start:end], "probability": prob[i]} + result_list.append(result) + else: + result = {"text": text[start:end], "start": start, "end": end, "probability": prob[i]} + result_list.append(result) + results.append(result_list) + return results + + @classmethod + def _build_tree(cls, schema, name="root"): + """ + Build the schema tree. + """ + schema_tree = SchemaTree(name) + for s in schema: + if isinstance(s, str): + schema_tree.add_child(SchemaTree(s)) + elif isinstance(s, dict): + for k, v in s.items(): + if isinstance(v, str): + child = [v] + elif isinstance(v, list): + child = v + else: + raise TypeError( + "Invalid schema, value for each key:value pairs should be list or string" + "but {} received".format(type(v)) + ) + schema_tree.add_child(cls._build_tree(child, name=k)) + else: + raise TypeError("Invalid schema, element should be string or dict, " "but {} received".format(type(s))) + return schema_tree + + def _postprocess(self, inputs): + """ + This function will convert the model output to raw text. 
+ """ + return inputs["result"] + + +class GPTask(Task): + """ + Global Pointer for closed-domain information extraction Task. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._schema_tree = None + self._load_config() + self._construct_tokenizer() + self._get_inference_model() + + self._max_seq_len = kwargs.get("max_seq_len", 256) + self._batch_size = kwargs.get("batch_size", 64) + self._lazy_load = kwargs.get("lazy_load", False) + self._num_workers = kwargs.get("num_workers", 0) + + def _load_config(self): + model_config_file = os.path.join(self._task_path, self.resource_files_names["model_config"]) + with open(model_config_file, encoding="utf-8") as f: + model_config = json.load(f) + self._label_maps = model_config["label_maps"] + self._task_type = model_config["task_type"] + self._encoder = model_config["encoder"] + schema = model_config["label_maps"]["schema"] + self._set_schema(schema) + + def _set_schema(self, schema): + if isinstance(schema, dict) or isinstance(schema, str): + schema = [schema] + self._schema_tree = self._build_tree(schema) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="att_mask"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + encoder = AutoModel.from_pretrained(self._encoder) + if self._task_type == "entity_extraction": + model_instance = GlobalPointerForEntityExtraction(encoder, self._label_maps) + else: + model_instance = GPLinkerForRelationExtraction(encoder, self._label_maps) + model_path = os.path.join(self._task_path, "model_state.pdparams") + state_dict = paddle.load(model_path) + model_instance.set_dict(state_dict) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. + """ + # TODO(zhoushunjie): Will set use_fast=True in future. + self._tokenizer = AutoTokenizer.from_pretrained(self._task_path) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. 
+ """ + inputs = self._check_input_text(inputs) + + def read(inputs): + for x in inputs: + tokenized_inputs = self._tokenizer( + x, + max_length=self._max_seq_len, + padding=False, + truncation=True, + return_attention_mask=True, + return_offsets_mapping=True, + return_token_type_ids=False, + ) + tokenized_inputs["text"] = x + yield tokenized_inputs + + infer_ds = load_dataset(read, inputs=inputs, lazy=self._lazy_load) + + data_collator = DataCollatorGP(self._tokenizer, label_maps=self._label_maps, task_type=self._task_type) + + batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) + + infer_data_loader = paddle.io.DataLoader( + dataset=infer_ds, + batch_sampler=batch_sampler, + collate_fn=data_collator, + num_workers=self._num_workers, + return_list=True, + ) + outputs = {} + outputs["data_loader"] = infer_data_loader + outputs["input_texts"] = inputs + return outputs + + def _run_model(self, inputs): + all_preds = ([], []) if self._task_type in ["opinion_extraction", "relation_extraction"] else [] + for batch in inputs["data_loader"]: + input_ids, attention_masks, offset_mappings, texts = batch + self.input_handles[0].copy_from_cpu(input_ids.numpy().astype("int64")) + self.input_handles[1].copy_from_cpu(attention_masks.numpy().astype("int64")) + self.predictor.run() + logits = [paddle.to_tensor(self.output_handle[i].copy_to_cpu()) for i in range(len(self.output_handle))] + batch_outputs = gp_decode(logits, offset_mappings, texts, self._label_maps, self._task_type) + if isinstance(batch_outputs, tuple): + all_preds[0].extend(batch_outputs[0]) # Entity output + all_preds[1].extend(batch_outputs[1]) # Relation output + else: + all_preds.extend(batch_outputs) + inputs["result"] = all_preds + return inputs + + @classmethod + def _build_tree(cls, schema, name="root"): + """ + Build the schema tree. 
+ """ + schema_tree = SchemaTree(name) + for s in schema: + if isinstance(s, str): + schema_tree.add_child(SchemaTree(s)) + elif isinstance(s, dict): + for k, v in s.items(): + if isinstance(v, str): + child = [v] + elif isinstance(v, list): + child = v + else: + raise TypeError( + "Invalid schema, value for each key:value pairs should be list or string" + "but {} received".format(type(v)) + ) + schema_tree.add_child(cls._build_tree(child, name=k)) + else: + raise TypeError("Invalid schema, element should be string or dict, " "but {} received".format(type(s))) + return schema_tree + + def _postprocess(self, inputs): + if self._task_type == "entity_extraction": + results = self._postprocess_entity_extraction(inputs) + elif self._task_type == "opinion_extraction": + results = self._postprocess_opinion_extraction(inputs) + else: + results = self._postprocess_relation_extraction(inputs) + return results + + def _postprocess_opinion_extraction(self, inputs): + all_ent_preds, all_rel_preds = inputs["result"] + results = [] + for i in range(len(inputs["input_texts"])): + result = {} + aspect_maps = {} + for ent in all_ent_preds[i]: + ent_res = { + "text": ent["text"], + "start": ent["start_index"], + "end": ent["start_index"] + len(ent["text"]), + "probability": ent["probability"], + } + result.setdefault(ent["type"], []).append(ent_res) + if ent["type"] == "评价维度": + for r in result["评价维度"]: + if ent["text"] == r["text"] and ent["start_index"] == r["start"]: + aspect_maps[(ent["text"], ent["start_index"])] = r + break + + for rel in all_rel_preds[i]: + r = aspect_maps[(rel["aspect"], rel["aspect_start_index"])] + r["relations"] = {} + sentiment = {"probability": rel["probability"], "text": rel["sentiment"]} + opinion = { + "text": rel["opinion"], + "start": rel["opinion_start_index"], + "end": rel["opinion_start_index"] + len(rel["opinion"]), + "probability": rel["probability"], + } + r["relations"].setdefault("情感倾向[正向,负向]", []).append(sentiment) + r["relations"].setdefault("观点词", []).append(opinion) + results.append(result) + return results + + def _postprocess_relation_extraction(self, inputs): + all_ent_preds, all_rel_preds = inputs["result"] + results = [] + for input_text_idx in range(len(inputs["input_texts"])): + result = {} + schema_list = self._schema_tree.children[:] + while len(schema_list) > 0: + node = schema_list.pop(0) + if node.parent_relations is None: + prefix = [] + relations = [[]] + cnt = -1 + for ent in all_ent_preds[input_text_idx]: + if node.name == ent["type"]: + ent_res = { + "text": ent["text"], + "start": ent["start_index"], + "end": ent["start_index"] + len(ent["text"]), + "probability": ent["probability"].astype("float"), + } + result.setdefault(node.name, []).append(ent_res) + cnt += 1 + result[node.name][cnt]["relations"] = {} + relations[0].append(result[node.name][cnt]) + else: + relations = [[] for _ in range(len(node.parent_relations))] + for i, rs in enumerate(node.parent_relations): + for r in rs: + cnt = -1 + for rel in all_rel_preds[input_text_idx]: + if ( + r["text"] == rel["subject"] + and r["start"] == rel["subject_start_index"] + and node.name == rel["predicate"] + ): + rel_res = { + "text": rel["object"], + "start": rel["object_start_index"], + "end": rel["object_start_index"] + len(rel["object"]), + "probability": rel["probability"].astype("float"), + } + r["relations"].setdefault(node.name, []).append(rel_res) + cnt += 1 + r["relations"][node.name][cnt]["relations"] = {} + relations[i].append(r["relations"][node.name][cnt]) + for child in 
node.children: + child.prefix = prefix + child.parent_relations = relations + schema_list.append(child) + results.append(result) + return results + + def _postprocess_entity_extraction(self, inputs): + all_preds = inputs["result"] + results = [] + for input_text_idx in range(len(inputs["input_texts"])): + result = {} + schema_list = self._schema_tree.children[:] + while len(schema_list) > 0: + node = schema_list.pop(0) + for ent in all_preds[input_text_idx]: + if node.name == ent["type"]: + ent_res = { + "text": ent["text"], + "start": ent["start_index"], + "end": ent["start_index"] + len(ent["text"]), + "probability": ent["probability"].astype("float"), + } + result.setdefault(node.name, []).append(ent_res) + results.append(result) + return results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/knowledge_mining.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/knowledge_mining.py new file mode 100644 index 000000000..23dd88798 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/knowledge_mining.py @@ -0,0 +1,773 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from collections import OrderedDict + +import numpy as np +import paddle + +from ..datasets import load_dataset +from ..transformers import ErnieCtmNptagModel, ErnieCtmTokenizer, ErnieCtmWordtagModel +from ..transformers.ernie_ctm.configuration import ErnieCtmConfig +from .task import Task +from .utils import ( + BurkhardKellerTree, + Customization, + DataCollatorForErnieCtm, + TermTree, + WordTagRelationExtractor, + add_docstrings, +) + +LABEL_TO_SCHEMA = { + "人物类_实体": ["人物|E", "虚拟角色|E", "演艺团体|E"], + "人物类_概念": ["人物|C", "虚拟角色|C"], + "作品类_实体": ["作品与出版物|E"], + "作品类_概念": ["作品与出版物|C", "文化类"], + "组织机构类": ["组织机构"], + "组织机构类_企事业单位": ["企事业单位", "品牌", "组织机构"], + "组织机构类_医疗卫生机构": ["医疗卫生机构", "组织机构"], + "组织机构类_国家机关": ["国家机关", "组织机构"], + "组织机构类_体育组织机构": ["体育组织机构", "组织机构"], + "组织机构类_教育组织机构": ["教育组织机构", "组织机构"], + "组织机构类_军事组织机构": ["军事组织机构", "组织机构"], + "物体类": ["物体与物品", "品牌", "虚拟物品", "虚拟物品"], + "物体类_兵器": ["兵器"], + "物体类_化学物质": ["物体与物品", "化学术语"], + "其他角色类": ["角色"], + "文化类": ["文化", "作品与出版物|C", "体育运动项目", "语言文字"], + "文化类_语言文字": ["语言学术语"], + "文化类_奖项赛事活动": ["奖项赛事活动", "特殊日", "事件"], + "文化类_制度政策协议": ["制度政策协议", "法律法规"], + "文化类_姓氏与人名": ["姓氏与人名"], + "生物类": ["生物"], + "生物类_植物": ["植物", "生物"], + "生物类_动物": ["动物", "生物"], + "品牌名": ["品牌", "企事业单位"], + "场所类": ["区域场所", "居民服务机构", "医疗卫生机构"], + "场所类_交通场所": ["交通场所", "设施"], + "位置方位": ["位置方位"], + "世界地区类": ["世界地区", "区域场所", "政权朝代"], + "饮食类": ["饮食", "生物类", "药物"], + "饮食类_菜品": ["饮食"], + "饮食类_饮品": ["饮食"], + "药物类": ["药物", "生物类"], + "药物类_中药": ["药物", "生物类"], + "医学术语类": ["医药学术语"], + "术语类_生物体": ["生物学术语"], + "疾病损伤类": ["疾病损伤", "动物疾病", "医药学术语"], + "疾病损伤类_植物病虫害": ["植物病虫害", "医药学术语"], + "宇宙类": ["天文学术语"], + "事件类": ["事件", "奖项赛事活动"], + "时间类": ["时间阶段", "政权朝代"], + "术语类": ["术语"], + "术语类_符号指标类": ["编码符号指标", "术语"], + "信息资料": ["生活用语"], + "链接地址": ["生活用语"], + "个性特征": 
["个性特点", "生活用语"], + "感官特征": ["生活用语"], + "场景事件": ["场景事件", "情绪", "态度", "个性特点"], + "介词": ["介词"], + "介词_方位介词": ["介词"], + "助词": ["助词"], + "代词": ["代词"], + "连词": ["连词"], + "副词": ["副词"], + "疑问词": ["疑问词"], + "肯定词": ["肯定否定词"], + "否定词": ["肯定否定词"], + "数量词": ["数量词", "量词"], + "叹词": ["叹词"], + "拟声词": ["拟声词"], + "修饰词": ["修饰词", "生活用语"], + "外语单词": ["日文假名", "词汇用语"], + "汉语拼音": ["汉语拼音"], +} + +usage = r""" + from paddlenlp import Taskflow + + # 默认使用WordTag词类知识标注工具 + wordtag = Taskflow("knowledge_mining", model="wordtag") + wordtag("《孤女》是2010年九州出版社出版的小说,作者是余兼羽") + ''' + [{'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}] + ''' + + wordtag= Taskflow("knowledge_mining", batch_size=2) + wordtag(["热梅茶是一道以梅子为主要原料制作的茶饮", + "《孤女》是2010年九州出版社出版的小说,作者是余兼羽"]) + ''' + [{'text': '热梅茶是一道以梅子为主要原料制作的茶饮', 'items': [{'item': '热梅茶', 'offset': 0, 'wordtag_label': '饮食类_饮品', 'length': 3}, {'item': '是', 'offset': 3, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '一道', 'offset': 4, 'wordtag_label': '数量词', 'length': 2}, {'item': '以', 'offset': 6, 'wordtag_label': '介词', 'length': 1, 'termid': '介词_cb_以'}, {'item': '梅子', 'offset': 7, 'wordtag_label': '饮食类', 'length': 2, 'termid': '饮食_cb_梅'}, {'item': '为', 'offset': 9, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_为'}, {'item': '主要原料', 'offset': 10, 'wordtag_label': '物体类', 'length': 4, 'termid': '物品_cb_主要原料'}, {'item': '制作', 'offset': 14, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_制作'}, {'item': '的', 'offset': 16, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '茶饮', 'offset': 17, 'wordtag_label': '饮食类_饮品', 'length': 2, 'termid': '饮品_cb_茶饮'}]}, {'text': '《孤女》是2010年九州出版社出版的小说,作者是余兼羽', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '孤女', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 2}, {'item': '》', 'offset': 3, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 4, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '2010年', 'offset': 5, 'wordtag_label': '时间类', 'length': 5, 'termid': '时间阶段_cb_2010年'}, {'item': '九州出版社', 'offset': 10, 'wordtag_label': '组织机构类', 'length': 5, 'termid': '组织机构_eb_九州出版社'}, {'item': '出版', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2, 'termid': '场景事件_cb_出版'}, {'item': '的', 'offset': 17, 'wordtag_label': '助词', 'length': 1, 'termid': '助词_cb_的'}, {'item': '小说', 'offset': 18, 'wordtag_label': '作品类_概念', 'length': 2, 'termid': '小说_cb_小说'}, {'item': ',', 'offset': 20, 'wordtag_label': 'w', 
'length': 1}, {'item': '作者', 'offset': 21, 'wordtag_label': '人物类_概念', 'length': 2, 'termid': '人物_cb_作者'}, {'item': '是', 'offset': 23, 'wordtag_label': '肯定词', 'length': 1, 'termid': '肯定否定词_cb_是'}, {'item': '余兼羽', 'offset': 24, 'wordtag_label': '人物类_实体', 'length': 3}]}] + ''' + + # 使用WordTag-IE进行信息抽取 + wordtag = Taskflow("knowledge_mining", model="wordtag", with_ie=True) + ''' + [[{'text': '《忘了所有》是一首由王杰作词、作曲并演唱的歌曲,收录在专辑同名《忘了所有》中,由波丽佳音唱片于1996年08月31日发行。', 'items': [{'item': '《', 'offset': 0, 'wordtag_label': 'w', 'length': 1}, {'item': '忘了所有', 'offset': 1, 'wordtag_label': '作品类_实体', 'length': 4}, {'item': '》', 'offset': 5, 'wordtag_label': 'w', 'length': 1}, {'item': '是', 'offset': 6, 'wordtag_label': '肯定词', 'length': 1}, {'item': '一首', 'offset': 7, 'wordtag_label': '数量词_单位数量词', 'length': 2}, {'item': '由', 'offset': 9, 'wordtag_label': '介词', 'length': 1}, {'item': '王杰', 'offset': 10, 'wordtag_label': '人物类_实体', 'length': 2}, {'item': '作词', 'offset': 12, 'wordtag_label': '场景事件', 'length': 2}, {'item': '、', 'offset': 14, 'wordtag_label': 'w', 'length': 1}, {'item': '作曲', 'offset': 15, 'wordtag_label': '场景事件', 'length': 2}, {'item': '并', 'offset': 17, 'wordtag_label': '连词', 'length': 1}, {'item': '演唱', 'offset': 18, 'wordtag_label': '场景事件', 'length': 2}, {'item': '的', 'offset': 20, 'wordtag_label': '助词', 'length': 1}, {'item': '歌曲', 'offset': 21, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': ',', 'offset': 23, 'wordtag_label': 'w', 'length': 1}, {'item': '收录', 'offset': 24, 'wordtag_label': '场景事件', 'length': 2}, {'item': '在', 'offset': 26, 'wordtag_label': '介词', 'length': 1}, {'item': '专辑', 'offset': 27, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '同名', 'offset': 29, 'wordtag_label': '场景事件', 'length': 2}, {'item': '《', 'offset': 31, 'wordtag_label': 'w', 'length': 1}, {'item': '忘了所有', 'offset': 32, 'wordtag_label': '作品类_实体', 'length': 4}, {'item': '》', 'offset': 36, 'wordtag_label': 'w', 'length': 1}, {'item': '中', 'offset': 37, 'wordtag_label': '词汇用语', 'length': 1}, {'item': ',', 'offset': 38, 'wordtag_label': 'w', 'length': 1}, {'item': '由', 'offset': 39, 'wordtag_label': '介词', 'length': 1}, {'item': '波丽佳音', 'offset': 40, 'wordtag_label': '人物类_实体', 'length': 4}, {'item': '唱片', 'offset': 44, 'wordtag_label': '作品类_概念', 'length': 2}, {'item': '于', 'offset': 46, 'wordtag_label': '介词', 'length': 1}, {'item': '1996年08月31日', 'offset': 47, 'wordtag_label': '时间类_具体时间', 'length': 11}, {'item': '发行', 'offset': 58, 'wordtag_label': '场景事件', 'length': 2}, {'item': '。', 'offset': 60, 'wordtag_label': 'w', 'length': 1}]}], [[{'HEAD_ROLE': {'item': '王杰', 'offset': 10, 'type': '人物类_实体'}, 'TAIL_ROLE': [{'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}], 'GROUP': '创作', 'TRIG': [{'item': '作词', 'offset': 12}, {'item': '作曲', 'offset': 15}, {'item': '演唱', 'offset': 18}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '王杰', 'offset': 10, 'type': '人物类_实体'}], 'GROUP': '创作者', 'SRC': 'HTG', 'TRIG': [{'item': '作词', 'offset': 12}, {'item': '作曲', 'offset': 15}, {'item': '演唱', 'offset': 18}]}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '歌曲', 'offset': 21, 'type': '作品类_概念'}], 'GROUP': '类型', 'SRC': 'TAIL'}, {'HEAD_ROLE': {'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}, 'TAIL_ROLE': [{'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}], 'GROUP': '收录', 'TRIG': [{'item': '收录', 'offset': 24}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 1}, 'TAIL_ROLE': [{'item': '忘了所有', 'offset': 32, 'type': 
'作品类_实体'}], 'GROUP': '收录于', 'SRC': 'HGT', 'TRIG': [{'item': '收录', 'offset': 24}]}, {'HEAD_ROLE': {'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}, 'TAIL_ROLE': [{'item': '王杰', 'type': '人物类_实体', 'offset': 10}], 'GROUP': '创作者', 'TRIG': [{'item': '专辑', 'offset': 27}], 'SRC': 'REVERSE'}, {'HEAD_ROLE': {'item': '王杰', 'type': '人物类_实体', 'offset': 10}, 'TAIL_ROLE': [{'item': '忘了所有', 'offset': 32, 'type': '作品类_实体'}], 'GROUP': '创作', 'SRC': 'HGT', 'TRIG': [{'item': '专辑', 'offset': 27}]}, {'HEAD_ROLE': {'item': '忘了所有', 'type': '作品类_实体', 'offset': 32}, 'TAIL_ROLE': [{'item': '唱片', 'offset': 44, 'type': '作品类_概念'}], 'GROUP': '类型', 'SRC': 'TAIL'}]]] + ''' + + # 切换为NPTag名词短语标注工具 + nptag = Taskflow("knowledge_mining", model="nptag") + nptag("糖醋排骨") + ''' + [{'text': '糖醋排骨', 'label': '菜品'}] + ''' + + nptag(["糖醋排骨", "红曲霉菌"]) + ''' + [{'text': '糖醋排骨', 'label': '菜品'}, {'text': '红曲霉菌', 'label': '微生物'}] + ''' + + # 输出粗粒度类别标签`category`,即WordTag的词汇标签。 + nptag = Taskflow("knowledge_mining", model="nptag", linking=True) + nptag(["糖醋排骨", "红曲霉菌"]) + ''' + [{'text': '糖醋排骨', 'label': '菜品', 'category': '饮食类_菜品'}, {'text': '红曲霉菌', 'label': '微生物', 'category': '生物类_微生物'}] + ''' + """ + + +@add_docstrings(usage) +class WordTagTask(Task): + """ + This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag` + model will link the more meesage with the entity. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "config.json", + "termtree_schema": "termtree_type.csv", + "termtree_data": "termtree_data", + "tags": "tags.txt", + "spo_config": "spo_config.pkl", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + resource_files_urls = { + "wordtag": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.5/model_state.pdparams", + "c7c9cef72f73ee22c70c26ef11393025", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/config.json", + "b9f307b3fa03ad98c08ecb5249c15dfa", + ], + "termtree_schema": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/termtree_type.csv", + "062cb9ac24f4135bf836e2a2fc5a1209", + ], + "termtree_data": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/termtree_data", + "a0efe723f84cf90540ac727be5b62e59", + ], + "tags": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/tags.txt", + "f33feedd01d478b03bac81be19b48d00", + ], + "spo_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/spo_config.pkl", + "07a0b8d0422198d8c4c0f70e68963275", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/vocab.txt", + "54aa6e2eeb0478c2d18a2343b008590c", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/special_tokens_map.json", + "58104269e4f141a258bdb2ed06aa599f", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/tokenizer_config.json", + "e3f2756e72e24e3bb298303fb9a171f7", + ], + } + } + + def __init__( + self, + model, + task, + tag_path=None, + term_schema_path=None, + term_data_path=None, + user_dict=None, + linking=True, + spo_config_path=None, + with_ie=False, 
+ **kwargs + ): + super().__init__(model=model, task=task, **kwargs) + self._tag_path = tag_path + self._term_schema_path = term_schema_path + self._term_data_path = term_data_path + self._user_dict = user_dict + self._linking = linking + self._spo_config_path = spo_config_path + self._with_ie = with_ie + self._check_task_files() + self._load_task_resources() + self._construct_tokenizer(model) + self._usage = usage + self._summary_num = 2 + self._get_inference_model() + + if self._user_dict: + self._custom = Customization() + self._custom.load_customization(self._user_dict) + else: + self._custom = None + self._num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 + self._batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + self._lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False + self._max_seq_len = self.kwargs["max_seq_len"] if "max_seq_len" in self.kwargs else 512 + self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False + if self._with_ie: + self._ie_extractor = WordTagRelationExtractor.from_pkl(self._spo_config_path) + + @property + def summary_num(self): + """ + Number of model summary token + """ + return self._summary_num + + @property + def linking(self): + """ + Whether to do term linking. + """ + return self._linking + + @staticmethod + def _load_labels(tag_path): + tags_to_idx = {} + all_tags = [] + i = 0 + with open(tag_path, encoding="utf-8") as fp: + for line in fp: + line = line.strip() + tag = line.split("-")[-1] + if tag not in all_tags: + all_tags.append(tag) + tags_to_idx[line] = i + i += 1 + idx_to_tags = dict(zip(*(tags_to_idx.values(), tags_to_idx.keys()))) + return tags_to_idx, idx_to_tags, all_tags + + def _load_task_resources(self): + """ + Load the resource of this task. + """ + if self._tag_path is None: + self._tag_path = os.path.join(self._task_path, "tags.txt") + self._tags_to_index, self._index_to_tags, self._all_tags = self._load_labels(self._tag_path) + + if self._term_schema_path is None: + self._term_schema_path = os.path.join(self._task_path, "termtree_type.csv") + if self._term_data_path is None: + self._term_data_path = os.path.join(self._task_path, "termtree_data") + + if self._linking is True: + self._termtree = TermTree.from_dir(self._term_schema_path, self._term_data_path, self._linking) + + if self._spo_config_path is None: + self._spo_config_path = os.path.join(self._task_path, "spo_config.pkl") + + def _preprocess_text(self, input_texts): + """ + Create the dataset and dataloader for the predict. 
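+        Empty or non-string inputs are dropped, and texts longer than
+        max_seq_len - summary_num - 1 characters are split by _auto_splitter;
+        the per-segment predictions are re-joined later in _postprocess.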
+ """ + max_predict_len = self._max_seq_len - self.summary_num - 1 + filter_input_texts = [] + for input_text in input_texts: + if not (isinstance(input_text, str) and len(input_text) > 0): + continue + filter_input_texts.append(input_text) + input_texts = filter_input_texts + + short_input_texts, self.input_mapping = self._auto_splitter( + input_texts, max_predict_len, split_sentence=self._split_sentence + ) + + def read(inputs): + for text in inputs: + tokenized_output = self._tokenizer( + list(text), return_length=True, is_split_into_words=True, max_length=self._max_seq_len + ) + yield { + "input_ids": tokenized_output["input_ids"], + "token_type_ids": tokenized_output["token_type_ids"], + "seq_len": tokenized_output["seq_len"], + } + + infer_ds = load_dataset(read, inputs=short_input_texts, lazy=self._lazy_load) + + data_collator = DataCollatorForErnieCtm(self._tokenizer, model="wordtag") + + batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) + + infer_data_loader = paddle.io.DataLoader( + dataset=infer_ds, + batch_sampler=batch_sampler, + collate_fn=data_collator, + num_workers=self._num_workers, + return_list=True, + ) + + outputs = {} + outputs["data_loader"] = infer_data_loader + outputs["short_input_texts"] = short_input_texts + return outputs + + def _reset_offset(self, pred_words): + for i in range(0, len(pred_words)): + if i > 0: + pred_words[i]["offset"] = pred_words[i - 1]["offset"] + len(pred_words[i - 1]["item"]) + pred_words[i]["length"] = len(pred_words[i]["item"]) + return pred_words + + def _decode(self, batch_texts, batch_pred_tags): + batch_results = [] + for sent_index in range(len(batch_texts)): + sent = batch_texts[sent_index] + indexes = batch_pred_tags[sent_index][self.summary_num : len(sent) + self.summary_num] + tags = [self._index_to_tags[index] for index in indexes] + if self._custom: + self._custom.parse_customization(sent, tags, prefix=True) + sent_out = [] + tags_out = [] + partial_word = "" + for ind, tag in enumerate(tags): + if partial_word == "": + partial_word = sent[ind] + tags_out.append(tag.split("-")[-1]) + continue + if tag.startswith("B") or tag.startswith("S") or tag.startswith("O"): + sent_out.append(partial_word) + tags_out.append(tag.split("-")[-1]) + partial_word = sent[ind] + continue + partial_word += sent[ind] + + if len(sent_out) < len(tags_out): + sent_out.append(partial_word) + + pred_words = [] + for s, t in zip(sent_out, tags_out): + pred_words.append({"item": s, "offset": 0, "wordtag_label": t}) + + pred_words = self._reset_offset(pred_words) + result = {"text": sent, "items": pred_words} + batch_results.append(result) + return batch_results + + def _term_linking(self, wordtag_res): + for item in wordtag_res["items"]: + flag, _ = self._termtree.find_term(item["item"]) + if flag is False: + continue + if item["wordtag_label"] not in LABEL_TO_SCHEMA: + # Custom label defined by user + if item["wordtag_label"] not in self._all_tags: + target_type_can = [item["wordtag_label"]] + else: + continue + else: + target_type_can = LABEL_TO_SCHEMA[item["wordtag_label"]] + high_priority = False + for target_type_raw in target_type_can: + target_type_ = target_type_raw.split("|") + target_src = None + if len(target_type_) == 2: + target_src = target_type_[1] + target_type = target_type_[0] + flag, term_id = self._termtree.find_term(item["item"], target_type) + if flag is False: + continue + term_id = list(filter(lambda d: self._termtree[d].node_type == "term", term_id)) + if len(term_id) == 0: + 
continue + if target_src is not None: + term_id = list(filter(lambda d: self._termtree[d].base.startswith(target_src.lower()), term_id)) + if len(term_id) == 0: + continue + + term_id.sort( + key=lambda d: ( + self._termtree[d].termtype == target_type or target_type in self._termtree[d].subtype, + self._termtree[d].term == item["item"], + ), + reverse=True, + ) + if self._termtree[term_id[0]].term == item["item"]: + high_priority = True + item["termid"] = term_id[0] + if high_priority: + break + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), # token_type_ids + paddle.static.InputSpec(shape=[None], dtype="int64", name="seq_len"), # seq_len + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + + model_config = ErnieCtmConfig.from_pretrained(self._task_path, num_labels=len(self._tags_to_index)) + model_instance = ErnieCtmWordtagModel.from_pretrained(self._task_path, config=model_config) + + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + tokenizer_instance = ErnieCtmTokenizer.from_pretrained(self._task_path) + self._tokenizer = tokenizer_instance + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + outputs = self._preprocess_text(inputs) + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + all_pred_tags = [] + for batch in inputs["data_loader"]: + input_ids, token_type_ids, seq_len = batch + self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) + self.input_handles[2].copy_from_cpu(seq_len.numpy()) + self.predictor.run() + pred_tags = self.output_handle[0].copy_to_cpu() + all_pred_tags.extend(pred_tags.tolist()) + inputs["all_pred_tags"] = all_pred_tags + return inputs + + def _postprocess(self, inputs): + """ + The model output is the tag ids, this function will convert the model output to raw text. + """ + results = self._decode(inputs["short_input_texts"], inputs["all_pred_tags"]) + results = self._auto_joiner(results, self.input_mapping, is_dict=True) + for result in results: + pred_words = result["items"] + pred_words = self._reset_offset(pred_words) + result["items"] = pred_words + if self.linking is True: + for res in results: + self._term_linking(res) + if self._with_ie: + ie_results = [] + for result in results: + spo_result = self._ie_extractor.extract_spo(result["items"]) + ie_results.append(spo_result) + return [results, ie_results] + return results + + def set_schema(self, schema): + """User define the schema for the information extraction. + Args: + schema (List[ Dict[str, Any]]): Dictionary data contain all k-v data. + """ + self._ie_extractor = WordTagRelationExtractor.from_dict(schema) + + +@add_docstrings(usage) +class NPTagTask(Task): + """ + Noun phrase tagging task that convert the noun phrase to POS tag. + Args: + task(string): The name of task. 
+ model(string): The model name in the task. + batch_size(int): Numbers of examples a batch. + linking(bool): Returns the categories. If `linking` is True, the fine-grained label (label) will link with the coarse-grained label (category). + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "config.json", + "name_category_map": "name_category_map.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + resource_files_urls = { + "nptag": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag_v1.2/model_state.pdparams", + "34923c4d06acf936f52e1fa376b13748", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/config.json", + "895f0eba0819da56db709d00109c984e", + ], + "name_category_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/name_category_map.json", + "c60810205993d307d919a26a3b96786f", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/vocab.txt", + "54aa6e2eeb0478c2d18a2343b008590c", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/special_tokens_map.json", + "58104269e4f141a258bdb2ed06aa599f", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/nptag/tokenizer_config.json", + "e3f2756e72e24e3bb298303fb9a171f7", + ], + } + } + + def __init__(self, task, model, batch_size=1, max_seq_len=64, linking=False, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._usage = usage + self._batch_size = batch_size + self._max_seq_len = max_seq_len + self._linking = linking + self._check_task_files() + self._construct_tokenizer(model) + self._name_dict = None + self._summary_num = 2 + self._max_cls_len = 5 + self._lazy_load = kwargs.get("lazy_load", False) + self._num_workers = kwargs.get("num_workers", 0) + self._construct_dict_map() + + self._get_inference_model() + # Disable IR optimization for NPTag + self._config.switch_ir_optim(False) + + @property + def summary_num(self): + """ + Number of model summary token + """ + return self._summary_num + + def _construct_dict_map(self): + """ + Construct dict map for the predictor. 
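+        Builds a BK-tree over the fine-grained label names (used for fuzzy fallback
+        matching) and a character-level label vocabulary whose token ids are used to
+        score the [MASK] positions in _run_model.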
+ """ + name_dict_path = os.path.join(self._task_path, "name_category_map.json") + with open(name_dict_path, encoding="utf-8") as fp: + self._name_dict = json.load(fp) + self._tree = BurkhardKellerTree() + self._cls_vocabs = OrderedDict() + for k in self._name_dict: + self._tree.add(k) + for c in k: + if c not in self._cls_vocabs: + self._cls_vocabs[c] = len(self._cls_vocabs) + self._cls_vocabs["[PAD]"] = len(self._cls_vocabs) + self._id_vocabs = dict(zip(self._cls_vocabs.values(), self._cls_vocabs.keys())) + self._vocab_ids = self._tokenizer.vocab.to_indices(list(self._cls_vocabs.keys())) + + def _decode(self, pred_ids): + tokens = [self._id_vocabs[i] for i in pred_ids] + valid_token = [] + for token in tokens: + if token == "[PAD]": + break + valid_token.append(token) + return "".join(valid_token) + + def _search(self, scores_can, pred_ids_can, depth, path, score): + if depth >= 5: + return [(path, score)] + res = [] + for i in range(len(pred_ids_can[0])): + tmp_res = self._search( + scores_can, pred_ids_can, depth + 1, path + [pred_ids_can[depth][i]], score + scores_can[depth][i] + ) + res.extend(tmp_res) + return res + + def _find_topk(self, a, k, axis=-1, largest=True, sorted=True): + if axis is None: + axis_size = a.size + else: + axis_size = a.shape[axis] + assert 1 <= k <= axis_size + + a = np.asanyarray(a) + if largest: + index_array = np.argpartition(a, axis_size - k, axis=axis) + topk_indices = np.take(index_array, -np.arange(k) - 1, axis=axis) + else: + index_array = np.argpartition(a, k - 1, axis=axis) + topk_indices = np.take(index_array, np.arange(k), axis=axis) + topk_values = np.take_along_axis(a, topk_indices, axis=axis) + if sorted: + sorted_indices_in_topk = np.argsort(topk_values, axis=axis) + if largest: + sorted_indices_in_topk = np.flip(sorted_indices_in_topk, axis=axis) + sorted_topk_values = np.take_along_axis(topk_values, sorted_indices_in_topk, axis=axis) + sorted_topk_indices = np.take_along_axis(topk_indices, sorted_indices_in_topk, axis=axis) + return sorted_topk_values, sorted_topk_indices + return topk_values, topk_indices + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), # input_ids + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), # token_type_ids + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = ErnieCtmNptagModel.from_pretrained(self._task_path) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + tokenizer_instance = ErnieCtmTokenizer.from_pretrained(self._task_path) + self._tokenizer = tokenizer_instance + + def _preprocess(self, inputs): + """ + Create the dataset and dataloader for the predict. 
+ """ + inputs = self._check_input_text(inputs) + self._max_cls_len = 5 + + # Prompt template: input_text + "是" + "[MASK]" * cls_seq_length + prompt_template = ["是"] + ["[MASK]"] * self._max_cls_len + + def read(inputs): + for text in inputs: + if len(text) + self._max_cls_len + 1 + self._summary_num + 1 > self._max_seq_len: + text = text[: (self._max_seq_len - (self._max_cls_len + 1 + self._summary_num + 1))] + + tokens = list(text) + prompt_template + tokenized_output = self._tokenizer( + tokens, return_length=True, is_split_into_words=True, max_length=self._max_seq_len + ) + label_indices = list( + range(tokenized_output["seq_len"] - 1 - self._max_cls_len, tokenized_output["seq_len"] - 1) + ) + + yield { + "input_ids": tokenized_output["input_ids"], + "token_type_ids": tokenized_output["token_type_ids"], + "label_indices": label_indices, + } + + infer_ds = load_dataset(read, inputs=inputs, lazy=self._lazy_load) + + data_collator = DataCollatorForErnieCtm(self._tokenizer, model="nptag") + + batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) + + infer_data_loader = paddle.io.DataLoader( + dataset=infer_ds, + batch_sampler=batch_sampler, + collate_fn=data_collator, + num_workers=self._num_workers, + return_list=True, + ) + + outputs = {} + outputs["data_loader"] = infer_data_loader + outputs["texts"] = inputs + return outputs + + def _run_model(self, inputs): + all_scores_can = [] + all_preds_can = [] + pred_ids = [] + for batch in inputs["data_loader"]: + input_ids, token_type_ids, label_indices = batch + self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) + self.predictor.run() + logits = self.output_handle[0].copy_to_cpu() + for i, l in zip(label_indices, logits): + score = l[i[0] : i[-1] + 1, self._vocab_ids] + # Find topk candidates of scores and predicted indices. 
+ score_can, pred_id_can = self._find_topk(score, k=4, axis=-1) + + all_scores_can.extend([score_can.tolist()]) + all_preds_can.extend([pred_id_can.tolist()]) + pred_ids.extend([pred_id_can[:, 0].tolist()]) + inputs["all_scores_can"] = all_scores_can + inputs["all_preds_can"] = all_preds_can + inputs["pred_ids"] = pred_ids + return inputs + + def _postprocess(self, inputs): + results = [] + + for i in range(len(inputs["texts"])): + cls_label = self._decode(inputs["pred_ids"][i]) + result = { + "text": inputs["texts"][i], + "label": cls_label, + } + if cls_label not in self._name_dict: + scores_can = inputs["all_scores_can"][i] + pred_ids_can = inputs["all_preds_can"][i] + labels_can = self._search(scores_can, pred_ids_can, 0, [], 0) + labels_can.sort(key=lambda d: -d[1]) + for labels in labels_can: + cls_label_can = self._decode(labels[0]) + if cls_label_can in self._name_dict: + result["label"] = cls_label_can + break + else: + labels_can = self._tree.search_similar_word(cls_label) + if len(labels_can) != 0: + result["label"] = labels_can[0][0] + break + if self._linking: + if result["label"] in self._name_dict: + result["category"] = self._name_dict[result["label"]] + results.append(result) + return results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/lexical_analysis.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/lexical_analysis.py new file mode 100644 index 000000000..92786e13d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/lexical_analysis.py @@ -0,0 +1,265 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
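+
+# LAC-style lexical analysis: a BiGRU-CRF tagger (BiGruCrf) that jointly segments
+# Chinese text and assigns coarse POS/NER tags; exposed below through LacTask.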
+ +import os + +import paddle + +from ..data import Pad, Stack, Tuple +from ..datasets import load_dataset +from .models import BiGruCrf +from .task import Task +from .utils import Customization + +usage = r""" + from paddlenlp import Taskflow + + lac = Taskflow("lexical_analysis") + lac("LAC是个优秀的分词工具") + ''' + [{'text': 'LAC是个优秀的分词工具', 'segs': ['LAC', '是', '个', '优秀', '的', '分词', '工具'], 'tags': ['nz', 'v', 'q', 'a', 'u', 'n', 'n']}] + ''' + + lac(["LAC是个优秀的分词工具", "三亚是一个美丽的城市"]) + ''' + [{'text': 'LAC是个优秀的分词工具', 'segs': ['LAC', '是', '个', '优秀', '的', '分词', '工具'], 'tags': ['nz', 'v', 'q', 'a', 'u', 'n', 'n']}, + {'text': '三亚是一个美丽的城市', 'segs': ['三亚', '是', '一个', '美丽', '的', '城市'], 'tags': ['LOC', 'v', 'm', 'a', 'u', 'n']} + ] + ''' + + """ + + +def load_vocab(dict_path): + """ + Load vocab from file + """ + vocab = {} + reverse = None + with open(dict_path, "r", encoding="utf8") as fin: + for i, line in enumerate(fin): + terms = line.strip("\n").split("\t") + if len(terms) == 2: + if reverse is None: + reverse = True if terms[0].isdigit() else False + if reverse: + value, key = terms + else: + key, value = terms + elif len(terms) == 1: + key, value = terms[0], str(i) + else: + raise ValueError("Error line: %s in file: %s" % (line, dict_path)) + vocab[key] = value + return vocab + + +class LacTask(Task): + """ + Lexical analysis of Chinese task to segement the chinese sentence. + Args: + task(string): The name of task. + model(string): The model name in the task. + user_dict(string): The user-defined dictionary, default to None. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "tags": "tag.dic", + "q2b": "q2b.dic", + "word": "word.dic", + } + resource_files_urls = { + "lac": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/lexical_analysis/lac/model_state.pdparams", + "3d4008c6c9d29424465829c9acf909bd", + ], + "tags": [ + "https://bj.bcebos.com/paddlenlp/taskflow/lexical_analysis/lac/tag.dic", + "b11b616926b9f7f0a40a8087f84a8a99", + ], + "q2b": [ + "https://bj.bcebos.com/paddlenlp/taskflow/lexical_analysis/lac/q2b.dic", + "4ef2cd16f8002fe7cd7dd31cdff47e0d", + ], + "word": [ + "https://bj.bcebos.com/paddlenlp/taskflow/lexical_analysis/lac/word.dic", + "f1dfc68139bb6dd58c9c4313c341e436", + ], + } + } + + def __init__(self, task, model, user_dict=None, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._usage = usage + self._user_dict = user_dict + self._check_task_files() + self._construct_vocabs() + self._get_inference_model() + self._max_seq_len = 512 + if self._user_dict: + self._custom = Customization() + self._custom.load_customization(self._user_dict) + else: + self._custom = None + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. 
+ """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_ids"), + paddle.static.InputSpec(shape=[None], dtype="int64", name="length"), + ] + + def _construct_vocabs(self): + word_dict_path = os.path.join(self._task_path, "word.dic") + tag_dict_path = os.path.join(self._task_path, "tag.dic") + q2b_dict_path = os.path.join(self._task_path, "q2b.dic") + self._word_vocab = load_vocab(word_dict_path) + self._tag_vocab = load_vocab(tag_dict_path) + self._q2b_vocab = load_vocab(q2b_dict_path) + self._id2word_dict = dict(zip(self._word_vocab.values(), self._word_vocab.keys())) + self._id2tag_dict = dict(zip(self._tag_vocab.values(), self._tag_vocab.keys())) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = BiGruCrf( + self.kwargs["emb_dim"], self.kwargs["hidden_size"], len(self._word_vocab), len(self._tag_vocab) + ) + # Load the model parameter for the predict + state_dict = paddle.load(os.path.join(self._task_path, "model_state.pdparams")) + model_instance.set_dict(state_dict) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + return None + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 + self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False + oov_token_id = self._word_vocab.get("OOV") + + filter_inputs = [] + for input in inputs: + if not (isinstance(input, str) and len(input.strip()) > 0): + continue + filter_inputs.append(input) + + short_input_texts, self.input_mapping = self._auto_splitter( + filter_inputs, self._max_seq_len, split_sentence=self._split_sentence + ) + + def read(inputs): + for input_tokens in inputs: + ids = [] + for token in input_tokens: + token = self._q2b_vocab.get(token, token) + token_id = self._word_vocab.get(token, oov_token_id) + ids.append(token_id) + lens = len(ids) + yield ids, lens + + infer_ds = load_dataset(read, inputs=short_input_texts, lazy=False) + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=0, dtype="int64"), # input_ids + Stack(dtype="int64"), # seq_len + ): fn(samples) + infer_data_loader = paddle.io.DataLoader( + infer_ds, + collate_fn=batchify_fn, + num_workers=num_workers, + batch_size=batch_size, + shuffle=False, + return_list=True, + ) + outputs = {} + outputs["text"] = short_input_texts + outputs["data_loader"] = infer_data_loader + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. 
+ """ + results = [] + lens = [] + for batch in inputs["data_loader"]: + input_ids, seq_len = batch + self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(seq_len.numpy()) + self.predictor.run() + tags_ids = self.output_handle[0].copy_to_cpu() + results.extend(tags_ids.tolist()) + lens.extend(seq_len.tolist()) + + inputs["result"] = results + inputs["lens"] = lens + return inputs + + def _postprocess(self, inputs): + """ + The model output is the tag ids, this function will convert the model output to raw text. + """ + lengths = inputs["lens"] + preds = inputs["result"] + sents = inputs["text"] + final_results = [] + for sent_index in range(len(lengths)): + single_result = {} + tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]] + sent = sents[sent_index] + if self._custom: + self._custom.parse_customization(sent, tags) + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split("-")[0]) + continue + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split("-")[0]) + parital_word = sent[ind] + continue + parital_word += sent[ind] + + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + + single_result["text"] = sent + single_result["segs"] = sent_out + single_result["tags"] = tags_out + final_results.append(single_result) + final_results = self._auto_joiner(final_results, self.input_mapping, is_dict=True) + return final_results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/__init__.py new file mode 100644 index 000000000..59d4fe2d3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .sentiment_analysis_model import BoWModel, LSTMModel, SkepSequenceModel +from .lexical_analysis_model import BiGruCrf +from .dependency_parsing_model import BiAffineParser +from .text_correction_model import ErnieForCSC diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/dependency_parsing_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/dependency_parsing_model.py new file mode 100644 index 000000000..5af9925cb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/dependency_parsing_model.py @@ -0,0 +1,229 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +from paddlenlp.transformers import AutoModel + + +class BiAffineParser(nn.Layer): + """DDParser""" + + def __init__(self, encoding_model, n_rels, n_words, pad_index, bos_index, eos_index, n_mlp_arc=500, n_mlp_rel=100): + super(BiAffineParser, self).__init__() + self.pad_index = pad_index + self.bos_index = bos_index + self.eos_index = eos_index + + if encoding_model == "lstm-pe": + self.embed = LSTMByWPEncoder(n_words, pad_index) + else: # encoding_model is "ernie-3.0-medium-zh", "ernie-1.0" or other models: + pretrained_model = AutoModel.from_pretrained(encoding_model) + self.embed = ErnieEncoder(pad_index, pretrained_model) + + # MLP layer + self.mlp_arc_h = MLP(n_in=self.embed.mlp_input_size, n_out=n_mlp_arc) + self.mlp_arc_d = MLP(n_in=self.embed.mlp_input_size, n_out=n_mlp_arc) + self.mlp_rel_h = MLP(n_in=self.embed.mlp_input_size, n_out=n_mlp_rel) + self.mlp_rel_d = MLP(n_in=self.embed.mlp_input_size, n_out=n_mlp_rel) + + # Biaffine layer + self.arc_attn = BiAffine(n_in=n_mlp_arc, bias_x=True, bias_y=False) + self.rel_attn = BiAffine(n_in=n_mlp_rel, n_out=n_rels, bias_x=True, bias_y=True) + + def forward(self, words, wp): + + words, x = self.embed(words, wp) + mask = paddle.logical_and(words != self.pad_index, words != self.eos_index) + + arc_h = self.mlp_arc_h(x) + arc_d = self.mlp_arc_d(x) + rel_h = self.mlp_rel_h(x) + rel_d = self.mlp_rel_d(x) + + # Get arc and rel scores from the bilinear attention + # Shape: (batch_size, seq_len, seq_len) + s_arc = self.arc_attn(arc_d, arc_h) + # Shape: (batch_size, seq_len, seq_len, n_rels) + s_rel = paddle.transpose(self.rel_attn(rel_d, rel_h), perm=[0, 2, 3, 1]) + # Set the scores that exceed the length of each sentence to -1e5 + s_arc_mask = paddle.unsqueeze(mask, 1) + s_arc = s_arc * s_arc_mask + paddle.scale( + paddle.cast(s_arc_mask, "int32"), scale=1e5, bias=-1, bias_after_scale=False + ) + + mask = paddle.cast( + paddle.logical_and( + paddle.logical_and(words != self.pad_index, words != self.bos_index), + words != self.eos_index, + ), + "int32", + ) + arc_preds = paddle.argmax(s_arc, axis=-1) + rel_preds = paddle.argmax(s_rel, axis=-1) + return arc_preds, rel_preds, s_arc, mask + + +class MLP(nn.Layer): + """MLP""" + + def __init__(self, n_in, n_out): + super(MLP, self).__init__() + + self.linear = nn.Linear( + n_in, + n_out, + weight_attr=nn.initializer.XavierNormal(), + ) + self.leaky_relu = nn.LeakyReLU(negative_slope=0.1) + + def forward(self, x): + # Shape: (batch_size, output_size) + x = self.linear(x) + x = self.leaky_relu(x) + return x + + +class BiAffine(nn.Layer): + """BiAffine""" + + def __init__(self, n_in, n_out=1, bias_x=True, bias_y=True): + super(BiAffine, self).__init__() + + self.n_in = n_in + self.n_out = n_out + self.bias_x = bias_x + self.bias_y = bias_y + self.weight = self.create_parameter(shape=[n_out, n_in + bias_x, n_in + bias_y], dtype="float32") + + def forward(self, x, y): + if self.bias_x: + x = paddle.concat([x, paddle.ones_like(x[:, :, :1])], axis=-1) + if self.bias_y: + y = paddle.concat([y, paddle.ones_like(x[:, :, :1])], axis=-1) + # Shape x: 
(batch_size, num_tokens, input_size + bias_x) + b = x.shape[0] + o = self.weight.shape[0] + # Shape x: (batch_size, output_size, num_tokens, input_size + bias_x) + x = paddle.expand(paddle.unsqueeze(x, axis=1), shape=(x.shape[0], o, x.shape[1], x.shape[2])) + # Shape y: (batch_size, output_size, num_tokens, input_size + bias_y) + y = paddle.expand(paddle.unsqueeze(y, axis=1), shape=(y.shape[0], o, y.shape[1], y.shape[2])) + # Shape weight: (batch_size, output_size, input_size + bias_x, input_size + bias_y) + weight = paddle.expand( + paddle.unsqueeze(self.weight, axis=0), + shape=(b, self.weight.shape[0], self.weight.shape[1], self.weight.shape[2]), + ) + + # Shape: (batch_size, output_size, num_tokens, num_tokens) + s = paddle.matmul(paddle.matmul(x, weight), paddle.transpose(y, perm=[0, 1, 3, 2])) + # Remove dim 1 if n_out == 1 + if s.shape[1] == 1: + s = paddle.squeeze(s, axis=1) + return s + + +class ErnieEncoder(nn.Layer): + def __init__(self, pad_index, pretrained_model): + super(ErnieEncoder, self).__init__() + self.pad_index = pad_index + self.ptm = pretrained_model + self.mlp_input_size = self.ptm.config["hidden_size"] + + def forward(self, words, wp): + x, _ = self.ptm(words) + x = paddle.reshape( + index_sample(x, wp), + shape=[wp.shape[0], wp.shape[1], x.shape[2]], + ) + words = index_sample(words, wp) + return words, x + + +class LSTMByWPEncoder(nn.Layer): + def __init__(self, n_words, pad_index, lstm_by_wp_embed_size=200, n_embed=300, n_lstm_hidden=300, n_lstm_layers=3): + super(LSTMByWPEncoder, self).__init__() + self.pad_index = pad_index + self.word_embed = nn.Embedding(n_words, lstm_by_wp_embed_size) + + self.lstm = nn.LSTM( + input_size=lstm_by_wp_embed_size, + hidden_size=n_lstm_hidden, + num_layers=n_lstm_layers, + direction="bidirectional", + ) + + self.mlp_input_size = n_lstm_hidden * 2 + + def forward(self, words, wp): + + word_embed = self.word_embed(words) + mask = words != self.pad_index + seq_lens = paddle.sum(paddle.cast(mask, "int32"), axis=-1) + + x, _ = self.lstm(word_embed, sequence_length=seq_lens) + x = paddle.reshape( + index_sample(x, wp), + shape=[wp.shape[0], wp.shape[1], x.shape[2]], + ) + words = paddle.index_sample(words, wp) + return words, x + + +def index_sample(x, index): + """Select input value according to index + + Arags: + input: input matrix + index: index matrix + Returns: + output + >>> input + [ + [1, 2, 3], + [4, 5, 6] + ] + >>> index + [ + [1, 2], + [0, 1] + ] + >>> index_sample(input, index) + [ + [2, 3], + [4, 5] + ] + """ + x_s = x.shape + dim = len(index.shape) - 1 + assert x_s[:dim] == index.shape[:dim] + if len(x_s) == 3 and dim == 1: + r_x = paddle.reshape(x, shape=[-1, x_s[1], x_s[-1]]) + else: + r_x = paddle.reshape(x, shape=[-1, x_s[-1]]) + index = paddle.reshape(index, shape=[len(r_x), -1, 1]) + # Generate arange index, shape like index + arr_index = paddle.arange(start=0, end=len(index), dtype=index.dtype) + arr_index = paddle.unsqueeze(arr_index, axis=[1, 2]) + arr_index = paddle.expand(arr_index, index.shape) + # Generate new index + new_index = paddle.concat((arr_index, index), -1) + new_index = paddle.reshape(new_index, (-1, 2)) + # Get output + out = paddle.gather_nd(r_x, new_index) + if len(x_s) == 3 and dim == 2: + out = paddle.reshape(out, shape=[x_s[0], x_s[1], -1]) + else: + out = paddle.reshape(out, shape=[x_s[0], -1]) + return out diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/lexical_analysis_model.py 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/lexical_analysis_model.py
new file mode 100644
index 000000000..32f711020
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/lexical_analysis_model.py
@@ -0,0 +1,100 @@
+# coding:utf-8
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+
+from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss
+
+try:
+    from paddle.text import ViterbiDecoder
+except Exception:
+    raise ImportError(
+        "Taskflow requires paddle version >= 2.2.0, but current paddle version is {}".format(
+            paddle.version.full_version
+        )
+    )
+
+
+class BiGruCrf(nn.Layer):
+    """The network for lexical analysis, based on two layers of BiGRU and one layer of CRF. See https://arxiv.org/abs/1807.01882 for more details.
+    Args:
+        word_emb_dim (int): The dimension in which a word is embedded.
+        hidden_size (int): The number of hidden nodes in the GRU layer.
+        vocab_size (int): The size of the word vocabulary.
+        num_labels (int): The number of tag labels.
+        emb_lr (float, optional): The scaling of the learning rate of the embedding layer. Defaults to 2.0.
+        crf_lr (float, optional): The scaling of the learning rate of the crf layer. Defaults to 0.2.
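# A toy usage sketch for the BiGruCrf defined here (random weights, hypothetical
# sizes); the LAC task itself builds it from word.dic / tag.dic and loads
# model_state.pdparams instead.
import paddle

toy_model = BiGruCrf(word_emb_dim=8, hidden_size=16, vocab_size=100, num_labels=5)
token_ids = paddle.to_tensor([[3, 7, 9, 0], [4, 2, 0, 0]], dtype="int64")
lengths = paddle.to_tensor([3, 2], dtype="int64")
tag_ids = toy_model(token_ids, lengths)        # Viterbi-decoded tag ids per token
labels = paddle.to_tensor([[1, 2, 3, 0], [2, 4, 0, 0]], dtype="int64")
loss = toy_model(token_ids, lengths, labels)   # CRF loss when gold labels are given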
+ """ + + def __init__( + self, word_emb_dim, hidden_size, vocab_size, num_labels, emb_lr=2.0, crf_lr=0.2, with_start_stop_tag=True + ): + super(BiGruCrf, self).__init__() + self.word_emb_dim = word_emb_dim + self.vocab_size = vocab_size + self.num_labels = num_labels + self.hidden_size = hidden_size + self.emb_lr = emb_lr + self.crf_lr = crf_lr + self.init_bound = 0.1 + + self.word_embedding = nn.Embedding( + num_embeddings=self.vocab_size, + embedding_dim=self.word_emb_dim, + weight_attr=paddle.ParamAttr( + learning_rate=self.emb_lr, + initializer=nn.initializer.Uniform(low=-self.init_bound, high=self.init_bound), + ), + ) + + self.gru = nn.GRU( + input_size=self.word_emb_dim, + hidden_size=self.hidden_size, + num_layers=2, + direction="bidirectional", + weight_ih_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(low=-self.init_bound, high=self.init_bound), + regularizer=paddle.regularizer.L2Decay(coeff=1e-4), + ), + weight_hh_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(low=-self.init_bound, high=self.init_bound), + regularizer=paddle.regularizer.L2Decay(coeff=1e-4), + ), + ) + + self.fc = nn.Linear( + in_features=self.hidden_size * 2, + out_features=self.num_labels + 2 if with_start_stop_tag else self.num_labels, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(low=-self.init_bound, high=self.init_bound), + regularizer=paddle.regularizer.L2Decay(coeff=1e-4), + ), + ) + + self.crf = LinearChainCrf(self.num_labels, self.crf_lr, with_start_stop_tag) + self.crf_loss = LinearChainCrfLoss(self.crf) + self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, with_start_stop_tag) + + def forward(self, inputs, lengths, labels=None): + word_embed = self.word_embedding(inputs) + bigru_output, _ = self.gru(word_embed, sequence_length=lengths) + emission = self.fc(bigru_output) + if labels is not None: + loss = self.crf_loss(emission, lengths, labels) + return loss + else: + _, prediction = self.viterbi_decoder(emission, lengths) + return prediction diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/sentiment_analysis_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/sentiment_analysis_model.py new file mode 100644 index 000000000..bfffd545d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/sentiment_analysis_model.py @@ -0,0 +1,151 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlenlp.seq2vec.encoder import BoWEncoder, LSTMEncoder +from paddlenlp.transformers import SkepConfig, SkepModel, SkepPretrainedModel + + +class BoWModel(nn.Layer): + """ + This class implements the Bag of Words Classification Network model to classify texts. + At a high level, the model starts by embedding the tokens and running them through + a word embedding. 
Then, we encode these representations with a `BoWEncoder`. + Lastly, we take the output of the encoder to create a final representation, + which is passed through some feed-forward layers to output a logits (`output_layer`). + Args: + vocab_size(int): The vocab size that used to create the embedding. + num_class(int): The num class of the classifier. + emb_dim(int. optional): The size of the embedding, default value is 128. + padding_idx(int, optional): The padding value in the embedding, the padding_idx of embedding value will + not be updated, the default value is 0. + hidden_size(int, optional): The output size of linear that after the bow, default value is 128. + fc_hidden_size(int, optional): The output size of linear that after the first linear, default value is 96. + """ + + def __init__(self, vocab_size, num_classes, emb_dim=128, padding_idx=0, hidden_size=128, fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding(vocab_size, emb_dim, padding_idx=padding_idx) + self.bow_encoder = BoWEncoder(emb_dim) + self.fc1 = nn.Linear(self.bow_encoder.get_output_dim(), hidden_size) + self.fc2 = nn.Linear(hidden_size, fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len=None): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + + # Shape: (batch_size, embedding_dim) + summed = self.bow_encoder(embedded_text) + encoded_text = paddle.tanh(summed) + + # Shape: (batch_size, hidden_size) + fc1_out = paddle.tanh(self.fc1(encoded_text)) + # Shape: (batch_size, fc_hidden_size) + fc2_out = paddle.tanh(self.fc2(fc1_out)) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc2_out) + return logits + + +class LSTMModel(nn.Layer): + """ + This class implements the Bag of Words Classification Network model to classify texts. + At a high level, the model starts by embedding the tokens and running them through + a word embedding. Then, we encode these representations with a `BoWEncoder`. + Lastly, we take the output of the encoder to create a final representation, + which is passed through some feed-forward layers to output a logits (`output_layer`). + Args: + vocab_size(int): The vocab size that used to create the embedding. + num_class(int): The num class of the classifier. + emb_dim(int. optional): The size of the embedding, default value is 128. + padding_idx(int, optional): The padding value in the embedding, the padding_idx of embedding value will + not be updated, the default value is 0. + lstm_hidden_size(int, optional): The output size of the lstm, default value 198. + direction(string, optional): The direction of lstm, default value is `forward`. + lstm_layers(string, optional): The num of lstm layer. + dropout(float, optional): The dropout rate of lstm. + pooling_type(float, optional): The pooling type of lstm. Default value is None, + if `pooling_type` is None, then the LSTMEncoder will return the hidden state of the last time step at last layer as a single vector. 
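# A toy forward pass through the BoWModel above (hypothetical sizes, random
# weights); the sentiment-analysis task would load trained parameters before use.
import paddle
import paddle.nn.functional as F

bow = BoWModel(vocab_size=200, num_classes=2, emb_dim=16, hidden_size=16, fc_hidden_size=8)
token_ids = paddle.to_tensor([[5, 11, 42, 0], [9, 3, 0, 0]], dtype="int64")
logits = bow(token_ids)             # shape [2, 2]: one score per class
probs = F.softmax(logits, axis=-1)  # class probabilities per sentence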
+ """ + + def __init__( + self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + lstm_hidden_size=198, + direction="forward", + lstm_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96, + ): + super().__init__() + self.embedder = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim, padding_idx=padding_idx) + self.lstm_encoder = LSTMEncoder( + emb_dim, + lstm_hidden_size, + num_layers=lstm_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type, + ) + self.fc = nn.Linear(self.lstm_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + # Shape: (batch_size, num_tokens, embedding_dim) + embedded_text = self.embedder(text) + # Shape: (batch_size, num_tokens, num_directions*lstm_hidden_size) + # num_directions = 2 if direction is 'bidirect' + # if not, num_directions = 1 + text_repr = self.lstm_encoder(embedded_text, sequence_length=seq_len) + # Shape: (batch_size, fc_hidden_size) + fc_out = paddle.tanh(self.fc(text_repr)) + # Shape: (batch_size, num_classes) + logits = self.output_layer(fc_out) + probs = F.softmax(logits, axis=1) + idx = paddle.argmax(probs, axis=1).numpy() + return idx, probs + + +class SkepSequenceModel(SkepPretrainedModel): + def __init__(self, config: SkepConfig): + super(SkepSequenceModel, self).__init__(config) + self.skep = SkepModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + outputs = self.skep( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + probs = F.softmax(logits, axis=1) + idx = paddle.argmax(probs, axis=1) + + return idx, probs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/text_correction_model.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/text_correction_model.py new file mode 100644 index 000000000..0a1400627 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/models/text_correction_model.py @@ -0,0 +1,127 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn + + +class ErnieForCSC(nn.Layer): + r""" + ErnieForCSC is a model specified for Chinese Spelling Correction task. + + It integrates phonetic features into language model by leveraging the powerful + pre-training and fine-tuning method. + + See more details on https://aclanthology.org/2021.findings-acl.198.pdf. 
+ Args: + ernie (ErnieModel): + An instance of `paddlenlp.transformers.ErnieModel`. + pinyin_vocab_size (int): + The vocab size of pinyin vocab. + pad_pinyin_id (int, optional): + The pad token id of pinyin vocab. Defaults to 0. + """ + + def __init__(self, ernie, pinyin_vocab_size, pad_pinyin_id=0): + super(ErnieForCSC, self).__init__() + self.ernie = ernie + emb_size = self.ernie.config["hidden_size"] + hidden_size = self.ernie.config["hidden_size"] + vocab_size = self.ernie.config["vocab_size"] + + self.pad_token_id = self.ernie.config["pad_token_id"] + self.pinyin_vocab_size = pinyin_vocab_size + self.pad_pinyin_id = pad_pinyin_id + self.pinyin_embeddings = nn.Embedding(self.pinyin_vocab_size, emb_size, padding_idx=pad_pinyin_id) + self.detection_layer = nn.Linear(hidden_size, 2) + self.correction_layer = nn.Linear(hidden_size, vocab_size) + self.softmax = nn.Softmax() + + def forward(self, input_ids, pinyin_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + pinyin_ids (Tensor): + Indices of pinyin tokens of input sequence in the pinyin vocabulary. They are + numerical representations of tokens that build the pinyin input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate first and second portions of the inputs. + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings is added to token embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + Defaults to `None`. Shape as `(batch_sie, num_tokens)` and dtype as `int32` or `int64`. + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + + + Returns: + det_preds (Tensor): + A Tensor of the detection prediction of each tokens. + Shape as `(batch_size, sequence_length)` and dtype as `int`. + + char_preds (Tensor): + A Tensor of the correction prediction of each tokens. + Shape as `(batch_size, sequence_length)` and dtype as `int`. + + """ + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.detection_layer.weight.dtype) * -1e4, axis=[1, 2] + ) + + embedding_output = self.ernie.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + pinyin_embedding_output = self.pinyin_embeddings(pinyin_ids) + + # Detection module aims to detect whether each Chinese character has spelling error. 
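# The two detection probabilities computed below gate the correction stage: for each
# token, roughly
#     soft_embedding = P(correct) * word_embedding + P(error) * pinyin_embedding,
# so the more likely a character is misspelled, the more its phonetic (pinyin)
# representation drives the correction logits over the vocabulary.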
+ detection_outputs = self.ernie.encoder(embedding_output, attention_mask) + # detection_error_probs shape: [B, T, 2]. It indicates the erroneous probability of each + # word in the sequence from 0 to 1. + detection_error_probs = self.softmax(self.detection_layer(detection_outputs)) + # Correction module aims to correct each potential wrong character to right character. + word_pinyin_embedding_output = ( + detection_error_probs[:, :, 0:1] * embedding_output + + detection_error_probs[:, :, 1:2] * pinyin_embedding_output + ) + + correction_outputs = self.ernie.encoder(word_pinyin_embedding_output, attention_mask) + # correction_logits shape: [B, T, V]. It indicates the correct score of each token in vocab + # according to each word in the sequence. + correction_logits = self.correction_layer(correction_outputs) + + det_preds = detection_error_probs.argmax(axis=-1) + char_preds = correction_logits.argmax(axis=-1) + return det_preds, char_preds diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/multimodal_feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/multimodal_feature_extraction.py new file mode 100644 index 000000000..3e6050081 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/multimodal_feature_extraction.py @@ -0,0 +1,463 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import paddle +from PIL import Image + +from ..transformers import AutoModel, AutoProcessor +from ..utils.log import logger +from .task import Task +from .utils import dygraph_mode_guard, static_mode_guard + +usage = r""" + from paddlenlp import Taskflow + from PIL import Image + # Multi modal feature_extraction with ernie_vil-2.0-base-zh + vision_language = Taskflow("feature_extraction", model='PaddlePaddle/ernie_vil-2.0-base-zh') + image_embeds = vision_language([Image.open("demo/000000039769.jpg")]) + print(image_embeds) + ''' + Tensor(shape=[1, 768], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[-0.59475428, -0.69795364, 0.22144008, 0.88066685, -0.58184201, + -0.73454666, 0.95557910, -0.61410815, 0.23474170, 0.13301648, + 0.86196446, 0.12281934, 0.69097638, 1.47614217, 0.07238606, + ... 
+ ''' + text_embeds = vision_language(["猫的照片","狗的照片"]) + text_features = text_embeds["features"] + print(text_features) + ''' + Tensor(shape=[2, 768], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[ 0.04250504, -0.41429776, 0.26163983, ..., 0.26221892, + 0.34387422, 0.18779707], + ''' + image_features /= image_features.norm(axis=-1, keepdim=True) + text_features /= text_features.norm(axis=-1, keepdim=True) + logits_per_image = 100 * image_features @ text_features.t() + probs = F.softmax(logits_per_image, axis=-1) + print(probs) + ''' + Tensor(shape=[1, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[0.99833173, 0.00166824]]) + ''' + """ + + +class MultimodalFeatureExtractionTask(Task): + """ + Feature extraction task using no model head. This task extracts the hidden states from the base + model, which can be used as features in retrieval and clustering tasks. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "config": "config.json", + "vocab_file": "vocab.txt", + "preprocessor_config": "preprocessor_config.json", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + resource_files_urls = { + "PaddlePaddle/ernie_vil-2.0-base-zh": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/models/community/PaddlePaddle/ernie_vil-2.0-base-zh/model_state.pdparams", + "38d8c8e01f74ba881e87d9a3f669e5ae", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/models/community/PaddlePaddle/ernie_vil-2.0-base-zh/config.json", + "caf929b450d5638e8df2a95c936519e7", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/models/community/PaddlePaddle/ernie_vil-2.0-base-zh/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "preprocessor_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/PaddlePaddle/ernie_vil-2.0-base-zh/preprocessor_config.json", + "9a2e8da9f41896fedb86756b79355ee2", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/models/community/PaddlePaddle/ernie_vil-2.0-base-zh/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/PaddlePaddle/ernie_vil-2.0-base-zh/tokenizer_config.json", + "da5385c23c8f522d33fc3aac829e4375", + ], + }, + "OFA-Sys/chinese-clip-vit-base-patch16": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-base-patch16/model_state.pdparams", + "d594c94833b8cfeffc4f986712b3ef79", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-base-patch16/config.json", + "3611b5c34ad69dcf91e3c1d03b01a93a", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-base-patch16/vocab.txt", + "3b5b76c4aef48ecf8cb3abaafe960f09", + ], + "preprocessor_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-base-patch16/preprocessor_config.json", + "ba1fb66c75b18b3c9580ea5120e01ced", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-base-patch16/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-base-patch16/tokenizer_config.json", + 
"573ba0466e15cdb5bd423ff7010735ce", + ], + }, + "OFA-Sys/chinese-clip-vit-large-patch14": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14/model_state.pdparams", + "5c0dde02d68179a9cc566173e53966c0", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14/config.json", + "a5e35843aa87ab1106e9f60f1e16b96d", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14/vocab.txt", + "3b5b76c4aef48ecf8cb3abaafe960f09", + ], + "preprocessor_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14/preprocessor_config.json", + "ba1fb66c75b18b3c9580ea5120e01ced", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14/tokenizer_config.json", + "573ba0466e15cdb5bd423ff7010735ce", + ], + }, + "OFA-Sys/chinese-clip-vit-large-patch14-336px": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14-336px/model_state.pdparams", + "ee3eb7f9667cfb06338bea5757c5e0d7", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14-336px/config.json", + "cb2794d99bea8c8f45901d177e663e1e", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14-336px/vocab.txt", + "3b5b76c4aef48ecf8cb3abaafe960f09", + ], + "preprocessor_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14-336px/preprocessor_config.json", + "c52a0b3abe9bdd1c3c5a3d56797f4a03", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14-336px/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/OFA-Sys/chinese-clip-vit-large-patch14-336px/tokenizer_config.json", + "573ba0466e15cdb5bd423ff7010735ce", + ], + }, + "__internal_testing__/tiny-random-ernievil2": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/model_state.pdparams", + "771c844e7b75f61123d9606c8c17b1d6", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/config.json", + "ae27a68336ccec6d3ffd14b48a6d1f25", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "preprocessor_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/preprocessor_config.json", + "9a2e8da9f41896fedb86756b79355ee2", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-ernievil2/tokenizer_config.json", + "2333f189cad8dd559de61bbff4d4a789", + ], + }, + } + + def __init__(self, task, model, batch_size=1, is_static_model=True, max_length=128, 
return_tensors="pd", **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._seed = None + self.export_type = "text" + self._batch_size = batch_size + self.return_tensors = return_tensors + if not self.from_hf_hub: + self._check_task_files() + self._max_length = max_length + self._construct_tokenizer() + self.is_static_model = is_static_model + self._config_map = {} + self.predictor_map = {} + self.input_names_map = {} + self.input_handles_map = {} + self.output_handle_map = {} + self._check_predictor_type() + if self.is_static_model: + self._get_inference_model() + else: + self._construct_model(model) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + self._model = AutoModel.from_pretrained(self._task_path) + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. + """ + self._processor = AutoProcessor.from_pretrained(self._task_path) + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + + def _parse_batch(batch_examples): + if isinstance(batch_examples[0], str): + batch_texts = batch_examples + batch_images = None + else: + batch_texts = None + batch_images = batch_examples + if self.is_static_model: + # The input of static model is numpy array + tokenized_inputs = self._processor( + text=batch_texts, + images=batch_images, + return_tensors="np", + padding="max_length", + max_length=self._max_length, + truncation=True, + ) + else: + # The input of dygraph model is padddle.Tensor + tokenized_inputs = self._processor( + text=batch_texts, + images=batch_images, + return_tensors="pd", + padding="max_length", + max_length=self._max_length, + truncation=True, + ) + return tokenized_inputs + + # Separates data into some batches. + one_batch = [] + for example in data: + one_batch.append(example) + if len(one_batch) == batch_size: + yield _parse_batch(one_batch) + one_batch = [] + if one_batch: + yield _parse_batch(one_batch) + + def _check_input_text(self, inputs): + """ + Check whether the input text meet the requirement. + """ + inputs = inputs[0] + if isinstance(inputs, str): + if len(inputs) == 0: + raise ValueError("Invalid inputs, input text should not be empty, please check your input.") + inputs = [inputs] + elif isinstance(inputs, Image.Image): + inputs = [inputs] + elif isinstance(inputs, list): + # and len(inputs[0].strip()) > 0 + if not (isinstance(inputs[0], (str, Image.Image))): + raise TypeError( + "Invalid inputs, input text/image should be list of str/PIL.image, and first element of list should not be empty." + ) + else: + raise TypeError( + "Invalid inputs, input text should be str or list of str, but type of {} found!".format(type(inputs)) + ) + return inputs + + def _preprocess(self, inputs): + """ + Transform the raw inputs to the model inputs, two steps involved: + 1) Transform the raw text/image to token ids/pixel_values. + 2) Generate the other model inputs from the raw text/image and token ids/pixel_values. + """ + inputs = self._check_input_text(inputs) + batches = self._batchify(inputs, self._batch_size) + outputs = {"batches": batches, "inputs": inputs} + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_preprocess` function. 
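# A sketch of the two batch layouts `_parse_batch` above produces, using the same
# AutoProcessor this task constructs; the image path is the one from the usage
# example and the shapes assume the ernie_vil-2.0-base-zh defaults.
from PIL import Image
from paddlenlp.transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh")
text_batch = processor(
    text=["猫的照片", "狗的照片"], return_tensors="np",
    padding="max_length", max_length=128, truncation=True,
)
# text_batch["input_ids"].shape -> (2, 128): routed to the text branch in `_run_model`.
image_batch = processor(images=[Image.open("demo/000000039769.jpg")], return_tensors="np")
# image_batch["pixel_values"].shape -> (1, 3, 224, 224): routed to the image branch.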
+ """ + all_feats = [] + if self.is_static_model: + with static_mode_guard(): + for batch_inputs in inputs["batches"]: + if self._predictor_type == "paddle-inference": + if "input_ids" in batch_inputs: + self.input_handles_map["text"][0].copy_from_cpu(batch_inputs["input_ids"]) + self.predictor_map["text"].run() + text_features = self.output_handle_map["text"][0].copy_to_cpu() + all_feats.append(text_features) + elif "pixel_values" in batch_inputs: + self.input_handles_map["image"][0].copy_from_cpu(batch_inputs["pixel_values"]) + self.predictor_map["image"].run() + image_features = self.output_handle_map["image"][0].copy_to_cpu() + all_feats.append(image_features) + else: + # onnx mode + if "input_ids" in batch_inputs: + input_dict = {} + input_dict["input_ids"] = batch_inputs["input_ids"] + text_features = self.predictor_map["text"].run(None, input_dict)[0].tolist() + all_feats.append(text_features) + elif "pixel_values" in batch_inputs: + input_dict = {} + input_dict["pixel_values"] = batch_inputs["pixel_values"] + image_features = self.predictor_map["image"].run(None, input_dict)[0].tolist() + all_feats.append(image_features) + else: + for batch_inputs in inputs["batches"]: + if "input_ids" in batch_inputs: + text_features = self._model.get_text_features(input_ids=batch_inputs["input_ids"]) + all_feats.append(text_features.numpy()) + if "pixel_values" in batch_inputs: + image_features = self._model.get_image_features(pixel_values=batch_inputs["pixel_values"]) + all_feats.append(image_features.numpy()) + inputs.update({"features": all_feats}) + return inputs + + def _postprocess(self, inputs): + inputs["features"] = np.concatenate(inputs["features"], axis=0) + if self.return_tensors == "pd": + inputs["features"] = paddle.to_tensor(inputs["features"]) + return inputs + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_text_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + ] + + self._input_image_spec = [ + paddle.static.InputSpec(shape=[None, 3, 224, 224], dtype="float32", name="pixel_values"), + ] + + def _convert_dygraph_to_static(self): + """ + Convert the dygraph model to static model. + """ + assert ( + self._model is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + self._input_image_spec is not None or self._input_text_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." + logger.info("Converting to the inference model cost a little time.") + + static_model = paddle.jit.to_static(self._model.get_text_features, input_spec=self._input_text_spec) + self.inference_model_path = self.inference_text_model_path + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) + + static_model = paddle.jit.to_static(self._model.get_image_features, input_spec=self._input_image_spec) + self.inference_model_path = self.inference_image_model_path + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) + + def _get_inference_model(self): + """ + Return the inference program, inputs and outputs in static mode. 
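# A hedged sketch of the Paddle Inference plumbing that `_prepare_static_mode`
# (not shown in this patch) is expected to set up for each exported branch; the
# handle-based calls mirror how `_run_model` above drives `self.predictor_map`.
import numpy as np
import paddle.inference as paddle_infer

config = paddle_infer.Config("get_text_features.pdmodel", "get_text_features.pdiparams")
predictor = paddle_infer.create_predictor(config)

input_handle = predictor.get_input_handle(predictor.get_input_names()[0])  # "input_ids"
input_handle.copy_from_cpu(np.zeros((1, 128), dtype="int64"))              # toy batch
predictor.run()
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
text_features = output_handle.copy_to_cpu()  # np.ndarray, e.g. shape (1, 768)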
+ """ + _base_path = os.path.join(self._home_path, "taskflow", self.task, self.model) + self.inference_image_model_path = os.path.join(_base_path, "static", "get_image_features") + self.inference_text_model_path = os.path.join(_base_path, "static", "get_text_features") + if ( + not os.path.exists(self.inference_image_model_path + ".pdiparams") + or self._param_updated + or not os.path.exists(self.inference_text_model_path + ".pdiparams") + ): + with dygraph_mode_guard(): + self._construct_model(self.model) + self._construct_input_spec() + self._convert_dygraph_to_static() + if self._predictor_type == "paddle-inference": + # Get text inference model + self.inference_model_path = self.inference_text_model_path + self._static_model_file = self.inference_model_path + ".pdmodel" + self._static_params_file = self.inference_model_path + ".pdiparams" + self._config = paddle.inference.Config(self._static_model_file, self._static_params_file) + self._prepare_static_mode() + + self.predictor_map["text"] = self.predictor + self.input_names_map["text"] = self.input_names + self.input_handles_map["text"] = self.input_handles + self.output_handle_map["text"] = self.output_handle + self._config_map["text"] = self._config + + # Get image inference model + self.inference_model_path = self.inference_image_model_path + self._static_model_file = self.inference_model_path + ".pdmodel" + self._static_params_file = self.inference_model_path + ".pdiparams" + self._config = paddle.inference.Config(self._static_model_file, self._static_params_file) + self._prepare_static_mode() + + self.predictor_map["image"] = self.predictor + self.input_names_map["image"] = self.input_names + self.input_handles_map["image"] = self.input_handles + self.output_handle_map["image"] = self.output_handle + self._config_map["image"] = self._config + else: + # Get text onnx model + self.export_type = "text" + self.inference_model_path = self.inference_text_model_path + self._static_model_file = self.inference_model_path + ".pdmodel" + self._static_params_file = self.inference_model_path + ".pdiparams" + self._prepare_onnx_mode() + self.predictor_map["text"] = self.predictor + + # Get image onnx model + self.export_type = "image" + self.inference_model_path = self.inference_image_model_path + self._static_model_file = self.inference_model_path + ".pdmodel" + self._static_params_file = self.inference_model_path + ".pdiparams" + self._prepare_onnx_mode() + self.predictor_map["image"] = self.predictor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/named_entity_recognition.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/named_entity_recognition.py new file mode 100644 index 000000000..d590c0e4d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/named_entity_recognition.py @@ -0,0 +1,240 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .knowledge_mining import WordTagTask +from .lexical_analysis import LacTask +from .utils import Customization + +POS_LABEL_WORDTAG = [ + "介词", + "介词_方位介词", + "助词", + "代词", + "连词", + "副词", + "疑问词", + "肯定词", + "否定词", + "数量词", + "叹词", + "拟声词", + "修饰词", + "外语单词", + "英语单词", + "汉语拼音", + "词汇用语", + "w", +] + +POS_LABEL_LAC = ["n", "f", "s", "t", "v", "vd", "vn", "a", "ad", "an", "d", "m", "q", "r", "p", "c", "u", "xc", "w"] + +usage = r""" + from paddlenlp import Taskflow + + # WordTag精确模式 + ner = Taskflow("ner") + ner("《孤女》是2010年九州出版社出版的小说,作者是余兼羽") + ''' + [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), (',', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')] + ''' + + ner(["热梅茶是一道以梅子为主要原料制作的茶饮", "《孤女》是2010年九州出版社出版的小说,作者是余兼羽"]) + ''' + [[('热梅茶', '饮食类_饮品'), ('是', '肯定词'), ('一道', '数量词'), ('以', '介词'), ('梅子', '饮食类'), ('为', '肯定词'), ('主要原料', '物体类'), ('制作', '场景事件'), ('的', '助词'), ('茶饮', '饮食类_饮品')], [('《', 'w'), ('孤女', '作品类_实体'), ('》', 'w'), ('是', '肯定词'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('的', '助词'), ('小说', '作品类_概念'), (',', 'w'), ('作者', '人物类_概念'), ('是', '肯定词'), ('余兼羽', '人物类_实体')]] + ''' + + # 只返回实体/概念词 + ner = Taskflow("ner", entity_only=True) + ner("《孤女》是2010年九州出版社出版的小说,作者是余兼羽") + ''' + [('孤女', '作品类_实体'), ('2010年', '时间类'), ('九州出版社', '组织机构类'), ('出版', '场景事件'), ('小说', '作品类_概念'), ('作者', '人物类_概念'), ('余兼羽', '人物类_实体')] + ''' + + # 使用快速模式,只返回实体词 + ner = Taskflow("ner", mode="fast", entity_only=True) + ner("三亚是一个美丽的城市") + ''' + [('三亚', 'LOC')] + ''' + """ + + +class NERWordTagTask(WordTagTask): + """ + This the NER(Named Entity Recognition) task that convert the raw text to entities. And the task with the `wordtag` + model will link the more meesage with the entity. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
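# A small illustration of what `entity_only=True` keeps: items whose label appears in
# POS_LABEL_WORDTAG above (function words, punctuation, ...) are dropped, leaving only
# entity/concept mentions. The items are taken from the usage example above.
items = [("《", "w"), ("孤女", "作品类_实体"), ("是", "肯定词"), ("2010年", "时间类")]
entities = [(word, label) for word, label in items if label not in POS_LABEL_WORDTAG]
# entities -> [('孤女', '作品类_实体'), ('2010年', '时间类')]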
+ + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "config.json", + "tags": "tags.txt", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + resource_files_urls = { + "wordtag": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.5/model_state.pdparams", + "c7c9cef72f73ee22c70c26ef11393025", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/config.json", + "b9f307b3fa03ad98c08ecb5249c15dfa", + ], + "tags": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag_v1.1/tags.txt", + "f33feedd01d478b03bac81be19b48d00", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/vocab.txt", + "54aa6e2eeb0478c2d18a2343b008590c", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/special_tokens_map.json", + "58104269e4f141a258bdb2ed06aa599f", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/knowledge_mining/wordtag/tokenizer_config.json", + "e3f2756e72e24e3bb298303fb9a171f7", + ], + } + } + + def __init__(self, model, task, entity_only=False, **kwargs): + super().__init__(model="wordtag", task=task, **kwargs) + self.entity_only = entity_only + if self._user_dict: + self._custom = Customization() + self._custom.load_customization(self._user_dict) + else: + self._custom = None + + def _decode(self, batch_texts, batch_pred_tags): + batch_results = [] + for sent_index in range(len(batch_texts)): + sent = batch_texts[sent_index] + indexes = batch_pred_tags[sent_index][self.summary_num : len(sent) + self.summary_num] + tags = [self._index_to_tags[index] for index in indexes] + if self._custom: + self._custom.parse_customization(sent, tags, prefix=True) + sent_out = [] + tags_out = [] + partial_word = "" + for ind, tag in enumerate(tags): + if partial_word == "": + partial_word = sent[ind] + tags_out.append(tag.split("-")[-1]) + continue + if tag.startswith("B") or tag.startswith("S") or tag.startswith("O"): + sent_out.append(partial_word) + tags_out.append(tag.split("-")[-1]) + partial_word = sent[ind] + continue + partial_word += sent[ind] + + if len(sent_out) < len(tags_out): + sent_out.append(partial_word) + + pred_words = [] + for s, t in zip(sent_out, tags_out): + pred_words.append({"item": s, "wordtag_label": t}) + + result = {"text": sent, "items": pred_words} + batch_results.append(result) + return batch_results + + def _simplify_result(self, results): + simple_results = [] + for result in results: + simple_result = [] + if "items" in result: + for item in result["items"]: + if self.entity_only and item["wordtag_label"] in POS_LABEL_WORDTAG: + continue + simple_result.append((item["item"], item["wordtag_label"])) + simple_results.append(simple_result) + simple_results = simple_results[0] if len(simple_results) == 1 else simple_results + return simple_results + + def _postprocess(self, inputs): + """ + The model output is the tag ids, this function will convert the model output to raw text. + """ + results = self._decode(inputs["short_input_texts"], inputs["all_pred_tags"]) + results = self._auto_joiner(results, self.input_mapping, is_dict=True) + results = self._simplify_result(results) + return results + + +class NERLACTask(LacTask): + """ + Part-of-speech tagging task for the raw text. + Args: + task(string): The name of task. 
+ model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, model, task, entity_only=False, **kwargs): + super().__init__(task=task, model="lac", **kwargs) + self.entity_only = entity_only + + def _postprocess(self, inputs): + """ + The model output is the tag ids, this function will convert the model output to raw text. + """ + lengths = inputs["lens"] + preds = inputs["result"] + sents = inputs["text"] + final_results = [] + for sent_index in range(len(lengths)): + tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]] + sent = sents[sent_index] + if self._custom: + self._custom.parse_customization(sent, tags) + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split("-")[0]) + continue + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split("-")[0]) + parital_word = sent[ind] + continue + parital_word += sent[ind] + + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + + result = [] + for s, t in zip(sent_out, tags_out): + if self.entity_only and t in POS_LABEL_LAC: + continue + result.append((s, t)) + final_results.append(result) + final_results = self._auto_joiner(final_results, self.input_mapping) + final_results = final_results if len(final_results) > 1 else final_results[0] + return final_results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/poetry_generation.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/poetry_generation.py new file mode 100644 index 000000000..7678ea232 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/poetry_generation.py @@ -0,0 +1,51 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .text_generation import TextGenerationTask + +usage = r""" + from paddlenlp import Taskflow + + poetry = Taskflow("poetry_generation") + poetry("林密不见人") + ''' + [{'text': '林密不见人', 'answer': ',但闻人语响。'}] + ''' + + poetry(["林密不见人", "举头邀明月"]) + ''' + [{'text': '林密不见人', 'answer': ',但闻人语响。'}, {'text': '举头邀明月', 'answer': ',低头思故乡。'}] + ''' + """ + +URLS = { + "gpt-cpm-large-cn": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_generation/gpt-cpm/gpt-cpm-large-cn_params.tar", + "5aad6f81053cfdbba4797f044fcf66d1", + ], +} + + +class PoetryGenerationTask(TextGenerationTask): + """ + The text generation model to predict the question or chinese poetry. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
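# A simplified worked example of the segment-joining loop used by the LAC-based
# `_postprocess` methods above: characters accumulate into a word until a "-B" tag
# starts a new one, and the part before "-" becomes the word's label ("O" handling
# omitted here for brevity). The toy sentence and tags are illustrative.
sent = "三亚是美丽的城市"
tags = ["LOC-B", "LOC-I", "v-B", "a-B", "a-I", "u-B", "n-B", "n-I"]
segs, labels = [], []
for ch, tag in zip(sent, tags):
    if tag.endswith("-B") or not segs:
        segs.append(ch)
        labels.append(tag.split("-")[0])
    else:
        segs[-1] += ch
# segs -> ['三亚', '是', '美丽', '的', '城市'], labels -> ['LOC', 'v', 'a', 'u', 'n']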
+ """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/pos_tagging.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/pos_tagging.py new file mode 100644 index 000000000..6d7a30911 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/pos_tagging.py @@ -0,0 +1,81 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .lexical_analysis import LacTask + +usage = r""" + from paddlenlp import Taskflow + + pos = Taskflow("pos_tagging") + pos("第十四届全运会在西安举办") + ''' + [('第十四届', 'm'), ('全运会', 'nz'), ('在', 'p'), ('西安', 'LOC'), ('举办', 'v')] + ''' + + pos(["第十四届全运会在西安举办", "三亚是一个美丽的城市"]) + ''' + [[('第十四届', 'm'), ('全运会', 'nz'), ('在', 'p'), ('西安', 'LOC'), ('举办', 'v')], [('三亚', 'LOC'), ('是', 'v'), ('一个', 'm'), ('美丽', 'a'), ('的', 'u'), ('城市', 'n')]] + ''' + """ + + +class POSTaggingTask(LacTask): + """ + Part-of-speech tagging task for the raw text. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + + def _postprocess(self, inputs): + """ + The model output is the tag ids, this function will convert the model output to raw text. + """ + lengths = inputs["lens"] + preds = inputs["result"] + sents = inputs["text"] + final_results = [] + for sent_index in range(len(lengths)): + tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]] + sent = sents[sent_index] + if self._custom: + self._custom.parse_customization(sent, tags) + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split("-")[0]) + continue + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split("-")[0]) + parital_word = sent[ind] + continue + parital_word += sent[ind] + + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + + result = list(zip(sent_out, tags_out)) + final_results.append(result) + final_results = self._auto_joiner(final_results, self.input_mapping) + final_results = final_results if len(final_results) > 1 else final_results[0] + return final_results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_answering.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_answering.py new file mode 100644 index 000000000..c00d1d3e7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_answering.py @@ -0,0 +1,52 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .text_generation import TextGenerationTask + +usage = r""" + from paddlenlp import Taskflow + + qa = Taskflow("question_answering") + qa("中国的国土面积有多大?") + ''' + [{'text': '中国的国土面积有多大?', 'answer': '960万平方公里。'}] + ''' + + qa(["中国国土面积有多大?", "中国的首都在哪里?"]) + ''' + [{'text': '中国国土面积有多大?', 'answer': '960万平方公里。'}, {'text': '中国的首都在哪里?', 'answer': '北京。'}] + ''' + + """ + +URLS = { + "gpt-cpm-large-cn": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_generation/gpt-cpm/gpt-cpm-large-cn_params.tar", + "5aad6f81053cfdbba4797f044fcf66d1", + ], +} + + +class QuestionAnsweringTask(TextGenerationTask): + """ + The text generation model to predict the question or chinese poetry. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_generation.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_generation.py new file mode 100644 index 000000000..c5303a28e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/question_generation.py @@ -0,0 +1,454 @@ +# coding:utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import paddle + +from ..data import Pad +from ..transformers import UNIMOLMHeadModel, UNIMOTokenizer +from .task import Task + +usage = r""" + from paddlenlp import Taskflow + + question_generation = Taskflow("question_generation") + question_generation([{"context": "奇峰黄山千米以上的山峰有77座,整座黄山就是一座花岗岩的峰林,自古有36大峰,36小峰,最高峰莲花峰、最险峰天都峰和观日出的最佳点光明顶构成黄山的三大主峰。", "answer": "莲花峰"}]]) + ''' + ['黄山最高峰是什么'] + ''' + """ + + +class QuestionGenerationTask(Task): + """ + The text summarization model to predict the summary of an input text. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._batch_size = kwargs.get("batch_size", 16) + self._output_scores = kwargs.get("output_scores", False) + self._is_select_from_num_return_sequences = kwargs.get("is_select_from_num_return_sequences", True) + self._construct_tokenizer(model) + self._construct_model(model) + # Hypter-parameter during generating. + self._max_length = kwargs.get("max_length", 50) + self._min_length = kwargs.get("min_length", 3) + self._decode_strategy = kwargs.get("decode_strategy", "beam_search") + self._temperature = kwargs.get("temperature", 1.0) + self._top_k = kwargs.get("top_k", 0) + self._top_p = kwargs.get("top_p", 1.0) + self._num_beams = kwargs.get("num_beams", 6) + self._num_beam_groups = kwargs.get("num_beam_groups", 1) + self._diversity_rate = kwargs.get("diversity_rate", 0.0) + self._length_penalty = kwargs.get("length_penalty", 1.2) + self._num_return_sequences = kwargs.get("num_return_sequences", 1) + self._repetition_penalty = kwargs.get("repetition_penalty", 1) + self._use_faster = kwargs.get("use_faster", False) + self._use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + self._template = kwargs.get("template", 1) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + if self._custom_model: + self._model = UNIMOLMHeadModel.from_pretrained(self._task_path) + else: + self._model = UNIMOLMHeadModel.from_pretrained(model) + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + if self._custom_model: + self._tokenizer = UNIMOTokenizer.from_pretrained(self._task_path) + else: + self._tokenizer = UNIMOTokenizer.from_pretrained(model) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + batches = self._batchify(inputs, self._batch_size) + outputs = {"batches": batches, "text": inputs} + return outputs + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + examples = [self._convert_example(i) for i in data] + # Separates data into some batches. + one_batch = [] + for example in examples: + one_batch.append(example) + if len(one_batch) == batch_size: + yield self._parse_batch(one_batch, self._tokenizer.pad_token_id) + one_batch = [] + if one_batch: + yield self._parse_batch(one_batch, self._tokenizer.pad_token_id) + + def _check_input_text(self, inputs): + inputs = inputs[0] + if isinstance(inputs, str): + if len(inputs) == 0: + raise ValueError("Invalid inputs, input text should not be empty text, please check your input. ") + inputs = [inputs] + elif isinstance(inputs, dict): + if not ("source" in inputs and "title" in inputs) and not ("context" in inputs and "answer" in inputs): + raise TypeError( + "Invalid inputs, source and title are not in the input dictionary, nor are context and answer." + ) + elif isinstance(inputs, list): + if not (isinstance(inputs[0], dict)): + raise TypeError( + "Invalid inputs, input text should be list of dict, but type of List({}) found! 
".format( + type(inputs[0]) + ) + ) + else: + raise TypeError( + "Invalid inputs, input text should be str or list of str, but type of {} found!".format(type(inputs)) + ) + return inputs + + def _convert_example(self, example, max_seq_len=512, return_length=True, template=1): + """ + Convert all examples into necessary features. + """ + if isinstance(example, dict): + target = None + if "source" in example and "title" in example: + source = example["source"] + title = None + if "title" in example.keys(): + title = example["title"] + elif "context" in example and "answer" in example: + source = example["context"] + title = None + if "answer" in example.keys(): + title = example["answer"] + else: + assert False, "Source and title are not in the input dictionary, nor are context and answer." + if "target" in example.keys(): + target = example["target"] + elif isinstance(example, list): + source = example[0] + title = example[1] + + if self._template == 1: + # use template 1 + source = "答案:" + title + self._tokenizer.sep_token + "上下文:" + source + title = None + if target: + target = "问题:" + target + elif self._template == 2: + # use template 2 + source = "答案:" + title + self._tokenizer.sep_token + "上下文:" + source + title = None + if target: + target = "在已知答案的前提下,问题:" + target + elif self._template == 3: + # use template 3 + source = "这是一个问题生成任务,根据提供的答案和上下文,来生成问题。" + title + self._tokenizer.sep_token + "上下文:" + source + title = None + if target: + target = "问题:" + target + + tokenized_example = self._tokenizer.gen_encode( + source, + title=title, + max_seq_len=max_seq_len, + max_title_len=30, + add_start_token_for_decoding=True, + return_position_ids=True, + ) + + if "target" in example and example["target"]: + tokenized_example["target"] = example["target"] + # Use to gather the logits corresponding to the labels during training + return tokenized_example + + def _parse_batch(self, batch_examples, pad_val, pad_right=False): + """ + Batchify a batch of examples. + """ + + def pad_mask(batch_attention_mask): + """Pad attention_mask.""" + batch_size = len(batch_attention_mask) + max_len = max(map(len, batch_attention_mask)) + attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e9 + for i, mask_data in enumerate(attention_mask): + seq_len = len(batch_attention_mask[i]) + if pad_right: + mask_data[:seq_len:, :seq_len] = np.array(batch_attention_mask[i], dtype="float32") + else: + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32") + # In order to ensure the correct broadcasting mechanism, expand one + # dimension to the second dimension (n_head of Transformer). 
+ attention_mask = np.expand_dims(attention_mask, axis=1) + return attention_mask + + pad_func = Pad(pad_val=pad_val, pad_right=pad_right, dtype="int64") + input_ids = pad_func([example["input_ids"] for example in batch_examples]) + token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples]) + position_ids = pad_func([example["position_ids"] for example in batch_examples]) + attention_mask = pad_mask([example["attention_mask"] for example in batch_examples]) + # seq_len = np.asarray([example['seq_len'] for example in batch_examples], + # dtype='int32') + batch_dict = {} + batch_dict["input_ids"] = input_ids + batch_dict["token_type_ids"] = token_type_ids + batch_dict["position_ids"] = position_ids + batch_dict["attention_mask"] = attention_mask + # batch_dict['seq_len'] = seq_len + return batch_dict + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_preprocess` function. + """ + all_ids = [] + all_scores = [] + + for batch in inputs["batches"]: + input_ids = paddle.to_tensor(batch["input_ids"], dtype="int64") + token_type_ids = paddle.to_tensor(batch["token_type_ids"], dtype="int64") + position_ids = paddle.to_tensor(batch["position_ids"], dtype="int64") + attention_mask = paddle.to_tensor(batch["attention_mask"], dtype="float32") + # seq_len = paddle.to_tensor(batch['seq_len'], dtype='int64') + ids, scores = self._model.generate( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + max_length=self._max_length, + min_length=self._min_length, + decode_strategy=self._decode_strategy, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p, + num_beams=self._num_beams, + num_beam_groups=self._num_beam_groups, + diversity_rate=self._diversity_rate, + length_penalty=self._length_penalty, + num_return_sequences=self._num_return_sequences, + repetition_penalty=self._repetition_penalty, + bos_token_id=self._tokenizer.cls_token_id, + eos_token_id=self._tokenizer.mask_token_id, + use_fast=self._use_faster, + use_fp16_decoding=self._use_fp16_decoding, + ) + all_ids.extend(ids) + all_scores.extend(scores) + inputs["ids"] = all_ids + inputs["scores"] = all_scores + return inputs + + def out_run_model(self, input_ids, token_type_ids, position_ids, attention_mask): + """ + Debug used. + """ + all_ids = [] + all_scores = [] + # seq_len = paddle.to_tensor(batch['seq_len'], dtype='int64') + ids, scores = self._model.generate( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + max_length=self._max_length, + min_length=self._min_length, + decode_strategy=self._decode_strategy, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p, + num_beams=self._num_beams, + length_penalty=self._length_penalty, + num_return_sequences=self._num_return_sequences, + bos_token_id=self._tokenizer.cls_token_id, + eos_token_id=self._tokenizer.mask_token_id, + ) + all_ids.extend(ids) + all_scores.extend(scores) + + inputs = {} + inputs["ids"] = all_ids + inputs["scores"] = all_scores + return all_ids, all_scores + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. 
+ """ + ids_list = inputs["ids"] + scores_list = inputs["scores"] + if self._is_select_from_num_return_sequences: + results = self._select_from_num_return_sequences( + ids_list, scores_list, self._max_length, self._num_return_sequences + ) + else: + results = self._return_num_return_sequences( + ids_list, scores_list, self._max_length, self._num_return_sequences + ) + output_tokens = [result[0] for result in results] + output_scores = [math.exp(result[1]) for result in results] + # output_scores = [[math.exp(s) for s in result[1]] if isinstance(result[1], list) else math.exp(result[1]) for result in results] + + if self._output_scores: + return output_tokens, output_scores + return output_tokens + + def _return_num_return_sequences(self, ids, scores, max_dec_len=None, num_return_sequences=1): + """ + Select generated sequence form several return sequences. + """ + results = [] + group = [] + tmp = [] + if scores is not None: + ids = [i.numpy() for i in ids] + scores = [i.numpy() for i in scores] + + if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0: + raise ValueError( + "the length of `ids` is {}, but the `num_return_sequences` is {}".format( + len(ids), num_return_sequences + ) + ) + + for pred, score in zip(ids, scores): + pred_token_ids, pred_tokens = self._post_process_decoded_sequence(pred) + num_token = len(pred_token_ids) + target = "".join(pred_tokens) + target = self._remove_template(target) + # not ending + if max_dec_len is not None and num_token >= max_dec_len: + score -= 1e3 + tmp.append([target, score]) + if len(tmp) == num_return_sequences: + group.append(tmp) + tmp = [] + for preds in group: + preds = sorted(preds, key=lambda x: -x[1]) + for pred in preds: + results.append(pred) + else: + ids = ids.numpy() + for pred in ids: + pred_token_ids, pred_tokens = self._post_process_decoded_sequence(pred) + num_token = len(pred_token_ids) + response = "".join(pred_tokens) + response = self._remove_template(response) + # TODO: Support return scores in FT. + tmp.append([response]) + if len(tmp) == num_return_sequences: + group.append(tmp) + tmp = [] + + for preds in group: + for pred in preds: + results.append(pred) + return results + + def _select_from_num_return_sequences(self, ids, scores, max_dec_len=None, num_return_sequences=1): + """ + Select generated sequence form several return sequences. + """ + results = [] + group = [] + tmp = [] + if scores is not None: + ids = [i.numpy() for i in ids] + scores = [i.numpy() for i in scores] + + if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0: + raise ValueError( + "the length of `ids` is {}, but the `num_return_sequences` is {}".format( + len(ids), num_return_sequences + ) + ) + + for pred, score in zip(ids, scores): + pred_token_ids, pred_tokens = self._post_process_decoded_sequence(pred) + num_token = len(pred_token_ids) + target = "".join(pred_tokens) + target = self._remove_template(target) + # not ending + if max_dec_len is not None and num_token >= max_dec_len: + score -= 1e3 + tmp.append([target, score]) + if len(tmp) == num_return_sequences: + group.append(tmp) + tmp = [] + for preds in group: + preds = sorted(preds, key=lambda x: -x[1]) + results.append(preds[0]) + else: + ids = ids.numpy() + for pred in ids: + pred_token_ids, pred_tokens = self._post_process_decoded_sequence(pred) + num_token = len(pred_token_ids) + response = "".join(pred_tokens) + response = self._remove_template(response) + # TODO: Support return scores in FT. 
+ tmp.append([response]) + if len(tmp) == num_return_sequences: + group.append(tmp) + tmp = [] + + for preds in group: + results.append(preds[0]) + return results + + def _post_process_decoded_sequence(self, token_ids): + """Post-process the decoded sequence. Truncate from the first .""" + eos_pos = len(token_ids) + for i, tok_id in enumerate(token_ids): + if tok_id == self._tokenizer.mask_token_id: + eos_pos = i + break + token_ids = token_ids[:eos_pos] + tokens = self._tokenizer.convert_ids_to_tokens(token_ids) + tokens = self._tokenizer.merge_subword(tokens) + special_tokens = ["[UNK]"] + tokens = [token for token in tokens if token not in special_tokens] + return token_ids, tokens + + def _remove_template(self, instr): + """Remove template prefix of decoded sequence.""" + outstr = instr.strip("问题:") + outstr = instr.strip("在已知答案的前提下,问题:") + return outstr + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/sentiment_analysis.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/sentiment_analysis.py new file mode 100644 index 000000000..1e5aaa7fa --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/sentiment_analysis.py @@ -0,0 +1,881 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os + +import numpy as np +import paddle + +from ..data import JiebaTokenizer, Pad, Stack, Tuple, Vocab +from ..datasets import load_dataset +from ..transformers import UIE, AutoTokenizer, SkepTokenizer +from ..utils.tools import get_bool_ids_greater_than, get_span +from .models import LSTMModel, SkepSequenceModel +from .task import Task +from .utils import SchemaTree, dbc2sbc, get_id_and_prob, static_mode_guard + +usage = r""" + from paddlenlp import Taskflow + + # sentiment analysis with bilstm + senta = Taskflow("sentiment_analysis") + senta("怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片") + ''' + [{'text': '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', 'label': 'negative', 'score': 0.6691398620605469}] + ''' + + senta(["怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片", + "作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间"]) + ''' + [{'text': '怀着十分激动的心情放映,可是看着看着发现,在放映完毕后,出现一集米老鼠的动画片', 'label': 'negative', 'score': 0.6691398620605469}, + {'text': '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间', 'label': 'positive', 'score': 0.9857505559921265} + ] + ''' + + # sentiment analysis with skep + senta = Taskflow("sentiment_analysis", model="skep_ernie_1.0_large_ch") + senta("作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。") + ''' + [{'text': '作为老的四星酒店,房间依然很整洁,相当不错。机场接机服务很好,可以在车上办理入住手续,节省时间。', 'label': 'positive', 'score': 0.984320878982544}] + ''' + + # sentiment analysis with UIE + # aspect, opinion and sentiment extraction + schema = [{'评价维度': ['观点词', '情感倾向[正向,负向,未提及]']}] + ie = Taskflow('information_extraction', schema=schema, model="uie-base") + ie("地址不错,服务一般,设施陈旧") + ''' + [{'评价维度': [{'text': '地址', 'start': 0, 'end': 2, 'probability': 0.9888139270606509, 'relations': {'观点词': [{'text': '不错', 'start': 2, 'end': 4, 'probability': 0.9927847072459528}], '情感倾向[正向,负向]': [{'text': '正向', 'probability': 0.998228967796706}]}}, {'text': '设施', 'start': 10, 'end': 12, 'probability': 0.9588297379365116, 'relations': {'观点词': [{'text': '陈旧', 'start': 12, 'end': 14, 'probability': 0.9286753967902683}], '情感倾向[正向,负向]': [{'text': '负向', 'probability': 0.9949389795770394}]}}, {'text': '服务', 'start': 5, 'end': 7, 'probability': 0.9592857070501211, 'relations': {'观点词': [{'text': '一般', 'start': 7, 'end': 9, 'probability': 0.9949359182521675}], '情感倾向[正向,负向]': [{'text': '负向', 'probability': 0.9952498258302498}]}}]}] + ''' + # opinion and sentiment extraction according to pre-given aspects + schema = [{'评价维度': ['观点词', '情感倾向[正向,负向,未提及]']}] + aspects = ['服务', '价格'] + ie = Taskflow("sentiment_analysis", model="uie-base", schema=schema, aspects=aspects) + ie("蛋糕味道不错,很好吃,店家服务也很好") + ''' + [{'评价维度': [{'text': '服务', 'relations': {'观点词': [{'text': '好', 'start': 17, 'end': 18, 'probability': 0.9998383583299955}], '情感倾向[正向,负向,未提及]': [{'text': '正向', 'probability': 0.9999240650320473}]}}, {'text': '价格', 'relations': {'情感倾向[正向,负向,未提及]': [{'text': '未提及', 'probability': 0.9999845028521719}]}}]}] + ''' + """ + + +class SentaTask(Task): + """ + Sentiment analysis task using RNN or BOW model to predict sentiment opinion on Chinese text. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ """ + + resource_files_names = {"model_state": "model_state.pdparams", "vocab": "vocab.txt"} + resource_files_urls = { + "bilstm": { + "vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/sentiment_analysis/bilstm/vocab.txt", + "df714f0bfd6d749f88064679b4c97fd5", + ], + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/sentiment_analysis/bilstm/model_state.pdparams", + "609fc068aa35339e20f8310b5c20887c", + ], + } + } + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._static_mode = True + self._label_map = {0: "negative", 1: "positive"} + self._check_task_files() + self._construct_tokenizer(model) + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(model) + self._usage = usage + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_ids"), + paddle.static.InputSpec(shape=[None], dtype="int64", name="length"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + vocab_size = self.kwargs["vocab_size"] + pad_token_id = self.kwargs["pad_token_id"] + num_classes = 2 + + # Select the senta network for the inference + model_instance = LSTMModel( + vocab_size, num_classes, direction="bidirect", padding_idx=pad_token_id, pooling_type="max" + ) + model_path = os.path.join(self._task_path, "model_state.pdparams") + + # Load the model parameter for the predict + state_dict = paddle.load(model_path) + model_instance.set_dict(state_dict) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + vocab_path = os.path.join(self._task_path, "vocab.txt") + vocab = Vocab.load_vocabulary(vocab_path, unk_token="[UNK]", pad_token="[PAD]") + + vocab_size = len(vocab) + pad_token_id = vocab.to_indices("[PAD]") + # Construct the tokenizer form the JiebaToeknizer + self.kwargs["pad_token_id"] = pad_token_id + self.kwargs["vocab_size"] = vocab_size + tokenizer = JiebaTokenizer(vocab) + self._tokenizer = tokenizer + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + examples = [] + filter_inputs = [] + for input_data in inputs: + if not (isinstance(input_data, str) and len(input_data) > 0): + continue + filter_inputs.append(input_data) + ids = self._tokenizer.encode(input_data) + lens = len(ids) + examples.append((ids, lens)) + + batches = [examples[idx : idx + batch_size] for idx in range(0, len(examples), batch_size)] + outputs = {} + outputs["data_loader"] = batches + outputs["text"] = filter_inputs + return outputs + + def _batchify_fn(self, samples): + fn = Tuple( + Pad(axis=0, pad_val=self._tokenizer.vocab.token_to_idx.get("[PAD]", 0)), # input_ids + Stack(dtype="int64"), # seq_len + ) + return fn(samples) + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. 
+ """ + results = [] + scores = [] + with static_mode_guard(): + for batch in inputs["data_loader"]: + ids, lens = self._batchify_fn(batch) + self.input_handles[0].copy_from_cpu(ids) + self.input_handles[1].copy_from_cpu(lens) + self.predictor.run() + idx = self.output_handle[0].copy_to_cpu().tolist() + probs = self.output_handle[1].copy_to_cpu().tolist() + labels = [self._label_map[i] for i in idx] + score = [max(prob) for prob in probs] + results.extend(labels) + scores.extend(score) + + inputs["result"] = results + inputs["score"] = scores + return inputs + + def _postprocess(self, inputs): + """ + This function will convert the model output to raw text. + """ + final_results = [] + for text, label, score in zip(inputs["text"], inputs["result"], inputs["score"]): + result = {} + result["text"] = text + result["label"] = label + result["score"] = score + final_results.append(result) + return final_results + + +class SkepTask(Task): + """ + Sentiment analysis task using ERNIE-Gram model to predict sentiment opinion on Chinese text. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + } + resource_files_urls = { + "skep_ernie_1.0_large_ch": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/sentiment_analysis/skep_ernie_1.0_large_ch/model_state.pdparams", + "cf7aa5f5ffa834b329bbcb1dca54e9fc", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/sentiment_analysis/skep_ernie_1.0_large_ch/model_config.json", + "847b84ab08611a2f5a01a22c18b0be23", + ], + }, + "__internal_testing__/tiny-random-skep": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-skep/model_state.pdparams", + "3bedff32b4de186252094499d1c8ede3", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/models/community/__internal_testing__/tiny-random-skep/model_config.json", + "f891e4a927f946c23bc32653f535510b", + ], + }, + } + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._static_mode = True + self._label_map = {0: "negative", 1: "positive"} + if not self._custom_model: + self._check_task_files() + self._construct_tokenizer(self._task_path if self._custom_model else model) + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(self._task_path if self._custom_model else model) + self._usage = usage + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = SkepSequenceModel.from_pretrained(self._task_path, num_labels=len(self._label_map)) + self._model = model_instance + self._model.eval() + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec(shape=[None, None], dtype="int64"), # segment_ids + ] + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + tokenizer = SkepTokenizer.from_pretrained(model) + self._tokenizer = tokenizer + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. 
+ 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + + examples = [] + filter_inputs = [] + for input_data in inputs: + if not (isinstance(input_data, str) and len(input_data.strip()) > 0): + continue + filter_inputs.append(input_data) + encoded_inputs = self._tokenizer(text=input_data, max_seq_len=128) + ids = encoded_inputs["input_ids"] + segment_ids = encoded_inputs["token_type_ids"] + examples.append((ids, segment_ids)) + + batches = [examples[idx : idx + batch_size] for idx in range(0, len(examples), batch_size)] + outputs = {} + outputs["text"] = filter_inputs + outputs["data_loader"] = batches + return outputs + + def _batchify_fn(self, samples): + fn = Tuple( + Pad(axis=0, pad_val=self._tokenizer.pad_token_id), # input ids + Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id), # token type ids + ) + return fn(samples) + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + results = [] + scores = [] + with static_mode_guard(): + for batch in inputs["data_loader"]: + ids, segment_ids = self._batchify_fn(batch) + self.input_handles[0].copy_from_cpu(ids) + self.input_handles[1].copy_from_cpu(segment_ids) + self.predictor.run() + idx = self.output_handle[0].copy_to_cpu().tolist() + probs = self.output_handle[1].copy_to_cpu().tolist() + labels = [self._label_map[i] for i in idx] + score = [max(prob) for prob in probs] + results.extend(labels) + scores.extend(score) + + inputs["result"] = results + inputs["score"] = scores + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. + """ + final_results = [] + for text, label, score in zip(inputs["text"], inputs["result"], inputs["score"]): + result = {} + result["text"] = text + result["label"] = label + result["score"] = score + final_results.append(result) + return final_results + + +class UIESentaTask(Task): + """ + Universal Information Extraction Task. + Args: + task(string): The name of task. + model(string): The model name in the task. + aspects (list[string]): a list of pre-given aspects + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + # vocab.txt/special_tokens_map.json/tokenizer_config.json are common to the default model. 
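+    # Each entry below maps a resource id to a [download URL, md5 checksum]
+    # pair; _check_task_files() downloads and verifies these files on demand.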
+ resource_files_urls = { + "uie-senta-base": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-base/model_state.pdparams", + "88fcf3aa5afee16ddb61b4ecdf53f572", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-base/model_config.json", + "74f033ab874a1acddb3aec9b9c4d9cde", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-base/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-base/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d", + ], + }, + "uie-senta-medium": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-medium/model_state.pdparams", + "afc11ed983a0075f4bb13cf203ccd841", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-medium/model_config.json", + "4c98a7bc547d60ac94e44e17c47a3488", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-medium/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-medium/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-medium/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d", + ], + }, + "uie-senta-mini": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-mini/model_state.pdparams", + "83d5082596cfd95b9548aefc248c7ad1", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-mini/model_config.json", + "9628a5c64a1e6ed8278c0344c8ef874a", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-mini/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-mini/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-mini/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d", + ], + }, + "uie-senta-micro": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-micro/model_state.pdparams", + "047b5549dc182cfca036c3fce1e7f6f7", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-micro/model_config.json", + "058a28845781dbe89a3827bc11355bc8", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-micro/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-micro/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-micro/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d", + ], + }, + "uie-senta-nano": { + "model_state": [ + 
"https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-nano/model_state.pdparams", + "27afd8946f47a2b8618ffae9ac0f5922", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-nano/model_config.json", + "b9f74bdf02f5fb2d208e1535c8a13649", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-nano/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-nano/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/sentiment_analysis/uie-senta-nano/tokenizer_config.json", + "3e623b57084882fd73e17f544bdda47d", + ], + }, + } + + def __init__(self, task, model, schema, aspects=None, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._schema_tree = None + self.set_schema(schema) + self._check_task_files() + self._check_predictor_type() + self._get_inference_model() + self._usage = usage + self._max_seq_len = self.kwargs["max_seq_len"] if "max_seq_len" in self.kwargs else 512 + self._batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 64 + self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False + self._position_prob = self.kwargs["position_prob"] if "position_prob" in self.kwargs else 0.5 + self._lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False + self._num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 + self.use_fast = self.kwargs["use_fast"] if "use_fast" in self.kwargs else False + self._construct_tokenizer() + self.aspects = self._check_aspects(aspects) + + def set_schema(self, schema): + """ + Set schema for UIE Model. + """ + if isinstance(schema, dict) or isinstance(schema, str): + schema = [schema] + self._schema_tree = self._build_tree(schema) + + def _check_aspects(self, aspects): + """ + Check aspects whether to be valid. + """ + if aspects is None: + return aspects + elif not isinstance(aspects, list): + raise TypeError( + "Invalid aspects, input aspects should be list of str, but type of {} found!".format(type(aspects)) + ) + elif not aspects: + raise ValueError("Invalid aspects, input aspects should not be empty, but {} found!".format(aspects)) + else: + for i, aspect in enumerate(aspects): + if not isinstance(aspect, str): + raise TypeError( + "Invalid aspect, the aspect at index {} should be str, but type of {} found!".format( + i, type(aspect) + ) + ) + if not aspect.strip(): + raise ValueError( + "Invalid aspect, the aspect at index {} should not be empty, but {} found!".format(i, aspect) + ) + return aspects + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pos_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="att_mask"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = UIE.from_pretrained(self._task_path) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. 
+ """ + self._tokenizer = AutoTokenizer.from_pretrained(self._task_path) + + def _preprocess(self, inputs): + """ + Read and analyze inputs. + """ + examples = self._check_input_text(inputs) + + outputs = {} + outputs["text"] = examples + return outputs + + def _single_stage_predict(self, inputs): + input_texts = [] + prompts = [] + for i in range(len(inputs)): + input_texts.append(inputs[i]["text"]) + prompts.append(inputs[i]["prompt"]) + # max predict length should exclude the length of prompt and summary tokens + max_predict_len = self._max_seq_len - len(max(prompts)) - 3 + + short_input_texts, self.input_mapping = self._auto_splitter( + input_texts, max_predict_len, split_sentence=self._split_sentence + ) + + short_texts_prompts = [] + for k, v in self.input_mapping.items(): + short_texts_prompts.extend([prompts[k] for i in range(len(v))]) + short_inputs = [ + {"text": short_input_texts[i], "prompt": short_texts_prompts[i]} for i in range(len(short_input_texts)) + ] + + def read(inputs): + for example in inputs: + encoded_inputs = self._tokenizer( + text=[example["prompt"]], + text_pair=[example["text"]], + truncation=True, + max_seq_len=self._max_seq_len, + pad_to_max_seq_len=True, + return_attention_mask=True, + return_position_ids=True, + return_offsets_mapping=True, + ) + tokenized_output = [ + encoded_inputs["input_ids"][0], + encoded_inputs["token_type_ids"][0], + encoded_inputs["position_ids"][0], + encoded_inputs["attention_mask"][0], + encoded_inputs["offset_mapping"][0], + ] + tokenized_output = [np.array(x, dtype="int64") for x in tokenized_output] + yield tuple(tokenized_output) + + infer_ds = load_dataset(read, inputs=short_inputs, lazy=self._lazy_load) + batch_sampler = paddle.io.BatchSampler(dataset=infer_ds, batch_size=self._batch_size, shuffle=False) + + infer_data_loader = paddle.io.DataLoader( + dataset=infer_ds, batch_sampler=batch_sampler, num_workers=self._num_workers, return_list=True + ) + + sentence_ids = [] + probs = [] + for batch in infer_data_loader: + input_ids, token_type_ids, pos_ids, att_mask, offset_maps = batch + if self._predictor_type == "paddle-inference": + self.input_handles[0].copy_from_cpu(input_ids.numpy()) + self.input_handles[1].copy_from_cpu(token_type_ids.numpy()) + self.input_handles[2].copy_from_cpu(pos_ids.numpy()) + self.input_handles[3].copy_from_cpu(att_mask.numpy()) + self.predictor.run() + start_prob = self.output_handle[0].copy_to_cpu().tolist() + end_prob = self.output_handle[1].copy_to_cpu().tolist() + else: + input_dict = { + "input_ids": input_ids.numpy(), + "token_type_ids": token_type_ids.numpy(), + "pos_ids": pos_ids.numpy(), + "att_mask": att_mask.numpy(), + } + start_prob, end_prob = self.predictor.run(None, input_dict) + start_prob = start_prob.tolist() + end_prob = end_prob.tolist() + + start_ids_list = get_bool_ids_greater_than(start_prob, limit=self._position_prob, return_prob=True) + end_ids_list = get_bool_ids_greater_than(end_prob, limit=self._position_prob, return_prob=True) + + for start_ids, end_ids, offset_map in zip(start_ids_list, end_ids_list, offset_maps.tolist()): + span_set = get_span(start_ids, end_ids, with_prob=True) + sentence_id, prob = get_id_and_prob(span_set, offset_map) + sentence_ids.append(sentence_id) + probs.append(prob) + results = self._convert_ids_to_results(short_inputs, sentence_ids, probs) + results = self._auto_joiner(results, short_input_texts, self.input_mapping) + return results + + def _auto_joiner(self, short_results, short_inputs, input_mapping): + concat_results = [] + 
is_cls_task = False + for short_result in short_results: + if short_result == []: + continue + elif "start" not in short_result[0].keys() and "end" not in short_result[0].keys(): + is_cls_task = True + break + else: + break + for k, vs in input_mapping.items(): + if is_cls_task: + cls_options = {} + single_results = [] + for v in vs: + if len(short_results[v]) == 0: + continue + if short_results[v][0]["text"] not in cls_options.keys(): + cls_options[short_results[v][0]["text"]] = [1, short_results[v][0]["probability"]] + else: + cls_options[short_results[v][0]["text"]][0] += 1 + cls_options[short_results[v][0]["text"]][1] += short_results[v][0]["probability"] + if len(cls_options) != 0: + cls_res, cls_info = max(cls_options.items(), key=lambda x: x[1]) + concat_results.append([{"text": cls_res, "probability": cls_info[1] / cls_info[0]}]) + else: + concat_results.append([]) + else: + offset = 0 + single_results = [] + for v in vs: + if v == 0: + single_results = short_results[v] + offset += len(short_inputs[v]) + else: + for i in range(len(short_results[v])): + if "start" not in short_results[v][i] or "end" not in short_results[v][i]: + continue + short_results[v][i]["start"] += offset + short_results[v][i]["end"] += offset + offset += len(short_inputs[v]) + single_results.extend(short_results[v]) + concat_results.append(single_results) + return concat_results + + def _run_model(self, inputs): + raw_inputs = inputs["text"] + results = self._multi_stage_predict(raw_inputs) + inputs["result"] = results + return inputs + + def _multi_stage_predict(self, data): + """ + Traversal the schema tree and do multi-stage prediction. + Args: + data (list): a list of strings + Returns: + list: a list of predictions, where the list's length + equals to the length of `data` + """ + if self.aspects is not None: + # predict with pre-give aspects + results = [] + prefixs = [] + relations = [] + result = {"评价维度": [{"text": aspect} for aspect in self.aspects]} + prefix = [aspect + "的" for aspect in self.aspects] + for i in range(len(data)): + results.append(copy.deepcopy(result)) + prefixs.append(copy.deepcopy(prefix)) + relations.append(results[-1]["评价维度"]) + # copy to stay `self._schema_tree` unchanged + schema_list = self._schema_tree.children[:] + for node in schema_list: + node.prefix = prefixs + node.parent_relations = relations + + else: + results = [{} for _ in range(len(data))] + # input check to early return + if len(data) < 1 or self._schema_tree is None: + return results + # copy to stay `self._schema_tree` unchanged + schema_list = self._schema_tree.children[:] + + while len(schema_list) > 0: + node = schema_list.pop(0) + examples = [] + input_map = {} + cnt = 0 + idx = 0 + if not node.prefix: + for one_data in data: + examples.append({"text": one_data, "prompt": dbc2sbc(node.name)}) + input_map[cnt] = [idx] + idx += 1 + cnt += 1 + else: + for pre, one_data in zip(node.prefix, data): + if len(pre) == 0: + input_map[cnt] = [] + else: + for p in pre: + examples.append({"text": one_data, "prompt": dbc2sbc(p + node.name)}) + input_map[cnt] = [i + idx for i in range(len(pre))] + idx += len(pre) + cnt += 1 + if len(examples) == 0: + result_list = [] + else: + result_list = self._single_stage_predict(examples) + + if not node.parent_relations: + relations = [[] for i in range(len(data))] + for k, v in input_map.items(): + for idx in v: + if len(result_list[idx]) == 0: + continue + if node.name not in results[k].keys(): + results[k][node.name] = result_list[idx] + else: + 
results[k][node.name].extend(result_list[idx]) + if node.name in results[k].keys(): + relations[k].extend(results[k][node.name]) + else: + relations = node.parent_relations + for k, v in input_map.items(): + for i in range(len(v)): + if len(result_list[v[i]]) == 0: + continue + if "relations" not in relations[k][i].keys(): + relations[k][i]["relations"] = {node.name: result_list[v[i]]} + elif node.name not in relations[k][i]["relations"].keys(): + relations[k][i]["relations"][node.name] = result_list[v[i]] + else: + relations[k][i]["relations"][node.name].extend(result_list[v[i]]) + new_relations = [[] for i in range(len(data))] + for i in range(len(relations)): + for j in range(len(relations[i])): + if "relations" in relations[i][j].keys() and node.name in relations[i][j]["relations"].keys(): + for k in range(len(relations[i][j]["relations"][node.name])): + new_relations[i].append(relations[i][j]["relations"][node.name][k]) + relations = new_relations + + prefix = [[] for _ in range(len(data))] + for k, v in input_map.items(): + for idx in v: + for i in range(len(result_list[idx])): + prefix[k].append(result_list[idx][i]["text"] + "的") + + for child in node.children: + child.prefix = prefix + child.parent_relations = relations + schema_list.append(child) + return results + + def _convert_ids_to_results(self, examples, sentence_ids, probs): + """ + Convert ids to raw text in a single stage. + """ + results = [] + for example, sentence_id, prob in zip(examples, sentence_ids, probs): + if len(sentence_id) == 0: + results.append([]) + continue + result_list = [] + text = example["text"] + prompt = example["prompt"] + for i in range(len(sentence_id)): + start, end = sentence_id[i] + if start < 0 and end >= 0: + continue + if end < 0: + start += len(prompt) + 1 + end += len(prompt) + 1 + result = {"text": prompt[start:end], "probability": prob[i]} + result_list.append(result) + else: + result = {"text": text[start:end], "start": start, "end": end, "probability": prob[i]} + result_list.append(result) + results.append(result_list) + return results + + @classmethod + def _build_tree(cls, schema, name="root"): + """ + Build the schema tree. + """ + schema_tree = SchemaTree(name) + for s in schema: + if isinstance(s, str): + schema_tree.add_child(SchemaTree(s)) + elif isinstance(s, dict): + for k, v in s.items(): + if isinstance(v, str): + child = [v] + elif isinstance(v, list): + child = v + else: + raise TypeError( + "Invalid schema, value for each key:value pairs should be list or string" + "but {} received".format(type(v)) + ) + schema_tree.add_child(cls._build_tree(child, name=k)) + else: + raise TypeError("Invalid schema, element should be string or dict, " "but {} received".format(type(s))) + return schema_tree + + def _postprocess(self, inputs): + """ + This function will convert the model output to raw text. + """ + return inputs["result"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/task.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/task.py new file mode 100644 index 000000000..22b178b61 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/task.py @@ -0,0 +1,529 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc +import math +import os +from abc import abstractmethod +from multiprocessing import cpu_count + +import paddle +from paddle.dataset.common import md5file + +from ..utils.env import PPNLP_HOME +from ..utils.log import logger +from .utils import cut_chinese_sent, download_check, download_file, dygraph_mode_guard + + +class Task(metaclass=abc.ABCMeta): + """ + The meta classs of task in Taskflow. The meta class has the five abstract function, + the subclass need to inherit from the meta class. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, model, task, priority_path=None, **kwargs): + self.model = model + self.is_static_model = kwargs.get("is_static_model", False) + self.task = task + self.kwargs = kwargs + self._priority_path = priority_path + self._usage = "" + # The dygraph model instance + self._model = None + # The static model instance + self._input_spec = None + self._config = None + self._init_class = None + self._custom_model = False + self._param_updated = False + + self._num_threads = self.kwargs["num_threads"] if "num_threads" in self.kwargs else math.ceil(cpu_count() / 2) + self._infer_precision = self.kwargs["precision"] if "precision" in self.kwargs else "fp32" + # Default to use Paddle Inference + self._predictor_type = "paddle-inference" + # The root directory for storing Taskflow related files, default to ~/.paddlenlp. + self._home_path = self.kwargs["home_path"] if "home_path" in self.kwargs else PPNLP_HOME + self._task_flag = self.kwargs["task_flag"] if "task_flag" in self.kwargs else self.model + self.from_hf_hub = kwargs.pop("from_hf_hub", False) + # Add mode flag for onnx output path redirection + self.export_type = None + + if "task_path" in self.kwargs: + self._task_path = self.kwargs["task_path"] + self._custom_model = True + elif self._priority_path: + self._task_path = os.path.join(self._home_path, "taskflow", self._priority_path) + else: + self._task_path = os.path.join(self._home_path, "taskflow", self.task, self.model) + if self.is_static_model: + self._static_model_name = self._get_static_model_name() + + if not self.from_hf_hub: + download_check(self._task_flag) + + @abstractmethod + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + + @abstractmethod + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + + @abstractmethod + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + + @abstractmethod + def _run_model(self, inputs, **kwargs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + + @abstractmethod + def _postprocess(self, inputs): + """ + The model output is the logits and pros, this function will convert the model output to raw text. 
+ """ + + @abstractmethod + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + + def _get_static_model_name(self): + names = [] + for file_name in os.listdir(self._task_path): + if ".pdmodel" in file_name: + names.append(file_name[:-8]) + if len(names) == 0: + raise IOError(f"{self._task_path} should include '.pdmodel' file.") + if len(names) > 1: + logger.warning(f"{self._task_path} includes more than one '.pdmodel' file.") + return names[0] + + def _check_task_files(self): + """ + Check files required by the task. + """ + for file_id, file_name in self.resource_files_names.items(): + if self.task in ["information_extraction"]: + dygraph_file = ["model_state.pdparams"] + else: + dygraph_file = ["model_state.pdparams", "config.json"] + if self.is_static_model and file_name in dygraph_file: + continue + path = os.path.join(self._task_path, file_name) + url = self.resource_files_urls[self.model][file_id][0] + md5 = self.resource_files_urls[self.model][file_id][1] + + downloaded = True + if not os.path.exists(path): + downloaded = False + else: + if not self._custom_model: + if os.path.exists(path): + # Check whether the file is updated + if not md5file(path) == md5: + downloaded = False + if file_id == "model_state": + self._param_updated = True + else: + downloaded = False + if not downloaded: + download_file(self._task_path, file_name, url, md5) + + def _check_predictor_type(self): + if paddle.get_device() == "cpu" and self._infer_precision == "fp16": + logger.warning("The inference precision is change to 'fp32', 'fp16' inference only takes effect on gpu.") + elif paddle.get_device().split(":", 1)[0] == "npu": + if self._infer_precision == "fp16": + logger.info("Inference on npu with fp16 precison") + else: + if self._infer_precision == "fp16": + self._predictor_type = "onnxruntime" + + def _construct_ocr_engine(self, lang="ch", use_angle_cls=True): + """ + Construct the OCR engine + """ + try: + from paddleocr import PaddleOCR + except ImportError: + raise ImportError("Please install the dependencies first, pip install paddleocr") + use_gpu = False if paddle.get_device() == "cpu" else True + self._ocr = PaddleOCR(use_angle_cls=use_angle_cls, show_log=False, use_gpu=use_gpu, lang=lang) + + def _construce_layout_analysis_engine(self): + """ + Construct the layout analysis engine + """ + try: + from paddleocr import PPStructure + except ImportError: + raise ImportError("Please install the dependencies first, pip install paddleocr") + self._layout_analysis_engine = PPStructure(table=False, ocr=True, show_log=False) + + def _prepare_static_mode(self): + """ + Construct the input data and predictor in the PaddlePaddele static mode. + """ + if paddle.get_device() == "cpu": + self._config.disable_gpu() + self._config.enable_mkldnn() + if self._infer_precision == "int8": + # EnableMKLDNN() only works when IR optimization is enabled. + self._config.switch_ir_optim(True) + self._config.enable_mkldnn_int8() + logger.info((">>> [InferBackend] INT8 inference on CPU ...")) + elif paddle.get_device().split(":", 1)[0] == "npu": + self._config.disable_gpu() + self._config.enable_custom_device("npu", self.kwargs["device_id"]) + else: + if self._infer_precision == "int8": + logger.info( + ">>> [InferBackend] It is a INT8 model which is not yet supported on gpu, use FP32 to inference here ..." 
+ ) + self._config.enable_use_gpu(100, self.kwargs["device_id"]) + # TODO(linjieccc): enable after fixed + self._config.delete_pass("embedding_eltwise_layernorm_fuse_pass") + self._config.delete_pass("fused_multi_transformer_encoder_pass") + self._config.set_cpu_math_library_num_threads(self._num_threads) + self._config.switch_use_feed_fetch_ops(False) + self._config.disable_glog_info() + self._config.enable_memory_optim() + + # TODO(linjieccc): some temporary settings and will be remove in future + # after fixed + if self.task in ["document_intelligence", "knowledge_mining", "zero_shot_text_classification"]: + self._config.switch_ir_optim(False) + if self.model == "uie-data-distill-gp": + self._config.enable_memory_optim(False) + + self.predictor = paddle.inference.create_predictor(self._config) + self.input_names = [name for name in self.predictor.get_input_names()] + self.input_handles = [self.predictor.get_input_handle(name) for name in self.predictor.get_input_names()] + self.output_handle = [self.predictor.get_output_handle(name) for name in self.predictor.get_output_names()] + + def _prepare_onnx_mode(self): + try: + import onnx + import onnxruntime as ort + import paddle2onnx + from onnxconverter_common import float16 + except ImportError: + logger.warning( + "The inference precision is change to 'fp32', please install the dependencies that required for 'fp16' inference, pip install onnxruntime-gpu onnx onnxconverter-common" + ) + if self.export_type is None: + onnx_dir = os.path.join(self._task_path, "onnx") + else: + # Compatible multimodal model for saving image and text path + onnx_dir = os.path.join(self._task_path, "onnx", self.export_type) + + if not os.path.exists(onnx_dir): + os.makedirs(onnx_dir, exist_ok=True) + float_onnx_file = os.path.join(onnx_dir, "model.onnx") + if not os.path.exists(float_onnx_file) or self._param_updated: + onnx_model = paddle2onnx.command.c_paddle_to_onnx( + model_file=self._static_model_file, + params_file=self._static_params_file, + opset_version=13, + enable_onnx_checker=True, + ) + with open(float_onnx_file, "wb") as f: + f.write(onnx_model) + fp16_model_file = os.path.join(onnx_dir, "fp16_model.onnx") + if not os.path.exists(fp16_model_file) or self._param_updated: + onnx_model = onnx.load_model(float_onnx_file) + trans_model = float16.convert_float_to_float16(onnx_model, keep_io_types=True) + onnx.save_model(trans_model, fp16_model_file) + providers = [("CUDAExecutionProvider", {"device_id": self.kwargs["device_id"]})] + sess_options = ort.SessionOptions() + sess_options.intra_op_num_threads = self._num_threads + sess_options.inter_op_num_threads = self._num_threads + self.predictor = ort.InferenceSession(fp16_model_file, sess_options=sess_options, providers=providers) + assert "CUDAExecutionProvider" in self.predictor.get_providers(), ( + "The environment for GPU inference is not set properly. " + "A possible cause is that you had installed both onnxruntime and onnxruntime-gpu. " + "Please run the following commands to reinstall: \n " + "1) pip uninstall -y onnxruntime onnxruntime-gpu \n 2) pip install onnxruntime-gpu" + ) + self.input_handler = [i.name for i in self.predictor.get_inputs()] + + def _get_inference_model(self): + """ + Return the inference program, inputs and outputs in static mode. 
+ """ + if self._custom_model: + param_path = os.path.join(self._task_path, "model_state.pdparams") + + if os.path.exists(param_path): + cache_info_path = os.path.join(self._task_path, ".cache_info") + md5 = md5file(param_path) + self._param_updated = True + if os.path.exists(cache_info_path) and open(cache_info_path).read()[:-8] == md5: + self._param_updated = False + elif self.task == "information_extraction" and self.model != "uie-data-distill-gp": + # UIE related models are moved to paddlenlp.transformers after v2.4.5 + # So we convert the parameter key names for compatibility + # This check will be discard in future + fp = open(cache_info_path, "w") + fp.write(md5 + "taskflow") + fp.close() + model_state = paddle.load(param_path) + prefix_map = {"UIE": "ernie", "UIEM": "ernie_m", "UIEX": "ernie_layout"} + new_state_dict = {} + for name, param in model_state.items(): + if "ernie" in name: + new_state_dict[name] = param + elif "encoder.encoder" in name: + trans_name = name.replace("encoder.encoder", prefix_map[self._init_class] + ".encoder") + new_state_dict[trans_name] = param + elif "encoder" in name: + trans_name = name.replace("encoder", prefix_map[self._init_class]) + new_state_dict[trans_name] = param + else: + new_state_dict[name] = param + paddle.save(new_state_dict, param_path) + else: + fp = open(cache_info_path, "w") + fp.write(md5 + "taskflow") + fp.close() + + # When the user-provided model path is already a static model, skip to_static conversion + if self.is_static_model: + self.inference_model_path = os.path.join(self._task_path, self._static_model_name) + if not os.path.exists(self.inference_model_path + ".pdmodel") or not os.path.exists( + self.inference_model_path + ".pdiparams" + ): + raise IOError( + f"{self._task_path} should include {self._static_model_name + '.pdmodel'} and {self._static_model_name + '.pdiparams'} while is_static_model is True" + ) + if self.paddle_quantize_model(self.inference_model_path): + self._infer_precision = "int8" + self._predictor_type = "paddle-inference" + + else: + # Since 'self._task_path' is used to load the HF Hub path when 'from_hf_hub=True', we construct the static model path in a different way + _base_path = ( + self._task_path + if not self.from_hf_hub + else os.path.join(self._home_path, "taskflow", self.task, self._task_path) + ) + self.inference_model_path = os.path.join(_base_path, "static", "inference") + if not os.path.exists(self.inference_model_path + ".pdiparams") or self._param_updated: + with dygraph_mode_guard(): + self._construct_model(self.model) + self._construct_input_spec() + self._convert_dygraph_to_static() + + self._static_model_file = self.inference_model_path + ".pdmodel" + self._static_params_file = self.inference_model_path + ".pdiparams" + + if paddle.get_device().split(":", 1)[0] == "npu" and self._infer_precision == "fp16": + # transform fp32 model tp fp16 model + self._static_fp16_model_file = self.inference_model_path + "-fp16.pdmodel" + self._static_fp16_params_file = self.inference_model_path + "-fp16.pdiparams" + if not os.path.exists(self._static_fp16_model_file) and not os.path.exists(self._static_fp16_params_file): + logger.info("Converting to the inference model from fp32 to fp16.") + paddle.inference.convert_to_mixed_precision( + os.path.join(self._static_model_file), + os.path.join(self._static_params_file), + os.path.join(self._static_fp16_model_file), + os.path.join(self._static_fp16_params_file), + backend=paddle.inference.PlaceType.CUSTOM, + 
mixed_precision=paddle.inference.PrecisionType.Half, + # Here, npu sigmoid will lead to OOM and cpu sigmoid don't support fp16. + # So, we add sigmoid to black list temporarily. + black_list={"sigmoid"}, + ) + logger.info( + "The inference model in fp16 precison save in the path:{}".format(self._static_fp16_model_file) + ) + self._static_model_file = self._static_fp16_model_file + self._static_params_file = self._static_fp16_params_file + if self._predictor_type == "paddle-inference": + self._config = paddle.inference.Config(self._static_model_file, self._static_params_file) + self._prepare_static_mode() + else: + self._prepare_onnx_mode() + + def _convert_dygraph_to_static(self): + """ + Convert the dygraph model to static model. + """ + assert ( + self._model is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + self._input_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." + logger.info("Converting to the inference model cost a little time.") + static_model = paddle.jit.to_static(self._model, input_spec=self._input_spec) + + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) + + def _check_input_text(self, inputs): + """ + Check whether the input text meet the requirement. + """ + inputs = inputs[0] + if isinstance(inputs, str): + if len(inputs) == 0: + raise ValueError("Invalid inputs, input text should not be empty text, please check your input.") + inputs = [inputs] + elif isinstance(inputs, list): + if not (isinstance(inputs[0], str) and len(inputs[0].strip()) > 0): + raise TypeError( + "Invalid inputs, input text should be list of str, and first element of list should not be empty text." + ) + else: + raise TypeError( + "Invalid inputs, input text should be str or list of str, but type of {} found!".format(type(inputs)) + ) + return inputs + + def _auto_splitter(self, input_texts, max_text_len, bbox_list=None, split_sentence=False): + """ + Split the raw texts automatically for model inference. + Args: + input_texts (List[str]): input raw texts. + max_text_len (int): cutting length. + bbox_list (List[float, float,float, float]): bbox for document input. + split_sentence (bool): If True, sentence-level split will be performed. + `split_sentence` will be set to False if bbox_list is not None since sentence-level split is not support for document. + return: + short_input_texts (List[str]): the short input texts for model inference. + input_mapping (dict): mapping between raw text and short input texts. + """ + input_mapping = {} + short_input_texts = [] + cnt_org = 0 + cnt_short = 0 + with_bbox = False + if bbox_list: + with_bbox = True + short_bbox_list = [] + if split_sentence: + logger.warning( + "`split_sentence` will be set to False if bbox_list is not None since sentence-level split is not support for document." 
+ ) + split_sentence = False + + for idx in range(len(input_texts)): + if not split_sentence: + sens = [input_texts[idx]] + else: + sens = cut_chinese_sent(input_texts[idx]) + for sen in sens: + lens = len(sen) + if lens <= max_text_len: + short_input_texts.append(sen) + if with_bbox: + short_bbox_list.append(bbox_list[idx]) + input_mapping.setdefault(cnt_org, []).append(cnt_short) + cnt_short += 1 + else: + temp_text_list = [sen[i : i + max_text_len] for i in range(0, lens, max_text_len)] + short_input_texts.extend(temp_text_list) + if with_bbox: + if bbox_list[idx] is not None: + temp_bbox_list = [ + bbox_list[idx][i : i + max_text_len] for i in range(0, lens, max_text_len) + ] + short_bbox_list.extend(temp_bbox_list) + else: + short_bbox_list.extend([None for _ in range(len(temp_text_list))]) + short_idx = cnt_short + cnt_short += math.ceil(lens / max_text_len) + temp_text_id = [short_idx + i for i in range(cnt_short - short_idx)] + input_mapping.setdefault(cnt_org, []).extend(temp_text_id) + cnt_org += 1 + if with_bbox: + return short_input_texts, short_bbox_list, input_mapping + else: + return short_input_texts, input_mapping + + def _auto_joiner(self, short_results, input_mapping, is_dict=False): + """ + Join the short results automatically and generate the final results to match with the user inputs. + Args: + short_results (List[dict] / List[List[str]] / List[str]): input raw texts. + input_mapping (dict): cutting length. + is_dict (bool): whether the element type is dict, default to False. + return: + short_input_texts (List[str]): the short input texts for model inference. + """ + concat_results = [] + elem_type = {} if is_dict else [] + for k, vs in input_mapping.items(): + single_results = elem_type + for v in vs: + if len(single_results) == 0: + single_results = short_results[v] + elif isinstance(elem_type, list): + single_results.extend(short_results[v]) + elif isinstance(elem_type, dict): + for sk in single_results.keys(): + if isinstance(single_results[sk], str): + single_results[sk] += short_results[v][sk] + else: + single_results[sk].extend(short_results[v][sk]) + else: + raise ValueError( + "Invalid element type, the type of results " + "for each element should be list of dict, " + "but {} received.".format(type(single_results)) + ) + concat_results.append(single_results) + return concat_results + + def paddle_quantize_model(self, model_path): + """ + Determine whether it is an int8 model. + """ + model = paddle.jit.load(model_path) + program = model.program() + for block in program.blocks: + for op in block.ops: + if op.type.count("quantize"): + return True + return False + + def help(self): + """ + Return the usage message of the current task. + """ + print("Examples:\n{}".format(self._usage)) + + def __call__(self, *args, **kwargs): + inputs = self._preprocess(*args) + outputs = self._run_model(inputs, **kwargs) + results = self._postprocess(outputs) + return results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/taskflow.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/taskflow.py new file mode 100644 index 000000000..520ad4cf5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/taskflow.py @@ -0,0 +1,869 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading + +import paddle + +from ..utils.tools import get_env_device +from .code_generation import CodeGenerationTask +from .dependency_parsing import DDParserTask +from .dialogue import DialogueTask +from .document_intelligence import DocPromptTask +from .fill_mask import FillMaskTask +from .information_extraction import GPTask, UIETask +from .knowledge_mining import NPTagTask, WordTagTask +from .lexical_analysis import LacTask +from .multimodal_feature_extraction import MultimodalFeatureExtractionTask +from .named_entity_recognition import NERLACTask, NERWordTagTask +from .poetry_generation import PoetryGenerationTask +from .pos_tagging import POSTaggingTask +from .question_answering import QuestionAnsweringTask +from .question_generation import QuestionGenerationTask +from .sentiment_analysis import SentaTask, SkepTask, UIESentaTask +from .text2text_generation import ChatGLMTask +from .text_classification import TextClassificationTask +from .text_correction import CSCTask +from .text_feature_extraction import ( + SentenceFeatureExtractionTask, + TextFeatureExtractionTask, +) +from .text_similarity import TextSimilarityTask +from .text_summarization import TextSummarizationTask +from .word_segmentation import SegJiebaTask, SegLACTask, SegWordTagTask +from .zero_shot_text_classification import ZeroShotTextClassificationTask + +TASKS = { + "dependency_parsing": { + "models": { + "ddparser": { + "task_class": DDParserTask, + "task_flag": "dependency_parsing-biaffine", + }, + "ddparser-ernie-1.0": { + "task_class": DDParserTask, + "task_flag": "dependency_parsing-ernie-1.0", + }, + "ddparser-ernie-gram-zh": { + "task_class": DDParserTask, + "task_flag": "dependency_parsing-ernie-gram-zh", + }, + }, + "default": { + "model": "ddparser", + }, + }, + "dialogue": { + "models": { + "plato-mini": {"task_class": DialogueTask, "task_flag": "dialogue-plato-mini"}, + "__internal_testing__/tiny-random-plato": { + "task_class": DialogueTask, + "task_flag": "dialogue-tiny-random-plato", + }, + }, + "default": { + "model": "plato-mini", + }, + }, + "fill_mask": { + "models": { + "fill_mask": {"task_class": FillMaskTask, "task_flag": "fill_mask-fill_mask"}, + }, + "default": { + "model": "fill_mask", + }, + }, + "knowledge_mining": { + "models": { + "wordtag": { + "task_class": WordTagTask, + "task_flag": "knowledge_mining-wordtag", + "task_priority_path": "wordtag", + }, + "nptag": { + "task_class": NPTagTask, + "task_flag": "knowledge_mining-nptag", + }, + }, + "default": { + "model": "wordtag", + }, + }, + "lexical_analysis": { + "models": { + "lac": { + "task_class": LacTask, + "hidden_size": 128, + "emb_dim": 128, + "task_flag": "lexical_analysis-gru_crf", + "task_priority_path": "lac", + } + }, + "default": {"model": "lac"}, + }, + "ner": { + "modes": { + "accurate": { + "task_class": NERWordTagTask, + "task_flag": "ner-wordtag", + "task_priority_path": "wordtag", + "linking": False, + }, + "fast": { + "task_class": NERLACTask, + "hidden_size": 128, + "emb_dim": 128, + "task_flag": "ner-lac", + "task_priority_path": "lac", + }, + }, + "default": {"mode": "accurate"}, + }, + 
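+ # NOTE: tasks keyed by "modes" ("ner", "word_segmentation" and
+ # "text_classification") are selected through the `mode=` argument of Taskflow,
+ # while tasks keyed by "models" are selected through `model=`
+ # (see Taskflow.__init__ below).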
"poetry_generation": { + "models": { + "gpt-cpm-large-cn": { + "task_class": PoetryGenerationTask, + "task_flag": "poetry_generation-gpt-cpm-large-cn", + "task_priority_path": "gpt-cpm-large-cn", + }, + }, + "default": { + "model": "gpt-cpm-large-cn", + }, + }, + "pos_tagging": { + "models": { + "lac": { + "task_class": POSTaggingTask, + "hidden_size": 128, + "emb_dim": 128, + "task_flag": "pos_tagging-gru_crf", + "task_priority_path": "lac", + } + }, + "default": {"model": "lac"}, + }, + "question_answering": { + "models": { + "gpt-cpm-large-cn": { + "task_class": QuestionAnsweringTask, + "task_flag": "question_answering-gpt-cpm-large-cn", + "task_priority_path": "gpt-cpm-large-cn", + }, + }, + "default": { + "model": "gpt-cpm-large-cn", + }, + }, + "sentiment_analysis": { + "models": { + "bilstm": { + "task_class": SentaTask, + "task_flag": "sentiment_analysis-bilstm", + }, + "skep_ernie_1.0_large_ch": { + "task_class": SkepTask, + "task_flag": "sentiment_analysis-skep_ernie_1.0_large_ch", + }, + "uie-senta-base": { + "task_class": UIESentaTask, + "task_flag": "sentiment_analysis-uie-senta-base", + }, + "uie-senta-medium": { + "task_class": UIESentaTask, + "task_flag": "sentiment_analysis-uie-senta-medium", + }, + "uie-senta-mini": { + "task_class": UIESentaTask, + "task_flag": "sentiment_analysis-uie-senta-mini", + }, + "uie-senta-micro": { + "task_class": UIESentaTask, + "task_flag": "sentiment_analysis-uie-senta-micro", + }, + "uie-senta-nano": { + "task_class": UIESentaTask, + "task_flag": "sentiment_analysis-uie-senta-nano", + }, + "__internal_testing__/tiny-random-skep": { + "task_class": SkepTask, + "task_flag": "sentiment_analysis-tiny-random-skep", + }, + }, + "default": {"model": "bilstm"}, + }, + "text_correction": { + "models": { + "ernie-csc": {"task_class": CSCTask, "task_flag": "text_correction-ernie-csc"}, + }, + "default": {"model": "ernie-csc"}, + }, + "text_similarity": { + "models": { + "simbert-base-chinese": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-simbert-base-chinese", + }, + "rocketqa-zh-dureader-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqa-zh-dureader-cross-encoder", + }, + "rocketqa-base-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqa-base-cross-encoder", + }, + "rocketqa-medium-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqa-medium-cross-encoder", + }, + "rocketqa-mini-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqa-mini-cross-encoder", + }, + "rocketqa-micro-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqa-micro-cross-encoder", + }, + "rocketqa-nano-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqa-nano-cross-encoder", + }, + "rocketqav2-en-marco-cross-encoder": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-rocketqav2-en-marco-cross-encoder", + }, + "ernie-search-large-cross-encoder-marco-en": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-ernie-search-large-cross-encoder-marco-en", + }, + "__internal_testing__/tiny-random-bert": { + "task_class": TextSimilarityTask, + "task_flag": "text_similarity-tiny-random-bert", + }, + }, + "default": {"model": "simbert-base-chinese"}, + }, + "text_summarization": { + "models": { + "unimo-text-1.0-summary": { + "task_class": TextSummarizationTask, + "task_flag": 
"text_summarization-unimo-text-1.0-summary", + "task_priority_path": "unimo-text-1.0-summary", + }, + "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese": { + "task_class": TextSummarizationTask, + "task_flag": "text_summarization-IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", + "task_priority_path": "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", + }, + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese": { + "task_class": TextSummarizationTask, + "task_flag": "text_summarization-IDEA-CCNL/Randeng-Pegasus523M-Summary-Chinese", + "task_priority_path": "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese", + }, + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese-V1": { + "task_class": TextSummarizationTask, + "task_flag": "text_summarization-IDEA-CCNL/Randeng-Pegasus523M-Summary-Chinese-V1", + "task_priority_path": "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese-V1", + }, + "PaddlePaddle/Randeng-Pegasus-238M-Summary-Chinese-SSTIA": { + "task_class": TextSummarizationTask, + "task_flag": "text_summarization-PaddlePaddle/Randeng-Pegasus-238M-Summary-Chinese-SSTIA", + "task_priority_path": "PaddlePaddle/Randeng-Pegasus-238M-Summary-Chinese-SSTIA", + }, + "PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA": { + "task_class": TextSummarizationTask, + "task_flag": "text_summarization-PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA", + "task_priority_path": "PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA", + }, + }, + "default": {"model": "PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA"}, + }, + "word_segmentation": { + "modes": { + "fast": { + "task_class": SegJiebaTask, + "task_flag": "word_segmentation-jieba", + }, + "base": { + "task_class": SegLACTask, + "hidden_size": 128, + "emb_dim": 128, + "task_flag": "word_segmentation-gru_crf", + "task_priority_path": "lac", + }, + "accurate": { + "task_class": SegWordTagTask, + "task_flag": "word_segmentation-wordtag", + "task_priority_path": "wordtag", + "linking": False, + }, + }, + "default": {"mode": "base"}, + }, + "information_extraction": { + "models": { + "uie-base": {"task_class": UIETask, "hidden_size": 768, "task_flag": "information_extraction-uie-base"}, + "uie-medium": { + "task_class": UIETask, + "hidden_size": 768, + "task_flag": "information_extraction-uie-medium", + }, + "uie-mini": {"task_class": UIETask, "hidden_size": 384, "task_flag": "information_extraction-uie-mini"}, + "uie-micro": {"task_class": UIETask, "hidden_size": 384, "task_flag": "information_extraction-uie-micro"}, + "uie-nano": {"task_class": UIETask, "hidden_size": 312, "task_flag": "information_extraction-uie-nano"}, + "uie-tiny": {"task_class": UIETask, "hidden_size": 768, "task_flag": "information_extraction-uie-tiny"}, + "uie-medical-base": { + "task_class": UIETask, + "hidden_size": 768, + "task_flag": "information_extraction-uie-medical-base", + }, + "uie-base-en": { + "task_class": UIETask, + "hidden_size": 768, + "task_flag": "information_extraction-uie-base-en", + }, + "uie-m-base": { + "task_class": UIETask, + "hidden_size": 768, + "task_flag": "information_extraction-uie-m-base", + }, + "uie-m-large": { + "task_class": UIETask, + "hidden_size": 1024, + "task_flag": "information_extraction-uie-m-large", + }, + "uie-x-base": { + "task_class": UIETask, + "hidden_size": 768, + "task_flag": "information_extraction-uie-x-base", + }, + "uie-data-distill-gp": {"task_class": GPTask, "task_flag": "information_extraction-uie-data-distill-gp"}, + "__internal_testing__/tiny-random-uie": { + "task_class": UIETask, + "hidden_size": 8, + 
"task_flag": "information_extraction-tiny-random-uie", + }, + "__internal_testing__/tiny-random-uie-m": { + "task_class": UIETask, + "hidden_size": 8, + "task_flag": "information_extraction-tiny-random-uie-m", + }, + "__internal_testing__/tiny-random-uie-x": { + "task_class": UIETask, + "hidden_size": 8, + "task_flag": "information_extraction-tiny-random-uie-x", + }, + }, + "default": {"model": "uie-base"}, + }, + "code_generation": { + "models": { + "Salesforce/codegen-350M-mono": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-350M-mono", + "task_priority_path": "Salesforce/codegen-350M-mono", + }, + "Salesforce/codegen-2B-mono": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-2B-mono", + "task_priority_path": "Salesforce/codegen-2B-mono", + }, + "Salesforce/codegen-6B-mono": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-6B-mono", + "task_priority_path": "Salesforce/codegen-6B-mono", + }, + "Salesforce/codegen-350M-nl": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-350M-nl", + "task_priority_path": "Salesforce/codegen-350M-nl", + }, + "Salesforce/codegen-2B-nl": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-2B-nl", + "task_priority_path": "Salesforce/codegen-2B-nl", + }, + "Salesforce/codegen-6B-nl": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-6B-nl", + "task_priority_path": "Salesforce/codegen-6B-nl", + }, + "Salesforce/codegen-350M-multi": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-350M-multi", + "task_priority_path": "Salesforce/codegen-350M-multi", + }, + "Salesforce/codegen-2B-multi": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-2B-multi", + "task_priority_path": "Salesforce/codegen-2B-multi", + }, + "Salesforce/codegen-6B-multi": { + "task_class": CodeGenerationTask, + "task_flag": "code_generation-Salesforce/codegen-6B-multi", + "task_priority_path": "Salesforce/codegen-6B-multi", + }, + }, + "default": { + "model": "Salesforce/codegen-350M-mono", + }, + }, + "text_classification": { + "modes": { + "finetune": { + "task_class": TextClassificationTask, + "task_flag": "text_classification-finetune", + }, + "prompt": { + "task_class": TextClassificationTask, + "task_flag": "text_classification-prompt", + }, + }, + "default": {"mode": "finetune"}, + }, + "document_intelligence": { + "models": { + "docprompt": { + "task_class": DocPromptTask, + "task_flag": "document_intelligence-docprompt", + }, + }, + "default": {"model": "docprompt"}, + }, + "question_generation": { + "models": { + "unimo-text-1.0": { + "task_class": QuestionGenerationTask, + "task_flag": "question_generation-unimo-text-1.0", + }, + "unimo-text-1.0-dureader_qg": { + "task_class": QuestionGenerationTask, + "task_flag": "question_generation-unimo-text-1.0-dureader_qg", + }, + "unimo-text-1.0-question-generation": { + "task_class": QuestionGenerationTask, + "task_flag": "question_generation-unimo-text-1.0-question-generation", + }, + "unimo-text-1.0-question-generation-dureader_qg": { + "task_class": QuestionGenerationTask, + "task_flag": "question_generation-unimo-text-1.0-question-generation-dureader_qg", + }, + }, + "default": {"model": "unimo-text-1.0-dureader_qg"}, + }, + "text2text_generation": { + "models": { + "THUDM/chatglm-6b": { + "task_class": ChatGLMTask, + 
"task_flag": "text_generation-THUDM/chatglm-6b", + }, + "THUDM/chatglm2-6b": { + "task_class": ChatGLMTask, + "task_flag": "text_generation-THUDM/chatglm2-6b", + }, + "__internal_testing__/tiny-random-chatglm": { + "task_class": ChatGLMTask, + "task_flag": "text_generation-tiny-random-chatglm", + }, + "THUDM/chatglm-6b-v1.1": { + "task_class": ChatGLMTask, + "task_flag": "text_generation-THUDM/chatglm-6b-v1.1", + }, + }, + "default": {"model": "THUDM/chatglm-6b-v1.1"}, + }, + "zero_shot_text_classification": { + "models": { + "utc-large": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-large", + }, + "utc-xbase": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-xbase", + }, + "utc-base": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-base", + }, + "utc-medium": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-medium", + }, + "utc-micro": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-micro", + }, + "utc-mini": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-mini", + }, + "utc-nano": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-nano", + }, + "utc-pico": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-utc-pico", + }, + "__internal_testing__/tiny-random-utc": { + "task_class": ZeroShotTextClassificationTask, + "task_flag": "zero_shot_text_classification-tiny-random-utc", + }, + }, + "default": {"model": "utc-base"}, + }, + "feature_extraction": { + "models": { + "rocketqa-zh-dureader-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-dureader-query-encoder", + "task_priority_path": "rocketqa-zh-dureader-query-encoder", + }, + "rocketqa-zh-dureader-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-dureader-para-encoder", + "task_priority_path": "rocketqa-rocketqa-zh-dureader-para-encoder", + }, + "rocketqa-zh-base-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-base-query-encoder", + "task_priority_path": "rocketqa-zh-base-query-encoder", + }, + "rocketqa-zh-base-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-base-para-encoder", + "task_priority_path": "rocketqa-zh-base-para-encoder", + }, + "rocketqa-zh-medium-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-medium-query-encoder", + "task_priority_path": "rocketqa-zh-medium-query-encoder", + }, + "rocketqa-zh-medium-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-medium-para-encoder", + "task_priority_path": "rocketqa-zh-medium-para-encoder", + }, + "rocketqa-zh-mini-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-mini-query-encoder", + "task_priority_path": "rocketqa-zh-mini-query-encoder", + }, + "rocketqa-zh-mini-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-rocketqa-zh-mini-para-encoder", + "task_priority_path": "rocketqa-zh-mini-para-encoder", + }, + 
"rocketqa-zh-micro-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-micro-query-encoder", + "task_priority_path": "rocketqa-zh-micro-query-encoder", + }, + "rocketqa-zh-micro-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-micro-para-encoder", + "task_priority_path": "rocketqa-zh-micro-para-encoder", + }, + "rocketqa-zh-nano-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-nano-query-encoder", + "task_priority_path": "rocketqa-zh-nano-query-encoder", + }, + "rocketqa-zh-nano-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqa-zh-nano-para-encoder", + "task_priority_path": "rocketqa-zh-nano-para-encoder", + }, + "rocketqav2-en-marco-query-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqav2-en-marco-query-encoder", + "task_priority_path": "rocketqav2-en-marco-query-encoder", + }, + "rocketqav2-en-marco-para-encoder": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-rocketqav2-en-marco-para-encoder", + "task_priority_path": "rocketqav2-en-marco-para-encoder", + }, + "ernie-search-base-dual-encoder-marco-en": { + "task_class": TextFeatureExtractionTask, + "task_flag": "feature_extraction-ernie-search-base-dual-encoder-marco-en", + "task_priority_path": "ernie-search-base-dual-encoder-marco-en", + }, + "PaddlePaddle/ernie_vil-2.0-base-zh": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-PaddlePaddle/ernie_vil-2.0-base-zh", + "task_priority_path": "PaddlePaddle/ernie_vil-2.0-base-zh", + }, + "OFA-Sys/chinese-clip-vit-base-patch16": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-OFA-Sys/chinese-clip-vit-base-patch16", + "task_priority_path": "OFA-Sys/chinese-clip-vit-base-patch16", + }, + "OFA-Sys/chinese-clip-vit-huge-patch14": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-OFA-Sys/chinese-clip-vit-huge-patch14", + "task_priority_path": "OFA-Sys/chinese-clip-vit-huge-patch14", + }, + "OFA-Sys/chinese-clip-vit-large-patch14": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-OFA-Sys/chinese-clip-vit-large-patch14", + "task_priority_path": "OFA-Sys/chinese-clip-vit-large-patch14", + }, + "OFA-Sys/chinese-clip-vit-large-patch14-336px": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-OFA-Sys/chinese-clip-vit-large-patch14-336px", + "task_priority_path": "OFA-Sys/chinese-clip-vit-large-patch14-336px", + }, + "openai/clip-vit-base-patch32": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-openai/clip-vit-base-patch32", + "task_priority_path": "openai/clip-vit-base-patch32", + }, + "openai/clip-vit-base-patch16": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-openai/clip-vit-base-patch16", + "task_priority_path": "openai/clip-vit-base-patch16", + }, + "openai/clip-vit-large-patch14": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-openai/clip-vit-large-patch14", + "task_priority_path": "openai/clip-vit-large-patch14", + }, + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + 
"task_priority_path": "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + }, + "laion/CLIP-ViT-B-32-laion2B-s34B-b79K": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + "task_priority_path": "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + }, + "openai/clip-rn50": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-openai/clip-rn50", + "task_priority_path": "openai/clip-rn50", + }, + "openai/clip-rn101": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-openai/clip-rn101", + "task_priority_path": "openai/clip-rn101", + }, + "openai/clip-rn50x4": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-openai/clip-rn50x4", + "task_priority_path": "openai/clip-rn50x4", + }, + "__internal_testing__/tiny-random-ernievil2": { + "task_class": MultimodalFeatureExtractionTask, + "task_flag": "feature_extraction-tiny-random-ernievil2", + "task_priority_path": "__internal_testing__/tiny-random-ernievil2", + }, + "moka-ai/m3e-base": { + "task_class": SentenceFeatureExtractionTask, + "task_flag": "feature_extraction-moka-ai/m3e-base", + "task_priority_path": "moka-ai/m3e-base", + }, + "BAAI/bge-small-zh-v1.5": { + "task_class": SentenceFeatureExtractionTask, + "task_flag": "feature_extraction-BAAI/bge-small-zh-v1.5", + "task_priority_path": "BAAI/bge-small-zh-v1.5", + }, + "__internal_testing__/tiny-random-m3e": { + "task_class": SentenceFeatureExtractionTask, + "task_flag": "__internal_testing__/tiny-random-m3e", + "task_priority_path": "__internal_testing__/tiny-random-m3e", + }, + }, + "default": {"model": "PaddlePaddle/ernie_vil-2.0-base-zh"}, + }, +} + +support_schema_list = [ + "uie-base", + "uie-medium", + "uie-mini", + "uie-micro", + "uie-nano", + "uie-tiny", + "uie-medical-base", + "uie-base-en", + "wordtag", + "uie-m-large", + "uie-m-base", + "uie-x-base", + "uie-senta-base", + "uie-senta-medium", + "uie-senta-mini", + "uie-senta-micro", + "uie-senta-nano", + "utc-large", + "utc-xbase", + "utc-base", + "utc-medium", + "utc-micro", + "utc-mini", + "utc-nano", + "utc-pico", + "utc-tiny", + "__internal_testing__/tiny-random-uie", + "__internal_testing__/tiny-random-uie-m", + "__internal_testing__/tiny-random-uie-x", +] + +support_argument_list = [ + "dalle-mini", + "dalle-mega", + "dalle-mega-v16", + "pai-painter-painting-base-zh", + "pai-painter-scenery-base-zh", + "pai-painter-commercial-base-zh", + "CompVis/stable-diffusion-v1-4", + "openai/disco-diffusion-clip-vit-base-patch32", + "openai/disco-diffusion-clip-rn50", + "openai/disco-diffusion-clip-rn101", + "PaddlePaddle/disco_diffusion_ernie_vil-2.0-base-zh", + "uie-base", + "uie-medium", + "uie-mini", + "uie-micro", + "uie-nano", + "uie-tiny", + "uie-medical-base", + "uie-base-en", + "uie-m-large", + "uie-m-base", + "uie-x-base", + "__internal_testing__/tiny-random-uie-m", + "__internal_testing__/tiny-random-uie-x", + "THUDM/chatglm-6b", + "THUDM/chatglm2-6b", + "THUDM/chatglm-6b-v1.1", +] + + +class Taskflow(object): + """ + The Taskflow is the end2end interface that could convert the raw text to model result, and decode the model result to task result. The main functions as follows: + 1) Convert the raw text to task result. + 2) Convert the model to the inference model. + 3) Offer the usage and help message. + Args: + task (str): The task name for the Taskflow, and get the task class from the name. 
+ model (str, optional): The model name in the task, if set None, will use the default model. + mode (str, optional): Select the mode of the task, only used in the tasks of word_segmentation and ner. + If set None, will use the default mode. + device_id (int, optional): The device id for the gpu, xpu and other devices, the defalut value is 0. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + + """ + + def __init__(self, task, model=None, mode=None, device_id=0, from_hf_hub=False, **kwargs): + assert task in TASKS, f"The task name:{task} is not in Taskflow list, please check your task name." + self.task = task + # Set the device for the task + device = get_env_device() + if device == "cpu" or device_id == -1: + paddle.set_device("cpu") + else: + paddle.set_device(device + ":" + str(device_id)) + + if self.task in ["word_segmentation", "ner", "text_classification"]: + tag = "modes" + ind_tag = "mode" + self.model = mode + else: + tag = "models" + ind_tag = "model" + self.model = model + + if self.model is not None: + assert self.model in set(TASKS[task][tag].keys()), f"The {tag} name: {model} is not in task:[{task}]" + else: + self.model = TASKS[task]["default"][ind_tag] + + if "task_priority_path" in TASKS[self.task][tag][self.model]: + self.priority_path = TASKS[self.task][tag][self.model]["task_priority_path"] + else: + self.priority_path = None + + # Update the task config to kwargs + config_kwargs = TASKS[self.task][tag][self.model] + kwargs["device_id"] = device_id + kwargs.update(config_kwargs) + self.kwargs = kwargs + task_class = TASKS[self.task][tag][self.model]["task_class"] + self.task_instance = task_class( + model=self.model, task=self.task, priority_path=self.priority_path, from_hf_hub=from_hf_hub, **self.kwargs + ) + task_list = TASKS.keys() + Taskflow.task_list = task_list + + # Add the lock for the concurrency requests + self._lock = threading.Lock() + + def __call__(self, *inputs, **kwargs): + """ + The main work function in the taskflow. + """ + results = self.task_instance(inputs, **kwargs) + return results + + def help(self): + """ + Return the task usage message. + """ + return self.task_instance.help() + + def task_path(self): + """ + Return the path of current task + """ + return self.task_instance._task_path + + @staticmethod + def tasks(): + """ + Return the available task list. + """ + task_list = list(TASKS.keys()) + return task_list + + def from_segments(self, *inputs): + results = self.task_instance.from_segments(inputs) + return results + + def interactive_mode(self, max_turn): + with self.task_instance.interactive_mode(max_turn): + while True: + human = input("[Human]:").strip() + if human.lower() == "exit": + exit() + robot = self.task_instance(human)[0] + print("[Bot]:%s" % robot) + + def set_schema(self, schema): + assert ( + self.task_instance.model in support_schema_list + ), "This method can only be used by the task based on the model of uie or wordtag." + self.task_instance.set_schema(schema) + + def set_argument(self, argument): + assert self.task_instance.model in support_argument_list, ( + "This method can only be used by the task of text-to-image generation, information extraction " + "or zero-text-classification." 
+ ) + self.task_instance.set_argument(argument) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text2text_generation.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text2text_generation.py new file mode 100644 index 000000000..7966f2995 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text2text_generation.py @@ -0,0 +1,252 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from ..transformers import AutoModelForCausalLM, AutoTokenizer +from ..utils.log import logger +from .task import Task +from .utils import static_mode_guard + + +class ChatGLMTask(Task): + """ + The text to text generation LLM model to predict the question or chinese poetry. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + # Default to static mode + self._static_mode = False + self._dtype = kwargs.get("dtype", "float16") + self.kwargs["generation_task"] = task + self._tgt_length = kwargs.get("tgt_length", 2048) + # Token max length + self._max_seq_length = kwargs.get("max_seq_length", 2048) + self._top_k = kwargs.get("top_k", 1) + self._top_p = kwargs.get("top_p", 1.0) + self._temperature = kwargs.get("temperature", 1.0) + self._decode_strategy = kwargs.get("decode_strategy", "sampling") + self._num_return_sequences = kwargs.get("num_return_sequences", 1) + + self._construct_tokenizer(model) + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(model) + self._construct_input_spec() + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids + paddle.static.InputSpec(shape=[None, None, None, None], dtype="int64"), # attention_mask + paddle.static.InputSpec(shape=[None, None, None], dtype="int64"), # position_ids + # max_length + self._tgt_length, + # min_length + 0, + # decode_strategy + self._decode_strategy, + # temperature + self._temperature, + # top_k + self._top_k, + # top_p + self._top_p, + # repetition_penalty + 1, + # num_beams + 1, + # num_beam_groups + 1, + # length_penalty + 0.0, + # early_stopping + False, + # bos_token_id + self._tokenizer.bos_token_id, + # eos_token_id + self._tokenizer.eos_token_id, + # pad_token_id + self._tokenizer.pad_token_id, + # decoder_start_token_id + None, + # forced_bos_token_id + None, + # forced_eos_token_id + None, + # no_repeat_ngram_size + None, + # num_return_sequences + self._num_return_sequences, + # diversity_rate + 0.0, + # use_cache + True, + ] + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. 
+ """ + tokenizer_instance = AutoTokenizer.from_pretrained(model) + + self._tokenizer = tokenizer_instance + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = AutoModelForCausalLM.from_pretrained( + self.model, + dtype=self._dtype, + ) + # Load the model parameter for the predict + model_instance.eval() + self._model = model_instance + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + # Separates data into some batches. + one_batch = [] + for example in data: + one_batch.append(example) + if len(one_batch) == batch_size: + yield one_batch + one_batch = [] + if one_batch: + yield one_batch + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + batches = self._batchify(inputs, batch_size) + examples = [] + for input_text in batches: + if self._static_mode: + tokenized_output = self._tokenizer( + input_text, + return_tensors="np", + padding=True, + max_length=self._max_seq_length, + truncation=True, + truncation_side="left", + ) + else: + tokenized_output = self._tokenizer( + input_text, + return_tensors="pd", + padding=True, + max_length=self._max_seq_length, + truncation=True, + truncation_side="left", + ) + examples.append(tokenized_output) + outputs = {} + outputs["text"] = inputs + outputs["data_loader"] = examples + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + results = [] + if self._static_mode: + with static_mode_guard(): + for batch in inputs["data_loader"]: + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + position_ids = batch["position_ids"] + self.input_handles[0].copy_from_cpu(input_ids) + self.input_handles[1].copy_from_cpu(attention_mask) + self.input_handles[2].copy_from_cpu(position_ids) + self.predictor.run() + result = self.output_handle[0].copy_to_cpu().tolist() + results.extend(result) + else: + for batch_inputs in inputs["data_loader"]: + result = self._model.generate( + **batch_inputs, + decode_strategy=self._decode_strategy, + top_k=self._top_k, + top_p=self._top_p, + temperature=self._temperature, + max_length=self._tgt_length, + bos_token_id=self._tokenizer.bos_token_id, + eos_token_id=self._tokenizer.eos_token_id, + pad_token_id=self._tokenizer.pad_token_id, + num_return_sequences=self._num_return_sequences, + use_cache=True, + ) + result = result[0] + results.extend(result) + + inputs["results"] = results + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. 
+ """ + preds = inputs["results"] + result = [] + for x in preds: + if self._static_mode: + res = self._tokenizer.decode(x, skip_special_tokens=True) + res = res.strip("\n") + result.append(res) + else: + res = self._tokenizer.decode(x.numpy().tolist(), skip_special_tokens=True) + res = res.strip("\n") + result.append(res) + out_dict = {"result": result} + return out_dict + + def set_argument(self, argument: dict): + for k, v in argument.items(): + if k == "input": + continue + setattr(self, f"_{k}", v) + + def _convert_dygraph_to_static(self): + """ + Convert the dygraph model to static model. + """ + assert ( + self._model is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + self._input_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." + logger.info("Converting to the inference model cost a little time.") + + static_model = paddle.jit.to_static(self._model.generate, input_spec=self._input_spec) + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_classification.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_classification.py new file mode 100644 index 000000000..170b8381e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_classification.py @@ -0,0 +1,369 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import os +from typing import Any, Dict, List, Union + +import numpy as np +import paddle +import paddle.nn.functional as F +from scipy.special import expit as np_sigmoid +from scipy.special import softmax as np_softmax + +from ..data import DataCollatorWithPadding +from ..prompt import ( + AutoTemplate, + PromptDataCollatorWithPadding, + PromptModelForSequenceClassification, + SoftVerbalizer, +) +from ..transformers import ( + AutoModelForMaskedLM, + AutoModelForSequenceClassification, + AutoTokenizer, +) +from ..utils.env import CONFIG_NAME, LEGACY_CONFIG_NAME +from ..utils.log import logger +from .task import Task +from .utils import static_mode_guard + +usage = r""" + from paddlenlp import Taskflow + text_cls = Taskflow( + "text_classification", + mode="finetune", + problem_type="multi_class", + task_path=, + id2label={0: "negative", 1: "positive"} + ) + text_cls('房间依然很整洁,相当不错') + ''' + [ + { + 'text': '房间依然很整洁,相当不错', + 'predictions: [{ + 'label': 'positive', + 'score': 0.80 + }] + } + ] + ''' + text_cls = Taskflow( + "text_classification", + mode="prompt", + problem_type="multi_label", + is_static_model=True, + task_path=, + static_model_prefix=, + plm_model_path=, + id2label={ 0: "体育", 1: "经济", 2: "娱乐"} + ) + text_cls(['这是一条体育娱乐新闻的例子', + '这是一条经济新闻']) + ''' + [ + { + 'text': '这是一条体育娱乐新闻的例子', + 'predictions: [ + { + 'label': '体育', + 'score': 0.80 + }, + { + 'label': '娱乐', + 'score': 0.90 + } + ] + }, + { + 'text': '这是一条经济新闻', + 'predictions: [ + { + 'label': '经济', + 'score': 0.80 + } + ] + } + ] + """ + + +def softmax(x, axis=None): + x_max = np.amax(x, axis=axis, keepdims=True) + exp_x_shifted = np.exp(x - x_max) + return exp_x_shifted / np.sum(exp_x_shifted, axis=axis, keepdims=True) + + +class TextClassificationTask(Task): + """ + The text classfication model to classify text. + NOTE: This task is different from all other tasks that it has no out-of-box zero-shot capabilities. + Instead, it's used as a simple inference pipeline. + + Args: + task (string): The name of task. + model (string): Mode of the classification, Supports ["prompt", "finetune"]. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + task_path (string): The local file path to the model path or a pre-trained model. + is_static_model (string): Whether the model in task path is a static model. + problem_type (str, optional): Select among ["multi_class", "multi_label"] based on the nature of your problem. Default to "multi_class". + multilabel_threshold (float): The probability threshold used for the multi_label setup. Only effective if model = "multi_label". Defaults to 0.5. + max_length (int): Maximum number of tokens for the model. + precision (int): Select among ["fp32", "fp16"]. Default to "fp32". + plm_model_name (str): Pretrained langugae model name for PromptModel. + input_spec [list]: Specify the tensor information for each input parameter of the forward function. + id2label(dict(int,string)): The dictionary to map the predictions from class ids to class names. + batch_size(int): The sample number of a mini-batch. 
+ """ + + def __init__(self, task: str, model: str = "finetune", **kwargs): + super().__init__(task=task, model=model, **kwargs) + self.problem_type = self.kwargs.get("problem_type", "multi_class") + self.multilabel_threshold = self.kwargs.get("multilabel_threshold", 0.5) + self._max_length = self.kwargs.get("max_length", 512) + + self._construct_tokenizer() + if self.model == "prompt": + self._initialize_prompt() + self._check_predictor_type() + self._get_inference_model() + self._construct_id2label() + + def _initialize_prompt(self): + if "plm_model_name" in self.kwargs: + self._plm_model = AutoModelForMaskedLM.from_pretrained(self.kwargs["plm_model_name"]) + elif os.path.isdir(os.path.join(self._task_path, "plm")): + self._plm_model = AutoModelForMaskedLM.from_pretrained(os.path.join(self._task_path, "plm")) + logger.info(f"Load pretrained language model from {self._plm_model}") + else: + raise NotImplementedError( + "Please specify the pretrained language model name (ex. plm_model_name='ernie-3.0-medium-zh')." + ) + self._template = AutoTemplate.load_from(self._task_path, self._tokenizer, self._max_length, self._plm_model) + with open(os.path.join(self._task_path, "verbalizer_config.json"), "r", encoding="utf-8") as fp: + self._label_words = json.load(fp) + self._verbalizer = SoftVerbalizer(self._label_words, self._tokenizer, self._plm_model) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + if "input_spec" in self.kwargs: + self._input_spec = self.kwargs["input_spec"] + elif self.model == "finetune": + if os.path.exists(os.path.join(self._task_path, LEGACY_CONFIG_NAME)): + with open(os.path.join(self._task_path, LEGACY_CONFIG_NAME)) as fb: + init_class = json.load(fb)["init_class"] + elif os.path.exists(os.path.join(self._task_path, CONFIG_NAME)): + with open(os.path.join(self._task_path, CONFIG_NAME)) as fb: + init_class = json.load(fb)["architectures"].pop() + else: + raise IOError( + f"Model configuration file dosen't exist.[task_path] should inclue {LEGACY_CONFIG_NAME} or {CONFIG_NAME}" + ) + + if init_class in ["ErnieMForSequenceClassification"]: + self._input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")] + else: + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + ] + elif self.model == "prompt": + self._input_spec = self._model.get_input_spec() + else: + raise NotImplementedError( + f"'{self.model}' is not a supported model_type. Please select among ['finetune', 'prompt']" + ) + + def _construct_model(self, model: str): + """ + Construct the inference model for the predictor. + """ + if model == "finetune": + model_instance = AutoModelForSequenceClassification.from_pretrained(self._task_path) + elif model == "prompt": + model_instance = PromptModelForSequenceClassification(self._plm_model, self._template, self._verbalizer) + state_dict = paddle.load(os.path.join(self._task_path, "model_state.pdparams"), return_numpy=True) + model_instance.set_state_dict(state_dict) + # release memory + del state_dict + else: + raise NotImplementedError( + f"'{model}' is not a supported model_type. Please select among ['finetune', 'prompt']" + ) + + # Load the model parameter for the predict + model_instance.eval() + self._model = model_instance + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. 
+ """ + self._tokenizer = AutoTokenizer.from_pretrained(self._task_path) + + def _construct_id2label(self): + if "id2label" in self.kwargs: + id2label = self.kwargs["id2label"] + elif os.path.exists(os.path.join(self._task_path, "id2label.json")): + id2label_path = os.path.join(self._task_path, "id2label.json") + with open(id2label_path) as fb: + id2label = json.load(fb) + logger.info(f"Load id2label from {id2label_path}.") + elif self.model == "prompt" and os.path.exists(os.path.join(self._task_path, "verbalizer_config.json")): + label_list = sorted(list(self._verbalizer.label_words.keys())) + id2label = {} + for i, l in enumerate(label_list): + id2label[i] = l + logger.info("Load id2label from verbalizer.") + elif self.model == "finetune" and os.path.exists(os.path.join(self._task_path, CONFIG_NAME)): + config_path = os.path.join(self._task_path, CONFIG_NAME) + with open(config_path) as fb: + config = json.load(fb) + if "id2label" in config: + id2label = config["id2label"] + logger.info(f"Load id2label from {config_path}.") + else: + id2label = None + else: + id2label = None + + if id2label is None: + self.id2label = id2label + else: + self.id2label = {} + for i in id2label: + self.id2label[int(i)] = id2label[i] + + def _preprocess(self, inputs: Union[str, List[str]]) -> Dict[str, Any]: + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + + if self.model == "finetune": + collator = DataCollatorWithPadding(self._tokenizer, return_tensors="np") + tokenized_inputs = [self._tokenizer(i, max_length=self._max_length, truncation=True) for i in inputs] + batches = [tokenized_inputs[idx : idx + batch_size] for idx in range(0, len(tokenized_inputs), batch_size)] + elif self.model == "prompt": + collator = PromptDataCollatorWithPadding( + self._tokenizer, padding=True, return_tensors="np", return_attention_mask=True + ) + part_text = "text" + for part in self._template.prompt: + if "text" in part: + part_text = part["text"] + template_inputs = [self._template({part_text: x}) for x in inputs] + batches = [template_inputs[idx : idx + batch_size] for idx in range(0, len(template_inputs), batch_size)] + else: + raise NotImplementedError( + f"'{self.model}' is not a supported model_type. Please select among ['finetune', 'prompt']" + ) + outputs = {} + outputs["text"] = inputs + outputs["batches"] = [collator(batch) for batch in batches] + + return outputs + + def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """ + Run the task model from the outputs of the `_tokenize` function. 
+ """ + # TODO: support hierachical classification + outputs = {} + outputs["text"] = inputs["text"] + outputs["batch_logits"] = [] + dtype_dict = { + "input_ids": "int64", + "token_type_ids": "int64", + "position_ids": "int64", + "attention_mask": "float32", + "masked_positions": "int64", + "soft_token_ids": "int64", + "encoder_ids": "int64", + } + with static_mode_guard(): + for batch in inputs["batches"]: + if "attention_mask" in batch: + input_name = "attention_mask" + if batch[input_name].ndim == 2: + batch[input_name] = (1 - batch[input_name][:, np.newaxis, np.newaxis, :]) * -1e4 + elif batch[input_name].ndim != 4: + raise ValueError( + "Expect attention mask with ndim=2 or 4, but get ndim={}".format(batch[input_name].ndim) + ) + if self._predictor_type == "paddle-inference": + for i, input_name in enumerate(self.predictor.get_input_names()): + self.input_handles[i].copy_from_cpu(batch[input_name].astype(dtype_dict[input_name])) + self.predictor.run() + logits = self.output_handle[0].copy_to_cpu().tolist() + else: + input_dict = {} + for input_name in self.input_handler: + input_dict[input_name] = batch[input_name].astype(dtype_dict[input_name]) + logits = self.predictor.run(None, input_dict)[0].tolist() + outputs["batch_logits"].append(logits) + return outputs + + def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """ + This function converts the model logits output to class score and predictions + """ + # TODO: support hierachical classification + postprocessed_outputs = [] + for logits in inputs["batch_logits"]: + if self.problem_type == "multi_class": + if isinstance(logits, paddle.Tensor): # dygraph + scores = F.softmax(logits, axis=-1).numpy() + labels = paddle.argmax(logits, axis=-1).numpy() + else: # static graph + scores = np_softmax(logits, axis=-1) + labels = np.argmax(logits, axis=-1) + for score, label in zip(scores, labels): + postprocessed_output = {} + if self.id2label is None: + postprocessed_output["predictions"] = [{"label": label, "score": score[label]}] + else: + postprocessed_output["predictions"] = [{"label": self.id2label[label], "score": score[label]}] + postprocessed_outputs.append(postprocessed_output) + elif self.problem_type == "multi_label": # multi_label + if isinstance(logits, paddle.Tensor): # dygraph + scores = F.sigmoid(logits).numpy() + else: # static graph + scores = np_sigmoid(logits) + for score in scores: + postprocessed_output = {} + postprocessed_output["predictions"] = [] + for i, class_score in enumerate(score): + if class_score > self.multilabel_threshold: + if self.id2label is None: + postprocessed_output["predictions"].append({"label": i, "score": class_score}) + else: + postprocessed_output["predictions"].append( + {"label": self.id2label[i], "score": class_score} + ) + postprocessed_outputs.append(postprocessed_output) + else: + raise NotImplementedError( + f"'{self.problem_type}' is not a supported problem type. 
Please select among ['multi_class', 'multi_label']"
+            )
+        for i, postprocessed_output in enumerate(postprocessed_outputs):
+            postprocessed_output["text"] = inputs["text"][i]
+        return postprocessed_outputs
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_correction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_correction.py
new file mode 100644
index 000000000..d7fd11c62
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_correction.py
@@ -0,0 +1,265 @@
+# coding:utf-8
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import paddle
+
+from ..data import Pad, Stack, Tuple, Vocab
+from ..transformers import ErnieModel, ErnieTokenizer, is_chinese_char
+from .models import ErnieForCSC
+from .task import Task
+from .utils import static_mode_guard
+
+usage = r"""
+    from paddlenlp import Taskflow
+
+    text_correction = Taskflow("text_correction")
+    text_correction('遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。')
+    '''
+    [{'source': '遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
+      'target': '遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
+      'errors': [{'position': 3, 'correction': {'竟': '境'}}]}
+    ]
+    '''
+
+    text_correction(['遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
+                     '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。'])
+    '''
+    [{'source': '遇到逆竟时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
+      'target': '遇到逆境时,我们必须勇于面对,而且要愈挫愈勇,这样我们才能朝著成功之路前进。',
+      'errors': [{'position': 3, 'correction': {'竟': '境'}}]},
+     {'source': '人生就是如此,经过磨练才能让自己更加拙壮,才能使自己更加乐观。',
+      'target': '人生就是如此,经过磨练才能让自己更加茁壮,才能使自己更加乐观。',
+      'errors': [{'position': 18, 'correction': {'拙': '茁'}}]}
+    ]
+    '''
+
+    """
+
+TASK_MODEL_MAP = {"ernie-csc": "ernie-1.0"}
+
+
+class CSCTask(Task):
+    """
+    The Chinese spelling correction (CSC) model to detect and correct spelling errors in the input text.
+    Args:
+        task(string): The name of task.
+        model(string): The model name in the task.
+        kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
+ """ + + resource_files_names = {"model_state": "model_state.pdparams", "pinyin_vocab": "pinyin_vocab.txt"} + resource_files_urls = { + "ernie-csc": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_correction/ernie-csc/model_state.pdparams", + "cdc53e7e3985ffc78fedcdf8e6dca6d2", + ], + "pinyin_vocab": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_correction/ernie-csc/pinyin_vocab.txt", + "5599a8116b6016af573d08f8e686b4b2", + ], + } + } + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._usage = usage + self._check_task_files() + self._construct_vocabs() + self._get_inference_model() + self._construct_tokenizer(model) + try: + import pypinyin + except ImportError: + raise ImportError("Please install the dependencies first, pip install pypinyin --upgrade") + self._pypinyin = pypinyin + self._batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype="int64"), # input + Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id, dtype="int64"), # segment + Pad( + axis=0, pad_val=self._pinyin_vocab.token_to_idx[self._pinyin_vocab.pad_token], dtype="int64" + ), # pinyin + Stack(axis=0, dtype="int64"), # length + ): [data for data in fn(samples)] + self._num_workers = self.kwargs["num_workers"] if "num_workers" in self.kwargs else 0 + self._batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + self._lazy_load = self.kwargs["lazy_load"] if "lazy_load" in self.kwargs else False + self._max_seq_len = self.kwargs["max_seq_len"] if "max_seq_len" in self.kwargs else 128 + self._split_sentence = self.kwargs["split_sentence"] if "split_sentence" in self.kwargs else False + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="pinyin_ids"), + ] + + def _construct_vocabs(self): + pinyin_vocab_path = os.path.join(self._task_path, "pinyin_vocab.txt") + self._pinyin_vocab = Vocab.load_vocabulary(pinyin_vocab_path, unk_token="[UNK]", pad_token="[PAD]") + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + ernie = ErnieModel.from_pretrained(TASK_MODEL_MAP[model]) + model_instance = ErnieForCSC( + ernie, + pinyin_vocab_size=len(self._pinyin_vocab), + pad_pinyin_id=self._pinyin_vocab[self._pinyin_vocab.pad_token], + ) + # Load the model parameter for the predict + model_path = os.path.join(self._task_path, "model_state.pdparams") + state_dict = paddle.load(model_path) + model_instance.set_state_dict(state_dict) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. 
+ """ + self._tokenizer = ErnieTokenizer.from_pretrained(TASK_MODEL_MAP[model]) + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + input_texts = self._check_input_text(inputs) + examples = [] + texts = [] + max_predict_len = self._max_seq_len - 2 + short_input_texts, self.input_mapping = self._auto_splitter( + input_texts, max_predict_len, split_sentence=self._split_sentence + ) + for text in short_input_texts: + if not (isinstance(text, str) and len(text) > 0): + continue + example = {"source": text.strip()} + input_ids, token_type_ids, pinyin_ids, length = self._convert_example(example) + examples.append((input_ids, token_type_ids, pinyin_ids, length)) + texts.append(example["source"]) + + batch_examples = [examples[idx : idx + self._batch_size] for idx in range(0, len(examples), self._batch_size)] + batch_texts = [ + short_input_texts[idx : idx + self._batch_size] for idx in range(0, len(examples), self._batch_size) + ] + outputs = {} + outputs["batch_examples"] = batch_examples + outputs["batch_texts"] = batch_texts + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. + """ + results = [] + with static_mode_guard(): + for examples in inputs["batch_examples"]: + token_ids, token_type_ids, pinyin_ids, lengths = self._batchify_fn(examples) + self.input_handles[0].copy_from_cpu(token_ids) + self.input_handles[1].copy_from_cpu(pinyin_ids) + self.predictor.run() + det_preds = self.output_handle[0].copy_to_cpu() + char_preds = self.output_handle[1].copy_to_cpu() + + batch_result = [] + for i in range(len(lengths)): + batch_result.append((det_preds[i], char_preds[i], lengths[i])) + results.append(batch_result) + inputs["batch_results"] = results + return inputs + + def _postprocess(self, inputs): + """ + The model output is the logits and probs, this function will convert the model output to raw text. 
+ """ + results = [] + + for examples, texts, temp_results in zip( + inputs["batch_examples"], inputs["batch_texts"], inputs["batch_results"] + ): + for i in range(len(examples)): + result = {} + det_pred, char_preds, length = temp_results[i] + pred_result = self._parse_decode(texts[i], char_preds, det_pred, length) + result["source"] = texts[i] + result["target"] = "".join(pred_result) + results.append(result) + results = self._auto_joiner(results, self.input_mapping, is_dict=True) + for result in results: + errors_result = [] + for i, (source_token, target_token) in enumerate(zip(result["source"], result["target"])): + if source_token != target_token: + errors_result.append({"position": i, "correction": {source_token: target_token}}) + result["errors"] = errors_result + return results + + def _convert_example(self, example): + source = example["source"] + words = list(source) + length = len(words) + words = ["[CLS]"] + words + ["[SEP]"] + input_ids = self._tokenizer.convert_tokens_to_ids(words) + token_type_ids = [0] * len(input_ids) + + # Use pad token in pinyin emb to map word emb [CLS], [SEP] + pinyins = self._pypinyin.lazy_pinyin(source, style=self._pypinyin.Style.TONE3, neutral_tone_with_five=True) + + pinyin_ids = [0] + # Align pinyin and chinese char + pinyin_offset = 0 + for i, word in enumerate(words[1:-1]): + pinyin = "[UNK]" if word != "[PAD]" else "[PAD]" + if len(word) == 1 and is_chinese_char(ord(word)): + while pinyin_offset < len(pinyins): + current_pinyin = pinyins[pinyin_offset][:-1] + pinyin_offset += 1 + if current_pinyin in self._pinyin_vocab: + pinyin = current_pinyin + break + pinyin_ids.append(self._pinyin_vocab[pinyin]) + + pinyin_ids.append(0) + assert len(input_ids) == len(pinyin_ids), "length of input_ids must be equal to length of pinyin_ids" + return input_ids, token_type_ids, pinyin_ids, length + + def _parse_decode(self, words, corr_preds, det_preds, lengths): + UNK = self._tokenizer.unk_token + UNK_id = self._tokenizer.convert_tokens_to_ids(UNK) + + corr_pred = corr_preds[1 : 1 + lengths].tolist() + det_pred = det_preds[1 : 1 + lengths].tolist() + words = list(words) + rest_words = [] + max_seq_length = self._max_seq_len - 2 + if len(words) > max_seq_length: + rest_words = words[max_seq_length:] + words = words[:max_seq_length] + + pred_result = "" + for j, word in enumerate(words): + candidates = self._tokenizer.convert_ids_to_tokens( + corr_pred[j] if corr_pred[j] < self._tokenizer.vocab_size else UNK_id + ) + word_icc = is_chinese_char(ord(word)) + cand_icc = is_chinese_char(ord(candidates)) if len(candidates) == 1 else False + if not word_icc or det_pred[j] == 0 or candidates in [UNK, "[PAD]"] or (word_icc and not cand_icc): + pred_result += word + else: + pred_result += candidates.lstrip("##") + pred_result += "".join(rest_words) + return pred_result diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_feature_extraction.py new file mode 100644 index 000000000..5d098eef0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_feature_extraction.py @@ -0,0 +1,585 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import numpy as np +import paddle + +from paddlenlp.data import DataCollatorWithPadding +from paddlenlp.transformers import AutoModel, AutoTokenizer, ErnieDualEncoder + +from ..utils.log import logger +from .task import Task +from .utils import dygraph_mode_guard, static_mode_guard + +ENCODER_TYPE = { + "rocketqa-zh-dureader-query-encoder": "query", + "rocketqa-zh-dureader-para-encoder": "paragraph", + "rocketqa-zh-base-query-encoder": "query", + "rocketqa-zh-base-para-encoder": "paragraph", + "rocketqa-zh-medium-query-encoder": "query", + "rocketqa-zh-medium-para-encoder": "paragraph", + "rocketqa-zh-mini-query-encoder": "query", + "rocketqa-zh-mini-para-encoder": "paragraph", + "rocketqa-zh-micro-query-encoder": "query", + "rocketqa-zh-micro-para-encoder": "paragraph", + "rocketqa-zh-nano-query-encoder": "query", + "rocketqa-zh-nano-para-encoder": "paragraph", + "rocketqav2-en-marco-query-encoder": "query", + "rocketqav2-en-marco-para-encoder": "paragraph", + "ernie-search-base-dual-encoder-marco-en": "query_paragraph", +} + + +usage = r""" + from paddlenlp import Taskflow + import paddle.nn.functional as F + # Text feature_extraction with rocketqa-zh-base-query-encoder + text_encoder = Taskflow("feature_extraction", model='rocketqa-zh-base-query-encoder') + text_embeds = text_encoder(['春天适合种什么花?','谁有狂三这张高清的?']) + text_features1 = text_embeds["features"] + print(text_features1) + ''' + Tensor(shape=[2, 768], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[ 0.27640465, -0.13405125, 0.00612330, ..., -0.15600294, + -0.18932408, -0.03029604], + [-0.12041329, -0.07424965, 0.07895312, ..., -0.17068857, + 0.04485796, -0.18887770]]) + ''' + text_embeds = text_encoder('春天适合种什么菜?') + text_features2 = text_embeds["features"] + print(text_features2) + ''' + Tensor(shape=[1, 768], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [[ 0.32578075, -0.02398480, -0.18929179, -0.18639392, -0.04062131, + 0.06708499, -0.04631376, -0.41177100, -0.23074438, -0.23627219, + ...... 
+ ''' + probs = F.cosine_similarity(text_features1, text_features2) + print(probs) + ''' + Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + [0.86455142, 0.41222256]) + ''' + """ + + +class TextFeatureExtractionTask(Task): + + resource_files_names = { + "model_state": "model_state.pdparams", + "config": "config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + + resource_files_urls = { + "rocketqa-zh-dureader-query-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-dureader-query-encoder/model_state.pdparams", + "6125930530fd55ed715b0595e65789aa", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-dureader-query-encoder/config.json", + "efc1280069bb22b5bd06dc44b780bc6a", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-dureader-query-encoder/vocab.txt", + "062f696cad47bb62da86d8ae187b0ef4", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-dureader-query-encoder/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-dureader-query-encoder/tokenizer_config.json", + "3a50349b8514e744fed72e59baca51b5", + ], + }, + "rocketqa-zh-base-query-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-base-query-encoder/model_state.pdparams", + "3bb1a7870792146c6dd2fa47a45e15cc", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-base-query-encoder/config.json", + "be88115dd8a00e9de6b44f8c9a055e1a", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-base-query-encoder/vocab.txt", + "1c1c1f4fd93c5bed3b4eebec4de976a8", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-base-query-encoder/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/feature_extraction/rocketqa-zh-base-query-encoder/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + } + + def __init__( + self, + task: str = None, + model: str = None, + batch_size: int = 1, + max_seq_len: int = 128, + _static_mode: bool = True, + return_tensors: str = "pd", + reinitialize: bool = False, + share_parameters: bool = False, + is_paragraph: bool = False, + output_emb_size: Optional[int] = None, + **kwargs + ): + super().__init__(task=task, model=model, **kwargs) + self._seed = None + self.export_type = "text" + self._batch_size = batch_size + self.max_seq_len = max_seq_len + self.model = model + self._static_mode = _static_mode + self.return_tensors = return_tensors + + self.reinitialize = reinitialize + self.share_parameters = share_parameters + self.output_emb_size = output_emb_size + self.is_paragraph = is_paragraph + self._check_para_encoder() + # self._check_task_files() + self._check_predictor_type() + self._construct_tokenizer() + # self._get_inference_model() + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(model) + + def _check_para_encoder(self): + if self.model in ENCODER_TYPE: + if ENCODER_TYPE[self.model] == "paragraph": + self.is_paragraph = True + else: + self.is_paragraph = False + 
else: + self.is_paragraph = False + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + # self._model = ErnieDualEncoder(self._task_path) + self._model = ErnieDualEncoder( + query_model_name_or_path=self.model, + output_emb_size=self.output_emb_size, + reinitialize=self.reinitialize, + share_parameters=self.share_parameters, + ) + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = AutoTokenizer.from_pretrained(self.model) + if self._static_mode: + self._collator = DataCollatorWithPadding(self._tokenizer, return_tensors="np") + else: + self._collator = DataCollatorWithPadding(self._tokenizer, return_tensors="pd") + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + ] + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + + def _parse_batch(batch_examples): + if self.is_paragraph: + # The input of the passage encoder is [CLS][SEP]...[SEP]. + tokenized_inputs = self._tokenizer( + text=[""] * len(batch_examples), + text_pair=batch_examples, + padding="max_length", + truncation=True, + max_seq_len=self.max_seq_len, + ) + else: + tokenized_inputs = self._tokenizer( + text=batch_examples, + padding="max_length", + truncation=True, + max_seq_len=self.max_seq_len, + ) + return tokenized_inputs + + # Separates data into some batches. + one_batch = [] + for example in data: + one_batch.append(example) + if len(one_batch) == batch_size: + yield _parse_batch(one_batch) + one_batch = [] + if one_batch: + yield _parse_batch(one_batch) + + def _preprocess(self, inputs): + """ + Transform the raw inputs to the model inputs, two steps involved: + 1) Transform the raw text/image to token ids/pixel_values. + 2) Generate the other model inputs from the raw text/image and token ids/pixel_values. + """ + inputs = self._check_input_text(inputs) + batches = self._batchify(inputs, self._batch_size) + outputs = {"batches": batches, "inputs": inputs} + return outputs + + def _run_model(self, inputs, **kwargs): + """ + Run the task model from the outputs of the `_preprocess` function. 
+ """ + all_feats = [] + if self._static_mode: + with static_mode_guard(): + for batch_inputs in inputs["batches"]: + batch_inputs = self._collator(batch_inputs) + if self._predictor_type == "paddle-inference": + if "input_ids" in batch_inputs: + self.input_handles[0].copy_from_cpu(batch_inputs["input_ids"]) + self.input_handles[1].copy_from_cpu(batch_inputs["token_type_ids"]) + self.predictor.run() + text_features = self.output_handle[0].copy_to_cpu() + all_feats.append(text_features) + else: + # onnx mode + if "input_ids" in batch_inputs: + input_dict = {} + input_dict["input_ids"] = batch_inputs["input_ids"] + input_dict["token_type_ids"] = batch_inputs["token_type_ids"] + text_features = self.predictor.run(None, input_dict)[0].tolist() + all_feats.append(text_features) + + else: + with dygraph_mode_guard(): + for batch_inputs in inputs["batches"]: + batch_inputs = self._collator(batch_inputs) + text_features = self._model.get_pooled_embedding( + input_ids=batch_inputs["input_ids"], token_type_ids=batch_inputs["token_type_ids"] + ) + all_feats.append(text_features.detach().numpy()) + inputs.update({"features": all_feats}) + return inputs + + def _postprocess(self, inputs): + inputs["features"] = np.concatenate(inputs["features"], axis=0) + if self.return_tensors == "pd": + inputs["features"] = paddle.to_tensor(inputs["features"]) + return inputs + + def _convert_dygraph_to_static(self): + """ + Convert the dygraph model to static model. + """ + assert ( + self._model is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + self._input_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." + logger.info("Converting to the inference model cost a little time.") + + static_model = paddle.jit.to_static(self._model.get_pooled_embedding, input_spec=self._input_spec) + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) + + +def text_length(text): + # {key: value} case + if isinstance(text, dict): + return len(next(iter(text.values()))) + # Object has no len() method + elif not hasattr(text, "__len__"): + return 1 + # Empty string or list of ints + elif len(text) == 0 or isinstance(text[0], int): + return len(text) + # Sum of length of individual strings + else: + return sum([len(t) for t in text]) + + +class SentenceFeatureExtractionTask(Task): + + resource_files_names = { + "model_state": "model_state.pdparams", + "config": "config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + + def __init__( + self, + task: str = None, + model: str = None, + batch_size: int = 1, + max_seq_len: int = 512, + _static_mode: bool = True, + return_tensors: str = "pd", + pooling_mode: str = "cls_token", + **kwargs + ): + super().__init__( + task=task, + model=model, + pooling_mode=pooling_mode, + **kwargs, + ) + self._seed = None + self.export_type = "text" + self._batch_size = batch_size + self.max_seq_len = max_seq_len + self.model = model + self._static_mode = _static_mode + self.return_tensors = return_tensors + self.pooling_mode = pooling_mode + self._check_predictor_type() + self._construct_tokenizer() + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(model) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. 
+ """ + self._model = AutoModel.from_pretrained(self.model) + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. + """ + self._tokenizer = AutoTokenizer.from_pretrained(self.model) + self.pad_token_id = self._tokenizer.convert_tokens_to_ids(self._tokenizer.pad_token) + if self._static_mode: + self._collator = DataCollatorWithPadding(self._tokenizer, return_tensors="np") + else: + self._collator = DataCollatorWithPadding(self._tokenizer, return_tensors="pd") + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + ] + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + + def _parse_batch(batch_examples, max_seq_len=None): + if isinstance(batch_examples[0], str): + to_tokenize = [batch_examples] + else: + batch1, batch2 = [], [] + for text_tuple in batch_examples: + batch1.append(text_tuple[0]) + batch2.append(text_tuple[1]) + to_tokenize = [batch1, batch2] + to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize] + if max_seq_len is None: + max_seq_len = self.max_seq_len + tokenized_inputs = self._tokenizer( + to_tokenize[0], + padding=True, + truncation="longest_first", + max_seq_len=max_seq_len, + ) + return tokenized_inputs + + # Seperates data into some batches. + one_batch = [] + self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data]) + sentences_sorted = [data[idx] for idx in self.length_sorted_idx] + + for example in range(len(sentences_sorted)): + one_batch.append(sentences_sorted[example]) + if len(one_batch) == batch_size: + yield _parse_batch(one_batch) + one_batch = [] + if one_batch: + yield _parse_batch(one_batch) + + def _preprocess(self, inputs): + """ + Transform the raw inputs to the model inputs, two steps involved: + 1) Transform the raw text/image to token ids/pixel_values. + 2) Generate the other model inputs from the raw text/image and token ids/pixel_values. + """ + inputs = self._check_input_text(inputs) + batches = self._batchify(inputs, self._batch_size) + outputs = {"batches": batches, "inputs": inputs} + return outputs + + def _run_model(self, inputs, **kwargs): + """ + Run the task model from the outputs of the `_preprocess` function. 
+ """ + pooling_mode = kwargs.get("pooling_mode", None) + if pooling_mode is None: + pooling_mode = self.pooling_mode + all_feats = [] + if self._static_mode: + with static_mode_guard(): + for batch_inputs in inputs["batches"]: + batch_inputs = self._collator(batch_inputs) + if self._predictor_type == "paddle-inference": + if "input_ids" in batch_inputs: + self.input_handles[0].copy_from_cpu(batch_inputs["input_ids"]) + self.input_handles[1].copy_from_cpu(batch_inputs["token_type_ids"]) + self.predictor.run() + token_embeddings = self.output_handle[0].copy_to_cpu() + if pooling_mode == "max_tokens": + attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype( + token_embeddings.dtype + ) + input_mask_expanded = np.expand_dims(attention_mask, -1).repeat( + token_embeddings.shape[-1], axis=-1 + ) + token_embeddings[input_mask_expanded == 0] = -1e9 + max_over_time = np.max(token_embeddings, 1) + all_feats.append(max_over_time) + elif pooling_mode == "mean_tokens" or pooling_mode == "mean_sqrt_len_tokens": + attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype( + token_embeddings.dtype + ) + input_mask_expanded = np.expand_dims(attention_mask, -1).repeat( + token_embeddings.shape[-1], axis=-1 + ) + sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=np.max(sum_mask)) + if pooling_mode == "mean_tokens": + all_feats.append(sum_embeddings / sum_mask) + elif pooling_mode == "mean_sqrt_len_tokens": + all_feats.append(sum_embeddings / np.sqrt(sum_mask)) + else: + cls_token = token_embeddings[:, 0] + all_feats.append(cls_token) + else: + # onnx mode + if "input_ids" in batch_inputs: + input_dict = {} + input_dict["input_ids"] = batch_inputs["input_ids"] + input_dict["token_type_ids"] = batch_inputs["token_type_ids"] + token_embeddings = self.predictor.run(None, input_dict)[0] + if pooling_mode == "max_tokens": + attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype( + token_embeddings.dtype + ) + input_mask_expanded = np.expand_dims(attention_mask, -1).repeat( + token_embeddings.shape[-1], axis=-1 + ) + token_embeddings[input_mask_expanded == 0] = -1e9 + max_over_time = np.max(token_embeddings, 1) + all_feats.append(max_over_time) + elif pooling_mode == "mean_tokens" or pooling_mode == "mean_sqrt_len_tokens": + attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype( + token_embeddings.dtype + ) + input_mask_expanded = np.expand_dims(attention_mask, -1).repeat( + token_embeddings.shape[-1], axis=-1 + ) + sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=np.max(sum_mask)) + if pooling_mode == "mean_tokens": + all_feats.append(sum_embeddings / sum_mask) + elif pooling_mode == "mean_sqrt_len_tokens": + all_feats.append(sum_embeddings / np.sqrt(sum_mask)) + else: + cls_token = token_embeddings[:, 0] + all_feats.append(cls_token) + else: + with dygraph_mode_guard(): + for batch_inputs in inputs["batches"]: + batch_inputs = self._collator(batch_inputs) + token_embeddings = self._model(input_ids=batch_inputs["input_ids"])[0] + if pooling_mode == "max_tokens": + attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype( + self._model.pooler.dense.weight.dtype + ) + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.shape) + token_embeddings[input_mask_expanded == 0] = -1e9 + max_over_time = 
paddle.max(token_embeddings, 1).detach().numpy() + all_feats.append(max_over_time) + + elif pooling_mode == "mean_tokens" or pooling_mode == "mean_sqrt_len_tokens": + attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype( + self._model.pooler.dense.weight.dtype + ) + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.shape) + sum_embeddings = paddle.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = input_mask_expanded.sum(1) + sum_mask = paddle.clip(sum_mask, min=1e-9) + if pooling_mode == "mean_tokens": + text_features = sum_embeddings / sum_mask + all_feats.append(text_features.detach().numpy()) + elif pooling_mode == "mean_sqrt_len_tokens": + text_features = sum_embeddings / paddle.sqrt(sum_mask) + all_feats.append(text_features.detach().numpy()) + else: + cls_token = token_embeddings[:, 0].detach().numpy() + all_feats.append(cls_token) + inputs.update({"features": all_feats}) + return inputs + + def _postprocess(self, inputs): + inputs["features"] = np.concatenate(inputs["features"], axis=0) + inputs["features"] = [inputs["features"][idx] for idx in np.argsort(self.length_sorted_idx)] + if self.return_tensors == "pd": + inputs["features"] = paddle.to_tensor(inputs["features"]) + return inputs + + def _convert_dygraph_to_static(self): + """ + Convert the dygraph model to static model. + """ + assert ( + self._model is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + self._input_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." + logger.info("Converting to the inference model cost a little time.") + + static_model = paddle.jit.to_static(self._model, input_spec=self._input_spec) + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_generation.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_generation.py new file mode 100644 index 000000000..0eeaacf45 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_generation.py @@ -0,0 +1,158 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from ..data import Pad, Stack, Tuple +from ..transformers import GPTChineseTokenizer, GPTForGreedyGeneration, GPTTokenizer +from .task import Task +from .utils import download_file, static_mode_guard + +usage = r""" + """ + +URLS = { + "gpt-cpm-large-cn": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_generation/gpt-cpm/gpt-cpm-large-cn_params.tar", + "5aad6f81053cfdbba4797f044fcf66d1", + ], +} + + +class TextGenerationTask(Task): + """ + The text generation model to predict the question or chinese poetry. + Args: + task(string): The name of task. + model(string): The model name in the task. 
+ kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + # Default to static mode + self._static_mode = True + self._usage = usage + if self._static_mode: + download_file(self._task_path, "gpt-cpm-large-cn_params.tar", URLS[self.model][0], URLS[self.model][1]) + self._get_inference_model() + else: + self._construct_model(model) + self._construct_tokenizer(model) + self.kwargs["generation_task"] = task + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_ids")] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = GPTForGreedyGeneration.from_pretrained(self.model, max_predict_len=32) + # Load the model parameter for the predict + model_instance.eval() + self._model = model_instance + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + if self.model == "gpt-cpm-large-cn": + tokenizer_instance = GPTChineseTokenizer.from_pretrained(model) + else: + tokenizer_instance = GPTTokenizer.from_pretrained(model) + + self._tokenizer = tokenizer_instance + + def _preprocess(self, inputs, padding=True, add_special_tokens=True): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + batch_size = self.kwargs["batch_size"] if "batch_size" in self.kwargs else 1 + generation_task = self.kwargs["generation_task"] if "generation_task" in self.kwargs else "question_answering" + + def select_few_shot_input(model_name, generation_task): + pre_input = "" + if generation_task not in ["question_answering", "poetry_generation"]: + raise ValueError("The generation task must be question or poetry") + if model_name == "gpt-cpm-large-cn": + if generation_task == "question_answering": + pre_input = "问题:中国的首都是哪里?答案:北京。\n问题:{} 答案:" + else: + pre_input = "默写古诗: 大漠孤烟直,长河落日圆。\n{}" + return pre_input + + pre_input = select_few_shot_input(self.model, generation_task) + + examples = [] + filter_inputs = [] + for input_text in inputs: + if not (isinstance(input_text, str) and len(input_text) > 0): + continue + filter_inputs.append(input_text) + few_shot_input = pre_input.format(input_text) + ids = self._tokenizer(few_shot_input)["input_ids"] + examples.append((ids, len(ids))) + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=0, dtype="int64"), + Stack(dtype="int64"), # seq_len + ): fn(samples) + + batches = [examples[idx : idx + batch_size] for idx in range(0, len(examples), batch_size)] + outputs = {} + outputs["text"] = filter_inputs + outputs["data_loader"] = batches + self._batchify_fn = batchify_fn + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. 
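+
+        A minimal sketch of one batch consumed below, assuming two toy examples of
+        token ids already produced by `_preprocess`:
+
+            examples = [([1, 5, 8], 3), ([1, 5], 2)]    # (ids, seq_len) pairs
+            ids, seq_len = self._batchify_fn(examples)  # ids padded with 0, seq_len stacked
+            # ids.shape == (2, 3), seq_len.tolist() == [3, 2]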
+ """ + results = [] + lens = [] + with static_mode_guard(): + for batch in inputs["data_loader"]: + ids, seq_len = self._batchify_fn(batch) + self.input_handles[0].copy_from_cpu(ids) + self.predictor.run() + result = self.output_handle[0].copy_to_cpu().tolist() + results.extend(result) + lens.extend(seq_len.tolist()) + inputs["results"] = results + inputs["lens"] = lens + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. + """ + batch_out = [] + preds = inputs["results"] + for index in range(0, len(preds)): + seq_len = inputs["lens"][index] + single_result = {} + single_result["text"] = inputs["text"][index] + single_result["answer"] = self._tokenizer.convert_ids_to_string(preds[index][seq_len:-1]) + batch_out.append(single_result) + return batch_out diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_similarity.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_similarity.py new file mode 100644 index 000000000..579212521 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_similarity.py @@ -0,0 +1,353 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +from paddlenlp.transformers import AutoModel, AutoTokenizer + +from ..data import Pad, Tuple +from ..transformers import ErnieCrossEncoder, ErnieTokenizer +from ..utils.log import logger +from .task import Task +from .utils import static_mode_guard + +usage = r""" + from paddlenlp import Taskflow + + similarity = Taskflow("text_similarity") + similarity([["世界上什么东西最小", "世界上什么东西最小?"]]) + ''' + [{'text1': '世界上什么东西最小', 'text2': '世界上什么东西最小?', 'similarity': 0.992725}] + ''' + + similarity = Taskflow("text_similarity", batch_size=2) + similarity([["光眼睛大就好看吗", "眼睛好看吗?"], ["小蝌蚪找妈妈怎么样", "小蝌蚪找妈妈是谁画的"]]) + ''' + [{'text1': '光眼睛大就好看吗', 'text2': '眼睛好看吗?', 'similarity': 0.74502707}, {'text1': '小蝌蚪找妈妈怎么样', 'text2': '小蝌蚪找妈妈是谁画的', 'similarity': 0.8192149}] + ''' + """ +MATCH_TYPE = { + "rocketqa-zh-dureader-cross-encoder": "matching", + "rocketqa-base-cross-encoder": "matching", + "rocketqa-medium-cross-encoder": "matching", + "rocketqa-mini-cross-encoder": "matching", + "rocketqa-micro-cross-encoder": "matching", + "rocketqa-nano-cross-encoder": "matching", + "rocketqav2-en-marco-cross-encoder": "matching_v2", + "ernie-search-large-cross-encoder-marco-en": "matching_v3", +} + + +class TextSimilarityTask(Task): + """ + Text similarity task using SimBERT to predict the similarity of sentence pair. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "model_config": "model_config.json", + } + resource_files_urls = { + "simbert-base-chinese": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_similarity/simbert-base-chinese/model_state.pdparams", + "27d9ef240c2e8e736bdfefea52af2542", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/text_similarity/simbert-base-chinese/model_config.json", + "1254bbd7598457a9dad0afcb2e24b70c", + ], + }, + "rocketqa-zh-dureader-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-zh-dureader-cross-encoder/model_state.pdparams", + "88bc3e1a64992a1bdfe4044ecba13bc7", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-zh-dureader-cross-encoder/model_config.json", + "b69083c2895e8f68e1a10467b384daab", + ], + }, + "rocketqa-base-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-base-cross-encoder/model_state.pdparams", + "6d845a492a2695e62f2be79f8017be92", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-base-cross-encoder/model_config.json", + "18ce260ede18bc3cb28dcb2e7df23b1a", + ], + }, + "rocketqa-medium-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-medium-cross-encoder/model_state.pdparams", + "4b929f4fc11a1df8f59fdf2784e23fa7", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-medium-cross-encoder/model_config.json", + "10997db96bc86e29cd113e1bf58989d7", + ], + }, + "rocketqa-mini-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-mini-cross-encoder/model_state.pdparams", + "c411111df990132fb88c070d8b8cf3f7", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-mini-cross-encoder/model_config.json", + "271e6d779acbe8e8acdd596b1c835546", + ], + }, + "rocketqa-micro-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-micro-cross-encoder/model_state.pdparams", + "3d643ff7d6029c8ceab5653680167dc0", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-micro-cross-encoder/model_config.json", + "b32d1a932d8c367fab2a6216459dd0a7", + ], + }, + "rocketqa-nano-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-nano-cross-encoder/model_state.pdparams", + "4c1d36e5e94f5af09f665fc7ad0be140", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqa-nano-cross-encoder/model_config.json", + "dcff14cd671e1064be2c5d63734098bb", + ], + }, + "rocketqav2-en-marco-cross-encoder": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqav2-en-marco-cross-encoder/model_state.pdparams", + "a5afc77b6a63fc32a1beca3010f40f32", + ], + "model_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/rocketqav2-en-marco-cross-encoder/config.json", + "8f5d5c71c8a891b68d0402a13e38b6f9", + ], + }, + "ernie-search-large-cross-encoder-marco-en": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/ernie-search-large-cross-encoder-marco-en/model_state.pdparams", + "fdf29f7de0f7fe570740d343c96165e5", + ], + "model_config": [ + 
"https://paddlenlp.bj.bcebos.com/taskflow/text_similarity/ernie-search-large-cross-encoder-marco-en/config.json", + "28bad2c7b36fa148fa75a8dc5b690485", + ], + }, + "__internal_testing__/tiny-random-bert": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-bert/model_state.pdparams", + "8d8814d589c21bf083fdb35de6c11a57", + ], + "model_config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-bert/config.json", + "37e28e2359f330f64fc82beff1967a1e", + ], + }, + } + + def __init__(self, task, model, batch_size=1, max_length=384, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._static_mode = True + self._check_predictor_type() + if not self.from_hf_hub: + self._check_task_files() + if self._static_mode: + self._get_inference_model() + else: + self._construct_model(model) + self._construct_tokenizer(model) + self._batch_size = batch_size + self._max_length = max_length + self._usage = usage + self.model_name = model + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + + if "rocketqav2-en" in model or "ernie-search" in model: + self._model = ErnieCrossEncoder(self._task_path, num_classes=1, reinitialize=True) + elif "rocketqa" in model: + self._model = ErnieCrossEncoder(self._task_path, num_classes=2) + else: + self._model = AutoModel.from_pretrained(self._task_path, pool_act="linear") + self._model.eval() + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + if "rocketqa" in model or "ernie-search" in model: + self._tokenizer = ErnieTokenizer.from_pretrained(model) + else: + self._tokenizer = AutoTokenizer.from_pretrained(model) + + def _check_input_text(self, inputs): + inputs = inputs[0] + if not all([isinstance(i, list) and i and all(i) and len(i) == 2 for i in inputs]): + raise TypeError("Invalid input format.") + return inputs + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. 
+ """ + inputs = self._check_input_text(inputs) + + examples = [] + for data in inputs: + text1, text2 = data[0], data[1] + if "rocketqa" in self.model_name or "ernie-search" in self.model_name: + # Todo: wugaosheng, Add erine-search encoding support + encoded_inputs = self._tokenizer(text=text1, text_pair=text2, max_length=self._max_length) + ids = encoded_inputs["input_ids"] + segment_ids = encoded_inputs["token_type_ids"] + examples.append((ids, segment_ids)) + else: + text1_encoded_inputs = self._tokenizer(text=text1, max_length=self._max_length) + text1_input_ids = text1_encoded_inputs["input_ids"] + text1_token_type_ids = text1_encoded_inputs["token_type_ids"] + + text2_encoded_inputs = self._tokenizer(text=text2, max_length=self._max_length) + text2_input_ids = text2_encoded_inputs["input_ids"] + text2_token_type_ids = text2_encoded_inputs["token_type_ids"] + + examples.append((text1_input_ids, text1_token_type_ids, text2_input_ids, text2_token_type_ids)) + + batches = [examples[idx : idx + self._batch_size] for idx in range(0, len(examples), self._batch_size)] + if "rocketqa" in self.model_name or "ernie-search" in self.model_name: + batchify_fn = lambda samples, fn=Tuple( # noqa: E731 + Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype="int64"), # input ids + Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id, dtype="int64"), # token type ids + ): [data for data in fn(samples)] + else: + batchify_fn = lambda samples, fn=Tuple( # noqa: E731 + Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype="int64"), # text1_input_ids + Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id, dtype="int64"), # text1_token_type_ids + Pad(axis=0, pad_val=self._tokenizer.pad_token_id, dtype="int64"), # text2_input_ids + Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id, dtype="int64"), # text2_token_type_ids + ): [data for data in fn(samples)] + + outputs = {} + outputs["data_loader"] = batches + outputs["text"] = inputs + self._batchify_fn = batchify_fn + return outputs + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_tokenize` function. 
+ """ + results = [] + if "rocketqa" in self.model_name or "ernie-search" in self.model_name: + with static_mode_guard(): + for batch in inputs["data_loader"]: + + if self._predictor_type == "paddle-inference": + input_ids, segment_ids = self._batchify_fn(batch) + self.input_handles[0].copy_from_cpu(input_ids) + self.input_handles[1].copy_from_cpu(segment_ids) + self.predictor.run() + scores = self.output_handle[0].copy_to_cpu().tolist() + results.extend(scores) + else: + # onnx mode + input_dict = {} + input_ids, segment_ids = self._batchify_fn(batch) + input_dict["input_ids"] = input_ids + input_dict["token_type_ids"] = segment_ids + scores = self.predictor.run(None, input_dict)[0].tolist() + results.extend(scores) + else: + with static_mode_guard(): + for batch in inputs["data_loader"]: + text1_ids, text1_segment_ids, text2_ids, text2_segment_ids = self._batchify_fn(batch) + self.input_handles[0].copy_from_cpu(text1_ids) + self.input_handles[1].copy_from_cpu(text1_segment_ids) + self.predictor.run() + vecs_text1 = self.output_handle[1].copy_to_cpu() + + self.input_handles[0].copy_from_cpu(text2_ids) + self.input_handles[1].copy_from_cpu(text2_segment_ids) + self.predictor.run() + vecs_text2 = self.output_handle[1].copy_to_cpu() + + vecs_text1 = vecs_text1 / (vecs_text1**2).sum(axis=1, keepdims=True) ** 0.5 + vecs_text2 = vecs_text2 / (vecs_text2**2).sum(axis=1, keepdims=True) ** 0.5 + similarity = (vecs_text1 * vecs_text2).sum(axis=1) + results.extend(similarity) + inputs["result"] = results + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. + """ + final_results = [] + for text, similarity in zip(inputs["text"], inputs["result"]): + result = {} + result["text1"] = text[0] + result["text2"] = text[1] + # The numpy.float32 can not be converted to the json format + if isinstance(similarity, list): + result["similarity"] = float(similarity[0]) + else: + result["similarity"] = float(similarity) + final_results.append(result) + return final_results + + def _convert_dygraph_to_static(self): + """ + Convert the dygraph model to static model. + """ + assert ( + self._model is not None + ), "The dygraph model must be created before converting the dygraph model to static model." + assert ( + self._input_spec is not None + ), "The input spec must be created before converting the dygraph model to static model." 
+ logger.info("Converting to the inference model cost a little time.") + if self.model in MATCH_TYPE: + if MATCH_TYPE[self.model] == "matching": + static_model = paddle.jit.to_static(self._model.matching, input_spec=self._input_spec) + elif MATCH_TYPE[self.model] == "matching_v2": + static_model = paddle.jit.to_static(self._model.matching_v2, input_spec=self._input_spec) + elif MATCH_TYPE[self.model] == "matching_v3": + static_model = paddle.jit.to_static(self._model.matching_v3, input_spec=self._input_spec) + else: + static_model = paddle.jit.to_static(self._model, input_spec=self._input_spec) + + paddle.jit.save(static_model, self.inference_model_path) + logger.info("The inference model save in the path:{}".format(self.inference_model_path)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_summarization.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_summarization.py new file mode 100644 index 000000000..0acad2be6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/text_summarization.py @@ -0,0 +1,315 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle + +from ..data import Pad +from ..transformers import ( + AutoModelForConditionalGeneration, + AutoTokenizer, + UNIMOForConditionalGeneration, +) +from .task import Task + +usage = r""" + from paddlenlp import Taskflow + + text_summarization = Taskflow("text_summarization") + text_summarization(2022年,中国房地产进入转型阵痛期,传统“高杠杆、快周转”的模式难以为继,万科甚至直接喊话,中国房地产进入“黑铁时代”) + ''' + ['万科喊话中国房地产进入“黑铁时代”'] + ''' + + text_summarization(['据悉,2022年教育部将围绕“巩固提高、深化落实、创新突破”三个关键词展开工作。要进一步强化学校教育主阵地作用,继续把落实“双减”作为学校工作的重中之重,重点从提高作业设计水平、提高课后服务水平、提高课堂教学水平、提高均衡发展水平四个方面持续巩固提高学校“双减”工作水平。', + '党参有降血脂,降血压的作用,可以彻底消除血液中的垃圾,从而对冠心病以及心血管疾病的患者都有一定的稳定预防工作作用,因此平时口服党参能远离三高的危害。另外党参除了益气养血,降低中枢神经作用,调整消化系统功能,健脾补肺的功能。']) + ''' + ['教育部:将从四个方面持续巩固提高学校“双减”工作水平', '党参能降低三高的危害'] + ''' + """ + + +class TextSummarizationTask(Task): + """ + The text summarization model to predict the summary of an input text. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._batch_size = kwargs.get("batch_size", 1) + self._output_scores = kwargs.get("output_scores", False) + self._model_type = None + self._construct_tokenizer(model) + self._construct_model(model) + # Hypter-parameter during generating. 
+ self._max_length = kwargs.get("max_length", 128) + self._min_length = kwargs.get("min_length", 0) + self._decode_strategy = kwargs.get("decode_strategy", "beam_search") + self._temperature = kwargs.get("temperature", 1.0) + self._top_k = kwargs.get("top_k", 5) + self._top_p = kwargs.get("top_p", 1.0) + self._num_beams = kwargs.get("num_beams", 4) + self._length_penalty = kwargs.get("length_penalty", 0.0) + self._num_return_sequences = kwargs.get("num_return_sequences", 1) + self._repetition_penalty = kwargs.get("repetition_penalty", 1) + self._use_faster = kwargs.get("use_faster", False) + self._use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + if self._custom_model: + self._model = AutoModelForConditionalGeneration.from_pretrained( + self._task_path, from_hf_hub=self.from_hf_hub + ) + else: + self._model = AutoModelForConditionalGeneration.from_pretrained(model) + self._model.eval() + if isinstance(self._model, UNIMOForConditionalGeneration): + self._model_type = "unimo-text" + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + if self._custom_model: + self._tokenizer = AutoTokenizer.from_pretrained(self._task_path, from_hf_hub=self.from_hf_hub) + else: + self._tokenizer = AutoTokenizer.from_pretrained(model) + + def _preprocess(self, inputs): + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + batches = self._batchify(inputs, self._batch_size) + outputs = {"batches": batches, "text": inputs} + return outputs + + def _batchify(self, data, batch_size): + """ + Generate input batches. + """ + pad_right = False + if self._model_type != "unimo-text": + pad_right = True + examples = [self._convert_example(i) for i in data] + # Separates data into some batches. + one_batch = [] + for example in examples: + one_batch.append(example) + if len(one_batch) == batch_size: + yield self._parse_batch(one_batch, self._tokenizer.pad_token_id, pad_right) + one_batch = [] + if one_batch: + yield self._parse_batch(one_batch, self._tokenizer.pad_token_id, pad_right) + + def _convert_example(self, example, max_seq_len=512, return_length=True): + """ + Convert all examples into necessary features. + """ + if self._model_type != "unimo-text": + tokenized_example = self._tokenizer( + example, max_length=max_seq_len, padding=False, truncation=True, return_attention_mask=True + ) + else: + tokenized_example = self._tokenizer.gen_encode( + example, + max_seq_len=max_seq_len, + add_start_token_for_decoding=True, + return_length=True, + is_split_into_words=False, + ) + # Use to gather the logits corresponding to the labels during training + return tokenized_example + + def _parse_batch(self, batch_examples, pad_val, pad_right=False): + """ + Batchify a batch of examples. 
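+
+        A minimal sketch of the padded attention mask assembled by `pad_mask` below,
+        assuming two toy per-example square masks:
+
+            import numpy as np
+
+            masks = [np.tril(np.ones((2, 2), dtype="float32")),
+                     np.tril(np.ones((3, 3), dtype="float32"))]
+            out = np.ones((2, 3, 3), dtype="float32") * -1e9
+            for i, m in enumerate(masks):
+                L = m.shape[0]
+                out[i, -L:, -L:] = m           # left padding, as when pad_right is False
+            out = np.expand_dims(out, axis=1)  # (batch, 1, max_len, max_len) for n_head broadcast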
+ """ + + def pad_mask(batch_attention_mask): + """Pad attention_mask.""" + batch_size = len(batch_attention_mask) + max_len = max(map(len, batch_attention_mask)) + attention_mask = np.ones((batch_size, max_len, max_len), dtype="float32") * -1e9 + for i, mask_data in enumerate(attention_mask): + seq_len = len(batch_attention_mask[i]) + if pad_right: + mask_data[:seq_len:, :seq_len] = np.array(batch_attention_mask[i], dtype="float32") + else: + mask_data[-seq_len:, -seq_len:] = np.array(batch_attention_mask[i], dtype="float32") + # In order to ensure the correct broadcasting mechanism, expand one + # dimension to the second dimension (n_head of Transformer). + attention_mask = np.expand_dims(attention_mask, axis=1) + return attention_mask + + pad_func = Pad(pad_val=pad_val, pad_right=pad_right, dtype="int32") + batch_dict = {} + input_ids = pad_func([example["input_ids"] for example in batch_examples]) + if self._model_type != "unimo-text": + attention_mask = (input_ids != pad_val).astype("float32") + batch_dict["input_ids"] = input_ids + batch_dict["attention_mask"] = attention_mask + else: + token_type_ids = pad_func([example["token_type_ids"] for example in batch_examples]) + position_ids = pad_func([example["position_ids"] for example in batch_examples]) + attention_mask = pad_mask([example["attention_mask"] for example in batch_examples]) + seq_len = np.asarray([example["seq_len"] for example in batch_examples], dtype="int32") + batch_dict["input_ids"] = input_ids + batch_dict["token_type_ids"] = token_type_ids + batch_dict["position_ids"] = position_ids + batch_dict["attention_mask"] = attention_mask + batch_dict["seq_len"] = seq_len + return batch_dict + + def _run_model(self, inputs): + """ + Run the task model from the outputs of the `_preprocess` function. + """ + all_ids = [] + all_scores = [] + + for batch in inputs["batches"]: + input_ids = paddle.to_tensor(batch["input_ids"], dtype="int64") + token_type_ids = ( + paddle.to_tensor(batch["token_type_ids"], dtype="int64") if "token_type_ids" in batch else None + ) + position_ids = paddle.to_tensor(batch["position_ids"], dtype="int64") if "position_ids" in batch else None + attention_mask = paddle.to_tensor(batch["attention_mask"], dtype="float32") + ids, scores = self._model.generate( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + max_length=self._max_length, + min_length=self._min_length, + decode_strategy=self._decode_strategy, + temperature=self._temperature, + top_k=self._top_k, + top_p=self._top_p, + num_beams=self._num_beams, + length_penalty=self._length_penalty, + num_return_sequences=self._num_return_sequences, + repetition_penalty=self._repetition_penalty, + bos_token_id=None if self._model_type != "unimo-text" else self._tokenizer.cls_token_id, + eos_token_id=None if self._model_type != "unimo-text" else self._tokenizer.mask_token_id, + use_fast=self._use_faster, + use_fp16_decoding=self._use_fp16_decoding, + ) + all_ids.extend(ids) + all_scores.extend(scores) + inputs["ids"] = all_ids + inputs["scores"] = all_scores + return inputs + + def _postprocess(self, inputs): + """ + The model output is tag ids, this function will convert the model output to raw text. 
+        """
+        ids_list = inputs["ids"]
+        scores_list = inputs["scores"]
+        if self._model_type != "unimo-text":
+            output_tokens = self._tokenizer.batch_decode(
+                ids_list, skip_special_tokens=True, clean_up_tokenization_spaces=False
+            )
+            output_scores = [i.numpy() for i in scores_list]
+        else:
+            results = self._select_from_num_return_sequences(
+                ids_list, scores_list, self._max_length, self._num_return_sequences
+            )
+            output_tokens = [result[0] for result in results]
+            output_scores = [result[1] for result in results]
+
+        if self._output_scores:
+            return output_tokens, output_scores
+        return output_tokens
+
+    def _select_from_num_return_sequences(self, ids, scores, max_dec_len=None, num_return_sequences=1):
+        """
+        Select the generated sequence from several returned sequences.
+        """
+        results = []
+        group = []
+        tmp = []
+        if scores is not None:
+            ids = [i.numpy() for i in ids]
+            scores = [i.numpy() for i in scores]
+
+            if len(ids) != len(scores) or (len(ids) % num_return_sequences) != 0:
+                raise ValueError(
+                    "the length of `ids` is {}, but the `num_return_sequences` is {}".format(
+                        len(ids), num_return_sequences
+                    )
+                )
+
+            for pred, score in zip(ids, scores):
+                pred_token_ids, pred_tokens = self._post_process_decoded_sequence(pred)
+                num_token = len(pred_token_ids)
+                target = "".join(pred_tokens)
+                # not ending
+                if max_dec_len is not None and num_token >= max_dec_len:
+                    score -= 1e3
+                tmp.append([target, score])
+                if len(tmp) == num_return_sequences:
+                    group.append(tmp)
+                    tmp = []
+            for preds in group:
+                preds = sorted(preds, key=lambda x: -x[1])
+                results.append(preds[0])
+        else:
+            ids = ids.numpy()
+            for pred in ids:
+                pred_token_ids, pred_tokens = self._post_process_decoded_sequence(pred)
+                num_token = len(pred_token_ids)
+                response = "".join(pred_tokens)
+                # TODO: Support return scores in FT.
+                tmp.append([response])
+                if len(tmp) == num_return_sequences:
+                    group.append(tmp)
+                    tmp = []
+
+            for preds in group:
+                results.append(preds[0])
+        return results
+
+    def _post_process_decoded_sequence(self, token_ids):
+        """Post-process the decoded sequence. Truncate at the first mask token, which marks the end of the sequence."""
+        eos_pos = len(token_ids)
+        for i, tok_id in enumerate(token_ids):
+            if tok_id == self._tokenizer.mask_token_id:
+                eos_pos = i
+                break
+        token_ids = token_ids[:eos_pos]
+        tokens = self._tokenizer.convert_ids_to_tokens(token_ids)
+        tokens = self._tokenizer.merge_subword(tokens)
+        special_tokens = ["[UNK]"]
+        tokens = [token for token in tokens if token not in special_tokens]
+        return token_ids, tokens
+
+    def _construct_input_spec(self):
+        """
+        Construct the input spec for converting the dygraph model to a static model.
+        """
+        self._input_spec = [
+            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
+        ]
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/utils.py
new file mode 100644
index 000000000..a5dbb0ed5
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/utils.py
@@ -0,0 +1,2548 @@
+# coding:utf-8
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import copy +import csv +import json +import math +import os +import pickle +import re +import traceback +import warnings +from collections import OrderedDict, namedtuple +from dataclasses import dataclass +from datetime import datetime +from functools import cmp_to_key +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn.functional as F +import six +from paddle.dataset.common import md5file +from PIL import Image + +from ..transformers.tokenizer_utils_base import PaddingStrategy, PretrainedTokenizerBase +from ..utils.downloader import DownloaderCheck, get_path_from_url +from ..utils.image_utils import ( + Bbox, + DecodeImage, + NormalizeImage, + PadBatch, + Permute, + ResizeImage, + check, + img2base64, + two_dimension_sort_layout, +) +from ..utils.log import logger + +DOC_FORMAT = r""" + Examples: + .. code-block:: python + """ +DOWNLOAD_CHECK = False + + +def download_file(save_dir, filename, url, md5=None): + """ + Download the file from the url to specified directory. + Check md5 value when the file is exists, if the md5 value is the same as the existed file, just use + the older file, if not, will download the file from the url. + + Args: + save_dir(string): The specified directory saving the file. + filename(string): The specified filename saving the file. + url(string): The url downling the file. + md5(string, optional): The md5 value that checking the version downloaded. + """ + fullname = os.path.join(save_dir, filename) + if os.path.exists(fullname): + if md5 and (not md5file(fullname) == md5): + logger.info("Updating {} from {}".format(filename, url)) + logger.disable() + get_path_from_url(url, save_dir, md5) + else: + logger.info("Downloading {} from {}".format(filename, url)) + logger.disable() + get_path_from_url(url, save_dir, md5) + logger.enable() + return fullname + + +def download_check(task): + """ + Check the resource status in the specified task. + + Args: + task(string): The name of specified task. + """ + logger.disable() + global DOWNLOAD_CHECK + if not DOWNLOAD_CHECK: + DOWNLOAD_CHECK = True + checker = DownloaderCheck(task) + checker.start() + checker.join() + logger.enable() + + +def add_docstrings(*docstr): + """ + The function that add the doc string to doc of class. + """ + + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(DOC_FORMAT) + "".join(docstr) + return fn + + return docstring_decorator + + +@contextlib.contextmanager +def static_mode_guard(): + paddle.enable_static() + yield + paddle.disable_static() + + +@contextlib.contextmanager +def dygraph_mode_guard(): + paddle.disable_static() + yield + + +def cut_chinese_sent(para): + """ + Cut the Chinese sentences more precisely, reference to "https://blog.csdn.net/blmoistawinde/article/details/82379256". 
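+
+    Example (illustrative)::
+
+        >>> cut_chinese_sent("今天天气很好。我们去公园吧!")
+        ['今天天气很好。', '我们去公园吧!']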
+ """ + para = re.sub(r"([。!?\?])([^”’])", r"\1\n\2", para) + para = re.sub(r"(\.{6})([^”’])", r"\1\n\2", para) + para = re.sub(r"(\…{2})([^”’])", r"\1\n\2", para) + para = re.sub(r"([。!?\?][”’])([^,。!?\?])", r"\1\n\2", para) + para = para.rstrip() + return para.split("\n") + + +class TermTreeNode(object): + """Defination of term node. All members are protected, to keep rigorism of data struct. + + Args: + sid (str): term id of node. + term (str): term, common name of this term. + base (str): `cb` indicates concept base, `eb` indicates entity base. + term_type (Optional[str], optional): type of this term, constructs hirechical of `term` node. Defaults to None. + hyper (Optional[str], optional): parent type of a `type` node. Defaults to None. + node_type (str, optional): type statement of node, `type` or `term`. Defaults to "term". + alias (Optional[List[str]], optional): alias of this term. Defaults to None. + alias_ext (Optional[List[str]], optional): extended alias of this term, CANNOT be used in matching. + Defaults to None. + sub_type (Optional[List[str]], optional): grouped by some term. Defaults to None. + sub_term (Optional[List[str]], optional): some lower term. Defaults to None. + data (Optional[Dict[str, Any]], optional): to sore full imformation of a term. Defaults to None. + + """ + + def __init__( + self, + sid: str, + term: str, + base: str, + node_type: str = "term", + term_type: Optional[str] = None, + hyper: Optional[str] = None, + level: Optional[int] = None, + alias: Optional[List[str]] = None, + alias_ext: Optional[List[str]] = None, + sub_type: Optional[List[str]] = None, + sub_term: Optional[List[str]] = None, + data: Optional[Dict[str, Any]] = None, + ): + self._sid = sid + self._term = term + self._base = base + self._term_type = term_type + self._hyper = hyper + self._sub_term = sub_term if sub_term is not None else [] + self._sub_type = sub_type if sub_type is not None else [] + self._alias = alias if alias is not None else [] + self._alias_ext = alias_ext if alias_ext is not None else [] + self._data = data + self._level = level + self._node_type = node_type + self._sons = set() + + def __str__(self): + if self._data is not None: + return json.dumps(self._data, ensure_ascii=False) + else: + res = { + "termid": self._sid, + "term": self._term, + "src": self._base, + "alias": self._alias, + "alias_ext": self._alias_ext, + "termtype": self._term_type, + "subterms": self._sub_term, + "subtype": self._sub_type, + "links": [], + } + return json.dumps(res, ensure_ascii=False) + + @property + def sid(self): + return self._sid + + @property + def term(self): + return self._term + + @property + def base(self): + return self._base + + @property + def alias(self): + return self._alias + + @property + def alias_ext(self): + return self._alias_ext + + @property + def termtype(self): + return self._term_type + + @property + def subtype(self): + return self._sub_type + + @property + def subterm(self): + return self._sub_term + + @property + def hyper(self): + return self._hyper + + @property + def level(self): + return self._level + + @property + def sons(self): + return self._sons + + @property + def node_type(self): + return self._node_type + + def add_son(self, son_name): + self._sons.add(son_name) + + @classmethod + def from_dict(cls, data: Dict[str, Any]): + """Build a node from dictionary data. + + Args: + data (Dict[str, Any]): Dictionary data contain all k-v data. + + Returns: + [type]: TermTree node object. 
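+
+        Example (illustrative values; the keys are the ones read below)::
+
+            node = TermTreeNode.from_dict({
+                "termid": "植物_cb_苹果", "term": "苹果", "src": "cb",
+                "termtype": "植物", "subtype": [], "subterms": [],
+                "alias": ["林檎"], "alias_ext": [],
+            })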
+ """ + return cls( + sid=data["termid"], + term=data["term"], + base=data["src"], + term_type=data["termtype"], + sub_type=data["subtype"], + sub_term=data["subterms"], + alias=data["alias"], + alias_ext=data["alias_ext"], + data=data, + ) + + @classmethod + def from_json(cls, json_str: str): + """Build a node from JSON string. + + Args: + json_str (str): JSON string formatted by TermTree data. + + Returns: + [type]: TermTree node object. + """ + dict_data = json.loads(json_str) + return cls.from_dict(dict_data) + + +class TermTree(object): + """TermTree class.""" + + def __init__(self): + self._nodes: Dict[str, TermTreeNode] = {} + self._root = TermTreeNode(sid="root", term="root", base="cb", node_type="root", level=0) + self._nodes["root"] = self.root + self._index = {} + + def __build_sons(self): + for node in self._nodes: + self.__build_son(self._nodes[node]) + + def __getitem__(self, item): + return self._nodes[item] + + def __contains__(self, item): + return item in self._nodes + + def __iter__(self): + return self._nodes.__iter__() + + @property + def root(self): + return self._root + + def __load_type(self, file_path: str): + with open(file_path, "rt", newline="", encoding="utf8") as csvfile: + file_handler = csv.DictReader(csvfile, delimiter="\t") + for row in file_handler: + if row["type-1"] not in self: + self.add_type(type_name=row["type-1"], hyper_type="root") + if row["type-2"] != "" and row["type-2"] not in self: + self.add_type(type_name=row["type-2"], hyper_type=row["type-1"]) + if row["type-3"] != "" and row["type-3"] not in self: + self.add_type(type_name=row["type-3"], hyper_type=row["type-2"]) + + def __judge_term_node(self, node: TermTreeNode) -> bool: + if node.termtype not in self: + raise ValueError(f"Term type of new node {node.termtype} does not exists.") + if node.sid in self: + warnings.warn(f"{node.sid} exists, will be replaced by new node.") + + def add_term( + self, + term: Optional[str] = None, + base: Optional[str] = None, + term_type: Optional[str] = None, + sub_type: Optional[List[str]] = None, + sub_term: Optional[List[str]] = None, + alias: Optional[List[str]] = None, + alias_ext: Optional[List[str]] = None, + data: Optional[Dict[str, Any]] = None, + ): + """Add a term into TermTree. + + Args: + term (str): common name of name. + base (str): term is concept or entity. + term_type (str): term type of this term + sub_type (Optional[List[str]], optional): sub type of this term, must exists in TermTree. Defaults to None. + sub_terms (Optional[List[str]], optional): sub terms of this term. Defaults to None. + alias (Optional[List[str]], optional): alias of this term. Defaults to None. + alias_ext (Optional[List[str]], optional): . Defaults to None. + data (Optional[Dict[str, Any]], optional): [description]. Defaults to None. 
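+
+        Example (illustrative, on an existing ``TermTree`` instance ``tree``;
+        the term type must be registered with ``add_type`` first)::
+
+            tree.add_type(type_name="植物", hyper_type="root")
+            tree.add_term(term="苹果", base="cb", term_type="植物", alias=["林檎"])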
+ """ + if data is not None: + new_node = TermTreeNode.from_dict(data) + else: + new_node = TermTreeNode( + sid=f"{term_type}_{base}_{term}", + term=term, + base=base, + term_type=term_type, + sub_term=sub_term, + sub_type=sub_type, + alias=alias, + alias_ext=alias_ext, + node_type="term", + ) + self.__judge_term_node(new_node) + self._nodes[new_node.sid] = new_node + self.__build_index(new_node) + + def add_type(self, type_name, hyper_type): + if type_name in self._nodes: + raise ValueError(f"Term Type {type_name} exists.") + if hyper_type not in self._nodes: + raise ValueError(f"Hyper type {hyper_type} does not exist, please add it first.") + if self._nodes[hyper_type].level == 3: + raise ValueError( + "Term type schema must be 3-LEVEL, 3rd level type node should not be a parent of type node." + ) + self._nodes[type_name] = TermTreeNode( + sid=type_name, + term=type_name, + base=None, + hyper=hyper_type, + node_type="type", + level=self._nodes[hyper_type].level + 1, + ) + self.__build_index(self._nodes[type_name]) + + def __load_file(self, file_path: str): + with open(file_path, encoding="utf-8") as fp: + for line in fp: + data = json.loads(line) + self.add_term(data=data) + + def __build_son(self, node: TermTreeNode): + """Build sons of a node + + Args: + node (TermTreeNode): son node. + """ + type_node = None + if node.termtype is not None: + type_node = self._nodes[node.termtype] + elif node.hyper is not None: + type_node = self._nodes[node.hyper] + if type_node is not None: + type_node.add_son(node.sid) + for sub_type in node.subtype: + sub_type_node = self._nodes[sub_type] + sub_type_node.add_son(node.sid) + + def build_son(self, node: str): + self.__build_son(self[node]) + + def __build_index(self, node: TermTreeNode): + if node.term not in self._index: + self._index[node.term] = [] + self._index[node.term].append(node.sid) + for alia in node.alias: + if alia not in self._index: + self._index[alia] = [] + self._index[alia].append(node.sid) + + def __judge_hyper(self, source_id, target_id) -> bool: + queue = [source_id] + visited_node = {source_id} + while len(queue) > 0: + cur_id = queue.pop(0) + if cur_id == target_id: + return True + cur_node = self._nodes[cur_id] + edge = [] + if cur_node.hyper is not None: + edge.append(cur_node.hyper) + if cur_node.termtype is not None: + edge.append(cur_node.termtype) + edge.extend(cur_node.subtype) + for next_id in edge: + if next_id not in visited_node: + queue.append(next_id) + visited_node.add(next_id) + return False + + def find_term(self, term: str, term_type: Optional[str] = None) -> Tuple[bool, Union[List[str], None]]: + """Find a term in Term Tree. If term not exists, return None. + If `term_type` is not None, will find term with this type. + + Args: + term (str): term to look up. + term_type (Optional[str], optional): find term in this term_type. Defaults to None. + + Returns: + Union[None, List[str]]: [description] + """ + if term not in self._index: + return False, None + else: + if term_type is None: + return True, self._index[term] + else: + out = [] + for term_id in self._index[term]: + if self.__judge_hyper(term_id, term_type) is True: + out.append(term_id) + if len(out) > 0: + return True, out + else: + return False, None + + def build_from_dir(self, term_schema_path, term_data_path, linking=True): + """Build TermTree from a directory which should contain type schema and term data. 
+ + Args: + dir ([type]): [description] + """ + self.__load_type(term_schema_path) + if linking: + self.__load_file(term_data_path) + self.__build_sons() + + @classmethod + def from_dir(cls, term_schema_path, term_data_path, linking) -> "TermTree": + """Build TermTree from a directory which should contain type schema and term data. + + Args: + source_dir ([type]): [description] + + Returns: + TermTree: [description] + """ + term_tree = cls() + term_tree.build_from_dir(term_schema_path, term_data_path, linking) + return term_tree + + def __dfs(self, cur_id: str, depth: int, path: Dict[str, str], writer: csv.DictWriter): + cur_node = self._nodes[cur_id] + if cur_node.node_type == "term": + return + if depth > 0: + path[f"type-{depth}"] = cur_id + if path["type-1"] != "": + writer.writerow(path) + for son in cur_node.sons: + self.__dfs(son, depth + 1, path, writer) + if depth > 0: + path[f"type-{depth}"] = "" + + def save(self, save_dir): + """Save term tree to directory `save_dir` + + Args: + save_dir ([type]): Directory. + """ + if os.path.exists(save_dir) is False: + os.makedirs(save_dir, exist_ok=True) + out_path = {} + for i in range(1, 3): + out_path[f"type-{i}"] = "" + with open(f"{save_dir}/termtree_type.csv", "wt", encoding="utf-8", newline="") as fp: + fieldnames = ["type-1", "type-2", "type-3"] + csv_writer = csv.DictWriter(fp, delimiter="\t", fieldnames=fieldnames) + csv_writer.writeheader() + self.__dfs("root", 0, out_path, csv_writer) + with open(f"{save_dir}/termtree_data", "w", encoding="utf-8", newline="") as fp: + for nid in self: + node = self[nid] + if node.node_type == "term": + print(node, file=fp) + + +def levenstein_distance(s1: str, s2: str) -> int: + """Calculate minimal Levenstein distance between s1 and s2. + + Args: + s1 (str): string + s2 (str): string + + Returns: + int: the minimal distance. + """ + m, n = len(s1) + 1, len(s2) + 1 + + # Initialize + dp = [[0] * n for i in range(m)] + dp[0][0] = 0 + for i in range(1, m): + dp[i][0] = dp[i - 1][0] + 1 + for j in range(1, n): + dp[0][j] = dp[0][j - 1] + 1 + + for i in range(1, m): + for j in range(1, n): + if s1[i - 1] != s2[j - 1]: + dp[i][j] = min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1]) + 1 + else: + dp[i][j] = dp[i - 1][j - 1] + return dp[m - 1][n - 1] + + +class BurkhardKellerNode(object): + """Node implementatation for BK-Tree. A BK-Tree node stores the information of current word, and its approximate words calculated by levenstein distance. + + Args: + word (str): word of current node. + """ + + def __init__(self, word: str): + self.word = word + self.next = {} + + +class BurkhardKellerTree(object): + """Implementataion of BK-Tree""" + + def __init__(self): + self.root = None + self.nodes = {} + + def __add(self, cur_node: BurkhardKellerNode, word: str): + """Insert a word into current tree. If tree is empty, set this word to root. + + Args: + word (str): word to be inserted. + """ + if self.root is None: + self.root = BurkhardKellerNode(word) + return + if word in self.nodes: + return + dist = levenstein_distance(word, cur_node.word) + if dist not in cur_node.next: + self.nodes[word] = cur_node.next[dist] = BurkhardKellerNode(word) + else: + self.__add(cur_node.next[dist], word) + + def add(self, word: str): + """Insert a word into current tree. If tree is empty, set this word to root. + + Args: + word (str): word to be inserted. 
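+
+        Example (illustrative)::
+
+            tree = BurkhardKellerTree()
+            for w in ["word", "ward", "world"]:
+                tree.add(w)
+            tree.search_similar_word("word")
+            # -> [('word', 0), ('world', 1), ('ward', 1)]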
+ """ + return self.__add(self.root, word) + + def __search_similar_word(self, cur_node: BurkhardKellerNode, s: str, threshold: int = 2) -> List[str]: + res = [] + if cur_node is None: + return res + dist = levenstein_distance(cur_node.word, s) + if dist <= threshold: + res.append((cur_node.word, dist)) + start = max(dist - threshold, 1) + while start < dist + threshold: + tmp_res = self.__search_similar_word(cur_node.next.get(start, None), s)[:] + res.extend(tmp_res) + start += 1 + return res + + def search_similar_word(self, word: str) -> List[str]: + """Search the most similar (minimal levenstain distance) word between `s`. + + Args: + s (str): target word + + Returns: + List[str]: similar words. + """ + res = self.__search_similar_word(self.root, word) + + def max_prefix(s1: str, s2: str) -> int: + res = 0 + length = min(len(s1), len(s2)) + for i in range(length): + if s1[i] == s2[i]: + res += 1 + else: + break + return res + + res.sort(key=lambda d: (d[1], -max_prefix(d[0], word))) + return res + + +class TriedTree(object): + """Implementataion of TriedTree""" + + def __init__(self): + self.tree = {} + + def add_word(self, word): + """add single word into TriedTree""" + self.tree[word] = len(word) + for i in range(1, len(word)): + wfrag = word[:i] + self.tree[wfrag] = self.tree.get(wfrag, None) + + def search(self, content): + """Backward maximum matching + + Args: + content (str): string to be searched + Returns: + List[Tuple]: list of maximum matching words, each element represents + the starting and ending position of the matching string. + """ + result = [] + length = len(content) + for start in range(length): + for end in range(start + 1, length + 1): + pos = self.tree.get(content[start:end], -1) + if pos == -1: + break + if pos and (len(result) == 0 or end > result[-1][1]): + result.append((start, end)) + return result + + +class Customization(object): + """ + User intervention based on Aho-Corasick automaton + """ + + def __init__(self): + self.dictitem = {} + self.ac = None + + def load_customization(self, filename, sep=None): + """Load the custom vocab""" + self.ac = TriedTree() + with open(filename, "r", encoding="utf8") as f: + for line in f: + if sep is None: + words = line.strip().split() + + if len(words) == 0: + continue + + phrase = "" + tags = [] + offset = [] + for word in words: + if word.rfind("/") < 1: + phrase += word + tags.append("") + else: + phrase += word[: word.rfind("/")] + tags.append(word[word.rfind("/") + 1 :]) + offset.append(len(phrase)) + + if len(phrase) < 2 and tags[0] == "": + continue + + self.dictitem[phrase] = (tags, offset) + self.ac.add_word(phrase) + + def parse_customization(self, query, lac_tags, prefix=False): + """Use custom vocab to modify the lac results""" + if not self.ac: + logger.warning("customization dict is not load") + return + ac_res = self.ac.search(query) + + for begin, end in ac_res: + phrase = query[begin:end] + index = begin + + tags, offsets = self.dictitem[phrase] + + if prefix: + for tag, offset in zip(tags, offsets): + while index < begin + offset: + if len(tag) == 0: + lac_tags[index] = "I" + lac_tags[index][1:] + else: + lac_tags[index] = "I-" + tag + index += 1 + lac_tags[begin] = "B" + lac_tags[begin][1:] + for offset in offsets: + index = begin + offset + if index < len(lac_tags): + lac_tags[index] = "B" + lac_tags[index][1:] + else: + for tag, offset in zip(tags, offsets): + while index < begin + offset: + if len(tag) == 0: + lac_tags[index] = lac_tags[index][:-1] + "I" + else: + lac_tags[index] = tag + "-I" + 
index += 1 + lac_tags[begin] = lac_tags[begin][:-1] + "B" + for offset in offsets: + index = begin + offset + if index < len(lac_tags): + lac_tags[index] = lac_tags[index][:-1] + "B" + + +class SchemaTree(object): + """ + Implementataion of SchemaTree + """ + + def __init__(self, name="root", children=None): + self.name = name + self.children = [] + self.prefix = None + self.parent_relations = None + self.parent = None + if children is not None: + for child in children: + self.add_child(child) + + def __repr__(self): + return self.name + + def add_child(self, node): + assert isinstance(node, SchemaTree), "The children of a node should be an instacne of SchemaTree." + self.children.append(node) + + +def get_id_and_prob(span_set, offset_mapping): + """ + Return text id and probability of predicted spans + + Args: + span_set (set): set of predicted spans. + offset_mapping (list[int]): list of pair preserving the + index of start and end char in original text pair (prompt + text) for each token. + Returns: + sentence_id (list[tuple]): index of start and end char in original text. + prob (list[float]): probabilities of predicted spans. + """ + prompt_end_token_id = offset_mapping[1:].index([0, 0]) + bias = offset_mapping[prompt_end_token_id][1] + 1 + for idx in range(1, prompt_end_token_id + 1): + offset_mapping[idx][0] -= bias + offset_mapping[idx][1] -= bias + + sentence_id = [] + prob = [] + for start, end in span_set: + prob.append(start[1] * end[1]) + start_id = offset_mapping[start[0]][0] + end_id = offset_mapping[end[0]][1] + sentence_id.append((start_id, end_id)) + return sentence_id, prob + + +def dbc2sbc(s): + rs = "" + for char in s: + code = ord(char) + if code == 0x3000: + code = 0x0020 + else: + code -= 0xFEE0 + if not (0x0021 <= code and code <= 0x7E): + rs += char + continue + rs += chr(code) + return rs + + +class WordTagRelationExtractor(object): + """Implement of information extractor.""" + + _chain_items = {"和", "与", "兼", "及", "以及", "还有", "并"} + _all_items = None + _jux_buf = [] + + def __init__(self, schema): + self._schema = schema + + @property + def schema(self): + return self._schema + + @classmethod + def from_dict(cls, config_dict): + """Make an instance from a configuration dictionary. + + Args: + config_dict (Dict[str, Any]): configuration dict. 
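+
+        A single triple configuration looks like the following (illustrative
+        values; the keys are the ones read below)::
+
+            [{
+                "head_role": "人物类_实体",
+                "group": "创作",
+                "trig_type": "role",
+                "trig_word": ["创作"],
+                "reverse": False,
+                "tail_role": [{"main": ["作品类_实体"], "support": ["时间类"]}],
+            }]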
+ """ + res = {} + + for i, trip_config in enumerate(config_dict): + head_role_type = trip_config["head_role"] + if head_role_type not in res: + res[head_role_type] = {"trigger": {}, "g_t_map": {}, "rel_group": {}, "trig_word": {}} + group_name = trip_config["group"] + if "rel_group" in trip_config: + res[head_role_type]["rel_group"][group_name] = trip_config["rel_group"] + if group_name not in res[head_role_type]["trig_word"]: + res[head_role_type]["trig_word"][group_name] = set() + for trig_word in trip_config["trig_word"]: + res[head_role_type]["trigger"][trig_word] = { + "trigger_type": trip_config["trig_type"], + "group_name": group_name, + "rev_flag": trip_config["reverse"], + } + res[head_role_type]["trig_word"][group_name].add(trig_word) + res[head_role_type]["g_t_map"][group_name] = trip_config["tail_role"] + + return cls(res) + + @classmethod + def from_json(cls, json_str): + """Implement an instance from JSON str.""" + config_dict = json.loads(json_str) + return cls.from_dict(config_dict) + + @classmethod + def from_pkl(cls, pkl_path): + """Implement an instance from a serialized pickle package.""" + with open(pkl_path, "rb") as fp: + schema = pickle.load(fp) + return cls(schema) + + @classmethod + def from_config(cls, config_path): + """Implement an instance from a configuration file.""" + with open(config_path, encoding="utf-8") as fp: + config_json = json.load(fp) + return cls.from_dict(config_json) + + def add_schema_from_dict(self, config_dict): + """Add the schema from the dict.""" + for i, trip_config in enumerate(config_dict): + head_role_type = trip_config["head_role"] + if head_role_type not in self._schema: + self._schema[head_role_type] = {"trigger": {}, "g_t_map": {}, "rel_group": {}, "trig_word": {}} + group_name = trip_config["group"] + if "rel_group" in self._schema: + self._schema[head_role_type]["rel_group"][group_name] = trip_config["rel_group"] + if group_name not in self._schema[head_role_type]["trig_word"]: + self._schema[head_role_type]["trig_word"][group_name] = set() + for trig_word in trip_config["trig_word"]: + self._schema[head_role_type]["trigger"][trig_word] = { + "trigger_type": trip_config["trig_type"], + "group_name": group_name, + "rev_flag": trip_config["reverse"], + } + self._schema[head_role_type]["trig_word"][group_name].add(trig_word) + self._schema[head_role_type]["g_t_map"][group_name] = trip_config["tail_role"] + + def _judge_jux(self, wordtag_item): + """Judge whether `wordtag_item` is a relevance componet between two juxtaposed items. + + Args: + wordtag_item (dict): input item. + + Returns: + bool: [description] + """ + if wordtag_item["item"] in {"、", " ", "《", "》", "/"}: + return True + if wordtag_item["item"] in self._chain_items and wordtag_item["wordtag_label"] == "连词": + return True + return False + + def _search_jux(self, cur_item, cur_pos=0, jux_type=None, jux_word=None, status_flag=None, search_list=None): + """Find juxtaposed items with `cur_item` at `cur_pos` in `self._all_items`. + + Args: + cur_item (Dict[str, Any]): the item current viewing. + cur_pos (int, optional): current position of viewing item. Defaults to 0. + jux_type (Set[str], optional): wordtag labels that can be considered as juxtaposed item. Defaults to None. + jux_word (Set[str], optional): words that can be considered as juxtaposed item. Defaults to None. + status_flag (bool, optional): if True, on the juxtaposed item, or on chain item. Defaults to None. + + Returns: + int: end postion of juxtable items. 
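+
+        For example (illustrative), scanning the items 北京 、 上海 和 广州 from
+        北京 with a matching ``jux_type`` collects 北京, 上海 and 广州 into
+        ``self._jux_buf`` and returns the index of 广州; the 、 and the
+        连词-tagged 和 are skipped as connectors by ``_judge_jux``.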
+ """ + if search_list is None: + search_list = self._all_items + + if jux_type is None and jux_word is None: + raise ValueError("`jux_type` and `jux_word` are both None.") + + if status_flag is True: + self._jux_buf.append(cur_item) + + if cur_pos >= len(search_list) - 1: + return cur_pos + + next_item = search_list[cur_pos + 1] + + if self._judge_jux(next_item) is True: + return self._search_jux( + cur_item=next_item, + cur_pos=cur_pos + 1, + jux_type=jux_type, + jux_word=jux_word, + status_flag=False, + search_list=search_list, + ) + + next_flag = True + if jux_type is not None: + next_flag = next_flag and self._match_item(next_item, jux_type) + if jux_word is not None: + next_flag = next_flag and (next_item["item"] in jux_word) + if next_flag is True: + return self._search_jux( + cur_item=next_item, cur_pos=cur_pos + 1, jux_type=jux_type, jux_word=jux_word, status_flag=True + ) + if next_flag is not True: + while self._judge_jux(search_list[cur_pos]) is True: + cur_pos -= 1 + return cur_pos + + @staticmethod + def _match_item(item, type_can): + match_key = item["wordtag_label"].split("_")[0] + return match_key in type_can or item["wordtag_label"] in type_can + + def _trig_handler(self, cur_item, head_conf): + """Whether current item is a trigger, if True, return corresponding flag and configuration. + + Args: + cur_item (Dict[str, Any]): current viewing ite, + st_conf (Dict[str, Any]): config + + Returns: + Tuple[str, Union[None, dict]]: [description] + """ + trigger_conf = head_conf["trigger"] + if cur_item["item"] in trigger_conf: + # find a trigger, then judge whether it is a tail-trigger or a rel trigger. + if trigger_conf[cur_item["item"]]["trigger_type"] == "role": + # find a tail-trigger, then judge wordtag label. + group_name = trigger_conf[cur_item["item"]]["group_name"] + for tail_conf in head_conf["g_t_map"][group_name]: + if self._match_item(cur_item, tail_conf["main"]) is True: + return "trig_t", tail_conf + else: + return "un_trig", None + else: + return "trig_g", None + else: + return "un_trig", None + + def _find_tail(self, search_range, sg_conf, head_hype): + """Find tail role in `search_range` + + Args: + search_range (List[int]): index range of `self._all_items`, items to be checked. + sg_conf (Dict[str, Any]): configuration of group. + head_type (str): wordtag label of head role item. + """ + for i in search_range: + item = self._all_items[i] + if item["item"] in {",", "?", "、", "。", ";"}: + return -2, None + for j, tail_conf in enumerate(sg_conf): + flag = self._match_item(item, tail_conf["main"]) + if flag is True: + return i, tail_conf + if item["wordtag_label"].startswith(head_hype): + return -1, None + + return -2, None + + def _find_supp(self, search_range, search_type): + res = [] + for i in search_range: + item = self._all_items[i] + if item["item"] == ",": + break + if any(item["wordtag_label"].startswith(sup_t) for sup_t in search_type): + res.append(item) + return res if len(res) > 0 else None + + def _make_output(self, head_item, tail_item, group, source, support=None, trig_word=None, **kwargs): + """Make formatted outputs of mined results. + + Args: + head_item (Dict[str, Any]): [description] + head_index (int): [description] + tail_item (List[Dict[str, Any]]): [description] + tail_indices (List[int]): [description] + group (str): [description] + source (str): [description] + support (List[Dict[str, Any]], optional): [description]. Defaults to None. + support_indices (List[int], optional): [description]. Defaults to None. 
+ trig_word (List[str], optional): [description]. Defaults to None. + trig_indices (List[int], optional): [description]. Defaults to None. + """ + res = { + "HEAD_ROLE": { + "item": head_item["item"], + "type": head_item["wordtag_label"], + "offset": head_item["offset"], + }, + "TAIL_ROLE": [ + {"item": ti["item"], "offset": ti["offset"], "type": ti["wordtag_label"]} for ti in tail_item + ], + "GROUP": group, + "SRC": source, + } + if support is not None: + res["SUPPORT"] = [ + { + "item": si["item"], + "offset": si["offset"], + "type": si["wordtag_label"], + } + for si in support + ] + if trig_word is not None: + res["TRIG"] = [ + { + "item": ti["item"], + "offset": ti["offset"], + } + for ti in trig_word + ] + return res + + def _reverse(self, res, group_name=None): + ret = [] + for rev_head in res["TAIL_ROLE"]: + rev_tmp = { + "HEAD_ROLE": rev_head, + "TAIL_ROLE": [res["HEAD_ROLE"]], + "GROUP": group_name if group_name is not None else res["GROUP"], + } + if "SUPPORT" in res: + rev_tmp["SUPPORT"] = res["SUPPORT"] + if "TRIG" in res: + rev_tmp["TRIG"] = res["TRIG"] + rev_tmp["SRC"] = "REVERSE" if group_name is not None else res["SRC"] + ret.append(rev_tmp) + return ret + + def extract_spo(self, all_items): + """Pipeline of mining procedure. + + Args: + all_items ([type]): [description] + """ + self._all_items = all_items + + res_cand = [] + + # Match head role, and consider it as central, search others. + for i, head_cand in enumerate(self._all_items): + last_end = i + try: + datetime.strptime(head_cand["item"], "%Y年%m月%d日") + head_cand["wordtag_label"] = "时间类_具体时间" + except ValueError: + pass + + if head_cand["wordtag_label"] in self._schema: + head_conf = self._schema[head_cand["wordtag_label"]] + head_type = head_cand["wordtag_label"] + else: + match_key = head_cand["wordtag_label"].split("_")[0] + if match_key in self._schema: + head_conf = self._schema[match_key] + head_type = match_key + else: + continue + + trig_status = "un_trig" + + # Consider `head_cand` as a start item, find trigger words behind. + # We suppose that minning strategy is directed, so only search items behinds head. + # If need, we can reverse constructed triples. + j = i + 1 + while j < len(self._all_items): + cur_item = all_items[j] + cur_pos = j + j += 1 + + trig_status, trig_conf = self._trig_handler(cur_item, self._schema[head_type]) + + # Find a tail role, generate corresponding triple. + if trig_status == "trig_t": + trig_status = "un_trig" + tail_flag = True + for k in range(i + 1, j): + if self._all_items[k]["wordtag_label"] == head_cand["wordtag_label"]: + tail_flag = False + break + if tail_flag is False: + continue + + group_name = head_conf["trigger"][cur_item["item"]]["group_name"] + del self._jux_buf[:] + idx = self._search_jux( + cur_item=cur_item, cur_pos=cur_pos, jux_type=trig_conf["main"], status_flag=True + ) + supports = self._find_supp(search_range=range(j - 1, i, -1), search_type=trig_conf["support"]) + + tmp = self._make_output( + head_item=head_cand, + tail_item=self._jux_buf[:], + group=group_name, + support=supports, + source="TAIL", + ) + + # Reverse triple if group has relative. 
+ if ( + group_name in head_conf.get("rel_group", {}) + or head_conf["trigger"][cur_item["item"]]["rev_flag"] is True + ): + rev_tmp = self._reverse(tmp, head_conf.get("rel_group", {}).get(group_name, None)) + res_cand.extend(rev_tmp[:]) + if head_conf["trigger"][cur_item["item"]]["rev_flag"] is False: + res_cand.append(tmp.copy()) + + j = idx + 1 + last_end = idx + continue + + # Find a group trigger word, look for tail role items of current head role and group argument. + # Searching range is items behind group trigger and items between head rold and group trigger word. + if trig_status == "trig_g": + trig_status = "un_trig" + group_name = head_conf["trigger"][cur_item["item"]]["group_name"] + + del self._jux_buf[:] + g_start_idx = j - 1 + g_idx = self._search_jux( + cur_item=cur_item, + cur_pos=cur_pos, + jux_word=head_conf["trig_word"][group_name], + status_flag=True, + ) + + g_trig_words = self._jux_buf[:] + j = g_idx + 1 + + # Search right. + if j < len(self._all_items) - 1: + tail_idx, tail_conf = self._find_tail( + range(g_idx + 1, len(self._all_items)), head_conf["g_t_map"][group_name], head_type + ) + + if tail_idx > 0: + # Find a tail. + tail_item = self._all_items[tail_idx] + del self._jux_buf[:] + idx = self._search_jux( + cur_item=tail_item, cur_pos=tail_idx, status_flag=True, jux_type=tail_conf["main"] + ) + tail_cand = self._jux_buf[:] + supports = self._find_supp(range(tail_idx - 1, i, -1), tail_conf["support"]) + + tmp = self._make_output( + head_item=head_cand, + tail_item=tail_cand, + group=group_name, + source="HGT", + support=supports, + trig_word=g_trig_words, + ) + + if ( + group_name in head_conf.get("rel_group", {}) + or head_conf["trigger"][cur_item["item"]]["rev_flag"] is True + ): + rev_tmp = self._reverse(tmp, head_conf.get("rel_group", {}).get(group_name, None)) + res_cand.extend(rev_tmp[:]) + if head_conf["trigger"][cur_item["item"]]["rev_flag"] is False: + res_cand.append(tmp.copy()) + + j = idx + 1 + last_end = idx + continue + + # Search left + if g_idx - i > len(g_trig_words): + tail_idx, tail_conf = self._find_tail( + range(g_start_idx, last_end, -1), head_conf["g_t_map"][group_name], head_type + ) + tail_item = self._all_items[tail_idx] + if tail_idx > 0: + del self._jux_buf[:] + _ = self._search_jux( + cur_item=tail_item, + cur_pos=0, + jux_type=tail_conf["main"], + status_flag=True, + search_list=self._all_items[i + 1 : tail_idx][::-1], + ) + tail_cand = self._jux_buf[:] + supports = self._find_supp(range(g_idx - 1, last_end, -1), tail_conf["support"]) + last_end = g_idx + + tmp = self._make_output( + head_item=head_cand, + tail_item=tail_cand, + group=group_name, + trig_word=g_trig_words, + source="HTG", + support=supports, + ) + + if ( + group_name in head_conf.get("rel_group", {}) + or head_conf["trigger"][cur_item["item"]]["rev_flag"] is True + ): + rev_tmp = self._reverse(tmp, head_conf.get("rel_group", {}).get(group_name, None)) + res_cand.extend(rev_tmp[:]) + if head_conf["trigger"][cur_item["item"]]["rev_flag"] is False: + res_cand.append(tmp.copy()) + continue + return res_cand + + +@dataclass +class DataCollatorGP: + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + label_maps: Optional[dict] = None + task_type: Optional[str] = None + + def __call__(self, features: List[Dict[str, Union[List[int], paddle.Tensor]]]) -> Dict[str, paddle.Tensor]: + new_features = [{k: v for k, v in f.items() if k not in ["offset_mapping", "text"]} for f in features] + + batch = 
self.tokenizer.pad( + new_features, + padding=self.padding, + ) + + batch = [paddle.to_tensor(batch[k]) for k in batch.keys()] + batch.append([feature["offset_mapping"] for feature in features]) + batch.append([feature["text"] for feature in features]) + return batch + + +@dataclass +class DataCollatorForErnieCtm: + tokenizer: PretrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + model: Optional[str] = "wordtag" + + def __call__(self, features: List[Dict[str, Union[List[int], paddle.Tensor]]]) -> Dict[str, paddle.Tensor]: + no_pad = "seq_len" if self.model == "wordtag" else "label_indices" + new_features = [{k: v for k, v in f.items() if k != no_pad} for f in features] + batch = self.tokenizer.pad( + new_features, + padding=self.padding, + ) + + batch = [paddle.to_tensor(batch[k]) for k in batch.keys()] + batch.append(paddle.to_tensor([f[no_pad] for f in features])) + return batch + + +def gp_decode(batch_outputs, offset_mappings, texts, label_maps, task_type="relation_extraction"): + if task_type == "entity_extraction": + batch_ent_results = [] + for entity_output, offset_mapping, text in zip(batch_outputs[0].numpy(), offset_mappings, texts): + entity_output[:, [0, -1]] -= np.inf + entity_output[:, :, [0, -1]] -= np.inf + entity_probs = F.softmax(paddle.to_tensor(entity_output), axis=1).numpy() + ent_list = [] + for l, start, end in zip(*np.where(entity_output > 0.0)): + ent_prob = entity_probs[l, start, end] + start, end = (offset_mapping[start][0], offset_mapping[end][-1]) + ent = { + "text": text[start:end], + "type": label_maps["id2entity"][str(l)], + "start_index": start, + "probability": ent_prob, + } + ent_list.append(ent) + batch_ent_results.append(ent_list) + return batch_ent_results + else: + batch_ent_results = [] + batch_rel_results = [] + for entity_output, head_output, tail_output, offset_mapping, text in zip( + batch_outputs[0].numpy(), + batch_outputs[1].numpy(), + batch_outputs[2].numpy(), + offset_mappings, + texts, + ): + entity_output[:, [0, -1]] -= np.inf + entity_output[:, :, [0, -1]] -= np.inf + entity_probs = F.softmax(paddle.to_tensor(entity_output), axis=1).numpy() + head_probs = F.softmax(paddle.to_tensor(head_output), axis=1).numpy() + tail_probs = F.softmax(paddle.to_tensor(tail_output), axis=1).numpy() + + ents = set() + ent_list = [] + for l, start, end in zip(*np.where(entity_output > 0.0)): + ent_prob = entity_probs[l, start, end] + ents.add((start, end)) + start, end = (offset_mapping[start][0], offset_mapping[end][-1]) + ent = { + "text": text[start:end], + "type": label_maps["id2entity"][str(l)], + "start_index": start, + "probability": ent_prob, + } + ent_list.append(ent) + batch_ent_results.append(ent_list) + + rel_list = [] + for sh, st in ents: + for oh, ot in ents: + p1s = np.where(head_output[:, sh, oh] > 0.0)[0] + p2s = np.where(tail_output[:, st, ot] > 0.0)[0] + ps = set(p1s) & set(p2s) + for p in ps: + rel_prob = head_probs[p, sh, oh] * tail_probs[p, st, ot] + if task_type == "relation_extraction": + rel = { + "subject": text[offset_mapping[sh][0] : offset_mapping[st][1]], + "predicate": label_maps["id2relation"][str(p)], + "object": text[offset_mapping[oh][0] : offset_mapping[ot][1]], + "subject_start_index": offset_mapping[sh][0], + "object_start_index": offset_mapping[oh][0], + "probability": rel_prob, + } + else: + rel = { + "aspect": text[offset_mapping[sh][0] : offset_mapping[st][1]], + "sentiment": label_maps["id2relation"][str(p)], + "opinion": text[offset_mapping[oh][0] : offset_mapping[ot][1]], + 
"aspect_start_index": offset_mapping[sh][0], + "opinion_start_index": offset_mapping[oh][0], + "probability": rel_prob, + } + rel_list.append(rel) + batch_rel_results.append(rel_list) + return (batch_ent_results, batch_rel_results) + + +DocSpan = namedtuple("DocSpan", ["start", "length"]) + +Example = namedtuple( + "Example", + [ + "keys", + "key_labels", + "doc_tokens", + "text", + "qas_id", + "model_type", + "seq_labels", + "ori_boxes", + "boxes", + "segment_ids", + "symbol_ids", + "im_base64", + "image_rois", + ], +) + +Feature = namedtuple( + "Feature", + [ + "unique_id", + "example_index", + "qas_id", + "doc_span_index", + "tokens", + "token_to_orig_map", + "token_is_max_context", + "token_ids", + "position_ids", + "text_type_ids", + "text_symbol_ids", + "overlaps", + "key_labels", + "seq_labels", + "se_seq_labels", + "bio_seq_labels", + "bioes_seq_labels", + "keys", + "model_type", + "doc_tokens", + "doc_labels", + "text", + "boxes", + "segment_ids", + "im_base64", + "image_rois", + ], +) + + +class Compose(object): + """compose""" + + def __init__(self, transforms, ctx=None): + """init""" + self.transforms = transforms + self.ctx = ctx + + def __call__(self, data): + """call""" + ctx = self.ctx if self.ctx else {} + for f in self.transforms: + try: + data = f(data, ctx) + except Exception as e: + stack_info = traceback.format_exc() + logger.warning("fail to map op [{}] with error: {} and stack:\n{}".format(f, e, str(stack_info))) + raise e + return data + + +def batch_arrange(batch_samples, fields): + def _segm(samples): + """""" + assert "gt_poly" in samples + segms = samples["gt_poly"] + if "is_crowd" in samples: + is_crowd = samples["is_crowd"] + if len(segms) != 0: + assert len(segms) == is_crowd.shape[0] + + gt_masks = [] + valid = True + for i in range(len(segms)): + segm = segms[i] + gt_segm = [] + if "is_crowd" in samples and is_crowd[i]: + gt_segm.append([[0, 0]]) + else: + for poly in segm: + if len(poly) == 0: + valid = False + break + gt_segm.append(np.array(poly).reshape(-1, 2)) + if (not valid) or len(gt_segm) == 0: + break + gt_masks.append(gt_segm) + return gt_masks + + def im_shape(samples, dim=3): + # hard code + assert "h" in samples + assert "w" in samples + if dim == 3: # RCNN, .. + return np.array((samples["h"], samples["w"], 1), dtype=np.float32) + else: # YOLOv3, .. + return np.array((samples["h"], samples["w"]), dtype=np.int32) + + arrange_batch = [] + for samples in batch_samples: + one_ins = () + for i, field in enumerate(fields): + if field == "gt_mask": + one_ins += (_segm(samples),) + elif field == "im_shape": + one_ins += (im_shape(samples),) + elif field == "im_size": + one_ins += (im_shape(samples, 2),) + else: + if field == "is_difficult": + field = "difficult" + assert field in samples, "{} not in samples".format(field) + one_ins += (samples[field],) + arrange_batch.append(one_ins) + return arrange_batch + + +class ProcessReader(object): + """ + Args: + dataset (DataSet): DataSet object + sample_transforms (list of BaseOperator): a list of sample transforms + operators. + batch_transforms (list of BaseOperator): a list of batch transforms + operators. + batch_size (int): batch size. + shuffle (bool): whether shuffle dataset or not. Default False. + drop_last (bool): whether drop last batch or not. Default False. + drop_empty (bool): whether drop sample when it's gt is empty or not. + Default True. + mixup_epoch (int): mixup epoc number. Default is -1, meaning + not use mixup. + cutmix_epoch (int): cutmix epoc number. 
Default is -1, meaning + not use cutmix. + class_aware_sampling (bool): whether use class-aware sampling or not. + Default False. + worker_num (int): number of working threads/processes. + Default -1, meaning not use multi-threads/multi-processes. + use_process (bool): whether use multi-processes or not. + It only works when worker_num > 1. Default False. + bufsize (int): buffer size for multi-threads/multi-processes, + please note, one instance in buffer is one batch data. + memsize (str): size of shared memory used in result queue when + use_process is true. Default 3G. + inputs_def (dict): network input definition use to get input fields, + which is used to determine the order of returned data. + devices_num (int): number of devices. + num_trainers (int): number of trainers. Default 1. + """ + + def __init__( + self, + dataset=None, + sample_transforms=None, + batch_transforms=None, + batch_size=None, + shuffle=False, + drop_last=False, + drop_empty=True, + mixup_epoch=-1, + cutmix_epoch=-1, + class_aware_sampling=False, + use_process=False, + use_fine_grained_loss=False, + num_classes=80, + bufsize=-1, + memsize="3G", + inputs_def=None, + devices_num=1, + num_trainers=1, + ): + """""" + self._fields = copy.deepcopy(inputs_def["fields"]) if inputs_def else None + + # transform + self._sample_transforms = Compose(sample_transforms, {"fields": self._fields}) + self._batch_transforms = None + + if batch_transforms: + batch_transforms = [bt for bt in batch_transforms] + self._batch_transforms = Compose(batch_transforms, {"fields": self._fields}) + + self._batch_size = batch_size + self._shuffle = shuffle + self._drop_last = drop_last + self._drop_empty = drop_empty + + # sampling + self._mixup_epoch = mixup_epoch // num_trainers + self._cutmix_epoch = cutmix_epoch // num_trainers + self._class_aware_sampling = class_aware_sampling + + self._indexes = None + self._pos = -1 + self._epoch = -1 + self._curr_iter = 0 + + def process(self, dataset): + """process""" + batch = self._load_batch(dataset) + res = self.worker(self._drop_empty, batch) + return res + + def _load_batch(self, dataset): + batch = [] + for data in dataset: + sample = copy.deepcopy(data) + batch.append(sample) + return batch + + def worker(self, drop_empty=True, batch_samples=None): + """ + sample transform and batch transform. + """ + batch = [] + for sample in batch_samples: + sample = self._sample_transforms(sample) + batch.append(sample) + if len(batch) > 0 and self._batch_transforms: + batch = self._batch_transforms(batch) + if len(batch) > 0 and self._fields: + batch = batch_arrange(batch, self._fields) + return batch + + +def pad_batch_data( + insts, + pad_idx=0, + max_seq_len=None, + return_pos=False, + return_input_mask=False, + return_max_len=False, + return_num_token=False, + return_seq_lens=False, + pad_2d_pos_ids=False, + pad_segment_id=False, + select=False, + extract=False, +): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) if max_seq_len is None else max_seq_len + # Any token included in dict can be used to pad, since the paddings' loss + # will be masked out by weights and make no effect on parameter gradients. 
+ if pad_2d_pos_ids: + boxes = [x + [[0, 0, 0, 0]] * (max_len - len(x)) for x in insts] + boxes = np.array(boxes, dtype="int64") + return boxes + + inst_data = np.array([inst + list([pad_idx] * (max_len - len(inst))) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, max_len, 1])] + + # position data + if return_pos: + inst_pos = np.array([list(range(0, len(inst))) + [pad_idx] * (max_len - len(inst)) for inst in insts]) + + return_list += [inst_pos.astype("int64").reshape([-1, max_len, 1])] + + if return_input_mask: + # This is used to avoid attention on paddings. + input_mask_data = np.array([[1] * len(inst) + [0] * (max_len - len(inst)) for inst in insts]) + input_mask_data = np.expand_dims(input_mask_data, axis=-1) + return_list += [input_mask_data.astype("float32")] + + if return_max_len: + return_list += [max_len] + + if return_num_token: + num_token = 0 + for inst in insts: + num_token += len(inst) + return_list += [num_token] + + if return_seq_lens: + seq_lens = np.array([len(inst) for inst in insts]) + return_list += [seq_lens.astype("int64").reshape([-1, 1])] + + return return_list if len(return_list) > 1 else return_list[0] + + +class ImageReader(object): + def __init__( + self, + super_rel_pos, + tokenizer, + max_key_len=16, + max_seq_len=512, + image_size=1024, + block_w=7, + block_h=7, + im_npos=224, + ): + self.tokenizer = tokenizer + self.vocab = self.tokenizer.get_vocab() + + self.pad_id = self.vocab["[PAD]"] + self.cls_id = self.vocab["[CLS]"] + self.sep_id = self.vocab["[SEP]"] + self.mask_id = self.vocab["[MASK]"] + self.pad = "[PAD]" + self.cls = "[CLS]" + self.sep = "[SEP]" + self.mask = "[MASK]" + + self.super_rel_pos = super_rel_pos + self.max_key_len = max_key_len + self.max_seq_len = max_seq_len + self.doc_stride = 128 + self.unique_id = 10000000 + + self.examples = {} + self.features = {} + + self.image_size = image_size + self.block_w = block_w + self.block_h = block_h + self.im_npos = im_npos + self.image_rois = [] + cut_width, cut_height = int(self.image_size / self.block_w), int(self.image_size / self.block_h) + for idh in range(self.block_h): + for idw in range(self.block_w): + self.image_rois.append([idw * cut_width, idh * cut_height, cut_width, cut_height]) + + sample_trans = [ + DecodeImage(), + ResizeImage(target_size=self.im_npos, interp=1), + NormalizeImage( + is_channel_first=False, + mean=[103.530, 116.280, 123.675], + std=[57.375, 57.120, 58.395], + ), + Permute(to_bgr=False), + ] + + batch_trans = [PadBatch(pad_to_stride=32, use_padded_im_info=True)] + + inputs_def = { + "fields": ["image", "im_info", "im_id", "gt_bbox"], + } + self.data_loader = ProcessReader( + sample_transforms=sample_trans, + batch_transforms=batch_trans, + shuffle=False, + drop_empty=True, + inputs_def=inputs_def, + ) + + def ppocr2example(self, ocr_res, img_path, querys): + examples = [] + segments = [] + for rst in ocr_res: + left = min(rst[0][0][0], rst[0][3][0]) + top = min(rst[0][0][-1], rst[0][1][-1]) + width = max(rst[0][1][0], rst[0][2][0]) - min(rst[0][0][0], rst[0][3][0]) + height = max(rst[0][2][-1], rst[0][3][-1]) - min(rst[0][0][-1], rst[0][1][-1]) + segments.append({"bbox": Bbox(*[left, top, width, height]), "text": rst[-1][0]}) + segments.sort(key=cmp_to_key(two_dimension_sort_layout)) + # 2. im_base64 + img_base64 = img2base64(img_path) + # 3. 
doc_tokens, doc_boxes, segment_ids + doc_tokens = [] + doc_boxes = [] + ori_boxes = [] + doc_segment_ids = [] + + im_w_box = max([seg["bbox"].left + seg["bbox"].width for seg in segments]) + 20 + im_h_box = max([seg["bbox"].top + seg["bbox"].height for seg in segments]) + 20 + img = Image.open(img_path) + im_w, im_h = img.size + im_w, im_h = max(im_w, im_w_box), max(im_h, im_h_box) + + scale_x = self.image_size / im_w + scale_y = self.image_size / im_h + for segment_id, segment in enumerate(segments): + bbox = segment["bbox"] # x, y, w, h + x1, y1, w, h = bbox.left, bbox.top, bbox.width, bbox.height + sc_w = int(min(w * scale_x, self.image_size - 1)) + sc_h = int(min(h * scale_y, self.image_size - 1)) + sc_y1 = int(max(0, min(y1 * scale_y, self.image_size - h - 1))) + sc_x1 = int(max(0, min(x1 * scale_x, self.image_size - w - 1))) + if w < 0: + raise ValueError("Incorrect bbox, please check the input word boxes.") + ori_bbox = Bbox(*[x1, y1, w, h]) + sc_bbox = Bbox(*[sc_x1, sc_y1, sc_w, sc_h]) + text = segment["text"] + char_num = [] + eng_word = "" + for char in text: + if not check(char) and not eng_word: + doc_tokens.append([char]) + doc_segment_ids.append([segment_id]) + char_num.append(2) + elif not check(char) and eng_word: + doc_tokens.append([eng_word]) + doc_segment_ids.append([segment_id]) + char_num.append(len(eng_word)) + eng_word = "" + doc_tokens.append([char]) + doc_segment_ids.append([segment_id]) + char_num.append(2) + else: + eng_word += char + if eng_word: + doc_tokens.append([eng_word]) + doc_segment_ids.append([segment_id]) + char_num.append(len(eng_word)) + ori_char_width = round(ori_bbox.width / sum(char_num), 1) + sc_char_width = round(sc_bbox.width / sum(char_num), 1) + for chr_idx in range(len(char_num)): + if chr_idx == 0: + doc_boxes.append( + [Bbox(*[sc_bbox.left, sc_bbox.top, (sc_char_width * char_num[chr_idx]), sc_bbox.height])] + ) + ori_boxes.append( + [Bbox(*[ori_bbox.left, ori_bbox.top, (ori_char_width * char_num[chr_idx]), ori_bbox.height])] + ) + else: + doc_boxes.append( + [ + Bbox( + *[ + sc_bbox.left + (sc_char_width * sum(char_num[:chr_idx])), + sc_bbox.top, + (sc_char_width * char_num[chr_idx]), + sc_bbox.height, + ] + ) + ] + ) + ori_boxes.append( + [ + Bbox( + *[ + ori_bbox.left + (ori_char_width * sum(char_num[:chr_idx])), + ori_bbox.top, + (ori_char_width * char_num[chr_idx]), + ori_bbox.height, + ] + ) + ] + ) + + qas_id = 0 + for query in querys: + example = Example( + keys=[query], + key_labels=[0], + doc_tokens=doc_tokens, + seq_labels=[0 for one in doc_tokens], + text="", + qas_id="0_" + str(qas_id), + model_type=None, + ori_boxes=ori_boxes, + boxes=doc_boxes, + segment_ids=doc_segment_ids, + symbol_ids=None, + image_rois=self.image_rois, + im_base64=img_base64, + ) + + examples.append(example) + qas_id += 1 + return examples + + def box2example(self, ocr_res, img_path, querys): + """ + ocr_res = [[word_str, [x1, y1, x2, y2]], [word_str, [x1, y1, x2, y2]], ...] 
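+
+        Example (illustrative values; ``reader`` is an ``ImageReader`` instance)::
+
+            ocr_res = [["发票代码", [50, 30, 180, 60]], ["No123456", [200, 30, 330, 60]]]
+            examples = reader.box2example(ocr_res, "invoice.png", ["发票代码是什么?"])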
+ """ + examples = [] + doc_boxes = [] + ori_boxes = [] + boxes = [x[1] for x in ocr_res] + im_w_box = max([b[2] for b in boxes]) + 20 + im_h_box = max([b[3] for b in boxes]) + 20 + img = Image.open(img_path) + im_w, im_h = img.size + im_w, im_h = max(im_w, im_w_box), max(im_h, im_h_box) + + scale_x = self.image_size / im_w + scale_y = self.image_size / im_h + for box in boxes: + x1, y1, x2, y2 = box + if x2 <= x1 or y2 <= y1: + raise ValueError("Invalid bbox format") + w = max(x1, x2) - min(x1, x2) + h = max(y1, y2) - min(y1, y2) + ori_boxes.append([Bbox(*[x1, y1, w, h])]) + w = int(min(w * scale_x, self.image_size - 1)) + h = int(min(h * scale_y, self.image_size - 1)) + x1 = int(max(0, min(x1 * scale_x, self.image_size - w - 1))) + y1 = int(max(0, min(y1 * scale_y, self.image_size - h - 1))) + if w < 0: + raise ValueError("Invalid bbox format") + doc_boxes.append([Bbox(*[x1, y1, w, h])]) + + img_base64 = img2base64(img_path) + + doc_tokens = [[x[0]] for x in ocr_res] + doc_segment_ids = [[0]] * len(doc_tokens) + + qas_id = 0 + for query in querys: + example = Example( + keys=[query], + key_labels=[0], + doc_tokens=doc_tokens, + seq_labels=[0 for one in doc_tokens], + text="", + qas_id=str(qas_id), + model_type=None, + ori_boxes=ori_boxes, + boxes=doc_boxes, + segment_ids=doc_segment_ids, + symbol_ids=None, + image_rois=self.image_rois, + im_base64=img_base64, + ) + + if not (len(example.doc_tokens) == len(example.boxes) == len(example.segment_ids)): + raise ValueError( + "Incorrect word_boxes, the format should be `List[str, Tuple[float, float, float, float]]`" + ) + + examples.append(example) + qas_id += 1 + + return examples + + def example2feature(self, example, tokenizer, max_line_id=128): + features = [] + all_doc_tokens = [] + tok_to_orig_index = [] + boxes = [] + segment_ids = [] + all_doc_labels = [] + + query_tokens = tokenizer.tokenize("&" + str(example.keys[0]))[1:][: self.max_key_len] + + for i, (token_list, box_list, seg_list, l) in enumerate( + zip(example.doc_tokens, example.boxes, example.segment_ids, example.seq_labels) + ): + assert len(token_list) == len(box_list) == len(seg_list) + for idt, (token, box, seg) in enumerate(zip(token_list, box_list, seg_list)): + sub_tokens = tokenizer.tokenize("&" + token)[1:] + for ii, sub_token in enumerate(sub_tokens): + width_split = box.width / len(sub_tokens) + boxes.append([box.left + ii * width_split, box.top, width_split, box.height]) + segment_ids.append(seg) + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + all_doc_labels.extend([0]) + + max_tokens_for_doc = self.max_seq_len - len(query_tokens) - 4 + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, self.doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + labels = [] + feature_segment_ids = [] + feature_boxes = [] + token_to_orig_map = {} + token_is_max_context = {} + text_type_ids = [] + tokens.append(self.cls) + feature_boxes.append([0, 0, 0, 0]) + labels.append(0) + text_type_ids.append(0) + feature_segment_ids.append(max_line_id - 1) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + is_max_context = 
self._check_is_max_context(doc_spans, doc_span_index, split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + + feature_boxes.append(boxes[split_token_index]) + feature_segment_ids.append(segment_ids[split_token_index]) + text_type_ids.append(0) + labels.append(all_doc_labels[split_token_index]) + + tokens.append(self.sep) + feature_boxes.append([0, 0, 0, 0]) + text_type_ids.append(0) + feature_segment_ids.append(max_line_id - 1) + labels.append(0) + for token in query_tokens: + tokens.append(token) + feature_boxes.append([0, 0, 0, 0]) + feature_segment_ids.append(max_line_id - 1) + text_type_ids.append(1) + labels.append(0) + + tokens = tokens + [self.sep] + feature_boxes.extend([[0, 0, 0, 0]]) + feature_segment_ids = feature_segment_ids + [max_line_id - 1] + text_type_ids = text_type_ids + [1] + labels.append(0) + + position_ids = list(range(len(tokens))) + token_ids = tokenizer.convert_tokens_to_ids(tokens) + feature_segment_ids = [x % max_line_id for x in feature_segment_ids] + + feature = Feature( + unique_id=self.unique_id, + example_index=0, + qas_id=example.qas_id, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + token_ids=token_ids, + position_ids=position_ids, + text_type_ids=text_type_ids, + text_symbol_ids=None, + overlaps=None, + keys=example.keys, + seq_labels=labels, + se_seq_labels=None, + bio_seq_labels=None, + bioes_seq_labels=None, + key_labels=example.key_labels, + model_type=example.model_type, + doc_tokens=example.doc_tokens, + doc_labels=example.seq_labels, + text=example.text, + boxes=feature_boxes, + segment_ids=feature_segment_ids, + im_base64=example.im_base64, + image_rois=example.image_rois, + ) + features.append(feature) + self.unique_id += 1 + return features + + def _pad_batch_records(self, batch_records, max_line_id=128, phase="infer"): + """pad batch records""" + return_list = [] + batch_token_ids = [] + batch_sent_ids = [] + batch_pos_ids = [] + batch_2d_pos_ids = [] + batch_segment_ids = [] + batch_labels = [] + batch_unique_id = [] + batch_image_base64 = [] + batch_image_rois = [] + + for i in range(len(batch_records)): + batch_token_ids.append(batch_records[i].token_ids) + batch_sent_ids.append(batch_records[i].text_type_ids) + batch_segment_ids.append(batch_records[i].segment_ids) + batch_labels.append(batch_records[i].seq_labels) + batch_unique_id.append(batch_records[i].unique_id) + batch_pos_ids.append(batch_records[i].position_ids) + batch_2d_pos_ids.append(batch_records[i].boxes) + batch_image_base64.append(batch_records[i].im_base64) + batch_image_rois.append(batch_records[i].image_rois) + + padded_token_ids, _ = pad_batch_data(batch_token_ids, pad_idx=self.pad_id, return_input_mask=True) + padded_sent_ids = pad_batch_data(batch_sent_ids, pad_idx=self.pad_id) + padded_pos_ids = pad_batch_data(batch_pos_ids, pad_idx=self.pad_id) + new_padded_pos_ids = [] + for idp, pos_ids in enumerate(padded_pos_ids): + new_padded_pos_ids.append( + np.concatenate((pos_ids, np.array([[x] for x in range(self.block_w * self.block_h)])), axis=0) + ) + padded_pos_ids = np.array(new_padded_pos_ids) + padded_2d_pos_ids = pad_batch_data(batch_2d_pos_ids, pad_2d_pos_ids=True, select=False, extract=True) + new_padded_2d_pos_ids = [] + for pos_ids_2d, batch_record in zip(padded_2d_pos_ids, batch_records): + new_padded_2d_pos_ids.append(np.concatenate((pos_ids_2d, np.array(batch_record.image_rois)), axis=0)) + 
padded_2d_pos_ids = np.array(new_padded_2d_pos_ids) + padded_segment_ids = pad_batch_data(batch_segment_ids, pad_idx=max_line_id - 1) + + input_mask_mat = self._build_input_mask( + np.array([list(x) + [[-1] for _ in range(self.block_w * self.block_h)] for x in padded_token_ids]) + ) + super_rel_pos = self._build_rel_pos( + np.array([list(x) + [[-1] for _ in range(self.block_w * self.block_h)] for x in padded_token_ids]) + ) + + unique_id = np.array(batch_unique_id).astype("float32").reshape([-1, 1]) + + bsz, seq_len, _ = padded_token_ids.shape + task_ids = np.ones((bsz, seq_len, 1)).astype("int64") + for b in range(bsz): + if np.sum(padded_2d_pos_ids[b]) > 0: + task_ids[b, :, :] = 0 + else: + task_ids[b, :, :] = 1 + + coco_data = self.generate_coco_data( + [""] * len(batch_image_base64), + batch_image_base64, + [self.image_size] * len(batch_image_base64), + [self.image_size] * len(batch_image_base64), + batch_image_rois, + ) + + image_data = self.im_make_batch( + self.data_loader.process(coco_data), + self.block_w * self.block_h, + len(batch_image_base64), + ) + + return_list = [ + padded_token_ids, + padded_sent_ids, + padded_pos_ids, + padded_2d_pos_ids, + padded_segment_ids, + task_ids, + input_mask_mat, + super_rel_pos, + unique_id, + image_data, + ] + return return_list + + def data_generator(self, ocr_res, img_path, querys, batch_size, ocr_type="ppocr", phase="infer"): + if ocr_type == "ppocr": + self.examples[phase] = self.ppocr2example(ocr_res, img_path, querys) + elif ocr_type == "word_boxes": + self.examples[phase] = self.box2example(ocr_res, img_path, querys) + self.features[phase] = sum([self.example2feature(e, self.tokenizer) for e in self.examples[phase]], []) + for batch_data in self._prepare_batch_data(self.features[phase], batch_size, phase=phase): + yield self._pad_batch_records(batch_data) + + def _prepare_batch_data(self, features, batch_size, phase=None): + """generate batch records""" + batch_records = [] + for feature in features: + to_append = len(batch_records) < batch_size + if to_append: + batch_records.append(feature) + else: + yield batch_records + batch_records = [feature] + + if phase == "infer" and batch_records: + yield batch_records + + def _build_input_mask(self, padded_token_ids): + """build_input_mask""" + bsz, seq_len, _ = padded_token_ids.shape + return np.ones((bsz, seq_len, seq_len)).astype("float32") + + def _build_rel_pos(self, padded_token_ids): + """build relative position""" + bsz, seq_len, _ = padded_token_ids.shape + rel_pos = np.zeros((bsz, seq_len, seq_len)).astype("int64") + return rel_pos + + def generate_coco_data( + self, + batch_image_path, + batch_image_base64, + batch_scaled_width, + batch_scaled_height, + batch_rois, + ): + """generator coco data""" + + def transform(dataset): + roidbs = [] + for i in dataset: + rvl_rec = { + "im_file": i[0], + "im_id": np.array([i[1]]), + "h": i[2], + "w": i[3], + "gt_bbox": i[4], + "cover_box": i[5], + "im_base64": i[6], + } + + roidbs.append(rvl_rec) + return roidbs + + result = [] + for image_path, im_base64, width, height, roi in zip( + batch_image_path, + batch_image_base64, + batch_scaled_width, + batch_scaled_height, + batch_rois, + ): + result.append((image_path, 0, height, width, roi, None, im_base64)) + return transform(result) + + def im_make_batch(self, dataset, image_boxes_nums, bsize): + """make image batch""" + img_batch = np.array([i[0] for i in dataset], "float32") + return img_batch + + def BIO2SPAN(self, BIO): + start_label, end_label = [], [] + for seq in BIO: + first_one = True 
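+            # Labels use the OIB scheme (0: O, 1: I, 2: B): start positions come from B tags,
+            # falling back to the first I tag when a sequence has I labels but no B.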
+ start_pos = [1 if x == 2 else 0 for x in seq] + if sum(start_pos) == 0 and sum(seq) != 0: + start_pos = [] + for idp, p in enumerate(seq): + if p == 1 and first_one: + start_pos.append(1) + first_one = False + else: + start_pos.append(0) + + start_label.append(start_pos) + + end_tmp = [] + for index, s in enumerate(seq): + if s == -100 or s == 0: + end_tmp.append(s) + elif s == 2 and index + 1 < len(seq) and (seq[index + 1] == 0 or seq[index + 1] == 2): + end_tmp.append(1) + elif s == 2 and index + 1 < len(seq) and seq[index + 1] != 0: + end_tmp.append(0) + elif s == 2 and index + 1 == len(seq): + end_tmp.append(1) + elif s == 1 and (index + 1 == len(seq) or seq[index + 1] != 1): + end_tmp.append(1) + else: + end_tmp.append(0) + end_label.append(end_tmp) + + return start_label, end_label + + def _check_is_max_context(self, doc_spans, cur_span_index, position): + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + return cur_span_index == best_span_index + + +def get_doc_pred(result, ans_pos, example, tokenizer, feature, do_lower_case, all_key_probs, example_index): + def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if len(scores) == 0: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + preds = [] + for start_index, end_index in ans_pos: + # process data + tok_tokens = feature.tokens[start_index : end_index + 1] + tok_text = " ".join(tok_tokens) + # De-tokenize WordPieces that have been split off. 
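+        # e.g. ["in", "##voice"] -> "in ##voice" -> "invoice"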
+ tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + tok_text = tok_text.strip() + tok_text = "".join(tok_text.split()) + + orig_doc_start = feature.token_to_orig_map[start_index] + orig_doc_end = feature.token_to_orig_map[end_index] + orig_tokens = example.doc_tokens[orig_doc_start : orig_doc_end + 1] + + # Clean whitespace + orig_text = "".join(["".join(x) for x in orig_tokens]) + final_text = get_final_text(tok_text, orig_text, tokenizer, do_lower_case) + + probs = [] + for idx, logit in enumerate(result.seq_logits[start_index : end_index + 1]): + if idx == 0: + # -1 is for B in OIB or I in OI + probs.append(_compute_softmax(logit)[-1]) + else: + # 1 is for I in OIB or I in OI + probs.append(_compute_softmax(logit)[1]) + avg_prob = sum(probs) / len(probs) + preds.append({"value": final_text, "prob": round(avg_prob, 2), "start": orig_doc_start, "end": orig_doc_end}) + return preds + + +def get_final_text(pred_text, orig_text, tokenizer, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
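+    # If any of these position lookups fail, the heuristic gives up and returns `orig_text` unchanged.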
+ tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + return orig_text + + output_text = orig_text[orig_start_position : (orig_end_position + 1)] + return output_text + + +def find_bio_pos(label): + """find answer position from BIO label""" + e = [] + cand_ans = [] + last_l = None + for idx, l in enumerate(label): + if l == "O": + if e: + cand_ans.append([e[0], e[-1]]) + e = [] + elif l.startswith("B"): + if last_l == "O" or last_l is None: + if len(e) != 0: + e = [] + e.append(idx) + else: # I B + if e: + cand_ans.append([e[0], e[-1]]) + e = [] + e.append(idx) + elif l.startswith("I"): + if len(e) == 0: + continue + else: + e.append(idx) + last_l = l + if e: + cand_ans.append([e[0], e[-1]]) + return cand_ans + + +def viterbi_decode(logits): + np_logits = np.array(logits) # shape: L * D + length, dim = np_logits.shape + f = np.zeros(np_logits.shape) + path = [["" for i in range(dim)] for j in range(length)] + label_scheme = "OIB" + # oib label 0:O, 1:I, 2:B + # illegal matrix: [O, I ,B, start, end] * [O, I, B, start, end] + illegal = np.array([[0, -1, 0, -1, 0], [0, 0, 0, -1, 0], [0, 0, 0, 0, 0], [0, -1, 0, 0, 0], [-1, -1, -1, -1, -1]]) + illegal = illegal * 1000 + + f[0, :] = np_logits[0, :] + illegal[3, :3] + path[0] = [label_scheme[i] for i in range(dim)] + + for step in range(1, length): + last_s = f[step - 1, :] + for d in range(dim): + cand_score = illegal[:3, d] + last_s + np_logits[step, d] + f[step, d] = np.max(cand_score) + path[step][d] = path[step - 1][np.argmax(cand_score)] + label_scheme[d] + final_path = path[-1][np.argmax(f[-1, :])] + return final_path + + +def find_answer_pos(logits, feature): + start_index = -1 + end_index = -1 + ans = [] + cand_ans = [] + + best_path = viterbi_decode(logits) + cand_ans = find_bio_pos(best_path) + + for start_index, end_index in cand_ans: + is_valid = True + if start_index not in feature.token_to_orig_map: + is_valid = False + if end_index not in feature.token_to_orig_map: + is_valid = False + if not feature.token_is_max_context.get(start_index, False): + is_valid = False + if end_index < start_index: + is_valid = False + if is_valid: + ans.append([start_index, end_index]) + + return ans + + +def calEuclidean(x_list, y_list): + """ + Calculate euclidean distance + """ + if x_list is None or y_list is None: + return None + else: + dist = np.sqrt(np.square(x_list[0] - y_list[0]) + np.square(x_list[1] - y_list[1])) + return dist + + +def longestCommonSequence(question_tokens, context_tokens): + """ + Longest common sequence + """ + max_index = -1 + max_len = 0 + m, n = len(question_tokens), len(context_tokens) + dp = [[0] * (n + 1) for _ in range(m + 1)] + for i in range(1, m + 1): + for j in range(1, n + 1): + if question_tokens[i - 1].lower() == context_tokens[j - 1][0].lower(): + dp[i][j] = 1 + dp[i - 1][j - 1] + if dp[i][j] > max_len: + max_len = dp[i][j] + max_index = j - 1 + return max_index, max_len + + +def sort_res(prompt, ans_list, context, boxes, 
lang="en"): + if len(ans_list) == 1: + return ans_list + else: + ans_val = [] + for ans in ans_list: + ans_val.append(ans["value"]) + if len(set(ans_val)) == len(ans_val): + sorted_ans_list = sorted(ans_list, key=lambda x: x["prob"], reverse=True) + return sorted_ans_list + else: + if lang == "en": + clean_prompt = [word for word in prompt.split(" ")] + else: + clean_prompt = [word for word in prompt] + + max_index, max_len = longestCommonSequence(clean_prompt, context) + if max_index == -1: + sorted_ans_list = sorted(ans_list, key=lambda x: x["prob"], reverse=True) + return sorted_ans_list + else: + prompt_center = [] + for idx in range(max_index - max_len + 1, max_index + 1): + box = boxes[idx][0] + x = box.left + box.width / 2 + y = box.top + box.height / 2 + prompt_center.append([x, y]) + + ans_center = [] + ans_prob = [] + for ans in ans_list: + ans_prob.append(ans["prob"]) + cent_list = [] + for idx in range(ans["start"], ans["end"] + 1): + box = boxes[idx][0] + x = box.left + box.width / 2 + y = box.top + box.height / 2 + cent_list.append([x, y]) + ans_center.append(cent_list) + + ans_odist = [] + for ans_c in ans_center: + odist = 0 + for a_c in ans_c: + for p_c in prompt_center: + odist += calEuclidean(a_c, p_c) + odist /= len(ans_c) + ans_odist.append(odist * (-1)) + + ans_score = np.sum([ans_prob, ans_odist], axis=0).tolist() + sorted_ans_list = sorted(ans_list, key=lambda x: ans_score[ans_list.index(x)], reverse=True) + return sorted_ans_list diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/word_segmentation.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/word_segmentation.py new file mode 100644 index 000000000..1bb3974b9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/word_segmentation.py @@ -0,0 +1,173 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import jieba + +from .lexical_analysis import LacTask +from .named_entity_recognition import NERWordTagTask +from .task import Task + +usage = r""" + from paddlenlp import Taskflow + + # Taskflow base模式 + seg = Taskflow("word_segmentation") + seg("第十四届全运会在西安举办") + ''' + ['第十四届', '全运会', '在', '西安', '举办'] + ''' + + seg(["第十四届全运会在西安举办", "三亚是一个美丽的城市"]) + ''' + [['第十四届', '全运会', '在', '西安', '举办'], ['三亚', '是', '一个', '美丽', '的', '城市']] + ''' + + # 快速模式分词 + seg = Taskflow("word_segmentation", mode="fast") + seg("第十四届全运会在西安举办") + ''' + ['第十四届', '全运会', '在', '西安', '举办'] + ''' + + # 精确模式分词 + seg = Taskflow("word_segmentation", mode="accurate") + seg("李伟拿出具有科学性、可操作性的《陕西省高校管理体制改革实施方案》") + ''' + ['李伟', '拿出', '具有', '科学性', '、', '可操作性', '的', '《', '陕西省高校管理体制改革实施方案', '》'] + ''' + """ + + +class SegJiebaTask(Task): + """ + Word Segmentation task for the raw text. + Args: + task(string): The name of task. + model(string): The model name in the task. + user_dict(string): The user-defined dictionary, default to None. 
+ kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, user_dict=None, **kwargs): + super().__init__(task=task, model=model, **kwargs) + self._user_dict = user_dict + if self._user_dict: + jieba.load_userdict(user_dict) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + return None + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + return None + + def _construct_tokenizer(self, model): + """ + Construct the tokenizer for the predictor. + """ + return None + + def _preprocess(self, inputs): + inputs = self._check_input_text(inputs) + return inputs + + def _postprocess(self, inputs): + results = inputs if len(inputs) > 1 else inputs[0] + return results + + def _run_model(self, inputs): + def cut(string): + return jieba.lcut(string) + + results = list(map(cut, inputs)) + return results + + +class SegLACTask(LacTask): + """ + Segement the sentences to the words using LAC mode. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. + """ + + def __init__(self, task, model, **kwargs): + super().__init__(task=task, model="lac", **kwargs) + + def _postprocess(self, inputs): + """ + The model output is the tag ids, this function will convert the model output to raw text. + """ + lengths = inputs["lens"] + preds = inputs["result"] + sents = inputs["text"] + final_results = [] + for sent_index in range(len(lengths)): + tags = [self._id2tag_dict[str(index)] for index in preds[sent_index][: lengths[sent_index]]] + sent = sents[sent_index] + if self._custom: + self._custom.parse_customization(sent, tags) + sent_out = [] + tags_out = [] + parital_word = "" + for ind, tag in enumerate(tags): + if parital_word == "": + parital_word = sent[ind] + tags_out.append(tag.split("-")[0]) + continue + if tag.endswith("-B") or (tag == "O" and tags[ind - 1] != "O"): + sent_out.append(parital_word) + tags_out.append(tag.split("-")[0]) + parital_word = sent[ind] + continue + parital_word += sent[ind] + + if len(sent_out) < len(tags_out): + sent_out.append(parital_word) + final_results.append(sent_out) + final_results = self._auto_joiner(final_results, self.input_mapping) + final_results = final_results if len(final_results) > 1 else final_results[0] + return final_results + + +class SegWordTagTask(NERWordTagTask): + """ + Segement the sentences to the words using WordTag model. + Args: + task(string): The name of task. + model(string): The model name in the task. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ + """ + + def __init__(self, model, task, **kwargs): + super().__init__(model="wordtag", task=task, **kwargs) + + def _simplify_result(self, results): + simple_results = [] + for result in results: + simple_result = [] + if "items" in result: + for item in result["items"]: + simple_result.append(item["item"]) + simple_results.append(simple_result) + simple_results = simple_results[0] if len(simple_results) == 1 else simple_results + return simple_results diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/zero_shot_text_classification.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/zero_shot_text_classification.py new file mode 100644 index 000000000..43d9f8ff5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/taskflow/zero_shot_text_classification.py @@ -0,0 +1,427 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Union + +import numpy as np +from paddle.static import InputSpec +from scipy.special import expit as np_sigmoid +from scipy.special import softmax as np_softmax + +from ..prompt import PromptDataCollatorWithPadding, UTCTemplate +from ..transformers import UTC, AutoTokenizer +from .task import Task +from .utils import static_mode_guard + +usage = r""" + from paddlenlp import Taskflow + + schema = ['这是一条差评', '这是一条好评'] + text_cls = Taskflow("zero_shot_text_classification", schema=schema) + text_cls('房间干净明亮,非常不错') + ''' + [{'predictions': [{'label': '这是一条好评', 'score': 0.9695149765679986}], 'text_a': '房间干净明亮,非常不错'}] + ''' + """ + + +class ZeroShotTextClassificationTask(Task): + """ + Zero-shot Universial Text Classification Task. + + Args: + task (string): The name of task. + model (string): The model_name in the task. + schema (list): List of candidate labels. + kwargs (dict, optional): Additional keyword arguments passed along to the specific task. 
+ """ + + resource_files_names = { + "model_state": "model_state.pdparams", + "config": "config.json", + "vocab_file": "vocab.txt", + "special_tokens_map": "special_tokens_map.json", + "tokenizer_config": "tokenizer_config.json", + } + resource_files_urls = { + "utc-xbase": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-xbase/model_state.pdparams", + "e751c3a78d4caff923759c0d0547bfe6", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-xbase/config.json", + "4c2b035c71ff226a14236171a1a202a4", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-xbase/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-xbase/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-xbase/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-base": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-base/model_state.pdparams", + "72089351c6fb02bcf8f270fe0cc508e9", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-base/config.json", + "79aa9a69286604436937b03f429f4d34", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-base/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-base/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-base/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-medium": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-medium/model_state.pdparams", + "2802c766a8b880aad910dd5a7db809ae", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-medium/config.json", + "2899cd7c8590dcdc4223e4b1262e2f4e", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-medium/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-medium/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-medium/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-micro": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-micro/model_state.pdparams", + "d9ebdfce9a8c6ebda43630ed18b07c58", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-micro/config.json", + "8c8da9337e09e0c3962196987dca18bd", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-micro/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-micro/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + 
"https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-micro/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-mini": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-mini/model_state.pdparams", + "848a2870cd51bfc22174a2a38884085c", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-mini/config.json", + "933b8ebfcf995b1f965764ac426a2ffa", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-mini/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-mini/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-mini/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-nano": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-nano/model_state.pdparams", + "2bd31212d989619148eda3afebc7354d", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-nano/config.json", + "02fe311fdcc127e56ff0975038cc4d65", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-nano/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-nano/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-nano/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-pico": { + "model_state": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-pico/model_state.pdparams", + "f7068d63ad2930de7ac850d475052946", + ], + "config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-pico/config.json", + "c0c7412cdd070edb5a1ce70c7fc68ad3", + ], + "vocab_file": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-pico/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-pico/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://paddlenlp.bj.bcebos.com/taskflow/zero_shot_text_classification/utc-pico/tokenizer_config.json", + "be86466f6769fde498690269d099ea7c", + ], + }, + "utc-large": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/taskflow/zero_shot_text_classification/utc-large/model_state.pdparams", + "71eb9a732c743a513b84ca048dc4945b", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/zero_shot_text_classification/utc-large/config.json", + "9496be2cc99f7e6adf29280320274142", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/taskflow/zero_text_classification/utc-large/vocab.txt", + "afc01b5680a53525df5afd7518b42b48", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/taskflow/zero_text_classification/utc-large/special_tokens_map.json", + "2458e2131219fc1f84a6e4843ae07008", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/taskflow/zero_text_classification/utc-large/tokenizer_config.json", + "dcb0f3257830c0eb1f2de47f2d86f89a", + ], + }, + 
"__internal_testing__/tiny-random-utc": { + "model_state": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-utc/model_state.pdparams", + "d303b59447be690530c35c73f8fd03cd", + ], + "config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-utc/config.json", + "3420a6638a7c73c6239eb1d7ca1bc5fe", + ], + "vocab_file": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-utc/vocab.txt", + "97eb0ec5a5890c8190e10e251af2e133", + ], + "special_tokens_map": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-utc/special_tokens_map.json", + "8b3fb1023167bb4ab9d70708eb05f6ec", + ], + "tokenizer_config": [ + "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-utc/tokenizer_config.json", + "258fc552c15cec90046066ca122899e2", + ], + }, + } + + def __init__(self, task: str, model: str, schema: list = None, **kwargs): + super().__init__(task=task, model=model, **kwargs) + + self._set_utc_schema(schema) + self._max_seq_len = kwargs.get("max_seq_len", 512) + self._batch_size = kwargs.get("batch_size", 1) + self._pred_threshold = kwargs.get("pred_threshold", 0.5) + self._num_workers = kwargs.get("num_workers", 0) + self._single_label = kwargs.get("single_label", False) + + self._check_task_files() + self._construct_tokenizer() + self._check_predictor_type() + self._get_inference_model() + + def _set_utc_schema(self, schema): + if schema is None: + self._choices = None + elif isinstance(schema, list): + self._choices = schema + elif isinstance(schema, dict) and len(schema) == 1: + for key in schema: + self._choices = schema[key] + else: + raise ValueError(f"Invalid schema: {schema}.") + + def set_schema(self, schema): + self._set_utc_schema(schema) + + def _construct_input_spec(self): + """ + Construct the input spec for the convert dygraph model to static model. + """ + self._input_spec = [ + InputSpec(shape=[None, None], dtype="int64", name="input_ids"), + InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"), + InputSpec(shape=[None, None], dtype="int64", name="position_ids"), + InputSpec(shape=[None, None, None, None], dtype="float32", name="attention_mask"), + InputSpec(shape=[None, None], dtype="int64", name="omask_positions"), + InputSpec(shape=[None], dtype="int64", name="cls_positions"), + ] + + def _construct_model(self, model): + """ + Construct the inference model for the predictor. + """ + model_instance = UTC.from_pretrained(self._task_path, from_hf_hub=self.from_hf_hub) + self._model = model_instance + self._model.eval() + + def _construct_tokenizer(self): + """ + Construct the tokenizer for the predictor. 
+ """ + self._tokenizer = AutoTokenizer.from_pretrained(self._task_path, from_hf_hub=self.from_hf_hub) + self._collator = PromptDataCollatorWithPadding(self._tokenizer, return_tensors="np") + self._template = UTCTemplate(self._tokenizer, self._max_seq_len) + + def _check_input_text(self, inputs): + inputs = inputs[0] + if isinstance(inputs, str) or isinstance(inputs, dict): + inputs = [inputs] + + if isinstance(inputs, list): + input_list = [] + for example in inputs: + data = {"text_a": "", "text_b": "", "choices": self._choices} + if isinstance(example, dict): + for k in example: + if k in data: + data[k] = example[k] + elif isinstance(example, str): + data["text_a"] = example + data["text_b"] = "" + elif isinstance(example, list): + for x in example: + if not isinstance(x, str): + raise ValueError("Invalid inputs, input text should be strings.") + data["text_a"] = example[0] + data["text_b"] = "".join(example[1:]) if len(example) > 1 else "" + else: + raise ValueError( + "Invalid inputs, the input should be {'text_a': a, 'text_b': b}, a text or a list of text." + ) + + if len(data["text_a"]) < 1 and len(data["text_b"]) < 1: + raise ValueError("Invalid inputs, input `text_a` and `text_b` are both missing or empty.") + if not isinstance(data["choices"], list) or len(data["choices"]) < 2: + raise ValueError("Invalid inputs, label candidates should be a list with length >= 2.") + input_list.append(data) + else: + raise TypeError("Invalid input format!") + return input_list + + def _preprocess(self, inputs: Union[str, List[str]]) -> Dict[str, Any]: + """ + Transform the raw text to the model inputs, two steps involved: + 1) Transform the raw text to token ids. + 2) Generate the other model inputs from the raw text and token ids. + """ + inputs = self._check_input_text(inputs) + # Get the config from the kwargs + tokenized_inputs = [self._template(i) for i in inputs] + batches = [ + tokenized_inputs[idx : idx + self._batch_size] for idx in range(0, len(tokenized_inputs), self._batch_size) + ] + inputs = [inputs[idx : idx + self._batch_size] for idx in range(0, len(inputs), self._batch_size)] + outputs = {} + outputs["text"] = inputs + outputs["batches"] = [self._collator(batch) for batch in batches] + + return outputs + + def _run_model(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + outputs = {} + outputs["text"] = inputs["text"] + outputs["batch_logits"] = [] + dtype_dict = { + "input_ids": "int64", + "token_type_ids": "int64", + "position_ids": "int64", + "attention_mask": "float32", + "omask_positions": "int64", + "cls_positions": "int64", + } + with static_mode_guard(): + for batch in inputs["batches"]: + if self._predictor_type == "paddle-inference": + for i, input_name in enumerate(self.input_names): + self.input_handles[i].copy_from_cpu(batch[input_name].astype(dtype_dict[input_name])) + self.predictor.run() + logits = self.output_handle[0].copy_to_cpu().tolist() + else: + input_dict = {} + for input_name in dtype_dict: + input_dict[input_name] = batch[input_name].astype(dtype_dict[input_name]) + logits = self.predictor.run(None, input_dict)[0].tolist() + outputs["batch_logits"].append(logits) + + return outputs + + def _postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """ + This function converts the model logits output to class score and predictions + """ + outputs = [] + for batch_text, batch_logits in zip(inputs["text"], inputs["batch_logits"]): + for text, logits in zip(batch_text, batch_logits): + output = {} + if len(text["text_a"]) > 0: + output["text_a"] = 
text["text_a"] + if len(text["text_b"]) > 0: + output["text_b"] = text["text_b"] + + if self._single_label: + score = np_softmax(logits, axis=-1) + label = np.argmax(logits, axis=-1) + output["predictions"] = [{"label": text["choices"][label], "score": score[label]}] + else: + scores = np_sigmoid(logits) + output["predictions"] = [] + if scores.ndim == 2: + scores = scores[0] + for i, class_score in enumerate(scores): + if class_score > self._pred_threshold: + output["predictions"].append({"label": text["choices"][i], "score": class_score}) + outputs.append(output) + + return outputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/__init__.py new file mode 100644 index 000000000..f74eadb40 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you smay not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .argparser import * +from .compression_args import * +from .plugins.timer import * +from .trainer import * +from .trainer_callback import * +from .trainer_compress import * +from .trainer_seq2seq import * +from .trainer_utils import * +from .training_args import * +from .training_args_seq2seq import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/argparser.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/argparser.py new file mode 100644 index 000000000..58bbe45dc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/argparser.py @@ -0,0 +1,296 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# # Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers/hf_argparser.py + +import dataclasses +import json +import sys +from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError +from copy import copy +from enum import Enum +from inspect import isclass +from pathlib import Path +from typing import Any, Dict, Iterable, NewType, Optional, Tuple, Union, get_type_hints + +DataClass = NewType("DataClass", Any) +DataClassType = NewType("DataClassType", Any) + +__all__ = [ + "PdArgumentParser", + "strtobool", +] + + +# From https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ArgumentTypeError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." + ) + + +class PdArgumentParser(ArgumentParser): + """ + This subclass of `argparse.ArgumentParser` uses type hints on dataclasses to generate arguments. + + The class is designed to play well with the native argparse. In particular, you can add more (non-dataclass backed) + arguments to the parser after initialization and you'll get the output back after parsing as an additional + namespace. Optional: To create sub argument groups use the `_argument_group_name` attribute in the dataclass. + """ + + dataclass_types: Iterable[DataClassType] + + def __init__(self, dataclass_types: Union[DataClassType, Iterable[DataClassType]], **kwargs): + """ + Args: + dataclass_types: + Dataclass type, or list of dataclass types for which we will "fill" instances with the parsed args. + kwargs: + (Optional) Passed to `argparse.ArgumentParser()` in the regular way. + """ + # To make the default appear when using --help + if "formatter_class" not in kwargs: + kwargs["formatter_class"] = ArgumentDefaultsHelpFormatter + super().__init__(**kwargs) + if dataclasses.is_dataclass(dataclass_types): + dataclass_types = [dataclass_types] + self.dataclass_types = list(dataclass_types) + for dtype in self.dataclass_types: + self._add_dataclass_arguments(dtype) + + @staticmethod + def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): + field_name = f"--{field.name}" + kwargs = field.metadata.copy() + # field.metadata is not used at all by Data Classes, + # it is provided as a third-party extension mechanism. 
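+        # For example, a hypothetical field `learning_rate: float = field(default=3e-5,
+        # metadata={"help": "Peak learning rate."})` becomes `--learning_rate` with type=float,
+        # default=3e-5, and the metadata passed through to `add_argument` as keyword arguments.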
+ if isinstance(field.type, str): + raise RuntimeError( + "Unresolved type detected, which should have been done with the help of " + "`typing.get_type_hints` method by default" + ) + + origin_type = getattr(field.type, "__origin__", field.type) + if origin_type is Union: + if len(field.type.__args__) != 2 or type(None) not in field.type.__args__: + raise ValueError("Only `Union[X, NoneType]` (i.e., `Optional[X]`) is allowed for `Union`") + if bool not in field.type.__args__: + # filter `NoneType` in Union (except for `Union[bool, NoneType]`) + field.type = ( + field.type.__args__[0] if isinstance(None, field.type.__args__[1]) else field.type.__args__[1] + ) + origin_type = getattr(field.type, "__origin__", field.type) + + # A variable to store kwargs for a boolean field, if needed + # so that we can init a `no_*` complement argument (see below) + bool_kwargs = {} + if isinstance(field.type, type) and issubclass(field.type, Enum): + kwargs["choices"] = [x.value for x in field.type] + kwargs["type"] = type(kwargs["choices"][0]) + if field.default is not dataclasses.MISSING: + kwargs["default"] = field.default + else: + kwargs["required"] = True + # fix https://github.com/huggingface/transformers/pull/16946 + elif field.type is bool or field.type == Optional[bool]: + # Copy the currect kwargs to use to instantiate a `no_*` complement argument below. + # We do not initialize it here because the `no_*` alternative must be instantiated after the real argument + bool_kwargs = copy(kwargs) + + # Hack because type=bool in argparse does not behave as we want. + kwargs["type"] = strtobool + if field.type is bool or (field.default is not None and field.default is not dataclasses.MISSING): + # Default value is False if we have no default when of type bool. + default = False if field.default is dataclasses.MISSING else field.default + # This is the value that will get picked if we don't include --field_name in any way + kwargs["default"] = default + # This tells argparse we accept 0 or 1 value after --field_name + kwargs["nargs"] = "?" + # This is the value that will get picked if we do --field_name (without value) + kwargs["const"] = True + elif isclass(origin_type) and issubclass(origin_type, list): + kwargs["type"] = field.type.__args__[0] + kwargs["nargs"] = "+" + if field.default_factory is not dataclasses.MISSING: + kwargs["default"] = field.default_factory() + elif field.default is dataclasses.MISSING: + kwargs["required"] = True + else: + kwargs["type"] = json.loads if field.type is dict else field.type + if field.default is not dataclasses.MISSING: + kwargs["default"] = field.default + elif field.default_factory is not dataclasses.MISSING: + kwargs["default"] = field.default_factory() + else: + kwargs["required"] = True + parser.add_argument(field_name, **kwargs) + + # Add a complement `no_*` argument for a boolean field AFTER the initial field has already been added. + # Order is important for arguments with the same destination! + # We use a copy of earlier kwargs because the original kwargs have changed a lot before reaching down + # here and we do not need those changes/additional keys. 
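+        # e.g. a hypothetical field `do_eval: bool = True` also gets `--no_do_eval`, which stores False.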
+ if field.default is True and (field.type is bool or field.type == Optional[bool]): + bool_kwargs["default"] = False + parser.add_argument(f"--no_{field.name}", action="store_false", dest=field.name, **bool_kwargs) + + def _add_dataclass_arguments(self, dtype: DataClassType): + if hasattr(dtype, "_argument_group_name"): + parser = self.add_argument_group(dtype._argument_group_name) + else: + parser = self + + try: + type_hints: Dict[str, type] = get_type_hints(dtype) + except NameError: + raise RuntimeError( + f"Type resolution failed for f{dtype}. Try declaring the class in global scope or " + f"removing line of `from __future__ import annotations` which opts in Postponed " + f"Evaluation of Annotations (PEP 563)" + ) + + for field in dataclasses.fields(dtype): + if not field.init: + continue + field.type = type_hints[field.name] + self._parse_dataclass_field(parser, field) + + def parse_args_into_dataclasses( + self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None + ) -> Tuple[DataClass, ...]: + """ + Parse command-line args into instances of the specified dataclass types. + + This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: + docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args + + Args: + args: + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + return_remaining_strings: + If true, also return a list of remaining argument strings. + look_for_args_file: + If true, will look for a ".args" file with the same base name as the entry point script for this + process, and will append its potential content to the command line args. + args_filename: + If not None, will uses this file instead of the ".args" file specified in the previous argument. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser + after initialization. + - The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args) + """ + if args_filename or (look_for_args_file and len(sys.argv)): + if args_filename: + args_file = Path(args_filename) + else: + args_file = Path(sys.argv[0]).with_suffix(".args") + + if args_file.exists(): + fargs = args_file.read_text().split() + args = fargs + args if args is not None else fargs + sys.argv[1:] + # in case of duplicate arguments the first one has precedence + # so we append rather than prepend. + + return self.common_parse(args, return_remaining_strings) + + def common_parse(self, args, return_remaining_strings) -> Tuple[DataClass, ...]: + namespace, remaining_args = self.parse_known_args(args=args) + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in vars(namespace).items() if k in keys} + for k in keys: + delattr(namespace, k) + obj = dtype(**inputs) + outputs.append(obj) + if len(namespace.__dict__) > 0: + # additional namespace. 
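+            # (extra arguments registered on the parser after init, i.e. not backed by a dataclass)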
+ outputs.append(namespace) + if return_remaining_strings: + return (*outputs, remaining_args) + else: + if remaining_args: + raise ValueError(f"Some specified arguments are not used by the PdArgumentParser: {remaining_args}") + + return (*outputs,) + + def read_json(self, json_file: str) -> list: + json_file = Path(json_file) + if json_file.exists(): + with open(json_file, "r") as file: + data = json.load(file) + json_args = [] + for key, value in data.items(): + if isinstance(value, list): + json_args.extend([f"--{key}", *[str(v) for v in value]]) + elif isinstance(value, dict): + json_args.extend([f"--{key}", json.dumps(value)]) + else: + json_args.extend([f"--{key}", str(value)]) + return json_args + else: + raise FileNotFoundError(f"The argument file {json_file} does not exist.") + + def parse_json_file(self, json_file: str, return_remaining_strings=False) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the + dataclass types. + """ + json_args = self.read_json(json_file) + return self.common_parse(json_args, return_remaining_strings) + + def parse_json_file_and_cmd_lines(self, return_remaining_strings=False) -> Tuple[DataClass, ...]: + """ + Extend the functionality of `parse_json_file` to handle command line arguments in addition to loading a JSON + file. + + When there is a conflict between the command line arguments and the JSON file configuration, + the command line arguments will take precedence. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + """ + if not sys.argv[1].endswith(".json"): + raise ValueError(f"The first argument should be a JSON file, but it is {sys.argv[1]}") + json_args = self.read_json(sys.argv[1]) + # In case of conflict, command line arguments take precedence + args = json_args + sys.argv[2:] + return self.common_parse(args, return_remaining_strings) + + def parse_dict(self, args: dict) -> Tuple[DataClass, ...]: + """ + Alternative helper method that does not use `argparse` at all, instead uses a dict and populating the dataclass + types. + """ + outputs = [] + for dtype in self.dataclass_types: + keys = {f.name for f in dataclasses.fields(dtype) if f.init} + inputs = {k: v for k, v in args.items() if k in keys} + obj = dtype(**inputs) + outputs.append(obj) + return (*outputs,) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/auto_trainer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/auto_trainer.py new file mode 100644 index 000000000..d430fae85 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/auto_trainer.py @@ -0,0 +1,745 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import random +import time +from typing import Any, Dict, Optional, Union + +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.nn as nn +from paddle.distributed import fleet +from tqdm.auto import tqdm + +from paddlenlp.trainer import Trainer + +from ..utils.log import logger +from .argparser import strtobool +from .trainer import SCALER_NAME, SCHEDULER_NAME, TRAINER_STATE_NAME, TRAINING_ARGS_NAME +from .trainer_callback import TrainerState +from .trainer_utils import ( # set_hyrbid_parallel_seed, + PREFIX_CHECKPOINT_DIR, + ShardingOption, + TrainOutput, + _exec_mode_guard, + get_last_checkpoint, + has_length, + speed_metrics, +) +from .utils.helper import distributed_file, distributed_isfile # nested_truncate, + +try: + from ..quantization.quantization_linear import QuantizationLinear +except: + QuantizationLinear = None + +MODEL_NAME = "model" +OPTIMIZER_NAME = "optimizer" +DIST_CKPT_PATH = "dist_ckpt" +DIST_MODEL_PATH = "dist_model" +FREE_SVAE_LOAD_KEY_PATTERNS = ["learning_rate_", "gradient_merge_", "@GRAD@MERG", "eager_tmp"] + + +class AutoTrainer(Trainer): + def __init__(self, *args, **kwargs): + + if kwargs.get("args", None) is not None and kwargs["args"].to_static: + if kwargs.get("criterion", None) is None: + + def loss_func(loss, outputs): + return loss + + kwargs.update({"criterion": loss_func}) + + super().__init__(*args, **kwargs) + assert self.args.enable_auto_parallel + + self.global_mesh = fleet.auto.get_mesh() + self.comm_group_in_pp = fleet.get_hybrid_communicate_group().get_pipe_parallel_group() + + def _nested_gather(self, tensors): + """ + Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before + concatenating them to `gathered` + """ + with _exec_mode_guard("dynamic"): + if isinstance(tensors, paddle.Tensor): + tr_loss = tensors._local_value() if tensors.is_dist() else tensors + else: + tr_loss = paddle.to_tensor([tensors]) + + if self.args.pipeline_parallel_degree <= 1: + return super()._nested_gather(tr_loss) + + paddle.distributed.broadcast(tr_loss, src=self.comm_group_in_pp.ranks[-1], group=self.comm_group_in_pp) + + return super()._nested_gather(tr_loss) + + def _wrap_model(self, model, training=True): + return model + + def _get_meshes_for_loader(self): + def _get_mesh(pp_idx=0): + return self.global_mesh.get_mesh_with_dim("pp")[pp_idx] + + # Note(lizhiyu): If the values returned by `DataLoader` don't have the format `[images, labels]`, + # error may occurs here. 
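+        # The first mesh is meant for the model inputs (first pipeline stage); when pipeline
+        # parallelism is enabled, a second mesh is added for the labels on the last stage.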
+ meshes = [] + meshes.append(_get_mesh(0)) + if self.args.pipeline_parallel_degree > 1: + meshes.append(_get_mesh(self.args.pipeline_parallel_degree - 1)) + return meshes + + def _wrap_for_dist_loader(self, train_dataloader): + dist_loader = dist.shard_dataloader( + dataloader=train_dataloader, + meshes=self._get_meshes_for_loader(), + shard_dims="dp", + ) + return dist_loader + + def _wrap_for_auto(self, model, train_dataloader): + dist_loader = self._wrap_for_dist_loader(train_dataloader) + + if ShardingOption.SHARD_OP in self.args.sharding: + self.optimizer = dist.shard_optimizer(self.optimizer, dist.ShardingStage1()) + elif ShardingOption.SHARD_GRAD_OP in self.args.sharding: + self.optimizer = dist.shard_optimizer(self.optimizer, dist.ShardingStage2()) + elif ShardingOption.FULL_SHARD in self.args.sharding: + self.optimizer = dist.shard_optimizer(self.optimizer, dist.ShardingStage3()) + + if self.args.to_static: + unified_strategy = dist.Strategy() + unified_strategy._from_legacy_strategy(self.args.strategy) + model = dist.to_static(model, dist_loader, self.criterion, self.optimizer, strategy=unified_strategy) + + self.model_wrapped = model + return model, dist_loader + + def _wrap_amp_model(self, args, model): + logger.info("Using half precision") + self.amp_dtype = "float16" if self.args.fp16 else "bfloat16" + if self.args.fp16_opt_level == "O2": + paddle.amp.decorate( + models=model, + level=self.args.fp16_opt_level, + dtype=self.amp_dtype, + master_grad=self.args.amp_master_grad, + excluded_layers=QuantizationLinear, + ) + if args.to_static: + return + self.enable_autocast_context_manager = True + self.do_grad_scaling = True if self.args.fp16 else False + self.scaler = dist.shard_scaler(paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss)) + + def _get_item_from_loss(self, loss): + if isinstance(loss, paddle.Tensor): + if loss.is_dist(): + return loss._local_value().item() if loss._is_initialized() else 0.0 + else: + return loss.item() if loss._is_initialized() else 0.0 + else: + return loss + + def _split_batches_for_accumulation(self, inputs): + if self.args.gradient_accumulation_steps == 1: + return [inputs] + + if self.args.to_static and self.args.pipeline_parallel_degree > 1: + return [inputs] + + local_batches = [{} for i in range(self.args.gradient_accumulation_steps)] + assert isinstance(inputs, dict) + + def split_dtensor_by_axis(dtensor, axis): + mesh = dtensor.process_mesh + placements = [dist.Replicate() for _ in range(len(mesh.shape))] + replicate_value = dist.reshard(dtensor, mesh, placements) + local_datas = replicate_value.split(self.args.gradient_accumulation_steps, axis=0) + return local_datas + + for key, dtensors in inputs.items(): + if isinstance(dtensors, paddle.Tensor): + mesh, placements = dtensors.process_mesh, dtensors.placements + local_datas = split_dtensor_by_axis(dtensors, 0) + for index, data in enumerate(local_datas): + local_batches[index].update({key: dist.reshard(data, mesh, placements)}) + elif isinstance(dtensors, (list, tuple)): + if len(dtensors) == 0: + for i in range(self.args.gradient_accumulation_steps): + local_batches[i].update({key: []}) + else: + for dtensor in dtensors: + if isinstance(dtensor, paddle.Tensor): + mesh, placements = dtensor.process_mesh, dtensor.placements + local_datas = split_dtensor_by_axis(dtensor, 0) + for index, data in enumerate(local_datas): + if key in local_batches[index].keys(): + local_batches[index][key].append(dist.reshard(data, mesh, placements)) + else: + local_batches[index].update({key: 
[dist.reshard(data, mesh, placements)]}) + else: + raise ValueError(f"unsupported type: {type(dtensor)}") + else: + raise ValueError(f"unsupported type: {type(dtensors)}") + return local_batches + + def _inner_training_loop( + self, + args, + model, + train_dataloader, + len_dataloader, + max_steps, + num_train_epochs, + num_update_steps_per_epoch, + num_train_samples, + resume_from_checkpoint, + ignore_keys_for_eval, + ): + start_time = time.time() + self._globalstep_last_start_time = time.time() + self.state.epoch = 0 + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if ( + resume_from_checkpoint is not None + and distributed_isfile(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + and not self.args.ignore_load_lr_and_optim + ): + self.state = TrainerState.load_from_json( + distributed_file(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + ) + if self.args.world_size > 1: + global_step_list = [] + paddle.distributed.all_gather( + global_step_list, paddle.to_tensor([self.state.global_step], dtype="int64") + ) + assert ( + paddle.sum(paddle.stack(global_step_list) - global_step_list[0]) == 0 + ), f"Error, get different globel step, please check! step list: {[x.item() for x in global_step_list]}" + + epochs_trained = self.state.global_step // num_update_steps_per_epoch + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " + "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " + "flag to your launch command, but you will resume the training on data already seen by your model." 
+ ) + if self.is_local_process_zero() and not args.disable_tqdm: + steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) + steps_trained_progress_bar.set_description("Skipping the first batches") + + epoch_iterator = train_dataloader + # steps_in_epoch = len(epoch_iterator) + steps_in_epoch = ( + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps + ) + if len_dataloader is not None: + if self.args.gradient_accumulation_steps > len(epoch_iterator): + logger.warning( + f"changing accumulation step from `{self.args.gradient_accumulation_steps}` to `{len(epoch_iterator)}` to avoid, cross epoch accumulate" + ) + self.args.gradient_accumulation_steps = len(epoch_iterator) + + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + + self.state.max_steps = int(max_steps) + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + tr_loss = paddle.to_tensor(0.0) + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + + if self.args.device == "npu" and self.args.flatten_param_grads: + from .plugins.npu_plugin import npu_accelerate_plugin + + npu_accelerate_plugin(self.optimizer) + + model, dist_loader = self._wrap_for_auto(model, train_dataloader) + train_dataloader = dist_loader() + + if resume_from_checkpoint is not None: + self._load_from_checkpoint(resume_from_checkpoint) + + self.timers and self.timers("read-data").start() + + for epoch in range(epochs_trained, num_train_epochs): + + step_control = 0 # used in loop control, reset to 0 after every step + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + # read global-batch from dist_loader + for step, inputs in enumerate(train_dataloader): + self.timers and self.timers("read-data").stop() + os.environ["TRAINER_GLOBAL_STEP"] = str(self.state.global_step) + self.callback_handler.on_load_data_end(args, self.state, self.control, inputs=inputs) + + # Skip past any already trained steps if resuming training + # We use consumed_samples to reset the status + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + inputs_list = self._split_batches_for_accumulation(inputs) + + for inputs in inputs_list: + if step_control % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + self.timers and self.timers("forward-backward").start() + + tr_loss_step = self.training_step(model, inputs) + + with _exec_mode_guard("dynamic"): + tr_loss += tr_loss_step + + disable_accumulation = self.args.pipeline_parallel_degree > 1 and self.args.to_static + # disable_accumulation = self.args.to_static + + if (step_control + 1) % args.gradient_accumulation_steps == 0 or ( + # last step in epoch but step is always smaller than gradient_accumulation_steps + steps_in_epoch <= 
args.gradient_accumulation_steps + and (step + 1) == steps_in_epoch + or disable_accumulation + ): + + self.timers and self.timers("forward-backward").stop() + + self.timers and self.timers("optimizer-step").start() + + # Optimizer step + self.callback_handler.on_optimizer_begin( + args, self.state, self.control, scaler=self.scaler if self.do_grad_scaling else None + ) + + self.optimizer_step() + + self.timers and self.timers("optimizer-step").stop() + + self.callback_handler.on_optimizer_end( + args, self.state, self.control, scaler=self.scaler if self.do_grad_scaling else None + ) + + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, inputs=inputs) + self._print_timer() + step_control = 0 + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + step_control += 1 + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + + self.timers and self.timers("read-data").start() + + if step < 0: + logger.warning( + f"There seems to be not a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({self.state.max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, inputs=inputs) + + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info("\nTraining completed. \n") + + self._total_loss_scalar += self._get_item_from_loss(tr_loss) + train_loss = self._total_loss_scalar / self.state.global_step + + metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + return TrainOutput(self.state.global_step, train_loss, metrics) + + def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + total_batch_size_per_acc_step = self.args.per_device_train_batch_size * self.args.dataset_world_size + total_batch_size = total_batch_size_per_acc_step * self.args.gradient_accumulation_steps + + return paddle.io.BatchSampler( + dataset=self.train_dataset, + shuffle=True, + batch_size=total_batch_size, + drop_last=self.args.dataloader_drop_last, + ) + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + Subclass and override for custom behavior. 
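+
+        When a custom ``criterion`` is supplied, the labels are popped from ``inputs`` (``labels``,
+        ``start_positions``/``end_positions``, or the configured ``label_names``) and the criterion is
+        invoked as ``criterion(*outputs, *labels)``. A rough sketch of a compatible criterion (the names
+        below are illustrative, not part of this API):
+
+            def cross_entropy_criterion(logits, labels):
+                return paddle.nn.functional.cross_entropy(logits, labels)
+
+            trainer = AutoTrainer(model=model, args=training_args, criterion=cross_entropy_criterion)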
+ """ + if self.criterion is not None: + if "labels" in inputs: + labels = inputs.pop("labels") + elif "start_positions" in inputs and "end_positions" in inputs: + labels = (inputs.pop("start_positions"), inputs.pop("end_positions")) + elif self.args.label_names is not None: + labels = [] + for label in self.label_names: + labels.append(inputs.pop(label)) + labels = tuple(labels) + elif "generator_labels" in inputs: + labels = inputs["generator_labels"] + else: + labels = None + + outputs = model(**inputs) + + if self.criterion is not None: + + def to_list(value): + if value is None: + return value + if isinstance(value, (list, tuple)): + return list(value) + return [value] + + criterion_inputs = to_list(outputs) + criterion_labels = to_list(labels) + loss = self.criterion(*(criterion_inputs + criterion_labels)) + outputs = (loss, outputs) + + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs + if isinstance(outputs, dict): + loss = outputs["loss"] + elif isinstance(outputs, tuple): + loss = outputs[0] + else: + loss = outputs + + return (loss, outputs) if return_outputs else loss + + def dynamic_traning(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + with self.autocast_smart_context_manager(): + loss = self.compute_loss(model, inputs) + + if loss is not None and self.args.gradient_accumulation_steps > 1: + loss = loss / self.args.gradient_accumulation_steps + + if self.do_grad_scaling: + self.scaler.scale(loss).backward() + else: + loss.backward() + + return loss + + def static_traning(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + input_ids, labels = tuple(inputs.values()) + loss = model(input_ids, labels) + + if loss is not None and self.args.gradient_accumulation_steps > 1: + loss = loss / self.args.gradient_accumulation_steps + + return loss + + def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + model.train() + + inputs = self._prepare_inputs(inputs) + + if not self.args.to_static: + loss = self.dynamic_traning(model, inputs) + else: + loss = self.static_traning(model, inputs) + + if isinstance(loss, paddle.Tensor): + return loss.detach() if loss._is_initialized() else float(0.0) + elif isinstance(loss, np.ndarray): + return np.sum(loss) + elif loss is None: + return float(0.0) + else: + return float(loss) + + def optimizer_step(self): + if not self.args.to_static: + optimizer_was_run = True + if self.do_grad_scaling: + scale_before = paddle.assign(self.scaler._scale) + self.scaler.step(self.optimizer) + self.scaler.update() + scale_after = self.scaler._scale + # Compatible with paddlepaddle 2.6.0 using typo word. 
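+                # Older Paddle releases expose the misspelled attribute `_cache_founf_inf`, newer ones use
+                # `_cache_found_inf`; either way the flag is True when a non-finite gradient made the
+                # scaler skip this optimizer step, in which case the LR scheduler step is skipped and the
+                # loss-scale change is logged below.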
+ if hasattr(self.scaler, "_cache_founf_inf"): + optimizer_was_run = not self.scaler._cache_founf_inf + else: + optimizer_was_run = not self.scaler._cache_found_inf + if not optimizer_was_run: + scale_before_value = scale_before.cpu().numpy() + scale_after_value = scale_after.cpu().numpy() + logger.warning( + f"optimizer not run, scale_before: {scale_before_value[0]}, scale_after: {scale_after_value[0]}" + ) + else: + self.optimizer.step() + + if optimizer_was_run: + self.lr_scheduler.step() + + self.optimizer.clear_grad() + else: + # TODO: support optimizer_was_run in static mode + self.lr_scheduler.step() + + def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, **kwargs): + with _exec_mode_guard("dynamic"): + super()._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, **kwargs) + + def _save_model(self): + if not self.args.to_static: + return + with _exec_mode_guard("static"): + output_dir = f"{self.args.output_dir}/{DIST_MODEL_PATH}" + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model files into {output_dir}") + model_file = os.path.join(output_dir, "rank_" + str(paddle.distributed.get_rank()) + ".pd_dist_model") + if os.path.exists(model_file): + os.remove(model_file) + paddle.save(self.model_wrapped.dist_main_program("train"), model_file) + + def _save_checkpoint(self, model, metrics=None): + + # Save model checkpoint + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + run_dir = self.args.output_dir + output_dir = f"{run_dir}/{checkpoint_folder}" + + if self.args.should_save or self.args.should_save_model_state: + os.makedirs(output_dir, exist_ok=True) + + if self.args.should_save: + logger.info(f"Saving checkpoinit files into {output_dir}") + + if self.args.should_save_model_state: + if self.args.to_static: + opt_state_dict = { + key: value + for key, value in model.state_dict("opt").items() + if not any(keyword in key for keyword in FREE_SVAE_LOAD_KEY_PATTERNS) + } + state_dict = { + MODEL_NAME: model.state_dict("param"), + OPTIMIZER_NAME: opt_state_dict, + } + else: + optim_state_dict = self.optimizer.state_dict() + optim_state_dict.pop("LR_Scheduler", None) + opt_state_keys = ["_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0"] + for p_name, p in model.state_dict().items(): + if paddle.distributed.get_rank() not in p.process_mesh.process_ids: + var_name = p.name + for key in opt_state_keys: + if ( + var_name + key in optim_state_dict + and not optim_state_dict[var_name + key].is_dist() + ): + optim_state_dict.pop(var_name + key) + + state_dict = { + MODEL_NAME: model.state_dict(), + OPTIMIZER_NAME: optim_state_dict, + } + + self._save_ckpt_func(state_dict, os.path.join(output_dir, DIST_CKPT_PATH)) + logger.info(f"Model weights and optimizer states saved in {output_dir}/{DIST_CKPT_PATH}") + + # FIXME: maybe only save one copy + paddle.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + + if self.do_grad_scaling: + paddle.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) + + # Determine the new best metric / best model checkpoint + if metrics is not None and self.args.metric_for_best_model is not None: + metric_to_check = self.args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics[metric_to_check] + + operator = np.greater if self.args.greater_is_better else np.less + if ( + self.state.best_metric is None + or self.state.best_model_checkpoint is 
None + or operator(metric_value, self.state.best_metric) + ): + self.state.best_metric = metric_value + self.state.best_model_checkpoint = output_dir + + # Save the Trainer state + if self.args.should_save: + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + + # Save RNG state in non-distributed training + rng_states = { + "python": random.getstate(), + "numpy": np.random.get_state(), + "cuda": paddle.get_rng_state(), + "cpu": paddle.framework.core.default_cpu_generator().get_state(), + } + + if self.args.world_size > 1: + rng_states_list = [] + paddle.distributed.all_gather_object(rng_states_list, rng_states) + if self.args.should_save: + os.makedirs(output_dir, exist_ok=True) + paddle.save(rng_states_list, os.path.join(output_dir, f"rng_state_{self.args.world_size}.pth")) + else: + os.makedirs(output_dir, exist_ok=True) + paddle.save(rng_states, os.path.join(output_dir, "rng_state.pth")) + + if strtobool(os.getenv("FLAG_LLM_PDC", "False")): + # save checkpoint_done file to ensure checkpoint is complete + if self.args.should_save_model_state and self.args.should_save: + # For ckpt integrity + paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) + + def _save(self, output_dir: Optional[str] = None, state_dict=None, merge_tensor_parallel=False): + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + + if self.args.should_save: + if self.tokenizer is not None: + self.tokenizer.save_pretrained(output_dir) + # Good practice: save your training arguments together with the trained model + paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + + if self.args.should_save_model_state: + self._save_ckpt_func(self.model.state_dict(), os.path.join(output_dir, MODEL_NAME)) + logger.info(f"Model weights saved in {output_dir}/{MODEL_NAME}") + + def _load_from_checkpoint(self, resume_from_checkpoint=None): + + resume_from_checkpoint = None if not resume_from_checkpoint else resume_from_checkpoint + + # Load potential model checkpoint + if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: + resume_from_checkpoint = get_last_checkpoint(self.args.output_dir) + if resume_from_checkpoint is None: + raise ValueError(f"No valid checkpoint found in output directory ({self.args.output_dir})") + + if resume_from_checkpoint is not None: + + logger.info(f"Loading model from {resume_from_checkpoint} .") + + if not self.args.ignore_load_lr_and_optim: + with _exec_mode_guard("dynamic"): + if distributed_isfile(os.path.join(resume_from_checkpoint, SCHEDULER_NAME)): + self.lr_scheduler.set_state_dict( + paddle.load(distributed_file(os.path.join(resume_from_checkpoint, SCHEDULER_NAME))) + ) + else: + raise ValueError( + f"scheduler-file not found, scheduler:{os.path.join(resume_from_checkpoint, SCHEDULER_NAME)}" + ) + + if self.do_grad_scaling and distributed_isfile(os.path.join(resume_from_checkpoint, SCALER_NAME)): + self.scaler.load_state_dict( + paddle.load( + distributed_file(os.path.join(resume_from_checkpoint, SCALER_NAME)), return_numpy=True + ) + ) + + ckpt_path = os.path.join(resume_from_checkpoint, DIST_CKPT_PATH) + + if not os.path.isdir(ckpt_path): + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + + if self.args.to_static: + opt_state_dict = { + key: value + for key, value in self.model_wrapped.state_dict("opt").items() + if not any(keyword in key for keyword in 
FREE_SVAE_LOAD_KEY_PATTERNS) + } + state_dict = { + MODEL_NAME: self.model_wrapped.state_dict("param"), + OPTIMIZER_NAME: opt_state_dict, + } + else: + model_state_dict = self.model_wrapped.state_dict() + optim_state_dict = self.optimizer.state_dict() + optim_state_dict.pop("LR_Scheduler", None) + if len(optim_state_dict) == 0: + self.optimizer._create_accumulators( + paddle.base.framework.default_main_program().global_block(), self.optimizer._parameter_list + ) + optim_state_dict = self.optimizer.state_dict() + optim_state_dict.pop("LR_Scheduler", None) + + state_dict = { + MODEL_NAME: model_state_dict, + OPTIMIZER_NAME: optim_state_dict, + } + + self._load_ckpt_func(state_dict, ckpt_path) + + # release memory + del state_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/compression_args.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/compression_args.py new file mode 100644 index 000000000..d0ed1c7f1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/compression_args.py @@ -0,0 +1,225 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import types +from dataclasses import dataclass, field +from typing import List, Optional + +import paddle + +from ..utils.log import logger +from .training_args import TrainingArguments + +__all__ = [ + "CompressionArguments", +] + + +@dataclass +class CompressionArguments(TrainingArguments): + """ + CompressionArguments is the subset of the arguments we use in our example + scripts **which relate to the training loop itself**. + + Using [`PdArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) + arguments that can be specified on the command line. + """ + + do_compress: bool = field(default=False, metadata={"help": "Whether to run compression after training."}) + input_dtype: Optional[str] = field( + default="int64", + metadata={"help": "The data type of input tensor, it could be int32 or int64. Defaults to int64."}, + ) + # prune embeddings + prune_embeddings: bool = field(default=False, metadata={"help": "Whether to prune embeddings before finetuning."}) + onnx_format: Optional[bool] = field( + default=True, + metadata={"help": "Whether to export onnx format quantized model, and it defaults to True."}, + ) + strategy: Optional[str] = field( + default="dynabert+ptq", + metadata={ + "help": "Compression strategy. It supports 'dynabert+qat+embeddings'," + "'dynabert+qat', 'dynabert+ptq', 'dynabert+embeddings', 'dynabert', 'ptq' and 'qat' now." 
+ }, + ) + # dynabert + width_mult_list: Optional[List[str]] = field( + default=None, + metadata={"help": ("List of width multiplicator for pruning using DynaBERT strategy.")}, + ) + logging_steps: int = field(default=100, metadata={"help": "Log every X updates steps."}) + + save_steps: int = field(default=100, metadata={"help": "Save checkpoint every X updates steps."}) + + warmup_ratio: float = field( + default=0.1, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."} + ) + + # quant + weight_quantize_type: Optional[str] = field( + default="channel_wise_abs_max", + metadata={ + "help": "Quantization type for weights. Supports 'abs_max' and 'channel_wise_abs_max'. " + "This param only specifies the fake ops in saving quantized model, and " + "we save the scale obtained by post training quantization in fake ops. " + "Compared to 'abs_max' the model accuracy is usually higher when it is " + "'channel_wise_abs_max'." + }, + ) + activation_quantize_type: Optional[str] = field( + default=None, + metadata={ + "help": "Support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'. " + "In strategy 'ptq', it defaults to 'range_abs_max' and in strategy " + "'qat', it defaults to 'moving_average_abs_max'." + }, + ) + # ptq: + algo_list: Optional[List[str]] = field( + default=None, + metadata={ + "help": "Algorithm list for Post-Quantization, and it supports 'hist', 'KL', " + "'mse', 'avg', 'abs_max' and 'emd'.'KL' uses KL-divergenc method to get " + "the KL threshold for quantized activations and get the abs_max value " + "forquantized weights. 'abs_max' gets the abs max value for activations " + "and weights. 'min_max' gets the min and max value for quantized " + "activations and weights. 'avg' gets the average value among the max " + "values for activations. 'hist' gets the value of 'hist_percent' " + "quantile as the threshold. 'mse' gets the value which makes the " + "quantization mse loss minimal." + }, + ) + + batch_num_list: Optional[List[int]] = field( + default=None, + metadata={ + "help": "List of batch_num. 'batch_num' is the number of batchs for sampling. " + "the number of calibrate data is batch_size * batch_nums. " + "If batch_nums is None, use all data provided by data loader as calibrate data." + }, + ) + batch_size_list: Optional[List[int]] = field( + default=None, + metadata={"help": "List of batch_size. 'batch_size' is the batch of data loader."}, + ) + + round_type: Optional[str] = field( + default="round", + metadata={ + "help": "The method of converting the quantized weights value float->int. " + "Currently supports ['round', 'adaround'] methods. Default is `round`, " + "which is rounding nearest to the integer. 'adaround' is refer to " + "https://arxiv.org/abs/2004.10568." + }, + ) + bias_correction: Optional[bool] = field( + default=False, + metadata={ + "help": "If set to True, use the bias correction method of " + "https://arxiv.org/abs/1810.05723. Default is False." + }, + ) + input_infer_model_path: Optional[str] = field( + default=None, + metadata={ + "help": "If you have only inference model, quantization is also supported." + " The format is `dirname/file_prefix` or `file_prefix`. Default " + "is None." + }, + ) + # qat + use_pact: Optional[bool] = field( + default=True, + metadata={ + "help": "Whether to use PACT(Parameterized Clipping Activation for Quantized Neural Networks) " + "method in quantization aware training." 
+ }, + ) + moving_rate: Optional[float] = field( + default=0.9, + metadata={"help": "The decay coefficient of moving average. Defaults to 0.9."}, + ) + + def print_config(self, args=None, key=""): + """ + Prints all config values. + """ + + compression_arg_name = [ + "strategy", + "width_mult_list", + "batch_num_list", + "bias_correction", + "round_type", + "algo_list", + "batch_size_list", + "weight_quantize_type", + "activation_quantize_type", + "input_infer_model_path", + "activation_preprocess_type", + "weight_preprocess_type", + "moving_rate", + "use_pact", + "onnx_format", + "prune_embeddings", + "input_dtype", + ] + default_arg_dict = { + "width_mult_list": ["3/4"], + "batch_size_list": [4, 8, 16], + "algo_list": ["mse", "KL"], + "batch_num_list": [1], + } + logger.info("=" * 60) + if args is None: + args = self + key = "Compression" + + logger.info("{:^40}".format("{} Configuration Arguments".format(key))) + if key == "Compression": + logger.info( + "Compression Suggestions: `Strategy` supports 'dynabert+qat+embeddings', " + "'dynabert+qat', 'dynabert+ptq', 'dynabert+embeddings', " + "'dynabert' and 'ptq'. `input_dtype`, `prune_embeddings`, " + "and `onnx_format` are common needed. `width_mult_list` is needed in " + "`dynabert`, and `algo_list`, `batch_num_list`, `batch_size_list`," + " `round_type`, `bias_correction`, `weight_quantize_type`, " + "`input_infer_model_path` are needed in 'ptq'. `activation_preprocess_type'`, " + "'weight_preprocess_type', 'moving_rate', 'weight_quantize_type', " + "and 'activation_quantize_type' are needed in 'qat'." + ) + logger.info("{:30}:{}".format("paddle commit id", paddle.version.commit)) + + for arg in dir(args): + if key == "Compression" and arg not in compression_arg_name: + continue + if arg[:2] != "__": # don't print double underscore methods + v = getattr(args, arg) + if v is None and arg in default_arg_dict: + v = default_arg_dict[arg] + setattr(args, arg, v) + elif v is None and arg == "activation_quantize_type": + if key == "Compression" and "ptq" in args.strategy: + setattr(args, arg, "range_abs_max") + elif key == "Compression" and "qat" in args.strategy: + setattr(args, arg, "moving_average_abs_max") + + if not isinstance(v, types.MethodType): + logger.info("{:30}:{}".format(arg, v)) + + logger.info("") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/integrations.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/integrations.py new file mode 100644 index 000000000..210d5322b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/integrations.py @@ -0,0 +1,432 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers/integrations.py + +import importlib +import json +import numbers +import os +import tempfile +from pathlib import Path + +from ..peft import LoRAModel, PrefixModelForCausalLM, VeRAModel +from ..transformers import PretrainedModel +from ..utils.log import logger +from .trainer_callback import TrainerCallback + + +def is_visualdl_available(): + return importlib.util.find_spec("visualdl") is not None + + +def is_tensorboardX_available(): + return importlib.util.find_spec("tensorboardX") is not None + + +def is_wandb_available(): + if os.getenv("WANDB_DISABLED", "").upper() in {"1", "ON", "YES", "TRUE"}: + return False + return importlib.util.find_spec("wandb") is not None + + +def is_ray_available(): + return importlib.util.find_spec("ray.air") is not None + + +def get_available_reporting_integrations(): + integrations = [] + if is_visualdl_available(): + integrations.append("visualdl") + if is_wandb_available(): + integrations.append("wandb") + if is_tensorboardX_available(): + integrations.append("tensorboard") + + return integrations + + +def rewrite_logs(d): + new_d = {} + eval_prefix = "eval_" + eval_prefix_len = len(eval_prefix) + test_prefix = "test_" + test_prefix_len = len(test_prefix) + for k, v in d.items(): + if k.startswith(eval_prefix): + new_d["eval/" + k[eval_prefix_len:]] = v + elif k.startswith(test_prefix): + new_d["test/" + k[test_prefix_len:]] = v + else: + new_d["train/" + k] = v + return new_d + + +class VisualDLCallback(TrainerCallback): + """ + A [`TrainerCallback`] that sends the logs to [VisualDL](https://www.paddlepaddle.org.cn/paddle/visualdl). + Args: + vdl_writer (`LogWriter`, *optional*): + The writer to use. Will instantiate one if not set. + """ + + def __init__(self, vdl_writer=None): + has_visualdl = is_visualdl_available() + if not has_visualdl: + raise RuntimeError("VisualDLCallback requires visualdl to be installed. 
Please install visualdl.") + if has_visualdl: + try: + from visualdl import LogWriter + + self._LogWriter = LogWriter + except ImportError: + self._LogWriter = None + else: + self._LogWriter = None + self.vdl_writer = vdl_writer + + def _init_summary_writer(self, args, log_dir=None): + log_dir = log_dir or args.logging_dir + if self._LogWriter is not None: + self.vdl_writer = self._LogWriter(logdir=log_dir) + + def on_train_begin(self, args, state, control, **kwargs): + if not state.is_world_process_zero: + return + + log_dir = None + + if self.vdl_writer is None: + self._init_summary_writer(args, log_dir) + + if self.vdl_writer is not None: + self.vdl_writer.add_text("args", args.to_json_string()) + if "model" in kwargs and logger.logger.level < 20: + model = kwargs["model"] + if ( + isinstance(model, LoRAModel) + or isinstance(model, PrefixModelForCausalLM) + or isinstance(model, VeRAModel) + ): + model = kwargs["model"].model + if isinstance(model, PretrainedModel) and model.constructed_from_pretrained_config(): + model.config.architectures = [model.__class__.__name__] + self.vdl_writer.add_text("model_config", str(model.config)) + elif hasattr(model, "init_config") and model.init_config is not None: + model_config_json = json.dumps(model.get_model_config(), ensure_ascii=False, indent=2) + self.vdl_writer.add_text("model_config", model_config_json) + + if hasattr(self.vdl_writer, "add_hparams"): + self.vdl_writer.add_hparams(args.to_sanitized_dict(), metrics_list=[]) + + def on_log(self, args, state, control, logs=None, **kwargs): + if not state.is_world_process_zero: + return + + if self.vdl_writer is None: + return + + if self.vdl_writer is not None: + logs = rewrite_logs(logs) + for k, v in logs.items(): + if isinstance(v, (int, float)): + self.vdl_writer.add_scalar(k, v, state.global_step) + else: + logger.warning( + "Trainer is attempting to log a value of " + f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' + "This invocation of VisualDL's writer.add_scalar() " + "is incorrect so we dropped this attribute." + ) + self.vdl_writer.flush() + + def on_train_end(self, args, state, control, **kwargs): + if self.vdl_writer: + self.vdl_writer.close() + self.vdl_writer = None + + +class TensorBoardCallback(TrainerCallback): + """ + A [`TrainerCallback`] that sends the logs to [TensorBoard](https://www.tensorflow.org/tensorboard). + + Args: + tb_writer (`SummaryWriter`, *optional*): + The writer to use. Will instantiate one if not set. 
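+
+    The callback is selected when ``"tensorboard"`` appears in ``report_to`` and ``tensorboardX`` is
+    importable; event files are written under ``args.logging_dir``. A minimal, illustrative configuration
+    (argument names assumed from the surrounding code):
+
+        TrainingArguments(output_dir="out", report_to=["tensorboard"], logging_dir="out/runs")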
+ """ + + def __init__(self, tb_writer=None): + has_tensorboard = is_tensorboardX_available() + if not has_tensorboard: + raise RuntimeError("TensorBoardCallback requires tensorboardX to be installed") + + if has_tensorboard: + try: + from tensorboardX import SummaryWriter + + self._SummaryWriter = SummaryWriter + except ImportError: + self._SummaryWriter = None + else: + self._SummaryWriter = None + self.tb_writer = tb_writer + + def _init_summary_writer(self, args, log_dir=None): + log_dir = log_dir or args.logging_dir + if self._SummaryWriter is not None: + self.tb_writer = self._SummaryWriter(log_dir=log_dir) + + def on_train_begin(self, args, state, control, **kwargs): + if not state.is_world_process_zero: + return + + log_dir = None + + if self.tb_writer is None: + self._init_summary_writer(args, log_dir) + + if self.tb_writer is not None: + self.tb_writer.add_text("args", args.to_json_string()) + if "model" in kwargs: + model = kwargs["model"] + if hasattr(model, "config") and model.config is not None: + model_config_json = model.config.to_json_string() + self.tb_writer.add_text("model_config", model_config_json) + + def on_log(self, args, state, control, logs=None, **kwargs): + if not state.is_world_process_zero: + return + + if self.tb_writer is None: + self._init_summary_writer(args) + + if self.tb_writer is not None: + logs = rewrite_logs(logs) + for k, v in logs.items(): + if isinstance(v, (int, float)): + self.tb_writer.add_scalar(k, v, state.global_step) + else: + logger.warning( + "Trainer is attempting to log a value of " + f'"{v}" of type {type(v)} for key "{k}" as a scalar. ' + "This invocation of Tensorboard's writer.add_scalar() " + "is incorrect so we dropped this attribute." + ) + self.tb_writer.flush() + + def on_train_end(self, args, state, control, **kwargs): + if self.tb_writer: + self.tb_writer.close() + self.tb_writer = None + + +class WandbCallback(TrainerCallback): + """ + A [`TrainerCallback`] that logs metrics, media, model checkpoints to [Weight and Biases](https://www.wandb.com/). + """ + + def __init__(self): + has_wandb = is_wandb_available() + if not has_wandb: + raise RuntimeError("WandbCallback requires wandb to be installed. Run `pip install wandb`.") + if has_wandb: + import wandb + + self._wandb = wandb + self._initialized = False + # log model + self._log_model = os.getenv("WANDB_LOG_MODEL", "false").lower() + + def setup(self, args, state, model, **kwargs): + """ + Setup the optional Weights & Biases (*wandb*) integration. + + One can subclass and override this method to customize the setup if needed. + variables: + Environment: + - **WANDB_LOG_MODEL** (`str`, *optional*, defaults to `"false"`): + Whether to log model and checkpoints during training. Can be `"end"`, `"checkpoint"` or `"false"`. If set + to `"end"`, the model will be uploaded at the end of training. If set to `"checkpoint"`, the checkpoint + will be uploaded every `args.save_steps` . If set to `"false"`, the model will not be uploaded. Use along + with [`TrainingArguments.load_best_model_at_end`] to upload best model. + - **WANDB_WATCH** (`str`, *optional* defaults to `"false"`): + Can be `"gradients"`, `"all"`, `"parameters"`, or `"false"`. Set to `"all"` to log gradients and + parameters. + - **WANDB_PROJECT** (`str`, *optional*, defaults to `"PaddleNLP"`): + Set this to a custom string to store results in a different project. + - **WANDB_DISABLED** (`bool`, *optional*, defaults to `False`): + Whether to disable wandb entirely. Set `WANDB_DISABLED=true` to disable. 
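+
+        For example (illustrative, with a hypothetical launch script), logging to a custom W&B project
+        without uploading checkpoints:
+
+            WANDB_PROJECT=my-project WANDB_LOG_MODEL=false python train.py --report_to wandb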
+ """ + if self._wandb is None: + return + + # Check if a Weights & Biases (wandb) API key is provided in the training arguments + if args.wandb_api_key: + if self._wandb.api.api_key: + logger.warning( + "A Weights & Biases API key is already configured in the environment. " + "However, the training argument 'wandb_api_key' will take precedence. " + ) + self._wandb.login(key=args.wandb_api_key) + + self._initialized = True + + if state.is_world_process_zero: + logger.info( + 'Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"' + ) + combined_dict = {**args.to_dict()} + + if hasattr(model, "config") and model.config is not None: + model_config = model.config.to_dict() + combined_dict = {**model_config, **combined_dict} + trial_name = state.trial_name + init_args = {} + if trial_name is not None: + init_args["name"] = trial_name + init_args["group"] = args.run_name + else: + if not (args.run_name is None or args.run_name == args.output_dir): + init_args["name"] = args.run_name + init_args["dir"] = args.logging_dir + if self._wandb.run is None: + self._wandb.init( + project=os.getenv("WANDB_PROJECT", "PaddleNLP"), + **init_args, + ) + # add config parameters (run may have been created manually) + self._wandb.config.update(combined_dict, allow_val_change=True) + + # define default x-axis (for latest wandb versions) + if getattr(self._wandb, "define_metric", None): + self._wandb.define_metric("train/global_step") + self._wandb.define_metric("*", step_metric="train/global_step", step_sync=True) + + # keep track of model topology and gradients + _watch_model = os.getenv("WANDB_WATCH", "false") + if _watch_model in ("all", "parameters", "gradients"): + self._wandb.watch(model, log=_watch_model, log_freq=max(100, state.logging_steps)) + self._wandb.run._label(code="transformers_trainer") + + def on_train_begin(self, args, state, control, model=None, **kwargs): + if self._wandb is None: + return + if not self._initialized: + self.setup(args, state, model, **kwargs) + + def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwargs): + if self._wandb is None: + return + if self._log_model in ("end", "checkpoint") and self._initialized and state.is_world_process_zero: + from ..trainer import Trainer + + fake_trainer = Trainer(args=args, model=model, tokenizer=tokenizer) + with tempfile.TemporaryDirectory() as temp_dir: + fake_trainer.save_model(temp_dir) + metadata = ( + { + k: v + for k, v in dict(self._wandb.summary).items() + if isinstance(v, numbers.Number) and not k.startswith("_") + } + if not args.load_best_model_at_end + else { + f"eval/{args.metric_for_best_model}": state.best_metric, + "train/total_floss": state.total_flos, + } + ) + logger.info("Logging model artifacts. 
...") + + model_name = ( + f"model-{self._wandb.run.id}" + if (args.run_name is None or args.run_name == args.output_dir) + else f"model-{self._wandb.run.name}" + ) + artifact = self._wandb.Artifact(name=model_name, type="model", metadata=metadata) + for f in Path(temp_dir).glob("*"): + if f.is_file(): + with artifact.new_file(f.name, mode="wb") as fa: + fa.write(f.read_bytes()) + + self._wandb.run.log_artifact(artifact) + + def on_log(self, args, state, control, model=None, logs=None, **kwargs): + if self._wandb is None: + return + if not self._initialized: + self.setup(args, state, model) + if state.is_world_process_zero: + logs = rewrite_logs(logs) + self._wandb.log({**logs, "train/global_step": state.global_step}) + + def on_save(self, args, state, control, **kwargs): + if self._log_model == "checkpoint" and self._initialized and state.is_world_process_zero: + checkpoint_metadata = { + k: v + for k, v in dict(self._wandb.summary).items() + if isinstance(v, numbers.Number) and not k.startswith("_") + } + ckpt_dir = f"checkpoint-{state.global_step}" + artifact_path = os.path.join(args.output_dir, ckpt_dir) + logger.info(f"Logging checkpoint artifacts in {ckpt_dir}. ...") + checkpoint_name = ( + f"checkpoint-{self._wandb.run.id}" + if (args.run_name is None or args.run_name == args.output_dir) + else f"checkpoint-{self._wandb.run.name}" + ) + artifact = self._wandb.Artifact(name=checkpoint_name, type="model", metadata=checkpoint_metadata) + artifact.add_dir(artifact_path) + self._wandb.log_artifact(artifact, aliases=[f"checkpoint-{state.global_step}"]) + + +class AutoNLPCallback(TrainerCallback): + """ + A [`TrainerCallback`] that sends the logs to [`Ray Tune`] for [`AutoNLP`] + """ + + def __init__(self): + if not is_ray_available(): + raise RuntimeError( + "AutoNLPCallback requires extra dependencies to be installed. Please install paddlenlp with 'pip install paddlenlp[autonlp]'." + ) + self.session = importlib.import_module("ray.air.session") + self.tune = importlib.import_module("ray.tune") + + # report session metrics to Ray to track trial progress + def on_evaluate(self, args, state, control, **kwargs): + if not state.is_world_process_zero: + return + + metrics = kwargs.get("metrics", None) + if self.tune.is_session_enabled() and metrics is not None and isinstance(metrics, dict): + self.session.report(metrics) + + +INTEGRATION_TO_CALLBACK = { + "visualdl": VisualDLCallback, + "autonlp": AutoNLPCallback, + "wandb": WandbCallback, + "tensorboard": TensorBoardCallback, +} + + +def get_reporting_integration_callbacks(report_to): + for integration in report_to: + if integration not in INTEGRATION_TO_CALLBACK: + raise ValueError( + f"{integration} is not supported, only {', '.join(INTEGRATION_TO_CALLBACK.keys())} are supported." + ) + return [INTEGRATION_TO_CALLBACK[integration] for integration in report_to] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/__init__.py new file mode 100644 index 000000000..b5ab11090 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you smay not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/npu_plugin.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/npu_plugin.py new file mode 100644 index 000000000..469e722a0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/npu_plugin.py @@ -0,0 +1,127 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import types + +import numpy as np +import paddle +from paddle.common_ops_import import LayerHelper + +from ...utils.log import logger + + +def npu_accelerate_plugin(optimizer): + """npu_accelerate_plugin uses the flatten_param_grads method to speed up the performance of the model on NPU devices. + flatten_param_grads method will be added to `step` function of optimizer. + + Args: + optimizer (`paddle.optimizer.Optimizer`): + The Optimizer whose `step` method will be modified. + """ + optimizer.step = types.MethodType(_optimizer_step_with_flatten_param_grads, optimizer) + + +def _optimizer_step_with_flatten_param_grads(optimizer): + if not isinstance(optimizer._param_groups[0], dict): + params_grads = [] + for param in optimizer._param_groups: + if param.stop_gradient: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + # currently, only support ClipGradByGlobalNorm and without regularization. + if isinstance(params_grads, list) and optimizer.regularization is None: + if optimizer._grad_clip is None or isinstance(optimizer._grad_clip, paddle.nn.ClipGradByGlobalNorm): + params_grads = _flatten_param_grads(optimizer, params_grads) + + optimizer._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=0, + ) + else: + raise RuntimeError("flatten_param_grads is not supported when _param_groups[0] is dict.") + + +def _flatten_param_grads(optimizer, params_grads): + optimizer.helper = LayerHelper(optimizer.__class__.__name__) + need_flatten_params = [] + need_flatten_grads = [] + for p, g in params_grads: + if g is None: + continue + g.persistable = True + if getattr(p, "need_clip", True) is False or getattr(p, "regularizer", None) is not None: + logger.warning( + f"flatten_param_grads=True will be discarded since paramter {p.name}'s need_clip is False or " + "the regularizer is set." 
+ ) + return params_grads + + need_flatten_params.append(p) + need_flatten_grads.append(g) + + shape = [np.prod(p.shape) for p in need_flatten_params] + + flatten_param = optimizer.helper.create_global_variable( + name="flatten_param", + persistable=True, + dtype=need_flatten_params[0].dtype, + shape=[np.sum(shape)], + belong_to_optimizer=True, + ) + + flatten_grad = optimizer.helper.create_global_variable( + name="flatten_grad", + persistable=True, + dtype=need_flatten_grads[0].dtype, + shape=[np.sum(shape)], + belong_to_optimizer=True, + ) + + flatten_param.stop_gradient = False + # In the final state of the dynamic graph, the `coalesce_tensor` op + # does not support passing the output as an input into the op in + # temporary, so _legacy_C_ops is temporarily used here. + # `use_align` is set to false, which is different from the behavior + # under static graphs. `use_align` can be set to true after calling + # the coalesce_tensor op of the final state (_C_ops). + paddle._legacy_C_ops.coalesce_tensor( + need_flatten_params, + need_flatten_params, + flatten_param, + "copy_data", + True, + "use_align", + False, + "dtype", + need_flatten_params[0].dtype, + ) + + paddle._legacy_C_ops.coalesce_tensor( + need_flatten_grads, + need_flatten_grads, + flatten_grad, + "copy_data", + True, + "use_align", + False, + "dtype", + need_flatten_grads[0].dtype, + ) + return [(flatten_param, flatten_grad)] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/shared_memory_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/shared_memory_utils.py new file mode 100644 index 000000000..f27f6048e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/shared_memory_utils.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Shared Memory Utils""" + +from dataclasses import dataclass +from typing import List, Mapping, Tuple + +import numpy as np +import paddle + +from paddlenlp.transformers.utils import device_guard + + +@dataclass +class TensorMeta: + shape: Tuple[int] = None # type: ignore + dtype: paddle.dtype = None # type: ignore + element_size: int = 0 + numel: int = 0 + offset: int = 0 + + +dtype_mapping = { + paddle.float32: np.float32, + paddle.float64: np.float64, + paddle.int32: np.int32, + paddle.int64: np.int64, + paddle.uint8: np.uint8, + paddle.bool: np.bool_, + paddle.float16: np.float16, + paddle.bfloat16: np.uint16, + paddle.complex64: np.complex64, + paddle.complex128: np.complex128, +} + + +def _write_shared_memory(value: paddle.Tensor, meta: TensorMeta, buffer): + """ + Write a CPU tensor into the shared memory. 
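+
+    The destination is viewed through ``np.frombuffer(buffer, dtype, count=numel, offset=meta.offset)``
+    and wrapped as a zero-copy CPU ``paddle.Tensor`` with the source shape, so ``copy_`` writes the values
+    straight into the shared-memory block without an extra host allocation.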
+ """ + if value.numel() == 0: + return + shm_numpy = np.frombuffer( + buffer, dtype=dtype_mapping[value.dtype], count=int(value.numel()), offset=int(meta.offset) + ) + with device_guard("cpu"): + shm_tensor = paddle.Tensor(shm_numpy, zero_copy=True).reshape(value.shape) + shm_tensor.copy_(value, False) + + +def _traverse_copy_to_shm(value, meta, buffer): + if isinstance(value, Mapping): + for k, v in value.items(): + if isinstance(v, (Mapping, List)): + m = meta[k] + _traverse_copy_to_shm(v, m, buffer) + elif paddle.is_tensor(v): + m = meta[k] + _write_shared_memory(v, m, buffer) + else: + meta[k] = v + elif isinstance(value, List): + for i, v in enumerate(value): + if isinstance(v, (Mapping, List)): + m = meta[i] + _traverse_copy_to_shm(v, m, buffer) + elif paddle.is_tensor(v): + m = meta[i] + _write_shared_memory(v, m, buffer) + else: + meta[i] = v + + +def _read_ndarray_from_buf(value, shm_tensor_buffer): + """ + Read a numpy array from the buffer of shared memory. + """ + if isinstance(value, TensorMeta): + if value.numel == 0: + return np.array([], dtype=dtype_mapping[value.dtype]) + else: + shm_numpy = np.frombuffer( + buffer=shm_tensor_buffer.buf, + dtype=dtype_mapping[value.dtype], + offset=value.offset, + count=value.numel, + ).reshape(value.shape) + return shm_numpy + else: + return value + + +def _read_state_dict_from_shm(meta_dict, tensor_shm): + state_dict = _traverse_state_dict( + meta_dict, + lambda x: _read_ndarray_from_buf(x, tensor_shm), + ) + return state_dict + + +def _traverse_state_dict(value, visitor): + """ + Invoke ``visitor`` for each value recursively in ``state_dict``. + """ + if isinstance(value, Mapping): + temp_dict = {} + for k, v in value.items(): + temp_dict[k] = _traverse_state_dict(v, visitor) + return temp_dict + elif isinstance(value, List): + temp_list = [] + for _, v in enumerate(value): + temp_list.append(_traverse_state_dict(v, visitor)) + return temp_list + else: + return visitor(value) + + +def create_meta_dict(state_dict): + buffer_size = 0 + + def _create_tensor_meta(value: paddle.Tensor): + nonlocal buffer_size + if not paddle.is_tensor(value): + return value + meta = TensorMeta( + shape=tuple(value.shape), # type: ignore + dtype=value.dtype, + element_size=value.element_size(), + numel=int(value.numel()), + offset=int(buffer_size), + ) + buffer_size += value.numel() * value.element_size() + return meta + + meta_dict = _traverse_state_dict(state_dict, _create_tensor_meta) + return meta_dict, buffer_size diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/timer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/timer.py new file mode 100644 index 000000000..50bc61d96 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/timer.py @@ -0,0 +1,153 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
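+
+# Lightweight wall-clock timers used by the Trainer to report per-stage costs such as "read-data",
+# "forward-backward" and "optimizer-step" in the training logs.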
+ +import time + +import paddle + +from paddlenlp.utils.log import logger + + +class _Timer: + """Profile Timer for recording time taken by forward/ bacward/ reduce/ step.""" + + def __init__(self, name): + self.name = name + self.elapsed_ = 0.0 + self.started_ = False + self.start_time = time.time() + + def start(self): + """Start the timer.""" + assert not self.started_, f"{self.name} timer has already started" + if "cpu" not in paddle.device.get_device(): + paddle.device.synchronize() + self.start_time = time.time() + self.started_ = True + + def stop(self): + """Stop the timers.""" + assert self.started_, f"{self.name} timer is not started." + if "cpu" not in paddle.device.get_device(): + paddle.device.synchronize() + self.elapsed_ += time.time() - self.start_time + self.started_ = False + + def reset(self): + """Reset timer.""" + self.elapsed_ = 0.0 + self.started_ = False + + def elapsed(self, reset=True): + """Calculate the elapsed time.""" + started_ = self.started_ + # If the timing in progress, end it first. + if self.started_: + self.stop() + # Get the elapsed time. + elapsed_ = self.elapsed_ + # Reset the elapsed time + if reset: + self.reset() + # If timing was in progress, set it back. + if started_: + self.start() + return elapsed_ + + +class RuntimeTimer: + """A timer that can be dynamically adjusted during runtime.""" + + def __init__(self, name): + self.timer = _Timer(name) + + def start(self, name): + """Start the RuntimeTimer.""" + self.timer.name = name + self.timer.start() + + def stop(self): + """Stop the RuntimeTimer.""" + self.timer.stop() + + def log(self): + """Log, stop and reset the RuntimeTimer.""" + runtime = self.timer.elapsed(reset=True) + if self.timer.started_ is True: + self.timer.stop() + self.timer.reset() + + string = "[timelog] {}: {:.2f}s ({}) ".format(self.timer.name, runtime, time.strftime("%Y-%m-%d %H:%M:%S")) + return string + + +class Timers: + """Group of timers.""" + + def __init__(self): + self.timers = {} + + def __call__(self, name): + if name not in self.timers: + self.timers[name] = _Timer(name) + return self.timers[name] + + def write(self, names, writer, iteration, normalizer=1.0, reset=True): + """Write timers to a tensorboard writer""" + assert normalizer > 0.0 + for name in names: + value = self.timers[name].elapsed(reset=reset) / normalizer + writer.add_scalar("timers/" + name, value, iteration) + + def log(self, names, normalizer=1.0, reset=True): + """Log a group of timers.""" + assert normalizer > 0.0 + # string = "time (ms) / rate" + string = "time (ms)" + names = sorted(list(names)) + + time_dict = {} + for name in names: + time_dict[name] = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer + + # total_time = sum(list(time_dict.values())) + # string += " | total_time : {:.2f} ".format(total_time) + time_dict = sorted(time_dict.items(), key=lambda x: x[1], reverse=True) + + for time_tuple in time_dict: + name, value = time_tuple + # string += " | {} : {:.2f} ({:.2f}%) ".format(name, value, value * 100.0 / total_time) + string += " | {} : {:.2f}".format(name, value) + return string + + +_GLOBAL_TIMERS = None + + +def get_timers(): + global _GLOBAL_TIMERS + return _GLOBAL_TIMERS + + +def set_timers(): + global _GLOBAL_TIMERS + logger.info("enable PaddleNLP timer") + _GLOBAL_TIMERS = Timers() + + +def disable_timers(): + global _GLOBAL_TIMERS + logger.info("disable PaddleNLP timer") + _GLOBAL_TIMERS = None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/unified_checkpoint.py 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/unified_checkpoint.py new file mode 100644 index 000000000..56183485c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/plugins/unified_checkpoint.py @@ -0,0 +1,2349 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import gc +import json +import multiprocessing +import os +import sys +import time +from multiprocessing import shared_memory + +import numpy as np +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet +from tqdm.auto import tqdm + +from paddlenlp.peft import LoRAModel, PrefixModelForCausalLM +from paddlenlp.trainer.argparser import strtobool +from paddlenlp.trainer.trainer_utils import ExplicitEnum +from paddlenlp.trainer.utils.helper import distributed_file, distributed_isfile +from paddlenlp.transformers.model_utils import ( + PretrainedModel, + _add_variant, + _load_state_dict_into_model, + faster_set_state_dict, + get_parameter_dtype, + load_state_dict, + unwrap_model, +) +from paddlenlp.transformers.utils import ( + device_guard, + dtype_byte_size, + get_checkpoint_shard_files, + is_safetensors_available, +) +from paddlenlp.utils.distributed import distributed_allgather, distributed_gather +from paddlenlp.utils.env import ( + LORA_WEIGHTS_NAME, + PADDLE_MASTER_WEIGHTS_INDEX_NAME, + PADDLE_MASTER_WEIGHTS_NAME, + PADDLE_OPTIMIZER_INDEX_NAME, + PADDLE_OPTIMIZER_NAME, + PADDLE_PEFT_WEIGHTS_INDEX_NAME, + PADDLE_WEIGHTS_INDEX_NAME, + PADDLE_WEIGHTS_NAME, + PAST_KEY_VALUES_FILE_NAME, + PREFIX_WEIGHTS_NAME, + SAFE_MASTER_WEIGHTS_INDEX_NAME, + SAFE_MASTER_WEIGHTS_NAME, + SAFE_OPTIMIZER_INDEX_NAME, + SAFE_OPTIMIZER_NAME, + SAFE_PEFT_WEIGHTS_INDEX_NAME, + SAFE_PEFT_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, +) +from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import nested_copy, nested_copy_place +from paddlenlp.utils.tools import get_env_device + +if is_safetensors_available(): + from safetensors.numpy import save_file as safe_save_file + + if sys.platform.startswith("win"): + from safetensors import safe_open + from safetensors.numpy import load_file + else: + from paddlenlp.utils.safetensors import fast_safe_open as safe_open + from paddlenlp.utils.safetensors import fast_load_file as load_file + +from .shared_memory_utils import ( + _read_state_dict_from_shm, + _traverse_copy_to_shm, + create_meta_dict, +) + +FP32_MASTER = "fp32_master_0" +optimizer_scalar_name = [ + "beta1_pow_acc_0", + "beta2_pow_acc_0", +] +optimizer_non_scaler_name = [ + "moment1_0", + "moment2_0", + "velocity_0", +] # to be added + + +DEST_PLACE = paddle.CPUPlace() +if paddle.device.is_compiled_with_cuda(): + DEST_PLACE = paddle.CUDAPinnedPlace() + + +class UnifiedCheckpointOption(ExplicitEnum): + """ + "- skip_save_model_weight: do not save model weights when the masters weight exist\n" + "- master_weight_compatible: 1. 
if the master weights exist, only load when needed\n" + " 2. if master weights does not exist, convert model weights to master weights when needed\n" + "- async_save: enable asynchronous saving checkpoints to disk\n" + "- enable_all_options: enable all optimization configurations\n" + """ + + SKIP_SAVE_MODEL_WEIGHT = "skip_save_model_weight" + MASTER_WEIGHT_COMPATIBLE = "master_weight_compatible" + ASYNC_SAVE = "async_save" + IGNORE_MERGE_OPTIMIZER = "ignore_merge_optimizer" + + +class UnifiedCheckpointHandler: + def __init__(self, args): + self.args = args + self.global_rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1 else -1 + + # Mainly for asynchronous saving. + self._shm_model_weight = None + self._shm_master_weight = None + self._shm_optimizer_weight = None + self._meta_dict_model = None + self._meta_dict_master_weight = None + self._meta_dict_optim = None + self._process_model_weight = None + self._process_master_weight = None + self._process_optimizer_weight = None + self._lock = None + self._shared_save_path = None + self._shared_save_model_flag = None + self._shared_save_master_weight_flag = None + self._shared_save_optimizer_flag = None + + if "async_save" in self.args.unified_checkpoint_config: + self._lock = multiprocessing.Lock() + self._shared_save_model_path = multiprocessing.Array("c", 100000) + self._shared_save_master_weight_path = multiprocessing.Array("c", 100000) + self._shared_save_optimizer_path = multiprocessing.Array("c", 100000) + self._shared_save_model_flag = multiprocessing.Array("i", 1) + self._shared_save_master_weight_flag = multiprocessing.Array("i", 1) + self._shared_save_optimizer_flag = multiprocessing.Array("i", 1) + + def _file_save_async_or_sync(self, state_dict, path, is_sync=True, state_dict_type="model_weight"): + if is_sync: + for k in list(state_dict.keys()): + if isinstance(state_dict[k], paddle.Tensor): + state_dict[k] = state_dict.pop(k).cpu().numpy() + safe_save_file(state_dict, path, metadata={"format": "np"}) + else: + if state_dict_type == "model_weight": + if self._shm_model_weight is None: + self._meta_dict_model, buffer_size = create_meta_dict(state_dict) + self._shm_model_weight = shared_memory.SharedMemory(create=True, size=buffer_size) + shm_state_dict = self._shm_model_weight + meta_dict = self._meta_dict_model + shared_save_flag = self._shared_save_model_flag + shared_save_path = self._shared_save_model_path + if self._process_model_weight is None: + self._process_model_weight = multiprocessing.Process( + target=self._save_file_async_in_process, + args=( + meta_dict, + self._shm_model_weight.name, + self._shared_save_model_flag, + self._shared_save_model_path, + self._lock, + state_dict_type, + self.global_rank, + ), + ) + self._process_model_weight.start() + elif state_dict_type == "master_weight": + if self._shm_master_weight is None: + self._meta_dict_master_weight, buffer_size = create_meta_dict(state_dict) + self._shm_master_weight = shared_memory.SharedMemory(create=True, size=buffer_size) + shm_state_dict = self._shm_master_weight + meta_dict = self._meta_dict_master_weight + shared_save_flag = self._shared_save_master_weight_flag + shared_save_path = self._shared_save_master_weight_path + if self._process_master_weight is None: + self._process_master_weight = multiprocessing.Process( + target=self._save_file_async_in_process, + args=( + meta_dict, + self._shm_master_weight.name, + self._shared_save_master_weight_flag, + self._shared_save_master_weight_path, + self._lock, + "model_weight" + if 
"skip_save_model_weight" in self.args.unified_checkpoint_config + else state_dict_type, + self.global_rank, + ), + ) + self._process_master_weight.start() + elif state_dict_type == "optimizer_weight": + if self._shm_optimizer_weight is None: + self._meta_dict_optim, buffer_size = create_meta_dict(state_dict) + self._shm_optimizer_weight = shared_memory.SharedMemory(create=True, size=buffer_size) + shm_state_dict = self._shm_optimizer_weight + meta_dict = self._meta_dict_optim + shared_save_flag = self._shared_save_optimizer_flag + shared_save_path = self._shared_save_optimizer_path + if self._process_optimizer_weight is None: + self._process_optimizer_weight = multiprocessing.Process( + target=self._save_file_async_in_process, + args=( + meta_dict, + self._shm_optimizer_weight.name, + self._shared_save_optimizer_flag, + self._shared_save_optimizer_path, + self._lock, + state_dict_type, + self.global_rank, + ), + ) + self._process_optimizer_weight.start() + + while True: # wait until no process is saving. + flag_value = shared_save_flag[0] + if flag_value == 0: + break + time.sleep(0.5) + logger.info(f"Wait for the previous save process to finish saving {state_dict_type}") + # only save model weight or save master weight, we enter this loop. + self._reset_and_update(shared_save_path, path) + _traverse_copy_to_shm(state_dict, meta_dict, shm_state_dict.buf) + with self._lock: + shared_save_flag[0] = 1 + + def _save_file_async_in_process( + self, + meta_dict, + shm_name, + shared_save_flag, + shared_save_path, + lock, + state_dict_type, + global_rank, + ): + shm = shared_memory.SharedMemory(name=shm_name) + while True: + flag_value = shared_save_flag[0] # if process uses `spawn`, cannot read this value. + if flag_value == -1: # stop process + break + if flag_value == 0: # nothing to save + continue + if flag_value == 1: # need to save + path = shared_save_path[:].decode("utf-8").rstrip("\x00") + logger.info(f"Start to async save {path}") + state_dict = _read_state_dict_from_shm(meta_dict, shm) # numpy array + safe_save_file(state_dict, path, {"format": "np"}) + del state_dict + saved_signal_path = os.path.join(os.path.dirname(path), f".{state_dict_type}.done.{global_rank}") + paddle.save(global_rank, saved_signal_path) + with lock: + shared_save_flag[0] = 0 + time.sleep(0.5) + shm.close() + + def _reset_and_update(self, shared_array, new_value): + # clear array + for i in range(len(shared_array)): + shared_array[i] = b"\0" + # update array + encoded_value = new_value.encode("utf-8") + shared_array[: len(encoded_value)] = encoded_value + + def save_unified_checkpoint(self, model, optimizer, output_dir): + """save unified checkpoint + + Args: + model (PretrainedModel): model to save + output_dir (str): save dir + safe_serialization (bool, optional): use safetensors. Defaults to False. + + Raises: + ValueError: if model is not an instance of `PretrainedModel` and the model cannot be saved + """ + if isinstance(model, PretrainedModel): + model_to_save = model + elif isinstance(unwrap_model(model), PretrainedModel): + model_to_save = unwrap_model(model) + elif isinstance(model, PrefixModelForCausalLM) or isinstance(model, LoRAModel): + model_to_save = model + else: + raise ValueError("Unified checkpoint only supports PretrainedModel, LoRAModel and PrefixModelForCausalLM!") + + # Under non distributed environment. 
+ if paddle.distributed.get_world_size() <= 1: + self.save_single_card_checkpoint(model_to_save, output_dir) + return + + skip_save_model_weight = False + if UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value in self.args.unified_checkpoint_config: + if is_need_master_weight(optimizer, is_fp16_or_bp16=(self.args.fp16 or self.args.bf16)): + logger.info( + f"With {UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value}, skip the model checkpoint save." + " The master weight will be loaded as model weights for next resumption." + ) + # not save model weight, load from master weight + skip_save_model_weight = True + + save_directory = output_dir + os.makedirs(save_directory, exist_ok=True) + + # save model weights + if not skip_save_model_weight: + state_dict, shard_file, sharded_index = unified_checkpoint_into_shards( + self.args, model_to_save, safe_serialization=True + ) + is_sync_save = True + if "async_save" in self.args.unified_checkpoint_config: + is_sync_save = False + self._file_save_async_or_sync( + state_dict, + path=os.path.join(save_directory, shard_file), + is_sync=is_sync_save, + state_dict_type="model_weight", + ) + if sharded_index is not None: + if isinstance(model_to_save, LoRAModel) or isinstance(model_to_save, PrefixModelForCausalLM): + index_name = SAFE_PEFT_WEIGHTS_INDEX_NAME + else: + index_name = SAFE_WEIGHTS_INDEX_NAME + path = os.path.join(output_dir, index_name) + + if self.args.should_save: + with open(path, "w") as f: + json.dump(sharded_index, f, indent=4) + + if self.args.should_save: + # Save prefix model past_key_values + if isinstance(model_to_save, PrefixModelForCausalLM): + save_prefix_past_key_value(model_to_save, save_directory) + model_to_save.prefix_config.save_pretrained(save_directory) + if isinstance(model_to_save, LoRAModel): + model_to_save.lora_config.save_pretrained(save_directory) + + # save the config + config_to_save = save_config(model_to_save) + # Attach architecture to the config + config_to_save.architectures = [model_to_save.__class__.__name__] + if self.args.should_save: + config_to_save.save_pretrained(save_directory) + paddle.device.cuda.empty_cache() + + if strtobool(os.getenv("FLAG_LLM_PDC", "False")) and self.args.should_save: + world_size = paddle.distributed.get_world_size() + save_info = { + "world_size": world_size, + "ignore_save_lr_and_optim": self.args.ignore_save_lr_and_optim, + "skip_save_model_weight": "skip_save_model_weight" in self.args.unified_checkpoint_config, + } + paddle.save(save_info, os.path.join(save_directory, ".saving_info")) + + def load_unified_checkpoint(self, model, optimizer, resume_from_checkpoint: str): + """Load potential model checkpoint + + Args: + model (PretrainedModel): Your model to load + resume_from_checkpoint (str): path of the checkpoint to load + + Returns: + None + """ + if paddle.distributed.get_world_size() <= 1: + load_single_card_checkpoint(self.args, model, resume_from_checkpoint) + return + + local_resume = check_unified_checkpoint(self.args, model, resume_from_checkpoint, safe_serialization=True) + + if not local_resume: + logger.info("Begin to dynamically load unified checkpoint!") + load_unified_checkpoint_dynamically( + self.args, model, optimizer, resume_from_checkpoint, safe_serialization=True + ) + return + + if self.args.dataset_rank == 0: + load_unified_checkpoint_locally(self.args, model, resume_from_checkpoint, safe_serialization=True) + + def save_non_merge_optimizer(self, model, optimizer, output_dir): + paddle.device.cuda.empty_cache() + optim_state_dict = 
nested_copy(optimizer.state_dict()) + master_weights = None + if "master_weights" in optim_state_dict.keys(): + master_weights = optim_state_dict["master_weights"] + optim_state_dict.pop("master_weights") + if "LR_Scheduler" in optim_state_dict.keys(): + optim_state_dict.pop("LR_Scheduler") + + # gather global master_weights status. + global_master_weights = reduce_master_weights_status(master_weights is not None) + if master_weights is None and global_master_weights: + master_weights = {} + + # get optimizer param mappings + static2struct_name_mappings = {} + state_dict = get_expected_state_dict(model) + for k, v in state_dict.items(): + static2struct_name_mappings[v.name] = k + + # rename optimizer param name + for key in list(optim_state_dict.keys()): + static_name, type_name = generate_base_static_name(key) + new_name = static2struct_name_mappings[static_name] + "/" + type_name + optim_state_dict[new_name] = optim_state_dict.pop(key) + if master_weights is not None: + for key in list(master_weights.keys()): + master_weights[static2struct_name_mappings[key]] = master_weights.pop(key) + + optimizer_name = _add_variant(SAFE_OPTIMIZER_NAME, self.args.optimizer_name_suffix) + master_weights_name = _add_variant(SAFE_MASTER_WEIGHTS_NAME, self.args.optimizer_name_suffix) + + is_sync_save = True + if "async_save" in self.args.unified_checkpoint_config: + is_sync_save = False + self._file_save_async_or_sync( + optim_state_dict, + path=os.path.join(output_dir, optimizer_name), + is_sync=is_sync_save, + state_dict_type="optimizer_weight", + ) + self._file_save_async_or_sync( + master_weights, + path=os.path.join(output_dir, master_weights_name), + is_sync=is_sync_save, + state_dict_type="master_weight", + ) + + def load_non_merge_optimizer(self, model, optimizer, resume_from_checkpoint): + # init and get optimizer LR_Scheduler + returned_optim_state_dict = nested_copy(optimizer.state_dict()) + + optimizer_name = _add_variant(SAFE_OPTIMIZER_NAME, self.args.optimizer_name_suffix) + master_weights_name = _add_variant(SAFE_MASTER_WEIGHTS_NAME, self.args.optimizer_name_suffix) + optimizer_path = os.path.join(resume_from_checkpoint, optimizer_name) + master_weights_path = os.path.join(resume_from_checkpoint, master_weights_name) + has_master_weights = True if os.path.isfile(master_weights_path) else False + + model_state_dict = get_expected_state_dict(model) + struct2static_name_mappings = {k: v.name for k, v in model_state_dict.items()} # get optimizer param mappings + optimizer_state_dict = load_file(optimizer_path) + if has_master_weights: + master_weights = load_file(master_weights_path) + + # rename and move to paddle.Tensor + for key in list(optimizer_state_dict.keys()): + key_name = key.split("/") + static_name = struct2static_name_mappings[key_name[0]] + if has_master_weights: + key_name = "_".join([static_name, FP32_MASTER, key_name[1]]) + else: + key_name = "_".join([static_name, key_name[1]]) + with device_guard(): + weight = paddle.Tensor(optimizer_state_dict.pop(key), zero_copy=True) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) + returned_optim_state_dict[key_name] = weight + returned_optim_state_dict[key_name].name = key_name + + if has_master_weights: + returned_optim_state_dict["master_weights"] = {} + for key in list(master_weights.keys()): + static_name = struct2static_name_mappings[key] + with device_guard(): + weight = paddle.Tensor(master_weights.pop(key), zero_copy=True) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) + 
returned_optim_state_dict["master_weights"][static_name] = weight + returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER]) + + return returned_optim_state_dict + + def save_unified_optimizer(self, model, optimizer, output_dir): + """save unified optimizer + + Args: + model (PretrainedModel): model used to get key mapping. + optimizer (Optimizer): optimizer to save + output_dir (str): Save directory. + + """ + + if "ignore_merge_optimizer" in self.args.unified_checkpoint_config: + self.save_non_merge_optimizer(model, optimizer, output_dir) + return + + if paddle.distributed.get_world_size() <= 1: + self.save_single_card_optimizer(model, optimizer, output_dir) + return + + # Split into naive optimizer params and master weights. + results = unified_optimizer_into_shards(self.args, model, optimizer, safe_serialization=True) + master_weight_state_dict = None + if len(results) == 1: + optim_state_dict, shard_optim_file, sharded_optim_index = results[0] + else: + optim_state_dict, shard_optim_file, sharded_optim_index = results[0] + master_weight_state_dict, shard_master_weight_file, sharded_master_weight_index = results[1] + + paddle.device.cuda.empty_cache() + + save_directory = output_dir + os.makedirs(save_directory, exist_ok=True) + + is_sync_save = True + if "async_save" in self.args.unified_checkpoint_config: + is_sync_save = False + self._file_save_async_or_sync( + optim_state_dict, + path=os.path.join(save_directory, shard_optim_file), + is_sync=is_sync_save, + state_dict_type="optimizer_weight", + ) + if master_weight_state_dict is not None: + self._file_save_async_or_sync( + master_weight_state_dict, + path=os.path.join(save_directory, shard_master_weight_file), + is_sync=is_sync_save, + state_dict_type="master_weight", + ) + + if sharded_optim_index is not None: + optimizer_index_name = SAFE_OPTIMIZER_INDEX_NAME + path = os.path.join(output_dir, optimizer_index_name) + if self.args.should_save: + with open(path, "w") as f: + json.dump(sharded_optim_index, f, indent=4) + + master_weights_name = SAFE_MASTER_WEIGHTS_INDEX_NAME + if UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value in self.args.unified_checkpoint_config: + master_weights_name = SAFE_WEIGHTS_INDEX_NAME + master_path = os.path.join(output_dir, master_weights_name) + if master_weight_state_dict is not None: + if self.args.should_save: + with open(master_path, "w") as f: + json.dump(sharded_master_weight_index, f, indent=4) + + def load_unified_optimizer(self, args, model, optimizer, resume_from_checkpoint): + """Load potential model checkpoint + + Args: + model (PretrainedModel): Your model to load + resume_from_checkpoint (str): path of the checkpoint to load + + Returns: + None + """ + + if paddle.distributed.get_world_size() <= 1: + optim_state_dict = load_single_card_optimizer(self.args, model, optimizer, resume_from_checkpoint) + return optim_state_dict + + if "ignore_merge_optimizer" in self.args.unified_checkpoint_config: + if self.args.data_parallel_rank == 0: + returned_optim_state_dict = self.load_non_merge_optimizer( + model, + optimizer, + resume_from_checkpoint, + ) + return returned_optim_state_dict + else: + return None + + local_resume = check_unified_optimizer( + self.args, model, optimizer, resume_from_checkpoint, safe_serialization=True + ) + if not local_resume: + logger.info("Begin to dynamically load unified optimizer!") + returned_optim_state_dict = load_unified_optimizer_dynamically( + self.args, model, optimizer, resume_from_checkpoint, 
safe_serialization=True + ) + return returned_optim_state_dict + + if self.args.data_parallel_rank == 0: + returned_optim_state_dict = load_unified_optimizer_locally( + self.args, model, optimizer, resume_from_checkpoint, safe_serialization=True + ) + return returned_optim_state_dict + return None + + def save_single_card_checkpoint(self, model_to_save, output_dir): + """Save checkpoint for non-distributed environment.""" + + state_dict = get_expected_state_dict(model_to_save) + if isinstance(model_to_save, LoRAModel) or isinstance(model_to_save, PrefixModelForCausalLM): + weight_filename = "peft_model-00001-of-00001.safetensors" + index_filename = SAFE_PEFT_WEIGHTS_INDEX_NAME + else: + weight_filename = "model-00001-of-00001.safetensors" + index_filename = SAFE_WEIGHTS_INDEX_NAME + # get index json + index_weight_file = {} + total_size = 0 + for key, weight in state_dict.items(): + index_weight_file[key] = weight_filename + total_size += weight.numel().item() * dtype_byte_size(weight.dtype) + sharded_index_json = {} + sharded_index_json["metadata"] = {"total_size": total_size} + sharded_index_json["weight_map"] = index_weight_file + if isinstance(model_to_save, LoRAModel): + sharded_index_json["type"] = "lora" + elif isinstance(model_to_save, PrefixModelForCausalLM): + sharded_index_json["type"] = "ptuning" + + os.makedirs(output_dir, exist_ok=True) + path = os.path.join(output_dir, index_filename) + with open(path, "w") as f: + json.dump(sharded_index_json, f, indent=4) + + # save checkpoint + self._file_save_async_or_sync( + state_dict, path=os.path.join(output_dir, weight_filename), is_sync=True, state_dict_type="model_weight" + ) + + if isinstance(model_to_save, PrefixModelForCausalLM): + save_prefix_past_key_value(model_to_save, output_dir) + model_to_save.prefix_config.save_pretrained(output_dir) + if isinstance(model_to_save, LoRAModel): + model_to_save.lora_config.save_pretrained(output_dir) + + config_to_save = save_config(model_to_save) + config_to_save.architectures = [model_to_save.__class__.__name__] + config_to_save.save_pretrained(output_dir) + + def save_single_card_optimizer(self, model, optimizer, output_dir): + """ "Save optimizer for non-distributed environment.""" + # Split into optimizer params and master weights. 
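# Illustrative sketch of the index JSON that save_single_card_checkpoint writes above:
# a "weight_map" from parameter name to shard file plus a total byte size (and a "type"
# tag for LoRA / prefix-tuning models). The parameter names and size below are made up.
import json

example_index = {
    "metadata": {"total_size": 4096},
    "weight_map": {
        "bert.embeddings.word_embeddings.weight": "model-00001-of-00001.safetensors",
        "classifier.weight": "model-00001-of-00001.safetensors",
    },
}
print(json.dumps(example_index, indent=4))
# save_single_card_optimizer (continuing below) first splits optimizer.state_dict()
# into the optimizer slots proper and the optional fp32 master weights.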
+ optim_state_dict = nested_copy(optimizer.state_dict()) + master_weights = None + if "master_weights" in optim_state_dict.keys(): + master_weights = optim_state_dict.pop("master_weights") + if "LR_Scheduler" in optim_state_dict.keys(): + optim_state_dict.pop("LR_Scheduler") + + static2struct_name_mappings = {} + state_dict = get_expected_state_dict(model) + for k, v in state_dict.items(): + static2struct_name_mappings[v.name] = k + + # rename optimizer param + for key in list(optim_state_dict.keys()): + static_name, type_name = generate_base_static_name(key) + new_name = static2struct_name_mappings[static_name] + "/" + type_name + optim_state_dict[new_name] = optim_state_dict.pop(key) + if master_weights is not None: + for key in list(master_weights.keys()): + master_weights[static2struct_name_mappings[key]] = master_weights.pop(key) + + # save index json + index_optimizer_file, index_master_weight_file = {}, {} + total_optim_size, total_master_weight_size = 0, 0 + for key, weight in optim_state_dict.items(): + index_optimizer_file[key] = "optimizer-00001-of-00001.safetensors" + total_optim_size += weight.numel().item() * dtype_byte_size(weight.dtype) + if master_weights is not None: + for key, weight in master_weights.items(): + index_master_weight_file[key] = "master_weights-00001-of-00001.safetensors" + total_master_weight_size += weight.numel().item() * dtype_byte_size(weight.dtype) + path = os.path.join(output_dir, SAFE_OPTIMIZER_INDEX_NAME) + master_path = os.path.join(output_dir, SAFE_MASTER_WEIGHTS_INDEX_NAME) + with open(path, "w") as f: + has_master_weights = master_weights is not None + json.dump( + { + "metadata": {"total_size": total_optim_size}, + "weight_map": index_optimizer_file, + "master_weights": has_master_weights, + }, + f, + indent=4, + ) + if master_weights is not None: + with open(master_path, "w") as f: + json.dump( + {"metadata": {"total_size": total_master_weight_size}, "weight_map": index_master_weight_file}, + f, + indent=4, + ) + + # save optimizer state dict + self._file_save_async_or_sync( + optim_state_dict, + path=os.path.join(output_dir, "optimizer-00001-of-00001.safetensors"), + is_sync=True, + state_dict_type="optimizer_weight", + ) + if master_weights is not None: + self._file_save_async_or_sync( + master_weights, + path=os.path.join(output_dir, "master_weights-00001-of-00001.safetensors"), + is_sync=True, + state_dict_type="master_weight", + ) + + def unlink_shared_memory(self): + if not ("async_save" in self.args.unified_checkpoint_config): + return + + if self._shared_save_model_flag is not None: + while self._shared_save_model_flag[0] > 0: # async process is saving + time.sleep(0.5) + self._shared_save_model_flag[0] = -1 + if self._shared_save_master_weight_flag is not None: + while self._shared_save_master_weight_flag[0] > 0: + time.sleep(0.5) + self._shared_save_master_weight_flag[0] = -1 + if self._shared_save_optimizer_flag is not None: + while self._shared_save_optimizer_flag[0] > 0: + time.sleep(0.5) + self._shared_save_optimizer_flag[0] = -1 + + if self._shm_model_weight is not None: + self._shm_model_weight.close() + self._shm_model_weight.unlink() + self._shm_model_weight = None + if self._shm_master_weight is not None: + self._shm_master_weight.close() + self._shm_master_weight.unlink() + self._shm_master_weight = None + if self._shm_optimizer_weight is not None: + self._shm_optimizer_weight.close() + self._shm_optimizer_weight.unlink() + self._shm_optimizer_weight = None + + dist.barrier() + + +def load_unified_checkpoint_locally(args, 
model, resume_from_checkpoint: str, safe_serialization=False): + """ + Only dataset_rank == 0 can enter this function. + """ + index_filename = select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=True) + + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path=resume_from_checkpoint, + index_filename=os.path.join(resume_from_checkpoint, index_filename), + ) + loaded_keys = sharded_metadata["all_checkpoint_keys"] + + model_state_dict = get_expected_state_dict(model) + expected_keys = set(list(model_state_dict.keys())) + missing_keys = expected_keys - set(loaded_keys) + + use_fast_set = True + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + use_fast_set = False + + if len(missing_keys) > 0: + raise ValueError(f"missing_keys: {missing_keys}") + + def _remove_unused_keys( + state_dict, + model_state_dict, + ): + unused_keys = set(state_dict.keys()) - set(model_state_dict.keys()) + for unused_key in unused_keys: + del state_dict[unused_key] + return unused_keys + + # This should always be a list but, just to be sure. + if not isinstance(resolved_archive_file, list): + resolved_archive_file = [resolved_archive_file] + + error_msgs = [] + + if len(resolved_archive_file) > 1: + resolved_archive_file = tqdm(resolved_archive_file, desc="Loading checkpoint shards") + + for shard_file in resolved_archive_file: + # TODO: check if no expected_keys in shard_file, then don't load it + if expected_keys.isdisjoint(sharded_metadata["file_map"][os.path.split(shard_file)[-1]]): + continue + + pre_tensor_parallel_split = False + if shard_file.endswith(".safetensors") and model.config.tensor_parallel_degree > 1: + pre_tensor_parallel_split = True + assert loaded_keys is not None, "loaded_keys is not None." + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + tp_actions = model._get_tensor_parallel_convert_actions( + set(loaded_keys), is_split=True, ignore_error=True + ) + else: + tp_actions = model.get_tensor_parallel_convert_actions(model.config, loaded_keys, ignore_error=True) + # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors + state_dict = load_state_dict( + shard_file, tp_actions if pre_tensor_parallel_split else None, expected_keys, device="expected" + ) + + if not pre_tensor_parallel_split: + # Since we load all keys but we only need one of pipeline stages + _ = _remove_unused_keys(state_dict, model_state_dict) + + if model.config.tensor_parallel_degree > 1 and not pre_tensor_parallel_split: + logger.info("Converting state_dict to Tensor Parallel Format") + # ignore error for multi shard, since only parts of data + state_dict = model.convert_tensor_parallel( + None, model.config, state_dict=state_dict, ignore_error=len(resolved_archive_file) > 1 + ) + + if use_fast_set: + error_msgs += faster_set_state_dict(model, state_dict, strict_dtype=False) + else: + error_msgs += _load_state_dict_into_model(model, state_dict, "") + + # force memory release + del state_dict + # gc.collect() + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + if " but the expected shape is" in error_msg: + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
+ ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + +def save_config(model_to_save): + dtype = get_parameter_dtype(model_to_save) + model_to_save.config.dtype = str(dtype).split(".")[1] + config_to_save = copy.deepcopy(model_to_save.config) + + if config_to_save.tensor_parallel_degree > 1: + # do we need to change? + config_to_save.tensor_parallel_degree = 1 + + return config_to_save + + +def unified_checkpoint_into_shards( + args, + model_to_save, + safe_serialization=False, +): + """Get state_dict and config to save + + Args: + model_to_save (nn.Layer): model to, save + safe_serialization (bool, optional): safe serialization using safetensors. Defaults to False. + + Returns: + tuple: state_dict, config, shard_file: file name, sharded_index: map for weight to file name. + """ + paddle.device.cuda.empty_cache() + assert hasattr(model_to_save, "config") + + state_dict = get_expected_state_dict(model_to_save) + all_filter_keys = filter_params(model_to_save, state_dict) + + config_to_save = copy.deepcopy(model_to_save.config) + + if config_to_save.tensor_parallel_degree > 1: + if isinstance(model_to_save, LoRAModel) or isinstance(model_to_save, PrefixModelForCausalLM): + tp_actions = model_to_save._get_tensor_parallel_convert_actions( + all_filter_keys, is_split=False, ignore_error=True + ) + else: + tp_actions = model_to_save.get_tensor_parallel_convert_actions( + model_to_save.config, state_dict.keys(), is_split=False, ignore_error=True + ) + logger.info("Unified model tensor parallel weights in shards") + state_dict = merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys) + + # build index json file + index_weight_file = {} + total_size = 0 + if isinstance(model_to_save, LoRAModel): + weights_name = SAFE_PEFT_WEIGHTS_NAME if safe_serialization else LORA_WEIGHTS_NAME + elif isinstance(model_to_save, PrefixModelForCausalLM): + weights_name = SAFE_PEFT_WEIGHTS_NAME if safe_serialization else PREFIX_WEIGHTS_NAME + else: + weights_name = SAFE_WEIGHTS_NAME if safe_serialization else PADDLE_WEIGHTS_NAME + + shard_file = get_sharded_file_name(args, weights_name) + for key, weight in state_dict.items(): + index_weight_file[key] = shard_file + total_size += weight.numel().item() * dtype_byte_size(weight.dtype) + + index_file_list, total_size_list = gather_sharded_object(index_weight_file, total_size) + sharded_index = get_sharded_index( + index_file_list, + total_size_list, + ) + if sharded_index is not None: + if isinstance(model_to_save, LoRAModel): + sharded_index["type"] = "lora" + elif isinstance(model_to_save, PrefixModelForCausalLM): + sharded_index["type"] = "ptuning" + + paddle.device.cuda.empty_cache() + + return state_dict, shard_file, sharded_index + + +def load_unified_optimizer_locally(args, model, optimizer, resume_from_checkpoint, safe_serialization=False): + # init and get optimizer LR_Scheduler + returned_optim_state_dict = nested_copy(optimizer.state_dict()) + + if not safe_serialization: + index_filename, index_filename_master_weights = ( + PADDLE_OPTIMIZER_INDEX_NAME, + PADDLE_MASTER_WEIGHTS_INDEX_NAME, + ) + else: + index_filename, index_filename_master_weights = SAFE_OPTIMIZER_INDEX_NAME, SAFE_MASTER_WEIGHTS_INDEX_NAME + + resolved_archive_file, sharded_metadata = get_optimizer_shard_files( + optimizer_path=resume_from_checkpoint, + index_filename=os.path.join(resume_from_checkpoint, index_filename), + ) + has_master_weights = True if sharded_metadata["master_weights"] else False + + model_state_dict 
= get_expected_state_dict(model) + model_keys = list(model_state_dict.keys()) + struct2static_name_mappings = {k: v.name for k, v in model_state_dict.items()} # get optimizer param mappings + + expected_keys = get_expected_keys(sharded_metadata, model, optimizer) + + # This should always be a list but, just to be sure. + if not isinstance(resolved_archive_file, list): + resolved_archive_file = [resolved_archive_file] + + if len(resolved_archive_file) > 1: + resolved_archive_file = tqdm(resolved_archive_file, desc="Loading optimizer shards") + + # update has_master_weights and index_filename_master_weights + # 1. if the master weight exists, only has_master_weights is set True and loaded when needed + # 2. if master weight does not exist, convert model weight to master weight when needed + has_master_weights, index_filename_master_weights = update_master_weight_status( + args, optimizer, has_master_weights, safe_serialization + ) + + if has_master_weights: + returned_optim_state_dict["master_weights"] = {} + + resolved_archive_file_mw, sharded_metadata_mw = get_optimizer_shard_files( + optimizer_path=resume_from_checkpoint, + index_filename=os.path.join(resume_from_checkpoint, index_filename_master_weights), + ) + + expected_keys_mw = get_expected_keys(sharded_metadata_mw, model, optimizer) + if not isinstance(resolved_archive_file_mw, list): + resolved_archive_file_mw = [resolved_archive_file_mw] + if len(resolved_archive_file_mw) > 1: + resolved_archive_file_mw = tqdm(resolved_archive_file_mw, desc="Loading master weights shards") + + def load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected_keys, is_master_weights=False): + returned_state_dict = {} + # load optimizer + for shard_file in resolved_archive_file: + # TODO: check if no expected_keys in shard_file, then don't load it + if expected_keys.isdisjoint(sharded_metadata["file_map"][os.path.split(shard_file)[-1]]): + continue + + if shard_file.endswith(".safetensors"): + # assert model_keys is not None, "model_keys is None." TODO: correct the assert + if model.config.tensor_parallel_degree > 1: + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + tp_actions = model._get_tensor_parallel_convert_actions( + model_keys, is_split=True, ignore_error=True + ) + else: + tp_actions = model.get_tensor_parallel_convert_actions( + model.config, model_keys, ignore_error=True + ) + if not is_master_weights: + tp_actions = mapping_optimizer_tp_actions(tp_actions, expected_keys) + + # Here we use expected_keys to optimize weights loading for pipeline model. 
Only works for safetensors + state_dict = load_state_dict(shard_file, tp_actions, expected_keys, device="expected") + else: + # for pipeline model, we don't need to use tp_actions + state_dict = load_state_dict(shard_file, None, expected_keys, device="expected") + + returned_state_dict.update(state_dict) + # force memory release + del state_dict + gc.collect() + return returned_state_dict + + state_dict_optim = load_resolved_archive_file(resolved_archive_file, sharded_metadata, expected_keys) + if has_master_weights: + state_dict_master_weight = load_resolved_archive_file( + resolved_archive_file_mw, sharded_metadata_mw, expected_keys_mw, is_master_weights=True + ) + # rename optimizer param + for key in list(state_dict_optim.keys()): + key_name = key.split("/") + static_name = struct2static_name_mappings[key_name[0]] + if has_master_weights: + key_name = "_".join([static_name, FP32_MASTER, key_name[1]]) + else: + key_name = "_".join([static_name, key_name[1]]) + returned_optim_state_dict[key_name] = state_dict_optim.pop(key) + returned_optim_state_dict[key_name].name = key_name + + if has_master_weights: + for key in list(state_dict_master_weight.keys()): + static_name = struct2static_name_mappings[key] + returned_optim_state_dict["master_weights"][static_name] = state_dict_master_weight.pop(key) + returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER]) + + return returned_optim_state_dict + + +def unified_optimizer_into_shards( + args, + model, + optimizer, + safe_serialization=False, +): + """Get optimizer state dict and master weight state dict. + + Args: + optimizer (Optimizer): optimizer to save. + safe_serialization (bool, optional): safe serialization using safetensors. Defaults to False. + """ + paddle.device.cuda.empty_cache() + optim_state_dict = nested_copy(optimizer.state_dict()) + master_weights = None + if "master_weights" in optim_state_dict.keys(): + master_weights = optim_state_dict["master_weights"] + optim_state_dict.pop("master_weights") + if "LR_Scheduler" in optim_state_dict.keys(): + optim_state_dict.pop("LR_Scheduler") + + # gather global master_weights status. 
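# Illustrative sketch of the optimizer key renaming used above: on disk the unified
# checkpoint stores "<structured param name>/<slot type>", while the runtime optimizer
# state uses "<static var name>_fp32_master_0_<slot type>" when master weights exist.
# The mapping below is hypothetical.
struct2static = {"bert.encoder.layers.0.linear1.weight": "linear_0.w_0"}

def to_runtime_key(disk_key, has_master_weights=True):
    struct_name, type_name = disk_key.split("/")
    static_name = struct2static[struct_name]
    parts = [static_name, "fp32_master_0", type_name] if has_master_weights else [static_name, type_name]
    return "_".join(parts)

assert to_runtime_key("bert.encoder.layers.0.linear1.weight/moment1_0") == "linear_0.w_0_fp32_master_0_moment1_0"
# Before sharding the optimizer state, the code below first agrees across ranks on
# whether any rank holds master weights (reduce_master_weights_status).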
+ global_master_weights = reduce_master_weights_status(master_weights is not None) + if master_weights is None and global_master_weights: + master_weights = {} + + # get optimizer param mappings + static2struct_name_mappings = {} + state_dict = get_expected_state_dict(model) + for k, v in state_dict.items(): + static2struct_name_mappings[v.name] = k + + # rename optimizer param + for key in list(optim_state_dict.keys()): + static_name, type_name = generate_base_static_name(key) + new_name = static2struct_name_mappings[static_name] + "/" + type_name + optim_state_dict[new_name] = optim_state_dict.pop(key) + if master_weights is not None: + for key in list(master_weights.keys()): + master_weights[static2struct_name_mappings[key]] = master_weights.pop(key) + + # filter optimizer param + if master_weights is not None: + filter_master_keys = filter_params(model, master_weights, is_optimizer=True) + filter_optim_keys = filter_params(model, optim_state_dict, is_optimizer=True) + + tp_group = fleet.get_hybrid_communicate_group().get_model_parallel_group() + tp_size = tp_group.nranks + + if tp_size > 1: + # get tp_actions + model_keys = [] + for key in optim_state_dict.keys(): + base_model_key = key.split("/")[0] + if base_model_key not in model_keys: + model_keys.append(base_model_key) + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + tp_actions = model._get_tensor_parallel_convert_actions(model_keys, is_split=False, ignore_error=True) + else: + tp_actions = model.get_tensor_parallel_convert_actions( + model.config, model_keys, is_split=False, ignore_error=True + ) + logger.info("Unified optimizer tensor parallel in shards") + optim_state_dict = merge_tensor_parallel_for_optimizer( + optim_state_dict, + tp_actions, + filter_optim_keys, + ) + paddle.device.cuda.empty_cache() + + if master_weights is not None: + logger.info("Unified master weight tensor parallel in shards") + master_weights = merge_tensor_parallel_for_optimizer( + master_weights, + tp_actions, + filter_master_keys, + ) + paddle.device.cuda.empty_cache() + + # build index json file + index_optimizer_file, index_master_weight_file = {}, {} + total_optim_size, total_master_weight_size = 0, 0 + optimizer_name = SAFE_OPTIMIZER_NAME if safe_serialization else PADDLE_OPTIMIZER_NAME + master_weights_name = SAFE_MASTER_WEIGHTS_NAME if safe_serialization else PADDLE_MASTER_WEIGHTS_NAME + if UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value in args.unified_checkpoint_config: + master_weights_name = SAFE_WEIGHTS_NAME if safe_serialization else PADDLE_WEIGHTS_NAME + shard_optimizer_file = get_sharded_file_name(args, optimizer_name, is_optimizer=True) + shard_master_weight_file = get_sharded_file_name(args, master_weights_name, is_optimizer=True) + + for key, weight in optim_state_dict.items(): + index_optimizer_file[key] = shard_optimizer_file + total_optim_size += weight.numel().item() * dtype_byte_size(weight.dtype) + + if master_weights is not None: + for key, weight in master_weights.items(): + index_master_weight_file[key] = shard_master_weight_file + total_master_weight_size += weight.numel().item() * dtype_byte_size(weight.dtype) + + index_optimizer_filelist, total_optim_size_list = gather_sharded_object( + index_optimizer_file, total_optim_size, is_optimizer=True + ) + sharded_optim_index = get_sharded_index(index_optimizer_filelist, total_optim_size_list) + if master_weights is not None: + index_master_weight_filelist, total_master_weight_size_list = gather_sharded_object( + index_master_weight_file, 
total_master_weight_size, is_optimizer=True + ) + sharded_master_weight_index = get_sharded_index(index_master_weight_filelist, total_master_weight_size_list) + + if sharded_optim_index is not None: + if master_weights is not None: + sharded_optim_index["master_weights"] = True + else: + sharded_optim_index["master_weights"] = False + + paddle.device.cuda.empty_cache() + if master_weights is None: + return [(optim_state_dict, shard_optimizer_file, sharded_optim_index)] + else: + return [ + (optim_state_dict, shard_optimizer_file, sharded_optim_index), + (master_weights, shard_master_weight_file, sharded_master_weight_index), + ] + + +def check_unified_checkpoint(args, model, resume_from_checkpoint, safe_serialization=False): + index_filename = select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=False) + index_filename = os.path.join(resume_from_checkpoint, index_filename) + # Find index json file and distribute this file in global group. + if distributed_isfile(index_filename): + distributed_file(index_filename) + else: + raise Exception( + f"Sorry, we can not find {index_filename}. This file should be appear at least on one machine." + ) + + with open(index_filename, "r") as f: + index = json.loads(f.read()) + all_weight_filenames = sorted(set(index["weight_map"].values())) + + # Get existed weight file list on current machine. + existed_filelist = [] + existed_files = [] + for filename in os.listdir(resume_from_checkpoint): + if filename in all_weight_filenames: + existed_files.append(filename) + + # Gather all the existed files in global group. + dist.all_gather_object(existed_filelist, existed_files) + flatten_existed_filelist = flatten_list(existed_filelist) + diff_filelist = list(set(all_weight_filenames).difference(set(flatten_existed_filelist))) + if len(diff_filelist) != 0: + raise Exception(f"Sorry, the weight file list on the machines is not complete!, missing {diff_filelist}") + + # To decide whether to load the checkpoint locally, or need to dynamically send tensors across machines. 
+ local_resume = True + if args.dataset_rank == 0: + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + pp_group = hcg.get_pipe_parallel_group() + + need_files = set() + state_dict = get_expected_state_dict(model) + for key in state_dict.keys(): + filename = index["weight_map"][key] + need_files.add(filename) + diff_filelist = list(need_files.difference(set(existed_files))) + num_diff = paddle.to_tensor([len(diff_filelist)]) + if tp_group.nranks > 1: + dist.all_reduce(num_diff, op=dist.ReduceOp.MAX, group=tp_group) + if pp_group.nranks > 1: + dist.all_reduce(num_diff, op=dist.ReduceOp.MAX, group=pp_group) + if num_diff.item() == 0: + local_resume = True + else: + local_resume = False + local_resume = paddle.to_tensor([local_resume]) + dist.all_reduce(local_resume, op=dist.ReduceOp.PROD) + local_resume = local_resume.item() + return local_resume + + +def check_unified_optimizer(args, model, optimizer, resume_from_checkpoint, safe_serialization=False): + if not safe_serialization: + index_filename, index_filename_master_weights = PADDLE_OPTIMIZER_INDEX_NAME, PADDLE_MASTER_WEIGHTS_INDEX_NAME + else: + index_filename, index_filename_master_weights = SAFE_OPTIMIZER_INDEX_NAME, SAFE_MASTER_WEIGHTS_INDEX_NAME + index_filename = os.path.join(resume_from_checkpoint, index_filename) + index_filename_master_weights = os.path.join(resume_from_checkpoint, index_filename_master_weights) + + # Find index json file and distribute the file in global group. + if distributed_isfile(index_filename): + distributed_file(index_filename) + else: + raise Exception( + f"Sorry, we can not find {index_filename}. This file should be appear at least on one machine." + ) + + with open(index_filename, "r") as f: + index = json.loads(f.read()) + all_optimizer_filenames = sorted(set(index["weight_map"].values())) + + has_master_weights = index["master_weights"] + # update has_master_weights and index_filename_master_weights + # 1. if the master weight exists, only has_master_weights is set True and loaded when needed + # 2. if master weight does not exist, convert model weight to master weight when needed + has_master_weights, index_filename_master_weights = update_master_weight_status( + args, optimizer, has_master_weights, safe_serialization + ) + if has_master_weights: + index_filename_master_weights = os.path.join(resume_from_checkpoint, index_filename_master_weights) + if distributed_isfile(index_filename_master_weights): + distributed_file(index_filename_master_weights) + else: + raise Exception( + f"Sorry, we can not find {index_filename_master_weights}. This file should be appear at least on one machine." + ) + with open(index_filename_master_weights, "r") as f: + index_mw = json.loads(f.read()) + all_mw_filenames = sorted(set(index_mw["weight_map"].values())) + + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + pp_group = hcg.get_pipe_parallel_group() + sharding_group = hcg.get_sharding_parallel_group() + sharding_rank = sharding_group.rank + struct2static_name_mappings = {k: v.name for k, v in model.state_dict().items()} + if sharding_group.nranks > 1: + param2rank = optimizer._param2rank + + def check_complete(all_filenames): + # Check whether the checkpoint files on machines are complete. If not complete, raise Exception. 
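# Illustrative sketch (no real communication group; per-rank counts are hypothetical)
# of the consensus rule used by check_unified_checkpoint above: each rank counts the
# shard files it still needs; a MAX-reduce within the tensor/pipeline groups detects
# whether any rank in the group is missing a file, and a PROD-reduce over all ranks
# yields "resume locally" only if every rank can.
missing_per_rank = [0, 2, 0]                     # len(diff_filelist) on each rank
any_missing = max(missing_per_rank) > 0          # effect of dist.all_reduce(..., ReduceOp.MAX)
local_resume_per_rank = [n == 0 for n in missing_per_rank]
global_resume = all(local_resume_per_rank)       # effect of dist.all_reduce(..., ReduceOp.PROD)
assert any_missing and not global_resume
# check_complete (continuing below) performs the same gather-and-diff to make sure the
# optimizer shard files present across machines cover the full index.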
+ existed_filelist = [] + existed_files = [] + for filename in os.listdir(resume_from_checkpoint): + if filename in all_filenames: + existed_files.append(filename) + + dist.all_gather_object(existed_filelist, existed_files) + flatten_existed_filelist = flatten_list(existed_filelist) + diff_filelist = list(set(all_filenames).difference(set(flatten_existed_filelist))) + if len(diff_filelist) != 0: + raise Exception( + f"Sorry, the optimizer file list on `data_parallel_rank==0` machines is not complete!, missing {diff_filelist}" + ) + return existed_files + + def check_dynamic_load(args, weight_map, existed_files, is_master_weights=False, typename_set=None): + # To decide whether to load the checkpoint locally, or need to dynamically distribute the checkpoint. + local_resume = True + if args.data_parallel_rank == 0: + need_files = set() + state_dict = get_expected_state_dict(model) + for key in state_dict.keys(): + if sharding_group.nranks > 1: + static_name = struct2static_name_mappings.get(key, None) + param_rank = param2rank.get(static_name, None) + if param_rank != sharding_rank: + continue + + if not is_master_weights: + for type_name in typename_set: + type_key = key + "/" + type_name + filename = weight_map[type_key] + need_files.add(filename) + else: + filename = weight_map[key] + need_files.add(filename) + + diff_filelist = list(need_files.difference(set(existed_files))) + num_diff = paddle.to_tensor([len(diff_filelist)]) + if tp_group.nranks > 1: + dist.all_reduce(num_diff, op=dist.ReduceOp.MAX, group=tp_group) + if pp_group.nranks > 1: + dist.all_reduce(num_diff, op=dist.ReduceOp.MAX, group=pp_group) + if sharding_group.nranks > 1: + dist.all_reduce(num_diff, op=dist.ReduceOp.MAX, group=sharding_group) + + if num_diff.item() == 0: + local_resume = True + else: + local_resume = False + local_resume = paddle.to_tensor([local_resume]) + dist.all_reduce(local_resume, op=dist.ReduceOp.PROD) + return local_resume.item() + + # check whether the optimizer checkpoint files are complete. + existed_files = check_complete(all_optimizer_filenames) + if has_master_weights: + existed_files_mw = check_complete(all_mw_filenames) + # get optimizer's param type name, like moment1_0. 
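# Illustrative sketch of the sharding filter used in check_dynamic_load above: with a
# sharded optimizer, each rank only needs the shard files for the parameters that
# optimizer._param2rank assigns to it. The mapping values here are made up.
param2rank = {"linear_0.w_0": 0, "linear_1.w_0": 1}
sharding_rank = 0
owned = [name for name, rank in param2rank.items() if rank == sharding_rank]
assert owned == ["linear_0.w_0"]
# The typename set built next ("moment1_0", "beta1_pow_acc_0", ...) expands each owned
# parameter into the per-slot keys looked up in the weight map.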
+ typename_set = set() + for key in index["weight_map"].keys(): + _, typename = key.split("/") + typename_set.add(typename) + local_resume = check_dynamic_load( + args, index["weight_map"], existed_files, is_master_weights=False, typename_set=typename_set + ) + local_resume_rw = True + if has_master_weights: + local_resume_rw = check_dynamic_load(args, index_mw["weight_map"], existed_files_mw, is_master_weights=True) + return local_resume & local_resume_rw + + +def save_prefix_past_key_value(model_to_save, save_directory): + past_key_value = model_to_save.prefix_encoder(model_to_save.prefix_tokens.unsqueeze(0).expand([1, -1])) + past_key_value = past_key_value.reshape( + [ + model_to_save.prefix_config.num_prefix_tokens, + 2, + model_to_save.prefix_config.num_hidden_layers, + model_to_save.num_heads, + model_to_save.head_dim, + ] + ) + past_key_value = paddle.transpose(past_key_value, perm=[2, 1, 3, 0, 4]).cpu().numpy() + model_to_save.prefix_config.save_pretrained(save_directory) + np.save(os.path.join(save_directory, PAST_KEY_VALUES_FILE_NAME), past_key_value) + + +def get_expected_state_dict(model_to_save): + if isinstance(model_to_save, PretrainedModel): + state_dict = model_to_save.state_dict() + if ( + hasattr(model_to_save.config, "tie_word_embeddings") + and model_to_save.config.tie_word_embeddings + and hasattr(model_to_save, "_tied_weights_keys") + and model_to_save._tied_weights_keys is not None + ): + for key in model_to_save._tied_weights_keys: + if key in state_dict: + state_dict.pop(key) + elif isinstance(model_to_save, LoRAModel): + state_dict = model_to_save.get_trainable_state_dict() + elif isinstance(model_to_save, PrefixModelForCausalLM): + state_dict = model_to_save.prefix_encoder.state_dict() + + return state_dict + + +def create_dispatch_table(args, model, file_keyname_mappings, file_machine_mappings, resume_from_checkpoint): + """Create dispatch table for dynamically loading state dict. + + Args: + args + """ + + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + tp_rank = tp_group.rank + + # Create tensor receive table, contains {"key0": [global_rank, tp_rank], "key1": [global_rank, tp_rank]} + dispatch_list = [] + recv_table = {} + if args.dataset_rank == 0: + state_dict = get_expected_state_dict(model) + for (k, v) in state_dict.items(): + if hasattr(v, "is_distributed") and v.is_distributed: + recv_table[k] = [(dist.get_rank(), tp_rank)] + else: + recv_table[k] = [(dist.get_rank(), -1)] + + # Gather receive table in global group. + dist.all_gather_object(dispatch_list, recv_table) + recv_table = {} + for dl in dispatch_list: + for key, value in dl.items(): + if key not in recv_table: + recv_table[key] = value + else: + recv_table[key] += value + + # Create send table, to decide which worker to send the key. 
Contains {"key0:" global_rank, "key1": global_rank, ...} + send_table = create_send_table(file_keyname_mappings, file_machine_mappings) + + return send_table, recv_table + + +def create_optimizer_dispatch_table( + args, + model, + optimizer, + file_keyname_mappings, + file_machine_mappings, + resume_from_checkpoint, + struct2static_name_mappings, + is_master_weights=False, + typename_set=None, +): + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + sharding_group = hcg.get_sharding_parallel_group() + sharding_rank = sharding_group.rank + if sharding_group.nranks > 1: + param2rank = optimizer._param2rank + tp_rank = tp_group.rank + + # Create receive table, contains {"param_key0": [global_rank, tp_rank], "param_key1": [global_rank, tp_rank]} + dispatch_list = [] + recv_table = {} + if args.data_parallel_rank == 0: + state_dict = get_expected_state_dict(model) + for (k, v) in state_dict.items(): + if sharding_group.nranks > 1: + static_name = struct2static_name_mappings[k] + param_rank = param2rank.get(static_name, None) + if param_rank != sharding_rank: + continue + if is_master_weights: + if hasattr(v, "is_distributed") and v.is_distributed: + recv_table[k] = [(dist.get_rank(), tp_rank)] + else: + recv_table[k] = [(dist.get_rank(), -1)] + else: + for typename in typename_set: + type_key = k + "/" + typename + if typename in optimizer_non_scaler_name: + if hasattr(v, "is_distributed") and v.is_distributed: + recv_table[type_key] = [(dist.get_rank(), tp_rank)] + else: + recv_table[type_key] = [(dist.get_rank(), -1)] + else: + recv_table[type_key] = [(dist.get_rank(), -1)] + + dist.all_gather_object(dispatch_list, recv_table) + recv_table = {} + for dl in dispatch_list: + for k, v in dl.items(): + if k not in recv_table: + recv_table[k] = v + else: + recv_table[k] += v + + # Create send table, to decide which worker to send the key. Contains {"param_key0:" 0, "param_key1": 1, ...} + send_table = create_send_table(file_keyname_mappings, file_machine_mappings) + return send_table, recv_table + + +def load_unified_checkpoint_dynamically(args, model, optimizer, resume_from_checkpoint, safe_serialization=False): + index_filename = select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=False) + index_filename = os.path.join(resume_from_checkpoint, index_filename) + + with open(index_filename, "r") as f: + index = json.loads(f.read()) + + # `file_keyname_mappings` indicates which keys each file contains. For example, {"model-00001-of-00002.safetensors": ["llama.embed_tokens.weight", "llama.layers.0.self_attn.q_proj.weight", ...]} + # `file_machine_mappings` indicates the machine where the files appear. For example, {"model-00001-of-00002.safetensors": [machine_0, machine_1], "model-00002-of-00002.safetensors": [machine_0]} + file_keyname_mappings, file_machine_mappings = get_file_mappings(index, resume_from_checkpoint) + + logger.debug("Creating dispatch table for unified checkpoint load ...") + # Get send_table and recv_table. The send table indicates which workers are responsible for sending tensors, and the recv table indicates which workers should receive the tensors. + send_table, recv_table = create_dispatch_table( + args, model, file_keyname_mappings, file_machine_mappings, resume_from_checkpoint + ) + + # Get all the keys that are splited by tensor parallelism. 
+ all_tp_keys = set() + for k, v in recv_table.items(): + if v[0][1] != -1: + all_tp_keys.add(k) + + config_revise = copy.deepcopy(model.config) + config_revise.tensor_parallel_rank = None + if len(all_tp_keys) == 0: + tp_actions = {} + else: + # Get corresponding tensor parallel actions. + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + tp_actions = model._get_tensor_parallel_convert_actions( + set(all_tp_keys), is_split=True, ignore_error=True, config=config_revise + ) + else: + tp_actions = model.get_tensor_parallel_convert_actions(config_revise, all_tp_keys, ignore_error=True) + + logger.debug("Distributed send recv for state dict load ...") + # Distribute the checkpoint tensor dynamically, using the `send_table` and `recv_table` we create before. + state_dict = distributed_send_recv( + config_revise, + get_expected_state_dict(model), + tp_actions, + send_table, + recv_table, + resume_from_checkpoint, + file_keyname_mappings, + file_machine_mappings, + ) + dist.barrier() + logger.debug("Setting state dict into model ...") + error_msgs = _load_state_dict_into_model(model, state_dict, "") + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + raise RuntimeError(f"Error(s) in loading dynamic state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + +def load_unified_optimizer_dynamically(args, model, optimizer, resume_from_checkpoint, safe_serialization=False): + optim_state_dict = nested_copy(optimizer.state_dict()) + if "master_weights" in optim_state_dict.keys(): + optim_state_dict.pop("master_weights") + + if safe_serialization: + index_filename, index_filename_mw = SAFE_OPTIMIZER_INDEX_NAME, SAFE_MASTER_WEIGHTS_INDEX_NAME + else: + index_filename, index_filename_mw = PADDLE_OPTIMIZER_INDEX_NAME, PADDLE_MASTER_WEIGHTS_INDEX_NAME + + with open(os.path.join(resume_from_checkpoint, index_filename), "r") as f: + index = json.loads(f.read()) + + # `file_keyname_mappings` indicates which keys each file contains. For example, {"optimizer-00001-of-00002.safetensors": ["llama.embed_tokens.weight/moment1_0", "llama.layers.1.mlp.gate_proj.weight/moment1_0", ...]} + # `file_machine_mappings` indicates the machine where the files appear. For example, {"optimizer-00001-of-00002.safetensors": [machine_0, machine_1], "optimizer-00002-of-00002.safetensors": [machine_0]} + file_keyname_mappings, file_machine_mappings = get_file_mappings(index, resume_from_checkpoint) + + has_master_weights = index["master_weights"] + # update has_master_weights and index_filename_master_weights + # 1. if the master weights exists, only has_master_weights is set True and load master weights when needed + # 2. if master weights does not exist, convert model weights to master weights when needed + has_master_weights, index_filename_mw = update_master_weight_status( + args, optimizer, has_master_weights, safe_serialization + ) + + if has_master_weights: + with open(os.path.join(resume_from_checkpoint, index_filename_mw), "r") as f: + index_mw = json.loads(f.read()) + file_keyname_mappings_mw, file_machine_mappings_mw = get_file_mappings(index_mw, resume_from_checkpoint) + + # Get optimizer param type name, like moment1_0, moment2_0, beta1_pow_acc_0. 
+ typename_set = set() + for key in index["weight_map"].keys(): + _, typename = key.split("/") + typename_set.add(typename) + struct2static_name_mappings = {k: v.name for k, v in get_expected_state_dict(model).items()} + static2struct_name_mappings = {v.name: k for k, v in get_expected_state_dict(model).items()} + # Get send_table and recv_table. The send table indicates which workers are responsible for sending tensors, and the recv table indicates which workers should receive the tensors. + send_table, recv_table = create_optimizer_dispatch_table( + args, + model, + optimizer, + file_keyname_mappings, + file_machine_mappings, + resume_from_checkpoint, + struct2static_name_mappings, + is_master_weights=False, + typename_set=typename_set, + ) + if has_master_weights: + send_table_mw, recv_table_mw = create_optimizer_dispatch_table( + args, + model, + optimizer, + file_keyname_mappings_mw, + file_machine_mappings_mw, + resume_from_checkpoint, + struct2static_name_mappings, + is_master_weights=True, + ) + + # Initialize optimizer state dict. + hcg = fleet.get_hybrid_communicate_group() + sharding_group = hcg.get_sharding_parallel_group() + if sharding_group.nranks > 1: + param2rank = optimizer._param2rank + optim_state_dict_mw = {} + + def check_optimizer_param(parameter): + if sharding_group.nranks > 1: + param_rank = param2rank.get(parameter.name, None) + if param_rank != sharding_group.rank: + return False + if parameter.stop_gradient: + return False + return True + + optimizer_keys_with_shape = [] + if isinstance(optimizer._parameter_list[0], dict): + for param_group in optimizer._parameter_list: + # If parameter groups are set, there must be `params` key. This is guaranteed by the optimizer's initialization code. + for parameter in param_group["params"]: + if check_optimizer_param(parameter): + optimizer_keys_with_shape.append((parameter.name, parameter.shape)) + else: + for parameter in optimizer._parameter_list: + if check_optimizer_param(parameter): + optimizer_keys_with_shape.append((parameter.name, parameter.shape)) + + # see how to change + for static_name, shape in optimizer_keys_with_shape: + k = static2struct_name_mappings[static_name] + for typename in typename_set: + new_k = k + "/" + typename + if typename in optimizer_scalar_name: + optim_state_dict[new_k] = paddle.empty([1], dtype="float32") + else: + optim_state_dict[new_k] = paddle.empty(shape, dtype="float32") + if has_master_weights: + optim_state_dict_mw[k] = paddle.empty(shape, dtype="float32") + + # Get all the keys that are splited by tensor parallelism. + all_tp_keys = set() + for k, v in recv_table.items(): + structure_name, typename = k.split("/") + if typename in optimizer_non_scaler_name: + if v[0][1] != -1: + all_tp_keys.add(structure_name) + + # Get corresponding tensor parallel actions. 
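+    # `tp_actions` maps model structure names to callables that split a saved full tensor into
+    # per-rank shards; clearing `tensor_parallel_rank` below keeps the actions rank-agnostic, and
+    # `distributed_send_recv` later picks the shard for each receiver via `weight[split_index]`.
+    # `mapping_optimizer_tp_actions` then re-keys these actions to the "name/typename" optimizer keys.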
+ config_revise = copy.deepcopy(model.config) + config_revise.tensor_parallel_rank = None + if len(all_tp_keys) == 0: + tp_actions = {} + else: + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + tp_actions = model._get_tensor_parallel_convert_actions( + set(all_tp_keys), is_split=True, ignore_error=True, config=config_revise + ) + else: + tp_actions = model.get_tensor_parallel_convert_actions(config_revise, all_tp_keys, ignore_error=True) + optimizer_keys = list(index["weight_map"].keys()) + optimizer_tp_actions = mapping_optimizer_tp_actions(tp_actions, optimizer_keys) + if has_master_weights: + optimizer_tp_actions.update(tp_actions) + + # Distribute the optimizer checkpoint dynamically, using the `send_table` and `recv_table` we create before. + optim_state_dict = distributed_send_recv( + config_revise, + optim_state_dict, + optimizer_tp_actions, + send_table, + recv_table, + resume_from_checkpoint, + file_keyname_mappings, + file_machine_mappings, + ) + dist.barrier() + if has_master_weights: + optim_state_dict_mw = distributed_send_recv( + config_revise, + optim_state_dict_mw, + optimizer_tp_actions, + send_table_mw, + recv_table_mw, + resume_from_checkpoint, + file_keyname_mappings_mw, + file_machine_mappings_mw, + ) + dist.barrier() + + # Rename optimizer state dict. + for key in list(optim_state_dict.keys()): + if key == "LR_Scheduler": + continue + key_name = key.split("/") + static_name = struct2static_name_mappings[key_name[0]] + if has_master_weights: + key_name = "_".join([static_name, FP32_MASTER, key_name[1]]) + else: + key_name = "_".join([static_name, key_name[1]]) + optim_state_dict[key_name] = optim_state_dict.pop(key) + optim_state_dict[key_name].name = key_name + + if has_master_weights: + optim_state_dict["master_weights"] = {} + for key in list(optim_state_dict_mw.keys()): + static_name = struct2static_name_mappings[key] + optim_state_dict["master_weights"][static_name] = optim_state_dict_mw.pop(key) + optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER]) + + if args.data_parallel_rank == 0: + return optim_state_dict + return None + + +def load_single_card_checkpoint(args, model, resume_from_checkpoint: str): + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + index_filename = SAFE_PEFT_WEIGHTS_INDEX_NAME + else: + index_filename = SAFE_WEIGHTS_INDEX_NAME + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path=resume_from_checkpoint, + index_filename=os.path.join(resume_from_checkpoint, index_filename), + ) + + loaded_keys = sharded_metadata["all_checkpoint_keys"] + model_state_dict = get_expected_state_dict(model) + expected_keys = set(list(model_state_dict.keys())) + missing_keys = expected_keys - set(loaded_keys) + + if len(missing_keys) > 0: + raise ValueError(f"Missing keys: {missing_keys}") + + state_dict = load_state_dict(resolved_archive_file[0], None, expected_keys) + error_msgs = _load_state_dict_into_model(model, state_dict, "") + del state_dict + gc.collect() + + if error_msgs: + raise RuntimeError(f"Error(s) in loading state dict for {model.__class__.__name__}:\n\t{error_msgs}") + + +def load_single_card_optimizer(args, model, optimizer, resume_from_checkpoint: str): + returned_optim_state_dict = nested_copy(optimizer.state_dict()) + + resolved_archive_file, sharded_metadata = get_optimizer_shard_files( + optimizer_path=resume_from_checkpoint, + index_filename=os.path.join(resume_from_checkpoint, 
SAFE_OPTIMIZER_INDEX_NAME), + ) + has_master_weights = True if sharded_metadata["master_weights"] else False + + model_state_dict = get_expected_state_dict(model) + struct2static_name_mappings = {k: v.name for k, v in model_state_dict.items()} + expected_keys = sharded_metadata["all_optimizer_keys"] + + if has_master_weights: + returned_optim_state_dict["master_weights"] = {} + resolved_archive_file_mw, sharded_metadata_mw = get_optimizer_shard_files( + optimizer_path=resume_from_checkpoint, + index_filename=os.path.join(resume_from_checkpoint, SAFE_MASTER_WEIGHTS_INDEX_NAME), + ) + expected_keys_mw = sharded_metadata_mw["all_optimizer_keys"] + + state_dict_optim = load_state_dict(resolved_archive_file[0], None, expected_keys) + if has_master_weights: + state_dict_optim_mw = load_state_dict(resolved_archive_file_mw[0], None, expected_keys_mw) + + for key in list(state_dict_optim.keys()): + key_name = key.split("/") + static_name = struct2static_name_mappings[key_name[0]] + if has_master_weights: + key_name = "_".join([static_name, FP32_MASTER, key_name[1]]) + else: + key_name = "_".join([static_name, key_name[1]]) + returned_optim_state_dict[key_name] = state_dict_optim.pop(key) + returned_optim_state_dict[key_name].name = key_name + if has_master_weights: + for key in list(state_dict_optim_mw.keys()): + static_name = struct2static_name_mappings[key] + returned_optim_state_dict["master_weights"][static_name] = state_dict_optim_mw.pop(key) + returned_optim_state_dict["master_weights"][static_name].name = "_".join([static_name, FP32_MASTER]) + + returned_optim_state_dict = nested_copy_place( + returned_optim_state_dict, + place=paddle.framework._current_expected_place(), + blocking=True, + ) + return returned_optim_state_dict + + +def get_file_mappings(index, resume_from_checkpoint): + file_keyname_mappings = {} + for k, v in index["weight_map"].items(): + if v not in file_keyname_mappings: + file_keyname_mappings[v] = [] + file_keyname_mappings[v].append(k) + for k in file_keyname_mappings.keys(): + file_keyname_mappings[k] = sorted(file_keyname_mappings[k]) + + local_device_count = int(os.getenv("PADDLE_LOCAL_SIZE")) + local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + global_rank = dist.get_rank() + file_machine_mappings = {} + for filename in file_keyname_mappings.keys(): + if local_rank == 0 and os.path.exists(os.path.join(resume_from_checkpoint, filename)): + file_machine_mappings[filename] = [global_rank // local_device_count] + file_machine_list = [] + dist.all_gather_object(file_machine_list, file_machine_mappings) + file_machine_mappings = {} + for mappings in file_machine_list: + for k, v in mappings.items(): + if k not in file_machine_mappings: + file_machine_mappings[k] = v + else: + file_machine_mappings[k] += v + return file_keyname_mappings, file_machine_mappings + + +def create_send_table(file_keyname_mappings, file_machine_mappings): + send_table = {} + global_rank = dist.get_rank() + local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + local_device_count = int(os.getenv("PADDLE_LOCAL_SIZE")) + for filename, keys in file_keyname_mappings.items(): + machine = file_machine_mappings[filename][0] + is_src = (global_rank // local_device_count) == machine + for i, key in enumerate(keys): + if is_src and local_rank == i % local_device_count: + send_table[key] = global_rank + dispatch_list = [] + dist.all_gather_object(dispatch_list, send_table) + send_table = {} + for dl in dispatch_list: + send_table.update(dl) + return send_table + + +def distributed_send_recv( + config, 
+ state_dict, + tp_actions, + send_table, + recv_table, + resume_from_checkpoint, + file_keyname_mappings, + file_machine_mappings, +): + + local_device_count = int(os.getenv("PADDLE_LOCAL_SIZE")) + global_rank = dist.get_rank() + for filename in file_keyname_mappings.keys(): + machine = file_machine_mappings[filename][0] + is_src = global_rank // local_device_count == machine + if is_src: + f = safe_open(os.path.join(resume_from_checkpoint, filename), framework="np") + + for key in file_keyname_mappings[filename]: + recv_info = recv_table[key] + recv_ranklist = [a for (a, b) in recv_info] + if is_src and global_rank == send_table[key]: + py_safe_slice_ = f.get_slice(key) + # send + if key in tp_actions: + weight = tp_actions[key](py_safe_slice_) + # copy weight to GPU + for j in range(len(weight)): + with device_guard(): + weight[j] = paddle.Tensor(weight[j], zero_copy=True) + weight[j] = weight[j]._copy_to(paddle.framework._current_expected_place(), False) + + for recv_rank, split_index in recv_info: + if recv_rank == global_rank: + state_dict[key] = weight[split_index] + else: + dist.stream.send(weight[split_index], dst=recv_rank) + else: + # no need to tp split + weight = py_safe_slice_[:] + with device_guard(): + weight = paddle.Tensor(weight, zero_copy=True) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) + for recv_rank, _ in recv_info: + if recv_rank == global_rank: + state_dict[key] = weight + else: + dist.stream.send(weight, dst=recv_rank) + + if global_rank != send_table[key] and global_rank in recv_ranklist: + dist.stream.recv(state_dict[key], src=send_table[key]) + + if is_src: + f.__exit__(None, None, None) + + return state_dict + + +def get_sharded_file_name(args, file_name, is_optimizer=False): + if not is_optimizer: + shard_file = file_name.replace( + ".pdparams", + f"-{args.logical_process_index + 1:05d}-of-{args.world_size//args.dataset_world_size:05d}.pdparams", + ) + shard_file = shard_file.replace( + ".safetensors", + f"-{args.logical_process_index + 1:05d}-of-{args.world_size//args.dataset_world_size:05d}.safetensors", + ) + else: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + shard_file = file_name.replace( + ".pdparams", f"-{args.logical_process_index + 1:05d}-of-{args.world_size//dp_group.nranks:05d}.pdparams" + ) + shard_file = shard_file.replace( + ".safetensors", + f"-{args.logical_process_index + 1:05d}-of-{args.world_size//dp_group.nranks:05d}.safetensors", + ) + shard_file = shard_file.replace( + ".pdopt", f"-{args.logical_process_index + 1:05d}-of-{args.world_size//dp_group.nranks:05d}.pdopt" + ) + return shard_file + + +def get_sharded_index( + index_file_list, + total_size_list, +): + # save index json file + local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + if local_rank == 0: + sharded_index_json = {} + + sharded_index_json["metadata"] = {"total_size": sum(total_size_list)} + + weight_map = {} + for i, index_file in enumerate(index_file_list): + weight_map.update(index_file_list[i]) + + sharded_index_json["weight_map"] = weight_map + return sharded_index_json + + return None + + +def reduce_master_weights_status(has_master_weights=False): + data = paddle.to_tensor([has_master_weights], dtype="int32") + + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + pp_group = hcg.get_pipe_parallel_group() + sharding_group = hcg.get_sharding_parallel_group() + + if tp_group.nranks > 1: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=tp_group) + 
if pp_group.nranks > 1: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pp_group) + if sharding_group.nranks > 1: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=sharding_group) + + return data.item() > 0 + + +def gather_sharded_object(index_file, total_size, is_optimizer=False): + + index_file_list, total_size_list = [], [] + + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + pp_group = hcg.get_pipe_parallel_group() + + logger.info( + f"Unified checkpoint: generating sharded_index json files for {'optimizer or master weight' if is_optimizer else 'model weight'}." + ) + + if tp_group.nranks > 1: + dist.all_gather_object(index_file_list, index_file, tp_group) + dist.all_gather_object(total_size_list, total_size, tp_group) + if pp_group.nranks > 1: + pp_index_file_list = [] + pp_total_size_list = [] + dist.all_gather_object( + pp_index_file_list, index_file_list if len(index_file_list) > 0 else index_file, pp_group + ) + dist.all_gather_object( + pp_total_size_list, total_size_list if len(total_size_list) > 0 else total_size, pp_group + ) + index_file_list = pp_index_file_list + total_size_list = pp_total_size_list + + index_file_list = flatten_list(index_file_list) + total_size_list = flatten_list(total_size_list) + + # for pure sharding + if len(index_file_list) == 0 and len(total_size_list) == 0: + index_file_list = [index_file] + total_size_list = [total_size] + if is_optimizer: + sharding_group = hcg.get_sharding_parallel_group() + if sharding_group.nranks > 1: + sharding_index_file_list = [] + sharding_total_size_list = [] + dist.all_gather_object(sharding_index_file_list, index_file_list, sharding_group) + dist.all_gather_object(sharding_total_size_list, total_size_list, sharding_group) + index_file_list = flatten_list(sharding_index_file_list) + total_size_list = flatten_list(sharding_total_size_list) + + return index_file_list, total_size_list + + +def generate_base_static_name(vname): + # return base static name and specific type name, like [embedding_0.w_0, moment1_0] + if FP32_MASTER in vname: + vname = vname.split("_" + FP32_MASTER + "_") + return vname[0], vname[1] + else: + vname = vname.split(".") + a = vname[0] + "." + vname[1][:3] + b = vname[1][4:] + return a, b + + +def filter_params(model_to_save, state_dict, is_optimizer=False): + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + + tp_size = tp_group.nranks + tp_rank = tp_group.rank + + # for pure sharding or pure pp + if tp_size <= 1: + return [list(state_dict.keys())] + + filter_tensor_list = [[] for i in range(tp_size)] + + if tp_rank == 0: + tensor_bytes_dict = {} + model_state_dict = get_expected_state_dict(model_to_save) + for (k, v) in state_dict.items(): + model_v = model_state_dict[k.split("/")[0]] if is_optimizer else v + if hasattr(model_v, "is_distributed") and model_v.is_distributed: + tensor_bytes_dict[k] = v.numel().item() * tp_size * dtype_byte_size(v.dtype) + else: + tensor_bytes_dict[k] = v.numel().item() * dtype_byte_size(v.dtype) + + filter_tensor_list = [] + current_block = [] + current_block_size = 0 + total_size = 0 + + max_shard_size = (sum(tensor_bytes_dict.values()) + tp_size - 1) // tp_size + + for index, (key, weight_size) in enumerate(tensor_bytes_dict.items()): + # If this weight is going to tip up over the maximal size, we split. 
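+            # Greedy bucketing: the current block is closed once the running total would pass the
+            # next multiple of max_shard_size, or when only as many keys remain as there are
+            # unfilled tp ranks, so every tensor parallel rank ends up saving a roughly equal
+            # number of bytes.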
+ # if current_block_size + weight_size > max_shard_size: + if total_size + weight_size > max_shard_size * (len(filter_tensor_list) + 1) or ( + len(tensor_bytes_dict) - index < (tp_size - len(filter_tensor_list)) + ): + # fix if the first param is large than max_shard_size + if len(current_block) > 0: + filter_tensor_list.append(current_block) + current_block = [] + current_block_size = 0 + + current_block.append(key) + current_block_size += weight_size + total_size += weight_size + + filter_tensor_list.append(current_block) + if len(filter_tensor_list) < tp_size: + filter_tensor_list.extend([[] for i in range(tp_size - len(filter_tensor_list))]) + + dist.broadcast_object_list( + filter_tensor_list, + src=hcg.get_model_parallel_group_src_rank(), + group=tp_group, + ) + + return filter_tensor_list + + +def merge_large_tensor_parallel(tensor, tp_group, tp_action, dst_rank, is_dst): + num_rows = tensor.shape[0] + num_splits = 4 + parts = np.array_split(np.arange(num_rows), num_splits) + splits = [len(part) for part in parts] + split_parts = np.insert(np.cumsum(splits), 0, 0) + split_tensors = [] + for i in range(num_splits): + if get_env_device() == "xpu": + ret = distributed_allgather(tensor[split_parts[i] : split_parts[i + 1], :], group=tp_group, offload=False) + else: + ret = distributed_gather( + tensor[split_parts[i] : split_parts[i + 1], :], dst=dst_rank, group=tp_group, offload=False + ) + # Copy to CPUPlace temporarily, may lower speed. + if ret is not None: + ret = [t.cpu() for t in ret] + split_tensors.append(ret) + concat_tensors = [] + if is_dst: + for i in range(tp_group.nranks): + tmp = [] + for j in range(num_splits): + tmp.append(split_tensors[j][i]) + concat_tensors.append(paddle.concat(tmp)) + tensor = tp_action(concat_tensors) + else: + tensor = None + return tensor + + +def merge_tensor_parallel_with_shard(state_dict, tp_actions, all_filter_keys): + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + tp_rank = tp_group.rank + + # filter actions for pipeline mode + if hcg.get_pipe_parallel_group().nranks > 1: + filter_keys = set([y for x in all_filter_keys for y in x]) + for key in list(tp_actions.keys()): + if key not in filter_keys: + tp_actions.pop(key) + + state_dict_to_save = {} + max_key_len = max([len(_) for _ in all_filter_keys]) + for i in range(max_key_len): + for j, filter_keys in enumerate(all_filter_keys): + is_dst = tp_rank == j + if i > len(filter_keys) - 1: + continue + key = filter_keys[i] + tensor = state_dict[key] + if key in tp_actions: + # Get tensor size + tensor_bytes = tensor.numel().item() * dtype_byte_size(tensor.dtype) * tp_group.nranks + if tensor_bytes >= 5 * 1024 * 1024 * 1024: # temporarily set 5GB as threshold + tensor = merge_large_tensor_parallel(tensor, tp_group, tp_actions[key], j, is_dst) + else: + if get_env_device() == "xpu": + ret = distributed_allgather(tensor, group=tp_group, offload=False) + else: + ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False) + action = tp_actions.pop(key) + tensor = action(ret) if is_dst else None + else: + if is_dst: + tensor = tensor._copy_to(DEST_PLACE, False) if tensor.place.is_cpu_place() else tensor + else: + tensor = None + + if is_dst: + state_dict_to_save[key] = tensor + + if len(tp_actions) > 0: + for x in tp_actions.keys(): + logger.warning(f"key <{x}> need to merge tensor parallel but we can't find in model state.") + + return state_dict_to_save + + +def merge_tensor_parallel_for_optimizer(state_dict, tp_actions, all_filter_keys): + # Core 
function for UC + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + tp_rank = tp_group.rank + + state_dict_to_save = {} + max_key_len = max([len(_) for _ in all_filter_keys]) + for i in range(max_key_len): + for j, filter_keys in enumerate(all_filter_keys): + is_dst = tp_rank == j + if i > len(filter_keys) - 1: + continue + # get base model key + model_key = filter_keys[i].split("/")[0] + tensor = state_dict[filter_keys[i]] + if model_key in tp_actions: + # for example: beta1, beta2 + if tensor.numel().item() == 1: + if is_dst: + tensor = tensor._copy_to(DEST_PLACE, False) if not tensor.place.is_cpu_place() else tensor + else: + tensor = None + else: + # Get tensor size + tensor_bytes = tensor.numel().item() * dtype_byte_size(tensor.dtype) * tp_group.nranks + if tensor_bytes >= 5 * 1024 * 1024 * 1024: # temporarily set 5GB as threshold + tensor = merge_large_tensor_parallel(tensor, tp_group, tp_actions[model_key], j, is_dst) + else: + if get_env_device() == "xpu": + ret = distributed_allgather(tensor, group=tp_group, offload=False) + else: + ret = distributed_gather(tensor, dst=j, group=tp_group, offload=False) + action = tp_actions[model_key] + tensor = action(ret) if is_dst else None + else: + if is_dst: + tensor = tensor._copy_to(DEST_PLACE, False) if not tensor.place.is_cpu_place() else tensor + else: + tensor = None + + if is_dst: + state_dict_to_save[filter_keys[i]] = tensor + + return state_dict_to_save + + +def get_optimizer_shard_files(optimizer_path, index_filename): + """ + For a given model: + - download and cache all the shards of a sharded checkpoint if `pretrained_model_name_or_path` is a model ID on the + Hub + - returns the list of paths to all the shards, as well as some metadata. + For the description of each arg, see [`PretrainedModel.from_pretrained`]. `index_filename` is the full path to the + index (downloaded and cached if `pretrained_model_name_or_path` is a model ID on the Hub). + """ + + import json + + if not os.path.isfile(index_filename): + raise ValueError(f"Can't find a optimizer index ({index_filename}) in {optimizer_path}.") + + with open(index_filename, "r") as f: + index = json.loads(f.read()) + + shard_filenames = sorted(set(index["weight_map"].values())) + sharded_metadata = index["metadata"] + sharded_metadata["all_optimizer_keys"] = list(index["weight_map"].keys()) + sharded_metadata["weight_map"] = index["weight_map"].copy() + sharded_metadata["master_weights"] = index.get("master_weights", False) + + file_map = {file: set() for file in shard_filenames} + for weight, file in index["weight_map"].items(): + file_map[file].add(weight) + + sharded_metadata["file_map"] = file_map + + # First, let's deal with local folder. + # TODO: if optimizer_path is a folder, we should check if the optimizer is already cached or not. 
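+    # For a local checkpoint directory this simply resolves every shard filename to a full path
+    # and returns it together with `sharded_metadata` (key list, weight_map copy, master_weights
+    # flag and file_map built above).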
+ if os.path.isdir(optimizer_path): + shard_filenames = [os.path.join(optimizer_path, f) for f in shard_filenames] + return shard_filenames, sharded_metadata + + +def get_expected_keys(sharded_metadata, model, optimizer): + hcg = fleet.get_hybrid_communicate_group() + sharding_group = hcg.get_sharding_parallel_group() + sharding_rank = sharding_group.rank + in_sharding_parallel_model = sharding_group.nranks > 1 + if in_sharding_parallel_model: + params2rank = optimizer._param2rank + + struct2static_name_mappings = {k: v.name for k, v in get_expected_state_dict(model).items()} + + expected_keys = [] + for key in list(sharded_metadata["all_optimizer_keys"]): + key_name = key.split("/")[0] + static_name = struct2static_name_mappings.get(key_name, None) + + if in_sharding_parallel_model: + params_rank = params2rank.get(static_name, None) + if params_rank == sharding_rank: + expected_keys.append(key) + else: + if static_name is not None: + expected_keys.append(key) + expected_keys = set(expected_keys) + + loaded_keys = sharded_metadata["all_optimizer_keys"] + missing_keys = expected_keys - set(loaded_keys) + if len(missing_keys) > 0: + raise ValueError(f"optimizer missing weights keys: {missing_keys}") + + return expected_keys + + +def mapping_optimizer_tp_actions(tp_actions, optimizer_loaded_keys): + """# convert param.name to + param.key/moment1_0 + or param.key/beta1_XXX + or param.key/beta2_XXX + Args: + tp_actions (dict): dictionay of tensor parallel actions {key: action} + optimizer_loaded_keys (list or set): [param.key1/moment1_0, param.key2/beta1_XXX, param.key3/beta2_XXX] + Returns: + dict: new dictionay of tensor parallel actions {key: action} + """ + new_actions = {} + for key in optimizer_loaded_keys: + key_base, typename = key.split("/") + if typename in optimizer_non_scaler_name and key_base in tp_actions: + new_actions[key] = tp_actions[key_base] + return new_actions + + +def flatten_list(nested_list): + flattened_list = [] + for item in nested_list: + if isinstance(item, list): + flattened_list.extend(flatten_list(item)) + else: + flattened_list.append(item) + return flattened_list + + +def select_model_weight_index(args, model, resume_from_checkpoint, safe_serialization, local=True): + """ + try select model weight index from model weight or master weight index. 
+ """ + + # find model weight index file + if isinstance(model, LoRAModel) or isinstance(model, PrefixModelForCausalLM): + index_filename = SAFE_PEFT_WEIGHTS_INDEX_NAME if safe_serialization else PADDLE_PEFT_WEIGHTS_INDEX_NAME + else: + index_filename = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else PADDLE_WEIGHTS_INDEX_NAME + + index_filename_path = os.path.join(resume_from_checkpoint, index_filename) + identify_func = os.path.isfile if local else distributed_isfile + + if identify_func(index_filename_path): + return index_filename + else: + index_filename = PADDLE_MASTER_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_MASTER_WEIGHTS_INDEX_NAME + index_filename_path = os.path.join(resume_from_checkpoint, index_filename) + + if identify_func(index_filename_path): + return index_filename + else: + raise ValueError("Can't find a valid unified model or master weight checkpoint to load.") + + +def update_master_weight_status(args, optimizer, has_master_weight, safe_serialization): + if is_need_master_weight(optimizer, is_fp16_or_bp16=(args.fp16 or args.bf16)): + if not has_master_weight: + if UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value in args.unified_checkpoint_config: + index_filename_master_weights = ( + PADDLE_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME + ) + has_master_weight = True + logger.warning( + "The unified checkpoint does not contain master weight, " + "the model weight will be loaded as master weight." + ) + else: + raise ValueError( + "Can't find a valid unified master weight checkpoint," + f"add '{UnifiedCheckpointOption.MASTER_WEIGHT_COMPATIBLE.value}' into 'unified_checkpoint_config' to " + "load model checkpoint as master weight" + ) + else: + has_master_weight = True + index_filename_master_weights = ( + PADDLE_MASTER_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_MASTER_WEIGHTS_INDEX_NAME + ) + if UnifiedCheckpointOption.SKIP_SAVE_MODEL_WEIGHT.value in args.unified_checkpoint_config: + index_filename_master_weights = ( + PADDLE_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME + ) + else: + has_master_weight = False + index_filename_master_weights = None + + return has_master_weight, index_filename_master_weights + + +def unwrap_optimizer(optimizer): + while hasattr(optimizer, "_inner_opt") or hasattr(optimizer, "_optim"): + if hasattr(optimizer, "_inner_opt"): + optimizer = optimizer._inner_opt + if hasattr(optimizer, "_optim"): + optimizer = optimizer._optim + + return optimizer + + +def is_need_master_weight(optimizer, is_fp16_or_bp16): + optimizer = unwrap_optimizer(optimizer) + if hasattr(optimizer, "_multi_precision"): + return optimizer._multi_precision and is_fp16_or_bp16 + else: + return False diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer.py new file mode 100644 index 000000000..b77c45b14 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer.py @@ -0,0 +1,3287 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py + +import collections +import contextlib +import inspect +import math +import os +import random +import re +import shutil +import sys +import time +import types +import warnings +from collections import OrderedDict +from collections.abc import Mapping +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.amp.auto_cast as autocast +import paddle.distributed as dist +import paddle.nn as nn +from packaging import version +from paddle import framework + +try: + from paddle.base import core +except: + core = None +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( + HybridParallelOptimizer, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import ( + GroupShardedOptimizerStage2, +) + +try: + from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + obtain_optimizer_parameters_list, + ) + + _obtain_optimizer_parameters_list = obtain_optimizer_parameters_list +except: + try: + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.hybrid_parallel_optimizer import ( + _obtain_optimizer_parameters_list, + ) + except: + _obtain_optimizer_parameters_list = None + +from paddle.distributed.fleet.utils.hybrid_parallel_util import ( + fused_allreduce_gradients, +) +from paddle.io import DataLoader, Dataset, DistributedBatchSampler +from tqdm.auto import tqdm + +from ..data import ( + DataCollator, + DataCollatorWithPadding, + DistDataLoader, + default_data_collator, +) +from ..peft import LoRAModel, PrefixModelForCausalLM, VeRAModel + +try: + from ..quantization.quantization_linear import QuantizationLinear +except: + QuantizationLinear = None +from ..transformers.context_parallel_utils import split_inputs_sequence_dim_load_balance +from ..transformers.model_utils import ( + PretrainedModel, + _add_variant, + load_sharded_checkpoint, + unwrap_model, +) +from ..transformers.segment_parallel_utils import split_inputs_sequence_dim +from ..transformers.tokenizer_utils import PretrainedTokenizer +from ..utils.batch_sampler import DistributedBatchSampler as NlpDistributedBatchSampler +from ..utils.env import ( + LORA_WEIGHTS_NAME, + PADDLE_MASTER_WEIGHTS_INDEX_NAME, + PADDLE_PEFT_WEIGHTS_INDEX_NAME, + PADDLE_WEIGHTS_INDEX_NAME, + PADDLE_WEIGHTS_NAME, + PREFIX_WEIGHTS_NAME, + SAFE_MASTER_WEIGHTS_INDEX_NAME, + SAFE_PEFT_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_INDEX_NAME, + VERA_WEIGHTS_NAME, +) +from ..utils.import_utils import is_datasets_available, is_paddle_cuda_available +from ..utils.log import logger +from .argparser import strtobool +from .integrations import get_reporting_integration_callbacks +from .plugins.timer import RuntimeTimer, get_timers, set_timers +from .plugins.unified_checkpoint import UnifiedCheckpointHandler +from .trainer_callback import ( + CallbackHandler, + DefaultFlowCallback, + PrinterCallback, + ProgressCallback, + TrainerCallback, + 
TrainerControl, + TrainerState, +) +from .trainer_utils import ( # set_hyrbid_parallel_seed, + PREFIX_CHECKPOINT_DIR, + EvalLoopOutput, + EvalPrediction, + IterableDatasetShard, + OptimizerNames, + PredictionOutput, + RemoveColumnsCollator, + ShardingOption, + TrainerMemoryTracker, + TrainOutput, + find_batch_size, + get_last_checkpoint, + get_scheduler, + has_length, + set_seed, + speed_metrics, +) +from .training_args import TrainingArguments +from .utils import reshard as reshard_util +from .utils.async_save import AsyncSaver +from .utils.helper import ( # nested_truncate, + broadcast_dataset_rank0_model, + broadcast_dp_optimizer, + broadcast_moe_optimizer, + distributed_concat, + distributed_file, + distributed_isfile, + nested_concat, + nested_detach, + nested_numpify, + nested_truncate, +) +from .utils.sharding_io import ShardingIO + +DEFAULT_CALLBACKS = [DefaultFlowCallback] +DEFAULT_PROGRESS_CALLBACK = ProgressCallback + +# Name of the files used for checkpointing +TRAINING_ARGS_NAME = "training_args.bin" +TRAINER_STATE_NAME = "trainer_state.json" + +OPTIMIZER_NAME = "optimizer.pdopt" +SCHEDULER_NAME = "scheduler.pdparams" +SCALER_NAME = "scaler.pdparams" + + +if is_datasets_available(): + import datasets + + +try: + from paddle.distributed.fleet.utils import mix_precision_utils +except: + mix_precision_utils = None + +try: + from paddle.io.dataloader.dataloader_iter import _DataLoaderIterBase +except: + from paddle.fluid.dataloader.dataloader_iter import _DataLoaderIterBase + + +__all__ = ["Trainer"] + + +class Trainer: + """ + Trainer is a simple but feature-complete training and eval loop for PaddlePaddle, optimized for PaddleNLP. + + Args: + model ([`PretrainedModel`] or `paddle.nn.Layer`, *optional*): + The model to train, evaluate or use for predictions. + + [`Trainer`] is optimized to work with the [`PretrainedModel`] provided by the library. You can still use + your own models defined as `paddle.nn.Layer` as long as they work the same way as the PaddleNLP + models. + criterion(`paddle.nn.Layer`, *optional*): + The model may only output the loggit, if you want do more computation for the output of model, you can + add the criterion Layer. + args ([`TrainingArguments`], *optional*): + The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the + `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided. + data_collator (`DataCollator`, *optional*): + The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will + default to [`default_data_collator`] if no `tokenizer` is provided, an instance of + [`DataCollatorWithPadding`] otherwise. + train_dataset (`paddle.io.Dataset` or `paddle.io.IterableDataset`, *optional*): + The dataset to use for training. If it is an `datasets.Dataset`, columns not accepted by the + `model.forward()` method are automatically removed. + eval_dataset (Union[`paddle.io.Dataset`, Dict[str, `paddle.io.Dataset`]], *optional*): + The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each + dataset prepending the dictionary key to the metric name. + tokenizer ([`PretrainedTokenizer`], *optional*): + The tokenizer used to preprocess the data. 
If provided, will be used to automatically pad the inputs the + maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an + interrupted training or reuse the fine-tuned model. + compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): + The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return + a dictionary string to metric values. + callbacks (List of [`TrainerCallback`], *optional*): + A list of callbacks to customize the training loop. Will add those to the list of default callbacks. + If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method. + optimizers (`Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler]`, *optional*): A tuple + containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model + and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + preprocess_logits_for_metrics (`Callable[[paddle.Tensor, paddle.Tensor], paddle.Tensor]`, *optional*): + A function that preprocess the logits right before caching them at each evaluation step. Must take two + tensors, the logits and the labels, and return the logits once processed as desired. The modifications made + by this function will be reflected in the predictions received by `compute_metrics`. + + Important attributes: + + - **model** -- Always points to the core model. If using a transformers model, it will be a [`PretrainedModel`] + subclass. + - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the + original model. This is the model that should be used for the forward pass. For example, the inner model is + wrapped in `paddle.DataParallel`. If model hasn't been wrapped, then `self.model_wrapped` is the same + as `self.model`. 
+ + """ + + from .trainer_utils import log_metrics, metrics_format, save_metrics, save_state + + def __init__( + self, + model: Union[PretrainedModel, nn.Layer] = None, + criterion: nn.Layer = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Dataset] = None, + eval_dataset: Union[Dataset, Dict[str, Dataset]] = None, + tokenizer: Optional[PretrainedTokenizer] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[paddle.optimizer.Optimizer, paddle.optimizer.lr.LRScheduler] = (None, None), + preprocess_logits_for_metrics: Callable[[paddle.Tensor, paddle.Tensor], paddle.Tensor] = None, + ): + + if args is None: + output_dir = "tmp_trainer" + logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.") + args = TrainingArguments(output_dir=output_dir) + + self.args = args + self.is_in_train = False + # self.do_grad_scaling = args.fp16 + + # memory metrics - must set up as early as possible + self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics) + self._memory_tracker.start() + + # Seed must be set before instantiating the model when using model + set_seed(seed=self.args.seed) + + if model is None: + raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument") + + if self.args.to_static: + model = paddle.jit.to_static(model) + logger.info("Successfully to apply @to_static to the whole model.") + + if self.args.should_save or self.args.should_save_model_state: + os.makedirs(self.args.output_dir, exist_ok=True) + + self.sharding = None + if len(args.sharding) > 0: + if args.local_rank == -1: + raise ValueError("Using sharding only works in distributed training.") + self.sharding = True + + # init parallel env + if paddle.distributed.get_world_size() > 1: + if self.args.use_hybrid_parallel: + self.hcg = fleet.get_hybrid_communicate_group() + self.dp_group = self.hcg.get_data_parallel_group() + self.sharding_group = self.hcg.get_sharding_parallel_group() + + default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer) + + self.data_collator = data_collator if data_collator is not None else default_collator + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.tokenizer = tokenizer + if not args.skip_profile_timer: + set_timers() + self.timers = get_timers() + self.runtime_timer = RuntimeTimer("RuntimeTimer") + + self.model_wrapped = model + self.model = model + self.criterion = criterion + + self.compute_metrics = compute_metrics + self.preprocess_logits_for_metrics = preprocess_logits_for_metrics + self.optimizer, self.lr_scheduler = optimizers + # Label smoothing + # if self.args.label_smoothing_factor != 0: + # self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) + # else: + self.label_smoother = None + self.state = TrainerState() + self.control = TrainerControl() + self._signature_columns = None + self.optimizer_grouped_parameters = None + self.sharding_io = None + if self.args.should_save_sharding_stage1_model or self.args.should_load_sharding_stage1_model: + self.sharding_io = ShardingIO(self.args, self.model, self.optimizer) + if self.args.unified_checkpoint: + self.unified_checkpoint_handler = UnifiedCheckpointHandler(self.args) + + if self.sharding is not None and self.optimizer is not None: + raise RuntimeError( + "Passing `optimizers` is not allowed if sharding is enabled." 
+ "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method." + ) + + if self.args.pipeline_parallel_degree > 1 and self.args.use_hybrid_parallel: + from paddle.distributed.fleet.meta_parallel import PipelineLayer + + assert (isinstance(model, LoRAModel) and isinstance(model.model, PipelineLayer)) or isinstance( + model, PipelineLayer + ), "Only support pipeline parallel mode when model is PipelineLayer!!!" + + default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) + callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks + self.callback_handler = CallbackHandler( + callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler + ) + self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) + + self._save_ckpt_func = dist.save_state_dict if self.args.enable_auto_parallel else paddle.save + self._load_ckpt_func = dist.load_state_dict if self.args.enable_auto_parallel else paddle.load + if self.args.use_async_save: + self._async_optimizer_saver = AsyncSaver() + + if args.max_steps > 0: + logger.info("max_steps is given, it will override any value given in num_train_epochs") + + if train_dataset is not None and not isinstance(train_dataset, collections.abc.Sized) and args.max_steps <= 0: + raise ValueError("train_dataset does not implement __len__, max_steps has to be specified") + + if ( + isinstance(self.model, LoRAModel) + or isinstance(self.model, PrefixModelForCausalLM) + or isinstance(self.model, VeRAModel) + ): + if self.args.unified_checkpoint and "skip_save_model_weight" in self.args.unified_checkpoint_config: + self.args.unified_checkpoint_config.remove("skip_save_model_weight") + logger.warning( + "We do not support skip_save_model_weight in peft model when using unified checkpoint, remove this config." + ) + + self.do_grad_scaling = False + self.enable_autocast_context_manager = False + if args.fp16 or args.bf16: + # set do_grad_scaling, enable_autocast_context_manager + self._wrap_amp_model(args, model) + + if args.recompute: + + def fn(layer): + if hasattr(layer, "enable_recompute") and ( + layer.enable_recompute is False or layer.enable_recompute == 0 + ): + layer.enable_recompute = True + + model.apply(fn) + + default_label_names = ( + ["start_positions", "end_positions"] + if "QusetionAnswering" in type(self.model).__name__ or "UIE" in type(self.model).__name__ + else ["labels"] + ) + self.label_names = default_label_names if self.args.label_names is None else self.args.label_names + + self.control = self.callback_handler.on_init_end(self.args, self.state, self.control) + self.print_config() + + # very last + self._memory_tracker.stop_and_update_metrics() + + def _wrap_amp_model(self, args, model): + logger.info("Using half precision") + self.enable_autocast_context_manager = True + self.do_grad_scaling = True if args.fp16 else False + self.amp_dtype = "float16" if args.fp16 else "bfloat16" + # fix for load saved fp16 or bf16 ckpt, decorate model first. 
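+        # In O2 mode paddle.amp.decorate casts the model parameters to amp_dtype up front
+        # (skipping excluded_layers), so an fp16/bf16 checkpoint saved earlier can be loaded
+        # into parameters of the matching dtype.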
+ if self.args.fp16_opt_level == "O2": + paddle.amp.decorate( + models=model, + level=self.args.fp16_opt_level, + dtype=self.amp_dtype, + excluded_layers=[QuantizationLinear] + self._decorate_exclude_layers(model), + ) + # for pipeline mode and pure tensor parallel + if self.args.pipeline_parallel_degree > 1 or (self.args.tensor_parallel_degree > 1 and self.sharding is None): + self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss) + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionScaler(self.scaler) # retun value has no use + self.scaler = fleet.distributed_scaler(self.scaler) + elif self.sharding is not None: + self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss) + if self.amp_dtype == "float16" or self.amp_dtype == "bfloat16": + if ShardingOption.SHARD_OP in self.args.sharding: + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionScaler(self.scaler) # retun value has no use + self.scaler = fleet.distributed_scaler(self.scaler) + else: + # scaler for stage2 and stage3 + from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( + GroupShardedScaler, + ) + + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionScaler(self.scaler) # return value has no use + + self.scaler = GroupShardedScaler(self.scaler) + else: + self.do_grad_scaling = False + self.use_cuda_amp = False + self.amp_dtype = None + + else: + self.scaler = paddle.amp.GradScaler(init_loss_scaling=self.args.scale_loss) + + def add_callback(self, callback): + """ + Add a callback to the current list of [`~TrainerCallback`]. + + Args: + callback (`type` or [`~TrainerCallback`]): + A [`~TrainerCallback`] class or an instance of a [`~TrainerCallback`]. In the + first case, will instantiate a member of that class. + """ + self.callback_handler.add_callback(callback) + + def pop_callback(self, callback): + """ + Remove a callback from the current list of [`~TrainerCallback`] and returns it. + If the callback is not found, returns `None` (and no error is raised). + Args: + callback (`type` or [`~TrainerCallback`]): + A [`~TrainerCallback`] class or an instance of a [`~TrainerCallback`]. In the + first case, will pop the first member of that class found in the list of callbacks. + Returns: + [`~TrainerCallback`]: The callback removed, if found. + """ + return self.callback_handler.pop_callback(callback) + + def remove_callback(self, callback): + """ + Remove a callback from the current list of [`~TrainerCallback`]. + Args: + callback (`type` or [`~TrainerCallback`]): + A [`~TrainerCallback`] class or an instance of a [`~TrainerCallback`]. In the + first case, will remove the first member of that class found in the list of callbacks. + """ + self.callback_handler.remove_callback(callback) + + def _load_from_peft_checkpoint(self, resume_from_checkpoint=None): + """load state_dict from checkpoint, Only for PEFT Model. + + Args: + resume_from_checkpoint (`str` or `bool`, *optional*): + If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a + `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance + of [`Trainer`]. Only load model state dict. 
+ """ + + if resume_from_checkpoint is not None: + convert_tp = False + if isinstance(self.model, LoRAModel): + if self.model.quantized or self.args.pipeline_parallel_degree > 1: + weights_file = os.path.join( + resume_from_checkpoint, _add_variant(LORA_WEIGHTS_NAME, self.args.weight_name_suffix) + ) + else: + weights_file = os.path.join(resume_from_checkpoint, LORA_WEIGHTS_NAME) + if self.model.lora_config.tensor_parallel_degree > 1: + convert_tp = True + elif isinstance(self.model, PrefixModelForCausalLM): + weights_file = os.path.join(resume_from_checkpoint, PREFIX_WEIGHTS_NAME) + if self.model.prefix_config.tensor_parallel_degree > 1: + convert_tp = True + elif isinstance(self.model, VeRAModel): + weights_file = os.path.join(resume_from_checkpoint, VERA_WEIGHTS_NAME) + if self.args.dataset_rank == 0: + logger.info(f"Loading model from {resume_from_checkpoint} .") + + if os.path.isfile(weights_file): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load(weights_file, return_numpy=True) + if convert_tp: + state_dict = self.model._convert_tensor_parallel(state_dict) + + # If the model is on the GPU, it still works! + self._set_state_dict_in_model(state_dict) + # release memory + del state_dict + elif resume_from_checkpoint is not None: + logger.info(f"not loading ckpt :{self.args.dataset_rank}") + + def _load_from_checkpoint(self, resume_from_checkpoint=None): + """load state_dict from_checkpoint, Only load model state dict. + + Args: + resume_from_checkpoint (`str` or `bool`, *optional*): + If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a + `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance + of [`Trainer`]. Only load model state dict. + """ + self.runtime_timer.start("checkpoint loading time") + resume_from_checkpoint = None if not resume_from_checkpoint else resume_from_checkpoint + + # Load potential model checkpoint + if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: + uc_async_save = self.args.unified_checkpoint and "async_save" in self.args.unified_checkpoint_config + resume_from_checkpoint = get_last_checkpoint(self.args.output_dir, uc_async_save) + if resume_from_checkpoint is None: + raise ValueError(f"No valid checkpoint found in output directory ({self.args.output_dir})") + + if self.args.unified_checkpoint: + if resume_from_checkpoint is not None: + use_unified_checkpoint = False + if self.is_unified_checkpoint(resume_from_checkpoint): + use_unified_checkpoint = True + else: + logger.info("Loading origin checkpoint, the next checkpoint will be saved as unified checkpoint") + + if use_unified_checkpoint: + self.unified_checkpoint_handler.load_unified_checkpoint( + self.model, + self.optimizer, + resume_from_checkpoint, + ) + logger.info(f"Loading model from {resume_from_checkpoint} using unified checkpoint.") + self.runtime_timer.stop() + return + + if ( + isinstance(self.model, LoRAModel) + or isinstance(self.model, PrefixModelForCausalLM) + or isinstance(self.model, VeRAModel) + ): + self._load_from_peft_checkpoint(resume_from_checkpoint) + self.runtime_timer.stop() + return + + weight_name = PADDLE_WEIGHTS_NAME + weight_index_name = PADDLE_WEIGHTS_INDEX_NAME # currently set paddle as default, do not support safetensors. 
+ + if self.args.should_load_sharding_stage1_model: + state_dict = self.sharding_io.load_state_dict_from_checkpoint_with_reshard( + resume_from_checkpoint, + base_weight_name=weight_name, + model_wrapped=self.model_wrapped, + ) + old_state_dict = self.model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + if k not in old_state_dict or id(v) != id(old_state_dict[k]): + new_state_dict[k] = v + self.model.set_state_dict(new_state_dict) + else: + if resume_from_checkpoint is not None and (self.args.dataset_rank == 0 or self.args.use_expert_parallel): + + weights_file = os.path.join( + resume_from_checkpoint, _add_variant(weight_name, self.args.weight_name_suffix) + ) + weights_index_file = os.path.join( + resume_from_checkpoint, _add_variant(weight_index_name, self.args.weight_name_suffix) + ) + + if not any( + os.path.isfile(f) + for f in [ + weights_file, + weights_index_file, + ] + ): + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint} -- {weights_file}") + + logger.info(f"Loading model from {resume_from_checkpoint} .") + + if os.path.isfile(weights_file): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load(weights_file, return_numpy=True) + # If the model is on the GPU, it still works! + self._set_state_dict_in_model(state_dict) + # release memory + del state_dict + else: + # We load the sharded checkpoint. + missing_keys, unexpected_keys = load_sharded_checkpoint( + self.model, resume_from_checkpoint, self.args.weight_name_suffix, prefer_safe=False + ) + logger.info(f"set state_dict: {missing_keys, unexpected_keys}") + + elif resume_from_checkpoint is not None: + logger.info(f"not loading ckpt :{self.args.dataset_rank}") + self.runtime_timer.stop() + + def _wrap_model_and_load_sharded_checkpoint(self, resume_from_checkpoint): + # In the sharded mode, should invoke _load_from_checkpoint after _wrap_model. + # In this mode, each sharding rank load sharded params, do not need to implement the broadcast logic. + model = self._wrap_model(self.model_wrapped) + if self.sharding_io is not None: + # the self.optimizer should be wrapped and it is done in _wrap_model + self.sharding_io.set_optimizer(self.optimizer) + if model is not self.model: + self.model_wrapped = model + # Should invoke _load_from_checpoint after _load_optimizer_and_scheduler + # because the _load_from_checkpoint method rely on the optimizer in the shareded mode. + if resume_from_checkpoint: + self._load_optimizer_and_scheduler(resume_from_checkpoint) + self._load_from_checkpoint(resume_from_checkpoint) + return model + + def train( + self, + resume_from_checkpoint: Optional[Union[str, bool]] = None, + ignore_keys_for_eval: Optional[List[str]] = None, + ): + """ + Main training entry point. + + Args: + resume_from_checkpoint (`str` or `bool`, *optional*): + If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a + `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance + of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here. + ignore_keys_for_eval (`List[str]`, *optional*) + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions for evaluation during the training. 
+ """ + args = self.args + self.is_in_train = True + + logger.info(f"Starting training from resume_from_checkpoint : {resume_from_checkpoint}") + + # The resume_from_checkpoint could be None in some machine node. + # Here we reset None to temp directory. + if args.world_size > 1: + is_resume_from_checkpoint = paddle.to_tensor([resume_from_checkpoint is not None], dtype="int32") + paddle.distributed.all_reduce(is_resume_from_checkpoint) + is_resume_from_checkpoint = is_resume_from_checkpoint.item() + if is_resume_from_checkpoint > 0 and is_resume_from_checkpoint < paddle.distributed.get_world_size(): + if resume_from_checkpoint is None: + resume_from_checkpoint = os.path.join(self.args.output_dir, "local_tempdir") + if os.path.exists(resume_from_checkpoint) and self.args.local_rank == 0: + shutil.rmtree(resume_from_checkpoint) + os.makedirs(resume_from_checkpoint, exist_ok=True) + logger.info(f"Reset resume_from_checkpoint to temp directory : {resume_from_checkpoint}") + + train_dataloader = self.get_train_dataloader() + + total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.dataset_world_size + len_dataloader = None + if has_length(train_dataloader): + len_dataloader = len(train_dataloader) + num_update_steps_per_epoch = len(train_dataloader) // args.gradient_accumulation_steps + num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1) + num_examples = len(self.train_dataset) + + if args.max_steps > 0: + max_steps = args.max_steps + num_train_epochs = args.max_steps // num_update_steps_per_epoch + int( + args.max_steps % num_update_steps_per_epoch > 0 + ) + num_train_samples = args.max_steps * total_train_batch_size + else: + max_steps = int(num_update_steps_per_epoch * args.num_train_epochs) + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = int(len(self.train_dataset) * args.num_train_epochs) + + if args.minimum_eval_times is not None and args.minimum_eval_times > 0: + if max_steps // args.eval_steps < args.minimum_eval_times: + exp_step = max_steps / args.minimum_eval_times + exp_step = max(int(exp_step - exp_step % 10), 10) + logger.info("Reset eval step by minimum_eval_times to %d" % exp_step) + args.eval_steps = exp_step + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size + max_steps = args.max_steps + # Setting a very large number of epochs so we go as many times as necessary over the iterator. 
+ num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_examples = total_train_batch_size * args.max_steps + num_train_samples = args.max_steps * total_train_batch_size + else: + raise ValueError( + f"args.max_steps must be set to a positive value if dataloader does not have a length, was {args.max_steps}" + ) + + # delay_optimizer_creation = ( + # self.sharding is not None + # and ShardingOption.SHARD_OP in self.args.sharding + # ) + delay_optimizer_creation = False + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + self.state = TrainerState() + + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + if not self.args.enable_auto_parallel: + if not self.args.should_load_sharding_stage1_model: + self._load_from_checkpoint(resume_from_checkpoint) + + if self.args.should_load_sharding_stage1_model: + model = self._wrap_model_and_load_sharded_checkpoint(resume_from_checkpoint) + + elif self.args.should_save_sharding_stage1_model: + # In the non-sharded mode, should invoke _load_from_checkpoint before _wrap_model. + # In this mode, the rank0 load all params and the _wrap_model implicitly broadcast params from rank0 to the other ranks. + model = self._wrap_model(self.model_wrapped) + if self.sharding_io is not None: + assert delay_optimizer_creation is False, "delay_optimizer_creation should be False" + # the self.optimizer should be wrapped and it is done in _wrap_model + self.sharding_io.set_optimizer(self.optimizer) + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + if delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + self._load_optimizer_and_scheduler(resume_from_checkpoint) + else: + model = self._wrap_model(self.model_wrapped) + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + if delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + self._load_optimizer_and_scheduler(resume_from_checkpoint) + else: + model = self.model_wrapped + if delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + logger.info(f"{self.runtime_timer.log()}") + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") + logger.info(f" Total train batch size (w. 
parallel, distributed & accumulation) = {total_train_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Total num train samples = {num_train_samples:,}") + # per_device_trainable_numel = sum(p.numel().item() for p in model.parameters() if not p.stop_gradient) + # TODO: Temporary fix since Tensor.numel() not supported in distributed mode + per_device_trainable_numel = sum(np.prod(p.shape) for p in model.parameters() if not p.stop_gradient) + logger.debug(f" Number of trainable parameters = {per_device_trainable_numel:,} (per device)") + if self.args.use_hybrid_parallel: + # todo fix for pipeline_parallel_degree + parts_num = max(self.args.tensor_parallel_degree, 1) * max(self.args.pipeline_parallel_degree, 1) + if parts_num > 1: + all_reduce_dtype = "int64" + if paddle.get_device().split(":")[0] in ["npu", "xpu"]: + # TODO(duanyanhui): fix when NPU all_reduce supports int64 + all_reduce_dtype = "float32" + trainable_numel_tensor = paddle.to_tensor(per_device_trainable_numel, dtype=all_reduce_dtype) + paddle.distributed.all_reduce(trainable_numel_tensor) + trainable_numel = int(trainable_numel_tensor.item()) // self.args.dataset_world_size + if self.args.sep_parallel_degree > 0: + trainable_numel = trainable_numel // self.args.sep_parallel_degree + if self.args.context_parallel_degree > 0: + trainable_numel = trainable_numel // self.args.context_parallel_degree + # the numel is roughly, because the tensor parallel still hold own bias or layer_norm weight without splited + # so, the trainable numel is a little bigger than real. + logger.debug(f" Number of trainable parameters = {trainable_numel:,} (all devices, roughly)") + + return self._inner_training_loop( + args, + model, + train_dataloader, + len_dataloader, + max_steps, + num_train_epochs, + num_update_steps_per_epoch, + num_train_samples, + resume_from_checkpoint, + ignore_keys_for_eval, + ) + + def _inner_training_loop( + self, + args, + model, + train_dataloader, + len_dataloader, + max_steps, + num_train_epochs, + num_update_steps_per_epoch, + num_train_samples, + resume_from_checkpoint, + ignore_keys_for_eval, + ): + start_time = time.time() + self._globalstep_last_start_time = time.time() + self.state.epoch = 0 + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if ( + resume_from_checkpoint is not None + and distributed_isfile(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + and not self.args.ignore_load_lr_and_optim + ): + self.state = TrainerState.load_from_json( + distributed_file(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + ) + if self.args.world_size > 1: + global_step_list = [] + paddle.distributed.all_gather( + global_step_list, paddle.to_tensor([self.state.global_step], dtype="int64") + ) + assert ( + paddle.sum(paddle.stack(global_step_list) - global_step_list[0]) == 0 + ), f"Error, get different globel step, please check! 
step list: {[x.item() for x in global_step_list]}" + + epochs_trained = self.state.global_step // num_update_steps_per_epoch + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) + steps_trained_in_current_epoch *= args.gradient_accumulation_steps + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first {steps_trained_in_current_epoch} " + "batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` " + "flag to your launch command, but you will resume the training on data already seen by your model." + ) + if self.is_local_process_zero() and not args.disable_tqdm: + steps_trained_progress_bar = tqdm(total=steps_trained_in_current_epoch) + steps_trained_progress_bar.set_description("Skipping the first batches") + if not args.ignore_data_skip: + if isinstance(train_dataloader, paddle.io.DataLoader) and isinstance( + train_dataloader.batch_sampler, NlpDistributedBatchSampler + ): + consumed_samples = ( + self.state.global_step + * args.train_batch_size + * args.gradient_accumulation_steps + * args.dataset_world_size + ) + train_dataloader.batch_sampler.set_epoch(consumed_samples=consumed_samples) + logger.info(f"Set DistributedBatchSampler consumed_samples to {consumed_samples}") + + epoch_iterator = train_dataloader + # steps_in_epoch = len(epoch_iterator) + steps_in_epoch = ( + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps + ) + if len_dataloader is not None: + if self.args.gradient_accumulation_steps > len(epoch_iterator): + logger.warning( + f"changing accumulation step from `{self.args.gradient_accumulation_steps}` to `{len(epoch_iterator)}` to avoid, cross epoch accumulate" + ) + self.args.gradient_accumulation_steps = len(epoch_iterator) + + self.callback_handler.model = self.model + self.callback_handler.optimizer = self.optimizer + self.callback_handler.lr_scheduler = self.lr_scheduler + self.callback_handler.train_dataloader = train_dataloader + + self.state.max_steps = int(max_steps) + self.state.num_train_epochs = num_train_epochs + self.state.is_local_process_zero = self.is_local_process_zero() + self.state.is_world_process_zero = self.is_world_process_zero() + + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + tr_loss = paddle.to_tensor(0.0) + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + + if self.args.device == "npu" and self.args.flatten_param_grads: + from .plugins.npu_plugin import npu_accelerate_plugin + + npu_accelerate_plugin(self.optimizer) + + if self.args.ignore_data_skip: + self.timers and self.timers("read-data").start() + + for epoch in range(epochs_trained, num_train_epochs): + if isinstance(train_dataloader, paddle.io.DataLoader) and isinstance( + train_dataloader.batch_sampler, DistributedBatchSampler + ): + train_dataloader.batch_sampler.set_epoch(epoch) + + step_control = 0 # used in loop control, reset to 0 after every step + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + for step, inputs in enumerate(epoch_iterator): + if self.args.use_hybrid_parallel 
and self.args.sep_parallel_degree > 1: + inputs = split_inputs_sequence_dim(inputs) + if self.args.use_hybrid_parallel and self.args.context_parallel_degree > 1: + inputs = split_inputs_sequence_dim_load_balance(inputs) + if self.args.ignore_data_skip: + self.timers and self.timers("read-data").stop() + + os.environ["TRAINER_GLOBAL_STEP"] = str(self.state.global_step) + self.callback_handler.on_load_data_end(args, self.state, self.control, inputs=inputs) + + # Skip past any already trained steps if resuming training + # for paddlenlp.utils.batch_sampler.DistributedBatchSampler + # We use consumed_samples to reset the status + if isinstance(train_dataloader, paddle.io.DataLoader) and isinstance( + train_dataloader.batch_sampler, NlpDistributedBatchSampler + ): + if step == 0: + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(steps_trained_in_current_epoch) + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + self._load_rng_state(resume_from_checkpoint) + step += steps_trained_in_current_epoch + elif steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + self.timers and self.timers("read-data").start() + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + if step_control % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + self.timers and self.timers("forward-backward").start() + + # stage2 and stage3 should not no_sync, because the is no DDP wrapper and no_sync API + # hybrid_parallel (tp or pp or sharding stage 1) should not no_sync + availiable_no_sync = hasattr(model, "no_sync") + is_no_sync = ( + ( + ((step_control + 1) % args.gradient_accumulation_steps != 0) + and args._no_sync_in_gradient_accumulation + ) + or args.recompute + or args.use_expert_parallel + ) and availiable_no_sync + # sharding + # stage1. the same as ddp + # stage2. manualy collect gradient on dp group + + dp_master_grad = ( + self.args.world_size > 1 and self.args.amp_master_grad and not self.args.use_hybrid_parallel + ) + if dp_master_grad: + is_no_sync = True + + if is_no_sync: + # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. 
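# Spelled out (sketch, mirroring the args used above), the is_no_sync predicate is:
#     boundary = ((step_control + 1) % args.gradient_accumulation_steps == 0)
#     is_no_sync = ((not boundary and args._no_sync_in_gradient_accumulation)
#                   or args.recompute or args.use_expert_parallel) and hasattr(model, "no_sync")
# and dp_master_grad forces it to True because gradients are all-reduced by hand after
# accumulation. e.g. with gradient_accumulation_steps=4, micro-steps 0-2 run inside
# model.no_sync() and only micro-step 3 lets DDP all-reduce gradients during backward.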
+ with model.no_sync(): + tr_loss_step = self.training_step(model, inputs) + else: + tr_loss_step = self.training_step(model, inputs) + + tr_loss += tr_loss_step + + def fused_allreduce_gradients_no_sync(paramlist, hcg): + paramlist = list(paramlist) + nonmoe_list = [p for p in paramlist if not getattr(p, "no_sync", False)] + moelist = [p for p in paramlist if getattr(p, "no_sync", False)] + if moelist and not self.args.use_expert_parallel: + logger.warning("found `no sync` param when `use_expert_parallel=False`") + fused_allreduce_gradients(nonmoe_list, hcg) + + if (step_control + 1) % args.gradient_accumulation_steps == 0 or ( + # last step in epoch but step is always smaller than gradient_accumulation_steps + steps_in_epoch <= args.gradient_accumulation_steps + and (step + 1) == steps_in_epoch + ): + if self.args.pipeline_parallel_degree <= 1 and self._enable_delay_scale_loss(): + tr_loss /= self.args.gradient_accumulation_steps + + self.timers and self.timers("forward-backward").stop() + # Maunally collect gradients + # Case 1: Use recompute and dp + # Case 2: Hack dp with master_grad + # Case 3: Pipeline or sharding overlap + # local_rank != -1 don't means dp in networks. + self.timers and self.timers("all-reduce").start() + + # Case 1: Use recompute and dp / sharding stage1, + # manualy collect gradient for dp. + if (args.recompute or args.use_expert_parallel) and availiable_no_sync: + fused_allreduce_gradients_no_sync(list(model.parameters()), None) + + # Case 2: hack dp with master_grad + elif dp_master_grad: + fused_allreduce_gradients_no_sync(list(model.parameters()), None) + + # Pipeline parallel mode, handle gradient reduce here to overlap + enable_dp_comm_overlap = "enable_dp_comm_overlap" in args.pipeline_parallel_config + + enable_release_grads = False + if args.sharding_parallel_degree > 1: + enable_release_grads = "enable_release_grads" in args.sharding_parallel_config + if not enable_release_grads and args.pipeline_parallel_degree > 1: + enable_release_grads = "enable_release_grads" in args.pipeline_parallel_config + + # Case 3: Pipeline parallel mode, overlap with dp + if isinstance(self.optimizer, HybridParallelOptimizer) and not self.do_grad_scaling: + parameters_list = _obtain_optimizer_parameters_list(self.optimizer._inner_opt) + + if not enable_dp_comm_overlap: + if self.optimizer._sharding_enable: + assert reshard_util.is_sharding_opt(self.optimizer) + self.optimizer._inner_opt.reduce_gradients(list(parameters_list), self.optimizer._hcg) + + if self.optimizer._dp_enable or getattr(self.optimizer, "_sep_enable", False): + fused_allreduce_gradients_no_sync(list(parameters_list), self.optimizer._hcg) + self.timers and self.timers("all-reduce").stop() + self.timers and self.timers("optimizer-step").start() + + if self.args.gradient_accumulation_steps > 1 and self._enable_delay_scale_loss(): + paddle.device.synchronize() + for p in model._layers.parameters(): + with paddle.no_grad(): + if hasattr(p, "main_grad") and p.main_grad is not None: + assert p.grad is None + p.main_grad.scale_(1.0 / self.args.gradient_accumulation_steps) + elif p.grad is not None: + p.grad.scale_(1.0 / self.args.gradient_accumulation_steps) + + # Optimizer step + self.callback_handler.on_optimizer_begin( + args, self.state, self.control, scaler=self.scaler if self.do_grad_scaling else None + ) + optimizer_was_run = True + if self.do_grad_scaling: + if args.pipeline_parallel_degree > 1: + assert not self.args.use_expert_parallel, "pipeline moe not work under fp16" + scale_before = 
paddle.assign(self.scaler._scale) + self.scaler.step(self.optimizer) + self.scaler.update() + scale_after = self.scaler._scale + # Compatible with paddlepaddle 2.6.0 using typo word. + if hasattr(self.scaler, "_cache_founf_inf"): + optimizer_was_run = not self.scaler._cache_founf_inf + else: + optimizer_was_run = not self.scaler._cache_found_inf + if not optimizer_was_run: + scale_before_value = scale_before.cpu().numpy() + scale_after_value = scale_after.cpu().numpy() + logger.warning( + f"optimizer not run, scale_before: {scale_before_value[0]}, scale_after: {scale_after_value[0]}" + ) + elif isinstance(self.optimizer, HybridParallelOptimizer): + self.optimizer._step(parameters_list) + else: + self.optimizer.step() + + self.timers and self.timers("optimizer-step").stop() + + if optimizer_was_run: + self.lr_scheduler.step() + + if args.release_grads or enable_release_grads: + self.optimizer.clear_grad(set_to_zero=False) + if args.pipeline_parallel_degree > 1: + for _, buffers in model._chunk_2_comm_buffers.items(): + for buffer in buffers: + buffer._clear_grad_storage() + else: + self.optimizer.clear_grad() + + self.callback_handler.on_optimizer_end( + args, self.state, self.control, scaler=self.scaler if self.do_grad_scaling else None + ) + + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, inputs=inputs) + self._print_timer() + step_control = 0 + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + step_control += 1 + + if self.control.should_epoch_stop or self.control.should_training_stop: + break + + if self.args.ignore_data_skip: + self.timers and self.timers("read-data").start() + + if step < 0: + logger.warning( + f"There seems to be not a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({self.state.max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, inputs=inputs) + + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info("\nTraining completed. \n") + + # unlink shared_memory if used. + if self.args.unified_checkpoint: + self.unified_checkpoint_handler.unlink_shared_memory() + + if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: + if args.local_rank != -1: + dist.barrier() + + logger.info( + f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric})." 
+ ) + if isinstance(self.model, LoRAModel) or isinstance(self.model, PrefixModelForCausalLM): + self._load_best_model_from_peft_checkpoint() + else: + if self.args.unified_checkpoint: + self.unified_checkpoint_handler.load_unified_checkpoint( + self.model, + self.optimizer, + self.state.best_model_checkpoint, + ) + if self.args.sharding_parallel_degree > 1 or self.args.data_parallel_degree > 1: + broadcast_dataset_rank0_model(self.model) + else: + weight_name = PADDLE_WEIGHTS_NAME + best_model_path = os.path.join( + self.state.best_model_checkpoint, _add_variant(weight_name, self.args.weight_name_suffix) + ) + if os.path.exists(best_model_path): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load(best_model_path, return_numpy=True) + # If the model is on the GPU, it still works! + self._set_state_dict_in_model(state_dict) + else: + logger.warning( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." + ) + + self._total_loss_scalar += tr_loss.item() + train_loss = self._total_loss_scalar / self.state.global_step + + metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps) + + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + return TrainOutput(self.state.global_step, train_loss, metrics) + + def _load_best_model_from_peft_checkpoint(self): + if self.args.unified_checkpoint: + self.unified_checkpoint_handler.load_unified_checkpoint( + self.model, + self.optimizer, + self.state.best_model_checkpoint, + ) + if self.args.sharding_parallel_degree > 1 or self.args.data_parallel_degree > 1: + broadcast_dataset_rank0_model(self.model) + return + + convert_tp = False + if isinstance(self.model, LoRAModel): + if self.model.quantized or self.args.pipeline_parallel_degree > 1: + best_model_path = os.path.join( + self.state.best_model_checkpoint, _add_variant(LORA_WEIGHTS_NAME, self.args.weight_name_suffix) + ) + else: + best_model_path = os.path.join(self.state.best_model_checkpoint, LORA_WEIGHTS_NAME) + if self.model.lora_config.tensor_parallel_degree > 1: + convert_tp = True + + elif isinstance(self.model, PrefixModelForCausalLM): + best_model_path = os.path.join(self.state.best_model_checkpoint, PREFIX_WEIGHTS_NAME) + if self.model.prefix_config.tensor_parallel_degree > 1: + convert_tp = True + + if os.path.exists(best_model_path): + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load(best_model_path, return_numpy=True) + if convert_tp: + state_dict = self.model._convert_tensor_parallel(state_dict) + # If the model is on the GPU, it still works! + self._set_state_dict_in_model(state_dict) + else: + logger.warning( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." 
+ ) + + def _get_train_sampler(self) -> Optional[paddle.io.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + if self.args.world_size <= 1: + return paddle.io.BatchSampler( + dataset=self.train_dataset, + shuffle=True, + batch_size=self.args.per_device_train_batch_size, + drop_last=self.args.dataloader_drop_last, + ) + + return DistributedBatchSampler( + self.train_dataset, + batch_size=self.args.per_device_train_batch_size, + shuffle=True, + num_replicas=self.args.dataset_world_size, + rank=self.args.dataset_rank, + drop_last=self.args.dataloader_drop_last, + ) + + def _set_state_dict_in_model(self, state_dict): + # TODO @ZHUI paddle need return the results of set_state_dict. + logger.info(f"set state-dict :{self.model.set_state_dict(state_dict)}") + + def _print_timer(self): + """print timer and clear states""" + paddle_timer_info = "" + try: + from paddle.distributed.fleet.utils.timer_helper import ( + get_timers as paddle_get_timers, + ) + + paddle_pipeline_timers = paddle_get_timers() + for name, timer in paddle_pipeline_timers.timers.items(): + elapsed_time = timer.elapsed(reset=False) * 1000.0 + paddle_timer_info += f" | {name}: {elapsed_time:.2f}" + paddle_pipeline_timers.log(paddle_pipeline_timers.timers.keys(), reset=True) + except ImportError: # paddle version too old, timer not support + warnings.warn(f"paddle version:{paddle.__git_commit__} does not support pipeline timer") + except AssertionError: # paddle timer not enabled + pass + + if self.timers is not None: + timer_info = self.timers.log(self.timers.timers.keys(), reset=True) + else: + timer_info = "" + + if timer_info or paddle_timer_info: + logger.info(f"[Profile global_step: {self.state.global_step}] {timer_info} {paddle_timer_info}") + + def _get_item_from_loss(self, loss): + assert isinstance(loss, paddle.Tensor) and loss._is_initialized() + return loss.item() + + def _maybe_log_save_evaluate(self, tr_loss, model, epoch, ignore_keys_for_eval, **kwargs): + if self.control.should_log: + + logs: Dict[str, float] = {} + + # all_gather + mean() to get average loss over all processes + tr_loss_scalar = self._get_item_from_loss(self._nested_gather(tr_loss).mean()) + + # reset tr_loss to zero + tr_loss.subtract_(tr_loss) + + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 8) + logs["learning_rate"] = float("{0:.3e}".format(self._get_learning_rate())) + logs["global_step"] = int(self.state.global_step) + + divisor = 2**30 + # TODO(@gexiao): replace these codes with unified APIs in Paddle + current_device = framework._current_expected_place_() + if str(current_device) != "Place(cpu)": + device_id = current_device.get_device_id() + current_memory_allocated = core.device_memory_stat_current_value("Allocated", device_id) + current_memory_reserved = core.device_memory_stat_current_value("Reserved", device_id) + max_memory_allocated = core.device_memory_stat_peak_value("Allocated", device_id) + max_memory_reserved = core.device_memory_stat_peak_value("Reserved", device_id) + logs["current_memory_allocated"] = current_memory_allocated / divisor + logs["current_memory_reserved"] = current_memory_reserved / divisor + logs["max_memory_allocated"] = max_memory_allocated / divisor + logs["max_memory_reserved"] = max_memory_reserved / divisor + + total_train_batch_size = ( + self.args.train_batch_size * self.args.gradient_accumulation_steps * self.args.dataset_world_size + ) + num_steps = self.state.global_step - self._globalstep_last_logged + 
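# Rough sketch of what the "interval" speed metrics computed below boil down to. The real
# helper is paddlenlp's speed_metrics; the key names and rounding here are illustrative
# assumptions, and num_samples is total_train_batch_size * num_steps as computed above.
import time  # already imported by this module; repeated so the sketch stands alone

def _sketch_interval_metrics(interval_start_time, num_samples, num_steps):
    runtime = time.time() - interval_start_time
    if runtime <= 0:
        return {"interval_runtime": 0.0}
    return {
        "interval_runtime": round(runtime, 4),
        "interval_samples_per_second": round(num_samples / runtime, 3),
        "interval_steps_per_second": round(num_steps / runtime, 3),
    }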
seq_length = None + if getattr(self, "is_pretraining", False) and hasattr(self.model, "config"): + seq_length = getattr(self.model.config, "seq_length", None) + logs.update( + speed_metrics( + "interval", + self._globalstep_last_start_time, + num_samples=total_train_batch_size * num_steps, + num_steps=num_steps, + seq_length=seq_length, + ) + ) + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = self.state.global_step + self._globalstep_last_start_time = time.time() + + # Add additional memory in log. + if not self.args.skip_memory_metrics: + logs.update( + { + "cpu_mem_used": self._memory_tracker.cpu_mem_used() >> 20, + "cpu_mem_used_peak": self._memory_tracker.cpu_mem_used_peak >> 20, + } + ) + if is_paddle_cuda_available(): + logs.update( + { + "gpu_max_memory_allocated": paddle.device.cuda.max_memory_allocated() >> 20, + "gpu_max_memory_reserved": paddle.device.cuda.max_memory_reserved() >> 20, + } + ) + + self.log(logs, **kwargs) + + metrics = None + if self.control.should_evaluate: + if isinstance(self.optimizer, GroupShardedOptimizerStage2) and self.optimizer._broadcast_overlap: + paddle.device.synchronize() + + if isinstance(self.eval_dataset, dict): + for eval_dataset_name, eval_dataset in self.eval_dataset.items(): + metrics = self.evaluate( + eval_dataset=eval_dataset, + ignore_keys=ignore_keys_for_eval, + metric_key_prefix=f"eval_{eval_dataset_name}", + ) + else: + metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) + + if self.control.should_save: + if isinstance(self.optimizer, GroupShardedOptimizerStage2) and self.optimizer._broadcast_overlap: + paddle.device.synchronize() + + self._save_checkpoint(model, metrics=metrics) + logger.info(f"{self.runtime_timer.log()}") + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + def _get_learning_rate(self): + return self.optimizer.get_lr() + + def get_train_dataloader(self): + """ + Returns the training [`~paddle.io.DataLoader`]. + + Will use no sampler if `self.train_dataset` does not implement `__len__`, a random sampler (adapted to + distributed training if necessary) otherwise. + + Subclass and override this method if you want to inject some custom behavior. 
+ """ + if self.args.should_load_dataset and self.train_dataset is None: + raise ValueError("Training requires a train_dataset when should_load_dataset is True.") + if not self.args.should_load_dataset and self.train_dataset is not None: + raise ValueError("We don't need train_dataset when should_load_dataset is False.") + + train_dataset = self.train_dataset + if self.args.distributed_dataloader: + is_iterable_dataset = self._is_iterable_dataset_distributed(train_dataset) + else: + is_iterable_dataset = self._is_iterable_dataset(train_dataset) + if is_datasets_available() and train_dataset is not None and isinstance(train_dataset, datasets.Dataset): + train_dataset = self._remove_unused_columns(train_dataset, description="training") + _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader + + if is_iterable_dataset: # For iterable dataset + if self.args.dataset_world_size > 1 and train_dataset is not None: + train_dataset = IterableDatasetShard( + train_dataset, + batch_size=self.args.per_device_train_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.dataset_world_size, + process_index=self.args.dataset_rank, + ) + + if self.args.distributed_dataloader: + logger.info("Training using DistDataLoader.") + additional_configs = {"is_iterable_dataset": True} + else: + additional_configs = {} + return _DataLoader( + train_dataset, + batch_size=self.args.per_device_train_batch_size, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + **additional_configs, + ) + else: + train_sampler = self._get_train_sampler() + if self.args.distributed_dataloader: + logger.info("Training using DistDataLoader.") + return _DataLoader( + train_dataset, + batch_sampler=train_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + ) + + def _get_eval_sampler(self, eval_dataset: Dataset): + if eval_dataset is None or not has_length(eval_dataset): + return None + + if self.args.world_size <= 1: + return paddle.io.BatchSampler( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + shuffle=False, + drop_last=False, + ) + else: + drop_last = False + if self.args.pipeline_parallel_degree > 1: + drop_last = True + logger.warning( + "In parallel mode, the batch_size is strictly checked. set DistributedBatchSampler drop_last=True." + ) + + return DistributedBatchSampler( + eval_dataset, + num_replicas=self.args.dataset_world_size, + rank=self.args.dataset_rank, + batch_size=self.args.per_device_eval_batch_size, + shuffle=False, + drop_last=drop_last, + ) + + def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader: + """ + Returns the evaluation [`~paddle.io.DataLoader`]. + + Subclass and override this method if you want to inject some custom behavior. + + Args: + eval_dataset (`paddle.io.Dataset`, *optional*): + If provided, will override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not accepted by + the `model.forward()` method are automatically removed. It must implement `__len__`. 
+ """ + if self.args.should_load_dataset and eval_dataset is None and self.eval_dataset is None: + raise ValueError("Evaluation requires an eval_dataset when should_load_dataset is True.") + if not self.args.should_load_dataset and not (eval_dataset is None and self.eval_dataset is None): + raise ValueError("We don't need eval_dataset when should_load_dataset is False.") + + eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + if self.args.distributed_dataloader: + is_iterable_dataset = self._is_iterable_dataset_distributed(eval_dataset) + else: + is_iterable_dataset = self._is_iterable_dataset(eval_dataset) + if is_datasets_available() and eval_dataset is not None and isinstance(eval_dataset, datasets.Dataset): + eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") + _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader + + if is_iterable_dataset: + if self.args.dataset_world_size > 1 and eval_dataset is not None: + eval_dataset = IterableDatasetShard( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.dataset_world_size, + process_index=self.args.dataset_rank, + ) + if self.args.distributed_dataloader: + logger.info("Eval using DistDataLoader.") + additional_configs = {"eval": True, "is_iterable_dataset": True} + else: + additional_configs = {} + return _DataLoader( + eval_dataset, + batch_size=self.args.per_device_eval_batch_size, + collate_fn=self.data_collator, + num_workers=0, + **additional_configs, + ) + else: + eval_sampler = self._get_eval_sampler(eval_dataset) + if self.args.distributed_dataloader: + logger.info("Eval using DistDataLoader.") + additional_configs = {"eval": True} + else: + additional_configs = {} + return _DataLoader( + eval_dataset, + batch_sampler=eval_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + **additional_configs, + ) + + def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + """ + Returns the test [`~paddle.io.DataLoader`]. + + Subclass and override this method if you want to inject some custom behavior. + + Args: + test_dataset (`paddle.io.Dataset`, *optional*): + The test dataset to use. If it is an `datasets.Dataset`, columns not accepted by the `model.forward()` + method are automatically removed. It must implement `__len__`. 
+ """ + if self.args.should_load_dataset and not test_dataset: + raise ValueError("Test requires an test_dataset when should_load_dataset is True.") + if not self.args.should_load_dataset and test_dataset is not None: + raise ValueError("We don't need test_dataset when should_load_dataset is False.") + + if self.args.distributed_dataloader: + is_iterable_dataset = self._is_iterable_dataset_distributed(test_dataset) + else: + is_iterable_dataset = self._is_iterable_dataset(test_dataset) + if is_datasets_available() and test_dataset is not None and isinstance(test_dataset, datasets.Dataset): + test_dataset = self._remove_unused_columns(test_dataset, description="test") + _DataLoader = DistDataLoader if self.args.distributed_dataloader else DataLoader + + if is_iterable_dataset: + if self.args.dataset_world_size > 1 and test_dataset is not None: + test_dataset = IterableDatasetShard( + test_dataset, + batch_size=self.args.per_device_eval_batch_size, + drop_last=self.args.dataloader_drop_last, + num_processes=self.args.dataset_world_size, + process_index=self.args.dataset_rank, + ) + + if self.args.distributed_dataloader: + logger.info("Test using DistDataLoader.") + additional_config = {"eval": True, "is_iterable_dataset": True} + else: + additional_config = {} + return _DataLoader( + test_dataset, + batch_size=self.args.per_device_eval_batch_size * self.world_size, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + **additional_config, + ) + else: + test_sampler = self._get_eval_sampler(test_dataset) + if self.args.distributed_dataloader: + logger.info("Test using DistDataLoader.") + additional_config = {"eval": True} + else: + additional_config = {} + # We use the same batch_size as for eval. + return _DataLoader( + test_dataset, + batch_sampler=test_sampler, + collate_fn=self.data_collator, + drop_last=self.args.dataloader_drop_last, + **additional_config, + ) + + def create_optimizer_and_scheduler(self, num_training_steps: int): + """ + Setup the optimizer and the learning rate scheduler. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or + `create_scheduler`) in a subclass. + """ + self.create_scheduler(num_training_steps=num_training_steps) + self.create_optimizer(self.lr_scheduler) + + def create_optimizer(self, lr_scheduler=None): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. 
+ """ + if self.optimizer is None: + if self.optimizer_grouped_parameters is not None: + params = self.optimizer_grouped_parameters + apply_decay_param_fun = None + else: + params = self.model.parameters() + decay_parameters = [ + p.name for n, p in self.model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) + ] + + def apply_decay_param_fun(x): + return x in decay_parameters + + optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args) + if hasattr(optimizer_cls, "_create_master_weight") and self.args.fp16_opt_level == "O2": + optimizer_kwargs["multi_precision"] = True + + self.optimizer = optimizer_cls( + learning_rate=self.lr_scheduler if lr_scheduler is None else lr_scheduler, + apply_decay_param_fun=apply_decay_param_fun, + parameters=params, + weight_decay=self.args.weight_decay, + grad_clip=nn.ClipGradByGlobalNorm(self.args.max_grad_norm) if self.args.max_grad_norm > 0 else None, + **optimizer_kwargs, + ) + + return self.optimizer + + def _load_rng_state(self, checkpoint): + # Load RNG states from `checkpoint` + if checkpoint is None: + return + + # if use distributed training + if self.args.world_size > 1: + process_index = self.args.process_index + rng_file_list = [None for x in range(self.args.world_size)] + if self.args.should_save: + rng_file = os.path.join(checkpoint, f"rng_state_{self.args.world_size}.pth") + if os.path.isfile(rng_file): + rng_file_list = paddle.load(rng_file, return_numpy=True) + paddle.distributed.broadcast_object_list(rng_file_list, src=0) + # if rng_file_list still empty, not log rng state. + if rng_file_list[0] is None: + logger.info( + f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " + "wasn't launched in a distributed fashion, reproducibility is not guaranteed." + ) + return + else: + checkpoint_rng_state = rng_file_list[process_index] + else: + rng_file = os.path.join(checkpoint, "rng_state.pth") + if not os.path.isfile(rng_file): + logger.info( + "Didn't find an RNG file, if you are resuming a training that was launched in a distributed " + "fashion, reproducibility is not guaranteed." 
+ ) + return + + checkpoint_rng_state = paddle.load(rng_file, return_numpy=True) + + random.setstate(checkpoint_rng_state["python"]) + np.random.set_state(checkpoint_rng_state["numpy"]) + + core.default_cpu_generator().set_state(checkpoint_rng_state["cpu"]) + if core.is_compiled_with_cuda(): + if not len(checkpoint_rng_state["cuda"]) == core.get_cuda_device_count(): + raise ValueError("Length of gpu state list shoule be equal to the gpu device count") + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i).set_state(checkpoint_rng_state["cuda"][i]) + + if paddle.device.get_all_custom_device_type() is not None: + custom_device_type = paddle.device.get_all_custom_device_type() + for device in custom_device_type: + if not len(checkpoint_rng_state["cuda"]) == core.get_custom_device_count(device): + raise ValueError("Length of custom device state list shoule be equal to the custom device count") + for i in range(core.get_custom_device_count(device)): + core.default_custom_device_generator(paddle.CustomPlace(device, i)).set_state( + checkpoint_rng_state["cuda"][i] + ) + + if self.args.use_hybrid_parallel: + if "hybrid_parallel_rng_state_tracker" in checkpoint_rng_state: + if self.args.tensor_parallel_degree <= 1: + checkpoint_rng_state["hybrid_parallel_rng_state_tracker"].pop("model_parallel_rng", None) + fleet.meta_parallel.get_rng_state_tracker().set_states_tracker( + checkpoint_rng_state["hybrid_parallel_rng_state_tracker"] + ) + else: + logger.warning("Not found hybrid parallel RNG state.") + + @staticmethod + def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: + """ + Returns the optimizer class and optimizer parameters based on the training arguments. + + Args: + args (`paddlenlp.training_args.TrainingArguments`): + The training arguments for the training session. + + """ + # optimizer_kwargs = {"lr": args.learning_rate} + optimizer_kwargs = {} + adam_kwargs = { + "beta1": args.adam_beta1, + "beta2": args.adam_beta2, + "epsilon": args.adam_epsilon, + } + if args.optim == OptimizerNames.ADAMW: + from paddle.optimizer import AdamW + + optimizer_cls = AdamW + optimizer_kwargs.update(adam_kwargs) + else: + raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}") + return optimizer_cls, optimizer_kwargs + + def create_scheduler(self, num_training_steps: int): + """ + Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or + passed as an argument. + + Args: + num_training_steps (int): The number of training steps to do. + """ + warmup = ( + self.args.warmup_steps if self.args.warmup_steps > 0 else int(self.args.warmup_ratio * num_training_steps) + ) + decay_steps = num_training_steps + if getattr(self.args, "decay_steps", None) and self.args.decay_steps > 0: + decay_steps = self.args.decay_steps + + if self.lr_scheduler is None: + self.lr_scheduler = get_scheduler( + self.args.lr_scheduler_type, + learning_rate=self.args.learning_rate, + num_warmup_steps=warmup, + num_training_steps=decay_steps, + num_cycles=self.args.num_cycles, + lr_end=self.args.lr_end, + power=self.args.power, + ) + + return self.lr_scheduler + + def num_examples(self, dataloader: DataLoader) -> int: + """ + Helper to get number of samples in a [`~paddle.io.DataLoader`] by accessing its dataset. 
When + dataloader.dataset does not exist or has no length, estimates as best it can + """ + try: + dataset = dataloader.dataset + # Special case for IterableDatasetShard, we need to dig deeper + if isinstance(dataset, IterableDatasetShard): + return len(dataloader.dataset.dataset) + return len(dataloader.dataset) + except (NameError, AttributeError, TypeError): # no dataset or length, estimate by length of dataloader + return len(dataloader) * self.args.per_device_train_batch_size + + def _decorate_exclude_layers(self, model: nn.Layer): + """ + Exclude layers from the model for paddle.amp.decorate. + Args: + model (`nn.Layer`): The model to exclude layers from. + Returns: + A list of excluded layers. + """ + exclude_layers = [] + return exclude_layers + + def _wrap_model(self, model, training=True): + + # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again + if unwrap_model(model) is not model: + return model + + # Note: in paddle.distributed mode, there's no point in wrapping the model + # inside a DistributedDataParallel as we'll be under `no_grad` anyways. + if not training: + return model + + # Mixed precision training + if training and self.do_grad_scaling: # self.args.fp16_opt_level=="O2": + # model, self.optimizer + decorated = paddle.amp.decorate( + models=model, + optimizers=self.optimizer, + level=self.args.fp16_opt_level, + dtype=self.amp_dtype, + excluded_layers=[QuantizationLinear] + self._decorate_exclude_layers(model), + ) + + if self.optimizer is None: + model = decorated + else: + model, self.optimizer = decorated + + if self.args.world_size == 1: + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) + assert self.optimizer is not None, "optimizer is empty!" + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) + + in_pipeline_parallel_mode = self.args.pipeline_parallel_degree > 1 + in_sharding_parallel_mode = self.sharding is not None + in_tensor_parallel_mode = self.args.tensor_parallel_degree > 1 + in_sep_parallel_mode = self.args.sep_parallel_degree > 1 + in_cp_parallel_mode = self.args.context_parallel_degree > 1 + + # Multi-gpu training + if self.args.world_size > 1 and (not self.args.use_hybrid_parallel): + # MOE use DDP to broadcaset parameters. + ddp_kwargs = {} + if self.args.ddp_find_unused_parameters is not None: + ddp_kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters + elif isinstance(model, PretrainedModel): + # find_unused_parameters breaks checkpointing as per + # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021 + ddp_kwargs["find_unused_parameters"] = not any( + hasattr(m, "enable_recompute") and m.enable_recompute for m in model.sublayers(include_self=True) + ) + else: + ddp_kwargs["find_unused_parameters"] = True + model = paddle.DataParallel(model, **ddp_kwargs) + # Distributed training (should be after fp16 initialization) + + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) + assert self.optimizer is not None, "optimizer is empty!" + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) + + # Pipeline mode + if in_pipeline_parallel_mode: + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use + # hack for pipeline model mini batch to batch + # need batter solution @ZHUI + # make batch_fn compatible for fleet.distributed_model decorate. 
+ prepare_pipeline_inputs_func = ( + model._prepare_pipeline_inputs_func if hasattr(model, "_prepare_pipeline_inputs_func") else None + ) + if isinstance(model, LoRAModel): + model = model.model + model = fleet.distributed_model(model) + if prepare_pipeline_inputs_func is not None: + model._prepare_pipeline_inputs_func = prepare_pipeline_inputs_func + else: + + def _prepare_pipeline_inputs_func(inputs): + first_stage_keys = ["input_ids", "attention_mask", "position_ids"] + last_stage_keys = ["labels"] + + def get_expected_keys(inputs, keys): + ret = tuple([inputs.pop(k) for k in keys if k in inputs]) + if len(ret) == 1: + ret = ret[0] + return ret + + if type(inputs) is dict or type(inputs) is OrderedDict: + return [ + get_expected_keys(inputs, first_stage_keys), + get_expected_keys(inputs, last_stage_keys), + ] + + keys = list(inputs[0].keys()) + inputs_batch = {key: [data.pop(key) for data in inputs] for key in keys} + return [ + get_expected_keys(inputs_batch, first_stage_keys), + get_expected_keys(inputs_batch, last_stage_keys), + ] + + logger.warning( + "Using default prepare pipeline inputs func, only support input_ids and labels as inputs." + ) + model._prepare_pipeline_inputs_func = _prepare_pipeline_inputs_func + + assert self.optimizer is not None, "Pipeline mode need decorate optimizer, pelease init optimizer." + if self.args.amp_master_grad: + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + + # No pipeline mode, sharding only + if not in_pipeline_parallel_mode and in_sharding_parallel_mode: + # Sharded DDP! + if self.args.tensor_parallel_degree > 1: + hcg = fleet.get_hybrid_communicate_group() + assert ( + ShardingOption.SHARD_GRAD_OP in self.args.sharding or ShardingOption.SHARD_OP in self.args.sharding + ), "Only support tensor parallel + sharding stage1/stage2 hybrid parallel now." + model = paddle.distributed.fleet.meta_parallel.TensorParallel(model, hcg, strategy=None) + + if ShardingOption.SHARD_OP in self.args.sharding: + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use + model = fleet.distributed_model(model) + if self.args.amp_master_grad: + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + else: + cpu_offload = ShardingOption.OFFLOAD in self.args.sharding + assert self.optimizer is not None, "optimizer is empty!" + level = None + if ShardingOption.SHARD_GRAD_OP in self.args.sharding: + level = "os_g" + if ShardingOption.FULL_SHARD in self.args.sharding: + level = "p_g_os" + + from paddle.distributed.sharding import group_sharded_parallel + + # add dp_group and exclude_layer params + # https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/distributed/sharding/group_sharded_parallel_cn.html#group-sharded-parallel + extra_kwargs = {} + extra_kwargs["dp_group"] = self.dp_group + extra_kwargs["exclude_layer"] = ["GroupNorm"] + + if self.args.amp_master_grad: + assert ( + self.args.data_parallel_degree == 1 + ), "Sharding stage 2 / Sharding stage 3 main grad is not compatible with dp for now." 
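# Reference sketch for the `level` strings chosen above and passed to
# group_sharded_parallel below (see the Paddle docs linked above for the authoritative
# description; this mapping is a summary for readers, not code the trainer uses):
_SHARDING_LEVEL_MEANING = {
    "os": "stage 1: shard optimizer states only",
    "os_g": "stage 2: shard optimizer states + gradients (ShardingOption.SHARD_GRAD_OP)",
    "p_g_os": "stage 3: shard parameters + gradients + optimizer states (ShardingOption.FULL_SHARD)",
}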
+ mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) + + model, optimizer, _ = group_sharded_parallel( + model, + self.optimizer, + level=level, + scaler=None, + group=self.sharding_group, + offload=cpu_offload, + **extra_kwargs, + ) + if ShardingOption.SHARD_GRAD_OP in self.args.sharding and self.args.amp_master_grad: + assert hasattr(optimizer, "use_main_grad"), ( + "Current installed paddle doesn't support sharding stage 2 with main grad, " + "please upgrade your paddle (using nightly version)." + ) + + if level == "os_g" and "enable_stage2_overlap" in self.args.sharding_parallel_config: + model._set_reduce_overlap(True) + optimizer._set_broadcast_overlap(True, model) + + self.optimizer = optimizer + # pure tesnor parallel mode, no pipeline_parallel, no sharding. + if ( + not in_pipeline_parallel_mode + and not in_sharding_parallel_mode + and (in_tensor_parallel_mode or in_sep_parallel_mode or in_cp_parallel_mode) + ): + if self.args.amp_master_grad: + mix_precision_utils.MixPrecisionLayer(model, dtype=self.amp_dtype) # return value has no use + + model = fleet.distributed_model(model) + assert self.optimizer is not None, "Tensor parallel mode need decorate optimizer, pelease init optimizer." + if self.args.amp_master_grad: + self.optimizer = mix_precision_utils.MixPrecisionOptimizer(self.optimizer) + self.optimizer = fleet.distributed_optimizer(self.optimizer) + + # stage1 has v1 and v2 version + if in_sharding_parallel_mode and ShardingOption.SHARD_OP in self.args.sharding: + if "split_param" in self.args.sharding_parallel_config: + if ( + hasattr(self.optimizer, "_set_all_gather_overlap_forward") + and "enable_stage1_allgather_overlap" in self.args.sharding_parallel_config + ): + self.optimizer._set_all_gather_overlap_forward(True, model) + else: + if ( + hasattr(self.optimizer, "_set_broadcast_overlap") + and "enable_stage1_broadcast_overlap" in self.args.sharding_parallel_config + ): + self.optimizer._set_broadcast_overlap(True, model) + + return model + + def _prepare_input(self, data: Union[paddle.Tensor, Any]) -> Union[paddle.Tensor, Any]: + """ + Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. + """ + if isinstance(data, Mapping): + return type(data)({k: self._prepare_input(v) for k, v in data.items()}) + elif isinstance(data, (tuple, list)): + return type(data)(self._prepare_input(v) for v in data) + elif isinstance(data, paddle.Tensor): + # kwargs = dict(device=self.args.current_device) + # update data type for pure fp16 + if data.place.is_cuda_pinned_place(): + return data.cuda() + return data + # return data.to(**kwargs) + return data + + def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> Dict[str, Union[paddle.Tensor, Any]]: + """ + Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and + handling potential state. + """ + inputs = self._prepare_input(inputs) + if self.args.past_index >= 0 and self._past is not None: + inputs["mems"] = self._past + + return inputs + + def autocast_smart_context_manager(self): + """ + A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired + arguments, depending on the situation. 
+ """ + if self.enable_autocast_context_manager: + custom_black_list = ["reduce_sum", "c_softmax_with_cross_entropy"] + custom_white_list = [] + if self.args.fp16_opt_level == "O2": + # https://github.com/PaddlePaddle/Paddle/blob/eb97f4f0adca40b16a309b927e480178beb8ae96/python/paddle/amp/amp_lists.py#L85-L86 + # the lookup_table is in black_list, but in O2, we need it return fp16 + custom_white_list.extend(["lookup_table", "lookup_table_v2"]) + + if self.args.amp_custom_white_list is not None: + custom_white_list.extend(self.args.amp_custom_white_list) + if self.args.amp_custom_black_list is not None: + custom_black_list.extend(self.args.amp_custom_black_list) + + ctx_manager = autocast( + True, + custom_black_list=set(custom_black_list), + custom_white_list=set(custom_white_list), + level=self.args.fp16_opt_level, + dtype=self.amp_dtype, + ) + else: + ctx_manager = contextlib.nullcontext() if sys.version_info >= (3, 7) else contextlib.suppress() + + return ctx_manager + + def compute_loss(self, model, inputs, return_outputs=False): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + Subclass and override for custom behavior. + """ + if self.criterion is not None: + if "labels" in inputs: + labels = inputs.pop("labels") + elif "start_positions" in inputs and "end_positions" in inputs: + labels = (inputs.pop("start_positions"), inputs.pop("end_positions")) + elif self.args.label_names is not None: + labels = [] + for label in self.label_names: + labels.append(inputs.pop(label)) + labels = tuple(labels) + elif "generator_labels" in inputs: + labels = inputs["generator_labels"] + else: + labels = None + + outputs = model(**inputs) + + if self.criterion is not None: + loss = self.criterion(outputs, labels) + outputs = (loss, outputs) + + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs + if isinstance(outputs, dict): + loss = outputs["loss"] + elif isinstance(outputs, tuple): + loss = outputs[0] + else: + loss = outputs + + return (loss, outputs) if return_outputs else loss + + def _enable_delay_scale_loss(self): + key = "enable_delay_scale_loss" + if self.args.pipeline_parallel_degree > 1: + return key in self.args.pipeline_parallel_config + elif self.args.tensor_parallel_degree > 1: + return key in self.args.tensor_parallel_config + else: + return False + + def training_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + """ + Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Layer`): + The model to train. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `paddle.Tensor`: The tensor with training loss on this batch. 
+ """ + if self.args.pipeline_parallel_degree > 1: + return self.training_pipeline_step(model, inputs) + + model.train() + inputs = self._prepare_inputs(inputs) + with self.autocast_smart_context_manager(): + loss = self.compute_loss(model, inputs) + + if self.args.gradient_accumulation_steps > 1 and not self._enable_delay_scale_loss(): + loss = loss / self.args.gradient_accumulation_steps + + if self.do_grad_scaling: + self.scaler.scale(loss).backward() + else: + loss.backward() + return loss.detach() + + def training_pipeline_step(self, model: nn.Layer, inputs: Dict[str, Union[paddle.Tensor, Any]]) -> paddle.Tensor: + """ + Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Layer`): + The model to train. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `paddle.Tensor`: The tensor with training loss on this batch. + """ + # accumulation data + if not hasattr(self, "_pp_data_buffer"): + self._pp_data_buffer = [] + self._pp_data_buffer.append(inputs) + if len(self._pp_data_buffer) != self.args.gradient_accumulation_steps: + return paddle.zeros([]) + + # for v in self._pp_data_buffer[0].values(): + # assert isinstance(v, paddle.Tensor), f"Only support tensor as pipeline mode input, got type {type(v)}" + + inputs = model._prepare_pipeline_inputs_func(self._pp_data_buffer) + self._pp_data_buffer = [] + + model.train() + # hack pipeline-layers + # since the pipeline layer will check input is valid every iter. + # in same case, for example, batch size warmup, we need dynamic change gradient_accumulation_steps to implement. + config_backup = model.micro_batch_size, model.accumulate_steps + model.micro_batch_size = self.args.per_device_train_batch_size + model.accumulate_steps = self.args.gradient_accumulation_steps + + if model._dp_comm_overlap or model._sharding_comm_overlap: + for _, buffers in model._chunk_2_comm_buffers.items(): + for buffer in buffers: + buffer._acc_steps = self.args.gradient_accumulation_steps + + inputs = model._prepare_training( + inputs, self.optimizer, self.lr_scheduler + ) # None, None => [optimizer, lr_scheduler] + model.optimizer = None # we do not use `PipelineParallel` to handler optimizer step + model.lr_scheduler = None + + with self.autocast_smart_context_manager(): + loss = model.forward_backward_pipeline(inputs, self.scaler if self.do_grad_scaling else None) + + model.micro_batch_size, model.accumulate_steps = config_backup + + return loss.detach() + + def save_model(self, output_dir: Optional[str] = None, merge_tensor_parallel: Optional[bool] = False): + """ + Will save the model, so you can reload it using `from_pretrained()`. + + Will only save from the main process. 
+ """ + + if output_dir is None: + output_dir = self.args.output_dir + + if ShardingOption.FULL_SHARD in self.args.sharding: + self.model_wrapped.get_all_parameters(convert2cpu=True) + + if self.args.should_save_model_state: + unified_checkpoint_config_backup = self.args.unified_checkpoint_config + # backup and remove unified_checkpoint_config for not trine stage + if not self.is_in_train: + self.args.unified_checkpoint_config = [] + + self._save(output_dir=output_dir, merge_tensor_parallel=merge_tensor_parallel) + + # recover unified_checkpoint_config for not trine stage + if not self.is_in_train: + self.args.unified_checkpoint_config = unified_checkpoint_config_backup + else: + if self.args.unified_checkpoint and "async_save" in self.args.unified_checkpoint_config: + if self.is_in_train: + global_rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1 else -1 + paddle.save(global_rank, os.path.join(output_dir, f".model_weight.done.{global_rank}")) + + if strtobool(os.getenv("FLAG_LLM_PDC", "False")): + # save model_done file to ensure model is complete + if ( + self.args.should_save_model_state + and self.args.should_save + and not ("async_save" in self.args.unified_checkpoint_config) + ): + # For ckpt integrity + paddle.save(self.state.global_step, os.path.join(output_dir, ".model_done")) + if ( + self.args.unified_checkpoint + and "async_save" in self.args.unified_checkpoint_config + and not self.is_in_train + ): + global_rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1 else -1 + paddle.save(self.state.global_step, os.path.join(output_dir, f".model_weight.done.{global_rank}")) + + def _filter_moe_no_sync_optimizer_params(self): + """ + filter optimizer params which should not sync + """ + state_dict = self.model.state_dict() + optimzier_state_dict = self.optimizer.state_dict() + filter_optimzier_state_dict = OrderedDict() + param_names_in_master_weights = list(optimzier_state_dict["master_weights"].keys()) if self.args.bf16 else [] + filter_optimzier_state_dict["master_weights"] = OrderedDict() + for k, v in state_dict.items(): + if getattr(v, "no_sync", False): + if v.name in param_names_in_master_weights: + filter_optimzier_state_dict["master_weights"][v.name] = optimzier_state_dict["master_weights"][ + v.name + ] + for op_k, op_v in optimzier_state_dict.items(): + if op_k.startswith(v.name): + filter_optimzier_state_dict[op_k] = op_v + return filter_optimzier_state_dict + + def _save_checkpoint(self, model, metrics=None): + # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" + self.runtime_timer.start("checkpoint saving time") + + # Save model checkpoint + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + run_dir = self.args.output_dir + + output_dir = os.path.join(run_dir, checkpoint_folder) + + if isinstance(self.model, LoRAModel) and (self.model.quantized or self.args.pipeline_parallel_degree > 1): + self.save_model(output_dir) + elif isinstance(self.model, LoRAModel) or isinstance(self.model, PrefixModelForCausalLM): + self.save_model(output_dir, True) + else: + self.save_model(output_dir) + + # only save model state dict, ignore optimizer and scheduler + if not self.args.ignore_save_lr_and_optim: + optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) + saved_signal_path = os.path.join(output_dir, f"saved_signal_{dist.get_rank()}") + + if self.args.use_hybrid_parallel: + if self.dp_group.rank <= 0 or 
self.args.use_expert_parallel: + os.makedirs(output_dir, exist_ok=True) + logger.info("Saving optimizer files.") + if self.args.unified_checkpoint: + self.unified_checkpoint_handler.save_unified_optimizer( + self.model, + self.optimizer, + output_dir, + ) + else: + if self.dp_group.rank > 0: # this should only work for MoE saving + self._save_ckpt_func( + self._filter_moe_no_sync_optimizer_params(), + os.path.join(output_dir, optimizer_name), + ) + else: + state_dict = self.optimizer.state_dict() + save_path = os.path.join(output_dir, optimizer_name) + if self.args.use_async_save: + assert not strtobool(os.getenv("FLAG_LLM_PDC", "False")), "Dont support FLAG_LLM_PDC" + self._async_optimizer_saver.run( + state_dict, save_path, saved_signal_path=saved_signal_path + ) + else: + self._save_ckpt_func(state_dict, save_path) + with open(saved_signal_path, mode="w+") as f: + f.write("1") + + if self.args.should_save or self.args.use_expert_parallel: + if not self.args.use_hybrid_parallel: + logger.info("Saving optimizer files.") + if self.args.unified_checkpoint: + self.unified_checkpoint_handler.save_unified_optimizer( + self.model, + self.optimizer, + output_dir, + ) + else: + if self.args.data_parallel_rank > 0 and self.args.use_expert_parallel: + self._save_ckpt_func( + self._filter_moe_no_sync_optimizer_params(), os.path.join(output_dir, OPTIMIZER_NAME) + ) + else: + self._save_ckpt_func(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + + # FIXME: maybe only save one copy + paddle.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + + if self.do_grad_scaling: + paddle.save(self.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) + + self.runtime_timer.stop() + # Determine the new best metric / best model checkpoint + if metrics is not None and self.args.metric_for_best_model is not None: + metric_to_check = self.args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics[metric_to_check] + + operator = np.greater if self.args.greater_is_better else np.less + if ( + self.state.best_metric is None + or self.state.best_model_checkpoint is None + or operator(metric_value, self.state.best_metric) + ): + self.state.best_metric = metric_value + self.state.best_model_checkpoint = output_dir + + # Save the Trainer state + if self.args.should_save: + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + + # Save RNG state in non-distributed training + rng_states = { + "python": random.getstate(), + "numpy": np.random.get_state(), + "cuda": paddle.get_rng_state(), + "cpu": paddle.framework.core.default_cpu_generator().get_state(), + } + if self.args.use_hybrid_parallel: + rng_states[ + "hybrid_parallel_rng_state_tracker" + ] = fleet.meta_parallel.get_rng_state_tracker().get_states_tracker() + + if self.args.world_size > 1: + rng_states_list = [] + paddle.distributed.all_gather_object(rng_states_list, rng_states) + if self.args.should_save: + os.makedirs(output_dir, exist_ok=True) + paddle.save(rng_states_list, os.path.join(output_dir, f"rng_state_{self.args.world_size}.pth")) + else: + os.makedirs(output_dir, exist_ok=True) + paddle.save(rng_states, os.path.join(output_dir, "rng_state.pth")) + + # Maybe delete some older checkpoints. + # For hybrid parallel training, the checkpoint files maybe on different node. 
+ need_to_rotate_checkpoints = False + if self.args.use_hybrid_parallel: + if self.dp_group.rank <= 0: + need_to_rotate_checkpoints = True + else: + need_to_rotate_checkpoints = self.args.should_save_model_state + + # Delete only by one process + need_to_rotate_checkpoints = need_to_rotate_checkpoints and self.args.local_rank == 0 + if need_to_rotate_checkpoints: + self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + + if strtobool(os.getenv("FLAG_LLM_PDC", "False")) and not ("async_save" in self.args.unified_checkpoint_config): + # save checkpoint_done file to ensure checkpoint is complete + if self.args.should_save_model_state and self.args.should_save: + # For ckpt integrity + paddle.save(self.state.global_step, os.path.join(output_dir, ".checkpoint_done")) + + def set_optimizer_grouped_parameters(self, optimizer_grouped_parameters=None): + """ + set optimizer grouped parameters: + + you can set optimizer_grouped_parameters with whatever argments on whatever parameters to train. + """ + self.optimizer_grouped_parameters = optimizer_grouped_parameters + + def disable_autocast_context_manager(self): + """ + For pure fp16 or pure bf16 training, the paddle.amp.autocast is annoy for always cast fp32 to fp16. + if you networks cast fp16 to fp32 manually to get higher precision, autocast make it not work, since it cast fp32 to fp16 back. + + """ + assert self.args.fp16_opt_level == "O2", "disable_autocast_context_manager should only work for pure fp16/bf16" + self.enable_autocast_context_manager = False + + def _sorted_checkpoints( + self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False + ) -> List[str]: + ordering_and_checkpoint_path = [] + + glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*")] + + for path in glob_checkpoints: + if use_mtime: + ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) + else: + regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path) + if regex_match is not None and regex_match.groups() is not None: + ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) + + checkpoints_sorted = sorted(ordering_and_checkpoint_path) + checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] + # Make sure we don't delete the best model. + if self.state.best_model_checkpoint is not None: + best_model_index = checkpoints_sorted.index(str(Path(self.state.best_model_checkpoint))) + for i in range(best_model_index, len(checkpoints_sorted) - 2): + checkpoints_sorted[i], checkpoints_sorted[i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i] + return checkpoints_sorted + + def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None: + if self.args.save_total_limit is None or self.args.save_total_limit <= 0: + return + + # Check if we should delete older checkpoint(s) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir) + if len(checkpoints_sorted) <= self.args.save_total_limit: + return + + # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which + # we don't do to allow resuming. 
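[Editorial aside, not part of the patched trainer.py] The rotation rule implemented just below (sort `checkpoint-<step>` folders by step number, keep the newest `save_total_limit`, and never delete the best checkpoint) can be sketched in isolation. This is an illustrative snippet under the directory-naming convention used above, not the trainer's actual implementation:

import re
import shutil
from pathlib import Path


def rotate_checkpoints(output_dir, save_total_limit, best_checkpoint=None, prefix="checkpoint"):
    # Sort checkpoint-<step> directories by their step number.
    found = []
    for path in Path(output_dir).glob(f"{prefix}-*"):
        match = re.match(rf".*{prefix}-([0-9]+)", str(path))
        if match:
            found.append((int(match.group(1)), str(path)))
    ordered = [p for _, p in sorted(found)]

    # With save_total_limit=1 and a tracked best checkpoint that is not the
    # newest one, keep two so that resuming from the latest stays possible.
    limit = save_total_limit
    if best_checkpoint is not None and limit == 1 and ordered and ordered[-1] != best_checkpoint:
        limit = 2

    # Delete the oldest surplus checkpoints, never touching the best one.
    for stale in ordered[: max(0, len(ordered) - limit)]:
        if stale != best_checkpoint:
            shutil.rmtree(stale, ignore_errors=True)
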
+ save_total_limit = self.args.save_total_limit + if ( + self.state.best_model_checkpoint is not None + and self.args.save_total_limit == 1 + and checkpoints_sorted[-1] != self.state.best_model_checkpoint + ): + save_total_limit = 2 + + number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit) + checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] + for checkpoint in checkpoints_to_be_deleted: + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + # ignore_errors for shared disks between train nodes. + shutil.rmtree(checkpoint, ignore_errors=True) + + def _save(self, output_dir: Optional[str] = None, state_dict=None, merge_tensor_parallel=False): + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + + if self.args.should_save: + if self.tokenizer is not None: + self.tokenizer.save_pretrained(output_dir) + # Good practice: save your training arguments together with the trained model + paddle.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + + if self.args.unified_checkpoint: + self.unified_checkpoint_handler.save_unified_checkpoint(self.model, self.optimizer, output_dir) + return + + merge_tensor_parallel = merge_tensor_parallel and self.args.use_hybrid_parallel + # peft model + if ( + isinstance(self.model, LoRAModel) + or isinstance(self.model, PrefixModelForCausalLM) + or isinstance(self.model, VeRAModel) + ): + self.model.save_pretrained( + output_dir, + variant=self.args.weight_name_suffix, + save_function=self._save_ckpt_func, + merge_tensor_parallel=merge_tensor_parallel, + is_main_process=self.args.should_save, + max_shard_size="1024GB", + ) + # TODO: @ZHUI unifiy unwrap_model(self.model) and self.model + elif not isinstance(self.model, PretrainedModel): + if isinstance(unwrap_model(self.model), PretrainedModel): + if self.args.should_save_sharding_stage1_model: + config_to_save = None + state_dict, config_to_save, weight_name_suffix = self.sharding_io.manipulate_state_dict_and_config( + unwrap_model(self.model), merge_tensor_parallel=merge_tensor_parallel + ) + unwrap_model(self.model).save_pretrained( + output_dir, + state_dict=state_dict, + config_to_save=config_to_save, + merge_tensor_parallel=merge_tensor_parallel, + variant=weight_name_suffix, + save_function=self._save_ckpt_func, + is_main_process=self.args.should_save, + max_shard_size="1024GB", + ) + else: + unwrap_model(self.model).save_pretrained( + output_dir, + merge_tensor_parallel=merge_tensor_parallel, + variant=self.args.weight_name_suffix, + save_function=self._save_ckpt_func, + is_main_process=self.args.should_save, + max_shard_size="1024GB", + ) + else: + logger.info("Trainer.model is not a `PretrainedModel`, only saving its state dict.") + if merge_tensor_parallel: + logger.warning("Trainer.model is not a `PretrainedModel`, not suppor for merge_tensor_parallel.") + if state_dict is None: + state_dict = self.model.state_dict() + + if self.args.should_save_sharding_stage1_model: + state_dict, _, _ = self.sharding_io.manipulate_state_dict_and_config( + unwrap_model(self.model), merge_tensor_parallel=False, state_dict=state_dict + ) + variant = _add_variant(PADDLE_WEIGHTS_NAME, self.args.sharded_name_suffix()) + else: + variant = _add_variant(PADDLE_WEIGHTS_NAME, 
self.args.weight_name_suffix) + + self._save_ckpt_func(state_dict, os.path.join(output_dir, variant)) + else: + if isinstance(self.model, PretrainedModel) and self.args.should_save_sharding_stage1_model: + config_to_save = None + state_dict, config_to_save, weight_name_suffix = self.sharding_io.manipulate_state_dict_and_config( + self.model, merge_tensor_parallel=merge_tensor_parallel + ) + self.model.save_pretrained( + output_dir, + state_dict=state_dict, + config_to_save=config_to_save, + merge_tensor_parallel=merge_tensor_parallel, + variant=weight_name_suffix, + save_function=self._save_ckpt_func, + is_main_process=self.args.should_save, + max_shard_size="1024GB", + ) + else: + self.model.save_pretrained( + output_dir, + merge_tensor_parallel=merge_tensor_parallel, + variant=self.args.weight_name_suffix, + save_function=self._save_ckpt_func, + is_main_process=self.args.should_save, + max_shard_size="1024GB", + ) + if self.args.should_save_sharding_stage1_model: + self.sharding_io.save_distributed_model_meta(output_dir) + + def _load_optimizer_and_scheduler(self, checkpoint): + """If optimizer and scheduler states exist, load them.""" + self.runtime_timer.start("checkpoint loading time") + if checkpoint is None: + self.runtime_timer.stop() + return + + logger.info("Loading optimizer and scheduler...") + if (not self.args.should_load_sharding_stage1_model) and self.args.ignore_load_lr_and_optim: + self.runtime_timer.stop() + return + + opt_state_dict = None + if self.args.should_load_sharding_stage1_model: + opt_state_dict = self.sharding_io.load_optimizer_state_with_reshard( + checkpoint, OPTIMIZER_NAME, self.model_wrapped + ) + else: + use_unified_checkpoint = False + if self.args.unified_checkpoint: + if self.is_unified_checkpoint(checkpoint): + use_unified_checkpoint = True + else: + logger.info("Loading checkpoint, the next checkpoint will be saved as unified checkpoint") + + if not use_unified_checkpoint: + if self.args.data_parallel_rank == 0 or self.args.use_expert_parallel: + optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) + path = os.path.join(checkpoint, optimizer_name) + if os.path.isfile(path): + opt_state_dict = paddle.load(path) + else: + opt_state_dict = None + else: + opt_state_dict = self.unified_checkpoint_handler.load_unified_optimizer( + args=self.args, + model=self.model, + optimizer=self.optimizer, + resume_from_checkpoint=checkpoint, + ) + + if self.args.ignore_load_lr_and_optim and opt_state_dict: + tmp = self.optimizer.state_dict() + tmp["master_weights"] = opt_state_dict["master_weights"] + opt_state_dict = tmp + + # broadcast optimizer state in dp group + if self.args.local_rank != -1: + dist.barrier() + if self.args.use_expert_parallel: + opt_state_dict = broadcast_moe_optimizer( + opt_state_dict, + model_state_dict=self.model.state_dict(), + broadcast_dp=not self.args.should_load_sharding_stage1_model, + ) + else: + if not self.args.should_load_sharding_stage1_model: + opt_state_dict = broadcast_dp_optimizer(opt_state_dict) + + if opt_state_dict is not None: + # Load in optimizer and scheduler states + self.optimizer.set_state_dict(opt_state_dict) + else: + optimizer_name = _add_variant(OPTIMIZER_NAME, self.args.optimizer_name_suffix) + raise ValueError(f"optimizer-state-dict not found, opt: {os.path.join(checkpoint, optimizer_name)}.") + + if not self.args.ignore_load_lr_and_optim: + if distributed_isfile(os.path.join(checkpoint, SCHEDULER_NAME)): + self.lr_scheduler.set_state_dict( + 
paddle.load(distributed_file(os.path.join(checkpoint, SCHEDULER_NAME))) + ) + else: + raise ValueError(f"scheduler-file not found, scheduler:{os.path.join(checkpoint, SCHEDULER_NAME)}") + + if self.do_grad_scaling and distributed_isfile(os.path.join(checkpoint, SCALER_NAME)): + self.scaler.load_state_dict( + paddle.load(distributed_file(os.path.join(checkpoint, SCALER_NAME)), return_numpy=True) + ) + self.runtime_timer.stop() + + def log(self, logs: Dict[str, float], **kwargs) -> None: + """ + Log `logs` on the various objects watching training. + + Subclass and override this method to inject custom behavior. + + Args: + logs (`Dict[str, float]`): + The values to log. + """ + + try: + from paddle.distributed.fleet.utils.timer_helper import ( + get_timers as paddle_get_timers, + ) + + paddle_pipeline_timers = paddle_get_timers() + except ImportError: # paddle version too old, timer not support + warnings.warn(f"paddle version:{paddle.__git_commit__} does not support pipeline timer") + paddle_pipeline_timers = None + except AssertionError: + paddle_pipeline_timers = None + kwargs.update(timer=self.timers, paddle_pipeline_timers=paddle_pipeline_timers) + + if self.state.epoch is not None: + logs["progress_or_epoch"] = round(self.state.epoch, 4) + self.state.log_history = [] + self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs, **kwargs) + + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an `datasets.Dataset`, columns not + accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. 
+ """ + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + eval_dataloader = self.get_eval_dataloader(eval_dataset) + start_time = time.time() + + output = self.evaluation_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + max_eval_iters=self.args.max_evaluate_steps, + ) + + total_batch_size = self.args.eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self.log(output.metrics) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return output.metrics + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + max_eval_iters: Optional[int] = -1, + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + if self.args.pipeline_parallel_degree > 1: + # Only accept wrapped model for pipeline_parallel mode + if self.model is self.model_wrapped: + # NOTE(gongenlei): when do_train=False, do_eval=True, we need to wrap model for pipeline + self.model_wrapped = fleet.distributed_model(self.model_wrapped) + model = self.model_wrapped + else: + model = self.model + + if isinstance(dataloader, paddle.io.DataLoader): + batch_size = dataloader.batch_sampler.batch_size + elif isinstance(dataloader, _DataLoaderIterBase): + # support for inner dataloader + batch_size = dataloader._batch_sampler.batch_size + # alias for inner dataloader + dataloader.dataset = dataloader._dataset + else: + raise ValueError("Only support for paddle.io.DataLoader") + + num_samples = None + if max_eval_iters > 0: + # on eval limit steps + num_samples = batch_size * self.args.dataset_world_size * max_eval_iters + if isinstance(dataloader, _DataLoaderIterBase) and isinstance( + dataloader._batch_sampler, NlpDistributedBatchSampler + ): + consumed_samples = ( + ((self.state.global_step) // args.eval_steps) + * max_eval_iters + * args.per_device_eval_batch_size + * args.dataset_world_size + ) + dataloader._batch_sampler.set_epoch(consumed_samples=consumed_samples) + + logger.info(f"***** Running {description} *****") + + if not self.args.distributed_dataloader or ( + self.args.distributed_dataloader and self.args.should_load_dataset + ): + if has_length(dataloader): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + if max_eval_iters > 0: + logger.info(f" Total prediction steps = {max_eval_iters}") + else: + logger.info(f" Total prediction steps = {len(dataloader)}") + else: + logger.info(" Num examples: Unknown") + if max_eval_iters > 0: + logger.info(f" Total prediction steps = {max_eval_iters}") + + logger.info(f" Pre device batch size = {batch_size}") + logger.info(f" Total Batch size = {batch_size * self.args.dataset_world_size}") + + 
model.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = dataloader.dataset + + if args.past_index >= 0: + self._past = None + + # Initialize containers + # losses/preds/labels on GPU (accumulated for eval_accumulation_steps) + losses_host = None + preds_host = None + labels_host = None + # losses/preds/labels on CPU (final containers) + all_losses = None + all_preds = None + all_labels = None + # Will be useful when we have an iterable dataset so don't know its length. + + observed_num_examples = 0 + # Main evaluation loop + losses = [] + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. + batch_size = observed_batch_size + + # Prediction step + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + + # Update containers on host + if loss is not None: + # losses = self._nested_gather(loss.repeat(batch_size)) + losses = self._nested_gather(paddle.tile(loss, repeat_times=[batch_size, 1])) + losses_host = losses if losses_host is None else paddle.concat((losses_host, losses), axis=0) + if labels is not None: + labels = self._pad_across_processes(labels) + labels = self._nested_gather(labels) + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if logits is not None: + logits = self._pad_across_processes(logits) + logits = self._nested_gather(logits) + if self.preprocess_logits_for_metrics is not None: + logits = self.preprocess_logits_for_metrics(logits, labels) + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. 
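[Editorial aside, not part of the patched trainer.py] The accumulation-and-offload idea applied in the next block reduces to a small pattern: keep recent results on the accelerator, and every `eval_accumulation_steps` batches convert them to NumPy and append them to a CPU-side container, flushing the remainder at the end. A simplified standalone sketch, with plain floats and NumPy standing in for device tensors and `nested_numpify`:

import numpy as np


def offload_every_n(batch_losses, eval_accumulation_steps):
    """Accumulate per-batch losses, flushing to a CPU array every N steps."""
    device_buffer = []  # stands in for tensors still on the accelerator
    cpu_losses = None   # final NumPy container

    def flush(buffer, collected):
        if not buffer:
            return collected
        chunk = np.asarray(buffer, dtype=np.float32)
        return chunk if collected is None else np.concatenate((collected, chunk), axis=0)

    for step, loss in enumerate(batch_losses):
        device_buffer.append(loss)
        if (step + 1) % eval_accumulation_steps == 0:
            cpu_losses = flush(device_buffer, cpu_losses)
            device_buffer = []  # start a new accumulation window

    return flush(device_buffer, cpu_losses)  # flush whatever remains


print(offload_every_n([0.9, 0.8, 0.7, 0.6, 0.5], eval_accumulation_steps=2))
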
+ if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = ( + labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + ) + + # Set back to None to begin a new accumulation + losses_host, preds_host, labels_host = None, None, None + + if max_eval_iters > 0 and step >= max_eval_iters - 1: + break + + # Gather all remaining tensors and put them back on the CPU + if losses_host is not None: + losses = nested_numpify(losses_host) + all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) + if preds_host is not None: + logits = nested_numpify(preds_host) + all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100) + if labels_host is not None: + labels = nested_numpify(labels_host) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) + + # Number of samples + if num_samples is not None: + pass + elif has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and hasattr(eval_dataset, "num_examples"): + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + + # Number of losses has been rounded to a multiple of batch_size and in a distributed training, the number of + # samplers has been rounded to a multiple of batch_size, so we truncate. + if all_losses is not None: + all_losses = all_losses[: num_samples * int(self.args.world_size / self.args.dataset_world_size)] + if all_preds is not None: + all_preds = nested_truncate( + all_preds, num_samples * int(self.args.world_size / self.args.dataset_world_size) + ) + if all_labels is not None: + all_labels = nested_truncate( + all_labels, num_samples * int(self.args.world_size / self.args.dataset_world_size) + ) + + model.train() + + # Metrics! + if self.compute_metrics is not None and all_preds is not None and all_labels is not None: + metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels)) + else: + metrics = {} + + if all_losses is not None: + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) + + def predict( + self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test" + ) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + Depending on the dataset and your use case, your test dataset may contain labels. 
In that case, this method + will also return metrics, like in `evaluate()`. + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"test"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "test_bleu" if the prefix is "test" (default) + + If your predictions or labels have different sequence length (for instance because you're doing dynamic padding + in a token classification task) the predictions will be padded (on the right) to allow for concatenation into + one array. The padding index is -100. + + Returns: *NamedTuple* A namedtuple with the following keys: + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). + """ + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + test_dataloader = self.get_test_dataloader(test_dataset) + start_time = time.time() + + eval_loop = self.evaluation_loop + output = eval_loop( + test_dataloader, + description="Prediction", + ignore_keys=ignore_keys, + prediction_loss_only=True if self.compute_metrics is None else None, + metric_key_prefix=metric_key_prefix, + max_eval_iters=self.args.max_evaluate_steps, + ) + total_batch_size = self.args.per_device_eval_batch_size * self.args.dataset_world_size + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics) + + def prediction_pipeline_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: + """ + prediction_step function for pipeline parallel mode. + """ + if hasattr(model, "_prepare_pipeline_inputs_func"): + inputs, labels = model._prepare_pipeline_inputs_func(inputs) + has_labels = labels is not None + else: + has_labels = all(inputs.get(k) is not None for k in self.label_names) + inputs = self._prepare_inputs(inputs) + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. 
+ if has_labels: + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + inputs = inputs.pop("input_ids") + + with paddle.no_grad(): + if has_labels: + with self.autocast_smart_context_manager(): + loss = model.eval_batch([inputs, labels], compute_loss=True) + # loss, outputs = self.compute_loss(model, inputs, return_outputs=True) + loss = loss.mean().detach() + else: + raise ValueError("pipeline mode eval need label!") + + return (loss, None, labels) + + def prediction_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Layer`): + The model to evaluate. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`Lst[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[paddle.Tensor], Optional[paddle.Tensor], Optional[paddle.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + if self.args.pipeline_parallel_degree > 1: + # hack for pipeline mode + inputs = self._prepare_inputs(inputs) + return self.prediction_pipeline_step(model, inputs, prediction_loss_only, ignore_keys) + + has_labels = all(inputs.get(k) is not None for k in self.label_names) + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. + if has_labels: + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with paddle.no_grad(): + if has_labels: + with self.autocast_smart_context_manager(): + loss, outputs = self.compute_loss(model, inputs, return_outputs=True) + loss = loss.mean().detach() + + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) + else: + logits = outputs[1:] + else: + loss = None + with self.autocast_smart_context_manager(): + outputs = model(**inputs) + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) + else: + logits = outputs + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + if prediction_loss_only: + return (loss, None, None) + + logits = nested_detach(logits) + if isinstance(logits, (list, tuple)) and len(logits) == 1: + logits = logits[0] + + return (loss, logits, labels) + + def is_local_process_zero(self) -> bool: + """ + Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several + machines) main process. 
+ """ + return self.args.local_process_index == 0 + + def is_world_process_zero(self) -> bool: + """ + Whether or not this process is the global main process (when training in a distributed fashion on several + machines, this is only going to be `True` for one process). + """ + return self.args.process_index == 0 + + def _nested_gather(self, tensors): + """ + Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before + concatenating them to `gathered` + """ + if tensors is None: + return + if self.args.local_rank != -1: + tensors = distributed_concat(tensors) + return tensors + + # Copied from Accelerate. + + def _pad_across_processes(self, tensor, pad_index=-100): + """ + Recursively pad the tensors in a nested list/tuple/dictionary of tensors from all devices to the same size so + they can safely be gathered. + """ + if isinstance(tensor, (list, tuple)): + return type(tensor)(self._pad_across_processes(t, pad_index=pad_index) for t in tensor) + elif isinstance(tensor, dict): + return type(tensor)({k: self._pad_across_processes(v, pad_index=pad_index) for k, v in tensor.items()}) + elif not isinstance(tensor, paddle.Tensor): + raise TypeError( + f"Can't pad the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors." + ) + + if len(tensor.shape) < 2: + return tensor + # Gather all sizes + size = paddle.to_tensor(tensor.shape)[None] + sizes = self._nested_gather(size).cpu() + + max_size = max(s[1] for s in sizes) + if tensor.shape[1] == max_size: + return tensor + + # Then pad to the maximum size + old_size = tensor.shape + new_size = list(old_size) + new_size[1] = max_size + # new_tensor = tensor.new_zeros(tuple(new_size)) + pad_index + new_tensor = paddle.zeros(tuple(new_size), dtype=tensor.dtype) + pad_index + new_tensor[:, : old_size[1]] = tensor + return new_tensor + + def _set_signature_columns_if_needed(self): + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + signature = inspect.signature(self.model.forward) + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += list(set(["label", "label_ids"] + self.label_names)) + + def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None): + if not self.args.remove_unused_columns: + return dataset + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + signature = inspect.signature(self.model.forward) + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += ["label", "label_ids", "labels", "start_positions", "end_positions"] + + ignored_columns = list(set(dataset.column_names) - set(self._signature_columns)) + if len(ignored_columns) > 0: + dset_description = "" if description is None else f"in the {description} set " + logger.info( + f"The following columns {dset_description} don't have a corresponding argument in " + f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}." + f" If {', '.join(ignored_columns)} are not expected by `{self.model.__class__.__name__}.forward`, " + f" you can safely ignore this message." 
+ ) + + columns = [k for k in self._signature_columns if k in dataset.column_names] + + if version.parse(datasets.__version__) < version.parse("1.4.0"): + dataset.set_format( + type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"] + ) + return dataset + else: + return dataset.remove_columns(ignored_columns) + + def _get_collator_with_removed_columns( + self, data_collator: Callable, description: Optional[str] = None + ) -> Callable: + """Wrap the data collator in a callable removing unused columns.""" + if not self.args.remove_unused_columns: + return data_collator + self._set_signature_columns_if_needed() + signature_columns = self._signature_columns + + remove_columns_collator = RemoveColumnsCollator( + data_collator=data_collator, + signature_columns=signature_columns, + logger=logger, + description=description, + model_name=self.model.__class__.__name__, + ) + return remove_columns_collator + + def _is_iterable_dataset(self, dataset): + return isinstance(dataset, paddle.io.IterableDataset) + + def _is_iterable_dataset_distributed(self, dataset): + # For distributed dataloaer. + is_iterable_dataset_tensor = paddle.to_tensor(self._is_iterable_dataset(dataset)).reshape([1]) + if dist.get_world_size() > 1: + dist.all_reduce(is_iterable_dataset_tensor, op=dist.ReduceOp.MAX) + if is_iterable_dataset_tensor.item() == 1: + return True + return False + + def print_config(self, args=None, key=""): + """ + print config values + """ + logger.debug("=" * 60) + if args is None: + args = self.args + key = "Training" + import paddlenlp + + logger.debug("{:^40}".format("{} Configuration Arguments".format(key))) + logger.debug("{:30}: {}".format("paddle commit id", paddle.version.commit)) + logger.debug("{:30}: {}".format("paddlenlp commit id", paddlenlp.version.commit)) + + for a in dir(args): + if a[:2] != "__": # don't print double underscore methods + v = getattr(args, a) + if not isinstance(v, types.MethodType): + logger.debug("{:30}: {}".format(a, v)) + + logger.debug("") + + def is_unified_checkpoint(self, resume_from_checkpoint, safe_serialization=True): + is_unified_checkpoint_type = False + if isinstance(self.model, LoRAModel) or isinstance(self.model, PrefixModelForCausalLM): + weights_index_name = ( + PADDLE_PEFT_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_PEFT_WEIGHTS_INDEX_NAME + ) + else: + weights_index_name = PADDLE_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME + master_weights_index_name = ( + PADDLE_MASTER_WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_MASTER_WEIGHTS_INDEX_NAME + ) + weights_index_file = os.path.join( + resume_from_checkpoint, + weights_index_name, + ) + master_weights_index_file = os.path.join( + resume_from_checkpoint, + master_weights_index_name, + ) + + if distributed_isfile(weights_index_file) or distributed_isfile(master_weights_index_file): + is_unified_checkpoint_type = True + + return is_unified_checkpoint_type diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_callback.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_callback.py new file mode 100644 index 000000000..b263c7930 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_callback.py @@ -0,0 +1,596 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers/trainer_callback.py +""" +Callbacks to use with the Trainer class and customize the training loop. +""" +import dataclasses +import json +from dataclasses import dataclass +from typing import Dict, List, Optional, Union + +import numpy as np +from tqdm.auto import tqdm + +from paddlenlp.utils.log import logger + +from .trainer_utils import IntervalStrategy, has_length +from .training_args import TrainingArguments + +__all__ = [ + "TrainerState", + "TrainerControl", + "TrainerCallback", + "CallbackHandler", + "DefaultFlowCallback", + "ProgressCallback", + "PrinterCallback", + "EarlyStoppingCallback", +] + + +@dataclass +class TrainerState: + """ + A class containing the [`Trainer`] inner state that will be saved along the model and optimizer when checkpointing + and passed to the [`TrainerCallback`]. + + + + In all this class, one step is to be understood as one update step. When using gradient accumulation, one update + step may require several forward and backward passes: if you use `gradient_accumulation_steps=n`, then one update + step requires going through *n* batches. + + + + Args: + epoch (`float`, *optional*): + Only set during training, will represent the epoch the training is at (the decimal part being the + percentage of the current epoch completed). + global_step (`int`, *optional*, defaults to 0): + During training, represents the number of update steps completed. + max_steps (`int`, *optional*, defaults to 0): + The number of update steps to do during the current training. + total_flos (`float`, *optional*, defaults to 0): + The total number of floating operations done by the model since the beginning of training (stored as floats + to avoid overflow). + log_history (`List[Dict[str, float]]`, *optional*): + The list of logs done since the beginning of training. + best_metric (`float`, *optional*): + When tracking the best model, the value of the best metric encountered so far. + best_model_checkpoint (`str`, *optional*): + When tracking the best model, the value of the name of the checkpoint for the best model encountered so + far. + is_local_process_zero (`bool`, *optional*, defaults to `True`): + Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on + several machines) main process. + is_world_process_zero (`bool`, *optional*, defaults to `True`): + Whether or not this process is the global main process (when training in a distributed fashion on several + machines, this is only going to be `True` for one process). 
+ """ + + epoch: Optional[float] = None + global_step: int = 0 + max_steps: int = 0 + num_train_epochs: int = 0 + total_flos: float = 0 + log_history: List[Dict[str, float]] = None + best_metric: Optional[float] = None + best_model_checkpoint: Optional[str] = None + is_local_process_zero: bool = True + is_world_process_zero: bool = True + trial_name: str = None + trial_params: Dict[str, Union[str, float, int, bool]] = None + + def __post_init__(self): + if self.log_history is None: + self.log_history = [] + + def save_to_json(self, json_path: str): + """Save the content of this instance in JSON format inside `json_path`.""" + json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n" + with open(json_path, "w", encoding="utf-8") as f: + f.write(json_string) + + @classmethod + def load_from_json(cls, json_path: str): + """Create an instance from the content of `json_path`.""" + with open(json_path, "r", encoding="utf-8") as f: + text = f.read() + return cls(**json.loads(text)) + + +@dataclass +class TrainerControl: + """ + A class that handles the [`Trainer`] control flow. This class is used by the [`TrainerCallback`] to activate some + switches in the training loop. + + Args: + should_training_stop (`bool`, *optional*, defaults to `False`): + Whether or not the training should be interrupted. + + If `True`, this variable will not be set back to `False`. The training will just stop. + should_epoch_stop (`bool`, *optional*, defaults to `False`): + Whether or not the current epoch should be interrupted. + + If `True`, this variable will be set back to `False` at the beginning of the next epoch. + should_save (`bool`, *optional*, defaults to `False`): + Whether or not the model should be saved at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + should_evaluate (`bool`, *optional*, defaults to `False`): + Whether or not the model should be evaluated at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + should_log (`bool`, *optional*, defaults to `False`): + Whether or not the logs should be reported at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + """ + + should_training_stop: bool = False + should_epoch_stop: bool = False + should_save: bool = False + should_evaluate: bool = False + should_log: bool = False + + def _new_training(self): + """Internal method that resets the variable for a new training.""" + self.should_training_stop = False + + def _new_epoch(self): + """Internal method that resets the variable for a new epoch.""" + self.should_epoch_stop = False + + def _new_step(self): + """Internal method that resets the variable for a new step.""" + self.should_save = False + self.should_evaluate = False + self.should_log = False + + +class TrainerCallback: + """ + A class for objects that will inspect the state of the training loop at some events and take some decisions. At + each of those events the following arguments are available: + + Args: + args ([`TrainingArguments`]): + The training arguments used to instantiate the [`Trainer`]. + state ([`TrainerState`]): + The current state of the [`Trainer`]. + control ([`TrainerControl`]): + The object that is returned to the [`Trainer`] and can be used to make some decisions. + model ([`PreTrainedModel`] or `paddle.nn.Layer`): + The model being trained. + tokenizer ([`PreTrainedTokenizer`]): + The tokenizer used for encoding the data. 
+ optimizer (`paddle.optimizer.Optimizer`): + The optimizer used for the training steps. + lr_scheduler (`paddle.optimizer.lr.LRScheduler`): + The scheduler used for setting the learning rate. + train_dataloader (`paddle.io.DataLoader`, *optional*): + The current dataloader used for training. + eval_dataloader (`paddle.io.DataLoader`, *optional*): + The current dataloader used for training. + metrics (`Dict[str, float]`): + The metrics computed by the last evaluation phase. + + Those are only accessible in the event `on_evaluate`. + logs (`Dict[str, float]`): + The values to log. + + Those are only accessible in the event `on_log`. + + The `control` object is the only one that can be changed by the callback, in which case the event that changes it + should return the modified version. + + The argument `args`, `state` and `control` are positionals for all events, all the others are grouped in `kwargs`. + You can unpack the ones you need in the signature of the event using them. As an example, see the code of the + simple [`~transformer.PrinterCallback`]. + + Example: + + ```python + class PrinterCallback(TrainerCallback): + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + logger.info(logs) + ```""" + + def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of the initialization of the [`Trainer`]. + """ + pass + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of training. + """ + pass + + def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of training. + """ + pass + + def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of an epoch. + """ + pass + + def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of an epoch. + """ + pass + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + pass + + def on_load_data_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + pass + + def on_optimizer_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + pass + + def on_optimizer_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + pass + + def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of an substep during gradient accumulation. + """ + pass + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + pass + + def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after an evaluation phase. 
+ """ + pass + + def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after a checkpoint save. + """ + pass + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after logging the last logs. + """ + pass + + def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after a prediction step. + """ + pass + + +class CallbackHandler(TrainerCallback): + """Internal class that just calls the list of callbacks in order.""" + + def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): + self.callbacks = [] + for cb in callbacks: + self.add_callback(cb) + self.model = model + self.tokenizer = tokenizer + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + self.train_dataloader = None + self.eval_dataloader = None + + if not any(isinstance(cb, DefaultFlowCallback) for cb in self.callbacks): + logger.warning( + "The Trainer will not work properly if you don't have a `DefaultFlowCallback` in its callbacks. You\n" + + "should add one before training with `trainer.add_callback(DefaultFlowCallback). The current list of" + + "callbacks is\n:" + + self.callback_list + ) + + def add_callback(self, callback): + cb = callback() if isinstance(callback, type) else callback + cb_class = callback if isinstance(callback, type) else callback.__class__ + if cb_class in [c.__class__ for c in self.callbacks]: + logger.warning( + f"You are adding a {cb_class} to the callbacks of this Trainer, but there is already one. The current" + + "list of callbacks is\n:" + + self.callback_list + ) + self.callbacks.append(cb) + + def pop_callback(self, callback): + if isinstance(callback, type): + for cb in self.callbacks: + if isinstance(cb, callback): + self.callbacks.remove(cb) + return cb + else: + for cb in self.callbacks: + if cb == callback: + self.callbacks.remove(cb) + return cb + + def remove_callback(self, callback): + if isinstance(callback, type): + for cb in self.callbacks: + if isinstance(cb, callback): + self.callbacks.remove(cb) + return + else: + self.callbacks.remove(callback) + + @property + def callback_list(self): + return "\n".join(cb.__class__.__name__ for cb in self.callbacks) + + def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_init_end", args, state, control) + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_training_stop = False + return self.call_event("on_train_begin", args, state, control) + + def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_train_end", args, state, control) + + def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_epoch_stop = False + return self.call_event("on_epoch_begin", args, state, control) + + def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_epoch_end", args, state, control) + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_log = False + control.should_evaluate = False + control.should_save = False + return self.call_event("on_step_begin", args, state, control) + + def on_load_data_end(self, args: TrainingArguments, state: 
TrainerState, control: TrainerControl, inputs: Dict): + return self.call_event("on_load_data_end", args, state, control, inputs=inputs) + + def on_optimizer_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, scaler): + return self.call_event("on_optimizer_begin", args, state, control, scaler=scaler) + + def on_optimizer_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, scaler): + return self.call_event("on_optimizer_end", args, state, control, scaler=scaler) + + def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_substep_end", args, state, control) + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_step_end", args, state, control) + + def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics): + control.should_evaluate = False + return self.call_event("on_evaluate", args, state, control, metrics=metrics) + + def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_save = False + return self.call_event("on_save", args, state, control) + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs, **kwargs): + control.should_log = False + return self.call_event("on_log", args, state, control, logs=logs, **kwargs) + + def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_prediction_step", args, state, control) + + def call_event(self, event, args, state, control, **kwargs): + for callback in self.callbacks: + result = getattr(callback, event)( + args, + state, + control, + model=self.model, + tokenizer=self.tokenizer, + optimizer=self.optimizer, + lr_scheduler=self.lr_scheduler, + train_dataloader=self.train_dataloader, + eval_dataloader=self.eval_dataloader, + **kwargs, + ) + # A Callback can skip the return of `control` if it doesn't change it. + if result is not None: + control = result + return control + + +class DefaultFlowCallback(TrainerCallback): + """ + A [`TrainerCallback`] that handles the default flow of the training loop for logs, evaluation and checkpoints. 
+ """ + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # Log + if state.global_step == 1 and args.logging_first_step: + control.should_log = True + if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % args.logging_steps == 0: + control.should_log = True + + # Evaluate + if args.evaluation_strategy == IntervalStrategy.STEPS and state.global_step % args.eval_steps == 0: + control.should_evaluate = True + + # Save + if ( + args.save_strategy == IntervalStrategy.STEPS + and args.save_steps > 0 + and state.global_step % args.save_steps == 0 + ): + control.should_save = True + + # End training + if state.global_step >= state.max_steps: + control.should_training_stop = True + + return control + + def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # Log + if args.logging_strategy == IntervalStrategy.EPOCH: + control.should_log = True + + # Evaluate + if args.evaluation_strategy == IntervalStrategy.EPOCH: + control.should_evaluate = True + + # Save + if args.save_strategy == IntervalStrategy.EPOCH: + control.should_save = True + + return control + + +class ProgressCallback(TrainerCallback): + """ + A [`TrainerCallback`] that displays the progress of training or evaluation. + """ + + def __init__(self): + self.training_bar = None + self.prediction_bar = None + + def on_train_begin(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar = tqdm(total=state.max_steps, desc="TrainProcess") + self.current_step = 0 + + def on_step_end(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar.update(state.global_step - self.current_step) + self.current_step = state.global_step + + def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs): + if state.is_local_process_zero and has_length(eval_dataloader.dataset): + if self.prediction_bar is None: + self.prediction_bar = tqdm( + total=len(eval_dataloader), leave=self.training_bar is None, desc="PredictProcess" + ) + self.prediction_bar.update(1) + + def on_evaluate(self, args, state, control, **kwargs): + if state.is_local_process_zero: + if self.prediction_bar is not None: + self.prediction_bar.close() + self.prediction_bar = None + + def on_log(self, args, state, control, logs=None, **kwargs): + if state.is_local_process_zero and self.training_bar is not None: + _ = logs.pop("total_flos", None) + if type(logs) is dict: + logs_str = ", ".join(f"{k}: {v}" for k, v in logs.items()) + else: + logs_str = str(logs) + logger.info(logs_str) + + def on_train_end(self, args, state, control, **kwargs): + if state.is_local_process_zero: + self.training_bar.close() + self.training_bar = None + + +class PrinterCallback(TrainerCallback): + """ + A bare [`TrainerCallback`] that just prints the logs. + """ + + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + if type(logs) is dict: + logger.info(", ".join(f"{k}: {v}" for k, v in logs.items())) + else: + logger.info(logs) + + +class EarlyStoppingCallback(TrainerCallback): + """ + A [`TrainerCallback`] that handles early stopping. + + Args: + early_stopping_patience (`int`): + Use with `metric_for_best_model` to stop training when the specified metric worsens for + `early_stopping_patience` evaluation calls. 
+ early_stopping_threshold(`float`, *optional*): + Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how much the + specified metric must improve to satisfy early stopping conditions. ` + + This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric + in [`TrainerState`]. + """ + + def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0): + self.early_stopping_patience = early_stopping_patience + self.early_stopping_threshold = early_stopping_threshold + # early_stopping_patience_counter denotes the number of times validation metrics failed to improve. + self.early_stopping_patience_counter = 0 + + def check_metric_value(self, args, state, control, metric_value): + # best_metric is set by code for load_best_model + operator = np.greater if args.greater_is_better else np.less + if state.best_metric is None or ( + operator(metric_value, state.best_metric) + and abs(metric_value - state.best_metric) > self.early_stopping_threshold + ): + self.early_stopping_patience_counter = 0 + else: + self.early_stopping_patience_counter += 1 + + def on_train_begin(self, args, state, control, **kwargs): + assert args.load_best_model_at_end, "EarlyStoppingCallback requires load_best_model_at_end = True" + assert ( + args.metric_for_best_model is not None + ), "EarlyStoppingCallback requires metric_for_best_model is defined" + assert ( + args.evaluation_strategy != IntervalStrategy.NO + ), "EarlyStoppingCallback requires IntervalStrategy of steps or epoch" + + def on_evaluate(self, args, state, control, metrics, **kwargs): + metric_to_check = args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics.get(metric_to_check) + + if metric_value is None: + logger.warning( + f"early stopping required metric_for_best_model, but did not find {metric_to_check} so early stopping is disabled" + ) + return + + self.check_metric_value(args, state, control, metric_value) + if self.early_stopping_patience_counter >= self.early_stopping_patience: + control.should_training_stop = True diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_compress.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_compress.py new file mode 100644 index 000000000..f2f945cd1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_compress.py @@ -0,0 +1,1035 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
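# Illustrative usage sketch only (not part of the upstream file). It shows, with
# hypothetical trainer/argument objects, how the `Trainer.compress` entry point
# patched in at the bottom of this module is typically driven; only argument
# names that this module actually reads are shown.
#
#     compression_args.strategy = "dynabert+ptq"   # prune with DynaBERT, then post-training quantization
#     compression_args.width_mult_list = ["3/4"]   # each entry is eval()'d to a float, here 0.75
#     compression_args.algo_list = ["mse", "KL"]   # PTQ calibration algorithms to grid-search
#     compression_args.batch_size_list = [4, 8]    # PTQ calibration batch sizes to grid-search
#     trainer = Trainer(model=model, args=compression_args, criterion=criterion,
#                       train_dataset=train_ds, eval_dataset=eval_ds)
#     trainer.compress()   # pruned model lands in <output_dir>/width_mult_0.75/,
#                          # quantized sub-models are exported with the "int8" prefix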
+ +import copy +import inspect +import json +import math +import os +import time + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.metric import Accuracy +from paddle.utils import try_import + +from ..data import Pad +from ..metrics import ChunkEvaluator +from ..metrics.squad import compute_prediction, squad_evaluate +from ..transformers import export_model +from ..transformers.model_outputs import BaseModelOutputWithPoolingAndCrossAttentions +from ..transformers.ofa_utils import ( + compute_neuron_head_importance, + encoder_layer_ofa_forward, + encoder_ofa_forward, + mha_ofa_forward, + prepare_qkv_ofa, + reorder_neuron_head, +) +from ..utils.log import logger +from .trainer import Trainer + + +def global_try_import_slim(): + global paddleslim + try_import("paddleslim") + import paddleslim + + +def compress(self, custom_evaluate=None): + """ + Supports pruning DynaBERT and post-training quantization. If both are + needed, pruning DynaBERT would be performed before quantizaton. + """ + args = self.args + self.custom_evaluate = custom_evaluate + if "dynabert" in args.strategy: + global_try_import_slim() + if self.args.width_mult_list is not None: + self.args.width_mult_list = [eval(width_mult) for width_mult in self.args.width_mult_list] + class_name = self.model.__class__.__name__ + if ( + "SequenceClassification" not in class_name + and "TokenClassification" not in class_name + and "QuestionAnswering" not in class_name + ): + assert ( + self.custom_evaluate is not None + ), "Custom model using DynaBERT strategy needs to pass in parameters `custom_evaluate`." + model = copy.deepcopy(self.model) + self.original_model = model + _dynabert(self, self.model) + + del self.original_model + if "ptq" in args.strategy or "qat" in args.strategy: + output_dir_list = [] + for width_mult in args.width_mult_list: + output_dir_width = os.path.join(args.output_dir, "width_mult_" + str(round(width_mult, 2))) + if "ptq" in args.strategy: + output_dir_list += self.quant(output_dir_width, "ptq") + elif "qat" in args.strategy: + self.quant(output_dir_width, "qat") + output_dir_list.append(output_dir_width) + if "embeddings" in args.strategy: + if "ptq" not in args.strategy and "qat" not in args.strategy: + output_dir_list = [] + for width_mult in args.width_mult_list: + output_dir_width = os.path.join( + args.output_dir, "width_mult_" + str(round(width_mult, 2)), args.input_filename_prefix + ) + self.quant(output_dir_width, "embeddings") + else: + for output_dir in output_dir_list: + self.quant(os.path.join(output_dir, args.output_filename_prefix), "embeddings") + + elif "ptq" in args.strategy: + # When input model is an inference model + if args.input_infer_model_path is not None: + model_dir = os.path.dirname(args.input_infer_model_path) + self.args.input_filename_prefix = os.path.basename(args.input_infer_model_path) + output_dir_list = self.quant(model_dir, "ptq") + # Input model is load from Trainer API in dygraph. + else: + # When input model is a dygraph. 
+ # exports model and then do 'ptq' + # Prefix of `export_model` is 'model' + self.args.input_filename_prefix = "model" + input_spec = generate_input_spec(self.model, self.train_dataset, self.args.input_dtype) + input_dir = args.output_dir + export_model(model=self.model, input_spec=input_spec, path=input_dir) + output_dir_list = self.quant(input_dir, "ptq") + if "embeddings" in args.strategy: + for output_dir in output_dir_list: + self.quant(os.path.join(output_dir, args.output_filename_prefix), "embeddings") + elif "qat" in args.strategy: + global_try_import_slim() + self.quant(args.output_dir, "qat") + if "embeddings" in args.strategy: + self.quant(os.path.join(args.output_dir, args.output_filename_prefix), "embeddings") + + +def quant(self, model_dir, strategy): + """ + Supports Post-Training Quantization, Quantization Aware Training and + Embedding Quantization. + """ + if strategy == "ptq": + return _post_training_quantization_grid_search(self, model_dir) + elif strategy == "qat": + _quant_aware_training_dynamic(self, model_dir) + elif strategy == "embeddings": + _quant_embeddings(self, model_dir) + + +def generate_input_spec(model, dataset, input_dtype="int64"): + model_para_keys = inspect.signature(model.forward).parameters.keys() + input_num = 0 + for key in dataset[0].keys(): + if key in model_para_keys and key not in ("labels", "start_positions", "end_positions"): + input_num += 1 + input_spec = [paddle.static.InputSpec(shape=[None, None], dtype=input_dtype) for i in range(input_num)] + return input_spec + + +def _dynabert(self, model): + args = self.args + model = _replace_auto_model_forward(model) + if args.width_mult_list is None: + args.width_mult_list = [0.75] + # Each batch is a dict. + train_dataloader = self.get_train_dataloader() + eval_dataloader = self.get_eval_dataloader(self.eval_dataset) + if "QuestionAnswering" in model.__class__.__name__: + eval_dataloader_with_label = self.get_eval_dataloader(self.eval_examples) + ofa_model, teacher_model = _dynabert_init(self, model, eval_dataloader_with_label) + else: + ofa_model, teacher_model = _dynabert_init(self, model, eval_dataloader) + + # TODO: args.gradient_accumulation_steps + if args.max_steps > 0: + args.num_training_steps = args.max_steps + args.num_train_epochs = math.ceil(args.num_training_steps / len(train_dataloader)) + else: + args.num_training_steps = len(train_dataloader) * args.num_train_epochs + args.num_train_epochs = math.ceil(args.num_train_epochs) + self.create_optimizer_and_scheduler(num_training_steps=args.num_training_steps) + + ofa_model = _dynabert_training( + self, ofa_model, model, teacher_model, train_dataloader, eval_dataloader, args.num_train_epochs + ) + self.reset_optimizer_and_scheduler() + + # Each width_mult best model would be exported. 
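    # (Illustrative note, not part of the upstream file: with the default
    # width_mult_list of [0.75], the best checkpoint from the training loop is
    # saved under <output_dir>/width_mult_0.75/model_state.pdparams, and the
    # export step below reloads it into a statically pruned network before
    # writing the inference program.)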
+ _dynabert_export(self) + + ofa_model, ofa_model.model = _recover_transformer_func(ofa_model, True), _recover_transformer_func( + ofa_model.model, True + ) + ofa_model.model = _recover_auto_model_forward(ofa_model.model) + logger.info("Pruning is finished using DynaBERT strategy.") + + +def _replace_transformer_func(self): + nn.MultiHeadAttention._ori_forward = paddle.nn.MultiHeadAttention.forward + nn.MultiHeadAttention._ori_prepare_qkv = nn.MultiHeadAttention._prepare_qkv + + nn.MultiHeadAttention._forward = mha_ofa_forward + nn.MultiHeadAttention.__prepare_qkv = prepare_qkv_ofa + nn.TransformerEncoder._forward = encoder_ofa_forward + nn.TransformerEncoderLayer._forward = encoder_layer_ofa_forward + + def init_func(layer): + if isinstance(layer, nn.MultiHeadAttention): + layer.forward = layer._forward + layer._prepare_qkv = layer.__prepare_qkv + elif isinstance(layer, nn.TransformerEncoderLayer): + layer.forward = layer._forward + elif isinstance(layer, nn.TransformerEncoder): + layer.forward = layer._forward + + for layer in self.children(): + layer.apply(init_func) + return self + + +def _recover_transformer_func(self, all_recover=False): + def init_func(layer): + if isinstance(layer, nn.MultiHeadAttention): + layer.forward = layer._ori_forward + elif isinstance(layer, nn.TransformerEncoderLayer): + layer.forward = layer._ori_forward + elif isinstance(layer, nn.TransformerEncoder): + layer.forward = layer._ori_forward + if all_recover: + if isinstance(layer, nn.MultiHeadAttention): + layer._prepare_qkv = layer._ori_prepare_qkv + + for layer in self.children(): + layer.apply(init_func) + + return self + + +def _replace_auto_model_forward(self): + self.base_model_class._forward = auto_model_dynabert_forward + self.base_model_class._ori_forward = self.base_model_class.forward + + def init_func(layer): + if isinstance(layer, self.base_model_class): + layer.forward = layer._forward + + for layer in self.children(): + layer.apply(init_func) + return self + + +def _replace_auto_model_qat_forward(self): + self.base_model_class._forward = auto_model_forward + self.base_model_class._ori_forward = self.base_model_class.forward + + def init_func(layer): + if isinstance(layer, self.base_model_class): + layer.forward = layer._forward + + for layer in self.children(): + layer.apply(init_func) + return self + + +def _recover_auto_model_forward(self): + def init_func(layer): + if isinstance( + layer, + self.base_model_class if not isinstance(self, paddle.DataParallel) else self._layers.base_model_class, + ): + layer.forward = layer._ori_forward + + for layer in self._layers.children() if isinstance(self, paddle.DataParallel) else self.children(): + layer.apply(init_func) + return self + + +def _dynabert_init(self, model, eval_dataloader): + from paddleslim.nas.ofa import OFA, DistillConfig, utils + from paddleslim.nas.ofa.convert_super import Convert, supernet + + # Step1: Initialize a dictionary to save the weights from the origin model. + origin_weights = model.state_dict() + + # Step2: Define teacher model. + teacher_model = copy.deepcopy(model) + + # Step3: Convert origin model to supernet. + sp_config = supernet(expand_ratio=[1.0]) + model = Convert(sp_config).convert(model) + + # Use weights saved in the dictionary to initialize supernet. + utils.set_state_dict(model, origin_weights) + del origin_weights + + # Step4: Config about distillation. 
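    # (Illustrative note, not part of the upstream file: for a 12-layer BERT base
    # model, base_model_prefix is "bert", so mapping_layers below becomes
    # ["bert.embeddings", "bert.encoder.layers.0", ..., "bert.encoder.layers.11"];
    # these are the layers whose hidden states are matched against the teacher,
    # weighted by lambda_distill.)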
+ mapping_layers = [model.base_model_prefix + ".embeddings"] + for idx in range(model.base_model.config["num_hidden_layers"]): + mapping_layers.append(model.base_model_prefix + ".encoder.layers.{}".format(idx)) + + default_distill_config = { + "lambda_distill": 0.1, + "teacher_model": teacher_model, + "mapping_layers": mapping_layers, + } + distill_config = DistillConfig(**default_distill_config) + + # Step5: Config in supernet training. + ofa_model = OFA(model, distill_config=distill_config, elastic_order=["width"]) + + # Step6: Calculate the importance of neurons and head, + # and then reorder them according to the importance. + ofa_model.model, ofa_model = _replace_transformer_func(ofa_model.model), _replace_transformer_func(ofa_model) + head_importance, neuron_importance = compute_neuron_head_importance( + model=ofa_model.model, + data_loader=eval_dataloader, + loss_fct=self.criterion, + num_layers=model.base_model.config["num_hidden_layers"], + num_heads=model.base_model.config["num_attention_heads"], + label_names=self.args.label_names, + ) + + reorder_neuron_head(ofa_model.model, head_importance, neuron_importance) + + if paddle.distributed.get_world_size() > 1: + ofa_model.model = paddle.DataParallel(ofa_model.model) + + return ofa_model, teacher_model + + +def check_dynabert_config(net_config, width_mult): + """ + Corrects net_config for OFA model if necessary. + """ + if "electra.embeddings_project" in net_config: + net_config["electra.embeddings_project"]["expand_ratio"] = 1.0 + for key in net_config: + # Makes sure to expands the size of the last dim to `width_mult` for + # these Linear weights. + if "q_proj" in key or "k_proj" in key or "v_proj" in key or "linear1" in key: + net_config[key]["expand_ratio"] = width_mult + # Keeps the size of the last dim of these Linear weights same as + # before. + elif "out_proj" in key or "linear2" in key: + net_config[key]["expand_ratio"] = 1.0 + return net_config + + +def evaluate(self, model, data_loader): + if self.custom_evaluate is not None: + return self.custom_evaluate(self, model, data_loader) + if isinstance(model, paddleslim.nas.ofa.OFA): + class_name = model.model.__class__.__name__ + else: + class_name = model.__class__.__name__ + if "SequenceClassification" in class_name: + return evaluate_seq_cls(self, model, data_loader) + elif "QuestionAnswering" in class_name: + return evaluate_qa(self, model, data_loader) + elif "TokenClassification" in class_name: + return evaluate_token_cls(self, model, data_loader) + else: + raise NotImplementedError( + "Model to be compressed is an instance of a custom class, " + "so function `evaluate(self, model, data_loader)` should be " + "implemented, and `model` should support both `paddle.nn.layer` " + "and `paddleslim.nas.ofa.OFA` instances, and it should return " + "a single float for precision value, such as acc." 
+ ) + + +@paddle.no_grad() +def evaluate_qa(self, model, data_loader): + model.eval() + all_start_logits = [] + all_end_logits = [] + for batch in data_loader: + logits = model(input_ids=batch["input_ids"], token_type_ids=batch["token_type_ids"]) + if isinstance(model, paddleslim.nas.ofa.OFA): + start_logits_tensor, end_logits_tensor = logits[0] + else: + start_logits_tensor, end_logits_tensor = logits + for idx in range(start_logits_tensor.shape[0]): + all_start_logits.append(start_logits_tensor.numpy()[idx]) + all_end_logits.append(end_logits_tensor.numpy()[idx]) + n_best_size = 20 + max_answer_length = 50 + all_predictions, _, _ = compute_prediction( + self.eval_examples, + self.eval_dataset, + (all_start_logits, all_end_logits), + False, + n_best_size, + max_answer_length, + ) + res = squad_evaluate( + examples=[raw_data for raw_data in self.eval_examples], preds=all_predictions, is_whitespace_splited=False + ) + logger.info("EM: %f, F1: %f, " % (res["exact"], res["f1"])) + res = res["exact"] + model.train() + return res + + +@paddle.no_grad() +def evaluate_seq_cls(self, model, data_loader): + metric = Accuracy() + model.eval() + metric.reset() + for batch in data_loader: + labels = batch.pop("labels") + logits = model(**batch) + if isinstance(model, paddleslim.nas.ofa.OFA): + logits = logits[0] + correct = metric.compute(logits, labels) + metric.update(correct) + res = metric.accumulate() + logger.info("acc: %s, " % res) + model.train() + return res + + +@paddle.no_grad() +def evaluate_token_cls(self, model, data_loader): + metric = ChunkEvaluator(label_list=self.train_dataset.label_list) + model.eval() + metric.reset() + for batch in data_loader: + logits = model(input_ids=batch["input_ids"], token_type_ids=batch["token_type_ids"]) + if isinstance(model, paddleslim.nas.ofa.OFA): + logits = logits[0] + preds = logits.argmax(axis=2) + seq_len = paddle.sum(batch["labels"] != self.train_dataset.ignore_label, axis=-1) + num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(seq_len, preds, batch["labels"]) + metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) + res = metric.accumulate() + logger.info("precision: %f, recall: %f, f1_score: %f" % (res[0], res[1], res[2])) + res = res[2] + model.train() + return res + + +def _dynabert_training(self, ofa_model, model, teacher_model, train_dataloader, eval_dataloader, num_train_epochs): + from paddleslim.nas.ofa import utils + + global_step = 0 + lambda_logit = 1.0 + tic_train = time.time() + best_acc = [0.0] * len(self.args.width_mult_list) + acc = 0.0 + + logger.info("Teacher's evaluation starts.") + tic_eval = time.time() + evaluate(self, teacher_model, eval_dataloader) + logger.info("eval done total: %s s" % (time.time() - tic_eval)) + + logger.info("DynaBERT training starts. This period will cost some time.") + for epoch in range(num_train_epochs): + # Step7: Set current epoch and task. + ofa_model.set_epoch(epoch) + ofa_model.set_task("width") + for step, batch in enumerate(train_dataloader): + global_step += 1 + for width_mult in self.args.width_mult_list: + # Step8: Broadcast supernet config from width_mult, + # and use this config in supernet training. 
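                # (Illustrative note, not part of the upstream file: for
                # width_mult == 0.75, check_dynabert_config leaves net_config with
                # expand_ratio 0.75 for every q_proj/k_proj/v_proj/linear1 entry
                # and expand_ratio 1.0 for out_proj/linear2, so only the inner
                # attention and FFN dimensions are shrunk.)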
+ net_config = utils.dynabert_config(ofa_model, width_mult) + net_config = check_dynabert_config(net_config, width_mult) + ofa_model.set_net_config(net_config) + if "token_type_ids" in batch: + logits, teacher_logits = ofa_model( + input_ids=batch["input_ids"], + token_type_ids=batch["token_type_ids"], + attention_mask=[None, None], + ) + else: + logits, teacher_logits = ofa_model(batch["input_ids"], attention_mask=[None, None]) + rep_loss = ofa_model.calc_distill_loss() + if isinstance(logits, tuple): + logit_loss, num_logit = 0, 0 + for i in range(len(logits)): + try: + logit_loss += soft_cross_entropy(logits[i], teacher_logits[i].detach()) + num_logit += 1 + except RuntimeError: + pass + logit_loss /= num_logit + else: + logit_loss = soft_cross_entropy(logits, teacher_logits.detach()) + loss = rep_loss + lambda_logit * logit_loss + loss.backward() + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() + if global_step % self.args.logging_steps == 0: + if paddle.distributed.get_rank() == 0: + logger.info( + "global step %d, epoch: %d, batch: %d, lr: %.3e, loss: %f, speed: %.2f step/s" + % ( + global_step, + epoch, + step, + self.optimizer.get_lr(), + loss, + self.args.logging_steps / (time.time() - tic_train), + ) + ) + tic_train = time.time() + + if global_step % self.args.save_steps == 0: + for idx, width_mult in enumerate(self.args.width_mult_list): + net_config = utils.dynabert_config(ofa_model, width_mult) + net_config = check_dynabert_config(net_config, width_mult) + ofa_model.set_net_config(net_config) + tic_eval = time.time() + logger.info("width_mult %s:" % round(width_mult, 2)) + acc = evaluate(self, ofa_model, eval_dataloader) + if acc > best_acc[idx]: + best_acc[idx] = acc + if paddle.distributed.get_rank() == 0: + output_dir_width = os.path.join( + self.args.output_dir, "width_mult_" + str(round(width_mult, 2)) + ) + if not os.path.exists(output_dir_width): + os.makedirs(output_dir_width) + # need better way to get inner model of DataParallel + model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model + model_to_save.save_pretrained(output_dir_width) + logger.info("eval done total: %s s" % (time.time() - tic_eval)) + if global_step > self.args.num_training_steps: + if best_acc[idx] == 0.0: + output_dir_width = os.path.join(self.args.output_dir, "width_mult_" + str(round(width_mult, 2))) + if not os.path.exists(output_dir_width): + os.makedirs(output_dir_width) + # need better way to get inner model of DataParallel + model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model + model_to_save.save_pretrained(output_dir_width) + logger.info("Best result of width_mult %.2f: %.4f" % (width_mult, best_acc[idx])) + return ofa_model + + for idx, width_mult in enumerate(self.args.width_mult_list): + logger.info("Best result of width_mult %.2f: %.4f" % (width_mult, best_acc[idx])) + return ofa_model + + +def _get_dynabert_model(model, width_mult): + for layer in model.base_model.encoder.layers: + # Multi-Head Attention + layer.self_attn.num_heads = int(layer.self_attn.num_heads * width_mult) + layer.self_attn.q_proj = nn.Linear( + layer.self_attn.q_proj.weight.shape[0], + int(layer.self_attn.q_proj.weight.shape[1] * width_mult), + layer.self_attn.q_proj._weight_attr, + layer.self_attn.q_proj._bias_attr, + ) + layer.self_attn.k_proj = nn.Linear( + layer.self_attn.k_proj.weight.shape[0], + int(layer.self_attn.k_proj.weight.shape[1] * width_mult), + layer.self_attn.k_proj._weight_attr, + 
layer.self_attn.k_proj._bias_attr, + ) + layer.self_attn.v_proj = nn.Linear( + layer.self_attn.v_proj.weight.shape[0], + int(layer.self_attn.v_proj.weight.shape[1] * width_mult), + layer.self_attn.v_proj._weight_attr, + layer.self_attn.v_proj._bias_attr, + ) + layer.self_attn.out_proj = nn.Linear( + int(layer.self_attn.out_proj.weight.shape[0] * width_mult), + layer.self_attn.out_proj.weight.shape[1], + layer.self_attn.out_proj._weight_attr, + layer.self_attn.out_proj._bias_attr, + ) + + # Feed Forward + layer.linear1 = nn.Linear( + layer.linear1.weight.shape[0], + int(layer.linear1.weight.shape[1] * width_mult), + layer.linear1._weight_attr, + layer.linear1._bias_attr, + ) + layer.linear2 = nn.Linear( + int(layer.linear2.weight.shape[0] * width_mult), + layer.linear2.weight.shape[1], + layer.linear2._weight_attr, + layer.linear2._bias_attr, + ) + return model + + +def _load_parameters(dynabert_model, ori_state_dict): + dynabert_state_dict = dynabert_model.state_dict() + for key in ori_state_dict.keys(): + # Removes '.fn' from ofa model parameters + dynabert_key = key.replace(".fn", "") + if dynabert_key not in dynabert_state_dict.keys(): + logger.warning("Failed to export parameter %s" % key) + else: + dynabert_shape = dynabert_state_dict[dynabert_key].shape + if len(dynabert_shape) == 2: + dynabert_state_dict[dynabert_key] = ori_state_dict[key][: dynabert_shape[0], : dynabert_shape[1]] + elif len(dynabert_shape) == 1: + dynabert_state_dict[dynabert_key] = ori_state_dict[key][: dynabert_shape[0]] + else: + raise ValueError("Please check input model. Length of shape should be 1 or 2 for any parameter.") + dynabert_model.set_state_dict(dynabert_state_dict) + return dynabert_model + + +def _export_dynamic_dynabert_model(self, width_mult): + model_dir = os.path.join(self.args.output_dir, "width_mult_" + str(round(width_mult, 2))) + state_dict = paddle.load(os.path.join(model_dir, "model_state.pdparams")) + dynabert_model = _get_dynabert_model(self.original_model, width_mult) + dynabert_model = _load_parameters(dynabert_model, state_dict) + return dynabert_model + + +def _dynabert_export(self): + for width_mult in self.args.width_mult_list: + dynabert_model = _export_dynamic_dynabert_model(self, width_mult) + self.model = dynabert_model + if "qat" not in self.args.strategy: + input_spec = generate_input_spec(self.model, self.train_dataset, self.args.input_dtype) + pruned_infer_model_dir = os.path.join(self.args.output_dir, "width_mult_" + str(round(width_mult, 2))) + export_model(model=dynabert_model, input_spec=input_spec, path=pruned_infer_model_dir) + self.args.input_filename_prefix = "model" + logger.info("Pruned models have been exported.") + + +def _post_training_quantization_grid_search(self, model_dir): + args = self.args + if args.batch_num_list is None: + args.batch_num_list = [1] + if args.batch_size_list is None: + args.batch_size_list = [4, 8, 16] + if args.algo_list is None: + args.algo_list = ["mse", "KL"] + + paddle.enable_static() + place = paddle.set_device(args.device) + exe = paddle.static.Executor(place) + + args.output_filename_prefix = "int8" + output_dir_list = [] + + def _post_training_quantization(algo, batch_size, batch_nums): + from paddle.static.quantization import PostTrainingQuantization + + def _batch_generator_func(): + param_name_list = [] + for key in self.eval_dataset[0]: + if key in ("input_ids", "token_type_ids"): + param_name_list.append(key) + batch_data = [[] for i in range(len(param_name_list))] + for data in self.eval_dataset: + for i in 
range(len(param_name_list)): + batch_data[i].append(data[param_name_list[i]]) + if len(batch_data[0]) == batch_size: + for i in range(len(param_name_list)): + batch_data[i] = Pad(axis=0, pad_val=0)(batch_data[i]) + yield batch_data + batch_data = [[] for i in range(len(param_name_list))] + + post_training_quantization = PostTrainingQuantization( + executor=exe, + batch_generator=_batch_generator_func, + model_dir=model_dir, + model_filename=args.input_filename_prefix + ".pdmodel", + params_filename=args.input_filename_prefix + ".pdiparams", + batch_size=batch_size, + batch_nums=batch_nums, + scope=None, + algo=algo, + hist_percent=0.9999, + round_type=args.round_type, + bias_correction=args.bias_correction, + quantizable_op_type=["matmul", "matmul_v2"], + is_full_quantize=False, + weight_bits=8, + activation_bits=8, + activation_quantize_type="range_abs_max" + if args.activation_quantize_type is None + else args.activation_quantize_type, + weight_quantize_type=args.weight_quantize_type, + onnx_format=args.onnx_format, + optimize_model=False, + ) + post_training_quantization.quantize() + save_model_path = os.path.join(model_dir, algo + "_".join([str(batch_size), str(batch_nums)])) + post_training_quantization.save_quantized_model( + save_model_path=save_model_path, + model_filename=args.output_filename_prefix + ".pdmodel", + params_filename=args.output_filename_prefix + ".pdiparams", + ) + output_dir_list.append(save_model_path) + + logger.info("Post training quantization starts.") + for algo in args.algo_list: + for batch_size in args.batch_size_list: + for batch_nums in args.batch_num_list: + _post_training_quantization(algo, batch_size, batch_nums) + + paddle.disable_static() + logger.info("Post training quantization ends and quantized models are saved.") + return output_dir_list + + +def _quant_aware_training_dynamic(self, input_dir): + # TODO: Switch from multiple GPUs to a single GPU. + from paddleslim import QAT + + args = self.args + args.output_filename_prefix = "int8" + + quant_config = { + # It defauts to None, which means that no preprocessing is performed + # on the active value." + "activation_preprocess_type": "PACT" if args.use_pact else None, + # It defauts to None, which means that no preprocessing is performed + # on weights. + "weight_preprocess_type": "PACT" if args.use_pact else None, + "weight_quantize_type": args.weight_quantize_type, + "activation_quantize_type": "moving_average_abs_max" + if args.activation_quantize_type is None + else args.activation_quantize_type, + "weight_bits": 8, + "activation_bits": 8, + "dtype": "int8", + # window size for 'range_abs_max' quantization. 
defaulf is 10000 + "window_size": 10000, + "quantizable_layer_type": ["Linear", "Conv2D"], + "moving_rate": args.moving_rate, + "onnx_format": args.onnx_format, + } + + if not os.path.exists(input_dir): + os.makedirs(input_dir) + + output_param_path = os.path.join(input_dir, "best_quant.pdparams") + + train_dataloader = self.get_train_dataloader() + eval_dataloader = self.get_eval_dataloader(self.eval_dataset) + + # TODO: args.gradient_accumulation_steps + if args.max_steps > 0: + args.num_training_steps = args.max_steps + args.num_train_epochs = math.ceil(args.num_training_steps / len(train_dataloader)) + else: + args.num_training_steps = len(train_dataloader) * args.num_train_epochs + args.num_train_epochs = math.ceil(args.num_train_epochs) + + self.create_optimizer_and_scheduler(num_training_steps=args.num_training_steps) + + logger.info("Evaluating FP32 model before quantization aware training.") + + tic_eval = time.time() + + acc = evaluate(self, self.model, eval_dataloader) + logger.info("eval done total: %s s" % (time.time() - tic_eval)) + + quanter = QAT(config=quant_config) + self.model = _replace_auto_model_qat_forward(self.model) + quanter.quantize(self.model) + + global_step = 0 + tic_train = time.time() + best_acc, acc = 0.0, 0.0 + + logger.info("Quant aware training starts.") + # Train self.model + for epoch in range(args.num_train_epochs): + for step, batch in enumerate(train_dataloader): + global_step += 1 + labels = None + if self.args.label_names is None: + if "labels" in batch: + labels = batch.pop("labels") + elif "start_positions" in batch and "end_positions" in batch: + labels = (batch.pop("start_positions"), batch.pop("end_positions")) + else: + labels = [] + for label in self.args.label_names: + labels.append(batch.pop(label)) + labels = tuple(labels) + model_para_keys = inspect.signature(self.model.forward).parameters.keys() + inputs = {} + for key in batch: + if key in model_para_keys: + inputs[key] = batch[key] + logits = self.model(**inputs) + loss = self.criterion(logits, labels) + loss.backward() + + self.optimizer.step() + self.lr_scheduler.step() + self.optimizer.clear_grad() + if global_step % self.args.logging_steps == 0: + if paddle.distributed.get_rank() == 0: + logger.info( + "global step %d, epoch: %d, batch: %d, lr: %.3e, loss: %f, speed: %.2f step/s" + % ( + global_step, + epoch, + step, + self.optimizer.get_lr(), + loss, + args.logging_steps / (time.time() - tic_train), + ) + ) + tic_train = time.time() + + if global_step % args.save_steps == 0: + tic_eval = time.time() + acc = evaluate(self, self.model, eval_dataloader) + if acc > best_acc: + best_acc = acc + if paddle.distributed.get_rank() == 0: + # need better way to get inner model of DataParallel + model_to_save = ( + self.model._layers if isinstance(self.model, paddle.DataParallel) else self.model + ) + paddle.save(model_to_save.state_dict(), output_param_path) + logger.info("eval done total: %s s" % (time.time() - tic_eval)) + logger.info("Best result: %.4f" % best_acc) + self.model.set_state_dict(paddle.load(output_param_path)) + + input_spec = generate_input_spec(self.model, self.train_dataset, self.args.input_dtype) + + quanter.save_quantized_model( + self.model, os.path.join(input_dir, args.output_filename_prefix), input_spec=input_spec + ) + + self.model = _recover_auto_model_forward(self.model) + logger.info( + "Quant aware training ends and quantized models are saved to %s." 
+ % os.path.join(input_dir, args.output_filename_prefix) + ) + + +def _quant_embeddings(self, input_prefix): + import paddleslim.quant as quant + + self.args.output_filename_prefix = "quant_emb" + + paddle.enable_static() + place = paddle.set_device(self.args.device) + exe = paddle.static.Executor(place) + main_program, feed_target_names, fetch_targets = paddle.static.load_inference_model(input_prefix, exe) + + config = {"quantize_op_types": ["lookup_table_v2"], "lookup_table_v2": {"quantize_type": "log"}} + + quant_emb_program = quant.quant_embedding(main_program, place, config) + + input_dir = os.path.dirname(input_prefix) + + paddle.static.save_inference_model( + os.path.join(input_dir, self.args.output_filename_prefix), + feed_target_names, + fetch_targets, + exe, + program=quant_emb_program, + ) + + +def auto_model_dynabert_forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=[None, None], + task_type_ids=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, +): + kwargs = locals() + wtype = ( + self.encoder.layers[0].norm1.fn.weight.dtype + if hasattr(self.encoder.layers[0].norm1, "fn") + else self.encoder.layers[0].norm1.weight.dtype + ) + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = None + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + # input_ids[0][0] is equals to 0 while exporting. 
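        # (Illustrative note, not part of the upstream file: assuming pad_token_id == 0,
        # input_ids == [[101, 2023, 102, 0, 0]] yields an additive mask of shape
        # [1, 1, 1, 5] equal to [[[[0, 0, 0, -1e4, -1e4]]]], which broadcasts over
        # heads and query positions so padded keys are suppressed after softmax.)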
+ if input_ids[0][0] != 0: + attention_mask = [None, None] + attention_mask[0] = paddle.unsqueeze((input_ids == self.pad_token_id).astype(wtype) * -1e4, axis=[1, 2]) + else: + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + elif isinstance(attention_mask, paddle.Tensor) and attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(wtype) + attention_mask = (1.0 - attention_mask) * -1e4 + elif attention_mask[0] is None: + attention_mask[0] = paddle.unsqueeze((input_ids == self.pad_token_id).astype(wtype) * -1e4, axis=[1, 2]) + + embedding_kwargs_keys = inspect.signature(self.embeddings.forward).parameters.keys() + embedding_kwargs = {} + for key in embedding_kwargs_keys: + if key in kwargs.keys(): + embedding_kwargs[key] = kwargs[key] + embedding_kwargs["input_ids"] = input_ids + + embedding_output = self.embeddings(**embedding_kwargs) + if hasattr(self, "embeddings_project"): + embedding_output = self.embeddings_project(embedding_output) + + self.encoder._use_cache = use_cache # To be consistent with HF + + encoder_kwargs_keys = inspect.signature(self.encoder.forward).parameters.keys() + encoder_kwargs = {} + for key in encoder_kwargs_keys: + if key == "cache": + encoder_kwargs[key] = past_key_values + elif key == "src_mask": + encoder_kwargs[key] = attention_mask + elif key in kwargs: + encoder_kwargs[key] = kwargs[key] + + encoder_outputs = self.encoder(embedding_output, **encoder_kwargs) + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + if hasattr(self, "pooler"): + pooled_output = self.pooler(sequence_output) + else: + pooled_output = sequence_output[:, 0] + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +def auto_model_forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + task_type_ids=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, +): + kwargs = locals() + past_key_values_length = None + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze((input_ids == self.pad_token_id).astype(paddle.float32) * -1e4, axis=[1, 2]) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + kwargs_keys = inspect.signature(self._ori_forward).parameters.keys() + + model_kwargs = {} + for key in kwargs_keys: + model_kwargs[key] = 
kwargs[key] + model_kwargs["attention_mask"] = attention_mask + return self._ori_forward(**model_kwargs) + + +def soft_cross_entropy(inp, target): + inp_likelihood = F.log_softmax(inp, axis=-1) + target_prob = F.softmax(target, axis=-1) + return -1.0 * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1)) + + +def reset_optimizer_and_scheduler(self): + self.optimizer, self.lr_scheduler = None, None + + +def cut_embeddings(model, tokenizer, config, word_emb_index, max_seq_length, max_vocab_size, output_dir): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + state_dict = model.state_dict() + + word_emb_name = model.base_model_prefix + ".embeddings.word_embeddings.weight" + word_emb_np = state_dict[word_emb_name].cpu().numpy() + word_emb_np_new = [word_emb_np[idx] for idx in word_emb_index] + + state_dict[word_emb_name] = paddle.to_tensor(word_emb_np_new) + # Rewrites Position Embedding parameters + pos_emb_name = model.base_model_prefix + ".embeddings.position_embeddings.weight" + state_dict[pos_emb_name] = state_dict[pos_emb_name][:max_seq_length, :] + + paddle.save(state_dict, os.path.join(output_dir, "model_state.pdparams")) + + # Rewrites config + config["max_position_embeddings"] = max_seq_length + config["vocab_size"] = max_vocab_size + config.save_pretrained(output_dir) + + # Rewrites vocab file + vocab_file = os.path.join(output_dir, "vocab.txt") + f = open(vocab_file, "w") + for idx in word_emb_index: + f.write(tokenizer.convert_ids_to_tokens(idx) + "\n") + f.close() + + tokenizer.init_config["model_max_length"] = max_seq_length + if "vocab_file" in tokenizer.init_config: + tokenizer.init_config.pop("vocab_file") + f = open(os.path.join(output_dir, tokenizer.tokenizer_config_file), "w") + + f.write(json.dumps(tokenizer.init_config)) + f.close() + + +Trainer.compress = compress +Trainer.quant = quant +Trainer.reset_optimizer_and_scheduler = reset_optimizer_and_scheduler +Trainer.cut_embeddings = cut_embeddings diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_seq2seq.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_seq2seq.py new file mode 100644 index 000000000..d4bdab183 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_seq2seq.py @@ -0,0 +1,248 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, List, Optional, Tuple, Union + +import paddle +from paddle import nn +from paddle.io import Dataset + +from .trainer import Trainer +from .trainer_utils import PredictionOutput + +__all__ = [ + "Seq2SeqTrainer", +] + + +class Seq2SeqTrainer(Trainer): + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "eval", + **gen_kwargs + ) -> Dict[str, float]: + """ + Run evaluation and returns metrics. 
+ + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def predict( + self, + test_dataset: Dataset, + ignore_keys: Optional[List[str]] = None, + metric_key_prefix: str = "test", + **gen_kwargs + ) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + + + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. The padding index is -100. 
+ + + + Returns: *NamedTuple* A namedtuple with the following keys: + + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). + """ + + gen_kwargs = gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.args.generation_max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams + ) + self._gen_kwargs = gen_kwargs + + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model: nn.Layer, + inputs: Dict[str, Union[paddle.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[float], Optional[paddle.Tensor], Optional[paddle.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Layer`): + The model to evaluate. + inputs (`Dict[str, Union[paddle.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[paddle.Tensor], Optional[paddle.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + gen_kwargs = self._gen_kwargs.copy() + if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: + gen_kwargs["max_length"] = self.model.config.max_length + gen_kwargs["num_beams"] = ( + gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams + ) + + if "attention_mask" in inputs: + gen_kwargs["attention_mask"] = inputs.get("attention_mask", None) + if "global_attention_mask" in inputs: + gen_kwargs["global_attention_mask"] = inputs.get("global_attention_mask", None) + + # prepare generation inputs + # some encoder-decoder models can have varying encoder's and thus + # varying model input names + if hasattr(self.model, "encoder") and self.model.encoder.main_input_name != self.model.main_input_name: + generation_inputs = inputs[self.model.encoder.main_input_name] + else: + generation_inputs = inputs[self.model.main_input_name] + + generated_tokens = self.model.generate( + generation_inputs, + **gen_kwargs, + ) + # different from hf returns: tuple[Tensor]: It is a tuple contains two elements: ids and scores. 
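        # (Illustrative note, not part of the upstream file: paddle's generate() can
        # return an (ids, scores) tuple, e.g. ids of shape [batch, gen_len] plus the
        # per-sequence scores, whereas the HF API returns the ids directly; the
        # isinstance check below keeps only the ids.)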
+ if isinstance(generated_tokens, tuple): + generated_tokens = generated_tokens[0] + # in case the batch is shorter than max length, the output should be padded + if gen_kwargs.get("max_length") is not None and generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and generated_tokens.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_new_tokens"] + 1) + + with paddle.no_grad(): + if has_labels: + with self.autocast_smart_context_manager(): + outputs = model(**inputs) + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + if has_labels: + labels = inputs["labels"] + if gen_kwargs.get("max_length") is not None and labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + elif gen_kwargs.get("max_new_tokens") is not None and labels.shape[-1] < ( + gen_kwargs["max_new_tokens"] + 1 + ): + labels = self._pad_tensors_to_max_len(labels, (gen_kwargs["max_new_tokens"] + 1)) + else: + labels = None + + return (loss, generated_tokens, labels) + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + # paddle.ones need to support device args. + padded_tensor = pad_token_id * paddle.ones((tensor.shape[0], max_length), dtype=tensor.dtype) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_utils.py new file mode 100644 index 000000000..a385e3655 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/trainer_utils.py @@ -0,0 +1,1101 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers/trainer_utils.py + +""" +Utilities for the Trainer class. 
+""" +import datetime +import gc +import inspect +import json +import math +import os +import random +import re +import threading +import time +from contextlib import contextmanager +from enum import Enum +from typing import Dict, List, NamedTuple, Optional, Tuple, Union + +import numpy as np +import paddle +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.io import IterableDataset +from paddle.optimizer.lr import LambdaDecay + +from paddlenlp.ops import Topology + +from ..trainer.argparser import strtobool +from ..transformers.tokenizer_utils_base import BatchEncoding +from ..utils.import_utils import is_paddle_cuda_available, is_psutil_available +from ..utils.log import logger + +__all__ = [ + "TrainOutput", + "PredictionOutput", + "EvalPrediction", + "IntervalStrategy", + "SchedulerType", + "set_seed", + "speed_metrics", + "get_last_checkpoint", + "get_scheduler", + "set_hyrbid_parallel_seed", + "log_trainer_start", +] + + +def log_trainer_start(): + if "MAIN_PROCESS_STARTED" not in os.environ: + start_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + logger.info(f"The Training Main Process Started Successfully. time: {start_time}, pid: {os.getpid()}") + os.environ["MAIN_PROCESS_STARTED"] = "1" + + +def _get_distributed_seeds(seed: int = 1234, topo: Topology = None): + """ + Get the seeds from distributed environment strategy. + Args: + seed (:obj:`int`, `optional`, defaults to 1234): The seeds for initializing distributed training. + topo (:obj:`Topology`, `optional`, defaults to None): The topology of hybrid parallel in semi-auto mode. + Returns: + Tuple[int, int]: The global seed and local seed respectively. + """ + + # NOTE: For parameter init seed: + # seed: dp/mp_undistributed_paramter/sharding is same; others is different + # For compute seed(dropout): + # global seed: only mp group is same. 
+ # local seed: all groups are different + hcg = None + if hasattr(fleet.fleet, "_hcg") and topo is None: + hcg = fleet.get_hybrid_communicate_group() + + if topo is not None and paddle.distributed.get_world_size() > 1: + dp_rank = topo.dp_info.rank + dp_size = topo.dp_info.size + + pp_rank = topo.pp_info.rank + pp_size = topo.pp_info.size + + mp_rank = topo.mp_info.rank + mp_size = topo.mp_info.size + + sep_rank = topo.sep_info.rank + sep_size = topo.sep_info.size + + sharding_rank = topo.sharding_info.rank + elif hcg is not None and paddle.distributed.get_world_size() > 1: + # obtain rank message of hybrid parallel + + mp_rank = hcg.get_model_parallel_rank() + mp_size = hcg.get_model_parallel_world_size() + + if hasattr(hcg, "get_sep_parallel_rank"): + sep_rank = hcg.get_sep_parallel_rank() + sep_size = hcg.get_sep_parallel_world_size() + else: + sep_rank, sep_size = 0, 1 + + pp_rank = hcg.get_stage_id() + pp_size = hcg.get_pipe_parallel_world_size() + + dp_rank = hcg.get_data_parallel_rank() + dp_size = hcg.get_data_parallel_world_size() + + sharding_rank = hcg.get_sharding_parallel_rank() + else: + mp_rank, mp_size = 0, 1 + sep_rank, sep_size = 0, 1 + pp_rank, pp_size = 0, 1 + dp_rank, dp_size = 0, 1 + sharding_rank, _ = 0, 1 + + seed_offset = seed + global_seed = ( + seed_offset + + sep_rank * (mp_size) + + pp_rank * (mp_size * sep_size) + + dp_rank * (mp_size * sep_size * pp_size) + + sharding_rank * (mp_size * sep_size * pp_size * dp_size) + ) + + seed_offset += paddle.distributed.get_world_size() + local_seed = ( + seed_offset + + mp_rank + + sep_rank * (mp_size) + + pp_rank * (mp_size * sep_size) + + dp_rank * (mp_size * sep_size * pp_size) + + sharding_rank * (mp_size * sep_size * pp_size * dp_size) + ) + + # NOTE: the commented seeds are set only for precision validation + random_seed = seed + 100 * pp_rank + + return global_seed, local_seed, random_seed + + +def set_seed(seed: int = 1234, topo=None): + global_seed, local_seed, random_seed = _get_distributed_seeds(seed, topo) + + tracker = get_rng_state_tracker() + if "global_seed" not in tracker.states_ and global_seed not in tracker.seeds_: + tracker.add("global_seed", global_seed) + + if "local_seed" not in tracker.states_ and local_seed not in tracker.seeds_: + tracker.add("local_seed", local_seed) + + paddle.seed(global_seed) + random.seed(random_seed) + np.random.seed(random_seed) + + logger.info( + "The global seed is set to {}, local seed is set to {} and " + "random seed is set to {}.".format(global_seed, local_seed, random_seed) + ) + + +def _switch_mode(mode="dynamic"): + assert mode in ["dynamic", "static"] + if mode == "dynamic": + paddle.disable_static() + else: + paddle.enable_static() + + +@contextmanager +def _exec_mode_guard(mode="dynamic"): + origin_mode = "dynamic" if paddle.in_dynamic_mode() else "static" + _switch_mode(mode) + try: + yield + finally: + _switch_mode(origin_mode) + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) + + +class EvalPrediction(NamedTuple): + """ + Evaluation output (always contains labels), to be used to compute metrics. + + Parameters: + predictions (`np.ndarray`): Predictions of the model. + label_ids (`np.ndarray`): Targets to be matched. 
+ """ + + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Union[np.ndarray, Tuple[np.ndarray]] + + +class EvalLoopOutput(NamedTuple): + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]] + metrics: Optional[Dict[str, float]] + num_samples: Optional[int] + + +class PredictionOutput(NamedTuple): + predictions: Union[np.ndarray, Tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, Tuple[np.ndarray]]] + metrics: Optional[Dict[str, float]] + + +class TrainOutput(NamedTuple): + global_step: int + training_loss: float + metrics: Dict[str, float] + + +PREFIX_CHECKPOINT_DIR = "checkpoint" +_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$") + + +def _check_checkpoint_files(folder_path, world_size, ignore_save_lr_and_optim, skip_save_model_weight): + files = os.listdir(folder_path) + model_weight_files = [f for f in files if f.startswith(".model_weight")] + a = len(model_weight_files) == world_size + if not ignore_save_lr_and_optim: + b = True + if not skip_save_model_weight: + master_weight_file = [f for f in files if f.startswith(".master_weight")] + b = len(master_weight_file) == world_size + optimizer_file = [f for f in files if f.startswith(".optimizer_weight")] + c = len(optimizer_file) == world_size + return a and b and c + else: + return a + + +def get_last_checkpoint(folder, uc_async_save=False): + content = os.listdir(folder) + checkpoints = [ + path + for path in content + if _re_checkpoint.search(path) is not None and os.path.isdir(os.path.join(folder, path)) + ] + if len(checkpoints) == 0: + return + + if strtobool(os.getenv("FLAG_LLM_PDC", "False")): + for i in sorted(checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0]), reverse=True): + current_path = os.path.join(folder, i) + # make sure the checkpoint is valid + if not uc_async_save: + if os.path.exists(os.path.join(current_path, ".checkpoint_done")): + return current_path + else: + saving_info = paddle.load(os.path.join(current_path, ".saving_info")) + pre_world_size = saving_info.get("world_size", 1) + ignore_save_lr_and_optim = saving_info.get("ignore_save_lr_and_optim", False) + skip_save_model_weight = saving_info.get("skip_save_model_weight", False) + if _check_checkpoint_files( + current_path, pre_world_size, ignore_save_lr_and_optim, skip_save_model_weight + ): + return current_path + return + else: + return os.path.join(folder, max(checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0]))) + + +class IntervalStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class EvaluationStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class OptimizerNames(ExplicitEnum): + """ + Stores the acceptable string identifiers for optimizers. + """ + + ADAMW = "adamw" + ADAFACTOR = "adafactor" + + +class ShardingOption(ExplicitEnum): + """ + Sharding Option + OP for sharding optimizer state + GRAD for sharding gradients + FULL_SHARD for sharding optimizer gradient and parameter + OFFLOAD means offload to cpu. + """ + + SHARD_OP = "stage1" + SHARD_GRAD_OP = "stage2" + FULL_SHARD = "stage3" + # NO_SHARD = "no" + OFFLOAD = "offload" + + +def is_main_process(local_rank): + """ + Whether or not the current process is the local process, based on `xm.get_ordinal()` (for TPUs) first, then on + `local_rank`. + """ + + return local_rank in [-1, 0] + + +def total_processes_number(local_rank): + """ + Return the number of processes launched in parallel. 
Works with `paddle.distributed` and TPUs. + """ + if local_rank != -1: + import paddle + + return paddle.distributed.get_world_size() + return 1 + + +def speed_metrics(split, start_time, num_samples=None, num_steps=None, seq_length=None): + """ + Measure and return speed performance metrics. + + This function requires a time snapshot `start_time` before the operation to be measured starts and this function + should be run immediately after the operation to be measured has completed. + + Args: + + - split: name to prefix metric (like train, eval, test...) + - start_time: operation start time + - num_samples: number of samples processed + """ + runtime = time.time() - start_time + result = {f"{split}_runtime": round(runtime, 4)} + if num_samples is not None: + samples_per_second = num_samples / runtime + result[f"{split}_samples_per_second"] = round(samples_per_second, 4) + if seq_length is not None: + tokens_per_second_per_device = samples_per_second * seq_length / paddle.distributed.get_world_size() + result[f"{split}_tokens_per_second_per_device"] = round(tokens_per_second_per_device, 4) + if num_steps is not None: + steps_per_second = num_steps / runtime + result[f"{split}_steps_per_second"] = round(steps_per_second, 4) + return result + + +class SchedulerType(ExplicitEnum): + LINEAR = "linear" + COSINE = "cosine" + CONSTANT = "constant" + CONSTANT_WITH_WARMUP = "constant_with_warmup" + POLYNOMIAL = "polynomial" + + +def get_constant_schedule(learning_rate: float, last_epoch: int = -1): + """ + Create a schedule with a constant learning rate, using the learning rate set in optimizer. + Args: + learning_rate (float) + The initial learning rate. It is a python float number. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + Return: + `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. + """ + return LambdaDecay(learning_rate, lambda _: 1, last_epoch=last_epoch) + + +def get_constant_schedule_with_warmup(learning_rate: float, num_warmup_steps: int, last_epoch: int = -1): + """ + Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate + increases linearly between 0 and the initial lr set in the optimizer. + Args: + learning_rate (float) + The initial learning rate. It is a python float number. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + Return: + `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. + """ + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1.0, num_warmup_steps)) + return 1.0 + + return LambdaDecay(learning_rate, lr_lambda, last_epoch=last_epoch) + + +def get_linear_schedule_with_warmup(learning_rate: float, num_warmup_steps, num_training_steps, last_epoch=-1): + """ + Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after + a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. + Args: + learning_rate (float) + The initial learning rate. It is a python float number. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. 
+ Return: + `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. + """ + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max( + 0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps)) + ) + + return LambdaDecay(learning_rate, lr_lambda, last_epoch) + + +def get_cosine_schedule_with_warmup( + learning_rate: float, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1 +): + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the + initial lr set in the optimizer. + Args: + learning_rate (float) + The initial learning rate. It is a python float number. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`float`, *optional*, defaults to 0.5): + The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 + following a half-cosine). + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + Return: + `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. + """ + + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) + + return LambdaDecay(learning_rate, lr_lambda, last_epoch) + + +def get_polynomial_decay_schedule_with_warmup( + learning_rate: float, + num_warmup_steps: int, + num_training_steps: int, + lr_end: float = 1e-7, + power: float = 1.0, + last_epoch: int = -1, +): + """ + Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the + optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the + initial lr set in the optimizer. + Args: + learning_rate (`float`): + The base learning rate. It is a python float number. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + lr_end (`float`, *optional*, defaults to 1e-7): + The end LR. + power (`float`, *optional*, defaults to 1.0): + Power factor. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT + implementation at + https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 + Return: + `paddle.optimizer.lr.LambdaDecay` with the appropriate schedule. 
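+ 
+     Example (illustrative; the step counts and rates are arbitrary, not taken from this module):
+ 
+     ```python
+     # linear (power=1.0) decay from 1e-4 towards lr_end after 10 warmup steps
+     lr_scheduler = get_polynomial_decay_schedule_with_warmup(
+         learning_rate=1e-4, num_warmup_steps=10, num_training_steps=100
+     )
+     ```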
+ """ + + lr_init = learning_rate + if not (lr_init > lr_end): + raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") + + def lr_lambda(current_step: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + elif current_step > num_training_steps: + return lr_end / lr_init # as LambdaLR multiplies by lr_init + else: + lr_range = lr_init - lr_end + decay_steps = num_training_steps - num_warmup_steps + pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps + decay = lr_range * pct_remaining**power + lr_end + return decay / lr_init # as LambdaLR multiplies by lr_init + + return LambdaDecay(learning_rate, lr_lambda, last_epoch) + + +TYPE_TO_SCHEDULER_FUNCTION = { + SchedulerType.LINEAR: get_linear_schedule_with_warmup, + SchedulerType.COSINE: get_cosine_schedule_with_warmup, + SchedulerType.CONSTANT: get_constant_schedule, + SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, + SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, +} + + +def get_scheduler( + name: Union[str, SchedulerType], + learning_rate: float, + num_warmup_steps: Optional[int] = None, + num_training_steps: Optional[int] = None, + num_cycles: Optional[float] = 0.5, + lr_end: Optional[float] = 1e-7, + power: Optional[float] = 1.0, +): + """ + Unified API to get any scheduler from its name. + Args: + name (`str` or `SchedulerType`): + The name of the scheduler to use. + learning_rate (float) + The initial learning rate. It is a python float number. + num_warmup_steps (`int`, *optional*): + The number of warmup steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + num_training_steps (`int``, *optional*): + The number of training steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + num_cycles (``float``, *optional*): + The number of waves in the cosine scheduler (the defaults is to just decrease from the max value to 0 + following a half-cosine). This is not required by all schedulers (hence the argument being optional) + lr_end (``float``, *optional*): + The end LR in the polynomial scheduler. This is not required by all schedulers (hence the argument + being optional). + power (``float``, *optional*): + The power factor in the polynomial scheduler. This is not required by all schedulers (hence the argument + being optional). 
+ """ + name = SchedulerType(name) + schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] + if name == SchedulerType.CONSTANT: + return schedule_func(learning_rate) + + # All other schedulers require `num_warmup_steps` + if num_warmup_steps is None: + raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") + + if name == SchedulerType.CONSTANT_WITH_WARMUP: + return schedule_func(learning_rate, num_warmup_steps=num_warmup_steps) + + # All other schedulers require `num_training_steps` + if num_training_steps is None: + raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") + + if name == SchedulerType.COSINE: + return schedule_func( + learning_rate, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + ) + + if name == SchedulerType.POLYNOMIAL: + return schedule_func( + learning_rate, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + lr_end=lr_end, + power=power, + ) + + return schedule_func(learning_rate, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) + + +def _secs2timedelta(secs): + """ + convert seconds to hh:mm:ss.msec, msecs rounded to 2 decimals + """ + + msec = int(abs(secs - int(secs)) * 100) + return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}" + + +def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]: + """ + Reformat Trainer metrics values to a human-readable format + Args: + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predict + Returns: + metrics (`Dict[str, float]`): The reformatted metrics + """ + + metrics_copy = metrics.copy() + for k, v in metrics_copy.items(): + if "_mem_" in k: + metrics_copy[k] = f"{ v >> 20 }MB" + elif "_runtime" in k: + metrics_copy[k] = _secs2timedelta(v) + elif k == "total_flos": + metrics_copy[k] = f"{ int(v) >> 30 }GF" + elif isinstance(metrics_copy[k], float): + metrics_copy[k] = round(v, 4) + + return metrics_copy + + +def log_metrics(self, split, metrics): + """ + Log metrics in a specially formatted way + Under distributed environment this is done only for a process with rank 0. + Args: + split (`str`): + Mode/split name: one of `train`, `eval`, `test` + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predictmetrics: metrics dict + """ + if not self.is_world_process_zero(): + return + + logger.info(f"***** {split} metrics *****") + metrics_formatted = self.metrics_format(metrics) + k_width = max(len(str(x)) for x in metrics_formatted.keys()) + v_width = max(len(str(x)) for x in metrics_formatted.values()) + for key in sorted(metrics_formatted.keys()): + logger.info(f" {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}") + + +def save_metrics(self, split, metrics, combined=True): + """ + Save metrics into a json file for that split, e.g. `train_results.json`. + Under distributed environment this is done only for a process with rank 0. + Args: + split (`str`): + Mode/split name: one of `train`, `eval`, `test`, `all` + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predict + combined (`bool`, *optional*, defaults to `True`): + Creates combined metrics by updating `all_results.json` with metrics of this call + To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw + unformatted numbers are saved in the current method. 
+ """ + if not self.is_world_process_zero(): + return + + path = os.path.join(self.args.output_dir, f"{split}_results.json") + with open(path, "w") as f: + json.dump(metrics, f, indent=4, sort_keys=True) + + if combined: + path = os.path.join(self.args.output_dir, "all_results.json") + if os.path.exists(path): + with open(path, "r") as f: + all_metrics = json.load(f) + else: + all_metrics = {} + + all_metrics.update(metrics) + with open(path, "w") as f: + json.dump(all_metrics, f, indent=4, sort_keys=True) + + +def save_state(self): + """ + Saves the Trainer state, since Trainer.save_model saves only the tokenizer with the model + Under distributed environment this is done only for a process with rank 0. + """ + if not self.is_world_process_zero(): + return + + path = os.path.join(self.args.output_dir, "trainer_state.json") + self.state.save_to_json(path) + + +def has_length(dataset): + """ + Checks if the dataset implements __len__() and it doesn't raise an error + """ + try: + return len(dataset) is not None + except (TypeError, ValueError, RuntimeError): + # TypeError: len() of unsized object + return False + + +class TrainerMemoryTracker: + """ + A helper class that tracks cpu and gpu memory. + + This class will silently skip unless `psutil` is available. Install with `pip install psutil`. + + When a stage completes, it can pass metrics dict to update with the memory metrics gathered during this stage. + + Example : + + ```python + self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics) + self._memory_tracker.start() + # code ... + metrics = {"train_runtime": 10.5} + self._memory_tracker.stop_and_update_metrics(metrics) + ``` + + At the moment GPU tracking is only for `paddle`. + + # To understand this class' intricacies please read the documentation of [`~Trainer.log_metrics`]. 
+ """ + + # map trainer methods to metrics prefix + stages = { + "__init__": "init", + "train": "train", + "_inner_training_loop": "train", + "evaluate": "eval", + "predict": "test", + } + + def __init__(self, skip_memory_metrics=False): + + self.skip_memory_metrics = skip_memory_metrics + + if not is_psutil_available(): + # soft dependency on psutil + self.skip_memory_metrics = True + + if self.skip_memory_metrics: + return + + import psutil # noqa + + if is_paddle_cuda_available(): + import paddle + + self.paddle = paddle + self.gpu = {} + else: + self.paddle = None + + self.process = psutil.Process() + + self.cur_stage = None + self.cpu = {} + self.init_reported = False + + def derive_stage(self): + """derives the stage/caller name automatically""" + caller = inspect.currentframe().f_back.f_back.f_code.co_name + if caller in self.stages: + return self.stages[caller] + else: + raise ValueError( + f"was called from {caller}, but only expect to be called from one of {self.stages.keys()}" + ) + + def cpu_mem_used(self): + """get resident set size memory for the current process""" + return self.process.memory_info().rss + + def peak_monitor_func(self): + self.cpu_mem_used_peak = -1 + + while True: + self.cpu_mem_used_peak = max(self.cpu_mem_used(), self.cpu_mem_used_peak) + + # can't sleep or will not catch the peak right (this comment is here on purpose) + # time.sleep(0.001) # 1msec + + if not self.peak_monitoring: + break + + def start(self): + """start tracking for the caller's stage""" + if self.skip_memory_metrics: + return + + stage = self.derive_stage() + # deal with nested calls of eval during train - simply ignore those + if self.cur_stage is not None and self.cur_stage != stage: + return + + self.cur_stage = stage + + gc.collect() + + if self.paddle is not None: + # self.paddle.cuda.reset_peak_memory_stats()? 
+ self.paddle.device.cuda.empty_cache() + + # gpu + if self.paddle is not None: + self.gpu_mem_used_at_start = self.paddle.device.cuda.memory_allocated() + + # cpu + self.cpu_mem_used_at_start = self.cpu_mem_used() + + self.peak_monitoring = True + peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) + peak_monitor_thread.daemon = True + peak_monitor_thread.start() + + def stop(self, stage): + """stop tracking for the passed stage""" + + # deal with nested calls of eval during train - simply ignore those + if self.cur_stage is not None and self.cur_stage != stage: + return + + # this sends a signal to peak_monitor_func to complete its loop + self.peak_monitoring = False + + # first ensure all objects get collected and their memory is freed + gc.collect() + + if self.paddle is not None: + self.paddle.device.cuda.empty_cache() + + # concepts: + # - alloc_delta: the difference of allocated memory between the end and the start + # - peaked_delta: the difference between the peak memory and the current memory + # in order to know how much memory the measured code consumed one needs to sum these two + + # gpu + if self.paddle is not None: + self.gpu_mem_used_now = self.paddle.device.cuda.memory_allocated() + self.gpu_mem_used_peak = self.paddle.device.cuda.max_memory_allocated() + self.gpu[self.cur_stage] = dict( + begin=self.gpu_mem_used_at_start, + end=self.gpu_mem_used_now, + alloc=(self.gpu_mem_used_now - self.gpu_mem_used_at_start), + peaked=max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now), + ) + + # cpu + self.cpu_mem_used_now = self.cpu_mem_used() + self.cpu[self.cur_stage] = dict( + begin=self.cpu_mem_used_at_start, + end=self.cpu_mem_used_now, + alloc=(self.cpu_mem_used_now - self.cpu_mem_used_at_start), + peaked=max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now), + ) + + # reset - cycle finished + self.cur_stage = None + + def update_metrics(self, stage, metrics): + """updates the metrics""" + if self.skip_memory_metrics: + return + + # deal with nested calls of eval during train - simply ignore those + if self.cur_stage is not None and self.cur_stage != stage: + return + + if hasattr(self, "gpu_mem_used_peak"): + metrics["gpu_mem_max_memory_allocated"] = self.gpu_mem_used_peak + metrics["gpu_mem_max_memory_reserved"] = self.paddle.device.cuda.max_memory_reserved() + + # since we don't have a way to return init metrics, we push them into the first of train/val/predict + stages = [stage] + if not self.init_reported: + stages.insert(0, "init") + self.init_reported = True + + for stage in stages: + for t in ["alloc", "peaked"]: + if stage in self.cpu and t in self.cpu[stage]: + metrics[f"{stage}_mem_cpu_{t}_delta"] = self.cpu[stage][t] + if self.paddle is not None and stage in self.gpu and t in self.gpu[stage]: + metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t] + # if we need additional debug info, enable the following + # for t in ["begin", "end"]: + # if stage in self.cpu and t in self.cpu[stage]: + # metrics[f"{stage}_mem_cpu_{t}"] = self.cpu[stage][t] + # if self.paddle is not None and stage in self.gpu and t in self.gpu[stage]: + # metrics[f"{stage}_mem_gpu_{t}"] = self.gpu[stage][t] + + # since memory can be allocated before init, and it might be difficult to track overall + # memory usage, in particular for GPU, let's report memory usage at the point init was called + if stages[0] == "init": + metrics["before_init_mem_cpu"] = self.cpu["init"]["begin"] + if self.paddle is not None: + metrics["before_init_mem_gpu"] = self.gpu["init"]["begin"] + # 
if we also wanted to report any additional memory allocations in between init and + # whatever the next stage was we could also report this: + # if self.cpu["init"]["end"] != self.cpu[stage]["begin"]: + # metrics[f"after_init_mem_cpu_delta"] = self.cpu[stage]["begin"] - self.cpu["init"]["end"] + # if self.paddle is not None and self.gpu["init"]["end"] != self.gpu[stage]["begin"]: + # metrics[f"after_init_mem_gpu_delta"] = self.gpu[stage]["begin"] - self.gpu["init"]["end"] + + def stop_and_update_metrics(self, metrics=None): + """combine stop and metrics update in one call for simpler code""" + if self.skip_memory_metrics: + return + + stage = self.derive_stage() + self.stop(stage) + + # init doesn't have metrics to update so we just save that data for later stages to retrieve + if metrics is not None: + self.update_metrics(stage, metrics) + + +class IterableDatasetShard(IterableDataset): + """ + Wraps a Paddle `IterableDataset` to generate samples for one of the processes only. Instances of this class will + always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x + num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the + first batch that would be too small or loop with indices from the beginning. + On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch size of + 2: + - the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`, `[8, 9]` + - the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`, `[10, 11]` + Args: + dataset (`paddle.io.IterableDataset`): + The batch sampler to split in several shards. + batch_size (`int`, *optional*, defaults to 1): + The size of the batches per shard. + drop_last (`bool`, *optional*, defaults to `False`): + Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the + beginning. + num_processes (`int`, *optional*, defaults to 1): + The number of processes running concurrently. + process_index (`int`, *optional*, defaults to 0): + The index of the current process. + seed (`int`, *optional*, defaults to 0): + A random seed that will be used for the random number generation in + [`~trainer_utils.IterableDatasetShard.set_epoch`]. + """ + + def __init__( + self, + dataset: IterableDataset, + batch_size: int = 1, + drop_last: bool = False, + num_processes: int = 1, + process_index: int = 0, + seed: int = 0, + ): + self.dataset = dataset + self.batch_size = batch_size + self.drop_last = drop_last + self.num_processes = num_processes + self.process_index = process_index + self.seed = seed + self.epoch = 0 + self.num_examples = 0 + + def set_epoch(self, epoch): + self.epoch = epoch + if hasattr(self.dataset, "set_epoch"): + self.dataset.set_epoch(epoch) + + def __iter__(self): + self.num_examples = 0 + # TODO: support generator seed in sampling. 
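+ # The commented-out sketch below would cover datasets that expose a `generator` but no
+ # `set_epoch()` hook, by re-seeding that generator from (seed + epoch) at each iteration.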
+ # + # if ( + # not hasattr(self.dataset, "set_epoch") + # and hasattr(self.dataset, "generator") + # and isinstance(self.dataset.generator, paddle.fluid.Generator) + # ): + # self.dataset.generator.manual_seed(self.seed + self.epoch) + real_batch_size = self.batch_size * self.num_processes + process_slice = range(self.process_index * self.batch_size, (self.process_index + 1) * self.batch_size) + + first_batch = None + current_batch = [] + for element in self.dataset: + self.num_examples += 1 + current_batch.append(element) + # Wait to have a full batch before yielding elements. + if len(current_batch) == real_batch_size: + for i in process_slice: + yield current_batch[i] + if first_batch is None: + first_batch = current_batch.copy() + current_batch = [] + + # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning. + if not self.drop_last and len(current_batch) > 0: + if first_batch is None: + first_batch = current_batch.copy() + while len(current_batch) < real_batch_size: + current_batch += first_batch + for i in process_slice: + yield current_batch[i] + + def __len__(self): + # Will raise an error if the underlying dataset is not sized. + if self.drop_last: + return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size + else: + return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size + + +def find_batch_size(tensors): + """ + Find the first dimension of a tensor in a nested list/tuple/dict of tensors. + """ + if isinstance(tensors, (list, tuple)): + for t in tensors: + result = find_batch_size(t) + if result is not None: + return result + elif isinstance(tensors, (dict, BatchEncoding)): + for key, value in tensors.items(): + result = find_batch_size(value) + if result is not None: + return result + elif isinstance(tensors, paddle.Tensor): + return tensors.shape[0] if len(tensors.shape) >= 1 else None + elif isinstance(tensors, np.ndarray): + return tensors.shape[0] if len(tensors.shape) >= 1 else None + + +class RemoveColumnsCollator: + """Wrap the data collator to remove unused columns before they are passed to the collator.""" + + def __init__( + self, + data_collator, + signature_columns, + logger=None, + model_name: Optional[str] = None, + description: Optional[str] = None, + ): + self.data_collator = data_collator + self.signature_columns = signature_columns + self.logger = logger + self.description = description + self.model_name = model_name + self.message_logged = False + + def _remove_columns(self, feature: dict) -> dict: + if not isinstance(feature, dict): + return feature + if not self.message_logged and self.logger and self.model_name: + ignored_columns = list(set(feature.keys()) - set(self.signature_columns)) + if len(ignored_columns) > 0: + dset_description = "" if self.description is None else f"in the {self.description} set" + self.logger.info( + f"The following columns {dset_description} don't have a corresponding argument in " + f"`{self.model_name}.forward` and have been ignored: {', '.join(ignored_columns)}." + f" If {', '.join(ignored_columns)} are not expected by `{self.model_name}.forward`, " + " you can safely ignore this message." 
+ ) + self.message_logged = True + return {k: v for k, v in feature.items() if k in self.signature_columns} + + def __call__(self, features: List[dict]): + features = [self._remove_columns(feature) for feature in features] + return self.data_collator(features) + + +def set_hyrbid_parallel_seed(basic_seed, dataset_rank, tp_rank, pp_rank=0): + from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker + + random.seed(basic_seed + dataset_rank) + np.random.seed(basic_seed + dataset_rank) + paddle.seed(basic_seed + dataset_rank) + + # local_seed/ global_seed is used to control dropout in ModelParallel + local_seed = basic_seed + 59999 + tp_rank * 10 + pp_rank * 1000 + global_seed = basic_seed + 100003 + dataset_rank + + tracker = get_rng_state_tracker() + + if "global_seed" not in tracker.states_ and global_seed not in tracker.seeds_: + tracker.add("global_seed", global_seed) + if "local_seed" not in tracker.states_ and local_seed not in tracker.seeds_: + tracker.add("local_seed", local_seed) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args.py new file mode 100644 index 000000000..04577306c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args.py @@ -0,0 +1,2053 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py + +import contextlib +import json +import math +import os +import sys +import types +import warnings +from dataclasses import asdict, dataclass, field +from enum import Enum +from typing import Any, Dict, List, Optional + +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet + +from ..utils.log import logger +from .trainer_utils import ( + IntervalStrategy, + OptimizerNames, + SchedulerType, + ShardingOption, +) + +__all__ = [ + "default_logdir", + "TrainingArguments", +] + + +def default_logdir() -> str: + """ + Same default + """ + import socket + from datetime import datetime + + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + return os.path.join("runs", current_time + "_" + socket.gethostname()) + + +@dataclass +class TrainingArguments: + """ + TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop + itself**. + + Using [`PdArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + output_dir (`str`): + The output directory where the model predictions and checkpoints will be written. + overwrite_output_dir (`bool`, *optional*, defaults to `False`): + If `True`, overwrite the content of the output directory. 
Use this to continue training if `output_dir` + points to a checkpoint directory. + do_train (`bool`, *optional*, defaults to `False`): + Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used + by your training/evaluation scripts instead. See the [example + scripts](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples) for more details. + do_eval (`bool`, *optional*): + Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is + different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your + training/evaluation scripts instead. See the [example + scripts](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples) for more details. + do_predict (`bool`, *optional*, defaults to `False`): + Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's + intended to be used by your training/evaluation scripts instead. See the [example + scripts](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples) for more details. + do_export (`bool`, *optional*, defaults to `False`): + Whether to export inference model or not. This argument is not directly used by [`Trainer`], it's + intended to be used by your training/evaluation scripts instead. + evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + The evaluation strategy to adopt during training. Possible values are: + + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `eval_steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. + + prediction_loss_only (`bool`, *optional*, defaults to `False`): + When performing evaluation and generating predictions, only returns the loss. + per_device_train_batch_size (`int`, *optional*, defaults to 8): + The batch size per GPU core/CPU for training. + per_device_eval_batch_size (`int`, *optional*, defaults to 8): + The batch size per GPU core/CPU for evaluation. + gradient_accumulation_steps (`int`, *optional*, defaults to 1): + Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + + + When using gradient accumulation, one step is counted as one step with backward pass. Therefore, logging, + evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples. + + + + eval_accumulation_steps (`int`, *optional*): + Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If + left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but + requires more memory). + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate for [`AdamW`] optimizer. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`] + optimizer. + adam_beta1 (`float`, *optional*, defaults to 0.9): + The beta1 hyperparameter for the [`AdamW`] optimizer. + adam_beta2 (`float`, *optional*, defaults to 0.999): + The beta2 hyperparameter for the [`AdamW`] optimizer. + adam_epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the [`AdamW`] optimizer. + max_grad_norm (`float`, *optional*, defaults to 1.0): + Maximum gradient norm (for gradient clipping). 
+ num_train_epochs(`float`, *optional*, defaults to 1.0): + Total number of training epochs to perform (if not an integer, will perform the decimal part percents of + the last epoch before stopping training). + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + In case of using a finite iterable dataset the training may stop before reaching the set number of steps + when all data is exhausted + lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`): + The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values. + warmup_ratio (`float`, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + warmup_steps (`int`, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`. + num_cycles (`float`, *optional*, defaults to 0.5): + The number of waves in the cosine scheduler. + lr_end (`float`, *optional*, defaults to 1e-7): + The end LR used in the polynomial scheduler. + power (`float`, *optional*, defaults to 1.0): + The power factor used in the polynomial scheduler. + + log_on_each_node (`bool`, *optional*, defaults to `True`): + In multinode distributed training, whether to log using `log_level` once per node, or only on the main + node. + logging_dir (`str`, *optional*): + log directory. Will default to *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***. + logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The logging strategy to adopt during training. Possible values are: + + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. + + logging_first_step (`bool`, *optional*, defaults to `False`): + Whether to log and evaluate the first `global_step` or not. + logging_steps (`int`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. + save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The checkpoint save strategy to adopt during training. Possible values are: + + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + save_steps (`int`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_total_limit (`int`, *optional*): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + `output_dir`. + save_on_each_node (`bool`, *optional*, defaults to `False`): + When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on + the main one. + + This should not be activated when the different nodes use the same storage as the files will be saved with + the same names for each node. + no_cuda (`bool`, *optional*, defaults to `False`): + Whether to not use CUDA even when it is available or not. + seed (`int`, *optional*, defaults to 42): + Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the + [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. 
+ fp16 (`bool`, *optional*, defaults to `False`): + Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. + fp16_opt_level (`str`, *optional*, defaults to 'O1'): + For `fp16` training, AMP optimization level selected in ['O0', 'O1', 'O2']. See details at + https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/amp/auto_cast_cn.html + amp_custom_black_list (`List[str]`, *optional*, defaults to `None`): + The custom black_list. The set of ops that support fp16/bf16 calculation and are considered numerically-dangerous + and whose effects may also be observed in downstream ops. These ops will not be converted to fp16/bf16. + amp_custom_white_list (`List[str]`, *optional*, defaults to `None`): + The custom white_list. It’s the set of ops that support fp16/bf16 calculation and are considered numerically-safe and + performance-critical. These ops will be converted to fp16/bf16. + amp_master_grad (`bool`, *optional*, defaults to `False`): + For amp opt level=’O2’, whether to use float32 weight gradients + for calculations such as gradient clipping, weight decay, and weight updates. If master_grad is enabled, + the weight gradients will be float32 dtype after the backpropagation. Default is False, there is only float16 weight gradients. + Note: only support model parallel and pipeline parallel for now !!! + sharding (`str`, *optional*, defaults to ``): + Whether or not to use Paddle Sharding Data Parallel training (in distributed training + only). The base option should be `stage1`, `stage2` or `stage3` and you can add + CPU-offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3 offload`. + Each stage means: + stage1 : optimizer state segmentation + stage2 : optimizer state + gradient segmentation + stage3 : parameter + gradient + optimizer state segmentation + offload : offload parameters to cpu + sharding_parallel_degree (`int`, *optional*, defaults to `-1`) + Sharding parameter in certain cards group. For example, aussume we use 2 machines each with 8 cards, + then set sharding_parallel_degree=8, sharding will only communication inside machine. + default -1 means sharding parameters between all workers. + tensor_parallel_degree (`int`, *optional*, defaults to `-1`) + Tensor parallelism is parallel technique proposed in (https://arxiv.org/pdf/2104.04473.pdf see 2.3 Tensor Model Parallelism). + This technique splits one transformer layer into multi-cards (For examples, tensor_parallel_degree=4, will split a layer to 4-parts) + tensor_parallel_degree means split the transformer layer to how many parts. + default -1 for not use tensor parallel, Suggest tensor_parallel_degree<=8 for better proformance. + Note, this need model support in source code, currently GPT/BLOOM/LLAMA/BLOOM/CLM/CHATGLM is supported. + pipeline_parallel_degree (`int`, *optional*, defaults to `-1`) + Pipeline parallelism is parallel technique proposed in (https://arxiv.org/pdf/2104.04473.pdf see 2.2 Pipeline Model Parallelism). + Pipeline parallelism assigns multi-transformer layers to different cards, the micro batch data stream passed between cards like pipelines. + pipeline_parallel_degree means split all transformer layers to how many stages. + default -1 for not use pipeline parallel. + Note. this need model support in source code, see llama modeling_pp.py file + sep_parallel_degree (`int`, *optional*, defaults to `-1`)( + The paddle sequence parallel strategy. 
It can reduce the GPU memory of activation to 1/sep, and it is orthogonal to + data parallel, sharding stage1, tensor parallel and pipeline parallel strategy. + ) + context_parallel_degree (`int`, *optional*, defaults to `-1`)( + Context parallelism is a parallel method that segments training data in the sequence dimension. + This method uses Ring FlashAttention to ensure the correctness of the Attention result after segmentation. The complete attention score is obtained through ring communication and iterative updates. + ) + data_parallel_config (`str`, *optional*)( + Some additional configs which affect data parallel performance, we provide some option to config it. + following config is support: + enable_allreduce_avg_in_gradinent_scale, it replace `allreduce_sum + scale` pattern with `allreduce_avg` when scale gradient in data_parallel, which improve the performance. ONLY supported for auto mode now. + gradient_sync_after_accumulate, move gradient sync operations from backward into optimizer step when gradient accumulate enabling, which reduce the sync times to improve performance, but will increase the memory usage. ONLY supported for auto mode now. + tensor_parallel_config (`str`, *optional*)( + Some additional configs which affect model parallel performance, we provide some option to config it. + following config is support: + enable_mp_async_allreduce, it supports all_reduce(dx) overlap with matmul(dw) in ColumnParallelLinear backward when it set True, which can accelerate model parallel performance. + enable_mp_skip_c_identity, it supports skip c_identity in ColumnParallelLinear and RowParallelLinear. It only works when set mp_async_allreduce is True. It can accelerate model parallel further. + enable_mp_fused_linear_param_grad_add, it supports fused_linear_param_grad_add in ColumnParallelLinear (cuda >= 11.6). It only works when mp_async_allreduce is true. It can accelerate model parallel further. + enable_sp_async_reduce_scatter, it supports async reduce_scatter in ColumnSequenceParallelLinear. It only works when set sp_async_reduce_scatter is True. It can accelerate sequence parallel further. + enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly. + sync_param, in optimizer step, use broadcast to sync parameters those attr 'is_distributed' is False. + sync_grad, in optimizer step, use broadcast to sync gradients those attr 'is_distributed' is False. + sync_moment, in optimizer step, use broadcast to sync momentums those attr 'is_distributed' is False. + pipeline_parallel_config (`str`, *optional*)( + Some additional config it highly affect the useage of pipeline parallel, we provide some option to config it. + following config is support: + disable_p2p_cache_shape, if you max sequence length is varying, please set disable_p2p_cache_shape. + disable_partial_send_recv, optmize send speed for tensor parallel. + enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by inner pipeline accumute step. instead of div accumute step on loss directly. + enable_dp_comm_overlap, fuse data parallel gradient communication. + enable_sharding_comm_overlap, fuse sharding stage 1 parallel gradient communication. + enable_release_grads, reduce peak memory usage by releasing gradients after each iteration. The creation of gradients will be postponed until backward propagation of the next iteration. + enable_overlap_p2p_comm, overlap p2p communication with computation. 
+ enable_clear_every_step_cache, clear every step cache for pipeline parallel. + disable_non_batch_p2p_comm, disable batched send/recv in pipeline parallel mode. + sharding_parallel_config (`str`, *optional*)( + Some additional config it highly affect the useage of sharding parallel, we provide some option to config it. + following config is support: + enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation + enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed + enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap and no other sync could be called during the training for broadcast overlap. + enable_stage1_broadcast_overlap, overlap stage1 V1 broadcast with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for broadcast overlap forward compute and no other sync could be called during the training for broadcast overlap. + enable_stage1_allgather_overlap, overlap stage1 V2 allgather with next step forward computation. There are some constraints for the overlap, such as the logging_step should be bigger than 1 for allgather overlap forward compute and no other sync could be called during the training for allgather overlap. + disable_stage1_reduce_avg, replace reduce_avg with original reduce_sum+scale in stage1, which can be used for accuracy verification. + enable_release_grads, reduce peak memory usage by releasing gradients after each iteration. The creation of gradients will be postponed until backward propagation of the next iteration. + recompute (`bool`, *optional*, defaults to `False`): + Recompute the forward pass to calculate gradients. Used for saving memory. + Only support for networks with transformer blocks. + scale_loss (`float`, *optional*, defaults to 32768): + The value of initial scale_loss for fp16. (default: 32768) + local_rank (`int`, *optional*, defaults to -1): + Rank of the process during distributed training. + dataloader_drop_last (`bool`, *optional*, defaults to `False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) + or not. + eval_steps (`int`, *optional*): + Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same + value as `logging_steps` if not set. + max_evaluate_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of evaluation steps to perform. + dataloader_num_workers (`int`, *optional*, defaults to 0): + Number of subprocesses to use for data loading. 0 means that the data will be loaded in the + main process. + past_index (`int`, *optional*, defaults to -1): + Some models like TransformerXL or XLNet can make use of the past hidden states for their predictions. + If this argument is set to a positive int, the `Trainer` will use the corresponding output (usually index 2) as + the past state and feed it to the model at the next training step under the keyword argument `mems`. + run_name (`str`, *optional*): + A descriptor for the run. Typically used for logging. + disable_tqdm (`bool`, *optional*): + Whether or not to disable the tqdm progress bars and table of metrics. 
Will default to `True` if the logging + level is set to warn or lower (default), `False` otherwise. + remove_unused_columns (`bool`, *optional*, defaults to `True`): + If using `datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the + model forward method. + label_names (`List[str]`, *optional*): + The list of keys in your dictionary of inputs that correspond to the labels. + Will eventually default to `["labels"]` except if the model used is one of the `XxxForQuestionAnswering` in + which case it will default to `["start_positions", "end_positions"]`. + load_best_model_at_end (`bool`, *optional*, defaults to `False`): + Whether or not to load the best model found during training at the end of training. + + + + When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in the case + it is "steps", `save_steps` must be a round multiple of `eval_steps`. + + + + metric_for_best_model (`str`, *optional*): + Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different + models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. Will + default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss). + + If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if + your metric is better when lower. + greater_is_better (`bool`, *optional*): + Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better models + should have a greater metric or not. Will default to: + + - `True` if `metric_for_best_model` is set to a value that isn't `"loss"` or `"eval_loss"`. + - `False` if `metric_for_best_model` is not set, or set to `"loss"` or `"eval_loss"`. + ignore_data_skip (`bool`, *optional*, defaults to `False`): + When resuming training, whether or not to skip the epochs and batches to get the data loading at the same + stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step + can take a long time) but will not yield the same results as the interrupted training would have. + optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw"`): + The optimizer to use: adamw, or adafactor. + length_column_name (`str`, *optional*, defaults to `"length"`): + Column name for precomputed lengths. If the column exists, grouping by length will use these values rather + than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an + instance of `Dataset`. + report_to (`str` or `List[str]`, *optional*, defaults to `"visualdl"`): + The list of integrations to report the results and logs to. + Supported platforms are `"visualdl"`/`"wandb"`/`"tensorboard"`. + `"none"` for no integrations. + ddp_find_unused_parameters (`bool`, *optional*): + When using distributed training, the value of the flag `find_unused_parameters` passed to + `paddle.DataParallel`. Will default to `False` if recompute is used, `True` otherwise. + wandb_api_key (`str`, *optional*): + Weights & Biases (WandB) API key(s) for authentication with the WandB service. + resume_from_checkpoint (`str`, *optional*): + The path to a folder with a valid checkpoint for your model. This argument is not directly used by + [`Trainer`], it's intended to be used by your training/evaluation scripts instead. 
See the [example + scripts](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples) for more details. + flatten_param_grads (`bool`, *optional*): + Whether use flatten_param_grads method in optimizer, only used on NPU devices. Default is `False`. + skip_profile_timer (`bool`, *optional*): + Whether skip profile timer, timer will record time usage of forward/ backward/ step, etc. + distributed_dataloader (`bool`, *optional*): + Whether to use distributed dataloader. Default is `False`. + release_grads (`bool`, *optional*): + Whether to release gradients during training. Default is `False`. + """ + + output_dir: str = field( + metadata={"help": "The output directory where the model predictions and checkpoints will be written."}, + ) + overwrite_output_dir: bool = field( + default=False, + metadata={ + "help": ( + "Overwrite the content of the output directory. " + "Use this to continue training if output_dir points to a checkpoint directory." + ) + }, + ) + + do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) + do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) + do_export: bool = field(default=False, metadata={"help": "Whether to export infernece model."}) + evaluation_strategy: IntervalStrategy = field( + default="no", + metadata={"help": "The evaluation strategy to use."}, + ) + prediction_loss_only: bool = field( + default=False, + metadata={"help": "When performing evaluation and predictions, only returns the loss."}, + ) + + per_device_train_batch_size: int = field(default=8, metadata={"help": "Batch size per GPU core/CPU for training."}) + per_device_eval_batch_size: int = field( + default=8, metadata={"help": "Batch size per GPU core/CPU for evaluation."} + ) + + gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + eval_accumulation_steps: Optional[int] = field( + default=None, + metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."}, + ) + + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) + weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."}) + adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) + adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) + adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) + max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) + + num_train_epochs: float = field(default=1.0, metadata={"help": "Total number of training epochs to perform."}) + max_steps: int = field( + default=-1, + metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, + ) + lr_scheduler_type: str = field( + default="linear", + metadata={"help": "The scheduler type to use. 
suppor linear, cosine, constant, constant_with_warmup"}, + ) + warmup_ratio: float = field( + default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."} + ) + warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) + num_cycles: float = field(default=0.5, metadata={"help": "The number of waves in the cosine scheduler."}) + lr_end: float = field(default=1e-7, metadata={"help": "The end LR in the polynomial scheduler."}) + power: float = field(default=1.0, metadata={"help": "The power factor in the polynomial scheduler."}) + + log_on_each_node: bool = field( + default=True, + metadata={ + "help": "When doing a multinode distributed training, whether to log once per node or just once on the main node." + }, + ) + logging_dir: Optional[str] = field(default=None, metadata={"help": "VisualDL log dir."}) + logging_strategy: IntervalStrategy = field( + default="steps", + metadata={"help": "The logging strategy to use."}, + ) + logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) + logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."}) + + save_strategy: IntervalStrategy = field( + default="steps", + metadata={"help": "The checkpoint save strategy to use."}, + ) + save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."}) + save_total_limit: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Limit the total amount of checkpoints. " + "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints" + ) + }, + ) + save_on_each_node: bool = field( + default=False, + metadata={ + "help": "When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one" + }, + ) + no_cuda: bool = field(default=False, metadata={"help": "Do not use CUDA even when it is available"}) + seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) + + bf16: bool = field( + default=False, + metadata={ + "help": ( + "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA" + " architecture or using CPU (no_cuda). This is an experimental API and it may change." + ) + }, + ) + fp16: bool = field( + default=False, + metadata={"help": "Whether to use fp16 (mixed) precision instead of 32-bit"}, + ) + fp16_opt_level: str = field( + default="O1", + metadata={ + "help": ( + "For fp16: AMP optimization level selected in ['O0', 'O1', and 'O2']. " + "See details at https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/amp/auto_cast_cn.html" + ) + }, + ) + amp_master_grad: bool = field( + default=False, + metadata={ + "help": "amp_master_grad (bool, optional) – For amp opt level=’O2’, whether to use float32 weight gradients " + " for calculations such as gradient clipping, weight decay, and weight updates. If master_grad is enabled," + " the weight gradients will be float32 dtype after the backpropagation. Default is False, there is only float16 weight gradients." + "Note: only support model parallel and pipeline parallel for now !!!" + }, + ) + bf16_full_eval: bool = field( + default=False, + metadata={ + "help": ( + "Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may" + " change." 
+ ) + }, + ) + fp16_full_eval: bool = field( + default=False, + metadata={"help": "Whether to use full float16 evaluation instead of 32-bit"}, + ) + + amp_custom_black_list: Optional[List[str]] = field( + default=None, + metadata={ + "help": "The set of ops that support fp16/bf16 calculation and are considered numerically-dangerous and whose effects may also be observed in downstream ops." + }, + ) + amp_custom_white_list: Optional[List[str]] = field( + default=None, + metadata={ + "help": "The set of ops that support fp16/bf16 calculation and are considered numerically-safe and performance-critical. These ops will be converted to fp16/bf16." + }, + ) + + sharding: str = field( + default="", + metadata={ + "help": ( + "Whether or not to use Paddle Sharding Data Parallel training (in distributed training" + " only). The base option should be `stage1`, `stage2` or `stage3` and you can add" + " CPU-offload to `stage2` or `stage3` like this: `stage2 offload` or `stage3" + " offload`. " + ) + }, + ) + sharding_degree: int = field( # Alias for sharding_parallel_degree + default=-1, + metadata={"help": ("@deprecated Please use sharding_parallel_degree. ")}, + ) + sharding_parallel_degree: int = field( + default=-1, + metadata={ + "help": ( + "Sharding parameter in certain cards group. For example, assume we use 2 machines each with 8 cards; " + "if we set sharding_degree=8, sharding only communicates inside each machine. " + "The default -1 means sharding parameters across all workers." + ) + }, + ) + save_sharded_model: bool = field( + default=False, + metadata={ + "help": ( + "When using sharding stage1 with save_sharded_model True, each sharding rank only saves its part of the model. This reduces the time needed to save the model." + ) + }, + ) + + load_sharded_model: bool = field( + default=False, + metadata={ + "help": ( + "When using sharding stage1 with load_sharded_model True, the sharded model is loaded. The sharded model is produced by setting save_sharded_model True." + ) + }, + ) + tensor_parallel_degree: int = field( + default=-1, + metadata={ + "help": ( + "Tensor parallelism is the parallel technique proposed in https://arxiv.org/pdf/2104.04473.pdf (see 2.3 Tensor Model Parallelism). " + "This technique splits one transformer layer across multiple cards (for example, tensor_parallel_degree=4 splits a layer into 4 parts). " + "tensor_parallel_degree is the number of parts each transformer layer is split into. " + "The default -1 disables tensor parallelism; tensor_parallel_degree<=8 is suggested for better performance. " + "Note: this needs model support in the source code; currently GPT/BLOOM/LLAMA/CLM/CHATGLM are supported. " + ) + }, + ) + pipeline_parallel_degree: int = field( + default=-1, + metadata={ + "help": ( + "Pipeline parallelism is the parallel technique proposed in https://arxiv.org/pdf/2104.04473.pdf (see 2.2 Pipeline Model Parallelism). " + "Pipeline parallelism assigns groups of transformer layers to different cards, and micro-batch data is streamed between cards like a pipeline. " + "pipeline_parallel_degree is the number of stages all transformer layers are split into. " + "The default -1 disables pipeline parallelism. " + "Note: this needs model support in the source code; see the llama modeling_pp.py file." + ) + }, + ) + sep_parallel_degree: int = field( + default=-1, + metadata={ + "help": ( + "The paddle sequence parallel strategy. It can reduce the GPU memory of activation to 1/sep, and it is orthogonal to " + "data parallel, sharding stage1, tensor parallel and pipeline parallel strategy. 
" + ) + }, + ) + context_parallel_degree: int = field( + default=-1, + metadata={ + "help": ( + "The paddle context parallel strategy. It can reduce the GPU memory of activation to 1/cp, and it is orthogonal to " + "data parallel, sharding stage1, tensor parallel and pipeline parallel strategy. " + ) + }, + ) + data_parallel_config: str = field( + default="", + metadata={ + "help": ( + "Some additional configs which affect data parallel performance, we provide some option to config it." + "following config is support:\n" + "enable_allreduce_avg_in_gradinent_scale, it replace `allreduce_sum + scale` pattern with `allreduce_avg` when scale gradient in data_parallel, which improve the performance. ONLY supported for auto mode now. \n" + "gradient_sync_after_accumulate, move gradient sync operations from backward into optimizer step when gradient accumulate enabling, which reduce the sync times to improve performance, but will increase the memory usage. ONLY supported for auto mode now. \n" + ) + }, + ) + sequence_parallel_config: str = field( + default="", + metadata={ + "help": ( + "Some additional configs which affect sequence parallel performance, we provide some option to config it." + "following config is support:\n" + "enable_allreduce_avg_in_gradinent_scale, it replace `allreduce_sum + scale` pattern with `allreduce_avg` when scale gradient in sequence_parallel, which improve the performance. ONLY supported for auto mode now. \n" + ) + }, + ) + tensor_parallel_config: str = field( + default="", + metadata={ + "help": ( + "Some additional configs which affect model parallel performance, we provide some option to config it." + "following config is support:\n" + "enable_mp_async_allreduce, it supports all_reduce(dx) overlap with matmul(dw) in ColumnParallelLinear backward when it set True, which can accelerate model parallel performance. \n" + "enable_mp_skip_c_identity, it supports skip c_identity in ColumnParallelLinear and RowParallelLinear. It only works when set mp_async_allreduce is True. It can accelerate model parallel further.\n" + "enable_mp_fused_linear_param_grad_add, it supports fused_linear_param_grad_add in ColumnParallelLinear (cuda >= 11.6). It only works when mp_async_allreduce is true. It can accelerate model parallel further.\n" + "enable_sp_async_reduce_scatter, it supports async reduce_scatter in ColumnSequenceParallelLinear. It only works when set sp_async_reduce_scatter is True. It can accelerate sequence parallel further.\n" + "enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by accumute step. instead of div accumute step on loss directly.\n" + "sync_param, in optimizer step, use broadcast to sync parameters those attr 'is_distributed' is False.\n" + "sync_grad, in optimizer step, use broadcast to sync gradients those attr 'is_distributed' is False.\n" + "sync_moment, in optimizer step, use broadcast to sync momentums those attr 'is_distributed' is False.\n" + ) + }, + ) + pipeline_parallel_config: str = field( + default="", + metadata={ + "help": ( + "Some additional config it highly affect the useage of pipeline parallel, we provide some option to config it." + "following config is support:\n" + "disable_p2p_cache_shape, if you max sequence length is varying, please set disable_p2p_cache_shape. \n" + "disable_partial_send_recv, optmize send speed for tensor parallel.\n" + "enable_delay_scale_loss, accumulate gradients until optimizer step, all gradients div by inner pipeline accumute step. 
instead of dividing the loss by the accumulate step directly.\n" + "enable_dp_comm_overlap, fuse data parallel gradient communication. \n" + "enable_sharding_comm_overlap, fuse sharding stage 1 parallel gradient communication. \n" + "enable_overlap_p2p_comm, overlap p2p communication with computation. \n" + "enable_clear_every_step_cache, clear every step cache for pipeline parallel. \n" + "disable_batch_p2p_comm, disable batched send/recv in pipeline parallel mode. \n" + "enable_split_backward, can only be used in StaticGraph-AutoParallel! It splits the `backward` program into `backward_b` and `backward_w` to decrease the bubble in VPP pipeline mode when `acc_step == pp_degree`. It increases memory usage! \n" + ) + }, + ) + sharding_parallel_config: str = field( + default="", + metadata={ + "help": ( + "Some additional configs that highly affect the usage of sharding parallel; we provide some options to configure it. " + "The following configs are supported: \n" + "enable_stage1_tensor_fusion, fuse small tensors into big tensor chunks to accelerate communications, may increase memory occupation\n" + "enable_stage1_overlap, fuse small tensors into big tensor chunks to accelerate communications and do communication overlap with backward computation, may harm the backward speed\n" + "disable_stage1_reduce_avg, replace reduce_avg with the original reduce_sum+scale in stage1, which can be used for accuracy verification.\n" + "enable_stage2_overlap, overlap stage2 NCCL communication with computation. There are some constraints for the overlap, such as logging_steps should be bigger than 1 for broadcast overlap and no other sync may be called during training for broadcast overlap\n" + "enable_stage1_broadcast_overlap, overlap stage1 V1 broadcast with next step forward computation. There are some constraints for the overlap, such as logging_steps should be bigger than 1 for broadcast overlap forward compute and no other sync may be called during training for broadcast overlap.\n" + "enable_stage1_allgather_overlap, overlap stage1 V2 allgather with next step forward computation. There are some constraints for the overlap, such as logging_steps should be bigger than 1 for allgather overlap forward compute and no other sync may be called during training for allgather overlap." + ) + }, + ) + hybrid_parallel_topo_order: str = field( + default=None, + metadata={ + "help": ( + "In hybrid parallelism, the order of communication groups may affect efficiency.\n" + "Following options are supported:\n" + "- pp_first. the topo order is dp, pp, sharding, mp \n" + "- sharding_first. the topo order is dp, sharding, pp, mp \n" + "Default is None, which means pp_first" + ) + }, + ) + recompute: bool = field( + default=False, + metadata={ + "help": "Recompute the forward pass to calculate gradients. Used for saving memory. " + "Only supported for networks with transformer blocks." + }, + ) + + scale_loss: float = field(default=2**15, metadata={"help": "The value of the initial scale_loss for fp16."}) + + minimum_eval_times: int = field( + default=None, + metadata={ + "help": "If evaluating every eval_steps would result in fewer than minimum_eval_times evaluations, eval_steps is overridden so that at least minimum_eval_times evaluations are performed."
+ }, + ) + + local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) + + dataloader_drop_last: bool = field( + default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} + ) + eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."}) + max_evaluate_steps: int = field( + default=-1, metadata={"help": "If set to a positive number, the total number of evaluation steps to perform."} + ) + dataloader_num_workers: int = field( + default=0, + metadata={ + "help": "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + }, + ) + + past_index: int = field( + default=-1, + metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."}, + ) + + run_name: Optional[str] = field(default=None, metadata={"help": "An optional descriptor for the run."}) + + device: Optional[str] = field(default="gpu", metadata={"help": "select cpu, gpu, xpu, npu devices."}) + + disable_tqdm: Optional[bool] = field( + default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."} + ) + + remove_unused_columns: Optional[bool] = field( + default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} + ) + + label_names: Optional[List[str]] = field( + default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."} + ) + + load_best_model_at_end: Optional[bool] = field( + default=False, + metadata={"help": "Whether or not to load the best model found during training at the end of training."}, + ) + metric_for_best_model: Optional[str] = field( + default=None, metadata={"help": "The metric to use to compare two different models."} + ) + greater_is_better: Optional[bool] = field( + default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."} + ) + ignore_data_skip: bool = field( + default=False, + metadata={ + "help": "When resuming training, whether or not to skip the first epochs and batches to get to the same training data." + }, + ) + optim: str = field( + default="adamw", + metadata={"help": "The optimizer to use."}, + ) + report_to: Optional[List[str]] = field( + default=None, metadata={"help": "The list of integrations to report the results and logs to."} + ) + ddp_find_unused_parameters: Optional[bool] = field( + default=None, + metadata={ + "help": ( + "When using distributed training, the value of the flag `find_unused_parameters` passed to " + "`DataParallel`." 
+ ) + }, + ) + wandb_api_key: Optional[str] = field( + default=None, + metadata={"help": "Weights & Biases (WandB) API key(s) for authentication with the WandB service."}, + ) + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "The path to a folder with a valid checkpoint for your model."}, + ) + skip_memory_metrics: bool = field( + default=True, metadata={"help": "Whether or not to skip adding of memory profiler reports to metrics."} + ) + flatten_param_grads: Optional[bool] = field( + default=False, + metadata={"help": "Whether use flatten_param_grads method in optimizer, only used on NPU devices."}, + ) + lazy_data_processing: Optional[bool] = field( + default=True, + metadata={"help": "Whether use lazy data processing."}, + ) + use_async_save: Optional[bool] = field( + default=False, + metadata={"help": "Whether to use async_save instead of paddle.save."}, + ) + skip_profile_timer: Optional[bool] = field( + default=True, + metadata={"help": "enable framework timer, will output timeline informatoin in logging and visualdl."}, + ) + distributed_dataloader: Optional[bool] = field( + default=False, metadata={"help": "Whether to use distributed dataloader."} + ) + unified_checkpoint: Optional[bool] = field( + default=False, + metadata={"help": "Whether to unify hybrid parallel checkpoint."}, + ) + to_static: Optional[bool] = field( + default=False, + metadata={"help": ("Whether to train model under static mode by jit.to_static or distributed.to_static.")}, + ) + unified_checkpoint_config: Optional[str] = field( + default="", + metadata={ + "help": ( + "Configs to unify hybrid parallel checkpoint.\n" + "Following options are supports:\n" + "- skip_save_model_weight: do not save model weights when the masters weight exist\n" + "- master_weight_compatible: 1. if the master weights exist, only load when needed\n" + " 2. if master weights does not exist, convert model weights to master weights when needed\n" + "- async_save: enable asynchronous saving checkpoints to disk\n" + "- enable_all_options: enable all optimization configurations\n" + ) + }, + ) + ignore_load_lr_and_optim: Optional[bool] = field( + default=False, + metadata={"help": "whether to ignore load optimizer and scheduler."}, + ) + ignore_save_lr_and_optim: Optional[bool] = field( + default=False, + metadata={"help": "whether to ignore save optimizer and scheduler."}, + ) + force_reshard_pp: Optional[bool] = field( + default=False, + metadata={"help": "reshard pp even if pp degree in the model and pp degree in script match"}, + ) + enable_auto_parallel: Optional[bool] = field( + default=False, + metadata={"help": "whether to run distributed training in auto parallel mode"}, + ) + use_expert_parallel: Optional[bool] = field( + default=False, + metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"}, + ) + release_grads: Optional[bool] = field( + default=False, metadata={"help": "Whether to release gradients during training. 
Default is `False`."} + ) + + def __post_init__(self): + env_local_rank = int(os.environ.get("PADDLE_RANK_IN_NODE", -1)) + if env_local_rank != -1 and env_local_rank != self.local_rank and paddle.distributed.get_world_size() > 1: + self.local_rank = env_local_rank + + # NOTE(gongenlei): new add, disable sharding when we have only single gpu + if paddle.distributed.get_world_size() <= 1: + self.sharding = "" + self.sharding_degree = -1 + self.sharding_parallel_degree = -1 + self.tensor_parallel_degree = -1 + self.pipeline_parallel_degree = -1 + + # convert to int + self.log_level = -1 + self.log_level_replica = -1 + + # expand paths, if not os.makedirs("~/bar") will make directory + # in the current directory instead of the actual home + if self.output_dir is not None: + self.output_dir = os.path.expanduser(self.output_dir) + if self.logging_dir is None and self.output_dir is not None: + self.logging_dir = os.path.join(self.output_dir, default_logdir()) + if self.logging_dir is not None: + self.logging_dir = os.path.expanduser(self.logging_dir) + + if self.disable_tqdm is None: + self.disable_tqdm = False # logger.getEffectiveLevel() > logging.WARN + + self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) + self.logging_strategy = IntervalStrategy(self.logging_strategy) + self.save_strategy = IntervalStrategy(self.save_strategy) + + self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) + if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO: + self.do_eval = True + + if self.do_eval and self.evaluation_strategy == IntervalStrategy.NO: + logger.warning( + "evaluation_strategy reset to IntervalStrategy.STEPS for do_eval is True. you can also set evaluation_strategy='epoch'." + ) + self.evaluation_strategy = IntervalStrategy.STEPS + + # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero + if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): + if self.logging_steps > 0: + logger.info(f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}") + self.eval_steps = self.logging_steps + else: + raise ValueError( + f"evaluation strategy {self.evaluation_strategy} requires either non-zero --eval_steps or --logging_steps" + ) + + # logging_steps must be non-zero for logging_strategy that is other than 'no' + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: + raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") + + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. + if self.load_best_model_at_end: + if self.evaluation_strategy != self.save_strategy: + raise ValueError( + "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " + f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}" + ) + if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." 
+ ) + + if self.load_best_model_at_end and self.metric_for_best_model is None: + self.metric_for_best_model = "loss" + if self.greater_is_better is None and self.metric_for_best_model is not None: + self.greater_is_better = self.metric_for_best_model not in ["loss", "eval_loss"] + if self.run_name is None: + self.run_name = self.output_dir + + if self.fp16 and self.bf16: + raise ValueError("At most one of fp16 and bf16 can be True, but not both") + + if self.fp16_full_eval and self.bf16_full_eval: + raise ValueError("At most one of fp16 and bf16 can be True for full eval, but not both") + + self.optim = OptimizerNames(self.optim) + + self.use_hybrid_parallel = False + + if isinstance(self.sharding, bool): + self.sharding = "stage1" if self.sharding else "" + if isinstance(self.sharding, str): + self.sharding = [ShardingOption(s) for s in self.sharding.split()] + if self.sharding == [ShardingOption.OFFLOAD]: + raise ValueError( + "`--sharding offload` can't work on its own. It needs to be added to `--sharding stage2` or " + '`--sharding stage3`. For example, `--sharding "stage2 offload"`.' + ) + elif len(self.sharding) > (ShardingOption.OFFLOAD in self.sharding) + 1: + raise ValueError("`--sharding` recived too many arguments.") + + if self.sharding_degree > 0: + warnings.warn("`sharding_degree` is deprecated, please use `sharding_parallel_degree`") + self.sharding_parallel_degree = max(self.sharding_degree, self.sharding_parallel_degree) + + delattr(self, "sharding_degree") + + if len(self.sharding) == 0 and self.sharding_parallel_degree > 0: + warnings.warn("`--sharding_parallel_degree` is useful only when `--sharding` is specified.") + + world_size = paddle.distributed.get_world_size() + + if world_size > 1: + tensor_parallel_degree = max(self.tensor_parallel_degree, 1) + sep_parallel_degree = max(self.sep_parallel_degree, 1) + context_parallel_degree = max(self.context_parallel_degree, 1) + pipeline_parallel_degree = max(self.pipeline_parallel_degree, 1) + + assert ( + world_size % (self.tensor_parallel_degree * self.pipeline_parallel_degree) == 0 + ), f"Total world_size:{world_size} shoule be devided by tensor_parallel_degree: {self.tensor_parallel_degree} and pipeline_parallel_degree: {self.pipeline_parallel_degree}." + + assert not ( + sep_parallel_degree > 1 and context_parallel_degree > 1 + ), f"sep parallel and context parallel cannot be used together, sep_parallel_degree:{sep_parallel_degree}, context_parallel_degree:{context_parallel_degree}." 
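# Illustrative sketch with hypothetical values (not taken from this patch): the degree arithmetic
# below is easier to follow with concrete numbers. Assume a 16-card job with
# tensor_parallel_degree=2, pipeline_parallel_degree=2, sharding enabled, and
# sharding_parallel_degree left at its default of -1:
#
#     world_size = 16
#     tensor_parallel_degree = 2
#     pipeline_parallel_degree = 2
#     sep_parallel_degree = 1        # not used in this example
#     context_parallel_degree = 1    # not used in this example
#     # sharding takes whatever is left over when it is enabled but its degree is unset (-1):
#     sharding_parallel_degree = 16 // (2 * 1 * 1 * 2)        # -> 4
#     # data parallel takes the remainder once sharding is fixed:
#     data_parallel_degree = 16 // (4 * 2 * 1 * 1 * 2)        # -> 1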
+ + if self.sharding_parallel_degree == -1: + if len(self.sharding) > 0: + self.sharding_parallel_degree = world_size // ( + tensor_parallel_degree + * sep_parallel_degree + * context_parallel_degree + * pipeline_parallel_degree + ) + + sharding_parallel_degree = max(self.sharding_parallel_degree, 1) + if sharding_parallel_degree == 1 and len(self.sharding) > 0: + logger.warning("sharding_parallel_degree=1 means no sharding, please set sharding to empty!") + self.sharding = [] + + self.data_parallel_degree = world_size // ( + sharding_parallel_degree + * tensor_parallel_degree + * sep_parallel_degree + * context_parallel_degree + * pipeline_parallel_degree + ) + + if ( + sharding_parallel_degree > 1 + or tensor_parallel_degree > 1 + or pipeline_parallel_degree > 1 + or self.sep_parallel_degree > 1 + or self.context_parallel_degree > 1 + ): + self.use_hybrid_parallel = True + self.sharding_parallel_degree = sharding_parallel_degree + self.tensor_parallel_degree = tensor_parallel_degree + self.pipeline_parallel_degree = pipeline_parallel_degree + self.sep_parallel_degree = sep_parallel_degree + self.context_parallel_degree = context_parallel_degree + + if not self.use_hybrid_parallel: + self.sharding = [] + self.sharding_parallel_degree = -1 + self.tensor_parallel_degree = -1 + self.pipeline_parallel_degree = -1 + self.sep_parallel_degree = -1 + self.context_parallel_degree = -1 + + if self.hybrid_parallel_topo_order is None: + self.hybrid_parallel_topo_order = "pp_first" + assert self.hybrid_parallel_topo_order in ["pp_first", "sharding_first"] + + if self.use_hybrid_parallel and self.enable_auto_parallel: + self.use_hybrid_parallel = False + + if self.to_static: + assert world_size == 1 or self.enable_auto_parallel, ( + "It's not supported for training in static mode except the following cases : " + "1. world_size == 1, which means single-card training while no parallelism is used; " + "2. enable_auto_parallel is set to True, which means the training will be executed in static mode of auto parallel." 
+ ) + + if self.distributed_dataloader and not (self.tensor_parallel_degree > 1 or self.pipeline_parallel_degree > 1): + warnings.warn("We set `distributed_dataloader` to False if tp_degree <= 1 and pp_degree <= 1") + self.distributed_dataloader = False + + if self.amp_master_grad: + if not (self.bf16 or self.fp16): + logger.warning("set amp_master_grad to false since amp is disabled.") + self.amp_master_grad = False + + def split_parallel_config(parallel_config): + if "," in parallel_config: + parallel_config = set(parallel_config.split(",")) + else: + parallel_config = set(parallel_config.split(" ")) + return parallel_config + + # use_hybrid_parallel + if self.use_hybrid_parallel: + + if ShardingOption.OFFLOAD in self.sharding: + warnings.warn("`offload` is not supported NOW!") + + if self.pipeline_parallel_degree > 1: + if ShardingOption.FULL_SHARD in self.sharding or ShardingOption.SHARD_GRAD_OP in self.sharding: + raise ValueError( + "pipeline parallel is not compatible for sharding stage2 or stage3, please using sharding stage1" + ) + + # TODO use paddle.distributed.is_initialized() after paddle 2.4rc + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): + strategy = fleet.DistributedStrategy() + assert self.data_parallel_config == "", "data_parallle_config is not supported in hybrid parallel" + if self.pipeline_parallel_degree > 1: + pipeline_parallel_config = split_parallel_config(self.pipeline_parallel_config) + for x in pipeline_parallel_config: + if len(x) > 0: + if x not in [ + "disable_p2p_cache_shape", + "disable_partial_send_recv", + "enable_delay_scale_loss", + "enable_dp_comm_overlap", + "enable_sharding_comm_overlap", + "enable_timer", + "enable_release_grads", + "enable_dp_comm_overlap", + "enable_clear_every_step_cache", + "enable_overlap_p2p_comm", + "disable_batch_p2p_comm", + ]: + raise ValueError( + f"Found unknown pipeline mode config {x}, accpet config is disable_p2p_cache_shape, disable_partial_send_recv." + ) + + strategy.pipeline_configs = { + "accumulate_steps": self.gradient_accumulation_steps, + "micro_batch_size": self.per_device_train_batch_size, + "enable_partial_send_recv": "disable_partial_send_recv" not in pipeline_parallel_config, + "p2p_cache_shape": False if "disable_p2p_cache_shape" in pipeline_parallel_config else True, + # "delay_scale_loss": True, Fix ME + } + logger.info(f"PP configs:{strategy.pipeline_configs}, use master_grad: {self.amp_master_grad}") + + using_comm_overlap = ( + "enable_sharding_comm_overlap" in pipeline_parallel_config + or "enable_dp_comm_overlap" in pipeline_parallel_config + ) + enable_dp_comm_overlap = using_comm_overlap and self.data_parallel_degree > 1 + enable_sharding_comm_overlap = using_comm_overlap and self.sharding_parallel_degree > 1 + assert not ( + enable_dp_comm_overlap and enable_sharding_comm_overlap + ), "dp_comm_overlap and sharding_comm_overlap cannot be enabled at the same time" + + if enable_sharding_comm_overlap and not self.amp_master_grad: + raise ValueError( + "If `enable_sharding_comm_overlap` in pipeline_parallel_configs, `amp_master_grad` must be True." 
+ ) + + dygraph_pp_configs = { + "delay_scale_loss": True if "enable_delay_scale_loss" in pipeline_parallel_config else False, + "dp_comm_overlap": enable_dp_comm_overlap, + "sharding_comm_overlap": enable_sharding_comm_overlap, + "enable_timer": "enable_timer" in pipeline_parallel_config, + "release_gradients": "enable_release_grads" in pipeline_parallel_config, + "overlap_p2p_comm": "enable_overlap_p2p_comm" in pipeline_parallel_config, + "clear_every_step_cache": "enable_clear_every_step_cache" in pipeline_parallel_config, + "use_batch_p2p_comm": "disable_batch_p2p_comm" not in pipeline_parallel_config, + } + if dygraph_pp_configs["dp_comm_overlap"]: + raise ValueError("overlap has accuracy issue") # TODO: fix `overalap` + `delay_scale` issue + + if self.do_eval: + if ( + self.per_device_train_batch_size * self.gradient_accumulation_steps + != self.per_device_eval_batch_size + ): + logger.warning( + "In pipeline model, the evaluation also shares same setting with training. " + "We will enforce that per_device_eval_batch_size=per_device_train_batch_size * gradient_accumulation_steps." + ) + + self.per_device_eval_batch_size = ( + self.per_device_train_batch_size * self.gradient_accumulation_steps + ) + + if self.tensor_parallel_degree > 1: + strategy.tensor_parallel_configs = {"tensor_init_seed": self.seed} + + mp_config = split_parallel_config(self.tensor_parallel_config) + + for x in mp_config: + if len(x) > 0: + if x not in [ + "enable_mp_async_allreduce", + "enable_mp_skip_c_identity", + "enable_mp_fused_linear_param_grad_add", + "enable_sp_async_reduce_scatter", + "enable_delay_scale_loss", + "sync_param", + "sync_grad", + "sync_moment", + ]: + raise ValueError( + f"Found unknown tensor parallell config {x}, " + f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity, enable_mp_fused_linear_param_grad_add, enable_sp_async_reduce_scatter, enable_delay_scale_loss, sync_param, sync_grad and sync_moment." + ) + try: + if "enable_mp_async_allreduce" in mp_config: + strategy.hybrid_configs["mp_configs"].mp_async_allreduce = True + if "enable_mp_skip_c_identity" in mp_config: + strategy.hybrid_configs["mp_configs"].mp_skip_c_identity = True + if "enable_mp_fused_linear_param_grad_add" in mp_config: + strategy.hybrid_configs["mp_configs"].mp_fused_linear_param_grad_add = True + else: + if "enable_mp_skip_c_identity" in mp_config: + warnings.warn( + "enable_mp_skip_c_identity only works with enable_mp_async_allreduce. It will not work." + ) + if "enable_mp_fused_linear_param_grad_add" in mp_config: + warnings.warn( + "enable_mp_fused_linear_param_grad_add only works with enable_mp_async_allreduce. It will not work." + ) + if "enable_sp_async_reduce_scatter" in mp_config: + strategy.hybrid_configs["mp_configs"].sp_async_reduce_scatter = True + + sync_param = "sync_param" in mp_config + sync_grad = "sync_grad" in mp_config + sync_moment = "sync_moment" in mp_config + + # sync_param_name = [""] matches any parameter name. + # If sync_param, sync_grad and sync_moment are not set, the default value in Paddle is : + # sync_param = True, sync_grad = False, sync_moment = False, sync_param_name = ["embedding", "layer_norm", ".b_"]. 
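# Illustrative sketch with a hypothetical flag value: given
#     --tensor_parallel_config "enable_mp_async_allreduce sync_param sync_moment"
# split_parallel_config() above yields
#     mp_config = {"enable_mp_async_allreduce", "sync_param", "sync_moment"}
# so the checks below resolve to sync_param=True, sync_grad=False, sync_moment=True,
# and only the corresponding mp_configs attributes are switched on.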
+ + if sync_param or sync_grad or sync_moment: + logger.info("setting sync_param_name") + strategy.sync_param_name = [""] + + if sync_param: + logger.info("setting sync_param") + strategy.hybrid_configs["mp_configs"].sync_param = True + + if sync_grad: + logger.info("setting sync_grad") + strategy.hybrid_configs["mp_configs"].sync_grad = True + + if sync_moment: + logger.info("setting sync_moment") + strategy.hybrid_configs["mp_configs"].sync_moment = True + + except: + warnings.warn( + "The enable_mp_async_allreduce, enable_mp_skip_c_identity and enable_mp_fused_linear_param_grad_add are not supported " + "by current version of Paddle. Please try latest develop Paddle." + ) + + def is_segment_parallel_supported(): + import inspect + + members = [name for (name, date) in inspect.getmembers(fleet.HybridCommunicateGroup)] + support_sep = "get_sep_parallel_world_size" in members + if not support_sep: + logger.warning("segment parallel is not supported!!!, Ignore it.") + return support_sep + + if self.hybrid_parallel_topo_order == "pp_first": + if is_segment_parallel_supported(): + order = ["dp", "pp", "sharding", "sep", "mp"] + else: + order = ["dp", "pp", "sharding", "mp"] + if self.hybrid_parallel_topo_order == "sharding_first": + if is_segment_parallel_supported(): + order = ["dp", "sharding", "pp", "sep", "mp"] + else: + order = ["dp", "sharding", "pp", "mp"] + if self.use_expert_parallel: + order = order[1:-1] + ["dp", "mp"] + + if is_segment_parallel_supported(): + hybrid_configs = { + "dp_degree": self.data_parallel_degree, + "mp_degree": self.tensor_parallel_degree, + "pp_degree": self.pipeline_parallel_degree, + "sharding_degree": self.sharding_parallel_degree, + "sep_degree": self.sep_parallel_degree + if self.sep_parallel_degree > 1 + else self.context_parallel_degree, + "order": order, + } + else: + hybrid_configs = { + "dp_degree": self.data_parallel_degree, + "mp_degree": self.tensor_parallel_degree, + "pp_degree": self.pipeline_parallel_degree, + "sharding_degree": self.sharding_parallel_degree, + "order": order, + } + + if self.pipeline_parallel_degree > 1: + hybrid_configs["pp_configs"] = dygraph_pp_configs + logger.info(f"using pipeline configs:{dygraph_pp_configs}") + + # setter once https://github.com/PaddlePaddle/Paddle/blob/b7295120b0e78b293cd7ae29706e21769d06a3cc/python/paddle/distributed/fleet/base/distributed_strategy.py#L1692 + strategy.hybrid_configs = hybrid_configs + + if self.sharding_parallel_degree > 1: + sharding_parallel_config = split_parallel_config(self.sharding_parallel_config) + + for x in sharding_parallel_config: + if len(x) > 0: + if x not in [ + "enable_stage1_tensor_fusion", + "enable_stage1_overlap", + "enable_stage2_overlap", + "split_param", + "disable_stage1_reduce_avg", + "enable_stage1_broadcast_overlap", + "enable_stage1_allgather_overlap", + "enable_release_grads", + ]: + raise ValueError( + f"Found unknown pipeline mode config {x}, " + f"accpet config is enable_stage1_tensor_fusion, enable_stage1_overlap, enable_stage2_overlap, split_param, disable_stage1_reduce_avg, enable_stage1_broadcast_overlap, enable_stage1_allgather_overlap." + ) + if "disable_stage1_reduce_avg" in sharding_parallel_config: + assert self.sharding == [ + ShardingOption.SHARD_OP + ], "Only sharding stage1 supports to disable reduce_avg strategy." + try: + strategy.hybrid_configs["sharding_configs"].use_reduce_avg = False + except: + warnings.warn( + "The reduce_avg strategy is not supported by current version of Paddle so you don't need to disable it. 
The nccl comm in sharding still use reduce_sum and scale of gradients." + ) + + try: + if "split_param" in sharding_parallel_config: + strategy.hybrid_configs["sharding_configs"].split_param = True + + if "enable_release_grads" in sharding_parallel_config: + strategy.hybrid_configs["sharding_configs"].release_gradients = True + + if self.pipeline_parallel_degree == 1: + strategy.hybrid_configs["sharding_configs"].tensor_fusion = ( + True if "enable_stage1_tensor_fusion" in sharding_parallel_config else False + ) + if "enable_stage1_overlap" in sharding_parallel_config: + strategy.hybrid_configs["sharding_configs"].comm_overlap = True + strategy.hybrid_configs[ + "sharding_configs" + ].accumulate_steps = self.gradient_accumulation_steps + + else: + warnings.warn( + "For pipeline parallel with sharding, the sharding overlap and tensor fusion " + "should be configured in pipeline_parallel_config." + '"enable_stage1_tensor_fusion" and "enable_stage1_overlap" in sharding_parallel_config will be ignored.' + ) + except (KeyError, AttributeError): + warnings.warn( + "The enable_stage1_tensor_fusion or enable_stage1_overlap is not supported " + "by current version of Paddle. Please try latest develop Paddle." + ) + if "enable_stage2_overlap" in sharding_parallel_config: + assert ( + ShardingOption.SHARD_GRAD_OP in self.sharding + ), f"enable_stage2_overlap expects sharding=stage2, but got {self.sharding}." + assert self.logging_steps > 1, ( + "The logging_steps should be greater than 1 for stage2 overlap, " + f"but got logging_steps={self.logging_steps}." + ) + if "enable_stage1_broadcast_overlap" in sharding_parallel_config: + assert ( + ShardingOption.SHARD_OP in self.sharding + ), f"enable_stage1_broadcast_overlap expects sharding=stage1, but got {self.sharding}." + + assert ( + "split_param" not in sharding_parallel_config + ), "split_param should not be set when enable_stage1_broadcast_overlap." + use_casual_mask = os.getenv("USE_CASUAL_MASK", "False") + assert use_casual_mask, "enable_stage1_broadcast_overlap requires USE_CASUAL_MASK=True." + assert self.logging_steps > 1, ( + "The logging_steps should be greater than 1 for stage1_broadcast_overlap, " + f"but got logging_steps={self.logging_steps}." + ) + if "enable_stage1_allgather_overlap" in sharding_parallel_config: + assert ( + ShardingOption.SHARD_OP in self.sharding + ), f"enable_stage1_allgather_overlap expects sharding=stage1, but got {self.sharding}." + + assert ( + "split_param" in sharding_parallel_config + ), "split_param should be set when enable_stage1_allgather_overlap." + use_casual_mask = os.getenv("USE_CASUAL_MASK", "False") + assert use_casual_mask, "enable_stage1_allgather_overlap requires USE_CASUAL_MASK=True." + assert self.logging_steps > 1, ( + "The logging_steps should be greater than 1 for enable_stage1_allgather_overlap, " + f"but got logging_steps={self.logging_steps}." 
+ ) + + fleet.init(is_collective=True, strategy=strategy) + logger.info(strategy) + + elif self.enable_auto_parallel: + self.tensor_parallel_degree = max(self.tensor_parallel_degree, 1) + self.sep_parallel_degree = max(self.sep_parallel_degree, 1) + self.context_parallel_degree = max(self.context_parallel_degree, 1) + self.pipeline_parallel_degree = max(self.pipeline_parallel_degree, 1) + + assert ( + world_size % (self.tensor_parallel_degree * self.pipeline_parallel_degree) == 0 + ), f"Total world_size:{world_size} shoule be devided by tensor_parallel_degree: {self.tensor_parallel_degree} and pipeline_parallel_degree: {self.pipeline_parallel_degree}." + + if self.sharding_parallel_degree == -1: + if len(self.sharding) > 0: + self.sharding_parallel_degree = world_size // ( + self.tensor_parallel_degree + * self.sep_parallel_degree + * self.context_parallel_degree + * self.pipeline_parallel_degree + ) + + self.sharding_parallel_degree = max(self.sharding_parallel_degree, 1) + if self.sharding_parallel_degree == 1 and len(self.sharding) > 0: + logger.warning("sharding_parallel_degree=1 means no sharding, please set sharding to empty!") + self.sharding = [] + + self.data_parallel_degree = world_size // ( + self.sharding_parallel_degree + * self.tensor_parallel_degree + * self.sep_parallel_degree + * self.context_parallel_degree + * self.pipeline_parallel_degree + ) + + if ShardingOption.OFFLOAD in self.sharding: + warnings.warn("`offload` is not supported NOW!") + + strategy = fleet.auto.Strategy() + if self.dataset_world_size > 1: + data_parallel_config = set(self.data_parallel_config.split(" ")) + for x in data_parallel_config: + if len(x) > 0: + if x not in ["enable_allreduce_avg_in_gradinent_scale", "gradient_sync_after_accumulate"]: + raise ValueError( + f"Found unknown data parallel config {x}, accpet config is enable_allreduce_avg_in_gradinent_scale." + ) + if "enable_allreduce_avg_in_gradinent_scale" in data_parallel_config: + strategy.gradient_scale_using_allreduce_avg = True + if "gradient_sync_after_accumulate" in data_parallel_config: + strategy.dp_optimization.gradient_sync_after_accumulate = True + sequence_parallel_config = set(self.sequence_parallel_config.split(" ")) + for x in sequence_parallel_config: + if len(x) > 0: + if x not in ["enable_allreduce_avg_in_gradinent_scale"]: + raise ValueError( + f"Found unknown sequence parallel config {x}, accpet config is enable_allreduce_avg_in_gradinent_scale." + ) + if "enable_allreduce_avg_in_gradinent_scale" in sequence_parallel_config: + strategy.gradient_scale_using_allreduce_avg = True + + # navie-pp: pipeline_parallel_degree > 1 and gradient_accumulation_steps == 1 + if self.pipeline_parallel_degree > 1 and self.gradient_accumulation_steps > 1: + pipeline_parallel_config = split_parallel_config(self.pipeline_parallel_config) + for x in pipeline_parallel_config: + if len(x) > 0: + if x not in [ + "enable_send_recv_overlap", + # "disable_p2p_cache_shape", # no need for auto_parallel + # "disable_partial_send_recv", # no implemenation for auto_parallel + # "enable_delay_scale_loss", # default True in auto_parallel, non-configurable + # "enable_dp_comm_overlap", # no implemenation for auto_parallel + # "enable_sharding_comm_overlap", # no implemenation for auto_parallel + # "enable_timer", # no implemenation for auto_parallel + # "disable_batch_p2p_comm", # no implemenation for auto_parallel + "enable_split_backward", + ]: + raise ValueError( + f"Found unknown pipeline mode config {x}, accpet config is enable_send_recv_overlap." 
+ ) + + pipeline = strategy.pipeline + pipeline.enable = True + pipeline.enable_send_recv_overlap = "enable_send_recv_overlap" in pipeline_parallel_config + pipeline.split_backward = "enable_split_backward" in pipeline_parallel_config + pipeline.accumulate_steps = self.gradient_accumulation_steps + pipeline.micro_batch_size = self.per_device_train_batch_size + pipeline.schedule_mode = self.pipeline_schedule_mode + + logger.info(f"PP configs:{strategy.pipeline}, use master_grad: {self.amp_master_grad}") + + if self.do_eval: + if ( + self.per_device_train_batch_size * self.gradient_accumulation_steps + != self.per_device_eval_batch_size + ): + logger.warning( + "In pipeline model, the evaluation also shares same setting with training. " + "We will enforce that per_device_eval_batch_size=per_device_train_batch_size * gradient_accumulation_steps." + ) + self.per_device_eval_batch_size = ( + self.per_device_train_batch_size * self.gradient_accumulation_steps + ) + + elif self.gradient_accumulation_steps > 1: + gradient_merge = strategy.gradient_merge + gradient_merge.enable = True + gradient_merge.k_steps = self.gradient_accumulation_steps + gradient_merge.avg = True + + if self.tensor_parallel_degree > 1: + mp_optimization = strategy.mp_optimization + mp_config = split_parallel_config(self.tensor_parallel_config) + + for x in mp_config: + if len(x) > 0: + if x not in [ + "enable_mp_async_allreduce", # allreduce_matmul_grad_overlapping in auto_parallel + # "enable_mp_skip_c_identity", + # "enable_mp_fused_linear_param_grad_add", + ]: + raise ValueError( + f"Found unknown tensor parallell config {x}, " + f"accept config is enable_mp_async_allreduce, enable_mp_skip_c_identity and enable_mp_fused_linear_param_grad_add" + ) + try: + if "enable_mp_async_allreduce" in mp_config: + mp_optimization.allreduce_matmul_grad_overlapping = True + except: + warnings.warn( + "The enable_mp_async_allreduce, enable_mp_skip_c_identity and enable_mp_fused_linear_param_grad_add are not supported " + "by current version of Paddle. Please try latest develop Paddle." + ) + + if self.sharding_parallel_degree > 1: + sharding = strategy.sharding + sharding.enable = True + sharding.degree = self.sharding_parallel_degree + if ShardingOption.SHARD_OP in self.sharding: + sharding.stage = 1 + elif ShardingOption.SHARD_GRAD_OP in self.sharding: + sharding.stage = 2 + elif ShardingOption.FULL_SHARD in self.sharding: + sharding.stage = 3 + + sharding_parallel_config = split_parallel_config(self.sharding_parallel_config) + for x in sharding_parallel_config: + if len(x) > 0: + if x not in [ + "enable_stage1_tensor_fusion", + "enable_stage1_overlap", + "enable_stage2_overlap", + ]: + raise ValueError( + f"Found unknown pipeline mode config {x}, " f"accpet config is reduce_overlap." 
+ ) + + if ( + "enable_stage1_overlap" in sharding_parallel_config + or "enable_stage2_overlap" in sharding_parallel_config + ): + sharding.enable_overlap = True + + if "enable_stage1_tensor_fusion" in sharding_parallel_config: + sharding.grad_bucket_size_numel = 210355872 + + if self.bf16 or self.fp16: + amp = strategy.amp + amp.enable = True + amp.dtype = "bfloat16" if self.bf16 else "float16" + amp.level = self.fp16_opt_level.lower() + amp.use_master_grad = self.amp_master_grad + amp.init_loss_scaling = self.scale_loss + amp.custom_black_list = self.amp_custom_black_list if self.amp_custom_black_list is not None else [] + amp.custom_white_list = self.amp_custom_white_list if self.amp_custom_white_list is not None else [] + + if self.recompute: + recompute = strategy.recompute + recompute.enable = True + recompute.sr = self.sr if self.sr is not None else 0 + recompute.refined_ops_patterns = [] + if self.refined_ops_patterns is not None: + for pattern in self.refined_ops_patterns: + recompute.refined_ops_patterns.append(eval(pattern)) + + self.strategy = strategy + if self.hybrid_parallel_topo_order == "pp_first": + order = ["pp", "dp", "mp"] + degree = [self.pipeline_parallel_degree, self.dataset_world_size, self.tensor_parallel_degree] + elif self.hybrid_parallel_topo_order == "sharding_first": + order = ["dp", "pp", "mp"] + degree = [self.dataset_world_size, self.pipeline_parallel_degree, self.tensor_parallel_degree] + mesh_dims = list(zip(order, degree)) + fleet.auto.create_mesh(mesh_dims) + + # init hcg for communication in trainer + if self.hybrid_parallel_topo_order == "pp_first": + order = ["pp", "dp", "sharding", "sep", "mp"] + elif self.hybrid_parallel_topo_order == "sharding_first": + order = ["dp", "sharding", "pp", "sep", "mp"] + + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": self.dataset_world_size, + "mp_degree": self.tensor_parallel_degree, + "pp_degree": self.pipeline_parallel_degree, + "order": order, + } + fleet.init(is_collective=True, strategy=strategy) + + else: + if world_size > 1: + if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized(): + if self.unified_checkpoint: + # DP use hybrid group + strategy = fleet.DistributedStrategy() + fleet.init(is_collective=True, strategy=strategy) + else: + paddle.distributed.init_parallel_env() + + if ( + self.unified_checkpoint + and self.sharding_parallel_degree > 0 + and ShardingOption.FULL_SHARD in self.sharding + ): + logger.warning( + "Unified checkpoint currently do not support sharding stage3, set `unified_checkpoint` to False." 
+ ) + self.unified_checkpoint = False + + if self.unified_checkpoint: + unified_checkpoint_config = set(self.unified_checkpoint_config.split(" ")) + if sys.platform.startswith("win") and "async_save" in self.unified_checkpoint_config: + raise ValueError("Currently do not support asynchronous saving for Windows system!") + if ( + "skip_save_model_weight" in self.unified_checkpoint_config + and "ignore_merge_optimizer" in self.unified_checkpoint_config + ): + raise ValueError("`skip_save_model_weight` and `ignore_merge_optimizer` cannot both be True.") + for x in unified_checkpoint_config: + if len(x) > 0: + if x not in [ + "skip_save_model_weight", + "master_weight_compatible", + "async_save", + "enable_all_options", + "ignore_merge_optimizer", + ]: + raise ValueError( + f"Found unknown unified_checkpoint config {x}, accpet config is skip_save_model_weight, " + + "master_weight_compatible, async_save, enable_all_options, ignore_merge_optimizer." + ) + if "enable_all_options" in unified_checkpoint_config: + self.unified_checkpoint_config = [ + "skip_save_model_weight", + "master_weight_compatible", + # "async_save", + ] + else: + self.unified_checkpoint_config = self.unified_checkpoint_config.split(" ") + + if self.report_to is None: + logger.info( + "The default value for the training argument `--report_to` will change in v5 (from all installed " + "integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as " + "now. You should start updating your code and make this info disappear :-)." + ) + self.report_to = "visualdl" + if self.report_to == "all" or self.report_to == ["all"]: + # Import at runtime to avoid a circular import. + from .integrations import get_available_reporting_integrations + + self.report_to = get_available_reporting_integrations() + elif self.report_to == "none" or self.report_to == ["none"]: + self.report_to = [] + elif not isinstance(self.report_to, list): + self.report_to = [self.report_to] + + if self.warmup_ratio < 0 or self.warmup_ratio > 1: + raise ValueError("warmup_ratio must lie in range [0,1]") + elif self.warmup_ratio > 0 and self.warmup_steps > 0: + logger.info( + "Both warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio during training" + ) + + if self.flatten_param_grads and self.device != "npu": + raise ValueError("flatten_param_grads can only be used on npu devices in temporary.") + + if self.world_size != paddle.distributed.get_world_size(): + raise ValueError( + f"The local_ran: {self.local_rank} should be consistent with the world size: {paddle.distributed.get_world_size()}." + ) + + def __str__(self): + self_as_dict = asdict(self) + self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()} + + attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] + return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" + + __repr__ = __str__ + + @property + def train_batch_size(self) -> int: + """ + The actual batch size for training. + """ + train_batch_size = self.per_device_train_batch_size + return train_batch_size + + @property + def eval_batch_size(self) -> int: + """ + The actual batch size for evaluation. + """ + eval_batch_size = self.per_device_eval_batch_size + return eval_batch_size + + @property + def current_device(self) -> "paddle.device": + """ + The device used by this process. + """ + return paddle.device.get_device() + + @property + def world_size(self): + """ + The number of processes used in parallel. 
+ """ + if self.local_rank != -1: + return paddle.distributed.get_world_size() + return 1 + + @property + def data_parallel_rank(self): + if self.use_hybrid_parallel: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + if dp_group.rank == -1: + return 0 + return dp_group.rank + elif self.enable_auto_parallel: + mesh = fleet.auto.get_mesh() + return mesh.get_rank_by_dim_and_process_id("dp", dist.get_rank()) + else: + return paddle.distributed.get_rank() + + @property + def dataset_rank(self): + if self.use_hybrid_parallel: + return max(self.sharding_parallel_degree, 1) * self.data_parallel_rank + self.sharding_parallel_rank + elif self.enable_auto_parallel: + return self.data_parallel_rank + else: + return paddle.distributed.get_rank() + + @property + def dataset_world_size(self): + if self.use_hybrid_parallel: + return max(self.sharding_parallel_degree, 1) * max(self.data_parallel_degree, 1) + elif self.enable_auto_parallel: + return max(self.sharding_parallel_degree, 1) * max(self.data_parallel_degree, 1) + else: + return paddle.distributed.get_world_size() + + @property + def sharding_parallel_rank(self): + if self.use_hybrid_parallel: + hcg = fleet.get_hybrid_communicate_group() + sharding_group = hcg.get_sharding_parallel_group() + return max(sharding_group.rank, 0) + else: + return 0 + + @property + def tensor_parallel_rank(self): + if self.use_hybrid_parallel: + hcg = fleet.get_hybrid_communicate_group() + tp_group = hcg.get_model_parallel_group() + return max(tp_group.rank, 0) + elif self.enable_auto_parallel: + mesh = fleet.auto.get_mesh() + return mesh.get_rank_by_dim_and_process_id("mp", dist.get_rank()) + else: + return 0 + + @property + def pipeline_parallel_rank(self): + if self.use_hybrid_parallel: + hcg = fleet.get_hybrid_communicate_group() + rank = hcg.get_stage_id() + return max(rank, 0) + elif self.enable_auto_parallel: + mesh = fleet.auto.get_mesh() + return mesh.get_rank_by_dim_and_process_id("pp", dist.get_rank()) + else: + return 0 + + def _format_name(self, prefix, rank, degree): + size = 2 + return f"{prefix}{rank:0>{size}d}" + + @property + def optimizer_name_suffix(self): + if self.use_hybrid_parallel: + name = [] + if self.tensor_parallel_degree > 1: + name.append(self._format_name("tp", self.tensor_parallel_rank, self.tensor_parallel_degree)) + if self.pipeline_parallel_degree > 1: + name.append(self._format_name("pp", self.pipeline_parallel_rank, self.pipeline_parallel_degree)) + if self.sharding_parallel_degree > 1: + name.append(self._format_name("shard", self.sharding_parallel_rank, self.sharding_parallel_degree)) + if self.use_expert_parallel: + name.append(self._format_name("moe", self.data_parallel_rank, self.data_parallel_degree)) + return "_".join(name) + else: + if self.use_expert_parallel: + return self._format_name("moe", self.data_parallel_rank, self.data_parallel_degree) + return None + + @property + def weight_name_suffix(self): + if self.use_hybrid_parallel: + name = [] + if self.tensor_parallel_degree > 1: + name.append(self._format_name("tp", self.tensor_parallel_rank, self.tensor_parallel_degree)) + if self.pipeline_parallel_degree > 1: + name.append(self._format_name("pp", self.pipeline_parallel_rank, self.pipeline_parallel_degree)) + if self.use_expert_parallel: + name.append(self._format_name("moe", self.data_parallel_rank, self.data_parallel_degree)) + return "_".join(name) + + else: + if self.use_expert_parallel: + return self._format_name("moe", self.data_parallel_rank, self.data_parallel_degree) + 
return None + + def sharded_name_suffix(self, shard_id=None, pp_id=None, moe_id=None): + if self.use_hybrid_parallel: + name = [] + if self.tensor_parallel_degree > 1: + name.append(self._format_name("tp", self.tensor_parallel_rank, self.tensor_parallel_degree)) + if self.pipeline_parallel_degree > 1: + if pp_id is None: + pp_id = self.pipeline_parallel_rank + assert isinstance(pp_id, int) + name.append(self._format_name("pp", pp_id, self.pipeline_parallel_degree)) + if self.sharding_parallel_degree > 1: + if shard_id is None: + shard_id = self.sharding_parallel_rank + assert isinstance(shard_id, int) + name.append(self._format_name("shard", shard_id, self.sharding_parallel_degree)) + if self.use_expert_parallel: + if moe_id is None: + moe_id = self.data_parallel_rank + assert isinstance(moe_id, int) + name.append(self._format_name("moe", moe_id, self.data_parallel_degree)) + return "_".join(name) + else: + if self.use_expert_parallel: + if moe_id is None: + moe_id = self.data_parallel_rank + return self._format_name("moe", moe_id, self.data_parallel_degree) + return None + + @property + def process_index(self): + """ + The index of the current process used. + """ + if self.local_rank != -1: + return paddle.distributed.get_rank() + return 0 + + @property + def logical_process_index(self): + """ + The index of the current process used. + """ + if self.local_rank != -1: + sd_size = max(self.sharding_parallel_degree, 1) + pp_size = max(self.pipeline_parallel_degree, 1) + tp_size = max(self.tensor_parallel_degree, 1) + + dp_rank = max(self.data_parallel_rank, 0) + sd_rank = max(self.sharding_parallel_rank, 0) + pp_rank = max(self.pipeline_parallel_rank, 0) + tp_rank = max(self.tensor_parallel_rank, 0) + + rank = ( + dp_rank * (sd_size * pp_size * tp_size) + sd_rank * (pp_size * tp_size) + pp_rank * tp_size + tp_rank + ) + + return rank + return 0 + + @property + def local_process_index(self): + """ + The index of the local process used. + """ + if self.local_rank != -1: + return self.local_rank + return 0 + + @property + def should_log(self): + """ + Whether or not the current process should produce log. + """ + if self.enable_auto_parallel: + return True + elif self.log_on_each_node: + return self.local_process_index == 0 + else: + return self.process_index == 0 + + @property + def should_save(self): + """ + Whether or not the current process should write to disk, e.g., to save models and checkpoints. + + For model state: + work for data parallel, tensor parallel, sharding + For optimizer state: + work for data parallel, tensor parallel + not work for sharding + """ + if self.save_on_each_node: + return self.local_process_index == 0 + else: + if self.enable_auto_parallel: + return True + return self.process_index == 0 + + @property + def should_save_model_state(self): + """ + Whether or not the current process should write to disk, e.g., to save models and checkpoints. 
+ + For model state: + work for data parallel, tensor parallel, sharding + For optimizer state: + work for data parallel, tensor parallel + not work for sharding + """ + if self.save_on_each_node: + return self.local_process_index == 0 + else: + if self.should_save_sharding_stage1_model: + return True + elif self.enable_auto_parallel: + return True + elif self.use_hybrid_parallel: + # save on dataset rank 0 + return self.sharding_parallel_rank == 0 and (self.data_parallel_rank == 0 or self.use_expert_parallel) + else: + return self.process_index == 0 or self.use_expert_parallel + + @property + def _no_sync_in_gradient_accumulation(self): + """ + Whether or not to use no_sync for the gradients when doing gradient accumulation. + """ + return True + + @property + def should_save_sharding_stage1_model(self): + if self.enable_auto_parallel: + return False + return ( + ShardingOption.SHARD_OP in self.sharding and self.sharding_parallel_degree > 1 and self.save_sharded_model + ) + + @property + def should_load_sharding_stage1_model(self): + if self.enable_auto_parallel: + return False + return ( + ShardingOption.SHARD_OP in self.sharding and self.sharding_parallel_degree > 1 and self.load_sharded_model + ) + + @property + def should_load_dataset(self): + if not self.distributed_dataloader: + return True + else: + if self.tensor_parallel_rank == 0 and self.pipeline_parallel_rank == 0: + return True + else: + return False + + @contextlib.contextmanager + def main_process_first(self, local=True, desc="work"): + """ + A context manager for paddle distributed environment where on needs to do something on the main process, while + blocking replicas, and when it's finished releasing the replicas. + + One such use is for `datasets`'s `map` feature which to be efficient should be run once on the main process, + which upon completion saves a cached version of results and which then automatically gets loaded by the + replicas. + + Args: + local (`bool`, *optional*, defaults to `True`): + if `True` first means process of rank 0 of each node if `False` first means process of rank 0 of node + rank 0 In multi-node environment with a shared filesystem you most likely will want to use + `local=False` so that only the main process of the first node will do the processing. If however, the + filesystem is not shared, then the main process of each node will need to do the processing, which is + the default behavior. + desc (`str`, *optional*, defaults to `"work"`): + a work description to be used in debug logs + + """ + if self.world_size > 1: + if local: + is_main_process = self.local_process_index == 0 + main_process_desc = "main local process" + else: + is_main_process = self.process_index == 0 + main_process_desc = "main process" + + try: + if not is_main_process: + # tell all replicas to wait + logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}") + paddle.distributed.barrier() + yield + finally: + if is_main_process: + # the wait is over + logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas") + paddle.distributed.barrier() + else: + yield + + def get_warmup_steps(self, num_training_steps: int): + """ + Get number of steps used for a linear warmup. + """ + warmup_steps = ( + self.warmup_steps if self.warmup_steps > 0 else math.ceil(num_training_steps * self.warmup_ratio) + ) + return warmup_steps + + def to_dict(self): + """ + Serializes this instance while replace `Enum` by their values (for JSON serialization support). 
It obfuscates + the token values by removing their value. + """ + d = asdict(self) + for k, v in d.items(): + if isinstance(v, Enum): + d[k] = v.value + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum): + d[k] = [x.value for x in v] + if k.endswith("_token"): + d[k] = f"<{k.upper()}>" + return d + + def to_json_string(self): + """ + Serializes this instance to a JSON string. + """ + return json.dumps(self.to_dict(), indent=2) + + def to_sanitized_dict(self) -> Dict[str, Any]: + """ + Sanitized serialization + """ + d = self.to_dict() + d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}} + + valid_types = [bool, int, float, str] + valid_types.append(paddle.Tensor) + + return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} + + def print_config(self, args=None, key=""): + """ + print all config values. + """ + logger.debug("=" * 60) + if args is None: + args = self + key = "Training" + + import paddlenlp + + logger.debug("{:^40}".format("{} Configuration Arguments".format(key))) + logger.debug("{:30}: {}".format("paddle commit id", paddle.version.commit)) + logger.debug("{:30}: {}".format("paddlenlp commit id", paddlenlp.version.commit)) + + for a in dir(args): + if a[:2] != "__": # don't print double underscore methods + v = getattr(args, a) + if not isinstance(v, types.MethodType): + logger.debug("{:30}: {}".format(a, v)) + + logger.debug("") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args_seq2seq.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args_seq2seq.py new file mode 100644 index 000000000..3885944a4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/training_args_seq2seq.py @@ -0,0 +1,68 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field +from typing import Optional + +from .training_args import TrainingArguments +from .utils import add_start_docstrings + +__all__ = [ + "Seq2SeqTrainingArguments", +] + + +@dataclass +@add_start_docstrings(TrainingArguments.__doc__) +class Seq2SeqTrainingArguments(TrainingArguments): + """ + Args: + sortish_sampler (`bool`, *optional*, defaults to `False`): + Whether to use a *sortish sampler* or not. Only possible if the underlying datasets are *Seq2SeqDataset* + for now but will become generally available in the near future. + + It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness + for the training set. + predict_with_generate (`bool`, *optional*, defaults to `False`): + Whether to use generate to calculate generative metrics (ROUGE, BLEU). + generation_max_length (`int`, *optional*): + The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. 
+ generation_num_beams (`int`, *optional*): + The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + """ + + sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) + predict_with_generate: bool = field( + default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} + ) + generation_max_length: Optional[int] = field( + default=None, + metadata={ + "help": ( + "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `max_length` value of the model configuration." + ) + }, + ) + generation_num_beams: Optional[int] = field( + default=None, + metadata={ + "help": ( + "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `num_beams` value of the model configuration." + ) + }, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/__init__.py new file mode 100644 index 000000000..d432b9716 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you smay not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .helper import * + +from .doc import ( + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, +) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/async_save.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/async_save.py new file mode 100644 index 000000000..c652fd1e3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/async_save.py @@ -0,0 +1,126 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
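The async_save module introduced in this hunk offloads optimizer-state serialization to a single spawned worker process: tensors are staged into pinned CPU memory, the worker writes them with paddle.save, and a signal file is created and fsync'ed once the checkpoint is on disk, so the training loop is not blocked by I/O. Below is a minimal usage sketch of the AsyncSaver API defined here; the checkpoint paths are placeholders, the `paddlenlp` import path assumes this vendored package is importable under that name, and pinned-memory staging presumes a GPU build of Paddle.

```python
import os

import paddle
from paddlenlp.trainer.utils.async_save import AsyncSaver  # module added by this patch

model = paddle.nn.Linear(4, 4)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=model.parameters())

# One training step so the optimizer actually has state to checkpoint.
loss = model(paddle.randn([2, 4])).mean()
loss.backward()
optimizer.step()

os.makedirs("ckpt", exist_ok=True)  # placeholder checkpoint directory
saver = AsyncSaver()                # starts a one-process "spawn" pool

# The state dict is copied to pinned CPU memory synchronously; the worker process
# then writes it with paddle.save and creates the signal file after fsync.
saver.run(
    optimizer.state_dict(),
    path="ckpt/optimizer.pdopt",
    saved_signal_path="ckpt/optimizer.saved_signal",
)
# At interpreter exit the registered atexit hook closes and joins the pool, so the
# pending write finishes before the script terminates.
```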
+ +import atexit +import copy +import multiprocessing +import os +import time + +import paddle + +from paddlenlp.utils.log import logger + + +def _save_optimizer(obj, name_mapping, path, saved_signal_path, protocol): + start_time = time.time() + for k, v in obj.items(): + if k == "master_weights" and isinstance(v, dict): + for kk, vv in v.items(): + if isinstance(vv, paddle.Tensor): + vv.name = name_mapping["master_weights"][kk] + else: + if k in name_mapping and isinstance(v, paddle.Tensor): + v.name = name_mapping[k] + paddle.save(obj, path, protocol) + # dump saved_signal + with open(saved_signal_path, mode="w+") as f: + f.write("1") + f.flush() + os.fsync(f.fileno()) + end_time = time.time() + elapsed_time = end_time - start_time + logger.info(f"Async save optimizer took {elapsed_time:.6f} seconds to execute.") + + +class AsyncSaver: + def __init__(self): + self.context = multiprocessing.get_context("spawn") + self.cpu_optimizer_state_dict = {} + self.pool = self.context.Pool(1) + self.result = None + self.name_mapping = None + + atexit.register(self.shutdown) + + def run(self, optimizer_state_dict, path, saved_signal_path, protocol=4): + logger.info(f"Started saving optimizer_state_dict to {os.path.abspath(path)}.") + self._wait_for_previous_result() + + self._reset_state(path, saved_signal_path, protocol) + self._process_optimizer_state_dict(optimizer_state_dict) + + self.result = self.pool.apply_async( + _save_optimizer, + args=(self.cpu_optimizer_state_dict, self.name_mapping, self.path, self.saved_signal_path, self.protocol), + ) + + logger.info("Finished launching saving optimizer_state_dict process") + + def _wait_for_previous_result(self): + if self.result is not None: + max_retries = 5 + for retries in range(max_retries): + try: + self.result.get() + break + except Exception as e: + if retries == max_retries - 1: + raise RuntimeError(f"Failed after {max_retries} retries during async save.") + + time.sleep(1 + retries * 2) + logger.warning(f"An error occurred during async save: {e}. 
Retrying...") + self.result = self.pool.apply_async( + _save_optimizer, + args=( + self.cpu_optimizer_state_dict, + self.name_mapping, + self.path, + self.saved_signal_path, + self.protocol, + ), + ) + + if self.result.ready() and not self.result.successful(): + raise RuntimeError("The previous async save task failed.") + else: + pass + + def _reset_state(self, path, saved_signal_path, protocol): + self.cpu_optimizer_state_dict.clear() + self.name_mapping = {"master_weights": {}} + self.path = path + self.saved_signal_path = saved_signal_path + self.protocol = protocol + + def _process_optimizer_state_dict(self, optimizer_state_dict): + for k, v in optimizer_state_dict.items(): + if k == "master_weights": + self.cpu_optimizer_state_dict[k] = {} + for kk, vv in v.items(): + self.cpu_optimizer_state_dict[k][kk] = vv.pin_memory() + self.name_mapping[k][kk] = vv.name + elif k == "LR_Scheduler": + self.cpu_optimizer_state_dict[k] = copy.deepcopy(v) + else: + self.cpu_optimizer_state_dict[k] = v.pin_memory() + self.name_mapping[k] = v.name + paddle.device.synchronize() + + def shutdown(self): + self.pool.close() + self.pool.join() + + def __del__(self): + self.shutdown() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/doc.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/doc.py new file mode 100644 index 000000000..1b4c0c9d7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/doc.py @@ -0,0 +1,54 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Doc utilities: Utilities related to documentation +""" + + +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_model_forward(*docstr): + def docstring_decorator(fn): + docstring = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + class_name = f"[`{fn.__qualname__.split('.')[0]}`]" + intro = f" The {class_name} forward method, overrides the `__call__` special method." + note = r""" + + + + Although the recipe for forward pass needs to be defined within this function, one should call the [`Layer`] + instance afterwards instead of this since the former takes care of running the pre and post processing steps while + the latter silently ignores them. 
+ + +""" + + fn.__doc__ = intro + note + docstring + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = (fn.__doc__ if fn.__doc__ is not None else "") + "".join(docstr) + return fn + + return docstring_decorator diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/helper.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/helper.py new file mode 100644 index 000000000..4de9b993c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/helper.py @@ -0,0 +1,338 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is modified from +# https://github.com/huggingface/transformers/blob/main/src/transformers + +import os +from typing import Any, Optional + +import numpy as np +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet +from paddle.distributed.parallel import sync_params_buffers + +from paddlenlp.utils.log import logger +from paddlenlp.utils.nested import nested_broadcast_tensor_with_empty # noqa: F401 +from paddlenlp.utils.nested import ( + nested_broadcast_tensor, + nested_empty_tensor, + nested_reduce_tensor, +) + +__all__ = [ + "distributed_concat", + "paddle_pad_and_concatenate", + "nested_concat", + "nested_detach", + "nested_numpify", + "nested_truncate", +] + + +def distributed_concat(tensor: Any, num_total_examples: Optional[int] = None) -> Any: + try: + if isinstance(tensor, (tuple, list)): + return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor) + output_tensors = [] + dist.all_gather(output_tensors, tensor) + output_tensors = [t if len(t.shape) > 0 else t.reshape_([-1]) for t in output_tensors] + concat = paddle.concat(output_tensors, axis=0) + + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat + except AssertionError: + raise AssertionError("Not currently using distributed training") + + +def paddle_pad_and_concatenate(tensor1, tensor2, padding_index=-100): + """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" + if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: + return paddle.concat((tensor1, tensor2), axis=0) + + # raise ValueError("Error") + # Let's figure out the new shape + new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tuple( + tensor1.shape[2:] + ) + + # Now let's fill the result tensor + # result = tensor1.new_full(new_shape, padding_index) + result = paddle.full(new_shape, padding_index, dtype=tensor1.dtype) + + result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1 + result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2 + return result + + +def numpy_pad_and_concatenate(array1, array2, 
padding_index=-100): + """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary.""" + if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]: + return np.concatenate((array1, array2), axis=0) + + # Let's figure out the new shape + new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:] + + # Now let's fill the result tensor + result = np.full_like(array1, padding_index, shape=new_shape) + result[: array1.shape[0], : array1.shape[1]] = array1 + result[array1.shape[0] :, : array2.shape[1]] = array2 + return result + + +def nested_concat(tensors, new_tensors, padding_index=-100): + """ + Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or + nested list/tuples of tensors. + """ + assert type(tensors) == type( + new_tensors + ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors)) + elif isinstance(tensors, paddle.Tensor): + return paddle_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) + elif isinstance(tensors, np.ndarray): + return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) + else: + raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}") + + +def nested_detach(tensors): + "Detach `tensors` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_detach(t) for t in tensors) + return tensors.detach() + + +def nested_numpify(tensors): + "Numpify `tensors` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_numpify(t) for t in tensors) + t = tensors.cpu() + if t.dtype == paddle.float16: + t = t.cast(paddle.float32) + return t.cpu().numpy() + + +def nested_truncate(tensors, limit): + "Truncate `tensors` at `limit` (even if it's a nested list/tuple of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_truncate(t, limit) for t in tensors) + return tensors[:limit] + + +def distributed_isfile(filename): + """Check all machine nodes. 
return False if no machine have such file.""" + trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + if trainers_num <= 1: + return os.path.isfile(filename) + else: + local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + file_count = paddle.zeros([1], dtype="int64") + if local_rank == 0 and os.path.isfile(filename): + file_count += 1 + + paddle.distributed.all_reduce(file_count) + return file_count >= 1 + + +def distributed_file(filename): + trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) + if trainers_num <= 1: + return filename + else: + local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0)) + found_file = paddle.to_tensor([2**20], dtype="int64") + if local_rank == 0 and os.path.isfile(filename): + found_file = paddle.to_tensor([paddle.distributed.get_rank()], dtype="int64") + + tensor_list = [] + paddle.distributed.all_gather(tensor_list, found_file) + src = paddle.min(paddle.concat(tensor_list)).item() + + file_object_list = [None] + if paddle.distributed.get_rank() == src: + file_object_list = [open(filename, "rb").read()] + + paddle.distributed.broadcast_object_list(file_object_list, src=src) + file_object = file_object_list[0] + + if local_rank == 0 and not os.path.isfile(filename): + if not os.path.exists(os.path.dirname(filename)): + os.makedirs(os.path.dirname(filename)) + + with open(filename, "wb") as f: + f.write(file_object) + + paddle.distributed.barrier() + + return filename + + +def broadcast_dp_optimizer(state_dict): + if paddle.distributed.get_world_size() <= 1: + return state_dict + + logger.info("Start broadcast optimizer in data parallel group.") + try: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + src_rank = hcg.get_data_parallel_group_src_rank() + process_rank = paddle.distributed.get_rank() + # Don't broadcast optimizer for dp rank is 1. + if dp_group.nranks <= 1: + return state_dict + except: + dp_group = None + src_rank = 0 + process_rank = paddle.distributed.get_rank() + + if process_rank == src_rank: + if state_dict is None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} must have a state_dict. dp_rank:{process_rank}, src_rank:{src_rank}" + ) + fake_state_dict = [nested_reduce_tensor(state_dict)] + else: + if state_dict is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict. dp_rank:{process_rank}, src_rank:{src_rank}" + ) + fake_state_dict = [None] + + paddle.distributed.broadcast_object_list( + fake_state_dict, + src=src_rank, + group=dp_group, + ) + fake_state_dict = fake_state_dict[0] + if process_rank != src_rank: + state_dict = nested_empty_tensor(fake_state_dict) + + state_dict = nested_broadcast_tensor(state_dict, src=src_rank, group=dp_group) + + return state_dict + + +def broadcast_moe_optimizer(state_dict, model_state_dict=None, broadcast_dp=True): + try: + hcg = fleet.get_hybrid_communicate_group() + dp_group = hcg.get_data_parallel_group() + src_rank = hcg.get_data_parallel_group_src_rank() + data_parallel_rank = hcg.get_data_parallel_rank() + # Don't broadcast optimizer for dp rank is 1. 
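+ # A single-rank data-parallel group means every worker already holds the full
+ # optimizer state, so it is returned unchanged; the except branch below falls back
+ # to broadcasting from global rank 0 when no hybrid communicate group exists.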
+ if dp_group.nranks <= 1: + return state_dict + except: + dp_group = None + src_rank = 0 + data_parallel_rank = dist.get_rank() + + def _filter_sync_optimizer_state(model_state_dict, opt_state_dict): + # get sync name + sync_vname = [] + for k, v in model_state_dict.items(): + if not getattr(v, "no_sync", False): + sync_vname.append(v.name) + + filter_opt_state_dict = {"master_weights": {}} + filter_opt_state_dict["LR_Scheduler"] = opt_state_dict.get("LR_Scheduler", {}) + for op_k, op_v in opt_state_dict.items(): + if op_k not in ["master_weights", "LR_Scheduler"]: + for sync_v in sync_vname: + if op_k.startswith(sync_v): + filter_opt_state_dict[op_k] = op_v + break + elif op_k == "master_weights": + for k, v in op_v.items(): + for sync_v in sync_vname: + if k.startswith(sync_v): + filter_opt_state_dict["master_weights"][k] = v + return filter_opt_state_dict + + def _broadcast_moe_optimizer_state(state_dict): + # boardcast_keys + base_state_dict = {"master_weights": {}} + buf = [ + {i: j.shape for i, j in state_dict.items() if i not in ["master_weights", "LR_Scheduler"]}, + {i: j.shape for i, j in state_dict["master_weights"].items()}, + {"LR_Scheduler": state_dict.get("LR_Scheduler", {})}, + ] + + dist.broadcast_object_list(buf, src=src_rank, group=dp_group) + # logger.info(f"moe-optimizer-gather-keys{buf}") + for k, s in buf[0].items(): + v = state_dict.get(k, paddle.zeros(s, "float32")).cuda() + v.name = k + # k = k.replace("_fp32_master_0", "") + dist.broadcast(v, src=src_rank, group=dp_group) + logger.info(f"broadcast moe optimizer {k} from {src_rank}") + base_state_dict[k] = v.cpu() + for k, s in buf[1].items(): + v = state_dict["master_weights"].get(k, paddle.zeros(s, "float32")).cuda() + v.name = k + dist.broadcast(v, src=src_rank, group=dp_group) + logger.info(f"broadcast moe optimizer-master_weights {k} from {src_rank}") + base_state_dict["master_weights"][k] = v.cpu() + base_state_dict.update(buf[2]) + return base_state_dict + + if broadcast_dp: + filter_opt_state_dict = _filter_sync_optimizer_state(model_state_dict, state_dict) + base_state_dict = broadcast_dp_optimizer(filter_opt_state_dict) + else: + base_state_dict = _broadcast_moe_optimizer_state(state_dict) + + if data_parallel_rank > 0: + master_weight = state_dict.pop("master_weights", {}) + base_state_dict.update(state_dict) + if master_weight: + if "master_weights" in base_state_dict: + base_state_dict["master_weights"].update(master_weight) + else: + base_state_dict["master_weights"] = master_weight + state_dict = base_state_dict + del base_state_dict + return state_dict + + +def broadcast_dataset_rank0_model(model): + if paddle.distributed.get_world_size() <= 1: + return + + logger.info("Start broadcast model in sharding group or data parallel group.") + hcg = fleet.get_hybrid_communicate_group() + sharding_group = hcg.get_sharding_parallel_group() + dp_group = hcg.get_data_parallel_group() + if sharding_group.nranks > 1: + sync_params_buffers( + model, + sharding_group, + hcg.get_sharding_parallel_group_src_rank(), + is_model_parallel=False, + fuse_params=False, + ) + if dp_group.nranks > 1: + sync_params_buffers( + model, + dp_group, + hcg.get_data_parallel_group_src_rank(), + is_model_parallel=False, + fuse_params=False, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/__init__.py new file mode 100644 index 000000000..55287d3b1 --- /dev/null +++ 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import pp_reshard, sharding_v1, sharding_v2 +from .common import ( + SHARDING_STRATEGY_V1, + SHARDING_STRATEGY_V2, + NodeModelState, + all_gather_state_dict, + get_sharding_strategy, + is_sharding_opt, +) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/common.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/common.py new file mode 100644 index 000000000..66e3c3569 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/common.py @@ -0,0 +1,587 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
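The reshard/common.py module that follows is built around NodeModelState, which keys model weights, optimizer state, and master weights by (structure_name, param_name[, opt_name]) tuples and reshards them across a communication group. Its even_distribute method balances entries with a greedy rule: keys are sorted by total element count, and each key goes to the rank with the smallest running load. A standalone sketch of that routing rule, in plain Python with toy shapes and made-up parameter names:

```python
import numpy as np

def greedy_router(key_to_shapes, nranks):
    """Toy mirror of the routing used by NodeModelState.even_distribute:
    largest keys first, each one assigned to the least-loaded rank so far."""
    key_to_size = {k: sum(int(np.prod(s)) for s in shapes) for k, shapes in key_to_shapes.items()}
    load = [0] * nranks
    key_to_rank = {}
    for key, size in sorted(key_to_size.items(), key=lambda x: x[1], reverse=True):
        rank = load.index(min(load))  # least-loaded rank receives the next-largest key
        key_to_rank[key] = rank
        load[rank] += size
    return key_to_rank

# Three parameters of very different sizes spread over two ranks: the two large
# tensors end up on different ranks, and the small bias joins the lighter one.
print(greedy_router(
    {"embedding_0.w_0": [(50000, 1024)], "linear_0.w_0": [(4096, 4096)], "linear_0.b_0": [(4096,)]},
    nranks=2,
))
```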
+ +from collections import OrderedDict + +import numpy as np +import paddle +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizer, +) +from paddle.distributed.fleet.utils.log_util import logger + +try: + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizerV2, + ) +except: + DygraphShardingOptimizerV2 = None + + +from ....transformers.model_utils import unwrap_optimizer + +SHARDING_STRATEGY_V1 = "ShardingV1" +SHARDING_STRATEGY_V2 = "ShardingV2" + + +def is_sharding_opt(optimizer): + def check(cls): + tmp = unwrap_optimizer(optimizer, cls) + if tmp is not None: + return True + return False + + if check(DygraphShardingOptimizer): + return True + + if DygraphShardingOptimizerV2 is not None: + if check(DygraphShardingOptimizerV2): + return True + + return False + + +def get_sharding_strategy(optimizer): + if DygraphShardingOptimizerV2 is not None: + tmp = unwrap_optimizer(optimizer, DygraphShardingOptimizerV2) + if tmp is not None: + return SHARDING_STRATEGY_V2 + return SHARDING_STRATEGY_V1 + + +class NodeModelState: + def __init__(self, mp_rank=None, sharding_rank=None, pp_rank=None): + self._model_weights = OrderedDict() + self._opt_state = OrderedDict() + self._master_weights = OrderedDict() + self._lr_scheduler = None + self.set_node_rank(mp_rank, sharding_rank, pp_rank) + + def set_node_rank(self, mp_rank, sharding_rank, pp_rank): + self._mp_rank = mp_rank + self._sharding_rank = sharding_rank + self._pp_rank = pp_rank + + def _add_kv(self, d, k, v): + assert k not in d + d[k] = v + + @property + def model_weights(self): + return self._model_weights + + def add_weight(self, k, v): + self._add_kv(self._model_weights, k, v) + + def add_weights(self, model_state_dict, rank=None): + for (k, v) in model_state_dict.items(): + if rank is not None: + k = (k, rank) + self.add_weight(k, v) + + def set_weights(self, model_state_dict): + self._model_weights = model_state_dict + + def set_opt_state(self, opt_state_dict): + self._opt_state = opt_state_dict + + def set_master_weights(self, master_weights): + self._master_weights = master_weights + + @property + def opt_state(self): + return self._opt_state + + def add_opt(self, k, v): + self._add_kv(self._opt_state, k, v) + + def add_opts(self, opts, rank=None): + if "master_weights" in opts: + s_master = opts["master_weights"] + opts.pop("master_weights") + self.add_master_weights(s_master, rank) + + if "LR_Scheduler" in opts: + lr_scheduler = opts["LR_Scheduler"] + opts.pop("LR_Scheduler") + self.set_lr_scheduler(lr_scheduler) + + for (k, v) in opts.items(): + if rank is not None: + k = (k, rank) + self.add_opt(k, v) + + @property + def master_weights(self): + return self._master_weights + + def add_master_weight(self, k, v): + self._add_kv(self._master_weights, k, v) + + def add_master_weights(self, master, rank=None): + for (k, v) in master.items(): + if rank is not None: + k = (k, rank) + self.add_master_weight(k, v) + + @property + def lr_scheduler(self): + return self._lr_scheduler + + def set_lr_scheduler(self, lr_scheduler): + if lr_scheduler is not None: + self._lr_scheduler = lr_scheduler + + def map_names(self, map_func): + """ + rename param names and change the keys of the dicts(model_weights, opt, master_weights) accordingly + """ + + def map_key(state_dict, map_key_func): + state_dict_tmp = OrderedDict() + (state_dict_tmp, state_dict) = (state_dict, state_dict_tmp) + for key in 
list(state_dict_tmp.keys()): + key_new = map_key_func(key) + state_dict[key_new] = state_dict_tmp[key] + del state_dict_tmp[key] + return state_dict + + def map_model_state_key(key): + packed = isinstance(key[0], tuple) + structure_name, t_name = key[0] if packed else key + t_name_new = map_func(structure_name, t_name) + key_new = ((structure_name, t_name_new), key[1]) if packed else (structure_name, t_name_new) + return key_new + + def map_opt_key(key): + packed = isinstance(key[0], tuple) + structure_name, t_name, opt_name = key[0] if packed else key + t_name_new = map_func(structure_name, t_name) + opt_name_new = t_name_new + opt_name[len(t_name) :] + key_new = ( + ((structure_name, t_name_new, opt_name_new), key[1]) + if packed + else (structure_name, t_name_new, opt_name_new) + ) + return key_new + + self._model_weights = map_key(self._model_weights, map_model_state_key) + self._opt_state = map_key(self._opt_state, map_opt_key) + self._master_weights = map_key(self._master_weights, map_opt_key) + return self + + def drop_rank(self): + """ + drop rank in the keys of the state dict + change dict of (key, rank)=>tensor to dict of key =>tensor + """ + + def drop(state, l=2): + tmp_state = OrderedDict() + (state, tmp_state) = (tmp_state, state) + for key in list(tmp_state.keys()): + k, rank = key + assert len(key) == 2 + assert len(k) == l + state[k] = tmp_state[key] + del tmp_state[key] + return state + + self._model_weights = drop(self._model_weights, 2) + self._opt_state = drop(self._opt_state, 3) + self._master_weights = drop(self._master_weights, 3) + return self + + def collapse_key(self): + """ + collapse dict of (key, rank)=>tensor to dict of key=>list[(rank, tensor)] + """ + + def collapse(state, l): + tmp_state = OrderedDict() + (state, tmp_state) = (tmp_state, state) + state_keys = list(tmp_state.keys()) + state_keys = sorted(state_keys) + pre = None + for key in state_keys: + assert len(key) == 2 + k, rank = key + if isinstance(k, tuple): + assert len(k) == l + if k != pre: + pre = k + state[k] = [] + state[k].append((rank, tmp_state[key])) + del tmp_state[key] + return state + + self._model_weights = collapse(self._model_weights, 2) + self._opt_state = collapse(self._opt_state, 3) + self._master_weights = collapse(self._master_weights, 3) + return self + + def flatten_key(self): + """ + flatten dict of key=>list[(rank, tensor)], to dict of (key, rank)=>tensor + """ + + def flatten(state, l): + tmp_state = OrderedDict() + (state, tmp_state) = (tmp_state, state) + state_keys = list(tmp_state.keys()) + for key in state_keys: + assert len(key) == l + for (rank, items) in tmp_state[key]: + state[(key, rank)] = items + del tmp_state[key] + return state + + self._model_weights = flatten(self._model_weights, 2) + self._opt_state = flatten(self._opt_state, 3) + self._master_weights = flatten(self._master_weights, 3) + return self + + def pack_keys(self, structure_name_mapping=None): + """ + change the key of model_weights dict from param_name to (structure_name, param_name); + change the key of opt dict from opt_name to (structure_name, param_name, opt_name); + chnage the key of master weights dict from param_name to (structure_name, param_name) + """ + # pack key for pp convert + def _opt_name_to_tname(tensor_names, opt_names): + tensor_names = set(tensor_names) + all_names = [] + all_names.extend(list(tensor_names)) + all_names.extend(opt_names) + all_names.sort() + pre_t_name = "" + suffix = [ + "_fp32_master_0_beta1_pow_acc_0", + "_fp32_master_0_beta2_pow_acc_0", + 
"_fp32_master_0_moment1_0", + "_fp32_master_0_moment2_0", + "_beta1_pow_acc_0", + "_beta2_pow_acc_0", + "_moment1_0", + "_moment2_0", + ] + opt_to_t = {} + for n in all_names: + if n in tensor_names: + # we get a param + pre_t_name = n + else: + assert pre_t_name + opt_to_t[n] = pre_t_name + + for t in opt_names: + _find = False + for s in suffix: + if t.endswith(s): + logger.info(f"{t}-{t[:-len(s)]}--{t[:-len(s)] in tensor_names}") + opt_to_t[t] = t[: -len(s)] + _find = True + break + assert _find + return opt_to_t + + if structure_name_mapping is not None: + tname_to_structure_name = {v: k for (k, v) in structure_name_mapping.items()} + else: + structure_name_mapping = {k: v.name for (k, v) in self._model_weights.items()} + tname_to_structure_name = {v: k for (k, v) in structure_name_mapping.items()} + + tensor_names = list(tname_to_structure_name.keys()) + opt_names = list(self._opt_state.keys()) + opt_name_to_tname = _opt_name_to_tname(tensor_names, opt_names) + + # model state + model_weights_tmp = OrderedDict() + (self._model_weights, model_weights_tmp) = (model_weights_tmp, self._model_weights) + for k in list(model_weights_tmp.keys()): + t_name = structure_name_mapping[k] + self._model_weights[(k, t_name)] = paddle.to_tensor(model_weights_tmp[k]).cpu() + del model_weights_tmp[k] + + # opt + opt_tmp = OrderedDict() + (self._opt_state, opt_tmp) = (opt_tmp, self._opt_state) + for opt_name in list(opt_tmp.keys()): + assert opt_name in opt_name_to_tname + t_name = opt_name_to_tname[opt_name] + assert t_name in tname_to_structure_name + structure_name = tname_to_structure_name[t_name] + self._opt_state[(structure_name, t_name, opt_name)] = opt_tmp[opt_name].cpu() + del opt_tmp[opt_name] + + # master weights + master_weights_tmp = OrderedDict() + (self._master_weights, master_weights_tmp) = (master_weights_tmp, self._master_weights) + for t_name in list(master_weights_tmp.keys()): + assert t_name in tname_to_structure_name + structure_name = tname_to_structure_name[t_name] + master_name = getattr(master_weights_tmp[t_name], "name", "") + self._master_weights[(structure_name, t_name, master_name)] = master_weights_tmp[t_name].cpu() + del master_weights_tmp[t_name] + + return self + + def unpack_keys(self): + """ + the opposite of pack_keys, + revert the key of model_weights dict from (structure_name, param_name) to param_name + revert the key of opt dict from (structure_name, param_name, opt_name) to opt_name + revert the key of master weights dict from (structure_name, param_name) to param_name + """ + # model weights + model_weights_tmp = OrderedDict() + (self._model_weights, model_weights_tmp) = (model_weights_tmp, self._model_weights) + for key in list(model_weights_tmp.keys()): + structure_name, t_name = key + self._model_weights[structure_name] = model_weights_tmp[key] + self._model_weights[structure_name].name = t_name + del model_weights_tmp[key] + # opt + opt_tmp = OrderedDict() + (self._opt_state, opt_tmp) = (opt_tmp, self._opt_state) + for key in list(opt_tmp.keys()): + structure_name, t_name, opt_name = key + if structure_name in self._model_weights: + assert self._model_weights[structure_name].name == t_name + self._opt_state[opt_name] = opt_tmp[key] + self._opt_state[opt_name].name = opt_name + del opt_tmp[key] + + # master weights + master_weights_tmp = OrderedDict() + (self._master_weights, master_weights_tmp) = (master_weights_tmp, self._master_weights) + for key in list(master_weights_tmp.keys()): + structure_name, t_name, master_name = key + if structure_name in 
self._model_weights: + assert self._model_weights[structure_name].name == t_name + self._master_weights[t_name] = master_weights_tmp[key] + self._master_weights[t_name].name = master_name + return self + + def split_state(self, split_func): + """ + split this node state to multiple node state according to the passed in split_func + """ + node_model_states = {} + for (k, v) in self._model_weights.items(): + rank = split_func(k) + if rank not in node_model_states: + node_model_states[rank] = NodeModelState() + node_model_states[rank].add_weight(k, v) + + for (k, v) in self._opt_state.items(): + rank = split_func(k) + if rank not in node_model_states: + node_model_states[rank] = NodeModelState() + node_model_states[rank].add_opt(k, v) + + for (k, v) in self._master_weights.items(): + rank = split_func(k) + if rank not in node_model_states: + node_model_states[rank] = NodeModelState() + node_model_states[rank].add_master_weight(k, v) + + return node_model_states + + def even_distribute(self, group): + """ + distribute the node state evenly among all workers in group, and make sure + in the dicts of (key, rank)=>tensor, items keys of the same key but different rank are distributed to the + same worker + """ + # sharding degree == 1 + if group is None or group.nranks < 2: + return self + + def build_router(state_dict): + state_keys_list = all_gather_simple_object([(k, v.shape) for (k, v) in state_dict.items()], group) + + key_to_size = {} + for l in state_keys_list: + for (k, shape) in l: + key, rank = k + if key not in key_to_size: + key_to_size[key] = 0 + key_to_size[key] = key_to_size[key] + np.prod(shape) + + key_to_size = sorted(list(key_to_size.items()), key=lambda x: x[1], reverse=True) + node_distributed = [0 for _ in range(group.nranks)] + key_to_rank = {} + for (k, v) in key_to_size: + min_val = min(node_distributed) + min_index = node_distributed.index(min_val) + key_to_rank[k] = min_index + node_distributed[min_index] = node_distributed[min_index] + v + + return key_to_rank + + def distribute(state_dict): + + key_to_rank = build_router(state_dict) + + def filter_func(key): + assert key[0] in key_to_rank, key + dst_rank = key_to_rank[key[0]] + return dst_rank == group.rank + + return _all_gather_state_dict(state_dict, filter_func, group) + + self._model_weights = distribute(self._model_weights) + self._opt_state = distribute(self._opt_state) + self._master_weights = distribute(self._master_weights) + return self + + def reshard(self, group, filter_func): + """ + reshard according to the passed in filter_func + """ + self._model_weights = _all_gather_state_dict(self._model_weights, filter_func, group) + self._opt_state = _all_gather_state_dict(self._opt_state, filter_func, group) + self._master_weights = _all_gather_state_dict(self._master_weights, filter_func, group) + lr_schedulers = all_gather_simple_object(self._lr_scheduler, group) + self._lr_scheduler = lr_schedulers[0] + return self + + def split_items(self, split_func): + """ + split tensor in the dicts of key=tensor, change the dicts to dicts of key=>list[(rank, tensor)] + """ + + def split(state, l): + tmp_state = OrderedDict() + (state, tmp_state) = (tmp_state, state) + state_keys = list(tmp_state.keys()) + for key in state_keys: + assert len(key) == l + v = tmp_state[key] + state[key] = split_func(key, v) + del tmp_state[key] + return state + + self._model_weights = split(self._model_weights, 2) + self._opt_state = split(self._opt_state, 3) + self._master_weights = split(self._master_weights, 3) + return self + + def 
merge_items(self, merge_func): + """ + merge list in the dicts of key=>list[(rank, tensor)] a tensor, change the dicts to dicts of key=>tensor + """ + + def merge(state, l): + tmp_state = OrderedDict() + (state, tmp_state) = (tmp_state, state) + state_keys = list(tmp_state.keys()) + for key in state_keys: + if isinstance(key, tuple): + assert len(key) == l + v = tmp_state[key] + v = sorted(v, key=lambda x: x[0]) + state[key] = merge_func(key, v) + del tmp_state[key] + return state + + self._model_weights = merge(self._model_weights, 2) + self._opt_state = merge(self._opt_state, 3) + self._master_weights = merge(self._master_weights, 3) + return self + + def merge_from(self, other, rank=None): + self.add_weights(other.model_weights, rank) + self.add_opts(other.opt_state, rank) + self.add_master_weights(other.master_weights, rank) + if other.lr_scheduler is not None: + self.set_lr_scheduler(other.lr_scheduler) + return self + + def get_opt_state_dict(self): + opt_state_dict = OrderedDict() + for (k, v) in self.opt_state.items(): + opt_state_dict[k] = v + if self._lr_scheduler is not None: + opt_state_dict["LR_Scheduler"] = self._lr_scheduler + opt_state_dict["master_weights"] = self._master_weights + return opt_state_dict + + +def all_gather_simple_object(obj, group): + res = [] + if group.nranks < 2: + return [obj] + paddle.distributed.all_gather_object(res, obj, group) + return res + + +def all_gather_state_dict(state_dict, filter_func, group): + res = OrderedDict() + + def map_func(weight): + if isinstance(weight, paddle.Tensor): + weight = weight.numpy() + return weight + + state_dict = {k: map_func(v) for (k, v) in state_dict.items()} + + meta_dict = {} + for (k, v) in state_dict.items(): + # src rank + meta_dict[k] = (v.dtype, v.shape, group.rank) + + meta_dict_list = all_gather_simple_object(meta_dict, group) + + total_meta_dict = {} + for meta_dict in meta_dict_list: + for (k, v) in meta_dict.items(): + assert k not in total_meta_dict + total_meta_dict[k] = v + + meta_list = list(total_meta_dict.items()) + meta_list = sorted(meta_list, key=lambda x: x[0]) + for (k, meta) in meta_list: + dtype, shape, rank = meta + if rank == group.rank: + assert k in state_dict + tensor = paddle.to_tensor(state_dict[k]) + del state_dict[k] + else: + tensor = paddle.to_tensor(np.empty(shape, dtype)) + logger.info(f"broadcast {k} from {rank}") + # broadcast the tensor + paddle.distributed.broadcast( + tensor, + src=group.ranks[rank], + group=group, + sync_op=True, + ) + if filter_func(k): + res[k] = tensor.cpu() + del tensor + return res + + +def _all_gather_state_dict(state_dict, filter_func, group): + remote_state_dict_keys = [k for k in state_dict.keys() if not filter_func(k)] + tmp_state_dict = OrderedDict() + for k in remote_state_dict_keys: + tmp_state_dict[k] = state_dict[k] + state_dict.pop(k) + tmp_state_dict = all_gather_state_dict(tmp_state_dict, filter_func, group) + for (k, v) in tmp_state_dict.items(): + state_dict[k] = v + return state_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/pp_reshard.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/pp_reshard.py new file mode 100644 index 000000000..5c98e6069 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/pp_reshard.py @@ -0,0 +1,336 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from collections import OrderedDict + +from paddle.distributed.fleet.model import PipelineParallel +from paddle.distributed.fleet.utils.log_util import logger + +_GLOBAL_EXTRACT_LAYER_NAME_FUNC = None + + +def regitser_extract_layer_name_func(func): + global _GLOBAL_EXTRACT_LAYER_NAME_FUNC + _GLOBAL_EXTRACT_LAYER_NAME_FUNC = func + + +def get_extract_layer_name_func(): + global _GLOBAL_EXTRACT_LAYER_NAME_FUNC + assert _GLOBAL_EXTRACT_LAYER_NAME_FUNC is not None, "extract layer func is not registered yet" + return _GLOBAL_EXTRACT_LAYER_NAME_FUNC + + +_GLOBAL_INDEX_LAYER_FUNC = None + + +def register_index_layer_func(func): + global _GLOBAL_INDEX_LAYER_FUNC + _GLOBAL_INDEX_LAYER_FUNC = func + + +def get_index_layer_func(): + global _GLOBAL_INDEX_LAYER_FUNC + assert _GLOBAL_INDEX_LAYER_FUNC is not None, "index layer func is not registered yet" + return _GLOBAL_INDEX_LAYER_FUNC + + +class LayerNameScope: + """ + layer name scope for a layer, layer name of the same kind of layer will be named consecutively + """ + + registered_layers = [] + + def __init__(self, prefix, template): + self.prefix = prefix + self.last_layer_id = "" + self.last_old_layer_name = "" + self.template = template + self.index = -1 + self.sub_scopes = OrderedDict() + + @classmethod + def get_layer_prefix(cls, old_layer_name): + for k in cls.registered_layers: + if old_layer_name.startswith(k): + return k + return None + + @classmethod + def register_layer_prefix(cls, prefix): + if prefix not in cls.registered_layers: + cls.registered_layers.append(prefix) + cls.registered_layers.sort(key=lambda x: len(x), reverse=True) + + def get_next_scope(self, layer_id, old_layer_name): + if old_layer_name != self.last_old_layer_name or layer_id != self.last_layer_id: + self.index = self.index + 1 + self.last_old_layer_name = old_layer_name + self.last_layer_id = layer_id + self.sub_scopes = OrderedDict() + return self + + def get_layer_name(self): + name = "" + if self.template: + name = self.template.format(self.index) + if self.prefix: + name = self.prefix + "_" + name + return name + + def get_sub_scope(self, sub_layer_name): + layer_prefix = self.get_layer_prefix(sub_layer_name) + assert layer_prefix, f"{sub_layer_name} invalid, prefix {self.prefix}" + if layer_prefix in self.sub_scopes: + return self.sub_scopes[layer_prefix] + layer_template = f"{layer_prefix}_{{}}" + prefix = self.get_layer_name() + scope = LayerNameScope(prefix, layer_template) + self.sub_scopes[layer_prefix] = scope + return scope + + +def register_layername_prefix(layer_name): + LayerNameScope.register_layer_prefix(layer_name) + + +def extract_param_names_groupby_layer( + meta, + mp_rank=0, +): + param_names_by_layer = OrderedDict() + assert "parallel_config" in meta + parallel_config = meta["parallel_config"] + assert "pp_degree" in parallel_config + pp_degree = int(parallel_config["pp_degree"]) + sharding_metas = meta["sharding_metas"] + for pp_rank in range(pp_degree): + suffix = 
f"tp{mp_rank:0>2d}_pp{pp_rank:0>2d}" + assert suffix in sharding_metas + assert "structure_name_mapping" in sharding_metas[suffix] + name_mapping = sharding_metas[suffix]["structure_name_mapping"] + for (k, v) in name_mapping.items(): + layer_name = get_extract_layer_name_func()(k) + if layer_name not in param_names_by_layer: + param_names_by_layer[layer_name] = [] + param_names_by_layer[layer_name].append((k, v)) + return param_names_by_layer + + +def build_pipeline_context(meta, pp_model): + assert isinstance(pp_model, PipelineParallel), type(pp_model) + layer_params = extract_param_names_groupby_layer(meta, 0) + # 2、rename tensor names + pipeline_context = PipeLineSegmentContext( + pp_model, + layer_params, + ) + return pipeline_context + + +class LayerReNamingManager: + def __init__(self): + self.top_layer_name_scope = LayerNameScope(None, None) + + def get_new_layer_name(self, layer_id: str, old_name: str): + name_scope = self.top_layer_name_scope.get_sub_scope(old_name).get_next_scope(layer_id, old_name) + return name_scope.get_layer_name() + + def get_new_param_name(self, layer_id, old_name: str): + names = old_name.split(".") + layer_name = self.get_new_layer_name(layer_id, names[0]) + names[0] = layer_name + return ".".join(names) + + +class PipeLinelayer: + def __init__(self, layer_name, param_names): + self._layer_name = layer_name + + # make sure name with the same sublayer type is ordered + def sort_key(x): + # assume param_name is of the type layer_type_{same_layer_index}.w_{weight_index} + structure_name, param_name = x + same_layer_index = param_name.split(".")[0].split("_")[-1] + return int(same_layer_index) + + param_names = sorted(param_names, key=sort_key) + self._params = OrderedDict() + for (k, v) in param_names: + self._params[k] = v + + @property + def params(self): + return self._params + + @property + def name(self): + return self._layer_name + + +class PipeLineSegment: + def __init__(self, start_index, end_index): + self._start_index = start_index + self._end_index = end_index + self._cur_index = start_index + self._layers = OrderedDict() + + def add_layer(self, layer_name, param_names): + assert self._cur_index < self._end_index + layer = PipeLinelayer(layer_name, param_names) + self._layers[layer_name] = layer + self._cur_index = self._cur_index + 1 + + @property + def layers(self): + assert self._cur_index <= self._end_index + return self._layers + + +class PipeLineStage: + def __init__(self): + self._rename_mgr = LayerReNamingManager() + # map segement start index to segment + self._segments = OrderedDict() + self._layer_to_segment = OrderedDict() + self._param_to_tname = OrderedDict() + + def add_segment(self, start_index, end_index): + segment = PipeLineSegment(start_index, end_index) + self._segments[start_index] = segment + for i in range(start_index, end_index): + self._layer_to_segment[i] = segment + + def add_layer(self, layer_index, layer_name, param_names): + assert layer_index in self._layer_to_segment + segment = self._layer_to_segment[layer_index] + segment.add_layer(layer_name, param_names) + + def build_name_mapping(self): + for (k, segment) in self._segments.items(): + for (i, layer) in segment.layers.items(): + for param in layer.params.items(): + (param_name, tensor_name) = param + # map to a new name + n_name = self._rename_mgr.get_new_param_name(layer.name, tensor_name) + # logger.info(f"{param_name} {tensor_name}=>{n_name}") + self._param_to_tname[param_name] = (tensor_name, n_name) + + def map_name(self, param_name, t_name): + assert 
param_name in self._param_to_tname + tensor_name, n_name = self._param_to_tname[param_name] + assert tensor_name == t_name + return n_name + + def print_name_mapping(self): + for (name, mapping) in self._param_to_tname.items(): + logger.info(f"{name} mapping {mapping[0]} => {mapping[1]}\n") + + +# segment context for pp X sharding +class PipeLineSegmentContext: + def __init__( + self, + pp_model, + param_names_by_layer, + ): + self._pp_degree = pp_model._layers._num_stages + self._vpp_degree = pp_model._layers._num_virtual_pipeline_stages + self._segment_method = "layer" + self._layers = list(param_names_by_layer.keys()) + self._pp_model = pp_model + self._stages = [] + self._layer_index_to_stage = {} + self._layer_name_to_index = {} + self._layer_index_to_name = {} + self._layer_name_to_stage = {} + self._param_names_by_layer = param_names_by_layer + + self._index_layers() + + stage_segments = self._segment() + for (i, stage_seg) in enumerate(stage_segments): + pipe_stage = PipeLineStage() + self._stages.append(pipe_stage) + for seg in stage_seg: + pipe_stage.add_segment(seg[0], seg[1]) + for j in range(*seg): + if j in self._layer_index_to_name: + layer_name = self._layer_index_to_name[j] + assert layer_name in self._param_names_by_layer + pipe_stage.add_layer(j, layer_name, self._param_names_by_layer[layer_name]) + self._layer_index_to_stage[j] = i + self._layer_name_to_stage[layer_name] = i + + for stage in self._stages: + stage.build_name_mapping() + + def _index_layers(self): + for layer_name in self._param_names_by_layer.keys(): + index = get_index_layer_func()(layer_name) + self._layer_name_to_index[layer_name] = index + self._layer_index_to_name[index] = layer_name + + def _segment(self): + index_segments = [[] for _ in range(self._pp_degree)] + segment_parts = self._pp_model._layers.segment_parts + for i in range(self._pp_model._layers._total_stages_with_virtual_stages): + stage = i % self._pp_degree + index_segments[stage].append((segment_parts[i], segment_parts[i + 1])) + print(f"segment results {index_segments}") + return index_segments + + def map_name(self, param_name, t_name): + layer_name = get_extract_layer_name_func()(param_name) + assert layer_name in self._layer_name_to_index + layer_index = self._layer_name_to_index[layer_name] + stage_index = self._layer_index_to_stage[layer_index] + stage = self._stages[stage_index] + return stage.map_name(param_name, t_name) + + def map_name_to_stage(self, name): + layer_name = get_extract_layer_name_func()(name) + assert layer_name in self._layer_name_to_index + layer_index = self._layer_name_to_index[layer_name] + stage_index = self._layer_index_to_stage[layer_index] + return stage_index + + def print_name_mapping(self): + for (i, stage) in enumerate(self._stages): + print(f"{'='*30}stage {i} {'='*30}") + stage.print_name_mapping() + + +def reshard(node_model_state, reshard_context, hcg): + pp_degree = hcg.get_pipe_parallel_world_size() + pp_rank = hcg.get_stage_id() + group = hcg.get_pipe_parallel_group() + + # all gather + def filter_func(name): + names, rank = name + stage_id = reshard_context.map_name_to_stage(names[0]) + assert stage_id < pp_degree + return stage_id == pp_rank + + node_model_state.reshard(group, filter_func) + + def name_map_func(structure_name, p_name): + map_name = reshard_context.map_name(structure_name, p_name) + return map_name + + node_model_state.map_names(name_map_func) + + return node_model_state diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v1.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v1.py new file mode 100644 index 000000000..6c7e637ec --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v1.py @@ -0,0 +1,42 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizer, +) + +from ....transformers.model_utils import unwrap_optimizer + + +def shard(node_model_state, model, optimizer, hcg): + group = hcg.get_sharding_parallel_group() + cur_rank = group.rank + optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizer) + assert optimizer is not None + param2rank = optimizer._param2rank + + def filter_func(key): + names = key + param_name = names[1] + assert param_name in param2rank + dst_rank = param2rank[param_name] + return dst_rank == cur_rank + + node_model_state.reshard(group, filter_func) + return node_model_state + + +def restore(node_model_state, model, optimizer, hcg): + node_model_state.drop_rank() + return node_model_state diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v2.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v2.py new file mode 100644 index 000000000..d5df4666a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/reshard/sharding_v2.py @@ -0,0 +1,232 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
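In the ShardingV2 path defined below, each optimizer tensor occupies a padded slot (index, padded_size) inside a flat communication buffer of buffer_size elements, and the buffer is divided into buffer_size // nranks equal per-rank slices, so a slot that straddles a slice boundary has to be split into per-rank fragments. A self-contained sketch of that slicing arithmetic, mirroring split_func inside shard(); pure Python, toy sizes, no Paddle required:

```python
def split_slot_across_ranks(index, padded_size, buffer_size, nranks):
    """Return (rank, begin, end) fragments of a padded slot that starts at
    `index` in a flat buffer sharded into equal slices across `nranks` ranks."""
    assert buffer_size % nranks == 0, "buffer must divide evenly across ranks"
    buffer_slice = buffer_size // nranks
    offset = min(buffer_slice - index % buffer_slice, padded_size)
    pieces = [(index // buffer_slice, 0, offset)]  # part kept by the slot's first rank
    while offset < padded_size:                    # spill-over into the following slices
        end = min(offset + buffer_slice, padded_size)
        pieces.append(((index + offset) // buffer_slice, offset, end))
        offset = end
    return pieces

# A 1000-element slot starting at offset 900 of a 2048-element buffer over 2 ranks:
# the first 124 elements stay in rank 0's slice, the remaining 876 go to rank 1.
print(split_slot_across_ranks(index=900, padded_size=1000, buffer_size=2048, nranks=2))
# -> [(0, 0, 124), (1, 124, 1000)]
```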
+ +import numpy as np +import paddle +import paddle.distributed.fleet as fleet +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer import ( + HybridParallelOptimizer, +) +from paddle.distributed.fleet.model import PipelineParallel + +from paddlenlp.utils.log import logger + +from ....transformers.model_utils import unwrap_optimizer + +try: + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizerV2, + ) +except: + DygraphShardingOptimizerV2 = None + + +from paddle.distributed.communication.reduce import ReduceOp + + +def shard(node_model_state, model, optimizer, hcg): + assert DygraphShardingOptimizerV2 is not None + group = hcg.get_sharding_parallel_group() + cur_rank = group.rank + split_infos = collect_split_info(optimizer, model) + + def split_func(k, v): + param_name = k[1] + opt_name = k[-1] + assert param_name in split_infos, f"param_name {param_name}, split_infos{split_infos}" + is_beta = is_bata(opt_name) + index, padded_size, buffer_size, has_slice_grad = split_infos[param_name] + + if not is_beta: + v = pad_tensor(k, v, padded_size) + + def get_slice(v, begin, end): + if is_beta: + return v + return slice_tensor(v, begin, end) + + assert buffer_size % group.nranks == 0, f"buffer_size {buffer_size} group.nranks {group.nranks}" + buffer_slice = buffer_size // group.nranks + + # has slice grad in cur rank + if has_slice_grad: + assert index < (cur_rank + 1) * buffer_slice + assert index + padded_size > cur_rank * buffer_slice + + offset = buffer_slice - index % buffer_slice + tensors = [] + tensors.append((index // buffer_slice, get_slice(v, 0, min(offset, padded_size)))) + + left_size = padded_size - offset + + if left_size > 0: + for _ in range((left_size + buffer_slice - 1) // buffer_slice): + end = min(offset + buffer_slice, padded_size) + assert end <= buffer_size + tensors.append(((offset + index) // buffer_slice, get_slice(v, offset, end))) + offset = end + + return tensors + + node_model_state.split_items(split_func).flatten_key() + + def filter_func(k): + names, rank = k + assert rank < group.nranks + return rank == cur_rank + + # reshard + node_model_state.reshard(group, filter_func) + node_model_state.drop_rank() + return node_model_state + + +def restore(node_model_state, model, optimizer, hcg): + group = hcg.get_sharding_parallel_group() + # evenly distribute param + node_model_state.even_distribute(group) + param_shapes = {k: v.shape for (k, v) in model.state_dict().items()} + + def merge_func(k, v): + structure_name = k[0] + opt_name = k[-1] + assert structure_name in param_shapes, structure_name + tensor_list = [e[1] for e in v] + # do not merge beta acc + if is_bata(opt_name): + return tensor_list[0] + shape = param_shapes[structure_name] + return merge_tensors(k, tensor_list, shape) + + node_model_state.collapse_key().merge_items(merge_func) + return node_model_state + + +def merge_tensors(k, tensor_list, shape): + assert len(tensor_list) > 0 + if len(tensor_list) == 1: + t = tensor_list[0] + else: + assert len(tensor_list[0].shape) == 1 + t = paddle.concat(x=tensor_list, axis=0) + tensor_size = np.prod(shape) + padded_size = t._numel() + assert padded_size >= tensor_size, f"{k} padded_size {padded_size} tensor_size {tensor_size}" + t = t._slice(0, tensor_size) + t.get_tensor()._set_dims(shape) + return t + + +def pad_tensor(k, tensor, padded_size): + tensor_shape = tensor.shape + tensor_size = np.prod(tensor_shape) + assert tensor_size <= padded_size, f"{k} tensor_size 
{tensor_size} padded_size {padded_size}" + t = paddle.zeros([padded_size], dtype=tensor.dtype) + tensor.flatten_() + t[0:tensor_size] = tensor + tensor.get_tensor()._set_dims(tensor_shape) + return t + + +def slice_tensor(tensor, begin, end): + return tensor[begin:end] + + +def collect_split_info(optimizer, model, only_return_lengths=False): + split_infos = {} + + def gather_infos(comm_buffer): + for (k, v) in comm_buffer._sharding_param_grad_view.items(): + index = v._index + padded_size = v._padded_size + buffer_size = v._param_buffer._numel() + has_slice_grad = v._slice_grad is not None + if only_return_lengths: + if v._param_begin < v._param_end: + split_infos[k] = v._param_end - v._param_begin + else: + split_infos[k] = None + else: + split_infos[k] = (index, padded_size, buffer_size, has_slice_grad) + + if isinstance(model, PipelineParallel) and model._sharding_comm_overlap > 0: + optimizer = unwrap_optimizer(optimizer, HybridParallelOptimizer) + assert optimizer is not None + # dalayed comm_overlap_hook register + model.register_sharding_comm_overlap_hook(optimizer) + for (k, v) in model._chunk_2_comm_buffers.items(): + for comm_buffer in v: + gather_infos(comm_buffer) + + else: + optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizerV2) + assert optimizer is not None + for comm_buffer in optimizer._comm_buffer_list: + gather_infos(comm_buffer) + + assert len(split_infos) + return split_infos + + +def is_matched_optimizer_state_dict(opt_state_dict, optimizer, model, hcg=None, need_allgather=True): + split_infos = collect_split_info(optimizer, model, only_return_lengths=True) + master_weights = opt_state_dict.get("master_weights", None) + + def get_matched_length(name): + if master_weights and name in master_weights: + tensor = master_weights[name] + else: + moment_name = name + "_moment1_0" + if moment_name not in opt_state_dict: + return None + + tensor = opt_state_dict[moment_name] + if isinstance(tensor, (list, tuple)): + assert len(tensor) == 2, tensor + assert isinstance(tensor[0], str), tensor[0] + tensor = tensor[1] + shape = tensor.shape + assert len(shape) == 1, shape + length = shape[0] + return length + + is_matched = 1 + for k, length in split_infos.items(): + matched_length = get_matched_length(k) + if length != matched_length: + is_matched = 0 + break + + if need_allgather: + if hcg is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_sharding_parallel_group() + if group is not None and group.nranks > 1: + x = paddle.to_tensor([is_matched], dtype=paddle.int32) + paddle.distributed.stream.all_reduce(x, op=ReduceOp.MIN, group=group, sync_op=True, use_calc_stream=True) + global_is_matched = int(x.numpy()[0]) + else: + global_is_matched = is_matched + + global_is_matched = True if global_is_matched else False + logger.info(f"Sharding reshard checkpoint: local_match = {is_matched} , global_match = {global_is_matched}") + return global_is_matched + + +def is_bata(name): + if "_beta1_pow_acc_" in name: + return True + if "_beta2_pow_acc_" in name: + return True + return False diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/sharding_io.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/sharding_io.py new file mode 100644 index 000000000..59ad5e5e5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trainer/utils/sharding_io.py @@ -0,0 +1,605 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
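# A numpy sketch of the pad -> flatten -> split -> concat -> trim round trip
# that pad_tensor / slice_tensor / merge_tensors implement above for sharding
# v2 (illustrative only; the real code works on paddle tensors and the
# rank-indexed comm buffers).
import numpy as np

def pad_flatten(x, padded_size):
    flat = x.reshape(-1)
    out = np.zeros(padded_size, dtype=x.dtype)
    out[:flat.size] = flat
    return out

def merge_slices(slices, shape):
    flat = np.concatenate(slices)                     # padded, 1-D
    return flat[:int(np.prod(shape))].reshape(shape)  # drop padding, restore shape

x = np.arange(10, dtype=np.float32).reshape(2, 5)
padded = pad_flatten(x, 12)        # padded_size chosen divisible by nranks
parts = np.split(padded, 4)        # one equal slice per rank (nranks = 4)
assert np.array_equal(merge_slices(parts, x.shape), x)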
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +from collections import OrderedDict + +import paddle +import paddle.distributed as dist +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer import ( + DygraphShardingOptimizer, +) + +try: + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizerV2, + ) +except: + DygraphShardingOptimizerV2 = None + +from paddlenlp.transformers.model_utils import ( + _add_variant, + get_parameter_dtype, + unwrap_optimizer, +) +from paddlenlp.transformers.utils import paddlenlp_load +from paddlenlp.utils.log import logger + +from . import reshard as reshard_util +from .reshard import SHARDING_STRATEGY_V1, SHARDING_STRATEGY_V2, pp_reshard + +# Name of the files used for checkpointing +TRAINING_ARGS_NAME = "training_args.bin" +TRAINER_STATE_NAME = "trainer_state.json" + +OPTIMIZER_NAME = "optimizer.pdopt" +SCHEDULER_NAME = "scheduler.pdparams" +SCALER_NAME = "scaler.pdparams" +MODEL_META_NAME = "model_meta.json" +SHARDING_META_NAME = "shard_meta.json" + + +def filter_sharded_params(state_dict, optimizer, sharding_group): + + sharding_rank = sharding_group.rank + sharding_world_size = sharding_group.nranks + from paddlenlp.trainer.utils import reshard as reshard_util + + logger.info(f"filter sharded_params not placed in sharding_rank {sharding_rank} .") + if not reshard_util.is_sharding_opt(optimizer): + return state_dict + + filtered_state_dict = OrderedDict() + if reshard_util.get_sharding_strategy(optimizer) == reshard_util.SHARDING_STRATEGY_V1: + optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizer) + for (k, v) in state_dict.items(): + if v.name in optimizer._param2rank: + sharded_rank = optimizer._param2rank[v.name] + if sharded_rank != sharding_rank: + continue + filtered_state_dict[k] = v + else: + if sharding_rank == 0: + filtered_state_dict[k] = v + else: + optimizer = unwrap_optimizer(optimizer, DygraphShardingOptimizerV2) + parameters = optimizer._parameter_list + filtered_parameters = [p.name for (i, p) in enumerate(parameters) if i % sharding_world_size == sharding_rank] + filtered_parameters = set(filtered_parameters) + for (k, v) in state_dict.items(): + if v.name in filtered_parameters: + filtered_state_dict[k] = v + return filtered_state_dict + + +def exclude_paramters_in_state_dict( + model_state_dict, param_names_in_master_weights, sharding_group, should_save_sharding_stage1_model=True +): + assert sharding_group is not None + assert isinstance(model_state_dict, dict) and isinstance( + param_names_in_master_weights, (list, set) + ), "param_names_in_master_weights type:{}".format(type(param_names_in_master_weights)) + state_param_names = [v.name for k, v in model_state_dict.items()] + logger.debug( + "param_names_in_master_weights:{}, state_param_names:{}".format( + param_names_in_master_weights, state_param_names + ) + ) + # allgather parameter names in sharding group + tmp = 
[] + paddle.distributed.all_gather_object(tmp, param_names_in_master_weights, group=sharding_group) + param_names_in_master_weights = set([v for item in tmp for v in item]) + logger.info("sharding_group_param_names:{}".format(param_names_in_master_weights)) + non_parameters_state_dict = copy.copy(model_state_dict) + for k, v in model_state_dict.items(): + if v.name in param_names_in_master_weights: + non_parameters_state_dict.pop(k) + + return non_parameters_state_dict + + +class ShardingIO: + def __init__(self, args, model, optimizer=None, hcg=None): + self.args = args + self.model = model + self.optimizer = optimizer + self.hcg = hcg + self.sharding_group = None + if self.hcg is None and paddle.distributed.get_world_size() > 1 and self.args.use_hybrid_parallel: + self.hcg = fleet.get_hybrid_communicate_group() + self.sharding_group = self.hcg.get_sharding_parallel_group() + + def set_optimizer(self, optimizer): + self.optimizer = optimizer + + def load_state_dict_from_checkpoint_with_reshard( + self, checkpoint, base_weight_name, model_wrapped, opt_state_dict=None + ): + """load state_dict from_checkpoint with reshard, Only load model state dict. + Args: + checkpoint (str): The directory of the checkpoint. + base_weight_name (str): The name of the checkpoint file. + model_wrapped (nn.Layer): The wrapped model. + """ + parallel_config = self._load_distributed_strategy(checkpoint) + pp_degree = parallel_config["pp_degree"] + mp_degree = parallel_config["mp_degree"] + sharding_degree = parallel_config["sharding_degree"] + assert ( + self.args.tensor_parallel_degree == mp_degree + ), f"mp_degree of the script {self.args.tensor_parallel_degree} and mp of the model {mp_degree} are not matched" + cur_sharding_degree = self.args.sharding_parallel_degree + cur_pp_degree = self.args.pipeline_parallel_degree + if pp_degree > 1: + assert cur_pp_degree > 1, "can not reshard from pp to non pp" + if pp_degree <= 1: + assert cur_pp_degree <= 1, "can not reshard from non pp to pp" + + def load_model_slices(): + model_state = reshard_util.NodeModelState() + for j in range(self.args.pipeline_parallel_rank, pp_degree, cur_pp_degree): + cur_sharding_meta = self._load_sharding_meta(checkpoint, j) + assert "structure_name_mapping" in cur_sharding_meta + structure_name_map = cur_sharding_meta["structure_name_mapping"] + for i in range(self.args.sharding_parallel_rank, sharding_degree, cur_sharding_degree): + tmp = self._load_one_state_dict_from_checkpoint( + checkpoint, base_weight_name, self.args.sharded_name_suffix(i, j) + ) + node_model_state_tmp = reshard_util.NodeModelState() + node_model_state_tmp.add_weights(tmp) + node_model_state_tmp.pack_keys(structure_name_map) + model_state.merge_from(node_model_state_tmp, i) + return model_state + + node_model_state = load_model_slices() + + if self._need_reshard_pp(checkpoint): + meta = self._load_model_meta(checkpoint) + reshard_context = pp_reshard.build_pipeline_context(meta, model_wrapped) + node_model_state = pp_reshard.reshard(node_model_state, reshard_context, self.hcg) + + node_model_state.drop_rank() + node_model_state.unpack_keys() + state_dict = node_model_state.model_weights + + def filter_func(name): + return True + + state_dict = reshard_util.all_gather_state_dict(state_dict, filter_func, self.sharding_group) + + if self.args.bf16: + state_dict = self._recover_params_from_master_weights(state_dict, opt_state_dict=opt_state_dict) + + return state_dict + + def _load_one_state_dict_from_checkpoint(self, resume_from_checkpoint, base_weight_name, 
weight_name_suffix): + """ + load state_dict of one shard from_checkpoint, Only load model state dict. + """ + file_path = os.path.join(resume_from_checkpoint, _add_variant(base_weight_name, weight_name_suffix)) + if not os.path.isfile(file_path): + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}, no {file_path}") + + logger.info(f"Loading model from {resume_from_checkpoint} .") + # We load the model state dict on the CPU to avoid an OOM error. + state_dict = paddle.load(file_path, return_numpy=True) + return state_dict + + def _load_optimizer_state_of_one_shard(self, checkpoint, base_opt_name, optimizer_name_suffix): + optimizer_name = _add_variant(base_opt_name, optimizer_name_suffix) + path = os.path.join(checkpoint, optimizer_name) + logger.info(f"load optimizer state from {path}") + if os.path.isfile(path): + return self._modify_ckpt_for_compatibility(paddlenlp_load(path, map_location="cpu")) + logger.info(f"{path} not exists") + return None + + def _modify_ckpt_for_compatibility(self, ckpt): + master_weights = ckpt.get("master_weights", None) + if master_weights: + for k, v in master_weights.items(): + assert isinstance(v, paddle.Tensor), v + if not v.name.startswith(k): + new_name = k + "_fp32_master_0" + logger.info(f"Modify master weights {v.name} -> {new_name}") + v.name = new_name + return ckpt + + def _need_reshard(self, checkpoint): + if self._need_reshard_pp(checkpoint): + return True + parallel_config = self._load_distributed_strategy(checkpoint) + sharding_meta = self._load_sharding_meta(checkpoint) + sharding_degree = parallel_config["sharding_degree"] + sharding_strategy = SHARDING_STRATEGY_V1 + if "sharding_strategy" in sharding_meta: + sharding_strategy = sharding_meta["sharding_strategy"] + cur_sharding_degree = self.args.sharding_parallel_degree + cur_sharding_strategy = reshard_util.get_sharding_strategy(self.optimizer) + if sharding_degree != cur_sharding_degree or sharding_strategy != cur_sharding_strategy: + return True + if sharding_strategy == SHARDING_STRATEGY_V1: + param2rank = sharding_meta["param2rank"] + optimizer = unwrap_optimizer(self.optimizer, DygraphShardingOptimizer) + assert optimizer + assert len(param2rank) == len(optimizer._param2rank) + for (k, v) in param2rank.items(): + assert k in optimizer._param2rank + if optimizer._param2rank[k] != int(v): + return True + else: + pp_overlap = None + # backward compatibility + if "enable_overlap" in sharding_meta: + pp_overlap = sharding_meta["enable_overlap"] + + cur_pp_overlap = unwrap_optimizer(self.optimizer, DygraphShardingOptimizerV2).pp_overlap + return pp_overlap != cur_pp_overlap + + return False + + def _need_reshard_pp(self, checkpoint): + parallel_config = self._load_distributed_strategy(checkpoint) + pp_degree = parallel_config["pp_degree"] + cur_pp_degree = self.args.pipeline_parallel_degree + if pp_degree != cur_pp_degree: + return True + # vpp、segment method changes is not auto supported yet + return self.args.force_reshard_pp + + def load_optimizer_state_with_reshard(self, checkpoint, base_opt_name, model_wrapped): + """load state_dict of multiple shard from_checkpoint, Only load model state dict.""" + + parallel_config = self._load_distributed_strategy(checkpoint) + sharding_meta = self._load_sharding_meta(checkpoint, 0) + pp_degree = parallel_config["pp_degree"] + mp_degree = parallel_config["mp_degree"] + sharding_degree = parallel_config["sharding_degree"] + sharding_strategy = SHARDING_STRATEGY_V1 + if "sharding_strategy" in sharding_meta: + 
sharding_strategy = sharding_meta["sharding_strategy"] + assert self.args.tensor_parallel_degree == mp_degree + cur_pp_degree = self.args.pipeline_parallel_degree + + if pp_degree > 1: + assert cur_pp_degree > 1, "can not reshard from pp to non pp" + if pp_degree <= 1: + assert cur_pp_degree <= 1, "can not reshard from non pp to pp" + + cur_sharding_degree = self.args.sharding_parallel_degree + cur_sharding_strategy = reshard_util.get_sharding_strategy(self.optimizer) + + if not self._need_reshard(checkpoint): + one_shard_opt_state_dict = self._load_optimizer_state_of_one_shard( + checkpoint, base_opt_name, self.args.optimizer_name_suffix + ) + + if sharding_strategy == SHARDING_STRATEGY_V2 and cur_sharding_strategy == SHARDING_STRATEGY_V2: + is_matched = reshard_util.sharding_v2.is_matched_optimizer_state_dict( + one_shard_opt_state_dict, self.optimizer, model_wrapped + ) + else: + is_matched = True + + if is_matched: + logger.info("do not need reshard") + return one_shard_opt_state_dict + else: + one_shard_opt_state_dict = None + + logger.info("reshard optimizer state") + + def load_model_slices(): + model_state = reshard_util.NodeModelState() + for j in range(self.args.pipeline_parallel_rank, pp_degree, cur_pp_degree): + cur_sharding_meta = self._load_sharding_meta(checkpoint, j) + assert "structure_name_mapping" in cur_sharding_meta + structure_name_map = cur_sharding_meta["structure_name_mapping"] + for i in range(self.args.sharding_parallel_rank, sharding_degree, cur_sharding_degree): + sharded_name_suffix = self.args.sharded_name_suffix(i, j) + if one_shard_opt_state_dict is None: + tmp = self._load_optimizer_state_of_one_shard(checkpoint, base_opt_name, sharded_name_suffix) + else: + assert ( + self.args.optimizer_name_suffix == sharded_name_suffix + ), f"{self.args.optimizer_name_suffix} vs {sharded_name_suffix}" + tmp = one_shard_opt_state_dict + node_model_state_tmp = reshard_util.NodeModelState() + node_model_state_tmp.add_opts(tmp) + node_model_state_tmp.pack_keys(structure_name_map) + model_state.merge_from(node_model_state_tmp, i) + return model_state + + def reshard_pp(model_state): + # pp reshard + if self._need_reshard_pp(checkpoint): + meta = self._load_model_meta(checkpoint) + reshard_context = pp_reshard.build_pipeline_context(meta, model_wrapped) + model_state = pp_reshard.reshard(model_state, reshard_context, self.hcg) + return model_state + + def reshard_sharding(node_model_state): + # shard reshard + restore_func = ( + reshard_util.sharding_v1.restore + if sharding_strategy == SHARDING_STRATEGY_V1 + else reshard_util.sharding_v2.restore + ) + node_model_state = restore_func(node_model_state, self.model, self.optimizer, self.hcg) + + shard_func = ( + reshard_util.sharding_v1.shard + if cur_sharding_strategy == SHARDING_STRATEGY_V1 + else reshard_util.sharding_v2.shard + ) + node_model_state = shard_func(node_model_state, model_wrapped, self.optimizer, self.hcg) + # drop structural name in the key + node_model_state.unpack_keys() + return node_model_state.get_opt_state_dict() + + node_model_state = load_model_slices() + node_model_state = reshard_pp(node_model_state) + return reshard_sharding(node_model_state) + + def manipulate_state_dict_and_config(self, model_to_save, merge_tensor_parallel=False, state_dict=None): + weight_name_suffix = self.args.sharded_name_suffix() + + if state_dict is None: + state_dict = model_to_save.state_dict() + if self.args.should_save_sharding_stage1_model: + state_dict = filter_sharded_params(state_dict, self.optimizer, 
self.sharding_group) + + config_to_save = None + merge_tensor_parallel = merge_tensor_parallel and self.args.use_hybrid_parallel + if merge_tensor_parallel: + dtype = get_parameter_dtype(model_to_save) + assert hasattr(model_to_save, "config") + model_to_save.config.dtype = str(dtype).split(".")[1] + config_to_save = copy.deepcopy(model_to_save.config) + if config_to_save.tensor_parallel_degree > 1: + state_dict = model_to_save.merge_tensor_parallel(state_dict, config_to_save) + config_to_save.tensor_parallel_degree = 1 + if config_to_save.tensor_parallel_rank != 0: + logger.info("Saving with merge_tensor_parallel, tensor_parallel_rank > 0 don't need save") + return + # if variant is not None and "tp" in variant: + if "tp" in weight_name_suffix: + weight_name_suffix = "_".join([x for x in weight_name_suffix.split("_") if "tp" not in x]) + + if self.args.bf16 and self.args.should_save_sharding_stage1_model: + param_names_in_master_weights = [] + optimzier_state_dict = self.optimizer.state_dict() + assert "master_weights" in optimzier_state_dict + param_names_in_master_weights = list(optimzier_state_dict["master_weights"].keys()) + state_dict = exclude_paramters_in_state_dict( + state_dict, param_names_in_master_weights, self.sharding_group + ) + logger.info( + "param_names_in_master_weights len:{}, bf16 state_dict len:{}, :{}".format( + len(param_names_in_master_weights), len(state_dict), state_dict.keys() + ) + ) + return state_dict, config_to_save, weight_name_suffix + + def save_distributed_model_meta(self, dir): + if not self.args.use_hybrid_parallel: + return + + if not self.args.should_save_sharding_stage1_model: + return + + nranks = dist.get_world_size() + if nranks <= 1: + return + + model_meta = {} + parallel_config = self._get_distributed_strategy() + if parallel_config: + model_meta["parallel_config"] = parallel_config + sharding_metas = self._gather_sharding_metas() + if sharding_metas: + model_meta["sharding_metas"] = sharding_metas + + if dist.get_rank(): + return + + path = os.path.join(dir, MODEL_META_NAME) + with open(path, "w") as f: + json.dump(model_meta, f) + + def _get_distributed_strategy(self): + pp_degree = 1 + mp_degree = 1 + sharding_degree = 1 + vpp_degree = 1 + nranks = dist.get_world_size() + if self.args.use_hybrid_parallel and nranks > 1: + if dist.get_rank(): + return + hcg = fleet.get_hybrid_communicate_group() + mp_degree = hcg.get_model_parallel_world_size() + pp_degree = hcg.get_pipe_parallel_world_size() + sharding_degree = hcg.get_sharding_parallel_world_size() + """ + if pp_degree > 1: + assert isinstance(model, fleet.meta_parallel.PipelineParallel), "must be pipeline model" + vpp_degree = model._layers.get_num_virtual_stages() + """ + parallel_config = { + "pp_degree": pp_degree, + "mp_degree": mp_degree, + "sharding_degree": sharding_degree, + "vpp_degree": vpp_degree, + } + return parallel_config + + def _recover_params_from_master_weights(self, state_dict, opt_state_dict=None): + if opt_state_dict is None: + opt_state_dict = self.optimizer.state_dict() + assert "master_weights" in opt_state_dict, opt_state_dict.keys() + master_weights = opt_state_dict["master_weights"] + tmp = OrderedDict() + (master_weights, tmp) = (tmp, master_weights) + # cast to before + for (k, v) in tmp.items(): + name = v.name + master_weights[k] = paddle.cast(v.cuda(), paddle.bfloat16).cpu() + master_weights[k].name = name + + structure_name_map = {k: v.name for (k, v) in self.model.state_dict().items()} + node_model_state = reshard_util.NodeModelState() + 
node_model_state_tmp = reshard_util.NodeModelState() + node_model_state_tmp.add_master_weights(master_weights) + node_model_state_tmp.pack_keys(structure_name_map) + node_model_state.merge_from(node_model_state_tmp, self.sharding_group.rank) + del node_model_state_tmp + assert reshard_util.is_sharding_opt(self.optimizer) + sharding_strategy = reshard_util.get_sharding_strategy(self.optimizer) + restore_func = ( + reshard_util.sharding_v1.restore + if sharding_strategy == SHARDING_STRATEGY_V1 + else reshard_util.sharding_v2.restore + ) + node_model_state = restore_func(node_model_state, self.model, self.optimizer, self.hcg) + node_model_state.unpack_keys() + master_weights = node_model_state.master_weights + + def filter_func(name): + return True + + master_weights = reshard_util.all_gather_state_dict(master_weights, filter_func, self.sharding_group) + model_state_dict = self.model.state_dict() + logger.info(f"state-dict-keys: {state_dict.keys()}, nums: {len(state_dict.keys())}") + logger.info("before recover, model_state_dict number: {}".format(len(model_state_dict))) + for key, param in model_state_dict.items(): + if param.name in master_weights: + assert param.shape == master_weights[param.name].shape + paddle.assign(paddle.cast(master_weights[param.name].cuda(), paddle.bfloat16), model_state_dict[key]) + elif key in state_dict: + logger.info(f"key: {key} is in state_dict, but not in master_weights") + paddle.assign(state_dict[key], model_state_dict[key]) + else: + logger.info(f"key: {key} is not in state_dict and master_weights") + logger.info("after recover, casted model_state_dict number: {}".format(len(model_state_dict))) + state_dict.update(model_state_dict) + return state_dict + + def _all_gather_simple_object(self, obj, group=None): + if group is None: + group = self.hcg.get_sharding_parallel_group() + res = [] + if group.nranks < 2: + return [obj] + paddle.distributed.all_gather_object(res, obj, group) + return res + + def _load_model_meta(self, dir): + meta_path = os.path.join(dir, MODEL_META_NAME) + assert os.path.exists(meta_path), f"{meta_path} not exist" + with open(meta_path, "r") as handle: + model_dist_meta = json.load(handle) + assert "parallel_config" in model_dist_meta + return model_dist_meta + + def _load_distributed_strategy(self, dir): + model_dist_meta = self._load_model_meta(dir) + parallel_config = model_dist_meta["parallel_config"] + assert "pp_degree" in parallel_config + assert "mp_degree" in parallel_config + assert "sharding_degree" in parallel_config + return parallel_config + + def _load_sharding_meta(self, dir, pp_rank=None): + if pp_rank is None: + pp_rank = self.args.pipeline_parallel_rank + suffix = f"tp{self.args.tensor_parallel_rank:0>2d}_pp{pp_rank:0>2d}" + distributed_model_meta = self._load_model_meta(dir) + if "sharding_metas" in distributed_model_meta: + sharding_metas = distributed_model_meta["sharding_metas"] + assert suffix in sharding_metas + sharding_meta = sharding_metas[suffix] + assert "param2rank" in sharding_meta + return sharding_meta + + # for backward compatibility + meta_path = os.path.join(dir, _add_variant(SHARDING_META_NAME, suffix)) + assert os.path.exists(meta_path), f"{meta_path} not exist" + with open(meta_path, "r") as f: + sharding_meta = json.load(f) + assert "param2rank" in sharding_meta + return sharding_meta + + def _map_optimizer_state_to_param(self, optimizer_state_names): + optimizer = unwrap_optimizer(self.optimizer, DygraphShardingOptimizer) + all_names = list(optimizer._param2rank.keys()) + 
all_names.extend(list(optimizer_state_names)) + all_names.sort() + pre_p_name = "" + opt_to_p = {} + for n in all_names: + if n in optimizer._param2rank: + # we get a param + pre_p_name = n + else: + assert pre_p_name, n + opt_to_p[n] = pre_p_name + return opt_to_p + + def _gather_sharding_metas(self): + nranks = dist.get_world_size() + if not self.args.use_hybrid_parallel or nranks <= 1: + return None + if self.args.sharding_parallel_rank != 0: + return None + if self.args.data_parallel_rank != 0: + return None + if not reshard_util.is_sharding_opt(self.optimizer): + return None + + sharding_strategy = reshard_util.get_sharding_strategy(self.optimizer) + param2rank = {} + pp_overlap = False + if sharding_strategy == SHARDING_STRATEGY_V1: + optimizer = unwrap_optimizer(self.optimizer, DygraphShardingOptimizer) + param2rank = {k: v for (k, v) in optimizer._param2rank.items()} + else: + pp_overlap = unwrap_optimizer(self.optimizer, DygraphShardingOptimizerV2).pp_overlap + + model = self.model + structure_name_mapping = {} + param_meta = {} + for k, v in model.state_dict().items(): + structure_name_mapping[k] = v.name + param_meta[k] = (v.shape, int(v.dtype)) + + sharding_metas = {} + sharding_meta = {} + + sharding_meta["param2rank"] = param2rank + sharding_meta["structure_name_mapping"] = structure_name_mapping + sharding_meta["param_meta"] = param_meta + sharding_meta["sharding_strategy"] = sharding_strategy + sharding_meta["enable_overlap"] = pp_overlap + suffix = f"tp{self.args.tensor_parallel_rank:0>2d}_pp{self.args.pipeline_parallel_rank:0>2d}" + sharding_metas[suffix] = sharding_meta + sharding_metas_list = self._all_gather_simple_object(sharding_metas, self.hcg.get_model_parallel_group()) + sharding_metas = {k: v for e in sharding_metas_list for (k, v) in e.items()} + if self.args.tensor_parallel_rank != 0: + return None + sharding_metas_list = self._all_gather_simple_object(sharding_metas, self.hcg.get_pipe_parallel_group()) + sharding_metas = {k: v for e in sharding_metas_list for (k, v) in e.items()} + return sharding_metas diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/__init__.py new file mode 100644 index 000000000..deddfb976 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/__init__.py @@ -0,0 +1,308 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
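# Sketch of how _gather_sharding_metas above keys per-rank metadata by a
# zero-padded "tp{..}_pp{..}" suffix and then merges the gathered dicts. The
# distributed all_gather_object step is replaced here by a plain list of
# hypothetical per-rank results.
def sharding_suffix(tp_rank, pp_rank):
    # same naming used by _load_sharding_meta / _gather_sharding_metas
    return f"tp{tp_rank:0>2d}_pp{pp_rank:0>2d}"

gathered = [  # pretend this came back from paddle.distributed.all_gather_object
    {sharding_suffix(0, 0): {"param2rank": {}, "enable_overlap": False}},
    {sharding_suffix(1, 0): {"param2rank": {}, "enable_overlap": False}},
]
sharding_metas = {k: v for per_rank in gathered for k, v in per_rank.items()}
print(sorted(sharding_metas))  # ['tp00_pp00', 'tp01_pp00']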
+ + +from .configuration_utils import PretrainedConfig +from .model_utils import PretrainedModel, register_base_model +from .tokenizer_utils import ( + PretrainedTokenizer, + BPETokenizer, + tokenize_chinese_chars, + is_chinese_char, + AddedToken, + normalize_chars, + tokenize_special_chars, + convert_to_unicode, +) +from .tokenizer_utils_fast import PretrainedTokenizerFast +from .processing_utils import ProcessorMixin +from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from .image_processing_utils import ImageProcessingMixin +from .attention_utils import create_bigbird_rand_mask_idx_list + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + AllGatherOp, + ReduceScatterOp, + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + register_sequence_parallel_allreduce_hooks, + ) +except: + pass +from .export import export_model + +# isort: split +from .bert.modeling import * +from .bert.tokenizer import * +from .bert.configuration import * + +# isort: split +from .gpt import * +from .roberta.modeling import * +from .roberta.tokenizer import * +from .roberta.configuration import * +from .electra.modeling import * +from .electra.tokenizer import * +from .electra.configuration import * +from .albert.configuration import * +from .albert.modeling import * +from .albert.tokenizer import * +from .bit.modeling import * +from .bit.configuration import * +from .bit.image_processing import * +from .bart.modeling import * +from .bart.tokenizer import * +from .bart.configuration import * +from .bert_japanese.tokenizer import * +from .bigbird.modeling import * +from .bigbird.configuration import * +from .bigbird.tokenizer import * +from .blenderbot.modeling import * +from .blenderbot.tokenizer import * +from .blenderbot.configuration import * +from .blenderbot_small.modeling import * +from .blenderbot_small.tokenizer import * +from .blenderbot_small.configuration import * +from .blip.modeling import * +from .blip.modeling_text import * +from .blip.configuration import * +from .blip.processing import * +from .blip.image_processing import * +from .chinesebert.configuration import * +from .chinesebert.modeling import * +from .chinesebert.tokenizer import * +from .convbert.configuration import * +from .convbert.modeling import * +from .convbert.tokenizer import * +from .ctrl.modeling import * +from .ctrl.tokenizer import * +from .ctrl.configuration import * +from .dpt.modeling import * +from .dpt.configuration import * +from .dpt.image_processing import * +from .distilbert.configuration import * +from .distilbert.modeling import * +from .distilbert.tokenizer import * +from .ernie.configuration import * +from .ernie.modeling import * +from .ernie.tokenizer import * +from .ernie_ctm.modeling import * +from .ernie_ctm.tokenizer import * +from .ernie_ctm.configuration import * +from .ernie_doc.modeling import * +from .ernie_doc.tokenizer import * +from .ernie_doc.configuration import * +from .ernie_gen.modeling import ErnieForGeneration +from .ernie_gram.modeling import * +from .ernie_gram.tokenizer import * +from .ernie_gram.configuration import * +from .ernie_layout.modeling import * +from .ernie_layout.tokenizer import * +from .ernie_layout.configuration import * +from .ernie_m.configuration import * +from .ernie_m.modeling import * +from .ernie_m.tokenizer import * +from .fnet.modeling import * +from .fnet.tokenizer import * +from .fnet.configuration import * +from .funnel.modeling import * 
+from .funnel.tokenizer import * +from .funnel.configuration import * +from .llama import * +from .layoutlm.configuration import * +from .layoutlm.modeling import * +from .layoutlm.tokenizer import * +from .layoutlmv2.modeling import * +from .layoutlmv2.tokenizer import * +from .layoutlmv2.configuration import * +from .layoutxlm.modeling import * +from .layoutxlm.tokenizer import * +from .layoutxlm.configuration import * +from .luke.modeling import * +from .luke.tokenizer import * +from .luke.configuration import * +from .mbart.modeling import * +from .mbart.tokenizer import * +from .mbart.configuration import * +from .megatronbert.modeling import * +from .megatronbert.tokenizer import * +from .megatronbert.configuration import * +from .prophetnet.modeling import * +from .prophetnet.tokenizer import * +from .prophetnet.configuration import * +from .mobilebert.configuration import * +from .mobilebert.modeling import * +from .mobilebert.tokenizer import * +from .mpnet.configuration import * +from .mpnet.modeling import * +from .mpnet.tokenizer import * +from .mt5.configuration import * +from .mt5.modeling import * +from .nezha.configuration import * +from .nezha.modeling import * +from .nezha.tokenizer import * +from .ppminilm.modeling import * +from .ppminilm.tokenizer import * +from .reformer.modeling import * +from .reformer.tokenizer import * +from .reformer.configuration import * +from .rembert.modeling import * +from .rembert.tokenizer import * +from .rembert.configuration import * +from .roformer.modeling import * +from .roformer.configuration import * +from .roformer.tokenizer import * +from .semantic_search.modeling import * +from .skep.configuration import * +from .skep.modeling import * +from .skep.tokenizer import * +from .squeezebert.modeling import * +from .squeezebert.tokenizer import * +from .squeezebert.configuration import * +from .t5.modeling import * +from .t5.tokenizer import * +from .t5.configuration import * +from .tinybert.configuration import * +from .tinybert.modeling import * +from .tinybert.tokenizer import * +from .transformer.modeling import * +from .unified_transformer.modeling import * +from .unified_transformer.tokenizer import * +from .unified_transformer.configuration import * +from .ernie_code.tokenizer import * +from .ernie_code.modeling import * +from .ernie_code.configuration import * +from .ernie_vil.configuration import * +from .ernie_vil.modeling import * +from .ernie_vil.feature_extraction import * +from .ernie_vil.tokenizer import * +from .ernie_vil.processing import * +from .ernie_vil.image_processing import * +from .unimo.modeling import * +from .unimo.tokenizer import * +from .unimo.configuration import * +from .xlnet.modeling import * +from .xlnet.tokenizer import * +from .xlnet.configuration import * +from .xlm.modeling import * +from .xlm.tokenizer import * +from .xlm.configuration import * +from .gau_alpha.modeling import * +from .gau_alpha.tokenizer import * +from .gau_alpha.configuration import * +from .gemma import * +from .roformerv2.modeling import * +from .roformerv2.tokenizer import * +from .roformerv2.configuration import * +from .optimization import * +from .opt.configuration import * +from .opt.modeling import * +from .auto.modeling import * +from .auto.tokenizer import * +from .auto.processing import * +from .auto.image_processing import * +from .auto.configuration import * +from .codegen.modeling import * +from .codegen.tokenizer import * +from .codegen.configuration import * +from .artist.modeling import * +from 
.artist.tokenizer import * +from .artist.configuration import * +from .dallebart.modeling import * +from .dallebart.tokenizer import * +from .dallebart.configuration import * +from .clip.modeling import * +from .clip.configuration import * +from .clip.feature_extraction import * +from .clip.tokenizer import * +from .clip.processing import * +from .clip.image_processing import * +from .chineseclip.modeling import * +from .chineseclip.configuration import * +from .chineseclip.feature_extraction import * +from .chineseclip.processing import * +from .chineseclip.image_processing import * +from .chineseclip.tokenizer import * +from .gptj.modeling import * +from .gptj.tokenizer import * +from .gptj.configuration import * +from .pegasus.modeling import * +from .pegasus.tokenizer import * +from .pegasus.configuration import * +from .glm.configuration import * +from .glm.modeling import * +from .glm.tokenizer import * +from .nystromformer.configuration import * +from .nystromformer.modeling import * +from .nystromformer.tokenizer import * +from .bloom.configuration import * +from .bloom.modeling import * +from .bloom.tokenizer import * +from .clipseg.configuration import * +from .clipseg.modeling import * +from .clipseg.processing import * +from .clipseg.image_processing import * +from .blip_2.modeling import * +from .blip_2.configuration import * +from .blip_2.processing import * +from .chatglm.configuration import * +from .chatglm.modeling import * +from .chatglm.tokenizer import * +from .chatglm_v2.configuration import * +from .chatglm_v2.modeling import * +from .chatglm_v2.tokenizer import * +from .speecht5.configuration import * +from .speecht5.modeling import * +from .speecht5.tokenizer import * +from .speecht5.processing import * +from .speecht5.feature_extraction import * +from .minigpt4.modeling import * +from .minigpt4.configuration import * +from .minigpt4.processing import * +from .minigpt4.image_processing import * +from .clap.configuration import * +from .clap.feature_extraction import * +from .clap.modeling import * +from .clap.processing import * +from .visualglm.modeling import * +from .visualglm.configuration import * +from .visualglm.processing import * +from .visualglm.image_processing import * +from .rw.modeling import * +from .rw.configuration import * +from .rw.tokenizer import * +from .mistral.modeling import * +from .mistral.configuration import * +from .qwen import * +from .mixtral.modeling import * +from .mixtral.configuration import * +from .deberta.modeling import * +from .deberta.tokenizer import * +from .deberta.configuration import * +from .deberta_v2.modeling import * +from .deberta_v2.tokenizer import * +from .deberta_v2.configuration import * +from .qwen2 import * +from .qwen2_moe import * +from .yuan import * +from .mamba.configuration import * +from .mamba.modeling import * +from .mamba.tokenizer import * +from .jamba.modeling import * +from .jamba.configuration import * +from .jamba.tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/activations.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/activations.py new file mode 100644 index 000000000..ab9be1167 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/activations.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import paddle +import paddle.nn.functional as F +from paddle import Tensor, nn + + +class NewGELUActivation(nn.Layer): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Layer): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Layer): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * F.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Layer): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). 
See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return paddle.clip(gelu(x), self.min, self.max) + + +class SiLUActivation(nn.Layer): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def forward(self, input: Tensor) -> Tensor: + return F.silu(input) + + +class MishActivation(nn.Layer): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def forward(self, input: Tensor) -> Tensor: + return F.mish(input) + + +class LinearActivation(nn.Layer): + """ + Applies the linear activation function, i.e. forwarding input directly to output. + """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "gelu": GELUActivation, + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), + "gelu_fast": FastGELUActivation, + "gelu_new": NewGELUActivation, + "gelu_python": (GELUActivation, {"use_gelu_python": True}), + "linear": LinearActivation, + "mish": MishActivation, + "quick_gelu": QuickGELUActivation, + "relu": nn.ReLU, + "relu6": nn.ReLU6, + "sigmoid": nn.Sigmoid, + "silu": SiLUActivation, + "swish": SiLUActivation, + "tanh": nn.Tanh, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/aistudio_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/aistudio_utils.py new file mode 100644 index 000000000..6c1756ed3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/aistudio_utils.py @@ -0,0 +1,67 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
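# Usage sketch for the activation registry defined above: ClassInstantier
# builds a configured layer on each ACT2FN lookup, so string keys such as
# "gelu_10" carry constructor arguments. The import path assumes the module is
# importable as in upstream paddlenlp; in this tree it lives under
# paddlenlp_3.0.0/transformers/activations.py.
import paddle
from paddlenlp.transformers.activations import ACT2FN, get_activation

x = paddle.to_tensor([-20.0, 0.0, 20.0])
act = ACT2FN["gelu_10"]            # ClippedGELUActivation(min=-10, max=10)
print(act(x))                      # gelu output clipped to the [-10, 10] range
print(get_activation("silu")(x))   # the same registry through the helper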
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from aistudio_sdk.hub import download + + +class UnauthorizedError(Exception): + pass + + +class EntryNotFoundError(Exception): + pass + + +def _add_subfolder(weights_name: str, subfolder: Optional[str] = None) -> str: + if subfolder is not None and subfolder != "": + weights_name = "/".join([subfolder, weights_name]) + return weights_name + + +def aistudio_download( + repo_id: str, + filename: str = None, + cache_dir: Optional[str] = None, + subfolder: Optional[str] = "", + revision: Optional[str] = None, + **kwargs, +): + if revision is None: + revision = "master" + filename = _add_subfolder(filename, subfolder) + download_kwargs = {} + if revision is not None: + download_kwargs["revision"] = revision + if cache_dir is not None: + download_kwargs["cache_dir"] = cache_dir + res = download( + repo_id=repo_id, + filename=filename, + **download_kwargs, + ) + if "path" in res: + return res["path"] + else: + if res["error_code"] == 10001: + raise ValueError("Illegal argument error") + elif res["error_code"] == 10002: + raise UnauthorizedError( + "Unauthorized Access. Please ensure that you have provided the AIStudio Access Token and you have access to the requested asset" + ) + elif res["error_code"] == 12001: + raise EntryNotFoundError(f"Cannot find the requested file '{filename}' in repo '{repo_id}'") + else: + raise Exception(f"Unknown error: {res}") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/configuration.py new file mode 100644 index 000000000..ce125f13c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/configuration.py @@ -0,0 +1,448 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
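# Hedged usage sketch for aistudio_download above, showing how the two error
# classes surface. repo_id/filename are placeholders; a valid AIStudio access
# token and the aistudio_sdk package are required, and the import path assumes
# the module is importable as in upstream paddlenlp.
from paddlenlp.transformers.aistudio_utils import (
    EntryNotFoundError,
    UnauthorizedError,
    aistudio_download,
)

try:
    local_path = aistudio_download(repo_id="someorg/somemodel", filename="model_config.json")
    print("downloaded to", local_path)
except UnauthorizedError:
    print("set an AIStudio access token before downloading")
except EntryNotFoundError:
    print("requested file does not exist in the repo")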
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Albert model configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["ALBERT_PRETRAINED_INIT_CONFIGURATION", "AlbertConfig", "ALBERT_PRETRAINED_RESOURCE_FILES_MAP"] + +ALBERT_PRETRAINED_INIT_CONFIGURATION = { + "albert-base-v1": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_groups": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-large-v1": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 4096, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-xlarge-v1": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 2048, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 8192, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-xxlarge-v1": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu", + "hidden_dropout_prob": 0, + "hidden_size": 4096, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 16384, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 64, + "num_hidden_groups": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-base-v2": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0, + "hidden_size": 768, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_groups": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-large-v2": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0, + "hidden_size": 1024, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 4096, + "layer_norm_eps": 1e-12, + 
"max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-xlarge-v2": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0, + "hidden_size": 2048, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 8192, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-xxlarge-v2": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0, + "hidden_size": 4096, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 16384, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 64, + "num_hidden_groups": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30000, + }, + "albert-chinese-tiny": { + "attention_probs_dropout_prob": 0.0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 312, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 1248, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_groups": 1, + "num_hidden_layers": 4, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "albert-chinese-small": { + "attention_probs_dropout_prob": 0.0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 384, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 1536, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_groups": 1, + "num_hidden_layers": 6, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "albert-chinese-base": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "relu", + "hidden_dropout_prob": 0, + "hidden_size": 768, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_groups": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "albert-chinese-large": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "relu", + "hidden_dropout_prob": 0, + "hidden_size": 1024, + "initializer_range": 0.02, + "inner_group_num": 1, + "intermediate_size": 4096, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "albert-chinese-xlarge": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "relu", + "hidden_dropout_prob": 0, + "hidden_size": 2048, + "initializer_range": 0.014, + "inner_group_num": 1, + "intermediate_size": 8192, + "layer_norm_eps": 1e-12, + 
"max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "albert-chinese-xxlarge": { + "attention_probs_dropout_prob": 0, + "bos_token_id": 2, + "embedding_size": 128, + "eos_token_id": 3, + "hidden_act": "relu", + "hidden_dropout_prob": 0, + "hidden_size": 4096, + "initializer_range": 0.01, + "inner_group_num": 1, + "intermediate_size": 16384, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_groups": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, +} + +ALBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.pdparams", + "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.pdparams", + "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.pdparams", + "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.pdparams", + "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.pdparams", + "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.pdparams", + "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.pdparams", + "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.pdparams", + "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.pdparams", + "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.pdparams", + "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.pdparams", + "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.pdparams", + "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.pdparams", + "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.pdparams", + } +} + + +class AlbertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AlbertModel`]. It is used to instantiate + an ALBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ALBERT + [albert-xxlarge-v2](https://huggingface.co/albert-xxlarge-v2) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in `AlbertModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `AlbertModel`. + Defaults to `30000`. + embedding_size (int, optional): + Dimensionality of the embedding layer. Defaults to `128`. + hidden_size (int, optional): + Dimensionality of the encoder layer and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. 
Defaults to `12`.
+        num_hidden_groups (int, optional):
+            Number of hidden groups in the Transformer encoder. Defaults to `1`.
+        num_attention_heads (int, optional):
+            Number of attention heads for each attention layer in the Transformer encoder.
+            Defaults to `12`.
+        intermediate_size (int, optional):
+            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors
+            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,
+            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.
+            Defaults to `3072`.
+        inner_group_num (int, optional):
+            Number of inner groups in a hidden group. Defaults to `1`.
+        hidden_act (str, optional):
+            The non-linear activation function in the feed-forward layer.
+            ``"gelu"``, ``"relu"`` and any other paddle supported activation functions
+            are supported. Defaults to `"gelu"`.
+        hidden_dropout_prob (float, optional):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+            Defaults to `0`.
+        attention_probs_dropout_prob (float, optional):
+            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.
+            Defaults to `0`.
+        classifier_dropout_prob (float, optional):
+            The dropout ratio for attached classifiers. Defaults to `0.1`.
+        max_position_embeddings (int, optional):
+            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input
+            sequence. Defaults to `512`.
+        type_vocab_size (int, optional):
+            The vocabulary size of `token_type_ids`. Defaults to `2`.
+        initializer_range (float, optional):
+            The standard deviation of the normal initializer. Defaults to `0.02`.
+
+            .. note::
+                A normal_initializer initializes weight matrices as normal distributions.
+                See :meth:`AlbertPretrainedModel._init_weights()` for how weights are initialized in `AlbertModel`.
+
+        layer_norm_eps (float, optional):
+            The `epsilon` used in :class:`paddle.nn.LayerNorm` layers, a small value added to the
+            variance to prevent division by zero. Defaults to `1e-12`.
+        pad_token_id (int, optional):
+            The index of padding token in the token vocabulary. Defaults to `0`.
+        add_pooling_layer (bool, optional):
+            Whether or not to add the pooling layer. Defaults to `True`.
+ Example: + ```python + >>> from paddlenlp.transformers import AlbertConfig, AlbertModel + >>> # Initializing an ALBERT style configuration + >>> configuration = AlbertConfig() + >>> # Initializing a model (with random weights) from the ALBERT-base style configuration + >>> model = AlbertModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = ALBERT_PRETRAINED_INIT_CONFIGURATION + model_type = "albert" + + def __init__( + self, + vocab_size=30000, + embedding_size=128, + hidden_size=768, + num_hidden_layers=12, + num_hidden_groups=1, + num_attention_heads=12, + intermediate_size=3072, + inner_group_num=1, + hidden_act="gelu", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + add_pooling_layer=True, + classifier_dropout_prob=0.1, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_hidden_groups = num_hidden_groups + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.inner_group_num = inner_group_num + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.classifier_dropout_prob = classifier_dropout_prob + self.add_pooling_layer = add_pooling_layer diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/modeling.py new file mode 100644 index 000000000..465bb0738 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/modeling.py @@ -0,0 +1,1554 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Modeling classes for ALBERT model.""" + +import math +from typing import List, Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Layer + +from ...layers import Linear as TransposedLinear +from ...utils.converter import StateDictNameMapping, init_name_mappings +from ...utils.env import CONFIG_NAME +from .. 
import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MaskedLMOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + tuple_output, +) +from .configuration import ( + ALBERT_PRETRAINED_INIT_CONFIGURATION, + ALBERT_PRETRAINED_RESOURCE_FILES_MAP, + AlbertConfig, +) + +__all__ = [ + "AlbertPretrainedModel", + "AlbertModel", + "AlbertForPretraining", + "AlbertForMaskedLM", + "AlbertForSequenceClassification", + "AlbertForTokenClassification", + "AlbertForQuestionAnswering", + "AlbertForMultipleChoice", +] + +dtype_float = paddle.get_default_dtype() + + +class AlbertForPreTrainingOutput(ModelOutput): + """ + Output type of [`AlbertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + prediction_logits: paddle.Tensor = None + sop_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class AlbertEmbeddings(Layer): + """ + Constructs the embeddings from word, position and token_type embeddings. 
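+
+    Note that the embedding vectors have width ``config.embedding_size`` rather than
+    ``config.hidden_size``; they are projected to ``config.hidden_size`` by
+    ``AlbertTransformer.embedding_hidden_mapping_in`` (ALBERT's factorized embedding parameterization).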
+ """ + + def __init__(self, config: AlbertConfig): + super(AlbertEmbeddings, self).__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + self.layer_norm = nn.LayerNorm(config.embedding_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # Position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = inputs_embeds + token_type_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class AlbertAttention(Layer): + def __init__(self, config: AlbertConfig): + super(AlbertAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(self.hidden_size, self.all_head_size) + self.key = nn.Linear(self.hidden_size, self.all_head_size) + self.value = nn.Linear(self.hidden_size, self.all_head_size) + + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
+ attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose([0, 2, 1, 3]) + context_layer = context_layer.reshape([0, 0, -1]) + + # dense layer shape to be checked + projected_context_layer = self.dense(context_layer) + + projected_context_layer_dropout = self.output_dropout(projected_context_layer) + layer_normed_context_layer = self.layer_norm(hidden_states + projected_context_layer_dropout) + return (layer_normed_context_layer, attention_probs) if output_attentions else (layer_normed_context_layer,) + + +class AlbertLayer(Layer): + def __init__(self, config: AlbertConfig): + super(AlbertLayer, self).__init__() + self.seq_len_dim = 1 + self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.attention = AlbertAttention(config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + ffn_output = self.ffn(attention_output[0]) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + + hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0]) + + return (hidden_states,) + attention_output[1:] # add attentions if we output them + + +class AlbertLayerGroup(Layer): + def __init__(self, config: AlbertConfig): + super(AlbertLayerGroup, self).__init__() + + self.albert_layers = nn.LayerList([AlbertLayer(config) for _ in range(config.inner_group_num)]) + + def forward( + self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False + ): + layer_attentions = () if output_attentions else None + all_hidden_states = (hidden_states,) if output_hidden_states else None + + for layer_index, albert_layer in enumerate(self.albert_layers): + layer_output = albert_layer( + hidden_states, + attention_mask, + head_mask[layer_index], + output_attentions=output_attentions, + ) + hidden_states = layer_output[0] + + if output_attentions: + layer_attentions = layer_attentions + (layer_output[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + + if output_hidden_states: + outputs = outputs + (all_hidden_states,) + + if output_attentions: + outputs = outputs + (layer_attentions,) + + return outputs + + +class AlbertTransformer(Layer): + def __init__(self, config: 
AlbertConfig): + super(AlbertTransformer, self).__init__() + + self.num_hidden_layers = config.num_hidden_layers + self.num_hidden_groups = config.num_hidden_groups + + self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) + self.albert_layer_groups = nn.LayerList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): + hidden_states = self.embedding_hidden_mapping_in(hidden_states) + + all_hidden_states = (hidden_states,) if output_hidden_states else None + all_attentions = () if output_attentions else None + + for i in range(self.num_hidden_layers): + # Number of layers in a hidden group + layers_per_group = int(self.num_hidden_layers / self.num_hidden_groups) + # Index of the hidden group + group_idx = int(i / (self.num_hidden_layers / self.num_hidden_groups)) + + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + hidden_states = layer_group_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_group_output[-1] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class AlbertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ALBERT models. It provides ALBERT related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. 
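+
+    It also defines ``_get_name_mappings``, the state-dict name mappings (``StateDictNameMapping``)
+    consumed by the checkpoint converter, and ``_init_weights``, which initializes Linear and
+    Embedding weights from a normal distribution with standard deviation ``config.initializer_range``
+    and resets LayerNorm parameters.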
+ """ + + model_config_file = CONFIG_NAME + config_class = AlbertConfig + + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "transformer" + + pretrained_init_configuration = ALBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ALBERT_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_name_mappings(cls, config: AlbertConfig) -> List[StateDictNameMapping]: + model_mappings = [ + "embeddings.word_embeddings.weight", + "embeddings.position_embeddings.weight", + "embeddings.token_type_embeddings.weight", + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ["encoder.embedding_hidden_mapping_in.weight", None, "transpose"], + "encoder.embedding_hidden_mapping_in.bias", + ] + + if config.add_pooling_layer: + model_mappings.extend( + [ + ["pooler.weight", None, "transpose"], + ["pooler.bias"], + ] + ) + + for group_index in range(config.num_hidden_groups): + group_mappings = [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.full_layer_layer_norm.weight", + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.full_layer_layer_norm.bias", + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.query.weight", + None, + "transpose", + ], + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.query.bias", + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.key.weight", + None, + "transpose", + ], + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.key.bias", + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.value.weight", + None, + "transpose", + ], + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.value.bias", + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.dense.weight", + None, + "transpose", + ], + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.dense.bias", + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.LayerNorm.weight", + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.layer_norm.weight", + ], + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.LayerNorm.bias", + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.attention.layer_norm.bias", + ], + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.ffn.weight", + None, + "transpose", + ], + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.ffn.bias", + [ + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.ffn_output.weight", + None, + "transpose", + ], + f"encoder.albert_layer_groups.{group_index}.albert_layers.0.ffn_output.bias", + ] + model_mappings.extend(group_mappings) + + init_name_mappings(model_mappings) + # base-model prefix "AlbertModel" + if "AlbertModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "albert." + mapping[0] + mapping[1] = "transformer." 
+ mapping[1] + + # downstream mappings + if "AlbertForQuestionAnswering" in config.architectures: + model_mappings.extend( + [["qa_outputs.weight", "qa_outputs.weight", "transpose"], ["qa_outputs.bias", "qa_outputs.bias"]] + ) + if ( + "AlbertForMultipleChoice" in config.architectures + or "AlbertForSequenceClassification" in config.architectures + or "AlbertForTokenClassification" in config.architectures + ): + model_mappings.extend( + [["classifier.weight", "classifier.weight", "transpose"], ["classifier.bias", "classifier.bias"]] + ) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def _init_weights(self, layer): + # Initialize the weights. + if isinstance(layer, nn.Linear): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, nn.Embedding): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer._padding_idx is not None: + layer.weight[layer._padding_idx].set_value(paddle.zeros_like(layer.weight[layer._padding_idx])) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.ones_like(layer.weight)) + + +@register_base_model +class AlbertModel(AlbertPretrainedModel): + """ + The bare Albert Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`AlbertConfig`): + An instance of AlbertConfig used to construct AlbertModel. 
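+
+    .. note::
+        When ``config.add_pooling_layer`` is ``False``, no pooler is created and the returned
+        ``pooled_output`` is ``None``; heads that rely on the pooled output
+        (e.g. :class:`AlbertForSequenceClassification`) expect it to be present.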
+ """ + + def __init__(self, config: AlbertConfig): + super(AlbertModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.bos_token_id = config.bos_token_id + self.eos_token_id = config.eos_token_id + self.initializer_range = config.initializer_range + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = AlbertEmbeddings(config) + self.encoder = AlbertTransformer(config) + self.config = config + + if config.add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.pooler_activation = nn.Tanh() + else: + self.pooler = None + self.pooler_activation = None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = paddle.cast(head_mask, dtype=dtype_float) + return head_mask + + def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False): + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): + r""" + The AlbertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. 
+ position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + head_mask (Tensor, optional): + Mask to nullify selected heads of the self-attention modules. Masks values can either be 0 or 1: + + - 1 indicates the head is **not masked**, + - 0 indicated the head is **masked**. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple or Dict: Returns tuple (`sequence_output`, `pooled_output`) or a dict with + `last_hidden_state`, `pooled_output`, `all_hidden_states`, `all_attentions` fields. + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and has a shape of [`batch_size, sequence_length, hidden_size`]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and + has a shape of [batch_size, hidden_size]. + + - `last_hidden_state` (Tensor): + The output of the last encoder layer, it is also the `sequence_output`. + It's data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + - `all_hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `all_hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `all_attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `all_attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import AlbertModel, AlbertTokenizer + + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1') + model = AlbertModel.from_pretrained('albert-base-v1') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = paddle.ones(shape=input_shape) + if token_type_ids is None: + token_type_ids = paddle.zeros(shape=input_shape, dtype="int64") + + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = paddle.cast(extended_attention_mask, dtype=dtype_float) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + head_mask = self.get_head_mask(head_mask, self.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class AlbertForPretraining(AlbertPretrainedModel): + """ + Albert Model with a `masked language modeling` head and a `sentence order prediction` head + on top. + + Args: + config (:class:`AlbertConfig`): + An instance of AlbertConfig used to construct AlbertModel. + + """ + + def __init__(self, config: AlbertConfig): + super(AlbertForPretraining, self).__init__(config) + + self.transformer = AlbertModel(config) + self.predictions = AlbertMLMHead(config) + self.sop_classifier = AlbertSOPHead(config) + self.config = config + self.vocab_size = config.vocab_size + + def get_output_embeddings(self): + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.transformer.embeddings.word_embeddings + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + sentence_order_label=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The AlbertForPretraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`AlbertModel`. + attention_mask (list, optional): + See :class:`AlbertModel`. + token_type_ids (Tensor, optional): + See :class:`AlbertModel`. + position_ids(Tensor, optional): + See :class:`AlbertModel`. + head_mask(Tensor, optional): + See :class:`AlbertModel`. 
+ inputs_embeds(Tensor, optional): + See :class:`AlbertModel`. + sentence_order_label(Tensor, optional): + Labels of the next sequence prediction. Input should be a sequence pair + Indices should be 0 or 1. ``0`` indicates original order (sequence A, then sequence B), + and ``1`` indicates switched order (sequence B, then sequence A). Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple or Dict: Returns tuple (`prediction_scores`, `sop_scores`) or a dict with + `prediction_logits`, `sop_logits`, `pooled_output`, `hidden_states`, `attentions` fields. + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + + - `sop_scores` (Tensor): + The scores of sentence order prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + - `prediction_logits` (Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + + - `sop_logits` (Tensor): + The scores of sentence order prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. 
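+
+        Example:
+            A minimal usage sketch, mirroring the other examples in this module; without
+            `labels` and `sentence_order_label`, no loss is computed:
+
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import AlbertForPretraining, AlbertTokenizer
+
+                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
+                model = AlbertForPretraining.from_pretrained('albert-base-v1')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+                prediction_scores, sop_scores = model(**inputs)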
+ + """ + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output, pooled_output = outputs[:2] + + prediction_scores = self.predictions(sequence_output) + sop_scores = self.sop_classifier(pooled_output) + + total_loss = None + if labels is not None and sentence_order_label is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) + sentence_order_loss = loss_fct(sop_scores.reshape([-1, 2]), sentence_order_label.reshape([-1])) + total_loss = masked_lm_loss + sentence_order_loss + + if not return_dict: + output = (prediction_scores, sop_scores) + outputs[2:] + return tuple_output(output, total_loss) + + return AlbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AlbertMLMHead(Layer): + def __init__(self, config: AlbertConfig): + super(AlbertMLMHead, self).__init__() + + self.layer_norm = nn.LayerNorm(config.embedding_size) + self.bias = self.create_parameter( + [config.vocab_size], is_bias=True, default_initializer=nn.initializer.Constant(value=0) + ) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.decoder = TransposedLinear(config.embedding_size, config.vocab_size) + + self.activation = ACT2FN[config.hidden_act] + + # link bias + self.bias = self.decoder.bias + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.decoder(hidden_states) + prediction_scores = hidden_states + return prediction_scores + + +class AlbertSOPHead(Layer): + def __init__(self, config: AlbertConfig): + super(AlbertSOPHead, self).__init__() + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, pooled_output): + dropout_pooled_output = self.dropout(pooled_output) + logits = self.classifier(dropout_pooled_output) + return logits + + +class AlbertForMaskedLM(AlbertPretrainedModel): + """ + Albert Model with a `masked language modeling` head on top. + + Args: + config (:class:`AlbertConfig`): + An instance of AlbertConfig used to construct AlbertModel. + + """ + + def __init__(self, config: AlbertConfig): + super(AlbertForMaskedLM, self).__init__(config) + + self.transformer = AlbertModel(config) + self.predictions = AlbertMLMHead(config) + self.config = config + self.tie_weights() + + def get_output_embeddings(self): + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.transformer.embeddings.word_embeddings + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): + r""" + The AlbertForPretraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`AlbertModel`. 
+ attention_mask (list, optional): + See :class:`AlbertModel`. + token_type_ids (Tensor, optional): + See :class:`AlbertModel`. + position_ids(Tensor, optional): + See :class:`AlbertModel`. + head_mask(Tensor, optional): + See :class:`AlbertModel`. + inputs_embeds(Tensor, optional): + See :class:`AlbertModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor or Dict: Returns tensor `prediction_scores` or a dict with `logits`, + `hidden_states`, `attentions` fields. + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + + - `logits` (Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + + """ + + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(transformer_outputs, type(input_ids)): + transformer_outputs = [transformer_outputs] + + hidden_states = transformer_outputs[0] + logits = self.predictions(hidden_states) + + masked_lm_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(logits.reshape((-1, logits.shape[-1])), labels.reshape((-1,))) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return tuple_output(output, masked_lm_loss) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class AlbertForSequenceClassification(AlbertPretrainedModel): + """ + Albert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`AlbertConfig`): + An instance of AlbertConfig used to construct AlbertModel. 
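+
+    The classification head applies dropout with ``config.classifier_dropout_prob`` to the pooled
+    ``[CLS]`` output followed by a linear layer with ``config.num_labels`` outputs; the loss
+    (regression, single-label or multi-label classification) is taken from ``config.problem_type``
+    or inferred from ``num_labels`` and the label dtype.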
+ + """ + + def __init__(self, config: AlbertConfig): + super(AlbertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.transformer = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): + r""" + The AlbertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`AlbertModel`. + attention_mask (list, optional): + See :class:`AlbertModel`. + token_type_ids (Tensor, optional): + See :class:`AlbertModel`. + position_ids(Tensor, optional): + See :class:`AlbertModel`. + head_mask(Tensor, optional): + See :class:`AlbertModel`. + inputs_embeds(Tensor, optional): + See :class:`AlbertModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor or Dict: Returns tensor `logits`, or a dict with `logits`, `hidden_states`, `attentions` fields. + + With the fields: + + - `logits` (Tensor): + A tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import AlbertForSequenceClassification, AlbertTokenizer + + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1') + model = AlbertForSequenceClassification.from_pretrained('albert-base-v1') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = transformer_outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class AlbertForTokenClassification(AlbertPretrainedModel): + """ + Albert Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`AlbertConfig`): + An instance of AlbertConfig used to construct AlbertModel. + """ + + def __init__(self, config: AlbertConfig): + super(AlbertForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.config = config + self.transformer = AlbertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): + r""" + The AlbertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`AlbertModel`. + attention_mask (list, optional): + See :class:`AlbertModel`. + token_type_ids (Tensor, optional): + See :class:`AlbertModel`. + position_ids(Tensor, optional): + See :class:`AlbertModel`. + head_mask(Tensor, optional): + See :class:`AlbertModel`. + inputs_embeds(Tensor, optional): + See :class:`AlbertModel`. 
+ labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor or Dict: Returns tensor `logits`, or a dict with `logits`, `hidden_states`, `attentions` fields. + + With the fields: + + - `logits` (Tensor): + A tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import AlbertForTokenClassification, AlbertTokenizer + + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1') + model = AlbertForTokenClassification.from_pretrained('albert-base-v1') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = transformer_outputs[0] + + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return tuple_output(output, loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class AlbertForQuestionAnswering(AlbertPretrainedModel): + """ + Albert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`AlbertConfig`): + An instance of AlbertConfig used to construct AlbertModel. 
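+
+    The span head ``qa_outputs`` is a single linear layer with two outputs per token; its output
+    is split along the last axis into ``start_logits`` and ``end_logits``.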
+ + """ + + def __init__(self, config: AlbertConfig): + super(AlbertForQuestionAnswering, self).__init__(config) + self.config = config + self.transformer = AlbertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_hidden_states=False, + output_attentions=False, + return_dict=False, + ): + r""" + The AlbertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`AlbertModel`. + attention_mask (list, optional): + See :class:`AlbertModel`. + token_type_ids (Tensor, optional): + See :class:`AlbertModel`. + position_ids(Tensor, optional): + See :class:`AlbertModel`. + head_mask(Tensor, optional): + See :class:`AlbertModel`. + inputs_embeds(Tensor, optional): + See :class:`AlbertModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple or Dict: Returns tuple (`start_logits, end_logits`)or a dict + with `start_logits`, `end_logits`, `hidden_states`, `attentions` fields. + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + + + Example: + .. 
code-block::
+
+                import paddle
+                from paddlenlp.transformers import AlbertForQuestionAnswering, AlbertTokenizer
+
+                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
+                model = AlbertForQuestionAnswering.from_pretrained('albert-base-v1')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                outputs = model(**inputs)
+
+                logits = outputs[0]
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = transformer_outputs[0]
+        logits = self.qa_outputs(sequence_output)
+
+        start_logits, end_logits = paddle.split(logits, num_or_sections=2, axis=-1)
+        start_logits = start_logits.squeeze(axis=-1)
+        end_logits = end_logits.squeeze(axis=-1)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, splitting adds a dimension
+            if start_positions.ndim > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.ndim > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.shape[1]
+            start_positions = start_positions.clip(0, ignored_index)
+            end_positions = end_positions.clip(0, ignored_index)
+
+            loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+        if not return_dict:
+            output = (start_logits, end_logits) + transformer_outputs[2:]
+            return tuple_output(output, total_loss)
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+class AlbertForMultipleChoice(AlbertPretrainedModel):
+    """
+    Albert Model with a linear layer on top of the hidden-states output layer,
+    designed for multiple choice tasks like SWAG tasks.
+
+    Args:
+        config (:class:`AlbertConfig`):
+            An instance of AlbertConfig used to construct AlbertModel.
+
+    """
+
+    def __init__(self, config: AlbertConfig):
+        super(AlbertForMultipleChoice, self).__init__(config)
+        self.transformer = AlbertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        self.config = config
+
+    def forward(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_hidden_states=False,
+        output_attentions=False,
+        return_dict=False,
+    ):
+        r"""
+        The AlbertForMultipleChoice forward method, overrides the __call__() special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`AlbertModel`.
+            attention_mask (list, optional):
+                See :class:`AlbertModel`.
+            token_type_ids (Tensor, optional):
+                See :class:`AlbertModel`.
+            position_ids(Tensor, optional):
+                See :class:`AlbertModel`.
+            head_mask(Tensor, optional):
+                See :class:`AlbertModel`.
+            inputs_embeds(Tensor, optional):
+                See :class:`AlbertModel`.
+            labels (Tensor of shape `(batch_size, )`, optional):
+                Labels for computing the multiple choice classification loss.
Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor or Dict: Returns tensor `reshaped_logits` or a dict + with `reshaped_logits`, `hidden_states`, `attentions` fields. + + With the fields: + + - `reshaped_logits` (Tensor): + A tensor of the input multiple choice classification logits. + Shape as `[batch_size, num_labels]` and dtype as `float32`. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + - `attentions` (Tensor): + Attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is + [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + """ + + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.reshape([-1, input_ids.shape[-1]]) if input_ids is not None else None + attention_mask = attention_mask.reshape([-1, attention_mask.shape[-1]]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape([-1, token_type_ids.shape[-1]]) if token_type_ids is not None else None + position_ids = position_ids.reshape([-1, position_ids.shape[-1]]) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape([-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]]) + if inputs_embeds is not None + else None + ) + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = transformer_outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape([-1, num_choices]) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[2:] + return tuple_output(output, loss) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/tokenizer.py new file mode 100644 index 000000000..a7d80d0b2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/albert/tokenizer.py @@ -0,0 +1,801 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
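As a reading aid for the multiple-choice head above (not part of the original patch), here is a sketch of the expected `[batch_size, num_choices, sequence_length]` input layout that the forward method flattens internally; the prompt/choice strings are invented and padding with id 0 is an assumption for `albert-base-v1`:

.. code-block::

    import paddle
    from paddlenlp.transformers import AlbertForMultipleChoice, AlbertTokenizer

    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
    model = AlbertForMultipleChoice.from_pretrained('albert-base-v1')

    prompt = "The cat sat on the"
    choices = ["mat.", "moon."]

    # encode every (prompt, choice) pair, pad to a common length,
    # then stack into [batch_size=1, num_choices=2, sequence_length]
    encoded = [tokenizer(prompt, c)["input_ids"] for c in choices]
    max_len = max(len(ids) for ids in encoded)
    input_ids = paddle.to_tensor([[ids + [0] * (max_len - len(ids)) for ids in encoded]])

    outputs = model(input_ids=input_ids, return_dict=True)
    print(outputs.logits.shape)  # [1, 2]: one score per choice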
+# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for ALBERT model.""" + +import os +import unicodedata +from shutil import copyfile + +import sentencepiece as spm + +from .. import PretrainedTokenizer, BertTokenizer, AddedToken + +__all__ = ["AlbertTokenizer"] + +SPIECE_UNDERLINE = "▁" + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "albert-base-v1": 512, + "albert-large-v1": 512, + "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, + "albert-chinese-tiny": 512, + "albert-chinese-small": 512, + "albert-chinese-base": 512, + "albert-chinese-large": 512, + "albert-chinese-xlarge": 512, + "albert-chinese-xxlarge": 512, +} + + +class AlbertTokenizer(PretrainedTokenizer): + """ + Constructs an Albert tokenizer based on SentencePiece or `BertTokenizer`. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + sentence_model_file (str): + The vocabulary file (ends with '.spm') required to instantiate + a `SentencePiece `__ tokenizer. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. Defaults to `True`. + remove_space (bool): + Whether or note to remove space when tokenizing. Defaults to `True`. + keep_accents (bool): + Whether or note to keep accents when tokenizing. Defaults to `False`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import AlbertTokenizer + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1') + tokens = tokenizer('He was a puppeteer') + ''' + {'input_ids': [2, 24, 23, 21, 10956, 7911, 3], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = { + "sentencepiece_model_file": "spiece.model", + "vocab_file": "vocab.txt", + } + + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model", + "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model", + "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model", + "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model", + "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model", + "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model", + "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model", + "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model", + "albert-chinese-tiny": None, + "albert-chinese-small": None, + "albert-chinese-base": None, + "albert-chinese-large": None, + "albert-chinese-xlarge": None, + "albert-chinese-xxlarge": None, + }, + "vocab_file": { + "albert-base-v1": None, + "albert-large-v1": None, + "albert-xlarge-v1": None, + "albert-xxlarge-v1": None, + "albert-base-v2": None, + "albert-large-v2": None, + "albert-xlarge-v2": None, + "albert-xxlarge-v2": None, + "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt", + "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt", + "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt", + "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt", + "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt", + "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt", + }, + } + + pretrained_init_configuration = { + "albert-base-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xxlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-base-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + 
"albert-xxlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-chinese-tiny": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-small": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-base": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-large": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xxlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + } + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + sentencepiece_model_file, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended(mask_token=mask_token) + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + self.sentencepiece_model_file = sentencepiece_model_file + + if vocab_file is not None: + self.tokenizer = AlbertChineseTokenizer( + vocab_file=vocab_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + elif sentencepiece_model_file is not None: + self.tokenizer = AlbertEnglishTokenizer( + sentencepiece_model_file=sentencepiece_model_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + else: + raise ValueError( + "You should only specify either one(not both) of 'vocal_file'" + "and 'sentencepiece_model_file' to construct an albert tokenizer." + "Specify 'vocal_file' for Chinese tokenizer and " + "'sentencepiece_model_file' for English tokenizer" + ) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return self.tokenizer.vocab_size + + def _tokenize(self, text): + return self.tokenizer._tokenize(text) + + def tokenize(self, text): + """ + Converts a string to a list of tokens. + + Args: + text (str): The text to be tokenized. + + Returns: + List(str): A list of string representing converted tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + tokens = tokenizer.tokenize('He was a puppeteer') + + """ + + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ + Converts a sequence of tokens (list of string) to a list of ids. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + list: Converted ids from tokens. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import AlbertTokenizer + + tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased') + tokens = tokenizer.tokenize('He was a puppeteer') + #['▁he', '▁was', '▁a', '▁puppet', 'eer'] + + ids = tokenizer.convert_tokens_to_ids(tokens) + #[24, 23, 21, 10956, 7911] + """ + return self.tokenizer._convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """ + Converts a sequence of tokens (list of string) to a list of ids. + + Args: + ids (list): A list of ids to be converted. + skip_special_tokens (bool, optional): + Whether or not to skip specical tokens. Defaults to `False`. + + Returns: + list: A list of converted tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import AlbertTokenizer + + tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased') + ids = [24, 23, 21, 10956, 7911] + tokens = tokenizer.convert_ids_to_tokens(ids) + #['▁he', '▁was', '▁a', '▁puppet', 'eer'] + """ + return self.tokenizer._convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import AlbertTokenizer + + tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased') + tokens = tokenizer.tokenize('He was a puppeteer') + ''' + ['▁he', '▁was', '▁a', '▁puppet', 'eer'] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + he was a puppeteer + ''' + """ + return self.tokenizer.convert_tokens_to_string(tokens) + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + return self.tokenizer.num_special_tokens_to_add(pair=pair) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + An Albert sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + return self.tokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1=token_ids_1) + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A Albert offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. 
+ + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + return self.tokenizer.build_offset_mapping_with_special_tokens( + offset_mapping_0, offset_mapping_1=offset_mapping_1 + ) + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + return self.tokenizer.get_special_tokens_mask( + token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=already_has_special_tokens + ) + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + return self.tokenizer.create_token_type_ids_from_sequences(token_ids_0, token_ids_1=token_ids_1) + + def save_resources(self, save_directory): + return self.tokenizer.save_resources(save_directory) + + +class AlbertEnglishTokenizer(PretrainedTokenizer): + resource_files_names = { + "sentencepiece_model_file": "spiece.model", + } + + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model", + "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model", + "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model", + "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model", + "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model", + "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model", + "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model", + "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model", + }, + } + + pretrained_init_configuration = { + "albert-base-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xxlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-base-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + 
"albert-xxlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + } + max_model_input_sizes = { + "albert-base-v1": 512, + "albert-large-v1": 512, + "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, + } + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs=None, + **kwargs + ): + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.sentencepiece_model_file = sentencepiece_model_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_file) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.sentencepiece_model_file) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def _tokenize(self, text): + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) to an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return 
cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.sentencepiece_model_file) != os.path.abspath(save_path) and os.path.isfile( + self.sentencepiece_model_file + ): + copyfile(self.sentencepiece_model_file, save_path) + elif not os.path.isfile(self.sentencepiece_model_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + +class AlbertChineseTokenizer(BertTokenizer): + resource_files_names = {"vocab_file": "vocab.txt"} + pretrained_resource_files_map = { + "vocab_file": { + "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt", + "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt", + "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt", + "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt", + "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt", + "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt", + } + } + pretrained_init_configuration = { + "albert-chinese-tiny": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-small": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-base": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-large": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xxlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + } + max_model_input_sizes = { + "albert-chinese-tiny": 512, + "albert-chinese-small": 512, + "albert-chinese-base": 512, + "albert-chinese-large": 512, + "albert-chinese-xlarge": 512, + "albert-chinese-xxlarge": 512, + } + + def __init__( + 
self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super(AlbertChineseTokenizer, self).__init__( + vocab_file, + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/configuration.py new file mode 100644 index 000000000..8a0fd4c0e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/configuration.py @@ -0,0 +1,120 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
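Looking back at the `AlbertTokenizer` added above: it dispatches to the SentencePiece-based `AlbertEnglishTokenizer` when a `spiece.model` is provided and to the `BertTokenizer`-style `AlbertChineseTokenizer` when only a `vocab.txt` is provided. A small sketch of that dispatch, assuming the published checkpoints are reachable (the Chinese output comment is indicative only):

.. code-block::

    from paddlenlp.transformers import AlbertTokenizer

    # English checkpoints ship spiece.model -> SentencePiece path
    en_tok = AlbertTokenizer.from_pretrained('albert-base-v1')
    print(en_tok.tokenize('He was a puppeteer'))
    # ['▁he', '▁was', '▁a', '▁puppet', 'eer']

    # Chinese checkpoints ship vocab.txt -> WordPiece (BertTokenizer-style) path
    zh_tok = AlbertTokenizer.from_pretrained('albert-chinese-tiny')
    print(zh_tok.tokenize('欢迎使用飞桨'))
    # character-level WordPiece tokens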
+""" MBart model configuration""" +from __future__ import annotations + +from paddlenlp.transformers import GPTConfig + +__all__ = ["ARTIST_PRETRAINED_INIT_CONFIGURATION", "ARTIST_PRETRAINED_RESOURCE_FILES_MAP", "ArtistConfig"] + +ARTIST_PRETRAINED_INIT_CONFIGURATION = { + "pai-painter-base-zh": { + "vocab_size": 37512, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu_python", + "hidden_dropout_prob": 0.0, + "attention_probs_dropout_prob": 0.0, + "max_position_embeddings": 288, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "pad_token_id": 16384, # 0 + 16384 + "eos_token_id": 16486, # 102 + 16384 + "bos_token_id": 16485, # 101 + 16384 + "eol_token_id": 16486, # 102 + 16384 + }, + "pai-painter-painting-base-zh": { + "vocab_size": 37512, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu_python", + "hidden_dropout_prob": 0.0, + "attention_probs_dropout_prob": 0.0, + "max_position_embeddings": 288, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "pad_token_id": 16384, # 0 + 16384 + "eos_token_id": 16486, # 102 + 16384 + "bos_token_id": 16485, # 101 + 16384 + "eol_token_id": 16486, # 102 + 16384 + }, + "pai-painter-scenery-base-zh": { + "vocab_size": 37512, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu_python", + "hidden_dropout_prob": 0.0, + "attention_probs_dropout_prob": 0.0, + "max_position_embeddings": 288, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "pad_token_id": 16384, # 0 + 16384 + "eos_token_id": 16486, # 102 + 16384 + "bos_token_id": 16485, # 101 + 16384 + "eol_token_id": 16486, # 102 + 16384 + }, + "pai-painter-commercial-base-zh": { + "vocab_size": 37512, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu_python", + "hidden_dropout_prob": 0.0, + "attention_probs_dropout_prob": 0.0, + "max_position_embeddings": 288, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "pad_token_id": 16384, # 0 + 16384 + "eos_token_id": 16486, # 102 + 16384 + "bos_token_id": 16485, # 101 + 16384 + "eol_token_id": 16486, # 102 + 16384 + }, + "pai-painter-large-zh": { + "vocab_size": 37512, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu_python", + "hidden_dropout_prob": 0.0, + "attention_probs_dropout_prob": 0.0, + "max_position_embeddings": 288, + "type_vocab_size": 1, + "initializer_range": 0.02, + "pad_token_id": 16384, # 0 + 16384 + "eos_token_id": 16486, # 102 + 16384 + "bos_token_id": 16485, # 101 + 16384 + "eol_token_id": 16486, # 102 + 16384 + }, +} +ARTIST_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "pai-painter-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-base-zh/model_state.pdparams", + "pai-painter-painting-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-painting-base-zh/model_state.pdparams", + "pai-painter-scenery-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-scenery-base-zh/model_state.pdparams", + "pai-painter-commercial-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-commercial-base-zh/model_state.pdparams", + "pai-painter-large-zh": 
"https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-large-zh/model_state.pdparams", + } +} + + +class ArtistConfig(GPTConfig): + pretrained_init_configuration = ARTIST_PRETRAINED_INIT_CONFIGURATION diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/modeling.py new file mode 100644 index 000000000..4910d2011 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/modeling.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 Alibaba PAI team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import paddle +import paddle.nn.functional as F + +from ..gpt.modeling import GPTLMHead, GPTLMHeadModel, GPTModel +from .configuration import ( + ARTIST_PRETRAINED_INIT_CONFIGURATION, + ARTIST_PRETRAINED_RESOURCE_FILES_MAP, + ArtistConfig, +) + +__all__ = [ + "ArtistModel", + "ArtistForConditionalGeneration", +] + +# set gelu_new +F.gelu_python = F.gelu + + +class ArtistModel(GPTModel): + config_class = ArtistConfig + pretrained_init_configuration = ARTIST_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ARTIST_PRETRAINED_RESOURCE_FILES_MAP + + +class ArtistForConditionalGeneration(GPTLMHeadModel): + """ + The ArtistT(GPT) Model with a `language modeling` head on top. + + Args: + gpt (:class:`ArtistModel`): + An instance of :class:`ArtistModel`. + + """ + + config_class = ArtistConfig + pretrained_init_configuration = ARTIST_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ARTIST_PRETRAINED_RESOURCE_FILES_MAP + + def __init__(self, config: ArtistConfig): + super().__init__(config) + self.lm_head = GPTLMHead(config) + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + # we don't use attention_mask + attention_mask = paddle.zeros_like(input_ids, dtype=paddle.get_default_dtype()) + return paddle.unsqueeze(attention_mask, axis=[1, 2]) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/tokenizer.py new file mode 100644 index 000000000..2a4074e2f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/artist/tokenizer.py @@ -0,0 +1,253 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..bert.tokenizer import BertTokenizer + +__all__ = [ + "ArtistTokenizer", +] + + +class ArtistTokenizer(BertTokenizer): + """ + Constructs an Artist tokenizer. `ArtistTokenizer` is almost identical to `BertTokenizer`. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool, optional): + Whether to lowercase the input when tokenizing. + Defaults to `True`. + image_vocab_size (int, optional): + The vocabulary size of image. + Defaults to `16384`. + do_basic_tokenize (bool, optional): + Whether to use a basic tokenizer before a WordPiece tokenizer. + Defaults to `True`. + never_split (Iterable, optional): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True`. Defaults to `None`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + tokenize_chinese_chars (bool, optional): + Whether to tokenize Chinese characters. + Defaults to `True`. + strip_accents: (bool, optional): + Whether to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + Defaults to `None`. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import ArtistTokenizer + tokenizer = ArtistTokenizer.from_pretrained('pai-painter-painting-base-zh') + + inputs = tokenizer('风阁水帘今在眼,且来先看早梅红', return_token_type_ids=False) + print(inputs) + + ''' + {'input_ids': [23983, 23707, 20101, 18750, 17175, 18146, 21090, 24408, 17068, + 19725, 17428, 21076, 19577, 19833, 21657]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "pai-painter-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-base-zh/vocab.txt", + "pai-painter-painting-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-painting-base-zh/vocab.txt", + "pai-painter-scenery-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-scenery-base-zh/vocab.txt", + "pai-painter-commercial-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-commercial-base-zh/vocab.txt", + "pai-painter-large-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/artist/pai-painter-large-zh/vocab.txt", + } + } + pretrained_init_configuration = { + "pai-painter-base-zh": { + "do_lower_case": True, + "image_vocab_size": 16384, + }, + "pai-painter-painting-base-zh": { + "do_lower_case": True, + "image_vocab_size": 16384, + }, + "pai-painter-scenery-base-zh": { + "do_lower_case": True, + "image_vocab_size": 16384, + }, + "pai-painter-commercial-base-zh": { + "do_lower_case": True, + "image_vocab_size": 16384, + }, + "pai-painter-large-zh": { + "do_lower_case": True, + "image_vocab_size": 16384, + }, + } + max_model_input_sizes = { + "pai-painter-base-zh": 32, + "pai-painter-painting-base-zh": 32, + "pai-painter-scenery-base-zh": 32, + "pai-painter-commercial-base-zh": 32, + "pai-painter-large-zh": 32, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, + image_vocab_size=16384, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + vocab_file, + do_lower_case, + do_basic_tokenize, + never_split, + unk_token, + sep_token, + pad_token, + cls_token, + mask_token, + tokenize_chinese_chars, + strip_accents, + **kwargs, + ) + # we need add image_vocab_size offset + # for example [523, 102, 0, 0] + # => [523 + image_vocab_size, 102 + image_vocab_size, 0 + image_vocab_size, 0 + image_vocab_size] + self.image_vocab_size = image_vocab_size + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + # note: process image_vocab_size offset + return self.added_tokens_encoder[token] + self.image_vocab_size + # note: process image_vocab_size offset + return self._convert_token_to_id(token) + self.image_vocab_size + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + if isinstance(ids, int): + if ids - self.image_vocab_size in self.added_tokens_decoder: + return self.added_tokens_decoder[ids - self.image_vocab_size] + else: + # note: process image_vocab_size offset + return self._convert_id_to_token(ids - self.image_vocab_size) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + if index - self.image_vocab_size in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index - 
self.image_vocab_size]) + else: + # note: process image_vocab_size offset + tokens.append(self._convert_id_to_token(index - self.image_vocab_size)) + return tokens + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence (we don't add special tokens). + + An Artist sequence has the following format: + + - single sequence: ``X`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + We do'nt use sequence pairs. + Defaults to None. + + Returns: + List[int]: List of input_id. + """ + return token_ids_0 + + def __call__( + self, + text, + text_pair=None, + max_length=32, # default + stride=0, + is_split_into_words=False, + padding="max_length", # default + truncation=True, # default + return_position_ids=False, + return_token_type_ids=False, # don't return token_type_ids + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_dict=True, + return_offsets_mapping=False, + add_special_tokens=True, + pad_to_multiple_of=None, + return_tensors=None, + verbose: bool = True, + **kwargs + ): + return super().__call__( + text, + text_pair, + max_length, + stride, + is_split_into_words, + padding, + truncation, + return_position_ids, + return_token_type_ids, + return_attention_mask, + return_length, + return_overflowing_tokens, + return_special_tokens_mask, + return_dict, + return_offsets_mapping, + add_special_tokens, + pad_to_multiple_of, + return_tensors, + verbose, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/attention_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/attention_utils.py new file mode 100644 index 000000000..0bcd644df --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/attention_utils.py @@ -0,0 +1,619 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
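The `ArtistTokenizer` above shifts every text token id by `image_vocab_size`, keeping ids `0..image_vocab_size-1` reserved for the image codebook, and `convert_ids_to_tokens` undoes that shift before the vocab lookup. A brief sketch of that behaviour (assuming the published 'pai-painter-painting-base-zh' files are reachable):

.. code-block::

    from paddlenlp.transformers import ArtistTokenizer

    tokenizer = ArtistTokenizer.from_pretrained('pai-painter-painting-base-zh')

    ids = tokenizer('风阁水帘今在眼,且来先看早梅红',
                    return_token_type_ids=False)['input_ids']
    print(min(ids) >= tokenizer.image_vocab_size)  # True: all text ids sit above the image codebook

    # convert_ids_to_tokens subtracts image_vocab_size before looking the tokens up
    tokens = tokenizer.convert_ids_to_tokens(ids)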
+ +import collections +import copy + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import ParamAttr +from paddle.nn import Layer + + +class Registry(object): + def __init__(self): + self.cls_dict = {} + + def register(self, name): + def add_item(name, cls): + self.cls_dict[name] = cls + return cls + + return lambda cls: add_item(name, cls) + + +AttentionRegistry = Registry() + + +def create_bigbird_rand_mask_idx( + num_layers, query_length, key_length, num_heads, block_size, window_size, num_global_blocks, num_rand_blocks, seed +): + # TODO(zsj): need to simplify + num_key_blocks = key_length // block_size + num_query_blocks = query_length // block_size + num_window_blocks = window_size // 2 + all_key_blocks_idx = np.arange(0, num_key_blocks, dtype=np.int32) + rand_mask_idx = [[] for i in range(num_heads)] + for query_block_idx in range(num_query_blocks): + left_key_block_idx = max(0, query_block_idx - num_window_blocks) + right_key_block_idx = min(query_block_idx + num_window_blocks, num_key_blocks - 1) + + illegal_blocks_idx = [i for i in range(left_key_block_idx, right_key_block_idx + 1)] + illegal_blocks_idx.extend([i for i in range(num_global_blocks)]) + left_key_block_idx = query_block_idx - num_window_blocks + right_key_block_idx = query_block_idx + num_window_blocks + + if num_global_blocks > left_key_block_idx: + num_fill_blocks = num_global_blocks - left_key_block_idx + illegal_blocks_idx.extend([i for i in range(num_key_blocks - num_fill_blocks, num_key_blocks)]) + if right_key_block_idx >= num_key_blocks: + num_fill_blocks = right_key_block_idx - num_key_blocks + 1 + illegal_blocks_idx.extend([i for i in range(num_global_blocks, num_global_blocks + num_fill_blocks)]) + + illegal_blocks_idx = set(illegal_blocks_idx) + + for i in range(num_heads): + legal_blocks_idx = [] + perm_block = np.random.permutation(all_key_blocks_idx) + for j in perm_block: + if j not in illegal_blocks_idx: + legal_blocks_idx.append(j) + if len(legal_blocks_idx) == num_rand_blocks: + break + rand_mask_idx[i].append(legal_blocks_idx) + rand_mask_idx = np.stack(rand_mask_idx, axis=0) + rand_mask_idx = rand_mask_idx[:, num_global_blocks:] - num_global_blocks // 2 + # transform rand_mask_idx + H = rand_mask_idx.shape[0] + L = rand_mask_idx.shape[1] + R = rand_mask_idx.shape[2] + rand_mask_idx = rand_mask_idx.reshape([-1, 1]) + head_idx = np.arange(H).reshape([-1, 1]) + head_idx = np.pad(head_idx, ([0, 0], [0, L * R - 1]), mode="edge").reshape([-1, 1]) + rand_mask_idx_list = np.concatenate([head_idx, rand_mask_idx], axis=1) + return rand_mask_idx_list + + +def create_bigbird_rand_mask_idx_list( + num_layers, query_length, key_length, num_heads, block_size, window_size, num_global_blocks, num_rand_blocks, seed +): + rand_mask_idx_list = [ + create_bigbird_rand_mask_idx( + num_layers, + query_length, + key_length, + num_heads, + block_size, + window_size, + num_global_blocks, + num_rand_blocks, + seed, + ) + for i in range(num_layers) + ] + rand_mask_idx_list = np.stack(rand_mask_idx_list) + return rand_mask_idx_list + + +def _convert_param_attr_to_list(param_attr, n): + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, "length of param_attr should be %d when it is a list/tuple" % n + param_attrs = [] + for attr in param_attr: + if isinstance(attr, bool): + if attr: + param_attrs.append(ParamAttr._to_attr(None)) + else: + param_attrs.append(False) + else: + param_attrs.append(ParamAttr._to_attr(attr)) + elif 
isinstance(param_attr, bool): + param_attrs = [] + if param_attr: + param_attrs = [ParamAttr._to_attr(None) for i in range(n)] + else: + param_attrs = [False] * n + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + +class Linear3D(Layer): + def __init__(self, hidden_size, num_attention_heads, size_per_head, weight_attr=None, bias_attr=None): + super(Linear3D, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter( + shape=[hidden_size, hidden_size], attr=self._weight_attr, dtype=self._dtype, is_bias=False + ) + self.bias = self.create_parameter(shape=[hidden_size], attr=self._bias_attr, dtype=self._dtype, is_bias=True) + self.size_per_head = size_per_head + self.num_attention_heads = num_attention_heads + self.hidden_size = hidden_size + + def forward(self, input): + # abc,cde->adbe + B, T, D = input.shape + H = self.num_attention_heads + result = paddle.matmul(input, self.weight) + reshape_b = paddle.reshape(self.bias, [1, 1, D]) + result += reshape_b + result = paddle.reshape(result, [B, T, H, -1]) + result = paddle.transpose(result, [0, 2, 1, 3]) + return result + + +class Attention(Layer): + def __init__(self, num_heads=1, block_size=1, window_size=3, num_global_blocks=1, num_rand_blocks=1, seed=None): + super().__init__() + + def forward( + self, + query_matrix, + key_matrix, + value_matrix, + d_head, + attn_mask=None, + rand_mask_idx=None, + query_mask=None, + key_mask=None, + dropout=None, + ): + raise NotImplementedError + + +@AttentionRegistry.register("default_attention") +class DefaultAttention(Attention): + def forward( + self, + query_matrix, + key_matrix, + value_matrix, + d_head, + attn_mask=None, + rand_mask_idx=None, + query_mask=None, + key_mask=None, + dropout=None, + ): + # scale dot product attention + product = paddle.matmul(x=query_matrix, y=key_matrix, transpose_y=True) + product = product * (d_head**-0.5) + product += (1 - paddle.matmul(query_mask, key_mask)) * -1e6 + if attn_mask is not None: + product = product + attn_mask + weights = F.softmax(product) + if dropout: + weights = F.dropout(weights, dropout, training=self.training, mode="upscale_in_train") + + out = paddle.matmul(weights, value_matrix) + return out + + +@AttentionRegistry.register("bigbird") +class BigBirdSparseAttention(Attention): + def __init__(self, num_heads=1, block_size=1, window_size=3, num_global_blocks=1, num_rand_blocks=1, seed=None): + super(BigBirdSparseAttention, self).__init__( + num_heads, block_size, window_size, num_global_blocks, num_rand_blocks, seed + ) + for k, v in locals().items(): + if k != "self": + setattr(self, k, v) + self.num_global_blocks_back = num_global_blocks // 2 + self.num_global_blocks_front = ( + num_global_blocks // 2 if num_global_blocks % 2 == 0 else num_global_blocks // 2 + 1 + ) + + def _get_band_mask(self, blocked_query_mask, blocked_key_mask, batch_size, sequence_length): + """ + Return second mask: [B, 1, L-G, bs, G+W] + """ + GB = self.num_global_blocks_back + GF = self.num_global_blocks_front + G = self.num_global_blocks + W = self.window_size + bs = self.block_size + T = sequence_length + L = T // bs # blocked length + B = batch_size + H = self.num_heads + # G+W+R + # query_mask: [B, L, bs] + # key_mask: [B, L, bs] + # [B, L-G, bs, 1] * [B, L-G, 1, G*bs] -> [B, 
L-G, bs, G*bs] + temp_query_mask = paddle.reshape(blocked_query_mask[:, GF:-GB], [B, L - G, bs, 1]) + temp_key_mask_front = paddle.reshape(blocked_key_mask[:, :GF], [B, 1, 1, GF * bs]) + global_block_mask_front = paddle.einsum("blqd,bmdk->blqk", temp_query_mask, temp_key_mask_front) + + temp_key_mask_back = paddle.reshape(blocked_key_mask[:, -GB:], [B, 1, 1, GB * bs]) + global_block_mask_back = paddle.einsum("blqd,bmdk->blqk", temp_query_mask, temp_key_mask_back) + # create window block mask + key_mask_list = [] + for query_block_id in range(GF, GF + W // 2): + left_block_id = query_block_id - W // 2 + right_block_id = query_block_id + W // 2 + zero_key_mask = paddle.zeros_like(blocked_key_mask[:, -(W - (right_block_id + 1 - G)) : -GB]) + temp_key_mask = paddle.concat([blocked_key_mask[:, GF : (right_block_id + 1)], zero_key_mask], axis=1) + temp_key_mask = paddle.unsqueeze(temp_key_mask, 1) + key_mask_list.append(temp_key_mask) + roll_key_mask1 = paddle.concat(key_mask_list, axis=1) + roll_key_mask1 = paddle.reshape(roll_key_mask1, [0, 0, W * bs]) + key_mask_list = [] + + band_length = L - G - W // 2 * 2 + for query_block_id in range(GF + W // 2, GF + W // 2 + W): + left_block_id = query_block_id - W // 2 + right_block_id = query_block_id + W // 2 + key_mask_list.append(blocked_key_mask[:, left_block_id : left_block_id + band_length]) + window_key_mask = paddle.concat(key_mask_list, axis=2) + window_key_mask = paddle.reshape(window_key_mask, [0, 0, W * bs]) + + key_mask_list = [] + for query_block_id in range((L - GB) - W // 2, L - GB): + left_block_id = query_block_id - W // 2 + right_block_id = query_block_id + W // 2 + zero_key_mask = paddle.zeros_like(blocked_key_mask[:, GF : GF + W - (L - left_block_id - GB)]) + temp_key_mask = paddle.concat([zero_key_mask, blocked_key_mask[:, left_block_id:-GB]], axis=1) + temp_key_mask = paddle.unsqueeze(temp_key_mask, 1) + key_mask_list.append(temp_key_mask) + roll_key_mask2 = paddle.concat(key_mask_list, axis=1) + roll_key_mask2 = paddle.reshape(roll_key_mask2, [0, 0, W * bs]) + + window_key_mask = paddle.concat([roll_key_mask1, window_key_mask, roll_key_mask2], axis=1) + window_key_mask = paddle.unsqueeze(window_key_mask, axis=2) + # [B, L-G, bs, 1] * [B, L-G, 1, W*bs] -> [B, L-G, bs, W*bs] + window_block_mask = paddle.einsum("blkd,bldq->blkq", temp_query_mask, window_key_mask) + band_mask = paddle.concat([global_block_mask_front, window_block_mask, global_block_mask_back], axis=3) + band_mask = paddle.unsqueeze(band_mask, 1) # for head + band_mask = paddle.expand(band_mask, [B, H, L - G, bs, -1]) + return band_mask + + def _get_band_matrix(self, blocked_matrix, B, T): + """ + return global and window matrix: [B, H, L-G, (G+W) * bs, -1] + """ + # blocked_matrix: [B, H, L, bs, -1] + GB = self.num_global_blocks_back + GF = self.num_global_blocks_front + G = self.num_global_blocks + W = self.window_size + bs = self.block_size + L = T // bs # blocked length + H = self.num_heads + + # get roll matrix + blocked_list = [] + for query_block_id in range(GF, GF + W // 2): + left_block_id = query_block_id - W // 2 + right_block_id = query_block_id + W // 2 + temp_blocked_matrix_list = [ + blocked_matrix[:, :, 0 : (right_block_id + 1)], + blocked_matrix[:, :, -(G + W - right_block_id - 1) :], + ] + temp_blocked_matrix = paddle.concat(temp_blocked_matrix_list, axis=2) + temp_blocked_matrix = paddle.unsqueeze(temp_blocked_matrix, axis=2) + blocked_list.append(temp_blocked_matrix) + + # get window matrix + band_length = L - G - W // 2 * 2 + band_matrix_list = 
[] + for query_block_id in range(GF + W // 2, GF + W // 2 + W): + left_block_id = query_block_id - W // 2 + right_block_id = query_block_id + W // 2 + band_matrix_list.append( + paddle.unsqueeze(blocked_matrix[:, :, left_block_id : left_block_id + band_length], axis=3) + ) + band_matrix = paddle.concat(band_matrix_list, axis=3) + + global_blocked_front_matrix = paddle.unsqueeze(blocked_matrix[:, :, :GF], axis=2) + global_blocked_front_matrix = paddle.expand(global_blocked_front_matrix, [B, H, band_length, GF, bs, -1]) + global_blocked_back_matrix = paddle.unsqueeze(blocked_matrix[:, :, -GB:], axis=2) + global_blocked_back_matrix = paddle.expand(global_blocked_back_matrix, [B, H, band_length, GB, bs, -1]) + band_matrix = paddle.concat([global_blocked_front_matrix, band_matrix, global_blocked_back_matrix], axis=3) + blocked_list.append(band_matrix) + + for query_block_id in range(L - GB - W // 2, L - GB): + left_block_id = query_block_id - W // 2 + right_block_id = query_block_id + W // 2 + temp_blocked_matrix_list = [ + blocked_matrix[:, :, 0 : G + W - (L - left_block_id)], + blocked_matrix[:, :, left_block_id:], + ] + temp_blocked_matrix = paddle.concat(temp_blocked_matrix_list, axis=2) + temp_blocked_matrix = paddle.unsqueeze(temp_blocked_matrix, axis=2) + blocked_list.append(temp_blocked_matrix) + + band_matrix = paddle.concat(blocked_list, axis=2) + band_matrix = paddle.reshape(band_matrix, [B, H, L - G, (G + W) * bs, -1]) + return band_matrix + + def _get_rand_mask(self, blocked_query_mask, blocked_key_mask, rand_mask_idx, batch_size, sequence_length): + """ + return random mask: [B, H, L-G, bs, R * bs] + """ + # rand_mask_idx: [H, T] + # blocked_query_mask: [B, L, bs] + # blocked_key_mask: [B, L, bs] + bs = self.block_size + B = batch_size + L = sequence_length // bs + H = self.num_heads + GB = self.num_global_blocks_back + GF = self.num_global_blocks_front + R = self.num_rand_blocks + temp_block_key_mask = paddle.unsqueeze(blocked_key_mask, 1) + temp_block_key_mask = paddle.expand(temp_block_key_mask, [B, H, L, -1]) + temp_block_key_mask_list = [paddle.gather_nd(temp_block_key_mask[b], rand_mask_idx) for b in range(B)] + temp_block_key_mask = paddle.concat(temp_block_key_mask_list, 0) + temp_block_key_mask = paddle.reshape( + temp_block_key_mask, [B, temp_block_key_mask.shape[0] // B // (L - GF - GB) // R, L - GF - GB, -1] + ) + rand_mask = paddle.einsum("blq,bhlk->bhlqk", blocked_query_mask[:, GF:-GB], temp_block_key_mask) + return rand_mask + + def _gather_random_key_value(self, blocked_matrix, rand_mask_idx, B, T): + """ + return random key matrix: [B, H, L-G, R * bs, -1] + """ + # blocked_matrix: [B, H, L, bs, -1] + # rand_mask_idx: [H, T] + G = self.num_global_blocks + H = self.num_heads + bs = self.block_size + L = T // bs + R = self.num_rand_blocks + gathered_matrix = paddle.concat( + [paddle.gather_nd(blocked_matrix[b, :], rand_mask_idx) for b in range(B)], axis=0 + ) + gathered_matrix = paddle.reshape(gathered_matrix, [B, H, L - G, R * bs, -1]) + return gathered_matrix + + def _get_global_out(self, query_matrix, key_matrix, value_matrix, key_mask, d_head, dropout, is_front=True): + GB = self.num_global_blocks_back + GF = self.num_global_blocks_front + if is_front: + global_query_matrix = query_matrix[:, :, 0 : GF * self.block_size] + else: + global_query_matrix = query_matrix[:, :, -GB * self.block_size :] + global_product = paddle.matmul(global_query_matrix, key_matrix, transpose_y=True) + global_product = global_product * (d_head**-0.5) + global_product += (1 - key_mask) * 
-1e6 + global_weights = F.softmax(global_product) + # [B, H, GF*bs, T] * [B, H, T, D] -> [B, H, GF*bs, D] + global_product = paddle.matmul(global_weights, value_matrix) + return global_product + + def _get_splited_matrix(self, matrix): + W = self.window_size // 2 + return matrix[:, :, 0:W], matrix[:, :, W:-W], matrix[:, :, -W:] + + def forward( + self, + query_matrix, + key_matrix, + value_matrix, + d_head, + attn_mask=None, + rand_mask_idx=None, + query_mask=None, + key_mask=None, + dropout=None, + ): + """ + query_matrix: [B, H, T, D] + key_matrix: [B, H, T, D] + value_matrix: [B, H, T, D] + query_mask: [B, 1, T, 1] bool mask + key_mask: [B, 1, 1, T] bool mask + rand_mask_idx: [H, T//bs, bs] + Global Attention + Random Attention + Window Attention + """ + B = query_matrix.shape[0] # batch_size + H = self.num_heads + T = query_matrix.shape[2] # sequence_length + G = self.num_global_blocks + GB = self.num_global_blocks_back + GF = self.num_global_blocks_front + R = self.num_rand_blocks + bs = self.block_size + L = T // bs # blocked length + + blocked_query_matrix = paddle.reshape(query_matrix, [B, H, L, bs, -1]) + blocked_key_matrix = paddle.reshape(key_matrix, [B, H, L, bs, -1]) + blocked_value_matrix = paddle.reshape(value_matrix, [B, H, L, bs, -1]) + blocked_query_mask = paddle.reshape(query_mask, [B, L, bs]) + blocked_key_mask = paddle.reshape(key_mask, [B, L, bs]) + + # 1. global_front_product + global_front_out = self._get_global_out(query_matrix, key_matrix, value_matrix, key_mask, d_head, dropout) + + # 2. global_back_product + global_back_out = self._get_global_out( + query_matrix, key_matrix, value_matrix, key_mask, d_head, dropout, False + ) + + # 3. second_product + + # create second matrix + # [B, 1, L-G, bs, (G+W)*bs] + band_mask = self._get_band_mask(blocked_query_mask, blocked_key_mask, B, T) + # [B, H, L-G, bs, R*bs] + rand_mask = self._get_rand_mask(blocked_query_mask, blocked_key_mask, rand_mask_idx, B, T) + # [B, H, L-G, bs, (G+W+R)*bs] + second_mask = paddle.concat([band_mask, rand_mask], axis=4) + + # [B, H, L-G, R * bs, -1] + random_keys = self._gather_random_key_value(blocked_key_matrix, rand_mask_idx, B, T) + random_values = self._gather_random_key_value(blocked_value_matrix, rand_mask_idx, B, T) + + band_keys_matrix = self._get_band_matrix(blocked_key_matrix, B, T) + band_value_matrix = self._get_band_matrix(blocked_value_matrix, B, T) + + # [B, H, L - G, bs, -1] + second_query_matrix = blocked_query_matrix[:, :, GF:-GB] + # [B, H, L - G, (G+W+R)*bs, -1] + second_key_matrix = paddle.concat([band_keys_matrix, random_keys], axis=3) + # [B, H, L - G, (G+W+R)*bs, -1] + second_value_matrix = paddle.concat([band_value_matrix, random_values], axis=3) + second_top_value_matrix, second_middle_value_matrix, second_bottom_value_matrix = self._get_splited_matrix( + second_value_matrix + ) + second_product = paddle.einsum("bhlqd,bhlkd->bhlqk", second_query_matrix, second_key_matrix) + second_product = second_product * (d_head**-0.5) + second_product += (1 - second_mask) * -1e6 + second_weights = F.softmax(second_product) + + second_top_weights, second_middle_weights, second_bottom_weights = self._get_splited_matrix(second_weights) + second_top_out = paddle.einsum("bhlqk,bhlkd->bhlqd", second_top_weights, second_top_value_matrix) + + second_middle_out = paddle.einsum( + "bhlqk,bhlkd->bhlqd", + second_middle_weights[:, :, :, :, GF * bs : -(GB + R) * bs], + second_middle_value_matrix[:, :, :, GF * bs : -(GB + R) * bs], + ) + # add global block attention + second_middle_out += 
paddle.einsum( + "bhlqk,bhkd->bhlqd", second_middle_weights[:, :, :, :, : GF * bs], blocked_value_matrix[:, :, 0] + ) + second_middle_out += paddle.einsum( + "bhlqk,bhkd->bhlqd", + second_middle_weights[:, :, :, :, -(GB + R) * bs : -R * bs], + blocked_value_matrix[:, :, -GB], + ) + # add random block attention + second_middle_out += paddle.einsum( + "...qk,...kd->...qd", second_middle_weights[:, :, :, :, -R * bs :], random_values[:, :, GF:-GB] + ) + + second_bottom_out = paddle.einsum("bhlqk,bhlkd->bhlqd", second_bottom_weights, second_bottom_value_matrix) + + second_out = paddle.concat([second_top_out, second_middle_out, second_bottom_out], axis=2) + second_out = paddle.reshape(second_out, [B, H, (L - G) * bs, -1]) + + # [B, H, T, D] + out = paddle.concat([global_front_out, second_out, global_back_out], axis=2) + out = out * query_mask + return out + + +class MultiHeadAttention(Layer): + + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + weight_attr=None, + bias_attr=None, + block_size=1, + window_size=3, + num_global_blocks=1, + num_rand_blocks=1, + seed=None, + attention_type="bigbird", + ): + + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.num_heads = num_heads + self.dropout = dropout + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.q_proj = Linear3D(embed_dim, num_heads, self.head_dim, weight_attr, bias_attr=bias_attr) + self.k_proj = Linear3D(embed_dim, num_heads, self.head_dim, weight_attr, bias_attr=bias_attr) + self.v_proj = Linear3D(embed_dim, num_heads, self.head_dim, weight_attr, bias_attr=bias_attr) + self.out_proj = nn.Linear(embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) + + self.attn_impl = AttentionRegistry.cls_dict[attention_type]( + num_heads, block_size, window_size, num_global_blocks, num_rand_blocks, seed + ) + + def _prepare_qkv(self, query, key, value, cache=None): + q = self.q_proj(query) + + if isinstance(cache, self.StaticCache): + # for encoder-decoder attention in inference and has cached + k, v = cache.k, cache.v + else: + k, v = self.compute_kv(key, value) + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = paddle.concat([cache.k, k], axis=2) + v = paddle.concat([cache.v, v], axis=2) + cache = self.Cache(k, v) + + return (q, k, v) if cache is None else (q, k, v, cache) + + def compute_kv(self, key, value): + k = self.k_proj(key) + v = self.v_proj(value) + return k, v + + def gen_cache(self, key, value=None, type=Cache): + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.compute_kv(key, value) + return self.StaticCache(k, v) + elif value is None: # incremental_state + k = paddle.full(shape=[-1, self.num_heads, 0, self.head_dim], fill_value=0, dtype=key.dtype) + + v = paddle.full(shape=[-1, self.num_heads, 0, self.head_dim], fill_value=0, dtype=key.dtype) + return self.Cache(k, v) + else: + # incremental_state with initial value, mainly for usage like UniLM + return self.Cache(key, value) + + def forward( + self, query, key, value, attn_mask=None, rand_mask_idx=None, query_mask=None, key_mask=None, cache=None + ): + key = query if key is None else key + value = query if value is None else value + # 
compute q ,k ,v + if cache is None: + q, k, v = self._prepare_qkv(query, key, value, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, cache) + + out = self.attn_impl(q, k, v, self.head_dim, attn_mask, rand_mask_idx, query_mask, key_mask, self.dropout) + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if cache is not None: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/audio_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/audio_utils.py new file mode 100644 index 000000000..31795a062 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/audio_utils.py @@ -0,0 +1,694 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks +and remove unnecessary dependencies. +""" +import warnings +from typing import Optional, Union + +import numpy as np + + +def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from hertz to mels. + + Args: + freq (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in hertz (Hz). + mel_scale (`str`, *optional*, defaults to `"htk"`): + The mel frequency scale to use, `"htk"` or `"slaney"`. + + Returns: + `float` or `np.ndarray`: The frequencies on the mel scale. + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * np.log10(1.0 + (freq / 700.0)) + + min_log_hertz = 1000.0 + min_log_mel = 15.0 + logstep = 27.0 / np.log(6.4) + mels = 3.0 * freq / 200.0 + + if isinstance(freq, np.ndarray): + log_region = freq >= min_log_hertz + mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep + elif freq >= min_log_hertz: + mels = min_log_mel + np.log(freq / min_log_hertz) * logstep + + return mels + + +def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from mels to hertz. + + Args: + mels (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in mels. + mel_scale (`str`, *optional*, `"htk"`): + The mel frequency scale to use, `"htk"` or `"slaney"`. + + Returns: + `float` or `np.ndarray`: The frequencies in hertz. 
+ """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + min_log_hertz = 1000.0 + min_log_mel = 15.0 + logstep = np.log(6.4) / 27.0 + freq = 200.0 * mels / 3.0 + + if isinstance(mels, np.ndarray): + log_region = mels >= min_log_mel + freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel)) + elif mels >= min_log_mel: + freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel)) + + return freq + + +def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray: + """ + Creates a triangular filter bank. + + Adapted from *torchaudio* and *librosa*. + + Args: + fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): + Discrete frequencies of the FFT bins in Hz. + filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): + Center frequencies of the triangular filters to create, in Hz. + + Returns: + `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` + """ + filter_diff = np.diff(filter_freqs) + slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) + down_slopes = -slopes[:, :-2] / filter_diff[:-1] + up_slopes = slopes[:, 2:] / filter_diff[1:] + return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) + + +def mel_filter_bank( + num_frequency_bins: int, + num_mel_filters: int, + min_frequency: float, + max_frequency: float, + sampling_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> np.ndarray: + """ + Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and + various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters + are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these + features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. + + Different banks of mel filters were introduced in the literature. The following variations are supported: + + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech + bandwidth of `[0, 4600]` Hz. + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech + bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz. + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and + speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of + 12.5 kHz and speech bandwidth of `[0, 6250]` Hz. + + This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's + `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation. + + Args: + num_frequency_bins (`int`): + Number of frequencies used to compute the spectrogram (should be the same as in `stft`). + num_mel_filters (`int`): + Number of mel filters to generate. + min_frequency (`float`): + Lowest frequency of interest in Hz. + max_frequency (`float`): + Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`. + sampling_rate (`int`): + Sample rate of the audio waveform. 
+ norm (`str`, *optional*): + If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization). + mel_scale (`str`, *optional*, defaults to `"htk"`): + The mel frequency scale to use, `"htk"` or `"slaney"`. + + Returns: + `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a + projection matrix to go from a spectrogram to a mel spectrogram. + """ + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + # frequencies of FFT bins in Hz + fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins) + + # center points of the triangular mel filters + mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale) + mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale) + mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2) + filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale) + + mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]) + mel_filters *= np.expand_dims(enorm, 0) + + if (mel_filters.max(axis=0) == 0.0).any(): + warnings.warn( + "At least one mel filter has all zero values. " + f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. " + f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low." + ) + + return mel_filters + + +def optimal_fft_length(window_length: int) -> int: + """ + Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not + already a power of two, rounds it up to the next power or two. + + The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size + of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples + is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies, + it simply gives a higher frequency resolution (i.e. the frequency bins are smaller). + """ + return 2 ** int(np.ceil(np.log2(window_length))) + + +def window_function( + window_length: int, + name: str = "hann", + periodic: bool = True, + frame_length: Optional[int] = None, + center: bool = True, +) -> np.ndarray: + """ + Returns an array containing the specified window. This window is intended to be used with `stft`. + + The following window types are supported: + + - `"boxcar"`: a rectangular window + - `"hamming"`: the Hamming window + - `"hann"`: the Hann window + + Args: + window_length (`int`): + The length of the window in samples. + name (`str`, *optional*, defaults to `"hann"`): + The name of the window function. + periodic (`bool`, *optional*, defaults to `True`): + Whether the window is periodic or symmetric. + frame_length (`int`, *optional*): + The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller + than the frame length, so that it will be zero-padded. + center (`bool`, *optional*, defaults to `True`): + Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. + + Returns: + `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window. 
+ """ + length = window_length + 1 if periodic else window_length + + if name == "boxcar": + window = np.ones(length) + elif name in ["hamming", "hamming_window"]: + window = np.hamming(length) + elif name in ["hann", "hann_window"]: + window = np.hanning(length) + else: + raise ValueError(f"Unknown window function '{name}'") + + if periodic: + window = window[:-1] + + if frame_length is None: + return window + + if window_length > frame_length: + raise ValueError( + f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})" + ) + + padded_window = np.zeros(frame_length) + offset = (frame_length - window_length) // 2 if center else 0 + padded_window[offset : offset + window_length] = window + return padded_window + + +# TODO This method does not support batching yet as we are mainly focused on inference. +def spectrogram( + waveform: np.ndarray, + window: np.ndarray, + frame_length: int, + hop_length: int, + fft_length: Optional[int] = None, + power: Optional[float] = 1.0, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + preemphasis: Optional[float] = None, + mel_filters: Optional[np.ndarray] = None, + mel_floor: float = 1e-10, + log_mel: Optional[str] = None, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, + dtype: np.dtype = np.float32, +) -> np.ndarray: + """ + Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. + + This function can create the following kinds of spectrograms: + + - amplitude spectrogram (`power = 1.0`) + - power spectrogram (`power = 2.0`) + - complex-valued spectrogram (`power = None`) + - log spectrogram (use `log_mel` argument) + - mel spectrogram (provide `mel_filters`) + - log-mel spectrogram (provide `mel_filters` and `log_mel`) + + How this works: + + 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length + - hop_length` samples. + 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. + 3. The DFT is taken of each windowed frame. + 4. The results are stacked into a spectrogram. + + We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: + + - The analysis frame. This is the size of the time slices that the input waveform is split into. + - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. + - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. + + In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A + padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + typically the next power of two. + + Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and + `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms + can be constructed. + + Args: + waveform (`np.ndarray` of shape `(length,)`): + The input waveform. This must be a single real-valued, mono waveform. + window (`np.ndarray` of shape `(frame_length,)`): + The windowing function to apply, including zero-padding if necessary. The actual window length may be + shorter than `frame_length`, but we're assuming the array has already been zero-padded. + frame_length (`int`): + The length of the analysis frames in samples. 
With librosa this is always equal to `fft_length` but we also + allow smaller sizes. + hop_length (`int`): + The stride between successive analysis frames in samples. + fft_length (`int`, *optional*): + The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. + For optimal speed, this should be a power of two. If `None`, uses `frame_length`. + power (`float`, *optional*, defaults to 1.0): + If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns + complex numbers. + center (`bool`, *optional*, defaults to `True`): + Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame + `t` will start at time `t * hop_length`. + pad_mode (`str`, *optional*, defaults to `"reflect"`): + Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"` + (pad with edge values), `"reflect"` (pads with mirrored values). + onesided (`bool`, *optional*, defaults to `True`): + If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` + frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. + preemphasis (`float`, *optional*) + Coefficient for a low-pass filter that applies pre-emphasis before the DFT. + mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): + The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram. + mel_floor (`float`, *optional*, defaults to 1e-10): + Minimum value of mel frequency banks. + log_mel (`str`, *optional*): + How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take + the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be + used when `power` is not `None`. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an + amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be + `np.complex64`. + + Returns: + `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape + `(num_mel_filters, length)` for a mel spectrogram. 
+ """ + window_length = len(window) + + if fft_length is None: + fft_length = frame_length + + if frame_length > fft_length: + raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") + + if window_length != frame_length: + raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") + + if hop_length <= 0: + raise ValueError("hop_length must be greater than zero") + + if waveform.ndim != 1: + raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") + + if np.iscomplexobj(waveform): + raise ValueError("Complex-valued input waveforms are not currently supported") + + # center pad the waveform + if center: + padding = [(int(frame_length // 2), int(frame_length // 2))] + waveform = np.pad(waveform, padding, mode=pad_mode) + + # promote to float64, since np.fft uses float64 internally + waveform = waveform.astype(np.float64) + window = window.astype(np.float64) + + # split waveform into frames of frame_length size + num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length)) + + num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length + spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64) + + # rfft is faster than fft + fft_func = np.fft.rfft if onesided else np.fft.fft + buffer = np.zeros(fft_length) + + timestep = 0 + for frame_idx in range(num_frames): + buffer[:frame_length] = waveform[timestep : timestep + frame_length] + + if preemphasis is not None: + buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1] + buffer[0] *= 1 - preemphasis + + buffer[:frame_length] *= window + + spectrogram[frame_idx] = fft_func(buffer) + timestep += hop_length + + # note: ** is much faster than np.power + if power is not None: + spectrogram = np.abs(spectrogram, dtype=np.float64) ** power + + spectrogram = spectrogram.T + + if mel_filters is not None: + spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram)) + + if power is not None and log_mel is not None: + if log_mel == "log": + spectrogram = np.log(spectrogram) + elif log_mel == "log10": + spectrogram = np.log10(spectrogram) + elif log_mel == "dB": + if power == 1.0: + spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range) + elif power == 2.0: + spectrogram = power_to_db(spectrogram, reference, min_value, db_range) + else: + raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") + else: + raise ValueError(f"Unknown log_mel option: {log_mel}") + + spectrogram = np.asarray(spectrogram, dtype) + + return spectrogram + + +def power_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic + logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. + + Based on the implementation of `librosa.power_to_db`. 
+ + Args: + spectrogram (`np.ndarray`): + The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the spectrogram in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +def amplitude_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-5, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using + basic logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. + + Args: + spectrogram (`np.ndarray`): + The input amplitude (mel) spectrogram. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-5`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. 
+ + Returns: + `np.ndarray`: the spectrogram in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +def get_mel_filter_banks( + nb_frequency_bins: int, + nb_mel_filters: int, + frequency_min: float, + frequency_max: float, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> np.array: + warnings.warn( + "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + return mel_filter_bank( + num_frequency_bins=nb_frequency_bins, + num_mel_filters=nb_mel_filters, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sample_rate, + norm=norm, + mel_scale=mel_scale, + ) + + +def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): + """ + In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed + segments called `frames`. + + The window length (window_length) defines how much of the signal is contained in each frame, while the hop length + defines the step between the beginning of each new frame. + + + Args: + waveform (`np.array` of shape `(sample_length,)`): + The raw waveform which will be split into smaller chunks. + hop_length (`int`, *optional*, defaults to 160): + Step between each window of the waveform. + fft_window_size (`int`, *optional*, defaults to 400): + Defines the size of the window. + center (`bool`, defaults to `True`): + Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the + waveform on the left and on the right. + + Return: + framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): + The framed waveforms that can be fed to `np.fft`. + """ + warnings.warn( + "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + frames = [] + for i in range(0, waveform.shape[0] + 1, hop_length): + if center: + half_window = (fft_window_size - 1) // 2 + 1 + start = i - half_window if i > half_window else 0 + end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] + frame = waveform[start:end] + if start == 0: + padd_width = (-i + half_window, 0) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + elif end == waveform.shape[0]: + padd_width = (0, (i - waveform.shape[0] + half_window)) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + else: + frame = waveform[i : i + fft_window_size] + frame_width = frame.shape[0] + if frame_width < waveform.shape[0]: + frame = np.lib.pad( + frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0 + ) + frames.append(frame) + + frames = np.stack(frames, 0) + return frames + + +def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None): + """ + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. 
Should give the same results
+ as `torch.stft`.
+
+ Args:
+ frames (`np.array` of dimension `(num_frames, fft_window_size)`):
+ A framed audio signal obtained using `audio_utils.fram_wave`.
+ windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`):
+ An array representing the function that will be used to reduce the amplitude of the discontinuities at the
+ boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function.
+ For more information on the discontinuities, called *Spectral leakage*, refer to [this
+ tutorial](https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf)
+ fft_window_size (`int`, *optional*):
+ Size of the window on which the Fourier transform is applied. This controls the frequency resolution of the
+ spectrogram. 400 means that the Fourier transform is computed on windows of 400 samples. The number of
+ frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
+ `(1+fft_window_size)//2`. An increase of the fft_window_size slows the computation proportionally.
+
+ Example:
+
+ ```python
+ >>> from paddlenlp.transformers.audio_utils import stft, fram_wave
+ >>> import numpy as np
+
+ >>> audio = np.random.rand(50)
+ >>> fft_window_size = 10
+ >>> hop_length = 2
+ >>> framed_audio = fram_wave(audio, hop_length, fft_window_size)
+ >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1))
+ ```
+
+ Returns:
+ spectrogram (`np.ndarray`):
+ A spectrogram of shape `(nb_frequency_bins, num_frames)` obtained using the STFT algorithm
+ """
+ warnings.warn(
+ "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers",
+ FutureWarning,
+ )
+ frame_size = frames.shape[1]
+
+ if fft_window_size is None:
+ fft_window_size = frame_size
+
+ if fft_window_size < frame_size:
+ raise ValueError("FFT size must be greater than or equal to the frame size")
+ # number of FFT bins to store
+ nb_frequency_bins = (fft_window_size >> 1) + 1
+
+ spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64)
+ fft_signal = np.zeros(fft_window_size)
+
+ for f, frame in enumerate(frames):
+ if windowing_function is not None:
+ np.multiply(frame, windowing_function, out=fft_signal[:frame_size])
+ else:
+ fft_signal[:frame_size] = frame
+ spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins]
+ return spectrogram.T
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/__init__.py
new file mode 100644
index 000000000..97043fd7b
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
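For orientation, the helpers added in `audio_utils.py` above (`window_function`, `mel_filter_bank`, `spectrogram`) are designed to compose into a log-mel pipeline. The sketch below is illustrative only and is not part of the diff: the synthetic 16 kHz waveform, the 25 ms / 10 ms framing, and the 80-filter bank are assumed values, and the `paddlenlp.transformers.audio_utils` import path mirrors the one used in the `stft` docstring.

```python
import numpy as np

# Import path assumes the vendored paddlenlp package is importable,
# as in the stft() docstring example above.
from paddlenlp.transformers.audio_utils import (
    mel_filter_bank,
    spectrogram,
    window_function,
)

sampling_rate = 16000
# One second of a 440 Hz sine wave as a stand-in waveform (assumption).
t = np.arange(sampling_rate) / sampling_rate
waveform = np.sin(2 * np.pi * 440.0 * t)

frame_length = 400   # 25 ms analysis frames (illustrative)
hop_length = 160     # 10 ms hop (illustrative)
fft_length = 512     # next power of two, cf. optimal_fft_length(400)

# 80 triangular mel filters covering 0 Hz .. Nyquist.
filters = mel_filter_bank(
    num_frequency_bins=fft_length // 2 + 1,
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=sampling_rate / 2,
    sampling_rate=sampling_rate,
    norm="slaney",
    mel_scale="slaney",
)

# Power spectrogram -> mel projection -> decibels, in a single call.
log_mel = spectrogram(
    waveform,
    window_function(frame_length, "hann"),
    frame_length=frame_length,
    hop_length=hop_length,
    fft_length=fft_length,
    power=2.0,
    mel_filters=filters,
    log_mel="dB",
)
print(log_mel.shape)  # (num_mel_filters, num_frames)
```

With `power=2.0` and `log_mel="dB"`, the call goes through `power_to_db`, so the result is a `(num_mel_filters, num_frames)` array in decibels.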
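The decibel helpers follow the usual convention: `power_to_db` applies 10 * log10 while `amplitude_to_db` applies 20 * log10, so the two agree whenever the amplitude spectrogram is the square root of the power spectrogram. A small check, again illustrative rather than part of the patch, with `reference=1.0` assumed:

```python
import numpy as np

from paddlenlp.transformers.audio_utils import amplitude_to_db, power_to_db

power_spec = np.array([[1.0, 10.0, 100.0]])
print(power_to_db(power_spec, reference=1.0))          # [[ 0. 10. 20.]]

amplitude_spec = np.sqrt(power_spec)                   # amplitudes 1, ~3.16, 10
print(amplitude_to_db(amplitude_spec, reference=1.0))  # [[ 0. 10. 20.]]
```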
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/configuration.py new file mode 100644 index 000000000..8407154c9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/configuration.py @@ -0,0 +1,207 @@ +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import inspect +import io +import json +import os +from collections import defaultdict +from typing import Dict, List, Type + +from ...utils.download import resolve_file_path +from ...utils.import_utils import import_module +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig +from ..model_utils import PretrainedModel + +__all__ = [ + "AutoConfig", +] + + +def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: + """load the configurations of PretrainedConfig mapping: {: [, , ...], } + + Returns: + dict[str, str]: the mapping of model-name to model-classes + """ + # 1. search the subdir to find model-names + transformers_dir = os.path.dirname(os.path.dirname(__file__)) + exclude_models = ["auto"] + + mappings = defaultdict(list) + for model_name in os.listdir(transformers_dir): + if model_name in exclude_models: + continue + + model_dir = os.path.join(transformers_dir, model_name) + if not os.path.isdir(model_dir): + continue + + # 2. find the `configuration.py` file as the identifier of PretrainedConfig class + configuration_path = os.path.join(model_dir, "configuration.py") + if not os.path.exists(configuration_path): + continue + + configuration_module = import_module(f"paddlenlp.transformers.{model_name}.configuration") + for key in dir(configuration_module): + value = getattr(configuration_module, key) + if inspect.isclass(value) and issubclass(value, PretrainedConfig): + mappings[model_name].append(value) + + return mappings + + +class AutoConfig(PretrainedConfig): + """ + AutoConfig is a generic config class that will be instantiated as one of the + base PretrainedConfig classes when created with the AutoConfig.from_pretrained() classmethod. 
+ """ + + MAPPING_NAMES: Dict[str, List[Type[PretrainedConfig]]] = get_configurations() + + # cache the builtin pretrained-model-name to Model Class + name2class = None + config_file = "config.json" + + # TODO(wj-Mcat): the supporting should be removed after v2.6 + legacy_config_file = "config.json" + + @classmethod + def _get_config_class_from_config( + cls, pretrained_model_name_or_path: str, config_file_path: str + ) -> PretrainedConfig: + with io.open(config_file_path, encoding="utf-8") as f: + config = json.load(f) + + # add support for legacy config + if "init_class" in config: + architectures = [config.pop("init_class")] + else: + architectures = config.pop("architectures", None) + if architectures is None: + return cls + + model_name = architectures[0] + model_class = import_module(f"paddlenlp.transformers.{model_name}") + + # To make AutoConfig support loading config with custom model_class + # which is not in paddlenlp.transformers. Using "model_type" to load + # here actually conforms to what PretrainedConfig doc describes. + if model_class is None and "model_type" in config: + model_type = config["model_type"] + # MAPPING_NAMES is a dict with item like ('llama', [LlamaConfig, PretrainedConfig]) + for config_class in cls.MAPPING_NAMES[model_type]: + if config_class is not PretrainedConfig: + model_config_class = config_class + return model_config_class + + assert inspect.isclass(model_class) and issubclass( + model_class, PretrainedModel + ), f"<{model_class}> should be a PretarinedModel class, but <{type(model_class)}>" + + return cls if model_class.config_class is None else model_class.config_class + + @classmethod + def from_file(cls, config_file: str, **kwargs) -> AutoConfig: + """construct configuration with AutoConfig class to enable normal loading + + Args: + config_file (str): the path of config file + + Returns: + AutoConfig: the instance of AutoConfig + """ + with open(config_file, "r", encoding="utf-8") as f: + config = json.load(f) + + config.update(kwargs) + return cls(**config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwargs): + """ + Creates an instance of `AutoConfig`. Related resources are loaded by + specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains processor related resources + and processor config file ("processor_config.json"). + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for processor initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for processor + initialization. + + Returns: + PretrainedConfig: An instance of `PretrainedConfig`. + + + Example: + .. 
code-block:: + from paddlenlp.transformers import AutoConfig + config = AutoConfig.from_pretrained("bert-base-uncased") + config.save_pretrained('./bert-base-uncased') + """ + + if not cls.name2class: + cls.name2class = {} + for model_classes in cls.MAPPING_NAMES.values(): + for model_class in model_classes: + cls.name2class.update( + {model_name: model_class for model_name in model_class.pretrained_init_configuration.keys()} + ) + + # From built-in pretrained models + if pretrained_model_name_or_path in cls.name2class: + return cls.name2class[pretrained_model_name_or_path].from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + + subfolder = kwargs.get("subfolder", "") + if subfolder is None: + subfolder = "" + from_aistudio = kwargs.pop("from_aistudio", False) + from_hf_hub = kwargs.pop("from_hf_hub", False) + cache_dir = kwargs.pop("cache_dir", None) + + config_file = resolve_file_path( + pretrained_model_name_or_path, + [cls.config_file, cls.legacy_config_file], + subfolder, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + ) + if config_file is not None and os.path.exists(config_file): + config_class = cls._get_config_class_from_config(pretrained_model_name_or_path, config_file) + logger.info("We are using %s to load '%s'." % (config_class, pretrained_model_name_or_path)) + if config_class is cls: + return cls.from_file(config_file) + return config_class.from_pretrained(config_file, *model_args, **kwargs) + else: + raise RuntimeError( + f"Can't load config for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant config files.\n" + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/image_processing.py new file mode 100644 index 000000000..8cee48aba --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/image_processing.py @@ -0,0 +1,183 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import importlib +import io +import json +import os +from collections import OrderedDict + +from ...utils.download import resolve_file_path +from ...utils.import_utils import import_module +from ...utils.log import logger + +__all__ = [ + "AutoImageProcessor", +] + +IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( + [ + ("ChineseCLIPImageProcessor", "chineseclip"), + ("CLIPImageProcessor", "clip"), + ("ErnieViLImageProcessor", "ernie_vil"), + ("ViTImageProcessor", "clipseg"), + ] +) + + +def get_configurations(): + MAPPING_NAMES = OrderedDict() + for key, class_name in IMAGE_PROCESSOR_MAPPING_NAMES.items(): + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.image_processing") + processor_name = getattr(import_class, key) + name = tuple(processor_name.pretrained_init_configuration.keys()) + if MAPPING_NAMES.get(name, None) is None: + MAPPING_NAMES[name] = [] + MAPPING_NAMES[name].append(processor_name) + return MAPPING_NAMES + + +class AutoImageProcessor: + """ + AutoClass can help you automatically retrieve the relevant model given the provided + pretrained weights/vocabulary. + AutoImageProcessor is a generic processor class that will be instantiated as one of the + base processor classes when created with the AutoImageProcessor.from_pretrained() classmethod. + """ + + MAPPING_NAMES = get_configurations() + _processor_mapping = MAPPING_NAMES + _name_mapping = IMAGE_PROCESSOR_MAPPING_NAMES + image_processor_config_file = "preprocessor_config.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + ) + + @classmethod + def _get_image_processor_class_from_config(cls, pretrained_model_name_or_path, config_file_path): + with io.open(config_file_path, encoding="utf-8") as f: + init_kwargs = json.load(f) + # class name corresponds to this configuration + init_class = init_kwargs.pop("init_class", None) + if init_class is None: + init_class = init_kwargs.pop("image_processor_type", init_kwargs.pop("feature_extractor_type", None)) + + if init_class: + # replace old name to new name + init_class = init_class.replace("FeatureExtractor", "ImageProcessor") + try: + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.image_processing") + processor_class = getattr(import_class, init_class) + return processor_class + except Exception: + init_class = None + + # If no `init_class`, we use pattern recognition to recognize the processor class. + if init_class is None: + logger.info("We use pattern recognition to recognize the processor class.") + for key, pattern in cls._name_mapping.items(): + if pattern in pretrained_model_name_or_path.lower(): + init_class = key + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.image_processing") + processor_class = getattr(import_class, init_class) + break + return processor_class + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoImageProcessor`. Related resources are loaded by + specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. 
The string can be: + + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains processor related resources + and processor config file ("processor_config.json"). + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for processor initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for processor + initialization. + + Returns: + Pretrainedprocessor: An instance of `Pretrainedprocessor`. + + + Example: + .. code-block:: + from paddlenlp.transformers import AutoImageProcessor + processor = AutoImageProcessor.from_pretrained("openai/clip-vit-base-patch32") + processor.save_pretrained('clip_processor') + """ + cache_dir = kwargs.get("cache_dir", None) + subfolder = kwargs.get("subfolder", "") + if subfolder is None: + subfolder = "" + from_aistudio = kwargs.get("from_aistudio", False) + from_hf_hub = kwargs.get("from_hf_hub", False) + kwargs["subfolder"] = subfolder + kwargs["cache_dir"] = cache_dir + + all_processor_names = [] + for names, processor_class in cls._processor_mapping.items(): + for name in names: + all_processor_names.append(name) + + # From built-in pretrained models + if pretrained_model_name_or_path in all_processor_names: + for names, processor_classes in cls._processor_mapping.items(): + for pattern in names: + if pattern == pretrained_model_name_or_path: + actual_processor_class = processor_classes[0] + logger.info( + "We are using %s to load '%s'." % (actual_processor_class, pretrained_model_name_or_path) + ) + return actual_processor_class.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + + config_file = resolve_file_path( + pretrained_model_name_or_path, + [cls.image_processor_config_file], + subfolder, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + ) + if config_file is not None and os.path.exists(config_file): + processor_class = cls._get_image_processor_class_from_config( + pretrained_model_name_or_path, + config_file, + ) + logger.info(f"We are using {processor_class} to load '{pretrained_model_name_or_path}'.") + return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + raise RuntimeError( + f"Can't load image_processor for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained image_processor,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant image_processor files.\n" + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/modeling.py new file mode 100644 index 000000000..d44d299e5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/modeling.py @@ -0,0 +1,1024 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +import io +import json +import os +from collections import OrderedDict + +from ...utils.download import resolve_file_path +from ...utils.log import logger +from .. import * # noqa +from ..configuration_utils import is_standard_config + +__all__ = [ + "AutoBackbone", + "AutoModel", + "AutoModelForPretraining", + "AutoModelForSequenceClassification", + "AutoModelForTokenClassification", + "AutoModelForQuestionAnswering", + "AutoModelForMultipleChoice", + "AutoModelForMaskedLM", + "AutoModelForCausalLM", + "AutoModelForCausalLMPipe", + "AutoEncoder", + "AutoDecoder", + "AutoGenerator", + "AutoDiscriminator", + "AutoModelForConditionalGeneration", +] + +MAPPING_NAMES = OrderedDict( + [ + # Base model mapping + ("Albert", "albert"), + ("BigBird", "bigbird"), + ("BlenderbotSmall", "blenderbot_small"), + ("Blenderbot", "blenderbot"), + ("ChatGLMv2", "chatglm_v2"), + ("ChatGLM", "chatglm"), + ("ChineseCLIP", "chineseclip"), + ("ChineseBert", "chinesebert"), + ("ConvBert", "convbert"), + ("CTRL", "ctrl"), + ("DistilBert", "distilbert"), + ("DalleBart", "dallebart"), + ("Electra", "electra"), + ("ErnieViL", "ernie_vil"), + ("ErnieCtm", "ernie_ctm"), + ("ErnieDoc", "ernie_doc"), + ("ErnieGen", "ernie_gen"), + ("ErnieGram", "ernie_gram"), + ("ErnieLayout", "ernie_layout"), + ("ErnieM", "ernie_m"), + ("ErnieCode", "ernie_code"), + ("Ernie", "ernie"), + ("FNet", "fnet"), + ("Funnel", "funnel"), + ("Llama", "llama"), + ("LayoutXLM", "layoutxlm"), + ("LayoutLMv2", "layoutlmv2"), + ("LayoutLM", "layoutlm"), + ("Luke", "luke"), + ("MBart", "mbart"), + ("MegatronBert", "megatronbert"), + ("MobileBert", "mobilebert"), + ("MPNet", "mpnet"), + ("NeZha", "nezha"), + ("Nystromformer", "nystromformer"), + ("PPMiniLM", "ppminilm"), + ("ProphetNet", "prophetnet"), + ("Reformer", "reformer"), + ("RemBert", "rembert"), + ("Roberta", "roberta"), + ("RoFormerv2", "roformerv2"), + ("RoFormer", "roformer"), + ("Skep", "skep"), + ("SqueezeBert", "squeezebert"), + ("TinyBert", "tinybert"), + ("UnifiedTransformer", "unified_transformer"), + ("UNIMO", "unimo"), + ("XLNet", "xlnet"), + ("XLM", "xlm"), + ("GPT", "gpt"), + ("GLM", "glm"), + ("MT5", "mt5"), + ("T5", "t5"), + ("Bert", "bert"), + ("Bart", "bart"), + ("GAUAlpha", "gau_alpha"), + ("CodeGen", "codegen"), + ("CLIPVision", "clip"), + ("CLIPText", "clip"), + ("CLIP", "clip"), + ("ChineseCLIPVision", "chineseclip"), + ("ChineseCLIPText", "chineseclip"), + ("ChineseCLIP", "chineseclip"), + ("Artist", "artist"), + ("OPT", "opt"), + ("Pegasus", "pegasus"), + ("DPT", "dpt"), + ("Bit", "bit"), + ("BlipText", "blip"), + ("BlipVision", "blip"), + ("Blip", "blip"), + ("Bloom", "bloom"), + ("QWen", "qwen"), + ("Mistral", "mistral"), + ("Mixtral", "mixtral"), + ("Qwen2", "qwen2"), + ("Qwen2Moe", "qwen2_moe"), + ("Gemma", "gemma"), + ("Yuan", "yuan"), + ("Mamba", "mamba"), + ("Jamba", "jamba"), + ] +) + +MAPPING_TASKS = OrderedDict( + [ + ("Backbone", "AutoBackbone"), + ("Model", "AutoModel"), + ("ForPretraining", "AutoModelForPretraining"), + ("ForSequenceClassification", "AutoModelForSequenceClassification"), + ("ForTokenClassification", 
"AutoModelForTokenClassification"), + ("ForQuestionAnswering", "AutoModelForQuestionAnswering"), + ("ForMultipleChoice", "AutoModelForMultipleChoice"), + ("ForMaskedLM", "AutoModelForMaskedLM"), + ("ForCausalLM", "AutoModelForCausalLM"), + ("ForCausalLMPipe", "AutoModelForCausalLMPipe"), + ("Encoder", "AutoEncoder"), + ("Decoder", "AutoDecoder"), + ("Generator", "AutoGenerator"), + ("Discriminator", "AutoDiscriminator"), + ("ForConditionalGeneration", "AutoModelForConditionalGeneration"), + ] +) + +MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Causal LM mapping + ("opt", "OPTForCausalLM"), + ] +) + + +def get_name_mapping(task="Model"): + """ + Task can be 'Backbone', 'Model', 'ForPretraining', 'ForSequenceClassification', 'ForTokenClassification', + 'ForQuestionAnswering', 'ForMultipleChoice', 'ForMaskedLM', 'ForCausalLM', 'Encoder', 'Decoder', + 'Generator', 'Discriminator', 'ForConditionalGeneration' + """ + NAME_MAPPING = OrderedDict() + for key, value in MAPPING_NAMES.items(): + import_class = key + task + new_key = key + "Model_Import_Class" + NAME_MAPPING[new_key] = import_class + NAME_MAPPING[import_class] = value + + return NAME_MAPPING + + +def get_task_name(model_class): + for key, value in MAPPING_TASKS.items(): + if model_class.endswith(key): + return value + return None + + +def get_init_configurations(): + CONFIGURATION_MODEL_MAPPING = OrderedDict() + for key, class_name in MAPPING_NAMES.items(): + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.modeling") + model_name = getattr(import_class, key + "Model") + if key == "ErnieGen": + name = tuple(model_name.ernie_gen_pretrained_init_configuration.keys()) + else: + name = tuple(model_name.pretrained_init_configuration.keys()) + CONFIGURATION_MODEL_MAPPING[name] = key + "Model" + + return CONFIGURATION_MODEL_MAPPING + + +class _BaseAutoModelClass: + # Base class for auto models. 
+ _pretrained_model_dict = None + _name_mapping = None + _task_choice = False + model_config_file = "config.json" + legacy_model_config_file = "model_config.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + ) + + # TODO: Refactor into AutoConfig when available + @classmethod + def _get_model_class_from_config(cls, pretrained_model_name_or_path, config_file_path, config=None): + if config is None: + with io.open(config_file_path, encoding="utf-8") as f: + config = json.load(f) + + # Get class name corresponds to this configuration + if is_standard_config(config): + architectures = config["architectures"] + init_class = architectures.pop() if len(architectures) > 0 else None + else: + init_class = config.pop("init_class", None) + init_class = init_class[:-5] if init_class is not None and init_class.endswith("Model") else init_class + + # Sort the MAPPING_NAMES to reorder the model class names with longest-first rule + # thus the names with same prefix can be correctly inferred + # such as QWen and QWen2MOE, QWen2MOE is the longest prefix of QWen2MOEModel + model_name = None + SORTED_MAPPING_NAMES = dict(sorted(MAPPING_NAMES.items(), key=lambda x: len(x[0]), reverse=True)) + if init_class: + for model_flag, name in SORTED_MAPPING_NAMES.items(): + if model_flag in init_class: + model_name = model_flag + "Model" + break + else: + # From pretrained_model_name_or_path + for model_flag, name in SORTED_MAPPING_NAMES.items(): + if name in pretrained_model_name_or_path.lower(): + model_name = model_flag + "Model" + break + if model_name is None: + raise AttributeError( + f"Unable to parse 'architectures' or 'init_class' from {config_file_path}. 
Also unable to infer model class from 'pretrained_model_name_or_path'" + ) + init_class = cls._name_mapping[model_name + "_Import_Class"] + class_name = cls._name_mapping[init_class] + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.modeling") + try: + model_class = getattr(import_class, init_class) + return model_class + except AttributeError as err: + try: + new_import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}") + model_class = getattr(new_import_class, init_class) + return model_class + except AttributeError: + logger.error(err) + all_model_classes = import_class.__all__ + all_tasks = {get_task_name(m) for m in all_model_classes if get_task_name(m) is not None} + raise AttributeError( + f"module '{import_class.__name__}' only supports the following classes: " + + ", ".join(m for m in all_model_classes) + + "\n" + "Hint: you can use interface " + + " or ".join(task + ".from_pretrained" for task in all_tasks) + + f" to load '{pretrained_model_name_or_path}'\n" + ) + + @classmethod + def from_config(cls, config, **kwargs): + model_class = cls._get_model_class_from_config(None, None, config) + return model_class._from_config(config, **kwargs) + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args, **kwargs): + if task: + if cls._task_choice: + cls._name_mapping = get_name_mapping(task) + else: + print("We only support task choice for AutoModel.") + cache_dir = kwargs.get("cache_dir", None) + from_aistudio = kwargs.get("from_aistudio", False) + from_hf_hub = kwargs.get("from_hf_hub", False) + subfolder = kwargs.get("subfolder", "") + if subfolder is None: + subfolder = "" + kwargs["cache_dir"] = cache_dir + kwargs["subfolder"] = subfolder + all_model_names = [] + for pretrained_model_names, model_name in cls._pretrained_model_dict.items(): + for name in pretrained_model_names: + all_model_names.append(name) + + # From built-in pretrained models + if pretrained_model_name_or_path in all_model_names: + for pretrained_model_names, model_name in cls._pretrained_model_dict.items(): + # From built-in pretrained models + for pattern in pretrained_model_names: + if pattern == pretrained_model_name_or_path: + init_class = cls._name_mapping[model_name + "_Import_Class"] + class_name = cls._name_mapping[init_class] + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.modeling") + try: + model_class = getattr(import_class, init_class) + except AttributeError as err: + try: + import_class2 = importlib.import_module(f"paddlenlp.transformers.{class_name}") + model_class = getattr(import_class2, init_class) + except AttributeError: + logger.error(err) + all_model_classes = import_class.__all__ + all_tasks = { + get_task_name(m) for m in all_model_classes if get_task_name(m) is not None + } + raise AttributeError( + f"module '{import_class.__name__}' only supports the following classes: " + + ", ".join(m for m in all_model_classes) + + "\n" + "Hint: you can use interface " + + " or ".join(task + ".from_pretrained" for task in all_tasks) + + f" to load '{pretrained_model_name_or_path}'\n" + ) + logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.") + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + config_file = resolve_file_path( + pretrained_model_name_or_path, + [cls.model_config_file, cls.legacy_model_config_file], + subfolder, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, 
+ ) + if config_file is not None and os.path.exists(config_file): + model_class = cls._get_model_class_from_config(pretrained_model_name_or_path, config_file) + logger.info(f"We are using {model_class} to load '{pretrained_model_name_or_path}'.") + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + raise RuntimeError( + f"Can't load model for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant model files.\n" + ) + + +class AutoBackbone(_BaseAutoModelClass): + """ + AutoBackbone. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("Backbone") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoBackbone`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoBackbone`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoBackbone + + # Name of built-in pretrained model + model = AutoBackbone.from_pretrained("google/bit-50") + print(type(model)) + # + + + # Load from local directory path + model = AutoBackbone.from_pretrained("./bit-50") + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModel(_BaseAutoModelClass): + """ + AutoClass can help you automatically retrieve the relevant model given the provided + pretrained weights/vocabulary. + AutoModel is a generic model class that will be instantiated as one of the base model classes + when created with the from_pretrained() classmethod. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("Model") + _task_choice = True + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, task=None, *model_args, **kwargs): + """ + Creates an instance of `AutoModel`. Model weights are loaded + by specifying name of a built-in pretrained model, a pretrained model on HF, a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of a built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains model weights file("model_state.pdparams") + and model config file ("model_config.json"). + task (str): Specify a downstream task. Task can be 'Model', 'ForPretraining', + 'ForSequenceClassification', 'ForTokenClassification', 'ForQuestionAnswering', + 'ForMultipleChoice', 'ForMaskedLM', 'ForCausalLM', 'Encoder', 'Decoder', + 'Generator', 'Discriminator', 'ForConditionalGeneration'. + We only support specify downstream tasks in AutoModel. Defaults to `None`. + *args (tuple): Position arguments for model `__init__`. 
If provided, + use these as position argument values for model initialization. + **kwargs (dict): Keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for model + initialization. If the keyword is in `__init__` argument names of + base model, update argument values of the base model; else update + argument values of derived model. + + Returns: + PretrainedModel: An instance of `AutoModel`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModel + + # Name of built-in pretrained model + model = AutoModel.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModel.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned') + print(type(model)) + # + + # Load from local directory path + model = AutoModel.from_pretrained('./my_bert/') + print(type(model)) + # + + # choose task + model = AutoModel.from_pretrained('bert-base-uncased', task='ForPretraining') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, task, *model_args, **kwargs) + + +class AutoModelForPretraining(_BaseAutoModelClass): + """ + AutoModelForPretraining. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForPretraining") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForPretraining`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForPretraining`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModelForPretraining + + # Name of built-in pretrained model + model = AutoModelForPretraining.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForPretraining.from_pretrained('iverxin/bert-base-japanese') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForPretraining.from_pretrained('./my_bert/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForSequenceClassification(_BaseAutoModelClass): + """ + AutoModelForSequenceClassification. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForSequenceClassification") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForSequenceClassification`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForSequenceClassification`. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import AutoModelForSequenceClassification + + # Name of built-in pretrained model + model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForSequenceClassification.from_pretrained('iverxin/bert-base-japanese') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForSequenceClassification.from_pretrained('./my_bert/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForTokenClassification(_BaseAutoModelClass): + """ + AutoModelForTokenClassification. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForTokenClassification") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForTokenClassification`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForTokenClassification`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModelForTokenClassification + + # Name of built-in pretrained model + model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForTokenClassification.from_pretrained('iverxin/bert-base-japanese') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForTokenClassification.from_pretrained('./my_bert/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForQuestionAnswering(_BaseAutoModelClass): + """ + AutoModelForQuestionAnswering. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForQuestionAnswering") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForQuestionAnswering`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForQuestionAnswering`. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import AutoModelForQuestionAnswering + + # Name of built-in pretrained model + model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForQuestionAnswering.from_pretrained('iverxin/bert-base-japanese') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForQuestionAnswering.from_pretrained('./my_bert/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForMultipleChoice(_BaseAutoModelClass): + """ + AutoModelForMultipleChoice. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForMultipleChoice") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForMultipleChoice`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForMultipleChoice`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModelForMultipleChoice + + # Name of built-in pretrained model + model = AutoModelForMultipleChoice.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForMultipleChoice.from_pretrained('iverxin/bert-base-japanese') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForMultipleChoice.from_pretrained('./my_bert/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForMaskedLM(_BaseAutoModelClass): + """ + AutoModelForMaskedLM. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForMaskedLM") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForMaskedLM`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForMaskedLM`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModelForMaskedLM + + # Name of built-in pretrained model + model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForMaskedLM.from_pretrained('iverxin/bert-base-japanese') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForMaskedLM.from_pretrained('./my_bert/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForCausalLM(_BaseAutoModelClass): + """ + AutoModelForCausalLM. 
+ """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForCausalLM") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForCausalLM`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForCausalLM`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModelForCausalLM + + # Name of built-in pretrained model + model = AutoModelForCausalLM.from_pretrained('gpt2-en') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoModelForCausalLM.from_pretrained('junnyu/distilgpt2') + print(type(model)) + # + + # Load from local directory path + model = AutoModelForCausalLM.from_pretrained('./my_gpt/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForCausalLMPipe(_BaseAutoModelClass): + """ + Pipeline model for AutoModelForCausalLM. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForCausalLMPipe") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoEncoder(_BaseAutoModelClass): + """ + AutoEncoder. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("Encoder") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoEncoder`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoEncoder`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoEncoder + + # Name of built-in pretrained model + model = AutoEncoder.from_pretrained('bart-base',vocab_size=20000) + print(type(model)) + # + + # Load from local directory path + model = AutoEncoder.from_pretrained('./my_bart/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoDecoder(_BaseAutoModelClass): + """ + AutoDecoder. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("Decoder") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoDecoder`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. 
+ **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoDecoder`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoDecoder + + # Name of built-in pretrained model + model = AutoDecoder.from_pretrained('bart-base', vocab_size=20000) + print(type(model)) + # + + # Load from local directory path + model = AutoDecoder.from_pretrained('./my_bart/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoGenerator(_BaseAutoModelClass): + """ + AutoGenerator. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("Generator") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoGenerator`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoGenerator`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoGenerator + + # Name of built-in pretrained model + model = AutoGenerator.from_pretrained('electra-small') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoGenerator.from_pretrained('junnyu/hfl-chinese-legal-electra-small-generator') + print(type(model)) + # + + # Load from local directory path + model = AutoGenerator.from_pretrained('./my_electra/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoDiscriminator(_BaseAutoModelClass): + """ + AutoDiscriminator. + """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("Discriminator") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoDiscriminator`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoDiscriminator`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoDiscriminator + + # Name of built-in pretrained model + model = AutoDiscriminator.from_pretrained('electra-small') + print(type(model)) + # + + # Name of community-contributed pretrained model + model = AutoDiscriminator.from_pretrained('junnyu/hfl-chinese-legal-electra-small-generator') + print(type(model)) + # + + # Load from local directory path + model = AutoDiscriminator.from_pretrained('./my_electra/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +class AutoModelForConditionalGeneration(_BaseAutoModelClass): + """ + AutoModelForConditionalGeneration. 
+ """ + + CONFIGURATION_MODEL_MAPPING = get_init_configurations() + _pretrained_model_dict = CONFIGURATION_MODEL_MAPPING + _name_mapping = get_name_mapping("ForConditionalGeneration") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoModelForConditionalGeneration`. Model weights are loaded + by specifying name of a built-in pretrained model, or a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): See :class:`AutoModel`. + *args (tuple): See :class:`AutoModel`. + **kwargs (dict): See :class:`AutoModel`. + + Returns: + PretrainedModel: An instance of `AutoModelForConditionalGeneration`. + + Example: + .. code-block:: + + from paddlenlp.transformers import AutoModelForConditionalGeneration + + # Name of built-in pretrained model + model = AutoModelForConditionalGeneration.from_pretrained('bart-base') + print(type(model)) + # + + + # Load from local directory path + model = AutoModelForConditionalGeneration.from_pretrained('./my_bart/') + print(type(model)) + # + """ + return cls._from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/processing.py new file mode 100644 index 000000000..29820de72 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/processing.py @@ -0,0 +1,193 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +import io +import json +import os +from collections import OrderedDict + +from ...utils.download import resolve_file_path +from ...utils.import_utils import import_module +from ...utils.log import logger + +__all__ = [ + "AutoProcessor", +] + +PROCESSOR_MAPPING_NAMES = OrderedDict( + [ + ("ChineseCLIPProcessor", "chineseclip"), + ("CLIPProcessor", "clip"), + ("ErnieViLProcessor", "ernie_vil"), + ("CLIPSegProcessor", "clipseg"), + ("SpeechT5Processor", "speecht5"), + ("ClapProcessor", "clap"), + ] +) + + +def get_configurations(): + MAPPING_NAMES = OrderedDict() + for key, class_name in PROCESSOR_MAPPING_NAMES.items(): + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.processing") + processor_name = getattr(import_class, key) + name = tuple(processor_name.pretrained_init_configuration.keys()) + if MAPPING_NAMES.get(name, None) is None: + MAPPING_NAMES[name] = [] + MAPPING_NAMES[name].append(processor_name) + return MAPPING_NAMES + + +class AutoProcessor: + """ + AutoClass can help you automatically retrieve the relevant model given the provided + pretrained weights/vocabulary. 
+ Autoprocessor is a generic processor class that will be instantiated as one of the + base processor classes when created with the Autoprocessor.from_pretrained() classmethod. + """ + + MAPPING_NAMES = get_configurations() + _processor_mapping = MAPPING_NAMES + _name_mapping = PROCESSOR_MAPPING_NAMES + processor_config_file = "preprocessor_config.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + ) + + @classmethod + def _get_processor_class_from_config(cls, pretrained_model_name_or_path, config_file_path): + with io.open(config_file_path, encoding="utf-8") as f: + init_kwargs = json.load(f) + # class name corresponds to this configuration + init_class = init_kwargs.pop("init_class", None) + if init_class is None: + init_class = init_kwargs.pop("processor_class", None) + if init_class is None: + init_class = init_kwargs.pop("image_processor_type", None) + # replace old name to new name + if init_class is not None and init_class.endswith("ImageProcessor"): + init_class = init_class.replace("ImageProcessor", "Processor") + if init_class is None: + init_class = init_kwargs.pop("feature_extractor_type", None) + # replace old name to new name + if init_class is not None and init_class.endswith("FeatureExtractor"): + init_class = init_class.replace("FeatureExtractor", "Processor") + + if init_class: + try: + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.processing") + processor_class = getattr(import_class, init_class) + return processor_class + except Exception: + init_class = None + + # If no `init_class`, we use pattern recognition to recognize the processor class. + if init_class is None: + logger.info("We use pattern recognition to recognize the processor class.") + for key, pattern in cls._name_mapping.items(): + if pattern in pretrained_model_name_or_path.lower(): + init_class = key + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.processor") + processor_class = getattr(import_class, init_class) + break + return processor_class + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `Autoprocessor`. Related resources are loaded by + specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains processor related resources + and processor config file ("processor_config.json"). + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for processor initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for processor + initialization. + + Returns: + Pretrainedprocessor: An instance of `Pretrainedprocessor`. + + + Example: + .. 
code-block:: + from paddlenlp.transformers import AutoProcessor + processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32") + processor.save_pretrained('clip_processor') + """ + cache_dir = kwargs.get("cache_dir", None) + subfolder = kwargs.get("subfolder", "") + if subfolder is None: + subfolder = "" + from_aistudio = kwargs.get("from_aistudio", False) + from_hf_hub = kwargs.get("from_hf_hub", False) + kwargs["subfolder"] = subfolder + kwargs["cache_dir"] = cache_dir + + all_processor_names = [] + for names, processor_class in cls._processor_mapping.items(): + for name in names: + all_processor_names.append(name) + + # From built-in pretrained models + if pretrained_model_name_or_path in all_processor_names: + for names, processor_classes in cls._processor_mapping.items(): + for pattern in names: + if pattern == pretrained_model_name_or_path: + actual_processor_class = processor_classes[0] + logger.info( + "We are using %s to load '%s'." % (actual_processor_class, pretrained_model_name_or_path) + ) + return actual_processor_class.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + + config_file = resolve_file_path( + pretrained_model_name_or_path, + [cls.processor_config_file], + subfolder, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + ) + if config_file is not None and os.path.exists(config_file): + processor_class = cls._get_processor_class_from_config( + pretrained_model_name_or_path, + config_file, + ) + logger.info(f"We are using {processor_class} to load '{pretrained_model_name_or_path}'.") + return processor_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + raise RuntimeError( + f"Can't load processor for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained processor,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant processor files.\n" + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/tokenizer.py new file mode 100644 index 000000000..46efa4efb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/auto/tokenizer.py @@ -0,0 +1,269 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
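+
+# Orientation comment (editorial, hedged): the AutoTokenizer defined in this module
+# resolves a concrete tokenizer class by first matching the requested name against the
+# pretrained_init_configuration entries of the tokenizers listed in TOKENIZER_MAPPING_NAMES,
+# and otherwise by reading "init_class"/"tokenizer_class" from the tokenizer_config.json
+# located via resolve_file_path. Illustrative use only:
+#
+#     from paddlenlp.transformers import AutoTokenizer
+#     tok = AutoTokenizer.from_pretrained("bert-base-uncased")   # -> BertTokenizer
+#     tok.save_pretrained("./my_bert/")
+#     tok = AutoTokenizer.from_pretrained("./my_bert/")          # re-resolved from tokenizer_config.json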
+import importlib +import io +import json +import os +from collections import OrderedDict + +from ...utils.download import resolve_file_path +from ...utils.import_utils import import_module +from ...utils.log import logger + +__all__ = [ + "AutoTokenizer", +] + +TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ("AlbertEnglishTokenizer", "albert"), + ("AlbertChineseTokenizer", "albert"), + ("BertJapaneseTokenizer", "bert_japanese"), + ("BigBirdTokenizer", "bigbird"), + ("BlenderbotSmallTokenizer", "blenderbot_small"), + ("BlenderbotTokenizer", "blenderbot"), + ("ChatGLMTokenizer", "chatglm"), + ("ChatGLMv2Tokenizer", "chatglm_v2"), + ("ChineseBertTokenizer", "chinesebert"), + ("ConvBertTokenizer", "convbert"), + ("CTRLTokenizer", "ctrl"), + ("DalleBartTokenizer", "dallebart"), + ("DistilBertTokenizer", "distilbert"), + ("ElectraTokenizer", "electra"), + ("ErnieCtmTokenizer", "ernie_ctm"), + ("ErnieDocTokenizer", "ernie_doc"), + ("ErnieDocBPETokenizer", "ernie_doc"), + ("ErnieGramTokenizer", "ernie_gram"), + ("ErnieLayoutTokenizer", "ernie_layout"), + ("ErnieMTokenizer", "ernie_m"), + ("ErnieCodeTokenizer", "ernie_code"), + ("ErnieTokenizer", "ernie"), + ("FNetTokenizer", "fnet"), + ("FunnelTokenizer", "funnel"), + ("LlamaTokenizer", "llama"), + ("LayoutXLMTokenizer", "layoutxlm"), + ("LayoutLMv2Tokenizer", "layoutlmv2"), + ("LayoutLMTokenizer", "layoutlm"), + ("LukeTokenizer", "luke"), + ("MBartTokenizer", "mbart"), + ("MBart50Tokenizer", "mbart"), + ("MegatronBertTokenizer", "megatronbert"), + ("MobileBertTokenizer", "mobilebert"), + ("MPNetTokenizer", "mpnet"), + ("NeZhaTokenizer", "nezha"), + ("NystromformerTokenizer", "nystromformer"), + ("PPMiniLMTokenizer", "ppminilm"), + ("ProphetNetTokenizer", "prophetnet"), + ("ReformerTokenizer", "reformer"), + ("RemBertTokenizer", "rembert"), + ("RobertaChineseTokenizer", "roberta"), + ("RobertaBPETokenizer", "roberta"), + ("RoFormerTokenizer", "roformer"), + ("RoFormerv2Tokenizer", "roformerv2"), + ("SkepTokenizer", "skep"), + ("SqueezeBertTokenizer", "squeezebert"), + ("TinyBertTokenizer", "tinybert"), + ("UnifiedTransformerTokenizer", "unified_transformer"), + ("UNIMOTokenizer", "unimo"), + ("XLNetTokenizer", "xlnet"), + ("XLMTokenizer", "xlm"), + ("GPTTokenizer", "gpt"), + ("GPTChineseTokenizer", "gpt"), + ("T5Tokenizer", "t5"), + ("BertTokenizer", "bert"), + ("BartTokenizer", "bart"), + ("GAUAlphaTokenizer", "gau_alpha"), + ("CodeGenTokenizer", "codegen"), + ("CLIPTokenizer", "clip"), + ("ArtistTokenizer", "artist"), + ("ChineseCLIPTokenizer", "chineseclip"), + ("ErnieViLTokenizer", "ernie_vil"), + ("PegasusChineseTokenizer", "pegasus"), + ("GLMBertTokenizer", "glm"), + ("GLMChineseTokenizer", "glm"), + ("GLMGPT2Tokenizer", "glm"), + ("BloomTokenizer", "bloom"), + ("SpeechT5Tokenizer", "speecht5"), + ("QWenTokenizer", "qwen"), + ("GemmaTokenizer", "gemma"), + ("YuanTokenizer", "yuan"), + ("MambaTokenizer", "mamba"), + ("JambaTokenizer", "jamba"), + ] +) + + +def get_configurations(): + MAPPING_NAMES = OrderedDict() + for key, class_name in TOKENIZER_MAPPING_NAMES.items(): + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_name = getattr(import_class, key) + name = tuple(tokenizer_name.pretrained_init_configuration.keys()) + MAPPING_NAMES[name] = tokenizer_name + return MAPPING_NAMES + + +class AutoTokenizer: + """ + AutoClass can help you automatically retrieve the relevant model given the provided + pretrained weights/vocabulary. 
+ AutoTokenizer is a generic tokenizer class that will be instantiated as one of the + base tokenizer classes when created with the AutoTokenizer.from_pretrained() classmethod. + """ + + MAPPING_NAMES = get_configurations() + _tokenizer_mapping = MAPPING_NAMES + _name_mapping = TOKENIZER_MAPPING_NAMES + tokenizer_config_file = "tokenizer_config.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + ) + + @classmethod + def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_file_path, use_fast=None): + if use_fast is not None: + raise ValueError("use_fast is deprecated") + with io.open(config_file_path, encoding="utf-8") as f: + init_kwargs = json.load(f) + # class name corresponds to this configuration + init_class = init_kwargs.pop("init_class", None) + if init_class is None: + init_class = init_kwargs.pop("tokenizer_class", None) + + if init_class: + if init_class in cls._name_mapping: + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_class = None + try: + if tokenizer_class is None: + tokenizer_class = getattr(import_class, init_class) + except: + raise ValueError(f"Tokenizer class {init_class} is not currently imported.") + return tokenizer_class + else: + import_class = import_module("paddlenlp.transformers") + tokenizer_class = getattr(import_class, init_class, None) + assert tokenizer_class is not None, f"Can't find tokenizer {init_class}" + return tokenizer_class + + # If no `init_class`, we use pattern recognition to recognize the tokenizer class. + else: + # TODO: Potential issue https://github.com/PaddlePaddle/PaddleNLP/pull/3786#discussion_r1024689810 + logger.info("We use pattern recognition to recognize the Tokenizer class.") + for key, pattern in cls._name_mapping.items(): + if pattern in pretrained_model_name_or_path.lower(): + init_class = key + class_name = cls._name_mapping[init_class] + import_class = import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_class = getattr(import_class, init_class) + break + return tokenizer_class + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + """ + Creates an instance of `AutoTokenizer`. Related resources are loaded by + specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains tokenizer related resources + and tokenizer config file ("tokenizer_config.json"). + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for tokenizer initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for tokenizer + initialization. + + Returns: + PretrainedTokenizer: An instance of `PretrainedTokenizer`. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import AutoTokenizer + + # Name of built-in pretrained model + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') + print(type(tokenizer)) + # + + # Name of community-contributed pretrained model + tokenizer = AutoTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned') + print(type(tokenizer)) + # + + # Load from local directory path + tokenizer = AutoTokenizer.from_pretrained('./my_bert/') + print(type(tokenizer)) + # + """ + # Default not to use fast tokenizer + use_faster = kwargs.pop("use_faster", None) + use_fast = kwargs.pop("use_fast", None) + if use_fast is not None or use_faster is not None: + raise ValueError("use_fast is deprecated") + + cache_dir = kwargs.get("cache_dir", None) + subfolder = kwargs.get("subfolder", "") + if subfolder is None: + subfolder = "" + from_aistudio = kwargs.get("from_aistudio", False) + from_hf_hub = kwargs.get("from_hf_hub", False) + + all_tokenizer_names = [] + for names, tokenizer_class in cls._tokenizer_mapping.items(): + for name in names: + all_tokenizer_names.append(name) + + # From built-in pretrained models + if pretrained_model_name_or_path in all_tokenizer_names: + for names, tokenizer_class in cls._tokenizer_mapping.items(): + for pattern in names: + if pattern == pretrained_model_name_or_path: + logger.info("We are using %s to load '%s'." % (tokenizer_class, pretrained_model_name_or_path)) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + config_file = resolve_file_path( + pretrained_model_name_or_path, + cls.tokenizer_config_file, + subfolder, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + ) + if config_file is not None and os.path.exists(config_file): + tokenizer_class = cls._get_tokenizer_class_from_config( + pretrained_model_name_or_path, config_file, use_fast + ) + logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + raise RuntimeError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant tokenizer files.\n" + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
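+
+# Editorial note (hedged, illustrative only): the configuration and modeling files added
+# below provide BartConfig and the BART model classes. BartConfig's defaults mirror the
+# "bart-base" entry of BART_PRETRAINED_INIT_CONFIGURATION (d_model=768, 6 encoder and
+# 6 decoder layers), and its attribute_map exposes aliases such as
+# num_encoder_layers -> encoder_layers. A minimal sketch:
+#
+#     from paddlenlp.transformers import BartConfig, BartModel
+#     config = BartConfig()                      # bart-base style defaults
+#     assert config.num_encoder_layers == config.encoder_layers
+#     model = BartModel(config)                  # randomly initialized weights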
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/configuration.py new file mode 100644 index 000000000..3326c2111 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/configuration.py @@ -0,0 +1,197 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Bart model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +from ...utils.log import logger + +__all__ = ["BART_PRETRAINED_INIT_CONFIGURATION", "BartConfig", "BART_PRETRAINED_RESOURCE_FILES_MAP"] + +BART_PRETRAINED_INIT_CONFIGURATION = { + "bart-base": { + "vocab_size": 50265, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "forced_eos_token_id": 2, + "decoder_start_token_id": 2, + "d_model": 768, + "num_encoder_layers": 6, + "num_decoder_layers": 6, + "encoder_attention_heads": 12, + "decoder_attention_heads": 12, + "encoder_ffn_dim": 3072, + "decoder_ffn_dim": 3072, + "dropout": 0.1, + "activation_function": "gelu", + "attention_dropout": 0.1, + "activation_dropout": 0.1, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": False, + }, + "bart-large": { + "vocab_size": 50265, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "forced_eos_token_id": 2, + "decoder_start_token_id": 2, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.1, + "activation_function": "gelu", + "attention_dropout": 0.1, + "activation_dropout": 0.1, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": False, + }, +} + +BART_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "bart-base": "https://bj.bcebos.com/paddlenlp/models/transformers/bart/bart-base.pdparams", + "bart-large": "https://bj.bcebos.com/paddlenlp/models/transformers/bart/bart-large.pdparams", + } +} + + +class BartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BartModel`]. It is used to instantiate a BART + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BART bart-base architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, optional): + Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BartModel`] or [`TFBartModel`]. Default to 50265. 
+ d_model (`int`, optional): + Dimensionality of the layers and the pooler layer. Default to 1024 + encoder_layers (`int`, optional): + Number of encoder layers. Default to 6. + decoder_layers (`int`, optional): + Number of decoder layers. Default to 6. + encoder_attention_heads (`int`, optional): + Number of attention heads for each attention layer in the Transformer encoder. Default to 12. + decoder_attention_heads (`int`, optional): + Number of attention heads for each attention layer in the Transformer decoder. Default to 12. + decoder_ffn_dim (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. Default to 3072. + encoder_ffn_dim (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. Default to 3072. + activation_function (`str` or `function`, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. + Default to `"gelu"`. + dropout (`float`, optional): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. Default to 0.1. + attention_dropout (`float`, optional): + The dropout ratio for the attention probabilities. Default to 0.1. + activation_dropout (`float`, optional): + The dropout ratio for activations inside the fully connected layer. Default to 0.1. + max_position_embeddings (`int`, optional): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). Default to 1024. + init_std (`float`, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Default to 0.02. + num_labels (`int`, optional): + The number of labels to use in [`BartForSequenceClassification`]. Default to 3. + forced_eos_token_id (`int`, optional): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. Default to 2. + scale_embedding (`bool`, optional): + Scale embeddings by diving by sqrt(d_model). Default to `False`. 
+ + """ + model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map: Dict[str, str] = { + "num_encoder_layers": "encoder_layers", + "num_decoder_layers": "decoder_layers", + "num_classes": "num_labels", + } + pretrained_init_configuration = BART_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 50265, + max_position_embeddings: int = 1024, + encoder_layers: int = 6, + encoder_ffn_dim: int = 3072, + encoder_attention_heads: int = 12, + decoder_layers: int = 6, + decoder_ffn_dim: int = 3072, + decoder_attention_heads: int = 12, + activation_function: str = "gelu", + d_model: int = 768, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + init_std: float = 0.02, + pad_token_id: int = 1, + bos_token_id: int = 0, + eos_token_id: int = 2, + is_encoder_decoder: bool = True, + decoder_start_token_id: int = 2, + forced_eos_token_id: int = 2, + scale_embedding: bool = False, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding + + # ensure backward compatibility for BART CNN models + if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): + self.forced_bos_token_id = self.bos_token_id + logger.warning( + f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. " + "The config can simply be saved and uploaded again to be fixed." + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/modeling.py new file mode 100644 index 000000000..7b62163d6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/modeling.py @@ -0,0 +1,1407 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Embedding, Layer, MultiHeadAttention + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + ModelOutput, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + convert_encoder_output, +) +from .configuration import ( + BART_PRETRAINED_INIT_CONFIGURATION, + BART_PRETRAINED_RESOURCE_FILES_MAP, + BartConfig, +) + +__all__ = [ + "BartModel", + "BartPretrainedModel", + "BartEncoder", + "BartDecoder", + "BartClassificationHead", + "BartForSequenceClassification", + "BartForQuestionAnswering", + "BartForConditionalGeneration", +] + +Cache = MultiHeadAttention.Cache +StaticCache = MultiHeadAttention.StaticCache + + +def shift_tokens_right(input_ids, decoder_start_token_id): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + return shifted_input_ids + + +class BartPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Bart models. It provides Bart related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = BART_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = BART_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "bart" + config_class = BartConfig + + @classmethod + def _get_name_mappings(cls, config: BartConfig) -> List[StateDictNameMapping]: + model_mappings = [ + "shared.weight", + ] + + num_encoder_layers = config.num_encoder_layers or 0 + num_decoder_layers = config.num_decoder_layers or 0 + + if num_encoder_layers: + encoder_mappings = [ + ["encoder.embed_positions.weight", "encoder.encoder_embed_positions.weight"], + ["encoder.layernorm_embedding.weight", "encoder.encoder_layernorm_embedding.weight"], + ["encoder.layernorm_embedding.bias", "encoder.encoder_layernorm_embedding.bias"], + ] + + model_mappings.extend(encoder_mappings) + + for layer_index in range(num_encoder_layers): + encoder_mappings = [ + [ + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + f"encoder.encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + f"encoder.encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + f"encoder.encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + f"encoder.encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + f"encoder.encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + f"encoder.encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + f"encoder.encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + f"encoder.encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layers.{layer_index}.fc1.weight", + f"encoder.encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.fc1.bias", + f"encoder.encoder.layers.{layer_index}.linear1.bias", + ], + [ + f"encoder.layers.{layer_index}.fc2.weight", + f"encoder.encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.fc2.bias", + f"encoder.encoder.layers.{layer_index}.linear2.bias", + ], + [ + f"encoder.layers.{layer_index}.self_attn_layer_norm.weight", + f"encoder.encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layers.{layer_index}.self_attn_layer_norm.bias", + f"encoder.encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layers.{layer_index}.final_layer_norm.weight", + f"encoder.encoder.layers.{layer_index}.norm2.weight", + ], + [ + f"encoder.layers.{layer_index}.final_layer_norm.bias", + f"encoder.encoder.layers.{layer_index}.norm2.bias", + ], + ] + + model_mappings.extend(encoder_mappings) + + if num_decoder_layers: + decoder_mappings = [ + ["decoder.embed_positions.weight", "decoder.decoder_embed_positions.weight"], + ["decoder.layernorm_embedding.weight", "decoder.decoder_layernorm_embedding.weight"], + ["decoder.layernorm_embedding.bias", "decoder.decoder_layernorm_embedding.bias"], + ] + + model_mappings.extend(decoder_mappings) + + for layer_index in range(num_decoder_layers): + decoder_mappings = [ + [ + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + 
f"decoder.decoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + f"decoder.decoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + f"decoder.decoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + f"decoder.decoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + f"decoder.decoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + f"decoder.decoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.out_proj.weight", + f"decoder.decoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.out_proj.bias", + f"decoder.decoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.k_proj.weight", + f"decoder.decoder.layers.{layer_index}.cross_attn.k_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.k_proj.bias", + f"decoder.decoder.layers.{layer_index}.cross_attn.k_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.v_proj.weight", + f"decoder.decoder.layers.{layer_index}.cross_attn.v_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.v_proj.bias", + f"decoder.decoder.layers.{layer_index}.cross_attn.v_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.q_proj.weight", + f"decoder.decoder.layers.{layer_index}.cross_attn.q_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.q_proj.bias", + f"decoder.decoder.layers.{layer_index}.cross_attn.q_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.out_proj.weight", + f"decoder.decoder.layers.{layer_index}.cross_attn.out_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn.out_proj.bias", + f"decoder.decoder.layers.{layer_index}.cross_attn.out_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.fc1.weight", + f"decoder.decoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.fc1.bias", + f"decoder.decoder.layers.{layer_index}.linear1.bias", + ], + [ + f"decoder.layers.{layer_index}.fc2.weight", + f"decoder.decoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.fc2.bias", + f"decoder.decoder.layers.{layer_index}.linear2.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn_layer_norm.weight", + f"decoder.decoder.layers.{layer_index}.norm1.weight", + ], + [ + f"decoder.layers.{layer_index}.self_attn_layer_norm.bias", + f"decoder.decoder.layers.{layer_index}.norm1.bias", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn_layer_norm.weight", + f"decoder.decoder.layers.{layer_index}.norm2.weight", + ], + [ + f"decoder.layers.{layer_index}.encoder_attn_layer_norm.bias", + f"decoder.decoder.layers.{layer_index}.norm2.bias", + ], + [ + f"decoder.layers.{layer_index}.final_layer_norm.weight", + f"decoder.decoder.layers.{layer_index}.norm3.weight", + ], + [ + f"decoder.layers.{layer_index}.final_layer_norm.bias", + f"decoder.decoder.layers.{layer_index}.norm3.bias", + ], + ] + + model_mappings.extend(decoder_mappings) + + init_name_mappings(model_mappings) + + # 
base-model prefix "BartModel" + if "BartModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "bart." + mapping[1] + + if "BartForQuestionAnswering" in config.architectures: + model_mappings.extend( + [ + ["qa_outputs.weight", "classifier.weight", "transpose"], + ["qa_outputs.bias", "classifier.bias"], + ] + ) + + if "BartForSequenceClassification" in config.architectures: + model_mappings.extend( + [ + ["classification_head.dense.weight", "classifier.dense.weight", "transpose"], + ["classification_head.dense.bias", "classifier.dense.bias"], + ["classification_head.out_proj.weight", "classifier.out_proj.weight", "transpose"], + ["classification_head.out_proj.bias", "classifier.out_proj.bias"], + ] + ) + + if "BartForConditionalGeneration" in config.architectures: + model_mappings.extend( + [ + ["lm_head.weight", "lm_head_weight"], + ["final_logits_bias", "final_logits_bias"], + ] + ) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + + +class BartLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings, embedding_dim): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models dont have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape: Tuple, past_key_values_length: int = 0) -> Tensor: + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = paddle.arange(past_key_values_length, past_key_values_length + seq_len, dtype="int64") + # (gongenlei) For dygraph to static graph + return Embedding.forward(self, positions + self.offset) + + +class BartEncoder(BartPretrainedModel): + """ + The Transformer Encoder of BartModel. The arguments of BartEncoder can see :class:`BartModel`. 
+ """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + + self.embed_scale = (config.d_model**0.5) if config.scale_embedding else 1.0 + self.encoder_embed_positions = BartLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model) + + self.encoder_dropout = nn.Dropout(config.dropout) + self.encoder_layernorm_embedding = nn.LayerNorm(config.d_model) + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.d_model, + nhead=config.encoder_attention_heads, + dim_feedforward=config.encoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ) -> Union[Tensor, Tuple, BaseModelOutputWithPastAndCrossAttentions]: + """ + The BartEncoder forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + See :class:`BartModel`. + attention_mask (Tensor, optional): + See :class:`BartModel`. + inputs_embeds (Tensor, optional): + See :class:`BartModel`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `encoder_outputs` which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, d_model]. 
+ + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + inputs_shape = input_ids.shape + input_ids = input_ids.reshape((-1, inputs_shape[-1])) + elif inputs_embeds is not None: + inputs_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + inputs_embed_pos = self.encoder_embed_positions(inputs_shape) + hidden_states = inputs_embeds + inputs_embed_pos + hidden_states = self.encoder_layernorm_embedding(hidden_states) + encoder_input = self.encoder_dropout(hidden_states) + + if attention_mask is None and input_ids is not None: + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + encoder_output = self.encoder( + encoder_input, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return encoder_output + + +class BartDecoder(BartPretrainedModel): + """ + The Transformer Decoder of BartModel. The arguments of BartDecoder can see :class:`BartModel`. + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.init_std = config.init_std + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + + self.embed_scale = (config.d_model**0.5) if config.scale_embedding else 1.0 + self.decoder_embed_positions = BartLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model) + self.decoder_dropout = nn.Dropout(config.dropout) + self.decoder_layernorm_embedding = nn.LayerNorm(config.d_model) + + decoder_layer = nn.TransformerDecoderLayer( + d_model=config.d_model, + nhead=config.decoder_attention_heads, + dim_feedforward=config.decoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + ) + self.decoder = nn.TransformerDecoder(decoder_layer, config.decoder_layers) + + def forward( + self, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + memory_mask: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tensor, Tuple, BaseModelOutputWithPastAndCrossAttentions]: + """ + The BartDecoder forward method, overrides the `__call__()` special method. 
+ + Args: + decoder_input_ids (Tensor, optional): + See :class:`BartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`BartModel`. + encoder_output (Tensor, optional): + See :class:`BartModel`. + memory_mask (Tensor, optional): + See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. + cache (Tensor, optional): + See :class:`BartModel`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_outputs` which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, d_model]. + + """ + # retrieve input_ids and inputs_embeds + if decoder_input_ids is not None and decoder_inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif decoder_input_ids is not None: + inputs_shape = decoder_input_ids.shape + decoder_input_ids = decoder_input_ids.reshape((-1, inputs_shape[-1])) + elif decoder_inputs_embeds is not None: + inputs_shape = decoder_inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if decoder_attention_mask is None: + decoder_length = inputs_shape[-1] + decoder_attention_mask = paddle.tensor.triu( + (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 + ) + + if decoder_inputs_embeds is None: + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale + + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.decoder_embed_positions(inputs_shape, past_key_values_length) + hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos + hidden_states = self.decoder_layernorm_embedding(hidden_states) + decoder_input = self.decoder_dropout(hidden_states) + + decoder_output = self.decoder( + tgt=decoder_input, + memory=encoder_output if isinstance(encoder_output, type(decoder_input)) else encoder_output[0], + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return decoder_output + + +@register_base_model +class BartModel(BartPretrainedModel): + r""" + The bare Bart Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BartConfig`): + An instance of BartConfig used to construct BartModel. 
+ """ + + def __init__(self, config: BartConfig): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.decoder_start_token_id = config.decoder_start_token_id + self.shared = nn.Embedding(config.vocab_size, config.d_model) + self.encoder = BartEncoder(config, self.shared) + self.decoder = BartDecoder(config, self.shared) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqModelOutput]: + r""" + The BartModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, encoder_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, encoder_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + encoder_output (tuple, optional): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is `[batch_size, sequence_length, d_model]`. + `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. 
+ For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, d_model`]. + `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + cache (list, optional): + It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. + See `TransformerDecoder.gen_cache `__ for more details. + It is only used for inference and should be None for training. + Default to `None`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, d_model]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BartModel, BartTokenizer + + tokenizer = BartTokenizer.from_pretrained('bart-base') + model = BartModel.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + # different to other models, Bart automatically creates decoder_input_ids from + # inputBartForSequenceClassification_ids if no decoder_input_ids are provided + if input_ids is None and inputs_embeds is None and encoder_output is None: + raise ValueError("You have to specify either input_ids or encoder_output") + + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + decoder_input_ids = shift_tokens_right(input_ids, self.decoder_start_token_id) + if attention_mask is None and input_ids is not None: + # only generate attention_mask when input_ids is specified + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + if inputs_embeds is not None and input_ids is None and attention_mask is None: + logger.warning("provided inputs_embeds without attention_mask") + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + input_type = type(decoder_input_ids) if decoder_input_ids is not None else type(decoder_inputs_embeds) + if encoder_output is None: + encoder_output = self.encoder( + input_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_output, ModelOutput): + if isinstance(encoder_output, input_type): + encoder_output = (encoder_output,) + encoder_output = convert_encoder_output(encoder_output) + if isinstance(encoder_output, input_type): + encoder_last_hidden_state = encoder_output + else: + encoder_last_hidden_state = encoder_output[0] + if use_cache: + if cache is None: + cache = self.decoder.decoder.gen_cache(encoder_last_hidden_state) + else: + cache = None + + memory_mask = attention_mask + if attention_mask is not None: + if attention_mask.ndim == 4: + memory_mask = attention_mask[:, :, -1:, :] + elif attention_mask.ndim == 3: + memory_mask = attention_mask[:, -1:, :].unsqueeze([1]) + elif attention_mask.ndim == 2: + memory_mask = attention_mask.unsqueeze([1, 2]) + else: + raise ValueError("Invalid attention mask shape. 
") + + decoder_output = self.decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, + memory_mask, + cache=cache, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if not return_dict: + if isinstance(decoder_output, input_type): + decoder_output = (decoder_output,) + if isinstance(encoder_output, input_type): + encoder_output = (encoder_output,) + return decoder_output + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_output.last_hidden_state, + past_key_values=decoder_output.past_key_values, + decoder_hidden_states=decoder_output.hidden_states, + decoder_attentions=decoder_output.attentions, + cross_attentions=decoder_output.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + + +class BartClassificationHead(Layer): + """ + Perform sentence-level classification tasks. + """ + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: Tensor) -> Tensor: + """ + Args: + hidden_states (Tensor): + Hidden states of the classification model. + """ + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = F.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BartForSequenceClassification(BartPretrainedModel): + r""" + Bart Model with a linear layer on top of the pooled output, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`BartConfig`): + An instance of BartConfig used to construct BartForSequenceClassification. + """ + + def __init__(self, config: BartConfig): + super().__init__(config) + self.bart = BartModel(config) + self.num_labels = config.num_labels + self.classifier = BartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout if config.classifier_dropout is not None else config.dropout, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tensor, Tuple, Seq2SeqSequenceClassifierOutput]: + r""" + The BartForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`BartModel`. + attention_mask (Tensor, optional): + See :class:`BartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`BartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`BartModel`. + encoder_output (Tensor, optonal): + See :class:`BartModel`. 
+ inputs_embeds (Tensor, optional): + See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. + use_cache (bool, optional): + See :class:`BartModel`. Forcely set to `False` when `labels` is provided that can save memory during training. + cache (Tensor, optional): + See :class:`BartModel`. + labels (Tensor, optional): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + num_labels - 1]`. If `num_labels > 1` a classification loss is computed (Cross-Entropy). + Default to `None`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BartForSequenceClassification, BartTokenizer + + tokenizer = BartTokenizer.from_pretrained('bart-base') + model = BartForSequenceClassification.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + + if input_ids is None and inputs_embeds is not None: + logger.warning( + f"{self.__class__.__name__} will not detect eos tokens in `inputs_embeds`. 
Results may be " + "unexpected if using eos tokens in conjunction with `inputs_embeds.`" + ) + + outputs = self.bart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = outputs[0] + output_shape = output.shape + + if input_ids is not None: + eos_mask = paddle.cast(input_ids == self.bart.config["eos_token_id"], dtype="int64") + if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + + # TODO(gongenlei): support bool tensor index + output = output.masked_select(eos_mask.unsqueeze(-1).astype("bool").tile([1, 1, output_shape[-1]])) + + sentence_representation = output.reshape([output_shape[0], -1, output_shape[-1]])[:, -1, :] + logits = self.classifier(sentence_representation) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + if len(outputs) == 2: + return (loss, logits) if loss is not None else logits + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class BartForQuestionAnswering(BartPretrainedModel): + r""" + Bart Model with a linear layer on top of the hidden-states output to + compute `span_start_logits` and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`BartConfig`): + An instance of BartConfig used to construct BartForQuestionAnswering. 
+ """ + + def __init__(self, config: BartConfig): + super().__init__(config) + self.bart = BartModel(config) + self.classifier = nn.Linear(config.d_model, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]: + r""" + The BartForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`BartModel`. + attention_mask (Tensor, optional): + See :class:`BartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`BartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`BartModel`. + encoder_output (Tensor, optonal): + See :class:`BartModel`. + inputs_embeds (Tensor, optional): + See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. + use_cache (bool, optional): + See :class:`BartModel`. Forcely set to `False` when `start_positions` and `end_positions` are provided that can save memory during training. + cache (Tensor, optional): + See :class:`BartModel`. + start_positions (Tensor, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + end_positions (Tensor, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `start_positions=end_positions=None`, + returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. 
+ + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BartForQuestionAnswering, BartTokenizer + + tokenizer = BartTokenizer.from_pretrained('bart-base') + model = BartForQuestionAnswering.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + start_logits = outputs[0] + end_logits =outputs[1] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + logger.warning( + "The `use_cache` argument is changed to `False` since `start_positions` and `end_positions` are provided." + ) + use_cache = False + + outputs = self.bart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = self.classifier(outputs[0]) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + outputs = (start_logits, end_logits) + (outputs[1:] if len(outputs) > 2 else ()) + return ((total_loss,) + outputs) if total_loss else outputs + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class BartForConditionalGeneration(BartPretrainedModel): + r""" + Bart Model with a `language modeling` head on top. + + Args: + config (:class:`BartConfig`): + An instance of BartConfig used to construct BartForConditionalGeneration. 
+ """ + + def __init__(self, config: BartConfig): + super().__init__(config) + self.bart = BartModel(config) + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], dtype=self.bart.shared.weight.dtype, is_bias=False + ) + self.register_buffer("final_logits_bias", paddle.zeros((1, config.vocab_size))) + + def get_encoder(self): + return self.bart.get_encoder() + + def get_decoder(self): + return self.bart.get_decoder() + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterBART + + decode_strategy = kwargs.get("decode_strategy") + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decoding_lib = kwargs.get("decoding_lib", None) + enable_fast_encoder = kwargs.get("enable_fast_encoder", True) + if decode_strategy == "sampling" and kwargs.get("top_k") != 0 and kwargs.get("top_p") != 1: + raise AttributeError( + "Only topk sampling or topp sampling are supported. " + "Topk sampling and topp sampling cannot be both applied in the fast version." + ) + if kwargs["repetition_penalty"] != 1.0: + # not support for repetition_penalty yet in the fast version + raise AttributeError("'repetition_penalty != 1' is not supported yet in the fast version") + if kwargs["min_length"] != 0: + # not support for min_length yet in the fast version + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + self._fast_entry = FasterBART( + self, + use_fp16_decoding=use_fp16_decoding, + decoding_lib=decoding_lib, + enable_fast_encoder=enable_fast_encoder, + ).forward + return self._fast_entry + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tensor, Tuple, Seq2SeqLMOutput]: + r""" + The BartForConditionalGeneration forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`BartModel`. + attention_mask (Tensor, optional): + See :class:`BartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`BartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`BartModel`. + encoder_output (Tensor, optonal): + See :class:`BartModel`. + inputs_embeds (Tensor, optional): + See :class:`BartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`BartModel`. + use_cache (bool, optional): + See :class:`BartModel`. + cache (Tensor, optional): + See :class:`BartModel`. + labels (Tensor, optional): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + A tensor of shape `(batch_size, sequence_length)`. Default to `None`. 
+ output_attentions (bool, optional): + See :class:`BartModel`. + output_hidden_states (bool, optional): + See :class:`BartModel`. + return_dict (bool, optional): + See :class:`BartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. + Especially, When `use_cache=return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. + + With the fields: + + - `lm_logits` (Tensor): + The generated sentence of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BartForConditionalGeneration, BartTokenizer + + tokenizer = BartTokenizer.from_pretrained('bart-base') + model = BartForConditionalGeneration.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + + outputs = self.bart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = paddle.tensor.matmul(outputs[0], self.lm_head_weight, transpose_y=True) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.reshape((-1, self.bart.config["vocab_size"])), labels.reshape((-1,))) + + if not return_dict: + if len(outputs) == 2: + return (masked_lm_loss, lm_logits) if masked_lm_loss is not None else lm_logits + else: + outputs = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + outputs) if masked_lm_loss is not None else outputs + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels): + return shift_tokens_right(labels, self.bart.config["decoder_start_token_id"]) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + cache=None, + use_cache=False, + encoder_output=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask[:, :, -1, :].unsqueeze(2) + + return { + "input_ids": None, + "decoder_input_ids": decoder_input_ids, + 
"encoder_output": encoder_output, + "decoder_attention_mask": decoder_attention_mask, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/tokenizer.py new file mode 100644 index 000000000..43e8d7fde --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bart/tokenizer.py @@ -0,0 +1,398 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +from functools import lru_cache + +from paddle.utils import try_import + +from .. import AddedToken, PretrainedTokenizer + +__all__ = ["BartTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bart-base": 1024, + "bart-large": 1024, +} + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BartTokenizer(PretrainedTokenizer): + r""" + Construct a BART tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.gpt.tokenizer.GPTTokenizer`. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocabulary file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. 
+ errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + bos_token (str, optional): + The beginning of sequence token that was used during pretraining. Can be + used as a sequence classifier token. + Defaults to `"<s>"`. + eos_token (str, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `"</s>"`. + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. + Defaults to `"<s>"`. + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to `"</s>"`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` in order to be converted to an ID. + Defaults to `"<unk>"`. + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to `"<pad>"`. + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task, in which the model tries to predict the original unmasked token. + Defaults to `"<mask>"`. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BartTokenizer + + tokenizer = BartTokenizer.from_pretrained('bart-base') + print(tokenizer('He was a puppeteer')) + + ''' + {'input_ids': [0, 894, 21, 10, 32986, 9306, 254, 2], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]} + ''' + + """ + # merges and vocab same as GPT2 + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_resource_files_map = { + "vocab_file": { + "bart-base": "https://bj.bcebos.com/paddlenlp/models/transformers/bart/bart-base-vocab.json", + "bart-large": "https://bj.bcebos.com/paddlenlp/models/transformers/bart/bart-large-vocab.json", + }, + "merges_file": { + "bart-base": "https://bj.bcebos.com/paddlenlp/models/transformers/bart/bart-base-merges.txt", + "bart-large": "https://bj.bcebos.com/paddlenlp/models/transformers/bart/bart-large-merges.txt", + }, + } + pretrained_init_configuration = {"bart-base": {}, "bart-large": {}} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="<s>", + eos_token="</s>", + cls_token="<s>", + sep_token="</s>", + unk_token="<unk>", + pad_token="<pad>", + mask_token="<mask>", + **kwargs + ): + + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behaves like a normal word, i.e.
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + ) + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.num_command_tokens = 2 + self.num_type_tokens = 2 + + with open(vocab_file, "r", encoding="utf-8") as f: + self.encoder = json.load(f) + + self.decoder = {v: k for k, v in self.encoder.items()} + + self.num_tokens = len(self.encoder) + self.num_text_tokens = self.num_tokens - 1 + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + + with open(merges_file, encoding="utf-8") as f: + bpe_data = f.read().split("\n")[1:-1] + + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + re = try_import("regex") + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def _bpe_encode(self, text): + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification + tasks by concatenating and adding special tokens. + """ + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + if token_ids_1 is None: + return _cls + token_ids_0 + _sep + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is + called when adding special tokens using the tokenizer ``encode`` methods. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. 
+ + """ + + return len(self.encoder) + + @property + def eol_token_id(self): + if self.eol_token is None: + return None + return self.convert_tokens_to_ids(self.eol_token) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa: E722 + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + + return self.decoder[index] + + def convert_ids_to_string(self, ids): + """ + Converts a single index or a sequence of indices to texts. + + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + + Returns: + str: The decoded text. + + Example: + .. code-block:: + + from paddlenlp.transformers import GPTTokenizer + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930])) + # 'Welcome to use PaddlePaddle and PaddleNLP' + + """ + + text = "".join([self.decoder[id] for id in ids]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_resources(self, save_directory): + """ + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (string) in a single string. + """ + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A BERT offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. 
Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/configuration.py new file mode 100644 index 000000000..d1327783b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/configuration.py @@ -0,0 +1,407 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
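# A minimal sketch of the sequence formats produced by the BartTokenizer methods above:
# build_inputs_with_special_tokens wraps a single sequence as `cls + A + sep` and a pair as
# `cls + A + sep + sep + B + sep`, while create_token_type_ids_from_sequences returns all
# zeros in both cases. The ids below (cls=0, sep=2) are hypothetical and only illustrate
# the layout.
token_ids_a = [100, 101]
token_ids_b = [200]
single_input = [0] + token_ids_a + [2]                       # [0, 100, 101, 2]
pair_input = [0] + token_ids_a + [2, 2] + token_ids_b + [2]  # [0, 100, 101, 2, 2, 200, 2]
token_type_ids = [0] * len(pair_input)                       # BART uses a single segment id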
+""" BERT model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["BERT_PRETRAINED_INIT_CONFIGURATION", "BertConfig", "BERT_PRETRAINED_RESOURCE_FILES_MAP"] + +BERT_PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": { + "vocab_size": 30522, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-large-uncased": { + "vocab_size": 30522, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-multilingual-uncased": { + "vocab_size": 105879, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-cased": { + "vocab_size": 28996, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-chinese": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-base-multilingual-cased": { + "vocab_size": 119547, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-large-cased": { + "vocab_size": 28996, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-wwm-chinese": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "bert-wwm-ext-chinese": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + 
"initializer_range": 0.02, + "pad_token_id": 0, + }, + "macbert-base-chinese": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "macbert-large-chinese": { + "vocab_size": 21128, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "simbert-base-chinese": { + "vocab_size": 13685, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "uer/chinese-roberta-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "uer/chinese-roberta-medium": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 2048, + "max_position_embeddings": 512, + "num_attention_heads": 8, + "num_hidden_layers": 8, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "uer/chinese-roberta-6l-768h": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "uer/chinese-roberta-small": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 2048, + "max_position_embeddings": 512, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "uer/chinese-roberta-mini": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 4, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "uer/chinese-roberta-tiny": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 128, + "initializer_range": 0.02, + "intermediate_size": 512, + "max_position_embeddings": 512, + "num_attention_heads": 2, + "num_hidden_layers": 2, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, +} + +BERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "bert-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/bert-base-uncased.pdparams", + "bert-large-uncased": 
"https://bj.bcebos.com/paddlenlp/models/transformers/bert-large-uncased.pdparams", + "bert-base-multilingual-uncased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert-base-multilingual-uncased.pdparams", + "bert-base-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-base-cased.pdparams", + "bert-base-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-base-chinese.pdparams", + "bert-base-multilingual-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-base-multilingual-cased.pdparams", + "bert-large-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-large-cased.pdparams", + "bert-wwm-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-chinese.pdparams", + "bert-wwm-ext-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-ext-chinese.pdparams", + "macbert-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/macbert/macbert-base-chinese.pdparams", + "macbert-large-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/macbert/macbert-large-chinese.pdparams", + "simbert-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/simbert/simbert-base-chinese-v1.pdparams", + "uer/chinese-roberta-base": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_base.pdparams", + "uer/chinese-roberta-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_medium.pdparams", + "uer/chinese-roberta-6l-768h": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_6l_768h.pdparams", + "uer/chinese-roberta-small": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_small.pdparams", + "uer/chinese-roberta-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_mini.pdparams", + "uer/chinese-roberta-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_tiny.pdparams", + } +} + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + bert-base-uncased architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "bert" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = BERT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 16, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + fuse: bool = False, + layer_norm_eps=1e-12, + use_cache=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.fuse = fuse + + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.py new file mode 100644 index 000000000..03095def4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.py @@ -0,0 +1,1421 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
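# A minimal sketch of building the configuration defined in configuration.py above:
# BertConfig accepts the same keyword arguments that BERT_PRETRAINED_INIT_CONFIGURATION
# stores per model name, so a named architecture can be reconstructed from its entry.
# The import path assumes the package layout mirrored by this patch.
from paddlenlp.transformers.bert.configuration import (
    BERT_PRETRAINED_INIT_CONFIGURATION,
    BertConfig,
)

config = BertConfig(**BERT_PRETRAINED_INIT_CONFIGURATION["bert-base-uncased"])
# For this entry: config.hidden_size == 768, config.num_hidden_layers == 12,
# config.vocab_size == 30522.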
+from __future__ import annotations + +import warnings +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Layer + +try: + from paddle.incubate.nn import FusedTransformerEncoderLayer +except ImportError: + FusedTransformerEncoderLayer = None +from dataclasses import dataclass + +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +from ...layers import Linear as TransposedLinear +from ...utils.converter import StateDictNameMapping, init_name_mappings +from ...utils.env import CONFIG_NAME +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + BERT_PRETRAINED_INIT_CONFIGURATION, + BERT_PRETRAINED_RESOURCE_FILES_MAP, + BertConfig, +) + +__all__ = [ + "BertModel", + "BertPretrainedModel", + "BertForPretraining", + "BertPretrainingCriterion", + "BertPretrainingHeads", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertForQuestionAnswering", + "BertForMultipleChoice", + "BertForMaskedLM", +] + + +class BertEmbeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: BertConfig): + super(BertEmbeddings, self).__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + past_key_values_length: Optional[int] = None, + ): + + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + + position_ids = seq_length - ones + if past_key_values_length is not None: + position_ids += past_key_values_length + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + input_embedings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embedings + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertPooler(Layer): + """ + Pool the result of BertEncoder. + """ + + def __init__(self, config: BertConfig): + """init the bert pooler with config & args/kwargs + + Args: + config (BertConfig): BertConfig instance. Defaults to None. + """ + super(BertPooler, self).__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + self.pool_act = config.pool_act + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
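        # hidden_states has shape [batch_size, sequence_length, hidden_size]; taking index 0
        # along the sequence axis keeps the [CLS] position, so pooled_output below has shape
        # [batch_size, hidden_size] after the dense layer (tanh is applied only when
        # pool_act == "tanh").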
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.pool_act == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained BERT models. It provides BERT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + config_class = BertConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "bert" + + pretrained_init_configuration = BERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = BERT_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_name_mappings(cls, config: BertConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + "embeddings.word_embeddings.weight", + "embeddings.position_embeddings.weight", + "embeddings.token_type_embeddings.weight", + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ["pooler.dense.weight", None, "transpose"], + "pooler.dense.bias", + # for TokenClassification + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], + [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], + [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], + ] + model_mappings.extend(layer_mappings) + + 
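        # Each mapping pairs a Hugging Face-style parameter name (encoder.layer.N.*) with the
        # corresponding Paddle name (encoder.layers.N.*); entries marked "transpose" are
        # Linear weights stored transposed between the two frameworks, so the converter
        # transposes them while loading.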
init_name_mappings(model_mappings) + + # base-model prefix "BertModel" + if "BertModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "bert." + mapping[0] + mapping[1] = "bert." + mapping[1] + + # downstream mappings + if "BertForQuestionAnswering" in config.architectures: + model_mappings.extend( + [["qa_outputs.weight", "classifier.weight", "transpose"], ["qa_outputs.bias", "classifier.bias"]] + ) + if ( + "BertForMultipleChoice" in config.architectures + or "BertForSequenceClassification" in config.architectures + or "BertForTokenClassification" in config.architectures + ): + model_mappings.extend([["classifier.weight", "classifier.weight", "transpose"]]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class BertModel(BertPretrainedModel): + """ + The bare BERT Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertModel. + """ + + def __init__(self, config: BertConfig): + super(BertModel, self).__init__(config) + + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = BertEmbeddings(config) + if config.fuse and FusedTransformerEncoderLayer is None: + warnings.warn( + "FusedTransformerEncoderLayer is not supported by the running Paddle. " + "The flag fuse_transformer will be ignored. 
Try Paddle >= 2.3.0" + ) + self.fuse = config.fuse and FusedTransformerEncoderLayer is not None + if self.fuse: + self.encoder = nn.LayerList( + [ + FusedTransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout_rate=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout_rate=config.attention_probs_dropout_prob, + act_dropout_rate=0.0, + ) + for _ in range(config.num_hidden_layers) + ] + ) + else: + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = BertPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. 
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BertModel, BertTokenizer + + tokenizer = BertTokenizer.from_pretrained('bert-wwm-chinese') + model = BertModel.from_pretrained('bert-wwm-chinese') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + use_cache = use_cache if use_cache is not None else self.config.use_cache + + past_key_values_length = None + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + past_key_values_length=past_key_values_length, + ) + if self.fuse: + assert not output_attentions, "Not support attentions output currently." + assert past_key_values is None, "Not support past_key_values currently." 
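            # Fused path: each FusedTransformerEncoderLayer consumes the additive attention
            # mask built above and is applied sequentially; caching and per-layer attention
            # outputs are not supported here, which is why the asserts above reject them.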
+ hidden_states = embedding_output + all_hidden_states = [] if output_hidden_states else None + for layer in self.encoder: + hidden_states = layer(hidden_states, attention_mask) + if output_hidden_states: + all_hidden_states.append(hidden_states) + pooled_output = self.pooler(hidden_states) + + if return_dict: + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, pooler_output=pooled_output, hidden_states=all_hidden_states + ) + else: + return ( + (hidden_states, pooled_output, all_hidden_states) + if output_hidden_states + else (hidden_states, pooled_output) + ) + else: + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + src_mask=attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class BertForQuestionAnswering(BertPretrainedModel): + """ + Bert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForQuestionAnswering. + """ + + def __init__(self, config: BertConfig): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BertModel`. + token_type_ids (Tensor, optional): + See :class:`BertModel`. + position_ids(Tensor, optional): + See :class:`BertModel`. + attention_mask (Tensor, optional): + See :class:`BertModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.bert.modeling import BertForQuestionAnswering + from paddlenlp.transformers.bert.tokenizer import BertTokenizer + + tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + model = BertForQuestionAnswering.from_pretrained('bert-base-cased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertForSequenceClassification(BertPretrainedModel): + """ + Bert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForSequenceClassification. 
+ """ + + def __init__(self, config: BertConfig): + super(BertForSequenceClassification, self).__init__(config) + + self.bert = BertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BertModel`. + token_type_ids (Tensor, optional): + See :class:`BertModel`. + position_ids(Tensor, optional): + See :class:`BertModel`. + attention_mask (Tensor, optional): + See :class:`BertModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.bert.modeling import BertForSequenceClassification + from paddlenlp.transformers.bert.tokenizer import BertTokenizer + + tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 2] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertForTokenClassification(BertPretrainedModel): + """ + Bert Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForTokenClassification. + """ + + def __init__(self, config: BertConfig): + super().__init__(config) + + self.bert = BertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BertModel`. + token_type_ids (Tensor, optional): + See :class:`BertModel`. + position_ids(Tensor, optional): + See :class:`BertModel`. + attention_mask (list, optional): + See :class:`BertModel`. 
+ labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.bert.modeling import BertForTokenClassification + from paddlenlp.transformers.bert.tokenizer import BertTokenizer + + tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 13, 2] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertLMPredictionHead(Layer): + """ + Bert Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ + + def __init__(self, config: BertConfig): + super(BertLMPredictionHead, self).__init__() + + self.transform = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = getattr(nn.functional, config.hidden_act) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.decoder = TransposedLinear(config.hidden_size, config.vocab_size) + # link bias to load pretrained weights + self.decoder_bias = self.decoder.bias + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertPretrainingHeads(Layer): + """ + Perform language modeling task and next sentence classification task. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForPretraining. + + """ + + def __init__(self, config: BertConfig): + super(BertPretrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output, masked_positions=None): + """ + Args: + sequence_output(Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + pooled_output(Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + masked_positions(Tensor, optional): + A tensor indicates positions to be masked in the position embedding. + Its data type should be int64 and its shape is [batch_size, mask_token_num]. + `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. + Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ + prediction_scores = self.predictions(sequence_output, masked_positions) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + prediction_logits: paddle.Tensor = None + seq_relationship_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class BertForPretraining(BertPretrainedModel): + """ + Bert Model with pretraining tasks on top. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForPretraining. + + """ + + def __init__(self, config: BertConfig): + super(BertForPretraining, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertPretrainingHeads(config) + self.tie_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + next_sentence_label: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`BertModel`. + token_type_ids (Tensor, optional): + See :class:`BertModel`. + position_ids (Tensor, optional): + See :class:`BertModel`. + attention_mask (Tensor, optional): + See :class:`BertModel`. + masked_positions(Tensor, optional): + See :class:`BertPretrainingHeads`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + next_sentence_label (Tensor of shape `(batch_size,)`, optional): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.bert.BertForPreTrainingOutput` object. 
If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.bert.BertForPreTrainingOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.bert.BertForPreTrainingOutput`. + + """ + with paddle.static.amp.fp16_guard(): + outputs = self.bert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_positions) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + next_sentence_loss = loss_fct( + seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,)) + ) + total_loss = masked_lm_loss + next_sentence_loss + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertPretrainingCriterion(paddle.nn.Layer): + """ + + Args: + vocab_size(int): + Vocabulary size of `inputs_ids` in `BertModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `BertModel`. + + """ + + def __init__(self, vocab_size): + super(BertPretrainingCriterion, self).__init__() + # CrossEntropyLoss is expensive since the inner reshape (copy) + self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1) + self.vocab_size = vocab_size + + def forward( + self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale + ): + """ + Args: + prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size] + seq_relationship_score(Tensor): + The scores of next sentence prediction. Its data type should be float32 and + its shape is [batch_size, 2] + masked_lm_labels(Tensor): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. + Otherwise, its shape is [batch_size, mask_token_num, 1] + next_sentence_labels(Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and + its shape is [batch_size, 1] + masked_lm_scale(Tensor or int): + The scale of masked tokens. Used for the normalization of masked language modeling loss. + If it is a `Tensor`, its data type should be int64 and its shape is equal to `prediction_scores`. + + Returns: + Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. 
+ Its data type should be float32 and its shape is [1]. + + + """ + with paddle.static.amp.fp16_guard(): + masked_lm_loss = F.cross_entropy(prediction_scores, masked_lm_labels, reduction="none", ignore_index=-1) + masked_lm_loss = masked_lm_loss / masked_lm_scale + next_sentence_loss = F.cross_entropy(seq_relationship_score, next_sentence_labels, reduction="none") + return paddle.sum(masked_lm_loss) + paddle.mean(next_sentence_loss) + + +class BertForMultipleChoice(BertPretrainedModel): + """ + Bert Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForMultipleChoice. + + Examples: + >>> model = BertForMultipleChoice(config, dropout=0.1) + >>> # or + >>> config.hidden_dropout_prob = 0.1 + >>> model = BertForMultipleChoice(config) + """ + + def __init__(self, config: BertConfig): + super(BertForMultipleChoice, self).__init__(config) + + self.bert = BertModel(config) + self.num_choices = config.num_choices + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BertForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`BertModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BertForMultipleChoice, BertTokenizer + from paddlenlp.data import Pad, Dict + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + outputs = self.bert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BertOnlyMLMHead(nn.Layer): + def __init__(self, config: BertConfig): + super().__init__() + self.predictions = BertLMPredictionHead(config=config) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +class BertForMaskedLM(BertPretrainedModel): + """ + Bert Model with a `masked language modeling` head on top. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForMaskedLM. 
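The multiple-choice forward above folds the choice dimension into the batch before encoding and folds it back for the classifier. A shape-only sketch with toy sizes (no pretrained weights involved):

.. code-block::

    import paddle

    batch_size, num_choices, seq_len = 2, 4, 6
    input_ids = paddle.randint(0, 100, [batch_size, num_choices, seq_len])

    # every choice becomes an independent sequence for the encoder
    flat_input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1]))
    print(flat_input_ids.shape)  # [8, 6]

    # one score per flattened sequence, folded back to [batch_size, num_choices]
    scores = paddle.randn([batch_size * num_choices, 1])
    reshaped_logits = scores.reshape(shape=(-1, num_choices))
    print(reshaped_logits.shape)  # [2, 4]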
+ + """ + + def __init__(self, config: BertConfig): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + + self.cls = BertOnlyMLMHead(config=config) + self.tie_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`BertModel`. + token_type_ids (Tensor, optional): + See :class:`BertModel`. + position_ids (Tensor, optional): + See :class:`BertModel`. + attention_mask (Tensor, optional): + See :class:`BertModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., vocab_size]` + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BertForMaskedLM, BertTokenizer + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMaskedLM.from_pretrained('bert-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 13, 30522] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions=masked_positions) + + masked_lm_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) + if masked_lm_loss is not None + else (output[0] if len(output) == 1 else output) + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.pyi b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.pyi new file mode 100644 index 000000000..5717c0c92 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/modeling.pyi @@ -0,0 +1,345 @@ +import paddle.nn as nn +import paddle +from ..model_outputs import ModelOutput +from .configuration import BertConfig +from _typeshed import Incomplete +from paddle import Tensor +from paddle.nn import Layer, Embedding, Linear +from paddlenlp.transformers.model_utils import PretrainedModel +from typing import Dict, Optional, Tuple, Union, overload + +class BertEmbeddings(Layer): + word_embeddings: Embedding + position_embeddings: Embedding + token_type_embeddings: Embedding + layer_norm: Layer + dropout: float + def __init__(self, config: BertConfig) -> None: ... + def forward( + self, + input_ids: Tensor, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + past_key_values_length: int = 0, + ): ... + +class BertPooler(Layer): + dense: Linear + activation: Layer + pool_act: Layer + def __init__(self, config: BertConfig) -> None: ... + def forward(self, hidden_states): ... + +class BertPretrainedModel(PretrainedModel): + model_config_file: str + config_class: Incomplete + resource_files_names: Dict[str, str] + base_model_prefix: str + pretrained_init_configuration: Dict[str, dict] + pretrained_resource_files_map: Dict[str, str] + def init_weights(self, layer) -> None: ... + +class BertModel(BertPretrainedModel): + pad_token_id: int + initializer_range: float + embeddings: Embedding + fuse: bool + encoder: nn.TransformerDecoder + pooler: BertPooler + + def __init__(self, config: BertConfig) -> None: ... + def get_input_embeddings(self): ... + def set_input_embeddings(self, value) -> None: ... 
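The masked-LM loss in `BertForMaskedLM.forward` above relies on the default `ignore_index=-100` of `paddle.nn.CrossEntropyLoss`, so only positions carrying a real vocabulary id contribute. A small sketch with made-up scores:

.. code-block::

    import paddle

    vocab_size = 10
    prediction_scores = paddle.randn([1, 3, vocab_size])
    # only the middle position is a masked-LM target; -100 marks ignored tokens
    labels = paddle.to_tensor([[-100, 7, -100]])

    loss_fct = paddle.nn.CrossEntropyLoss()
    loss = loss_fct(prediction_scores.reshape((-1, vocab_size)), labels.reshape((-1,)))
    print(float(loss))  # cross entropy of position 1 only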
+ def forward( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + past_key_values: Tensor | None = ..., + use_cache: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertModel: ... + +class BertForQuestionAnswering(BertPretrainedModel): + bert: BertModel + dropout: nn.Dropout + classifier: Linear + def __init__(self, config: BertConfig): ... + def forward( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + start_positions: Tensor | None = ..., + end_positions: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + def __call__( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + start_positions: Tensor | None = ..., + end_positions: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + classifier_dropout: float | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertForQuestionAnswering: ... + +class BertForSequenceClassification(BertPretrainedModel): + bert: BertModel + num_labels: int + dropout: nn.Dropout + classifier: Linear + def __init__(self, config: BertConfig): ... + def forward( + self, + input_ids: Tensor, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + def __call__( + self, + input_ids: Tensor, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + num_labels: int | None = 2, + classifier_dropout: float | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertForSequenceClassification: ... + +class BertForTokenClassification(BertPretrainedModel): + bert: BertModel + num_labels: int + dropout: nn.Dropout + classifier: Linear + def __init__(self, config: BertConfig): ... + def forward( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + def __call__( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... 
+ @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + num_labels: int | None = 2, + classifier_dropout: float | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertForTokenClassification: ... + +class BertLMPredictionHead(Layer): + transform: Incomplete + activation: Incomplete + layer_norm: nn.LayerNorm + decoder_weight: paddle.ParamAttr + decoder_bias: paddle.ParamAttr + def __init__(self, config: BertConfig, embedding_weights: Tensor | None = ...) -> None: ... + def forward(self, hidden_states, masked_positions: Tensor | None = ...): ... + +class BertPretrainingHeads(Layer): + predictions: Incomplete + seq_relationship: Incomplete + def __init__(self, config: BertConfig, embedding_weights: Tensor | None = ...) -> None: ... + def forward(self, sequence_output, pooled_output, masked_positions: Tensor | None = ...): ... + +class BertForPreTrainingOutput(ModelOutput): + loss: Optional[paddle.Tensor] + prediction_logits: paddle.Tensor + seq_relationship_logits: paddle.Tensor + hidden_states: Optional[Tuple[paddle.Tensor]] + attentions: Optional[Tuple[paddle.Tensor]] + def __init__( + self, + loss: Tensor | None, + prediction_logits: Tensor | None, + seq_relationship_logits: Tensor | None, + hidden_states: Tensor | None, + attentions: Tensor | None, + ) -> None: ... + +class BertForPretraining(BertPretrainedModel): + bert: BertModel + cls: Incomplete + def __init__(self, config: BertConfig) -> None: ... + def forward( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + masked_positions: Tensor | None = ..., + labels: Tensor | None = ..., + next_sentence_label: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + def __call__( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + masked_positions: Tensor | None = ..., + labels: Tensor | None = ..., + next_sentence_label: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertForQuestionAnswering: ... + +class BertPretrainingCriterion(paddle.nn.Layer): + loss_fn: nn.Layer + vocab_size: int + def __init__(self, vocab_size) -> None: ... + def forward( + self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale + ): ... + def __call__( + self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale + ): ... + +class BertForMultipleChoice(BertPretrainedModel): + bert: BertModel + num_choices: int + dropout: nn.Dropout + classifier: Linear + @overload + def __init__(self, config: BertConfig) -> None: ... + def forward( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... 
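As these stubs declare, `from_pretrained` accepts per-task overrides such as `num_labels` (and, for the classification heads, `classifier_dropout`). An illustrative call, assuming `paddlenlp` is installed and the checkpoint can be downloaded:

.. code-block::

    from paddlenlp.transformers import (
        BertForSequenceClassification,
        BertForTokenClassification,
    )

    seq_cls = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
    tok_cls = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=9)

    print(seq_cls.num_labels)  # 3
    print(tok_cls.num_labels)  # 9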
+ @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + num_choices: int | None = 2, + classifier_dropout: float | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertForMultipleChoice: ... + +class BertOnlyMLMHead(nn.Layer): + predictions: BertLMPredictionHead + def __init__(self, config: BertConfig, embedding_weights: Tensor | None = ...) -> None: ... + def forward(self, sequence_output, masked_positions: Tensor | None = ...): ... + +class BertForMaskedLM(BertPretrainedModel): + bert: BertModel + cls: BertOnlyMLMHead + def __init__(self, config: BertConfig) -> None: ... + def forward( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + def __call__( + self, + input_ids, + token_type_ids: Tensor | None = ..., + position_ids: Tensor | None = ..., + attention_mask: Tensor | None = ..., + labels: Tensor | None = ..., + output_hidden_states: bool = ..., + output_attentions: bool = ..., + return_dict: bool = ..., + ): ... + @staticmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: str, + cache_dir: str | None = None, + config: Optional[BertConfig] = None, + *args, + **kwargs + ) -> BertForMaskedLM: ... diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/tokenizer.py new file mode 100644 index 000000000..d43002c48 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert/tokenizer.py @@ -0,0 +1,630 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unicodedata + +from ..tokenizer_utils import ( + PretrainedTokenizer, + _is_control, + _is_punctuation, + _is_symbol, + _is_whitespace, + convert_to_unicode, + whitespace_tokenize, +) + +__all__ = [ + "BasicTokenizer", + "BertTokenizer", + "WordpieceTokenizer", +] + + +class BasicTokenizer(object): + """ + Runs basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (bool): + Whether to lowercase the input when tokenizing. + Defaults to `True`. + never_split (Iterable): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (bool): + Whether to tokenize Chinese characters. + strip_accents: (bool): + Whether to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). 
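An illustrative run of the basic tokenizer (requires only `paddlenlp`, no model files; the sample strings are arbitrary):

.. code-block::

    from paddlenlp.transformers import BasicTokenizer

    tokenizer = BasicTokenizer(do_lower_case=True)
    print(tokenizer.tokenize("Héllo, WORLD!"))
    # ['hello', ',', 'world', '!']  (lower-cased, accents stripped, punctuation split off)

    # tokens listed in never_split are passed through untouched
    print(tokenizer.tokenize("hello [CLS] world", never_split=["[CLS]"]))
    # ['hello', '[CLS]', 'world']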
+ """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + """Constructs a BasicTokenizer.""" + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Tokenizes a piece of text using basic tokenizer. + + Args: + text (str): A piece of text. + never_split (List[str]): List of token not to split. + + Returns: + list(str): A list of tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BasicTokenizer + basictokenizer = BasicTokenizer() + tokens = basictokenizer.tokenize('He was a puppeteer') + ''' + ['he', 'was', 'a', 'puppeteer'] + ''' + """ + text = convert_to_unicode(text) + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """ + Strips accents from a piece of text. + """ + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """ + Splits punctuation on a piece of text. + """ + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + # punctuation and symbol should be treat as single char. + if _is_punctuation(char) or _is_symbol(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """ + Adds whitespace around any CJK character. + """ + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """ + Checks whether CP is the codepoint of a CJK character. + """ + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all the other languages. 
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """ + Performs invalid character removal and whitespace cleanup on text. + """ + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """ + Runs WordPiece tokenization. + + Args: + vocab (Vocab|dict): + Vocab of the word piece tokenizer. + unk_token (str): + A specific token to replace all unknown tokens. + max_input_chars_per_word (int): + If a word's length is more than + max_input_chars_per_word, it will be dealt as unknown word. + Defaults to 100. + """ + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + list (str): A list of wordpiece tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BertTokenizer, WordpieceTokenizer + + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + vocab = berttokenizer.vocab + unk_token = berttokenizer.unk_token + + wordpiecetokenizer = WordpieceTokenizer(vocab,unk_token) + inputs = wordpiecetokenizer.tokenize("unaffable") + print(inputs) + ''' + ["un", "##aff", "##able"] + ''' + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class BertTokenizer(PretrainedTokenizer): + """ + Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool, optional): + Whether to lowercase the input when tokenizing. + Defaults to `True`. + do_basic_tokenize (bool, optional): + Whether to use a basic tokenizer before a WordPiece tokenizer. + Defaults to `True`. + never_split (Iterable, optional): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True`. Defaults to `None`. 
+ unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + tokenize_chinese_chars (bool, optional): + Whether to tokenize Chinese characters. + Defaults to `True`. + strip_accents: (bool, optional): + Whether to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + Defaults to `None`. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BertTokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + inputs = tokenizer('He was a puppeteer') + print(inputs) + + ''' + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "bert-base-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-cased-vocab.txt", + "bert-large-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txt", + "bert-wwm-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-chinese-vocab.txt", + "bert-wwm-ext-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt", + "macbert-large-chinese": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txt", + "macbert-base-chinese": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txt", + "simbert-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/simbert/vocab.txt", + "uer/chinese-roberta-base": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt", + "uer/chinese-roberta-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt", + "uer/chinese-roberta-6l-768h": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt", + "uer/chinese-roberta-small": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt", + "uer/chinese-roberta-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt", + "uer/chinese-roberta-tiny": 
"https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt", + } + } + pretrained_init_configuration = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-wwm-chinese": {"do_lower_case": False}, + "bert-wwm-ext-chinese": {"do_lower_case": False}, + "macbert-large-chinese": {"do_lower_case": False}, + "macbert-base-chinese": {"do_lower_case": False}, + "simbert-base-chinese": {"do_lower_case": True}, + "uer/chinese-roberta-base": {"do_lower_case": True}, + "uer/chinese-roberta-medium": {"do_lower_case": True}, + "uer/chinese-roberta-6l-768h": {"do_lower_case": True}, + "uer/chinese-roberta-small": {"do_lower_case": True}, + "uer/chinese-roberta-mini": {"do_lower_case": True}, + "uer/chinese-roberta-tiny": {"do_lower_case": True}, + } + max_model_input_sizes = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-wwm-chinese": 512, + "bert-wwm-ext-chinese": 512, + "macbert-large-chinese": 512, + "macbert-base-chinese": 512, + "simbert-base-chinese": 512, + "uer/chinese-roberta-base": 512, + "uer/chinese-roberta-medium": 512, + "uer/chinese-roberta-6l-768h": 512, + "uer/chinese-roberta-small": 512, + "uer/chinese-roberta-mini": 512, + "uer/chinese-roberta-tiny": 512, + } + padding_side = "right" + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + """ + End-to-end tokenization for BERT models. + + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. 
+ """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BertTokenizer + + berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokens = berttokenizer.tokenize('He was a puppeteer') + ''' + ['he', 'was', 'a', 'puppet', '##eer'] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + he was a puppeteer + ''' + """ + + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A BERT offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. 
+ """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A BERT sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in self.all_special_ids else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.vocab._idx_to_token.get(index, self.unk_token) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/tokenizer.py new file mode 100644 index 000000000..3ddd0ddea --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bert_japanese/tokenizer.py @@ -0,0 +1,354 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import unicodedata +import collections + +from .. import BertTokenizer, BasicTokenizer, WordpieceTokenizer + +__all__ = ["BertJapaneseTokenizer", "MecabTokenizer", "CharacterTokenizer"] + + +class BertJapaneseTokenizer(BertTokenizer): + """ + Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`False`. + do_word_tokenize (bool, optional): + Whether to do word tokenization. Defaults to`True`. + do_subword_tokenize (bool, optional): + Whether to do subword tokenization. Defaults to`True`. + word_tokenizer_type (str, optional): + Type of word tokenizer. Defaults to`basic`. + subword_tokenizer_type (str, optional): + Type of subword tokenizer. Defaults to`wordpiece`. + never_split (bool, optional): + Kept for backward compatibility purposes. Defaults to`None`. + mecab_kwargs (str, optional): + Dictionary passed to the `MecabTokenizer` constructor. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import BertJapaneseTokenizer + tokenizer = BertJapaneseTokenizer.from_pretrained('iverxin/bert-base-japanese/') + + inputs = tokenizer('こんにちは') + print(inputs) + + ''' + {'input_ids': [2, 10350, 25746, 28450, 3], 'token_type_ids': [0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "cl-tohoku/bert-base-japanese": "http://bj.bcebos.com/paddlenlp/models/community/cl-tohoku/bert-base-japanese/vocab.txt", + "cl-tohoku/bert-base-japanese-whole-word-masking": "http://bj.bcebos.com/paddlenlp/models/community/cl-tohoku/bert-base-japanese-whole-word-masking/vocab.txt", + "cl-tohoku/bert-base-japanese-char": "http://bj.bcebos.com/paddlenlp/models/community/cl-tohoku/bert-base-japanese-char/vocab.txt", + "cl-tohoku/bert-base-japanese-char-whole-word-masking": "http://bj.bcebos.com/paddlenlp/models/community/cl-tohoku/bert-base-japanese-char-whole-word-masking/vocab.txt", + } + } + pretrained_init_configuration = { + "cl-tohoku/bert-base-japanese": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", + }, + "cl-tohoku/bert-base-japanese-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "wordpiece", + }, + "cl-tohoku/bert-base-japanese-char": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", + }, + "cl-tohoku/bert-base-japanese-char-whole-word-masking": { + "do_lower_case": False, + "word_tokenizer_type": "mecab", + "subword_tokenizer_type": "character", + }, + } + padding_side = "right" + + def __init__( + self, + vocab_file, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="mecab", + subword_tokenizer_type="wordpiece", + never_split=None, + mecab_kwargs=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = BertJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.idx_to_token.items()]) + + self.do_word_tokenize = do_word_tokenize + self.word_tokenizer_type = word_tokenizer_type + self.lower_case = do_lower_case + self.never_split = never_split + self.mecab_kwargs = copy.deepcopy(mecab_kwargs) + if do_word_tokenize: + if word_tokenizer_type == "basic": + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + ) + elif word_tokenizer_type == "mecab": + self.basic_tokenizer = MecabTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) + ) + else: + raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.") + + self.do_subword_tokenize = do_subword_tokenize + self.subword_tokenizer_type = subword_tokenizer_type + if do_subword_tokenize: + if subword_tokenizer_type == "wordpiece": + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + elif subword_tokenizer_type == "character": + self.wordpiece_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=unk_token) + else: + raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.") + + @property + def do_lower_case(self): + return self.lower_case + + def __getstate__(self): + state = dict(self.__dict__) + if self.word_tokenizer_type == "mecab": + del state["basic_tokenizer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + if self.word_tokenizer_type == "mecab": + self.basic_tokenizer = MecabTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {}) + ) + + def _tokenize(self, text): + if self.do_word_tokenize: + if self.word_tokenizer_type == "basic": + tokens = self.basic_tokenizer.tokenize(text) + elif self.word_tokenizer_type == "mecab": + tokens = self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [sub_token for token in tokens for sub_token in self.wordpiece_tokenizer.tokenize(token)] + else: + split_tokens = tokens + + return split_tokens + + +class MecabTokenizer: + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + mecab_dic="ipadic", + mecab_option=None, + ): + """ + Constructs a MecabTokenizer. + + Args: + do_lower_case (bool): + Whether to lowercase the input. Defaults to`True`. + never_split: (list): + Kept for backward compatibility purposes. Defaults to`None`. + normalize_text (bool): + Whether to apply unicode normalization to text before tokenization. Defaults to`True`. + mecab_dic (string): + Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary, + set this option to `None` and modify `mecab_option`. Defaults to`ipadic`. + mecab_option (string): + String passed to MeCab constructor. Defaults to`None`. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + try: + import fugashi + except ModuleNotFoundError as error: + raise error.__class__( + "You need to install fugashi to use MecabTokenizer. 
" + "See https://pypi.org/project/fugashi/ for installation." + ) + + mecab_option = mecab_option or "" + + if mecab_dic is not None: + if mecab_dic == "ipadic": + try: + import ipadic + except ModuleNotFoundError as error: + raise error.__class__( + "The ipadic dictionary is not installed. " + "See https://github.com/polm/ipadic-py for installation." + ) + + dic_dir = ipadic.DICDIR + + elif mecab_dic == "unidic_lite": + try: + import unidic_lite + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic_lite dictionary is not installed. " + "See https://github.com/polm/unidic-lite for installation." + ) + + dic_dir = unidic_lite.DICDIR + + elif mecab_dic == "unidic": + try: + import unidic + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic dictionary is not installed. " + "See https://github.com/polm/unidic-py for installation." + ) + + dic_dir = unidic.DICDIR + if not os.path.isdir(dic_dir): + raise RuntimeError( + "The unidic dictionary itself is not found." + "See https://github.com/polm/unidic-py for installation." + ) + else: + raise ValueError("Invalid mecab_dic is specified.") + + mecabrc = os.path.join(dic_dir, "mecabrc") + mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for word in self.mecab(text): + token = word.surface + + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + + return tokens + + +class CharacterTokenizer: + """Runs Character tokenization.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """ + Constructs a CharacterTokenizer. + + Args: + vocab: + Vocabulary object. + unk_token (str): + A special symbol for out-of-vocabulary token. + normalize_text (boolean): + Whether to apply unicode normalization to text before tokenization. Defaults to True. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """ + Tokenizes a piece of text into characters. + + For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`. + + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through `BasicTokenizer`. + + Returns: + A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + output_tokens = [] + for char in text: + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/configuration.py new file mode 100644 index 000000000..7427f107e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/configuration.py @@ -0,0 +1,208 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BIGBIRD model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["BIGBIRD_PRETRAINED_INIT_CONFIGURATION", "BigBirdConfig", "BIGBIRD_PRETRAINED_RESOURCE_FILES_MAP"] + +BIGBIRD_PRETRAINED_INIT_CONFIGURATION = { + "bigbird-base-uncased": { + "num_layers": 12, + "vocab_size": 50358, + "nhead": 12, + "attn_dropout": 0.1, + "dim_feedforward": 3072, + "activation": "gelu", + "normalize_before": False, + "block_size": 16, + "window_size": 3, + "num_global_blocks": 2, + "num_rand_blocks": 3, + "seed": None, + "pad_token_id": 0, + "hidden_size": 768, + "hidden_dropout_prob": 0.1, + "max_position_embeddings": 4096, + "type_vocab_size": 2, + "num_labels": 2, + "initializer_range": 0.02, + }, +} + +BIGBIRD_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "bigbird-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/bigbird/bigbird-base-uncased.pdparams", + } +} + + +class BigBirdConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BigBirdModel`]. It is used to instantiate an + BigBird model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the BigBird + [google/bigbird-roberta-base](https://huggingface.co/google/bigbird-roberta-base) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 50358): + Vocabulary size of the BigBird model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BigBirdModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 1024 or 2048 or 4096). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BigBirdModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + attention_type (`str`, *optional*, defaults to `"bigbird"`) + Whether to use block sparse attention (with n complexity) as introduced in paper or original attention + layer (with n^2 complexity). Possible values are `"original_full"` and `"bigbird"`. + use_bias (`bool`, *optional*, defaults to `True`) + Whether to use bias in query, key, value. + rescale_embeddings (`bool`, *optional*, defaults to `False`) + Whether to rescale embeddings with (hidden_size ** 0.5). + block_size (`int`, *optional*, defaults to 64) + Size of each block. Useful only when `attention_type == "bigbird"`. + num_random_blocks (`int`, *optional*, defaults to 3) + Each query is going to attend these many number of random blocks. Useful only when `attention_type == + "bigbird"`. + dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ Example: + ```python + >>> from transformers import BigBirdConfig, BigBirdModel + >>> # Initializing a BigBird google/bigbird-roberta-base style configuration + >>> configuration = BigBirdConfig() + >>> # Initializing a model (with random weights) from the google/bigbird-roberta-base style configuration + >>> model = BigBirdModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "big_bird" + attribute_map: Dict[str, str] = { + "num_classes": "num_labels", + "nhead": "num_attention_heads", + "num_layers": "num_hidden_layers", + "dim_feedforward": "intermediate_size", + "d_model": "hidden_size", + } + pretrained_init_configuration = BIGBIRD_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=50358, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=4096, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + sep_token_id=66, + attention_type="bigbird", + use_bias=True, + rescale_embeddings=False, + block_size=1, + num_random_blocks=3, + dropout=0.1, + padding_idx=0, + attn_dropout=0.1, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + window_size=3, + num_global_blocks=2, + num_rand_blocks=2, + seed=None, + activation="relu", + embedding_weights=None, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + + self.rescale_embeddings = rescale_embeddings + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + self.dropout = dropout + + self.padding_idx = padding_idx + self.attn_dropout = attn_dropout + self.act_dropout = act_dropout + self.normalize_before = normalize_before + self.weight_attr = weight_attr + self.bias_attr = bias_attr + self.window_size = window_size + self.num_global_blocks = num_global_blocks + self.num_rand_blocks = num_rand_blocks + self.seed = seed + self.activation = activation + self.embedding_weights = embedding_weights diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/modeling.py new file mode 100644 index 000000000..0effe9d18 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/modeling.py @@ -0,0 +1,1706 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Google Research and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Dropout, Layer, LayerList, LayerNorm, Linear + +from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..attention_utils import MultiHeadAttention, _convert_param_attr_to_list +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + BIGBIRD_PRETRAINED_INIT_CONFIGURATION, + BIGBIRD_PRETRAINED_RESOURCE_FILES_MAP, + BigBirdConfig, +) + +__all__ = [ + "BigBirdModel", + "BigBirdPretrainedModel", + "BigBirdForPretraining", + "BigBirdPretrainingCriterion", + "BigBirdForSequenceClassification", + "BigBirdPretrainingHeads", + "BigBirdForQuestionAnswering", + "BigBirdForTokenClassification", + "BigBirdForMultipleChoice", + "BigBirdForMaskedLM", + "BigBirdForCausalLM", +] + +BIG_BIRD_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/bigbird-roberta-base", + "google/bigbird-roberta-large", + "google/bigbird-base-trivia-itc", +] + + +@dataclass +class BigBirdEncoderLayerOutput(ModelOutput): + + src: Optional[Tuple[paddle.Tensor]] = None + attn_output: Optional[Tuple[paddle.Tensor]] = None + + +class TransformerEncoderLayer(Layer): + def __init__(self, config: BigBirdConfig): + super(TransformerEncoderLayer, self).__init__() + self.config = config + attn_dropout = config.dropout if config.attn_dropout is None else config.attn_dropout + act_dropout = config.dropout if config.act_dropout is None else config.act_dropout + self.normalize_before = config.normalize_before + + weight_attrs = _convert_param_attr_to_list(config.weight_attr, 2) + bias_attrs = _convert_param_attr_to_list(config.bias_attr, 2) + + self.self_attn = MultiHeadAttention( + config.d_model, + config.nhead, + dropout=attn_dropout, + weight_attr=weight_attrs[0], + bias_attr=bias_attrs[0], + attention_type=config.attention_type, + block_size=config.block_size, + window_size=config.window_size, + num_global_blocks=config.num_global_blocks, + num_rand_blocks=config.num_rand_blocks, + seed=config.seed, + ) + self.linear1 = Linear(config.d_model, config.dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) + self.dropout = Dropout(act_dropout, mode="upscale_in_train") + self.linear2 = Linear(config.dim_feedforward, config.d_model, weight_attrs[1], bias_attr=bias_attrs[1]) + self.norm1 = LayerNorm(config.d_model, epsilon=1e-12) + self.norm2 = LayerNorm(config.d_model, epsilon=1e-12) + self.dropout1 = Dropout(config.dropout, mode="upscale_in_train") + self.dropout2 = Dropout(config.dropout, mode="upscale_in_train") + self.activation = getattr(F, config.activation) + self.d_model = config.d_model + self.nhead = config.nhead + + def forward(self, src, src_mask=None, rand_mask_idx=None, 
query_mask=None, key_mask=None): + residual = src + if self.normalize_before: + src = self.norm1(src) + src = self.self_attn(src, src, src, src_mask, rand_mask_idx, query_mask, key_mask) + + attn_output = paddle.reshape(x=src, shape=[src.shape[0], src.shape[1], self.nhead, -1]) + attn_output = paddle.transpose(attn_output, perm=[0, 2, 1, 3]) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + + return BigBirdEncoderLayerOutput( + src=src, + attn_output=attn_output, + ) + + +@dataclass +class BigBirdEncoderOutput(ModelOutput): + + hidden_states: Optional[Tuple[paddle.Tensor]] = None + all_hidden_states: Optional[Tuple[paddle.Tensor]] = None + all_attentions: Optional[Tuple[paddle.Tensor]] = None + + +class TransformerEncoder(Layer): + def __init__(self, encoder_layer, num_layers): + super(TransformerEncoder, self).__init__() + self.layers = LayerList( + [(encoder_layer if i == 0 else type(encoder_layer)(encoder_layer.config)) for i in range(num_layers)] + ) + self.num_layers = num_layers + self.norm = LayerNorm(self.layers[0].d_model, epsilon=1e-12) + self.normalize_before = self.layers[0].normalize_before + + def forward( + self, + src, + src_mask_list=None, + rand_mask_idx_list=None, + query_mask=None, + key_mask=None, + output_hidden_states=False, + output_attentions=False, + ): + # hidden_states and attention lists to be filled if wished + all_hidden_states = [] + all_attentions = [] + + output = src + if not self.normalize_before: + output = self.norm(output) + + hidden_states = output + + for i, mod in enumerate(self.layers): + if output_hidden_states is True: + all_hidden_states.append(hidden_states) + + rand_mask_id = None + if rand_mask_idx_list is not None: + rand_mask_id = rand_mask_idx_list[i] + if i != 0: + output = mod(output.src, None, rand_mask_id, query_mask, key_mask) + if i == 0: + output = mod(output, None, rand_mask_id, query_mask, key_mask) + hidden_states = output.src + attn_output = output.attn_output + + if output_attentions: + all_attentions.append(attn_output) + + # Add last layer + if output_hidden_states is True: + all_hidden_states.append(hidden_states) + + if self.normalize_before: + output = self.norm(output) + return BigBirdEncoderOutput( + hidden_states=output.src, + all_hidden_states=all_hidden_states, + all_attentions=all_attentions, + ) + + +class BigBirdPooler(Layer): + """ + Pool the result of BigBird Encoder + """ + + def __init__(self, hidden_size): + super(BigBirdPooler, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
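+        # The position-0 hidden state is projected through a dense layer followed by
+        # tanh, giving the pooled output consumed by the sentence-level heads below.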
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BigBirdEmbeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.padding_idx) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + ): + if input_ids is not None: + input_shape = input_ids.shape + inputs_embeds = self.word_embeddings(input_ids) + else: + input_shape = inputs_embeds.shape[:-1] + + if position_ids is None: + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + position_ids = seq_length - ones + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.dropout(embeddings) + return embeddings + + +class BigBirdPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained BigBird models. It provides BigBird related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + pretrained_init_configuration = BIGBIRD_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = BIGBIRD_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "bigbird" + model_config_file = CONFIG_NAME + config_class = BigBirdConfig + + def _init_weights(self, layer): + # Initialization hook + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class BigBirdModel(BigBirdPretrainedModel): + """ + The bare BigBird Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + num_layers (int): + Number of hidden layers in the Transformer encoder. + vocab_size (int): + Vocabulary size of `inputs_ids` in `BigBirdModel`. Also is the vocab size of token embedding matrix. 
+ Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `BigBirdModel`. + nhead (int): + Number of attention heads for each attention layer in the Transformer encoder. + attn_dropout (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + dim_feedforward (int, optional): + Dimensionality of the feed-forward (ff) layer in the Transformer encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + activation (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"``, ``"silu"`` and ``"gelu_new"`` are supported. + Defaults to `"gelu"`. + normalize_before (bool, optional): + Indicates whether to put layer normalization into preprocessing of MHA and FFN sub-layers. + If True, pre-process is layer normalization and post-precess includes dropout, + residual connection. Otherwise, no pre-process and post-precess includes dropout, + residual connection, layer normalization. + Defaults to `False`. + block_size (int, optional): + The block size for the attention mask. + Defaults to `1`. + window_size (int, optional): + The number of block in a window. + Defaults to `3`. + num_global_blocks (int, optional): + Number of global blocks per sequence. + Defaults to `1`. + num_rand_blocks (int, optional): + Number of random blocks per row. + Defaults to `2`. + seed (int, optional): + The random seed for generating random block id. + Defaults to ``None``. + pad_token_id (int, optional): + The index of padding token for BigBird embedding. + Defaults to ``0``. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layer and pooler layer. + Defaults to `768`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. + Defaults to `2`. 
+ """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdModel, self).__init__(config) + # embedding + self.embeddings = BigBirdEmbeddings(config) + + # encoder + encoder_layer = TransformerEncoderLayer(config) + self.encoder = TransformerEncoder(encoder_layer, config.num_layers) + # pooler + self.pooler = BigBirdPooler(config.hidden_size) + self.pad_token_id = config.pad_token_id + self.num_layers = config.num_layers + self.config = config + + def _process_mask(self, input_ids, inputs_embeds, attention_mask=None): + # [B, T] + if input_ids is not None: + attention_mask = (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) + else: + input_shape = inputs_embeds.shape[:-1] + attention_mask = paddle.zeros(input_shape, dtype=self.pooler.dense.weight.dtype) + + # [B, 1, T, 1] + query_mask = paddle.unsqueeze(attention_mask, axis=[1, 3]) + # [B, 1, 1, T] + key_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]) + query_mask = 1 - query_mask + key_mask = 1 - key_mask + return attention_mask, query_mask, key_mask + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BigBirdModel forward method, overrides the __call__() special method. + + Args: + input_ids (`Tensor`): + Indices of input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (`Tensor`, optional): + Segment token indices to indicate first and second portions of the inputs. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to ``None``, which means we don't add segment embeddings. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + rand_mask_idx_list (`list`, optional): + A list which contains some tensors used in bigbird random block. 
+ return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Examples: + .. code-block:: + + import paddle + from paddlenlp.transformers import BigBirdModel, BigBirdTokenizer + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdModel.from_pretrained('bigbird-base-uncased') + config = model.config + max_seq_len = 512 + input_ids = tokenizer.convert_tokens_to_ids( + tokenizer( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so" + )) + input_ids.extend([0] * (max_seq_len - len(input_ids))) + seq_len = len(input_ids) + input_ids = paddle.to_tensor([input_ids]) + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list) + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + embedding_output = self.embeddings(input_ids, token_type_ids, inputs_embeds=inputs_embeds) + attention_mask, query_mask, key_mask = self._process_mask(input_ids, inputs_embeds, attention_mask) + batch_size, seq_len = input_shape + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + self.config["num_layers"], + seq_len, + seq_len, + self.config["nhead"], + self.config["block_size"], + self.config["window_size"], + self.config["num_global_blocks"], + self.config["num_rand_blocks"], + self.config["seed"], + ) + rand_mask_idx_list = [paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list] + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + rand_mask_idx_list, + query_mask, + key_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + 
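+        # The encoder returns a BigBirdEncoderOutput dataclass; unpack the final hidden
+        # states plus (optionally) the per-layer hidden states and attention tensors.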
sequence_output = encoder_outputs.hidden_states + hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None + attentions = encoder_outputs.all_attentions if output_attentions else None + pooled_output = self.pooler(encoder_outputs.hidden_states) + if not return_dict: + return sequence_output, pooled_output + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=hidden_states, + attentions=attentions, + ) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + +class BigBirdForSequenceClassification(BigBirdPretrainedModel): + """ + BigBird Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + bigbird (:class:`BigBirdModel`): + An instance of :class:`BigBirdModel`. + num_labels (int, optional): + The number of classes. Defaults to `None`. + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bigbird = BigBirdModel(config) + self.linear = nn.Linear(config.hidden_size, self.num_labels) + self.dropout = nn.Dropout(config.hidden_dropout_prob, mode="upscale_in_train") + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BigBirdForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BigBirdModel`. + token_type_ids (Tensor): + See :class:`BigBirdModel`. + attention_mask (Tensor): + See :class:`BigBirdModel`. + rand_mask_idx_list (list): + See :class:`BigBirdModel`. + inputs_embeds(Tensor, optional): + See :class:`BigBirdModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + Tensor: Returns tensor `output`, a tensor of the input text classification logits. + Its data type should be float32 and it has a shape of [batch_size, num_labels]. + + Examples: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BigBirdForSequenceClassification, BigBirdTokenizer + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForSequenceClassification.from_pretrained('bigbird-base-uncased') + config = model.bigbird.config + max_seq_len = 512 + input_ids = tokenizer.convert_tokens_to_ids( + tokenizer( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so" + )) + input_ids.extend([0] * (max_seq_len - len(input_ids))) + seq_len = len(input_ids) + input_ids = paddle.to_tensor([input_ids]) + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list) + print(output) + """ + outputs = self.bigbird( + input_ids, + token_type_ids, + attention_mask=attention_mask, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.linear(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdLMPredictionHead(Layer): + def __init__(self, config: BigBirdConfig): + super(BigBirdLMPredictionHead, self).__init__() + self.transform = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = getattr(nn.functional, config.activation) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12) + self.decoder = 
nn.Linear(config.vocab_size, config.hidden_size) + self.decoder.weight = ( + self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.transform.weight.dtype, is_bias=False + ) + if config.embedding_weights is None + else config.embedding_weights + ) + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder.weight.dtype, is_bias=True + ) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder.weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +class BigBirdPretrainingHeads(Layer): + """ + The BigBird pretraining heads for a pretraining task. + + Args: + hidden_size (int): + See :class:`BigBirdModel`. + vocab_size (int): + See :class:`BigBirdModel`. + activation (str): + See :class:`BigBirdModel`. + embedding_weights (Tensor, optional): + The weight of pretraining embedding layer. Its data type should be float32 + and its shape is [hidden_size, vocab_size]. + If set to `None`, use normal distribution to initialize weight. + Defaults to `None`. + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdPretrainingHeads, self).__init__() + self.predictions = BigBirdLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output, masked_positions=None): + r""" + The BigBirdPretrainingHeads forward method, overrides the __call__() special method. + + Args: + sequence_output (Tensor): + The sequence output of BigBirdModel. Its data type should be float32 and + has a shape of [batch_size, sequence_length, hidden_size]. + pooled_output (Tensor): + The pooled output of BigBirdModel. Its data type should be float32 and + has a shape of [batch_size, hidden_size]. + masked_positions (Tensor): + A tensor indicates positions to be masked in the position embedding. + Its data type should be int64 and its shape is [batch_size, mask_token_num]. + `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. + Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. + + Returns: + tuple: (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - prediction_scores (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - seq_relationship_score (Tensor): + The logits whether 2 sequences are NSP relationship. Its data type should be float32 and + has a shape of [batch_size, 2]. + """ + prediction_scores = self.predictions(sequence_output, masked_positions) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@dataclass +class BigBirdForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. 
+ + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + prediction_logits: paddle.Tensor = None + seq_relationship_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class BigBirdForPretraining(BigBirdPretrainedModel): + """ + BigBird Model with pretraining tasks on top. + + Args: + bigbird (:class:`BigBirdModel`): + An instance of :class:`BigBirdModel`. + + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForPretraining, self).__init__(config) + self.bigbird = BigBirdModel(config) + self.cls = BigBirdPretrainingHeads(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + masked_positions: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + next_sentence_label: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BigBirdForPretraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BigBirdModel`. + token_type_ids (Tensor): + See :class:`BigBirdModel`. + attention_mask (Tensor): + See :class:`BigBirdModel`. + rand_mask_idx_list (list): + See :class:`BigBirdModel`. + masked_positions (list): + A tensor indicates positions to be masked in the position embedding. + Its data type should be int64 and its shape is [batch_size, mask_token_num]. + `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. + Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. + inputs_embeds(Tensor, optional): + See :class:`BigBirdModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. 
+ output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.bert.BertForPreTrainingOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.bert.BertForPreTrainingOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.bert.BertForPreTrainingOutput`. + + Examples: + .. code-block:: + + import paddle + from paddlenlp.transformers import BigBirdForPretraining, BigBirdTokenizer + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForPretraining.from_pretrained('bigbird-base-uncased') + config = model.bigbird.config + max_seq_len = 512 + input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights = tokenizer.encode( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so", max_seq_len=max_seq_len) + + seq_len = len(input_ids) + input_ids = paddle.to_tensor([input_ids]) + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + output = model(input_ids, rand_mask_idx_list=rand_mask_idx_list) + print(output) + """ + outputs = self.bigbird( + input_ids, + token_type_ids=token_type_ids, + attention_mask=None, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_positions) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + next_sentence_loss = loss_fct(seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,))) + total_loss = masked_lm_loss + next_sentence_loss + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdPretrainingCriterion(paddle.nn.Layer): + """ + BigBird Criterion for a pretraining task on top. 
+ + Args: + vocab_size (int): + See :class:`BigBirdModel`. + use_nsp (bool, optional): + It decides whether it considers Next Sentence Prediction loss. + Defaults to `False`. + ignore_index (int): + Specifies a target value that is ignored and does + not contribute to the input gradient. Only valid + if :attr:`soft_label` is set to :attr:`False`. + Defaults to `0`. + """ + + def __init__(self, config: BigBirdConfig, use_nsp=False, ignore_index=0): + super(BigBirdPretrainingCriterion, self).__init__() + # CrossEntropyLoss is expensive since the inner reshape (copy) + self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1) + self.vocab_size = config.vocab_size + self.use_nsp = use_nsp + self.ignore_index = ignore_index + + def forward( + self, + prediction_scores, + seq_relationship_score, + masked_lm_labels, + next_sentence_labels, + masked_lm_scale, + masked_lm_weights, + ): + r""" + The BigBirdPretrainingCriterion forward method, overrides the __call__() special method. + + Args: + prediction_scores (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + seq_relationship_score (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + masked_lm_labels (Tensor): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. + Otherwise, its shape is [batch_size, mask_token_num, 1]. + next_sentence_labels (Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1]. + masked_lm_scale (Tensor or int): + The scale of masked tokens. Used for the normalization of masked language modeling loss. + If it is a `Tensor`, its data type should be int64 and its shape is equal to `prediction_scores`. + masked_lm_weights (Tensor): + The weight of masked tokens. Its data type should be float32 and its shape + is [mask_token_num, 1]. + + Returns: + Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. + Its data type should be float32 and its shape is [1]. + + Example: + .. 
code-block:: + + import numpy as np + import paddle + from paddlenlp.transformers import BigBirdForPretraining, BigBirdTokenizer, BigBirdPretrainingCriterion + from paddlenlp.transformers import create_bigbird_rand_mask_idx_list + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForPretraining.from_pretrained('bigbird-base-uncased') + config = model.bigbird.config + criterion = BigBirdPretrainingCriterion(config["vocab_size"], False) + max_seq_len = 512 + max_pred_length=75 + input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights = tokenizer.encode( + "This is a docudrama story on the Lindy Chamberlain case and a look at " + "its impact on Australian society It especially looks at the problem of " + "innuendo gossip and expectation when dealing with reallife dramasbr br " + "One issue the story deals with is the way it is expected people will all " + "give the same emotional response to similar situations Not everyone goes " + "into wild melodramatic hysterics to every major crisis Just because the " + "characters in the movies and on TV act in a certain way is no reason to " + "expect real people to do so", max_seq_len=max_seq_len, max_pred_len=max_pred_length) + + seq_len = len(input_ids) + masked_lm_positions_tmp = np.full(seq_len, 0, dtype=np.int32) + masked_lm_ids_tmp = np.full([seq_len, 1], -1, dtype=np.int64) + masked_lm_weights_tmp = np.full([seq_len], 0, dtype="float32") + + mask_token_num = 0 + for i, x in enumerate([input_ids]): + for j, pos in enumerate(masked_lm_positions): + masked_lm_positions_tmp[mask_token_num] = i * seq_len + pos + masked_lm_ids_tmp[mask_token_num] = masked_lm_ids[j] + masked_lm_weights_tmp[mask_token_num] = masked_lm_weights[j] + + masked_lm_positions = masked_lm_positions_tmp + masked_lm_ids = masked_lm_ids_tmp + masked_lm_weights = masked_lm_weights_tmp + print(masked_lm_ids.shape) + input_ids = paddle.to_tensor([input_ids]) + masked_lm_positions = paddle.to_tensor(masked_lm_positions) + masked_lm_ids = paddle.to_tensor(masked_lm_ids, dtype='int64') + masked_lm_weights = paddle.to_tensor(masked_lm_weights) + masked_lm_scale = 1.0 + next_sentence_labels = paddle.zeros(shape=(1, 1), dtype='int64') + + rand_mask_idx_list = create_bigbird_rand_mask_idx_list( + config["num_layers"], seq_len, seq_len, config["nhead"], + config["block_size"], config["window_size"], config["num_global_blocks"], + config["num_rand_blocks"], config["seed"]) + rand_mask_idx_list = [ + paddle.to_tensor(rand_mask_idx) for rand_mask_idx in rand_mask_idx_list + ] + prediction_scores, seq_relationship_score = model(input_ids, rand_mask_idx_list=rand_mask_idx_list, masked_positions=masked_lm_positions) + + loss = criterion(prediction_scores, seq_relationship_score, + masked_lm_ids, next_sentence_labels, + masked_lm_scale, masked_lm_weights) + print(loss) + """ + masked_lm_loss = F.cross_entropy( + prediction_scores, masked_lm_labels, ignore_index=self.ignore_index, reduction="none" + ) + masked_lm_loss = paddle.transpose(masked_lm_loss, [1, 0]) + masked_lm_loss = paddle.sum(masked_lm_loss * masked_lm_weights) / (paddle.sum(masked_lm_weights) + 1e-5) + scale = 1.0 + if not self.use_nsp: + scale = 0.0 + next_sentence_loss = F.cross_entropy(seq_relationship_score, next_sentence_labels, reduction="none") + return masked_lm_loss + paddle.mean(next_sentence_loss) * scale + + +class BigBirdIntermediate(Layer): + def __init__(self, config: BigBirdConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.dim_feedforward) + 
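+        # config.activation may be a string key into ACT2FN or an already-constructed
+        # callable; resolve it here so forward() can apply it directly.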
if isinstance(config.activation, str): + self.intermediate_act_fn = ACT2FN[config.activation] + else: + self.intermediate_act_fn = config.activation + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BigBirdOutput(Layer): + def __init__(self, config: BigBirdConfig): + super().__init__() + self.dense = nn.Linear(config.dim_feedforward, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdForQuestionAnswering(BigBirdPretrainedModel): + """ + BigBird Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + bigbird (:class:`BigBirdModel`): + An instance of BigBirdModel. + dropout (float, optional): + The dropout probability for output of BigBirdModel. + If None, use the same value as `hidden_dropout_prob` of `BigBirdModel` + instance `bigbird`. Defaults to `None`. + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForQuestionAnswering, self).__init__(config) + self.bigbird = BigBirdModel(config) # allow bigbird to be config + self.dropout = nn.Dropout( + config.dropout if config.dropout is not None else self.bigbird.config["hidden_dropout_prob"] + ) + self.classifier = nn.Linear(self.bigbird.config["hidden_size"], 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BigBirdForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BigBirdModel`. + token_type_ids (Tensor, optional): + See :class:`BigBirdModel`. + attention_mask (Tensor): + See :class:`BigBirdModel`. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. + inputs_embeds(Tensor, optional): + See :class:`BigBirdModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.bigbird.modeling import BigBirdForQuestionAnswering + from paddlenlp.transformers.bigbird.tokenizer import BigBirdTokenizer + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForQuestionAnswering.from_pretrained('bigbird-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors='pd') + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits =outputs[1] + """ + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + outputs = self.bigbird( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @staticmethod + def prepare_question_mask(q_lengths, maxlen): + mask = paddle.arange(0, maxlen, dtype="int64").unsqueeze_(0) + mask = mask < q_lengths + return mask + + +class BigBirdForTokenClassification(BigBirdPretrainedModel): + """ + BigBird Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + bigbird (:class:`BigBirdModel`): + An instance of BigBirdModel. + num_labels (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of BIGBIRD. + If None, use the same value as `hidden_dropout_prob` of `BigBirdModel` + instance `bigbird`. Defaults to None. 
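# --- Editor's sketch (not part of the patch) ---------------------------------
# A small sketch of turning the start/end logits produced by
# BigBirdForQuestionAnswering into a predicted answer span. The logits here
# are hypothetical numpy arrays standing in for the [batch_size, seq_len]
# tensors returned above; a real pipeline would also mask out question and
# padding positions before taking the argmax and would map token indices back
# to text with the tokenizer's offsets.
import numpy as np

start_logits = np.array([[0.1, 2.5, 0.3, 0.2, 0.0]])  # hypothetical values
end_logits = np.array([[0.0, 0.1, 0.2, 3.0, 0.1]])

start_idx = int(start_logits[0].argmax())   # 1
end_idx = int(end_logits[0].argmax())       # 3
if end_idx < start_idx:                      # guard against inverted spans
    end_idx = start_idx
print((start_idx, end_idx))                  # (1, 3): predicted token span
# -----------------------------------------------------------------------------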
+ """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.bigbird = BigBirdModel(config) + self.dropout = nn.Dropout(config.dropout if config.dropout is not None else config.hidden_dropout_prob) + self.classifier = nn.Linear(self.bigbird.config["hidden_size"], self.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BigBirdForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BigBirdModel`. + token_type_ids (Tensor, optional): + See :class:`BigBirdModel`. + attention_mask (Tensor): + See :class:`BigBirdModel`. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + inputs_embeds(Tensor, optional): + See :class:`BigBirdModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.bigbird.modeling import BigBirdForTokenClassification + from paddlenlp.transformers.bigbird.tokenizer import BigBirdTokenizer + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForTokenClassification.from_pretrained('bigbird-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors='pd') + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs + """ + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + outputs = self.bigbird( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdForMultipleChoice(BigBirdPretrainedModel): + """ + BigBird Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks . + + Args: + bigbird (:class:`BigBirdModel`): + An instance of BigBirdModel. + num_choices (int, optional): + The number of choices. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of BIGBIRD. + If None, use the same value as `hidden_dropout_prob` of `BigBirdModel` + instance `bigbird`. Defaults to None. + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForMultipleChoice, self).__init__(config) + self.bigbird = BigBirdModel(config) # allow bigbird to be config + self.num_choices = config.num_choices + self.dropout = nn.Dropout( + config.dropout if config.dropout is not None else self.bigbird.config["hidden_dropout_prob"] + ) + self.classifier = nn.Linear(self.bigbird.config["hidden_size"], 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + token_type_ids: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The BigBirdForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`BigBirdModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (Tensor): + See :class:`BigBirdModel` and shape as [batch_size, num_choice, n_head, sequence_length, sequence_length]. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. 
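# --- Editor's sketch (not part of the patch) ---------------------------------
# A shape-only sketch of the loss flattening performed in the token
# classification forward pass above: per-token logits of shape
# [batch_size, seq_len, num_labels] and integer labels of shape
# [batch_size, seq_len] are both flattened so that CrossEntropyLoss compares
# [batch*seq_len, num_labels] against [batch*seq_len]. Shapes are hypothetical
# and the tensors are random.
import paddle

num_labels = 3
logits = paddle.randn([2, 4, num_labels])                 # [batch, seq_len, num_labels]
labels = paddle.randint(0, num_labels, shape=[2, 4])      # [batch, seq_len]

loss_fct = paddle.nn.CrossEntropyLoss()
loss = loss_fct(logits.reshape((-1, num_labels)), labels.reshape((-1,)))
print(loss)                                               # scalar mean loss
# -----------------------------------------------------------------------------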
+ labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + inputs_embeds(Tensor, optional): + See :class:`BigBirdModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.bigbird.modeling import BigBirdForMultipleChoice + from paddlenlp.transformers.bigbird.tokenizer import BigBirdTokenizer + + tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased') + model = BigBirdForTokenClassification.from_pretrained('bigbird-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors='pd') + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs + """ + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + # input_ids: [bs, num_choice, seq_l] + if input_ids is not None: + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, *attention_mask.shape[2:])) + + if rand_mask_idx_list is not None: + rand_mask_idx_list = rand_mask_idx_list.reshape(shape=(-1, *rand_mask_idx_list.shape[2:])) + + if inputs_embeds is not None: + inputs_embeds = inputs_embeds.reshape(shape=(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + + outputs = self.bigbird( + input_ids, + attention_mask=attention_mask, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdForMaskedLM(BigBirdPretrainedModel): + """ + BigBird Model with a `language modeling` head on top. 
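# --- Editor's sketch (not part of the patch) ---------------------------------
# A shape-only sketch of the multiple-choice reshaping done above: the choice
# dimension is folded into the batch before the encoder runs, and the single
# logit produced per (example, choice) pair is folded back out before the
# cross-entropy over choices. Sizes are hypothetical.
import numpy as np

batch_size, num_choices, seq_len = 2, 4, 16
input_ids = np.zeros((batch_size, num_choices, seq_len), dtype=np.int64)

flat_input_ids = input_ids.reshape(-1, seq_len)           # [bs*num_choices, seq_len]
logits = np.zeros((flat_input_ids.shape[0], 1))           # one logit per choice
reshaped_logits = logits.reshape(-1, num_choices)         # [bs, num_choices]
print(flat_input_ids.shape, reshaped_logits.shape)        # (8, 16) (2, 4)
# -----------------------------------------------------------------------------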
+ + Args: + BigBird (:class:`BigBirdModel`): + An instance of :class:`BigBirdModel`. + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForMaskedLM, self).__init__(config) + self.bigbird = BigBirdModel(config) + self.lm_head = BigBirdLMPredictionHead(config) + self.tie_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`BigBirdModel`. + attention_mask (Tensor): + See :class:`BigBirdModel`. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. + inputs_embeds (Tensor, optional): + See :class:`BigBirdModel`. + labels (Tensor, optional): + The Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., vocab_size]`` Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., vocab_size]`` Its shape is [batch_size, sequence_length]. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + tuple: Returns tuple (`masked_lm_loss`, `prediction_scores`, ``sequence_output`). + + With the fields: + + - `masked_lm_loss` (Tensor): + The masked lm loss. Its data type should be float32 and its shape is [1]. + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. Its shape is [batch_size, sequence_length, vocab_size]. + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. Its data type should be float32. Its shape is `[batch_size, sequence_length, hidden_size]`. + + + """ + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + outputs = self.bigbird( + input_ids, + attention_mask=attention_mask, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.reshape(shape=(-1, self.bigbird.config["vocab_size"])), + labels.reshape(shape=(-1,)), + ) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) + if masked_lm_loss is not None + else (output[0] if len(output) == 1 else output) + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdForCausalLM(BigBirdPretrainedModel): + """ + BigBird Model for casual language model tasks. 
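# --- Editor's sketch (not part of the patch) ---------------------------------
# A minimal sketch of how masked-LM labels are commonly built for the loss in
# BigBirdForMaskedLM above: positions that were not masked receive the ignore
# index -100 (as described in the `labels` docstring), so only masked
# positions contribute to the cross-entropy. The ids, masked positions and
# [MASK] id below are hypothetical.
import numpy as np

input_ids = np.array([101, 2023, 2003, 1037, 3231, 102])   # hypothetical ids
masked_positions = [2, 4]

labels = np.full_like(input_ids, -100)        # ignored everywhere by default
labels[masked_positions] = input_ids[masked_positions]
masked_inputs = input_ids.copy()
masked_inputs[masked_positions] = 103         # hypothetical [MASK] id
print(labels)                                 # [-100 -100 2003 -100 3231 -100]
# -----------------------------------------------------------------------------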
+ + Args: + BigBird (:class:`BigBirdModel`): + An instance of :class:`BigBirdModel`. + + """ + + def __init__(self, config: BigBirdConfig): + super(BigBirdForCausalLM, self).__init__(config) + self.bigbird = BigBirdModel(config) + self.lm_head = BigBirdLMPredictionHead(config) + self.tie_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + rand_mask_idx_list: Optional[List] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`BigBirdModel`. + attention_mask (Tensor): + See :class:`BigBirdModel`. + rand_mask_idx_list (`List`): + See :class:`BigBirdModel`. + inputs_embeds (Tensor, optional): + See :class:`BigBirdModel`. + labels (Tensor, optional): + The Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., vocab_size]`` Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., vocab_size]`` Its shape is [batch_size, sequence_length]. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple (`masked_lm_loss`, `prediction_scores`, ``sequence_output`). + + With the fields: + + - `masked_lm_loss` (Tensor): + The masked lm loss. Its data type should be float32 and its shape is [1]. + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. Its shape is [batch_size, sequence_length, vocab_size]. + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. Its data type should be float32. Its shape is `[batch_size, sequence_length, hidden_size]`. 
+ + + """ + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + outputs = self.bigbird( + input_ids, + attention_mask=attention_mask, + rand_mask_idx_list=rand_mask_idx_list, + inputs_embeds=inputs_embeds, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + labels = labels[:, 1:] + loss_fct = nn.CrossEntropyLoss() + lm_loss = loss_fct( + paddle.reshape(shifted_prediction_scores, [-1, self.bigbird.config["vocab_size"]]), + paddle.reshape(labels, [-1]), + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else (output[0] if len(output) == 1 else output) + + return MaskedLMOutput( + loss=lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/tokenizer.py new file mode 100644 index 000000000..9bc5ee70e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bigbird/tokenizer.py @@ -0,0 +1,400 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Google Research and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import warnings + +import numpy as np +import sentencepiece as spm + +from paddlenlp.data.vocab import Vocab + +from ..albert.tokenizer import AlbertEnglishTokenizer + +__all__ = ["BigBirdTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bigbird-base-uncased": 4096} + + +class BigBirdTokenizer(AlbertEnglishTokenizer): + """ + Constructs an BigBird tokenizer based on `SentencePiece `__. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + sentencepiece_model_file (str): + The vocabulary file (ends with '.spm') required to instantiate + a `SentencePiece `__ tokenizer. + do_lower_case (bool): Whether the text strips accents and convert to + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". 
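# --- Editor's sketch (not part of the patch) ---------------------------------
# A tiny sketch of the next-token alignment used in BigBirdForCausalLM above:
# the prediction at position t is scored against the label at position t+1,
# so prediction scores are truncated on the right and labels shifted on the
# left before the cross-entropy. Values and vocabulary size are hypothetical.
import numpy as np

labels = np.array([[11, 12, 13, 14]])              # [batch, seq_len]
prediction_scores = np.zeros((1, 4, 32))           # [batch, seq_len, vocab]

shifted_scores = prediction_scores[:, :-1, :]      # predictions for t = 0..2
shifted_labels = labels[:, 1:]                     # targets 12, 13, 14
print(shifted_scores.shape, shifted_labels)        # (1, 3, 32) [[12 13 14]]
# -----------------------------------------------------------------------------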
+ pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Raises: + ValueError: If file sentencepiece_model_file doesn't exist. + + """ + + resource_files_names = { + "sentencepiece_model_file": "sentencepiece_gpt2.model", + } # for save_pretrained + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "bigbird-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/bigbird/sentencepiece_gpt2.model", + }, + } + pretrained_init_configuration = { + "bigbird-base-uncased": {"do_lower_case": False}, + } + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=False, + remove_space=True, + keep_accents=True, + eos_token="", + unk_token="", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + extra_ids=100, + additional_special_tokens=[], + sp_model_kwargs=None, + encoding="utf8", + **kwargs + ): + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.extra_ids = extra_ids + self.sentencepiece_model_file = sentencepiece_model_file + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_file) + self.encoding = encoding + vocab_dict = {} + for id in range(self.sp_model.get_piece_size()): + vocab_dict[self.sp_model.id_to_piece(id)] = id + vocab_ = Vocab.from_dict(vocab_dict, unk_token=unk_token) + self.start_word_tokens = np.array([vocab_._idx_to_token[i][0] == "▁" for i in range(0, len(vocab_))]) + + self.unk_token = unk_token + self.mask_id = vocab_dict[mask_token] if mask_token in vocab_dict else 0 + self.unk_id = vocab_dict[unk_token] if unk_token in vocab_dict else 0 + self.cls_id = vocab_dict[cls_token] if cls_token in vocab_dict else 0 + self.sep_id = vocab_dict[sep_token] if sep_token in vocab_dict else 0 + self.pad_id = vocab_dict[pad_token] if pad_token in vocab_dict else 0 + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + is_split_into_words=False, + padding=None, + truncation="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + if "pad_to_max_seq_len" in kwargs and padding is None: + pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") + padding = "max_length" if pad_to_max_seq_len else False + elif padding is None: + padding = False + + if "max_seq_len" in kwargs and max_length is None: + max_length = kwargs["max_seq_len"] + + if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": + truncation = kwargs["truncation_strategy"] + + return super(BigBirdTokenizer, self).__call__( + text=text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + 
return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.sp_model) + self.extra_ids + + def _add_eos_if_not_present(self, token_ids): + """Do not add eos again if user already added it.""" + if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: + warnings.warn( + f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added." + ) + return token_ids + else: + return token_ids + [self.eos_token_id] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1): + """ + Build model inputs from a sequence or a pair of sequence. + + An BigBird sequence has the following format: + + - single sequence: ``X `` + - pair of sequences: ``A B `` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + + """ + token_ids_0 = self._add_eos_if_not_present(token_ids_0) + if token_ids_1 is None: + return token_ids_0 + else: + token_ids_1 = self._add_eos_if_not_present(token_ids_1) + return token_ids_0 + token_ids_1 + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + [(0, 0)] + + return offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences. + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + + """ + eos = [self.eos_token_id] + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. 
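# --- Editor's sketch (not part of the patch) ---------------------------------
# A plain-Python sketch of the special-token layout implemented by
# _add_eos_if_not_present / build_inputs_with_special_tokens above: an eos id
# is appended to each sequence (only if it is not already there), and a pair
# is simply the two eos-terminated sequences concatenated. The token ids and
# the eos id are hypothetical.
EOS_ID = 1

def add_eos(ids):
    return ids if ids and ids[-1] == EOS_ID else ids + [EOS_ID]

seq_a = [5, 6, 7]
seq_b = [8, 9]
print(add_eos(seq_a))                     # [5, 6, 7, 1]           single sequence
print(add_eos(seq_a) + add_eos(seq_b))    # [5, 6, 7, 1, 8, 9, 1]  sequence pair
# -----------------------------------------------------------------------------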
+ """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + # normal case: some special tokens + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " " + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode_pieces(current_sub_tokens) + return out_string.strip() + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token.startswith("", token) + num = int(match.group(1)) + return self.vocab_size - num - 1 + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index < self.sp_model.get_piece_size(): + token = self.sp_model.IdToPiece(index) + else: + token = f"" + return token + + def _encode(self, text, max_seq_len=None, max_pred_len=None, masked_lm_prob=0.15): + """ + Returns a tuple containing the encoded sequence and mask information. + + Args: + text (str,list[str] or list[int]): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If set to None, will not limit the total sequence. + Defaults to None. + max_pred_len (int, optional): + If set to a number, will limit the mask sequence returned so that it has a maximum prediction length. + If set to None, will not limit the mask sequence. + masked_lm_prob (float, optional): + The probability of the token to be masked. Defaults to `0.15`. + Returns: + tuple: Returns tuple (span_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights). + """ + + def get_input_ids(text): + if isinstance(text, str): + text = re.sub("[\n]+", "", text) + tokens = self._tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
+ ) + + ids = get_input_ids(text) + # Find the span for in the text + max_seq_len = len(ids) if max_seq_len is None else max_seq_len + max_pred_len = len(ids) if max_pred_len is None else max_pred_len + + end_pos = max_seq_len - 2 + np.random.randint(max(1, len(ids) - max_seq_len - 2)) + start_pos = max(0, end_pos - max_seq_len + 2) + span_ids = ids[start_pos:end_pos] + + word_begin_flag = self.start_word_tokens[span_ids] + word_begin_pos = np.flatnonzero(word_begin_flag).astype(np.int32) + if word_begin_pos.size == 0: + word_begin_pos = np.arange(len(span_ids), dtype=np.int32) + word_begin_flag = np.logical_not(word_begin_flag) + + first_start_pos = word_begin_pos[0] + span_ids = span_ids[first_start_pos:] + num_tokens = len(span_ids) + word_begin_pos = word_begin_pos - first_start_pos + words = np.split(np.arange(len(span_ids), dtype="int32"), word_begin_pos)[1:] + assert len(words) == len(word_begin_pos) + num_to_predict = min(max_pred_len, max(1, int(round(len(word_begin_pos) * masked_lm_prob)))) + + masked_lm_positions = np.concatenate( + np.random.choice(np.array([[]] + words, dtype=np.object)[1:], num_to_predict, replace=False), 0 + ) + if len(masked_lm_positions) > max_pred_len: + masked_lm_positions = masked_lm_positions[: max_pred_len + 1] + truncate_masking_flag = np.flatnonzero(word_begin_flag[masked_lm_positions]) + if len(truncate_masking_flag) == 0: + truncate_masking_index = max_pred_len + else: + truncate_masking_index = truncate_masking_flag[-1] + masked_lm_positions = masked_lm_positions[:truncate_masking_index] + span_ids = np.array(span_ids, dtype="int32") + masked_lm_positions = np.sort(masked_lm_positions) + masked_lm_ids = np.array(span_ids)[masked_lm_positions] + + random_prob = np.random.rand(len(masked_lm_positions)) + mask_pos = masked_lm_positions[random_prob < 0.8] + random_pos = masked_lm_positions[random_prob > 0.9] + span_ids[mask_pos] = self.mask_id + span_ids[random_pos] = np.random.randint(self.unk_id + 1, self.vocab_size, len(random_pos), dtype=np.int32) + span_ids = np.concatenate( + [np.array([self.cls_id], dtype=np.int32), span_ids, np.array([self.sep_id], dtype=np.int32)] + ) + padding_len = max_seq_len - num_tokens - 2 + span_ids = np.pad(span_ids, [0, padding_len], "constant") + pred_padding_len = max_pred_len - len(masked_lm_positions) + masked_lm_weights = np.pad( + np.ones_like(masked_lm_positions, dtype=np.float32), [0, pred_padding_len], "constant" + ) + masked_lm_positions = np.pad(masked_lm_positions + 1, [0, pred_padding_len], "constant") + masked_lm_ids = np.pad(masked_lm_ids, [0, pred_padding_len], "constant") + return span_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
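# --- Editor's sketch (not part of the patch) ---------------------------------
# A small numpy sketch of the masking rule applied near the end of _encode
# above: roughly 80% of the selected positions are replaced by the [MASK] id,
# about 10% by a random vocabulary id, and the remaining ~10% keep their
# original token. The ids, mask id and vocabulary size are hypothetical.
import numpy as np

rng = np.random.default_rng(0)
span_ids = np.array([5, 9, 13, 21, 34, 55], dtype=np.int32)
masked_lm_positions = np.array([1, 3, 4])
mask_id, vocab_size = 3, 100

random_prob = rng.random(len(masked_lm_positions))
mask_pos = masked_lm_positions[random_prob < 0.8]        # -> [MASK]
random_pos = masked_lm_positions[random_prob > 0.9]      # -> random id
span_ids[mask_pos] = mask_id
span_ids[random_pos] = rng.integers(1, vocab_size, len(random_pos), dtype=np.int32)
print(span_ids)
# -----------------------------------------------------------------------------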
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/configuration.py new file mode 100644 index 000000000..786e446a9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/configuration.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BiT model configuration""" + +from ..configuration_utils import PretrainedConfig + +__all__ = ["BitConfig"] + + +class BitConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BiT + [google/bit-50](https://huggingface.co/google/bit-50) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embedding_size (`int`, *optional*, defaults to 64): + Dimensionality (hidden size) for the embedding layer. + hidden_sizes (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`): + Dimensionality (hidden size) at each stage. + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 3]`): + Depth (number of layers) for each stage. + layer_type (`str`, *optional*, defaults to `"preactivation"`): + The layer to use, it can be either `"preactivation"` or `"bottleneck"`. + hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` + are supported. + global_padding (`str`, *optional*): + Padding strategy to use for the convolutional layers. Can be either `"valid"`, `"same"`, or `None`. + num_groups (`int`, *optional*, defaults to `32`): + Number of groups used for the `BitGroupNormActivation` layers. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop path rate for the stochastic depth. + embedding_dynamic_padding (`bool`, *optional*, defaults to `False`): + Whether or not to make use of dynamic padding for the embedding layer. + output_stride (`int`, *optional*, defaults to 32): + The output stride of the model. 
+ width_factor (`int`, *optional*, defaults to 1): + The width factor for the model. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). Will default to the last stage if unset. + + Examples: + + ```python + >>> from paddlenlp.transformers import BitConfig, BitModel + + >>> # Initializing a BiT bit-50 style configuration + >>> configuration = BitConfig() + + >>> # Initializing a model (with random weights) from the bit-50 style configuration + >>> model = BitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "bit" + layer_types = ["preactivation", "bottleneck"] + supported_padding = ["SAME", "VALID"] + + def __init__( + self, + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + layer_type="preactivation", + hidden_act="relu", + global_padding=None, + num_groups=32, + drop_path_rate=0.0, + embedding_dynamic_padding=False, + output_stride=32, + width_factor=1, + out_features=None, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + if layer_type not in self.layer_types: + raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") + if global_padding is not None: + if global_padding.upper() in self.supported_padding: + global_padding = global_padding.upper() + else: + raise ValueError(f"Padding strategy {global_padding} not supported") + self.num_channels = num_channels + self.embedding_size = embedding_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.layer_type = layer_type + self.hidden_act = hidden_act + self.global_padding = global_padding + self.num_groups = num_groups + self.drop_path_rate = drop_path_rate + self.embedding_dynamic_padding = embedding_dynamic_padding + self.output_stride = output_stride + self.width_factor = width_factor + + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + if out_features is not None: + if not isinstance(out_features, list): + raise ValueError("out_features should be a list") + for feature in out_features: + if feature not in self.stage_names: + raise ValueError( + f"Feature {feature} is not a valid feature name. Valid names are {self.stage_names}" + ) + self.out_features = out_features diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/image_processing.py new file mode 100644 index 000000000..d6b373b88 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/image_processing.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
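# --- Editor's sketch (not part of the patch) ---------------------------------
# A short sketch of how BitConfig derives its stage names from `depths` and
# validates `out_features` against them, mirroring the checks in __init__
# above. The depths and requested features here are hypothetical.
depths = [3, 4, 6, 3]
stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
print(stage_names)             # ['stem', 'stage1', 'stage2', 'stage3', 'stage4']

out_features = ["stage2", "stage4"]
for feature in out_features:
    if feature not in stage_names:
        raise ValueError(f"Feature {feature} is not a valid feature name.")
# -----------------------------------------------------------------------------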
+"""Image processor class for BiT.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + center_crop, + convert_to_rgb, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = ["BitImageProcessor"] + + +class BitImageProcessor(BaseImageProcessor): + r""" + Constructs a BiT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Image standard deviation. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). 
Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
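# --- Editor's sketch (not part of the patch) ---------------------------------
# A numpy sketch of the rescale and normalize steps defined just above: pixel
# values are first scaled by rescale_factor (1/255 by default) and then
# standardized channel-wise with image_mean / image_std (the class defaults
# are reused here). The 1x1 input image is a hypothetical stand-in.
import numpy as np

image = np.array([[[0.0, 128.0, 255.0]]], dtype=np.float32)   # 1x1 pixel, HWC
rescale_factor = 1 / 255
image_mean = np.array([0.48145466, 0.4578275, 0.40821073])
image_std = np.array([0.26862954, 0.26130258, 0.27577711])

rescaled = image * rescale_factor                   # values now in [0, 1]
normalized = (rescaled - image_mean) / image_std    # zero-centred per channel
print(normalized)
# -----------------------------------------------------------------------------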
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
+ images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/modeling.py new file mode 100644 index 000000000..63a6a71ee --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bit/modeling.py @@ -0,0 +1,915 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Paddle BiT model. Also supports backbone for ViT hybrid.""" + +import collections +import math +from typing import Optional, Tuple + +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...utils.initializer import kaiming_normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import ( + BackboneOutput, + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from ..model_utils import BackboneMixin, PretrainedModel +from .configuration import BitConfig + +__all__ = [ + "BitPretrainedModel", + "BitModel", + "BitForImageClassification", + "BitBackbone", +] + + +def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tuple[Tuple, bool]: + r""" + Utility function to get the tuple padding value given the kernel_size and padding. + + Args: + padding (Union[`str`, `int`], *optional*): + Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from + Paddle is used. + kernel_size (`int`, *optional*, defaults to 7): + Kernel size of the convolution layers. + stride (`int`, *optional*, defaults to 1): + Stride value of the convolution layers. + dilation (`int`, *optional*, defaults to 1): + Dilation value of the convolution layers. 
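# --- Editor's sketch (not part of the patch) ---------------------------------
# A hedged end-to-end usage sketch of the preprocess pipeline defined above
# (convert to RGB -> resize -> center crop -> rescale -> normalize ->
# channel-first), using the processor defaults on a synthetic image. It
# assumes BitImageProcessor is exported from paddlenlp.transformers like the
# other classes in this patch; the expected output shape follows from the
# default crop_size of 224.
import numpy as np
from PIL import Image
from paddlenlp.transformers import BitImageProcessor   # assumed export

image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))

processor = BitImageProcessor()
batch = processor.preprocess(image, return_tensors="np")
print(batch["pixel_values"][0].shape)                   # (3, 224, 224)
# -----------------------------------------------------------------------------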
+ """ + dynamic = False + if padding is None: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding, dynamic + + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == "same": + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0: + # static case, no extra overhead + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == "valid": + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding, dynamic + + +class WeightStandardizedConv2D(nn.Conv2D): + """Conv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model. + + Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight + Standardization](https://arxiv.org/abs/1903.10520v2) + """ + + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding="SAME", + dilation=1, + groups=1, + bias=False, + epsilon=1e-6, + ): + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + ) + if is_dynamic: + self.pad = DynamicPad2d(kernel_size, stride, dilation) + else: + self.pad = None + self.epsilon = epsilon + + def forward(self, hidden_state): + if self.pad is not None: + hidden_state = self.pad(hidden_state) + w = self.weight + v, m = paddle.var(w, axis=[1, 2, 3], keepdim=True, unbiased=False), paddle.mean( + w, axis=[1, 2, 3], keepdim=True + ) + w = (w - m) / paddle.sqrt(v + self.epsilon) + + hidden_state = F.conv2d( + hidden_state, w, self.bias, self._stride, self._padding, self._dilation, self._groups, self._data_format + ) + return hidden_state + + +class BitGroupNormActivation(nn.GroupNorm): + r""" + A module that combines group normalization with an activation function. + """ + + def __init__(self, config, num_channels, epsilon=1e-5, apply_activation=True): + super(BitGroupNormActivation, self).__init__(config.num_groups, num_channels, epsilon=epsilon) + if apply_activation: + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = nn.Identity() + + def forward(self, hidden_state): + hidden_state = super().forward(hidden_state) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class DynamicPad2d(nn.Layer): + r""" + A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input + hidden states. 
+ """ + + def __init__(self, kernel_size, stride, dilation, value=0): + super().__init__() + # Safety checkers + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + + if isinstance(stride, int): + stride = (stride, stride) + + if isinstance(dilation, int): + dilation = (dilation, dilation) + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.value = value + + def compute_padding(x, kernel_size, stride, dilation): + return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0) + + self.compute_padding = compute_padding + + def __call__(self, input): + # Get width and height + input_height, input_width = input.shape[-2:] + + # Compute the padding values + padding_height = self.compute_padding(input_height, self.kernel_size[0], self.stride[0], self.dilation[0]) + padding_width = self.compute_padding(input_width, self.kernel_size[1], self.stride[1], self.dilation[1]) + + # apply pad + if padding_height > 0 or padding_width > 0: + input = F.pad( + input, + [ + padding_width // 2, + padding_width - padding_width // 2, + padding_height // 2, + padding_height - padding_height // 2, + ], + value=self.value, + ) + return input + + +class BitMaxPool2D(nn.MaxPool2D): + """Tensorflow like 'SAME' wrapper for 2D max pooling""" + + def __init__( + self, + kernel_size: int, + stride=None, + dilation=1, + ceil_mode=False, + padding=(0, 0), + padding_value=0, + use_dynamic_padding=True, + ): + # must be 1 + assert dilation == 1 + kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) + stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride) + dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation) + super().__init__(kernel_size, stride, padding, ceil_mode=ceil_mode) + if use_dynamic_padding: + self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value) + else: + self.pad = nn.Identity() + + def forward(self, hidden_states): + hidden_states = self.pad(hidden_states) + return super().forward(hidden_states) + + +class BitEmbeddings(nn.Layer): + """ + BiT Embeddings (stem) composed of a single aggressive convolution. + """ + + def __init__(self, config: BitConfig): + super().__init__() + + self.convolution = WeightStandardizedConv2D( + config.num_channels, + config.embedding_size, + kernel_size=7, + stride=2, + epsilon=1e-8, + padding=config.global_padding, + ) + + self.pooler = BitMaxPool2D(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding) + + # Use the same padding strategy as convolutional layers + if config.global_padding is not None and config.global_padding.upper() == "SAME": + self.pad = nn.Identity() + else: + self.pad = nn.Pad2D(padding=(1, 1, 1, 1), value=0.0) + + if not config.layer_type == "preactivation": + self.norm = BitGroupNormActivation(config, num_channels=config.embedding_size) + else: + self.norm = nn.Identity() + + self.num_channels = config.num_channels + + def forward(self, pixel_values: Tensor) -> Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
+ ) + + embedding = self.convolution(pixel_values) + + embedding = self.pad(embedding) + + embedding = self.norm(embedding) + + embedding = self.pooler(embedding) + + return embedding + + +def drop_path(input, drop_prob: float = 0.0, training: bool = False): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + paddle.rand(shape, dtype=input.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = (input / keep_prob) * random_tensor + return output + + +class BitDropPath(nn.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +def make_div(value, divisor=8): + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + if new_value < 0.9 * value: + new_value += divisor + return new_value + + +class BitPreActivationBottleneckLayer(nn.Layer): + """Pre-activation (v2) bottleneck block. + Follows the implementation of "Identity Mappings in Deep Residual Networks": + https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua + + Except it puts the stride on 3x3 conv when available. 
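`make_div` snaps channel counts to multiples of the divisor (8 by default), bumping up when plain rounding would lose more than 10%, and `drop_path` is an identity outside of training. A small sketch, assuming both helpers defined above are in scope:

```python
import paddle

# Channel rounding used for the bottleneck widths below.
assert make_div(64 * 0.25) == 16   # mid channels of a 64-channel bottleneck
assert make_div(100) == 104        # rounded to the nearest multiple of 8
assert make_div(4) == 8            # never drops below the divisor

# Stochastic depth: a no-op at eval time, a per-sample binary mask during training.
x = paddle.randn([2, 8, 4, 4])
assert paddle.allclose(drop_path(x, drop_prob=0.2, training=False), x)
out = drop_path(x, drop_prob=0.2, training=True)
assert out.shape == x.shape        # whole samples are either kept (rescaled by 1/keep_prob) or zeroed
```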
+ """ + + def __init__( + self, + config, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + drop_path_rate=0.0, + is_first_layer=False, + ): + super().__init__() + + first_dilation = first_dilation or dilation + + out_channels = out_channels or in_channels + mid_channels = make_div(out_channels * bottle_ratio) + + if is_first_layer: + self.downsample = BitDownsampleConv( + config, + in_channels, + out_channels, + stride=stride, + preact=True, + ) + else: + self.downsample = None + + self.norm1 = BitGroupNormActivation(config, in_channels) + self.conv1 = WeightStandardizedConv2D( + in_channels, mid_channels, 1, epsilon=1e-8, padding=config.global_padding + ) + + self.norm2 = BitGroupNormActivation(config, num_channels=mid_channels) + self.conv2 = WeightStandardizedConv2D( + mid_channels, mid_channels, 3, stride=stride, groups=groups, epsilon=1e-8, padding=config.global_padding + ) + + self.norm3 = BitGroupNormActivation(config, mid_channels) + self.conv3 = WeightStandardizedConv2D( + mid_channels, out_channels, 1, epsilon=1e-8, padding=config.global_padding + ) + + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + def forward(self, hidden_states): + hidden_states_preact = self.norm1(hidden_states) + + # shortcut branch + shortcut = hidden_states + if self.downsample is not None: + shortcut = self.downsample(hidden_states_preact) + + # residual branch + hidden_states = self.conv1(hidden_states_preact) + hidden_states = self.conv2(self.norm2(hidden_states)) + hidden_states = self.conv3(self.norm3(hidden_states)) + hidden_states = self.drop_path(hidden_states) + return hidden_states + shortcut + + +class BitBottleneckLayer(nn.Layer): + """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. 
Used for ViT Hybrid.""" + + def __init__( + self, + config, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + drop_path_rate=0.0, + is_first_layer=False, + ): + super().__init__() + first_dilation = first_dilation or dilation + + out_channels = out_channels or in_channels + mid_chs = make_div(out_channels * bottle_ratio) + + if is_first_layer: + self.downsample = BitDownsampleConv( + config, + in_channels, + out_channels, + stride=stride, + preact=False, + ) + else: + self.downsample = None + + self.conv1 = WeightStandardizedConv2D(in_channels, mid_chs, 1, epsilon=1e-8, padding=config.global_padding) + self.norm1 = BitGroupNormActivation(config, num_channels=mid_chs) + self.conv2 = WeightStandardizedConv2D( + mid_chs, + mid_chs, + 3, + stride=stride, + dilation=first_dilation, + groups=groups, + epsilon=1e-8, + padding=config.global_padding, + ) + self.norm2 = BitGroupNormActivation(config, num_channels=mid_chs) + self.conv3 = WeightStandardizedConv2D(mid_chs, out_channels, 1, epsilon=1e-8, padding=config.global_padding) + self.norm3 = BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + # shortcut branch + shortcut = hidden_states + if self.downsample is not None: + shortcut = self.downsample(hidden_states) + + # residual + hidden_states = self.conv1(hidden_states) + hidden_states = self.norm1(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.norm2(hidden_states) + + hidden_states = self.conv3(hidden_states) + hidden_states = self.norm3(hidden_states) + + hidden_states = self.drop_path(hidden_states) + hidden_states = self.activation(hidden_states + shortcut) + return hidden_states + + +class BitDownsampleConv(nn.Layer): + def __init__( + self, + config, + in_channels, + out_channels, + stride=1, + preact=True, + ): + super().__init__() + self.conv = WeightStandardizedConv2D( + in_channels, out_channels, 1, stride=stride, epsilon=1e-8, padding=config.global_padding + ) + self.norm = ( + nn.Identity() + if preact + else BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False) + ) + + def forward(self, x): + return self.norm(self.conv(x)) + + +class BitStage(nn.Layer): + """ + A ResNet v2 stage composed by stacked layers. 
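Both the stem convolution and `BitMaxPool2D` above fall back to `DynamicPad2d` when TF-style `'SAME'` padding is requested: the input is padded at runtime so that a kernel-`k`, stride-`s` op produces `ceil(input / s)` outputs. A quick sketch:

```python
import paddle

pad = DynamicPad2d(kernel_size=3, stride=2, dilation=1)
x = paddle.randn([1, 8, 7, 7])
x_padded = pad(x)
print(x_padded.shape)  # [1, 8, 9, 9]: one extra row/column on each side
# A kernel-3, stride-2 op on the 9x9 map then yields 4x4, i.e. ceil(7 / 2).
```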
+ """ + + def __init__( + self, + config, + in_channels, + out_channels, + stride, + dilation, + depth, + bottle_ratio=0.25, + layer_dropout=None, + ): + super().__init__() + + first_dilation = 1 if dilation in (1, 2) else 2 + + # Get the layer type + if config.layer_type == "bottleneck": + layer_cls = BitBottleneckLayer + else: + layer_cls = BitPreActivationBottleneckLayer + + prev_chs = in_channels + self.layers = nn.Sequential() + for layer_idx in range(depth): + # Get the current hyper-parameters + stride, drop_path_rate, is_first_layer = self._get_updated_hyperparameters( + layer_idx, stride, layer_dropout + ) + + self.layers.add_sublayer( + str(layer_idx), + layer_cls( + config, + prev_chs, + out_channels, + stride=stride, + dilation=dilation, + bottle_ratio=bottle_ratio, + first_dilation=first_dilation, + drop_path_rate=drop_path_rate, + is_first_layer=is_first_layer, + ), + ) + prev_chs = out_channels + first_dilation = dilation + + def _get_updated_hyperparameters(self, layer_idx, stride, layer_dropout): + r""" + Get the new hyper-parameters with respect to the previous ones and the index of the current layer. + """ + if layer_dropout: + drop_path_rate = layer_dropout[layer_idx] + else: + drop_path_rate = 0.0 + + if layer_idx != 0: + stride = 1 + + is_first_layer = layer_idx == 0 + + return stride, drop_path_rate, is_first_layer + + def forward(self, input: Tensor) -> Tensor: + hidden_state = input + for _, layer in enumerate(self.layers): + hidden_state = layer(hidden_state) + return hidden_state + + +class BitEncoder(nn.Layer): + def __init__(self, config: BitConfig): + super().__init__() + self.stages = nn.LayerList([]) + + prev_chs = config.embedding_size + + # These needs to stay hardcoded + current_stride = 4 + dilation = 1 + + layer_dropouts = [ + x.tolist() + for x in paddle.to_tensor(np.linspace(0, config.drop_path_rate, sum(config.depths))).split(config.depths) + ] + + for stage_idx, (current_depth, current_hidden_size, layer_dropout) in enumerate( + zip(config.depths, config.hidden_sizes, layer_dropouts) + ): + # Get the updated hyper params + out_channels, stride, dilation = self._get_updated_hyperparameters( + stage_idx, current_stride, current_hidden_size, dilation, config + ) + + stage = BitStage( + config, + prev_chs, + out_channels, + stride=stride, + dilation=dilation, + depth=current_depth, + layer_dropout=layer_dropout, + ) + + prev_chs = out_channels + current_stride *= stride + + self.stages.add_sublayer(str(stage_idx), stage) + + def _get_updated_hyperparameters(self, stage_idx, current_stride, current_hidden_size, dilation, config): + out_channels = make_div(current_hidden_size * config.width_factor) + stride = 1 if stage_idx == 0 else 2 + if current_stride >= config.output_stride: + dilation *= stride + stride = 1 + return out_channels, stride, dilation + + def forward( + self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True + ) -> BaseModelOutputWithNoAttention: + hidden_states = () if output_hidden_states else None + + for stage_module in self.stages: + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + hidden_state = stage_module(hidden_state) + + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention( + last_hidden_state=hidden_state, + hidden_states=hidden_states, + ) + + +class BitPretrainedModel(PretrainedModel): + """ + An 
abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BitConfig + base_model_prefix = "bit" + main_input_name = "pixel_values" + + def _init_weights(self, module): + if isinstance(module, nn.Conv2D): + kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + elif isinstance(module, (nn.BatchNorm2D, nn.GroupNorm)): + ones_(module.weight) + zeros_(module.bias) + + +class BitModel(BitPretrainedModel): + """ + The bare BiT model outputting raw features without any specific head on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BitConfig`): + An instance of BitConfig used to construct BitModel. + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embedder = BitEmbeddings(config) + + self.encoder = BitEncoder(config) + self.norm = ( + BitGroupNormActivation(config, num_channels=config.hidden_sizes[-1]) + if config.layer_type == "preactivation" + else nn.Identity() + ) + + self.pooler = nn.AdaptiveAvgPool2D((1, 1)) + + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BaseModelOutputWithPoolingAndNoAttention: + r""" + The BitModel forward method, overrides the `__call__()` special method. + + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`BitImageProcessor`]. See [`BitImageProcessor.__call__`] + for details. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (bool, optional): + Whether to return a :class:`BaseModelOutputWithPoolingAndNoAttention` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + embedding_output = self.embedder(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict + ) + + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.norm(last_hidden_state) + + pooled_output = self.pooler(last_hidden_state) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +class BitForImageClassification(BitPretrainedModel): + """ + BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. 
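End to end, the stem plus the stride-2 stages reduce the spatial resolution by 32x for a standard 4-stage configuration. The snippet below is only a sketch: it assumes `BitConfig` (defined in `.configuration`, which is not part of this file) can be instantiated with defaults for `num_channels`, `depths`, `hidden_sizes`, `layer_type` and the other fields used above:

```python
import paddle

config = BitConfig()  # assumed to provide ResNet-style defaults
model = BitModel(config)
model.eval()

pixel_values = paddle.randn([1, config.num_channels, 224, 224])
with paddle.no_grad():
    outputs = model(pixel_values, output_hidden_states=True, return_dict=True)

# last_hidden_state: [1, C, 7, 7] for a 32x-downsampling config, pooler_output: [1, C, 1, 1]
print(outputs.last_hidden_state.shape, outputs.pooler_output.shape)
print(len(outputs.hidden_states))  # embedding output plus one entry per stage
```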
Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BitConfig`): + An instance of BitConfig used to construct BitForImageClassification. + """ + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bit = BitModel(config) + # classification head + self.classifier = nn.Sequential( + nn.Flatten(), + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(), + ) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> ImageClassifierOutputWithNoAttention: + r""" + The BitForImageClassification forward method, overrides the `__call__()` special method. + + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`BitImageProcessor`]. See [`BitImageProcessor.__call__`] + for details. + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (bool, optional): + Whether to return a :class:`ImageClassifierOutputWithNoAttention` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.flatten()) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return (loss,) + output if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +class BitBackbone(BitPretrainedModel, BackboneMixin): + """ + BiT backbone, to be used with frameworks like DETR and MaskFormer. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
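The classification head is just flatten + linear on the pooled features, and `problem_type` is inferred from the labels when the config leaves it unset. A hypothetical fine-tuning step, again treating the `BitConfig` keyword arguments as assumptions:

```python
import paddle

config = BitConfig(num_labels=10)  # num_labels assumed to be accepted by the config
model = BitForImageClassification(config)

pixel_values = paddle.randn([4, config.num_channels, 224, 224])
labels = paddle.randint(0, 10, [4])  # int64 labels

outputs = model(pixel_values=pixel_values, labels=labels, return_dict=True)
# int64 labels with num_labels > 1 -> "single_label_classification" -> CrossEntropyLoss
print(outputs.logits.shape)  # [4, 10]
outputs.loss.backward()
```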
+ + Args: + config (:class:`DPTConfig`): + An instance of DPTConfig used to construct BitBackbone. + """ + + def __init__(self, config): + super().__init__(config) + + self.stage_names = config.stage_names + self.bit = BitModel(config) + + self.out_features = config.out_features if config.out_features is not None else [self.stage_names[-1]] + + out_feature_channels = {} + out_feature_channels["stem"] = config.embedding_size + for idx, stage in enumerate(self.stage_names[1:]): + out_feature_channels[stage] = config.hidden_sizes[idx] + + self.out_feature_channels = out_feature_channels + + @property + def channels(self): + return [self.out_feature_channels[name] for name in self.out_features] + + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BackboneOutput: + r""" + The BitBackbone forward method, overrides the `__call__()` special method. + + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`BitImageProcessor`]. See [`BitImageProcessor.__call__`] + for details. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (bool, optional): + Whether to return a :class:`BackboneOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + Returns: + + Examples: + + ```python + >>> from paddlenlp.transformers import BitImageProcessor, BitBackbone + >>> import paddle + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = BitImageProcessor.from_pretrained("google/bit-50") + >>> model = BitBackbone.from_pretrained("google/bit-50") + + >>> inputs = processor(image, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/configuration.py new file mode 100644 index 000000000..c7ee87173 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/configuration.py @@ -0,0 +1,203 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" blenderbot model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["BLENDERBOT_PRETRAINED_INIT_CONFIGURATION", "BlenderbotConfig", "BLENDERBOT_PRETRAINED_RESOURCE_FILES_MAP"] + +BLENDERBOT_PRETRAINED_INIT_CONFIGURATION = { + "blenderbot-3B": { + "vocab_size": 8008, + "bos_token_id": 1, + "pad_token_id": 0, + "eos_token_id": 2, + "decoder_start_token_id": 1, + "d_model": 2560, + "num_encoder_layers": 2, + "num_decoder_layers": 24, + "encoder_attention_heads": 32, + "decoder_attention_heads": 32, + "encoder_ffn_dim": 10240, + "decoder_ffn_dim": 10240, + "dropout": 0.1, + "activation_function": "gelu", + "init_std": 0.02, + "max_position_embeddings": 128, + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "scale_embedding": True, + "normalize_before": True, + }, + "blenderbot-400M-distill": { + "vocab_size": 8008, + "bos_token_id": 1, + "pad_token_id": 0, + "eos_token_id": 2, + "decoder_start_token_id": 1, + "d_model": 1280, + "num_encoder_layers": 2, + "num_decoder_layers": 12, + "encoder_attention_heads": 32, + "decoder_attention_heads": 32, + "encoder_ffn_dim": 5120, + "decoder_ffn_dim": 5120, + "dropout": 0.1, + "activation_function": "gelu", + "init_std": 0.02, + "max_position_embeddings": 128, + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "scale_embedding": True, + "normalize_before": True, + }, + "blenderbot-1B-distill": { + "vocab_size": 8008, + "bos_token_id": 1, + "pad_token_id": 0, + "eos_token_id": 2, + "decoder_start_token_id": 1, + "d_model": 2560, + "num_encoder_layers": 2, + "num_decoder_layers": 12, + "encoder_attention_heads": 32, + "decoder_attention_heads": 32, + "decoder_ffn_dim": 10240, + "encoder_ffn_dim": 10240, + "dropout": 0.1, + "activation_function": "gelu", + "init_std": 0.02, + "max_position_embeddings": 128, + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "normalize_before": True, + "scale_embedding": True, + }, +} + +BLENDERBOT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "blenderbot-3B": 
"https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-3B.pdparams", + "blenderbot-1B-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-1B-distill.pdparams", + "blenderbot-400M-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-400M-distill.pdparams", + } +} + + +class BlenderbotConfig(PretrainedConfig): + """ + Args: + vocab_size (`int`): + Vocabulary size of the Blenderbot model. + bos_token_id (`int`, optional): + The id for begging of sentences token. Defaults to ``1``. + pad_token_id (`int`, optional): + The id for padding token. Defaults to ``0``. + eos_token_id (`int`, optional): + The id for end of sentence token. Defaults to ``2``. + decoder_start_token_id (`int`, optional): + The id indicating the start of decoding sentence. Defaults to ``1``. + d_model (`int`, optional): + Dimensionality of the layers and the pooler layer. Defaults to ``1280``. + num_encoder_layers (`int`, optional): + Number of Transformer encoder layers for BlenderbotEncoder. Defaults to ``2``. + num_decoder_layers (`int`, optional): + Number of Transformer decoder layers for BlenderbotDecoder. Defaults to ``12``. + encoder_attention_heads (`int`, optional): + Number of attention heads for each Transformer encoder layer in BlenderbotEncoder. + Defaults to ``32``. + decoder_attention_heads (`int`, optional): + Number of attention heads for each Transformer decoder layer in BlenderbotDecoder. + Defaults to ``32``. + encoder_ffn_dim (`int`, optional): + Dimensionality of the feed-forward layer for each Transformer encoder layer in + BlenderbotEncoder. Defaults to ``5120``. + decoder_ffn_dim (`int`, optional): + Dimensionality of the feed-forward layer for each Transformer dncoder layer in + BlenderbotDncoder. Defaults to ``5120``. + dropout (`float`, optional): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + Defaults to ``0.1``. + activation_function (`str`, optional): + The non-linear activation function (function or string) in the encoder and pooler. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + attention_dropout (`float`, optional): + The dropout ratio for the attention probabilities. + Defaults to ``0.0``. + activation_dropout (`float`, optional): + The dropout ratio for activations inside the fully connected layer. + max_position_embeddings (`int`, optional):, + The max position index of an input sequence. Defaults to ``128``. + init_std (`float`, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Defaults to ``0.02``. + scale_embedding (`bool`, optional): + Indicate whether to scale embeddings by diving by sqrt(d_model). Defaults to ``True``. + normalize_before (bool, optional): + Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. + If True, pre-process is layer normalization and post-precess includes dropout, + residual connection. Otherwise, no pre-process and post-precess includes dropout, + residual connection, layer normalization. Defaults to ``True``. 
+ """ + + model_type = "blenderbot" + pretrained_init_configuration = BLENDERBOT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=8008, + bos_token_id=1, + pad_token_id=0, + eos_token_id=2, + decoder_start_token_id=1, + d_model=1280, + num_encoder_layers=2, + num_decoder_layers=12, + encoder_attention_heads=32, + decoder_attention_heads=32, + encoder_ffn_dim=5120, + decoder_ffn_dim=5120, + dropout=0.1, + activation_function="gelu", + attention_dropout=0.0, + activation_dropout=0.0, + max_position_embeddings=128, + init_std=0.02, + scale_embedding=True, + normalize_before=True, + **kwargs + ): + super(BlenderbotConfig, self).__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + self.decoder_start_token_id = decoder_start_token_id + self.d_model = d_model + self.num_encoder_layers = num_encoder_layers + self.num_decoder_layers = num_decoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_ffn_dim = encoder_ffn_dim + self.dropout = dropout + self.activation_function = activation_function + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.max_position_embeddings = max_position_embeddings + self.init_std = init_std + self.scale_embedding = scale_embedding + self.normalize_before = normalize_before diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/modeling.py new file mode 100644 index 000000000..fb1fcfcd7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/modeling.py @@ -0,0 +1,749 @@ +# encoding=utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.tensor as tensor +from paddle.nn import Embedding +from paddle.nn.layer.transformer import _convert_attention_mask + +from .. import PretrainedModel, register_base_model +from .configuration import ( + BLENDERBOT_PRETRAINED_INIT_CONFIGURATION, + BLENDERBOT_PRETRAINED_RESOURCE_FILES_MAP, + BlenderbotConfig, +) + +__all__ = [ + "BlenderbotModel", + "BlenderbotPretrainedModel", + "BlenderbotEncoder", + "BlenderbotDecoder", + "BlenderbotForConditionalGeneration", + "BlenderbotForCausalLM", +] + + +# Copied from paddlenlp.transformers.bart.modeling.shift_tokens_right +def shift_tokens_right(input_ids: tensor, decoder_start_token_id: int): + """ + Shift input ids one token to the right. 
+ """ + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + return shifted_input_ids + + +class BlenderbotPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained Blenderbot models. It provides Blenderbot related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + base_model_prefix = "blenderbot" + config_class = BlenderbotConfig + + pretrained_init_configuration = BLENDERBOT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = BLENDERBOT_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if paddle.get_default_dtype() not in ["float32", "float64"]: + # gaussian/standard_normal/randn/normal only supports [float32, float64] + return + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + + +class BlenderbotLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + + Please refer to the superclass for more information regarding methods and arguments. + """ + + def __init__(self, config: BlenderbotConfig): + super().__init__(num_embeddings=config.max_position_embeddings, embedding_dim=config.d_model) + + def forward(self, input_ids_shape, past_key_values_length=0): + """ + Args: + input_ids_shape (`tuple`): Expected to be [batch_size, sequence_length]. + past_key_values_length (`int`, optional): The length of past_key_value, + which is used only when ``use_cache=True`` during prediction generating. + + Returns: + (Tensor): The generated positional embedding. + """ + bsz, seq_len = input_ids_shape[:2] + positions = paddle.arange(past_key_values_length, past_key_values_length + seq_len, dtype="int64") + return super().forward(positions) + + +class BlenderbotEncoder(BlenderbotPretrainedModel): + """ + The encoder of Blenderbot Model. + Please refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` or + :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more information + regarding methods and arguments. 
+ """ + + def __init__(self, config: BlenderbotConfig, embed_tokens=None): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.d_model, padding_idx=config.pad_token_id + ) + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.encoder_embed_positions = BlenderbotLearnedPositionalEmbedding(config) + + self.encoder_dropout = nn.Dropout(config.dropout) + self.encoder_layernorm = nn.LayerNorm(normalized_shape=config.d_model) + + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.d_model, + nhead=config.encoder_attention_heads, + dim_feedforward=config.encoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=config.normalize_before, + ) + self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=config.num_encoder_layers) + + def forward(self, input_ids, attention_mask=None): + """ + Returns: + Tensor: The last hidden states at the last layer of the encoder. + It's data type should be `float` and has a shape of `(batch_size, seq_lens, hidden_size)`. + ``seq_lens`` corresponds to the length of input sequence. + """ + if input_ids is None: + raise ValueError("Input_ids cannot be None.") + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + inputs_embed_pos = self.encoder_embed_positions(input_ids.shape) + + hidden_states = inputs_embeds + inputs_embed_pos + encoder_input = self.encoder_dropout(hidden_states) + + if attention_mask is None: + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + else: + attention_mask = attention_mask.unsqueeze([1, 2]) * -1e4 + + attention_mask.stop_gradient = True + encoder_output = self.encoder(encoder_input, src_mask=attention_mask) + # Different from BlenderbotSmall, Blenderbot Encoder apply the final layer norm on encoder output + encoder_output = self.encoder_layernorm(encoder_output) + return encoder_output + + +class BlenderbotDecoderLayer(nn.TransformerDecoderLayer): + """ + Construct decoder layer for BlenderbotForCausalLM. + Different from BlenderbotModel, BLenderbotForCausalLM does not apply + cross-attention. + """ + + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="gelu", + attn_dropout=None, + act_dropout=None, + normalize_before=True, + weight_attr=None, + bias_attr=None, + *args, + **kwargs, + ): + super(BlenderbotDecoderLayer, self).__init__( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + attn_dropout=attn_dropout, + act_dropout=act_dropout, + normalize_before=normalize_before, + weight_attr=weight_attr, + bias_attr=bias_attr, + *args, + **kwargs, + ) + + def forward(self, tgt, memory=None, tgt_mask=None, memory_mask=None, cache=None): + """ + Please refer to :class:`~paddlenlp.nn.TransformerDecoderLayer` + for more information regarding arguments. 
+ """ + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + if cache is None: + tgt = self.self_attn(query=tgt, key=tgt, value=tgt, attn_mask=tgt_mask, cache=None) + else: + tgt, incremental_cache = self.self_attn(query=tgt, key=tgt, value=tgt, attn_mask=tgt_mask, cache=cache[0]) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + # Cross-attention will not be applied for BlenderbotForCausalLM + if memory is not None: + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + if cache is None: + tgt = self.cross_attn(query=tgt, key=memory, value=memory, attn_mask=memory_mask, cache=None) + else: + tgt, static_cache = self.cross_attn( + query=tgt, key=memory, value=memory, attn_mask=memory_mask, cache=cache[1] + ) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + else: + static_cache = cache[1] if cache is not None else None + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt if cache is None else (tgt, (incremental_cache, static_cache)) + + +class TransformerDecoder(nn.TransformerDecoder): + """ + Construct Transformer decoder for BlenderbotForCausalLM. + """ + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__(decoder_layer=decoder_layer, num_layers=num_layers, norm=norm) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Please refer to :class:`~paddlenlp.nn.TransformerDecoder` + for more information regarding arguments and methods. + """ + + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + if memory is not None: + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + + output = tgt + new_caches = [] + for i, mod in enumerate(self.layers): + if cache is None: + output = mod(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, cache=None) + else: + output, new_cache = mod(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, cache=cache[i]) + new_caches.append(new_cache) + + if self.norm is not None: + output = self.norm(output) + + return output if cache is None else (output, new_caches) + + +class BlenderbotDecoder(BlenderbotPretrainedModel): + """ + The decoder of Blenderbot Model. + Please refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` and + :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more information + regarding methods and arguments. 
+ """ + + def __init__(self, config: BlenderbotConfig, embed_tokens=None): + super().__init__(config) + self.init_std = config.init_std + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.d_model, padding_idx=config.pad_token_id + ) + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.decoder_embed_positions = BlenderbotLearnedPositionalEmbedding(config) + self.decoder_dropout = nn.Dropout(config.dropout) + self.decoder_layernorm = nn.LayerNorm(normalized_shape=config.d_model) + + decoder_layer = BlenderbotDecoderLayer( + d_model=config.d_model, + nhead=config.decoder_attention_heads, + dim_feedforward=config.decoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=config.normalize_before, + ) + self.decoder = TransformerDecoder(decoder_layer=decoder_layer, num_layers=config.num_decoder_layers) + + def forward( + self, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + memory_mask=None, + use_cache=False, + cache=None, + ): + """ + Please refer to :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more + information regarding the arguments. + """ + if decoder_input_ids is None: + raise ValueError("Decoder_input_ids cannot be None.") + if decoder_attention_mask is None: + decoder_length = decoder_input_ids.shape[-1] + decoder_attention_mask = paddle.tensor.triu( + (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 + ) + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale + # cache[num_layer][0] is an instance of `MultiHeadAttention.Cache` containing + # tensor k and v with shape of `[batch_size, num_heads, len_seq, embed_dim // num_heads]` + # Refer to paddle.nn.MultiHeadAttention.gen_cache for more details regarding cache. + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + + decoder_inputs_embed_pos = self.decoder_embed_positions( + input_ids_shape=decoder_input_ids.shape, past_key_values_length=past_key_values_length + ) + + hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos + decoder_input = self.decoder_dropout(hidden_states) + + decoder_output = self.decoder( + tgt=decoder_input, + memory=encoder_output, + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache, + ) + if use_cache: + decoder_output, cache = decoder_output + decoder_output = self.decoder_layernorm(decoder_output) + return decoder_output, cache + else: + decoder_output = self.decoder_layernorm(decoder_output) + return decoder_output + + +@register_base_model +class BlenderbotModel(BlenderbotPretrainedModel): + """ + Construct a bare Blenderbot Model. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Check the superclass documentation for the generic methods and the library implements for all its model. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + """ + + def __init__(self, config: BlenderbotConfig): + super(BlenderbotModel, self).__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.bos_token_id = config.bos_token_id + self.eos_token_id = config.eos_token_id + self.decoder_start_token_id = config.decoder_start_token_id + self.shared = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.d_model, padding_idx=config.pad_token_id + ) + self.encoder = BlenderbotEncoder(config) + self.decoder = BlenderbotDecoder(config) + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + use_cache=False, + cache=None, + **kwargs + ): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + + decoder_input_ids (Tensor, optional): + If not provided, ``decoder_input_ids`` will be automatically generated based + on ``decoder_start_token_id`` and ``input_ids``. + + decoder_attention_mask (Tensor, optional): + If not provided, the default ``decoder_attention_mask`` will be a tensor with + upper triangular part being ``-np.inf``. the shape will be ``(decoder_length, decoder_length)`` + + encoder_output (Tensor, optional): + The output of encoder. If not provided, a ``encoder_output`` will be generated + from BlenderbotEncoder. Defaults to ``None``. + + use_cache (bool, optional): + Indicates whether to use cache to speed up decoding. Defaults to ``False`` + + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `paddle.nn.TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. Default None. + Returns: + Tensor|tuple: + If ``use_cache=False``, the return will be the last hidden state of decoder with shape + of [batch_size, seq_lens, hidden_size]. ``seq_lens`` corresponds to the length of input sequence. + Otherwise, the return will be a tuple of ``(decoder_output, cache)``. Please refer to + class :class:`paddle.nn.TransformerDecoder` for more information regarding ``cache``. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BlenderbotTokenizer, BlenderbotModel + + # "blenderbot-400M-distill" is the pretrained weight of BlenderbotForConditionalGeneration, + # Therefore some weight of additional layers in BlenderbotForConditionalGeneration + # might not be loaded and used regarding the following sample code. + pretrained_model_name = "blenderbot-400M-distill" + tokenizer = BlenderbotTokenizer.from_pretrained(pretrained_model_name) + model = BlenderbotModel.from_pretrained(pretrained_model_name) + + sample_text = "My friends are cool but they eat too many carbs." 
+ inputs = tokenizer(sample_text, return_attention_mask=True, return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + decoder_output = model(**inputs) + """ + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids=input_ids, decoder_start_token_id=self.decoder_start_token_id + ) + if encoder_output is None: + encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask) + if use_cache: + if cache is None: + cache = self.decoder.decoder.gen_cache(encoder_output) + else: + cache = None + + if input_ids is not None: + memory_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + memory_mask.stop_gradient = True + else: + memory_mask = attention_mask + + decoder_output = self.decoder( + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_output=encoder_output, + memory_mask=memory_mask, + use_cache=use_cache, + cache=cache, + ) + return decoder_output + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def get_encoder(self): + """This method is required for model with encoder-decoder architecture.""" + return self.encoder + + +class BlenderbotForConditionalGeneration(BlenderbotPretrainedModel): + def __init__(self, config: BlenderbotConfig): + super(BlenderbotForConditionalGeneration, self).__init__(config) + self.blenderbot = BlenderbotModel(config) + self.eos_token_id = config.eos_token_id + self.bos_token_id = config.bos_token_id + self.pad_token_id = config.pad_token_id + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], + dtype=self.blenderbot.shared.weight.dtype, + is_bias=False, + ) + + if hasattr(self, "final_logits_bias"): + self.final_logits_bias = paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()) + else: + self.register_buffer( + "final_logits_bias", + paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()), + ) + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + use_cache=False, + cache=None, + **kwargs + ): + """ + Please refer to :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more + information regarding arguments. + Return: + Tensor|tuple: If ``use_cache=False``, the return will be a tensor with shape of + [batch_size, seq_lens, hidden_size]. Otherwise, the return will be a tuple + of ``(decoder_output, cache)``. + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration + + pretrained_model_name = "blenderbot-400M-distill" + tokenizer = BlenderbotTokenizer.from_pretrained(pretrained_model_name) + model = BlenderbotForConditionalGeneration.from_pretrained(pretrained_model_name) + + sample_text = "My friends are cool but they eat too many carbs." 
+ inputs = tokenizer(sample_text, return_attention_mask=True, return_token_type_ids=False) + inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} + + # Generate response using beam search + result_ids, scores = model.generate(input_ids=inputs['input_ids'], + max_length=60, + min_length=20, + decode_strategy='beam_search', + num_beams=10, + length_penalty=0.65) + for sequence_ids in result_ids.numpy().tolist(): + print("User:\t", sample_text) + print("bot:\t", tokenizer.convert_ids_to_string(sequence_ids)) + # "bot: That's unfortunate. Are they trying to lose weight?" + """ + decoder_outputs = self.blenderbot( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_output=encoder_output, + use_cache=use_cache, + cache=cache, + ) + + lm_logits = ( + paddle.tensor.matmul( + decoder_outputs[0] if use_cache else decoder_outputs, self.lm_head_weight, transpose_y=True + ) + + self.final_logits_bias + ) + + if use_cache: + cache = decoder_outputs[1] + return lm_logits, cache + return lm_logits + + def prepare_inputs_for_generation( + self, decoder_input_ids, attention_mask=None, encoder_output=None, use_cache=True, cache=None, **kwargs + ): + """ + Prepare inputs for decoder to generate sentences. + Return: + dict: A dictionary containing necessary inputs for generating next token. + """ + + if encoder_output is not None: + expand_size = int(decoder_input_ids.shape[0] / encoder_output.shape[0]) + if expand_size > 1: + index = paddle.tile(paddle.arange(encoder_output.shape[0]).unsqueeze(-1), [1, expand_size]).reshape( + [-1] + ) + encoder_output = paddle.index_select(encoder_output, index) + + if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # during prediction, Encoder_output is provided, do not need input_ids. + "decoder_input_ids": decoder_input_ids, + "encoder_output": encoder_output, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def get_encoder(self): + """This method is required for model with encoder-decoder architecture.""" + return self.encoder + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +class BlenderbotForCausalLM(BlenderbotPretrainedModel): + """ + Constructs BLenderbot For Causal Language Model. This model is equivalent to the + blenderbot decoder without cross-attention. + """ + + def __init__(self, config: BlenderbotConfig): + super().__init__(config) + self.blenderbot = BlenderbotModel(config) + self.decoder = self.blenderbot.decoder + + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], + dtype=self.blenderbot.shared.weight.dtype, + is_bias=False, + ) + + if hasattr(self, "final_logits_bias"): + self.final_logits_bias = paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()) + else: + self.register_buffer( + "final_logits_bias", + paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()), + ) + + def forward(self, input_ids=None, attention_mask=None, use_cache=False, cache=None, **kwargs): + """ + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. 
+ + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + + use_cache (bool, optional): + Indicates whether to use cache to speed up decoding. Defaults to ``False`` + + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `paddle.nn.TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. Default None. + Return: + Tensor|tuple: If ``use_cache=False``, the return will be a tensor with shape of + [batch_size, seq_lens, hidden_size]. Otherwise, the return will be a tuple + of ``(lm_logits, cache)``. + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BlenderbotTokenizer, BlenderbotForCausalLM + use_cache = False + text = "My friends are cool but they eat too many carbs." + model_name = "blenderbot-400M-distill" + tokenizer = BlenderbotTokenizer.from_pretrained(model_name) + model = BlenderbotForCausalLM.from_pretrained(model_name) + model.eval() + inputs = tokenizer(text) + inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} + + with paddle.no_grad(): + outputs = model(**inputs, use_cache=use_cache) + # outputs is a tuple of (lm_logits, cache) if ``use_cache=True``. + """ + if use_cache and cache is None: + # Generating incremental cache. A random tensor with shape of + # (batch_size, len_seq, hidden_size) is passed for memory argument. + # since the `static_cache` will not be used in BlenderbotForCausalLM + batch_size, len_seq = input_ids.shape + cache = self.decoder.decoder.gen_cache(memory=paddle.zeros((batch_size, len_seq, self.config.d_model))) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, encoder_output=None, memory_mask=None, use_cache=use_cache, cache=cache + ) + + lm_logits = ( + paddle.tensor.matmul( + decoder_outputs[0] if use_cache else decoder_outputs, self.lm_head_weight, transpose_y=True + ) + + self.final_logits_bias + ) + + if use_cache: + cache = decoder_outputs[1] + return lm_logits, cache + return lm_logits + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, use_cache=True, cache=None, **kwargs): + """ + Prepare inputs for decoder to generate sentences. + Return: + dict: A dictionary containing necessary inputs for generating next token. + """ + if cache is not None: + input_ids = input_ids[:, -1:].unsqueeze(-1) + + return {"input_ids": input_ids, "attention_mask": attention_mask, "use_cache": use_cache, "cache": cache} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/tokenizer.py new file mode 100644 index 000000000..20748ad43 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot/tokenizer.py @@ -0,0 +1,161 @@ +# encoding=utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.utils import try_import
+
+from .. import AddedToken, GPTTokenizer
+
+__all__ = ["BlenderbotTokenizer"]
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "blenderbot-3B": 128,
+    "blenderbot-400M-distill": 128,
+    "blenderbot-1B-distill": 128,
+}
+
+
+class BlenderbotTokenizer(GPTTokenizer):
+    r"""
+    Construct a Blenderbot tokenizer, derived from the GPT tokenizer, using
+    byte-level Byte-Pair-Encoding.
+
+    This tokenizer inherits from :class:`~paddlenlp.transformers.GPTTokenizer`,
+    which contains most of the main methods.
+    Please refer to the superclass for more information regarding methods.
+    Args:
+        vocab_file (str): file path of the vocabulary.
+        merges_file (str): file path of the merges file.
+        errors (str): The method to handle errors in decoding.
+        max_len (int): The specified maximum sequence length. Default: "None".
+        special_tokens (dict): The additional special tokens. Default: "None".
+        bos_token (str): The special token for beginning of sequence. Default: "<s>".
+        eos_token (str): The special token for end of sequence. Default: "</s>".
+        cls_token (str): The special token for cls. Default: "<s>".
+        sep_token (str): The special token for separator. Default: "</s>".
+        pad_token (str): The special token for padding. Default: "<pad>".
+        eol_token (str): The special token for newline. Default: "\u010a".
+        add_prefix (bool): Whether or not to add an initial space to the input.
+            This allows treating the leading word just like any other word.
+            (Blenderbot adds an initial space when tokenizing input text, which
+            is different from BlenderbotSmall.)
+    Examples:
+        .. code-block:: python
+            from paddlenlp.transformers import BlenderbotTokenizer
+            tokenizer = BlenderbotTokenizer.from_pretrained("blenderbot-400M-distill")
+            text = "My friends are cool but they eat too many carbs."
+            inputs = tokenizer(text)
+            # above line outputs:
+            # {'input_ids': [863, 1329, 366, 1449, 373, 382, 1861, 618, 847, 911, 1372, 21, 2],
+            # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
+    """
+    resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+    pretrained_resource_files_map = {
+        "vocab_file": {
+            "blenderbot-400M-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-400M-distill-vocab.json",
+            "blenderbot-3B": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-3B-vocab.json",
+            "blenderbot-1B-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-1B-distill-vocab.json",
+        },
+        "merges_file": {
+            "blenderbot-400M-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-400M-distill-merges.txt",
+            "blenderbot-3B": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-3B-merges.txt",
+            "blenderbot-1B-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot/blenderbot-1B-distill-merges.txt",
+        },
+    }
+    pretrained_init_configuration = {
+        "blenderbot-3B": {"add_prefix": True},
+        "blenderbot-400M-distill": {"add_prefix": True},
+        "blenderbot-1B-distill": {"add_prefix": True},
+    }
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        max_len=None,
+        special_tokens=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        cls_token="<s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        unk_token="<unk>",
+        mask_token="<mask>",
+        eol_token="\u010a",
+        add_prefix=True,
+        **kwargs
+    ):
+
+        sep_token = (
+            AddedToken(sep_token, lstrip=False, rstrip=False, single_word=False, normalized=True)
+            if isinstance(sep_token, str)
+            else sep_token
+        )
+
+        self._build_special_tokens_map_extended(sep_token=sep_token)
+
+        super(BlenderbotTokenizer, self).__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            errors=errors,
+            max_len=max_len,
+            special_tokens=special_tokens,
+            pad_token=pad_token,
+            eos_token=eos_token,
+            eol_token=eol_token,
+            **kwargs,
+        )
+        self.add_prefix = add_prefix
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        A Blenderbot sequence has the following format:
+        ::
+            - single sequence: ``X </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                token_ids_1 will be ignored.
+
+        Returns:
+            :obj:`List[int]`: List of input ids with the appropriate special tokens.
+        """
+        return token_ids_0 + [self.eos_token_id]
+
+    def _tokenize(self, text):
+        """
+        End-to-end tokenization for Blenderbot models.
+        Args:
+            text (str): The text to be tokenized.
+
+        Returns:
+            list: A list of strings representing the converted tokens.
+ """ + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix = kwargs.pop("add_prefix", self.add_prefix) + if is_split_into_words or add_prefix: + text = " " + text + return text, kwargs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/configuration.py new file mode 100644 index 000000000..81b544cb7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/configuration.py @@ -0,0 +1,161 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" blenderbot model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "BLENDERBOTSMALL_PRETRAINED_INIT_CONFIGURATION", + "BlenderbotSmallConfig", + "BLENDERBOTSMALL_PRETRAINED_RESOURCE_FILES_MAP", +] + +BLENDERBOTSMALL_PRETRAINED_INIT_CONFIGURATION = { + "blenderbot_small-90M": { + "vocab_size": 54944, + "bos_token_id": 1, + "pad_token_id": 0, + "eos_token_id": 2, + "decoder_start_token_id": 1, + "d_model": 512, + "num_encoder_layers": 8, + "num_decoder_layers": 8, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "decoder_ffn_dim": 2048, + "encoder_ffn_dim": 2048, + "dropout": 0.1, + "activation_function": "gelu", + "init_std": 0.02, + "max_position_embeddings": 512, + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "scale_embedding": True, + "normalize_before": False, + }, +} + +BLENDERBOTSMALL_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M.pdparams", + } +} + + +class BlenderbotSmallConfig(PretrainedConfig): + """ + Args: + vocab_size (`int`): + Vocabulary size of the BlenderbotSmall model. + bos_token_id (`int`, optional): + The id for begging of sentences token. Defaults to ``1``. + pad_token_id (`int`, optional): + The id for padding token. Defaults to ``0``. + eos_token_id (`int`, optional): + The id for end of sentence token. Defaults to ``2``. + decoder_start_token_id (`int`, optional): + The id indicating the start of decoding sentence. Defaults to ``1``. + d_model (`int`, optional): + Dimensionality of the layers and the pooler layer. Defaults to ``512``. + num_encoder_layers (`int`, optional): + Number of Transformer encoder layers for BlenderbotSmallEncoder. Defaults to ``8``. + num_decoder_layers (`int`, optional): + Number of Transformer decoder layers for BlenderbotSmallDecoder. Defaults to ``8``. + encoder_attention_heads (`int`, optional): + Number of attention heads for each Transformer encoder layer in BlenderbotSmallEncoder. + Defaults to ``16``. + decoder_attention_heads (`int`, optional): + Number of attention heads for each Transformer decoder layer in BlenderbotSmallDecoder. + Defaults to ``16``. + encoder_ffn_dim (`int`, optional): + Dimensionality of the feed-forward layer for each Transformer encoder layer in + BlenderbotSmallEncoder. Defaults to ``2048``. + decoder_ffn_dim (`int`, optional): + Dimensionality of the feed-forward layer for each Transformer dncoder layer in + BlenderbotSmallDncoder. Defaults to ``2048``. + dropout (`float`, optional): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + Defaults to ``0.1``. + activation_function (`str`, optional): + The non-linear activation function (function or string) in the encoder and pooler. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + attention_dropout (`float`, optional): + The dropout ratio for the attention probabilities. + Defaults to ``0.0``. + activation_dropout (`float`, optional): + The dropout ratio for activations inside the fully connected layer. + max_position_embeddings (`int`, optional):, + The max position index of an input sequence. Defaults to ``512``. + init_std (`float`, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Defaults to ``0.02``. 
+ scale_embedding (`bool`, optional): + Indicate whether to scale embeddings by diving by sqrt(d_model). Defaults to ``True``. + normalize_before (bool, optional): + Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. + If True, pre-process is layer normalization and post-precess includes dropout, + residual connection. Otherwise, no pre-process and post-precess includes dropout, + residual connection, layer normalization. Defaults to ``False``. + """ + + model_type = "blenderbot_small" + pretrained_init_configuration = BLENDERBOTSMALL_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=54944, + bos_token_id=1, + pad_token_id=0, + eos_token_id=2, + decoder_start_token_id=1, + d_model=512, + num_encoder_layers=8, + num_decoder_layers=8, + encoder_attention_heads=16, + decoder_attention_heads=16, + encoder_ffn_dim=2048, + decoder_ffn_dim=2048, + dropout=0.1, + activation_function="gelu", + attention_dropout=0.0, + activation_dropout=0.0, + max_position_embeddings=512, + init_std=0.02, + scale_embedding=True, + normalize_before=False, + **kwargs + ): + super(BlenderbotSmallConfig, self).__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + self.decoder_start_token_id = decoder_start_token_id + self.d_model = d_model + self.num_encoder_layers = num_encoder_layers + self.num_decoder_layers = num_decoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_ffn_dim = encoder_ffn_dim + self.dropout = dropout + self.activation_function = activation_function + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.max_position_embeddings = max_position_embeddings + self.init_std = init_std + self.scale_embedding = scale_embedding + self.normalize_before = normalize_before diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/modeling.py new file mode 100644 index 000000000..74fe6b764 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/modeling.py @@ -0,0 +1,752 @@ +# encoding=utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.tensor as tensor +from paddle.nn import Embedding +from paddle.nn.layer.transformer import _convert_attention_mask + +from .. 
import PretrainedModel, register_base_model +from .configuration import ( + BLENDERBOTSMALL_PRETRAINED_INIT_CONFIGURATION, + BLENDERBOTSMALL_PRETRAINED_RESOURCE_FILES_MAP, + BlenderbotSmallConfig, +) + +__all__ = [ + "BlenderbotSmallModel", + "BlenderbotSmallPretrainedModel", + "BlenderbotSmallEncoder", + "BlenderbotSmallDecoder", + "BlenderbotSmallForConditionalGeneration", + "BlenderbotSmallForCausalLM", +] + + +# Copied from paddlenlp.transformers.bart.modeling.shift_tokens_right +def shift_tokens_right(input_ids: tensor, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + return shifted_input_ids + + +class BlenderbotSmallLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + + Please should refer to the superclass for more information regarding methods and arguments. + """ + + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(num_embeddings=config.max_position_embeddings, embedding_dim=config.d_model) + + def forward(self, input_ids_shape, past_key_values_length=0): + """ + Generate positional embeddings up based on input_ids_shape. + Args: + input_ids_shape (`tuple`): expected to be [batch_size, sequence_length]. + past_key_values_length (`int`, optional): The length of past_key_value, + which is used only when the ``use_cache=True`` during prediction generating. + + Returns: + (Tensor): The generated positional embedding. + """ + bsz, seq_len = input_ids_shape[:2] + positions = paddle.arange(past_key_values_length, past_key_values_length + seq_len, dtype="int64") + return super().forward(positions) + + +class BlenderbotSmallPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained BlenderbotSmall models. It provides BlenderbotSmall related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + pretrained_init_configuration = BLENDERBOTSMALL_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = BLENDERBOTSMALL_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "blenderbot_small" + config_class = BlenderbotSmallConfig + + def _init_weights(self, layer): + """Initialization hook""" + if paddle.get_default_dtype() not in ["float32", "float64"]: + # gaussian/standard_normal/randn/normal only supports [float32, float64] + return + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + + +class BlenderbotSmallDecoderLayer(nn.TransformerDecoderLayer): + """ + Construct decoder layer for BlenderbotSmallDecoder. + Please refer to :class:`~paddlenlp.nn.TransformerDecoderLayer` for more details. 
+ """ + + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="gelu", + attn_dropout=None, + act_dropout=None, + normalize_before=True, + weight_attr=None, + bias_attr=None, + *args, + **kwargs, + ): + super(BlenderbotSmallDecoderLayer, self).__init__( + d_model=d_model, + nhead=nhead, + dim_feedforward=dim_feedforward, + dropout=dropout, + activation=activation, + attn_dropout=attn_dropout, + act_dropout=act_dropout, + normalize_before=normalize_before, + weight_attr=weight_attr, + bias_attr=bias_attr, + *args, + **kwargs, + ) + + def forward(self, tgt, memory=None, tgt_mask=None, memory_mask=None, cache=None): + """ + Please refer to :class:`~paddlenlp.nn.TransformerDecoderLayer` + for more information regarding arguments. + """ + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + if cache is None: + tgt = self.self_attn(query=tgt, key=tgt, value=tgt, attn_mask=tgt_mask, cache=None) + else: + tgt, incremental_cache = self.self_attn(query=tgt, key=tgt, value=tgt, attn_mask=tgt_mask, cache=cache[0]) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + # Cross-attention will not be applied for BlenderbotSmallForCausalLM + if memory is not None: + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + if cache is None: + tgt = self.cross_attn(query=tgt, key=memory, value=memory, attn_mask=memory_mask, cache=None) + else: + tgt, static_cache = self.cross_attn( + query=tgt, key=memory, value=memory, attn_mask=memory_mask, cache=cache[1] + ) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + else: + static_cache = cache[1] if cache is not None else None + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt if cache is None else (tgt, (incremental_cache, static_cache)) + + +class TransformerDecoder(nn.TransformerDecoder): + """ + Construct Transformer decoder for BlenderbotSmallDecoder. + """ + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__(decoder_layer=decoder_layer, num_layers=num_layers, norm=norm) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Please refer to :class:`~paddlenlp.nn.TransformerDecoder` + for more information regarding arguments and methods. + """ + + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + if memory is not None: + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + + output = tgt + new_caches = [] + for i, mod in enumerate(self.layers): + if cache is None: + output = mod(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, cache=None) + else: + output, new_cache = mod(output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, cache=cache[i]) + new_caches.append(new_cache) + + if self.norm is not None: + output = self.norm(output) + + return output if cache is None else (output, new_caches) + + +class BlenderbotSmallEncoder(BlenderbotSmallPretrainedModel): + """ + The encoder of BlenderbotSmall Model. 
+ Please refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` or + :class:`~paddlenlp.transformers.Blenderbot.BlenderbotSmallModel` for more details + regarding methods and arguments. + """ + + def __init__( + self, + config: BlenderbotSmallConfig, + embed_tokens=None, + ): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.d_model, padding_idx=config.pad_token_id + ) + self.encoder_embed_positions = BlenderbotSmallLearnedPositionalEmbedding(config) + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.encoder_dropout = nn.Dropout(config.dropout) + self.encoder_layernorm_embedding = nn.LayerNorm(config.d_model) + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.d_model, + nhead=config.encoder_attention_heads, + dim_feedforward=config.encoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=config.normalize_before, + ) + self.encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=config.num_encoder_layers) + + def forward(self, input_ids=None, attention_mask=None): + """ + Returns: + Tensor: The last hidden-states at the last layer of the encoder. + It's data type should be `float` and has a shape of `(batch_size, seq_lens, hidden_size)`. + ``seq_lens`` corresponds to the length of input sequence. + """ + if input_ids is None: + raise ValueError("Input_ids cannot be None.") + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + inputs_embed_pos = self.encoder_embed_positions(input_ids.shape) + hidden_states = inputs_embeds + inputs_embed_pos + hidden_states = self.encoder_layernorm_embedding(hidden_states) + encoder_input = self.encoder_dropout(hidden_states) + + if attention_mask is None: + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + else: + attention_mask = attention_mask.unsqueeze([1, 2]) * -1e4 + attention_mask.stop_gradient = True + encoder_output = self.encoder(encoder_input, src_mask=attention_mask) + return encoder_output + + +class BlenderbotSmallDecoder(BlenderbotSmallPretrainedModel): + """ + The decoder of BlenderbotSmall Model. + Please refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` and + :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more information + regarding methods and arguments. 
+ """ + + def __init__( + self, + config: BlenderbotSmallConfig, + embed_tokens=None, + ): + super().__init__(config) + self.init_std = config.init_std + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.d_model, padding_idx=config.pad_token_id + ) + + self.decoder_embed_positions = BlenderbotSmallLearnedPositionalEmbedding(config) + self.decoder_dropout = nn.Dropout(config.dropout) + self.decoder_layernorm_embedding = nn.LayerNorm(normalized_shape=config.d_model) + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + decoder_layer = BlenderbotSmallDecoderLayer( + d_model=config.d_model, + nhead=config.decoder_attention_heads, + dim_feedforward=config.decoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=config.normalize_before, + ) + self.decoder = TransformerDecoder(decoder_layer=decoder_layer, num_layers=config.num_decoder_layers) + + def forward( + self, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + memory_mask=None, + use_cache=False, + cache=None, + ): + """ + Please refer to :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more + information regarding the arguments. + """ + if decoder_input_ids is None: + raise ValueError("Decoder_input_ids cannot be None.") + if decoder_attention_mask is None: + decoder_length = decoder_input_ids.shape[-1] + decoder_attention_mask = paddle.tensor.triu( + (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 + ) + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale + # cache[num_layer][0] is an instance of `MultiHeadAttention.Cache` containing + # tensor k and v with shape of `[batch_size, num_heads, len_seq, embed_dim // num_heads]` + # ``len_seq`` refer to the length of ``decoder_input_ids`` + # Refer to paddle.nn.MultiHeadAttention.gen_cache for more details regarding cache. + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + + decoder_inputs_embed_pos = self.decoder_embed_positions( + input_ids_shape=decoder_input_ids.shape, past_key_values_length=past_key_values_length + ) + + # Different from BLenderbot, BlenderbotSmall Apply layer norm on decoder_inputs_embeds + decoder_inputs_embeds = self.decoder_layernorm_embedding(decoder_inputs_embeds) + + hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos + decoder_input = self.decoder_dropout(hidden_states) + + decoder_output = self.decoder( + tgt=decoder_input, + memory=encoder_output, + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache, + ) + return decoder_output + + +@register_base_model +class BlenderbotSmallModel(BlenderbotSmallPretrainedModel): + r""" + Construct a bare BlenderbotSmall Model. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Check the superclass documentation for the generic methods and the library implements for all its model. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + """ + + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.bos_token_id = config.bos_token_id + self.eos_token_id = config.eos_token_id + self.decoder_start_token_id = config.decoder_start_token_id + self.shared = nn.Embedding( + num_embeddings=config.vocab_size, embedding_dim=config.d_model, padding_idx=config.pad_token_id + ) + + self.encoder = BlenderbotSmallEncoder(config, embed_tokens=self.shared) + self.decoder = BlenderbotSmallDecoder(config, embed_tokens=self.shared) + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + use_cache=False, + cache=None, + **kwargs + ): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + + decoder_input_ids (Tensor, optional): + If not provided, ``decoder_input_ids`` will be automatically generated based + on ``decoder_start_token_id`` and ``input_ids``. + + decoder_attention_mask (Tensor, optional): + If not provided, the default ``decoder_attention_mask`` will be a tensor with + upper triangular part being ``-np.inf``. the shape will be ``(decoder_length, decoder_length)`` + + encoder_output (Tensor, optional): + The output of encoder. If not provided, a new ``encoder_output`` will be generated + from BlenderbotEncoder. Defaults to ``None``. + + use_cache (bool, optional): + Indicates whether to use cache to speed up decoding. Defaults to ``False`` + + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. Default None. + Returns: + Tensor|tuple: + If ``use_cache=False``, the return will be the last hidden state of decoder with shape + of [batch_size, seq_lens, hidden_size]. ``seq_lens`` corresponds to the length of input sequence. + Otherwise, the return will be a tuple of ``(decoder_output, cache)``. Please refer to + class :class:`paddle.nn.TransformerDecoder` for more information regarding ``cache``. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel + + # "blenderbot_small-90M" is pretrained weight of BlenderbotSmallForConditionalGeneration, + # Therefore some weight of additional layers in BlenderbotSmallForConditionalGeneration + # might not be loaded and used. + pretrained_model_name = "blenderbot_small-90M" + tokenizer = BlenderbotSmallTokenizer.from_pretrained(pretrained_model_name) + model = BlenderbotSmallModel.from_pretrained(pretrained_model_name) + + sample_text = "My friends are cool but they eat too many carbs." 
+ inputs = tokenizer(sample_text, return_attention_mask=True, return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + decoder_output = model(**inputs) + """ + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids=input_ids, decoder_start_token_id=self.decoder_start_token_id + ) + if encoder_output is None: + encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask) + # initialize cache based on encoder output for decoding at 1st time step. + if use_cache: + if cache is None: + cache = self.decoder.decoder.gen_cache(encoder_output) + else: + cache = None + + if attention_mask is None: + assert input_ids is not None, "input_ids should be " "specified when generating attention_mask" + memory_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + else: + memory_mask = attention_mask.unsqueeze([1, 2]) * -1e4 + + memory_mask.stop_gradient = True + decoder_output = self.decoder( + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_output=encoder_output, + memory_mask=memory_mask, + use_cache=use_cache, + cache=cache, + ) + return decoder_output + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def get_encoder(self): + """ + This method is required for model with encoder-decoder architecture. + """ + return self.encoder + + +class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPretrainedModel): + """ + Please refer to :class:`~paddlenlp.transformers.Blenderbot.BlenderbotModel` for more + information regarding arguments. + Return: + Tensor|tuple: If ``use_cache=False``, the return will be a tensor with shape of + [batch_size, seq_lens, hidden_size]. Otherwise, the return will be a tuple + of ``(decoder_output, cache)``. + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration + + pretrained_model_name = "blenderbot_small-90M" + tokenizer = BlenderbotSmallTokenizer.from_pretrained(pretrained_model_name) + model = BlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name) + + sample_text = "My friends are cool but they eat too many carbs." 
+ inputs = tokenizer(sample_text, return_attention_mask=True, return_token_type_ids=False) + inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} + result_ids, score = model.generate(input_ids=inputs['input_ids'], + max_length=60, + min_length=20, + decode_strategy='beam_search', + num_beams=10, + length_penalty=0.65 + ) + for sequence_ids in result_ids.numpy().tolist(): + print("User:\t", sample_text) + print("bot:\t", tokenizer.convert_ids_to_string(sequence_ids)) + """ + + def __init__(self, config: BlenderbotSmallConfig): + super(BlenderbotSmallForConditionalGeneration, self).__init__(config) + self.eos_token_id = config.eos_token_id + self.bos_token_id = config.bos_token_id + self.pad_token_id = config.pad_token_id + self.blenderbot_small = BlenderbotSmallModel(config) + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], + dtype=self.blenderbot_small.shared.weight.dtype, + is_bias=False, + ) + + if hasattr(self, "final_logits_bias"): + self.final_logits_bias = paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()) + else: + self.register_buffer( + "final_logits_bias", + paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()), + ) + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + use_cache=False, + cache=None, + ): + decoder_outputs = self.blenderbot_small( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_output=encoder_output, + use_cache=use_cache, + cache=cache, + ) + + lm_logits = ( + paddle.tensor.matmul( + decoder_outputs[0] if use_cache else decoder_outputs, self.lm_head_weight, transpose_y=True + ) + + self.final_logits_bias + ) + if use_cache: + cache = decoder_outputs[1] + return lm_logits, cache + return lm_logits + + def prepare_inputs_for_generation( + self, decoder_input_ids, attention_mask=None, encoder_output=None, use_cache=True, cache=None, **kwargs + ): + + if encoder_output is not None: + expand_size = int(decoder_input_ids.shape[0] / encoder_output.shape[0]) + if expand_size > 1: + index = paddle.tile(paddle.arange(encoder_output.shape[0]).unsqueeze(-1), [1, expand_size]).reshape( + [-1] + ) + encoder_output = paddle.index_select(encoder_output, index) + + if use_cache and cache is None: + if encoder_output is None: + raise ValueError("Encoder output can not be none if `use_cache` is True") + cache = self.decoder.decoder.gen_cache(memory=encoder_output) + + if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + return { + "input_ids": None, # during prediction, Encoder_output is provided, do not need input_ids. + "decoder_input_ids": decoder_input_ids, + "encoder_output": encoder_output, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def get_encoder(self): + """ + This method is required for model with encoder-decoder architecture. + """ + return self.encoder + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +class BlenderbotSmallForCausalLM(BlenderbotSmallPretrainedModel): + """ + Constructs BLenderbotSmall For Causal Language Model. This model is equivalent to the + blenderbotSmall decoder without cross-attention. 
+ """ + + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(config) + self.blenderbot_small = BlenderbotSmallModel(config) + self.decoder = self.blenderbot_small.decoder + + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], + dtype=self.blenderbot_small.shared.weight.dtype, + is_bias=False, + ) + if hasattr(self, "final_logits_bias"): + self.final_logits_bias = paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()) + else: + self.register_buffer( + "final_logits_bias", + paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()), + ) + + def forward(self, input_ids=None, attention_mask=None, use_cache=False, cache=None, **kwargs): + """ + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + + use_cache (bool, optional): + Indicates whether to use cache to speed up decoding. Defaults to ``False`` + + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `paddle.nn.TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. Default None. + Return: + Tensor|tuple: If ``use_cache=False``, the return will be a tensor with shape of + [batch_size, seq_lens, hidden_size]. Otherwise, the return will be a tuple + of ``(lm_logits, cache)``. + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import BlenderbotSmallTokenizer, BlenderbotSmallForCausalLM + use_cache = False + text = "My friends are cool but they eat too many carbs." + model_name = "blenderbot_small-90M" + tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_name) + model = BlenderbotSmallForCausalLM.from_pretrained(model_name) + model.eval() + inputs = tokenizer(text, return_attention_mask=True, return_token_type_ids=False) + inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()} + + with paddle.no_grad(): + outputs = model(**inputs, use_cache=use_cache) + # outputs is a tuple of (lm_logits, cache) if ``use_cache=True``. + + """ + if use_cache and cache is None: + # Generating incremental cache. A random tensor with shape of + # (batch_size, len_seq, hidden_size) is passed for memory argument. 
+ # since the `static_cache` will not be used in BlenderbotSmallForCausalLM + batch_size, len_seq = input_ids.shape + cache = self.decoder.decoder.gen_cache(memory=paddle.zeros((batch_size, len_seq, self.config.d_model))) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, encoder_output=None, memory_mask=None, use_cache=use_cache, cache=cache + ) + + lm_logits = ( + paddle.tensor.matmul( + decoder_outputs[0] if use_cache else decoder_outputs, self.lm_head_weight, transpose_y=True + ) + + self.final_logits_bias + ) + + if use_cache: + cache = decoder_outputs[1] + return lm_logits, cache + return lm_logits + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, use_cache=True, cache=None, **kwargs): + """ + Prepare inputs for decoder to generate sentences. + Return: + dict: A dictionary containing necessary inputs for generating next token. + """ + if cache is not None: + input_ids = input_ids[:, -1:].unsqueeze(-1) + + return {"input_ids": input_ids, "attention_mask": attention_mask, "use_cache": use_cache, "cache": cache} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/tokenizer.py new file mode 100644 index 000000000..af6d86198 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blenderbot_small/tokenizer.py @@ -0,0 +1,220 @@ +# encoding=utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +from ..gpt.tokenizer import GPTTokenizer + +__all__ = ["BlenderbotSmallTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"blenderbot_small-90M": 512} + + +# Copy from paddlenlp.transformers.gpt.tokenizer.get_pairs +def get_pairs(word): + """ + Args: + word (tuple): tuple of symbols (symbols being variable-length strings). + + Returns: + set: symbol pairs in a word. + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BlenderbotSmallTokenizer(GPTTokenizer): + r""" + Constructs a BlenderbotSmall tokenizer based on Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.GPTTokenizer`, + which contains most of the main methods. + Please should refer to the superclass for more information regarding methods. + Args: + vocab_file (str): file path of the vocabulary + merges_file (str): file path of the merges file. + errors (str): The method to handle errors in decoding + max_len (int): The specified maximum sequence length. Default: "None". + special_tokens (dict): The additional special tokens. Default: "None". + bos_token (str): The special token for beginning of sequence token. Default: "__start__". + eos_token (str): The special token for end of sequence token. Default: "__end__". 
+        unk_token (str): The special token for unknown tokens. Default: "__unk__".
+        pad_token (str): The special token for padding. Default: "__null__".
+        eol_token (str): The special token for newline. Default: "__newln__".
+    Examples:
+        .. code-block:: python
+            from paddlenlp.transformers import BlenderbotSmallTokenizer
+            tokenizer = BlenderbotSmallTokenizer.from_pretrained("blenderbot_small-90M")
+            text = "My friends are cool but they eat too many carbs."
+            inputs = tokenizer(text)
+            # above line outputs:
+            # {'input_ids': [42, 643, 46, 1430, 45, 52, 1176, 146, 177, 753, 2430, 5],
+            # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
+    """
+    resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+    pretrained_resource_files_map = {
+        "vocab_file": {
+            "blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M-vocab.json",
+        },
+        "merges_file": {
+            "blenderbot_small-90M": "https://bj.bcebos.com/paddlenlp/models/transformers/blenderbot_small/blenderbot_small-90M-merges.txt",
+        },
+    }
+    pretrained_init_configuration = {"blenderbot_small-90M": {}}
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        max_len=None,
+        special_tokens=None,
+        bos_token="__start__",
+        eos_token="__end__",
+        unk_token="__unk__",
+        pad_token="__null__",
+        eol_token="__newln__",
+        **kwargs
+    ):
+        super(BlenderbotSmallTokenizer, self).__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            errors=errors,
+            max_len=max_len,
+            special_tokens=special_tokens,
+            pad_token=pad_token,
+            eos_token=eos_token,
+            eol_token=eol_token,
+            **kwargs,
+        )
+        self.pat = r"\S+\n?"  # String matching pattern of BlenderbotSmall is different from Blenderbot
+        self.unk_id = self.encoder[unk_token]
+        self.eol_token = eol_token
+
+    def bpe(self, token):
+        """
+        Apply Byte-Pair-Encoding on token.
+        The process of bpe in BlenderbotSmall is different from Blenderbot.
+        Args:
+            token (str): The token to be converted.
+
+        Returns:
+            str: Converted token.
+        """
+        if token in self.cache:
+            return self.cache[token]
+        token = re.sub("([.,!?()])", r" \1", token)
+        token = re.sub("(')", r" \1 ", token)
+        token = re.sub(r"\s{2,}", " ", token)
+        if "\n" in token:
+            token = token.replace("\n", self.eol_token)
+        tokens = token.split(" ")
+        words = []
+        for token in tokens:
+            if not len(token):
+                continue
+
+            token = token.lower()
+            word = tuple(token)
+            # "</w>" marks the word boundary for BPE; it is removed again below via word[:-4].
+            word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
+            pairs = get_pairs(word)
+
+            if not pairs:
+                words.append(token)
+                continue
+
+            while True:
+                bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+                if bigram not in self.bpe_ranks:
+                    break
+                first, second = bigram
+                new_word = []
+                i = 0
+
+                while i < len(word):
+                    try:
+                        j = word.index(first, i)
+                        new_word.extend(word[i:j])
+                        i = j
+                    except ValueError:
+                        new_word.extend(word[i:])
+                        break
+
+                    if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                        new_word.append(first + second)
+                        i += 2
+                    else:
+                        new_word.append(word[i])
+                        i += 1
+                new_word = tuple(new_word)
+                word = new_word
+                if len(word) == 1:
+                    break
+                else:
+                    pairs = get_pairs(word)
+            word = "@@ ".join(word)
+            word = word[:-4]
+
+            self.cache[token] = word
+            words.append(word)
+        return " ".join(words)
+
+    def convert_tokens_to_string(self, tokens):
+        """
+        Converts a sequence of tokens (list of strings) to a single string.
+        Args:
+            tokens (list[str]): A sequence of tokens.
+ + Returns: + str: Converted string. + """ + return " ".join(tokens).replace("@@ ", "").strip() + + def convert_ids_to_string(self, ids, skip_special_tokens=True, clean_up_tokenization_spaces=True): + """ + Converts a sequence of ids (list of integers) to a single string. + Args: + ids (list[int]): + A sequence of ids corresponding to tokens. + skip_special_tokens (bool, optional): + Whether to skip and not decode special tokens when converting. Defaults to `False`. + clean_up_tokenization_spaces (bool, optional): + Whether to Clean up a list of simple English tokenization artifacts + like spaces before punctuations and abbreviated forms. + Returns: + str: Converted string. + """ + tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens) + output_string = self.convert_tokens_to_string(tokens) + if clean_up_tokenization_spaces: + output_string = ( + output_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return output_string diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/configuration.py new file mode 100644 index 000000000..59955092a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/configuration.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
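# Illustrative usage sketch (not from the upstream file): this module defines nested
# configuration classes (BlipTextConfig and BlipVisionConfig, wrapped by BlipConfig),
# used roughly as in the docstring examples below:
#
#     from paddlenlp.transformers import BlipTextConfig, BlipTextModel
#
#     text_config = BlipTextConfig()      # defaults follow Salesforce/blip-vqa-base
#     model = BlipTextModel(text_config)  # randomly initialized weights
#
#     # When loading from a checkpoint that stores a full BlipConfig,
#     # BlipTextConfig.from_pretrained() extracts the nested "text_config" dict (see below).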
+""" Blip model configuration""" + +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "BlipTextConfig", + "BlipVisionConfig", + "BlipConfig", +] + + +class BlipTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP + text model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the `BlipText` used by the [base + architectures](https://huggingface.co/Salesforce/blip-vqa-base). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the `Blip` text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`BlipModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + encoder_hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers from the vision model. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults + to 1e-12): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + bos_token_id (`int`, *optional*, defaults to 30522): + The id of the `beginning-of-sequence` token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the `end-of-sequence` token. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the `padding` token. + sep_token_id (`int`, *optional*, defaults to 102): + The id of the `separator` token. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ + Example: + + ```python + >>> from paddlenlp.transformers import BlipTextConfig, BlipTextModel + + >>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipTextConfig() + + >>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "blip_text_model" + + def __init__( + self, + vocab_size=30524, + hidden_size=768, + encoder_hidden_size=768, + intermediate_size=3072, + projection_dim=768, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=512, + hidden_act="gelu", + layer_norm_eps=1e-12, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + initializer_factor=1.0, + bos_token_id=30522, + eos_token_id=2, + pad_token_id=0, + sep_token_id=102, + is_decoder=True, + use_cache=True, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_hidden_size = encoder_hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.is_decoder = is_decoder + self.use_cache = use_cache + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from BlipConfig + if config_dict.get("model_type") == "blip": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class BlipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a + BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the Blip-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults + to 1e-6): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from paddlenlp.transformers import BlipVisionConfig, BlipVisionModel + + >>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipVisionConfig() + + >>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=384, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=0.000001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from BlipConfig + if config_dict.get("model_type") == "blip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are 
using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class BlipConfig(PretrainedConfig):
+    r"""
+    [`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate
+    a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the BLIP-base
+    [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`BlipTextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`BlipVisionConfig`].
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
+        image_text_hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the hidden state of the image-text fusion layer.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from paddlenlp.transformers import BlipConfig, BlipModel
+
+    >>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration
+    >>> configuration = BlipConfig()
+
+    >>> # Initializing a BlipModel (with random weights) from the Salesforce/blip-vqa-base style configuration
+    >>> model = BlipModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig
+
+    >>> # Initializing a BLIPText and BLIPVision configuration
+    >>> config_text = BlipTextConfig()
+    >>> config_vision = BlipVisionConfig()
+
+    >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision)
+    ```"""
+
+    model_type = "blip"
+    is_composition = True
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        projection_dim=512,
+        logit_scale_init_value=2.6592,
+        image_text_hidden_size=256,
+        **kwargs
+    ):
+        kwargs["return_dict"] = kwargs.pop("return_dict", True)
+        super().__init__(**kwargs)
+
+        # If `_config_dict` exists, we use it for backward compatibility.
+        text_config_dict = kwargs.pop("text_config_dict", None)
+        vision_config_dict = kwargs.pop("vision_config_dict", None)
+        if text_config_dict is not None:
+            text_config = text_config_dict
+        if vision_config_dict is not None:
+            vision_config = vision_config_dict
+
+        if text_config is None:
+            text_config = {}
+            logger.info("text_config is None. Initializing the BlipTextConfig with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("vision_config is None. 
initializing the BlipVisionConfig with default values.") + + text_config["projection_dim"] = projection_dim + vision_config["projection_dim"] = projection_dim + self.text_config = BlipTextConfig(**text_config) + self.vision_config = BlipVisionConfig(**vision_config) + + self.text_config.encoder_hidden_size = self.vision_config.hidden_size + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + self.image_text_hidden_size = image_text_hidden_size + + @classmethod + def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs): + r""" + Instantiate a [`BlipConfig`] (or a derived class) from blip text model configuration and blip vision model + configuration. + + Returns: + [`BlipConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/image_processing.py new file mode 100644 index 000000000..569f387ea --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/image_processing.py @@ -0,0 +1,285 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BLIP.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = [ + "BlipImageProcessor", +] + + +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. 
+ size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. 
+ + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. 
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - Unset: defaults to the channel dimension format of the input image.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+
+        if not is_batched(images):
+            images = [images]
+
+        if not valid_images(images):
+            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or paddle.Tensor.")
+
+        if do_resize and (size is None or resample is None):
+            raise ValueError("Size and resample must be specified if do_resize is True.")
+
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+        if do_normalize and (image_mean is None or image_std is None):
+            raise ValueError("Image mean and std must be specified if do_normalize is True.")
+
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
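+        # (to_numpy_array yields HWC arrays; resize, rescale and normalize keep that layout,
+        # and the requested channel order is only applied by to_channel_dimension_format below.)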
+ images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling.py new file mode 100644 index 000000000..66c2f9cb6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling.py @@ -0,0 +1,1590 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Paddle BLIP model.""" + +from dataclasses import dataclass +from functools import partial +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput +from ..model_utils import PretrainedModel +from .configuration import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_text import BlipTextLMHeadModel, BlipTextModel + +BLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Salesforce/blip-vqa-base", + "Salesforce/blip-vqa-capfilt-large", + "Salesforce/blip-image-captioning-base", + "Salesforce/blip-image-captioning-large", + "Salesforce/blip-itm-base-coco", + "Salesforce/blip-itm-large-coco", + "Salesforce/blip-itm-base-flickr", + "Salesforce/blip-itm-large-flickr", +] + +__all__ = [ + "BlipPretrainedModel", + "BlipVisionModel", + "BlipModel", + "BlipForConditionalGeneration", + "BlipForQuestionAnswering", + "BlipForImageTextRetrieval", +] + + +# Copied from transformers.models.clip.modeling_clip.contrastive_loss +def contrastive_loss(logits: paddle.Tensor) -> paddle.Tensor: + return F.cross_entropy(logits, paddle.arange(len(logits))) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->blip +def blip_loss(similarity: paddle.Tensor) -> paddle.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +@dataclass +class BlipForConditionalGenerationModelOutput(ModelOutput): + """ + Adapted from the base class for vision 
model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Languge modeling loss from the text decoder. + decoder_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + decoder_logits: Optional[Tuple[paddle.Tensor]] = None + image_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BlipTextVisionModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + image_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + image_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BlipImageTextMatchingModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + itm_score (`paddle.Tensor`): + The image-text similarity scores. + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + image_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + vision_pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + question_embeds (`paddle.Tensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: Optional[paddle.Tensor] = None + loss: Optional[paddle.Tensor] = None + image_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + vision_pooler_output: Optional[paddle.Tensor] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + question_embeds: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BlipOutput(ModelOutput): + """ + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`paddle.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`paddle.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. 
+ image_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. + """ + + loss: Optional[paddle.Tensor] = None + logits_per_image: paddle.Tensor = None + logits_per_text: paddle.Tensor = None + text_embeds: paddle.Tensor = None + image_embeds: paddle.Tensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class BlipVisionEmbeddings(nn.Layer): + def __init__(self, config: BlipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim])) + self.patch_embedding = nn.Conv2D( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter(paddle.randn([1, self.num_positions, self.embed_dim])) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip +class BlipTextEmbeddings(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").reshape((1, -1)) + ) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class BlipAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + 
self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = nn.Dropout(config.attention_dropout)
+
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim)
+
+        self.projection = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
+        return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.shape
+
+        # Project once to packed QKV and split into [3, bsz, num_heads, tgt_len, head_dim].
+        mixed_qkv = (
+            self.qkv(hidden_states)
+            .reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads])
+            .transpose([2, 0, 3, 1, 4])
+        )
+        query_states, key_states, value_states = (
+            mixed_qkv[0],
+            mixed_qkv[1],
+            mixed_qkv[2],
+        )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = paddle.matmul(query_states, key_states, transpose_y=True)
+
+        attention_scores = attention_scores * self.scale
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = F.softmax(attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3])
+
+        new_context_layer_shape = context_layer.shape[:-2] + [
+            self.embed_dim,
+        ]
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        output = self.projection(context_layer)
+
+        outputs = (output, attention_probs) if output_attentions else (output, None)
+
+        return outputs
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip
+class BlipMLP(nn.Layer):
+    def __init__(self, config: BlipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class BlipEncoderLayer(nn.Layer):
+    def __init__(self, config: BlipVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = BlipAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
+        self.mlp = BlipMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: paddle.Tensor,
+        attention_mask: paddle.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[paddle.Tensor]:
+        """
+        Args:
+            hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`paddle.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BlipPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipConfig + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def init_weights(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). + """ + self.apply(self._init_weights) + + def gradient_checkpointing_enable(self): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, BlipVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BlipEncoder): + module.gradient_checkpointing = value + + +class BlipEncoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipVisionConfig`): + The corresponding vision configuration for the `BlipEncoder`. 
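+
+    Example (an illustrative sketch, not part of the public API: `BlipEncoder` is an internal
+    building block that `BlipVisionModel` constructs for you; the shapes below assume the default
+    `BlipVisionConfig`):
+
+    ```python
+    >>> import paddle
+    >>> config = BlipVisionConfig()  # hidden_size=768, image_size=384, patch_size=16
+    >>> encoder = BlipEncoder(config)
+    >>> seq_len = (config.image_size // config.patch_size) ** 2 + 1  # 576 patches + [CLS] token
+    >>> inputs_embeds = paddle.randn([1, seq_len, config.hidden_size])
+    >>> outputs = encoder(inputs_embeds=inputs_embeds, return_dict=True)
+    >>> outputs.last_hidden_state.shape
+    [1, 577, 768]
+    ```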
+ """ + + def __init__(self, config: BlipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.LayerList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BlipVisionModel(BlipPretrainedModel): + r""" + The vision model from BLIP without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + Args: + config (:class:`BlipVisionConfig`): + An instance of BlipVisionConfig used to construct BlipVisionModel. + """ + main_input_name = "pixel_values" + config_class = BlipVisionConfig + + def __init__(self, config: BlipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.embeddings + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~BaseModelOutputWithPooling`] instead of a plain tuple. + Returns: + An instance of :class:`BaseModelOutputWithPooling` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPooling`. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BLIPProcessor, BLIPVisionModel + + >>> model = BLIPVisionModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model.eval() + >>> processor = BLIPProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class BlipModel(BlipPretrainedModel): + r""" + The bare BLIP Model outputting logits_per_image and logits_per_text. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BlipConfig`): + An instance of BlipConfig used to construct BlipModel. + """ + config_class = BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + if not isinstance(config.text_config, BlipTextConfig): + raise ValueError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." 
+ ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = BlipTextModel(text_config) + self.vision_model = BlipVisionModel(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias_attr=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias_attr=False) + self.logit_scale = Parameter( + paddle.ones( + [ + 1, + ] + ) + * config.logit_scale_init_value + ) + + # Initialize weights and apply final processing + self.init_weights() + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`BertTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`BlipTextModel`]. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import BlipProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BLIPImageProcessor`]. See [`BLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`BlipVisionModel`]. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> image_features = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BlipOutput]: + r""" + The BLIPPModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in + the range ``[0, max_text_length - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape `[batch_size, sequence_length`. + Defaults to `None`, which means nothing needed to be prevented attention to. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`BlipOutput` object. 
If `False`, the output + will be a tuple of tensors. Defaults to `True`. + Returns: + An instance of :class:`BlipOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BlipOutput`. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pd", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = F.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ```""" + # Use BLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = F.normalize(image_embeds, axis=-1) + text_embeds = F.normalize(text_embeds, axis=-1) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = paddle.matmul(text_embeds, image_embeds, transpose_y=True) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return BlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class BlipForConditionalGeneration(BlipPretrainedModel): + r""" + BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. 
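Both captioning modes described in the previous paragraph are exercised through `generate`: with only `pixel_values` the decoder starts from the [BOS] token, while passing a text prompt makes it continue that prompt. A usage sketch; the prompt string and the `max_length` value are illustrative, and extra keyword arguments are forwarded to the text decoder's `generate`:

```python
import requests
from PIL import Image
from paddlenlp.transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Unconditional captioning: the decoder starts from the [BOS] token only.
inputs = processor(images=image, return_tensors="pd")
ids = model.generate(**inputs, max_length=30)[0]
print(processor.decode(ids[0], skip_special_tokens=True))

# Conditional captioning: the tokenized prompt is passed as `input_ids`
# and the decoder continues it.
inputs = processor(images=image, text="a photography of", return_tensors="pd")
ids = model.generate(**inputs, max_length=30)[0]
print(processor.decode(ids[0], skip_special_tokens=True))
```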
+ + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BlipConfig`): + An instance of BlipConfig used to construct BlipForConditionalGeneration. + """ + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BlipForConditionalGenerationModelOutput]: + r""" + Args: + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in + the range ``[0, max_text_length - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape `[batch_size, sequence_length`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + return_dict (`bool`, *optional*): + Whether or not to return a [`BlipForConditionalGenerationModelOutput`] instead of a plain tuple. 
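This `forward` also covers training-style use: as the implementation that follows shows, omitted `labels` are derived from `input_ids`, with padding positions replaced by -100 so the language-modeling loss ignores them. A sketch with an illustrative target caption, requesting `return_dict=True` explicitly:

```python
import requests
from PIL import Image
from paddlenlp.transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model.eval()

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Illustrative target caption; in practice this comes from the training data.
inputs = processor(images=image, text="two cats sleeping on a couch", return_tensors="pd")

outputs = model(**inputs, return_dict=True)
print(float(outputs.loss))           # captioning (language-modeling) loss
print(outputs.decoder_logits.shape)  # [batch_size, sequence_length, vocab_size]
```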
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipForConditionalGeneration + + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model.eval() + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + ```""" + batch_size = pixel_values.shape[0] + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + + if input_ids is None: + input_ids = paddle.to_tensor([[self.decoder_input_ids] * batch_size]) + + if labels is None: + labels = paddle.where(input_ids == self.decoder_pad_token_id, paddle.to_tensor(-100), input_ids) + + outputs = self.text_decoder( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + return_dict=return_dict, + ) + + if not return_dict: + outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return BlipForConditionalGenerationModelOutput( + loss=outputs.loss, + decoder_logits=outputs.logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs + ) -> paddle.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Args: + pixel_values (*paddle.Tensor* of shape *(batch_size, image_width, image_height)*: + Input image to be processed + input_ids (*paddle.Tensor* of shape *(batch_size, sequence_length)*, *optional*): + The sequence used as a prompt for the generation. + attention_mask (*paddle.Tensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`: + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipForConditionalGeneration + + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model.generate(**inputs)[0] + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats are laying on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model( + pixel_values=pixel_values, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype=paddle.int64) + + if isinstance(input_ids, list): + input_ids = paddle.to_tensor(input_ids) + elif input_ids is None: + input_ids = paddle.to_tensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]]).tile( + [batch_size, 1] + ) + + input_ids[:, 0] = self.config.text_config.bos_token_id + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + +class BlipForQuestionAnswering(BlipPretrainedModel): + r""" + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BlipConfig`): + An instance of BlipConfig used to construct BlipForQuestionAnswering. 
+ """ + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_bos_token_id = config.text_config.bos_token_id + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + input_ids: paddle.Tensor, + pixel_values: paddle.Tensor, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BlipTextVisionModelOutput]: + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape `[batch_size, sequence_length`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + return_dict (`bool`, *optional*): + Whether or not to return a [`BlipTextVisionModelOutput`] instead of a plain tuple. 
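The three submodules created in `__init__` above implement the encode-fuse-decode flow that the class docstring describes. The sketch below re-traces that flow through the public attributes, mirroring what the model's own `generate` (defined later in this class) does internally; it is meant only to make the data flow explicit and assumes the processor may or may not return an `attention_mask`.

```python
import paddle
import requests
from PIL import Image
from paddlenlp.transformers import BlipProcessor, BlipForQuestionAnswering

model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
model.eval()
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="How many cats are in the picture?", return_tensors="pd")

with paddle.no_grad():
    # 1. Encode the image.
    image_embeds = model.vision_model(pixel_values=inputs["pixel_values"])[0]
    image_atts = paddle.ones(image_embeds.shape[:-1], dtype=paddle.int64)

    # 2. Fuse the question with the image features via cross-attention in the text encoder.
    question_embeds = model.text_encoder(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=image_atts,
    )[0]

    # 3. Decode an answer conditioned on the fused question representation.
    question_atts = paddle.ones(question_embeds.shape[:-1], dtype=paddle.int64)
    bos_ids = paddle.full((question_embeds.shape[0], 1), model.decoder_bos_token_id, dtype=paddle.int64)
    answer_ids = model.text_decoder.generate(
        input_ids=bos_ids,
        eos_token_id=model.config.text_config.sep_token_id,
        pad_token_id=model.config.text_config.pad_token_id,
        encoder_hidden_states=question_embeds,
        encoder_attention_mask=question_atts,
    )[0]

print(processor.decode(answer_ids[0], skip_special_tokens=True))
```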
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="pd") + + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + batch_size = input_ids.shape[0] + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype=paddle.int64) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + if decoder_input_ids is None: + # (TODO, junnyu) [batch_size, 2] + decoder_input_ids = paddle.to_tensor([self.decoder_bos_token_id]).tile((batch_size, 2)) + + if labels is None: + labels = paddle.where( + decoder_input_ids == self.decoder_pad_token_id, paddle.to_tensor(-100), decoder_input_ids + ) + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + return_dict=return_dict, + reduction="none", + ) + + decoder_loss = answer_output.loss.mean() if return_dict else answer_output[0].mean() + + if not return_dict: + outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return BlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @paddle.no_grad() + def generate( + self, + input_ids: paddle.Tensor, + pixel_values: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs + ) -> paddle.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Args: + input_ids (*paddle.Tensor* of shape *(batch_size, sequence_length)*): + The sequence used as a prompt for the generation. + pixel_values (*paddle.Tensor* of shape *(batch_size, image_width, image_height)*: + Input image to be processed + attention_mask (*paddle.Tensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. 
+ **generate_kwargs: + Additional arguments passed to the *generate* function of the decoder + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="pd") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model( + pixel_values=pixel_values, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype=paddle.int64) + + if isinstance(input_ids, list): + input_ids = paddle.to_tensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = paddle.ones(question_embeds.shape[:-1], dtype=paddle.int64) + + bos_ids = paddle.full((question_embeds.shape[0], 1), fill_value=self.decoder_bos_token_id, dtype=paddle.int64) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + +class BlipForImageTextRetrieval(BlipPretrainedModel): + r""" + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BlipConfig`): + An instance of BlipConfig used to construct BlipForImageTextRetrieval. 
+ """ + config_class = BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + # vision projection layer + self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size) + + # text projection layer + self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size) + + # image text matching head + self.itm_head = nn.Linear(config.text_config.hidden_size, 2) + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_bos_token_id = config.text_config.bos_token_id + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + input_ids: paddle.Tensor, + pixel_values: paddle.Tensor, + use_itm_head: Optional[bool] = True, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BlipTextVisionModelOutput]: + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + use_itm_head (bool, optional): + Whether to use itm head. + Defaults to `True`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape `[batch_size, sequence_length`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + return_dict (`bool`, *optional*): + Whether or not to return a [`BlipTextVisionModelOutput`] instead of a plain tuple. 
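The `use_itm_head` switch documented above selects between the two heads built in `__init__`: the binary image-text matching classifier, or the cosine similarity of the projected [CLS] features. A sketch of both calls; treating the softmax over logit index 1 as the "matched" probability follows the original BLIP convention and is an assumption about the checkpoint, not something this patch enforces.

```python
import paddle
import paddle.nn.functional as F
import requests
from PIL import Image
from paddlenlp.transformers import BlipProcessor, BlipForImageTextRetrieval

model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base")
model.eval()
processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="an image of a cat", return_tensors="pd")

with paddle.no_grad():
    # Cross-attention ITM head: two logits per pair (not matched / matched).
    itm_out = model(**inputs, use_itm_head=True, return_dict=True)
    match_prob = F.softmax(itm_out.itm_score, axis=-1)[:, 1]

    # Contrastive branch: cosine similarity of the projected [CLS] features.
    itc_out = model(**inputs, use_itm_head=False, return_dict=True)
    cosine_score = itc_out.itm_score

print(float(match_prob), float(cosine_score))
```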
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import BlipProcessor, BlipForImageTextRetrieval + + >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base") + >>> model.eval() + >>> processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="pd") + >>> outputs = model(**inputs) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + image_atts = paddle.ones(image_embeds.shape[:-1], dtype=paddle.int64) + + if use_itm_head: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=return_dict, + ) + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + output = self.itm_head(question_embeds[:, 0, :]) + else: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=return_dict, + ) + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + image_feat = F.normalize(self.vision_proj(image_embeds[:, 0, :]), axis=-1) + text_feat = F.normalize(self.text_proj(question_embeds[:, 0, :]), axis=-1) + + output = paddle.matmul(image_feat, text_feat, transpose_y=True) + + if not return_dict: + outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,) + return tuple(output for output in outputs if output is not None) + + return BlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling_text.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling_text.py new file mode 100644 index 000000000..71c0a7060 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/modeling_text.py @@ -0,0 +1,1101 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
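Before the text-side stack that this new `modeling_text.py` file introduces, a brief note on the contrastive path: when `BlipModel.forward` earlier in this diff is called with `return_loss=True`, the similarity matrix is handed to `blip_loss`, whose definition lives earlier in `modeling.py` and is not part of this hunk. The sketch below shows the generic CLIP-style symmetric cross-entropy that this family of losses is built on; it is an illustration, not a copy of the patch's `blip_loss`.

```python
import paddle
import paddle.nn.functional as F


def clip_style_contrastive_loss(logits_per_text: paddle.Tensor) -> paddle.Tensor:
    """Symmetric cross-entropy over an [N, N] text-to-image similarity matrix."""
    # Matching text-image pairs sit on the diagonal, so the target for row i is i.
    targets = paddle.arange(logits_per_text.shape[0])
    caption_loss = F.cross_entropy(logits_per_text, targets)
    image_loss = F.cross_entropy(logits_per_text.t(), targets)
    return (caption_loss + image_loss) / 2.0


# Toy check with a random 4x4 similarity matrix.
logits = paddle.randn([4, 4])
print(float(clip_style_contrastive_loss(logits)))
```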
+ + +import inspect +import math +from functools import partial +from typing import Callable, Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute + +from ...utils.initializer import normal_, ones_, zeros_ +from ...utils.log import logger +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from ..model_utils import PretrainedModel +from .configuration import BlipTextConfig + +__all__ = [ + "BlipTextPretrainedModel", + "BlipTextModel", + "BlipTextLMHeadModel", +] + + +def apply_chunking_to_forward( + forward_fn: Callable[..., paddle.Tensor], chunk_size: int, chunk_dim: int, *input_tensors +) -> paddle.Tensor: + """ + This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension + `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. + + If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly + applying `forward_fn` to `input_tensors`. + + Args: + forward_fn (`Callable[..., paddle.Tensor]`): + The forward function of the model. + chunk_size (`int`): + The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. + chunk_dim (`int`): + The dimension over which the `input_tensors` should be chunked. + input_tensors (`Tuple[paddle.Tensor]`): + The input tensors of `forward_fn` which will be chunked + + Returns: + `paddle.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`. + + + Examples: + + ```python + # rename the usual forward() fn to forward_chunk() + def forward_chunk(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + + + # implement a chunked forward function + def forward(self, hidden_states): + return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) + ```""" + + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + + if chunk_size > 0: + tensor_shape = input_tensors[0].shape[chunk_dim] + for input_tensor in input_tensors: + if input_tensor.shape[chunk_dim] != tensor_shape: + raise ValueError( + f"All input tenors have to be of the same shape: {tensor_shape}, " + f"found shape {input_tensor.shape[chunk_dim]}" + ) + + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + + num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size + + # chunk input tensor into tuples + input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, axis=chunk_dim) for input_tensor in input_tensors) + # apply forward fn to every tuple + output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) + # concatenate output at same dimension + return 
paddle.concat(output_chunks, axis=chunk_dim) + + return forward_fn(*input_tensors) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class BlipTextEmbeddings(nn.Layer): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config: BlipTextConfig): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size + ) # , padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").reshape((1, -1)) + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class BlipTextSelfAttention(nn.Layer): + def __init__(self, config: BlipTextConfig, is_cross_attention: bool = False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scale = math.sqrt(self.attention_head_size) + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = 
attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype=paddle.int64).reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype=paddle.int64).reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / self.scale + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert -> BlipText +class BlipTextSelfOutput(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class BlipTextAttention(nn.Layer): + def __init__(self, config: BlipTextConfig, is_cross_attention: bool = False): + super().__init__() + self.self = BlipTextSelfAttention(config, is_cross_attention) + self.output = BlipTextSelfOutput(config) + self.pruned_heads = set() + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert -> BlipText +class BlipTextIntermediate(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert -> BlipText +class BlipTextOutput(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BlipTextLayer(nn.Layer): + def __init__(self, config: 
BlipTextConfig, layer_num: int): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BlipTextAttention(config) + self.layer_num = layer_num + if self.config.is_decoder: + self.crossattention = BlipTextAttention(config, is_cross_attention=self.config.is_decoder) + self.intermediate = BlipTextIntermediate(config) + self.output = BlipTextOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +class BlipTextEncoder(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + self.config = config + self.layer = nn.LayerList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.is_decoder else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BlipText +class BlipTextPooler(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BlipText +class BlipTextPredictionHeadTransform(nn.Layer): + def __init__(self, config: BlipTextConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BlipText +class BlipTextLMPredictionHead(nn.Layer): + def __init__(self, config: BlipTextConfig, embedding_weights=None): + super().__init__() + self.transform = BlipTextPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder_weight = ( + self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.transform.weight.dtype, is_bias=False + ) + if embedding_weights is None + else embedding_weights + ) + + self.bias = self.create_parameter( + shape=[ + config.vocab_size, + ], + dtype=self.decoder_weight.dtype, + is_bias=True, + ) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = paddle.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.bias + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText +class BlipTextOnlyMLMHead(nn.Layer): + """ + Perform language modeling task. + + Args: + config (:class:`BlipTextConfig`): + An instance of BlipTextConfig used to construct BlipTextLMHeadModel. + embedding_weights (Tensor, optional): + Decoding weights used to map hidden_states to logits of the masked token prediction. + Its data type should be float32 and its shape is [vocab_size, hidden_size]. + Defaults to `None`, which means use the same weights of the embedding layer. + + """ + + def __init__(self, config: BlipTextConfig, embedding_weights=None): + super().__init__() + self.predictions = BlipTextLMPredictionHead(config, embedding_weights) + + def forward(self, sequence_output: paddle.Tensor) -> paddle.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class BlipTextPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipTextConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + normal_(module.weight, mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + if isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def gradient_checkpointing_enable(self): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". 
+ """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BlipTextEncoder): + module.gradient_checkpointing = value + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class BlipTextModel(BlipTextPretrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`BlipTextConfig`): + An instance of BlipTextConfig used to construct BlipTextModel. + """ + + def __init__(self, config: BlipTextConfig, add_pooling_layer: bool = True): + super().__init__(config) + self.config = config + + self.embeddings = BlipTextEmbeddings(config) + self.encoder = BlipTextEncoder(config) + self.pooler = BlipTextPooler(config) if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @property + def dtype(self): + return self.embeddings.word_embeddings.weight.dtype + + def get_extended_attention_mask( + self, attention_mask: paddle.Tensor, input_shape: Tuple[int], is_decoder: bool + ) -> paddle.Tensor: + if attention_mask.ndim == 3: + extended_attention_mask = attention_mask.unsqueeze(1) + elif attention_mask.ndim == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + seq_ids = paddle.arange(seq_length) + causal_mask = paddle.tile( + seq_ids.unsqueeze(axis=[0, 1]), [batch_size, seq_length, 1] + ) <= seq_ids.unsqueeze(axis=[0, 2]) + causal_mask = causal_mask.cast(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = paddle.concat( + [ + paddle.ones( + [batch_size, seq_length, prefix_seq_len], + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask.unsqueeze(1) * attention_mask.unsqueeze([1, 2]) + else: + extended_attention_mask = attention_mask.unsqueeze([1, 2]) + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + extended_attention_mask = extended_attention_mask.cast(self.dtype) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, 
encoder_attention_mask): + if encoder_attention_mask.ndim == 4: + encoder_extended_attention_mask = encoder_attention_mask + elif encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask.unsqueeze(1) + elif encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask.unsqueeze([1, 2]) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast(self.dtype) # fp16 compatibility + + if self.dtype == paddle.float16: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + elif self.dtype == paddle.float32: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + else: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See [`PretrainedTokenizer.encode`] and + [`PretrainedTokenizer.__call__`] for details. + + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + inputs_embeds (`paddle.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + encoder_embeds (`paddle.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Optionally, same as inputs_embeds. + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPoolingAndCrossAttentions`] instead of a plain tuple. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + batch_size, seq_length = input_shape + elif encoder_embeds is not None: + input_shape = encoder_embeds.shape[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # cache_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = paddle.ones((batch_size, seq_length + past_key_values_length)) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
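+        # get_extended_attention_mask converts the 0/1 mask into an additive mask: kept positions become 0.0 and
+        # masked positions become -10000.0, shaped [batch_size, 1, seq_length, seq_length] in the decoder (causal)
+        # case and [batch_size, 1, 1, seq_length] in the encoder case, so it can be added directly to the raw
+        # attention scores. For example, attention_mask=[[1, 1, 0]] becomes [[[[0.0, 0.0, -10000.0]]]].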
+        extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, is_decoder
+        )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, (list, tuple)):
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+            if isinstance(encoder_attention_mask, (list, tuple)):
+                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
+            elif encoder_attention_mask is None:
+                encoder_attention_mask = paddle.ones(encoder_hidden_shape)
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        if encoder_embeds is None:
+            embedding_output = self.embeddings(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+                past_key_values_length=past_key_values_length,
+            )
+        else:
+            embedding_output = encoder_embeds
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            if pooled_output is None:
+                # note: we do not output pooled_output
+                return (sequence_output,) + encoder_outputs[1:]
+            else:
+                return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
+class BlipTextLMHeadModel(BlipTextPretrainedModel):
+    """
+    BLIP text model with a `language modeling` head on top, used as a text decoder.
+
+    Args:
+        config (:class:`BlipTextConfig`):
+            An instance of BlipTextConfig used to construct BlipTextLMHeadModel.
+ + """ + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config: BlipTextConfig): + super().__init__(config) + + self.bert = BlipTextModel(config, add_pooling_layer=False) + self.cls = BlipTextOnlyMLMHead(config, embedding_weights=self.bert.embeddings.word_embeddings.weight) + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BertTokenizer`]. See [`PretrainedTokenizer.encode`] and + [`PretrainedTokenizer.__call__`] for details. + + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + inputs_embeds (`paddle.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`CausalLMOutputWithCrossAttentions`] instead of a plain tuple. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :] + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + labels = labels[:, 1:] + loss_fct = nn.CrossEntropyLoss(reduction=reduction) # TODO label_smoothing=0.1 + lm_loss = loss_fct(shifted_prediction_scores.reshape([-1, self.config.vocab_size]), labels.flatten()) + if reduction == "none": + lm_loss = lm_loss.reshape([prediction_scores.shape[0], -1]).sum(1) + + if not return_dict: + # note: we donot output pooler + if self.bert.pooler is None: + output = (prediction_scores,) + outputs[1:] + else: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = paddle.ones(input_shape, dtype=input_ids.dtype) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + # we must set return_dict False + "return_dict": False, + } + + def prepare_attention_mask_for_generation( + self, + inputs: paddle.Tensor, + pad_token_id: Optional[int], + eos_token_id: Optional[int], + ) -> paddle.Tensor: + # donot create 4d attention mask + is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [paddle.int32, paddle.int64] + is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs.tolist()) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (pad_token_id != eos_token_id) + + # Check if input is input_ids and padded -> only then is attention_mask defined + if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id: + return (inputs != pad_token_id).cast("int64") + else: + return paddle.ones(inputs.shape[:2], dtype=paddle.int64) diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/processing.py
new file mode 100644
index 000000000..e32bdfa5a
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip/processing.py
@@ -0,0 +1,119 @@
+# coding=utf-8
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Blip.
+"""
+
+
+from ..processing_utils import ProcessorMixin
+from ..tokenizer_utils_base import BatchEncoding
+
+__all__ = [
+    "BlipProcessor",
+]
+
+
+class BlipProcessor(ProcessorMixin):
+    r"""
+    Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor.
+
+    [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizer`]. See the
+    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
+
+    Args:
+        image_processor (`BlipImageProcessor`):
+            An instance of [`BlipImageProcessor`]. The image processor is a required input.
+        tokenizer (`BertTokenizer`):
+            An instance of [`BertTokenizer`]. The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "BlipImageProcessor"
+    tokenizer_class = "BertTokenizer"
+
+    def __init__(self, image_processor, tokenizer):
+        tokenizer.model_input_names = ["input_ids", "attention_mask"]
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        """
+        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+        and `kwargs` arguments to Bert's [`~BertTokenizer.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        BlipImageProcessor's [`~BlipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[paddle.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or Paddle
+                tensor. In case of a NumPy array/Paddle tensor, each image should be of shape (C, H, W), where C is the
+                number of channels, H and W are image height and width.
+ + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'pd'`: Return Paddle `paddle.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/configuration.py
new file mode 100644
index 000000000..0dbf73dd7
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/configuration.py
@@ -0,0 +1,366 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" BLIP-2 model configuration"""
+import copy
+import os
+from typing import Union
+
+from paddlenlp.transformers import AutoConfig
+
+from ...utils.log import logger
+from ..auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+from ..configuration_utils import PretrainedConfig
+from ..opt.configuration import OPTConfig
+from ..t5.configuration import T5Config
+
+__all__ = [
+    "Blip2VisionConfig",
+    "Blip2QFormerConfig",
+    "Blip2Config",
+]
+
+
+class Blip2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a
+    BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the BLIP-2
+    [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1408):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 39):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import Blip2VisionConfig, Blip2VisionModel + >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2VisionConfig() + >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2VisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Blip2QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a + BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. 
+ Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
+ Examples: + ```python + >>> from paddlenlp.transformers import Blip2QFormerConfig, Blip2QFormerModel + >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2QFormerConfig() + >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "blip_2_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Blip2Config(PretrainedConfig): + r""" + [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is + used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. 
+ text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... Blip2VisionConfig, + ... Blip2QFormerConfig, + ... OPTConfig, + ... Blip2Config, + ... Blip2ForConditionalGeneration, + ... ) + >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2Config() + >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2ForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig + >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations + >>> vision_config = Blip2VisionConfig() + >>> qformer_config = Blip2QFormerConfig() + >>> text_config = OPTConfig() + >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "blip-2" + is_composition = True + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + self.vision_config = Blip2VisionConfig(**vision_config) + self.qformer_config = Blip2QFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" + # self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + if text_model_type == "t5": + self.text_config = T5Config(**text_config) + elif text_model_type == "opt": + self.text_config = OPTConfig(**text_config) + else: + self.text_config = AutoConfig(**text_config) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + # CONFIGURATION_MODEL_MAPPING = get_init_configurations() + # self.use_decoder_only_language_model = self.text_config.model_type in CONFIGURATION_MODEL_MAPPING + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: Blip2VisionConfig, + qformer_config: Blip2QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model + configurations. + Returns: + [`Blip2Config`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. 
Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/modeling.py new file mode 100644 index 000000000..4c350d8d5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/modeling.py @@ -0,0 +1,1679 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Paddle BLIP2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.utils.log import logger + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from ..model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ..opt.configuration import OPTConfig +from ..opt.modeling import OPTForCausalLM +from ..t5.configuration import T5Config +from ..t5.modeling import T5ForConditionalGeneration +from .configuration import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig + +BLIP_2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Salesforce/blip2-flan-t5-xl", +] + +__all__ = [ + "Blip2QFormerModel", + "Blip2Model", + "Blip2PretrainedModel", + "Blip2VisionModel", + "Blip2ForConditionalGeneration", +] + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +@dataclass +class Blip2ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`Blip2ForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. 
+ qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from paddlenlp.transformers.blip.modeling.BlipVisionEmbeddings with Blip->Blip2 +class Blip2VisionEmbeddings(nn.Layer): + def __init__(self, config: Blip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter( + paddle.randn([1, 1, self.embed_dim], dtype=paddle.get_default_dtype()), + ) + + self.patch_embedding = nn.Conv2D( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter( + paddle.randn([1, self.num_positions, self.embed_dim], dtype=paddle.get_default_dtype()) + ) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class Blip2Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype())) + v_bias = Parameter(paddle.zeros([self.embed_dim], dtype=paddle.get_default_dtype())) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
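+        # attention_probs has shape [bsz, num_heads, tgt_len, tgt_len] and each row sums to 1 after the softmax
+        # above; the dropout below randomly zeroes individual attention weights during training only.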
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from paddlenlp.transformers.blip.modeling.BlipMLP +class Blip2MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.blip.modeling.BlipEncoderLayer with Blip->Blip2 +class Blip2EncoderLayer(nn.Layer): + def __init__(self, config: Blip2Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Blip2Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = Blip2MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Blip2PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = Blip2Config + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + r"language_model.encoder.embed_tokens.weight", + r"language_model.decoder.embed_tokens.weight", + ] + _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] + _keep_in_fp32_modules = ["wo"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, Blip2VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, Blip2Encoder): + module.gradient_checkpointing = value + + +# Copied from paddlenlp.transformers.blip.modeling.BlipEncoder with Blip->Blip2 +class Blip2Encoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Blip2EncoderLayer`]. + Args: + config (`Blip2Config`): + The corresponding vision configuration for the `Blip2Encoder`. + """ + + def __init__(self, config: Blip2Config): + super().__init__() + self.config = config + self.layers = nn.LayerList([Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from paddlenlp.transformers.blip.modeling.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2 +class Blip2VisionModel(Blip2PretrainedModel): + main_input_name = "pixel_values" + config_class = Blip2VisionConfig + + def __init__(self, config: Blip2VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = Blip2VisionEmbeddings(config) + self.encoder = Blip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class Blip2QFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, 
is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
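+        # In the cross-attention case the key/value projections above come from encoder_hidden_states (the visual
+        # features), so the scores computed below have shape [batch_size, num_heads, query_len, kv_len] with kv_len
+        # equal to the number of visual tokens; for self-attention kv_len is the (possibly cached) sequence length.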
+ attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
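+        # Only the dropped copy below feeds the value aggregation; the un-dropped
+        # `attention_probs` is what is returned when `output_attentions=True` and
+        # what `save_attention_map` / the gradient hook captured above.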
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from paddlenlp.transformers.bert.modeling.BertSelfOutput with Bert->Blip2QFormer +class Blip2QFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from paddlenlp.transformers.bert.modeling.BertIntermediate with Bert->Blip2QFormer +class Blip2QFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def 
forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.bert.modeling.BertOutput with Bert->Blip2QFormer +class Blip2QFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = 
self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2QFormerModel(Blip2PretrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. 
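+
+    A stack of BERT-style attention blocks that runs over a fixed set of learnable query
+    embeddings and, in the layers where `layer_idx % cross_attention_frequency == 0`,
+    cross-attends to the image features produced by the vision encoder. The resulting
+    query states are the compact visual representation that is later projected into the
+    language model.
+
+    Example (a minimal sketch rather than a doctest; it assumes `Blip2QFormerConfig` and
+    `Blip2QFormerModel` are importable from `paddlenlp.transformers` and relies on the
+    config defaults for all sizes):
+
+    ```python
+    import paddle
+
+    from paddlenlp.transformers import Blip2QFormerConfig, Blip2QFormerModel
+
+    config = Blip2QFormerConfig()
+    qformer = Blip2QFormerModel(config)
+
+    # 32 learnable query tokens plus dummy image features standing in for the
+    # vision encoder output (last_hidden_state).
+    query_embeds = paddle.zeros([1, 32, config.hidden_size])
+    image_embeds = paddle.zeros([1, 257, config.encoder_hidden_size])
+
+    outputs = qformer(
+        query_embeds=query_embeds,
+        encoder_hidden_states=image_embeds,
+        return_dict=True,
+    )
+    # outputs.last_hidden_state has shape [1, 32, config.hidden_size]
+    ```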
+ """ + + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds.cast(self.layernorm.weight.dtype)) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, 
pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class Blip2Model(Blip2PretrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = Blip2QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + if config.use_decoder_only_language_model: + if isinstance(config.text_config, OPTConfig): + language_model = OPTForCausalLM(config.text_config) + else: + raise NotImplementedError + else: + if isinstance(config.text_config, T5Config): + language_model = T5ForConditionalGeneration(config.text_config) + else: + raise NotImplementedError + self.language_model = language_model + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import AutoTokenizer, Blip2Model + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt").to(device) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.use_decoder_only_language_model: + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + + text_outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import AutoProcessor, Blip2Model + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return vision_outputs + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. 
If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2Model + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pt") + >>> qformer_outputs = model.get_qformer_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return query_outputs + + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2Model + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model.to(device) # doctest: +IGNORE_RESULT + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> prompt = "Question: how many cats are there? 
Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pd") + >>> outputs = model(pixel_values=inputs["pixel_values"],input_ids=inputs["input_ids"]) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + + attention_mask = paddle.concat([language_model_attention_mask, attention_mask], axis=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct( + shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1]) + ) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class Blip2ForConditionalGeneration(Blip2PretrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = 
Blip2VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = Blip2QFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + if config.use_decoder_only_language_model: + if isinstance(config.text_config, OPTConfig): + language_model = OPTForCausalLM(config.text_config) + else: + raise NotImplementedError + else: + if isinstance(config.text_config, T5Config): + language_model = T5ForConditionalGeneration(config.text_config) + else: + raise NotImplementedError + self.language_model = language_model + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + Image captioning (without providing a text prompt): + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-flan-t5-xl" + ... ) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> generated_ids, scores = model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two cats laying on a couch + ``` + Visual question answering (prompt = question): + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import paddle + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-flan-t5-xl" + ... ) + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> prompt = "Question: how many cats are there? 
Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + inputs_embeds = self.language_model.get_input_embeddings()(input_ids).cast(dtype=language_model_inputs.dtype) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + + attention_mask = paddle.concat([language_model_attention_mask, attention_mask], axis=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct( + shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1]) + ) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: 
Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + batch_size = pixel_values.shape[0] + image_embeds = self.vision_model(pixel_values, return_dict=True).last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + language_model_inputs = self.language_projection(query_output) + language_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + if input_ids is None: + input_ids = paddle.to_tensor([[self.config.text_config.bos_token_id]]).tile([batch_size, 1]) + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + attention_mask = paddle.concat([language_attention_mask, attention_mask], axis=1) + + # concatenate query embeddings with prompt embeddings + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = paddle.concat([language_model_inputs, inputs_embeds], axis=1) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/processing.py new file mode 100644 index 000000000..65fdd7af2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/blip_2/processing.py @@ -0,0 +1,120 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for BLIP-2. +""" + +from typing import List, Optional, Union + +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import ( + BatchEncoding, + PreTokenizedInput, + TensorType, + TextInput, +) + +__all__ = [ + "Blip2Processor", +] + + +class Blip2Processor(ProcessorMixin): + r""" + Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor. 
+ [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring + of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. + Args: + image_processor (`BlipImageProcessor`): + An instance of [`BlipImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "BlipImageProcessor" + tokenizer_class = "AutoTokenizer" + + # Copied from paddlenlp.transformers.blip.processing.BlipProcessor.__init__ + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + # Copied from paddlenlp.transformers.blip.processing.BlipProcessor.__call__ + def __call__( + self, + images=None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchEncoding: + """ + This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + Please refer to the docstring of the above two methods for more information. + """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + # Get only text + if images is None: + self.current_processor = self.tokenizer + text_encoding = self.tokenizer( + text=text, + return_tensors=return_tensors, + **kwargs, + ) + return text_encoding + + # add pixel_values + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + + if text is not None: + text_encoding = self.tokenizer( + text=text, + return_tensors=return_tensors, + **kwargs, + ) + else: + text_encoding = None + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + + return encoding_image_processor + + # Copied from paddlenlp.transformers.blip.processing.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from paddlenlp.transformers.blip.processing.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. 
+ """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from paddlenlp.transformers.blip.processing.BlipProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/configuration.py new file mode 100644 index 000000000..adc227b33 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/configuration.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Bloom model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["BLOOM_PRETRAINED_INIT_CONFIGURATION", "BloomConfig", "BLOOM_PRETRAINED_RESOURCE_FILES_MAP"] + + +def _construct_resource_file_url(model_names: list[str], file_name: str) -> dict[str, str]: + """construct resource file dict object according to the file type + + TODO(wj-Mcat): this method will be moved into `PretrainedConfig` later + + Args: + file_name (str): the name of target file + + Returns: + dict[str, str]: the dict info of pretrained + """ + return { + model_name: f"https://paddlenlp.bj.bcebos.com/models/community/{model_name}/{file_name}" + for model_name in model_names + } + + +BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "bigscience/bloom", + "bigscience/bloom-560m", + "bigscience/bloom-1b1", + "bigscience/bloom-1b3", + "bigscience/bloom-1b7", + "bigscience/bloom-3b", + "bigscience/bloom-7b1", + "bigscience/bloomz", + "bigscience/bloomz-mt", + "bigscience/bloomz-560m", + "bigscience/bloomz-1b1", + "bigscience/bloomz-1b3", + "bigscience/bloomz-1b7", + "bigscience/bloomz-3b", + "bigscience/bloomz-7b1", +] + +BLOOM_PRETRAINED_INIT_CONFIGURATION = _construct_resource_file_url(BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST, "config.json") + + +BLOOM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": _construct_resource_file_url(BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST, "model_state.pdparams") +} + + +class BloomConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BloomModel`]. It is used to + instantiate a BLOOM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BLOOM + bigscience/bloom-560m architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import BloomModel, BloomConfig + + >>> # Initializing a BLOOM bigscience/bloom-560m style configuration + >>> configuration = BloomConfig() + + >>> # Initializing a model from the bigscience/bloom-560m style configuration + >>> model = BloomModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "bloom" + attribute_map: Dict[str, str] = {} # noqa: F811 + attribute_map = {"num_attention_heads": "n_head", "n_embed": "hidden_size"} + + pretrained_init_configuration = BLOOM_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=250880, + hidden_size=64, + n_layer=2, + n_head=8, + masked_softmax_fusion=True, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=False, + bos_token_id=1, + eos_token_id=2, + pad_token_id=3, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + attention_softmax_in_fp32=True, + pretraining_tp=1, # TP rank used when training with megatron + slow_but_exact=False, + long_sequence_strategy_type=None, + long_sequence_strategy_name=None, + long_sequence_init_args=None, + use_long_sequence_strategies=False, + **kwargs, + ): + + self.n_head = n_head + self.hidden_size = hidden_size + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.n_layer = n_layer + self.masked_softmax_fusion = masked_softmax_fusion + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.pretraining_tp = pretraining_tp + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.slow_but_exact = slow_but_exact + + self.long_sequence_strategy_type = long_sequence_strategy_type + self.long_sequence_strategy_name = long_sequence_strategy_name + self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args + self.use_long_sequence_strategies = use_long_sequence_strategies diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/modeling.py new file mode 100644 index 000000000..f18b88f40 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/modeling.py @@ -0,0 +1,1907 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. team and BigScience workshop. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Paddle BLOOM model.""" +from __future__ import annotations + +import math +from functools import partial +from typing import Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.autograd import PyLayer +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute + +from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from paddlenlp.transformers.model_utils import PretrainedModel +from paddlenlp.utils.converter import StateDictNameMapping, init_name_mappings +from paddlenlp.utils.log import logger + +from .configuration import BloomConfig +from .processor import ( + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + LogitsProcessorList, + RepetitionPenaltyLogitsProcessor, +) + +__all__ = [ + "BloomModel", + "BloomForPretraining", + "BloomForCausalLM", + "BloomForSequenceClassification", + "BloomForTokenClassification", + "BloomForGeneration", +] + + +def parallel_matmul(x: Tensor, y: Tensor, parallel_output=True): + is_fleet_init = True + world_size = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + world_size = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + if is_fleet_init and world_size > 1: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=True) + if parallel_output: + return logits + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + else: + logits = paddle.matmul(x, y, transpose_y=True) + return logits + + +def split_tensor_along_last_dim(tensor: Tensor, num_partitions: int, contiguous_split_chunks: bool = False): + """Split a tensor along its last dimension -> query/key/value layer + Args: + tensor: ([`paddle.Tensor`], *required*): + input tensor to split + num_partitions ([`int`], *required*): + number of partitions to split the tensor + contiguous_split_chunks ([`bool`], *optional*, default=`False`):: + If True, make each chunk contiguous in memory. + """ + return paddle.split(tensor, 3, axis=-1) + + +def _make_causal_mask(input_ids_shape, past_key_values_length: int) -> Tensor: + """ + Make causal mask used for self-attention. + """ + batch_size, target_length = input_ids_shape + mask = paddle.ones((target_length, target_length + past_key_values_length), dtype="bool") + # ONNX doesn't support `Tensor.triu` properly, thus we use this workaround + seq_ids = paddle.arange(target_length) + mask[:, past_key_values_length:] = seq_ids[:, None] >= seq_ids[None, :] + + expanded_mask = mask.unsqueeze(axis=[0, 1]).expand( + [batch_size, 1, target_length, target_length + past_key_values_length] + ) + return expanded_mask + + +def _expand_2d_mask(mask: Tensor, tgt_length: int) -> Tensor: + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
+ """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask.stop_gradient = True + return mask.unsqueeze(axis=[1, 2]).expand([batch_size, 1, tgt_length, src_length]) + + +def build_alibi_tensor(attention_mask: Tensor, num_heads: int, dtype) -> Tensor: + """ + Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). + num_heads (`int`, *required*): + number of heads + dtype (`paddle.dtype`, *optional*, default=`paddle.bfloat16`): + dtype of the output tensor + """ + # _, seq_length = attention_mask.shape[0], attention_mask.shape[-1] + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + base = paddle.full([], 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=paddle.float32) + powers = paddle.arange(1, 1 + closest_power_of_2, dtype=paddle.float32) + slopes = paddle.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = paddle.to_tensor(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=paddle.float32) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = paddle.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=paddle.float32) + slopes = paddle.concat([slopes, paddle.pow(extra_base, extra_powers)], axis=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ( + (attention_mask.astype(paddle.float32).cumsum(axis=-1) - 1) * attention_mask.astype(paddle.float32) + )[:, None, :] + alibi = slopes[..., None] * arange_tensor + # return alibi + return paddle.cast(alibi, dtype) + # return paddle.cast(alibi.reshape([batch_size * num_heads, 1, seq_length]), dtype) + + +def dropout_add(x: Tensor, residual: Tensor, prob: float, training: bool) -> Tensor: + """ + Dropout add function + + Args: + x (`paddle.tensor`, *required*): + input tensor + residual (`paddle.tensor`, *required*): + esidual tensor + prob (`float`, *required*): + dropout probability + training (`bool`, *required*): + training mode + """ + out = F.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def pre_process_alibi_for_pad(alibi, attention_mask, num_heads): + """ + Args: + Pre-process the alibi tensor for padding. 
+        alibi: ([`paddle.tensor`], *required*):
+            alibi tensor to pre-process
+        attention_mask: ([`paddle.tensor`], *required*):
+            attention mask to pre-process"""
+
+    # Sanity check that we are not inferring fewer tokens than the total sequence length
+    # This usually happens when the inference is done with past_key_values
+    # In this case we re-create the alibi tensor with the correct sequence length
+    if attention_mask.shape[-1] != alibi.shape[-1]:
+        alibi = build_alibi_tensor(attention_mask, num_heads, alibi.dtype).repeat_interleave(
+            attention_mask.shape[0], axis=0
+        )
+    # Get the indexes of the padding tokens
+    index_x0, index_y0 = paddle.where(attention_mask == 0.0)
+    index_x1, index_y1 = paddle.where(attention_mask == 1.0)
+
+    # Clone the embeddings - we can detach because the embeddings are not learned
+    # Get a reference tensor
+    slice_reference_alibi = build_alibi_tensor(attention_mask, num_heads, alibi.dtype)
+
+    # Loop over the batch where the padding is and replace the alibi tensor by the reference tensor
+    # Only where you do not have padding. Replace padding tokens by zeros
+    # This operation can be seen as a shifting operation.
+    for i, index in enumerate(paddle.unique(index_x0)):
+        slice_to_modify = paddle.zeros_like(slice_reference_alibi)
+        index_shift = index_y1[index_x1 == index]
+        shift_value = len(index_shift)
+        slice_to_modify[:, :, index_shift] = slice_reference_alibi[:, :, :shift_value]
+        alibi[index * num_heads : (index + 1) * num_heads] = slice_to_modify
+    return alibi
+
+
+def bloom_gelu_forward(x):
+    """
+    Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to
+    make the model jitable.
+
+    Args:
+        x (`paddle.tensor`, *required*):
+            input hidden states
+    """
+    return x * 0.5 * (1.0 + paddle.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
+
+
+def bloom_gelu_back(g, x):
+    """
+    gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1.
+ paddle.erf(x * 0.70710678)) + + 0.3989423 * x * paddle.exp(-0.5 * x * x) + + Args: + g (`paddle.tensor`, *required*): + gradient output tensor + x (`paddle.tensor`, *required*): + input tensor + """ + x = x[0] # x is a tuple of 1 element, needs to unpack it first + tanh_out = paddle.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff * g + + +def baddbmm(input, batch1, batch2, beta=1.0, alpha=1.0): + return beta * input + alpha * paddle.matmul(batch1, batch2) + + +class GeLUFunction(PyLayer): + @staticmethod + def forward(ctx, input): + ctx.save_for_backward(input) + return bloom_gelu_forward(input) + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors + return bloom_gelu_back(grad_output, input) + + +class BloomGelu(nn.Layer): + """ + BloomBiasGelu wrapper function that make use of the simple function on inference mode to make the model + paddlescriptable and use the autograd function in training mode to get the accurate results of the gradients Partly + copied from Megatron-DeepSpeed code and adapted for our needs + + See here why autograd functions are not paddlescriptable: https://github.com/pypaddle/pypaddle/issues/22329 + + """ + + def __init__(self): + super().__init__() + + def forward(self, x): + return bloom_gelu_forward(x) + # if self.training and in_dygraph_mode(): + # return GeLUFunction.apply(x) + # else: + # return bloom_gelu_forward(x) + + +class BloomAttention(nn.Layer): + def __init__(self, config, layer_number=None): + super().__init__() + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + + self.hidden_size = config.hidden_size + self.num_heads = config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + self.config = config + + if config.tensor_parallel_degree > 1: + assert self.num_heads % config.tensor_parallel_degree == 0 + self.num_heads = self.num_heads // config.tensor_parallel_degree + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = 1.0 + + if config.tensor_parallel_degree > 1: + self.query_key_value = fleet.meta_parallel.ColumnParallelLinear( + self.hidden_size, 3 * self.hidden_size, has_bias=True, gather_output=False + ) + else: + self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias_attr=True) + + if config.tensor_parallel_degree > 1: + self.dense = fleet.meta_parallel.RowParallelLinear( + self.hidden_size, self.hidden_size, has_bias=True, input_is_parallel=True + ) + else: + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def _split_heads(self, fused_qkv: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + """ + Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory + storage as `fused_qkv` + + Args: + fused_qkv (`paddle.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] + value: [batch_size, seq_length, num_heads, head_dim] + """ + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.reshape([batch_size, seq_length, self.num_heads, 3, self.head_dim]) + return 
fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + + def _merge_heads(self, x: Tensor) -> Tensor: + """ + Merge heads together over the last dimenstion + + Args: + x: (`paddle.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] + + Returns: + paddle.tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // self.num_heads + + # First view to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.reshape([batch_size, self.num_heads, seq_length, self.head_dim]) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.transpose([0, 2, 1, 3]) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape([batch_size, seq_length, self.num_heads * self.head_dim]) + + def forward( + self, + hidden_states: Tensor, + residual: Tensor, + alibi: Tensor, + attention_mask: Tensor, + layer_past: Optional[Tuple[Tensor, Tensor]] = None, + head_mask: Optional[Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + ): + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, q_length, _, _ = query_layer.shape + + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension: + # - key: [batch_size, kv_length, self.num_heads, head_dim] + # - value: [batch_size, kv_length, self.num_heads, head_dim] + key_layer = paddle.concat((past_key, key_layer), axis=1) + value_layer = paddle.concat((past_value, value_layer), axis=1) + + if use_cache is True: + present = (key_layer, value_layer) + else: + present = None + + version = paddle.version.full_version + version_check = True + if self.config.use_flash_attention and version != "0.0.0" and version <= "2.5.2": + logger.warning( + "PaddlePaddle version 2.5.3 or higher is required, please upgrade your PaddlePaddle to 2.5.3 or other higher version." 
+ ) + version_check = False + if self.config.use_flash_attention and version_check: + query_states, key_states, value_states = query_layer, key_layer, value_layer + + attention_mask = attention_mask.cast(alibi.dtype) + alibi + attention_mask = attention_mask.reshape( + [query_states.shape[0], -1, attention_mask.shape[-2], attention_mask.shape[-1]] + ) + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.config.attention_dropout, + training=self.training, + is_causal=False, + ) + attn_weights = None + # [batch_size, seq_len, num_heads, head_dim] = > [batch_size, seq_len, hidden_size] + attn_output = attn_output.reshape([attn_output.shape[0], attn_output.shape[1], -1]) + output_tensor = self.dense(attn_output) + + else: + query_layer = query_layer.transpose([0, 2, 1, 3]) + key_layer = key_layer.transpose([0, 2, 3, 1]) + value_layer = value_layer.transpose([0, 2, 1, 3]) + _, _, _, kv_length = key_layer.shape + + query_layer = query_layer.reshape([batch_size * self.num_heads, q_length, self.head_dim]) + key_layer = key_layer.reshape([batch_size * self.num_heads, self.head_dim, kv_length]) + value_layer = value_layer.reshape([batch_size * self.num_heads, kv_length, self.head_dim]) + + # [batch_size * num_heads, q_length, kv_length] + # alibi:[batch_size * num_heads, q_length, kv_length] + # we use `Tensor.baddbmm` instead of `paddle.baddbmm` as the latter isn't supported by TorchScript v1.11 + attention_scores = baddbmm( + alibi, batch1=query_layer, batch2=key_layer, beta=self.beta, alpha=self.inv_norm_factor + ) + # change view to [batch_size, num_heads, q_length, kv_length] + # attention_scores = matmul_result.reshape([batch_size, self.num_heads, q_length, kv_length]) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = query_layer.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype != paddle.float32: + attention_scores = paddle.cast(attention_scores, paddle.float32) + attn_weights = attention_scores + attention_mask + attention_probs = paddle.cast( + F.softmax(attn_weights, axis=-1, dtype=paddle.float32), dtype=input_dtype + ) + else: + attn_weights = attention_scores + attention_mask + attention_probs = F.softmax(attn_weights, axis=-1) + + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # change view [batch_size x num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.reshape([batch_size * self.num_heads, q_length, kv_length]) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = paddle.matmul(attention_probs_reshaped, value_layer) + + # change view [batch_size, num_heads, q_length, head_dim] + context_layer = self._merge_heads(context_layer) + + # aggregate results across tp ranks. 
See here: https://github.com/pypaddle/pypaddle/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = paddle.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + F.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) + + output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) + + outputs = (output_tensor, present) + if output_attentions: + # output attentions should be: [batch_size, self.num_heads, q_length, kv_length] + attention_probs = attention_probs.reshape([batch_size, self.num_heads, q_length, kv_length]) + outputs += (attention_probs,) + + return outputs + + +class BloomMLP(nn.Layer): + def __init__(self, config): + super().__init__() + hidden_size = config.hidden_size + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + if config.tensor_parallel_degree > 1: + self.dense_h_to_4h = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, 4 * hidden_size, gather_output=False, has_bias=True + ) + + self.dense_4h_to_h = fleet.meta_parallel.RowParallelLinear( + 4 * hidden_size, hidden_size, input_is_parallel=True, has_bias=True + ) + + else: + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size) + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size) + self.hidden_dropout = config.hidden_dropout + self.gelu_impl = BloomGelu() + + def forward(self, hidden_states, residual): + hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states)) + + if self.pretraining_tp > 1 and self.slow_but_exact: + intermediate_output = paddle.zeros_like(residual) + slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp + for i in range(self.pretraining_tp): + intermediate_output = intermediate_output + nn.functional.linear( + hidden_states[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + intermediate_output = self.dense_4h_to_h(hidden_states) + + output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training) + + return output + + +class BloomBlock(nn.Layer): + def __init__(self, config, layer_number=None): + super().__init__() + hidden_size = config.hidden_size + + self.input_layernorm = nn.LayerNorm(hidden_size, epsilon=config.layer_norm_epsilon) + self.n_head = config.n_head + self.self_attention = BloomAttention(config, layer_number=layer_number) + self.post_attention_layernorm = nn.LayerNorm(hidden_size, epsilon=config.layer_norm_epsilon) + + self.mlp = BloomMLP(config) + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.hidden_dropout = config.hidden_dropout + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + head_mask=None, + use_cache=False, + output_attentions=False, + alibi=None, + ): + # hidden_states: [batch_size, seq_length, hidden_size] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Layer norm post the self attention. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # Self attention. 
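+        # The branch above picks which tensor feeds the residual path. Schematically (an
+        # illustrative summary of the code in this block, ignoring dropout):
+        #   apply_residual_connection_post_layernorm=False (config default):
+        #       attn_out  = x + Attn(LN1(x));          block_out = attn_out + MLP(LN2(attn_out))
+        #   apply_residual_connection_post_layernorm=True:
+        #       attn_out  = LN1(x) + Attn(LN1(x));     block_out = LN2(attn_out) + MLP(LN2(attn_out))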
+
+        attn_outputs = self.self_attention(
+            layernorm_output,
+            residual,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            alibi=alibi,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+
+        attention_output = attn_outputs[0]
+
+        outputs = attn_outputs[1:]
+
+        layernorm_output = self.post_attention_layernorm(attention_output)
+
+        # Get residual
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = attention_output
+
+        # MLP.
+        output = self.mlp(layernorm_output, residual)
+
+        if use_cache:
+            outputs = (output,) + outputs
+        else:
+            outputs = (output,) + outputs[1:]
+        return outputs  # hidden_states, present, attentions
+
+
+class BloomPreTrainedModel(PretrainedModel):
+    _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"]
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = BloomConfig
+    base_model_prefix = "bloom"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["BloomBlock"]
+
+    @classmethod
+    def _get_tensor_parallel_mappings(cls, config, is_split=True):
+
+        from paddlenlp.transformers.conversion_utils import split_or_merge_func
+
+        fn = split_or_merge_func(
+            is_split=is_split,
+            tensor_parallel_degree=config.tensor_parallel_degree,
+            tensor_parallel_rank=config.tensor_parallel_rank,
+            num_attention_heads=config.num_attention_heads,
+        )
+
+        def get_tensor_parallel_split_mappings(num_layers):
+            final_actions = {}
+            base_actions = {
+                # Column Linear
+                "h.0.self_attention.query_key_value.weight": partial(fn, is_column=True),
+                "h.0.self_attention.query_key_value.bias": partial(fn, is_column=True),
+                "h.0.mlp.dense_h_to_4h.bias": partial(fn, is_column=True),
+                "h.0.mlp.dense_h_to_4h.weight": partial(fn, is_column=True),
+                # Row Linear
+                "word_embeddings.weight": partial(fn, is_column=False),
+                "h.0.self_attention.dense.weight": partial(fn, is_column=False),
+                "h.0.mlp.dense_4h_to_h.weight": partial(fn, is_column=False),
+            }
+            for key, action in base_actions.items():
+                if "h.0." in key:
+                    for i in range(num_layers):
+                        final_actions[key.replace("h.0.", f"h.{i}.")] = action
+                final_actions[key] = action
+            return final_actions
+
+        mappings = get_tensor_parallel_split_mappings(config.n_layer)
+
+        return mappings
+
+    def _init_weights(self, layer):
+        """Initialize the weights."""
+        if isinstance(layer, (nn.Linear, nn.Embedding)):
+            layer.weight.set_value(
+                paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=layer.weight.shape)
+            )
+            if getattr(layer, "bias", None) is not None:
+                # zero-initialize the bias while keeping the normal-initialized weight
+                layer.bias.set_value(paddle.zeros(shape=layer.bias.shape, dtype=paddle.get_default_dtype()))
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, BloomModel):
+            module.gradient_checkpointing = value
+
+    @staticmethod
+    def _convert_to_bloom_cache(past_key_value: Tuple[Tuple[Tensor, Tensor]]) -> Tuple[Tuple[Tensor, Tensor]]:
+        """
+        Converts the cache to the format expected by Bloom, i.e.
to tuple(tuple([batch_size * num_heads, ...])) + """ + batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape + batch_size_times_num_heads = batch_size * num_heads + # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] + # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].reshape([batch_size_times_num_heads, head_dim, seq_length]), + layer_past[1].reshape([batch_size_times_num_heads, seq_length, head_dim]), + ) + for layer_past in past_key_value + ) + + @staticmethod + def _convert_to_standard_cache( + past_key_value: Tuple[Tuple[Tensor, Tensor]], batch_size: int + ) -> Tuple[Tuple[Tensor, Tensor]]: + """ + Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, + num_heads, ...])) + """ + batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape + num_heads = batch_size_times_num_heads // batch_size + # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] + # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].reshape([batch_size, num_heads, head_dim, seq_length]), + layer_past[1].reshape([batch_size, num_heads, seq_length, head_dim]), + ) + for layer_past in past_key_value + ) + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + + head_mask = paddle.cast(head_mask, dtype=self.dtype) + return head_mask + + def get_head_mask( + self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. 
+ """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + @classmethod + def _get_name_mappings(cls, config: BloomConfig) -> list[StateDictNameMapping]: + hard_mapping = [ + "word_embeddings.weight", + "word_embeddings_layernorm.weight", + "word_embeddings_layernorm.bias", + "ln_f.weight", + "ln_f.bias", + ] + for i in range(config.n_layer): + hard_mapping.extend( + [ + f"h.{i}.input_layernorm.weight", + f"h.{i}.input_layernorm.bias", + [ + f"h.{i}.self_attention.query_key_value.weight", + None, + "transpose", + ], + f"h.{i}.self_attention.query_key_value.bias", + [f"h.{i}.self_attention.dense.weight", None, "transpose"], + f"h.{i}.self_attention.dense.bias", + f"h.{i}.post_attention_layernorm.weight", + f"h.{i}.post_attention_layernorm.bias", + [f"h.{i}.mlp.dense_h_to_4h.weight", None, "transpose"], + [f"h.{i}.mlp.dense_4h_to_h.weight", None, "transpose"], + f"h.{i}.mlp.dense_h_to_4h.bias", + f"h.{i}.mlp.dense_4h_to_h.bias", + ] + ) + + init_name_mappings(hard_mapping) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(hard_mapping)] + model_class_name = config.architectures[0] + + if model_class_name != "BloomModel": + for mapping in mappings: + mapping.source_name = "transformer." + mapping.source_name + mapping.target_name = "bloom." + mapping.target_name + + if model_class_name == "BloomForSequenceClassification": + mappings.append(StateDictNameMapping("score.weight", None, "transpose")) + if model_class_name == "BloomForTokenClassification": + mappings.append(StateDictNameMapping("classifier.weight", None, "transpose")) + mappings.append(StateDictNameMapping("classifier.bias")) + + return mappings + + +class BloomModel(BloomPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.padding_idx = 0 + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.config = config + self.embed_dim = config.hidden_size + self.n_head = config.n_head + + # Embedding + LN Embedding + # self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + if config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) + else: + self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + + self.word_embeddings_layernorm = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + + # Transformer blocks + self.h = nn.LayerList([BloomBlock(config, layer_number=i) for i in range(config.n_layer)]) + + # Final Layer Norm + self.ln_f = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.word_embeddings + + def _prepare_attn_mask( + self, attention_mask: Tensor, input_shape: Tuple[int, int], past_key_values_length: int, num_heads: int + ) -> Tensor: + # create causal mask + # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] + combined_attention_mask = None + _, src_length = input_shape + + if src_length > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + + # [batch_size, seq_length] -> [batch_size, 1, 
tgt_length, src_length] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, tgt_length=src_length) + elif len(attention_mask.shape) == 3: + # [batch_size,tgt_length, src_length] -> [batch_size, 1, tgt_length, src_length] + expanded_attn_mask = attention_mask.unsqueeze(1) + elif len(attention_mask.shape) == 4: + expanded_attn_mask = attention_mask + + if combined_attention_mask is not None: + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + + mask_shape = expanded_attn_mask.shape + expanded_attn_mask = expanded_attn_mask.expand([mask_shape[0], num_heads, mask_shape[2], mask_shape[3]]) + # Attention score will be cast to float32 in the following calculation, therefore we set attention_mask dtype as float32 + zero = paddle.zeros(expanded_attn_mask.shape, dtype=paddle.float32) + neg_inf = paddle.full(expanded_attn_mask.shape, paddle.finfo(paddle.float32).min, dtype=paddle.float32) + expanded_attn_mask = paddle.where(expanded_attn_mask, zero, neg_inf) + batch_size, num_heads, sq_len, kv_len = expanded_attn_mask.shape + return expanded_attn_mask.reshape([batch_size * num_heads, sq_len, kv_len]) + + def set_input_embeddings(self, new_embeddings: Tensor): + self.word_embeddings = new_embeddings + + @paddle.jit.not_to_static + def recompute_training( + self, block, hidden_states, layer_past, attention_mask, head_mask, use_cache, output_attentions, alibi + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(block), + hidden_states, + layer_past, + attention_mask, + head_mask, + use_cache, + output_attentions, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ) -> Union[Tuple[Tensor], BaseModelOutputWithPastAndCrossAttentions]: + + past_key_values = kwargs.get("cache", past_key_values) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.h)) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + presents = () if use_cache else None + all_self_attentions = () if 
output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # Compute alibi tensor: check build_alibi_tensor documentation + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values[0] is not None: + past_key_values_length = past_key_values[0][0].shape[1] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if attention_mask is None: + attention_mask = paddle.ones([batch_size, seq_length_with_past], dtype="bool") + elif attention_mask.dtype != paddle.bool: + attention_mask = paddle.cast(attention_mask, "bool") + if len(attention_mask.shape) > 2: + _attention_mask = paddle.ones([batch_size, seq_length_with_past], dtype="bool") + if self.config.use_long_sequence_strategies: + alibi_layer = LongSequenceStrategies.build_long_sequence_strategy( + self.config.long_sequence_strategy_type, + self.config.long_sequence_strategy_name, + **self.config.long_sequence_init_args, + ) + alibi = alibi_layer(_attention_mask, self.config.n_head, dtype=hidden_states.dtype) + alibi = paddle.squeeze(alibi) + else: + alibi = build_alibi_tensor(_attention_mask, self.config.n_head, dtype=hidden_states.dtype) + else: + if self.config.use_long_sequence_strategies: + alibi_layer = LongSequenceStrategies.build_long_sequence_strategy( + self.config.long_sequence_strategy_type, + self.config.long_sequence_strategy_name, + **self.config.long_sequence_init_args, + ) + alibi = alibi_layer(attention_mask, self.config.n_head, dtype=hidden_states.dtype) + alibi = paddle.squeeze(alibi) + else: + alibi = build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype) + if self.config.tensor_parallel_degree > 1: + block_size = self.config.n_head // self.config.tensor_parallel_degree + alibi = alibi[ + :, self.config.tensor_parallel_rank * block_size : (self.config.tensor_parallel_rank + 1) * block_size + ] + alibi = alibi.reshape([batch_size * block_size, 1, seq_length_with_past]) + causal_mask = self._prepare_attn_mask( + attention_mask, + input_shape=(batch_size, seq_length), + past_key_values_length=past_key_values_length, + num_heads=block_size, + ) + else: + alibi = alibi.reshape([batch_size * self.config.n_head, 1, seq_length_with_past]) + causal_mask = self._prepare_attn_mask( + attention_mask, + input_shape=(batch_size, seq_length), + past_key_values_length=past_key_values_length, + num_heads=self.config.n_head, + ) + + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + has_gradient = not hidden_states.stop_gradient + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.enable_recompute and has_gradient: + outputs = self.recompute_training( + block, + hidden_states, + layer_past=layer_past, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, 
all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class BloomLMHead(nn.Layer): + def __init__(self, config, embedding_weights=None): + super(BloomLMHead, self).__init__() + self.decoder_weight = ( + self.create_parameter(shape=[config.vocab_size, config.hidden_size], dtype=paddle.get_default_dtype()) + if embedding_weights is None + else embedding_weights + ) + self.config = config + + def forward(self, hidden_states, parallel_output): + logits = parallel_matmul(hidden_states, self.decoder_weight, parallel_output=parallel_output) + return logits + + +class BloomPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for GPT. + It calculates the final loss. + """ + + def __init__(self, ignore_index=-100, tensor_parallel_degree=1, tensor_parallel_output=False): + super(BloomPretrainingCriterion, self).__init__() + if tensor_parallel_degree > 1 and tensor_parallel_output: + self.loss_func = fleet.meta_parallel.ParallelCrossEntropy() + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") + self.ignore_index = ignore_index + + def forward(self, prediction_scores, masked_lm_labels, loss_mask=None): + masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) + with paddle.amp.auto_cast(False): + masked_lm_loss = masked_lm_loss.astype("float32") + if loss_mask is not None: + loss_mask = loss_mask.reshape([-1]) + masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) + loss = masked_lm_loss / loss_mask.sum() + else: + masked_lm_loss = masked_lm_loss[masked_lm_labels != self.ignore_index] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class BloomForPretraining(BloomPreTrainedModel): + """ + The pretraining model of Bloom. + It returns some logits and cached_kvs. 
+ """ + + _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.bloom = BloomModel(config) + self.criterion = BloomPretrainingCriterion(tensor_parallel_degree=config.tensor_parallel_degree) + self.extra_parameters = [self.bloom.word_embeddings.weight] + + def forward( + self, + input_ids, + labels=None, + loss_mask=None, + attention_mask=None, + use_cache=False, + cache=None, + ): + outputs = self.bloom(input_ids, attention_mask=attention_mask, use_cache=use_cache, cache=cache) + if use_cache: + encoder_outputs, cached_kvs = outputs[:2] + else: + encoder_outputs = outputs + + logits = parallel_matmul( + encoder_outputs[0], + self.bloom.word_embeddings.weight, + parallel_output=False, + ) + if labels is None: + return logits + + loss = self.criterion(logits, labels, loss_mask) + return loss, logits + + +class BloomForCausalLM(BloomPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.decoder_weight"] + _keys_to_ignore_on_save = [r"lm_head.decoder_weight"] + _tied_weights_keys = ["lm_head.decoder_weight"] + + def __init__(self, config): + super().__init__(config) + self.bloom = BloomModel(config) + self.lm_head = BloomLMHead(config, self.bloom.word_embeddings.weight) + self.criterion = BloomPretrainingCriterion( + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_output=config.tensor_parallel_output, + ) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple): + model_kwargs["cache"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["cache"] = outputs.past_key_values + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.concat([token_type_ids, token_type_ids[:, -1:]], axis=-1) + + if not is_encoder_decoder: + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + if len(attention_mask.shape) == 2: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], + axis=-1, + ) + elif len(attention_mask.shape) == 4: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([*attention_mask.shape[:3], 1], dtype=attention_mask.dtype)], + axis=-1, + )[:, :, -1:, :] + # update role_ids + if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: + role_ids = model_kwargs["role_ids"] + model_kwargs["role_ids"] = paddle.concat([role_ids, role_ids[:, -1:]], axis=-1) + + return model_kwargs + + def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + attention_mask = kwargs.get("attention_mask", None) + if cache is not None: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + + return {"input_ids": input_ids, "attention_mask": attention_mask, "cache": cache, "use_cache": True} + + # TODO(wawltor) attention_mask is not need + @staticmethod + def 
prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + attention_mask = paddle.ones_like(input_ids, dtype="bool") + attention_mask = (input_ids != pad_token_id).astype("bool") + return attention_mask + + def forward( + self, + input_ids=None, + cache=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ) -> Union[Tuple[Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + transformer_outputs = self.bloom( + input_ids, + past_key_values=cache, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + lm_logits = self.lm_head(hidden_states, self.config.tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(lm_logits, labels) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple[Tensor]], beam_idx: Tensor) -> Tuple[Tuple[Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + return tuple(tuple(past_state.index_select(0, beam_idx) for past_state in layer_past) for layer_past in past) + + +class BloomForSequenceClassification(BloomPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bloom = BloomModel(config) + self.score = nn.Linear(config.hidden_size, config.num_labels, bias_attr=False) + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ) -> Union[Tuple[Tensor], SequenceClassifierOutputWithPast]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.bloom( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + sequence_length = input_ids.shape[1] + else: + batch_size = inputs_embeds.shape[0] + sequence_length = inputs_embeds.shape[1] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + + if self.config.pad_token_id is None: + pooled_logits = logits[:, -1] + else: + if input_ids is not None: + # select the last word of batch sentence + sequence_lengths = paddle.where(input_ids != self.config.pad_token_id, 1, 0).sum(axis=-1) - 1 + sequence_lengths += paddle.to_tensor([i * input_ids.shape[1] for i in range(batch_size)]) + pooled_logits = paddle.index_select( + logits.reshape([batch_size * sequence_length, -1]), sequence_lengths, axis=0 + ) + + else: + pooled_logits = logits[:, -1] + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and labels.dtype == paddle.int64: + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(pooled_logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + elif self.config.problem_type == "multi_label_classification": + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class BloomForTokenClassification(BloomPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bloom = BloomModel(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + 
input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ) -> Union[Tuple[Tensor], TokenClassifierOutput]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.bloom( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class BloomForGeneration(BloomPreTrainedModel): + """ + Bloom Model with pretraining tasks on top. + + Args: + bloom (:class:`BloomModel`): + An instance of :class:`BloomModel`. 
+ + """ + + def __init__(self, config: BloomConfig): + # when running generation, it must be True + config.use_cache = True + + super(BloomForGeneration, self).__init__(config) + self.bloom = BloomModel(config) + self.config = config + + self.max_length = self.config.get("max_dec_len", 20) + self.min_length = self.config.get("min_dec_len", 0) + self.decode_strategy = self.config.get("decode_strategy", "sampling") + self.temperature = self.config.get("temperature", 1.0) + self.top_k = self.config.get("top_k", 0) + self.top_p = self.config.get("top_p", 1.0) + self.use_topp_sampling = self.config.get("use_topp_sampling", False) + self.inference = self.config.get("inference", False) + self.repetition_penalty = self.config.get("repetition_penalty", 1.0) + self.num_beams = self.config.get("num_beams", 1) + self.num_beam_groups = self.config.get("num_beam_groups", 1) + self.length_penalty = self.config.get("length_penalty", 0.0) + self.early_stopping = self.config.get("early_stopping", False) + self.bos_token_id = self.config.get("bos_token_id", None) + self.eos_token_id = self.config.get("eos_token_id", None) + self.pad_token_id = self.config.get("pad_token_id", None) + self.decoder_start_token_id = self.config.get("decoder_start_token_id", None) + self.forced_bos_token_id = self.config.get("forced_bos_token_id", None) + self.forced_eos_token_id = self.config.get("forced_eos_token_id", None) + self.num_return_sequences = self.config.get("num_return_sequences", 1) + self.diversity_rate = self.config.get("diversity_rate", 0.0) + self.use_cache = self.config.get("use_cache", True) + + def prepare_input_ids_for_generation(self, bos_token_id, encoder_output=None): + batch_size = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + batch_size = encoder_output.shape[0] + return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id + + def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("bool") + else: + attention_mask = paddle.ones_like(input_ids, dtype="bool") + return attention_mask + + def update_scores_for_generation(self, scores, next_scores, length, unfinished_flag): + # update scores + + unfinished_scores = (scores * length + next_scores) / (length + 1) + scores = paddle.where(unfinished_flag, unfinished_scores, scores) + return scores + + def get_logits_processor( + self, + min_length=None, + max_length=None, + eos_token_id=None, + forced_bos_token_id=None, + forced_eos_token_id=None, + num_beams=1, + num_beam_groups=1, + diversity_rate=0.0, + repetition_penalty=None, + ): + processors = LogitsProcessorList() + + # if min_length is not None and eos_token_id is not None and min_length > -1: + # processors.append( + # MinLengthLogitsProcessor(min_length, eos_token_id)) + + if num_beam_groups > 1 and diversity_rate > 0.0: + processors.append( + HammingDiversityLogitsProcessor( + diversity_rate=diversity_rate, num_beams=num_beams, num_beam_groups=num_beam_groups + ) + ) + if repetition_penalty is not None and repetition_penalty != 1.0: + 
processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if forced_bos_token_id is not None: + processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) + if forced_eos_token_id is not None: + processors.append(ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) + # TODO + # Add more pre_processing for distribution + + return processors + + def expand_inputs_for_generation(self, input_ids, expand_size, attention_mask=None, **model_kwargs): + + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + + input_ids = paddle.gather(input_ids, index) + + if attention_mask is not None: + model_kwargs["attention_mask"] = paddle.gather(attention_mask, index) + + if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.gather(token_type_ids, index) + + if "seq_len" in model_kwargs and model_kwargs["seq_len"] is not None: + seq_len = model_kwargs["seq_len"] + model_kwargs["seq_len"] = paddle.gather(seq_len, index) + + if "encoder_output" in model_kwargs and model_kwargs["encoder_output"] is not None: + encoder_output = model_kwargs["encoder_output"] + model_kwargs["encoder_output"] = paddle.gather(encoder_output, index) + + if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: + role_ids = model_kwargs["role_ids"] + model_kwargs["role_ids"] = paddle.gather(role_ids, index) + + return input_ids, model_kwargs + + def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + attention_mask = kwargs.get("attention_mask", None) + return {"input_ids": input_ids, "attention_mask": attention_mask, "cache": cache} + + def update_model_kwargs_for_generation(self, next_tokens, outputs, model_kwargs, is_encoder_decoder=False): + # Update the model inputs during generation. + # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` + # and they contain pad value, the result vectors updated by this method + # may be different from expected. In this case, you need to rewrite the + # method. 
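+        # Worked shape example (illustrative, batch_size=2, prompt length 5): after one decoding
+        # step `attention_mask` grows from [2, 5] to [2, 6] via the appended column of ones,
+        # `res` grows from [2, 5] to [2, 6] by concatenating `next_tokens`, and `cache` is taken
+        # from `outputs[1]`, so the next `_forward_` call only embeds the newly sampled token.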
+ + # update cache + if isinstance(outputs, tuple): + model_kwargs["cache"] = outputs[1] + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.concat([token_type_ids, token_type_ids[:, -1:]], axis=-1) + + if not is_encoder_decoder: + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype="bool")], axis=-1 + ) + + # update role_ids + if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None: + role_ids = model_kwargs["role_ids"] + model_kwargs["role_ids"] = paddle.concat([role_ids, role_ids[:, -1:]], axis=-1) + + model_kwargs["res"] = paddle.concat([model_kwargs["res"], next_tokens], axis=1) + + return model_kwargs + + def sample( + self, + input_ids, + logits_processors, + max_length, + pad_token_id, + eos_token_id, + top_k=None, + top_p=None, + temperature=None, + min_tokens_to_keep=1, + **model_kwargs + ): + def TopKProcess(probs, top_k, min_tokens_to_keep): + top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) + # Remove all tokens with a probability less than the last token of the top-k + topk_probs, _ = paddle.topk(probs, k=top_k) + probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) + return probs + + def TopPProcess(probs, top_p, min_tokens_to_keep): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Set 'min_tokens_to_keep - 1' because the first token is kept + sorted_indices_to_remove[:, : min_tokens_to_keep - 1] = 0 + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() + sorted_indices_to_remove[:, 0] = 0 + + # Scatter sorted tensors to original indexing + sorted_indices = sorted_indices + paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten() + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + return probs + + batch_size, cur_len = input_ids.shape + + # used for compute on gpu, avoid memcpy D2H + cur_len_gpu = paddle.full([1], cur_len) + + origin_len = input_ids.shape[1] + # used for compute on gpu, avoid memcpy D2H + origin_len_gpu = paddle.full([1], origin_len) + + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) + + res = paddle.assign(input_ids) + model_kwargs["res"] = res + + # use_cache is immutable, we split it off other mutable kwargs. 
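+        # In other words: `use_cache` is carried in the separate `immutable` dict and re-passed to
+        # every `_forward_` call, while everything left in `model_kwargs` (attention_mask, cache,
+        # res, ...) is rebuilt each step by `update_model_kwargs_for_generation`.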
+ assert "use_cache" in model_kwargs + immutable = {"use_cache": model_kwargs["use_cache"]} + del model_kwargs["use_cache"] + + def _forward_(**args): + model_inputs = self.prepare_inputs_for_generation(input_ids, **args, **immutable) + return self.bloom(**model_inputs, **immutable) + + def _post_process_(outputs, input_ids, cur_len, origin_len, scores, unfinished_flag, model_kwargs): + + logits = outputs[0] if isinstance(outputs, tuple) else outputs + + # logits = paddle.matmul( + # logits, + # self.bloom.embeddings.word_embeddings.weight, + # transpose_y=True) + + # x_dims_mapping = [self.bloom.mesh.dp] + [ + # None for i in range(len(logits.shape) - 1) + # ] + # w_dims_mapping = [self.bloom.mesh.mp, None] + # matmul = auto.shard_op(paddle.matmul, self.bloom.mesh[-1], + # [x_dims_mapping, w_dims_mapping, None]) + + logits = paddle.matmul(logits, self.bloom.word_embeddings.weight, transpose_y=True) + + # [batch_size, vocab_size] + logits = logits[:, -1, :] + + # pre-process distribution + logits = logits_processors(input_ids, logits) + + # sample + origin_probs = F.softmax(logits) + if temperature is None or temperature == 1.0: + probs = paddle.assign(origin_probs) + origin_probs = paddle.log(origin_probs) + else: + origin_probs = paddle.log(origin_probs) + logits = logits / temperature + probs = F.softmax(logits) + if top_k is not None and top_k != 0: + probs = TopKProcess(probs, top_k, min_tokens_to_keep) + if top_p is not None and top_p < 1.0: + if self.use_topp_sampling: + try: + from ppfleetx_ops import topp_sampling + except ImportError: + raise ImportError( + "please install ppfleetx_ops by 'cd ppfleetx/ops && python setup_cuda.py install'!" + ) + top_ps_tensor = paddle.full(shape=[probs.shape[0]], fill_value=top_p, dtype=probs.dtype) + next_tokens = topp_sampling(probs, top_ps_tensor) + else: + probs = TopPProcess(probs, top_p, min_tokens_to_keep) + + if not self.use_topp_sampling: + # TODO(wj-Mcat): multinomial do not support fp16, so convert it to fp32 + # refer to: https://github.com/PaddlePaddle/Paddle/issues/51852 + next_tokens = paddle.multinomial(paddle.cast(probs, paddle.float32)) + # next_tokens = paddle.multinomial(probs) + + next_scores = paddle.index_sample(origin_probs, next_tokens) + + if eos_token_id is not None: + next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) + + scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) + + input_ids = next_tokens + + if eos_token_id is not None: + unfinished_flag = paddle.logical_and(unfinished_flag, next_tokens != eos_token_id) + + model_kwargs = self.update_model_kwargs_for_generation( + next_tokens, outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder + ) + + return input_ids, scores, unfinished_flag, model_kwargs + + # Note(GuoxiaWang):Pre-while call for inference, simulate a do while loop statement + # the value in model_kwargs should be tensor before while loop + outputs = _forward_(**model_kwargs) + + input_ids, scores, unfinished_flag, model_kwargs = _post_process_( + outputs, input_ids, cur_len_gpu, origin_len_gpu, scores, unfinished_flag, model_kwargs + ) + if not self.inference: + cur_len += 1 + else: + # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static + paddle.increment(cur_len) + paddle.increment(cur_len_gpu) + + attn_mask = model_kwargs["attention_mask"] + # make the shape of attention_mask = (-1, -1, -1, -1) in dy2static. 
+ model_kwargs["attention_mask"] = paddle.reshape(attn_mask, attn_mask.shape) + model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else None + max_length = paddle.to_tensor(max_length) + while cur_len < max_length: + # Note(GuoxiaWang): Remove outputs = _forward_(**model_kwargs) + # and change it to pass directly to _post_process_ to avoid + # closed-loop problem of dynamic-to-static model + input_ids, scores, unfinished_flag, model_kwargs = _post_process_( + _forward_(**model_kwargs), + input_ids, + cur_len_gpu, + origin_len_gpu, + scores, + unfinished_flag, + model_kwargs, + ) + if not self.inference: + cur_len += 1 + else: + # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static + paddle.increment(cur_len) + paddle.increment(cur_len_gpu) + + if not paddle.any(unfinished_flag): + break + + return model_kwargs["res"][:, origin_len:], scores + + def forward(self, input_ids=None, **model_kwargs): + + max_length = self.max_length + min_length = self.min_length + decode_strategy = self.decode_strategy + temperature = self.temperature + top_k = self.top_k + top_p = self.top_p + repetition_penalty = self.repetition_penalty + num_beams = self.num_beams + num_beam_groups = self.num_beam_groups + bos_token_id = self.bos_token_id + eos_token_id = self.eos_token_id + pad_token_id = self.pad_token_id + decoder_start_token_id = self.decoder_start_token_id + forced_bos_token_id = self.forced_bos_token_id + forced_eos_token_id = self.forced_eos_token_id + num_return_sequences = self.num_return_sequences + diversity_rate = self.diversity_rate + use_cache = self.use_cache + + assert decode_strategy in [ + "greedy_search", + "sampling", + "beam_search", + ], "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( + decode_strategy + ) + + bos_token_id = bos_token_id if bos_token_id is not None else getattr(self.config, "bos_token_id", None) + eos_token_id = eos_token_id if eos_token_id is not None else getattr(self.config, "eos_token_id", None) + pad_token_id = pad_token_id if pad_token_id is not None else getattr(self.config, "pad_token_id", None) + forced_bos_token_id = ( + forced_bos_token_id + if forced_bos_token_id is not None + else getattr(self.config, "forced_bos_token_id", None) + ) + forced_eos_token_id = ( + forced_eos_token_id + if forced_eos_token_id is not None + else getattr(self.config, "forced_eos_token_id", None) + ) + decoder_start_token_id = ( + decoder_start_token_id + if decoder_start_token_id is not None + else getattr(self.config, "decoder_start_token_id", None) + ) + + # params check + if input_ids is None: + # Init `input_ids` with bos_token_id + input_ids = self.prepare_input_ids_for_generation(bos_token_id) + + if model_kwargs.get("attention_mask", None) is None: + # Init `attention_mask` depending on `pad_token_id` + model_kwargs["attention_mask"] = self.prepare_attention_mask_for_generation( + input_ids, pad_token_id, eos_token_id + ) + + if model_kwargs.get("position_ids", None) is None: + model_kwargs["position_ids"] = paddle.arange( + 0, model_kwargs["attention_mask"].shape[-1], dtype=input_ids.dtype + ).unsqueeze(0) + + self.is_encoder_decoder = False + + model_kwargs["use_cache"] = use_cache + + if self.inference: + # Note(ZhenyuLi): Avoid the synchronization caused by scale in dy2static + min_len = int(input_ids.shape[-1]) + max_len = int(input_ids.shape[-1]) + paddle.increment(min_len, min_length) + paddle.increment(max_len, max_length) + else: + input_len = input_ids.shape[-1] + max_len = 
max_length + input_len + min_len = min_length + input_len + + logits_processors = self.get_logits_processor( + min_length=min_len, + max_length=max_len, + eos_token_id=eos_token_id, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + diversity_rate=diversity_rate, + repetition_penalty=repetition_penalty, + ) + + if decode_strategy == "sampling": + if num_return_sequences > 1: + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, expand_size=num_return_sequences, **model_kwargs + ) + + ret = self.sample( + input_ids, + logits_processors, + max_len, + pad_token_id, + eos_token_id, + top_k, + top_p, + temperature, + **model_kwargs, + ) + else: + raise ValueError(f"Not support {decode_strategy} strategy yet!") + return ret diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/processor.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/processor.py new file mode 100644 index 000000000..19558f620 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/processor.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import inspect +from abc import ABC + +import paddle + + +class LogitsProcessorList(list): + def __call__(self, input_ids, logits, **kwargs): + for processor in self: + processor_args = inspect.signature(processor.__call__).parameters + if len(processor_args) > 2: + assert all( + arg in kwargs for arg in list(processor_args.keys())[2:] + ), f"The parameters don't match for {processor.__class__}" + logits = processor(input_ids, logits, **kwargs) + else: + logits = processor(input_ids, logits) + return logits + + +class LogitsProcessor(ABC): + """ + Abstract base class for all logit processors that can be applied during + generation. + """ + + def __call__(self, input_ids, logits): + raise NotImplementedError( + f"{self.__class__} is an abstract class. " "Only classes inheriting this class can be called." + ) + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + Enforcing a min-length by setting EOS probability to 0. + Args: + min_length (int): The minimum length of generation sequence. + eos_token_id (int): The id of the `end-of-sequence` token. + """ + + def __init__(self, min_length, eos_token_id): + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids, logits): + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + logits[:, self.eos_token_id] = -float("inf") + return logits + + +class RepetitionPenaltyLogitsProcessor(LogitsProcessor): + r""" + Enforcing an exponential penalty on repeated sequences. + Args: + repetition_penalty (float): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. 
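+
+        Example:
+            A minimal usage sketch; the tensor values are illustrative only:
+
+            .. code-block::
+
+                import paddle
+
+                processor = RepetitionPenaltyLogitsProcessor(penalty=1.2)
+                input_ids = paddle.to_tensor([[5, 7]])  # previously generated token ids
+                logits = paddle.rand([1, 10])           # [batch_size, vocab_size]
+                # the logits of tokens 5 and 7 are divided by 1.2 (multiplied if negative)
+                logits = processor(input_ids, logits)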
+ """ + + def __init__(self, penalty: float): + if not isinstance(penalty, float) or not (penalty > 0): + raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") + + self.penalty = penalty + + def __call__(self, input_ids, logits): + score = paddle.index_sample(logits, input_ids) + score = paddle.where(score < 0, score * self.penalty, score / self.penalty) + input_ids = input_ids + paddle.arange(logits.shape[0]).unsqueeze(-1) * logits.shape[-1] + outputs = paddle.scatter(logits.flatten(), input_ids.flatten(), score.flatten()).reshape(logits.shape) + return outputs + + +class HammingDiversityLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces diverse beam search. Note that this logits + processor is only effective for `group_beam_search`. See + `this paper `__ for more details. + Args: + diversity_rate (float): This value is subtracted from a beam's score if + it generates a token same as any beam from other group at a particular + time. + num_beams (int): Number of beams used for group beam search. + num_beam_groups (int): Number of groups to divide `num_beams` into in order + to ensure diversity among different groups of beams. + """ + + def __init__(self, diversity_rate, num_beams, num_beam_groups): + if not isinstance(diversity_rate, float) or (not diversity_rate > 0.0): + raise ValueError("`diversity_rate` should be a float strictly larger than 0.") + self._diversity_rate = diversity_rate + if not isinstance(num_beams, int) or num_beams < 2: + raise ValueError("`num_beams` should be an integer strictly larger than 1.") + self._num_beams = num_beams + if not isinstance(num_beam_groups, int) or num_beam_groups < 2: + raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") + self._num_sub_beams = num_beams // num_beam_groups + + def __call__(self, input_ids, scores, current_tokens, beam_group_idx): + batch_size = current_tokens.shape[0] // self._num_beams + group_start_idx = beam_group_idx * self._num_sub_beams + group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) + group_size = group_end_idx - group_start_idx + vocab_size = scores.shape[-1] + + if group_start_idx == 0: + return scores + + for batch_idx in range(batch_size): + previous_group_tokens = current_tokens[ + batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx + ] + token_frequency = paddle.bincount(previous_group_tokens, minlength=vocab_size) + scores[batch_idx * group_size : (batch_idx + 1) * group_size] -= self._diversity_rate * token_frequency + + return scores + + +class ForcedBOSTokenLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces the first generated token to be the selected `forced_bos_token`. + Args: + forced_bos_token_id (:obj:`int`): + The id of the token to be generated as the first token. + """ + + def __init__(self, forced_bos_token_id): + self.forced_bos_token_id = forced_bos_token_id + + def __call__(self, input_ids, scores): + cur_len = input_ids.shape[-1] + if cur_len == 1: + num_tokens = scores.shape[1] + scores[:, [i for i in range(num_tokens) if i != self.forced_bos_token_id]] = -float("inf") + scores[:, self.forced_bos_token_id] = 0 + return scores + + +class ForcedEOSTokenLogitsProcessor(LogitsProcessor): + """ + This `LogitsProcessor` enforces the last generated token to be the selected `forced_eos_token`. + Args: + max_length (int): The maximum length of the sequence to be generated. 
+ forced_eos_token_id (int): The id of the token to be generated as the last token. + """ + + def __init__(self, max_length, forced_eos_token_id): + self.max_length = max_length + self.forced_eos_token_id = forced_eos_token_id + + def __call__(self, input_ids, scores): + cur_len = input_ids.shape[-1] + if cur_len == self.max_length - 1: + num_tokens = scores.shape[1] + scores[ + :, [i for i in range(num_tokens) if i != self.forced_eos_token_id] + ] = -1e9 # TODO change back to -inf after paddle.topk is fixed + scores[:, self.forced_eos_token_id] = 0 + return scores diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/tokenizer.py new file mode 100644 index 000000000..4ba02b9b9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/bloom/tokenizer.py @@ -0,0 +1,411 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import json +import os +import shutil +from functools import lru_cache +from typing import Dict, Optional, Union + +import numpy as np +from paddle.utils import try_import + +from paddlenlp.transformers import AddedToken, PretrainedTokenizer + +from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy +from .configuration import ( + BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST, + _construct_resource_file_url, +) + +__all__ = [ + "BloomTokenizer", +] + +PRETRAINED_RESOURCE_FILES_MAP = { + "vocab_file": _construct_resource_file_url(BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST, "vocab.json"), + "merges_file": _construct_resource_file_url(BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST, "merges.txt"), + "tokenizer_file": _construct_resource_file_url(BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST, "tokenizer.json"), +} + + +def split_tokenizer_json_file(tokenizer_file: str): + base_dir = os.path.dirname(tokenizer_file) + with open(tokenizer_file, "r", encoding="utf-8") as f: + tokenizer = json.load(f) + + def save_to_file(file: str, content: str): + if os.path.exists(file): + return + with open(file, "w", encoding="utf-8") as f: + f.write(content) + + # vocab.json + save_to_file(os.path.join(base_dir, "vocab.json"), json.dumps(tokenizer["model"]["vocab"], ensure_ascii=False)) + # merge file + save_to_file(os.path.join(base_dir, "merges.txt"), "\n".join(tokenizer["model"]["merges"])) + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. 
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BloomTokenizer(PretrainedTokenizer): + """ + Constructs a GPT tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocab file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + + Examples: + .. code-block:: + + from paddlenlp.transformers import BloomTokenizer + + tokenizer = BloomTokenizer.from_pretrained('bigscience/bloom-560m') + print(tokenizer('Welcome to use PaddlePaddle and PaddleNLP')) + + ''' + {'input_ids': [14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_file": "tokenizer.json", + } # for save_pretrained + pretrained_resource_files_map = PRETRAINED_RESOURCE_FILES_MAP + + # TODO(wj-Mcat): disable max-model input size of bloom model + max_model_input_sizes = { + "bigscience/bloom-560m": 102400, + } + padding_side = "left" + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + max_len=None, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + eol_token="", + add_prefix_space=False, + add_bos_token=False, + **kwargs # The token of newline. 
+ ): + + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + self.eol_token = eol_token + self._build_special_tokens_map_extended( + bos_token=pad_token if getattr(self, "bos_token", None) is None else self.bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + ) + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.max_len = max_len if max_len is not None else int(1e12) + self.num_command_tokens = 2 + self.num_type_tokens = 2 + + with open(vocab_file, "r", encoding="utf-8") as f: + self.encoder = json.load(f) + + self.decoder = {v: k for k, v in self.encoder.items()} + + self.num_tokens = len(self.encoder) + self.num_text_tokens = self.num_tokens - 1 + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + + with open(merges_file, encoding="utf-8") as f: + bpe_data = f.read().split("\n")[1:-1] + + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.add_bos_token = add_bos_token + + re = try_import("regex") + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + + return len(self.encoder) + + @property + def eol_token_id(self): + if self.eol_token is None: + return None + return self.convert_tokens_to_ids(self.eol_token) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + + return self.decoder[index] + + def convert_ids_to_string(self, ids): + """ + Converts a single index or a sequence of indices to texts. 
+ + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + + Returns: + str: The decoded text. + + Example: + .. code-block:: + + from paddlenlp.transformers import BloomTokenizer + tokenizer = BloomTokenizer.from_pretrained('gpt2-medium-en') + print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930])) + # 'Welcome to use PaddlePaddle and PaddleNLP' + + """ + + text = "".join([self.decoder[id] for id in ids]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_resources(self, save_directory): + """ + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name, None) + if source_path is None: + continue + + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (string) in a single string. + """ + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is None: + return output + + return output + bos_token_ids + token_ids_1 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/LICENSE b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/LICENSE new file mode 100644 index 000000000..f8e273182 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/LICENSE @@ -0,0 +1,65 @@ +The ChatGLM-6B License + +一、定义 + +“许可方”是指分发其软件的 ChatGLM-6B 模型团队。 + +“软件”是指根据本许可提供的 ChatGLM-6B 模型参数。 + +2. 许可授予 + +根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可,仅用于您的非商业研究目的。 + +上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。 + +3.限制 + +您不得出于任何商业、军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。 + +您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。 + +4.免责声明 + +本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。 + +5. 责任限制 + +除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。 + +6.争议解决 + +本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。 + +请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 glm-130b@googlegroups.com 与我们联系。 + +1. Definitions + +“Licensor” means the ChatGLM-6B Model Team that distributes its Software. + +“Software” means the ChatGLM-6B model parameters made available under this license. + +2. License Grant + +Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +3. Restriction + +You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. + +You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. + +4. Disclaimer + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +5. Limitation of Liability + +EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +6. Dispute Resolution + +This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. + +Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/configuration.py new file mode 100644 index 000000000..6ba479167 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/configuration.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" ChatGLM model configuration """ + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "ChatGLMConfig", + "CHATGLM_PRETRAINED_RESOURCE_FILES_MAP", +] + +CHATGLM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "THUDM/chatglm-6b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm-6b/model_state.pdparams", + "THUDM/chatglm-6b-v1.1": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm-6b-v1.1/model_state.pdparams", + } +} + + +class ChatGLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~ChatGLMModel`]. + It is used to instantiate an ChatGLM model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used + to control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 150528): + Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~ChatGLMModel`] or + [`~TFChatGLMModel`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 28): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + inner_hidden_size (`int`, *optional*, defaults to 16384): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + max_sequence_length (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + layernorm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether the model should return the last key/values attentions (not used by all models). 
+ Example: + + ```python + >>> from configuration import ChatGLMConfig + >>> from modeling import ChatGLMModel + + >>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration + >>> configuration = ChatGLMConfig() + + >>> # Initializing a model from the THUDM/ChatGLM-6B style configuration + >>> model = ChatGLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "chatglm" + attribute_map = {"num_layers": "num_hidden_layers"} + + def __init__( + self, + vocab_size=130528, + hidden_size=4096, + num_hidden_layers=28, + num_attention_heads=32, + layernorm_epsilon=1e-5, + use_cache=False, + bos_token_id=130004, + eos_token_id=130005, + pad_token_id=3, + mask_token_id=130000, + gmask_token_id=130001, + max_sequence_length=2048, + inner_hidden_size=16384, + position_encoding_2d=True, + quantization_bit=0, + pre_seq_len=None, + prefix_projection=False, + output_predict=True, + attention_scale=True, + activation="gelu", + num_image_tokens=0, + long_sequence_strategy_type=None, + long_sequence_strategy_name=None, + long_sequence_init_args=None, + use_long_sequence_strategies=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.num_hidden_layers = num_hidden_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.max_sequence_length = max_sequence_length + self.layernorm_epsilon = layernorm_epsilon + self.inner_hidden_size = inner_hidden_size + self.use_cache = use_cache + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.mask_token_id = mask_token_id + self.gmask_token_id = gmask_token_id + self.position_encoding_2d = position_encoding_2d + self.quantization_bit = quantization_bit + self.pre_seq_len = pre_seq_len + self.prefix_projection = prefix_projection + self.output_predict = output_predict + self.attention_scale = attention_scale + self.activation = activation + self.num_image_tokens = num_image_tokens + + self.long_sequence_strategy_type = long_sequence_strategy_type + self.long_sequence_strategy_name = long_sequence_strategy_name + self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args + self.use_long_sequence_strategies = use_long_sequence_strategies diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/modeling.py new file mode 100644 index 000000000..5e3d8e493 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/modeling.py @@ -0,0 +1,986 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""GLM model""" +from __future__ import annotations + +import math +import re +from functools import partial +from typing import Any, Dict, Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute +from paddle.utils import map_structure + +from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies + +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithPast, +) +from .configuration import CHATGLM_PRETRAINED_RESOURCE_FILES_MAP, ChatGLMConfig + +__all__ = [ + "ChatGLMModel", + "ChatGLMPretrainedModel", + "ChatGLMForCausalLM", +] + + +def parallel_matmul(lm_output, logit_weights, parallel_output): + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + world_size = hcg.get_model_parallel_world_size() + + if world_size > 1: + # _c_identity is backwards is reduce + input_parallel = paddle.distributed.collective._c_identity(lm_output, group=model_parallel_group) + + logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True) + + if parallel_output: + return logits + + # _c_concat has not grad backwards + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + else: + logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) + return logits + + +class PrefixEncoder(nn.Layer): + """ + The prefix encoder for P-Tuning v2. + Input shape: [batch_size, prefix_length] + Output shape: [batch_size, prefix_length, 2 * num_layers * hidden_size] + """ + + def __init__(self, config): + super().__init__() + self.prefix_projection = config.prefix_projection + if self.prefix_projection: + # Use a two-layer MLP to encode the prefix + self.embedding = nn.Embedding(config.pre_seq_len, config.hidden_size) + self.trans = nn.Sequential( + nn.Linear(config.hidden_size, config.hidden_size), + nn.Tanh(), + nn.Linear(config.hidden_size, config.num_layers * config.hidden_size * 2), + ) + else: + self.embedding = nn.Embedding(config.pre_seq_len, config.num_layers * config.hidden_size * 2) + + def forward(self, prefix: paddle.Tensor): + if self.prefix_projection: + prefix_tokens = self.embedding(prefix) + past_key_values = self.trans(prefix_tokens) + else: + past_key_values = self.embedding(prefix) + return past_key_values + + +class RotaryEmbeddings(nn.Layer): + def __init__(self, hidden_size, base=10000.0, position_encoding_2d=True): + super().__init__() + self.default_dtype = paddle.get_default_dtype() + self.inv_freq = 1.0 / (base ** (paddle.arange(0, hidden_size, 2).astype("float32") / hidden_size)) + self.position_encoding_2d = position_encoding_2d + self.max_seq_len_cached = -1 + self.cos_cached = None + self.sin_cached = None + + def get_rotary_embeds(self, cos, sin, position_ids): + # [s, b, 1, h/n] + cos = cos.squeeze(1)[position_ids].unsqueeze(2) + sin = sin.squeeze(1)[position_ids].unsqueeze(2) + return paddle.stack([cos, sin], axis=0) + + def forward(self, position_ids): + + seq_len = position_ids.max() + 1 + # seq_len = position_ids.shape[-1] + + if self.max_seq_len_cached < 0 or seq_len > self.max_seq_len_cached: + self.max_seq_len_cached = seq_len + + # x.shape = [b, s, n, h/n/2] + # TODO(duanyanhui): npu arange kernel don't support fp16, and + # it can't be fallbacked to cpu. 
It will be fixed in future. + t = paddle.arange(start=0, end=seq_len, dtype="float32") + # [s, h/n/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # [s, h/n] + emb = paddle.concat([freqs, freqs], axis=-1) + # [s, 1, h/n] + cos_cached = emb.cos().unsqueeze(1).cast(self.default_dtype) + sin_cached = emb.sin().unsqueeze(1).cast(self.default_dtype) + + if hasattr(paddle.framework, "_no_check_dy2st_diff"): + # TODO(daisiming): _no_check_dy2st_diff is used to turn off the checking of behavior + # inconsistency between dynamic graph and static graph. _no_check_dy2st_diff should be + # removed after static graphs support inplace and stride. + with paddle.framework._no_check_dy2st_diff(): + self.cos_cached, self.sin_cached = cos_cached, sin_cached + else: + self.cos_cached, self.sin_cached = cos_cached, sin_cached + + cos, sin = self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...] + if self.position_encoding_2d: + block_position_ids = position_ids[:, 1, :].transpose([1, 0]) + position_ids = position_ids[:, 0, :].transpose([1, 0]) + block_rotary_embeds = self.get_rotary_embeds(cos, sin, block_position_ids) + position_rotary_embeds = self.get_rotary_embeds(cos, sin, position_ids) + rotary_embeds = paddle.stack([position_rotary_embeds, block_rotary_embeds], axis=0) + else: + position_ids = position_ids.transpose([1, 0]) + rotary_embeds = self.get_rotary_embeds(cos, sin, position_ids) + + return rotary_embeds + + +class ChatGLMAttention(nn.Layer): + """ + Self-attention layer performs multiple attention to jointly attending to + information from different representation subspaces. + """ + + def __init__(self, config: ChatGLMConfig): + super(ChatGLMAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.hidden_size = config.hidden_size + self.position_encoding_2d = config.position_encoding_2d + self.scale_mask_softmax = False + self.default_dtype = paddle.get_default_dtype() + + self.attention_scale = config.attention_scale + + if config.tensor_parallel_degree > 1: + self.query_key_value = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, 3 * config.hidden_size, has_bias=True, gather_output=False + ) + self.dense = fleet.meta_parallel.RowParallelLinear( + config.hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True + ) + self.num_attention_heads = config.num_attention_heads // config.tensor_parallel_degree + else: + self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + # self.attention_dropout = nn.Dropout(config.attention_dropout_prob) + # self.output_dropout = nn.Dropout(config.output_dropout_prob) + + def _rotate_half(self, x): + x1, x2 = paddle.chunk(x, 2, axis=-1) + return paddle.concat([-x2, x1], axis=-1) + + def _apply_rotary_position_embed_index(self, q, k, cos, sin): + # q.shape = [s, b, n, h/n/2], cos.shape = [s, 1, h/n], position_ids.shape = [s, b] + # [s, b, n, h/n] + q = q * cos + self._rotate_half(q) * sin + k = k * cos + self._rotate_half(k) * sin + return q, k + + def _core_attention(self, q_layer: Tensor, k_layer: Tensor, position_ids: Tensor, rotary_embeds: Tensor): + # Set store_true, 
position_encoding_2d=False by default. + if self.config.position_encoding_2d: + # [s, b, n, h/n/2] + q1, q2 = paddle.chunk(q_layer, 2, axis=-1) + k1, k2 = paddle.chunk(k_layer, 2, axis=-1) + + pcos, psin = rotary_embeds[0][0], rotary_embeds[0][1] + bcos, bsin = rotary_embeds[1][0], rotary_embeds[1][1] + + # [s, b, n, h/n] + q1, k1 = self._apply_rotary_position_embed_index(q1, k1, pcos, psin) + q2, k2 = self._apply_rotary_position_embed_index(q2, k2, bcos, bsin) + q_layer = paddle.concat([q1, q2], axis=-1) + k_layer = paddle.concat([k1, k2], axis=-1) + else: + cos, sin = rotary_embeds[0], rotary_embeds[1] + # [s, b, n, h/n] + q_layer, k_layer = self._apply_rotary_position_embed_index(q_layer, k_layer, cos, sin) + return q_layer, k_layer + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + position_ids: Tensor, + use_cache: bool = False, + cache: Tensor = None, + layer_id=0, + rotary_embeds=None, + ): + # [s, b, h] + query_length, batch_size = hidden_states.shape[:2] + # [s, b, 3h] + mixed_layer = self.query_key_value(hidden_states) + # [s, b, n, 3h//n] + mixed_layer = mixed_layer.reshape( + [query_length, batch_size, self.num_attention_heads, self.attention_head_size * 3] + ) + # [s, b, n, h//n] + q_layer, k_layer, v_layer = paddle.split(mixed_layer, 3, axis=-1) + # [s, b, n, h/n] + q_layer, k_layer = self._core_attention(q_layer, k_layer, position_ids, rotary_embeds) + + if cache is not None: + cache_k, cache_v = cache[0], cache[1] + # [s + c, b, n, h/n] + k_layer = paddle.concat([cache_k, k_layer], axis=0) + v_layer = paddle.concat([cache_v, v_layer], axis=0) + + cache_kv = None + if use_cache: + cache_kv = (k_layer, v_layer) + version = paddle.version.full_version + version_check = True + if self.config.use_flash_attention and version != "0.0.0" and version <= "2.5.2": + logger.warning( + "PaddlePaddle version 2.5.3 or higher is required, please upgrade your PaddlePaddle to 2.5.3 or other higher version." 
+ ) + version_check = False + if self.config.use_flash_attention and version_check: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + # [s, b, n, h/n] = > [batch_size, seq_len, num_heads, head_dim] + q_layer = paddle.transpose(q_layer, [1, 0, 2, 3]) + k_layer = paddle.transpose(k_layer, [1, 0, 2, 3]) + v_layer = paddle.transpose(v_layer, [1, 0, 2, 3]) + query_states, key_states, value_states = q_layer, k_layer, v_layer + + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=False, + ) + attn_weights = None + # [batch_size, seq_len, num_heads, head_dim] => [ batch_size, seq_len, hidden_size] + attn_output = paddle.reshape(attn_output, [attn_output.shape[0], attn_output.shape[1], -1]) + # [ batch_size, seq_len, hidden_size] = > [ seq_len, batch_size, hidden_size] + attn_output = paddle.transpose(attn_output, [1, 0, 2]) + attn_output = self.dense(attn_output) + + output, attention_probs = attn_output, attn_weights + else: + + seq_length, batch_size, num_heads, hidden_size = k_layer.shape + + attention_scale_coeff = float(layer_id) + 1.0 + if self.attention_scale: + # [s, b, n, h/n] + q_layer = q_layer / (math.sqrt(self.attention_head_size) * attention_scale_coeff) + q_layer = q_layer.astype(self.default_dtype) + + # [b, n, s, s] + output_shape = [q_layer.shape[1], q_layer.shape[2], q_layer.shape[0], k_layer.shape[0]] + + # [s, b * n, h/n] + q_layer = q_layer.reshape([output_shape[2], output_shape[0] * output_shape[1], -1]) + k_layer = k_layer.reshape([output_shape[3], output_shape[0] * output_shape[1], -1]) + + # [b * n , s, s] = matmul([b * n, s, h/n], [b * n, h/n, s]) + attention_scores = paddle.matmul(q_layer.transpose([1, 0, 2]), k_layer.transpose([1, 2, 0])) + # [b, n, s, s] + attention_scores = attention_scores.reshape(output_shape) + + if self.scale_mask_softmax: + self.scale_mask_softmax.scale = attention_scale_coeff + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) + else: + attention_scores = attention_scores.astype("float32") + attention_scores = attention_scores * attention_scale_coeff + attention_scores = attention_scores + attention_mask + + attention_probs = F.softmax(attention_scores, axis=-1) + attention_probs = attention_probs.astype(self.default_dtype) + v_layer = v_layer.astype(self.default_dtype) + + # [b, n, s, h/n] + output_shape = [v_layer.shape[1], v_layer.shape[2], q_layer.shape[0], v_layer.shape[3]] + # [s, b * n, h/n] + v_layer = v_layer.reshape([v_layer.shape[0], output_shape[0] * output_shape[1], -1]) + # [b * n, s, s] + attention_probs = attention_probs.reshape([output_shape[0] * output_shape[1], output_shape[2], -1]) + + # [b * n, s, h/n] + context_layer = paddle.bmm(attention_probs, v_layer.transpose([1, 0, 2])) + context_layer = context_layer.reshape(output_shape) + + # [s, b, n, h/n] + context_layer = context_layer.transpose([2, 0, 1, 3]) + + # [s, b, h] + new_context_shape = context_layer.shape[:-2] + [self.num_attention_heads * self.attention_head_size] + context_layer = context_layer.reshape(new_context_shape) + + output = self.dense(context_layer) + + return output, cache_kv, attention_probs + + +class ChatGLMBlock(nn.Layer): + """ + The Transformer layer. 
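+    Each block applies, in order (see ``forward`` below): input LayerNorm ->
+    self-attention -> a residual in which the normalized input is scaled by
+    ``alpha = (2 * num_hidden_layers) ** 0.5`` -> post-attention LayerNorm ->
+    MLP -> a second residual with the same ``alpha`` scaling.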
+ """ + + def __init__(self, config: ChatGLMConfig, layer_id: int): + super(ChatGLMBlock, self).__init__() + self.config = config + self.layer_id = layer_id + self.default_dtype = paddle.get_default_dtype() + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + self.attention = ChatGLMAttention(config) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + self.mlp = ChatGLMMLP(config) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Tensor, + position_ids: Tensor, + use_cache: bool = False, + cache: Tensor = None, + rotary_embeds: Tensor = None, + ): + # Layer norm before transformer layer + attention_input = self.input_layernorm(hidden_states) + # Self attention + attention_output, cache, _ = self.attention( + hidden_states=attention_input, + attention_mask=attention_mask, + position_ids=position_ids, + cache=cache, + use_cache=use_cache, + layer_id=self.layer_id, + rotary_embeds=rotary_embeds, + ) + # Residual connection + alpha = (2 * self.config.num_hidden_layers) ** 0.5 + layernorm_input = alpha * attention_input + attention_output + # Layernorm after attention + mlp_input = self.post_attention_layernorm(layernorm_input) + # MLP + mlp_output = self.mlp(mlp_input) + # Second residual connection + output = mlp_input * alpha + mlp_output + return output, cache + + +class ChatGLMMLP(nn.Layer): + def __init__(self, config: ChatGLMConfig): + super(ChatGLMMLP, self).__init__() + self.config = config + if config.inner_hidden_size is None: + inner_hidden_size = config.hidden_size * 4 + else: + inner_hidden_size = config.inner_hidden_size + + if config.tensor_parallel_degree > 1: + self.dense_h_to_4h = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, inner_hidden_size, has_bias=True, gather_output=False + ) + self.dense_4h_to_h = fleet.meta_parallel.RowParallelLinear( + inner_hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True + ) + else: + self.dense_h_to_4h = nn.Linear(config.hidden_size, inner_hidden_size) + self.dense_4h_to_h = nn.Linear(inner_hidden_size, config.hidden_size) + # self.dropout = nn.Dropout(config.output_dropout_prob) + self.activation = self.geglue if self.config.activation == "geglu" else self.gelu + + def geglu(self, x): + x1, x2 = paddle.chunk(x, chunks=2, axis=-1) + x = x1 * F.gelu(x2) + return x + + def gelu(self, x): + return F.gelu(x, approximate=True) + + def forward(self, hidden_states): + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = self.activation(intermediate_parallel) + output = self.dense_4h_to_h(intermediate_parallel) + # output = self.dropout(output) + return output + + +class ChatGLMStack(nn.Layer): + """ + GLM Transformer + """ + + def __init__(self, config: ChatGLMConfig): + super(ChatGLMStack, self).__init__() + self.config = config + self.position_encoding_2d = config.position_encoding_2d + self.hidden_size = config.hidden_size + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.num_attention_heads = config.num_attention_heads + + if config.use_long_sequence_strategies: + self.rotary_embeddings = LongSequenceStrategies.build_long_sequence_strategy( + config.long_sequence_strategy_type, + config.long_sequence_strategy_name, + **config.long_sequence_init_args, + ) + + else: + self.rotary_embeddings = RotaryEmbeddings( + self.hidden_size // (self.num_attention_heads * 2) + if self.position_encoding_2d + else self.hidden_size // 
self.num_attention_heads, + base=10000.0, + ) + # self.embedding_dropout = nn.Dropout(config.embedding_dropout_prob) + + if self.config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + + self.layers = nn.LayerList() + for index in range(config.num_hidden_layers): + self.layers.append(ChatGLMBlock(config, index)) + + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + + if self.config.pre_seq_len is not None: + for param in self.parameters(): + param.requires_grad = False + self.prefix_tokens = paddle.arange(self.pre_seq_len, dtype="int64") + self.prefix_encoder = PrefixEncoder(config) + self.dropout = nn.Dropout(0.1) + + def get_prompt(self, batch_size, dtype=paddle.float16): + prefix_tokens = self.prefix_tokens.unsqueeze(0).expand([batch_size, -1]) + past_key_values = self.prefix_encoder(prefix_tokens).astype(dtype) + past_key_values = past_key_values.reshape( + batch_size, + self.config.pre_seq_len, + self.config.num_layers * 2, + self.config.num_attention_heads, + self.config.hidden_size // self.config.num_attention_heads, + ) + # seq_len, b, nh, hidden_size + past_key_values = self.dropout(past_key_values) + past_key_values = past_key_values.transpose([2, 1, 0, 3, 4]).split(2) + # past_key_values = [(v[0], v[1]) for v in past_key_values] + return past_key_values + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + attention_mask: Tensor, + position_ids: Tensor, + use_cache: bool, + cache: Tensor, + rotary_embeds: Tensor, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + position_ids, + use_cache, + cache, + rotary_embeds, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states + + def forward( + self, + input_ids: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + inputs_embeds: Tensor = None, + cache: Optional[Tensor] = None, + use_cache: bool = False, + ): + if input_ids is not None and inputs_embeds is not None: + input_ids = None + logger.warning("Specify both input_ids and inputs_embeds at the same time, will use inputs_embeds") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + inputs_embeds = inputs_embeds.transpose([1, 0, 2]) + if self.config.use_long_sequence_strategies: + cos, sin = self.rotary_embeddings(seq_len=seq_length) + block_position_ids = position_ids[:, 1, :].transpose([1, 0]) + position_ids = position_ids[:, 0, :].transpose([1, 0]) + block_rotary_embeds = paddle.stack( + [cos[block_position_ids].unsqueeze(2), sin[block_position_ids].unsqueeze(2)] + ) + position_rotary_embeds = paddle.stack([cos[position_ids].unsqueeze(2), sin[position_ids].unsqueeze(2)]) + rotary_embeds = paddle.stack([position_rotary_embeds, block_rotary_embeds], axis=0) + else: 
+ rotary_embeds = self.rotary_embeddings(position_ids) + + if cache is None: + if self.config.pre_seq_len is not None: + cache = self.get_prompt(batch_size=input_ids.shape[0], dtype=inputs_embeds.dtype) + else: + cache = tuple([None] * len(self.layers)) + + # this branch is deprecated + if self.config.pre_seq_len is not None and attention_mask is not None: + prefix_attention_mask = paddle.ones([batch_size, 1, input_ids.shape[-1], self.config.pre_seq_len]) + prefix_attention_mask = (prefix_attention_mask < 0.5).astype("int64") + attention_mask = paddle.concat((prefix_attention_mask, attention_mask), axis=3) + + zero = paddle.zeros(attention_mask.shape, dtype=inputs_embeds.dtype) + neg_inf = paddle.full_like(attention_mask, paddle.finfo(inputs_embeds.dtype).min, dtype=inputs_embeds.dtype) + attention_mask = paddle.where(attention_mask, zero, neg_inf) + + hidden_states = inputs_embeds + + current_caches = [] if use_cache else None + + for i, layer in enumerate(self.layers): + cache_i = cache[i] + + if self.enable_recompute and not hidden_states.stop_gradient: + hidden_states, new_cache = self.recompute_training( + layer, + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=use_cache, + cache=cache_i, + rotary_embeds=rotary_embeds, + ) + else: + hidden_states, new_cache = layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=use_cache, + cache=cache_i, + rotary_embeds=rotary_embeds, + ) + + if use_cache: + current_caches.append(new_cache) + + output = self.final_layernorm(hidden_states) + return (output, current_caches) + + +class ChatGLMPretrainedModel(PretrainedModel): + """ + An abstarct class for pretrained ChatGLM models. It provides GLM related + `model_config_file`, `resource_file_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + base_model_prefix = "chatglm" + config_class = ChatGLMConfig + model_config_file = CONFIG_NAME + resource_files_names = {"model_state": "model_state.pdparams"} + pretrained_resource_files_map = CHATGLM_PRETRAINED_RESOURCE_FILES_MAP + _keys_to_ignore_on_load_missing = [r"transformer.rotary_embeddings.inv_freq", r"lm_head.decoder_weight"] + _keys_to_ignore_on_load_unexpected = [r"transformer.rotary_emb.inv_freq"] + + def init_weights(self, layer): + """Initialization hook""" + return None + + def get_position_ids(self, input_ids, mask_positions, use_gmasks=None): + batch_size, seq_length = input_ids.shape + if use_gmasks is None: + use_gmasks = [False] * batch_size + + context_lengths = [] + for seq in input_ids: + context_lengths.append(paddle.where(seq == self.config.bos_token_id)[0][0]) + + if self.config.position_encoding_2d: + position_ids = paddle.arange(seq_length, dtype="int64").unsqueeze(0).tile([batch_size, 1]) + for i, context_length in enumerate(context_lengths): + position_ids[i, context_length:] = mask_positions[i] + block_position_ids = [ + paddle.concat( + ( + paddle.zeros([context_length], dtype="int64"), + paddle.arange(seq_length - context_length, dtype="int64") + 1, + ) + ) + for context_length in context_lengths + ] + block_position_ids = paddle.stack(block_position_ids, axis=0) + position_ids = paddle.stack((position_ids, block_position_ids), axis=1) + else: + position_ids = paddle.arange(seq_length, dtype="int64").unsqueeze(0).tile([batch_size, 1]) + for i, context_length in enumerate(context_lengths): + if not use_gmasks[i]: + position_ids[context_length:] = mask_positions[i] + + return position_ids + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None, None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, 2, None], dtype="int64"), + } + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_hidden_layers): + final_actions = {} + base_actions = { + # Column Linear + "transformer.layers.0.mlp.dense_h_to_4h.bias": partial(fn, is_column=True), + "transformer.layers.0.mlp.dense_h_to_4h.weight": partial(fn, is_column=True), + "transformer.layers.0.attention.query_key_value.bias": partial(fn, is_column=True), + "transformer.layers.0.attention.query_key_value.weight": partial(fn, is_column=True), + # Row Linear + "transformer.word_embeddings.weight": partial(fn, is_column=False), + "transformer.layers.0.attention.dense.weight": partial(fn, is_column=False), + "transformer.layers.0.mlp.dense_4h_to_h.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(num_hidden_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + +@register_base_model +class ChatGLMModel(ChatGLMPretrainedModel): + r""" + The GLM Model transformer can behave as an encoder (with only self-attention) as well as a decoder, where + a layer of cross-attention is added between the self-attention layers, following the architecture + described in [Attention is all you need](https://arxiv.org/abs/1706.03762). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + """ + _keys_to_ignore_on_load_unexpected = [r"transformer.layers.*.attention.rotary_emb.inv_freq", r"lm_head.weight"] + + def __init__(self, config: ChatGLMConfig): + super(ChatGLMModel, self).__init__(config) + self.config = config + self.transformer = ChatGLMStack(config) + self.apply(self.init_weights) + + def get_input_embeddings(self): + return self.transformer.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.transformer.word_embeddings = new_embeddings + + def forward( + self, + input_ids: Tensor = None, + position_ids: Tensor = None, + attention_mask: Tensor = None, + cache=None, + inputs_embeds: Tensor = None, + use_cache: bool = None, + return_dict: bool = None, + ): + if input_ids is None: + assert position_ids is not None, "`position_ids` must be explicitly specified when input_ids is None." + assert attention_mask is not None, "`attention_mask` must be explicitly specified when input_ids is None." + + if attention_mask is None or len(attention_mask.shape) != 4: + raise ValueError(f"attention mask should'nt be None or has size other than 4Dim. 
Found {attention_mask}") + + attention_mask = attention_mask.astype("bool") + + if position_ids is None: + MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id + + use_gmasks = [] + mask_positions = [] + for seq in input_ids: + mask_token = gMASK if gMASK in seq else MASK + use_gmask = mask_token == gMASK + use_gmasks.append(use_gmask) + mask_positions.append(paddle.where(seq == mask_token)[0][0]) + position_ids = self.get_position_ids(input_ids, mask_positions=mask_positions, use_gmasks=use_gmasks) + + use_cache = use_cache if use_cache is not None else self.config.use_cache + logits, new_caches = self.transformer( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + cache=cache, + use_cache=use_cache, + ) + + if not return_dict: + return (logits, new_caches) + + return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=logits, past_key_values=new_caches) + + +class ChatGLMHead(nn.Layer): + def __init__(self, config, embedding_weights=None): + super(ChatGLMHead, self).__init__() + self.decoder_weight = ( + self.create_parameter(shape=[config.vocab_size, config.hidden_size], dtype=paddle.get_default_dtype()) + if embedding_weights is None + else embedding_weights + ) + self.config = config + + def forward(self, hidden_states): + if self.config.tensor_parallel_degree > 1: + logits = parallel_matmul(hidden_states, self.decoder_weight, self.config.tensor_parallel_output) + else: + logits = F.linear(hidden_states, self.decoder_weight.T) + return logits + + +class ChatGLMForCausalLM(ChatGLMPretrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder_weight"] + _tied_weights_keys = ["lm_head.decoder_weight"] + + def __init__(self, config: ChatGLMConfig): + super(ChatGLMForCausalLM, self).__init__(config) + + self.config = config + self.max_sequence_length = config.max_sequence_length + self.position_encoding_2d = config.position_encoding_2d + self.chatglm = ChatGLMModel(config) + + self.lm_head = ChatGLMHead(config, self.chatglm.transformer.word_embeddings.weight) + # from paddlenlp.transformers import ChatGLMTokenizer + # self.tokenizer = ChatGLMTokenizer.from_pretrained("THUDM/chatglm-6b") + + def prepare_inputs_for_generation( + self, input_ids, position_ids=None, attention_mask=None, past_key_values=None, cache=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + MASK, gMASK = self.config.mask_token_id, self.config.gmask_token_id + use_gmasks = [] + mask_positions = [] + for seq in input_ids: + mask_token = gMASK if gMASK in seq else MASK + use_gmask = mask_token == gMASK + use_gmasks.append(use_gmask) + mask_positions.append(paddle.where(seq == mask_token)[0][0]) + + if cache is not None or past_key_values is not None: + last_token = input_ids[:, -1].unsqueeze(-1) + + attention_mask = attention_mask[:, :, -1:] + + if position_ids is not None: + position_ids = position_ids[..., -1:] + else: + if self.position_encoding_2d: + context_lengths = [] + for seq in input_ids: + context_lengths.append(paddle.where(seq == self.config.bos_token_id)[0][0]) + + context_lengths = paddle.to_tensor(context_lengths, dtype="int64") + block_position_ids = seq_length - context_lengths + position_ids = paddle.concat( + [paddle.to_tensor(mask_positions, dtype="int64"), block_position_ids], axis=1 + ).unsqueeze(-1) + else: + position_ids = paddle.to_tensor(mask_positions, dtype="int64").unsqueeze(-1) + + if cache is None: + cache = past_key_values + return { + "input_ids": last_token, + "cache": cache[-1], + 
"position_ids": position_ids, + "use_cache": True, + "attention_mask": attention_mask, + } + else: + if position_ids is None: + position_ids = self.get_position_ids(input_ids, mask_positions=mask_positions, use_gmasks=use_gmasks) + + return { + "input_ids": input_ids, + "cache": cache, + "position_ids": position_ids, + "use_cache": True, + "attention_mask": attention_mask, + } + + def reorder_cache(self, cache: paddle.Tensor, beam_idx): + cache = map_structure(lambda x: paddle.index_select(x, beam_idx, axis=1), cache) + return cache + + def update_model_kwargs_for_generation( + self, + outputs, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update cache + model_kwargs["cache"] = outputs[1] if isinstance(outputs, tuple) else outputs["past_key_values"] + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + if attention_mask is not None: + attention_mask = paddle.concat( + [attention_mask, paddle.zeros([*attention_mask.shape[:3], 1], attention_mask.dtype)], axis=3 + ) + new_attention_mask = attention_mask[:, :, -1:].clone() + new_attention_mask[..., -1] = 1 + model_kwargs["attention_mask"] = paddle.concat([attention_mask, new_attention_mask], axis=2) + + # update position ids + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id[:, 1, :] += 1 + model_kwargs["position_ids"] = paddle.concat([position_ids, new_position_id], axis=-1) + + return model_kwargs + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + cache=None, + inputs_embeds=None, + labels=None, + use_cache=None, + return_dict=False, + ): + transformer_outputs = self.chatglm( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + cache=cache, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs.last_hidden_state if return_dict else transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + lm_logits = lm_logits.transpose([1, 0, 2]).astype("float32") + loss = None + if labels is not None: + if self.config.tensor_parallel_degree > 1 and self.config.tensor_parallel_output: + self.parallel_loss_func = fleet.meta_parallel.ParallelCrossEntropy() + filtered_logits = lm_logits[labels != -100] + filtered_labels = labels[labels != -100] + loss = self.parallel_loss_func(filtered_logits, filtered_labels).mean() + else: + loss = nn.functional.cross_entropy(lm_logits, labels, ignore_index=-100) + loss = loss.astype(lm_logits.dtype) + + if not return_dict: + if loss is not None: + return (loss, lm_logits, transformer_outputs[1:]) + else: + return (lm_logits, transformer_outputs[1:]) + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + ) + + @staticmethod + def _reorder_cache(cache, beam_idx): + return tuple( + ( + layer_past[0].index_select(1, beam_idx), + layer_past[1].index_select(1, beam_idx), + ) + for layer_past in cache + ) + + @staticmethod + def process_response(response): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = 
re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+        return response
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/tokenizer.py
new file mode 100644
index 000000000..08b8ad9d4
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm/tokenizer.py
@@ -0,0 +1,287 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tokenization classes for ChatGLM."""
+import os
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import sentencepiece as spm
+
+from .. import PretrainedTokenizer
+from ..tokenizer_utils_base import BatchEncoding, PaddingStrategy
+
+
+class ChatGLMTokenizer(PretrainedTokenizer):
+    """
+    Construct a ChatGLM tokenizer.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    resource_files_names = {"vocab_file": "ice_text.model"}
+    max_model_input_sizes = {"THUDM/chatglm-6b": 2048, "THUDM/chatglm-6b-v1.1": 2048}
+    model_input_names = ["input_ids", "attention_mask"]
+    pretrained_resource_files_map = {
+        "model_file": {
+            "THUDM/chatglm-6b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm-6b/ice_text.model",
+            "THUDM/chatglm-6b-v1.1": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm-6b-v1.1/ice_text.model",
+        }
+    }
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<sop>",
+        eos_token="<eop>",
+        end_token="</s>",
+        mask_token="[MASK]",
+        gmask_token="[gMASK]",
+        pad_token="<pad>",
+        padding_side="left",
+        do_lower_case=False,
+        num_image_tokens=20000,
+        **kwargs
+    ) -> None:
+        kwargs["additional_special_tokens"] = kwargs.pop("additional_special_tokens", []) + [gmask_token]
+        super().__init__(
+            pad_token=pad_token,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            mask_token=mask_token,
+            padding_side=padding_side,
+            **kwargs,
+        )
+        self.end_token = end_token
+        self.gmask_token = gmask_token
+        self.do_lower_case = do_lower_case
+        self.vocab_file = vocab_file
+        self.num_image_tokens = num_image_tokens
+        self.max_blank_length = kwargs.get("max_blank_length", 80)
+
+        self.sp_tokenizer = spm.SentencePieceProcessor()
+        self.sp_tokenizer.Load(self.vocab_file)
+
+    @property
+    def gmask_token_id(self) -> Optional[int]:
+        if self.gmask_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.gmask_token)
+
+    @property
+    def end_token_id(self) -> Optional[int]:
+        if self.end_token is None:
+            return None
+        return self.convert_tokens_to_ids(self.end_token)
+
+    @property
+    def tab_token(self):
+        return "<|tab|>"
+
+    @staticmethod
+    def get_blank_token(length: int):
+        assert length >= 2
+        return f"<|blank_{length}|>"
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_tokenizer.vocab_size() + self.num_image_tokens
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if kwargs.get("remove_space", False):
+            text = " ".join(text.strip().split())
+        if kwargs.get("linebreak", True):
+            text = text.replace("\n", "<n>")
+        if kwargs.get("whitespaces", True):
+            text = text.replace("\t", self.tab_token)
+            for i in range(self.max_blank_length, 1, -1):
+                text = text.replace(" " * i, self.get_blank_token(i))
+        return (text, kwargs)
+
+    def _tokenize(self, text, **kwargs):
+        """Returns a tokenized string."""
+        add_dummy_prefix = kwargs.get("add_dummy_prefix", True)
+
+        if not add_dummy_prefix:
+            text = "<n>" + text
+        tokens = self.sp_tokenizer.EncodeAsPieces(text)
+        return tokens if add_dummy_prefix else tokens[2:]
+
+    def _decode(
+        self,
+        token_ids: List[int],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: bool = True,
+        spaces_between_special_tokens: bool = True,
+        **kwargs
+    ) -> str:
+        token_ids = [int(_id) - self.num_image_tokens for _id in token_ids]
+        token_ids = [_id for _id in token_ids if _id >= 0]
+        text = super()._decode(
+            token_ids,
+            skip_special_tokens,
+            clean_up_tokenization_spaces,
+            spaces_between_special_tokens,
+            **kwargs,
+        )
+        return self.postprocess(text)
+
+    def postprocess(self, text):
+        # Postprocess.
+        text = text.replace("<n>", "\n")
+        text = text.replace(self.tab_token, "\t")
+        for i in range(2, self.max_blank_length + 1):
+            text = text.replace(self.get_blank_token(i), " " * i)
+        return text
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        if token.startswith("<image_") and token[7:-1].isdigit():
+            return int(token[7:-1])
+        else:
+            return self.sp_tokenizer.PieceToId(token) + self.num_image_tokens
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index >= self.vocab_size:
+            return self.unk_token
+        else:
+            if index < self.num_image_tokens:
+                return "<image_{}>".format(index)
+            else:
+                return self.sp_tokenizer.IdToPiece(index - self.num_image_tokens)
+
+    def convert_tokens_to_string(self, tokens):
+        text = self.sp_tokenizer.DecodePieces(tokens)
+        text = self.postprocess(text)
+        return text
+
+    def save_vocabulary(self, save_directory, filename_prefix=None):
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the names of the saved files.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"]) + else: + vocab_file = save_directory + + with open(self.vocab_file, "rb") as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + token_ids_0 += [self.gmask_token_id, self.bos_token_id] + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.eos_token_id] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict, BatchEncoding], + max_length: Optional[int] = None, + padding_strategy=PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names or "attention_mask" in encoded_inputs + + assert self.padding_side == "left" + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if max_length is not None: + if self.bos_token_id in required_input: + context_length = required_input.index(self.bos_token_id) + else: + context_length = seq_length + if "attention_mask" not in encoded_inputs: + attention_mask = np.ones((1, seq_length, seq_length)) + attention_mask = np.tril(attention_mask) + attention_mask[:, :, :context_length] = 1 + encoded_inputs["attention_mask"] = attention_mask + + if "position_ids" not in encoded_inputs: + position_ids = np.arange(seq_length, dtype=np.int64) + mask_token = self.mask_token_id if self.mask_token_id in required_input else self.gmask_token_id + if mask_token in required_input: + mask_position = required_input.index(mask_token) + position_ids[context_length:] = mask_position + block_position_ids = np.concatenate( + [ + np.zeros(context_length, dtype=np.int64), + np.arange(1, seq_length - context_length + 1, dtype=np.int64), + ] + ) + encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = np.pad( + encoded_inputs["position_ids"], pad_width=[(0, 0), (difference, 0)] + ) + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/LICENSE b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/LICENSE new file mode 100644 index 000000000..535dd587a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/LICENSE @@ -0,0 +1,65 @@ +The ChatGLM-6B License + +一、定义 + +“许可方”是指分发其软件的 ChatGLM2-6B 模型团队。 + +“软件”是指根据本许可提供的 ChatGLM2-6B 模型参数。 + +2. 许可授予 + +根据本许可的条款和条件,许可方特此授予您非排他性、全球性、不可转让、不可再许可、可撤销、免版税的版权许可,仅用于您的非商业研究目的。 + +上述版权声明和本许可声明应包含在本软件的所有副本或重要部分中。 + +3.限制 + +您不得出于任何商业、军事或非法目的使用、复制、修改、合并、发布、分发、复制或创建本软件的全部或部分衍生作品。 + +您不得利用本软件从事任何危害国家安全和国家统一、危害社会公共利益、侵犯人身权益的行为。 + +4.免责声明 + +本软件“按原样”提供,不提供任何明示或暗示的保证,包括但不限于对适销性、特定用途的适用性和非侵权性的保证。 在任何情况下,作者或版权持有人均不对任何索赔、损害或其他责任负责,无论是在合同诉讼、侵权行为还是其他方面,由软件或软件的使用或其他交易引起、由软件引起或与之相关 软件。 + +5. 责任限制 + +除适用法律禁止的范围外,在任何情况下且根据任何法律理论,无论是基于侵权行为、疏忽、合同、责任或其他原因,任何许可方均不对您承担任何直接、间接、特殊、偶然、示范性、 或间接损害,或任何其他商业损失,即使许可人已被告知此类损害的可能性。 + +6.争议解决 + +本许可受中华人民共和国法律管辖并按其解释。 因本许可引起的或与本许可有关的任何争议应提交北京市海淀区人民法院。 + +请注意,许可证可能会更新到更全面的版本。 有关许可和版权的任何问题,请通过 glm-130b@googlegroups.com 与我们联系。 + +1. Definitions + +“Licensor” means the ChatGLM2-6B Model Team that distributes its Software. + +“Software” means the ChatGLM2-6B model parameters made available under this license. + +2. License Grant + +Subject to the terms and conditions of this License, the Licensor hereby grants to you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty-free copyright license to use the Software solely for your non-commercial research purposes. + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +3. Restriction + +You will not use, copy, modify, merge, publish, distribute, reproduce, or create derivative works of the Software, in whole or in part, for any commercial, military, or illegal purposes. + +You will not use the Software for any act that may undermine China's national security and national unity, harm the public interest of society, or infringe upon the rights and interests of human beings. + +4. Disclaimer + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +5. Limitation of Liability + +EXCEPT TO THE EXTENT PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL THEORY, WHETHER BASED IN TORT, NEGLIGENCE, CONTRACT, LIABILITY, OR OTHERWISE WILL ANY LICENSOR BE LIABLE TO YOU FOR ANY DIRECT, INDIRECT, SPECIAL, INCIDENTAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES, OR ANY OTHER COMMERCIAL LOSSES, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +6. Dispute Resolution + +This license shall be governed and construed in accordance with the laws of People’s Republic of China. Any dispute arising from or in connection with this License shall be submitted to Haidian District People's Court in Beijing. + +Note that the license is subject to update to a more comprehensive version. For any questions related to the license and copyright, please contact us at glm-130b@googlegroups.com. 
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/__init__.py new file mode 100644 index 000000000..775d34cf8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 ChatGLM2-6B Model Team and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/chatglm-legacy-checkpoints-convert.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/chatglm-legacy-checkpoints-convert.py new file mode 100644 index 000000000..cfc4828cf --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/chatglm-legacy-checkpoints-convert.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
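+# NOTE: The script below is a one-off conversion helper for legacy ChatGLMv2
+# checkpoints: for each of the 28 transformer layers it fuses the separate
+# query/key/value projection weights and biases into the single
+# `query_key_value` parameter expected by the new modeling code, and it
+# regroups the interleaved column pairs of `mlp.dense_h_to_4h.weight` into two
+# contiguous halves so that the contiguous value/gate split in the new
+# MLP.forward lines up with the converted weights. The "old/" and "new/"
+# checkpoint paths and the hidden size of 4096 are hard-coded.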
+ +import paddle + +sd = paddle.load("old/model_state.pdparams") + +layers = 28 +for l in range(layers): + # qkv spilt --> fuse + qkv_weight_name, qkv_bias_name = ( + f"encoder.layers.{l}.self_attention.query_key_value.weight", + f"encoder.layers.{l}.self_attention.query_key_value.bias", + ) + q_weight_name, q_bias_name = ( + f"encoder.layers.{l}.self_attention.query.weight", + f"encoder.layers.{l}.self_attention.query.bias", + ) + k_weight_name, k_bias_name = ( + f"encoder.layers.{l}.self_attention.key.weight", + f"encoder.layers.{l}.self_attention.key.bias", + ) + v_weight_name, v_bias_name = ( + f"encoder.layers.{l}.self_attention.value.weight", + f"encoder.layers.{l}.self_attention.value.bias", + ) + sd[qkv_weight_name] = paddle.concat([sd[q_weight_name], sd[k_weight_name], sd[v_weight_name]], axis=1) + sd[qkv_bias_name] = paddle.concat([sd[q_bias_name], sd[k_bias_name], sd[v_bias_name]], axis=0) + sd.pop(q_weight_name) + sd.pop(q_bias_name) + sd.pop(k_weight_name) + sd.pop(k_bias_name) + sd.pop(v_weight_name) + sd.pop(v_bias_name) + + # MLP + mlp_weight_name = f"encoder.layers.{l}.mlp.dense_h_to_4h.weight" + sd[mlp_weight_name] = sd[mlp_weight_name].reshape([4096, -1, 2]).transpose([0, 2, 1]).reshape([4096, -1]) + +paddle.save(sd, "new/model_state.pdparams") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/configuration.py new file mode 100644 index 000000000..469ff9c15 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/configuration.py @@ -0,0 +1,90 @@ +# Copyright (c) 2023 ChatGLM2-6B Model Team and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
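+# This module defines `ChatGLMv2Config`, the configuration class consumed by the
+# ChatGLMv2 modeling code added later in this patch, together with the download
+# map for the released "THUDM/chatglm2-6b" weights. A minimal usage sketch, for
+# illustration only (it assumes the vendored package is importable under the
+# usual `paddlenlp.transformers` namespace):
+#
+#     from paddlenlp.transformers import ChatGLMv2Config, ChatGLMv2ForCausalLM
+#     config = ChatGLMv2Config(num_hidden_layers=28, hidden_size=4096)
+#     model = ChatGLMv2ForCausalLM(config)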
+ +from ..configuration_utils import PretrainedConfig + +CHATGLM_V2_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "THUDM/chatglm2-6b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm2-6b/model_state.pdparams", + } +} + + +class ChatGLMv2Config(PretrainedConfig): + model_type = "chatglm_v2" + attribute_map = { + "num_layers": "num_hidden_layers", + "padded_vocab_size": "vocab_size", + "seq_length": "max_sequence_length", + } + + def __init__( + self, + num_hidden_layers=28, + vocab_size=65024, + hidden_size=4096, + ffn_hidden_size=13696, + kv_channels=128, + num_attention_heads=32, + max_sequence_length=2048, + hidden_dropout=0.0, + attention_dropout=0.0, + layernorm_epsilon=1e-5, + use_cache=True, + rmsnorm=True, + apply_residual_connection_post_layernorm=False, + post_layer_norm=True, + add_bias_linear=False, + add_qkv_bias=False, + interleaved_qkv=False, + bias_dropout_fusion=True, + multi_query_group_num=1, + apply_query_key_layer_scaling=True, + attention_softmax_in_fp32=True, + fp32_residual_connection=False, + eos_token_id=2, + pad_token_id=0, + long_sequence_strategy_type=None, + long_sequence_strategy_name=None, + long_sequence_init_args=None, + use_long_sequence_strategies=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, **kwargs) + self.num_hidden_layers = num_hidden_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.ffn_hidden_size = ffn_hidden_size + self.kv_channels = kv_channels + self.num_attention_heads = num_attention_heads + self.max_sequence_length = max_sequence_length + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.layernorm_epsilon = layernorm_epsilon + self.use_cache = use_cache + self.rmsnorm = rmsnorm + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.post_layer_norm = post_layer_norm + self.add_bias_linear = add_bias_linear + self.add_qkv_bias = add_qkv_bias + self.bias_dropout_fusion = bias_dropout_fusion + self.multi_query_group_num = multi_query_group_num + self.apply_query_key_layer_scaling = apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = attention_softmax_in_fp32 + self.fp32_residual_connection = fp32_residual_connection + + self.long_sequence_strategy_type = long_sequence_strategy_type + self.long_sequence_strategy_name = long_sequence_strategy_name + self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args + self.use_long_sequence_strategies = use_long_sequence_strategies diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/modeling.py new file mode 100644 index 000000000..bbfb6e52f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/modeling.py @@ -0,0 +1,859 @@ +# Copyright (c) 2023 ChatGLM2-6B Model Team and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Any, Dict, List, Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.utils import map_structure + +from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithPast, + ModelOutput, +) +from .configuration import CHATGLM_V2_PRETRAINED_RESOURCE_FILES_MAP, ChatGLMv2Config + +__all__ = [ + "ChatGLMv2Model", + "ChatGLMv2PretrainedModel", + "ChatGLMv2ForCausalLM", +] + + +class RotaryEmbedding(nn.Layer): + def __init__(self, dim, original_impl=False): + super().__init__() + self.default_dtype = paddle.get_default_dtype() + inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2, dtype="float32") / dim)) + self.register_buffer("inv_freq", inv_freq) + self.dim = dim + self.original_impl = original_impl + + def forward_impl(self, seq_len: int, n_elem: int, base: int = 10000): + """Enhanced Transformer with Rotary Position Embedding. + Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/ + transformers/rope/__init__.py. MIT License: + https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license. + """ + # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$ + theta = 1.0 / (base ** (paddle.arange(0, n_elem, 2, dtype="float32") / n_elem)) + + # Create position indexes `[0, 1, ..., seq_len - 1]` + seq_idx = paddle.arange(0, seq_len, dtype=theta.dtype) + + # Calculate the product of position index and $\theta_i$ + idx_theta = paddle.outer(seq_idx, theta).astype(self.default_dtype) + + cache = paddle.stack([paddle.cos(idx_theta), paddle.sin(idx_theta)], axis=-1) + + # this is to mimic the behaviour of complex32, else we will get different results + if self.default_dtype in (paddle.float16, paddle.bfloat16, paddle.int8): + cache = cache.astype(self.default_dtype) + # cache = cache.bfloat16() if dtype == paddle.bfloat16 else cache.astype("float16") + return cache + + def forward(self, max_seq_len, offset=0): + return self.forward_impl(seq_len=max_seq_len, n_elem=self.dim) + + +# @paddle.jit.script +def apply_rotary_pos_emb(x: paddle.Tensor, rope_cache: paddle.Tensor) -> paddle.Tensor: + # x: [sq, b, np, hn] + sq, b, np, hn = x.shape + rot_dim = rope_cache.shape[-2] * 2 + x, x_pass = x[..., :rot_dim], x[..., rot_dim:] + # truncate to support variable sizes + rope_cache = rope_cache[:sq] + xshaped = x.reshape([sq, -1, np, rot_dim // 2, 2]) + rope_cache = rope_cache.reshape([sq, -1, 1, xshaped.shape[3], 2]) + x_out2 = paddle.stack( + [ + xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1], + xshaped[..., 1] * rope_cache[..., 0] + xshaped[..., 0] * rope_cache[..., 1], + ], + -1, + ) + x_out2 = x_out2.flatten(3) + return paddle.concat((x_out2, x_pass), axis=-1) + + +class RMSNorm(nn.Layer): + def __init__(self, hidden_size, epsilon=None): + super().__init__() + self.hidden_size = hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.epsilon = 1e-5 if epsilon is None else 
epsilon + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.epsilon) * hidden_states + output = (hidden_states * self.weight).astype(input_dtype) + + # if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + # hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return output + + +class CoreAttention(nn.Layer): + def __init__(self, config: ChatGLMv2Config, layer_number): + super(CoreAttention, self).__init__() + + self.default_dtype = paddle.get_default_dtype() + self.apply_query_key_layer_scaling = config.apply_query_key_layer_scaling + self.attention_softmax_in_fp32 = config.attention_softmax_in_fp32 + if self.apply_query_key_layer_scaling: + self.attention_softmax_in_fp32 = True + self.layer_number = max(1, layer_number) + self.num_attention_heads_per_partition = config.num_attention_heads + self.hidden_size_per_partition = config.kv_channels * self.num_attention_heads_per_partition + self.hidden_size_per_attention_head = self.hidden_size_per_partition // self.num_attention_heads_per_partition + + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff + self.coeff = coeff + + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def forward(self, query_layer, key_layer, value_layer, attention_mask): + # Raw attention scores + # [batch_size, num_heads, query_length, key_length] + output_size = (query_layer.shape[1], query_layer.shape[2], query_layer.shape[0], key_layer.shape[0]) + + # [query_length, batch_size, num_heads, hidden] -> [query_length, batch_size * num_heads, hidden] + query_layer = query_layer.reshape([output_size[2], output_size[0] * output_size[1], -1]) + # [key_length, batch_size, num_heads, hidden] -> [key_length, batch_size * num_heads, hidden] + key_layer = key_layer.reshape([output_size[3], output_size[0] * output_size[1], -1]) + + # Raw attention scores. [batch_size * num_heads, query_length, key_length] + matmul_result = paddle.bmm(query_layer.transpose([1, 0, 2]), key_layer.transpose([1, 2, 0])) * ( + 1.0 / self.norm_factor + ) + + # change view to [batch_size, num_heads, query_length, key_length] + attention_scores = matmul_result.reshape(output_size) + + # =========================== + # Attention probs and dropout + # =========================== + + # attention scores and attention mask [batch_size, num_heads, query_length, key_length] + if self.attention_softmax_in_fp32: + attention_scores = attention_scores.astype("float32") + if self.coeff is not None: + attention_scores = attention_scores * self.coeff + + attention_scores = attention_scores + attention_mask + + attention_probs = F.softmax(attention_scores.astype("float32"), axis=-1) + attention_probs = attention_probs.astype(self.default_dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + # [batch_size, num_heads, query_length, key_length] + + # value_layer -> context layer. 
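+ # The softmax-normalised attention probabilities are applied to the value
+ # projections with a batched matmul below; the surrounding reshapes and
+ # transposes are only shape bookkeeping to return to the sequence-first
+ # [sq, b, hp] layout used throughout the model.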
+ # [sk, b, np, hn] --> [b, np, sq, hn] + + # context layer shape: [b, np, sq, hn] + output_size = (value_layer.shape[1], value_layer.shape[2], query_layer.shape[0], value_layer.shape[3]) + # change view [sk, b * np, hn] + value_layer = value_layer.reshape([value_layer.shape[0], output_size[0] * output_size[1], -1]) + # change view [b * np, sq, sk] + attention_probs = attention_probs.reshape([output_size[0] * output_size[1], output_size[2], -1]) + # matmul: [b * np, sq, hn] + context_layer = paddle.bmm(attention_probs, value_layer.transpose([1, 0, 2])) + # change view [b, np, sq, hn] + context_layer = context_layer.reshape(output_size) + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.transpose([2, 0, 1, 3]) + # [sq, b, np, hn] --> [sq, b, hp] + new_context_shape = context_layer.shape[:-2] + [self.hidden_size_per_partition] + context_layer = context_layer.reshape(new_context_shape) + + return context_layer + + +class SelfAttention(nn.Layer): + """Parallel self-attention layer abstract class. + + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__(self, config: ChatGLMv2Config, layer_number, device=None): + super(SelfAttention, self).__init__() + self.layer_number = max(1, layer_number) + assert ( + config.kv_channels * config.num_attention_heads == config.hidden_size + ), "`kv_channels` * `num_attention_heads` must equal to `hidden_size`" + + # Per attention head and per partition values. + self.hidden_size_per_attention_head = config.hidden_size // config.num_attention_heads + self.core_attention = CoreAttention(config, self.layer_number) + self.num_multi_query_groups_per_partition = config.multi_query_group_num + self.multi_query_group_num = config.multi_query_group_num + self.num_attention_heads_per_partition = config.num_attention_heads + + self.query_key_value = nn.Linear( + config.hidden_size, + config.hidden_size + 2 * self.hidden_size_per_attention_head * config.multi_query_group_num, + bias_attr=config.add_bias_linear or config.add_qkv_bias, + ) + # Output. 
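+ # Final output projection: maps the concatenated attention heads back to
+ # `hidden_size`; a bias is only added when `add_bias_linear` is enabled.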
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias_attr=config.add_bias_linear) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb, kv_cache=None, use_cache=True): + seq_length, batch_size, hidden_size = hidden_states.shape + mixed_x_layer = self.query_key_value(hidden_states) + + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition * self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head * self.multi_query_group_num, + self.hidden_size_per_attention_head * self.multi_query_group_num, + ], + axis=-1, + ) + + query_layer = query_layer.reshape( + [seq_length, batch_size, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head] + ) + key_layer = key_layer.reshape([seq_length, batch_size, -1, self.hidden_size_per_attention_head]) + value_layer = value_layer.reshape([seq_length, batch_size, -1, self.hidden_size_per_attention_head]) + + # apply relative positional encoding (rotary embedding) + if rotary_pos_emb is not None: + query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb) + key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb) + + # adjust key and value for inference + if use_cache: + if kv_cache is not None: + cache_k, cache_v = kv_cache + key_layer = paddle.concat((cache_k, key_layer), axis=0) + value_layer = paddle.concat((cache_v, value_layer), axis=0) + kv_cache = (key_layer, value_layer) + else: + kv_cache = None + + multiplier = self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition + + key_layer = key_layer.unsqueeze(-2).tile([1, 1, 1, multiplier, 1]) + key_layer = key_layer.reshape( + key_layer.shape[:2] + [self.num_attention_heads_per_partition, self.hidden_size_per_attention_head] + ) + value_layer = value_layer.unsqueeze(-2).tile([1, 1, 1, multiplier, 1]) + value_layer = value_layer.reshape( + value_layer.shape[:2] + [self.num_attention_heads_per_partition, self.hidden_size_per_attention_head] + ) + + # ================================== + # core attention computation + # ================================== + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # ================= + # Output. [seq_length, b, h] + # ================= + + output = self.dense(context_layer) + return output, kv_cache + + +class MLP(nn.Layer): + """MLP. + + MLP will take the input with h hidden state, project it to 4*h + hidden dimension, perform nonlinear transformation, and project the + state back into h hidden dimension. + """ + + def __init__(self, config: ChatGLMv2Config): + super(MLP, self).__init__() + + self.add_bias = config.add_bias_linear + + # Project to 4h due to swiglu doubling the output width, see https://arxiv.org/pdf/2002.05202.pdf + self.dense_h_to_4h = nn.Linear(config.hidden_size, config.ffn_hidden_size * 2, bias_attr=self.add_bias) + # Project back to h. 
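+ # `dense_h_to_4h` above emits 2 * ffn_hidden_size channels, which forward()
+ # splits into a value half and a gate half (SwiGLU: silu(value) * gate), so
+ # this projection maps ffn_hidden_size, not 2 * ffn_hidden_size, back to
+ # hidden_size.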
+ self.dense_4h_to_h = nn.Linear( + config.ffn_hidden_size, + config.hidden_size, + bias_attr=self.add_bias, + ) + + def forward(self, hidden_states): + # [s, b, 4hp] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + # Special Slicing to accomodate Tensor Parallel + # Even channels is ffc_fc, odd channels is gate + dim_size = intermediate_parallel.shape[-1] + ffn_fc = intermediate_parallel[..., : dim_size // 2] + gate = intermediate_parallel[..., dim_size // 2 :] + intermediate_parallel = F.silu(ffn_fc) * gate + # [s, b, h] + output = self.dense_4h_to_h(intermediate_parallel) + return output + + +class GLMBlock(nn.Layer): + """A single transformer layer. + + Transformer layer takes input with size [s, b, h] and returns an + output of the same size. + """ + + def __init__(self, config: ChatGLMv2Config, layer_number): + super(GLMBlock, self).__init__() + self.layer_number = layer_number + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + + self.fp32_residual_connection = config.fp32_residual_connection + + LayerNormFunc = RMSNorm if config.rmsnorm else nn.LayerNorm + # Layernorm on the input data. + self.input_layernorm = LayerNormFunc(config.hidden_size, epsilon=config.layernorm_epsilon) + + # Self attention. + self.self_attention = SelfAttention(config, layer_number) + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = LayerNormFunc(config.hidden_size, epsilon=config.layernorm_epsilon) + + # MLP + self.mlp = MLP(config) + + def forward( + self, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=None, + use_cache=True, + ): + # hidden_states: [s, b, h] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Self attention. + attention_output, kv_cache = self.self_attention( + layernorm_output, attention_mask, rotary_pos_emb, kv_cache=kv_cache, use_cache=use_cache + ) + + # Residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + layernorm_input = F.dropout(attention_output, p=self.hidden_dropout, training=self.training) + layernorm_input = residual + layernorm_input + + # Layer norm post the self attention. + layernorm_output = self.post_attention_layernorm(layernorm_input) + + # MLP. + mlp_output = self.mlp(layernorm_output) + + # Second residual connection. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = layernorm_input + + output = F.dropout(mlp_output, p=self.hidden_dropout, training=self.training) + output = residual + output + return output, kv_cache + + +class GLMTransformer(nn.Layer): + """Transformer class.""" + + def __init__(self, config: ChatGLMv2Config): + super(GLMTransformer, self).__init__() + self.config = config + self.enable_recompute = False + self.fp32_residual_connection = config.fp32_residual_connection + self.post_layer_norm = config.post_layer_norm + + # Number of layers. + self.num_hidden_layers = config.num_hidden_layers + + # Transformer layers. + def build_layer(layer_number): + return GLMBlock(config, layer_number) + + self.layers = nn.LayerList([build_layer(i + 1) for i in range(self.num_hidden_layers)]) + + if self.post_layer_norm: + LayerNormFunc = RMSNorm if config.rmsnorm else nn.LayerNorm + # Final layer norm before output. 
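+ # RMSNorm is used when `config.rmsnorm` is set, otherwise a standard
+ # LayerNorm; it is only created (and applied at the end of forward) when
+ # `post_layer_norm` is enabled.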
+ self.final_layernorm = LayerNormFunc(config.hidden_size, epsilon=config.layernorm_epsilon) + + def _get_layer(self, layer_number): + return self.layers[layer_number] + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module: nn.Layer, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + rotary_embeds: paddle.Tensor, + kv_cache: paddle.Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states, kv_cache = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + rotary_embeds, + kv_cache, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states, kv_cache + + def forward( + self, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_caches=None, + use_cache: Optional[bool] = True, + output_hidden_states: Optional[bool] = False, + ): + if not kv_caches: + kv_caches = [None for _ in range(self.num_hidden_layers)] + presents = () if use_cache else None + all_self_attentions = None + all_hidden_states = () if output_hidden_states else None + + zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype) + neg_inf = paddle.full_like(attention_mask, paddle.finfo(hidden_states.dtype).min, dtype=hidden_states.dtype) + attention_mask = paddle.where(attention_mask, zero, neg_inf) + + for index in range(self.num_hidden_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer = self._get_layer(index) + + if self.enable_recompute and not hidden_states.stop_gradient: + hidden_states, kv_cache = self.recompute_training( + layer, + hidden_states, + attention_mask, + rotary_pos_emb, + kv_cache=kv_caches[index], + use_cache=use_cache, + ) + else: + hidden_states, kv_cache = layer( + hidden_states, attention_mask, rotary_pos_emb, kv_cache=kv_caches[index], use_cache=use_cache + ) + + if use_cache: + presents = presents + (kv_cache,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # Final layer norm. + if self.post_layer_norm: + hidden_states = self.final_layernorm(hidden_states) + + return hidden_states, presents, all_hidden_states, all_self_attentions + + +class ChatGLMv2PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
+ """ + + config_class = ChatGLMv2Config + pretrained_resource_files_map = CHATGLM_V2_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "chatglm_v2" + + def get_masks(self, input_ids, past_key_values, padding_mask=None): + batch_size, seq_length = input_ids.shape + + # casual mask + casual_mask = paddle.tril(paddle.ones([batch_size, 1, seq_length, seq_length])).astype("bool") + past_length = 0 + if past_key_values: + past_length = past_key_values[0][0].shape[0] + if past_length: + casual_mask = paddle.concat( + [paddle.ones([batch_size, 1, seq_length, past_length], dtype="bool"), casual_mask], axis=-1 + ) + + # seq_mask + if padding_mask is None: + padding_mask = paddle.ones((batch_size, 1, seq_length, seq_length + past_length), dtype="bool") + if len(padding_mask.shape) == 2: + # from Tokenizer + padding_mask = ( + padding_mask.unsqueeze(axis=[1, 2]) + .expand([batch_size, 1, seq_length, seq_length + past_length]) + .astype("bool") + ) + elif len(padding_mask.shape) == 3: + # [batch_size,tgt_length, src_length] -> [batch_size, 1, tgt_length, src_length] + padding_mask = padding_mask.unsqueeze(1).astype("bool") + elif len(padding_mask.shape) == 4: + padding_mask = padding_mask.astype("bool") + + casual_mask = casual_mask & padding_mask + + return casual_mask + + def get_position_ids(self, input_ids): + batch_size, seq_length = input_ids.shape + position_ids = paddle.arange(seq_length, dtype="int64").unsqueeze(0).tile([batch_size, 1]) + return position_ids + + @classmethod + def _get_name_mappings(cls, config: ChatGLMv2Config) -> List[StateDictNameMapping]: + mappings = [ + "embedding.word_embeddings.weight", + "rotary_pos_emb.inv_freq", + "encoder.final_layernorm.weight", + ] + + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"encoder.layers.{layer_index}.input_layernorm.weight", + f"encoder.layers.{layer_index}.input_layernorm.weight", + ], + [ + f"encoder.layers.{layer_index}.self_attention.query_key_value.weight", + f"encoder.layers.{layer_index}.self_attention.query_key_value.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.self_attention.query_key_value.bias", + f"encoder.layers.{layer_index}.self_attention.query_key_value.bias", + ], + [ + f"encoder.layers.{layer_index}.self_attention.dense.weight", + f"encoder.layers.{layer_index}.self_attention.dense.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.post_attention_layernorm.weight", + f"encoder.layers.{layer_index}.post_attention_layernorm.weight", + ], + [ + f"encoder.layers.{layer_index}.mlp.dense_h_to_4h.weight", + f"encoder.layers.{layer_index}.mlp.dense_h_to_4h.weight", + "transpose", + ], + [ + f"encoder.layers.{layer_index}.mlp.dense_4h_to_h.weight", + f"encoder.layers.{layer_index}.mlp.dense_4h_to_h.weight", + "transpose", + ], + ] + mappings.extend(layer_mappings) + + init_name_mappings(mappings) + + if config.architectures is not None: + if "ChatGLMv2ForCausalLM" in config.architectures: + mappings.extend( + [ + [ + "output_layer.weight", + "output_layer.weight", + "transpose", + ] + ] + ) + + for mapping in mappings: + mapping[0] = "transformer." + mapping[0] + if len(mapping) > 1 and mapping[1] is not None: + mapping[1] = "chatglm_v2." 
+ mapping[1] + + init_name_mappings(mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] + + +class Embedding(nn.Layer): + """Language model embeddings.""" + + def __init__(self, config: ChatGLMv2Config): + super(Embedding, self).__init__() + + self.hidden_size = config.hidden_size + self.word_embeddings = nn.Embedding(config.padded_vocab_size, self.hidden_size) + self.fp32_residual_connection = config.fp32_residual_connection + + def forward(self, input_ids): + # Embeddings. + embeddings = self.word_embeddings(input_ids) + # Data format change to avoid explicit tranposes + # [batch_size, seq_length, hidden_size] --> [seq_length, batch_size, hidden_size]. + embeddings = embeddings.transpose([1, 0, 2]) + # If the input flag for fp32 residual connection is set, convert for float. + if self.fp32_residual_connection: + embeddings = embeddings.astype("float32") + return embeddings + + +@register_base_model +class ChatGLMv2Model(ChatGLMv2PretrainedModel): + def __init__(self, config: ChatGLMv2Config, empty_init=True): + super().__init__(config) + self.embedding = Embedding(config) + + # Rotary positional embeddings + self.max_sequence_length = config.max_sequence_length + rotary_dim = ( + config.hidden_size // config.num_attention_heads if config.kv_channels is None else config.kv_channels + ) + if config.use_long_sequence_strategies: + self.config = config + self.rotary_pos_emb = LongSequenceStrategies.build_long_sequence_strategy( + config.long_sequence_strategy_type, + config.long_sequence_strategy_name, + **config.long_sequence_init_args, + ) + else: + self.rotary_pos_emb = RotaryEmbedding(rotary_dim // 2) + self.encoder = GLMTransformer(config) + self.output_layer = nn.Linear(config.hidden_size, config.padded_vocab_size, bias_attr=False) + + def get_input_embeddings(self): + return self.embedding.word_embeddings + + def set_input_embeddings(self, value): + self.embedding.word_embeddings = value + + def forward( + self, + input_ids, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + full_attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor, paddle.Tensor], ...]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + batch_size, seq_length = input_ids.shape + + if inputs_embeds is None: + inputs_embeds = self.embedding(input_ids) + + full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask) + + # Rotary positional embeddings + if self.config.use_long_sequence_strategies: + cos, sin = self.rotary_pos_emb(seq_len=self.max_sequence_length) + cos, cos = paddle.chunk(cos, 2, axis=-1) + sin, sin = paddle.chunk(sin, 2, axis=-1) + rotary_pos_emb = paddle.stack([cos, sin], axis=-1) + else: + rotary_pos_emb = self.rotary_pos_emb(self.max_sequence_length) + + if position_ids is not None: + rotary_pos_emb = rotary_pos_emb[position_ids] + else: + rotary_pos_emb = rotary_pos_emb[None, :seq_length] + + rotary_pos_emb = rotary_pos_emb.transpose([1, 0, 2, 3]) + + # Run encoder. 
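+ # The encoder consumes sequence-first embeddings ([seq_len, batch, hidden])
+ # together with the combined causal/padding mask and the rotary cache built
+ # above, and returns the final hidden states plus per-layer key/value caches
+ # when `use_cache` is set.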
+ hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder( + inputs_embeds, + full_attention_mask, + rotary_pos_emb=rotary_pos_emb, + kv_caches=past_key_values, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + ) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + ) + + +class ChatGLMv2ForCausalLM(ChatGLMv2PretrainedModel): + def __init__(self, config: ChatGLMv2Config): + super().__init__(config) + self.max_sequence_length = config.max_sequence_length + self.chatglm_v2 = ChatGLMv2Model(config) + + def reorder_cache(self, cache: paddle.Tensor, beam_idx): + cache = map_structure(lambda x: paddle.index_select(x, beam_idx, axis=1), cache) + return cache + + def update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = outputs[1] if isinstance(outputs, tuple) else outputs["past_key_values"] + + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + new_attention_mask = paddle.ones((attention_mask.shape[0], 1), dtype=attention_mask.dtype) + model_kwargs["attention_mask"] = paddle.concat([attention_mask, new_attention_mask], axis=-1) + + # update position ids + if model_kwargs.get("position_ids", None) is not None: + position_ids = model_kwargs["position_ids"] + new_position_id = position_ids[..., -1:].clone() + new_position_id += 1 + model_kwargs["position_ids"] = paddle.concat([position_ids, new_position_id], axis=-1) + + model_kwargs["is_first_forward"] = False + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids: paddle.Tensor, + past_key_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + is_first_forward: bool = True, + **kwargs + ) -> dict: + # only last token for input_ids if past is not None + if position_ids is None: + position_ids = self.get_position_ids(input_ids) + if not is_first_forward: + position_ids = position_ids[..., -1:] + input_ids = input_ids[:, -1:] + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "position_ids": position_ids, + "attention_mask": attention_mask, + "return_last_logit": True, + "use_cache": True, + } + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_last_logit: Optional[bool] = False, + ): + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if 
return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.chatglm_v2( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + if return_last_logit: + hidden_states = hidden_states[-1:] + lm_logits = self.chatglm_v2.output_layer(hidden_states) + lm_logits = lm_logits.transpose([1, 0, 2]) + + loss = None + if labels is not None: + reshaped_logits = lm_logits.reshape([-1, lm_logits.shape[-1]]).astype("float32") + reshaped_labels = labels.reshape([-1]) + + loss_fn = nn.CrossEntropyLoss(reduction="none") + + loss_mask = (labels != -100).astype("float32") + loss = loss_fn(reshaped_logits, reshaped_labels) + loss = paddle.sum(loss.reshape([-1]).cast(paddle.float32) * loss_mask.reshape([-1]).cast(paddle.float32)) + loss = loss / loss_mask.sum() + + lm_logits = lm_logits.astype(hidden_states.dtype) + loss = loss.astype(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/tokenizer.py new file mode 100644 index 000000000..6913418a0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chatglm_v2/tokenizer.py @@ -0,0 +1,322 @@ +# Copyright (c) 2023 ChatGLM2-6B Model Team and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import os +import re +from typing import Any, Dict, List, Optional, Union + +import numpy as np +from sentencepiece import SentencePieceProcessor + +from .. 
import PretrainedTokenizer +from ..tokenizer_utils_base import BatchEncoding, PaddingStrategy + + +class SPTokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = [ + "[MASK]", + "[gMASK]", + "[sMASK]", + "sop", + "eop", + "<|system|>", + "<|user|>", + "<|assistant|>", + "<|observation|>", + ] + + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + # add eos/pad/unk token to special_token_expression + all_special_tokens = list(self.special_tokens.keys()) + ["", ""] + self.special_token_expression = "|".join([re.escape(token) for token in all_special_tokens]) + + def tokenize(self, s: str, encode_special_tokens=False): + if encode_special_tokens: + last_index = 0 + t = [] + for match in re.finditer(self.special_token_expression, s): + if last_index < match.start(): + t.extend(self.sp_model.EncodeAsPieces(s[last_index : match.start()])) + t.append(s[match.start() : match.end()]) + last_index = match.end() + if last_index < len(s): + t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) + return t + else: + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens or index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMv2Tokenizer(PretrainedTokenizer): + resource_files_names = {"vocab_file": "tokenizer.model"} + model_input_names = ["input_ids", "attention_mask", "position_ids"] + pretrained_resource_files_map = { + "model_file": { + "THUDM/chatglm2-6b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/chatglm2-6b/tokenizer.model", + } + } + + # always encode special tokens, eg: , [gMASK], [MASK] ... 
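
To see what `encode_special_tokens` buys you, here is a small standalone sketch of the regex splitting performed in `SPTokenizer.tokenize`. Text between special tokens would normally be passed to SentencePiece; for illustration it is simply kept as plain substrings, so no model file is needed, and the helper name `split_on_special_tokens` is assumed, not part of the tokenizer API.

```python
import re

special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop",
                  "<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
special_token_expression = "|".join(re.escape(t) for t in special_tokens)

def split_on_special_tokens(s: str):
    """Return (is_special, substring) chunks; non-special chunks would go to SentencePiece."""
    pieces, last = [], 0
    for match in re.finditer(special_token_expression, s):
        if last < match.start():
            pieces.append((False, s[last:match.start()]))
        pieces.append((True, s[match.start():match.end()]))
        last = match.end()
    if last < len(s):
        pieces.append((False, s[last:]))
    return pieces

print(split_on_special_tokens("<|user|>你好[gMASK]sop"))
# [(True, '<|user|>'), (False, '你好'), (True, '[gMASK]'), (True, 'sop')]
```
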
+ def __init__(self, vocab_file, padding_side="left", encode_special_tokens=True, **kwargs): + super().__init__(padding_side=padding_side, **kwargs) + self.name = "ChatGLMv2Tokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id, + "": self.tokenizer.pad_id, + } + self.encode_special_tokens = encode_special_tokens + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if os.path.isdir(save_directory): + vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"]) + else: + vocab_file = save_directory + + with open(self.vocab_file, "rb") as fin: + proto_str = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_str) + + return (vocab_file,) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_prompt(self, query, history=None): + if history is None: + history = [] + prompt = "" + for i, (old_query, response) in enumerate(history): + prompt += "[Round {}]\n\n问:{}\n\n答:{}\n\n".format(i + 1, old_query, response) + prompt += "[Round {}]\n\n问:{}\n\n答:".format(len(history) + 1, query) + return prompt + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. 
+ + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict, BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. 
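
The left-padding branch that follows is easier to follow with concrete numbers. This is a minimal sketch of the 2-D case only (no 3-D/4-D attention masks), assuming a toy pad token id of 0 and a target length of 6; the helper name `left_pad` and the example token ids are illustrative, not part of the tokenizer API.

```python
def left_pad(input_ids, pad_token_id, max_length):
    """Mirror the 2-D branch of the left-padding logic: pad ids, mask and positions on the left."""
    attention_mask = [1] * len(input_ids)
    position_ids = list(range(len(input_ids)))
    difference = max_length - len(input_ids)
    return {
        "input_ids": [pad_token_id] * difference + input_ids,
        "attention_mask": [0] * difference + attention_mask,
        "position_ids": [0] * difference + position_ids,
    }

print(left_pad([64790, 64792, 30910, 34607], pad_token_id=0, max_length=6))
# {'input_ids': [0, 0, 64790, 64792, 30910, 34607],
#  'attention_mask': [0, 0, 1, 1, 1, 1],
#  'position_ids': [0, 0, 0, 1, 2, 3]}
```
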
+ if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + # 3D/4D attention mask + if len(np.shape(encoded_inputs["attention_mask"])) > 2: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + # 2D attention mask + else: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs + + def encode_chat_inputs(self, conversations: List[List[str, str]], context_data: Dict[str, Any] = {}): + # encode system + result = super().encode_chat_inputs(conversations, context_data=context_data) + if "system" in result: + result["system"] = self.get_prefix_tokens() + result["system"] + else: + result["conversations"][0][0] = self.get_prefix_tokens() + result["conversations"][0][0] + + return result diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/configuration.py new file mode 100644 index 000000000..f8000b7a3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/configuration.py @@ -0,0 +1,181 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" ChineseBERT model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "CHINESEBERT_PRETRAINED_INIT_CONFIGURATION", + "ChineseBertConfig", + "CHINESEBERT_PRETRAINED_RESOURCE_FILES_MAP", +] + +CHINESEBERT_PRETRAINED_INIT_CONFIGURATION = { + "ChineseBERT-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 23236, + "glyph_embedding_dim": 1728, + "pinyin_map_len": 32, + }, + "ChineseBERT-large": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 23236, + "glyph_embedding_dim": 1728, + "pinyin_map_len": 32, + }, +} + +CHINESEBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ChineseBERT-base": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese_bert/chinesebert-base/model_state.pdparams", + "ChineseBERT-large": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese_bert/chinesebert-large/model_state.pdparams", + } +} + + +class ChineseBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChineseBertModel`]. It is used to + instantiate a ChineseBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ChineseBERT + ChineseBERT-base architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the ChineseBERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + glyph_embedding_dim (`int`, *optional*): + The dim of glyph_embedding. + pinyin_embedding_size (`int`, *optional*): + pinyin embedding size + pinyin_map_len (int, *optional*): + The length of pinyin map. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from paddlenlp.transformers import BertModel, BertConfig + + >>> # Initializing a ChineseBERT bert-base-uncased style configuration + >>> configuration = ChineseBertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = ChineseBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "chinesebert" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = CHINESEBERT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 23236, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 16, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + layer_norm_eps=1e-12, + glyph_embedding_dim=1728, + pinyin_embedding_size=128, + pinyin_map_len=32, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.layer_norm_eps = layer_norm_eps + self.glyph_embedding_dim = glyph_embedding_dim + self.pinyin_embedding_size = pinyin_embedding_size + self.pinyin_map_len = pinyin_map_len diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/modeling.py new file mode 100644 index 000000000..c23e50d1b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/modeling.py @@ -0,0 +1,822 @@ +# encoding=utf8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License + +# Copyright (c) 2021 ShannonAI + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlenlp.transformers import PretrainedModel, register_base_model + +from .configuration import ( + CHINESEBERT_PRETRAINED_INIT_CONFIGURATION, + CHINESEBERT_PRETRAINED_RESOURCE_FILES_MAP, + ChineseBertConfig, +) + +__all__ = [ + "ChineseBertModel", + "ChineseBertPretrainedModel", + "ChineseBertForPretraining", + "ChineseBertPretrainingCriterion", + "ChineseBertForSequenceClassification", + "ChineseBertForTokenClassification", + "ChineseBertForQuestionAnswering", +] + + +class PinyinEmbedding(nn.Layer): + def __init__(self, config: ChineseBertConfig): + """Pinyin Embedding Layer""" + super(PinyinEmbedding, self).__init__() + self.embedding = nn.Embedding(config.pinyin_map_len, config.pinyin_embedding_size) + self.pinyin_out_dim = config.hidden_size + self.conv = nn.Conv1D( + in_channels=config.pinyin_embedding_size, + out_channels=self.pinyin_out_dim, + kernel_size=2, + stride=1, + padding=0, + ) + + def forward(self, pinyin_ids): + """ + Args: + pinyin_ids (Tensor): Its shape is (bs*sentence_length*pinyin_locs). + + Returns: + pinyin_embed (Tensor): Its shape is (bs,sentence_length,pinyin_out_dim). 
+ + """ + # input pinyin ids for 1-D conv + embed = self.embedding(pinyin_ids) # [bs,sentence_length*pinyin_locs,embed_size] + bs, sentence_length, pinyin_locs, embed_size = embed.shape + view_embed = embed.reshape( + shape=[-1, pinyin_locs, embed_size] + ) # [(bs*sentence_length),pinyin_locs,embed_size] + input_embed = view_embed.transpose([0, 2, 1]) # [(bs*sentence_length), embed_size, pinyin_locs] + # conv + max_pooling + pinyin_conv = self.conv(input_embed) # [(bs*sentence_length),pinyin_out_dim,H] + pinyin_embed = F.max_pool1d(pinyin_conv, pinyin_conv.shape[-1]) # [(bs*sentence_length),pinyin_out_dim,1] + return pinyin_embed.reshape( + shape=[bs, sentence_length, self.pinyin_out_dim] + ) # [bs,sentence_length,pinyin_out_dim] + + +class GlyphEmbedding(nn.Layer): + """Glyph2Image Embedding.""" + + def __init__(self, config: ChineseBertConfig): + super(GlyphEmbedding, self).__init__() + self.embedding = nn.Embedding(num_embeddings=config.vocab_size, embedding_dim=config.glyph_embedding_dim) + + def forward(self, input_ids): + """ + Get glyph images for batch inputs. + + Args: + input_ids (Tensor): Its shape is [batch, sentence_length]. + + Returns: + images (Tensor): Its shape is [batch, sentence_length, self.font_num*self.font_size*self.font_size]. + + """ + return self.embedding(input_ids) + + +class FusionBertEmbeddings(nn.Layer): + """ + Construct the embeddings from word, position, glyph, pinyin and token_type embeddings. + """ + + def __init__(self, config: ChineseBertConfig): + super(FusionBertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.pinyin_embeddings = PinyinEmbedding(config) + self.glyph_embeddings = GlyphEmbedding(config) + + self.glyph_map = nn.Linear(config.glyph_embedding_dim, config.hidden_size) + self.map_fc = nn.Linear(config.hidden_size * 3, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", + paddle.expand(paddle.arange(config.max_position_embeddings, dtype="int64"), shape=[1, -1]), + ) + + def forward(self, input_ids, pinyin_ids, token_type_ids=None, position_ids=None): + + input_shape = input_ids.shape + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + # get char embedding, pinyin embedding and glyph embedding + word_embeddings = self.word_embeddings(input_ids) # [bs,l,hidden_size] + + pinyin_embeddings = self.pinyin_embeddings( + pinyin_ids.reshape(shape=[input_shape[0], seq_length, 8]) + ) # [bs,l,hidden_size] + + glyph_embeddings = self.glyph_map(self.glyph_embeddings(input_ids)) # [bs,l,hidden_size] + # fusion layer + concat_embeddings = paddle.concat((word_embeddings, pinyin_embeddings, glyph_embeddings), axis=2) + inputs_embeds = self.map_fc(concat_embeddings) + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = 
self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Same as BertLMPredictionHead +class ChineseBertLMPredictionHead(nn.Layer): + """ + Language Modeling head + """ + + def __init__(self, config: ChineseBertConfig, embedding_weights=None): + super(ChineseBertLMPredictionHead, self).__init__() + + self.transform = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = getattr(nn.functional, config.hidden_act) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.decoder_weight = ( + self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.transform.weight.dtype, is_bias=False + ) + if embedding_weights is None + else embedding_weights + ) + + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +# Same as BertPretrainingHeads +class ChineseBertPretrainingHeads(nn.Layer): + """ + Perform language modeling task and next sentence classification task. + + Args: + config (:class:`ChineseBertConfig`): + An instance of ChineseBertConfig used to construct ChineseBertPretrainingHeads. + embedding_weights (Tensor, optional): + Decoding weights used to map hidden_states to logits of the masked token prediction. + Its data type should be float32 and its shape is [vocab_size, hidden_size]. + Defaults to `None`, which means use the same weights of the embedding layer. + + """ + + def __init__(self, config: ChineseBertConfig, embedding_weights=None): + super(ChineseBertPretrainingHeads, self).__init__() + self.predictions = ChineseBertLMPredictionHead(config, embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output, masked_positions=None): + """ + Args: + sequence_output(Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + pooled_output(Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + masked_positions(Tensor, optional): + A tensor indicates positions to be masked in the position embedding. + Its data type should be int64 and its shape is [batch_size, mask_token_num]. + `mask_token_num` is the number of masked tokens. It should be no bigger than `sequence_length`. + Defaults to `None`, which means we output hidden-states of all tokens in masked token prediction. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. 
+ Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ + prediction_scores = self.predictions(sequence_output, masked_positions) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +# Same as BertPooler +class ChineseBertPooler(nn.Layer): + """ + Pool the result of ChineseBertEncoder. + """ + + def __init__(self, config): + """init the bert pooler with config & args/kwargs + + Args: + config (:class:`ChineseBertConfig`): An instance of ChineseBertConfig. + """ + super(ChineseBertPooler, self).__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + self.pool_act = config.pool_act + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.pool_act == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ChineseBertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ChineseBert models. It provides ChineseBert related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + base_model_prefix = "chinesebert" + pretrained_resource_files_map = CHINESEBERT_PRETRAINED_RESOURCE_FILES_MAP + pretrained_init_configuration = CHINESEBERT_PRETRAINED_INIT_CONFIGURATION + config_class = ChineseBertConfig + + def _init_weights(self, layer): + """Initialize the weights.""" + + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class ChineseBertModel(ChineseBertPretrainedModel): + """ + The bare ChineseBert Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ChineseBertConfig`): + An instance of ChineseBertConfig used to construct ChineseBertModel. 
+ + """ + + def __init__(self, config: ChineseBertConfig): + super(ChineseBertModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.layer_norm_eps = config.layer_norm_eps + self.initializer_range = config.initializer_range + self.embeddings = FusionBertEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = ChineseBertPooler(config) + + def forward( + self, + input_ids, + pinyin_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + output_hidden_states=False, + ): + r""" + The ChineseBert forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + pinyin_ids (Tensor, optional): + Indices of input sequence tokens pinyin. We apply a CNN model with width 2 on the pinyin + sequence, followed by max-pooling to derive the resulting pinyin embedding. This makes output + dimensionality immune to the length of the input pinyin sequence. The length of the input pinyin + sequence is fixed at 8. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length, 8]. + Defaults to `None`, which means we don't add pinyin embeddings. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `False`. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`) or (`encoder_outputs`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. 
+ It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + - `encoder_outputs` (List(Tensor)): + A list of Tensor containing hidden-states of the model at each hidden layer in the Transformer encoder. + The length of the list is `num_hidden_layers`. + Each Tensor has a data type of float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ChineseBertModel, ChineseBertTokenizer + + tokenizer = ChineseBertTokenizer.from_pretrained('ChineseBERT-base') + model = ChineseBertModel.from_pretrained('ChineseBERT-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4, + axis=[1, 2], + ) + elif attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + embedding_output = self.embeddings( + input_ids=input_ids, + pinyin_ids=pinyin_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + ) + print(embedding_output.shape) + + if output_hidden_states: + output = embedding_output + encoder_outputs = [] + for mod in self.encoder.layers: + output = mod(output, src_mask=attention_mask) + encoder_outputs.append(output) + if self.encoder.norm is not None: + encoder_outputs[-1] = self.encoder.norm(encoder_outputs[-1]) + pooled_output = self.pooler(encoder_outputs[-1]) + else: + sequence_output = self.encoder(embedding_output, src_mask=attention_mask) + pooled_output = self.pooler(sequence_output) + if output_hidden_states: + return encoder_outputs, pooled_output + else: + return sequence_output, pooled_output + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + +class ChineseBertForQuestionAnswering(ChineseBertPretrainedModel): + """ + ChineseBert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`ChineseBertConfig`): + An instance of ChineseBertConfig used to construct ChineseBertForQuestionAnswering. + """ + + def __init__(self, config: ChineseBertConfig): + super(ChineseBertForQuestionAnswering, self).__init__(config) + self.chinesebert = ChineseBertModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, pinyin_ids=None, token_type_ids=None, attention_mask=None): + r""" + The ChineseBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ChineseBertModel`. + pinyin_ids (Tensor, optional): + See :class:`ChineseBertModel`. + token_type_ids (Tensor, optional): + See :class:`ChineseBertModel`. + attention_mask (Tensor, optional): + See :class:`ChineseBertModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). 
+ + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.chinesebert.modeling import ChineseBertForQuestionAnswering + from paddlenlp.transformers.chinesebert.tokenizer import ChineseBertTokenizer + + tokenizer = ChineseBertTokenizer.from_pretrained('ChineseBERT-base') + model = ChineseBertForQuestionAnswering.from_pretrained('ChineseBERT-base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ + sequence_output, _ = self.chinesebert( + input_ids, pinyin_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, position_ids=None + ) + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + return start_logits, end_logits + + +class ChineseBertForSequenceClassification(ChineseBertPretrainedModel): + """ + ChineseBert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ChineseBertConfig`): + An instance of ChineseBertConfig used to construct ChineseBertForSequenceClassification.e. + """ + + def __init__(self, config: ChineseBertConfig): + super(ChineseBertForSequenceClassification, self).__init__(config) + self.chinesebert = ChineseBertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, pinyin_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The ChineseBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ChineseBertModel`. + pinyin_ids (Tensor, optional): + See :class:`ChineseBertModel`. + token_type_ids (Tensor, optional): + See :class:`ChineseBertModel`. + position_ids(Tensor, optional): + See :class:`ChineseBertModel`. + attention_mask (list, optional): + See :class:`ChineseBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.chinesebert.modeling import ChineseBertForSequenceClassification + from paddlenlp.transformers.chinesebert.tokenizer import ChineseBertTokenizer + + tokenizer = ChineseBertTokenizer.from_pretrained('ChineseBERT-base') + model = ChineseBertForSequenceClassification.from_pretrained('ChineseBERT-base', num_classes=2) + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 2] + + """ + + _, pooled_output = self.chinesebert( + input_ids, + pinyin_ids=pinyin_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return logits + + +class ChineseBertForTokenClassification(ChineseBertPretrainedModel): + """ + ChineseBert Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`ChineseBertConfig`): + An instance of ChineseBertConfig used to construct ChineseBertForTokenClassification.e. + """ + + def __init__(self, config: ChineseBertConfig): + super(ChineseBertForTokenClassification, self).__init__(config) + self.chinesebert = ChineseBertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, pinyin_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The ChineseBertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ChineseBertModel`. + pinyin_ids (Tensor, optional): + See :class:`ChineseBertModel`. + token_type_ids (Tensor, optional): + See :class:`ChineseBertModel`. + position_ids(Tensor, optional): + See :class:`ChineseBertModel`. + attention_mask (list, optional): + See :class:`ChineseBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.chinesebert.modeling import ChineseBertForSequenceClassification + from paddlenlp.transformers.chinesebert.tokenizer import ChineseBertTokenizer + + tokenizer = ChineseBertTokenizer.from_pretrained('ChineseBERT-base') + model = ChineseBertForSequenceClassification.from_pretrained('ChineseBERT-base', num_classes=2) + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 13, 2] + + """ + sequence_output, _ = self.chinesebert( + input_ids, + pinyin_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits + + +class ChineseBertForPretraining(ChineseBertPretrainedModel): + """ + ChineseBert Model with pretraining tasks on top. + + Args: + config (:class:`ChineseBertConfig`): + An instance of ChineseBertConfig used to construct ChineseBertForPretraining.e. 
+ + """ + + def __init__(self, config: ChineseBertConfig): + super(ChineseBertForPretraining, self).__init__(config) + self.chinesebert = ChineseBertModel(config) + self.cls = ChineseBertPretrainingHeads( + config, + embedding_weights=self.chinesebert.embeddings.word_embeddings.weight, + ) + + def forward( + self, + input_ids, + pinyin_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + masked_positions=None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ChineseBertModel`. + pinyin_ids (Tensor, optional): + See :class:`ChineseBertModel`. + token_type_ids (Tensor, optional): + See :class:`ChineseBertModel`. + position_ids (Tensor, optional): + See :class:`ChineseBertModel`. + attention_mask (Tensor, optional): + See :class:`ChineseBertModel`. + masked_positions(Tensor, optional): + See :class:`ChineseBertPretrainingHeads`. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ + with paddle.static.amp.fp16_guard(): + outputs = self.chinesebert( + input_ids, + pinyin_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_positions) + return prediction_scores, seq_relationship_score + + +class ChineseBertPretrainingCriterion(nn.Layer): + """ + + Args: + vocab_size(int): + Vocabulary size of `inputs_ids` in `ChineseBertModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `ChineseBertBertModel`. + + """ + + def __init__(self, vocab_size): + super(ChineseBertPretrainingCriterion, self).__init__() + # CrossEntropyLoss is expensive since the inner reshape (copy) + self.loss_fn = nn.loss.CrossEntropyLoss(ignore_index=-1) + self.vocab_size = vocab_size + + def forward( + self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale + ): + """ + Args: + prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size] + seq_relationship_score(Tensor): + The scores of next sentence prediction. Its data type should be float32 and + its shape is [batch_size, 2] + masked_lm_labels(Tensor): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. + Otherwise, its shape is [batch_size, mask_token_num, 1] + next_sentence_labels(Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and + its shape is [batch_size, 1] + masked_lm_scale(Tensor or int): + The scale of masked tokens. Used for the normalization of masked language modeling loss. 
+ If it is a `Tensor`, its data type should be int64 and its shape is equal to `prediction_scores`. + + Returns: + Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. + Its data type should be float32 and its shape is [1]. + + + """ + with paddle.static.amp.fp16_guard(): + masked_lm_loss = F.cross_entropy(prediction_scores, masked_lm_labels, reduction="none", ignore_index=-1) + masked_lm_loss = masked_lm_loss / masked_lm_scale + next_sentence_loss = F.cross_entropy(seq_relationship_score, next_sentence_labels, reduction="none") + return paddle.sum(masked_lm_loss) + paddle.mean(next_sentence_loss) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/tokenizer.py new file mode 100644 index 000000000..40ffba601 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chinesebert/tokenizer.py @@ -0,0 +1,759 @@ +# encoding=utf8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License + +# Copyright (c) 2021 ShannonAI + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from functools import lru_cache + +from paddle.utils import try_import + +from paddlenlp.transformers import BertTokenizer + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"ChineseBERT-base": 512, "ChineseBERT-large": 512} + + +class ChineseBertTokenizer(BertTokenizer): + """ + Construct a ChineseBert tokenizer. `ChineseBertTokenizer` is similar to `BertTokenizerr`. + The difference between them is that ChineseBert has the extra process about pinyin id. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. 
+ Defaults to `True`. + pinyin_map (dict): + A dict of pinyin map, the map between pinyin char and id. pinyin char is 26 Romanian characters and 0-5 numbers. + Defaults to None. + id2pinyin (dict): + A dict of char id map tensor. + Defaults to None. + pinyin2tensor (dict): + A dict of pinyin map tensor. + Defaults to None. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + + Examples: + .. code-block:: + + from paddlenlp.transformers import ChineseBertTokenizer + tokenizer = ChineseBertTokenizer.from_pretrained('ChineseBERT-base') + + inputs = tokenizer('欢迎使用飞桨!') + print(inputs) + + ''' + {'input_ids': [101, 3614, 6816, 886, 4500, 7607, 3444, 8013, 102], + 'pinyin_ids': [0, 0, 0, 0, 0, 0, 0, 0, 13, 26, 6, 19, 1, 0, 0, 0, 30, 14, 19, 12, 2, 0, 0, 0, 24, 13, 14, 3, 0, 0, 0, 0, 30, 20, 19, 12, 4, 0, 0, 0, 11, 10, 14, 1, 0, 0, 0, 0, 15, 14, 6, 19, 12, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + pretrained_resource_files_map = { + "vocab_file": { + "ChineseBERT-base": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese_bert/chinesebert-base/vocab.txt", + "ChineseBERT-large": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese_bert/chinesebert-base/vocab.txt", + }, + "tokenizer_config_file": { + "ChineseBERT-base": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese_bert/chinesebert-large/tokenizer_config.json", + "ChineseBERT-large": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese_bert/chinesebert-large/tokenizer_config.json", + }, + } + pretrained_init_configuration = { + "ChineseBERT-base": {"do_lower_case": True}, + "ChineseBERT-large": {"do_lower_case": True}, + } + padding_side = "right" + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + pinyin_map=None, + id2pinyin=None, + pinyin2tensor=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super().__init__(vocab_file, do_lower_case, unk_token, sep_token, pad_token, cls_token, mask_token, **kwargs) + self.pinyin_dict = pinyin_map + self.id2pinyin = id2pinyin + self.pinyin2tensor = pinyin2tensor + self.special_tokens_pinyin_ids = [0] * 8 + + def encode( + self, + text, + text_pair=None, + max_seq_len=512, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + ): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. 
It supports sequence or sequence pair as input, and batch input + is not allowed. + + Args: + text (str, List[str] or List[int]): + The sequence to be processed. One sequence is a string, a list + of strings, or a list of integers depending on whether it has + been pretokenized and converted to ids. + text_pair (str, List[str] or List[List[str]]): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation_strategy (str, optional): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + + Returns: + dict: + The dict has the following optional items: + + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **pinyin_ids** (list[int]): List of pinyin ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. 
+ - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + ids = get_input_ids(text) + pair_ids = get_input_ids(text_pair) if text_pair is not None else None + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + encoded_inputs = {} + + # Truncation: Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair)) + + token_offset_mapping = self.get_offset_mapping(text) + + if pair: + token_pair_offset_mapping = self.get_offset_mapping(text_pair) + else: + token_pair_offset_mapping = None + + if max_seq_len and total_len > max_seq_len: + ( + ids, + pair_ids, + token_offset_mapping, + token_pair_offset_mapping, + overflowing_tokens, + ) = self.truncate_sequences( + ids, + pair_ids=pair_ids, + token_offset_mapping=token_offset_mapping, + token_pair_offset_mapping=token_pair_offset_mapping, + num_tokens_to_remove=total_len - max_seq_len, + truncation_strategy=truncation_strategy, + ) + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_seq_len + + # Add special tokens + + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + + offset_mapping = self.build_offset_mapping_with_special_tokens(token_offset_mapping, token_pair_offset_mapping) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + encoded_inputs["pinyin_ids"] = self.get_pinyin_ids(text, text_pair, offset_mapping) + + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"]) + + # Check lengths + assert max_seq_len is None or len(encoded_inputs["input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = pad_to_max_seq_len and max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + + if needs_to_be_padded: + difference = max_seq_len - len(encoded_inputs["input_ids"]) + encoded_inputs["pinyin_ids"] = encoded_inputs["pinyin_ids"] + 
self.special_tokens_pinyin_ids * difference + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list(range(len(encoded_inputs["input_ids"]))) + + return encoded_inputs + + def batch_encode( + self, + batch_text_or_text_pairs, + max_seq_len=512, + pad_to_max_seq_len=False, + stride=0, + is_split_into_words=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + ): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports batch inputs of sequence or sequence pair. + + Args: + batch_text_or_text_pairs (list): + The element of list can be sequence or sequence pair, and the + sequence is a string or a list of strings depending on whether + it has been pretokenized. If each sequence is provided as a list + of strings (pretokenized), you must set `is_split_into_words` as + `True` to disambiguate with a sequence pair. + max_seq_len (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + pad_to_max_seq_len (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_seq_len` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. 
+ truncation_strategy (str, optional): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_seq_len` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_seq_len`). + + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + + Returns: + list[dict]: + The dict has the following optional items: + + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **pinyin_ids** (list[int]): List of pinyin ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_seq_len` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. + For a sqecial token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. 
Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + batch_encode_inputs = [] + for example_id, tokens_or_pair_tokens in enumerate(batch_text_or_text_pairs): + if not isinstance(tokens_or_pair_tokens, (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + elif is_split_into_words and not isinstance(tokens_or_pair_tokens[0], (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + else: + text, text_pair = tokens_or_pair_tokens + + if stride > 0 and text_pair is not None: + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) + + max_len_for_pair = max_seq_len - len(first_ids) - self.num_special_tokens_to_add(pair=True) + token_offset_mapping = self.get_offset_mapping(text) + token_pair_offset_mapping = self.get_offset_mapping(text_pair) + + while True: + encoded_inputs = {} + + ids = first_ids + mapping = token_offset_mapping + if len(second_ids) <= max_len_for_pair: + pair_ids = second_ids + pair_mapping = token_pair_offset_mapping + else: + pair_ids = second_ids[:max_len_for_pair] + pair_mapping = token_pair_offset_mapping[:max_len_for_pair] + + offset_mapping = self.build_offset_mapping_with_special_tokens(mapping, pair_mapping) + + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + # add_pinyin_ids + encoded_inputs["pinyin_ids"] = self.get_pinyin_ids(text, text_pair, offset_mapping) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"]) + + # Check lengths + assert max_seq_len is None or len(encoded_inputs["input_ids"]) <= max_seq_len + + # Padding + needs_to_be_padded = ( + pad_to_max_seq_len and max_seq_len and len(encoded_inputs["input_ids"]) < max_seq_len + ) + + encoded_inputs["offset_mapping"] = offset_mapping + + if needs_to_be_padded: + difference = max_seq_len - len(encoded_inputs["input_ids"]) + # padding pinyin_ids + encoded_inputs["pinyin_ids"] = ( + encoded_inputs["pinyin_ids"] + self.special_tokens_pinyin_ids * difference + ) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [ + 0 + ] * difference + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = ( + encoded_inputs["special_tokens_mask"] + [1] * difference + ) + encoded_inputs["input_ids"] = ( + encoded_inputs["input_ids"] + [self.pad_token_id] * difference + ) + encoded_inputs["offset_mapping"] = encoded_inputs["offset_mapping"] + [(0, 0)] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len( + encoded_inputs["input_ids"] + ) + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = [ + self.pad_token_type_id + ] * difference + encoded_inputs["token_type_ids"] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs[ + "special_tokens_mask" + ] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + 
encoded_inputs[ + "input_ids" + ] + encoded_inputs["offset_mapping"] = [(0, 0)] * difference + encoded_inputs["offset_mapping"] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list(range(len(encoded_inputs["input_ids"]))) + + encoded_inputs["overflow_to_sample"] = example_id + batch_encode_inputs.append(encoded_inputs) + + if len(second_ids) <= max_len_for_pair: + break + else: + second_ids = second_ids[max_len_for_pair - stride :] + token_pair_offset_mapping = token_pair_offset_mapping[max_len_for_pair - stride :] + + else: + batch_encode_inputs.append( + self.encode( + text, + text_pair, + max_seq_len=max_seq_len, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation_strategy=truncation_strategy, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) + ) + + return batch_encode_inputs + + def truncate_sequences( + self, + ids, + pair_ids=None, + token_offset_mapping=None, + token_pair_offset_mapping=None, + num_tokens_to_remove=0, + truncation_strategy="longest_first", + stride=0, + ): + """ + Truncates a sequence pair in place to the maximum length. + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + token_offset_mapping (list): The map of tokens and the start and end index of their start and end character + token_pair_offset_mapping(list): The map of token pairs and the start and end index of their start and end character + num_tokens_to_remove (:obj:`int`, `optional`, defaults to ``0``): + number of tokens to remove using the truncation strategy + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_seq_len + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_seq_len) + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_seq_len, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
+ """ + + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == "longest_first": + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + token_offset_mapping = token_offset_mapping[:-1] + else: + pair_ids = pair_ids[:-1] + token_pair_offset_mapping = token_pair_offset_mapping[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == "only_first": + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + token_offset_mapping = token_offset_mapping[:-num_tokens_to_remove] + elif truncation_strategy == "only_second": + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + token_pair_offset_mapping = token_pair_offset_mapping[:-num_tokens_to_remove] + elif truncation_strategy == "do_not_truncate": + raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, token_offset_mapping, token_pair_offset_mapping, overflowing_tokens) + + @lru_cache(9999) + def pinyin_locs_map(self, text): + """ + Get the map of pinyin locations and pinyin tensor. + + Args: + text (str): + The sequence to be processed. + + Returns: + dict: the map of pinyin locations and pinyin tensor. + """ + pinyin = try_import("pypinyin.pinyin") + Style = try_import("pypinyin.Style") + pinyin_list = pinyin( + text, + style=Style.TONE3, + heteronym=True, + errors=lambda x: [["not chinese"] for _ in x], + ) + pinyin_locs = {} + # get pinyin of each location + for index, item in enumerate(pinyin_list): + pinyin_string = item[0] + # not a Chinese character, pass + if pinyin_string == "not chinese": + continue + if pinyin_string in self.pinyin2tensor: + pinyin_locs[index] = self.pinyin2tensor[pinyin_string] + else: + ids = [0] * 8 + for i, p in enumerate(pinyin_string): + if p not in self.pinyin_dict["char2idx"]: + ids = [0] * 8 + break + ids[i] = self.pinyin_dict["char2idx"][p] + pinyin_locs[index] = ids + return pinyin_locs + + def get_pinyin_ids(self, text, text_pair=None, offset_mapping=None): + """ + Find chinese character location, and generate pinyin ids. + + Args: + text (str): + The sequence to be processed. + text_pair (str, optional): + Same as `text` argument, while it represents for the latter sequence of the sequence pair. + Defaults to `None`. + offset_mapping (list, optional): + A list of wordpiece offsets with the appropriate offsets of special tokens. + Defaults to `None`. + + Returns: + list: The list of pinyin id tensor. 
+ """ + + text_pinyin_locs = self.pinyin_locs_map(text) + if text_pair: + text_pair_pinyin_locs = self.pinyin_locs_map(text_pair) + else: + text_pair_pinyin_locs = None + + pinyin_ids = [] + special_token_count = 0 + + for offset in offset_mapping: + if offset == (0, 0): + special_token_count += 1 + + if special_token_count <= 1: + pinyin_locs_maps = text_pinyin_locs + else: + pinyin_locs_maps = text_pair_pinyin_locs + + if offset[1] - offset[0] != 1: + pinyin_ids.extend([0] * 8) + continue + if offset[0] in pinyin_locs_maps: + pinyin_ids.extend(pinyin_locs_maps[offset[0]]) + else: + pinyin_ids.extend([0] * 8) + + return pinyin_ids diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/configuration.py new file mode 100644 index 000000000..65c094aaf --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/configuration.py @@ -0,0 +1,380 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Chinese-CLIP model configuration""" + +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "ChineseCLIPTextConfig", + "ChineseCLIPVisionConfig", + "ChineseCLIPConfig", +] + + +class ChineseCLIPTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate a + Chinese CLIP model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Chinese CLIP + [OFA-Sys/chinese-clip-vit-base-patch16](https: + //huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the CHINESE_CLIP model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`ChineseCLIPModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ChineseCLIPModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `False`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ + Example: + + ```python + >>> from paddlenlp.transformers import ChineseCLIPTextConfig, ChineseCLIPTextModel + + >>> # Initializing a ChineseCLIPTextConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> configuration = ChineseCLIPTextConfig() + + >>> # Initializing a ChineseCLIPTextModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> model = ChineseCLIPTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "chinese_clip_text_model" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + projection_dim=512, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + initializer_factor=1.0, + layer_norm_eps=1e-12, + pad_token_id=0, + pool_act: str = "tanh", + fuse: bool = False, + position_embedding_type="absolute", + use_cache=False, # may has OOM bug, must set this to False, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.fuse = fuse + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from ChineseCLIPConfig + if config_dict.get("model_type") == "chinese_clip": + projection_dim = config_dict.get("projection_dim", None) + config_dict = config_dict["text_config"] + if projection_dim is not None: + config_dict["projection_dim"] = projection_dim + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class ChineseCLIPVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used to instantiate an + ChineseCLIP model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ChineseCLIP + [OFA-Sys/chinese-clip-vit-base-patch16](https: + //huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + Example: + ```python + >>> from paddlenlp.transformers import ChineseCLIPVisionConfig, ChineseCLIPVisionModel + + >>> # Initializing a ChineseCLIPVisionConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> configuration = ChineseCLIPVisionConfig() + + >>> # Initializing a ChineseCLIPVisionModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> model = ChineseCLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "chinese_clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the 
vision config dict if we are loading from ChineseCLIPConfig + if config_dict.get("model_type") == "chinese_clip": + projection_dim = config_dict.get("projection_dim", None) + config_dict = config_dict["vision_config"] + if projection_dim is not None: + config_dict["projection_dim"] = projection_dim + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class ChineseCLIPConfig(PretrainedConfig): + r""" + [`ChineseCLIPConfig`] is the configuration class to store the configuration of a [`ChineseCLIPModel`]. It is used + to instantiate a Chinese-CLIP model according to the specified arguments, defining the text model and vision model + configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the + Chinese-CLIP [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ChineseCLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ChineseCLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original ChineseCLIP + implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from paddlenlp.transformers import ChineseCLIPConfig, ChineseCLIPModel + + >>> # Initializing a ChineseCLIPConfig with OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> configuration = ChineseCLIPConfig() + + >>> # Initializing a ChineseCLIPModel (with random weights) from the OFA-Sys/chinese-clip-vit-base-patch16 style configuration + >>> model = ChineseCLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a ChineseCLIPConfig from a ChineseCLIPTextConfig and a ChineseCLIPVisionConfig + + >>> # Initializing a ChineseCLIPTextConfig and ChineseCLIPVisionConfig configuration + >>> config_text = ChineseCLIPTextConfig() + >>> config_vision = ChineseCLIPVisionConfig() + + >>> config = ChineseCLIPConfig.from_text_vision_configs(config_text, config_vision) + ``` + """ + + model_type = "chinese_clip" + is_composition = True + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + # If the `_config_dict` kwargs exist, use them for backward compatibility.
+ text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the ChineseCLIPTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the ChineseCLIPVisionConfig with default values.") + + text_config["projection_dim"] = projection_dim + vision_config["projection_dim"] = projection_dim + self.text_config = ChineseCLIPTextConfig(**text_config) + self.vision_config = ChineseCLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_text_vision_configs( + cls, text_config: ChineseCLIPTextConfig, vision_config: ChineseCLIPVisionConfig, **kwargs + ): + r""" + Instantiate a [`ChineseCLIPConfig`] (or a derived class) from Chinese-CLIP text model configuration and + Chinese-CLIP vision model configuration. Returns: + [`ChineseCLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/converter.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/converter.py new file mode 100644 index 000000000..476a145ad --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/converter.py @@ -0,0 +1,301 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
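For quick reference, here is a minimal usage sketch of the configuration classes introduced above (`ChineseCLIPTextConfig`, `ChineseCLIPVisionConfig`, `ChineseCLIPConfig`). It is not part of the patched files; it assumes a PaddleNLP build that exposes these classes under `paddlenlp.transformers` (as the docstring examples above do), and it only exercises behaviour visible in this diff: `from_text_vision_configs`, the propagation of `projection_dim` into both sub-configs, and the `to_dict` override.

```python
# Usage sketch only (not part of the patch). Assumes the classes are importable
# from paddlenlp.transformers, matching the docstring examples above.
from paddlenlp.transformers import (
    ChineseCLIPConfig,
    ChineseCLIPTextConfig,
    ChineseCLIPVisionConfig,
)

# Build the composite config from separately constructed text/vision configs.
text_config = ChineseCLIPTextConfig(hidden_size=768, num_hidden_layers=12)
vision_config = ChineseCLIPVisionConfig(image_size=224, patch_size=32)
config = ChineseCLIPConfig.from_text_vision_configs(
    text_config, vision_config, projection_dim=512
)

# ChineseCLIPConfig.__init__ writes projection_dim into both sub-configs,
# so the text and vision projection heads always agree on the output width.
assert config.text_config.projection_dim == 512
assert config.vision_config.projection_dim == 512

# to_dict() serializes the nested configs and records the composite model_type.
config_dict = config.to_dict()
assert config_dict["model_type"] == "chinese_clip"
assert "text_config" in config_dict and "vision_config" in config_dict
```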
+ +from __future__ import annotations + +import os +from typing import Dict, List, Type, Union + +from ...utils.converter import Converter, StateDictNameMapping +from ...utils.import_utils import import_module +from ..model_utils import PretrainedModel +from .modeling import ChineseCLIPModel + + +class ChineseCLIPConverter(Converter): + """Chinese-CLIP Converter which handle the converting operations""" + + num_layer_key = "num_hidden_layers" + _ignore_state_dict_keys = ["text_model.embeddings.position_ids", "vision_model.embeddings.position_ids"] + architectures: Dict[str, Type[PretrainedModel]] = {"ChineseCLIPModel": ChineseCLIPModel} + try_compare_logits: bool = False + + def resolve_num_layer(self, config_or_num_layers: Union[dict, int] = None) -> int: + """resolve the number of transformer layer based on the key of model config, eg: `num_hidden_layers` in CLIPModel + Args: + config_or_num_layers (Union[dict, int], optional): the instance of config or num_layers. Defaults to None. + Raises: + ValueError: when `config_or_num_layers` is not dict/int, it will raise the error + Returns: + int: the number of transformer layer + """ + if isinstance(config_or_num_layers, dict): + num_text_layer = 0 + num_vision_layer = 0 + + if self.model_type in ["chinese_clip", "chinese_clip_text_model"]: + if "text_config" in config_or_num_layers: + num_text_layer = config_or_num_layers["text_config"][self.num_layer_key] + else: + num_text_layer = config_or_num_layers[self.num_layer_key] + + if self.model_type in ["chinese_clip", "chinese_clip_vision_model"]: + if "vision_config" in config_or_num_layers: + num_vision_layer = config_or_num_layers["vision_config"][self.num_layer_key] + else: + num_vision_layer = config_or_num_layers[self.num_layer_key] + + return num_text_layer, num_vision_layer + elif isinstance(config_or_num_layers, int): + num_layer = config_or_num_layers + else: + raise ValueError(f"the type of config_or_num_layers<{config_or_num_layers}> should be one of ") + return num_layer, num_layer + + def load_torch_weight_file(self, model_file: str): + """load torch weight file with torch which should be removed later. 
+ Args: + model_file (str): the path of pytorch model file + Returns: + Dict[str, ndarray]: the state dict object of loaded pytorch state dict + """ + import torch + + state_dict = torch.load(model_file) + for key in state_dict.keys(): + state_dict[key] = state_dict[key].cpu().numpy() + if state_dict[key].ndim == 0: + state_dict[key] = state_dict[key].reshape((1,)) + return state_dict + + def get_paddle_pytorch_model_classes(self): + paddle_clip_model_class = import_module(f"paddlenlp.transformers.{self.architecture}") + pytorch_clip_model_class = import_module(f"transformers.{self.architecture}") + return paddle_clip_model_class, pytorch_clip_model_class + + def get_name_mapping(self, config_or_num_layers: Union[dict, int] = None) -> List[StateDictNameMapping]: + self.model_type = ( + config_or_num_layers.get("model_type", "chinese_clip") + if isinstance(config_or_num_layers, dict) + else "chinese_clip" + ) + self.architecture = ( + config_or_num_layers.get("architectures", ["ChineseCLIPModel"])[0] + if isinstance(config_or_num_layers, dict) + else "ChineseCLIPModel" + ) + + num_text_layer, num_vision_layer = self.resolve_num_layer(config_or_num_layers) + + mappings: List[StateDictNameMapping] = [] + if self.model_type == "chinese_clip": + hard_mappings = [["logit_scale", "logit_scale"]] + else: + hard_mappings = [] + + # text model (bert model) + if num_text_layer > 0: + text_model_layer_mappings = [ + ["text_model.embeddings.word_embeddings.weight", "text_model.embeddings.word_embeddings.weight"], + [ + "text_model.embeddings.position_embeddings.weight", + "text_model.embeddings.position_embeddings.weight", + ], + [ + "text_model.embeddings.token_type_embeddings.weight", + "text_model.embeddings.token_type_embeddings.weight", + ], + ["text_model.embeddings.LayerNorm.weight", "text_model.embeddings.layer_norm.weight"], + ["text_model.embeddings.LayerNorm.bias", "text_model.embeddings.layer_norm.bias"], + ["text_projection.weight", "text_projection", "transpose"], + # donot add pooler + # ["text_model.pooler.dense.weight", "text_model.pooler.dense.weight", "transpose"], + # ["text_model.pooler.dense.bias", "text_model.pooler.dense.bias"], + ] + hard_mappings.extend(text_model_layer_mappings) + + for layer_index in range(num_text_layer): + text_model_layer_mappings = [ + [ + f"text_model.encoder.layer.{layer_index}.attention.self.query.weight", + f"text_model.encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.self.query.bias", + f"text_model.encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.self.key.weight", + f"text_model.encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.self.key.bias", + f"text_model.encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.self.value.weight", + f"text_model.encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.self.value.bias", + f"text_model.encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.output.dense.weight", + f"text_model.encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.output.dense.bias", + 
f"text_model.encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.intermediate.dense.weight", + f"text_model.encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [ + f"text_model.encoder.layer.{layer_index}.intermediate.dense.bias", + f"text_model.encoder.layers.{layer_index}.linear1.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"text_model.encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"text_model.encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"text_model.encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.output.dense.weight", + f"text_model.encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"text_model.encoder.layer.{layer_index}.output.dense.bias", + f"text_model.encoder.layers.{layer_index}.linear2.bias", + ], + [ + f"text_model.encoder.layer.{layer_index}.output.LayerNorm.weight", + f"text_model.encoder.layers.{layer_index}.norm2.weight", + ], + [ + f"text_model.encoder.layer.{layer_index}.output.LayerNorm.bias", + f"text_model.encoder.layers.{layer_index}.norm2.bias", + ], + ] + hard_mappings.extend(text_model_layer_mappings) + + # vision model + if num_vision_layer > 0: + vision_model_layer_mappings = [ + ["vision_model.embeddings.class_embedding", "vision_model.class_embedding"], + ["vision_model.embeddings.patch_embedding.weight", "vision_model.conv1.weight"], + ["vision_model.embeddings.position_embedding.weight", "vision_model.positional_embedding.weight"], + ["vision_model.pre_layrnorm.weight", "vision_model.ln_pre.weight"], + ["vision_model.pre_layrnorm.bias", "vision_model.ln_pre.bias"], + ["vision_model.post_layernorm.weight", "vision_model.ln_post.weight"], + ["vision_model.post_layernorm.bias", "vision_model.ln_post.bias"], + ["visual_projection.weight", "vision_projection", "transpose"], + ] + hard_mappings.extend(vision_model_layer_mappings) + for layer_index in range(num_vision_layer): + vision_model_layer_mappings = [ + # qkv out + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.q_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.q_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.k_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.k_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.v_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.v_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.out_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.out_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.out_proj.bias", + ], + # fc1 + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc1.weight", + f"vision_model.transformer.layers.{layer_index}.linear1.weight", + 
"transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc1.bias", + f"vision_model.transformer.layers.{layer_index}.linear1.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm1.weight", + f"vision_model.transformer.layers.{layer_index}.norm1.weight", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm1.bias", + f"vision_model.transformer.layers.{layer_index}.norm1.bias", + ], + # fc2 + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc2.weight", + f"vision_model.transformer.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc2.bias", + f"vision_model.transformer.layers.{layer_index}.linear2.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm2.weight", + f"vision_model.transformer.layers.{layer_index}.norm2.weight", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm2.bias", + f"vision_model.transformer.layers.{layer_index}.norm2.bias", + ], + ] + hard_mappings.extend(vision_model_layer_mappings) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(hard_mappings)] + return mappings + + def convert(self, input_dir: str, output_dir: str): + super().convert(input_dir, output_dir) + old_config_file = os.path.join(output_dir, "model_config.json") + new_config_file = os.path.join(output_dir, "config.json") + os.rename(old_config_file, new_config_file) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/feature_extraction.py new file mode 100644 index 000000000..be35ddca3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/feature_extraction.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for Chinese-CLIP.""" + +__all__ = ["ChineseCLIPFeatureExtractor"] + + +import warnings + +from .image_processing import ChineseCLIPImageProcessor + + +class ChineseCLIPFeatureExtractor(ChineseCLIPImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class ChineseCLIPFeatureExtractor is deprecated and will be removed in version 5 of PaddleNLP. Please" + " use CLIPImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/image_processing.py new file mode 100644 index 000000000..f6b9c23ca --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/image_processing.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Chinese-CLIP.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + center_crop, + convert_to_rgb, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = ["ChineseCLIPImageProcessor"] + + +class ChineseCLIPImageProcessor(BaseImageProcessor): + r""" + Constructs a Chinese-CLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Image standard deviation. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Standard deviation to use if normalizing the image. 
This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] + self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). 
Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. 
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. 
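# A minimal usage sketch of the preprocess() method defined here, assuming the class is
# exported as paddlenlp.transformers.ChineseCLIPImageProcessor and that numpy/PIL are
# installed; the dummy image size is arbitrary and purely illustrative.
import numpy as np
from PIL import Image

from paddlenlp.transformers import ChineseCLIPImageProcessor

processor = ChineseCLIPImageProcessor()  # defaults: shortest_edge=224, 224x224 crop, CLIP mean/std
dummy = Image.fromarray(np.random.randint(0, 256, (300, 400, 3), dtype=np.uint8))
batch = processor.preprocess(images=dummy, return_tensors="pd")
print(batch["pixel_values"].shape)  # expected: [1, 3, 224, 224]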
+ images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/modeling.py new file mode 100644 index 000000000..a376cc7ff --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/modeling.py @@ -0,0 +1,1036 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from functools import partial +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn + +from ...utils.initializer import normal_, ones_, zeros_ +from ..bert.modeling import BertEmbeddings as ChineseCLIPTextEmbeddings +from ..bert.modeling import BertModel +from ..clip.modeling import CLIPVisionTransformer as ChineseCLIPVisionTransformer +from ..model_outputs import ( + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from ..model_utils import PretrainedModel +from .configuration import ( + ChineseCLIPConfig, + ChineseCLIPTextConfig, + ChineseCLIPVisionConfig, +) + +CHINESE_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "OFA-Sys/chinese-clip-vit-base-patch16", + "OFA-Sys/chinese-clip-vit-huge-patch14", + "OFA-Sys/chinese-clip-vit-large-patch14", + "OFA-Sys/chinese-clip-vit-large-patch14-336px", + # See all Chinese-CLIP models at https://huggingface.co/models?filter=chinese_clip +] + +__all__ = [ + "ChineseCLIPTextModel", + "ChineseCLIPVisionModel", + "ChineseCLIPPretrainedModel", + "ChineseCLIPModel", + "ChineseCLIPTextModelWithProjection", + "ChineseCLIPVisionModelWithProjection", +] + + +def quick_gelu(x): + return x * F.sigmoid(1.702 * x) + + +F.quick_gelu = quick_gelu + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html + + +def contrastive_loss(logits: paddle.Tensor) -> paddle.Tensor: + return F.cross_entropy(logits, paddle.arange(len(logits))) + + +def chinese_clip_loss(similarity: paddle.Tensor) -> paddle.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = 
contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class ChineseCLIPVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ChineseCLIPTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ChineseCLIPOutput(ModelOutput): + """ + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. 
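# A toy sketch of the symmetric loss computed by chinese_clip_loss above: cross-entropy
# over the text->image logits plus cross-entropy over the transposed image->text logits,
# with matching pairs on the diagonal. The numbers below are made up.
import paddle
import paddle.nn.functional as F

logits_per_text = paddle.to_tensor(
    [[10.0, 0.1, 0.2],
     [0.3, 9.0, 0.1],
     [0.2, 0.4, 8.0]]
)  # row i is the i-th text scored against every image; pair i matches image i
labels = paddle.arange(3)
caption_loss = F.cross_entropy(logits_per_text, labels)
image_loss = F.cross_entropy(logits_per_text.t(), labels)
loss = (caption_loss + image_loss) / 2.0  # near zero when the diagonal dominates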
+ logits_per_image:(`paddle.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`paddle.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`ChineseCLIPTextModel`]. + image_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`ChineseCLIPVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`BaseModelOutputWithPoolingAndCrossAttentions`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`ChineseCLIPVisionModel`]. + """ + + loss: Optional[paddle.Tensor] = None + logits_per_image: paddle.Tensor = None + logits_per_text: paddle.Tensor = None + text_embeds: paddle.Tensor = None + image_embeds: paddle.Tensor = None + text_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class ChineseCLIPPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ChineseCLIP models. It provides ChineseCLIP related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + config_class = ChineseCLIPConfig + base_model_prefix = "chinese_clip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, nn.TransformerEncoder): + module.enable_recompute = value + + def gradient_checkpointing_enable(self): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". 
+ """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def _init_weights(self, layer): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(layer, ChineseCLIPVisionTransformer): + vision_embed_dim = layer.config.hidden_size + vision_layers = layer.config.num_hidden_layers + initializer_range = layer.config.initializer_range + + # vision embedding + normal_(layer.class_embedding, std=vision_embed_dim**-0.5 * factor) + normal_(layer.conv1.weight, std=initializer_range * factor) + normal_(layer.positional_embedding.weight, std=initializer_range * factor) + + # init CLIPAttention + CLIPMLP + for sub_layer in layer.sublayers(): + if isinstance(sub_layer, nn.TransformerEncoderLayer): + # self_attn + in_proj_std = (sub_layer.self_attn.embed_dim**-0.5) * ((2 * vision_layers) ** -0.5) * factor + out_proj_std = (sub_layer.self_attn.embed_dim**-0.5) * factor + normal_(sub_layer.self_attn.q_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.k_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.v_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.out_proj.weight, std=out_proj_std) + # ffn + in_proj_std = (sub_layer._config["d_model"] ** -0.5) * ((2 * vision_layers) ** -0.5) * factor + fc_std = (2 * sub_layer._config["d_model"]) ** -0.5 * factor + normal_(sub_layer.linear1.weight, std=fc_std) + normal_(sub_layer.linear2.weight, std=in_proj_std) + + elif isinstance(layer, ChineseCLIPTextEmbeddings): + normal_(layer.word_embeddings.weight, mean=0.0, std=self.config.initializer_range) + normal_(layer.position_embeddings.weight, mean=0.0, std=self.config.initializer_range) + normal_(layer.token_type_embeddings.weight, mean=0.0, std=self.config.initializer_range) + with paddle.no_grad(): + for embedding in [layer.word_embeddings, layer.position_embeddings, layer.token_type_embeddings]: + if embedding._padding_idx is not None: + embedding.weight[embedding._padding_idx] = 0 + + elif isinstance(layer, ChineseCLIPModel): + normal_(layer.text_projection, std=layer.text_embed_dim**-0.5 * self.config.initializer_factor) + normal_(layer.vision_projection, std=layer.vision_embed_dim**-0.5 * self.config.initializer_factor) + elif isinstance(layer, ChineseCLIPVisionModelWithProjection): + normal_(layer.vision_projection, std=self.config.hidden_size**-0.5 * self.config.initializer_factor) + elif isinstance(layer, ChineseCLIPTextModelWithProjection): + normal_(layer.text_projection, std=self.config.hidden_size**-0.5 * self.config.initializer_factor) + + if isinstance(layer, nn.LayerNorm): + zeros_(layer.bias) + ones_(layer.weight) + + if isinstance(layer, nn.Linear): + normal_(layer.weight, mean=0.0, std=self.config.initializer_range) + if layer.bias is not None: + zeros_(layer.bias) + + +class FirstTokenPooler(nn.Layer): + def forward(self, hidden_states): + pooled_output = hidden_states[:, 0] + return pooled_output + + +class ChineseCLIPTextModel(ChineseCLIPPretrainedModel): + r""" + The text model [bert model] from ChineseCLIP without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + Args: + config (:class:`ChineseCLIPTextConfig`): + An instance of ChineseCLIPTextConfig used to construct ChineseCLIPTextModel. + """ + + config_class = ChineseCLIPTextConfig + + def __init__(self, config: ChineseCLIPTextConfig, add_pooling_layer=False): + super().__init__(config) + self.text_model = BertModel(config) + if not add_pooling_layer: + self.text_model.pooler = FirstTokenPooler() + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.text_model.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`ChineseCLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPoolingAndCrossAttentions`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPoolingAndCrossAttentions`. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import ChineseCLIPTokenizer, ChineseCLIPTextModel + + >>> model = ChineseCLIPTextModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> tokenizer = ChineseCLIPTokenizer.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> inputs = tokenizer(["一只猫的照片", "一条狗的照片"], padding=True, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class ChineseCLIPVisionModel(ChineseCLIPPretrainedModel): + r""" + The vision model from Chinese-CLIP without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ChineseCLIPVisionConfig`): + An instance of ChineseCLIPVisionConfig used to construct ChineseCLIPVisionModel. + """ + config_class = ChineseCLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: ChineseCLIPVisionConfig): + super().__init__(config) + + self.vision_model = ChineseCLIPVisionTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.conv1 + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`ChineseCLIPProcessor`]. See [`ChineseCLIPProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPooling` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPooling`. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import ChineseCLIPProcessor, ChineseCLIPVisionModel + + >>> model = ChineseCLIPVisionModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class ChineseCLIPModel(ChineseCLIPPretrainedModel): + r""" + The bare Chinese-CLIP Model outputting logits_per_image and logits_per_text. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ChineseCLIPConfig`): + An instance of ChineseCLIPConfig used to construct ChineseCLIPModel. + """ + config_class = ChineseCLIPConfig + + def __init__(self, config: ChineseCLIPConfig, add_pooling_layer=False): + super().__init__(config) + + if not isinstance(config.text_config, ChineseCLIPTextConfig): + raise ValueError( + "config.text_config is expected to be of type ChineseCLIPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, ChineseCLIPVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type ChineseCLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = BertModel(text_config) + if not add_pooling_layer: + self.text_model.pooler = FirstTokenPooler() + self.vision_model = ChineseCLIPVisionTransformer(vision_config) + + self.vision_projection = paddle.create_parameter( + (self.vision_embed_dim, self.projection_dim), paddle.get_default_dtype() + ) + self.text_projection = paddle.create_parameter( + (self.text_embed_dim, self.projection_dim), paddle.get_default_dtype() + ) + + self.logit_scale = paddle.create_parameter( + (1,), + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(config.logit_scale_init_value), + ) + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
Padding will be ignored by default should you provide + it. + Indices can be obtained using [`ChineseCLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + token_type_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + Its data type should be `int64`. Defaults to `None`, which means we don't add segment embeddings. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`ChineseCLIPTextModel`]. + + Examples: + + ```python + >>> from paddlenlp.transformers import ChineseCLIPTokenizer, ChineseCLIPModel + + >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> tokenizer = ChineseCLIPTokenizer.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> inputs = tokenizer(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], padding=True, return_tensors="pd") + >>> text_features = model.get_text_features(**inputs) + >>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) + ``` + """ + # Use Chinese-CLIP model's config for some fields (if specified) instead of those of vision & text components. 
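# A minimal sketch of L2-normalising projected features, mirroring what
# ChineseCLIPModel.forward does further below; note that Paddle's Tensor.norm takes
# `axis` rather than `dim`, and paddle.nn.functional.normalize is an equivalent
# shortcut. `text_features` here is just a random stand-in for get_text_features() output.
import paddle
import paddle.nn.functional as F

text_features = paddle.randn([4, 512])  # hypothetical batch of projected embeddings
manual = text_features / text_features.norm(p=2, axis=-1, keepdim=True)
shortcut = F.normalize(text_features, p=2, axis=-1)
print(paddle.allclose(manual, shortcut))  # expected: True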
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = paddle.matmul(pooled_output, self.text_projection) + + return text_features + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`ChineseCLIPProcessor`]. See [`ChineseCLIPProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + image_features (`paddle.Tensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by + applying the projection layer to the pooled output of [`ChineseCLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import ChineseCLIPProcessor, ChineseCLIPModel + + >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> image_features = model.get_image_features(**inputs) + >>> image_features = image_features / image_features.norm(p=2, axis=-1, keepdim=True) + ``` + """ + # Use Chinese-CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = paddle.matmul(pooled_output, self.vision_projection) + + return image_features + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, ChineseCLIPOutput]: + r""" + The ChineseCLIPModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings (CLIPTextTransformer). Selected in + the range ``[0, max_text_length - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention (CLIPTextTransformer) to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape `[batch_size, sequence_length`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`CLIPOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `True`. 
+ + Returns: + An instance of :class:`CLIPOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`CLIPOutput`. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle.nn.functional as F + >>> from paddlenlp.transformers import ChineseCLIPProcessor, ChineseCLIPModel + + >>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(text=["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], images=image, return_tensors="pd", padding=True) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = F.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ``` + """ + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = paddle.matmul(image_embeds, self.vision_projection) + + text_embeds = text_outputs[1] + text_embeds = paddle.matmul(text_embeds, self.text_projection) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, axis=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True) + + if paddle.distributed.is_initialized() and dist.get_world_size() > 1: + world_size = dist.get_world_size() + rank = dist.get_rank() + gathered_image_features = [paddle.zeros_like(image_embeds) for _ in range(world_size)] + gathered_text_features = [paddle.zeros_like(text_embeds) for _ in range(world_size)] + dist.all_gather(gathered_image_features, image_embeds) + dist.all_gather(gathered_text_features, text_embeds) + # Add current text_embeds image_embeds into the batch for gradient update + image_embeds = paddle.concat( + [image_embeds] + gathered_image_features[:rank] + gathered_image_features[rank + 1 :] + ) + text_embeds = paddle.concat( + [text_embeds] + gathered_text_features[:rank] + gathered_text_features[rank + 1 :] + ) + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = paddle.matmul(text_embeds, image_embeds, transpose_y=True) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = chinese_clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, 
logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return ChineseCLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class ChineseCLIPTextModelWithProjection(ChineseCLIPPretrainedModel): + r""" + Chinese-CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ChineseCLIPTextConfig`): + An instance of ChineseCLIPTextConfig used to construct ChineseCLIPTextModelWithProjection. + """ + config_class = ChineseCLIPTextConfig + + def __init__(self, config: ChineseCLIPTextConfig, add_pooling_layer=False): + super().__init__(config) + + self.text_model = BertModel(config) + if not add_pooling_layer: + self.text_model.pooler = FirstTokenPooler() + self.text_projection = paddle.create_parameter( + (config.hidden_size, config.projection_dim), paddle.get_default_dtype() + ) + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.text_model.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, ChineseCLIPTextModelOutput]: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`ChineseCLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
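# Sketch of the cross-rank feature gathering used in ChineseCLIPModel.forward above:
# each rank all-gathers the embeddings and then concatenates its own (gradient-carrying)
# tensor first, followed by the copies gathered from the other ranks, so the enlarged
# similarity matrix still back-propagates into the local shard. Assumes the default
# collective group has been initialised, e.g. via paddle.distributed.init_parallel_env().
import paddle
import paddle.distributed as dist


def gather_with_local_grad(local_embeds: paddle.Tensor) -> paddle.Tensor:
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    gathered = [paddle.zeros_like(local_embeds) for _ in range(world_size)]
    dist.all_gather(gathered, local_embeds)
    # keep the differentiable local tensor, splice the other ranks' copies around it
    return paddle.concat([local_embeds] + gathered[:rank] + gathered[rank + 1 :])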
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`ChineseCLIPTextModelOutput`] instead of a plain tuple. + If `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`ChineseCLIPTextModelOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`ChineseCLIPTextModelOutput`. + + Examples: + + ```python + >>> from paddlenlp.transformers import ChineseCLIPTokenizer, ChineseCLIPTextModelWithProjection + + >>> model = ChineseCLIPTextModelWithProjection.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> tokenizer = ChineseCLIPTokenizer.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> inputs = tokenizer(["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"], padding=True, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_embeds = paddle.matmul(pooled_output, self.text_projection) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return ChineseCLIPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +class ChineseCLIPVisionModelWithProjection(ChineseCLIPPretrainedModel): + r""" + Chinese-CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ChineseCLIPVisionConfig`): + An instance of ChineseCLIPVisionConfig used to construct ChineseCLIPVisionModelWithProjection. 
+ """ + config_class = ChineseCLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: ChineseCLIPVisionConfig): + super().__init__(config) + + self.vision_model = ChineseCLIPVisionTransformer(config) + self.vision_projection = paddle.create_parameter( + (config.hidden_size, config.projection_dim), paddle.get_default_dtype() + ) + + def get_input_embeddings(self) -> nn.Layer: + if isinstance(self.vision_model, ChineseCLIPVisionTransformer): + return self.vision_model.conv1 + else: + return None + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, ChineseCLIPVisionModelOutput]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`ChineseCLIPProcessor`]. See [`ChineseCLIPProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`ChineseCLIPVisionModelOutput`] instead of a plain tuple. + + Returns: + An instance of :class:`ChineseCLIPVisionModelOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`ChineseCLIPVisionModelOutput`. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import ChineseCLIPProcessor, ChineseCLIPVisionModelWithProjection + + >>> model = ChineseCLIPVisionModelWithProjection.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + >>> model.eval() + >>> processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") + + >>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = vision_outputs[1] # pooled_output + + image_embeds = paddle.matmul(pooled_output, self.vision_projection) + + if not return_dict: + outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return ChineseCLIPVisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/processing.py new file mode 100644 index 000000000..701dfda83 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/processing.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The OFA-Sys Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for Chinese-CLIP +""" +import warnings + +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding + +__all__ = ["ChineseCLIPProcessor"] + + +class ChineseCLIPProcessor(ProcessorMixin): + r""" + Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a + single processor. + + [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`ChineseCLIPTokenizer`]. + See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information. + + Args: + image_processor ([`ChineseCLIPImageProcessor`]): + The image processor is a required input. + tokenizer ([`ChineseCLIPTokenizer`]): + The tokenizer is a required input. 
+ """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "ChineseCLIPImageProcessor" + tokenizer_class = "ChineseCLIPTokenizer" + + pretrained_init_configuration = { + "OFA-Sys/chinese-clip-vit-base-patch16": {"do_lower_case": True}, + "OFA-Sys/chinese-clip-vit-huge-patch14": {"do_lower_case": True}, + "OFA-Sys/chinese-clip-vit-large-patch14": {"do_lower_case": True}, + "OFA-Sys/chinese-clip-vit-large-patch14-336px": {"do_lower_case": True}, + } + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + if image_processor is None: + raise ValueError("You need to specify an `image_processor`.") + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(image_processor, tokenizer) + + def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to ChineseCLIPTokenizer's [`~ChineseCLIPTokenizer.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + ChineseCLIPImageProcessor's [`~ChineseCLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring + of the above two methods for more information. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[paddle.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or Paddle + tensor. In case of a NumPy array/Paddle tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'pd'`: Return Paddle `paddle.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. 
Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to ChineseCLIPTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to ChineseCLIPTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/tokenizer.py new file mode 100644 index 000000000..d23dd6cc2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/chineseclip/tokenizer.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
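A minimal usage sketch of the merge behaviour implemented in `ChineseCLIPProcessor.__call__` above. The checkpoint name is the one already referenced in the docstrings of this patch; the image path and the example captions are placeholders, not part of the patch:

```python
from PIL import Image
from paddlenlp.transformers import ChineseCLIPProcessor

processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
image = Image.open("example.jpg")  # placeholder path; any RGB image works

# text only -> token ids; images only -> pixel values; both -> one merged BatchEncoding
batch = processor(text=["一只小猫", "一张风景照"], images=image, return_tensors="pd")
print(sorted(batch.keys()))  # expected to include "input_ids" and "pixel_values"
```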
+ +from ..bert.tokenizer import BertTokenizer + +__all__ = ["ChineseCLIPTokenizer"] + + +class ChineseCLIPTokenizer(BertTokenizer): + resource_files_names = {"vocab_file": "vocab.txt"} + pretrained_resource_files_map = {"vocab_file": {}} + pretrained_init_configuration = {} + model_input_names = [ + "input_ids", + "token_type_ids", + "attention_mask", + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/configuration.py new file mode 100644 index 000000000..149ee9b7e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/configuration.py @@ -0,0 +1,450 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "ClapTextConfig", + "ClapAudioConfig", + "ClapConfig", +] + + +class ClapTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ClapTextModel`]. It is used to instantiate a CLAP + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLAP + [calp-hsat-fused](https://huggingface.co/laion/clap-hsat-fused) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the CLAP model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ClapTextModel`]. 
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 514):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 1):
+ The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+ Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+ positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+ [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+ For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+ with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+ is_decoder (`bool`, *optional*, defaults to `False`):
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ classifier_dropout (`float`, *optional*):
+ The dropout ratio for the classification head.
+ projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
+ The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ projection_dim (`int`, *optional*, defaults to 512):
+ Dimension of the projection head of the `ClapTextModelWithProjection`.
+ + Examples: + + ```python + >>> from paddlenlp.transformers import ClapTextConfig, ClapTextModel + + >>> # Initializing a CLAP text configuration + >>> configuration = ClapTextConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = ClapTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "clap_text_model" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + initializer_range=0.02, + initializer_factor=1.0, + layer_norm_eps=1e-12, + projection_dim=512, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + projection_hidden_act="relu", + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + self.projection_hidden_act = projection_hidden_act + self.projection_dim = projection_dim + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from ClapConfig + if config_dict.get("model_type") == "clap": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class ClapAudioConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ClapAudioModel`]. It is used to instantiate a + CLAP audio encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the audio encoder of the CLAP + [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + window_size (`int`, *optional*, defaults to 8): + Image size of the spectrogram + num_mel_bins (`int`, *optional*, defaults to 64): + Number of mel features used per frames. 
Should correspond to the value used in the `ClapProcessor` class.
+ spec_size (`int`, *optional*, defaults to 256):
+ Desired input size of the spectrogram that the model supports. It can be different from the output of the
+ `ClapFeatureExtractor`, in which case the input features will be resized. Corresponds to the `image_size`
+ of the audio models.
+ hidden_act (`str`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ patch_size (`int`, *optional*, defaults to 4):
+ Patch size for the audio spectrogram
+ patch_stride (`list`, *optional*, defaults to `[4, 4]`):
+ Patch stride for the audio spectrogram
+ num_classes (`int`, *optional*, defaults to 527):
+ Number of classes used for the head training
+ hidden_size (`int`, *optional*, defaults to 768):
+ Hidden size of the output of the audio encoder. Corresponds to the dimension of the penultimate layer's
+ output, which is sent to the projection MLP layer.
+ projection_dim (`int`, *optional*, defaults to 512):
+ Hidden size of the projection layer.
+ depths (`list`, *optional*, defaults to `[2, 2, 6, 2]`):
+ Depths used for the Swin Layers of the audio model
+ num_attention_heads (`list`, *optional*, defaults to `[4, 8, 16, 32]`):
+ Number of attention heads used for the Swin Layers of the audio model
+ enable_fusion (`bool`, *optional*, defaults to `False`):
+ Whether or not to enable patch fusion. This is the main contribution of the authors, and should give the
+ best results.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the encoder.
+ fusion_type (`[type]`, *optional*):
+ Fusion type used for the patch fusion.
+ patch_embed_input_channels (`int`, *optional*, defaults to 1):
+ Number of channels used for the input spectrogram
+ flatten_patch_embeds (`bool`, *optional*, defaults to `True`):
+ Whether or not to flatten the patch embeddings
+ patch_embeds_hidden_size (`int`, *optional*, defaults to 96):
+ Hidden size of the patch embeddings. It is used as the number of output channels.
+ enable_patch_layer_norm (`bool`, *optional*, defaults to `True`):
+ Whether or not to enable layer normalization for the patch embeddings
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
+ Drop path rate for the patch fusion
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether or not to add a bias to the query, key, value projections.
+ mlp_ratio (`float`, *optional*, defaults to 4.0):
+ Ratio of the mlp hidden dim to embedding dim.
+ aff_block_r (`int`, *optional*, defaults to 4):
+ downsize_ratio used in the AudioFF block
+ num_hidden_layers (`int`, *optional*, defaults to 4):
+ Number of hidden layers in the Transformer encoder.
+ projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
+ The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ layer_norm_eps (`[type]`, *optional*, defaults to `1e-5`):
+ The epsilon used by the layer normalization layers.
+ initializer_factor (`float`, *optional*, defaults to 1.0):
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+ testing).
+ + Example: + + ```python + >>> from paddlenlp.transformers import ClapAudioConfig, ClapAudioModel + + >>> # Initializing a ClapAudioConfig with laion/clap-htsat-fused style configuration + >>> configuration = ClapAudioConfig() + + >>> # Initializing a ClapAudioModel (with random weights) from the laion/clap-htsat-fused style configuration + >>> model = ClapAudioModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clap_audio_model" + + def __init__( + self, + window_size=8, + num_mel_bins=64, + spec_size=256, + hidden_act="gelu", + patch_size=4, + patch_stride=[4, 4], + num_classes=527, + hidden_size=768, + projection_dim=512, + depths=[2, 2, 6, 2], + num_attention_heads=[4, 8, 16, 32], + enable_fusion=False, + hidden_dropout_prob=0.1, + fusion_type=None, + patch_embed_input_channels=1, + flatten_patch_embeds=True, + patch_embeds_hidden_size=96, + enable_patch_layer_norm=True, + drop_path_rate=0.0, + attention_probs_dropout_prob=0.0, + qkv_bias=True, + mlp_ratio=4.0, + aff_block_r=4, + num_hidden_layers=4, + projection_hidden_act="relu", + layer_norm_eps=1e-5, + initializer_factor=1.0, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + self.window_size = window_size + self.num_mel_bins = num_mel_bins + self.spec_size = spec_size + self.patch_size = patch_size + self.patch_stride = patch_stride + self.num_classes = num_classes + self.hidden_size = hidden_size + self.depths = depths + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.window_size = window_size + self.enable_fusion = enable_fusion + self.fusion_type = fusion_type + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.projection_dim = projection_dim + self.flatten_patch_embeds = flatten_patch_embeds + self.patch_embeds_hidden_size = patch_embeds_hidden_size + self.enable_patch_layer_norm = enable_patch_layer_norm + self.drop_path_rate = drop_path_rate + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.qkv_bias = qkv_bias + self.mlp_ratio = mlp_ratio + self.patch_embed_input_channels = patch_embed_input_channels + self.aff_block_r = aff_block_r + self.layer_norm_eps = layer_norm_eps + self.initializer_factor = initializer_factor + self.projection_hidden_act = projection_hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the audio config dict if we are loading from ClapConfig + if config_dict.get("model_type") == "clap": + config_dict = config_dict["audio_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class ClapConfig(PretrainedConfig): + r""" + [`ClapConfig`] is the configuration class to store the configuration of a [`ClapModel`]. It is used to instantiate + a CLAP model according to the specified arguments, defining the text model and audio model configs. 
Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the CLAP
+ [laion/clap-htsat-fused](https://huggingface.co/laion/clap-htsat-fused) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ text_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`ClapTextConfig`].
+ audio_config (`dict`, *optional*):
+ Dictionary of configuration options used to initialize [`ClapAudioConfig`].
+ projection_dim (`int`, *optional*, defaults to 512):
+ Dimensionality of text and audio projection layers.
+ logit_scale_init_value (`float`, *optional*, defaults to `1 / 0.07`):
+ The initial value of the *logit_scale* parameter. Default is used as per the original CLAP implementation.
+ projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
+ Activation function for the projection layers.
+ initializer_factor (`float`, *optional*, defaults to 1.0):
+ Factor to scale the initialization of the model weights.
+ kwargs (*optional*):
+ Dictionary of keyword arguments.
+
+ Example:
+
+ ```python
+ >>> from paddlenlp.transformers import ClapConfig, ClapModel
+
+ >>> # Initializing a ClapConfig with laion-ai/base style configuration
+ >>> configuration = ClapConfig()
+
+ >>> # Initializing a ClapModel (with random weights) from the laion-ai/base style configuration
+ >>> model = ClapModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+
+ >>> # We can also initialize a ClapConfig from a ClapTextConfig and a ClapAudioConfig
+ >>> from paddlenlp.transformers import ClapTextConfig, ClapAudioConfig
+
+ >>> # Initializing a ClapText and ClapAudioConfig configuration
+ >>> config_text = ClapTextConfig()
+ >>> config_audio = ClapAudioConfig()
+
+ >>> config = ClapConfig.from_text_audio_configs(config_text, config_audio)
+ ```"""
+
+ model_type = "clap"
+ is_composition = True
+
+ def __init__(
+ self,
+ text_config=None,
+ audio_config=None,
+ logit_scale_init_value=(1 / 0.07),
+ projection_dim=512,
+ projection_hidden_act="relu",
+ initializer_factor=1.0,
+ **kwargs,
+ ):
+ kwargs["return_dict"] = kwargs.pop("return_dict", True)
+ super().__init__(**kwargs)
+
+ if text_config is None:
+ text_config = {}
+ logger.info("text_config is None. Initializing the ClapTextConfig with default values.")
+
+ if audio_config is None:
+ audio_config = {}
+ logger.info("audio_config is None.
initializing the ClapAudioConfig with default values.") + + self.text_config = ClapTextConfig(**text_config) + self.audio_config = ClapAudioConfig(**audio_config) + self.text_config.projection_dim = projection_dim + self.audio_config.projection_dim = projection_dim + + self.text_config.projection_hidden_act = projection_hidden_act + self.audio_config.projection_hidden_act = projection_hidden_act + + self.projection_dim = projection_dim + self.projection_hidden_act = projection_hidden_act + self.hidden_size = self.text_config.hidden_size + + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = initializer_factor + self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths) + + @classmethod + def from_text_audio_configs(cls, text_config: ClapTextConfig, audio_config: ClapAudioConfig, **kwargs): + r""" + Instantiate a [`ClapConfig`] (or a derived class) from clap text model configuration and clap audio model + configuration. + + Returns: + [`ClapConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["audio_config"] = self.audio_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/feature_extraction.py new file mode 100644 index 000000000..e3938998d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/feature_extraction.py @@ -0,0 +1,358 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import paddle + +from ...utils.log import logger +from ..audio_utils import mel_filter_bank, spectrogram, window_function +from ..feature_extraction_sequence_utils import SequenceFeatureExtractor +from ..feature_extraction_utils import BatchFeature + + +class ClapFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a CLAP feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. 
+
+ This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the *Short Time
+ Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent.
+
+ Args:
+ feature_size (`int`, defaults to 64):
+ The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters
+ (`n_mels`).
+ sampling_rate (`int`, defaults to 48_000):
+ The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
+ to warn users if the audio fed to the feature extractor does not have the same sampling rate.
+ hop_length (`int`, defaults to 480):
+ Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
+ in smaller `frames` with a step of `hop_length` between each frame.
+ max_length_s (`int`, defaults to 10):
+ The maximum input length of the model in seconds. This is used to pad the audio.
+ fft_window_size (`int`, defaults to 1024):
+ Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
+ resolution of the spectrogram. 400 means that the Fourier transform is computed on windows of 400 samples.
+ padding_value (`float`, *optional*, defaults to 0.0):
+ Padding value used to pad the audio. Should correspond to silences.
+ return_attention_mask (`bool`, *optional*, defaults to `False`):
+ Whether or not the model should return the attention masks corresponding to the input.
+ frequency_min (`float`, *optional*, defaults to 0):
+ The lowest frequency of interest. The STFT will not be computed for values below this.
+ frequency_max (`float`, *optional*, defaults to 14_000):
+ The highest frequency of interest. The STFT will not be computed for values above this.
+ top_db (`float`, *optional*):
+ The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the
+ `audio_utils.power_to_db` function
+ truncation (`str`, *optional*, defaults to `"fusion"`):
+ Truncation pattern for long audio inputs. Two patterns are available:
+ - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a
+ downsampled version of the entire mel spectrogram.
+ If `config.fusion` is set to True, shorter audios also need to return 4 mels, which will just be a copy
+ of the original mel obtained from the padded audio.
+ - `rand_trunc` will select a random crop of the mel spectrogram.
+ padding (`str`, *optional*, defaults to `"repeatpad"`):
+ Padding pattern for shorter audio inputs. Three patterns were originally implemented:
+ - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
+ - `repeat`: the audio is repeated and then cut to fit the `max_length`
+ - `pad`: the audio is padded.
+ """ + + model_input_names = ["input_features", "is_longer"] + + def __init__( + self, + feature_size=64, + sampling_rate=48_000, + hop_length=480, + max_length_s=10, + fft_window_size=1024, + padding_value=0.0, + return_attention_mask=False, # pad inputs to max length with silence token (zero) and no attention mask + frequency_min: float = 0, + frequency_max: float = 14_000, + top_db: int = None, + truncation: str = "fusion", + padding: str = "repeatpad", + **kwargs, + ): + super().__init__( + feature_size=feature_size, + sampling_rate=sampling_rate, + padding_value=padding_value, + return_attention_mask=return_attention_mask, + **kwargs, + ) + self.top_db = top_db + self.truncation = truncation + self.padding = padding + self.fft_window_size = fft_window_size + self.nb_frequency_bins = (fft_window_size >> 1) + 1 + self.hop_length = hop_length + self.max_length_s = max_length_s + self.nb_max_samples = max_length_s * sampling_rate + self.sampling_rate = sampling_rate + self.frequency_min = frequency_min + self.frequency_max = frequency_max + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, + norm=None, + mel_scale="htk", + ) + self.mel_filters_slaney = mel_filter_bank( + num_frequency_bins=self.nb_frequency_bins, + num_mel_filters=feature_size, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sampling_rate, + norm="slaney", + mel_scale="slaney", + ) + + def to_dict(self, *args, **kwargs) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, excpet for the + mel filter banks, which do not need to be saved or printed as they are too long. + """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + if "mel_filters" in output: + del output["mel_filters"] + if "mel_filters_slaney" in output: + del output["mel_filters_slaney"] + return output + + def _np_extract_fbank_features(self, waveform: np.array, mel_filters: Optional[np.array] = None) -> np.ndarray: + """ + Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter + banks are used depending on the truncation pattern: + - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from + calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation` + is set to `"fusion"`. + - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used + `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original + implementation when the truncation mode is not `"fusion"`. 
+ """ + log_mel_spectrogram = spectrogram( + waveform, + window_function(self.fft_window_size, "hann"), + frame_length=self.fft_window_size, + hop_length=self.hop_length, + power=2.0, + mel_filters=mel_filters, + log_mel="dB", + ) + return log_mel_spectrogram.T + + def _random_mel_fusion(self, mel, total_frames, chunk_frames): + ranges = np.array_split(list(range(0, total_frames - chunk_frames + 1)), 3) + if len(ranges[1]) == 0: + # if the audio is too short, we just use the first chunk + ranges[1] = [0] + if len(ranges[2]) == 0: + # if the audio is too short, we just use the first chunk + ranges[2] = [0] + # randomly choose index for each part + idx_front = np.random.choice(ranges[0]) + idx_middle = np.random.choice(ranges[1]) + idx_back = np.random.choice(ranges[2]) + + mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] + mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] + mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] + + mel = paddle.to_tensor(mel[None, None, :]) + mel_shrink = paddle.nn.functional.interpolate( + mel, size=[chunk_frames, 64], mode="bilinear", align_corners=False + ) + mel_shrink = mel_shrink[0][0].numpy() + mel_fusion = np.stack([mel_shrink, mel_chunk_front, mel_chunk_middle, mel_chunk_back], axis=0) + return mel_fusion + + def _get_input_mel(self, waveform: np.array, max_length, truncation, padding) -> np.array: + """ + Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments. + Four different path are possible: + - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram + will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram + are then stacked together. They will later be used for `feature_fusion`. + - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is + padded based on `padding`. + - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded + based on `padding`, and is repeated `4` times. + - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel + spectrogram will be computed on a random crop of the waveform. + + """ + if waveform.shape[0] > max_length: + if truncation == "rand_trunc": + longer = True + # random crop to max_length (for compatibility) -> this should be handled by self.pad + overflow = len(waveform) - max_length + idx = np.random.randint(0, overflow + 1) + waveform = waveform[idx : idx + max_length] + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :] + elif truncation == "fusion": + mel = self._np_extract_fbank_features(waveform, self.mel_filters) + chunk_frames = max_length // self.hop_length + 1 # the +1 related to how the spectrogram is computed + total_frames = mel.shape[0] + if chunk_frames == total_frames: + # there is a corner case where the audio length is larger than max_length but smaller than max_length+hop_length. + # In this case, we just use the whole audio. + input_mel = np.stack([mel, mel, mel, mel], axis=0) + longer = False + else: + input_mel = self._random_mel_fusion(mel, total_frames, chunk_frames) + longer = True + else: + raise NotImplementedError(f"data_truncating {truncation} not implemented") + + else: + longer = False + # only use repeat as a new possible value for padding. 
you repeat the audio before applying the usual max_length padding + if waveform.shape[0] < max_length: + if padding == "repeat": + n_repeat = int(max_length / len(waveform)) + waveform = np.stack(np.tile(waveform, n_repeat + 1))[:max_length] + if padding == "repeatpad": + n_repeat = int(max_length / len(waveform)) + waveform = np.stack(np.tile(waveform, n_repeat)) + waveform = np.pad(waveform, (0, max_length - waveform.shape[0]), mode="constant", constant_values=0) + + if truncation == "fusion": + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters) + input_mel = np.stack([input_mel, input_mel, input_mel, input_mel], axis=0) + else: + input_mel = self._np_extract_fbank_features(waveform, self.mel_filters_slaney)[None, :] + + return input_mel, longer + + def __call__( + self, + raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + truncation: str = None, + padding: Optional[str] = None, + max_length: Optional[int] = None, + sampling_rate: Optional[int] = None, + return_tensors: Optional[str] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + truncation (`str`, *optional*): + Truncation pattern for long audio inputs. Two patterns are available: + - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and + a downsampled version of the entire mel spectrogram. + If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a + copy of the original mel obtained from the padded audio. + - `rand_trunc` will select a random crop of the mel spectrogram. + padding (`str`, *optional*): + Padding pattern for shorter audio inputs. Three patterns were originally implemented: + - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`. + - `repeat`: the audio is repeated and then cut to fit the `max_length` + - `pad`: the audio is padded. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.np.array` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition + pipeline. + """ + truncation = truncation if truncation is not None else self.truncation + padding = padding if padding else self.padding + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a" + f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input" + f" was sampled with {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + "It is strongly recommended to pass the `sampling_rate` argument to this function. 
" + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float64) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float64) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float64) + + # always return batch + if not is_batched: + raw_speech = [np.asarray(raw_speech)] + + # convert to mel spectrogram, truncate and pad if needed. + padded_inputs = [ + self._get_input_mel(waveform, max_length if max_length else self.nb_max_samples, truncation, padding) + for waveform in raw_speech + ] + + input_mel = [] + is_longer = [] + for mel, longer in padded_inputs: + input_mel.append(mel) + is_longer.append(longer) + + if truncation == "fusion" and sum(is_longer) == 0: + # if no audio is longer than 10s, then randomly select one audio to be longer + rand_idx = np.random.randint(0, len(input_mel)) + is_longer[rand_idx] = True + + if isinstance(input_mel[0], List): + input_mel = [np.asarray(feature, dtype=np.float64) for feature in input_mel] + + # is_longer is a list of bool + is_longer = [[longer] for longer in is_longer] + + input_features = {"input_features": input_mel, "is_longer": is_longer} + input_features = BatchFeature(input_features) + + if return_tensors is not None: + input_features = input_features.convert_to_tensors(return_tensors) + + return input_features diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/modeling.py new file mode 100644 index 000000000..47e4a61c2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/modeling.py @@ -0,0 +1,2285 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The LAION-AI Team and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import math +from dataclasses import dataclass +from typing import Any, List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.utils import recompute + +from paddlenlp.utils.log import logger + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from ..model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from .configuration import ClapAudioConfig, ClapConfig, ClapTextConfig + +CLAP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "laion/clap-htsat-fused", + "laion/clap-htsat-unfused", + # See all clap models at https://huggingface.co/models?filter=clap +] + + +__all__ = [ + "ClapTextModelWithProjection", + "ClapAudioModelWithProjection", + "ClapModel", + "ClapAudioConfig", + "ClapAudioModel", + "ClapTextModel", +] + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +# Adapted from: https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/utils.py#L191 +def interpolate(hidden_states, ratio): + """ + Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN. + + Args: + hidden_states (`paddle.Tensor` of shape (batch_size, time_length, classes_num)): + Input hidden states + ratio (`int`): + The ratio of the length of the output to the length of the input. + """ + (batch_size, time_length, classes_num) = hidden_states.shape + upsampled = hidden_states[:, :, None, :].tile([1, 1, ratio, 1]) + upsampled = upsampled.reshape([batch_size, time_length * ratio, classes_num]) + return upsampled + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L249 +def window_partition(hidden_states, window_size): + """ + Returns the resized hidden states. 
The output shape should be `(batch_size * num_windows, window_size, window_size, + num_channels)` + + Args: + hidden_states (`paddle.Tensor` of shape `(batch_size, height, width, num_channels)`): + Input hidden states + window_size (`int`): + Window size + """ + batch_size, height, width, num_channels = hidden_states.shape + + hidden_states = hidden_states.reshape( + [batch_size, height // window_size, window_size, width // window_size, window_size, num_channels] + ) + windows = hidden_states.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, num_channels]) + return windows + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263 +def window_reverse(windows, window_size, height, width): + """ + Args: + windows (`paddle.Tensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`): + Input windows + window_size (`int`): + Window size + height (`int`): + Height of the resized audio + width (`int`): + Width of the resized audio + """ + batch_size = int(windows.shape[0] / (height * width / window_size / window_size)) + + hidden_states = windows.reshape( + [batch_size, height // window_size, width // window_size, window_size, window_size, -1] + ) + hidden_states = hidden_states.transpose([0, 1, 3, 2, 4, 5]).reshape([batch_size, height, width, -1]) + return hidden_states + + +# Copied from paddlenlp.transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: paddle.Tensor x: + + Returns: paddle.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.cast("int32").not_equal(paddle.to_tensor([padding_idx], dtype="int32")).cast("int32") + incremental_indices = (paddle.cumsum(mask, axis=1).cast(mask.dtype) + past_key_values_length) * mask + return incremental_indices.cast("int64") + padding_idx + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html#CLIP-loss-function +def contrastive_loss(logits: paddle.Tensor) -> paddle.Tensor: + labels = paddle.arange(len(logits)) + return nn.functional.cross_entropy(logits, labels) + + +@dataclass +# Copied from paddlenlp.transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Clap +class ClapTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ClapAudioModelOutput(ModelOutput): + """ + ClapAudio model output to mimic the output of the original implementation. + + Args: + audio_embeds (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + The Audio embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + audio_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +# Copied from paddlenlp.transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Clap, vision->audio, Vision->Audio, image->audio +class ClapOutput(ModelOutput): + """ + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for audio-text similarity. + logits_per_audio:(`paddle.Tensor` of shape `(audio_batch_size, text_batch_size)`): + The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text + similarity scores. + logits_per_text:(`paddle.Tensor` of shape `(text_batch_size, audio_batch_size)`): + The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio + similarity scores. + text_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`]. 
+ audio_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`ClapTextModel`]. + audio_model_output(`BaseModelOutputWithPooling`): + The output of the [`ClapAudioModel`]. + """ + + loss: Optional[paddle.Tensor] = None + logits_per_audio: paddle.Tensor = None + logits_per_text: paddle.Tensor = None + text_embeds: paddle.Tensor = None + audio_embeds: paddle.Tensor = None + text_model_output: BaseModelOutputWithPooling = None + audio_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Adapted from paddlenlp.transformers.models.swin.modeling_swin.SwinDropPath +class ClapDropPath(nn.Layer): + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly + refactored version of the `SwinDropPath` implementation. + """ + + def __init__(self, drop_prob=None): + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states): + if self.drop_prob == 0.0 or not self.training: + return hidden_states + + keep_prob = 1 - self.drop_prob + # work with diff dim tensors, not just 2D ConvNets + shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1) + + random_tensor = keep_prob + paddle.rand(shape, dtype=hidden_states.dtype) + random_tensor = paddle.floor(random_tensor) # binarize + output = (hidden_states / keep_prob) * random_tensor + return output + + +# Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/feature_fusion.py#L133 +class ClapAudioAFFBlock(nn.Layer): + r""" + ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement + the 1D version. + """ + + def __init__(self, config: ClapAudioConfig): + super().__init__() + channels = config.patch_embeds_hidden_size + downsize_ratio = config.aff_block_r + inter_channels = int(channels // downsize_ratio) + + self.local_att = nn.Sequential( + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + self.global_att = nn.Sequential( + nn.AdaptiveAvgPool2D(1), + nn.Conv2D(channels, inter_channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(inter_channels), + nn.ReLU(), + nn.Conv2D(inter_channels, channels, kernel_size=1, stride=1, padding=0), + nn.BatchNorm2D(channels), + ) + + self.sigmoid = nn.Sigmoid() + + def forward(self, hidden_states, residual): + attention_input = hidden_states + residual + + fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input) + fused_layer_output = self.sigmoid(fused_layer_output) + + output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output) + return output + + +class ClapAudioPatchEmbed(nn.Layer): + """ + This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the + Transformer block. 
+ """ + + def __init__(self, config: ClapAudioConfig): + super().__init__() + img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size + patch_size = ( + (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size + ) + patch_stride = ( + (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride + ) + + self.img_size = img_size + self.patch_stride = patch_stride + + self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + + self.flatten = config.flatten_patch_embeds + self.enable_fusion = config.enable_fusion + + padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2) + + scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1 + + self.proj = nn.Conv2D( + config.patch_embed_input_channels * scale_factor, + config.patch_embeds_hidden_size, + kernel_size=patch_size, + stride=patch_stride, + padding=padding, + ) + + self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity() + if self.enable_fusion: + self.fusion_model = ClapAudioAFFBlock(config) + self.mel_conv2d = nn.Conv2D( + config.patch_embed_input_channels, + config.patch_embeds_hidden_size, + kernel_size=(patch_size[0], patch_size[1] * 3), + stride=(patch_stride[0], patch_stride[1] * 3), + padding=padding, + ) + + def forward(self, hidden_states, is_longer_idx=None): + if self.enable_fusion: + # retrieve the last mel as we have transposed the input + global_hidden_states = hidden_states[:, 0:1, :, :] + + # global processing + batch_size, num_channels, height, width = global_hidden_states.shape + + if height != self.img_size[0] or width != self.img_size[1]: + raise ValueError( + f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + ) + + global_hidden_states = self.proj(global_hidden_states) + output_width = global_hidden_states.shape[-1] + if len(is_longer_idx) > 0: + # local processing + local_hidden_states = paddle.gather(hidden_states[:, 1:, :, :], is_longer_idx, axis=0) + + batch_size, num_channels, height, width = local_hidden_states.shape + local_hidden_states = local_hidden_states.reshape([batch_size * num_channels, 1, height, width]) + + local_hidden_states = self.mel_conv2d(local_hidden_states) + + _, features, height, width = local_hidden_states.shape + local_hidden_states = local_hidden_states.reshape([batch_size, num_channels, features, height, width]) + local_hidden_states = local_hidden_states.transpose((0, 2, 3, 1, 4)).flatten(3) + + local_width = local_hidden_states.shape[-1] + + local_hidden_states = nn.functional.pad( + local_hidden_states, (0, output_width - local_width, 0, 0), mode="constant", value=0.0 + ) + + global_hidden_states[is_longer_idx] = self.fusion_model( + paddle.gather(global_hidden_states, is_longer_idx, axis=0), local_hidden_states + ) + hidden_states = global_hidden_states + else: + _, _, height, width = hidden_states.shape + if height != self.img_size[0] or width != self.img_size[1]: + raise ValueError( + f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
+ ) + hidden_states = self.proj(hidden_states) + + if self.flatten: + hidden_states = hidden_states.flatten(2).transpose([0, 2, 1]) + hidden_states = self.norm(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinSelfAttention with Swin->ClapAudio +class ClapAudioSelfAttention(nn.Layer): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + if dim % num_heads != 0: + raise ValueError( + f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})" + ) + + self.num_attention_heads = num_heads + self.attention_head_size = int(dim / num_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.window_size = ( + window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size) + ) + self.relative_position_bias_table = Parameter( + paddle.zeros([(2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads]) + ) + + # get pair-wise relative position index for each token inside the window + coords_h = paddle.arange(self.window_size[0]) + coords_w = paddle.arange(self.window_size[1]) + coords = paddle.stack(paddle.meshgrid([coords_h, coords_w], indexing="ij")) + coords_flatten = paddle.flatten(coords, 1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + self.register_buffer("relative_position_index", relative_position_index) + + self.query = nn.Linear(self.all_head_size, self.all_head_size, bias_attr=config.qkv_bias) + self.key = nn.Linear(self.all_head_size, self.all_head_size, bias_attr=config.qkv_bias) + self.value = nn.Linear(self.all_head_size, self.all_head_size, bias_attr=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + batch_size, dim, num_channels = hidden_states.shape + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
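+ # Shapes at this point (windowed attention): query/key/value_layer are
+ # (batch_size * num_windows, num_heads, window_area, head_dim), so the matmul below yields
+ # (batch_size * num_windows, num_heads, window_area, window_area) raw scores per window.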
+ attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.reshape([-1])] + relative_position_bias = relative_position_bias.reshape( + [self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1] + ) + + relative_position_bias = relative_position_bias.transpose([2, 0, 1]) + attention_scores = attention_scores + relative_position_bias.unsqueeze(0) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ClapAudioModel forward() function) + mask_shape = attention_mask.shape[0] + attention_scores = attention_scores.reshape( + [batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim] + ) + attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0) + attention_scores = attention_scores.reshape([-1, self.num_attention_heads, dim, dim]) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinSelfOutput with Swin->ClapAudio +class ClapAudioSelfOutput(nn.Layer): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinAttention with Swin->ClapAudio +class ClapAudioAttention(nn.Layer): + def __init__(self, config, dim, num_heads, window_size): + super().__init__() + self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size) + self.output = ClapAudioSelfOutput(config, dim) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, 
+ attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinIntermediate with Swin->ClapAudio +class ClapAudioIntermediate(nn.Layer): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(dim, int(config.mlp_ratio * dim)) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinOutput with Swin->ClapAudio +class ClapAudioOutput(nn.Layer): + def __init__(self, config, dim): + super().__init__() + self.dense = nn.Linear(int(config.mlp_ratio * dim), dim) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinLayer with SwinDropPath->ClapDropPath, Swin->ClapAudio +class ClapAudioLayer(nn.Layer): + def __init__(self, config, dim, input_resolution, num_heads, shift_size=0): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.shift_size = shift_size + self.window_size = config.window_size + self.input_resolution = input_resolution + self.layernorm_before = nn.LayerNorm(dim, epsilon=config.layer_norm_eps) + self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size) + self.drop_path = ClapDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(dim, epsilon=config.layer_norm_eps) + self.intermediate = ClapAudioIntermediate(config, dim) + self.output = ClapAudioOutput(config, dim) + + def set_shift_and_window_size(self, input_resolution): + if min(input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(input_resolution) + + def get_attn_mask(self, height, width, dtype): + if self.shift_size > 0: + # calculate attention mask for SW-MSA + img_mask = paddle.zeros((1, height, width, 1), dtype=dtype) + height_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + width_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + count = 0 + for height_slice in height_slices: + for width_slice in width_slices: + img_mask[:, height_slice, width_slice, :] = count + count += 1 + + mask_windows = window_partition(img_mask, self.window_size) + mask_windows = mask_windows.reshape([-1, self.window_size * self.window_size]) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = masked_fill(attn_mask, attn_mask != 0, float(-100.0)) + attn_mask = 
masked_fill(attn_mask, attn_mask == 0, float(0.0)) + else: + attn_mask = None + return attn_mask + + def maybe_pad(self, hidden_states, height, width): + pad_right = (self.window_size - width % self.window_size) % self.window_size + pad_bottom = (self.window_size - height % self.window_size) % self.window_size + # (padding_left,padding_right,padding_top, padding_bottom) + pad_values = (0, 0, 0, pad_right, 0, pad_bottom) + # hidden_states = nn.functional.pad(hidden_states, pad_values) + # TODO(wugaosheng): torch pad is different from paddle pad + hidden_states = nn.functional.pad(hidden_states, (0, pad_right, 0, pad_bottom), data_format="NHWC") + + return hidden_states, pad_values + + def forward( + self, + hidden_states: paddle.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + if not always_partition: + self.set_shift_and_window_size(input_dimensions) + else: + pass + height, width = input_dimensions + batch_size, _, channels = hidden_states.shape + shortcut = hidden_states + + hidden_states = self.layernorm_before(hidden_states) + + hidden_states = hidden_states.reshape([batch_size, height, width, channels]) + + # pad hidden_states to multiples of window size + hidden_states, pad_values = self.maybe_pad(hidden_states, height, width) + + _, height_pad, width_pad, _ = hidden_states.shape + # cyclic shift + if self.shift_size > 0: + shifted_hidden_states = paddle.roll( + hidden_states, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2) + ) + else: + shifted_hidden_states = hidden_states + + # partition windows + hidden_states_windows = window_partition(shifted_hidden_states, self.window_size) + hidden_states_windows = hidden_states_windows.reshape([-1, self.window_size * self.window_size, channels]) + attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype) + + attention_outputs = self.attention( + hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions + ) + + attention_output = attention_outputs[0] + + attention_windows = attention_output.reshape([-1, self.window_size, self.window_size, channels]) + shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad) + + # reverse cyclic shift + if self.shift_size > 0: + attention_windows = paddle.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), axis=(1, 2)) + else: + attention_windows = shifted_windows + + was_padded = pad_values[3] > 0 or pad_values[5] > 0 + if was_padded: + attention_windows = attention_windows[:, :height, :width, :] + + attention_windows = attention_windows.reshape([batch_size, height * width, channels]) + + hidden_states = shortcut + self.drop_path(attention_windows) + + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + layer_output = hidden_states + self.output(layer_output) + + layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,) + return layer_outputs + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinStage with Swin->ClapAudio +class ClapAudioStage(nn.Layer): + def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample): + super().__init__() + self.config = config + self.dim = dim + self.blocks = nn.LayerList( + [ + ClapAudioLayer( + config=config, + dim=dim, + input_resolution=input_resolution, + 
num_heads=num_heads, + shift_size=0 if (i % 2 == 0) else config.window_size // 2, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm) + else: + self.downsample = None + + self.pointing = False + + def forward( + self, + hidden_states: paddle.Tensor, + input_dimensions: Tuple[int, int], + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + always_partition: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + height, width = input_dimensions + for i, layer_module in enumerate(self.blocks): + layer_head_mask = head_mask[i] if head_mask is not None else None + + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = hidden_states + if self.downsample is not None: + height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2 + output_dimensions = (height, width, height_downsampled, width_downsampled) + hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions) + else: + output_dimensions = (height, width, height, width) + + stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions) + + if output_attentions: + stage_outputs += layer_outputs[1:] + return stage_outputs + + +# Copied from paddlenlp.transformers.models.swin.modeling_swin.SwinPatchMerging with Swin->ClapAudio +class ClapAudioPatchMerging(nn.Layer): + """ + Patch Merging Layer. + + Args: + input_resolution (`Tuple[int]`): + Resolution of input feature. + dim (`int`): + Number of input channels. + norm_layer (`nn.Layer`, *optional*, defaults to `nn.LayerNorm`): + Normalization layer class. 
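+
+ A minimal usage sketch (illustrative only; any resolution with even height and width behaves the same way):
+
+ ```python
+ >>> import paddle
+ >>> merge = ClapAudioPatchMerging(input_resolution=(8, 8), dim=96)
+ >>> tokens = paddle.randn([2, 8 * 8, 96])  # (batch, height * width, channels)
+ >>> merge(tokens, (8, 8)).shape  # [2, 16, 192]: 2x2 neighbours concatenated, then projected to 2 * dim
+ ```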
+ """ + + def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Layer = nn.LayerNorm) -> None: + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) + self.norm = norm_layer(4 * dim) + + def maybe_pad(self, input_feature, height, width): + should_pad = (height % 2 == 1) or (width % 2 == 1) + if should_pad: + pad_values = (0, width % 2, 0, height % 2) + input_feature = nn.functional.pad(input_feature, pad_values, data_format="NHWC") + + return input_feature + + def forward(self, input_feature: paddle.Tensor, input_dimensions: Tuple[int, int]) -> paddle.Tensor: + height, width = input_dimensions + # `dim` is height * width + batch_size, dim, num_channels = input_feature.shape + + input_feature = input_feature.reshape([batch_size, height, width, num_channels]) + # pad input to be disible by width and height, if needed + input_feature = self.maybe_pad(input_feature, height, width) + # [batch_size, height/2, width/2, num_channels] + input_feature_0 = input_feature[:, 0::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_1 = input_feature[:, 1::2, 0::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_2 = input_feature[:, 0::2, 1::2, :] + # [batch_size, height/2, width/2, num_channels] + input_feature_3 = input_feature[:, 1::2, 1::2, :] + # batch_size height/2 width/2 4*num_channels + input_feature = paddle.concat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1) + input_feature = input_feature.reshape([batch_size, -1, 4 * num_channels]) # batch_size height/2*width/2 4*C + + input_feature = self.norm(input_feature) + input_feature = self.reduction(input_feature) + + return input_feature + + +class ClapAudioEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.num_layers = len(config.depths) + + self.config = config + self.patch_embed = ClapAudioPatchEmbed(config) + self.enable_fusion = config.enable_fusion + self.patch_stride = self.patch_embed.patch_stride + self.spec_size = config.spec_size + self.freq_ratio = config.spec_size // config.num_mel_bins + + self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1)) + + drop_path_rate = [x.item() for x in paddle.linspace(0, config.drop_path_rate, sum(config.depths))] + + grid_size = self.patch_embed.grid_size + self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)] + + self.layers = nn.LayerList( + [ + ClapAudioStage( + config=config, + dim=int(config.patch_embeds_hidden_size * 2**i_layer), + input_resolution=self.input_resolutions[i_layer], + depth=config.depths[i_layer], + num_heads=config.num_attention_heads[i_layer], + drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])], + downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None, + ) + for i_layer in range(self.num_layers) + ] + ) + + self.gradient_checkpointing = False + + self.batch_norm = nn.BatchNorm2D(config.num_mel_bins, momentum=0.1) + self.norm = nn.LayerNorm(self.num_features) + self.depths = config.depths + self.avgpool = nn.AdaptiveAvgPool1D(1) + + def reshape_mel2img(self, normalized_input_features): + """ + The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel + should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`]. 
+ """ + _, _, time_length, freq_length = normalized_input_features.shape + + spec_width = int(self.spec_size * self.freq_ratio) + spec_heigth = self.spec_size // self.freq_ratio + + if time_length > spec_width or freq_length > spec_heigth: + raise ValueError("the wav size should be less than or equal to the swin input size") + + # to avoid bicubic zero error + if time_length < spec_width: + normalized_input_features = nn.functional.interpolate( + normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True + ) + if freq_length < spec_heigth: + normalized_input_features = nn.functional.interpolate( + normalized_input_features, (time_length, spec_heigth), mode="bicubic", align_corners=True + ) + + batch, channels, time, freq = normalized_input_features.shape + + # batch_size, channels, spec_width, spec_heigth --> batch_size, channels, spec_heigth * freq_ratio, spec_width // freq_ratio + normalized_input_features = normalized_input_features.reshape( + [batch, channels * self.freq_ratio, time // self.freq_ratio, freq] + ) + normalized_input_features = normalized_input_features.transpose([0, 1, 3, 2]) + normalized_input_features = normalized_input_features.reshape( + [batch, channels, freq * self.freq_ratio, time // self.freq_ratio] + ) + + return normalized_input_features + + def forward( + self, + input_features, + is_longer: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + output_hidden_states_before_downsampling: Optional[bool] = False, + always_partition: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, ClapAudioModelOutput]: + + input_features = input_features.transpose([0, 3, 2, 1]) + normalized_input_features = self.batch_norm(input_features) + normalized_input_features = normalized_input_features.transpose([0, 3, 2, 1]) + is_longer_list_idx = None + if self.enable_fusion: + is_longer_list = is_longer + is_longer_list_idx = paddle.where(is_longer_list == 1)[0] + + hidden_states = self.reshape_mel2img(normalized_input_features) + + frames_num = hidden_states.shape[2] + + hidden_states = self.patch_embed(hidden_states, is_longer_list_idx) + + all_hidden_states = () if output_hidden_states else None + all_reshaped_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + input_dimensions = self.input_resolutions[0] + + if output_hidden_states: + batch_size, _, hidden_size = hidden_states.shape + # rearrange batch_size (height width) channels -> batch_size channel height width + reshaped_hidden_state = hidden_states.reshape([batch_size, *input_dimensions, hidden_size]) + reshaped_hidden_state = reshaped_hidden_state.transpose([0, 3, 1, 2]) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + for i, layer_module in enumerate(self.layers): + layer_head_mask = head_mask[i] if head_mask is not None else None + + input_dimensions = self.input_resolutions[i] + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), hidden_states, input_dimensions, layer_head_mask + ) + + else: + layer_outputs = layer_module( + hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition + ) + + 
hidden_states = layer_outputs[0] + + hidden_states_before_downsampling = layer_outputs[1] + output_dimensions = layer_outputs[2] + + input_dimensions = (output_dimensions[-2], output_dimensions[-1]) + + if output_hidden_states and output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states_before_downsampling.shape + # rearrange batch_size (height width) channels -> batch_size channel height width + # here we use the original (not downsampled) height and width + reshaped_hidden_state = hidden_states_before_downsampling.reshape( + [batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size] + ) + reshaped_hidden_state = reshaped_hidden_state.transpose([0, 3, 1, 2]) + all_hidden_states += (hidden_states_before_downsampling,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + elif output_hidden_states and not output_hidden_states_before_downsampling: + batch_size, _, hidden_size = hidden_states.shape + # rearrange batch_size (height width) channels -> batch_size channel height width + reshaped_hidden_state = hidden_states.reshape([batch_size, *input_dimensions, hidden_size]) + reshaped_hidden_state = reshaped_hidden_state.transpose([0, 3, 1, 2]) + all_hidden_states += (hidden_states,) + all_reshaped_hidden_states += (reshaped_hidden_state,) + + if output_attentions: + all_self_attentions += layer_outputs[3:] + + last_hidden_state = self.norm(hidden_states) + + batch_size, _, n_channels = last_hidden_state.shape + + freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0] + temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1] + + last_hidden_state = last_hidden_state.transpose([0, 2, 1]).reshape( + [batch_size, n_channels, freq_shape, temporal_shape] + ) + + batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape + # group 2D CNN + c_freq_bin = n_frequencies // self.freq_ratio + last_hidden_state = last_hidden_state.reshape( + [batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp] + ) + last_hidden_state = last_hidden_state.transpose([0, 1, 3, 2, 4]).reshape( + [batch_size, n_channels, c_freq_bin, -1] + ) + latent_output = self.avgpool(paddle.flatten(last_hidden_state, 2)) + latent_output = paddle.flatten(latent_output, 1) + + if not return_dict: + return tuple( + v + for v in [ + last_hidden_state, + latent_output, + all_reshaped_hidden_states, + all_self_attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=latent_output, + hidden_states=all_reshaped_hidden_states, + attentions=all_self_attentions, + ) + + +class ClapProjectionLayer(nn.Layer): + def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]): + super().__init__() + self.config = config + hidden_size = config.hidden_size + projection_dim = config.projection_dim + + self.linear1 = nn.Linear(hidden_size, projection_dim) + self.activation = ACT2FN[config.projection_hidden_act] + self.linear2 = nn.Linear(projection_dim, projection_dim) + + def forward(self, hidden_states): + hidden_states = self.linear1(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->ClapText, persistent=False->persistent=True +class ClapTextEmbeddings(nn.Layer): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
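+
+ A small sketch of that tweak (illustrative only; assumes `pad_token_id=1`, as in RoBERTa-style checkpoints):
+
+ ```python
+ >>> import paddle
+ >>> input_ids = paddle.to_tensor([[5, 6, 7, 1, 1]])  # 1 is the padding token
+ >>> mask = (input_ids != 1).cast("int64")
+ >>> paddle.cumsum(mask, axis=1) * mask + 1  # -> [[2, 3, 4, 1, 1]]: padded positions stay at padding_idx
+ ```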
+ """ + + # Copied from paddlenlp.transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", paddle.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer("token_type_ids", paddle.zeros(self.position_ids.shape, dtype="int64"), persistable=True) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([input_shape[0], seq_length]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + token_type_embeddings = self.token_type_embeddings(token_type_ids.cast("int64")) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: paddle.Tensor + + Returns: paddle.Tensor + """ + input_shape = inputs_embeds.shape[:-1] + sequence_length = input_shape[1] + + position_ids = paddle.arange(self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype="int64") + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertSelfAttention with Bert->ClapText +class ClapTextSelfAttention(nn.Layer): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: paddle.Tensor) -> paddle.Tensor: + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. 
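+ # Four cases are handled below: cached cross-attention (reuse the projected encoder keys/values),
+ # fresh cross-attention (project encoder_hidden_states), cached self-attention (concatenate the
+ # cached keys/values with the new ones), and plain self-attention.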
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(paddle.Tensor, paddle.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(paddle.Tensor, paddle.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = paddle.to_tensor([key_length - 1], dtype="int64").reshape([-1, 1]) + else: + position_ids_l = paddle.arange(query_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(key_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in ClapTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
+ attention_probs = nn.functional.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertSelfOutput +class ClapTextSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertAttention with Bert->ClapText +class ClapTextAttention(nn.Layer): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = ClapTextSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = ClapTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertIntermediate +class ClapTextIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = 
nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertOutput +class ClapTextOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertLayer with Bert->ClapText +class ClapTextLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ClapTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = ClapTextAttention(config, position_embedding_type="absolute") + self.intermediate = ClapTextIntermediate(config) + self.output = ClapTextOutput(config) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, 
+ ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertEncoder with Bert->ClapText +class ClapTextEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[paddle.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from paddlenlp.transformers.models.bert.modeling_bert.BertPooler +class ClapTextPooler(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ClapPreTrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
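+
+ For reference, a minimal sketch of the additive attention-mask convention used by
+ `get_extended_attention_mask` below (illustrative only):
+
+ ```python
+ >>> import paddle
+ >>> attention_mask = paddle.to_tensor([[1.0, 1.0, 1.0, 0.0]])  # 1 = attend, 0 = padding
+ >>> extended = attention_mask[:, None, None, :]  # broadcastable over heads and query positions
+ >>> (1.0 - extended) * -10000.0  # 0 for kept positions, -10000 added to masked logits
+ ```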
+ """ + + config_class = ClapConfig + base_model_prefix = "clap" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + + if isinstance(module, ClapTextEmbeddings): + normal_(module.position_embeddings.weight, mean=0.0, std=factor * 0.02) + normal_(module.token_type_embeddings.weight, mean=0.0, std=factor * 0.02) + elif isinstance(module, ClapModel): + normal_(module.logit_scale_a, std=factor * 0.02) + normal_(module.logit_scale_t, std=factor * 0.02) + elif isinstance(module, nn.Embedding): + normal_(module.weight, mean=0.0, std=factor * 0.02) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, (nn.Conv2D, nn.Linear)): + in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor + normal_(module.weight, std=in_proj_std) + if module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, ClapTextEncoder): + module.gradient_checkpointing = value + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. 
+ is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + +class ClapAudioModel(ClapPreTrainedModel): + config_class = ClapAudioConfig + main_input_name = "input_features" + + def __init__(self, config: ClapAudioConfig): + super().__init__(config) + self.audio_encoder = ClapAudioEncoder(config) + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.audio_encoder.patch_embed.proj + + def forward( + self, + input_features: Optional[paddle.Tensor] = None, + is_longer: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from datasets import load_dataset + >>> from paddlenlp.transformers import AutoProcessor, ClapAudioModel + + >>> dataset = load_dataset("ashraq/esc50") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + + >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused") + >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused") + + >>> inputs = processor(audios=audio_sample, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + return self.audio_encoder( + input_features=input_features, + is_longer=is_longer, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class ClapTextModel(ClapPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. 
To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + config_class = ClapTextConfig + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from paddlenlp.transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = ClapTextEmbeddings(config) + self.encoder = ClapTextEncoder(config) + + self.pooler = ClapTextPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + # Copied from paddlenlp.transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[paddle.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand([batch_size, seq_length]) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: paddle.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, 
pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class ClapModel(ClapPreTrainedModel): + config_class = ClapConfig + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config: ClapConfig): + super().__init__(config) + + if not isinstance(config.text_config, ClapTextConfig): + raise ValueError( + "config.text_config is expected to be of type ClapTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.audio_config, ClapAudioConfig): + raise ValueError( + "config.audio_config is expected to be of type ClapAudioConfig but is of type" + f" {type(config.audio_config)}." + ) + + text_config = config.text_config + audio_config = config.audio_config + + self.logit_scale_a = Parameter(paddle.ones([1]) * np.log(config.logit_scale_init_value)) + self.logit_scale_t = Parameter(paddle.ones([1]) * np.log(config.logit_scale_init_value)) + self.projection_dim = config.projection_dim + + self.text_model = ClapTextModel(text_config) + self.text_projection = ClapProjectionLayer(text_config) + + self.audio_model = ClapAudioModel(audio_config) + self.audio_projection = ClapProjectionLayer(audio_config) + + # Initialize weights and apply final processing + self.init_weights() + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Returns: + text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`ClapTextModel`]. + + Examples: + + ```python + >>> from paddlenlp.transformers import AutoTokenizer, ClapModel + + >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") + >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pd") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. 
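+        # When the caller leaves `output_attentions`, `output_hidden_states` or `return_dict` as `None`, the +        # three assignments below fall back to the corresponding fields of the model config, so explicit +        # keyword arguments always take precedence over the config defaults.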
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions +        output_hidden_states = ( +            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states +        ) +        return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +        text_outputs = self.text_model( +            input_ids=input_ids, +            attention_mask=attention_mask, +            position_ids=position_ids, +            output_attentions=output_attentions, +            output_hidden_states=output_hidden_states, +            return_dict=return_dict, +        ) + +        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output +        text_features = self.text_projection(pooled_output) +        text_features = F.normalize(text_features, axis=-1) + +        return text_features + +    def get_audio_features( +        self, +        input_features: Optional[paddle.Tensor] = None, +        is_longer: Optional[paddle.Tensor] = None, +        attention_mask: Optional[paddle.Tensor] = None, +        output_attentions: Optional[bool] = None, +        output_hidden_states: Optional[bool] = None, +        return_dict: Optional[bool] = None, +    ) -> paddle.Tensor: +        r""" +        Returns: +            audio_features (`paddle.Tensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by +            applying the projection layer to the pooled output of [`ClapAudioModel`]. + +        Examples: + +        ```python +        >>> from paddlenlp.transformers import AutoFeatureExtractor, ClapModel +        >>> import paddle + +        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") +        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused") +        >>> random_audio = paddle.rand([16_000]) +        >>> inputs = feature_extractor(random_audio, return_tensors="pd") +        >>> audio_features = model.get_audio_features(**inputs) +        ```""" +        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions +        output_hidden_states = ( +            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states +        ) +        return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +        audio_outputs = self.audio_model( +            input_features=input_features, +            is_longer=is_longer, +            return_dict=return_dict, +        ) + +        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + +        audio_features = self.audio_projection(pooled_output) +        audio_features = F.normalize(audio_features, axis=-1) + +        return audio_features + +    def forward( +        self, +        input_ids: Optional[paddle.Tensor] = None, +        input_features: Optional[paddle.Tensor] = None, +        is_longer: Optional[paddle.Tensor] = None, +        attention_mask: Optional[paddle.Tensor] = None, +        position_ids: Optional[paddle.Tensor] = None, +        return_loss: Optional[bool] = None, +        output_attentions: Optional[bool] = None, +        output_hidden_states: Optional[bool] = None, +        return_dict: Optional[bool] = None, +    ) -> Union[Tuple, ClapOutput]: +        r""" +        Returns: + +        Examples: + +        ```python +        >>> from datasets import load_dataset +        >>> from paddlenlp.transformers import AutoProcessor, ClapModel +        >>> import paddle.nn.functional as F + +        >>> dataset = load_dataset("ashraq/esc50") +        >>> audio_sample = dataset["train"]["audio"][0]["array"] + +        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused") +        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused") + +        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"] + +        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pd", padding=True) + +        >>> outputs =
model(**inputs) + >>> logits_per_audio = outputs.logits_per_audio # this is the audio-text similarity score + >>> probs = F.softmax(logits_per_audio, axis=-1) # we can take the softmax to get the label probabilities + ```""" + # Use CLAP model's config for some fields (if specified) instead of those of audio & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + audio_outputs = self.audio_model( + input_features=input_features, + is_longer=is_longer, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + audio_embeds = self.audio_projection(audio_embeds) + + text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output + text_embeds = self.text_projection(text_embeds) + + # normalized features + audio_embeds = audio_embeds / audio_embeds.norm(p=2, axis=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True) + + # cosine similarity as logits + logit_scale_text = self.logit_scale_t.exp() + logit_scale_audio = self.logit_scale_a.exp() + logits_per_text = paddle.matmul(text_embeds, audio_embeds.t()) * logit_scale_text + logits_per_audio = paddle.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio + + loss = None + if return_loss: + caption_loss = contrastive_loss(logits_per_text) + audio_loss = contrastive_loss(logits_per_audio.t()) + loss = (caption_loss + audio_loss) / 2.0 + + if not return_dict: + output = (logits_per_audio, logits_per_text, text_embeds, audio_embeds, text_outputs, audio_outputs) + return ((loss,) + output) if loss is not None else output + + return ClapOutput( + loss=loss, + logits_per_audio=logits_per_audio, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + audio_embeds=audio_embeds, + text_model_output=text_outputs, + audio_model_output=audio_outputs, + ) + + +class ClapTextModelWithProjection(ClapPreTrainedModel): + config_class = ClapTextConfig + + def __init__(self, config: ClapTextConfig): + super().__init__(config) + self.text_model = ClapTextModel(config) + self.text_projection = ClapProjectionLayer(config) + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.text_model.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, ClapTextModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from paddlenlp.transformers import AutoTokenizer, ClapTextModelWithProjection + + >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + >>> 
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output + + text_embeds = self.text_projection(pooled_output) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return ClapTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +class ClapAudioModelWithProjection(ClapPreTrainedModel): + config_class = ClapAudioConfig + main_input_name = "input_features" + + def __init__(self, config: ClapAudioConfig): + super().__init__(config) + self.audio_model = ClapAudioModel(config) + self.audio_projection = ClapProjectionLayer(config) + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self) -> nn.Layer: + return self.audio_model.audio_encoder.patch_embed.proj + + def forward( + self, + input_features: Optional[paddle.Tensor] = None, + is_longer: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, ClapAudioModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from datasets import load_dataset + >>> from paddlenlp.transformers import ClapAudioModelWithProjection, ClapProcessor + + >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused") + >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused") + + >>> dataset = load_dataset("ashraq/esc50") + >>> audio_sample = dataset["train"]["audio"][0]["array"] + + >>> inputs = processor(audios=audio_sample, return_tensors="pd") + >>> outputs = model(**inputs) + >>> audio_embeds = outputs.audio_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + audio_outputs = self.audio_model( + input_features=input_features, + is_longer=is_longer, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output + + audio_embeds = self.audio_projection(pooled_output) + + if not return_dict: + outputs = (audio_embeds, audio_outputs[0]) + audio_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return ClapAudioModelOutput( + audio_embeds=audio_embeds, + last_hidden_state=audio_outputs.last_hidden_state, + attentions=audio_outputs.attentions, + hidden_states=audio_outputs.hidden_states, + ) diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/processing.py new file mode 100644 index 000000000..1c08b0e4a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clap/processing.py @@ -0,0 +1,120 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +#     http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Audio/Text processor class for CLAP +""" + +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding + +__all__ = [ +    "ClapProcessor", +] + + +class ClapProcessor(ProcessorMixin): +    r""" +    Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBERTa tokenizer into a single processor. + +    [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the +    [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. + +    Args: +        feature_extractor ([`ClapFeatureExtractor`]): +            The audio processor is a required input. +        tokenizer ([`RobertaTokenizerFast`]): +            The tokenizer is a required input. +    """ +    feature_extractor_class = "ClapFeatureExtractor" +    tokenizer_class = "RobertaTokenizer" + +    pretrained_init_configuration = {"laion/clap-htsat-unfused": {"do_lower_case": True}} + +    def __init__(self, feature_extractor, tokenizer): +        super().__init__(feature_extractor, tokenizer) + +    def __call__(self, text=None, audios=None, return_tensors=None, **kwargs): +        """ +        Main method to prepare one or several sequence(s) and audio(s) for the model. This method forwards the `text` +        and `kwargs` arguments to RobertaTokenizerFast's [`~RobertaTokenizerFast.__call__`] if `text` is not `None` to +        encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to +        ClapFeatureExtractor's [`~ClapFeatureExtractor.__call__`] if `audios` is not `None`. Please refer to the +        docstring of the above two methods for more information. + +        Args: +            text (`str`, `List[str]`, `List[List[str]]`): +                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings +                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set +                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). +            audios (`np.ndarray`, `paddle.Tensor`, `List[np.ndarray]`, `List[paddle.Tensor]`): +                The audio or batch of audios to be prepared. Each audio can be a NumPy array or a PaddlePaddle tensor. In case +                of a NumPy array/PaddlePaddle tensor, each audio should be of shape (C, T), where C is the number of channels +                and T is the sample length of the audio. + +            return_tensors (`str` or [`~utils.TensorType`], *optional*): +                If set, will return tensors of a particular framework. Acceptable values are: +                - `'pd'`: Return PaddlePaddle `paddle.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects. + + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **audio_features** -- Audio features to be fed to a model. Returned when `audios` is not `None`. + """ + sampling_rate = kwargs.pop("sampling_rate", None) + + if text is None and audios is None: + raise ValueError("You have to specify either text or audios. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if audios is not None: + audio_features = self.feature_extractor( + audios, sampling_rate=sampling_rate, return_tensors=return_tensors, **kwargs + ) + + if text is not None and audios is not None: + encoding["input_features"] = audio_features.input_features + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**audio_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + feature_extractor_input_names = self.feature_extractor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/configuration.py new file mode 100644 index 000000000..1cb7a4c95 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/configuration.py @@ -0,0 +1,509 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CLIP model configuration""" + +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..configuration_utils import ( + PretrainedConfig, + convert_to_legacy_config, + flatten_model_config, +) + +__all__ = [ + "CLIPTextConfig", + "CLIPVisionConfig", + "CLIPConfig", +] + + +class Old2NewPretrainedConfig(PretrainedConfig): + old_config_dict = [ + "image_resolution", + "vision_layers", + "vision_heads", + "vision_embed_dim", + "vision_patch_size", + "vision_mlp_ratio", + "vision_hidden_act", + "max_text_length", + "vocab_size", + "text_embed_dim", + "text_heads", + "text_layers", + "text_hidden_act", + "projection_dim", + "initializer_range", + "initializer_factor", + "logit_scale_init_value", + "init_class", + ] + text_name_mapping = { + "max_text_length": "max_position_embeddings", + "vocab_size": "vocab_size", + "text_embed_dim": "hidden_size", + "text_heads": "num_attention_heads", + "text_layers": "num_hidden_layers", + "text_hidden_act": "hidden_act", + "initializer_range": "initializer_range", + "initializer_factor": "initializer_factor", + "projection_dim": "projection_dim", + } + vision_name_mapping = { + "image_resolution": "image_size", + "vision_layers": "num_hidden_layers", + "vision_heads": "num_attention_heads", + "vision_embed_dim": "hidden_size", + "vision_patch_size": "patch_size", + "vision_hidden_act": "hidden_act", + "initializer_range": "initializer_range", + "initializer_factor": "initializer_factor", + "projection_dim": "projection_dim", + } + + @classmethod + def from_dict(cls, config_dict, **kwargs) -> "PretrainedConfig": + """ + Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters. + + Args: + config_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + [`PretrainedConfig`]: The configuration object instantiated from those parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + # Those arguments may be passed along for our internal telemetry. + # We remove them so they don't appear in `return_unused_kwargs`. + # convert local config to legacy config + # do standard config map: there are some old-school pretrained-config not refactored. + config_dict = convert_to_legacy_config(cls.attribute_map, config_dict) + config_dict = flatten_model_config(config_dict) + + # check old_config? 
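+        # A legacy (old-style) config is detected by flat keys such as "vision_layers" / "text_layers"; those keys +        # are popped below and re-mapped onto the nested new-style config via `text_name_mapping` and +        # `vision_name_mapping` (for example, an illustrative old key "text_layers" becomes +        # text_config["num_hidden_layers"]), with `intermediate_size` derived from the hidden size and, for the +        # vision branch, from the old "vision_mlp_ratio".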
+ is_old_config = "vision_layers" in config_dict or "text_layers" in config_dict + if is_old_config: + # convert to new_config + old_config_dict = {} + for old_name in cls.old_config_dict: + value = config_dict.pop(old_name, None) + if value is not None: + old_config_dict[old_name] = value + + # convert text config + if cls.model_type in ["clip", "clip_text_model"]: + text_config = {} + for old_name, new_name in cls.text_name_mapping.items(): + old_value = old_config_dict.get(old_name, None) + if old_value is not None: + text_config[new_name] = old_value + if "hidden_size" in text_config: + text_config["intermediate_size"] = 4 * text_config["hidden_size"] + + if cls.model_type == "clip": + config_dict["text_config_dict"] = text_config + else: + config_dict.update(text_config) + + # convert vision config + if cls.model_type in ["clip", "clip_vision_model"]: + vision_config = {} + for old_name, new_name in cls.vision_name_mapping.items(): + old_value = old_config_dict.get(old_name, None) + if old_value is not None: + vision_config[new_name] = old_value + if "hidden_size" in vision_config: + radio = old_config_dict.get("vision_mlp_ratio", 4) + vision_config["intermediate_size"] = radio * vision_config["hidden_size"] + if cls.model_type == "clip": + config_dict["vision_config_dict"] = vision_config + else: + config_dict.update(vision_config) + + if cls.model_type == "clip": + # convert common config + if "projection_dim" in old_config_dict: + config_dict["projection_dim"] = old_config_dict["projection_dim"] + if "logit_scale_init_value" in old_config_dict: + config_dict["logit_scale_init_value"] = old_config_dict["logit_scale_init_value"] + + config = cls(**config_dict) + + if hasattr(config, "pruned_heads"): + config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + if "num_labels" in kwargs and "id2label" in kwargs: + num_labels = kwargs["num_labels"] + id2label = kwargs["id2label"] if kwargs["id2label"] is not None else [] + if len(id2label) != num_labels: + raise ValueError( + f"You passed along `num_labels={num_labels }` with an incompatible id to label map: " + f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove " + "one of them." + ) + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + if key != "dtype": + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Model config {config}") + if return_unused_kwargs: + return config, kwargs + else: + return config + + +class CLIPTextConfig(Old2NewPretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate an CLIP + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`CLIPModel`]. 
+ hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from paddlenlp.transformers import CLIPTextConfig, CLIPTextModel + + >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPTextConfig() + + >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "clip_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + # This differs from `CLIPTokenizer`'s default and from openai/clip + # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538 + pad_token_id=1, + bos_token_id=49406, + eos_token_id=49407, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: 
Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + projection_dim = config_dict.get("projection_dim", None) + config_dict = config_dict["text_config"] + if projection_dim is not None: + config_dict["projection_dim"] = projection_dim + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + return cls.from_dict(config_dict, **kwargs) + + +class CLIPVisionConfig(Old2NewPretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate an CLIP + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, + defaults to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
+ + Example: + + ```python + >>> from paddlenlp.transformers import CLIPVisionConfig, CLIPVisionModel + + >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPVisionConfig() + + >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clip_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPConfig + if config_dict.get("model_type") == "clip": + projection_dim = config_dict.get("projection_dim", None) + config_dict = config_dict["vision_config"] + if projection_dim is not None: + config_dict["projection_dim"] = projection_dim + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPConfig(Old2NewPretrainedConfig): + r""" + [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate + CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating a + configuration with the defaults will yield a similar configuration to that of the CLIP + [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimentionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. 
Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from paddlenlp.transformers import CLIPConfig, CLIPModel + + >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration + >>> configuration = CLIPConfig() + + >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration + >>> model = CLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig + + >>> # Initializing a CLIPText and CLIPVision configuration + >>> config_text = CLIPTextConfig() + >>> config_vision = CLIPVisionConfig() + + >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "clip" + is_composition = True + + def __init__( + self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + # If `_config_dict` exist, we use them for the backward compatibility. + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the CLIPTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the CLIPVisionConfig with default values.") + + text_config["projection_dim"] = projection_dim + vision_config["projection_dim"] = projection_dim + self.text_config = CLIPTextConfig(**text_config) + self.vision_config = CLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model + configuration. + + Returns: + [`CLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/feature_extraction.py new file mode 100644 index 000000000..7baf66ffd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/feature_extraction.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for CLIP.""" + +__all__ = ["CLIPFeatureExtractor"] + + +import warnings + +from .image_processing import CLIPImageProcessor + + +class CLIPFeatureExtractor(CLIPImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of PaddleNLP. Please" + " use CLIPImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/image_processing.py new file mode 100644 index 000000000..f14c88a44 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/image_processing.py @@ -0,0 +1,327 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for CLIP.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + center_crop, + convert_to_rgb, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = ["CLIPImageProcessor"] + + +class CLIPImageProcessor(BaseImageProcessor): + r""" + Constructs a CLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. 
+        do_center_crop (`bool`, *optional*, defaults to `True`): +            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the +            `preprocess` method. +        crop_size (`Dict[str, int]`, *optional*, defaults to 224): +            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` +            method. +        do_rescale (`bool`, *optional*, defaults to `True`): +            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in +            the `preprocess` method. +        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): +            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` +            method. +        do_normalize (`bool`, *optional*, defaults to `True`): +            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. +        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): +            Mean to use if normalizing the image. This is a float or list of floats the length of the number of +            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. +        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): +            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the +            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. +        do_convert_rgb (`bool`, *optional*, defaults to `True`): +            Whether to convert the image to RGB. +    """ + +    model_input_names = ["pixel_values"] + +    def __init__( +        self, +        do_resize: bool = True, +        size: Dict[str, int] = None, +        resample: PILImageResampling = PILImageResampling.BICUBIC, +        do_center_crop: bool = True, +        crop_size: Dict[str, int] = None, +        do_rescale: bool = True, +        rescale_factor: Union[int, float] = 1 / 255, +        do_normalize: bool = True, +        image_mean: Optional[Union[float, List[float]]] = None, +        image_std: Optional[Union[float, List[float]]] = None, +        do_convert_rgb: bool = True, +        **kwargs +    ) -> None: +        super().__init__(**kwargs) +        size = size if size is not None else {"shortest_edge": 224} +        size = get_size_dict(size, default_to_square=False) +        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} +        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + +        self.do_resize = do_resize +        self.size = size +        self.resample = resample +        self.do_center_crop = do_center_crop +        self.crop_size = crop_size +        self.do_rescale = do_rescale +        self.rescale_factor = rescale_factor +        self.do_normalize = do_normalize +        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073] +        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711] +        self.do_convert_rgb = do_convert_rgb + +    def resize( +        self, +        image: np.ndarray, +        size: Dict[str, int], +        resample: PILImageResampling = PILImageResampling.BICUBIC, +        data_format: Optional[Union[str, ChannelDimension]] = None, +        **kwargs +    ) -> np.ndarray: +        """ +        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge +        resized to keep the input aspect ratio. + +        Args: +            image (`np.ndarray`): +                Image to resize. +            size (`Dict[str, int]`): +                Size of the output image. +            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): +                Resampling filter to use when resizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. 
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/modeling.py new file mode 100644 index 000000000..9598b93ff --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/modeling.py @@ -0,0 +1,1705 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
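For readers skimming the diff, the `preprocess` method above chains convert-to-RGB → resize (shortest edge) → center crop → rescale → normalize → channel reordering. The snippet below is a minimal NumPy-only sketch of those defaults; it does not use the classes from this patch, and its shortest-edge rounding is illustrative rather than the exact behavior of `get_resize_output_image_size`.

```python
import numpy as np

# CLIP-style defaults taken from the __init__ above
IMAGE_MEAN = np.array([0.48145466, 0.4578275, 0.40821073])
IMAGE_STD = np.array([0.26862954, 0.26130258, 0.27577711])

def shortest_edge_output_size(height, width, shortest_edge=224):
    # scale so the shorter side becomes `shortest_edge`; rounding here is illustrative
    scale = shortest_edge / min(height, width)
    return int(round(height * scale)), int(round(width * scale))

print(shortest_edge_output_size(480, 640))  # (224, 299)

# pretend resize + center crop already produced a 224x224 uint8 image
image = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)
pixels = image.astype(np.float32) * (1 / 255)        # do_rescale with rescale_factor=1/255
pixels = (pixels - IMAGE_MEAN) / IMAGE_STD           # do_normalize with the CLIP mean/std
pixels = pixels.transpose(2, 0, 1)                   # HWC -> CHW (ChannelDimension.FIRST)
print(pixels.shape)                                  # (3, 224, 224)
```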
+ +from dataclasses import dataclass +from functools import partial +from typing import Any, List, Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F +from paddle import nn + +from ...utils.converter import StateDictNameMapping +from ...utils.initializer import normal_, ones_, zeros_ +from ..model_outputs import BaseModelOutputWithPooling, ModelOutput +from ..model_utils import PretrainedModel +from .configuration import CLIPConfig, CLIPTextConfig, CLIPVisionConfig + +CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [ + # vit model + "openai/clip-vit-base-patch32", # ViT-B/32 + "openai/clip-vit-base-patch16", # ViT-B/16 + "openai/clip-vit-large-patch14", # ViT-L/14 + "laion/CLIP-ViT-H-14-laion2B-s32B-b79K", + "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", + # resnet model + "openai/clip-rn50", # RN50 + "openai/clip-rn101", # RN101 + "openai/clip-rn50x4", # RN50x4 +] + +__all__ = [ + "ModifiedResNet", + "CLIPVisionTransformer", + "CLIPTextTransformer", + "CLIPTextModel", + "CLIPVisionModel", + "CLIPPretrainedModel", + "CLIPModel", + "CLIPTextModelWithProjection", + "CLIPVisionModelWithProjection", +] + + +def quick_gelu(x): + return x * F.sigmoid(1.702 * x) + + +F.quick_gelu = quick_gelu + +NEG_INF = -1e4 # float("-inf") -1e4 -1e9 + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html + + +def contrastive_loss(logits: paddle.Tensor) -> paddle.Tensor: + return F.cross_entropy(logits, paddle.arange(len(logits))) + + +def clip_loss(similarity: paddle.Tensor) -> paddle.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class CLIPVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CLIPTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. 
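The `contrastive_loss`/`clip_loss` helpers above implement CLIP's symmetric cross-entropy: for a square image-text similarity matrix, the i-th caption is the positive for the i-th image, so the label vector is simply `arange(batch_size)` in both directions. A toy check with made-up similarity values:

```python
import paddle
import paddle.nn.functional as F

# made-up similarity matrix for 3 aligned image-text pairs (rows: texts, cols: images)
logits_per_text = paddle.to_tensor(
    [[5.0, 1.0, 0.0],
     [0.5, 4.0, 1.0],
     [0.0, 1.5, 6.0]]
)
labels = paddle.arange(3)  # the i-th text is matched with the i-th image

caption_loss = F.cross_entropy(logits_per_text, labels)
image_loss = F.cross_entropy(logits_per_text.t(), labels)
print(((caption_loss + image_loss) / 2.0).item())  # same value clip_loss() returns for this matrix
```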
+ + Args: + text_embeds (`paddle.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[paddle.Tensor] = None + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CLIPOutput(ModelOutput): + """ + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`paddle.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`paddle.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`]. + image_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLIPTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLIPVisionModel`]. + """ + + loss: Optional[paddle.Tensor] = None + logits_per_image: paddle.Tensor = None + logits_per_text: paddle.Tensor = None + text_embeds: paddle.Tensor = None + image_embeds: paddle.Tensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class ModifiedResNet(nn.Layer): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2D(3, width // 2, kernel_size=3, stride=2, padding=1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(width // 2) + self.conv2 = nn.Conv2D(width // 2, width // 2, kernel_size=3, padding=1, bias_attr=False) + self.bn2 = nn.BatchNorm2D(width // 2) + self.conv3 = nn.Conv2D(width // 2, width, kernel_size=3, padding=1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(width) + self.avgpool = nn.AvgPool2D(2) + self.relu = nn.ReLU() + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + def stem(x): + for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +def multi_head_attention_forward( + x: paddle.Tensor, + num_heads: int, + q_proj: nn.Linear, + k_proj: nn.Linear, + v_proj: nn.Linear, + c_proj: nn.Linear, + attn_mask: Optional[paddle.Tensor] = None, +): + max_len, batch_size, emb_dim = x.shape + head_dim = emb_dim // num_heads + scaling = float(head_dim) ** -0.5 + q = q_proj(x) # L, N, E + k = k_proj(x) # L, N, E + v = v_proj(x) # L, N, E + + v = v.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2)) + k = k.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2)) + q = q.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2)) + + q = q * scaling + qk = paddle.matmul(q, k, transpose_y=True) + if attn_mask is not None: + if attn_mask.ndim == 2: + attn_mask.unsqueeze_(0) + assert attn_mask.shape[0] == 1 and attn_mask.shape[1] == max_len and attn_mask.shape[2] == max_len + qk += attn_mask + + qk = F.softmax(qk, axis=-1) + atten = paddle.bmm(qk, v) + atten = atten.transpose((1, 0, 2)) + atten = atten.reshape((max_len, batch_size, emb_dim)) + atten = c_proj(atten) + return atten + + +class Identity(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, x): + return x + + +class Bottleneck(nn.Layer): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2D(inplanes, planes, 1, bias_attr=False) + self.bn1 = nn.BatchNorm2D(planes) + + self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = nn.BatchNorm2D(planes) + + self.avgpool = nn.AvgPool2D(stride) if stride > 1 else Identity() + + self.conv3 = nn.Conv2D(planes, planes * self.expansion, 1, bias_attr=False) + self.bn3 = nn.BatchNorm2D(planes * self.expansion) + + self.relu = nn.ReLU() + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + self.downsample = nn.Sequential( + ("-1", nn.AvgPool2D(stride)), + ("0", nn.Conv2D(inplanes, planes * self.expansion, 1, stride=1, bias_attr=False)), + ("1", nn.BatchNorm2D(planes * self.expansion)), + ) + + def forward(self, x): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Layer): + def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): + super().__init__() + + self.positional_embedding = nn.Embedding(spacial_dim**2 + 1, embed_dim) + + self.q_proj = nn.Linear(embed_dim, embed_dim, bias_attr=True) + self.k_proj = nn.Linear(embed_dim, embed_dim, bias_attr=True) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias_attr=True) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim, bias_attr=True) + self.num_heads = num_heads + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + + def forward(self, x): + + x = x.reshape((x.shape[0], x.shape[1], x.shape[2] * x.shape[3])).transpose((2, 0, 1)) # NCHW -> (HW)NC + x = paddle.concat([x.mean(axis=0, keepdim=True), x], axis=0) + x = x + paddle.unsqueeze(self.positional_embedding.weight, 1) + out = multi_head_attention_forward(x, self.num_heads, self.q_proj, self.k_proj, self.v_proj, self.c_proj) + + return out[0] + + +class CLIPPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained CLIP models. It provides CLIP related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
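`AttentionPool2d` above replaces the usual global average pool: the NCHW feature map is flattened into a (HW, N, C) sequence, a mean token is prepended, and `multi_head_attention_forward` attends over the sequence; `out[0]`, the attended mean token, becomes the pooled embedding. A shape-only sketch of that bookkeeping (values are random, no attention weights involved):

```python
import paddle

B, C, H, W = 2, 64, 7, 7
feat = paddle.randn([B, C, H, W])                      # stand-in for the ResNet trunk output

x = feat.reshape((B, C, H * W)).transpose((2, 0, 1))   # NCHW -> (HW, N, C)
x = paddle.concat([x.mean(axis=0, keepdim=True), x], axis=0)  # prepend the mean token
print(x.shape)  # [50, 2, 64]: HW + 1 tokens, batch, channels

# after multi_head_attention_forward, AttentionPool2d returns out[0],
# i.e. the attended "mean" token, one vector per image
```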
+ """ + + config_class = CLIPConfig + base_model_prefix = "clip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + @classmethod + def _get_name_mappings(cls, config: CLIPConfig) -> List[StateDictNameMapping]: + mappings: List[StateDictNameMapping] = [] + + model_type = config.get("model_type", "clip") + + num_layer_key = "num_hidden_layers" + num_text_layer = 0 + num_vision_layer = 0 + + if model_type in ["clip", "clip_text_model"]: + text_config = config.get("text_config") + if text_config: + num_text_layer = text_config.get(num_layer_key, 0) + else: + num_text_layer = config.get(num_layer_key, 0) + + if model_type in ["clip", "clip_vision_model"]: + vision_config = config.get("vision_config") + if vision_config: + num_vision_layer = vision_config.get(num_layer_key, 0) + else: + num_vision_layer = config.get(num_layer_key, 0) + + has_text_layer = num_text_layer > 0 + has_text_projection_layer = has_text_layer and ( + "CLIPModel" in (config.architectures or []) + or "CLIPTextModelWithProjection" in (config.architectures or []) + or cls.__name__ in ["CLIPModel", "CLIPTextModelWithProjection"] + ) + + has_vision_layer = num_vision_layer > 0 + has_vision_projection_layer = has_vision_layer and ( + "CLIPModel" in (config.architectures or []) + or "CLIPVisionModelWithProjection" in (config.architectures or []) + or cls.__name__ in ["CLIPModel", "CLIPVisionModelWithProjection"] + ) + + if model_type == "clip": + hard_mappings = [["logit_scale", "logit_scale"]] + else: + hard_mappings = [] + + # text model + if has_text_layer: + text_model_layer_mappings = [ + ["text_model.embeddings.token_embedding.weight", "text_model.token_embedding.weight"], + ["text_model.embeddings.position_embedding.weight", "text_model.positional_embedding.weight"], + ["text_model.final_layer_norm.weight", "text_model.ln_final.weight"], + ["text_model.final_layer_norm.bias", "text_model.ln_final.bias"], + ] + + if has_text_projection_layer: + text_model_layer_mappings.extend([["text_projection.weight", "text_projection", "transpose"]]) + + hard_mappings.extend(text_model_layer_mappings) + + for layer_index in range(num_text_layer): + text_model_layer_mappings = [ + # qkv out + [ + f"text_model.encoder.layers.{layer_index}.self_attn.q_proj.weight", + f"text_model.transformer.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.q_proj.bias", + f"text_model.transformer.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.k_proj.weight", + f"text_model.transformer.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.k_proj.bias", + f"text_model.transformer.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.v_proj.weight", + f"text_model.transformer.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.v_proj.bias", + f"text_model.transformer.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.out_proj.weight", + f"text_model.transformer.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"text_model.encoder.layers.{layer_index}.self_attn.out_proj.bias", + f"text_model.transformer.layers.{layer_index}.self_attn.out_proj.bias", + ], + # fc1 + [ + 
f"text_model.encoder.layers.{layer_index}.mlp.fc1.weight", + f"text_model.transformer.layers.{layer_index}.linear1.weight", + "transpose", + ], + [ + f"text_model.encoder.layers.{layer_index}.mlp.fc1.bias", + f"text_model.transformer.layers.{layer_index}.linear1.bias", + ], + [ + f"text_model.encoder.layers.{layer_index}.layer_norm1.weight", + f"text_model.transformer.layers.{layer_index}.norm1.weight", + ], + [ + f"text_model.encoder.layers.{layer_index}.layer_norm1.bias", + f"text_model.transformer.layers.{layer_index}.norm1.bias", + ], + # fc2 + [ + f"text_model.encoder.layers.{layer_index}.mlp.fc2.weight", + f"text_model.transformer.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"text_model.encoder.layers.{layer_index}.mlp.fc2.bias", + f"text_model.transformer.layers.{layer_index}.linear2.bias", + ], + [ + f"text_model.encoder.layers.{layer_index}.layer_norm2.weight", + f"text_model.transformer.layers.{layer_index}.norm2.weight", + ], + [ + f"text_model.encoder.layers.{layer_index}.layer_norm2.bias", + f"text_model.transformer.layers.{layer_index}.norm2.bias", + ], + ] + hard_mappings.extend(text_model_layer_mappings) + + # vision model + if has_vision_layer: + vision_model_layer_mappings = [ + ["vision_model.embeddings.class_embedding", "vision_model.class_embedding"], + ["vision_model.embeddings.patch_embedding.weight", "vision_model.conv1.weight"], + ["vision_model.embeddings.position_embedding.weight", "vision_model.positional_embedding.weight"], + ["vision_model.pre_layrnorm.weight", "vision_model.ln_pre.weight"], + ["vision_model.pre_layrnorm.bias", "vision_model.ln_pre.bias"], + ["vision_model.post_layernorm.weight", "vision_model.ln_post.weight"], + ["vision_model.post_layernorm.bias", "vision_model.ln_post.bias"], + ] + + if has_vision_projection_layer: + vision_model_layer_mappings.extend([["visual_projection.weight", "vision_projection", "transpose"]]) + + hard_mappings.extend(vision_model_layer_mappings) + for layer_index in range(num_vision_layer): + vision_model_layer_mappings = [ + # qkv out + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.q_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.q_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.k_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.k_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.v_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.v_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.out_proj.weight", + f"vision_model.transformer.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.self_attn.out_proj.bias", + f"vision_model.transformer.layers.{layer_index}.self_attn.out_proj.bias", + ], + # fc1 + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc1.weight", + f"vision_model.transformer.layers.{layer_index}.linear1.weight", + "transpose", + ], + [ + 
f"vision_model.encoder.layers.{layer_index}.mlp.fc1.bias", + f"vision_model.transformer.layers.{layer_index}.linear1.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm1.weight", + f"vision_model.transformer.layers.{layer_index}.norm1.weight", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm1.bias", + f"vision_model.transformer.layers.{layer_index}.norm1.bias", + ], + # fc2 + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc2.weight", + f"vision_model.transformer.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"vision_model.encoder.layers.{layer_index}.mlp.fc2.bias", + f"vision_model.transformer.layers.{layer_index}.linear2.bias", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm2.weight", + f"vision_model.transformer.layers.{layer_index}.norm2.weight", + ], + [ + f"vision_model.encoder.layers.{layer_index}.layer_norm2.bias", + f"vision_model.transformer.layers.{layer_index}.norm2.bias", + ], + ] + hard_mappings.extend(vision_model_layer_mappings) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(hard_mappings)] + return mappings + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, nn.TransformerEncoder): + module.enable_recompute = value + + def gradient_checkpointing_enable(self): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". 
+ """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def _init_weights(self, layer): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(layer, CLIPVisionTransformer): + vision_embed_dim = layer.config.hidden_size + vision_layers = layer.config.num_hidden_layers + initializer_range = layer.config.initializer_range + + # vision embedding + normal_(layer.class_embedding, std=vision_embed_dim**-0.5 * factor) + normal_(layer.conv1.weight, std=initializer_range * factor) + normal_(layer.positional_embedding.weight, std=initializer_range * factor) + + # init CLIPAttention + CLIPMLP + for sub_layer in layer.sublayers(): + if isinstance(sub_layer, nn.TransformerEncoderLayer): + # self_attn + in_proj_std = (sub_layer.self_attn.embed_dim**-0.5) * ((2 * vision_layers) ** -0.5) * factor + out_proj_std = (sub_layer.self_attn.embed_dim**-0.5) * factor + normal_(sub_layer.self_attn.q_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.k_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.v_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.out_proj.weight, std=out_proj_std) + # ffn + in_proj_std = (sub_layer._config["d_model"] ** -0.5) * ((2 * vision_layers) ** -0.5) * factor + fc_std = (2 * sub_layer._config["d_model"]) ** -0.5 * factor + normal_(sub_layer.linear1.weight, std=fc_std) + normal_(sub_layer.linear2.weight, std=in_proj_std) + + elif isinstance(layer, CLIPTextTransformer): + text_layers = layer.config.num_hidden_layers + initializer_range = layer.config.initializer_range + + # text embedding + normal_(layer.token_embedding.weight, std=factor * 0.02) + normal_(layer.positional_embedding.weight, std=factor * 0.02) + + # init CLIPAttention + CLIPMLP + for sub_layer in layer.sublayers(): + if isinstance(sub_layer, nn.TransformerEncoderLayer): + # self_attn + in_proj_std = (sub_layer.self_attn.embed_dim**-0.5) * ((2 * text_layers) ** -0.5) * factor + out_proj_std = (sub_layer.self_attn.embed_dim**-0.5) * factor + normal_(sub_layer.self_attn.q_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.k_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.v_proj.weight, std=in_proj_std) + normal_(sub_layer.self_attn.out_proj.weight, std=out_proj_std) + # ffn + in_proj_std = (sub_layer._config["d_model"] ** -0.5) * ((2 * text_layers) ** -0.5) * factor + fc_std = (2 * sub_layer._config["d_model"]) ** -0.5 * factor + normal_(sub_layer.linear1.weight, std=fc_std) + normal_(sub_layer.linear2.weight, std=in_proj_std) + + elif isinstance(layer, ModifiedResNet): + if layer.attnpool is not None: + std = layer.output_dim**-0.5 + normal_(layer.attnpool.q_proj.weight, std=std) + normal_(layer.attnpool.k_proj.weight, std=std) + normal_(layer.attnpool.v_proj.weight, std=std) + normal_(layer.attnpool.c_proj.weight, std=std) + + for resnet_block in [layer.layer1, layer.layer2, layer.layer3, layer.layer4]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + zeros_(param) + + elif isinstance(layer, CLIPModel): + normal_(layer.text_projection, std=layer.text_embed_dim**-0.5 * self.config.initializer_factor) + if hasattr(layer, "vision_projection"): + normal_(layer.vision_projection, std=layer.vision_embed_dim**-0.5 * self.config.initializer_factor) + elif isinstance(layer, CLIPVisionModelWithProjection): + if hasattr(layer, "vision_projection"): + normal_(layer.vision_projection, std=self.config.hidden_size**-0.5 * 
self.config.initializer_factor) + elif isinstance(layer, CLIPTextModelWithProjection): + normal_(layer.text_projection, std=self.config.hidden_size**-0.5 * self.config.initializer_factor) + + if isinstance(layer, nn.LayerNorm): + zeros_(layer.bias) + ones_(layer.weight) + + if isinstance(layer, nn.Linear) and layer.bias is not None: + zeros_(layer.bias) + + +class CLIPTextTransformer(nn.Layer): + def __init__(self, config: CLIPTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.positional_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.hidden_size, + nhead=config.num_attention_heads, + dim_feedforward=config.intermediate_size, + normalize_before=True, + dropout=0.0, + activation=config.hidden_act, + attn_dropout=config.attention_dropout, + act_dropout=0.0, + ) + self.transformer = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.ln_final = nn.LayerNorm(embed_dim) + + # For `pooled_output` computation + self.eos_token_id = config.eos_token_id + + self.register_buffer( + "causal_mask", + paddle.triu( + paddle.ones((1, 1, config.max_position_embeddings, config.max_position_embeddings)) * NEG_INF, + diagonal=1, + ), + persistable=False, + ) + self.register_buffer( + "position_ids", + paddle.arange(config.max_position_embeddings, dtype="int64").reshape((1, -1)), + persistable=False, + ) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`CLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + bs, seqlen = input_ids.shape + if position_ids is None: + position_ids = self.position_ids[:, :seqlen].cast("int64") + + embedding_output = self.token_embedding(input_ids) + self.positional_embedding( + position_ids + ) # [batch_size, n_ctx, d_model] + + causal_mask = self.causal_mask[:, :, :seqlen, :seqlen] + if attention_mask is not None: + assert attention_mask.ndim == 2 + expanded_mask = attention_mask[:, None, None, :].expand([bs, 1, seqlen, -1]).cast(causal_mask.dtype) + inverted_mask = (1.0 - expanded_mask) * NEG_INF + attention_mask = inverted_mask + causal_mask + else: + attention_mask = causal_mask + attention_mask.stop_gradient = True + + encoder_outputs = self.transformer( + embedding_output, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(encoder_outputs, type(embedding_output)): + last_hidden_state = encoder_outputs + else: + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.ln_final(last_hidden_state) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. + # A CLIP model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to paddle.int32 for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state.gather_nd( + paddle.stack( + [paddle.arange(last_hidden_state.shape[0], dtype="int32"), input_ids.argmax(-1, dtype="int32")], + axis=-1, + ) + ) + else: + # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible) + # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) + pooled_output = last_hidden_state.gather_nd( + paddle.stack( + [ + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + # make sure we have 1D tensor, not 0D tensor + (input_ids == paddle.to_tensor([self.eos_token_id], dtype=input_ids.dtype)) + .cast("int32") + .argmax(axis=-1, dtype="int32"), + ], + axis=-1, + ) + ) + + if isinstance(encoder_outputs, type(embedding_output)): + return (last_hidden_state, pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLIPTextModel(CLIPPretrainedModel): + r""" + The text model from CLIP without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + Args: + config (:class:`CLIPTextConfig`): + An instance of CLIPTextConfig used to construct CLIPTextModel. + """ + + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + self.text_model = CLIPTextTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.token_embedding + + def set_input_embeddings(self, value): + self.text_model.token_embedding = value + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`CLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPooling` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPooling`. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import CLIPTokenizer, CLIPTextModel + + >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPVisionTransformer(nn.Layer): + def __init__(self, config: CLIPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.input_resolution = config.image_size + self.class_embedding = self.create_parameter( + (embed_dim,), + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Assign(paddle.randn((embed_dim,))), + ) + self.conv1 = nn.Conv2D( + in_channels=config.num_channels, + out_channels=embed_dim, + kernel_size=config.patch_size, + stride=config.patch_size, + bias_attr=False, + ) + self.num_patches = (config.image_size // config.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.positional_embedding = nn.Embedding(self.num_positions, embed_dim) + + self.ln_pre = nn.LayerNorm(embed_dim) + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.hidden_size, + nhead=config.num_attention_heads, + dim_feedforward=config.intermediate_size, + normalize_before=True, + dropout=0.0, + activation=config.hidden_act, + attn_dropout=config.attention_dropout, + act_dropout=0.0, + ) + self.transformer = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.ln_post = nn.LayerNorm(embed_dim) + self.register_buffer( + "position_ids", + paddle.arange(self.num_positions).reshape((1, -1)), + persistable=False, + ) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPooling` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPooling`. 
+ + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + target_dtype = self.conv1.weight.dtype + pixel_values = self.conv1(pixel_values.cast(target_dtype)) + + pixel_values = pixel_values.reshape((pixel_values.shape[0], pixel_values.shape[1], -1)) + pixel_values = pixel_values.transpose((0, 2, 1)) + embedding_output = paddle.concat( + [self.class_embedding.unsqueeze([0, 1]).expand([pixel_values.shape[0], -1, -1]), pixel_values], axis=1 + ) + hidden_states = embedding_output + self.positional_embedding.weight + hidden_states = self.ln_pre(hidden_states) + + encoder_outputs = self.transformer( + hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(encoder_outputs, type(embedding_output)): + last_hidden_state = encoder_outputs + else: + last_hidden_state = encoder_outputs[0] + + pooled_output = self.ln_post(last_hidden_state[:, 0]) + + if isinstance(encoder_outputs, type(embedding_output)): + return (last_hidden_state, pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def forward_pre(self, x): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape([x.shape[0], x.shape[1], -1]) # shape = [*, width, grid ** 2] + x = x.transpose((0, 2, 1)) # shape = [*, grid ** 2, width] + # t = self.class_embedding.weight + paddle.zeros([x.shape[0], 1, x.shape[-1]], dtype=x.dtype) + t = self.class_embedding.unsqueeze([0, 1]).expand([x.shape[0], -1, -1]) + paddle.zeros( + [x.shape[0], 1, x.shape[-1]], dtype=x.dtype + ) + x = paddle.concat([t, x], axis=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.weight + x = self.ln_pre(x) + return x + + def forward_post(self, x): + x = self.ln_post(x) + return x + + +class CLIPVisionModel(CLIPPretrainedModel): + r""" + The vision model from CLIP without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`CLIPVisionConfig`): + An instance of CLIPVisionConfig used to construct CLIPVisionModel. 
+ """ + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + if isinstance(config.num_hidden_layers, (tuple, list)): + raise NotImplementedError("We only support VIT CLIP Vision Transformer!") + + self.vision_model = CLIPVisionTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.conv1 + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPooling` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPooling`. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import CLIPProcessor, CLIPVisionModel + + >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPModel(CLIPPretrainedModel): + r""" + The bare CLIP Model outputting logits_per_image and logits_per_text. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`CLIPConfig`): + An instance of CLIPConfig used to construct CLIPModel. + """ + config_class = CLIPConfig + + def __init__(self, config: CLIPConfig): + super().__init__(config) + + if not isinstance(config.text_config, CLIPTextConfig): + raise ValueError( + "config.text_config is expected to be of type CLIPTextConfig but is of type" + f" {type(config.text_config)}." 
+ ) + + if not isinstance(config.vision_config, CLIPVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type CLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = CLIPTextTransformer(text_config) + + if isinstance(vision_config.num_hidden_layers, (tuple, list)): + if vision_config.num_attention_heads is None: + vision_heads = vision_config.hidden_size * 32 // 64 + else: + vision_heads = vision_config.num_attention_heads + self.vision_model = ModifiedResNet( + layers=vision_config.num_hidden_layers, + output_dim=self.projection_dim, + heads=vision_heads, + input_resolution=vision_config.image_size, + width=vision_config.hidden_size, + ) + else: + self.vision_model = CLIPVisionTransformer(vision_config) + self.vision_projection = paddle.create_parameter( + (self.vision_embed_dim, self.projection_dim), paddle.get_default_dtype() + ) + self.text_projection = paddle.create_parameter( + (self.text_embed_dim, self.projection_dim), paddle.get_default_dtype() + ) + + self.logit_scale = paddle.create_parameter( + (1,), + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(config.logit_scale_init_value), + ) + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`CLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`CLIPTextModel`]. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import CLIPTokenizer, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") + >>> text_features = model.get_text_features(**inputs) + ``` + """ + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = paddle.matmul(pooled_output, self.text_projection) + + return text_features + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> image_features = model.get_image_features(**inputs) + ``` + """ + if isinstance(self.vision_model, ModifiedResNet): + return self.vision_model(pixel_values) + else: + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = paddle.matmul(pooled_output, self.vision_projection) + + return image_features + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPOutput]: + r""" + The CLIPModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings (CLIPTextTransformer). Selected in + the range ``[0, max_text_length - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention (CLIPTextTransformer) to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape `[batch_size, sequence_length`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`CLIPOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `True`. + + Returns: + An instance of :class:`CLIPOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`CLIPOutput`. 
+ + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle.nn.functional as F + >>> from paddlenlp.transformers import CLIPProcessor, CLIPModel + + >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") + >>> model.eval() + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pd", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = F.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ``` + """ + # Use CLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if isinstance(self.vision_model, ModifiedResNet): + vision_outputs = None + image_embeds = self.vision_model(pixel_values) + else: + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[1] + image_embeds = paddle.matmul(image_embeds, self.vision_projection) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_embeds = text_outputs[1] + text_embeds = paddle.matmul(text_embeds, self.text_projection) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(axis=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(axis=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = paddle.matmul(text_embeds, image_embeds, transpose_y=True) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class CLIPTextModelWithProjection(CLIPPretrainedModel): + r""" + CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
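The forward pass above L2-normalizes both embedding sets and scales their dot product by `logit_scale`; the same similarity can be reproduced manually from the two feature helpers. A rough, illustrative sketch (the image path `cat.jpg` is a placeholder):

```python
import paddle
import paddle.nn.functional as F
from PIL import Image
from paddlenlp.transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("cat.jpg")  # any local RGB image (placeholder path)
texts = ["a photo of a cat", "a photo of a dog"]

image_inputs = processor(images=image, return_tensors="pd")
text_inputs = processor(text=texts, padding=True, return_tensors="pd")

image_embeds = model.get_image_features(**image_inputs)
text_embeds = model.get_text_features(**text_inputs)

# Reproduce the normalization and scaling done in `forward` above.
image_embeds = image_embeds / image_embeds.norm(axis=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(axis=-1, keepdim=True)
logits = paddle.matmul(image_embeds, text_embeds, transpose_y=True) * model.logit_scale.exp()
probs = F.softmax(logits, axis=-1)
```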
+ + Args: + config (:class:`CLIPTextConfig`): + An instance of CLIPTextConfig used to construct CLIPTextModelWithProjection. + """ + config_class = CLIPTextConfig + + def __init__(self, config: CLIPTextConfig): + super().__init__(config) + + self.text_model = CLIPTextTransformer(config) + + self.text_projection = paddle.create_parameter( + (config.hidden_size, config.projection_dim), paddle.get_default_dtype() + ) + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.token_embedding + + def set_input_embeddings(self, value): + self.text_model.token_embedding = value + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPTextModelOutput]: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`CLIPTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`CLIPTextModelOutput`] instead of a plain tuple. + If `False`, the output will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`CLIPTextModelOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`CLIPTextModelOutput`. 
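For reference, when `return_dict=False` the implementation further down returns a plain tuple whose first two entries are the projected embeddings and the last hidden state. A minimal sketch, reusing `model` and `inputs` from the example that follows:

```python
outputs = model(**inputs, return_dict=False)
text_embeds, last_hidden_state = outputs[0], outputs[1]
```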
+ + Examples: + + ```python + >>> from paddlenlp.transformers import CLIPTokenizer, CLIPTextModelWithProjection + + >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + + text_embeds = paddle.matmul(pooled_output, self.text_projection) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return CLIPTextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +class CLIPVisionModelWithProjection(CLIPPretrainedModel): + r""" + CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`CLIPVisionConfig`): + An instance of CLIPVisionConfig used to construct CLIPVisionModelWithProjection. + """ + config_class = CLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPVisionConfig): + super().__init__(config) + + # support resnet vision model + if isinstance(config.num_hidden_layers, (tuple, list)): + if config.num_attention_heads is None: + vision_heads = config.hidden_size * 32 // 64 + else: + vision_heads = config.num_attention_heads + self.vision_model = ModifiedResNet( + layers=config.num_hidden_layers, + output_dim=config.projection_dim, + heads=vision_heads, + input_resolution=config.image_size, + width=config.hidden_size, + ) + else: + self.vision_model = CLIPVisionTransformer(config) + self.vision_projection = paddle.create_parameter( + (config.hidden_size, config.projection_dim), paddle.get_default_dtype() + ) + + def get_input_embeddings(self) -> nn.Layer: + if isinstance(self.vision_model, CLIPVisionTransformer): + return self.vision_model.conv1 + else: + return None + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPVisionModelOutput]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`CLIPFeatureExtractor`]. See [`CLIPFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`CLIPVisionModelOutput`] instead of a plain tuple. + + Returns: + An instance of :class:`CLIPVisionModelOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`CLIPVisionModelOutput`. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import CLIPProcessor, CLIPVisionModelWithProjection + + >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if isinstance(self.vision_model, ModifiedResNet): + image_embeds = self.vision_model(pixel_values) + if not return_dict: + return (image_embeds,) + else: + return CLIPVisionModelOutput(image_embeds=image_embeds) + else: + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = vision_outputs[1] # pooled_output + + image_embeds = paddle.matmul(pooled_output, self.vision_projection) + + if not return_dict: + outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return CLIPVisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/processing.py new file mode 100644 index 000000000..3424f643e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/processing.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for CLIP +""" +import warnings + +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding + +__all__ = ["CLIPProcessor"] + + +class CLIPProcessor(ProcessorMixin): + r""" + Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor. 
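As an illustrative sketch of the three ways `__call__` (defined below) can be invoked; the image path is a placeholder:

```python
from PIL import Image
from paddlenlp.transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
image = Image.open("cat.jpg")  # placeholder path

# text + image: token ids plus pixel values in one BatchEncoding
both = processor(text=["a photo of a cat"], images=image, padding=True, return_tensors="pd")

# text only: just the tokenizer output
text_only = processor(text="a photo of a cat", return_tensors="pd")

# image only: a BatchEncoding holding "pixel_values"
image_only = processor(images=image, return_tensors="pd")
```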
+
+    [`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizer`]. See the
+    [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`CLIPImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`CLIPTokenizer`]):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "CLIPImageProcessor"
+    tokenizer_class = "CLIPTokenizer"
+
+    pretrained_init_configuration = {
+        "openai/clip-vit-base-patch32": {"do_lower_case": True},
+        "openai/clip-vit-base-patch16": {"do_lower_case": True},
+        "openai/clip-vit-large-patch14": {"do_lower_case": True},
+        "laion/CLIP-ViT-H-14-laion2B-s32B-b79K": {"do_lower_case": True},
+        "laion/CLIP-ViT-B-32-laion2B-s34B-b79K": {"do_lower_case": True},
+        "openai/clip-rn50": {"do_lower_case": True},
+        "openai/clip-rn101": {"do_lower_case": True},
+        "openai/clip-rn50x4": {"do_lower_case": True},
+    }
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        """
+        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+        and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[paddle.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or Paddle
+                tensor. In case of a NumPy array/Paddle tensor, each image should be of shape (C, H, W), where C is
+                the number of channels, and H and W are the image height and width.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'pd'`: Return Paddle `paddle.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You have to specify either text or images. Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/tokenizer.py new file mode 100644 index 000000000..64bab28d4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clip/tokenizer.py @@ -0,0 +1,553 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +import unicodedata +from functools import lru_cache +from typing import List, Optional + +from paddle.utils import try_import + +from ...utils.log import logger +from .. 
import AddedToken, PretrainedTokenizer +from ..bert.tokenizer import _is_control, _is_punctuation, _is_whitespace + +__all__ = ["CLIPTokenizer"] + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def whitespace_clean(text, re): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. 
Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
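+        # For example, ord("中") == 0x4E2D falls inside the main CJK block checked below, so the
+        # character is padded with spaces, while ord("カ") == 0x30AB (Katakana) does not match any
+        # of these ranges and is left attached to its neighbors.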
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class CLIPTokenizer(PretrainedTokenizer): + r""" + Construct a CLIP tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.gpt.tokenizer.GPTTokenizer`. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocabulary file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `77`. + bos_token (str, optional): + The beginning of sequence token that was used during pretraining. Can be + used a sequence classifier token. + Defaults to `"<|startoftext|>"`. + eos_token (str, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `"<|endoftext|>"`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to `"<|endoftext|>"`. + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to `"<|endoftext|>"`. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-base-patch32') + print(tokenizer('He was a puppeteer')) + + ''' + {'input_ids': [49406, 797, 739, 320, 7116, 38820, 528, 49407]} + ''' + + """ + # merges and vocab same as GPT2 + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_resource_files_map = {"vocab_file": {}, "merges_file": {}} + pretrained_init_configuration = {} + model_input_names = [ + "input_ids", + ] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + max_len=77, + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + unk_token="<|endoftext|>", + pad_token="<|endoftext|>", + **kwargs + ): + + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + self._build_special_tokens_map_extended( + bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token + ) + + try: + import ftfy + + self.fix_text = ftfy.fix_text + except ImportError: + logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") + self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False, do_lower_case=True) + self.fix_text = None + self.re = try_import("regex") + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.max_len = max_len if max_len is not None else int(1e12) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} + + self.pat = self.re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + self.re.IGNORECASE, + ) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CLIP sequence has the following format: + + - single sequence: `<|startoftext|> X <|endoftext|>` + + Pairs of sequences are not the expected use case, but they will be handled without a separator. + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of input IDs with the appropriate special tokens. + """ + bos_token = [self.bos_token_id] + eos_token = [self.eos_token_id] + + if token_ids_1 is None: + return bos_token + token_ids_0 + eos_token + return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of + zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
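A small, illustrative example of how the special-token helpers defined in this class line up (assuming the `openai/clip-vit-base-patch32` vocabulary is available locally):

```python
from paddlenlp.transformers import CLIPTokenizer

tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("a photo of a cat"))

with_special = tokenizer.build_inputs_with_special_tokens(ids)        # [bos] + ids + [eos]
special_mask = tokenizer.get_special_tokens_mask(ids)                 # 1 only at the two special positions
token_types = tokenizer.create_token_type_ids_from_sequences(ids)     # all zeros for CLIP
assert len(with_special) == len(special_mask) == len(token_types)
```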
+        """
+        bos_token = [self.bos_token_id]
+        eos_token = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(bos_token + token_ids_0 + eos_token) * [0]
+        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        # "</w>" marks the end of a word so that BPE merges cannot cross word boundaries.
+        word = tuple(token[:-1]) + (token[-1] + "</w>",)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + "</w>"
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        if self.fix_text is None:
+            text = " ".join(self.nlp.tokenize(text))
+        else:
+            text = whitespace_clean(self.fix_text(text), self.re).lower()
+
+        for token in self.re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) to a single string."""
+        text = "".join(tokens)
+        byte_array = bytearray([self.byte_decoder[c] for c in text])
+        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
+        return text
+
+    def save_resources(self, save_directory):
+        """
+        Saves the vocabulary file (`vocab.json`) and the merges file (`merges.txt`)
+        under `save_directory`.
+
+        Args:
+            save_directory (str): Directory to save files into.
+        """
+        for name, file_name in self.resource_files_names.items():
+            source_path = getattr(self, "_%s" % name)
+
+            save_path = os.path.join(save_directory, file_name)
+            if os.path.abspath(source_path) != os.path.abspath(save_path):
+                shutil.copyfile(source_path, save_path)
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/__init__.py
new file mode 100644
index 000000000..595add0ae
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/configuration.py new file mode 100644 index 000000000..da0d6bf31 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/configuration.py @@ -0,0 +1,413 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" CLIPSeg model configuration""" + +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig + +CLIPSEG_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "CIDAS/clipseg-rd64": "https://huggingface.co/CIDAS/clipseg-rd64/resolve/main/config.json", +} + +__all__ = [ + "CLIPSegTextConfig", + "CLIPSegVisionConfig", + "CLIPSegConfig", +] + + +class CLIPSegTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an + CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CLIPSeg + [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 49408): + Vocabulary size of the CLIPSeg text model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`CLIPSegModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 77): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + Example: + ```python + >>> from paddlenlp.transformers import CLIPSegTextConfig, CLIPSegTextModel + >>> # Initializing a CLIPSegTextConfig with CIDAS/clipseg-rd64 style configuration + >>> configuration = CLIPSegTextConfig() + >>> # Initializing a CLIPSegTextModel (with random weights) from the CIDAS/clipseg-rd64 style configuration + >>> model = CLIPSegTextModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "clipseg_text_model" + + def __init__( + self, + vocab_size=49408, + hidden_size=512, + intermediate_size=2048, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=77, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + pad_token_id=1, + bos_token_id=49406, + eos_token_id=49407, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "clipseg": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPSegVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to instantiate an + CLIPSeg model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CLIPSeg + [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture. 
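The `from_pretrained` override defined above means the text sub-config can be pulled straight out of a full CLIPSeg checkpoint name. A hedged sketch, assuming the `CIDAS/clipseg-rd64` config is reachable:

```python
from paddlenlp.transformers import CLIPSegTextConfig

# Works because `model_type == "clipseg"` is detected and `config_dict["text_config"]` is used.
text_config = CLIPSegTextConfig.from_pretrained("CIDAS/clipseg-rd64")
print(text_config.hidden_size)  # 512 with the default CLIPSeg text settings
```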
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + Example: + ```python + >>> from paddlenlp.transformers import CLIPSegVisionConfig, CLIPSegVisionModel + >>> # Initializing a CLIPSegVisionConfig with CIDAS/clipseg-rd64 style configuration + >>> configuration = CLIPSegVisionConfig() + >>> # Initializing a CLIPSegVisionModel (with random weights) from the CIDAS/clipseg-rd64 style configuration + >>> model = CLIPSegVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "clipseg_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from CLIPSegConfig + if config_dict.get("model_type") == "clipseg": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( 
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class CLIPSegConfig(PretrainedConfig): + r""" + [`CLIPSegConfig`] is the configuration class to store the configuration of a [`CLIPSegModel`]. It is used to + instantiate a CLIPSeg model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the CLIPSeg + [CIDAS/clipseg-rd64](https://huggingface.co/CIDAS/clipseg-rd64) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPSegTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`CLIPSegVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original CLIPSeg implementation. + extract_layers (`List[int]`, *optional*, defaults to [3, 6, 9]): + Layers to extract when forwarding the query image through the frozen visual backbone of CLIP. + reduce_dim (`int`, *optional*, defaults to 64): + Dimensionality to reduce the CLIP vision embedding. + decoder_num_attention_heads (`int`, *optional*, defaults to 4): + Number of attention heads in the decoder of CLIPSeg. + decoder_attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layers in the Transformer decoder. + conditional_layer (`int`, *optional*, defaults to 0): + The layer to use of the Transformer encoder whose activations will be combined with the condition + embeddings using FiLM (Feature-wise Linear Modulation). If 0, the last layer is used. + use_complex_transposed_convolution (`bool`, *optional*, defaults to `False`): + Whether to use a more complex transposed convolution in the decoder, enabling more fine-grained + segmentation. + kwargs (*optional*): + Dictionary of keyword arguments. 
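The constructor below also accepts the legacy `text_config_dict` / `vision_config_dict` keyword arguments; when both forms are given, the legacy dict wins and a warning is logged for any conflicting key. An illustrative sketch:

```python
from paddlenlp.transformers import CLIPSegConfig

config = CLIPSegConfig(
    text_config={"hidden_size": 512},
    text_config_dict={"hidden_size": 768},  # legacy form; overrides the value above with a warning
)
print(config.text_config.hidden_size)  # 768
```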
+ Example: + ```python + >>> from paddlenlp.transformers import CLIPSegConfig, CLIPSegModel + >>> # Initializing a CLIPSegConfig with CIDAS/clipseg-rd64 style configuration + >>> configuration = CLIPSegConfig() + >>> # Initializing a CLIPSegModel (with random weights) from the CIDAS/clipseg-rd64 style configuration + >>> model = CLIPSegModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a CLIPSegConfig from a CLIPSegTextConfig and a CLIPSegVisionConfig + >>> # Initializing a CLIPSegText and CLIPSegVision configuration + >>> config_text = CLIPSegTextConfig() + >>> config_vision = CLIPSegVisionConfig() + >>> config = CLIPSegConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "clipseg" + is_composition = True + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + extract_layers=[3, 6, 9], + reduce_dim=64, + decoder_num_attention_heads=4, + decoder_attention_dropout=0.0, + decoder_hidden_act="quick_gelu", + decoder_intermediate_size=2048, + conditional_layer=0, + use_complex_transposed_convolution=False, + **kwargs, + ): + # If `_config_dict` exist, we use them for the backward compatibility. + # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot + # of confusion!). + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + + # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in + # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most + # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. + if text_config_dict is not None: + if text_config is None: + text_config = {} + + # This is the complete result when using `text_config_dict`. + _text_config_dict = CLIPSegTextConfig(**text_config_dict).to_dict() + + # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. + for key, value in _text_config_dict.items(): + if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + # If specified in `text_config_dict` + if key in text_config_dict: + message = ( + f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " + f'The value `text_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`text_config_dict` is provided which will be used to initialize `CLIPSegTextConfig`. The " + f'value `text_config["{key}"]` will be overriden.' + ) + logger.warning(message) + + # Update all values in `text_config` with the ones in `_text_config_dict`. + text_config.update(_text_config_dict) + + if vision_config_dict is not None: + if vision_config is None: + vision_config = {} + + # This is the complete result when using `vision_config_dict`. 
+ _vision_config_dict = CLIPSegVisionConfig(**vision_config_dict).to_dict() + # convert keys to string instead of integer + if "id2label" in _vision_config_dict: + _vision_config_dict["id2label"] = { + str(key): value for key, value in _vision_config_dict["id2label"].items() + } + + # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. + for key, value in _vision_config_dict.items(): + if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + # If specified in `vision_config_dict` + if key in vision_config_dict: + message = ( + f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " + f'values. The value `vision_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`vision_config_dict` is provided which will be used to initialize `CLIPSegVisionConfig`. " + f'The value `vision_config["{key}"]` will be overriden.' + ) + logger.warning(message) + + # Update all values in `vision_config` with the ones in `_vision_config_dict`. + vision_config.update(_vision_config_dict) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `CLIPSegTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `CLIPSegVisionConfig` with default values.") + + self.text_config = CLIPSegTextConfig(**text_config) + self.vision_config = CLIPSegVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.extract_layers = extract_layers + self.reduce_dim = reduce_dim + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_attention_dropout = decoder_attention_dropout + self.decoder_hidden_act = decoder_hidden_act + self.decoder_intermediate_size = decoder_intermediate_size + self.conditional_layer = conditional_layer + self.initializer_factor = 1.0 + self.use_complex_transposed_convolution = use_complex_transposed_convolution + + @classmethod + def from_text_vision_configs(cls, text_config: CLIPSegTextConfig, vision_config: CLIPSegVisionConfig, **kwargs): + r""" + Instantiate a [`CLIPSegConfig`] (or a derived class) from clipseg text model configuration and clipseg vision + model configuration. + Returns: + [`CLIPSegConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/image_processing.py new file mode 100644 index 000000000..ae410d7b8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/image_processing.py @@ -0,0 +1,263 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright 2022 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Image processor class for ViT.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import normalize, rescale, resize, to_channel_dimension_format +from ..image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + make_list_of_images, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = ["ViTImageProcessor"] + + +class ViTImageProcessor(BaseImageProcessor): + r""" + Constructs a ViT image processor. + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `(size["height"], + size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_normalize: + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size) + self.do_resize = do_resize + self.do_rescale = do_rescale + self.do_normalize = do_normalize + self.size = size + self.resample = resample + self.rescale_factor = rescale_factor + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample: + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + return resize( + image, size=(size["height"], size["width"]), resample=resample, data_format=data_format, **kwargs + ) + + def rescale( + self, image: np.ndarray, scale: float, data_format: Optional[Union[str, ChannelDimension]] = None, **kwargs + ) -> np.ndarray: + """ + Rescale an image by a scale factor. image = image * scale. + Args: + image (`np.ndarray`): + Image to rescale. + scale (`float`): + The scaling factor to rescale pixel values by. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + Returns: + `np.ndarray`: The rescaled image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean to use for normalization. 
+ std (`float` or `List[float]`): + Image standard deviation to use for normalization. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + Returns: + `np.ndarray`: The normalized image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + **kwargs, + ): + """ + Preprocess an image or batch of images. + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after + resizing. + resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`): + `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has + an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use if `do_normalize` is set to `True`. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pd'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + size = size if size is not None else self.size + size_dict = get_size_dict(size) + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "paddle.Tensor, tf.Tensor or jax.ndarray." + ) + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size_dict, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/modeling.py new file mode 100644 index 000000000..44f8d2848 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/modeling.py @@ -0,0 +1,1364 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" PaddlePaddle CLIPSeg model.""" + +import copy +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.utils import recompute + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput +from ..model_utils import PretrainedModel +from .configuration import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig + +_CHECKPOINT_FOR_DOC = "CIDAS/clipseg-rd64-refined" + +CLIPSEG_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "CIDAS/clipseg-rd64-refined", +] + +__all__ = [ + "CLIPSegPreTrainedModel", + "CLIPSegTextModel", + "CLIPSegVisionModel", + "CLIPSegModel", + "CLIPSegForImageSegmentation", +] + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: paddle.Tensor, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]) + + inverted_mask = 1.0 - expanded_mask + + def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + return masked_fill(inverted_mask, inverted_mask.cast("bool"), -1e4) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: paddle.Tensor) -> paddle.Tensor: + return F.cross_entropy(logits, paddle.arange(len(logits))) + + +# Copied from paddlenlp.transformers.clip.modeling.clip_loss with clip->clipseg +def clipseg_loss(similarity: paddle.Tensor) -> paddle.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +# Copied from paddlenlp.transformers.clip.modeling.CLIPOutput with CLIP->CLIPSeg +class CLIPSegOutput(ModelOutput): + """ + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`paddle.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`paddle.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`]. + image_embeds(`paddle.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of + [`CLIPSegVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLIPSegTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`CLIPSegVisionModel`]. 
+ """ + + loss: Optional[paddle.Tensor] = None + logits_per_image: paddle.Tensor = None + logits_per_text: paddle.Tensor = None + text_embeds: paddle.Tensor = None + image_embeds: paddle.Tensor = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@dataclass +class CLIPSegDecoderOutput(ModelOutput): + """ + Args: + logits (`paddle.Tensor` of shape `(batch_size, height, width)`): + Classification scores for each pixel. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CLIPSegImageSegmentationOutput(ModelOutput): + """ + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + ... + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`CLIPSegVisionModel`]. 
+ """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + conditional_embeddings: paddle.Tensor = None + pooled_output: paddle.Tensor = None + vision_model_output: BaseModelOutputWithPooling = None + decoder_output: CLIPSegDecoderOutput = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class CLIPSegVisionEmbeddings(nn.Layer): + # Copied from paddlenlp.transformers.clip.modeling.CLIPVisionEmbeddings.__init__ + def __init__(self, config: CLIPSegVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = paddle.create_parameter( + (self.embed_dim,), + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Assign(paddle.randn((self.embed_dim,))), + ) + + self.patch_embedding = nn.Conv2D( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias_attr=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", paddle.arange(self.num_positions).expand((1, -1)), persistable=False) + + def interpolate_position_embeddings(self, new_size): + if len(new_size) != 2: + raise ValueError("new_size should consist of 2 values") + + num_patches_one_direction = int(self.num_patches**0.5) + # we interpolate the position embeddings in 2D + a = self.position_embedding.weight[1:].T.reshape( + [1, self.config.hidden_size, num_patches_one_direction, num_patches_one_direction] + ) + b = ( + nn.functional.interpolate(a, new_size, mode="bicubic", align_corners=False) + .squeeze(0) + .reshape([self.config.hidden_size, new_size[0] * new_size[1]]) + .T + ) + result = paddle.concat([self.position_embedding.weight[:1], b]) + + return result + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + + if embeddings.shape[1] != self.num_positions: + new_shape = int(math.sqrt(embeddings.shape[1] - 1)) + embeddings = embeddings + self.interpolate_position_embeddings((new_shape, new_shape)) + embeddings = embeddings + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + + return embeddings + + +# Copied from paddlenlp.transformers.clip.modeling.CLIPTextEmbeddings with CLIP->CLIPSeg +class CLIPSegTextEmbeddings(nn.Layer): + def __init__(self, config: CLIPSegTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", + paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)), + persistable=False, + ) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = 
None, + inputs_embeds: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +# Copied from paddlenlp.transformers.clip.modeling.CLIPAttention with CLIP->CLIPSeg +class CLIPSegAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + causal_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + # get query proj + query_states = self.q_proj(hidden_states) * self.scale + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) + key_states = key_states.reshape(proj_shape) + value_states = value_states.reshape(proj_shape) + + src_len = key_states.shape[1] + attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1])) + + if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.shape}" + ) + + # apply the causal_attention_mask first + if causal_attention_mask is not None: + if causal_attention_mask.shape != [bsz, 1, tgt_len, src_len]: + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {causal_attention_mask.shape}" + ) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + causal_attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) + + if attention_mask is not None: + if attention_mask.shape != [bsz, 1, tgt_len, src_len]: + raise ValueError( + f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}" + ) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) 
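+        # At this point `attn_weights` holds the raw dot-product scores with shape
+        # [bsz * num_heads, tgt_len, src_len]; the causal mask and any padding mask
+        # have already been added as large negative biases, so the softmax below
+        # pushes masked positions toward zero probability.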
+ + attn_weights = nn.functional.softmax(attn_weights, axis=-1) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len]) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = paddle.bmm(attn_probs, value_states) + + if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: + raise ValueError( + f"`attn_output` should be of size {[bsz, self.num_heads, tgt_len, self.head_dim]}, but is" + f" {attn_output.shape}" + ) + + attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.reshape([bsz, tgt_len, embed_dim]) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped + + +# Copied from paddlenlp.transformers.clip.modeling.CLIPMLP with CLIP->CLIPSeg +class CLIPSegMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.clip.modeling.CLIPEncoderLayer with CLIP->CLIPSeg +class CLIPSegEncoderLayer(nn.Layer): + def __init__(self, config: CLIPSegConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPSegAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = CLIPSegMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + causal_attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class CLIPSegPreTrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CLIPSegConfig + base_model_prefix = "clip" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, CLIPSegTextEmbeddings): + normal_(module.token_embedding.weight, mean=0.0, std=factor * 0.02) + normal_(module.position_embedding.weight, mean=0.0, std=factor * 0.02) + elif isinstance(module, CLIPSegVisionEmbeddings): + factor = self.config.initializer_factor + normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, CLIPSegAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + normal_(module.q_proj.weight, std=in_proj_std) + normal_(module.k_proj.weight, std=in_proj_std) + normal_(module.v_proj.weight, std=in_proj_std) + normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, CLIPSegMLP): + factor = self.config.initializer_factor + in_proj_std = ( + (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + ) + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + normal_(module.fc1.weight, std=fc_std) + normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, CLIPSegModel): + normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + + if isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + if isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, CLIPSegEncoder): + module.enable_recompute = value + + +# Copied from paddlenlp.transformers.clip.modeling.CLIPEncoder with CLIP->CLIPSeg +class CLIPSegEncoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`CLIPSegEncoderLayer`]. 
+ Args: + config: CLIPSegConfig + """ + + def __init__(self, config: CLIPSegConfig): + super().__init__() + self.config = config + self.layers = nn.LayerList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + causal_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + causal_attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class CLIPSegTextTransformer(nn.Layer): + # Copied from paddlenlp.transformers.clip.modeling.CLIPTextTransformer.__init__ with CLIP->CLIPSeg + def __init__(self, config: CLIPSegTextConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + self.embeddings = CLIPSegTextEmbeddings(config) + self.encoder = CLIPSegEncoder(config) + self.final_layer_norm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + # For `pooled_output` computation + self.eos_token_id = config.eos_token_id + + # Copied from paddlenlp.transformers.clip.modeling.CLIPTextTransformer.forward with clip->clipseg, CLIP->CLIPSeg + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + raise ValueError("You have to specify input_ids") + + input_shape = input_ids.shape + input_ids = input_ids.reshape([-1, input_shape[-1]]) + + hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) + + bsz, seq_len = input_shape + # CLIPSeg's text model uses causal mask, prepare it here. 
+ # https://github.com/openai/CLIPSeg/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clipseg/model.py#L324 + causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.final_layer_norm(last_hidden_state) + + if self.eos_token_id == 2: + # The `eos_token_id` was incorrect before PR #24773: Let's keep what have been done here. + # A CLIPSeg model with such `eos_token_id` in the config can't work correctly with extra new tokens added + # ------------------------------------------------------------ + # text_embeds.shape = [batch_size, sequence_length, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + # casting to paddle.int32 for onnx compatibility: argmax doesn't support int64 inputs with opset 14 + pooled_output = last_hidden_state.gather_nd( + paddle.stack( + [paddle.arange(last_hidden_state.shape[0], dtype="int32"), input_ids.argmax(-1, dtype="int32")], + axis=-1, + ) + ) + else: + # The config gets updated `eos_token_id` from PR #24773 (so the use of exta new tokens is possible) + # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`) + pooled_output = last_hidden_state.gather_nd( + paddle.stack( + [ + paddle.arange(last_hidden_state.shape[0], dtype="int32"), + (input_ids == self.eos_token_id).cast("int32").argmax(axis=-1, dtype="int32"), + ], + axis=-1, + ) + ) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def _build_causal_attention_mask(self, bsz, seq_len, dtype): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = paddle.full([bsz, seq_len, seq_len], fill_value=-1e9, dtype=dtype) + mask = paddle.triu(mask, diagonal=1) # zero out the upper diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class CLIPSegTextModel(CLIPSegPreTrainedModel): + config_class = CLIPSegTextConfig + + _no_split_modules = ["CLIPSegEncoderLayer"] + + def __init__(self, config: CLIPSegTextConfig): + super().__init__(config) + self.text_model = CLIPSegTextTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.embeddings.token_embedding + + def set_input_embeddings(self, value): + self.text_model.embeddings.token_embedding = value + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + Examples: + ```python + >>> from paddlenlp.transformers import AutoTokenizer, CLIPSegTextModel + 
>>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPSegVisionTransformer(nn.Layer): + # Copied from paddlenlp.transformers.clip.modeling.CLIPVisionTransformer.__init__ with CLIP->CLIPSeg + def __init__(self, config: CLIPSegVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = CLIPSegVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + self.encoder = CLIPSegEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + + # Copied from paddlenlp.transformers.clip.modeling.CLIPVisionTransformer.forward + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class CLIPSegVisionModel(CLIPSegPreTrainedModel): + config_class = CLIPSegVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: CLIPSegVisionConfig): + super().__init__(config) + self.vision_model = CLIPSegVisionTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import AutoProcessor, CLIPSegVisionModel + >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = 
CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class CLIPSegModel(CLIPSegPreTrainedModel): + config_class = CLIPSegConfig + + def __init__(self, config: CLIPSegConfig): + super().__init__(config) + + if not isinstance(config.text_config, CLIPSegTextConfig): + raise ValueError( + "config.text_config is expected to be of type CLIPSegTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, CLIPSegVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = CLIPSegTextTransformer(text_config) + self.vision_model = CLIPSegVisionTransformer(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias_attr=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias_attr=False) + self.logit_scale = paddle.create_parameter( + (1,), + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(self.config.logit_scale_init_value), + ) + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Returns: + text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`CLIPSegTextModel`]. + Examples: + ```python + >>> from paddlenlp.transformers import AutoTokenizer, CLIPSegModel + >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pd") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. 
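+        # The pooled EOS-token state from the text transformer is mapped by
+        # `text_projection` to `config.projection_dim`; this embedding is later used
+        # for image-text similarity and as the conditional (FiLM) input to the
+        # segmentation decoder.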
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Returns: + image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`CLIPSegVisionModel`]. + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import AutoProcessor, CLIPSegModel + >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pd") + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPSegOutput]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import AutoProcessor, CLIPSegModel + >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pd", padding=True + ... 
) + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use CLIPSEG model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, axis=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, axis=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = paddle.matmul(text_embeds, image_embeds, transpose_y=True) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = clipseg_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPSegOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class CLIPSegDecoderLayer(nn.Layer): + """ + CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after + self-attention/MLP, rather than before. + """ + + # Copied from paddlenlp.transformers.clip.modeling.CLIPEncoderLayer.__init__ with CLIP->CLIPSeg + def __init__(self, config: CLIPSegConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = CLIPSegAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = CLIPSegMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + causal_attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = residual + hidden_states + hidden_states = self.layer_norm1(hidden_states) + + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + hidden_states = self.layer_norm2(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class CLIPSegDecoder(CLIPSegPreTrainedModel): + def __init__(self, config: CLIPSegConfig): + super().__init__(config) + + self.conditional_layer = config.conditional_layer + + self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim) + self.film_add = nn.Linear(config.projection_dim, config.reduce_dim) + + if config.use_complex_transposed_convolution: + transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4) + + self.transposed_convolution = nn.Sequential( + nn.Conv2D(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2DTranspose( + config.reduce_dim, + config.reduce_dim // 2, + kernel_size=transposed_kernels[0], + stride=transposed_kernels[0], + ), + nn.ReLU(), + nn.Conv2DTranspose( + config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1] + ), + ) + else: + self.transposed_convolution = nn.Conv2DTranspose( + config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size + ) + + depth = len(config.extract_layers) + self.reduces = nn.LayerList( + [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)] + ) + + decoder_config = copy.deepcopy(config.vision_config) + decoder_config.hidden_size = config.reduce_dim + decoder_config.num_attention_heads = config.decoder_num_attention_heads + decoder_config.intermediate_size = config.decoder_intermediate_size + decoder_config.hidden_act = "relu" + self.layers = nn.LayerList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))]) + + def forward( + self, + hidden_states: Tuple[paddle.Tensor], + conditional_embeddings: paddle.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = True, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + activations = hidden_states[::-1] + + output = None + for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)): + if output is not None: + output = reduce(activation) + output + else: + output = reduce(activation) + + if i == self.conditional_layer: + output = self.film_mul(conditional_embeddings) * output.transpose([1, 0, 2]) + self.film_add( + conditional_embeddings + ) + output = output.transpose([1, 0, 2]) + + layer_outputs = layer( + output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions + ) + + output = layer_outputs[0] + + if output_hidden_states: + all_hidden_states += (output,) + + if output_attentions: + all_attentions += (layer_outputs[1],) + + output = output[:, 1:, :].transpose( + [0, 2, 1] + ) # remove cls token and reshape to [batch_size, reduce_dim, seq_len] + + size = int(math.sqrt(output.shape[2])) + + batch_size = conditional_embeddings.shape[0] + output = output.reshape([batch_size, 
output.shape[1], size, size]) + + logits = self.transposed_convolution(output).squeeze() + + if not return_dict: + return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None) + + return CLIPSegDecoderOutput( + logits=logits, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel): + config_class = CLIPSegConfig + + def __init__(self, config: CLIPSegConfig): + super().__init__(config) + + self.config = config + + self.clip = CLIPSegModel(config) + self.extract_layers = config.extract_layers + + self.decoder = CLIPSegDecoder(config) + + def get_conditional_embeddings( + self, + batch_size: int = None, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + conditional_pixel_values: Optional[paddle.Tensor] = None, + ): + if input_ids is not None: + # compute conditional embeddings from texts + if len(input_ids) != batch_size: + raise ValueError("Make sure to pass as many prompt texts as there are query images") + with paddle.no_grad(): + conditional_embeddings = self.clip.get_text_features( + input_ids, attention_mask=attention_mask, position_ids=position_ids + ) + elif conditional_pixel_values is not None: + # compute conditional embeddings from images + if len(conditional_pixel_values) != batch_size: + raise ValueError("Make sure to pass as many prompt images as there are query images") + with paddle.no_grad(): + conditional_embeddings = self.clip.get_image_features(conditional_pixel_values) + else: + raise ValueError( + "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`" + ) + + return conditional_embeddings + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + pixel_values: Optional[paddle.Tensor] = None, + conditional_pixel_values: Optional[paddle.Tensor] = None, + conditional_embeddings: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CLIPSegOutput]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ Returns: + Examples: + ```python + >>> from paddlenlp.transformers import AutoProcessor, CLIPSegForImageSegmentation + >>> from PIL import Image + >>> import requests + >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["a cat", "a remote", "a blanket"] + >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pd") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> print(logits.shape) + [3, 352, 352] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the query images through the frozen CLIP vision encoder + with paddle.no_grad(): + vision_outputs = self.clip.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + pooled_output = self.clip.visual_projection(vision_outputs[1]) + + hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2] + # we add +1 here as the hidden states also include the initial embeddings + activations = [hidden_states[i + 1] for i in self.extract_layers] + + # update vision_outputs + if return_dict: + vision_outputs = BaseModelOutputWithPooling( + last_hidden_state=vision_outputs.last_hidden_state, + pooler_output=vision_outputs.pooler_output, + hidden_states=vision_outputs.hidden_states if output_hidden_states else None, + attentions=vision_outputs.attentions, + ) + else: + vision_outputs = ( + vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs + ) + + # step 2: compute conditional embeddings, either from text, images or an own provided embedding + if conditional_embeddings is None: + conditional_embeddings = self.get_conditional_embeddings( + batch_size=pixel_values.shape[0], + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + conditional_pixel_values=conditional_pixel_values, + ) + else: + if conditional_embeddings.shape[0] != pixel_values.shape[0]: + raise ValueError( + "Make sure to pass as many conditional embeddings as there are query images in the batch" + ) + if conditional_embeddings.shape[1] != self.config.projection_dim: + raise ValueError( + "Make sure that the feature dimension of the conditional embeddings matches" + " `config.projection_dim`." 
+ ) + + # step 3: forward both the pooled output and the activations through the lightweight decoder to predict masks + decoder_outputs = self.decoder( + activations, + conditional_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = decoder_outputs.logits if return_dict else decoder_outputs[0] + + loss = None + if labels is not None: + loss_fn = nn.BCEWithLogitsLoss() + loss = loss_fn(logits, labels) + + if not return_dict: + output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs) + return ((loss,) + output) if loss is not None else output + + return CLIPSegImageSegmentationOutput( + loss=loss, + logits=logits, + conditional_embeddings=conditional_embeddings, + pooled_output=pooled_output, + vision_model_output=vision_outputs, + decoder_output=decoder_outputs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/processing.py new file mode 100644 index 000000000..ddbdea9c2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/clipseg/processing.py @@ -0,0 +1,157 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The OpenAI Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Image/Text processor class for CLIPSeg +""" +import warnings + +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding + +__all__ = ["CLIPSegProcessor"] + + +class CLIPSegProcessor(ProcessorMixin): + r""" + Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor. + [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the + [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information. + Args: + image_processor ([`ViTImageProcessor`]): + The image processor is a required input. + tokenizer ([`CLIPTokenizerFast`]): + The tokenizer is a required input. 
+ """ + attributes = ["image_processor", "tokenizer"] + image_processor_class = "ViTImageProcessor" + tokenizer_class = "CLIPTokenizer" + + pretrained_init_configuration = { + "CIDAS/clipseg-rd64-refined": {"do_lower_case": True}, + } + + def __init__(self, image_processor=None, tokenizer=None, **kwargs): + if "feature_extractor" in kwargs: + warnings.warn( + "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" + " instead.", + FutureWarning, + ) + feature_extractor = kwargs.pop("feature_extractor") + + image_processor = image_processor if image_processor is not None else feature_extractor + if image_processor is None: + raise ValueError("You need to specify an `image_processor`.") + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(image_processor, tokenizer) + + def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=None, **kwargs): + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of + the above two methods for more information. + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PaddlePaddle + tensor. In case of a NumPy array/PaddlePaddle tensor, each image should be of shape (C, H, W), where C is a + number of channels, H and W are image height and width. + visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image, + NumPy array or PaddlePaddle tensor. In case of a NumPy array/PaddlePaddle tensor, each image should be of shape + (C, H, W), where C is a number of channels, H and W are image height and width. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'pd'`: Return PaddlePaddle `paddle.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
+ """ + if text is None and visual_prompt is None and images is None: + raise ValueError("You have to specify either text, visual prompt or images.") + + if text is not None and visual_prompt is not None: + raise ValueError("You have to specify exactly one type of prompt. Either text or visual prompt.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if visual_prompt is not None: + prompt_features = self.image_processor(visual_prompt, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if visual_prompt is not None and images is not None: + encoding = { + "pixel_values": image_features.pixel_values, + "conditional_pixel_values": prompt_features.pixel_values, + } + return encoding + elif text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + elif visual_prompt is not None: + encoding = { + "conditional_pixel_values": prompt_features.pixel_values, + } + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/configuration.py
new file mode 100644
index 000000000..abfb343d8
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/configuration.py
@@ -0,0 +1,120 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" CODEGEN model configuration"""
+from __future__ import annotations
+
+from paddlenlp.transformers.configuration_utils import PretrainedConfig
+
+__all__ = ["CODEGEN_PRETRAINED_INIT_CONFIGURATION", "CodeGenConfig", "CODEGEN_PRETRAINED_RESOURCE_FILES_MAP"]
+
+CODEGEN_PRETRAINED_INIT_CONFIGURATION = {}
+CODEGEN_PRETRAINED_RESOURCE_FILES_MAP = {"model_state": {}}
+
+
+class CodeGenConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a
+    CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the CodeGen
+    Salesforce/codegen-350M-mono architecture. Configuration objects
+    inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (int, optional):
+            Vocabulary size of `inputs_ids` in `CodeGenModel`, which is also the size of the token embedding matrix.
+            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `CodeGenModel`.
+            Defaults to `50400`.
+        n_embd (int, optional):
+            Dimensionality of the embedding layer, decoder layer. Defaults to `4096`.
+        n_layer (int, optional):
+            Number of hidden layers. Defaults to `28`.
+        n_head (int, optional):
+            Number of attention heads for each attention layer in the Transformer decoder.
+            Defaults to `16`.
+        n_ctx (int, optional):
+            Dimensionality of the causal mask (usually same as n_positions).
+            Defaults to `2048`.
+        n_positions (int, optional):
+            The maximum sequence length that this model might ever be used with.
+            Defaults to `2048`.
+        attn_pdrop (float, optional):
+            The dropout probability used in MultiHeadAttention in all decoder layers to drop some attention target.
+            Defaults to `0.0`.
+        resid_pdrop (float, optional):
+            The dropout probability for all residual layers in the decoder.
+            Defaults to `0.0`.
+        embd_pdrop (float, optional):
+            The dropout probability used in embedding layers. Defaults to `0.0`.
+        rotary_dim (int, optional):
+            Dimensionality of rotary position embeddings.
+            Defaults to `64`.
+        activation_function (str, optional):
+            The non-linear activation function in the feed-forward layer.
+            ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported.
+            Defaults to `"gelu_new"`.
+ layer_norm_epsilon (float, optional): + The epsilon to use in the layer normalization layers. + Defaults to `1e-05`. + initializer_range (float, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Default to `0.02`. + ```""" + model_type = "codegen" + pretrained_init_configuration = CODEGEN_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 50400, + bos_token_id: int = 1, + eos_token_id: int = 50256, + pad_token_id: int = 50256, + n_embd: int = 4096, + n_layer: int = 28, + n_head: int = 16, + n_ctx: int = 2048, + n_positions: int = 2048, + attn_pdrop: float = 0.0, + resid_pdrop: float = 0.0, + embd_pdrop: float = 0.0, + rotary_dim: int = 64, + activation_function: str = "gelu_new", + layer_norm_epsilon: float = 1e-05, + initializer_range: float = 0.02, + n_inner: int = None, + tie_word_embeddings: bool = False, + **kwargs, + ): + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.rotary_dim = rotary_dim + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/modeling.py new file mode 100644 index 000000000..2759b203c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/modeling.py @@ -0,0 +1,688 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Layer + +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .. 
import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from .configuration import ( + CODEGEN_PRETRAINED_INIT_CONFIGURATION, + CODEGEN_PRETRAINED_RESOURCE_FILES_MAP, + CodeGenConfig, +) + +CODEGEN_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "Salesforce/codegen-350M-nl", + "Salesforce/codegen-350M-multi", + "Salesforce/codegen-350M-mono", + "Salesforce/codegen-2B-nl", + "Salesforce/codegen-2B-multi", + "Salesforce/codegen-2B-mono", + "Salesforce/codegen-6B-nl", + "Salesforce/codegen-6B-multi", + "Salesforce/codegen-6B-mono", + "Salesforce/codegen-16B-nl", + "Salesforce/codegen-16B-multi", + "Salesforce/codegen-16B-mono", +] + + +def fixed_pos_embedding(x, seq_dim=1, seq_len=None): + dim = x.shape[-1] + if seq_len is None: + seq_len = x.shape[seq_dim] + inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2) / dim)) + sinusoid_inp = paddle.einsum("i,j->ij", paddle.arange(seq_len, dtype="float32"), inv_freq) + return paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp) + + +def rotate_every_two(x): + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x = paddle.stack((-x2, x1), axis=-1) + # In einsum notation: rearrange(x, '... d j -> ... (d j)') + return x.flatten(-2) + + +def duplicate_interleave(m): + return paddle.repeat_interleave(m, 2, axis=1) + + +def apply_rotary_pos_emb(x, sincos, offset=0): + sin, cos = map(lambda t: duplicate_interleave(t)[None, offset : x.shape[1] + offset, None, :], sincos) + # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) + return (x * cos) + (rotate_every_two(x) * sin) + + +class CodeGenAttention(Layer): + def __init__(self, config: CodeGenConfig): + super().__init__() + + self.causal_mask = paddle.tril( + paddle.ones((config.n_positions, config.n_positions), dtype=paddle.get_default_dtype()) + ).reshape((1, 1, config.n_positions, config.n_positions)) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.embed_dim = config.n_embd + self.num_attention_heads = config.n_head + self.head_dim = self.embed_dim // self.num_attention_heads + if self.head_dim * self.num_attention_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and" + f" `num_attention_heads`: {self.num_attention_heads})." 
+ ) + self.scale_attn = paddle.sqrt(paddle.to_tensor(self.head_dim, dtype="float32")) + self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias_attr=False) + + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias_attr=False) + self.rotary_dim = config.rotary_dim + + def _split_heads(self, x, n_head, dim_head, mp_num): + reshaped = x.reshape(x.shape[:-1] + [n_head // mp_num, dim_head]) + reshaped = reshaped.reshape(x.shape[:-2] + [-1] + reshaped.shape[-1:]) + return reshaped + + def _merge_heads(self, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into n_ctx + """ + if len(tensor.shape) == 5: + tensor = tensor.transpose([0, 1, 3, 2, 4]) + elif len(tensor.shape) == 4: + tensor = tensor.transpose([0, 2, 1, 3]) + else: + raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}") + new_shape = tensor.shape[:-2] + [num_attention_heads * attn_head_size] + return tensor.reshape(new_shape) + + def _attn(self, query, key, value, attention_mask=None): + + # compute causal mask from causal mask buffer + query_length, key_length = query.shape[-2], key.shape[-2] + causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length] + + # Keep the attention weights computation in fp32 to avoid overflow issues + query = paddle.cast(query, "float32") + key = paddle.cast(key, "float32") + attn_weights = paddle.matmul(query, key, transpose_y=True) + + attn_weights = attn_weights / self.scale_attn + mask_value = paddle.to_tensor(-1e4, dtype=attn_weights.dtype) + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. + attn_weights = paddle.where(causal_mask, attn_weights, mask_value) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = F.softmax(attn_weights, axis=-1, dtype=value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + attn_output = paddle.matmul(attn_weights, value) + + return attn_output, attn_weights + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + use_cache: Optional[bool] = False, + cache: Optional[Tuple[Tensor]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple: + qkv = self.qkv_proj(hidden_states) + mp_num = 4 + qkv_split = qkv.reshape(qkv.shape[:-1] + [mp_num, -1]) + + local_dim = qkv_split.shape[-1] // (self.head_dim * self.num_attention_heads // mp_num) + query, value, key = paddle.split(qkv_split, local_dim, axis=-1) + query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num) + key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num) + + value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num) + value = value.transpose([0, 2, 1, 3]) + + seq_len = key.shape[1] + offset = 0 + + if cache is not None: + offset = cache[0].shape[-2] + seq_len += offset + + if self.rotary_dim is not None: + k_rot = key[:, :, :, : self.rotary_dim] + k_pass = key[:, :, :, self.rotary_dim :] + + q_rot = query[:, :, :, : self.rotary_dim] + q_pass = query[:, :, :, self.rotary_dim :] + + sincos = fixed_pos_embedding(k_rot, 1, seq_len=seq_len) + k_rot = apply_rotary_pos_emb(k_rot, sincos, offset=offset) + q_rot = apply_rotary_pos_emb(q_rot, sincos, offset=offset) + + key = paddle.concat([k_rot, k_pass], axis=-1) + query = paddle.concat([q_rot, q_pass], axis=-1) + else: + sincos = 
fixed_pos_embedding(key, 1, seq_len=seq_len) + key = apply_rotary_pos_emb(key, sincos, offset=offset) + query = apply_rotary_pos_emb(query, sincos, offset=offset) + + key = key.transpose([0, 2, 1, 3]) + query = query.transpose([0, 2, 1, 3]) + + if cache is not None: + past_key = cache[0] + past_value = cache[1] + key = paddle.concat((past_key, key), axis=-2) + value = paddle.concat((past_value, value), axis=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn(query, key, value, attention_mask) + + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + if output_attentions: + return attn_output, present, attn_weights + return attn_output, present + + +class CodeGenMLP(Layer): + def __init__(self, config: CodeGenConfig): + super().__init__() + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd + self.fc_in = nn.Linear(config.n_embd, inner_dim) + self.fc_out = nn.Linear(inner_dim, config.n_embd) + + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Tensor) -> Tensor: + hidden_states = self.fc_in(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.fc_out(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class CodeGenBlock(Layer): + def __init__(self, config: CodeGenConfig): + super().__init__() + self.ln_1 = nn.LayerNorm(config.n_embd, epsilon=config.layer_norm_epsilon) + self.attn = CodeGenAttention(config) + self.mlp = CodeGenMLP(config) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + use_cache: Optional[bool] = False, + cache: Optional[Tuple[Tensor]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + attention_mask=attention_mask, + cache=cache, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + + feed_forward_hidden_states = self.mlp(hidden_states) + hidden_states = attn_output + feed_forward_hidden_states + residual + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, (present, attentions) outputs is a tuple + + +class CodeGenPreTrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = CODEGEN_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = CODEGEN_PRETRAINED_RESOURCE_FILES_MAP + config_class = CodeGenConfig + base_model_prefix = "transformer" + + def _init_weights(self, layer): + """Initialize the weights.""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor) and paddle.get_default_dtype() == "float32": + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.full_like(layer.weight, 1.0)) + layer._epsilon = self.config.layer_norm_epsilon + if isinstance(layer, nn.Linear) and layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + + +@register_base_model +class CodeGenModel(CodeGenPreTrainedModel): + r""" + The bare CodeGen Model outputting raw hidden-states. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + Args: + config (:class:`CodeGenConfig`): + An instance of CodeGenConfig used to construct CodeGenModel. + """ + + def __init__(self, config: CodeGenConfig): + super().__init__(config) + + self.vocab_size = config.vocab_size + self.bos_token_id = config.bos_token_id + self.pad_token_id = config.pad_token_id + self.eos_token_id = config.eos_token_id + self.embed_dim = config.n_embd + self.initializer_range = config.initializer_range + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.LayerList([CodeGenBlock(config) for _ in range(config.n_layer)]) + self.ln_f = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.n_head) + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Tensor]]] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + The CodeGenModel forward method, overrides the `__call__()` special method. + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. 
+ When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + cache (list, optional): + It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. + See `TransformerDecoder.gen_cache `__ for more details. + It is only used for inference and should be None for training. + Default to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. + If `False`, the output will be a tuple of tensors. Defaults to `False`. + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=None`, + returns a tensor representing the output of :class:`CodeGenModel`. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import CodeGenModel, CodeGenTokenizer + tokenizer = CodeGenTokenizer.from_pretrained('Salesforce/codegen-350M-mono') + model = CodeGenModel.from_pretrained('Salesforce/codegen-350M-mono') + inputs = tokenizer("def hello_world():", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape((-1, input_shape[-1])) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if cache is None: + past_length = 0 + cache = tuple([None] * len(self.h)) + else: + past_length = cache[0][0].shape[-2] + + # Attention mask. + if attention_mask is None: + if input_ids is not None: + if batch_size == 1 and past_length != 0: + batch_size, seq_len = input_shape + attention_mask = paddle.zeros( + [batch_size, 1, 1, seq_len + past_length], dtype=paddle.get_default_dtype() + ) + else: + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) + * -1e4 + ) + else: + logger.warning( + "Provided inputs_embeds while attention_mask is None, attention weights will not be masked during forwarding." + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + if attention_mask is not None: + attention_mask.stop_gradient = True + # TODO: CodeGen Attention Mask is TOO confusion. + # When it's 2D, it must be int and it's denoted by 1/0. + # When using model.generate() without providing attention mask + # or using 4D attention mask, + # the attention mask's dtype must be float and it's denoted by 0/-inf. + # Moreover, cannot support 3D attention mask. 
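+        # Worked example of the 2D branch above: a tokenizer mask such as
+        # paddle.to_tensor([[1, 1, 1, 0]]) is unsqueezed to shape
+        # [batch_size, 1, 1, seq_len] and mapped to [[[[0.0, 0.0, 0.0, -10000.0]]]],
+        # so masked positions become a large negative bias that is added to the
+        # attention scores before the softmax in CodeGenAttention._attn.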
+ + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + inputs_embeds = inputs_embeds + token_type_embeds + + hidden_states = self.drop(inputs_embeds) + output_shape = input_shape[:] + [hidden_states.shape[-1]] + + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + for i, (block, old_cache) in enumerate(zip(self.h, cache)): + if output_hidden_states: + all_hidden_states += (hidden_states,) + outputs = block( + hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + cache=old_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + if use_cache: + presents = presents + (outputs[1],) + if output_attentions: + all_self_attentions += (outputs[-1],) + + hidden_states = self.ln_f(hidden_states) + hidden_states = hidden_states.reshape(shape=output_shape) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + last_hidden_state = hidden_states + new_cache = presents + + if not return_dict: + temp_list = [ + last_hidden_state, + new_cache, + all_hidden_states, + all_self_attentions, + ] + return tuple(v for v in temp_list if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=last_hidden_state, + past_key_values=new_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=None, + ) + + +class CodeGenForCausalLM(CodeGenPreTrainedModel): + r""" + CodeGen Model with a `language modeling` head on top. + Args: + config (:class:`CodeGenConfig`): + An instance of CodeGenConfig used to construct CodeGenForCausalLM. + """ + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] + + def __init__(self, config: CodeGenConfig): + super().__init__(config) + self.transformer = CodeGenModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterCodeGen + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decoding_lib = kwargs.get("decoding_lib", None) + decode_strategy = kwargs.get("decode_strategy") + if decode_strategy == "beam_search": + raise AttributeError("'beam_search' is not supported yet in the fast version of GPTJ") + # Currently, FasterTransformer only support restricted size_per_head. 
+ size_per_head = self.transformer.config.n_embd // self.transformer.config.n_head + if size_per_head not in [32, 64, 80, 96, 128, 160, 192, 224, 256]: + raise AttributeError( + "'size_per_head = %d' is not supported yet in the fast version of GPTJ" % size_per_head + ) + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + self._fast_entry = FasterCodeGen(self, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def prepare_inputs_for_generation(self, input_ids, cache=None, **kwargs): + # only last token for inputs_ids if past is defined in kwargs + token_type_ids = kwargs.get("token_type_ids", None) + + if cache: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + if attention_mask is not None: + if len(attention_mask.shape) == 4: + attention_mask = attention_mask[:, :, -1:, :] + + return { + "input_ids": input_ids, + "cache": cache, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Tensor]]] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + The CodeGenForCausalLM forward method, overrides the __call__() special method. + Args: + input_ids (Tensor, optional): + See :class:`CodeGenModel`. + attention_mask (Tensor, optional): + See :class:`CodeGenModel`. + use_cache (bool, optional): + See :class:`CodeGenModel`. + cache (Tensor, optional): + See :class:`CodeGenModel`. + labels: (Tensor, optional): + Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]` + inputs_embeds (Tensor, optional): + See :class:`CodeGenModel`. + output_attentions (bool, optional): + See :class: `CodeGenModel`. + output_hidden_states (bool, optional): + See :class: `CodeGenModel`. + return_dict (bool, optional): + See :class: `CodeGenModel`. + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=labels=None`, + returns tensor `lm_logits` of shape [batch_size, sequence_length, vocab_size], + + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import CodeGenForCausalLM, CodeGenTokenizer + tokenizer = CodeGenTokenizer.from_pretrained('Salesforce/codegen-350M-mono') + model = CodeGenForCausalLM.from_pretrained('Salesforce/codegen-350M-mono') + inputs = tokenizer("def hello_world():", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + transformer_outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179 + lm_logits = paddle.cast(self.lm_head(hidden_states), "float32") + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[:, :-1, :] + shift_labels = labels[:, 1:] + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.reshape((-1, shift_logits.shape[-1])), shift_labels.reshape((-1,))) + + if not return_dict: + # if isinstance(transformer_outputs, type(input_ids)): + # return (loss, lm_logits) if loss is not None else lm_logits + outputs = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + outputs) if loss is not None else outputs + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/tokenizer.py new file mode 100644 index 000000000..2bc72cfcc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/codegen/tokenizer.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.utils import try_import +from .. 
import GPTTokenizer + +__all__ = ["CodeGenTokenizer"] + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +class CodeGenTokenizer(GPTTokenizer): + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_resource_files_map = {"vocab_file": {}, "merges_file": {}} + pretrained_init_configuration = {} + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + max_len=None, + pad_token="<|endoftext|>", + eos_token="<|endoftext|>", + unk_token="<|endoftext|>", + eol_token="\u010a", + **kwargs + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + errors=errors, + max_len=max_len, + pad_token=pad_token, + eos_token=eos_token, + unk_token=unk_token, + eol_token=eol_token, + **kwargs, + ) + + def decode( + self, + token_ids, + skip_special_tokens=False, + clean_up_tokenization_spaces=True, + truncate_before_pattern=None, + **kwargs + ): + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + + Args: + token_ids (`Union[int, List[int], np.ndarray, paddle.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + truncate_before_pattern (`List[str]`, *optional*, defaults to `None`): + A list of regular expression strings that will be used to truncate the returned string. This can be + used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning + of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `str`: The decoded sentence. 
+ """ + + decoded_text = super()._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + if truncate_before_pattern is not None and len(truncate_before_pattern) > 0: + decoded_text = self.truncate(decoded_text, truncate_before_pattern) + + return decoded_text + + def truncate(self, completion, truncate_before_pattern): + def find_re(string, pattern, start_pos): + m = pattern.search(string, start_pos) + return m.start() if m else -1 + + re = try_import("regex") + terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern] + + prints = list(re.finditer("^print", completion, re.MULTILINE)) + + if len(prints) > 1: + completion = completion[: prints[1].start()] + + defs = list(re.finditer("^def", completion, re.MULTILINE)) + + if len(defs) > 1: + completion = completion[: defs[1].start()] + + start_pos = 0 + + terminals_pos = [ + pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1 + ] + + if len(terminals_pos) > 0: + return completion[: min(terminals_pos)] + else: + return completion diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/configuration_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/configuration_utils.py new file mode 100644 index 000000000..ebb905a68 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/configuration_utils.py @@ -0,0 +1,1231 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" +from __future__ import annotations + +import copy +import inspect +import json +import os +import re +import shutil +import sys +import warnings +from dataclasses import field +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +import paddle +from huggingface_hub import hf_hub_download +from huggingface_hub.utils import EntryNotFoundError + +from .. import __version__ +from ..quantization.quantization_config import QuantizationConfig +from ..utils import CONFIG_NAME, LEGACY_CONFIG_NAME +from ..utils.download import resolve_file_path +from ..utils.downloader import hf_file_exists +from ..utils.log import logger + +_re_configuration_file = re.compile(r"config\.(.*)\.json") + + +def custom_object_save(obj, folder, config=None): + """ + Save the modeling files corresponding to a custom model/configuration/tokenizer etc. in a given folder. Optionally + adds the proper fields in a config. + + Args: + obj (`Any`): The object for which to save the module files. + folder (`str` or `os.PathLike`): The folder where to save. 
+ config (`PretrainedConfig` or dictionary, `optional`): + A config in which to register the auto_map corresponding to this custom object. + """ + if obj.__module__ == "__main__": + logger.warning( + f"We can't save the code defining {obj} in {folder} as it's been defined in __main__. You should put " + "this code in a separate module so we can include it in the saved folder and make it easier to share via " + "the Hub." + ) + + def _set_auto_map_in_config(_config): + module_name = obj.__class__.__module__ + last_module = module_name.split(".")[-1] + full_name = f"{last_module}.{obj.__class__.__name__}" + if isinstance(_config, dict): + auto_map = _config.get("auto_map", {}) + auto_map[obj._auto_class] = full_name + _config["auto_map"] = auto_map + elif getattr(_config, "auto_map", None) is not None: + _config.auto_map[obj._auto_class] = full_name + else: + _config.auto_map = {obj._auto_class: full_name} + + # Add object class to the config auto_map + if isinstance(config, (list, tuple)): + for cfg in config: + _set_auto_map_in_config(cfg) + elif config is not None: + _set_auto_map_in_config(config) + + # Copy module file to the output folder. + object_file = sys.modules[obj.__module__].__file__ + dest_file = Path(folder) / (Path(object_file).name) + shutil.copy(object_file, dest_file) + + # Gather all relative imports recursively and make sure they are copied as well. + # TODO(wujingjing): `get_relative_import_files` havn't supported yet. + # for needed_file in get_relative_import_files(object_file): + # dest_file = Path(folder) / (Path(needed_file).name) + # shutil.copy(needed_file, dest_file) + + +def attribute_map(config: PretrainedConfig, kwargs: Dict[str, Any]) -> Dict[str, Any]: + """map the to with configuration + + Args: + config (PretrainedConfig): the instance of PretrainedConfig + kwargs (Dict[str, Any]): the kwargs of attribute + """ + for old_key, new_key in config.attribute_map.items(): + if old_key in kwargs: + if new_key in kwargs: + logger.warning(f"receive param<{old_key}> and param<{new_key}>, but the first one will be adopt") + kwargs[new_key] = kwargs.pop(old_key) + return kwargs + + +def convert_to_legacy_config(attribute_map: Dict[str, str], config: Dict[str, Any]) -> Dict[str, Any]: + """ + works when there are different fields between huggingface and paddle + Args: + attribute_map (Dict[str, str]): mapping of between standard config and paddle config + config (Dict[str, Any]): config of huggingface transformers models + Returns: the config which can be mapped into config of paddle model + """ + if "init_args" in config: + args = [] + for init_arg in config["init_args"]: + init_arg = convert_to_legacy_config(attribute_map, init_arg) + args.append(init_arg) + config["init_args"] = args + + # TODO(wj-Mcat): to improve compatibility for: old local config and new PretrainedConfig, eg: + # { "init_args": [], "init_class": "", "num_classes": 12 } + for standard_field, paddle_field in attribute_map.items(): + value = config.pop(standard_field, None) or config.pop(paddle_field, None) + if value is not None: + config[paddle_field] = value + return config + + +def flatten_model_config(config: dict) -> dict: + """flatten the model config which can be old-style model config + + Args: + config (dict): the source of config which can be flatten config or nest config + + Returns: + dict: the flatten config + """ + # 1. 
extract the init_args into the top level + init_args = config.pop("init_args", []) + + index = 0 + while index < len(init_args): + if isinstance(init_args[index], dict): + for key, value in init_args[index].items(): + if key not in config: + config[key] = value + init_args.pop(index) + else: + index += 1 + + if init_args: + config["init_args"] = init_args + + # 2. convert `init_class` into `architectures` + if "init_class" in config: + config["architectures"] = [config.pop("init_class")] + + return config + + +def is_standard_config(config: Union[PretrainedConfig, Dict[str, Any]]) -> bool: + """ + check whether the config is standard + Args: + config: the dict data of config + """ + if isinstance(config, PretrainedConfig): + return True + + return "init_class" not in config and "architectures" in config + + +def resolve_hf_config_path(repo_id: str, cache_dir: str, subfolder=None) -> str: + """resolve config file from hf hub + + Args: + repo_id (str): the repo name from huggingface hub + cache_dir (str): the cachedir + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + Returns: + str: the downloaded config file + """ + if hf_file_exists(repo_id=repo_id, filename=CONFIG_NAME, subfolder=subfolder): + file_name = CONFIG_NAME + else: + raise EntryNotFoundError(f"can not find the paddle/pytorch config file from: https://huggingface.co/{repo_id}") + + return hf_hub_download( + repo_id=repo_id, + filename=file_name, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + + +def set_expected_keys(config, llm_meta, kwargs): + for key, value in llm_meta.items(): + if key in kwargs: + value = kwargs.pop(key) + setattr(config, key, value) + + return kwargs + + +def llmmetaclass(cls): + # https://github.com/python/cpython/blob/2b091b9aa9a6ca5e2a34654dde909c5bdfc52fa8/Lib/dataclasses.py#L970C31-L970C46 + llm_meta = LlmMetaConfig._get_all_meta() + + for name, datatype, default_value, comment in llm_meta: + if not hasattr(cls, name): + value = field( + default=default_value, + metadata={"help": comment}, + ) + setattr(cls, name, value) + cls.__annotations__[name] = datatype + + return cls + + +class LlmMetaConfig: + op_fusion_attributes = [ + # name, type, default_value, comment + ("use_flash_attention", bool, False, "Whether to use flash attention to accelerate training."), + ("use_fused_rms_norm", bool, False, "llama or other model, use_fused_rms_norm"), + ("use_fused_rope", bool, False, "Enable rope fusion or not."), + ("use_fused_linear", bool, False, "GPT3 model, use fused linear layer"), + ("use_fused_dropout_add", bool, False, "GPT3 model, use fused `dropout + residual add` op."), + ] + + hybrid_parallel_attributes = [ + # tensor_parallel + ("tensor_parallel_degree", int, 1, "tensor_parallel_degree"), + ("tensor_parallel_rank", int, 0, "tensor_parallel_rank"), + ("tensor_parallel_output", bool, True, "tensor_parallel_output"), + # pipeline_parallel + ("pipeline_parallel_degree", int, 1, "pipeline_parallel_degree"), + ("virtual_pp_degree", int, 1, "Virtual pipeline degree"), + # pp refine recompute + ("no_recompute_layers", Optional[List[int]], None, "no_recompute_layers"), + ( + "pp_recompute_interval", + int, + 1, + "The interval for the number of layers at which recomputation occurs. A value of 0 indicates no recomputation. 
Default is 0.", + ), + # sep_parallel + ("sep_parallel_degree", int, 1, "sep_parallel_degree"), + ("context_parallel_degree", int, 1, "context_parallel_degree"), + ("sequence_parallel", bool, False, "Whether to use sequence parallel"), + ("fuse_sequence_parallel_allreduce", bool, False, "Whether to use fuse sequence parallel allreduce"), + ] + recompute_attributes = [ + ("recompute", bool, False, "recompute"), + ( + "recompute_granularity", + str, + "full", + "Recompute granularity, Choose among ['full', 'core_attn', 'full_attn']", + ), + ("recompute_use_reentrant", bool, False, "recompute_use_reentrant"), + ] + + @classmethod + def _get_defaults(cls): + ret = {} + for attrs in [ + cls.op_fusion_attributes, + cls.hybrid_parallel_attributes, + cls.recompute_attributes, + ]: + for attr in attrs: + # return dict of key and default values + ret[attr[0]] = attr[2] + return ret + + @classmethod + def _get_all_meta(cls): + ret = [] + for attrs in [ + cls.op_fusion_attributes, + cls.hybrid_parallel_attributes, + cls.recompute_attributes, + ]: + for attr in attrs: + # return dict of key and default values + ret.append(attr) + return ret + + @classmethod + def _get_unsavable_keys(cls): + ret = set() + for attrs in [ + cls.op_fusion_attributes, + cls.hybrid_parallel_attributes, + cls.recompute_attributes, + ]: + for attr in attrs: + ret.add(attr[0]) + return ret + + @classmethod + def set_llm_config(cls, config, args): + for key, value in cls._get_defaults().items(): + setattr(config, key, getattr(args, key, value)) + + +class PretrainedConfig: + r""" + Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as + methods for loading/downloading/saving configurations. + + + + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to + initialize a model does **not** load the model weights. It only affects the model's configuration. + + + + Class attributes (overridden by derived classes): + + - **model_type** (`str`) -- An identifier for the model type, serialized into the JSON file, and used to recreate + the correct object in [`~paddlenlp.AutoConfig`]. + - **is_composition** (`bool`) -- Whether the config class is composed of multiple sub-configs. In this case the + config has to be initialized from two or more configs of type [`~paddlenlp.PretrainedConfig`] like: + [`~paddlenlp.EncoderDecoderConfig`] or [`~RagConfig`]. + - **keys_to_ignore_at_inference** (`List[str]`) -- A list of keys to ignore by default when looking at dictionary + outputs of the model during inference. + - **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized + naming of attributes. + + Common attributes (present in all subclasses): + + - **vocab_size** (`int`) -- The number of tokens in the vocabulary, which is also the first dimension of the + embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT). + - **hidden_size** (`int`) -- The hidden size of the model. + - **num_attention_heads** (`int`) -- The number of attention heads used in the multi-head attention layers of the + model. + - **num_hidden_layers** (`int`) -- The number of blocks in the model. + + Arg: + name_or_path (`str`, *optional*, defaults to `""`): + Store the string that was passed to [`PreTrainedModel.from_pretrained`] or + [`PreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path` if the configuration was created + with such a method. 
+ output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not the model should return all hidden-states. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not the model should returns all attentions. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not the model should return a [`~paddlenlp.transformers.model_outputs.ModelOutput`] instead of a plain tuple. + is_encoder_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as an encoder/decoder or not. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as decoder or not (in which case it's used as an encoder). + cross_attention_hidden_size** (`bool`, *optional*): + The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder + setting and the cross-attention hidden dimension differs from `self.config.hidden_size`. + add_cross_attention (`bool`, *optional*, defaults to `False`): + Whether cross-attention layers should be added to the model. Note, this option is only relevant for models + that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models + in `AUTO_MODELS_FOR_CAUSAL_LM`. + tie_encoder_decoder (`bool`, *optional*, defaults to `False`): + Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder + and decoder model to have the exact same parameter names. + prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`): + Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of + heads to prune in said layer. + + For instance `{1: [0, 2], 2: [2, 3]}` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. + chunk_size_feed_forward (`int`, *optional*, defaults to `0`): + The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that + the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` < + sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed + Forward Chunking work?](../glossary.html#feed-forward-chunking). + + > Parameters for sequence generation + + max_length (`int`, *optional*, defaults to 20): + Maximum length that will be used by default in the `generate` method of the model. + min_length (`int`, *optional*, defaults to 10): + Minimum length that will be used by default in the `generate` method of the model. + do_sample (`bool`, *optional*, defaults to `False`): + Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling ; + use greedy decoding otherwise. + early_stopping (`bool`, *optional*, defaults to `False`): + Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search + when at least `num_beams` sentences are finished per batch or not. + num_beams (`int`, *optional*, defaults to 1): + Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means + no beam search. + num_beam_groups (`int`, *optional*, defaults to 1): + Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams + that will be used by default in the `generate` method of the model. 1 means no group beam search. 
+ diversity_penalty (`float`, *optional*, defaults to 0.0): + Value to control diversity for group beam search. that will be used by default in the `generate` method of + the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs. + temperature (`float`, *optional*, defaults to 1): + The value used to module the next token probabilities that will be used by default in the `generate` method + of the model. Must be strictly positive. + top_k (`int`, *optional*, defaults to 50): + Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in + the `generate` method of the model. + top_p (`float`, *optional*, defaults to 1): + Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1, + only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation. + repetition_penalty (`float`, *optional*, defaults to 1): + Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0 + means no penalty. + length_penalty (`float`, *optional*, defaults to 1): + Exponential penalty to the length that will be used by default in the `generate` method of the model. + no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by default in the + `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can + only occur once. + encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0) -- Value that will be used by + default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all + ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`. + bad_words_ids (`List[int]`, *optional*): + List of token ids that are not allowed to be generated that will be used by default in the `generate` + method of the model. In order to get the tokens of the words that should not appear in the generated text, + use `tokenizer.encode(bad_word, add_prefix_space=True)`. + num_return_sequences (`int`, *optional*, defaults to 1): + Number of independently computed returned sequences for each element in the batch that will be used by + default in the `generate` method of the model. + output_scores (`bool`, *optional*, defaults to `False`): + Whether the model should return the logits when used for generation. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether the model should return a [`~paddlenlp.transformers.model_outputs.ModelOutput`] instead of a `paddlenlp.Tensor`. + forced_bos_token_id (`int`, *optional*): + The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for + multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target + language token. + forced_eos_token_id (`int`, *optional*): + The id of the token to force as the last generated token when `max_length` is reached. + remove_invalid_values (`bool`, *optional*): + Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash. + Note that using `remove_invalid_values` can slow down generation. + + > Parameters for fine-tuning tasks + + architectures (`List[str]`, *optional*): + Model architectures that can be used with the model pretrained weights. + finetuning_task (`str`, *optional*): + Name of the task used to fine-tune the model. 
This can be used when converting from an original checkpoint. + id2label (`Dict[int, str]`, *optional*): + A map from index (for instance prediction index, or target index) to label. + label2id (`Dict[str, int]`, *optional*): A map from label to index for the model. + num_labels (`int`, *optional*): + Number of labels to use in the last layer added to the model, typically for a classification task. + task_specific_params (`Dict[str, Any]`, *optional*): + Additional keyword arguments to store for the current task. + problem_type (`str`, *optional*): + Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`, + `"single_label_classification"` or `"multi_label_classification"`. + + > Parameters linked to the tokenizer + + tokenizer_class (`str`, *optional*): + The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the + model by default). + prefix (`str`, *optional*): + A specific prompt that should be added at the beginning of each text before calling the model. + bos_token_id (`int`, *optional*): The id of the _beginning-of-stream_ token. + pad_token_id (`int`, *optional*): The id of the _padding_ token. + eos_token_id (`int`, *optional*): The id of the _end-of-stream_ token. + decoder_start_token_id (`int`, *optional*): + If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. + sep_token_id (`int`, *optional*): The id of the _separation_ token. + + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has a output word embedding layer. + dtype (`str`, *optional*): + The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype` + (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved + model is `float16`, ideally we want to load it back using the minimal amount of memory needed to load + `float16` weights. Since the config object is stored in plain text, this attribute contains just the + floating type string without the `paddle.` prefix. For example, for `paddle.float16` ``dtype` is the + `"float16"` string. + + This attribute is currently not being used during model loading time, but this may change in the future + versions. But we can already start preparing for the future by saving the dtype with save_pretrained. + """ + model_type: str = "" + is_composition: bool = False + + # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + # TODO(wj-Mcat): this comment should be removed after this feature is accepted by PaddleNLP teams + # `pretrained_init_configuration` can be `dict` or `url`: eg: + # { + # "bert-base-uncased": { + # "vocab_size": 30522, + # "hidden_size": 768, + # }, + # "bert-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/model_config.json" + # } + # + # advantages: + # 1. reuse the concept: `pretrained_init_configuration` and extend it + # 2. 
make code more concise when support resource file + # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + pretrained_init_configuration = {} + + # global attribute mapping + attribute_map: Dict[str, str] = {"num_classes": "num_labels"} + + _auto_class: Optional[str] = None + + # Fix me, it is global for all config + _unsavable_keys = set() + + def __setattr__(self, key, value): + if key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + super().__setattr__(key, value) + assert hasattr(self, key) + + def __getattribute__(self, key): + if key != "attribute_map" and key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + return super().__getattribute__(key) + + def __getitem__(self, key): + return getattr(self, key, None) + + def __setitem__(self, key, value): + if hasattr(self, key): + setattr(self, key, value) + + def __init__(self, **kwargs): + # Attributes with defaults + # map the old attr to new atr, eg: num_classes -> num_labels + kwargs = attribute_map(self, kwargs=kwargs) + kwargs.pop("transformers_version", None) + llm_meta = LlmMetaConfig._get_defaults() + self._unsavable_keys.update(LlmMetaConfig._get_unsavable_keys()) + self._unsavable_keys.remove("tensor_parallel_degree") + + kwargs = set_expected_keys(self, llm_meta, kwargs) + if self.sequence_parallel: + assert ( + self.tensor_parallel_degree > 1 + ), f"senquence-parallel only works in tensor parallel, got tensor parallel degree={self.tensor_parallel_degree}" + + self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0) + self.return_dict = kwargs.pop("return_dict", False) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_attentions = kwargs.pop("output_attentions", False) + self.use_cache = kwargs.pop("use_cache", False) + + # for transformers fuse + self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False) + self.fuse_attention_ffn = kwargs.pop("fuse_attention_ffn", False) + + if "quantization_config" in kwargs and isinstance(kwargs["quantization_config"], Dict): + kwargs["quantization_config"] = QuantizationConfig.from_dict(kwargs["quantization_config"]) + self.quantization_config = kwargs.pop("quantization_config", QuantizationConfig()) + + self.pruned_heads = kwargs.pop("pruned_heads", {}) + self.tie_word_embeddings = kwargs.pop( + "tie_word_embeddings", True + ) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models. + + # parameter for model dtype + if "torch_dtype" in kwargs: + self.dtype = kwargs.pop("torch_dtype") + else: + self.dtype = kwargs.pop("dtype", paddle.get_default_dtype()) + + # Is decoder is used in encoder-decoder models to differentiate encoder from decoder + self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) + self.is_decoder = kwargs.pop("is_decoder", False) + self.cross_attention_hidden_size = kwargs.pop("cross_attention_hidden_size", None) + self.add_cross_attention = kwargs.pop("add_cross_attention", False) + self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False) + + # Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these + # parameters, saving them will be deprecated. In a distant future, we won't need to load them. 
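The `attribute_map` redirection implemented by the `__setattr__`/`__getattribute__` overrides above is easy to miss when reading the class top to bottom. Below is a minimal, hedged sketch of the intended aliasing behaviour; it assumes `PretrainedConfig` is importable from `paddlenlp.transformers` as in a regular PaddleNLP install.

```python
# Sketch only: demonstrates the attribute_map aliasing defined above, where the
# legacy key `num_classes` is transparently redirected to `num_labels`.
from paddlenlp.transformers import PretrainedConfig

config = PretrainedConfig(num_classes=5)  # legacy kwarg, remapped in __init__ via attribute_map()
print(config.num_labels)    # 5 -> stored under the canonical name
print(config.num_classes)   # 5 -> reads are redirected by __getattribute__
config.num_classes = 3      # writes are redirected by __setattr__
print(config.num_labels)    # 3
```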
+ for parameter_name, default_value in self._get_generation_defaults().items(): + setattr(self, parameter_name, kwargs.pop(parameter_name, default_value)) + + # Fine-tuning task arguments + self.architectures = kwargs.pop("architectures", None) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.id2label = kwargs.pop("id2label", None) + self.label2id = kwargs.pop("label2id", None) + if self.id2label is not None: + num_labels = kwargs.pop("num_labels", None) + if num_labels is not None and len(self.id2label) != num_labels: + logger.warning( + f"You passed along `num_labels={num_labels}` with an incompatible id to label map: " + f"{self.id2label}. The number of labels wil be overwritten to {self.num_labels}." + ) + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + # Keys are always strings in JSON so convert ids to int here. + else: + self.num_labels = kwargs.pop("num_labels", 2) + self.num_choices = kwargs.pop("num_choices", None) + + self.classifier_dropout = kwargs.pop("classifier_dropout", None) + + # Tokenizer arguments TODO: eventually tokenizer and models should share the same config + self.tokenizer_class = kwargs.pop("tokenizer_class", None) + self.prefix = kwargs.pop("prefix", None) + self.bos_token_id = kwargs.pop("bos_token_id", None) + self.pad_token_id = kwargs.pop("pad_token_id", None) + self.eos_token_id = kwargs.pop("eos_token_id", None) + self.sep_token_id = kwargs.pop("sep_token_id", None) + + self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) + + # task specific arguments + self.task_specific_params = kwargs.pop("task_specific_params", None) + + # regression / multi-label classification + self.problem_type = kwargs.pop("problem_type", None) + allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification") + if self.problem_type is not None and self.problem_type not in allowed_problem_types: + raise ValueError( + f"The config parameter `problem_type` was not understood: received {self.problem_type} " + "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid." + ) + + # Name or path to the pretrained checkpoint + self._name_or_path = str(kwargs.pop("name_or_path", "")) + + # Drop the transformers version info + self.paddlenlp_version = kwargs.pop("paddlenlp_version", None) + + # Deal with gradient checkpointing + if kwargs.get("gradient_checkpointing", False): + warnings.warn( + "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 " + "Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the " + "`Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`." 
+ ) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + @staticmethod + def _get_generation_defaults() -> Dict[str, Any]: + return { + "max_length": 20, + "min_length": 0, + "do_sample": False, + "early_stopping": False, + "num_beams": 1, + "num_beam_groups": 1, + "diversity_penalty": 0.0, + "temperature": 1.0, + "top_k": 50, + "top_p": 1.0, + "typical_p": 1.0, + "repetition_penalty": 1.0, + "length_penalty": 1.0, + "no_repeat_ngram_size": 0, + "encoder_no_repeat_ngram_size": 0, + "bad_words_ids": None, + "num_return_sequences": 1, + "output_scores": False, + "return_dict_in_generate": False, + "forced_bos_token_id": None, + "forced_eos_token_id": None, + "remove_invalid_values": False, + "exponential_decay_length_penalty": None, + "suppress_tokens": None, + "begin_suppress_tokens": None, + } + + def _has_non_default_generation_parameters(self) -> bool: + """ + Whether or not this instance holds non-default generation parameters. + """ + for parameter_name, default_value in self._get_generation_defaults().items(): + if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value: + return True + return False + + @property + def name_or_path(self) -> str: + return getattr(self, "_name_or_path", None) + + @name_or_path.setter + def name_or_path(self, value): + self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) + + @property + def use_return_dict(self) -> bool: + """ + `bool`: Whether or not return [`~paddlenlp.transformers.model_outputs.ModelOutput`] instead of tuples. + """ + return self.return_dict + + @property + def num_labels(self) -> int: + """ + `int`: The number of labels for classification models. + """ + return len(self.id2label) + + @num_labels.setter + def num_labels(self, num_labels: int): + if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels: + self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} + self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the + [`~PretrainedConfig.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. 
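For reference, a hedged sketch of the save/re-load contract described in the `save_pretrained` docstring above; `BertConfig` stands in as a concrete subclass and the temporary directory is illustrative.

```python
# Sketch only: round-trips a config through save_pretrained()/from_pretrained().
import tempfile

from paddlenlp.transformers import BertConfig  # illustrative concrete subclass

config = BertConfig(num_hidden_layers=6, num_labels=4)
with tempfile.TemporaryDirectory() as save_dir:
    config.save_pretrained(save_dir)                 # writes <save_dir>/config.json
    reloaded = BertConfig.from_pretrained(save_dir)

assert reloaded.num_hidden_layers == 6
assert reloaded.num_labels == 4
```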
+ if self._auto_class is not None: + custom_object_save(self, save_directory, config=self) + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file, use_diff=True) + logger.info(f"Configuration saved in {output_config_file}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + r""" + Instantiate a [`PretrainedConfig`] (or a derived class) from a pretrained model configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + paddlenlp bos server. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a configuration file saved using the + [`~PretrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved configuration JSON *file*, e.g., `./my_model_directory/configuration.json`. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the `return_unused_kwargs` keyword parameter. + + + + Passing `use_auth_token=True` is required when you want to use a private model. + + + + Returns: + [`PretrainedConfig`]: The configuration object instantiated from this pretrained model. + + Examples: + + ```python + # We can't instantiate directly the base class *PretrainedConfig* so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained( + "bert-base-uncased" + ) # Download configuration from huggingface.co and cache. + config = BertConfig.from_pretrained( + "./test/saved_model/" + ) # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')* + config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json") + config = BertConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False) + assert config.output_attentions == True + config, unused_kwargs = BertConfig.from_pretrained( + "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True + ) + assert config.output_attentions == True + assert unused_kwargs == {"foo": False} + ```""" + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(config_dict, **kwargs) + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + [`PretrainedConfig`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
+ + """ + original_kwargs = copy.deepcopy(kwargs) + cache_dir = kwargs.pop("cache_dir", None) + subfolder = kwargs.get("subfolder", "") + if subfolder is None: + subfolder = "" + + kwargs["cache_dir"] = cache_dir + kwargs["subfolder"] = subfolder + + # Get config dict associated with the base config file + config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) + + # That config file may point us toward another config file to use. + if "configuration_files" in config_dict: + original_kwargs["cache_dir"] = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) + configuration_file = get_configuration_file(config_dict["configuration_files"]) + config_dict, kwargs = cls._get_config_dict( + pretrained_model_name_or_path, _configuration_file=configuration_file, **original_kwargs + ) + + return config_dict, kwargs + + @classmethod + def _get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + force_download = kwargs.pop("force_download", False) + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + + resolved_config_file = None + + # 0. init from pretrained_init_configuration + if pretrained_model_name_or_path in cls.pretrained_init_configuration: + # which can be: dict or url + pretrained_model_name_or_path_ = cls.pretrained_init_configuration[pretrained_model_name_or_path] + + if isinstance(pretrained_model_name_or_path_, dict): + return pretrained_model_name_or_path_, kwargs + + configuration_file = kwargs.pop("_configuration_file", CONFIG_NAME) + filenames = ( + [configuration_file, LEGACY_CONFIG_NAME] + if configuration_file == CONFIG_NAME + else [configuration_file, CONFIG_NAME, LEGACY_CONFIG_NAME] + ) + resolved_config_file = resolve_file_path( + pretrained_model_name_or_path, + filenames, + subfolder, + cache_dir=cache_dir, + force_download=force_download, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + assert ( + resolved_config_file is not None + ), f"please make sure one of the {filenames} under {pretrained_model_name_or_path}" + try: + logger.info(f"Loading configuration file {resolved_config_file}") + # Load config dict + config_dict = cls._dict_from_json_file(resolved_config_file) + except (json.JSONDecodeError, UnicodeDecodeError): + raise EnvironmentError(f"Config file<'{resolved_config_file}'> is not a valid JSON file.") + + return config_dict, kwargs + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig": + """ + Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters. + + Args: + config_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + [`PretrainedConfig`]: The configuration object instantiated from those parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + # do standard config map: there are some old-school pretrained-config not refactored. 
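As an aside, a hedged sketch of what the legacy-config handling in `from_dict` (together with `convert_to_legacy_config` and `flatten_model_config` defined earlier) does to an old-style dict; the dict literal is illustrative, not taken from a real checkpoint.

```python
# Sketch only: old-style keys are normalized before the config is instantiated.
from paddlenlp.transformers import PretrainedConfig

legacy = {"init_class": "BertModel", "num_classes": 3, "hidden_size": 256}
config, unused = PretrainedConfig.from_dict(legacy, foo=1, return_unused_kwargs=True)

print(config.architectures)  # ['BertModel']  `init_class` is flattened into `architectures`
print(config.num_labels)     # 3              `num_classes` is remapped via attribute_map
print(config.hidden_size)    # 256
print(unused)                # {'foo': 1}     kwargs matching no attribute are handed back
```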
+ config_dict = convert_to_legacy_config(cls.attribute_map, config_dict) + + config_dict = flatten_model_config(config_dict) + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + config = cls(**config_dict) + + if hasattr(config, "pruned_heads"): + config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + if "num_labels" in kwargs and "id2label" in kwargs: + num_labels = kwargs["num_labels"] + id2label = kwargs["id2label"] if kwargs["id2label"] is not None else [] + if len(id2label) != num_labels: + raise ValueError( + f"You passed along `num_labels={num_labels }` with an incompatible id to label map: " + f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove " + "one of them." + ) + to_remove = [] + for key, value in kwargs.items(): + if key == "quantization_config" and isinstance(value, Dict): + for q_key in value: + setattr(config.quantization_config, q_key, value[q_key]) + to_remove.append(key) + continue + if hasattr(config, key): + setattr(config, key, value) + if key != "dtype": + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig": + """ + Instantiates a [`PretrainedConfig`] from the path to a JSON file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + [`PretrainedConfig`]: The configuration object instantiated from that JSON file. + + """ + config_dict = cls._dict_from_json_file(json_file) + return cls(**config_dict) + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_diff_dict(self, saving_file=False) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. 
+ + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict(saving_file=saving_file) + + # get the default config dict + default_config_dict = PretrainedConfig().to_dict(saving_file=saving_file) + + # get class specific config dict + class_config_dict = self.__class__().to_dict(saving_file=saving_file) if not self.is_composition else {} + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if key == "quantization_config": + quantization_diff_dict = self.quantization_config.to_diff_dict() + if len(quantization_diff_dict) > 0: + serializable_config_dict[key] = quantization_diff_dict + continue + if ( + key not in default_config_dict + or key == "paddlenlp_version" + or value != default_config_dict[key] + or (key in class_config_dict and value != class_config_dict[key]) + ): + serializable_config_dict[key] = value + + return serializable_config_dict + + def register_unsavable_keys(self, keys): + # Save: not save it in any case + # Print: show it if non defalut value + if type(keys) == list or type(keys) == tuple: + for key in keys: + self._unsavable_keys.add(key) + else: + self._unsavable_keys.add(keys) + + def to_dict(self, saving_file=False) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + if "_auto_class" in output: + del output["_auto_class"] + if "moe_group" in output: + del output["moe_group"] + + # PaddleNLP version when serializing the model + output["paddlenlp_version"] = __version__ + + for key, value in output.items(): + # Deal with nested configs like CLIP + if isinstance(value, PretrainedConfig): + value = value.to_dict() + del value["paddlenlp_version"] + + output[key] = value + + # Fix for rewrited from_pretrained method, hasattr + if saving_file and hasattr(self, "_unsavable_keys"): + for key in list(output.keys()): + if key in self._unsavable_keys: + output.pop(key) + + if hasattr(self, "quantization_config"): + output["quantization_config"] = ( + self.quantization_config.to_dict() + if not isinstance(self.quantization_config, dict) + else self.quantization_config + ) + + # pop the `_pre_quantization_dtype` as torch.dtypes are not serializable. + _ = output.pop("_pre_quantization_dtype", None) + + return output + + def to_json_string(self, use_diff: bool = True, saving_file=False) -> str: + """ + Serializes this instance to a JSON string. + + Args: + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `PretrainedConfig()` + is serialized to JSON string. + + Returns: + `str`: String containing all the attributes that make up this configuration instance in JSON format. + """ + if use_diff is True: + config_dict = self.to_diff_dict(saving_file=saving_file) + else: + config_dict = self.to_dict(saving_file=saving_file) + + return json.dumps(config_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True, saving_file=True): + """ + Save this instance to a JSON file. 
+ + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default `PretrainedConfig()` + is serialized to JSON file. + """ + spec = inspect.getfullargspec(self.to_json_string) + has_saving_file_arg = "saving_file" in spec.args or spec.varkw + with open(json_file_path, "w", encoding="utf-8") as writer: + if has_saving_file_arg: + s = self.to_json_string(use_diff=use_diff, saving_file=saving_file) + else: + s = self.to_json_string(use_diff=use_diff) + writer.write(s) + + def update(self, config_dict: Dict[str, Any]): + """ + Updates attributes of this class with attributes from `config_dict`. + + Args: + config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + def update_from_string(self, update_str: str): + """ + Updates attributes of this class with attributes from `update_str`. + + The expected format is ints, floats and strings as is, and for booleans use `true` or `false`. For example: + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + + The keys to change have to already exist in the config object. + + Args: + update_str (`str`): String with attributes that should be updated for this class. + + """ + + d = dict(x.split("=") for x in update_str.split(",")) + for k, v in d.items(): + if not hasattr(self, k): + raise ValueError(f"key {k} isn't in the original config dict") + + old_v = getattr(self, k) + if isinstance(old_v, bool): + if v.lower() in ["true", "1", "y", "yes"]: + v = True + elif v.lower() in ["false", "0", "n", "no"]: + v = False + else: + raise ValueError(f"can't derive true or false from {v} (key {k})") + elif isinstance(old_v, int): + v = int(v) + elif isinstance(old_v, float): + v = float(v) + elif not isinstance(old_v, str): + raise ValueError( + f"You can only update int, float, bool or string values in the config, got {v} for key {k}" + ) + + setattr(self, k, v) + + @classmethod + def register_for_auto_class(cls, auto_class="AutoConfig"): + """ + Register this class with a given auto class. This should only be used for custom configurations as the ones in + the library are already mapped with `AutoConfig`. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"AutoConfig"`): + The auto class to register this new configuration with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + def get(self, key, default=None): + """ + Return the value for key if config class has the attribute , else default. + If default is not given, it defaults to None, so that this method never raises a AttributeError. + """ + try: + value = self.__getattribute__(key) + except AttributeError: + return default + else: + return value + + +def get_configuration_file(configuration_files: List[str]) -> str: + """ + Get the configuration file to use for this version of paddlenlp. + + # TODO: there is not supported actual application models, but useful. 
+ this method has not been tested, so be caution to use this feature. + + Args: + configuration_files (`List[str]`): The list of available configuration files. + + Returns: + `str`: The configuration file to use. + """ + configuration_files_map = {} + for file_name in configuration_files: + search = _re_configuration_file.search(file_name) + if search is not None: + v = search.groups()[0] + configuration_files_map[v] = file_name + available_versions = sorted(configuration_files_map.keys()) + + # Defaults to FULL_CONFIGURATION_FILE and then try to look at some newer versions. + configuration_file = CONFIG_NAME + + # FIXME: (wj-Mcat) remove the hard dependency of `packaging` which can compare + # the version of package, also be uesed in `transfromer`. + # **But**, we don't support version compare function now. so remove the hard dependency. + from packaging import version + + paddlenlp_version = version.parse(__version__) + for v in available_versions: + if version.parse(v) <= paddlenlp_version: + configuration_file = configuration_files_map[v] + else: + # No point going further since the versions are sorted. + break + + return configuration_file diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/context_parallel_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/context_parallel_utils.py new file mode 100644 index 000000000..7f8a69352 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/context_parallel_utils.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
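Looking back at the `update`/`update_from_string` helpers above, a hedged sketch of the comma-separated update format they document; `BertConfig` and the values are illustrative.

```python
# Sketch only: string updates are parsed against the type of the existing value.
from paddlenlp.transformers import BertConfig

config = BertConfig()
config.update({"num_hidden_layers": 6})                               # dict-based update
config.update_from_string("hidden_dropout_prob=0.2,use_cache=true")  # string-based update
print(config.num_hidden_layers)    # 6
print(config.hidden_dropout_prob)  # 0.2   parsed as float (old value was a float)
print(config.use_cache)            # True  'true'/'false' strings become booleans
```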
+ + +import paddle +from paddle.distributed.fleet import fleet + + +def split_inputs_sequence_dim_load_balance(inputs, rank=None, degree=None): + if degree is None and rank is None: + _hcg = fleet.get_hybrid_communicate_group() + degree = _hcg.get_sep_parallel_world_size() + rank = _hcg.get_sep_parallel_rank() + assert isinstance(degree, int) and isinstance( + rank, int + ), f"degree:{type(degree)} and rank:{type(rank)} must be int" + if degree <= 1: + return inputs + + def do_split_sequence_dim_load_balance(data, rank, degree): + if data is None: + return None + assert isinstance(data, paddle.Tensor), f"data should be paddle.Tensor, but is type:{type(data)}" + assert len(data.shape) == 2, f"data dims should be 2, but shaped: {data.shape}" + sliced_datas = paddle.split(data, num_or_sections=degree * 2, axis=-1) + sliced_data0, sliced_data1 = sliced_datas[rank], sliced_datas[degree * 2 - 1 - rank] + return paddle.concat([sliced_data0, sliced_data1], axis=-1) + + if isinstance(inputs, paddle.Tensor): + return do_split_sequence_dim_load_balance(inputs, rank, degree) + elif isinstance(inputs, dict): + res = {} + for k, tensor in inputs.items(): + res[k] = do_split_sequence_dim_load_balance(tensor, rank, degree) + elif isinstance(inputs, list): + res = [] + for tensor in inputs: + res.append(do_split_sequence_dim_load_balance(tensor, rank, degree)) + else: + raise ValueError(f"the inputs should be a list or a dict, but is type: {type(inputs)}") + return res diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/configuration.py new file mode 100644 index 000000000..f99d16020 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/configuration.py @@ -0,0 +1,313 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
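A hedged usage sketch of `split_inputs_sequence_dim_load_balance` defined just above; rank and degree are passed explicitly so no fleet hybrid communicate group is required, and the import path is assumed from the file location in this patch.

```python
# Sketch only: with degree=2 the sequence axis is cut into 2*degree chunks and
# rank r receives chunk r plus chunk (2*degree - 1 - r), so every rank gets a
# balanced mix of "early" and "late" tokens.
import paddle

from paddlenlp.transformers.context_parallel_utils import (  # path assumed from this patch
    split_inputs_sequence_dim_load_balance,
)

input_ids = paddle.arange(8).reshape([1, 8])  # one sequence of length 8
rank0 = split_inputs_sequence_dim_load_balance(input_ids, rank=0, degree=2)
rank1 = split_inputs_sequence_dim_load_balance(input_ids, rank=1, degree=2)
print(rank0.numpy())  # [[0 1 6 7]]
print(rank1.numpy())  # [[2 3 4 5]]
```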
+""" ConvBERT model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["CONVBERT_PRETRAINED_INIT_CONFIGURATION", "ConvBertConfig", "CONVBERT_PRETRAINED_RESOURCE_FILES_MAP"] + +CONVBERT_PRETRAINED_INIT_CONFIGURATION = { + "convbert-base": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 1, + }, + "convbert-medium-small": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "initializer_range": 0.02, + "intermediate_size": 1536, + "max_position_embeddings": 512, + "num_attention_heads": 8, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 2, + }, + "convbert-small": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 1, + }, + "convbert-base-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 1, + }, + "convbert-medium-small-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 96, + "initializer_range": 0.02, + "intermediate_size": 384, + "max_position_embeddings": 512, + "num_attention_heads": 2, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 2, + }, + "convbert-small-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 64, + "initializer_range": 0.02, + "intermediate_size": 256, + "max_position_embeddings": 512, + "num_attention_heads": 1, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 1, + }, + "convbert-base-discriminator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 1, + }, + "convbert-medium-small-discriminator": { + 
"attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "initializer_range": 0.02, + "intermediate_size": 1536, + "max_position_embeddings": 512, + "num_attention_heads": 8, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 2, + }, + "convbert-small-discriminator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "conv_kernel_size": 9, + "head_ratio": 2, + "num_groups": 1, + }, +} + +CONVBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "convbert-base": "http://bj.bcebos.com/paddlenlp/models/transformers/convbert/convbert-base/model_state.pdparams", + "convbert-medium-small": "http://bj.bcebos.com/paddlenlp/models/transformers/convbert/convbert-medium-small/model_state.pdparams", + "convbert-small": "http://bj.bcebos.com/paddlenlp/models/transformers/convbert/convbert-small/model_state.pdparams", + } +} + + +class ConvBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to instantiate a + ConvBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ConvBert + convbert-base architecture. Configuration objects. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + ====================================================== + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + pool_act (`str`, *optional*): + The non-linear activation function in the pooler. + Defaults to `"tanh"`. + embedding_size (int, optional): + Dimensionality of the embedding layer. Defaults to `768`. + conv_kernel_size (int, optional): + The size of the convolutional kernel. + Defaults to `9`. + head_ratio (int, optional): + Ratio gamma to reduce the number of attention heads. + Defaults to `2`. + num_groups (int, optional): + The number of groups for grouped linear layers for ConvBert model. + Defaults to `1`. + + Examples: + + ```python + >>> from paddlenlp.transformers import ConvBertModel, ConvBertConfig + + >>> # Initializing a ConvBERT configuration + >>> configuration = ConvBertConfig() + + >>> # Initializing a model from the ConvBERT-base style configuration model + >>> model = ConvBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ====================================================== + ```""" + model_type = "convbert" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = CONVBERT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-12, + pad_token_id: int = 0, + pool_act: str = "tanh", + embedding_size: int = 768, + conv_kernel_size: int = 9, + head_ratio: int = 2, + num_groups: int = 1, + **kwargs + ): + + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.layer_norm_eps = layer_norm_eps + self.embedding_size = embedding_size + self.conv_kernel_size = conv_kernel_size + self.head_ratio = head_ratio + self.num_groups = num_groups diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/modeling.py new file mode 100644 index 000000000..7c9dfd3a1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/modeling.py @@ -0,0 +1,1546 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor, tensor +from paddle.nn import Layer + +from .. import PretrainedModel, register_base_model +from ..activations import get_activation +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + tuple_output, +) +from .configuration import ( + CONVBERT_PRETRAINED_INIT_CONFIGURATION, + CONVBERT_PRETRAINED_RESOURCE_FILES_MAP, + ConvBertConfig, +) + +__all__ = [ + "ConvBertModel", + "ConvBertForMaskedLM", + "ConvBertPretrainedModel", + "ConvBertForTotalPretraining", + "ConvBertDiscriminator", + "ConvBertGenerator", + "ConvBertClassificationHead", + "ConvBertForSequenceClassification", + "ConvBertForTokenClassification", + "ConvBertPretrainingCriterion", + "ConvBertForQuestionAnswering", + "ConvBertForMultipleChoice", + "ConvBertForPretraining", +] +dtype_float = paddle.get_default_dtype() + + +def _convert_attention_mask(attn_mask, dtype): + if attn_mask is not None and attn_mask.dtype != dtype: + attn_mask_dtype = attn_mask.dtype + if attn_mask_dtype in [paddle.bool, paddle.int8, paddle.int16, paddle.int32, paddle.int64]: + attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 + else: + attn_mask = paddle.cast(attn_mask, dtype) + return attn_mask + + +class GroupedLinear(nn.Layer): + def __init__(self, input_size, output_size, num_groups): + super().__init__() + self.input_size = input_size + self.output_size = output_size + self.num_groups = num_groups + self.group_in_dim = self.input_size // self.num_groups + self.group_out_dim = self.output_size // self.num_groups + self.weight = paddle.create_parameter( + shape=[self.num_groups, self.group_in_dim, self.group_out_dim], dtype=dtype_float + ) + self.bias = paddle.create_parameter(shape=[output_size], dtype=dtype_float, is_bias=True) + + def forward(self, hidden_states): + batch_size = hidden_states.shape[0] + x = tensor.reshape(hidden_states, [-1, self.num_groups, self.group_in_dim]) + x = tensor.transpose(x, perm=[1, 0, 2]) + x = tensor.matmul(x, self.weight) + x = tensor.transpose(x, perm=[1, 0, 2]) + x = tensor.reshape(x, [batch_size, -1, self.output_size]) + x = x + self.bias + return x + + +class SeparableConv1D(nn.Layer): + """This class implements separable convolution, i.e. 
a depthwise and a pointwise layer""" + + def __init__(self, input_filters, output_filters, kernel_size): + super().__init__() + self.depthwise = nn.Conv1D( + input_filters, + input_filters, + kernel_size=kernel_size, + groups=input_filters, + padding=kernel_size // 2, + bias_attr=False, + data_format="NLC", + ) + self.pointwise = nn.Conv1D( + input_filters, + output_filters, + kernel_size=1, + bias_attr=False, + data_format="NLC", + ) + self.bias = paddle.create_parameter(shape=[output_filters], dtype=dtype_float, is_bias=True) + + def forward(self, hidden_states): + x = self.depthwise(hidden_states) + x = self.pointwise(x) + self.bias + return x + + +class MultiHeadAttentionWithConv(Layer): + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + conv_kernel_size=None, + head_ratio=None, + ): + super(MultiHeadAttentionWithConv, self).__init__() + + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.need_weights = need_weights + self.head_dim = embed_dim // num_heads + self.scale = self.head_dim**-0.5 + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + new_num_attention_heads = num_heads // head_ratio + if num_heads // head_ratio < 1: + self.num_heads = 1 + self.conv_type = "noconv" + else: + self.num_heads = new_num_attention_heads + self.conv_type = "sdconv" + + self.all_head_size = self.num_heads * self.head_dim + + self.dropout = nn.Dropout(dropout) + self.q_proj = nn.Linear(embed_dim, self.all_head_size) + self.k_proj = nn.Linear(self.kdim, self.all_head_size) + self.v_proj = nn.Linear(self.vdim, self.all_head_size) + self.out_proj = nn.Linear(embed_dim, embed_dim) + + if self.conv_type == "sdconv": + self.conv_kernel_size = conv_kernel_size + self.key_conv_attn_layer = SeparableConv1D(embed_dim, self.all_head_size, self.conv_kernel_size) + self.conv_kernel_layer = nn.Linear(self.all_head_size, self.num_heads * self.conv_kernel_size) + self.conv_out_layer = nn.Linear(embed_dim, self.all_head_size) + self.padding = (self.conv_kernel_size - 1) // 2 + + def forward(self, query, key=None, value=None, attn_mask=None, cache=None): + key = query if key is None else key + value = query if value is None else value + + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + + if self.conv_type == "sdconv": + bs = q.shape[0] + seqlen = q.shape[1] + mixed_key_conv_attn_layer = self.key_conv_attn_layer(query) + conv_attn_layer = mixed_key_conv_attn_layer * q + + # conv_kernel_layer + conv_kernel_layer = self.conv_kernel_layer(conv_attn_layer) + conv_kernel_layer = tensor.reshape(conv_kernel_layer, shape=[-1, self.conv_kernel_size, 1]) + conv_kernel_layer = F.softmax(conv_kernel_layer, axis=1) + conv_out_layer = self.conv_out_layer(query) + conv_out_layer = F.pad(conv_out_layer, pad=[self.padding, self.padding], data_format="NLC") + conv_out_layer = paddle.stack( + [ + paddle.slice(conv_out_layer, axes=[1], starts=[i], ends=[i + seqlen]) + for i in range(self.conv_kernel_size) + ], + axis=-1, + ) + conv_out_layer = tensor.reshape(conv_out_layer, shape=[-1, self.head_dim, self.conv_kernel_size]) + conv_out_layer = tensor.matmul(conv_out_layer, conv_kernel_layer) + conv_out = tensor.reshape(conv_out_layer, shape=[bs, seqlen, self.num_heads, self.head_dim]) + + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + k = 
tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + + product = tensor.matmul(x=q, y=k, transpose_y=True) * self.scale + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + + weights = F.softmax(product) + weights = self.dropout(weights) + out = tensor.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + if self.conv_type == "sdconv": + out = tensor.concat([out, conv_out], axis=2) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerEncoderLayerWithConv(nn.TransformerEncoderLayer): + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + conv_kernel_size=None, + head_ratio=None, + num_groups=None, + **kwargs + ): + super().__init__( + d_model, + nhead, + dim_feedforward, + dropout=dropout, + activation=activation, + attn_dropout=attn_dropout, + act_dropout=act_dropout, + normalize_before=normalize_before, + ) + self.self_attn = MultiHeadAttentionWithConv( + d_model, + nhead, + dropout=attn_dropout, + conv_kernel_size=conv_kernel_size, + head_ratio=head_ratio, + ) + if num_groups > 1: + self.linear1 = GroupedLinear(d_model, dim_feedforward, num_groups=num_groups) + self.linear2 = GroupedLinear(dim_feedforward, d_model, num_groups=num_groups) + self._config.update({"conv_kernel_size": conv_kernel_size, "head_ratio": head_ratio, "num_groups": num_groups}) + + +class ConvBertEmbeddings(nn.Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + self.layer_norm = nn.LayerNorm(config.embedding_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + ): + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids) + + input_shape = inputs_embeds.shape[:-1] + + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + position_ids.stop_gradient = True + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ConvBertDiscriminatorPredictions(nn.Layer): + """ + Prediction layer for the 
discriminator. + """ + + def __init__(self, hidden_size, hidden_act): + super(ConvBertDiscriminatorPredictions, self).__init__() + + self.dense = nn.Linear(hidden_size, hidden_size) + self.dense_prediction = nn.Linear(hidden_size, 1) + self.act = get_activation(hidden_act) + + def forward(self, discriminator_hidden_states): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = self.act(hidden_states) + logits = self.dense_prediction(hidden_states).squeeze() + + return logits + + +class ConvBertGeneratorPredictions(nn.Layer): + """ + Prediction layer for the generator. + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertGeneratorPredictions, self).__init__() + + self.layer_norm = nn.LayerNorm(config.embedding_size, epsilon=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.act = get_activation(config.hidden_act) + + def forward(self, generator_hidden_states): + hidden_states = self.dense(generator_hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + return hidden_states + + +class ConvBertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ConvBert models. It provides ConvBert related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + base_model_prefix = "convbert" + + # pretrained general configuration + gen_weight = 1.0 + disc_weight = 50.0 + tie_word_embeddings = True + untied_generator_embeddings = False + use_softmax_sample = True + + # model init configuration + pretrained_init_configuration = CONVBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = CONVBERT_PRETRAINED_RESOURCE_FILES_MAP + config_class = ConvBertConfig + + def tie_weights(self): + """ + Tie the weights between the input embeddings and the output embeddings. 
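+
+        A minimal sketch of the effect (illustrative only; it assumes the default
+        tied-embedding setup used by `ConvBertForTotalPretraining`, whose `__init__`
+        calls this method):
+
+        .. code-block::
+
+            model = ConvBertForTotalPretraining(ConvBertConfig())
+            # after tying, the input and output embedding tables are expected to
+            # share a single parameter, so updates to one affect the other
+            tied = model.get_output_embeddings().weight is model.get_input_embeddings().weight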
+ """ + if hasattr(self, "get_output_embeddings") and hasattr(self, "get_input_embeddings"): + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None: + self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) + + def _init_weights(self, layer): + """Initialize the weights""" + if isinstance(layer, (nn.Linear, nn.Embedding, GroupedLinear)): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.full_like(layer.weight, 1.0)) + layer._epsilon = self.config.layer_norm_eps + elif isinstance(layer, SeparableConv1D): + layer.depthwise.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.depthwise.weight.shape, + ) + ) + layer.pointwise.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.pointwise.weight.shape, + ) + ) + + if isinstance(layer, (nn.Linear, GroupedLinear, SeparableConv1D)) and layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + + def _tie_or_clone_weights(self, output_embeddings, input_embeddings): + """Tie or clone layer weights""" + if output_embeddings.weight.shape == input_embeddings.weight.shape: + output_embeddings.weight = input_embeddings.weight + elif output_embeddings.weight.shape == input_embeddings.weight.t().shape: + output_embeddings.weight.set_value(input_embeddings.weight.t()) + else: + raise ValueError( + "when tie input/output embeddings, the shape of output embeddings: {}" + "should be equal to shape of input embeddings: {}" + "or should be equal to the shape of transpose input embeddings: {}".format( + output_embeddings.weight.shape, + input_embeddings.weight.shape, + input_embeddings.weight.t().shape, + ) + ) + if getattr(output_embeddings, "bias", None) is not None: + if output_embeddings.weight.shape[-1] != output_embeddings.bias.shape[0]: + raise ValueError( + "the weight lase shape: {} of output_embeddings is not equal to the bias shape: {}" + "please check output_embeddings configuration".format( + output_embeddings.weight.shape[-1], + output_embeddings.bias.shape[0], + ) + ) + + +@register_base_model +class ConvBertModel(ConvBertPretrainedModel): + """ + The bare ConvBert Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
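+
+    When ``return_dict=False``, the forward pass returns the tuple
+    ``(sequence_output, pooled_output)``. A minimal shape sketch (the batch size of 2
+    and sequence length of 16 below are arbitrary illustration values):
+
+    .. code-block::
+
+        # input_ids: [2, 16] -> sequence_output: [2, 16, hidden_size]
+        #                       pooled_output:   [2, hidden_size]
+        sequence_output, pooled_output = model(input_ids)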
+ + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = ConvBertEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + encoder_layer = TransformerEncoderLayerWithConv( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + conv_kernel_size=config.conv_kernel_size, + head_ratio=config.head_ratio, + num_groups=config.num_groups, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + # self.config = config + self.pooler = ConvBertPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + + r""" + The ConvBertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + If its data type is int, the values should be either 0 or 1. + + - **1** for tokens that **not masked**, + - **0** for tokens that **masked**. + + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + inputs_embeds (Tensor, optional): + Instead of passing input_ids you can choose to directly pass an embedded representation. 
+ output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ConvBertModel, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertModel.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids) + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + if hasattr(self, "embeddings_project"): + embedding_output = self.embeddings_project(embedding_output) + + encoder_outputs = self.encoder( + embedding_output, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # output_attentions may be False + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ConvBertDiscriminator(ConvBertPretrainedModel): + """ + ConvBert Model with a discriminator prediction head on top. 
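+
+    The discriminator produces one logit per input position scoring whether that token
+    was replaced by the generator. A minimal sketch of reading the output (``inputs``
+    is assumed to be a dict of tensors from ``ConvBertTokenizer``, and thresholding
+    the probability at 0.5 is only an illustration):
+
+    .. code-block::
+
+        logits = model(**inputs)                        # [batch_size, sequence_length]
+        replaced = paddle.nn.functional.sigmoid(logits) > 0.5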
+ + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertDiscriminator, self).__init__(config) + + self.convbert = ConvBertModel(config) + self.discriminator_predictions = ConvBertDiscriminatorPredictions(config.hidden_size, config.hidden_act) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + ): + r""" + The ConvBertDiscriminator forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + If its data type is int, the values should be either 0 or 1. + + - **1** for tokens that **not masked**, + - **0** for tokens that **masked**. + + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + Instead of passing input_ids you can choose to directly pass an embedded representation. + + + Returns: + Tensor: Returns tensor `logits`, a tensor of the discriminator prediction logits. + Shape as `[batch_size, sequence_length]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ConvBertDiscriminatorPredictions, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertDiscriminator.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + discriminator_sequence_output = self.convbert( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + logits = self.discriminator_predictions(discriminator_sequence_output) + + return logits + + +class ConvBertGenerator(ConvBertPretrainedModel): + """ + ConvBert Model with a generator prediction head on top. 
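+
+    The generator predicts a score for every vocabulary entry at every position and is
+    also exported as ``ConvBertForMaskedLM`` at the bottom of this module. A minimal
+    sketch of recovering token predictions (``inputs`` is assumed to come from
+    ``ConvBertTokenizer``):
+
+    .. code-block::
+
+        prediction_scores = model(**inputs)             # [batch_size, seq_len, vocab_size]
+        predicted_ids = paddle.argmax(prediction_scores, axis=-1)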
+ + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertGenerator, self).__init__(config) + self.config = config + self.convbert = ConvBertModel(config) + self.generator_predictions = ConvBertGeneratorPredictions(config) + + if not self.tie_word_embeddings: + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + else: + self.generator_lm_head_bias = paddle.create_parameter( + shape=[config.vocab_size], + dtype=dtype_float, + is_bias=True, + ) + + def get_input_embeddings(self): + return self.convbert.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The ConvBertGenerator forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`ConvBertModel`. + token_type_ids (Tensor, optional): + See :class:`ConvBertModel`. + position_ids (Tensor, optional): + See :class:`ConvBertModel`. + attention_mask (Tensor, optional): + See :class:`ConvBertModel`. + output_hidden_states (bool, optional): + See :class:`ConvBertModel`. + output_attentions (bool, optional): + See :class:`ConvBertModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `prediction_scores`, a tensor of the generator prediction scores. + Shape as `[batch_size, sequence_length, vocab_size]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ConvBertGenerator, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertGenerator.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + prediction_scores = model(**inputs) + """ + convbert_outputs = self.convbert( + input_ids, + token_type_ids, + position_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + prediction_scores = self.generator_predictions(convbert_outputs[0]) + if not self.tie_word_embeddings: + prediction_scores = self.generator_lm_head(prediction_scores) + else: + prediction_scores = paddle.add( + paddle.matmul(prediction_scores, self.get_input_embeddings().weight, transpose_y=True), + self.generator_lm_head_bias, + ) + loss = None + # # Masked language modeling softmax layer + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) + + if not return_dict: + output = (prediction_scores,) + convbert_outputs[1:] + return tuple_output(output, loss) + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=convbert_outputs.hidden_states, + attentions=convbert_outputs.attentions, + ) + + +class ConvBertClassificationHead(nn.Layer): + """ + ConvBert head for sentence-level classification tasks. 
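+
+    The head applies ``dropout -> dense -> activation -> dropout -> out_proj`` to the
+    pooled features and returns ``num_labels`` logits. A minimal sketch (building the
+    head directly with random features, outside the full model, is only for
+    illustration):
+
+    .. code-block::
+
+        head = ConvBertClassificationHead(ConvBertConfig(num_labels=2))
+        logits = head(paddle.randn([4, 768]))           # -> [4, 2]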
+ + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertClassificationHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + self.act = get_activation(config.hidden_act) + + def forward(self, features, **kwargs): + x = self.dropout(features) + x = self.dense(x) + x = self.act(x) # ConvBert paper used gelu here + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class ConvBertForSequenceClassification(ConvBertPretrainedModel): + """ + ConvBert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertForSequenceClassification, self).__init__(config) + self.convbert = ConvBertModel(config) + self.num_labels = config.num_labels + self.classifier = ConvBertClassificationHead(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ConvBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ConvBertModel`. + token_type_ids (Tensor, optional): + See :class:`ConvBertModel`. + position_ids (Tensor, optional): + See :class:`ConvBertModel`. + attention_mask (Tensor, optional): + See :class:`ConvBertModel`. + inputs_embeds (Tensor, optional): + Instead of passing input_ids you can choose to directly pass an embedded representation. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + See :class:`ConvBertModel`. + output_attentions (bool, optional): + See :class:`ConvBertModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ConvBertForSequenceClassification, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertForSequenceClassification.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ConvBertForTokenClassification(ConvBertPretrainedModel): + """ + ConvBert Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertForTokenClassification, self).__init__(config) + self.convbert = ConvBertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ConvBertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ConvBertModel`. + token_type_ids (Tensor, optional): + See :class:`ConvBertModel`. + position_ids (Tensor, optional): + See :class:`ConvBertModel`. + attention_mask (Tensor, optional): + See :class:`ConvBertModel`. + inputs_embeds (Tensor, optional): + See :class:`ConvBertModel`. 
+ labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + output_hidden_states (bool, optional): + See :class:`ConvBertModel`. + output_attentions (bool, optional): + See :class:`ConvBertModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ConvBertForTokenClassification, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertForTokenClassification.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.convbert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = self.dropout(outputs[0]) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ConvBertForTotalPretraining(ConvBertPretrainedModel): + """ + Combine generator with discriminator for Replaced Token Detection (RTD) pretraining. 
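+
+    The forward pass runs the generator on the masked inputs, samples replacement
+    tokens for the masked positions, and lets the discriminator predict which tokens
+    were replaced. A minimal end-to-end sketch (``masked_input_ids``, ``raw_input_ids``
+    and ``generator_labels`` are assumed to come from a masking data collator and are
+    not defined here):
+
+    .. code-block::
+
+        model = ConvBertForTotalPretraining.from_pretrained('convbert-base')
+        criterion = ConvBertPretrainingCriterion(
+            vocab_size=model.config.vocab_size,
+            gen_weight=model.gen_weight,
+            disc_weight=model.disc_weight)
+
+        gen_logits, disc_logits, disc_labels, attn_mask = model(
+            input_ids=masked_input_ids,         # ids with masked positions replaced
+            raw_input_ids=raw_input_ids,        # the original, unmasked ids
+            generator_labels=generator_labels)  # -100 at unmasked positions
+        loss = criterion(gen_logits, disc_logits, generator_labels, disc_labels, attn_mask)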
+ """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertForTotalPretraining, self).__init__(config) + self.generator = ConvBertGenerator(config) + self.discriminator = ConvBertDiscriminator(config) + self.initializer_range = config.initializer_range + self.tie_weights() + + def get_input_embeddings(self): + if not self.untied_generator_embeddings: + return self.generator.convbert.embeddings.word_embeddings + else: + return None + + def get_output_embeddings(self): + if not self.untied_generator_embeddings: + return self.discriminator.convbert.embeddings.word_embeddings + else: + return None + + def get_discriminator_inputs(self, inputs, raw_inputs, generator_logits, generator_labels, use_softmax_sample): + """Sample from the generator to create discriminator input.""" + # get generator token result + sampled_tokens = (self.sample_from_softmax(generator_logits, use_softmax_sample)).detach() + sampled_tokids = paddle.argmax(sampled_tokens, axis=-1) + # update token only at mask position + # generator_labels : [B, L], L contains -100(unmasked) or token value(masked) + # mask_positions : [B, L], L contains 0(unmasked) or 1(masked) + umask_positions = paddle.zeros_like(generator_labels) + mask_positions = paddle.ones_like(generator_labels) + mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) + updated_inputs = self.update_inputs(inputs, sampled_tokids, mask_positions) + # use inputs and updated_input to get discriminator labels + labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype("int32")) + return updated_inputs, labels, sampled_tokids + + def sample_from_softmax(self, logits, use_softmax_sample=True): + if use_softmax_sample: + # uniform_noise = paddle.uniform(logits.shape, dtype="float32", min=0, max=1) + uniform_noise = paddle.rand(logits.shape, dtype=paddle.get_default_dtype()) + gumbel_noise = -paddle.log(-paddle.log(uniform_noise + 1e-9) + 1e-9) + else: + gumbel_noise = paddle.zeros_like(logits) + # softmax_sample equal to sampled_tokids.unsqueeze(-1) + softmax_sample = paddle.argmax(F.softmax(logits + gumbel_noise), axis=-1) + # one hot + return F.one_hot(softmax_sample, logits.shape[-1]) + + def update_inputs(self, sequence, updates, positions): + shape = sequence.shape + assert len(shape) == 2, "the dimension of inputs should be [batch_size, sequence_length]" + B, L = shape + N = positions.shape[1] + assert N == L, "the dimension of inputs and mask should be same as [batch_size, sequence_length]" + + updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + ( + positions * updates.astype(positions.dtype) + ) + + return updated_sequence + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + raw_input_ids: Optional[Tensor] = None, + generator_labels: Optional[Tensor] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ConvBertModel`. + token_type_ids (Tensor, optional): + See :class:`ConvBertModel`. + position_ids (Tensor, optional): + See :class:`ConvBertModel`. + attention_mask (Tensor, optional): + See :class:`ConvBertModel`. + raw_input_ids(Tensor, optional): + The raw input_ids. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + generator_labels(Tensor, optional): + The generator labels. Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. 
+ + Returns: + tuple: Returns tuple (``generator_logits``, ``disc_logits``, ``disc_labels``, ``attention_mask``). + + With the fields: + + - `generator_logits` (Tensor): + a tensor of the generator prediction logits. Shape as `[batch_size, sequence_length, vocab_size]` and dtype as float32. + + - `disc_logits` (Tensor): + a tensor of the discriminator prediction logits. Shape as `[batch_size, sequence_length]` and dtype as float32. + + - `disc_labels` (Tensor): + a tensor of the discriminator prediction labels. Shape as `[batch_size, sequence_length]` and dtype as int64. + + - `attention_mask` (Tensor): + See :class:`ConvBertModel`. + """ + + assert ( + generator_labels is not None + ), "generator_labels should not be None, please check DataCollatorForLanguageModeling" + + generator_logits = self.generator(input_ids, token_type_ids, position_ids, attention_mask)[0] + + disc_inputs, disc_labels, generator_predict_tokens = self.get_discriminator_inputs( + input_ids, raw_input_ids, generator_logits, generator_labels, self.use_softmax_sample + ) + + disc_logits = self.discriminator(disc_inputs, token_type_ids, position_ids, attention_mask) + + if attention_mask is None: + attention_mask = input_ids != self.discriminator.convbert.config.pad_token_id + else: + attention_mask = attention_mask.astype("bool") + + return generator_logits, disc_logits, disc_labels, attention_mask + + +class ConvBertPretrainingCriterion(nn.Layer): + """ + + Args: + vocab_size(int): + Vocabulary size of `inputs_ids` in `ConvBertModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `ConvBertModel`. + gen_weight(float): + This is the generator weight. + disc_weight(float): + This is the discriminator weight. + + """ + + def __init__(self, vocab_size, gen_weight, disc_weight): + super(ConvBertPretrainingCriterion, self).__init__() + + self.vocab_size = vocab_size + self.gen_weight = gen_weight + self.disc_weight = disc_weight + self.gen_loss_fct = nn.CrossEntropyLoss(reduction="none") + self.disc_loss_fct = nn.BCEWithLogitsLoss(reduction="none") + + def forward( + self, + generator_prediction_scores, + discriminator_prediction_scores, + generator_labels, + discriminator_labels, + attention_mask, + ): + # generator loss + gen_loss = self.gen_loss_fct( + paddle.reshape(generator_prediction_scores, [-1, self.vocab_size]), + paddle.reshape(generator_labels, [-1]), + ) + # todo: we can remove 4 lines after when CrossEntropyLoss(reduction='mean') improved + umask_positions = paddle.zeros_like(generator_labels).astype(dtype_float) + mask_positions = paddle.ones_like(generator_labels).astype(dtype_float) + mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) + if mask_positions.sum() == 0: + gen_loss = paddle.to_tensor([0.0]) + else: + gen_loss = gen_loss.sum() / mask_positions.sum() + + # discriminator loss + seq_length = discriminator_labels.shape[1] + disc_loss = self.disc_loss_fct( + paddle.reshape(discriminator_prediction_scores, [-1, seq_length]), + discriminator_labels.astype(dtype_float), + ) + if attention_mask is not None: + umask_positions = paddle.ones_like(discriminator_labels).astype(dtype_float) + mask_positions = paddle.zeros_like(discriminator_labels).astype(dtype_float) + use_disc_loss = paddle.where(attention_mask, disc_loss, mask_positions) + umask_positions = paddle.where(attention_mask, umask_positions, mask_positions) + disc_loss = use_disc_loss.sum() / umask_positions.sum() + else: + total_positions = 
paddle.ones_like(discriminator_labels).astype(dtype_float) + disc_loss = disc_loss.sum() / total_positions.sum() + + return self.gen_weight * gen_loss + self.disc_weight * disc_loss + + +class ConvBertPooler(Layer): + def __init__(self, config: ConvBertConfig): + super(ConvBertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + self.pool_act = config.pool_act + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.pool_act == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ConvBertForMultipleChoice(ConvBertPretrainedModel): + """ + ConvBert Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks . + + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertForMultipleChoice, self).__init__(config) + self.num_choices = config.num_choices + self.convbert = ConvBertModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ConvBertForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ConvBertModel`. + token_type_ids (Tensor, optional): + See :class:`ConvBertModel`. + position_ids (Tensor, optional): + See :class:`ConvBertModel`. + attention_mask (Tensor, optional): + See :class:`ConvBertModel`. + inputs_embeds (Tensor, optional): + See :class:`ConvBertModel`. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + See :class:`ConvBertModel`. + output_attentions (bool, optional): + See :class:`ConvBertModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ConvBertForMultipleChoice, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertForMultipleChoice.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + if input_ids is not None: + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) + if position_ids is not None: + position_ids = position_ids.reshape((-1, position_ids.shape[-1])) + if attention_mask is not None: + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) + + if inputs_embeds is not None: + inputs_embeds = inputs_embeds.reshape(shape=(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + + outputs = self.convbert( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape((-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return tuple_output(output, loss) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ConvBertForQuestionAnswering(ConvBertPretrainedModel): + """ + ConvBert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`ConvBertConfig`): + An instance of ConvBertConfig + + """ + + def __init__(self, config: ConvBertConfig): + super(ConvBertForQuestionAnswering, self).__init__(config) + self.convbert = ConvBertModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ConvBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ConvBertModel`. + token_type_ids (Tensor, optional): + See :class:`ConvBertModel`. + position_ids(Tensor, optional): + See :class:`ConvBertModel`. + attention_mask (Tensor, optional): + See :class:`ConvBertModel`. + inputs_embeds (Tensor, optional): + See :class:`ConvBertModel`. 
+ start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + See :class:`ConvBertModel`. + output_attentions (bool, optional): + See :class:`ConvBertModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ConvBertForQuestionAnswering, ConvBertTokenizer + + tokenizer = ConvBertTokenizer.from_pretrained('convbert-base') + model = ConvBertForQuestionAnswering.from_pretrained('convbert-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.convbert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.classifier(outputs[0]) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return tuple_output(output, total_loss) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + 
hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +# ConvBertForMaskedLM is the same as ConvBertGenerator +ConvBertForMaskedLM = ConvBertGenerator +ConvBertForPretraining = ConvBertForTotalPretraining diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/tokenizer.py new file mode 100644 index 000000000..f21ce42cf --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convbert/tokenizer.py @@ -0,0 +1,44 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..electra.tokenizer import ElectraTokenizer + +__all__ = [ + "ConvBertTokenizer", +] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"convbert-base": 512, "convbert-medium-small": 512, "convbert-small": 512} + + +class ConvBertTokenizer(ElectraTokenizer): + """ + Construct a ConvBERT tokenizer. `ConvBertTokenizer` is identical to `ElectraTokenizer`. + For more information regarding those methods, please refer to this superclass. + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "convbert-base": "https://bj.bcebos.com/paddlenlp/models/transformers/convbert/convbert-base/vocab.txt", + "convbert-medium-small": "https://bj.bcebos.com/paddlenlp/models/transformers/convbert/convbert-medium-small/vocab.txt", + "convbert-small": "https://bj.bcebos.com/paddlenlp/models/transformers/convbert/convbert-small/vocab.txt", + } + } + pretrained_init_configuration = { + "convbert-base": {"do_lower_case": True}, + "convbert-medium-small": {"do_lower_case": True}, + "convbert-small": {"do_lower_case": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/conversion_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/conversion_utils.py new file mode 100644 index 000000000..f457bf28e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/conversion_utils.py @@ -0,0 +1,1544 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
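+
+# Utilities for converting and validating model checkpoints across frameworks:
+# tensor summaries and weight comparison, state-dict key checking, parameter name
+# mappings, and merging/splitting of tensor-parallel weights.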
+ +from __future__ import annotations + +import inspect +import json +import os +from copy import deepcopy +from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, +) + +import numpy as np +import paddle +from numpy import allclose, ndarray, transpose +from paddle import Tensor +from paddle.nn import Layer + +from paddlenlp.utils.distributed import distributed_allgather, distributed_gather +from paddlenlp.utils.env import CONFIG_NAME, PADDLE_WEIGHTS_NAME, PYTORCH_WEIGHTS_NAME +from paddlenlp.utils.import_utils import ( + is_package_available, + is_torch_available, + is_transformers_available, +) +from paddlenlp.utils.log import logger +from paddlenlp.utils.serialization import load_torch +from paddlenlp.utils.tools import get_env_device + +if TYPE_CHECKING: + from paddlenlp.transformers import PretrainedConfig, PretrainedModel + +from ..utils import device_guard + +# the type hinting for pytorch model & layer & tensor +Module = TypeVar("Module") +PytorchTensor = TypeVar("PytorchTensor") + + +def tensor_summary(tensor: Union[str, Tensor, PytorchTensor, tuple, list, ndarray]): + """get summary of values which can be some of different values + + Args: + tensor (ndarray): the source data of tensor which can be: string, Paddle Tensor, Pytorch Tensor, tuple/list tensor, ndarray + + Returns: + str: the summary info + """ + if tensor is None: + return "None" + + if isinstance(tensor, str): + return tensor + + # Modeling Output from paddlenlp/transformers + if isinstance(tensor, dict): + tensor = list(tensor.values()) + + if isinstance(tensor, (tuple, list)): + infos = [] + for item in tensor: + infos.append(tensor_summary(item)) + return "\n".join(infos) + + # check whether contains `.numpy` method + # numpy is wrapped from C++, so it will be the `builtin` method + if hasattr(tensor, "numpy") and inspect.isbuiltin(getattr(tensor, "numpy")): + tensor = tensor.detach().cpu().numpy() + tensor = np.reshape(tensor, [-1]) + top_3_tensor = str(tensor[1:4]) + return top_3_tensor + + return str(tensor) + + +def compare_model_weights(first_state_dict: Dict[str, ndarray], second_state_dict: Dict[str, ndarray]) -> List[str]: + """compare the values of two state_dict. + This function has an assumption: the keys between `first_state_dict` and `second_state_dict` are exactly the same. + + Args: + first_state_dict (Dict[str, ndarray]): first state_dict + second_state_dict (Dict[str, ndarray]): second state_dict + + Returns: + mismatched keys (List[str]): the mismatched keys of state_dict because of some reason + """ + mismatched_keys = [] + for key in first_state_dict.keys(): + is_close = np.allclose(first_state_dict[key], second_state_dict[key], atol=1e-4) + if not is_close: + mismatched_keys.append(key) + return mismatched_keys + + +def state_dict_contains_prefix(state_dict: Dict[str, ndarray], prefix: str) -> bool: + """check whether state-dict contains `prefix`""" + prefix_count = sum([1 for key in state_dict.keys() if key.startswith(prefix)]) + return prefix_count > 0 + + +def init_name_mappings(mappings: list[StateDictNameMapping]) -> list[StateDictNameMapping]: + """init name mapping which are simple mappings""" + for index in range(len(mappings)): + sub_mapping = mappings[index] + + # if sub_mapping is `str`, so repeat it. 
eg: [ "word_embedding.weight", ["layer_norm", "LayerNorm"] ] + if isinstance(sub_mapping, str): + sub_mapping = [sub_mapping] + + if len(sub_mapping) == 1: + sub_mapping = sub_mapping * 2 + + elif sub_mapping[1] is None: + sub_mapping[1] = sub_mapping[0] + + mappings[index] = sub_mapping + + +class StateDictKeysChecker: + """State Dict Keys Checker""" + + def __init__( + self, + model_or_state_dict: Union[Layer, Dict[str, ndarray]], + loaded_state_dict: Dict[str, ndarray], + check_shape: bool = True, + base_model_prefix: Optional[str] = None, + ignore_keys: Optional[List[str]] = None, + ) -> None: + if isinstance(model_or_state_dict, Layer): + base_model_prefix = base_model_prefix or getattr(model_or_state_dict, "base_model_prefix", None) + model_or_state_dict = { + key: value.detach().cpu().numpy() for key, value in model_or_state_dict.state_dict().items() + } + + self.model_state_dict = model_or_state_dict + self.loaded_state_dict = loaded_state_dict + self.check_shape = check_shape + self.ignore_keys = ignore_keys or [] + self.base_model_prefix = base_model_prefix + + def change_base_downstream_mismatched_keys(self): + """when model is base-model, loaded state-dict is downstream-model, + it should re-change the downstream state-dict. + + eg: init `BertModel` with `BertForTokenClassification` state-dict + + # - + # remove base-prefix + """ + for key in list(self.loaded_state_dict.keys()): + if key.startswith(self.base_model_prefix): + value = self.loaded_state_dict.pop(key) + new_key = key.replace(f"{self.base_model_prefix}.", "") + self.loaded_state_dict[new_key] = value + + def change_downstream_base_mismatched_keys(self): + """when model is downstream-model, loaded state-dict is base-model, + it should re-change the downstream state-dict. + + eg: init `BertModel` with `BertForTokenClassification` state-dict + + # -: - + """ + for key in list(self.model_state_dict.keys()): + if key.startswith(self.base_model_prefix): + + key_in_loaded = key.replace(f"{self.base_model_prefix}.", "") + assert key_in_loaded in self.loaded_state_dict + # check loaded keys + value = self.loaded_state_dict.pop(key_in_loaded) + self.loaded_state_dict[key] = value + + def change_diff_keys(self) -> List[str]: + """change the loaded-state-dict by base-model & base_model_prefix + + Returns: + List[str]: the diff keys between models and loaded-state-dict + """ + # 1. is absolute same + all_diff_keys, not_in_model_keys, not_in_loaded_keys = self.get_diff_keys(return_all_diff=True) + if len(all_diff_keys) == 0: + return [] + + if self.base_model_prefix is None: + return all_diff_keys + + # 2. 
-: - + if not state_dict_contains_prefix(self.model_state_dict, self.base_model_prefix): + + # the base-static must be same + if not state_dict_contains_prefix(self.loaded_state_dict, self.base_model_prefix): + error_msg = ["also the base model, but contains the diff keys: \n"] + if not_in_model_keys: + error_msg.append(f"in loaded state-dict, not in model keys: <{not_in_model_keys}>\n") + if not_in_loaded_keys: + error_msg.append(f"in model keys, not in loaded state-dict keys: <{not_in_model_keys}>\n") + logger.error(error_msg) + return [] + self.change_base_downstream_mismatched_keys() + elif not state_dict_contains_prefix(self.loaded_state_dict, self.base_model_prefix): + # -: - + self.change_downstream_base_mismatched_keys() + + def get_unexpected_keys(self): + """get unexpected keys which are not in model""" + self.change_diff_keys() + _, unexpected_keys, _ = self.get_diff_keys(True) + return unexpected_keys + + def get_mismatched_keys(self): + """get mismatched keys which not found in loaded state-dict""" + self.change_diff_keys() + _, _, mismatched_keys = self.get_diff_keys(True) + return mismatched_keys + + def get_diff_keys(self, return_all_diff: bool = False) -> List[str]: + """get diff keys + + Args: + return_all_diff (bool, optional): return. Defaults to False. + + Returns: + List[str]: the diff keys betweens model and loaded state-dict + """ + mismatched_keys = set(self.model_state_dict.keys()) - set(self.loaded_state_dict.keys()) + unexpected_keys = set(self.loaded_state_dict.keys()) - set(self.model_state_dict.keys()) + + all_diff_keys = mismatched_keys | unexpected_keys + if return_all_diff: + return all_diff_keys, unexpected_keys, mismatched_keys + return all_diff_keys + + +def naive_fuse_merge_tp(weight_list, is_column=True, fuse_tensor_parts=2): + """ + + [A1 B1],[A2 B2] => [A1, A2, B1, B2] + + Args: + weight_list (List[np.ndarray]): The splited tensor parallel weight list. + is_column (bool, optional): Is ColumnLinear or RowLinear. Defaults to True. + + Returns: + weight (np.ndarray): the merged weight. + """ + if is_column: + axis = -1 + else: + axis = 0 + + reorder = [] + if isinstance(weight_list[0], np.ndarray): + for item in weight_list: + reorder.extend(np.split(item, fuse_tensor_parts, axis=axis)) + else: + for item in weight_list: + reorder.extend(paddle.split(item, fuse_tensor_parts, axis=axis)) + # 0 1 2 3 -> 0 2 1 3 + index = ( + np.transpose(np.arange(len(reorder)).reshape([len(weight_list), fuse_tensor_parts]), [1, 0]) + .reshape(-1) + .tolist() + ) + + if isinstance(weight_list[0], np.ndarray): + return np.concatenate([reorder[i] for i in index], axis=axis) + else: + tensor = paddle.concat([reorder[i] for i in index], axis=axis) + + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor + + +def naive_fuse_split_tp( + weight, tensor_parallel_degree, tensor_parallel_rank=None, is_column=True, fuse_tensor_parts=2 +): + """ + + [A1, A2, B1, B2] => [A1 B1],[A2 B2] + + Args: + weight (numpy.ndarray): the tensor weight, + tensor_parallel_degree (int): tensor_parallel_degree + tensor_parallel_rank (int): tensor_parallel_rank + is_column (bool, optional): is ColumnLinear . Defaults to True. + + Returns: + tensor (numpy.ndarray): splited weight. 
+ + """ + axis = -1 if is_column else 0 + if "PySafeSlice" in str(type(weight)): + size = weight.get_shape()[axis] + block_size = size // (fuse_tensor_parts * tensor_parallel_degree) + + splited = [] + if tensor_parallel_rank is None: + begin, end, step = 0, fuse_tensor_parts * tensor_parallel_degree, 1 + else: + begin, end, step = tensor_parallel_rank, fuse_tensor_parts * tensor_parallel_degree, tensor_parallel_degree + for rank in range(begin, end, step): + start = rank * block_size + stop = (rank + 1) * block_size + if axis == 0 or len(weight.get_shape()) == 1: + tensor = weight[start:stop] + else: + tensor = weight[:, start:stop] + splited.append(tensor) + + if tensor_parallel_rank is None: + ret = [] + for tensor_parallel_rank in range(tensor_parallel_degree): + ret.append(np.concatenate(splited[tensor_parallel_rank::tensor_parallel_degree], axis=axis)) + return ret + + return np.concatenate(splited, axis=axis) + + splited = np.split(weight, fuse_tensor_parts * tensor_parallel_degree, axis=axis) + + if tensor_parallel_rank is None: + ret = [] + for tensor_parallel_rank in range(tensor_parallel_degree): + ret.append(np.concatenate(splited[tensor_parallel_rank::tensor_parallel_degree], axis=axis)) + return ret + + return np.concatenate(splited[tensor_parallel_rank::tensor_parallel_degree], axis=axis) + + +def normal_fuse_merge_tp(weight_list, is_column=True): + """ + + [A1],[A2] => [A1, A2] + + Args: + weight_list (List[np.ndarray]): The splited tensor parallel weight list. + is_column (bool, optional): Is ColumnLinear or RowLinear. Defaults to True. + + Returns: + weight (np.ndarray): the merged weight. + """ + + if is_column: + if isinstance(weight_list[0], np.ndarray): + return np.concatenate(weight_list, axis=-1) + else: + tensor = paddle.concat(weight_list, axis=-1) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor + else: + if isinstance(weight_list[0], np.ndarray): + return np.concatenate(weight_list, axis=0) + else: + tensor = paddle.concat(weight_list, axis=0) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + return tensor + + +def normal_fuse_split_tp(weight, tensor_parallel_degree, tensor_parallel_rank=None, is_column=True): + """ + + [A1, A2] => [A1],[A2] + + Args: + weight (numpy.ndarray): the tensor weight, + tensor_parallel_degree (int): tensor_parallel_degree + tensor_parallel_rank (int): tensor_parallel_rank + is_column (bool, optional): is ColumnLinear . Defaults to True. + + Returns: + tensor (numpy.ndarray): splited weight. + """ + dim = -1 if is_column else 0 + if "PySafeSlice" in str(type(weight)): + size = weight.get_shape()[dim] + block_size = size // tensor_parallel_degree + + if tensor_parallel_rank is None: + begin, end, step = 0, tensor_parallel_degree, 1 + else: + begin, end, step = tensor_parallel_rank, tensor_parallel_rank + 1, 1 + + splited = [] + for rank in range(begin, end, step): + start = rank * block_size + stop = (rank + 1) * block_size + + if dim == 0 or len(weight.get_shape()) == 1: + tensor = weight[start:stop] + elif dim == -1: + tensor = weight[:, start:stop] + else: + raise NotImplementedError("Let's make that generic when needed") + if tensor_parallel_rank is not None: + return tensor + + splited.append(tensor) + + return splited + + size = weight.shape[dim] + assert ( + size % tensor_parallel_degree == 0 + ), f"The choosen size {size} is not compatible with sharding on {tensor_parallel_degree} shards. 
for tensor shape {weight.shape}" + + if is_column: + splited_weights = np.split(weight, tensor_parallel_degree, axis=-1) + else: + splited_weights = np.split(weight, tensor_parallel_degree, axis=0) + + if tensor_parallel_rank is not None: + return splited_weights[tensor_parallel_rank] + + return splited_weights + + +""" +There're three types of MultiHeadAttention QKV Layout in Transfomers + +tensor_parallel_qkv = [q1, k1, v1, q2, k2, v2] +naive_merged_qkv = [q1, q1, k1, k2, v1, v2] +splited_qkv = [q1, q1], [k1, k2], [v1, v2] + +naive_merged_qkv -> tensor_parallel_qkv + : naive_merged_qkv_to_tensor_parallel_qkv + +splited_qkv -> tensor_parallel_qkv + : splited_qkv_to_tensor_parallel_qkv + + +""" + + +def tensor_parallel_qkv_to_naive_merged_qkv(weight, num_attention_heads): + """ + [q1, k1, v1, q2, k2, v2] => [q1, q1, k1, k2, v1, v2] + """ + qkvs = [] + partition_dim = -1 + split_heads = np.split(weight, 3 * num_attention_heads, axis=partition_dim) + qkv_weight_num = 3 + + for i in range(qkv_weight_num): + qkv = np.concatenate(split_heads[i::qkv_weight_num], axis=partition_dim) + qkvs.append(qkv) + + return np.concatenate(qkvs, axis=partition_dim) + + +def naive_merged_qkv_to_tensor_parallel_qkv(weight, num_attention_heads): + """ + [q1, q1, k1, k2, v1, v2] => [q1, k1, v1, q2, k2, v2] + """ + qkv_pairs = [] + partition_dim = -1 + split_heads = np.split(weight, 3 * num_attention_heads, axis=partition_dim) + + for i in range(num_attention_heads): + qkv_pair = np.concatenate(split_heads[i::num_attention_heads], axis=partition_dim) + qkv_pairs.append(qkv_pair) + + return np.concatenate(qkv_pairs, axis=partition_dim) + + +def splited_qkv_to_tensor_parallel_qkv(weight_list, num_attention_heads): + """ + [q1, k1, v1], [q2, k2, v2] => [q1, q1, k1, k2, v1, v2] + + Args: + weight_list (_type_): [Q,K,V] tensor list + """ + assert len( + weight_list + ), f"weight_list length is not equal 3, it should be Q K V list. but got length {len(weight_list)}" + weight = np.concatenate(weight_list, axis=-1) + return naive_merged_qkv_to_tensor_parallel_qkv(weight) + + +def fuse_param_func(): + def fn(fuse_params, is_qkv=False, num_heads=None, num_key_value_heads=None): + """fuse function for fusing weights + + (1) fuse_attention_qkv + q => [q1,q2,q3,q4] + k => [k1,k2,k3,k4] or [k1,k2] for GQA + v => [v1,v2,v3,v4] or [v1,v2] for GQA + fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4] + or for GQA [q1,q2,k1,v1,q3,q4,k2,v2] + (2) fuse_attention_ffn + directly fuse weights to 1 parts + [gate_weight], [up_weight] => [gate_weight, up_weight] + + Args: + fuse_params (_type_): to be fused weights + is_qkv (bool, optional): for attention qkv weights. Defaults to False. + num_heads (_type_, optional): query heads. Defaults to None. + num_key_value_heads (_type_, optional): key and value heads. Defaults to None. + + Returns: + _type_: fused weights + """ + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fuse_params[0], paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + if is_qkv: + # fuse_attention_qkv + assert num_heads, f"num_heads should be number of heads for Q, but got {num_heads}" + assert ( + num_key_value_heads + ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}" + assert ( + len(fuse_params) == 3 + ), f"fuse_params length is not equal 3, it should be Q K V list. 
but got length {len(fuse_params)}" + num_query_groups = num_heads // num_key_value_heads + q_list = split_fn(fuse_params[0], num_heads, axis=-1) + k_list = split_fn(fuse_params[1], num_key_value_heads, axis=-1) + v_list = split_fn(fuse_params[2], num_key_value_heads, axis=-1) + + qkv_pairs = [] + for i in range(num_key_value_heads): + qkv_pairs += q_list[i * num_query_groups : (i + 1) * num_query_groups] + qkv_pairs.append(k_list[i]) + qkv_pairs.append(v_list[i]) + return concat_fn(qkv_pairs, axis=-1) + else: + # fuse_attention_ffn + return concat_fn(fuse_params, axis=-1) + + return fn + + +def split_param_func(): + def fn(fused_param, split_nums=2, is_qkv=False, num_heads=None, num_key_value_heads=None): + """split function for splitting weights + + (1) fuse_attention_qkv + fused weight => [q1,k1,v1,q2,k2,v2,q3,k3,v3,q4,k4,v4] + or for GQA [q1,q2,k1,v1,q3,q4,k2,v2] + after split + q => [q1,q2,q3,q4] + k => [k1,k2,k3,k4] or [k1,k2] for GQA + v => [v1,v2,v3,v4] or [v1,v2] for GQA + (2) fuse_attention_ffn + directly split weight to 2 parts + [gate_weight, up_weight] => [gate_weight], [up_weight] + + Args: + fused_param (_type_): len(fused_param)=1, only one weight to be splitted + split_nums (int, optional): split_nums. Defaults to 2. + is_qkv (bool, optional): for attention qkv weights. Defaults to False. + num_heads (_type_, optional): query heads. Defaults to None. + num_key_value_heads (_type_, optional): key and value heads. Defaults to None. + + Returns: + _type_: splitted weights + """ + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fused_param, paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + if is_qkv: + # fuse_attention_qkv + assert num_heads, f"num_heads should be number of heads for Q, but got {num_heads}" + assert ( + num_key_value_heads + ), f"num_key_value_heads should be number of key_value_heads for K and V, but got {num_key_value_heads}" + num_query_groups = num_heads // num_key_value_heads + q_list, k_list, v_list = [], [], [] + split_heads = split_fn(fused_param, num_heads + 2 * num_key_value_heads, axis=-1) + for i in range(num_key_value_heads): + q_list += split_heads[i * (num_query_groups + 2) : (i + 1) * (num_query_groups + 2) - 2] + k_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 2]) + v_list.append(split_heads[(i + 1) * (num_query_groups + 2) - 1]) + return concat_fn(q_list, axis=-1), concat_fn(k_list, axis=-1), concat_fn(v_list, axis=-1) + else: + # fuse_attention_ffn + return split_fn(fused_param, split_nums, axis=-1) + + return fn + + +def split_or_fuse_func(is_fuse=True): + return fuse_param_func() if is_fuse else split_param_func() + + +def get_tensor_parallel_merge_func(tensor_parallel_degree, tensor_parallel_rank, num_attention_heads=None): + def fn( + x, + is_column=True, + transpose=False, + is_old_qkv=False, + is_naive_2fuse=False, + is_naive_3fuse=False, + ): + if x is None: + return None + + if is_naive_2fuse: + return naive_fuse_merge_tp(x, is_column=is_column, fuse_tensor_parts=2) + elif is_naive_3fuse: + return naive_fuse_merge_tp(x, is_column=is_column, fuse_tensor_parts=3) + else: + x = normal_fuse_merge_tp(x, is_column=is_column) + + if is_old_qkv: + assert is_column, "QKV tensor should be column parallel linear." 
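To make the QKV layout conventions described above concrete, here is a minimal, self-contained numpy sketch (illustrative, not part of the patch) of the regrouping that `naive_merged_qkv_to_tensor_parallel_qkv` performs, with two attention heads and single-column "weights" standing in for real head slices:

```python
import numpy as np

num_heads = 2
# naive merged layout: all Q head slices, then all K, then all V -> [q1, q2, k1, k2, v1, v2]
naive_merged = np.array([[10, 20, 11, 21, 12, 22]])

chunks = np.split(naive_merged, 3 * num_heads, axis=-1)
# head i gathers chunks i, i + num_heads, i + 2 * num_heads -> (q_i, k_i, v_i)
tensor_parallel = np.concatenate(
    [np.concatenate(chunks[i::num_heads], axis=-1) for i in range(num_heads)], axis=-1
)
print(tensor_parallel)  # [[10 11 12 20 21 22]]  i.e. [q1, k1, v1, q2, k2, v2]
```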
+ assert num_attention_heads is not None, "is_old_qkv need num_attention_heads" + x = tensor_parallel_qkv_to_naive_merged_qkv(x, num_attention_heads) + if transpose: + x = np.transpose(x, [1, 0]) + + return x + + return fn + + +def get_tensor_parallel_split_func(tensor_parallel_degree, tensor_parallel_rank, num_attention_heads=None): + def fn(x, is_column=True, transpose=False, is_old_qkv=False, is_naive_2fuse=False, is_naive_3fuse=False): + if x is None: + return None + if transpose: + x = np.transpose(x, [1, 0]) + if is_old_qkv: + assert is_column, "QKV tensor should be column parallel linear." + assert num_attention_heads is not None, "is_old_qkv need num_attention_heads" + x = naive_merged_qkv_to_tensor_parallel_qkv(x, num_attention_heads) + if is_naive_2fuse: + return naive_fuse_split_tp( + x, tensor_parallel_degree, tensor_parallel_rank, is_column=is_column, fuse_tensor_parts=2 + ) + if is_naive_3fuse: + return naive_fuse_split_tp( + x, tensor_parallel_degree, tensor_parallel_rank, is_column=is_column, fuse_tensor_parts=3 + ) + + return normal_fuse_split_tp(x, tensor_parallel_degree, tensor_parallel_rank, is_column=is_column) + + return fn + + +def split_or_merge_func(is_split, tensor_parallel_degree, tensor_parallel_rank, num_attention_heads=None): + if is_split: + return get_tensor_parallel_split_func(tensor_parallel_degree, tensor_parallel_rank, num_attention_heads) + return get_tensor_parallel_merge_func(tensor_parallel_degree, tensor_parallel_rank, num_attention_heads) + + +@dataclass +class StateDictNameMapping: + """NameMapping of StateDict between two models""" + + source_name: str + target_name: str = None + + action: Optional[str] = None # the value can be: transpose, merge_last_two_dim + index: Optional[int] = None + + slots: list[str] = None + + def __post_init__(self): + self.target_name = self.target_name or self.source_name + + def should_transpose(self) -> bool: + return self.action == "transpose" + + def should_merge_last_two_dim(self) -> bool: + """check that wether merge last two dim""" + return self.action == "merge_last_two_dim" + + def run(self, state_dict: dict[str, ndarray], name: str) -> ndarray: + """run some custom operation on ndarray, eg: transpose, merge_last_two_dim + + Args: + tensor (ndarray): the source of the tensor data + + Returns: + ndarray: the final tensor + """ + tensor = state_dict.pop(name) + if callable(self.action): + return self.action(tensor) + if self.action == "transpose": + return transpose(tensor, [1, 0]) + if self.action == "merge_last_two_dim": + shape = tensor.shape + assert len(shape) == 3 + return np.reshape(tensor, [shape[0], -1]) + if self.action == "split": + assert self.index is not None, "when action is `split`, index field is required." + # FIXME if the order of split starts from index=2, no tensor left. 
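+            # Note (added comment, not in the original code): `tensor` was popped from `state_dict`
+            # at the top of `run()` and is only restored while `self.index < 2`, so the q/k/v
+            # mappings that share one fused source tensor must be applied in index order 0, 1, 2
+            # (this is what the FIXME above refers to).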
+ if self.index < 2: + state_dict[name] = tensor + # qkv is stored in same tensor, so it should be split into 3 arr + tensors = np.split(tensor, 3, axis=-1) + return tensors[self.index] + + return tensor + + def matched(self, text: str) -> bool: + """check whether the layer_name match the current pattern + + Args: + text (str): the name of layer + + Returns: + bool: whether the + """ + if text == self.source_name: + return True + + if not self.slots: + return False + + +class TensorInfoSaver: + def __init__(self) -> None: + self.series = {} + + def add(self, state_dict_key: str, key: str, values: Union[float, ndarray, Tensor, PytorchTensor]): + """add + + Args: + state_dict_key (str): the state_dict key to compare, eg: embedding.weight + key (str): the field to compare, eg: paddle_input + values (Union[float, ndarray, Tensor]): the tensor + """ + if state_dict_key not in self.series: + self.series[state_dict_key] = {} + + if state_dict_key not in self.series[state_dict_key]: + self.series[state_dict_key]["state_dict_key"] = state_dict_key + + self.series[state_dict_key][key] = tensor_summary(values) + + def summary(self, output_path: Optional[str] = None): + """output the summary info into different terminal + + Args: + output_path (Optional[str], optional): the dir/file of sumamry file. Defaults to None. + """ + if output_path and os.path.isdir(output_path): + output_path = os.path.join(output_path, "tensor_summary.xlsx") + self.summary_to_excel(output_path) + + self.summary_to_terminal() + + def summary_to_excel(self, file: str): + if not is_package_available("pandas"): + return False + if not is_package_available("openpyxl"): + logger.warning( + "detect that pandas is installed, but openpyxl is not installed so can't save info into excel file. " + "you can run command: `pip install openpyxl` to get the great feature" + ) + return False + + import pandas as pd + + with pd.ExcelWriter(file, "a", engine="openpyxl", if_sheet_exists="new") as writer: + pd.DataFrame(list(self.series.values())).to_excel(writer, index=False) + + def summary_to_terminal(self): + """print table info into terminal with tabulate""" + from tabulate import tabulate + + headers = {key: key for key in self.series.keys()} + print(tabulate(list(self.series.values()), tablefmt="grid", headers=headers)) + + def clear(self): + """clear the series data""" + self.series.clear() + + +class LogitHooker: + """hooks for pytorch model and paddle model, used to generate the logits of elment layers""" + + def __init__(self, mappings: List[StateDictNameMapping], tensor_info_saver: Optional[TensorInfoSaver] = None): + """registe the logit hooks to compare the inputs * outputs model + + Args: + mappings (List[StateDictNameMapping]): the mappings between paddle & pytorch model + tensor_info_saver (Optional[TensorInfoSaver], optional): the saver for model logit. Defaults to None. 
+ """ + self.mappings = mappings + self.tensor_info_saver = tensor_info_saver or TensorInfoSaver() + + def _paddle_hooks(self, layer: Layer, inputs: Tuple[Tensor], outputs: Union[Tensor, Tuple[Tensor]]): + """internal paddle hooks to save the logit of paddle layer + + Args: + layer (Layer): the layer of paddle element + inputs (Tuple[Tensor]): the inputs of paddle layer + outputs (Union[Tensor, Tuple[Tensor]]): the outputs of paddle layer + """ + state_dict_name = layer.__state_dict_name__ + + self.tensor_info_saver.add(state_dict_name, "paddle-input", inputs) + + self.tensor_info_saver.add(state_dict_name, "paddle-outputs", outputs) + + def _pytorch_hooks( + self, + layer: Layer, + inputs: Tuple[PytorchTensor], + outputs: Union[Dict[str, PytorchTensor], Tuple[PytorchTensor]], + ): + """internal pytorch hooks to save the logit of pytorch module + + Args: + layer (torch.nn.Module): the module of pytorch model + inputs (Tuple[PytorchTensor]): the inputs of pytorch layer + outputs (Union[Dict[str, PytorchTensor], Tuple[PytorchTensor]]): the outputs of pytorch layer + """ + state_dict_name = layer.__state_dict_name__ + + self.tensor_info_saver.add( + state_dict_name, + "pytorch-input", + inputs, + ) + + self.tensor_info_saver.add(state_dict_name, "pytorch-outputs", outputs) + + def register_paddle_model_hooks(self, model: Layer): + """regist post forward hook to save the inputs & outputs of paddle model + + Args: + model (Layer): paddle model + """ + + # 1. register paddle model hook to save the logits of target layer + def register_hook_by_name(model: Layer, mapping: StateDictNameMapping, hook: Callable[..., None]): + """register hook by name of state_dict, eg: encoder.layers.0.linear1.bias + + Args: + model (Layer): the source model + mapping (StateDictNameMapping): the name mapping object + hook (Callable[..., None]): the hook for paddle model + """ + name = mapping.target_name + attributes = name.split(".") + last_layer: Layer = model + for attribute in attributes: + if getattr(model, attribute, None) is not None: + model = getattr(model, attribute) + if isinstance(model, Layer): + last_layer = model + if ( + hasattr(last_layer, "register_forward_post_hook") + and getattr(last_layer, "__state_dict_name__", None) is None + ): + last_layer.register_forward_post_hook(hook) + # set state_dict key into layer as the private attribute + last_layer.__state_dict_name__ = name + + for mapping in self.mappings: + register_hook_by_name(model, mapping, self._paddle_hooks) + + def register_pytorch_model_hooks(self, model: Module): + """regist hook for pytorch model to save the inputs & outputs of pytorch model + + Args: + model (_type_): pytorch model + """ + from torch import nn + + # 1. 
register paddle model hook to save the logits of target layer + def register_hook_by_name(model: Module, mapping: StateDictNameMapping, hook: Callable[..., None]): + name = mapping.source_name + attributes, index = name.split("."), 0 + last_layer: Module = model + while index < len(attributes): + attribute = attributes[index] + if getattr(model, attribute, None) is not None: + if isinstance(model, nn.ModuleList) and attribute.isdigit(): + model = model[int(attribute)] + last_layer = model + else: + model = getattr(model, attribute) + if isinstance(model, nn.Module): + last_layer = model + index += 1 + if ( + hasattr(last_layer, "register_forward_hook") + and getattr(last_layer, "__state_dict_name__", None) is None + ): + last_layer.register_forward_hook(hook) + # set state_dict key into layer as the private attribute + last_layer.__state_dict_name__ = mapping.target_name + + for mapping in self.mappings: + register_hook_by_name(model, mapping, self._pytorch_hooks) + + def summary(self): + """print the summary info to terminal/excel to analysis""" + self.tensor_info_saver.summary() + + +class LogitComparer: + """Model Weight Converter for developer to convert pytorch/tensorflow/jax pretrained model weight to paddle. + + * you can convert model weight in online/offline mode. + * you can convert weight and config file. + * you can convert weight/config file in some customization ways. + """ + + _ignore_state_dict_keys = [] + num_layer_regex = r"\.\d+\." + + num_layer_key: str = "num_hidden_layers" + + # when field-name is same as hf models, so you only need to + # change this attribute to map the configuration + config_fields_to_be_removed: List[str] = ["transformers_version"] + architectures: Dict[str, Type[PretrainedModel]] = {} + + def __init__(self, input_dir: str) -> None: + self.input_dir = input_dir + + def get_paddle_pytorch_model_classes(self) -> Tuple[object, object]: + """return the [PaddleModelClass, PytorchModelClass] to + 1. generate paddle model automatically + 2. 
compare the logits from pytorch model and paddle model automatically + + Returns: + Tuple[object, object]: [PaddleModelClass, PytorchModelClass] + """ + raise NotImplementedError + + def get_inputs(self): + """the numpy inputs for paddle & pytorch model""" + input_ids = paddle.arange(600, 700) + input_ids = paddle.unsqueeze(input_ids, axis=0).detach().cpu().numpy() + return [input_ids] + + def resolve_paddle_output_logits(self, paddle_outputs: Tuple[Tensor]): + """resolve the logit from paddle model which can be `last_hidden_state`""" + output = None + if isinstance(paddle_outputs, (tuple, list)): + output = paddle_outputs[0] + elif paddle.is_tensor(paddle_outputs): + output = paddle_outputs + + if output is None: + raise NotImplementedError("can't resolve paddle model outputs") + + return output.detach().cpu().reshape([-1]).numpy() + + def resolve_pytorch_output_logits(self, pytorch_outputs: Module): + """resolve the logit from pytorch model which can be `last_hidden_state`""" + output = pytorch_outputs[0] + if output is None: + raise NotImplementedError("can't resolve paddle model outputs") + + return output.detach().cpu().reshape([-1]).numpy() + + @staticmethod + def get_model_state_dict(model: Union[Layer, Module], copy: bool = False) -> Dict[str, ndarray]: + """get the state_dict of pytorch/paddle model + + Args: + model (Union[Layer, Module]): can be paddle/pytorch model + + Returns: + Dict[str, ndarray]: the final state_dict data + """ + from torch import nn + + assert isinstance(model, (Layer, nn.Module)) + state_dict = {key: value.detach().cpu().numpy() for key, value in model.state_dict().items()} + if copy: + state_dict = deepcopy(state_dict) + return state_dict + + def compare_model_state_dicts( + self, + paddle_model: Union[Layer, Dict[str, ndarray]], + pytorch_model: Union[Module, Dict[str, ndarray]], + name_mappings: List[StateDictNameMapping], + ): + """compare the pytorch and paddle mdoel state with name mappings + + Args: + paddle_model (Union[Layer, Dict[str, ndarray]]): paddle model instance + pytorch_model (Union[Module, Dict[str, ndarray]]): pytorch model instance + name_mappings (List[StateDictNameMapping]): the name mappings + """ + if not isinstance(paddle_model, dict): + paddle_state_dict = {key: value.detach().cpu().numpy() for key, value in paddle_model.state_dict().items()} + else: + paddle_state_dict = paddle_model + + if not isinstance(pytorch_model, dict): + pytorch_state_dict = { + key: value.detach().cpu().numpy() for key, value in pytorch_model.state_dict().items() + } + else: + pytorch_state_dict = pytorch_model + + model_state_saver = TensorInfoSaver() + for name_mapping in name_mappings: + model_state_saver.add(name_mapping.target_name, "pytorch_key", name_mapping.source_name) + + if name_mapping.target_name in paddle_state_dict: + paddle_numpy = paddle_state_dict.pop(name_mapping.target_name) + model_state_saver.add(name_mapping.target_name, "paddle", paddle_numpy) + model_state_saver.add(name_mapping.target_name, "paddle-shape", str(paddle_numpy.shape)) + + if name_mapping.source_name in pytorch_state_dict: + pytorch_numpy = pytorch_state_dict.pop(name_mapping.source_name) + model_state_saver.add(name_mapping.target_name, "pytorch", pytorch_numpy) + model_state_saver.add(name_mapping.target_name, "pytorch-shape", str(pytorch_numpy.shape)) + + model_state_saver.summary() + + def compare_logits(self) -> bool: + """compare the logit of pytorch & paddle model + + Returns: + bool: if the logits is absolutly same + """ + PaddleModel, PytorchModel = 
self.get_paddle_pytorch_model_classes() + paddle_model = PaddleModel.from_pretrained(self.input_dir) + + # 0. init the name_mapping & tensor_info_saver & logit_hooker + name_mappings = self.get_name_mapping(paddle_model.config) + tensor_info_saver = TensorInfoSaver() + + logit_hooker = LogitHooker(name_mappings, tensor_info_saver) + inputs = self.get_inputs() + + # 1. get the logits of paddle model + logit_hooker.register_paddle_model_hooks(paddle_model) + paddle_inputs = [paddle.to_tensor(input_item) for input_item in inputs] + paddle_model.eval() + + paddle_outputs = paddle_model(*paddle_inputs) + # remove paddle_model and free gpu memory + paddle_model_state_dict = self.get_model_state_dict(paddle_model) + del paddle_model + paddle_logits = self.resolve_paddle_output_logits(paddle_outputs) + + logger.info("===============the summary of paddle Model logits: ===============") + logger.info(tensor_summary(paddle_logits)) + + # 2. get the logits of pytorch model + import torch + + pytorch_model = PytorchModel.from_pretrained(self.input_dir) + logit_hooker.register_pytorch_model_hooks(pytorch_model) + + pytorch_model.eval() + pytorch_inputs = [torch.tensor(input_item) for input_item in inputs] + torch_outputs = pytorch_model(*pytorch_inputs) + # remove paddle_model and free gpu memory + pytorch_model_state_dict = self.get_model_state_dict(pytorch_model) + del pytorch_model + + pytorch_logits = self.resolve_pytorch_output_logits(torch_outputs) + + logger.info("===============the summary of pytorch Model logits: ===============") + logger.info(tensor_summary(pytorch_logits)) + + # 3. compare the logits + result = allclose(paddle_logits[1:4], pytorch_logits[1:4], atol=1e-4) + + if not result: + print("============================== compare model state dict ==============================") + + self.compare_model_state_dicts(paddle_model_state_dict, pytorch_model_state_dict, name_mappings) + + print("============================== compare model inputs & outputs ==============================") + logit_hooker.summary() + + return result + + def on_converted(self): + + PaddleModelClass, PytorchModelClass = self.get_paddle_pytorch_model_classes() + + # 1. try to compare two loaded paddle weight file + first_paddle_model = PaddleModelClass.from_pretrained(self.input_dir) + second_paddle_model = PaddleModelClass.from_pretrained(self.input_dir) + mismatched_keys = compare_model_weights( + self.get_model_state_dict(first_paddle_model), + self.get_model_state_dict(second_paddle_model), + ) + for key in mismatched_keys: + logger.error(f"the key<{key}> is not set correctly with weight") + + # 2. try to compare logits between paddle & pytorch model + if is_torch_available() and is_transformers_available(): + result = self.compare_logits() + if result is True: + logger.info("the logits between pytorch model and paddle model is absolutly same") + else: + logger.error( + "the logits between pytorch model and paddle model is not same, please check it out more carefully." 
+ ) + else: + logger.warning( + "you don't install `torch` and `transformers` package, so we can't compare the logits between paddle & pytorch model" + ) + + +class ConversionMixin: + @classmethod + def support_conversion(cls, config: PretrainedConfig) -> bool: + """check wether the model support conversion""" + try: + # try to get the name-mapping info + _ = cls._get_name_mappings(config) + except NotImplementedError: + return False + finally: + return True + + @classmethod + def convert(cls, weight_file: str, config: PretrainedConfig, cache_dir: str) -> None: + """the entry of converting config and converting model file + + Args: + input_dir (str | None): the input dir which contains `pytorch_model.bin` and `config.json` file + config (PretrainedConfig): the PretrainedConfig instance of model + """ + # FIXME(wj-Mcat): add compatibility with downstream models + name_mappings = cls._get_name_mappings(config) + if weight_file.endswith(".index.json"): + if ".safetensors." in weight_file: + files = [file for file in os.listdir(os.path.dirname(weight_file)) if file.startswith("model-")] + else: + files = [ + file for file in os.listdir(os.path.dirname(weight_file)) if file.startswith("pytorch_model-") + ] + state_dict = {} + for file in files: + sub_state_dict = load_torch(os.path.join(os.path.dirname(weight_file), file)) + state_dict.update(sub_state_dict) + else: + state_dict = load_torch(weight_file) + + # 3. convert state_dict + all_layer_names = set(state_dict.keys()) + for name_mapping in name_mappings: + if name_mapping.source_name not in state_dict: + logger.warning(f"key<{name_mapping.source_name}> not in the pytorch weight file.") + continue + + state_dict[name_mapping.target_name] = name_mapping.run(state_dict, name_mapping.source_name) + if name_mapping.source_name in all_layer_names: + all_layer_names.remove(name_mapping.source_name) + + if all_layer_names: + logger.warning(f"there are {len(all_layer_names)} tensors not initialized:") + for layer_name in all_layer_names: + logger.warning(f"--- {layer_name}") + + model_weight_file = os.path.join(cache_dir, PADDLE_WEIGHTS_NAME) + if not os.path.isfile(model_weight_file): + paddle.save(state_dict, model_weight_file) + return state_dict + + @classmethod + def _get_name_mappings(cls, config: PretrainedConfig) -> List[StateDictNameMapping]: + """get name mapping of PretrainedModel + + Args: + config (PretrainedConfig): the configuration of name-mapping + + Raises: + NotImplementedError: + + Returns: + List[StateDictNameMapping]: the name-mappings of pretrained model + """ + raise NotImplementedError + + @classmethod + def get_tensor_parallel_convert_actions( + cls, config: PretrainedConfig, loaded_state_dict_keys, is_split=True, ignore_error=False + ): + name_action_mappings = cls._get_tensor_parallel_mappings(config, is_split=is_split) + state_keys_map = cls._resolve_prefix_keys(name_action_mappings.keys(), loaded_state_dict_keys, ignore_error) + for k, v in state_keys_map.items(): + name_action_mappings[v] = name_action_mappings.pop(k) + return name_action_mappings + + @classmethod + def convert_tensor_parallel( + cls, weight_file: str, config: PretrainedConfig, state_dict=None, ignore_error=False + ) -> None: + """the entry of converting config and converting model file + + Args: + weight_file (str | None): the weight file path of `model_state.pdparams` file + config (PretrainedConfig): the PretrainedConfig instance of model + """ + + name_action_mappings = cls._get_tensor_parallel_mappings(config) + if state_dict is None: + with 
device_guard("cpu"): + state_dict = paddle.load(weight_file, return_numpy=False) + logger.info("Starting to convert orignal state_dict to tensor parallel state_dict.") + + state_keys_map = cls._resolve_prefix_keys(name_action_mappings.keys(), state_dict.keys(), ignore_error) + + for k, v in state_keys_map.items(): + name_action_mappings[v] = name_action_mappings.pop(k) + + for name, action in name_action_mappings.items(): + if name not in state_dict: + if not ignore_error: + logger.warning(f"Key <{name}> not in the model state weight file.") + continue + tensor = state_dict.pop(name) + new_tensor = action(tensor) + with device_guard("cpu"): + state_dict[name] = paddle.Tensor(new_tensor, zero_copy=True) + + return state_dict + + @classmethod + def merge_tensor_parallel(cls, state_dict, config) -> None: + """the entry of converting config and converting model file + + Args: + input_dir (str | None): the input dir which contains `pytorch_model.bin` and `config.json` file + config (PretrainedConfig): the PretrainedConfig instance of model + """ + name_action_mappings = cls._get_tensor_parallel_mappings(config, is_split=False) + state_keys_map = cls._resolve_prefix_keys(name_action_mappings.keys(), state_dict.keys()) + + for k, v in state_keys_map.items(): + name_action_mappings[v] = name_action_mappings.pop(k) + + state_dict_to_save = {} + + hcg = paddle.distributed.fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + is_dst = paddle.distributed.get_rank(mp_group) == 0 + + for key in state_dict.keys(): + tensor = state_dict[key] + if key in name_action_mappings: + if get_env_device() == "xpu": + ret = distributed_allgather(tensor, group=mp_group, offload=True) + else: + ret = distributed_gather(tensor, group=mp_group, offload=True) + action = name_action_mappings.pop(key) + tensor = action(ret) if is_dst else None + else: + tensor = tensor.cpu().numpy() if is_dst else None + + # keep state dict use paddle.tensor + if isinstance(tensor, np.ndarray): + with device_guard("cpu"): + tensor = paddle.Tensor(tensor, zero_copy=True) + + state_dict_to_save[key] = tensor + + if len(name_action_mappings) > 0: + for x in name_action_mappings.keys(): + logger.debug(f"key <{x}> need to merge tensor parallel but we can't find in model state.") + + return state_dict_to_save + + @classmethod + def _get_tensor_parallel_mappings(cls, config: PretrainedConfig, is_split=True) -> List[StateDictNameMapping]: + """get name mapping of PretrainedModel + + Args: + config (PretrainedConfig): the configuration of name-mapping + + Raises: + NotImplementedError: + + Returns: + List[StateDictNameMapping]: the name-mappings for tensor_parallel + """ + raise NotImplementedError + + @staticmethod + def _resolve_prefix_keys(state_keys_base, state_keys_real, ignore_error=False): + # state_keys_map base to real + state_keys_map = {} + + state_keys_base = set(state_keys_base) + state_keys_real = set(state_keys_real) + + for key in state_keys_base: + for x in state_keys_real: + if x.endswith(key): + state_keys_map[key] = x + break + if key not in state_keys_map: + if not ignore_error: + logger.debug(f"tensor parallel conversion: could not find name {key} in loaded state dict!") + else: + state_keys_real.remove(state_keys_map[key]) + + return state_keys_map + + @classmethod + def convert_fuse_and_split(cls, config: PretrainedConfig, state_dict, tp_actions=None): + loaded_keys = state_dict.keys() + # collect and convert fuse/split action + fused_and_split_keys = [] + convert_with_same_keys = [] + 
fuse_actions, resume_keys = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=True) + for keys, action in fuse_actions.items(): + if keys[-1] in keys[:-1]: + assert len(keys) == 2, "only 2 keys can be converted with the same name" + convert_with_same_keys.append(keys[-1]) + origin_states = [state_dict.pop(key) for key in keys[:-1]] + state_dict[keys[-1]] = action(origin_states) + fused_and_split_keys.append(keys[-1]) + logger.debug(f"Fusing parameter: {keys[:-1]} into {keys[-1]}") + + split_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=False) + for keys, action in split_actions.items(): + if keys[-1] in keys[:-1]: + assert len(keys) == 2, "only 2 keys can be converted with the same name" + convert_with_same_keys.append(keys[-1]) + origin_state = state_dict.pop(keys[-1]) + split_states = action(origin_state) + for key_idx, key in enumerate(keys[:-1]): + state_dict[key] = split_states[key_idx] + fused_and_split_keys.append(key) + logger.debug(f"Splitting parameter: {keys[-1]} into {keys[:-1]}") + + if tp_actions is not None: + for key in fused_and_split_keys: + if key in convert_with_same_keys: + continue + + for name in tp_actions.keys(): + if key.endswith(name): + with device_guard(): + state_dict[key] = paddle.Tensor(tp_actions[name](state_dict.pop(key)), zero_copy=True) + break + + # when shard file split the weight as follows, some weights need to be resumed for next shard file + # shard-001-file: q_weight, k_weight + # shard_002-file: v_weight + resume_state_dict = {k: state_dict[k] for k in resume_keys if k in state_dict} + return state_dict, resume_state_dict + + @classmethod + def get_fuse_or_split_param_convert_actions( + cls, + config: PretrainedConfig, + loaded_state_dict_keys, + is_fuse=True, + ignore_error=False, + ): + name_action_mappings = cls._get_fuse_or_split_param_mappings(config, is_fuse) + state_keys_map = cls._resolve_prefix_keys_for_fuse_and_split( + name_action_mappings.keys(), loaded_state_dict_keys, ignore_error, is_fuse + ) + for k, v in state_keys_map.items(): + name_action_mappings[v] = name_action_mappings.pop(k) + + # filter name_action_mappings with corresponding weights + # fusing: verify all of the keys in name_action_mappings are in loaded_state_dict_keys + # splitting: verify the last key in name_action_mappings is in loaded_state_dict_keys + filter_name_action = {} + resume_keys = [] + if is_fuse: + for k, v in name_action_mappings.items(): + cond = True + if not all(item in loaded_state_dict_keys for item in k[:-1]): + # resume keys for next fuse + resume_keys += k[:-1] + cond = False + if cond: + filter_name_action[k] = v + else: + for k, v in name_action_mappings.items(): + if k[-1] in loaded_state_dict_keys: + filter_name_action[k] = v + + return filter_name_action, resume_keys + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: PretrainedConfig, is_fuse=True) -> List[StateDictNameMapping]: + """get fused parameter mapping of PretrainedModel + + Args: + config (PretrainedConfig): the configuration of name-mapping + + Raises: + NotImplementedError: + + Returns: + List[StateDictNameMapping]: the name-mappings for tensor_parallel + """ + # raise NotImplementedError( + # f"`_get_fuse_or_split_param_mappings` is not implemented for {cls.__name__}`. 
To implement it, you should " + # f"overwrite this method in the class {cls.__name__} in `{cls.__module__}.py`" + # ) + return {} + + @staticmethod + def _resolve_prefix_keys_for_fuse_and_split(state_keys_base, state_keys_real, ignore_error=False, is_fuse=True): + state_keys_map = {} + + # use the tuple (x1,x2,x3,x4) as one key, and the prefix of x1,x2,x3 is used as a new key x4 or + # the last key x4 is used as new keys x1,x2,x3. And, the tuple also could be (a) (x1, x1) -> convert x1 to x1; + # (b) (x1,x2,x3) -> fuse x1 and x2 to x3; (c) (x1,x2,x3,x4) -> fuse x1, x2 and x3 to x4. + + # is_fuse: True -> fuse, False -> split + # True: (x1,x2,x3,x4) -> [x1,x2,x3] are exist in state_keys_real, x4 is not exist in state_keys_real + # False: (x1,x2,x3,x4) -> [x1,x2,x3] are not exist in state_keys_real, x4 is exist in state_keys_real + + for keys in state_keys_base: + prefix = "" + if is_fuse: + for x in state_keys_real: + for base_key in keys[:-1]: + if x.endswith(base_key): + prefix = x.replace(base_key, "") + break + if prefix != "": + break + else: + base_key = keys[-1] + for x in state_keys_real: + if x.endswith(base_key): + prefix = x.replace(base_key, "") + break + + new_keys = tuple([prefix + key for key in keys]) + state_keys_map[keys] = new_keys + + return state_keys_map + + +class Converter(ConversionMixin, LogitComparer): + """some converters are implemented in ppdiffusers, so if remove it directly, it will make ppdiffusers down. + TODO(wj-Mcat): this class will be removed after v2.6 + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + logger.warning( + "`paddlenlp.utils.converter` module will be deprecated soon, you " + "should change it to `paddlenlp.transformers.conversion_utils`" + ) + + @classmethod + def resolve_num_layer(cls, config_or_num_layers: Union[dict, int] = None) -> int: + """resolve the number of transformer layer based on the key of model config, eg: `num_hidden_layers` in BertModel + Args: + config_or_num_layers (Union[dict, int], optional): the instance of config or num_layers. Defaults to None. + Raises: + ValueError: when `config_or_num_layers` is not dict/int, it will raise the error + Returns: + int: the number of transformer layer + """ + from paddlenlp.transformers.configuration_utils import PretrainedConfig + + if isinstance(config_or_num_layers, (dict, PretrainedConfig)): + num_layer = config_or_num_layers[cls.num_layer_key] + elif isinstance(config_or_num_layers, int): + num_layer = config_or_num_layers + else: + raise ValueError(f"the type of config_or_num_layers<{config_or_num_layers}> should be one of ") + + return num_layer + + def convert(self, input_dir: str | None = None) -> None: + """the entry of converting config and converting model file + + Args: + input_dir (str | None): the input dir which contains `pytorch_model.bin` and `config.json` file + """ + input_dir = input_dir or getattr(self, "input_dir", None) + os.makedirs(input_dir, exist_ok=True) + + # 1. get pytorch weight file + weight_file = os.path.join(input_dir, PYTORCH_WEIGHTS_NAME) + if not os.path.exists(weight_file): + raise FileNotFoundError(f"pytorch weight file<{weight_file}> not found") + + config_file = os.path.join(input_dir, CONFIG_NAME) + if not os.path.exists(config_file): + raise FileNotFoundError(f"config file<{weight_file}> not found") + + # 2. construct name mapping + # TODO(wj-Mcat): when AutoConfig is ready, construct config from AutoConfig. 
+ with open(config_file, "r", encoding="utf-8") as f: + config = json.load(f) + + state_dict = load_torch(weight_file) + + # FIXME(wj-Mcat): add compatibility with downstream models + name_mappings = self.get_name_mapping(config) + + # 3. convert state_dict + all_layer_names = set(state_dict.keys()) + for name_mapping in name_mappings: + if name_mapping.source_name not in state_dict: + logger.warning(f"key<{name_mapping.source_name}> not in the pytorch weight file.") + continue + + state_dict[name_mapping.target_name] = name_mapping.run(state_dict.pop(name_mapping.source_name)) + all_layer_names.remove(name_mapping.source_name) + + if all_layer_names: + logger.warning(f"there are {len(all_layer_names)} tensors not initialized:") + for layer_name in all_layer_names: + logger.warning(f"--- {layer_name}") + + model_weight_file = os.path.join(input_dir, PADDLE_WEIGHTS_NAME) + paddle.save(state_dict, model_weight_file) + return state_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convert_slow_tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convert_slow_tokenizer.py new file mode 100644 index 000000000..eafa3572a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/convert_slow_tokenizer.py @@ -0,0 +1,324 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
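Stepping back from `conversion_utils.py`: many of the name mappings it consumes carry a `transpose` action. A minimal numpy sketch (illustrative, not part of the patch) of why that action exists and what `StateDictNameMapping.run` does with it:

```python
# Sketch, not part of the patch: torch.nn.Linear stores weights as [out_features, in_features],
# while paddle.nn.Linear expects [in_features, out_features], hence the "transpose" action.
import numpy as np

torch_linear_weight = np.random.rand(768, 1024)                   # hypothetical [out, in] tensor
paddle_linear_weight = np.transpose(torch_linear_weight, [1, 0])  # what the "transpose" action returns
assert paddle_linear_weight.shape == (1024, 768)
```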
+ +from typing import Dict, List, Optional, Tuple + +import tokenizers +from packaging import version +from tokenizers import ( + AddedToken, + Regex, + Tokenizer, + decoders, + normalizers, + pre_tokenizers, +) +from tokenizers.models import BPE, Unigram + + +# Copied from transformers, adapted for tokenizers >= 0.19.0 +def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: + if add_prefix_space: + prepend_scheme = "always" + if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy: + prepend_scheme = "first" + else: + prepend_scheme = "never" + return prepend_scheme + + +# Extract the vocab and merge file from sentencepiece file +class SentencePieceExtractor: + def __init__(self, model: str): + from sentencepiece import SentencePieceProcessor + + self.sp = SentencePieceProcessor() + self.sp.Load(model) + + def extract(self, vocab_scores: Optional[Tuple[str, float]] = None) -> Tuple[Dict[str, int], List[Tuple]]: + sp = self.sp + vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())} + if vocab_scores is not None: + vocab_scores, reverse = dict(vocab_scores), True + else: + vocab_scores, reverse = vocab, False + + # Merges + merges = [] + for merge, piece_score in vocab_scores.items(): + local = [] + for index in range(1, len(merge)): + piece_l, piece_r = merge[:index], merge[index:] + if piece_l in vocab and piece_r in vocab: + local.append((piece_l, piece_r, piece_score)) + local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]])) + merges.extend(local) + + merges = sorted(merges, key=lambda val: val[2], reverse=reverse) + merges = [(val[0], val[1]) for val in merges] + + return vocab, merges + + +def check_number_comma(piece: str) -> bool: + return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit() + + +class Converter: + def __init__(self, original_tokenizer): + self.original_tokenizer = original_tokenizer + + def converted(self) -> Tokenizer: + raise NotImplementedError() + + +class SpmConverter(Converter): + def __init__(self, *args): + + super().__init__(*args) + + from . import sentencepiece_model_pb2 as model_pb2 + + m = model_pb2.ModelProto() + if hasattr(self.original_tokenizer, "sentencepiece_model_file"): + spm_vocab_file = self.original_tokenizer.sentencepiece_model_file + else: + spm_vocab_file = self.original_tokenizer.vocab_file + with open(spm_vocab_file, "rb") as f: + m.ParseFromString(f.read()) + self.proto = m + + if self.proto.trainer_spec.byte_fallback: + if not getattr(self, "handle_byte_fallback", None): + import warnings + + warnings.warn( + "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option" + " which is not implemented in the fast tokenizers. In practice this means that the fast version of the" + " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these " + "unknown tokens into a sequence of byte tokens matching the original piece of text." 
+ ) + + def vocab(self, proto): + return [(piece.piece, piece.score) for piece in proto.pieces] + + def unk_id(self, proto): + return proto.trainer_spec.unk_id + + def tokenizer(self, proto): + model_type = proto.trainer_spec.model_type + vocab_scores = self.vocab(proto) + unk_id = self.unk_id(proto) + + if model_type == 1: + tokenizer = Tokenizer(Unigram(vocab_scores, unk_id)) + elif model_type == 2: + _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract() + bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)} + tokenizer = Tokenizer( + BPE( + bpe_vocab, + merges, + unk_token=proto.trainer_spec.unk_piece, + fuse_unk=True, + ) + ) + else: + raise Exception( + "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" + ) + + return tokenizer + + def normalizer(self, proto): + precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap + _normalizers = [ + normalizers.Strip(left=False, right=True), # stripping is important + normalizers.Replace(Regex(" {2,}"), "▁"), + ] + if not precompiled_charsmap: + return normalizers.Sequence(_normalizers) + else: + return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers) + + def pre_tokenizer(self, replacement, add_prefix_space): + prepend_scheme = "always" + if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy: + prepend_scheme = "first" + if version.parse(tokenizers.__version__) >= version.parse("0.19.0"): + prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) + return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + else: + return pre_tokenizers.Metaspace( + replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme + ) + + def post_processor(self): + return None + + def decoder(self, replacement, add_prefix_space): + if version.parse(tokenizers.__version__) >= version.parse("0.19.0"): + prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer) + return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme) + else: + return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + + def converted(self) -> Tokenizer: + tokenizer = self.tokenizer(self.proto) + + # Tokenizer assemble + normalizer = self.normalizer(self.proto) + if normalizer is not None: + tokenizer.normalizer = normalizer + + replacement = "▁" + add_prefix_space = True + pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space) + if pre_tokenizer is not None: + tokenizer.pre_tokenizer = pre_tokenizer + + tokenizer.decoder = self.decoder(replacement, add_prefix_space) + post_processor = self.post_processor() + if post_processor: + tokenizer.post_processor = post_processor + + return tokenizer + + +class TikTokenConverter(Converter): + def extract(self, tiktoken_file: str): + from .tiktoken_model_utils import bpe, bytes_to_unicode, load_tiktoken_bpe + + bpe_ranks = ( + self.original_tokenizer.mergeable_ranks + if hasattr(self.original_tokenizer, "mergeable_ranks") and self.original_tokenizer.mergeable_ranks + else load_tiktoken_bpe(tiktoken_file) + ) + byte_encoder = bytes_to_unicode() + + def token_bytes_to_string(b): + return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")]) + + merges = [] + vocab = {} + for token, rank in bpe_ranks.items(): + vocab[token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = tuple(bpe(bpe_ranks, 
token, max_rank=rank)) + if len(merged) == 2: + merges.append(tuple(map(token_bytes_to_string, merged))) + + return vocab, merges + + +class LlamaConverter(SpmConverter): + handle_byte_fallback = True + + def vocab(self, proto): + vocab = [ + ("", 0.0), + ("", 0.0), + ("", 0.0), + ] + vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] + return vocab + + def unk_id(self, proto): + return 0 + + def decoder(self, replacement, add_prefix_space): + return decoders.Sequence( + [ + decoders.Replace("▁", " "), + decoders.ByteFallback(), + decoders.Fuse(), + decoders.Strip(content=" ", left=1), + ] + ) + + def tokenizer(self, proto): + model_type = proto.trainer_spec.model_type + vocab_scores = self.vocab(proto) + if model_type == 1: + + if version.parse(tokenizers.__version__) < version.parse("0.14.0"): + tokenizer = Tokenizer(Unigram(vocab_scores, 0)) + else: + tokenizer = Tokenizer(Unigram(vocab_scores, 0, byte_fallback=True)) + + elif model_type == 2: + _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores) + bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)} + tokenizer = Tokenizer( + BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True) + ) + tokenizer.add_special_tokens( + [ + AddedToken("", normalized=False, special=True), + AddedToken("", normalized=False, special=True), + AddedToken("", normalized=False, special=True), + ] + ) + else: + raise Exception( + "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" + ) + + return tokenizer + + def normalizer(self, proto): + return normalizers.Sequence( + [ + normalizers.Prepend(prepend="▁"), + normalizers.Replace(pattern=" ", content="▁"), + ] + ) + + def pre_tokenizer(self, replacement, add_prefix_space): + return None + + +SLOW_TO_FAST_CONVERTERS = { + "LlamaTokenizer": LlamaConverter, +} + + +def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: + """ + Utilities to convert a slow tokenizer instance in a fast tokenizer instance. + + Args: + transformer_tokenizer ([`~tokenizer_utils_base.PretrainedTokenizer`]): + Instance of a slow tokenizer to convert in the backend tokenizer for + [`~tokenizer_utils_base.PretrainedTokenizerFast`]. + + Return: + A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a + [`~tokenizer_utils_base.PretrainedTokenizerFast`] + """ + + tokenizer_class_name = transformer_tokenizer.__class__.__name__ + if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS: + raise ValueError( + f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance. " + f"No converter was found. Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" + ) + + converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] + + return converter_class(transformer_tokenizer).converted() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/configuration.py new file mode 100644 index 000000000..162ccccf1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/configuration.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" CTRL configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["CTRL_PRETRAINED_INIT_CONFIGURATION", "CTRLConfig", "CTRL_PRETRAINED_RESOURCE_FILES_MAP"] + +CTRL_PRETRAINED_INIT_CONFIGURATION = { + "ctrl": { + "tie_word_embeddings": True, + "intermediate_size": 8192, + "embd_pdrop": 0.1, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-06, + "hidden_size": 1280, + "num_attention_heads": 16, + "num_hidden_layers": 48, + "max_position_embeddings": 50000, + "resid_pdrop": 0.1, + "vocab_size": 246534, + "pad_token_id": None, + }, + "sshleifer-tiny-ctrl": { + "tie_word_embeddings": True, + "intermediate_size": 2, + "embd_pdrop": 0.1, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-06, + "hidden_size": 16, + "num_attention_heads": 2, + "num_hidden_layers": 2, + "max_position_embeddings": 50000, + "resid_pdrop": 0.1, + "vocab_size": 246534, + "pad_token_id": None, + }, +} + +CTRL_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ctrl": "https://bj.bcebos.com/paddlenlp/models/transformers/ctrl/model_state.pdparams", + "sshleifer-tiny-ctrl": "https://bj.bcebos.com/paddlenlp/models/transformers/sshleifer-tiny-ctrl/model_state.pdparams", + } +} + + +class CTRLConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`CTRLModel`]. It is used to + instantiate a CTRL model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [ctrl] architecture from SalesForce. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 246534): + Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CTRLModel`] or [`TFCTRLModel`]. 
+ n_positions (`int`, *optional*, defaults to 256): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 1280): + Dimensionality of the embeddings and hidden states. + dff (`int`, *optional*, defaults to 8192): + Dimensionality of the inner dimension of the feed forward networks (FFN). + n_layer (`int`, *optional*, defaults to 48): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-6): + The epsilon to use in the layer normalization layers + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + Examples: + ```python + >>> from transformers import CTRLConfig, CTRLModel + >>> # Initializing a CTRL configuration + >>> configuration = CTRLConfig() + >>> # Initializing a model (with random weights) from the configuration + >>> model = CTRLModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + pretrained_init_configuration = CTRL_PRETRAINED_INIT_CONFIGURATION + model_type = "ctrl" + attribute_map: Dict[str, str] = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + "intermediate_size": "dff", + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size=246534, + n_positions=256, + n_embd=1280, + dff=8192, + n_layer=48, + n_head=16, + resid_pdrop=0.1, + embd_pdrop=0.1, + layer_norm_epsilon=1e-6, + initializer_range=0.02, + use_cache=True, + **kwargs, + ): + + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.dff = dff + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/modeling.py new file mode 100644 index 000000000..e5b3f640c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/modeling.py @@ -0,0 +1,748 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import CrossEntropyLoss, MSELoss + +from ...layers import Linear as TransposedLinear +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from .configuration import ( + CTRL_PRETRAINED_INIT_CONFIGURATION, + CTRL_PRETRAINED_RESOURCE_FILES_MAP, + CTRLConfig, +) + +__all__ = [ + "CTRLPreTrainedModel", + "CTRLModel", + "CTRLLMHeadModel", + "CTRLForSequenceClassification", + "SinusoidalPositionalEmbedding", + "CTRLForCausalLM", +] + + +class SinusoidalPositionalEmbedding(nn.Embedding): + """ + This module produces sinusoidal positional embeddings of any length. + """ + + def __init__(self, num_embeddings, embedding_dim): + super().__init__(num_embeddings, embedding_dim) + self.weight = self._init_weight(self.weight) + + @staticmethod + def _init_weight(out): + n_pos, dim = out.shape + out.stop_gradient = True + position_ids = paddle.arange(0, n_pos, dtype=out.dtype).unsqueeze(1) + indices = paddle.arange(0, dim // 2, dtype=out.dtype).unsqueeze(0) + + indices = 10000.0 ** (-2 * indices / dim) + embeddings = paddle.matmul(position_ids, indices) + sentinel = dim // 2 + out[:, 0:sentinel] = paddle.sin(embeddings) + out[:, sentinel:] = paddle.cos(embeddings) + + return out + + @paddle.no_grad() + def forward(self, position_ids): + return super().forward(position_ids) + + +def scaled_dot_product_attention(q, k, v, mask, attention_mask=None): + # calculate attention + matmul_qk = paddle.matmul(q, k, transpose_y=True) + + scaled_attention_logits = matmul_qk / np.sqrt(k.shape[-1]) + + if mask is not None: + nd, ns = scaled_attention_logits.shape[-2], scaled_attention_logits.shape[-1] + scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4 + + if attention_mask is not None: + # Apply the attention mask + scaled_attention_logits = scaled_attention_logits + attention_mask + + attention_weights = F.softmax(scaled_attention_logits, axis=-1) + + output = paddle.matmul(attention_weights, v) + + return output, attention_weights + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. 
+ + """ + + def __init__(self, hidden_size, num_heads): + super().__init__() + self.num_heads = num_heads + self.hidden_size = hidden_size + + self.depth = hidden_size // self.num_heads + + self.Wq = nn.Linear(hidden_size, hidden_size) + self.Wk = nn.Linear(hidden_size, hidden_size) + self.Wv = nn.Linear(hidden_size, hidden_size) + + self.dense = nn.Linear(hidden_size, hidden_size) + + def split_into_heads(self, x, batch_size): + x = x.reshape([batch_size, -1, self.num_heads, self.depth]) + return x.transpose(perm=[0, 2, 1, 3]) + + def forward(self, v, k, q, mask, layer_past=None, attention_mask=None, use_cache=False, output_attentions=False): + batch_size = q.shape[0] + + q = self.Wq(q) + k = self.Wk(k) + v = self.Wv(v) + + q = self.split_into_heads(q, batch_size) + k = self.split_into_heads(k, batch_size) + v = self.split_into_heads(v, batch_size) + if layer_past is not None: + past_key, past_value = layer_past[0], layer_past[1] + k = paddle.concat([past_key, k], axis=-2) + v = paddle.concat([past_value, v], axis=-2) + + if use_cache is True: + present = paddle.stack([k, v]) + else: + present = (None,) + + scaled_attention, attn = scaled_dot_product_attention(q, k, v, mask, attention_mask) + scaled_attention = scaled_attention.transpose([0, 2, 1, 3]) + + original_size_attention = scaled_attention.reshape(shape=[batch_size, -1, self.hidden_size]) + output = self.dense(original_size_attention) + + outputs = (output, present) + if output_attentions: + outputs = outputs + (attn,) + return outputs + + +class EncoderLayer(nn.Layer): + def __init__(self, hidden_size, num_heads, intermediate_size, rate=0.1, epsilon=1e-6): + super().__init__() + + self.multi_head_attention = MultiHeadAttention(hidden_size, num_heads) + self.ffn = nn.Sequential( + nn.Linear(hidden_size, intermediate_size), nn.ReLU(), nn.Linear(intermediate_size, hidden_size) + ) + self.layernorm1 = nn.LayerNorm(hidden_size, epsilon=epsilon) + self.layernorm2 = nn.LayerNorm(hidden_size, epsilon=epsilon) + + self.dropout1 = nn.Dropout(rate) + self.dropout2 = nn.Dropout(rate) + + def forward(self, x, mask, layer_past=None, attention_mask=None, use_cache=False, output_attentions=False): + normed = self.layernorm1(x) + attn_outputs = self.multi_head_attention( + normed, + normed, + normed, + mask, + layer_past=layer_past, + attention_mask=attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + attn_output = self.dropout1(attn_output) + out1 = x + attn_output + + out2 = self.layernorm2(out1) + ffn_output = self.ffn(out2) + ffn_output = self.dropout2(ffn_output) + out2 = out1 + ffn_output + + outputs = (out2,) + attn_outputs[1:] + return outputs + + +class CTRLPreTrainedModel(PretrainedModel): + """ + An abstract class for pretrained CTRL models. It provides CTRL related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. 
+ """ + + base_model_prefix = "ctrl" + model_config_file = CONFIG_NAME + pretrained_init_configuration = CTRL_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = CTRL_PRETRAINED_RESOURCE_FILES_MAP + config_class = CTRLConfig + + def _init_weights(self, layer): + if isinstance(layer, nn.Linear): + layer.weight.set_value( + paddle.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, SinusoidalPositionalEmbedding): + pass + elif isinstance(layer, nn.Embedding): + layer.weight.set_value( + paddle.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer._padding_idx is not None: + emb_weight = layer.weight.numpy() + emb_weight[layer._padding_idx] = np.zeros_like(emb_weight[layer._padding_idx]) + layer.weight.set_value(paddle.to_tensor(emb_weight)) + elif isinstance(layer, nn.LayerNorm): + layer.weight.set_value(paddle.ones_like(layer.weight)) + layer.bias.set_value(paddle.zeros_like(layer.bias)) + + +@register_base_model +class CTRLModel(CTRLPreTrainedModel): + """ + The bare CTRL Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`CTRLConfig`): + An instance of :class:`CTRLConfig`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`CTRLPreTrainedModel._init_weights()` for how weights are initialized in `CTRLModel`. + + """ + + def __init__(self, config: CTRLConfig): + super().__init__(config) + + self.hidden_size = config.hidden_size + self.num_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + + self.pos_encoding = SinusoidalPositionalEmbedding(config.max_position_embeddings, self.hidden_size) + + self.w = nn.Embedding(config.vocab_size, config.hidden_size) + + self.dropout = nn.Dropout(config.embd_pdrop) + self.h = nn.LayerList( + [ + EncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + config.resid_pdrop, + config.layer_norm_epsilon, + ) + for _ in range(self.num_layers) + ] + ) + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_epsilon) + + def get_input_embeddings(self): + return self.w + + def set_input_embeddings(self, new_embeddings): + self.w = new_embeddings + + def forward( + self, + input_ids=None, + cache=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + ): + r""" + The CTRLModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + cache (Tuple[Tuple[Tensor]], optional): + Contains pre-computed hidden-states (key and values in the attention blocks) + as computed by the model. Can be used to speed up sequential decoding. 
+ The `input_ids` which have their past given to this model should not be + passed as input ids as they have already been computed. + Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some + unwanted positions, usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others + have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range `[0, type_vocab_size - 1]`. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected + in the range `[0, max_position_embeddings - 1]`. + Shape as [batch_size, num_tokens] and dtype as int64. Defaults to `None`. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states + will be returned and can be used to speed up decoding. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the output of all hidden layers. + Defaults to `False`. + + Returns: + tuple: Returns tuple (`last_hidden_state`, `caches`, `hidden_states`, `attentions`) + + With the fields: + + - `last_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `caches` (tuple(tuple(Tensor), optional): + returned when `use_cache=True` is passed. + Tuple of `tuple(Tensor)` of length `num_hidden_layers`, with each tuple having 2 + tensors of shape [batch_size, num_heads, sequence_length, embed_size_per_head] and float32 dtype. + + - `hidden_states` (tuple(Tensor), optional): + returned when `output_hidden_states=True` is passed. + Tuple of `Tensor` (one for the output of the embeddings + one for the output of + each layer). Each Tensor has a data type of float32 and its shape is + [batch_size, sequence_length, hidden_size]. + + - `attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + Tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data type of + float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import CTRLModel, CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLModel.from_pretrained('ctrl') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + + """ + + seq_len = input_ids.shape[-1] + input_ids = input_ids.reshape([-1, seq_len]) + batch_size = input_ids.shape[0] + + if cache is None: + past_length = 0 + cache = tuple([None] * len(self.h)) + else: + past_length = cache[0][0].shape[-2] + + if position_ids is None: + position_ids = paddle.arange(past_length, seq_len + past_length) + position_ids = position_ids.unsqueeze(0).reshape(shape=[-1, seq_len]) + + # Attention mask. + if attention_mask is not None: + assert batch_size > 0, "batch_size has to be defined and > 0" + attention_mask = attention_mask.reshape(shape=[batch_size, -1]) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask.unsqueeze([1, 2]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + attention_mask = attention_mask.astype(dtype=paddle.get_default_dtype()) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * -10000.0 + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=[-1, seq_len]) + token_type_embeds = self.w(token_type_ids) * np.sqrt(self.hidden_size) + else: + token_type_embeds = 0.0 + + inputs_embeds = self.w(input_ids) * np.sqrt(self.hidden_size) + pos_embeds = self.pos_encoding(position_ids) + + hidden_states = inputs_embeds + pos_embeds + token_type_embeds + + hidden_states = self.dropout(hidden_states) + mask = paddle.triu(paddle.ones(shape=[seq_len + past_length, seq_len + past_length]), 1) + + presents = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, (h, layer_past) in enumerate(zip(self.h, cache)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + outputs = h( + hidden_states, + mask, + layer_past=layer_past, + attention_mask=attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present = outputs[:2] + if use_cache is True: + presents = presents + (present,) + + if output_attentions: + all_attentions += (outputs[2],) + + hidden_states = self.layernorm(hidden_states) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) + + +class CTRLLMHeadModel(CTRLPreTrainedModel): + """ + The CTRL Model transformer with a language modeling head on top (linear + layer with weights tied to the input embeddings). + + Args: + config (:class:`CTRLConfig`): + An instance of :class:`CTRLConfig`. 
+ + """ + + def __init__(self, config: CTRLConfig): + super().__init__(config) + self.ctrl = CTRLModel(config) + self.lm_head = TransposedLinear(config.hidden_size, config.vocab_size) + self.tie_weights() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + if cache is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + + return {"input_ids": input_ids, "use_cache": use_cache, "cache": cache} + + def forward( + self, + input_ids=None, + cache=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + labels=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`CTRLModel`. + cache (Tensor, optional): + See :class:`CTRLModel`. + attention_mask (Tensor, optional): + See :class:`CTRLModel`. + token_type_ids (Tensor, optional): + See :class:`CTRLModel`. + position_ids (Tensor, optional): + See :class:`CTRLModel`. + labels (Tensor, optional): + Labels for language modeling. Note that the labels **are shifted** + inside the model, i.e. you can set `labels = input_ids` Indices are + selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are + ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]`. + Shape is [batch_size, sequence_length] and dtype is int64. + use_cache (bool, optional): + See :class:`CTRLModel`. + output_attentions (bool, optional): + See :class:`CTRLModel`. + output_hidden_states (bool, optional): + See :class:`CTRLModel`. + + Returns: + tuple: Returns tuple `(loss, logits, caches, hidden_states, attentions)`. + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Language modeling loss (for next-token prediction). + It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Prediction scores of the language modeling head (scores for each vocabulary + token before SoftMax). + It's data type should be float32 and + its shape is [batch_size, sequence_length, vocab_size]. + + - `caches` (tuple(tuple(Tensor), optional): + See :class:`CTRLModel`. + + - `hidden_states` (tuple(Tensor), optional): + See :class:`CTRLModel`. + + - `attentions` (tuple(Tensor), optional): + See :class:`CTRLModel`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import CTRLLMHeadModel, CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLLMHeadModel.from_pretrained('ctrl') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=inputs["input_ids"]) + + loss = output[0] + logits = output[1] + + """ + + ctrl_outputs = self.ctrl( + input_ids, + cache=cache, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + hidden_states = ctrl_outputs[0] + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[:, :-1] + shift_labels = labels[:, 1:] + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + shift_logits.reshape([-1, shift_logits.shape[-1]]), + shift_labels.flatten(), + ) + + output = (lm_logits,) + ctrl_outputs[1:] + return ((loss,) + output) if loss is not None else output + + +class CTRLForSequenceClassification(CTRLPreTrainedModel): + """ + The CTRL Model transformer with a sequence classification head on top (linear layer). + `CTRLForSequenceClassification` uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. Since it does classification on the last token, + it requires to know the position of the last token. If a `pad_token_id` is defined in the + configuration, it finds the last token that is not a padding token in each row. If no + `pad_token_id` is defined, it simply takes the last value in each row of the batch. + + Args: + config (:class:`CTRLConfig`): + An instance of :class:`CTRLConfig`. + + """ + + def __init__(self, config: CTRLConfig): + super().__init__(config) + self.num_classes = config.num_classes + self.ctrl = CTRLModel(config) + self.classifier = nn.Linear(config.hidden_size, self.num_classes, bias_attr=False) + + def forward( + self, + input_ids=None, + cache=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + labels=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`CTRLModel`. + cache (Tensor, optional): + See :class:`CTRLModel`. + attention_mask (Tensor, optional): + See :class:`CTRLModel`. + token_type_ids (Tensor, optional): + See :class:`CTRLModel`. + position_ids (Tensor, optional): + See :class:`CTRLModel`. + labels (Tensor, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ...,num_classes - 1]`. If `num_classes == 1` + a regression loss is computed (Mean-Square loss), If `num_classes > 1` + a classification loss is computed (Cross-Entropy). + Shape is [batch_size,] and dtype is int64. + use_cache (bool, optional): + See :class:`CTRLModel`. + output_attentions (bool, optional): + See :class:`CTRLModel`. + output_hidden_states (bool, optional): + See :class:`CTRLModel`. + + Returns: + tuple: Returns tuple `(loss, logits, caches, hidden_states, attentions)`. + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Language modeling loss (for next-token prediction). + It's data type should be float32 and its shape is [1,]. 
+ + - `logits` (Tensor): + Prediction scores of the language modeling head (scores for each vocabulary + token before SoftMax). + It's data type should be float32 and its shape is [batch_size, num_classes]. + + - `caches` (tuple(tuple(Tensor), optional): + See :class:`CTRLModel`. + + - `hidden_states` (tuple(Tensor), optional): + See :class:`CTRLModel`. + + - `attentions` (tuple(Tensor), optional): + See :class:`CTRLModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import CTRLForSequenceClassification, CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLForSequenceClassification.from_pretrained('ctrl', pad_token_id=0) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=paddle.to_tensor([1])) + + loss = output[0] + logits = output[1] + + """ + ctrl_outputs = self.ctrl( + input_ids, + cache=cache, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + hidden_states = ctrl_outputs[0] + logits = self.classifier(hidden_states) + batch_size = input_ids.shape[0] + + assert ( + self.config.pad_token_id is not None or batch_size == 1 + ), "Cannot handle batch sizes > 1 if no padding token is defined." + + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + sequence_lengths = ( + paddle.not_equal( + input_ids, + paddle.full(shape=input_ids.shape, fill_value=self.config.pad_token_id, dtype=input_ids.dtype), + ) + .astype(paddle.int64) + .sum(-1) + - 1 + ) + + pooled_logits = logits.gather_nd(paddle.stack([paddle.arange(batch_size), sequence_lengths], axis=-1)) + + loss = None + if labels is not None: + if self.num_classes == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(pooled_logits.flatten(), labels.astype(pooled_logits.dtype).flatten()) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.reshape([-1, self.num_classes]), labels.flatten()) + + output = (pooled_logits,) + ctrl_outputs[1:] + return ((loss,) + output) if loss is not None else output + + +CTRLForCausalLM = CTRLLMHeadModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/tokenizer.py new file mode 100644 index 000000000..b4e7c108d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ctrl/tokenizer.py @@ -0,0 +1,357 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Salesforce and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil + +from paddle.utils import try_import +from .. 
import PretrainedTokenizer +from paddlenlp.utils.log import logger + +__all__ = ["CTRLTokenizer"] + +CONTROL_CODES = { + "Pregnancy": 168629, + "Christianity": 7675, + "Explain": 106423, + "Fitness": 63440, + "Saving": 63163, + "Ask": 27171, + "Ass": 95985, + "Joke": 163509, + "Questions": 45622, + "Thoughts": 49605, + "Retail": 52342, + "Feminism": 164338, + "Writing": 11992, + "Atheism": 192263, + "Netflix": 48616, + "Computing": 39639, + "Opinion": 43213, + "Alone": 44967, + "Funny": 58917, + "Gaming": 40358, + "Human": 4088, + "India": 1331, + "Joker": 77138, + "Diet": 36206, + "Legal": 11859, + "Norman": 4939, + "Tip": 72689, + "Weight": 52343, + "Movies": 46273, + "Running": 23425, + "Science": 2090, + "Horror": 37793, + "Confession": 60572, + "Finance": 12250, + "Politics": 16360, + "Scary": 191985, + "Support": 12654, + "Technologies": 32516, + "Teenage": 66160, + "Event": 32769, + "Learned": 67460, + "Notion": 182770, + "Wikipedia": 37583, + "Books": 6665, + "Extract": 76050, + "Confessions": 102701, + "Conspiracy": 75932, + "Links": 63674, + "Narcissus": 150425, + "Relationship": 54766, + "Relationships": 134796, + "Reviews": 41671, + "News": 4256, + "Translation": 26820, + "multilingual": 128406, +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +class CTRLTokenizer(PretrainedTokenizer): + """ + Constructs a CTRL tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocab file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "". 
+ + """ + + resource_files_names = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + } + pretrained_resource_files_map = { + "vocab_file": { + "ctrl": "http://bj.bcebos.com/paddlenlp/models/transformers/ctrl/vocab.json", + "sshleifer-tiny-ctrl": "http://bj.bcebos.com/paddlenlp/models/transformers/sshleifer-tiny-ctrl/vocab.json", + }, + "merges_file": { + "ctrl": "http://bj.bcebos.com/paddlenlp/models/transformers/ctrl/merges.txt", + "sshleifer-tiny-ctrl": "http://bj.bcebos.com/paddlenlp/models/transformers/sshleifer-tiny-ctrl/merges.txt", + }, + } + pretrained_init_configuration = {"ctrl": {}, "sshleifer-tiny-ctrl": {"max_len": 256}} + + CONTROL_CODES = CONTROL_CODES + + def __init__(self, vocab_file, merges_file, max_len=None, unk_token="", **kwargs): + self._vocab_file = vocab_file + self._merges_file = merges_file + self.max_len = max_len if max_len is not None else int(1e12) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder) + + def __len__(self): + return len(self.encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + word = tuple(list(word[:-1]) + [word[-1] + ""]) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = "@@ ".join(word) + word = word[:-4] + self.cache[token] = word + return word + + def tokenize(self, text): + """ + Converts a string to a list of tokens. + + Args: + text (str): The text to be tokenized. + + Returns: + List[str]: A list of string representing converted tokens. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + print(tokenizer.tokenize('Welcome to use PaddlePaddle and PaddleNLP')) + # ['Welcome', 'to', 'use', 'Padd@@', 'le@@', 'Padd@@', 'le', 'and', 'Padd@@', 'le@@', 'N@@', 'LP'] + + """ + return self._tokenize(text) + + def _tokenize(self, text): + """Tokenize a string.""" + split_tokens = [] + re = try_import("regex") + words = re.findall(r"\S+\n?", text) + for token in words: + split_tokens.extend([t for t in self.bpe(token).split(" ")]) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) to an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. + + Args: + tokens (List[str]): A sequence of tokens. + + Returns: + str: Converted string. + + Example: + .. code-block:: + + from paddlenlp.transformers import CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('crtl') + print(tokenizer.convert_tokens_to_string(['Welcome', 'to', 'use', 'Padd@@', 'le@@', 'Padd@@', 'le', 'and', 'Padd@@', 'le@@', 'N@@', 'LP'])) + # 'Welcome to use PaddlePaddle and PaddleNLP' + + """ + out_string = " ".join(tokens).replace("@@ ", "").strip() + return out_string + + def convert_tokens_to_ids(self, tokens): + """ + Converts a single token or a sequence of tokens to an index or a + sequence of indices using the vocab. + + Args: + tokens (str|List[str]|tuple(str)): + A single token or a sequence of tokens. + + Returns: + int|List[int]: The converted token id or token ids. + + Example: + .. code-block:: + + from paddlenlp.transformers import CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('crtl') + print(tokenizer.convert_tokens_to_ids(['Welcome', 'to', 'use', 'Padd@@', 'le@@', 'Padd@@', 'le', 'and', 'Padd@@', 'le@@', 'N@@', 'LP'])) + # [41116, 3, 191, 40324, 1162, 40324, 992, 2, 40324, 1162, 633, 11135] + + """ + ids = [] + if isinstance(tokens, str): + return self._convert_token_to_id(tokens) + for token in tokens: + ids.append(self._convert_token_to_id(token)) + if len(ids) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this CTRL model ({} > {}). Running this" + " sequence through the model will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ + Converts an index or a sequence indices to a single + token or a sequence of tokens. + + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + skip_special_tokens (bool, optional): + Whether or not to skip the special tokens. + Defaults to `False`, which means we don't skip the special tokens. + + Returns: + str|List[str]: The converted token or the sequence of tokens. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import CTRLTokenizer + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + print(tokenizer.convert_ids_to_tokens([41116, 3, 191, 40324, 1162, 40324, 992, 2, 40324, 1162, 633, 11135])) + # ['Welcome', 'to', 'use', 'Padd@@', 'le@@', 'Padd@@', 'le', 'and', 'Padd@@', 'le@@', 'N@@', 'LP'] + + """ + if isinstance(ids, int): + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + tokens.append(self._convert_id_to_token(index)) + return tokens + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to files under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/configuration.py new file mode 100644 index 000000000..9a616db12 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/configuration.py @@ -0,0 +1,254 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
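# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): a minimal, standalone illustration of
# the greedy merge loop implemented by CTRLTokenizer.bpe() above. The merge
# ranks below are made up; the real ones come from merges.txt. The end-of-word
# marker appears in this hunk as an empty string (likely a stripped
# angle-bracket token); "</w>" is assumed here purely for illustration.
def _pairs(word):
    return {(a, b) for a, b in zip(word, word[1:])}

def toy_bpe(token, ranks):
    word = tuple(token[:-1]) + (token[-1] + "</w>",)   # mark the end of the word
    pairs = _pairs(word)
    while pairs:
        best = min(pairs, key=lambda p: ranks.get(p, float("inf")))
        if best not in ranks:                          # no known merge left
            break
        first, second = best
        merged, i = [], 0
        while i < len(word):                           # apply the merge everywhere
            if i + 1 < len(word) and word[i] == first and word[i + 1] == second:
                merged.append(first + second)
                i += 2
            else:
                merged.append(word[i])
                i += 1
        word = tuple(merged)
        pairs = _pairs(word)
    # Subword boundaries become "@@ " (stripped again by convert_tokens_to_string()).
    return "@@ ".join(word).replace("</w>", "")

toy_ranks = {("l", "o"): 0, ("lo", "w</w>"): 1}        # hypothetical merge ranks
assert toy_bpe("low", toy_ranks) == "low"
assert toy_bpe("lot", toy_ranks) == "lo@@ t"
# ---------------------------------------------------------------------------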
+""" DalleBart model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["DALLEBART_PRETRAINED_INIT_CONFIGURATION", "DalleBartConfig", "DALLEBART_PRETRAINED_RESOURCE_FILES_MAP"] + +DALLEBART_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "dalle-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mini/model_state.pdparams", + "dalle-mega-v16": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v16/model_state.pdparams", + "dalle-mega-v26": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/model_state.pdparams", + "dalle-mega": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/model_state.pdparams", + } +} + +DALLEBART_PRETRAINED_INIT_CONFIGURATION = { + "dalle-mini": { + "text_vocab_size": 50264, + "image_vocab_size": 16384, + "bos_token_id": 16384, + "pad_token_id": 16384, + "eos_token_id": 16384, + "max_text_length": 64, + "max_image_length": 256, + "decoder_start_token_id": 16384, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 2730, + "decoder_ffn_dim": 2730, + "dropout": 0.0, + "activation_function": "gelu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "use_bias": False, + "init_std": 0.02, + }, + "dalle-mega-v16": { + "text_vocab_size": 50272, + "image_vocab_size": 16415, + "bos_token_id": 16384, + "pad_token_id": 16384, + "eos_token_id": 16384, + "max_text_length": 64, + "max_image_length": 256, + "decoder_start_token_id": 16384, + "d_model": 2048, + "num_encoder_layers": 24, + "num_decoder_layers": 24, + "encoder_attention_heads": 32, + "decoder_attention_heads": 32, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.0, + "activation_function": "gelu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "use_bias": False, + "init_std": 0.02, + }, + "dalle-mega-v26": { + "text_vocab_size": 50272, + "image_vocab_size": 16415, + "bos_token_id": 16384, + "pad_token_id": 16384, + "eos_token_id": 16384, + "max_text_length": 64, + "max_image_length": 256, + "decoder_start_token_id": 16384, + "d_model": 2048, + "num_encoder_layers": 24, + "num_decoder_layers": 24, + "encoder_attention_heads": 32, + "decoder_attention_heads": 32, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.0, + "activation_function": "gelu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "use_bias": False, + "init_std": 0.02, + }, + "dalle-mega": { + "text_vocab_size": 50272, + "image_vocab_size": 16415, + "bos_token_id": 16384, + "pad_token_id": 16384, + "eos_token_id": 16384, + "max_text_length": 64, + "max_image_length": 256, + "decoder_start_token_id": 16384, + "d_model": 2048, + "num_encoder_layers": 24, + "num_decoder_layers": 24, + "encoder_attention_heads": 32, + "decoder_attention_heads": 32, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.0, + "activation_function": "gelu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "use_bias": False, + "init_std": 0.02, + }, +} + + +class DalleBartConfig(PretrainedConfig): + r""" + The bare DalleBart Model outputting raw hidden-states. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. 
+ This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + Args: + text_vocab_size (int): + Vocabulary size of `inputs_ids` in `DalleBartModel`. Also is the vocab size of text token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `DalleBartModel`. + image_vocab_size (int): + Vocabulary size of `decoder_inputs_ids` in `DalleBartModel`. Also is the vocab size of image token embedding matrix. + Defines the number of different tokens that can be represented by the `decoder_inputs_ids` passed when calling `DalleBartModel`. + bos_token (int, optional): + The beginning of image sequence token that was used during pretraining. + Defaults to `16384`. + pad_token_id(int, optional): + The index of padding token in the image token vocabulary. + Defaults to `16384`. + eos_token (int, optional): + A special token representing the end of a image sequence. + Defaults to `16384`. + max_text_length (int, optional): + The maximum value of the dimensionality of text position encoding, which dictates the maximum supported length of the text + input sequence. Defaults to `64`. + max_image_length (int, optional): + The maximum value of the dimensionality of image position encoding, which dictates the maximum supported length of the image + input sequence. Defaults to `256`. + decoder_start_token_id (int, optional): + The id indicating the start of decoding image sentence. Defaults to `16384`. + d_model (int, optional): + Dimensionality of the embedding layer, encoder layer and decoder layer. Defaults to `1024`. + num_encoder_layers (int, optional): + Number of hidden layers in the :class:`DalleBartEncoder`. Defaults to `12`. + num_decoder_layers (int, optional): + Number of hidden layers in the :class:`DalleBartDecoder`. Defaults to `12`. + encoder_attention_heads (int, optional): + Number of attention heads for each attention layer in the :class:`DalleBartEncoder`. + Defaults to `16`. + decoder_attention_heads (int, optional): + Number of attention heads for each attention layer in the :class:`DalleBartDecoder`. + Defaults to `16`. + encoder_ffn_dim (int, optional): + Dimensionality of the Gated Linear Units (glu) layer in the encoder. Input tensors + to glu layers are firstly projected from `d_model` to `encoder_ffn_dim`, + and then projected back to `d_model`. Typically `encoder_ffn_dim` is larger than `d_model`. + Defaults to `2730`. + decoder_ffn_dim (int, optional): + Dimensionality of the Gated Linear Units (glu) layer in the encoder. Input tensors + to glu layers are firstly projected from `d_model` to `decoder_ffn_dim`, + and then projected back to `d_model`. Typically `decoder_ffn_dim` is larger than `d_model`. + Defaults to `2730`. + dropout (float, optional): + The dropout probability used in all fully connected layers (pre-process and post-process of MHA and FFN sub-layer) + in the encoders and decoders. Defaults to `0.`. + activation_function (str, optional): + The non-linear activation function in the glu layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. + Defaults to `"gelu"`. + attention_dropout (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers and decoder layers to drop some attention target. + Defaults to `0.`. 
+ activation_dropout (float, optional): + The dropout probability used after glu activation in all encoder layers and decoder layers. + Defaults to `0.`. + use_bias (bool, optional): + Whether or not use bias in all linear layers. Defaults to `False`. + init_std (float, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Default to `0.02`. + """ + pretrained_init_configuration = DALLEBART_PRETRAINED_INIT_CONFIGURATION + model_type = "dallebart" + attribute_map: Dict[str, str] = { + "text_vocab_size": "vocab_size", + } + + def __init__( + self, + vocab_size=50264, + image_vocab_size=16384, + bos_token_id=16384, + pad_token_id=16384, + eos_token_id=16384, + max_text_length=64, + max_image_length=256, + decoder_start_token_id=16384, + d_model=1024, + num_encoder_layers=12, + num_decoder_layers=12, + encoder_attention_heads=16, + decoder_attention_heads=16, + encoder_ffn_dim=2730, + decoder_ffn_dim=2730, + dropout=0.0, + activation_function="gelu", + attention_dropout=0.0, + activation_dropout=0.0, + use_bias=False, + init_std=0.02, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.image_vocab_size = image_vocab_size + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.max_text_length = max_text_length + self.max_image_length = max_image_length + self.d_model = d_model + self.num_encoder_layers = num_encoder_layers + self.num_decoder_layers = num_decoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.decoder_ffn_dim = decoder_ffn_dim + self.dropout = dropout + self.activation_function = activation_function + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.use_bias = use_bias + self.init_std = init_std + self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id + self.text_pad_token_id = 1 # encoder pad id must be 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/modeling.py new file mode 100644 index 000000000..06ced8874 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/modeling.py @@ -0,0 +1,1350 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021-2022 The Fairseq Authors and The Google Flax +# Team Authors And The HuggingFace Inc. team and & DALL·E Mini team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
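# ---------------------------------------------------------------------------
# Editor's sketch (not part of the patch): the pretrained init configurations for
# DalleBartConfig above use the key "text_vocab_size" while __init__ is written in
# terms of "vocab_size"; attribute_map is what bridges the two. The class below is
# a toy stand-in showing the idea only, not how PretrainedConfig is implemented.
class ToyConfig:
    attribute_map = {"text_vocab_size": "vocab_size"}

    def __init__(self, **kwargs):
        # Remap aliased keyword arguments before storing them.
        for alias, target in self.attribute_map.items():
            if alias in kwargs:
                kwargs[target] = kwargs.pop(alias)
        self.vocab_size = kwargs.get("vocab_size", 50264)

    def __getattr__(self, name):
        # Only reached when normal lookup fails, i.e. for aliased names.
        if name in type(self).attribute_map:
            return getattr(self, type(self).attribute_map[name])
        raise AttributeError(name)

cfg = ToyConfig(text_vocab_size=50272)        # key used by the "dalle-mega" entries above
print(cfg.vocab_size, cfg.text_vocab_size)    # 50272 50272
# ---------------------------------------------------------------------------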
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.common_ops_import import convert_dtype + +from ...generation import BeamSearchScorer +from ...transformers import PretrainedModel, register_base_model +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .configuration import ( + DALLEBART_PRETRAINED_INIT_CONFIGURATION, + DALLEBART_PRETRAINED_RESOURCE_FILES_MAP, + DalleBartConfig, +) + +__all__ = [ + "DalleBartModel", + "DalleBartPretrainedModel", + "DalleBartEncoder", + "DalleBartDecoder", + "DalleBartForConditionalGeneration", +] + + +def shift_tokens_right(input_ids, decoder_start_token_id): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + return shifted_input_ids + + +def _convert_attention_mask(attn_mask, dtype): + """ + Convert the attention mask to the target dtype we expect. + + Parameters: + attn_mask (Tensor, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. + When the data type is bool, the unwanted positions have `False` + values and the others have `True` values. When the data type is + int, the unwanted positions have 0 values and the others have 1 + values. When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. It can be None when + nothing wanted or needed to be prevented attention to. Default None. + dtype (VarType): The target type of `attn_mask` we expect. + + Returns: + Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. + """ + if attn_mask is not None and attn_mask.dtype != dtype: + attn_mask_dtype = convert_dtype(attn_mask.dtype) + if attn_mask_dtype == "bool" or "int" in attn_mask_dtype: + attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e4 + else: + attn_mask = paddle.cast(attn_mask, dtype) + return attn_mask + + +class DalleBartPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Bart models. It provides DalleBart related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + base_model_prefix = "dallebart" + model_config_file = CONFIG_NAME + pretrained_init_configuration = DALLEBART_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = DALLEBART_PRETRAINED_RESOURCE_FILES_MAP + config_class = DalleBartConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding, DalleBartLearnedPositionalEmbedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + + +class DalleBartLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. 
+ """ + + def __init__(self, num_embeddings, embedding_dim): + # DalleBart is set up so that if padding_idx is specified then offset the embedding ids by 0 + # and adjust num_embeddings appropriately. Other models dont have this hack + self.offset = 0 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape, past_key_values_length=0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + seq_len = input_ids_shape[1] + positions = paddle.arange(past_key_values_length, past_key_values_length + seq_len, dtype="int64") + # (gongenlei) For dygraph to static graph + return nn.Embedding.forward(self, positions + self.offset) + + +class GLU(nn.Layer): + """ + From "GLU Variants Improve Transformer" by https://arxiv.org/abs/2002.05202 + """ + + def __init__( + self, + count_in_out: int, + count_middle: int, + activation_dropout: float, + dropout: float, + activation_function: str = "gelu", + use_bias: bool = False, + ): + super().__init__() + self.ln0 = nn.LayerNorm(count_in_out) + self.ln1 = nn.LayerNorm(count_middle) + self.fc0 = nn.Linear(count_in_out, count_middle, bias_attr=use_bias) + self.fc1 = nn.Linear(count_in_out, count_middle, bias_attr=use_bias) + self.fc2 = nn.Linear(count_middle, count_in_out, bias_attr=use_bias) + self.dropout1 = nn.Dropout(activation_dropout) + self.dropout2 = nn.Dropout(dropout) + self.act = getattr(F, activation_function) + + def forward(self, z): + z = self.ln0(z) + w = self.fc0(z) + w = self.act(w) + v = self.fc1(z) + z = self.dropout1(self.ln1(w * v)) + z = self.dropout2(self.fc2(z)) + return z + + +class DalleBartEncoderLayer(nn.Layer): + """ + The Encoder Layer of DalleBartEncoder. The arguments of DalleBartEncoderLayer can see :class:`DalleBartEncoder`. + """ + + def __init__(self, config: DalleBartConfig): + super().__init__() + assert config.d_model > 0, "Expected d_model to be greater than 0, " "but received {}".format(config.d_model) + assert ( + config.encoder_attention_heads > 0 + ), "Expected encoder_attention_heads to be greater than 0, " "but received {}".format( + config.encoder_attention_heads + ) + assert config.encoder_ffn_dim > 0, "Expected encoder_ffn_dim to be greater than 0, " "but received {}".format( + config.encoder_ffn_dim + ) + + attention_dropout = config.dropout if config.attention_dropout is None else config.attention_dropout + activation_dropout = config.dropout if config.activation_dropout is None else config.activation_dropout + self.self_attn = nn.MultiHeadAttention( + config.d_model, config.encoder_attention_heads, dropout=attention_dropout, bias_attr=config.use_bias + ) + self.glu = GLU( + config.d_model, + config.encoder_ffn_dim, + activation_dropout, + config.dropout, + config.activation_function, + use_bias=config.use_bias, + ) + + self.pre_self_attn_layer_norm = nn.LayerNorm(config.d_model) + self.self_attn_layer_norm = nn.LayerNorm(config.d_model) + self.dropout1 = nn.Dropout(config.dropout) + + def forward(self, src, src_mask=None): + src_mask = _convert_attention_mask(src_mask, src.dtype) + residual = src + + # pre_self_attn_layer_norm + src = self.pre_self_attn_layer_norm(src) + src = self.self_attn(src, src, src, src_mask) + + # self_attn_layer_norm + src = self.self_attn_layer_norm(src) + src = residual + self.dropout1(src) + + residual = src + src = self.glu(src) + src = residual + src + return src + + +class DalleBartEncoder(DalleBartPretrainedModel): + """ + The Encoder of DalleBartModel. The arguments of DalleBartEncoder can see :class:`DalleBartModel`. 
+ """ + + def __init__(self, config: DalleBartConfig): + super().__init__(config) + self.init_std = config.init_std + self.vocab_size = config.vocab_size + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + self.embed_positions = DalleBartLearnedPositionalEmbedding(config.max_text_length, config.d_model) + + self.layers = nn.LayerList([DalleBartEncoderLayer(config) for _ in range(config.num_encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.final_ln = nn.LayerNorm(config.d_model) + self.embedding_dropout = nn.Dropout(config.dropout) + self.text_pad_token_id = config.text_pad_token_id + + def forward(self, input_ids, attention_mask=None, **kwargs): + """ + The DalleBartEncoder forward method, overrides the `__call__()` special method. + Args: + input_ids (Tensor, optional): + See :class:`DalleBartModel`. + attention_mask (Tensor, optional): + See :class:`DalleBartModel`. + Returns: + Tensor: Returns tensor `encoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + """ + if input_ids is None: + raise ValueError("Input_ids cannot be None.") + + if attention_mask is None: + attention_mask = ( + paddle.cast(input_ids == self.text_pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) + * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + inputs_embeds = self.embed_tokens(input_ids) + inputs_embed_pos = self.embed_positions(input_ids.shape) + hidden_states = self.layernorm_embedding(inputs_embeds + inputs_embed_pos) + hidden_states = self.embedding_dropout(hidden_states) + + for layer in self.layers: + hidden_states = layer(hidden_states, attention_mask) + hidden_states = self.final_ln(hidden_states) + + return hidden_states + + +class DalleBartDecoderLayer(nn.Layer): + """ + The Decoder Layer of DalleBartDecoder. The arguments of DalleBartDecoderLayer can see :class:`DalleBartDecoder`. 
+ """ + + def __init__(self, config: DalleBartConfig): + super().__init__() + + assert config.d_model > 0, "Expected d_model to be greater than 0, " "but received {}".format(config.d_model) + assert ( + config.decoder_attention_heads > 0 + ), "Expected decoder_attention_heads to be greater than 0, " "but received {}".format( + config.decoder_attention_heads + ) + assert config.decoder_ffn_dim > 0, "Expected decoder_ffn_dim to be greater than 0, " "but received {}".format( + config.decoder_ffn_dim + ) + + attention_dropout = config.dropout if config.attention_dropout is None else config.attention_dropout + activation_dropout = config.dropout if config.activation_dropout is None else config.activation_dropout + + self.self_attn = nn.MultiHeadAttention( + config.d_model, config.decoder_attention_heads, dropout=attention_dropout, bias_attr=config.use_bias + ) + self.cross_attn = nn.MultiHeadAttention( + config.d_model, config.decoder_attention_heads, dropout=attention_dropout, bias_attr=config.use_bias + ) + + self.glu = GLU( + config.d_model, + config.decoder_ffn_dim, + activation_dropout, + config.dropout, + config.activation_function, + use_bias=config.use_bias, + ) + + self.pre_self_attn_layer_norm = nn.LayerNorm(config.d_model) + self.self_attn_layer_norm = nn.LayerNorm(config.d_model) + self.pre_cross_attn_layer_norm = nn.LayerNorm(config.d_model) + self.cross_attn_layer_norm = nn.LayerNorm(config.d_model) + + self.dropout1 = nn.Dropout(config.dropout) + self.dropout2 = nn.Dropout(config.dropout) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + + # self attn + residual = tgt + tgt = self.pre_self_attn_layer_norm(tgt) + + if cache is None: + tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None) + else: + tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, cache[0]) + + tgt = self.self_attn_layer_norm(tgt) + tgt = residual + self.dropout1(tgt) + + # cross attn + residual = tgt + tgt = self.pre_cross_attn_layer_norm(tgt) + + if cache is None: + tgt = self.cross_attn(tgt, memory, memory, memory_mask, None) + else: + tgt, static_cache = self.cross_attn(tgt, memory, memory, memory_mask, cache[1]) + tgt = self.cross_attn_layer_norm(tgt) + tgt = residual + self.dropout2(tgt) + + # glu + residual = tgt + tgt = self.glu(tgt) + tgt = residual + tgt + return tgt if cache is None else (tgt, (incremental_cache, static_cache)) + + def gen_cache(self, memory): + incremental_cache = self.self_attn.gen_cache(memory, type=self.self_attn.Cache) + static_cache = self.cross_attn.gen_cache(memory, memory, type=self.cross_attn.StaticCache) + return incremental_cache, static_cache + + +class DalleBartDecoder(DalleBartPretrainedModel): + """ + The Decoder of DalleBartModel. The arguments of DalleBartDecoder can see :class:`DalleBartModel`. 
+ """ + + def __init__(self, config: DalleBartConfig): + super().__init__(config) + self.init_std = config.init_std + self.embed_tokens = nn.Embedding(config.image_vocab_size + 1, config.d_model) + + self.embed_positions = DalleBartLearnedPositionalEmbedding(config.max_image_length, config.d_model) + self.layers = nn.LayerList([DalleBartDecoderLayer(config) for _ in range(config.num_decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.dropout = nn.Dropout(config.dropout) + self.final_ln = nn.LayerNorm(config.d_model) + + def forward( + self, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + memory_mask=None, + cache=None, + ): + """ + The DalleBartDecoder forward method, overrides the `__call__()` special method. + Args: + decoder_input_ids (Tensor, optional): + See :class:`DalleBartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`DalleBartModel`. + encoder_output (Tensor, optional): + See :class:`DalleBartModel`. + memory_mask (Tensor, optional): + See :class:`DalleBartModel`. + cache (Tensor, optional): + See :class:`DalleBartModel`. + Returns: + Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + """ + if decoder_attention_mask is None: + decoder_length = decoder_input_ids.shape[-1] + decoder_attention_mask = paddle.triu( + ( + paddle.full( + (decoder_length, decoder_length), + -1e4, + dtype=paddle.get_default_dtype(), + ) + ), + 1, + ) + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.embed_positions(decoder_input_ids.shape, past_key_values_length) + hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout(hidden_states) + + # layers + new_caches = [] + for i, layer in enumerate(self.layers): + if cache is None: + hidden_states = layer( + hidden_states, + encoder_output, + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=None, + ) + else: + hidden_states, new_cache = layer( + hidden_states, + encoder_output, + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache[i], + ) + new_caches.append(new_cache) + + hidden_states = self.final_ln(hidden_states) + + return hidden_states if cache is None else (hidden_states, new_caches) + + def gen_cache(self, memory, do_zip=False): + cache = [layer.gen_cache(memory) for layer in self.layers] + if do_zip: + cache = list(zip(*cache)) + return cache + + +@register_base_model +class DalleBartModel(DalleBartPretrainedModel): + def __init__(self, config: DalleBartConfig): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.decoder_start_token_id = config.decoder_start_token_id + self.text_pad_token_id = 1 # encoder pad id must be 1 + self.encoder = DalleBartEncoder(config) + + self.decoder = DalleBartDecoder(config) + + def get_input_embeddings(self): + return self.encoder.embed_tokens + + def set_input_embeddings(self, value): + self.encoder.embed_tokens = value + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def forward( + self, + input_ids, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + use_cache=False, + 
cache=None, + ): + r""" + The DalleBartModel forward method, overrides the `__call__()` special method. + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + encoder_output (tuple, optional): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is `[batch_size, sequence_length, hidden_size]`. + `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + cache (list, optional): + It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. + See `TransformerDecoder.gen_cache `__ for more details. + It is only used for inference and should be None for training. + Default to `None`. + Returns: + Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import DalleBartModel, DalleBartTokenizer + tokenizer = DalleBartTokenizer.from_pretrained('dalle-mini') + model = DalleBartModel.from_pretrained('dalle-mini') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + if input_ids is None and encoder_output is None: + raise ValueError("You have to specify either input_ids or encoder_output") + if decoder_input_ids is None: + assert input_ids is not None, "input_ids should be " "specified when generating decoder_input_ids" + decoder_input_ids = shift_tokens_right(input_ids, self.decoder_start_token_id) + if attention_mask is None: + assert input_ids is not None, "input_ids should be " "specified when generating attention_mask" + attention_mask = ( + paddle.cast(input_ids == self.text_pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) + * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + if encoder_output is None: + encoder_output = self.encoder(input_ids, attention_mask) + if use_cache: + if cache is None: + cache = self.decoder.gen_cache(encoder_output) + else: + cache = None + decoder_output = self.decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_output, + attention_mask, + cache, + ) + + return decoder_output + + +class DalleBartForConditionalGeneration(DalleBartPretrainedModel): + r""" + DalleBart Model with a `language modeling` head on top. + Args: + config (:class:`DalleBartConfig`): + An instance of DalleBartConfig used to construct DalleBartForConditionalGeneration. + """ + + def __init__(self, config: DalleBartConfig): + super().__init__(config) + self.dallebart = DalleBartModel(config) + self.lm_head = nn.Linear( + config.d_model, + config.image_vocab_size + 1, + bias_attr=config.use_bias, + ) + # input_ids_uncond + # [0, 2, 1, 1, 1,...,1] + # attention_mask_uncond + # [1, 1, 0, 0, 0,...,0] + input_ids_uncond = [0, 2] + [1] * (config.max_text_length - 2) + attention_mask_uncond = [1, 1] + [0] * (config.max_text_length - 2) + if hasattr(self, "input_ids_uncond"): + self.input_ids_uncond = paddle.to_tensor([input_ids_uncond], dtype="int64") + else: + self.register_buffer( + "input_ids_uncond", paddle.to_tensor([input_ids_uncond], dtype="int64"), persistable=False + ) + if hasattr(self, "attention_mask_uncond"): + self.attention_mask_uncond = paddle.to_tensor([attention_mask_uncond], dtype="int64") + else: + self.register_buffer( + "attention_mask_uncond", paddle.to_tensor([attention_mask_uncond], dtype="int64"), persistable=False + ) + + def get_encoder(self): + return self.dallebart.get_encoder() + + def get_decoder(self): + return self.dallebart.get_decoder() + + def forward( + self, + input_ids, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + use_cache=False, + cache=None, + ): + r""" + The DalleBartForConditionalGeneration forward method, overrides the __call__() special method. + Args: + input_ids (Tensor): + See :class:`DalleBartModel`. + attention_mask (Tensor, optional): + See :class:`DalleBartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`DalleBartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`DalleBartModel`. 
+ encoder_output (Tensor, optonal): + See :class:`DalleBartModel`. + use_cache (bool, optional): + See :class:`DalleBartModel`. + cache (Tensor, optional): + See :class:`DalleBartModel`. + Returns: + Tensor or tuple: Returns Tensor `lm_logits` if `use_cache` is `False`, otherwise, returns tuple (`lm_logits`, `cache`). + With the fields: + - `lm_logits` (Tensor): + The generated sentence of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, vocab_size]. + - `cache` (Tensor): + See :class:`DalleBartModel`. + Example: + .. code-block:: + import paddle + from paddlenlp.transformers import DalleBartForConditionalGeneration, DalleBartTokenizer + tokenizer = DalleBartTokenizer.from_pretrained('dalle-mini') + model = DalleBartForConditionalGeneration.from_pretrained('dalle-mini') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + """ + output = self.dallebart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache, + cache, + ) + lm_logits = self.lm_head(output) + if use_cache: + cache = output[1] + return lm_logits, cache + else: + return lm_logits + + def prepare_decoder_input_ids_from_labels(self, labels): + return shift_tokens_right(labels, self.config.decoder_start_token_id) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + cache=None, + use_cache=False, + encoder_output=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask[:, :, -1, :].unsqueeze(-2) + + return { + "input_ids": None, + "decoder_input_ids": decoder_input_ids, + "encoder_output": encoder_output, + "decoder_attention_mask": decoder_attention_mask, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def sample( + self, + input_ids, + logits_processors, + max_length, + pad_token_id, + eos_token_id, + top_k=None, + top_p=None, + temperature=None, + min_tokens_to_keep=1, + condition_scale=1.0, + model_kwargs_uncond=None, + **model_kwargs + ): + def TopKProcess(probs, top_k, min_tokens_to_keep): + top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) + # Remove all tokens with a probability less than the last token of the top-k + topk_probs, _ = paddle.topk(probs, k=top_k) + probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) + return probs + + def TopPProcess(probs, top_p, min_tokens_to_keep): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Set 'min_tokens_to_keep - 1' because the first token is kept + sorted_indices_to_remove[:, : min_tokens_to_keep - 1] = 0 + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone() + sorted_indices_to_remove[:, 0] = 0 + + # Scatter sorted tensors to original indexing + sorted_indices = sorted_indices + 
paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten() + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + return probs + + batch_size, cur_len = input_ids.shape + origin_len = cur_len + unfinished_flag = paddle.full([batch_size, 1], True, dtype="bool") + scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) + + while cur_len < max_length: + # prepare model inputs & get model output + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self(**model_inputs) + logits = outputs[0] if isinstance(outputs, tuple) else outputs + # [batch_size, vocab_size] + logits = logits[:, -1, :] + + # perform super conditioning + # Source: @RiversHaveWings - https://twitter.com/RiversHaveWings/status/1478093658716966912?s=20&t=xdm-wZ61Wf7OLnE_NJHZ1w + if condition_scale != 1.0: + model_inputs_uncond = self.prepare_inputs_for_generation(input_ids, **model_kwargs_uncond) + outputs_uncond = self(**model_inputs_uncond) + logits_uncond = outputs_uncond[0] if isinstance(outputs_uncond, tuple) else outputs_uncond + # [batch_size, vocab_size] + logits_uncond = logits_uncond[:, -1, :] + logits = logits_uncond + condition_scale * (logits - logits_uncond) + + else: + outputs_uncond = None + + # pre-process distribution + logits = self.adjust_logits_during_generation(logits) + logits = logits_processors(input_ids, logits) + + # sample + origin_probs = F.softmax(logits) + origin_probs = paddle.log(origin_probs) + if temperature is not None and temperature != 1.0: + logits = logits / temperature + probs = F.softmax(logits) + if top_k is not None and top_k != 0: + probs = TopKProcess(probs, top_k, min_tokens_to_keep) + if top_p is not None and top_p < 1.0: + probs = TopPProcess(probs, top_p, min_tokens_to_keep) + next_tokens = paddle.multinomial(probs) + + next_scores = paddle.index_sample(origin_probs, next_tokens) + + if eos_token_id is not None: + next_tokens = paddle.where(unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) + + scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) + + cur_len += 1 + input_ids = paddle.concat([input_ids, next_tokens], axis=1) + + if eos_token_id is not None: + unfinished_flag = paddle.logical_and(unfinished_flag, next_tokens != eos_token_id) + + # Stop when there is a in all sentences + if not paddle.any(unfinished_flag): + break + + model_kwargs = self.update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder + ) + + if condition_scale != 1.0: + model_kwargs_uncond = self.update_model_kwargs_for_generation( + outputs_uncond, model_kwargs_uncond, is_encoder_decoder=self.is_encoder_decoder + ) + else: + model_kwargs_uncond = None + + return input_ids[:, origin_len:], scores + + @paddle.no_grad() + def generate( + self, + input_ids=None, + max_length=256, + min_length=256, + decode_strategy="sampling", + temperature=1.0, + top_k=0, + top_p=1.0, + repetition_penalty=1.0, + num_beams=1, + num_beam_groups=1, + length_penalty=0.0, + early_stopping=False, + bos_token_id=None, + eos_token_id=None, + pad_token_id=None, + text_pad_token_id=1, + decoder_start_token_id=None, + forced_bos_token_id=None, + forced_eos_token_id=None, + num_return_sequences=1, + diversity_rate=0.0, + use_cache=True, + 
use_fast=False, + use_fp16_decoding=False, + condition_scale=1.0, + **model_kwargs + ): + r""" + The interface for generation task. This method can generate sequences + by using decoding strategy. Currently, there are three decoding + strategies supported: "greedy_search", "sampling" and "beam_search". + + Args: + input_ids (Tensor, optional): The input sequence ids for the + generation. It is a Tensor with shape [batch_size, sequence_length]. + The data type should be int32 or int64. Default to None, which + we will initialize it as a Tensor with shape [1, 1], filled + with the value `bos_token_id`. + max_length (int, optional): The maximum length of the sequence to + be generated. Default to 256. + min_length (int, optional): The minimum length of the sequence to + be generated. Default to 256. + decode_strategy (str, optional): The decoding strategy in generation. + Currently, there are three decoding strategies supported: + "greedy_search", "sampling" and "beam_search". Default to + "sampling". + temperature (float, optional): The value used to module the next + token probabilities in the "sampling" strategy. Default to 1.0, + which means no effect. + top_k (int, optional): The number of highest probability tokens to + keep for top-k-filtering in the "sampling" strategy. Default to + 0, which means no effect. + top_p (float, optional): The cumulative probability for + top-p-filtering in the "sampling" strategy. The value should + satisfy :math:`0 <= top\_p < 1`. Default to 1.0, which means no + effect. + repetition_penalty (float, optional): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. Defaults to 1.0. + num_beams (int, optional): The number of beams in the "beam_search" + strategy. Default to 1. + num_beam_groups (int, optional): + Number of groups to divide `num_beams` into in order to use DIVERSE + BEAM SEARCH. See `this paper `__ + for more details. Default to 1. + length_penalty (float, optional): The exponential penalty to the + sequence length in the "beam_search" strategy. The larger this + param is, the more that the model would generate shorter + sequences. Default to 0.0, which means no penalty. + early_stopping (bool, optional): Whether to stop searching in the + "beam_search" strategy when at least `num_beams` sentences are + finished per batch or not. Default to False. + bos_token_id (int, optional): The id of the `bos_token`. Default to + None. + eos_token_id (int, optional): The id of the `eos_token`. Default to + None. + pad_token_id (int, optional): The id of the `pad_token`. Default to + None. + decoder_start_token_id (int, optional): The start token id for + encoder-decoder models. Default to None. + forced_bos_token_id (int, optional): The id of the token to force as + the first generated token. Usually use for multilingual models. + Default to None. + forced_eos_token_id (int, optional): The id of the token to force as + the last generated token. Default to None. + num_return_sequences (int, optional): The number of returned + sequences for each sequence in the batch. Default to 1. + diversity_rate (float, optional): If num_beam_groups is 1, this is the + diversity_rate for Diverse Siblings Search. See + `this paper https://arxiv.org/abs/1611.08562`__ for more details. + If not, this is the diversity_rate for DIVERSE BEAM SEARCH. + use_cache: (bool, optional): Whether to use the model cache to + speed up decoding. Default to True. 
+ use_fast: (bool, optional): Whether to use fast entry of model + for FastGeneration. Default to False. + use_fp16_decoding: (bool, optional): Whether to use fp16 for decoding. + Only works when fast entry is avalible. Default to False. + condition_scale (float, optional): The scale of super conditioning. See + `this twitter `__ + Default to 1.0. + model_kwargs (dict): It can be used to specify additional kwargs + passed to the model. + + Returns: + tuple[Tensor]: It is a tuple contains two elements: ids and scores. + Each element is a Tensor. + + With the fields: + + - ids (Tensor): + The ids of the generated sequences. It is a Tensor with shape + [batch_size * num_return_sequences, sequence_length]. The data + type is same as the input `input_ids`. + - scores (Tensor): + The scores of the generated sequences. It is a Tensor with shape + [batch_size * num_return_sequences, 1]. The data type is float32 + or float64, which is the same as the parameters in the model. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ( + DalleBartForConditionalGeneration, + DalleBartTokenizer + ) + + # Initialize the model and tokenizer + model_name_or_path = 'dalle-mini' + model = DalleBartForConditionalGeneration.from_pretrained(model_name_or_path) + tokenizer = DalleBartTokenizer.from_pretrained(model_name_or_path) + + # Prepare the model inputs. + prompts = "graphite sketch of Elon Musk" + tokenized_inputs = tokenizer( + prompts, + return_tensors="pd", + padding="max_length", + truncation=True, + return_attention_mask=True, + max_length=64, + ) + + # Generate 4 sequences by using "sampling" strategy (top_k=64, condition_scale=10.0) + image_token_ids, scores = model.generate( + input_ids=tokenized_inputs['input_ids'], + attention_mask=tokenized_inputs['attention_mask'], + decode_strategy="sampling", + condition_scale=10.0, + top_k=64, + num_return_sequences=4) + print(image_token_ids.shape, scores.shape) + # [4, 256] [4, 1] + """ + assert decode_strategy in [ + "greedy_search", + "sampling", + "beam_search", + ], "`decode_strategy` must be one of 'greedy_search', 'sampling' or 'beam_search' but received {}.".format( + decode_strategy + ) + + bos_token_id = bos_token_id if bos_token_id is not None else getattr(self, "bos_token_id", None) + eos_token_id = eos_token_id if eos_token_id is not None else getattr(self, "eos_token_id", None) + pad_token_id = pad_token_id if pad_token_id is not None else getattr(self, "pad_token_id", None) + forced_bos_token_id = ( + forced_bos_token_id if forced_bos_token_id is not None else getattr(self, "forced_bos_token_id", None) + ) + forced_eos_token_id = ( + forced_eos_token_id if forced_eos_token_id is not None else getattr(self, "forced_eos_token_id", None) + ) + decoder_start_token_id = ( + decoder_start_token_id + if decoder_start_token_id is not None + else getattr(self, "decoder_start_token_id", None) + ) + + if getattr(self, "_fast_entry", None) is not False and use_fast: + args = locals() + args.pop("self") + args.pop("__class__", None) + model_kwargs = args.pop("model_kwargs") + args.update(model_kwargs) + try: + if not hasattr(self, "_fast_entry"): + self._build_fast(args) + if self._fast_entry: + output = self._fast_entry(**args) + if isinstance(output, tuple): + output_ids, dummy_srore = output + else: + output_ids = output + # make result and faster result oneconsistent + dummy_srore = None + if decode_strategy == "beam_search": + output_ids = output_ids.transpose([1, 2, 0]) + output_ids = output_ids[:, 
:num_return_sequences, :].reshape([-1, output_ids.shape[-1]]) + if dummy_srore is not None: + dummy_srore = dummy_srore[:, :num_return_sequences].flatten() + else: + output_ids = output_ids.transpose([1, 0]) + return output_ids, dummy_srore + + except Exception as e: + args["model_kwargs"] = model_kwargs + # Prevent self._convert_to_fast to throw Exception + self._convert_to_fast(args) + logger.warning(e) + logger.warning("FastGeneration is not available, " "and the original version would be used instead.") + + # params check + if input_ids is None: + # Init `input_ids` with bos_token_id + input_ids = self.prepare_input_ids_for_generation(bos_token_id) + + if model_kwargs.get("attention_mask", None) is None: + # Init `attention_mask` depending on `text_pad_token_id` + model_kwargs["attention_mask"] = self.prepare_attention_mask_for_generation( + input_ids, text_pad_token_id, eos_token_id + ) + + self.is_encoder_decoder = hasattr(self, "encoder") and hasattr(self, "decoder") + if self.is_encoder_decoder: + + if condition_scale != 1.0: + assert decode_strategy == "sampling", "`do_sample` has to be True for super conditioning." + assert num_beams == 1, "`num_beams` has to be 1 for super conditioning." + input_ids_uncond = self.input_ids_uncond.expand_as(input_ids) + model_kwargs_uncond = {"attention_mask": self.attention_mask_uncond.expand_as(input_ids)} + model_kwargs_uncond = self.prepare_encoder_decoder_kwargs_for_generation( + input_ids_uncond, + model_kwargs_uncond, + ) + model_kwargs_uncond["use_cache"] = use_cache + else: + model_kwargs_uncond = None + + model_kwargs = self.prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs) + + # set input_ids as decoder_input_ids + if "decoder_input_ids" in model_kwargs: + input_ids = model_kwargs.pop("decoder_input_ids") + else: + input_ids = self.prepare_decoder_input_ids_for_generation( + input_ids, decoder_start_token_id, bos_token_id + ) + + if pad_token_id is None and eos_token_id is not None: + print("Setting `pad_token_id` to `eos_token_id`:{} for " "open-end generation.".format(eos_token_id)) + pad_token_id = eos_token_id + + model_kwargs["use_cache"] = use_cache + max_length += input_ids.shape[-1] + min_length += input_ids.shape[-1] + + logits_processors = self.get_logits_processor( + min_length=min_length, + max_length=max_length, + eos_token_id=eos_token_id, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + diversity_rate=diversity_rate, + repetition_penalty=repetition_penalty, + ) + + if decode_strategy == "greedy_search": + if num_return_sequences > 1: + raise ValueError( + "`num_return_sequences` has to be 1, but is {} " + "when doing greedy search.".format(num_return_sequences) + ) + + return self.greedy_search( + input_ids, logits_processors, max_length, pad_token_id, eos_token_id, **model_kwargs + ) + + elif decode_strategy == "sampling": + + if num_return_sequences > 1: + tmpinput_ids = input_ids.clone() + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, expand_size=num_return_sequences, **model_kwargs + ) + + if condition_scale != 1.0: + _, model_kwargs_uncond = self.expand_inputs_for_generation( + tmpinput_ids, expand_size=num_return_sequences, **model_kwargs_uncond + ) + + return self.sample( + input_ids, + logits_processors, + max_length, + pad_token_id, + eos_token_id, + top_k, + top_p, + temperature, + condition_scale=condition_scale, + model_kwargs_uncond=model_kwargs_uncond, + 
**model_kwargs, + ) + + elif decode_strategy == "beam_search": + batch_size = input_ids.shape[0] + if num_return_sequences > num_beams: + raise ValueError( + "`num_return_sequences` has to be smaller or equal to " + "`num_beams`. But received `num_return_sequences` is {}, " + "`num_beams` is {}".format(num_return_sequences, num_beams) + ) + if num_beams <= 1: + raise ValueError( + "`num_beams` has to be bigger than 1. But received " + "`num_beams` is {}. If `num_beams` is 1, `decode_strategy` " + "should be 'greedy_search'".format(num_beams) + ) + if num_beam_groups > 1: + diverse_beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=max_length, + num_beams=num_beams, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + num_beam_hyps_to_keep=num_return_sequences, + num_beam_groups=num_beam_groups, + ) + + # interleave with `num_beams` + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, expand_size=num_beams, **model_kwargs + ) + + return self.group_beam_search( + input_ids, + diverse_beam_scorer, + logits_processors, + max_length, + diversity_rate, + pad_token_id, + eos_token_id, + **model_kwargs, + ) + else: + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + max_length=max_length, + num_beams=num_beams, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + num_beam_hyps_to_keep=num_return_sequences, + ) + + input_ids, model_kwargs = self.expand_inputs_for_generation( + input_ids, expand_size=num_beams, **model_kwargs + ) + + return self.beam_search( + input_ids, + beam_scorer, + logits_processors, + max_length, + diversity_rate, + pad_token_id, + eos_token_id, + **model_kwargs, + ) + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +class ResnetBlock(nn.Layer): + def __init__(self, log2_count_in: int, log2_count_out: int): + super().__init__() + m, n = 2**log2_count_in, 2**log2_count_out + self.is_middle = m == n + self.norm1 = nn.GroupNorm(2**5, m) + self.conv1 = nn.Conv2D(m, n, 3, padding=1) + self.norm2 = nn.GroupNorm(2**5, n) + self.conv2 = nn.Conv2D(n, n, 3, padding=1) + if not self.is_middle: + self.nin_shortcut = nn.Conv2D(m, n, 1) + + def forward(self, x): + h = x + h = self.norm1(h) + h = F.swish(h) + h = self.conv1(h) + h = self.norm2(h) + h = F.swish(h) + h = self.conv2(h) + if not self.is_middle: + x = self.nin_shortcut(x) + return x + h + + +class AttentionBlock(nn.Layer): + def __init__(self): + super().__init__() + n = 2**9 + self.norm = nn.GroupNorm(2**5, n) + self.q = nn.Conv2D(n, n, 1) + self.k = nn.Conv2D(n, n, 1) + self.v = nn.Conv2D(n, n, 1) + self.proj_out = nn.Conv2D(n, n, 1) + + def forward(self, x): + n, m = 2**9, x.shape[0] + h = x + h = self.norm(h) + k = self.k(h) + v = self.v(h) + q = self.q(h) + k = k.reshape(shape=[m, n, -1]) + v = v.reshape(shape=[m, n, -1]) + q = q.reshape(shape=[m, n, -1]) + q = q.transpose(perm=[0, 2, 1]) + w = paddle.bmm(q, k) + w /= n**0.5 + w = F.softmax(w, axis=2) + w = w.transpose(perm=[0, 2, 1]) + h = paddle.bmm(v, w) + token_count = int(math.sqrt(h.shape[-1])) + h = h.reshape(shape=[m, n, token_count, token_count]) + h = self.proj_out(h) + return x + h + + +class MiddleLayer(nn.Layer): + def __init__(self): + super().__init__() + self.block_1 = ResnetBlock(9, 9) + self.attn_1 = AttentionBlock() + self.block_2 = ResnetBlock(9, 9) + + def forward(self, h): + h = self.block_1(h) + h = self.attn_1(h) + h = self.block_2(h) + return h + + 
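[Editorial note, not part of the patch] The `sample` method above implements "super conditioning" by mixing conditional and unconditional logits before sampling. A minimal, model-free sketch of just that mixing step, assuming only paddle; the tensors below are stand-ins for the last-step logits:

    import paddle
    import paddle.nn.functional as F

    # Stand-in last-step logits over a tiny 3-token image vocabulary (batch of 1).
    logits = paddle.to_tensor([[2.0, 0.5, -1.0]])        # conditioned on the text prompt
    logits_uncond = paddle.to_tensor([[1.0, 1.0, 1.0]])  # conditioned on the "empty" prompt
    condition_scale = 10.0

    # Same combination as in `sample`: push the distribution away from the unconditional one.
    logits = logits_uncond + condition_scale * (logits - logits_uncond)
    probs = F.softmax(logits)               # softmax over the vocabulary axis
    next_token = paddle.multinomial(probs)  # one sampled image-token id per batch row
    print(next_token.shape)                 # [1, 1]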
+class Upsample(nn.Layer): + def __init__(self, log2_count): + super().__init__() + n = 2**log2_count + self.upsample = nn.UpsamplingNearest2D(scale_factor=2) + self.conv = nn.Conv2D(n, n, 3, padding=1) + + def forward(self, x): + x = self.upsample(x) + x = self.conv(x) + return x + + +class UpsampleBlock(nn.Layer): + def __init__(self, log2_count_in: int, log2_count_out: int, has_attention: bool, has_upsample: bool): + super().__init__() + self.has_attention = has_attention + self.has_upsample = has_upsample + + self.block = nn.LayerList( + [ + ResnetBlock(log2_count_in, log2_count_out), + ResnetBlock(log2_count_out, log2_count_out), + ResnetBlock(log2_count_out, log2_count_out), + ] + ) + + if has_attention: + self.attn = nn.LayerList([AttentionBlock(), AttentionBlock(), AttentionBlock()]) + + if has_upsample: + self.upsample = Upsample(log2_count_out) + + def forward(self, h): + for j in range(3): + h = self.block[j](h) + if self.has_attention: + h = self.attn[j](h) + if self.has_upsample: + h = self.upsample(h) + return h + + +class Decoder(nn.Layer): + def __init__(self): + super().__init__() + + self.conv_in = nn.Conv2D(2**8, 2**9, 3, padding=1) + self.mid = MiddleLayer() + + self.up = nn.LayerList( + [ + UpsampleBlock(7, 7, False, False), + UpsampleBlock(8, 7, False, True), + UpsampleBlock(8, 8, False, True), + UpsampleBlock(9, 8, False, True), + UpsampleBlock(9, 9, True, True), + ] + ) + + self.norm_out = nn.GroupNorm(2**5, 2**7) + self.conv_out = nn.Conv2D(2**7, 3, 3, padding=1) + + def forward(self, z): + z = self.conv_in(z) + z = self.mid(z) + + for i in reversed(range(5)): + z = self.up[i](z) + + z = self.norm_out(z) + z = F.swish(z) + z = self.conv_out(z) + return z diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/tokenizer.py new file mode 100644 index 000000000..c9d25946a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dallebart/tokenizer.py @@ -0,0 +1,503 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021-2022 The Fairseq Authors and The Google Flax +# Team Authors And The HuggingFace Inc. team and & DALL·E Mini team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
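[Editorial note, not part of the patch] The `Decoder` above is the VQGAN-style image decoder: `conv_in` lifts the 256-channel latent grid to 512 channels, the five `UpsampleBlock`s are applied in reverse order (four of them doubling the spatial size), and `conv_out` projects down to RGB. A shape-only sketch; the import path is an assumption, and the shapes are inferred from the channel/upsample configuration above:

    import paddle

    # Hypothetical import; assumes this file is importable as `modeling`.
    from modeling import Decoder

    decoder = Decoder()                  # randomly initialized, shapes only
    z = paddle.randn([1, 2**8, 16, 16])  # a 16x16 grid of 256-channel image latents
    img = decoder(z)
    print(img.shape)                     # [1, 3, 256, 256] after four 2x upsamplings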
+ +import html +import math +import random +import re +from pathlib import Path + +from paddle.utils import try_import + +from ...transformers import AddedToken, GPTTokenizer + +__all__ = ["DalleBartTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "dalle-mini": 64, + "dalle-mega-v16": 64, + "dalle-mega-v26": 64, + "dalle-mega": 64, +} + +# based on wiki word occurrence +person_token = [("a person", 282265), ("someone", 121194), ("somebody", 12219)] +temp_token = "xtokx" # avoid repeating chars + + +class HashtagProcessor: + # Adapted from wordninja library + # We use our wikipedia word count + a good heuristic to make it work + def __init__(self, wiki_word_frequency): + self._word_cost = (l.split()[0] for l in Path(wiki_word_frequency).read_text(encoding="utf8").splitlines()) + self._word_cost = {str(k): math.log(float(i + 1)) for i, k in enumerate(self._word_cost)} + self._max_word = max(len(x) for x in self._word_cost.keys()) + self._SPLIT_RE = re.compile("[^a-zA-Z0-9']+") + + def __call__(self, s): + """Uses dynamic programming to infer the location of spaces in a string without spaces.""" + l = [self._split(x) for x in self._SPLIT_RE.split(s)] + return " ".join([item for sublist in l for item in sublist]) + + def _split(self, s): + # Find the best match for the i first characters, assuming cost has + # been built for the i-1 first characters. + # Returns a pair (match_cost, match_length). + def best_match(i): + candidates = enumerate(reversed(cost[max(0, i - self._max_word) : i])) + return min((c + self._word_cost.get(s[i - k - 1 : i].lower(), 9e999), k + 1) for k, c in candidates) + + # Build the cost array + cost = [0] + for i in range(1, len(s) + 1): + c, k = best_match(i) + cost.append(c) + + # Backtrack to recover the minimal-cost string. 
+ out = [] + i = len(s) + while i > 0: + c, k = best_match(i) + assert c == cost[i] + newToken = True + if not s[i - k : i] == "'": # ignore a lone apostrophe + if len(out) > 0: + # re-attach split 's and split digits + if out[-1] == "'s" or (s[i - 1].isdigit() and out[-1][0].isdigit()): # digit followed by digit + out[-1] = s[i - k : i] + out[-1] # combine current token with previous token + newToken = False + + if newToken: + out.append(s[i - k : i]) + + i -= k + + return reversed(out) + + +def replace_person_token(t): + "Used for CC12M" + t = re.sub(r"([,\s]*(and)*[,\s]*)+", " people ", t) + while "" in t: + t = t.replace("", f" {random.choices(*tuple(zip(*person_token)))[0]} ", 1) + return t + + +def fix_html(t): + # from OpenAI CLIP + return html.unescape(html.unescape(t)) + + +def replace_punctuation_with_commas(t): + return re.sub(r"[()[\].,|:;?!=+~\-\/{}]", ",", t) + + +def simplify_quotes(t): + return re.sub("""['"`]""", ' " ', t) + + +def merge_quotes(t): + return re.sub(r'(\s*"+\s*)+', ' " ', t) + + +def remove_comma_numbers(t): + def _f(t): + return re.sub(r"(\d),(\d{3})", r"\1\2", t) + + return _f(_f(t)) + + +def pre_process_dot_numbers(t): + return re.sub(r"(\w)\.(\w)", rf"\1{temp_token}dot{temp_token}\2", t) + + +def post_process_dot_numbers(t): + return re.sub(f"{temp_token}dot{temp_token}", ".", t) + + +def pre_process_quotes(t): + # allows quotes only for 's, 't, 'd, 'm, 'll, 're, 've + return re.sub(r"'(?=([stdm]|(ll)|(re)|(ve)|(ll))\b)", rf"{temp_token}quote{temp_token}", t) + + +def post_process_quotes(t): + return re.sub(f"{temp_token}quote{temp_token}", "'", t) + + +def pre_process_dates(t): + return re.sub(r"(\d)/(\d)", rf"\1{temp_token}slash{temp_token}\2", t) + + +def post_process_dates(t): + return re.sub(f"{temp_token}slash{temp_token}", "/", t) + + +def merge_commas(t): + return re.sub(r"(\s*,+\s*)+", ", ", t) + + +def add_space_after_commas(t): + return re.sub(",", ", ", t) + + +def handle_special_chars(t): + "Handle special characters" + # replace "-" with a space when between words without space + t = re.sub(r"(\w)-(\w)", r"\1 \2", t) + # always add space around some characters + return re.sub(r"([%&\/$*])", r" \1 ", t) + + +def expand_hashtags(t, hashtag_processor): + "Remove # and try to split words" + return re.sub(r"#(\w+)", lambda m: hashtag_processor(m.group(1)), t) + + +_re_ignore_chars = r"[_#\\]" + + +def ignore_chars(t): + "Ignore useless characters" + return re.sub(_re_ignore_chars, " ", t) + + +def remove_extra_spaces(t): + "Remove extra spaces (including \t and \n)" + return re.sub(r"\s+", " ", t) + + +def remove_repeating_chars(t): + "If the same character is present 4+ times (not 3 because of roman 'VIII'), replace with single instance" + return re.sub(r"(\D)(\1{3,})", r"\1", t) + + +def remove_urls(t): + return re.sub(r"http\S+", "", t) + + +def remove_html_tags(t): + return re.sub("<[^<]+?>", " ", t) + + +def remove_first_last_commas(t): + t = t.strip() + t = t[:-1] if t and t[-1] == "," else t + t = t[1:] if t and t[0] == "," else t + return t.strip() + + +def remove_wiki_ref(t): + t = re.sub(r"\A\s*\[\d+\]", "", t) + return re.sub(r"\[\d+\]\s*\Z", "", t) + + +class TextNormalizer: + def __init__(self, wiki_word_frequency_file): + self._hashtag_processor = HashtagProcessor(wiki_word_frequency_file) + self.emoji = try_import("emoji") + self.ftfy = try_import("ftfy") + self.unidecode = try_import("unidecode") + + def __call__(self, t): + # fix some characters + t = self.ftfy.fix_text(t) + # fix html + t = fix_html(t) + # decode emojis (would 
be removed by unidecode) + t = self.emoji.demojize(t) + # decode and simplify text: see unidecode library + t = self.unidecode.unidecode(t) + # lower case + t = t.lower() + # replace (for CC12M) + t = replace_person_token(t) + # remove wiki reference (for WIT) + t = remove_wiki_ref(t) + # remove html tags + t = remove_html_tags(t) + # remove urls + t = remove_urls(t) + # remove commas in numbers + t = remove_comma_numbers(t) + # handle dots in numbers and quotes - Part 1 + t = pre_process_dot_numbers(t) + t = pre_process_quotes(t) + t = pre_process_dates(t) + # handle special characters + t = handle_special_chars(t) + # handle hashtags + t = expand_hashtags(t, self._hashtag_processor) + # ignore useless characters + t = ignore_chars(t) + # simplify quotes + t = simplify_quotes(t) + # all punctuation becomes commas + t = replace_punctuation_with_commas(t) + # handle dots in numbers and quotes - Part 2 + t = post_process_dot_numbers(t) + t = post_process_quotes(t) + t = post_process_dates(t) + # handle repeating characters + t = remove_repeating_chars(t) + # merge quotes + t = merge_quotes(t) + # merge commas + t = merge_commas(t) + # remove multiple spaces + t = remove_extra_spaces(t) + # remove first and last comma + t = remove_first_last_commas(t) + # always start with a space + return f" {t}" + + +class DalleBartTokenizer(GPTTokenizer): + r""" + Construct a DalleBart tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.gpt.tokenizer.GPTTokenizer`. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocabulary file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + wiki_word_frequency_file (str): + Path to the wiki_word_frequency file when we need normlize text. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + bos_token (str, optional): + The beginning of sequence token that was used during pretraining. Can be + used a sequence classifier token. + Defaults to `""`. + eos_token (str, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `""`. + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. + Defaults to `""`. + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to `""`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to `""`. + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to `""`. + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to `""`. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import DalleBartTokenizer + + tokenizer = DalleBartTokenizer.from_pretrained('dalle-mini') + print(tokenizer('Donald Trump in Animal Crossing')) + + # {'input_ids': [0, 7083, 3252, 91, 2203, 7807, 2]} + + """ + resource_files_names = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "wiki_word_frequency_file": "enwiki-words-frequency.txt", + } + pretrained_resource_files_map = { + "vocab_file": { + "dalle-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mini/vocab.json", + "dalle-mega-v16": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v16/vocab.json", + "dalle-mega-v26": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/vocab.json", + "dalle-mega": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/vocab.json", + }, + "merges_file": { + "dalle-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mini/merges.txt", + "dalle-mega-v16": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v16/merges.txt", + "dalle-mega-v26": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/merges.txt", + "dalle-mega": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/merges.txt", + }, + "wiki_word_frequency_file": { + "dalle-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mini/enwiki-words-frequency.txt", + "dalle-mega-v16": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v16/enwiki-words-frequency.txt", + "dalle-mega-v26": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/enwiki-words-frequency.txt", + "dalle-mega": "https://bj.bcebos.com/paddlenlp/models/transformers/dallebart/dalle-mega-v26/enwiki-words-frequency.txt", + }, + } + pretrained_init_configuration = { + "dalle-mini": {"normalize_text": True}, + "dalle-mega-v16": {"normalize_text": True}, + "dalle-mega-v26": {"normalize_text": True}, + "dalle-mega": {"normalize_text": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + merges_file, + wiki_word_frequency_file, + normalize_text=True, + errors="replace", + max_len=None, + bos_token="", + eos_token="", + cls_token="", + sep_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs + ): + + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + ) + self.normalize_text = normalize_text + # in order to save wiki_word_frequency_file, we need set this attr + self._wiki_word_frequency_file = wiki_word_frequency_file + if self.normalize_text: + self.text_processor = TextNormalizer(wiki_word_frequency_file) + super().__init__(vocab_file, merges_file, errors, max_len, pad_token, eos_token, unk_token, **kwargs) + + def _bpe_encode(self, text): + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification + tasks by concatenating and adding special tokens. + """ + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + if token_ids_1 is None: + return _cls + token_ids_0 + _sep + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is + called when adding special tokens using the tokenizer ``encode`` methods. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def __call__( + self, + text, + text_pair=None, + max_length=64, # default + stride=0, + is_split_into_words=False, + padding="max_length", # default + truncation=True, # default + return_position_ids=False, + return_token_type_ids=False, # don't return token_type_ids + return_attention_mask=True, # default + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_dict=True, + return_offsets_mapping=False, + add_special_tokens=True, + pad_to_multiple_of=None, + return_tensors=None, + verbose: bool = True, + **kwargs + ): + if self.normalize_text: + is_batched = isinstance(text, (list, tuple)) + if is_batched: + text = [self.text_processor(t) for t in text] + if text_pair: + text_pair = [self.text_processor(t) for t in text_pair] + else: + text = self.text_processor(text) + if text_pair: + text_pair = self.text_processor(text_pair) + + return super().__call__( + text, + text_pair, + max_length, + stride, + is_split_into_words, + padding, + truncation, + return_position_ids, + return_token_type_ids, + return_attention_mask, + return_length, + return_overflowing_tokens, + return_special_tokens_mask, + return_dict, + return_offsets_mapping, + add_special_tokens, + pad_to_multiple_of, + return_tensors, + verbose, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/configuration.py new file mode 100644 index 000000000..bd094641e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/configuration.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
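Before moving on to the DeBERTa files, a short usage sketch of the DalleBartTokenizer added above may help. It is illustrative only: it assumes the `dalle-mini` resources listed in the resource maps are downloadable at run time, and the helper variables (`ids_a`, `ids_b`, `pair`) are made up for the example.

.. code-block::

    from paddlenlp.transformers import DalleBartTokenizer

    tokenizer = DalleBartTokenizer.from_pretrained("dalle-mini")

    # __call__ runs TextNormalizer first (normalize_text=True), then the byte-level
    # BPE encoder, padding/truncating to max_length=64 by default.
    print(tokenizer("Donald Trump in Animal Crossing")["input_ids"][:7])

    # build_inputs_with_special_tokens wraps one sequence as <s> A </s> and a pair
    # as <s> A </s></s> B </s> (cls == bos, sep == eos).
    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("a red bus"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("at night"))
    pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    assert pair[0] == tokenizer.cls_token_id and pair[-1] == tokenizer.sep_token_id

    # create_token_type_ids_from_sequences returns all zeros, as in BART.
    assert set(tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)) == {0}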
+ +""" DeBERTa model configuration""" +from __future__ import annotations + +from typing import Dict, List + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["DEBERTA_PRETRAINED_INIT_CONFIGURATION", "DebertaConfig", "DEBERTA_PRETRAINED_RESOURCE_FILES_MAP"] + +DEBERTA_PRETRAINED_INIT_CONFIGURATION = { + "deberta-base": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-07, + "max_position_embeddings": 512, + "max_relative_positions": -1, + "model_type": "deberta", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_hidden_states": True, + "pad_token_id": 0, + "pos_att_type": ["c2p", "p2c"], + "position_biased_input": False, + "relative_attention": True, + "type_vocab_size": 0, + "vocab_size": 50265, + }, +} + +DEBERTA_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "microsoft/deberta-base": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-base/model_state.pdparams" + } +} + + +class DebertaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeBERTaModel`] . It is used to + instantiate a DeBERTa model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the DeBERTa + DeBERTa-v1-base architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the DeBERTa model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling [`DeBERTaModel`]. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + embedding_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the embedding layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 0): + The vocabulary size of the :obj:`token_type_ids` passed when calling [`DeBERTaModel`]. 
+ initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (:obj:`int`, `optional`, defaults to 0): + The value used to pad input_ids. + position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether add position bias to the input embeddings. + pos_att_type (:obj:`List[str]`, `optional`, defaults to :obj:`["p2c", "c2p"]`): + The type of relative position attention. It should be a subset of `["p2c", "c2p", "p2p"]`. + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model returns attentions weights. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the model returns all hidden-states. + relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether use relative position encoding. + + Examples: + + ```python + >>> from paddlenlp.transformers import DeBERTaModel, DeBERTaConfig + + >>> # Initializing a DeBERTa DeBERTa-base style configuration + >>> configuration = DeBERTaConfig() + + >>> # Initializing a model from the DeBERTa-base-uncased style configuration + >>> model = DeBERTaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "deberta" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = DEBERTA_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 50265, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 0, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-7, + pad_token_id: int = 0, + position_biased_input: bool = False, + pos_att_type: List[str] = ["p2c", "c2p"], + output_attentions: bool = False, + output_hidden_states: bool = True, + relative_attention: bool = True, + **kwargs + ): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.embedding_size = kwargs.get("embedding_size", hidden_size) + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_biased_input = position_biased_input + self.pos_att_type = pos_att_type + self.output_attentions = output_attentions + self.output_hidden_states = output_hidden_states + self.relative_attention = relative_attention + self.pad_token_id = pad_token_id diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/modeling.py new file mode 100644 index 000000000..806e77d38 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/modeling.py @@ -0,0 
+1,1378 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections.abc import Sequence + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +from ...utils.converter import StateDictNameMapping +from ...utils.env import CONFIG_NAME +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + DEBERTA_PRETRAINED_INIT_CONFIGURATION, + DEBERTA_PRETRAINED_RESOURCE_FILES_MAP, + DebertaConfig, +) + +__all__ = [ + "DebertaModel", + "DebertaForSequenceClassification", + "DebertaForQuestionAnswering", + "DebertaForTokenClassification", + "DebertaPreTrainedModel", + "DebertaForMultipleChoice", +] + + +class DropoutContext(object): + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + + +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout > 0 and mask is None: + # mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool) + probability_matrix = paddle.full(paddle.empty_like(input).shape, 1 - dropout) + mask = (1 - paddle.bernoulli(probability_matrix)).cast("bool") + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + + +class XDropout(paddle.autograd.PyLayer): + """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" + + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale = 1.0 / (1 - dropout) + if dropout > 0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0) * ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + (mask,) = ctx.saved_tensor() + return grad_output.masked_fill(mask, 0) * ctx.scale, None + else: + return grad_output, None + + +class StableDropout(nn.Layer): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + + def forward(self, x): + """ + Call the module + + Args: + x (`paddle.Tensor`): The input tensor to apply dropout + """ + if self.training and self.drop_prob > 0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, reuse_mask=True, scale=1): + if 
self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx + else: + return self.drop_prob + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + Args: + x: paddle.Tensor x: + Returns: paddle.Tensor + """ + if past_key_values_length is None: + past_key_values_length = 0 + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = (input_ids != padding_idx).cast("int64") + incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask + return incremental_indices + padding_idx + + +def softmax_with_mask(x, mask, axis): + rmask = paddle.logical_not(mask.astype("bool")) + y = paddle.full(x.shape, -float("inf"), x.dtype) + return F.softmax(paddle.where(rmask, y, x), axis=axis) + + +class DebertaEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + + pad_token_id = getattr(config, "pad_token_id", 0) + self.position_biased_input = getattr(config, "position_biased_input", True) + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + + self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias_attr=False) + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64") + position_ids = position_ids.unsqueeze(0).expand(input_shape) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if self.position_embeddings is not None: + position_embeds = self.position_embeddings(position_ids) + else: + position_embeds = paddle.zeros_like(inputs_embeds) + embeddings = inputs_embeds + if self.position_biased_input: + embeddings = embeddings + position_embeds + if self.config.type_vocab_size > 0: + token_type_embeds = self.token_type_embeddings(token_type_ids) + embeddings = embeddings + token_type_embeds + if self.config.embedding_size != self.config.hidden_size: + embeddings = self.embed_proj(embeddings) + embeddings = self.LayerNorm(embeddings) 
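+        # If an attention mask is given, broadcast it over the hidden dimension and
+        # zero out embeddings at padded positions before the final StableDropout.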
+ if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + embeddings = embeddings * mask.astype(embeddings.dtype) + embeddings = self.dropout(embeddings) + return embeddings + + +class DebertaLayerNorm(nn.Layer): + """LayerNorm module in the TF style (epsilon inside the square root).""" + + def __init__(self, size, eps=1e-12): + super().__init__() + self.weight = paddle.create_parameter( + shape=[size], default_initializer=nn.initializer.Constant(1.0), dtype="float32" + ) + self.add_parameter("weight", self.weight) + self.bias = paddle.create_parameter( + shape=[size], default_initializer=nn.initializer.Constant(0.0), dtype="float32" + ) + self.add_parameter("bias", self.bias) + self.variance_epsilon = eps + + def forward(self, hidden_states): + mean = hidden_states.mean(-1, keepdim=True) + variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) + hidden_states = (hidden_states - mean) / paddle.sqrt(variance + self.variance_epsilon) + y = self.weight * hidden_states + self.bias + return y + + +class DebertaSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +def build_relative_position(query_size, key_size): + q_ids = paddle.arange(query_size, dtype="int64") + k_ids = paddle.arange(key_size, dtype="int64") + rel_pos_ids = q_ids[:, None] - paddle.tile(k_ids[None], [query_size, 1]) + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + + +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + return paddle.expand( + c2p_pos, [query_layer.shape[0], query_layer.shape[1], query_layer.shape[2], relative_pos.shape[-1]] + ) + + +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + return paddle.expand( + c2p_pos, [query_layer.shape[0], query_layer.shape[1], key_layer.shape[-2], key_layer.shape[-2]] + ) + + +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + return paddle.expand(pos_index, p2c_att.shape[:2] + (pos_index.shape[-2], key_layer.shape[-2])) + + +class DisentangledSelfAttention(nn.Layer): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias_attr=False) + self.q_bias = paddle.create_parameter( + shape=[self.all_head_size], default_initializer=nn.initializer.Constant(0.0), dtype="float32" + ) + self.v_bias = paddle.create_parameter( + shape=[self.all_head_size], default_initializer=nn.initializer.Constant(0.0), dtype="float32" + ) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + + # To transform c2p|p2c" into ["c2p","p2c"] + if 
isinstance(self.pos_att_type, str): + self.pos_att_type = self.pos_att_type.split("|") + + self.relative_attention = getattr(config, "relative_attention", True) + self.talking_head = getattr(config, "talking_head", False) + + if self.talking_head: + self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias_attr=False) + self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias_attr=False) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_dropout = nn.Dropout(config.hidden_dropout_prob) + if "c2p" in self.pos_att_type: + self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=False) + if "p2c" in self.pos_att_type: + self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, -1] + x = paddle.reshape(x, new_x_shape) + return paddle.transpose(x, perm=[0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + if query_states is None: + query_states = self.in_proj(hidden_states) + query_states = self.transpose_for_scores(query_states) + query_layer, key_layer, value_layer = paddle.chunk(query_states, 3, axis=-1) + else: + + def linear(w, b, x): + if b is not None: + return paddle.matmul(x, w, transpose_y=True) + b + else: + return paddle.matmul(x, w, transpose_y=True) + + ws = paddle.chunk(self.in_proj.weight, self.num_attention_heads * 3, axis=0) + qkvw = [paddle.concat([ws[i * 3 + k] for i in range(self.num_attention_heads)], axis=0) for k in range(3)] + qkvb = [None] * 3 + + q = linear(qkvw[0], qkvb[0], query_states.astype(qkvw[0].dtype)) + k, v = [linear(qkvw[i], qkvb[i], hidden_states.astype(qkvw[i].dtype)) for i in range(1, 3)] + query_layer, key_layer, value_layer = [self.transpose_for_scores(x) for x in [q, k, v]] + + query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :]) + value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :]) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. 
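+        # DeBERTa scales the query by sqrt(head_dim * scale_factor); scale_factor counts the
+        # score components that are summed: content-to-content plus each enabled c2p/p2c bias.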
+ scale_factor = 1 + len(self.pos_att_type) + scale = paddle.sqrt(paddle.to_tensor(query_layer.shape[-1], dtype="float32") * scale_factor) + query_layer = query_layer / scale + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + + # bxhxlxd + if self.talking_head: + attention_scores = self.head_logits_proj(paddle.transpose(attention_scores, [0, 2, 3, 1])) + attention_scores = paddle.transpose(attention_scores, [0, 3, 1, 2]) + + attention_probs = softmax_with_mask(attention_scores, attention_mask, -1) + attention_probs = self.dropout(attention_probs) + + if self.talking_head: + attention_probs = self.head_weights_proj(paddle.transpose(attention_probs, [0, 2, 3, 1])) + attention_probs = paddle.transpose(attention_probs, [0, 3, 1, 2]) + + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = paddle.transpose(context_layer, [0, 2, 1, 3]) + context_layer = paddle.reshape(context_layer, context_layer.shape[:-2] + [-1]) + + if output_attentions: + return (context_layer, attention_probs) + else: + return context_layer + + def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.shape[-2] + relative_pos = build_relative_position(q, key_layer.shape[-2]) + if relative_pos.ndim == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.ndim == 3: + relative_pos = relative_pos.unsqueeze(1) + # bxhxqxk + elif relative_pos.ndim != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.ndim}") + + att_span = min(max(query_layer.shape[-2], key_layer.shape[-2]), self.max_relative_positions) + relative_pos = relative_pos.astype("int64") + rel_embeddings = rel_embeddings[ + self.max_relative_positions - att_span : self.max_relative_positions + att_span, : + ] + rel_embeddings = paddle.unsqueeze(rel_embeddings, axis=0) + + score = 0 + + if "c2p" in self.pos_att_type: + pos_key_layer = self.pos_proj(rel_embeddings) + pos_key_layer = self.transpose_for_scores(pos_key_layer) + c2p_att = paddle.matmul(query_layer, pos_key_layer.transpose([0, 1, 3, 2])) + c2p_pos = paddle.clip(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = paddle.take_along_axis( + c2p_att, axis=-1, indices=c2p_dynamic_expand(c2p_pos, query_layer, relative_pos) + ) + score += c2p_att + + if "p2c" in self.pos_att_type: + pos_query_layer = self.pos_q_proj(rel_embeddings) + pos_query_layer = self.transpose_for_scores(pos_query_layer) + pos_query_layer /= paddle.sqrt(paddle.to_tensor(pos_query_layer.shape[-1], dtype="float32") * scale_factor) + if query_layer.shape[-2] != key_layer.shape[-2]: + r_pos = build_relative_position(key_layer.shape[-2], key_layer.shape[-2]) + else: + r_pos = relative_pos + p2c_pos = paddle.clip(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = paddle.matmul(key_layer, pos_query_layer.transpose([0, 1, 3, 2]).astype(key_layer.dtype)) + p2c_att = paddle.take_along_axis( + p2c_att, axis=-1, indices=p2c_dynamic_expand(p2c_pos, query_layer, key_layer) + ).transpose([0, 1, 3, 2]) + + if query_layer.shape[-2] != key_layer.shape[-2]: + pos_index = relative_pos[:, :, :, 0].unsqueeze(-1) + p2c_att = paddle.gather(p2c_att, axis=-2, index=pos_dynamic_expand(pos_index, p2c_att, key_layer)) + score += p2c_att + + return score + + +class DebertaAttention(nn.Layer): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaSelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + self_output = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + + if output_attentions: + self_output, att_matrix = self_output + + if query_states is None: + query_states = hidden_states + + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return attention_output + + +class DebertaIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class DebertaOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = DebertaLayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class DebertaLayer(nn.Layer): + def __init__(self, 
config): + super().__init__() + self.attention = DebertaAttention(config) + self.intermediate = DebertaIntermediate(config) + self.output = DebertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + attention_output, att_matrix = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if output_attentions: + return (layer_output, att_matrix) + else: + return layer_output + + +class DebertaEncoder(paddle.nn.Layer): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + self.layer = paddle.nn.LayerList([DebertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.rel_embeddings = nn.Embedding(self.max_relative_positions * 2, config.hidden_size) + self.gradient_checkpointing = False + + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + attention_mask = attention_mask.astype("float32") + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = query_states.shape[-2] if query_states is not None else hidden_states.shape[-2] + relative_pos = build_relative_position(q, hidden_states.shape[-2]) + return relative_pos + + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=None, + ): + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + hidden_states = paddle.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + next_kv, + attention_mask, + query_states, + relative_pos, + rel_embeddings, + ) + else: + hidden_states = layer_module( + next_kv, + attention_mask, + query_states=query_states, + 
relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_attentions: + hidden_states, att_m = hidden_states + + if query_states is not None: + query_states = hidden_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = hidden_states + + if output_attentions: + all_attentions = all_attentions + (att_m,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +class DebertaPreTrainedModel(PretrainedModel): + """ + An abstract class for pretrained BERT models. It provides BERT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + config_class = DebertaConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "deberta" + + pretrained_init_configuration = DEBERTA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = DEBERTA_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_name_mappings(cls, config): + mappings = [] + model_mappings = [ + ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], + ["embeddings.LayerNorm.weight", "embeddings.LayerNorm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.LayerNorm.bias"], + ["embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight"], + ["embeddings.token_type_embeddings.weight", "embeddings.token_type_embeddings.weight"], + ["encoder.rel_embeddings.weight", "encoder.rel_embeddings.weight"], + ] + for layer_index in range(config.num_hidden_layers): + + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.q_bias", + f"encoder.layer.{layer_index}.attention.self.q_bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.v_bias", + f"encoder.layer.{layer_index}.attention.self.v_bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.in_proj.weight", + f"encoder.layer.{layer_index}.attention.self.in_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.pos_proj.weight", + f"encoder.layer.{layer_index}.attention.self.pos_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.pos_q_proj.weight", + f"encoder.layer.{layer_index}.attention.self.pos_q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.pos_q_proj.bias", + f"encoder.layer.{layer_index}.attention.self.pos_q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layer.{layer_index}.attention.output.dense.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layer.{layer_index}.attention.output.dense.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + ], + [ + 
f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layer.{layer_index}.intermediate.dense.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.bias", + f"encoder.layer.{layer_index}.intermediate.dense.bias", + ], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layer.{layer_index}.output.dense.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layer.{layer_index}.output.dense.bias"], + [ + f"encoder.layer.{layer_index}.output.LayerNorm.weight", + f"encoder.layer.{layer_index}.output.LayerNorm.weight", + ], + [ + f"encoder.layer.{layer_index}.output.LayerNorm.bias", + f"encoder.layer.{layer_index}.output.LayerNorm.bias", + ], + ] + model_mappings.extend(layer_mappings) + # adapt for hf-internal-testing/tiny-random-DebertaModel + if config.architectures is not None and "DebertaModel" in config.architectures: + pass + else: + for mapping in model_mappings: + mapping[0] = "deberta." + mapping[0] + mapping[1] = "deberta." + mapping[1] + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class DebertaModel(DebertaPreTrainedModel): + def __init__(self, config: DebertaConfig): + super(DebertaModel, self).__init__(config) + self.config = config + self.embeddings = DebertaEmbeddings(config) + self.encoder = DebertaEncoder(config) + self.z_steps = getattr(config, "z_steps", 0) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = paddle.ones(input_shape, dtype="int64") + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + 
output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + if not return_dict: + encoded_layers = encoder_outputs[1] + else: + encoded_layers = encoder_outputs.hidden_states + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = encoded_layers[-1] + + if not return_dict: + return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +class DebertaPredictionHeadTransform(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class DebertaLMPredictionHead(nn.Layer): + def __init__(self, config): + super().__init__() + self.transform = DebertaPredictionHeadTransform(config) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + self.bias = paddle.create_parameter( + shape=[config.vocab_size], default_initializer=nn.initializer.Constant(0.0), dtype="float32" + ) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class DebertaOnlyMLMHead(nn.Layer): + def __init__(self, config): + super().__init__() + self.predictions = DebertaLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class DebertaForMaskedLM(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.deberta = DebertaModel(config) + self.cls = DebertaOnlyMLMHead(config) + + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + 
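+        # Whether or not return_dict is set, outputs[0] is the final hidden-state sequence,
+        # which the MLM head (self.cls) projects back onto the vocabulary.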
sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.reshape(-1, self.config.vocab_size), labels.reshape(-1)) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ContextPooler(nn.Layer): + def __init__(self, config): + super().__init__() + hidden_size = config.pooler_hidden_size if config.pooler_hidden_size is not None else config.hidden_size + self.dense = nn.Linear(config.hidden_size, hidden_size) + self.dropout = StableDropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + context_token = hidden_states[:, 0, :] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = F.gelu(pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +class DebertaForSequenceClassification(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.deberta = DebertaModel(config) + + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim if self.pooler is not None else config.hidden_size + self.classifier = nn.Linear(output_dim, config.num_labels) + + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + + self.dropout = StableDropout(drop_out) + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + return self.deberta.set_input_embeddings(new_embeddings) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = self.pooler(outputs[0]) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + loss_fct = paddle.nn.MSELoss() + loss = loss_fct(logits, labels) + elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + else: + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class DebertaForTokenClassification(DebertaPreTrainedModel): + 
def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.deberta = DebertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class DebertaForQuestionAnswering(DebertaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.deberta = DebertaModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + 
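Since DebertaForSequenceClassification above chooses its loss from `num_labels` and the label dtype (MSE for a single regression output, cross-entropy for integer labels, BCE-with-logits otherwise), a rough forward/backward sketch follows. It is an illustration under assumptions, not part of the patch: the tiny hyperparameters, the `pooler_hidden_size`/`pooler_dropout` kwargs (read by ContextPooler), and the `paddlenlp.transformers` import path are all assumed for the example.

.. code-block::

    import paddle
    from paddlenlp.transformers import DebertaConfig, DebertaForSequenceClassification

    # Deliberately tiny, made-up hyperparameters so the sketch runs without pretrained weights.
    config = DebertaConfig(
        vocab_size=128, hidden_size=32, num_hidden_layers=2, num_attention_heads=4,
        intermediate_size=64, max_position_embeddings=64, num_labels=3,
        pooler_hidden_size=32, pooler_dropout=0.0,  # consumed by ContextPooler
    )
    model = DebertaForSequenceClassification(config)

    input_ids = paddle.randint(0, config.vocab_size, shape=[2, 16])
    labels = paddle.to_tensor([0, 2], dtype="int64")  # integer labels -> CrossEntropyLoss branch

    out = model(input_ids=input_ids, labels=labels, return_dict=True)
    print(out.logits.shape)  # [2, 3]
    out.loss.backward()      # SequenceClassifierOutput.loss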
+class DebertaForMultipleChoice(DebertaPreTrainedModel): + + """ + Deberta Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + bert (:class:`DebertaModel`): + An instance of DebertaModel. + num_choices (int, optional): + The number of choices. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of Bert. + If None, use the same value as `hidden_dropout_prob` of `DebertaModel` + instance `bert`. Defaults to None. + """ + + def __init__(self, config: DebertaConfig): + super(DebertaForMultipleChoice, self).__init__(config) + self.deberta = DebertaModel(config) + self.dropout = StableDropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.pooler = ContextPooler(config) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_weights) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + r""" + The DebertaForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + inputs_embeds (list, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BertForMultipleChoice, BertTokenizer + from paddlenlp.data import Pad, Dict + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None: + num_choices = input_ids.shape[1] + elif inputs_embeds is not None: + num_choices = inputs_embeds.shape[1] + + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape((-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + if inputs_embeds is not None + else None + ) + position_ids = position_ids.reshape((-1, position_ids.shape[-1])) if position_ids is not None else None + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) if token_type_ids is not None else None + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) if attention_mask is not None else None + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = self.pooler(outputs[0]) + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape((-1, num_choices)) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/tokenizer.py new file mode 100644 index 000000000..bbfb46fef --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta/tokenizer.py @@ -0,0 +1,413 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +from functools import lru_cache + +import regex as re + +from .. import AddedToken, PretrainedTokenizer + +__all__ = [ + "DebertaTokenizer", +] + +# false +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "deberta-base": 512, +} + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class DebertaTokenizer(PretrainedTokenizer): + """ + Constructs a DeBERTa tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocab file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import DebertaTokenizer + + tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base') + print(tokenizer('Welcome to use PaddlePaddle and PaddleNLP')) + + ''' + {'input_ids': [1, 25194, 7, 304, 221, 33151, 510, 33151, 8, 221, 33151, 487, 21992, 2], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "deberta-base": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-base/vocab.json", + }, + "merges_file": { + "deberta-base": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-base/merges.txt", + }, + } + + # TODO: Add pretrained init configuration + pretrained_init_configuration = { + "deberta-base": {"do_lower_case": True}, + } + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + max_len=None, + bos_token="[CLS]", + eos_token="[SEP]", + sep_token="[SEP]", + cls_token="[CLS]", + unk_token="[UNK]", + pad_token="[PAD]", + mask_token="[MASK]", + add_prefix_space=False, + add_bos_token=False, + **kwargs # The token of newline. + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + unk_token=unk_token, + ) + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.max_len = max_len if max_len is not None else int(1e12) + self.num_command_tokens = 2 + self.num_type_tokens = 2 + + with open(vocab_file, "r", encoding="utf-8") as f: + self.encoder = json.load(f) + self.decoder = {v: k for k, v in self.encoder.items()} + self.num_tokens = len(self.encoder) + self.num_text_tokens = self.num_tokens - 1 + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as f: + bpe_data = f.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.add_bos_token = add_bos_token + + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. 
+ + """ + + return len(self.encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + # no + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + + return self.decoder[index] + + def convert_ids_to_string(self, ids): + """ + Converts a single index or a sequence of indices to texts. + + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + + Returns: + str: The decoded text. + + Example: + .. code-block:: + + from paddlenlp.transformers import DebertaTokenizer + tokenizer = DebertaTokenizer.from_pretrained('deberta-base') + print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930])) + # 'Welcome to use PaddlePaddle and PaddleNLP' + + """ + + text = "".join([self.decoder[id] for id in ids]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + r""" + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A ERNIE sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def save_resources(self, save_directory): + """ + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. + + Args: + save_directory (str): Directory to save files into. 
+ """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (string) in a single string. + """ + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + r""" + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): + List of ids of the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + already_has_special_tokens (str, optional): + Whether or not the token list is already formatted with special tokens for the model. + Defaults to `False`. + Returns: + List[int]: + The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A BERT offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. 
+ """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/configuration.py new file mode 100644 index 000000000..470e3cd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/configuration.py @@ -0,0 +1,260 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" DeBERTa model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["DEBERTA_V2_PRETRAINED_INIT_CONFIGURATION", "DebertaV2Config", "DEBERTA_V2_PRETRAINED_RESOURCE_FILES_MAP"] + +DEBERTA_V2_PRETRAINED_INIT_CONFIGURATION = { + "microsoft/deberta-v3-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "relative_attention": True, + "position_buckets": 256, + "norm_rel_ebd": "layer_norm", + "share_att_key": True, + "pos_att_type": ["p2c", "c2p"], + "layer_norm_eps": 1e-7, + "max_relative_positions": -1, + "position_biased_input": False, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 0, + "vocab_size": 128100, + }, + "microsoft/deberta-v3-large": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "relative_attention": True, + "position_buckets": 256, + "norm_rel_ebd": "layer_norm", + "share_att_key": True, + "pos_att_type": ["p2c", "c2p"], + "layer_norm_eps": 1e-7, + "max_relative_positions": -1, + "position_biased_input": False, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 0, + "vocab_size": 128100, + }, + "microsoft/deberta-v2-xlarge": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 6144, + "max_position_embeddings": 512, + "relative_attention": True, + "position_buckets": 256, + "norm_rel_ebd": "layer_norm", + "share_att_key": True, + "pos_att_type": ["p2c", "c2p"], + "layer_norm_eps": 1e-7, + "conv_kernel_size": 3, + "conv_act": "gelu", + "max_relative_positions": -1, + "position_biased_input": False, + "num_attention_heads": 24, + "attention_head_size": 64, + "num_hidden_layers": 24, + "type_vocab_size": 0, + "vocab_size": 128100, + }, + "deepset/deberta-v3-large-squad2": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "language": "english", + "layer_norm_eps": 1e-07, + "max_position_embeddings": 512, + "max_relative_positions": -1, + "norm_rel_ebd": "layer_norm", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "pad_token_id": 0, + "pooler_dropout": 0, + "pooler_hidden_act": "gelu", + "pooler_hidden_size": 1024, + "pos_att_type": ["p2c", "c2p"], + "position_biased_input": False, + "position_buckets": 256, + "relative_attention": True, + "share_att_key": True, + "summary_activation": "tanh", + "summary_last_dropout": 0, + "summary_type": "first", + "summary_use_proj": False, + "type_vocab_size": 0, + "vocab_size": 128100, + }, +} + +DEBERTA_V2_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "microsoft/deberta-v2-xlarge": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-v2-xlarge/model_state.pdparams", + "microsoft/deberta-v3-base": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-v3-base/model_state.pdparams", + "microsoft/deberta-v3-large": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-v3-large/model_state.pdparams", + "deepset/deberta-v3-large-squad2": 
"https://paddlenlp.bj.bcebos.com/models/community/deepset/deberta-v3-large-squad2/model_state.pdparams", + } +} + + +class DebertaV2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DeBERTaV2Model`] . It is used to + instantiate a DeBERTaV2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the DeBERTa + DeBERTa-v2-xlarge architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the DeBERTa model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling [`DeBERTaModel`]. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + embedding_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the embedding layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 0): + The vocabulary size of the :obj:`token_type_ids` passed when calling [`DeBERTaModel`]. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (:obj:`int`, `optional`, defaults to 0): + The value used to pad input_ids. + position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether add position bias to the input embeddings. + pos_att_type (:obj:`List[str]`, `optional`, defaults to :obj:`["p2c", "c2p"]`): + The type of relative position attention. It should be a subset of `["p2c", "c2p", "p2p"]`. + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model returns attentions weights. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether the model returns all hidden-states. + relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether use relative position encoding. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import DeBERTaModel, DeBERTaConfig + + >>> # Initializing a DeBERTa DeBERTa-v2-base style configuration + >>> configuration = DeBERTaV2Config() + + >>> # Initializing a model from the DeBERTa-base-uncased style configuration + >>> model = DeBERTaV2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "deberta-v2" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = DEBERTA_V2_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=128100, + hidden_size=1536, + num_hidden_layers=24, + num_attention_heads=24, + intermediate_size=6144, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=0, + initializer_range=0.02, + layer_norm_eps=1e-7, + relative_attention=False, + max_relative_positions=-1, + pad_token_id=0, + position_biased_input=True, + pos_att_type=None, + pooler_dropout=0, + pooler_hidden_act="gelu", + share_attn_key=True, + output_hidden_states=True, + output_attentions=False, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.relative_attention = relative_attention + self.max_relative_positions = max_relative_positions + self.pad_token_id = pad_token_id + self.position_biased_input = position_biased_input + + # Backwards compatibility + if type(pos_att_type) == str: + pos_att_type = [x.strip() for x in pos_att_type.lower().split("|")] + + self.pos_att_type = pos_att_type + self.vocab_size = vocab_size + self.layer_norm_eps = layer_norm_eps + + self.pooler_hidden_size = kwargs.get("pooler_hidden_size", hidden_size) + self.pooler_dropout = pooler_dropout + self.pooler_hidden_act = pooler_hidden_act + self.share_attn_key = share_attn_key + self.output_hidden_states = output_hidden_states + self.output_attentions = output_attentions diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/modeling.py new file mode 100644 index 000000000..0779780fe --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/modeling.py @@ -0,0 +1,1482 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
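+# Illustrative usage sketch only (not part of the upstream PaddleNLP file; the pretrained
+# name and random input below are assumptions used purely for demonstration):
+#
+#     import paddle
+#     from paddlenlp.transformers import DebertaV2ForSequenceClassification
+#
+#     model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/deberta-v3-base")
+#     input_ids = paddle.randint(low=0, high=model.config.vocab_size, shape=[1, 16], dtype="int64")
+#     outputs = model(input_ids=input_ids)  # classification logits (plus optional hidden states)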
+ +import math + +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +from ...utils.converter import StateDictNameMapping +from ...utils.env import CONFIG_NAME +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + DEBERTA_V2_PRETRAINED_INIT_CONFIGURATION, + DEBERTA_V2_PRETRAINED_RESOURCE_FILES_MAP, + DebertaV2Config, +) + +__all__ = [ + "DebertaV2Model", + "DebertaV2ForSequenceClassification", + "DebertaV2ForQuestionAnswering", + "DebertaV2ForTokenClassification", + "DebertaV2PreTrainedModel", + "DebertaV2ForMultipleChoice", +] +from collections.abc import Sequence + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def softmax_with_mask(x, mask, axis): + rmask = paddle.logical_not(mask.astype("bool")) + y = paddle.full(x.shape, -float("inf"), x.dtype) + return F.softmax(paddle.where(rmask, y, x), axis=axis) + + +class DropoutContext(object): + def __init__(self): + self.dropout = 0 + self.mask = None + self.scale = 1 + self.reuse_mask = True + + +def get_mask(input, local_context): + if not isinstance(local_context, DropoutContext): + dropout = local_context + mask = None + else: + dropout = local_context.dropout + dropout *= local_context.scale + mask = local_context.mask if local_context.reuse_mask else None + + if dropout > 0 and mask is None: + # mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).to(torch.bool) + probability_matrix = paddle.full(paddle.empty_like(input).shape, 1 - dropout) + mask = (1 - paddle.bernoulli(probability_matrix)).cast("bool") + + if isinstance(local_context, DropoutContext): + if local_context.mask is None: + local_context.mask = mask + + return mask, dropout + + +class XDropout(paddle.autograd.PyLayer): + """Optimized dropout function to save computation and memory by using mask operation instead of multiplication.""" + + @staticmethod + def forward(ctx, input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + ctx.scale = 1.0 / (1 - dropout) + if dropout > 0: + ctx.save_for_backward(mask) + return input.masked_fill(mask, 0) * ctx.scale + else: + return input + + @staticmethod + def backward(ctx, grad_output): + if ctx.scale > 1: + (mask,) = ctx.saved_tensor() + return grad_output.masked_fill(mask, 0) * ctx.scale, None + else: + return grad_output, None + + +class StableDropout(nn.Layer): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob): + super().__init__() + self.drop_prob = drop_prob + self.count = 0 + self.context_stack = None + + def forward(self, x): + """ + Call the module + + Args: + x (`paddle.Tensor`): The input tensor to apply dropout + """ + if self.training and self.drop_prob > 0: + return XDropout.apply(x, self.get_context()) + return x + + def clear_context(self): + self.count = 0 + self.context_stack = None + + def init_context(self, reuse_mask=True, scale=1): + if self.context_stack is None: + self.context_stack = [] + self.count = 0 + for c in self.context_stack: + c.reuse_mask = reuse_mask + c.scale = scale + + def get_context(self): + if self.context_stack is not None: + if self.count >= len(self.context_stack): + self.context_stack.append(DropoutContext()) + ctx = self.context_stack[self.count] + ctx.dropout = self.drop_prob + self.count += 1 + return ctx 
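+            # When no context stack has been initialized, get_context() falls back to returning
+            # the plain drop probability; get_mask() above accepts either a DropoutContext or a float.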
+ else: + return self.drop_prob + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + self.act = nn.functional.gelu + + def _gelu_python(self, input): + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input): + return self.act(input) + + +class DebertaV2Embeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + + pad_token_id = getattr(config, "pad_token_id", 0) + self.position_biased_input = getattr(config, "position_biased_input", True) + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + + if not self.position_biased_input: + self.position_embeddings = None + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.embedding_size) + + self.word_embeddings = nn.Embedding(config.vocab_size, self.embedding_size, padding_idx=pad_token_id) + + if config.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, self.embedding_size) + if self.embedding_size != config.hidden_size: + self.embed_proj = nn.Linear(self.embedding_size, config.hidden_size, bias_attr=False) + self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64") + position_ids = position_ids.unsqueeze(0).expand(input_shape) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + if self.position_embeddings is not None: + position_embeds = self.position_embeddings(position_ids) + else: + position_embeds = paddle.zeros_like(inputs_embeds) + embeddings = inputs_embeds + if self.position_biased_input: + embeddings = embeddings + position_embeds + if self.config.type_vocab_size > 0: + token_type_embeds = self.token_type_embeddings(token_type_ids) + embeddings = embeddings + token_type_embeds + if self.embedding_size != self.config.hidden_size: + embeddings = self.embed_proj(embeddings) + embeddings = self.LayerNorm(embeddings) + if mask is not None: + if mask.dim() != embeddings.dim(): + if mask.dim() == 4: + mask = mask.squeeze(1).squeeze(1) + mask = mask.unsqueeze(2) + embeddings = embeddings * mask.astype(embeddings.dtype) + embeddings = self.dropout(embeddings) + return embeddings + + +class DebertaV2SelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, 
hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class DebertaV2Attention(nn.Layer): + def __init__(self, config): + super().__init__() + self.self = DisentangledSelfAttention(config) + self.output = DebertaV2SelfOutput(config) + self.config = config + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + self_output = self.self( + hidden_states, + attention_mask, + output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + + if output_attentions: + self_output, att_matrix = self_output + + if query_states is None: + query_states = hidden_states + + attention_output = self.output(self_output, query_states) + + if output_attentions: + return (attention_output, att_matrix) + else: + return attention_output + + +class DebertaV2Intermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = GELUActivation() + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class DebertaV2Output(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class DebertaV2Layer(nn.Layer): + def __init__(self, config): + super().__init__() + self.attention = DebertaV2Attention(config) + self.intermediate = DebertaV2Intermediate(config) + self.output = DebertaV2Output(config) + + def forward( + self, + hidden_states, + attention_mask, + query_states=None, + relative_pos=None, + rel_embeddings=None, + output_attentions=False, + ): + attention_output = self.attention( + hidden_states, + attention_mask, + output_attentions=output_attentions, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + ) + if output_attentions: + attention_output, att_matrix = attention_output + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + if output_attentions: + return (layer_output, att_matrix) + else: + return layer_output + + +class ConvLayer(nn.Layer): + def __init__(self, config): + super().__init__() + kernel_size = getattr(config, "conv_kernel_size", 3) + groups = getattr(config, "conv_groups", 1) + self.conv_act = getattr(config, "conv_act", "tanh") + self.conv = nn.Conv1D( + in_channels=config.hidden_size, + out_channels=config.hidden_size, + kernel_size=kernel_size, + padding=(kernel_size - 1) // 2, + groups=groups, + ) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = StableDropout(config.hidden_dropout_prob) + self.config = config + + def forward(self, hidden_states, residual_states, input_mask): + out = self.conv(hidden_states.transpose([0, 2, 1])) + out = out.transpose([0, 2, 1]) + rmask 
= (1 - input_mask).astype(bool) + mask = rmask.unsqueeze(-1).tile([1, 1, out.shape[2]]) + out = paddle.where(mask, paddle.zeros_like(out), out) + out = GELUActivation()(self.dropout(out)) + + layer_norm_input = residual_states + out + output = self.LayerNorm(layer_norm_input) + + if input_mask is None: + output_states = output + else: + if input_mask.ndim != layer_norm_input.ndim: + if input_mask.ndim == 4: + input_mask = paddle.squeeze(input_mask, [1, 2]) + input_mask = input_mask.unsqueeze(2) + + input_mask = input_mask.astype(output.dtype) + output_states = output * input_mask + + return output_states + + +def make_log_bucket_position(relative_pos, bucket_size, max_position): + relative_pos = relative_pos.astype("float32") + sign = paddle.sign(relative_pos) + mid = bucket_size // 2 + abs_pos = paddle.where( + (relative_pos < mid) & (relative_pos > -mid), + paddle.to_tensor(mid - 1).astype(relative_pos.dtype), + paddle.abs(relative_pos), + ) + log_pos = ( + paddle.ceil(paddle.log(abs_pos / mid) / paddle.log(paddle.to_tensor((max_position - 1) / mid)) * (mid - 1)) + + mid + ) + bucket_pos = paddle.where(abs_pos <= mid, relative_pos.astype(log_pos.dtype), log_pos * sign) + return bucket_pos + + +def build_relative_position(query_size, key_size, bucket_size=-1, max_position=-1): + """ + Build relative position according to the query and key + + We assume the absolute position of query \\(P_q\\) is range from (0, query_size) and the absolute position of key + \\(P_k\\) is range from (0, key_size), The relative positions from query to key is \\(R_{q \\rightarrow k} = P_q - + P_k\\) + + Args: + query_size (int): the length of query + key_size (int): the length of key + bucket_size (int): the size of position bucket + max_position (int): the maximum allowed absolute position + + Return: + `paddle.Tensor`: A tensor with shape [1, query_size, key_size] + """ + + q_ids = paddle.arange(0, query_size, dtype="int64") + k_ids = paddle.arange(0, key_size, dtype="int64") + rel_pos_ids = q_ids.unsqueeze(1) - k_ids.unsqueeze(0) + if bucket_size > 0 and max_position > 0: + rel_pos_ids = make_log_bucket_position(rel_pos_ids, bucket_size, max_position) + rel_pos_ids = rel_pos_ids.astype("int64") + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = rel_pos_ids.unsqueeze(0) + return rel_pos_ids + + +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + return paddle.expand( + c2p_pos, [query_layer.shape[0], query_layer.shape[1], query_layer.shape[2], relative_pos.shape[-1]] + ) + + +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + return paddle.expand( + c2p_pos, [query_layer.shape[0], query_layer.shape[1], key_layer.shape[-2], key_layer.shape[-2]] + ) + + +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + return paddle.expand(pos_index, p2c_att.shape[:2] + (pos_index.shape[-2], key_layer.shape[-2])) + + +class DisentangledSelfAttention(nn.Layer): + """ + Disentangled self-attention module + + Parameters: + config (`DebertaV2Config`): + A model config class instance with the configuration to build a new model. 
The schema is similar to + *BertConfig*, for more details, please refer [`DebertaV2Config`] + + """ + + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + _attention_head_size = config.hidden_size // config.num_attention_heads + self.attention_head_size = getattr(config, "attention_head_size", _attention_head_size) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query_proj = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=True) + self.key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=True) + self.value_proj = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=True) + + self.share_att_key = getattr(config, "share_att_key", False) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.position_buckets = getattr(config, "position_buckets", -1) + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_ebd_size = self.max_relative_positions + if self.position_buckets > 0: + self.pos_ebd_size = self.position_buckets + + self.pos_dropout = StableDropout(config.hidden_dropout_prob) + + if not self.share_att_key: + if "c2p" in self.pos_att_type: + self.pos_key_proj = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=True) + if "p2c" in self.pos_att_type: + self.pos_query_proj = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = StableDropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, attention_heads): + new_x_shape = x.shape[:-1] + [attention_heads, -1] + x = x.reshape(new_x_shape) + return x.transpose(perm=[0, 2, 1, 3]).reshape([-1, x.shape[1], x.shape[-1]]) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + query_states=None, + relative_pos=None, + rel_embeddings=None, + ): + if query_states is None: + query_states = hidden_states + query_layer = self.transpose_for_scores(self.query_proj(query_states), self.num_attention_heads) + key_layer = self.transpose_for_scores(self.key_proj(hidden_states), self.num_attention_heads) + value_layer = self.transpose_for_scores(self.value_proj(hidden_states), self.num_attention_heads) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. 
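+        # The block below implements the disentangled-attention scaling: the raw
+        # content-to-content scores are divided by sqrt(head_dim * scale_factor), where
+        # scale_factor counts the enabled score components (1 for content-to-content,
+        # plus one each for the "c2p" and "p2c" relative-position terms).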
+ scale_factor = 1 + if "c2p" in self.pos_att_type: + scale_factor += 1 + if "p2c" in self.pos_att_type: + scale_factor += 1 + scale = paddle.sqrt(paddle.to_tensor(query_layer.shape[-1], dtype=paddle.float32) * scale_factor) + attention_scores = paddle.bmm(query_layer, key_layer.transpose([0, 2, 1])) / scale.astype( + dtype=query_layer.dtype + ) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings) + rel_att = self.disentangled_attention_bias( + query_layer, key_layer, relative_pos, rel_embeddings, scale_factor + ) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + attention_scores = attention_scores + attention_scores = attention_scores.reshape( + [-1, self.num_attention_heads, attention_scores.shape[-2], attention_scores.shape[-1]] + ) + + # bsz x height x length x dimension + attention_probs = softmax_with_mask(attention_scores, attention_mask, -1) + attention_probs = self.dropout(attention_probs) + context_layer = paddle.bmm( + attention_probs.reshape([-1, attention_probs.shape[-2], attention_probs.shape[-1]]), value_layer + ) + context_layer = context_layer.reshape( + [-1, self.num_attention_heads, context_layer.shape[-2], context_layer.shape[-1]] + ).transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + -1, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + if output_attentions: + return (context_layer, attention_probs) + else: + return context_layer + + def disentangled_attention_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + if relative_pos is None: + q = query_layer.shape[-2] + relative_pos = build_relative_position( + q, + key_layer.shape[-2], + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ) + if relative_pos.ndim == 2: + relative_pos = relative_pos.unsqueeze(0).unsqueeze(0) + elif relative_pos.ndim == 3: + relative_pos = relative_pos.unsqueeze(1) + # bsz x height x query x key + elif relative_pos.ndim != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{relative_pos.ndim}") + + att_span = self.pos_ebd_size + relative_pos = relative_pos.astype("int64") + + rel_embeddings = rel_embeddings[0 : att_span * 2, :].unsqueeze(0) + if self.share_att_key: + pos_query_layer = self.transpose_for_scores( + self.query_proj(rel_embeddings), self.num_attention_heads + ).tile([query_layer.shape[0] // self.num_attention_heads, 1, 1]) + pos_key_layer = self.transpose_for_scores(self.key_proj(rel_embeddings), self.num_attention_heads).tile( + [query_layer.shape[0] // self.num_attention_heads, 1, 1] + ) + else: + if "c2p" in self.pos_att_type: + pos_key_layer = self.transpose_for_scores( + self.pos_key_proj(rel_embeddings), self.num_attention_heads + ).tile([query_layer.shape[0] // self.num_attention_heads, 1, 1]) + if "p2c" in self.pos_att_type: + pos_query_layer = self.transpose_for_scores( + self.pos_query_proj(rel_embeddings), self.num_attention_heads + ).tile([query_layer.shape[0] // self.num_attention_heads, 1, 1]) + + score = 0 + # content->position + if "c2p" in self.pos_att_type: + scale = paddle.sqrt(paddle.to_tensor(pos_key_layer.shape[-1], dtype=paddle.float32) * scale_factor) + c2p_att = paddle.bmm(query_layer, pos_key_layer.transpose([0, 2, 1])) + c2p_pos = paddle.clip(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = paddle.take_along_axis( + c2p_att, + axis=-1, + indices=c2p_pos.squeeze(0).expand( + [query_layer.shape[0], query_layer.shape[1], relative_pos.shape[-1]] + ), + ) + score += c2p_att / scale.astype(dtype=c2p_att.dtype) + # position->content + if "p2c" in self.pos_att_type: + scale = paddle.sqrt(paddle.to_tensor(pos_query_layer.shape[-1], dtype=paddle.float32) * scale_factor) + if key_layer.shape[-2] != query_layer.shape[-2]: + r_pos = build_relative_position( + key_layer.shape[-2], + key_layer.shape[-2], + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ) + r_pos = r_pos.unsqueeze(0) + else: + r_pos = relative_pos + + p2c_pos = paddle.clip(-r_pos + att_span, 0, att_span * 2 - 1) + p2c_att = paddle.bmm(key_layer, pos_query_layer.transpose([0, 2, 1])) + p2c_att = paddle.take_along_axis( + p2c_att, + axis=-1, + indices=p2c_pos.squeeze(0).expand([query_layer.shape[0], key_layer.shape[-2], key_layer.shape[-2]]), + ).transpose([0, 2, 1]) + score += p2c_att / scale.astype(dtype=p2c_att.dtype) + + return score + + +class DebertaV2Encoder(nn.Layer): + """Modified BertEncoder with relative position bias support""" + + def __init__(self, config): + super().__init__() + + self.layer = nn.LayerList([DebertaV2Layer(config) for _ in range(config.num_hidden_layers)]) + self.relative_attention = getattr(config, "relative_attention", False) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + self.position_buckets = getattr(config, "position_buckets", -1) + pos_ebd_size = self.max_relative_positions * 2 + + if self.position_buckets > 0: + pos_ebd_size = self.position_buckets * 2 + + self.rel_embeddings = nn.Embedding(pos_ebd_size, config.hidden_size) + + self.norm_rel_ebd = [x.strip() for x in getattr(config, "norm_rel_ebd", "none").lower().split("|")] + + if "layer_norm" in self.norm_rel_ebd: + self.LayerNorm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, bias_attr=True, weight_attr=True) + + self.conv = ConvLayer(config) if getattr(config, "conv_kernel_size", 0) > 0 else None + self.gradient_checkpointing = False + + def 
get_rel_embedding(self): + rel_embeddings = self.rel_embeddings.weight if self.relative_attention else None + if rel_embeddings is not None and ("layer_norm" in self.norm_rel_ebd): + rel_embeddings = self.LayerNorm(rel_embeddings) + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if attention_mask.dim() <= 2: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + attention_mask = extended_attention_mask * extended_attention_mask.squeeze(-2).unsqueeze(-1) + attention_mask = attention_mask.astype(paddle.int8) + elif attention_mask.dim() == 3: + attention_mask = attention_mask.unsqueeze(1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = query_states.shape[-2] if query_states is not None else hidden_states.shape[-2] + relative_pos = build_relative_position( + q, + hidden_states.shape[-2], + bucket_size=self.position_buckets, + max_position=self.max_relative_positions, + ) + return relative_pos + + def forward( + self, + hidden_states, + attention_mask, + output_hidden_states=True, + output_attentions=False, + query_states=None, + relative_pos=None, + return_dict=None, + ): + + if attention_mask.ndim <= 2: + input_mask = attention_mask + else: + input_mask = (attention_mask.sum(-2) > 0).astype(paddle.int8) + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + rel_embeddings = self.get_rel_embedding() + output_states = next_kv + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) + + output_states = layer_module( + next_kv, + attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + ) + + if output_attentions: + output_states, att_m = output_states + + if i == 0 and self.conv is not None: + output_states = self.conv(hidden_states, output_states, input_mask) + + if query_states is not None: + query_states = output_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = output_states + + if output_attentions: + all_attentions = all_attentions + (att_m,) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (output_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=output_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +class DebertaV2PreTrainedModel(PretrainedModel): + """ + An abstract class for pretrained BERT models. It provides BERT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + model_config_file = CONFIG_NAME + config_class = DebertaV2Config + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "deberta" + + pretrained_init_configuration = DEBERTA_V2_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = DEBERTA_V2_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_name_mappings(cls, config): + mappings = [] + model_mappings = [ + ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], + ["embeddings.LayerNorm.weight", "embeddings.LayerNorm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.LayerNorm.bias"], + ["embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight"], + ["encoder.rel_embeddings.weight", "encoder.rel_embeddings.weight"], + ["encoder.LayerNorm.weight", "encoder.LayerNorm.weight"], + ["encoder.LayerNorm.bias", "encoder.LayerNorm.bias"], + ] + for layer_index in range(config.num_hidden_layers): + + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query_proj.weight", + f"encoder.layer.{layer_index}.attention.self.query_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query_proj.bias", + f"encoder.layer.{layer_index}.attention.self.query_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key_proj.weight", + f"encoder.layer.{layer_index}.attention.self.key_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key_proj.bias", + f"encoder.layer.{layer_index}.attention.self.key_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value_proj.weight", + f"encoder.layer.{layer_index}.attention.self.value_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value_proj.bias", + f"encoder.layer.{layer_index}.attention.self.value_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layer.{layer_index}.attention.output.dense.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layer.{layer_index}.attention.output.dense.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layer.{layer_index}.intermediate.dense.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.bias", + f"encoder.layer.{layer_index}.intermediate.dense.bias", + ], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layer.{layer_index}.output.dense.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layer.{layer_index}.output.dense.bias"], + [ + f"encoder.layer.{layer_index}.output.LayerNorm.weight", + f"encoder.layer.{layer_index}.output.LayerNorm.weight", + ], + [ + f"encoder.layer.{layer_index}.output.LayerNorm.bias", + f"encoder.layer.{layer_index}.output.LayerNorm.bias", + ], + ] + model_mappings.extend(layer_mappings) + # adapt for hf-tiny-model-private/tiny-random-DebertaV2Model + if config.architectures is not None and "DebertaV2Model" in config.architectures: + pass + else: + for mapping in model_mappings: + mapping[0] = "deberta." + mapping[0] + mapping[1] = "deberta." 
+ mapping[1] + if config.architectures is not None and "DebertaV2ForQuestionAnswering" in config.architectures: + model_mappings.extend( + [["qa_outputs.weight", "qa_outputs.weight", "transpose"], ["qa_outputs.bias", "qa_outputs.bias"]] + ) + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class DebertaV2Model(DebertaV2PreTrainedModel): + def __init__(self, config: DebertaV2Config): + super(DebertaV2Model, self).__init__(config) + self.config = config + self.embeddings = DebertaV2Embeddings(config) + self.encoder = DebertaV2Encoder(config) + self.z_steps = getattr(config, "z_steps", 0) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = paddle.ones(input_shape, dtype="int64") + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + output_hidden_states=True, + output_attentions=output_attentions, + return_dict=return_dict, + ) + if not return_dict: + encoded_layers = encoder_outputs[1] + else: + encoded_layers = encoder_outputs.hidden_states + + if self.z_steps > 1: + hidden_states = encoded_layers[-2] + layers = [self.encoder.layer[-1] for _ in range(self.z_steps)] + query_states = encoded_layers[-1] + rel_embeddings = self.encoder.get_rel_embedding() + attention_mask = self.encoder.get_attention_mask(attention_mask) + rel_pos = self.encoder.get_rel_pos(embedding_output) + for layer in layers[1:]: + query_states = layer( + hidden_states, + attention_mask, + output_attentions=False, + query_states=query_states, + relative_pos=rel_pos, + rel_embeddings=rel_embeddings, + ) + encoded_layers.append(query_states) + + sequence_output = 
encoded_layers[-1] + + if not return_dict: + return (sequence_output,) + encoder_outputs[(1 if output_hidden_states else 2) :] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states if output_hidden_states else None, + attentions=encoder_outputs.attentions, + ) + + +class DebertaV2PredictionHeadTransform(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class DebertaV2LMPredictionHead(nn.Layer): + def __init__(self, config): + super().__init__() + self.transform = DebertaV2PredictionHeadTransform(config) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + self.bias = paddle.create_parameter( + shape=[config.vocab_size], default_initializer=nn.initializer.Constant(0.0), dtype="float32" + ) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class DebertaV2OnlyMLMHead(nn.Layer): + def __init__(self, config): + super().__init__() + self.predictions = DebertaV2LMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.deberta = DebertaV2Model(config) + self.cls = DebertaV2OnlyMLMHead(config) + + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.deberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.reshape(-1, self.config.vocab_size), labels.reshape(-1)) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ContextPooler(nn.Layer): + def __init__(self, config): + super().__init__() + hidden_size = config.pooler_hidden_size if config.pooler_hidden_size is not None else config.hidden_size + self.dense = 
nn.Linear(config.hidden_size, hidden_size) + self.dropout = StableDropout(config.pooler_dropout) + self.config = config + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + context_token = hidden_states[:, 0, :] + context_token = self.dropout(context_token) + pooled_output = self.dense(context_token) + pooled_output = F.gelu(pooled_output) + return pooled_output + + @property + def output_dim(self): + return self.config.hidden_size + + +class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.deberta = DebertaV2Model(config) + + self.pooler = ContextPooler(config) + output_dim = self.pooler.output_dim if self.pooler is not None else config.hidden_size + self.classifier = nn.Linear(output_dim, config.num_labels) + + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + + self.dropout = StableDropout(drop_out) + + def get_input_embeddings(self): + return self.deberta.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + return self.deberta.set_input_embeddings(new_embeddings) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = self.pooler(outputs[0]) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + loss_fct = paddle.nn.MSELoss() + loss = loss_fct(logits, labels) + elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + else: + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.deberta = DebertaV2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + 
output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,)))
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output)
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.deberta = DebertaV2Model(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, 2)
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.deberta(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        logits = paddle.transpose(logits, perm=[2, 0, 1])
+        start_logits, end_logits = paddle.unstack(x=logits, axis=0)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # On multi-GPU, the positions may carry an extra trailing dimension; squeeze it away.
+            if start_positions.ndim > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.ndim > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Positions outside the model inputs are clamped to `ignored_index` and excluded from the loss.
+            ignored_index = start_logits.shape[1]
+            start_positions = start_positions.clip(0, ignored_index)
+            end_positions = end_positions.clip(0, ignored_index)
+
+            loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class DebertaV2ForMultipleChoice(DebertaV2PreTrainedModel):
+
+    """
+    DeBERTa-v2 Model with a multiple-choice classification head on top (a context pooler
+    and a linear layer on the pooled output), designed for multiple-choice tasks like
+    RocStories/SWAG.
+
+    Args:
+        config (:class:`DebertaV2Config`):
+            An instance of DebertaV2Config used to construct DebertaV2ForMultipleChoice.
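+
+    Example (a minimal sketch; it assumes `DebertaV2Config` and `DebertaV2ForMultipleChoice`
+    are importable from `paddlenlp.transformers`, builds a tiny randomly initialized model
+    instead of loading a released checkpoint, and passes the pooler/classifier settings
+    explicitly so it does not rely on particular config defaults):
+        .. code-block::
+
+            import paddle
+            from paddlenlp.transformers import DebertaV2Config, DebertaV2ForMultipleChoice
+
+            # A deliberately small configuration; no checkpoint download is involved.
+            config = DebertaV2Config(
+                vocab_size=1000,
+                hidden_size=128,
+                num_hidden_layers=2,
+                num_attention_heads=4,
+                intermediate_size=256,
+                pooler_hidden_size=128,
+                pooler_dropout=0.0,
+                classifier_dropout=0.1,
+            )
+            model = DebertaV2ForMultipleChoice(config)
+
+            # input_ids is expected with shape [batch_size, num_choices, sequence_length].
+            input_ids = paddle.randint(low=0, high=config.vocab_size, shape=[2, 2, 16])
+            logits = model(input_ids=input_ids, return_dict=False)
+            print(logits.shape)  # [2, 2]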
+ """ + + def __init__(self, config: DebertaV2Config): + super(DebertaV2ForMultipleChoice, self).__init__(config) + self.deberta = DebertaV2Model(config) + self.dropout = StableDropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.pooler = ContextPooler(config) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_weights) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + r""" + The DebertaForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + inputs_embeds (list, optional): + See :class:`DebertaModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BertForMultipleChoice, BertTokenizer + from paddlenlp.data import Pad, Dict + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None: + num_choices = input_ids.shape[1] + elif inputs_embeds is not None: + num_choices = inputs_embeds.shape[1] + + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape((-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + if inputs_embeds is not None + else None + ) + position_ids = position_ids.reshape((-1, position_ids.shape[-1])) if position_ids is not None else None + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) if token_type_ids is not None else None + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) if attention_mask is not None else None + + outputs = self.deberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = self.pooler(outputs[0]) + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape((-1, num_choices)) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/tokenizer.py new file mode 100644 index 000000000..a302fcfe7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/deberta_v2/tokenizer.py @@ -0,0 +1,587 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import unicodedata +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as sp + +from .. import AddedToken, PretrainedTokenizer + +__all__ = ["DebertaV2Tokenizer"] + + +PRETRAINED_VOCAB_FILES_MAP = { + "sentencepiece_model_file": { + "microsoft/deberta-v2-xlarge": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-v2-xlarge/spm.model", + "microsoft/deberta-v3-base": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-v3-base/spm.model", + "microsoft/deberta-v3-large": "https://paddlenlp.bj.bcebos.com/models/community/microsoft/deberta-v3-large/spm.model", + "deepset/deberta-v3-large-squad2": "https://paddlenlp.bj.bcebos.com/models/community/deepset/deberta-v3-large-squad2/spm.model", + } +} + + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "microsoft/deberta-v2-xlarge": 512, + "microsoft/deberta-v3-base": 512, + "microsoft/deberta-v3-large": 512, + "deepset/deberta-v3-large-squad2": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "microsoft/deberta-v2-xlarge": {"do_lower_case": False}, + "microsoft/deberta-v3-base": {"do_lower_case": False}, + "microsoft/deberta-v3-large": {"do_lower_case": False}, + "deepset/deberta-v3-large-squad2": {"do_lower_case": False}, +} + + +class DebertaV2Tokenizer(PretrainedTokenizer): + r""" + Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `False`): + Whether or not to lowercase the input when tokenizing. + bos_token (`string`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token. + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + eos_token (`string`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. When building a sequence using special tokens, this is not the token that is + used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. 
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + resource_files_names = {"sentencepiece_model_file": "spm.model"} + pretrained_resource_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + sentencepiece_model_file, + vocab_file=None, + do_lower_case=False, + split_by_punct=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + unk_token=unk_token, + ) + self._sentencepiece_model_file = sentencepiece_model_file + if vocab_file is None: + self.vocab_file = sentencepiece_model_file + else: + self.vocab_file = vocab_file + + if not os.path.isfile(self.vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{self.vocab_file}'. 
To load the vocabulary from a Google pretrained" + " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.do_lower_case = do_lower_case + self.split_by_punct = split_by_punct + # breakpoint() + self._tokenizer = SPMTokenizer( + self.vocab_file, + self.all_special_tokens, + split_by_punct=split_by_punct, + sp_model_kwargs=self.sp_model_kwargs, + ) + + @property + def vocab_size(self): + return len(self.vocab) + + @property + def vocab(self): + return self._tokenizer.vocab + + def get_vocab(self): + vocab = self.vocab.copy() + vocab.update(self.get_added_vocab()) + return vocab + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + if self.do_lower_case: + text = text.lower() + return self._tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self._tokenizer.spm.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + return self._tokenizer.decode(tokens) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A DeBERTa sequence has the following format: + + - single sequence: [CLS] X [SEP] + - pair of sequences: [CLS] A [SEP] B [SEP] + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
A DeBERTa + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", False) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix) + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A BERT offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def save_resources(self, save_directory): + """ + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + +class SPMTokenizer: + r""" + Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece). + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. 
+ + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + def __init__( + self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None + ): + self.split_by_punct = split_by_punct + self.vocab_file = vocab_file + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) + if not os.path.exists(vocab_file): + raise FileNotFoundError(f"{vocab_file} does not exist!") + spm.load(vocab_file) + bpe_vocab_size = spm.GetPieceSize() + # Token map + # 0+1 + # 1+1 + # 2+1 + self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)} + self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)] + # self.vocab['[PAD]'] = 0 + # self.vocab['[CLS]'] = 1 + # self.vocab['[SEP]'] = 2 + # self.vocab['[UNK]'] = 3 + + self.spm = spm + self.special_tokens = special_tokens + + def __getstate__(self): + state = self.__dict__.copy() + state["spm"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) + self.spm.Load(self.vocab_file) + + def tokenize(self, text): + return self._encode_as_pieces(text) + + def convert_ids_to_tokens(self, ids): + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def decode(self, tokens, start=-1, end=-1, raw_text=None): + if raw_text is None: + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.spm.decode_pieces(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.spm.decode_pieces(current_sub_tokens) + return out_string.strip() + else: + words = self.split_to_words(raw_text) + word_tokens = [self.tokenize(w) for w in words] + token2words = [0] * len(tokens) + tid = 0 + for i, w in enumerate(word_tokens): + for k, t in enumerate(w): + token2words[tid] = i + tid += 1 + word_start = token2words[start] + word_end = token2words[end] if end < len(tokens) else len(words) + text = "".join(words[word_start:word_end]) + return text + + def add_special_token(self, token): + if token not in self.special_tokens: + self.special_tokens.append(token) + if token not in self.vocab: + self.vocab[token] = len(self.vocab) - 1 + self.ids_to_tokens.append(token) + return self.id(token) + + def part_of_whole_word(self, token, is_bos=False): + if is_bos: + return True + if ( + len(token) == 1 + and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0])) + ) or token in self.special_tokens: + return False + + word_start = b"\xe2\x96\x81".decode("utf-8") + return not token.startswith(word_start) + + def pad(self): + return "[PAD]" + + def bos(self): + return "[CLS]" + + def eos(self): + return "[SEP]" + + def unk(self): + return "[UNK]" + + def mask(self): + return "[MASK]" + + def sym(self, id): + return self.ids_to_tokens[id] + + def id(self, sym): + return self.vocab[sym] if sym in self.vocab else 1 + + def _encode_as_pieces(self, text): + text = convert_to_unicode(text) + if self.split_by_punct: + words = 
self._run_split_on_punc(text) + pieces = [self.spm.encode(w, out_type=str) for w in words] + return [p for w in pieces for p in w] + else: + return self.spm.encode(text, out_type=str) + + def split_to_words(self, text): + pieces = self._encode_as_pieces(text) + word_start = b"\xe2\x96\x81".decode("utf-8") + words = [] + offset = 0 + prev_end = 0 + for i, p in enumerate(pieces): + if p.startswith(word_start): + if offset > prev_end: + words.append(text[prev_end:offset]) + prev_end = offset + w = p.replace(word_start, "") + else: + w = p + try: + s = text.index(w, offset) + pn = "" + k = i + 1 + while k < len(pieces): + pn = pieces[k].replace(word_start, "") + if len(pn) > 0: + break + k += 1 + + if len(pn) > 0 and pn in text[offset:s]: + offset = offset + 1 + else: + offset = s + len(w) + except Exception: + offset = offset + 1 + + if prev_end < offset: + words.append(text[prev_end:offset]) + + return words + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
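+    # These four ranges are the ASCII blocks !"#$%&'()*+,-./ (33-47), :;<=>?@ (58-64),
+    # [\]^_` (91-96) and {|}~ (123-126).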
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError(f"Unsupported string type: {type(text)}") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/configuration.py new file mode 100644 index 000000000..724af5911 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/configuration.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["DISTILBERT_PRETRAINED_INIT_CONFIGURATION", "DistilBertConfig", "DISTILBERT_PRETRAINED_RESOURCE_FILES_MAP"] + +DISTILBERT_PRETRAINED_INIT_CONFIGURATION = { + "distilbert-base-uncased": { + "vocab_size": 30522, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "distilbert-base-cased": { + "vocab_size": 28996, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, +} + +DISTILBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "distilbert-base-uncased": "http://bj.bcebos.com/paddlenlp/models/transformers/distilbert/distilbert-base-uncased.pdparams", + "distilbert-base-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/distilbert/distilbert-base-cased.pdparams", + } +} + + +class DistilBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DistilBertModel`]. It is used to + instantiate a DistilBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + bert-base-uncased architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from paddlenlp.transformers import BertModel, BertConfig + + >>> # Initializing a DistilBERT distilbert-base-uncased style configuration + >>> configuration = DistilBertConfig() + + >>> # Initializing a model from the distilbert-base-uncased style configuration + >>> model = DistilBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "distilbert" + attribute_map: Dict[str, str] = { + "dropout": "classifier_dropout", + "num_classes": "num_labels", + "n_layers": "num_hidden_layers", # for `transformers` + "n_heads": "num_attention_heads", # for `transformers` + "dim": "hidden_size", # for `transformers` + "hidden_dim": "intermediate_size", # for `transformers` + } + pretrained_init_configuration = DISTILBERT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 6, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + fuse: bool = False, + layer_norm_eps=1e-12, + use_cache=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.pool_act = pool_act + self.fuse = fuse + + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/modeling.py new file mode 100644 index 000000000..8c5c04969 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/modeling.py @@ -0,0 +1,585 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +import paddle +import paddle.nn as nn + +from paddlenlp.utils.env import CONFIG_NAME + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. import PretrainedModel, register_base_model +from .configuration import ( + DISTILBERT_PRETRAINED_INIT_CONFIGURATION, + DISTILBERT_PRETRAINED_RESOURCE_FILES_MAP, + DistilBertConfig, +) + +__all__ = [ + "DistilBertModel", + "DistilBertPretrainedModel", + "DistilBertForSequenceClassification", + "DistilBertForTokenClassification", + "DistilBertForQuestionAnswering", + "DistilBertForMaskedLM", +] + + +class BertEmbeddings(nn.Layer): + """ + Includes embeddings from word, position and does not include + token_type embeddings. + """ + + def __init__(self, config: DistilBertConfig): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, position_ids=None): + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + + position_ids = seq_length - ones + position_ids.stop_gradient = True + + input_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + embeddings = input_embeddings + position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class DistilBertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained DistilBert models. It provides DistilBert related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
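+
+    Example (illustrative; "distilbert-base-uncased" is one of the built-in names
+    registered in the configuration maps referenced above):
+        .. code-block::
+
+            from paddlenlp.transformers import DistilBertModel
+
+            # `from_pretrained` resolves the name through `pretrained_init_configuration`
+            # and `pretrained_resource_files_map` and loads the matching weights.
+            model = DistilBertModel.from_pretrained("distilbert-base-uncased")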
+ """ + + pretrained_init_configuration = DISTILBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = DISTILBERT_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "distilbert" + config_class = DistilBertConfig + model_config_file = CONFIG_NAME + + @classmethod + def _get_name_mappings(cls, config: DistilBertConfig) -> List[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + "embeddings.word_embeddings.weight", + "embeddings.position_embeddings.weight", + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"transformer.layer.{layer_index}.attention.q_lin.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"transformer.layer.{layer_index}.attention.q_lin.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"transformer.layer.{layer_index}.attention.k_lin.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"transformer.layer.{layer_index}.attention.k_lin.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"transformer.layer.{layer_index}.attention.v_lin.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"transformer.layer.{layer_index}.attention.v_lin.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"transformer.layer.{layer_index}.attention.out_lin.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"transformer.layer.{layer_index}.attention.out_lin.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"transformer.layer.{layer_index}.sa_layer_norm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"transformer.layer.{layer_index}.sa_layer_norm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"transformer.layer.{layer_index}.output_layer_norm.weight", + f"encoder.layers.{layer_index}.norm2.weight", + ], + [ + f"transformer.layer.{layer_index}.output_layer_norm.bias", + f"encoder.layers.{layer_index}.norm2.bias", + ], + [ + f"transformer.layer.{layer_index}.ffn.lin1.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [ + f"transformer.layer.{layer_index}.ffn.lin1.bias", + f"encoder.layers.{layer_index}.linear1.bias", + ], + [ + f"transformer.layer.{layer_index}.ffn.lin2.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [ + f"transformer.layer.{layer_index}.ffn.lin2.bias", + f"encoder.layers.{layer_index}.linear2.bias", + ], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(model_mappings) + # base-model prefix "DistilBertModel" + if "DistilBertModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "distilbert." + mapping[0] + mapping[1] = "distilbert." 
+ mapping[1] + + # downstream mappings + if "DistilBertForSequenceClassification" in config.architectures: + model_mappings.extend( + [ + ["pre_classifier.weight", None, "transpose"], + "pre_classifier.bias", + ["classifier.weight", None, "transpose"], + "classifier.bias", + ] + ) + + if "DistilBertForTokenClassification" in config.architectures: + model_mappings.extend( + [ + ["classifier.weight", None, "transpose"], + "classifier.bias", + ] + ) + + if "DistilBertForQuestionAnswering" in config.architectures: + model_mappings.extend( + [["qa_outputs.weight", "classifier.weight", "transpose"], ["qa_outputs.bias", "classifier.bias"]] + ) + + init_name_mappings(model_mappings) + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class DistilBertModel(DistilBertPretrainedModel): + """ + The bare DistilBert Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `DistilBertModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `DistilBertModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and the pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + initializer_range (float, optional): + The standard deviation of the normal initializer. 
+ Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`DistilBertPretrainedModel.init_weights()` for how weights are initialized in `DistilBertModel`. + + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + + """ + + def __init__(self, config: DistilBertConfig): + super(DistilBertModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = BertEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + + def forward(self, input_ids, attention_mask=None): + r""" + The DistilBertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + + Returns: + Tensor: Returns tensor `encoder_output`, which means the sequence of hidden-states at the last layer of the model. + Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import DistilBertModel, DistilBertTokenizer + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertModel.from_pretrained('distilbert-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.encoder.layers[0].norm1.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype( + self.encoder.layers[0].norm1.weight.dtype + ) + attention_mask = (1.0 - attention_mask) * -1e4 + embedding_output = self.embeddings(input_ids=input_ids) + encoder_outputs = self.encoder(embedding_output, attention_mask) + return encoder_outputs + + +class DistilBertForSequenceClassification(DistilBertPretrainedModel): + """ + DistilBert Model with a linear layer on top of the output layer, designed for + sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`DistilBertConfig`): + An instance of DistilBertConfig used to construct DistilBertForSequenceClassification. + """ + + def __init__(self, config: DistilBertConfig): + super(DistilBertForSequenceClassification, self).__init__(config) + self.num_classes = config.num_labels + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.ReLU() + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_classes) + + def forward(self, input_ids, attention_mask=None): + r""" + The DistilBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`DistilBertModel`. + attention_mask (list, optional): + See :class:`DistilBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.distilbert.modeling import DistilBertForSequenceClassification + from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + + distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask) + + pooled_output = distilbert_output[:, 0] + pooled_output = self.pre_classifier(pooled_output) + pooled_output = self.activation(pooled_output) + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + return logits + + +class DistilBertForQuestionAnswering(DistilBertPretrainedModel): + """ + DistilBert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. 
+ + Args: + config (:class:`DistilBertConfig`): + An instance of DistilBertConfig used to construct DistilBertForQuestionAnswering. + """ + + def __init__(self, config: DistilBertConfig): + super(DistilBertForQuestionAnswering, self).__init__(config) + self.distilbert = DistilBertModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, attention_mask=None): + r""" + The DistilBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`DistilBertModel`. + attention_mask (list, optional): + See :class:`DistilBertModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - start_logits(Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - end_logits(Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.distilbert.modeling import DistilBertForQuestionAnswering + from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits =outputs[1] + """ + + sequence_output = self.distilbert(input_ids, attention_mask=attention_mask) + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + return start_logits, end_logits + + +class DistilBertForTokenClassification(DistilBertPretrainedModel): + """ + DistilBert Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`DistilBertConfig`): + An instance of DistilBertConfig used to construct DistilBertForTokenClassification. + """ + + def __init__(self, config: DistilBertConfig): + super(DistilBertForTokenClassification, self).__init__(config) + self.num_classes = config.num_labels + self.distilbert = DistilBertModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, attention_mask=None): + r""" + The DistilBertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`DistilBertModel`. + attention_mask (list, optional): + See :class:`DistilBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.distilbert.modeling import DistilBertForTokenClassification + from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + + sequence_output = self.distilbert(input_ids, attention_mask=attention_mask) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits + + +class DistilBertForMaskedLM(DistilBertPretrainedModel): + """ + DistilBert Model with a `language modeling` head on top. + + Args: + config (:class:`DistilBertConfig`): + An instance of DistilBertConfig used to construct DistilBertForMaskedLM + """ + + def __init__(self, config: DistilBertConfig): + super(DistilBertForMaskedLM, self).__init__(config) + self.distilbert = DistilBertModel(config) + self.vocab_transform = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.GELU() + self.vocab_layer_norm = nn.LayerNorm(config.hidden_size) + self.vocab_projector = nn.Linear(config.hidden_size, config.vocab_size) + + def forward(self, input_ids=None, attention_mask=None): + r""" + The DistilBertForMaskedLM forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`DistilBertModel`. + attention_mask (Tensor, optional): + See :class:`DistilBertModel`. + + Returns: + Tensor: Returns tensor `prediction_logits`, the scores of masked token prediction. + Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import DistilBertForMaskedLM, DistilBertTokenizer + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + prediction_logits = model(**inputs) + """ + + distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask) + prediction_logits = self.vocab_transform(distilbert_output) + prediction_logits = self.activation(prediction_logits) + prediction_logits = self.vocab_layer_norm(prediction_logits) + prediction_logits = self.vocab_projector(prediction_logits) + return prediction_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/tokenizer.py new file mode 100644 index 000000000..c97aad81f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distilbert/tokenizer.py @@ -0,0 +1,73 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..bert.tokenizer import BertTokenizer + +__all__ = ["DistilBertTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"distilbert-base-uncased": 512, "distilbert-base-cased": 512} + + +class DistilBertTokenizer(BertTokenizer): + """ + Constructs a DistilBert tokenizer. The usage of DistilBertTokenizer is the same as + `BertTokenizer `__. + For more information regarding those methods, please refer to this superclass. + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "distilbert-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/distilbert/distilbert-base-uncased-vocab.txt", + "distilbert-base-cased": "https://bj.bcebos.com/paddlenlp/models/transformers/distilbert/distilbert-base-cased-vocab.txt", + } + } + pretrained_init_configuration = { + "distilbert-base-uncased": {"do_lower_case": True}, + "distilbert-base-cased": {"do_lower_case": False}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __call__( + self, + text, + text_pair=None, + max_seq_len=None, + stride=0, + is_split_into_words=False, + pad_to_max_seq_len=False, + truncation_strategy="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + ): + return super(DistilBertTokenizer, self).__call__( + text, + text_pair, + max_seq_len, + stride, + is_split_into_words, + pad_to_max_seq_len, + truncation_strategy, + return_position_ids, + return_token_type_ids, + return_attention_mask, + return_length, + return_overflowing_tokens, + return_special_tokens_mask, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distill_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distill_utils.py new file mode 100644 index 000000000..2be090d12 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/distill_utils.py @@ -0,0 +1,397 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
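Before the distillation utilities that follow, a brief note on the tokenizer defined above: `DistilBertTokenizer.__call__` only forwards its arguments to `BertTokenizer.__call__`, so it returns the same encoding dictionary. A minimal sketch of the expected behaviour, assuming the `distilbert-base-uncased` vocabulary can be fetched (the sample sentence and flag values are illustrative):

    from paddlenlp.transformers import DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

    # The return_* switches opt into extra fields of the encoding dictionary;
    # with the defaults shown in __call__ above, only `input_ids` is requested.
    encoded = tokenizer(
        "Welcome to use PaddlePaddle and PaddleNLP!",
        max_seq_len=32,
        return_attention_mask=True,
    )
    print(encoded["input_ids"])
    print(encoded["attention_mask"])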
+ +import math + +import paddle +from paddle import tensor +import paddle.nn.functional as F +from paddle.nn import MultiHeadAttention, TransformerEncoderLayer, TransformerEncoder +from paddle.common_ops_import import convert_dtype + +from paddlenlp.utils.log import logger +from paddlenlp.transformers import PPMiniLMForSequenceClassification +from paddlenlp.transformers import TinyBertForPretraining +from paddlenlp.transformers import BertForSequenceClassification + +__all__ = ["to_distill", "calc_minilm_loss", "calc_multi_relation_loss"] + + +def calc_multi_relation_loss(loss_fct, s, t, attn_mask, num_relation_heads=0, alpha=0.0, beta=0.0): + """ + Calculates loss for multiple Q-Q, K-K and V-V relation. It supports + head-head relation, sample-sample relation and origin token-token relation. + The final loss value could be balanced by weight `alpha` and `beta`. + + Args: + loss_fct (callable): + Loss function for distillation. It only supports kl_div loss now. + s (Tensor): + Q, K, V of Student. + t (Tensor): + Q, K, V of teacher. + attn_mask (Tensor): + Attention mask for relation. + num_relation_heads (int): + The number of relation heads. 0 means `num_relation_heads` equals + to origin head num. + Defaults to 0. + alpha (float): + The weight for head-head relation. + Defaults to 0.0. + beta (float): + The weight for sample-sample relation. + Defaults to 0.0. + + Returns: + Tensor: Weighted loss of token-token loss, head-head loss and + sample-sample loss. + + """ + # Initialize head_num + if num_relation_heads > 0 and num_relation_heads != s.shape[1]: + # s'shape: [bs, seq_len, head_num, head_dim] + s = tensor.transpose(x=s, perm=[0, 2, 1, 3]) + # s'shape: [bs, seq_len, num_relation_heads, head_dim_new] + s = tensor.reshape(x=s, shape=[0, 0, num_relation_heads, -1]) + s1 = tensor.transpose(x=s, perm=[0, 2, 1, 3]) + if num_relation_heads > 0 and num_relation_heads != t.shape[1]: + t = tensor.transpose(x=t, perm=[0, 2, 1, 3]) + t = tensor.reshape(x=t, shape=[0, 0, num_relation_heads, -1]) + t1 = tensor.transpose(x=t, perm=[0, 2, 1, 3]) + + s_head_dim, t_head_dim = s.shape[3], t.shape[3] + + if alpha + beta == 1.0: + loss_token_token = 0.0 + else: + scaled_dot_product_s1 = tensor.matmul(x=s1, y=s1, transpose_y=True) / math.sqrt(s_head_dim) + del s1 + scaled_dot_product_s1 += attn_mask + scaled_dot_product_t1 = tensor.matmul(x=t1, y=t1, transpose_y=True) / math.sqrt(t_head_dim) + del t1 + scaled_dot_product_t1 += attn_mask + loss_token_token = loss_fct(F.log_softmax(scaled_dot_product_s1), F.softmax(scaled_dot_product_t1)) + + if alpha == 0.0: + loss_head_head = 0.0 + else: + scaled_dot_product_s = tensor.matmul(x=s, y=s, transpose_y=True) / math.sqrt(s_head_dim) + attn_mask_head_head = tensor.transpose(x=attn_mask, perm=[0, 3, 1, 2]) + + scaled_dot_product_s += attn_mask_head_head + scaled_dot_product_t = tensor.matmul(x=t, y=t, transpose_y=True) / math.sqrt(t_head_dim) + scaled_dot_product_t += attn_mask_head_head + loss_head_head = loss_fct(F.log_softmax(scaled_dot_product_s), F.softmax(scaled_dot_product_t)) + if beta == 0.0: + loss_sample_sample = 0.0 + else: + s2 = tensor.transpose(x=s, perm=[1, 2, 0, 3]) + scaled_dot_product_s2 = tensor.matmul(x=s2, y=s2, transpose_y=True) / math.sqrt(s_head_dim) + + del s, s2 + # Shape: [seq_len, 1, batch_size, 1] + attn_mask_sample_sample = tensor.transpose(x=attn_mask, perm=[3, 1, 0, 2]) + + # Shape: [seq_len, head_num, batch_size, batch_size] + scaled_dot_product_s2 += attn_mask_sample_sample + t2 = tensor.transpose(x=t, perm=[1, 2, 0, 3]) + 
scaled_dot_product_t2 = tensor.matmul(x=t2, y=t2, transpose_y=True) / math.sqrt(t_head_dim) + + del t, t2 + scaled_dot_product_t2 += attn_mask_sample_sample + loss_sample_sample = loss_fct(F.log_softmax(scaled_dot_product_s2), F.softmax(scaled_dot_product_t2)) + + return (1 - alpha - beta) * loss_token_token + alpha * loss_head_head + beta * loss_sample_sample + + +def calc_minilm_loss(loss_fct, s, t, attn_mask, num_relation_heads=0): + """ + Calculates loss for Q-Q, K-K, V-V relation from MiniLMv2. + Args: + loss_fct (callable): + Loss function for distillation. It only supports kl_div loss now. + s (Tensor): + Q, K, V of Student. + t (Tensor): + Q, K, V of teacher. + attn_mask (Tensor): + Attention mask for relation. + num_relation_heads (int): + The number of relation heads. 0 means `num_relation_heads` equals + to origin head num. + Defaults to 0. + + Returns: + Tensor: MiniLM loss value. + + """ + # Initialize head_num + if num_relation_heads > 0 and num_relation_heads != s.shape[1]: + # s'shape: [bs, seq_len, head_num, head_dim] + s = tensor.transpose(x=s, perm=[0, 2, 1, 3]) + # s'shape: [bs, seq_len, num_relation_heads, head_dim_new] + s = tensor.reshape(x=s, shape=[0, 0, num_relation_heads, -1]) + # s' shape: [bs, num_relation_heads, seq_len, head_dim_new] + s = tensor.transpose(x=s, perm=[0, 2, 1, 3]) + if num_relation_heads > 0 and num_relation_heads != t.shape[1]: + t = tensor.transpose(x=t, perm=[0, 2, 1, 3]) + t = tensor.reshape(x=t, shape=[0, 0, num_relation_heads, -1]) + t = tensor.transpose(x=t, perm=[0, 2, 1, 3]) + + s_head_dim, t_head_dim = s.shape[3], t.shape[3] + scaled_dot_product_s = tensor.matmul(x=s, y=s, transpose_y=True) / math.sqrt(s_head_dim) + del s + scaled_dot_product_s += attn_mask + + scaled_dot_product_t = tensor.matmul(x=t, y=t, transpose_y=True) / math.sqrt(t_head_dim) + del t + scaled_dot_product_t += attn_mask + loss = loss_fct(F.log_softmax(scaled_dot_product_s), F.softmax(scaled_dot_product_t)) + return loss + + +def to_distill(self, return_qkv=False, return_attentions=False, return_layer_outputs=False, layer_index=-1): + """ + Can be bound to object with transformer encoder layers, and make model + expose attributes `outputs.q`, `outputs.k`, `outputs.v`, + `outputs.scaled_qks`, `outputs.hidden_states`and `outputs.attentions` of + the object for distillation. + It could be returned intermediate tensor using in MiniLM and TinyBERT + strategy. + """ + logger.warning("`to_distill` is an experimental API and subject to change.") + MultiHeadAttention._forward = attention_forward + TransformerEncoderLayer._forward = transformer_encoder_layer_forward + TransformerEncoder._forward = transformer_encoder_forward + BertForSequenceClassification._forward = bert_forward + + if return_qkv: + # forward function of student class should be replaced for distributed training. 
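+ # When the student is wrapped in paddle.DataParallel, the distillation targets must
+ # come out of the wrapped top-level forward call, so the class-level `_forward` is
+ # swapped for `minilm_pretraining_forward`, which returns q, k and v directly.
+ # (`_forward` is only bound to `forward` per instance in `init_func` below.)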
+ TinyBertForPretraining._forward = minilm_pretraining_forward + PPMiniLMForSequenceClassification._forward = minilm_pretraining_forward + else: + TinyBertForPretraining._forward = tinybert_forward + + def init_func(layer): + if isinstance( + layer, + ( + MultiHeadAttention, + TransformerEncoderLayer, + TransformerEncoder, + TinyBertForPretraining, + BertForSequenceClassification, + PPMiniLMForSequenceClassification, + ), + ): + layer.forward = layer._forward + if isinstance(layer, TransformerEncoder): + layer.return_layer_outputs = return_layer_outputs + layer.layer_index = layer_index + if isinstance(layer, MultiHeadAttention): + layer.return_attentions = return_attentions + layer.return_qkv = return_qkv + + for layer in self.children(): + layer.apply(init_func) + + base_model_prefix = ( + self._layers.base_model_prefix if isinstance(self, paddle.DataParallel) else self.base_model_prefix + ) + + # For distribute training + if isinstance(self, paddle.DataParallel): + if hasattr(self._layers, base_model_prefix): + self.outputs = getattr(self._layers, base_model_prefix).encoder + else: + self.outputs = self._layers.encoder + else: + if hasattr(self, base_model_prefix): + self.outputs = getattr(self, base_model_prefix).encoder + else: + self.outputs = self.encoder + return self + + +def _convert_attention_mask(attn_mask, dtype): + if attn_mask is not None and attn_mask.dtype != dtype: + attn_mask_dtype = convert_dtype(attn_mask.dtype) + if attn_mask_dtype == "bool" or "int" in attn_mask_dtype: + attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e9 + else: + attn_mask = paddle.cast(attn_mask, dtype) + return attn_mask + + +def attention_forward(self, query, key=None, value=None, attn_mask=None, cache=None): + """ + Redefines the `forward` function of `paddle.nn.MultiHeadAttention`. + """ + key = query if key is None else key + value = query if value is None else value + # Computes q ,k ,v + if cache is None: + q, k, v = self._prepare_qkv(query, key, value, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, cache) + + # Scale dot product attention + product = tensor.matmul(x=q, y=k, transpose_y=True) + product /= math.sqrt(self.head_dim) + + if attn_mask is not None: + # Support bool or int mask + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + + self.attention_matrix = product if self.return_attentions else None + weights = F.softmax(product) + if self.dropout: + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") + + out = tensor.matmul(weights, v) + if self.return_qkv: + self.q = q + self.k = k + self.v = v + + # Combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # Project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) + + +def transformer_encoder_layer_forward(self, src, src_mask=None, cache=None): + """ + Redefines the `forward` function of `paddle.nn.TransformerEncoderLayer`. 
+ """ + src_mask = _convert_attention_mask(src_mask, src.dtype) + + residual = src + if self.normalize_before: + src = self.norm1(src) + # Add cache for encoder for the usage like UniLM + if cache is None: + src = self.self_attn(src, src, src, src_mask) + else: + src, incremental_cache = self.self_attn(src, src, src, src_mask, cache) + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + if hasattr(self.self_attn, "attention_matrix"): + self.attention_matrix = self.self_attn.attention_matrix + if hasattr(self.self_attn, "q"): + self.q = self.self_attn.q + self.k = self.self_attn.k + self.v = self.self_attn.v + return src if cache is None else (src, incremental_cache) + + +def transformer_encoder_forward(self, src, src_mask=None, cache=None): + """ + Redefines the `forward` function of `paddle.nn.TransformerEncoder`. + """ + src_mask = _convert_attention_mask(src_mask, src.dtype) + + output = src + new_caches = [] + + self.attentions = [] + self.hidden_states = [] + + for i, mod in enumerate(self.layers): + if self.return_layer_outputs: + self.hidden_states.append(output) + if cache is None: + output = mod(output, src_mask=src_mask) + else: + output, new_cache = mod(output, src_mask=src_mask, cache=cache[i]) + new_caches.append(new_cache) + if hasattr(mod, "attention_matrix"): + self.attentions.append(mod.attention_matrix) + if i == self.layer_index and hasattr(mod, "q"): + self.q = mod.q + self.k = mod.k + self.v = mod.v + + if self.norm is not None: + output = self.norm(output) + if self.return_layer_outputs: + self.hidden_states.append(output) + return output if cache is None else (output, new_caches) + + +def minilm_pretraining_forward(self, input_ids, token_type_ids=None, attention_mask=None): + """ + Replaces `forward` function while using multi gpus to train. If training on + single GPU, this `forward` could not be replaced. + The type of `self` should inherit from base class of pretrained LMs, such as + `TinyBertForPretraining`. + Strategy MINILM only needs q, k and v of transformers. + """ + assert hasattr(self, self.base_model_prefix), "Student class should inherit from %s" % (self.base_model_class) + model = getattr(self, self.base_model_prefix) + encoder = model.encoder + + sequence_output, pooled_output = model(input_ids, token_type_ids, attention_mask) + return encoder.q, encoder.k, encoder.v + + +def tinybert_forward(self, input_ids, token_type_ids=None, attention_mask=None): + """ + Replaces `forward` function while using multi gpus to train. 
+ """ + assert hasattr(self, self.base_model_prefix), "Student class should inherit from %s" % (self.base_model_class) + model = getattr(self, self.base_model_prefix) + encoder = model.encoder + + sequence_output, pooled_output = model(input_ids, token_type_ids, attention_mask) + for i in range(len(encoder.hidden_states)): + # While using tinybert-4l-312d, tinybert-6l-768d, tinybert-4l-312d-zh, + # tinybert-6l-768d-zh + # While using tinybert-4l-312d-v2, tinybert-6l-768d-v2 + # encoder.hidden_states[i] = self.tinybert.fit_dense(encoder.hidden_states[i]) + encoder.hidden_states[i] = self.tinybert.fit_denses[i](encoder.hidden_states[i]) + + return encoder.attentions, encoder.hidden_states + + +def bert_forward(self, input_ids, token_type_ids=None, attention_mask=None): + """ + Replaces `forward` function while using multi gpus to train. + """ + assert hasattr(self, self.base_model_prefix), "Student class should inherit from %s" % (self.base_model_class) + model = getattr(self, self.base_model_prefix) + encoder = model.encoder + + sequence_output, pooled_output = model(input_ids, token_type_ids, attention_mask) + return encoder.attentions, encoder.hidden_states diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/configuration.py new file mode 100644 index 000000000..e27221da2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/configuration.py @@ -0,0 +1,226 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" DPT model configuration""" + +import copy + +from ...utils.log import logger +from ..bit.configuration import BitConfig +from ..configuration_utils import PretrainedConfig + +__all__ = ["DPTConfig"] + + +class DPTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the DPT + [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 384): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + backbone_out_indices (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`): + Indices of the intermediate hidden states to use from backbone. + readout_type (`str`, *optional*, defaults to `"project"`): + The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of + the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`]. + - "ignore" simply ignores the CLS token. + - "add" passes the information from the CLS token to all other tokens by adding the representations. + - "project" passes information to the other tokens by concatenating the readout to all other tokens before + projecting the + representation to the original feature dimension D using a linear layer followed by a GELU non-linearity. + is_hybrid (`bool`, *optional*, defaults to `False`): + Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models. + reassemble_factors (`List[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`): + The up/downsampling factors of the reassemble layers. 
+ neck_hidden_sizes (`List[str]`, *optional*, defaults to [96, 192, 384, 768]): + The hidden sizes to project to for the feature maps of the backbone. + fusion_hidden_size (`int`, *optional*, defaults to 256): + The number of channels before fusion. + head_in_index (`int`, *optional*, defaults to -1): + The index of the features to use in the heads. + use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`): + Whether to use batch normalization in the pre-activate residual units of the fusion blocks. + use_auxiliary_head (`bool`, *optional*, defaults to `True`): + Whether to use an auxiliary head during training. + auxiliary_loss_weight (`float`, *optional*, defaults to 0.4): + Weight of the cross-entropy loss of the auxiliary head. + semantic_loss_ignore_index (`int`, *optional*, defaults to 255): + The index that is ignored by the loss function of the semantic segmentation model. + semantic_classifier_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the semantic classification head. + backbone_featmap_shape (`List[int]`, *optional*, defaults to `[1, 1024, 24, 24]`): + Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone. + neck_ignore_stages (`List[int]`, *optional*, defaults to `[0, 1]`): + Used only for the `hybrid` embedding type. The stages of the readout layers to ignore. + backbone_config (`Union[Dict[str, Any], PretrainedConfig]`, *optional*): + Used only for the `hybrid` embedding type. The configuration of the backbone in a dictionary. + + Examples: + + ```python + >>> from paddlenlp.transformers import DPTModel, DPTConfig + + >>> # Initializing a DPT dpt-large style configuration + >>> configuration = DPTConfig() + + >>> # Initializing a model from the dpt-large style configuration + >>> model = DPTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "dpt" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=384, + patch_size=16, + num_channels=3, + is_hybrid=False, + qkv_bias=True, + backbone_out_indices=[2, 5, 8, 11], + readout_type="project", + reassemble_factors=[4, 2, 1, 0.5], + neck_hidden_sizes=[96, 192, 384, 768], + fusion_hidden_size=256, + head_in_index=-1, + use_batch_norm_in_fusion_residual=False, + use_auxiliary_head=True, + auxiliary_loss_weight=0.4, + semantic_loss_ignore_index=255, + semantic_classifier_dropout=0.1, + backbone_featmap_shape=[1, 1024, 24, 24], + neck_ignore_stages=[0, 1], + backbone_config=None, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.is_hybrid = is_hybrid + + if self.is_hybrid: + if backbone_config is None: + logger.info("Initializing the config with a `BiT` backbone.") + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage1", "stage2", "stage3"], + "embedding_dynamic_padding": True, + } + self.backbone_config = BitConfig(**backbone_config) + elif isinstance(backbone_config, dict): + logger.info("Initializing the config with a `BiT` backbone.") + self.backbone_config = BitConfig(**backbone_config) + elif isinstance(backbone_config, PretrainedConfig): + self.backbone_config = backbone_config + else: + raise 
ValueError( + f"backbone_config must be a dictionary or a `PretrainedConfig`, got {backbone_config.__class__}." + ) + + self.backbone_featmap_shape = backbone_featmap_shape + self.neck_ignore_stages = neck_ignore_stages + + if readout_type != "project": + raise ValueError("Readout type must be 'project' when using `DPT-hybrid` mode.") + else: + self.backbone_config = None + self.backbone_featmap_shape = None + self.neck_ignore_stages = [] + + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.backbone_out_indices = backbone_out_indices + if readout_type not in ["ignore", "add", "project"]: + raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") + self.readout_type = readout_type + self.reassemble_factors = reassemble_factors + self.neck_hidden_sizes = neck_hidden_sizes + self.fusion_hidden_size = fusion_hidden_size + self.head_in_index = head_in_index + self.use_batch_norm_in_fusion_residual = use_batch_norm_in_fusion_residual + # auxiliary head attributes (semantic segmentation) + self.use_auxiliary_head = use_auxiliary_head + self.auxiliary_loss_weight = auxiliary_loss_weight + self.semantic_loss_ignore_index = semantic_loss_ignore_index + self.semantic_classifier_dropout = semantic_classifier_dropout + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + + if output["backbone_config"] is not None: + output["backbone_config"] = self.backbone_config.to_dict() + + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/image_processing.py new file mode 100644 index 000000000..cff5d4969 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/image_processing.py @@ -0,0 +1,373 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
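Since the hybrid branch above is the only part of `DPTConfig` with non-trivial control flow, a short sketch of how it behaves may help. The printed values mirror the defaults documented in the class, and the import path assumes the package is installed as `paddlenlp`:

    from paddlenlp.transformers import DPTConfig

    # Plain ViT backbone: `is_hybrid` defaults to False, so no backbone config is kept.
    plain = DPTConfig()
    assert plain.backbone_config is None and plain.neck_ignore_stages == []

    # Hybrid mode: a BiT backbone config is synthesised from the built-in defaults
    # when none is supplied, and `readout_type` must remain "project".
    hybrid = DPTConfig(is_hybrid=True)
    print(type(hybrid.backbone_config).__name__)  # BitConfig
    print(hybrid.backbone_featmap_shape)          # [1, 1024, 24, 24]

    # `to_dict()` re-serialises the nested backbone config, so the result stays JSON-friendly.
    assert isinstance(hybrid.to_dict()["backbone_config"], dict)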
+"""Image processor class for DPT.""" + +import math +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn.functional as F +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import normalize, rescale, resize, to_channel_dimension_format +from ..image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = ["DPTImageProcessor"] + + +def get_resize_output_image_size( + input_image: np.ndarray, output_size: Union[int, Iterable[int]], keep_aspect_ratio: bool, multiple: int +) -> Tuple[int, int]: + def constraint_to_multiple_of(val, multiple, min_val=0, max_val=None): + x = round(val / multiple) * multiple + + if max_val is not None and x > max_val: + x = math.floor(val / multiple) * multiple + + if x < min_val: + x = math.ceil(val / multiple) * multiple + + return x + + output_size = (output_size, output_size) if isinstance(output_size, int) else output_size + + input_height, input_width = get_image_size(input_image) + output_height, output_width = output_size + + # determine new height and width + scale_height = output_height / input_height + scale_width = output_width / input_width + + if keep_aspect_ratio: + # scale as little as possible + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + + new_height = constraint_to_multiple_of(scale_height * input_height, multiple=multiple) + new_width = constraint_to_multiple_of(scale_width * input_width, multiple=multiple) + + return (new_height, new_width) + + +class DPTImageProcessor(BaseImageProcessor): + r""" + Constructs a DPT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions. Can be overidden by `do_resize` in `preprocess`. + size (`Dict[str, int]` *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the image after resizing. Can be overidden by `size` in `preprocess`. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. Can + be overidden by `keep_aspect_ratio` in `preprocess`. + ensure_multiple_of (`int`, *optional*, defaults to `1`): + If `do_resize` is `True`, the image is resized to a size that is a multiple of this value. Can be overidden + by `ensure_multiple_of` in `preprocess`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): + Defines the resampling filter to use if resizing the image. Can be overidden by `resample` in `preprocess`. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overidden by `do_rescale` in + `preprocess`. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overidden by `rescale_factor` in `preprocess`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. 
+ image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + keep_aspect_ratio: bool = False, + ensure_multiple_of: int = 1, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size) + self.do_resize = do_resize + self.size = size + self.keep_aspect_ratio = keep_aspect_ratio + self.ensure_multiple_of = ensure_multiple_of + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + keep_aspect_ratio: bool = False, + ensure_multiple_of: int = 1, + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image to target size `(size["height"], size["width"])`. If `keep_aspect_ratio` is `True`, the image + is resized to the largest possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is + set, the image is resized to a size that is a multiple of this value. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Target size of the output image. + keep_aspect_ratio (`bool`, *optional*, defaults to `False`): + If `True`, the image is resized to the largest possible size such that the aspect ratio is preserved. + ensure_multiple_of (`int`, *optional*, defaults to `1`): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Defines the resampling filter to use if resizing the image. Otherwise, the image is resized to size + specified in `size`. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The size dictionary must contain the keys 'height' and 'width'. 
Got {size.keys()}") + output_size = get_resize_output_image_size( + image, + output_size=(size["height"], size["width"]), + keep_aspect_ratio=keep_aspect_ratio, + multiple=ensure_multiple_of, + ) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: int = None, + keep_aspect_ratio: bool = None, + ensure_multiple_of: int = None, + resample: PILImageResampling = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after reszing. If `keep_aspect_ratio` is `True`, the image is resized to the largest + possible size such that the aspect ratio is preserved. If `ensure_multiple_of` is set, the image is + resized to a size that is a multiple of this value. + keep_aspect_ratio (`bool`, *optional*, defaults to `self.keep_aspect_ratio`): + Whether to keep the aspect ratio of the image. If False, the image will be resized to (size, size). If + True, the image will be resized to keep the aspect ratio and the size will be the maximum possible. + ensure_multiple_of (`int`, *optional*, defaults to `self.ensure_multiple_of`): + Ensure that the image size is a multiple of this value. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. 
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size) + keep_aspect_ratio = keep_aspect_ratio if keep_aspect_ratio is not None else self.keep_aspect_ratio + ensure_multiple_of = ensure_multiple_of if ensure_multiple_of is not None else self.ensure_multiple_of + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None): + """ + Converts the output of [`DPTForSemanticSegmentation`] into semantic segmentation maps. Only supports Paddle. + + Args: + outputs ([`DPTForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`List[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. 
If unset, + predictions will not be resized. + + Returns: + semantic_segmentation: `List[paddle.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). Each entry of each `paddle.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if paddle.is_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = F.interpolate( + logits[idx].unsqueeze(axis=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(axis=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(axis=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/modeling.py new file mode 100644 index 000000000..caf5d06af --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/dpt/modeling.py @@ -0,0 +1,1336 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 Intel Labs, OpenMMLab and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Paddle DPT (Dense Prediction Transformers) model. +This implementation is heavily inspired by OpenMMLab's implementation, found here: +https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/dpt_head.py. 
+""" + + +import collections.abc +import math +from dataclasses import dataclass +from functools import partial +from typing import List, Optional, Tuple, Union + +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..bit.configuration import BitConfig +from ..bit.modeling import BitBackbone +from ..model_outputs import ( + BaseModelOutput, + DepthEstimatorOutput, + ModelOutput, + SemanticSegmenterOutput, +) +from ..model_utils import PretrainedModel +from .configuration import DPTConfig + +__all__ = [ + "DPTPretrainedModel", + "DPTModel", + "DPTForDepthEstimation", + "DPTForSemanticSegmentation", +] + + +@dataclass +class BaseModelOutputWithIntermediateActivations(ModelOutput): + """ + Base class for model's outputs that also contains intermediate activations that can be used at later stages. Useful + in the context of Vision models.: + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_activations (`tuple(paddle.Tensor)`, *optional*): + Intermediate activations that can be used to compute hidden states of the model at various layers. + """ + + last_hidden_states: paddle.Tensor = None + intermediate_activations: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndIntermediateActivations(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states as well as intermediate + activations that can be used by the model at later stages. + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + intermediate_activations (`tuple(paddle.Tensor)`, *optional*): + Intermediate activations that can be used to compute hidden states of the model at various layers. 
+ """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + intermediate_activations: Optional[Tuple[paddle.Tensor]] = None + + +class DPTViTHybridEmbeddings(nn.Layer): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config, feature_size=None): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + + if isinstance(config.backbone_config, BitConfig): + self.backbone = BitBackbone(config.backbone_config) + else: + raise NotImplementedError + feature_dim = self.backbone.channels[-1] + if len(config.backbone_config.out_features) != 3: + raise ValueError( + f"Expected backbone to have 3 output features, got {len(config.backbone_config.out_features)}" + ) + self.residual_feature_map_index = [0, 1] # Always take the output of the first and second backbone stage + + if feature_size is None: + feat_map_shape = config.backbone_featmap_shape + feature_size = feat_map_shape[-2:] + feature_dim = feat_map_shape[1] + else: + feature_size = ( + feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size) + ) + feature_dim = self.backbone.channels[-1] + + self.image_size = image_size + self.patch_size = patch_size[0] + self.num_channels = num_channels + + self.projection = nn.Conv2D(feature_dim, hidden_size, kernel_size=1) + + self.cls_token = self.create_parameter( + [1, 1, config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(0.0), + ) + + self.position_embeddings = self.create_parameter( + [1, num_patches + 1, config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(0.0), + ) + + def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1): + posemb_tok = posemb[:, :start_index] + posemb_grid = posemb[0, start_index:] + + old_grid_size = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape([1, old_grid_size, old_grid_size, -1]).transpose([0, 3, 1, 2]) + posemb_grid = F.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") + posemb_grid = posemb_grid.transpose([0, 2, 3, 1]).reshape([1, grid_size_height * grid_size_width, -1]) + + posemb = paddle.concat([posemb_tok, posemb_grid], axis=1) + + return posemb + + def forward( + self, pixel_values: paddle.Tensor, interpolate_pos_encoding: bool = False, return_dict: bool = False + ) -> paddle.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." 
+ ) + if not interpolate_pos_encoding: + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model" + f" ({self.image_size[0]}*{self.image_size[1]})." + ) + + position_embeddings = self._resize_pos_embed( + self.position_embeddings, height // self.patch_size, width // self.patch_size + ) + + backbone_output = self.backbone(pixel_values) + + features = backbone_output.feature_maps[-1] + + # Retrieve also the intermediate activations to use them at later stages + output_hidden_states = [backbone_output.feature_maps[index] for index in self.residual_feature_map_index] + + embeddings = self.projection(features).flatten(2).transpose([0, 2, 1]) + + cls_tokens = self.cls_token.expand([batch_size, -1, -1]) + embeddings = paddle.concat((cls_tokens, embeddings), axis=1) + + # add positional encoding to each token + embeddings = embeddings + position_embeddings + + if not return_dict: + return (embeddings, output_hidden_states) + + # Return hidden states and intermediate activations + return BaseModelOutputWithIntermediateActivations( + last_hidden_states=embeddings, + intermediate_activations=output_hidden_states, + ) + + +class DPTViTEmbeddings(nn.Layer): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config): + super().__init__() + self.cls_token = self.create_parameter( + [1, 1, config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(0.0), + ) + + self.patch_embeddings = DPTViTPatchEmbeddings(config) + num_patches = self.patch_embeddings.num_patches + + self.position_embeddings = self.create_parameter( + [1, num_patches + 1, config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(0.0), + ) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def _resize_pos_embed(self, posemb, grid_size_height, grid_size_width, start_index=1): + posemb_tok = posemb[:, :start_index] + posemb_grid = posemb[0, start_index:] + + old_grid_size = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape([1, old_grid_size, old_grid_size, -1]).transpose([0, 3, 1, 2]) + posemb_grid = F.interpolate(posemb_grid, size=(grid_size_height, grid_size_width), mode="bilinear") + posemb_grid = posemb_grid.transpose([0, 2, 3, 1]).reshape([1, grid_size_height * grid_size_width, -1]) + + posemb = paddle.concat([posemb_tok, posemb_grid], axis=1) + + return posemb + + def forward(self, pixel_values, return_dict=False): + batch_size, num_channels, height, width = pixel_values.shape + + # possibly interpolate position encodings to handle varying image sizes + patch_size = self.config.patch_size + position_embeddings = self._resize_pos_embed( + self.position_embeddings, height // patch_size, width // patch_size + ) + + embeddings = self.patch_embeddings(pixel_values) + + batch_size = embeddings.shape[0] + + # add the [CLS] token to the embedded patch tokens + cls_tokens = self.cls_token.expand([batch_size, -1, -1]) + embeddings = paddle.concat((cls_tokens, embeddings), axis=1) + + # add positional encoding to each token + embeddings = embeddings + position_embeddings + + embeddings = self.dropout(embeddings) + + if not return_dict: + return (embeddings,) + + return BaseModelOutputWithIntermediateActivations(last_hidden_states=embeddings) + + +class DPTViTPatchEmbeddings(nn.Layer): + """ + Image to Patch Embedding. 
+ """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + + self.projection = nn.Conv2D(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values): + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values).flatten(2).transpose([0, 2, 1]) + return embeddings + + +class DPTViTSelfAttention(nn.Layer): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scale = math.sqrt(self.attention_head_size) + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias_attr=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: paddle.Tensor) -> paddle.Tensor: + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, hidden_states, output_attentions: bool = False + ) -> Union[Tuple[paddle.Tensor, paddle.Tensor], Tuple[paddle.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + attention_scores = attention_scores / self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class DPTViTSelfOutput(nn.Layer): + """ + The residual connection is defined in DPTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class DPTViTAttention(nn.Layer): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.attention = DPTViTSelfAttention(config) + self.output = DPTViTSelfOutput(config) + + def forward( + self, + hidden_states: paddle.Tensor, + output_attentions: bool = False, + ) -> Union[Tuple[paddle.Tensor, paddle.Tensor], Tuple[paddle.Tensor]]: + self_outputs = self.attention(hidden_states, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class DPTViTIntermediate(nn.Layer): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class DPTViTOutput(nn.Layer): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +class DPTViTLayer(nn.Layer): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = DPTViTAttention(config) + self.intermediate = DPTViTIntermediate(config) + self.output = DPTViTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + output_attentions: bool = False, + ) -> Union[Tuple[paddle.Tensor, paddle.Tensor], Tuple[paddle.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in ViT, layernorm is applied before self-attention + 
output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in ViT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +class DPTViTEncoder(nn.Layer): + def __init__(self, config: DPTConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.LayerList([DPTViTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: paddle.Tensor, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + ) + else: + layer_outputs = layer_module(hidden_states, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class DPTReassembleStage(nn.Layer): + """ + This class reassembles the hidden states of the backbone into image-like feature representations at various + resolutions. + This happens in 3 stages: + 1. Map the N + 1 tokens to a set of N tokens, by taking into account the readout ([CLS]) token according to + `config.readout_type`. + 2. Project the channel dimension of the hidden states according to `config.neck_hidden_sizes`. + 3. Resizing the spatial dimensions (height, width). + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.config = config + self.layers = nn.LayerList() + if config.is_hybrid: + self._init_reassemble_dpt_hybrid(config) + else: + self._init_reassemble_dpt(config) + + self.neck_ignore_stages = config.neck_ignore_stages + + def _init_reassemble_dpt_hybrid(self, config): + r""" " + For DPT-Hybrid the first 2 reassemble layers are set to `nn.Identity()`, please check the official + implementation: https://github.com/isl-org/DPT/blob/f43ef9e08d70a752195028a51be5e1aff227b913/dpt/vit.py#L438 + for more details. 
+ """ + for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): + if i <= 1: + self.layers.append(nn.Identity()) + elif i > 1: + self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + + if config.readout_type != "project": + raise ValueError(f"Readout type {config.readout_type} is not supported for DPT-Hybrid.") + + # When using DPT-Hybrid the readout type is set to "project". The sanity check is done on the config file + self.readout_projects = nn.LayerList() + for i in range(len(config.neck_hidden_sizes)): + if i <= 1: + self.readout_projects.append(nn.Sequential(nn.Identity())) + elif i > 1: + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) + ) + + def _init_reassemble_dpt(self, config): + for i, factor in zip(range(len(config.neck_hidden_sizes)), config.reassemble_factors): + self.layers.append(DPTReassembleLayer(config, channels=config.neck_hidden_sizes[i], factor=factor)) + + if config.readout_type == "project": + self.readout_projects = nn.LayerList() + for _ in range(len(config.neck_hidden_sizes)): + self.readout_projects.append( + nn.Sequential(nn.Linear(2 * config.hidden_size, config.hidden_size), ACT2FN[config.hidden_act]) + ) + + def forward(self, hidden_states: List[paddle.Tensor]) -> List[paddle.Tensor]: + """ + Args: + hidden_states (`List[paddle.Tensor]`, each of shape `(batch_size, sequence_length + 1, hidden_size)`): + List of hidden states from the backbone. + """ + out = [] + + for i, hidden_state in enumerate(hidden_states): + if i not in self.neck_ignore_stages: + # reshape to (B, C, H, W) + hidden_state, cls_token = hidden_state[:, 1:], hidden_state[:, 0] + batch_size, sequence_length, num_channels = hidden_state.shape + size = int(math.sqrt(sequence_length)) + hidden_state = hidden_state.reshape([batch_size, size, size, num_channels]) + hidden_state = hidden_state.transpose([0, 3, 1, 2]) + + feature_shape = hidden_state.shape + if self.config.readout_type == "project": + # reshape to (B, H*W, C) + hidden_state = hidden_state.flatten(2).transpose([0, 2, 1]) + readout = cls_token.unsqueeze(1).expand_as(hidden_state) + # concatenate the readout token to the hidden states and project + hidden_state = self.readout_projects[i](paddle.concat((hidden_state, readout), axis=-1)) + # reshape back to (B, C, H, W) + hidden_state = hidden_state.transpose([0, 2, 1]).reshape(feature_shape) + elif self.config.readout_type == "add": + hidden_state = hidden_state.flatten(2) + cls_token.unsqueeze(-1) + hidden_state = hidden_state.reshape(feature_shape) + hidden_state = self.layers[i](hidden_state) + out.append(hidden_state) + + return out + + +class DPTReassembleLayer(nn.Layer): + def __init__(self, config, channels, factor): + super().__init__() + # projection + self.projection = nn.Conv2D(in_channels=config.hidden_size, out_channels=channels, kernel_size=1) + + # up/down sampling depending on factor + if factor > 1: + self.resize = nn.Conv2DTranspose(channels, channels, kernel_size=factor, stride=factor, padding=0) + elif factor == 1: + self.resize = nn.Identity() + elif factor < 1: + # so should downsample + self.resize = nn.Conv2D(channels, channels, kernel_size=3, stride=int(1 / factor), padding=1) + + def forward(self, hidden_state): + hidden_state = self.projection(hidden_state) + hidden_state = self.resize(hidden_state) + return hidden_state + + +class DPTFeatureFusionStage(nn.Layer): + def __init__(self, config): + 
super().__init__() + self.layers = nn.LayerList() + for _ in range(len(config.neck_hidden_sizes)): + self.layers.append(DPTFeatureFusionLayer(config)) + + def forward(self, hidden_states): + # reversing the hidden_states, we start from the last + hidden_states = hidden_states[::-1] + + fused_hidden_states = [] + # first layer only uses the last hidden_state + fused_hidden_state = self.layers[0](hidden_states[0]) + fused_hidden_states.append(fused_hidden_state) + # looping from the last layer to the second + for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]): + fused_hidden_state = layer(fused_hidden_state, hidden_state) + fused_hidden_states.append(fused_hidden_state) + + return fused_hidden_states + + +class DPTPreActResidualLayer(nn.Layer): + """ + ResidualConvUnit, pre-activate residual unit. + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + """ + + def __init__(self, config): + super().__init__() + + self.use_batch_norm = config.use_batch_norm_in_fusion_residual + self.activation1 = ACT2FN["relu"] + self.convolution1 = nn.Conv2D( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias_attr=not self.use_batch_norm, + ) + + self.activation2 = ACT2FN["relu"] + self.convolution2 = nn.Conv2D( + config.fusion_hidden_size, + config.fusion_hidden_size, + kernel_size=3, + stride=1, + padding=1, + bias_attr=not self.use_batch_norm, + ) + + if self.use_batch_norm: + self.batch_norm1 = nn.BatchNorm2D(config.fusion_hidden_size) + self.batch_norm2 = nn.BatchNorm2D(config.fusion_hidden_size) + + def forward(self, hidden_state: paddle.Tensor) -> paddle.Tensor: + residual = hidden_state + hidden_state = self.activation1(hidden_state) + + hidden_state = self.convolution1(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm1(hidden_state) + + hidden_state = self.activation2(hidden_state) + hidden_state = self.convolution2(hidden_state) + + if self.use_batch_norm: + hidden_state = self.batch_norm2(hidden_state) + + return hidden_state + residual + + +class DPTFeatureFusionLayer(nn.Layer): + """Feature fusion layer, merges feature maps from different stages. + Args: + config (`[DPTConfig]`): + Model configuration class defining the model architecture. + align_corners (`bool`, *optional*, defaults to `True`): + The align_corner setting for bilinear upsample. + """ + + def __init__(self, config, align_corners=True): + super().__init__() + + self.align_corners = align_corners + + self.projection = nn.Conv2D(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1) + + self.residual_layer1 = DPTPreActResidualLayer(config) + self.residual_layer2 = DPTPreActResidualLayer(config) + + def forward(self, hidden_state, residual=None): + if residual is not None: + if hidden_state.shape != residual.shape: + residual = F.interpolate( + residual, size=(hidden_state.shape[2], hidden_state.shape[3]), mode="bilinear", align_corners=False + ) + hidden_state = hidden_state + self.residual_layer1(residual) + + hidden_state = self.residual_layer2(hidden_state) + hidden_state = F.interpolate(hidden_state, scale_factor=2, mode="bilinear", align_corners=self.align_corners) + hidden_state = self.projection(hidden_state) + + return hidden_state + + +class DPTPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = DPTConfig + base_model_prefix = "dpt" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2D, nn.Conv2DTranspose)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, DPTViTEncoder): + module.gradient_checkpointing = value + + def gradient_checkpointing_enable(self): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + +class DPTModel(DPTPretrainedModel): + """ + The bare DPT Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`DPTConfig`): + An instance of DPTConfig used to construct DPTModel. + add_pooling_layer (`bool`, *optional*, defaults to True): + Whether to add a pooler layer. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + # vit encoder + if config.is_hybrid: + self.embeddings = DPTViTHybridEmbeddings(config) + else: + self.embeddings = DPTViTEmbeddings(config) + self.encoder = DPTViTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.pooler = DPTViTPooler(config) if add_pooling_layer else None + + def get_input_embeddings(self): + if self.config.is_hybrid: + return self.embeddings + else: + return self.embeddings.patch_embeddings + + def forward( + self, + pixel_values: paddle.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPoolingAndIntermediateActivations]: + """ + The DPTModel forward method, overrides the `__call__()` special method. + + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`DPTImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. 
See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (bool, optional): + Whether to return a :class:`BaseModelOutputWithPoolingAndIntermediateActivations` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`BaseModelOutputWithPoolingAndIntermediateActivations` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`BaseModelOutputWithPoolingAndIntermediateActivations`. + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + embedding_output = self.embeddings(pixel_values, return_dict=return_dict) + + embedding_last_hidden_states = embedding_output[0] if not return_dict else embedding_output.last_hidden_states + + encoder_outputs = self.encoder( + embedding_last_hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + embedding_output[1:] + + return BaseModelOutputWithPoolingAndIntermediateActivations( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + intermediate_activations=embedding_output.intermediate_activations, + ) + + +class DPTViTPooler(nn.Layer): + def __init__(self, config: DPTConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class DPTNeck(nn.Layer): + """ + DPTNeck. A neck is a module that is normally used between the backbone and the head. It takes a list of tensors as + input and produces another list of tensors as output. For DPT, it includes 2 stages: + * DPTReassembleStage + * DPTFeatureFusionStage. + Args: + config (dict): config dict. 
+ """ + + def __init__(self, config): + super().__init__() + self.config = config + + # postprocessing + self.reassemble_stage = DPTReassembleStage(config) + self.convs = nn.LayerList() + for channel in config.neck_hidden_sizes: + self.convs.append(nn.Conv2D(channel, config.fusion_hidden_size, kernel_size=3, padding=1, bias_attr=False)) + + # fusion + self.fusion_stage = DPTFeatureFusionStage(config) + + def forward(self, hidden_states: List[paddle.Tensor]) -> List[paddle.Tensor]: + if not isinstance(hidden_states, list): + raise ValueError("hidden_states should be a list of tensors") + + if len(hidden_states) != len(self.config.neck_hidden_sizes): + raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") + + # postprocess hidden states + features = self.reassemble_stage(hidden_states) + + features = [self.convs[i](feature) for i, feature in enumerate(features)] + + # fusion blocks + output = self.fusion_stage(features) + + return output + + +class DPTDepthEstimationHead(nn.Layer): + """ + Output head head consisting of 3 convolutional layers. It progressively halves the feature dimension and upsamples + the predictions to the input resolution after the first convolutional layer (details can be found in the paper's + supplementary material). + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2D(features, features // 2, kernel_size=3, stride=1, padding=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2D(features // 2, 32, kernel_size=3, stride=1, padding=1), + ACT2FN["relu"], + nn.Conv2D(32, 1, kernel_size=1, stride=1, padding=0), + ACT2FN["relu"], + ) + + def forward(self, hidden_states: List[paddle.Tensor]) -> paddle.Tensor: + # use last features + hidden_states = hidden_states[self.config.head_in_index] + + predicted_depth = self.head(hidden_states) + + predicted_depth = predicted_depth.squeeze(axis=1) + + return predicted_depth + + +class DPTForDepthEstimation(DPTPretrainedModel): + """ + DPT Model with a depth estimation head on top (consisting of 3 convolutional layers) e.g. for KITTI, NYUv2. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`DPTConfig`): + An instance of DPTConfig used to construct DPTForDepthEstimation. + """ + + def __init__(self, config): + super().__init__(config) + + self.dpt = DPTModel(config, add_pooling_layer=False) + + # Neck + self.neck = DPTNeck(config) + + # Depth estimation head + self.head = DPTDepthEstimationHead(config) + + def forward( + self, + pixel_values: paddle.Tensor, + labels: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[paddle.Tensor], DepthEstimatorOutput]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`DPTImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. 
+ labels (`paddle.Tensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth depth estimation maps for computing the loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (bool, optional): + Whether to return a :class:`DepthEstimatorOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`DepthEstimatorOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`DepthEstimatorOutput`. + + Examples: + + ```python + >>> from paddlenlp.transformers import DPTImageProcessor, DPTForDepthEstimation + >>> import paddle + >>> import paddle.nn.functional as F + >>> import numpy as np + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large") + >>> model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large") + + >>> # prepare image for the model + >>> inputs = image_processor(images=image, return_tensors="pd") + + >>> with paddle.no_grad(): + ... outputs = model(**inputs) + ... predicted_depth = outputs.predicted_depth + + >>> # interpolate to original size + >>> prediction = F.interpolate( + ... predicted_depth.unsqueeze(1), + ... size=image.size[::-1], + ... mode="bicubic", + ... align_corners=False, + ... 
) + + >>> # visualize the prediction + >>> output = prediction.squeeze().cpu().numpy() + >>> formatted = (output * 255 / np.max(output)).astype("uint8") + >>> depth = Image.fromarray(formatted) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.dpt( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + if not self.config.is_hybrid: + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + else: + backbone_hidden_states = list(outputs.intermediate_activations) if return_dict else list(outputs[-1]) + backbone_hidden_states.extend( + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:] + ) + + hidden_states = backbone_hidden_states + + hidden_states = self.neck(hidden_states) + + predicted_depth = self.head(hidden_states) + + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + if not return_dict: + if output_hidden_states: + output = (predicted_depth,) + outputs[1:] + else: + output = (predicted_depth,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return DepthEstimatorOutput( + loss=loss, + predicted_depth=predicted_depth, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +class DPTSemanticSegmentationHead(nn.Layer): + def __init__(self, config): + super().__init__() + + self.config = config + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2D(features, features, kernel_size=3, padding=1, bias_attr=False), + nn.BatchNorm2D(features), + ACT2FN["relu"], + nn.Dropout(config.semantic_classifier_dropout), + nn.Conv2D(features, config.num_labels, kernel_size=1), + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=True), + ) + + def forward(self, hidden_states: List[paddle.Tensor]) -> paddle.Tensor: + # use last features + hidden_states = hidden_states[self.config.head_in_index] + + logits = self.head(hidden_states) + + return logits + + +class DPTAuxiliaryHead(nn.Layer): + def __init__(self, config): + super().__init__() + + features = config.fusion_hidden_size + self.head = nn.Sequential( + nn.Conv2D(features, features, kernel_size=3, padding=1, bias_attr=False), + nn.BatchNorm2D(features), + ACT2FN["relu"], + nn.Dropout(0.1, False), + nn.Conv2D(features, config.num_labels, kernel_size=1), + ) + + def forward(self, hidden_states): + logits = self.head(hidden_states) + + return logits + + +class DPTForSemanticSegmentation(DPTPretrainedModel): + """ + DPT Model with a semantic segmentation head on top e.g. for ADE20k, CityScapes. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + Args: + config (:class:`DPTConfig`): + An instance of DPTConfig used to construct DPTForSemanticSegmentation. + """ + + def __init__(self, config): + super().__init__(config) + + self.dpt = DPTModel(config, add_pooling_layer=False) + + # Neck + self.neck = DPTNeck(config) + + # Segmentation head(s) + self.head = DPTSemanticSegmentationHead(config) + self.auxiliary_head = DPTAuxiliaryHead(config) if config.use_auxiliary_head else None + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[paddle.Tensor], SemanticSegmenterOutput]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`DPTImageProcessor`]. See [`DPTImageProcessor.__call__`] + for details. + labels (`paddle.Tensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (bool, optional): + Whether to return a :class:`SemanticSegmenterOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + Returns: + An instance of :class:`SemanticSegmenterOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`SemanticSegmenterOutput`. 
+ + Examples: + ```python + >>> from paddlenlp.transformers import DPTImageProcessor, DPTForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = DPTImageProcessor.from_pretrained("Intel/dpt-large-ade") + >>> model = DPTForSemanticSegmentation.from_pretrained("Intel/dpt-large-ade") + + >>> inputs = image_processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.dpt( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + # only keep certain features based on config.backbone_out_indices + # note that the hidden_states also include the initial embeddings + if not self.config.is_hybrid: + hidden_states = [ + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices + ] + else: + backbone_hidden_states = list(outputs.intermediate_activations) if return_dict else list(outputs[-1]) + backbone_hidden_states.extend( + feature for idx, feature in enumerate(hidden_states[1:]) if idx in self.config.backbone_out_indices[2:] + ) + + hidden_states = backbone_hidden_states + + hidden_states = self.neck(hidden_states) + + logits = self.head(hidden_states) + + auxiliary_logits = None + if self.auxiliary_head is not None: + auxiliary_logits = self.auxiliary_head(hidden_states[-1]) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + else: + # upsample logits to the images' original size + upsampled_logits = F.interpolate(logits, size=labels.shape[-2:], mode="bilinear", align_corners=False) + if auxiliary_logits is not None: + upsampled_auxiliary_logits = F.interpolate( + auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + # compute weighted loss + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + # upsampled_logits and upsampled_auxiliary_logits 's shape [b, num_labels, h, w] -> [b, h, w, num_labels] + main_loss = loss_fct(upsampled_logits.transpose([0, 2, 3, 1]), labels) + auxiliary_loss = loss_fct(upsampled_auxiliary_logits.transpose([0, 2, 3, 1]), labels) + loss = main_loss + self.config.auxiliary_loss_weight * auxiliary_loss + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/configuration.py new file mode 100644 index 000000000..117bb5869 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/configuration.py @@ -0,0 +1,293 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Electra model configuration """ +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["ElectraConfig", "ELECTRA_PRETRAINED_INIT_CONFIGURATION", "ELECTRA_PRETRAINED_RESOURCE_FILES_MAP"] + +ELECTRA_PRETRAINED_INIT_CONFIGURATION = { + "electra-small": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-base": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-large": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 1024, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "chinese-electra-small": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "chinese-electra-base": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + 
"hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 21128, + }, + "ernie-health-chinese": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 22608, + "layer_norm_eps": 1e-5, + }, + "electra-small-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-base-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-large-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 1024, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-small-discriminator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 128, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-base-discriminator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "electra-large-discriminator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 1024, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + }, + "ernie-health-chinese-generator": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 256, + "initializer_range": 0.02, + "intermediate_size": 1024, + "max_position_embeddings": 512, + "num_attention_heads": 4, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 22608, + "layer_norm_eps": 1e-12, + }, + "ernie-health-chinese-discriminator": { + 
"attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 22608, + "layer_norm_eps": 1e-12, + }, +} + +ELECTRA_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "electra-small": "https://bj.bcebos.com/paddlenlp/models/transformers/electra/electra-small.pdparams", + "electra-base": "https://bj.bcebos.com/paddlenlp/models/transformers/electra/electra-base.pdparams", + "electra-large": "https://bj.bcebos.com/paddlenlp/models/transformers/electra/electra-large.pdparams", + "chinese-electra-small": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese-electra-small/chinese-electra-small.pdparams", + "chinese-electra-base": "https://bj.bcebos.com/paddlenlp/models/transformers/chinese-electra-base/chinese-electra-base.pdparams", + "ernie-health-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/ernie-health-chinese/ernie-health-chinese.pdparams", + } +} + + +class ElectraConfig(PretrainedConfig): + model_type = "electra" + pretrained_init_configuration = ELECTRA_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 22608, + embedding_size: int = 768, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + pad_token_id: int = 0, + layer_norm_eps: float = 1e-12, + num_choices: int = 2, + gen_weight: float = 1.0, + disc_weight: float = 50.0, + **kwargs + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pad_token_id = pad_token_id + self.layer_norm_eps = layer_norm_eps + self.num_choices = num_choices + self.gen_weight = gen_weight + self.disc_weight = disc_weight diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/converter.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/converter.py new file mode 100644 index 000000000..bd5fb3d0d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/converter.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations +from typing import List, Union, Dict, Type + +from paddlenlp.transformers import PretrainedModel, ElectraModel +from paddlenlp.utils.converter import StateDictNameMapping, Converter + +__all__ = ["ElectraConverter"] + + +class ElectraConverter(Converter): + _ignore_state_dict_keys = ["embeddings.position_ids"] + architectures: Dict[str, Type[PretrainedModel]] = {"ElectraModel": ElectraModel} + + def get_paddle_pytorch_model_classes(self): + from paddlenlp.transformers import ElectraModel as PaddleRobertaModel + from transformers import ElectraModel as PytorchRobertaModel + + return PaddleRobertaModel, PytorchRobertaModel + + def get_name_mapping(self, config_or_num_layers: Union[dict, int] = None) -> List[StateDictNameMapping]: + num_layer = self.resolve_num_layer(config_or_num_layers) + + mappings = [ + ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], + ["embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight"], + ["embeddings.token_type_embeddings.weight", "embeddings.token_type_embeddings.weight"], + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ["embeddings_project.weight", "embeddings_project.weight", "transpose"], + ["embeddings_project.bias", "embeddings_project.bias"], + ] + + for layer_index in range(num_layer): + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], + [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], + [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], + ] 
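The "transpose" action attached to the Linear weights in these mappings reflects a layout difference between the two frameworks: torch.nn.Linear stores its weight as (out_features, in_features), while paddle.nn.Linear stores it as (in_features, out_features), so those matrices must be transposed when a checkpoint is ported; embedding, bias, and LayerNorm parameters map across unchanged. A rough standalone sketch of what a mapping entry amounts to (hypothetical apply_mapping helper and toy shapes for illustration, not the Converter API itself):

import numpy as np

def apply_mapping(torch_state, mapping):
    # mapping entries: (torch_key, paddle_key) or (torch_key, paddle_key, "transpose")
    paddle_state = {}
    for entry in mapping:
        torch_key, paddle_key = entry[0], entry[1]
        tensor = np.asarray(torch_state[torch_key])
        if len(entry) > 2 and entry[2] == "transpose":
            tensor = tensor.T  # (out_features, in_features) -> (in_features, out_features)
        paddle_state[paddle_key] = tensor
    return paddle_state

# toy example: a q_proj weight exported from torch with shape (out=1024, in=256)
state = {"encoder.layer.0.attention.self.query.weight": np.zeros((1024, 256))}
mapping = [("encoder.layer.0.attention.self.query.weight",
            "encoder.layers.0.self_attn.q_proj.weight", "transpose")]
print(apply_mapping(state, mapping)["encoder.layers.0.self_attn.q_proj.weight"].shape)  # (256, 1024)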
+ mappings.extend(layer_mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/modeling.py new file mode 100644 index 000000000..fb906f386 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/modeling.py @@ -0,0 +1,1813 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import TransformerEncoder, TransformerEncoderLayer + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. import PretrainedModel, register_base_model +from ..activations import get_activation +from ..model_outputs import ( + MaskedLMOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + tuple_output, +) +from .configuration import ( + ELECTRA_PRETRAINED_INIT_CONFIGURATION, + ELECTRA_PRETRAINED_RESOURCE_FILES_MAP, + ElectraConfig, +) + +__all__ = [ + "ElectraModel", + "ElectraPretrainedModel", + "ElectraForTotalPretraining", + "ElectraDiscriminator", + "ElectraGenerator", + "ElectraClassificationHead", + "ElectraForSequenceClassification", + "ElectraForTokenClassification", + "ElectraPretrainingCriterion", + "ElectraForMultipleChoice", + "ElectraForQuestionAnswering", + "ElectraForMaskedLM", + "ElectraForPretraining", + "ErnieHealthForTotalPretraining", + "ErnieHealthPretrainingCriterion", + "ErnieHealthDiscriminator", +] + + +class ElectraEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: ElectraConfig): + super(ElectraEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + self.layer_norm = nn.LayerNorm(config.embedding_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, input_ids, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=None + ): + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + position_ids = seq_length - ones + if past_key_values_length is not None: + position_ids += past_key_values_length + position_ids.stop_gradient = True + position_ids = position_ids.astype("int64") + + if token_type_ids is None: + token_type_ids = 
paddle.zeros_like(input_ids, dtype="int64") + + if input_ids is not None: + input_embeddings = self.word_embeddings(input_ids) + else: + input_embeddings = inputs_embeds + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embeddings + position_embeddings + token_type_embeddings + + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class ElectraDiscriminatorPredictions(nn.Layer): + """Prediction layer for the discriminator, made up of two dense layers.""" + + def __init__(self, config: ElectraConfig): + super(ElectraDiscriminatorPredictions, self).__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dense_prediction = nn.Linear(config.hidden_size, 1) + self.act = get_activation(config.hidden_act) + + def forward(self, discriminator_hidden_states): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = self.act(hidden_states) + logits = self.dense_prediction(hidden_states).squeeze() + + return logits + + +class ElectraGeneratorPredictions(nn.Layer): + """Prediction layer for the generator, made up of two dense layers.""" + + def __init__(self, config: ElectraConfig): + super(ElectraGeneratorPredictions, self).__init__() + + self.layer_norm = nn.LayerNorm(config.embedding_size) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.act = get_activation(config.hidden_act) + + def forward(self, generator_hidden_states): + hidden_states = self.dense(generator_hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + return hidden_states + + +class ElectraPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Electra models. It provides Electra related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + base_model_prefix = "electra" + + # pretrained general configuration + gen_weight = 1.0 + disc_weight = 50.0 + tie_word_embeddings = True + untied_generator_embeddings = False + use_softmax_sample = True + + # model init configuration + pretrained_init_configuration = ELECTRA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ELECTRA_PRETRAINED_RESOURCE_FILES_MAP + config_class = ElectraConfig + + @classmethod + def _get_name_mappings(cls, config: ElectraConfig) -> List[StateDictNameMapping]: + model_mappings = [ + "embeddings.word_embeddings.weight", + "embeddings.position_embeddings.weight", + "embeddings.token_type_embeddings.weight", + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ["embeddings_project.weight", None, "transpose"], + "embeddings_project.bias", + ] + + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], + [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], + [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(model_mappings) + # base-model prefix "ElectraModel" + if "ElectraModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "electra." + mapping[0] + mapping[1] = "electra." 
+ mapping[1] + + # downstream mappings + if "ElectraForQuestionAnswering" in config.architectures: + model_mappings.extend( + [["qa_outputs.weight", "classifier.weight", "transpose"], ["qa_outputs.bias", "classifier.bias"]] + ) + + if "ElectraForMultipleChoice" in config.architectures: + model_mappings.extend( + [ + ["sequence_summary.summary.weight", "sequence_summary.dense.weight", "transpose"], + ["sequence_summary.summary.bias", "sequence_summary.dense.bias"], + ["classifier.weight", "classifier.weight", "transpose"], + ["classifier.bias", "classifier.bias"], + ] + ) + + if "ElectraForSequenceClassification" in config.architectures: + model_mappings.extend( + [ + ["classifier.dense.weight", "classifier.dense.weight", "transpose"], + ["classifier.dense.bias", "classifier.dense.bias"], + ["classifier.out_proj.weight", "classifier.out_proj.weight", "transpose"], + ["classifier.out_proj.bias", "classifier.out_proj.bias"], + ] + ) + + if "ElectraForTokenClassification" in config.architectures: + model_mappings.extend( + [ + ["classifier.weight", "classifier.weight", "transpose"], + "classifier.bias", + ] + ) + + # TODO: need to tie weights + if "ElectraForMaskedLM" in config.architectures: + model_mappings.extend( + [ + ["generator_predictions.LayerNorm.weight", "generator_predictions.layer_norm.weight", "transpose"], + ["generator_predictions.LayerNorm.bias", "generator_predictions.layer_norm.bias"], + ["generator_predictions.dense.weight", None, "transpose"], + "generator_predictions.dense.bias", + ["generator_lm_head.bias", "generator_lm_head_bias"], + ] + ) + + init_name_mappings(model_mappings) + return [StateDictNameMapping(*mapping) for mapping in model_mappings] + + def _init_weights(self, layer): + """Initialize the weights""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.full_like(layer.weight, 1.0)) + layer._epsilon = getattr(self, "layer_norm_eps", 1e-12) + if isinstance(layer, nn.Linear) and layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + + +@register_base_model +class ElectraModel(ElectraPretrainedModel): + """ + The bare Electra Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig + """ + + def __init__(self, config: ElectraConfig): + super(ElectraModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.layer_norm_eps = config.layer_norm_eps + self.embeddings = ElectraEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + encoder_layer = TransformerEncoderLayer( + d_model=config.hidden_size, + nhead=config.num_attention_heads, + dim_feedforward=config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = TransformerEncoder(encoder_layer, config.num_hidden_layers) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The ElectraModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + Instead of passing input_ids you can choose to directly pass an embedded representation. + This is useful for use cases such as P-Tuning, where you want more control over how to convert input_ids indices + into the embedding space. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, embedding_size]. 
+ past_key_values (tuple(tuple(Tensor)), optional): + Precomputed key and value hidden states of the attention blocks of each layer. This can be used to speedup + auto-regressive decoding for generation tasks or to support use cases such as Prefix-Tuning where vectors are prepended + to each attention layer. The length of tuple equals to the number of layers, and each tuple having 2 tensors of shape + `(batch_size, num_heads, past_key_values_length, embed_size_per_head)`) + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `encoder_outputs`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ElectraModel, ElectraTokenizer + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraModel.from_pretrained('electra-small') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + + """ + past_key_values_length = None + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + else: + if attention_mask.ndim == 2: + attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if hasattr(self, "embeddings_project"): + embedding_output = self.embeddings_project(embedding_output) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return encoder_outputs + + +class ElectraDiscriminator(ElectraPretrainedModel): + """ + The Electra Discriminator can detect the tokens that are replaced by the Electra Generator. 
+ + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig + + """ + + def __init__(self, config: ElectraConfig): + super(ElectraDiscriminator, self).__init__(config) + + self.electra = ElectraModel(config) + self.discriminator_predictions = ElectraDiscriminatorPredictions(config) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids (Tensor, optional): + See :class:`ElectraModel`. + attention_mask (Tensor, optional): + See :class:`ElectraModel`. + inputs_embeds (Tensor, optional): + See :class:`ElectraModel`. + + Returns: + Tensor: Returns tensor `logits`, the prediction result of replaced tokens. + Its data type should be float32 and if batch_size>1, its shape is [batch_size, sequence_length], + if batch_size=1, its shape is [sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ElectraDiscriminator, ElectraTokenizer + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraDiscriminator.from_pretrained('electra-small') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + discriminator_sequence_output = self.electra( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + ) + + logits = self.discriminator_predictions(discriminator_sequence_output) + return logits + + +class ElectraGenerator(ElectraPretrainedModel): + """ + The Electra Generator will replace some tokens of the given sequence, it is trained as + a masked language model. + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig + """ + + def __init__(self, config: ElectraConfig): + super(ElectraGenerator, self).__init__(config) + + self.electra = ElectraModel(config) + self.generator_predictions = ElectraGeneratorPredictions(config) + + if not self.tie_word_embeddings: + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + else: + self.generator_lm_head_bias = self.create_parameter( + shape=[config.vocab_size], dtype=paddle.get_default_dtype(), is_bias=True + ) + + def get_input_embeddings(self): + return self.electra.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.electra.embeddings.word_embeddings = value + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids (Tensor, optional): + See :class:`ElectraModel`. + attention_mask (Tensor, optional): + See :class:`ElectraModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. 
Defaults to `False`. + Returns: + Tensor: Returns tensor `prediction_scores`, the scores of Electra Generator. + Its data type should be int64 and its shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ElectraGenerator, ElectraTokenizer + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraGenerator.from_pretrained('electra-small') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + prediction_scores = model(**inputs) + + """ + generator_sequence_output = self.electra( + input_ids, + token_type_ids, + position_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(generator_sequence_output, type(input_ids)): + generator_sequence_output = (generator_sequence_output,) + + prediction_scores = self.generator_predictions(generator_sequence_output[0]) + if not self.tie_word_embeddings: + prediction_scores = self.generator_lm_head(prediction_scores) + else: + prediction_scores = paddle.add( + paddle.matmul(prediction_scores, self.get_input_embeddings().weight, transpose_y=True), + self.generator_lm_head_bias, + ) + loss = None + # Masked language modeling softmax layer + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.reshape([-1, self.electra.config["vocab_size"]]), labels.reshape([-1])) + + if not return_dict: + output = (prediction_scores,) + generator_sequence_output[1:] + return tuple_output(output, loss) + + return MaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=generator_sequence_output.hidden_states, + attentions=generator_sequence_output.attentions, + ) + + +class ElectraClassificationHead(nn.Layer): + """ + Perform sentence-level classification tasks. + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig + + """ + + def __init__(self, config: ElectraConfig): + super(ElectraClassificationHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + self.act = get_activation(config.hidden_act) + + def forward(self, features, **kwargs): + r""" + The ElectraClassificationHead forward method, overrides the __call__() special method. + + Args: + features(Tensor): + Input sequence, usually the `sequence_output` of electra model. + Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Returns: + Tensor: Returns a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + """ + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = self.act(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class ErnieHealthDiscriminator(ElectraPretrainedModel): + """ + The Discriminators in ERNIE-Health (https://arxiv.org/abs/2110.07244), including + - token-level Replaced Token Detection (RTD) task + - token-level Multi-Token Selection (MTS) task + - sequence-level Contrastive Sequence Prediction (CSP) task. 
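+
+    Example:
+        .. code-block::
+
+            # Checkpoint-free sketch with random weights; the toy config values and
+            # the 5 negative candidates per position are illustrative assumptions.
+            import paddle
+            from paddlenlp.transformers import ElectraConfig, ErnieHealthDiscriminator
+
+            config = ElectraConfig(vocab_size=100, embedding_size=64, hidden_size=64,
+                                   num_hidden_layers=2, num_attention_heads=2,
+                                   intermediate_size=128, num_labels=128)
+            model = ErnieHealthDiscriminator(config)
+
+            input_ids = paddle.randint(5, 100, [2, 16])
+            # per position: the original id plus 5 negative candidates
+            candidate_ids = paddle.randint(5, 100, [2, 16, 6])
+
+            logits_rtd, logits_mts, logits_csp = model(input_ids, candidate_ids)
+            # logits_rtd: [2, 16], logits_mts: [2, 16, 6], logits_csp: [2, num_labels]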
+ + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig to construct ErnieHealthDiscriminator + + """ + + def __init__(self, config: ElectraConfig): + super(ErnieHealthDiscriminator, self).__init__(config) + + self.electra = ElectraModel(config) + self.discriminator_rtd = ElectraDiscriminatorPredictions(config) + + self.discriminator_mts = nn.Linear(config.hidden_size, config.hidden_size) + self.activation_mts = get_activation(config.hidden_act) + self.bias_mts = nn.Embedding(config.vocab_size, 1) + + self.discriminator_csp = ElectraClassificationHead(config) + + def forward(self, input_ids, candidate_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + candidate_ids (Tensor): + The candidate indices of input sequence tokens in the vocabulary for MTS task. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids (Tensor, optional): + See :class:`ElectraModel`. + attention_mask (Tensor, optional): + See :class:`ElectraModel`. + + Returns: + Tensor: Returns list of tensors, the prediction results of RTD, MTS and CSP. + The logits' data type should be float32 and if batch_size > 1, + - the shape of `logits_rtd` is [batch_size, sequence_length], + - the shape of `logits_mts` is [batch_size, sequence_length, num_candidate], + - the shape of `logits_csp` is [batch_size, 128]. + If batch_size=1, the shapes are [sequence_length], [sequence_length, num_cadidate], + [128], separately. + + """ + + discriminator_sequence_output = self.electra( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + + logits_rtd = self.discriminator_rtd(discriminator_sequence_output) + + cands_embs = self.electra.embeddings.word_embeddings(candidate_ids) + hidden_mts = self.discriminator_mts(discriminator_sequence_output) + hidden_mts = self.activation_mts(hidden_mts) + hidden_mts = paddle.matmul(hidden_mts.unsqueeze(2), cands_embs, transpose_y=True).squeeze(2) + logits_mts = paddle.add(hidden_mts, self.bias_mts(candidate_ids).squeeze(3)) + + logits_csp = self.discriminator_csp(discriminator_sequence_output) + + return logits_rtd, logits_mts, logits_csp + + +class ElectraForSequenceClassification(ElectraPretrainedModel): + """ + Electra Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig to construct ElectraForSequenceClassification + """ + + def __init__(self, config: ElectraConfig): + super(ElectraForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.electra = ElectraModel(config) + self.classifier = ElectraClassificationHead(config) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions: bool = None, + output_hidden_states: bool = None, + return_dict: bool = None, + ): + r""" + The ElectraForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids(Tensor, optional): + See :class:`ElectraModel`. + attention_mask (list, optional): + See :class:`ElectraModel`. 
+ output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ElectraForSequenceClassification + from paddlenlp.transformers import ElectraTokenizer + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraForSequenceClassification.from_pretrained('electra-small') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + sequence_output = self.electra( + input_ids, + token_type_ids, + position_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(sequence_output, type(input_ids)): + sequence_output = (sequence_output,) + + logits = self.classifier(sequence_output[0]) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + sequence_output[1:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=sequence_output.hidden_states, + attentions=sequence_output.attentions, + ) + + +class ElectraForTokenClassification(ElectraPretrainedModel): + """ + Electra Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. 
+ + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig to construct ElectraForTokenClassification + """ + + def __init__(self, config: ElectraConfig): + super(ElectraForTokenClassification, self).__init__(config) + self.electra = ElectraModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.hidden_dropout_prob if config.classifier_dropout is None else config.classifier_dropout + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ElectraForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids(Tensor, optional): + See :class:`ElectraModel`. + attention_mask (list, optional): + See :class:`ElectraModel`. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ElectraForTokenClassification + from paddlenlp.transformers import ElectraTokenizer + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraForTokenClassification.from_pretrained('electra-small') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + sequence_output = self.electra( + input_ids, + token_type_ids, + position_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(sequence_output, type(input_ids)): + sequence_output = (sequence_output,) + + logits = self.classifier(self.dropout(sequence_output[0])) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + + if not return_dict: + output = (logits,) + sequence_output[1:] + return tuple_output(output, loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=sequence_output.hidden_states, + attentions=sequence_output.attentions, + ) + + +class ElectraForTotalPretraining(ElectraPretrainedModel): + """ + Electra Model for pretraining tasks. 
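+
+    Example:
+        .. code-block::
+
+            # Checkpoint-free sketch with a toy config and a hand-built masked batch;
+            # the sizes and the fake [MASK] id below are illustrative assumptions.
+            import paddle
+            from paddlenlp.transformers import ElectraConfig, ElectraForTotalPretraining
+
+            config = ElectraConfig(vocab_size=100, embedding_size=32, hidden_size=32,
+                                   num_hidden_layers=2, num_attention_heads=2,
+                                   intermediate_size=64)
+            model = ElectraForTotalPretraining(config)
+
+            raw_input_ids = paddle.randint(5, 100, [2, 8])       # original tokens
+            input_ids = raw_input_ids.clone()
+            generator_labels = paddle.full([2, 8], -100, dtype="int64")
+            input_ids[:, 3] = 3                                   # pretend id 3 is [MASK]
+            generator_labels[:, 3] = raw_input_ids[:, 3]          # label only masked slots
+
+            gen_logits, disc_logits, disc_labels, attn_mask = model(
+                input_ids, raw_input_ids=raw_input_ids, generator_labels=generator_labels)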
+ + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig to construct ElectraForTotalPretraining + + """ + + def __init__(self, config: ElectraConfig): + super(ElectraForTotalPretraining, self).__init__(config) + self.generator = ElectraGenerator(config) + self.discriminator = ElectraDiscriminator(config) + self.initializer_range = config.initializer_range + self.tie_weights() + + def get_input_embeddings(self): + if not self.untied_generator_embeddings: + return self.generator.electra.embeddings.word_embeddings + else: + return None + + def get_output_embeddings(self): + if not self.untied_generator_embeddings: + return self.discriminator.electra.embeddings.word_embeddings + else: + return None + + def get_discriminator_inputs(self, inputs, raw_inputs, generator_logits, generator_labels, use_softmax_sample): + # get generator token result + sampled_tokens = (self.sample_from_softmax(generator_logits, use_softmax_sample)).detach() + sampled_tokids = paddle.argmax(sampled_tokens, axis=-1) + # update token only at mask position + # generator_labels : [B, L], L contains -100(unmasked) or token value(masked) + # mask_positions : [B, L], L contains 0(unmasked) or 1(masked) + umask_positions = paddle.zeros_like(generator_labels) + mask_positions = paddle.ones_like(generator_labels) + mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) + updated_inputs = self.update_inputs(inputs, sampled_tokids, mask_positions) + # use inputs and updated_input to get discriminator labels + labels = mask_positions * ( + paddle.ones_like(inputs) - paddle.equal(updated_inputs, raw_inputs).astype(raw_inputs.dtype) + ) + return updated_inputs, labels, sampled_tokids + + def sample_from_softmax(self, logits, use_softmax_sample=True): + if use_softmax_sample: + # uniform_noise = paddle.uniform(logits.shape, dtype="float32", min=0, max=1) + uniform_noise = paddle.rand(logits.shape, dtype=paddle.get_default_dtype()) + gumbel_noise = -paddle.log(-paddle.log(uniform_noise + 1e-9) + 1e-9) + else: + gumbel_noise = paddle.zeros_like(logits) + # softmax_sample equal to sampled_tokids.unsqueeze(-1) + softmax_sample = paddle.argmax(F.softmax(logits + gumbel_noise), axis=-1) + # one hot + return F.one_hot(softmax_sample, logits.shape[-1]) + + def update_inputs(self, sequence, updates, positions): + shape = sequence.shape + assert len(shape) == 2, "the dimension of inputs should be [B, L]" + B, L = shape + N = positions.shape[1] + assert N == L, "the dimension of inputs and mask should be same as [B, L]" + + updated_sequence = ((paddle.ones_like(sequence) - positions) * sequence) + ( + positions * updates.astype(positions.dtype) + ) + + return updated_sequence + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + raw_input_ids=None, + generator_labels=None, + ): + r""" + The ElectraForPretraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids(Tensor, optional): + See :class:`ElectraModel`. + attention_mask (list, optional): + See :class:`ElectraModel`. + raw_input_ids(Tensor, optional): + Raw inputs used to get discriminator labels. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + generator_labels(Tensor, optional): + Labels to compute the discriminator inputs. 
+ Its data type should be int64 and its shape is [batch_size, sequence_length]. + The value for unmasked tokens should be -100 and value for masked tokens should be 0. + + Returns: + tuple: Returns tuple (generator_logits, disc_logits, disc_labels, attention_mask). + + With the fields: + + - `generator_logits` (Tensor): + The scores of Electra Generator. + Its data type should be int64 and its shape is [batch_size, sequence_length, vocab_size]. + + - `disc_logits` (Tensor): + The prediction result of replaced tokens. + Its data type should be float32 and if batch_size>1, its shape is [batch_size, sequence_length], + if batch_size=1, its shape is [sequence_length]. + + - `disc_labels` (Tensor): + The labels of electra discriminator. Its data type should be int32, + and its shape is [batch_size, sequence_length]. + + - `attention_mask` (Tensor): + See :class:`ElectraModel`. Its data type should be bool. + + """ + + assert ( + generator_labels is not None + ), "generator_labels should not be None, please check DataCollatorForLanguageModeling" + + generator_logits = self.generator(input_ids, token_type_ids, position_ids, attention_mask) + + disc_inputs, disc_labels, generator_predict_tokens = self.get_discriminator_inputs( + input_ids, raw_input_ids, generator_logits, generator_labels, self.use_softmax_sample + ) + + disc_logits = self.discriminator(disc_inputs, token_type_ids, position_ids, attention_mask) + + if attention_mask is None: + attention_mask = input_ids != self.discriminator.electra.config["pad_token_id"] + else: + attention_mask = attention_mask.astype("bool") + + return generator_logits, disc_logits, disc_labels, attention_mask + + +class ElectraPooler(nn.Layer): + def __init__(self, config: ElectraConfig): + super(ElectraPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = get_activation(config.hidden_act) + self.pool_act = config.hidden_act + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@dataclass +class ErnieHealthForPreTrainingOutput(ModelOutput): + """ + Output type of [`ErnieHealthForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss of the ELECTRA objective. + """ + + loss: Optional[paddle.Tensor] = None + gen_loss: Optional[paddle.Tensor] = None + disc_rtd_loss: Optional[paddle.Tensor] = None + disc_mts_loss: Optional[paddle.Tensor] = None + disc_csp_loss: Optional[paddle.Tensor] = None + + +class ErnieHealthForTotalPretraining(ElectraForTotalPretraining): + """ + ERNIE-Health Model for pretraining task. 
+ + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig to construct ElectraForMultipleChoice + """ + + def __init__(self, config: ElectraConfig): + super(ErnieHealthForTotalPretraining, self).__init__(config) + self.generator = ElectraGenerator(config) + self.discriminator = ErnieHealthDiscriminator(config) + self.initializer_range = config.initializer_range + + def get_discriminator_inputs_ernie_health( + self, inputs, raw_inputs, generator_logits, generator_labels, use_softmax_sample + ): + updated_inputs, labels, sampled_tokids = self.get_discriminator_inputs( + inputs, raw_inputs, generator_logits, generator_labels, use_softmax_sample + ) + + # Get negative samples to construct candidates. + neg_samples_ids = self.sample_negatives_from_softmax(generator_logits, raw_inputs, use_softmax_sample) + candidate_ids = paddle.concat([raw_inputs.unsqueeze(2), neg_samples_ids], axis=2).detach() + return updated_inputs, labels, sampled_tokids, candidate_ids + + def sample_negatives_from_softmax(self, logits, raw_inputs, use_softmax_sample=True): + r""" + Sample K=5 non-original negative samples for candidate set. + + Returns: + Tensor: Returns tensor `neg_samples_ids`, a tensor of the negative + samples of original inputs. + Shape as ` [batch_size, sequence_length, K, vocab_size]` and dtype + as `int64`. + """ + K = 5 + # Initialize excluded_ids by original inputs in one-hot encoding. + # Its shape is [batch_size, sequence_length, vocab_size]. + excluded_ids = F.one_hot(raw_inputs, logits.shape[-1]) * -10000 + neg_sample_one_hot = None + neg_samples_ids = None + for sample in range(K): + # Update excluded_ids. + if neg_sample_one_hot is not None: + excluded_ids = excluded_ids + neg_sample_one_hot * -10000 + if use_softmax_sample: + uniform_noise = paddle.rand(logits.shape, dtype=paddle.get_default_dtype()) + gumbel_noise = -paddle.log(-paddle.log(uniform_noise + 1e-9) + 1e-9) + else: + gumbel_noise = paddle.zeros_like(logits) + sampled_ids = paddle.argmax(F.softmax(logits + gumbel_noise + excluded_ids), axis=-1) + # One-hot encoding of sample_ids. 
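+            # The one-hot sample is folded into `excluded_ids` at the top of the next
+            # iteration (with a -10000 penalty), so the same id cannot be drawn twice.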
+ neg_sample_one_hot = F.one_hot(sampled_ids, logits.shape[-1]) + if neg_samples_ids is None: + neg_samples_ids = sampled_ids.unsqueeze(2) + else: + neg_samples_ids = paddle.concat([neg_samples_ids, sampled_ids.unsqueeze(2)], axis=2) + return neg_samples_ids + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + raw_input_ids=None, + generator_labels=None, + return_dict: Optional[bool] = None, + ): + assert generator_labels is not None, "generator_labels should not be None, please check DataCollator" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + generator_logits = self.generator(input_ids, token_type_ids, position_ids, attention_mask) + + disc_input_list = self.get_discriminator_inputs_ernie_health( + input_ids, raw_input_ids, generator_logits, generator_labels, self.use_softmax_sample + ) + disc_inputs, disc_labels, _, disc_candidates = disc_input_list + + logits_rtd, logits_mts, logits_csp = self.discriminator( + disc_inputs, disc_candidates, token_type_ids, position_ids, attention_mask + ) + + if attention_mask is None: + pad_id = self.generator.electra.pad_token_id + attention_mask = input_ids != pad_id + else: + attention_mask = attention_mask.astype("bool") + + total_loss = None + gen_loss = None + disc_rtd_loss = None + disc_mts_loss = None + disc_csp_loss = None + + if generator_labels is not None and disc_labels is not None: + loss_fct = ErnieHealthPretrainingCriterion(self.config) + total_loss, gen_loss, disc_rtd_loss, disc_mts_loss, disc_csp_loss = loss_fct( + generator_logits, generator_labels, logits_rtd, logits_mts, logits_csp, disc_labels, attention_mask + ) + + if not return_dict: + # return total_loss + return total_loss, gen_loss, disc_rtd_loss, disc_mts_loss, disc_csp_loss + + return ErnieHealthForPreTrainingOutput(total_loss, gen_loss, disc_rtd_loss, disc_mts_loss, disc_csp_loss) + + +class ElectraForMultipleChoice(ElectraPretrainedModel): + """ + Electra Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig to construct ElectraForMultipleChoice + """ + + def __init__(self, config: ElectraConfig): + super(ElectraForMultipleChoice, self).__init__(config) + self.num_choices = config.num_choices + self.electra = ElectraModel(config) + self.sequence_summary = ElectraPooler(config) + dropout_p = config.hidden_dropout_prob if config.classifier_dropout is None else config.classifier_dropout + self.dropout = nn.Dropout(dropout_p) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ElectraForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ElectraModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids (Tensor, optional): + See :class:`ElectraModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`ElectraModel` and shape as [batch_size, num_choice, sequence_length]. 
+ attention_mask (list, optional): + See :class:`ElectraModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ElectraForMultipleChoice, ElectraTokenizer + from paddlenlp.data import Pad, Dict + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraForMultipleChoice.from_pretrained('electra-small', num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) + if position_ids is not None: + position_ids = position_ids.reshape((-1, position_ids.shape[-1])) + if attention_mask is not None: + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) + + sequence_output = self.electra( + input_ids, + token_type_ids, + position_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(sequence_output, type(input_ids)): + sequence_output = (sequence_output,) + + pooled_output = self.sequence_summary(sequence_output[0]) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape((-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + output = (reshaped_logits,) + sequence_output[1:] + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + output = (loss,) 
+ output + + if not return_dict: + output = (reshaped_logits,) + sequence_output[1:] + return tuple_output(output, loss) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=sequence_output.hidden_states, + attentions=sequence_output.attentions, + ) + + +class ElectraPretrainingCriterion(paddle.nn.Layer): + """ + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig + + """ + + def __init__(self, config: ElectraConfig): + super(ElectraPretrainingCriterion, self).__init__() + + self.vocab_size = config.vocab_size + self.gen_weight = config.gen_weight + self.disc_weight = config.disc_weight + self.gen_loss_fct = nn.CrossEntropyLoss(reduction="none") + self.disc_loss_fct = nn.BCEWithLogitsLoss(reduction="none") + + def forward( + self, + generator_prediction_scores, + discriminator_prediction_scores, + generator_labels, + discriminator_labels, + attention_mask, + ): + """ + Args: + generator_prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + discriminator_prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length] or [sequence length] if batch_size=1. + generator_labels(Tensor): + The labels of the generator, its dimensionality is equal to `generator_prediction_scores`. + Its data type should be int64 and its shape is [batch_size, sequence_size, 1]. + discriminator_labels(Tensor): + The labels of the discriminator, its dimensionality is equal to `discriminator_prediction_scores`. + The labels should be numbers between 0 and 1. + Its data type should be float32 and its shape is [batch_size, sequence_size] or [sequence length] if batch_size=1. + attention_mask(Tensor): + See :class:`ElectraModel`. + + Returns: + Tensor: The pretraining loss, equals to weighted generator loss plus the weighted discriminator loss. + Its data type should be float32 and its shape is [1]. 
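+
+        Example:
+            .. code-block::
+
+                # Minimal, checkpoint-free sketch with random tensors; the shapes,
+                # vocab size and loss weights below are illustrative assumptions.
+                import paddle
+                from paddlenlp.transformers import ElectraConfig, ElectraPretrainingCriterion
+
+                config = ElectraConfig(vocab_size=100)
+                config.gen_weight, config.disc_weight = 1.0, 50.0
+                criterion = ElectraPretrainingCriterion(config)
+
+                gen_scores = paddle.rand([2, 8, 100])                  # [B, L, vocab_size]
+                disc_scores = paddle.rand([2, 8])                      # [B, L]
+                gen_labels = paddle.full([2, 8], -100, dtype="int64")  # -100 = unmasked
+                gen_labels[:, 2] = 5                                   # one masked position
+                disc_labels = paddle.randint(0, 2, [2, 8])
+                attention_mask = paddle.ones([2, 8], dtype="bool")
+
+                loss = criterion(gen_scores, disc_scores, gen_labels,
+                                 disc_labels, attention_mask)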
+ + """ + # generator loss + gen_loss = self.gen_loss_fct( + paddle.reshape(generator_prediction_scores, [-1, self.vocab_size]), paddle.reshape(generator_labels, [-1]) + ) + # todo: we can remove 4 lines after when CrossEntropyLoss(reduction='mean') improved + umask_positions = paddle.zeros_like(generator_labels).astype(paddle.get_default_dtype()) + mask_positions = paddle.ones_like(generator_labels).astype(paddle.get_default_dtype()) + mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) + if mask_positions.sum() == 0: + gen_loss = paddle.to_tensor([0.0]) + else: + gen_loss = gen_loss.sum() / mask_positions.sum() + + # discriminator loss + seq_length = discriminator_labels.shape[1] + disc_loss = self.disc_loss_fct( + paddle.reshape(discriminator_prediction_scores, [-1, seq_length]), + discriminator_labels.astype(paddle.get_default_dtype()), + ) + if attention_mask is not None: + umask_positions = paddle.ones_like(discriminator_labels).astype(paddle.get_default_dtype()) + mask_positions = paddle.zeros_like(discriminator_labels).astype(paddle.get_default_dtype()) + use_disc_loss = paddle.where(attention_mask, disc_loss, mask_positions) + umask_positions = paddle.where(attention_mask, umask_positions, mask_positions) + disc_loss = use_disc_loss.sum() / umask_positions.sum() + else: + total_positions = paddle.ones_like(discriminator_labels).astype(paddle.get_default_dtype()) + disc_loss = disc_loss.sum() / total_positions.sum() + + return self.gen_weight * gen_loss + self.disc_weight * disc_loss + + +class ErnieHealthPretrainingCriterion(paddle.nn.Layer): + """ + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig + + """ + + def __init__(self, config: ElectraConfig): + super(ErnieHealthPretrainingCriterion, self).__init__() + + self.vocab_size = config.vocab_size + self.gen_weight = config.gen_weight + self.rtd_weight = 50.0 + self.mts_weight = 20.0 + self.csp_weight = 1.0 + self.gen_loss_fct = nn.CrossEntropyLoss(reduction="none") + self.disc_rtd_loss_fct = nn.BCEWithLogitsLoss(reduction="none") + self.disc_csp_loss_fct = nn.CrossEntropyLoss(reduction="none") + self.disc_mts_loss_fct = nn.CrossEntropyLoss(reduction="none") + self.temperature = 0.07 + + def forward( + self, + generator_logits, + generator_labels, + logits_rtd, + logits_mts, + logits_csp, + discriminator_labels, + attention_mask, + ): + """ + Args: + generator_logits(Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + generator_labels(Tensor): + The labels of the generator, its dimensionality is equal to `generator_prediction_scores`. + Its data type should be int64 and its shape is [batch_size, sequence_size, 1]. + logits_rtd(Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length] or [sequence length] if batch_size=1. + discriminator_labels(Tensor): + The labels of the discriminator, its dimensionality is equal to `discriminator_prediction_scores`. + The labels should be numbers between 0 and 1. + Its data type should be float32 and its shape is [batch_size, sequence_size] or [sequence length] if batch_size=1. + attention_mask(Tensor): + See :class:`ElectraModel`. + + Returns: + Tensor: The pretraining loss, equals to weighted generator loss plus the weighted discriminator loss. + Its data type should be float32 and its shape is [1]. 
+ + """ + # generator loss + gen_loss = self.gen_loss_fct( + paddle.reshape(generator_logits, [-1, self.vocab_size]), paddle.reshape(generator_labels, [-1]) + ) + # todo: we can remove 4 lines after when CrossEntropyLoss(reduction='mean') improved + umask_positions = paddle.zeros_like(generator_labels).astype(paddle.get_default_dtype()) + mask_positions = paddle.ones_like(generator_labels).astype(paddle.get_default_dtype()) + mask_positions = paddle.where(generator_labels == -100, umask_positions, mask_positions) + if mask_positions.sum() == 0: + gen_loss = paddle.to_tensor([0.0]) + else: + gen_loss = gen_loss.sum() / mask_positions.sum() + + # RTD discriminator loss + seq_length = discriminator_labels.shape[1] + rtd_labels = discriminator_labels + + disc_rtd_loss = self.disc_rtd_loss_fct( + paddle.reshape(logits_rtd, [-1, seq_length]), rtd_labels.astype(logits_rtd.dtype) + ) + if attention_mask is not None: + umask_positions = paddle.ones_like(rtd_labels).astype(paddle.get_default_dtype()) + mask_positions = paddle.zeros_like(rtd_labels).astype(paddle.get_default_dtype()) + umask_positions = paddle.where(attention_mask, umask_positions, mask_positions) + # Mask has different meanings here. It denotes [mask] token in + # generator and denotes [pad] token in discriminator. + disc_rtd_loss = paddle.where(attention_mask, disc_rtd_loss, mask_positions) + disc_rtd_loss = disc_rtd_loss.sum() / umask_positions.sum() + else: + total_positions = paddle.ones_like(rtd_labels).astype(paddle.get_default_dtype()) + disc_rtd_loss = disc_rtd_loss.sum() / total_positions.sum() + + # MTS discriminator loss + replaced_positions = discriminator_labels.astype("bool") + mts_labels = paddle.zeros([logits_mts.shape[0] * logits_mts.shape[1]], dtype=generator_labels.dtype).detach() + disc_mts_loss = self.disc_mts_loss_fct(paddle.reshape(logits_mts, [-1, logits_mts.shape[-1]]), mts_labels) + disc_mts_loss = paddle.reshape(disc_mts_loss, [-1, seq_length]) + original_positions = paddle.zeros_like(replaced_positions).astype(paddle.get_default_dtype()) + disc_mts_loss = paddle.where(replaced_positions, disc_mts_loss, original_positions) + if discriminator_labels.sum() == 0: + disc_mts_loss = paddle.to_tensor([0.0]) + else: + disc_mts_loss = disc_mts_loss.sum() / discriminator_labels.sum() + + # CSP discriminator loss + logits_csp = F.normalize(logits_csp, axis=-1) + # Gather from all devices (split first) + logit_csp_0, logit_csp_1 = paddle.split(logits_csp, num_or_sections=2, axis=0) + if paddle.distributed.get_world_size() > 1: + csp_list_0, csp_list_1 = [], [] + paddle.distributed.all_gather(csp_list_0, logit_csp_0) + paddle.distributed.all_gather(csp_list_1, logit_csp_1) + logit_csp_0 = paddle.concat(csp_list_0, axis=0) + logit_csp_1 = paddle.concat(csp_list_1, axis=0) + batch_size = logit_csp_0.shape[0] + logits_csp = paddle.concat([logit_csp_0, logit_csp_1], axis=0) + # Similarity matrix + logits_csp = paddle.matmul(logits_csp, logits_csp, transpose_y=True) + # Temperature scale + logits_csp = logits_csp / self.temperature + # Mask self-contrast + mask = -1e4 * paddle.eye(logits_csp.shape[0]) + logits_csp = logits_csp + mask + # Create labels for bundle + csp_labels = paddle.concat([paddle.arange(batch_size, 2 * batch_size), paddle.arange(batch_size)], axis=0) + # Calculate SimCLR loss + disc_csp_loss = self.disc_csp_loss_fct(logits_csp, csp_labels) + disc_csp_loss = disc_csp_loss.sum() / (batch_size * 2) + + loss = ( + self.gen_weight * gen_loss + + self.rtd_weight * disc_rtd_loss + + self.mts_weight * 
disc_mts_loss + + self.csp_weight * disc_csp_loss + ) + + return loss, gen_loss, disc_rtd_loss, disc_mts_loss, disc_csp_loss + + +class ElectraForQuestionAnswering(ElectraPretrainedModel): + """ + Electra Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`ElectraConfig`): + An instance of ElectraConfig used to construct ElectraForQuestionAnswering. + + """ + + def __init__(self, config: ElectraConfig): + super(ElectraForQuestionAnswering, self).__init__(config) + self.electra = ElectraModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ElectraForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ElectraModel`. + token_type_ids (Tensor, optional): + See :class:`ElectraModel`. + position_ids(Tensor, optional): + See :class:`ElectraModel`. + attention_mask (list, optional): + See :class:`ElectraModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ElectraForQuestionAnswering, ElectraTokenizer + + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + model = ElectraForQuestionAnswering.from_pretrained('electra-small') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + + """ + sequence_output = self.electra( + input_ids, + token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(sequence_output, type(input_ids)): + sequence_output = (sequence_output,) + + logits = self.classifier(sequence_output[0]) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + sequence_output[2:] + return tuple_output(output, total_loss) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=sequence_output.hidden_states, + attentions=sequence_output.attentions, + ) + + +# ElectraForMaskedLM is the same as ElectraGenerator +ElectraForMaskedLM = ElectraGenerator +ElectraForPretraining = ElectraForTotalPretraining diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/tokenizer.py new file mode 100644 index 000000000..417247e7d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/electra/tokenizer.py @@ -0,0 +1,309 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .. 
import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer + +__all__ = [ + "ElectraTokenizer", +] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "electra-small": 512, + "electra-base": 512, + "electra-large": 512, + "chinese-electra-base": 512, + "chinese-electra-small": 512, + "ernie-health-chinese": 512, +} + + +class ElectraTokenizer(PretrainedTokenizer): + """ + Constructs an Electra tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to `True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import ElectraTokenizer + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + tokens = tokenizer('He was a puppeteer') + ''' + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "electra-small": "https://bj.bcebos.com/paddlenlp/models/transformers/electra/electra-small-vocab.txt", + "electra-base": "https://bj.bcebos.com/paddlenlp/models/transformers/electra/electra-base-vocab.txt", + "electra-large": "https://bj.bcebos.com/paddlenlp/models/transformers/electra/electra-large-vocab.txt", + "chinese-electra-base": "http://bj.bcebos.com/paddlenlp/models/transformers/chinese-electra-base/vocab.txt", + "chinese-electra-small": "http://bj.bcebos.com/paddlenlp/models/transformers/chinese-electra-small/vocab.txt", + "ernie-health-chinese": "https://paddlenlp.bj.bcebos.com/models/transformers/ernie-health-chinese/vocab.txt", + } + } + pretrained_init_configuration = { + "electra-small": {"do_lower_case": True}, + "electra-base": {"do_lower_case": True}, + "electra-large": {"do_lower_case": True}, + "chinese-electra-base": {"do_lower_case": True}, + "chinese-electra-small": {"do_lower_case": True}, + "ernie-health-chinese": {"do_lower_case": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab._token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + """ + End-to-end tokenization for Electra models. + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) in a single string. 
Since + the usage of WordPiece introducing `##` to concat subwords, also remove + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import ElectraTokenizer + tokenizer = ElectraTokenizer.from_pretrained('electra-small') + tokens = tokenizer.tokenize('He was a puppeteer') + string = tokenizer.convert_tokens_to_string(tokens) + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. + + Returns: + int: Number of tokens added to sequences. + + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A ELECTRA sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A ELECTRA offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (:obj:`List[tuple]`): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (:obj:`List[tuple]`, `optional`): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A ELECTRA sequence pair mask has the following format: + :: + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). 
+ """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/README.md b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/README.md new file mode 100644 index 000000000..ee32cbf65 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/README.md @@ -0,0 +1 @@ +# ERNIE diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/configuration.py new file mode 100644 index 000000000..38ccc431f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/configuration.py @@ -0,0 +1,1291 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ERNIE model configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["ERNIE_PRETRAINED_INIT_CONFIGURATION", "ErnieConfig", "ERNIE_PRETRAINED_RESOURCE_FILES_MAP"] + +ERNIE_PRETRAINED_INIT_CONFIGURATION = { + # Deprecated, alias for ernie-1.0-base-zh + "ernie-1.0": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + }, + "ernie-1.0-base-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + }, + "ernie-1.0-base-zh-cw": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-1.0-large-zh-cw": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, # it is 3072 instead of 4096 + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + }, + "ernie-tiny": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 600, + "num_attention_heads": 16, + "num_hidden_layers": 3, + "type_vocab_size": 2, + "vocab_size": 50006, + "pad_token_id": 0, + }, + "ernie-2.0-base-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 18000, + }, + "ernie-2.0-large-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "intermediate_size": 4096, # special for large model + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 12800, + }, + "ernie-2.0-base-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + 
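The presets above are plain keyword dictionaries, so any named entry can be expanded directly into `ErnieConfig` (defined further down in this file). A minimal sketch, assuming the vendored copy is importable under the usual paddlenlp.transformers paths:

    from paddlenlp.transformers import ErnieConfig, ErnieModel
    from paddlenlp.transformers.ernie.configuration import (
        ERNIE_PRETRAINED_INIT_CONFIGURATION,
    )

    # Expand one of the named presets listed above into a config object and
    # build a randomly initialized backbone from it.
    config = ErnieConfig(**ERNIE_PRETRAINED_INIT_CONFIGURATION["ernie-2.0-base-en"])
    model = ErnieModel(config)  # 12 layers, hidden size 768, vocab size 30522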
"ernie-2.0-base-en-finetuned-squad": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "ernie-2.0-large-en": { + "attention_probs_dropout_prob": 0.1, + "intermediate_size": 4096, # special for ernie-2.0-large-en + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "rocketqa-zh-dureader-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + }, + "rocketqa-zh-dureader-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + }, + "rocketqa-v1-marco-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "rocketqa-v1-marco-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "rocketqa-zh-dureader-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 513, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18000, + "pad_token_id": 0, + }, + "rocketqa-v1-marco-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "ernie-3.0-xbase-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "intermediate_size": 4096, # special for large model + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 16, + "num_hidden_layers": 20, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-base-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + 
"type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-medium-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-mini-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-micro-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-nano-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-base-v1-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-medium-v1-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-mini-v1-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-micro-v1-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-nano-v1-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + 
"type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-base-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-medium-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-mini-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-micro-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-nano-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-base-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-base-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-medium-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-medium-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, 
+ "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-mini-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-mini-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-micro-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-micro-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-nano-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqa-zh-nano-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "rocketqav2-en-marco-cross-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "rocketqav2-en-marco-query-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "rocketqav2-en-marco-para-encoder": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + 
"uie-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-medium": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-mini": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-micro": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-nano": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-base-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "uie-senta-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-senta-medium": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-senta-mini": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-senta-micro": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 
0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-senta-nano": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-base-answer-extractor": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "uie-base-qa-filter": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 40000, + }, + "ernie-search-base-dual-encoder-marco-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "ernie-search-large-cross-encoder-marco-en": { + "attention_probs_dropout_prob": 0.1, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "ernie-3.0-tiny-base-v2-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-medium-v2-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-mini-v2-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-mini-v2-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 514, + "num_attention_heads": 12, + "num_hidden_layers": 6, + 
"type_vocab_size": 1, + "use_task_id": False, + "vocab_size": 50265, + }, + "ernie-3.0-tiny-micro-v2-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-nano-v2-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 40000, + }, + "ernie-3.0-tiny-pico-v2-zh": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 128, + "intermediate_size": 512, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 2, + "num_hidden_layers": 3, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 40000, + }, + "utc-large": { + "attention_probs_dropout_prob": 0.1, + "dtype": "float32", + "fuse": False, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 3072, # it is 3072 instead of 4096 + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "ernie", + "num_attention_heads": 16, + "pool_act": "tanh", + "num_hidden_layers": 24, + "task_type_vocab_size": 3, + "use_task_id": True, + "task_id": 0, + "type_vocab_size": 2, + "vocab_size": 17965, + "pad_token_id": 0, + }, + "utc-xbase": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "intermediate_size": 4096, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 16, + "num_hidden_layers": 20, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": True, + "vocab_size": 39981, + }, + "utc-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 39981, + }, + "utc-medium": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "intermediate_size": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 39981, + }, + "utc-mini": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 39981, + }, + "utc-micro": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 384, + "intermediate_size": 1536, + "initializer_range": 0.02, + 
"max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 39981, + }, + "utc-nano": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 312, + "intermediate_size": 1248, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 39981, + }, + "utc-pico": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 128, + "intermediate_size": 512, + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_attention_heads": 2, + "num_hidden_layers": 3, + "task_type_vocab_size": 16, + "type_vocab_size": 4, + "use_task_id": False, + "vocab_size": 39981, + }, +} + +ERNIE_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + # Deprecated, alias for ernie-1.0-base-zh + "ernie-1.0": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams", + "ernie-1.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams", + "ernie-1.0-base-zh-cw": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_base_zh_cw.pdparams", + "ernie-1.0-large-zh-cw": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_large_zh_cw.pdparams", + "ernie-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/ernie_tiny.pdparams", + "ernie-2.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_2.0/ernie_2.0_base_zh.pdparams", + "ernie-2.0-large-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_2.0/ernie_2.0_large_zh.pdparams", + "ernie-2.0-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/ernie_v2_eng_base.pdparams", + "ernie-2.0-base-en-finetuned-squad": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/ernie_v2_eng_base_finetuned_squad.pdparams", + "ernie-2.0-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_large/ernie_v2_eng_large.pdparams", + "rocketqa-zh-dureader-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa_zh_dureader_query_encoder.pdparams", + "rocketqa-zh-dureader-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa_zh_dureader_para_encoder.pdparams", + "rocketqa-v1-marco-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa_v1_marco_query_encoder.pdparams", + "rocketqa-v1-marco-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa_v1_marco_para_encoder.pdparams", + "rocketqa-zh-dureader-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa_zh_dureader_cross_encoder.pdparams", + "rocketqa-v1-marco-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa_v1_marco_cross_encoder.pdparams", + "ernie-3.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh.pdparams", + "ernie-3.0-xbase-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_xbase_zh.pdparams", + "ernie-3.0-medium-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh.pdparams", + "ernie-3.0-mini-zh": 
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh.pdparams", + "ernie-3.0-micro-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh.pdparams", + "ernie-3.0-nano-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh.pdparams", + "ernie-3.0-tiny-base-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh.pdparams", + "ernie-3.0-tiny-medium-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh.pdparams", + "ernie-3.0-tiny-mini-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh.pdparams", + "ernie-3.0-tiny-micro-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh.pdparams", + "ernie-3.0-tiny-nano-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh.pdparams", + "rocketqa-zh-base-query-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-base-query-encoder.pdparams", + "rocketqa-zh-base-para-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-base-para-encoder.pdparams", + "rocketqa-zh-medium-query-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-medium-query-encoder.pdparams", + "rocketqa-zh-medium-para-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-medium-para-encoder.pdparams", + "rocketqa-zh-mini-query-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-mini-query-encoder.pdparams", + "rocketqa-zh-mini-para-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-mini-para-encoder.pdparams", + "rocketqa-zh-micro-query-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-micro-query-encoder.pdparams", + "rocketqa-zh-micro-para-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-micro-para-encoder.pdparams", + "rocketqa-zh-nano-query-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-nano-query-encoder.pdparams", + "rocketqa-zh-nano-para-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-zh-nano-para-encoder.pdparams", + "rocketqa-base-cross-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-base-cross-encoder.pdparams", + "rocketqa-medium-cross-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-medium-cross-encoder.pdparams", + "rocketqa-mini-cross-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-mini-cross-encoder.pdparams", + "rocketqa-micro-cross-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-micro-cross-encoder.pdparams", + "rocketqa-nano-cross-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqa-nano-cross-encoder.pdparams", + "rocketqav2-en-marco-cross-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqav2_en_marco_cross_encoder.pdparams", + "rocketqav2-en-marco-query-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqav2_en_marco_query_encoder.pdparams", + "rocketqav2-en-marco-para-encoder": "https://paddlenlp.bj.bcebos.com/models/transformers/rocketqa/rocketqav2_en_marco_para_encoder.pdparams", + "uie-base": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_base.pdparams", + 
"uie-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_medium.pdparams", + "uie-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_mini.pdparams", + "uie-micro": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_micro.pdparams", + "uie-nano": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_nano.pdparams", + "uie-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_base_en.pdparams", + "uie-senta-base": "https://paddlenlp.bj.bcebos.com/models/transformers/uie/uie_senta_base.pdparams", + "uie-senta-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_senta_medium.pdparams", + "uie-senta-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_senta_mini.pdparams", + "uie-senta-micro": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_senta_micro.pdparams", + "uie-senta-nano": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_senta_nano.pdparams", + "uie-base-answer-extractor": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_base_answer_extractor.pdparams", + "uie-base-qa-filter": "https://bj.bcebos.com/paddlenlp/models/transformers/uie/uie_base_qa_filter.pdparams", + "ernie-search-base-dual-encoder-marco-en": "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_search/ernie_search_base_dual_encoder_marco_en.pdparams", + "ernie-search-large-cross-encoder-marco-en": "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_search/ernie_search_large_cross_encoder_marco_en.pdparams", + "ernie-3.0-tiny-base-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_base_v2.pdparams", + "ernie-3.0-tiny-medium-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_medium_v2.pdparams", + "ernie-3.0-tiny-mini-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_mini_v2.pdparams", + "ernie-3.0-tiny-mini-v2-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_mini_v2_en.pdparams", + "ernie-3.0-tiny-micro-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_micro_v2.pdparams", + "ernie-3.0-tiny-nano-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_nano_v2.pdparams", + "ernie-3.0-tiny-pico-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_pico_v2.pdparams", + "utc-large": "https://bj.bcebos.com/paddlenlp/models/transformers/utc/utc_large.pdparams", + "utc-xbase": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-xbase.pdparams", + "utc-base": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-base.pdparams", + "utc-medium": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-medium.pdparams", + "utc-micro": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-micro.pdparams", + "utc-mini": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-mini.pdparams", + "utc-nano": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-nano.pdparams", + "utc-pico": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc-pico.pdparams", + } +} + + +class ErnieConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieModel`]. It is used to + instantiate a ERNIE model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the ERNIE + ernie-3.0-medium-zh architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the ERNIE model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ErnieModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`ErnieModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ Examples: + ```python + >>> from paddlenlp.transformers import ErnieModel, ErnieConfig + >>> # Initializing a ERNIE ernie-3.0-medium-zhstyle configuration + >>> configuration = ErnieConfig() + >>> # Initializing a model from the style configuration + >>> model = ErnieModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "ernie" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + task_id=0, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + task_type_vocab_size: int = 3, + type_vocab_size: int = 16, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + fuse: bool = False, + layer_norm_eps=1e-12, + use_cache=False, + use_task_id=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.task_id = task_id + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.task_type_vocab_size = task_type_vocab_size + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.fuse = fuse + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.use_task_id = use_task_id diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/modeling.py new file mode 100644 index 000000000..0833a5a8b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/modeling.py @@ -0,0 +1,1381 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor + +# TODO(guosheng): update this workaround import for in_declarative_mode +from paddle.nn.layer.layers import in_declarative_mode + +from ...layers import Linear as TransposedLinear +from ...utils.env import CONFIG_NAME +from .. 
import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + ERNIE_PRETRAINED_INIT_CONFIGURATION, + ERNIE_PRETRAINED_RESOURCE_FILES_MAP, + ErnieConfig, +) + +__all__ = [ + "ErnieModel", + "ErniePretrainedModel", + "ErnieForSequenceClassification", + "ErnieForTokenClassification", + "ErnieForQuestionAnswering", + "ErnieForPretraining", + "ErniePretrainingCriterion", + "ErnieForMaskedLM", + "ErnieForMultipleChoice", + "UIE", + "UTC", +] + + +class ErnieEmbeddings(nn.Layer): + r""" + Include embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config: ErnieConfig, weight_attr): + super(ErnieEmbeddings, self).__init__() + + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id, weight_attr=weight_attr + ) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, weight_attr=weight_attr + ) + self.type_vocab_size = config.type_vocab_size + if self.type_vocab_size > 0: + self.token_type_embeddings = nn.Embedding( + config.type_vocab_size, config.hidden_size, weight_attr=weight_attr + ) + self.use_task_id = config.use_task_id + self.task_id = config.task_id + if self.use_task_id: + self.task_type_embeddings = nn.Embedding( + config.task_type_vocab_size, config.hidden_size, weight_attr=weight_attr + ) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + task_type_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values_length: int = 0, + ): + + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids) + + input_shape = inputs_embeds.shape[:-1] if in_declarative_mode() else inputs_embeds.shape[:-1] + + if position_ids is None: + # maybe need use shape op to unify static graph and dynamic graph + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + + if past_key_values_length > 0: + position_ids = position_ids + past_key_values_length + + position_ids.stop_gradient = True + + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + + if self.type_vocab_size > 0: + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = embeddings + token_type_embeddings + + if self.use_task_id: + if task_type_ids is None: + task_type_ids = paddle.ones(input_shape, dtype="int64") * self.task_id + task_type_embeddings = self.task_type_embeddings(task_type_ids) + embeddings = embeddings + task_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ErniePooler(nn.Layer): + def __init__(self, config: ErnieConfig, weight_attr): + super(ErniePooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size, weight_attr=weight_attr) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding 
+ # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErniePretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained ERNIE models. It provides ERNIE related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + + """ + + model_config_file = CONFIG_NAME + config_class = ErnieConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "ernie" + + pretrained_init_configuration = ERNIE_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIE_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class ErnieModel(ErniePretrainedModel): + r""" + The bare ERNIE Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfig used to construct ErnieModel + """ + + def __init__(self, config: ErnieConfig): + super(ErnieModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.TruncatedNormal(mean=0.0, std=self.initializer_range) + ) + self.embeddings = ErnieEmbeddings(config=config, weight_attr=weight_attr) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + weight_attr=weight_attr, + normalize_before=False, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = ErniePooler(config, weight_attr) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + task_type_ids: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. 
They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieModel, ErnieTokenizer + + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + model = ErnieModel.from_pretrained('ernie-1.0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + sequence_output, pooled_output = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + use_cache = use_cache if use_cache is not None else False + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + task_type_ids=task_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + src_mask=attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ErnieForSequenceClassification(ErniePretrainedModel): + r""" + Ernie Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfig used to construct ErnieForSequenceClassification. 
+ """ + + def __init__(self, config): + super(ErnieForSequenceClassification, self).__init__(config) + self.ernie = ErnieModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieModel`. + position_ids (Tensor, optional): + See :class:`ErnieModel`. + attention_mask (Tensor, optional): + See :class:`ErnieModel`. + inputs_embeds(Tensor, optional): + See :class:`ErnieModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer + + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + model = ErnieForSequenceClassification.from_pretrained('ernie-1.0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieForQuestionAnswering(ErniePretrainedModel): + """ + Ernie Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD. + + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfig used to construct ErnieForQuestionAnswering. + """ + + def __init__(self, config): + super(ErnieForQuestionAnswering, self).__init__(config) + self.ernie = ErnieModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieModel`. + position_ids (Tensor, optional): + See :class:`ErnieModel`. + attention_mask (Tensor, optional): + See :class:`ErnieModel`. + inputs_embeds(Tensor, optional): + See :class:`ErnieModel`. 
+ start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieForQuestionAnswering, ErnieTokenizer + + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + model = ErnieForQuestionAnswering.from_pretrained('ernie-1.0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class 
ErnieForTokenClassification(ErniePretrainedModel): + r""" + ERNIE Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfigused to construct ErnieForTokenClassification. + """ + + def __init__(self, config: ErnieConfig): + super(ErnieForTokenClassification, self).__init__(config) + self.ernie = ErnieModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieModel`. + position_ids (Tensor, optional): + See :class:`ErnieModel`. + attention_mask (Tensor, optional): + See :class:`ErnieModel`. + inputs_embeds(Tensor, optional): + See :class:`ErnieModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieForTokenClassification, ErnieTokenizer + + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + model = ErnieForTokenClassification.from_pretrained('ernie-1.0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieLMPredictionHead(nn.Layer): + r""" + Ernie Model with a `language modeling` head on top. + """ + + def __init__( + self, + config: ErnieConfig, + weight_attr=None, + ): + super(ErnieLMPredictionHead, self).__init__() + + self.transform = nn.Linear(config.hidden_size, config.hidden_size, weight_attr=weight_attr) + self.activation = getattr(nn.functional, config.hidden_act) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.decoder = TransposedLinear(config.hidden_size, config.vocab_size) + # link bias to load pretrained weights + self.decoder_bias = self.decoder.bias + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.decoder(hidden_states) + # hidden_states = paddle.tensor.matmul(hidden_states, self.decoder.weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +class ErniePretrainingHeads(nn.Layer): + def __init__( + self, + config: ErnieConfig, + weight_attr=None, + ): + super(ErniePretrainingHeads, self).__init__() + self.predictions = ErnieLMPredictionHead(config, weight_attr) + self.seq_relationship = nn.Linear(config.hidden_size, 2, weight_attr=weight_attr) + + def forward(self, sequence_output, pooled_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@dataclass +class ErnieForPreTrainingOutput(ModelOutput): + """ + Output type of [`ErnieForPreTraining`]. + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. 
+ prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + prediction_logits: paddle.Tensor = None + seq_relationship_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class ErnieForPretraining(ErniePretrainedModel): + r""" + Ernie Model with a `masked language modeling` head and a `sentence order prediction` head + on top. + + """ + + def __init__(self, config: ErnieConfig): + super(ErnieForPretraining, self).__init__(config) + self.ernie = ErnieModel(config) + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.TruncatedNormal(mean=0.0, std=self.ernie.initializer_range) + ) + self.cls = ErniePretrainingHeads( + config=config, + weight_attr=weight_attr, + ) + + self.tie_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + next_sentence_label: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieModel`. + position_ids (Tensor, optional): + See :class:`ErnieModel`. + attention_mask (Tensor, optional): + See :class:`ErnieModel`. + inputs_embeds(Tensor, optional): + See :class:`ErnieModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + next_sentence_label (Tensor of shape `(batch_size,)`, optional): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. 
+ output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.bert.ErnieForPreTrainingOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.bert.ErnieForPreTrainingOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.bert.ErnieForPreTrainingOutput`. + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + with paddle.static.amp.fp16_guard(): + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output, masked_positions) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + next_sentence_loss = loss_fct( + seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,)) + ) + total_loss = masked_lm_loss + next_sentence_loss + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return ErnieForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErniePretrainingCriterion(paddle.nn.Layer): + r""" + The loss output of Ernie Model during the pretraining: + a `masked language modeling` head and a `next sentence prediction (classification)` head. + + """ + + def __init__(self, with_nsp_loss=True): + super(ErniePretrainingCriterion, self).__init__() + self.with_nsp_loss = with_nsp_loss + # self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1) + + def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels=None): + """ + Args: + prediction_scores(Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size] + seq_relationship_score(Tensor): + The scores of next sentence prediction. Its data type should be float32 and + its shape is [batch_size, 2] + masked_lm_labels(Tensor): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64. If `masked_positions` is None, its shape is [batch_size, sequence_length, 1]. + Otherwise, its shape is [batch_size, mask_token_num, 1] + next_sentence_labels(Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. 
Its data type should be int64 and + its shape is [batch_size, 1] + + Returns: + Tensor: The pretraining loss, equals to the sum of `masked_lm_loss` plus the mean of `next_sentence_loss`. + Its data type should be float32 and its shape is [1]. + + """ + + with paddle.static.amp.fp16_guard(): + masked_lm_loss = F.cross_entropy(prediction_scores, masked_lm_labels, ignore_index=-1, reduction="none") + + if not self.with_nsp_loss: + return paddle.mean(masked_lm_loss) + + next_sentence_loss = F.cross_entropy(seq_relationship_score, next_sentence_labels, reduction="none") + return paddle.mean(masked_lm_loss), paddle.mean(next_sentence_loss) + + +class ErnieOnlyMLMHead(nn.Layer): + def __init__(self, config: ErnieConfig): + super().__init__() + self.predictions = ErnieLMPredictionHead(config=config) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +class ErnieForMaskedLM(ErniePretrainedModel): + """ + Ernie Model with a `masked language modeling` head on top. + + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfig used to construct ErnieForMaskedLM. + + """ + + def __init__(self, config: ErnieConfig): + super(ErnieForMaskedLM, self).__init__(config) + self.ernie = ErnieModel(config) + self.cls = ErnieOnlyMLMHead(config=config) + self.tie_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ErnieModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieModel`. + position_ids (Tensor, optional): + See :class:`ErnieModel`. + attention_mask (Tensor, optional): + See :class:`ErnieModel`. + masked_positions: + masked positions of output. + inputs_embeds(Tensor, optional): + See :class:`ErnieModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., vocab_size]` + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieForMaskedLM, ErnieTokenizer + + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + model = ErnieForMaskedLM.from_pretrained('ernie-1.0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 17, 18000] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions=masked_positions) + + masked_lm_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) + if masked_lm_loss is not None + else (output[0] if len(output) == 1 else output) + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieForMultipleChoice(ErniePretrainedModel): + """ + Ernie Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfig used to construct ErnieForMultipleChoice + """ + + def __init__(self, config: ErnieConfig): + super(ErnieForMultipleChoice, self).__init__(config) + self.ernie = ErnieModel(config) + self.num_choices = config.num_choices if config.num_choices is not None else 2 + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ErnieForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`ErnieModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`ErnieModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`ErnieModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`ErnieModel` and shape as [batch_size, num_choice, sequence_length]. + inputs_embeds(Tensor, optional): + See :class:`ErnieModel` and shape as [batch_size, num_choice, sequence_length, hidden_size]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. 
Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # input_ids: [bs, num_choice, seq_l] + if input_ids is not None: + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + if inputs_embeds is not None: + inputs_embeds = inputs_embeds.reshape(shape=(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class UIE(ErniePretrainedModel): + """ + Ernie Model with two linear layer on top of the hidden-states + output to compute `start_prob` and `end_prob`, + designed for Universal Information Extraction. + Args: + config (:class:`ErnieConfig`): + An instance of ErnieConfig used to construct UIE + """ + + def __init__(self, config: ErnieConfig): + super(UIE, self).__init__(config) + self.ernie = ErnieModel(config) + self.linear_start = paddle.nn.Linear(config.hidden_size, 1) + self.linear_end = paddle.nn.Linear(config.hidden_size, 1) + self.sigmoid = nn.Sigmoid() + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + return_dict: Optional[Tensor] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieModel`. 
+ token_type_ids (Tensor, optional): + See :class:`ErnieModel`. + position_ids (Tensor, optional): + See :class:`ErnieModel`. + attention_mask (Tensor, optional): + See :class:`ErnieModel`. + Example: + .. code-block:: + import paddle + from paddlenlp.transformers import UIE, ErnieTokenizer + tokenizer = ErnieTokenizer.from_pretrained('uie-base') + model = UIE.from_pretrained('uie-base') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + start_prob, end_prob = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + sequence_output, _ = self.ernie( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + return_dict=return_dict, + ) + start_logits = self.linear_start(sequence_output) + start_logits = paddle.squeeze(start_logits, -1) + start_prob = self.sigmoid(start_logits) + end_logits = self.linear_end(sequence_output) + end_logits = paddle.squeeze(end_logits, -1) + end_prob = self.sigmoid(end_logits) + return start_prob, end_prob + + +class UTC(ErniePretrainedModel): + """ + Ernie Model with two linear layer on the top of the hidden-states output to compute + probability of candidate labels, designed for Unified Tag Classification. + """ + + def __init__(self, config: ErnieConfig): + super(UTC, self).__init__(config) + self.ernie = ErnieModel(config) + self.predict_size = 64 + self.linear_q = paddle.nn.Linear(config.hidden_size, self.predict_size) + self.linear_k = paddle.nn.Linear(config.hidden_size, self.predict_size) + + def forward( + self, + input_ids, + token_type_ids, + position_ids, + attention_mask, + omask_positions, + cls_positions, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieModel`. + token_type_ids (Tensor): + See :class:`ErnieModel`. + position_ids (Tensor): + See :class:`ErnieModel`. + attention_mask (Tensor): + See :class:`ErnieModel`. + omask_positions (Tensor of shape `(batch_size, max_option)`): + Masked positions of [O-MASK] tokens padded with 0. + cls_positions (Tensor of shape `(batch_size)`): + Masked positions of the second [CLS] token. + labels (Tensor of shape `(num_labels_in_batch,)`, optional): + Labels for computing classification loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.ernie( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + batch_size, seq_len, hidden_size = sequence_output.shape + flat_sequence_output = paddle.reshape(sequence_output, [-1, hidden_size]) + flat_length = paddle.arange(batch_size) * seq_len + flat_length = flat_length.unsqueeze(axis=1).astype("int64") + + cls_output = paddle.tensor.gather(flat_sequence_output, cls_positions + flat_length.squeeze(1)) + q = self.linear_q(cls_output) + + option_output = paddle.tensor.gather(flat_sequence_output, paddle.reshape(omask_positions + flat_length, [-1])) + option_output = paddle.reshape(option_output, [batch_size, -1, hidden_size]) + k = self.linear_k(option_output) + + option_logits = paddle.matmul(q.unsqueeze(1), k, transpose_y=True).squeeze(1) + option_logits = option_logits / self.predict_size**0.5 + + if hasattr(paddle.framework, "_no_check_dy2st_diff"): + # TODO(wanghuancoder): _no_check_dy2st_diff is used to turn off the checking of behavior + # inconsistency between dynamic graph and static graph. _no_check_dy2st_diff should be + # removed after static graphs support inplace and stride. + with paddle.framework._no_check_dy2st_diff(): + for index, logit in enumerate(option_logits): + option_logits[index] -= (1 - (omask_positions[index] > 0).astype("float32")) * 1e12 + else: + for index, logit in enumerate(option_logits): + option_logits[index] -= (1 - (omask_positions[index] > 0).astype("float32")) * 1e12 + loss = None + if not return_dict: + output = (option_logits,) + if output_hidden_states: + output = output + (outputs.hidden_states,) + if output_attentions: + output = output + (output.attentions,) + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=option_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/static_to_dygraph_params/match_static_to_dygraph.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/static_to_dygraph_params/match_static_to_dygraph.py new file mode 100644 index 000000000..e355c6619 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/static_to_dygraph_params/match_static_to_dygraph.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import pickle + +import paddle + + +def match_embedding_param(convert_parameter_name_dict, static_para_prefix=""): + convert_parameter_name_dict["embeddings.word_embeddings.weight"] = static_para_prefix + "word_embedding" + convert_parameter_name_dict["embeddings.position_embeddings.weight"] = static_para_prefix + "pos_embedding" + convert_parameter_name_dict["embeddings.token_type_embeddings.weight"] = static_para_prefix + "sent_embedding" + convert_parameter_name_dict["embeddings.task_type_embeddings.weight"] = static_para_prefix + "task_embedding" + convert_parameter_name_dict["embeddings.layer_norm.weight"] = static_para_prefix + "pre_encoder_layer_norm_scale" + convert_parameter_name_dict["embeddings.layer_norm.bias"] = static_para_prefix + "pre_encoder_layer_norm_bias" + return convert_parameter_name_dict + + +def match_encoder_param(convert_parameter_name_dict, layer_num=4, static_para_prefix=""): + dygraph_proj_names = ["q", "k", "v", "out"] + static_proj_names = ["query", "key", "value", "output"] + dygraph_param_names = ["weight", "bias"] + static_param_names = ["w", "b"] + dygraph_layer_norm_param_names = ["weight", "bias"] + static_layer_norm_param_names = ["scale", "bias"] + + # Firstly, converts the multihead_attention to the parameter. + dygraph_format_name = "encoder.layers.{}.self_attn.{}_proj.{}" + static_format_name = static_para_prefix + "encoder_layer_{}_multi_head_att_{}_fc.{}_0" + for i in range(0, layer_num): + for dygraph_proj_name, static_proj_name in zip(dygraph_proj_names, static_proj_names): + for dygraph_param_name, static_param_name in zip(dygraph_param_names, static_param_names): + convert_parameter_name_dict[ + dygraph_format_name.format(i, dygraph_proj_name, dygraph_param_name) + ] = static_format_name.format(i, static_proj_name, static_param_name) + + # Secondly, converts the encoder ffn parameter. + dygraph_ffn_linear_format_name = "encoder.layers.{}.linear{}.{}" + static_ffn_linear_format_name = static_para_prefix + "encoder_layer_{}_ffn_fc_{}.{}_0" + for i in range(0, layer_num): + for j in range(0, 2): + for dygraph_param_name, static_param_name in zip(dygraph_param_names, static_param_names): + convert_parameter_name_dict[ + dygraph_ffn_linear_format_name.format(i, j + 1, dygraph_param_name) + ] = static_ffn_linear_format_name.format(i, j, static_param_name) + + # Thirdly, converts the multi_head layer_norm parameter. 
+ dygraph_encoder_attention_layer_norm_format_name = "encoder.layers.{}.norm1.{}" + static_encoder_attention_layer_norm_format_name = static_para_prefix + "encoder_layer_{}_post_att_layer_norm_{}" + for i in range(0, layer_num): + for dygraph_param_name, static_pararm_name in zip( + dygraph_layer_norm_param_names, static_layer_norm_param_names + ): + convert_parameter_name_dict[ + dygraph_encoder_attention_layer_norm_format_name.format(i, dygraph_param_name) + ] = static_encoder_attention_layer_norm_format_name.format(i, static_pararm_name) + + dygraph_encoder_ffn_layer_norm_format_name = "encoder.layers.{}.norm2.{}" + static_encoder_ffn_layer_norm_format_name = static_para_prefix + "encoder_layer_{}_post_ffn_layer_norm_{}" + for i in range(0, layer_num): + for dygraph_param_name, static_pararm_name in zip( + dygraph_layer_norm_param_names, static_layer_norm_param_names + ): + convert_parameter_name_dict[ + dygraph_encoder_ffn_layer_norm_format_name.format(i, dygraph_param_name) + ] = static_encoder_ffn_layer_norm_format_name.format(i, static_pararm_name) + return convert_parameter_name_dict + + +def match_pooler_parameter(convert_parameter_name_dict, static_para_prefix=""): + convert_parameter_name_dict["pooler.dense.weight"] = static_para_prefix + "pooled_fc.w_0" + convert_parameter_name_dict["pooler.dense.bias"] = static_para_prefix + "pooled_fc.b_0" + return convert_parameter_name_dict + + +def match_mlm_parameter(convert_parameter_name_dict, static_para_prefix=""): + # convert_parameter_name_dict["cls.predictions.decoder_weight"] = "word_embedding" + convert_parameter_name_dict["cls.predictions.decoder_bias"] = static_para_prefix + "mask_lm_out_fc.b_0" + convert_parameter_name_dict["cls.predictions.transform.weight"] = static_para_prefix + "mask_lm_trans_fc.w_0" + convert_parameter_name_dict["cls.predictions.transform.bias"] = static_para_prefix + "mask_lm_trans_fc.b_0" + convert_parameter_name_dict["cls.predictions.layer_norm.weight"] = ( + static_para_prefix + "mask_lm_trans_layer_norm_scale" + ) + convert_parameter_name_dict["cls.predictions.layer_norm.bias"] = ( + static_para_prefix + "mask_lm_trans_layer_norm_bias" + ) + return convert_parameter_name_dict + + +def match_last_fc_parameter(convert_parameter_name_dict, static_para_prefix=""): + convert_parameter_name_dict["classifier.weight"] = "_cls_out_w" + convert_parameter_name_dict["classifier.bias"] = "_cls_out_b" + return convert_parameter_name_dict + + +def convert_static_to_dygraph_params( + dygraph_params_save_path, static_params_dir, static_to_dygraph_param_name, model_name="static" +): + files = os.listdir(static_params_dir) + + state_dict = {} + model_name = model_name + for name in files: + path = os.path.join(static_params_dir, name) + # static_para_name = name.replace('@HUB_chinese-roberta-wwm-ext-large@', + # '') # for hub module params + static_para_name = name.replace(".npy", "") + if static_para_name not in static_to_dygraph_param_name: + print(static_para_name, "not in static_to_dygraph_param_name") + continue + dygraph_para_name = static_to_dygraph_param_name[static_para_name] + value = paddle.load(path).numpy() + if "cls" in dygraph_para_name or "classifier" in dygraph_para_name: + # Note: cls.predictions parameters do not need add `model_name.` prefix + state_dict[dygraph_para_name] = value + else: + state_dict[model_name + "." 
+ dygraph_para_name] = value + + with open(dygraph_params_save_path, "wb") as f: + pickle.dump(state_dict, f) + params = paddle.load(dygraph_params_save_path) + + for name in state_dict.keys(): + if name in params: + assert (state_dict[name] == params[name].numpy()).all() + else: + print(name, "not in params") + + +if __name__ == "__main__": + convert_parameter_name_dict = {} + + convert_parameter_name_dict = match_embedding_param(convert_parameter_name_dict) + convert_parameter_name_dict = match_encoder_param(convert_parameter_name_dict, layer_num=12) + convert_parameter_name_dict = match_pooler_parameter(convert_parameter_name_dict) + convert_parameter_name_dict = match_mlm_parameter(convert_parameter_name_dict) + + static_to_dygraph_param_name = {value: key for key, value in convert_parameter_name_dict.items()} + + for static_name, dygraph_name in static_to_dygraph_param_name.items(): + print("{}:-------:{}".format(static_name, dygraph_name)) + + convert_static_to_dygraph_params( + dygraph_params_save_path="./dygraph_model/ernie_v1_chn_base.pdparams", + static_params_dir="./ernie1.0_numpy/", + static_to_dygraph_param_name=static_to_dygraph_param_name, + model_name="ernie", + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/tokenizer.py new file mode 100644 index 000000000..63205a42b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie/tokenizer.py @@ -0,0 +1,918 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import pickle +import shutil + +import sentencepiece as spm +import six + +from paddlenlp.utils.env import MODEL_HOME +from paddlenlp.utils.log import logger + +from .. 
import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer + +__all__ = ["ErnieTokenizer", "ErnieTinyTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "ernie-1.0": 513, + "ernie-1.0-base-zh": 513, + "ernie-1.0-base-zh-cw": 512, + "ernie-1.0-large-zh-cw": 512, + "ernie-tiny": 600, + "ernie-2.0-base-zh": 513, + "ernie-2.0-large-zh": 512, + "ernie-2.0-base-en": 512, + "ernie-2.0-base-en-finetuned-squad": 512, + "ernie-2.0-large-en": 512, + "ernie-gen-base-en": 1024, + "ernie-gen-large-en": 1024, + "ernie-gen-large-en-430g": 1024, + "rocketqa-zh-dureader-query-encoder": 513, + "rocketqa-zh-dureader-para-encoder": 513, + "rocketqa-v1-marco-query-encoder": 512, + "rocketqa-v1-marco-para-encoder": 512, + "rocketqa-zh-dureader-cross-encoder": 513, + "rocketqa-v1-marco-cross-encoder": 512, + "ernie-3.0-base-zh": 2048, + "ernie-3.0-xbase-zh": 2048, + "ernie-3.0-medium-zh": 2048, + "ernie-3.0-mini-zh": 2048, + "ernie-3.0-micro-zh": 2048, + "ernie-3.0-nano-zh": 2048, + "ernie-3.0-tiny-base-v1-zh": 2048, + "ernie-3.0-tiny-medium-v1-zh": 2048, + "ernie-3.0-tiny-mini-v1-zh": 2048, + "ernie-3.0-tiny-micro-v1-zh": 2048, + "ernie-3.0-tiny-nano-v1-zh": 2048, + "rocketqa-zh-base-query-encoder": 2048, + "rocketqa-zh-base-para-encoder": 2048, + "rocketqa-zh-medium-query-encoder": 2048, + "rocketqa-zh-medium-para-encoder": 2048, + "rocketqa-zh-mini-query-encoder": 2048, + "rocketqa-zh-mini-para-encoder": 2048, + "rocketqa-zh-micro-query-encoder": 2048, + "rocketqa-zh-micro-para-encoder": 2048, + "rocketqa-zh-nano-query-encoder": 2048, + "rocketqa-zh-nano-para-encoder": 2048, + "rocketqa-base-cross-encoder": 2048, + "rocketqa-medium-cross-encoder": 2048, + "rocketqa-mini-cross-encoder": 2048, + "rocketqa-micro-cross-encoder": 2048, + "rocketqa-nano-cross-encoder": 2048, + "rocketqav2-en-marco-cross-encoder": 512, + "rocketqav2-en-marco-query-encoder": 512, + "rocketqav2-en-marco-para-encoder": 512, + "uie-base": 512, + "uie-medium": 512, + "uie-mini": 512, + "uie-micro": 512, + "uie-nano": 512, + "uie-base-en": 512, + "uie-senta-base": 512, + "uie-senta-medium": 512, + "uie-senta-mini": 512, + "uie-senta-micro": 512, + "uie-senta-nano": 512, + "uie-base-answer-extractor": 512, + "uie-base-qa-filter": 512, + "ernie-search-base-dual-encoder-marco-en": 512, + "ernie-search-large-cross-encoder-marco-en": 512, + "ernie-3.0-tiny-base-v2-zh": 2048, + "ernie-3.0-tiny-medium-v2-zh": 2048, + "ernie-3.0-tiny-mini-v2-zh": 2048, + "ernie-3.0-tiny-mini-v2-en": 514, + "ernie-3.0-tiny-micro-v2-zh": 2048, + "ernie-3.0-tiny-nano-v2-zh": 2048, + "ernie-3.0-tiny-pico-v2-zh": 2048, + "utc-large": 512, + "utc-xbase": 2048, + "utc-base": 2048, + "utc-medium": 2048, + "utc-mini": 2048, + "utc-micro": 2048, + "utc-nano": 2048, + "utc-pico": 2048, +} + + +class ErnieTokenizer(PretrainedTokenizer): + r""" + Constructs an ERNIE tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. 
+ unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` in order to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieTokenizer + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs: + # { 'input_ids': [1, 4444, 4385, 1545, 6712, 10062, 9568, 9756, 9500, 2], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + # } + + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + # Deprecated, alias for ernie-1.0-base-zh + "ernie-1.0": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt", + "ernie-1.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt", + "ernie-1.0-base-zh-cw": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_base_zh_cw_vocab.txt", + "ernie-1.0-large-zh-cw": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt", + "ernie-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/vocab.txt", + "ernie-2.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_2.0/ernie_2.0_base_zh_vocab.txt", + "ernie-2.0-large-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_2.0/ernie_2.0_large_zh_vocab.txt", + "ernie-2.0-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "ernie-2.0-base-en-finetuned-squad": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "ernie-2.0-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_large/vocab.txt", + "ernie-gen-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gen-base-en/vocab.txt", + "ernie-gen-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gen-large/vocab.txt", + "ernie-gen-large-en-430g": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gen-large-430g/vocab.txt", + "rocketqa-zh-dureader-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa-zh-dureader-vocab.txt", + "rocketqa-zh-dureader-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa-zh-dureader-vocab.txt", + "rocketqa-v1-marco-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa-v1-marco-vocab.txt", + "rocketqa-v1-marco-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa-v1-marco-vocab.txt", + "rocketqa-zh-dureader-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa-zh-dureader-vocab.txt", + "rocketqa-v1-marco-cross-encoder": 
"https://bj.bcebos.com/paddlenlp/models/transformers/rocketqa/rocketqa-v1-marco-vocab.txt", + "ernie-3.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "ernie-3.0-xbase-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_xbase_zh_vocab.txt", + "ernie-3.0-medium-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "ernie-3.0-mini-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "ernie-3.0-micro-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "ernie-3.0-nano-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "ernie-3.0-tiny-base-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "ernie-3.0-tiny-medium-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "ernie-3.0-tiny-mini-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "ernie-3.0-tiny-micro-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "ernie-3.0-tiny-nano-v1-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "rocketqa-zh-base-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "rocketqa-zh-base-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "rocketqa-zh-medium-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "rocketqa-zh-medium-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "rocketqa-zh-mini-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "rocketqa-zh-mini-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "rocketqa-zh-micro-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "rocketqa-zh-micro-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "rocketqa-zh-nano-query-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "rocketqa-zh-nano-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "rocketqa-base-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "rocketqa-medium-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "rocketqa-mini-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "rocketqa-micro-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "rocketqa-nano-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "rocketqav2-en-marco-cross-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "rocketqav2-en-marco-query-encoder": 
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "rocketqav2-en-marco-para-encoder": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "uie-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "uie-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "uie-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "uie-micro": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "uie-nano": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "uie-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "uie-senta-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "uie-senta-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_medium_zh_vocab.txt", + "uie-senta-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_mini_zh_vocab.txt", + "uie-senta-micro": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_micro_zh_vocab.txt", + "uie-senta-nano": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_nano_zh_vocab.txt", + "uie-base-answer-extractor": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "uie-base-qa-filter": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_base_zh_vocab.txt", + "ernie-search-base-dual-encoder-marco-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_base/vocab.txt", + "ernie-search-large-cross-encoder-marco-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_v2_large/vocab.txt", + "ernie-3.0-tiny-base-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_base_v2_vocab.txt", + "ernie-3.0-tiny-medium-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_medium_v2_vocab.txt", + "ernie-3.0-tiny-mini-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_mini_v2_vocab.txt", + "ernie-3.0-tiny-mini-v2-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_mini_v2_en_vocab.txt", + "ernie-3.0-tiny-micro-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_micro_v2_vocab.txt", + "ernie-3.0-tiny-nano-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_nano_v2_vocab.txt", + "ernie-3.0-tiny-pico-v2-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_3.0/ernie_3.0_tiny_pico_v2_vocab.txt", + "utc-large": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_large_vocab.txt", + "utc-xbase": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_xbase_vocab.txt", + "utc-base": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_base_vocab.txt", + "utc-medium": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_medium_vocab.txt", + "utc-mini": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_mini_vocab.txt", + "utc-micro": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_micro_vocab.txt", + "utc-nano": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_nano_vocab.txt", + "utc-pico": "https://paddlenlp.bj.bcebos.com/models/transformers/utc/utc_pico_vocab.txt", 
+ } + } + pretrained_init_configuration = { + "ernie-1.0": {"do_lower_case": True}, + "ernie-1.0-base-zh": {"do_lower_case": True}, + "ernie-1.0-base-zh-cw": {"do_lower_case": True}, + "ernie-1.0-large-zh-cw": {"do_lower_case": True}, + "ernie-tiny": {"do_lower_case": True}, + "ernie-2.0-base-zh": {"do_lower_case": True}, + "ernie-2.0-large-zh": {"do_lower_case": True}, + "ernie-2.0-base-en": {"do_lower_case": True}, + "ernie-2.0-base-en-finetuned-squad": {"do_lower_case": True}, + "ernie-2.0-large-en": {"do_lower_case": True}, + "ernie-gen-base-en": {"do_lower_case": True}, + "ernie-gen-large-en": {"do_lower_case": True}, + "ernie-gen-large-en-430g": {"do_lower_case": True}, + "rocketqa-zh-dureader-query-encoder": {"do_lower_case": True}, + "rocketqa-zh-dureader-para-encoder": {"do_lower_case": True}, + "rocketqa-v1-marco-query-encoder": {"do_lower_case": True}, + "rocketqa-v1-marco-para-encoder": {"do_lower_case": True}, + "rocketqa-zh-dureader-cross-encoder": {"do_lower_case": True}, + "rocketqa-v1-marco-cross-encoder": {"do_lower_case": True}, + "ernie-3.0-base-zh": {"do_lower_case": True}, + "ernie-3.0-xbase-zh": {"do_lower_case": True}, + "ernie-3.0-medium-zh": {"do_lower_case": True}, + "ernie-3.0-mini-zh": {"do_lower_case": True}, + "ernie-3.0-micro-zh": {"do_lower_case": True}, + "ernie-3.0-nano-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-base-v1-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-medium-v1-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-mini-v1-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-micro-v1-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-nano-v1-zh": {"do_lower_case": True}, + "rocketqa-zh-base-query-encoder": {"do_lower_case": True}, + "rocketqa-zh-base-para-encoder": {"do_lower_case": True}, + "rocketqa-zh-medium-query-encoder": {"do_lower_case": True}, + "rocketqa-zh-medium-para-encoder": {"do_lower_case": True}, + "rocketqa-zh-mini-query-encoder": {"do_lower_case": True}, + "rocketqa-zh-mini-para-encoder": {"do_lower_case": True}, + "rocketqa-zh-micro-query-encoder": {"do_lower_case": True}, + "rocketqa-zh-micro-para-encoder": {"do_lower_case": True}, + "rocketqa-zh-nano-query-encoder": {"do_lower_case": True}, + "rocketqa-zh-nano-para-encoder": {"do_lower_case": True}, + "rocketqa-base-cross-encoder": {"do_lower_case": True}, + "rocketqa-medium-cross-encoder": {"do_lower_case": True}, + "rocketqa-mini-cross-encoder": {"do_lower_case": True}, + "rocketqa-micro-cross-encoder": {"do_lower_case": True}, + "rocketqa-nano-cross-encoder": {"do_lower_case": True}, + "rocketqav2-en-marco-cross-encoder": {"do_lower_case": True}, + "rocketqav2-en-marco-query-encoder": {"do_lower_case": True}, + "rocketqav2-en-marco-para-encoder": {"do_lower_case": True}, + "uie-base": {"do_lower_case": True}, + "uie-medium": {"do_lower_case": True}, + "uie-mini": {"do_lower_case": True}, + "uie-micro": {"do_lower_case": True}, + "uie-nano": {"do_lower_case": True}, + "uie-base-en": {"do_lower_case": True}, + "uie-senta-base": {"do_lower_case": True}, + "uie-senta-medium": {"do_lower_case": True}, + "uie-senta-mini": {"do_lower_case": True}, + "uie-senta-micro": {"do_lower_case": True}, + "uie-senta-nano": {"do_lower_case": True}, + "uie-base-answer-extractor": {"do_lower_case": True}, + "uie-base-qa-filter": {"do_lower_case": True}, + "ernie-search-base-dual-encoder-marco-en": {"do_lower_case": True}, + "ernie-search-large-cross-encoder-marco-en": {"do_lower_case": True}, + "ernie-3.0-tiny-base-v2-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-medium-v2-zh": 
{"do_lower_case": True}, + "ernie-3.0-tiny-mini-v2-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-mini-v2-en": {"do_lower_case": True}, + "ernie-3.0-tiny-micro-v2-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-nano-v2-zh": {"do_lower_case": True}, + "ernie-3.0-tiny-pico-v2-zh": {"do_lower_case": True}, + "utc-large": {"do_lower_case": True}, + "utc-xbase": {"do_lower_case": True}, + "utc-base": {"do_lower_case": True}, + "utc-medium": {"do_lower_case": True}, + "utc-mini": {"do_lower_case": True}, + "utc-micro": {"do_lower_case": True}, + "utc-nano": {"do_lower_case": True}, + "utc-pico": {"do_lower_case": True}, + } + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = ErnieTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def extend_chinese_char(self): + """ + For, char level model such as ERNIE, we need add ## chinese token + to demonstrate the segment information. + """ + vocab_set = set(self.vocab.token_to_idx.keys()) + extend_list = [] + for i in range(len(self.vocab)): + if i not in self.vocab.idx_to_token: + continue + w = self.vocab.idx_to_token[i] + # Chose chinese char in [0x4E00, Ox9FA5], and try add ## char to vocab. + if len(w) == 1 and ord(w) >= 0x4E00 and ord(w) <= 0x9FA5: + new_char = "##" + w + if new_char not in vocab_set: + extend_list.append(new_char) + if len(self.vocab) + len(extend_list) > 2**16: + logger.warnings("The vocab size is larger than uint16") + new_tokens = [str(tok) for tok in extend_list] + + tokens_to_add = [] + for token in new_tokens: + if not isinstance(token, str): + raise TypeError(f"Token {token} is not a string but a {type(token)}.") + if hasattr(self, "do_lower_case") and self.do_lower_case: + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in tokens_to_add + ): + tokens_to_add.append(token) + + if self.verbose: + print(f"Adding {len(tokens_to_add)} ## chinese tokens to the vocabulary") + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.added_tokens_decoder.update(added_tok_decoder) + + def get_vocab(self): + return dict(self.vocab._token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + r""" + End-to-end tokenization for ERNIE models. + + Args: + text (str): The text to be tokenized. + + Returns: + List[str]: A list of string representing converted tokens. 
+ """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_string(self, tokens): + r""" + Converts a sequence of tokens (list of string) in a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also remove + `##` when converting. + + Args: + tokens (List[str]): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieTokenizer + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + + tokens = tokenizer.tokenize('He was a puppeteer') + strings = tokenizer.convert_tokens_to_string(tokens) + #he was a puppeteer + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + r""" + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. + Do not put this inside your training loop. + + Args: + pair (bool, optional): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + An Ernie sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + An ERNIE offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. + Defaults to `None`. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + r""" + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
+ + A ERNIE sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + r""" + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): + List of ids of the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + already_has_special_tokens (str, optional): + Whether or not the token list is already formatted with special tokens for the model. + Defaults to `False`. + Returns: + List[int]: + The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + +class ErnieTinyTokenizer(PretrainedTokenizer): + r""" + Constructs a ErnieTiny tokenizer. It uses the `dict.wordseg.pickle` cut the text to words, and + use the `sentencepiece` tools to cut the words to sub-words. + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieTokenizer + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs: + # { 'input_ids': [1, 4444, 4385, 1545, 6712, 10062, 9568, 9756, 9500, 2], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + # } + + Args: + vocab_file (str): + The file path of the vocabulary. + sentencepiece_model_file (str): + The file path of sentencepiece model. + word_dict(str): + The file path of word vocabulary, which is used to do chinese word segmentation. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". 
+ mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieTinyTokenizer + tokenizer = ErnieTinyTokenizer.from_pretrained('ernie-tiny') + inputs = tokenizer('He was a puppeteer') + ''' + {'input_ids': [3, 941, 977, 16690, 269, 11346, 11364, 1337, 13742, 1684, 5], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + """ + resource_files_names = { + "sentencepiece_model_file": "spm_cased_simp_sampled.model", + "vocab_file": "vocab.txt", + "word_dict": "dict.wordseg.pickle", + } # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": {"ernie-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/vocab.txt"}, + "sentencepiece_model_file": { + "ernie-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/spm_cased_simp_sampled.model" + }, + "word_dict": { + "ernie-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/dict.wordseg.pickle" + }, + } + pretrained_init_configuration = {"ernie-tiny": {"do_lower_case": True}} + + def __init__( + self, + vocab_file, + sentencepiece_model_file, + word_dict, + do_lower_case=True, + encoding="utf8", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + self.sp_model = spm.SentencePieceProcessor() + self.word_dict = word_dict + + self.do_lower_case = do_lower_case + self.encoding = encoding + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = ErnieTinyTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + if not os.path.isfile(word_dict): + raise ValueError( + "Can't find a file at path '{}'. To load the " + "word dict from a pretrained model please use " + "`tokenizer = ErnieTinyTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(word_dict) + ) + self.dict = pickle.load(open(word_dict, "rb")) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + + # if the sentencepiece_model_file is not exists, just the default sentence-piece model + if os.path.isfile(sentencepiece_model_file): + self.sp_model.Load(sentencepiece_model_file) + + @property + def vocab_size(self): + r""" + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def cut(self, chars): + words = [] + idx = 0 + window_size = 5 + while idx < len(chars): + matched = False + + for i in range(window_size, 0, -1): + cand = chars[idx : idx + i] + if cand in self.dict: + words.append(cand) + matched = True + break + if not matched: + i = 1 + words.append(chars[idx]) + idx += i + return words + + def _tokenize(self, text): + r""" + End-to-end tokenization for ErnieTiny models. + + Args: + text (str): + The text to be tokenized. + + Returns: + List(str): + A list of string representing converted tokens. 
+ """ + if len(text) == 0: + return [] + if not isinstance(text, six.string_types): + text = text.decode(self.encoding) + + text = [s for s in self.cut(text) if s != " "] + text = " ".join(text) + text = text.lower() + + tokens = self.sp_model.EncodeAsPieces(text) + in_vocab_tokens = [] + unk_token = self.vocab.unk_token + for token in tokens: + if token in self.vocab: + in_vocab_tokens.append(token) + else: + in_vocab_tokens.append(unk_token) + return in_vocab_tokens + + def convert_tokens_to_string(self, tokens): + r""" + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieTinyTokenizer + tokenizer = ErnieTinyTokenizer.from_pretrained('ernie-tiny') + inputs = tokenizer.tokenize('He was a puppeteer') + #['▁h', '▁e', '▁was', '▁a', '▁pu', 'pp', 'e', '▁te', 'er'] + strings = tokenizer.convert_tokens_to_string(tokens) + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def save_resources(self, save_directory): + r""" + Save tokenizer related resources to files under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + # TODO: make the name 'ernie-tiny' as a variable + source_path = os.path.join(MODEL_HOME, "ernie-tiny", file_name) + save_path = os.path.join(save_directory, self.resource_files_names[name]) + + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + def num_special_tokens_to_add(self, pair=False): + r""" + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this + inside your training loop. + + Args: + pair (bool, optional): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + An ERNIE sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. 
+ + An ERNIE offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. + Defaults to `None`. + + Returns: + List[tuple]: List of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + r""" + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A ERNIE sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + r""" + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + List of ids of the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + already_has_special_tokens (str, optional): + Whether or not the token list is already formatted with special tokens for the model. + Defaults to `False`. + + Returns: + List[int]: + The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/configuration.py new file mode 100644 index 000000000..e225dec51 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/configuration.py @@ -0,0 +1,198 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ErnieCode model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["ERNIECODE_PRETRAINED_INIT_CONFIGURATION", "ErnieCodeConfig", "ERNIECODE_PRETRAINED_RESOURCE_FILES_MAP"] + +ERNIECODE_PRETRAINED_INIT_CONFIGURATION = { + "ernie-code-base": { + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "enable_recompute": False, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": True, + "is_gated_act": True, + "layer_norm_epsilon": 1e-06, + "model_type": "ErnieCode", + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": True, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": False, + "tokenizer_class": "ErnieCodeTokenizer", + "transformers_version": "4.20.1", + "use_cache": True, + "vocab_size": 250105, + }, + "ernie-code-base-L512": { + "d_ff": 2048, + "d_kv": 64, + "d_model": 768, + "decoder_start_token_id": 0, + "dense_act_fn": "gelu_new", + "dropout_rate": 0.1, + "enable_recompute": False, + "eos_token_id": 1, + "feed_forward_proj": "gated-gelu", + "initializer_factor": 1.0, + "is_encoder_decoder": True, + "is_gated_act": True, + "layer_norm_epsilon": 1e-06, + "model_type": "ErnieCode", + "num_decoder_layers": 12, + "num_heads": 12, + "num_layers": 12, + "output_past": True, + "pad_token_id": 0, + "relative_attention_max_distance": 128, + "relative_attention_num_buckets": 32, + "tie_word_embeddings": False, + "tokenizer_class": "ErnieCodeTokenizer", + "transformers_version": "4.20.1", + "use_cache": True, + "vocab_size": 250105, + }, +} + +ERNIECODE_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ernie-code-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-code/ernie-code-base/model_state.pdparams", + "ernie-code-base-L512": 
"https://bj.bcebos.com/paddlenlp/models/transformers/ernie-code/ernie-code-base-L512/model_state.pdparams", + } +} + + +class ErnieCodeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieCodeModel`]. It is used to + instantiate a bert model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 250112): + Vocabulary size of the ErnieCode model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ErnieCodeModel`]. + d_model (`int`, *optional*, defaults to 512): + Size of the encoder layers and the pooler layer. + d_kv (`int`, *optional*, defaults to 64): + Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // + num_heads`. + d_ff (`int`, *optional*, defaults to 1024): + Size of the intermediate feed forward layer in each `ErnieCodeBlock`. + num_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the Transformer encoder. + num_decoder_layers (`int`, *optional*): + Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set. + num_heads (`int`, *optional*, defaults to 6): + Number of attention heads for each attention layer in the Transformer encoder. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + relative_attention_max_distance (`int`, *optional*, defaults to 128): + The maximum distance of the longer sequences for the bucket separation. + dropout_rate (`float`, *optional*, defaults to 0.1): + The ratio for all dropout layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`): + he non-linear activation function (function or string) in the feed forward layer in the residual attention block. + If string, `"relu"`, `"gated-gelu"` are supported. Defaults to `"gated-gelu"`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + pad_token_id (int, optional): + The id of the `padding` token. Defaults to `0`. + bos_token_id (int, optional): + The id of the `bos` token. Defaults to `0`. + eos_token_id (int, optional): + The id of the `eos` token. Defaults to `1`. + enable_recompute (bool, optional): + Whether to recompute cache. 
+ + """ + model_type = "ErnieCode" + attribute_map: Dict[str, str] = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "num_classes": "num_labels", + } + pretrained_init_configuration = ERNIECODE_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 250112, + d_model: int = 512, + d_kv: int = 64, + d_ff: int = 1024, + num_layers: int = 8, + num_decoder_layers: int = None, + num_heads: int = 6, + relative_attention_num_buckets: int = 32, + relative_attention_max_distance: int = 128, + dropout_rate: float = 0.1, + layer_norm_epsilon: float = 1e-6, + initializer_factor: float = 1.0, + feed_forward_proj: str = "gated-gelu", + is_encoder_decoder: bool = True, + use_cache: bool = True, + bos_token_id: int = 0, + pad_token_id: int = 0, + eos_token_id: int = 1, + enable_recompute: bool = False, + **kwargs + ): + + super().__init__( + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + self.enable_recompute = enable_recompute + self.vocab_size = vocab_size + self.d_model = d_model + self.d_kv = d_kv + self.d_ff = d_ff + self.num_layers = num_layers + self.num_decoder_layers = ( + num_decoder_layers if num_decoder_layers is not None else self.num_layers + ) # default = symmetry + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_factor = initializer_factor + self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/modeling.py new file mode 100644 index 000000000..d83e1423b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/modeling.py @@ -0,0 +1,1751 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 Baidu ErnieCode Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import copy +import math +from typing import Optional, Tuple + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.distributed.fleet.utils import recompute + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from ...utils.log import logger +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + convert_encoder_output, +) +from ..model_utils import PretrainedModel, register_base_model +from .configuration import ( + ERNIECODE_PRETRAINED_INIT_CONFIGURATION, + ERNIECODE_PRETRAINED_RESOURCE_FILES_MAP, + ErnieCodeConfig, +) + +__all__ = [ + "ErnieCodeModel", + "ErnieCodePretrainedModel", + "ErnieCodeForConditionalGeneration", + "ErnieCodeEncoderModel", + "ERNIECODE_PRETRAINED_MODEL_ARCHIVE_LIST", +] + +ERNIECODE_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "ernie-code-base", + "ernie-code-base-L512", +] + +DATA_TYPE_MAP = { + paddle.int64: "int64", + paddle.int32: "int32", + paddle.float32: "float32", + paddle.float64: "float64", + paddle.float16: "float16", +} + + +def data_type_converter(tensor): + return DATA_TYPE_MAP[tensor.dtype] + + +def finfo(dtype): + if dtype == paddle.float32: + return np.finfo(np.float32) + if dtype == paddle.float16: + return np.finfo(np.float16) + if dtype == paddle.float64: + return np.finfo(np.float64) + + +class ErnieCodeLayerNorm(nn.Layer): + """ + Construct a layernorm module in the ErnieCode style No bias and no subtraction of mean. + """ + + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # layer norm should always be calculated in float32 + variance = paddle.pow(hidden_states.astype(paddle.float32), 2).mean(axis=-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + + # convert into float16 if necessary + if self.weight.dtype == paddle.float16: + hidden_states = hidden_states.astype(paddle.float16) + return self.weight * hidden_states + + +class ErnieCodeDenseReluDense(nn.Layer): + """ + Construct a dense-relu-dense module. + """ + + def __init__(self, config: ErnieCodeConfig): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias_attr=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = F.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class ErnieCodeDenseGatedGeluDense(nn.Layer): + """ + Construct a dense-gated_gelu-dense module. 
+ """ + + def __init__(self, config: ErnieCodeConfig): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias_attr=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class ErnieCodeDenseGatedSiluDense(nn.Layer): + """ + Construct a dense-gated_gelu-dense module. + """ + + def __init__(self, config: ErnieCodeConfig): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias_attr=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_silu = F.silu(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_silu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class ErnieCodeLayerFF(nn.Layer): + def __init__(self, config: ErnieCodeConfig): + super().__init__() + if config.feed_forward_proj == "relu": + self.DenseReluDense = ErnieCodeDenseReluDense(config) + elif config.feed_forward_proj == "gated-gelu": + self.DenseReluDense = ErnieCodeDenseGatedGeluDense(config) + elif config.feed_forward_proj == "gated-silu": + self.DenseReluDense = ErnieCodeDenseGatedSiluDense(config) + else: + raise ValueError(f"{config.feed_forward_proj} is not supported. 
Choose between `relu` and `gated-gelu`") + + self.layer_norm = ErnieCodeLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class ErnieCodeAttention(nn.Layer): + def __init__(self, config: ErnieCodeConfig, has_relative_attention_bias: bool = False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + self.enable_recompute = False + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias_attr=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
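+        For example, with the defaults used here (bidirectional=True, num_buckets=32, max_distance=128),
+        each direction gets 16 buckets: offsets 0-7 map to their own exact bucket, larger offsets share 8
+        logarithmically sized buckets, and every offset of 128 or more falls into the last bucket.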
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int64 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int64 values in the range [0, num_buckets) + + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).astype(paddle.int64) * num_buckets + relative_position = paddle.abs(relative_position) + else: + relative_position = -paddle.minimum(relative_position, paddle.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_postion_if_large = max_exact + ( + paddle.log(relative_position.astype(paddle.get_default_dtype()) / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).astype(paddle.int64) + relative_postion_if_large = paddle.minimum( + relative_postion_if_large, + paddle.full_like(relative_postion_if_large, num_buckets - 1), + ) + + relative_buckets += paddle.where(is_small, relative_position, relative_postion_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length): + """Compute binned relative position bias""" + context_position = paddle.arange(query_length).unsqueeze(-1) + memory_position = paddle.arange(key_length).unsqueeze(0) + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.transpose(perm=[2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + cache=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if cache is not None: + assert len(cache) == 2, f"cache should have 2 past states: keys and values. 
Got { len(cache)} past states" + real_seq_length += cache[0].shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.reshape(shape=[batch_size, -1, self.n_heads, self.key_value_proj_dim]).transpose( + perm=[0, 2, 1, 3] + ) + + def unshape(states): + """reshape""" + return states.transpose(perm=[0, 2, 1, 3]).reshape(shape=[batch_size, -1, self.inner_dim]) + + def project(hidden_states, proj_layer, key_value_states, cache): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif cache is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if cache is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = paddle.concat([cache, hidden_states], axis=2) + else: + # cross-attn + hidden_states = cache + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, + self.k, + key_value_states, + cache[0] if cache is not None else None, + ) + value_states = project( + hidden_states, + self.v, + key_value_states, + cache[1] if cache is not None else None, + ) + + # compute scores + scores = paddle.matmul(query_states, key_states, transpose_y=True) + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = paddle.zeros( + shape=(1, self.n_heads, real_seq_length, key_length), + dtype=scores.dtype, + ) + if self.training and self.enable_recompute: + position_bias.stop_gradient = False + else: + position_bias = self.compute_bias(real_seq_length, key_length) + + # if key and values are already calculated + # we want only the last query position bias + if cache is not None: + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + scores += position_bias + attn_weights = F.softmax(scores.astype(paddle.float32), axis=-1).astype( + scores.dtype + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = F.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + attn_output = unshape(paddle.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + + attn_output = self.o(attn_output) + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class ErnieCodeLayerSelfAttention(nn.Layer): + def __init__(self, config: ErnieCodeConfig, has_relative_attention_bias: bool = False): + super().__init__() + self.SelfAttention = ErnieCodeAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = ErnieCodeLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + cache=None, + use_cache=False, + output_attentions=False, + ): + 
normed_hidden_states = self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + cache=cache, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class ErnieCodeLayerCrossAttention(nn.Layer): + def __init__(self, config: ErnieCodeConfig): + super().__init__() + self.EncDecAttention = ErnieCodeAttention(config, has_relative_attention_bias=False) + self.layer_norm = ErnieCodeLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + cache=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + cache=cache, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class ErnieCodeBlock(nn.Layer): + def __init__(self, config: ErnieCodeConfig, has_relative_attention_bias: bool = False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.LayerList() + self.layer.append(ErnieCodeLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(ErnieCodeLayerCrossAttention(config)) + + self.layer.append(ErnieCodeLayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + cache=None, + use_cache=False, + output_attentions=False, + ): + + if cache is not None: + assert self.is_decoder, "Only decoder can use `caches`" + expected_num_caches = 2 if encoder_hidden_states is None else 4 + + if len(cache) != expected_num_caches: + raise ValueError( + f"There should be {expected_num_caches} past states. " + f"{'2 (past / key) for cross attention. 
' if expected_num_caches == 4 else ''}" + f"Got {len(cache)} past key / value states" + ) + + self_attn_cache = cache[:2] + cross_attn_cache = cache[2:] + else: + self_attn_cache, cross_attn_cache = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + cache=self_attn_cache, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == paddle.float16 and paddle.isinf(hidden_states).any(): + # TODO finfo + clamp_value = finfo(hidden_states.dtype).max - 1000 + hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + cache=cross_attn_cache, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == paddle.float16 and paddle.isinf(hidden_states).any(): + clamp_value = finfo(hidden_states.dtype).max - 1000 + hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == paddle.float16 and paddle.isinf(hidden_states).any(): + clamp_value = finfo(hidden_states.dtype).max - 1000 + hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class ErnieCodePretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ErnieCode models. It provides ErnieCode related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. 
+ """ + + base_model_prefix = "ErnieCode" + config_class = ErnieCodeConfig + + pretrained_init_configuration = ERNIECODE_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIECODE_PRETRAINED_RESOURCE_FILES_MAP + + # support AutoConverter after fix load_torch function + @classmethod + def _get_name_mappings(cls, config: ErnieCodeConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + "shared.weight", + "encoder.embed_tokens.weight", + "encoder.final_layer_norm.weight", + "decoder.embed_tokens.weight", + "decoder.final_layer_norm.weight", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + ] + for layer_index in range(config.num_hidden_layers): + for att_head in ["q", "k", "v", "o"]: + model_mappings.extend( + [ + [ + f"encoder.block.{layer_index}.layer.0.SelfAttention.{att_head}.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.0.SelfAttention.{att_head}.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.1.EncDecAttention.{att_head}.weight", + None, + "transpose", + ], + ] + ) + + layer_mappings = [ + [ + f"encoder.block.{layer_index}.layer.1.DenseReluDense.wo.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.2.DenseReluDense.wo.weight", + None, + "transpose", + ], + f"encoder.block.{layer_index}.layer.0.layer_norm.weight", + f"encoder.block.{layer_index}.layer.1.layer_norm.weight", + f"decoder.block.{layer_index}.layer.0.layer_norm.weight", + f"decoder.block.{layer_index}.layer.1.layer_norm.weight", + f"decoder.block.{layer_index}.layer.2.layer_norm.weight", + ] + + if config.feed_forward_proj == "relu": + layer_mappings.extend( + [ + [ + f"encoder.block.{layer_index}.layer.1.DenseReluDense.wi.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.2.DenseReluDense.wi.weight", + None, + "transpose", + ], + ] + ) + elif config.feed_forward_proj == "gated-gelu": + for i in range(2): + layer_mappings.extend( + [ + [ + f"encoder.block.{layer_index}.layer.1.DenseReluDense.wi_{i}.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.2.DenseReluDense.wi_{i}.weight", + None, + "transpose", + ], + ] + ) + + model_mappings.extend(layer_mappings) + + init_name_mappings(model_mappings) + + if cls.__name__ != "ErnieCodeModel": + for mapping in model_mappings: + mapping[1] = "ErnieCode." 
+ mapping[1] + + if config.architectures is not None and "ErnieCodeForConditionalGeneration" in config.architectures: + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping) for mapping in model_mappings] + return mappings + + @property + def dummy_inputs(self): + DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] + input_ids = paddle.assign(np.asarray(DUMMY_INPUTS, dtype="int64")) + input_mask = paddle.assign(np.asarray(DUMMY_MASK, dtype="int64")) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, layer): + """Initialize the weights""" + # Used for testing weights initialization + factor = self.config.initializer_factor + d_model = self.config.d_model + d_ff = self.config.d_ff + n_heads = self.config.num_heads + key_value_proj_dim = self.config.d_kv + + if isinstance(layer, ErnieCodeLayerNorm): + layer.weight.set_value(paddle.ones_like(layer.weight) * factor) + elif isinstance(layer, ErnieCodeModel): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + layer.shared.weight.set_value(paddle.normal(mean=0.0, std=factor * 1.0, shape=layer.shared.weight.shape)) + elif isinstance(layer, (ErnieCodeForConditionalGeneration,)): + layer.ErnieCode.shared.weight.set_value( + paddle.normal(mean=0.0, std=factor * 1.0, shape=layer.ErnieCode.shared.weight.shape) + ) + + elif isinstance(layer, ErnieCodeDenseReluDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + layer.wi.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.wi.weight.shape) + ) + + if hasattr(layer.wi, "bias") and layer.wi.bias is not None: + layer.wi.bias.set_value(paddle.zeros_like(layer.wi.bias)) + + layer.wo.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_ff) ** -0.5), shape=layer.wo.weight.shape) + ) + + if hasattr(layer.wo, "bias") and layer.wo.bias is not None: + layer.wo.bias.set_value(paddle.zeros_like(layer.wo.bias)) + + elif isinstance(layer, ErnieCodeDenseGatedGeluDense): + layer.wi_0.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.wi_0.weight.shape) + ) + if hasattr(layer.wi_0, "bias") and layer.wi_0.bias is not None: + layer.wi_0.bias.set_value(paddle.zeros_like(layer.wi_0.bias)) + + layer.wi_1.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.wi_1.weight.shape) + ) + if hasattr(layer.wi_1, "bias") and layer.wi_1.bias is not None: + layer.wi_1.bias.set_value(paddle.zeros_like(layer.wi_1.bias)) + + layer.wo.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_ff) ** -0.5), shape=layer.wo.weight.shape) + ) + + if hasattr(layer.wo, "bias") and layer.wo.bias is not None: + layer.wo.bias.set_value(paddle.zeros_like(layer.wo.bias)) + elif isinstance(layer, ErnieCodeAttention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + + 
layer.q.weight.set_value( + paddle.normal( + mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5), shape=layer.q.weight.shape + ) + ) + + layer.k.weight.set_value( + paddle.normal(mean=0.0, std=factor * (d_model**-0.5), shape=layer.k.weight.shape) + ) + + layer.v.weight.set_value( + paddle.normal(mean=0.0, std=factor * (d_model**-0.5), shape=layer.v.weight.shape) + ) + + layer.o.weight.set_value( + paddle.normal( + mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5), shape=layer.o.weight.shape + ) + ) + + if layer.has_relative_attention_bias: + layer.relative_attention_bias.weight.set_value( + paddle.normal( + mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.relative_attention_bias.weight.shape + ) + ) + + def _shift_right(self, input_ids): + bos_token_id = self.config.bos_token_id + pad_token_id = self.config.pad_token_id + + assert ( + bos_token_id is not None + ), "bos_token_id has to be defined. In ErnieCode it is usually set to the pad_token_id. See ErnieCode docs for more information" + + # shift inputs to the right + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = bos_token_id + + assert pad_token_id is not None, "pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = paddle.where( + shifted_input_ids == -100, + paddle.assign(np.asarray(pad_token_id, dtype=data_type_converter(shifted_input_ids)).reshape([1])), + shifted_input_ids, + ) + + assert paddle.all(shifted_input_ids >= 0), "Verify that `shifted_input_ids` has only positive values" + + return shifted_input_ids + + +class ErnieCodeStack(nn.Layer): + def __init__(self, config: ErnieCodeConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__() + self.is_decoder = config.is_decoder + self.embed_tokens = embed_tokens + self.block = nn.LayerList( + [ErnieCodeBlock(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = ErnieCodeLayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + self.enable_recompute = config.enable_recompute + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + @property + def dtype(self): + return self.embed_tokens.weight.dtype + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + use_cache, + output_attentions, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return tuple(module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + None, + ) + + return layer_outputs + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + cache=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + **model_kwargs + ): + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise 
ValueError( + f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.shape + # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length + + if use_cache is True: + assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" + + if attention_mask is None: + attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) + if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) + + # initialize caches with `None` if past does not exist + if cache is None: + cache = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate(zip(self.block, cache)): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.enable_recompute and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with `config.enable_recompute=True`. Setting " + "`use_cache=False`..." 
+ ) + use_cache = False + + layer_outputs = self.recompute_training( + layer_module, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + use_cache, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + cache=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if not use_cache: + layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + (present_key_value_state,) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def get_extended_attention_mask(self, attention_mask, input_shape): + if attention_mask.ndim == 3: + extended_attention_mask = attention_mask.unsqueeze(1) + elif attention_mask.ndim == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder: + batch_size, seq_length = input_shape + seq_ids = paddle.arange(seq_length) + causal_mask = paddle.tile( + seq_ids.unsqueeze(axis=[0, 1]), [batch_size, seq_length, 1] + ) <= seq_ids.unsqueeze(axis=[0, 2]) + causal_mask = causal_mask.astype(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = paddle.concat( + [ + paddle.ones( + [batch_size, seq_length, prefix_seq_len], + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask.unsqueeze(1) * attention_mask.unsqueeze([1, 2]) + else: + extended_attention_mask = 
attention_mask.unsqueeze([1, 2]) + elif attention_mask.ndim == 4: + if self.is_decoder: + batch_size, seq_length = input_shape + seq_ids = paddle.arange(seq_length) + causal_mask = paddle.tile( + seq_ids.unsqueeze(axis=[0, 1]), [batch_size, seq_length, 1] + ) <= seq_ids.unsqueeze(axis=[0, 2]) + # in case cache are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type + causal_mask = causal_mask.astype(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[-1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = paddle.concat( + [ + paddle.ones( + [batch_size, seq_length, prefix_seq_len], + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask.unsqueeze(1) * attention_mask + else: + extended_attention_mask = attention_mask + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + extended_attention_mask = extended_attention_mask.astype(self.dtype) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask): + if encoder_attention_mask.ndim == 4: + encoder_extended_attention_mask = encoder_attention_mask + elif encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask.unsqueeze(1) + elif encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask.unsqueeze([1, 2]) + encoder_extended_attention_mask = encoder_extended_attention_mask.astype(self.dtype) # fp16 compatibility + + if self.dtype == paddle.float16: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + elif self.dtype == paddle.float32: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + else: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + # raise ValueError( + # f"{self.dtype} not recognized. `dtype` should be set to either `paddle.float32` or `paddle.float16`" + # ) + + return encoder_extended_attention_mask + + +@register_base_model +class ErnieCodeModel(ErnieCodePretrainedModel): + """ + The bare ErnieCode Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (class:`ErnieCodeConfig`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+ """ + + def __init__(self, config: ErnieCodeConfig): + super().__init__(config) + self.bos_token_id = config.bos_token_id + self.pad_token_id = config.pad_token_id + self.initializer_factor = config.initializer_factor + self.d_model = config.d_model + self.num_heads = config.num_heads + self.d_kv = config.d_kv + self.d_ff = config.d_ff + self.tie_word_embeddings = config.tie_word_embeddings + self.shared = nn.Embedding(config.vocab_size, config.d_model) + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = ErnieCodeStack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = ErnieCodeStack(decoder_config, self.shared) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The ErnieCodeModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on + to some unwanted positions, usually the paddings or the subsequent positions. + Its data type can be int, float. + When the data type is int, the `masked` tokens have `0` values and the others + have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the + others have `1.0` values. + It is a tensor with shape broadcasted to [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + encoder_output (tuple, optional): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. + `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. 
+ For all element in the tuple, its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is [batch_size, num_attention_heads, sequence_length, sequence_length]. + cache (Tuple[Tuple[Tensor]], optional): + Contains pre-computed hidden-states (key and values in the attention blocks) + as computed by the model. Can be used to speed up sequential decoding. + The `input_ids` which have their past given to this model should not be + passed as input ids as they have already been computed. + Defaults to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (bool, optional): + Whether or not to use cache. If set to `True`, `past_buckets_states` states are returned + and can be used to speed up decoding. + Defaults to `False`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the output of all hidden layers. + Defaults to `False`. + return_dict (bool, optional): + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + + tuple: Returns tuple (`last_hidden_state`, `cache`, `decoder_hidden_states`, `decoder_attentions`, + `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) + + With the fields: + + - `last_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the decoder of the model. + It's data type should be float32 and + its shape is [batch_size, sequence_length, hidden_size]. + + - `cache` (List[tuple(Tensor, Tensor)], optional): + returned when `use_cache=True` is passed. 
+ List of `tuple(Tensor, Tensor)` of length `config["num_layers"]`, + with the first element being the previous `buckets` of shape + `[batch_size, num_heads, num_hashes, sequence_length]` and the second + being the previous `hidden_states` of shape `[batch_size, sequence_length, hidden_size]`. + + - `decoder_hidden_states` (tuple(Tensor), optional) + returned when ``output_hidden_states=True`` is passed. + Tuple of `Tensor` (one for the output of the embeddings + one for the output of decoder each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + - `decoder_attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data + type of float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + - `cross_attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data + type of float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + - `encoder_last_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the encoder of the model. + It's data type should be float32 and + its shape is [batch_size, sequence_length, hidden_size]. + + - `encoder_hidden_states` (tuple(Tensor), optional): + returned when `output_hidden_states=True` is passed. + tuple of `Tensor` (one for the output of the embeddings + one for the + output of encoder each layer). Each Tensor has a data type of float32 + and its shape is [batch_size, sequence_length, hidden_size]. + + - `encoder_attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data + type of float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieCodeModel, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained('ErnieCode-base') + model = ErnieCodeModel.from_pretrained('ErnieCode-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + input_ids = paddle.to_tensor([inputs["input_ids"]], dtype="int64") + decoder_inputs = tokenizer("It means you can") + decoder_input_ids = paddle.to_tensor([decoder_inputs["input_ids"]], dtype="int64") + + outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + last_hidden_state = outputs[0] + print(last_hidden_state.shape) + # [1, 5, 768] + + """ + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Encode if needed (training, first prediction pass) + if encoder_output is None: + encoder_output = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_output, BaseModelOutput): + encoder_output = convert_encoder_output(encoder_output) + hidden_states = encoder_output[0] + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + cache=cache, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + + +class ErnieCodeForConditionalGeneration(ErnieCodePretrainedModel): + """ + The ErnieCode Model transformer with a language modeling head on top. + + Args: + config (:class:`ErnieCodeConfig`): + An instance of ErnieCodeConfig used to construct ErnieCodeForConditionalGeneration. 
+ + """ + + def __init__(self, config: ErnieCodeConfig): + super().__init__(config) + self.ErnieCode = ErnieCodeModel(config) + if not self.ErnieCode.config["tie_word_embeddings"]: + self.lm_head = nn.Linear( + self.ErnieCode.config["d_model"], self.ErnieCode.config["vocab_size"], bias_attr=False + ) + + def get_input_embeddings(self): + return self.ErnieCode.shared + + def set_input_embeddings(self, new_embeddings): + self.ErnieCode.shared = new_embeddings + self.ErnieCode.encoder.set_input_embeddings(new_embeddings) + self.ErnieCode.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + if self.ErnieCode.config["tie_word_embeddings"]: + return self.ErnieCode.shared + else: + return self.lm_head + + def get_encoder(self): + return self.ErnieCode.encoder + + def get_decoder(self): + return self.ErnieCode.decoder + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + cache=None, + labels=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + + Args: + input_ids (Tensor, optional): + See :class:`ErnieCodeModel`. + attention_mask (Tensor, optional): + See :class:`ErnieCodeModel`. + decoder_input_ids (Tensor, optional): + See :class:`ErnieCodeModel`. + decoder_attention_mask (Tensor, optional): + See :class:`ErnieCodeModel`. + encoder_output (tuple(Tensor), optional): + See :class:`ErnieCodeModel`. + cache (List[tuple(Tensor, Tensor)], optional): + See :class:`ErnieCodeModel`. + labels (Tensor, optional): + Labels for language modeling. Note that the labels **are shifted** + inside the model, i.e. you can set `labels = input_ids` Indices are + selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are + ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]`. + Shape is [batch_size, sequence_length] and dtype is int64. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor , optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `past_key_values` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). This is useful + if you want more control over how to convert `decoder_input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. Default to None. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (bool, optional): + See :class:`ErnieCodeModel`. + output_attentions (bool, optional): + See :class:`ErnieCodeModel`. + output_hidden_states (bool, optional): + See :class:`ErnieCodeModel`. + return_dict (bool, optional): + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. 
+ + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. + + tuple: Returns tuple (`loss`, `logits`, `cache`, `decoder_hidden_states`, `decoder_attentions`, + `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) + + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Language modeling loss. It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Prediction scores of the language modeling head + (scores for each vocabulary token before SoftMax). + It's data type should be float32 and its shape is + [batch_size, sequence_length, vocab_size]. + + - `cache` (List[tuple(Tensor, Tensor)], optional): + See :class:`ErnieCodeModel`. + + - `decoder_hidden_states` (tuple(Tensor), optional) + See :class:`ErnieCodeModel`. + + - `decoder_attentions` (tuple(Tensor), optional): + See :class:`ErnieCodeModel`. + + - `cross_attentions` (tuple(Tensor), optional): + See :class:`ErnieCodeModel`. + + - `encoder_last_hidden_state` (Tensor): + See :class:`ErnieCodeModel`. + + - `encoder_hidden_states` (tuple(Tensor), optional): + See :class:`ErnieCodeModel`. + + - `encoder_attentions` (tuple(Tensor), optional): + See :class:`ErnieCodeModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieCodeForConditionalGeneration, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained('ErnieCode-base') + model = ErnieCodeForConditionalGeneration.from_pretrained('ErnieCode-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=inputs["input_ids"]) + + loss = output[0] + logits = output[1] + + """ + + input_type = type(decoder_input_ids) if decoder_input_ids is not None else type(decoder_inputs_embeds) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Encode if needed (training, first prediction pass) + if encoder_output is None: + # Convert encoder inputs in embeddings if needed + encoder_output = self.ErnieCode.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + if isinstance(encoder_output, input_type): + encoder_output = (encoder_output,) + if return_dict and not isinstance(encoder_output, BaseModelOutput): + encoder_output = convert_encoder_output(encoder_output) + + hidden_states = encoder_output[0] + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # If decoding with past key value states, only the last tokens + # should be given as an input + if cache is not None: + assert labels is None, "Decoder should not use cached key value states when training." 
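+            # Earlier positions are already held in the cached key/value states, so only the most
+            # recent decoder token (and the matching last-position slice of the attention mask below)
+            # needs to be fed through the decoder on this step.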
+ if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + encoder_attention_mask = attention_mask + if attention_mask is not None: + if attention_mask.ndim == 4: + encoder_attention_mask = attention_mask[:, :, -1:, :] + elif attention_mask.ndim == 3: + encoder_attention_mask = attention_mask[:, -1:, :].unsqueeze([1]) + elif attention_mask.ndim == 2: + encoder_attention_mask = attention_mask.unsqueeze([1, 2]) + else: + raise ValueError("Invalid attention mask shape. ") + + # Decode + decoder_outputs = self.ErnieCode.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + cache=cache, + encoder_hidden_states=hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + if self.ErnieCode.config["tie_word_embeddings"]: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.ErnieCode.config["d_model"] ** -0.5) + lm_logits = paddle.matmul(sequence_output, self.ErnieCode.shared.weight, transpose_y=True) + else: + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]).astype("float32"), labels.flatten()) + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_output + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + batch_size = encoder_output.shape[0] + return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id + + def prepare_inputs_for_generation( + self, input_ids, cache=None, attention_mask=None, use_cache=None, encoder_output=None, **kwargs + ): + + # cut decoder_input_ids if past is used + if cache is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "cache": cache, + "encoder_output": encoder_output, + "attention_mask": attention_mask, + "use_cache": use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: paddle.Tensor): + return self._shift_right(labels) + + @staticmethod + def expand_inputs_for_generation(input_ids, expand_size, attention_mask=None, **model_kwargs): + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + + input_ids = paddle.index_select(input_ids, index) + + if attention_mask is not None: + model_kwargs["attention_mask"] = paddle.index_select(attention_mask, index) + + 
if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.index_select(token_type_ids, index) + + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.index_select(position_ids, index) + + if "seq_len" in model_kwargs: + seq_len = model_kwargs["seq_len"] + model_kwargs["seq_len"] = paddle.index_select(seq_len, index) + + if "encoder_output" in model_kwargs: + encoder_output = model_kwargs["encoder_output"] + if isinstance(encoder_output, tuple): + model_kwargs["encoder_output"] = (paddle.index_select(encoder_output[0], index),) + encoder_output[1:] + else: + model_kwargs["encoder_output"] = paddle.index_select(encoder_output, index) + return input_ids, model_kwargs + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + return attention_mask + else: + attention_mask = paddle.ones_like(input_ids) + return attention_mask + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +class ErnieCodeEncoderModel(ErnieCodePretrainedModel): + base_model_class = None + + def __init__(self, config: ErnieCodeConfig): + super().__init__(config) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.shared = nn.Embedding(encoder_config.vocab_size, encoder_config.d_model) + self.encoder = ErnieCodeStack(encoder_config, embed_tokens=self.shared) + + @property + def ErnieCode(self): + return self + + def get_input_embeddings(self) -> nn.Embedding: + return self.shared + + def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None: + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + + def get_encoder(self) -> ErnieCodeStack: + return self.encoder + + def forward( + self, + input_ids: Tensor = None, + attention_mask: Optional[Tensor] = None, + encoder_hidden_states: Optional[Tuple[Tensor]] = None, + encoder_attention_mask: Optional[Tensor] = None, + cache=None, + inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + cache=cache, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return encoder_outputs + + +ErnieCodeEncoderModel.base_model_class = ErnieCodeEncoderModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/tokenizer.py new file mode 
100644 index 000000000..69793513f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_code/tokenizer.py @@ -0,0 +1,200 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 Baidu ErnieCode Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unicodedata +from collections import UserDict +from typing import List, Union + +import numpy as np +import paddle + +from ..t5.tokenizer import T5Tokenizer + +__all__ = [ + "ErnieCodeTokenizer", +] + +formate_dict = {" ": "<|space|>"} + + +def to_py_obj(obj): + """ + Convert a Paddle tensor, Numpy array or python list to a python list. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return [to_py_obj(o) for o in obj] + elif isinstance(obj, paddle.Tensor): + return obj.numpy().tolist() + elif isinstance(obj, (np.ndarray, np.number)): # tolist also works on 0d np arrays + return obj.tolist() + else: + return obj + + +def clean_up_codem_spaces(s: str): + # post process + # =========================== + new_tokens = ["", "", "", "\n", "\t", "<|space|>" * 4, "<|space|>" * 2, "<|space|>"] + for tok in new_tokens: + s = s.replace(f"{tok} ", tok) + + cleaned_tokens = ["", "", ""] + for tok in cleaned_tokens: + s = s.replace(tok, "") + s = s.replace("<|space|>", " ") + # =========================== + return s + + +class ErnieCodeTokenizer(T5Tokenizer): + """ + Constructs a ErnieCode tokenizer based on SentencePiece . + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + sentencepiece_model_file (str): + The vocabulary file (ends with '.spm') required to instantiate + a `SentencePiece `__ tokenizer. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. Defaults to `False`. + remove_space (bool): + Whether or note to remove space when tokenizing. Defaults to `True`. + keep_accents (bool): + Whether or note to keep accents when tokenizing. Defaults to `False`. + eos_token (str): + A special token representing the *eos (end-of-sentence)* token. + Defaults to "". + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "". 
+ + """ + + resource_files_names = {"sentencepiece_model_file": "spiece.model"} + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "ernie-code-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-code/ernie-code-base/spiece.model", + "ernie-code-base-L512": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-code/ernie-code-base-L512/spiece.model", + }, + } + + pretrained_init_configuration = { + "ernie-code-base": {"do_lower_case": False}, + "ernie-code-base-L512": {"do_lower_case": False}, + } + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=False, + remove_space=True, + keep_accents=True, + eos_token="", + unk_token="", + pad_token="", + extra_ids=0, + additional_special_tokens=[], + sp_model_kwargs=None, + **kwargs + ): + if additional_special_tokens is None or 0 == len(additional_special_tokens): + additional_special_tokens = [ + "\n", + "\t", + "<|space|><|space|><|space|><|space|>", + "<|space|><|space|>", + "<|space|>", + ] + + super(ErnieCodeTokenizer, self).__init__( + sentencepiece_model_file, + do_lower_case, + remove_space, + keep_accents, + eos_token, + unk_token, + pad_token, + extra_ids, + additional_special_tokens, + sp_model_kwargs, + **kwargs, + ) + + def preprocess_text(self, inputs: str): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + tokens = list(outputs) + i = 0 + while i < len(tokens): + if "\n" == outputs[i]: + + while i + 1 < len(tokens) and " " == tokens[i + 1]: + tokens[i + 1] = formate_dict.get(" ") + i += 1 + i += 1 + formatted_line = "".join(tokens) + return formatted_line + + def decode( + self, + token_ids: Union[int, List[int], "np.ndarray", "paddle.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + Args: + token_ids (`Union[int, List[int], np.ndarray, paddle.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + Returns: + `str`: The decoded sentence. 
+ """ + # Convert inputs to python lists + token_ids = to_py_obj(token_ids) + + decoded_preds = self._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + return clean_up_codem_spaces(decoded_preds) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/configuration.py new file mode 100644 index 000000000..d8631c228 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/configuration.py @@ -0,0 +1,150 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Ernie-CTM model configuration """ +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "ERNIE_CTM_CONFIG", + "ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION", + "ERNIE_CTM_PRETRAINED_RESOURCE_FILES_MAP", + "ErnieCtmConfig", +] + + +ERNIE_CTM_CONFIG = { + "vocab_size": 23000, + "embedding_size": 128, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_dropout_prob": 0.1, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "use_content_summary": True, + "content_summary_index": 1, + "cls_num": 2, + "num_prompt_placeholders": 5, + "prompt_vocab_ids": None, +} + + +ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION = { + "ernie-ctm": ERNIE_CTM_CONFIG, + "wordtag": ERNIE_CTM_CONFIG, + "nptag": ERNIE_CTM_CONFIG, +} + +ERNIE_CTM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ernie-ctm": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/ernie_ctm_v3.pdparams", + "wordtag": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/wordtag_v3.pdparams", + "nptag": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/nptag_v3.pdparams", + } +} + + +class ErnieCtmConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieCtmModel`]. It is used to instantiate + a Ernie-CTM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Ernie-CTM-base architecture. + + Configure objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documents from [`PretrainedConfig`] for more informations. + + + Args: + vocab_size (`int`, *optional*, defaults to 23000): + Vocabulary size of the Ernie-CTM model. Defines the number of different tokens that can be represented by + the `input_ids` passed when calling [`ErnieCtmModel`]. + embedding_size (`int` *optional*, defaults to 128): + Dimensionality of vocabulary embeddings. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + The dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large. + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when call [`ErnieCtmModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ use_content_summary (`bool`, *optional*, defaults to True): + Whether to use content summary token and content representation when inputs passed into [`ErnieCtmModel`]. + content_summary_index (`int`, *optional*, defaults to 1): + If `use_content_summary` is set, content summary token position is defined by this argument. + cls_num (`int`, *optional*, defaults to 2): + Number of [CLS] token in model. + num_prompt_placeholders (`int`, *optional*, defaults to 5): + Number of maximum length of prompt answer. + prompt_vocab_ids (`dict`, *optional*, defaults to None): + Prompt vocabulary of decode procedure. + """ + model_type = "ernie-ctm" + pretrained_init_configuration = ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION + attribute_map = {"num_tag": "num_labels", "dropout": "classifier_dropout", "num_classes": "num_labels"} + + def __init__( + self, + vocab_size: int = 23000, + embedding_size: int = 128, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + layer_norm_eps: float = 1e-12, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + use_content_summary: bool = True, + content_summary_index: int = 1, + cls_num: int = 2, + pad_token_id: int = 0, + num_prompt_placeholders: int = 5, + prompt_vocab_ids: set = None, + **kwargs + ): + super(ErnieCtmConfig, self).__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.use_content_summary = use_content_summary + self.content_summary_index = content_summary_index + self.cls_num = cls_num + self.num_prompt_placeholders = num_prompt_placeholders + self.prompt_vocab_ids = prompt_vocab_ids diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/modeling.py new file mode 100644 index 000000000..c3449ddc1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/modeling.py @@ -0,0 +1,830 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
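Before the modeling code below, a brief sketch (illustrative only, not part of the diff) of how the ErnieCtmConfig defined above is typically constructed; the import path assumes the class is exported from paddlenlp.transformers, as the modeling file's own imports suggest.

# Sketch only: build a config from the defaults in ERNIE_CTM_CONFIG and
# override a couple of fields; unset fields keep the documented defaults.
from paddlenlp.transformers import ErnieCtmConfig

config = ErnieCtmConfig(num_hidden_layers=6, hidden_dropout_prob=0.2)
print(config.vocab_size)  # 23000, the default declared above
print(config.cls_num)     # 2 leading [CLS] summary tokens
# attribute_map maps the legacy name onto the canonical one:
print(config.num_tag == config.num_labels)  # True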
+from __future__ import annotations + +from dataclasses import dataclass +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Layer + +from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss +from paddlenlp.transformers.model_outputs import ModelOutput, TokenClassifierOutput +from paddlenlp.utils.tools import compare_version + +from .configuration import ( + ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION, + ERNIE_CTM_PRETRAINED_RESOURCE_FILES_MAP, + ErnieCtmConfig, +) + +if compare_version(paddle.version.full_version, "2.2.0") >= 0: + # paddle.text.ViterbiDecoder is supported by paddle after version 2.2.0 + from paddle.text import ViterbiDecoder +else: + from paddlenlp.layers.crf import ViterbiDecoder + +from .. import PretrainedModel, register_base_model + +__all__ = [ + "ErnieCtmPretrainedModel", + "ErnieCtmModel", + "ErnieCtmWordtagModel", + "ErnieCtmNptagModel", + "ErnieCtmForTokenClassification", +] + + +@dataclass +class ErnieCtmModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) further processed by a + Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence + prediction (classification) objective during pretraining. + content_output + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + content_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class ErnieCtmEmbeddings(Layer): + """ + Construct the embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config: ErnieCtmConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + self.layer_norm = nn.LayerNorm(config.embedding_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.cls_num = config.cls_num + + def forward(self, input_ids, token_type_ids=None, position_ids=None, inputs_embeds=None): + if position_ids is None: + + content_len = input_ids.shape[1] - self.cls_num + position_ids = paddle.concat( + [ + paddle.zeros(shape=[self.cls_num], dtype="int64"), + paddle.linspace(1, content_len, content_len, dtype="int64"), + ] + ) + position_ids.stop_gradient = True + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids) + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class ErnieCtmPooler(Layer): + """ """ + + def __init__(self, hidden_size): + super().__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErnieCtmPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ErnieCtm models. It provides ErnieCtm related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading + and loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = "model_config.json" + config_class = ErnieCtmConfig + resource_files_names = {"model_state": "model_state.pdparams"} + + base_model_prefix = "ernie_ctm" + + pretrained_init_configuration = ERNIE_CTM_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIE_CTM_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + # Initialize weights + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.ernie_ctm.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class ErnieCtmModel(ErnieCtmPretrainedModel): + """ + The bare ErnieCtm Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. 
+ + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `ErnieCtmModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` + passed when calling `ErnieCtmModel`. + embedding_size (int, optional): + Dimensionality of the embedding layer. + Defaults to `128`. + hidden_size (int, optional): + Dimensionality of the encoder layers and the pooler layer. + Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported + length of an input sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. + Defaults to `16`. + initializer_range (float, optional): + The standard deviation of the normal initializer for initializing all weight matrices. + Defaults to `0.02`. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + use_content_summary (`bool`, optional): + Whether or not to add content summary tokens. + Defaults to `True`. + content_summary_index (int, optional): + The number of the content summary tokens. Only valid when use_content_summary is True. + Defaults to `1`. + cls_num (int, optional): + The number of the CLS tokens. Only valid when use_content_summary is True. + Defaults to `2`. 
+ """ + + def __init__(self, config: ErnieCtmConfig): + super(ErnieCtmModel, self).__init__(config) + + self.config = config + self.pad_token_id = config.pad_token_id + self.content_summary_index = config.content_summary_index + self.initializer_range = config.initializer_range + self.embeddings = ErnieCtmEmbeddings(config) + self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) + + def construct_encoder_layer(): + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation="gelu", + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + encoder_layer.activation = nn.GELU(approximate=True) + return encoder_layer + + self.encoder = nn.TransformerEncoder(construct_encoder_layer(), config.num_hidden_layers) + self.pooler = ErnieCtmPooler(config.hidden_size) + + self.use_content_summary = config.use_content_summary + self.content_summary_index = config.content_summary_index + if config.use_content_summary is True: + self.feature_fuse = nn.Linear(config.hidden_size * 2, config.intermediate_size) + self.feature_output = nn.Linear(config.intermediate_size, config.hidden_size) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + content_clone=False, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + r""" + The ErnieCtmModel forward method, overrides the __call__() special method. + + Args: + input_ids (`Tensor`): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (`Tensor`, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to + `[batch_size, num_attention_heads, sequence_length, sequence_length]`. 
+ For example, its shape can be + [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in ERNIE, so the whole word will have the same value. + For example, "使用" as a word, "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + content_clone (bool, optional): + Whether the `content_output` is clone from `sequence_output`. If set to `True`, the content_output is + clone from sequence_output, which may cause the classification task impact on the sequence labeling + task. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. (currently not supported) + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + Returns: + tuple: Returns tuple (``sequence_output``, ``pooled_output``, ``content_output``). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of output at the last layer of the model. Its data type should be float32 and + has a shape of [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + - `content_output` (Tensor): + The output of content summary token (`[CLS1]` in sequence). Its data type should be float32 and + has a shape of [batch_size, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieModel, ErnieTokenizer + + tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0') + model = ErnieModel.from_pretrained('ernie-1.0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + sequence_output, pooled_output, content_output = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + # check the variable of `input_ids` and `inputs_embeds` + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + embedding_output = self.embedding_hidden_mapping_in(embedding_output) + + hidden_states = embedding_output + + encoder_output = self.encoder( + hidden_states, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # when `output_attentions` and `output_hidden_states` are False, it wll return tensor object. + encoder_output = (encoder_output,) if paddle.is_tensor(encoder_output) else encoder_output + + sequence_output = encoder_output[0] + + pooled_output = self.pooler(sequence_output) + content_output = sequence_output[:, self.content_summary_index] if self.use_content_summary else None + + if self.use_content_summary is True: + if content_clone is True: + sequence_output = paddle.concat( + ( + sequence_output, + sequence_output[:, self.content_summary_index] + .clone() + .unsqueeze([1]) + .expand_as(sequence_output), + ), + 2, + ) + else: + content_output = paddle.expand( + content_output.unsqueeze([1]), + shape=(sequence_output.shape[0], sequence_output.shape[1], sequence_output.shape[2]), + ) + + sequence_output = paddle.concat((sequence_output, content_output), 2) + + sequence_output = self.feature_fuse(sequence_output) + + sequence_output = self.feature_output(sequence_output) + + if not return_dict: + return ( + sequence_output, + pooled_output, + content_output, + ) + encoder_output[1:] + + return ErnieCtmModelOutput( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + content_output=content_output, + hidden_states=encoder_output.hidden_states, + attentions=encoder_output.attentions, + ) + + +class ErnieCtmWordtagModel(ErnieCtmPretrainedModel): + """ + ErnieCtmWordtag Model with a token classification head on top (a crf layer on top of the hidden-states output) . + e.g. for Named-Entity-Recognition (NER) tasks. 
+ + Args: + ernie_ctm (:clss:`ErnieCtmModel`): + An instance of :class:`ErnieCtmModel`. + num_tag (int): + The number of different tags. + crf_lr (float): + The learning rate of the crf. Defaults to `100`. + """ + + def __init__(self, config: ErnieCtmConfig): + super(ErnieCtmWordtagModel, self).__init__(config) + self.num_tag = config.num_labels + self.ernie_ctm = ErnieCtmModel(config) + self.tag_classifier = nn.Linear(config.hidden_size, self.num_tag) + self.crf = LinearChainCrf(self.num_tag, with_start_stop_tag=False) + self.crf_loss = LinearChainCrfLoss(self.crf) + self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, False) + + def forward( + self, + input_ids=None, + token_type_ids=None, + lengths=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + tag_labels=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + **kwargs + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieCtmModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieCtmModel`. + position_ids (Tensor, optional): + See :class:`ErnieCtmModel`. + attention_mask (Tensor, optional): + See :class:`ErnieCtmModel`. + inputs_embeds (Tensor, optional): + See :class:`ErnieCtmModel`. + lengths (Tensor, optional): + The input length. Its dtype is int64 and has a shape of `[batch_size]`. + Defaults to `None`. + tag_labels (Tensor, optional): + The input predicted tensor. + Its dtype is float32 and has a shape of `[batch_size, sequence_length, num_tags]`. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `None`. (currently not supported) + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + + Returns: + tuple: Returns tuple (`seq_logits`, `cls_logits`). + + With the fields: + + - `seq_logits` (Tensor): + A tensor of next sentence prediction logits. + Its data type should be float32 and its shape is [batch_size, sequence_length, num_tag]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieCtmWordtagModel, ErnieCtmTokenizer + + tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm') + model = ErnieCtmWordtagModel.from_pretrained('ernie-ctm', num_tag=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + # author want to keep the name of `tab_labels`, so add this code to keep style consistent with paddlenlp. 
+ tag_labels = kwargs.get("labels", tag_labels) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = self.ernie_ctm( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + sequence_output = outputs[0] + seq_logits = self.tag_classifier(sequence_output) + loss = None + + if lengths is None: + lengths = paddle.sum(input_ids != self.config.pad_token_id, axis=-1) + + if tag_labels is not None: + crf_loss = self.crf_loss(seq_logits, lengths, tag_labels) + seq_loss = F.cross_entropy(seq_logits.reshape((-1, self.num_tag)), tag_labels.reshape((-1,))) + loss = crf_loss + seq_loss + output = (loss, seq_logits) + else: + _, seq_logits = self.viterbi_decoder(seq_logits, lengths) + output = (seq_logits,) + + if not return_dict: + return output + outputs[1:] + + return TokenClassifierOutput( + loss=loss, logits=seq_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +class ErnieCtmMLMHead(Layer): + def __init__(self, config: ErnieCtmConfig): + super(ErnieCtmMLMHead, self).__init__() + self.layer_norm = nn.LayerNorm(config.embedding_size) + + self.bias = self.create_parameter( + [config.vocab_size], is_bias=True, default_initializer=nn.initializer.Constant(value=0.0) + ) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.decoder = nn.Linear(config.embedding_size, config.vocab_size) + self.activation = nn.GELU(approximate=True) + # Link bias + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.decoder(hidden_states) + prediction_scores = hidden_states + return prediction_scores + + +class ErnieCtmNptagModel(ErnieCtmPretrainedModel): + r""" + ErnieCtmNptag Model with a `masked language modeling` head on top. + + Args: + ernie_ctm (:clss:`ErnieCtmModel`): + An instance of :class:`ErnieCtmModel`. + """ + + def __init__(self, config: ErnieCtmConfig): + super(ErnieCtmNptagModel, self).__init__(config) + + self.ernie_ctm = ErnieCtmModel(config) + self.predictions = ErnieCtmMLMHead(config) + + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieCtmModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieCtmModel`. + attention_mask (Tensor, optional): + See :class:`ErnieCtmModel`. + position_ids (Tensor, optional): + See :class:`ErnieCtmModel`. + inputs_embeds (Tensor, optional): + See :class:`ErnieCtmModel`. + output_hidden_states (bool, optional): + See :class:`ErnieCtmModel`. + output_attentions (bool, optional): + See :class:`ErnieCtmModel`. + return_dict (bool, optional): + See :class:`ErnieCtmModel`. + + Returns: + tuple: Returns tensor `logits`, the scores of masked token prediction. 
+ Its data type should be float32 and shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieCtmNptagModel, ErnieCtmTokenizer + + tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm') + model = ErnieCtmNptagModel.from_pretrained('ernie-ctm') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 45, 23000] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + outputs = self.ernie_ctm( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.predictions(sequence_output) + + loss = None + if labels is not None: + loss = F.cross_entropy(logits.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) + + if not return_dict: + outputs = (logits,) + outputs[2:] + return (loss,) + outputs if loss is not None else outputs + + return TokenClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +class ErnieCtmForTokenClassification(ErnieCtmPretrainedModel): + r""" + ERNIECtm Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + ernie (`ErnieModel`): + An instance of `ErnieModel`. + num_tag (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of ERNIE. + If None, use the same value as `hidden_dropout_prob` + of `ErnieCtmModel` instance `ernie`. Defaults to `None`. + """ + + def __init__(self, config: ErnieCtmConfig): + super(ErnieCtmForTokenClassification, self).__init__(config) + self.num_tag = config.num_labels + self.ernie_ctm = ErnieCtmModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Tensor, + token_type_ids: Tensor | None = None, + position_ids: Tensor | None = None, + attention_mask: Tensor | None = None, + inputs_embeds: Tensor | None = None, + labels: Tensor | None = None, + output_hidden_states: bool | None = None, + output_attentions: bool | None = None, + return_dict: bool | None = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieCtmModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieCtmModel`. + position_ids (Tensor, optional): + See :class:`ErnieCtmModel`. + attention_mask (Tensor, optional): + See :class:`ErnieCtmModel`. + inputs_embeds (Tensor, optional): + See :class:`ErnieCtmModel`. + labels (Tensor, optional): labels for model to compute the loss + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[sequence_length, num_tag]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieCtmForTokenClassification, ErnieCtmTokenizer + + tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm') + model = ErnieCtmForTokenClassification.from_pretrained('ernie-ctm') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output = self.ernie_ctm( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = output[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_tag)), labels.reshape((-1,))) + + if not return_dict: + output = (logits,) + output[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=output.hidden_states, + attentions=output.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/tokenizer.py new file mode 100644 index 000000000..51d443b52 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_ctm/tokenizer.py @@ -0,0 +1,282 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .. import PretrainedTokenizer + +__all__ = ["ErnieCtmTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"ernie-ctm": 512, "wordtag": 512, "nptag": 512} + + +class ErnieCtmTokenizer(PretrainedTokenizer): + r""" + Construct an ERNIE-CTM tokenizer. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + File path of the vocabulary. + do_lower_case (bool, optional): + Whether or not to lowercase the input when tokenizing. Defaults to `True` + do_basic_tokenize (bool, optional): + Whether or not to do basic tokenization before WordPiece. 
Defaults to `True` + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token_template (str, optional) + The template of summary token for multiple summary placeholders. Defaults to `"[CLS{}]"` + cls_num (int, optional): + Summary placeholder used in ernie-ctm model. For catching a sentence global feature from multiple aware. + Defaults to `1`. + mask_token (str, optional): + A special token representing a masked token. This is the token used in the masked + language modeling task. This is the token which the model will try to predict the original unmasked ones. + Defaults to `"[MASK]"`. + strip_accents: (bool, optional): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieCtmTokenizer + tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm') + + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs: + # {'input_ids': [101, 98, 153, 150, 99, 168, 146, 164, 99, 146, 99, 161, 166, 161, + # 161, 150, 165, 150, 150, 163, 102], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ernie-ctm": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/vocab.txt", + "wordtag": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/vocab.txt", + "nptag": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm/vocab.txt", + } + } + pretrained_init_configuration = { + "ernie-ctm": {"do_lower_case": True, "cls_num": 2}, + "wordtag": {"do_lower_case": True, "cls_num": 2}, + "nptag": {"do_lower_case": True, "cls_num": 2}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token_template="[CLS{}]", + cls_num=1, + mask_token="[MASK]", + **kwargs + ): + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = ErnieTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.cls_token_template = cls_token_template + self.cls_num = cls_num + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def convert_tokens_to_string(self, tokens): + r""" + Converts a sequence of tokens (list of string) in a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also remove + `##` when converting. + + Args: + tokens (List[str]): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import ErnieCtmTokenizer + tokenizer = ErnieCtmTokenizer.from_pretrained('ernie-ctm') + + tokens = tokenizer.tokenize('He was a puppeteer') + strings = tokenizer.convert_tokens_to_string(tokens) + #he was a puppeteer + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequences for sequence classification tasks by + concatenating and add special tokens. + + A ERNIE-CTM sequence has the following format: + + - single sequence: [CLS0][CLS1]... X [SEP] + - pair of sequences: [CLS0][CLS1]... X [SEP] X [SEP] + + Args: + token_ids_0 (List): + List of IDs to which the special tokens will be added. + token_ids_1 (List, optional): + Optional second list of IDs for sequence pairs. Defaults to ``None``. + + Returns: + List[int]: The input_id with the appropriate special tokens. + """ + cls_token_ids = [ + self.convert_tokens_to_ids(self.cls_token_template.format(sid)) for sid in range(self.cls_num) + ] + if token_ids_1 is None: + return cls_token_ids + token_ids_0 + [self.sep_token_id] + return cls_token_ids + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Creates a special tokens mask from the input sequences. + This method is called when adding special tokens using the tokenizer `encode` method. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of `inputs_ids` for the second sequence. + Defaults to `None`. + already_has_special_tokens (bool, optional): + Whether or not the token list already contains special tokens for the model. + Defaults to `False`. + + Returns: + List[int]: A list of integers which is either 0 or 1: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a token_type mask from the input sequences. + + If `token_ids_1` is not `None`, then a sequence pair + token_type mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 + | first sequence | second sequence | + + Else if `token_ids_1` is `None`, then a single sequence + token_type mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 + | first sequence | + + - 0 stands for the segment id of **first segment tokens**, + - 1 stands for the segment id of **second segment tokens**, + - 2 stands for the segment id of **cls_token**. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of `inputs_ids` for the second sequence. + Defaults to `None`. + + Returns: + List[int]: List of token type IDs according to the given sequence(s). 
+ """ + sep = [self.sep_token_id] + if token_ids_1 is None: + return (self.cls_num + len(token_ids_0 + sep)) * [0] + return (self.cls_num + len(token_ids_0 + sep)) * [0] + len(token_ids_1 + sep) * [1] + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. + Do not put this inside your training loop. + + Args: + pair (bool, optional): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + if pair is True: + return self.cls_num + 2 + else: + return self.cls_num + 1 + + def _tokenize(self, text, **kwargs): + r""" + Converts a string to a list of tokens. + + Args: + text (str): The text to be tokenized. + + Returns: + List[str]: A list of string representing converted tokens. + """ + orig_tokens = list(text) + output_tokens = [] + for token in orig_tokens: + if self.do_lower_case is True: + token = token.lower() + output_tokens.append(token) + return output_tokens diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/configuration.py new file mode 100644 index 000000000..d162b23f5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/configuration.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" DalleBart model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["ERNIE_DOC_PRETRAINED_INIT_CONFIGURATION", "ErnieDocConfig", "ERNIE_DOC_PRETRAINED_RESOURCE_FILES_MAP"] + +ERNIE_DOC_PRETRAINED_INIT_CONFIGURATION = { + "ernie-doc-base-en": { + "attention_dropout_prob": 0.0, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "relu_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "vocab_size": 50265, + "memory_len": 128, + "epsilon": 1e-12, + "pad_token_id": 1, + }, + "ernie-doc-base-zh": { + "attention_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "relu_dropout": 0.0, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "task_type_vocab_size": 3, + "vocab_size": 28000, + "memory_len": 128, + "epsilon": 1e-12, + "pad_token_id": 0, + }, +} + +ERNIE_DOC_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ernie-doc-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-doc-base-en/ernie-doc-base-en.pdparams", + "ernie-doc-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-doc-base-zh/ernie-doc-base-zh.pdparams", + } +} + + +class ErnieDocConfig(PretrainedConfig): + """ + The bare ERNIE-Doc Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + num_hidden_layers (int): + The number of hidden layers in the Transformer encoder. + num_attention_heads (int): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_size (int): + Dimensionality of the embedding layers, encoder layers and pooler layer. + hidden_dropout_prob (int): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_dropout_prob (int): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + relu_dropout (int): + The dropout probability of FFN. + hidden_act (str): + The non-linear activation function of FFN. + memory_len (int): + The number of tokens to cache. If not 0, the last `memory_len` hidden states + in each layer will be cached into memory. + vocab_size (int): + Vocabulary size of `inputs_ids` in `ErnieDocModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieDocModel`. + max_position_embeddings (int): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + task_type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. Defaults to `3`. + normalize_before (bool, optional): + Indicate whether to put layer normalization into preprocessing of MHA and FFN sub-layers. + If True, pre-process is layer normalization and post-precess includes dropout, + residual connection. 
Otherwise, no pre-process and post-precess includes dropout, + residual connection, layer normalization. Defaults to `False`. + epsilon (float, optional): + The `epsilon` parameter used in :class:`paddle.nn.LayerNorm` for + initializing layer normalization layers. Defaults to `1e-5`. + rel_pos_params_sharing (bool, optional): + Whether to share the relative position parameters. + Defaults to `False`. + initializer_range (float, optional): + The standard deviation of the normal initializer for initializing all weight matrices. + Defaults to `0.02`. + pad_token_id (int, optional): + The token id of [PAD] token whose parameters won't be updated when training. + Defaults to `0`. + cls_token_idx (int, optional): + The token id of [CLS] token. Defaults to `-1`. + """ + + model_type = "ernie_doc" + pretrained_init_configuration = ERNIE_DOC_PRETRAINED_INIT_CONFIGURATION + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + + def __init__( + self, + num_hidden_layers=12, + num_attention_heads=12, + hidden_size=768, + hidden_dropout_prob=0.1, + attention_dropout_prob=0.1, + relu_dropout=0.0, + hidden_act="gelu", + memory_len=128, + vocab_size=28000, + max_position_embeddings=512, + task_type_vocab_size=3, + normalize_before=False, + epsilon=1e-5, + rel_pos_params_sharing=False, + initializer_range=0.02, + pad_token_id=0, + cls_token_idx=-1, + **kwargs + ): + super(ErnieDocConfig, self).__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.attention_dropout_prob = attention_dropout_prob + self.relu_dropout = relu_dropout + self.hidden_act = hidden_act + self.memory_len = memory_len + self.hidden_size = hidden_size + self.task_type_vocab_size = task_type_vocab_size + self.normalize_before = normalize_before + self.epsilon = epsilon + self.rel_pos_params_sharing = rel_pos_params_sharing + self.cls_token_idx = cls_token_idx + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/modeling.py new file mode 100644 index 000000000..c39de8b4c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/modeling.py @@ -0,0 +1,808 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .. 
import PretrainedModel, register_base_model +from ..attention_utils import _convert_param_attr_to_list +from .configuration import ( + ERNIE_DOC_PRETRAINED_INIT_CONFIGURATION, + ERNIE_DOC_PRETRAINED_RESOURCE_FILES_MAP, + ErnieDocConfig, +) + +__all__ = [ + "ErnieDocModel", + "ErnieDocPretrainedModel", + "ErnieDocForSequenceClassification", + "ErnieDocForTokenClassification", + "ErnieDocForQuestionAnswering", +] + + +class PointwiseFFN(nn.Layer): + def __init__(self, d_inner_hid, d_hid, dropout_rate, hidden_act, weight_attr=None, bias_attr=None): + super(PointwiseFFN, self).__init__() + self.linear1 = nn.Linear(d_hid, d_inner_hid, weight_attr, bias_attr=bias_attr) + self.dropout = nn.Dropout(dropout_rate, mode="upscale_in_train") + self.linear2 = nn.Linear(d_inner_hid, d_hid, weight_attr, bias_attr=bias_attr) + self.activation = getattr(F, hidden_act) + + def forward(self, x): + return self.linear2(self.dropout(self.activation(self.linear1(x)))) + + +class MultiHeadAttention(nn.Layer): + def __init__( + self, + d_key, + d_value, + d_model, + n_head=1, + r_w_bias=None, + r_r_bias=None, + r_t_bias=None, + dropout_rate=0.0, + weight_attr=None, + bias_attr=None, + ): + super(MultiHeadAttention, self).__init__() + self.d_key = d_key + self.d_value = d_value + self.d_model = d_model + self.n_head = n_head + + assert d_key * n_head == d_model, "d_model must be divisible by n_head" + + self.q_proj = nn.Linear(d_model, d_key * n_head, weight_attr=weight_attr, bias_attr=bias_attr) + self.k_proj = nn.Linear(d_model, d_key * n_head, weight_attr=weight_attr, bias_attr=bias_attr) + self.v_proj = nn.Linear(d_model, d_value * n_head, weight_attr=weight_attr, bias_attr=bias_attr) + self.r_proj = nn.Linear(d_model, d_key * n_head, weight_attr=weight_attr, bias_attr=bias_attr) + self.t_proj = nn.Linear(d_model, d_key * n_head, weight_attr=weight_attr, bias_attr=bias_attr) + self.out_proj = nn.Linear(d_model, d_model, weight_attr=weight_attr, bias_attr=bias_attr) + self.r_w_bias = r_w_bias + self.r_r_bias = r_r_bias + self.r_t_bias = r_t_bias + self.dropout = nn.Dropout(dropout_rate, mode="upscale_in_train") if dropout_rate else None + + def __compute_qkv(self, queries, keys, values, rel_pos, rel_task): + + q = self.q_proj(queries) + k = self.k_proj(keys) + v = self.v_proj(values) + r = self.r_proj(rel_pos) + t = self.t_proj(rel_task) + + return q, k, v, r, t + + def __split_heads(self, x, d_model, n_head): + # x shape: [B, T, H] + x = x.reshape(shape=[0, 0, n_head, d_model // n_head]) + # shape: [B, N, T, HH] + return paddle.transpose(x=x, perm=[0, 2, 1, 3]) + + def __rel_shift(self, x, klen=-1): + """ + To perform relative attention, it should relatively shift the attention score matrix + See more details on: https://github.com/kimiyoung/transformer-xl/issues/8#issuecomment-454458852 + """ + # input shape: [B, N, T, 2 * T + M] + x_shape = x.shape + + x = x.reshape([x_shape[0], x_shape[1], x_shape[3], x_shape[2]]) + x = x[:, :, 1:, :] + x = x.reshape([x_shape[0], x_shape[1], x_shape[2], x_shape[3] - 1]) + + # output shape: [B, N, T, T + M] + return x[:, :, :, :klen] + + def __scaled_dot_product_attention(self, q, k, v, r, t, attn_mask): + q_w, q_r, q_t = q + score_w = paddle.matmul(q_w, k, transpose_y=True) + score_r = paddle.matmul(q_r, r, transpose_y=True) + score_r = self.__rel_shift(score_r, k.shape[2]) + score_t = paddle.matmul(q_t, t, transpose_y=True) + + score = score_w + score_r + score_t + score = score * (self.d_key**-0.5) + if attn_mask is not None: + score += attn_mask + weights = 
F.softmax(score) + if self.dropout: + weights = self.dropout(weights) + out = paddle.matmul(weights, v) + return out + + def __combine_heads(self, x): + if len(x.shape) == 3: + return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + # x shape: [B, N, T, HH] + x = paddle.transpose(x, [0, 2, 1, 3]) + # target shape:[B, T, H] + return x.reshape([0, 0, x.shape[2] * x.shape[3]]) + + def forward(self, queries, keys, values, rel_pos, rel_task, memory, attn_mask): + + if memory is not None and len(memory.shape) > 1: + cat = paddle.concat([memory, queries], 1) + else: + cat = queries + keys, values = cat, cat + + if not ( + len(queries.shape) + == len(keys.shape) + == len(values.shape) + == len(rel_pos.shape) + == len(rel_task.shape) + == 3 + ): + raise ValueError("Inputs: quries, keys, values, rel_pos and rel_task should all be 3-D tensors.") + + q, k, v, r, t = self.__compute_qkv(queries, keys, values, rel_pos, rel_task) + + q_w, q_r, q_t = list(map(lambda x: q + x.unsqueeze([0, 1]), [self.r_w_bias, self.r_r_bias, self.r_t_bias])) + q_w, q_r, q_t = list(map(lambda x: self.__split_heads(x, self.d_model, self.n_head), [q_w, q_r, q_t])) + k, v, r, t = list(map(lambda x: self.__split_heads(x, self.d_model, self.n_head), [k, v, r, t])) + + ctx_multiheads = self.__scaled_dot_product_attention([q_w, q_r, q_t], k, v, r, t, attn_mask) + + out = self.__combine_heads(ctx_multiheads) + out = self.out_proj(out) + return out + + +class ErnieDocEncoderLayer(nn.Layer): + def __init__( + self, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + prepostprocess_dropout, + attention_dropout, + relu_dropout, + hidden_act, + normalize_before=False, + epsilon=1e-5, + rel_pos_params_sharing=False, + r_w_bias=None, + r_r_bias=None, + r_t_bias=None, + weight_attr=None, + bias_attr=None, + ): + self._config = locals() + self._config.pop("self") + self._config.pop("__class__", None) # py3 + super(ErnieDocEncoderLayer, self).__init__() + if not rel_pos_params_sharing: + r_w_bias, r_r_bias, r_t_bias = list( + map( + lambda x: self.create_parameter(shape=[n_head * d_key], dtype="float32"), + ["r_w_bias", "r_r_bias", "r_t_bias"], + ) + ) + + weight_attrs = _convert_param_attr_to_list(weight_attr, 2) + bias_attrs = _convert_param_attr_to_list(bias_attr, 2) + self.attn = MultiHeadAttention( + d_key, + d_value, + d_model, + n_head, + r_w_bias, + r_r_bias, + r_t_bias, + attention_dropout, + weight_attr=weight_attrs[0], + bias_attr=bias_attrs[0], + ) + self.ffn = PointwiseFFN( + d_inner_hid, d_model, relu_dropout, hidden_act, weight_attr=weight_attrs[1], bias_attr=bias_attrs[1] + ) + self.norm1 = nn.LayerNorm(d_model, epsilon=epsilon) + self.norm2 = nn.LayerNorm(d_model, epsilon=epsilon) + self.dropout1 = nn.Dropout(prepostprocess_dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(prepostprocess_dropout, mode="upscale_in_train") + self.d_model = d_model + self.epsilon = epsilon + self.normalize_before = normalize_before + + def forward(self, enc_input, memory, rel_pos, rel_task, attn_mask): + residual = enc_input + if self.normalize_before: + enc_input = self.norm1(enc_input) + + attn_output = self.attn(enc_input, enc_input, enc_input, rel_pos, rel_task, memory, attn_mask) + attn_output = residual + self.dropout1(attn_output) + if not self.normalize_before: + attn_output = self.norm1(attn_output) + residual = attn_output + if self.normalize_before: + attn_output = self.norm2(attn_output) + ffn_output = self.ffn(attn_output) + output = residual + self.dropout2(ffn_output) + if not 
self.normalize_before: + output = self.norm2(output) + return output + + +class ErnieDocEncoder(nn.Layer): + def __init__(self, num_layers, encoder_layer, mem_len): + super(ErnieDocEncoder, self).__init__() + self.layers = nn.LayerList( + [(encoder_layer if i == 0 else type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)] + ) + self.num_layers = num_layers + self.normalize_before = self.layers[0].normalize_before + self.mem_len = mem_len + + def _cache_mem(self, curr_out, prev_mem): + if self.mem_len is None or self.mem_len == 0: + return None + if prev_mem is None: + new_mem = curr_out[:, -self.mem_len :, :] + else: + new_mem = paddle.concat([prev_mem, curr_out], 1)[:, -self.mem_len :, :] + new_mem.stop_gradient = True + return new_mem + + def forward(self, enc_input, memories, rel_pos, rel_task, attn_mask): + # no need to normalize enc_input, cause it's already normalized outside. + new_mem = [] + for i, encoder_layer in enumerate(self.layers): + enc_input = encoder_layer(enc_input, memories[i], rel_pos, rel_task, attn_mask) + new_mem += [self._cache_mem(enc_input, memories[i])] + # free the old memories explicitly to save gpu memory + memories[i] = None + return enc_input, new_mem + + +class ErnieDocPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ErnieDoc models. It provides ErnieDoc related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading + and loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + base_model_prefix = "ernie_doc" + config_class = ErnieDocConfig + resource_files_names = {"model_state": "model_state.pdparams"} + + pretrained_init_configuration = ERNIE_DOC_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIE_DOC_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + # Initialization hook + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
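+            # redraws the weight from a normal distribution with mean 0.0 and std = config.initializer_range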
+ if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +class ErnieDocEmbeddings(nn.Layer): + def __init__(self, config: ErnieDocConfig): + super(ErnieDocEmbeddings, self).__init__() + self.word_emb = nn.Embedding(config.vocab_size, config.hidden_size) + self.pos_emb = nn.Embedding(config.max_position_embeddings * 2 + config.memory_len, config.hidden_size) + self.token_type_emb = nn.Embedding(config.task_type_vocab_size, config.hidden_size) + self.memory_len = config.memory_len + self.dropouts = nn.LayerList([nn.Dropout(config.hidden_dropout_prob) for i in range(3)]) + self.norms = nn.LayerList([nn.LayerNorm(config.hidden_size) for i in range(3)]) + + def forward(self, input_ids, token_type_ids, position_ids): + # input_embeddings: [B, T, H] + input_embeddings = self.word_emb(input_ids.squeeze(-1)) + # position_embeddings: [B, 2 * T + M, H] + position_embeddings = self.pos_emb(position_ids.squeeze(-1)) + + batch_size = input_ids.shape[0] + token_type_ids = paddle.concat( + [ + paddle.zeros(shape=[batch_size, self.memory_len, 1], dtype="int64") + token_type_ids[0, 0, 0], + token_type_ids, + ], + axis=1, + ) + token_type_ids.stop_gradient = True + # token_type_embeddings: [B, M + T, H] + token_type_embeddings = self.token_type_emb(token_type_ids.squeeze(-1)) + embs = [input_embeddings, position_embeddings, token_type_embeddings] + for i in range(len(embs)): + embs[i] = self.dropouts[i](self.norms[i](embs[i])) + return embs + + +class ErnieDocPooler(nn.Layer): + """ + get pool output + """ + + def __init__(self, config: ErnieDocConfig): + super(ErnieDocPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + self.cls_token_idx = config.cls_token_idx + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the last token. 
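+        # cls_token_idx defaults to -1, i.e. the [CLS]-style token that ERNIE-Doc places at the end of the sequence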
+ cls_token_tensor = hidden_states[:, self.cls_token_idx] + pooled_output = self.dense(cls_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@register_base_model +class ErnieDocModel(ErnieDocPretrainedModel): + def __init__(self, config: ErnieDocConfig): + super(ErnieDocModel, self).__init__(config) + r_w_bias, r_r_bias, r_t_bias = None, None, None + if config.rel_pos_params_sharing: + r_w_bias, r_r_bias, r_t_bias = list( + map( + lambda x: self.create_parameter(shape=[config.num_attention_heads * d_key], dtype="float32"), + ["r_w_bias", "r_r_bias", "r_t_bias"], + ) + ) + d_key = config.hidden_size // config.num_attention_heads + d_value = config.hidden_size // config.num_attention_heads + d_inner_hid = config.hidden_size * 4 + encoder_layer = ErnieDocEncoderLayer( + config.num_attention_heads, + d_key, + d_value, + config.hidden_size, + d_inner_hid, + config.hidden_dropout_prob, + config.attention_dropout_prob, + config.relu_dropout, + config.hidden_act, + normalize_before=config.normalize_before, + epsilon=config.epsilon, + rel_pos_params_sharing=config.rel_pos_params_sharing, + r_w_bias=r_w_bias, + r_r_bias=r_r_bias, + r_t_bias=r_t_bias, + ) + self.initializer_range = config.initializer_range + self.n_head = config.num_attention_heads + self.hidden_size = config.hidden_size + self.memory_len = config.memory_len + self.encoder = ErnieDocEncoder(config.num_hidden_layers, encoder_layer, config.memory_len) + self.pad_token_id = config.pad_token_id + self.embeddings = ErnieDocEmbeddings(config) + self.pooler = ErnieDocPooler(config) + + def _create_n_head_attn_mask(self, attn_mask, batch_size): + # attn_mask shape: [B, T, 1] + # concat an data_mask, shape: [B, M + T, 1] + data_mask = paddle.concat( + [paddle.ones(shape=[batch_size, self.memory_len, 1], dtype=attn_mask.dtype), attn_mask], axis=1 + ) + data_mask.stop_gradient = True + # create a self_attn_mask, shape: [B, T, M + T] + self_attn_mask = paddle.matmul(attn_mask, data_mask, transpose_y=True) + self_attn_mask = (self_attn_mask - 1) * 1e8 + n_head_self_attn_mask = paddle.stack([self_attn_mask] * self.n_head, axis=1) + n_head_self_attn_mask.stop_gradient = True + return n_head_self_attn_mask + + def get_input_embeddings(self): + return self.embeddings.word_emb + + def set_input_embeddings(self, value): + self.embeddings.word_emb = value + + def forward(self, input_ids, memories, token_type_ids, position_ids, attn_mask): + r""" + The ErnieDocModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length, 1]. + memories (List[Tensor]): + A list of length `n_layers` with each Tensor being a pre-computed hidden-state for each layer. + Each Tensor has a dtype `float32` and a shape of [batch_size, sequence_length, hidden_size]. + token_type_ids (Tensor): + Segment token indices to indicate first and second portions of the inputs. + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of [batch_size, sequence_length, 1]. + Defaults to None, which means no segment embeddings is added to token embeddings. + position_ids (Tensor): + Indices of positions of each input sequence tokens in the position embeddings. 
Selected in the range ``[0, + config.max_position_embeddings - 1]``. Shape as `(batch_sie, num_tokens)` and dtype as `int32` or `int64`. + attn_mask (Tensor): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + + Returns: + tuple : Returns tuple (``encoder_output``, ``pooled_output``, ``new_mem``). + + With the fields: + + - `encoder_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + - `new_mem` (List[Tensor]): + A list of pre-computed hidden-states. The length of the list is `n_layers`. + Each element in the list is a Tensor with dtype `float32` and shape as [batch_size, memory_length, hidden_size]. + + Example: + .. 
code-block:: + + import numpy as np + import paddle + from paddlenlp.transformers import ErnieDocModel + from paddlenlp.transformers import ErnieDocTokenizer + + def get_related_pos(insts, seq_len, memory_len=128): + beg = seq_len + seq_len + memory_len + r_position = [list(range(beg - 1, seq_len - 1, -1)) + \ + list(range(0, seq_len)) for i in range(len(insts))] + return np.array(r_position).astype('int64').reshape([len(insts), beg, 1]) + + tokenizer = ErnieDocTokenizer.from_pretrained('ernie-doc-base-zh') + model = ErnieDocModel.from_pretrained('ernie-doc-base-zh') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v + [0] * (128-len(v))]).unsqueeze(-1) for (k, v) in inputs.items()} + + memories = [paddle.zeros([1, 128, 768], dtype="float32") for _ in range(12)] + position_ids = paddle.to_tensor(get_related_pos(inputs['input_ids'], 128, 128)) + attn_mask = paddle.ones([1, 128, 1]) + + inputs['memories'] = memories + inputs['position_ids'] = position_ids + inputs['attn_mask'] = attn_mask + + outputs = model(**inputs) + + encoder_output = outputs[0] + pooled_output = outputs[1] + new_mem = outputs[2] + + """ + + input_embeddings, position_embeddings, token_embeddings = self.embeddings( + input_ids, token_type_ids, position_ids + ) + + batch_size = input_embeddings.shape[0] + # [B, N, T, M + T] + n_head_self_attn_mask = self._create_n_head_attn_mask(attn_mask, batch_size) + # memories contains n_layer memory whose shape is [B, M, H] + encoder_output, new_mem = self.encoder( + enc_input=input_embeddings, + memories=memories, + rel_pos=position_embeddings, + rel_task=token_embeddings, + attn_mask=n_head_self_attn_mask, + ) + pooled_output = self.pooler(encoder_output) + return encoder_output, pooled_output, new_mem + + +class ErnieDocForSequenceClassification(ErnieDocPretrainedModel): + """ + ErnieDoc Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ErnieDocConfig`): + An instance of ErnieDocConfig used to construct ErnieDocForSequenceClassification. + """ + + def __init__(self, config: ErnieDocConfig): + super(ErnieDocForSequenceClassification, self).__init__(config) + self.ernie_doc = ErnieDocModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob, + mode="upscale_in_train", + ) + self.linear = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, memories, token_type_ids, position_ids, attn_mask): + r""" + The ErnieDocForSequenceClassification forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`ErnieDocModel`. + memories (List[Tensor]): + See :class:`ErnieDocModel`. + token_type_ids (Tensor): + See :class:`ErnieDocModel`. + position_ids (Tensor): + See :class:`ErnieDocModel`. + attn_mask (Tensor): + See :class:`ErnieDocModel`. + + Returns: + tuple : Returns tuple (`logits`, `mem`). + + With the fields: + + - `logits` (Tensor): + A tensor containing the [CLS] of hidden-states of the model at the output of last layer. + Each Tensor has a data type of `float32` and has a shape of [batch_size, num_labels]. + + - `mem` (List[Tensor]): + A list of pre-computed hidden-states. The length of the list is `n_layers`. + Each element in the list is a Tensor with dtype `float32` and has a shape of + [batch_size, memory_length, hidden_size]. + + Example: + .. 
code-block:: + + import numpy as np + import paddle + from paddlenlp.transformers import ErnieDocForSequenceClassification + from paddlenlp.transformers import ErnieDocTokenizer + + def get_related_pos(insts, seq_len, memory_len=128): + beg = seq_len + seq_len + memory_len + r_position = [list(range(beg - 1, seq_len - 1, -1)) + \ + list(range(0, seq_len)) for i in range(len(insts))] + return np.array(r_position).astype('int64').reshape([len(insts), beg, 1]) + + tokenizer = ErnieDocTokenizer.from_pretrained('ernie-doc-base-zh') + model = ErnieDocForSequenceClassification.from_pretrained('ernie-doc-base-zh', num_labels=2) + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v + [0] * (128-len(v))]).unsqueeze(-1) for (k, v) in inputs.items()} + + memories = [paddle.zeros([1, 128, 768], dtype="float32") for _ in range(12)] + position_ids = paddle.to_tensor(get_related_pos(inputs['input_ids'], 128, 128)) + attn_mask = paddle.ones([1, 128, 1]) + + inputs['memories'] = memories + inputs['position_ids'] = position_ids + inputs['attn_mask'] = attn_mask + + outputs = model(**inputs) + + logits = outputs[0] + mem = outputs[1] + + """ + _, pooled_output, mem = self.ernie_doc(input_ids, memories, token_type_ids, position_ids, attn_mask) + pooled_output = self.dropout(pooled_output) + logits = self.linear(pooled_output) + return logits, mem + + +class ErnieDocForTokenClassification(ErnieDocPretrainedModel): + """ + ErnieDoc Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`ErnieDocConfig`): + An instance of ErnieDocConfig used to construct ErnieDocForTokenClassification. + """ + + def __init__(self, config: ErnieDocConfig): + super(ErnieDocForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.ernie_doc = ErnieDocModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob, + mode="upscale_in_train", + ) + self.linear = nn.Linear(config.hidden_size, self.num_labels) + + def forward(self, input_ids, memories, token_type_ids, position_ids, attn_mask): + r""" + The ErnieDocForTokenClassification forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`ErnieDocModel`. + memories (List[Tensor]): + See :class:`ErnieDocModel`. + token_type_ids (Tensor): + See :class:`ErnieDocModel`. + Defaults to None, which means no segment embeddings is added to token embeddings. + position_ids (Tensor): + See :class:`ErnieDocModel`. + attn_mask (Tensor): + See :class:`ErnieDocModel`. + + Returns: + tuple : Returns tuple (`logits`, `mem`). + + With the fields: + + - `logits` (Tensor): + A tensor containing the hidden-states of the model at the output of last layer. + Each Tensor has a data type of `float32` and has a shape of [batch_size, sequence_length, num_labels]. + + - `mem` (List[Tensor]): + A list of pre-computed hidden-states. The length of the list is `n_layers`. + Each element in the list is a Tensor with dtype `float32` and has a shape of + [batch_size, memory_length, hidden_size]. + + Example: + .. 
code-block:: + + import numpy as np + import paddle + from paddlenlp.transformers import ErnieDocForTokenClassification + from paddlenlp.transformers import ErnieDocTokenizer + + def get_related_pos(insts, seq_len, memory_len=128): + beg = seq_len + seq_len + memory_len + r_position = [list(range(beg - 1, seq_len - 1, -1)) + \ + list(range(0, seq_len)) for i in range(len(insts))] + return np.array(r_position).astype('int64').reshape([len(insts), beg, 1]) + + tokenizer = ErnieDocTokenizer.from_pretrained('ernie-doc-base-zh') + model = ErnieDocForTokenClassification.from_pretrained('ernie-doc-base-zh', num_labels=2) + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v + [0] * (128-len(v))]).unsqueeze(-1) for (k, v) in inputs.items()} + + memories = [paddle.zeros([1, 128, 768], dtype="float32") for _ in range(12)] + position_ids = paddle.to_tensor(get_related_pos(inputs['input_ids'], 128, 128)) + attn_mask = paddle.ones([1, 128, 1]) + + inputs['memories'] = memories + inputs['position_ids'] = position_ids + inputs['attn_mask'] = attn_mask + + outputs = model(**inputs) + + logits = outputs[0] + mem = outputs[1] + + """ + sequence_output, _, mem = self.ernie_doc(input_ids, memories, token_type_ids, position_ids, attn_mask) + sequence_output = self.dropout(sequence_output) + logits = self.linear(sequence_output) + return logits, mem + + +class ErnieDocForQuestionAnswering(ErnieDocPretrainedModel): + """ + ErnieDoc Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD. + + Args: + config (:class:`ErnieDocConfig`): + An instance of ErnieDocConfig used to construct ErnieDocForQuestionAnswering. + """ + + def __init__(self, config: ErnieDocConfig): + super(ErnieDocForQuestionAnswering, self).__init__(config) + self.ernie_doc = ErnieDocModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob, + mode="upscale_in_train", + ) + self.linear = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, memories, token_type_ids, position_ids, attn_mask): + r""" + The ErnieDocForQuestionAnswering forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`ErnieDocModel`. + memories (List[Tensor]): + See :class:`ErnieDocModel`. + token_type_ids (Tensor): + See :class:`ErnieDocModel`. + position_ids (Tensor): + See :class:`ErnieDocModel`. + attn_mask (Tensor): + See :class:`ErnieDocModel`. + + Returns: + tuple : Returns tuple (`start_logits`, `end_logits`, `mem`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `mem` (List[Tensor]): + A list of pre-computed hidden-states. The length of the list is `n_layers`. + Each element in the list is a Tensor with dtype `float32` and has a shape of + [batch_size, memory_length, hidden_size]. + + Example: + .. 
code-block:: + + import numpy as np + import paddle + from paddlenlp.transformers import ErnieDocForQuestionAnswering + from paddlenlp.transformers import ErnieDocTokenizer + + def get_related_pos(insts, seq_len, memory_len=128): + beg = seq_len + seq_len + memory_len + r_position = [list(range(beg - 1, seq_len - 1, -1)) + \ + list(range(0, seq_len)) for i in range(len(insts))] + return np.array(r_position).astype('int64').reshape([len(insts), beg, 1]) + + tokenizer = ErnieDocTokenizer.from_pretrained('ernie-doc-base-zh') + model = ErnieDocForQuestionAnswering.from_pretrained('ernie-doc-base-zh') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v + [0] * (128-len(v))]).unsqueeze(-1) for (k, v) in inputs.items()} + + memories = [paddle.zeros([1, 128, 768], dtype="float32") for _ in range(12)] + position_ids = paddle.to_tensor(get_related_pos(inputs['input_ids'], 128, 128)) + attn_mask = paddle.ones([1, 128, 1]) + + inputs['memories'] = memories + inputs['position_ids'] = position_ids + inputs['attn_mask'] = attn_mask + + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + mem = outputs[2] + + """ + sequence_output, _, mem = self.ernie_doc(input_ids, memories, token_type_ids, position_ids, attn_mask) + sequence_output = self.dropout(sequence_output) + logits = self.linear(sequence_output) + start_logits, end_logits = paddle.transpose(logits, perm=[2, 0, 1]) + return start_logits, end_logits, mem diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/tokenizer.py new file mode 100644 index 000000000..904f52a5c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_doc/tokenizer.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import BPETokenizer +from ..ernie.tokenizer import ErnieTokenizer + +__all__ = ["ErnieDocTokenizer", "ErnieDocBPETokenizer"] + + +class ErnieDocTokenizer(ErnieTokenizer): + r""" + Constructs an ERNIE-Doc tokenizer. + It uses a basic tokenizer to do punctuation splitting, lower casing and so on, + and follows a WordPiece tokenizer to tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer`. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". 
+ sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import ErnieDocTokenizer + tokenizer = ErnieDocTokenizer.from_pretrained('ernie-doc-base-zh') + encoded_inputs = tokenizer('He was a puppeteer') + + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ernie-doc-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-doc-base-zh/vocab.txt", + } + } + pretrained_init_configuration = { + "ernie-doc-base-zh": {"do_lower_case": True}, + } + + max_model_input_sizes = { + "ernie-doc-base-zh": 512, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super(ErnieDocTokenizer, self).__init__( + vocab_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + +class ErnieDocBPETokenizer(BPETokenizer): + r""" + Constructs an ERNIE-Doc BPE tokenizer. It uses a bpe tokenizer to do punctuation + splitting, lower casing and so on, then tokenize words as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.BPETokenizer`. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + File path of the vocabulary. + encoder_json_path (str, optional): + File path of the id to vocab. + vocab_bpe_path (str, optional): + File path of word merge text. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import ErnieDocBPETokenizer + tokenizer = ErnieDocBPETokenizer.from_pretrained('ernie-doc-base-en') + encoded_inputs = tokenizer('He was a puppeteer') + + """ + resource_files_names = { + "vocab_file": "vocab.txt", + "encoder_json_path": "encoder.json", + "vocab_bpe_path": "vocab.bpe", + } # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ernie-doc-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-doc-base-en/vocab.txt" + }, + "encoder_json_path": { + "ernie-doc-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-doc-base-en/encoder.json" + }, + "vocab_bpe_path": { + "ernie-doc-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-doc-base-en/vocab.bpe" + }, + } + pretrained_init_configuration = { + "ernie-doc-base-en": {"unk_token": "[UNK]"}, + } + + def __init__( + self, + vocab_file, + encoder_json_path="./configs/encoder.json", + vocab_bpe_path="./configs/vocab.bpe", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super(ErnieDocBPETokenizer, self).__init__( + vocab_file, + encoder_json_path=encoder_json_path, + vocab_bpe_path=vocab_bpe_path, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/modeling.py new file mode 100644 index 000000000..3a0a2f5fa --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/modeling.py @@ -0,0 +1,633 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
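Later in this file, ErnieModel.forward builds its optional causal mask arithmetically instead of through a masking API: with positions 1..n as a column vector s, the comparison (s @ (1/s)^T) >= 1 is true exactly where row >= column, giving the lower-triangular pattern in which each token attends only to itself and earlier positions. A standalone sketch of that arithmetic (numpy is used purely for illustration and is not a dependency added by this patch):

    import numpy as np

    n = 4
    s = np.arange(1, n + 1, dtype="float32").reshape(n, 1)  # positions 1..n as a column vector
    causal = (s @ (1.0 / s).T) >= 1.0                       # ratio i/j >= 1  <=>  i >= j

    # lower-triangular boolean mask: row i may attend to columns 0..i
    assert np.array_equal(causal, np.tril(np.ones((n, n), dtype=bool)))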
+import copy +import io +import json +import os + +import paddle +import six +from paddle import nn +from paddle.nn import functional as F + +from paddlenlp.transformers import ( + BertPretrainedModel, + ElectraPretrainedModel, + ErniePretrainedModel, + RobertaPretrainedModel, +) +from paddlenlp.utils.download import resolve_file_path +from paddlenlp.utils.log import logger + +from .. import PretrainedModel, register_base_model +from ..utils import InitTrackerMeta, fn_args_to_dict + +__all__ = ["ErnieGenPretrainedModel", "ErnieForGeneration", "ErnieGenModel"] + + +def _build_linear(n_in, n_out, name, init): + return nn.Linear( + n_in, + n_out, + weight_attr=paddle.ParamAttr(name="%s.w_0" % name if name is not None else None, initializer=init), + bias_attr="%s.b_0" % name if name is not None else None, + ) + + +def _build_ln(n_in, name): + return nn.LayerNorm( + normalized_shape=n_in, + weight_attr=paddle.ParamAttr( + name="%s_layer_norm_scale" % name if name is not None else None, initializer=nn.initializer.Constant(1.0) + ), + bias_attr=paddle.ParamAttr( + name="%s_layer_norm_bias" % name if name is not None else None, initializer=nn.initializer.Constant(1.0) + ), + ) + + +def append_name(name, postfix): + if name is None: + ret = None + elif name == "": + ret = postfix + else: + ret = "%s_%s" % (name, postfix) + return ret + + +class AttentionLayer(nn.Layer): + def __init__(self, cfg, name=None): + super(AttentionLayer, self).__init__() + initializer = nn.initializer.TruncatedNormal(std=cfg["initializer_range"]) + d_model = cfg["hidden_size"] + n_head = cfg["num_attention_heads"] + assert d_model % n_head == 0 + d_model_q = cfg.get("query_hidden_size_per_head", d_model // n_head) * n_head + d_model_v = cfg.get("value_hidden_size_per_head", d_model // n_head) * n_head + self.n_head = n_head + self.d_key = d_model_q // n_head + self.q = _build_linear(d_model, d_model_q, append_name(name, "query_fc"), initializer) + self.k = _build_linear(d_model, d_model_q, append_name(name, "key_fc"), initializer) + self.v = _build_linear(d_model, d_model_v, append_name(name, "value_fc"), initializer) + self.o = _build_linear(d_model_v, d_model, append_name(name, "output_fc"), initializer) + self.dropout = nn.Dropout(p=cfg["attention_probs_dropout_prob"]) + + def forward(self, queries, keys, values, attn_bias, past_cache): + assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3 + # bsz, q_len, q_dim = queries.shape + # bsz, k_len, k_dim = keys.shape + # bsz, v_len, v_dim = values.shape + # assert k_len == v_len + + q = self.q(queries) + k = self.k(keys) + v = self.v(values) + + cache = (k, v) + if past_cache is not None: + cached_k, cached_v = past_cache + k = paddle.concat([cached_k, k], 1) + v = paddle.concat([cached_v, v], 1) + + q = q.reshape([0, 0, self.n_head, q.shape[-1] // self.n_head]).transpose( + [0, 2, 1, 3] + ) # [batch, head, seq, dim] + k = k.reshape([0, 0, self.n_head, k.shape[-1] // self.n_head]).transpose( + [0, 2, 1, 3] + ) # [batch, head, seq, dim] + v = v.reshape([0, 0, self.n_head, v.shape[-1] // self.n_head]).transpose( + [0, 2, 1, 3] + ) # [batch, head, seq, dim] + + q = q.scale(self.d_key**-0.5) + score = q.matmul(k, transpose_y=True) + if attn_bias is not None: + score += attn_bias + score = F.softmax(score) + score = self.dropout(score) + + out = score.matmul(v).transpose([0, 2, 1, 3]) + out = out.reshape([0, 0, out.shape[2] * out.shape[3]]) + out = self.o(out) + return out, cache + + +class PositionwiseFeedForwardLayer(nn.Layer): + def __init__(self, cfg, 
name=None): + super(PositionwiseFeedForwardLayer, self).__init__() + initializer = nn.initializer.TruncatedNormal(std=cfg["initializer_range"]) + d_model = cfg["hidden_size"] + d_ffn = cfg.get("intermediate_size", 4 * d_model) + self.act = getattr(paddle.nn.functional, cfg["hidden_act"]) + self.i = _build_linear( + d_model, + d_ffn, + append_name(name, "fc_0"), + initializer, + ) + self.o = _build_linear(d_ffn, d_model, append_name(name, "fc_1"), initializer) + prob = cfg.get("intermediate_dropout_prob", 0.0) + self.dropout = nn.Dropout(p=prob) + + def forward(self, inputs): + hidden = self.act(self.i(inputs)) + hidden = self.dropout(hidden) + out = self.o(hidden) + return out + + +class ErnieEncoderLayer(nn.Layer): + def __init__(self, cfg, name=None): + super(ErnieEncoderLayer, self).__init__() + d_model = cfg["hidden_size"] + self.attn = AttentionLayer(cfg, name=append_name(name, "multi_head_att")) + self.ln1 = _build_ln(d_model, name=append_name(name, "post_att")) + self.ffn = PositionwiseFeedForwardLayer(cfg, name=append_name(name, "ffn")) + self.ln2 = _build_ln(d_model, name=append_name(name, "post_ffn")) + prob = cfg.get("intermediate_dropout_prob", cfg["hidden_dropout_prob"]) + self.dropout = nn.Dropout(p=prob) + + def forward(self, inputs, attn_bias=None, past_cache=None): + attn_out, cache = self.attn(inputs, inputs, inputs, attn_bias, past_cache=past_cache) # self attn + attn_out = self.dropout(attn_out) + hidden = attn_out + inputs + hidden = self.ln1(hidden) # dropout/ add/ norm + + ffn_out = self.ffn(hidden) + ffn_out = self.dropout(ffn_out) + hidden = ffn_out + hidden + hidden = self.ln2(hidden) + return hidden, cache + + +class ErnieEncoderStack(nn.Layer): + def __init__(self, cfg, name=None): + super(ErnieEncoderStack, self).__init__() + n_layers = cfg["num_hidden_layers"] + self.block = nn.LayerList([ErnieEncoderLayer(cfg, append_name(name, "layer_%d" % i)) for i in range(n_layers)]) + + def forward(self, inputs, attn_bias=None, past_cache=None): + if past_cache is not None: + assert isinstance(past_cache, tuple), "unknown type of `past_cache`, expect tuple or list. got %s" % repr( + type(past_cache) + ) + past_cache = list(zip(*past_cache)) + else: + past_cache = [None] * len(self.block) + cache_list_k, cache_list_v, hidden_list = [], [], [inputs] + + for b, p in zip(self.block, past_cache): + inputs, cache = b(inputs, attn_bias=attn_bias, past_cache=p) + cache_k, cache_v = cache + cache_list_k.append(cache_k) + cache_list_v.append(cache_v) + hidden_list.append(inputs) + + return inputs, hidden_list, (cache_list_k, cache_list_v) + + +@six.add_metaclass(InitTrackerMeta) +class ErnieGenPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained ErnieGen models. It provides ErnieGen related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ + """ + ernie_gen_pretrained_init_configuration = { + "ernie-gen-base-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 1024, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "ernie-gen-large-en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 1024, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "ernie-gen-large-en-430g": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 1024, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + } + ernie_gen_pretrained_resource_files_map = { + "model_state": { + "ernie-gen-base-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gen-base/ernie_gen_base.pdparams", + "ernie-gen-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gen-large/ernie_gen_large.pdparams", + "ernie-gen-large-en-430g": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gen-large-430g/ernie_gen_large_430g.pdparams", + } + } + + # Support more model to warm start. + pretrained_init_configuration = { + **ernie_gen_pretrained_init_configuration, + **BertPretrainedModel.pretrained_init_configuration, + **ElectraPretrainedModel.pretrained_init_configuration, + **RobertaPretrainedModel.pretrained_init_configuration, + **ErniePretrainedModel.pretrained_init_configuration, + } + pretrained_resource_files_map = { + "model_state": { + **ernie_gen_pretrained_resource_files_map["model_state"], + **BertPretrainedModel.pretrained_resource_files_map["model_state"], + **ElectraPretrainedModel.pretrained_resource_files_map["model_state"], + **RobertaPretrainedModel.pretrained_resource_files_map["model_state"], + **ErniePretrainedModel.pretrained_resource_files_map["model_state"], + } + } + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + + pretrained_models = list(cls.pretrained_init_configuration.keys()) + resource_files = {} + init_configuration = {} + if pretrained_model_name_or_path in pretrained_models: + for file_id, map_list in cls.pretrained_resource_files_map.items(): + resource_files[file_id] = map_list[pretrained_model_name_or_path] + init_configuration = copy.deepcopy(cls.pretrained_init_configuration[pretrained_model_name_or_path]) + else: + if os.path.isdir(pretrained_model_name_or_path): + for file_id, file_name in cls.resource_files_names.items(): + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if os.path.isfile(full_file_name): + resource_files[file_id] = full_file_name + resource_files["model_config_file"] = os.path.join( + pretrained_model_name_or_path, cls.model_config_file + ) + else: + raise ValueError( + 
"Calling {}.from_pretrained() with a model identifier or the " + "path to a directory instead. The supported model " + "identifiers are as follows: {}".format(cls.__name__, cls.pretrained_init_configuration.keys()) + ) + + # default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path) + resolved_resource_files = {} + for file_id, file_path in resource_files.items(): + if file_path is None or os.path.isfile(file_path): + resolved_resource_files[file_id] = file_path + continue + resolved_resource_files[file_id] = resolve_file_path( + pretrained_model_name_or_path, + [file_path], + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + + # Prepare model initialization kwargs + # Did we saved some inputs and kwargs to reload ? + model_config_file = resolved_resource_files.pop("model_config_file", None) + if model_config_file is not None: + with io.open(model_config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + else: + init_kwargs = init_configuration + + # position args are stored in kwargs, maybe better not include + init_args = init_kwargs.pop("init_args", [{}])[0] + if len(init_args) == 0: + init_args = init_kwargs + + name_prefix = kwargs.pop("name", None) + init_kwargs.pop("name", None) + init_args.pop("name", None) + + model = cls(init_args, name=name_prefix) + + weight_path = resolved_resource_files["model_state"] + logger.info("loading pretrained model from %s" % weight_path) + + if os.path.exists(weight_path): + m = paddle.load(weight_path) + params_name = list(m.keys()) + if "mlm.weight" not in params_name: + # ernie_gen is not implemented with paddle.transformer. + # So, when loading the params saved by paddle.transformer, we should convert the params name. + # We will update ernie_gen with paddle.transformer in the future. + name_index_begin = params_name[0].index(".") + 1 + for old_name in params_name: + new_name = ( + old_name[name_index_begin:] + .replace("embeddings.word_embeddings", "word_emb") + .replace("embeddings.position_embeddings", "pos_emb") + .replace("embeddings.token_type_embeddings", "sent_emb") + .replace("embeddings.layer_norm", "ln") + .replace("encoder.layers", "encoder_stack.block") + .replace("self_attn", "attn") + .replace("k_proj", "k") + .replace("q_proj", "q") + .replace("v_proj", "v") + .replace("out_proj", "o") + .replace("linear1", "ffn.i") + .replace("linear2", "ffn.o") + .replace("norm1", "ln1") + .replace("norm2", "ln2") + .replace("pooler.dense", "pooler") + ) + m[new_name] = m.pop(old_name) + for k, v in model.state_dict().items(): + if k not in m: + logger.info("param:%s not set in pretrained model, skip" % k) + m[k] = v # FIXME: no need to do this in the future + model.set_state_dict(m) + else: + raise ValueError("weight file not found in pretrain dir: %s" % weight_path) + return model + + def _post_init(self, original_init, *args, **kwargs): + """ + It would be hooked after `__init__` to add a dict including arguments of + `__init__` as a attribute named `config` of the prtrained model instance. 
+ """ + init_dict = fn_args_to_dict(original_init, *args, **kwargs) + self.config = init_dict + + +@register_base_model +class ErnieModel(ErnieGenPretrainedModel): + def __init__(self, cfg, name=None): + """ + Fundamental pretrained Ernie model + """ + logger.debug("init ErnieModel with config: %s" % repr(cfg)) + nn.Layer.__init__(self) + d_model = cfg["hidden_size"] + d_emb = cfg.get("emb_size", cfg["hidden_size"]) + d_vocab = cfg["vocab_size"] + d_pos = cfg["max_position_embeddings"] + d_sent = cfg.get("sent_type_vocab_size") or cfg["type_vocab_size"] + self.n_head = cfg["num_attention_heads"] + self.return_additional_info = cfg.get("return_additional_info", False) + initializer = nn.initializer.TruncatedNormal(std=cfg["initializer_range"]) + + self.ln = _build_ln(d_model, name=append_name(name, "pre_encoder")) + self.word_emb = nn.Embedding( + d_vocab, + d_emb, + weight_attr=paddle.ParamAttr(name=append_name(name, "word_embedding"), initializer=initializer), + ) + self.pos_emb = nn.Embedding( + d_pos, + d_emb, + weight_attr=paddle.ParamAttr(name=append_name(name, "pos_embedding"), initializer=initializer), + ) + self.sent_emb = nn.Embedding( + d_sent, + d_emb, + weight_attr=paddle.ParamAttr(name=append_name(name, "sent_embedding"), initializer=initializer), + ) + prob = cfg["hidden_dropout_prob"] + self.dropout = nn.Dropout(p=prob) + + self.encoder_stack = ErnieEncoderStack(cfg, append_name(name, "encoder")) + + def forward( + self, + src_ids, + sent_ids=None, + pos_ids=None, + input_mask=None, + attn_bias=None, + past_cache=None, + use_causal_mask=False, + ): + """ + Args: + src_ids (Tensor): + Indices of input sequence tokens in the vocabulary. + They are numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + sent_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + pos_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. + input_mask(Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + attn_bias(Tensor, optional): + 3D version of `input_mask`, if set, overrides `input_mask`; + if set not False, attention mask willed not be applied. 
+ past_cache(Tensor, optional, tuple of two lists: cached key and cached value, + Each is a list of `Variable`s of shape `[batch_size, seq_len, hidden_size]`: + cached key/value tensor that will be concated to generated key/value when performing self attention. + if set, `attn_bias` should not be None. + + Returns: + tuple: Returns tuple (`encoded`, `additional_info`). + + With the fields: + + - `encoded`(Tensor): + The output logits of transformer stack. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `additional_info` (dict): + Additional middle level info, inclues all hidden stats and k/v caches. + """ + assert len(src_ids.shape) == 2, "expect src_ids.shape = [batch, sequecen], got %s" % (repr(src_ids.shape)) + assert ( + attn_bias is not None if past_cache else True + ), "if `past_cache` is specified; attn_bias should not be None" + d_seqlen = src_ids.shape[1] + if pos_ids is None: + pos_ids = paddle.arange(0, d_seqlen, 1, dtype="int32").reshape([1, -1]).cast("int64") + if attn_bias is None: + if input_mask is None: + input_mask = paddle.cast(src_ids != 0, "float32") + assert len(input_mask.shape) == 2 + input_mask = input_mask.unsqueeze(-1) + attn_bias = input_mask.matmul(input_mask, transpose_y=True) + if use_causal_mask: + sequence = paddle.reshape(paddle.arange(0, d_seqlen, 1, dtype="float32") + 1.0, [1, 1, -1, 1]) + causal_mask = (sequence.matmul(1.0 / sequence, transpose_y=True) >= 1.0).cast("float32") + attn_bias *= causal_mask + else: + assert len(attn_bias.shape) == 3, "expect attn_bias tobe rank 3, got %r" % attn_bias.shape + attn_bias = (1.0 - attn_bias) * -10000.0 + attn_bias = attn_bias.unsqueeze(1).tile([1, self.n_head, 1, 1]) # avoid broadcast =_= + + if sent_ids is None: + sent_ids = paddle.zeros_like(src_ids) + + src_embedded = self.word_emb(src_ids) + pos_embedded = self.pos_emb(pos_ids) + sent_embedded = self.sent_emb(sent_ids) + embedded = src_embedded + pos_embedded + sent_embedded + + embedded = self.dropout(self.ln(embedded)) + + encoded, hidden_list, cache_list = self.encoder_stack(embedded, attn_bias, past_cache=past_cache) + + additional_info = { + "hiddens": hidden_list, + "caches": cache_list, + } + + return encoded, additional_info + + +class ErnieForGeneration(ErnieModel): + """ + Ernie Model for sequence to sequence generation. + + This model inherits from :class:`~paddlenlp.transformers.ernie.modeling.ErnieModel`. + Refer to the superclass documentation for the generic methods. + + """ + + def __init__(self, cfg, name=None): + super(ErnieForGeneration, self).__init__(cfg, name=name) + initializer = nn.initializer.TruncatedNormal(std=cfg["initializer_range"]) + d_model = cfg["hidden_size"] + d_vocab = cfg["vocab_size"] + + self.mlm = _build_linear( + d_model, + d_model, + append_name(name, "mask_lm_trans_fc"), + initializer, + ) + self.act = getattr(paddle.nn.functional, cfg["hidden_act"]) + self.mlm_ln = _build_ln(d_model, name=append_name(name, "mask_lm_trans")) + self.mlm_bias = paddle.create_parameter( + dtype="float32", + shape=[d_vocab], + attr=paddle.ParamAttr( + name=append_name(name, "mask_lm_out_fc.b_0"), initializer=nn.initializer.Constant(value=0.0) + ), + is_bias=True, + ) + + def forward(self, *args, **kwargs): + """ + Args: + tgt_labels(Tensor, optional): + The ground truth target sequence id (hard label) or distribution (soft label). + It's data type should be `int64` and has a shape of [batch_size, sequence_length] or + [batch_size, sequence_length, sequence_length]. 
+ tgt_pos(Tensor, optional): + Index of tgt_labels in `src_ids`. + It's data type should be `int64` and has a shape of [n_targets, 2]). + encode_only(bool, optional): + Whether the model will output the logits or only encode the inputs. + If `encode_only` is `True`, `loss` and `logits_2d` will not be returned. + + Returns: + tuple: Returns tuple (`None`, `None`, `info`) if `encode_only` is `True`, + returns (`output_ids`, `logits`, `info`) if `tgt_labels` or `tgt_pos` is `None`, + else, returns (`loss`, `logits_2d`, `info`). + + With the fields: + + - `info`(dict): + Middle level info, includes all hidden stats and k/v caches. + + - `output_ids`(Tensor): + The output index. Its data type should be float32 and its shape is [batch_size]. + If `encode_only`, returns None. + + - `logits`(Tensor): + Logits for every targets. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + If `encode_only`, returns None. + + - `loss`(Tensor): + Cross entropy loss mean over every target label. + If `encode_only`, returns None. + + - `logits_2d`(Tensor): + Logits for every targets if `tgt_labels` or `tgt_pos` is not `None` . + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + """ + tgt_labels = kwargs.pop("tgt_labels", None) + tgt_pos = kwargs.pop("tgt_pos", None) + encode_only = kwargs.pop("encode_only", False) + encoded, info = ErnieModel.forward(self, *args, **kwargs) + if encode_only: + return None, None, info + if tgt_labels is None or tgt_pos is None: + encoded = self.act(self.mlm(encoded)) + encoded = self.mlm_ln(encoded) + logits = encoded.matmul(self.word_emb.weight, transpose_y=True) + self.mlm_bias + output_ids = logits.argmax(-1) + return output_ids, logits, info + else: + encoded_2d = encoded.gather_nd(tgt_pos) + encoded_2d = self.act(self.mlm(encoded_2d)) + encoded_2d = self.mlm_ln(encoded_2d) + logits_2d = encoded_2d.matmul(self.word_emb.weight, transpose_y=True) + self.mlm_bias + if len(tgt_labels.shape) == 1: + tgt_labels = paddle.reshape(tgt_labels, [-1, 1]) + + loss = F.cross_entropy(logits_2d, tgt_labels, reduction="none", soft_label=(tgt_labels.shape[-1] != 1)) + + return loss, logits_2d, info + + +ErnieGenModel = ErnieForGeneration diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/params_map.json b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/params_map.json new file mode 100644 index 000000000..320e940ee --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gen/params_map.json @@ -0,0 +1 @@ +{"embeddings.word_embeddings.weight": "word_emb.weight", "embeddings.position_embeddings.weight": "pos_emb.weight", "embeddings.token_type_embeddings.weight": "sent_emb.weight", "embeddings.layer_norm.weight": "ln.weight", "embeddings.layer_norm.bias": "ln.bias", "encoder.layers.0.self_attn.q_proj.weight": "encoder_stack.block.0.attn.q.weight", "encoder.layers.0.self_attn.q_proj.bias": "encoder_stack.block.0.attn.q.bias", "encoder.layers.0.self_attn.k_proj.weight": "encoder_stack.block.0.attn.k.weight", "encoder.layers.0.self_attn.k_proj.bias": "encoder_stack.block.0.attn.k.bias", "encoder.layers.0.self_attn.v_proj.weight": "encoder_stack.block.0.attn.v.weight", "encoder.layers.0.self_attn.v_proj.bias": "encoder_stack.block.0.attn.v.bias", "encoder.layers.0.self_attn.out_proj.weight": "encoder_stack.block.0.attn.o.weight", "encoder.layers.0.self_attn.out_proj.bias": "encoder_stack.block.0.attn.o.bias", 
"encoder.layers.1.self_attn.q_proj.weight": "encoder_stack.block.1.attn.q.weight", "encoder.layers.1.self_attn.q_proj.bias": "encoder_stack.block.1.attn.q.bias", "encoder.layers.1.self_attn.k_proj.weight": "encoder_stack.block.1.attn.k.weight", "encoder.layers.1.self_attn.k_proj.bias": "encoder_stack.block.1.attn.k.bias", "encoder.layers.1.self_attn.v_proj.weight": "encoder_stack.block.1.attn.v.weight", "encoder.layers.1.self_attn.v_proj.bias": "encoder_stack.block.1.attn.v.bias", "encoder.layers.1.self_attn.out_proj.weight": "encoder_stack.block.1.attn.o.weight", "encoder.layers.1.self_attn.out_proj.bias": "encoder_stack.block.1.attn.o.bias", "encoder.layers.2.self_attn.q_proj.weight": "encoder_stack.block.2.attn.q.weight", "encoder.layers.2.self_attn.q_proj.bias": "encoder_stack.block.2.attn.q.bias", "encoder.layers.2.self_attn.k_proj.weight": "encoder_stack.block.2.attn.k.weight", "encoder.layers.2.self_attn.k_proj.bias": "encoder_stack.block.2.attn.k.bias", "encoder.layers.2.self_attn.v_proj.weight": "encoder_stack.block.2.attn.v.weight", "encoder.layers.2.self_attn.v_proj.bias": "encoder_stack.block.2.attn.v.bias", "encoder.layers.2.self_attn.out_proj.weight": "encoder_stack.block.2.attn.o.weight", "encoder.layers.2.self_attn.out_proj.bias": "encoder_stack.block.2.attn.o.bias", "encoder.layers.3.self_attn.q_proj.weight": "encoder_stack.block.3.attn.q.weight", "encoder.layers.3.self_attn.q_proj.bias": "encoder_stack.block.3.attn.q.bias", "encoder.layers.3.self_attn.k_proj.weight": "encoder_stack.block.3.attn.k.weight", "encoder.layers.3.self_attn.k_proj.bias": "encoder_stack.block.3.attn.k.bias", "encoder.layers.3.self_attn.v_proj.weight": "encoder_stack.block.3.attn.v.weight", "encoder.layers.3.self_attn.v_proj.bias": "encoder_stack.block.3.attn.v.bias", "encoder.layers.3.self_attn.out_proj.weight": "encoder_stack.block.3.attn.o.weight", "encoder.layers.3.self_attn.out_proj.bias": "encoder_stack.block.3.attn.o.bias", "encoder.layers.4.self_attn.q_proj.weight": "encoder_stack.block.4.attn.q.weight", "encoder.layers.4.self_attn.q_proj.bias": "encoder_stack.block.4.attn.q.bias", "encoder.layers.4.self_attn.k_proj.weight": "encoder_stack.block.4.attn.k.weight", "encoder.layers.4.self_attn.k_proj.bias": "encoder_stack.block.4.attn.k.bias", "encoder.layers.4.self_attn.v_proj.weight": "encoder_stack.block.4.attn.v.weight", "encoder.layers.4.self_attn.v_proj.bias": "encoder_stack.block.4.attn.v.bias", "encoder.layers.4.self_attn.out_proj.weight": "encoder_stack.block.4.attn.o.weight", "encoder.layers.4.self_attn.out_proj.bias": "encoder_stack.block.4.attn.o.bias", "encoder.layers.5.self_attn.q_proj.weight": "encoder_stack.block.5.attn.q.weight", "encoder.layers.5.self_attn.q_proj.bias": "encoder_stack.block.5.attn.q.bias", "encoder.layers.5.self_attn.k_proj.weight": "encoder_stack.block.5.attn.k.weight", "encoder.layers.5.self_attn.k_proj.bias": "encoder_stack.block.5.attn.k.bias", "encoder.layers.5.self_attn.v_proj.weight": "encoder_stack.block.5.attn.v.weight", "encoder.layers.5.self_attn.v_proj.bias": "encoder_stack.block.5.attn.v.bias", "encoder.layers.5.self_attn.out_proj.weight": "encoder_stack.block.5.attn.o.weight", "encoder.layers.5.self_attn.out_proj.bias": "encoder_stack.block.5.attn.o.bias", "encoder.layers.6.self_attn.q_proj.weight": "encoder_stack.block.6.attn.q.weight", "encoder.layers.6.self_attn.q_proj.bias": "encoder_stack.block.6.attn.q.bias", "encoder.layers.6.self_attn.k_proj.weight": "encoder_stack.block.6.attn.k.weight", "encoder.layers.6.self_attn.k_proj.bias": 
"encoder_stack.block.6.attn.k.bias", "encoder.layers.6.self_attn.v_proj.weight": "encoder_stack.block.6.attn.v.weight", "encoder.layers.6.self_attn.v_proj.bias": "encoder_stack.block.6.attn.v.bias", "encoder.layers.6.self_attn.out_proj.weight": "encoder_stack.block.6.attn.o.weight", "encoder.layers.6.self_attn.out_proj.bias": "encoder_stack.block.6.attn.o.bias", "encoder.layers.7.self_attn.q_proj.weight": "encoder_stack.block.7.attn.q.weight", "encoder.layers.7.self_attn.q_proj.bias": "encoder_stack.block.7.attn.q.bias", "encoder.layers.7.self_attn.k_proj.weight": "encoder_stack.block.7.attn.k.weight", "encoder.layers.7.self_attn.k_proj.bias": "encoder_stack.block.7.attn.k.bias", "encoder.layers.7.self_attn.v_proj.weight": "encoder_stack.block.7.attn.v.weight", "encoder.layers.7.self_attn.v_proj.bias": "encoder_stack.block.7.attn.v.bias", "encoder.layers.7.self_attn.out_proj.weight": "encoder_stack.block.7.attn.o.weight", "encoder.layers.7.self_attn.out_proj.bias": "encoder_stack.block.7.attn.o.bias", "encoder.layers.8.self_attn.q_proj.weight": "encoder_stack.block.8.attn.q.weight", "encoder.layers.8.self_attn.q_proj.bias": "encoder_stack.block.8.attn.q.bias", "encoder.layers.8.self_attn.k_proj.weight": "encoder_stack.block.8.attn.k.weight", "encoder.layers.8.self_attn.k_proj.bias": "encoder_stack.block.8.attn.k.bias", "encoder.layers.8.self_attn.v_proj.weight": "encoder_stack.block.8.attn.v.weight", "encoder.layers.8.self_attn.v_proj.bias": "encoder_stack.block.8.attn.v.bias", "encoder.layers.8.self_attn.out_proj.weight": "encoder_stack.block.8.attn.o.weight", "encoder.layers.8.self_attn.out_proj.bias": "encoder_stack.block.8.attn.o.bias", "encoder.layers.9.self_attn.q_proj.weight": "encoder_stack.block.9.attn.q.weight", "encoder.layers.9.self_attn.q_proj.bias": "encoder_stack.block.9.attn.q.bias", "encoder.layers.9.self_attn.k_proj.weight": "encoder_stack.block.9.attn.k.weight", "encoder.layers.9.self_attn.k_proj.bias": "encoder_stack.block.9.attn.k.bias", "encoder.layers.9.self_attn.v_proj.weight": "encoder_stack.block.9.attn.v.weight", "encoder.layers.9.self_attn.v_proj.bias": "encoder_stack.block.9.attn.v.bias", "encoder.layers.9.self_attn.out_proj.weight": "encoder_stack.block.9.attn.o.weight", "encoder.layers.9.self_attn.out_proj.bias": "encoder_stack.block.9.attn.o.bias", "encoder.layers.10.self_attn.q_proj.weight": "encoder_stack.block.10.attn.q.weight", "encoder.layers.10.self_attn.q_proj.bias": "encoder_stack.block.10.attn.q.bias", "encoder.layers.10.self_attn.k_proj.weight": "encoder_stack.block.10.attn.k.weight", "encoder.layers.10.self_attn.k_proj.bias": "encoder_stack.block.10.attn.k.bias", "encoder.layers.10.self_attn.v_proj.weight": "encoder_stack.block.10.attn.v.weight", "encoder.layers.10.self_attn.v_proj.bias": "encoder_stack.block.10.attn.v.bias", "encoder.layers.10.self_attn.out_proj.weight": "encoder_stack.block.10.attn.o.weight", "encoder.layers.10.self_attn.out_proj.bias": "encoder_stack.block.10.attn.o.bias", "encoder.layers.11.self_attn.q_proj.weight": "encoder_stack.block.11.attn.q.weight", "encoder.layers.11.self_attn.q_proj.bias": "encoder_stack.block.11.attn.q.bias", "encoder.layers.11.self_attn.k_proj.weight": "encoder_stack.block.11.attn.k.weight", "encoder.layers.11.self_attn.k_proj.bias": "encoder_stack.block.11.attn.k.bias", "encoder.layers.11.self_attn.v_proj.weight": "encoder_stack.block.11.attn.v.weight", "encoder.layers.11.self_attn.v_proj.bias": "encoder_stack.block.11.attn.v.bias", "encoder.layers.11.self_attn.out_proj.weight": 
"encoder_stack.block.11.attn.o.weight", "encoder.layers.11.self_attn.out_proj.bias": "encoder_stack.block.11.attn.o.bias", "encoder.layers.0.linear1.weight": "encoder_stack.block.0.ffn.i.weight", "encoder.layers.0.linear1.bias": "encoder_stack.block.0.ffn.i.bias", "encoder.layers.0.linear2.weight": "encoder_stack.block.0.ffn.o.weight", "encoder.layers.0.linear2.bias": "encoder_stack.block.0.ffn.o.bias", "encoder.layers.1.linear1.weight": "encoder_stack.block.1.ffn.i.weight", "encoder.layers.1.linear1.bias": "encoder_stack.block.1.ffn.i.bias", "encoder.layers.1.linear2.weight": "encoder_stack.block.1.ffn.o.weight", "encoder.layers.1.linear2.bias": "encoder_stack.block.1.ffn.o.bias", "encoder.layers.2.linear1.weight": "encoder_stack.block.2.ffn.i.weight", "encoder.layers.2.linear1.bias": "encoder_stack.block.2.ffn.i.bias", "encoder.layers.2.linear2.weight": "encoder_stack.block.2.ffn.o.weight", "encoder.layers.2.linear2.bias": "encoder_stack.block.2.ffn.o.bias", "encoder.layers.3.linear1.weight": "encoder_stack.block.3.ffn.i.weight", "encoder.layers.3.linear1.bias": "encoder_stack.block.3.ffn.i.bias", "encoder.layers.3.linear2.weight": "encoder_stack.block.3.ffn.o.weight", "encoder.layers.3.linear2.bias": "encoder_stack.block.3.ffn.o.bias", "encoder.layers.4.linear1.weight": "encoder_stack.block.4.ffn.i.weight", "encoder.layers.4.linear1.bias": "encoder_stack.block.4.ffn.i.bias", "encoder.layers.4.linear2.weight": "encoder_stack.block.4.ffn.o.weight", "encoder.layers.4.linear2.bias": "encoder_stack.block.4.ffn.o.bias", "encoder.layers.5.linear1.weight": "encoder_stack.block.5.ffn.i.weight", "encoder.layers.5.linear1.bias": "encoder_stack.block.5.ffn.i.bias", "encoder.layers.5.linear2.weight": "encoder_stack.block.5.ffn.o.weight", "encoder.layers.5.linear2.bias": "encoder_stack.block.5.ffn.o.bias", "encoder.layers.6.linear1.weight": "encoder_stack.block.6.ffn.i.weight", "encoder.layers.6.linear1.bias": "encoder_stack.block.6.ffn.i.bias", "encoder.layers.6.linear2.weight": "encoder_stack.block.6.ffn.o.weight", "encoder.layers.6.linear2.bias": "encoder_stack.block.6.ffn.o.bias", "encoder.layers.7.linear1.weight": "encoder_stack.block.7.ffn.i.weight", "encoder.layers.7.linear1.bias": "encoder_stack.block.7.ffn.i.bias", "encoder.layers.7.linear2.weight": "encoder_stack.block.7.ffn.o.weight", "encoder.layers.7.linear2.bias": "encoder_stack.block.7.ffn.o.bias", "encoder.layers.8.linear1.weight": "encoder_stack.block.8.ffn.i.weight", "encoder.layers.8.linear1.bias": "encoder_stack.block.8.ffn.i.bias", "encoder.layers.8.linear2.weight": "encoder_stack.block.8.ffn.o.weight", "encoder.layers.8.linear2.bias": "encoder_stack.block.8.ffn.o.bias", "encoder.layers.9.linear1.weight": "encoder_stack.block.9.ffn.i.weight", "encoder.layers.9.linear1.bias": "encoder_stack.block.9.ffn.i.bias", "encoder.layers.9.linear2.weight": "encoder_stack.block.9.ffn.o.weight", "encoder.layers.9.linear2.bias": "encoder_stack.block.9.ffn.o.bias", "encoder.layers.10.linear1.weight": "encoder_stack.block.10.ffn.i.weight", "encoder.layers.10.linear1.bias": "encoder_stack.block.10.ffn.i.bias", "encoder.layers.10.linear2.weight": "encoder_stack.block.10.ffn.o.weight", "encoder.layers.10.linear2.bias": "encoder_stack.block.10.ffn.o.bias", "encoder.layers.11.linear1.weight": "encoder_stack.block.11.ffn.i.weight", "encoder.layers.11.linear1.bias": "encoder_stack.block.11.ffn.i.bias", "encoder.layers.11.linear2.weight": "encoder_stack.block.11.ffn.o.weight", "encoder.layers.11.linear2.bias": "encoder_stack.block.11.ffn.o.bias", 
"encoder.layers.0.norm1.weight": "encoder_stack.block.0.ln1.weight", "encoder.layers.0.norm1.bias": "encoder_stack.block.0.ln1.bias", "encoder.layers.1.norm1.weight": "encoder_stack.block.1.ln1.weight", "encoder.layers.1.norm1.bias": "encoder_stack.block.1.ln1.bias", "encoder.layers.2.norm1.weight": "encoder_stack.block.2.ln1.weight", "encoder.layers.2.norm1.bias": "encoder_stack.block.2.ln1.bias", "encoder.layers.3.norm1.weight": "encoder_stack.block.3.ln1.weight", "encoder.layers.3.norm1.bias": "encoder_stack.block.3.ln1.bias", "encoder.layers.4.norm1.weight": "encoder_stack.block.4.ln1.weight", "encoder.layers.4.norm1.bias": "encoder_stack.block.4.ln1.bias", "encoder.layers.5.norm1.weight": "encoder_stack.block.5.ln1.weight", "encoder.layers.5.norm1.bias": "encoder_stack.block.5.ln1.bias", "encoder.layers.6.norm1.weight": "encoder_stack.block.6.ln1.weight", "encoder.layers.6.norm1.bias": "encoder_stack.block.6.ln1.bias", "encoder.layers.7.norm1.weight": "encoder_stack.block.7.ln1.weight", "encoder.layers.7.norm1.bias": "encoder_stack.block.7.ln1.bias", "encoder.layers.8.norm1.weight": "encoder_stack.block.8.ln1.weight", "encoder.layers.8.norm1.bias": "encoder_stack.block.8.ln1.bias", "encoder.layers.9.norm1.weight": "encoder_stack.block.9.ln1.weight", "encoder.layers.9.norm1.bias": "encoder_stack.block.9.ln1.bias", "encoder.layers.10.norm1.weight": "encoder_stack.block.10.ln1.weight", "encoder.layers.10.norm1.bias": "encoder_stack.block.10.ln1.bias", "encoder.layers.11.norm1.weight": "encoder_stack.block.11.ln1.weight", "encoder.layers.11.norm1.bias": "encoder_stack.block.11.ln1.bias", "encoder.layers.0.norm2.weight": "encoder_stack.block.0.ln2.weight", "encoder.layers.0.norm2.bias": "encoder_stack.block.0.ln2.bias", "encoder.layers.1.norm2.weight": "encoder_stack.block.1.ln2.weight", "encoder.layers.1.norm2.bias": "encoder_stack.block.1.ln2.bias", "encoder.layers.2.norm2.weight": "encoder_stack.block.2.ln2.weight", "encoder.layers.2.norm2.bias": "encoder_stack.block.2.ln2.bias", "encoder.layers.3.norm2.weight": "encoder_stack.block.3.ln2.weight", "encoder.layers.3.norm2.bias": "encoder_stack.block.3.ln2.bias", "encoder.layers.4.norm2.weight": "encoder_stack.block.4.ln2.weight", "encoder.layers.4.norm2.bias": "encoder_stack.block.4.ln2.bias", "encoder.layers.5.norm2.weight": "encoder_stack.block.5.ln2.weight", "encoder.layers.5.norm2.bias": "encoder_stack.block.5.ln2.bias", "encoder.layers.6.norm2.weight": "encoder_stack.block.6.ln2.weight", "encoder.layers.6.norm2.bias": "encoder_stack.block.6.ln2.bias", "encoder.layers.7.norm2.weight": "encoder_stack.block.7.ln2.weight", "encoder.layers.7.norm2.bias": "encoder_stack.block.7.ln2.bias", "encoder.layers.8.norm2.weight": "encoder_stack.block.8.ln2.weight", "encoder.layers.8.norm2.bias": "encoder_stack.block.8.ln2.bias", "encoder.layers.9.norm2.weight": "encoder_stack.block.9.ln2.weight", "encoder.layers.9.norm2.bias": "encoder_stack.block.9.ln2.bias", "encoder.layers.10.norm2.weight": "encoder_stack.block.10.ln2.weight", "encoder.layers.10.norm2.bias": "encoder_stack.block.10.ln2.bias", "encoder.layers.11.norm2.weight": "encoder_stack.block.11.ln2.weight", "encoder.layers.11.norm2.bias": "encoder_stack.block.11.ln2.bias", "pooler.dense.weight": "pooler.weight", "pooler.dense.bias": "pooler.bias"} \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/__init__.py new file mode 
100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/configuration.py new file mode 100644 index 000000000..dcdf5b818 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/configuration.py @@ -0,0 +1,160 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Ernie Doc model configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["ERNIE_GRAM_PRETRAINED_INIT_CONFIGURATION", "ErnieGramConfig", "ERNIE_GRAM_PRETRAINED_RESOURCE_FILES_MAP"] + +ERNIE_GRAM_PRETRAINED_INIT_CONFIGURATION = { + "ernie-gram-zh": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18018, + }, + "ernie-gram-zh-finetuned-dureader-robust": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 18018, + }, +} + +ERNIE_GRAM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ernie-gram-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_gram_zh/ernie_gram_zh.pdparams", + "ernie-gram-zh-finetuned-dureader-robust": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie-gram-zh-finetuned-dureader-robust/model_state.pdparams", + }, +} + + +class ErnieGramConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieGramModel`]. It is used to instantiate + an ErnieGram model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the ERNIE-Gram ernie-gram-zh architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (int): + Vocabulary size of the ERNIE-Gram model. It is also the vocab size of the token embedding matrix. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoders. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention targets. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer for initializing all weight matrices. + Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`ErniePretrainedModel._init_weights()` for how weights are initialized in `ErnieGramModel`. + + rel_pos_size (int, optional): + The relative position size, used only for the ERNIE-Gram English model. Defaults to `None`. + pad_token_id (int, optional): + The index of the padding token in the token vocabulary. + Defaults to `0`.
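+
+ `dropout` and `num_classes` are mapped to `classifier_dropout` and `num_labels` respectively via `attribute_map`.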
+ Example: + ```python + >>> from paddlenlp.transformers import ErnieGramConfig, ErnieGramModel + >>> # Initializing an ErnieGram style configuration + >>> configuration = ErnieGramConfig() + >>> # Initializing a model (with random weights) from the ErnieGram-base style configuration + >>> model = ErnieGramModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = ERNIE_GRAM_PRETRAINED_INIT_CONFIGURATION + model_type = "ernie_gram" + + def __init__( + self, + vocab_size=18018, + embedding_size=768, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + pad_token_id=0, + rel_pos_size=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.rel_pos_size = rel_pos_size diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/matching_param_name.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/matching_param_name.py new file mode 100644 index 000000000..643848513 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/matching_param_name.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +def match_embedding_param(convert_parameter_name_dict): + convert_parameter_name_dict["word_emb.weight"] = "embeddings.word_embeddings.weight" + convert_parameter_name_dict["pos_emb.weight"] = "embeddings.position_embeddings.weight" + convert_parameter_name_dict["sent_emb.weight"] = "embeddings.token_type_embeddings.weight" + convert_parameter_name_dict["ln.weight"] = "embeddings.layer_norm.weight" + convert_parameter_name_dict["ln.bias"] = "embeddings.layer_norm.bias" + convert_parameter_name_dict["rel_pos_bias_emb.weight"] = "embeddings.rel_pos_embedding.weight" + return convert_parameter_name_dict + + +def match_encoder_param(convert_parameter_name_dict, layer_num=4): + # Firstly, converts the multihead_attention to the parameter. 
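+ # For example, for layer 0 the loops below add entries such as:
+ #   "encoder_stack.block.0.attn.q.weight" -> "encoder.layers.0.self_attn.q_proj.weight"
+ #   "encoder_stack.block.0.attn.o.bias" -> "encoder.layers.0.self_attn.out_proj.bias"
+ # i.e. ERNIE-style parameter names (keys) map to paddlenlp nn.TransformerEncoder names (values),
+ # the same pairs as ernie_gen/params_map.json but keyed in the opposite direction.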
+ proj_names = ["q", "k", "v", "o"] + param_names = ["weight", "bias"] + nlp_format = "encoder.layers.{}.self_attn.{}_proj.{}" + ernie_format = "encoder_stack.block.{}.attn.{}.{}" + for i in range(0, layer_num): + for proj_name in proj_names: + for param_name in param_names: + if proj_name == "o": + nlp_name = nlp_format.format(i, "out", param_name) + else: + nlp_name = nlp_format.format(i, proj_name, param_name) + ernie_name = ernie_format.format(i, proj_name, param_name) + convert_parameter_name_dict[ernie_name] = nlp_name + + # Secondly, converts the encoder ffn parameter. + nlp_format = "encoder.layers.{}.linear{}.{}" + ernie_format = "encoder_stack.block.{}.ffn.{}.{}" + nlp_param_names = ["1", "2"] + ernie_param_names = ["i", "o"] + param_names = ["weight", "bias"] + for i in range(0, layer_num): + for nlp_name, ernie_name in zip(nlp_param_names, ernie_param_names): + for param_name in param_names: + nlp_format_name = nlp_format.format(i, nlp_name, param_name) + ernie_format_name = ernie_format.format(i, ernie_name, param_name) + convert_parameter_name_dict[ernie_format_name] = nlp_format_name + + # Thirdly, converts the multi_head layer_norm parameter. + nlp_format = "encoder.layers.{}.norm{}.{}" + ernie_format = "encoder_stack.block.{}.ln{}.{}" + proj_names = ["1", "2"] + param_names = ["weight", "bias"] + for i in range(0, layer_num): + for proj_name in proj_names: + for param_name in param_names: + nlp_format_name = nlp_format.format(i, proj_name, param_name) + ernie_format_name = ernie_format.format(i, proj_name, param_name) + convert_parameter_name_dict[ernie_format_name] = nlp_format_name + + return convert_parameter_name_dict + + +def match_pooler_parameter(convert_parameter_name_dict): + convert_parameter_name_dict["pooler.weight"] = "pooler.dense.weight" + convert_parameter_name_dict["pooler.bias"] = "pooler.dense.bias" + return convert_parameter_name_dict + + +def match_mlm_parameter(convert_parameter_name_dict): + # convert_parameter_name_dict["cls.predictions.decoder_weight"] = "word_embedding" + convert_parameter_name_dict["cls.predictions.decoder_bias"] = "mask_lm_out_fc.b_0" + convert_parameter_name_dict["cls.predictions.transform.weight"] = "mask_lm_trans_fc.w_0" + convert_parameter_name_dict["cls.predictions.transform.bias"] = "mask_lm_trans_fc.b_0" + convert_parameter_name_dict["cls.predictions.layer_norm.weight"] = "mask_lm_trans_layer_norm_scale" + convert_parameter_name_dict["cls.predictions.layer_norm.bias"] = "mask_lm_trans_layer_norm_bias" + return convert_parameter_name_dict + + +def write_vocab(vocab_file): + with open(vocab_file, "r", encoding="utf8") as f, open("ernie-gram-zh/new_vocab.txt", "w", encoding="utf8") as nf: + for line in f: + word, word_id = line.strip().split("\t") + nf.write(word + "\n") + + +if __name__ == "__main__": + convert_parameter_name_dict = {} + + convert_parameter_name_dict = match_embedding_param(convert_parameter_name_dict) + convert_parameter_name_dict = match_encoder_param(convert_parameter_name_dict, layer_num=12) + convert_parameter_name_dict = match_pooler_parameter(convert_parameter_name_dict) + ernie_state_dict = paddle.load("./ernie-gram-zh/saved_weights.pdparams") + nlp_state_dict = {} + for name, value in ernie_state_dict.items(): + nlp_name = convert_parameter_name_dict[name] + nlp_state_dict["ernie_gram." 
+ nlp_name] = value + + paddle.save(nlp_state_dict, "./ernie-gram-zh/ernie_gram_zh.pdparams") + + for ernie_name, nlp_name in convert_parameter_name_dict.items(): + print(ernie_name, " ", nlp_name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/modeling.py new file mode 100644 index 000000000..438ee1b95 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/modeling.py @@ -0,0 +1,703 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +from paddle import Tensor + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + tuple_output, +) +from .configuration import ( + ERNIE_GRAM_PRETRAINED_INIT_CONFIGURATION, + ERNIE_GRAM_PRETRAINED_RESOURCE_FILES_MAP, + ErnieGramConfig, +) + +__all__ = [ + "ErnieGramModel", + "ErnieGramPretrainedModel", + "ErnieGramForSequenceClassification", + "ErnieGramForTokenClassification", + "ErnieGramForQuestionAnswering", +] + + +class ErnieGramEmbeddings(nn.Layer): + r""" + Include embeddings from word, position and token_type embeddings. 
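+ The three embeddings are summed and then passed through layer normalization and dropout, i.e. roughly `dropout(layer_norm(word_embeddings + position_embeddings + token_type_embeddings))`.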
+ """ + + def __init__(self, config: ErnieGramConfig): + super(ErnieGramEmbeddings, self).__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + if config.rel_pos_size and config.num_attention_heads: + self.rel_pos_embeddings = nn.Embedding(config.rel_pos_size, config.num_attention_heads) + self.layer_norm = nn.LayerNorm(config.embedding_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values_length: int = 0, + ): + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + input_shape = inputs_embeds.shape[:-1] + + if position_ids is None: + # maybe need use shape op to unify static graph and dynamic graph + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + + if past_key_values_length > 0: + position_ids = position_ids + past_key_values_length + + position_ids.stop_gradient = True + + if token_type_ids is None: + token_type_ids_shape = input_shape + token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ErnieGramPooler(nn.Layer): + def __init__(self, config: ErnieGramConfig, weight_attr=None): + super(ErnieGramPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size, weight_attr=weight_attr) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErnieGramPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained ERNIE-Gram models. It provides ERNIE-Gram related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + pretrained_init_configuration = ERNIE_GRAM_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIE_GRAM_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "ernie_gram" + config_class = ErnieGramConfig + model_config_file = CONFIG_NAME + resource_files_names = {"model_state": "model_state.pdparams"} + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-5 + + +@register_base_model +class ErnieGramModel(ErnieGramPretrainedModel): + r""" + The bare ERNIE-Gram Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ErnieGramConfig`): + An instance of ErnieGramConfig used to construct ErnieGramModel. + """ + + def __init__(self, config: ErnieGramConfig): + super(ErnieGramModel, self).__init__(config) + self.config = config + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = ErnieGramEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = ErnieGramPooler(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate first and second portions of the inputs. + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings is added to token embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + Defaults to `None`. Shape as `(batch_sie, num_tokens)` and dtype as `int32` or `int64`. 
+ attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in ERNIE, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple (``sequence_output``, ``pooled_output``). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieGramModel, ErnieGramTokenizer + + tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh') + model = ErnieGramModel.from_pretrained('ernie-gram-zh') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + sequence_output, pooled_output = model(**inputs) + + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + use_cache = use_cache if use_cache is not None else False + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(encoder_outputs, type(input_ids)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ErnieGramForTokenClassification(ErnieGramPretrainedModel): + r""" + ERNIE-Gram Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks such as NER. + + Args: + config (:class:`ErnieGramConfig`): + An instance of ErnieGramConfig used to construct ErnieGramForTokenClassification.
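+
+ Note: the token classification head is a dropout layer followed by `nn.Linear(config.hidden_size, config.num_labels)` applied at every position of the sequence output.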
+ """ + + def __init__(self, config: ErnieGramConfig): + super(ErnieGramForTokenClassification, self).__init__(config) + self.config = config + self.num_labels = config.num_labels + self.ernie_gram = ErnieGramModel(config) # allow ernie_gram to be config + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear( + config.hidden_size, + config.num_labels, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.TruncatedNormal(std=config.initializer_range)), + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieGramModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieGramModel`. + position_ids (Tensor, optional): + See :class:`ErnieGramModel`. + attention_mask (Tensor, optional): + See :class:`ErnieGramModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + inputs_embeds(Tensor, optional): + See :class:`ErnieGramModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieGramForTokenClassification, ErnieGramTokenizer + + tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh') + model = ErnieGramForTokenClassification.from_pretrained('ernie-gram-zh') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + outputs = self.ernie_gram( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieGramForQuestionAnswering(ErnieGramPretrainedModel): + """ + ERNIE-Gram Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD.. 
+ + Args: + config (:class:`ErnieGramConfig`): + An instance of ErnieGramConfig used to construct ErnieGramForQuestionAnswering. + """ + + def __init__(self, config: ErnieGramConfig): + super(ErnieGramForQuestionAnswering, self).__init__(config) + self.config = config + self.ernie_gram = ErnieGramModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieGramModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieGramModel`. + position_ids (Tensor, optional): + See :class:`ErnieGramModel`. + attention_mask (Tensor, optional): + See :class:`ErnieGramModel`. + inputs_embeds(Tensor, optional): + See :class:`ErnieGramModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieGramForQuestionAnswering, ErnieGramTokenizer + + tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh') + model = ErnieGramForQuestionAnswering.from_pretrained('ernie-gram-zh') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + outputs = self.ernie_gram( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.classifier(outputs[0]) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split adds a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if end_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return tuple_output(output, total_loss) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieGramForSequenceClassification(ErnieGramPretrainedModel): + r""" + ERNIE-Gram Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ErnieGramConfig`): + An instance of ErnieGramConfig used to construct ErnieGramForSequenceClassification. + """ + + def __init__(self, config: ErnieGramConfig): + super(ErnieGramForSequenceClassification, self).__init__(config) + self.config = config + self.num_labels = config.num_labels + self.ernie_gram = ErnieGramModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieGramModel`. + token_type_ids (Tensor, optional): + See :class:`ErnieGramModel`. + position_ids (Tensor, optional): + See :class:`ErnieGramModel`. + attention_mask (Tensor, optional): + See :class:`ErnieGramModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`.
If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + inputs_embeds(Tensor, optional): + See :class:`ErnieGramModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieGramForSequenceClassification, ErnieGramTokenizer + + tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh') + model = ErnieGramForSequenceClassification.from_pretrained('ernie-gram-zh') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + outputs = self.ernie_gram( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = self.dropout(outputs[1]) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/tokenizer.py new file mode 100644 index 000000000..038bade08 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_gram/tokenizer.py @@ -0,0 +1,103 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..ernie.tokenizer import ErnieTokenizer + +__all__ = ["ErnieGramTokenizer"] + + +class ErnieGramTokenizer(ErnieTokenizer): + r""" + Constructs an ERNIE-Gram tokenizer. It uses a basic tokenizer to do punctuation splitting, lower casing and so on, + and follows a WordPiece tokenizer to tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer`. + For more information regarding those methods, please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to `True`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import ErnieGramTokenizer + tokenizer = ErnieGramTokenizer.from_pretrained('ernie-gram-zh') + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs: + # { + # 'input_ids': [1, 4444, 4385, 1545, 6712, 10062, 9568, 9756, 9500, 2], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + # } + + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ernie-gram-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_gram_zh/vocab.txt", + "ernie-gram-zh-finetuned-dureader-robust": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_gram_zh/vocab.txt", + } + } + pretrained_init_configuration = { + "ernie-gram-zh": {"do_lower_case": True}, + "ernie-gram-zh-finetuned-dureader-robust": {"do_lower_case": True}, + } + max_model_input_sizes = { + "ernie-gram-zh": 512, + "ernie-gram-zh-finetuned-dureader-robust": 512, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super(ErnieGramTokenizer, self).__init__( + vocab_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/configuration.py new file mode 100644 index 000000000..59ad9b4dc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/configuration.py @@ -0,0 +1,205 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" ERNIE-Layout model configuration""" + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "ERNIE_LAYOUT_PRETRAINED_INIT_CONFIGURATION", + "ErnieLayoutConfig", + "ERNIE_LAYOUT_PRETRAINED_RESOURCE_FILES_MAP", +] + +ERNIE_LAYOUT_PRETRAINED_INIT_CONFIGURATION = { + "ernie-layoutx-base-uncased": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "coordinate_size": 128, + "eos_token_id": 2, + "gradient_checkpointing": False, + "has_relative_attention_bias": True, + "has_spatial_attention_bias": True, + "has_visual_segment_embedding": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "image_feature_pool_shape": [7, 7, 256], + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 514, + "max_rel_2d_pos": 256, + "max_rel_pos": 128, + "model_type": "ernie_layout", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": True, + "pad_token_id": 1, + "shape_size": 128, + "rel_2d_pos_bins": 64, + "rel_pos_bins": 32, + "type_vocab_size": 100, + "vocab_size": 250002, + }, + "uie-x-base": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "coordinate_size": 128, + "eos_token_id": 2, + "gradient_checkpointing": False, + "has_relative_attention_bias": True, + "has_spatial_attention_bias": True, + "has_visual_segment_embedding": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "image_feature_pool_shape": [7, 7, 256], + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 514, + "max_rel_2d_pos": 256, + "max_rel_pos": 128, + "model_type": "ernie_layout", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": True, + "pad_token_id": 1, + "shape_size": 128, + "rel_2d_pos_bins": 64, + "rel_pos_bins": 32, + "type_vocab_size": 100, + "vocab_size": 250002, + }, +} + +ERNIE_LAYOUT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ernie-layoutx-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_layout/ernie_layoutx_base_uncased.pdparams", + "uie-x-base": "https://bj.bcebos.com/paddlenlp/models/transformers/uie_x/uie_x_base.pdparams", + }, +} + + +class ErnieLayoutConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieLayoutModel`]. It is used to + instantiate a ErnieLayout model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ErnieLayout + ernie-layoutx-base-uncased architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 250002): + Vocabulary size of the ErnieLayout model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ErnieLayoutModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 514 or 1028 or 2056). + type_vocab_size (`int`, *optional*, defaults to 100): + The vocabulary size of the `token_type_ids` passed when calling [`ErnieModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for classifier. + has_visual_segment_embedding (`bool`, *optional*, defaults to `False`): + Whether or not the model has visual segment embedding. + Examples: + ```python + >>> from paddlenlp.transformers import ErnieLayoutModel, ErnieLayoutConfig + >>> # Initializing a ErnieLayout ernie-layoutx-base-uncased configuration + >>> configuration = ErnieLayoutConfig() + >>> # Initializing a model from the style configuration + >>> model = ErnieLayoutModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "ernie_layout" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = ERNIE_LAYOUT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + task_id=0, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + max_2d_position_embeddings: int = 1024, + task_type_vocab_size: int = 3, + type_vocab_size: int = 16, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + fuse: bool = False, + image_feature_pool_shape=[7, 7, 256], + layer_norm_eps=1e-12, + use_cache=False, + use_task_id=True, + classifier_dropout=None, + has_visual_segment_embedding=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.task_id = task_id + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + 
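+        # Note: these constructor defaults differ from the released checkpoints,
+        # which use vocab_size=250002, max_position_embeddings=514 and
+        # type_vocab_size=100 (see ERNIE_LAYOUT_PRETRAINED_INIT_CONFIGURATION).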
self.max_2d_position_embeddings = max_2d_position_embeddings + self.task_type_vocab_size = task_type_vocab_size + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.fuse = fuse + self.image_feature_pool_shape = image_feature_pool_shape + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.use_task_id = use_task_id + self.classifier_dropout = classifier_dropout + self.has_visual_segment_embedding = has_visual_segment_embedding diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/modeling.py new file mode 100644 index 000000000..fde6f36bc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/modeling.py @@ -0,0 +1,1183 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Modeling classes for ErnieLayout model.""" + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import Layer + +from paddlenlp.utils.log import logger + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from .configuration import ( + ERNIE_LAYOUT_PRETRAINED_INIT_CONFIGURATION, + ERNIE_LAYOUT_PRETRAINED_RESOURCE_FILES_MAP, + ErnieLayoutConfig, +) +from .visual_backbone import ResNet + +__all__ = [ + "ErnieLayoutModel", + "ErnieLayoutPretrainedModel", + "ErnieLayoutForTokenClassification", + "ErnieLayoutForSequenceClassification", + "ErnieLayoutForPretraining", + "ErnieLayoutForQuestionAnswering", + "UIEX", +] + + +def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for small + absolute relative_position and larger buckets for larger absolute relative_positions. All relative positions + >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. This should + allow for more graceful generalization to longer sequences than the model has been trained on. 
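+
+    For example, with the defaults used here (bidirectional attention, num_buckets=32,
+    max_distance=128), a relative offset of +3 maps to bucket 19 (16 for the positive
+    direction plus the exact offset 3), while an offset of -3 maps to bucket 3.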
+ + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets) + """ + + ret = 0 + if bidirectional: + num_buckets //= 2 + ret += (relative_position > 0).astype(paddle.int64) * num_buckets + n = paddle.abs(relative_position) + else: + n = paddle.max(-relative_position, paddle.zeros_like(relative_position)) + # Now n is in the range [0, inf) + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = n < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + val_if_large = max_exact + ( + paddle.log(n.astype(paddle.float32) / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).astype(paddle.int64) + + val_if_large = paddle.minimum(val_if_large, paddle.full_like(val_if_large, num_buckets - 1)) + + ret += paddle.where(is_small, n, val_if_large) + return ret + + +class ErnieLayoutPooler(Layer): + def __init__(self, hidden_size, with_pool): + super(ErnieLayoutPooler, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + self.with_pool = with_pool + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.with_pool == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErnieLayoutEmbeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: ErnieLayoutConfig): + super(ErnieLayoutEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def _cal_spatial_position_embeddings(self, bbox): + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox`coordinate values should be within 0-1000 range.") from e + + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0]) + return ( + left_position_embeddings, + upper_position_embeddings, + 
right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ) + + def forward(self, input_ids, bbox=None, token_type_ids=None, position_ids=None): + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + + position_ids = seq_length - ones + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + input_embedings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + + x1, y1, x2, y2, h, w = self.embeddings._cal_spatial_position_embeddings(bbox) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embedings + position_embeddings + x1 + y1 + x2 + y2 + h + w + token_type_embeddings + + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ErnieLayoutPretrainedModel(PretrainedModel): + model_config_file = CONFIG_NAME + pretrained_init_configuration = ERNIE_LAYOUT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIE_LAYOUT_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "ernie_layout" + config_class = ErnieLayoutConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +class ErnieLayoutSelfOutput(nn.Layer): + def __init__(self, config): + super(ErnieLayoutSelfOutput, self).__init__() + self.dense = nn.Linear(config["hidden_size"], config["hidden_size"]) + self.LayerNorm = nn.LayerNorm(config["hidden_size"], epsilon=config["layer_norm_eps"]) + self.dropout = nn.Dropout(config["hidden_dropout_prob"]) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class ErnieLayoutSelfAttention(nn.Layer): + def __init__(self, config): + super(ErnieLayoutSelfAttention, self).__init__() + if config["hidden_size"] % config["num_attention_heads"] != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size {} is not a multiple of the number of attention " + "heads {}".format(config["hidden_size"], config["num_attention_heads"]) + ) + self.num_attention_heads = config["num_attention_heads"] + self.attention_head_size = int(config["hidden_size"] / config["num_attention_heads"]) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.has_relative_attention_bias = config["has_relative_attention_bias"] + self.has_spatial_attention_bias = config["has_spatial_attention_bias"] + + self.query = nn.Linear(config["hidden_size"], self.all_head_size) + self.key = nn.Linear(config["hidden_size"], self.all_head_size) + self.value = nn.Linear(config["hidden_size"], self.all_head_size) + + self.dropout = nn.Dropout(config["attention_probs_dropout_prob"]) + + def transpose_for_scores(self, x): + x = x.reshape([x.shape[0], x.shape[1], self.num_attention_heads, self.attention_head_size]) + return x.transpose([0, 2, 1, 3]) + + def compute_qkv(self, hidden_states): + q = self.query(hidden_states) + k = self.key(hidden_states) + v = self.value(hidden_states) + return q, k, v + + def forward( + self, + 
hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + q, k, v = self.compute_qkv(hidden_states) + + # (B, L, H*D) -> (B, H, L, D) + query_layer = self.transpose_for_scores(q) + key_layer = self.transpose_for_scores(k) + value_layer = self.transpose_for_scores(v) + + query_layer = query_layer / math.sqrt(self.attention_head_size) + # [BSZ, NAT, L, L] + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.has_relative_attention_bias: + attention_scores += rel_pos + if self.has_spatial_attention_bias: + attention_scores += rel_2d_pos + bool_attention_mask = attention_mask.astype(paddle.bool) + bool_attention_mask.stop_gradient = True + attention_scores_shape = attention_scores.shape + attention_scores = paddle.where( + bool_attention_mask.expand(attention_scores_shape), + paddle.ones(attention_scores_shape) * float("-1e10"), + attention_scores, + ) + attention_probs = F.softmax(attention_scores, axis=-1) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose([0, 2, 1, 3]) + context_layer = context_layer.reshape([context_layer.shape[0], context_layer.shape[1], self.all_head_size]) + + if output_attentions: + outputs = [context_layer, attention_probs] + else: + outputs = [context_layer] + return outputs + + +class ErnieLayoutAttention(nn.Layer): + def __init__(self, config): + super(ErnieLayoutAttention, self).__init__() + self.self = ErnieLayoutSelfAttention(config) + self.output = ErnieLayoutSelfOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self.output(self_outputs[0], hidden_states) + # add attentions if we output them + if output_attentions: + outputs = [ + attention_output, + ] + self_outputs[1:] + else: + outputs = [attention_output] + return outputs + + +class ErnieLayoutEncoder(nn.Layer): + def __init__(self, config): + super(ErnieLayoutEncoder, self).__init__() + self.config = config + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layer = nn.LayerList([ErnieLayoutLayer(config) for _ in range(config["num_hidden_layers"])]) + + self.has_relative_attention_bias = config["has_relative_attention_bias"] + self.has_spatial_attention_bias = config["has_spatial_attention_bias"] + if self.has_relative_attention_bias: + self.rel_pos_bins = config["rel_pos_bins"] + self.max_rel_pos = config["max_rel_pos"] + self.rel_pos_onehot_size = config["rel_pos_bins"] + self.rel_pos_bias = paddle.create_parameter( + shape=[self.rel_pos_onehot_size, config["num_attention_heads"]], dtype=paddle.get_default_dtype() + ) + + if self.has_spatial_attention_bias: + self.max_rel_2d_pos = config["max_rel_2d_pos"] + self.rel_2d_pos_bins = config["rel_2d_pos_bins"] + self.rel_2d_pos_onehot_size = 
config["rel_2d_pos_bins"] + self.rel_pos_x_bias = paddle.create_parameter( + shape=[self.rel_2d_pos_onehot_size, config["num_attention_heads"]], dtype=paddle.get_default_dtype() + ) + self.rel_pos_y_bias = paddle.create_parameter( + shape=[self.rel_2d_pos_onehot_size, config["num_attention_heads"]], dtype=paddle.get_default_dtype() + ) + + def _cal_1d_pos_emb(self, hidden_states, position_ids): + rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1) + rel_pos = relative_position_bucket( + rel_pos_mat, + num_buckets=self.rel_pos_bins, + max_distance=self.max_rel_pos, + ) + rel_pos = paddle.nn.functional.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).astype( + hidden_states.dtype + ) + rel_pos = paddle.matmul(rel_pos, self.rel_pos_bias).transpose([0, 3, 1, 2]) + return rel_pos + + def _cal_2d_pos_emb(self, hidden_states, bbox): + position_coord_x = bbox[:, :, 0] + position_coord_y = bbox[:, :, 3] + rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1) + rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1) + rel_pos_x = relative_position_bucket( + rel_pos_x_2d_mat, + num_buckets=self.rel_2d_pos_bins, + max_distance=self.max_rel_2d_pos, + ) + rel_pos_y = relative_position_bucket( + rel_pos_y_2d_mat, + num_buckets=self.rel_2d_pos_bins, + max_distance=self.max_rel_2d_pos, + ) + rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).astype(hidden_states.dtype) + rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).astype(hidden_states.dtype) + rel_pos_x = paddle.matmul(rel_pos_x, self.rel_pos_x_bias).transpose([0, 3, 1, 2]) + rel_pos_y = paddle.matmul(rel_pos_y, self.rel_pos_y_bias).transpose([0, 3, 1, 2]) + rel_2d_pos = rel_pos_x + rel_pos_y + return rel_2d_pos + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + bbox=None, + position_ids=None, + ): + all_hidden_states = () if output_hidden_states else None + + rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids) if self.has_relative_attention_bias else None + rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None + + hidden_save = dict() + hidden_save["input_hidden_states"] = hidden_states + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_values = past_key_values[i] if past_key_values is not None else None + + # gradient_checkpointing is set as False here so we remove some codes here + hidden_save["input_attention_mask"] = attention_mask + hidden_save["input_layer_head_mask"] = layer_head_mask + + if self.enable_recompute and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return tuple(module(*inputs)) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + rel_pos, + rel_2d_pos, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_values, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + + 
hidden_states = layer_outputs[0] + + hidden_save["{}_data".format(i)] = hidden_states + + return (hidden_states,) + + +class ErnieLayoutIntermediate(nn.Layer): + def __init__(self, config): + super(ErnieLayoutIntermediate, self).__init__() + self.dense = nn.Linear(config["hidden_size"], config["intermediate_size"]) + if config["hidden_act"] == "gelu": + self.intermediate_act_fn = nn.GELU() + else: + assert False, "hidden_act is set as: {}, please check it..".format(config["hidden_act"]) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class ErnieLayoutOutput(nn.Layer): + def __init__(self, config): + super(ErnieLayoutOutput, self).__init__() + self.dense = nn.Linear(config["intermediate_size"], config["hidden_size"]) + self.LayerNorm = nn.LayerNorm(config["hidden_size"], epsilon=config["layer_norm_eps"]) + self.dropout = nn.Dropout(config["hidden_dropout_prob"]) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class ErnieLayoutLayer(nn.Layer): + def __init__(self, config): + super(ErnieLayoutLayer, self).__init__() + # since chunk_size_feed_forward is 0 as default, no chunk is needed here. + self.seq_len_dim = 1 + self.attention = ErnieLayoutAttention(config) + self.add_cross_attention = False # default as false + self.intermediate = ErnieLayoutIntermediate(config) + self.output = ErnieLayoutOutput(config) + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_values = past_key_values[:2] if past_key_values is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_values=self_attn_past_key_values, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self_attention_outputs[0] + layer_output = self.feed_forward_chunk(attention_output) + + if output_attentions: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + outputs = [ + layer_output, + ] + list(outputs) + else: + outputs = [layer_output] + return outputs + + +class VisualBackbone(nn.Layer): + def __init__(self, config): + super(VisualBackbone, self).__init__() + + self.backbone = ResNet(layers=101) + + self.register_buffer("pixel_mean", paddle.to_tensor([103.53, 116.28, 123.675]).reshape([3, 1, 1])) + self.register_buffer("pixel_std", paddle.to_tensor([57.375, 57.12, 58.395]).reshape([3, 1, 1])) + + self.pool = nn.AdaptiveAvgPool2D(config["image_feature_pool_shape"][:2]) + + def forward(self, images): + images_input = (paddle.to_tensor(images) - self.pixel_mean) / self.pixel_std + features = self.backbone(images_input) + features = self.pool(features).flatten(start_axis=2).transpose([0, 2, 1]) + return features + + +@register_base_model +class ErnieLayoutModel(ErnieLayoutPretrainedModel): + """ + 
The bare ErnieLayout Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (`int`, *optional*, defaults to 250002): + Vocabulary size of the ErnieLayout model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ErnieLayoutModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 514 or 1028 or 2056). + type_vocab_size (`int`, *optional*, defaults to 100): + The vocabulary size of the `token_type_ids` passed when calling [`ErnieModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for classifier. 
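+
+    Example:
+        .. code-block::
+
+            # A minimal usage sketch, assuming the `ernie-layoutx-base-uncased`
+            # weights can be downloaded; the text and boxes are placeholders.
+            import paddle
+            from paddlenlp.transformers import ErnieLayoutModel, ErnieLayoutTokenizer
+
+            tokenizer = ErnieLayoutTokenizer.from_pretrained('ernie-layoutx-base-uncased')
+            model = ErnieLayoutModel.from_pretrained('ernie-layoutx-base-uncased')
+
+            inputs = tokenizer("Invoice number: 12345")
+            input_ids = paddle.to_tensor([inputs['input_ids']])
+            # One dummy box per token; coordinates must lie in the 0-1000 range.
+            bbox = paddle.zeros([1, input_ids.shape[1], 4], dtype='int64')
+
+            sequence_output, pooled_output = model(input_ids=input_ids, bbox=bbox)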
+ """ + + def __init__(self, config: ErnieLayoutConfig): + super(ErnieLayoutModel, self).__init__(config) + self.has_visual_segment_embedding = config["has_visual_segment_embedding"] + self.embeddings = ErnieLayoutEmbeddings(config) + + self.visual = VisualBackbone(config) + self.visual_proj = nn.Linear(config["image_feature_pool_shape"][-1], config["hidden_size"]) + self.visual_act_fn = nn.GELU() + if self.has_visual_segment_embedding: + self.visual_segment_embedding = self.create_parameter( + shape=[ + config["hidden_size"], + ], + dtype=self.embedding.weight.dtype, + ) + self.visual_LayerNorm = nn.LayerNorm(config["hidden_size"], epsilon=config["layer_norm_eps"]) + self.visual_dropout = nn.Dropout(config["hidden_dropout_prob"]) + self.encoder = ErnieLayoutEncoder(config) + self.pooler = ErnieLayoutPooler(config["hidden_size"], "tanh") + + def _calc_text_embeddings(self, input_ids, bbox, position_ids, token_type_ids): + words_embeddings = self.embeddings.word_embeddings(input_ids) + position_embeddings = self.embeddings.position_embeddings(position_ids) + x1, y1, x2, y2, h, w = self.embeddings._cal_spatial_position_embeddings(bbox) + token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids) + embeddings = words_embeddings + position_embeddings + x1 + y1 + x2 + y2 + w + h + token_type_embeddings + + embeddings = self.embeddings.LayerNorm(embeddings) + embeddings = self.embeddings.dropout(embeddings) + return embeddings + + def _calc_img_embeddings(self, image, bbox, position_ids): + if image is not None: + visual_embeddings = self.visual_act_fn(self.visual_proj(self.visual(image.astype(paddle.float32)))) + position_embeddings = self.embeddings.position_embeddings(position_ids) + x1, y1, x2, y2, h, w = self.embeddings._cal_spatial_position_embeddings(bbox) + if image is not None: + embeddings = visual_embeddings + position_embeddings + x1 + y1 + x2 + y2 + w + h + else: + embeddings = position_embeddings + x1 + y1 + x2 + y2 + w + h + + if self.has_visual_segment_embedding: + embeddings += self.visual_segment_embedding + embeddings = self.visual_LayerNorm(embeddings) + embeddings = self.visual_dropout(embeddings) + return embeddings + + def _calc_visual_bbox(self, image_feature_pool_shape, bbox, visual_shape): + visual_bbox_x = ( + paddle.arange( + 0, + 1000 * (image_feature_pool_shape[1] + 1), + 1000, + dtype=bbox.dtype, + ) + // image_feature_pool_shape[1] + ) + visual_bbox_y = ( + paddle.arange( + 0, + 1000 * (image_feature_pool_shape[0] + 1), + 1000, + dtype=bbox.dtype, + ) + // image_feature_pool_shape[0] + ) + + expand_shape = image_feature_pool_shape[0:2] + visual_bbox = paddle.stack( + [ + visual_bbox_x[:-1].expand(expand_shape), + visual_bbox_y[:-1].expand(expand_shape[::-1]).transpose([1, 0]), + visual_bbox_x[1:].expand(expand_shape), + visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), + ], + axis=-1, + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) + + visual_bbox = visual_bbox.expand([visual_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) + return visual_bbox + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
+ """ + num_position_embeds_diff = new_num_position_embeddings - self.config["max_position_embeddings"] + + # no resizing needs to be done if the length stays the same + if num_position_embeds_diff == 0: + return + + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config["max_position_embeddings"] = new_num_position_embeddings + + old_position_embeddings_weight = self.embeddings.position_embeddings.weight + + self.embeddings.position_embeddings = nn.Embedding( + self.config["max_position_embeddings"], self.config["hidden_size"] + ) + + with paddle.no_grad(): + if num_position_embeds_diff > 0: + self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = old_position_embeddings_weight + else: + self.embeddings.position_embeddings.weight = old_position_embeddings_weight[:num_position_embeds_diff] + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + output_hidden_states=False, + output_attentions=False, + ): + input_shape = input_ids.shape + visual_shape = list(input_shape) + visual_shape[1] = self.config["image_feature_pool_shape"][0] * self.config["image_feature_pool_shape"][1] + visual_bbox = self._calc_visual_bbox(self.config["image_feature_pool_shape"], bbox, visual_shape) + + final_bbox = paddle.concat([bbox, visual_bbox], axis=1) + if attention_mask is None: + attention_mask = paddle.ones(input_shape) + + visual_attention_mask = paddle.ones(visual_shape) + + attention_mask = attention_mask.astype(visual_attention_mask.dtype) + + final_attention_mask = paddle.concat([attention_mask, visual_attention_mask], axis=1) + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + if position_ids is None: + seq_length = input_shape[1] + position_ids = self.embeddings.position_ids[:, :seq_length] + position_ids = position_ids.expand(input_shape) + + visual_position_ids = paddle.arange(0, visual_shape[1]).expand([input_shape[0], visual_shape[1]]) + final_position_ids = paddle.concat([position_ids, visual_position_ids], axis=1) + + if bbox is None: + bbox = paddle.zeros(input_shape + [4]) + + text_layout_emb = self._calc_text_embeddings( + input_ids=input_ids, + bbox=bbox, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + + visual_emb = self._calc_img_embeddings( + image=image, + bbox=visual_bbox, + position_ids=visual_position_ids, + ) + final_emb = paddle.concat([text_layout_emb, visual_emb], axis=1) + + extended_attention_mask = final_attention_mask.unsqueeze(1).unsqueeze(2) + + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config["num_hidden_layers"], -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + else: + head_mask = [None] * self.config["num_hidden_layers"] + + encoder_outputs = self.encoder( + final_emb, + extended_attention_mask, + bbox=final_bbox, + position_ids=final_position_ids, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + return sequence_output, pooled_output + + +class ErnieLayoutForSequenceClassification(ErnieLayoutPretrainedModel): + def __init__(self, 
config: ErnieLayoutConfig): + super(ErnieLayoutForSequenceClassification, self).__init__(config) + self.ernie_layout = ErnieLayoutModel(config) + self.num_labels = config.num_labels + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config["hidden_size"] * 3, config.num_labels) + + def get_input_embeddings(self): + return self.ernie_layout.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.ernie_layout.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None, + ): + input_shape = input_ids.shape + visual_shape = list(input_shape) + visual_shape[1] = ( + self.ernie_layout.config["image_feature_pool_shape"][0] + * self.ernie_layout.config["image_feature_pool_shape"][1] + ) + visual_bbox = self.ernie_layout._calc_visual_bbox( + self.ernie_layout.config["image_feature_pool_shape"], bbox, visual_shape + ) + + visual_position_ids = paddle.arange(0, visual_shape[1]).expand([input_shape[0], visual_shape[1]]) + + initial_image_embeddings = self.ernie_layout._calc_img_embeddings( + image=image, + bbox=visual_bbox, + position_ids=visual_position_ids, + ) + + outputs = self.ernie_layout( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + # sequence out and image out + sequence_output, final_image_embeddings = outputs[0][:, :seq_length], outputs[0][:, seq_length:] + + cls_final_output = sequence_output[:, 0, :] + + # average-pool the visual embeddings + pooled_initial_image_embeddings = initial_image_embeddings.mean(axis=1) + pooled_final_image_embeddings = final_image_embeddings.mean(axis=1) + # concatenate with cls_final_output + sequence_output = paddle.concat( + [cls_final_output, pooled_initial_image_embeddings, pooled_final_image_embeddings], axis=1 + ) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + + loss = loss_fct( + logits.reshape([-1, self.num_labels]), + labels.reshape( + [ + -1, + ] + ), + ) + + outputs = (loss,) + outputs + + return outputs + + +class ErnieLayoutPredictionHead(Layer): + """ + Bert Model with a `language modeling` head on top for CLM fine-tuning. 
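+    Here it serves as the masked-token prediction head used by
+    :class:`ErnieLayoutPretrainingHeads` inside :class:`ErnieLayoutForPretraining`.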
+ """ + + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(ErnieLayoutPredictionHead, self).__init__() + self.transform = nn.Linear(hidden_size, hidden_size) + self.activation = getattr(nn.functional, activation) + self.layer_norm = nn.LayerNorm(hidden_size) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, is_bias=False) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +class ErnieLayoutPretrainingHeads(Layer): + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(ErnieLayoutPretrainingHeads, self).__init__() + self.predictions = ErnieLayoutPredictionHead(hidden_size, vocab_size, activation, embedding_weights) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +class ErnieLayoutForPretraining(ErnieLayoutPretrainedModel): + def __init__(self, config: ErnieLayoutConfig): + super(ErnieLayoutForPretraining, self).__init__(config) + self.ernie_layout = ErnieLayoutModel(config) + self.cls = ErnieLayoutPretrainingHeads( + config.hidden_size, + config.vocab_size, + config.hidden_act, + embedding_weights=self.ernie_layout.embeddings.word_embeddings.weight, + ) + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
+ """ + self.ernie_layout.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + masked_positions=None, + ): + outputs = self.ernie_layout( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions) + return prediction_scores + + +class ErnieLayoutForTokenClassification(ErnieLayoutPretrainedModel): + def __init__(self, config: ErnieLayoutConfig): + super(ErnieLayoutForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.ernie_layout = ErnieLayoutModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config["hidden_size"], config.num_labels) + + def get_input_embeddings(self): + return self.ernie_layout.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.ernie_layout.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None, + ): + outputs = self.ernie_layout( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + sequence_output = outputs[0][:, :seq_length] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + + if attention_mask is not None: + active_loss = ( + attention_mask.reshape( + [ + -1, + ] + ) + == 1 + ) + active_logits = logits.reshape([-1, self.num_labels])[active_loss] + active_labels = labels.reshape( + [ + -1, + ] + )[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.reshape([-1, self.num_labels]), + labels.reshape( + [ + -1, + ] + ), + ) + + outputs = (loss,) + outputs + + return outputs + + +class ErnieLayoutForQuestionAnswering(ErnieLayoutPretrainedModel): + def __init__(self, config: ErnieLayoutConfig): + super(ErnieLayoutForQuestionAnswering, self).__init__(config) + self.num_labels = config.num_labels + self.ernie_layout = ErnieLayoutModel(config) + self.has_visual_segment_embedding = config.has_visual_segment_embedding + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.qa_outputs = nn.Linear(config["hidden_size"], 2) + + def get_input_embeddings(self): + return self.ernie_layout.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + 
bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + start_positions=None, + end_positions=None, + ): + outputs = self.ernie_layout( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + sequence_output = outputs[0][:, :seq_length] + sequence_output = self.dropout(sequence_output) + + if token_type_ids is not None: + span_mask = -token_type_ids * 1e8 + else: + span_mask = 0 + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = paddle.split(logits, num_or_sections=2, axis=-1) + start_logits = start_logits.squeeze(-1) + span_mask + end_logits = end_logits.squeeze(-1) + span_mask + + outputs = (start_logits, end_logits) + outputs[2:] + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.shape) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.shape) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not total_loss: + return outputs + else: + outputs = (total_loss,) + outputs + return outputs + + +class UIEX(ErnieLayoutPretrainedModel): + def __init__(self, config: ErnieLayoutConfig): + super(UIEX, self).__init__(config) + self.ernie_layout = ErnieLayoutModel(config) + self.linear_start = nn.Linear(config.hidden_size, 1) + self.linear_end = nn.Linear(config.hidden_size, 1) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, bbox=None, image=None): + sequence_output, _ = self.ernie_layout( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + bbox=bbox, + image=image, + ) + seq_length = input_ids.shape[1] + sequence_output = sequence_output[:, :seq_length] + start_logits = self.linear_start(sequence_output) + start_logits = paddle.squeeze(start_logits, -1) + start_prob = self.sigmoid(start_logits) + end_logits = self.linear_end(sequence_output) + end_logits = paddle.squeeze(end_logits, -1) + end_prob = self.sigmoid(end_logits) + return start_prob, end_prob diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/tokenizer.py new file mode 100644 index 000000000..49b9df650 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/tokenizer.py @@ -0,0 +1,299 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for ErnieLayout model.""" + +import os +import unicodedata +from typing import List, Optional + +import sentencepiece as spm + +from .. import AddedToken, PretrainedTokenizer +from ..tokenizer_utils import _is_control, _is_punctuation, _is_whitespace + +SPIECE_UNDERLINE = "▁" + + +def _is_end_of_word(text): + """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" + last_char = text[-1] + return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) + + +def _is_start_of_word(text): + """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" + first_char = text[0] + return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) + + +class ErnieLayoutTokenizer(PretrainedTokenizer): + resource_files_names = { + "sentencepiece_model_file": "sentencepiece.bpe.model", + "vocab_file": "vocab.txt", + } # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ernie-layoutx-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_layout/vocab.txt", + "uie-x-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_layout/vocab.txt", + }, + "sentencepiece_model_file": { + "ernie-layoutx-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_layout/sentencepiece.bpe.model", + "uie-x-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_layout/sentencepiece.bpe.model", + }, + } + pretrained_init_configuration = { + "ernie-layoutx-base-uncased": {"do_lower_case": True, "do_tokenize_postprocess": False}, + "uie-x-base": {"do_lower_case": True, "do_tokenize_postprocess": True}, + } + pretrained_positional_embedding_sizes = {"ernie-layoutx-base-uncased": 514, "uie-x-base": 514} + max_model_input_sizes = pretrained_positional_embedding_sizes + # Ernie-M model doesn't have token_type embedding. 
+ model_input_names: List[str] = ["input_ids"] + + SPECIAL_TOKENS_ATTRIBUTES = [ + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__( + self, + vocab_file, + sentencepiece_model_file, + do_tokenize_postprocess=False, + sep_token="[SEP]", + cls_token="[CLS]", + unk_token="[UNK]", + pad_token="[PAD]", + mask_token="[MASK]", + **kwargs + ): + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._sep_token = sep_token + self._cls_token = cls_token + self._unk_token = unk_token + self._pad_token = pad_token + self._mask_token = mask_token + self.sp_model = spm.SentencePieceProcessor() + self.vocab_file = vocab_file + self.sentencepiece_model_file = sentencepiece_model_file + if os.path.isfile(sentencepiece_model_file): + self.sp_model.Load(sentencepiece_model_file) + self.vocab_file = vocab_file + self.do_tokenize_postprocess = do_tokenize_postprocess + + self.tokens_to_ids = {"[CLS]": 0, "[PAD]": 1, "[SEP]": 2, "[UNK]": 3} + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.offset = 1 + + self.tokens_to_ids["[MASK]"] = len(self.sp_model) + self.offset + self.ids_to_tokens = {v: k for k, v in self.tokens_to_ids.items()} + self.SP_CHAR_MAPPING = {} + + for ch in range(65281, 65375): + if ch in [ord("~")]: + self.SP_CHAR_MAPPING[chr(ch)] = chr(ch) + continue + self.SP_CHAR_MAPPING[chr(ch)] = chr(ch - 65248) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + An ERNIE-LayoutX offset_mapping has the following format: + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. + Defaults to `None`. + Returns: + List[tuple]: List of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def get_offset_mapping(self, text): + split_tokens = self.tokenize(text) + normalized_text, char_mapping = "", [] + + for i, ch in enumerate(text): + + if ch in self.SP_CHAR_MAPPING: + ch = self.SP_CHAR_MAPPING.get(ch) + else: + ch = unicodedata.normalize("NFKC", ch) + if self.is_whitespace(ch): + continue + normalized_text += ch + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + for token in split_tokens: + if token[:1] == "▁": + token = token[1:] + if not token: + continue + start = text[offset:].index(token) + offset + end = start + len(token) + + token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1)) + offset = end + return token_mapping + + @property + def vocab_size(self): + return len(self.sp_model) + self.offset + 1 # Add the token + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Tokenize a string.""" + pieces = self.sp_model.EncodeAsPieces(text) + if self.do_tokenize_postprocess: + new_pieces = [] + for piece in pieces: + if piece == SPIECE_UNDERLINE: + continue + lst_i = 0 + for i, c in enumerate(piece): + if c == SPIECE_UNDERLINE: + continue + if self.is_ch_char(c) or self.is_punct(c): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + new_pieces.append(c) + lst_i = i + 1 + elif c.isdigit() and i > 0 and not piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + elif not c.isdigit() and i > 0 and piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + if len(piece) > lst_i: + new_pieces.append(piece[lst_i:]) + pieces = new_pieces + return pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.tokens_to_ids: + return self.tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.ids_to_tokens: + return self.ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def is_ch_char(self, char): + """ + is_ch_char + """ + if "\u4e00" <= char <= "\u9fff": + return 
True + return False + + def is_alpha(self, char): + """ + is_alpha + """ + if "a" <= char <= "z": + return True + if "A" <= char <= "Z": + return True + return False + + def is_punct(self, char): + """ + is_punct + """ + if char in ",;:.?!~,;:。?!《》【】": + return True + return False + + def is_whitespace(self, char): + """ + is whitespace + """ + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + if len(char) == 1: + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/visual_backbone.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/visual_backbone.py new file mode 100644 index 000000000..2ea545643 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_layout/visual_backbone.py @@ -0,0 +1,214 @@ +# -*- coding: utf-8 -*- +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import BatchNorm, Conv2D, MaxPool2D + + +class ConvBNLayer(nn.Layer): + def __init__( + self, num_channels, num_filters, filter_size, stride=1, groups=1, act=None, name=None, data_format="NCHW" + ): + super(ConvBNLayer, self).__init__() + + self._conv = Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + bias_attr=False, + data_format=data_format, + ) + self._batch_norm = BatchNorm(num_filters, act=act, data_layout=data_format) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +class BottleneckBlock(nn.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True, name=None, data_format="NCHW"): + super(BottleneckBlock, self).__init__() + self.conv0 = ConvBNLayer( + num_channels=num_channels, num_filters=num_filters, filter_size=1, act="relu", data_format=data_format + ) + self.conv1 = ConvBNLayer( + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + data_format=data_format, + ) + self.conv2 = ConvBNLayer( + num_channels=num_filters, num_filters=num_filters * 4, filter_size=1, act=None, data_format=data_format + ) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride, + data_format=data_format, + ) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, num_channels, num_filters, stride, shortcut=True, name=None, 
data_format="NCHW"): + super(BasicBlock, self).__init__() + self.stride = stride + self.conv0 = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=3, + stride=stride, + act="relu", + data_format=data_format, + ) + self.conv1 = ConvBNLayer( + num_channels=num_filters, num_filters=num_filters, filter_size=3, act=None, data_format=data_format + ) + + if not shortcut: + self.short = ConvBNLayer( + num_channels=num_channels, + num_filters=num_filters, + filter_size=1, + stride=stride, + data_format=data_format, + ) + + self.shortcut = shortcut + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + y = paddle.add(x=short, y=conv1) + y = F.relu(y) + return y + + +class ResNet(nn.Layer): + def __init__(self, layers=50, class_dim=1000, input_image_channel=3, data_format="NCHW"): + super(ResNet, self).__init__() + + self.layers = layers + self.data_format = data_format + self.input_image_channel = input_image_channel + + supported_layers = [18, 34, 50, 101, 152] + assert layers in supported_layers, "supported layers are {} but input layer is {}".format( + supported_layers, layers + ) + + if layers == 18: + depth = [2, 2, 2, 2] + elif layers == 34 or layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_channels = [64, 256, 512, 1024] if layers >= 50 else [64, 64, 128, 256] + num_filters = [64, 128, 256, 512] + + self.conv = ConvBNLayer( + num_channels=self.input_image_channel, + num_filters=64, + filter_size=7, + stride=2, + act="relu", + data_format=self.data_format, + ) + self.pool2d_max = MaxPool2D(kernel_size=3, stride=2, padding=1, data_format=self.data_format) + + self.block_list = [] + if layers >= 50: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block + 2) + "a" + else: + conv_name = "res" + str(block + 2) + "b" + str(i) + else: + conv_name = "res" + str(block + 2) + chr(97 + i) + bottleneck_block = self.add_sublayer( + conv_name, + BottleneckBlock( + num_channels=num_channels[block] if i == 0 else num_filters[block] * 4, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + data_format=self.data_format, + ), + ) + self.block_list.append(bottleneck_block) + shortcut = True + else: + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + conv_name = "res" + str(block + 2) + chr(97 + i) + basic_block = self.add_sublayer( + conv_name, + BasicBlock( + num_channels=num_channels[block] if i == 0 else num_filters[block], + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + shortcut=shortcut, + data_format=self.data_format, + ), + ) + self.block_list.append(basic_block) + shortcut = True + + def forward(self, inputs): + y = self.conv(inputs) + y = self.pool2d_max(y) + + for block in self.block_list: + y = block(y) + return y diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/configuration.py new file mode 100644 index 000000000..87d8ee768 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/configuration.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ERNIE-M model configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["ERNIE_M_PRETRAINED_INIT_CONFIGURATION", "ErnieMConfig", "ERNIE_M_PRETRAINED_RESOURCE_FILES_MAP"] + +ERNIE_M_PRETRAINED_INIT_CONFIGURATION = { + "ernie-m-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 514, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "vocab_size": 250002, + "pad_token_id": 1, + }, + "ernie-m-large": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 514, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "vocab_size": 250002, + "pad_token_id": 1, + }, + "uie-m-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 514, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "vocab_size": 250002, + "pad_token_id": 1, + }, + "uie-m-large": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 514, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "vocab_size": 250002, + "pad_token_id": 1, + }, +} + +ERNIE_M_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ernie-m-base": "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_m/ernie_m_base.pdparams", + "ernie-m-large": "https://paddlenlp.bj.bcebos.com/models/transformers/ernie_m/ernie_m_large.pdparams", + "uie-m-base": "https://paddlenlp.bj.bcebos.com/models/transformers/uie_m/uie_m_base.pdparams", + "uie-m-large": 
"https://paddlenlp.bj.bcebos.com/models/transformers/uie_m/uie_m_large.pdparams", + } +} + + +class ErnieMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieModel`]. It is used to + instantiate a ERNIE model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ERNIE + ernie-3.0-medium-zh architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `ErnieMModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ErnieMModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer for initializing all weight matrices. + Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`. + + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `1`. 
+ + Examples: + ```python + >>> from paddlenlp.transformers import ErnieMModel, ErnieMConfig + >>> # Initializing a configuration + >>> configuration = ErnieMConfig() + >>> # Initializing a model from the configuration + >>> model = ErnieMModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "ernie_m" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = ERNIE_M_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 250002, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 514, + type_vocab_size: int = 16, + initializer_range: float = 0.02, + pad_token_id: int = 1, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/modeling.py new file mode 100644 index 000000000..aead16f86 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/modeling.py @@ -0,0 +1,834 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +from paddle import Tensor + +from paddlenlp.utils.env import CONFIG_NAME + +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + tuple_output, +) +from .configuration import ( + ERNIE_M_PRETRAINED_INIT_CONFIGURATION, + ERNIE_M_PRETRAINED_RESOURCE_FILES_MAP, + ErnieMConfig, +) + +__all__ = [ + "ErnieMModel", + "ErnieMPretrainedModel", + "ErnieMForSequenceClassification", + "ErnieMForTokenClassification", + "ErnieMForQuestionAnswering", + "ErnieMForMultipleChoice", + "UIEM", +] + + +class ErnieMEmbeddings(nn.Layer): + r""" + Include embeddings from word, position. 
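
Each entry in `ERNIE_M_PRETRAINED_INIT_CONFIGURATION` above only lists the values that differ from the `ErnieMConfig` constructor defaults, so a config for a named checkpoint can be rebuilt by unpacking the matching dict (sketch only; the local import path is an assumption):

    from configuration import (  # hypothetical import path for the file above
        ERNIE_M_PRETRAINED_INIT_CONFIGURATION,
        ErnieMConfig,
    )

    cfg = ErnieMConfig(**ERNIE_M_PRETRAINED_INIT_CONFIGURATION["ernie-m-large"])
    print(cfg.hidden_size, cfg.num_hidden_layers, cfg.num_attention_heads)  # 1024 24 16
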
+ """ + + def __init__(self, config: ErnieMConfig): + super(ErnieMEmbeddings, self).__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values_length: int = 0, + ): + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if position_ids is None: + input_shape = inputs_embeds.shape[:-1] + # maybe need use shape op to unify static graph and dynamic graph + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + + if past_key_values_length > 0: + position_ids = position_ids + past_key_values_length + + position_ids.stop_gradient = True + + position_ids += 2 + + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ErnieMPooler(nn.Layer): + def __init__(self, config: ErnieMConfig): + super(ErnieMPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class ErnieMPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained ERNIE-M models. It provides ERNIE-M related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + + """ + + model_config_file = CONFIG_NAME + config_class = ErnieMConfig + resource_files_names = {"model_state": "model_state.pdparams"} + + pretrained_init_configuration = ERNIE_M_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ERNIE_M_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "ernie_m" + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +@register_base_model +class ErnieMModel(ErnieMPretrainedModel): + r""" + The bare ERNIE-M Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ErnieMConfig`): + An instance of ErnieMConfig used to construct ErnieMModel. 
+ """ + + def __init__(self, config: ErnieMConfig): + super(ErnieMModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = ErnieMEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + dim_feedforward=4 * config.hidden_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + normalize_before=False, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = ErnieMPooler(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. 
+ output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + tuple: Returns tuple (``sequence_output``, ``pooled_output``). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieMModel, ErnieMTokenizer + + tokenizer = ErnieMTokenizer.from_pretrained('ernie-m-base') + model = ErnieMModel.from_pretrained('ernie-m-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + sequence_output, pooled_output = model(**inputs) + + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + use_cache = use_cache if use_cache is not None else False + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + # TODO(linjieccc): fix attention mask after uie-m related models updated + attention_mask = paddle.unsqueeze( + (input_ids == 0).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + 
pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ErnieMForSequenceClassification(ErnieMPretrainedModel): + r""" + Ernie-M Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`ErnieMConfig`): + An instance of ErnieMConfig used to construct ErnieMForSequenceClassification. + """ + + def __init__(self, config: ErnieMConfig): + super(ErnieMForSequenceClassification, self).__init__(config) + self.ernie_m = ErnieMModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieMModel`. + position_ids (Tensor, optional): + See :class:`ErnieMModel`. + attention_mask (Tensor, optional): + See :class:`ErnieMModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ErnieMForSequenceClassification, ErnieMTokenizer + + tokenizer = ErnieMTokenizer.from_pretrained('ernie-m-base') + model = ErnieMForSequenceClassification.from_pretrained('ernie-m-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + outputs = self.ernie_m( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = self.dropout(outputs[1]) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieMForQuestionAnswering(ErnieMPretrainedModel): + """ + Ernie-M Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD. + + Args: + config (:class:`ErnieMConfig`): + An instance of ErnieMConfig used to construct ErnieMForQuestionAnswering. + """ + + def __init__(self, config: ErnieMConfig): + super(ErnieMForQuestionAnswering, self).__init__(config) + self.ernie_m = ErnieMModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieMModel`. + position_ids (Tensor, optional): + See :class:`ErnieMModel`. + attention_mask (Tensor, optional): + See :class:`ErnieMModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieMForQuestionAnswering, ErnieMTokenizer + + tokenizer = ErnieMTokenizer.from_pretrained('ernie-m-base') + model = ErnieMForQuestionAnswering.from_pretrained('ernie-m-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + outputs = self.ernie_m( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.classifier(outputs[0]) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return tuple_output(output, total_loss) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieMForTokenClassification(ErnieMPretrainedModel): + r""" + ERNIE-M Model with a linear layer on top of the hidden-states output layer, 
+ designed for token classification tasks like NER tasks. + + Args: + config (:class:`ErnieMConfig`): + An instance of ErnieMConfig used to construct ErnieMForTokenClassification. + """ + + def __init__(self, config: ErnieMConfig): + super(ErnieMForTokenClassification, self).__init__(config) + self.ernie_m = ErnieMModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieMModel`. + position_ids (Tensor, optional): + See :class:`ErnieMModel`. + attention_mask (Tensor, optional): + See :class:`ErnieMModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieMForTokenClassification, ErnieMTokenizer + + tokenizer = ErnieMTokenizer.from_pretrained('ernie-m-base') + model = ErnieMForTokenClassification.from_pretrained('ernie-m-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + outputs = self.ernie_m( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = self.dropout(outputs[0]) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ErnieMForMultipleChoice(ErnieMPretrainedModel): + """ + ERNIE-M with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. 
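
`ErnieMForTokenClassification.forward` above flattens the `[batch, seq_len, num_labels]` logits to `[batch * seq_len, num_labels]` and the labels to a matching vector before calling `CrossEntropyLoss`. The same reshape on dummy tensors (sketch only, PaddlePaddle assumed):

    import paddle
    import paddle.nn as nn

    batch_size, seq_len, num_labels = 2, 8, 5
    logits = paddle.randn([batch_size, seq_len, num_labels])
    labels = paddle.randint(0, num_labels, [batch_size, seq_len])

    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(logits.reshape((-1, num_labels)), labels.reshape((-1,)))
    print(float(loss))  # mean loss over all batch_size * seq_len tokens
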
+ + Args: + config (:class:`ErnieMConfig`): + An instance of ErnieMConfig used to construct ErnieMForMultipleChoice. + """ + + def __init__(self, config: ErnieMConfig): + super(ErnieMForMultipleChoice, self).__init__(config) + self.ernie_m = ErnieMModel(config) + self.num_choices = config.num_choices + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ErnieMForMultipleChoice forward method, overrides the __call__() special method. + Args: + input_ids (Tensor): + See :class:`ErnieMModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`ErnieMModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`ErnieMModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. 
+ """ + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + outputs = self.ernie_m( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = self.dropout(outputs[1]) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return tuple_output(output, loss) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class UIEM(ErnieMPretrainedModel): + """ + Ernie-M Model with two linear layer on top of the hidden-states + output to compute `start_prob` and `end_prob`, + designed for Universal Information Extraction. + + Args: + config (:class:`ErnieMConfig`): + An instance of ErnieMConfig used to construct UIEM. + """ + + def __init__(self, config: ErnieMConfig): + super(UIEM, self).__init__(config) + self.ernie_m = ErnieMModel(config) + self.linear_start = paddle.nn.Linear(config.hidden_size, 1) + self.linear_end = paddle.nn.Linear(config.hidden_size, 1) + self.sigmoid = nn.Sigmoid() + + def forward(self, input_ids, position_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`ErnieMModel`. + position_ids (Tensor, optional): + See :class:`ErnieMModel`. + attention_mask (Tensor, optional): + See :class:`ErnieMModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import UIEM, ErnieMTokenizer + + tokenizer = ErnieMTokenizer.from_pretrained('uie-m-base') + model = UIEM.from_pretrained('uie-m-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + start_prob, end_prob = model(**inputs) + """ + sequence_output, _ = self.ernie_m( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + start_logits = self.linear_start(sequence_output) + start_logits = paddle.squeeze(start_logits, -1) + start_prob = self.sigmoid(start_logits) + end_logits = self.linear_end(sequence_output) + end_logits = paddle.squeeze(end_logits, -1) + end_prob = self.sigmoid(end_logits) + # TODO: add return dict support + return start_prob, end_prob diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/tokenizer.py new file mode 100644 index 000000000..7a0b8449f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_m/tokenizer.py @@ -0,0 +1,348 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unicodedata +from typing import List, Optional + +import sentencepiece as spm + +from .. import PretrainedTokenizer + +__all__ = ["ErnieMTokenizer"] + +SPIECE_UNDERLINE = "▁" + + +class ErnieMTokenizer(PretrainedTokenizer): + r""" + Constructs a ErnieM tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words. + + Args: + vocab_file (str): + The file path of the vocabulary. + sentencepiece_model_file (str): + The file path of sentencepiece model. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + """ + resource_files_names = { + "sentencepiece_model_file": "sentencepiece.bpe.model", + "vocab_file": "vocab.txt", + } # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ernie-m-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.vocab.txt", + "ernie-m-large": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.vocab.txt", + "uie-m-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.vocab.txt", + "uie-m-large": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.vocab.txt", + }, + "sentencepiece_model_file": { + "ernie-m-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.sentencepiece.bpe.model", + "ernie-m-large": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.sentencepiece.bpe.model", + "uie-m-base": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.sentencepiece.bpe.model", + "uie-m-large": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_m/ernie_m.sentencepiece.bpe.model", + }, + } + pretrained_init_configuration = { + "ernie-m-base": {"do_lower_case": False}, + "ernie-m-large": {"do_lower_case": False}, + "uie-m-base": {"do_lower_case": False}, + "uie-m-large": {"do_lower_case": False}, + } + max_model_input_sizes = {"ernie-m-base": 514, "ernie-m-large": 514, "uie-m-base": 514, "uie-m-large": 514} + # Ernie-M model doesn't have token_type embedding. 
+ model_input_names: List[str] = ["input_ids"] + + def __init__( + self, + vocab_file, + sentencepiece_model_file, + do_lower_case=False, + encoding="utf8", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + self.sp_model = spm.SentencePieceProcessor() + + self.do_lower_case = do_lower_case + self.encoding = encoding + if not os.path.isfile(vocab_file): + raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file)) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.vocab_file = vocab_file + self.sentencepiece_model_file = sentencepiece_model_file + if os.path.isfile(sentencepiece_model_file): + self.sp_model.Load(sentencepiece_model_file) + + self.SP_CHAR_MAPPING = {} + + for ch in range(65281, 65375): + if ch in [ord("~")]: + self.SP_CHAR_MAPPING[chr(ch)] = chr(ch) + continue + self.SP_CHAR_MAPPING[chr(ch)] = chr(ch - 65248) + + def get_offset_mapping(self, text): + if text is None: + return None + + split_tokens = self.tokenize(text) + normalized_text, char_mapping = "", [] + + for i, ch in enumerate(text): + + if ch in self.SP_CHAR_MAPPING: + ch = self.SP_CHAR_MAPPING.get(ch) + else: + ch = unicodedata.normalize("NFKC", ch) + if self.is_whitespace(ch): + continue + normalized_text += ch + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + + if self.do_lower_case: + text = text.lower() + + for token in split_tokens: + if token[:1] == "▁": + token = token[1:] + start = text[offset:].index(token) + offset + end = start + len(token) + + token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1)) + offset = end + return token_mapping + + @property + def vocab_size(self): + r""" + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return self.sp_model.vocab_size() + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) + + def clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + return "".join((self.SP_CHAR_MAPPING.get(c, c) for c in text)) + + def _tokenize(self, text, sample=False): + """Tokenize a string.""" + if not sample: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + new_pieces = [] + for piece in pieces: + if piece == SPIECE_UNDERLINE: + continue + lst_i = 0 + for i, c in enumerate(piece): + if c == SPIECE_UNDERLINE: + continue + if self.is_ch_char(c) or self.is_punct(c): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + new_pieces.append(c) + lst_i = i + 1 + elif c.isdigit() and i > 0 and not piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + elif not c.isdigit() and i > 0 and piece[i - 1].isdigit(): + if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE: + new_pieces.append(piece[lst_i:i]) + lst_i = i + if len(piece) > lst_i: + new_pieces.append(piece[lst_i:]) + return new_pieces + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. 
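+ 
+ Example (a minimal sketch; the input text is illustrative and the ids are
+ simply whatever the tokenizer produces for it):
+ 
+ .. code-block::
+ 
+ tokenizer = ErnieMTokenizer.from_pretrained('ernie-m-base')
+ ids = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")["input_ids"]
+ tokenizer.convert_ids_to_string(ids)
+ # re-joins the sentencepiece pieces into text (special tokens included)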
+ """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + An ERNIE-M sequence has the following format: + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] [SEP] B [SEP]`` + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + An ERNIE-M offset_mapping has the following format: + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. + Defaults to `None`. + Returns: + List[tuple]: List of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + r""" + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): + List of ids of the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + already_has_special_tokens (str, optional): + Whether or not the token list is already formatted with special tokens for the model. + Defaults to `False`. + Returns: + List[int]: + The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create the token type IDs corresponding to the sequences passed. [What are token type + IDs?](../glossary#token-type-ids) + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + token_ids_0 (`List[int]`): The first tokenized sequence. 
+ token_ids_1 (`List[int]`, *optional*): The second tokenized sequence. + + Returns: + `List[int]`: The token type ids. + """ + # called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method + if token_ids_1 is None: + # [CLS] X [SEP] + return (len(token_ids_0) + 2) * [0] + + # [CLS] A [SEP] [SEP] B [SEP] + return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3) + + def is_ch_char(self, char): + """ + is_ch_char + """ + if "\u4e00" <= char <= "\u9fff": + return True + return False + + def is_alpha(self, char): + """ + is_alpha + """ + if "a" <= char <= "z": + return True + if "A" <= char <= "Z": + return True + return False + + def is_punct(self, char): + """ + is_punct + """ + if char in ",;:.?!~,;:。?!《》【】": + return True + return False + + def is_whitespace(self, char): + """ + is whitespace + """ + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + if len(char) == 1: + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/configuration.py new file mode 100644 index 000000000..41e9a43b5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/configuration.py @@ -0,0 +1,345 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ErnieViL model configuration""" + +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "ErnieViLTextConfig", + "ErnieViLVisionConfig", + "ErnieViLConfig", +] + + +class ErnieViLTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieViLTextModel`]. 
It is used to + instantiate a ERNIE model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ERNIE + ernie-3.0-medium-zh architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 40000): + Vocabulary size of the ERNIE model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ErnieModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 0): + The vocabulary size of the `token_type_ids` passed when calling [`ErnieModel`]. + task_type_vocab_size (`int`, *optional*, defaults to 3): + The vocabulary size of the `task_ids`. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + task_id (`int`, *optional*, defaults to 0): + Task id. + use_task_id (`bool`, *optional*, defaults to `False`): + Whether or not use task_id. + pad_token_id (`int`, *optional*, defaults to 0): + The index of padding token in the token vocabulary. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import ErnieViLTextConfig, ErnieViLTextModel + + >>> configuration = ErnieViLTextConfig() + + >>> model = ErnieViLTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "ernie_vil_text_model" + + def __init__( + self, + vocab_size: int = 40000, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 2048, + task_type_vocab_size: int = 3, + type_vocab_size: int = 0, + initializer_range: float = 0.02, + pad_token_id: int = 0, + layer_norm_eps=1e-5, + task_id: int = 0, + use_task_id: bool = False, + fuse: bool = False, + use_cache: bool = False, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.task_id = task_id + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.task_type_vocab_size = task_type_vocab_size + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.fuse = fuse + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.use_task_id = use_task_id + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the text config dict if we are loading from ErnieViLConfig + if config_dict.get("model_type") == "ernie_vil": + config_dict = config_dict["text_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + return cls.from_dict(config_dict, **kwargs) + + +class ErnieViLVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ErnieViLVisionModel`]. It is used to instantiate an ErnieViL + model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. 
+ patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, + defaults to 1e-6): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from paddlenlp.transformers import ErnieViLVisionConfig, ErnieViLVisionModel + + >>> configuration = ErnieViLVisionConfig() + + >>> model = ErnieViLVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "ernie_vil_vision_model" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=16, + hidden_act="quick_gelu", + layer_norm_eps=0.000001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> PretrainedConfig: + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from ErnieViLConfig + if config_dict.get("model_type") == "ernie_vil": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class ErnieViLConfig(PretrainedConfig): + r""" + [`ErnieViLConfig`] is the configuration class to store the configuration of a [`ErnieViLModel`]. It is used to instantiate + ErnieViL model according to the specified arguments, defining the text model and vision model configs. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ErnieViLTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`ErnieViLVisionConfig`]. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The inital value of the *logit_scale* paramter. Default is used as per the original ErnieViL implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from paddlenlp.transformers import ErnieViLConfig, ErnieViLModel + + >>> configuration = ErnieViLConfig() + + >>> model = ErnieViLModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # Initializing a ErnieViLText and ErnieViLVision configuration + >>> config_text = ErnieViLTextConfig() + >>> config_vision = ErnieViLVisionConfig() + + >>> config = ErnieViLConfig.from_text_vision_configs(config_text, config_vision) + ``` + """ + + model_type = "ernie_vil" + is_composition = True + + def __init__(self, text_config=None, vision_config=None, logit_scale_init_value=2.6592, **kwargs): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + # If `_config_dict` exist, we use them for the backward compatibility. + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + if text_config_dict is not None: + text_config = text_config_dict + if vision_config_dict is not None: + vision_config = vision_config_dict + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the ErnieViLTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the ErnieViLVisionConfig with default values.") + + self.text_config = ErnieViLTextConfig(**text_config) + self.vision_config = ErnieViLVisionConfig(**vision_config) + + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: ErnieViLTextConfig, vision_config: ErnieViLVisionConfig, **kwargs): + r""" + Instantiate a [`ErnieViLConfig`] (or a derived class) from ernie_vil text model configuration and ernie_vil vision model + configuration. + + Returns: + [`ErnieViLConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["text_config"] = self.text_config.to_dict() + output["vision_config"] = self.vision_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/feature_extraction.py new file mode 100644 index 000000000..4920f8405 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/feature_extraction.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for ErnieViL.""" + +__all__ = ["ErnieViLFeatureExtractor"] + + +import warnings + +from .image_processing import ErnieViLImageProcessor + + +class ErnieViLFeatureExtractor(ErnieViLImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class ErnieViLFeatureExtractor is deprecated and will be removed in version 5 of PaddleNLP. Please" + " use ErnieViLImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/image_processing.py new file mode 100644 index 000000000..5873eb2a5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/image_processing.py @@ -0,0 +1,328 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for ErnieViL.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + center_crop, + convert_to_rgb, + get_resize_output_image_size, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = ["ErnieViLImageProcessor"] + + +class ErnieViLImageProcessor(BaseImageProcessor): + r""" + Constructs a ErnieViL image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. 
Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Image standard deviation. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [0.485, 0.456, 0.406] + self.image_std = image_std if image_std is not None else [0.229, 0.224, 0.225] + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. 
+ data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}") + output_size = get_resize_output_image_size(image, size=size["shortest_edge"], default_to_square=False) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Center crop an image. If the image is too small to be cropped to the size given, it will be padded (so the + returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form of a dictionary with keys `height` and `width`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` parameter must contain the keys (height, width). Got {size.keys()}") + return center_crop(image, size=(size["height"], size["width"]), data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + image_mean (`float` or `List[float]`): + Image mean. + image_std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: bool = None, + crop_size: int = None, + do_rescale: bool = None, + rescale_factor: float = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + **kwargs + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. 
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. 
Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None: + raise ValueError("Size must be specified if do_resize is True.") + + if do_center_crop and crop_size is None: + raise ValueError("Crop size must be specified if do_center_crop is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_center_crop: + images = [self.center_crop(image=image, size=crop_size) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/modeling.py new file mode 100644 index 000000000..d3dd2bbcf --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/modeling.py @@ -0,0 +1,672 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from functools import partial +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.distributed as dist +import paddle.nn as nn +import paddle.nn.functional as F + +from ...utils.initializer import normal_ +from .. 
import PretrainedModel +from ..clip.modeling import CLIPVisionTransformer as ErnieViLVisionTransformer +from ..clip.modeling import clip_loss +from ..ernie.modeling import ErnieModel +from ..model_outputs import ( + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from .configuration import ErnieViLConfig, ErnieViLTextConfig, ErnieViLVisionConfig + +__all__ = [ + "ErnieViLModel", + "ErnieViLTextModel", + "ErnieViLVisionModel", + "ErnieViLPretrainedModel", +] + +ERNIE_VIL_PRETRAINED_MODEL_ARCHIVE_LIST = [ + # vit model + "PaddlePaddle/ernie_vil-2.0-base-zh", + "PaddlePaddle/disco_diffusion_ernie_vil-2.0-base-zh", +] + + +def quick_gelu(x): + return x * F.sigmoid(1.702 * x) + + +F.quick_gelu = quick_gelu + + +@dataclass +class ErnieViLOutput(ModelOutput): + """ + Args: + loss: (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image: (`paddle.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text: (`paddle.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds: (`paddle.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`ErnieModel`]. + image_embeds: (`paddle.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`ErnieViLVisionTransformer`]. + text_model_output: (:class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`): + The output of the [`ErnieModel`]. + vision_model_output: (:class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPooling`): + The output of the [`VisionTransformer`]. + """ + + loss: Optional[paddle.Tensor] = None + logits_per_image: paddle.Tensor = None + logits_per_text: paddle.Tensor = None + text_embeds: paddle.Tensor = None + image_embeds: paddle.Tensor = None + text_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class ErnieViLPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained ErnieViL models. It provides ErnieViL related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + config_class = ErnieViLConfig + base_model_prefix = "ernie_vil" + supports_gradient_checkpointing = True + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, nn.TransformerEncoder): + module.enable_recompute = value + + def gradient_checkpointing_enable(self): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". 
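+ 
+ A minimal sketch (assuming a checkpoint such as the one used in the
+ ErnieViLModel examples below):
+ 
+ .. code-block::
+ 
+ model = ErnieViLModel.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh")
+ model.gradient_checkpointing_enable()   # recompute activations during backward
+ # ... run training steps ...
+ model.gradient_checkpointing_disable()  # restore the default behaviour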
+ """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def _init_weights(self, layer): + """Initialize the weights""" + if isinstance(layer, ErnieViLVisionTransformer): + # find nn.LayerNorm + for sub_layer in layer.sublayers(): + if isinstance(sub_layer, nn.LayerNorm): + sub_layer._epsilon = layer.config.layer_norm_eps + + elif isinstance(layer, ErnieModel): + # find nn.LayerNorm + for sub_layer in layer.sublayers(): + if isinstance(sub_layer, nn.LayerNorm): + sub_layer._epsilon = layer.config.layer_norm_eps + elif isinstance(layer, (nn.Linear, nn.Embedding)): + normal_(layer.weight, mean=0.0, std=layer.config.initializer_range) + + +class ErnieViLModel(ErnieViLPretrainedModel): + r""" + The bare ErnieViL Model outputting logits_per_image and logits_per_text. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ErnieViLConfig`): + An instance of ErnieViLConfig used to construct ErnieViLModel. + """ + config_class = ErnieViLConfig + + def __init__(self, config: ErnieViLConfig): + super().__init__(config) + + if not isinstance(config.text_config, ErnieViLTextConfig): + raise ValueError( + "config.text_config is expected to be of type ErnieViLTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, ErnieViLVisionConfig): + raise ValueError( + "config.vision_config is expected to be of type ErnieViLVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.text_model = ErnieModel(text_config) + + self.vision_model = ErnieViLVisionTransformer(vision_config) + + self.temperature = self.create_parameter( + shape=(1,), + default_initializer=nn.initializer.Constant(config.logit_scale_init_value), + dtype=paddle.get_default_dtype(), + ) + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> paddle.Tensor: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`ErnieViLFeatureExtractor`]. See [`ErnieViLFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. 
+ return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + image_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`ErnieViLVisionModel`]. + + Examples: + .. code-block:: + + import requests + from PIL import Image + from paddlenlp.transformers import ErnieViLProcessor, ErnieViLModel + + model = ErnieViLModel.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + processor = ErnieViLProcessor.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(images=image, return_tensors="pd") + image_features = model.get_image_features(**inputs) + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_features = vision_outputs[1] + return image_features + + def get_text_features( + self, + input_ids, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + task_type_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`ErnieViLTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + token_type_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + Its data type should be `int64`. Defaults to `None`, which means we don't add segment embeddings. + task_type_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of tasks of each input sequence tokens in the task embeddings (ErnieModel). Selected in + the range ``[0, task_type_vocab_size - 1]``. Defaults to `None`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. 
+ output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPoolingAndCrossAttentions`] instead of a plain tuple. + + Returns: + text_features (`paddle.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + the pooled output of [`ErnieModel`]. + + Example: + .. code-block:: + + from paddlenlp.transformers import ErnieViLModel, ErnieViLTokenizer + + model = ErnieViLModel.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + tokenizer = ErnieViLTokenizer.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + + inputs = tokenizer(["一只猫的照片", "一条狗的照片"], padding=True, return_tensors="pd") + text_features = model.get_text_features(**inputs) + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + task_type_ids=task_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + text_features = text_outputs[1] + return text_features + + def forward( + self, + input_ids, + pixel_values, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + task_type_ids: Optional[paddle.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, ErnieViLOutput]: + r""" + The ErnieViLModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. + Its data type should be `int64` and it has a shape of [text_batch_size, sequence_length]. + pixel_values (Tensor): + Pixel values. Padding will be ignored by default should you provide it. + Its data type should be `float32` and it has a shape of [image_batch_size, num_channels, height, width]. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings (ErnieModel). Selected in + the range ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + task_type_ids (Tensor, optional): + Indices of tasks of each input sequence tokens in the task embeddings (ErnieModel). Selected in + the range ``[0, task_type_vocab_size - 1]``. + Shape as `(batch_size, sequence_length)` and dtype as int64. 
Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention (ErnieModel) to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`ErnieViLOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`ErnieViLOutput` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`ErnieViLOutput`. + + Example: + .. code-block:: + + import requests + import paddle.nn.functional as F + from PIL import Image + from paddlenlp.transformers import ErnieViLModel, ErnieViLProcessor + + processor = ErnieViLProcessor.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + model = ErnieViLModel.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + model.eval() + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + inputs = processor(text=["一只猫的照片", "一条狗的照片"], + images=image, + padding=True, + return_tensors="pd") + + outputs = model(**inputs) + + logits_per_image = outputs[0] + probs = F.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + task_type_ids=task_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[1] + text_embeds = text_outputs[1] + # normalized features + image_embeds = F.normalize(image_embeds) + text_embeds = F.normalize(text_embeds) + if paddle.distributed.is_initialized() and dist.get_world_size() > 1: + world_size = dist.get_world_size() + rank = dist.get_rank() + gathered_image_features = [paddle.zeros_like(image_embeds) for _ in range(world_size)] + gathered_text_features = [paddle.zeros_like(text_embeds) for _ in range(world_size)] + dist.all_gather(gathered_image_features, image_embeds) + dist.all_gather(gathered_text_features, 
text_embeds) + # Add current text_embeds image_embeds into the batch for gradient update + image_embeds = paddle.concat( + [image_embeds] + gathered_image_features[:rank] + gathered_image_features[rank + 1 :] + ) + text_embeds = paddle.concat( + [text_embeds] + gathered_text_features[:rank] + gathered_text_features[rank + 1 :] + ) + # cosine similarity as logits + logit_scale = self.temperature.exp() + + logits_per_text = paddle.matmul(text_embeds * logit_scale, image_embeds, transpose_y=True) + logits_per_image = logits_per_text.t() + + # clip temperature + self.temperature.clip(-100.0, 100.0) + + loss = None + + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return ErnieViLOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class ErnieViLTextModel(ErnieViLPretrainedModel): + r""" + The text model from ErnieViL without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ErnieViLTextConfig`): + An instance of ErnieViLTextConfig used to construct ErnieViLTextModel. + """ + + config_class = ErnieViLTextConfig + + def __init__(self, config: ErnieViLTextConfig): + super().__init__(config) + self.text_model = ErnieModel(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.text_model.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.text_model.embeddings.word_embeddings = value + + def forward( + self, + input_ids, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + task_type_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`ErnieViLTokenizer`]. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + position_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + token_type_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. 
+ Indices can either be 0 or 1: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + Its data type should be `int64`. Defaults to `None`, which means we don't add segment embeddings. + task_type_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of tasks of each input sequence tokens in the task embeddings (ErnieModel). Selected in + the range ``[0, task_type_vocab_size - 1]``. Defaults to `None`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPoolingAndCrossAttentions`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPoolingAndCrossAttentions` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPoolingAndCrossAttentions`. + + Examples: + + ```python + >>> from paddlenlp.transformers import ErnieViLTokenizer, ErnieViLTextModel + + >>> model = ErnieViLTextModel.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + >>> tokenizer = ErnieViLTokenizer.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + + >>> inputs = tokenizer(["一只猫的照片", "一条狗的照片"], padding=True, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + task_type_ids=task_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class ErnieViLVisionModel(ErnieViLPretrainedModel): + r""" + The vision model from ErnieViL without any head or projection on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`ErnieViLVisionConfig`): + An instance of ErnieViLVisionConfig used to construct ErnieViLVisionModel. + """ + + config_class = ErnieViLVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: ErnieViLVisionConfig): + super().__init__(config) + + self.vision_model = ErnieViLVisionTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.conv1 + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Args: + pixel_values (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. 
Pixel values can be obtained using + [`ErnieViLFeatureExtractor`]. See [`ErnieViLFeatureExtractor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`BaseModelOutputWithPooling`] instead of a plain tuple. + + Returns: + An instance of :class:`BaseModelOutputWithPooling` if `return_dict=True`. Otherwise it returns a tuple of tensors + corresponding to ordered and not None (depending on the input arguments) fields of :class:`BaseModelOutputWithPooling`. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import ErnieViLProcessor, ErnieViLVisionModel + + >>> model = ErnieViLVisionModel.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + >>> processor = ErnieViLProcessor.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pd") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/processing.py new file mode 100644 index 000000000..e89ab381f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/processing.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for ErnieViL +""" +import warnings + +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding + +__all__ = ["ErnieViLProcessor"] + + +class ErnieViLProcessor(ProcessorMixin): + r""" + Constructs a ErnieViL processor which wraps a ErnieViL image processor and a ErnieViL tokenizer into a single processor. + + [`ErnieViLProcessor`] offers all the functionalities of [`ErnieViLProcessor`] and [`ErnieViLTokenizer`]. See the + [`~ErnieViLProcessor.__call__`] and [`~ErnieViLProcessor.decode`] for more information. + + Args: + image_processor ([`ErnieViLImageProcessor`]): + The image processor is a required input. 
+        tokenizer ([`ErnieViLTokenizer`]):
+            The tokenizer is a required input.
+    """
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "ErnieViLImageProcessor"
+    tokenizer_class = "ErnieViLTokenizer"
+
+    pretrained_init_configuration = {
+        "PaddlePaddle/ernie_vil-2.0-base-zh": {"do_lower_case": True},
+    }
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        """
+        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
+        and `kwargs` arguments to ErnieViLTokenizer's [`~ErnieViLTokenizer.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        ErnieViLImageProcessor's [`~ErnieViLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `paddle.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[paddle.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or Paddle
+                tensor. In case of a NumPy array/Paddle tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'pd'`: Return Paddle `paddle.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+
+        if text is None and images is None:
+            raise ValueError("You have to specify either text or images.
Both cannot be none.") + + if text is not None: + encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + + if images is not None: + image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to ErnieViLTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to ErnieViLTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + @property + def feature_extractor_class(self): + warnings.warn( + "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.", + FutureWarning, + ) + return self.image_processor_class + + @property + def feature_extractor(self): + warnings.warn( + "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.", + FutureWarning, + ) + return self.image_processor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/tokenizer.py new file mode 100644 index 000000000..c875eae42 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ernie_vil/tokenizer.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
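# A minimal sketch of the ErnieViLProcessor.__call__ branching documented above: text-only
# calls yield token ids, image-only calls yield pixel values, and joint calls merge both.
# The checkpoint name and image URL are taken from the surrounding docstrings; this is an
# illustration only, not part of the patched file.
import requests
from PIL import Image

from paddlenlp.transformers import ErnieViLProcessor

processor = ErnieViLProcessor.from_pretrained("PaddlePaddle/ernie_vil-2.0-base-zh")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

text_only = processor(text=["一只猫的照片"], return_tensors="pd")           # BatchEncoding with input_ids
image_only = processor(images=image, return_tensors="pd")                   # BatchEncoding with pixel_values
joint = processor(text=["一只猫的照片"], images=image, return_tensors="pd")  # input_ids plus pixel_values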
+ +from ..ernie.tokenizer import ErnieTokenizer + +__all__ = ["ErnieViLTokenizer"] + + +class ErnieViLTokenizer(ErnieTokenizer): + pretrained_resource_files_map = { + "vocab_file": { + "ernie_vil-2.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_vil/ernie_vil-2.0-base-zh/vocab.txt", + "disco_diffusion_ernie_vil-2.0-base-zh": "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_vil/disco_diffusion_ernie_vil-2.0-base-zh/vocab.txt", + } + } + pretrained_init_configuration = { + "ernie_vil-2.0-base-zh": {"do_lower_case": True}, + "disco_diffusion_ernie_vil-2.0-base-zh": {"do_lower_case": True}, + } + max_model_input_sizes = {"ernie_vil-2.0-base-zh": 64, "disco_diffusion_ernie_vil-2.0-base-zh": 64} + + model_input_names = [ + "input_ids", + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/export.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/export.py new file mode 100644 index 000000000..46c957ab9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/export.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import List, Optional, Tuple + +import paddle + +from ..utils.log import logger +from .model_utils import PretrainedModel, unwrap_model + +__all__ = ["export_model"] + + +def export_model( + model: "PretrainedModel", input_spec=None, path: Optional[str] = None, model_format: Optional[str] = "paddle" +) -> Tuple[List[str], List[str]]: + """ + Export paddle inference model or onnx model. + + Args: + model ([`PretrainedModel`]: + The model to export. + input_spec (paddle.static.InputSpec, optional): + Describes the input of the saved model’s forward method, which can be described + by InputSpec or example Tensor. Default None. + path (Optional[str], optional): + Output dir to save the exported model. Defaults to None. + model_format (Optional[str], optional): + Export model format. There are two options: paddle or onnx, defaults to paddle. + + """ + if path is None: + path = "./" + logger.info("Export path is missing, set default path to current dir.") + + if issubclass(type(model), PretrainedModel): + model = unwrap_model(model) + model.eval() + + model_format = model_format.lower() + file_prefix = "model" + if model_format == "paddle": + # Convert to static graph with specific input description + model = paddle.jit.to_static(model, input_spec=input_spec) + # Save in static graph model. + save_path = os.path.join(path, file_prefix) + logger.info("Exporting inference model to %s" % save_path) + paddle.jit.save(model, save_path) + logger.info("Inference model exported.") + elif model_format == "onnx": + # Export ONNX model. 
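        # Usage illustration for the ONNX branch described in the docstring above (not part of
        # this function; the InputSpec shape and names are assumed examples):
        #     input_spec = [paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")]
        #     export_model(model, input_spec=input_spec, path="./exported", model_format="onnx")
        # paddle.onnx.export relies on the paddle2onnx package being available.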
+ save_path = os.path.join(path, file_prefix) + logger.info("Exporting ONNX model to %s" % save_path) + paddle.onnx.export(model, save_path, input_spec=input_spec) + logger.info("ONNX model exported.") + else: + logger.info("This export format is not supported, please select paddle or onnx!") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_sequence_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_sequence_utils.py new file mode 100644 index 000000000..24684a7b8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_sequence_utils.py @@ -0,0 +1,366 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" + Sequence feature extraction class for common feature extractors to preprocess sequences. +""" +from typing import Dict, List, Optional, Union + +import numpy as np +import paddle + +from paddlenlp.transformers.tokenizer_utils_base import PaddingStrategy + +from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin + + +class SequenceFeatureExtractor(FeatureExtractionMixin): + """ + This is a general feature extraction class for speech recognition. + + Args: + feature_size (`int`): + The feature dimension of the extracted features. + sampling_rate (`int`): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + padding_value (`float`): + The value that is used to fill the padding values / vectors. + """ + + def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs): + self.feature_size = feature_size + self.sampling_rate = sampling_rate + self.padding_value = padding_value + + self.padding_side = kwargs.pop("padding_side", "right") + self.return_attention_mask = kwargs.pop("return_attention_mask", True) + + super().__init__(**kwargs) + + def pad( + self, + processed_features: Union[ + BatchFeature, + List[BatchFeature], + Dict[str, BatchFeature], + Dict[str, List[BatchFeature]], + List[Dict[str, BatchFeature]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + truncation: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[str] = None, + ) -> BatchFeature: + """ + Pad input values / input vectors or a batch of input values / input vectors up to predefined length or to the + max sequence length in the batch. + + Padding side (left/right) padding values are defined at the feature extractor level (with `self.padding_side`, + `self.padding_value`) + + + + If the `processed_features` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of + PyTorch tensors, you will lose the specific device of your tensors however. 
+ + + + Args: + processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]` or `List[Dict[str, List[float]]]`): + Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of + input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str, + List[float]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader + collate function. + + Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), + see the note above for the return type. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. + + [What are attention masks?](../glossary#attention-mask) + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + - `'pd'`: Return PaddlePaddle `paddle.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. 
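# A minimal sketch of the batch padding described above, using a hypothetical toy subclass.
# `ToyFeatureExtractor` and its values are assumptions for illustration; the import path
# matches the module added by this patch.
import numpy as np

from paddlenlp.transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor


class ToyFeatureExtractor(SequenceFeatureExtractor):
    model_input_names = ["input_values"]


extractor = ToyFeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0)
batch = {"input_values": [np.arange(5, dtype="float32"), np.arange(3, dtype="float32")]}
padded = extractor.pad(batch, padding="longest", return_tensors="np")
# padded["input_values"] has shape (2, 5); the shorter sequence is right-padded with 0.0 and
# padded["attention_mask"] marks real positions with 1 and padded positions with 0.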
+        """
+        # If we have a list of dicts, let's convert it into a dict of lists
+        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
+        if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
+            processed_features = {
+                key: [example[key] for example in processed_features] for key in processed_features[0].keys()
+            }
+
+        # The model's main input name, usually `input_values`, has to be passed for padding
+        if self.model_input_names[0] not in processed_features:
+            raise ValueError(
+                "You should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature`"
+                f" to this method that includes {self.model_input_names[0]}, but you provided"
+                f" {list(processed_features.keys())}"
+            )
+
+        required_input = processed_features[self.model_input_names[0]]
+        return_attention_mask = (
+            return_attention_mask if return_attention_mask is not None else self.return_attention_mask
+        )
+
+        if len(required_input) == 0:
+            if return_attention_mask:
+                processed_features["attention_mask"] = []
+            return processed_features
+
+        # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
+        # and rebuild them afterwards if no return_tensors is specified
+        # Note that we lose the specific device the tensor may be on for PyTorch
+
+        first_element = required_input[0]
+        if isinstance(first_element, (list, tuple)):
+            # first_element might be an empty list/tuple in some edge cases so we grab the first non-empty element.
+            index = 0
+            while len(required_input[index]) == 0:
+                index += 1
+            if index < len(required_input):
+                first_element = required_input[index][0]
+
+        if return_tensors is None:
+            if isinstance(first_element, paddle.Tensor):
+                return_tensors = "pd"
+            elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
+                return_tensors = "np"
+            else:
+                raise ValueError(
+                    f"type of {first_element} unknown: {type(first_element)}. "
+                    "Should be a Python, NumPy, PyTorch or TensorFlow object."
+ ) + + for key, value in processed_features.items(): + if isinstance(value[0], (int, float)): + processed_features[key] = np.array(value) + else: + processed_features[key] = [np.array(v) for v in value] + + # Convert padding_strategy in PaddingStrategy + padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length) + + required_input = processed_features[self.model_input_names[0]] + + batch_size = len(required_input) + if not all(len(v) == batch_size for v in processed_features.values()): + raise ValueError("Some items in the output dictionary have a different batch size than others.") + + truncated_inputs = [] + for i in range(batch_size): + inputs = {k: v[i] for k, v in processed_features.items()} + # truncation + inputs_slice = self._truncate( + inputs, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + truncation=truncation, + ) + truncated_inputs.append(inputs_slice) + + if padding_strategy == PaddingStrategy.LONGEST: + # make sure that `max_length` cannot be longer than the longest truncated length + max_length = max(len(input_slice[self.model_input_names[0]]) for input_slice in truncated_inputs) + padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + # padding + outputs = self._pad( + truncated_inputs[i], + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + if value.dtype is np.dtype(np.float64): + value = value.astype(np.float32) + batch_outputs[key].append(value) + + return BatchFeature(batch_outputs, tensor_type=return_tensors) + + def _pad( + self, + processed_features: Union[Dict[str, np.ndarray], BatchFeature], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad inputs (on left/right and up to predefined length or max length in the batch) + + Args: + processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`): + Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch + of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see below) + padding_strategy (`PaddingStrategy`, *optional*, default to `PaddingStrategy.DO_NOT_PAD`): + PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The feature_extractor padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of (`int`, *optional*): + Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to + enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs + which benefit from having sequence lengths be a multiple of 128. 
+ return_attention_mask (`bool`, *optional*): + Set to False to avoid returning attention mask (default: set to model specifics) + """ + required_input = processed_features[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) < max_length + + if return_attention_mask and "attention_mask" not in processed_features: + processed_features["attention_mask"] = np.ones(len(required_input), dtype=np.int32) + + if needs_to_be_padded: + difference = max_length - len(required_input) + if self.padding_side == "right": + if return_attention_mask: + processed_features["attention_mask"] = np.pad( + processed_features["attention_mask"], (0, difference) + ) + padding_shape = ((0, difference), (0, 0)) if self.feature_size > 1 else (0, difference) + processed_features[self.model_input_names[0]] = np.pad( + required_input, padding_shape, "constant", constant_values=self.padding_value + ) + elif self.padding_side == "left": + if return_attention_mask: + processed_features["attention_mask"] = np.pad( + processed_features["attention_mask"], (difference, 0) + ) + padding_shape = ((difference, 0), (0, 0)) if self.feature_size > 1 else (difference, 0) + processed_features[self.model_input_names[0]] = np.pad( + required_input, padding_shape, "constant", constant_values=self.padding_value + ) + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + return processed_features + + def _truncate( + self, + processed_features: Union[Dict[str, np.ndarray], BatchFeature], + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + truncation: Optional[bool] = None, + ): + """ + Truncate inputs to predefined length or max length in the batch + + Args: + processed_features(`Union[Dict[str, np.ndarray], BatchFeature]`): + Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch + of inputs values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`) + max_length (`int`, *optional*): + maximum length of the returned list and optionally padding length (see below) + pad_to_multiple_of (`int`, *optional*) : + Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to + enable the use of Tensor Core on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs + which benefit from having sequence lengths be a multiple of 128. + truncation (`bool`, *optional*): + Activates truncation to cut input sequences longer than `max_length` to `max_length`. 
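# Worked example of the `pad_to_multiple_of` rounding shared by `_pad` and `_truncate`
# (numbers are illustrative): a max_length that is not already a multiple is rounded up.
max_length, pad_to_multiple_of = 50, 8
if max_length % pad_to_multiple_of != 0:
    max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
assert max_length == 56  # 50 is rounded up to the next multiple of 8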
+ """ + if not truncation: + return processed_features + elif truncation and max_length is None: + raise ValueError("When setting ``truncation=True``, make sure that ``max_length`` is defined.") + + required_input = processed_features[self.model_input_names[0]] + + # find `max_length` that fits `pad_to_multiple_of` + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_truncated = len(required_input) > max_length + + if needs_to_be_truncated: + processed_features[self.model_input_names[0]] = processed_features[self.model_input_names[0]][:max_length] + if "attention_mask" in processed_features: + processed_features["attention_mask"] = processed_features["attention_mask"][:max_length] + + return processed_features + + def _get_padding_strategies(self, padding=False, max_length=None): + """ + Find the correct padding strategy + """ + + # Get padding strategy + if padding is not False: + if padding is True: + padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + raise ValueError( + f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined" + ) + + # Test if we have a padding value + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None): + raise ValueError( + "Asking to pad but the feature_extractor does not have a padding value. Please select a value to use" + " as `padding_value`. For example: `feature_extractor.padding_value = 0.0`." + ) + + return padding_strategy diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_utils.py new file mode 100644 index 000000000..9030586a1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/feature_extraction_utils.py @@ -0,0 +1,378 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +from collections import UserDict +from typing import Any, Dict, Optional, Tuple, Union + +import numpy as np +import paddle + +from paddlenlp.utils.download import resolve_file_path + +from ..utils.log import logger +from .tokenizer_utils_base import TensorType + +FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" + + +class BatchFeature(UserDict): + r""" + Holds the feature extractor specific `__call__` methods. 
+ This class is derived from a python dictionary and can be used as a dictionary. + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask', + etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in Paddle/Numpy Tensors at + initialization. + """ + + def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None): + super().__init__(data) + self.convert_to_tensors(tensor_type=tensor_type) + + def __getitem__(self, item: str): + """ + If the key is a string, returns the value of the dict associated to `key` ('input_values', 'attention_mask', + etc.). + """ + if isinstance(item, str): + return self.data[item] + else: + raise KeyError("Indexing with integers is not available when using Python based feature extractors") + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None): + """ + Convert the inner content to tensors. + Args: + tensor_type (`str` or [`TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`TensorType`]. If + `None`, no modification is done. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.PADDLE: + as_tensor = paddle.to_tensor + is_tensor = paddle.is_tensor + else: + as_tensor = np.asarray + + def is_tensor(x): + return isinstance(x, np.ndarray) + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_tokens": + raise ValueError( + "Unable to create tensor returning overflowing tokens of different lengths. " + "Please see if a fast version of this tokenizer is available to have this feature available." + ) + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding " + "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + ) + + return self + + +class FeatureExtractionMixin(object): + """ + This is a feature extraction mixin used to provide saving/loading functionality for sequential and image feature + extractors. 
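# A minimal sketch of the dict-like access and tensor conversion implemented by BatchFeature
# above (illustrative values; assumes paddlenlp is importable):
import numpy as np

from paddlenlp.transformers.feature_extraction_utils import BatchFeature

features = BatchFeature({"input_values": [[0.1, 0.2], [0.3, 0.4]]}, tensor_type="np")
assert isinstance(features["input_values"], np.ndarray)   # string keys index the wrapped dict
assert features.input_values is features["input_values"]  # attribute access returns the same data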
+ """ + + pretrained_init_configuration = {} + + pretrained_feature_extractor_file = [] + _auto_class = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): + r""" + Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature extractor, *e.g.* a + derived class of [`SequenceFeatureExtractor`]. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the name of a community-contributed pretrained or built-in pretrained model. + - a path to a *directory* containing a feature extractor file saved using the + [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final feature extractor object. If `True`, then this + functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of + `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are feature extractor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + Returns: + A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]. + + Examples: + + ```python + # We can't instantiate directly the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a + # derived class: *CLIPFeatureExtractor* + feature_extractor = CLIPFeatureExtractor.from_pretrained( + "openai/clip-vit-base-patch32" + ) # Download feature_extraction_config from bos and cache. + feature_extractor = CLIPFeatureExtractor.from_pretrained( + "./test/saved_model/" + ) # E.g. 
feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')* + feature_extractor = CLIPFeatureExtractor.from_pretrained("./test/saved_model/preprocessor_config.json") + feature_extractor, unused_kwargs = CLIPFeatureExtractor.from_pretrained( + "openai/clip-vit-base-patch32", foo=False, return_unused_kwargs=True + ) + assert unused_kwargs == {"foo": False} + ``` + """ + feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(feature_extractor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the feature extractor JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_pretrained` + output_feature_extractor_file = os.path.join(save_directory, FEATURE_EXTRACTOR_NAME) + + self.to_json_file(output_feature_extractor_file) + logger.info(f"Feature extractor saved in {output_feature_extractor_file}") + + return [output_feature_extractor_file] + + @classmethod + def get_feature_extractor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + resolved_feature_extractor_file = resolve_file_path( + pretrained_model_name_or_path, + [FEATURE_EXTRACTOR_NAME], + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + assert ( + resolved_feature_extractor_file is not None + ), f"please make sure {FEATURE_EXTRACTOR_NAME} under {pretrained_model_name_or_path}" + try: + # Load feature_extractor dict + with open(resolved_feature_extractor_file, "r", encoding="utf-8") as reader: + text = reader.read() + feature_extractor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_feature_extractor_file}' is not a valid JSON file." + ) + + return feature_extractor_dict, kwargs + + @classmethod + def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python dictionary of + parameters. 
+ + Args: + feature_extractor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the feature extractor object. + + Returns: + [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature extractor object instantiated from those + parameters. + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + feature_extractor = cls(**feature_extractor_dict) + + # Update feature_extractor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(feature_extractor, key): + setattr(feature_extractor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + if return_unused_kwargs: + return feature_extractor, kwargs + else: + return feature_extractor + + def to_dict(self, *args, **kwargs) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance. + """ + output = copy.deepcopy(self.__dict__) + output["feature_extractor_type"] = self.__class__.__name__ + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]): + """ + Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] from the path to + a JSON file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature_extractor + object instantiated from that JSON file. + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + feature_extractor_dict = json.loads(text) + return cls(**feature_extractor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. + """ + dictionary = self.to_dict() + + for key, value in dictionary.items(): + if isinstance(value, np.ndarray): + dictionary[key] = value.tolist() + + # make sure private name "_processor_class" is correctly + # saved as "processor_class" + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this feature_extractor instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/configuration.py new file mode 100644 index 000000000..32a159367 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/configuration.py @@ -0,0 +1,142 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" fnet model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "FNET_PRETRAINED_INIT_CONFIGURATION", + "FNET_PRETRAINED_RESOURCE_FILES_MAP", + "FNetConfig", +] + +FNET_PRETRAINED_INIT_CONFIGURATION = { + "fnet-base": { + "vocab_size": 32000, + "hidden_size": 768, + "num_hidden_layers": 12, + "intermediate_size": 3072, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 4, + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "pad_token_id": 3, + "bos_token_id": 1, + "eos_token_id": 2, + }, + "fnet-large": { + "vocab_size": 32000, + "hidden_size": 1024, + "num_hidden_layers": 24, + "intermediate_size": 4096, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 4, + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "pad_token_id": 3, + "bos_token_id": 1, + "eos_token_id": 2, + }, +} +FNET_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "fnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/fnet/fnet-base/model_state.pdparams", + "fnet-large": "https://bj.bcebos.com/paddlenlp/models/transformers/fnet/fnet-large/model_state.pdparams", + } +} + + +class FNetConfig(PretrainedConfig): + r""" + Args: + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in `FNetModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `FNetModel`. + Defaults to `32000`. + hidden_size (int, optional): + Dimensionality of the encoder layer and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. 
Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `glue_new`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. Defaults to `4`. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to `0.02`. + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`BertPretrainedModel.init_weights()` for how weights are initialized in `ElectraModel`. + layer_norm_eps(float, optional): + The `epsilon` parameter used in :class:`paddle.nn.LayerNorm` for initializing layer normalization layers. + A small value to the variance added to the normalization layer to prevent division by zero. + Defaults to `1e-12`. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. Defaults to `3`. + add_pooling_layer(bool, optional): + Whether or not to add the pooling layer. Defaults to `True`. + """ + + model_type = "fnet" + + def __init__( + self, + vocab_size=32000, + hidden_size=768, + num_hidden_layers=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=4, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=3, + bos_token_id=1, + eos_token_id=2, + add_pooling_layer=True, + **kwargs, + ): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.add_pooling_layer = add_pooling_layer diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/modeling.py new file mode 100644 index 000000000..8a159412c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/modeling.py @@ -0,0 +1,936 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Modeling classes for FNet model.""" + +import paddle +import paddle.nn as nn +from paddle.nn import Layer + +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from .configuration import ( + FNET_PRETRAINED_INIT_CONFIGURATION, + FNET_PRETRAINED_RESOURCE_FILES_MAP, + FNetConfig, +) + +__all__ = [ + "FNetPretrainedModel", + "FNetModel", + "FNetForSequenceClassification", + "FNetForPreTraining", + "FNetForMaskedLM", + "FNetForNextSentencePrediction", + "FNetForMultipleChoice", + "FNetForTokenClassification", + "FNetForQuestionAnswering", +] + + +class FNetBasicOutput(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.layer_norm(input_tensor + hidden_states) + return hidden_states + + +class FNetOutput(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.layer_norm(input_tensor + hidden_states) + return hidden_states + + +class FNetIntermediate(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class FNetLayer(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.fourier = FNetFourierTransform(config) + self.intermediate = FNetIntermediate(config) + self.output = FNetOutput(config) + + def forward(self, hidden_states): + self_fourier_outputs = self.fourier(hidden_states) + fourier_output = self_fourier_outputs[0] + intermediate_output = self.intermediate(fourier_output) + layer_output = self.output(intermediate_output, fourier_output) + + return (layer_output,) + + +class FNetEncoder(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.layers = nn.LayerList([FNetLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward(self, hidden_states, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + for i, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + layer_outputs = layer_module(hidden_states) + hidden_states = layer_outputs[0] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + if return_dict: + return {"last_hidden_state": hidden_states, "all_hidden_states": all_hidden_states} + return (hidden_states,) + + +class FNetPooler(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, 
hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class FNetEmbeddings(Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: FNetConfig): + super(FNetEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + # NOTE: This is the project layer and will be needed. The original code allows for different embedding and different model dimensions. + self.projection = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + ): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = inputs_embeds + token_type_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.projection(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class FNetBasicFourierTransform(Layer): + def __init__(self): + super().__init__() + self.fourier_transform = paddle.fft.fftn + + def forward(self, hidden_states): + outputs = self.fourier_transform(hidden_states).real() + return (outputs,) + + +class FNetFourierTransform(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.fourier_transform = FNetBasicFourierTransform() + self.output = FNetBasicOutput(config) + + def forward(self, hidden_states): + self_outputs = self.fourier_transform(hidden_states) + fourier_output = self.output(self_outputs[0], hidden_states) + return (fourier_output,) + + +class FNetPredictionHeadTransform(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class FNetLMPredictionHead(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + 
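The Fourier sub-layer defined above replaces self-attention with a parameter-free transform. Below is a shape-only sketch of that token-mixing step, written following the FNet paper's formulation of an FFT along the sequence and hidden axes; the batch size and dimensions are made up, and the explicit `axes` argument is an assumption for clarity rather than a copy of the layer above.

import paddle

x = paddle.randn([2, 8, 16])                       # [batch, seq_len, hidden]
mixed = paddle.fft.fftn(x, axes=(1, 2)).real()     # 2D FFT over (seq, hidden), keep the real part
print(mixed.shape)                                 # [2, 8, 16] -- same shape, tokens now mixed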
self.transform = FNetPredictionHeadTransform(config) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.vocab_size, config.hidden_size) + + self.bias = self.create_parameter( + [config.vocab_size], is_bias=True, default_initializer=nn.initializer.Constant(value=0) + ) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = paddle.matmul(hidden_states, self.decoder.weight, transpose_y=True) + self.bias + return hidden_states + + +class FNetOnlyMLMHead(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.predictions = FNetLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class FNetOnlyNSPHead(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class FNetPreTrainingHeads(Layer): + def __init__(self, config: FNetConfig): + super().__init__() + self.predictions = FNetLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class FNetPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained FNet models. It provides FNet related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. + """ + + pretrained_init_configuration = FNET_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = FNET_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "fnet" + config_class = FNetConfig + + def _init_weights(self, layer): + # Initialize the weights. + if isinstance(layer, nn.Linear): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, nn.Embedding): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer._padding_idx is not None: + layer.weight[layer._padding_idx].set_value(paddle.zeros_like(layer.weight[layer._padding_idx])) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.ones_like(layer.weight)) + + +@register_base_model +class FNetModel(FNetPretrainedModel): + """ + The model can behave as an encoder, following the architecture described in `FNet: Mixing Tokens with Fourier + Transforms `__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago + Ontanon. 
+ """ + + def __init__(self, config: FNetConfig): + super(FNetModel, self).__init__(config) + self.initializer_range = config.initializer_range + self.num_hidden_layers = config.num_hidden_layers + self.embeddings = FNetEmbeddings(config) + self.encoder = FNetEncoder(config) + self.pooler = FNetPooler(config) if config.add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The FNetModel forward method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether or not to return all hidden states. Default to `None`. + return_dict (bool, optional): + Whether or not to return a dict instead of a plain tuple. Default to `None`. + + + Returns: + tuple or Dict: Returns tuple (`sequence_output`, `pooled_output`, `encoder_outputs[1:]`) + or a dict with last_hidden_state`, `pooled_output`, `all_hidden_states`, fields. + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and has a shape of [`batch_size, sequence_length, hidden_size`]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and + has a shape of [batch_size, hidden_size]. + + - `last_hidden_state` (Tensor): + The output of the last encoder layer, it is also the `sequence_output`. + It's data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + - `all_hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `all_hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.fnet.modeling import FNetModel + from paddlenlp.transformers.fnet.tokenizer import FNetTokenizer + + tokenizer = FNetTokenizer.from_pretrained('fnet-base') + model = FNetModel.from_pretrained('fnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if token_type_ids is None: + token_type_ids = paddle.zeros(shape=input_shape, dtype="int64") + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs["last_hidden_state"] if return_dict else encoder_outputs[0] + + pooler_output = self.pooler(sequence_output) if self.pooler is not None else None + + if return_dict: + return { + "last_hidden_state": sequence_output, + "pooler_output": pooler_output, + "all_hidden_states": encoder_outputs["all_hidden_states"], + } + return (sequence_output, pooler_output) + encoder_outputs[1:] + + +class FNetForSequenceClassification(FNetPretrainedModel): + """ + FNet Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + fnet (:class:`FNetModel`): + An instance of FNetModel. + num_classes (int, optional): + The number of classes. Defaults to `2`. + + """ + + def __init__(self, config: FNetConfig, num_classes=2): + super(FNetForSequenceClassification, self).__init__(config) + self.num_classes = num_classes + self.fnet = FNetModel(config) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_classes) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The FNetForSequenceClassification forward method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. 
+ inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether or not to return all hidden states. Default to `None`. + return_dict (bool, optional): + Whether or not to return a dict instead of a plain tuple. Default to `None`. + + + Returns: + Tensor or Dict: Returns tensor `logits`, or a dict with `logits`, `hidden_states`, `attentions` fields. + + With the fields: + + - `logits` (Tensor): + A tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.fnet.modeling import FNetForSequenceClassification + from paddlenlp.transformers.fnet.tokenizer import FNetTokenizer + + tokenizer = FNetTokenizer.from_pretrained('fnet-base') + model = FNetModel.from_pretrained('fnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs["pooler_output"] if return_dict else outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if return_dict: + return { + "logits": logits, + "hidden_states": outputs["all_hidden_states"], + } + return logits + + +class FNetForPreTraining(FNetPretrainedModel): + """ + FNet Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """ + + def __init__(self, config: FNetConfig): + super().__init__(config) + + self.fnet = FNetModel(config) + self.cls = FNetPreTrainingHeads(config) + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.fnet.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The FNetForPretraining forward method. + + Args: + input_ids (Tensor): + See :class:`FNetModel`. + token_type_ids (Tensor, optional): + See :class:`FNetModel`. + position_ids(Tensor, optional): + See :class:`FNetModel`. + labels (LongTensor of shape (batch_size, sequence_length), optional): + Labels for computing the masked language modeling loss. + inputs_embeds(Tensor, optional): + See :class:`FNetModel`. + next_sentence_labels(Tensor): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and + its shape is [batch_size, 1] + output_hidden_states (bool, optional): + See :class:`FNetModel`. + return_dict (bool, optional): + See :class:`FNetModel`. 
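For the FNetForSequenceClassification head added earlier in this hunk, a hedged usage sketch. Unlike the docstring example, it instantiates the classification class itself; the import paths are assumptions about the vendored layout, and the 'fnet-base' weights are assumed to be cached or downloadable.

import paddle

# Assumptions: vendored modules importable, 'fnet-base' resources reachable.
from transformers.fnet.modeling import FNetForSequenceClassification
from transformers.fnet.tokenizer import FNetTokenizer

tokenizer = FNetTokenizer.from_pretrained("fnet-base")
model = FNetForSequenceClassification.from_pretrained("fnet-base")   # num_classes defaults to 2

inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
inputs = {k: paddle.to_tensor([v]) for k, v in inputs.items()}
logits = model(**inputs)                 # plain tensor of shape [1, 2] since return_dict is unset
pred = paddle.argmax(logits, axis=-1)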
+ + Returns: + tuple or Dict: Returns tuple (`prediction_scores`, `seq_relationship_score`) or a dict with + `prediction_logits`, `seq_relationship_logits`, `hidden_states` fields. + """ + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] if not return_dict else outputs["last_hidden_state"] + pooled_output = outputs[1] if not return_dict else outputs["pooler_output"] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + if return_dict: + return { + "prediction_logits": prediction_scores, + "seq_relationship_logits": seq_relationship_score, + "hidden_states": outputs["all_hidden_states"], + } + return prediction_scores, seq_relationship_score, outputs["all_hidden_states"] + + +class FNetForMaskedLM(FNetPretrainedModel): + """ + FNet Model with a `masked language modeling` head on top. + + Args: + fnet (:class:`FNetModel`): + An instance of :class:`FNetModel`. + + """ + + def __init__(self, config: FNetConfig): + super().__init__(config) + + self.fnet = FNetModel(config) + self.cls = FNetOnlyMLMHead(config) + self.tie_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.fnet.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The FNetForMaskedLM forward method. + + Args: + input_ids (Tensor): + See :class:`FNetModel`. + token_type_ids (Tensor, optional): + See :class:`FNetModel`. + position_ids(Tensor, optional): + See :class:`FNetModel`. + inputs_embeds(Tensor, optional): + See :class:`FNetModel`. + labels(Tensor, optional): + See :class:`FNetForPreTraining`. + next_sentence_label(Tensor, optional): + See :class:`FNetForPreTraining`. + output_hidden_states(Tensor, optional): + See :class:`FNetModel`. + return_dict(bool, optional): + See :class:`FNetModel`. + + Returns: + Tensor or Dict: Returns tensor `prediction_scores` or a dict with `prediction_logits`, `hidden_states` fields. + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + and its shape is [batch_size, sequence_length, vocab_size]. + + - `hidden_states` (Tensor): + Hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + """ + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] if not return_dict else outputs["last_hidden_state"] + prediction_scores = self.cls(sequence_output) + + if return_dict: + return {"prediction_logits": prediction_scores, "hidden_states": outputs["all_hidden_states"]} + return prediction_scores, outputs["all_hidden_states"] + + +class FNetForNextSentencePrediction(FNetPretrainedModel): + """ + FNet Model with a `next sentence prediction` head on top. 
+ + Args: + fnet (:class:`FNetModel`): + An instance of :class:`FNetModel`. + + """ + + def __init__(self, config: FNetConfig): + super().__init__(config) + + self.fnet = FNetModel(config) + self.cls = FNetOnlyNSPHead(config) + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def get_input_embeddings(self): + return self.fnet.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_hidden_states=None, + return_dict=None, + ): + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] if not return_dict else outputs["pooler_output"] + seq_relationship_score = self.cls(pooled_output) + + if return_dict: + return {"seq_relationship_logits": seq_relationship_score, "hidden_states": outputs["all_hidden_states"]} + return seq_relationship_score, outputs["all_hidden_states"] + + +class FNetForMultipleChoice(FNetPretrainedModel): + """ + FNet Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like SWAG tasks . + + Args: + fnet (:class:`FNetModel`): + An instance of FNetModel. + + """ + + def __init__(self, config: FNetConfig): + super(FNetForMultipleChoice, self).__init__(config) + self.fnet = FNetModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + return_dict=None, + ): + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + input_ids = input_ids.reshape([-1, input_ids.shape[-1]]) if input_ids is not None else None + token_type_ids = token_type_ids.reshape([-1, token_type_ids.shape[-1]]) if token_type_ids is not None else None + position_ids = position_ids.reshape([-1, position_ids.shape[-1]]) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape([-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]]) + if inputs_embeds is not None + else None + ) + + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs["pooler_output"] if return_dict else outputs[1] + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape([-1, num_choices]) + + if return_dict: + return { + "logits": reshaped_logits, + "hidden_states": outputs["all_hidden_states"], + } + return reshaped_logits + + +class FNetForTokenClassification(FNetPretrainedModel): + """ + FNet Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + fnet (:class:`FNetModel`): + An instance of FNetModel. + num_classes (int, optional): + The number of classes. Defaults to `2`. 
+ """ + + def __init__(self, config: FNetConfig, num_classes=2): + super(FNetForTokenClassification, self).__init__(config) + self.fnet = FNetModel(config) + self.num_classes = num_classes + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + labels=None, + output_hidden_states=None, + return_dict=None, + ): + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] if not return_dict else outputs["last_hidden_state"] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + if return_dict: + return { + "logits": logits, + "hidden_states": outputs["all_hidden_states"], + } + return logits + + +class FNetForQuestionAnswering(FNetPretrainedModel): + """ + FNet Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + fnet (:class:`FNetModel`): + An instance of FNetModel. + num_labels (int): + The number of labels. + + """ + + def __init__(self, config: FNetConfig, num_labels): + super(FNetForQuestionAnswering, self).__init__(config) + self.num_labels = num_labels + self.fnet = FNetModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_hidden_states=None, + return_dict=None, + ): + outputs = self.fnet( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] if not return_dict else outputs["last_hidden_state"] + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = paddle.split(logits, num_or_sections=2, axis=-1) + start_logits = start_logits.squeeze(axis=-1) + end_logits = start_logits.squeeze(axis=-1) + if return_dict: + return { + "start_logits": start_logits, + "end_logits": end_logits, + "hidden_states": outputs["all_hidden_states"], + } + return start_logits, end_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/tokenizer.py new file mode 100644 index 000000000..36456a4ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/fnet/tokenizer.py @@ -0,0 +1,208 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization class for FNet model.""" + +from typing import Any, Dict, List, Optional + +import sentencepiece as spm + +from ..albert.tokenizer import AddedToken, AlbertEnglishTokenizer + +__all__ = ["FNetTokenizer"] + +SPIECE_UNDERLINE = "▁" + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"fnet-base": 512, "fnet-large": 512} + + +class FNetTokenizer(AlbertEnglishTokenizer): + """ + Construct a FNet tokenizer. Inherit from :class:`AlbertEnglishTokenizer`. Based on `SentencePiece + `__. + + Args: + sentencepiece_model_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to lowercase the input when tokenizing. + remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to keep accents when tokenizing. + unk_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). 
+ """ + + resource_files_names = { + "sentencepiece_model_file": "spiece.model", + } + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "fnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/fnet/fnet-base/spiece.model", + "fnet-large": "https://bj.bcebos.com/paddlenlp/models/transformers/fnet/fnet-large/spiece.model", + } + } + pretrained_init_configuration = { + "fnet-base": { + "do_lower_case": False, + }, + "fnet-large": { + "do_lower_case": False, + }, + } + model_input_names = ["input_ids", "token_type_ids"] + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=False, + remove_space=True, + keep_accents=True, + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + sentencepiece_model_file=sentencepiece_model_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=cls_token, + eos_token=sep_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + sp_model_kwargs=sp_model_kwargs, + **kwargs, + ) + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_file) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An FNet sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet sequence + pair mask has the following format: :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/__init__.py new file mode 100644 index 000000000..3bd752713 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/configuration.py new file mode 100644 index 000000000..58338ca28 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/configuration.py @@ -0,0 +1,206 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
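To make the special-token layout of the FNetTokenizer helpers above concrete, here is a self-contained illustration of the sequence-pair case; it needs no spiece.model, and all token IDs are made up.

cls_id, sep_id = 101, 102          # hypothetical IDs for [CLS] and [SEP]
ids_a = [5, 6, 7]                  # hypothetical piece IDs for sentence A
ids_b = [8, 9]                     # hypothetical piece IDs for sentence B

# [CLS] A [SEP] B [SEP], as built by build_inputs_with_special_tokens
input_ids = [cls_id] + ids_a + [sep_id] + ids_b + [sep_id]
# 0s for the first segment (incl. [CLS] and its [SEP]), 1s for the second, as in
# create_token_type_ids_from_sequences
token_type_ids = [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)
assert len(input_ids) == len(token_type_ids)      # 8 == 8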
+""" funnel model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "FUNNEL_PRETRAINED_INIT_CONFIGURATION", + "FUNNEL_PRETRAINED_RESOURCE_FILES_MAP", + "FunnelConfig", +] + +FUNNEL_PRETRAINED_INIT_CONFIGURATION = { + "funnel-transformer/small": {}, # B4-4-4H768 + "funnel-transformer/small-base": {}, # B4-4-4H768, no decoder + "funnel-transformer/medium": {}, # B6-3x2-3x2H768 + "funnel-transformer/medium-base": {}, # B6-3x2-3x2H768, no decoder + "funnel-transformer/intermediate": {}, # B6-6-6H768 + "funnel-transformer/intermediate-base": {}, # B6-6-6H768, no decoder + "funnel-transformer/large": {}, # B8-8-8H1024 + "funnel-transformer/large-base": {}, # B8-8-8H1024, no decoder + "funnel-transformer/xlarge-base": {}, # B10-10-10H1024 + "funnel-transformer/xlarge": {}, # B10-10-10H1024, no decoder +} + +FUNNEL_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "funnel-transformer/small": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/small/model_state.pdparams", + "funnel-transformer/small-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/small-base/model_state.pdparams", + "funnel-transformer/medium": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/medium/model_state.pdparams", + "funnel-transformer/medium-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/medium-base/model_state.pdparams", + "funnel-transformer/intermediate": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/intermediate/model_state.pdparams", + "funnel-transformer/intermediate-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/intermediate-base/model_state.pdparams", + "funnel-transformer/large": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/large/model_state.pdparams", + "funnel-transformer/large-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/large-base/model_state.pdparams", + "funnel-transformer/xlarge-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/xlarge-base/model_state.pdparams", + "funnel-transformer/xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/xlarge/model_state.pdparams", + }, + "model_config": { + "funnel-transformer/small": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/small/model_config.json", + "funnel-transformer/small-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/small-base/model_config.json", + "funnel-transformer/medium": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/medium/model_config.json", + "funnel-transformer/medium-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/medium-base/model_config.json", + "funnel-transformer/intermediate": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/intermediate/model_config.json", + "funnel-transformer/intermediate-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/intermediate-base/model_config.json", + "funnel-transformer/large": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/large/model_config.json", + "funnel-transformer/large-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/large-base/model_config.json", + "funnel-transformer/xlarge-base": 
"https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/xlarge-base/model_config.json", + "funnel-transformer/xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/xlarge/model_config.json", + }, +} + +FUNNEL_RESOURCE_FILES_NAMES = {"model_state": "model_state.pdparams", "model_config": "model_config.json"} + + +class FunnelConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~hf_paddle.FunnelModel` or a + :class:`~hf_paddle.TFBertModel`. It is used to instantiate a Funnel Transformer model according to the specified + arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar + configuration to that of the Funnel Transformer `funnel-transformer/small + `__ architecture. + + Configuration objects inherit from :class:`~hf_paddle.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~hf_paddle.PretrainedConfig` for more information. + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the Funnel transformer. Defines the number of different tokens that can be represented + by the :obj:`inputs_ids` passed when calling :class:`~hf_paddle.FunnelModel` or + :class:`~hf_paddle.TFFunnelModel`. + block_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[4, 4, 4]`): + The sizes of the blocks used in the model. + block_repeats (:obj:`List[int]`, `optional`): + If passed along, each layer of each block is repeated the number of times indicated. + num_decoder_layers (:obj:`int`, `optional`, defaults to 2): + The number of layers in the decoder (when not using the base model). + d_model (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the model's hidden states. + n_head (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + d_head (:obj:`int`, `optional`, defaults to 64): + Dimensionality of the model's heads. + d_inner (:obj:`int`, `optional`, defaults to 3072): + Inner dimension in the feed-forward blocks. + hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout probability used between the two layers of the feed-forward blocks. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 3): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~hf_paddle.FunnelModel` or + :class:`~hf_paddle.TFFunnelModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.1): + The standard deviation of the `uniform initializer` for initializing all weight matrices in attention + layers. 
+ initializer_std (:obj:`float`, `optional`): + The standard deviation of the `normal initializer` for initializing the embedding matrix and the weight of + linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for + linear layers. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-9): + The epsilon used by the layer normalization layers. + pooling_type (:obj:`str`, `optional`, defaults to :obj:`"mean"`): + Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each block. + attention_type (:obj:`str`, `optional`, defaults to :obj:`"relative_shift"`): + Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while the + latter is faster on TPU. + separate_cls (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to separate the cls token when applying pooling. + truncate_seq (:obj:`bool`, `optional`, defaults to :obj:`False`): + When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting a + sequence length that is not a multiple of 2. + pool_q_only (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to apply the pooling only to the query or to query, key and values for the attention layers. + """ + model_type = "funnel" + attribute_map = {"hidden_size": "d_model", "num_attention_heads": "n_head"} + + def __init__( + self, + vocab_size=30522, + block_sizes=[4, 4, 4], + block_repeats=None, + num_decoder_layers=2, + d_model=768, + n_head=12, + d_head=64, + d_inner=3072, + hidden_act="gelu_new", + hidden_dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.0, + max_position_embeddings=512, + type_vocab_size=3, + initializer_range=0.1, + initializer_std=None, + layer_norm_eps=1e-9, + pooling_type="mean", + attention_type="relative_shift", + separate_cls=True, + truncate_seq=True, + pool_q_only=True, + **kwargs + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.block_sizes = block_sizes + self.block_repeats = [1] * len(block_sizes) if block_repeats is None else block_repeats + assert len(block_sizes) == len( + self.block_repeats + ), "`block_sizes` and `block_repeats` should have the same length." + self.num_decoder_layers = num_decoder_layers + self.d_model = d_model + self.n_head = n_head + self.d_head = d_head + self.d_inner = d_inner + self.hidden_act = hidden_act + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.initializer_std = initializer_std + self.layer_norm_eps = layer_norm_eps + assert pooling_type in [ + "mean", + "max", + ], f"Got {pooling_type} for `pooling_type` but only 'mean' and 'max' are supported." + self.pooling_type = pooling_type + assert attention_type in [ + "relative_shift", + "factorized", + ], f"Got {attention_type} for `attention_type` but only 'relative_shift' and 'factorized' are supported." 
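As a sketch of how the block fields of this FunnelConfig relate to the derived properties that close the class below: the import path is an assumption about the vendored layout, and the arguments simply restate the defaults.

from transformers.funnel.configuration import FunnelConfig   # path is an assumption

cfg = FunnelConfig(block_sizes=[4, 4, 4], block_repeats=None)
print(cfg.num_blocks)            # 3  == len(block_sizes)
print(cfg.num_hidden_layers)     # 12 == sum(block_sizes), via the property defined below
print(cfg.hidden_size)           # 768, attribute_map aliases hidden_size -> d_model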
+ self.attention_type = attention_type + self.separate_cls = separate_cls + self.truncate_seq = truncate_seq + self.pool_q_only = pool_q_only + + @property + def num_hidden_layers(self): + return sum(self.block_sizes) + + @property + def num_blocks(self): + return len(self.block_sizes) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/modeling.py new file mode 100644 index 000000000..7dc097ef6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/modeling.py @@ -0,0 +1,1581 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from collections import OrderedDict +from collections.abc import Iterable +from dataclasses import dataclass, fields +from typing import Optional, Tuple + +import numpy as np +import paddle +from paddle import nn +from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss + +from .. import PretrainedModel as PreTrainedModel +from .. import register_base_model +from ..activations import ACT2FN +from .configuration import ( + FUNNEL_PRETRAINED_INIT_CONFIGURATION, + FUNNEL_PRETRAINED_RESOURCE_FILES_MAP, + FUNNEL_RESOURCE_FILES_NAMES, + FunnelConfig, +) + +FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "funnel-transformer/small", # B4-4-4H768 + "funnel-transformer/small-base", # B4-4-4H768, no decoder + "funnel-transformer/medium", # B6-3x2-3x2H768 + "funnel-transformer/medium-base", # B6-3x2-3x2H768, no decoder + "funnel-transformer/intermediate", # B6-6-6H768 + "funnel-transformer/intermediate-base", # B6-6-6H768, no decoder + "funnel-transformer/large", # B8-8-8H1024 + "funnel-transformer/large-base", # B8-8-8H1024, no decoder + "funnel-transformer/xlarge-base", # B10-10-10H1024 + "funnel-transformer/xlarge", # B10-10-10H1024, no decoder +] + +__all__ = [ + "FunnelModel", + "FunnelForSequenceClassification", + "FunnelForTokenClassification", + "FunnelForQuestionAnswering", +] + +INF = 1e6 + + +def expand(self, *sizes): + if isinstance(sizes[0], Iterable): + sizes = sizes[0] + # handle -1 case + if len(sizes) > len(self.shape): + for _ in range(len(sizes) - len(self.shape)): + self = self.unsqueeze(axis=0) + x = paddle.expand(self, sizes, name=None) + return x + + +def repeat_interleave(x, repeats, dim=None): + orig_shape = list(x.shape) + if dim is None: + dim = 1 + x = paddle.reshape(x, (-1, 1)) # x.reshape(-1,1) + size = [1] * len(x.shape) + size[dim] = repeats + x = paddle.tile(x, size) + return paddle.reshape(x, (-1)) + else: + if len(orig_shape) == dim + 1: + x = x.unsqueeze(-1) + # x=x.reshape(-1,1) + size = [1] * len(orig_shape) + size[-1] = repeats + x = paddle.tile(x, size) + orig_shape[dim] = -1 + return paddle.reshape(x, orig_shape) + + +def gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim 
+ nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + dim_index = paddle.expand( + paddle.reshape(paddle.arange(x.shape[k], dtype=index.dtype), reshape_shape), index_shape + ).flatten() + nd_index.append(dim_index) + + ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]) + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + +def split(x, batch_size, dim=0): + if isinstance(batch_size, int): + if batch_size > x.shape[dim]: + return [x] # do nothing + return [y for y in paddle.split(x, x.shape[dim] // batch_size, dim)] + else: + return [y for y in paddle.split(x, batch_size, dim)] + + +def normal_(x, m=0, std=1): + y = paddle.randn(x.shape) * std + m + paddle.assign(y, x) + return x + + +def uniform_(x, a=0, b=1.0): + temp_value = paddle.uniform(min=a, max=b, shape=x.shape) + x.set_value(temp_value) + return x + + +def constant_(x, val): + temp_value = paddle.full_like(x, fill_value=val) + x.set_value(temp_value) + return x + + +class FunnelEmbeddings(nn.Layer): + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.layer_norm = LayerNorm(config.d_model, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, input_ids=None, inputs_embeds=None): + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + embeddings = self.layer_norm(inputs_embeds) + embeddings = self.dropout(embeddings) + + return embeddings + + +def pad(input, pad, mode="constant", value=0): + pad2 = [] + for _ in range(len(input.shape) * 2 - len(pad)): + pad2.append(0) + if isinstance(pad, tuple): + pad = list(pad) + pad2 = pad2 + pad + return paddle.nn.functional.pad(input, pad2, mode=mode, value=value) + + +class FunnelAttentionStructure(nn.Layer): + """ + Contains helpers for `FunnelRelMultiheadAttention `. + """ + + cls_token_type_id: int = 2 + + def __init__(self, config): + super().__init__() + self.config2 = config + self.sin_dropout = nn.Dropout(config.hidden_dropout) + self.cos_dropout = nn.Dropout(config.hidden_dropout) + # Track where we are at in terms of pooling from the original input, e.g., by how much the sequence length was + # divided. 
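The gather() helper defined above emulates torch.gather on top of paddle.gather_nd. A tiny check of its semantics, assuming it is in scope (e.g. run inside this module); the values are made up.

import paddle

x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
index = paddle.to_tensor([[0, 0], [1, 0]])
out = gather(x, 1, index)        # gather() as defined above; mirrors torch.gather(x, 1, index)
print(out.numpy())               # [[1. 1.]
                                 #  [4. 3.]]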
+ self.pooling_mult = None + + def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_ids=None): + """Returns the attention inputs associated to the inputs of the model.""" + # inputs_embeds has shape batch_size x seq_len x d_model + # attention_mask and token_type_ids have shape batch_size x seq_len + self.pooling_mult = 1 + self.seq_len = seq_len = inputs_embeds.shape[1] + position_embeds = self.get_position_embeds(seq_len, inputs_embeds.dtype) + token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None + cls_mask = ( + pad( + paddle.ones([seq_len - 1, seq_len - 1], dtype=inputs_embeds.dtype), (1, 0, 1, 0) + ) # nn.functional.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0)) + if self.config2.separate_cls + else None + ) + return (position_embeds, token_type_mat, attention_mask, cls_mask) + + def token_type_ids_to_mat(self, token_type_ids): + """Convert `token_type_ids` to `token_type_mat`.""" + # token_type_mat = token_type_ids[:, :, None] == token_type_ids[:, None] + token_type_mat = token_type_ids.unsqueeze(2) == token_type_ids.unsqueeze(1) + # Treat as in the same segment as both A & B + cls_ids = token_type_ids == self.cls_token_type_id + # cls_mat = cls_ids[:, :, None] | cls_ids[:, None] + cls_mat = paddle.logical_or(cls_ids.unsqueeze(2), cls_ids.unsqueeze(1)) + return paddle.logical_or(cls_mat, token_type_mat) + + def get_position_embeds(self, seq_len, dtype): + """ + Create and cache inputs related to relative position encoding. Those are very different depending on whether we + are using the factorized or the relative shift attention: + + For the factorized attention, it returns the matrices (phi, pi, psi, omega) used in the paper, appendix A.2.2, + final formula. + + For the relative shift attention, it returns all possible vectors R used in the paper, appendix A.2.1, final + formula. + + Paper link: https://arxiv.org/abs/2006.03236 + """ + d_model = self.config2.d_model + if self.config2.attention_type == "factorized": + # Notations from the paper, appending A.2.2, final formula. + # We need to create and return the matrices phi, psi, pi and omega. + pos_seq = paddle.arange(0, seq_len, 1.0, dtype=dtype) + freq_seq = paddle.arange(0, d_model // 2, 1.0, dtype=dtype) + inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2))) + sinusoid = pos_seq.unsqueeze(1) * inv_freq.unsqueeze(0) + sin_embed = paddle.sin(sinusoid) + sin_embed_d = self.sin_dropout(sin_embed) + cos_embed = paddle.cos(sinusoid) + cos_embed_d = self.cos_dropout(cos_embed) + # This is different from the formula on the paper... + phi = paddle.concat([sin_embed_d, sin_embed_d], axis=-1) + psi = paddle.concat([cos_embed, sin_embed], axis=-1) + pi = paddle.concat([cos_embed_d, cos_embed_d], axis=-1) + omega = paddle.concat([-sin_embed, cos_embed], axis=-1) + return (phi, pi, psi, omega) + else: + # Notations from the paper, appending A.2.1, final formula. + # We need to create and return all the possible vectors R for all blocks and shifts. 
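For the factorized-attention branch just above, a shape-only sketch of how the sinusoidal matrices are assembled; seq_len and d_model are arbitrary small values chosen for illustration.

import paddle

seq_len, d_model = 8, 16
pos_seq = paddle.arange(0, seq_len, 1.0)
freq_seq = paddle.arange(0, d_model // 2, 1.0)
inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2)))
sinusoid = pos_seq.unsqueeze(1) * inv_freq.unsqueeze(0)       # [seq_len, d_model // 2]
sin_embed, cos_embed = paddle.sin(sinusoid), paddle.cos(sinusoid)
phi = paddle.concat([sin_embed, sin_embed], axis=-1)          # [seq_len, d_model]
psi = paddle.concat([cos_embed, sin_embed], axis=-1)          # [seq_len, d_model]
print(phi.shape, psi.shape)                                   # [8, 16] [8, 16]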
+ freq_seq = paddle.arange(0, d_model // 2, 1, dtype=dtype) + inv_freq = 1 / (10000 ** (freq_seq / (d_model // 2))) + # Maximum relative positions for the first input + rel_pos_id = paddle.arange(-seq_len * 2, seq_len * 2, 1, dtype=dtype) + zero_offset = seq_len * 2 + sinusoid = rel_pos_id.unsqueeze(1) * inv_freq.unsqueeze(0) + sin_embed = self.sin_dropout(paddle.sin(sinusoid)) + cos_embed = self.cos_dropout(paddle.cos(sinusoid)) + pos_embed = paddle.concat([sin_embed, cos_embed], axis=-1) + + pos = paddle.arange(0, seq_len, dtype=dtype) + pooled_pos = pos + position_embeds_list = [] + for block_index in range(0, self.config2.num_blocks): + # For each block with block_index > 0, we need two types position embeddings: + # - Attention(pooled-q, unpooled-kv) + # - Attention(pooled-q, pooled-kv) + # For block_index = 0 we only need the second one and leave the first one as None. + + # First type + if block_index == 0: + position_embeds_pooling = None + else: + pooled_pos = self.stride_pool_pos(pos, block_index) + + # construct rel_pos_id + stride = 2 ** (block_index - 1) + rel_pos = self.relative_pos(pos, stride, pooled_pos, shift=2) + rel_pos = rel_pos.unsqueeze(1) + zero_offset + rel_pos = expand(rel_pos, (rel_pos.shape[0], d_model)) + position_embeds_pooling = gather(pos_embed, 0, rel_pos) + + # Second type + pos = pooled_pos + stride = 2**block_index + rel_pos = self.relative_pos(pos, stride) + + rel_pos = rel_pos.unsqueeze(1) + zero_offset + rel_pos = expand(rel_pos, (rel_pos.shape[0], d_model)) + position_embeds_no_pooling = gather(pos_embed, 0, rel_pos) + + position_embeds_list.append([position_embeds_no_pooling, position_embeds_pooling]) + return position_embeds_list + + def stride_pool_pos(self, pos_id, block_index): + """ + Pool `pos_id` while keeping the cls token separate (if `config.separate_cls=True`). + """ + if self.config2.separate_cls: + # Under separate , we treat the as the first token in + # the previous block of the 1st real block. Since the 1st real + # block always has position 1, the position of the previous block + # will be at `1 - 2 ** block_index`. + cls_pos = paddle.to_tensor([-(2**block_index) + 1]).astype(pos_id.dtype) + pooled_pos_id = pos_id[1:-1] if self.config2.truncate_seq else pos_id[1:] + return paddle.concat([cls_pos, pooled_pos_id[::2]], axis=0) + else: + return pos_id[::2] + + def relative_pos(self, pos, stride, pooled_pos=None, shift=1): + """ + Build the relative positional vector between `pos` and `pooled_pos`. + """ + if pooled_pos is None: + pooled_pos = pos + + ref_point = pooled_pos[0] - pos[0] + num_remove = shift * len(pooled_pos) + max_dist = ref_point + num_remove * stride + min_dist = pooled_pos[0] - pos[-1] + + return paddle.arange(max_dist, min_dist - 1, -stride, dtype=paddle.int64) + + def stride_pool(self, tensor, axis): + """ + Perform pooling by stride slicing the tensor along the given axis. + """ + if tensor is None: + return None + tensor = tensor.astype("float32") + # Do the stride pool recursively if axis is a list or a tuple of ints. + if isinstance(axis, (list, tuple)): + for ax in axis: + tensor = self.stride_pool(tensor, ax) + return tensor + + # Do the stride pool recursively if tensor is a list or tuple of tensors. 
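+        # (For a single tensor, the slicing further below keeps every other position along `axis`,
+        # after prepending the cls slot when `separate_cls` is set.)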
+ if isinstance(tensor, (tuple, list)): + return type(tensor)(self.stride_pool(x, axis) for x in tensor) + + # Deal with negative axis + axis %= tensor.ndim + + if self.config2.separate_cls: + # tensor = paddle.cat([tensor[cls_slice], tensor], axis=axis) + if axis == 1: + tensor = paddle.concat([tensor[:, :1], tensor], axis=axis) + if axis == 2: + tensor = paddle.concat([tensor[:, :, :1], tensor], axis=axis) + if axis == 0: + tensor = paddle.concat([tensor[:1], tensor], axis=axis) + if axis == 1: + return tensor[:, 0:-1:2].astype("bool") + if axis == 0: + return tensor[0:-1:2].astype("bool") + if axis == 2: + return tensor[:, :, 0:-1:2].astype("bool") + + def pool_tensor(self, tensor, mode="mean", stride=2): + """Apply 1D pooling to a tensor of size [B x T (x H)].""" + if tensor is None: + return None + + # Do the pool recursively if tensor is a list or tuple of tensors. + if isinstance(tensor, (tuple, list)): + return type(tensor)(self.pool_tensor(tensor, mode=mode, stride=stride) for x in tensor) + + if self.config2.separate_cls: + suffix = tensor[:, :-1] if self.config2.truncate_seq else tensor + tensor = paddle.concat([tensor[:, :1], suffix], axis=1) + + ndim = tensor.ndim + if ndim == 2: + tensor = tensor.unsqueeze(1).unsqueeze(3) # [:, None, :, None] + elif ndim == 3: + tensor = tensor.unsqueeze(1) # [:, None, :, :] + # Stride is applied on the second-to-last dimension. + stride = (stride, 1) + + if mode == "mean": + tensor = nn.functional.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True) + elif mode == "max": + tensor = nn.functional.max_pool2d(tensor, stride, stride=stride, ceil_mode=True) + elif mode == "min": + tensor = -nn.functional.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True) + else: + raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.") + + if ndim == 2: + return tensor[:, 0, :, 0] + elif ndim == 3: + return tensor[:, 0] + return tensor + + def pre_attention_pooling(self, output, attention_inputs): + """Pool `output` and the proper parts of `attention_inputs` before the attention layer.""" + position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs + if self.config2.pool_q_only: + if self.config2.attention_type == "factorized": + position_embeds = self.stride_pool(position_embeds[:2], 0) + position_embeds[2:] + token_type_mat = self.stride_pool(token_type_mat, 1) + cls_mask = self.stride_pool(cls_mask, 0) + output = self.pool_tensor(output, mode=self.config2.pooling_type) + + else: + self.pooling_mult *= 2 + if self.config2.attention_type == "factorized": + position_embeds = self.stride_pool(position_embeds, 0) + token_type_mat = self.stride_pool(token_type_mat, [1, 2]) + cls_mask = self.stride_pool(cls_mask, [1, 2]) + attention_mask = self.pool_tensor(attention_mask, mode="min") + output = self.pool_tensor(output, mode=self.config2.pooling_type) + + attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask) + return output, attention_inputs + + def post_attention_pooling(self, attention_inputs): + """Pool the proper parts of `attention_inputs` after the attention layer.""" + position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs + if self.config2.pool_q_only: + self.pooling_mult *= 2 + if self.config2.attention_type == "factorized": + position_embeds = position_embeds[:2] + self.stride_pool(position_embeds[2:], 0) + token_type_mat = self.stride_pool(token_type_mat, 2) + cls_mask = self.stride_pool(cls_mask, 1) + attention_mask = self.pool_tensor(attention_mask, 
mode="min") + attention_inputs = (position_embeds, token_type_mat, attention_mask, cls_mask) + return attention_inputs + + +def _relative_shift_gather(positional_attn, context_len, shift): + batch_size, n_head, seq_len, max_rel_len = positional_attn.shape + # max_rel_len = 2 * context_len + shift -1 is the numbers of possible relative positions i-j + + # What's next is the same as doing the following gather, which might be clearer code but less efficient. + # idxs = context_len + paddle.arange(0, context_len).unsqueeze(0) - paddle.arange(0, seq_len).unsqueeze(1) + # # matrix of context_len + i-j + # return positional_attn.gather(3, idxs.expand([batch_size, n_head, context_len, context_len])) + + positional_attn = paddle.reshape(positional_attn, [batch_size, n_head, max_rel_len, seq_len]) + positional_attn = positional_attn[:, :, shift:, :] + positional_attn = paddle.reshape(positional_attn, [batch_size, n_head, seq_len, max_rel_len - shift]) + positional_attn = positional_attn[:, :, :, :context_len] + return positional_attn + + +def Parameter(shape_or_tensor, fill_value=None, requires_grad=True): + if isinstance(shape_or_tensor, paddle.Tensor): + X = Parameter(shape_or_tensor.shape, 0.0) + paddle.assign(shape_or_tensor.astype("float32"), X) + else: + if isinstance(shape_or_tensor, int): + shape_or_tensor = [shape_or_tensor] + + X = paddle.create_parameter( + shape=shape_or_tensor, + dtype="float32", + attr=paddle.ParamAttr(name=None, initializer=paddle.nn.initializer.Constant(value=fill_value)), + is_bias=False, + ) + if not requires_grad: + X.stop_gradient = True + + return X + + +class FunnelRelMultiheadAttention(nn.Layer): + def __init__(self, config, block_index): + super().__init__() + self.config2 = config + self.block_index = block_index + d_model, n_head, d_head = config.d_model, config.n_head, config.d_head + + self.hidden_dropout = nn.Dropout(config.hidden_dropout) + self.attention_dropout = nn.Dropout(config.attention_dropout) + + self.q_head = nn.Linear(d_model, n_head * d_head, bias_attr=False) + self.k_head = nn.Linear(d_model, n_head * d_head) + self.v_head = nn.Linear(d_model, n_head * d_head) + + self.r_w_bias = Parameter(paddle.zeros([n_head, d_head])) + self.r_r_bias = Parameter(paddle.zeros([n_head, d_head])) + self.r_kernel = Parameter(paddle.zeros([d_model, n_head, d_head])) + self.r_s_bias = Parameter(paddle.zeros([n_head, d_head])) + self.seg_embed = Parameter(paddle.zeros([2, n_head, d_head])) + + self.post_proj = nn.Linear(n_head * d_head, d_model) + self.layer_norm = LayerNorm(d_model, epsilon=config.layer_norm_eps) + self.scale = 1.0 / (d_head**0.5) + + def relative_positional_attention(self, position_embeds, q_head, context_len, cls_mask=None): + """Relative attention score for the positional encodings""" + # q_head has shape batch_size x sea_len x n_head x d_head + if self.config2.attention_type == "factorized": + # Notations from the paper, appending A.2.2, final formula (https://arxiv.org/abs/2006.03236) + # phi and pi have shape seq_len x d_model, psi and omega have shape context_len x d_model + phi, pi, psi, omega = position_embeds + # Shape n_head x d_head + u = self.r_r_bias * self.scale + # Shape d_model x n_head x d_head + w_r = self.r_kernel + + # Shape batch_size x sea_len x n_head x d_model + q_r_attention = paddle.einsum("binh,dnh->bind", q_head + u, w_r) + q_r_attention_1 = q_r_attention * phi.unsqueeze(1) # [:, None] + q_r_attention_2 = q_r_attention * pi.unsqueeze(1) # [:, None] + + # Shape batch_size x n_head x seq_len x context_len + 
positional_attn = paddle.einsum("bind,jd->bnij", q_r_attention_1, psi) + paddle.einsum( + "bind,jd->bnij", q_r_attention_2, omega + ) + else: + shift = 2 if q_head.shape[1] != context_len else 1 + # Notations from the paper, appending A.2.1, final formula (https://arxiv.org/abs/2006.03236) + # Grab the proper positional encoding, shape max_rel_len x d_model + r = position_embeds[self.block_index][shift - 1] + # Shape n_head x d_head + v = self.r_r_bias * self.scale + # Shape d_model x n_head x d_head + w_r = self.r_kernel + + # Shape max_rel_len x n_head x d_model + r_head = paddle.einsum("td,dnh->tnh", r, w_r) + # Shape batch_size x n_head x seq_len x max_rel_len + positional_attn = paddle.einsum("binh,tnh->bnit", q_head + v, r_head) + # Shape batch_size x n_head x seq_len x context_len + positional_attn = _relative_shift_gather(positional_attn, context_len, shift) + + if cls_mask is not None: + positional_attn *= cls_mask.astype(positional_attn.dtype) + return positional_attn + + def relative_token_type_attention(self, token_type_mat, q_head, cls_mask=None): + """Relative attention score for the token_type_ids""" + if token_type_mat is None: + return 0 + batch_size, seq_len, context_len = token_type_mat.shape + # q_head has shape batch_size x seq_len x n_head x d_head + # Shape n_head x d_head + r_s_bias = self.r_s_bias * self.scale + + # Shape batch_size x n_head x seq_len x 2 + token_type_bias = paddle.einsum("bind,snd->bnis", q_head + r_s_bias, self.seg_embed) + + # Shape batch_size x n_head x seq_len x context_len + # token_type_mat = token_type_mat[:, None].expand([batch_size, q_head.shape[2], seq_len, context_len]) + token_type_mat = expand(token_type_mat.unsqueeze(1), ([batch_size, q_head.shape[2], seq_len, context_len])) + # Shapes batch_size x n_head x seq_len + diff_token_type, same_token_type = split(token_type_bias, 1, dim=-1) + # Shape batch_size x n_head x seq_len x context_len + token_type_attn = paddle.where( + token_type_mat, + expand(same_token_type, (token_type_mat.shape)), + expand(diff_token_type, (token_type_mat.shape)), + ) + + if cls_mask is not None: + token_type_attn *= cls_mask.astype(token_type_attn.dtype) + return token_type_attn + + def forward(self, query, key, value, attention_inputs, output_attentions=False): + # query has shape batch_size x seq_len x d_model + # key and value have shapes batch_size x context_len x d_model + position_embeds, token_type_mat, attention_mask, cls_mask = attention_inputs + + batch_size, seq_len, _ = query.shape + context_len = key.shape[1] + n_head, d_head = self.config2.n_head, self.config2.d_head + + # Shape batch_size x seq_len x n_head x d_head + q_head = paddle.reshape( + self.q_head(query), (batch_size, seq_len, n_head, d_head) + ) # self.q_head(query).reshape(batch_size, seq_len, n_head, d_head) + # Shapes batch_size x context_len x n_head x d_head + k_head = paddle.reshape( + self.k_head(key), (batch_size, context_len, n_head, d_head) + ) # self.k_head(key).reshape(batch_size, context_len, n_head, d_head) + v_head = paddle.reshape(self.v_head(value), (batch_size, context_len, n_head, d_head)) + + q_head = q_head * self.scale + # Shape n_head x d_head + r_w_bias = self.r_w_bias * self.scale + # Shapes batch_size x n_head x seq_len x context_len + + content_score = paddle.einsum("bind,bjnd->bnij", q_head + r_w_bias, k_head) + + positional_attn = self.relative_positional_attention(position_embeds, q_head, context_len, cls_mask) + token_type_attn = self.relative_token_type_attention(token_type_mat, q_head, cls_mask) + + # 
merge attention scores + attn_score = content_score + positional_attn + token_type_attn + + # precision safe in case of mixed precision training + dtype = attn_score.dtype + attn_score = attn_score.astype("float32") + # perform masking + if attention_mask is not None: + # attn_score = attn_score - INF * (1 - attention_mask[:, None, None].float()) + attn_score = attn_score - INF * (1 - attention_mask.unsqueeze(1).unsqueeze(2).astype("float32")) + # attention probability + attn_prob = paddle.nn.functional.softmax(attn_score, axis=-1, dtype=dtype) + attn_prob = self.attention_dropout(attn_prob) + + # attention output, shape batch_size x seq_len x n_head x d_head + attn_vec = paddle.einsum("bnij,bjnd->bind", attn_prob, v_head) + + # Shape shape batch_size x seq_len x d_model + attn_out = self.post_proj(attn_vec.reshape((batch_size, seq_len, n_head * d_head))) + attn_out = self.hidden_dropout(attn_out) + + output = self.layer_norm(query + attn_out) + return (output, attn_prob) if output_attentions else (output,) + + +class FunnelPositionwiseFFN(nn.Layer): + def __init__(self, config): + super().__init__() + self.linear_1 = nn.Linear(config.d_model, config.d_inner) + self.activation_function = ACT2FN[config.hidden_act] + self.activation_dropout = nn.Dropout(config.activation_dropout) + self.linear_2 = nn.Linear(config.d_inner, config.d_model) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = LayerNorm(config.d_model, epsilon=config.layer_norm_eps) + + def forward(self, hidden): + h = self.linear_1(hidden) + h = self.activation_function(h) + h = self.activation_dropout(h) + h = self.linear_2(h) + h = self.dropout(h) + return self.layer_norm(hidden + h) + + +class FunnelLayer(nn.Layer): + def __init__(self, config, block_index): + super().__init__() + self.attention = FunnelRelMultiheadAttention(config, block_index) + self.ffn = FunnelPositionwiseFFN(config) + + def forward(self, query, key, value, attention_inputs, output_attentions=False): + attn = self.attention(query, key, value, attention_inputs, output_attentions=output_attentions) + output = self.ffn(attn[0]) + return (output, attn[1]) if output_attentions else (output,) + + +class FunnelEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config2 = config + self.attention_structure = FunnelAttentionStructure(config) + self.blocks = nn.LayerList( + [ + nn.LayerList([FunnelLayer(config, block_index) for _ in range(block_size)]) + for block_index, block_size in enumerate(config.block_sizes) + ] + ) + + def forward( + self, + inputs_embeds, + attention_mask=None, + token_type_ids=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + # The pooling is not implemented on long tensors, so we convert this mask. 
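+        # Each block after the first halves the sequence length via pooling; with `pool_q_only`,
+        # only the queries are pooled in the first layer of a block while the keys/values still
+        # attend over the un-pooled hidden states.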
+ attention_mask = attention_mask.astype(inputs_embeds.dtype) + attention_inputs = self.attention_structure.init_attention_inputs( + inputs_embeds, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + ) + hidden = inputs_embeds + + all_hidden_states = (inputs_embeds,) if output_hidden_states else None + all_attentions = () if output_attentions else None + + for block_index, block in enumerate(self.blocks): + pooling_flag = hidden.shape[1] > (2 if self.config2.separate_cls else 1) + pooling_flag = pooling_flag and block_index > 0 + if pooling_flag: + pooled_hidden, attention_inputs = self.attention_structure.pre_attention_pooling( + hidden, attention_inputs + ) + for (layer_index, layer) in enumerate(block): + for repeat_index in range(self.config2.block_repeats[block_index]): + do_pooling = (repeat_index == 0) and (layer_index == 0) and pooling_flag + if do_pooling: + query = pooled_hidden + key = value = hidden if self.config2.pool_q_only else pooled_hidden + else: + query = key = value = hidden + # if layer_index==8 and block_index==0 and repeat_index==0 : + # print(block_index,layer_index,repeat_index,layer,query.mean(), key.mean(), value.mean()) + layer_output = layer(query, key, value, attention_inputs, output_attentions=output_attentions) + + hidden = layer_output[0] + + if do_pooling: + attention_inputs = self.attention_structure.post_attention_pooling(attention_inputs) + + if output_attentions: + all_attentions = all_attentions + layer_output[1:] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden,) + if not return_dict: + return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions) + + +def upsample(x, stride, target_len, separate_cls=True, truncate_seq=False): + """ + Upsample tensor `x` to match `target_len` by repeating the tokens `stride` time on the sequence length dimension. 
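+
+    A sketch of the behaviour (with `stride=2` and `separate_cls=True`): the non-cls tokens
+    `[a, b]` are repeated to `[a, a, b, b]`, truncated to `target_len - 1` positions, and the
+    cls token is re-attached in front.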
+ """ + if stride == 1: + return x + if separate_cls: + cls = x[:, :1] + x = x[:, 1:] + output = repeat_interleave(x, repeats=stride, dim=1) + if separate_cls: + if truncate_seq: + output = pad(output, (0, 0, 0, stride - 1, 0, 0)) + output = output[:, : target_len - 1] + output = paddle.concat([cls, output], axis=1) + else: + output = output[:, :target_len] + return output + + +class FunnelDecoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config2 = config + self.attention_structure = FunnelAttentionStructure(config) + self.layers = nn.LayerList([FunnelLayer(config, 0) for _ in range(config.num_decoder_layers)]) + + def forward( + self, + final_hidden, + first_block_hidden, + attention_mask=None, + token_type_ids=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + upsampled_hidden = upsample( + final_hidden, + stride=2 ** (len(self.config2.block_sizes) - 1), + target_len=first_block_hidden.shape[1], + separate_cls=self.config2.separate_cls, + truncate_seq=self.config2.truncate_seq, + ) + + hidden = upsampled_hidden + first_block_hidden + all_hidden_states = (hidden,) if output_hidden_states else None + all_attentions = () if output_attentions else None + + attention_inputs = self.attention_structure.init_attention_inputs( + hidden, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + ) + + for layer in self.layers: + layer_output = layer(hidden, hidden, hidden, attention_inputs, output_attentions=output_attentions) + hidden = layer_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_output[1:] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden,) + if not return_dict: + return tuple(v for v in [hidden, all_hidden_states, all_attentions] if v is not None) + + return BaseModelOutput(last_hidden_state=hidden, hidden_states=all_hidden_states, attentions=all_attentions) + + +class FunnelDiscriminatorPredictions(nn.Layer): + """Prediction module for the discriminator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + self.config2 = config + self.dense = nn.Linear(config.d_model, config.d_model) + self.dense_prediction = nn.Linear(config.d_model, 1) + + def forward(self, discriminator_hidden_states): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = ACT2FN[self.config2.hidden_act](hidden_states) + logits = self.dense_prediction(hidden_states).squeeze() + return logits + + +class FunnelPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + pretrained_init_configuration = FUNNEL_PRETRAINED_INIT_CONFIGURATION + resource_files_names = FUNNEL_RESOURCE_FILES_NAMES + pretrained_resource_files_map = FUNNEL_PRETRAINED_RESOURCE_FILES_MAP + + config_class = FunnelConfig + base_model_prefix = "funnel" + + def _init_weights(self, module): + classname = module.__class__.__name__ + if classname.find("Linear") != -1: + if getattr(module, "weight", None) is not None: + if self.config.initializer_std is None: + fan_out, fan_in = module.weight.shape + std = np.sqrt(1.0 / float(fan_in + fan_out)) + else: + std = self.config.initializer_std + normal_(module.weight, std=std) + if getattr(module, "bias", None) is not None: + constant_(module.bias, 0.0) + elif classname == "FunnelRelMultiheadAttention": + uniform_(module.r_w_bias, b=self.config.initializer_range) + uniform_(module.r_r_bias, b=self.config.initializer_range) + uniform_(module.r_kernel, b=self.config.initializer_range) + uniform_(module.r_s_bias, b=self.config.initializer_range) + uniform_(module.seg_embed, b=self.config.initializer_range) + elif classname == "FunnelEmbeddings": + std = 1.0 if self.config.initializer_std is None else self.config.initializer_std + normal_(module.word_embeddings.weight, std=std) + if module.word_embeddings._padding_idx is not None: + module.word_embeddings.weight.data[module._padding_idx].zero_() + + def init_weights(self): + """ + If needed prunes and maybe initializes weights. + """ + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + _init_weights = True + if _init_weights: + # Initialize weights + self.apply(self._init_weights) + + # Tie weights should be skipped when not initializing all weights + # since from_pretrained(...) calls tie weights anyways + # self.tie_weights() + + def prune_heads(self, heads_to_prune): + """ + Prunes heads of the base model. + + Arguments: + heads_to_prune (:obj:`Dict[int, List[int]]`): + Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of + heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads + 0 and 2 on layer 1 and heads 2 and 3 on layer 2. 
+ """ + # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads + for layer, heads in heads_to_prune.items(): + union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) + self.config2.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON + + self.base_model._prune_heads(heads_to_prune) + + +class FunnelClassificationHead(nn.Layer): + def __init__(self, config, n_labels): + super().__init__() + self.linear_hidden = nn.Linear(config.d_model, config.d_model) + self.dropout = nn.Dropout(config.hidden_dropout) + self.linear_out = nn.Linear(config.d_model, n_labels) + + def forward(self, hidden): + hidden = self.linear_hidden(hidden) + hidden = paddle.tanh(hidden) + hidden = self.dropout(hidden) + return self.linear_out(hidden) + + +class FunnelBaseModel(FunnelPreTrainedModel): + def __init__(self, config): + super().__init__(config) + if isinstance(config, PreTrainedModel): + config = config.config + if isinstance(config, dict): + config = FunnelConfig(**config) + self.config2 = config + self.embeddings = FunnelEmbeddings(config) + self.encoder = FunnelEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config2.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config2.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config2.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = paddle.ones(input_shape) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + # TODO: deal with head_mask + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + encoder_outputs = self.encoder( + inputs_embeds, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return encoder_outputs + + +@register_base_model +class FunnelModel(FunnelPreTrainedModel): + base_model_prefix = "model" + + def __init__(self, config: FunnelConfig): + super().__init__(config) + + self.config2 = config + self.embeddings = FunnelEmbeddings(config) + self.encoder = FunnelEncoder(config) + self.decoder = FunnelDecoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + output_attentions = output_attentions 
if output_attentions is not None else self.config2.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config2.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config2.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = paddle.ones(input_shape) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + else: + token_type_ids = token_type_ids.astype("int64") + # TODO: deal with head_mask + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + encoder_outputs = self.encoder( + inputs_embeds, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + decoder_outputs = self.decoder( + final_hidden=encoder_outputs.last_hidden_state, + first_block_hidden=encoder_outputs.hidden_states[self.config2.block_sizes[0]], + attention_mask=attention_mask, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if not return_dict: + idx = 0 + outputs = (decoder_outputs.last_hidden_state,) + if output_hidden_states: + idx += 1 + outputs = outputs + (encoder_outputs.hidden_states + decoder_outputs[idx],) + if output_attentions: + idx += 1 + outputs = outputs + (encoder_outputs.attentions + decoder_outputs[idx],) + return outputs + return BaseModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + hidden_states=(encoder_outputs.hidden_states + decoder_outputs.hidden_states) + if output_hidden_states + else None, + attentions=(encoder_outputs.attentions + decoder_outputs.attentions) if output_attentions else None, + ) + + +class FunnelForPreTraining(FunnelPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.funnel = FunnelModel(config) + self.discriminator_predictions = FunnelDiscriminatorPredictions(config) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (``paddle.Tensor`` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the ELECTRA-style loss. Input should be a sequence of tokens (see :obj:`input_ids` + docstring) Indices should be in ``[0, 1]``: + + - 0 indicates the token is an original token, + - 1 indicates the token was replaced. 
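+
+            The discriminator logits have shape ``(batch_size, sequence_length)``; when ``labels``
+            and ``attention_mask`` are both given, the binary cross-entropy loss is computed over
+            the non-padded positions only.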
+ + Returns: + + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + discriminator_hidden_states = self.funnel( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + logits = self.discriminator_predictions(discriminator_sequence_output) + loss = None + if labels is not None: + loss_fct = nn.BCEWithLogitsLoss() + if attention_mask is not None: + active_loss = attention_mask.reshape(-1, discriminator_sequence_output.shape[1]) == 1 + active_logits = logits.reshape(-1, discriminator_sequence_output.shape[1])[active_loss] + active_labels = labels[active_loss] + loss = loss_fct(active_logits, active_labels.astype("float32")) + else: + loss = loss_fct(logits.reshape(-1, discriminator_sequence_output.shape[1]), labels.astype("float32")) + + if not return_dict: + output = (logits,) + discriminator_hidden_states[1:] + return ((loss,) + output) if loss is not None else output + return FunnelForPreTrainingOutput( + loss=loss, + logits=logits, + hidden_states=discriminator_hidden_states.hidden_states, + attentions=discriminator_hidden_states.attentions, + ) + + +class FunnelForMaskedLM(FunnelPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.funnel = FunnelModel(config) + self.lm_head = nn.Linear(config.vocab_size, config.d_model) + self.tie_weights() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`paddle.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.funnel( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = outputs[0] + prediction_logits = paddle.matmul(last_hidden_state, self.lm_head.weight, transpose_y=True) + self.lm_head.bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_logits.reshape(-1, self.config.vocab_size), labels.reshape(-1)) + + if not return_dict: + output = (prediction_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return prediction_logits + + +class FunnelForSequenceClassification(FunnelPreTrainedModel): + base_model_class = FunnelModel + + def __init__(self, config, num_classes=2): + super().__init__(config) + self.num_classes = num_classes + + self.num_labels = config.num_labels + + self.funnel = FunnelBaseModel(config) + self.classifier = FunnelClassificationHead(config, config.num_labels) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`paddle.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
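+
+            The loss follows ``config.problem_type``: mean-squared error when ``num_labels == 1``,
+            cross-entropy for integer labels with ``num_labels > 1``, and BCE-with-logits otherwise
+            (multi-label classification).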
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.funnel( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = outputs[0] + pooled_output = last_hidden_state[:, 0] + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape(-1, self.num_labels), labels.reshape(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return logits + + +class FunnelForMultipleChoice(FunnelPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.funnel = FunnelBaseModel(config) + self.classifier = FunnelClassificationHead(config, 1) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`paddle.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. 
(See + :obj:`input_ids` above) + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]) + if inputs_embeds is not None + else None + ) + + outputs = self.funnel( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = outputs[0] + pooled_output = last_hidden_state[:, 0] + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return reshaped_logits + + +class FunnelForTokenClassification(FunnelPreTrainedModel): + def __init__(self, config, num_classes=2): + super().__init__(config) + + self.num_labels = config.num_labels + self.funnel = FunnelModel(config) + self.dropout = nn.Dropout(config.hidden_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.num_classes = num_classes + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`paddle.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. 
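+
+            When ``attention_mask`` is provided, only the non-padded positions contribute to the
+            loss; padded positions are mapped to the loss function's ``ignore_index``.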
+ """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.funnel( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = outputs[0] + last_hidden_state = self.dropout(last_hidden_state) + logits = self.classifier(last_hidden_state) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.reshape(-1) == 1 + active_logits = logits.reshape(-1, self.num_labels) + active_labels = paddle.where( + active_loss, labels.reshape(-1), paddle.tensor(loss_fct.ignore_index).astype(labels.dtype) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.reshape(-1, self.num_labels), paddle.reshape(labels, -1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return logits + + +class FunnelForQuestionAnswering(FunnelPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.config2 = config + self.num_labels = config.num_labels + + self.funnel = FunnelModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`paddle.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`paddle.Tensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
+ """ + + return_dict = return_dict if return_dict is not None else self.config2.use_return_dict + + outputs = self.funnel( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = outputs[0] + + logits = self.qa_outputs(last_hidden_state) + start_logits, end_logits = split(logits, 1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.shape) > 1: + start_positions = start_positions.squeze(-1) + if len(end_positions.shape) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return start_logits, end_logits + + +def is_tensor(x): + """ + Tests if ``x`` is a :obj:`paddle.Tensor`, or + :obj:`np.ndarray`. + """ + + if isinstance(x, paddle.Tensor): + return True + + return isinstance(x, np.ndarray) + + +class ModelOutput(OrderedDict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + assert len(class_fields), f"{self.__class__.__name__} has no fields." + assert all( + field.default is None for field in class_fields[1:] + ), f"{self.__class__.__name__} should not have more than one required field." 
+ + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not is_tensor(first_field): + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + elif first_field is not None: + self[class_fields[0].name] = first_field + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self): + """ + Convert self to a tuple containing all the attributes/keys that are not ``None``. + """ + return tuple(self[k] for k in self.keys()) + + +@dataclass +class FunnelForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~hf_paddle.FunnelForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``paddle.Tensor`` of shape :obj:`(1,)`): + Total loss of the ELECTRA-style objective. + logits (:obj:`paddle.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (:obj:`tuple(paddle.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(paddle.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`paddle.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss = None + logits = None + hidden_states = None + attentions = None + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +@dataclass +class BaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (:obj:`paddle.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(paddle.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(paddle.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`paddle.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/tokenizer.py new file mode 100644 index 000000000..62161f32a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/funnel/tokenizer.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ["FunnelTokenizer"] + +import os +from typing import List, Optional + +from .. 
import BasicTokenizer, WordpieceTokenizer +from ..bert.tokenizer import BertTokenizer + + +class FunnelTokenizer(BertTokenizer): + cls_token_type_id = 2 + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "funnel-transformer/small": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/small/vocab.txt", + "funnel-transformer/small-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/small-base/vocab.txt", + "funnel-transformer/medium": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/medium/vocab.txt", + "funnel-transformer/medium-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/medium-base/vocab.txt", + "funnel-transformer/intermediate": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/intermediate/vocab.txt", + "funnel-transformer/intermediate-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/intermediate-base/vocab.txt", + "funnel-transformer/large": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/large/vocab.txt", + "funnel-transformer/large-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/large-base/vocab.txt", + "funnel-transformer/xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/xlarge/vocab.txt", + "funnel-transformer/xlarge-base": "https://bj.bcebos.com/paddlenlp/models/transformers/funnel-transformer/xlarge-base/vocab.txt", + }, + } + pretrained_init_configuration = { + "funnel-transformer/small": {"do_lower_case": True}, + "funnel-transformer/small-base": {"do_lower_case": True}, + "funnel-transformer/medium": {"do_lower_case": True}, + "funnel-transformer/medium-base": {"do_lower_case": True}, + "funnel-transformer/intermediate": {"do_lower_case": True}, + "funnel-transformer/intermediate-base": {"do_lower_case": True}, + "funnel-transformer/large": {"do_lower_case": True}, + "funnel-transformer/large-base": {"do_lower_case": True}, + "funnel-transformer/xlarge": {"do_lower_case": True}, + "funnel-transformer/xlarge-base": {"do_lower_case": True}, + } + + max_model_input_sizes = { + "funnel-transformer/small": 512, + "funnel-transformer/small-base": 512, + "funnel-transformer/medium": 512, + "funnel-transformer/medium-base": 512, + "funnel-transformer/intermediate": 512, + "funnel-transformer/intermediate-base": 512, + "funnel-transformer/large": 512, + "funnel-transformer/large-base": 512, + "funnel-transformer/xlarge": 512, + "funnel-transformer/xlarge-base": 512, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + bos_token="", + eos_token="", + do_basic_tokenize=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + vocab_file, + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + bos_token=bos_token, + eos_token=eos_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = FunnelTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel + Transformer sequence pair mask has the following format: + ``` + 2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/configuration.py new file mode 100644 index 000000000..bf47e202d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/configuration.py @@ -0,0 +1,161 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import annotations + +from ..configuration_utils import PretrainedConfig + +__all__ = ["GAUAlPHA_PRETRAINED_INIT_CONFIGURATION", "GAUAlphaConfig", "GAUAlPHA_PRETRAINED_RESOURCE_FILES_MAP"] + +GAUAlPHA_PRETRAINED_INIT_CONFIGURATION = { + "chinese_GAU-alpha-char_L-24_H-768": { + "vocab_size": 12000, + "hidden_size": 768, + "intermediate_size": 1536, + "num_hidden_layers": 24, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "attention_key_size": 128, + "norm_eps": 1e-12, + "pad_token_id": 0, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "hidden_act": "swish", + "use_bias": False, + "normalization": "softmax_plus", + "attention_scale": True, + }, +} + +GAUAlPHA_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "chinese_GAU-alpha-char_L-24_H-768": "https://bj.bcebos.com/paddlenlp/models/transformers/gau_alpha/chinese_GAU-alpha-char_L-24_H-768/model_state.pdparams", + } +} + + +class GAUAlphaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GAUAlphaModel`]. It is used to + instantiate a GAUAlpha model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GAUAlpha + chinese_GAU-alpha-char_L-24_H-768 architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the GAUAlpha model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GAUAlphaModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`GAUAlphaModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. 
Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + Examples: + ```python + >>> from paddlenlp.transformers import GAUAlphaModel, GAUAlphaConfig + >>> # Initializing a GAUAlpha chinese_GAU-alpha-char_L-24_H-768style configuration + >>> configuration = GAUAlphaConfig() + >>> # Initializing a model from the style configuration + >>> model = GAUAlphaModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "gau_alpha" + pretrained_init_configuration = GAUAlPHA_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + task_id=0, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + task_type_vocab_size: int = 3, + type_vocab_size: int = 16, + attention_key_size=128, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + activation: str = "swish", + normalization: str = "softmax_plus", + fuse: bool = False, + layer_norm_eps=1e-12, + norm_eps=1e-12, + use_cache=False, + use_task_id=True, + use_bias=False, + attention_scale=True, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.task_id = task_id + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.task_type_vocab_size = task_type_vocab_size + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.fuse = fuse + self.layer_norm_eps = layer_norm_eps + self.norm_eps = norm_eps + self.use_cache = use_cache + self.use_task_id = use_task_id + self.use_bias = use_bias + self.activation = activation + self.attention_key_size = attention_key_size + self.normalization = normalization + self.attention_scale = attention_scale diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/modeling.py new file mode 100644 index 000000000..4a4ab981f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/modeling.py @@ -0,0 +1,810 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Layer + +from paddlenlp.utils.env import CONFIG_NAME + +from .. import PretrainedModel, register_base_model +from ..albert.modeling import ACT2FN +from .configuration import ( + GAUAlPHA_PRETRAINED_INIT_CONFIGURATION, + GAUAlPHA_PRETRAINED_RESOURCE_FILES_MAP, + GAUAlphaConfig, +) + +__all__ = [ + "GAUAlphaModel", + "GAUAlphaForMaskedLM", + "GAUAlphaPretrainedModel", + "GAUAlphaForSequenceClassification", + "GAUAlphaForTokenClassification", + "GAUAlphaForQuestionAnswering", + "GAUAlphaForMultipleChoice", +] + +INF = 1e4 + + +class Norm(Layer): + def __init__(self, epsilon=1e-12): + super().__init__() + self._epsilon = epsilon + + def forward(self, x): + variance = paddle.mean(paddle.square(x), axis=-1, keepdim=True) + return x / paddle.sqrt(variance + self._epsilon) + + +def attention_normalize(a, mask=None, axis=-1, method="softmax"): + if method == "softmax": + return F.softmax(a, axis=axis) + else: + if mask is not None: + l = mask.sum(-1, keepdim=True) + else: + l = paddle.ones_like(a) * a.shape[-2] + if method == "squared_relu": + return F.relu(a) ** 2 / l + elif method == "softmax_plus": + scale = paddle.log(l) / np.log(512) + # mask: 1 for not padding, 0 for padding + # padding position's scale is 1 + if mask is not None: + scale = scale * mask + 1 - mask + return F.softmax(a * scale, axis=axis) + return a + + +class ScaleOffset(Layer): + def __init__( + self, + hidden_size=768, + scale=True, + offset=True, + ): + super().__init__() + self.scale = scale + self.offset = offset + + if self.scale: + self.weight = self.create_parameter((hidden_size,), default_initializer=nn.initializer.Constant(1.0)) + if self.offset: + self.bias = self.create_parameter((hidden_size,), is_bias=True) + + def forward(self, inputs): + if self.scale: + inputs = inputs * self.weight + if self.offset: + inputs = inputs + self.bias + + return inputs + + +class GatedAttentionUnit(Layer): + """ + https://github.com/ZhuiyiTechnology/GAU-alpha/blob/ea15e08a85d35652775c360218090cbaed98da18/models.py#L6-L85 + """ + + def __init__(self, config: GAUAlphaConfig): + super().__init__() + self.activation = ACT2FN[config.activation] + self.intermediate_size = config.intermediate_size + self.attention_key_size = config.attention_key_size + self.use_bias = config.use_bias + self.normalization = config.normalization + self.attention_scale = config.attention_scale + self.attention_dropout = config.attention_probs_dropout_prob + + self.i_dense = nn.Linear( + config.hidden_size, + 2 * config.intermediate_size + config.attention_key_size, + bias_attr=self.use_bias, + ) + self.o_dense = nn.Linear(config.intermediate_size, config.hidden_size, bias_attr=self.use_bias) + + self.q_scaleoffset = ScaleOffset(config.attention_key_size, offset=self.use_bias) + self.k_scaleoffset = ScaleOffset(config.attention_key_size, offset=self.use_bias) + self.rotary = 
RotaryPositionEmbedding(config) + + def forward(self, hidden_states, attention_mask=None): + x = self.i_dense(hidden_states) + u, v, qk = paddle.split( + self.activation(x), + [self.intermediate_size, self.intermediate_size, self.attention_key_size], + axis=-1, + ) + q, k = self.q_scaleoffset(qk), self.k_scaleoffset(qk) + + # apply_rotary + q, k = self.rotary(q), self.rotary(k) + + # Attention + a = paddle.matmul(q, k, transpose_y=True) + + if self.attention_scale: + a = a / self.attention_key_size**0.5 + + if attention_mask is not None: + a = a * attention_mask + (attention_mask - 1) * INF + + A = attention_normalize(a, attention_mask, axis=-1, method=self.normalization) + + A = F.dropout(A, p=self.attention_dropout, training=self.training) + + o = self.o_dense(u * paddle.matmul(A, v)) + + return o + + +class GAULayer(Layer): + def __init__(self, config: GAUAlphaConfig): + super().__init__() + self.gau = GatedAttentionUnit(config) + self.norm = Norm(config.norm_eps) + self.hidden_dropout = config.hidden_dropout_prob + + def forward(self, hidden_states, attention_mask=None): + gau_output = self.gau(hidden_states, attention_mask=attention_mask) + + # dropout and residual + o = F.dropout(gau_output[0], p=self.hidden_dropout, training=self.training) + o = self.norm(hidden_states + o) + + return o + + +def initializer(tensor, num_hidden_layers=12, order=2, gain=1.0): + """ + https://github.com/bojone/bert4keras/blob/5572ed481a14f5a62be7107e3846c88a5d6b617d/bert4keras/models.py#L1226-L1235 + """ + shape = tensor.shape + if shape[0] > 10000 or shape[0] < 10: + hidden_size = shape[1] + else: + hidden_size = shape[0] + gain *= num_hidden_layers ** (-1.0 / order) + std = 1.13684723 / hidden_size**0.5 * gain + + return nn.initializer.TruncatedNormal(std=std) + + +class RotaryPositionEmbedding(Layer): + def __init__(self, config: GAUAlphaConfig): + super().__init__() + inv_freq = 1.0 / ( + 10000 + ** ( + paddle.arange(0, config.attention_key_size, 2, dtype=paddle.get_default_dtype()) + / config.attention_key_size + ) + ) + t = paddle.arange(config.max_position_embeddings, dtype=paddle.get_default_dtype()) + freqs = paddle.matmul(t.unsqueeze(1), inv_freq.unsqueeze(0)) + self.register_buffer("sin", freqs.sin(), persistable=False) + self.register_buffer("cos", freqs.cos(), persistable=False) + + def forward(self, x, offset=0): + # x shape [batch_size, seqlen, dim] + seqlen = x.shape[-2] + sin, cos = ( + self.sin[offset : offset + seqlen, :], + self.cos[offset : offset + seqlen, :], + ) + x1, x2 = x[..., 0::2], x[..., 1::2] + # [cos_nθ, -sin_nθ] [x1] + # [sin_nθ, cos_nθ] [x2] + # => [x1 * cos_nθ - x2 * sin_nθ, x1 * sin_nθ + x2 * cos_nθ] + return paddle.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1).flatten(-2, -1) + + +class GAUAlphaPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained GAU-alpha models. It provides GAU-alpha related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + model_config_file = CONFIG_NAME + config_class = GAUAlphaConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "gau_alpha" + + pretrained_init_configuration = GAUAlPHA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = GAUAlPHA_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + initializer(layer.weight, self.config.num_hidden_layers, order=2, gain=1.0) + if isinstance(layer, nn.Linear): + use_bias = self.use_bias if hasattr(self, "use_bias") else self.gau_alpha.config["use_bias"] + if layer.bias is not None and not use_bias: + layer.bias = None + + +@register_base_model +class GAUAlphaModel(GAUAlphaPretrainedModel): + """ + The bare GAUAlpha Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `GAUAlphaModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `GAUAlphaModel`. + hidden_size (int, optional): + Dimensionality of the, encoder layers and pooler layer. Defaults to `768`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + num_hidden_layers (int, optional): + Number of hidden layers in the gau_alpha encoder. Defaults to `12`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + attention_key_size (int, optional): + The dimensionality of the key used in the gau layer. Defaults to `128`. + norm_eps (float, optional): + The epsilon value used in the normalization layer. + Defaults to `1e-12`. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in gau in all encoder layers to drop some attention target. + Defaults to `0.1`. + hidden_act (str, optional): + The activation function used in gau layer. Defaults to `swish`. + use_bias (bool, optional): + Whether or not use bias. + Defaults to `False`. + normalization (str, optional): + The normalization method used in gau layer. + Defaults to `softmax_plus`. + attention_scale (bool, optional): + Whether or not to scale the attention scores. + Defaults to `True`. 
+ """ + + def __init__(self, config: GAUAlphaConfig): + super(GAUAlphaModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.norm_eps = config.norm_eps + self.num_hidden_layers = config.num_hidden_layers + self.use_bias = config.use_bias + self.embeddings = GAUAlphaEmbeddings(config) + + self.encoder = GAUAlphaEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + ): + r""" + The GAUAlphaModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in gau to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape broadcasted to `[batch_size, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + + Returns: + tuple: Returns `last_hidden_state` (Tensor) + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import GAUAlphaModel, GAUAlphaTokenizer + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + model = GAUAlphaModel.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + last_hidden_state = model(**inputs) + + """ + + if attention_mask is None: + attention_mask = input_ids != self.pad_token_id + if attention_mask.ndim == 2: + attention_mask = attention_mask.unsqueeze(1) # bs, 1, seqlen + attention_mask = attention_mask.astype(paddle.get_default_dtype()) + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids=input_ids, + token_type_ids=token_type_ids, + ) + + last_hidden_state = self.encoder(embedding_output, attention_mask=attention_mask) + + return last_hidden_state + + +class GAUAlphaEmbeddings(Layer): + """ + Include embeddings from word and token_type embeddings + """ + + def __init__(self, config: GAUAlphaConfig): + super(GAUAlphaEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.norm = Norm(config.norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + input_embedings = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embedings + token_type_embeddings + embeddings = self.norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class GAUAlphaEncoder(Layer): + def __init__(self, config: GAUAlphaConfig): + super().__init__() + self.layer = nn.LayerList([GAULayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None): + for layer_module in self.layer: + hidden_states = layer_module( + hidden_states, + attention_mask, + ) + return hidden_states + + +class GAUAlphaForQuestionAnswering(GAUAlphaPretrainedModel): + """ + GAUAlpha with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + gau_alpha (:class:`GAUAlphaModel`): + An instance of GAUAlphaModel. + dropout (float, optional): + The dropout probability for output of GAUAlpha. + If None, use the same value as `hidden_dropout_prob` of `GAUAlphaModel` + instance `gau_alpha`. Defaults to `None`. + """ + + def __init__(self, config: GAUAlphaConfig): + super(GAUAlphaForQuestionAnswering, self).__init__(config) + self.gau_alpha = GAUAlphaModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + The GAUAlphaForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`GAUAlphaModel`. + token_type_ids (Tensor, optional): + See :class:`GAUAlphaModel`. + attention_mask (Tensor, optional): + See :class:`GAUAlphaModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). 
+ + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import GAUAlphaForQuestionAnswering, GAUAlphaTokenizer + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + model = GAUAlphaForQuestionAnswering.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ + sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + logits = self.classifier(sequence_output) + start_logits, end_logits = paddle.unstack(logits, axis=-1) + + return start_logits, end_logits + + +class GAUAlphaForSequenceClassification(GAUAlphaPretrainedModel): + """ + GAUAlpha Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + gau_alpha (`GAUAlphaModel`): + An instance of `paddlenlp.transformers.GAUAlphaModel`. + num_labels (int, optional): + The number of classes. Default to `2`. + dropout (float, optional): + The dropout probability for output of GAUAlpha. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.GAUAlphaModel` instance. Defaults to `None`. + """ + + def __init__(self, config: GAUAlphaConfig): + super(GAUAlphaForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.gau_alpha = GAUAlphaModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`GAUAlphaModel`. + token_type_ids (Tensor, optional): + See :class:`GAUAlphaModel`. + attention_mask (Tensor, optional): + See :class:`GAUAlphaModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import GAUAlphaForSequenceClassification, GAUAlphaTokenizer + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + model = GAUAlphaForSequenceClassification.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + pooled_output = sequence_output[:, 0] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return logits + + +class GAUAlphaForTokenClassification(GAUAlphaPretrainedModel): + """ + GAUAlpha Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + gau_alpha (`GAUAlphaModel`): + An instance of `paddlenlp.transformers.GAUAlphaModel`. + num_labels (int, optional): + The number of classes. Default to `2`. + dropout (float, optional): + The dropout probability for output of GAUAlpha. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.GAUAlphaModel` instance. Defaults to `None`. + """ + + def __init__(self, config: GAUAlphaConfig): + super(GAUAlphaForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.gau_alpha = GAUAlphaModel(config) # allow gau_alpha to be config + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`GAUAlphaModel`. + token_type_ids (Tensor, optional): + See :class:`GAUAlphaModel`. + attention_mask (Tensor, optional): + See :class:`GAUAlphaModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import GAUAlphaForTokenClassification, GAUAlphaTokenizer + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + model = GAUAlphaForTokenClassification.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits + + +class GAUAlphaForMultipleChoice(GAUAlphaPretrainedModel): + """ + GAUAlpha Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + gau_alpha (:class:`GAUAlphaModel`): + An instance of GAUAlphaModel. + num_choices (int, optional): + The number of choices. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of GAUAlpha. + If None, use the same value as `hidden_dropout_prob` of `GAUAlphaModel` + instance `gau_alpha`. Defaults to None. 
+ """ + + def __init__(self, config: GAUAlphaConfig): + super(GAUAlphaForMultipleChoice, self).__init__(config) + self.num_choices = config.num_choices + self.gau_alpha = GAUAlphaModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + The GAUAlphaForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`GAUAlphaModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`GAUAlphaModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`GAUAlphaModel` and shape as [batch_size, num_choice, sequence_length]. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import GAUAlphaForMultipleChoice, GAUAlphaTokenizer + from paddlenlp.data import Pad + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + model = GAUAlphaForMultipleChoice.from_pretrained('chinese_GAU-alpha-char_L-24_H-768', num_choices=2) + + data = [ + { + "question": "如何打开ipad屏幕?", + "answer1": "按音量按钮。", + "answer2": "按下锁定按钮。", + "label": 1, + }, + { + "question": "如何缩进一些文本?", + "answer1": "在开始写之前留一些空格。", + "answer2": "按空格键。", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + input_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(inputs["input_ids"]) + token_type_ids = Pad(axis=0, pad_val=tokenizer.pad_token_type_id)(inputs["token_type_ids"]) + + reshaped_logits = model( + input_ids=paddle.to_tensor(input_ids, dtype="int64"), + token_type_ids=paddle.to_tensor(token_type_ids, dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + pooled_output = sequence_output[:, 0] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + return reshaped_logits + + +class GAUAlphaLMPredictionHead(Layer): + def __init__( + self, + config: GAUAlphaConfig, + embedding_weights=None, + ): + super(GAUAlphaLMPredictionHead, self).__init__() + self.use_bias = config.use_bias + self.decoder_weight = ( + self.create_parameter(shape=[config.vocab_size, config.hidden_size], dtype=self.transform.weight.dtype) + if embedding_weights is None + else embedding_weights + ) + if self.use_bias: + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) 
+ + def forward(self, hidden_states): + hidden_states = paddle.matmul(hidden_states, self.decoder_weight, transpose_y=True) + if self.use_bias: + hidden_states = hidden_states + self.decoder_bias + + return hidden_states + + +class GAUAlphaForMaskedLM(GAUAlphaPretrainedModel): + """ + GAUAlpha Model with a `masked language modeling` head on top. + + Args: + gau_alpha (:class:`GAUAlphaModel`): + An instance of :class:`GAUAlphaModel`. + + """ + + def __init__( + self, + config: GAUAlphaConfig, + ): + super(GAUAlphaForMaskedLM, self).__init__(config) + self.gau_alpha = GAUAlphaModel(config) + self.cls = GAUAlphaLMPredictionHead( + config=config, + embedding_weights=self.gau_alpha.embeddings.word_embeddings.weight, + ) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + + Args: + input_ids (Tensor): + See :class:`GAUAlphaModel`. + token_type_ids (Tensor, optional): + See :class:`GAUAlphaModel`. + attention_mask (Tensor, optional): + See :class:`GAUAlphaModel`. + + Returns: + Tensor: Returns tensor `prediction_scores`, The scores of masked token prediction. + Its data type should be float32 and shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import GAUAlphaForMaskedLM, GAUAlphaTokenizer + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + model = GAUAlphaForMaskedLM.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 11, 12000] + + """ + sequence_output = self.gau_alpha(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + prediction_scores = self.cls(sequence_output) + return prediction_scores diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/tokenizer.py new file mode 100644 index 000000000..eca6b9aeb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gau_alpha/tokenizer.py @@ -0,0 +1,292 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from ..bert.tokenizer import BasicTokenizer, WordpieceTokenizer +from ..tokenizer_utils import PretrainedTokenizer + +__all__ = ["GAUAlphaTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"chinese_GAU-alpha-char_L-24_H-768": 512} + + +class GAUAlphaTokenizer(PretrainedTokenizer): + """ + Constructs a GAUAlpha tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. 
+ + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool,optional): + Whether or not to lowercase the input when tokenizing. + If you use the GAUAlpha pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to`True`. + unk_token (str,optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str,optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str,optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str,optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str,optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import GAUAlphaTokenizer + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + + tokens = tokenizer('欢迎使用百度飞桨') + ''' + {'input_ids': [101, 3223, 6500, 421, 4179, 4331, 2008, 7263, 3055, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "chinese_GAU-alpha-char_L-24_H-768": "https://bj.bcebos.com/paddlenlp/models/transformers/gau_alpha/chinese_GAU-alpha-char_L-24_H-768/vocab.txt", + } + } + pretrained_init_configuration = { + "chinese_GAU-alpha-char_L-24_H-768": {"do_lower_case": True}, + } + padding_side = "right" + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = GAUAlphaTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for GAUAlpha models. + + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. 
+ """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import GAUAlphaTokenizer + + tokenizer = GAUAlphaTokenizer.from_pretrained('chinese_GAU-alpha-char_L-24_H-768') + tokens = tokenizer.tokenize('欢迎使用百度飞桨!') + ''' + ['欢', '迎', '使', '用', '百', '度', '飞', '桨', '!'] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + '欢 迎 使 用 百 度 飞 桨 !' + ''' + """ + + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A GAUAlpha sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A GAUAlpha offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
+ + A GAUAlpha sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/__init__.py new file mode 100644 index 000000000..af3692458 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
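+# Re-export the public Gemma components (configuration, modeling, modeling_pp and
+# tokenizer) at the package level via the star imports below.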
+ +from .configuration import * +from .modeling import * +from .modeling_pp import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/configuration.py new file mode 100644 index 000000000..310e1f308 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/configuration.py @@ -0,0 +1,171 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Gemma model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "GEMMA_PRETRAINED_INIT_CONFIGURATION", + "GemmaConfig", + "GEMMA_PRETRAINED_RESOURCE_FILES_MAP", +] + +GEMMA_PRETRAINED_INIT_CONFIGURATION = { + "google/gemma-2b": { + "architectures": ["GemmaForCausalLM"], + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 16384, + "max_position_embeddings": 8192, + "model_type": "gemma", + "num_attention_heads": 8, + "num_key_value_heads": 1, + "num_hidden_layers": 28, + "rms_norm_eps": 1e-06, + "vocab_size": 256000, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "use_cache": True, + }, +} + + +GEMMA_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "google/gemma-2b": "https://bj.bcebos.com/paddlenlp/models/community/google/gemma-2b/model.safetensors", + "google/gemma-2b-it": "https://bj.bcebos.com/paddlenlp/models/community/google/gemma-2b-it/model.safetensors", + }, +} + + +class GemmaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~GemmaModel`]. It is used to instantiate a gemma + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Gemma-7B. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~GemmaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. 
+ initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie the input and output word embeddings. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to + `num_attention_heads`. + Example: + ```python + >>> from paddlenlp.transformers import GemmaConfig, GemmaModel + + >>> # Initializing a Gemma gemma-7b style configuration + >>> configuration = GemmaConfig() + + >>> # Initializing a model from the gemma-7b style configuration + >>> model = GemmaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "gemma" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=3072, + intermediate_size=24576, + num_hidden_layers=28, + num_attention_heads=16, + num_key_value_heads=16, + head_dim=256, + hidden_act="gelu", + max_position_embeddings=8192, + seq_length=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + bos_token_id=2, + tie_word_embeddings=True, + rope_theta=10000.0, + attention_bias=False, + attention_dropout=0.0, + fuse_attention_qkv=False, + fuse_attention_ffn=False, + alibi=False, + rope_scaling_factor=1.0, + rope_scaling_type=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.head_dim = head_dim + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + + self.rope_theta = rope_theta + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + self.fuse_attention_qkv = fuse_attention_qkv + self.fuse_attention_ffn = fuse_attention_ffn + self.alibi = alibi + + self.rope_scaling_factor = rope_scaling_factor + self.rope_scaling_type = rope_scaling_type + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def rope(self): + return not self.alibi diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling.py new file mode
100644 index 000000000..1be5a2453 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling.py @@ -0,0 +1,1547 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from functools import partial +from typing import List, Optional, Tuple + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.autograd import PyLayer +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute +from paddle.utils import try_import + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +from .. 
import linear_utils +from ..linear_utils import Linear +from ..segment_parallel_utils import ReshardLayer +from .configuration import ( + GEMMA_PRETRAINED_INIT_CONFIGURATION, + GEMMA_PRETRAINED_RESOURCE_FILES_MAP, + GemmaConfig, +) + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +def _get_interleave(n): + def _get_interleave_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio**i for i in range(n)] + + if math.log2(n).is_integer(): + return _get_interleave_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + _get_interleave_power_of_2(closest_power_of_2) + + _get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in range(num_heads_per_card): + assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. + else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def build_alibi_tensor( + bool_attention_mask: Tensor, num_heads: int, dtype: paddle.dtype, tensor_parallel_degree=1 +) -> Tensor: + attention_mask = bool_attention_mask.astype("float32") + batch_size, seq_length = attention_mask.shape[0], attention_mask.shape[-1] + slopes = paddle.to_tensor(_get_interleave(num_heads), dtype="float32") + alibi = slopes.unsqueeze(axis=[1, 2]) * paddle.arange(seq_length, dtype="float32").unsqueeze(axis=[0, 1]).expand( + [num_heads, -1, -1] + ) + alibi = alibi.reshape(shape=(1, num_heads, 1, seq_length)).expand([batch_size, -1, -1, -1]) + return paddle.cast(alibi, dtype) + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +def parallel_matmul(x: Tensor, y, tensor_parallel_output=True, transpose_y=False): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=transpose_y) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + sequence_parallel=False, + reshard_layer=None, + attn_dropout_prob=0.0, + trainer_mode=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + dropout=attn_dropout_prob, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the 
next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + # then add alibi bias + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attn_weights = attn_weights + alibi + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + # In sep mode, the attenion mask should be created in the runtime. + if reshard_layer is not None: + attention_mask = None + + # NOTE: we only call get_triangle_upper_mask under PP setup + # FIXME ZHUI when we use pipeline parallel, the attention_mask can be None + # we just make it triangle_upper_mask + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + else: + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + attn_weights = F.dropout(attn_weights, attn_dropout_prob, training=trainer_mode) + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if reshard_layer is not None: + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
+ """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class GemmaRMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def _norm(self, x): + return x * paddle.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.variance_epsilon) + + def forward(self, x): + if self.config.use_fused_rms_norm: + return rms_norm_fused(x, self.weight + 1, self.variance_epsilon) + + output = self._norm(x.astype(paddle.float32)).astype(x.dtype) + return output * (self.weight + 1) + + +class GemmaRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + t = paddle.arange(seq_len, dtype="float32") + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + emb = paddle.concat([freqs, freqs], axis=-1) + return (emb.cos()[None, :, None, :].cast(dtype=x.dtype), emb.sin()[None, :, None, :].cast(dtype=x.dtype)) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + + if position_ids is None: + # Note: Only for ForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class GemmaMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.tensor_parallel_degree = config.tensor_parallel_degree + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + 
else: + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + + def forward(self, x): + # GeGLU + out = self.down_proj(F.gelu(self.gate_proj(x)) * self.up_proj(x)) + return out + + +class GemmaAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: GemmaConfig, layerwise_recompute: bool = False): + super().__init__() + + self.config = config + self.attention_dropout = config.attention_dropout # add + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = config.head_dim + + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + + self.max_position_embeddings = config.max_position_embeddings + self.seq_length = config.seq_length + self.rope_theta = config.rope_theta + self.sequence_parallel = config.sequence_parallel + + self.kv_indices = None + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + if self.num_key_value_heads % config.tensor_parallel_degree == 0: + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + else: + self.kv_indices = paddle.to_tensor( + assign_kv_heads(self.num_key_value_heads, config.tensor_parallel_degree)[ + config.tensor_parallel_rank + ] + ) + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_attention_heads * self.head_dim, + has_bias=config.attention_bias, + gather_output=False, + ) + if self.kv_indices is None: + # to revise shape + self.k_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=config.attention_bias, + gather_output=False, + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=config.attention_bias, + gather_output=False, + ) + else: + self.k_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + else: + self.q_proj = Linear( + self.hidden_size, + self.config.num_attention_heads * self.head_dim, + bias_attr=False, + ) + self.k_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + if config.tensor_parallel_degree > 1: + self.o_proj = RowParallelLinear( + self.config.num_attention_heads * self.head_dim, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + else: + self.o_proj = Linear( + self.config.num_attention_heads * self.head_dim, + self.hidden_size, + bias_attr=False, + ) + self.rotary_emb = GemmaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + self.reshard_layer = None + if config.sep_parallel_degree > 1: + assert self.num_key_value_heads % config.sep_parallel_degree == 0 + assert self.num_heads % config.sep_parallel_degree == 0 + self.reshard_layer = ReshardLayer() + + self.config = config + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + alibi: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + if self.reshard_layer is not None: + if self.sequence_parallel: + assert self.seq_length % self.config.sep_parallel_degree == 0 + query_states = paddle.reshape( + query_states, + [-1, self.seq_length // self.config.sep_parallel_degree, self.num_heads * self.head_dim], + ) + key_states = paddle.reshape( + key_states, + [-1, self.seq_length // self.config.sep_parallel_degree, self.num_heads * self.head_dim], + ) + value_states = paddle.reshape( + value_states, + [-1, self.seq_length // self.config.sep_parallel_degree, self.num_heads * self.head_dim], + ) + query_states = self.reshard_layer( + query_states, + split_axis=2, + 
concat_axis=1, + ) + key_states = self.reshard_layer( + key_states, + split_axis=2, + concat_axis=1, + ) + value_states = self.reshard_layer( + value_states, + split_axis=2, + concat_axis=1, + ) + query_states = paddle.reshape( + query_states, [0, self.seq_length, -1, self.head_dim] + ) # [bs, seq_len, num_head/k, head_dim], k is sep degree + key_states = paddle.reshape(key_states, [0, self.seq_length, -1, self.head_dim]) + value_states = paddle.reshape(value_states, [0, self.seq_length, -1, self.head_dim]) + else: + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + + if self.config.rope: + if self.reshard_layer is not None: + batch_size, seq_length, _, _ = query_states.shape + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and ( + self.num_heads != self.num_key_value_heads + ): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + if self.kv_indices is not None: + key_states = paddle.index_select(key_states, self.kv_indices, axis=2) + value_states = paddle.index_select(value_states, self.kv_indices, axis=2) + key_states = paddle.broadcast_to(key_states, query_states.shape) + value_states = paddle.broadcast_to(value_states, query_states.shape) + else: + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + 
query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + self.sequence_parallel, + reshard_layer=self.reshard_layer, + use_reentrant=self.config.recompute_use_reentrant, + attn_dropout_prob=self.attention_dropout, + trainer_mode=self.training, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + self.sequence_parallel, + reshard_layer=self.reshard_layer, + attn_dropout_prob=self.attention_dropout, + trainer_mode=self.training, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class GemmaDecoderLayer(nn.Layer): + def __init__(self, config, layerwise_recompute: bool = False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.self_attn = GemmaAttention(config, layerwise_recompute) + self.mlp = GemmaMLP(config) + self.input_layernorm = GemmaRMSNorm(config) + self.post_attention_layernorm = GemmaRMSNorm(config) + self.sequence_parallel = config.sequence_parallel + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + alibi: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `cache` key value states are returned and can be used to speed up decoding + (see `cache`). 
+ cache (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class GemmaPretrainedModel(PretrainedModel): + config_class = GemmaConfig + base_model_prefix = "gemma" + pretrained_init_configuration = GEMMA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = GEMMA_PRETRAINED_RESOURCE_FILES_MAP + _keys_to_ignore_on_load_unexpected = [] + _keep_in_fp32_modules = ["inv_freq", "rotary_emb", "cos_cached", "sin_cached"] + + @classmethod + def _get_name_mappings(cls, config: GemmaConfig) -> List[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + init_name_mappings(mappings=model_mappings) + # base-model prefix "GemmaModel" + if "GemmaModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "gemma." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: GemmaConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + # Column Linear + "lm_head.weight": partial(fn, is_column=not config.tie_word_embeddings), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.RowParallelLinear, + mpu.ColumnParallelLinear, + linear_utils.RowSequenceParallelLinear, + linear_utils.ColumnSequenceParallelLinear, + GemmaLMHead, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
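+                # Editor note (illustrative, not from the upstream patch): weights
+                # flagged as is_distributed (the column/row-parallel shards) are
+                # re-drawn under the tensor-parallel rng tracker so each rank samples
+                # its own shard, while replicated weights fall through to the plain
+                # set_value path below; both draw from
+                # normal(mean=0.0, std=config.initializer_range).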
+ if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.gemma.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.gemma.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, GemmaMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, GemmaAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class GemmaModel(GemmaPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`GemmaDecoderLayer`] + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + self.embed_tokens.weight.is_distributed = True + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + self.embed_tokens.weight.is_distributed = False + + self.layers = nn.LayerList( + [GemmaDecoderLayer(config, i not in self.no_recompute_layers) for i in range(config.num_hidden_layers)] + ) + self.norm = GemmaRMSNorm(config) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + # 
Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + past_key_value: Tensor, + use_cache: bool, + alibi=None, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = paddle.shape(past_key_values[0][0])[1] + seq_length_with_past += cache_length + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + if self.config.alibi: + alibi = build_alibi_tensor(attention_mask, self.config.num_attention_heads, dtype=inputs_embeds.dtype) + if self.config.tensor_parallel_degree > 1: + block_size = self.config.num_attention_heads // self.config.tensor_parallel_degree + alibi = alibi[ + :, + self.config.tensor_parallel_rank + * block_size : (self.config.tensor_parallel_rank + 1) + * block_size, + ] + 
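+                # Editor note (illustrative, not from the upstream patch): the slice
+                # above keeps only this rank's share of the alibi slopes, e.g. with a
+                # hypothetical num_attention_heads=16 and tensor_parallel_degree=4,
+                # block_size is 4 and rank r keeps heads [4*r, 4*r + 4).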
alibi = alibi.reshape([batch_size * block_size, 1, seq_length_with_past]) + else: + alibi = alibi.reshape([batch_size * self.config.num_attention_heads, 1, seq_length_with_past]) + else: + alibi = None + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + if self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual and alibi is None: + attention_mask = None + + # embed positions + hidden_states = inputs_embeds + + # normalized + hidden_states = hidden_states * (self.config.hidden_size**0.5) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + +class GemmaPretrainingCriterion(nn.Layer): + """ + Criterion for gemma. Copied From Llama + It calculates the final loss. 
+ """ + + def __init__(self, config): + + super().__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + if self.config.sep_parallel_degree > 1: + _hcg = fleet.get_hybrid_communicate_group() + masked_lm_loss = ConcatSePMaskedLoss.apply(masked_lm_loss, axis=1, group=_hcg.get_sep_parallel_group()) + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class ConcatSePMaskedLoss(PyLayer): + @staticmethod + def forward(ctx, inp, axis, group): + inputs = [] + paddle.distributed.all_gather(inputs, inp, group=group) + with paddle.no_grad(): + cat = paddle.concat(inputs, axis=axis) + ctx.args_axis = axis + ctx.args_group = group + return cat + + @staticmethod + def backward(ctx, grad): + axis = ctx.args_axis + group = ctx.args_group + with paddle.no_grad(): + grads = paddle.split(grad, paddle.distributed.get_world_size(group), axis=axis) + grad = grads[paddle.distributed.get_rank(group)] + return grad + + +class GemmaLMHead(nn.Layer): + def __init__(self, config: GemmaConfig): + super().__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! 
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + if self.config.sep_parallel_degree > 1: + assert seq_length % self.config.sep_parallel_degree == 0 + seq_length = seq_length // self.config.sep_parallel_degree + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul( + hidden_states, + self.weight, + tensor_parallel_output=tensor_parallel_output, + transpose_y=self.config.tie_word_embeddings, + ) + return logits + + +class GemmaForCausalLM(GemmaPretrainedModel): + enable_to_static_method = True + + def __init__(self, config): + super().__init__(config) + self.config = config + self.lm_head = self.lm_head = GemmaLMHead(config) + self.gemma = GemmaModel(config) + self.criterion = GemmaPretrainingCriterion(config) + + self.tie_weights() + + def get_input_embeddings(self): + return self.gemma.embed_tokens + + def get_output_embeddings(self): + return self.lm_head + + def set_input_embeddings(self, value): + self.gemma.embed_tokens = value + + def set_decoder(self, decoder): + self.gemma = decoder + + def get_decoder(self): + return self.gemma + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + 
[attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.gemma( + input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling_pp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling_pp.py new file mode 100644 index 000000000..8839248a2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/modeling_pp.py @@ -0,0 +1,313 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
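+
+# Editor note (illustrative sketch, not part of the upstream patch): this module
+# flattens GemmaModel into a PipelineLayer expressed as a sequence of layer
+# descriptors: a shared GemmaEmbeddingPipe, one GemmaDecoderLayerPipe per hidden
+# layer, a GemmaRMSNormPipe, and a GemmaLMHeadPipe whose weight is shared with the
+# embedding through SharedLayerDesc. A minimal usage sketch, assuming a fleet
+# hybrid-parallel environment with pipeline degree > 1 has already been initialized:
+#
+#     from paddlenlp.transformers import GemmaConfig
+#     config = GemmaConfig.from_pretrained("google/gemma-2b")
+#     model = GemmaForCausalLMPipe(config)  # assembled from the LayerDesc sequence below
+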
+ +import paddle +import paddle.distributed.fleet as fleet +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import ( + LayerDesc, + PipelineLayer, + SharedLayerDesc, +) +from paddle.distributed.fleet.utils import recompute + +from paddlenlp.transformers.model_utils import PipelinePretrainedModel + +from .modeling import ( + GemmaConfig, + GemmaDecoderLayer, + GemmaLMHead, + GemmaModel, + GemmaPretrainedModel, + GemmaPretrainingCriterion, + GemmaRMSNorm, + build_alibi_tensor, +) + + +def __repr__(self): + return self.layer_func.__name__ + + +# hack LayerDesc for showing to much config +LayerDesc.__repr__ = __repr__ + +__all__ = [ + "GemmaForCausalLMPipe", +] + + +def parse_args(args): + if isinstance(args, tuple): + if len(args) == 4: + hidden_states, attention_mask, position_ids, alibi = args + if len(args) == 3: + hidden_states, attention_mask, position_ids = args + alibi = None + elif len(args) == 2: + hidden_states, attention_mask = args + position_ids = None + alibi = None + else: + hidden_states = args + attention_mask, position_ids, alibi = None, None, None + + if position_ids is not None: + position_ids.stop_gradient = True + + if attention_mask is not None: + attention_mask.stop_gradient = True + + if alibi is not None: + alibi.stop_gradient = True + + return hidden_states, attention_mask, position_ids, alibi + + +def return_args(hidden_states, attention_mask=None, position_ids=None, alibi=None): + ret = (hidden_states,) + + if attention_mask is not None: + ret += (attention_mask.clone(),) + if position_ids is not None: + ret += (position_ids.clone(),) + if alibi is not None: + ret += (alibi.clone(),) + + if len(ret) == 1: + ret = ret[0] + + return ret + + +def get_attr(layer, name): + if getattr(layer, name, None) is not None: + return getattr(layer, name, None) + else: + return get_attr(layer._layer, name) + + +class GemmaEmbeddingPipe(nn.Layer): + """Extends GemmaEmbeddings to forward attention_mask through the pipeline.""" + + def __init__(self, config): + super(GemmaEmbeddingPipe, self).__init__() + self.config = config + self.sequence_parallel = config.sequence_parallel + self.hidden_size = config.hidden_size + if config.tensor_parallel_degree > 1: + self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + + @property + def embedding_weight(self): + return get_attr(self.embed_tokens, "weight") + + def forward(self, args): + """_summary_ + + Args: + input (_type_): _description_ + + Returns: + _type_: _description_ + """ + input_ids, attention_mask, position_ids, alibi = parse_args(args) + input_embeds = self.embed_tokens(input_ids) + if self.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) + + batch_size, seq_length = input_ids.shape + alibi = None + if self.config.alibi: + # embed positions + mask = ( + attention_mask + if attention_mask is not None + else paddle.ones((batch_size, seq_length), dtype=paddle.bool) + ) + alibi = build_alibi_tensor(mask, self.config.num_attention_heads, dtype=input_embeds.dtype) + + if 
self.config.tensor_parallel_degree > 1: + block_size = self.config.num_attention_heads // self.config.tensor_parallel_degree + alibi = alibi[ + :, + self.config.tensor_parallel_rank + * block_size : (self.config.tensor_parallel_rank + 1) + * block_size, + ] + alibi = alibi.reshape([batch_size * block_size, 1, seq_length]) + else: + alibi = alibi.reshape([batch_size * self.config.num_attention_heads, 1, seq_length]) + alibi.stop_gradient = True + + if attention_mask is not None: + attention_mask = GemmaModel._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + + if self.config.alibi and attention_mask is None: + attention_mask = GemmaModel._prepare_decoder_attention_mask( + None, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + + hidden_states = input_embeds * (self.config.hidden_size**0.5) + return return_args(hidden_states, attention_mask, position_ids, alibi) + + +class GemmaDecoderLayerPipe(GemmaDecoderLayer): + def forward(self, args): + hidden_states, attention_mask, position_ids, alibi = parse_args(args) + # we can't distinguish + # hidden_states, attention_mask, position_ids or + # hidden_states, attention_mask, alibi + if self.config.alibi and alibi is None and position_ids is not None: + alibi = position_ids + position_ids = None + + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient: + if attention_mask is not None or alibi is not None: + hidden_states = recompute( + super().forward, hidden_states, attention_mask=attention_mask, alibi=alibi, use_reentrant=False + ) + else: + # for pretrain + hidden_states = recompute( + super().forward, hidden_states, use_reentrant=self.config.recompute_use_reentrant + ) + else: + hidden_states = super().forward(hidden_states, attention_mask=attention_mask, alibi=alibi) + + return return_args(hidden_states, attention_mask, position_ids, alibi) + + +class GemmaRMSNormPipe(nn.Layer): + def __init__(self, config): + super().__init__() + self.norm = GemmaRMSNorm(config) + + def forward(self, args): + hidden_states, attention_mask, position_ids, alibi = parse_args(args) + return self.norm(hidden_states) + + +class GemmaLMHeadPipe(GemmaLMHead): + def __init__(self, config): + super(GemmaLMHeadPipe, self).__init__(config) + + @property + def embedding_weight(self): + return get_attr(self, "weight") + + +class GemmaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): + """GemmaForPretraining adapted for pipeline parallelism. + + The largest change is flattening the GemmaModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. + """ + + config_class = GemmaConfig + + _get_tensor_parallel_mappings = GemmaPretrainedModel._get_tensor_parallel_mappings + _init_weights = GemmaPretrainedModel._init_weights + _keys_to_ignore_on_load_unexpected = GemmaPretrainedModel._keys_to_ignore_on_load_unexpected + + # DONOT Add base_model_prefix !!!! 
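+    # Editor note (illustrative, not from the upstream patch): __init__ below builds
+    # the pipeline with add_sequential_layer: a SharedLayerDesc for GemmaEmbeddingPipe,
+    # one LayerDesc per GemmaDecoderLayerPipe, a GemmaRMSNormPipe, and a SharedLayerDesc
+    # for GemmaLMHeadPipe. The two shared descriptors use the same key, so the embedding
+    # and the lm_head reuse a single "embedding_weight" parameter across pipeline stages.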
+ + def __init__(self, config): + self.config = config + + self.recompute_granularity = self.config.recompute_granularity + self.pp_recompute_interval = self.config.pp_recompute_interval + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + if self.recompute_granularity == "full": + assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support" + + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + + def get_hcg(): + return fleet.get_hybrid_communicate_group() + + hcg = get_hcg() + tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) + tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) + + # TODO: fix tensor_parallel_degree rewrite in here + config.tensor_parallel_degree = tensor_parallel_degree + config.tensor_parallel_rank = tensor_parallel_rank + + self.add_sequential_layer( + SharedLayerDesc( + key="gemma_weigt_share", + layer_func=GemmaEmbeddingPipe, + shared_weight_attr="embedding_weight", + config=config, + ), + "gemma", + ) + for i in range(config.num_hidden_layers): + self.add_sequential_layer( + LayerDesc(GemmaDecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), + f"gemma.layers.{i}", + ) + + self.add_sequential_layer(LayerDesc(GemmaRMSNormPipe, config=config), "gemma") + self.add_sequential_layer( + SharedLayerDesc( + key="gemma_weigt_share", + layer_func=GemmaLMHeadPipe, + shared_weight_attr="embedding_weight", + config=config, + ), + "lm_head", + ) + + recompute_interval = 0 + + seg_method = "layer:GemmaDecoderLayer" + if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: + seg_method = "uniform" + + PipelineLayer.__init__( + self, + layers=self.get_sequential_layers(), + loss_fn=GemmaPretrainingCriterion(config), + topology=get_hcg().topology(), + seg_method=seg_method, + recompute_interval=recompute_interval, + recompute_ctx={ + "mp_group": get_hcg().get_model_parallel_group(), + "offload": False, + "partition": False, + }, + num_virtual_pipeline_stages=virtual_pp_degree, + ) + self.apply(self._init_weights) + # DON'T init PipelinePretrainedModel + # PipelinePretrainedModel.__init__(self.super(), config=config) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/tokenizer.py new file mode 100644 index 000000000..bf0804bbe --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gemma/tokenizer.py @@ -0,0 +1,360 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np +import sentencepiece as spm + +from ...utils.log import logger +from .. 
import PretrainedTokenizer +from ..tokenizer_utils_base import ( + AddedToken, + BatchEncoding, + EncodedInput, + PaddingStrategy, +) + +__all__ = ["GemmaTokenizer"] + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +SPIECE_UNDERLINE = "▁" + + +class GemmaTokenizer(PretrainedTokenizer): + model_input_names = ["input_ids", "attention_mask"] + resource_files_names = VOCAB_FILES_NAMES + pretrained_resource_files_map = { + "vocab_file": { + "google/gemma-7b": "https://bj.bcebos.com/paddlenlp/models/community/google/gemma-7b/tokenizer.model", + "google/gemma-2b": "https://bj.bcebos.com/paddlenlp/models/community/google/gemma-2b/tokenizer.model", + }, + } + + pretrained_init_configuration = { + "google/gemma-7b": {}, + } + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + clean_up_tokenization_spaces=False, + use_default_system_prompt=False, + spaces_between_special_tokens=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + bos_token = AddedToken(bos_token, normalized=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, normalized=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, normalized=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, normalized=False) if isinstance(pad_token, str) else pad_token + + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.use_default_system_prompt = use_default_system_prompt + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + sp_model_kwargs=self.sp_model_kwargs, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + use_default_system_prompt=use_default_system_prompt, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + # Copied from transformers.models.llama.tokenizer_llama.LlamaTokenizer.__setstate__ + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + @property + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.vocab_size + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + """ + Returns a tokenized string. The Gemma tokenizer never adds a prefix space. 
+ """ + return self.sp_model.encode(text, out_type=str) + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + sub_texts = [] + current_sub_text = [] + for ids in token_ids: + if skip_special_tokens and ids in self.all_special_ids: + continue + if ids in self.added_tokens_decoder: + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + cur_id = self.added_tokens_decoder[ids] + if isinstance(cur_id, AddedToken): + sub_texts.append(cur_id.content) + elif isinstance(cur_id, str): + sub_texts.append(cur_id) + current_sub_text = [] + elif ids in self.all_special_ids: + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + sub_texts.append(self._convert_id_to_token(ids)) + current_sub_text = [] + else: + current_sub_text.append(ids) + if current_sub_text: + sub_texts.append(self.sp_model.decode(current_sub_text)) + + if spaces_between_special_tokens: + sub_texts = " ".join(sub_texts) + else: + sub_texts = "".join(sub_texts) + + return sub_texts + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.added_tokens_encoder: + out_string += self.sp_model.decode(current_sub_tokens) + token + current_sub_tokens = [] + elif token in self.all_special_tokens: + out_string += self.sp_model.decode(current_sub_tokens) + token + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + bos_token_id = [1] if self.add_bos_token else [] + eos_token_id = [1] if self.add_eos_token else [] + + if token_ids_1 is None: + return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + return ( + bos_token_id + + ([0] * len(token_ids_0)) + + eos_token_id + + bos_token_id + + ([0] * len(token_ids_1)) + + eos_token_id + ) + + # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). 
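+
+        Example (derived from the logic below): with `add_bos_token=True` and
+        `add_eos_token=False`, a `token_ids_0` of length 3 and a `token_ids_1` of
+        length 2 produce `[0, 0, 0, 0, 1, 1, 1]`, where each BOS position counts
+        toward its own sequence.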
+ """ + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) + + if token_ids_1 is not None: + output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) + + return output + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + For Zero Padding, Copied from llama + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/configuration.py new file mode 100644 index 000000000..f898c8c87 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/configuration.py @@ -0,0 +1,252 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GLM model configuration""" + +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "GLMConfig", + "GLM_PRETRAINED_INIT_CONFIGURATION", + "GLM_PRETRAINED_RESOURCE_FILES_MAP", +] + + +GLM_PRETRAINED_INIT_CONFIGURATION = { + "THUDM/glm-515m": { + "attention_dropout_prob": 0.1, + "attention_scale": 1.0, + "block_position_encoding": True, + "checkpoint_num_layers": 1, + "embedding_dropout_prob": 0.1, + "hidden_size": 1152, + "initializer_range": 0.02, + "max_sequence_length": 512, + "model_type": "glm", + "num_attention_heads": 18, + "num_layers": 30, + "layernorm_epsilon": 1e-5, + "output_dropout_prob": 0.1, + "output_predict": True, + "parallel_output": False, + "pool_token": "cls", + "relative_encoding": False, + "spell_func": "lstm", + "spell_length": None, + "use_scaled_init_for_output_weights": True, + "vocab_size": 30592, + }, + "THUDM/glm-2b": { + "attention_dropout_prob": 0.1, + "attention_scale": 1.0, + "block_position_encoding": True, + "checkpoint_num_layers": 1, + "embedding_dropout_prob": 0.1, + "hidden_size": 2048, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "glm", + "num_attention_heads": 32, + "num_layers": 36, + "output_dropout_prob": 0.1, + "output_predict": True, + "parallel_output": True, + "pool_token": "cls", + "relative_encoding": False, + "spell_func": "lstm", + "spell_length": None, + "vocab_size": 50304, + }, + "THUDM/glm-10b": { + "attention_dropout_prob": 0.1, + "attention_scale": 1.0, + "block_position_encoding": True, + "checkpoint_num_layers": 1, + "embedding_dropout_prob": 0.1, + "hidden_size": 4096, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "glm", + "num_attention_heads": 64, + "num_layers": 48, + "output_dropout_prob": 0.1, + "output_predict": True, + "parallel_output": True, + "pool_token": "cls", + "relative_encoding": False, + "spell_func": "lstm", + "spell_length": None, + "vocab_size": 50304, + }, + "THUDM/glm-large-chinese": { + "attention_dropout_prob": 0.1, + "attention_scale": 1.0, + "block_position_encoding": True, + 
"checkpoint_num_layers": 1, + "embedding_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "glm", + "num_attention_heads": 16, + "num_layers": 24, + "layernorm_epsilon": 1e-5, + "output_dropout_prob": 0.1, + "output_predict": True, + "parallel_output": False, + "pool_token": "cls", + "relative_encoding": False, + "spell_func": "lstm", + "spell_length": None, + "vocab_size": 50048, + }, + "THUDM/glm-10b-chinese": { + "attention_dropout_prob": 0.1, + "attention_scale": 1.0, + "block_position_encoding": True, + "checkpoint_num_layers": 1, + "embedding_dropout_prob": 0.1, + "hidden_size": 4096, + "initializer_range": 0.02, + "max_sequence_length": 1024, + "model_type": "glm", + "num_attention_heads": 64, + "num_layers": 48, + "output_dropout_prob": 0.1, + "output_predict": True, + "parallel_output": True, + "pool_token": "cls", + "relative_encoding": False, + "spell_func": "lstm", + "spell_length": None, + "vocab_size": 50048, + "bad_words_id": [50009], + }, +} + +GLM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "THUDM/glm-515m": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-515m.pdparams", + "THUDM/glm-2b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-2b.pdparams", + "THUDM/glm-10b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-10b.pdparams", + "THUDM/glm-large-chinese": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-large-chinese.pdparams", + "THUDM/glm-10b-chinese": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-10b-chinese.pdparams", + } +} + + +class GLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~GLMModel`]. + It is used to instantiate an GLM model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the GLM [shunxing1234/GLM-base-cased](https://huggingface.co/shunxing1234/GLM-base-cased) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used + to control the model outputs. Read the documentation from [`PretrainedConfig`] + for more information. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the GLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~GLMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`~GLMModel`] or + [`~TFGLMModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + Example: + ```python + >>> from paddlenlp.transformers import GLMModel, GLMConfig + >>> # Initializing a GLM shunxing1234/GLM-base-cased style configuration + >>> configuration = GLMConfig() + >>> # Initializing a model from the shunxing1234/GLM-base-cased style configuration + >>> model = GLMModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "glm" + attribute_map: Dict[str, str] = {"num_hidden_layers": "num_layers", "torch_dtype": "dtype"} + pretrained_init_configuration = GLM_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + num_layers=24, + vocab_size=30592, + hidden_size=1024, + num_attention_heads=16, + embedding_dropout_prob=0.1, + attention_dropout_prob=0.1, + output_dropout_prob=0.1, + max_sequence_length=512, + checkpoint_num_layers=1, + parallel_output=True, + relative_encoding=False, + block_position_encoding=True, + output_predict=False, + spell_length=None, + spell_func="lstm", + attention_scale=1.0, + initializer_range=0.02, + pool_token="cls", + layernorm_epsilon=1e-5, + use_scaled_init_for_output_weights=False, + **kwargs + ): + super().__init__(**kwargs) + self.num_layers = num_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.embedding_dropout_prob = embedding_dropout_prob + self.attention_dropout_prob = attention_dropout_prob + self.output_dropout_prob = output_dropout_prob + self.max_sequence_length = max_sequence_length + self.checkpoint_num_layers = checkpoint_num_layers + self.parallel_output = parallel_output + self.relative_encoding = relative_encoding + self.block_position_encoding = block_position_encoding + self.output_predict = output_predict + self.spell_length = spell_length + self.spell_func = spell_func + self.attention_scale = attention_scale + self.initializer_range = initializer_range + self.pool_token = pool_token + self.layernorm_epsilon = layernorm_epsilon + self.use_scaled_init_for_output_weights = use_scaled_init_for_output_weights + self._fast_entry = None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/modeling.py new file mode 100644 index 000000000..bdd9ba221 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/modeling.py @@ -0,0 +1,878 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GLM model""" +from __future__ import annotations + +import math +from functools import partial +from typing import Optional + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from ...utils.env import CONFIG_NAME +from ...utils.initializer import normal_, ones_, zeros_ +from ...utils.log import logger +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MultipleChoiceModelOutput, +) +from .configuration import ( + GLM_PRETRAINED_INIT_CONFIGURATION, + GLM_PRETRAINED_RESOURCE_FILES_MAP, + GLMConfig, +) + +__all__ = [ + "GLMModel", + "GLMPretrainedModel", + "GLMForMultipleChoice", + "GLMForConditionalGeneration", +] + + +class GLMAttention(nn.Layer): + """ + Self-attention layer performs multiple attention to jointly attending to + information from different representation subspaces. + """ + + def __init__(self, config: GLMConfig): + super(GLMAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.hidden_size = config.hidden_size + self.attention_scale = config.attention_scale + + if config.tensor_parallel_degree > 1: + self.query_key_value = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, 3 * config.hidden_size, has_bias=True, gather_output=False + ) + self.dense = fleet.meta_parallel.RowParallelLinear( + config.hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True + ) + self.num_attention_heads = config.num_attention_heads // config.tensor_parallel_degree + else: + self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + self.attention_dropout = nn.Dropout(config.attention_dropout_prob) + self.output_dropout = nn.Dropout(config.output_dropout_prob) + + def _transpose_for_scores(self, inputs: Tensor): + """ + Transpose a 3D tensor [b, s, n/p*h/n] into a 4D tensor [b, n/p, s, h/n], + where b means batch_size, s means sequence_length, n means num_attention_heads, + h means hidden_size and p means number of partitions. 
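+        For example, with b=2, s=128, h=1024, n=16 (head size 64) and a single
+        partition, an input of shape [2, 128, 1024] is reshaped to [2, 128, 16, 64]
+        and then transposed to [2, 16, 128, 64].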
+ """ + new_shape = [*inputs.shape[:-1], self.num_attention_heads, self.attention_head_size] + outputs = inputs.reshape(new_shape) + outputs = paddle.transpose(outputs, [0, 2, 1, 3]) + return outputs + + def _core_attention(self, hidden_states: Tensor, cache: Tensor = None): + # [bs, seq_len, num_head * head_dim] + query_length = hidden_states.shape[1] + if cache is None: + mixed_layer = self.query_key_value(hidden_states) + mixed_q_layer, mixed_k_layer, mixed_v_layer = paddle.split(mixed_layer, 3, axis=-1) + else: + # [bs, cache_len + seq_len, num_head * head_dim] + concat_hidden_states = paddle.concat([cache, hidden_states], axis=1) + # [bs, cache_len + seq_len, num_head * head_dim * 3] + mixed_layer = self.query_key_value(concat_hidden_states) + # [bs, cache_len + seq_len, num_head * head_dim] + mixed_q_layer, mixed_k_layer, mixed_v_layer = paddle.split(mixed_layer, 3, axis=-1) + # [bs, cache_len + seq_len, num_head * head_dim] + mixed_q_layer = mixed_q_layer[:, -query_length:] + # [bs, seq_len, num_head * head_dim] + + # [bs, num_head, seq_len, head_dim] + q_layer = self._transpose_for_scores(mixed_q_layer) + # [bs, num_head, cache_len + seq_len, head_dim] + k_layer = self._transpose_for_scores(mixed_k_layer) + # [bs, num_head, cache_len + seq_len, head_dim] + v_layer = self._transpose_for_scores(mixed_v_layer) + + return q_layer, k_layer, v_layer + + def _core_parallel_attention(self, hidden_states: Tensor, cache: Tensor = None): + query_length = hidden_states.shape[1] + if cache is None: + mixed_layer = self.query_key_value(hidden_states) + # [bs, seq_len, num_attention_heads, 3* head_dim] + mixed_layer = paddle.reshape_(mixed_layer, [0, 0, self.num_attention_heads, 3 * self.attention_head_size]) + # [bs, num_attention_heads, seq_len, 3* head_dim] + mixed_layer = paddle.transpose(mixed_layer, [0, 2, 1, 3]) + # [bs, num_attention_heads, seq_len, head_dim] + mixed_q_layer, mixed_k_layer, mixed_v_layer = paddle.split(mixed_layer, num_or_sections=3, axis=-1) + + else: + # [bs, seq_len(+cache_len), num_head * head_dim] + concat_hidden_states = paddle.concat([cache, hidden_states], axis=1) + mixed_layer = self.query_key_value(concat_hidden_states) + # [bs. 
seq_len(+cache_len), num_attention_heads, 3* head_dim] + mixed_layer = paddle.reshape_(mixed_layer, [0, 0, self.num_attention_heads, 3 * self.attention_head_size]) + # [bs, num_attention_heads, seq_len(+cache_len), 3* head_dim] + mixed_layer = paddle.transpose(mixed_layer, [0, 2, 1, 3]) + mixed_q_layer, mixed_k_layer, mixed_v_layer = paddle.split(mixed_layer, num_or_sections=3, axis=-1) + # [bs, num_attention_heads, seq_len, head_dim] + mixed_q_layer = mixed_q_layer[:, :, -query_length:] + + return mixed_q_layer, mixed_k_layer, mixed_v_layer + + def forward(self, hidden_states: Tensor, ltor_mask: Tensor, cache: Tensor = None): + # [bs, seq_len, num_head * head_dim] + if self.config.tensor_parallel_degree > 1: + q_layer, k_layer, v_layer = self._core_parallel_attention(hidden_states, cache) + else: + # [bs, num_head, seq_len, head_dim] + q_layer, k_layer, v_layer = self._core_attention(hidden_states, cache) + + if self.attention_scale > 1.0: + attention_scores = paddle.matmul( + q_layer / math.sqrt(self.attention_scale), + k_layer.transpose([0, 1, 3, 2]) / math.sqrt(self.attention_head_size * self.attention_scale), + ) + else: + # [bs, num_head, seq_len, head_dim] * [bs, num_head, head_dim, seq_len] + # [bs, num_head, seq_len, seq_len] / [bs, num_head, seq_len, cache_len + seq_len] + attention_scores = paddle.matmul( + q_layer, k_layer.transpose([0, 1, 3, 2]) / math.sqrt(self.attention_head_size) + ) + + ltor_mask = ltor_mask.astype(attention_scores.dtype) + # [bs, num_head, seq_len, seq_len(+cache_len)] + attention_scores = paddle.multiply(attention_scores, ltor_mask) + if self.attention_scale > 1.0: + # Fixme for max op not support fp16 https://github.com/PaddlePaddle/Paddle/issues/52601 + if attention_scores.dtype != paddle.float32: + old_type = attention_scores.dtype + max_attention_scores = attention_scores.astype("float32").max(axis=-1, keepdim=True)[0] + max_attention_scores = max_attention_scores.astype(old_type) + else: + max_attention_scores = attention_scores.max(axis=-1, keepdim=True)[0] + + attention_scores -= max_attention_scores + attention_scores *= self.attention_scale + + attention_scores = attention_scores + (-65504.0) * (1.0 - ltor_mask) + attention_probs = F.softmax(attention_scores, axis=-1) + + if "local_seed" in get_rng_state_tracker().states_: + with get_rng_state_tracker().rng_state("local_seed"): + attention_probs = self.attention_dropout(attention_probs) + else: + attention_probs = self.attention_dropout(attention_probs) + + # [bs, num_head, seq_len, seq_len(+cache_len)] * [bs, num_head, seq_len(+cache_len), head_dim] + # [bs, num_head, seq_len, head_dim] + context_layer = paddle.matmul(attention_probs, v_layer) + # [bs, seq_len, num_head, head_dim] + context_layer = context_layer.transpose([0, 2, 1, 3]) + # [bs, seq_len, num_head * head_dim] + new_context_shape = context_layer.shape[:-2] + [self.num_attention_heads * self.attention_head_size] + context_layer = context_layer.reshape(new_context_shape) + output = self.dense(context_layer) + + if "global_seed" in get_rng_state_tracker().states_: + with get_rng_state_tracker().rng_state("global_seed"): + output = self.output_dropout(output) + else: + output = self.output_dropout(output) + + return output + + +class GLMBlock(nn.Layer): + """ + The Transformer layer. 
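+
+    Each block applies pre-LayerNorm self-attention followed by a pre-LayerNorm MLP,
+    with a residual connection around each of the two sub-layers (see `forward` below).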
+ """ + + def __init__(self, config: GLMConfig): + super(GLMBlock, self).__init__() + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + self.attention = GLMAttention(config) + + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + self.mlp = GPT2MLP(config) + + def forward(self, hidden_states: Tensor, ltor_mask: Tensor, cache: Tensor = None): + layernorm_output = self.input_layernorm(hidden_states) + # Layer norm before transformer layer + cache = self.input_layernorm(cache) if cache is not None else None + # Self attention + attention_output = self.attention(layernorm_output, ltor_mask, cache) + # Residual connection + layernorm_input = hidden_states + attention_output + # Layernorm after attention + layernorm_output = self.post_attention_layernorm(layernorm_input) + # MLP + mlp_output = self.mlp(layernorm_output) + # Second residual connection + output = layernorm_input + mlp_output + return output + + +class GPT2MLP(nn.Layer): + """ + MLP takes the input with an h hidden state, project it to 4*h hidden + dimension, perform gelu transformation, and project the state back + into h hidden dimension. At the end, dropout is also applied. + """ + + def __init__(self, config: GLMConfig): + super(GPT2MLP, self).__init__() + if config.tensor_parallel_degree > 1: + self.dense_h_to_4h = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, config.hidden_size * 4, has_bias=True, gather_output=False + ) + self.dense_4h_to_h = fleet.meta_parallel.RowParallelLinear( + config.hidden_size * 4, config.hidden_size, input_is_parallel=True, has_bias=True + ) + else: + self.dense_h_to_4h = nn.Linear(config.hidden_size, config.hidden_size * 4) + self.dense_4h_to_h = nn.Linear(config.hidden_size * 4, config.hidden_size) + self.dropout = nn.Dropout(config.output_dropout_prob) + + def forward(self, hidden_states): + # [batch_size, sequence_length, 4h / number of partitions] + intermediate_parallel = self.dense_h_to_4h(hidden_states) + intermediate_parallel = F.gelu(intermediate_parallel, approximate=True) + + # [batch_size, sequence_length, h] + output = self.dense_4h_to_h(intermediate_parallel) + + if "global_seed" in get_rng_state_tracker().states_: + with get_rng_state_tracker().rng_state("global_seed"): + output = self.dropout(output) + else: + output = self.dropout(output) + + return output + + +class GLMStack(nn.Layer): + """ + GLM Transformer + """ + + def __init__(self, config: GLMConfig): + super(GLMStack, self).__init__() + self.hidden_size = config.hidden_size + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.checkpoint_num_layers = config.checkpoint_num_layers + + self.embedding_dropout = nn.Dropout(config.embedding_dropout_prob) + self.block_position_encoding = config.block_position_encoding + + if self.block_position_encoding: + self.position_embeddings = nn.Embedding( + config.max_sequence_length + 1, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0, std=config.initializer_range)), + ) + self.block_position_embeddings = nn.Embedding( + config.max_sequence_length + 1, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0, std=config.initializer_range)), + ) + else: + self.position_embeddings = nn.Embedding( + config.max_sequence_length, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0, std=config.initializer_range)), + ) + 
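+        # Stack of config.num_layers identical GLMBlock layers; a final LayerNorm is
+        # applied after the stack (see self.final_layernorm below).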
+ self.layers = nn.LayerList() + for _ in range(config.num_layers): + self.layers.append(GLMBlock(config)) + + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layernorm_epsilon) + + @paddle.jit.not_to_static + def recompute_training(self, layer_module: nn.Layer, hidden_states: Tensor, ltor_mask: Tensor, cache: Tensor): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute(create_custom_forward(layer_module), hidden_states, ltor_mask, cache) + return hidden_states + + def forward( + self, + hidden_states: Tensor, + position_ids: Tensor, + attention_mask: Tensor, + cache: Optional[Tensor] = None, + return_dict: bool = False, + ): + batch_size, query_length = hidden_states.shape[:2] + memory_length = cache[0].shape[1] if cache is not None else 0 + + if attention_mask.dim == 1: + is_scalar = bool(paddle.numel(attention_mask) == 1) + scalar_sep = attention_mask[0] if is_scalar else attention_mask + + # attention mask is the beginning postion of B region in [0, query_len) + def build_mask_matrix(seq_length, sep, memory_length=0): + mask = paddle.ones([1, seq_length, seq_length]) + mask = paddle.tril(mask) + if is_scalar: + mask[0, :, : int(sep)] = 1 + else: + mask = mask.expand([batch_size, -1, -1]) + ids = paddle.arange(seq_length, dtype=sep.dtype).unsqueeze(0) + m = (ids < sep.reshape([-1, 1])).astype("float32") + m = m.unsqueeze(1).expand_as(mask).astype("bool") + y = paddle.full(mask.shape, 1, mask.dtype) + mask = paddle.where(m, y, mask) + if memory_length > 0: + mask = mask.expand([batch_size, -1, -1]) + mask = paddle.concat([paddle.ones([batch_size, seq_length, memory_length]), mask], axis=2) + mask = mask.unsqueeze(1) + return mask + + attention_mask = build_mask_matrix(query_length, scalar_sep, memory_length=memory_length) + elif attention_mask.dim == 2 or attention_mask.dim == 4: + if attention_mask.dim() == 2: + attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) + attention_mask = attention_mask[:, :, :, -query_length - memory_length :] + + if self.block_position_encoding: + position_ids, block_position_ids = position_ids[:, 0], position_ids[:, 1] + position_embeddings = self.position_embeddings(position_ids) + + hidden_states = hidden_states + position_embeddings + + if self.block_position_encoding: + block_position_embeddings = self.block_position_embeddings(block_position_ids) + hidden_states = hidden_states + block_position_embeddings + + if "local_seed" in get_rng_state_tracker().states_: + with get_rng_state_tracker().rng_state("local_seed"): + hidden_states = self.embedding_dropout(hidden_states) + else: + hidden_states = self.embedding_dropout(hidden_states) + + all_hidden_states = [hidden_states.detach()] + for i, layer in enumerate(self.layers): + mem_i = cache[i] if cache is not None else None + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and has_gradient: + # TODO Should the attention_mask be added, it seems missing in original application. 
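+                # Activation recompute (gradient checkpointing): this layer's intermediate
+                # activations are not kept from the forward pass; they are recomputed during
+                # backward, trading extra compute for lower peak memory.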
+ hidden_states = self.recompute_training(layer, hidden_states, attention_mask, cache=mem_i) + else: + hidden_states = layer(hidden_states, attention_mask, cache=mem_i) + + if isinstance(hidden_states, tuple): + hidden_states = hidden_states[0] + + all_hidden_states.append(hidden_states.detach()) + + output = self.final_layernorm(hidden_states) + new_caches = self.update_memories(all_hidden_states, cache) + + if not return_dict: + return (output, new_caches) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=output, + past_key_values=new_caches, + hidden_states=all_hidden_states, + ) + + def update_memories(self, hiddens, cache): + memory_length = cache[0].shape[1] if cache else 0 + query_length = hiddens[0].shape[1] + new_memory_length = memory_length + query_length + + new_memories = cache if cache is not None else [] + for i in range(len(hiddens)): + if cache is None: + new_memories.append((hiddens[i][-new_memory_length:])) + else: + new_memories[i] = paddle.concat([cache[i][:, -memory_length:], hiddens[i]], axis=1) + return new_memories + + +class GLMPretrainedModel(PretrainedModel): + """ + An abstarct class for pretrained GLM models. It provides GLM related + `model_config_file`, `resource_file_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + base_model_prefix = "glm" + config_class = GLMConfig + model_config_file = CONFIG_NAME + resource_files_names = {"model_state": "model_state.pdparams"} + pretrained_init_configuration = GLM_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = GLM_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + base_actions = { + # Column Linear + "transformer.layers.0.mlp.dense_h_to_4h.bias": partial(fn, is_column=True), + "transformer.layers.0.mlp.dense_h_to_4h.weight": partial(fn, is_column=True), + "transformer.layers.0.attention.query_key_value.bias": partial(fn, is_column=True, is_old_qkv=True), + "transformer.layers.0.attention.query_key_value.weight": partial(fn, is_column=True, is_old_qkv=True), + # Row Linear + "word_embeddings.weight": partial(fn, is_column=False), + # 'transformer.layers.0.attention.dense.bias', + "transformer.layers.0.attention.dense.weight": partial(fn, is_column=False), + # 'transformer.layers.0.mlp.dense_4h_to_h.bias', + "transformer.layers.0.mlp.dense_4h_to_h.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_name_mappings(cls, config): + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + "word_embeddings.weight", + "transformer.position_embeddings.weight", + "transformer.block_position_embeddings.weight", + "transformer.final_layernorm.weight", + "transformer.final_layernorm.bias", + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [] + transpose_names = [ + "attention.query_key_value.weight", + "attention.dense.weight", + "mlp.dense_h_to_4h.weight", + "mlp.dense_4h_to_h.weight", + ] + mapping_names = [ + "attention.query_key_value.bias", + "input_layernorm.weight", + "input_layernorm.bias", + "attention.dense.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "mlp.dense_h_to_4h.bias", + "mlp.dense_4h_to_h.bias", + ] + for name in mapping_names: + layer_mappings.append( + [f"transformer.layers.{layer_index}.{name}", f"transformer.layers.{layer_index}.{name}"] + ) + for name in transpose_names: + layer_mappings.append( + [ + f"transformer.layers.{layer_index}.{name}", + f"transformer.layers.{layer_index}.{name}", + "transpose", + ] + ) + + model_mappings.extend(layer_mappings) + init_name_mappings(model_mappings) + + import numpy as np + + from paddlenlp.transformers.conversion_utils import ( + naive_merged_qkv_to_tensor_parallel_qkv, + split_tensor_parallel_weight, + ) + + def fn(x, is_column=True, transpose=False, is_old_qkv=False): + if transpose: + x = np.transpose(x, [1, 0]) + if is_old_qkv: + assert is_column, "QKV vectors should be column parallel linear." + x = naive_merged_qkv_to_tensor_parallel_qkv(x, config.num_attention_heads) + return split_tensor_parallel_weight( + x, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + is_column=is_column, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + base_actions = { + # Column Linear + "transformer.layers.0.mlp.dense_h_to_4h.bias": partial( + fn, is_column=True, transpose=False, is_old_qkv=False + ), + "transformer.layers.0.mlp.dense_h_to_4h.weight": partial( + fn, is_column=True, transpose=True, is_old_qkv=False + ), + "transformer.layers.0.attention.query_key_value.bias": partial( + fn, is_column=True, transpose=False, is_old_qkv=True + ), + "transformer.layers.0.attention.query_key_value.weight": partial( + fn, is_column=True, transpose=True, is_old_qkv=True + ), + # Row Linear + "word_embeddings.weight": partial(fn, is_column=False, transpose=False, is_old_qkv=False), + # 'transformer.layers.0.attention.dense.bias', + "transformer.layers.0.attention.dense.weight": partial( + fn, is_column=False, transpose=True, is_old_qkv=False + ), + # 'transformer.layers.0.mlp.dense_4h_to_h.bias', + "transformer.layers.0.mlp.dense_4h_to_h.weight": partial( + fn, is_column=False, transpose=True, is_old_qkv=False + ), + } + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + if config.tensor_parallel_degree > 1: + tp_split_mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + for mapping in model_mappings: + if mapping[1] in tp_split_mappings: + if len(mapping) == 3: + mapping[2] = tp_split_mappings[mapping[1]] + else: + mapping.append(tp_split_mappings[mapping[1]]) + + if cls.__name__ != "GLMModel": + for mapping in model_mappings: + mapping[1] = "glm." + mapping[1] + + mappings = [StateDictNameMapping(*mapping) for mapping in model_mappings] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, nn.Linear): + std = self.config.initializer_range + # TODO: initialization for glm-515m + # if self.config.use_scaled_init_for_output_weights and _is_output_dense(layer): + # std = self.config.initializer_range / math.sqrt(2.0 * self.config.num_layers) + normal_(layer.weight, mean=0.0, std=std) + if layer.bias is not None: + zeros_(layer.bias) + elif isinstance(layer, nn.Embedding): + normal_(layer.weight, mean=0.0, std=self.config.initializer_range) + elif isinstance(layer, nn.LayerNorm): + ones_(layer.weight) + zeros_(layer.bias) + + +def parallel_matmul(lm_output, logit_weights, parallel_output): + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + world_size = hcg.get_model_parallel_world_size() + # rank = hcg.get_model_parallel_rank() + + if world_size > 1: + # _c_identity is backwards is reduce + input_parallel = paddle.distributed.collective._c_identity(lm_output, group=model_parallel_group) + + logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True) + + if parallel_output: + return logits + + # _c_concat has not grad backwards + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + else: + logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) + return logits + + +@register_base_model +class GLMModel(GLMPretrainedModel): + r""" + The GLM Model transformer can behave as an encoder (with only self-attention) as well as a decoder, where + a layer of cross-attention is added between the self-attention layers, following the architecture + described in [Attention is all you need](https://arxiv.org/abs/1706.03762). + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ """ + + def __init__(self, config: GLMConfig): + super(GLMModel, self).__init__(config) + self.config = config + self.output_predict = config.output_predict + if self.config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + + self.transformer = GLMStack(config) + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, value): + self.word_embeddings = value + + def forward( + self, + input_ids: Tensor = None, + position_ids: Tensor = None, + attention_mask: Tensor = None, + cache: Tensor = None, + return_dict: bool = True, + ): + batch_size = input_ids.shape[0] + word_embeddings = self.word_embeddings(input_ids) + input_shape = input_ids.shape + + if position_ids is None: + position_ids = paddle.arange(0, input_shape[-1], dtype="int64") + block_position_ids = paddle.zeros(input_shape[-1:], dtype="int64") + position_ids = paddle.stack([position_ids, block_position_ids], axis=0).unsqueeze(0) + + if attention_mask is None: + attention_mask = paddle.zeros([batch_size]) + + outputs = self.transformer(word_embeddings, position_ids, attention_mask, cache, return_dict) + + if self.output_predict: + if return_dict: + hidden_states = outputs.last_hidden_state + else: + hidden_states = outputs[0] if isinstance(outputs, tuple) else outputs + + if self.config.tensor_parallel_degree > 1: + # FIXME: @ZHUI fix for jit_to_static + logits = parallel_matmul( + hidden_states, self.word_embeddings.weight, self.config.tensor_parallel_output + ) + else: + logits = F.linear(hidden_states, self.word_embeddings.weight.T) + + if not return_dict: + outputs = (logits,) + outputs[1:] + + return outputs + + return CausalLMOutputWithCrossAttentions( + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + ) + else: + return outputs + + +class GLMForMultipleChoice(GLMPretrainedModel): + """ + GLM Model transformer for multiple choice classification + """ + + def __init__(self, config: GLMConfig): + super(GLMForMultipleChoice, self).__init__(config) + # GLMForMultipleChoice need loggit + if not config.output_predict: + logger.warning("GLMForMultipleChoice need loggit, please set config.output_predict to True.") + config.output_predict = True + + self.glm = GLMModel(config) + + def forward( + self, + input_ids: Tensor = None, + position_ids: Tensor = None, + attention_mask: Tensor = None, + choice_ids: Tensor = None, + choice_indices: Tensor = None, + labels: Tensor = None, + return_dict: bool = None, + ): + model_output = self.glm(input_ids, position_ids, attention_mask, return_dict=return_dict) + lm_logits = model_output.logits if return_dict else model_output + # [bs, seq_len, vocab] + lm_logits = lm_logits[0] if isinstance(lm_logits, tuple) else lm_logits + log_probs = [] + for output, choices, choice_index in zip(F.log_softmax(lm_logits, axis=-1), choice_ids, choice_indices): + log_probs_single = [] + for choice, choice_target_id in zip(choices, choice_index): + log_prob = output[choice_target_id, choice].sum() + if len(log_prob.shape) == 0: + log_prob = log_prob.unsqueeze(0) + log_probs_single.append(log_prob) + log_probs.append(paddle.stack(log_probs_single)) + log_probs = 
paddle.stack(log_probs).squeeze(2) + loss = None + if labels is not None: + if self.glm.config.tensor_parallel_degree > 1: + assert ( + self.glm.config.tensor_parallel_output is False + ), "GLMForMultipleChoice not avaliable for tensor_parallel_output!" + + loss = F.cross_entropy(log_probs, labels) + + if not return_dict: + output = (log_probs, lm_logits) + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=log_probs, + hidden_states=lm_logits, + ) + + +class GLMForConditionalGeneration(GLMPretrainedModel): + """ + GLM Model transformer with a `language modeling` head on top. + """ + + def __init__(self, config: GLMConfig): + super(GLMForConditionalGeneration, self).__init__(config) + # GLMForConditionalGeneration need loggit + if not config.output_predict: + logger.warning("GLMForConditionalGeneration need loggit, please set config.output_predict to True.") + config.output_predict = True + + self.glm = GLMModel(config) + + def _reorder_cache(self, cache, beam_index): + # Speedy decoding is disabled and no reorder is needed if decoder cache is not given. + if cache is None: + return None + + reordered_decoder_cache = () + for layer_cache_states in cache: + # Get correct batch index from layer cache batch dimension + reordered_decoder_cache = reordered_decoder_cache + (layer_cache_states.index_select(0, beam_index),) + return reordered_decoder_cache + + def prepare_inputs_for_generation( + self, + input_ids: Tensor, + position_ids: Tensor = None, + attention_mask: Tensor = None, + cache: Tensor = None, + **kwargs + ): + attention_mask_gen = attention_mask + seq_length = input_ids.shape[1] + if cache: + if position_ids is not None: + position_ids = position_ids[:, :, seq_length - 1].unsqueeze(-1) + if attention_mask is not None: + attention_mask_gen = attention_mask[:, :, seq_length - 1, :seq_length].unsqueeze(-2) + input_ids = input_ids[:, -1].unsqueeze(-1) + else: + if position_ids is not None: + position_ids = position_ids[:, :, :seq_length] + if attention_mask is not None: + attention_mask_gen = attention_mask[:, :, :seq_length, :seq_length] + return { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask_gen, + "cache": cache, + "use_cache": True, + } + + def forward( + self, + input_ids: Tensor = None, + position_ids: Tensor = None, + attention_mask: Tensor = None, + labels: Tensor = None, + cache: Tensor = None, + return_dict: bool = None, + loss_mask: Tensor = None, + use_cache=True, + ): + model_output = self.glm(input_ids, position_ids, attention_mask, cache=cache, return_dict=return_dict) + if return_dict: + lm_logits, cache = model_output.logits, model_output.past_key_values + else: + lm_logits, cache = model_output + # lm_logits [bs, seq_length, vocab_size] + loss = None + if labels is not None: + # Since ParallelCrossEntropy not support -100 ingore index. 
+            # we use pad_token_id
+            if self.glm.config.tensor_parallel_degree > 1 and self.glm.config.tensor_parallel_output:
+                self.parallel_loss_func = fleet.meta_parallel.ParallelCrossEntropy()
+                loss = self.parallel_loss_func(lm_logits, labels)
+            else:
+                loss = F.cross_entropy(
+                    lm_logits.reshape([-1, lm_logits.shape[-1]]), labels.reshape([-1]), reduction="none"
+                )
+                label_smoothing = getattr(self.config, "label_smoothing", 0)
+                if label_smoothing > 0:
+                    smooth_loss = (-F.log_softmax(lm_logits, axis=-1) / lm_logits.shape[2]).sum(axis=-1)
+                    loss = (1 - label_smoothing) * loss + label_smoothing * smooth_loss
+                if loss_mask is not None:
+                    loss_mask = loss_mask.reshape([-1])
+                    loss = paddle.sum(loss.reshape([-1]) * loss_mask) / paddle.sum(loss_mask)
+
+        if not return_dict:
+            output = (lm_logits, cache)
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=cache)
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/tokenizer.py
new file mode 100644
index 000000000..6f535ac26
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/glm/tokenizer.py
@@ -0,0 +1,501 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+
+import numpy as np
+import paddle
+import sentencepiece as spm
+from scipy.linalg import block_diag
+
+from ...utils.log import logger
+from .. import BertTokenizer, GPTTokenizer
+from ..tokenizer_utils import PretrainedTokenizer
+from ..tokenizer_utils_base import BatchEncoding
+
+
+class GLMTokenizerMixin:
+    """
+    BOS and EOS tokens are used for autoregressive blank filling.
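+    `<|startofpiece|>` (sop) marks the start of a generated span and `<|endofpiece|>`
+    (eop) marks its end; `[MASK]`, `[sMASK]` and `[gMASK]` mark the blanks to be filled.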
+ """ + + @property + def sop_token(self) -> Optional[str]: + return "<|startofpiece|>" + + @property + def sop_token_id(self) -> Optional[int]: + return self.convert_tokens_to_ids(self.sop_token) + + @property + def eop_token(self) -> Optional[str]: + return "<|endofpiece|>" + + @property + def eop_token_id(self) -> Optional[int]: + return self.convert_tokens_to_ids(self.eop_token) + + @property + def gmask_token_id(self) -> int: + return self.convert_tokens_to_ids("[gMASK]") + + @property + def smask_token_id(self) -> int: + return self.convert_tokens_to_ids("[sMASK]") + + @property + def mask_token_ids(self): + return [self.mask_token_id, self.smask_token_id, self.gmask_token_id] + + def _build_input_for_multiple_choice(self, context, choices): + context_id = context["input_ids"] + if isinstance(context_id, paddle.Tensor): + context_id = context_id.tolist() + + division = len(context_id) + mask_position = context_id.index(self.mask_token_id) + token = np.array(context_id, dtype="int64") + attention_mask = [context["attention_mask"].repeat(division, axis=0)] + position_id = np.arange(division, dtype="int64") + block_position_id = np.zeros([division], dtype="int64") + + choice_ids, choice_indices = [], [] + + for choice_str in choices: + choice = np.array( + self(choice_str, add_special_tokens=False, padding=False)["input_ids"], + dtype="int64", + ) + choice_ids.append(choice) + + choice_indices.append(np.arange(len(token), len(token) + len(choice), dtype="int64")) + attention_mask.append(np.tril(np.ones([len(choice), len(choice)], dtype="int64"))) + + token = np.concatenate([token, np.array([self.sop_token_id], dtype="int64"), choice[:-1]]) + position_id = np.concatenate([position_id, np.array([mask_position] * len(choice), dtype="int64")]) + block_position_id = np.concatenate([block_position_id, np.arange(1, len(choice) + 1, dtype="int64")]) + + attention_mask = np.array(block_diag(*[x.tolist() for x in attention_mask])) + attention_mask[division:, :division] = context["attention_mask"][None, :] + + return { + "input_ids": token, + "position_ids": np.stack([position_id, block_position_id]), + "attention_mask": attention_mask, + "choice_ids": choice_ids, + "choice_indices": choice_indices, + } + + def _pad_batch(self, tokens, position_ids, attention_mask, max_seq_length): + pad_length = max_seq_length - len(tokens) + attention_mask = np.pad(attention_mask, [0, pad_length, 0, pad_length], mode="constant", constant_values=0) + tokens = np.concatenate([tokens, np.zeros([pad_length], dtype="int64")]) + if pad_length > 0: + position_ids = np.concatenate([position_ids, position_ids[..., -1:].repeat(pad_length, axis=1)], axis=-1) + return tokens, position_ids, attention_mask + + def _collate(self, samples): + TILE = 1 + length_to_pad = (max(map(lambda spl: len(spl["input_ids"]), samples)) + TILE - 1) // TILE * TILE + + token_batch, position_id_batch, attention_mask_batch = [], [], [] + choices_batch, choice_target_ids_batch = [], [] + + for sample in samples: + token, position_id, attention_mask = self._pad_batch( + sample["input_ids"], sample["position_ids"], sample["attention_mask"], length_to_pad + ) + token_batch.append(token) + position_id_batch.append(position_id) + attention_mask_batch.append(attention_mask) + choices_batch.append(sample["choice_ids"]) + choice_target_ids_batch.append(sample["choice_indices"]) + return BatchEncoding( + { + "input_ids": np.stack(token_batch), + "position_ids": np.stack(position_id_batch), + "attention_mask": np.stack(attention_mask_batch).unsqueeze(1), + 
"choice_ids": choices_batch, + "choice_indices": choice_target_ids_batch, + } + ) + + def build_inputs_for_multiple_choice(self, model_input: BatchEncoding, choices, max_length=None): + samples = [{key: value[i] for key, value in model_input.items()} for i in range(len(model_input["input_ids"]))] + samples = [self._build_input_for_multiple_choice(sample, choice) for sample, choice in zip(samples, choices)] + inputs = self._collate(samples) + return BatchEncoding(inputs) + + def build_inputs_for_generation( + self, + model_input: BatchEncoding, + max_gen_length=512, + targets=None, + padding=False, + is_train=False, + ): + mask_ids = self.mask_token_ids + input_ids = model_input.input_ids + batch_size, seq_length = input_ids.shape[:2] + position_id, block_position_id = list(range(seq_length)), [0 for _ in range(seq_length)] + position_ids, block_position_ids = [], [] + labels = None + loss_mask = None + if targets is not None: + is_batched = isinstance(targets, (list, tuple)) + targets = self( + targets, + add_special_tokens=False, + padding=False, + max_length=max_gen_length - 2, + truncation=True, + truncation_side="right", + ).input_ids + if not is_batched: + targets = [targets] + assert len(targets) == len(input_ids) + targets = [(target + [self.eop_token_id])[:max_gen_length] for target in targets] + if not padding: + max_gen_length = max(map(len, targets)) + targets = [[self.sop_token_id] + target for target in targets] + labels = [target[1:] for target in targets] + targets = [target + [self.pad_token_id] * (max_gen_length + 1 - len(target)) for target in targets] + labels = [label + [self.pad_token_id] * (max_gen_length - len(label)) for label in labels] + targets = np.array(targets, dtype="int64") + loss_mask = np.logical_and(targets != self.pad_token_id, targets != self.eop_token_id).astype("int64") + labels = np.array(labels, dtype="int64") + labels = np.concatenate([np.zeros([batch_size, seq_length], dtype="int64"), labels], axis=1) + + for i in range(batch_size): + mask_positions = [] + for mask_id in mask_ids: + mask_positions += np.nonzero(input_ids[i] == mask_id)[0].tolist() + if not mask_positions: + raise ValueError("Cannot find mask token in the input.") + mask_positions.sort() + mask_pos = mask_positions[0] + position_ids.append(position_id + [mask_pos] * max_gen_length) + block_position_ids.append(block_position_id + list(range(1, max_gen_length + 1))) + position_ids = np.array(position_ids, dtype="int64") + block_position_ids = np.array(block_position_ids, dtype="int64") + position_ids = np.stack([position_ids, block_position_ids], axis=1) + + attention_mask = model_input.attention_mask + attention_mask = attention_mask[:, None, :].repeat(seq_length + max_gen_length, axis=1) + generation_attention_mask = np.concatenate( + [ + np.zeros([seq_length, max_gen_length], dtype=attention_mask.dtype), + np.tril(np.ones([max_gen_length, max_gen_length], dtype=attention_mask.dtype)), + ], + axis=0, + )[None, :, :].repeat(batch_size, axis=0) + attention_mask = np.concatenate([attention_mask, generation_attention_mask], axis=2)[:, None, :, :] + + if targets is None: + input_ids = np.concatenate( + [input_ids, np.full([batch_size, 1], self.sop_token_id, dtype=input_ids.dtype)], axis=-1 + ) + else: + loss_mask = np.concatenate([np.zeros_like(input_ids), loss_mask], axis=1) + input_ids = np.concatenate([input_ids, targets[:, :-1]], axis=1) + loss_mask = loss_mask[:, : len(input_ids[0])] + + batch = {"input_ids": input_ids, "position_ids": position_ids} + if labels is None: + 
batch["attention_mask"] = attention_mask + else: + batch["attention_mask"] = attention_mask + batch["loss_mask"] = loss_mask + batch["label_ids"] = labels + return BatchEncoding(batch, tensor_type="np") + + +class GLMChineseTokenizer(PretrainedTokenizer, GLMTokenizerMixin): + model_input_names = ["input_ids", "position_ids", "attention_mask"] + resource_files_names = {"model_file": "cog-pretrain.model"} + truncation_side: str = "left" + pretrained_init_configuration = { + "THUDM/glm-large-chinese": {"do_lower_case": True}, + "THUDM/glm-10b-chinese": {"do_lower_case": True}, + } + cog_model_link = "https://paddlenlp.bj.bcebos.com/models/community/THUDM/cog-pretrain.model" + pretrained_resource_files_map = { + "model_file": { + "THUDM/glm-large-chinese": cog_model_link, + "THUDM/glm-10b-chinese": cog_model_link, + }, + } + max_model_input_sizes = {"THUDM/glm-10b-chinese": 1024, "THUDM/glm-large-chinese": 1024} + + def __init__( + self, + model_file, + cls_token="[CLS]", + sep_token="[SEP]", + unk_token="[UNK]", + mask_token="[MASK]", + pad_token="<|endoftext|>", + eos_token="<|endoftext|>", + additional_special_tokens=None, + **kwargs + ): + if additional_special_tokens is None: + additional_special_tokens = [ + "[UNUSED1]", + "[UNUSED2]", + "<|startofpiece|>", + "<|endofpiece|>", + "[sMASK]", + "[gMASK]", + ] + super().__init__( + cls_token=cls_token, + sep_token=sep_token, + unk_token=unk_token, + mask_token=mask_token, + pad_token=pad_token, + eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + self._model_file = model_file + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(model_file) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.sp_model.Encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + return self.sp_model.Decode(tokens) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A BERT sequence has the following format: + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is not None: + logger.warning("Support single input text and the second one is ignored.") + cls = [self.cls_token_id] + eos = [self.eos_token_id] + return cls + token_ids_0 + eos + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens: bool = False): + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.eos_token_id, self.cls_token_id] else 0, token_ids_0)) + if token_ids_1 is not None: + logger.warning("Support single input text and the second one is ignored.") + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + if token_ids_1 is not None: + logger.warning("Support single input text and the second one is ignored.") + return len([self.cls_token_id] + token_ids_0 + [self.eos_token_id]) * [0] + + +class GLMGPT2Tokenizer(GPTTokenizer, GLMTokenizerMixin): + model_input_names = ["input_ids", "position_ids", "attention_mask"] + truncation_side: str = "left" + pretrained_init_configuration = { + "THUDM/glm-2b": {}, + "THUDM/glm-10b": {}, + } + added_tokens_link = "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-added-tokens.json" + pretrained_resource_files_map = { + "vocab_file": { + "THUDM/glm-2b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-2b-vocab.json", + "THUDM/glm-10b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-10b-vocab.json", + }, + "merges_file": { + "THUDM/glm-2b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-2b-merges.txt", + "THUDM/glm-10b": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-10b-merges.txt", + }, + "added_tokens_file": { + "THUDM/glm-2b": added_tokens_link, + "THUDM/glm-10b": added_tokens_link, + }, + } + max_model_input_sizes = { + "THUDM/glm-2b": 1024, + "THUDM/glm-10b": 1024, + } + + def __init__( + self, + vocab_file, + merges_file, + cls_token="[CLS]", + sep_token="[SEP]", + unk_token="[UNK]", + mask_token="[MASK]", + pad_token="<|endoftext|>", + eos_token="<|endoftext|>", + **kwargs + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + cls_token=cls_token, + sep_token=sep_token, + pad_token=pad_token, + eos_token=eos_token, + unk_token=unk_token, + mask_token=mask_token, + **kwargs, + ) + + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + if token_ids_1 is not None: + logger.warning("Support single input text and the second one is ignored.") + cls = [self.cls_token_id] + eos = [self.eos_token_id] + return cls + token_ids_0 + eos + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens: bool 
= False): + if already_has_special_tokens: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.eos_token_id, self.cls_token_id] else 0, token_ids_0)) + if token_ids_1 is not None: + logger.warning("Support single input text and the second one is ignored.") + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + if token_ids_1 is not None: + logger.warning("Support single input text and the second one is ignored.") + return len([self.cls_token_id] + token_ids_0 + [self.eos_token_id]) * [0] + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + +class GLMBertTokenizer(BertTokenizer, GLMTokenizerMixin): + model_input_names = ["input_ids", "position_ids", "attention_mask"] + truncation_side: str = "left" + pretrained_init_configuration = { + "THUDM/glm-515m": {"do_lower_case": True}, + } + pretrained_resource_files_map = { + "vocab_file": { + "THUDM/glm-515m": "https://paddlenlp.bj.bcebos.com/models/community/THUDM/glm-515m-vocab.txt", + }, + } + max_model_input_sizes = { + "THUDM/glm-515m": 512, + } + + +class GLMTokenizer: + """ + GLMTokenizer is a generic tokenizer class that will be instantiated as GLMChineseTokenizer, + GLMGPT2Tokenizer or GLMBertTokenizer when created with GLMTokenizer.from_pretrained() class method. + """ + + bert_model_names = GLMBertTokenizer.pretrained_init_configuration.keys() + chinese_model_names = GLMChineseTokenizer.pretrained_init_configuration.keys() + gpt2_model_names = GLMGPT2Tokenizer.pretrained_init_configuration.keys() + tokenizer_config_file = "tokenizer_config.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + # From built-in pretrained models + if pretrained_model_name_or_path in cls.bert_model_names: + return GLMBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif pretrained_model_name_or_path in cls.chinese_model_names: + return GLMChineseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif pretrained_model_name_or_path in cls.gpt2_model_names: + return GLMGPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + # From local dir path + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, cls.tokenizer_config_file) + with open(config_file, "r", encoding="utf-8") as fp: + tokenizer_config = json.load(fp) + config_tokenizer_class = tokenizer_config.get("tokenizer_class") + if config_tokenizer_class == "GLMChineseTokenizer": + tokenizer_class = GLMChineseTokenizer + elif config_tokenizer_class == "GLMGPT2Tokenizer": + tokenizer_class = GLMGPT2Tokenizer + elif config_tokenizer_class == "GLMBertTokenizer": + tokenizer_class = GLMBertTokenizer + else: + raise NotImplementedError("Not implemented tokenizer type:", config_tokenizer_class) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + # TODO: Assuming from community-contributed 
pretrained models diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/__init__.py new file mode 100644 index 000000000..564ae17b1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import * +from .modeling import * +from .modeling_auto import * +from .modeling_pp import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/configuration.py new file mode 100644 index 000000000..ad4730a48 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/configuration.py @@ -0,0 +1,303 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
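+# This module defines GPTConfig together with the preset configurations in
+# GPT_PRETRAINED_INIT_CONFIGURATION and the pretrained weight locations in
+# GPT_PRETRAINED_RESOURCE_FILES_MAP. A minimal usage sketch (mirroring the example in the
+# GPTConfig docstring further below; the import path is assumed from the package layout):
+#
+#     from paddlenlp.transformers import GPTConfig, GPTModel
+#     configuration = GPTConfig()       # defaults defined in GPTConfig.__init__ below
+#     model = GPTModel(configuration)   # bare GPT transformer built from this config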
+""" GPT model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["GPT_PRETRAINED_INIT_CONFIGURATION", "GPTConfig", "GPT_PRETRAINED_RESOURCE_FILES_MAP"] + +GPT_PRETRAINED_INIT_CONFIGURATION = { + "gpt-cpm-large-cn": { # 2.6B + "vocab_size": 30000, + "hidden_size": 2560, + "num_hidden_layers": 32, + "num_attention_heads": 32, + "intermediate_size": 10240, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "pad_token_id": 0, + "eos_token_id": 7, + "bos_token_id": 0, + "eol_token_id": 3, + }, + "gpt-cpm-small-cn-distill": { # 109M + "vocab_size": 30000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "pad_token_id": 0, + "eos_token_id": 7, + "bos_token_id": 0, + "eol_token_id": 3, + }, + "gpt3-89B-en": { # 89B + "vocab_size": 51200, + "hidden_size": 12288, + "num_hidden_layers": 48, + "num_attention_heads": 96, + "intermediate_size": 49152, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "eos_token_id": 50256, + "eol_token_id": 198, + }, + "gpt3-175B-en": { # 175B + "vocab_size": 51200, + "hidden_size": 12288, + "num_hidden_layers": 96, + "num_attention_heads": 96, + "intermediate_size": 49152, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "eos_token_id": 50256, + "eol_token_id": 198, + }, + "gpt3-13B-en": { # 13B + "vocab_size": 50304, + "hidden_size": 5120, + "num_hidden_layers": 40, + "num_attention_heads": 40, + "intermediate_size": 20480, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "eos_token_id": 50256, + "eol_token_id": 198, + }, + "gpt3-6.7B-en": { # 6.7B + "vocab_size": 50304, + "hidden_size": 4096, + "num_hidden_layers": 32, + "num_attention_heads": 32, + "intermediate_size": 16384, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 16, # no use + "initializer_range": 0.02, + "eos_token_id": 50256, + "eol_token_id": 198, + }, + "gpt3-1.3B-en": { # 1.3B + "vocab_size": 50304, + "hidden_size": 2048, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 8192, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + "initializer_range": 0.02, + "eos_token_id": 50256, + "eol_token_id": 198, + }, + "gpt2-small-en": { # config for CE + "vocab_size": 50304, + "hidden_size": 1024, + "num_hidden_layers": 4, + "num_attention_heads": 4, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1024, + "type_vocab_size": 1, # no use + 
"initializer_range": 0.02, + "eos_token_id": 50256, + "eol_token_id": 198, + }, +} + +GPT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "gpt-cpm-large-cn": "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-cpm-large-cn.pdparams", + "gpt-cpm-small-cn-distill": "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-cpm-small-cn-distill.pdparams", + "gpt2-en": "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt2-en.pdparams", + "gpt2-medium-en": "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt2-medium-en.pdparams", + "gpt2-large-en": "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt2-large-en.pdparams", + "gpt2-xl-en": "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt2-xl-en.pdparams", + } +} + + +class GPTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GPTModel`] or a [`TFGPTModel`]. It is used to + instantiate a GPT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the GPT + gpt-base-uncased architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the GPT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPTModel`] or [`TFGPTModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`GPTModel`] or [`TFGPTModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. 
For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from paddlenlp.transformers import GPTModel, GPTConfig + + >>> # Initializing a GPT gpt-base-uncased style configuration + >>> configuration = GPTConfig() + + >>> # Initializing a model from the gpt-base-uncased style configuration + >>> model = GPTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "gpt" + attribute_map: Dict[str, str] = { + "num_classes": "num_labels", + "dropout": "classifier_dropout", + "n_positions": "max_position_embeddings", + "n_embd": "hidden_size", + "n_layer": "num_hidden_layers", + "n_head": "num_attention_heads", + "n_inner": "intermediate_size", + "activation_function": "hidden_act", + "resid_pdrop": "attention_probs_dropout_prob", + } + + pretrained_init_configuration = GPT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + seq_length=1024, + vocab_size: int = 50304, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_activation: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 16, + initializer_range: float = 0.02, + layer_norm_eps=1e-5, + pad_token_id: int = 0, + eos_token_id: int = 7, + bos_token_id: int = 0, + eol_token_id: int = 3, + num_partitions: int = 1, + normalize_before: bool = True, + scale_qk_coeff: float = 1.0, + output_attentions: bool = False, + ignore_index: int = 0, + use_fast_layer_norm: bool = False, + fuse_attention_qkv: bool = False, + fuse_attention_ffn: bool = False, + fused_softmax_with_triangular: bool = False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.seq_length = seq_length + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_activation = hidden_activation + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + self.bos_token_id = bos_token_id + self.eol_token_id = eol_token_id + + self.fuse_attention_qkv = fuse_attention_qkv + self.fuse_attention_ffn = fuse_attention_ffn + + self.num_partitions = num_partitions + self.normalize_before = normalize_before + self.scale_qk_coeff = scale_qk_coeff + + self.output_attentions = output_attentions + self.ignore_index = ignore_index + + self.use_fast_layer_norm = use_fast_layer_norm + self.fused_softmax_with_triangular = 
fused_softmax_with_triangular diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling.py new file mode 100644 index 000000000..a618600fc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling.py @@ -0,0 +1,1913 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import collections +import contextlib +import math +from functools import partial + +import numpy as np +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.incubate as incubate +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.tensor as tensor +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass +from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from paddle.utils import try_import + +from ...utils.converter import StateDictNameMapping +from ...utils.log import logger +from .. 
import PretrainedModel, linear_utils, register_base_model +from ..linear_utils import Linear +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ..model_utils import dy2st_nocheck_guard_context +from .configuration import ( + GPT_PRETRAINED_INIT_CONFIGURATION, + GPT_PRETRAINED_RESOURCE_FILES_MAP, + GPTConfig, +) + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None +try: + from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd +except: + FusedDropoutAdd = None + +OriginLayerNorm = paddle.nn.LayerNorm + + +__all__ = [ + "GPTModel", + "GPTPretrainedModel", + "GPTPretrainingCriterion", + "GPTForGreedyGeneration", + "GPTLMHeadModel", + "GPTForTokenClassification", + "GPTForSequenceClassification", + "GPTForCausalLM", + "GPTEmbeddings", + "GPTDecoderLayer", + "GPTLayerNorm", +] + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + if paddle.is_compiled_with_xpu(): + # xpu does not support set constant to -np.inf + mask = paddle.full_like(x, -1e4) + else: + mask = paddle.full_like(x, -np.inf) + mask.stop_gradient = True + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def parallel_matmul(x: paddle.Tensor, y: paddle.Tensor, transpose_y=True, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if is_fleet_init and tensor_parallel_degree > 1 and y.is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=transpose_y) + return logits + + +def seed_guard_context(name=None): + if ( + not isinstance(paddle.base.framework._current_expected_place(), paddle.core.CPUPlace) + and name in get_rng_state_tracker().states_ + ): + # todo fix it + # ValueError: Length of gpu state list should be equal to the gpu device count + # /usr/local/lib/python3.10/dist-packages/paddle/incubate/framework/random.py:119: ValueError + # return contextlib.nullcontext() + return get_rng_state_tracker().rng_state(name) + else: + return contextlib.nullcontext() + + +def fast_layer_norm(input, weight, bias, eps): + fast_ln_lib = try_import("fast_ln") + return fast_ln_lib.fast_ln(input, weight, bias, eps)[0] + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def 
_expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +def _check_normalized_shape(normalized_shape): + if isinstance(normalized_shape, (list, tuple)): + assert len(normalized_shape) == 1 + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + """ + + Cache = collections.namedtuple("Cache", ["k", "v"]) + + def __init__( + self, + config, + ): + super(MultiHeadAttention, self).__init__() + + self.config = config + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + self.use_flash_attention = config.use_flash_attention if flash_attention else False + + self.head_dim = config.hidden_size // config.num_attention_heads + assert ( + self.head_dim * config.num_attention_heads == config.hidden_size + ), "hidden_size must be divisible by num_attention_heads" + + self.num_attention_heads = config.num_attention_heads # default, without tensor parallel + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + assert config.num_attention_heads % config.tensor_parallel_degree == 0 + self.num_attention_heads = config.num_attention_heads // config.tensor_parallel_degree + + if config.fuse_attention_qkv: + self.qkv_proj = ColumnParallelLinear( + config.hidden_size, + 3 * config.hidden_size, + has_bias=True, + gather_output=False, + fuse_matmul_bias=config.use_fused_linear, + ) + else: + self.q_proj = ColumnParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + gather_output=False, + fuse_matmul_bias=config.use_fused_linear, + ) + + self.k_proj = ColumnParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + gather_output=False, + fuse_matmul_bias=config.use_fused_linear, + ) + + self.v_proj = ColumnParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + gather_output=False, + fuse_matmul_bias=config.use_fused_linear, + ) + + self.out_proj = RowParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + input_is_parallel=True, + fuse_matmul_bias=config.use_fused_linear, + ) + else: + if self.config.fuse_attention_qkv: + self.qkv_proj = Linear(config.hidden_size, 3 * config.hidden_size, bias_attr=True) + else: + self.q_proj = Linear(config.hidden_size, config.hidden_size, bias_attr=True) + self.k_proj = Linear(config.hidden_size, config.hidden_size, bias_attr=True) + self.v_proj = Linear(config.hidden_size, config.hidden_size, bias_attr=True) + + self.out_proj = Linear(config.hidden_size, config.hidden_size, bias_attr=True) + + def _fuse_prepare_qkv(self, query, use_cache=False, past_key_value=None): + if self.config.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs / n, seq_len, num_head, head_dim] (n 
is model parallelism) + target_shape = [-1, self.config.seq_length, self.num_attention_heads, 3 * self.head_dim] + else: + target_shape = [0, 0, self.num_attention_heads, 3 * self.head_dim] + + # bs, seq_len, num_head * 3*head_dim + mix_layer = self.qkv_proj(query) + # bs, seq_len, num_head, 3*head_dim + mix_layer = paddle.reshape_(mix_layer, target_shape) + # query_states, key_states, value_states => bs, seq_len, num_head, head_dim + query_states, key_states, value_states = paddle.split(mix_layer, num_or_sections=3, axis=-1) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + # concat along seqlen dimension + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + return query_states, key_states, value_states, past_key_value + + def _prepare_qkv(self, query, key, value, use_cache=False, past_key_value=None): + r""" + Prepares linearly projected queries, keys and values for use in the subsequent + multiple parallel attention. If `cache` is not None, cached results are used + to reduce redundant calculations. + + """ + if self.config.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs/n, seq_len, num_head * head_dim] (n is model parallelism) + target_shape = [-1, self.config.seq_length, self.num_attention_heads, self.head_dim] + else: + target_shape = [0, 0, self.num_attention_heads, self.head_dim] + + query_states = self.q_proj(query) + # [bs, seq_len, num_head, head_dim] + query_states = tensor.reshape(x=query_states, shape=target_shape) + + key_states = self.k_proj(key) + # [bs, seq_len, num_head, head_dim] + key_states = tensor.reshape(x=key_states, shape=target_shape) + + value_states = self.v_proj(value) + # [bs, seq_len, num_head, head_dim] + value_states = tensor.reshape(x=value_states, shape=target_shape) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + # concat along seqlen dimension + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + return query_states, key_states, value_states, past_key_value + + def _flash_attention(self, q, k, v, attention_mask=None, output_attentions=False): + with seed_guard_context("local_seed"): + out, weights = flash_attention( + query=q, + key=k, + value=v, + dropout=self.config.attention_probs_dropout_prob, + causal=q.shape[1] != 1, + return_softmax=output_attentions, + training=self.training, + ) + # [bs, seq_len, num_head, head_dim] -> [bs, seq_len, num_head * head_dim] + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + return (out, weights) if output_attentions else out + + def _core_attention(self, q, k, v, attention_mask=None, output_attentions=False): + # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] + perm = [0, 2, 1, 3] + q = tensor.transpose(x=q, perm=perm) + k = tensor.transpose(x=k, perm=perm) + v = tensor.transpose(x=v, perm=perm) + # scale dot product attention + product = paddle.matmul(x=q * ((self.config.scale_qk_coeff * self.head_dim) ** -0.5), y=k, transpose_y=True) + if self.config.scale_qk_coeff != 1.0: + product = product.scale(self.config.scale_qk_coeff) + + # softmax_mask_fuse_upper_triangle is not supported if paddle is not compiled with 
cuda/rocm + if not paddle.is_compiled_with_cuda(): + attention_mask = get_triangle_upper_mask(product, attention_mask) + + if attention_mask is not None: + product = product + attention_mask.astype(product.dtype) + weights = F.softmax(product) + else: + weights = incubate.softmax_mask_fuse_upper_triangle(product) + + if self.config.attention_probs_dropout_prob: + with seed_guard_context("local_seed"): + weights = F.dropout( + weights, self.config.attention_probs_dropout_prob, training=self.training, mode="upscale_in_train" + ) + + out = paddle.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) # bs, seq_len, num_head, head_dim + out = tensor.reshape(x=out, shape=[0, 0, -1]) # bs, seq_len, dim + + return (out, weights) if output_attentions else out + + def forward( + self, query, key, value, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + ): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + """ + key = query if key is None else key + value = query if value is None else value + if self.config.fuse_attention_qkv: + # [bs, seq_len, num_head, head_dim] + q, k, v, past_key_value = self._fuse_prepare_qkv(query, use_cache, past_key_value) + else: + # [bs, seq_len, num_head, head_dim] + q, k, v, past_key_value = self._prepare_qkv(query, key, value, use_cache, past_key_value) + + if self.config.use_flash_attention: + # Flash Attention now ignore attention mask + # Current Flash Attention doesn't support attn maskt + # Paddle Flash Attention input [batch_size, seq_len, num_heads, head_dim] + # Torch Flash Attention input (batch_size, seqlen, nheads, headdim) + # bsz, q_len, num_heads, head_dim = q.shape + # TODO: Support attention mask for flash attention + attention_func = self._flash_attention + else: + # scale dot product attention + # [bs, seq_len, num_head,] + attention_func = self._core_attention + + has_gradient = (not q.stop_gradient) or (not k.stop_gradient) or (not v.stop_gradient) + if self.enable_recompute and self.config.recompute_granularity == "core_attn" and has_gradient: + outputs = recompute(attention_func, q, k, v, attention_mask, output_attentions, use_reentrant=False) + else: + outputs = attention_func(q, k, v, attention_mask=attention_mask, output_attentions=output_attentions) + + if output_attentions: + out, weights = outputs + else: + out = outputs + + # if sequence_parallel is true, out shape are [bs, seq_len, num_head * head_dim / n] + # else their shape are [bs, q_len, num_head * head_dim / n], n is mp parallelism. + + if self.config.sequence_parallel: + bs, seq_len, dim = out.shape + out = out.reshape([bs * seq_len, dim]) # [bs, seq_len, dim / n] => [bs * seq_len, dim / n] + + # project to output + out = self.out_proj(out) + # if sequence_parallel is true, out shape are [bs * seq_len / n, dim] + # else their shape are [bs, seq_len, dim], n is mp parallelism. + + outs = [out] + if output_attentions: + outs.append(weights) + if use_cache: + outs.append(past_key_value) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerDecoder(nn.Layer): + """ + TransformerDecoder is a stack of N decoder layers. 
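+ Depending on the ``use_cache``/``output_attentions``/``output_hidden_states``/``return_dict`` flags, ``forward`` returns either the plain hidden states, a tuple that also carries the key/value cache, hidden states and attention weights, or a ``BaseModelOutputWithPastAndCrossAttentions`` instance.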
+ """ + + def __init__(self, config, decoder_layers, norm=None, hidden_size=None): + super(TransformerDecoder, self).__init__() + + self.config = config + self.layers = decoder_layers + self.norm = GPTLayerNorm(config, config.hidden_size, epsilon=1e-5) + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.norm.weight) + mark_as_sequence_parallel_parameter(self.norm.bias) + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module: nn.Layer, + hidden_states: paddle.Tensor, + past_key_value: paddle.Tensor, + attention_mask: paddle.Tensor, + use_cache: bool, + output_attentions: paddle.Tensor, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + # GPTDecoderLayer + # def forward( + # self, hidden_states, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + # ): + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + use_cache, + past_key_value, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states + + def forward( + self, + hidden_states, + attention_mask=None, + use_cache=False, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + + output = hidden_states + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + next_decoder_cache = () if use_cache else None + + for i, mod in enumerate(self.layers): + has_gradient = not output.stop_gradient + # def forward(self, hidden_states, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False): + if self.enable_recompute and has_gradient and self.config.recompute_granularity == "full_attn": + outputs = self.recompute_training( + layer_module=mod, + hidden_states=output, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_value=None, + output_attentions=output_attentions, + ) + else: + outputs = mod( + output, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_value=past_key_values[i] if past_key_values is not None else None, + output_attentions=output_attentions, + ) + + # outputs = hidden_states if both use_cache and output_attentions are False + # Otherwise, outputs = (hidden_states, attention if output_attentions, cache if use_cache) + output = outputs[0] if (use_cache or output_attentions) else outputs + all_self_attentions = all_self_attentions + (outputs[1],) if output_attentions else None + all_hidden_states = all_hidden_states + (output,) if output_hidden_states else None + next_decoder_cache = next_decoder_cache + (outputs[-1],) if use_cache else None + + if self.norm is not None: + output = self.norm(output) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + temp_list = [output, next_cache, all_hidden_states, all_self_attentions] + + if not (use_cache or output_attentions or output_hidden_states): + return output + + return tuple(v 
for v in temp_list if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=output, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=None, + ) + + +class GPTDecoderLayer(nn.Layer): + """ + The transformer decoder layer. + + It contains multiheadattention and some linear layers. + """ + + def __init__(self, config: GPTConfig): + super(GPTDecoderLayer, self).__init__() + self.config = config + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + if not FusedDropoutAdd: + config.use_fused_dropout_add = False + + self.self_attn = MultiHeadAttention(config=config) + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + # TODO:config.fuse_attention_ffn @DrownFish19 + if config.tensor_parallel_degree > 1: + self.linear1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + gather_output=False, + has_bias=True, + fuse_matmul_bias=self.config.use_fused_linear, + ) + + self.linear2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + input_is_parallel=True, + has_bias=True, + fuse_matmul_bias=self.config.use_fused_linear, + ) + else: + self.linear1 = Linear(config.hidden_size, config.intermediate_size, bias_attr=True) + self.linear2 = Linear(config.intermediate_size, config.hidden_size, bias_attr=True) + + self.norm1 = GPTLayerNorm(config, config.hidden_size, epsilon=1e-5) + self.norm2 = GPTLayerNorm(config, config.hidden_size, epsilon=1e-5) + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.norm1.weight) + mark_as_sequence_parallel_parameter(self.norm1.bias) + mark_as_sequence_parallel_parameter(self.norm2.weight) + mark_as_sequence_parallel_parameter(self.norm2.bias) + + if config.use_fused_dropout_add: + self.fused_dropout_add1 = FusedDropoutAdd(config.attention_probs_dropout_prob, mode="upscale_in_train") + self.fused_dropout_add2 = FusedDropoutAdd(config.hidden_dropout_prob, mode="upscale_in_train") + else: + self.dropout1 = nn.Dropout(config.attention_probs_dropout_prob, mode="upscale_in_train") + self.dropout2 = nn.Dropout(config.hidden_dropout_prob, mode="upscale_in_train") + + if config.hidden_activation == "gelu": + self.activation = F.gelu + else: + self.activation = getattr(F, config.hidden_activation) + + def forward( + self, hidden_states, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + ): + # when sequence_parallel=True: + # hidden_states => [bs * seq_len / n, embed_dim] + residual = hidden_states + + if self.config.normalize_before: + hidden_states = self.norm1(hidden_states) + # self.self_attn: + # def forward( + # self, query, key, value, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + # ): + # self.self_attn(...) 
--> hidden_states, weights, (past_key_value) + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and has_gradient and self.config.recompute_granularity == "full_attn": + hidden_states = recompute( + self.self_attn, + hidden_states, + None, + None, + attention_mask, + use_cache, + past_key_value, + output_attentions, + use_reentrant=False, + ) + else: + hidden_states = self.self_attn( + hidden_states, None, None, attention_mask, use_cache, past_key_value, output_attentions + ) + # when sequence_parallel=True: + # hidden_states => [bs * seq_len / n, embed_dim] + incremental_cache = hidden_states[-1] if use_cache else None + attention_weights = hidden_states[1] if output_attentions else None + hidden_states = hidden_states[0] if (use_cache or output_attentions) else hidden_states + + # Use a ternary operator for a more concise assignment of current_seed + current_seed = "local_seed" if self.config.sequence_parallel else "global_seed" + + # The 'with' block ensures the correct seed context is used + with seed_guard_context(current_seed): + if self.config.use_fused_dropout_add: + hidden_states = self.fused_dropout_add1(hidden_states, residual) + else: + hidden_states = residual + self.dropout1(hidden_states) + + if not self.config.normalize_before: + hidden_states = self.norm1(hidden_states) + + residual = hidden_states + if self.config.normalize_before: + hidden_states = self.norm2(hidden_states) + + # when sequence_parallel=True: + # hidden_states => [bs * seq_len / n, embed_dim] + with seed_guard_context(current_seed): + if not self.config.use_fused_dropout_add: + hidden_states = residual + self.dropout2( + self.linear2(self.activation(self.linear1(hidden_states), approximate=True)) + ) + else: + hidden_states = self.fused_dropout_add2( + self.linear2(self.activation(self.linear1(hidden_states), approximate=True)), residual + ) + if not self.config.normalize_before: + hidden_states = self.norm2(hidden_states) + + if not (output_attentions or use_cache): + return hidden_states + + temp_list = [ + hidden_states, + attention_weights, + incremental_cache, + ] + + return tuple(v for v in temp_list if v is not None) + + +class GPTEmbeddings(nn.Layer): + """ + Include embeddings from word and position embeddings. 
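+ When ``config.sequence_parallel`` is enabled, the summed word and position embeddings are flattened to ``[bs * seq_len, hidden_size]`` and scattered across model-parallel ranks with ``ScatterOp`` before dropout is applied (see ``forward`` below).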
+ """ + + def __init__( + self, + config, + ): + super(GPTEmbeddings, self).__init__() + + self.config = config + + if config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + ) + else: + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + ) + + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, + config.hidden_size, + ) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, position_ids=None, inputs_embeddings=None): + if input_ids is not None: + input_shape = input_ids.shape + inputs_embeddings = self.word_embeddings(input_ids) + else: + input_shape = inputs_embeddings.shape[:-1] + + if position_ids is None: + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + position_ids = seq_length - ones + + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeddings + position_embeddings + + if self.config.sequence_parallel: + bs, seq_len, hidden_size = embeddings.shape + # [bs, seq_len, dim] -> [bs * seq_len, dim] + embeddings = paddle.reshape_(embeddings, [bs * seq_len, hidden_size]) + # [bs * seq_len / n, dim] (n is mp parallelism) + embeddings = ScatterOp.apply(embeddings) + + # Use a ternary operator for a more concise assignment of current_seed + current_seed = "local_seed" if self.config.sequence_parallel else "global_seed" + # The 'with' block ensures the correct seed context is used + with seed_guard_context(current_seed): + embeddings = self.dropout(embeddings) + + return embeddings + + +class GPTLayerNorm(OriginLayerNorm): + def __init__(self, config, normalized_shape, epsilon=1e-05, weight_attr=None, bias_attr=None, name=None): + super().__init__( + normalized_shape=normalized_shape, epsilon=epsilon, weight_attr=weight_attr, bias_attr=bias_attr + ) + + self.config = config + _check_normalized_shape(self._normalized_shape) + + def forward(self, input): + if self.config.use_fast_layer_norm: + return fast_layer_norm(input, self.weight, self.bias, self._epsilon) + return super().forward(input) + + +class GPTPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained GPT models. It provides GPT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + model_config_file = "model_config.json" + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "gpt" + config_class = GPTConfig + pretrained_init_configuration = GPT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = GPT_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + base_actions = { + # Column Linear + "layers.0.linear1.weight": partial(fn, is_column=True), + "layers.0.linear1.bias": partial(fn, is_column=True), + # Row Linear + "word_embeddings.weight": partial(fn, is_column=False), + "layers.0.self_attn.out_proj.weight": partial(fn, is_column=False), + "layers.0.linear2.weight": partial(fn, is_column=False), + } + + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: GPTConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
+ fuse_qkv_keys = ( + "decoder.layers.0.self_attn.q_proj.weight", + "decoder.layers.0.self_attn.k_proj.weight", + "decoder.layers.0.self_attn.v_proj.weight", + "decoder.layers.0.self_attn.qkv_proj.weight", + ) + fuse_qkv_bias_keys = ( + "decoder.layers.0.self_attn.q_proj.bias", + "decoder.layers.0.self_attn.k_proj.bias", + "decoder.layers.0.self_attn.v_proj.bias", + "decoder.layers.0.self_attn.qkv_proj.bias", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + return final_actions + + @classmethod + def _get_name_mappings(cls, config: GPTConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["wte.weight", "embeddings.word_embeddings.weight"], + ["wpe.weight", "embeddings.position_embeddings.weight"], + ["ln_f.weight", "decoder.norm.weight"], + ["ln_f.bias", "decoder.norm.bias"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"h.{layer_index}.ln_1.weight", f"decoder.layers.{layer_index}.norm1.weight"], + [f"h.{layer_index}.ln_1.bias", f"decoder.layers.{layer_index}.norm1.bias"], + [f"h.{layer_index}.ln_2.weight", f"decoder.layers.{layer_index}.norm2.weight"], + [f"h.{layer_index}.ln_2.bias", f"decoder.layers.{layer_index}.norm2.bias"], + [f"h.{layer_index}.mlp.c_fc.weight", f"decoder.layers.{layer_index}.linear1.weight"], + [f"h.{layer_index}.mlp.c_fc.bias", f"decoder.layers.{layer_index}.linear1.bias"], + [f"h.{layer_index}.mlp.c_proj.weight", f"decoder.layers.{layer_index}.linear2.weight"], + [f"h.{layer_index}.mlp.c_proj.bias", f"decoder.layers.{layer_index}.linear2.bias"], + [f"h.{layer_index}.attn.c_proj.weight", f"decoder.layers.{layer_index}.self_attn.out_proj.weight"], + [f"h.{layer_index}.attn.c_proj.bias", f"decoder.layers.{layer_index}.self_attn.out_proj.bias"], + # attention + [ + f"h.{layer_index}.attn.c_attn.weight", + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + "split", + 0, + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + "split", + 0, + ], + [ + f"h.{layer_index}.attn.c_attn.weight", + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + "split", + 1, + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + "split", + 1, + ], + [ + f"h.{layer_index}.attn.c_attn.weight", + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + "split", + 2, + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + "split", + 2, + ], + ] + + model_mappings.extend(layer_mappings) + + # downstream mappings + if "GPT2Model" not in config.architectures: + for mapping in model_mappings: + mapping[0] = 
"transformer." + mapping[0] + mapping[1] = "gpt." + mapping[1] + if "GPT2ForTokenClassification" in config.architectures: + model_mappings.extend([["classifier.weight", "classifier.weight", "transpose"]]) + if "GPT2ForSequenceClassification" in config.architectures: + model_mappings.extend([["score.weight", "score.weight", "transpose"]]) + if "GPT2LMHeadModel" in config.architectures: + model_mappings.append(["lm_head.weight", "lm_head.decoder.weight"]) + + mappings = [StateDictNameMapping(*mapping) for mapping in model_mappings] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.RowParallelLinear, + mpu.ColumnParallelLinear, + linear_utils.RowSequenceParallelLinear, + linear_utils.ColumnSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, GPTDecoderLayer): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.linear2.weight.scale_(factor) + if isinstance(layer, MultiHeadAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.out_proj.weight.scale_(factor) + + +@register_base_model +class GPTModel(GPTPretrainedModel): + r""" + The bare GPT Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `GPTModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `GPTModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer and decoder layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer decoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer decoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the decoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. 
+ ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and decoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all decoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. Defaults to `16`. + + .. note:: + Please NOT using `type_vocab_size`, for it will be obsolete in the future.. + + initializer_range (float, optional): + The standard deviation of the normal initializer. Default to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`GPTPretrainedModel._init_weights()` for how weights are initialized in `GPTModel`. + + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + + """ + + def __init__(self, config: GPTConfig): + super(GPTModel, self).__init__(config) + + self.config = config + + self.pad_token_id = config.pad_token_id + self.eos_token_id = config.eos_token_id + self.bos_token_id = config.bos_token_id + self.eol_token_id = config.eol_token_id + self.vocab_size = config.vocab_size + + self.bias = paddle.tril( + paddle.ones([1, 1, config.max_position_embeddings, config.max_position_embeddings], dtype="int64") + ) + + self.embeddings = GPTEmbeddings(config) + + decoder_layers = nn.LayerList() + for i in range(config.num_hidden_layers): + decoder_layers.append(GPTDecoderLayer(config)) + + self.decoder = TransformerDecoder( + config, + decoder_layers, + ) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + 
): + r""" + The GPTModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to None. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in self attention to avoid performing attention to some unwanted positions, + usually the subsequent positions. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Its data type should be int64. + The `masked` tokens have `0` values, and the `unmasked` tokens have `1` values. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + past_key_values (list, optional): + It is only used for inference and should be None for training. + Default to `None`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `outputs` which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import GPTModel, GPTTokenizer + + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + model = GPTModel.from_pretrained('gpt2-medium-en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + if self.config.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape((-1, input_shape[-1])) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + # input_shape => bs, seq_len + + if past_key_values is None: + past_key_values = tuple([None] * len(self.decoder.layers)) + + if position_ids is None: + past_length = 0 + if past_key_values[0] is not None: + # bs, seq_len, num_head, head_dim + past_length = past_key_values[0][0].shape[1] + position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") + position_ids = position_ids.unsqueeze(0) + position_ids = paddle.expand(position_ids, input_shape) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, inputs_embeddings=inputs_embeds + ) + + # TODO, use registered buffer + length = input_shape[-1] + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + length = length + cache_length + else: + cache_length = 0 + + causal_mask = self.bias[:, :, cache_length:length, :length] + if attention_mask is not None: + if attention_mask.dtype != paddle.int64: + attention_mask = paddle.cast(attention_mask, dtype=paddle.int64) + if len(attention_mask.shape) == 2: + attention_mask = attention_mask[:, None, None, :] + attention_mask = (1.0 - (attention_mask & causal_mask)) * -1e4 + else: + attention_mask = (1.0 - causal_mask) * -1e4 + + # The tensor returned by triu not in static graph. + attention_mask.stop_gradient = True + + outputs = self.decoder( + embedding_output, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + if output_hidden_states: + if return_dict: + outputs.hidden_states = (embedding_output,) + outputs.hidden_states + else: # outputs is a tuple + idx = 2 if use_cache else 1 + all_hidden_states = (embedding_output,) + outputs[idx] + outputs[idx] = all_hidden_states + + return outputs + + +class GPTPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for GPT. It calculates the final loss. + """ + + def __init__(self, config): + super(GPTPretrainingCriterion, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.tensor_parallel_output: + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=config.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=config.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels, loss_mask=None): + """ + Args: + prediction_scores(Tensor): + The logits of masked token prediction. 
Its data type should be float32 and + its shape is [batch_size, sequence_length, vocab_size]. + masked_lm_labels(Tensor): + The labels of the masked language modeling, the dimensionality of `masked_lm_labels` + is equal to `prediction_scores`. Its data type should be int64 and + its shape is [batch_size, sequence_length, 1]. + loss_mask(Tensor): + Mask used for calculating the loss of the masked language modeling to avoid + calculating some unwanted tokens. + Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. + + Returns: + Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. + + """ + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + if loss_mask is None: + loss_mask = (masked_lm_loss > 0).astype("float32") + loss_mask = loss_mask.reshape([-1]) + masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) + loss = masked_lm_loss / loss_mask.sum() + return loss + + +class GPTForGreedyGeneration(GPTPretrainedModel): + """ + The generate model for GPT-2. + It use the greedy strategy and generate the output sequence with highest probability. + + Args: + gpt (:class:`GPTModel`): + An instance of `paddlenlp.transformers.GPTModel`. + max_predict_len(int): + The max length of the prediction. + + """ + + def __init__(self, config: GPTConfig, max_predict_len: int = 32): + super(GPTForGreedyGeneration, self).__init__(config) + self.gpt = GPTModel(config) + self.max_predict_len = paddle.to_tensor(max_predict_len, dtype="int32") + self.eol_token_id = config.eol_token_id + + def model( + self, + input_ids, + position_ids=None, + attention_mask=None, + masked_positions=None, + use_cache=False, + past_key_values=None, + ): + r""" + + Args: + input_ids (Tensor, optional): + See :class:`GPTModel`. + position_ids (Tensor, optional): + See :class:`GPTModel`. + attention_mask (Tensor, optional): + See :class:`GPTModel`. + use_cache (bool, optional): + See :class:`GPTModel`. + cache (Tensor, optional): + See :class:`GPTModel`. + + Returns: + Tensor or tuple: Returns tensor `logits` or tuple `(logits, cached_kvs)`. If `use_cache` is True, + tuple (`logits, cached_kvs`) will be returned. Otherwise, tensor `logits` will be returned. + `logits` is the output of the gpt model. + `cache_kvs` is the cache output of gpt model if `use_cache` is True. + + """ + + outputs = self.gpt( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) + if use_cache: + encoder_outputs, cached_kvs = outputs[:2] + else: + encoder_outputs = outputs + logits = paddle.matmul(encoder_outputs, self.gpt.embeddings.word_embeddings.weight, transpose_y=True) + + if use_cache: + return logits, cached_kvs + else: + return logits + + def forward(self, input_ids): + """ + + Args: + input_ids(Tensor): + See :class:`GPTModel`. + + Returns: + Tensor: Returns tensor `src_ids`, which means the indices of output sequence tokens in the vocabulary. + They are numerical representations of tokens that build the output sequence. 
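+ 
+ Example (a minimal sketch: the checkpoint name follows the other examples in this file,
+ and forwarding ``max_predict_len`` through ``from_pretrained`` is assumed here):
+ 
+ .. code-block::
+ 
+ import paddle
+ from paddlenlp.transformers import GPTForGreedyGeneration, GPTTokenizer
+ 
+ tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en')
+ # max_predict_len bounds how many tokens are greedily appended (kwarg pass-through is assumed)
+ model = GPTForGreedyGeneration.from_pretrained('gpt2-medium-en', max_predict_len=16)
+ 
+ inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False)
+ input_ids = paddle.to_tensor([inputs["input_ids"]])
+ # Returns the input ids followed by the greedily generated continuation.
+ src_ids = model(input_ids)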
+ """ + output, cached_kvs = self.model(input_ids, use_cache=True, past_key_values=None) + src_ids = input_ids + nid = paddle.argmax(output[:, -1, :], axis=-1).reshape([-1, 1]) + src_ids = paddle.concat([src_ids, nid], axis=1) + cur_len = 0 + with dy2st_nocheck_guard_context(): + while cur_len < self.max_predict_len: + output, cached_kvs = self.model(nid, use_cache=True, past_key_values=cached_kvs) + nid = paddle.argmax(output[:, -1, :], axis=-1).reshape([-1, 1]) + src_ids = paddle.concat([src_ids, nid], axis=1) + cur_len += 1 + if paddle.max(nid) == self.eol_token_id: + break + return src_ids + + +class GPTLMHead(nn.Layer): + def __init__(self, config: GPTConfig, embedding_weights=None): + super(GPTLMHead, self).__init__() + self.config = config + self.transpose_y = True + + if embedding_weights is not None: + self.transpose_y = True + self.weight = embedding_weights + else: + if config.tensor_parallel_degree > 1: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 0 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + hidden_states = paddle.reshape_(hidden_states, [-1, self.config.seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul( + hidden_states, self.weight, transpose_y=self.transpose_y, tensor_parallel_output=tensor_parallel_output + ) + return logits + + +class GPTForCausalLM(GPTPretrainedModel): + """ + The GPT Model with a `language modeling` head on top. + + Args: + gpt (:class:`GPTModel`): + An instance of :class:`GPTModel`. + + """ + + def __init__(self, config: GPTConfig): + super(GPTForCausalLM, self).__init__(config) + self.gpt = GPTModel(config) + self.lm_head = GPTLMHead(config, embedding_weights=self.gpt.embeddings.word_embeddings.weight) + + self.tie_weights() + self.criterion = GPTPretrainingCriterion(config) + + def get_output_embeddings(self): + return self.lm_head + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + past_key_values=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + + Args: + input_ids (Tensor, optional): + See :class:`GPTModel`. + position_ids (Tensor, optional): + See :class:`GPTModel`. + attention_mask (Tensor, optional): + See :class:`GPTModel`. + inputs_embeds (Tensor, optional): + See :class:`GPTModel`. + use_cache (bool, optional): + See :class:`GPTModel`. + past_key_values (Tensor, optional): + See :class:`GPTModel`. + labels (paddle.Tensor, optional): + A Tensor of shape `(batch_size, sequence_length)`. + Labels for language modeling. Note that the labels are shifted inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]` + Defaults to None. + output_attentions (bool, optional): + See :class:`GPTModel`. + output_hidden_states (bool, optional): + See :class:`GPTModel`. + return_dict (bool, optional): + See :class:`GPTModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + + Especialy, when `return_dict=use_cache=output_attentions=output_hidden_states=False`, + returns a tensor `logits` which is the output of the gpt model. + """ + input_type = type(input_ids) if input_ids is not None else type(inputs_embeds) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + outputs = self.gpt( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(outputs, input_type): + hidden_states = outputs + else: + hidden_states = outputs[0] + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + # # Shift so that tokens < n predict n + # shift_logits = logits[:, :-1, :] + # shift_labels = labels[:, 1:] + # # Flatten the tokens + # loss_fct = CrossEntropyLoss() + # loss = loss_fct(shift_logits.reshape((-1, shift_logits.shape[-1])), shift_labels.reshape((-1,))) + + # outputs = [output, all_hidden_states, new_caches, all_self_attentions] + if not return_dict: + if isinstance(outputs, input_type): + return (loss, logits) if loss is not None else logits + + outputs = (logits,) + outputs[1:] + return ((loss,) + outputs) if loss is not None else outputs + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterGPT + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decode_strategy = kwargs.get("decode_strategy") + if decode_strategy == "beam_search": + raise AttributeError("'beam_search' is not supported yet in the fast version of GPT") + # Currently, FasterTransformer only support restricted size_per_head. 
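+ # size_per_head below is hidden_size // num_attention_heads from the loaded config; checkpoints
+ # whose per-head width is not in the whitelist cannot take the FasterGPT fast path.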
+ size_per_head = self.gpt.config["hidden_size"] // self.gpt.config["num_attention_heads"] + if size_per_head not in [32, 64, 80, 96, 128]: + raise AttributeError( + "'size_per_head = %d' is not supported yet in the fast version of GPT" % size_per_head + ) + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + if kwargs["min_length"] != 0: + # not support for min_length yet in the fast version + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + self._fast_entry = FasterGPT(self, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def prepare_inputs_for_generation(self, input_ids, use_cache=False, past_key_values=None, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + position_ids = kwargs.get("position_ids", None) + # attention_mask = kwargs.get("attention_mask", None) + if past_key_values is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + if position_ids is not None: + position_ids = position_ids[:, -1].unsqueeze(-1) + return { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": None, + "use_cache": use_cache, + "past_key_values": past_key_values, + } + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and float(paddle.any(input_ids == pad_token_id)) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + else: + attention_mask = paddle.ones_like(input_ids, dtype="int64") + return paddle.unsqueeze(attention_mask, axis=[1, 2]) + + +class GPTForTokenClassification(GPTPretrainedModel): + """ + GPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + + Args: + gpt (:class:`GPTModel`): + An instance of GPTModel. + num_labels (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of GPT. + If None, use the same value as `hidden_dropout_prob` of `GPTModel` + instance `gpt`. Defaults to None. + """ + + def __init__(self, config: GPTConfig): + super(GPTForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.gpt = GPTModel(config) # allow gpt to be config + dropout_p = config.hidden_dropout_prob if config.classifier_dropout is None else config.classifier_dropout + self.dropout = nn.Dropout(dropout_p) + self.classifier = Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The GPTForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`GPTModel`. + position_ids(Tensor, optional): + See :class:`GPTModel`. + attention_mask (list, optional): + See :class:`GPTModel`. + inputs_embeds (Tensor, optional): + See :class:`GPTModel`. 
+ labels (Tensor, optional): + Labels of shape `(batch_size, sequence_length)` for computing the sequence classification/regression loss. Indices should be in + `[0, ..., num_labels - 1]`. If `num_labels == 1` a regression loss is computed (Mean-Square loss), If + `num_labels > 1` a classification loss is computed (Cross-Entropy). Defaults to None. + output_attentions (bool, optional): + See :class:`GPTModel`. + output_hidden_states (bool, optional): + See :class:`GPTModel`. + return_dict (bool, optional): + See :class:`GPTModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Especialy, when `return_dict=output_attentions=output_hidden_states=False`, + returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import GPTForTokenClassification, GPTTokenizer + + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + model = GPTForTokenClassification.from_pretrained('gpt2-medium-en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + input_type = type(input_ids) if input_ids is not None else type(inputs_embeds) + sequence_output = self.gpt( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(sequence_output, input_type): + hidden_states = sequence_output + else: + hidden_states = sequence_output[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + + if not return_dict: + if isinstance(sequence_output, input_type): + return (loss, logits) if loss is not None else logits + + outputs = (logits,) + sequence_output[1:] + return ((loss,) + outputs) if loss is not None else outputs + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=sequence_output.hidden_states, + attentions=sequence_output.attentions, + ) + + +class GPTForSequenceClassification(GPTPretrainedModel): + """ + GPT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. + for GLUE tasks. + + Args: + gpt (:class:`GPTModel`): + An instance of GPTModel. + num_labels (int, optional): + The number of classes. Defaults to `2`. 
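+ 
+ .. note::
+ The classification logits are pooled from the hidden state of the last non-padding token of
+ each sequence; padding is detected with the model's `eos_token_id` (falling back to token id
+ `0` when it is unset), as implemented in :meth:`forward` below.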
+ + """ + + def __init__(self, config: GPTConfig): + super(GPTForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.gpt = GPTModel(config) + self.score = Linear(config.hidden_size, config.num_labels, bias_attr=False) + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The GPTForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`GPTModel`. + position_ids(Tensor, optional): + See :class:`GPTModel`. + attention_mask (list, optional): + See :class:`GPTModel`. + inputs_embeds (Tensor, optional): + See :class:`GPTModel`. + labels (Tensor, optional): + Labels of shape `(batch_size, sequence_length)` for computing the sequence classification/regression loss. Indices should be in + `[0, ..., num_labels - 1]`. If `num_labels == 1` a regression loss is computed (Mean-Square loss), If + `num_labels > 1` a classification loss is computed (Cross-Entropy). Defaults to None. + use_cache (bool, optional): + See :classL `GPTModel`. + output_attentions (bool, optional): + See :class:`GPTModel`. + output_hidden_states (bool, optional): + See :class:`GPTModel`. + return_dict (bool, optional): + See :class:`GPTModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutputWithPast` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutputWithPast`. + + Especialy, when `return_dict=output_attentions=output_hidden_states=False`, + returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import GPTForSequenceClassification, GPTTokenizer + + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + model = GPTForSequenceClassification.from_pretrained('gpt2-medium-en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + input_type = type(input_ids) if input_ids is not None else type(inputs_embeds) + # sequence_output shape [bs, seq_len, hidden_size] + sequence_output = self.gpt( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(sequence_output, input_type): + hidden_states = sequence_output + else: + hidden_states = sequence_output[0] + # logits shape [bs, seq_len, num_class] + logits = self.score(hidden_states) + # padding index maybe 0 + eos_token_id = self.gpt.config.eos_token_id or 0 + # sequence_lengths shape [bs,] + if input_ids is not None: + sequence_lengths = (input_ids != eos_token_id).astype("int64").sum(axis=-1) - 1 + else: + inputs_shape = inputs_embeds.shape[:-1] + sequence_lengths = paddle.ones(inputs_shape[:-1], dtype="int64") * (inputs_shape[1] - 1) + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. 
Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits.gather_nd(paddle.stack([paddle.arange(logits.shape[0]), sequence_lengths], axis=-1)) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + + if not return_dict: + if isinstance(sequence_output, input_type): + return (loss, pooled_logits) if loss is not None else pooled_logits + + outputs = (pooled_logits,) + sequence_output[1:] + return ((loss,) + outputs) if loss is not None else outputs + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=sequence_output.past_key_values, + hidden_states=sequence_output.hidden_states, + attentions=sequence_output.attentions, + ) + + +GPTLMHeadModel = GPTForCausalLM diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_auto.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_auto.py new file mode 100644 index 000000000..719d4ca4a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_auto.py @@ -0,0 +1,1333 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import collections +import contextlib +import math +from functools import partial + +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.incubate as incubate +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.tensor as tensor +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from ...utils.converter import StateDictNameMapping +from .. 
import PretrainedModel, register_base_model +from ..model_outputs import BaseModelOutputWithPastAndCrossAttentions +from .configuration import GPT_PRETRAINED_INIT_CONFIGURATION, GPTConfig + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None +try: + from paddle.incubate.nn.layer.fused_dropout_add import FusedDropoutAdd +except: + FusedDropoutAdd = None + +__all__ = [ + "GPTModelAuto", + "GPTPretrainedModelAuto", + "GPTPretrainingCriterionAuto", + "GPTLMHeadModelAuto", + "GPTForCausalLMAuto", + "GPTEmbeddingsAuto", + "GPTDecoderLayerAuto", +] + + +def get_mesh(pp_idx=0): + mesh = fleet.auto.get_mesh() + if "pp" in mesh.dim_names: + mesh = mesh.get_mesh_with_dim("pp")[pp_idx] + return mesh + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + if paddle.is_compiled_with_xpu(): + # xpu does not support set constant to -np.inf + mask = paddle.full_like(x, -1e4) + else: + mask = paddle.full_like(x, -np.inf) + mask.stop_gradient = True + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def seed_guard_context(name=None): + if name in get_rng_state_tracker().states_: + return get_rng_state_tracker().rng_state(name) + else: + return contextlib.nullcontext() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class MultiHeadAttentionAuto(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. 
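+ 
+ For example, with the default `hidden_size=768` and `num_attention_heads=12`, each head works
+ on `head_dim = 768 // 12 = 64` dimensions, and concatenating the heads restores the full
+ `hidden_size` width before the output projection.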
+ + """ + + Cache = collections.namedtuple("Cache", ["k", "v"]) + + def __init__(self, config, ipp=None): + super(MultiHeadAttentionAuto, self).__init__() + + self.config = config + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + self.use_flash_attention = config.use_flash_attention if flash_attention else False + + self.head_dim = config.hidden_size // config.num_attention_heads + assert ( + self.head_dim * config.num_attention_heads == config.hidden_size + ), "hidden_size must be divisible by num_attention_heads" + + self.num_attention_heads = config.num_attention_heads # default, without tensor parallel + self.ipp = ipp + + if self.config.fuse_attention_qkv: + self.qkv_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias_attr=True) + else: + self.q_proj = nn.Linear(config.hidden_size, config.hidden_size, bias_attr=True) + self.k_proj = nn.Linear(config.hidden_size, config.hidden_size, bias_attr=True) + self.v_proj = nn.Linear(config.hidden_size, config.hidden_size, bias_attr=True) + self.q_proj.weight = dist.shard_tensor( + self.q_proj.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)] + ) + self.k_proj.weight = dist.shard_tensor( + self.k_proj.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)] + ) + self.v_proj.weight = dist.shard_tensor( + self.v_proj.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(1)] + ) + + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias_attr=True) + self.out_proj.weight = dist.shard_tensor( + self.out_proj.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(0)] + ) + + def _fuse_prepare_qkv(self, query, use_cache=False, past_key_value=None): + if self.config.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs / n, seq_len, num_head, head_dim] (n is model parallelism) + target_shape = [-1, self.config.seq_length, self.num_attention_heads, 3 * self.head_dim] + else: + target_shape = [0, 0, self.num_attention_heads, 3 * self.head_dim] + + # bs, seq_len, num_head * 3*head_dim + mix_layer = self.qkv_proj(query) + # bs, seq_len, num_head, 3*head_dim + mix_layer = paddle.reshape_(mix_layer, target_shape) + # query_states, key_states, value_states => bs, seq_len, num_head, head_dim + query_states, key_states, value_states = paddle.split(mix_layer, num_or_sections=3, axis=-1) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + # concat along seqlen dimension + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + return query_states, key_states, value_states, past_key_value + + def _prepare_qkv(self, query, key, value, use_cache=False, past_key_value=None): + r""" + Prapares linear projected queries, keys and values for usage of subsequnt + multiple parallel attention. If `cache` is not None, using cached results + to reduce redundant calculations. 
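+ 
+ The projected tensors are reshaped to `[batch_size, seq_len, num_attention_heads, head_dim]`
+ (with the leading dimensions inferred from `seq_length` under sequence parallelism), and any
+ entries in `past_key_value` are concatenated along the sequence axis before being returned.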
+ + """ + if self.config.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs/n, seq_len, num_head * head_dim] (n is model parallelism) + target_shape = [-1, self.config.seq_length, self.num_attention_heads, self.head_dim] + else: + target_shape = [0, 0, self.num_attention_heads, self.head_dim] + + query_states = self.q_proj(query) + # [bs, seq_len, num_head, head_dim] + query_states = tensor.reshape(x=query_states, shape=target_shape) + + key_states = self.k_proj(key) + # [bs, seq_len, num_head, head_dim] + key_states = tensor.reshape(x=key_states, shape=target_shape) + + value_states = self.v_proj(value) + # [bs, seq_len, num_head, head_dim] + value_states = tensor.reshape(x=value_states, shape=target_shape) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + # concat along seqlen dimension + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + return query_states, key_states, value_states, past_key_value + + def _flash_attention(self, q, k, v, attention_mask=None, output_attentions=False): + with seed_guard_context("local_seed"): + out, weights = flash_attention( + query=q, + key=k, + value=v, + dropout=self.config.attention_probs_dropout_prob, + causal=q.shape[1] != 1, + return_softmax=output_attentions, + training=self.training, + ) + # [bs, seq_len, num_head, head_dim] -> [bs, seq_len, num_head * head_dim] + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + return (out, weights) if output_attentions else out + + def _core_attention(self, q, k, v, attention_mask=None, output_attentions=False): + # [bs, seq_len, num_head, head_dim] -> [bs, num_head, seq_len, head_dim] + perm = [0, 2, 1, 3] + q = tensor.transpose(x=q, perm=perm) + k = tensor.transpose(x=k, perm=perm) + v = tensor.transpose(x=v, perm=perm) + # scale dot product attention + product = paddle.matmul(x=q * ((self.config.scale_qk_coeff * self.head_dim) ** -0.5), y=k, transpose_y=True) + if self.config.scale_qk_coeff != 1.0: + product = product.scale(self.config.scale_qk_coeff) + + # softmax_mask_fuse_upper_triangle is not supported sif paddle is not compiled with cuda/rocm + if not paddle.is_compiled_with_cuda(): + attention_mask = get_triangle_upper_mask(product, attention_mask) + + if attention_mask is not None: + product = product + attention_mask.astype(product.dtype) + weights = F.softmax(product) + else: + weights = incubate.softmax_mask_fuse_upper_triangle(product) + + if self.config.attention_probs_dropout_prob: + with seed_guard_context("local_seed"): + weights = F.dropout( + weights, self.config.attention_probs_dropout_prob, training=self.training, mode="upscale_in_train" + ) + + out = paddle.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) # bs, seq_len, num_head, head_dim + out = tensor.reshape(x=out, shape=[0, 0, -1]) # bs, seq_len, dim + + return (out, weights) if output_attentions else out + + def forward( + self, query, key, value, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + ): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. 
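+ 
+ Returns the attention output, followed by the attention weights when `output_attentions`
+ is True and by the updated `past_key_value` when `use_cache` is True; a single tensor is
+ returned when neither extra output is requested.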
+ """ + key = query if key is None else key + value = query if value is None else value + if self.config.fuse_attention_qkv: + # [bs, seq_len, num_head, head_dim] + q, k, v, past_key_value = self._fuse_prepare_qkv(query, use_cache, past_key_value) + else: + # [bs, seq_len, num_head, head_dim] + q, k, v, past_key_value = self._prepare_qkv(query, key, value, use_cache, past_key_value) + + if self.config.use_flash_attention: + # Flash Attention now ignore attention mask + # Current Flash Attention doesn't support attn maskt + # Paddle Flash Attention input [batch_size, seq_len, num_heads, head_dim] + # Torch Flash Attention input (batch_size, seqlen, nheads, headdim) + # bsz, q_len, num_heads, head_dim = q.shape + # TODO: Support attention mask for flash attention + attention_func = self._flash_attention + else: + # scale dot product attention + # [bs, seq_len, num_head,] + attention_func = self._core_attention + + has_gradient = (not q.stop_gradient) or (not k.stop_gradient) or (not v.stop_gradient) + if self.enable_recompute and self.config.recompute_granularity == "core_attn" and has_gradient: + outputs = recompute(attention_func, q, k, v, attention_mask, output_attentions, use_reentrant=False) + else: + outputs = attention_func(q, k, v, attention_mask=attention_mask, output_attentions=output_attentions) + + if output_attentions: + out, weights = outputs + else: + out = outputs + + # if sequence_parallel is true, out shape are [bs, seq_len, num_head * head_dim / n] + # else their shape are [bs, q_len, num_head * head_dim / n], n is mp parallelism. + + if self.config.sequence_parallel: + bs, seq_len, dim = out.shape + out = out.reshape([bs * seq_len, dim]) # [bs, seq_len, dim / n] => [bs * seq_len, dim / n] + + # project to output + out = self.out_proj(out) + # if sequence_parallel is true, out shape are [bs * seq_len / n, dim] + # else their shape are [bs, seq_len, dim], n is mp parallelism. + outs = [out] + if output_attentions: + outs.append(weights) + if use_cache: + outs.append(past_key_value) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerDecoder(nn.Layer): + """ + TransformerDecoder is a stack of N decoder layers. 
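+ 
+ When `return_dict=False`, `forward` returns the final hidden states, optionally followed by the
+ key/value cache, the per-layer hidden states and the self-attention weights (only the requested
+ entries are included); with `return_dict=True` the same fields are wrapped in a
+ :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`.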
+ """ + + def __init__(self, config, decoder_layers, norm=None, hidden_size=None): + super(TransformerDecoder, self).__init__() + + self.config = config + self.layers = decoder_layers + self.norm = nn.LayerNorm(config.hidden_size, epsilon=1e-5, bias_attr=True) + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.norm.weight) + mark_as_sequence_parallel_parameter(self.norm.bias) + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module: nn.Layer, + hidden_states: paddle.Tensor, + past_key_value: paddle.Tensor, + attention_mask: paddle.Tensor, + use_cache: bool, + output_attentions: paddle.Tensor, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + # GPTDecoderLayer + # def forward( + # self, hidden_states, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + # ): + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + use_cache, + past_key_value, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states + + def forward( + self, + hidden_states, + attention_mask=None, + use_cache=False, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + + output = hidden_states + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + next_decoder_cache = () if use_cache else None + + pre_ipp = None + for i, decoder_layer in enumerate(self.layers): + if decoder_layer.ipp is not None and pre_ipp != decoder_layer.ipp: + output = dist.reshard(output, get_mesh(decoder_layer.ipp), [dist.Shard(0), dist.Replicate()]) + has_gradient = not output.stop_gradient + if self.enable_recompute and has_gradient and self.config.recompute_granularity == "full_attn": + outputs = self.recompute_training( + layer_module=decoder_layer, + hidden_states=output, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_value=None, + output_attentions=output_attentions, + ) + else: + outputs = decoder_layer( + output, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_value=past_key_values[i] if past_key_values is not None else None, + output_attentions=output_attentions, + ) + + # outputs = hidden_states if both use_cache and output_attentions are False + # Otherwise, outputs = (hidden_states, attention if output_attentions, cache if use_cache) + output = outputs[0] if (use_cache or output_attentions) else outputs + all_self_attentions = all_self_attentions + (outputs[1],) if output_attentions else None + all_hidden_states = all_hidden_states + (output,) if output_hidden_states else None + next_decoder_cache = next_decoder_cache + (outputs[-1],) if use_cache else None + pre_ipp = decoder_layer.ipp + + if self.norm is not None: + output = self.norm(output) + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + temp_list = [output, next_cache, all_hidden_states, 
all_self_attentions] + + if not (use_cache or output_attentions or output_hidden_states): + return output + + return tuple(v for v in temp_list if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=output, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=None, + ) + + +class GPTDecoderLayerAuto(nn.Layer): + """ + The transformer decoder layer. + + It contains multiheadattention and some linear layers. + """ + + def __init__(self, config: GPTConfig, ipp=None): + super(GPTDecoderLayerAuto, self).__init__() + self.config = config + self.ipp = ipp + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + if not FusedDropoutAdd: + config.use_fused_dropout_add = False + + self.self_attn = MultiHeadAttentionAuto(config, ipp) + + self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size, bias_attr=True) + self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size, bias_attr=True) + + self.linear1.weight = dist.shard_tensor(self.linear1.weight, get_mesh(ipp), [dist.Replicate(), dist.Shard(1)]) + self.linear2.weight = dist.shard_tensor(self.linear2.weight, get_mesh(ipp), [dist.Replicate(), dist.Shard(0)]) + + self.norm1 = nn.LayerNorm(config.hidden_size, epsilon=1e-5, bias_attr=True) + self.norm2 = nn.LayerNorm(config.hidden_size, epsilon=1e-5, bias_attr=True) + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.norm1.weight) + mark_as_sequence_parallel_parameter(self.norm1.bias) + mark_as_sequence_parallel_parameter(self.norm2.weight) + mark_as_sequence_parallel_parameter(self.norm2.bias) + + if config.use_fused_dropout_add: + self.fused_dropout_add1 = FusedDropoutAdd(config.attention_probs_dropout_prob, mode="upscale_in_train") + self.fused_dropout_add2 = FusedDropoutAdd(config.hidden_dropout_prob, mode="upscale_in_train") + else: + self.dropout1 = nn.Dropout(config.attention_probs_dropout_prob, mode="upscale_in_train") + self.dropout2 = nn.Dropout(config.hidden_dropout_prob, mode="upscale_in_train") + + if config.hidden_activation == "gelu": + self.activation = F.gelu + else: + self.activation = getattr(F, config.hidden_activation) + + def forward( + self, hidden_states, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + ): + # when sequence_parallel=True: + # hidden_states => [bs * seq_len / n, embed_dim] + residual = hidden_states + if self.config.normalize_before: + hidden_states = self.norm1(hidden_states) + + # self.self_attn: + # def forward( + # self, query, key, value, attention_mask=None, use_cache=False, past_key_value=None, output_attentions=False + # ): + # self.self_attn(...) 
--> hidden_states, weights, (past_key_value) + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and has_gradient and self.config.recompute_granularity == "full_attn": + hidden_states = recompute( + self.self_attn, + hidden_states, + None, + None, + attention_mask, + use_cache, + past_key_value, + output_attentions, + use_reentrant=False, + ) + else: + hidden_states = self.self_attn( + hidden_states, None, None, attention_mask, use_cache, past_key_value, output_attentions + ) + + # when sequence_parallel=True: + # hidden_states => [bs * seq_len / n, embed_dim] + incremental_cache = hidden_states[-1] if use_cache else None + attention_weights = hidden_states[1] if output_attentions else None + hidden_states = hidden_states[0] if (use_cache or output_attentions) else hidden_states + + # Use a ternary operator for a more concise assignment of current_seed + current_seed = "local_seed" if self.config.sequence_parallel else "global_seed" + + # The 'with' block ensures the correct seed context is used + with seed_guard_context(current_seed): + if self.config.use_fused_dropout_add: + hidden_states = self.fused_dropout_add1(hidden_states, residual) + else: + hidden_states = residual + self.dropout1(hidden_states) + + if not self.config.normalize_before: + hidden_states = self.norm1(hidden_states) + + residual = hidden_states + if self.config.normalize_before: + hidden_states = self.norm2(hidden_states) + + # when sequence_parallel=True: + # hidden_states => [bs * seq_len / n, embed_dim] + with seed_guard_context(current_seed): + if not self.config.use_fused_dropout_add: + act = self.activation(self.linear1(hidden_states), approximate=True) + l_2 = self.linear2(act) + hidden_states = residual + self.dropout2(l_2) + else: + hidden_states = self.fused_dropout_add2( + self.linear2(self.activation(self.linear1(hidden_states), approximate=True)), residual + ) + if not self.config.normalize_before: + hidden_states = self.norm2(hidden_states) + + if not (output_attentions or use_cache): + return hidden_states + + temp_list = [ + hidden_states, + attention_weights, + incremental_cache, + ] + + return tuple(v for v in temp_list if v is not None) + + +class GPTEmbeddingsAuto(nn.Layer): + """ + Include embeddings from word and position embeddings. 
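+ 
+ The returned embeddings are the sum of the word embeddings of `input_ids` (or the provided
+ `inputs_embeddings`) and the position embeddings of `position_ids`, followed by dropout; under
+ sequence parallelism the result is reshaped to `[batch_size * seq_len, hidden_size]` and
+ scattered across model-parallel ranks.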
+ """ + + def __init__( + self, + config, + ): + super(GPTEmbeddingsAuto, self).__init__() + + self.config = config + + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.hidden_size, + ) + + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, + config.hidden_size, + ) + self.word_embeddings.weight = dist.shard_tensor( + self.word_embeddings.weight, get_mesh(), [dist.Replicate(), dist.Shard(1)] + ) + self.position_embeddings.weight = dist.shard_tensor( + self.position_embeddings.weight, get_mesh(), [dist.Replicate(), dist.Shard(1)] + ) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, position_ids=None, inputs_embeddings=None): + if position_ids is None and inputs_embeddings is None: + raise ValueError("You have to specify either `inputs_embeddings` or `position_ids`)") + if position_ids is not None and inputs_embeddings is not None: + raise ValueError("You cannot specify both `inputs_embeddings` and `position_ids`)") + + # if input_ids is not None: + # input_shape = input_ids.shape + # inputs_embeddings = self.word_embeddings(input_ids) + + if input_ids is not None: + input_shape = input_ids.shape + inputs_embeddings = self.word_embeddings(input_ids) + else: + input_shape = inputs_embeddings.shape[:-1] + + if position_ids is None: + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + position_ids = seq_length - ones + + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeddings + position_embeddings + + if self.config.sequence_parallel: + bs, seq_len, hidden_size = embeddings.shape + # [bs, seq_len, dim] -> [bs * seq_len, dim] + embeddings = paddle.reshape_(embeddings, [bs * seq_len, hidden_size]) + # [bs * seq_len / n, dim] (n is mp parallelism) + embeddings = ScatterOp.apply(embeddings) + + # Use a ternary operator for a more concise assignment of current_seed + current_seed = "local_seed" if self.config.sequence_parallel else "global_seed" + # The 'with' block ensures the correct seed context is used + with seed_guard_context(current_seed): + embeddings = self.dropout(embeddings) + return embeddings + + +class GPTPretrainedModelAuto(PretrainedModel): + """ + An abstract class for pretrained GPT models. It provides GPT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
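+ 
+ In addition to the resource metadata above, this class defines the tensor-parallel split
+ mappings (:meth:`_get_tensor_parallel_mappings`) and the state-dict name mappings
+ (:meth:`_get_name_mappings`) used when loading GPT-2 style checkpoints (`h.*.attn.c_attn.*`,
+ `wte.weight`, ...).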
+ """ + + model_config_file = "model_config.json" + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "gpt" + config_class = GPTConfig + pretrained_init_configuration = GPT_PRETRAINED_INIT_CONFIGURATION + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + base_actions = { + # Column Linear + "layers.0.linear1.weight": partial(fn, is_column=True), + "layers.0.linear1.bias": partial(fn, is_column=True), + # Row Linear + "word_embeddings.weight": partial(fn, is_column=False), + "layers.0.self_attn.out_proj.weight": partial(fn, is_column=False), + "layers.0.linear2.weight": partial(fn, is_column=False), + } + + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.bias"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_name_mappings(cls, config: GPTConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["wte.weight", "embeddings.word_embeddings.weight"], + ["wpe.weight", "embeddings.position_embeddings.weight"], + ["ln_f.weight", "decoder.norm.weight"], + ["ln_f.bias", "decoder.norm.bias"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"h.{layer_index}.ln_1.weight", f"decoder.layers.{layer_index}.norm1.weight"], + [f"h.{layer_index}.ln_1.bias", f"decoder.layers.{layer_index}.norm1.bias"], + [f"h.{layer_index}.ln_2.weight", f"decoder.layers.{layer_index}.norm2.weight"], + [f"h.{layer_index}.ln_2.bias", f"decoder.layers.{layer_index}.norm2.bias"], + [f"h.{layer_index}.mlp.c_fc.weight", f"decoder.layers.{layer_index}.linear1.weight"], + [f"h.{layer_index}.mlp.c_fc.bias", f"decoder.layers.{layer_index}.linear1.bias"], + [f"h.{layer_index}.mlp.c_proj.weight", f"decoder.layers.{layer_index}.linear2.weight"], + [f"h.{layer_index}.mlp.c_proj.bias", f"decoder.layers.{layer_index}.linear2.bias"], + [f"h.{layer_index}.attn.c_proj.weight", f"decoder.layers.{layer_index}.self_attn.out_proj.weight"], + [f"h.{layer_index}.attn.c_proj.bias", f"decoder.layers.{layer_index}.self_attn.out_proj.bias"], + # attention + [ + f"h.{layer_index}.attn.c_attn.weight", + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + "split", + 0, + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + "split", + 0, + ], + [ + f"h.{layer_index}.attn.c_attn.weight", + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + "split", + 1, + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + "split", + 1, + ], + [ + f"h.{layer_index}.attn.c_attn.weight", + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + "split", + 2, + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + "split", + 2, + ], + ] + + model_mappings.extend(layer_mappings) + + # downstream mappings + if "GPT2Model" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "transformer." + mapping[0] + mapping[1] = "gpt." + mapping[1] + if "GPT2ForTokenClassification" in config.architectures: + model_mappings.extend([["classifier.weight", "classifier.weight", "transpose"]]) + if "GPT2ForSequenceClassification" in config.architectures: + model_mappings.extend([["score.weight", "score.weight", "transpose"]]) + if "GPT2LMHeadModel" in config.architectures: + model_mappings.append(["lm_head.weight", "lm_head.decoder.weight"]) + + mappings = [StateDictNameMapping(*mapping) for mapping in model_mappings] + return mappings + + +@register_base_model +class GPTModelAuto(GPTPretrainedModelAuto): + r""" + The bare GPT Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. 
+ + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `GPTModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `GPTModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer and decoder layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer decoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer decoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the decoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and decoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all decoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. Defaults to `16`. + + .. note:: + Please NOT using `type_vocab_size`, for it will be obsolete in the future.. + + initializer_range (float, optional): + The standard deviation of the normal initializer. Default to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`GPTPretrainedModelAuto._init_weights()` for how weights are initialized in `GPTModelAuto`. + + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. 
+ + """ + + def __init__(self, config: GPTConfig): + super(GPTModelAuto, self).__init__(config) + + self.config = config + + self.pad_token_id = config.pad_token_id + self.eos_token_id = config.eos_token_id + self.bos_token_id = config.bos_token_id + self.eol_token_id = config.eol_token_id + self.vocab_size = config.vocab_size + + self.bias = paddle.tril( + paddle.ones([1, 1, config.max_position_embeddings, config.max_position_embeddings], dtype="int64") + ) + + self.embeddings = GPTEmbeddingsAuto(config) + + decoder_layers = nn.LayerList() + for i in range(config.num_hidden_layers): + decoder_layers.append(GPTDecoderLayerAuto(config, self.get_layer_ipp(i))) + + self.decoder = TransformerDecoder( + config, + decoder_layers, + ) + + def get_layer_ipp(self, layer_index): + mesh = fleet.auto.get_mesh() + if "pp" not in mesh.dim_names: + return None + else: + pp_degree = mesh.get_dim_size("pp") + layer_per_stage = math.ceil(self.config.num_hidden_layers / pp_degree) + return layer_index // layer_per_stage + + def get_last_layer_ipp(self): + return self.get_layer_ipp(self.config.num_hidden_layers - 1) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length + ) + # NOTE(zhaoyingli): infer spmd does not support [seq_len, seq_len] --> [batch, 1, seq_len, seq_len] in data_parallel + combined_attention_mask = dist.shard_tensor( + combined_attention_mask, + get_mesh(), + [dist.Replicate(), dist.Replicate()], + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The GPTModelAuto forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to None. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. 
+                Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`.
+            attention_mask (Tensor, optional):
+                Mask used in self attention to avoid performing attention to some unwanted positions,
+                usually the subsequent positions.
+                It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.
+                For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],
+                [batch_size, num_attention_heads, sequence_length, sequence_length].
+                Its data type should be int64.
+                The `masked` tokens have `0` values, and the `unmasked` tokens have `1` values.
+                Defaults to `None`, which means no positions are masked out.
+            inputs_embeds (Tensor, optional):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation
+                of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over
+                how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+                Defaults to `None`.
+            use_cache (bool, optional):
+                Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and
+                can be used to speed up decoding.
+            past_key_values (list, optional):
+                It is only used for inference and should be None for training.
+                Defaults to `None`.
+            output_attentions (bool, optional):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+                tensors for more detail. Defaults to `False`.
+            output_hidden_states (bool, optional):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+                more detail. Defaults to `False`.
+            return_dict (bool, optional):
+                Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output
+                will be a tuple of tensors. Defaults to `False`.
+
+        Returns:
+            An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if
+            `return_dict=True`. Otherwise it returns a tuple of tensors corresponding
+            to ordered and not None (depending on the input arguments) fields of
+            :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`.
+
+            Especially, when `return_dict=output_hidden_states=output_attentions=False`,
+            returns tensor `outputs` which is the output at the last layer of the model.
+            Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size].
+
+        Example:
+            ..
code-block:: + + import paddle + from paddlenlp.transformers import GPTModelAuto, GPTTokenizer + + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + model = GPTModelAuto.from_pretrained('gpt2-medium-en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + if self.config.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape((-1, input_shape[-1])) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + # input_shape => bs, seq_len + + if past_key_values is None: + past_key_values = tuple([None] * len(self.decoder.layers)) + + if position_ids is None: + past_length = 0 + if past_key_values[0] is not None: + # bs, seq_len, num_head, head_dim + past_length = past_key_values[0][0].shape[1] + position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") + position_ids = position_ids.unsqueeze(0) + position_ids = paddle.expand(position_ids, input_shape) + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, inputs_embeddings=inputs_embeds + ) + # TODO, use registered buffer + length = input_shape[-1] + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + length = length + cache_length + else: + cache_length = 0 + + causal_mask = self.bias[:, :, cache_length:length, :length] + if attention_mask is not None: + if attention_mask.dtype != paddle.int64: + attention_mask = paddle.cast(attention_mask, dtype=paddle.int64) + if len(attention_mask.shape) == 2: + attention_mask = attention_mask[:, None, None, :] + attention_mask = (1.0 - (attention_mask & causal_mask)) * -1e4 + else: + attention_mask = (1.0 - causal_mask) * -1e4 + + # The tensor returned by triu not in static graph. + attention_mask.stop_gradient = True + + outputs = self.decoder( + embedding_output, + attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + if output_hidden_states: + if return_dict: + outputs.hidden_states = (embedding_output,) + outputs.hidden_states + else: # outputs is a tuple + idx = 2 if use_cache else 1 + all_hidden_states = (embedding_output,) + outputs[idx] + outputs[idx] = all_hidden_states + + return outputs + + +class GPTPretrainingCriterionAuto(paddle.nn.Layer): + """ + Criterion for GPT. It calculates the final loss. + """ + + def __init__(self, config): + super(GPTPretrainingCriterionAuto, self).__init__() + self.config = config + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=config.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels, loss_mask=None): + """ + Args: + prediction_scores(Tensor): + The logits of masked token prediction. Its data type should be float32 and + its shape is [batch_size, sequence_length, vocab_size]. 
+ masked_lm_labels(Tensor): + The labels of the masked language modeling, the dimensionality of `masked_lm_labels` + is equal to `prediction_scores`. Its data type should be int64 and + its shape is [batch_size, sequence_length, 1]. + loss_mask(Tensor): + Mask used for calculating the loss of the masked language modeling to avoid + calculating some unwanted tokens. + Its data type should be float32 and its shape is [batch_size, sequence_length, 1]. + + Returns: + Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. + + """ + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32") + loss = paddle.mean(masked_lm_loss) + return loss + + +class GPTLMHeadAuto(nn.Layer): + def __init__(self, config: GPTConfig, embedding_weights=None, ipp=None): + super(GPTLMHeadAuto, self).__init__() + self.config = config + self.transpose_y = True + self.ipp = ipp + + if embedding_weights is not None: + self.transpose_y = True + self.weight = embedding_weights + else: + if config.tensor_parallel_degree > 1: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 0 + + def forward(self, hidden_states, tensor_parallel_output=None): + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + y = dist.reshard(self.weight, get_mesh(self.ipp), [dist.Replicate(), dist.Shard(0)]) + logits = paddle.matmul(hidden_states, y, transpose_y=self.transpose_y) + return logits + + +class GPTForCausalLMAuto(GPTPretrainedModelAuto): + """ + The GPT Model with a `language modeling` head on top. + + Args: + gpt (:class:`GPTModelAuto`): + An instance of :class:`GPTModelAuto`. + + """ + + def __init__(self, config: GPTConfig): + super(GPTForCausalLMAuto, self).__init__(config) + self.gpt = GPTModelAuto(config) + self.ipp = self.gpt.get_last_layer_ipp() + self.lm_head = GPTLMHeadAuto( + config, embedding_weights=self.gpt.embeddings.word_embeddings.weight, ipp=self.ipp + ) + + self.tie_weights() + self.criterion = GPTPretrainingCriterionAuto(config) + + def get_output_embeddings(self): + return self.lm_head + + def get_input_embeddings(self): + return self.gpt.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + past_key_values=None, + labels=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + + Args: + input_ids (Tensor, optional): + See :class:`GPTModelAuto`. + position_ids (Tensor, optional): + See :class:`GPTModelAuto`. + attention_mask (Tensor, optional): + See :class:`GPTModelAuto`. + inputs_embeds (Tensor, optional): + See :class:`GPTModelAuto`. + use_cache (bool, optional): + See :class:`GPTModelAuto`. + past_key_values (Tensor, optional): + See :class:`GPTModelAuto`. 
+ labels (paddle.Tensor, optional): + A Tensor of shape `(batch_size, sequence_length)`. + Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]` + Defaults to None. + output_attentions (bool, optional): + See :class:`GPTModelAuto`. + output_hidden_states (bool, optional): + See :class:`GPTModelAuto`. + return_dict (bool, optional): + See :class:`GPTModelAuto`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + + Especialy, when `return_dict=use_cache=output_attentions=output_hidden_states=False`, + returns a tensor `logits` which is the output of the gpt model. + """ + input_type = type(input_ids) if input_ids is not None else type(inputs_embeds) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + outputs = self.gpt( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(outputs, input_type): + hidden_states = outputs + else: + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + return logits + + # NOTE: The following code failed to run from dynamic to static mode + # loss = None + # if labels is not None: + # loss = self.criterion(logits, labels) + # if not return_dict: + # if isinstance(outputs, input_type): + # return (loss, logits) if loss is not None else logits + # outputs = (logits,) + outputs[1:] + # return ((loss,) + outputs) if loss is not None else outputs + # return CausalLMOutputWithCrossAttentions( + # loss=loss, + # logits=logits, + # past_key_values=outputs.past_key_values, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + # cross_attentions=outputs.cross_attentions, + # ) + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterGPT + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decode_strategy = kwargs.get("decode_strategy") + if decode_strategy == "beam_search": + raise AttributeError("'beam_search' is not supported yet in the fast version of GPT") + # Currently, FasterTransformer only support restricted size_per_head. 
+ size_per_head = self.gpt.config["hidden_size"] // self.gpt.config["num_attention_heads"] + if size_per_head not in [32, 64, 80, 96, 128]: + raise AttributeError( + "'size_per_head = %d' is not supported yet in the fast version of GPT" % size_per_head + ) + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + if kwargs["min_length"] != 0: + # not support for min_length yet in the fast version + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + self._fast_entry = FasterGPT(self, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def prepare_inputs_for_generation(self, input_ids, use_cache=False, past_key_values=None, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + position_ids = kwargs.get("position_ids", None) + # attention_mask = kwargs.get("attention_mask", None) + if past_key_values is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + if position_ids is not None: + position_ids = position_ids[:, -1].unsqueeze(-1) + return { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": None, + "use_cache": use_cache, + "past_key_values": past_key_values, + } + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and float(paddle.any(input_ids == pad_token_id)) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + else: + attention_mask = paddle.ones_like(input_ids, dtype="int64") + return paddle.unsqueeze(attention_mask, axis=[1, 2]) + + +GPTLMHeadModelAuto = GPTForCausalLMAuto diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_pp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_pp.py new file mode 100644 index 000000000..7734e8a99 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/modeling_pp.py @@ -0,0 +1,231 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
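A minimal sketch of how `GPTForCausalLMAuto.prepare_attention_mask_for_generation` (defined above) behaves on a left-padded batch, assuming the class is in scope; the pad id `0` and eos id `1` below are illustrative values only, not values taken from this model:

    import paddle

    # Hypothetical batch in which id 0 acts as the pad token (illustration only).
    input_ids = paddle.to_tensor([[0, 0, 5, 6], [7, 8, 9, 10]])
    mask = GPTForCausalLMAuto.prepare_attention_mask_for_generation(
        input_ids, pad_token_id=0, eos_token_id=1
    )
    print(mask.shape)  # [2, 1, 1, 4]; padded positions are 0, real tokens are 1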
+import paddle +import paddle.distributed.fleet as fleet +from paddle.distributed.fleet.meta_parallel import ( + LayerDesc, + PipelineLayer, + SharedLayerDesc, +) +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from paddlenlp.transformers.model_utils import PipelinePretrainedModel + +from .modeling import ( + GPTConfig, + GPTDecoderLayer, + GPTEmbeddings, + GPTLayerNorm, + GPTLMHead, + GPTPretrainedModel, + GPTPretrainingCriterion, +) + +__all__ = [ + "GPTForCausalLMPipe", +] + + +def get_hcg(): + return fleet.get_hybrid_communicate_group() + + +def get_attr(layer, name): + if getattr(layer, name, None) is not None: + return getattr(layer, name, None) + else: + return get_attr(layer._layer, name) + + +def parse_args(args): + if isinstance(args, tuple): + if len(args) == 3: + hidden_states, attention_mask, position_ids = args + elif len(args) == 2: + hidden_states, attention_mask = args + position_ids = None + else: + hidden_states = args + attention_mask, position_ids = None, None + + if position_ids is not None: + position_ids.stop_gradient = True + + if attention_mask is not None: + attention_mask.stop_gradient = True + + return hidden_states, attention_mask, position_ids + + +def return_args(hidden_states, attention_mask=None, position_ids=None): + ret = (hidden_states,) + + if attention_mask is not None: + ret += (attention_mask.clone(),) + if position_ids is not None: + ret += (position_ids.clone(),) + if len(ret) == 1: + ret = ret[0] + + return ret + + +class GPTEmbeddingPipe(GPTEmbeddings): + """Extends GPTEmbeddings to forward attention_mask through the pipeline.""" + + def __init__(self, config): + super(GPTEmbeddingPipe, self).__init__(config) + self.bias = paddle.tril( + paddle.ones([1, 1, config.max_position_embeddings, config.max_position_embeddings], dtype="int64") + ) + + @property + def embedding_weight(self): + return get_attr(self.word_embeddings, "weight") + + def forward(self, args): + input_ids, attention_mask, position_ids = parse_args(args) + input_ids.stop_gradient = True + embeddings = super().forward(input_ids=input_ids, position_ids=position_ids) + + batch_size, seq_length = input_ids.shape + if attention_mask is not None: + if attention_mask.dtype != paddle.int64: + attention_mask = paddle.cast(attention_mask, dtype=paddle.int64) + if len(attention_mask.shape) == 2: + attention_mask = attention_mask[:, None, None, :] + causal_mask = self.bias[:, :, 0:seq_length, :seq_length] + attention_mask = (1.0 - (attention_mask & causal_mask)) * -1e4 + + return return_args(embeddings, attention_mask, position_ids) + + +class GPTDecoderLayerPipe(GPTDecoderLayer): + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + if self.enable_recompute and self.config.recompute_granularity == "full": + hidden_states = recompute(super().forward, hidden_states, attention_mask) + else: + hidden_states = super().forward(hidden_states, attention_mask) + + return return_args(hidden_states, attention_mask, position_ids) + + +class LayerNormPipe(GPTLayerNorm): + def __init__(self, config): + super(LayerNormPipe, self).__init__(config, config.hidden_size, epsilon=1e-05) + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + mark_as_sequence_parallel_parameter(self.bias) + + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + hidden_states = 
super().forward(hidden_states) + return hidden_states + + +class GPTLMHeadPipe(GPTLMHead): + def __init__(self, config): + super(GPTLMHeadPipe, self).__init__(config) + + @property + def embedding_weight(self): + return get_attr(self, "weight") + + +class GPTForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): + """LlamaForPretraining adapted for pipeline parallelism. + + The largest change is flattening the LlamaModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. + """ + + config_class = GPTConfig + + _get_tensor_parallel_mappings = GPTPretrainedModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = GPTPretrainedModel._get_fuse_or_split_param_mappings + _init_weights = GPTPretrainedModel._init_weights + + pretrained_init_configuration = GPTPretrainedModel.pretrained_init_configuration + pretrained_resource_files_map = GPTPretrainedModel.pretrained_resource_files_map + + # NO base_model_prefix !!!! + + def __init__( + self, + config, + pp_recompute_interval=1, + ): + self.config = config + + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + + hcg = get_hcg() + tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) + tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) + + config.tensor_parallel_degree = tensor_parallel_degree + config.tensor_parallel_rank = tensor_parallel_rank + + self.add_sequential_layer( + SharedLayerDesc( + "gpt_shared_weight", GPTEmbeddingPipe, shared_weight_attr="embedding_weight", config=config + ), + "gpt.embeddings", + ) + for i in range(config.num_hidden_layers): + self.add_sequential_layer( + LayerDesc(GPTDecoderLayerPipe, config=config), + f"gpt.decoder.layers.{i}", + ) + + self.add_sequential_layer(LayerDesc(LayerNormPipe, config=config), "gpt.decoder.norm") + self.add_sequential_layer( + SharedLayerDesc("gpt_shared_weight", GPTLMHeadPipe, shared_weight_attr="embedding_weight", config=config), + "gpt.embeddings.word_embeddings", + ) + + recompute_interval = 0 + # if self.config.recompute and recompute_granularity == "full": + # assert pp_recompute_interval <= config.num_hidden_layers // ( + # virtual_pp_degree * get_hcg().topology().get_dim_size("pipe") + # ), "pp recompute interval should smaller than num layers of each pp chunk" + # recompute_interval = pp_recompute_interval + + seg_method = "layer:GPTDecoderLayer" + if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: + seg_method = "uniform" + + PipelineLayer.__init__( + self, + layers=self.get_sequential_layers(), + loss_fn=GPTPretrainingCriterion(config), + topology=get_hcg().topology(), + seg_method=seg_method, + recompute_interval=recompute_interval, + recompute_ctx={ + "mp_group": get_hcg().get_model_parallel_group(), + "offload": False, + "partition": False, + }, + num_virtual_pipeline_stages=virtual_pp_degree, + ) + self.apply(self._init_weights) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/tokenizer.py new file mode 100644 index 000000000..bb0876e2d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gpt/tokenizer.py @@ -0,0 +1,637 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +from functools import lru_cache +from typing import Dict, Optional, Union + +import jieba +import numpy as np +import sentencepiece as spm +from paddle.utils import try_import + +from .. import AddedToken, PretrainedTokenizer +from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy + +__all__ = [ + "GPTTokenizer", + "GPTChineseTokenizer", +] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "gpt-cpm-large-cn": 1024, + "gpt-cpm-small-cn-distill": 1024, + "gpt3-175B-en": 1024, + "gpt3-89B-en": 1024, + "gpt3-13B-en": 1024, + "gpt3-6.7B-en": 1024, + "gpt3-1.3B-en": 1024, + "gpt2-xl-en": 1024, + "gpt2-large-en": 1024, + "gpt2-medium-en": 1024, + "gpt2-en": 1024, + "gpt2-small-en": 1024, +} + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPTChineseTokenizer(PretrainedTokenizer): + """ + Constructs a GPT Chinese tokenizer based on `SentencePiece `__. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file required to instantiate + a `SentencePiece `__ tokenizer. + max_len (int): + The maximum value of the input sequence length. + Defaults to `512`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import GPTChineseTokenizer + + tokenizer = GPTChineseTokenizer.from_pretrained('gpt-cpm-large-cn') + print(tokenizer('欢迎使用百度飞桨!')) + ''' + {'input_ids': [2092, 260, 1014, 1596, 17620, 45], 'token_type_ids': [0, 0, 0, 0, 0, 0]} + ''' + """ + + resource_files_names = {"model_file": "sentencepiece.model"} # for save_pretrained + + cpm_model_link = "https://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-cpm-cn-sentencepiece.model" + pretrained_resource_files_map = { + "model_file": { + "gpt-cpm-large-cn": cpm_model_link, + "gpt-cpm-small-cn-distill": cpm_model_link, + } + } + pretrained_init_configuration = { + "gpt-cpm-large-cn": {}, + "gpt-cpm-small-cn-distill": {}, + } + + def __init__( + self, + model_file, + max_len=512, + unk_token="", + bos_token="", + eos_token="", + eol_token="\u2583", + **kwargs # The token of newline. + ): + self._model_file = model_file + self.eol_token = eol_token + if not os.path.isfile(model_file): + raise ValueError( + "Can't find a model file at path '{}'. To load the " + "model from a pretrained model please use " + "`tokenizer = GPTTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(model_file) + ) + self.max_len = max_len if max_len is not None else int(1e12) + self.sp = spm.SentencePieceProcessor() + self.sp.Load(model_file) + self.translator = str.maketrans(" \n", "\u2582\u2583") + + @property + def eol_token_id(self): + if self.eol_token is None: + return None + return self.convert_tokens_to_ids(self.eol_token) + + def _tokenize(self, text): + """Tokenize a string.""" + seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)] + new_seg = " ".join(seg_list) + return self.sp.encode(new_seg, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) to an id using the vocab.""" + return self.sp.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.sp.IdToPiece(index) + + ''' + def convert_tokens_to_ids(self, tokens): + """ + Converts a single token or a sequence of tokens to an index or a + sequence of indices. + + Args: + tokens (str|List[str]|tuple(str)): + A single token or a sequence of tokens. + + Returns: + int|List[int]: The converted token id or token ids. + + Example: + .. code-block:: + + from paddlenlp.transformers import GPTChineseTokenizer + + tokenizer = GPTChineseTokenizer.from_pretrained('gpt-cpm-large-cn') + print(tokenizer.convert_tokens_to_ids(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'])) + # [2092, 260, 1014, 1596, 17620, 45] + """ + + if not isinstance(tokens, (list, tuple)): + return self._convert_token_to_id(tokens) + else: + return [self._convert_token_to_id(token) for token in tokens] + ''' + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ + Converts a single index or a sequence of indices to a token or a + sequence of tokens. + + Args: + ids (int|List[int]|tuple(int)): + The token id (or token ids) to be converted to token(s). + + Returns: + str|List[str]: The converted token or sequence of tokens. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import GPTChineseTokenizer + + tokenizer = GPTChineseTokenizer.from_pretrained('gpt-cpm-large-cn') + print(tokenizer.convert_ids_to_tokens([2092, 260, 1014, 1596, 17620, 45])) + #['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'] + + """ + + if not isinstance(ids, (list, tuple)): + return self._convert_id_to_token(ids) + tokens = [self._convert_id_to_token(_id) for _id in ids] + return tokens + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The size of vocabulary. + + Example: + .. code-block:: + + from paddlenlp.transformers import GPTChineseTokenizer + tokenizer = GPTChineseTokenizer.from_pretrained('gpt-cpm-large-cn') + print(tokenizer.vocab_size) + # 50257 + + """ + return len(self.sp) + + def get_vocab(self): + """ + Returns the vocabulary as a dictionary of token to index. + + `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the + vocab. + + Returns: + `Dict[str, int]`: The vocabulary. + """ + return dict({self.sp.IdToPiece(i): i for i in range(self.sp.GetPieceSize())}, **self.added_tokens_encoder) + + def convert_ids_to_string(self, ids): + """ + Converts a single index or a sequence of indices to texts. + + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + + Returns: + str: The decoded text. + + Example: + .. code-block:: + + from paddlenlp.transformers import GPTChineseTokenizer + tokenizer = GPTChineseTokenizer.from_pretrained('gpt-cpm-large-cn') + print(tokenizer.convert_ids_to_string([2092, 260, 1014, 1596, 17620, 45])) + # '欢迎使用百度飞桨!' + + """ + + text = self.sp.decode(ids) + text = text.replace(" ", "").replace("\u2582", " ").replace("\u2583", "\n") + return text + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to files under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + shutil.copyfile(getattr(self, "_%s" % name), save_path) + + +class GPTTokenizer(PretrainedTokenizer): + """ + Constructs a GPT tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocab file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import GPTTokenizer + + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + print(tokenizer('Welcome to use PaddlePaddle and PaddleNLP')) + + ''' + {'input_ids': [14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} # for save_pretrained + gpt_vocab_link = "http://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-en-vocab.json" + gpt_merges_link = "http://bj.bcebos.com/paddlenlp/models/transformers/gpt/gpt-en-merges.txt" + pretrained_resource_files_map = { + "vocab_file": { + "gpt3-175B-en": gpt_vocab_link, + "gpt3-89B-en": gpt_vocab_link, + "gpt3-13B-en": gpt_vocab_link, + "gpt3-6.7B-en": gpt_vocab_link, + "gpt3-1.3B-en": gpt_vocab_link, + "gpt2-xl-en": gpt_vocab_link, + "gpt2-large-en": gpt_vocab_link, + "gpt2-medium-en": gpt_vocab_link, + "gpt2-en": gpt_vocab_link, + "gpt2-small-en": gpt_vocab_link, + }, + "merges_file": { + "gpt3-175B-en": gpt_merges_link, + "gpt3-89B-en": gpt_merges_link, + "gpt3-13B-en": gpt_merges_link, + "gpt3-6.7B-en": gpt_merges_link, + "gpt3-1.3B-en": gpt_merges_link, + "gpt2-xl-en": gpt_merges_link, + "gpt2-large-en": gpt_merges_link, + "gpt2-medium-en": gpt_merges_link, + "gpt2-en": gpt_merges_link, + "gpt2-small-en": gpt_merges_link, + }, + } + pretrained_init_configuration = { + "gpt3-175B-en": {}, + "gpt3-89B-en": {}, + "gpt3-13B-en": {}, + "gpt3-6.7B-en": {}, + "gpt3-1.3B-en": {}, + "gpt2-xl-en": {}, + "gpt2-large-en": {}, + "gpt2-medium-en": {}, + "gpt2-en": {}, + "gpt2-small-en": {}, + } + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + max_len=None, + pad_token="<|endoftext|>", + eos_token="<|endoftext|>", + unk_token="<|endoftext|>", + eol_token="\u010a", + add_prefix_space=False, + add_bos_token=False, + **kwargs # The token of newline. 
+ ): + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + self.eol_token = eol_token + self._build_special_tokens_map_extended( + bos_token=pad_token if getattr(self, "bos_token", None) is None else self.bos_token, + eos_token=eos_token, + unk_token=unk_token, + ) + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.max_len = max_len if max_len is not None else int(1e12) + self.num_command_tokens = 2 + self.num_type_tokens = 2 + + with open(vocab_file, "r", encoding="utf-8") as f: + self.encoder = json.load(f) + + self.decoder = {v: k for k, v in self.encoder.items()} + + self.num_tokens = len(self.encoder) + self.num_text_tokens = self.num_tokens - 1 + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + + with open(merges_file, encoding="utf-8") as f: + bpe_data = f.read().split("\n")[1:-1] + + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.add_bos_token = add_bos_token + + re = try_import("regex") + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + + return len(self.encoder) + + @property + def eol_token_id(self): + if self.eol_token is None: + return None + return self.convert_tokens_to_ids(self.eol_token) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + return self.decoder[index] + + def convert_ids_to_string(self, ids): + """ + Converts a single index or a sequence of indices to texts. + + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + + Returns: + str: The decoded text. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import GPTTokenizer + tokenizer = GPTTokenizer.from_pretrained('gpt2-medium-en') + print(tokenizer.convert_ids_to_string(tokenizer.convert_ids_to_string([14618, 284, 779, 350, 37382, 47, 37382, 290, 350, 37382, 45, 19930])) + # 'Welcome to use PaddlePaddle and PaddleNLP' + + """ + + text = "".join([self.decoder[id] for id in ids]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_resources(self, save_directory): + """ + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (string) in a single string. + """ + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is None: + return output + + return output + bos_token_ids + token_ids_1 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/configuration.py new file mode 100644 index 000000000..bac54435a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/configuration.py @@ -0,0 +1,145 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
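A minimal sketch of the branch in `GPTTokenizer._pad` above that handles a 3-D attention mask: the mask is left-padded along both sequence axes using the same `np.pad` arguments; the sizes below (a 3-token mask padded to an assumed max length of 5) are illustrative assumptions:

    import numpy as np

    mask = np.ones((1, 3, 3), dtype="int64")  # [1, seq_len, seq_len] mask for 3 tokens
    difference = 2  # assumed max_length of 5 minus the current length of 3
    padded = np.pad(
        mask,
        pad_width=[(0, 0), (difference, 0), (difference, 0)],
        mode="constant",
        constant_values=0,
    )
    print(padded.shape)  # (1, 5, 5); the first two rows and columns are zeros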
+""" GPT-J model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +GPTJ_PRETRAINED_INIT_CONFIGURATION = { + "EleutherAI/gpt-j-6B": { + "vocab_size": 50400, + "bos_token_id": 50256, + "pad_token_id": 50256, + "eos_token_id": 50256, + "n_embd": 4096, + "n_layer": 28, + "n_head": 16, + "n_positions": 2048, + "attn_pdrop": 0.0, + "resid_pdrop": 0.0, + "embd_pdrop": 0.0, + "rotary_dim": 64, + "activation_function": "gelu_new", + "layer_norm_epsilon": 1e-05, + "initializer_range": 0.02, + "init_class": "GPTJModel", + }, +} + +GPTJ_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "EleutherAI/gpt-j-6B": "https://paddlenlp.bj.bcebos.com/models/community/EleutherAI/gpt-j-6B/model_state.pdparams", + } +} + + +class GPTJConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GPTJModel`]. It is used to instantiate a GPT-J + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the GPT-J + EleutherAI/gpt-j-6B architecture. Configuration objects inherit from + [`PretrainedConfig`] and can be used to control the model outputs. + Args: + vocab_size (`int`, *optional*, defaults to 50400): + Vocabulary size of the GPT-J model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPTJModel`]. + n_positions (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 4096): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 28): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + rotary_dim (`int`, *optional*, defaults to 64): + Number of dimensions in the embedding that Rotary Position Embedding is applied to. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu_new"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`int`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ Example: + ```python + >>> from paddlenlp.transformers import GPTJModel, GPTJConfig + >>> # Initializing a GPT-J 6B configuration + >>> configuration = GPTJConfig() + >>> # Initializing a model from the configuration + >>> model = GPTJModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "gptj" + attribute_map = { + "max_position_embeddings": "n_positions", + "hidden_size": "n_embd", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + "embed_dim": "n_embd", + } + + def __init__( + self, + vocab_size=50400, + n_positions=2048, + n_embd=4096, + n_layer=28, + n_head=16, + rotary_dim=64, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.0, + embd_pdrop=0.0, + attn_pdrop=0.0, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + tie_word_embeddings=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.rotary_dim = rotary_dim + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/modeling.py new file mode 100644 index 000000000..df8ea5e7f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/modeling.py @@ -0,0 +1,799 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The EleutherAI Authors and The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional, Tuple, Union + +import paddle +import paddle.nn as nn +from paddle.nn import Layer + +from .. 
import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, +) +from .configuration import GPTJConfig + +__all__ = [ + "GPTJModel", + "GPTJPretrainedModel", + "GPTJForCausalLM", + "GPTJForSequenceClassification", + "GPTJForQuestionAnswering", +] + + +def fixed_pos_embedding(x, seq_dim=1, seq_len=None): + dim = x.shape[-1] + if seq_len is None: + seq_len = x.shape[seq_dim] + inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2) / dim)) + sinusoid_inp = paddle.einsum("i , j -> i j", paddle.arange(seq_len, dtype="float32"), inv_freq) + return paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp) + + +def rotate_every_two(x): + x1 = x[:, :, :, ::2] + x2 = x[:, :, :, 1::2] + x = paddle.stack((-x2, x1), axis=-1) + # In einsum notation: rearrange(x, '... d j -> ... (d j)') + return x.flatten(-2) + + +def duplicate_interleave(m): + return paddle.repeat_interleave(m, 2, axis=1) + + +def apply_rotary_pos_emb(x, sincos, offset=0): + sin, cos = map(lambda t: duplicate_interleave(t)[None, offset : x.shape[1] + offset, None, :], sincos) + # einsum notation for lambda t: repeat(t[offset:x.shape[1]+offset,:], "n d -> () n () (d j)", j=2) + return (x * cos) + (rotate_every_two(x) * sin) + + +class GPTJAttention(Layer): + def __init__(self, config: GPTJConfig): + super().__init__() + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + paddle.tril(paddle.ones((max_positions, max_positions), dtype=paddle.get_default_dtype())).reshape( + (1, 1, max_positions, max_positions) + ), + ) + self.register_buffer("masked_bias", paddle.to_tensor(-1e9)) + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.embed_dim = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_attention_heads + if self.head_dim * self.num_attention_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and" + f" `num_attention_heads`: {self.num_attention_heads})." 
+ ) + self.scale_attn = paddle.sqrt(paddle.to_tensor(self.head_dim, dtype="float32")) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias_attr=False) + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias_attr=False) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias_attr=False) + + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias_attr=False) + self.rotary_dim = None + if config.rotary_dim is not None: + self.rotary_dim = config.rotary_dim + + def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary): + """ + Splits hidden dim into attn_head_size and num_attention_heads + """ + new_shape = tensor.shape[:-1] + [num_attention_heads, attn_head_size] + tensor = tensor.reshape(new_shape) + if rotary: + return tensor + if len(tensor.shape) == 5: + + return tensor.transpose([0, 1, 3, 2, 4]) # (batch, blocks, head, block_length, head_features) + elif len(tensor.shape) == 4: + return tensor.transpose([0, 2, 1, 3]) # (batch, head, seq_length, head_features) + else: + raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}") + + def _merge_heads(self, tensor, num_attention_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden dim + """ + if len(tensor.shape) == 5: + tensor = tensor.transpose([0, 1, 3, 2, 4]) + elif len(tensor.shape) == 4: + tensor = tensor.transpose([0, 2, 1, 3]) + else: + raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}") + new_shape = tensor.shape[:-2] + [num_attention_heads * attn_head_size] + return tensor.reshape(new_shape) + + def _attn( + self, + query, + key, + value, + attention_mask=None, + ): + # compute causal mask from causal mask buffer + query_length, key_length = query.shape[-2], key.shape[-2] + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length] + + # Keep the attention weights computation in fp32 to avoid overflow issues + query = paddle.cast(query, "float32") + key = paddle.cast(key, "float32") + + attn_weights = paddle.matmul(query, key, transpose_y=True) + + if attn_weights.dtype == paddle.float16: + mask_value = -65504.0 # smallest representable value for float16 + else: + mask_value = -1e9 # default value used + mask_value = paddle.to_tensor(mask_value, dtype=attn_weights.dtype) + + # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`. 
+ # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device` + mask_value = paddle.to_tensor(mask_value, dtype=attn_weights.dtype, place=attn_weights.place) + attn_weights = paddle.where(causal_mask, attn_weights, mask_value) + + attn_weights = attn_weights / self.scale_attn + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask.astype(attn_weights.dtype) + + attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1) + attn_weights = attn_weights.astype(value.dtype) + + attn_weights = self.attn_dropout(attn_weights) + + attn_output = paddle.matmul(attn_weights, value) + + return attn_output, attn_weights + + def forward( + self, + hidden_states: Optional[paddle.Tensor], + attention_mask: Optional[paddle.Tensor] = None, + layer_past: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Union[ + Tuple[paddle.Tensor, Tuple[paddle.Tensor]], + Optional[Tuple[paddle.Tensor, Tuple[paddle.Tensor], Tuple[paddle.Tensor, ...]]], + ]: + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = self._split_heads(query, self.num_attention_heads, self.head_dim, True) + key = self._split_heads(key, self.num_attention_heads, self.head_dim, True) + value = self._split_heads(value, self.num_attention_heads, self.head_dim, False) + + seq_len = key.shape[1] + offset = 0 + + if layer_past is not None: + offset = layer_past[0].shape[-2] + seq_len += offset + + if self.rotary_dim is not None: + k_rot = key[:, :, :, : self.rotary_dim] + k_pass = key[:, :, :, self.rotary_dim :] + + q_rot = query[:, :, :, : self.rotary_dim] + q_pass = query[:, :, :, self.rotary_dim :] + + sincos = fixed_pos_embedding(k_rot, 1, seq_len=seq_len) + k_rot = apply_rotary_pos_emb(k_rot, sincos, offset=offset) + q_rot = apply_rotary_pos_emb(q_rot, sincos, offset=offset) + + key = paddle.concat([k_rot, k_pass], axis=-1) + query = paddle.concat([q_rot, q_pass], axis=-1) + else: + sincos = fixed_pos_embedding(key, 1, seq_len=seq_len) + key = apply_rotary_pos_emb(key, sincos, offset=offset) + query = apply_rotary_pos_emb(query, sincos, offset=offset) + + key = key.transpose([0, 2, 1, 3]) + query = query.transpose([0, 2, 1, 3]) + + if layer_past is not None: + past_key = layer_past[0] + past_value = layer_past[1] + key = paddle.concat((past_key, key), axis=-2) + value = paddle.concat((past_value, value), axis=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + # compute self-attention: V x Softmax(QK^T) + attn_output, attn_weights = self._attn(query, key, value, attention_mask) + + attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +class GPTJMLP(Layer): + def __init__(self, intermediate_size, config): # in MLP: intermediate_size= 4 * embed_dim + super().__init__() + embed_dim = config.n_embd + + self.fc_in = nn.Linear(embed_dim, intermediate_size) + self.fc_out = nn.Linear(intermediate_size, embed_dim) + + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states: Optional[paddle.Tensor]) -> paddle.Tensor: + hidden_states = 
self.fc_in(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.fc_out(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GPTJBlock(Layer): + def __init__(self, config): + super().__init__() + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd + self.ln_1 = nn.LayerNorm(config.n_embd, epsilon=config.layer_norm_epsilon) + self.attn = GPTJAttention(config) + self.mlp = GPTJMLP(inner_dim, config) + + def forward( + self, + hidden_states: Optional[paddle.Tensor], + layer_past: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Union[Tuple[paddle.Tensor], Optional[Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]]]: + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + + feed_forward_hidden_states = self.mlp(hidden_states) + hidden_states = attn_output + feed_forward_hidden_states + residual + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions) + + +class GPTJPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GPTJConfig + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + _no_split_modules = ["GPTJBlock"] + + def _init_weights(self, layer): + """Initialize the weights.""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor) and paddle.get_default_dtype() == "float32": + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.transformer.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.full_like(layer.weight, 1.0)) + layer._epsilon = getattr(self, "layer_norm_epsilon", 1e-05) + if isinstance(layer, nn.Linear) and layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + + +@register_base_model +class GPTJModel(GPTJPretrainedModel): + def __init__(self, config): + super(GPTJModel, self).__init__(config) + + self.embed_dim = config.n_embd + self.vocab_size = config.vocab_size + self.bos_token_id = config.bos_token_id + self.pad_token_id = config.pad_token_id + self.eos_token_id = config.eos_token_id + self.embed_dim = config.n_embd + self.initializer_range = config.initializer_range + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.LayerList([GPTJBlock(config) for _ in range(config.n_layer)]) + self.ln_f = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + 
attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape(shape=(-1, input_shape[-1])) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape((-1, input_shape[-1])) + + if position_ids is not None: + position_ids = position_ids.reshape((-1, input_shape[-1])) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].shape[-2] + + if position_ids is None: + position_ids = paddle.arange(past_length, input_shape[-1] + past_length, dtype="int64") + position_ids = position_ids.unsqueeze(0).reshape((-1, input_shape[-1])) + + # Attention mask. 
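[Editor's note: illustrative aside, not part of the patch.] The branch below turns the mask into an additive bias: positions to keep contribute 0 and padded positions contribute -1e4, which drives their softmax weights toward zero. For a 2D mask coming from a tokenizer:

    import paddle

    mask_2d = paddle.to_tensor([[1, 1, 1, 0]])                          # 1 = real token, 0 = padding
    mask_4d = paddle.unsqueeze(mask_2d, axis=[1, 2]).astype("float32")  # shape [batch, 1, 1, seq_len]
    mask_4d = (1.0 - mask_4d) * -1e4                                    # [[[[0., 0., 0., -10000.]]]]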
+ if attention_mask is None: + assert input_ids is not None, "input_ids should be " "specified when generating attention_mask" + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + # TODO(zhangxu): Add head_mask if PretrainedModel supports get_head_mask method + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + hidden_states = inputs_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = input_shape[:] + [hidden_states.shape[-1]] + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + if use_cache: + presents = presents + (outputs[1],) + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.reshape(shape=output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class GPTJForCausalLM(GPTJPretrainedModel): + r""" + GPTJ Model with a `language modeling` head on top. + Args: + GPTJ (:class:`GPTJModel`): + An instance of GPTJModel. + """ + + def __init__(self, config): + super(GPTJForCausalLM, self).__init__(config) + self.transformer = GPTJModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterGPTJ + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decoding_lib = kwargs.get("decoding_lib", None) + decode_strategy = kwargs.get("decode_strategy") + if decode_strategy == "beam_search": + raise AttributeError("'beam_search' is not supported yet in the fast version of GPTJ") + # Currently, FasterTransformer only support restricted size_per_head. 
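[Editor's note: worked example, not part of the patch.] For the EleutherAI/gpt-j-6B configuration registered at the top of this file (n_embd=4096, n_head=16), the check below passes:

    size_per_head = 4096 // 16   # = 256, which is in the supported list,
                                 # so the fast decoding path is available for that checkpoint.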
+ size_per_head = self.transformer.config["n_embd"] // self.transformer.config["n_head"] + if size_per_head not in [32, 64, 80, 96, 128, 160, 192, 224, 256]: + raise AttributeError( + "'size_per_head = %d' is not supported yet in the fast version of GPTJ" % size_per_head + ) + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + self._fast_entry = FasterGPTJ(self, decoding_lib=decoding_lib, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None: + if len(attention_mask.shape) == 4: + attention_mask = attention_mask[:, :, -1:, :] + + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + The GPTJForCausalLM forward method, overrides the __call__() special method. + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import GPTJForCausalLM, GPTJTokenizer + tokenizer = GPTJTokenizer.from_pretrained('EleutherAI/gpt-j-6B') + model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # make sure sampling in fp16 works correctly and + # compute loss in fp32 to match with mesh-tf version + lm_logits = self.lm_head(hidden_states).astype("float32") + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :] + shift_labels = labels[..., 1:] + # Flatten the tokens + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(shift_logits.reshape([-1, shift_logits.shape[-1]]), shift_labels.reshape([-1])) + + loss = loss.astype(hidden_states.dtype) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple[paddle.Tensor]], beam_idx: paddle.Tensor) -> Tuple[Tuple[paddle.Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PretrainedModel.beam_search`] or + [`~PretrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.astype(past_state.dtype)) for past_state in layer_past) + for layer_past in past + ) + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +class GPTJForSequenceClassification(GPTJPretrainedModel): + r""" + GPTJ Model with a linear layer on top of the pooled output, + designed for sequence classification/regression tasks like GLUE tasks. + Since it does classification on the last token, it requires to know the + position of the last token. If a `pad_token_id` is defined in the configuration, + it finds the last token that is not a padding token in each row. If no `pad_token_id` + is defined, it simply takes the last value in each row of the batch. 
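[Editor's note: illustrative aside, not part of the patch.] The "last non-padding token" index described above is computed in `forward` below roughly as follows:

    import paddle

    pad_token_id = 50256
    input_ids = paddle.to_tensor([[11, 22, 33, pad_token_id, pad_token_id]])
    sequence_lengths = paddle.not_equal(
        input_ids, paddle.to_tensor(pad_token_id).astype(input_ids.dtype)
    ).sum(-1) - 1
    # sequence_lengths == [2], so the logits at position 2 (the last real token) are pooled.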
+ """ + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPTJModel(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias_attr=False) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = ( + paddle.not_equal( + input_ids, paddle.to_tensor(self.config.pad_token_id).astype(input_ids.dtype) + ).sum(-1) + - 1 + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[paddle.arange(batch_size), sequence_lengths] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels.astype("float32")) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class GPTJForQuestionAnswering(GPTJPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPTJModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[paddle.Tensor] = 
None, + attention_mask: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + start_positions: Optional[paddle.Tensor] = None, + end_positions: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = paddle.split(logits, logits.shape[-1], axis=-1) + start_logits = paddle.squeeze(start_logits, axis=-1) + end_logits = paddle.squeeze(end_logits, axis=-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.shape) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.shape) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/tokenizer.py new file mode 100644 index 000000000..6e67eccbb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/gptj/tokenizer.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import GPTTokenizer + +__all__ = ["GPTJTokenizer"] + + +class GPTJTokenizer(GPTTokenizer): + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_resource_files_map = {"vocab_file": {}, "merges_file": {}} + pretrained_init_configuration = {} + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + max_len=None, + pad_token="<|endoftext|>", + eos_token="<|endoftext|>", + unk_token="<|endoftext|>", + eol_token="\u010a", + **kwargs + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + errors=errors, + max_len=max_len, + pad_token=pad_token, + eos_token=eos_token, + unk_token=unk_token, + eol_token=eol_token, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_processing_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_processing_utils.py new file mode 100644 index 000000000..7d6eff51f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_processing_utils.py @@ -0,0 +1,547 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import os +import tempfile +from typing import Any, Dict, Iterable, Optional, Tuple, Union + +import aistudio_sdk +import numpy as np +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) +from huggingface_hub.utils import EntryNotFoundError + +from ..utils.download import resolve_file_path +from ..utils.log import logger +from .feature_extraction_utils import BatchFeature as BaseBatchFeature + +IMAGE_PROCESSOR_NAME = "preprocessor_config.json" + + +class BatchFeature(BaseBatchFeature): + r""" + Holds the output of the image processor specific `__call__` methods. + + This class is derived from a python dictionary and can be used as a dictionary. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the __call__ method ('pixel_values', etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in Paddle/Numpy Tensors at + initialization. + """ + + +class ImageProcessingMixin(object): + """ + This is an image processor mixin used to provide saving/loading functionality for sequential and image feature + extractors. 
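[Editor's note: illustrative sketch, not part of the patch; `ToyImageProcessor` is a hypothetical subclass, and the import path assumes this module is importable as part of the installed package.] The mixin gives any concrete image processor a JSON-backed save/load round trip:

    from paddlenlp.transformers.image_processing_utils import BaseImageProcessor

    class ToyImageProcessor(BaseImageProcessor):       # hypothetical, for illustration only
        def __init__(self, do_normalize=True, **kwargs):
            super().__init__(**kwargs)
            self.do_normalize = do_normalize

        def preprocess(self, images, **kwargs):
            return images                               # no-op, just satisfies the interface

    p = ToyImageProcessor(do_normalize=False)
    p.save_pretrained("./toy_processor")                # writes ./toy_processor/preprocessor_config.json
    reloaded = ToyImageProcessor.from_pretrained("./toy_processor")
    assert reloaded.do_normalize is False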
+ """ + + pretrained_init_configuration = {} + _auto_class = None + + def __init__(self, **kwargs): + """Set elements of `kwargs` as attributes.""" + # Pop "processor_class" as it should be saved as private attribute + self._processor_class = kwargs.pop("processor_class", None) + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs): + r""" + Instantiate a type of [`~image_processing_utils.ImageProcessingMixin`] from an image processor. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained image_processor hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or + namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. + - a path to a *directory* containing a image processor file saved using the + [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved image processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model image processor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. + resume_download (`bool`, *optional*, defaults to `False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + use_auth_token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/". + + + + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final image processor object. If `True`, then this + functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of + `kwargs` which has not been used to update `image_processor` and is otherwise ignored. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. 
Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + Returns: + A image processor of type [`~image_processing_utils.ImageProcessingMixin`]. + + Examples: + + ```python + # We can't instantiate directly the base class *ImageProcessingMixin* so let's show the examples on a + # derived class: *CLIPImageProcessor* + image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32" + ) # Download image_processing_config from huggingface.co and cache. + image_processor = CLIPImageProcessor.from_pretrained( + "./test/saved_model/" + ) # E.g. image processor (or model) was saved using *save_pretrained('./test/saved_model/')* + image_processor = CLIPImageProcessor.from_pretrained("./test/saved_model/preprocessor_config.json") + image_processor = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32", do_normalize=False, foo=False + ) + assert image_processor.do_normalize is False + image_processor, unused_kwargs = CLIPImageProcessor.from_pretrained( + "openai/clip-vit-base-patch32", do_normalize=False, foo=False, return_unused_kwargs=True + ) + assert image_processor.do_normalize is False + assert unused_kwargs == {"foo": False} + ```""" + image_processor_dict, kwargs = cls.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + + return cls.from_dict(image_processor_dict, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save an image processor object to the directory `save_directory`, so that it can be re-loaded using the + [`~image_processing_utils.ImageProcessingMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the image processor JSON file will be saved (will be created if it does not exist). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + + # If we save using the predefined names, we can load using `from_pretrained` + output_image_processor_file = os.path.join(save_directory, IMAGE_PROCESSOR_NAME) + + self.to_json_file(output_image_processor_file) + logger.info(f"Image processor saved in {output_image_processor_file}") + + return [output_image_processor_file] + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this processor to a new HuggingFace Hub repository. + Args: + repo_id (str): Repository name for your processor in the Hub. + private (bool, optional): Whether theprocessor is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. 
+ If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. + """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + def save_to_aistudio( + self, repo_id, private=True, license="Apache License 2.0", exist_ok=True, subfolder=None, **kwargs + ): + """ + Uploads all elements of this model to a new AiStudio Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + token (str): Your token for the Hub. + private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True. + license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0". + exist_ok (bool, optional): Whether to override existing repository. Defaults to: True. + subfolder (str, optional): Push to a subfolder of the repo instead of the root + """ + + res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs) + if "error_code" in res: + if res["error_code"] == 10003 and exist_ok: + logger.info( + f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False" + ) + else: + logger.error( + f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"Successfully created repo {repo_id}") + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. 
This might take a while") + for filename in os.listdir(save_dir): + res = aistudio_sdk.hub.upload( + repo_id=repo_id, path_or_fileobj=os.path.join(save_dir, filename), path_in_repo=filename, **kwargs + ) + if "error_code" in res: + logger.error( + f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"{filename}: {res['message']}") + + @classmethod + def get_image_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + image processor of type [`~image_processor_utils.ImageProcessingMixin`] using `from_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + from_hf_hub (bool, optional): whether to load from Huggingface Hub + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the image processor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + resolved_image_processor_file = resolve_file_path( + pretrained_model_name_or_path, + [IMAGE_PROCESSOR_NAME], + subfolder, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + ) + assert ( + resolved_image_processor_file is not None + ), f"please make sure {IMAGE_PROCESSOR_NAME} under {pretrained_model_name_or_path}" + try: + # Load image_processor dict + with open(resolved_image_processor_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise EnvironmentError( + f"It looks like the config file at '{resolved_image_processor_file}' is not a valid JSON file." + ) + + if is_local: + logger.info(f"loading configuration file {resolved_image_processor_file}") + else: + logger.info(f"loading configuration file from cache at {resolved_image_processor_file}") + + return image_processor_dict, kwargs + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Instantiates a type of [`~image_processing_utils.ImageProcessingMixin`] from a Python dictionary of parameters. + + Args: + image_processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the image processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~image_processing_utils.ImageProcessingMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the image processor object. + + Returns: + [`~image_processing_utils.ImageProcessingMixin`]: The image processor object instantiated from those + parameters. 
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + image_processor = cls(**image_processor_dict) + + # Update image_processor with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(image_processor, key): + setattr(image_processor, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + if return_unused_kwargs: + return image_processor, kwargs + else: + return image_processor + + def to_dict(self, *args, **kwargs) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this image processor instance. + """ + output = copy.deepcopy(self.__dict__) + output["image_processor_type"] = self.__class__.__name__ + + return output + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]): + """ + Instantiates a image processor of type [`~image_processing_utils.ImageProcessingMixin`] from the path to a JSON + file of parameters. + + Args: + json_file (`str` or `os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + A image processor of type [`~image_processing_utils.ImageProcessingMixin`]: The image_processor object + instantiated from that JSON file. + """ + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + image_processor_dict = json.loads(text) + return cls(**image_processor_dict) + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. + """ + dictionary = self.to_dict() + + for key, value in dictionary.items(): + if isinstance(value, np.ndarray): + dictionary[key] = value.tolist() + + # make sure private name "_processor_class" is correctly + # saved as "processor_class" + _processor_class = dictionary.pop("_processor_class", None) + if _processor_class is not None: + dictionary["processor_class"] = _processor_class + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this image_processor instance's parameters will be saved. 
+ """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + +class BaseImageProcessor(ImageProcessingMixin): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def __call__(self, images, **kwargs) -> BatchFeature: + """Preprocess an image or a batch of images.""" + return self.preprocess(images, **kwargs) + + def preprocess(self, images, **kwargs) -> BatchFeature: + raise NotImplementedError("Each image processor must implement its own preprocess method") + + +VALID_SIZE_DICT_KEYS = ({"height", "width"}, {"shortest_edge"}, {"shortest_edge", "longest_edge"}) + + +def is_valid_size_dict(size_dict): + if not isinstance(size_dict, dict): + return False + + size_dict_keys = set(size_dict.keys()) + for allowed_keys in VALID_SIZE_DICT_KEYS: + if size_dict_keys == allowed_keys: + return True + return False + + +def convert_to_size_dict( + size, max_size: Optional[int] = None, default_to_square: bool = True, height_width_order: bool = True +): + # By default, if size is an int we assume it represents a tuple of (size, size). + if isinstance(size, int) and default_to_square: + if max_size is not None: + raise ValueError("Cannot specify both size as an int, with default_to_square=True and max_size") + return {"height": size, "width": size} + # In other configs, if size is an int and default_to_square is False, size represents the length of + # the shortest edge after resizing. + elif isinstance(size, int) and not default_to_square: + size_dict = {"shortest_edge": size} + if max_size is not None: + size_dict["longest_edge"] = max_size + return size_dict + # Otherwise, if size is a tuple it's either (height, width) or (width, height) + elif isinstance(size, (tuple, list)) and height_width_order: + return {"height": size[0], "width": size[1]} + elif isinstance(size, (tuple, list)) and not height_width_order: + return {"height": size[1], "width": size[0]} + + raise ValueError(f"Could not convert size input to size dict: {size}") + + +def get_size_dict( + size: Union[int, Iterable[int], Dict[str, int]] = None, + max_size: Optional[int] = None, + height_width_order: bool = True, + default_to_square: bool = True, + param_name="size", +) -> dict: + """ + Converts the old size parameter in the config into the new dict expected in the config. This is to ensure backwards + compatibility with the old image processor configs and removes ambiguity over whether the tuple is in (height, + width) or (width, height) format. + + - If `size` is tuple, it is converted to `{"height": size[0], "width": size[1]}` or `{"height": size[1], "width": + size[0]}` if `height_width_order` is `False`. + - If `size` is an int, and `default_to_square` is `True`, it is converted to `{"height": size, "width": size}`. + - If `size` is an int and `default_to_square` is False, it is converted to `{"shortest_edge": size}`. If `max_size` + is set, it is added to the dict as `{"longest_edge": max_size}`. + + Args: + size (`Union[int, Iterable[int], Dict[str, int]]`, *optional*): + The `size` parameter to be cast into a size dictionary. + max_size (`Optional[int]`, *optional*): + The `max_size` parameter to be cast into a size dictionary. + height_width_order (`bool`, *optional*, defaults to `True`): + If `size` is a tuple, whether it's in (height, width) or (width, height) order. 
+ default_to_square (`bool`, *optional*, defaults to `True`): + If `size` is an int, whether to default to a square image or not. + """ + if not isinstance(size, dict): + size_dict = convert_to_size_dict(size, max_size, default_to_square, height_width_order) + logger.info( + f"{param_name} should be a dictionary on of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size}." + f" Converted to {size_dict}.", + ) + else: + size_dict = size + + if not is_valid_size_dict(size_dict): + raise ValueError( + f"{param_name} must have one of the following set of keys: {VALID_SIZE_DICT_KEYS}, got {size_dict.keys()}" + ) + return size_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_transforms.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_transforms.py new file mode 100644 index 000000000..fb07f14f4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_transforms.py @@ -0,0 +1,655 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np +import paddle +import PIL + +from .image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_channel_dimension_axis, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, +) +from .tokenizer_utils_base import ExplicitEnum, TensorType + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_channel_dimension_format( + image: np.ndarray, + channel_dim: Union[ChannelDimension, str], + input_channel_dim: Optional[Union[ChannelDimension, str]] = None, +) -> np.ndarray: + """ + Converts `image` to the channel dimension format specified by `channel_dim`. + + Args: + image (`numpy.ndarray`): + The image to have its channel dimension set. + channel_dim (`ChannelDimension`): + The channel dimension format to use. + + Returns: + `np.ndarray`: The image with the channel dimension set to `channel_dim`. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if input_channel_dim is None: + input_channel_dim = infer_channel_dimension_format(image) + + target_channel_dim = ChannelDimension(channel_dim) + if input_channel_dim == target_channel_dim: + return image + + if target_channel_dim == ChannelDimension.FIRST: + image = image.transpose((2, 0, 1)) + elif target_channel_dim == ChannelDimension.LAST: + image = image.transpose((1, 2, 0)) + else: + raise ValueError("Unsupported channel dimension format: {}".format(channel_dim)) + + return image + + +def rescale( + image: np.ndarray, scale: float, data_format: Optional[ChannelDimension] = None, dtype=np.float32 +) -> np.ndarray: + """ + Rescales `image` by `scale`. + + Args: + image (`np.ndarray`): + The image to rescale. 
+ scale (`float`): + The scale to use for rescaling the image. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + The dtype of the output image. Defaults to `np.float32`. Used for backwards compatibility with feature + extractors. + + Returns: + `np.ndarray`: The rescaled image. + """ + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + rescaled_image = image * scale + if data_format is not None: + rescaled_image = to_channel_dimension_format(rescaled_image, data_format) + rescaled_image = rescaled_image.astype(dtype) + return rescaled_image + + +def to_pil_image( + image: Union[np.ndarray, "PIL.Image.Image", "paddle.Tensor"], + do_rescale: Optional[bool] = None, +) -> "PIL.Image.Image": + """ + Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if + needed. + + Args: + image (`PIL.Image.Image` or `numpy.ndarray` or `paddle.Tensor`): + The image to convert to the `PIL.Image` format. + do_rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will default + to `True` if the image type is a floating type, `False` otherwise. + + Returns: + `PIL.Image.Image`: The converted image. + """ + if isinstance(image, PIL.Image.Image): + return image + + # Convert all tensors to numpy arrays before converting to PIL image + if is_paddle_tensor(image): + image = image.cpu().numpy() + elif not isinstance(image, np.ndarray): + raise ValueError("Input image type not supported: {}".format(type(image))) + + # If the channel as been moved to first dim, we put it back at the end. + image = to_channel_dimension_format(image, ChannelDimension.LAST) + + # If there is a single channel, we squeeze it, as otherwise PIL can't handle it. + image = np.squeeze(image, axis=-1) if image.shape[-1] == 1 else image + + # PIL.Image can only store uint8 values, so we rescale the image to be between 0 and 255 if needed. + do_rescale = isinstance(image.flat[0], (float, np.float32, np.float64)) if do_rescale is None else do_rescale + if do_rescale: + image = rescale(image, 255) + image = image.astype(np.uint8) + return PIL.Image.fromarray(image) + + +# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366 +def get_resize_output_image_size( + input_image: np.ndarray, + size: Union[int, Tuple[int, int], List[int], Tuple[int]], + default_to_square: bool = True, + max_size: Optional[int] = None, +) -> tuple: + """ + Find the target (height, width) dimension of the output image after resizing given the input image and the desired + size. + + Args: + input_image (`np.ndarray`): + The image to resize. + size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to + this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to this + number. i.e, if height > width, then image will be rescaled to (size * height / width, size). 
+ default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a square + (`size`,`size`). If set to `False`, will replicate + [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is greater + than `max_size` after being resized according to `size`, then the image is resized again so that the longer + edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller edge may be shorter + than `size`. Only used if `default_to_square` is `False`. + + Returns: + `tuple`: The target (height, width) dimension of the output image after resizing. + """ + if isinstance(size, (tuple, list)): + if len(size) == 2: + return tuple(size) + elif len(size) == 1: + # Perform same logic as if size was an int + size = size[0] + else: + raise ValueError("size must have 1 or 2 elements if it is a list or tuple") + + if default_to_square: + return (size, size) + + height, width = get_image_size(input_image) + short, long = (width, height) if width <= height else (height, width) + requested_new_short = size + + new_short, new_long = requested_new_short, int(requested_new_short * long / short) + + if max_size is not None: + if max_size <= requested_new_short: + raise ValueError( + f"max_size = {max_size} must be strictly greater than the requested " + f"size for the smaller edge size = {size}" + ) + if new_long > max_size: + new_short, new_long = int(max_size * new_short / new_long), max_size + + return (new_long, new_short) if width <= height else (new_short, new_long) + + +def resize( + image, + size: Tuple[int, int], + resample: "PILImageResampling" = None, + reducing_gap: Optional[int] = None, + data_format: Optional[ChannelDimension] = None, + return_numpy: bool = True, +) -> np.ndarray: + """ + Resizes `image` to `(height, width)` specified by `size` using the PIL library. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to resize. + size (`Tuple[int, int]`): + The size to use for resizing the image. + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + reducing_gap (`int`, *optional*): + Apply optimization by resizing the image in two steps. The bigger `reducing_gap`, the closer the result to + the fair resampling. See corresponding Pillow documentation for more details. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + return_numpy (`bool`, *optional*, defaults to `True`): + Whether or not to return the resized image as a numpy array. If False a `PIL.Image.Image` object is + returned. + + Returns: + `np.ndarray`: The resized image. + """ + resample = resample if resample is not None else PILImageResampling.BILINEAR + + if not len(size) == 2: + raise ValueError("size must have 2 elements") + + # For all transformations, we want to keep the same data format as the input image unless otherwise specified. + # The resized image from PIL will always have channels last, so find the input format first. 
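+    # Illustrative note (not in the upstream source): a (3, 32, 48) channels-first array resized with
+    # size=(64, 96) is returned as (3, 64, 96) -- the PIL round trip yields channels-last, and the
+    # conversion below restores the caller's original layout.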
+ data_format = infer_channel_dimension_format(image) if data_format is None else data_format + + # To maintain backwards compatibility with the resizing done in previous image feature extractors, we use + # the pillow library to resize the image and then convert back to numpy + if not isinstance(image, PIL.Image.Image): + image = to_pil_image(image) + height, width = size + # PIL images are in the format (width, height) + resized_image = image.resize((width, height), resample=resample, reducing_gap=reducing_gap) + + if return_numpy: + resized_image = np.array(resized_image) + # If the input image channel dimension was of size 1, then it is dropped when converting to a PIL image + # so we need to add it back if necessary. + resized_image = np.expand_dims(resized_image, axis=-1) if resized_image.ndim == 2 else resized_image + # The image is always in channels last format after converting from a PIL image + resized_image = to_channel_dimension_format( + resized_image, data_format, input_channel_dim=ChannelDimension.LAST + ) + return resized_image + + +def normalize( + image: np.ndarray, + mean: Union[float, Iterable[float]], + std: Union[float, Iterable[float]], + data_format: Optional[ChannelDimension] = None, +) -> np.ndarray: + """ + Normalizes `image` using the mean and standard deviation specified by `mean` and `std`. + + image = (image - mean) / std + + Args: + image (`np.ndarray`): + The image to normalize. + mean (`float` or `Iterable[float]`): + The mean to use for normalization. + std (`float` or `Iterable[float]`): + The standard deviation to use for normalization. + data_format (`ChannelDimension`, *optional*): + The channel dimension format of the output image. If unset, will use the inferred format from the input. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + # Convert PIL image to numpy array with the same logic as in the previous feature extractor normalize - + # casting to numpy array and dividing by 255. + image = to_numpy_array(image) + image = rescale(image, scale=1 / 255) + + if not isinstance(image, np.ndarray): + raise ValueError("image must be a numpy array") + + input_data_format = infer_channel_dimension_format(image) + channel_axis = get_channel_dimension_axis(image) + num_channels = image.shape[channel_axis] + + if isinstance(mean, Iterable): + if len(mean) != num_channels: + raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}") + else: + mean = [mean] * num_channels + mean = np.array(mean, dtype=image.dtype) + + if isinstance(std, Iterable): + if len(std) != num_channels: + raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}") + else: + std = [std] * num_channels + std = np.array(std, dtype=image.dtype) + + if input_data_format == ChannelDimension.LAST: + image = (image - mean) / std + else: + image = ((image.T - mean) / std).T + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image + + +def center_crop( + image: np.ndarray, + size: Tuple[int, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + return_numpy: Optional[bool] = None, +) -> np.ndarray: + """ + Crops the `image` to the specified `size` using a center crop. 
Note that if the image is too small to be cropped to + the size given, it will be padded (so the returned result will always be of size `size`). + + Args: + image (`np.ndarray`): + The image to crop. + size (`Tuple[int, int]`): + The target size for the cropped image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + return_numpy (`bool`, *optional*): + Whether or not to return the cropped image as a numpy array. Used for backwards compatibility with the + previous ImageFeatureExtractionMixin method. + - Unset: will return the same type as the input image. + - `True`: will return a numpy array. + - `False`: will return a `PIL.Image.Image` object. + Returns: + `np.ndarray`: The cropped image. + """ + if isinstance(image, PIL.Image.Image): + warnings.warn( + "PIL.Image.Image inputs are deprecated and will be removed in v4.26.0. Please use numpy arrays instead.", + FutureWarning, + ) + image = to_numpy_array(image) + return_numpy = False if return_numpy is None else return_numpy + else: + return_numpy = True if return_numpy is None else return_numpy + + if not isinstance(image, np.ndarray): + raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + + if not isinstance(size, Iterable) or len(size) != 2: + raise ValueError("size must have 2 elements representing the height and width of the output image") + + input_data_format = infer_channel_dimension_format(image) + output_data_format = data_format if data_format is not None else input_data_format + + # We perform the crop in (C, H, W) format and then convert to the output format + image = to_channel_dimension_format(image, ChannelDimension.FIRST) + + orig_height, orig_width = get_image_size(image) + crop_height, crop_width = size + crop_height, crop_width = int(crop_height), int(crop_width) + + # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result. + top = (orig_height - crop_height) // 2 + bottom = top + crop_height + # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result. + left = (orig_width - crop_width) // 2 + right = left + crop_width + + # Check if cropped area is within image boundaries + if top >= 0 and bottom <= orig_height and left >= 0 and right <= orig_width: + image = image[..., top:bottom, left:right] + image = to_channel_dimension_format(image, output_data_format) + return image + + # Otherwise, we may need to pad if the image is too small. Oh joy... 
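+    # Descriptive note (added comment): build a zero-filled canvas at least as large as both the
+    # original image and the requested crop, paste the original into its centre, then shift the crop
+    # coordinates by the padding offsets and slice again, so the result always has
+    # (crop_height, crop_width).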
+ new_height = max(crop_height, orig_height) + new_width = max(crop_width, orig_width) + new_shape = image.shape[:-2] + (new_height, new_width) + new_image = np.zeros_like(image, shape=new_shape) + + # If the image is too small, pad it with zeros + top_pad = (new_height - orig_height) // 2 + bottom_pad = top_pad + orig_height + left_pad = (new_width - orig_width) // 2 + right_pad = left_pad + orig_width + new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image + + top += top_pad + bottom += top_pad + left += left_pad + right += left_pad + + new_image = new_image[..., max(0, top) : min(new_height, bottom), max(0, left) : min(new_width, right)] + new_image = to_channel_dimension_format(new_image, output_data_format) + + if not return_numpy: + new_image = to_pil_image(new_image) + + return new_image + + +def _center_to_corners_format_paddle(bboxes_center: "paddle.Tensor") -> "paddle.Tensor": + center_x, center_y, width, height = bboxes_center.unbind(-1) + bbox_corners = paddle.stack( + # top left x, top left y, bottom right x, bottom right y + [(center_x - 0.5 * width), (center_y - 0.5 * height), (center_x + 0.5 * width), (center_y + 0.5 * height)], + axis=-1, + ) + return bbox_corners + + +def _center_to_corners_format_numpy(bboxes_center: np.ndarray) -> np.ndarray: + center_x, center_y, width, height = bboxes_center.T + bboxes_corners = np.stack( + # top left x, top left y, bottom right x, bottom right y + [center_x - 0.5 * width, center_y - 0.5 * height, center_x + 0.5 * width, center_y + 0.5 * height], + axis=-1, + ) + return bboxes_corners + + +# 2 functions below inspired by https://github.com/facebookresearch/detr/blob/master/util/box_ops.py +def center_to_corners_format(bboxes_center: TensorType) -> TensorType: + """ + Converts bounding boxes from center format to corners format. + + center format: contains the coordinate for the center of the box and its width, height dimensions + (center_x, center_y, width, height) + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + """ + # Function is used during model forward pass, so we use the input framework if possible, without + # converting to numpy + if is_paddle_tensor(bboxes_center): + return _center_to_corners_format_paddle(bboxes_center) + elif isinstance(bboxes_center, np.ndarray): + return _center_to_corners_format_numpy(bboxes_center) + + raise ValueError(f"Unsupported input type {type(bboxes_center)}") + + +def _corners_to_center_format_paddle(bboxes_corners: "paddle.Tensor") -> "paddle.Tensor": + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.unbind(-1) + b = [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ] + return paddle.stack(b, axis=-1) + + +def _corners_to_center_format_numpy(bboxes_corners: np.ndarray) -> np.ndarray: + top_left_x, top_left_y, bottom_right_x, bottom_right_y = bboxes_corners.T + bboxes_center = np.stack( + [ + (top_left_x + bottom_right_x) / 2, # center x + (top_left_y + bottom_right_y) / 2, # center y + (bottom_right_x - top_left_x), # width + (bottom_right_y - top_left_y), # height + ], + axis=-1, + ) + return bboxes_center + + +def corners_to_center_format(bboxes_corners: TensorType) -> TensorType: + """ + Converts bounding boxes from corners format to center format. 
+ + corners format: contains the coodinates for the top-left and bottom-right corners of the box + (top_left_x, top_left_y, bottom_right_x, bottom_right_y) + center format: contains the coordinate for the center of the box and its the width, height dimensions + (center_x, center_y, width, height) + """ + # Inverse function accepts different input types so implemented here too + if is_paddle_tensor(bboxes_corners): + return _corners_to_center_format_paddle(bboxes_corners) + elif isinstance(bboxes_corners, np.ndarray): + return _corners_to_center_format_numpy(bboxes_corners) + + raise ValueError(f"Unsupported input type {type(bboxes_corners)}") + + +# 2 functions below copied from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py +# Copyright (c) 2018, Alexander Kirillov +# All rights reserved. +def rgb_to_id(color): + """ + Converts RGB color to unique ID. + """ + if isinstance(color, np.ndarray) and len(color.shape) == 3: + if color.dtype == np.uint8: + color = color.astype(np.int32) + return color[:, :, 0] + 256 * color[:, :, 1] + 256 * 256 * color[:, :, 2] + return int(color[0] + 256 * color[1] + 256 * 256 * color[2]) + + +def id_to_rgb(id_map): + """ + Converts unique ID to RGB color. + """ + if isinstance(id_map, np.ndarray): + id_map_copy = id_map.copy() + rgb_shape = tuple(list(id_map.shape) + [3]) + rgb_map = np.zeros(rgb_shape, dtype=np.uint8) + for i in range(3): + rgb_map[..., i] = id_map_copy % 256 + id_map_copy //= 256 + return rgb_map + color = [] + for _ in range(3): + color.append(id_map % 256) + id_map //= 256 + return color + + +class PaddingMode(ExplicitEnum): + """ + Enum class for the different padding modes to use when padding images. + """ + + CONSTANT = "constant" + REFLECT = "reflect" + REPLICATE = "replicate" + SYMMETRIC = "symmetric" + + +def pad( + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> np.ndarray: + """ + Pads the `image` with the specified (height, width) `padding` and `mode`. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + def _expand_for_data_format(values): + """ + Convert values to be in the format expected by np.pad based on the data format. + """ + if isinstance(values, (int, float)): + values = ((values, values), (values, values)) + elif isinstance(values, tuple) and len(values) == 1: + values = ((values[0], values[0]), (values[0], values[0])) + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], int): + values = (values, values) + elif isinstance(values, tuple) and len(values) == 2 and isinstance(values[0], tuple): + values = values + else: + raise ValueError(f"Unsupported format: {values}") + + # add 0 for channel dimension + values = ((0, 0), *values) if input_data_format == ChannelDimension.FIRST else (*values, (0, 0)) + + # Add additional padding if there's a batch dimension + values = (0, *values) if image.ndim == 4 else values + return values + + padding = _expand_for_data_format(padding) + + if mode == PaddingMode.CONSTANT: + constant_values = _expand_for_data_format(constant_values) + image = np.pad(image, padding, mode="constant", constant_values=constant_values) + elif mode == PaddingMode.REFLECT: + image = np.pad(image, padding, mode="reflect") + elif mode == PaddingMode.REPLICATE: + image = np.pad(image, padding, mode="edge") + elif mode == PaddingMode.SYMMETRIC: + image = np.pad(image, padding, mode="symmetric") + else: + raise ValueError(f"Invalid padding mode: {mode}") + + image = to_channel_dimension_format(image, data_format) if data_format is not None else image + return image + + +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. + + Args: + image (Image): + The image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + + image = image.convert("RGB") + return image diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_utils.py new file mode 100644 index 000000000..2cb8eb63b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/image_utils.py @@ -0,0 +1,621 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from collections import UserDict +from typing import Dict, Iterable, List, Tuple, Union + +import numpy as np +import paddle +import PIL.Image +import PIL.ImageOps +import requests +from packaging import version + +from .tokenizer_utils_base import ExplicitEnum + +IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] +IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] +IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] +IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] + + +def is_paddle_tensor(tensor): + return paddle.is_tensor(tensor) + + +def to_numpy(obj): + """ + Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a Numpy array. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_numpy(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return np.array(obj) + elif is_paddle_tensor(obj): + return obj.detach().cpu().numpy() + else: + return obj + + +if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): + PILImageResampling = PIL.Image.Resampling +else: + PILImageResampling = PIL.Image + + +ImageInput = Union[ + "PIL.Image.Image", np.ndarray, "paddle.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["paddle.Tensor"] +] # noqa + + +class ChannelDimension(ExplicitEnum): + FIRST = "channels_first" + LAST = "channels_last" + + +def is_valid_image(img): + return isinstance(img, PIL.Image.Image) or isinstance(img, np.ndarray) or is_paddle_tensor(img) + + +def valid_images(imgs): + # If we have an list of images, make sure every image is valid + if isinstance(imgs, (list, tuple)): + for img in imgs: + if not valid_images(img): + return False + # If not a list of tuple, we have been given a single image or batched tensor of images + elif not is_valid_image(imgs): + return False + return True + + +def is_batched(img): + if isinstance(img, (list, tuple)): + return is_valid_image(img[0]) + return False + + +def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]: + """ + Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1. + If the input is a batch of images, it is converted to a list of images. + Args: + images (`ImageInput`): + Image of images to turn into a list of images. + expected_ndims (`int`, *optional*, defaults to 3): + Expected number of dimensions for a single input image. If the input image has a different number of + dimensions, an error is raised. + """ + if is_batched(images): + return images + + # Either the input is a single image, in which case we create a list of length 1 + if isinstance(images, PIL.Image.Image): + # PIL images are never batched + return [images] + + if is_valid_image(images): + if images.ndim == expected_ndims + 1: + # Batch of images + images = list(images) + elif images.ndim == expected_ndims: + # Single image + images = [images] + else: + raise ValueError( + f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got" + f" {images.ndim} dimensions." + ) + return images + raise ValueError( + "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, paddle.Tensor " f"but got {type(images)}." 
+ ) + + +def to_numpy_array(img) -> np.ndarray: + if not is_valid_image(img): + raise ValueError(f"Invalid image type: {type(img)}") + + if isinstance(img, PIL.Image.Image): + return np.array(img) + return to_numpy(img) + + +def infer_channel_dimension_format(image: np.ndarray) -> ChannelDimension: + """ + Infers the channel dimension format of `image`. + + Args: + image (`np.ndarray`): + The image to infer the channel dimension of. + + Returns: + The channel dimension of the image. + """ + if image.ndim == 3: + first_dim, last_dim = 0, 2 + elif image.ndim == 4: + first_dim, last_dim = 1, 3 + else: + raise ValueError(f"Unsupported number of image dimensions: {image.ndim}") + + if image.shape[first_dim] in (1, 3): + return ChannelDimension.FIRST + elif image.shape[last_dim] in (1, 3): + return ChannelDimension.LAST + raise ValueError("Unable to infer channel dimension format") + + +def get_channel_dimension_axis(image: np.ndarray) -> int: + """ + Returns the channel dimension axis of the image. + + Args: + image (`np.ndarray`): + The image to get the channel dimension axis of. + + Returns: + The channel dimension axis of the image. + """ + channel_dim = infer_channel_dimension_format(image) + if channel_dim == ChannelDimension.FIRST: + return image.ndim - 3 + elif channel_dim == ChannelDimension.LAST: + return image.ndim - 1 + raise ValueError(f"Unsupported data format: {channel_dim}") + + +def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]: + """ + Returns the (height, width) dimensions of the image. + + Args: + image (`np.ndarray`): + The image to get the dimensions of. + channel_dim (`ChannelDimension`, *optional*): + Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image. + + Returns: + A tuple of the image's height and width. 
+ """ + if channel_dim is None: + channel_dim = infer_channel_dimension_format(image) + + if channel_dim == ChannelDimension.FIRST: + return image.shape[-2], image.shape[-1] + elif channel_dim == ChannelDimension.LAST: + return image.shape[-3], image.shape[-2] + else: + raise ValueError(f"Unsupported data format: {channel_dim}") + + +def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "annotations" in annotation + and isinstance(annotation["annotations"], (list, tuple)) + and ( + # an image can have no annotations + len(annotation["annotations"]) == 0 + or isinstance(annotation["annotations"][0], dict) + ) + ): + return True + return False + + +def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool: + if ( + isinstance(annotation, dict) + and "image_id" in annotation + and "segments_info" in annotation + and "file_name" in annotation + and isinstance(annotation["segments_info"], (list, tuple)) + and ( + # an image can have no segments + len(annotation["segments_info"]) == 0 + or isinstance(annotation["segments_info"][0], dict) + ) + ): + return True + return False + + +def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: + return all(is_valid_annotation_coco_detection(ann) for ann in annotations) + + +def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool: + return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations) + + +def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image": + """ + Loads `image` to a PIL Image. + + Args: + image (`str` or `PIL.Image.Image`): + The image to convert to the PIL Image format. + + Returns: + `PIL.Image.Image`: A PIL Image. + """ + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + image = PIL.Image.open(requests.get(image, stream=True).raw) + elif os.path.isfile(image): + image = PIL.Image.open(image) + else: + raise ValueError( + f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path" + ) + elif isinstance(image, PIL.Image.Image): + image = image + else: + raise ValueError( + "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image." + ) + image = PIL.ImageOps.exif_transpose(image) + image = image.convert("RGB") + return image + + +class ImageFeatureExtractionMixin: + """ + Mixin that contain utilities for preparing image features. + """ + + def _ensure_format_supported(self, image): + if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_paddle_tensor(image): + raise ValueError( + f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and " + "`paddle.Tensor` are." + ) + + def to_pil_image(self, image, rescale=None): + """ + Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if + needed. + + Args: + image (`PIL.Image.Image` or `numpy.ndarray` or `paddle.Tensor`): + The image to convert to the PIL Image format. + rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will + default to `True` if the image type is a floating type, `False` otherwise. 
+ """ + self._ensure_format_supported(image) + + if is_paddle_tensor(image): + image = image.cpu().numpy() + + if isinstance(image, np.ndarray): + if rescale is None: + # rescale default to the array being of floating type. + rescale = isinstance(image.flat[0], np.floating) + # If the channel as been moved to first dim, we put it back at the end. + if image.ndim == 3 and image.shape[0] in [1, 3]: + image = image.transpose(1, 2, 0) + if rescale: + image = image * 255 + image = image.astype(np.uint8) + return PIL.Image.fromarray(image) + return image + + def convert_rgb(self, image): + """ + Converts `PIL.Image.Image` to RGB format. + + Args: + image (`PIL.Image.Image`): + The image to convert. + """ + self._ensure_format_supported(image) + if not isinstance(image, PIL.Image.Image): + return image + + return image.convert("RGB") + + def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.ndarray: + """ + Rescale a numpy image by scale amount + """ + self._ensure_format_supported(image) + return image * scale + + def to_numpy_array(self, image, rescale=None, channel_first=True): + """ + Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first + dimension. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to convert to a NumPy array. + rescale (`bool`, *optional*): + Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will + default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise. + channel_first (`bool`, *optional*, defaults to `True`): + Whether or not to permute the dimensions of the image to put the channel dimension first. + """ + self._ensure_format_supported(image) + + if isinstance(image, PIL.Image.Image): + image = np.array(image) + + if is_paddle_tensor(image): + image = image.cpu().numpy() + + rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale + + if rescale: + image = self.rescale(image.astype(np.float32), 1 / 255.0) + + if channel_first and image.ndim == 3: + image = image.transpose(2, 0, 1) + + return image + + def expand_dims(self, image): + """ + Expands 2-dimensional `image` to 3 dimensions. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to expand. + """ + self._ensure_format_supported(image) + + # Do nothing if PIL image + if isinstance(image, PIL.Image.Image): + return image + + if is_paddle_tensor(image): + image = image.unsqueeze(0) + else: + image = np.expand_dims(image, axis=0) + return image + + def normalize(self, image, mean, std, rescale=False): + """ + Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array + if it's a PIL Image. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to normalize. + mean (`List[float]` or `np.ndarray` or `paddle.Tensor`): + The mean (per channel) to use for normalization. + std (`List[float]` or `np.ndarray` or `paddle.Tensor`): + The standard deviation (per channel) to use for normalization. + rescale (`bool`, *optional*, defaults to `False`): + Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will + happen automatically. + """ + self._ensure_format_supported(image) + + if isinstance(image, PIL.Image.Image): + image = self.to_numpy_array(image, rescale=True) + # If the input image is a PIL image, it automatically gets rescaled. 
If it's another + # type it may need rescaling. + elif rescale: + if isinstance(image, np.ndarray): + image = self.rescale(image.astype(np.float32), 1 / 255.0) + elif is_paddle_tensor(image): + image = self.rescale(image.astype("float32"), 1 / 255.0) + + if isinstance(image, np.ndarray): + if not isinstance(mean, np.ndarray): + mean = np.array(mean).astype(image.dtype) + if not isinstance(std, np.ndarray): + std = np.array(std).astype(image.dtype) + elif is_paddle_tensor(image): + + if not isinstance(mean, paddle.Tensor): + mean = paddle.to_tensor(mean).astype(image.dtype) + if not isinstance(std, paddle.Tensor): + std = paddle.to_tensor(std).astype(image.dtype) + + if image.ndim == 3 and image.shape[0] in [1, 3]: + return (image - mean[:, None, None]) / std[:, None, None] + else: + return (image - mean) / std + + def resize(self, image, size, resample=None, default_to_square=True, max_size=None): + """ + Resizes `image`. Enforces conversion of input to PIL.Image. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to resize. + size (`int` or `Tuple[int, int]`): + The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be + matched to this. + + If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If + `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to + this number. i.e, if height > width, then image will be rescaled to (size * height / width, size). + resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`): + The filter to user for resampling. + default_to_square (`bool`, *optional*, defaults to `True`): + How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a + square (`size`,`size`). If set to `False`, will replicate + [`paddle.vision.transforms.Resize`](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/vision/transforms/Resize_cn.html#resize) + with support for resizing only the smallest edge and providing an optional `max_size`. + max_size (`int`, *optional*, defaults to `None`): + The maximum allowed for the longer edge of the resized image: if the longer edge of the image is + greater than `max_size` after being resized according to `size`, then the image is resized again so + that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller + edge may be shorter than `size`. Only used if `default_to_square` is `False`. + + Returns: + image: A resized `PIL.Image.Image`. 
+ """ + resample = resample if resample is not None else PILImageResampling.BILINEAR + + self._ensure_format_supported(image) + + if not isinstance(image, PIL.Image.Image): + image = self.to_pil_image(image) + + if isinstance(size, list): + size = tuple(size) + + if isinstance(size, int) or len(size) == 1: + if default_to_square: + size = (size, size) if isinstance(size, int) else (size[0], size[0]) + else: + width, height = image.size + # specified size only for the smallest edge + short, long = (width, height) if width <= height else (height, width) + requested_new_short = size if isinstance(size, int) else size[0] + + if short == requested_new_short: + return image + + new_short, new_long = requested_new_short, int(requested_new_short * long / short) + + if max_size is not None: + if max_size <= requested_new_short: + raise ValueError( + f"max_size = {max_size} must be strictly greater than the requested " + f"size for the smaller edge size = {size}" + ) + if new_long > max_size: + new_short, new_long = int(max_size * new_short / new_long), max_size + + size = (new_short, new_long) if width <= height else (new_long, new_short) + + return image.resize(size, resample=resample) + + def center_crop(self, image, size): + """ + Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the + size given, it will be padded (so the returned result has the size asked). + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)): + The image to resize. + size (`int` or `Tuple[int, int]`): + The size to which crop the image. + + Returns: + new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `paddle.Tensor` of shape: (n_channels, + height, width). + """ + self._ensure_format_supported(image) + + if not isinstance(size, tuple): + size = (size, size) + + # PIL Image.size is (width, height) but NumPy array and paddle Tensors have (height, width) + if is_paddle_tensor(image) or isinstance(image, np.ndarray): + if image.ndim == 2: + image = self.expand_dims(image) + image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2] + else: + image_shape = (image.size[1], image.size[0]) + + top = (image_shape[0] - size[0]) // 2 + bottom = top + size[0] # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result. + left = (image_shape[1] - size[1]) // 2 + right = left + size[1] # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result. + + # For PIL Images we have a method to crop directly. + if isinstance(image, PIL.Image.Image): + return image.crop((left, top, right, bottom)) + + # Check if image is in (n_channels, height, width) or (height, width, n_channels) format + channel_first = True if image.shape[0] in [1, 3] else False + + # Transpose (height, width, n_channels) format images + if not channel_first: + if isinstance(image, np.ndarray): + image = image.transpose(2, 0, 1) + if is_paddle_tensor(image): + image = image.transpose([2, 0, 1]) + + # Check if cropped area is within image boundaries + if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]: + return image[..., top:bottom, left:right] + + # Otherwise, we may need to pad if the image is too small. Oh joy... 
+ new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1])) + if isinstance(image, np.ndarray): + new_image = np.zeros_like(image, shape=new_shape) + elif is_paddle_tensor(image): + new_image = paddle.zeros(new_shape, dtype=image.dtype) + + top_pad = (new_shape[-2] - image_shape[0]) // 2 + bottom_pad = top_pad + image_shape[0] + left_pad = (new_shape[-1] - image_shape[1]) // 2 + right_pad = left_pad + image_shape[1] + new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image + + top += top_pad + bottom += top_pad + left += left_pad + right += left_pad + + new_image = new_image[ + ..., max(0, top) : min(new_image.shape[-2], bottom), max(0, left) : min(new_image.shape[-1], right) + ] + + return new_image + + def flip_channel_order(self, image): + """ + Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of + `image` to a NumPy array if it's a PIL Image. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image whose color channels to flip. If `np.ndarray` or `paddle.Tensor`, the channel dimension should + be first. + """ + self._ensure_format_supported(image) + + if isinstance(image, PIL.Image.Image): + image = self.to_numpy_array(image) + + return image[::-1, :, :] + + def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None): + """ + Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees + counter clockwise around its centre. + + Args: + image (`PIL.Image.Image` or `np.ndarray` or `paddle.Tensor`): + The image to rotate. If `np.ndarray` or `paddle.Tensor`, will be converted to `PIL.Image.Image` before + rotating. + + Returns: + image: A rotated `PIL.Image.Image`. + """ + resample = resample if resample is not None else PIL.Image.NEAREST + + self._ensure_format_supported(image) + + if not isinstance(image, PIL.Image.Image): + image = self.to_pil_image(image) + + return image.rotate( + angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/configuration.py new file mode 100644 index 000000000..7f4de1786 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/configuration.py @@ -0,0 +1,223 @@ +# coding=utf-8 +# Copyright 2024 AI21 Labs Ltd. 
and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Jamba model configuration""" +import math + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "JambaConfig", +] + + +class JambaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a + Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the Jamba-v0.1 model. + + [ai21labs/Jamba-v0.1](https://huggingface.co/ai21labs/Jamba-v0.1) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 65536): + Vocabulary size of the Jamba model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`JambaModel`] + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has a output word embedding layer. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): + Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an + integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the + logits of the last prompt token are needed for generation. For long sequences, the logits for the entire + sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint + significantly. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss. See [here]() for more details + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + sliding_window (`int`, *optional*): + Sliding window attention window size. If not specified, will default to `None`. + max_position_embeddings (`int`, *optional*, defaults to 262144): + This value doesn't have any real effect. The maximum sequence length that this model is intended to be + used with. It can be used with longer sequences, but performance may degrade. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_experts_per_tok (`int`, *optional*, defaults to 2): + The number of experts to root per-token, can be also interpreted as the `top-p` routing + parameter + num_experts (`int`, *optional*, defaults to 16): + Number of experts per Sparse MLP layer. + expert_layer_period (`int`, *optional*, defaults to 2): + Once in this many layers, we will have an expert layer + expert_layer_offset (`int`, *optional*, defaults to 1): + The first layer index that contains an expert mlp layer + attn_layer_period (`int`, *optional*, defaults to 8): + Once in this many layers, we will have a vanilla attention layer + attn_layer_offset (`int`, *optional*, defaults to 4): + The first layer index that contains a vanilla attention mlp layer + use_mamba_kernels (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and + `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if + `True` and kernels are not available + mamba_d_state (`int`, *optional*, defaults to 16): + The dimension the mamba state space latents + mamba_d_conv (`int`, *optional*, defaults to 4): + The size of the mamba convolution kernel + mamba_expand (`int`, *optional*, defaults to 2): + Expanding factor (relative to hidden_size) used to determine the mamba intermediate size + mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): + Rank of the the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` + mamba_conv_bias (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. 
+ mamba_proj_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block + + """ + + model_type = "jamba" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65536, + tie_word_embeddings=False, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + num_logits_to_keep=1, + output_router_logits=False, + router_aux_loss_coef=0.001, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + sliding_window=None, + max_position_embeddings=262144, + attention_dropout=0.0, + num_experts_per_tok=2, + num_experts=16, + expert_layer_period=2, + expert_layer_offset=1, + attn_layer_period=8, + attn_layer_offset=4, + use_mamba_kernels=True, + mamba_d_state=16, + mamba_d_conv=4, + mamba_expand=2, + mamba_dt_rank="auto", + mamba_conv_bias=True, + mamba_proj_bias=False, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.tie_word_embeddings = tie_word_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + self.max_position_embeddings = max_position_embeddings + self.attention_dropout = attention_dropout + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.num_logits_to_keep = num_logits_to_keep + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.expert_layer_period = expert_layer_period + self.expert_layer_offset = expert_layer_offset + self.attn_layer_period = attn_layer_period + self.attn_layer_offset = attn_layer_offset + + self.use_mamba_kernels = use_mamba_kernels + self.mamba_d_state = mamba_d_state + self.mamba_d_conv = mamba_d_conv + self.mamba_expand = mamba_expand + self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank + self.mamba_conv_bias = mamba_conv_bias + self.mamba_proj_bias = mamba_proj_bias + + @property + def layers_block_type(self): + return [ + "attention" if i % self.attn_layer_period == self.attn_layer_offset else "mamba" + for i in range(self.num_hidden_layers) + ] + + @property + def layers_num_experts(self): + return [ + self.num_experts if i % self.expert_layer_period == self.expert_layer_offset else 1 + for i in range(self.num_hidden_layers) + ] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/modeling.py new file mode 100644 index 000000000..efeb4b6f2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/modeling.py @@ -0,0 +1,2010 @@ +# coding=utf-8 +# Copyright 2024 AI21 
Labs Ltd. and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Paddle Jamba model.""" + +import math +from dataclasses import dataclass +from functools import partial +from typing import Any, Dict, List, Optional, Tuple, Union + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) + +from ...utils.initializer import normal_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPast +from ..model_utils import PretrainedModel +from .configuration import JambaConfig + +try: + from mamba_ssm_paddle.ops.selective_scan_interface import ( + mamba_inner_fn, + selective_scan_fn, + ) + from mamba_ssm_paddle.ops.triton.selective_state_update import ( + selective_state_update, + ) +except ImportError: + selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None + +try: + from mamba_ssm_paddle.ops.causal_conv1d_interface import ( + causal_conv1d_fn, + causal_conv1d_update, + ) +except ImportError: + causal_conv1d_fn, causal_conv1d_update = None, None + +is_fast_path_available = all( + (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) +) +from paddle.amp.auto_cast import amp_global_state + +from paddlenlp.utils.log import logger + +from ..llama.modeling import parallel_matmul + +_flash_supports_window_size = False + +_CONFIG_FOR_DOC = "JambaConfig" + + +def is_autocast_enabled(): + tracer = paddle.framework._dygraph_tracer() + return False if tracer._amp_level == paddle.core.AmpLevel.O0 else True + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if 
past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +# Copied from transformers.models.mixtral.modeling_mixtral.load_balancing_loss_func with gate->router +def load_balancing_loss_func( + router_logits: paddle.Tensor, + num_experts: paddle.Tensor = None, + top_k=2, + attention_mask: Optional[paddle.Tensor] = None, +) -> float: + r""" + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Paddle. + + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + + Args: + router_logits (Union[`paddle.Tensor`, Tuple[paddle.Tensor]): + Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + attention_mask (`paddle.Tensor`, None): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + num_experts (`int`, *optional*): + Number of experts + + Returns: + The auxiliary loss. 
+ """ + if router_logits is None or not isinstance(router_logits, tuple): + return 0 + + if isinstance(router_logits, tuple): + concatenated_router_logits = paddle.concat([layer_router for layer_router in router_logits], axis=0) + + routing_weights = paddle.nn.functional.softmax(concatenated_router_logits, axis=-1) + + _, selected_experts = paddle.topk(routing_weights, top_k, axis=-1) + + expert_mask = paddle.nn.functional.one_hot(selected_experts, num_experts) + + if attention_mask is None or attention_mask.ndim == 4: + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.mean(expert_mask.cast("float32"), axis=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.mean(routing_weights, axis=0) + else: + if attention_mask.ndim == 2: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_router_logits.shape[0] // (batch_size * sequence_length) + if attention_mask.dtype == paddle.bool: + attention_mask = attention_mask.cast("float32") + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape([-1, top_k, num_experts]) + ) + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.sum(expert_mask.cast("float32") * expert_attention_mask, axis=0) / paddle.sum( + expert_attention_mask, axis=0 + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape([-1, num_experts]) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.sum( + routing_weights * router_per_expert_attention_mask, axis=0 + ) / paddle.sum(router_per_expert_attention_mask, axis=0) + + overall_loss = paddle.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Jamba +class JambaRMSNorm(nn.Layer): + def __init__(self, hidden_size, eps=1e-6): + """ + JambaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = self.create_parameter( + [ + hidden_size, + ], + default_initializer=paddle.nn.initializer.Constant(1.0), + ) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.cast(paddle.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.cast(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(x, axis=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand([batch, num_key_value_heads, n_rep, slen, head_dim]) + return hidden_states.reshape([batch, num_key_value_heads * n_rep, slen, head_dim]) + + +@dataclass +class HybridMambaAttentionDynamicCache: + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. + """ + + def __init__(self, config, batch_size, dtype=paddle.float16): + self.dtype = dtype + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + intermediate_size = config.mamba_expand * config.hidden_size + ssm_state_size = config.mamba_d_state + conv_kernel_size = config.mamba_d_conv + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + if self.layers_block_type[i] == "mamba": + self.conv_states += [paddle.zeros([batch_size, intermediate_size, conv_kernel_size], dtype=dtype)] + self.ssm_states += [paddle.zeros([batch_size, intermediate_size, ssm_state_size], dtype=dtype)] + else: + self.conv_states += [paddle.to_tensor([[]] * batch_size)] + self.ssm_states += [paddle.to_tensor([[]] * batch_size)] + self.transformer_layers.append(i) + + self.key_cache = [paddle.to_tensor([[]] * batch_size) for _ in range(config.num_hidden_layers)] + self.value_cache = [paddle.to_tensor([[]] * batch_size) for _ in range(config.num_hidden_layers)] + + def update( + self, + key_states: paddle.Tensor, + value_states: paddle.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + # Update the cache + if self.key_cache[layer_idx].shape[-1] == 0: + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + # bsz, num_key_value_heads, q_len, self.head_dim + self.key_cache[layer_idx] = paddle.concat([self.key_cache[layer_idx], key_states], axis=2) + self.value_cache[layer_idx] = paddle.concat([self.value_cache[layer_idx], value_states], axis=2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def reorder_cache(self, beam_idx: paddle.Tensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0) + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0) + self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0) + 
self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0) + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # take any layer that contains cache and not empty tensor + layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx + if len(self.key_cache) <= layer_idx: + return 0 + key_val = self.key_cache[layer_idx] + if key_val.ndim == 2 and key_val.shape[-1] == 0: + return 0 + return key_val.shape[-2] + + def get_max_length(self) -> Optional[int]: + """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length.""" + return None + + def __getitem__(self, layer_idx: int) -> List[Tuple[paddle.Tensor]]: + """ + Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return (self.key_cache[layer_idx], self.value_cache[layer_idx]) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + def __iter__(self): + """ + Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield (self.key_cache[layer_idx], self.value_cache[layer_idx]) + + def __len__(self): + """ + Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds + to the number of layers in the model. + """ + return len(self.key_cache) + + +# Adapted from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Jamba +class JambaAttention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: JambaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.is_causal = True + self.attention_dropout = config.attention_dropout + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + + ColumnParallelLinear = mpu.ColumnParallelLinear + RowParallelLinear = mpu.RowParallelLinear + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + gather_output=False, + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.o_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + else: + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias_attr=False) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=False) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias_attr=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias_attr=False) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + bsz, q_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = paddle.matmul(query_states, key_states, transpose_y=True) / math.sqrt(self.head_dim) + + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + + # [bs, num_heads, kv_seq_len, head_dim] + kv_seq_len = value_states.shape[2] + + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + attn_weights = attn_weights + attention_mask + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).cast(query_states.dtype) + 
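
# Illustrative sketch (toy shapes assumed, not taken from this patch) of the eager
# attention math used in this forward pass: scores = Q·K^T / sqrt(head_dim), add the
# additive causal mask, softmax in float32, then weight V.
import math

import paddle
import paddle.nn.functional as F

q = paddle.randn([1, 2, 5, 8])                                 # [batch, heads, q_len, head_dim]
k = paddle.randn([1, 2, 5, 8])
v = paddle.randn([1, 2, 5, 8])
scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(8)
causal = paddle.triu(paddle.full([1, 1, 5, 5], paddle.finfo(scores.dtype).min), diagonal=1)
probs = F.softmax(scores + causal, axis=-1)
out = paddle.matmul(probs, v)                                  # [1, 2, 5, 8]
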
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = paddle.matmul(attn_weights, value_states) + + if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]: + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.shape}" + ) + + attn_output = attn_output.transpose([0, 2, 1, 3]).contiguous() + attn_output = attn_output.reshape([bsz, q_len, -1]) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Adapted from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Jamba +class JambaFlashAttention2(JambaAttention): + """ + Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[paddle.Tensor] = None, + **kwargs, + ): + bsz, q_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention, for a more memory efficient implementation" + " make sure to upgrade flash-attn library." + ) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + dropout_rate = 0.0 if not self.training else self.attention_dropout + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in float16 just to be sure everything works as expected. + input_dtype = query_states.dtype + if input_dtype == paddle.float32: + if is_autocast_enabled(): + target_dtype = amp_global_state().amp_dtype + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. 
We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.cast(target_dtype) + key_states = key_states.cast(target_dtype) + value_states = value_states.cast(target_dtype) + + # Reashape to the expected shape for Flash Attention + key_states = key_states.transpose([0, 2, 1, 3]) + value_states = value_states.transpose([0, 2, 1, 3]) + + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=dropout_rate, + training=self.training, + ) + attn_output = attn_output.reshape([bsz, q_len, -1]).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +class JambaMambaMixer(nn.Layer): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + """ + + def __init__(self, config: JambaConfig, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = config.mamba_expand * config.hidden_size + self.time_step_rank = config.mamba_dt_rank + self.use_conv_bias = config.mamba_conv_bias + self.use_bias = config.mamba_proj_bias + self.conv1d = nn.Conv1D( + in_channels=self.intermediate_size, + out_channels=self.intermediate_size, + bias_attr=self.use_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.intermediate_size, + padding=self.conv_kernel_size - 1, + ) + + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + + self.use_fast_kernels = config.use_mamba_kernels and is_fast_path_available + + # projection of the input hidden states + self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=self.use_bias) + # selective projection used to make dt, B and C input dependant + self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias_attr=False) + # time step projection (discretization) + self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias_attr=True) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. 
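
# Illustrative sketch (toy sizes assumed, not taken from this patch) of what "compute
# the discrete states, then write the updated state" means for a single channel: the
# zero-order-hold recurrence h_t = exp(A*dt_t) * h_{t-1} + dt_t * B_t * x_t with
# readout y_t = C_t . h_t + D * x_t, so only the d_state-sized state is kept in memory.
import paddle
import paddle.nn.functional as F

seq_len, d_state = 4, 3
x = paddle.randn([seq_len])                      # one channel of the convolved input
dt = F.softplus(paddle.randn([seq_len]))         # positive step sizes
A = -paddle.exp(paddle.randn([d_state]))         # negative real A, as in the S4D init below
B = paddle.randn([seq_len, d_state])
C = paddle.randn([seq_len, d_state])
D = paddle.to_tensor(1.0)

h = paddle.zeros([d_state])
ys = []
for t in range(seq_len):
    h = paddle.exp(A * dt[t]) * h + dt[t] * B[t] * x[t]    # state update, memory stays O(d_state)
    ys.append((C[t] * h).sum() + D * x[t])                 # readout
y = paddle.stack(ys)
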
Keeps the memory bounded + A = paddle.arange(1, self.ssm_state_size + 1, dtype=paddle.float32)[None, :] + A = A.expand([self.intermediate_size, -1]) + + self.A_log = self.create_parameter( + shape=A.shape, + default_initializer=nn.initializer.Assign(paddle.log(A)), + ) + self.D = self.create_parameter( + shape=[ + self.intermediate_size, + ], + default_initializer=nn.initializer.Constant(1), + ) + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=self.use_bias) + + self.dt_layernorm = JambaRMSNorm(self.time_step_rank, eps=config.rms_norm_eps) + self.b_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps) + self.c_layernorm = JambaRMSNorm(self.ssm_state_size, eps=config.rms_norm_eps) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config" + ) + + def cuda_kernels_forward(self, hidden_states: paddle.Tensor, cache: HybridMambaAttentionDynamicCache = None): + batch_size, seq_len, _ = hidden_states.shape + use_precomputed_states = ( + cache is not None + and cache.has_previous_state + and seq_len == 1 + and cache.conv_states[self.layer_idx].shape[0] == cache.ssm_states[self.layer_idx].shape[0] == batch_size + ) + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states).transpose([0, 2, 1]) + + # We can't use `mamba_inner_fn` even if in training and without cache params because we have the + # inner layernorms which isn't supported by this fused kernel + hidden_states, gate = projected_states.chunk(2, axis=1) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.reshape([self.conv1d.weight.shape[0], self.conv1d.weight.shape[2]]) + if use_precomputed_states: + hidden_states = causal_conv1d_update( + hidden_states.squeeze(-1), + cache.conv_states[self.layer_idx], + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.unsqueeze(-1) + else: + if cache is not None: + conv_states = nn.functional.pad( + hidden_states, + (self.conv_kernel_size - hidden_states.shape[-1], 0), + data_format="NCL", + ) + cache.conv_states[self.layer_idx].copy_(conv_states.cast(cache.dtype), False) + hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation) + + # 3. State Space Model sequence transformation + # 3.a. input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose([0, 2, 1])) + time_step, B, C = paddle.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], axis=-1 + ) + + time_step = self.dt_layernorm(time_step) + B = self.b_layernorm(B) + C = self.c_layernorm(C) + + # Here we need to apply dt_proj without the bias, as the bias is added in the selective scan kernel. + # This is a hack to apply dt_proj while still using the forward pass of `paddle.nn.Linear`, which is needed + # in order to make quantization work. Quantization code replaces `paddle.nn.Linear` layers with quantized + # linear layers, and requires to call the forward pass directly. 
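
# Illustrative sketch (toy layer assumed, not taken from this patch) of the bias-detaching
# trick described in this comment: temporarily set the layer's bias to None so forward()
# applies only the weight, then restore it so the bias can be handed to the scan kernel.
import paddle
from paddle import nn

proj = paddle.nn.Linear(2, 4)          # stand-in for dt_proj
x = paddle.randn([1, 5, 2])

bias = proj.bias
proj.bias = None                       # forward() now computes x @ W only
y_without_bias = proj(x)
proj.bias = bias                       # restore the bias for later use
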
+ # The original code here was: ```discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)``` + time_proj_bias = self.dt_proj.bias + self.dt_proj.bias = None + discrete_time_step = self.dt_proj(time_step).transpose([0, 2, 1]) + self.dt_proj.bias = time_proj_bias + + A = -paddle.exp(self.A_log.cast("float32")) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = time_proj_bias.cast("float32") if time_proj_bias is not None else None + if use_precomputed_states: + scan_outputs = selective_state_update( + cache.ssm_states[self.layer_idx], + hidden_states[..., 0], + discrete_time_step[..., 0], + A, + B[:, 0], + C[:, 0], + self.D, + gate[..., 0], + time_proj_bias, + dt_softplus=True, + ).unsqueeze(-1) + else: + scan_outputs, ssm_state = selective_scan_fn( + hidden_states, + discrete_time_step, + A, + B.transpose([0, 2, 1]), + C.transpose([0, 2, 1]), + self.D.cast("float32"), + gate, + time_proj_bias, + delta_softplus=True, + return_last_state=True, + ) + if ssm_state is not None and cache is not None: + cache.ssm_states[self.layer_idx].copy_(ssm_state.cast(cache.dtype), False) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose([0, 2, 1])) + + return contextualized_states + + # fmt: off + def slow_forward(self, input_states, cache: HybridMambaAttentionDynamicCache = None): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + # 1. Gated MLP's linear projection + projected_states = self.in_proj(input_states).transpose([0, 2, 1]) # [batch, 2 * intermediate_size, seq_len] + hidden_states, gate = projected_states.chunk(2, axis=1) + + use_cache = isinstance(cache, HybridMambaAttentionDynamicCache) + # 2. Convolution sequence transformation + if use_cache and cache.ssm_states[self.layer_idx].shape[0] == batch_size: + if self.training: + # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass + ssm_state = cache.ssm_states[self.layer_idx].clone() + else: + ssm_state = cache.ssm_states[self.layer_idx] + + if cache.has_previous_state and seq_len == 1 and \ + cache.conv_states[self.layer_idx].shape[0] == batch_size: + conv_state = cache.conv_states[self.layer_idx] # [batch, intermediate_size, conv_kernel_size] + conv_state = paddle.roll(conv_state, shifts=-1, axis=-1) + conv_state[:, :, -1] = hidden_states[:, :, 0] + cache.conv_states[self.layer_idx] = conv_state + hidden_states = paddle.sum(conv_state * self.conv1d.weight[:, 0, :], axis=-1) + if self.use_conv_bias: + hidden_states += self.conv1d.bias + hidden_states = self.act(hidden_states).cast(dtype).unsqueeze(-1) # [batch, intermediate_size, 1] : decoding + else: + conv_state = nn.functional.pad( + hidden_states, + (self.conv_kernel_size - hidden_states.shape[-1], 0), + data_format="NCL", + ) + cache.conv_states[self.layer_idx] = conv_state + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] + else: + ssm_state = paddle.zeros( + (batch_size, self.intermediate_size, self.ssm_state_size), + dtype=dtype, + ) + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] + + # 3. State Space Model sequence transformation + # 3.a. 
Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] + ssm_parameters = self.x_proj(hidden_states.transpose([0, 2, 1])) + time_step, B, C = paddle.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], axis=-1 + ) + + time_step = self.dt_layernorm(time_step) + B = self.b_layernorm(B) + C = self.c_layernorm(C) + + discrete_time_step = self.dt_proj(time_step) # [batch, seq_len, intermediate_size] + discrete_time_step = nn.functional.softplus(discrete_time_step).transpose([0, 2, 1]) # [batch, intermediate_size, seq_len] + + # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM) + A = -paddle.exp(self.A_log.cast("float32")) # [intermediate_size, ssm_state_size] + discrete_A = paddle.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size] + discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].cast("float32") # [batch, intermediate_size, seq_len, ssm_state_size] + deltaB_u = discrete_B * hidden_states[:, :, :, None].cast("float32") + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + scan_outputs = [] + for i in range(seq_len): + ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediate_size, ssm_state] + scan_output = paddle.matmul(ssm_state.cast(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediate_size, 1] + scan_outputs.append(scan_output[:, :, 0]) + scan_output = paddle.stack(scan_outputs, axis=-1) # [batch, intermediate_size, seq_len] + scan_output = scan_output + (hidden_states * self.D[None, :, None]) + scan_output = (scan_output * self.act(gate)) + + if use_cache: + cache.ssm_states[self.layer_idx] = ssm_state + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.transpose([0, 2, 1])) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward(self, hidden_states, cache: HybridMambaAttentionDynamicCache = None): + if self.use_fast_kernels: + if not is_fast_path_available: + raise ValueError( + "Fast Mamba kernels are not available. 
Make sure to they are installed and that the mamba module is on a CUDA device" + ) + return self.cuda_kernels_forward(hidden_states, cache) + return self.slow_forward(hidden_states, cache) + + +# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Jamba +class JambaMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + + if config.tensor_parallel_degree > 1: + ColumnParallelLinear = mpu.ColumnParallelLinear + RowParallelLinear = mpu.RowParallelLinear + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class FakeMLPForwardBackward(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, x, gate_weight, up_weight, down_weight): + assert not x.stop_gradient, "x should not be stop_gradient" + ctx.shape_list = [x.shape, gate_weight.shape, up_weight.shape, down_weight.shape] + ctx.dtype_list = [x.dtype, gate_weight.dtype, up_weight.dtype, down_weight.dtype] + return paddle.zeros_like(x) + + @staticmethod + def backward(ctx, grad): + return tuple(paddle.zeros(shape, dtype=dtype) for shape, dtype in zip(ctx.shape_list, ctx.dtype_list)) + + +# Adapted from transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock with Mistral->Jamba +class JambaSparseMoeBlock(nn.Layer): + """ + This implementation is + strictly equivalent to standard MoE with full capacity (no + dropped tokens). It's faster since it formulates MoE operations + in terms of block-sparse operations to accomodate imbalanced + assignments of tokens to experts, whereas standard MoE either + (1) drop tokens at the cost of reduced performance or (2) set + capacity factor to number of experts and thus waste computation + and memory on padding. 
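
# Illustrative sketch (toy sizes assumed, not taken from this patch) of the routing this
# block performs: softmax the router logits, keep the top-2 experts per token, and build
# a per-expert mask that tells each expert which token slots it owns. Note the top-k
# weights are used as-is below; they are not re-normalized after the top-k selection.
import paddle
import paddle.nn.functional as F

num_experts, top_k = 4, 2
hidden = paddle.randn([6, 16])                                 # 6 tokens, hidden_dim 16
router = paddle.nn.Linear(16, num_experts, bias_attr=False)    # stand-in for self.router
logits = router(hidden)
weights = F.softmax(logits.astype("float32"), axis=1)
weights, selected = paddle.topk(weights, top_k, axis=-1)       # [6, 2] routing weights / expert ids
expert_mask = F.one_hot(selected, num_classes=num_experts).transpose([2, 1, 0])  # [experts, top_k, tokens]
slot_idx, token_idx = paddle.where(expert_mask[0])             # token slots owned by expert 0
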
+ """ + + def __init__(self, config: JambaConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.num_experts = config.num_experts + self.top_k = config.num_experts_per_tok + + self.router = nn.Linear(self.hidden_dim, self.num_experts, bias_attr=False) + self.experts = nn.LayerList([JambaMLP(config) for _ in range(self.num_experts)]) + + def forward(self, hidden_states): + batch_size, sequence_length, hidden_dim = hidden_states.shape + + hidden_states = hidden_states.reshape([-1, hidden_dim]) + # router_logits: [batch_size * seq_len, num_experts] + router_logits = self.router(hidden_states) + + with paddle.amp.auto_cast(False): + routing_weights = F.softmax(router_logits.astype("float32"), axis=1) + routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) + # we cast back to input dtype + routing_weights = routing_weights.cast(hidden_states.dtype) + + final_hidden_states = paddle.zeros( + [batch_size * sequence_length, hidden_dim], + dtype=hidden_states.dtype, + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated. + # shape: [num_experts, top_k, batch_size * seq_len] + expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0]) + + # NOTE: we need to do some fake gradient for sharding parallel training. + try: + hcg = fleet.get_hybrid_communicate_group() + sharding_parallel_world_size = hcg.get_sharding_parallel_world_size() + if sharding_parallel_world_size > 1 and self.training: + logger.warning_once( + f"Sharding parallel world size is {sharding_parallel_world_size}, we need to do some fake gradient." + ) + for expert_id in range(self.num_experts): + expert_layer = self.experts[expert_id] + final_hidden_states += ( + FakeMLPForwardBackward.apply( + hidden_states, + expert_layer.gate_proj.weight, + expert_layer.up_proj.weight, + expert_layer.down_proj.weight, + ) + * routing_weights[0, 0] + ) + except: + pass + + # Loop over all available experts in the model and perform the computation on each expert. + for expert_id in range(self.num_experts): + expert_layer = self.experts[expert_id] + idx, top_x = paddle.where(expert_mask[expert_id]) + + if top_x.shape[0] == 0: + continue + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. 
We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = paddle.gather(hidden_states, top_x.squeeze(-1)) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx] + + top_x = top_x.squeeze() + if top_x.shape == []: + top_x = paddle.to_tensor([top_x.item()]) + final_hidden_states.index_add_(top_x, 0, current_hidden_states.cast(hidden_states.dtype)) + + final_hidden_states = final_hidden_states.reshape([batch_size, sequence_length, hidden_dim]) + return final_hidden_states, router_logits + + +class JambaAttentionDecoderLayer(nn.Layer): + def __init__(self, config: JambaConfig, layer_idx: int): + super().__init__() + num_experts = config.layers_num_experts[layer_idx] + if config.use_flash_attention: + self.self_attn = JambaFlashAttention2(config, layer_idx) + else: + self.self_attn = JambaAttention(config, layer_idx) + + ffn_layer_class = JambaSparseMoeBlock if num_experts > 1 else JambaMLP + self.feed_forward = ffn_layer_class(config) + self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`paddle.Tensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
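
# Illustrative sketch (toy stand-in modules, not taken from this patch) of the pre-norm
# residual pattern this layer follows: normalize -> mix tokens (attention here, mamba in
# the sibling layer) -> add residual, then normalize -> feed-forward (MLP or MoE) -> add
# residual.
import paddle
from paddle import nn

norm1, norm2 = nn.LayerNorm(16), nn.LayerNorm(16)     # stand-ins for the two JambaRMSNorms
mixer = nn.MultiHeadAttention(embed_dim=16, num_heads=2)
ffn = nn.Sequential(nn.Linear(16, 32), nn.Silu(), nn.Linear(32, 16))

x = paddle.randn([1, 5, 16])
h = x + mixer(norm1(x))        # token mixing + residual
y = h + ffn(norm2(h))          # channel mixing + residual
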
+ """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + ) + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward (experts/MLP) + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + ff_outputs = self.feed_forward(hidden_states) + if isinstance(ff_outputs, tuple): + hidden_states, router_logits = ff_outputs + else: + hidden_states, router_logits = ff_outputs, None + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +class JambaMambaDecoderLayer(nn.Layer): + def __init__(self, config: JambaConfig, layer_idx: int): + super().__init__() + num_experts = config.layers_num_experts[layer_idx] + self.mamba = JambaMambaMixer(config=config, layer_idx=layer_idx) + + ffn_layer_class = JambaSparseMoeBlock if num_experts > 1 else JambaMLP + self.feed_forward = ffn_layer_class(config) + self.input_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`paddle.Tensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. 
+ """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.mamba( + hidden_states=hidden_states, + cache=past_key_value, + ) + self_attn_weights = None + + # residual connection after mamba + hidden_states = residual + hidden_states + + # feed-forward (experts/MLP) + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + ff_outputs = self.feed_forward(hidden_states) + if isinstance(ff_outputs, tuple): + hidden_states, router_logits = ff_outputs + else: + hidden_states, router_logits = ff_outputs, None + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + return outputs + + +class JambaPretrainedModel(PretrainedModel): + config_class = JambaConfig + base_model_prefix = "jamba" + supports_gradient_checkpointing = True + _no_split_modules = ["JambaAttentionDecoderLayer", "JambaMambaDecoderLayer"] + + @classmethod + def _get_name_mappings(cls, config: JambaConfig) -> List[StateDictNameMapping]: + mappings: List[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["final_layernorm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_type_name = config.layers_block_type[layer_index] + + if layer_type_name == "mamba": + layer_mappings = [ + [f"layers.{layer_index}.mamba.A_log"], + [f"layers.{layer_index}.mamba.D"], + [f"layers.{layer_index}.mamba.conv1d.weight"], + [f"layers.{layer_index}.mamba.conv1d.bias"], + # linear + [f"layers.{layer_index}.mamba.in_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mamba.x_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mamba.dt_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mamba.dt_proj.bias"], + [f"layers.{layer_index}.mamba.out_proj.weight", None, "transpose"], + # layernorm + [f"layers.{layer_index}.mamba.dt_layernorm.weight"], + [f"layers.{layer_index}.mamba.b_layernorm.weight"], + [f"layers.{layer_index}.mamba.c_layernorm.weight"], + ] + if config.mamba_proj_bias: + layer_mappings.extend( + [ + [f"layers.{layer_index}.mamba.in_proj.bias"], + [f"layers.{layer_index}.mamba.out_proj.bias"], + ] + ) + elif layer_type_name == "attention": + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + ] + else: + raise ValueError(f"{layer_type_name} is not a valid layer type.") + + num_experts = config.layers_num_experts[layer_index] + if num_experts > 1: + layer_mappings.append([f"layers.{layer_index}.feed_forward.router.weight", None, "transpose"]) + for expert_idx in range(num_experts): + expert_tag = f"experts.{expert_idx}." 
if num_experts > 1 else "" + layer_mappings.extend( + [ + [f"layers.{layer_index}.feed_forward.{expert_tag}gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.feed_forward.{expert_tag}up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.feed_forward.{expert_tag}down_proj.weight", None, "transpose"], + ] + ) + layer_mappings.extend( + [ + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.pre_ff_layernorm.weight"], + ] + ) + + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "JambaModel" + if "JambaModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "jamba." + mapping[1] + if not config.tie_word_embeddings: + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: JambaConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(config: JambaConfig): + final_actions = { + # Column Linear + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + final_actions.pop("lm_head.weight") + final_actions.pop("embed_tokens.weight") + + for layer_index in range(config.num_hidden_layers): + layer_type_name = config.layers_block_type[layer_index] + if layer_type_name == "mamba": + # NO TP + pass + elif layer_type_name == "attention": + # Column Linear + final_actions[f"layers.{layer_index}.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + final_actions[f"layers.{layer_index}.self_attn.k_proj.weight"] = partial(fn, is_column=True) + final_actions[f"layers.{layer_index}.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + # Row Linear + final_actions[f"layers.{layer_index}.self_attn.o_proj.weight"] = partial(fn, is_column=False) + else: + raise ValueError(f"{layer_type_name} is not a valid layer type.") + + num_experts = config.layers_num_experts[layer_index] + for expert_idx in range(num_experts): + expert_tag = f"experts.{expert_idx}." if num_experts > 1 else "" + # Column Linear + final_actions[f"layers.{layer_index}.feed_forward.{expert_tag}gate_proj.weight"] = partial( + fn, is_column=True + ) + final_actions[f"layers.{layer_index}.feed_forward.{expert_tag}up_proj.weight"] = partial( + fn, is_column=True + ) + # Row Linear + final_actions[f"layers.{layer_index}.feed_forward.{expert_tag}down_proj.weight"] = partial( + fn, is_column=False + ) + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config) + return mappings + + def post_init(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). 
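
# Illustrative sketch (toy shapes assumed, not taken from this patch) of the is_column
# convention used by the tensor-parallel mappings above: column-parallel weights
# (q/k/v, gate/up, lm_head) are split along the output dimension, row-parallel weights
# (o_proj, down_proj) along the input dimension. Paddle Linear weights are laid out
# [in_features, out_features].
import paddle

weight = paddle.randn([16, 8])                   # [in_features, out_features]
col_shards = paddle.split(weight, 2, axis=1)     # is_column=True: each of 2 ranks keeps [16, 4]
row_shards = paddle.split(weight, 2, axis=0)     # is_column=False: each of 2 ranks keeps [8, 8]
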
+ """ + self.init_weights() + + @paddle.no_grad() + def _init_weights(self, module): + std = self.config.initializer_range + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + module, + ( + nn.Linear, + nn.Conv1D, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + ), + ): + if isinstance(module.weight, paddle.Tensor): + if module.weight.is_distributed: + with rng_tracker(): + normal_(module.weight, mean=0.0, std=std) + else: + normal_(module.weight, mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv1D)): + if module.bias is not None: + zeros_(module.bias) + elif isinstance(module, nn.Embedding) and hasattr(module, "padding_idx"): + module.weight[module.padding_idx] = 0.0 + + +ALL_DECODER_LAYER_TYPES = {"attention": JambaAttentionDecoderLayer, "mamba": JambaMambaDecoderLayer} + + +# Adapted from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->JAMBA, Mistral->Jamba +class JambaModel(JambaPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`JambaDecoderLayer`] + + Args: + config: JambaConfig + """ + + def __init__(self, config: JambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + # new added + if config.tensor_parallel_degree > 1 and config.sequence_parallel: + logger.warning_once("Currently we donot support sequence parallelism yet!") + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + config.vocab_size, + config.hidden_size, + ) + + self.embed_tokens.padding_idx = self.padding_idx + + decoder_layers = [] + for i in range(config.num_hidden_layers): + layer_class = ALL_DECODER_LAYER_TYPES[config.layers_block_type[i]] + decoder_layers.append(layer_class(config, layer_idx=i)) + self.layers = nn.LayerList(decoder_layers) + + self.final_layernorm = JambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.enable_recompute = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = 
attention_mask + else: + expanded_attn_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + position_ids: paddle.Tensor = None, + past_key_values: HybridMambaAttentionDynamicCache = None, + output_attentions: bool = False, + output_router_logits: bool = False, + use_cache: bool = False, + cache_position: paddle.Tensor = None, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + cache_position, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[paddle.Tensor] = None, + ) -> Union[Tuple, MoEModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.enable_recompute and self.training and use_cache: + logger.warning_once("`use_cache=True` is incompatible with recompute. Setting `use_cache=False`.") + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + logger.warning_once( + "Jamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." 
+ ) + if not use_cache and past_key_values is not None: + past_key_values = None + + batch_size, seq_length = inputs_embeds.shape[:2] + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values is not None: + cache_length = past_key_values.get_seq_length() + seq_length_with_past += cache_length + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + if self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual: + attention_mask = None + + hidden_states = inputs_embeds + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + output_router_logits, + use_cache, + cache_position, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + output_router_logits=output_router_logits, + use_cache=use_cache, + cache_position=cache_position, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + if output_router_logits: + if layer_outputs[-1] is not None: + # append router logits only of expert layers. Regular MLP layers return `None` as the router logits + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoEModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + +class JambaPretrainingCriterion(nn.Layer): + """ + Criterion for Jamba. + It calculates the final loss. 
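
# Illustrative sketch (toy shapes assumed, not taken from this patch) of how the
# criterion below averages the loss: compute a per-token cross entropy with
# ignore_index=-100, then average only over positions whose loss is non-zero, i.e. the
# non-ignored tokens.
import paddle
from paddle import nn

loss_fn = nn.CrossEntropyLoss(reduction="none", ignore_index=-100)
logits = paddle.randn([1, 4, 10])                           # [batch, seq_len, vocab_size]
labels = paddle.to_tensor([[2, 5, -100, -100]])             # last two positions are ignored
per_token = loss_fn(logits.astype("float32"), labels.unsqueeze(2))    # [1, 4, 1]
keep = (per_token > 0).astype("float32")
loss = paddle.sum(per_token * keep) / paddle.clip(keep.sum(), min=1.0)
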
+ """ + + def __init__(self, config: JambaConfig): + + super().__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = ( + config.tensor_parallel_degree > 1 + and config.vocab_size % config.tensor_parallel_degree == 0 + and config.tensor_parallel_output + ) + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + logger.warning_once( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + # masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + # loss = paddle.mean(masked_lm_loss) + binary_sequence = paddle.where( + masked_lm_loss > 0, paddle.ones_like(masked_lm_loss), paddle.zeros_like(masked_lm_loss) + ) + count = paddle.sum(binary_sequence) + if count == 0: + loss = paddle.sum(masked_lm_loss * binary_sequence) + else: + loss = paddle.sum(masked_lm_loss * binary_sequence) / count + + return loss + + +class JambaLMHead(nn.Layer): + def __init__(self, config: JambaConfig): + super().__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! 
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + # if self.config.sequence_parallel: + # hidden_states = GatherOp.apply(hidden_states) + # seq_length = self.config.seq_length + # hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output and self.config.tensor_parallel_degree > 1 + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +# Adapted from transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM with MIXTRAL->JAMBA, Mixtral->Jamba +class JambaForCausalLM(JambaPretrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: JambaConfig): + super().__init__(config) + + self.jamba = JambaModel(config) + assert not config.tie_word_embeddings, "Tied word embeddings are not supported in JambaForCausalLM" + self.lm_head = JambaLMHead(config) + self.criterion = JambaPretrainingCriterion(config) + + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.num_experts + self.num_experts_per_tok = config.num_experts_per_tok + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.jamba.embed_tokens + + def set_input_embeddings(self, value): + self.jamba.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.jamba = decoder + + def get_decoder(self): + return self.jamba + + # Ignore copy + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_router_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[paddle.Tensor] = None, + num_logits_to_keep: Optional[Union[int, None]] = None, + ) -> Union[Tuple, MoECausalLMOutputWithPast]: + r""" + Args: + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int` or `None`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `None`, calculate logits for all + `input_ids`. Only last token logits are needed for generation, and calculating them only for that token + can save memory, which becomes pretty significant for long sequences. 
+ + Returns: + + Example: + + ```python + >>> from paddlenlp.transformers import JambaTokenizer, JambaForCausalLM + + >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1") + >>> tokenizer = JambaTokenizer.from_pretrained("ai21labs/Jamba-v0.1") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pd") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.jamba( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + cache_position=cache_position, + return_dict=return_dict, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + if num_logits_to_keep is None: + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + else: + logits = self.lm_head( + hidden_states[..., -num_logits_to_keep:, :], tensor_parallel_output=tensor_parallel_output + ) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss # make sure to reside in the same device + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoECausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + output_router_logits=False, + cache_position=None, + use_cache=True, + **kwargs, + ): + empty_past_kv = past_key_values is None + + # Omit tokens covered by past_key_values + if not empty_past_kv: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, + input_ids.shape[0], + self.get_input_embeddings().weight.dtype, + ) + + # if 
`inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + model_inputs.update( + { + "position_ids": None, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "output_router_logits": output_router_logits, + "num_logits_to_keep": self.config.num_logits_to_keep, + "cache_position": None, + } + ) + return model_inputs + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, MoECausalLMOutputWithPast) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype(paddle.int64) + else: + attention_mask = paddle.ones_like(input_ids, dtype=paddle.int64) + return attention_mask + + +# Copied from transformers.models.mixtral.modeling_mixtral.MixtralForSequenceClassification with Mixtral->Jamba, MIXTRAL->JAMBA +# class JambaForSequenceClassification(JambaPretrainedModel): +# def __init__(self, config): +# super().__init__(config) +# self.num_labels = config.num_labels +# self.jamba = JambaModel(config) +# self.score = nn.Linear(config.hidden_size, self.num_labels, bias_attr=False) + +# # Initialize weights and apply final processing +# self.post_init() + +# def get_input_embeddings(self): +# return self.jamba.embed_tokens + +# def set_input_embeddings(self, value): +# self.jamba.embed_tokens = value + +# def forward( +# self, +# input_ids: paddle.Tensor = None, +# attention_mask: Optional[paddle.Tensor] = None, +# position_ids: Optional[paddle.Tensor] = None, +# past_key_values: Optional[Union[HybridMambaAttentionDynamicCache, List[paddle.Tensor]]] = None, +# inputs_embeds: Optional[paddle.Tensor] = None, +# labels: Optional[paddle.Tensor] = None, +# use_cache: Optional[bool] = None, +# output_attentions: Optional[bool] = None, +# output_hidden_states: Optional[bool] = None, +# return_dict: Optional[bool] = None, +# ) -> Union[Tuple, SequenceClassifierOutputWithPast]: +# r""" +# labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): +# Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., +# config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If +# `config.num_labels > 1` a classification loss is computed (Cross-Entropy). +# """ +# return_dict = return_dict if return_dict is not None else self.config.use_return_dict + +# transformer_outputs = self.jamba( +# input_ids, +# attention_mask=attention_mask, +# position_ids=position_ids, +# past_key_values=past_key_values, +# inputs_embeds=inputs_embeds, +# use_cache=use_cache, +# output_attentions=output_attentions, +# output_hidden_states=output_hidden_states, +# return_dict=return_dict, +# ) +# hidden_states = transformer_outputs[0] +# logits = self.score(hidden_states) + +# if input_ids is not None: +# batch_size = input_ids.shape[0] +# else: +# batch_size = inputs_embeds.shape[0] + +# if self.config.pad_token_id is None and batch_size != 1: +# raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") +# if self.config.pad_token_id is None: +# sequence_lengths = -1 +# else: +# if input_ids is not None: +# # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility +# sequence_lengths = paddle.equal(input_ids, self.config.pad_token_id).cast("int32").argmax(-1) - 1 +# sequence_lengths = sequence_lengths % input_ids.shape[-1] +# else: +# sequence_lengths = -1 + +# pooled_logits = logits[paddle.arange(batch_size), sequence_lengths] + +# loss = None +# if labels is not None: +# if self.config.problem_type is None: +# if self.num_labels == 1: +# self.config.problem_type = "regression" +# elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): +# self.config.problem_type = "single_label_classification" +# else: +# self.config.problem_type = "multi_label_classification" + +# if self.config.problem_type == "regression": +# loss_fct = MSELoss() +# if self.num_labels == 1: +# loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) +# else: +# loss = loss_fct(pooled_logits, labels) +# elif self.config.problem_type == "single_label_classification": +# loss_fct = CrossEntropyLoss() +# loss = loss_fct( +# pooled_logits.reshape([-1, self.num_labels]), +# labels.reshape( +# [ +# -1, +# ] +# ), +# ) +# elif self.config.problem_type == "multi_label_classification": +# loss_fct = BCEWithLogitsLoss() +# loss = loss_fct(pooled_logits, labels) +# if not return_dict: +# output = (pooled_logits,) + transformer_outputs[1:] +# return ((loss,) + output) if loss is not None else output + +# return SequenceClassifierOutputWithPast( +# loss=loss, +# logits=pooled_logits, +# past_key_values=transformer_outputs.past_key_values, +# hidden_states=transformer_outputs.hidden_states, +# attentions=transformer_outputs.attentions, +# ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/tokenizer.py new file mode 100644 index 000000000..ec3c03a8e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/jamba/tokenizer.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..llama import LlamaTokenizer + +__all__ = ["JambaTokenizer"] + + +class JambaTokenizer(LlamaTokenizer): + model_input_names = ["input_ids", "attention_mask"] + resource_files_names = { + "vocab_file": "sentencepiece.bpe.model", + } + pretrained_resource_files_map = {} + pretrained_init_configuration = {} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/configuration.py new file mode 100644 index 000000000..e9c6511ce --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/configuration.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" LayoutLM model configuration""" + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["LAYOUTLM_PRETRAINED_INIT_CONFIGURATION", "LayoutLMConfig", "LAYOUTLM_PRETRAINED_RESOURCE_FILES_MAP"] + +LAYOUTLM_PRETRAINED_INIT_CONFIGURATION = { + "layoutlm-base-uncased": { + "vocab_size": 30522, + "hidden_size": 768, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "max_2d_position_embeddings": 1024, + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "pad_token_id": 0, + "type_vocab_size": 2, + }, + "layoutlm-large-uncased": { + "vocab_size": 30522, + "hidden_size": 1024, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 512, + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "pad_token_id": 0, + "type_vocab_size": 2, + }, +} + +LAYOUTLM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "layoutlm-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlm/layoutlm-base-uncased/model_state.pdparams", + "layoutlm-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlm/layoutlm-large-uncased/model_state.pdparams", + } +} + + +class LayoutLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`LayoutLMModel`]. It is used to instantiate an LayoutLM Model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the LayoutLM LayoutLM-base-uncased architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, optional, defaults to 30522): + Vocabulary size of the LayoutLMModel model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method of [`LayoutLMModel`]. + embedding_size (`int`, optional, defaults to 768): + Dimensionality of vocabulary embeddings. + hidden_size (`int`, optional, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, optional, defaults to 3072): + The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + hidden_dropout_prob (`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + (e.g., 512 or 1024 or 2048). 
+        max_2d_position_embeddings (`int`, optional, defaults to 1024):
+            The maximum value that the 2D position embeddings might ever be used with. Typically set this to something large just in case (e.g., 1024).
+        type_vocab_size (`int`, optional, defaults to 2):
+            The vocabulary size of the *token_type_ids* passed into [`LayoutLMModel`].
+        initializer_range (`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        classifier_dropout (`float`, optional, defaults to 0.1):
+            The dropout ratio for attached classifiers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+    Example:
+    ```python
+    >>> from paddlenlp.transformers import LayoutLMConfig, LayoutLMModel
+    >>> # Initializing a LayoutLMConfig configuration
+    >>> configuration = LayoutLMConfig()
+    >>> # Initializing a model (with random weights) from the LayoutLM-base style configuration
+    >>> model = LayoutLMModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}
+    pretrained_init_configuration = LAYOUTLM_PRETRAINED_INIT_CONFIGURATION
+    model_type = "layoutlm"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        max_2d_position_embeddings=1024,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        classifier_dropout=0.1,
+        pad_token_id=0,
+        pool_act="tanh",
+        **kwargs
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.max_2d_position_embeddings = max_2d_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.classifier_dropout = classifier_dropout
+        self.pad_token_id = pad_token_id
+        self.pool_act = pool_act
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/modeling.py
new file mode 100644
index 000000000..abdea1f59
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/modeling.py
@@ -0,0 +1,662 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Modeling classes for LayoutLM model.""" + +import paddle +import paddle.nn as nn +from paddle.nn import Layer + +from paddlenlp.utils.log import logger + +from ...layers import Linear as TransposedLinear +from .. import PretrainedModel, register_base_model +from .configuration import ( + LAYOUTLM_PRETRAINED_INIT_CONFIGURATION, + LAYOUTLM_PRETRAINED_RESOURCE_FILES_MAP, + LayoutLMConfig, +) + +__all__ = [ + "LayoutLMModel", + "LayoutLMPretrainedModel", + "LayoutLMForMaskedLM", + "LayoutLMForTokenClassification", + "LayoutLMForSequenceClassification", +] + + +class LayoutLMPooler(Layer): + def __init__(self, config: LayoutLMConfig): + super(LayoutLMPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + self.pool_act = config.pool_act + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.pool_act == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class LayoutLMEmbeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: LayoutLMConfig): + super(LayoutLMEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + # gry add for layoutlm + self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.hidden_size) + # end of gry add for layoutlm + # self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size, padding_idx=pad_token_id) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def forward(self, input_ids, bbox=None, token_type_ids=None, position_ids=None): + # input_shape = input_ids.size() + # seq_length = input_shape[1] + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + + position_ids = seq_length - ones + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + word_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + + # gry add + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + 
upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox`coordinate values should be within 0-1000 range.") from e + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0]) + # end of gry add + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = ( + word_embeddings + + position_embeddings + + left_position_embeddings + + upper_position_embeddings + + right_position_embeddings + + lower_position_embeddings + + h_position_embeddings + + w_position_embeddings + + token_type_embeddings + ) + + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class LayoutLMPretrainedModel(PretrainedModel): + config_class = LayoutLMConfig + pretrained_init_configuration = LAYOUTLM_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = LAYOUTLM_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "layoutlm" + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class LayoutLMModel(LayoutLMPretrainedModel): + """ + The bare LayoutLM Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of the LayoutLM model. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling LayoutLMModel. + hidden_size (int): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (int): + Number of hidden layers in the Transformer encoder. + num_attention_heads (int): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (int): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. + hidden_dropout_prob (float): + The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (float): + The dropout probability for all fully connected layers in the pooler. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `16`. + initializer_range (float): + The standard deviation of the normal initializer. + Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. 
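+            In this implementation the distribution has mean 0.0 and standard deviation equal to `initializer_range` (see `_init_weights` above).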
+ See :meth:`LayoutLMPretrainedModel.init_weights()` for how weights are initialized in `LayoutLMModel`. + + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + pool_act (str, optional): + The non-linear activation function in the pooling layer. + Defaults to `"tanh"`. + """ + + def __init__(self, config: LayoutLMConfig): + super(LayoutLMModel, self).__init__(config) + # self.config = kwargs + self.num_hidden_layers = config.num_hidden_layers + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = LayoutLMEmbeddings(config) + + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = LayoutLMPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + num_position_embeds_diff = new_num_position_embeddings - self.config["max_position_embeddings"] + + # no resizing needs to be done if the length stays the same + if num_position_embeds_diff == 0: + return + + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config.max_position_embeddings = new_num_position_embeddings + + old_position_embeddings_weight = self.embeddings.position_embeddings.weight + + self.embeddings.position_embeddings = nn.Embedding( + self.config.max_position_embeddings, self.config.hidden_size + ) + + with paddle.no_grad(): + if num_position_embeds_diff > 0: + self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = old_position_embeddings_weight + else: + self.embeddings.position_embeddings.weight = old_position_embeddings_weight[:num_position_embeds_diff] + + def forward( + self, + input_ids=None, + bbox=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + output_hidden_states=False, + ): + r""" + The LayoutLMModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. 
+ position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `False`. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + """ + + input_shape = input_ids.shape + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + if bbox is None: + bbox = paddle.zeros(tuple(list(input_shape) + [4]), dtype="int64") + + embedding_output = self.embeddings( + input_ids=input_ids, + bbox=bbox, + position_ids=position_ids, + token_type_ids=token_type_ids, + ) + + if output_hidden_states: + output = embedding_output + encoder_outputs = [] + for mod in self.encoder.layers: + output = mod(output, src_mask=attention_mask) + encoder_outputs.append(output) + if self.encoder.norm is not None: + encoder_outputs[-1] = self.encoder.norm(encoder_outputs[-1]) + pooled_output = self.pooler(encoder_outputs[-1]) + else: + sequence_output = self.encoder(embedding_output, attention_mask) + pooled_output = self.pooler(sequence_output) + if output_hidden_states: + return encoder_outputs, pooled_output + else: + return sequence_output, pooled_output + + +class LayoutLMForTokenClassification(LayoutLMPretrainedModel): + """ + LayoutLM Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`LayoutLMConfig`): + An instance of LayoutLMConfig used to construct LayoutLMForTokenClassification. 
+ """ + + def __init__(self, config: LayoutLMConfig): + super(LayoutLMForTokenClassification, self).__init__(config) + self.num_classes = config.num_classes + self.layoutlm = LayoutLMModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + self.classifier.apply(self._init_weights) + + def get_input_embeddings(self): + return self.layoutlm.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutlm.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids, + bbox=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + output_hidden_states=False, + ): + r""" + The LayoutLMForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`LayoutLMModel`. + bbox (Tensor): + See :class:`LayoutLMModel`. + attention_mask (list, optional): + See :class:`LayoutLMModel`. + token_type_ids (Tensor, optional): + See :class:`LayoutLMModel`. + position_ids(Tensor, optional): + See :class:`LayoutLMModel`. + output_hidden_states(Tensor, optional): + See :class:`LayoutLMModel`. + + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LayoutLMFForTokenClassification + from paddlenlp.transformers import LayoutLMFTokenizer + + tokenizer = LayoutLMFTokenizer.from_pretrained('layoutlm-base-uncased') + model = LayoutLMFForTokenClassification.from_pretrained('layoutlm-base-uncased', num_classes=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors="pd") + + logits = model(**inputs) + print(logits.shape) + # [1, 13, 2] + + """ + if attention_mask is not None: + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype("int64") + outputs = self.layoutlm( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_hidden_states=False, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits + + +class LayoutLMForSequenceClassification(LayoutLMPretrainedModel): + """ + LayoutLM Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`LayoutLMConfig`): + An instance of LayoutLMConfig used to construct LayoutLMForSequenceClassification. 
+ """ + + def __init__(self, config: LayoutLMConfig): + super(LayoutLMForSequenceClassification, self).__init__(config) + self.layoutlm = LayoutLMModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.num_classes = config.num_classes + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + + def get_input_embeddings(self): + return self.layoutlm.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutlm.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids, + bbox=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + output_hidden_states=False, + ): + r""" + The LayoutLMForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`LayoutLMModel`. + bbox (Tensor): + See :class:`LayoutLMModel`. + attention_mask (list, optional): + See :class:`LayoutLMModel`. + token_type_ids (Tensor, optional): + See :class:`LayoutLMModel`. + position_ids(Tensor, optional): + See :class:`LayoutLMModel`. + output_hidden_states(Tensor, optional): + See :class:`LayoutLMModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LayoutLMForSequenceClassification + from paddlenlp.transformers import LayoutLMTokenizer + + tokenizer = LayoutLMTokenizer.from_pretrained('layoutlm-base-uncased') + model = LayoutLMForSequenceClassification.from_pretrained('layoutlm-base-uncased', num_classes=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors="pd") + + logits = model(**inputs) + print(logits.shape) + # [1, 2] + + """ + outputs = self.layoutlm( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_hidden_states=output_hidden_states, + ) + pooled_outputs = outputs[1] + pooled_outputs = self.dropout(pooled_outputs) + logits = self.classifier(pooled_outputs) + return logits + + +class LayoutLMLMPredictionHead(Layer): + """ + LayoutLM Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ + + def __init__(self, config: LayoutLMConfig, weight_attr=None): + super(LayoutLMLMPredictionHead, self).__init__() + self.transform = nn.Linear(config.hidden_size, config.hidden_size, weight_attr=weight_attr) + self.activation = getattr(nn.functional, config.hidden_act) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.decoder = TransposedLinear(config.hidden_size, config.vocab_size) + # link bias to load pretrained weights + self.decoder_bias = self.decoder.bias + # self.decoder_weight = ( + # self.create_parameter(shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, is_bias=False) + # if embedding_weights is None + # else embedding_weights + # ) + # self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class LayoutLMOnlyMLMHead(nn.Layer): + def __init__(self, config: LayoutLMConfig, weight_attr=None): + super().__init__() + self.predictions = LayoutLMLMPredictionHead(config, weight_attr=weight_attr) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +class LayoutLMForMaskedLM(LayoutLMPretrainedModel): + """ + LayoutLM Model with a `masked language modeling` head on top. + + Args: + config (:class:`LayoutLMConfig`): + An instance of LayoutLMConfig used to construct LayoutLMForMaskedLM. + + """ + + def __init__(self, config: LayoutLMConfig): + super(LayoutLMForMaskedLM, self).__init__(config) + self.layoutlm = LayoutLMModel(config) + self.cls = LayoutLMOnlyMLMHead(config) + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutlm.resize_position_embeddings(new_num_position_embeddings) + + def forward(self, input_ids, bbox=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`LayoutLMModel`. + bbox (Tensor): + See :class:`LayoutLMModel`. + token_type_ids (Tensor, optional): + See :class:`LayoutLMModel`. + position_ids (Tensor, optional): + See :class:`LayoutLMModel`. + attention_mask (Tensor, optional): + See :class:`LayoutLMModel`. + + Returns: + Tensor: Returns tensor `prediction_scores`, The scores of masked token prediction. + Its data type should be float32 and shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import LayoutLMForMaskedLM, LayoutLMTokenizer + + tokenizer = LayoutLMTokenizer.from_pretrained('layoutlm-base-uncased') + model = LayoutLMForMaskedLM.from_pretrained('layoutlm-base-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors="pd") + + logits = model(**inputs) + print(logits.shape) + + """ + + outputs = self.layoutlm( + input_ids, + bbox=bbox, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions=None) + return prediction_scores diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/tokenizer.py new file mode 100644 index 000000000..51db3820f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlm/tokenizer.py @@ -0,0 +1,42 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for LayoutLM model.""" +from ..bert.tokenizer import BertTokenizer + +__all__ = ["LayoutLMTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"layoutlm-base-uncased": 512, "layoutlm-large-uncased": 512} + + +class LayoutLMTokenizer(BertTokenizer): + """ + The usage of LayoutLMTokenizer is the same as + `BertTokenizer `__. + For more information regarding those methods, please refer to this superclass. + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "layoutlm-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlm/layoutlm-base-uncased/vocab.txt", + "layoutlm-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlm/layoutlm-large-uncased/vocab.txt", + } + } + pretrained_init_configuration = { + "layoutlm-base-uncased": {"do_lower_case": True}, + "layoutlm-large-uncased": {"do_lower_case": True}, + } + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/configuration.py new file mode 100644 index 000000000..55facb107 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/configuration.py @@ -0,0 +1,252 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" LayoutLMv2 model configuration""" + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["LAYOUTLMV2_PRETRAINED_INIT_CONFIGURATION", "LayoutLMv2Config", "LAYOUTLMV2_PRETRAINED_RESOURCE_FILES_MAP"] + +LAYOUTLMV2_PRETRAINED_INIT_CONFIGURATION = { + "layoutlmv2-base-uncased": { + "attention_probs_dropout_prob": 0.1, + "coordinate_size": 128, + "fast_qkv": True, + "gradient_checkpointing": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "image_feature_pool_shape": [7, 7, 256], + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 512, + "max_rel_2d_pos": 256, + "max_rel_pos": 128, + "model_type": "layoutlmv2", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": True, + "pad_token_id": 0, + "shape_size": 128, + "rel_2d_pos_bins": 64, + "rel_pos_bins": 32, + "type_vocab_size": 2, + "vocab_size": 30522, + "has_relative_attention_bias": True, + "has_spatial_attention_bias": True, + "has_visual_segment_embedding": False, + "use_visual_backbone": True, + }, + "layoutlmv2-large-uncased": { + "attention_probs_dropout_prob": 0.1, + "coordinate_size": 171, + "fast_qkv": False, + "gradient_checkpointing": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "image_feature_pool_shape": [7, 7, 256], + "initializer_range": 0.02, + "intermediate_size": 4096, + "layer_norm_eps": 1e-12, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 512, + "max_rel_2d_pos": 256, + "max_rel_pos": 128, + "model_type": "layoutlmv2", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "output_past": True, + "pad_token_id": 0, + "shape_size": 170, + "rel_2d_pos_bins": 64, + "rel_pos_bins": 32, + "type_vocab_size": 2, + "vocab_size": 30522, + "has_relative_attention_bias": True, + "has_spatial_attention_bias": True, + "has_visual_segment_embedding": False, + "use_visual_backbone": 
True,
+    },
+    "vi-layoutlmv2-base-uncased": {
+        "attention_probs_dropout_prob": 0.1,
+        "coordinate_size": 128,
+        "fast_qkv": True,
+        "gradient_checkpointing": False,
+        "hidden_act": "gelu",
+        "hidden_dropout_prob": 0.1,
+        "hidden_size": 768,
+        "image_feature_pool_shape": [7, 7, 256],
+        "initializer_range": 0.02,
+        "intermediate_size": 3072,
+        "layer_norm_eps": 1e-12,
+        "max_2d_position_embeddings": 1024,
+        "max_position_embeddings": 512,
+        "max_rel_2d_pos": 256,
+        "max_rel_pos": 128,
+        "model_type": "layoutlmv2",
+        "num_attention_heads": 12,
+        "num_hidden_layers": 12,
+        "output_past": True,
+        "pad_token_id": 0,
+        "shape_size": 128,
+        "rel_2d_pos_bins": 64,
+        "rel_pos_bins": 32,
+        "type_vocab_size": 2,
+        "vocab_size": 30522,
+        "has_relative_attention_bias": True,
+        "has_spatial_attention_bias": True,
+        "has_visual_segment_embedding": False,
+        "use_visual_backbone": False,
+    },
+}
+
+LAYOUTLMV2_PRETRAINED_RESOURCE_FILES_MAP = {
+    "model_state": {
+        "layoutlmv2-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlmv2/layoutlmv2-base-uncased/model_state.pdparams",
+        "layoutlmv2-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlmv2/layoutlmv2-large-uncased/model_state.pdparams",
+        "vi-layoutlmv2-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlmv2/vi-layoutlmv2-base-uncased/model_state.pdparams",
+    }
+}
+
+
+class LayoutLMv2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LayoutLMv2Model`]. It is used to instantiate a LayoutLMv2 Model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the LayoutLMv2 layoutlmv2-base-uncased architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, optional, defaults to 30522):
+            Vocabulary size of the LayoutLMv2 model. Defines the different tokens that can be represented by the
+            *inputs_ids* passed to the forward method of [`LayoutLMv2Model`].
+        hidden_size (`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, optional, defaults to 3072):
+            The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        hidden_dropout_prob (`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, optional, defaults to 2):
+            The vocabulary size of the *token_type_ids* passed into [`LayoutLMv2Model`].
+        initializer_range (`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        classifier_dropout (`float`, optional, defaults to 0.1):
+            The dropout ratio for attached classifiers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+    Example:
+    ```python
+    >>> from paddlenlp.transformers import LayoutLMv2Config, LayoutLMv2Model
+    >>> # Initializing a LayoutLMv2 configuration
+    >>> configuration = LayoutLMv2Config()
+    >>> # Initializing a model (with random weights) from the layoutlmv2-base style configuration
+    >>> model = LayoutLMv2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}
+    pretrained_init_configuration = LAYOUTLMV2_PRETRAINED_INIT_CONFIGURATION
+    model_type = "layoutlmv2"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        max_2d_position_embeddings=1024,
+        max_rel_pos=128,
+        max_rel_2d_pos=256,
+        rel_pos_bins=32,
+        rel_2d_pos_bins=64,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        coordinate_size=128,
+        shape_size=128,
+        image_feature_pool_shape=[7, 7, 256],
+        fast_qkv=True,
+        has_relative_attention_bias=True,
+        has_spatial_attention_bias=True,
+        has_visual_segment_embedding=False,
+        output_past=True,
+        gradient_checkpointing=False,
+        classifier_dropout=0.1,
+        pad_token_id=0,
+        bos_token_id=2,
+        eos_token_id=3,
+        use_cache=True,
+        with_pool="tanh",
+        use_visual_backbone=True,
+        **kwargs
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.max_2d_position_embeddings = max_2d_position_embeddings
+        self.max_rel_pos = max_rel_pos
+        self.max_rel_2d_pos = max_rel_2d_pos
+        self.rel_pos_bins = rel_pos_bins
+        self.rel_2d_pos_bins = rel_2d_pos_bins
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.coordinate_size = coordinate_size
+        self.shape_size = shape_size
+        self.image_feature_pool_shape = image_feature_pool_shape
+        self.fast_qkv = fast_qkv
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.has_spatial_attention_bias = has_spatial_attention_bias
+        self.has_visual_segment_embedding = has_visual_segment_embedding
+        self.output_past = output_past
+        self.gradient_checkpointing = gradient_checkpointing
+        self.classifier_dropout = classifier_dropout
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
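+        # `use_visual_backbone` distinguishes the image+text checkpoints (True) from the
+        # text-and-layout-only vi- variants (False in the vi-layoutlmv2-base-uncased preset above).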
+ self.use_cache = use_cache + self.with_pool = with_pool + self.use_visual_backbone = use_visual_backbone diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/modeling.py new file mode 100644 index 000000000..83212f9fe --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/modeling.py @@ -0,0 +1,1203 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Modeling classes for LayoutLMv2 model.""" + +import copy +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import CrossEntropyLoss, Layer + +from paddlenlp.utils.log import logger + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..layoutxlm.visual_backbone import build_resnet_fpn_backbone, read_config +from .configuration import ( + LAYOUTLMV2_PRETRAINED_INIT_CONFIGURATION, + LAYOUTLMV2_PRETRAINED_RESOURCE_FILES_MAP, + LayoutLMv2Config, +) + +__all__ = [ + "LayoutLMv2Model", + "LayoutLMv2PretrainedModel", + "LayoutLMv2ForTokenClassification", + "LayoutLMv2ForPretraining", + "LayoutLMv2ForRelationExtraction", +] + + +def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + ret = 0 + if bidirectional: + num_buckets //= 2 + ret += (relative_position > 0).astype(paddle.int64) * num_buckets + n = paddle.abs(relative_position) + else: + n = paddle.max(-relative_position, paddle.zeros_like(relative_position)) + # now n is in the range [0, inf) + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = n < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + val_if_large = max_exact + ( + paddle.log(n.astype(paddle.float32) / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).astype(paddle.int64) + + val_if_large = paddle.minimum(val_if_large, paddle.full_like(val_if_large, num_buckets - 1)) + + ret += paddle.where(is_small, n, val_if_large) + return ret + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMPooler with XLM->LMv2 +class LayoutLMv2Pooler(Layer): + def __init__(self, hidden_size, with_pool): + super(LayoutLMv2Pooler, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + self.with_pool = with_pool + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
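+ # (i.e. the position-0 token); the Tanh is applied only when `with_pool == "tanh"`.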
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.with_pool == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMEmbeddings with XLM->LMv2 +class LayoutLMv2Embeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config): + super(LayoutLMv2Embeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def _cal_spatial_position_embeddings(self, bbox): + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox`coordinate values should be within 0-1000 range.") from e + + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0]) + + spatial_position_embeddings = paddle.concat( + [ + left_position_embeddings, + upper_position_embeddings, + right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ], + axis=-1, + ) + return spatial_position_embeddings + + def forward(self, input_ids, bbox=None, token_type_ids=None, position_ids=None): + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + + position_ids = seq_length - ones + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + input_embedings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox`coordinate values should be within 0-1000 range.") from e + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0]) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = ( + input_embedings + + position_embeddings + 
+ left_position_embeddings + + upper_position_embeddings + + right_position_embeddings + + lower_position_embeddings + + h_position_embeddings + + w_position_embeddings + + token_type_embeddings + ) + + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class LayoutLMv2PretrainedModel(PretrainedModel): + model_config_file = CONFIG_NAME + config_class = LayoutLMv2Config + resource_files_names = {"model_state": "model_state.pdparams"} + + base_model_prefix = "layoutlmv2" + + pretrained_init_configuration = LAYOUTLMV2_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = LAYOUTLMV2_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMSelfOutput with XLM->LMv2 +class LayoutLMv2SelfOutput(nn.Layer): + def __init__(self, config): + super(LayoutLMv2SelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMSelfAttention with XLM->LMv2 +class LayoutLMv2SelfAttention(nn.Layer): + def __init__(self, config): + super(LayoutLMv2SelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size {} is not a multiple of the number of attention " + "heads {}".format(config.hidden_size, config.num_attention_heads) + ) + self.fast_qkv = config.fast_qkv + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.has_relative_attention_bias = config.has_relative_attention_bias + self.has_spatial_attention_bias = config.has_spatial_attention_bias + + if self.fast_qkv: + self.qkv_linear = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias_attr=False) + self.q_bias = self.create_parameter( + shape=[1, 1, self.all_head_size], default_initializer=nn.initializer.Constant(0.0) + ) + self.v_bias = self.create_parameter( + shape=[1, 1, self.all_head_size], default_initializer=nn.initializer.Constant(0.0) + ) + else: + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = list(x.shape[:-1]) + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def compute_qkv(self, hidden_states): + if self.fast_qkv: + qkv = self.qkv_linear(hidden_states) + q, k, v = paddle.chunk(qkv, 3, axis=-1) + if q.ndimension() == self.q_bias.ndimension(): + q = q + self.q_bias + v = v + self.v_bias + else: + _sz = (1,) * 
(q.ndimension() - 1) + (-1,) + q = q + self.q_bias.reshape(_sz) + v = v + self.v_bias.vreshape(_sz) + else: + q = self.query(hidden_states) + k = self.key(hidden_states) + v = self.value(hidden_states) + return q, k, v + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + q, k, v = self.compute_qkv(hidden_states) + + # (B, L, H*D) -> (B, H, L, D) + query_layer = self.transpose_for_scores(q) + key_layer = self.transpose_for_scores(k) + value_layer = self.transpose_for_scores(v) + + query_layer = query_layer / math.sqrt(self.attention_head_size) + # [BSZ, NAT, L, L] + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + if self.has_relative_attention_bias: + attention_scores += rel_pos + if self.has_spatial_attention_bias: + attention_scores += rel_2d_pos + + bool_attention_mask = attention_mask.astype(paddle.bool) + bool_attention_mask.stop_gradient = True + attention_scores_shape = attention_scores.shape + attention_scores = paddle.where( + bool_attention_mask.expand(attention_scores_shape), + paddle.ones(attention_scores_shape) * float("-1e10"), + attention_scores, + ) + + attention_probs = F.softmax(attention_scores, axis=-1) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = list(context_layer.shape[:-2]) + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + if output_attentions: + outputs = [context_layer, attention_probs] + else: + outputs = [context_layer] + return outputs + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMAttention with XLM->LMv2 +class LayoutLMv2Attention(nn.Layer): + def __init__(self, config): + super(LayoutLMv2Attention, self).__init__() + self.self = LayoutLMv2SelfAttention(config) + self.output = LayoutLMv2SelfOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self.output(self_outputs[0], hidden_states) + if output_attentions: + outputs = [ + attention_output, + ] + self_outputs[1:] + else: + outputs = [attention_output] + return outputs + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMEncoder with XLM->LMv2 +class LayoutLMv2Encoder(nn.Layer): + def __init__(self, config): + super(LayoutLMv2Encoder, self).__init__() + self.config = config + self.layer = nn.LayerList([LayoutLMv2Layer(config) for _ in range(config.num_hidden_layers)]) + + self.has_relative_attention_bias = config.has_relative_attention_bias + self.has_spatial_attention_bias = config.has_spatial_attention_bias + + if self.has_relative_attention_bias: + self.rel_pos_bins = config.rel_pos_bins + self.max_rel_pos = config.max_rel_pos + self.rel_pos_onehot_size = config.rel_pos_bins + self.rel_pos_bias 
= nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias_attr=False) + + if self.has_spatial_attention_bias: + self.max_rel_2d_pos = config.max_rel_2d_pos + self.rel_2d_pos_bins = config.rel_2d_pos_bins + self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins + self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias_attr=False) + self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias_attr=False) + + def _cal_1d_pos_emb(self, hidden_states, position_ids): + rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1) + rel_pos = relative_position_bucket( + rel_pos_mat, + num_buckets=self.rel_pos_bins, + max_distance=self.max_rel_pos, + ) + rel_pos = paddle.nn.functional.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).astype( + hidden_states.dtype + ) + rel_pos = self.rel_pos_bias(rel_pos).transpose([0, 3, 1, 2]) + return rel_pos + + def _cal_2d_pos_emb(self, hidden_states, bbox): + position_coord_x = bbox[:, :, 0] + position_coord_y = bbox[:, :, 3] + rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1) + rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1) + rel_pos_x = relative_position_bucket( + rel_pos_x_2d_mat, + num_buckets=self.rel_2d_pos_bins, + max_distance=self.max_rel_2d_pos, + ) + rel_pos_y = relative_position_bucket( + rel_pos_y_2d_mat, + num_buckets=self.rel_2d_pos_bins, + max_distance=self.max_rel_2d_pos, + ) + rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).astype(hidden_states.dtype) + rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).astype(hidden_states.dtype) + rel_pos_x = self.rel_pos_x_bias(rel_pos_x).transpose([0, 3, 1, 2]) + rel_pos_y = self.rel_pos_y_bias(rel_pos_y).transpose([0, 3, 1, 2]) + rel_2d_pos = rel_pos_x + rel_pos_y + return rel_2d_pos + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + bbox=None, + position_ids=None, + ): + all_hidden_states = () if output_hidden_states else None + + rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids) if self.has_relative_attention_bias else None + rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None + + hidden_save = dict() + hidden_save["input_hidden_states"] = hidden_states + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + # gradient_checkpointing is set as False here so we remove some codes here + hidden_save["input_attention_mask"] = attention_mask + hidden_save["input_layer_head_mask"] = layer_head_mask + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + + hidden_states = layer_outputs[0] + + hidden_save["{}_data".format(i)] = hidden_states + + return hidden_states, hidden_save + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMIntermediate with XLM->LMv2 +class LayoutLMv2Intermediate(nn.Layer): + def __init__(self, config): + super(LayoutLMv2Intermediate, 
self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if config.hidden_act == "gelu": + self.intermediate_act_fn = nn.GELU() + else: + assert False, "hidden_act is set as: {}, please check it..".format(config.hidden_act) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMOutput with XLM->LMv2 +class LayoutLMv2Output(nn.Layer): + def __init__(self, config): + super(LayoutLMv2Output, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMLayer with XLM->LMv2 +class LayoutLMv2Layer(nn.Layer): + def __init__(self, config): + super(LayoutLMv2Layer, self).__init__() + # since chunk_size_feed_forward is 0 as default, no chunk is needed here. + self.seq_len_dim = 1 + self.attention = LayoutLMv2Attention(config) + self.add_cross_attention = False # default as false + self.intermediate = LayoutLMv2Intermediate(config) + self.output = LayoutLMv2Output(config) + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self_attention_outputs[0] + + layer_output = self.feed_forward_chunk(attention_output) + + if output_attentions: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + outputs = [ + layer_output, + ] + list(outputs) + else: + outputs = [layer_output] + return outputs + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.VisualBackbone +class VisualBackbone(nn.Layer): + def __init__(self, config): + super(VisualBackbone, self).__init__() + self.cfg = read_config() + self.backbone = build_resnet_fpn_backbone(self.cfg) + + assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD) + num_channels = len(self.cfg.MODEL.PIXEL_MEAN) + self.register_buffer("pixel_mean", paddle.to_tensor(self.cfg.MODEL.PIXEL_MEAN).reshape([num_channels, 1, 1])) + self.register_buffer("pixel_std", paddle.to_tensor(self.cfg.MODEL.PIXEL_STD).reshape([num_channels, 1, 1])) + self.out_feature_key = "p2" + # is_deterministic is disabled here. 
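+ # The FPN "p2" feature map is average-pooled down to `image_feature_pool_shape[:2]`
+ # (7x7 by default); if only the spatial dims were configured, the pooled channel
+ # count is appended to the config entry below.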
+ self.pool = nn.AdaptiveAvgPool2D(config.image_feature_pool_shape[:2]) + if len(config.image_feature_pool_shape) == 2: + config.image_feature_pool_shape.append(self.backbone.output_shape()[self.out_feature_key].channels) + assert self.backbone.output_shape()[self.out_feature_key].channels == config.image_feature_pool_shape[2] + + def forward(self, images): + images_input = (paddle.to_tensor(images) - self.pixel_mean) / self.pixel_std + features = self.backbone(images_input) + features = features[self.out_feature_key] + features = self.pool(features).flatten(start_axis=2).transpose([0, 2, 1]) + return features + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMModel with XLM->LMv2 +@register_base_model +class LayoutLMv2Model(LayoutLMv2PretrainedModel): + """ + The bare LayoutLMv2 Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (`int`): + Vocabulary size of the XLNet model. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling XLNetModel. + hidden_size (`int`, optional): + Dimensionality of the encoder layers and the pooler layer. Defaults to ``768``. + num_hidden_layers (`int`, optional): + Number of hidden layers in the Transformer encoder. Defaults to ``12``. + num_attention_heads (`int`, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to ``12``. + intermediate_size (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + Defaults to ``3072``. + hidden_act (`str`, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + hidden_dropout_prob (`float`, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to ``0.1``. + attention_probs_dropout_prob (`float`, optional): + The dropout probability for all fully connected layers in the pooler. + Defaults to ``0.1``. + initializer_range (`float`, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Defaults to ``0.02``. 
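+
+ Example:
+ A minimal usage sketch. The tensor shapes, the random inputs and the
+ ``use_visual_backbone=False`` switch are illustrative assumptions (not
+ requirements of the class), and both classes are assumed to be exported at
+ the `paddlenlp.transformers` level as in the configuration example above:
+
+ ```python
+ >>> import paddle
+ >>> from paddlenlp.transformers import LayoutLMv2Config, LayoutLMv2Model
+ >>> config = LayoutLMv2Config(use_visual_backbone=False)
+ >>> model = LayoutLMv2Model(config)
+ >>> input_ids = paddle.randint(0, config.vocab_size, shape=[1, 16])
+ >>> bbox = paddle.zeros([1, 16, 4], dtype="int64")
+ >>> sequence_output, pooled_output, hidden_dict = model(input_ids=input_ids, bbox=bbox, image=None)
+ ```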
+ """ + + def __init__(self, config): + super(LayoutLMv2Model, self).__init__(config) + self.use_visual_backbone = config.use_visual_backbone + self.has_visual_segment_embedding = config.has_visual_segment_embedding + self.embeddings = LayoutLMv2Embeddings(config) + + if self.use_visual_backbone is True: + self.visual = VisualBackbone(config) + self.visual.stop_gradient = True + self.visual_proj = nn.Linear(config.image_feature_pool_shape[-1], config.hidden_size) + if self.has_visual_segment_embedding: + self.visual_segment_embedding = self.create_parameter( + shape=[ + config.hidden_size, + ], + dtype=paddle.float32, + ) + self.visual_LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.visual_dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = LayoutLMv2Encoder(config) + self.pooler = LayoutLMv2Pooler(config.hidden_size, config.with_pool) + self.config = config + + def _calc_text_embeddings(self, input_ids, bbox, position_ids, token_type_ids): + words_embeddings = self.embeddings.word_embeddings(input_ids) + position_embeddings = self.embeddings.position_embeddings(position_ids) + spatial_position_embeddings = self.embeddings._cal_spatial_position_embeddings(bbox) + token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids) + embeddings = words_embeddings + position_embeddings + spatial_position_embeddings + token_type_embeddings + embeddings = self.embeddings.LayerNorm(embeddings) + embeddings = self.embeddings.dropout(embeddings) + return embeddings + + def _calc_img_embeddings(self, image, bbox, position_ids): + position_embeddings = self.embeddings.position_embeddings(position_ids) + spatial_position_embeddings = self.embeddings._cal_spatial_position_embeddings(bbox) + if self.use_visual_backbone is True: + visual_embeddings = self.visual_proj(self.visual(image.astype(paddle.float32))) + embeddings = visual_embeddings + position_embeddings + spatial_position_embeddings + else: + embeddings = position_embeddings + spatial_position_embeddings + if self.has_visual_segment_embedding: + embeddings += self.visual_segment_embedding + embeddings = self.visual_LayerNorm(embeddings) + embeddings = self.visual_dropout(embeddings) + return embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
+ """ + num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings + + # no resizing needs to be done if the length stays the same + if num_position_embeds_diff == 0: + return + + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config.max_position_embeddings = new_num_position_embeddings + + old_position_embeddings_weight = self.embeddings.position_embeddings.weight + + self.embeddings.position_embeddings = nn.Embedding( + self.config.max_position_embeddings, self.config.hidden_size + ) + + with paddle.no_grad(): + if num_position_embeds_diff > 0: + self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = old_position_embeddings_weight + else: + self.embeddings.position_embeddings.weight = old_position_embeddings_weight[:num_position_embeds_diff] + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + output_hidden_states=False, + output_attentions=False, + ): + input_shape = input_ids.shape + + visual_shape = list(input_shape) + visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1] + + visual_bbox_x = ( + paddle.arange( + 0, + 1000 * (self.config.image_feature_pool_shape[1] + 1), + 1000, + dtype=bbox.dtype, + ) + // self.config.image_feature_pool_shape[1] + ) + visual_bbox_y = ( + paddle.arange( + 0, + 1000 * (self.config.image_feature_pool_shape[0] + 1), + 1000, + dtype=bbox.dtype, + ) + // self.config.image_feature_pool_shape[0] + ) + + expand_shape = self.config.image_feature_pool_shape[0:2] + + visual_bbox = paddle.stack( + [ + visual_bbox_x[:-1].expand(expand_shape), + visual_bbox_y[:-1].expand(expand_shape[::-1]).transpose([1, 0]), + visual_bbox_x[1:].expand(expand_shape), + visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), + ], + axis=-1, + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) + visual_bbox = visual_bbox.expand([input_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) + final_bbox = paddle.concat([bbox, visual_bbox], axis=1) + + if attention_mask is None: + attention_mask = paddle.ones(input_shape) + + if self.use_visual_backbone is True: + visual_attention_mask = paddle.ones(visual_shape) + else: + visual_attention_mask = paddle.zeros(visual_shape) + + attention_mask = attention_mask.astype(visual_attention_mask.dtype) + + final_attention_mask = paddle.concat([attention_mask, visual_attention_mask], axis=1) + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + if position_ids is None: + seq_length = input_shape[1] + position_ids = self.embeddings.position_ids[:, :seq_length] + position_ids = position_ids.expand(input_shape) + + visual_position_ids = paddle.arange(0, visual_shape[1]).expand([input_shape[0], visual_shape[1]]) + final_position_ids = paddle.concat([position_ids, visual_position_ids], axis=1) + + if bbox is None: + bbox = paddle.zeros(input_shape + [4]) + + text_layout_emb = self._calc_text_embeddings( + input_ids=input_ids, + bbox=bbox, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + + visual_emb = self._calc_img_embeddings( + image=image, + bbox=visual_bbox, + position_ids=visual_position_ids, + ) + final_emb = paddle.concat([text_layout_emb, visual_emb], axis=1) + + extended_attention_mask = final_attention_mask.unsqueeze(1).unsqueeze(2) + + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + if 
head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + final_emb, + extended_attention_mask, + bbox=final_bbox, + position_ids=final_position_ids, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + # i_data (i in [0, 12) is the key of the hidden states + return sequence_output, pooled_output, encoder_outputs[1] + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMForTokenClassification with XLM->LMv2 +class LayoutLMv2ForTokenClassification(LayoutLMv2PretrainedModel): + def __init__(self, config): + super(LayoutLMv2ForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.layoutlmv2 = LayoutLMv2Model(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + self.num_hidden_layers = config.num_hidden_layers + + def get_input_embeddings(self): + return self.layoutlmv2.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
+ """ + self.layoutlmv2.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None, + ): + outputs = self.layoutlmv2( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + # sequence out and image out + sequence_output, _ = outputs[0][:, :seq_length], outputs[0][:, seq_length:] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + hidden_states = {f"hidden_states_{idx}": outputs[2][f"{idx}_data"] for idx in range(self.num_hidden_layers)} + + if self.training: + outputs = logits, hidden_states + else: + outputs = (logits,) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + + if attention_mask is not None: + active_loss = ( + attention_mask.reshape( + [ + -1, + ] + ) + == 1 + ) + active_logits = logits.reshape([-1, self.num_classes])[active_loss] + active_labels = labels.reshape( + [ + -1, + ] + )[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.reshape([-1, self.num_classes]), + labels.reshape( + [ + -1, + ] + ), + ) + + outputs = (loss,) + outputs + + return outputs + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMPredictionHead with XLM->LMv2 +class LayoutLMv2PredictionHead(Layer): + """ + Bert Model with a `language modeling` head on top for CLM fine-tuning. + """ + + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(LayoutLMv2PredictionHead, self).__init__() + self.transform = nn.Linear(hidden_size, hidden_size) + self.activation = getattr(nn.functional, activation) + self.layer_norm = nn.LayerNorm(hidden_size) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, is_bias=False) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMPretrainingHeads with XLM->LMv2 +class LayoutLMv2PretrainingHeads(Layer): + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(LayoutLMv2PretrainingHeads, self).__init__() + self.predictions = LayoutLMv2PredictionHead(hidden_size, vocab_size, activation, embedding_weights) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMForPretraining with XLM->LMv2 +class LayoutLMv2ForPretraining(LayoutLMv2PretrainedModel): + def __init__(self, config): + 
super(LayoutLMv2ForPretraining, self).__init__(config) + self.layoutlmv2 = LayoutLMv2Model(config) + self.cls = LayoutLMv2PretrainingHeads( + self.layoutlmv2.config.hidden_size, + self.layoutlmv2.config.vocab_size, + self.layoutlmv2.config.hidden_act, + embedding_weights=self.layoutlmv2.embeddings.word_embeddings.weight, + ) + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutlmv2.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + masked_positions=None, + ): + outputs = self.layoutlmv2( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions) + return prediction_scores + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMOutput with XLM->LMv2 +class BiaffineAttention(nn.Layer): + """Implements a biaffine attention operator for binary relation classification.""" + + def __init__(self, in_features, out_features): + super(BiaffineAttention, self).__init__() + + self.in_features = in_features + self.out_features = out_features + + self.bilinear = nn.Bilinear(in_features, in_features, out_features, bias_attr=False) + self.linear = nn.Linear(2 * in_features, out_features) + + def forward(self, x_1, x_2): + return self.bilinear(x_1, x_2) + self.linear(paddle.concat((x_1, x_2), axis=-1)) + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.REDecoder +class REDecoder(nn.Layer): + def __init__(self, hidden_size=768, hidden_dropout_prob=0.1): + super(REDecoder, self).__init__() + self.entity_emb = nn.Embedding(3, hidden_size) + projection = nn.Sequential( + nn.Linear(hidden_size * 2, hidden_size), + nn.ReLU(), + nn.Dropout(hidden_dropout_prob), + nn.Linear(hidden_size, hidden_size // 2), + nn.ReLU(), + nn.Dropout(hidden_dropout_prob), + ) + self.ffnn_head = copy.deepcopy(projection) + self.ffnn_tail = copy.deepcopy(projection) + self.rel_classifier = BiaffineAttention(hidden_size // 2, 2) + self.loss_fct = CrossEntropyLoss() + + def build_relation(self, relations, entities): + batch_size = len(relations) + new_relations = [] + for b in range(batch_size): + if len(entities[b]["start"]) <= 2: + entities[b] = {"end": [1, 1], "label": [0, 0], "start": [0, 0]} + all_possible_relations = set( + [ + (i, j) + for i in range(len(entities[b]["label"])) + for j in range(len(entities[b]["label"])) + if entities[b]["label"][i] == 1 and entities[b]["label"][j] == 2 + ] + ) + if len(all_possible_relations) == 0: + all_possible_relations = {(0, 1)} + if "head" in relations[b]: + positive_relations = set(list(zip(relations[b]["head"], relations[b]["tail"]))) + else: + positive_relations = set() + negative_relations = all_possible_relations - positive_relations + positive_relations = set([i for i in positive_relations if i in all_possible_relations]) + reordered_relations = list(positive_relations) + 
list(negative_relations) + relation_per_doc = { + "head": [i[0] for i in reordered_relations], + "tail": [i[1] for i in reordered_relations], + "label": [1] * len(positive_relations) + [0] * (len(reordered_relations) - len(positive_relations)), + } + assert len(relation_per_doc["head"]) != 0 + new_relations.append(relation_per_doc) + return new_relations, entities + + def get_predicted_relations(self, logits, relations, entities): + pred_relations = [] + for i, pred_label in enumerate(logits.argmax(-1)): + if pred_label != 1: + continue + rel = {} + rel["head_id"] = relations["head"][i] + rel["head"] = (entities["start"][rel["head_id"]], entities["end"][rel["head_id"]]) + rel["head_type"] = entities["label"][rel["head_id"]] + + rel["tail_id"] = relations["tail"][i] + rel["tail"] = (entities["start"][rel["tail_id"]], entities["end"][rel["tail_id"]]) + rel["tail_type"] = entities["label"][rel["tail_id"]] + rel["type"] = 1 + pred_relations.append(rel) + return pred_relations + + def forward(self, hidden_states, entities, relations): + batch_size, max_n_words, context_dim = hidden_states.shape + relations, entities = self.build_relation(relations, entities) + loss = 0 + all_pred_relations = [] + for b in range(batch_size): + if "head" not in relations[b]: + continue + head_entities = paddle.to_tensor(relations[b]["head"]) + tail_entities = paddle.to_tensor(relations[b]["tail"]) + relation_labels = paddle.to_tensor(relations[b]["label"]) + entities_start_index = paddle.to_tensor(entities[b]["start"]) + entities_labels = paddle.to_tensor(entities[b]["label"]) + head_index = entities_start_index[head_entities] + head_label = entities_labels[head_entities] + head_label_repr = self.entity_emb(head_label) + + tail_index = entities_start_index[tail_entities] + tail_label = entities_labels[tail_entities] + tail_label_repr = self.entity_emb(tail_label) + + tmp_hidden_states = hidden_states[b][head_index] + if len(tmp_hidden_states.shape) == 1: + tmp_hidden_states = paddle.unsqueeze(tmp_hidden_states, axis=0) + head_repr = paddle.concat((tmp_hidden_states, head_label_repr), axis=-1) + + tmp_hidden_states = hidden_states[b][tail_index] + if len(tmp_hidden_states.shape) == 1: + tmp_hidden_states = paddle.unsqueeze(tmp_hidden_states, axis=0) + tail_repr = paddle.concat((tmp_hidden_states, tail_label_repr), axis=-1) + + heads = self.ffnn_head(head_repr) + tails = self.ffnn_tail(tail_repr) + logits = self.rel_classifier(heads, tails) + loss += self.loss_fct(logits, relation_labels) + pred_relations = self.get_predicted_relations(logits, relations[b], entities[b]) + all_pred_relations.append(pred_relations) + return loss, all_pred_relations + + +# Copied from paddlenlp.transformers.layoutxlm.modeling.LayoutXLMForRelationExtraction with XLM->LMv2 +class LayoutLMv2ForRelationExtraction(LayoutLMv2PretrainedModel): + def __init__(self, config): + super(LayoutLMv2ForRelationExtraction, self).__init__(config) + self.layoutlmv2 = LayoutLMv2Model(config) + + self.extractor = REDecoder(config.hidden_size, config.hidden_dropout_prob) + + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + + def _init_weights(self, layer): + """Initialize the weights""" + if isinstance(layer, nn.Linear): + layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight.shape)) + if layer.bias is not None: + layer.bias.set_value(paddle.tensor.zeros(shape=layer.bias.shape)) + elif isinstance(layer, nn.Embedding): + 
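+ # Embedding weights use the same normal(mean=0.0, std=0.02) initialization;
+ # the padding row, if any, is re-initialized separately below.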
layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight.shape)) + if layer._padding_idx is not None: + layer.weight[layer._padding_idx].set_value( + paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight[layer._padding_idx].shape) + ) + elif isinstance(layer, nn.LayerNorm): + layer.weight.set_value(paddle.tensor.ones(shape=layer.bias.shape)) + layer.bias.set_value(paddle.tensor.zeros(shape=layer.bias.shape)) + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config.max_position_embeddings`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutlmv2.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids, + bbox, + labels=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + entities=None, + relations=None, + ): + outputs = self.layoutlmv2( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + + seq_length = input_ids.shape[1] + sequence_output, _ = outputs[0][:, :seq_length], outputs[0][:, seq_length:] + sequence_output = self.dropout(sequence_output) + loss, pred_relations = self.extractor(sequence_output, entities, relations) + + return dict( + loss=loss, + entities=entities, + relations=relations, + pred_relations=pred_relations, + hidden_states=outputs[0], + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/tokenizer.py new file mode 100644 index 000000000..02a0b761d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutlmv2/tokenizer.py @@ -0,0 +1,49 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for LayoutLMv2 model.""" + +from ..bert.tokenizer import BertTokenizer + +__all__ = ["LayoutLMv2Tokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "layoutlmv2-base-uncased": 512, + "layoutlmv2-large-uncased": 512, + "layoutlmv2-wo-backbone-base-uncased": 512, +} + + +class LayoutLMv2Tokenizer(BertTokenizer): + """ + The usage of LayoutLMv2Tokenizer is the same as + `BertTokenizer `__. + For more information regarding those methods, please refer to this superclass. 
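+
+ Example:
+ A small illustrative sketch; the checkpoint name is one of the entries
+ registered in `pretrained_resource_files_map` below, and the tokenizer is
+ assumed to be exported at the `paddlenlp.transformers` level. The returned
+ dict follows the `BertTokenizer` output format:
+
+ ```python
+ >>> from paddlenlp.transformers import LayoutLMv2Tokenizer
+ >>> tokenizer = LayoutLMv2Tokenizer.from_pretrained("layoutlmv2-base-uncased")
+ >>> encoded = tokenizer("Welcome to use PaddleNLP!")
+ >>> # encoded["input_ids"], encoded["token_type_ids"]
+ ```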
+ """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "layoutlmv2-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlmv2/layoutlmv2-base-uncased/vocab.txt", + "layoutlmv2-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlmv2/layoutlmv2-large-uncased/vocab.txt", + "layoutlmv2-wo-backbone-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutlmv2/layoutlmv2-base-uncased/vocab.txt", + } + } + pretrained_init_configuration = { + "layoutlmv2-base-uncased": {"do_lower_case": True}, + "layoutlmv2-large-uncased": {"do_lower_case": True}, + "layoutlmv2-wo-backbone-base-uncased": {"do_lower_case": True}, + } + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/configuration.py new file mode 100644 index 000000000..cff50737d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/configuration.py @@ -0,0 +1,246 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" LayoutXLM model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["LAYOUTXLM_PRETRAINED_INIT_CONFIGURATION", "LayoutXLMConfig", "LAYOUTXLM_PRETRAINED_RESOURCE_FILES_MAP"] + +LAYOUTXLM_PRETRAINED_INIT_CONFIGURATION = { + "layoutxlm-base-uncased": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "coordinate_size": 128, + "eos_token_id": 2, + "fast_qkv": False, + "gradient_checkpointing": False, + "has_relative_attention_bias": False, + "has_spatial_attention_bias": False, + "has_visual_segment_embedding": True, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "image_feature_pool_shape": [7, 7, 256], + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 514, + "max_rel_2d_pos": 256, + "max_rel_pos": 128, + "model_type": "layoutlmv2", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": True, + "pad_token_id": 1, + "shape_size": 128, + "rel_2d_pos_bins": 64, + "rel_pos_bins": 32, + "type_vocab_size": 1, + "vocab_size": 250002, + }, + "vi-layoutxlm-base-uncased": { + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 0, + "coordinate_size": 128, + "eos_token_id": 2, + "fast_qkv": False, + "gradient_checkpointing": False, + "has_relative_attention_bias": False, + "has_spatial_attention_bias": False, + "has_visual_segment_embedding": True, + "use_visual_backbone": False, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "image_feature_pool_shape": [7, 7, 256], + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-05, + "max_2d_position_embeddings": 1024, + "max_position_embeddings": 514, + "max_rel_2d_pos": 256, + "max_rel_pos": 128, + "model_type": "layoutlmv2", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": True, + "pad_token_id": 1, + "shape_size": 128, + "rel_2d_pos_bins": 64, + "rel_pos_bins": 32, + "type_vocab_size": 1, + "vocab_size": 250002, + }, +} + +LAYOUTXLM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "layoutxlm-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutxlm_base/model_state.pdparams", + "vi-layoutxlm-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/vi-layoutxlm-base-uncased/model_state.pdparams", + } +} + + +class LayoutXLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LayoutXLMtModel`]. It is used to instantiate a + LayoutXLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the LayoutXLM. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the SqueezeBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`SqueezeBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. 
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 514):
+ The maximum sequence length that this model might ever be used with.
+ max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
+ The maximum value (exclusive) of the 2D bounding-box coordinates fed to the spatial embeddings.
+ coordinate_size (`int`, *optional*, defaults to 128):
+ Dimensionality of the x/y coordinate embeddings.
+ image_feature_pool_shape (`List[int]`, *optional*, defaults to `[7, 7, 256]`):
+ Output shape (height, width, channels) of the pooled visual feature map.
+ has_relative_attention_bias (`bool`, *optional*, defaults to `False`):
+ Whether to add a 1D relative position bias to the self-attention scores.
+ has_spatial_attention_bias (`bool`, *optional*, defaults to `False`):
+ Whether to add a 2D spatial relative position bias to the self-attention scores.
+ use_visual_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to build the ResNet-FPN visual backbone; when `False` the model uses text and layout features only.
+ type_vocab_size (`int`, *optional*, defaults to 1):
+ The vocabulary size of the `token_type_ids` passed when calling [`LayoutXLMModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the layer normalization layers.
+ pad_token_id (`int`, *optional*, defaults to 1):
+ The ID of the token in the word embedding to use as padding.
+ with_pool (`str`, *optional*, defaults to `"tanh"`):
+ The pooler activation; `"tanh"` applies a Tanh on top of the dense pooler layer.
+
+ Examples:
+
+ ```python
+ >>> from paddlenlp.transformers import LayoutXLMConfig, LayoutXLMModel
+
+ >>> # Initializing a LayoutXLM configuration
+ >>> configuration = LayoutXLMConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration above
+ >>> model = LayoutXLMModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+
+ Attributes: pretrained_init_configuration (Dict[str, dict]): A dictionary containing all the available pre-trained
+ checkpoint configurations.
+ """ + pretrained_init_configuration = LAYOUTXLM_PRETRAINED_INIT_CONFIGURATION + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + model_type = "layoutxlm" + + def __init__( + self, + attention_probs_dropout_prob=0.1, + bos_token_id=0, + coordinate_size=128, + eos_token_id=2, + fast_qkv=False, + gradient_checkpointing=False, + has_relative_attention_bias=False, + has_spatial_attention_bias=False, + has_visual_segment_embedding=True, + hidden_act="gelu", + hidden_dropout_prob=0.1, + hidden_size=768, + image_feature_pool_shape=[7, 7, 256], + initializer_range=0.02, + intermediate_size=3072, + layer_norm_eps=1e-05, + max_2d_position_embeddings=1024, + max_position_embeddings=514, + max_rel_2d_pos=256, + max_rel_pos=128, + model_type="layoutlmv2", + num_attention_heads=12, + num_hidden_layers=12, + output_past=True, + pad_token_id=1, + shape_size=128, + rel_2d_pos_bins=64, + rel_pos_bins=32, + type_vocab_size=1, + vocab_size=250002, + with_pool="tanh", + use_visual_backbone=False, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.max_2d_position_embeddings = max_2d_position_embeddings + self.max_rel_pos = max_rel_pos + self.max_rel_2d_pos = max_rel_2d_pos + self.rel_pos_bins = rel_pos_bins + self.rel_2d_pos_bins = rel_2d_pos_bins + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.coordinate_size = coordinate_size + self.shape_size = shape_size + self.image_feature_pool_shape = image_feature_pool_shape + self.fast_qkv = fast_qkv + self.has_relative_attention_bias = has_relative_attention_bias + self.has_spatial_attention_bias = has_spatial_attention_bias + self.has_visual_segment_embedding = has_visual_segment_embedding + self.output_past = output_past + self.gradient_checkpointing = gradient_checkpointing + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.model_type = model_type + self.with_pool = with_pool + self.use_visual_backbone = use_visual_backbone diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/modeling.py new file mode 100644 index 000000000..67a9881ee --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/modeling.py @@ -0,0 +1,1411 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Modeling classes for LayoutXLM model."""
+
+import copy
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import CrossEntropyLoss, Layer
+
+from paddlenlp.utils.log import logger
+
+from .. import PretrainedModel, register_base_model
+from .configuration import (
+    LAYOUTXLM_PRETRAINED_INIT_CONFIGURATION,
+    LAYOUTXLM_PRETRAINED_RESOURCE_FILES_MAP,
+    LayoutXLMConfig,
+)
+from .visual_backbone import build_resnet_fpn_backbone, read_config
+
+__all__ = [
+    "LayoutXLMModel",
+    "LayoutXLMPretrainedModel",
+    "LayoutXLMForTokenClassification",
+    "LayoutXLMForSequenceClassification",
+    "LayoutXLMForPretraining",
+    "LayoutXLMForRelationExtraction",
+    "LayoutXLMForQuestionAnswering",
+]
+
+
+def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
+    ret = 0
+    if bidirectional:
+        num_buckets //= 2
+        ret += (relative_position > 0).astype(paddle.int64) * num_buckets
+        n = paddle.abs(relative_position)
+    else:
+        n = paddle.maximum(-relative_position, paddle.zeros_like(relative_position))
+    # Now n is in the range [0, inf)
+    # half of the buckets are for exact increments in positions
+    max_exact = num_buckets // 2
+    is_small = n < max_exact
+
+    # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+    val_if_large = max_exact + (
+        paddle.log(n.astype(paddle.float32) / max_exact)
+        / math.log(max_distance / max_exact)
+        * (num_buckets - max_exact)
+    ).astype(paddle.int64)
+
+    val_if_large = paddle.minimum(val_if_large, paddle.full_like(val_if_large, num_buckets - 1))
+
+    ret += paddle.where(is_small, n, val_if_large)
+    return ret
+
+
+def token_featue_to_sequence_feature(input_ids, seq_length, sequence_output):
+    """
+    Transform token-level features into sequence-level features by averaging
+    all the token features that belong to each segment.
+    """
+    batches = input_ids.shape[0]
+    for batch_id in range(batches):
+        start_idx = -1
+        for i in range(0, seq_length):
+            if input_ids[batch_id, i] == 6:
+                if start_idx > -1:
+                    feature_block = sequence_output[batch_id, start_idx + 1 : i]
+                    sequence_output[batch_id, start_idx] = paddle.mean(feature_block, axis=0)
+                start_idx = i
+
+            if input_ids[batch_id, i] == 1:
+                feature_block = sequence_output[batch_id, start_idx + 1 : i]
+                sequence_output[batch_id, start_idx] = paddle.mean(feature_block, axis=0)
+                break
+
+            if i == seq_length - 1:
+                sequence_output[batch_id, start_idx] = paddle.mean(feature_block, axis=0)
+    return
+
+
+class LayoutXLMPooler(Layer):
+    def __init__(self, config: LayoutXLMConfig):
+        super(LayoutXLMPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.with_pool = config.with_pool
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
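+        # The dense projection below is always applied; the tanh activation is only
+        # used when config.with_pool == "tanh".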
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + if self.with_pool == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class LayoutXLMEmbeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def _cal_spatial_position_embeddings(self, bbox): + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox`coordinate values should be within 0-1000 range.") from e + + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0]) + + spatial_position_embeddings = paddle.concat( + [ + left_position_embeddings, + upper_position_embeddings, + right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ], + axis=-1, + ) + return spatial_position_embeddings + + def forward(self, input_ids, bbox=None, token_type_ids=None, position_ids=None): + if position_ids is None: + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=-1) + + position_ids = seq_length - ones + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + input_embedings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + + try: + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox`coordinate values should be within 0-1000 range.") from e + h_position_embeddings = self.h_position_embeddings(bbox[:, :, 3] - bbox[:, :, 1]) + w_position_embeddings = self.w_position_embeddings(bbox[:, :, 2] - bbox[:, :, 0]) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = ( + input_embedings + + position_embeddings + + left_position_embeddings + + upper_position_embeddings + + 
right_position_embeddings
+            + lower_position_embeddings
+            + h_position_embeddings
+            + w_position_embeddings
+            + token_type_embeddings
+        )
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class LayoutXLMPretrainedModel(PretrainedModel):
+
+    config_class = LayoutXLMConfig
+    pretrained_init_configuration = LAYOUTXLM_PRETRAINED_INIT_CONFIGURATION
+    pretrained_resource_files_map = LAYOUTXLM_PRETRAINED_RESOURCE_FILES_MAP
+    base_model_prefix = "layoutxlm"
+
+    def _init_weights(self, layer):
+        """Initialization hook"""
+        if isinstance(layer, (nn.Linear, nn.Embedding)):
+            if isinstance(layer.weight, paddle.Tensor):
+                layer.weight.set_value(
+                    paddle.tensor.normal(
+                        mean=0.0,
+                        std=self.pretrained_init_configuration["initializer_range"]
+                        if "initializer_range" in self.pretrained_init_configuration
+                        else 0.02,
+                        shape=layer.weight.shape,
+                    )
+                )
+
+
+class LayoutXLMSelfOutput(nn.Layer):
+    def __init__(self, config: LayoutXLMConfig):
+        super(LayoutXLMSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor.astype(hidden_states.dtype))
+        return hidden_states
+
+
+class LayoutXLMSelfAttention(nn.Layer):
+    def __init__(self, config: LayoutXLMConfig):
+        super(LayoutXLMSelfAttention, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                "The hidden size {} is not a multiple of the number of attention "
+                "heads {}".format(config.hidden_size, config.num_attention_heads)
+            )
+        self.fast_qkv = config.fast_qkv
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+        if config.fast_qkv:
+            self.qkv_linear = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias_attr=False)
+            self.q_bias = self.create_parameter(
+                shape=[1, 1, self.all_head_size], default_initializer=nn.initializer.Constant(0.0)
+            )
+            self.v_bias = self.create_parameter(
+                shape=[1, 1, self.all_head_size], default_initializer=nn.initializer.Constant(0.0)
+            )
+        else:
+            self.query = nn.Linear(config.hidden_size, self.all_head_size)
+            self.key = nn.Linear(config.hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = list(x.shape[:-1]) + [self.num_attention_heads, self.attention_head_size]
+
+        x = x.reshape(new_x_shape)
+        return x.transpose([0, 2, 1, 3])
+
+    def compute_qkv(self, hidden_states):
+        if self.fast_qkv:
+            qkv = self.qkv_linear(hidden_states)
+            q, k, v = paddle.chunk(qkv, 3, axis=-1)
+            if q.ndimension() == self.q_bias.ndimension():
+                q = q + self.q_bias
+                v = v + self.v_bias
+            else:
+                _sz = (1,) * (q.ndimension() - 1) + (-1,)
+                q = q + self.q_bias.reshape(_sz)
+                v = v + self.v_bias.reshape(_sz)
+        else:
+            q = self.query(hidden_states)
+            k =
self.key(hidden_states) + v = self.value(hidden_states) + return q, k, v + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + q, k, v = self.compute_qkv(hidden_states) + + # (B, L, H*D) -> (B, H, L, D) + query_layer = self.transpose_for_scores(q) + key_layer = self.transpose_for_scores(k) + value_layer = self.transpose_for_scores(v) + + query_layer = query_layer / math.sqrt(self.attention_head_size) + # [BSZ, NAT, L, L] + attention_scores = paddle.matmul(query_layer, key_layer.transpose([0, 1, 3, 2])) + if self.has_relative_attention_bias: + attention_scores += rel_pos + if self.has_spatial_attention_bias: + attention_scores += rel_2d_pos + bool_attention_mask = attention_mask.astype(paddle.bool) + bool_attention_mask.stop_gradient = True + attention_scores_shape = attention_scores.shape + attention_scores = paddle.where( + bool_attention_mask.expand(attention_scores_shape), + paddle.ones(attention_scores_shape) * float("-1e10"), + attention_scores, + ) + attention_probs = F.softmax(attention_scores, axis=-1) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = list(context_layer.shape[:-2]) + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + if output_attentions: + outputs = [context_layer, attention_probs] + else: + outputs = [context_layer] + return outputs + + +class LayoutXLMAttention(nn.Layer): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMAttention, self).__init__() + self.self = LayoutXLMSelfAttention(config) + self.output = LayoutXLMSelfOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self.output(self_outputs[0], hidden_states) + # add attentions if we output them + if output_attentions: + outputs = [ + attention_output, + ] + self_outputs[1:] + else: + outputs = [attention_output] + return outputs + + +class LayoutXLMEncoder(nn.Layer): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMEncoder, self).__init__() + self.config = config + self.layer = nn.LayerList([LayoutXLMLayer(config) for _ in range(config.num_hidden_layers)]) + + self.has_relative_attention_bias = config.has_relative_attention_bias + self.has_spatial_attention_bias = config.has_spatial_attention_bias + + if self.has_relative_attention_bias: + self.rel_pos_bins = config.rel_pos_bins + self.max_rel_pos = config.max_rel_pos + self.rel_pos_onehot_size = config.rel_pos_bins + self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias_attr=False) + + if self.has_spatial_attention_bias: + self.max_rel_2d_pos = config.max_rel_2d_pos + self.rel_2d_pos_bins = config.rel_2d_pos_bins + self.rel_2d_pos_onehot_size = 
config.rel_2d_pos_bins + self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias_attr=False) + self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias_attr=False) + + def _cal_1d_pos_emb(self, hidden_states, position_ids): + rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1) + rel_pos = relative_position_bucket( + rel_pos_mat, + num_buckets=self.rel_pos_bins, + max_distance=self.max_rel_pos, + ) + rel_pos = paddle.nn.functional.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).astype( + hidden_states.dtype + ) + rel_pos = self.rel_pos_bias(rel_pos).transpose([0, 3, 1, 2]) + return rel_pos + + def _cal_2d_pos_emb(self, hidden_states, bbox): + position_coord_x = bbox[:, :, 0] + position_coord_y = bbox[:, :, 3] + rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1) + rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1) + rel_pos_x = relative_position_bucket( + rel_pos_x_2d_mat, + num_buckets=self.rel_2d_pos_bins, + max_distance=self.max_rel_2d_pos, + ) + rel_pos_y = relative_position_bucket( + rel_pos_y_2d_mat, + num_buckets=self.rel_2d_pos_bins, + max_distance=self.max_rel_2d_pos, + ) + rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).astype(hidden_states.dtype) + rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).astype(hidden_states.dtype) + rel_pos_x = self.rel_pos_x_bias(rel_pos_x).transpose([0, 3, 1, 2]) + rel_pos_y = self.rel_pos_y_bias(rel_pos_y).transpose([0, 3, 1, 2]) + rel_2d_pos = rel_pos_x + rel_pos_y + return rel_2d_pos + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=False, + bbox=None, + position_ids=None, + ): + all_hidden_states = () if output_hidden_states else None + + rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids) if self.has_relative_attention_bias else None + rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None + + hidden_save = dict() + hidden_save["input_hidden_states"] = hidden_states + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + # gradient_checkpointing is set as False here so we remove some codes here + hidden_save["input_attention_mask"] = attention_mask + hidden_save["input_layer_head_mask"] = layer_head_mask + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + + hidden_states = layer_outputs[0] + + hidden_save["{}_data".format(i)] = hidden_states + + return hidden_states, hidden_save + + +class LayoutXLMIntermediate(nn.Layer): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if config.hidden_act == "gelu": + self.intermediate_act_fn = nn.GELU() + else: + assert False, "hidden_act is set as: {}, please check it..".format(config.hidden_act) + + def forward(self, hidden_states): + hidden_states = 
self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class LayoutXLMOutput(nn.Layer): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class LayoutXLMLayer(nn.Layer): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMLayer, self).__init__() + # since chunk_size_feed_forward is 0 as default, no chunk is needed here. + self.seq_len_dim = 1 + self.attention = LayoutXLMAttention(config) + self.add_cross_attention = False # default as false + self.intermediate = LayoutXLMIntermediate(config) + self.output = LayoutXLMOutput(config) + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self_attention_outputs[0] + layer_output = self.feed_forward_chunk(attention_output) + + if output_attentions: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + outputs = [ + layer_output, + ] + list(outputs) + else: + outputs = [layer_output] + return outputs + + +class VisualBackbone(nn.Layer): + def __init__(self, config: LayoutXLMConfig): + super(VisualBackbone, self).__init__() + self.cfg = read_config() + self.backbone = build_resnet_fpn_backbone(self.cfg) + + assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD) + num_channels = len(self.cfg.MODEL.PIXEL_MEAN) + self.register_buffer("pixel_mean", paddle.to_tensor(self.cfg.MODEL.PIXEL_MEAN).reshape([num_channels, 1, 1])) + self.register_buffer("pixel_std", paddle.to_tensor(self.cfg.MODEL.PIXEL_STD).reshape([num_channels, 1, 1])) + self.out_feature_key = "p2" + # is_deterministic is disabled here. 
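+        # Adaptive average pooling maps the FPN "p2" feature map to a fixed
+        # image_feature_pool_shape[0] x image_feature_pool_shape[1] grid, so every image
+        # contributes the same number of visual tokens regardless of its resolution.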
+ self.pool = nn.AdaptiveAvgPool2D(config.image_feature_pool_shape[:2]) + if len(config.image_feature_pool_shape) == 2: + config.image_feature_pool_shape.append(self.backbone.output_shape()[self.out_feature_key].channels) + assert self.backbone.output_shape()[self.out_feature_key].channels == config.image_feature_pool_shape[2] + + def forward(self, images): + images_input = (paddle.to_tensor(images) - self.pixel_mean) / self.pixel_std + features = self.backbone(images_input) + features = features[self.out_feature_key] + features = self.pool(features).flatten(start_axis=2).transpose([0, 2, 1]) + return features + + +@register_base_model +class LayoutXLMModel(LayoutXLMPretrainedModel): + """ + The bare LayoutXLM Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (`int`): + Vocabulary size of the XLNet model. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling XLNetModel. + hidden_size (`int`, optional): + Dimensionality of the encoder layers and the pooler layer. Defaults to ``768``. + num_hidden_layers (`int`, optional): + Number of hidden layers in the Transformer encoder. Defaults to ``12``. + num_attention_heads (`int`, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to ``12``. + intermediate_size (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + Defaults to ``3072``. + hidden_act (`str`, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + hidden_dropout_prob (`float`, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to ``0.1``. + attention_probs_dropout_prob (`float`, optional): + The dropout probability for all fully connected layers in the pooler. + Defaults to ``0.1``. + initializer_range (`float`, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Defaults to ``0.02``. 
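+
+    Examples:
+        A minimal usage sketch (illustrative only; the checkpoint name, the tokenizer call and the
+        dummy tensor shapes below are assumptions, and ``image`` is omitted because
+        ``use_visual_backbone`` defaults to ``False`` in the configuration):
+
+        .. code-block:: python
+
+            import paddle
+            from paddlenlp.transformers import LayoutXLMModel, LayoutXLMTokenizer
+
+            tokenizer = LayoutXLMTokenizer.from_pretrained("layoutxlm-base-uncased")
+            model = LayoutXLMModel.from_pretrained("layoutxlm-base-uncased")
+
+            inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+            input_ids = paddle.to_tensor([inputs["input_ids"]])
+            # one dummy bounding box per token, coordinates normalized to the [0, 1000) range
+            bbox = paddle.zeros([1, input_ids.shape[1], 4], dtype="int64")
+
+            sequence_output, pooled_output, _ = model(input_ids=input_ids, bbox=bbox)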
+ """ + + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMModel, self).__init__(config) + self.config = config + self.use_visual_backbone = config.use_visual_backbone + self.has_visual_segment_embedding = config.has_visual_segment_embedding + self.embeddings = LayoutXLMEmbeddings(config) + + if self.use_visual_backbone is True: + self.visual = VisualBackbone(config) + self.visual.stop_gradient = True + self.visual_proj = nn.Linear(config.image_feature_pool_shape[-1], config.hidden_size) + + if self.has_visual_segment_embedding: + self.visual_segment_embedding = self.create_parameter( + shape=[ + config.hidden_size, + ], + dtype=paddle.float32, + ) + self.visual_LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.visual_dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = LayoutXLMEncoder(config) + self.pooler = LayoutXLMPooler(config) + + def _calc_text_embeddings(self, input_ids, bbox, position_ids, token_type_ids): + words_embeddings = self.embeddings.word_embeddings(input_ids) + position_embeddings = self.embeddings.position_embeddings(position_ids) + spatial_position_embeddings = self.embeddings._cal_spatial_position_embeddings(bbox) + token_type_embeddings = self.embeddings.token_type_embeddings(token_type_ids) + embeddings = words_embeddings + position_embeddings + spatial_position_embeddings + token_type_embeddings + embeddings = self.embeddings.LayerNorm(embeddings) + embeddings = self.embeddings.dropout(embeddings) + return embeddings + + def _calc_visual_bbox(self, image_feature_pool_shape, bbox, visual_shape): + visual_bbox_x = ( + paddle.arange( + 0, + 1000 * (image_feature_pool_shape[1] + 1), + 1000, + dtype=bbox.dtype, + ) + // image_feature_pool_shape[1] + ) + visual_bbox_y = ( + paddle.arange( + 0, + 1000 * (image_feature_pool_shape[0] + 1), + 1000, + dtype=bbox.dtype, + ) + // image_feature_pool_shape[0] + ) + + expand_shape = image_feature_pool_shape[0:2] + visual_bbox = paddle.stack( + [ + visual_bbox_x[:-1].expand(expand_shape), + visual_bbox_y[:-1].expand(expand_shape[::-1]).transpose([1, 0]), + visual_bbox_x[1:].expand(expand_shape), + visual_bbox_y[1:].expand(expand_shape[::-1]).transpose([1, 0]), + ], + axis=-1, + ).reshape([expand_shape[0] * expand_shape[1], bbox.shape[-1]]) + + visual_bbox = visual_bbox.expand([visual_shape[0], visual_bbox.shape[0], visual_bbox.shape[1]]) + return visual_bbox + + def _calc_img_embeddings(self, image, bbox, position_ids): + use_image_info = self.use_visual_backbone and image is not None + position_embeddings = self.embeddings.position_embeddings(position_ids) + spatial_position_embeddings = self.embeddings._cal_spatial_position_embeddings(bbox) + if use_image_info is True: + visual_embeddings = self.visual_proj(self.visual(image.astype(paddle.float32))) + embeddings = visual_embeddings + position_embeddings + spatial_position_embeddings + else: + embeddings = position_embeddings + spatial_position_embeddings + + if self.has_visual_segment_embedding: + embeddings += self.visual_segment_embedding + embeddings = self.visual_LayerNorm(embeddings) + embeddings = self.visual_dropout(embeddings) + return embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. 
If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + num_position_embeds_diff = new_num_position_embeddings - self.config.max_position_embeddings + + # no resizing needs to be done if the length stays the same + if num_position_embeds_diff == 0: + return + + logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...") + self.config.max_position_embeddings = new_num_position_embeddings + + old_position_embeddings_weight = self.embeddings.position_embeddings.weight + + self.embeddings.position_embeddings = nn.Embedding( + self.config.max_position_embeddings, self.config.hidden_size + ) + + with paddle.no_grad(): + if num_position_embeds_diff > 0: + self.embeddings.position_embeddings.weight[:-num_position_embeds_diff] = old_position_embeddings_weight + else: + self.embeddings.position_embeddings.weight = old_position_embeddings_weight[:num_position_embeds_diff] + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + head_mask=None, + output_hidden_states=False, + output_attentions=False, + ): + input_shape = input_ids.shape + visual_shape = list(input_shape) + visual_shape[1] = self.config.image_feature_pool_shape[0] * self.config.image_feature_pool_shape[1] + visual_bbox = self._calc_visual_bbox(self.config.image_feature_pool_shape, bbox, visual_shape) + + final_bbox = paddle.concat([bbox, visual_bbox], axis=1) + if attention_mask is None: + attention_mask = paddle.ones(input_shape) + + if self.use_visual_backbone is True: + visual_attention_mask = paddle.ones(visual_shape) + else: + visual_attention_mask = paddle.zeros(visual_shape) + + attention_mask = attention_mask.astype(visual_attention_mask.dtype) + + final_attention_mask = paddle.concat([attention_mask, visual_attention_mask], axis=1) + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + if position_ids is None: + seq_length = input_shape[1] + position_ids = self.embeddings.position_ids[:, :seq_length] + position_ids = position_ids.expand(input_shape) + + visual_position_ids = paddle.arange(0, visual_shape[1]).expand([input_shape[0], visual_shape[1]]) + final_position_ids = paddle.concat([position_ids, visual_position_ids], axis=1) + + if bbox is None: + bbox = paddle.zeros(input_shape + [4]) + + text_layout_emb = self._calc_text_embeddings( + input_ids=input_ids, + bbox=bbox, + token_type_ids=token_type_ids, + position_ids=position_ids, + ) + + visual_emb = self._calc_img_embeddings( + image=image, + bbox=visual_bbox, + position_ids=visual_position_ids, + ) + final_emb = paddle.concat([text_layout_emb, visual_emb], axis=1) + + extended_attention_mask = final_attention_mask.unsqueeze(1).unsqueeze(2) + + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + final_emb, + extended_attention_mask, + bbox=final_bbox, + position_ids=final_position_ids, + head_mask=head_mask, + output_attentions=output_attentions, + 
output_hidden_states=output_hidden_states, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + return sequence_output, pooled_output, encoder_outputs[1] + + +class LayoutXLMForTokenClassification(LayoutXLMPretrainedModel): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMForTokenClassification, self).__init__(config) + self.num_classes = config.num_labels + self.layoutxlm = LayoutXLMModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + + def get_input_embeddings(self): + return self.layoutxlm.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutxlm.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None, + ): + outputs = self.layoutxlm( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + # sequence out and image out + sequence_output = outputs[0][:, :seq_length] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + hidden_states = { + f"hidden_states_{idx}": outputs[2][f"{idx}_data"] for idx in range(self.layoutxlm.config.num_hidden_layers) + } + if self.training: + outputs = (logits, hidden_states) + else: + outputs = (logits,) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + + if attention_mask is not None: + active_loss = ( + attention_mask.reshape( + [ + -1, + ] + ) + == 1 + ) + active_logits = logits.reshape([-1, self.num_classes])[active_loss] + active_labels = labels.reshape( + [ + -1, + ] + )[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct( + logits.reshape([-1, self.num_classes]), + labels.reshape( + [ + -1, + ] + ), + ) + + outputs = (loss,) + outputs + + return outputs + + +class LayoutXLMForSequenceClassification(LayoutXLMPretrainedModel): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMForSequenceClassification, self).__init__(config) + self.num_classes = config.num_labels + + self.layoutxlm = LayoutXLMModel(config) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size * 3, self.num_classes) + + def get_input_embeddings(self): + return self.layoutxlm.embeddings.word_embeddings + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
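+
+        Example (illustrative sketch; ``1024`` is an arbitrary target length):
+
+        .. code-block:: python
+
+            model.resize_position_embeddings(1024)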
+ """ + self.layoutxlm.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None, + ): + input_shape = input_ids.shape + visual_shape = list(input_shape) + visual_shape[1] = ( + self.layoutxlm.config.image_feature_pool_shape[0] * self.layoutxlm.config.image_feature_pool_shape[1] + ) + visual_bbox = self.layoutxlm._calc_visual_bbox( + self.layoutxlm.config.image_feature_pool_shape, bbox, visual_shape + ) + + visual_position_ids = paddle.arange(0, visual_shape[1]).expand([input_shape[0], visual_shape[1]]) + + initial_image_embeddings = self.layoutxlm._calc_img_embeddings( + image=image, + bbox=visual_bbox, + position_ids=visual_position_ids, + ) + + outputs = self.layoutxlm( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + # sequence out and image out + sequence_output, final_image_embeddings = outputs[0][:, :seq_length], outputs[0][:, seq_length:] + + cls_final_output = sequence_output[:, 0, :] + + # average-pool the visual embeddings + pooled_initial_image_embeddings = initial_image_embeddings.mean(axis=1) + pooled_final_image_embeddings = final_image_embeddings.mean(axis=1) + # concatenate with cls_final_output + sequence_output = paddle.concat( + [cls_final_output, pooled_initial_image_embeddings, pooled_final_image_embeddings], axis=1 + ) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + outputs = (logits,) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + + loss = loss_fct( + logits.reshape([-1, self.num_classes]), + labels.reshape( + [ + -1, + ] + ), + ) + + outputs = (loss,) + outputs + + return outputs + + +class LayoutXLMPredictionHead(Layer): + """ + Bert Model with a `language modeling` head on top for CLM fine-tuning. 
+ """ + + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(LayoutXLMPredictionHead, self).__init__() + self.transform = nn.Linear(hidden_size, hidden_size) + self.activation = getattr(nn.functional, activation) + self.layer_norm = nn.LayerNorm(hidden_size) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, is_bias=False) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +class LayoutXLMPretrainingHeads(Layer): + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(LayoutXLMPretrainingHeads, self).__init__() + self.predictions = LayoutXLMPredictionHead(hidden_size, vocab_size, activation, embedding_weights) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +class LayoutXLMForPretraining(LayoutXLMPretrainedModel): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMForPretraining, self).__init__(config) + self.layoutxlm = LayoutXLMModel(config) + self.cls = LayoutXLMPretrainingHeads( + config.hidden_size, + config.vocab_size, + config.hidden_act, + embedding_weights=self.layoutxlm.embeddings.word_embeddings.weight, + ) + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. 
+ """ + self.layoutxlm.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + masked_positions=None, + ): + outputs = self.layoutxlm( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions) + return prediction_scores + + +class BiaffineAttention(nn.Layer): + """Implements a biaffine attention operator for binary relation classification.""" + + def __init__(self, in_features, out_features): + super(BiaffineAttention, self).__init__() + + self.in_features = in_features + self.out_features = out_features + + self.bilinear = nn.Bilinear(in_features, in_features, out_features, bias_attr=False) + self.linear = nn.Linear(2 * in_features, out_features) + + def forward(self, x_1, x_2): + return self.bilinear(x_1, x_2) + self.linear(paddle.concat((x_1, x_2), axis=-1)) + + +class REDecoder(nn.Layer): + def __init__(self, hidden_size=768, hidden_dropout_prob=0.1): + super(REDecoder, self).__init__() + self.entity_emb = nn.Embedding(3, hidden_size) + projection = nn.Sequential( + nn.Linear(hidden_size * 2, hidden_size), + nn.ReLU(), + nn.Dropout(hidden_dropout_prob), + nn.Linear(hidden_size, hidden_size // 2), + nn.ReLU(), + nn.Dropout(hidden_dropout_prob), + ) + self.ffnn_head = copy.deepcopy(projection) + self.ffnn_tail = copy.deepcopy(projection) + self.rel_classifier = BiaffineAttention(hidden_size // 2, 2) + self.loss_fct = CrossEntropyLoss() + + def build_relation(self, relations, entities): + batch_size, max_seq_len = entities.shape[:2] + new_relations = paddle.full( + shape=[batch_size, max_seq_len * max_seq_len, 3], fill_value=-1, dtype=relations.dtype + ) + for b in range(batch_size): + if entities[b, 0, 0] <= 2: + entitie_new = paddle.full(shape=[512, 3], fill_value=-1, dtype=entities.dtype) + entitie_new[0, :] = 2 + entitie_new[1:3, 0] = 0 # start + entitie_new[1:3, 1] = 1 # end + entitie_new[1:3, 2] = 0 # label + entities[b] = entitie_new + entitie_label = entities[b, 1 : entities[b, 0, 2] + 1, 2] + all_possible_relations1 = paddle.arange(0, entities[b, 0, 2], dtype=entities.dtype) + all_possible_relations1 = all_possible_relations1[entitie_label == 1] + all_possible_relations2 = paddle.arange(0, entities[b, 0, 2], dtype=entities.dtype) + all_possible_relations2 = all_possible_relations2[entitie_label == 2] + + all_possible_relations = paddle.stack( + paddle.meshgrid(all_possible_relations1, all_possible_relations2), axis=2 + ).reshape([-1, 2]) + if len(all_possible_relations) == 0: + all_possible_relations = paddle.full_like(all_possible_relations, fill_value=-1, dtype=entities.dtype) + all_possible_relations[0, 0] = 0 + all_possible_relations[0, 1] = 1 + + relation_head = relations[b, 1 : relations[b, 0, 0] + 1, 0] + relation_tail = relations[b, 1 : relations[b, 0, 1] + 1, 1] + positive_relations = paddle.stack([relation_head, relation_tail], axis=1) + + all_possible_relations_repeat = all_possible_relations.unsqueeze(axis=1).tile( + [1, len(positive_relations), 1] + ) + positive_relations_repeat = positive_relations.unsqueeze(axis=0).tile([len(all_possible_relations), 1, 1]) + mask = paddle.all(all_possible_relations_repeat == positive_relations_repeat, axis=2) + negative_mask = paddle.any(mask, axis=1) is False + 
negative_relations = all_possible_relations[negative_mask] + + positive_mask = paddle.any(mask, axis=0) is True + positive_relations = positive_relations[positive_mask] + if negative_mask.sum() > 0: + reordered_relations = paddle.concat([positive_relations, negative_relations]) + else: + reordered_relations = positive_relations + + relation_per_doc_label = paddle.zeros([len(reordered_relations), 1], dtype=reordered_relations.dtype) + relation_per_doc_label[: len(positive_relations)] = 1 + relation_per_doc = paddle.concat([reordered_relations, relation_per_doc_label], axis=1) + assert len(relation_per_doc[:, 0]) != 0 + new_relations[b, 0] = relation_per_doc.shape[0].astype(new_relations.dtype) + new_relations[b, 1 : len(relation_per_doc) + 1] = relation_per_doc + # new_relations.append(relation_per_doc) + return new_relations, entities + + def get_predicted_relations(self, logits, relations, entities): + pred_relations = [] + for i, pred_label in enumerate(logits.argmax(-1)): + if pred_label != 1: + continue + rel = paddle.full(shape=[7, 2], fill_value=-1, dtype=relations.dtype) + rel[0, 0] = relations[:, 0][i] + rel[1, 0] = entities[:, 0][relations[:, 0][i] + 1] + rel[1, 1] = entities[:, 1][relations[:, 0][i] + 1] + rel[2, 0] = entities[:, 2][relations[:, 0][i] + 1] + rel[3, 0] = relations[:, 1][i] + rel[4, 0] = entities[:, 0][relations[:, 1][i] + 1] + rel[4, 1] = entities[:, 1][relations[:, 1][i] + 1] + rel[5, 0] = entities[:, 2][relations[:, 1][i] + 1] + rel[6, 0] = 1 + pred_relations.append(rel) + return pred_relations + + def forward(self, hidden_states, entities, relations): + batch_size, max_length, _ = entities.shape + relations, entities = self.build_relation(relations, entities) + loss = 0 + all_pred_relations = paddle.full( + shape=[batch_size, max_length * max_length, 7, 2], fill_value=-1, dtype=entities.dtype + ) + for b in range(batch_size): + relation = relations[b, 1 : relations[b, 0, 0] + 1] + head_entities = relation[:, 0] + tail_entities = relation[:, 1] + relation_labels = relation[:, 2] + entities_start_index = paddle.to_tensor(entities[b, 1 : entities[b, 0, 0] + 1, 0]) + entities_labels = paddle.to_tensor(entities[b, 1 : entities[b, 0, 2] + 1, 2]) + head_index = entities_start_index[head_entities] + head_label = entities_labels[head_entities] + head_label_repr = self.entity_emb(head_label) + + tail_index = entities_start_index[tail_entities] + tail_label = entities_labels[tail_entities] + tail_label_repr = self.entity_emb(tail_label) + + tmp_hidden_states = hidden_states[b][head_index] + if len(tmp_hidden_states.shape) == 1: + tmp_hidden_states = paddle.unsqueeze(tmp_hidden_states, axis=0) + head_repr = paddle.concat((tmp_hidden_states, head_label_repr), axis=-1) + + tmp_hidden_states = hidden_states[b][tail_index] + if len(tmp_hidden_states.shape) == 1: + tmp_hidden_states = paddle.unsqueeze(tmp_hidden_states, axis=0) + tail_repr = paddle.concat((tmp_hidden_states, tail_label_repr), axis=-1) + + heads = self.ffnn_head(head_repr) + tails = self.ffnn_tail(tail_repr) + logits = self.rel_classifier(heads, tails) + loss += self.loss_fct(logits, relation_labels) + pred_relations = self.get_predicted_relations(logits, relation, entities[b]) + if len(pred_relations) > 0: + pred_relations = paddle.stack(pred_relations) + all_pred_relations[b, 0, :, :] = pred_relations.shape[0].astype(all_pred_relations.dtype) + all_pred_relations[b, 1 : len(pred_relations) + 1, :, :] = pred_relations + return loss, all_pred_relations + + +class 
LayoutXLMForRelationExtraction(LayoutXLMPretrainedModel): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMForRelationExtraction, self).__init__(config) + + self.layoutxlm = LayoutXLMModel(config) + + self.extractor = REDecoder(config.hidden_size, config.hidden_dropout_prob) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def _init_weights(self, layer): + """Initialize the weights""" + if isinstance(layer, nn.Linear): + layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight.shape)) + if layer.bias is not None: + layer.bias.set_value(paddle.tensor.zeros(shape=layer.bias.shape)) + elif isinstance(layer, nn.Embedding): + layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight.shape)) + if layer._padding_idx is not None: + layer.weight[layer._padding_idx].set_value( + paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight[layer._padding_idx].shape) + ) + elif isinstance(layer, nn.LayerNorm): + layer.weight.set_value(paddle.tensor.ones(shape=layer.bias.shape)) + layer.bias.set_value(paddle.tensor.zeros(shape=layer.bias.shape)) + + def resize_position_embeddings(self, new_num_position_embeddings): + """ + Resizes position embeddings of the model if `new_num_position_embeddings != config["max_position_embeddings"]`. + + Arguments: + new_num_position_embeddings (`int`): + The number of new position embedding matrix. If position embeddings are learned, increasing the size + will add newly initialized vectors at the end, whereas reducing the size will remove vectors from the + end. + """ + self.layoutxlm.resize_position_embeddings(new_num_position_embeddings) + + def forward( + self, + input_ids, + bbox, + image=None, + attention_mask=None, + entities=None, + relations=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + labels=None, + ): + outputs = self.layoutxlm( + input_ids=input_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + sequence_output = outputs[0][:, :seq_length] + + sequence_output = self.dropout(sequence_output) + loss, pred_relations = self.extractor(sequence_output, entities, relations) + hidden_states = [outputs[2][f"{idx}_data"] for idx in range(self.layoutxlm.config.num_hidden_layers)] + hidden_states = paddle.stack(hidden_states, axis=1) + + res = dict(loss=loss, pred_relations=pred_relations, hidden_states=hidden_states) + return res + + +class LayoutXLMForQuestionAnswering(LayoutXLMPretrainedModel): + def __init__(self, config: LayoutXLMConfig): + super(LayoutXLMForQuestionAnswering, self).__init__(config) + self.num_classes = config.num_labels + self.layoutxlm = LayoutXLMModel(config) + self.has_visual_segment_embedding = config.has_visual_segment_embedding + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, self.num_classes) + + def get_input_embeddings(self): + return self.layoutxlm.embeddings.word_embeddings + + def forward( + self, + input_ids=None, + bbox=None, + image=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + start_positions=None, + end_positions=None, + ): + # In LayoutXLM the type vocab size is 1 + token_type_ids = paddle.zeros_like(input_ids) + + outputs = self.layoutxlm( + input_ids=input_ids, + token_type_ids=token_type_ids, + bbox=bbox, + image=image, + attention_mask=attention_mask, + position_ids=position_ids, + 
head_mask=head_mask, + ) + seq_length = input_ids.shape[1] + # sequence out and image out + sequence_output = outputs[0][:, :seq_length] + sequence_output = self.dropout(sequence_output) + + if token_type_ids is not None: + span_mask = -token_type_ids * 1e8 + else: + span_mask = 0 + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = paddle.split(logits, num_or_sections=2, axis=-1) + start_logits = start_logits.squeeze(-1) + span_mask + end_logits = end_logits.squeeze(-1) + span_mask + + outputs = (start_logits, end_logits) + outputs[2:] + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.shape) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.shape) > 1: + end_positions = end_positions.squeeze(-1) + # Sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not total_loss: + return outputs + else: + outputs = (total_loss,) + outputs + return outputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/tokenizer.py new file mode 100644 index 000000000..26c23c9f6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/tokenizer.py @@ -0,0 +1,170 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for LayoutXLM model.""" + +from typing import List, Optional + +import sentencepiece as spm + +from .. 
import AddedToken, PretrainedTokenizer +from ..tokenizer_utils import _is_control, _is_punctuation, _is_whitespace + +SPIECE_UNDERLINE = "▁" + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "layoutxlm-base-uncased": 514, + # FIXME(wj-Mcat): why this model-name not in the init-configuration + # "layoutxlm-wo-backbone-base-uncased": 514 +} + + +def _is_end_of_word(text): + """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" + last_char = text[-1] + return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) + + +def _is_start_of_word(text): + """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" + first_char = text[0] + return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) + + +class LayoutXLMTokenizer(PretrainedTokenizer): + resource_files_names = {"vocab_file": "sentencepiece.bpe.model"} + pretrained_resource_files_map = { + "vocab_file": { + "layoutxlm-base-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/layoutxlm_base/sentencepiece.bpe.model", + } + } + pretrained_init_configuration = { + "layoutxlm-base-uncased": {"do_lower_case": False}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs + ): + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._bos_token = bos_token + self._eos_token = eos_token + self._sep_token = sep_token + self._cls_token = cls_token + self._unk_token = unk_token + self._pad_token = pad_token + self._mask_token = mask_token + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(vocab_file) + self.vocab_file = vocab_file + + self.tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.offset = 1 + + self.tokens_to_ids[""] = len(self.sp_model) + self.offset + self.ids_to_tokens = {v: k for k, v in self.tokens_to_ids.items()} + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.sp_model) + self.offset + 1 # Add the token + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + return self.sp_model.EncodeAsPieces(text) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.tokens_to_ids: + return self.tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + # Need to return unknown token if the SP model returned 0 + return spm_id + self.offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.ids_to_tokens: + return self.ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.py new file mode 100644 index 000000000..8d9f77711 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.py @@ -0,0 +1,737 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +import os +from abc import abstractmethod +from collections import namedtuple + +import numpy as np +import paddle +from paddle import ParamAttr +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import Layer +from paddle.utils import try_import + + +def read_config(fp=None): + if fp is None: + dir_name = os.path.dirname(os.path.abspath(__file__)) + fp = os.path.join(dir_name, "visual_backbone.yaml") + with open(fp, "r") as fin: + yacs_config = try_import("yacs.config") + cfg = yacs_config.CfgNode().load_cfg(fin) + cfg.freeze() + return cfg + + +class Conv2d(nn.Conv2D): + def __init__(self, *args, **kwargs): + norm = kwargs.pop("norm", None) + activation = kwargs.pop("activation", None) + super(Conv2d, self).__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + x = super(Conv2d, self).forward(x) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +class CNNBlockBase(Layer): + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super(CNNBlockBase, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + def freeze(self): + for p in self.parameters(): + p.stop_gradient = True + + +ResNetBlockBase = CNNBlockBase + + +class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): + def __new__(cls, channels=None, height=None, width=None, stride=None): + return super().__new__(cls, channels, height, width, stride) + + +def get_norm(norm, out_channels): + """ + Args: + norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; + or a callable that takes a channel number and returns + the normalization layer as a nn.Layer. + out_channels (int): out_channels + Returns: + nn.Layer or None: the normalization layer + """ + if norm is None: + return None + if isinstance(norm, str): + if len(norm) == 0: + return None + norm = { + "BN": nn.BatchNorm, + "SyncBN": nn.SyncBatchNorm, + "FrozenBN": FrozenBatchNorm, + }[norm] + return norm(out_channels) + + +class FrozenBatchNorm(nn.BatchNorm): + def __init__(self, num_channels): + param_attr = ParamAttr(learning_rate=0.0, trainable=False) + bias_attr = ParamAttr(learning_rate=0.0, trainable=False) + super(FrozenBatchNorm, self).__init__( + num_channels, param_attr=param_attr, bias_attr=bias_attr, use_global_stats=True + ) + + +class Backbone(nn.Layer): + def __init__(self): + super(Backbone, self).__init__() + + @abstractmethod + def forward(self, *args): + pass + + @property + def size_divisibility(self) -> int: + return 0 + + def output_shape(self): + # this is a backward-compatible default + return { + name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]) + for name in self._out_features + } + + +class BasicBlock(CNNBlockBase): + """ + The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`, + with two 3x3 conv layers and a projection shortcut if needed. + """ + + def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): + raise NotImplementedError + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. 
It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + ): + super(BottleneckBlock, self).__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias_attr=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias_attr=False, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + bias_attr=False, + groups=num_groups, + dilation=dilation, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias_attr=False, + norm=get_norm(norm, out_channels), + ) + # init code is removed cause pretrained model will be loaded + + def forward(self, x): + out = self.conv1(x) + out = F.relu(out) + + out = self.conv2(out) + out = F.relu(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu(out) + return out + + +class DeformBottleneckBlock(CNNBlockBase): + """ + Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv ` + in the 3x3 convolution. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + deform_modulated=False, + deform_num_groups=1, + ): + raise NotImplementedError + + +class BasicStem(CNNBlockBase): + """ + The standard ResNet stem (layers before the first residual block), + with a conv, relu and max_pool. + """ + + def __init__(self, in_channels=3, out_channels=64, norm="BN"): + """ + Args: + norm (str or callable): norm after the first conv layer. + See :func:`layers.get_norm` for supported format. 
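+
+        Note: the 7x7 stride-2 conv followed by the stride-2 max pool downsamples the input
+        by a factor of 4, which is why the stem registers stride=4 with CNNBlockBase.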
+ """ + super(BasicStem, self).__init__(in_channels, out_channels, 4) + self.in_channels = in_channels + self.conv1 = Conv2d( + in_channels, + out_channels, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False, + norm=get_norm(norm, out_channels), + ) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + + +class ResNet(Backbone): + def __init__(self, stem, stages, num_classes=None, out_features=None, freeze_at=0): + super(ResNet, self).__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stage_names, self.stages = [], [] + + if out_features is not None: + num_stages = max([{"res2": 1, "res3": 2, "res4": 3, "res5": 4}.get(f, 0) for f in out_features]) + stages = stages[:num_stages] + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "res" + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_sublayer(name, stage) + self.stage_names.append(name) + self.stages.append(stage) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + self.stage_names = tuple(self.stage_names) + + if num_classes is not None: + self.avgpool = nn.AdaptiveAvgPool2D(1) + self.linear = nn.Linear(curr_channels, num_classes) + name = "linear" + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format(", ".join(children)) + self.freeze(freeze_at) + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert x.dim() == 4, f"ResNet takes an input of shape (N, C, H, W). Got {x.shape} instead!" + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for name, stage in zip(self.stage_names, self.stages): + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = paddle.flatten(x, 1) + x = self.linear(x) + if "linear" in self._out_features: + outputs["linear"] = x + return outputs + + def output_shape(self): + return { + name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]) + for name in self._out_features + } + + @staticmethod + def make_stage(block_class, num_blocks, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks of the same type that forms one ResNet stage. + + Args: + block_class (type): a subclass of CNNBlockBase that's used to create all blocks in this + stage. A module of this type must not change spatial resolution of inputs unless its + stride != 1. + num_blocks (int): number of blocks in this stage + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of + `block_class`. 
If the argument name is "xx_per_block", the + argument is a list of values to be passed to each block in the + stage. Otherwise, the same argument is passed to every block + in the stage. + + Returns: + list[CNNBlockBase]: a list of block module. + + Examples: + :: + stage = ResNet.make_stage( + BottleneckBlock, 3, in_channels=16, out_channels=64, + bottleneck_channels=16, num_groups=1, + stride_per_block=[2, 1, 1], + dilations_per_block=[1, 1, 2] + ) + + Usually, layers that produce the same feature map spatial size are defined as one + "stage" (in :paper:`FPN`). Under such definition, ``stride_per_block[1:]`` should + all be 1. + """ + blocks = [] + for i in range(num_blocks): + curr_kwargs = {} + for k, v in kwargs.items(): + if k.endswith("_per_block"): + assert len(v) == num_blocks, ( + f"Argument '{k}' of make_stage should have the " f"same length as num_blocks={num_blocks}." + ) + newk = k[: -len("_per_block")] + assert newk not in kwargs, f"Cannot call make_stage with both {k} and {newk}!" + curr_kwargs[newk] = v[i] + else: + curr_kwargs[k] = v + + blocks.append(block_class(in_channels=in_channels, out_channels=out_channels, **curr_kwargs)) + in_channels = out_channels + return blocks + + @staticmethod + def make_default_stages(depth, block_class=None, **kwargs): + """ + Created list of ResNet stages from pre-defined depth (one of 18, 34, 50, 101, 152). + If it doesn't create the ResNet variant you need, please use :meth:`make_stage` + instead for fine-grained customization. + + Args: + depth (int): depth of ResNet + block_class (type): the CNN block class. Has to accept + `bottleneck_channels` argument for depth > 50. + By default it is BasicBlock or BottleneckBlock, based on the + depth. + kwargs: + other arguments to pass to `make_stage`. Should not contain + stride and channels, as they are predefined for each depth. + + Returns: + list[list[CNNBlockBase]]: modules in all stages; see arguments of + :class:`ResNet.__init__`. + """ + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + if block_class is None: + block_class = BasicBlock if depth < 50 else BottleneckBlock + if depth < 50: + in_channels = [64, 64, 128, 256] + out_channels = [64, 128, 256, 512] + else: + in_channels = [64, 256, 512, 1024] + out_channels = [256, 512, 1024, 2048] + ret = [] + for (n, s, i, o) in zip(num_blocks_per_stage, [1, 2, 2, 2], in_channels, out_channels): + if depth >= 50: + kwargs["bottleneck_channels"] = o // 4 + ret.append( + ResNet.make_stage( + block_class=block_class, + num_blocks=n, + stride_per_block=[s] + [1] * (n - 1), + in_channels=i, + out_channels=o, + **kwargs, + ) + ) + return ret + + def freeze(self, freeze_at=0): + if freeze_at >= 1: + self.stem.freeze() + for idx, stage in enumerate(self.stages, start=2): + if freeze_at >= idx: + for block in stage.children(): + block.freeze() + return self + + +class LastLevelMaxPool(nn.Layer): + """ + This module is used in the original FPN to generate a downsampled + P6 feature from P5. + """ + + def __init__(self): + super(LastLevelMaxPool, self).__init__() + self.num_levels = 1 + self.in_feature = "p5" + + def forward(self, x): + return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] + + +def _assert_strides_are_log2_contiguous(strides): + """ + Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". 
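+    For example, strides (4, 8, 16, 32) satisfy this check, while (4, 8, 24, 32) do not.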
+ """ + for i, stride in enumerate(strides[1:], 1): + assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(stride, strides[i - 1]) + + +class FPN(Backbone): + def __init__(self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum"): + super(FPN, self).__init__() + assert isinstance(bottom_up, Backbone) + assert in_features, in_features + + # Feature map strides and channels from the bottom up network (e.g. ResNet) + input_shapes = bottom_up.output_shape() + strides = [input_shapes[f].stride for f in in_features] + in_channels_per_feature = [input_shapes[f].channels for f in in_features] + + _assert_strides_are_log2_contiguous(strides) + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(in_channels_per_feature): + lateral_norm = get_norm(norm, out_channels) + output_norm = get_norm(norm, out_channels) + + lateral_conv = Conv2d(in_channels, out_channels, kernel_size=1, bias_attr=use_bias, norm=lateral_norm) + output_conv = Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias_attr=use_bias, + norm=output_norm, + ) + stage = int(math.log2(strides[idx])) + self.add_sublayer("fpn_lateral{}".format(stage), lateral_conv) + self.add_sublayer("fpn_output{}".format(stage), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. + self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + self.top_block = top_block + self.in_features = tuple(in_features) + self.bottom_up = bottom_up + # Return feature names are "p", like ["p2", "p3", ..., "p6"] + self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in strides} + # top block output feature maps. + if self.top_block is not None: + for s in range(stage, stage + self.top_block.num_levels): + self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) + + self._out_features = list(self._out_feature_strides.keys()) + self._out_feature_channels = {k: out_channels for k in self._out_features} + self._size_divisibility = strides[-1] + assert fuse_type in {"avg", "sum"} + self._fuse_type = fuse_type + + @property + def size_divisibility(self): + return self._size_divisibility + + def forward(self, x): + """ + Args: + x (dict[str->Tensor]): mapping feature map name (e.g., "res5") to + feature map tensor for each feature level in high to low resolution order. + + Returns: + dict[str->Tensor]: + mapping from feature map name to FPN feature map tensor + in high to low resolution order. Returned feature names follow the FPN + paper convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
+ """ + bottom_up_features = self.bottom_up(x) + results = [] + prev_features = self.lateral_convs[0](bottom_up_features[self.in_features[-1]]) + results.append(self.output_convs[0](prev_features)) + + # Reverse feature maps into top-down order (from low to high resolution) + for idx, (lateral_conv, output_conv) in enumerate(zip(self.lateral_convs, self.output_convs)): + if idx > 0: + features = self.in_features[-idx - 1] + features = bottom_up_features[features] + top_down_features = F.interpolate(prev_features, scale_factor=2.0, mode="nearest") + lateral_features = lateral_conv(features) + prev_features = lateral_features + top_down_features + if self._fuse_type == "avg": + prev_features /= 2 + results.insert(0, output_conv(prev_features)) + + if self.top_block is not None: + if self.top_block.in_feature in bottom_up_features: + top_block_in_feature = bottom_up_features[self.top_block.in_feature] + else: + top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] + results.extend(self.top_block(top_block_in_feature)) + assert len(self._out_features) == len(results) + return {f: res for f, res in zip(self._out_features, results)} + + def output_shape(self): + return { + name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]) + for name in self._out_features + } + + +def make_stage(*args, **kwargs): + """ + Deprecated alias for backward compatibiltiy. + """ + return ResNet.make_stage(*args, **kwargs) + + +def build_resnet_backbone(cfg, input_shape=None): + """ + Create a ResNet instance from config. + + Returns: + ResNet: a :class:`ResNet` instance. + """ + # need registration of new blocks/stems? + if input_shape is None: + ch = 3 + else: + ch = input_shape.channels + norm = cfg.MODEL.RESNETS.NORM + stem = BasicStem( + in_channels=ch, + out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, + norm=norm, + ) + + # fmt: off + freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT # default as 2 + out_features = cfg.MODEL.RESNETS.OUT_FEATURES + depth = cfg.MODEL.RESNETS.DEPTH + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION + deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE + deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED + deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS + # fmt: on + assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) + + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + + if depth in [18, 34]: + assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" + assert not any(deform_on_per_stage), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" + assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" + assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" + + stages = [] + + for idx, stage_idx in enumerate(range(2, 6)): + # res5_dilation is used this way as a convention in R-FCN & Deformable Conv paper + dilation = res5_dilation if stage_idx == 5 else 1 + first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 + stage_kargs = { + "num_blocks": num_blocks_per_stage[idx], + 
"stride_per_block": [first_stride] + [1] * (num_blocks_per_stage[idx] - 1), + "in_channels": in_channels, + "out_channels": out_channels, + "norm": norm, + } + # Use BasicBlock for R18 and R34. + if depth in [18, 34]: + stage_kargs["block_class"] = BasicBlock + else: + stage_kargs["bottleneck_channels"] = bottleneck_channels + stage_kargs["stride_in_1x1"] = stride_in_1x1 + stage_kargs["dilation"] = dilation + stage_kargs["num_groups"] = num_groups + if deform_on_per_stage[idx]: + stage_kargs["block_class"] = DeformBottleneckBlock + stage_kargs["deform_modulated"] = deform_modulated + stage_kargs["deform_num_groups"] = deform_num_groups + else: + stage_kargs["block_class"] = BottleneckBlock + blocks = ResNet.make_stage(**stage_kargs) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + stages.append(blocks) + return ResNet(stem, stages, out_features=out_features, freeze_at=freeze_at) + + +def build_resnet_fpn_backbone(cfg, input_shape=None): + bottom_up = build_resnet_backbone(cfg, input_shape) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone + + +class VisualBackbone(Layer): + def __init__(self, config): + super(VisualBackbone, self).__init__() + self.cfg = read_config() + self.backbone = build_resnet_fpn_backbone(self.cfg) + # syncbn is removed cause that will cause import of torch + + assert len(self.cfg.MODEL.PIXEL_MEAN) == len(self.cfg.MODEL.PIXEL_STD) + num_channels = len(self.cfg.MODEL.PIXEL_MEAN) + self.register_buffer("pixel_mean", paddle.to_tensor(self.cfg.MODEL.PIXEL_MEAN).reshape([num_channels, 1, 1])) + self.register_buffer("pixel_std", paddle.to_tensor(self.cfg.MODEL.PIXEL_STD).reshape([num_channels, 1, 1])) + self.out_feature_key = "p2" + # is_deterministic is disabled here. 
+ self.pool = nn.AdaptiveAvgPool2D(config["image_feature_pool_shape"][:2]) + if len(config["image_feature_pool_shape"]) == 2: + config["image_feature_pool_shape"].append(self.backbone.output_shape()[self.out_feature_key].channels) + assert self.backbone.output_shape()[self.out_feature_key].channels == config["image_feature_pool_shape"][2] + + def forward(self, images): + images_input = (paddle.to_tensor(images) - self.pixel_mean) / self.pixel_std + features = self.backbone(images_input) + features = features[self.out_feature_key] + features = self.pool(features).flatten(start_axis=2).transpose([0, 2, 1]) + return features diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.yaml b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.yaml new file mode 100644 index 000000000..e386f20a0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/layoutxlm/visual_backbone.yaml @@ -0,0 +1,323 @@ +CUDNN_BENCHMARK: false +DATALOADER: + ASPECT_RATIO_GROUPING: true + FILTER_EMPTY_ANNOTATIONS: true + NUM_WORKERS: 4 + REPEAT_THRESHOLD: 0.0 + SAMPLER_TRAIN: TrainingSampler +DATASETS: + PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 + PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 + PROPOSAL_FILES_TEST: [] + PROPOSAL_FILES_TRAIN: [] + TEST: [] + TRAIN: [] +GLOBAL: + HACK: 1.0 +INPUT: + CROP: + ENABLED: false + SIZE: + - 0.9 + - 0.9 + TYPE: relative_range + FORMAT: BGR + MASK_FORMAT: polygon + MAX_SIZE_TEST: 1333 + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MIN_SIZE_TRAIN: + - 800 + MIN_SIZE_TRAIN_SAMPLING: choice + RANDOM_FLIP: horizontal +MODEL: + ANCHOR_GENERATOR: + ANGLES: + - - -90 + - 0 + - 90 + ASPECT_RATIOS: + - - 0.5 + - 1.0 + - 2.0 + NAME: DefaultAnchorGenerator + OFFSET: 0.0 + SIZES: + - - 32 + - - 64 + - - 128 + - - 256 + - - 512 + BACKBONE: + FREEZE_AT: 2 + NAME: build_resnet_fpn_backbone + DEVICE: cuda + FPN: + FUSE_TYPE: sum + IN_FEATURES: + - res2 + - res3 + - res4 + - res5 + NORM: '' + OUT_CHANNELS: 256 + KEYPOINT_ON: false + LOAD_PROPOSALS: false + MASK_ON: true + META_ARCHITECTURE: GeneralizedRCNN + PANOPTIC_FPN: + COMBINE: + ENABLED: true + INSTANCES_CONFIDENCE_THRESH: 0.5 + OVERLAP_THRESH: 0.5 + STUFF_AREA_LIMIT: 4096 + INSTANCE_LOSS_WEIGHT: 1.0 + PIXEL_MEAN: + - 103.53 + - 116.28 + - 123.675 + PIXEL_STD: + - 57.375 + - 57.12 + - 58.395 + PROPOSAL_GENERATOR: + MIN_SIZE: 0 + NAME: RPN + RESNETS: + ASPECT_RATIOS: + - - 0.5 + - 1.0 + - 2.0 + DEFORM_MODULATED: false + DEFORM_NUM_GROUPS: 1 + DEFORM_ON_PER_STAGE: + - false + - false + - false + - false + DEPTH: 101 + NORM: FrozenBN + NUM_GROUPS: 32 + OUT_FEATURES: + - res2 + - res3 + - res4 + - res5 + RES2_OUT_CHANNELS: 256 + RES5_DILATION: 1 + SIZES: + - - 32 + - - 64 + - - 128 + - - 256 + - - 512 + STEM_OUT_CHANNELS: 64 + STRIDE_IN_1X1: false + WIDTH_PER_GROUP: 8 + RETINANET: + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_WEIGHTS: &id001 + - 1.0 + - 1.0 + - 1.0 + - 1.0 + FOCAL_LOSS_ALPHA: 0.25 + FOCAL_LOSS_GAMMA: 2.0 + IN_FEATURES: + - p3 + - p4 + - p5 + - p6 + - p7 + IOU_LABELS: + - 0 + - -1 + - 1 + IOU_THRESHOLDS: + - 0.4 + - 0.5 + NMS_THRESH_TEST: 0.5 + NORM: '' + NUM_CLASSES: 80 + NUM_CONVS: 4 + PRIOR_PROB: 0.01 + SCORE_THRESH_TEST: 0.05 + SMOOTH_L1_LOSS_BETA: 0.1 + TOPK_CANDIDATES_TEST: 1000 + ROI_BOX_CASCADE_HEAD: + BBOX_REG_WEIGHTS: + - - 10.0 + - 10.0 + - 5.0 + - 5.0 + - - 20.0 + - 20.0 + - 10.0 + - 10.0 + - - 30.0 + - 30.0 + - 15.0 + - 15.0 + IOUS: + - 0.5 + - 0.6 + - 0.7 + ROI_BOX_HEAD: + BBOX_REG_LOSS_TYPE: smooth_l1 
+ BBOX_REG_LOSS_WEIGHT: 1.0 + BBOX_REG_WEIGHTS: + - 10.0 + - 10.0 + - 5.0 + - 5.0 + CLS_AGNOSTIC_BBOX_REG: false + CONV_DIM: 256 + FC_DIM: 1024 + NAME: FastRCNNConvFCHead + NORM: '' + NUM_CONV: 0 + NUM_FC: 2 + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + SMOOTH_L1_BETA: 0.0 + TRAIN_ON_PRED_BOXES: false + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 512 + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + IOU_LABELS: + - 0 + - 1 + IOU_THRESHOLDS: + - 0.5 + NAME: StandardROIHeads + NMS_THRESH_TEST: 0.5 + NUM_CLASSES: 5 + POSITIVE_FRACTION: 0.25 + PROPOSAL_APPEND_GT: true + SCORE_THRESH_TEST: 0.05 + ROI_KEYPOINT_HEAD: + CONV_DIMS: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + LOSS_WEIGHT: 1.0 + MIN_KEYPOINTS_PER_IMAGE: 1 + NAME: KRCNNConvDeconvUpsampleHead + NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true + NUM_KEYPOINTS: 17 + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + ROI_MASK_HEAD: + CLS_AGNOSTIC_MASK: false + CONV_DIM: 256 + NAME: MaskRCNNConvUpsampleHead + NORM: '' + NUM_CONV: 4 + POOLER_RESOLUTION: 7 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + RPN: + BATCH_SIZE_PER_IMAGE: 256 + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_LOSS_WEIGHT: 1.0 + BBOX_REG_WEIGHTS: *id001 + BOUNDARY_THRESH: -1 + HEAD_NAME: StandardRPNHead + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + - p6 + IOU_LABELS: + - 0 + - -1 + - 1 + IOU_THRESHOLDS: + - 0.3 + - 0.7 + LOSS_WEIGHT: 1.0 + NMS_THRESH: 0.7 + POSITIVE_FRACTION: 0.5 + POST_NMS_TOPK_TEST: 1000 + POST_NMS_TOPK_TRAIN: 1000 + PRE_NMS_TOPK_TEST: 1000 + PRE_NMS_TOPK_TRAIN: 2000 + SMOOTH_L1_BETA: 0.0 + SEM_SEG_HEAD: + COMMON_STRIDE: 4 + CONVS_DIM: 128 + IGNORE_VALUE: 255 + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + LOSS_WEIGHT: 1.0 + NAME: SemSegFPNHead + NORM: GN + NUM_CLASSES: 54 + WEIGHTS: '' +OUTPUT_DIR: ./output +SEED: -1 +SOLVER: + AMP: + ENABLED: false + BASE_LR: 0.001 + BIAS_LR_FACTOR: 1.0 + CHECKPOINT_PERIOD: 5000 + CLIP_GRADIENTS: + CLIP_TYPE: value + CLIP_VALUE: 1.0 + ENABLED: false + NORM_TYPE: 2.0 + GAMMA: 0.1 + IMS_PER_BATCH: 16 + LR_SCHEDULER_NAME: WarmupMultiStepLR + MAX_ITER: 40000 + MOMENTUM: 0.9 + NESTEROV: false + REFERENCE_WORLD_SIZE: 0 + STEPS: + - 30000 + WARMUP_FACTOR: 0.001 + WARMUP_ITERS: 1000 + WARMUP_METHOD: linear + WEIGHT_DECAY: 0.0001 + WEIGHT_DECAY_BIAS: 0.0001 + WEIGHT_DECAY_NORM: 0.0 +TEST: + AUG: + ENABLED: false + FLIP: true + MAX_SIZE: 4000 + MIN_SIZES: + - 400 + - 500 + - 600 + - 700 + - 800 + - 900 + - 1000 + - 1100 + - 1200 + DETECTIONS_PER_IMAGE: 100 + EVAL_PERIOD: 0 + EXPECTED_RESULTS: [] + KEYPOINT_OKS_SIGMAS: [] + PRECISE_BN: + ENABLED: false + NUM_ITER: 200 +VERSION: 2 +VIS_PERIOD: 0 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/linear_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/linear_utils.py new file mode 100644 index 000000000..f0d361068 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/linear_utils.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This file is used for replacing Paddle's native Linear implementations with vendors' customized implementations +""" + +import paddle.distributed.fleet.meta_parallel as mpu +from paddle import nn + +try: + from paddle.distributed.fleet.utils import sequence_parallel_utils +except: + sequence_parallel_utils = None + +from paddlenlp.transformers.mc2_parallel_linear import ( + MC2ColumnSeqParallelLinear, + MC2RowSeqParallelLinear, +) +from paddlenlp.utils.tools import get_env_device + +Linear = nn.Linear +ColumnParallelLinear = mpu.ColumnParallelLinear +RowParallelLinear = mpu.RowParallelLinear +try: + ColumnSequenceParallelLinear = sequence_parallel_utils.ColumnSequenceParallelLinear + RowSequenceParallelLinear = sequence_parallel_utils.RowSequenceParallelLinear +except: + + class ColumnSequenceParallelLinearPass(object): + """ + A dummy class for ColumnSequenceParallelLinear, used when the actual class + cannot be imported from sequence_parallel_utils. + """ + + pass + + class RowSequenceParallelLinearPass(object): + """ + A dummy class for RowSequenceParallelLinear, used when the actual class + cannot be imported from sequence_parallel_utils. + """ + + pass + + ColumnSequenceParallelLinear = ColumnSequenceParallelLinearPass + RowSequenceParallelLinear = RowSequenceParallelLinearPass + +if get_env_device() == "npu": + if MC2ColumnSeqParallelLinear is not None and MC2RowSeqParallelLinear is not None: + ColumnSequenceParallelLinear = MC2ColumnSeqParallelLinear + RowSequenceParallelLinear = MC2RowSeqParallelLinear +elif get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import ColumnParallelLinear as XPUColumnParallelLinear + from paddle_xpu.layers.nn import Linear as XPULinear + from paddle_xpu.layers.nn import RowParallelLinear as XPURowParallelLinear + from paddle_xpu.layers.nn.sequence_parallel import ( + XPUColumnSequenceParallelLinear, + XPURowSequenceParallelLinear, + ) + + Linear = XPULinear + ColumnParallelLinear = XPUColumnParallelLinear + RowParallelLinear = XPURowParallelLinear + ColumnSequenceParallelLinear = XPUColumnSequenceParallelLinear + RowSequenceParallelLinear = XPURowSequenceParallelLinear + except ImportError: + # If paddle_xpu is not installed, just use Paddle's native Linear implementations + pass +else: + # By default, use Paddle's native Linear implementations + pass diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/LICENSE b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/LICENSE new file mode 100644 index 000000000..b1c9239ba --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/LICENSE @@ -0,0 +1,76 @@ +LLaMA LICENSE AGREEMENT +This License Agreement (as may be amended in accordance with this License Agreement, “License”), between you, or your employer or other entity (if you are entering into this agreement on behalf of your employer or other entity) (“Licensee” or “you”) and Meta Platforms, Inc. (“Meta” or “we”) applies to your use of any computer program, algorithm, source code, object code, or software that is made available by Meta under this License (“Software”) and any specifications, manuals, documentation, and other written information provided by Meta related to the Software (“Documentation”). + +By clicking “I Accept” below or by using the Software, you agree to the terms of this License. 
If you do not agree to this License, then you do not have any rights to use the Software or Documentation (collectively, the “Software Products”), and you must immediately cease using the Software Products. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Meta that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the Software Products on behalf of your employer or other entity. + + + +LICENSE GRANT + +a. Subject to your compliance with the Documentation and Sections 2, 3, and 5, Meta grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Meta’s copyright interests to reproduce, distribute, and create derivative works of the Software solely for your non-commercial research purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Meta’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License. + +b. You may make a reasonable number of copies of the Documentation solely for use in connection with the license to the Software granted above. + +c. The grant of rights expressly set forth in this Section 1 (License Grant) are the complete grant of rights to you in the Software Products, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Meta and its licensors reserve all rights not expressly granted by this License. + + +RESTRICTIONS + +You will not, and will not permit, assist or cause any third party to: + +a. use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes or in the service of nuclear technology, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law, including accessing the Software Products from an embargoed country as prohibited by the U.S. government, and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing; + +b. alter or remove copyright and other proprietary notices which appear on or in the Software Products; + +c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Meta in connection with the Software, or to circumvent or remove any usage restrictions, or to enable functionality disabled by Meta; or + +d. offer or impose any terms on the Software Products that alter, restrict, or are inconsistent with the terms of this License. 
+ + +ATTRIBUTION + +Together with any copies of the Software Products (as well as derivative works thereof or works incorporating the Software Products) that you distribute, you must provide (i) a copy of this License, and (ii) the following attribution notice: “LLaMA is licensed under the LLaMA license, Copyright (c) Meta Platforms, Inc. All Rights Reserved.” + + +DISCLAIMERS + +THE SOFTWARE PRODUCTS ARE PROVIDED “AS IS” and “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. META EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE SOFTWARE PRODUCTS, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. META MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE SOFTWARE PRODUCTS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS. + + +LIMITATION OF LIABILITY + +TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL META BE LIABLE TO YOU (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF META HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE SOFTWARE PRODUCTS, THEIR CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “SOFTWARE MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE SOFTWARE MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE SOFTWARE MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE SOFTWARE MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE. + + +INDEMNIFICATION + +You will indemnify, defend and hold harmless Meta and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Meta Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Meta Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to: (a) your access to or use of the Software Products (as well as any results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Meta Parties of any such Claims, and cooperate with Meta Parties in defending such Claims. You will also grant the Meta Parties sole control of the defense or settlement, at Meta’s sole option, of any Claims. 
This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Meta or the other Meta Parties. + + +TERMINATION; SURVIVAL + +a. This License will automatically terminate upon any breach by you of the terms of this License. + +b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you. + +c. The following sections survive termination of this License: 2 (Restrictions), 3 (Attribution), 4 (Disclaimers), 5 (Limitation on Liability), 6 (Indemnification) 7 (Termination; Survival), 8 (Third Party Materials), 9 (Trademarks), 10 (Applicable Law; Dispute Resolution), and 11 (Miscellaneous). + + +THIRD PARTY MATERIALS + +The Software Products may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Meta does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk. + + +TRADEMARKS + +Licensee has not been granted any trademark license as part of this License and may not use any name or mark associated with Meta without the prior written permission of Meta, except to the extent necessary to make the reference required by the “ATTRIBUTION” section of this Agreement. + + +APPLICABLE LAW; DISPUTE RESOLUTION + +This License will be governed and construed under the laws of the State of California without regard to conflicts of law provisions. Any suit or proceeding arising out of or relating to this License will be brought in the federal or state courts, as applicable, in San Mateo County, California, and each party irrevocably submits to the jurisdiction and venue of such courts. + + +MISCELLANEOUS + +If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Meta to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Meta regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Meta regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Meta. 
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/Llama2.LICENSE b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/Llama2.LICENSE new file mode 100644 index 000000000..51089e27e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/Llama2.LICENSE @@ -0,0 +1,126 @@ +LLAMA 2 COMMUNITY LICENSE AGREEMENT +Llama 2 Version Release Date: July 18, 2023 + +"Agreement" means the terms and conditions for use, reproduction, distribution and +modification of the Llama Materials set forth herein. + +"Documentation" means the specifications, manuals and documentation +accompanying Llama 2 distributed by Meta at ai.meta.com/resources/models-and- +libraries/llama-downloads/. + +"Licensee" or "you" means you, or your employer or any other person or entity (if +you are entering into this Agreement on such person or entity's behalf), of the age +required under applicable laws, rules or regulations to provide legal consent and that +has legal authority to bind your employer or such other person or entity if you are +entering in this Agreement on their behalf. + +"Llama 2" means the foundational large language models and software and +algorithms, including machine-learning model code, trained model weights, +inference-enabling code, training-enabling code, fine-tuning enabling code and other +elements of the foregoing distributed by Meta at ai.meta.com/resources/models-and- +libraries/llama-downloads/. + +"Llama Materials" means, collectively, Meta's proprietary Llama 2 and +Documentation (and any portion thereof) made available under this Agreement. + +"Meta" or "we" means Meta Platforms Ireland Limited (if you are located in or, if you +are an entity, your principal place of business is in the EEA or Switzerland) and Meta +Platforms, Inc. (if you are located outside of the EEA or Switzerland). + +By clicking "I Accept" below or by using or distributing any portion or element of the +Llama Materials, you agree to be bound by this Agreement. + +1. License Rights and Redistribution. + + a. Grant of Rights. You are granted a non-exclusive, worldwide, non- +transferable and royalty-free limited license under Meta's intellectual property or +other rights owned by Meta embodied in the Llama Materials to use, reproduce, +distribute, copy, create derivative works of, and make modifications to the Llama +Materials. + + b. Redistribution and Use. + + i. If you distribute or make the Llama Materials, or any derivative works +thereof, available to a third party, you shall provide a copy of this Agreement to such +third party. + ii. If you receive Llama Materials, or any derivative works thereof, from +a Licensee as part of an integrated end user product, then Section 2 of this +Agreement will not apply to you. + + iii. You must retain in all copies of the Llama Materials that you +distribute the following attribution notice within a "Notice" text file distributed as a +part of such copies: "Llama 2 is licensed under the LLAMA 2 Community License, +Copyright (c) Meta Platforms, Inc. All Rights Reserved." + + iv. Your use of the Llama Materials must comply with applicable laws +and regulations (including trade compliance laws and regulations) and adhere to the +Acceptable Use Policy for the Llama Materials (available at +https://ai.meta.com/llama/use-policy), which is hereby incorporated by reference into +this Agreement. + + v. 
You will not use the Llama Materials or any output or results of the +Llama Materials to improve any other large language model (excluding Llama 2 or +derivative works thereof). + +2. Additional Commercial Terms. If, on the Llama 2 version release date, the +monthly active users of the products or services made available by or for Licensee, +or Licensee's affiliates, is greater than 700 million monthly active users in the +preceding calendar month, you must request a license from Meta, which Meta may +grant to you in its sole discretion, and you are not authorized to exercise any of the +rights under this Agreement unless or until Meta otherwise expressly grants you +such rights. + +3. Disclaimer of Warranty. UNLESS REQUIRED BY APPLICABLE LAW, THE +LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM ARE +PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +EITHER EXPRESS OR IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY +WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR +FITNESS FOR A PARTICULAR PURPOSE. YOU ARE SOLELY RESPONSIBLE +FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING +THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR +USE OF THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS. + +4. Limitation of Liability. IN NO EVENT WILL META OR ITS AFFILIATES BE +LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, TORT, +NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS +AGREEMENT, FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, +CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR PUNITIVE DAMAGES, EVEN +IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF +ANY OF THE FOREGOING. + +5. Intellectual Property. + + a. No trademark licenses are granted under this Agreement, and in +connection with the Llama Materials, neither Meta nor Licensee may use any name +or mark owned by or associated with the other or any of its affiliates, except as +required for reasonable and customary use in describing and redistributing the +Llama Materials. + + b. Subject to Meta's ownership of Llama Materials and derivatives made by or +for Meta, with respect to any derivative works and modifications of the Llama +Materials that are made by you, as between you and Meta, you are and will be the +owner of such derivative works and modifications. + + c. If you institute litigation or other proceedings against Meta or any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the Llama +Materials or Llama 2 outputs or results, or any portion of any of the foregoing, +constitutes infringement of intellectual property or other rights owned or licensable +by you, then any licenses granted to you under this Agreement shall terminate as of +the date such litigation or claim is filed or instituted. You will indemnify and hold +harmless Meta from and against any claim by any third party arising out of or related +to your use or distribution of the Llama Materials. + +6. Term and Termination. The term of this Agreement will commence upon your +acceptance of this Agreement or access to the Llama Materials and will continue in +full force and effect until terminated in accordance with the terms and conditions +herein. Meta may terminate this Agreement if you are in breach of any term or +condition of this Agreement. Upon termination of this Agreement, you shall delete +and cease use of the Llama Materials. Sections 3, 4 and 7 shall survive the +termination of this Agreement. + +7. Governing Law and Jurisdiction. 
This Agreement will be governed and +construed under the laws of the State of California without regard to choice of law +principles, and the UN Convention on Contracts for the International Sale of Goods +does not apply to this Agreement. The courts of California shall have exclusive +jurisdiction of any dispute arising out of this Agreement. + diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/__init__.py new file mode 100644 index 000000000..10dc4b2ea --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import * +from .modeling import * +from .modeling_auto import * +from .modeling_auto_static import * +from .modeling_pp import * +from .tokenizer import * +from .tokenizer_fast import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/configuration.py new file mode 100644 index 000000000..8d24ee98a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/configuration.py @@ -0,0 +1,209 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Llama model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "LLAMA_PRETRAINED_INIT_CONFIGURATION", + "LlamaConfig", + "LLAMA_PRETRAINED_RESOURCE_FILES_MAP", +] + +LLAMA_PRETRAINED_INIT_CONFIGURATION = { + # Hypothetical model weights (tiny-random-llama & micro-random-llama) for test only + "__internal_testing__/micro-random-llama": { + "architectures": ["LlamaForCausalLM"], + "hidden_size": 64, + "initializer_range": 0.02, + "intermediate_size": 1000, + "max_position_embeddings": 2048, + "model_type": "llama", + "num_attention_heads": 8, + "num_hidden_layers": 1, + "rms_norm_eps": 1e-06, + "vocab_size": 32000, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + }, + "__internal_testing__/tiny-random-llama": { + "architectures": ["LlamaForCausalLM"], + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 11008, + "max_position_embeddings": 2048, + "model_type": "llama", + "num_attention_heads": 8, + "num_hidden_layers": 2, + "rms_norm_eps": 1e-06, + "vocab_size": 32000, + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 0, + }, +} + +# Hypothetical model weights (tiny-random-llama) for test only +LLAMA_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "__internal_testing__/micro-random-llama": "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/micro-random-llama/model_state.pdparams", + "__internal_testing__/tiny-random-llama": "https://bj.bcebos.com/paddlenlp/models/community/__internal_testing__/tiny-random-llama/model_state.pdparams", + }, +} + + +class LlamaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~LlamaModel`]. It is used to instantiate an Llama + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Llama-7B. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Llama model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~LlamaModel`] or [`~TFLlamaModel`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Enable rope fusion or not. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + Example: + ```python + >>> from paddlenlp.transformer import LlamaModel, LlamaConfig + + >>> # Initializing a Llama llama-7b style configuration + >>> configuration = LlamaConfig() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "llama" + attribute_map = { + "n_positions": "max_position_embeddings", + "n_embd": "hidden_size", + "n_layer": "num_hidden_layers", + "n_head": "num_attention_heads", + "n_inner": "intermediate_size", + "activation_function": "hidden_act", + } + pretrained_init_configuration = LLAMA_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + max_position_embeddings=2048, + seq_length=2048, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + initializer_range=0.02, + rms_norm_eps=1e-6, + rope_theta=10000.0, + use_cache=True, + fuse_attention_qkv=False, + fuse_attention_ffn=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + alibi=False, + rope_scaling_factor=1.0, + rope_scaling_type=None, + long_sequence_strategy_type=None, + long_sequence_strategy_name=None, + long_sequence_init_args=None, + use_long_sequence_strategies=False, + use_flash_attention_for_generation=False, + use_last_token_for_generation=False, + immediate_clear_past_key_value=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.rope_theta = rope_theta + + self.use_cache = use_cache + self.fuse_attention_qkv = fuse_attention_qkv + self.fuse_attention_ffn = fuse_attention_ffn + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.alibi = alibi + + self.rope_scaling_factor = rope_scaling_factor + self.rope_scaling_type = rope_scaling_type + + self.long_sequence_strategy_type = long_sequence_strategy_type + self.long_sequence_strategy_name = long_sequence_strategy_name + self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args + self.use_long_sequence_strategies = use_long_sequence_strategies + self.use_flash_attention_for_generation = use_flash_attention_for_generation + 
self.use_last_token_for_generation = use_last_token_for_generation + self.immediate_clear_past_key_value = immediate_clear_past_key_value + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def rope(self): + return not self.alibi diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/fusion_ops.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/fusion_ops.py new file mode 100644 index 000000000..f58c92e7c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/fusion_ops.py @@ -0,0 +1,255 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn.functional as F + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddle.utils import try_import + +from paddlenlp.utils.tools import get_env_device + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None +try: + if get_env_device() in ["npu", "gcu"]: + from paddle.base import core + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +from paddlenlp.transformers.ring_flash_attention import RingFlashAttention + + +def fusion_rope( + query_states, + key_states, + value_states, + hidden_states, + position_ids, + past_key_value, + rotary_emb, + context_parallel_degree=-1, +): + if get_env_device() != "gcu": + assert past_key_value is None, "fuse rotary not support cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + if context_parallel_degree > 1: + assert get_env_device() == "gpu", "context parallel only support cuda device for now" + kv_seq_len *= context_parallel_degree + if get_env_device() != "gcu": + cos, sin = rotary_emb(value_states, seq_len=kv_seq_len) + if get_env_device() == "npu": + query_states = core.eager._run_custom_op("fused_rope", query_states, cos, sin)[0] + key_states = core.eager._run_custom_op("fused_rope", key_states, cos, sin)[0] + elif get_env_device() == "gcu": + cos_sin = rotary_emb.get_fused_cos_sin(value_states, seq_len=kv_seq_len) + query_states, key_states = core.eager._run_custom_op( + "fused_rotary_embedding_gcu", query_states, key_states, cos_sin, position_ids, True + ) + else: + # paddle version > 
2.6 or develop support q and k/v with different num_heads + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + return query_states, key_states + + +def rms_norm_fused(x_in, w, eps, use_fast_ln=False): + if use_fast_ln: + fast_ln = try_import("fast_ln") + return fast_ln.fast_rms_norm(x_in, w, eps)[0] + else: + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +def fusion_rms_norm(hidden_states, weight, variance_epsilon, use_fast_ln=False): + if get_env_device() == "npu": + return core.eager._run_custom_op("rms_norm_npu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "gcu": + return core.eager._run_custom_op("rms_norm_gcu", hidden_states, weight, variance_epsilon)[0] + elif get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + return paddle_xpu_nn.xpu_rms_norm(hidden_states, weight, variance_epsilon)[0] + except ImportError: + raise NotImplementedError( + f"Implementation of fused_rms_norm is not available on {get_env_device()}. Please install paddle_xpu to use this feature" + ) + return rms_norm_fused(hidden_states, weight, variance_epsilon, use_fast_ln) + + +def fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + attn_mask_startend_row_indices=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + if config.context_parallel_degree > 1: + raise ValueError(f"Context parallel is not implemented in version {version}") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + if get_env_device() == "npu": + if config.context_parallel_degree > 1: + raise ValueError("Context parallel is not implemented for npu") + attn_output = core.eager._run_custom_op( + "flash_attention_npu", + query_states, + key_states, + value_states, + None, + attention_mask, + 0.0, + attention_mask is None, + True, + False, + npu_is_casual, + )[0] + elif get_env_device() == "gcu": + if config.context_parallel_degree > 1: + raise ValueError("Context parallel is not implemented for gcu") + attn_output = core.eager._run_custom_op( + "fused_sdp_flash_attention_gcu", + query_states, + key_states, + value_states, + attention_mask, + 0.0, + attention_mask is None, + True, + )[0] + else: + if config.context_parallel_degree > 1: + attn_output = RingFlashAttention.apply( + query_states, + key_states, + value_states, + 
attn_mask=None, + is_causal=True, + ) + else: + if attn_mask_startend_row_indices is not None: + assert alibi is None, "flash_attention_with_sparse_mask not support alibi" + if len(attn_mask_startend_row_indices.shape) == 2: + attn_mask_startend_row_indices = paddle.unsqueeze(attn_mask_startend_row_indices, axis=1) + attn_output = F.flash_attention_with_sparse_mask( + query_states, + key_states, + value_states, + attn_mask_start_row_indices=attn_mask_startend_row_indices, + is_causal=True, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None and query_states.shape[1] != 1, + ) + attn_weights = None + + if reshard_layer is not None: + # attn_output shape: [bs, seqlen, num_head/sep, head_dim] + attn_output = reshard_layer( + attn_output, + split_axis=1, + concat_axis=2, + ) + # attn_output shape: [bs, seqlen/sep, num_head, head_dim] + assert ( + config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0 + ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}" + q_len = q_len // config.sep_parallel_degree + num_heads = num_heads * config.sep_parallel_degree + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling.py new file mode 100644 index 000000000..0e90e1159 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling.py @@ -0,0 +1,2008 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Paddle Llama model""" +from __future__ import annotations + +import math +import os +import warnings +from functools import partial +from typing import Optional, Tuple + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.autograd import PyLayer +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model +from paddlenlp.utils.log import logger +from paddlenlp.utils.tools import get_env_device + +from .. import linear_utils +from ..linear_utils import Linear +from ..segment_parallel_utils import ReshardLayer +from .configuration import ( + LLAMA_PRETRAINED_INIT_CONFIGURATION, + LLAMA_PRETRAINED_RESOURCE_FILES_MAP, + LlamaConfig, +) + +try: + if get_env_device() in ["npu", "gcu"]: + + for lib in os.listdir(os.getenv("CUSTOM_DEVICE_ROOT")): + if lib.endswith(".so"): + paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(lib) + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None +from . 
import fusion_ops + +rms_norm_fused = fusion_ops.rms_norm_fused + +__all__ = [ + "LlamaModel", + "LlamaPretrainedModel", + "LlamaForCausalLM", + "LlamaPretrainingCriterion", +] + + +def _get_interleave(n): + def _get_interleave_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio**i for i in range(n)] + + if math.log2(n).is_integer(): + return _get_interleave_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + _get_interleave_power_of_2(closest_power_of_2) + + _get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + +def get_use_casual_mask(): + """Get the value of the 'USE_CASUAL_MASK' environment variable.""" + return os.getenv("USE_CASUAL_MASK", "False") == "True" + + +def build_alibi_tensor( + bool_attention_mask: Tensor, num_heads: int, dtype: paddle.dtype, tensor_parallel_degree=1 +) -> Tensor: + batch_size, seq_length = bool_attention_mask.shape[0], bool_attention_mask.shape[-1] + slopes = paddle.to_tensor(_get_interleave(num_heads), dtype="float32") + alibi = slopes.unsqueeze(axis=[1, 2]) * paddle.arange(seq_length, dtype="float32").unsqueeze(axis=[0, 1]).expand( + [num_heads, -1, -1] + ) + alibi = alibi.reshape(shape=(1, num_heads, 1, seq_length)).expand([batch_size, -1, -1, -1]) + return paddle.cast(alibi, dtype) + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in range(num_heads_per_card): + assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. 
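+    #   Illustrative walk-through of Case 2, mirroring the docstring examples above:
+    #   num_kv_heads=2, num_gpus=4 gives [[0], [0], [1], [1]], i.e. kv head 0 is
+    #   replicated on cards 0-1 and kv head 1 on cards 2-3.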
+ else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, + attn_mask_startend_row_indices=None, + sequence_parallel=False, + reshard_layer=None, + npu_is_casual=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + return fusion_ops.fusion_flash_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + attn_mask_startend_row_indices, + sequence_parallel, + reshard_layer, + npu_is_casual, + ) + + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + else: + if config.context_parallel_degree > 1: + raise ValueError("Context parallel requires `use_flash_attention=True`") + + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + # then add alibi bias + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attn_weights = attn_weights + alibi + + if paddle.in_dynamic_mode() and attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + # In sep mode, the attenion mask should be created in the runtime. 
+        if reshard_layer is not None:
+            attention_mask = None
+
+        # NOTE: we only call get_triangle_upper_mask under PP setup
+        # FIXME ZHUI when we use pipeline parallel, the attention_mask can be None
+        # we just make it triangle_upper_mask
+        if attention_mask is None:
+            attention_mask = get_triangle_upper_mask(attn_weights)
+        attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len])
+        if paddle.in_dynamic_mode() and attention_mask.shape != [bsz, 1, q_len, kv_seq_len]:
+            raise ValueError(
+                f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}"
+            )
+
+        attn_weights = attn_weights + attention_mask
+        if not paddle.in_dynamic_mode():
+            attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype)
+        else:
+            with paddle.amp.auto_cast(False):
+                attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype)
+
+        attn_output = paddle.matmul(attn_weights, value_states)
+        attn_output = attn_output.transpose([0, 2, 1, 3])
+
+        if reshard_layer is not None:
+            attn_output = reshard_layer(
+                attn_output,
+                split_axis=1,
+                concat_axis=2,
+            )
+            q_len = q_len // config.sep_parallel_degree
+            num_heads = num_heads * config.sep_parallel_degree
+
+        if sequence_parallel:
+            attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads])
+        else:
+            attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads])
+        return (attn_output, attn_weights) if output_attentions else attn_output
+
+
+def masked_fill(x, mask, value):
+    y = paddle.full(x.shape, value, x.dtype)
+    return paddle.where(mask, y, x)
+
+
+def is_casual_mask(attention_mask):
+    """
+    The mask is causal if the upper triangular part of attention_mask equals attention_mask itself.
+    """
+    return (paddle.triu(attention_mask) == attention_mask).all().item()
+
+
+def _make_causal_mask(input_ids_shape, past_key_values_length):
+    """
+    Make the causal mask used for self-attention.
+    """
+    batch_size, target_length = input_ids_shape  # target_length: seq_len
+
+    if get_env_device() == "npu":
+        mask = paddle.tril(paddle.ones((target_length, target_length))).astype("int32")
+    else:
+        mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool"))
+
+    if past_key_values_length > 0:
+        # [tgt_len, tgt_len + past_len]
+        mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1)
+
+    # [bs, 1, tgt_len, tgt_len + past_len]
+    return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length])
+
+
+def _expand_2d_mask(mask, dtype, tgt_length):
+    """
+    Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`.
+ """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + if get_env_device() == "npu": + mask = mask[:, None, None, :].astype(dtype) + else: + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class LlamaRMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + if self.config.use_fused_rms_norm: + return fusion_ops.fusion_rms_norm( + hidden_states, self.weight, self.variance_epsilon, self.config.use_fast_layer_norm + ) + + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + # hidden_states = hidden_states.astype("float32") + # variance = hidden_states.pow(2).mean(-1, keepdim=True) + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +class LlamaRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + self.cos_sin_table = None if get_env_device() != "gcu" else paddle.concat([freqs.cos(), freqs.sin()], axis=-1) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + cos = self.cos_cached[:, :seq_len, :, :] + sin = self.sin_cached[:, :seq_len, :, :] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + def get_fused_cos_sin(self, x, seq_len=None): + if self.cos_sin_table is not None and self.cos_sin_table.dtype != x.dtype: + return self.cos_sin_table.cast(x.dtype) + else: + return self.cos_sin_table + + +class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): + def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings * scaling_factor, base) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + t = t / self.scaling_factor + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + self.cos_sin_table = None if get_env_device() != "gcu" else paddle.concat([freqs.cos(), freqs.sin()], axis=-1) + + +class LlamaNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with NTK scaling. https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0): + base = base * scaling_factor ** (dim / (dim - 2)) + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings * scaling_factor, base) + + +class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): + """LlamaRotaryEmbedding extended with Dynamic NTK scaling. 
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base) + + def _scale_cos_sin(self, seq_len): + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + alpha = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + base = self.base * alpha ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + freqs = paddle.einsum("i,j->ij", t, inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + scale_cos = emb.cos()[None, :, None, :] + scale_sin = emb.sin()[None, :, None, :] + scale_cos_sin = None if get_env_device() != "gcu" else paddle.concat([freqs.cos(), freqs.sin()], axis=-1) + return scale_cos, scale_sin, scale_cos_sin + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_position_embeddings: + scale_cos, scale_sin, _ = self._scale_cos_sin(seq_len=seq_len) + else: + scale_cos, scale_sin = self.cos_cached, self.sin_cached + cos = scale_cos[:, :seq_len, :, ...] + sin = scale_sin[:, :seq_len, :, ...] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + def get_fused_cos_sin(self, x, seq_len=None): + if seq_len > self.max_position_embeddings: + _, _, scale_cos_sin = self._scale_cos_sin(seq_len=seq_len) + else: + scale_cos_sin = self.cos_sin_table + if scale_cos_sin is not None and scale_cos_sin.dtype != x.dtype: + return scale_cos_sin.cast(x.dtype) + else: + return scale_cos_sin + + +class Llama3RotaryEmbedding(LlamaRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings=8192, + base=500000, + factor=8.0, + low_freq_factor=1.0, + high_freq_factor=4.0, + original_max_position_embeddings=8192, + ): + self.factor = factor + self.low_freq_factor = low_freq_factor + self.high_freq_factor = high_freq_factor + self.original_max_position_embeddings = original_max_position_embeddings + super().__init__(dim, max_position_embeddings, base) + + def _set_cos_sin_cache(self, seq_len): + low_freq_wavelen = self.original_max_position_embeddings / self.low_freq_factor + high_freq_wavelen = self.original_max_position_embeddings / self.high_freq_factor + new_freqs = [] + for freq in self.inv_freq: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / self.factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (self.original_max_position_embeddings / wavelen - self.low_freq_factor) / ( + self.high_freq_factor - self.low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / self.factor + smooth * freq) + self.inv_freq = paddle.to_tensor(new_freqs, dtype=self.inv_freq.dtype) + super()._set_cos_sin_cache(seq_len=seq_len) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + + if position_ids is None: + # Note: Only for 
LlamaForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class LlamaMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.tensor_parallel_degree = config.tensor_parallel_degree + self.fuse_attention_ffn = config.fuse_attention_ffn + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + if config.fuse_attention_ffn: + self.gate_up_fused_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size * 2, + gather_output=False, + has_bias=False, + ) + else: + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + if config.fuse_attention_ffn: + self.gate_up_fused_proj = Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) + else: + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + + def forward(self, x): + if self.fuse_attention_ffn: + # FIXME(yangjianbang): use paddle's native swiglu + if get_env_device() == "xpu": + try: + import paddle_xpu_nn # noqa: F821 + + out = self.gate_up_fused_proj(x) + out = paddle_xpu_nn.xpu_swiglu(out, axis=-1, turn=True) + out = self.down_proj(out) + return out + except ImportError: + gate_out, up_out = paddle.chunk(self.gate_up_fused_proj(x), chunks=2, axis=-1) + out = self.down_proj(F.silu(gate_out) * up_out) + return out + + x = swiglu(self.gate_up_fused_proj(x)) + else: + x = swiglu(self.gate_proj(x), self.up_proj(x)) + out = self.down_proj(x) + return out + + +class LlamaAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + + self.max_position_embeddings = config.max_position_embeddings + self.seq_length = config.seq_length + self.sequence_parallel = 
config.sequence_parallel
+
+        self.fuse_attention_qkv = config.fuse_attention_qkv
+
+        self.kv_indices = None
+        # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True
+        # Enable_recompute defaults to False and is controlled by Trainer
+        self.enable_recompute = False
+        self.layerwise_recompute = layerwise_recompute
+        self.recompute_granularity = config.recompute_granularity
+        if config.tensor_parallel_degree > 1:
+            assert (
+                self.num_heads % config.tensor_parallel_degree == 0
+            ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
+            self.num_heads = self.num_heads // config.tensor_parallel_degree
+
+            if self.num_key_value_heads % config.tensor_parallel_degree == 0:
+                self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree
+            else:
+                if self.fuse_attention_qkv:
+                    # TODO(Yuang): support fusion for kv when kv heads cannot be divided by mp
+                    raise ValueError(
+                        f"fuse_attention_qkv can't be True when num_key_value_heads {config.num_key_value_heads} % tensor_parallel_degree {config.tensor_parallel_degree} != 0"
+                    )
+                logger.warning(
+                    f"Get num_key_value_heads: {self.num_key_value_heads}, can't split to tensor_parallel_degree: {config.tensor_parallel_degree}, so we don't split the key value weight."
+                )
+                self.kv_indices = paddle.to_tensor(
+                    assign_kv_heads(self.num_key_value_heads, config.tensor_parallel_degree)[
+                        config.tensor_parallel_rank
+                    ]
+                )
+
+        self.use_fused_rope = config.use_fused_rope
+        if self.use_fused_rope and get_env_device() not in ["npu", "xpu", "gcu"]:
+            if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None:
+                warnings.warn(
+                    "Enable fuse rope in the config, but fuse rope is not available. "
+                    "Will disable fuse rope. Try using latest gpu version of Paddle."
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + if self.fuse_attention_qkv: + self.qkv_proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size + 2 * self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + else: + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + gather_output=False, + ) + if self.kv_indices is None: + self.k_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + else: + self.k_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + else: + if self.fuse_attention_qkv: + self.qkv_proj = Linear( + self.hidden_size, + self.hidden_size + 2 * self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + else: + self.q_proj = Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + self.k_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + if config.tensor_parallel_degree > 1: + self.o_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + else: + self.o_proj = Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + + if config.rope: + if config.use_long_sequence_strategies: + self.rotary_emb = LongSequenceStrategies.build_long_sequence_strategy( + config.long_sequence_strategy_type, + config.long_sequence_strategy_name, + **config.long_sequence_init_args, + ) + else: + self._init_rope() + + self.reshard_layer = None + if config.sep_parallel_degree > 1: + assert self.num_key_value_heads % config.sep_parallel_degree == 0 + assert self.num_heads % config.sep_parallel_degree == 0 + self.reshard_layer = ReshardLayer() + + self.config = config + + def _init_rope(self): + if ( + hasattr(self.config, "rope_scaling") + and self.config.rope_scaling is not None + and self.config.rope_scaling.get("rope_type", None) == "llama3" + ): + self.rotary_emb = Llama3RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + factor=self.config.rope_scaling["factor"], + high_freq_factor=self.config.rope_scaling["high_freq_factor"], + low_freq_factor=self.config.rope_scaling["low_freq_factor"], + original_max_position_embeddings=self.config.rope_scaling["original_max_position_embeddings"], + ) + elif self.config.rope_scaling_type is None: + self.rotary_emb = LlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + elif self.config.rope_scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, + 
max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, + ) + elif self.config.rope_scaling_type == "ntk": + self.rotary_emb = LlamaNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, + ) + elif self.config.rope_scaling_type == "dynamic_ntk": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {self.config.rope_scaling_type}") + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + alibi: Optional[paddle.Tensor] = None, + attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + + if self.fuse_attention_qkv: + mix_layer = self.qkv_proj(hidden_states) + # NOTE for GQA attention fusion (compatible with MHA and MQA): + # The weight for qkv_proj is in shape like [hidden_size, hidden_size + 2 * num_kv_heads * head_dim]. + # After the projection, the mix_layer is in shape like [b, s, hidden_size + 2 * num_kv_heads * head_dim]. + # Reshape the mix_layer into a shape like [b, s, num_kv_heads, (num_groups + 2) * head_dim], + # where num_groups = num_q_heads // num_kv_heads. + # Split the mix_layer on the last axis into three sections [num_groups * head_dim, head_dim, head_dim] + # to represent the q, k and v respectively. + # The q is in the shape like [b, s, num_kv_heads, num_groups * head_dim]. + # The k and v are in the shape like [b, s, num_kv_heads, head_dim]. + # Under MHA, the q is ready for the following calculation since num_kv_heads == num_q_heads, + # But for the GQA or MQA, q should be reshaped into [b, s, num_q_heads, head_dim]. 
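+            # Illustrative shapes for the fused-QKV path described above (hypothetical sizes,
+            # not taken from any particular config): hidden_size=4096, num_q_heads=32,
+            # num_kv_heads=8, head_dim=128, so num_groups=4.
+            #   mix_layer: [b, s, 4096 + 2 * 8 * 128] = [b, s, 6144]
+            #   reshape -> [b, s, 8, (4 + 2) * 128]   = [b, s, 8, 768]
+            #   split   -> q: [b, s, 8, 512], k: [b, s, 8, 128], v: [b, s, 8, 128]
+            #   and for GQA/MQA, q is finally reshaped to [b, s, 32, 128].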
+ if self.reshard_layer is not None: + if self.sequence_parallel: + assert self.seq_length % self.config.sep_parallel_degree == 0 + mix_layer = paddle.reshape_( + mix_layer, + [ + -1, + self.seq_length // self.config.sep_parallel_degree, + self.num_heads * self.head_dim + 2 * self.num_key_value_heads * self.head_dim, + ], + ) + # [bs, seq_len / sep, num_head, head_dim] -> [bs, seq_len, num_head / sep, head_dim] + mix_layer = self.reshard_layer( + mix_layer, + split_axis=2, + concat_axis=1, + ) + mix_layer = paddle.reshape_( + mix_layer, [0, self.seq_length, -1, (self.num_key_value_groups + 2) * self.head_dim] + ) # [bs, seq_len, num_head/k, 3*head_dim], k is sep degree + else: + if self.sequence_parallel: + target_shape = [ + -1, + self.seq_length, + self.num_key_value_heads, + (self.num_key_value_groups + 2) * self.head_dim, + ] + else: + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( + mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape_(query_states, [0, 0, self.num_heads, self.head_dim]) + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.reshard_layer is not None: + if self.sequence_parallel: + assert self.seq_length % self.config.sep_parallel_degree == 0 + query_states = paddle.reshape( + query_states, + [-1, self.seq_length // self.config.sep_parallel_degree, self.num_heads * self.head_dim], + ) + key_states = paddle.reshape( + key_states, + [ + -1, + self.seq_length // self.config.sep_parallel_degree, + self.num_key_value_heads * self.head_dim, + ], + ) + value_states = paddle.reshape( + value_states, + [ + -1, + self.seq_length // self.config.sep_parallel_degree, + self.num_key_value_heads * self.head_dim, + ], + ) + query_states = self.reshard_layer( + query_states, + split_axis=2, + concat_axis=1, + ) + key_states = self.reshard_layer( + key_states, + split_axis=2, + concat_axis=1, + ) + value_states = self.reshard_layer( + value_states, + split_axis=2, + concat_axis=1, + ) + query_states = paddle.reshape( + query_states, [0, self.seq_length, -1, self.head_dim] + ) # [bs, seq_len, num_head/k, head_dim], k is sep degree + key_states = paddle.reshape(key_states, [0, self.seq_length, -1, self.head_dim]) + value_states = paddle.reshape(value_states, [0, self.seq_length, -1, self.head_dim]) + else: + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + + if self.config.rope: + if self.reshard_layer is not None: + batch_size, seq_length, _, _ = query_states.shape + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + if self.config.context_parallel_degree > 1: + batch_size, seq_length, 
_, _ = query_states.shape + group = fleet.get_hybrid_communicate_group().get_sep_parallel_group() + chunk_size = seq_length // 2 + chunk_num = group.nranks * 2 + rank = group.rank + first_chunk_ids = paddle.arange(rank * chunk_size, (rank + 1) * chunk_size, dtype="int64") + second_chunk_ids = paddle.arange( + (chunk_num - rank - 1) * chunk_size, (chunk_num - rank) * chunk_size, dtype="int64" + ) + position_ids = paddle.concat([first_chunk_ids, second_chunk_ids]).expand((batch_size, seq_length)) + if self.use_fused_rope: + query_states, key_states = fusion_ops.fusion_rope( + query_states, + key_states, + value_states, + hidden_states, + position_ids, + past_key_value, + self.rotary_emb, + self.config.context_parallel_degree, + ) + + else: + if self.config.context_parallel_degree > 1: + kv_seq_len *= self.config.context_parallel_degree + if self.config.use_long_sequence_strategies: + cos, sin = self.rotary_emb(seq_len=kv_seq_len) + cos = cos[None, :, None, :] + sin = sin[None, :, None, :] + cos, sin = ( + cos.cast(value_states.dtype) if cos.dtype != value_states.dtype else cos, + sin.cast(value_states.dtype) if sin.dtype != value_states.dtype else sin, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + if self.config.immediate_clear_past_key_value: + past_key_value[0]._clear_data() + past_key_value[1]._clear_data() + + past_key_value = (key_states, value_states) if use_cache else None + if self.kv_indices is not None: + key_states = paddle.index_select(key_states, self.kv_indices, axis=2) + value_states = paddle.index_select(value_states, self.kv_indices, axis=2) + + # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + # paddle version > 2.6 or develop support flash-attn with gqa/mqa + paddle_version = float(paddle.__version__[:3]) + if not self.config.use_flash_attention or ((paddle_version != 0.0) and (paddle_version <= 2.6)): + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + attn_mask_startend_row_indices, + self.sequence_parallel, + reshard_layer=self.reshard_layer, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + attn_mask_startend_row_indices, + self.sequence_parallel, + reshard_layer=self.reshard_layer, + npu_is_casual=npu_is_casual, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. 
+ attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class LlamaDecoderLayer(nn.Layer): + def __init__(self, config, layerwise_recompute: bool = False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttention(config, layerwise_recompute) + self.mlp = LlamaMLP(config) + self.input_layernorm = LlamaRMSNorm(config) + self.post_attention_layernorm = LlamaRMSNorm(config) + self.sequence_parallel = config.sequence_parallel + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + alibi: Optional[paddle.Tensor] = None, + attn_mask_startend_row_indices: Optional[paddle.Tensor] = None, + npu_is_casual: bool = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `cache` key value states are returned and can be used to speed up decoding + (see `cache`). 
+ cache (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + attn_mask_startend_row_indices, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + npu_is_casual=npu_is_casual, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class LlamaPretrainedModel(PretrainedModel): + config_class = LlamaConfig + base_model_prefix = "llama" + pretrained_init_configuration = LLAMA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = LLAMA_PRETRAINED_RESOURCE_FILES_MAP + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: LlamaConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "LlamaModel" + if "LlamaModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "llama." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: LlamaConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
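+        # For example, for layer i the fuse action maps
+        #   (layers.i.self_attn.q_proj.weight, k_proj.weight, v_proj.weight) -> layers.i.self_attn.qkv_proj.weight
+        # and the corresponding split action performs the reverse mapping.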
+ fuse_qkv_keys = ( + "layers.0.self_attn.q_proj.weight", + "layers.0.self_attn.k_proj.weight", + "layers.0.self_attn.v_proj.weight", + "layers.0.self_attn.qkv_proj.weight", + ) + + fuse_gate_up_keys = ( + "layers.0.mlp.gate_proj.weight", + "layers.0.mlp.up_proj.weight", + "layers.0.mlp.gate_up_fused_proj.weight", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + fuse_attention_ffn = getattr(config, "fuse_attention_ffn", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qkv_keys]) + final_actions[keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = fn + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qkv_keys]) + final_actions[keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + if not fuse_attention_ffn: + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_gate_up_keys]) + final_actions[keys] = partial(fn, split_nums=2) + return final_actions + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.RowParallelLinear, + mpu.ColumnParallelLinear, + linear_utils.RowSequenceParallelLinear, + linear_utils.ColumnSequenceParallelLinear, + LlamaLMHead, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, LlamaMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, LlamaAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class LlamaModel(LlamaPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayer`] + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + self.config = config + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [LlamaDecoderLayer(config, i not in self.no_recompute_layers) for i in range(config.num_hidden_layers)] + ) + self.norm = LlamaRMSNorm(config) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length + ) + if get_env_device() == "npu": + expanded_attn_mask = expanded_attn_mask.astype("bool") & combined_attention_mask.astype("bool") + else: + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + if get_env_device() == "npu": + x = paddle.to_tensor(0.0, dtype="float32") + y = paddle.to_tensor(paddle.finfo(dtype).min, dtype="float32") + expanded_attn_mask = expanded_attn_mask.astype("float32") + expanded_attn_mask = paddle.where(expanded_attn_mask, x, y).astype(dtype) + elif get_env_device() in ["xpu", "gcu"]: + x = paddle.to_tensor(0.0, dtype=dtype) + y = paddle.to_tensor(paddle.finfo(dtype).min, dtype=dtype) + expanded_attn_mask = expanded_attn_mask.astype(dtype) + expanded_attn_mask = paddle.where(expanded_attn_mask, x, y).astype(dtype) + else: + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + past_key_value: Tensor, + use_cache: bool, + alibi=None, + attn_mask_startend_row_indices=None, + ): + def create_custom_forward(module): + def 
custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi, + attn_mask_startend_row_indices, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + attn_mask_startend_row_indices=None, + **kwargs, + ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + if self.config.context_parallel_degree > 1 and (attention_mask is not None or self.config.alibi): + raise NotImplementedError("Ring FlashAttention dosen't support attention_mask or alibi") + + # embed positions + if self.config.use_flash_attention_for_generation: + attention_mask = None + elif attn_mask_startend_row_indices is None and attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + if attn_mask_startend_row_indices is None and self.config.alibi: + if self.config.use_long_sequence_strategies: + alibi_layer = LongSequenceStrategies.build_long_sequence_strategy( + self.config.long_sequence_strategy_type, + self.config.long_sequence_strategy_name, + **self.config.long_sequence_init_args, + ) + alibi = alibi_layer(attention_mask, self.config.num_attention_heads, dtype=inputs_embeds.dtype) + else: + alibi = build_alibi_tensor(attention_mask, self.config.num_attention_heads, dtype=inputs_embeds.dtype) + if self.config.tensor_parallel_degree > 1: + block_size = self.config.num_attention_heads // self.config.tensor_parallel_degree 
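+                    # each tensor-parallel rank keeps only its own block of attention heads
+                    # from the alibi bias before flattening it to [batch * block_size, 1, seq_len]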
+ alibi = alibi[ + :, + self.config.tensor_parallel_rank + * block_size : (self.config.tensor_parallel_rank + 1) + * block_size, + ] + alibi = alibi.reshape([batch_size * block_size, 1, seq_length_with_past]) + else: + alibi = alibi.reshape([batch_size * self.config.num_attention_heads, 1, seq_length_with_past]) + else: + alibi = None + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + use_casual_mask = get_use_casual_mask() and not self.config.alibi + + if self.config.use_flash_attention_for_generation or use_casual_mask: + attention_mask = None + elif attn_mask_startend_row_indices is None: + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + + is_casual = False + + if attn_mask_startend_row_indices is None and self.config.use_flash_attention and get_env_device() != "gcu": + if self.config.use_flash_attention_for_generation or use_casual_mask: + is_casual = True + else: + is_casual = is_casual_mask(attention_mask) + if get_env_device() != "npu": + if is_casual and alibi is None: + attention_mask = None + else: + attention_mask = None if attention_mask is None else attention_mask.astype("bool") + hidden_states = inputs_embeds + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + npu_is_casual=is_casual, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if self.config.use_last_token_for_generation: + hidden_states = paddle.unsqueeze(hidden_states[:, -1, :], 1) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + +class LlamaPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for 
Llama. + It calculates the final loss. + """ + + def __init__(self, config): + + super(LlamaPretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = ( + config.tensor_parallel_degree > 1 + and config.vocab_size % config.tensor_parallel_degree == 0 + and config.tensor_parallel_output + ) + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + if self.config.sep_parallel_degree > 1 or self.config.context_parallel_degree > 1: + _hcg = fleet.get_hybrid_communicate_group() + masked_lm_loss = ConcatMaskedLoss.apply(masked_lm_loss, axis=1, group=_hcg.get_sep_parallel_group()) + # skip ignore_index which loss == 0 + # masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + # loss = paddle.mean(masked_lm_loss) + binary_sequence = paddle.where( + masked_lm_loss > 0, paddle.ones_like(masked_lm_loss), paddle.zeros_like(masked_lm_loss) + ) + count = paddle.sum(binary_sequence) + if count == 0: + loss = paddle.sum(masked_lm_loss * binary_sequence) + else: + loss = paddle.sum(masked_lm_loss * binary_sequence) / count + + return loss + + +class ConcatMaskedLoss(PyLayer): + @staticmethod + def forward(ctx, inp, axis, group): + inputs = [] + paddle.distributed.all_gather(inputs, inp, group=group) + with paddle.no_grad(): + cat = paddle.concat(inputs, axis=axis) + ctx.args_axis = axis + ctx.args_group = group + return cat + + @staticmethod + def backward(ctx, grad): + axis = ctx.args_axis + group = ctx.args_group + with paddle.no_grad(): + grads = paddle.split(grad, paddle.distributed.get_world_size(group), axis=axis) + grad = grads[paddle.distributed.get_rank(group)] + return grad + + +class LlamaLMHead(nn.Layer): + def __init__(self, config: LlamaConfig): + super(LlamaLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! 
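+        # the head weight counts as distributed only when the vocab dimension was actually
+        # sharded above; split_axis=1 records that the partition runs along the vocab axis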
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + if get_env_device() == "xpu": + try: + from paddle_xpu.layers.nn import ( # noqa: F401 + parallel_matmul as xpu_parallel_matmul, + ) + + self.xpu_parallel_matmul = xpu_parallel_matmul() + except ImportError: + self.xpu_parallel_matmul = None + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + if self.config.sep_parallel_degree > 1: + assert seq_length % self.config.sep_parallel_degree == 0 + seq_length = seq_length // self.config.sep_parallel_degree + if self.config.context_parallel_degree > 1: + assert seq_length % self.config.context_parallel_degree == 0 + seq_length = seq_length // self.config.context_parallel_degree + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output and self.config.tensor_parallel_degree > 1 + + if get_env_device() == "xpu" and self.xpu_parallel_matmul is not None: + logits = self.xpu_parallel_matmul( + hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output, training=self.training + ) + else: + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class LlamaForCausalLM(LlamaPretrainedModel): + enable_to_static_method = True + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.llama = LlamaModel(config) + self.lm_head = LlamaLMHead(config) + self.criterion = LlamaPretrainingCriterion(config) + + def get_input_embeddings(self): + return self.llama.embed_tokens + + def set_input_embeddings(self, value): + self.llama.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.llama = decoder + + def get_decoder(self): + return self.llama + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 
and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs and model_kwargs["attention_mask"] is not None: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + attn_mask_startend_row_indices=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attn_mask_startend_row_indices is not None and attention_mask is not None: + logger.warning( + "You have provided both attn_mask_startend_row_indices and attention_mask. " + "The attn_mask_startend_row_indices will be used." + ) + attention_mask = None + + outputs = self.llama( + input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto.py new file mode 100644 index 000000000..b78f58284 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto.py @@ -0,0 +1,1308 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Paddle Llama model""" +from __future__ import annotations + +import math +import warnings +from functools import partial +from typing import Optional, Tuple + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +from .configuration import ( + LLAMA_PRETRAINED_INIT_CONFIGURATION, + LLAMA_PRETRAINED_RESOURCE_FILES_MAP, + LlamaConfig, +) +from .modeling import ( + LlamaDynamicNTKScalingRotaryEmbedding, + LlamaLinearScalingRotaryEmbedding, + LlamaNTKScalingRotaryEmbedding, + LlamaRotaryEmbedding, + _expand_2d_mask, + _make_causal_mask, + apply_rotary_pos_emb, + build_alibi_tensor, + get_triangle_upper_mask, + repeat_kv, + rms_norm_fused, +) + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +__all__ = [ + "LlamaForCausalLM3DAuto", + "LlamaPretrainingCriterion3DAuto", +] + + +def is_pp_enable(): + mesh = fleet.auto.get_mesh() + return "pp" in mesh.dim_names + + +def get_mesh(pp_idx=0): + mesh = fleet.auto.get_mesh() + if "pp" in mesh.dim_names: + mesh = mesh.get_mesh_with_dim("pp", pp_idx) + return mesh + + +def global_mesh_starts_with_pp(): + mesh = fleet.auto.get_mesh() + if is_pp_enable(): + return mesh.get_mesh_with_dim("pp") + else: + return mesh + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + attn_output = 
attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + # then add alibi bias + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attn_weights = attn_weights + alibi + + if list(attn_weights.shape) != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + # NOTE: we only call get_triangle_upper_mask under PP setup + # FIXME ZHUI when we use pipeline parallel, the attention_mask can be None + # we just make it triangle_upper_mask + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if list(attention_mask.shape) != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + # [bsz, q_len, num_heads, head_dim] -> [bsz, q_len, num_heads * head_dim] + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +class LlamaRMSNormAuto(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + def forward(self, hidden_states): + if self.config.use_fused_rms_norm: + return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) + + with paddle.amp.auto_cast(False): + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + + return hidden_states * self.weight + + +class LlamaMLPAuto(nn.Layer): + def __init__(self, config, ipp: Optional[int] = None): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.fuse_attention_ffn = config.fuse_attention_ffn + self.ipp = ipp + self.config = config + + if config.fuse_attention_ffn: + self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) + self.gate_up_fused_proj.weight = dist.shard_tensor( + self.gate_up_fused_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_proj.weight = dist.shard_tensor( + 
self.gate_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj.weight = dist.shard_tensor( + self.up_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.down_proj.weight = dist.shard_tensor( + self.down_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(0)], + ) + + def forward(self, x): + if self.fuse_attention_ffn: + x = swiglu(self.gate_up_fused_proj(x)) + else: + x = swiglu(self.gate_proj(x), self.up_proj(x)) + out = self.down_proj(x) + return out + + +class LlamaAttentionAuto(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False, ipp: Optional[int] = None): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + + self.max_position_embeddings = config.max_position_embeddings + self.seq_length = config.seq_length + + self.fuse_attention_qkv = config.fuse_attention_qkv + + self.kv_indices = None + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + self.ipp = ipp + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if self.fuse_attention_qkv: + self.qkv_proj = nn.Linear( + self.hidden_size, + self.hidden_size + 2 * self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.qkv_proj.weight = dist.shard_tensor( + self.qkv_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + + else: + self.q_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + self.q_proj.weight = dist.shard_tensor( + self.q_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + + self.k_proj = nn.Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.k_proj.weight = dist.shard_tensor( + self.k_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + + self.v_proj = nn.Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj.weight = dist.shard_tensor( + self.v_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(1)], + ) + + self.o_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + self.o_proj.weight = dist.shard_tensor( + self.o_proj.weight, + get_mesh(self.ipp), + [dist.Replicate(), dist.Shard(0)], + ) + + if config.rope: + self._init_rope() + + self.config = config + + def _init_rope(self): + if self.config.rope_scaling_type is None: + self.rotary_emb = LlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + elif self.config.rope_scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, + ) + elif self.config.rope_scaling_type == "ntk": + self.rotary_emb = LlamaNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, + ) + elif self.config.rope_scaling_type == "dynamic_ntk": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + base=self.config.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {self.config.rope_scaling_type}") + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + alibi: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] or [seq_len / n, bs, num_head * head_dim] (if sequence_parallel) + # enter tp region + if self.config.sequence_parallel: + # [seq_len / n, bs, num_head * head_dim] -> [seq_len, bs, num_head * head_dim] (if sequence_parallel) + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(1), dist.Replicate()], + ) + + if self.fuse_attention_qkv: + target_shape = [0, 0, self.num_key_value_heads, (self.num_key_value_groups + 2) * self.head_dim] + mix_layer = self.qkv_proj(hidden_states) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split( 
+ mix_layer, + num_or_sections=[self.num_key_value_groups * self.head_dim, self.head_dim, self.head_dim], + axis=-1, + ) + if self.gqa_or_mqa: + query_states = paddle.reshape(query_states, [0, 0, self.num_heads, self.head_dim]) + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + + query_states = self.q_proj(hidden_states).reshape(shape=target_query_shape) + key_states = self.k_proj(hidden_states).reshape(shape=target_key_value_shape) + value_states = self.v_proj(hidden_states).reshape(shape=target_key_value_shape) + + if self.config.sequence_parallel: + # [seq_len, bs, num_head * head_dim] -> [bs, seq_len, num_head * head_dim] (if sequence_parallel) + # FA and rope not support sequence first + query_states = paddle.transpose(query_states, [1, 0, 2, 3]) + key_states = paddle.transpose(key_states, [1, 0, 2, 3]) + value_states = paddle.transpose(value_states, [1, 0, 2, 3]) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + + if self.config.rope: + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + batch_size, seq_length, num_heads, head_dim = query_states.shape + _, kv_seq_len, num_key_value_heads, _ = key_states.shape + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + paddle_version = float(paddle.__version__[:3]) + if ((paddle_version != 0.0) and (paddle_version <= 2.6)) and (num_heads != num_key_value_heads): + query_states, _, _ = fused_rotary_position_embedding( + query_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + key_states, _, _ = fused_rotary_position_embedding( + key_states, + None, + None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + # hack here, because elementwise infer spmd not support broadcast now + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + if self.kv_indices is not None: + key_states = paddle.index_select(key_states, self.kv_indices, axis=2) + value_states = paddle.index_select(value_states, self.kv_indices, axis=2) + + # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + # paddle version > 2.6 or develop support flash-attn with gqa/mqa + paddle_version = float(paddle.__version__[:3]) + if (paddle_version != 0.0) and (paddle_version <= 2.6): + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + 
scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + if self.config.sequence_parallel: + attn_output = paddle.transpose(attn_output, [1, 0, 2]) + + # [bs, q_len, num_head * head_dim] + attn_output = self.o_proj(attn_output) + + # enter sp region + if self.config.sequence_parallel: + # [bs, q_len, num_head * head_dim] -> [q_len / n, bs, num_head * head_dim] + attn_output = dist.reshard( + attn_output, + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class LlamaDecoderLayerAuto(nn.Layer): + def __init__(self, config, layerwise_recompute: bool = False, ipp: Optional[int] = None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttentionAuto(config, layerwise_recompute, ipp) + self.mlp = LlamaMLPAuto(config, ipp) + self.input_layernorm = LlamaRMSNormAuto(config) + self.post_attention_layernorm = LlamaRMSNormAuto(config) + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + self.ipp = ipp + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + alibi: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `cache` key value states are returned and can be used to speed up decoding + (see `cache`). 
+ cache (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs, seq_len, embed_dim] or [seq_len / n, bs, embed_dim] (if sequence_parallel) + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + # enter tp region + if self.config.sequence_parallel: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(1), dist.Replicate()], + ) + + hidden_states = self.mlp(hidden_states) + + # enter sp region + if self.config.sequence_parallel: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class LlamaPretrainedModelAuto(PretrainedModel): + config_class = LlamaConfig + base_model_prefix = "llama" + pretrained_init_configuration = LLAMA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = LLAMA_PRETRAINED_RESOURCE_FILES_MAP + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: LlamaConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "LlamaModelAuto" + if "LlamaModelAuto" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "llama." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + ''' + def _init_weights(self, layer): + """Initialization hook""" + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + LlamaLMHeadAuto, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, LlamaMLPAuto): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, LlamaAttentionAuto): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + ''' + + +@register_base_model +class LlamaModelAuto(LlamaPretrainedModelAuto): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayerAuto`] + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.embed_tokens.weight = dist.shard_tensor( + self.embed_tokens.weight, + get_mesh(), + [dist.Replicate(), dist.Shard(1)], + ) + + def get_layer_pp_info(layer_index): + mesh = fleet.auto.get_mesh() + if is_pp_enable() is False: + return None, False + else: + pp_degree = mesh.get_dim_size("pp") + layer_per_stage = math.ceil(config.num_hidden_layers / pp_degree) + input_need_reshard = layer_index % layer_per_stage == 0 + return layer_index // layer_per_stage, input_need_reshard + + decoder_layers = [] + self.next_pp_stage_indexes = [] + for i in range(config.num_hidden_layers): + pp_stage_id, input_need_reshard = get_layer_pp_info(i) + decoder_layers.append(LlamaDecoderLayerAuto(config, i not in self.no_recompute_layers, pp_stage_id)) + if input_need_reshard: + self.next_pp_stage_indexes.append(i) + + self.layers = nn.LayerList(decoder_layers) + self.norm = LlamaRMSNormAuto(config) + + self.gradient_checkpointing = False + + self.placements = ( + [dist.Shard(1), dist.Shard(0)] if self.config.sequence_parallel else [dist.Shard(0), dist.Replicate()] + ) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else 
self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.config.sequence_parallel: + # [B, S, H] -> [S, B, H] + inputs_embeds = paddle.transpose(inputs_embeds, [1, 0, 2]) + + global_mesh = global_mesh_starts_with_pp() + if position_ids is None and self.config.sep_parallel_degree > 1: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + if position_ids is not None: + position_ids = dist.shard_tensor( + position_ids, + global_mesh, + [dist.Replicate() for _ in range(len(global_mesh._shape))], + ) + + # embed positions + if not self.config.use_flash_attention and attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if self.config.alibi: + alibi = build_alibi_tensor(attention_mask, self.config.num_attention_heads, dtype=inputs_embeds.dtype) + alibi = alibi.reshape([batch_size * self.config.num_attention_heads, 1, seq_length_with_past]) + else: + alibi = None + + if self.config.use_flash_attention: + # attention_mask in flash_attn is always None for pretrain + attention_mask = None + else: + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + attention_mask = dist.shard_tensor( + attention_mask, + global_mesh, + [dist.Replicate() for _ in range(len(global_mesh._shape))], + ) + + hidden_states = inputs_embeds + hidden_states = dist.reshard(hidden_states, get_mesh(), self.placements) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + ipp = decoder_layer.ipp + if not is_pp_enable(): + position_ids_input = position_ids + attention_mask_input = attention_mask + else: + if position_ids is not None: + position_ids_input = dist.reshard( + position_ids, + get_mesh(ipp), + [dist.Replicate(), dist.Replicate()], + ) + else: + position_ids_input = position_ids + attention_mask_input = ( + dist.reshard( + attention_mask, + get_mesh(ipp), + [dist.Replicate(), dist.Replicate()], + ) + if attention_mask is not None + else None + ) + + if idx in self.next_pp_stage_indexes: + hidden_states = dist.reshard( + hidden_states, + get_mesh(ipp), + self.placements, + ) + + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and 
has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = recompute( + decoder_layer, + hidden_states, + position_ids_input, + attention_mask_input, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids_input, + attention_mask_input, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + ) + + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + +class LlamaPretrainingCriterion3DAuto(paddle.nn.Layer): + """ + Criterion for Llama. + It calculates the final loss. + """ + + def __init__(self, config): + + super(LlamaPretrainingCriterion3DAuto, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + # Force entropy same kernel + with paddle.amp.auto_cast(False): + if isinstance(prediction_scores, paddle.Tensor): + masked_lm_loss = self.loss_func( + prediction_scores.astype("float32")._use_gpudnn(False), + masked_lm_labels.unsqueeze(2), + ) + else: + + masked_lm_loss = self.loss_func( + prediction_scores.astype("float32"), + masked_lm_labels.unsqueeze(2), + ) + + masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32") + loss = paddle.mean(masked_lm_loss) + return loss + + +class LlamaLMHeadAuto(nn.Layer): + def __init__(self, config: LlamaConfig): + super(LlamaLMHeadAuto, self).__init__() + self.config = config + vocab_size = config.vocab_size + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + self.weight = dist.shard_tensor( + self.weight, + get_mesh(-1), + [dist.Replicate(), dist.Shard(1)], + ) + + def forward(self, hidden_states, tensor_parallel_output=None): + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + logits = paddle.matmul(hidden_states, self.weight, transpose_y=False) + return logits + + +class LlamaForCausalLM3DAuto(LlamaPretrainedModelAuto): + enable_to_static_method = True + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.llama = LlamaModelAuto(config) + 
self.lm_head = LlamaLMHeadAuto(config) + + def get_input_embeddings(self): + return self.llama.embed_tokens + + def set_input_embeddings(self, value): + self.llama.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.llama = decoder + + def get_decoder(self): + return self.llama + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + labels=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + input_ids.stop_gradient = True + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.llama( + input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = 
outputs[0] # [bs, seq_len, dim] + # enter tp region + if self.config.sequence_parallel: + hidden_states = dist.reshard( + hidden_states, + get_mesh(-1), + [dist.Shard(1), dist.Replicate()], + ) + hidden_states = paddle.transpose(hidden_states, [1, 0, 2]) + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + return logits + + # loss = None + # if labels is not None: + # labels.stop_gradient = True + # loss = self.criterion(logits, labels) + + # if not return_dict: + # output = (logits,) + outputs[1:] + # return (loss,) + output if loss is not None else output + + # return CausalLMOutputWithCrossAttentions( + # loss=loss, + # logits=logits, + # past_key_values=outputs.past_key_values, + # hidden_states=outputs.hidden_states, + # attentions=outputs.attentions, + # ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto_static.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto_static.py new file mode 100644 index 000000000..d9af478b8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_auto_static.py @@ -0,0 +1,1251 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
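+# NOTE: this file provides the Llama*Auto classes used with static-graph auto-parallel; parallel placement is declared through fleet.auto.shard_tensor / fleet.auto.shard_op annotations, with dist.reshard handling the sequence-parallel transitions.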
+"""Paddle Llama model""" +from __future__ import annotations + +import math +import warnings +from functools import partial +from typing import Optional, Tuple + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed import fleet + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model + +from .configuration import ( + LLAMA_PRETRAINED_INIT_CONFIGURATION, + LLAMA_PRETRAINED_RESOURCE_FILES_MAP, + LlamaConfig, +) +from .modeling import ( + LlamaDynamicNTKScalingRotaryEmbedding, + LlamaLinearScalingRotaryEmbedding, + LlamaNTKScalingRotaryEmbedding, + LlamaRotaryEmbedding, + _expand_2d_mask, + _make_causal_mask, + apply_rotary_pos_emb, + build_alibi_tensor, + get_triangle_upper_mask, + repeat_kv, + rms_norm_fused, +) + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +__all__ = [ + "LlamaModelAuto", + "LlamaPretrainedModelAuto", + "LlamaForCausalLMAuto", + "LlamaPretrainingCriterionAuto", +] + + +def get_mesh(pp_idx=None): + mesh = fleet.auto.get_mesh() + if pp_idx is None: + return mesh + if "pp" in mesh.dim_names: + mesh = mesh.get_mesh_with_dim("pp")[pp_idx] + return mesh + + +def get_dist_attr(shard_specs, pp_idx=None): + mesh = get_mesh(pp_idx) + new_spec = [] + for spec in shard_specs: + if not spec: + new_spec.append(spec) + else: + if spec in mesh.dim_names: + new_spec.append(spec) + else: + new_spec.append(None) + + return mesh, new_spec + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi=None, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + if alibi is not None: + raise ValueError("Flash Attention doesn't support alibi") + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attention_mask = attention_mask.cast(alibi.dtype) + alibi + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = 
paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + # then add alibi bias + if alibi is not None: + alibi = alibi.reshape([bsz, num_heads, 1, -1]) + attn_weights = attn_weights + alibi + + if list(attn_weights.shape) != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + # NOTE: we only call get_triangle_upper_mask under PP setup + # FIXME ZHUI when we use pipeline parallel, the attention_mask can be None + # we just make it triangle_upper_mask + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if list(attention_mask.shape) != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + else: + # with paddle.amp.auto_cast(False): + # attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +class LlamaRMSNormAuto(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + def forward(self, hidden_states): + if self.config.use_fused_rms_norm: + return rms_norm_fused(hidden_states, self.weight, self.variance_epsilon) + + if paddle.in_dynamic_mode(): + # with paddle.amp.auto_cast(False): + # variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + # hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +class LlamaMLPAuto(nn.Layer): + def __init__(self, config, ipp: Optional[int] = None): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.fuse_attention_ffn = config.fuse_attention_ffn + self.ipp = ipp + + if config.fuse_attention_ffn: + self.gate_up_fused_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=False) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = 
nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + + def forward(self, x): + if self.fuse_attention_ffn: + fleet.auto.shard_tensor(self.gate_up_fused_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + else: + fleet.auto.shard_tensor(self.gate_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + fleet.auto.shard_tensor(self.up_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + + fleet.auto.shard_tensor(self.down_proj.weight, *get_dist_attr(["mp", None], self.ipp)) + + if self.fuse_attention_ffn: + x = swiglu(self.gate_up_fused_proj(x)) + else: + x = swiglu(self.gate_proj(x), self.up_proj(x)) + out = self.down_proj(x) + return out + + +class LlamaAttentionAuto(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: LlamaConfig, layerwise_recompute: bool = False, ipp: Optional[int] = None): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + + self.max_position_embeddings = config.max_position_embeddings + self.seq_length = config.seq_length + + self.fuse_attention_qkv = config.fuse_attention_qkv + if self.fuse_attention_qkv and config.num_attention_heads != config.num_key_value_heads: + raise ValueError( + f"fuse_attention_qkv can't be True when num_attention_heads {config.num_attention_heads}!= num_key_value_heads {config.num_key_value_heads}" + ) + + self.kv_indices = None + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + self.ipp = ipp + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if self.fuse_attention_qkv: + self.qkv_proj = nn.Linear( + self.hidden_size, + 3 * self.hidden_size, + bias_attr=False, + ) + else: + self.q_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + self.k_proj = nn.Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = nn.Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + self.o_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + + if config.rope: + self._init_rope() + + self.config = config + + def _init_rope(self): + if self.config.rope_scaling_type is None: + self.rotary_emb = LlamaRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + ) + elif self.config.rope_scaling_type == "linear": + self.rotary_emb = LlamaLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + ) + elif self.config.rope_scaling_type == "ntk": + self.rotary_emb = LlamaNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + ) + elif self.config.rope_scaling_type == "dynamic_ntk": + self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=self.config.rope_scaling_factor, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {self.config.rope_scaling_type}") + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + alibi: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + # enter tp region + if self.config.sequence_parallel: + mesh = get_mesh(self.ipp) + if "dp" in mesh.dim_names: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(1), dist.Replicate()], + ) + else: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Replicate()], + ) + + if self.fuse_attention_qkv: + target_shape = [0, 0, self.num_heads, 3 * self.head_dim] + fleet.auto.shard_tensor(self.qkv_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + + mix_layer = self.qkv_proj(hidden_states) + mix_layer = paddle.reshape_(mix_layer, target_shape) + query_states, key_states, value_states = paddle.split(mix_layer, num_or_sections=3, axis=-1) + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + + fleet.auto.shard_tensor(self.q_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + fleet.auto.shard_tensor(self.k_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + fleet.auto.shard_tensor(self.v_proj.weight, *get_dist_attr([None, "mp"], self.ipp)) + + query_states = self.q_proj(hidden_states).reshape(shape=target_query_shape) + key_states = self.k_proj(hidden_states).reshape(shape=target_key_value_shape) + value_states = self.v_proj(hidden_states).reshape(shape=target_key_value_shape) + + if 
self.config.sequence_parallel: + query_states = paddle.transpose(query_states, [1, 0, 2, 3]) + key_states = paddle.transpose(key_states, [1, 0, 2, 3]) + value_states = paddle.transpose(value_states, [1, 0, 2, 3]) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + + if self.config.rope: + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + if self.kv_indices is not None: + key_states = paddle.index_select(key_states, self.kv_indices, axis=2) + value_states = paddle.index_select(value_states, self.kv_indices, axis=2) + + # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = fleet.auto.recompute(scaled_dot_product_attention)( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + alibi, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. 
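+ # The output projection consumes the head-parallel activations, so its weight is sharded along the input (row) dimension of the "mp" mesh axis before the matmul.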
+ fleet.auto.shard_tensor(self.o_proj.weight, *get_dist_attr(["mp", None], self.ipp)) + attn_output = self.o_proj(attn_output) + + # enter sp region + if self.config.sequence_parallel: + attn_output = paddle.transpose(attn_output, [1, 0, 2]) + mesh = get_mesh(self.ipp) + if "dp" in mesh.dim_names: + attn_output = dist.reshard( + attn_output, + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + else: + attn_output = dist.reshard( + attn_output, + get_mesh(self.ipp), + [dist.Shard(0)], + ) + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class LlamaDecoderLayerAuto(nn.Layer): + def __init__(self, config, layerwise_recompute: bool = False, ipp: Optional[int] = None): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttentionAuto(config, layerwise_recompute, ipp) + self.mlp = LlamaMLPAuto(config, ipp) + self.input_layernorm = LlamaRMSNormAuto(config) + self.post_attention_layernorm = LlamaRMSNormAuto(config) + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + self.ipp = ipp + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + alibi: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `cache` key value states are returned and can be used to speed up decoding + (see `cache`). 
+ cache (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = fleet.auto.recompute(self.self_attn)( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + alibi, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + # enter tp region + if self.config.sequence_parallel: + mesh = get_mesh(self.ipp) + if "dp" in mesh.dim_names: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(1), dist.Replicate()], + ) + else: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Replicate()], + ) + + hidden_states = self.mlp(hidden_states) + # enter sp region + if self.config.sequence_parallel: + mesh = get_mesh(self.ipp) + if "dp" in mesh.dim_names: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(1), dist.Shard(0)], + ) + else: + hidden_states = dist.reshard( + hidden_states, + get_mesh(self.ipp), + [dist.Shard(0)], + ) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class LlamaPretrainedModelAuto(PretrainedModel): + config_class = LlamaConfig + base_model_prefix = "llama" + pretrained_init_configuration = LLAMA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = LLAMA_PRETRAINED_RESOURCE_FILES_MAP + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: LlamaConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + 
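# Entries tagged with "transpose" use a layout in the source checkpoint that differs from Paddle's nn.Linear ([in_features, out_features]), so they are transposed while the state dict is converted. +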
model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "LlamaModelAuto" + if "LlamaModelAuto" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "llama." + mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + # Column Linear + if config.fuse_attention_qkv: + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) + else: + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + +@register_base_model +class LlamaModelAuto(LlamaPretrainedModelAuto): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`LlamaDecoderLayerAuto`] + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + mesh = get_mesh() + if "pp" not in mesh.dim_names: + pp_degree = 1 + else: + pp_degree = mesh.get_dim_size("pp") + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + assert config.num_hidden_layers % (pp_degree * virtual_pp_degree) == 0 + + num_layer_per_stage = math.ceil(config.num_hidden_layers / pp_degree) + self.layer_to_ipp = [i // num_layer_per_stage for i in range(config.num_hidden_layers)] + self.layers = nn.LayerList( + [ + LlamaDecoderLayerAuto(config, i not in self.no_recompute_layers, self.layer_to_ipp[i]) + for i in range(config.num_hidden_layers) + ] + ) + self.norm = LlamaRMSNormAuto(config) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length + ) + # NOTE(zhaoyingli): infer spmd does not support [seq_len, seq_len] --> [batch, 1, seq_len, seq_len] in data_parallel + fleet.auto.shard_tensor(combined_attention_mask, get_mesh(), [None, None, None, None]) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + **kwargs, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # NOTE(zhaoyingli): temprorary method to guarantee the later ops are placed all ranks until 
meeting new annotaion. + full = fleet.auto.shard_op(paddle.full, get_mesh()) + full(shape=[1], fill_value=0) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + + if inputs_embeds is None: + fleet.auto.shard_tensor(self.embed_tokens.weight, *get_dist_attr(["mp", None], 0)) + embed_tokens = fleet.auto.shard_op(self.embed_tokens, get_mesh(0)) + inputs_embeds = embed_tokens(input_ids) + + # NOTE(zhaoyingli): temprorary method to guarantee the later ops are placed all ranks until meeting new annotaion. + full = fleet.auto.shard_op(paddle.full, get_mesh()) + full(shape=[1], fill_value=0) + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if self.config.alibi: + alibi = build_alibi_tensor(attention_mask, self.config.num_attention_heads, dtype=inputs_embeds.dtype) + alibi = alibi.reshape([batch_size * self.config.num_attention_heads, 1, seq_length_with_past]) + else: + alibi = None + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + # NOTE(zhaoyingli): + # 1. 
infer spmd does not support [seq_len] --> [batch, seq_len] in data_parallel + fleet.auto.shard_tensor(position_ids, get_mesh(), [None, None]) + + if self.config.use_flash_attention: + # attention_mask in flash_attn is always None for pretrain + attention_mask = None + else: + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + + hidden_states = inputs_embeds + if self.config.sequence_parallel: + # [B, S, H] -> [S, B, H] + emb_transpose = fleet.auto.shard_op(paddle.transpose, get_mesh(0)) + hidden_states = emb_transpose(hidden_states, [1, 0, 2]) + # enter sp region + mesh = get_mesh(0) + if "dp" in mesh.dim_names: + hidden_states = dist.reshard( + hidden_states, + get_mesh(0), + [dist.Shard(1), dist.Shard(0)], + ) + else: + hidden_states = dist.reshard( + hidden_states, + get_mesh(0), + [dist.Shard(0)], + ) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + ipp = decoder_layer.ipp + if self.config.sequence_parallel: + fleet.auto.shard_tensor(hidden_states, *get_dist_attr(["mp", "dp", None], ipp)) + else: + fleet.auto.shard_tensor(hidden_states, *get_dist_attr(["dp", None, None], ipp)) + decoder_layer = fleet.auto.shard_op(decoder_layer, get_mesh(ipp)) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = fleet.auto.recompute(decoder_layer)( + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + alibi=alibi, + ) + + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + def _post_output( + self, + hidden_states, + output_hidden_states, + next_decoder_cache, + all_self_attns, + all_hidden_states, + use_cache, + return_dict, + ): + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return 
BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + +class LlamaPretrainingCriterionAuto(paddle.nn.Layer): + """ + Criterion for Llama. + It calculates the final loss. + """ + + def __init__(self, config): + + super(LlamaPretrainingCriterionAuto, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + # with paddle.amp.auto_cast(False): + # masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # # skip ignore_index which loss == 0 + # masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32") + # loss = paddle.mean(masked_lm_loss) + + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + # masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32") + # TODO: solve the issue of conditional block + masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32") + loss = paddle.mean(masked_lm_loss) + + return loss + + +class LlamaLMHeadAuto(nn.Layer): + def __init__(self, config: LlamaConfig): + super(LlamaLMHeadAuto, self).__init__() + self.config = config + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + + def forward(self, hidden_states, tensor_parallel_output=None): + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + fleet.auto.shard_tensor(self.weight, *get_dist_attr([None, "mp"], -1)) + logits = paddle.matmul(hidden_states, self.weight, transpose_y=False) + return logits + + +class LlamaForCausalLMAuto(LlamaPretrainedModelAuto): + enable_to_static_method = True + + def __init__(self, config): + super().__init__(config) + self.config = config + + with paddle.LazyGuard(): + self.llama = LlamaModelAuto(config) + self.lm_head = LlamaLMHeadAuto(config) + self.criterion = LlamaPretrainingCriterionAuto(config) + + def get_input_embeddings(self): + return self.llama.embed_tokens + + def set_input_embeddings(self, value): + self.llama.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.llama = decoder + + def get_decoder(self): + return self.llama + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = 
input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + labels=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + input_ids.stop_gradient = True + fleet.auto.shard_tensor(input_ids, *get_dist_attr(["dp", None], 0)) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.llama( + input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + # enter tp region + if self.config.sequence_parallel: + mesh = get_mesh(-1) + if "dp" in mesh.dim_names: + hidden_states = dist.reshard( + hidden_states, + get_mesh(-1), + [dist.Shard(1), dist.Replicate()], + ) + else: + hidden_states = dist.reshard( + hidden_states, + get_mesh(-1), + [dist.Replicate()], + ) + hidden_states = paddle.transpose(hidden_states, [1, 0, 2]) + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, 
tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + labels.stop_gradient = True + fleet.auto.shard_tensor(labels, *get_dist_attr(["dp", None], -1)) + loss = self.criterion(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_pp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_pp.py new file mode 100644 index 000000000..eaf0c1bed --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/modeling_pp.py @@ -0,0 +1,373 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict + +import paddle +import paddle.distributed.fleet as fleet +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer +from paddle.distributed.fleet.utils import recompute + +from paddlenlp.transformers.model_utils import PipelinePretrainedModel +from paddlenlp.utils.tools import get_env_device + +from .modeling import ( + LlamaConfig, + LlamaDecoderLayer, + LlamaLMHead, + LlamaModel, + LlamaPretrainedModel, + LlamaPretrainingCriterion, + LlamaRMSNorm, + build_alibi_tensor, +) + + +def __repr__(self): + return self.layer_func.__name__ + + +# hack LayerDesc for showing to much config +LayerDesc.__repr__ = __repr__ + +__all__ = [ + "LlamaForCausalLMPipe", +] + + +def parse_args(args): + if isinstance(args, tuple): + if len(args) == 5: + hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids, alibi = args + elif len(args) == 4: + hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids = args + alibi = None + elif len(args) == 3: + hidden_states, attention_mask, attn_mask_startend_row_indices = args + position_ids = None + alibi = None + elif len(args) == 2: + hidden_states, attention_mask = args + attn_mask_startend_row_indices = None + position_ids = None + alibi = None + else: + hidden_states = args + attention_mask, attn_mask_startend_row_indices, position_ids, alibi = None, None, None, None + + if position_ids is not None: + position_ids.stop_gradient = True + + if attention_mask is not None: + attention_mask.stop_gradient = True + + if attn_mask_startend_row_indices is not None: + attn_mask_startend_row_indices.stop_gradient = True + + if alibi is not None: + alibi.stop_gradient = True + + return hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids, alibi + + +def return_args( + hidden_states, attention_mask=None, attn_mask_startend_row_indices=None, position_ids=None, alibi=None +): + ret = (hidden_states,) + + if attention_mask is not 
None: + ret += (attention_mask.clone(),) + if attn_mask_startend_row_indices is not None: + ret += (attn_mask_startend_row_indices.clone(),) + if position_ids is not None: + ret += (position_ids.clone(),) + if alibi is not None: + ret += (alibi.clone(),) + + if len(ret) == 1: + ret = ret[0] + + return ret + + +class LlamaEmbeddingPipe(nn.Layer): + """Extends LlamaEmbeddings to forward attention_mask through the pipeline.""" + + def __init__(self, config): + super(LlamaEmbeddingPipe, self).__init__() + self.config = config + self.sequence_parallel = config.sequence_parallel + self.hidden_size = config.hidden_size + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + + def forward(self, args): + """_summary_ + + Args: + input (_type_): _description_ + + Returns: + _type_: _description_ + """ + input_ids, attention_mask, attn_mask_startend_row_indices, position_ids, alibi = parse_args(args) + input_embeds = self.embed_tokens(input_ids) + if self.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) + + batch_size, seq_length = input_ids.shape + alibi = None + if self.config.alibi: + assert ( + attn_mask_startend_row_indices is None + ), "alibi and attn_mask_startend_row_indices can not be set at same time" + # embed positions + mask = ( + attention_mask + if attention_mask is not None + else paddle.ones((batch_size, seq_length), dtype=paddle.bool) + ) + alibi = build_alibi_tensor(mask, self.config.num_attention_heads, dtype=input_embeds.dtype) + + if self.config.tensor_parallel_degree > 1: + block_size = self.config.num_attention_heads // self.config.tensor_parallel_degree + alibi = alibi[ + :, + self.config.tensor_parallel_rank + * block_size : (self.config.tensor_parallel_rank + 1) + * block_size, + ] + alibi = alibi.reshape([batch_size * block_size, 1, seq_length]) + else: + alibi = alibi.reshape([batch_size * self.config.num_attention_heads, 1, seq_length]) + alibi.stop_gradient = True + + if attention_mask is not None: + assert ( + attn_mask_startend_row_indices is None + ), "attention_mask and attn_mask_startend_row_indices can not be set at same time" + attention_mask = LlamaModel._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + if get_env_device() == "npu": + attention_mask = attention_mask.astype("bool") + elif get_env_device() == "npu": + attention_mask = paddle.tril(paddle.ones((seq_length, seq_length), dtype="bool")) + attention_mask.stop_gradient = True + + if self.config.alibi and attention_mask is None: + attention_mask = LlamaModel._prepare_decoder_attention_mask( + None, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + + return return_args(input_embeds, attention_mask, attn_mask_startend_row_indices, position_ids, alibi) + + +class LlamaDecoderLayerPipe(LlamaDecoderLayer): + def forward(self, args): + 
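# Pipeline stages exchange a flat tuple, so the packed args are unpacked here and then disambiguated below; the positional layout is ambiguous when alibi / position_ids are optional. +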
hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids, alibi = parse_args(args) + # we can't distinguish + if self.config.alibi and alibi is None and position_ids is None and attn_mask_startend_row_indices is not None: + # hidden_states, attention_mask, alibi + alibi = attn_mask_startend_row_indices + position_ids = None + attn_mask_startend_row_indices = None + elif ( + self.config.alibi + and alibi is None + and position_ids is not None + and attn_mask_startend_row_indices is not None + ): + # hidden_states, attention_mask, position_ids, alibi + alibi = position_ids + position_ids = attn_mask_startend_row_indices + attn_mask_startend_row_indices = None + elif not self.config.alibi: + if get_env_device() in ["gpu"]: + if attention_mask is not None and attention_mask.dtype == paddle.int32: + attention_mask, attn_mask_startend_row_indices, position_ids = ( + None, + attention_mask, + attn_mask_startend_row_indices, + ) + elif attention_mask is not None and attention_mask.dtype == paddle.int64: + attention_mask, attn_mask_startend_row_indices, position_ids = None, None, attention_mask + elif ( + attn_mask_startend_row_indices is not None and attn_mask_startend_row_indices.dtype == paddle.int64 + ): + attn_mask_startend_row_indices, position_ids = None, attn_mask_startend_row_indices + elif position_ids is None and attn_mask_startend_row_indices is not None: + position_ids = attn_mask_startend_row_indices + attn_mask_startend_row_indices = None + + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient: + if attention_mask is not None or alibi is not None or attn_mask_startend_row_indices is not None: + hidden_states = recompute( + super().forward, + hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + alibi=alibi, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + use_reentrant=False, + ) + else: + # for pretrain + hidden_states = recompute( + super().forward, + hidden_states, + position_ids=position_ids, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + hidden_states = super().forward( + hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + alibi=alibi, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + ) + + return return_args(hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids, alibi) + + +class LlamaRMSNormPipe(nn.Layer): + def __init__(self, config): + super().__init__() + self.norm = LlamaRMSNorm(config) + + def forward(self, args): + hidden_states, attention_mask, attn_mask_startend_row_indices, position_ids, alibi = parse_args(args) + return self.norm(hidden_states) + + +class LlamaForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): + """LlamaForPretraining adapted for pipeline parallelism. + + The largest change is flattening the LlamaModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. + """ + + config_class = LlamaConfig + + _get_tensor_parallel_mappings = LlamaPretrainedModel._get_tensor_parallel_mappings + _get_fuse_or_split_param_mappings = LlamaPretrainedModel._get_fuse_or_split_param_mappings + _init_weights = LlamaPretrainedModel._init_weights + _keys_to_ignore_on_load_unexpected = LlamaPretrainedModel._keys_to_ignore_on_load_unexpected + + # DONOT Add base_model_prefix !!!! 
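+ # _prepare_pipeline_inputs_func below splits each micro-batch into the tensors consumed by the first pipeline stage (input_ids / masks / position_ids) and the labels consumed by the last stage.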
+ + @classmethod + def _prepare_pipeline_inputs_func(cls, inputs): + + first_stage_keys = ["input_ids", "attention_mask", "attn_mask_startend_row_indices", "position_ids"] + last_stage_keys = ["labels"] + + def get_expected_keys(inputs, keys): + ret = tuple([inputs.pop(k) if k in inputs else None for k in keys]) + if len(ret) == 1: + ret = ret[0] + return ret + + if type(inputs) is dict or type(inputs) is OrderedDict: + return [ + get_expected_keys(inputs, first_stage_keys), + get_expected_keys(inputs, last_stage_keys), + ] + + keys = list(inputs[0].keys()) + inputs_batch = {key: [data.pop(key) for data in inputs] for key in keys} + return [ + get_expected_keys(inputs_batch, first_stage_keys), + get_expected_keys(inputs_batch, last_stage_keys), + ] + + def __init__(self, config): + self.config = config + + self.recompute_granularity = self.config.recompute_granularity + self.pp_recompute_interval = self.config.pp_recompute_interval + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + if self.recompute_granularity == "full": + assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support" + + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + + def get_hcg(): + return fleet.get_hybrid_communicate_group() + + hcg = get_hcg() + tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) + tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) + + # TODO: fix tensor_parallel_degree rewrite in here + config.tensor_parallel_degree = tensor_parallel_degree + config.tensor_parallel_rank = tensor_parallel_rank + + self.add_sequential_layer(LayerDesc(LlamaEmbeddingPipe, config=config), "llama") + for i in range(config.num_hidden_layers): + self.add_sequential_layer( + LayerDesc(LlamaDecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), + f"llama.layers.{i}", + ) + self.add_sequential_layer(LayerDesc(LlamaRMSNormPipe, config=config), "llama") + self.add_head(config) + + recompute_interval = 0 + + seg_method = "layer:LlamaDecoderLayer" + if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: + seg_method = "uniform" + + PipelineLayer.__init__( + self, + layers=self.get_sequential_layers(), + loss_fn=self.get_loss_fn(config), + topology=get_hcg().topology(), + seg_method=seg_method, + recompute_interval=recompute_interval, + recompute_ctx={ + "mp_group": get_hcg().get_model_parallel_group(), + "offload": False, + "partition": False, + }, + num_virtual_pipeline_stages=virtual_pp_degree, + ) + # You should call init here, since there is a diamond inheritance problem + self.apply(self._init_weights) + # DON'T init PipelinePretrainedModel + # PipelinePretrainedModel.__init__(self.super(), config=config) + + def add_head(self, config): + self.add_sequential_layer(LayerDesc(LlamaLMHead, config=config), "lm_head") + + def get_loss_fn(self, config): + return LlamaPretrainingCriterion(config) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer.py new file mode 100644 index 000000000..46c16c58b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer.py @@ -0,0 +1,562 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import sentencepiece as spm + +from ...utils.log import logger +from .. import PretrainedTokenizer +from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy + +__all__ = ["LlamaTokenizer", "Llama3Tokenizer"] + + +class LlamaTokenizer(PretrainedTokenizer): + model_input_names = ["input_ids", "attention_mask", "position_ids"] + resource_files_names = { + "vocab_file": "sentencepiece.bpe.model", + } + pretrained_resource_files_map = { + "vocab_file": { + "__internal_testing__/micro-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model", + "__internal_testing__/tiny-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model", + "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model", + "facebook/llama-13b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model", + "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model", + "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model", + }, + } + + pretrained_init_configuration = { + "__internal_testing__/micro-random-llama": {}, + "__internal_testing__/tiny-random-llama": {}, + "facebook/llama-7b": {}, + "facebook/llama-13b": {}, + "facebook/llama-30b": {}, + "facebook/llama-65b": {}, + } + padding_side = "left" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + add_bos_token=True, + add_eos_token=False, + sp_model_kwargs=None, + decode_with_prefix_space=False, + **kwargs + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) + + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + 
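# Hedged usage sketch (not part of the patched file) for the sentencepiece-backed
# LlamaTokenizer defined in this file.  It assumes paddlenlp is installed and that
# the "facebook/llama-7b" resources mapped above are reachable; otherwise pass a
# local sentencepiece model via `vocab_file`.
from paddlenlp.transformers import LlamaTokenizer

tok = LlamaTokenizer.from_pretrained("facebook/llama-7b")
print(tok.tokenize("Deep learning is fun"))       # sentencepiece surface pieces
print(tok("Deep learning is fun")["input_ids"])   # bos id prepended because add_bos_token=True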
"""Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. 
+ token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of zeros. + """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs + + +"""Copied Tokenization classes for QWen.""" + +import base64 +import unicodedata +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + +from ...utils.import_utils import is_tiktoken_available +from .. 
import PretrainedTokenizer +from ..tokenizer_utils_base import ( + AddedToken, + BatchEncoding, + EncodedInput, + PaddingStrategy, +) + +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} + +PAT_STR = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" +BEGINOFTEXT = "<|begin_of_text|>" +ENDOFTEXT = "<|end_of_text|>" +IMSTART = "<|start_header_id|>" +IMEND = "<|end_header_id|>" +EOTID = "<|eot_id|>" +# as the default behavior is changed to allow special tokens in +# regular texts, the surface forms of special tokens need to be +# as different as possible to minimize the impact +EXTRAS = tuple((f"<|reserved_special_token_{i}|>" for i in range(251))) +SPECIAL_TOKENS = (BEGINOFTEXT, ENDOFTEXT) + EXTRAS[0:4] + (IMSTART, IMEND, EXTRAS[4], EOTID) + EXTRAS[5:] + +tiktoken = None + + +def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]: + with open(tiktoken_bpe_file, "rb") as f: + contents = f.read() + return { + base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line) + } + + +class Llama3Tokenizer(PretrainedTokenizer): + """QWen tokenizer.""" + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + resource_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + errors="replace", + padding_side="left", + **kwargs, + ): + super().__init__(**kwargs) + if not is_tiktoken_available(): + raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken") + + import tiktoken as tk + + tiktoken = tk + + self.errors = errors # how to handle errors in decoding + + self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int] + self.special_tokens = { + token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks)) + } + enc = tiktoken.Encoding( + "Llama3", + pat_str=PAT_STR, + mergeable_ranks=self.mergeable_ranks, + special_tokens=self.special_tokens, + ) + assert ( + len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab + ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding" + + self.decoder = {v: k for k, v in self.mergeable_ranks.items()} # type: dict[int, bytes|str] + self.decoder.update({v: k for k, v in self.special_tokens.items()}) + + self.tokenizer = enc # type: tiktoken.Encoding + + self.bod_id = self.special_tokens[BEGINOFTEXT] + self.eod_id = self.special_tokens[ENDOFTEXT] + self.start_header_id = self.special_tokens[IMSTART] + self.end_header_id = self.special_tokens[IMEND] + self.eot_id = self.special_tokens[EOTID] + + if "pad_token_id" in kwargs: + self.pad_token_id = kwargs["pad_token_id"] + if "eos_token_id" in kwargs: + self.eos_token_id = kwargs["eos_token_id"] + + def __len__(self) -> int: + return self.tokenizer.n_vocab + + def get_vocab(self) -> Dict[bytes, int]: + return self.mergeable_ranks + + def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]: + ids = [] + if isinstance(tokens, (str, bytes)): + if tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.mergeable_ranks.get(tokens) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.mergeable_ranks.get(token)) + return ids + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + if not special_tokens and new_tokens: + 
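# Standalone sketch (not part of the patched file) of the vocab format consumed
# by _load_tiktoken_bpe above: each line of "tokenizer.model" is the base64 of
# the token bytes, a space, and its merge rank.  The dict comprehension mirrors
# the loader's logic on two fabricated entries.
import base64

raw = b"\n".join(
    base64.b64encode(tok) + b" " + str(rank).encode()
    for rank, tok in enumerate([b"hello", b" world"])
)
ranks = {
    base64.b64decode(token): int(rank)
    for token, rank in (line.split() for line in raw.splitlines() if line)
}
assert ranks == {b"hello": 0, b" world": 1}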
raise ValueError("Adding regular tokens is not supported") + for token in new_tokens: + surface_form = token.content if isinstance(token, AddedToken) else token + if surface_form not in SPECIAL_TOKENS: + raise ValueError("Adding unknown special tokens is not supported") + return 0 + + def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary). + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + file_path = os.path.join(save_directory, "tokenizer.model") + with open(file_path, "w", encoding="utf8") as w: + for k, v in self.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n" + w.write(line) + return (file_path,) + + def tokenize( + self, + text: str, + allowed_special: Union[Set, str] = "all", + disallowed_special: Union[Collection, str] = (), + **kwargs, + ) -> List[Union[bytes, str]]: + """ + Converts a string in a sequence of tokens. + + Args: + text (`str`): + The sequence to be encoded. + allowed_special (`Literal["all"]` or `set`): + The surface forms of the tokens to be encoded as special tokens in regular texts. + Default to "all". + disallowed_special (`Literal["all"]` or `Collection`): + The surface forms of the tokens that should not be in regular texts and trigger errors. + Default to an empty tuple. + + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific encode method. + + Returns: + `List[bytes|str]`: The list of tokens. + """ + tokens = [] + text = unicodedata.normalize("NFC", text) + + # this implementation takes a detour: text -> token id -> token surface forms + for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special): + tokens.append(self.decoder[t]) + return tokens + + def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str: + """ + Converts a sequence of tokens in a single string. + """ + text = "" + temp = b"" + for t in tokens: + if isinstance(t, str): + if temp: + text += temp.decode("utf-8", errors=self.errors) + temp = b"" + text += t + elif isinstance(t, bytes): + temp += t + else: + raise TypeError("token should only be of type types or str") + if temp: + text += temp.decode("utf-8", errors=self.errors) + return text + + @property + def vocab_size(self): + return self.tokenizer.n_vocab + + def _convert_id_to_token(self, index: int) -> Union[bytes, str]: + """Converts an id to a token, special tokens included""" + if index in self.decoder: + return self.decoder[index] + raise ValueError("unknown ids") + + def _convert_token_to_id(self, token: Union[bytes, str]) -> int: + """Converts a token to an id using the vocab, special tokens included""" + if token in self.special_tokens: + return self.special_tokens[token] + if token in self.mergeable_ranks: + return self.mergeable_ranks[token] + raise ValueError("unknown token") + + def _tokenize(self, text: str, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. 
+ """ + raise NotImplementedError + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: str = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + if skip_special_tokens: + token_ids = [i for i in token_ids if i < self.eod_id] + return self.tokenizer.decode(token_ids, errors=errors or self.errors) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer_fast.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer_fast.py new file mode 100644 index 000000000..1543e14b6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/llama/tokenizer_fast.py @@ -0,0 +1,171 @@ +# Copyright 2020 The HuggingFace Inc. team. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
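# Standalone sketch (not part of the patched file) of the left-padding that the
# _pad overrides above apply to a 3-D [1, seq_len, seq_len] attention mask: the
# mask is popped before the base-class padding and then re-padded with np.pad so
# the newly added (left) positions stay masked out.
import numpy as np

seq_len, max_length = 2, 4
mask = np.ones((1, seq_len, seq_len), dtype="int64")
difference = max_length - seq_len
padded = np.pad(mask, pad_width=[(0, 0), (difference, 0), (difference, 0)],
                mode="constant", constant_values=0)
assert padded.shape == (1, max_length, max_length)
assert padded[0, 0, 0] == 0 and padded[0, -1, -1] == 1   # left padding masked, original kept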
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from shutil import copyfile +from typing import Optional, Tuple + +from tokenizers import processors + +from ...utils.log import logger +from ..tokenizer_utils_fast import PretrainedTokenizerFast +from .tokenizer import LlamaTokenizer + +__all__ = ["LlamaTokenizerFast"] + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.bpe.model", "tokenizer_file": "tokenizer.json"} + +B_INST, E_INST = "[INST]", "[/INST]" +B_SYS, E_SYS = "<>\n", "\n<>\n\n" + +# fmt: off +DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \ +answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\ + that your responses are socially unbiased and positive in nature. + +If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \ +correct. If you don't know the answer to a question, please don't share false information.""" +# fmt: on + + +class LlamaTokenizerFast(PretrainedTokenizerFast): + resource_files_names = VOCAB_FILES_NAMES # for save_pretrained + slow_tokenizer_class = LlamaTokenizer + pretrained_resource_files_map = slow_tokenizer_class.pretrained_resource_files_map + pretrained_resource_files_map.update( + { + "tokenizer_file": { + "__internal_testing__/micro-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/tokenizer.json", + "__internal_testing__/tiny-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/tokenizer.json", + "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/tokenizer.json", + "facebook/llama-13b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/tokenizer.json", + "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/tokenizer.json", + "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/tokenizer.json", + }, + } + ) + pretrained_init_configuration = slow_tokenizer_class.pretrained_init_configuration + padding_side = "left" + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + clean_up_tokenization_spaces=False, + unk_token="", + bos_token="", + eos_token="", + add_bos_token=True, + add_eos_token=False, + use_default_system_prompt=False, + **kwargs, + ): + super().__init__( + vocab_file=vocab_file, + tokenizer_file=tokenizer_file, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_bos_token=add_bos_token, + add_eos_token=add_eos_token, + use_default_system_prompt=use_default_system_prompt, + **kwargs, + ) + self._add_bos_token = add_bos_token + self._add_eos_token = add_eos_token + self.update_post_processor() + self.use_default_system_prompt = use_default_system_prompt + self.vocab_file = vocab_file + + @property + def can_save_slow_tokenizer(self) -> bool: + return os.path.isfile(self.vocab_file) if self.vocab_file else False + + def update_post_processor(self): + """ + Updates the 
underlying post processor with the current `bos_token` and `eos_token`. + """ + bos = self.bos_token + bos_token_id = self.bos_token_id + if bos is None and self.add_bos_token: + raise ValueError("add_bos_token = True but bos_token = None") + + eos = self.eos_token + eos_token_id = self.eos_token_id + if eos is None and self.add_eos_token: + raise ValueError("add_eos_token = True but eos_token = None") + + single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}" + pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}" + + special_tokens = [] + if self.add_bos_token: + special_tokens.append((bos, bos_token_id)) + if self.add_eos_token: + special_tokens.append((eos, eos_token_id)) + self._tokenizer.post_processor = processors.TemplateProcessing( + single=single, pair=pair, special_tokens=special_tokens + ) + + @property + def add_eos_token(self): + return self._add_eos_token + + @property + def add_bos_token(self): + return self._add_bos_token + + @add_eos_token.setter + def add_eos_token(self, value): + self._add_eos_token = value + self.update_post_processor() + + @add_bos_token.setter + def add_bos_token(self, value): + self._add_bos_token = value + self.update_post_processor() + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers + # Copied from paddlenlp.transformers.llama.tokenizer.LlamaTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + bos_token_id = [self.bos_token_id] if self.add_bos_token else [] + eos_token_id = [self.eos_token_id] if self.add_eos_token else [] + + output = bos_token_id + token_ids_0 + eos_token_id + + if token_ids_1 is not None: + output = output + bos_token_id + token_ids_1 + eos_token_id + + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/__init__.py new file mode 100644 index 000000000..de784444a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
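# Standalone sketch (not part of the patched file): the TemplateProcessing
# templates that update_post_processor above builds, rendered in plain Python.
# "<s>"/"</s>" are assumed placeholder markers; the real strings come from
# bos_token / eos_token in the config.
def render_templates(bos="<s>", eos="</s>", add_bos=True, add_eos=False):
    single = f"{(bos + ':0 ') if add_bos else ''}$A:0{(' ' + eos + ':0') if add_eos else ''}"
    pair = f"{single}{(' ' + bos + ':1') if add_bos else ''} $B:1{(' ' + eos + ':1') if add_eos else ''}"
    return single, pair

print(render_templates())              # ('<s>:0 $A:0', '<s>:0 $A:0 <s>:1 $B:1')
print(render_templates(add_eos=True))  # ('<s>:0 $A:0 </s>:0', '<s>:0 $A:0 </s>:0 <s>:1 $B:1 </s>:1')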
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from .attention_strategies import * +from .embedding_strategies import * +from .long_sequence_strategies import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/attention_strategies.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/attention_strategies.py new file mode 100644 index 000000000..3b19d1945 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/attention_strategies.py @@ -0,0 +1,51 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import numpy as np +import paddle +from paddle import Tensor, nn + +__all__ = ["AttentionWithLinearBias"] + + +class AttentionWithLinearBias(nn.Layer): + def __init__(self, **init_args): + super().__init__() + + def _get_interleave(self, n): + def _get_interleave_power_of_2(n): + start = 2 ** (-(2 ** -(math.log2(n) - 3))) + return np.array([start * start**i for i in range(n)]).astype(np.float32) + + if math.log2(n).is_integer(): + return _get_interleave_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + _get_interleave_power_of_2(closest_power_of_2) + + self._get_interleave(2 * closest_power_of_2)[0::2][: n - closest_power_of_2] + ) + + def forward(self, bool_attention_mask: Tensor, num_heads: int, dtype: paddle.dtype): + attention_mask = bool_attention_mask.astype("float32") + batch_size, seq_length = attention_mask.shape[0], attention_mask.shape[-1] + slopes = paddle.to_tensor(self._get_interleave(num_heads), dtype="float32") + with paddle.amp.auto_cast(enable=False): + alibi = slopes.unsqueeze(axis=[1, 2]) * paddle.arange(seq_length, dtype="float32").unsqueeze( + axis=[0, 1] + ).expand([num_heads, -1, -1]) + alibi = alibi.reshape(shape=(1, num_heads, 1, seq_length)).expand([batch_size, -1, -1, -1]) + return paddle.cast(alibi, dtype) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/embedding_strategies.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/embedding_strategies.py new file mode 100644 index 000000000..6e9291e0d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/embedding_strategies.py @@ -0,0 +1,122 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
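# Standalone sketch (not part of the patched file) of the ALiBi slopes computed
# by _get_interleave above for the power-of-two case: each head gets a slope
# that halves, and the bias grows linearly with key position.  The
# non-power-of-two interleaving branch is omitted here.
import math
import numpy as np

def alibi_slopes(n_heads):
    start = 2 ** (-(2 ** -(math.log2(n_heads) - 3)))
    return np.array([start * start ** i for i in range(n_heads)], dtype=np.float32)

slopes = alibi_slopes(8)
print(slopes)                                            # [0.5, 0.25, ..., 0.00390625]
bias = slopes[:, None, None] * np.arange(16, dtype=np.float32)[None, None, :]
print(bias.shape)                                        # (8, 1, 16): per-head linear bias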
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import nn + +__all__ = [ + "RotaryEmbedding", + "LinearScalingRotaryEmbedding", + "NTKScalingRotaryEmbedding", + "DynamicNTKScalingRotaryEmbedding", +] + + +class RotaryEmbedding(nn.Layer): + def __init__(self, **init_args): + super().__init__() + self.dim = init_args["dim"] + self.max_position_embeddings = init_args["max_position_embeddings"] + self.base = init_args["base"] + self.position_encoding_2d = init_args["position_encoding_2d"] if "position_encoding_2d" in init_args else False + if self.position_encoding_2d: + # [dim / 4]# 2D--Embedding + self.dim = self.dim / 2 + inv_freq = 1.0 / ( + self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim) + ) + else: + # [dim / 2] + inv_freq = 1.0 / ( + self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq) + self._set_cos_sin_cache(seq_len=self.max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype=paddle.float32) + # [seq_len, dim/2] + with paddle.amp.auto_cast(enable=False): + freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq) + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + self.cos_cached = emb.cos()[:, :] + self.sin_cached = emb.sin()[:, :] + + def forward(self, seq_len=None, ntk_alpha=None): + + return self.cos_cached[:, :], self.sin_cached[:, :] + + +class LinearScalingRotaryEmbedding(RotaryEmbedding): + def __init__(self, **init_args): + self.scaling_factor = init_args["scaling_factor"] + super().__init__(**init_args) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype=paddle.float32) + t = t / self.scaling_factor + # [seq_len, dim/2] + with paddle.amp.auto_cast(enable=False): + freqs = paddle.outer(t.astype(self.inv_freq.dtype), self.inv_freq) + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + self.cos_cached = emb.cos()[:, :] + self.sin_cached = emb.sin()[:, :] + + +class NTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with NTK scaling. https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/""" + + def __init__(self, **init_args): + init_args["base"] = init_args["base"] * init_args["scaling_factor"] ** ( + init_args["dim"] / (init_args["dim"] - 2) + ) + super().__init__(**init_args) + + +class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): + """RotaryEmbedding extended with Dynamic NTK scaling. 
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/""" + + def __init__(self, **init_args): + self.scaling_factor = init_args["scaling_factor"] + self._seq_len_cached = 0 + super().__init__(**init_args) + + def _scale_cos_sin(self, seq_len, ntk_alpha=None): + # [seq_len] + t = paddle.arange(seq_len, dtype=paddle.float32) + if ntk_alpha is None: + ntk_alpha = (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) + + # [seq_len, dim/2] + inv_freq = 1.0 / (base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype=paddle.float32) / self.dim)) + with paddle.amp.auto_cast(enable=False): + freqs = paddle.outer(t.astype(inv_freq.dtype), inv_freq) + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + self.cos_cached = emb.cos()[:, :] + self.sin_cached = emb.sin()[:, :] + + def forward(self, seq_len=None, ntk_alpha=None): + + if seq_len > self.max_position_embeddings: + self._scale_cos_sin(seq_len=seq_len, ntk_alpha=ntk_alpha) + + return self.cos_cached[:, :], self.sin_cached[:, :] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/long_sequence_strategies.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/long_sequence_strategies.py new file mode 100644 index 000000000..286be7c5f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/long_sequence_strategies/long_sequence_strategies.py @@ -0,0 +1,66 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib + +all_strategy_types = ["embedding_strategies", "attention_strategies"] + + +class LongSequenceStrategies: + @classmethod + def build_long_sequence_strategy(cls, strategy_type=None, stratety_name=None, **init_args): + """ + + **init_args: head_dim, + max_position_embeddings, + rope_scaling_type, + rope_scaling_factor, + ... + + strategy_type: "None" ---------------走原始的build-in模块 + "embedding_strategies"、 + "attention_strategies" + ... + + stratety_name: "RotaryEmbedding"、 + "LinearScalingRotaryEmbedding"、 + "NTKScalingRotaryEmbedding"、 + "DynamicNTKScalingRotaryEmbedding"、 + "AttentionWithLinearBias" + ... + + """ + + """ + paddlenlp.transformers.long_sequence_strategies.{strategy_type<->import_class)}.{stratety_name<->strategy_class)} + paddlenlp.transformers.long_sequence_strategies.{embedding_strategies}.{RoPE,...} + paddlenlp.transformers.long_sequence_strategies.{attention_strategies}.{ALiBi,...} + """ + try: + import_class = importlib.import_module(f"paddlenlp.transformers.long_sequence_strategies.{strategy_type}") + except ModuleNotFoundError: + raise ModuleNotFoundError( + f"Wrong strategy type {strategy_type}. 
module only supports the following types: " + + ", ".join(m for m in all_strategy_types) + ) + try: + strategy_class = getattr(import_class, stratety_name) + except: + all_strategy_classes = import_class.__all__ + raise LookupError( + f"module '{import_class.__name__}' only supports the following classes: " + + ", ".join(m for m in all_strategy_classes) + ) + strategy_instance = strategy_class(**init_args) + return strategy_instance diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/__init__.py new file mode 100644 index 000000000..3bd752713 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/configuration.py new file mode 100644 index 000000000..8c408b6d7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/configuration.py @@ -0,0 +1,158 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
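# Hedged usage sketch (not part of the patched file) for the factory above.  It
# assumes the stock paddlenlp package is installed so the dotted import inside
# build_long_sequence_strategy resolves; the argument values are made up.
from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies

rope = LongSequenceStrategies.build_long_sequence_strategy(
    strategy_type="embedding_strategies",
    stratety_name="LinearScalingRotaryEmbedding",  # the parameter really is spelled this way upstream
    dim=128,
    max_position_embeddings=4096,
    base=10000,
    scaling_factor=4,
)
cos, sin = rope(seq_len=4096)
print(cos.shape, sin.shape)   # [4096, 128] each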
+""" Luke model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "LUKE_PRETRAINED_INIT_CONFIGURATION", + "LUKE_PRETRAINED_RESOURCE_FILES_MAP", + "LukeConfig", +] + +LUKE_PRETRAINED_INIT_CONFIGURATION = { + "luke-base": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "pad_token_id": 1, + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 514, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 1, + "vocab_size": 50267, + }, + "luke-large": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "pad_token_id": 1, + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 514, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 1, + "vocab_size": 50267, + }, +} + +LUKE_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "luke-base": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-base/model_state.pdparams", + "luke-large": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-large/model_state.pdparams", + } +} + + +class LukeConfig(PretrainedConfig): + r""" + Args: + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in `LukeModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when + calling `LukeModel`. Defaults to 50267. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layer and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `514`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `1`. + entity_vocab_size (int, optional): + Vocabulary size of `entity_ids` in `LukeModel`. Also is the vocab size of token entity embedding matrix. + Defines the number of different entity that can be represented by the `entity_ids` passed when + calling `LukeModel`. Defaults to 500000. + entity_emb_size (int, optional): + Dimensionality of the entity embedding layer Defaults to `256`. 
+ initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`BertPretrainedModel.init_weights()` for how weights are initialized in `BertModel`. + + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `1`. + entity_pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + """ + model_type = "luke" + + def __init__( + self, + vocab_size=50267, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + entity_vocab_size=500000, + entity_emb_size=256, + initializer_range=0.02, + pad_token_id=1, + entity_pad_token_id=0, + cls_token_id=101, + **kwargs, + ): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.entity_vocab_size = entity_vocab_size + self.entity_emb_size = entity_emb_size + self.initializer_range = initializer_range + self.pad_token_id = pad_token_id + self.entity_pad_token_id = entity_pad_token_id + self.cls_token_id = cls_token_id diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/modeling.py new file mode 100644 index 000000000..94e01659a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/modeling.py @@ -0,0 +1,1124 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn + +from ...transformers.roberta.modeling import RobertaEmbeddings +from .. 
import PretrainedModel, register_base_model +from ..activations import get_activation +from .configuration import ( + LUKE_PRETRAINED_INIT_CONFIGURATION, + LUKE_PRETRAINED_RESOURCE_FILES_MAP, + LukeConfig, +) + +__all__ = [ + "LukeModel", + "LukePretrainedModel", + "LukeForEntitySpanClassification", + "LukeForEntityPairClassification", + "LukeForEntityClassification", + "LukeForMaskedLM", + "LukeForQuestionAnswering", +] + + +def paddle_gather(x, dim, index): + index_shape = index.shape + index_flatten = index.flatten() + if dim < 0: + dim = len(x.shape) + dim + nd_index = [] + for k in range(len(x.shape)): + if k == dim: + nd_index.append(index_flatten) + else: + reshape_shape = [1] * len(x.shape) + reshape_shape[k] = x.shape[k] + x_arange = paddle.arange(x.shape[k], dtype=index.dtype) + x_arange = x_arange.reshape(reshape_shape) + dim_index = paddle.expand(x_arange, index_shape).flatten() + nd_index.append(dim_index) + ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") + paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) + return paddle_out + + +layer_norm_eps = 1e-6 + + +class LukePretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained Luke models. It provides Luke related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + + """ + + pretrained_init_configuration = LUKE_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = LUKE_PRETRAINED_RESOURCE_FILES_MAP + + base_model_prefix = "luke" + config_class = LukeConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = layer_norm_eps + + +class LukeSelfOutput(nn.Layer): + def __init__(self, config: LukeConfig): + super(LukeSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.layer_norm(hidden_states + input_tensor) + return hidden_states + + +class LukeIntermediate(nn.Layer): + def __init__(self, config: LukeConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_activation(config.hidden_act) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class LukeOutput(nn.Layer): + def __init__(self, config: LukeConfig): + super(LukeOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + 
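# Hedged usage sketch (not part of the patched file) for the paddle_gather
# helper defined at the top of this module: it reproduces torch.gather-style
# indexing, out[i][j] = x[i][index[i][j]] along dim=1.  Assumes paddle is
# installed and paddle_gather is in scope.
import paddle

x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
index = paddle.to_tensor([[1, 0], [0, 0]])
print(paddle_gather(x, dim=1, index=index))   # [[2., 1.], [3., 3.]]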
hidden_states = self.layer_norm(hidden_states + input_tensor) + return hidden_states + + +class LukeEmbeddings(RobertaEmbeddings): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + def __init__(self, config: LukeConfig): + super(LukeEmbeddings, self).__init__(config) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + ): + return super(LukeEmbeddings, self).forward( + input_ids=input_ids, token_type_ids=token_type_ids, position_ids=position_ids + ) + + +class LukePooler(nn.Layer): + def __init__(self, config: LukeConfig): + super(LukePooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class EntityEmbeddings(nn.Layer): + def __init__(self, config: LukeConfig): + super(EntityEmbeddings, self).__init__() + self.entity_emb_size = config.entity_emb_size + self.hidden_size = config.hidden_size + self.entity_embeddings = nn.Embedding(config.entity_vocab_size, config.entity_emb_size, padding_idx=0) + if config.entity_emb_size != config.hidden_size: + self.entity_embedding_dense = nn.Linear(config.entity_emb_size, config.hidden_size, bias_attr=False) + + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, entity_ids, position_ids, token_type_ids=None): + if token_type_ids is None: + token_type_ids = paddle.zeros_like(entity_ids) + + entity_embeddings = self.entity_embeddings(entity_ids) + if self.entity_emb_size != self.hidden_size: + entity_embeddings = self.entity_embedding_dense(entity_embeddings) + + position_embeddings = self.position_embeddings(position_ids.clip(min=0)) + position_embedding_mask = (position_ids != -1).astype(position_embeddings.dtype).unsqueeze(-1) + position_embeddings = position_embeddings * position_embedding_mask + position_embeddings = paddle.sum(position_embeddings, axis=-2) + position_embeddings = position_embeddings / position_embedding_mask.sum(axis=-2).clip(min=1e-7) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = entity_embeddings + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class LukeSelfAttention(nn.Layer): + def __init__(self, config: LukeConfig): + super(LukeSelfAttention, self).__init__() + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.w2e_query = nn.Linear(config.hidden_size, self.all_head_size) + self.e2w_query = nn.Linear(config.hidden_size, self.all_head_size) + self.e2e_query = nn.Linear(config.hidden_size, 
self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def forward( + self, + word_hidden_states, + entity_hidden_states, + attention_mask=None, + ): + word_size = word_hidden_states.shape[1] + + if entity_hidden_states is None: + concat_hidden_states = word_hidden_states + else: + concat_hidden_states = paddle.concat([word_hidden_states, entity_hidden_states], axis=1) + + key_layer = self.transpose_for_scores(self.key(concat_hidden_states)) + value_layer = self.transpose_for_scores(self.value(concat_hidden_states)) + + if entity_hidden_states is not None: + # compute query vectors using word-word (w2w), word-entity (w2e), entity-word (e2w), entity-entity (e2e) + # query layers + w2w_query_layer = self.transpose_for_scores(self.query(word_hidden_states)) + w2e_query_layer = self.transpose_for_scores(self.w2e_query(word_hidden_states)) + e2w_query_layer = self.transpose_for_scores(self.e2w_query(entity_hidden_states)) + e2e_query_layer = self.transpose_for_scores(self.e2e_query(entity_hidden_states)) + + # compute w2w, w2e, e2w, and e2e key vectors used with the query vectors computed above + w2w_key_layer = key_layer[:, :, :word_size, :] + e2w_key_layer = key_layer[:, :, :word_size, :] + w2e_key_layer = key_layer[:, :, word_size:, :] + e2e_key_layer = key_layer[:, :, word_size:, :] + + # compute attention scores based on the dot product between the query and key vectors + w2w_attention_scores = paddle.matmul(w2w_query_layer, w2w_key_layer.transpose((0, 1, 3, 2))) + w2e_attention_scores = paddle.matmul(w2e_query_layer, w2e_key_layer.transpose((0, 1, 3, 2))) + e2w_attention_scores = paddle.matmul(e2w_query_layer, e2w_key_layer.transpose((0, 1, 3, 2))) + e2e_attention_scores = paddle.matmul(e2e_query_layer, e2e_key_layer.transpose((0, 1, 3, 2))) + + # combine attention scores to create the final attention score matrix + word_attention_scores = paddle.concat([w2w_attention_scores, w2e_attention_scores], axis=3) + entity_attention_scores = paddle.concat([e2w_attention_scores, e2e_attention_scores], axis=3) + attention_scores = paddle.concat([word_attention_scores, entity_attention_scores], axis=2) + + else: + query_layer = self.transpose_for_scores(self.query(concat_hidden_states)) + attention_scores = paddle.matmul(query_layer, key_layer.transpose((0, 1, 3, 2))) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in LukeModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
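# Standalone sketch (not part of the patched file) of how the four score blocks
# above (w2w, w2e, e2w, e2e) are assembled into one attention matrix over the
# concatenated word+entity sequence; batch and head dimensions are dropped and
# constant fillers stand in for real scores.
import numpy as np

W, E = 3, 2                                    # word tokens, entity tokens
w2w = np.full((W, W), 1)
w2e = np.full((W, E), 2)
e2w = np.full((E, W), 3)
e2e = np.full((E, E), 4)
word_scores = np.concatenate([w2w, w2e], axis=-1)     # queries are words
entity_scores = np.concatenate([e2w, e2e], axis=-1)   # queries are entities
scores = np.concatenate([word_scores, entity_scores], axis=0)
print(scores.shape)   # (5, 5): rows are queries, columns are keys (words then entities)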
+ attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output_word_hidden_states = context_layer[:, :word_size, :] + if entity_hidden_states is None: + output_entity_hidden_states = None + else: + output_entity_hidden_states = context_layer[:, word_size:, :] + + outputs = (output_word_hidden_states, output_entity_hidden_states) + + return outputs + + +class LukeAttention(nn.Layer): + def __init__(self, config: LukeConfig): + super().__init__() + self.self = LukeSelfAttention(config) + self.output = LukeSelfOutput(config) + + def forward( + self, + word_hidden_states, + entity_hidden_states, + attention_mask=None, + ): + word_size = word_hidden_states.shape[1] + self_outputs = self.self(word_hidden_states, entity_hidden_states, attention_mask) + if entity_hidden_states is None: + concat_self_outputs = self_outputs[0] + concat_hidden_states = word_hidden_states + else: + concat_self_outputs = paddle.concat(self_outputs[:2], axis=1) + concat_hidden_states = paddle.concat([word_hidden_states, entity_hidden_states], axis=1) + + attention_output = self.output(concat_self_outputs, concat_hidden_states) + + word_attention_output = attention_output[:, :word_size, :] + if entity_hidden_states is None: + entity_attention_output = None + else: + entity_attention_output = attention_output[:, word_size:, :] + + # add attentions if we output them + outputs = (word_attention_output, entity_attention_output) + self_outputs[2:] + + return outputs + + +class LukeLayer(nn.Layer): + def __init__(self, config: LukeConfig): + super(LukeLayer, self).__init__() + self.seq_len_dim = 1 + self.attention = LukeAttention(config) + self.intermediate = LukeIntermediate(config) + self.output = LukeOutput(config) + + def forward( + self, + word_hidden_states, + entity_hidden_states, + attention_mask=None, + ): + word_size = word_hidden_states.shape[1] + + self_attention_outputs = self.attention( + word_hidden_states, + entity_hidden_states, + attention_mask, + ) + if entity_hidden_states is None: + concat_attention_output = self_attention_outputs[0] + else: + concat_attention_output = paddle.concat(self_attention_outputs[:2], axis=1) + + outputs = self_attention_outputs[2:] # add self attentions if we output attention weights + + layer_output = self.feed_forward_chunk(concat_attention_output) + + word_layer_output = layer_output[:, :word_size, :] + if entity_hidden_states is None: + entity_layer_output = None + else: + entity_layer_output = layer_output[:, word_size:, :] + + outputs = (word_layer_output, entity_layer_output) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class LukeEncoder(nn.Layer): + def __init__(self, config: LukeConfig): + super(LukeEncoder, self).__init__() + self.layer = nn.LayerList([LukeLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + word_hidden_states, + entity_hidden_states, + attention_mask=None, + ): + + for i, layer_module in enumerate(self.layer): + + layer_outputs = layer_module( + word_hidden_states, + entity_hidden_states, + attention_mask, + ) + + word_hidden_states = layer_outputs[0] + + if 
entity_hidden_states is not None: + entity_hidden_states = layer_outputs[1] + + return word_hidden_states, entity_hidden_states + + +@register_base_model +class LukeModel(LukePretrainedModel): + """ + The bare Luke Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`LukeConfig`): + An instance of LukeConfig. + """ + + def __init__(self, config: LukeConfig): + super(LukeModel, self).__init__(config) + self.initializer_range = config.initializer_range + self.pad_token_id = config.pad_token_id + self.entity_pad_token_id = config.entity_pad_token_id + self.encoder = LukeEncoder(config) + self.embeddings = LukeEmbeddings(config) + self.entity_embeddings = EntityEmbeddings(config) + self.pooler = LukePooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + entity_ids=None, + entity_position_ids=None, + entity_token_type_ids=None, + entity_attention_mask=None, + ): + r""" + The LukeModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + entity_ids (Tensor, optional): + Indices of entity sequence tokens in the entity vocabulary. They are numerical + representations of entities that build the entity input sequence. 
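A small illustration of the additive mask convention described above: a 0/1 padding mask is turned into large negative biases that are added to the raw attention scores before the softmax. Values below are hypothetical:

import paddle

pad_mask = paddle.to_tensor([[1.0, 1.0, 1.0, 0.0]])        # 1 = real token, 0 = padding
additive = (1.0 - pad_mask).unsqueeze(axis=[1, 2]) * -1e4  # broadcastable to [batch, heads, seq, seq]
print(additive)  # 0.0 for real tokens, -10000.0 for the padded position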
+ Its data type should be `int64` and it has a shape of [batch_size, entity_sequence_length]. + entity_position_ids (Tensor, optional): + Indices of positions of each entity sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_entity_tokens)` and dtype as int64. Defaults to `None`. + entity_token_type_ids (Tensor, optional): + Segment entity token indices to indicate different portions of the entity inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + entity_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor will be concat with `attention_mask`. + + Returns: + tuple: Returns tuple (`word_hidden_state, entity_hidden_state, pool_output`). + + With the fields: + + - `word_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `entity_hidden_state` (Tensor): + Sequence of entity hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (``) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LukeModel, LukeTokenizer + + tokenizer = LukeTokenizer.from_pretrained('luke-base') + model = LukeModel.from_pretrained('luke-base') + + text = "Beyoncé lives in Los Angeles." 
+ entity_spans = [(0, 7)] + inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + input_shape = input_ids.shape + + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + attention_mask = (1.0 - attention_mask) * -1e4 + if entity_ids is not None: + entity_seq_length = entity_ids.shape[1] + if entity_attention_mask is None: + entity_attention_mask = paddle.unsqueeze( + (entity_ids == self.entity_pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if entity_attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + entity_attention_mask = entity_attention_mask.unsqueeze(axis=[1, 2]) + entity_attention_mask = (1.0 - entity_attention_mask) * -1e4 + if entity_token_type_ids is None: + entity_token_type_ids = paddle.zeros((batch_size, entity_seq_length), dtype="int64") + attention_mask = paddle.concat([attention_mask, entity_attention_mask], axis=-1) + + word_embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + ) + + if entity_ids is None: + entity_embedding_output = None + else: + entity_embedding_output = self.entity_embeddings(entity_ids, entity_position_ids, entity_token_type_ids) + + # Fourth, send embeddings through the model + encoder_outputs = self.encoder( + word_embedding_output, + entity_embedding_output, + attention_mask=attention_mask, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler(sequence_output) + + return sequence_output, encoder_outputs[1], pooled_output + + +class LukeLMHead(nn.Layer): + """Luke Head for masked language modeling.""" + + def __init__(self, config: LukeConfig, embedding_weights=None): + super(LukeLMHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=layer_norm_eps) + self.activation = get_activation(config.hidden_act) + self.decoder_weight = ( + self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.transform.weight.dtype, is_bias=False + ) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward(self, features, **kwargs): + hidden_state = self.dense(features) + hidden_state = self.activation(hidden_state) + hidden_state = self.layer_norm(hidden_state) + hidden_state = paddle.tensor.matmul(hidden_state, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_state + + +class EntityPredictionHeadTransform(nn.Layer): + def __init__(self, config: LukeConfig): + super(EntityPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.entity_emb_size) + self.transform_act_fn = get_activation(config.hidden_act) + self.layer_norm = nn.LayerNorm(config.entity_emb_size, epsilon=layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = 
self.transform_act_fn(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class EntityPredictionHead(nn.Layer): + def __init__(self, config: LukeConfig): + super(EntityPredictionHead, self).__init__() + self.transform = EntityPredictionHeadTransform(config) + self.decoder = nn.Linear(config.entity_emb_size, config.entity_vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class LukeForMaskedLM(LukePretrainedModel): + """ + Luke Model with a `masked language modeling` head on top. + + Args: + config (:class:`LukeConfig`): + An instance of LukeConfig. + + """ + + def __init__(self, config: LukeConfig): + super(LukeForMaskedLM, self).__init__(config) + self.luke = LukeModel(config) + self.vocab_size = self.config.vocab_size + self.entity_vocab_size = self.config.entity_vocab_size + + self.lm_head = LukeLMHead( + config, + embedding_weights=self.luke.embeddings.word_embeddings.weight, + ) + self.entity_predictions = EntityPredictionHead(config) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + entity_ids=None, + entity_position_ids=None, + entity_token_type_ids=None, + entity_attention_mask=None, + ): + r""" + The LukeForMaskedLM forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`LukeModel`. + token_type_ids (Tensor, optional): + See :class:`LukeModel`. + position_ids (Tensor, optional): + See :class: `LukeModel` + attention_mask (list, optional): + See :class:`LukeModel`. + entity_ids (Tensor, optional): + See :class:`LukeModel`. + entity_position_ids (Tensor, optional): + See :class:`LukeModel`. + entity_token_type_ids (Tensor, optional): + See :class:`LukeModel`. + entity_attention_mask (list, optional): + See :class:`LukeModel`. + + Returns: + tuple: Returns tuple (``logits``, ``entity_logits``). + + With the fields: + + - `logits` (Tensor): + The scores of masked token prediction. + Its data type should be float32 and shape is [batch_size, sequence_length, vocab_size]. + + - `entity_logits` (Tensor): + The scores of masked entity prediction. + Its data type should be float32 and its shape is [batch_size, entity_length, entity_vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LukeForMaskedLM, LukeTokenizer + + tokenizer = LukeTokenizer.from_pretrained('luke-base') + model = LukeForMaskedLM.from_pretrained('luke-base') + + text = "Beyoncé lives in Los Angeles." + entity_spans = [(0, 7)] + inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits, entity_logits = model(**inputs) + """ + + outputs = self.luke( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + entity_ids=entity_ids, + entity_position_ids=entity_position_ids, + entity_token_type_ids=entity_token_type_ids, + entity_attention_mask=entity_attention_mask, + ) + + logits = self.lm_head(outputs[0]) + entity_logits = self.entity_predictions(outputs[1]) + + return logits, entity_logits + + +class LukeForEntityClassification(LukePretrainedModel): + """ + The LUKE model with a classification head on top (a linear layer on top of the hidden state of the first entity + token) for entity classification tasks, such as Open Entity. 
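The classification head described here takes the hidden state of the first entity token as the span representation and maps it to label logits. A self-contained sketch with hypothetical sizes (the actual head is implemented in this class's forward below):

import paddle
import paddle.nn as nn

hidden_size, num_labels = 768, 2
entity_hidden = paddle.randn([1, 1, hidden_size])              # hypothetical [batch, n_entities, hidden] states
head = nn.Sequential(nn.Dropout(0.1), nn.Linear(hidden_size, num_labels))
logits = head(entity_hidden[:, 0, :])                          # pool the first entity token, then classify
print(logits.shape)                                            # [1, 2]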
+ + Args: + config (:class:`LukeConfig`): + An instance of LukeConfig. + """ + + def __init__(self, config: LukeConfig): + super(LukeForEntityClassification, self).__init__(config) + + self.luke = LukeModel(config) + + self.num_labels = config.num_labels + self.dropout = nn.Dropout(self.config.hidden_dropout_prob) + self.classifier = nn.Linear(self.config.hidden_size, config.num_labels) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + entity_ids=None, + entity_position_ids=None, + entity_token_type_ids=None, + entity_attention_mask=None, + ): + r""" + The LukeForEntityClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`LukeModel`. + token_type_ids (Tensor, optional): + See :class:`LukeModel`. + position_ids (Tensor, optional): + See :class: `LukeModel` + attention_mask (list, optional): + See :class:`LukeModel`. + entity_ids (Tensor, optional): + See :class:`LukeModel`. + entity_position_ids (Tensor, optional): + See :class:`LukeModel`. + entity_token_type_ids (Tensor, optional): + See :class:`LukeModel`. + entity_attention_mask (list, optional): + See :class:`LukeModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the entity classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LukeForEntityClassification, LukeTokenizer + + tokenizer = LukeTokenizer.from_pretrained('luke-base') + model = LukeForEntityClassification.from_pretrained('luke-base', num_labels=2) + + text = "Beyoncé lives in Los Angeles." + entity_spans = [(0, 7)] + inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + outputs = self.luke( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + entity_ids=entity_ids, + entity_position_ids=entity_position_ids, + entity_token_type_ids=entity_token_type_ids, + entity_attention_mask=entity_attention_mask, + ) + + feature_vector = outputs[1][:, 0, :] + feature_vector = self.dropout(feature_vector) + logits = self.classifier(feature_vector) + + return logits + + +class LukeForEntityPairClassification(LukePretrainedModel): + """ + The LUKE model with a classification head on top (a linear layer on top of the hidden states of the two entity + tokens) for entity pair classification tasks, such as TACRED. + + Args: + config (:class:`LukeConfig`): + An instance of LukeConfig. + + """ + + def __init__(self, config: LukeConfig): + super(LukeForEntityPairClassification, self).__init__(config) + + self.luke = LukeModel(config) + + self.num_labels = config.num_labels + self.dropout = nn.Dropout(self.config.hidden_dropout_prob) + self.classifier = nn.Linear(self.config.hidden_size * 2, config.num_labels, bias_attr=False) + + def forward( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + entity_ids=None, + entity_position_ids=None, + entity_token_type_ids=None, + entity_attention_mask=None, + ): + r""" + The LukeForEntityPairClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`LukeModel`. + token_type_ids (Tensor, optional): + See :class:`LukeModel`. 
+ position_ids (Tensor, optional): + See :class: `LukeModel` + attention_mask (list, optional): + See :class:`LukeModel`. + entity_ids (Tensor, optional): + See :class:`LukeModel`. + entity_position_ids (Tensor, optional): + See :class:`LukeModel`. + entity_token_type_ids (Tensor, optional): + See :class:`LukeModel`. + entity_attention_mask (list, optional): + See :class:`LukeModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the entity pair classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LukeForEntityPairClassification, LukeTokenizer + + tokenizer = LukeTokenizer.from_pretrained('luke-base') + model = LukeForEntityPairClassification.from_pretrained('luke-base', num_labels=2) + + text = "Beyoncé lives in Los Angeles." + entity_spans = [(0, 7), (17, 28)] + inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + outputs = self.luke( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + entity_ids=entity_ids, + entity_position_ids=entity_position_ids, + entity_token_type_ids=entity_token_type_ids, + entity_attention_mask=entity_attention_mask, + ) + + feature_vector = paddle.concat([outputs[1][:, 0, :], outputs[1][:, 1, :]], axis=1) + feature_vector = self.dropout(feature_vector) + logits = self.classifier(feature_vector) + + return logits + + +class LukeForEntitySpanClassification(LukePretrainedModel): + """ + The LUKE model with a span classification head on top (a linear layer on top of the hidden states output) for tasks + such as named entity recognition. + + Args: + config (:class:`LukeConfig`): + An instance of LukeConfig. + """ + + def __init__(self, config: LukeConfig): + super(LukeForEntitySpanClassification, self).__init__(config) + + self.luke = LukeModel(config) + + self.num_labels = config.num_labels + self.dropout = nn.Dropout(self.config.hidden_dropout_prob) + self.classifier = nn.Linear(self.config.hidden_size * 3, config.num_labels) + + def forward( + self, + entity_start_positions, + entity_end_positions, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + entity_ids=None, + entity_position_ids=None, + entity_token_type_ids=None, + entity_attention_mask=None, + ): + r""" + The LukeForEntitySpanClassification forward method, overrides the __call__() special method. + + Args: + entity_start_positions: + The start position of entities in sequence. + entity_end_positions: + The start position of entities in sequence. + input_ids (Tensor): + See :class:`LukeModel`. + token_type_ids (Tensor, optional): + See :class:`LukeModel`. + position_ids (Tensor, optional): + See :class: `LukeModel` + attention_mask (list, optional): + See :class:`LukeModel`. + entity_ids (Tensor, optional): + See :class:`LukeModel`. + entity_position_ids (Tensor, optional): + See :class:`LukeModel`. + entity_token_type_ids (Tensor, optional): + See :class:`LukeModel`. + entity_attention_mask (list, optional): + See :class:`LukeModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the entity span classification logits. + Shape as `[batch_size, num_entities, num_labels]` and dtype as float32. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import LukeForEntitySpanClassification, LukeTokenizer + + tokenizer = LukeTokenizer.from_pretrained('luke-base') + model = LukeForEntitySpanClassification.from_pretrained('luke-base', num_labels=2) + + text = "Beyoncé lives in Los Angeles." + entity_spans = [(0, 7)] + inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + inputs['entity_start_positions'] = paddle.to_tensor([[1]], dtype='int64') + inputs['entity_end_positions'] = paddle.to_tensor([[2]], dtype='int64') + logits = model(**inputs) + """ + + outputs = self.luke( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + entity_ids=entity_ids, + entity_position_ids=entity_position_ids, + entity_token_type_ids=entity_token_type_ids, + entity_attention_mask=entity_attention_mask, + ) + hidden_size = outputs[0].shape[-1] + + entity_start_positions = entity_start_positions.unsqueeze(-1).expand((-1, -1, hidden_size)) + start_states = paddle_gather(x=outputs[0], index=entity_start_positions, dim=-2) + entity_end_positions = entity_end_positions.unsqueeze(-1).expand((-1, -1, hidden_size)) + end_states = paddle_gather(x=outputs[0], index=entity_end_positions, dim=-2) + feature_vector = paddle.concat([start_states, end_states, outputs[1]], axis=2) + + feature_vector = self.dropout(feature_vector) + logits = self.classifier(feature_vector) + + return logits + + +class LukeForQuestionAnswering(LukePretrainedModel): + """ + LukeBert Model with question answering tasks. + Args: + config (:class:`LukeConfig`): + An instance of LukeConfig. + """ + + def __init__(self, config: LukeConfig): + super(LukeForQuestionAnswering, self).__init__(config) + self.luke = LukeModel(config) + self.qa_outputs = nn.Linear(self.config.hidden_size, 2) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + entity_ids=None, + entity_position_ids=None, + entity_token_type_ids=None, + entity_attention_mask=None, + ): + r""" + The LukeForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`LukeModel`. + token_type_ids (Tensor, optional): + See :class:`LukeModel`. + position_ids (Tensor, optional): + See :class: `LukeModel` + attention_mask (list, optional): + See :class:`LukeModel`. + entity_ids (Tensor, optional): + See :class:`LukeModel`. + entity_position_ids (Tensor, optional): + See :class:`LukeModel`. + entity_token_type_ids (Tensor, optional): + See :class:`LukeModel`. + entity_attention_mask (list, optional): + See :class:`LukeModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + With the fields: + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import LukeForQuestionAnswering, LukeTokenizer + + tokenizer = LukeTokenizer.from_pretrained('luke-base') + model = LukeForQuestionAnswering.from_pretrained('luke-base') + + text = "Beyoncé lives in Los Angeles." 
+ entity_spans = [(0, 7)] + inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True) + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + start_logits, end_logits = model(**inputs) + """ + + encoder_outputs = self.luke( + input_ids=input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + entity_ids=entity_ids, + entity_position_ids=entity_position_ids, + entity_token_type_ids=entity_token_type_ids, + entity_attention_mask=entity_attention_mask, + ) + + word_hidden_states = encoder_outputs[0][:, : input_ids.shape[1], :] + logits = self.qa_outputs(word_hidden_states) + start_logits, end_logits = paddle.split(logits, 2, -1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + return start_logits, end_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/tokenizer.py new file mode 100644 index 000000000..7653c9652 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/luke/tokenizer.py @@ -0,0 +1,752 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for LUKE.""" + +from typing import Dict, List, Optional, Union + +try: + import regex as re +except: + import re + +import itertools +import json +import sys +import warnings +from itertools import repeat + +from .. import RobertaBPETokenizer + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. + def lru_cache(): + return lambda func: func + + +__all__ = ["LukeTokenizer"] +_add_prefix_space = False + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"luke-base": 514, "luke-large": 514} + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. + We specifically avoids mapping to whitespace/control characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 
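A quick property check of the byte-to-unicode table this helper builds: every one of the 256 byte values maps to a distinct printable character, so the mapping is reversible. A sketch, assuming the function defined just below:

mapping = bytes_to_unicode()
assert len(mapping) == 256 and len(set(mapping.values())) == 256
print(mapping[ord("A")], mapping[32])  # 'A' stays itself; the space byte maps to 'Ġ'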
+ """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +class LukeTokenizer(RobertaBPETokenizer): + """ + Constructs a Luke tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.json') required to instantiate + a `WordpieceTokenizer`. + entity_file (str): + The entity vocabulary file path (ends with '.tsv') required to instantiate + a `EntityTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import LukeTokenizer + tokenizer = LukeTokenizer.from_pretrained('luke-large) + + tokens = tokenizer('Beyoncé lives in Los Angeles', entity_spans=[(0, 7), (17, 28)]) + #{'input_ids': [0, 40401, 261, 12695, 1074, 11, 1287, 1422, 2], 'entity_ids': [1657, 32]} + + """ + + # resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + resource_files_names = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "entity_file": "entity_vocab.json", + } + pretrained_resource_files_map = { + "vocab_file": { + "luke-base": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-base/vocab.json", + "luke-large": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-large/vocab.json", + }, + "merges_file": { + "luke-base": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-base/merges.txt", + "luke-large": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-large/merges.txt", + }, + "entity_file": { + "luke-base": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-base/entity_vocab.json", + "luke-large": "https://bj.bcebos.com/paddlenlp/models/transformers/luke/luke-large/entity_vocab.json", + }, + } + pretrained_init_configuration = {"luke-base": {"do_lower_case": True}, "luke-large": {"do_lower_case": True}} + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + entity_file, + merges_file, + do_lower_case=True, + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + **kwargs + ): + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + with open(entity_file, encoding="utf-8") as entity_vocab_handle: + self.entity_vocab = json.load(entity_vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.sep_token = sep_token + self.cls_token = cls_token + self.pad_token = pad_token + self.unk_token = unk_token + self._all_special_tokens = [unk_token, sep_token, pad_token, cls_token, mask_token] + self.errors = "replace" # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.added_tokens_encoder = {} + self.added_tokens_decoder = {} + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + # RobertaTokenizer don't maintain the entity_file resource file name, + # so we should not set it as a param in super.__init__ function + self._entity_file = entity_file + super(LukeTokenizer, self).__init__( + vocab_file, + merges_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + @property + def sep_token_id(self): + return self.encoder[self.sep_token] + + @property + def cls_token_id(self): + return self.encoder[self.cls_token] + + @property + def pad_token_id(self): + return self.encoder[self.pad_token] + + @property + def unk_token_id(self): + return self.encoder[self.unk_token] + + def get_entity_vocab(self): + """Get the entity 
vocab""" + return self.entity_vocab + + def _convert_token_to_id(self, token): + """Converts a token (str/unicode) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + return self.decoder.get(index) + + def _tokenize(self, text, add_prefix_space=False): + if add_prefix_space: + text = " " + text + + bpe_tokens = [] + for token in re.findall(self.pat, text): + if sys.version_info[0] == 2: + token = "".join( + self.byte_encoder[ord(b)] for b in token + ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + else: + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding controle tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def __call__( + self, + text, + text_pair=None, + entity_spans=None, + entity_spans_pair=None, + entities=None, + entities_pair=None, + max_mention_length=30, + max_length: Optional[int] = None, + stride=0, + add_prefix_space=False, + is_split_into_words=False, + padding=False, + truncation="longest_first", + return_position_ids=True, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is allowed. `self.encode()` or `self.batch_encode()` would be called + separately for single or batch input depending on input format and + `is_split_into_words` argument. + + Args: + text (str, List[str] or List[List[str]]): + The sequence or batch of sequences to be processed. One sequence + is a string or a list of strings depending on whether it has been + pretokenized. If each sequence is provided as a list of strings + (pretokenized), you must set `is_split_into_words` as `True` to + disambiguate with a batch of sequences. + text_pair (str, List[str] or List[List[str]], optional): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + entity_spans (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*): + The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each + with two integers denoting character-based(different from transformers LUKE) start and end positions + of entities. If you specify `"entity_classification"` or `"entity_pair_classification"` as the `task` + argument in the constructor, the length of each sequence must be 1 or 2, respectively. If you specify + `entities`, the length of each sequence must be equal to the length of each sequence of `entities`. + entity_spans_pair (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*): + The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each + with two integers denoting character-based start and end positions of entities. If you specify the + `task` argument in the constructor, this argument is ignored. If you specify `entities_pair`, the + length of each sequence must be equal to the length of each sequence of `entities_pair`. 
+ entities (`List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings + representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los + Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length of + each sequence must be equal to the length of each sequence of `entity_spans`. If you specify + `entity_spans` without specifying this argument, the entity sequence or the batch of entity sequences + is automatically constructed by filling it with the [MASK] entity. + entities_pair (`List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings + representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los + Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length of + each sequence must be equal to the length of each sequence of `entity_spans_pair`. If you specify + `entity_spans_pair` without specifying this argument, the entity sequence or the batch of entity + sequences is automatically constructed by filling it with the [MASK] entity. + max_mention_length (`int`): + The entity_position_ids's length. + max_length (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + add_prefix_space (bool, optional): + The tokenizer will add a space at the beginning of the sentence when it set to `True`. + Defaults to `False`. + padding (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_length` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation (str, optional): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_length` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_length`). + + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. 
+ Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + + Returns: + dict or list[dict] (for batch input): + The dict has the following optional items: + + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **entity_ids** (list[int]): List of token ids to be fed to a model. Included when + `entity_spans` is not `None`. + - **entity_position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `entity_spans` is not `None`. + - **entity_segment_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `entity_spans` is not `None`. + - **entity_attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `entity_spans` is not `None`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_length` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_length` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. + For a special token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. 
+ """ + + global _add_prefix_space + if add_prefix_space: + _add_prefix_space = True + + encode_output = super(LukeTokenizer, self).__call__( + text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + if not entity_spans: + return encode_output + is_batched = bool( + (not is_split_into_words and isinstance(text, (list, tuple))) + or ( + is_split_into_words and isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + ) + ) + if is_batched: + if entities is None: + entities = [None] * len(entity_spans) + for i, ent in enumerate(zip(entities, entity_spans, text)): + entity_encode = self.entity_encode(ent[2], ent[0], max_mention_length, ent[1]) + encode_output[i].update(entity_encode) + if entity_spans_pair: + if entities_pair is None: + entities_pair = [None] * len(entity_spans_pair) + for i, ent in enumerate(zip(entities_pair, entity_spans_pair, text_pair)): + entity_encode = self.entity_encode( + ent[2], + ent[0], + max_mention_length, + ent[1], + 1, + encode_output[i]["input_ids"].index(self.sep_token_id) + 2, + ) + for k in entity_encode.keys(): + encode_output[i][k] = encode_output[i][k] + entity_encode[k] + + else: + entity_encode = self.entity_encode(text, entities, max_mention_length, entity_spans) + + encode_output.update(entity_encode) + if entity_spans_pair: + entity_encode = self.entity_encode( + text_pair, + entities_pair, + max_mention_length, + entity_spans_pair, + 1, + encode_output["input_ids"].index(self.sep_token_id) + 2, + ) + for k in entity_encode.keys(): + encode_output[k] = encode_output[k] + entity_encode[k] + + return encode_output + + def tokenize(self, text, add_prefix_space=False): + """ + Tokenize a string. + Args: + text (str): + The sentence to be tokenized. + add_prefix_space (boolean, default False): + Begin the sentence with at least one space to get invariance + to word order in GPT-2 (and Luke) tokenizers. 
+ """ + if _add_prefix_space: + add_prefix_space = True + + def split_on_token(tok, text): + result = [] + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + sub_text = sub_text.strip() + if i == 0 and not sub_text: + result += [tok] + elif i == len(split_text) - 1: + if sub_text: + result += [sub_text] + else: + pass + else: + if sub_text: + result += [sub_text] + result += [tok] + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self._tokenize(text, add_prefix_space) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self.added_tokens_encoder and sub_text not in self._all_special_tokens: + tokenized_text += split_on_token(tok, sub_text) + else: + tokenized_text += [sub_text] + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + ( + self._tokenize(token, add_prefix_space) + if token not in self.added_tokens_encoder and token not in self._all_special_tokens + else [token] + for token in tokenized_text + ) + ) + ) + + added_tokens = list(self.added_tokens_encoder.keys()) + self._all_special_tokens + tokenized_text = split_on_tokens(added_tokens, text) + return tokenized_text + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def convert_tokens_to_ids(self, tokens): + if tokens is None: + return None + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + + return self._convert_token_to_id(token) + + def add_special_tokens(self, token_list: Union[List[int], Dict]): + """ + Adding special tokens if you need. + + Args: + token_list (List[int], Dict[List[int]]): + The special token list you provided. If you provide a Dict, the key of the Dict must + be "additional_special_tokens" and the value must be token list. 
+ """ + if isinstance(token_list, dict): + token_list = token_list["additional_special_tokens"] + encoder_dict = dict() + decoder_dict = dict() + for token in token_list: + encoder_dict[token] = len(self.encoder.keys()) + decoder_dict[len(self.decoder.keys())] = token + self.added_tokens_encoder.update(encoder_dict) + self.added_tokens_decoder.update(decoder_dict) + + def convert_entity_to_id(self, entity: str): + """Convert the entity to id""" + if not self.entity_vocab.get(entity, None): + warnings.warn(f"{entity} not found in entity thesaurus") + return None + else: + return self.entity_vocab[entity] + + def entity_encode(self, text, entities, max_mention_length, entity_spans, ent_sep=0, offset_a=1): + """Convert the string entity to digital entity""" + + def convert_tuple_to_list(x): + """This function aim to convert tuple to list""" + if isinstance(x, tuple): + x = list(x) + for i, each_x in enumerate(x): + if isinstance(each_x, tuple): + x[i] = list(each_x) + return x + + mentions = [] + if entities: + for i, entity in enumerate(zip(entities, entity_spans)): + entity = convert_tuple_to_list(entity) + entity[1][0], entity[1][1] = self._convert_entity_pos(text, entity[1]) + if not self.entity_vocab.get(entity[0], None): + warnings.warn(f"{entity[0]} not found in entity thesaurus") + mentions.append((1, entity[1][0], entity[1][1])) + else: + mentions.append((self.entity_vocab[entity[0]], entity[1][0], entity[1][1])) + else: + entities = [2] * len(entity_spans) + for i, entity in enumerate(zip(entities, entity_spans)): + entity = convert_tuple_to_list(entity) + entity[1][0], entity[1][1] = self._convert_entity_pos(text, entity[1]) + mentions.append((entity[0], entity[1][0], entity[1][1])) + + entity_ids = [0] * len(mentions) + entity_segment_ids = [ent_sep] * len(mentions) + entity_attention_mask = [1] * len(mentions) + entity_position_ids = [[-1 for y in range(max_mention_length)] for x in range(len(mentions))] + + for i, (offset, (entity_id, start, end)) in enumerate(zip(repeat(offset_a), mentions)): + entity_ids[i] = entity_id + entity_position_ids[i][: end - start] = range(start + offset, end + offset) + return dict( + entity_ids=entity_ids, + entity_token_type_ids=entity_segment_ids, + entity_attention_mask=entity_attention_mask, + entity_position_ids=entity_position_ids, + ) + + def _convert_entity_pos(self, text, entity_span): + text_token = self.tokenize(text[0 : entity_span[0]].strip()) + entity_token = self.tokenize(text[entity_span[0] : entity_span[1]].strip()) + return len(text_token), len(text_token) + len(entity_token) + + def get_offset_mapping(self, text): + tokens = self._tokenize(text) + offset_mapping = [] + offset = 0 + for token in tokens: + if token[0] == "Ġ": + offset_mapping.append((offset + 1, offset + len(token))) + else: + offset_mapping.append((offset, offset + len(token))) + offset += len(token) + + return offset_mapping + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A Luke sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. 
+ + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(_sep + token_ids_1 + _sep) * [1] + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification + tasks by concatenating and adding special tokens. + """ + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + if token_ids_1 is None: + return _cls + token_ids_0 + _sep + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/__init__.py new file mode 100644 index 000000000..a354024a5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .configuration import * +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/configuration.py new file mode 100644 index 000000000..7d0d17dda --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/configuration.py @@ -0,0 +1,151 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MAMBA configuration""" + +import math + +from ..configuration_utils import PretrainedConfig + +__all__ = ["MambaConfig"] + + +class MambaConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`MambaModel`]. 
It is used to instantiate a MAMBA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the MAMBA + [state-spaces/mamba-2.8b](https://huggingface.co/state-spaces/mamba-2.8b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50280): + Vocabulary size of the MAMBA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MambaModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + state_size (`int`, *optional*, defaults to 16): shape of the state space latents. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the model. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): + The epsilon to use in the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0): + The id of the beginning of sentence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 0): + The id of the end of sentence token in the vocabulary. + expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size. + conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel. + use_bias (`bool`, *optional*, defaults to `False`): + Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block. + use_conv_bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the convolution layer of the mixer block. + hidden_act (`str`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.1): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + residual_in_fp32 (`bool`, *optional*, defaults to `True`): + Whether or not residuals should be in `float32`. If set to `False`, residuals will keep the same `dtype` as the rest of the model. + time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): + Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`. + time_step_scale (`float`, *optional*, defaults to 1.0): + Scale used to scale `dt_proj.bias`. + time_step_min (`float`, *optional*, defaults to 0.001): + Minimum `time_step` used to bound `dt_proj.bias`. + time_step_max (`float`, *optional*, defaults to 0.1): + Maximum `time_step` used to bound `dt_proj.bias`. + time_step_init_scheme (`str`, *optional*, defaults to `"random"`): + Init scheme used for `dt_proj.weight`. Should be one of `["random","uniform"]`. + time_step_floor (`float`, *optional*, defaults to 0.0001): + Minimum clamping value of the `dt_proj.bias` layer initialization. + rescale_prenorm_residual (`bool`, *optional*, defaults to `False`): + Whether or not to rescale `out_proj` weights when initializing. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the cache should be used.
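Two derived quantities follow directly from these arguments: the mixer's `intermediate_size` is `expand * hidden_size`, and `time_step_rank="auto"` resolves to `math.ceil(hidden_size / 16)` (see `__init__` below). With the documented defaults:

import math

hidden_size, expand = 768, 2
print(expand * hidden_size)         # 1536 -> intermediate_size
print(math.ceil(hidden_size / 16))  # 48   -> time_step_rank when set to "auto"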
+ + + Example: + + ```python + >>> from paddlenlp.transformers import MambaConfig, MambaModel + + >>> # Initializing a Mamba configuration + >>> configuration = MambaConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = MambaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mamba" + + def __init__( + self, + vocab_size=50280, + hidden_size=768, + state_size=16, + num_hidden_layers=32, + layer_norm_epsilon=1e-5, + pad_token_id=0, + bos_token_id=0, + eos_token_id=0, + expand=2, + conv_kernel=4, + use_bias=False, + use_conv_bias=True, + hidden_act="silu", + initializer_range=0.1, + residual_in_fp32=True, + time_step_rank="auto", + time_step_scale=1.0, + time_step_min=0.001, + time_step_max=0.1, + time_step_init_scheme="random", + time_step_floor=1e-4, + rescale_prenorm_residual=False, + use_cache=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.state_size = state_size + self.num_hidden_layers = num_hidden_layers + self.layer_norm_epsilon = layer_norm_epsilon + self.conv_kernel = conv_kernel + self.expand = expand + self.intermediate_size = int(expand * self.hidden_size) + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.use_bias = use_bias + self.use_conv_bias = use_conv_bias + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank + self.time_step_scale = time_step_scale + self.time_step_min = time_step_min + self.time_step_max = time_step_max + self.time_step_init_scheme = time_step_init_scheme + self.time_step_floor = time_step_floor + self.rescale_prenorm_residual = rescale_prenorm_residual + self.residual_in_fp32 = residual_in_fp32 + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/modeling.py new file mode 100644 index 000000000..27e4533cd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/modeling.py @@ -0,0 +1,795 @@ +# coding=utf-8 +# Copyright 2024 state-spaces/mamba org and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
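A minimal sketch of how the derived fields of `MambaConfig` are resolved from the constructor arguments; the import path assumes the vendored `paddlenlp_3.0.0` tree added by this patch:

```python
# Sketch only: the import path is an assumption based on the vendored package layout.
import math

from paddlenlp.transformers.mamba.configuration import MambaConfig

config = MambaConfig(hidden_size=768, expand=2, time_step_rank="auto")

assert config.intermediate_size == 2 * 768            # expand * hidden_size
assert config.time_step_rank == math.ceil(768 / 16)   # "auto" -> ceil(hidden_size / 16) = 48
```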
+"""Paddle MAMBA model.""" + +import math +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import paddle +from paddle import nn +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) + +from ...utils.initializer import constant_, kaiming_uniform_, normal_, uniform_, zeros_ +from ..activations import ACT2FN +from ..model_outputs import ModelOutput +from ..model_utils import PretrainedModel +from .configuration import MambaConfig + +try: + from mamba_ssm_paddle.ops.selective_scan_interface import ( + mamba_inner_fn, + selective_scan_fn, + ) + from mamba_ssm_paddle.ops.triton.selective_state_update import ( + selective_state_update, + ) +except ImportError: + selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None + +try: + from mamba_ssm_paddle.ops.causal_conv1d_interface import ( + causal_conv1d_fn, + causal_conv1d_update, + ) +except ImportError: + causal_conv1d_fn, causal_conv1d_update = None, None + +is_fast_path_available = all( + (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn) +) + +from paddlenlp.utils.log import logger + +######################################################################################################################## + +_CHECKPOINT_FOR_DOC = "state-spaces/mamba-130m-hf" +_CONFIG_FOR_DOC = "MambaConfig" + +__all__ = [ + "MambaMixer", + "MambaBlock", + "MambaModel", + "MambaPretrainedModel", + "MambaForCausalLM", +] + + +class MambaCache: + """ + Arguments: + config: MambaConfig + batch_size: int + dtype: paddle.dtype + + Attributes: + seqlen_offset: int + dtype: paddle.dtype + conv_states: Dict[int, paddle.Tensor] # layer_idx -> [batch_size, intermediate_size, conv_kernel_size] + ssm_states: Dict[int, paddle.Tensor] # layer_idx -> [batch_size, intermediate_size, ssm_state_size] + """ + + def __init__( + self, + config: MambaConfig, + batch_size: int, + dtype: paddle.dtype = paddle.float16, + ): + self.seqlen_offset = 0 + self.dtype = dtype + self.config = config + intermediate_size = config.intermediate_size + ssm_state_size = config.state_size + conv_kernel_size = config.conv_kernel + + self.conv_states = { + i: paddle.zeros([batch_size, intermediate_size, conv_kernel_size], dtype=dtype) + for i in range(config.num_hidden_layers) + } + self.ssm_states = { + i: paddle.zeros([batch_size, intermediate_size, ssm_state_size], dtype=dtype) + for i in range(config.num_hidden_layers) + } + + def reset(self): + """Resets the cache values while preserving the objects""" + for layer_idx in range(self.config.num_hidden_layers): + # In-place ops prevent breaking the static address + self.conv_states[layer_idx].zero_() + self.ssm_states[layer_idx].zero_() + self.seqlen_offset = 0 + + +class MambaMixer(nn.Layer): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. 
+ A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + """ + + def __init__(self, config: MambaConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.ssm_state_size = config.state_size + self.conv_kernel_size = config.conv_kernel + self.intermediate_size = config.intermediate_size + self.time_step_rank = int(config.time_step_rank) + self.layer_idx = layer_idx + self.use_conv_bias = config.use_conv_bias + self.conv1d = nn.Conv1D( + in_channels=self.intermediate_size, + out_channels=self.intermediate_size, + bias_attr=config.use_conv_bias, + kernel_size=config.conv_kernel, + groups=self.intermediate_size, + padding=config.conv_kernel - 1, + ) + + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + + # projection of the input hidden states + self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias_attr=config.use_bias) + # selective projection used to make dt, B and C input dependant + self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias_attr=False) + # time step projection (discretization) + self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias_attr=True) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = paddle.arange(1, self.ssm_state_size + 1, dtype=paddle.float32)[None, :] + A = A.expand([self.intermediate_size, -1]).contiguous() + + self.A_log = self.create_parameter( + shape=A.shape, + default_initializer=nn.initializer.Assign(paddle.log(A)), + ) + self.D = self.create_parameter( + shape=[ + self.intermediate_size, + ], + default_initializer=nn.initializer.Constant(1), + ) + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=config.use_bias) + self.use_bias = config.use_bias + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/JunnYu/mamba/tree/paddle-1.2.2/#installation. " + ) + + def cuda_kernels_forward(self, hidden_states: paddle.Tensor, cache: Optional[MambaCache] = None): + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states).transpose([0, 2, 1]) + + if self.training and cache is None: # Doesn't support outputting the states -> used for training + contextualized_states = mamba_inner_fn( + projected_states, + self.conv1d.weight, + self.conv1d.bias if self.use_conv_bias else None, + self.x_proj.weight, + self.dt_proj.weight, + self.out_proj.weight, + self.out_proj.bias.cast("float32") if self.use_bias else None, + -paddle.exp(self.A_log.cast("float32")), + None, # input-dependent B + None, # input-dependent C + self.D.cast("float32"), + delta_bias=self.dt_proj.bias.cast("float32"), + delta_softplus=True, + is_paddle_linear=True, + ) + + else: + hidden_states, gate = projected_states.chunk(2, axis=1) + + # 2. 
Convolution sequence transformation + conv_weights = self.conv1d.weight.reshape([self.conv1d.weight.shape[0], self.conv1d.weight.shape[2]]) + if cache is not None and cache.seqlen_offset > 0: + hidden_states = causal_conv1d_update( + hidden_states.squeeze(-1), + cache.conv_states[self.layer_idx], + conv_weights, + self.conv1d.bias, + self.activation, + ) + hidden_states = hidden_states.unsqueeze(-1) + else: + if cache is not None: + conv_states = nn.functional.pad( + hidden_states, + (self.conv_kernel_size - hidden_states.shape[-1], 0), + data_format="NCL", + ) + cache.conv_states[self.layer_idx].copy_(conv_states.cast(cache.dtype), False) + hidden_states = causal_conv1d_fn( + hidden_states, conv_weights, self.conv1d.bias, activation=self.activation + ) + + # 3. State Space Model sequence transformation + # 3.a. input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose([0, 2, 1])) + time_step, B, C = paddle.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], axis=-1 + ) + discrete_time_step = self.dt_proj.weight.t() @ time_step.transpose([0, 2, 1]) + + A = -paddle.exp(self.A_log.cast("float32")) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = self.dt_proj.bias.cast("float32") if hasattr(self.dt_proj, "bias") else None + if cache is not None and cache.seqlen_offset > 0: + scan_outputs = selective_state_update( + cache.ssm_states[self.layer_idx], + hidden_states[..., 0], + discrete_time_step[..., 0], + A, + B[:, 0], + C[:, 0], + self.D, + gate[..., 0], + time_proj_bias, + dt_softplus=True, + ).unsqueeze(-1) + else: + scan_outputs, ssm_state = selective_scan_fn( + hidden_states, + discrete_time_step, + A, + B.transpose([0, 2, 1]), + C.transpose([0, 2, 1]), + self.D.cast("float32"), + gate, + time_proj_bias, + delta_softplus=True, + return_last_state=True, + ) + if ssm_state is not None and cache is not None: + cache.ssm_states[self.layer_idx].copy_(ssm_state.cast(cache.dtype), False) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose([0, 2, 1])) + return contextualized_states + + # fmt: off + def slow_forward(self, input_states, cache: Optional[MambaCache] = None): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + # 1. Gated MLP's linear projection + projected_states = self.in_proj(input_states).transpose([0, 2, 1]) # [batch, 2 * intermediate_size, seq_len] + hidden_states, gate = projected_states.chunk(2, axis=1) + + # 2. 
Convolution sequence transformation + if cache is not None: + ssm_state = cache.ssm_states[self.layer_idx].clone() + if cache.seqlen_offset > 0: + conv_state = cache.conv_states[self.layer_idx] # [batch, intermediate_size, conv_kernel_size] + conv_state = paddle.roll(conv_state, shifts=-1, axis=-1) + conv_state[:, :, -1] = hidden_states[:, :, 0] + cache.conv_states[self.layer_idx].copy_(conv_state.cast(cache.dtype), False) + hidden_states = paddle.sum(conv_state * self.conv1d.weight[:, 0, :], axis=-1) + if self.use_conv_bias: + hidden_states += self.conv1d.bias + hidden_states = self.act(hidden_states).cast(dtype).unsqueeze(-1) # [batch, intermediate_size, 1] : decoding + else: + conv_state = nn.functional.pad( + hidden_states, + (self.conv_kernel_size - hidden_states.shape[-1], 0), + data_format="NCL", + ) + cache.conv_states[self.layer_idx].copy_(conv_state.cast(cache.dtype), False) + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] + else: + ssm_state = paddle.zeros( + (batch_size, self.intermediate_size, self.ssm_state_size), + dtype=dtype, + ) + hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len]) # [batch, intermediate_size, seq_len] + + # 3. State Space Model sequence transformation + # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2] + ssm_parameters = self.x_proj(hidden_states.transpose([0, 2, 1])) + time_step, B, C = paddle.split( + ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], axis=-1 + ) + discrete_time_step = self.dt_proj(time_step) # [batch, seq_len, intermediate_size] + discrete_time_step = nn.functional.softplus(discrete_time_step).transpose([0, 2, 1]) # [batch, intermediate_size, seq_len] + + # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM) + A = -paddle.exp(self.A_log.cast("float32")) # [intermediate_size, ssm_state_size] + discrete_A = paddle.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None]) # [batch, intermediate_size, seq_len, ssm_state_size] + discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].cast("float32") # [batch, intermediade_size, seq_len, ssm_state_size] + deltaB_u = discrete_B * hidden_states[:, :, :, None].cast("float32") + + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + scan_outputs = [] + for i in range(seq_len): + ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :] # [batch, intermediade_size, ssm_state] + scan_output = paddle.matmul(ssm_state.cast(dtype), C[:, i, :].unsqueeze(-1)) # [batch, intermediade_size, 1] + scan_outputs.append(scan_output[:, :, 0]) + scan_output = paddle.stack(scan_outputs, axis=-1) # [batch, seq_len, intermediade_size] + scan_output = scan_output + (hidden_states * self.D[None, :, None]) + scan_output = (scan_output * self.act(gate)) + + if cache is not None: + cache.ssm_states[self.layer_idx].copy_(ssm_state.cast(cache.dtype), False) + + # 4. 
Final linear projection + contextualized_states = self.out_proj(scan_output.transpose([0, 2, 1])) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward(self, hidden_states, cache: Optional[MambaCache] = None): + if is_fast_path_available: + return self.cuda_kernels_forward(hidden_states, cache) + return self.slow_forward(hidden_states, cache) + + +class MambaRMSNorm(nn.Layer): + def __init__(self, hidden_size, eps=1e-6): + """ + MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm + """ + super().__init__() + self.weight = self.create_parameter( + shape=[ + hidden_size, + ], + default_initializer=nn.initializer.Constant(1), + ) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.cast(paddle.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.cast(input_dtype) + + def extra_repr(self): + return f"{self.weight.shape[0]}, eps={self.variance_epsilon}" + + +class MambaBlock(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.residual_in_fp32 = config.residual_in_fp32 + self.norm = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.mixer = MambaMixer(config, layer_idx=layer_idx) + + def forward(self, hidden_states, cache: Optional[MambaCache] = None): + residual = hidden_states + hidden_states = self.norm(hidden_states.cast(dtype=self.norm.weight.dtype)) + if self.residual_in_fp32: + residual = residual.cast(paddle.float32) + + hidden_states = self.mixer(hidden_states, cache=cache) + hidden_states = residual + hidden_states + return hidden_states + + +class MambaPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MambaConfig + base_model_prefix = "backbone" + _no_split_modules = ["MambaBlock"] + supports_gradient_checkpointing = True + + @classmethod + def _get_name_mappings(cls, config: MambaConfig) -> List[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embeddings.weight"], + ["norm_f.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.norm.weight"], + [f"layers.{layer_index}.mixer.A_log"], + [f"layers.{layer_index}.mixer.D"], + [f"layers.{layer_index}.mixer.conv1d.weight"], + [f"layers.{layer_index}.mixer.in_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mixer.x_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mixer.dt_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mixer.out_proj.weight", None, "transpose"], + ] + layer_mappings.append([f"layers.{layer_index}.mixer.dt_proj.bias"]) + + if config.use_conv_bias: + layer_mappings.append([f"layers.{layer_index}.mixer.conv1d.bias"]) + if config.use_bias: + layer_mappings.append([f"layers.{layer_index}.mixer.in_proj.bias"]) + layer_mappings.append([f"layers.{layer_index}.mixer.out_proj.bias"]) + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + + # base-model prefix "MambaModel" + if "MambaModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "backbone." + mapping[0] + mapping[1] = "backbone." 
+ mapping[1] + if not config.tie_word_embeddings: + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def post_init(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). + """ + self.init_weights() + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, MambaMixer): + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale + if self.config.time_step_init_scheme == "constant": + constant_(module.dt_proj.weight, dt_init_std) + elif self.config.time_step_init_scheme == "random": + uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std) + + dt = paddle.exp( + paddle.rand((self.config.intermediate_size,), dtype="float32").cast(paddle.get_default_dtype()) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + ).clip(min=self.config.time_step_floor) + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + paddle.log(-paddle.expm1(-dt)) + with paddle.no_grad(): + module.dt_proj.bias.copy_(inv_dt, False) + module.dt_proj.bias._no_reinit = True + + if isinstance(module, nn.Linear): + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + zeros_(module.bias) + elif isinstance(module, nn.Embedding): + normal_(module.weight, std=self.config.initializer_range) + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + kaiming_uniform_(p, a=math.sqrt(5)) + with paddle.no_grad(): + p.copy_(p / math.sqrt(self.config.num_layers), False) + + +@dataclass +class MambaOutput(ModelOutput): + """ + Class for the MAMBA model outputs. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cache (`MambaCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. 
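The `inv_dt` line above relies on the softplus inverse `dt + log(-expm1(-dt))`; a quick standalone numerical check of that identity (not part of the patch itself):

```python
import paddle
import paddle.nn.functional as F

dt = paddle.uniform([8], min=0.001, max=0.1)   # same range as the time_step_min/max defaults
inv_dt = dt + paddle.log(-paddle.expm1(-dt))   # inverse of softplus for dt > 0
print(bool(paddle.allclose(F.softplus(inv_dt), dt, atol=1e-6)))  # True
```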
+ + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[paddle.Tensor] = None + cache: Optional[MambaCache] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MambaCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + cache (`MambaCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + loss: Optional[paddle.Tensor] = None + logits: Optional[paddle.Tensor] = None + cache: Optional[MambaCache] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +MAMBA_START_DOCSTRING = r""" + + This model inherits from [`PretrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [paddle.nn.Layer](https://pypaddle.org/docs/stable/nn.html#paddle.nn.Layer) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`MambaConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PretrainedModel.from_pretrained`] method to load the model weights. +""" + +MAMBA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, input_ids_length)`): + Indices of input sequence tokens in the vocabulary. + + If `cache.seqlen_offset>0`, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PretrainedTokenizer.encode`] and + [`PretrainedTokenizer.__call__`] for details. 
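A hedged usage sketch of the cache-based incremental decoding described above. The checkpoint name is taken from `_CHECKPOINT_FOR_DOC`; whether it resolves in a given environment, and the import paths, are assumptions:

```python
import paddle
from paddlenlp.transformers.mamba.modeling import MambaForCausalLM
from paddlenlp.transformers.mamba.tokenizer import MambaTokenizer

# Assumption: the checkpoint is reachable (e.g. through a hub mirror) in this environment.
name = "state-spaces/mamba-130m-hf"
tokenizer = MambaTokenizer.from_pretrained(name)
model = MambaForCausalLM.from_pretrained(name)
model.eval()

inputs = tokenizer("Hello", return_tensors="pd")
with paddle.no_grad():
    out = model(input_ids=inputs["input_ids"], use_cache=True, return_dict=True)
    # Subsequent steps only need the newly generated token plus the returned cache.
    next_token = out.logits[:, -1, :].argmax(axis=-1, keepdim=True)
    out = model(input_ids=next_token, cache=out.cache, use_cache=True, return_dict=True)
```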
+ + [What are input IDs?](../glossary#input-ids) + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + cache (`MambaCache`, *optional*): + If passed along, the model uses the previous state in all the blocks (which will give the output for the + `input_ids` provided as if the model add `state_input_ids + input_ids` as context). + use_cache (`bool`, *optional*): + If set to `True`, the `cache` is returned and can be used to quickly generate the next logits. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class MambaModel(MambaPretrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.LayerList([MambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) + + self.enable_recompute = False + self.norm_f = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + cache: Optional[MambaCache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, # `attention_mask` is passed by the tokenizer and we don't want it + ) -> Union[Tuple, MambaOutput]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if self.enable_recompute and self.training and use_cache: + use_cache = False + + if cache is None and use_cache: + cache = MambaCache(self.config, inputs_embeds.shape[0], dtype=inputs_embeds.dtype) + + hidden_states = inputs_embeds + all_hidden_states = () if output_hidden_states else None + for mixer_block in self.layers: + if self.enable_recompute and self.training and not hidden_states.stop_gradient: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(mixer_block), + hidden_states, + cache, + ) + else: + hidden_states = mixer_block(hidden_states, cache=cache) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if use_cache: + cache.seqlen_offset += inputs_embeds.shape[1] + + hidden_states = 
self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache, all_hidden_states] if v is not None) + + return MambaOutput( + last_hidden_state=hidden_states, + cache=cache if use_cache else None, + hidden_states=all_hidden_states, + ) + + +class MambaForCausalLM(MambaPretrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.backbone = MambaModel(config) + if self.config.tie_word_embeddings: + self.lm_head = lambda x: paddle.matmul(x, self.backbone.embeddings.weight, transpose_y=True) + else: + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + if self.config.tie_word_embeddings: + return None + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + if self.config.tie_word_embeddings: + return None + self.lm_head = new_embeddings + + def get_input_embeddings(self): + return self.backbone.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + return self.backbone.set_input_embeddings(new_embeddings) + + def update_model_kwargs_for_generation( + self, outputs: ModelOutput, model_kwargs: Dict[str, Any], **kwargs + ) -> Dict[str, Any]: + model_kwargs["cache"] = outputs.get("cache", None) + return model_kwargs + + def prepare_inputs_for_generation( + self, + input_ids, + inputs_embeds=None, + use_cache=True, + cache: Optional[MambaCache] = None, + **kwargs, + ): + # only last token for inputs_ids if the state is passed along. + if cache is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + + if inputs_embeds is not None and cache is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs["cache"] = cache + model_inputs["use_cache"] = use_cache + return model_inputs + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + cache: Optional[MambaCache] = None, + labels: Optional[paddle.Tensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + **kwargs, # for now we need this for generation + ) -> Union[Tuple, MambaCausalLMOutput]: + r""" + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + mamba_outputs = self.backbone( + input_ids, + cache=cache, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=use_cache, + ) + hidden_states = mamba_outputs[0] + + logits = self.lm_head(hidden_states.cast(self.get_input_embeddings().weight.dtype)).cast("float32") + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + # shift_logits = logits[..., :-1, :] + # shift_labels = labels[..., 1:] + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.reshape([-1, logits.shape[-1]]), + labels.reshape( + [ + -1, + ] + ), + ) + + if not return_dict: + output = (logits,) + mamba_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MambaCausalLMOutput( + loss=loss, + logits=logits, + cache=mamba_outputs.cache, + hidden_states=mamba_outputs.hidden_states, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/tokenizer.py new file mode 100644 index 000000000..440188f68 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mamba/tokenizer.py @@ -0,0 +1,365 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import os +import shutil +from functools import lru_cache +from typing import Dict, Optional, Union + +import numpy as np +from paddle.utils import try_import + +from .. import AddedToken, PretrainedTokenizer +from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy + +__all__ = ["MambaTokenizer"] + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. 
+ """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class MambaTokenizer(PretrainedTokenizer): + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + pretrained_resource_files_map = {"vocab_file": {}, "merges_file": {}} + pretrained_init_configuration = {} + model_input_names = ["input_ids"] + + def __init__( + self, + vocab_file=None, + merges_file=None, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + pad_token=None, + add_bos_token=False, + add_eos_token=False, + add_prefix_space=False, + max_length=None, + errors="replace", + **kwargs, + ): + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + ) + + # NOTE: add special tokens to the vocab + value = kwargs.pop("added_tokens_decoder", {}) + additional_special_tokens = [] + for _, token_kwargs in value.items(): + if isinstance(token_kwargs, AddedToken): + content = token_kwargs + else: + content = AddedToken(**token_kwargs) + additional_special_tokens.append(content) + if len(additional_special_tokens) > 0: + self._build_special_tokens_map_extended( + additional_special_tokens=additional_special_tokens, + ) + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.max_length = max_length if max_length is not None else int(1e12) + self.num_command_tokens = 2 + self.num_type_tokens = 2 + + with open(vocab_file, "r", encoding="utf-8") as f: + self.encoder = json.load(f) + + self.decoder = {v: k for k, v in self.encoder.items()} + + self.num_tokens = len(self.encoder) + self.num_text_tokens = self.num_tokens - 1 + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + + with open(merges_file, encoding="utf-8") as f: + bpe_data = f.read().split("\n")[1:-1] + + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + + re = try_import("regex") + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + super().__init__(**kwargs) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. 
+ + """ + return len(self.encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + return self.decoder[index] + + def convert_ids_to_string(self, ids): + """ + Converts a single index or a sequence of indices to texts. + + Args: + ids (int|List[int]): + The token id (or token ids) to be converted to text. + + Returns: + str: The decoded text. + + Example: + .. code-block:: + + from paddlenlp.transformers import MambaTokenizer + tokenizer = MambaTokenizer.from_pretrained('state-spaces/mamba-2.8b-hf') + print(tokenizer.convert_ids_to_string([21096, 281, 897, 367, 17014, 49, 17014, 285, 367, 17014, 47, 13010])) + # 'Welcome to use PaddlePaddle and PaddleNLP' + + """ + + text = "".join([self.decoder[id] for id in ids]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + + return text + + def save_resources(self, save_directory): + """ + Saves `SentencePiece `__ file + (ends with '.spm') under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (string) in a single string. 
+ """ + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + if self.add_eos_token: + eos_token_ids = [self.eos_token_id] + else: + eos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is None: + return output + eos_token_ids + + return output + bos_token_ids + token_ids_1 + eos_token_ids + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs + + def decode( + self, + token_ids, + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + spaces_between_special_tokens: bool = False, + **kwargs, + ) -> str: + return super().decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=spaces_between_special_tokens, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/configuration.py new file mode 100644 index 000000000..2876d8821 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/configuration.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
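The 3-D branch of `_pad` above left-pads a `[1, seq_len, seq_len]` attention mask along both sequence axes; the same `np.pad` call in isolation:

```python
import numpy as np

mask = np.ones([1, 3, 3], dtype="int64")   # [1, seq_len, seq_len]
difference = 2                             # max_length - len(required_input)
padded = np.pad(mask,
                pad_width=[(0, 0), (difference, 0), (difference, 0)],
                mode="constant", constant_values=0)
print(padded.shape)  # (1, 5, 5): zeros are added on the "past" side of both axes
```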
+# See the License for the specific language governing permissions and +# limitations under the License. +""" MBart model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["MBART_PRETRAINED_INIT_CONFIGURATION", "MBartConfig", "MBART_PRETRAINED_RESOURCE_FILES_MAP"] + +MBART_PRETRAINED_INIT_CONFIGURATION = { + "mbart-large-cc25": { + "vocab_size": 250027, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.1, + "activation_function": "gelu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": True, + }, + "mbart-large-en-ro": { + "vocab_size": 250027, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "decoder_start_token_id": 250020, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.1, + "activation_function": "gelu", + "attention_dropout": 0.1, + "activation_dropout": 0.0, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": True, + }, + "mbart-large-50-one-to-many-mmt": { + "vocab_size": 250054, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "decoder_start_token_id": 2, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.1, + "activation_function": "relu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": True, + }, + "mbart-large-50-many-to-one-mmt": { + "vocab_size": 250054, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "decoder_start_token_id": 2, + "forced_bos_token_id": 250004, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.1, + "activation_function": "relu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": True, + }, + "mbart-large-50-many-to-many-mmt": { + "vocab_size": 250054, + "bos_token_id": 0, + "pad_token_id": 1, + "eos_token_id": 2, + "decoder_start_token_id": 2, + "d_model": 1024, + "num_encoder_layers": 12, + "num_decoder_layers": 12, + "encoder_attention_heads": 16, + "decoder_attention_heads": 16, + "encoder_ffn_dim": 4096, + "decoder_ffn_dim": 4096, + "dropout": 0.1, + "activation_function": "relu", + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "max_position_embeddings": 1024, + "init_std": 0.02, + "scale_embedding": True, + }, +} + +MBART_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "mbart-large-cc25": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart/mbart-large-cc25.pdparams", + "mbart-large-en-ro": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart/mbart-large-en-ro.pdparams", + "mbart-large-50-one-to-many-mmt": 
"https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-one-to-many-mmt.pdparams", + "mbart-large-50-many-to-one-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-one-mmt.pdparams", + "mbart-large-50-many-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-many-mmt.pdparams", + } +} + + +class MBartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MBartModel`]. It is used to instantiate a MBART + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the MBART mbart-large-cc25 architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in `MBartModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `MBartModel`. + Defaults to 50265. + bos_token (int, optional): + The beginning of sequence token that was used during pretraining. Can be + used a sequence classifier token. + Defaults to `0`. + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `1`. + eos_token (int, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `2`. + d_model (int, optional): + Dimensionality of the embedding layer, encoder layer and decoder layer. Defaults to `768`. + num_encoder_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `6`. + num_decoder_layers (int, optional): + Number of hidden layers in the Transformer decoder. Defaults to `6`. + encoder_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + decoder_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer decoder. + Defaults to `12`. + encoder_ffn_dim (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `d_model` to `encoder_ffn_dim`, + and then projected back to `d_model`. Typically `encoder_ffn_dim` is larger than `d_model`. + Defaults to `3072`. + decoder_ffn_dim (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `d_model` to `decoder_ffn_dim`, + and then projected back to `d_model`. Typically `decoder_ffn_dim` is larger than `d_model`. + Defaults to `3072`. + dropout (float, optional): + The dropout probability used in all fully connected layers (pre-process and post-process of MHA and FFN sub-layer) + in the encoders and decoders. Defaults to `0.1`. + activation_function (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. + Defaults to `"gelu"`. + attention_dropout (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers and decoder layers to drop some attention target. + Defaults to `0.1`. 
+ activation_dropout (float, optional): + The dropout probability used after FFN activation in all encoder layers and decoder layers. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `1024`. + init_std (float, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + Default to `0.02`. + num_labels (`int`, optional): + The number of labels to use in [`BartForSequenceClassification`]. Defaults to 3. + forced_eos_token_id (`int`, optional): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. Defaults to 2. + scale_embedding (`bool`, optional): + Scale embeddings by diving by sqrt(d_model). Default to `True`. + + """ + model_type = "mbart" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map: Dict[str, str] = { + "num_encoder_layers": "encoder_layers", + "num_decoder_layers": "decoder_layers", + "num_classes": "num_labels", + } + pretrained_init_configuration = MBART_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 50265, + bos_token_id: int = 0, + pad_token_id: int = 1, + eos_token_id: int = 2, + forced_eos_token_id: int = 2, + d_model: int = 768, + encoder_layers: int = 12, + decoder_layers: int = 12, + encoder_attention_heads: int = 16, + decoder_attention_heads: int = 16, + encoder_ffn_dim: int = 4096, + decoder_ffn_dim: int = 4096, + dropout: float = 0.1, + activation_function: str = "gelu", + attention_dropout: float = 0.0, + activation_dropout: float = 0.0, + max_position_embeddings: int = 1024, + init_std: float = 0.02, + is_encoder_decoder: bool = True, + scale_embedding: bool = True, + **kwargs + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.scale_embedding = scale_embedding diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/modeling.py new file mode 100644 index 000000000..34ccf2f3a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/modeling.py @@ -0,0 +1,1150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
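A hedged instantiation sketch for `MBartConfig`; the aliasing of `num_encoder_layers` to `encoder_layers` is expected to go through the `attribute_map` declared above, and the top-level export is an assumption:

```python
from paddlenlp.transformers import MBartConfig  # assumed top-level export

config = MBartConfig(vocab_size=250027, d_model=1024,
                     encoder_layers=12, decoder_layers=12,
                     encoder_ffn_dim=4096, decoder_ffn_dim=4096)
print(config.encoder_layers)      # 12
print(config.num_encoder_layers)  # 12, resolved through attribute_map
print(config.scale_embedding)     # True (default)
```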
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Embedding, Layer, MultiHeadAttention + +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + ModelOutput, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, + convert_encoder_output, +) +from .configuration import ( + MBART_PRETRAINED_INIT_CONFIGURATION, + MBART_PRETRAINED_RESOURCE_FILES_MAP, + MBartConfig, +) + +__all__ = [ + "MBartModel", + "MBartPretrainedModel", + "MBartEncoder", + "MBartDecoder", + "MBartClassificationHead", + "MBartForSequenceClassification", + "MBartForQuestionAnswering", + "MBartForConditionalGeneration", +] + +Cache = MultiHeadAttention.Cache +StaticCache = MultiHeadAttention.StaticCache + + +def shift_tokens_right(input_ids, pad_token_id): + """ + Shift input ids one token to the right, and wrap the last non pad token (the token) + """ + shifted_input_ids = input_ids.clone() + input_flat = paddle.flatten(shifted_input_ids) + batch_size, seq_length = shifted_input_ids.shape + index = paddle.arange(0, batch_size, 1, dtype="int32") * seq_length + index_of_eos = paddle.cast(shifted_input_ids != pad_token_id, dtype="int32").sum(axis=-1) - 1 + decoder_start_tokens = paddle.gather(input_flat, index + index_of_eos.astype(index.dtype)) + shifted_input_ids[:, 1:] = shifted_input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_tokens + return shifted_input_ids + + +class MBartPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained MBart models. It provides MBart related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = MBART_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = MBART_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "mbart" + config_class = MBartConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + + +class MBartLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings, embedding_dim): + # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. 
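A toy check of `shift_tokens_right` above: the last non-pad token is wrapped around to position 0 and everything else shifts right by one (the import path assumes the vendored tree):

```python
import paddle
from paddlenlp.transformers.mbart.modeling import shift_tokens_right

input_ids = paddle.to_tensor([[250004, 47, 18, 2, 1, 1]])  # [lang_code, ..., eos=2, pad=1, pad=1]
print(shift_tokens_right(input_ids, pad_token_id=1))
# [[2, 250004, 47, 18, 2, 1]]: eos, the last non-pad token, becomes the decoder start token
```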
Other models dont have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape: Tuple, past_key_values_length: int = 0) -> Tensor: + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = paddle.arange(past_key_values_length, past_key_values_length + seq_len, dtype="int64") + return Embedding.forward(self, positions + self.offset) + + +class MBartEncoder(MBartPretrainedModel): + """ + The Transformer Encoder of MBartModel. The arguments of MBartEncoder can see :class:`MBartModel`. + """ + + def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.d_model = config.d_model + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + + self.embed_scale = (config.d_model**0.5) if config.scale_embedding else 1.0 + self.encoder_embed_positions = MBartLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model) + + self.encoder_dropout = nn.Dropout(config.dropout) + self.encoder_layernorm_embedding = nn.LayerNorm(config.d_model) + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.d_model, + nhead=config.encoder_attention_heads, + dim_feedforward=config.encoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=True, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers, nn.LayerNorm(config.d_model)) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + """ + The MBartEncoder forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + See :class:`MBartModel`. + attention_mask (Tensor, optional): + See :class:`MBartModel`. + input_embeds (Tensor, optional): + See :class:`MBartModel`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `encoder_outputs` which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
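+
+        As a rough usage sketch (the checkpoint name below is only illustrative), the encoder can
+        also be run on its own, for example via :meth:`MBartModel.get_encoder`:
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import MBartModel, MBartTokenizer
+
+                tokenizer = MBartTokenizer.from_pretrained('mbart-large-cc25')
+                model = MBartModel.from_pretrained('mbart-large-cc25')
+                encoder = model.get_encoder()
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                # Returns the last encoder hidden states, shaped [batch_size, seq_len, d_model].
+                encoder_output = encoder(**inputs)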
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + inputs_embed_pos = self.encoder_embed_positions(input_shape) + hidden_states = inputs_embeds + inputs_embed_pos + hidden_states = self.encoder_layernorm_embedding(hidden_states) + encoder_input = self.encoder_dropout(hidden_states) + + if attention_mask is None and input_ids is not None: + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + encoder_output = self.encoder( + encoder_input, + src_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return encoder_output + + +class MBartDecoder(MBartPretrainedModel): + """ + The Transformer Decoder of MBartModel. The arguments of MBartDecoder can see :class:`MBartModel`. + """ + + def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.d_model = config.d_model + self.init_std = config.init_std + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + self.embed_scale = (config.d_model**0.5) if config.scale_embedding else 1.0 + self.decoder_embed_positions = MBartLearnedPositionalEmbedding(config.max_position_embeddings, config.d_model) + self.decoder_dropout = nn.Dropout(config.dropout) + self.decoder_layernorm_embedding = nn.LayerNorm(config.d_model) + + decoder_layer = nn.TransformerDecoderLayer( + d_model=config.d_model, + nhead=config.decoder_attention_heads, + dim_feedforward=config.decoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=True, + ) + self.decoder = nn.TransformerDecoder(decoder_layer, config.decoder_layers, nn.LayerNorm(config.d_model)) + + def forward( + self, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + memory_mask: Optional[Tensor] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + """ + The MBartDecoder forward method, overrides the `__call__()` special method. 
+ + Args: + decoder_input_ids (Tensor, optional): + See :class:`MBartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`MBartModel`. + encoder_output (Tensor, optional): + See :class:`MBartModel`. + memory_mask (Tensor, optional): + See :class:`MBartModel`. + cache (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_outputs` which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # retrieve input_ids and inputs_embeds + if decoder_input_ids is not None and decoder_inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif decoder_input_ids is not None: + decoder_input_shape = decoder_input_ids.shape + decoder_input_ids = decoder_input_ids.reshape((-1, decoder_input_shape[-1])) + elif decoder_inputs_embeds is not None: + decoder_input_shape = decoder_inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if decoder_attention_mask is None: + + decoder_length = decoder_input_shape[-1] + decoder_attention_mask = paddle.tensor.triu( + (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 + ) + if decoder_inputs_embeds is None: + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale + + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.decoder_embed_positions(decoder_input_shape, past_key_values_length) + + hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos + hidden_states = self.decoder_layernorm_embedding(hidden_states) + decoder_input = self.decoder_dropout(hidden_states) + + decoder_output = self.decoder( + tgt=decoder_input, + memory=encoder_output, + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return decoder_output + + +@register_base_model +class MBartModel(MBartPretrainedModel): + r""" + The bare MBart Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. 
Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + Args: + config (:class:`MBartConfig`): + An instance of MBartConfig used to construct MBartModel. + """ + + def __init__(self, config: MBartConfig): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.decoder_start_token_id = config.decoder_start_token_id + self.shared = nn.Embedding(config.vocab_size, config.d_model) + self.encoder = MBartEncoder(config, self.shared) + + self.decoder = MBartDecoder(config, self.shared) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The MBartModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + encoder_output (tuple, optional): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is `[batch_size, sequence_length, hidden_size]`. 
+ `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + cache (list, optional): + It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. + See `TransformerDecoder.gen_cache `__ for more details. + It is only used for inference and should be None for training. + Default to `None`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False`, + returns tensor `decoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MBartModel, MBartTokenizer + + tokenizer = MBartTokenizer.from_pretrained('bart-base') + model = MBartModel.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # different to other models, MBart automatically creates decoder_input_ids from + # input MBartForSequenceClassification_ids if no decoder_input_ids are provided + if input_ids is None and inputs_embeds is None and encoder_output is None: + raise ValueError("You have to specify one of input_ids, inputs_embeds and encoder_output") + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + decoder_input_ids = shift_tokens_right(input_ids, self.pad_token_id) + if attention_mask is None and input_ids is not None: + logger.warning("input_ids should be specified when generating attention_mask") + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + input_type = type(decoder_input_ids) if decoder_input_ids is not None else type(decoder_inputs_embeds) + + if encoder_output is None: + encoder_output = self.encoder( + input_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_output, ModelOutput): + if isinstance(encoder_output, input_type): + encoder_output = (encoder_output,) + encoder_output = convert_encoder_output(encoder_output) + if isinstance(encoder_output, input_type): + encoder_last_hidden_state = encoder_output + else: + encoder_last_hidden_state = encoder_output[0] + + if use_cache: + if cache is None: + cache = self.decoder.decoder.gen_cache(encoder_last_hidden_state) + else: + cache = None + + memory_mask = attention_mask + if attention_mask is not None: + if attention_mask.ndim == 4: + memory_mask = attention_mask[:, :, -1:, :] + elif attention_mask.ndim == 3: + memory_mask = attention_mask[:, -1:, :].unsqueeze([1]) + elif attention_mask.ndim == 2: + memory_mask = attention_mask.unsqueeze([1, 2]) + else: + raise ValueError("Invalid attention mask shape. 
") + + decoder_output = self.decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_last_hidden_state, + memory_mask, + cache, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + if isinstance(decoder_output, input_type): + decoder_output = (decoder_output,) + if isinstance(encoder_output, input_type): + encoder_output = (encoder_output,) + return decoder_output + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_output.last_hidden_state, + past_key_values=decoder_output.past_key_values, + decoder_hidden_states=decoder_output.hidden_states, + decoder_attentions=decoder_output.attentions, + cross_attentions=decoder_output.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + + +class MBartClassificationHead(Layer): + """ + Head for sentence-level classification tasks. + """ + + def __init__(self, input_dim: int, inner_dim: int, num_classes: int, pooler_dropout: float): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: Tensor): + """ + Args: + hidden_states (Tensor): + Hidden states of the classification model. + """ + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = F.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class MBartForSequenceClassification(MBartPretrainedModel): + r""" + MBart Model with a linear layer on top of the pooled output, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`MBartConfig`): + An instance of MBartConfig used to construct MBartForSequenceClassification. + """ + + def __init__(self, config: MBartConfig): + super().__init__(config) + self.mbart = MBartModel(config) + self.classifier = MBartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout if config.classifier_dropout is not None else config.dropout, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The MBartForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`MBartModel`. + attention_mask (Tensor, optional): + See :class:`MBartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`MBartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`MBartModel`. + encoder_output (Tensor, optonal): + See :class:`MBartModel`. + use_cache (bool, optional): + See :class:`MBartModel`. + cache (Tensor, optional): + See :class:`MBartModel`. 
+ inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + labels (Tensor, optional): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + num_labels - 1]`. If `num_labels > 1` a classification loss is computed (Cross-Entropy). + Default to `None`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. + + Returns: + `An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqSequenceClassifierOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MBartForSequenceClassification, MBartTokenizer + + tokenizer = MBartTokenizer.from_pretrained('bart-base') + model = MBartForSequenceClassification.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + + if input_ids is None and inputs_embeds is not None: + logger.warning( + f"{self.__class__.__name__} will not detect eos tokens in `inputs_embeds`. 
Results may be " + "unexpected if using eos tokens in conjunction with `inputs_embeds.`" + ) + + outputs = self.mbart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = outputs[0] + output_shape = output.shape + if input_ids is not None: + eos_mask = paddle.cast(input_ids == self.mbart.config.eos_token_id, dtype="int64") + if len(paddle.unique(paddle.sum(eos_mask, axis=1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + + # TODO(gongenlei): support bool tensor index + output = output.masked_select(eos_mask.unsqueeze(-1).astype("bool").tile([1, 1, output_shape[-1]])) + sentence_representation = output.reshape([output_shape[0], -1, output_shape[-1]])[:, -1, :] + logits = self.classifier(sentence_representation) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + if len(outputs) == 2: + return (loss, logits) if loss is not None else logits + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class MBartForQuestionAnswering(MBartPretrainedModel): + r""" + MBart Model with a linear layer on top of the hidden-states output to + compute `span_start_logits` and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`MBartConfig`): + An instance of MBartConfig used to construct MBartForQuestionAnswering. 
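+
+    A rough sketch of turning the predicted logits into an answer span (the checkpoint name is
+    only illustrative; see :meth:`forward` for the full output description):
+
+    Example:
+        .. code-block::
+
+            import paddle
+            from paddlenlp.transformers import MBartForQuestionAnswering, MBartTokenizer
+
+            tokenizer = MBartTokenizer.from_pretrained('mbart-large-cc25')
+            model = MBartForQuestionAnswering.from_pretrained('mbart-large-cc25')
+
+            inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+            inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+            start_logits, end_logits = model(**inputs)[:2]
+
+            # Greedy span decoding: pick the highest-scoring start and end positions.
+            start_index = int(paddle.argmax(start_logits, axis=-1)[0])
+            end_index = int(paddle.argmax(end_logits, axis=-1)[0])
+            answer_ids = inputs["input_ids"][0, start_index:end_index + 1]
+            print(tokenizer.convert_ids_to_string(answer_ids.tolist()))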
+ """ + + def __init__(self, config: MBartConfig): + super().__init__(config) + self.mbart = MBartModel(config) + self.classifier = nn.Linear(config.d_model, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The MBartForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`MBartModel`. + attention_mask (Tensor, optional): + See :class:`MBartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`MBartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`MBartModel`. + encoder_output (Tensor, optonal): + See :class:`MBartModel`. + inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + use_cache (bool, optional): + See :class:`MBartModel`. + cache (Tensor, optional): + See :class:`MBartModel`. + start_positions (Tensor, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + end_positions (Tensor, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (sequence_length). Position outside of the sequence + are not taken into account for computing the loss. + A tensor of shape `(batch_size, )`. Default to `None`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqQuestionAnsweringModelOutput`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `start_positions=end_positions=None`, + returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MBartForQuestionAnswering, MBartTokenizer + + tokenizer = MBartTokenizer.from_pretrained('bart-base') + model = MBartForQuestionAnswering.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + start_logits = outputs[0] + end_logits =outputs[1] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + logger.warning( + "The `use_cache` argument is changed to `False` since `start_positions` and `end_positions` are provided." + ) + use_cache = False + outputs = self.mbart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = self.classifier(outputs[0]) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + outputs = (start_logits, end_logits) + (outputs[1:] if len(outputs) > 2 else ()) + return ((total_loss,) + outputs) if total_loss else outputs + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class MBartForConditionalGeneration(MBartPretrainedModel): + r""" + MBart Model with a `language modeling` head on top. + + Args: + config (:class:`MBartConfig`): + An instance of MBartConfig used to construct MBartForConditionalGeneration. 
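+
+    A rough generation sketch (the checkpoint name and target language are illustrative; it assumes
+    the standard paddlenlp `generate()` API inherited from the generation mixin):
+
+    Example:
+        .. code-block::
+
+            import paddle
+            from paddlenlp.transformers import MBartForConditionalGeneration, MBartTokenizer
+
+            tokenizer = MBartTokenizer.from_pretrained('mbart-large-cc25', src_lang="en_XX")
+            model = MBartForConditionalGeneration.from_pretrained('mbart-large-cc25')
+
+            inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+            input_ids = paddle.to_tensor([inputs["input_ids"]])
+
+            # Force the target language code as the first generated token (MBart convention).
+            output_ids, scores = model.generate(
+                input_ids=input_ids,
+                forced_bos_token_id=tokenizer.lang_code_to_id["ro_RO"],
+                decode_strategy="beam_search",
+                num_beams=4,
+                max_length=50,
+            )
+            print(tokenizer.convert_ids_to_string(output_ids[0].tolist()))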
+ """ + + def __init__(self, config: MBartConfig): + super().__init__(config) + self.mbart = MBartModel(config) + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], dtype=self.mbart.shared.weight.dtype, is_bias=False + ) + self.register_buffer( + "final_logits_bias", paddle.zeros((1, config.vocab_size), dtype=paddle.get_default_dtype()) + ) + + def get_encoder(self): + return self.mbart.get_encoder() + + def get_decoder(self): + return self.mbart.get_decoder() + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterMBART + + decode_strategy = kwargs.get("decode_strategy") + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + if decode_strategy == "sampling" and kwargs.get("top_k") != 0 and kwargs.get("top_p") != 1: + raise AttributeError( + "Only topk sampling or topp sampling are supported. " + "Topk sampling and topp sampling cannot be both applied in the fast version." + ) + if kwargs["repetition_penalty"] != 1.0: + # not support for repetition_penalty yet in the fast version + raise AttributeError("'repetition_penalty != 1' is not supported yet in the fast version") + if kwargs["min_length"] != 0: + # not support for min_length yet in the fast version + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + self._fast_entry = FasterMBART(self, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + inputs_embeds: Optional[Tensor] = None, + decoder_inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The MBartForConditionalGeneration forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor, optional): + See :class:`MBartModel`. + attention_mask (Tensor, optional): + See :class:`MBartModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`MBartModel`. + decoder_attention_mask (Tensor, optional): + See :class:`MBartModel`. + encoder_output (Tensor, optonal): + See :class:`MBartModel`. + See :class:`MBartModel`. + use_cache (bool, optional): + See :class:`MBartModel`. + cache (Tensor, optional): + See :class:`MBartModel`. + inputs_embeds (Tensor, optional): + See :class:`MBartModel`. + decoder_inputs_embeds (Tensor, optional): + labels (Tensor, optional): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + A tensor of shape `(batch_size, sequence_length)`. Default to `None`. + output_attentions (bool, optional): + See :class:`MBartModel`. + output_hidden_states (bool, optional): + See :class:`MBartModel`. + return_dict (bool, optional): + See :class:`MBartModel`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if + `return_dict=True`. 
Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. + Especially, When `use_cache=return_dict=output_hidden_states=output_attentions=False` and labels=None, + returns tensor `logits`, a tensor of the input text classification logits. + + With the fields: + + - `lm_logits` (Tensor): + The generated sentence of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, vocab_size]. + + - `cache` (Tensor): + See :class:`MBartModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MBartForConditionalGeneration, MBartTokenizer + + tokenizer = MBartTokenizer.from_pretrained('bart-base') + model = MBartForConditionalGeneration.from_pretrained('bart-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.return_dict + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + + outputs = self.mbart( + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + encoder_output, + use_cache=use_cache, + cache=cache, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = paddle.tensor.matmul(outputs[0], self.lm_head_weight, transpose_y=True) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.reshape((-1, self.mbart.config.vocab_size)), labels.reshape((-1,))) + + if not return_dict: + if len(outputs) == 2: + return (masked_lm_loss, lm_logits) if masked_lm_loss is not None else lm_logits + else: + outputs = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + outputs) if masked_lm_loss is not None else outputs + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + cache=None, + use_cache=False, + encoder_output=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask[:, :, -1, :].unsqueeze(2) + + return { + "input_ids": None, + "decoder_input_ids": decoder_input_ids, + "encoder_output": encoder_output, + "decoder_attention_mask": decoder_attention_mask, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/tokenizer.py new file mode 100644 index 000000000..163031e17 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mbart/tokenizer.py @@ -0,0 +1,631 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile + +import sentencepiece as spm + +from .. import AddedToken, PretrainedTokenizer + +__all__ = ["MBartTokenizer", "MBart50Tokenizer"] + +MBART_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "mbart-large-cc25": 1024, + "mbart-large-en-ro": 1024, +} + +MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "mbart-large-50-one-to-many-mmt": 1024, + "mbart-large-50-many-to-one-mmt": 1024, + "mbart-large-50-many-to-many-mmt": 1024, +} + + +class MBartTokenizer(PretrainedTokenizer): + resource_files_names = { + "vocab_file": "sentencepiece.bpe.model", + } + pretrained_resource_files_map = { + "vocab_file": { + "mbart-large-en-ro": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart/mbart-large-en-ro.sentencepiece.bpe.model", + "mbart-large-cc25": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart/mbart-large-cc25.sentencepiece.bpe.model", + } + } + pretrained_init_configuration = {"mbart-large-cc25": {}, "mbart-large-en-ro": {}} + max_model_input_sizes = MBART_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids"] + + FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + ] + + def __init__( + self, + vocab_file, + src_lang=None, + tgt_lang=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs=None, + additional_special_tokens=None, + **kwargs + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended(mask_token=mask_token) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.vocab_file = vocab_file + self.sp_model.Load(str(vocab_file)) + self.fairseq_offset = 1 + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + self.sp_model_size = len(self.sp_model) + self.lang_code_to_id = { + code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(self.FAIRSEQ_LANGUAGE_CODES) + } + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + self.fairseq_tokens_to_ids.update(self.lang_code_to_id) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + 
self.src_lang = src_lang if src_lang is not None else "en_XX" + self.tgt_lang = tgt_lang + # Get `special_tokens_map` after `_wrap_init()` + self.eos_token_id = self.fairseq_tokens_to_ids[eos_token] + self.unk_token_id = self.fairseq_tokens_to_ids[unk_token] + self.set_src_lang_special_tokens(self.src_lang) + self._additional_special_tokens = list(self.lang_code_to_id.keys()) + + if additional_special_tokens is not None: + # Only add those special tokens if they are not already there. + self._additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in self._additional_special_tokens] + ) + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + is_split_into_words=False, + padding=None, + truncation="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + if "pad_to_max_seq_len" in kwargs and padding is None: + pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") + padding = "max_length" if pad_to_max_seq_len else False + elif padding is None: + padding = False + + if "max_seq_len" in kwargs and max_length is None: + max_length = kwargs["max_seq_len"] + + if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": + truncation = kwargs["truncation_strategy"] + + return super(MBartTokenizer, self).__call__( + text=text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, save_path) + elif not os.path.isfile(self.vocab_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """ + Converts a token (str) in an id using the vocab. 
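+
+        Special tokens and language codes are resolved through ``fairseq_tokens_to_ids``; any other
+        piece is looked up in the sentencepiece model and shifted by ``fairseq_offset``, with unknown
+        pieces falling back to ``unk_token_id``.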
+ """ + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """ + Converts an index (integer) in a token (str) using the vocab. + """ + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieve sequence ids from a token list that has no special tokens added. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``X [eos, src_lang_code]`` + - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]`` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + offset_mapping_1 + [(0, 0)] + + def set_src_lang_special_tokens(self, src_lang): + """Reset the special tokens to the source lang setting. 
No prefix and suffix=[eos, src_lang_code].""" + self.cur_lang_code_id = self.lang_code_to_id[src_lang] + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code_id] + + def set_tgt_lang_special_tokens(self, tgt_lang): + """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].""" + self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] + self.prefix_tokens = [] + self.suffix_tokens = [self.eos_token_id, self.cur_lang_code_id] + + +class MBart50Tokenizer(PretrainedTokenizer): + resource_files_names = { + "vocab_file": "sentencepiece.bpe.model", + } + pretrained_resource_files_map = { + "vocab_file": { + "mbart-large-50-one-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-one-to-many-mmt.sentencepiece.bpe.model", + "mbart-large-50-many-to-one-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-one-mmt.sentencepiece.bpe.model", + "mbart-large-50-many-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-many-mmt.sentencepiece.bpe.model", + } + } + pretrained_init_configuration = { + "mbart-large-50-one-to-many-mmt": {}, + "mbart-large-50-many-to-one-mmt": {}, + "mbart-large-50-many-to-many-mmt": {}, + } + max_model_input_sizes = MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids"] + + FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI", + ] + + def __init__( + self, + vocab_file, + src_lang=None, + tgt_lang=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs=None, + additional_special_tokens=None, + **kwargs + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended(mask_token=mask_token) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.vocab_file = vocab_file + self.sp_model.Load(str(vocab_file)) + self.fairseq_offset = 1 + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + self.sp_model_size = len(self.sp_model) + self.lang_code_to_id = { + code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(self.FAIRSEQ_LANGUAGE_CODES) + } + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + self.fairseq_tokens_to_ids.update(self.lang_code_to_id) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + self.src_lang = src_lang if src_lang is not None else "en_XX" + self.tgt_lang = tgt_lang + # Get `special_tokens_map` after `_wrap_init()` + self.eos_token_id = self.fairseq_tokens_to_ids[eos_token] + self.unk_token_id = self.fairseq_tokens_to_ids[unk_token] + self.set_src_lang_special_tokens(self.src_lang) + self._additional_special_tokens = 
list(self.lang_code_to_id.keys()) + + if additional_special_tokens is not None: + # Only add those special tokens if they are not already there. + self._additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in self._additional_special_tokens] + ) + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + is_split_into_words=False, + padding=None, + truncation="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + if "pad_to_max_seq_len" in kwargs and padding is None: + pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") + padding = "max_length" if pad_to_max_seq_len else False + elif padding is None: + padding = False + + if "max_seq_len" in kwargs and max_length is None: + max_length = kwargs["max_seq_len"] + + if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": + truncation = kwargs["truncation_strategy"] + + return super(MBart50Tokenizer, self).__call__( + text=text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, save_path) + elif not os.path.isfile(self.vocab_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + return self.sp_model.encode(text, out_type=str) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 + + def _convert_token_to_id(self, token): + """ + Converts a token (str) in an id using the vocab. + """ + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """ + Converts an index (integer) in a token (str) using the vocab. 
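+
+        Ids registered in ``fairseq_ids_to_tokens`` (special tokens and language codes) are returned
+        directly; any other id is mapped back through the sentencepiece model after subtracting
+        ``fairseq_offset``.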
+ """ + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieve sequence ids from a token list that has no special tokens added. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART50 sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]`` + - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]`` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + offset_mapping_1 + [(0, 0)] + + def set_src_lang_special_tokens(self, src_lang): + """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[src_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def set_tgt_lang_special_tokens(self, tgt_lang): + """Reset the special tokens to the target language setting. 
prefix=[tgt_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def _build_translation_inputs(self, raw_inputs, return_tensors, src_lang, tgt_lang, **extra_kwargs): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mc2_parallel_linear.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mc2_parallel_linear.py new file mode 100644 index 000000000..232c66e5a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mc2_parallel_linear.py @@ -0,0 +1,230 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle + +try: + import paddle_custom_device +except ImportError: + pass + +from paddle import distributed as dist +from paddle.autograd import PyLayer + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + pass +from paddlenlp.utils.tools import get_env_device + +__all_gather_recomputation__ = False +if int(os.getenv("FLAGS_NPU_MC2_Recompute", 0)): + __all_gather_recomputation__ = True + + +def is_mc2_valid(): + current_device = get_env_device() + if current_device == "npu": + return int(os.getenv("FLAGS_NPU_MC2", 0)) + return 0 + + +if is_mc2_valid(): + + class MC2ColumnParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.save_for_backward(input_, weight) + ctx.group = group + input_mp = input_ + result_mp = paddle.matmul(input_mp, weight) + return result_mp + + @staticmethod + def backward(ctx, dy): + input_, weight = ctx.saved_tensor() + sub_grad = dy.reshape([-1, dy.shape[-1]]) + rank = paddle.distributed.get_rank() + hcom_name = ctx.group.process_group.get_comm_name(rank) + + d_weight = ( + paddle.matmul(input_.reshape([-1, input_.shape[-1]]), sub_grad, transpose_x=True) + if not weight.stop_gradient + else None + ) + d_input = paddle_custom_device.npu.fused_mm_allreduce( + sub_grad, weight.t(), bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 + ) + + if d_weight is not None: + return d_input.reshape(input_.shape), d_weight + else: + return d_input.reshape(input_.shape), None + + class MC2RowParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.save_for_backward(input_, weight) + rank = paddle.distributed.get_rank() + hcom_name = 
group.process_group.get_comm_name(rank) + x = input_.reshape([-1, input_.shape[-1]]) + out = paddle_custom_device.npu.fused_mm_allreduce( + x, weight, bias=None, hcom=hcom_name, reduce_op="sum", comm_turn=0 + ) + output = out.reshape([input_.shape[0], input_.shape[1], weight.shape[1]]) + ctx.ring_id = group.id + return output + + @staticmethod + def backward(ctx, dy): + input_, weight = ctx.saved_tensor() + out_grad = dy + sub_grad = out_grad.reshape([-1, out_grad.shape[-1]]) + input_grad = paddle.matmul(sub_grad, weight, transpose_y=True) + if weight.stop_gradient: + return input_grad.reshape(input_.shape), None + else: + input_reshape = input_.reshape([-1, input_.shape[-1]]) + weight_grad = paddle.matmul(input_reshape, sub_grad, transpose_x=True) + return input_grad.reshape(input_.shape), weight_grad + + class MC2ColumnSeqParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.weight_stop_gradient = weight.stop_gradient + ctx.save_for_backward(input_, weight) + + rank = dist.get_rank() + hcomm_info = group.process_group.get_comm_name(rank) + + world_size = group.nranks + output, gather_out = paddle_custom_device.npu.fused_allgather_mm( + input_, + weight, + bias=None, + hcom=hcomm_info, + world_size=world_size, + gather_index=0, + gather_output=(not __all_gather_recomputation__), + comm_turn=0, + ) + + ctx.all_gather_output = gather_out + ctx.world_size = world_size + ctx.group = group + return output + + @staticmethod + def backward(ctx, grad_output): + input_, weight = ctx.saved_tensor() + + if __all_gather_recomputation__: + dim_size = input_.shape + dim_size[0] = dim_size[0] * ctx.world_size + all_gather_output = paddle.empty(dim_size, dtype=input_.dtype) + all_gather_output.stop_gradient = True + all_gather_work = dist.stream.all_gather(all_gather_output, input_, group=ctx.group, sync_op=False) + else: + all_gather_output = ctx.all_gather_output + + grad_input = paddle.matmul(grad_output, weight, transpose_y=True) + sub_grad_input = paddle.empty(input_.shape, dtype=input_.dtype) + reduce_scatter_work = dist.stream.reduce_scatter( + sub_grad_input, grad_input, group=ctx.group, sync_op=False + ) + + if __all_gather_recomputation__: + all_gather_work.wait() + + grad_weight = ( + paddle.matmul(all_gather_output, grad_output, transpose_x=True) + if not ctx.weight_stop_gradient + else None + ) + reduce_scatter_work.wait() + + return sub_grad_input, grad_weight + + class MC2RowSeqParallelCoreLinear(PyLayer): + @staticmethod + def forward(ctx, input_, weight, group): + ctx.weight_stop_gradient = weight.stop_gradient + ctx.save_for_backward(input_, weight) + + rank = dist.get_rank() + hcomm_info = group.process_group.get_comm_name(rank) + world_size = group.nranks + + output = paddle_custom_device.npu.fused_mm_reduce_scatter( + input_, + weight, + bias=None, + hcom=hcomm_info, + world_size=world_size, + reduce_op="sum", + comm_turn=0, + ) + + ctx.hcomm_info = hcomm_info + ctx.world_size = world_size + return output + + @staticmethod + def backward(ctx, grad_output): + input_, weight = ctx.saved_tensor() + hcomm_info = ctx.hcomm_info + world_size = ctx.world_size + + grad_input, all_gather_grad_output = paddle_custom_device.npu.fused_allgather_mm( + grad_output, + weight.t(), + bias=None, + hcom=hcomm_info, + world_size=world_size, + gather_index=0, + gather_output=True, + comm_turn=0, + ) + grad_weight = ( + paddle.matmul(input_, all_gather_grad_output, transpose_x=True) + if not ctx.weight_stop_gradient + else None + ) + + return grad_input, grad_weight 
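For readers unfamiliar with the fused NPU kernels called above, the following is a minimal, unfused sketch of what `fused_allgather_mm` and `fused_mm_reduce_scatter` compute in the forward pass, written with the same Paddle collectives this module already uses. It assumes an already-initialized model-parallel communication group and illustrative shapes; the helper names are mine and this is not the code path the MC2 layers actually take.

```python
import paddle
from paddle import distributed as dist


def column_seq_parallel_matmul(x_shard, weight, group):
    # Unfused equivalent of fused_allgather_mm: all-gather the per-rank
    # sequence shards along dim 0, then run one local matmul on the full input.
    dim_size = x_shard.shape
    dim_size[0] = dim_size[0] * group.nranks
    full_x = paddle.empty(dim_size, dtype=x_shard.dtype)
    dist.stream.all_gather(full_x, x_shard, group=group, sync_op=True)
    return paddle.matmul(full_x, weight)


def row_seq_parallel_matmul(x, weight, group):
    # Unfused equivalent of fused_mm_reduce_scatter: the local matmul produces a
    # partial result, which is then reduce-scattered back over the sequence dim.
    partial = paddle.matmul(x, weight)
    out_size = partial.shape
    out_size[0] = out_size[0] // group.nranks
    out = paddle.empty(out_size, dtype=partial.dtype)
    dist.stream.reduce_scatter(out, partial, group=group, sync_op=True)
    return out
```

The fused kernels produce the same result but let the NPU overlap the collective with the matmul, which is why the PyLayers above call into `paddle_custom_device` instead of composing the two steps as in this sketch.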
+ + class MC2ColumnSeqParallelLinear(ColumnSequenceParallelLinear): + def forward(self, x): + output = MC2ColumnSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + return output + + class MC2RowSeqParallelLinear(RowSequenceParallelLinear): + def forward(self, x): + output = MC2RowSeqParallelCoreLinear.apply(x, self.weight, self.model_parallel_group) + output = output + self.bias if self.bias is not None else output + return output + +else: + MC2ColumnSeqParallelCoreLinear = None + MC2RowSeqParallelCoreLinear = None + MC2ColumnSeqParallelLinear = None + MC2RowSeqParallelLinear = None + MC2ColumnParallelCoreLinear = None + MC2RowParallelCoreLinear = None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/configuration.py new file mode 100644 index 000000000..2bc3e695a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/configuration.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
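Stepping back to the mc2_parallel_linear module added above: it deliberately exports `None` for every MC2 symbol when the fused path is unavailable, so downstream code is expected to feature-detect rather than catch import errors. A hedged sketch of that pattern follows; the import path and helper name are illustrative assumptions (in this tree the module lives inside the vendored paddlenlp_3.0.0 package).

```python
def pick_sequence_parallel_linears():
    # Path is an assumption for illustration; adjust to wherever the module is importable from.
    from paddlenlp.transformers import mc2_parallel_linear as mc2

    if mc2.MC2ColumnSeqParallelLinear is not None:
        # Only populated on NPU with FLAGS_NPU_MC2=1 (FLAGS_NPU_MC2_Recompute=1
        # additionally re-runs the all-gather during backward instead of caching it).
        return mc2.MC2ColumnSeqParallelLinear, mc2.MC2RowSeqParallelLinear

    # Fall back to the stock sequence-parallel layers shipped with Paddle fleet.
    from paddle.distributed.fleet.utils.sequence_parallel_utils import (
        ColumnSequenceParallelLinear,
        RowSequenceParallelLinear,
    )
    return ColumnSequenceParallelLinear, RowSequenceParallelLinear
```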
+""" MBart model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "MegatronBert_PRETRAINED_INIT_CONFIGURATION", + "MegatronBert_PRETRAINED_RESOURCE_FILES_MAP", + "MegatronBertConfig", +] + +MegatronBert_PRETRAINED_INIT_CONFIGURATION = { + "megatronbert-cased": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 29056, + "pad_token_id": 0, + }, + "megatronbert-uncased": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 30592, + "pad_token_id": 0, + }, +} + +MegatronBert_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "megatronbert-cased": "http://bj.bcebos.com/paddlenlp/models/transformers/megatron-bert/megatronbert-cased/model_state.pdparams", + "megatronbert-uncased": "http://bj.bcebos.com/paddlenlp/models/transformers/megatron-bert/megatronbert-uncased/model_state.pdparams", + } +} + + +class MegatronBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MegatronBertModel`]. It is used to instantiate a + MEGATRON_BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT + [nvidia/megatron-bert-uncased-345m](https://huggingface.co/nvidia/megatron-bert-uncased-345m) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `MegatronBertModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `MegatronBert`. + hidden_size (int, optional): + Dimensionality of the encoder layer and pooler layer. Defaults to `1024`. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `16`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `24`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. 
+ hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `4096`. + position_embedding_type (str, optional): + Type of position embedding. Defaults to "absolute" + initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`MegatronBertPretrainedModel.init_weights()` for how weights are initialized in `MegatronBertModel`. + + """ + model_type = "megatronbert" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=29056, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + # use_cache=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + # self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/modeling.py new file mode 100644 index 000000000..abe724c3f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/modeling.py @@ -0,0 +1,1006 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +from paddle import einsum + +from ...utils.env import CONFIG_NAME +from .. 
import PretrainedModel, register_base_model +from ..activations import get_activation +from .configuration import ( + MegatronBert_PRETRAINED_INIT_CONFIGURATION, + MegatronBert_PRETRAINED_RESOURCE_FILES_MAP, + MegatronBertConfig, +) + +__all__ = [ + "MegatronBertModel", + "MegatronBertPretrainedModel", + "MegatronBertForQuestionAnswering", + "MegatronBertForSequenceClassification", + "MegatronBertForNextSentencePrediction", + "MegatronBertForCausalLM", + "MegatronBertForPreTraining", + "MegatronBertForMaskedLM", + "MegatronBertForMultipleChoice", + "MegatronBertForTokenClassification", +] + +layer_norm_eps = 1e-12 + + +class MegatronBertPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained MegatronBert models. It provides RoBerta related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + + """ + model_config_file = CONFIG_NAME + resource_files_names = {"model_state": "model_state.pdparams"} + + pretrained_init_configuration = MegatronBert_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = MegatronBert_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "megatronbert" + config_class = MegatronBertConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.megatronbert.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = layer_norm_eps + + +class MegatronBertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer("position_ids", paddle.arange(end=config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = config.position_embedding_type + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + embeddings = self.dropout(embeddings) + return embeddings + + +class 
MegatronBertSelfAttention(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertSelfAttention, self).__init__() + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = config.position_embedding_type + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def forward(self, hidden_states, attention_mask=None): + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + attention_scores = paddle.matmul(query_layer, key_layer.transpose((0, 1, 3, 2))) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(end=seq_length, dtype="int64").reshape((-1, 1)) + position_ids_r = paddle.arange(end=seq_length, dtype="int64").reshape((1, -1)) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + + if self.position_embedding_type == "relative_key": + relative_position_scores = einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) + attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + return context_layer, attention_probs + + +class MegatronBertSelfOutput(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, residual): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return residual + hidden_states + + +class MegatronBertAttention(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertAttention, self).__init__() + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.self = MegatronBertSelfAttention(config) + self.output = MegatronBertSelfOutput(config) + self.pruned_heads = set() + + def forward(self, hidden_states, attention_mask=None): + ln_outputs = self.layer_norm(hidden_states) + self_outputs = self.self(ln_outputs, attention_mask) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class MegatronBertIntermediate(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_activation(config.hidden_act) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class MegatronBertOutput(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return input_tensor + hidden_states + + +class MegatronBertLayer(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertLayer, self).__init__() + self.seq_len_dim = 1 + self.attention = MegatronBertAttention(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.intermediate = MegatronBertIntermediate(config) + self.output = MegatronBertOutput(config) + + def forward(self, hidden_states, attention_mask=None): + self_attention_outputs = self.attention(hidden_states, attention_mask) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] + + layer_output = self.feed_forward_chunk(attention_output) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + ln_output = self.layer_norm(attention_output) + intermediate_output = self.intermediate(ln_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class MegatronBertEncoder(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertEncoder, self).__init__() + self.layer = nn.LayerList([MegatronBertLayer(config) for _ in 
range(config.num_hidden_layers)]) + + # The final layer norm. We removed the 1st LN, moved LN to each hidden layer and this one + # is simply the final LN (Transformer's BERT has it attached to each hidden layer). + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None): + for i, layer_module in enumerate(self.layer): + layer_outputs = layer_module(hidden_states, attention_mask) + + hidden_states = layer_outputs[0] + + # Finalize the hidden states. + hidden_states = self.layer_norm(hidden_states) + + return hidden_states + + +class MegatronBertPooler(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +@register_base_model +class MegatronBertModel(MegatronBertPretrainedModel): + """ + The bare MegatronBert Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + Args: + config (:class:`MegatronBertConfig`): + An instance of MegatronBertConfig used to construct MBartModel. + + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertModel, self).__init__(config) + self.num_hidden_layers = config.num_hidden_layers + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.embeddings = MegatronBertEmbeddings(config) + self.encoder = MegatronBertEncoder(config) + self.pooler = MegatronBertPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The MegatronBertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. 
+ attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + If its data type is int, the values should be either 0 or 1. + + - **1** for tokens that **not masked**, + - **0** for tokens that **masked**. + + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertModel, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertModel.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + input_shape = input_ids.shape + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + encoder_outputs = self.encoder(embedding_output, attention_mask=attention_mask) + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + + return sequence_output, pooled_output + + +class MegatronBertForQuestionAnswering(MegatronBertPretrainedModel): + """ + MegatronBert Model with question answering tasks. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForQuestionAnswering, self).__init__(config) + self.megatronbert = MegatronBertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + ): + r""" + The MegatronBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. 
+ Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForQuestionAnswering, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForQuestionAnswering.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ + + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(2, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + output = (start_logits, end_logits) + return output + + +class MegatronBertForSequenceClassification(MegatronBertPretrainedModel): + """ + MegatronBert Model with sequence classification tasks. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + num_labels (int): + The number of labels. + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.megatronbert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The MegatronBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + Tensor: Returns tensor `logits`, a tensor of the sequence classification logits. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForSequenceClassification, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForSequenceClassification.from_pretrained('megatronbert-uncased', num_labels=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + return logits + + +class MegatronBertPredictionHeadTransform(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = get_activation(config.hidden_act) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MegatronBertLMPredictionHead(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertLMPredictionHead, self).__init__() + self.transform = MegatronBertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + + self.decoder_weight = self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.transform.dense.weight.dtype, is_bias=False + ) + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +class MegatronBertOnlyMLMHead(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertOnlyMLMHead, self).__init__() + self.predictions = MegatronBertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class MegatronBertOnlyNSPHead(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class MegatronBertPreTrainingHeads(nn.Layer): + def __init__(self, config: MegatronBertConfig): + super(MegatronBertPreTrainingHeads, self).__init__() + self.predictions = MegatronBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class MegatronBertForPreTraining(MegatronBertPretrainedModel): + """ + Megatronbert Model with pretraining tasks on top. 
+ + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForPreTraining, self).__init__(config) + + self.megatronbert = MegatronBertModel(config) + self.cls = MegatronBertPreTrainingHeads(config) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The MegatronBertForPreTraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + tuple: Returns tuple (`prediction_scores`, `seq_relationship_score`). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForPreTraining, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForPreTraining.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + prediction_scores, seq_relationship_score = model(**inputs) + """ + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + output = (prediction_scores, seq_relationship_score) + return output + + +class MegatronBertForCausalLM(MegatronBertPretrainedModel): + """ + MegatronBert Model with a `causal masked language modeling` head on top. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForCausalLM, self).__init__(config) + + self.megatronbert = MegatronBertModel(config) + self.cls = MegatronBertOnlyMLMHead(config) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The MegatronBertForCausalLM forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + Tensor: Returns Tensor `prediction_scores`. The scores of masked token prediction. + Its data type should be float32. If `masked_positions` is None, its shape is + [batch_size, sequence_length, vocab_size]. Otherwise, its shape is + [batch_size, mask_token_num, vocab_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForCausalLM, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForCausalLM.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + prediction_scores = model(**inputs) + """ + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + return prediction_scores + + +class MegatronBertForMaskedLM(MegatronBertPretrainedModel): + """ + MegatronBert Model with a `masked language modeling` head on top. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForMaskedLM, self).__init__(config) + + self.megatronbert = MegatronBertModel(config) + self.cls = MegatronBertOnlyMLMHead(config) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + ): + r""" + The MegatronBertForMaskedLM forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + Tensor: Returns Tensor `prediction_scores`. The scores of masked token prediction. + Its data type should be float32. If `masked_positions` is None, its shape is + [batch_size, sequence_length, vocab_size]. Otherwise, its shape is + [batch_size, mask_token_num, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForMaskedLM, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForMaskedLM.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + prediction_scores = model(**inputs) + """ + + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + return prediction_scores + + +class MegatronBertForNextSentencePrediction(MegatronBertPretrainedModel): + """ + MegatronBert Model with a `next sentence prediction (classification)` head on top. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForNextSentencePrediction, self).__init__(config) + + self.megatronbert = MegatronBertModel(config) + self.cls = MegatronBertOnlyNSPHead(config) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The MegatronBertForNextSentencePrediction forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. 
+ attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + Tensor: Returns Tensor `seq_relationship_scores`. The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForNextSentencePrediction, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForNextSentencePrediction.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + seq_relationship_scores = model(**inputs) + """ + + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + return seq_relationship_scores + + +class MegatronBertForMultipleChoice(MegatronBertPretrainedModel): + """ + MegatronBert Model with a multiple choice classification head on top. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForMultipleChoice, self).__init__(config) + + self.megatronbert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The MegatronBertForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + Tensor: Returns Tensor `reshaped_logits`. A tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForMultipleChoice, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForNextSentencePrediction.from_pretrained('megatronbert-uncased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + reshaped_logits = model(**inputs) + """ + num_choices = input_ids.shape[1] + + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) if token_type_ids is not None else None + position_ids = position_ids.reshape((-1, position_ids.shape[-1])) if position_ids is not None else None + + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape((-1, num_choices)) + + return reshaped_logits + + +class MegatronBertForTokenClassification(MegatronBertPretrainedModel): + """ + MegatronBert Model with a token classification head on top. + + Args: + megatronbert (:class:`MegatronBertModel`): + An instance of :class:`MegatronBertModel`. + + num_labels (int): + The number of labels. + """ + + def __init__(self, config: MegatronBertConfig): + super(MegatronBertForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.megatronbert = MegatronBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None): + r""" + The MegatronBertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MegatronBertModel`. + token_type_ids (Tensor, optional): + See :class:`MegatronBertModel`. + position_ids(Tensor, optional): + See :class:`MegatronBertModel`. + attention_mask (Tensor, optional): + See :class:`MegatronBertModel`. + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MegatronBertForTokenClassification, MegatronBertTokenizer + + tokenizer = MegatronBertTokenizer.from_pretrained('megatronbert-uncased') + model = MegatronBertForTokenClassification.from_pretrained('megatronbert-uncased', num_labels=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + reshaped_logits = model(**inputs) + """ + + outputs = self.megatronbert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + return logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/tokenizer.py new file mode 100644 index 000000000..24f7c2426 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/megatronbert/tokenizer.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for MegatronBert.""" + +from .. import BertTokenizer + +__all__ = ["MegatronBertTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"megatronbert-cased": 512, "megatronbert-uncased": 512} + + +class MegatronBertTokenizer(BertTokenizer): + """ + Constructs a MegatronBert tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import MegatronBertTokenizer + tokenizer = MegatronBertTokenizer.from_pretrained('MegatronBert-uncased') + inputs = tokenizer('He was a puppeteer') + print(inputs) + + ''' + {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "megatronbert-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-uncased-vocab.txt", + "megatronbert-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-cased-vocab.txt", + } + } + pretrained_init_configuration = { + "megatronbert-uncased": {"do_lower_case": True}, + "megatronbert-cased": {"do_lower_case": False}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + super(MegatronBertTokenizer, self).__init__( + vocab_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/configuration.py new file mode 100644 index 000000000..ca733095d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/configuration.py @@ -0,0 +1,348 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
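The flatten-and-restore convention used in `MegatronBertForMultipleChoice.forward` earlier in this patch is easiest to follow with concrete shapes. The sketch below is a standalone illustration with random tensors and made-up sizes; it loads no pretrained weights and is not part of the patched sources:

```python
# Illustrative sizes only: 2 examples, 4 answer choices each, 16 tokens per choice.
import paddle
import paddle.nn as nn

batch_size, num_choices, seq_len, hidden_size = 2, 4, 16, 8
input_ids = paddle.randint(0, 100, shape=[batch_size, num_choices, seq_len])

# 1) Choices are folded into the batch dimension before the encoder runs.
flat_input_ids = input_ids.reshape((-1, input_ids.shape[-1]))   # [8, 16]

# 2) Stand-in for "pooled_output -> dropout -> classifier": one score per flattened row.
pooled_output = paddle.randn([batch_size * num_choices, hidden_size])
classifier = nn.Linear(hidden_size, 1)
logits = classifier(pooled_output)                              # [8, 1]

# 3) Scores are regrouped so each row holds one example's choices; argmax picks the answer.
reshaped_logits = logits.reshape((-1, num_choices))             # [2, 4]
print(reshaped_logits.shape, paddle.argmax(reshaped_logits, axis=-1).shape)
```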
+ +""" MiniGPT4 model configuration """ +import copy +import os +from typing import Union + +from ...utils.log import logger +from ..auto.modeling import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from ..configuration_utils import PretrainedConfig +from ..llama.configuration import LlamaConfig + +__all__ = ["MiniGPT4VisionConfig", "MiniGPT4QFormerConfig", "MiniGPT4Config"] + + +class MiniGPT4VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniGPT4VisionModel`]. It is used to instantiate a + MiniGPT4 vision encoder according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float``, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + Example: + ```python + >>> from paddlenlp.transformers import MiniGPT4VisionConfig, MiniGPT4VisionModel + >>> # Initializing a MiniGPT4VisionConfig + >>> configuration = MiniGPT4VisionConfig() + >>> # Initializing a MiniGPT4VisionModel (with random weights) from the configuration above. 
+ >>> model = MiniGPT4VisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mimigpt4_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + projection_dim=512, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.0, + attention_dropout=0.0, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + # get the vision config dict if we are loading from MiniGPT4Config + if config_dict.get("model_type") == "minigpt4": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MiniGPT4QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniGPT4QFormerModel`]. It is used to instantiate a + MiniGPT4 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`MiniGPT4QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. 
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + Examples: + ```python + >>> from paddlenlp.transformers import MiniGPT4QFormerConfig, MiniGPT4QFormerModel + >>> # Initializing a MiniGPT4 configuration + >>> configuration = MiniGPT4QFormerConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = MiniGPT4QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "minigpt4_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = 
cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from MiniGPT4Config + if config_dict.get("model_type") == "minigpt4": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class MiniGPT4Config(PretrainedConfig): + r""" + [`MiniGPT4Config`] is the configuration class to store the configuration of a [`MiniGPT4ForConditionalGeneration`]. It is + used to instantiate a MiniGPT4 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MiniGPT4VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`MiniGPT4QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... MiniGPT4VisionConfig, + ... MiniGPT4QFormerConfig, + ... LlamaConfig, + ... MiniGPT4Config, + ... MiniGPT4ForConditionalGeneration, + ... ) + >>> # Initializing a MiniGPT4Config configuration + >>> configuration = MiniGPT4Config() + >>> # Initializing a MiniGPT4ForConditionalGeneration (with random weights) from the configuration above + >>> model = MiniGPT4ForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a MiniGPT4Config from a MiniGPT4VisionConfig, MiniGPT4QFormerConfig and any PretrainedConfig + >>> # Initializing MiniGPT4 vision, MiniGPT4 Q-Former and language model configurations + >>> vision_config = MiniGPT4VisionConfig() + >>> qformer_config = MiniGPT4QFormerConfig() + >>> text_config = LlamaConfig() + >>> config = MiniGPT4Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "minigpt4" + is_composition = True + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the MiniGPT4VisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the MiniGPT4QFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`LlamaConfig`).") + self.vision_config = MiniGPT4VisionConfig(**vision_config) + self.qformer_config = MiniGPT4QFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "llama" + + if text_model_type == "llama": + self.text_config = LlamaConfig(**text_config) + else: + raise ValueError("Only llama accepted for model_type, but accepted {}.".format(text_model_type)) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: MiniGPT4VisionConfig, + qformer_config: MiniGPT4QFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`MiniGPT4Config`] (or a derived class) from a vision model, Q-Former and language model + configurations. + Returns: + [`MiniGPT4`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/image_processing.py new file mode 100644 index 000000000..069559b74 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/image_processing.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
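The pattern implemented by `MiniGPT4Config` above (a top-level config that owns vision, Q-Former and text sub-configs, ties the Q-Former's `encoder_hidden_size` to the vision hidden size, and serializes the nested objects in its `to_dict` override) can be reduced to a few lines of framework-free Python. The class and field names below are invented for illustration; this is a sketch of the pattern, not the paddlenlp API:

```python
from dataclasses import dataclass, field, asdict

@dataclass
class VisionCfg:
    hidden_size: int = 1408

@dataclass
class QFormerCfg:
    hidden_size: int = 768
    encoder_hidden_size: int = 1408

@dataclass
class ComposedCfg:
    vision: VisionCfg = field(default_factory=VisionCfg)
    qformer: QFormerCfg = field(default_factory=QFormerCfg)
    num_query_tokens: int = 32

    def __post_init__(self):
        # Mirror MiniGPT4Config: the Q-Former cross-attends to vision features,
        # so its encoder_hidden_size is tied to the vision hidden size.
        self.qformer.encoder_hidden_size = self.vision.hidden_size

    def to_dict(self):
        # Nested configs are serialized recursively, as in the to_dict override above.
        return {
            "vision": asdict(self.vision),
            "qformer": asdict(self.qformer),
            "num_query_tokens": self.num_query_tokens,
        }

print(ComposedCfg(vision=VisionCfg(hidden_size=1024)).to_dict())
```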
+"""Image processor class for MiniGPT4.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = [ + "MiniGPT4ImageProcessor", +] + + +class MiniGPT4ImageProcessor(BaseImageProcessor): + r""" + Constructs a MiniGPT4 image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + default_image_mean = [0.48145466, 0.4578275, 0.40821073] + default_image_std = [0.26862954, 0.26130258, 0.27577711] + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else default_image_mean + self.image_std = image_std if image_std is not None else default_image_std + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` while preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/modeling.py new file mode 100644 index 000000000..c64d49b5a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/modeling.py @@ -0,0 +1,1771 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
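The `preprocess` method above applies a fixed order of operations: optional RGB conversion, resize, rescale to [0, 1], per-channel normalization, then a channels-first layout. The following standalone PIL/NumPy sketch reproduces that order with the 224x224 size and the mean/std defaults from `__init__` above; it is an illustration of the documented behaviour, not a drop-in replacement for the processor:

```python
import numpy as np
from PIL import Image

def preprocess_like_minigpt4(image: Image.Image,
                             size=(224, 224),
                             rescale_factor=1 / 255,
                             mean=(0.48145466, 0.4578275, 0.40821073),
                             std=(0.26862954, 0.26130258, 0.27577711)) -> np.ndarray:
    image = image.convert("RGB")                          # do_convert_rgb
    image = image.resize(size, resample=Image.BICUBIC)    # do_resize (square target here)
    pixels = np.asarray(image).astype("float32")          # HWC, uint8 -> float32
    pixels = pixels * rescale_factor                      # do_rescale to [0, 1]
    pixels = (pixels - np.array(mean)) / np.array(std)    # do_normalize per channel
    return pixels.transpose(2, 0, 1)                      # HWC -> CHW (ChannelDimension.FIRST)

# Usage with a synthetic image:
dummy = Image.fromarray(np.random.randint(0, 256, (300, 400, 3), dtype=np.uint8))
print(preprocess_like_minigpt4(dummy).shape)  # (3, 224, 224)
```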
+ +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.ops import transfer_param +from paddlenlp.utils.log import logger + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..llama.modeling import LlamaForCausalLM +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from ..model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) + +MiniGPT4_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +from .configuration import MiniGPT4Config, MiniGPT4QFormerConfig, MiniGPT4VisionConfig + +__all__ = [ + "MiniGPT4Model", + "MiniGPT4PretrainedModel", + "MiniGPT4QFormerModel", + "MiniGPT4VisionModel", + "MiniGPT4ForConditionalGeneration", +] + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +def convert_weights_to_dtype(model, dtype: str): + # trying to convert model dtype if necessary + if dtype not in ["float16", "float32", "float64"]: + raise ValueError("Not supported dtype: {}., only [float16, float32, float64] supported.".format(dtype)) + dtype_mapping = { + "float16": paddle.float16, + "float32": paddle.float32, + "float64": paddle.float64, + } + + def convert_for_vit(layer): + if isinstance(layer, (nn.Linear, nn.Conv1D, nn.Conv2D)): + if layer.weight.dtype != dtype_mapping[dtype]: + layer.weight = transfer_param(layer.weight, restore_data=True, dtype=dtype) + if layer.bias is not None and layer.bias.dtype != dtype_mapping[dtype]: + layer.bias = transfer_param(layer.bias, restore_data=True, dtype=dtype) + + if isinstance(model, MiniGPT4VisionModel): + model.apply(convert_for_vit) + elif isinstance(model, (MiniGPT4QFormerModel, LlamaForCausalLM)): + model.to(dtype=dtype) + else: + raise TypeError("Not support model type: {}.".format(type(model))) + + +@dataclass +class MiniGPT4ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`MiniGPT4ForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. 
+ """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class MiniGPT4PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = MiniGPT4Config + base_model_prefix = "minigpt4" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, MiniGPT4VisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, MiniGPT4Encoder): + module.gradient_checkpointing = value + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + vit_dtype = kwargs.pop("vit_dtype", "float16") + qformer_dtype = kwargs.pop("qformer_dtype", "float32") + llama_dtype = kwargs.pop("llama_dtype", "float16") + + model = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + logger.info("Trying to convert dtype for MiniGPT4 model, it may take a while.") + if isinstance(model, (MiniGPT4Model, MiniGPT4ForConditionalGeneration)): + convert_weights_to_dtype(model.vision_model, dtype=vit_dtype) + convert_weights_to_dtype(model.qformer, dtype=qformer_dtype) + convert_weights_to_dtype(model.language_model, dtype=llama_dtype) + elif isinstance(model, MiniGPT4VisionModel): + convert_weights_to_dtype(model, dtype=vit_dtype) + elif isinstance(model, MiniGPT4QFormerModel): + convert_weights_to_dtype(model, dtype=qformer_dtype) + elif isinstance(model, LlamaForCausalLM): + convert_weights_to_dtype(model, dtype=llama_dtype) + else: + raise TypeError("Not supported model type: {}.".format(type(model))) + + return model + + +class MiniGPT4VisionEmbeddings(nn.Layer): + def __init__(self, config: MiniGPT4VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim])) + + self.patch_embedding = nn.Conv2D( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = Parameter(paddle.randn([1, self.num_positions, 
self.embed_dim])) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds_shape = patch_embeds.shape + patch_embeds = paddle.reshape( + patch_embeds, shape=[patch_embeds_shape[0], patch_embeds_shape[1], -1] + ).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class MiniGPT4Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim])) + v_bias = Parameter(paddle.zeros([self.embed_dim])) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +class MiniGPT4MLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class MiniGPT4EncoderLayer(nn.Layer): + def __init__(self, config: MiniGPT4Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = MiniGPT4Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = MiniGPT4MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class MiniGPT4Encoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`MiniGPT4EncoderLayer`]. + Args: + config (`MiniGPT4Config`): + The corresponding vision configuration for the `MiniGPT4Encoder`. 
+ """ + + def __init__(self, config: MiniGPT4Config): + super().__init__() + self.config = config + self.layers = nn.LayerList([MiniGPT4EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class MiniGPT4VisionModel(MiniGPT4PretrainedModel): + main_input_name = "pixel_values" + config_class = MiniGPT4VisionConfig + + def __init__(self, config: MiniGPT4VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = MiniGPT4VisionEmbeddings(config) + self.encoder = MiniGPT4Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) 
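Before the `forward` method that follows, it may help to spell out the shapes produced by `MiniGPT4VisionEmbeddings` above: with the default 224-pixel images and 14-pixel patches, the Conv2D patchifier yields 16 x 16 = 256 patch tokens, plus one class token, for 257 positions. The sketch below walks through those shapes with a deliberately small embedding width (the real vision tower uses hidden_size 1408); it is a shape demo, not the module itself:

```python
import paddle
import paddle.nn as nn

image_size, patch_size, embed_dim, batch = 224, 14, 32, 2   # embed_dim shrunk for the demo
num_patches = (image_size // patch_size) ** 2               # 16 * 16 = 256
num_positions = num_patches + 1                             # +1 for the class token

patch_embedding = nn.Conv2D(3, embed_dim, kernel_size=patch_size, stride=patch_size)
class_embedding = paddle.randn([1, 1, embed_dim])
position_embedding = paddle.randn([1, num_positions, embed_dim])

pixel_values = paddle.randn([batch, 3, image_size, image_size])
patches = patch_embedding(pixel_values)                       # [2, 32, 16, 16]
patches = patches.flatten(start_axis=2).transpose([0, 2, 1])  # [2, 256, 32]
cls = class_embedding.expand([batch, 1, embed_dim])           # [2, 1, 32]
embeddings = paddle.concat([cls, patches], axis=1) + position_embedding
print(embeddings.shape)  # [2, 257, 32]
```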
+ + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class MiniGPT4QFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + 
encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class MiniGPT4QFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MiniGPT4QFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = MiniGPT4QFormerMultiHeadAttention(config, is_cross_attention) + self.output = MiniGPT4QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class MiniGPT4QFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class MiniGPT4QFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class MiniGPT4QFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = MiniGPT4QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = MiniGPT4QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = MiniGPT4QFormerIntermediate(config) + self.output_query = MiniGPT4QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = 
self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class MiniGPT4QFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [MiniGPT4QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class MiniGPT4QFormerModel(MiniGPT4PretrainedModel): + """ + Querying Transformer (Q-Former), used in MiniGPT4. + """ + + def __init__(self, config: MiniGPT4QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = MiniGPT4QFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
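`MiniGPT4QFormerLayer` runs its feed-forward through `apply_chunking_to_forward`, which slices the sequence dimension to bound peak activation memory. A rough sketch of that behaviour, assuming the chunk size divides the sequence length (`chunked_feed_forward` is a hypothetical stand-in, not the library helper):

```python
import paddle

def chunked_feed_forward(forward_fn, chunk_size, seq_dim, x):
    # Run forward_fn on slices of x along seq_dim and concatenate the
    # results, trading peak memory for extra kernel launches.
    if chunk_size == 0:
        return forward_fn(x)
    num_chunks = x.shape[seq_dim] // chunk_size      # assumes exact division
    chunks = paddle.split(x, num_chunks, axis=seq_dim)
    return paddle.concat([forward_fn(c) for c in chunks], axis=seq_dim)

# e.g. a feed-forward block applied 2 tokens at a time
ffn = paddle.nn.Linear(16, 16)
out = chunked_feed_forward(ffn, 2, 1, paddle.randn([3, 8, 16]))   # [3, 8, 16]
```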
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.layernorm.weight.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.layernorm.weight.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. 
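`get_extended_attention_mask` and `invert_attention_mask` both turn a 0/1 mask into an additive bias that broadcasts over heads. A small sketch of the 2-D case, assuming a float mask where 1 marks real tokens:

```python
import paddle

# 0/1 padding mask for two sequences of length 4; the last position of the
# second sequence is padding.
attention_mask = paddle.to_tensor([[1.0, 1.0, 1.0, 1.0],
                                   [1.0, 1.0, 1.0, 0.0]])

# Broadcast to [batch, 1, 1, seq_len] and flip into an additive bias:
# 0.0 where attention is allowed, -10000.0 where it is masked, so masked
# positions contribute essentially nothing after the softmax.
extended = attention_mask[:, None, None, :]
extended = (1.0 - extended) * -10000.0      # shape [2, 1, 1, 4]
```

Because the bias is added to the raw scores before the softmax, each row still yields a valid probability distribution over the unmasked positions.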
+ is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). 
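The `past_key_values` entries described above grow along the sequence axis as decoding proceeds, matching the `paddle.concat(..., axis=2)` in the attention module. A toy illustration with made-up shapes:

```python
import paddle

# Cached keys/values from previous steps: [batch, heads, past_len, head_dim]
past_k = paddle.randn([1, 4, 6, 8])
past_v = paddle.randn([1, 4, 6, 8])

# Keys/values for the single new token being decoded this step.
new_k = paddle.randn([1, 4, 1, 8])
new_v = paddle.randn([1, 4, 1, 8])

# The cache grows along the sequence axis (axis=2), so the new token can
# attend to everything decoded so far without recomputing it.
k = paddle.concat([past_k, new_k], axis=2)   # [1, 4, 7, 8]
v = paddle.concat([past_v, new_v], axis=2)   # [1, 4, 7, 8]
```

This is why, once the cache is in use, only the newest token's keys and values need to be computed at each step.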
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.layernorm(query_embeds.cast(self.layernorm.weight.dtype)) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class MiniGPT4Model(MiniGPT4PretrainedModel): + config_class = MiniGPT4Config + 
main_input_name = "pixel_values" + + def __init__(self, config: MiniGPT4Config): + super().__init__(config) + + self.vision_model = MiniGPT4VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = MiniGPT4QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = LlamaForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import LlamaTokenizer, MiniGPT4Model + >>> tokenizer = LlamaTokenizer.from_pretrained("model_name") + >>> tokenizer.pad_token = tokenizer.eos_token + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> model.eval() + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, MiniGPT4Model + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return vision_outputs + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, MiniGPT4Model + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> qformer_outputs = model.get_qformer_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + 
encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + return query_outputs + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4Model + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4Model.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + 
return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return MiniGPT4ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class MiniGPT4ForConditionalGeneration(MiniGPT4PretrainedModel): + config_class = MiniGPT4Config + main_input_name = "pixel_values" + + def __init__(self, config: MiniGPT4Config): + super().__init__(config) + self.config = config + self.vision_model = MiniGPT4VisionModel(config.vision_config) + + self.query_tokens = Parameter(paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size])) + self.qformer = MiniGPT4QFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = LlamaForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MiniGPT4ForConditionalGenerationModelOutput]: + r""" + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, 
self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return MiniGPT4ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + first_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The first input prompt before the tag ``, it's embeddings will concat with image embeddings and the embeddings of the second_input_ids for the generation. + second_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The second input prompt after the tag ``, it's embeddings will concat with image embeddings and the embeddings of the first_input_ids for the generation. 
+ first_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding with the first_input_ids, whill will mask to avoid performing attention on padding token indices. + second_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding with the second_input_ids, whill will mask to avoid performing attention on padding token indices. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generate_kwargs + ) + + return outputs + + @paddle.no_grad() + def encode_images( + self, + pixel_values: paddle.Tensor, # processed image + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use 
the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> image = processor.process_images(images=image, return_tensors="pd") + >>> image_features, image_attention_mask = model.encode_images(**image) + """ + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + return language_model_inputs, language_model_attention_mask + + @paddle.no_grad() + def generate_with_image_features( + self, + image_features: paddle.Tensor, + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + image_attention_mask: Optional[paddle.Tensor] = None, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + image_features (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Image features extracted with vit and qformer, specifically, the features extracted with the method `encoded_images`. + first_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The first input prompt before the tag ``, it's embeddings will concat with image embeddings and the embeddings of the second_input_ids for the generation. + second_input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The second input prompt after the tag ``, it's embeddings will concat with image embeddings and the embeddings of the first_input_ids for the generation. + image_attention_mask (`paddle.Tensor` of shape (batch_size, image_sequence_length), *optional*): + The attention mask to the image_features. 
+ first_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding to the first_input_ids. + second_attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The attention mask corresponding to the second_input_ids. + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor, MiniGPT4ForConditionalGeneration + >>> processor = MiniGPT4Processor.from_pretrained("model_name") + >>> model = MiniGPT4ForConditionalGeneration.from_pretrained("model_name") + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/dog.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> processed_image = processor.process_images(images=image, return_tensors="pd") + >>> image_features, image_attention_mask = model.encode_images(**processed_image) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(text=text, prompt=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate_with_image_features(image_features, image_attention_mask=image_attention_mask, **inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + first_embeds = self.language_model.llama.embed_tokens(first_input_ids) + second_embeds = self.language_model.llama.embed_tokens(second_input_ids) + image_features = paddle.cast(image_features, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, image_features, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + if image_attention_mask is None: + image_attention_mask = paddle.ones(image_features.shape[:-1], dtype="int64") + + attention_mask = paddle.concat([first_attention_mask, image_attention_mask, second_attention_mask], axis=1) + + outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, attention_mask=attention_mask, **generate_kwargs + ) + + return outputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/processing.py new file mode 100644 index 000000000..e31d7ef72 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/minigpt4/processing.py @@ -0,0 +1,245 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for MiniGPT4. 
+""" + +from typing import List, Optional, Union + +import numpy as np +import paddle +from PIL import Image + +from ..image_processing_utils import BatchFeature +from ..image_utils import ImageInput +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding, TensorType, TextInput + +__all__ = [ + "MiniGPT4Processor", +] + + +class MiniGPT4Processor(ProcessorMixin): + r""" + Constructs a MiniGPT4 processor which wraps a MiniGPT4 image processor and an llama tokenizer into a single processor. + [`MiniGPT4Processor`] offers all the functionalities of [`MiniGPT4ImageProcessor`] and [`LlamaTokenizer`]. See the docstring + of [`~MiniGPT4ImageProcessor.__call__`] and [`~LlamaTokenizer.decode`] for more information. + + Args: + image_processor (`MiniGPT4ImageProcessor`): + An instance of [`MiniGPT4ImageProcessor`]. The image processor is a required input. + tokenizer (`LlamaTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + + >>> import paddle + >>> from paddlenlp.transformers import MiniGPT4Processor + + >>> # load processor + >>> minigpt4_13b_path = "model_name" + >>> processor = MiniGPT4Processor.from_pretrained(minigpt4_13b_path) + >>> print("load processor and model done!") + + >>> # prepare model inputs for MiniGPT4 + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + >>> res = processor([image], text, prompt) + ```""" + attributes = ["image_processor", "tokenizer"] + image_processor_class = "MiniGPT4ImageProcessor" + tokenizer_class = "LlamaTokenizer" + + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + tokenizer.model_input_names = ["input_ids", "attention_mask"] + tokenizer.padding_side = "right" + tokenizer.pad_token = tokenizer.eos_token + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self.default_prompt = "###Human: ###Assistant: " + self.image_tag = "" + self.text_tag = "" + + def process_images( + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model. + Please refer to the docstring of the method for more information. 
+ """ + if not images: + raise ValueError("You have to input correct images.") + + if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): + images = [images] + + # processing with image processor + processed_images = self.image_processor(images, return_tensors=return_tensors) + + return processed_images + + def process_texts( + self, + texts: Union[TextInput, List[TextInput]], + prompts: Union[TextInput, List[TextInput]] = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ): + prompts = prompts if prompts is not None else [self.default_prompt] + + if (not isinstance(texts, TextInput)) and (not isinstance(texts, list)): + raise TypeError("Unsupported type for texts: {}, only str and list type supported.".format(type(texts))) + if prompts is not None and (not isinstance(prompts, TextInput)) and (not isinstance(prompts, list)): + raise TypeError( + "Unsupported type for prompts: {}, only str and list type supported.".format(type(prompts)) + ) + + if isinstance(prompts, list): + if isinstance(texts, list) and len(prompts) != len(texts): + raise ValueError( + "The length of prompts not is equal to texts' length: {} != {}".format(len(prompts), len(texts)) + ) + elif isinstance(texts, TextInput): + texts = [texts] * len(prompts) + else: + if isinstance(texts, TextInput): + texts = [texts] + prompts = [prompts] + else: + prompts = [prompts] * len(texts) + + assemble_texts = [] + for text, prompt in zip(texts, prompts): + if self.image_tag not in text: + if self.image_tag not in prompt: + raise ValueError( + "A prompt should contain a image tag `{}` to insert image embeddings. if you don't want to use prompt function, you have to input a text with the image tag `{}`.".format( + self.image_tag, self.image_tag + ) + ) + if self.text_tag not in prompt: + raise ValueError( + "A prompt should contain a text tag `{}` to insert text information.".format(self.text_tag) + ) + assemble_texts.append(prompt.replace(self.text_tag, text)) + else: + assemble_texts.append(text) + + # processing with text tokenizer + first_texts, second_texts = zip(*[assemble_text.split(self.image_tag) for assemble_text in assemble_texts]) + first_text_encoding = self.tokenizer( + text=first_texts, return_tensors=return_tensors, add_special_tokens=True, **kwargs + ) + second_text_encoding = self.tokenizer( + text=second_texts, return_tensors=return_tensors, add_special_tokens=False, **kwargs + ) + + encoded_texts = BatchEncoding( + { + "first_input_ids": first_text_encoding["input_ids"], + "first_attention_mask": first_text_encoding["attention_mask"], + "second_input_ids": second_text_encoding["input_ids"], + "second_attention_mask": second_text_encoding["attention_mask"], + } + ) + return encoded_texts + + def __call__( + self, + images: ImageInput = None, + text: str = None, + prompt: str = None, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses [`MiniGPT4ImageProcessor.__call__`] method to prepare image(s) for the model, and + [`LlamaTokenizer.__call__`] to prepare text for the model. + Please refer to the docstring of the above two methods for more information. 
+ """ + prompt = prompt if prompt is not None else self.default_prompt + + if images is None and text is None: + raise ValueError("Images and text are None, you have to specify either images or texts.") + if images is not None and not isinstance(images, (Image.Image, np.ndarray, paddle.Tensor, list)): + raise TypeError( + "A type in [Image.Image, np.ndarray, paddle.Tensor, list] for images is expected, but received {}.".format( + type(images) + ) + ) + if text is not None and not isinstance(text, str): + raise TypeError("A str type of text is expected, but received {}.".format(type(text))) + if prompt is not None and not isinstance(prompt, str): + raise TypeError("A str type of prompt is expected, but received {}.".format(type(prompt))) + + if images is not None and not isinstance(images, list): + images = [images] + if text is not None and images is not None: + texts = [text] * len(images) + prompts = [prompt] * len(images) + elif text is not None and images is None: + texts = [text] + prompts = [prompt] + + # image-only mode + if text is None: + # processing with image processor + processed_features = self.process_images(images, return_tensors=return_tensors, **kwargs) + return processed_features + + # text-only mode + if images is None: + # processing with text tokenizer + encoded_texts = self.process_texts(texts, prompts, **kwargs) + return encoded_texts + + # text-image mode + processed_features = self.image_processor(images, return_tensors=return_tensors) + encoded_texts = self.process_texts(texts, prompts, **kwargs) + processed_features.update(encoded_texts) + + return processed_features + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/__init__.py new file mode 100644 index 000000000..0b41cc3d8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .configuration import MistralConfig +from .modeling import MistralForCausalLM diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/configuration.py new file mode 100644 index 000000000..11237e5c8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/configuration.py @@ -0,0 +1,69 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mistral model configuration""" + +from ..configuration_utils import PretrainedConfig + + +class MistralConfig(PretrainedConfig): + model_type = "mistral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=4096 * 32, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + sliding_window=4096, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/modeling.py new file mode 100644 index 000000000..f973390f0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mistral/modeling.py @@ -0,0 +1,962 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
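For experiments it can help to instantiate a much smaller layout than the 7B-scale defaults above. A sketch only; the import path follows the `paddlenlp.transformers` convention used in the docstrings of this patch and may differ for this vendored copy, and the sizes are arbitrary:

```python
from paddlenlp.transformers import MistralConfig

# A deliberately tiny configuration. num_key_value_heads < num_attention_heads
# enables grouped-query attention: here 4 query heads share each KV head.
config = MistralConfig(
    vocab_size=32000,
    hidden_size=512,
    intermediate_size=1408,
    num_hidden_layers=4,
    num_attention_heads=8,
    num_key_value_heads=2,
    max_position_embeddings=4096,
    sliding_window=256,
)
```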
+import math +import warnings +from functools import partial +from typing import List, Optional, Tuple, Union + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.transformers.conversion_utils import ( + StateDictNameMapping, + init_name_mappings, +) +from paddlenlp.utils.log import logger + +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithCrossAttentions, + CausalLMOutputWithPast, +) +from ..model_utils import PretrainedModel +from .configuration import MistralConfig + + +def _make_causal_mask( + input_ids_shape: paddle.shape, + dtype: paddle.dtype, + past_key_values_length: int = 0, +): + """ + Make causal mask used for sliding window attention + """ + bsz, tgt_len = input_ids_shape + + tensor = paddle.full( + (tgt_len, tgt_len), + fill_value=1, + ) + mask = paddle.tril(tensor, diagonal=0) + mask = paddle.log(mask).astype(dtype) + + if past_key_values_length > 0: + mask = paddle.concat([paddle.zeros([tgt_len, past_key_values_length], dtype=dtype), mask], axis=-1) + return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) + + +def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): + expanded_mask = mask + if len(mask.shape) == 2: + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).astype(dtype) + elif len(mask.shape) == 3: + """ + Expands attention_mask from `[bsz, tgt_seq_len, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + expanded_mask = mask.unsqueeze(1).astype(dtype) + + inverted_mask = 1.0 - expanded_mask + + return paddle.where(inverted_mask > 0.5, paddle.full_like(inverted_mask, paddle.finfo(dtype).min), inverted_mask) + + +class MistralRMSNorm(nn.Layer): + def __init__(self, hidden_size, eps=1e-6): + """ + MistralRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = paddle.create_parameter( + shape=[hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.astype(paddle.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.astype(input_dtype) + + +class MistralRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.inv_freq = 1.0 / (self.base ** (paddle.arange(0, self.dim, 2).astype("float32") / self.dim)) + + # Build here to make `paddle.jit.trace` work. 
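Two of the building blocks introduced above are easy to check in isolation: `_make_causal_mask` turns a lower-triangular 0/1 matrix into an additive mask via `log` (0 on allowed positions, `-inf` on future ones), and `MistralRMSNorm` rescales by the reciprocal RMS over the hidden dimension. The NumPy re-derivation below is a standalone sketch, not the Paddle implementation:

```python
import numpy as np

# _make_causal_mask: log of a 0/1 lower-triangular matrix gives 0 on allowed
# positions and -inf above the diagonal; it is later added to attention scores.
tgt_len = 4
with np.errstate(divide="ignore"):
    causal = np.log(np.tril(np.ones((tgt_len, tgt_len))))
print(causal)

# MistralRMSNorm: divide by the RMS over the last axis, then apply a learned
# per-channel weight (initialized to 1 in the layer above).
def rms_norm(x, weight, eps=1e-6):
    variance = np.mean(x.astype(np.float32) ** 2, axis=-1, keepdims=True)
    return weight * (x * (1.0 / np.sqrt(variance + eps)))

x = np.random.randn(2, 8).astype(np.float32)
out = rms_norm(x, np.ones(8, dtype=np.float32))
print(np.sqrt((out ** 2).mean(axis=-1)))  # ~1.0 per row
```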
+ self._set_cos_sin_cache(seq_len=max_position_embeddings, dtype=paddle.get_default_dtype()) + + def _set_cos_sin_cache(self, seq_len, dtype): + self.max_seq_len_cached = seq_len + t = paddle.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype) + + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = paddle.concat((freqs, freqs), axis=-1) + self.cos_cached = emb.cos().astype(dtype) + self.sin_cached = emb.sin().astype(dtype) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].astype(dtype=x.dtype), + self.sin_cached[:seq_len].astype(dtype=x.dtype), + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat((-x2, x1), axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + cos = cos[position_ids].unsqueeze(1) # [seq_len, dim] -> [batch_size, 1, seq_len, head_dim] + sin = sin[position_ids].unsqueeze(1) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MistralMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + if config.tensor_parallel_degree > 1: + self.gate_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + + self.down_proj = mpu.RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(x, axis=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand([batch, num_key_value_heads, n_rep, slen, head_dim]) + return hidden_states.reshape([batch, num_key_value_heads * n_rep, slen, head_dim]) + + +class MistralAttention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". 
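The rotary-embedding pieces above (`_set_cos_sin_cache`, `rotate_half`, `apply_rotary_pos_emb`) amount to rotating each (first-half, second-half) channel pair by a position-dependent angle. The NumPy sketch below mirrors that cache construction for a single head and checks that the rotation preserves per-position norms; it is illustrative only:

```python
import numpy as np

dim, base, seq_len = 8, 10000.0, 6
inv_freq = 1.0 / (base ** (np.arange(0, dim, 2, dtype=np.float32) / dim))
t = np.arange(seq_len, dtype=np.float32)
freqs = np.einsum("i,j->ij", t, inv_freq)        # [seq_len, dim/2]
emb = np.concatenate([freqs, freqs], axis=-1)    # [seq_len, dim], as in the cache
cos, sin = np.cos(emb), np.sin(emb)

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return np.concatenate([-x2, x1], axis=-1)

q = np.random.randn(seq_len, dim).astype(np.float32)
q_rot = q * cos + rotate_half(q) * sin

# RoPE is a pure rotation of channel pairs, so per-position norms are unchanged.
print(np.allclose(np.linalg.norm(q, axis=-1),
                  np.linalg.norm(q_rot, axis=-1), atol=1e-5))
```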
+ """ + + def __init__(self, config: MistralConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + if config.tensor_parallel_degree > 1: + if self.num_key_value_heads % config.tensor_parallel_degree != 0: + raise ValueError( + f"num_key_value_heads must be divisible by tensor_parallel_degree (got `num_key_value_heads`: {self.num_key_value_heads}" + f" and `tensor_parallel_degree`: {config.tensor_parallel_degree})." + ) + + self.q_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.num_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.k_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.v_proj = mpu.ColumnParallelLinear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + else: + self.q_proj = nn.Linear( + self.hidden_size, + self.num_heads * self.head_dim, + bias_attr=False, + ) + self.k_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + if config.tensor_parallel_degree > 1: + self.o_proj = mpu.RowParallelLinear( + self.num_heads * self.head_dim, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + self.num_heads = self.num_heads // config.tensor_parallel_degree + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + else: + self.o_proj = nn.Linear( + self.num_heads * self.head_dim, + self.hidden_size, + bias_attr=False, + ) + + self.rotary_emb = MistralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + bsz, q_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + key_states = key_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose([0, 2, 1, 3]) + value_states = value_states.reshape([bsz, q_len, self.num_key_value_heads, self.head_dim]).transpose( + [0, 2, 1, 3] + ) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if 
past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if not self.config.use_flash_attention: + attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2])) / math.sqrt(self.head_dim) + + if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of size {[bsz, self.num_heads, q_len, kv_seq_len]}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is not None: + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of size {[bsz, 1, q_len, kv_seq_len]}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32).astype( + query_states.dtype + ) + attn_output = paddle.matmul(attn_weights, value_states) + else: + query_states = query_states.transpose([0, 2, 1, 3]) + key_states = key_states.transpose([0, 2, 1, 3]) + value_states = value_states.transpose([0, 2, 1, 3]) + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]: + raise ValueError( + f"`attn_output` should be of size {[bsz, self.num_heads, q_len, self.head_dim]}, but is" + f" {attn_output.shape}" + ) + + attn_output = attn_output.transpose([0, 2, 1, 3]) + attn_output = attn_output.reshape([bsz, q_len, self.num_heads * self.head_dim]) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class MistralDecoderLayer(nn.Layer): + def __init__(self, config: MistralConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = MistralAttention(config=config) + self.mlp = MistralMLP(config) + self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class MistralPreTrainedModel(PretrainedModel): + config_class = MistralConfig + base_model_prefix = "mistral" + + @classmethod + def _get_name_mappings(cls, config: MistralConfig) -> List[StateDictNameMapping]: + mappings: List[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "mistral." + mapping[1] + + if "MistralModel" not in config.architectures: + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: MistralConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + # Column Linear + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. 
+ if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, MistralMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, MistralAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +class MistralModel(MistralPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MistralDecoderLayer`] + + Args: + config: MistralConfig + """ + + def __init__(self, config: MistralConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + if config.tensor_parallel_degree > 1: + self.embed_tokens = mpu.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + config.vocab_size, + config.hidden_size, + self.padding_idx, + ) + self.layers = nn.LayerList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.enable_recompute = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + position_ids = paddle.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 + ) + position_ids = position_ids.unsqueeze(0).expand((batch_size, seq_length)) + else: + position_ids = position_ids.reshape([-1, seq_length]).astype("int64") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) 
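As an aside on the length bookkeeping in this forward pass: when a KV cache is present, the new tokens get `position_ids` starting at the cached length, and the combined attention mask covers `seq_length + past_key_values_length` key positions. The plain-Python sketch below only illustrates those shapes; it involves no tensors and is not part of the patch:

```python
# Illustrative bookkeeping for cached (incremental) decoding.
def decode_step_shapes(batch_size, seq_length, past_key_values_length):
    kv_len = seq_length + past_key_values_length
    position_ids = list(range(past_key_values_length, kv_len))
    attention_mask_shape = (batch_size, 1, seq_length, kv_len)
    return position_ids, attention_mask_shape

# Prefill: 5 prompt tokens, no cache yet.
print(decode_step_shapes(batch_size=2, seq_length=5, past_key_values_length=0))
# ([0, 1, 2, 3, 4], (2, 1, 5, 5))

# One generation step: a single new token attends to itself plus 5 cached tokens.
print(decode_step_shapes(batch_size=2, seq_length=1, past_key_values_length=5))
# ([5], (2, 1, 1, 6))
```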
+ + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + if self.enable_recompute and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if self.enable_recompute and has_gradient: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +def parallel_matmul(x: paddle.Tensor, y: paddle.Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +class MistralLMHead(nn.Layer): + def __init__(self, config: MistralConfig): + super(MistralLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + 
) + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class MistralPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for Llama. + It calculates the final loss. + """ + + def __init__(self, config): + + super(MistralPretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32") + loss = paddle.mean(masked_lm_loss) + + return loss + + +class MistralForCausalLM(MistralPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.mistral = MistralModel(config) + self.vocab_size = config.vocab_size + self.lm_head = MistralLMHead(config) + self.criterion = MistralPretrainingCriterion(config) + + def get_input_embeddings(self): + return self.mistral.embed_tokens + + def set_input_embeddings(self, value): + self.mistral.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.mistral = decoder + + def get_decoder(self): + return self.mistral + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, 
is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs.pop("attention_mask", None) + + if attention_mask is not None and len(attention_mask.shape) == 2: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.mistral( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.astype("float32") + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/__init__.py new file mode 100644 index 000000000..816c416d6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import MixtralConfig +from .modeling import MixtralForCausalLM diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/configuration.py new file mode 100644 index 000000000..e09b8b42a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/configuration.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Mixtral model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "MixtralConfig", +] + + +class MixtralConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`~MixtralModel`]. It is used to instantiate an Mixtral + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Mixtral-7B-v0.1 or Mixtral-7B-Instruct-v0.1. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the Mixtral model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`~MixtralModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to `4096*32`): + The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention + allows sequence of up to 4096*32 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings. + rope_theta (`float`, *optional*, defaults to 1000000.0): + The base period of the RoPE embeddings. + sliding_window (`int`, *optional*): + Sliding window attention window size. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + num_experts_per_tok (`int`, *optional*, defaults to 2): + The number of experts to root per-token, can be also interpreted as the `top-p` routing + parameter + num_local_experts (`int`, *optional*, defaults to 8): + Number of experts per Sparse MLP layer. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabeling this will also + allow the model to output the auxiliary loss. See [here]() for more details + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. 
+ Example: + ```python + >>> from paddlenlp.transformer import MixtralModel, MixtralConfig + + >>> # Initializing a Mixtral mixtral-7b style configuration + >>> configuration = MixtralConfig() + + >>> # Initializing a model from the mixtral-7b style configuration + >>> model = MixtralModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "mixtral" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=14336, + max_position_embeddings=4096 * 32, + seq_length=2048, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + attention_dropout=0.0, + rope_theta=1e6, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + num_experts_per_tok=2, + num_local_experts=8, + router_aux_loss_coef=0.001, + output_router_logits=False, + sliding_window=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.attention_dropout = attention_dropout + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + self.rope_theta = rope_theta + + # ----------------- Experts -------------------- # + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.router_aux_loss_coef = router_aux_loss_coef + self.output_router_logits = output_router_logits + + self.sliding_window = sliding_window + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/modeling.py new file mode 100644 index 000000000..5c6b535da --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mixtral/modeling.py @@ -0,0 +1,1535 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
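The MoE knobs documented above (`num_local_experts`, `num_experts_per_tok`, `router_aux_loss_coef`, `output_router_logits`) boil down to softmax routing, top-k expert selection with renormalized weights, and a Switch-Transformer-style balancing penalty. The NumPy sketch below is a simplified, standalone illustration of that arithmetic; the patch's `load_balancing_loss_func` further below is the full version (including the padding-mask handling):

```python
import numpy as np

num_local_experts, num_experts_per_tok, num_tokens = 8, 2, 16
rng = np.random.default_rng(0)
router_logits = rng.normal(size=(num_tokens, num_local_experts)).astype(np.float32)

# Softmax over experts, then keep the top-k per token and renormalize to sum to 1.
probs = np.exp(router_logits - router_logits.max(axis=-1, keepdims=True))
probs /= probs.sum(axis=-1, keepdims=True)
top_idx = np.argsort(-probs, axis=-1)[:, :num_experts_per_tok]
top_w = np.take_along_axis(probs, top_idx, axis=-1)
top_w /= top_w.sum(axis=-1, keepdims=True)

# Simplified balancing loss: fraction of routing slots per expert times its
# average router probability, summed and scaled by the number of experts.
tokens_per_expert = np.bincount(top_idx.ravel(), minlength=num_local_experts) / top_idx.size
router_prob_per_expert = probs.mean(axis=0)
aux_loss = num_local_experts * np.sum(tokens_per_expert * router_prob_per_expert)
print(top_w.sum(axis=-1))   # each row sums to 1
print(aux_loss)             # ~1 when routing is perfectly balanced
```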
+"""Paddle Mixtral model""" +from __future__ import annotations + +import math +import warnings +from functools import partial +from typing import Optional, Tuple + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +from ...utils.log import logger +from .. import linear_utils +from ..activations import ACT2FN +from ..conversion_utils import StateDictNameMapping, init_name_mappings +from ..linear_utils import Linear +from ..model_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPast +from ..model_utils import PretrainedModel, register_base_model +from .configuration import MixtralConfig + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +__all__ = [ + "MixtralModel", + "MixtralPretrainedModel", + "MixtralForCausalLM", + "MixtralPretrainingCriterion", +] + + +def load_balancing_loss_func(gate_logits, num_experts, top_k=2, attention_mask=None): + """ + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Paddle. + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + Args: + gate_logits (Union[`paddle.Tensor`, Tuple[paddle.Tensor]): + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + num_experts (`int`): + Number of experts. + top_k (`int`): + Number of top k experts to be considered for the loss computation. + attention_mask (`paddle.Tensor`, None): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. + Returns: + The auxiliary loss. + """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + if isinstance(gate_logits, tuple): + concatenated_gate_logits = paddle.concat( + gate_logits, axis=0 + ) # [num_hidden_layers X batch_size X sequence_length, num_experts] + + routing_weights = F.softmax(concatenated_gate_logits, axis=-1) + _, selected_experts = paddle.topk(routing_weights, top_k, axis=-1) + expert_mask = F.one_hot( + selected_experts, num_classes=num_experts + ) # [num_hidden_layers X batch_size X sequence_length, top_k, num_experts] + + if attention_mask is None or len(attention_mask.shape) == 4: + # Only intokens strategy has 4-D attention_mask, we currently do not support excluding padding tokens. + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.mean(expert_mask.astype("float32"), axis=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.mean(routing_weights, axis=0) + else: + # Exclude the load balancing loss of padding tokens. 
+ if len(attention_mask.shape) == 2: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape([-1, top_k, num_experts]) + ) # [num_hidden_layers * batch_size * sequence_length, top_k, num_experts] + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.sum(expert_mask.astype("float32") * expert_attention_mask, axis=0) / paddle.sum( + expert_attention_mask, axis=0 + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape([-1, num_experts]) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.sum( + routing_weights * router_per_expert_attention_mask, axis=0 + ) / paddle.sum(router_per_expert_attention_mask, axis=0) + + overall_loss = paddle.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in range(num_heads_per_card): + assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. 
+ else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + training=True, + sequence_parallel=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=config.attention_dropout if training else 0.0, + training=training, + ) + attn_weights = None + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + attn_weights = F.softmax(attn_weights, 
axis=-1, dtype="float32").astype(query_states.dtype) + else: + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_weights = F.dropout(attn_weights, p=config.attention_dropout, training=training) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class MixtralRMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +class MixtralRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len) + cos = self.cos_cached[:, :seq_len, :, :] + sin = self.sin_cached[:, :seq_len, :, :] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + + if position_ids is None: + # Note: Only for MixtralForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class MixtralMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.tensor_parallel_degree = config.tensor_parallel_degree + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.w1 = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.w3 = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.w2 = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.w1 = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.w2 
= Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.w3 = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + x = self.act_fn(self.w1(x)) * self.w3(x) + x = self.w2(x) + return x + + +class MixtralSparseMoeBlock(nn.Layer): + def __init__(self, config: MixtralConfig): + super().__init__() + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.num_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + self.gate = Linear(self.hidden_dim, self.num_experts, bias_attr=False) + self.experts = nn.LayerList([MixtralMLP(config) for _ in range(self.num_experts)]) + + def forward(self, hidden_states): + batch_size, seq_len, hidden_dim = hidden_states.shape + hidden_states = hidden_states.reshape([-1, hidden_dim]) + # router_logits: [batch_size * seq_len, num_experts] + router_logits = self.gate(hidden_states) + + with paddle.amp.auto_cast(False): + routing_weights = F.softmax(router_logits.astype("float32"), axis=1) + routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) + routing_weights /= routing_weights.sum(axis=-1, keepdim=True) + # we cast back to input dtype + routing_weights = routing_weights.astype(hidden_states.dtype) + + final_hidden_states = paddle.zeros( + [batch_size * seq_len, hidden_dim], + dtype=hidden_states.dtype, + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated. + # shape: [num_experts, top_k, batch_size * seq_len] + expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0]) + + # Loop over all available experts in the model and perform the computation on each expert. 
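+        # Shape sketch of the dispatch below (illustrative numbers, not taken from any real config):
+        # with num_experts=8, top_k=2 and N = batch_size * seq_len tokens,
+        #   selected_experts: [N, 2]    -> expert indices chosen per token
+        #   expert_mask:      [8, 2, N] -> one-hot over experts, transposed for per-expert lookup
+        # paddle.where(expert_mask[e]) then yields the (top_k slot, token index) pairs handled by
+        # expert e; their hidden states are gathered, run through the expert, scaled by the routing
+        # weight, and scattered back into final_hidden_states with index_add_.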
+ for expert_id in range(self.num_experts): + expert_layer = self.experts[expert_id] + idx, top_x = paddle.where(expert_mask[expert_id]) + + if top_x.shape[0] == 0: + continue + + current_state = paddle.gather(hidden_states, top_x.squeeze()) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx] + + top_x = top_x.squeeze() + if top_x.shape == []: + top_x = paddle.to_tensor([top_x.item()]) + final_hidden_states.index_add_(top_x, 0, current_hidden_states.astype(hidden_states.dtype)) + + final_hidden_states = final_hidden_states.reshape([batch_size, seq_len, hidden_dim]) + return final_hidden_states, router_logits + + +class MixtralAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: MixtralConfig, layerwise_recompute: bool = False): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + self.rope_theta = config.rope_theta + self.max_position_embeddings = config.max_position_embeddings + self.seq_length = config.seq_length + self.sequence_parallel = config.sequence_parallel + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + gather_output=False, + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + else: + self.q_proj = Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + self.k_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + self.v_proj = Linear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + bias_attr=False, + ) + + if config.tensor_parallel_degree > 1: + self.o_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + else: + self.o_proj = Linear( + self.hidden_size, + self.hidden_size, + bias_attr=False, + ) + + self.rotary_emb = MixtralRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + self.config = config + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states 
= paddle.concat([past_key_value[1], value_states], axis=1) + + past_key_value = (key_states, value_states) if use_cache else None + + # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class MixtralDecoderLayer(nn.Layer): + def __init__(self, config, layerwise_recompute: bool = False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.self_attn = MixtralAttention(config, layerwise_recompute) + self.block_sparse_moe = MixtralSparseMoeBlock(config) + self.input_layernorm = MixtralRMSNorm(config) + self.post_attention_layernorm = MixtralRMSNorm(config) + self.sequence_parallel = config.sequence_parallel + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. 
+ use_cache (`bool`, *optional*): + If set to `True`, `cache` key value states are returned and can be used to speed up decoding + (see `cache`). + cache (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states, router_logits = self.block_sparse_moe(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class MixtralPretrainedModel(PretrainedModel): + config_class = MixtralConfig + base_model_prefix = "mixtral" + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: MixtralConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + for expert_idx in range(config.num_local_experts): + expert_mappings = [ + [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w1.weight", None, "transpose"], + [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w2.weight", None, "transpose"], + [f"layers.{layer_index}.block_sparse_moe.experts.{expert_idx}.w3.weight", None, "transpose"], + ] + model_mappings.extend(expert_mappings) + model_mappings.append([f"layers.{layer_index}.block_sparse_moe.gate.weight", None, "transpose"]) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "MixtralModel" + if "MixtralModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "mixtral." 
+ mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: MixtralConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers, num_local_experts): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + + # Column Linear + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + # Add tp split for expert params. + base_actions = { + "layers.0.block_sparse_moe.experts.0.w1.weight": partial(fn, is_column=True), + "layers.0.block_sparse_moe.experts.0.w2.weight": partial(fn, is_column=False), + "layers.0.block_sparse_moe.experts.0.w3.weight": partial(fn, is_column=True), + } + for key, action in base_actions.items(): + for i in range(num_layers): + newkey = key.replace("layers.0.", f"layers.{i}.") + for j in range(num_local_experts): + newkey2 = newkey.replace("experts.0.", f"experts.{j}.") + final_actions[newkey2] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_local_experts) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + MixtralLMHead, + linear_utils.ColumnSequenceParallelLinear, + linear_utils.RowSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
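+                # The reset below draws weights from a normal distribution N(0, initializer_range^2);
+                # a minimal sketch of the equivalent call (0.02 is only the usual default here, not a
+                # value read from this config) would be:
+                #   layer.weight.set_value(paddle.tensor.normal(mean=0.0, std=0.02, shape=layer.weight.shape))
+                # Distributed (tensor-parallel) weights additionally enter the rng_tracker context so
+                # each rank initializes its shard from the tracked model-parallel random state.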
+ if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.mixtral.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.mixtral.config.initializer_range, + shape=layer.weight.shape, + ) + ) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, MixtralMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.w2.weight.scale_(factor) + if isinstance(layer, MixtralAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class MixtralModel(MixtralPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`] + Args: + config: MixtralConfig + """ + + def __init__(self, config: MixtralConfig): + super().__init__(config) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [MixtralDecoderLayer(config, i not in self.no_recompute_layers) for i in range(config.num_hidden_layers)] + ) + self.norm = MixtralRMSNorm(config) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + 
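+        # A concrete example of the conversion (float16 is assumed here purely for illustration):
+        #   True  (attend) -> 0.0
+        #   False (masked) -> paddle.finfo(paddle.float16).min == -65504.0
+        # so masked positions get a huge negative score and effectively vanish after the softmax.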
expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + output_router_logits: bool, + past_key_value: Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + output_router_logits: Optional[bool] = None, + return_dict=False, + **kwargs, + ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + if self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual: + 
attention_mask = None + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_router_logits: + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoEModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + +class MixtralPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for Mixtral. + It calculates the final loss. 
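+    Labels equal to ``ignore_index`` produce zero loss and are filtered out before averaging,
+    so the returned value is the mean loss over the remaining (non-ignored) tokens.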
+ """ + + def __init__(self, config): + + super(MixtralPretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class MixtralLMHead(nn.Layer): + def __init__(self, config: MixtralConfig): + super(MixtralLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! 
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class MixtralForCausalLM(MixtralPretrainedModel): + enable_to_static_method = True + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.mixtral = MixtralModel(config) + self.lm_head = MixtralLMHead(config) + self.criterion = MixtralPretrainingCriterion(config) + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.num_local_experts + self.num_experts_per_tok = config.num_experts_per_tok + + if config.sliding_window is not None: + logger.warning("We do not support sliding window attention for now.") + + def get_input_embeddings(self): + return self.mixtral.embed_tokens + + def set_input_embeddings(self, value): + self.mixtral.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.mixtral = decoder + + def get_decoder(self): + return self.mixtral + + def prepare_inputs_for_generation( + self, + input_ids, + use_cache=False, + past_key_values=None, + inputs_embeds=None, + output_router_logits=False, + **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "output_router_logits": output_router_logits, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, MoECausalLMOutputWithPast) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, 
position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + output_router_logits: Optional[bool] = None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.mixtral( + input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoECausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/configuration.py new file mode 100644 index 000000000..0675a9ff5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/configuration.py @@ -0,0 +1,185 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MobileBert model configuration""" +from __future__ import annotations + +from ..configuration_utils import PretrainedConfig + +__all__ = ["MOBILEBERT_PRETRAINED_INIT_CONFIGURATION", "MobileBertConfig", "MOBILEBERT_PRETRAINED_RESOURCE_FILES_MAP"] + +MOBILEBERT_PRETRAINED_INIT_CONFIGURATION = { + "mobilebert-uncased": { + "attention_probs_dropout_prob": 0.1, + "classifier_activation": False, + "embedding_size": 128, + "hidden_act": "relu", + "hidden_dropout_prob": 0.0, + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 512, + "intra_bottleneck_size": 128, + "key_query_shared_bottleneck": True, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "mobilebert", + "normalization_type": "no_norm", + "num_attention_heads": 4, + "num_feedforward_networks": 4, + "num_hidden_layers": 24, + "pad_token_id": 0, + "transformers_version": "4.6.0.dev0", + "trigram_input": True, + "true_hidden_size": 128, + "type_vocab_size": 2, + "use_bottleneck": True, + "use_bottleneck_attention": False, + "vocab_size": 30522, + } +} +MOBILEBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "mobilebert-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/mobilebert/mobilebert-uncased/model_state.pdparams" + } +} + + +class MobileBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~paddlenlp.transformers.MobileBertModel`. + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the MobileBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`MobileBertModel`]. + hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. 
+ intermediate_size (`int`, *optional*, defaults to 512): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`MobileBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + The ID of the token in the word embedding to use as padding. + embedding_size (`int`, *optional*, defaults to 128): + The dimension of the word embedding vectors. + trigram_input (`bool`, *optional*, defaults to `True`): + Use a convolution of trigram as input. + use_bottleneck (`bool`, *optional*, defaults to `True`): + Whether to use bottleneck in BERT. + intra_bottleneck_size (`int`, *optional*, defaults to 128): + Size of bottleneck layer output. + use_bottleneck_attention (`bool`, *optional*, defaults to `False`): + Whether to use attention inputs from the bottleneck transformation. + key_query_shared_bottleneck (`bool`, *optional*, defaults to `True`): + Whether to use the same linear transformation for query&key in the bottleneck. + num_feedforward_networks (`int`, *optional*, defaults to 4): + Number of FFNs in a block. + normalization_type (`str`, *optional*, defaults to `"no_norm"`): + The normalization type in MobileBERT. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. 
+ + Examples: + ```python + >>> from paddlenlp.transformers import MobileBertConfig, MobileBertModel + >>> # Initializing a MobileBERT configuration + >>> configuration = MobileBertConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = MobileBertModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + model_type = "mobilebert" + pretrained_init_configuration = MOBILEBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = MOBILEBERT_PRETRAINED_RESOURCE_FILES_MAP + keys_to_ignore_at_inference = ["pooled_output"] + + def __init__( + self, + vocab_size=30522, + hidden_size=512, + num_hidden_layers=24, + num_attention_heads=4, + intermediate_size=512, + hidden_act="relu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + embedding_size=128, + true_hidden_size=128, + normalization_type="no_norm", + use_bottleneck=True, + use_bottleneck_attention=False, + intra_bottleneck_size=128, + key_query_shared_bottleneck=True, + num_feedforward_networks=4, + trigram_input=True, + classifier_activation=False, + classifier_dropout=None, + add_pooling_layer=True, + **kwargs + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.pad_token_id = pad_token_id + self.embedding_size = embedding_size + self.true_hidden_size = true_hidden_size + self.normalization_type = normalization_type + self.use_bottleneck = use_bottleneck + self.use_bottleneck_attention = use_bottleneck_attention + self.intra_bottleneck_size = intra_bottleneck_size + self.key_query_shared_bottleneck = key_query_shared_bottleneck + self.num_feedforward_networks = num_feedforward_networks + self.trigram_input = trigram_input + self.classifier_activation = classifier_activation + if self.use_bottleneck: + self.true_hidden_size = intra_bottleneck_size + else: + self.true_hidden_size = hidden_size + + self.classifier_dropout = classifier_dropout + self.add_pooling_layer = add_pooling_layer diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/modeling.py new file mode 100644 index 000000000..f2c0a0861 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/modeling.py @@ -0,0 +1,1194 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +from paddle import Tensor + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, +) +from .configuration import ( + MOBILEBERT_PRETRAINED_INIT_CONFIGURATION, + MOBILEBERT_PRETRAINED_RESOURCE_FILES_MAP, + MobileBertConfig, +) + +__all__ = [ + "MobileBertModel", + "MobileBertPretrainedModel", + "MobileBertForPreTraining", + "MobileBertForSequenceClassification", + "MobileBertForQuestionAnswering", +] + + +class NoNorm(nn.Layer): + def __init__(self, feat_size, eps=None): + super().__init__() + if isinstance(feat_size, int): + feat_size = [feat_size] + self.bias = paddle.create_parameter(feat_size, "float32", is_bias=True) + self.weight = paddle.create_parameter( + feat_size, "float32", default_initializer=paddle.nn.initializer.Constant(value=1.0) + ) + + def forward(self, input_tensor): + return input_tensor * self.weight + self.bias + + +NORM2FN = {"layer_norm": nn.LayerNorm, "no_norm": NoNorm} + + +class MobileBertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.trigram_input = config.trigram_input + self.embedding_size = config.embedding_size + self.hidden_size = config.hidden_size + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + embed_dim_multiplier = 3 if self.trigram_input else 1 + embedded_input_size = self.embedding_size * embed_dim_multiplier + self.embedding_transformation = nn.Linear(embedded_input_size, config.hidden_size) + + self.layer_norm = NORM2FN[config.normalization_type](config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.trigram_input: + # From the paper MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited + # Devices (https://arxiv.org/abs/2004.02984) + # + # The embedding table in BERT models accounts 
for a substantial proportion of model size. To compress + # the embedding layer, we reduce the embedding dimension to 128 in MobileBERT. + # Then, we apply a 1D convolution with kernel size 3 on the raw token embedding to produce a 512 + # dimensional output. + inputs_embeds = paddle.concat( + [ + nn.functional.pad(inputs_embeds[:, 1:], [0, 0, 0, 1, 0, 0], value=0), + inputs_embeds, + nn.functional.pad(inputs_embeds[:, :-1], [0, 0, 1, 0, 0, 0], value=0), + ], + axis=2, + ) + if self.trigram_input or self.embedding_size != self.hidden_size: + inputs_embeds = self.embedding_transformation(inputs_embeds) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class MobileBertAttention(nn.Layer): + def __init__(self, config): + super().__init__() + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.true_hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.query = nn.Linear(config.true_hidden_size, self.all_head_size) + self.key = nn.Linear(config.true_hidden_size, self.all_head_size) + self.value = nn.Linear( + config.true_hidden_size if config.use_bottleneck_attention else config.hidden_size, + self.all_head_size, + ) + + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.use_bottleneck = config.use_bottleneck + self.dense = nn.Linear(config.true_hidden_size, config.true_hidden_size) + self.layer_norm = NORM2FN[config.normalization_type](config.true_hidden_size, eps=config.layer_norm_eps) + if not self.use_bottleneck: + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose(perm=(0, 2, 1, 3)) + + def forward( + self, + query_tensor, + key_tensor, + value_tensor, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=None, + ): + + mixed_query_layer = self.query(query_tensor) + mixed_key_layer = self.key(key_tensor) + mixed_value_layer = self.value(value_tensor) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
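+        # Concretely, dropout is applied to the attention probabilities, so a given query may
+        # randomly lose access to some key positions for this forward pass; the surviving
+        # probabilities are rescaled by 1/(1 - p) during training, as with standard inverted dropout.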
+ attention_probs = self.attention_dropout(attention_probs) + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + context_layer = paddle.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose(perm=(0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + # dense layer shape to be checked + projected_context_layer = self.dense(context_layer) + + # Run a linear projection of `hidden_size` then add a residual + # with `hidden_states`. + if not self.use_bottleneck: + projected_context_layer = self.output_dropout(projected_context_layer) + layer_normed_context_layer = self.layer_norm(hidden_states + projected_context_layer) + + outputs = (layer_normed_context_layer, attention_probs) if output_attentions else (layer_normed_context_layer,) + return outputs + + +class MobileBertIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.true_hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class OutputBottleneck(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.true_hidden_size, config.hidden_size) + self.layer_norm = NORM2FN[config.normalization_type](config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, residual_tensor): + layer_outputs = self.dense(hidden_states) + layer_outputs = self.dropout(layer_outputs) + layer_outputs = self.layer_norm(layer_outputs + residual_tensor) + return layer_outputs + + +class MobileBertOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.use_bottleneck = config.use_bottleneck + self.dense = nn.Linear(config.intermediate_size, config.true_hidden_size) + self.layer_norm = NORM2FN[config.normalization_type](config.true_hidden_size) + if not self.use_bottleneck: + self.dropout = nn.Dropout(config.hidden_dropout_prob) + else: + self.bottleneck = OutputBottleneck(config) + + def forward(self, intermediate_states, residual_tensor_1, residual_tensor_2): + layer_output = self.dense(intermediate_states) + if not self.use_bottleneck: + layer_output = self.dropout(layer_output) + layer_output = self.layer_norm(layer_output + residual_tensor_1) + else: + layer_output = self.layer_norm(layer_output + residual_tensor_1) + layer_output = self.bottleneck(layer_output, residual_tensor_2) + return layer_output + + +class BottleneckLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intra_bottleneck_size) + self.layer_norm = NORM2FN[config.normalization_type](config.intra_bottleneck_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + layer_input = self.dense(hidden_states) + layer_input = self.layer_norm(layer_input) + return layer_input + + +class Bottleneck(nn.Layer): + def __init__(self, config): + super().__init__() + self.key_query_shared_bottleneck = config.key_query_shared_bottleneck + self.use_bottleneck_attention = config.use_bottleneck_attention + self.input = BottleneckLayer(config) + if 
self.key_query_shared_bottleneck: + self.attention = BottleneckLayer(config) + + def forward(self, hidden_states): + # This method can return three different tuples of values. These different values make use of bottlenecks, + # which are linear layers used to project the hidden states to a lower-dimensional vector, reducing memory + # usage. These linear layer have weights that are learned during training. + # + # If `config.use_bottleneck_attention`, it will return the result of the bottleneck layer four times for the + # key, query, value, and "layer input" to be used by the attention layer. + # This bottleneck is used to project the hidden. This last layer input will be used as a residual tensor + # in the attention self output, after the attention scores have been computed. + # + # If not `config.use_bottleneck_attention` and `config.key_query_shared_bottleneck`, this will return + # four values, three of which have been passed through a bottleneck: the query and key, passed through the same + # bottleneck, and the residual layer to be applied in the attention self output, through another bottleneck. + # + # Finally, in the last case, the values for the query, key and values are the hidden states without bottleneck, + # and the residual layer will be this value passed through a bottleneck. + + bottlenecked_hidden_states = self.input(hidden_states) + if self.use_bottleneck_attention: + return (bottlenecked_hidden_states,) * 4 + elif self.key_query_shared_bottleneck: + shared_attention_input = self.attention(hidden_states) + return (shared_attention_input, shared_attention_input, hidden_states, bottlenecked_hidden_states) + else: + return (hidden_states, hidden_states, hidden_states, bottlenecked_hidden_states) + + +class FFNOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.true_hidden_size) + self.layer_norm = NORM2FN[config.normalization_type](config.true_hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states, residual_tensor): + layer_outputs = self.dense(hidden_states) + layer_outputs = self.layer_norm(layer_outputs + residual_tensor) + return layer_outputs + + +class FFNLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.intermediate = MobileBertIntermediate(config) + self.output = FFNOutput(config) + + def forward(self, hidden_states): + intermediate_output = self.intermediate(hidden_states) + layer_outputs = self.output(intermediate_output, hidden_states) + return layer_outputs + + +class MobileBertLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.use_bottleneck = config.use_bottleneck + self.num_feedforward_networks = config.num_feedforward_networks + + self.attention = MobileBertAttention(config) + self.intermediate = MobileBertIntermediate(config) + self.output = MobileBertOutput(config) + if self.use_bottleneck: + self.bottleneck = Bottleneck(config) + if config.num_feedforward_networks > 1: + self.ffn = nn.LayerList([FFNLayer(config) for _ in range(config.num_feedforward_networks - 1)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=None, + ): + if self.use_bottleneck: + query_tensor, key_tensor, value_tensor, layer_input = self.bottleneck(hidden_states) + else: + query_tensor, key_tensor, value_tensor, layer_input = [hidden_states] * 4 + + self_attention_outputs = self.attention( + query_tensor, + key_tensor, + value_tensor, + layer_input, + attention_mask, + head_mask, + 
output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + s = (attention_output,) + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + if self.num_feedforward_networks != 1: + for i, ffn_module in enumerate(self.ffn): + attention_output = ffn_module(attention_output) + s += (attention_output,) + + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output, hidden_states) + outputs = ( + (layer_output,) + + outputs + + ( + paddle.to_tensor(1000), + query_tensor, + key_tensor, + value_tensor, + layer_input, + attention_output, + intermediate_output, + ) + + s + ) + return outputs + + +class MobileBertEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.layers = nn.LayerList([MobileBertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=None, + ): + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states, + attention_mask, + head_mask[i], + output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class MobileBertPooler(nn.Layer): + def __init__(self, config): + super().__init__() + self.do_activate = config.classifier_activation + if self.do_activate: + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + if not self.do_activate: + return first_token_tensor + else: + pooled_output = self.dense(first_token_tensor) + pooled_output = paddle.tanh(pooled_output) + return pooled_output + + +class MobileBertPredictionHeadTransform(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.layer_norm = NORM2FN["layer_norm"](config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class MobileBertLMPredictionHead(nn.Layer): + def __init__(self, config): + super().__init__() + self.transform = MobileBertPredictionHeadTransform(config) + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
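+ # `decoder.weight` has shape [embedding_size, vocab_size] and `dense.weight` has shape + # [vocab_size, hidden_size - embedding_size]; forward() concatenates `decoder.weight` with + # `dense.weight.t()` into a single [hidden_size, vocab_size] projection, letting the decoder reuse the smaller embedding weights.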
+ self.dense = nn.Linear(config.vocab_size, config.hidden_size - config.embedding_size, bias_attr=False) + self.decoder = nn.Linear(config.embedding_size, config.vocab_size) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + param_concat = paddle.concat([self.decoder.weight, self.dense.weight.t()], axis=0) + + hidden_states = paddle.matmul(hidden_states, param_concat) + hidden_states += self.decoder.bias + return hidden_states + + +class MobileBertOnlyMLMHead(nn.Layer): + def __init__(self, config): + super().__init__() + self.predictions = MobileBertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class MobileBertPreTrainingHeads(nn.Layer): + def __init__(self, config): + super().__init__() + self.predictions = MobileBertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class MobileBertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained MobileBert models. It provides MobileBert related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = MOBILEBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = MOBILEBERT_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "mobilebert" + config_class = MobileBertConfig + + def _init_weights(self, layer): + # Initialize the weights. + if isinstance(layer, nn.Linear): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, (nn.LayerNorm, NoNorm)): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.ones_like(layer.weight)) + + +@dataclass +class MobileBertForPreTrainingOutput(ModelOutput): + """ + Output type of [`ErnieForPreTraining`]. + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + prediction_logits: paddle.Tensor = None + seq_relationship_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class MobileBertForPreTraining(MobileBertPretrainedModel): + """ + MobileBert Model with pretraining tasks on top. + + Args: + config (:class:`MobileBertConfig`): + An instance of :class:`MobileBertConfig` used to construct MobileBertForPreTraining. + """ + + def __init__(self, config): + super(MobileBertForPreTraining, self).__init__(config) + self.mobilebert = MobileBertModel(config) + self.cls = MobileBertPreTrainingHeads(config) + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels: Optional[Tensor] = None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The MobileBertForPreTraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MobileBertModel`. + token_type_ids (Tensor, optional): + See :class:`MobileBertModel`. + position_ids(Tensor, optional): + See :class:`MobileBertModel`. + head_mask (Tensor, optional): + See :class:`MobileBertModel`. + attention_mask (Tensor, optional): + See :class:`MobileBertModel`. + inputs_embeds (Tensor, optional): + See :class:`MobileBertModel`. + output_attentions (bool, optional): + See :class:`MobileBertModel`. + output_hidden_states (bool, optional): + See :class:`MobileBertModel`. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + With the fields: + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + ..
code-block:: + import paddle + from paddlenlp.transformers import MobileBertForPreTraining, MobileBertTokenizer + tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased') + model = MobileBertForPreTraining.from_pretrained('mobilebert-uncased') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + prediction_logits = outputs[0] + seq_relationship_logits = outputs[1] + """ + with paddle.static.amp.fp16_guard(): + outputs = self.mobilebert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + total_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return MobileBertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@register_base_model +class MobileBertModel(MobileBertPretrainedModel): + """ + The bare MobileBert Model transformer outputting raw hidden-states. + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matters related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `MobileBertModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `MobileBertModel`. + embedding_size (int, optional): + Embedding dimensionality of lookup_table in the embedding layer. Defaults to `128`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layer and pooler layer. Defaults to `512`. + true_hidden_size (int, optional): + Dimensionality of input_tensor in self attention layer. Defaults to `128`. + use_bottleneck_attention (bool, optional): + Using bottleneck to value tensor in self attention layer. Defaults to `False`. + key_query_shared_bottleneck (bool, optional): + Key and query shared bottleneck layer. Defaults to `True`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `24`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `4`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `512`.
+ hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"relu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to 0.02. + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`MobileBertPretrainedModel.init_weights()` for how weights are initialized in `MobileBertModel`. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `1`. + add_pooling_layer (bool, optional): + Adding the pooling Layer after the encoder layer. Defaults to `True`. + classifier_activation (bool, optional): + Using the non-linear activation function in the pooling layer. Defaults to `False`. + + """ + + def __init__(self, config): + super(MobileBertModel, self).__init__(config) + + self.initializer_range = config.initializer_range + self.embeddings = MobileBertEmbeddings(config) + self.encoder = MobileBertEncoder(config) + self.num_hidden_layers = config.num_hidden_layers + self.pooler = MobileBertPooler(config) if config.add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def get_head_mask(self, head_mask, num_hidden_layers, is_attention_chunked=False): + """ + Prepare the head mask if needed. + + Args: + head_mask (:obj:`paddle.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (:obj:`int`): + The number of hidden layers in the model. + is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the attentions scores are computed by chunks or not. + + Returns: + :obj:`paddle.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or + list with :obj:`[None]` for each layer. 
+ """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_hidden_states=None, + output_attentions=None, + return_dict=None, + ): + r""" + The MobileBertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + head_mask (:obj:`paddle.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`. + output_attentions (bool, optional): + Whether to return the output of each self attention layers. + Defaults to `None`. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`) or (`encoder_outputs`, `pooled_output`). 
+ With the fields: + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + - `encoder_outputs` (List(Tensor)): + A list of Tensor containing hidden-states of the model at each hidden layer in the Transformer encoder. + The length of the list is `num_hidden_layers`. + Each Tensor has a data type of float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + import paddle + from paddlenlp.transformers import MobileBertModel, MobileBertTokenizer + tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased') + model = MobileBertModel.from_pretrained('mobilebert-uncased') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = paddle.ones(input_shape, dtype=input_ids.dtype) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
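+ # Convert the [batch_size, 1, 1, seq_length] mask into an additive bias: positions to attend + # stay at 0.0 while masked positions become -10000.0, driving their softmax weights to ~0.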
+ extended_attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class MobileBertForSequenceClassification(MobileBertPretrainedModel): + """ + MobileBert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + mobilebert (:class:`MobileBertModel`): + An instance of MobileBert. + num_classes (int, optional): + The number of classes. Defaults to `2`. + """ + + def __init__(self, config): + super(MobileBertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.mobilebert = MobileBertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The MobileBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MobileBertModel`. + token_type_ids (Tensor, optional): + See :class:`MobileBertModel`. + position_ids(Tensor, optional): + See :class:`MobileBertModel`. + head_mask (Tensor, optional): + See :class:`MobileBertModel`. + attention_mask (Tensor, optional): + See :class:`MobileBertModel`. + inputs_embeds (Tensor, optional): + See :class:`MobileBertModel`. + output_attentions (bool, optional): + See :class:`MobileBertModel`. + output_hidden_states (bool, optional): + See :class:`MobileBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import MobileBertForSequenceClassification, MobileBertTokenizer + tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased') + model = MobileBertForSequenceClassification.from_pretrained('mobilebert-uncased', num_classes=2) + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + print(logits.shape) + # [1, 2] + """ + outputs = self.mobilebert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class MobileBertForQuestionAnswering(MobileBertPretrainedModel): + """ + MobileBert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + mobilebert (:class:`MobileBert`): + An instance of MobileBert. + """ + + def __init__(self, config): + super(MobileBertForQuestionAnswering, self).__init__(config) + self.num_labels = 2 + self.mobilebert = MobileBertModel(config) + self.qa_outputs = nn.Linear(self.config.hidden_size, self.num_labels) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The MobileBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MobileBertModel`. + token_type_ids (Tensor, optional): + See :class:`MobileBertModel`. + position_ids(Tensor, optional): + See :class:`MobileBertModel`. + head_mask (Tensor, optional): + See :class:`MobileBertModel`. + attention_mask (Tensor, optional): + See :class:`MobileBertModel`. + inputs_embeds (Tensor, optional): + See :class:`MobileBertModel`. + output_attentions (bool, optional): + See :class:`MobileBertModel`. 
+ output_hidden_states (bool, optional): + See :class:`MobileBertModel`. + start_positions (Tensor, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the + sequence are not taken into account for computing the loss. + end_positions (Tensor, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the + sequence are not taken into account for computing the loss. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + With the fields: + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + import paddle + from paddlenlp.transformers import MobileBertForQuestionAnswering, MobileBertTokenizer + tokenizer = MobileBertTokenizer.from_pretrained('mobilebert-uncased') + model = MobileBertForQuestionAnswering.from_pretrained('mobilebert-uncased') + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + start_logits = outputs[0] + end_logits = outputs[1] + """ + outputs = self.mobilebert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.qa_outputs(sequence_output) + + logits = paddle.transpose(logits, perm=[2, 0, 1]) + + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split adds a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if end_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/tokenizer.py new file mode 100644 index
000000000..eaa6a03e7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mobilebert/tokenizer.py @@ -0,0 +1,329 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +from paddlenlp.transformers.tokenizer_utils_base import ( + PaddingStrategy, + TensorType, + TruncationStrategy, +) + +from ...utils.log import logger +from .. import BertTokenizer +from ..tokenizer_utils_base import BatchEncoding + +__all__ = ["MobileBertTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mobilebert-uncased": 512} + + +class MobileBertTokenizer(BertTokenizer): + r""" + Construct a MobileBERT tokenizer. + :class:`~paddlenlp.transformers.MobileBertTokenizer is identical to :class:`~paddlenlp.transformers.BertTokenizer` and runs end-to-end + tokenization: punctuation splitting and wordpiece. + Refer to superclass :class:`~~paddlenlp.transformers.BertTokenizer` for usage examples and documentation concerning + parameters. + """ + resource_files_names = {"vocab_file": "vocab.txt"} + pretrained_resource_files_map = { + "vocab_file": { + "mobilebert-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/mobilebert/mobilebert-uncased/vocab.txt" + } + } + pretrained_init_configuration = {"mobilebert-uncased": {"do_lower_case": True}} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def batch_encode( + self, + batch_text_or_text_pairs, + max_length: int = 512, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + stride=0, + is_split_into_words=False, + return_position_ids=False, + return_token_type_ids=True, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_dict=True, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + **kwargs + ): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports batch inputs of sequence or sequence pair. + + Args: + batch_text_or_text_pairs (list): + The element of list can be sequence or sequence pair, and the + sequence is a string or a list of strings depending on whether + it has been pretokenized. If each sequence is provided as a list + of strings (pretokenized), you must set `is_split_into_words` as + `True` to disambiguate with a sequence pair. + max_length (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. 
When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + padding (bool, optional): + If set to `True`, the returned sequences would be padded up to + `max_length` specified length according to padding side + (`self.padding_side`) and padding token id. Defaults to `False`. + truncation_strategy (str, optional): + String selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence + until the input is under `max_length` starting from the longest + one at each token (when there is a pair of input sequences). + - 'only_first': Only truncate the first sequence. + - 'only_second': Only truncate the second sequence. + - 'do_not_truncate': Do not truncate (raise an error if the input + sequence is longer than `max_length`). + Defaults to 'longest_first'. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + + Returns: + dict: + The dict has the following optional items: + - **input_ids** (list[int]): List of token ids to be fed to a model. + - **position_ids** (list[int], optional): List of token position ids to be + fed to a model. Included when `return_position_ids` is `True` + - **token_type_ids** (list[int], optional): List of token type ids to be + fed to a model. Included when `return_token_type_ids` is `True`. + - **attention_mask** (list[int], optional): List of integers valued 0 or 1, + where 0 specifies paddings and should not be attended to by the + model. Included when `return_attention_mask` is `True`. + - **seq_len** (int, optional): The input_ids length. Included when `return_length` + is `True`. + - **overflowing_tokens** (list[int], optional): List of overflowing tokens. + Included when if `max_length` is specified and `return_overflowing_tokens` + is True. + - **num_truncated_tokens** (int, optional): The number of overflowing tokens. + Included when if `max_length` is specified and `return_overflowing_tokens` + is True. + - **special_tokens_mask** (list[int], optional): List of integers valued 0 or 1, + with 0 specifying special added tokens and 1 specifying sequence tokens. + Included when `return_special_tokens_mask` is `True`. + - **offset_mapping** (list[int], optional): list of pair preserving the + index of start and end char in original input for each token. 
+ For a special token, the index pair is `(0, 0)`. Included when + `stride` works. + - **overflow_to_sample** (int, optional): Index of example from which this + feature is generated. Included when `stride` works. + """ + # Backward compatibility for 'max_seq_len' + old_max_seq_len = kwargs.get("max_seq_len", None) + if max_length is None and old_max_seq_len: + if verbose: + logger.warning( + "The `max_seq_len` argument is deprecated and will be removed in a future version, " + "please use `max_length` instead." + ) + max_length = old_max_seq_len + + padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + padding=padding, max_length=max_length, verbose=verbose + ) + + def get_input_ids(text): + if isinstance(text, str): + tokens = self._tokenize(text) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + batch_encode_inputs = [] + for example_id, tokens_or_pair_tokens in enumerate(batch_text_or_text_pairs): + if not isinstance(tokens_or_pair_tokens, (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + elif is_split_into_words and not isinstance(tokens_or_pair_tokens[0], (list, tuple)): + text, text_pair = tokens_or_pair_tokens, None + else: + text, text_pair = tokens_or_pair_tokens + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + if stride > 0 and second_ids is not None: + + max_len_for_pair = ( + max_length - len(first_ids) - self.num_special_tokens_to_add(pair=True) + ) # need -4 A B + + token_offset_mapping = self.get_offset_mapping(text) + token_pair_offset_mapping = self.get_offset_mapping(text_pair) + + while True: + encoded_inputs = {} + + ids = first_ids + mapping = token_offset_mapping + if len(second_ids) <= max_len_for_pair: + pair_ids = second_ids + pair_mapping = token_pair_offset_mapping + else: + pair_ids = second_ids[:max_len_for_pair] + pair_mapping = token_pair_offset_mapping[:max_len_for_pair] + + offset_mapping = self.build_offset_mapping_with_special_tokens(mapping, pair_mapping) + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + if return_length: + encoded_inputs["seq_len"] = len(encoded_inputs["input_ids"]) + + # Check lengths + assert max_length is None or len(encoded_inputs["input_ids"]) <= max_length + + # Padding + needs_to_be_padded = padding and max_length and len(encoded_inputs["input_ids"]) < max_length + + encoded_inputs["offset_mapping"] = offset_mapping + + if needs_to_be_padded: + difference = max_length - len(encoded_inputs["input_ids"]) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [ + 0 + ] * difference + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] +
[self.pad_token_type_id] * difference + ) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = ( + encoded_inputs["special_tokens_mask"] + [1] * difference + ) + encoded_inputs["input_ids"] = ( + encoded_inputs["input_ids"] + [self.pad_token_id] * difference + ) + encoded_inputs["offset_mapping"] = encoded_inputs["offset_mapping"] + [(0, 0)] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len( + encoded_inputs["input_ids"] + ) + if return_token_type_ids: + # 0 for padding token mask + encoded_inputs["token_type_ids"] = [ + self.pad_token_type_id + ] * difference + encoded_inputs["token_type_ids"] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs[ + "special_tokens_mask" + ] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs[ + "input_ids" + ] + encoded_inputs["offset_mapping"] = [(0, 0)] * difference + encoded_inputs["offset_mapping"] + else: + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + if return_position_ids: + encoded_inputs["position_ids"] = list(range(len(encoded_inputs["input_ids"]))) + + encoded_inputs["overflow_to_sample"] = example_id + batch_encode_inputs.append(encoded_inputs) + + if len(second_ids) <= max_len_for_pair: + break + else: + second_ids = second_ids[max_len_for_pair - stride :] + token_pair_offset_mapping = token_pair_offset_mapping[max_len_for_pair - stride :] + + else: + batch_encode_inputs.append( + self.encode( + first_ids, + second_ids, + max_length=max_length, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) + ) + + batch_encode_inputs = {k: [output[k] for output in batch_encode_inputs] for k in batch_encode_inputs[0].keys()} + batch_encode_inputs = self.pad( + batch_encode_inputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + if return_dict: + batch_outputs = BatchEncoding(batch_encode_inputs, tensor_type=return_tensors) + return batch_outputs + else: + batch_outputs_list = [] + for k, v in batch_encode_inputs.items(): + for i in range(len(v)): + if i >= len(batch_outputs_list): + batch_outputs_list.append({k: v[i]}) + else: + batch_outputs_list[i][k] = v[i] + return batch_outputs_list diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_outputs.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_outputs.py new file mode 100644 index 000000000..570052274 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_outputs.py @@ -0,0 +1,1520 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +from collections import OrderedDict +from dataclasses import dataclass, fields +from typing import Any, Optional, Tuple + +import numpy as np +import paddle +from paddle import Tensor +from paddle.distributed.fleet.utils import recompute +from paddle.nn import MultiHeadAttention +from paddle.nn.layer.transformer import _convert_attention_mask + +from .utils import adapt_stale_fwd_patch + + +def tuple_output(outputs: Tuple[Tensor], loss: Optional[Tensor] = None): + """re-construct the outputs with one method which contains the simple logic + + Args: + outputs (Tuple[Tensor]): the source of the outputs + loss (Optional[Tensor], optional): the loss of the model. Defaults to None. + """ + if loss is not None: + outputs = (loss,) + outputs + if len(outputs) == 1: + return outputs[0] + return outputs + + +def convert_encoder_output(encoder_output): + """ + Convert encoder_output from tuple to class:`~paddlenlp.transformers.model_outputs.BaseModelOutput`. + + Args: + encoder_output (tuple or ModelOutput): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. + """ + return BaseModelOutput( + last_hidden_state=encoder_output[0], + hidden_states=encoder_output[1] if len(encoder_output) > 1 else None, + attentions=encoder_output[2] if len(encoder_output) > 2 else None, + ) + + +def layer_init_wrapper(func): + @functools.wraps(func) + def _impl(self, *args, **kwargs): + enable_recompute = kwargs.pop("enable_recompute", False) + func(self, *args, **kwargs) + if paddle.in_dynamic_mode(): + self.enable_recompute = enable_recompute + else: + self.enable_recompute = False + + return _impl + + +@paddle.jit.not_to_static +def _transformer_encoder_layer_fwd(self, src, src_mask=None, cache=None, output_attentions=False): + self.self_attn.need_weights = output_attentions + src_mask = _convert_attention_mask(src_mask, src.dtype) + + residual = src + if self.normalize_before: + src = self.norm1(src) + + attn_outputs = self.self_attn(src, src, src, src_mask, cache) + if isinstance(attn_outputs, tuple): + src = attn_outputs[0] + outputs = attn_outputs[1:] + else: + src = attn_outputs + outputs = None + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + + return src if outputs is None else ((src,) + outputs[::-1]) # hidden_states, cache, attentions + + +@paddle.jit.not_to_static +def _transformer_decoder_layer_fwd( + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + cache=None, + output_attentions=False, +): + residual = tgt + + # self attention + self.self_attn.need_weights = output_attentions + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + + if self.normalize_before: + tgt = self.norm1(tgt) + + self_attn_outputs = 
self.self_attn(tgt, tgt, tgt, tgt_mask, cache[0] if cache else None) + # self_attn_outputs = (tgt, attn_weights, incremental_cache) or only tgt + if isinstance(self_attn_outputs, type(tgt)): + tgt = self_attn_outputs + else: + tgt = self_attn_outputs[0] + if output_attentions: + self_attn_weights = self_attn_outputs[1] + if cache: + incremental_cache = self_attn_outputs[-1] + + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + + # cross attention + if memory is not None: + self.cross_attn.need_weights = output_attentions + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + + if self.normalize_before: + tgt = self.norm2(tgt) + + cross_attn_outputs = self.cross_attn(tgt, memory, memory, memory_mask, cache[1] if cache else None) + if isinstance(cross_attn_outputs, type(tgt)): + tgt = cross_attn_outputs + else: + tgt = cross_attn_outputs[0] + if output_attentions: + cross_attn_weights = cross_attn_outputs[1] + if cache: + static_cache = cross_attn_outputs[-1] + + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + + if not output_attentions and cache is None: + return tgt + else: + outputs = (tgt,) + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights if memory is not None else None) + if cache: + outputs += ((incremental_cache, static_cache if memory is not None else None),) + return outputs + + +@paddle.jit.not_to_static +def _transformer_decoder_fwd( + self, + tgt, + memory=None, + tgt_mask=None, + memory_mask=None, + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, +): + tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) + if memory is not None: + memory_mask = _convert_attention_mask(memory_mask, memory.dtype) + + new_caches = [] if cache else None + all_hidden_states = [tgt] if output_hidden_states else None + all_self_attns = [] if output_attentions else None + all_cross_attns = [] if output_attentions else None + + for i, mod in enumerate(self.layers): + if cache is None: + # if output has no gradient, recompute is unnecessary + memory_stop_gradient = memory is not None and memory.stop_gradient + has_gradient = (not tgt.stop_gradient) or (not memory_stop_gradient) + if self.enable_recompute and has_gradient: + outputs = recompute(mod, tgt, memory, tgt_mask, memory_mask, None, output_attentions) + else: + outputs = mod( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=None, + output_attentions=output_attentions, + ) + else: + outputs = mod( + tgt, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=cache[i] if cache else None, + output_attentions=output_attentions, + ) + if isinstance(outputs, type(tgt)): + tgt = outputs + else: + tgt = outputs[0] + if cache: + new_caches.append(outputs[-1]) + if output_attentions: + all_self_attns.append(outputs[1]) + all_cross_attns.append(outputs[2]) + if output_hidden_states: + all_hidden_states.append(tgt) + + if self.norm is not None: + tgt = self.norm(tgt) + if output_hidden_states: + all_hidden_states[-1] = tgt + + if not return_dict: + if isinstance(outputs, type(tgt)): + return tgt + + temp_list = [ + tgt, + new_caches if cache else None, + all_hidden_states, + all_self_attns, + 
all_cross_attns, + ] + return tuple(v for v in temp_list if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=tgt, + past_key_values=new_caches, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) + + +@paddle.jit.not_to_static +def _transformer_encoder_fwd( + self, src, src_mask=None, cache=None, output_attentions=False, output_hidden_states=False, return_dict=False +): + src_mask = _convert_attention_mask(src_mask, src.dtype) + + output = src + # To get cache from None when use_cache is True, which is compatible with HF + # while HF requires decoder. The implementation here uses cache update in the + # MultiHeadAttention not so efficiently, and maybe optimize it later. + if cache is None and getattr(self, "_use_cache", False): + cache = [tuple(self.layers[0].gen_cache(src))] * len(self.layers) + # To be compatible with `TransformerEncoder.forward`, `_use_cache` defualts + # to True when cache is not None. + new_caches = [] if cache is not None and getattr(self, "_use_cache", True) else None + all_attentions = [] if output_attentions else None + # NOTE: Also includes embeding output which is same as HF. + all_hidden_states = [output] if output_hidden_states else None + for i, mod in enumerate(self.layers): + # if output has no gradient, recompute is unnecessary + has_gradient = not output.stop_gradient + if self.enable_recompute and has_gradient: + # Note: recompute do not support pass as **kwargs yet. + layer_outputs = recompute( + mod, + output, + src_mask, + None + if cache is None + else cache[i] + if isinstance(cache[i], MultiHeadAttention.Cache) + else MultiHeadAttention.Cache(*cache[i]), + output_attentions, + ) + else: + layer_outputs = mod( + output, + src_mask=src_mask, + cache=None + if cache is None + else cache[i] + if isinstance(cache[i], MultiHeadAttention.Cache) + else MultiHeadAttention.Cache(*cache[i]), + output_attentions=output_attentions, + ) + + if isinstance(layer_outputs, tuple): + output = layer_outputs[0] + outputs = layer_outputs[1:] + else: + output = layer_outputs + outputs = None + + if output_hidden_states: + all_hidden_states.append(output) + if output_attentions: + all_attentions.append(outputs[-1]) + if new_caches is not None: + new_caches.append(outputs[0] if isinstance(cache[i], MultiHeadAttention.Cache) else (tuple(outputs[0]))) + + if self.norm is not None: + output = self.norm(output) + + if output_hidden_states: + all_hidden_states[-1] = output + + if not return_dict: + outputs = tuple( + tuple(v) if isinstance(v, list) else v + for v in [ + output, + new_caches, + all_hidden_states, + all_attentions, + ] + if v is not None + ) + if len(outputs) == 1: + return output + else: + return outputs + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=output, + past_key_values=new_caches, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +_transformer_encoder_fwd.__name__ = "forward" +_transformer_encoder_layer_fwd.__name__ = "forward" +# patches of paddle.nn.Transformer to get all hidden_states and attentions +paddle.nn.TransformerEncoderLayer.forward = _transformer_encoder_layer_fwd +paddle.nn.TransformerDecoderLayer.forward = _transformer_decoder_layer_fwd +paddle.nn.TransformerEncoder.forward = _transformer_encoder_fwd +paddle.nn.TransformerDecoder.forward = _transformer_decoder_fwd + +_encoder_init = paddle.nn.TransformerEncoder.__init__ +_decoder_init = paddle.nn.TransformerDecoder.__init__ 
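+# Rebind __init__ so the patched encoder/decoder accept an `enable_recompute` keyword, which +# `layer_init_wrapper` pops from kwargs before delegating to the original constructors saved above.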
+paddle.nn.TransformerEncoder.__init__ = layer_init_wrapper(_encoder_init) +paddle.nn.TransformerDecoder.__init__ = layer_init_wrapper(_decoder_init) + + +def _get_wrap_setattr(cls): + def _wrap_setattr(self, name, value): + value = adapt_stale_fwd_patch(self, name, value) + return super(cls, self).__setattr__(name, value) + + return _wrap_setattr + + +paddle.nn.TransformerEncoderLayer.__setattr__ = functools.wraps(paddle.nn.TransformerEncoderLayer.__setattr__)( + _get_wrap_setattr(paddle.nn.TransformerEncoderLayer) +) +paddle.nn.TransformerEncoder.__setattr__ = functools.wraps(paddle.nn.TransformerEncoder.__setattr__)( + _get_wrap_setattr(paddle.nn.TransformerEncoder) +) +paddle.nn.TransformerDecoder.__setattr__ = functools.wraps(paddle.nn.TransformerDecoder.__setattr__)( + _get_wrap_setattr(paddle.nn.TransformerDecoder) +) + + +def is_tensor(x): + if isinstance(x, paddle.Tensor): + return True + + return isinstance(x, np.ndarray) + + +class ModelOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a + tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular + python dictionary. + + + + You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple + before. + + + """ + + def __post_init__(self): + class_fields = fields(self) + + # note(guosheng): Convert list to tuple automatically, and better to + # check if it is frozen. + # assert not getattr(self, dataclasses._PARAMS).frozen + for f in class_fields: + value = getattr(self, f.name) + if isinstance(value, list): + setattr(self, f.name, tuple(value)) + + # Safety and consistency checks + if not len(class_fields): + raise ValueError(f"{self.__class__.__name__} has no fields.") + if not all(field.default is None for field in class_fields[1:]): + raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not is_tensor(first_field): + if isinstance(first_field, dict): + iterator = first_field.items() + first_field_iterator = True + else: + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + elif first_field is not None: + self[class_fields[0].name] = first_field + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} 
instance.") + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self) -> Tuple[Any]: + """ + Convert self to a tuple containing all the attributes/keys that are not `None`. + """ + # try to fix: https://github.com/PaddlePaddle/PaddleNLP/issues/3355 + # when trying to get the keys of `OrderedDict`, `keys` method return empty values. + # TODO(wj-Mcat): this bug should be fixed in Paddle framework + tuples = () + for field in fields(self): + if getattr(self, field.name, None) is None: + continue + tuples = tuples + (getattr(self, field.name),) + + return tuples + + +@dataclass +class BaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. 
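+# Illustrative sketch (not from the upstream PaddleNLP sources): ``ModelOutput``
+# subclasses such as ``BaseModelOutput`` above behave like both a dict and a tuple.
+# Fields can be read by name, by position, or flattened via ``to_tuple()``, and
+# ``None`` fields are skipped. The tensor below is an arbitrary example value.
+def _example_model_output_access():
+    hidden = paddle.zeros([2, 16, 64])
+    out = BaseModelOutput(last_hidden_state=hidden)
+    assert out["last_hidden_state"] is hidden     # dict-style access
+    assert out[0] is hidden                       # tuple-style (positional) access
+    assert out.last_hidden_state is hidden        # attribute access
+    flat = out.to_tuple()
+    assert len(flat) == 1 and flat[0] is hidden   # None fields are dropped
+    return out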
+ + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. 
+ """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. 
+ """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class TokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class QuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. 
+ start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided): + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Masked language modeling (MLM) loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key, + value states of the self-attention and the cross-attention layers if model is used in encoder-decoder + setting. Only relevant if `config.is_decoder = True`. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
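+# Illustrative sketch (not from the upstream PaddleNLP sources): a minimal greedy
+# decoding loop over a hypothetical causal LM that returns ``CausalLMOutputWithPast``.
+# Feeding ``past_key_values`` from the previous step back into the model means only
+# the newest token has to be re-encoded; ``model`` is assumed to accept the
+# ``use_cache``/``return_dict`` keyword arguments documented above.
+def _example_greedy_decode(model, input_ids, max_new_tokens=8):
+    past = None
+    generated = input_ids
+    for _ in range(max_new_tokens):
+        step_input = generated if past is None else generated[:, -1:]
+        out = model(input_ids=step_input, past_key_values=past, use_cache=True, return_dict=True)
+        past = out.past_key_values                                   # cached key/value states
+        next_token = paddle.argmax(out.logits[:, -1, :], axis=-1, keepdim=True)
+        generated = paddle.concat([generated, next_token], axis=-1)
+    return generated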
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key, + value states of the self-attention and the cross-attention layers if model is used in encoder-decoder + setting. Only relevant if `config.is_decoder = True`. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`paddle.Tensor`): + Sequence of hidden-states at the output of the last layer of the decoder of the model, whose shape is `(batch_size, Sequence_length, hidden_size)`. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed or when `config.use_cache=True`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. 
+ Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`, + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`paddle.Tensor`, optional): + Language modeling loss whose shape is `(1,)`. Returned when `labels` is provided. + logits (`paddle.Tensor`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) whose shape is `(batch_size, sequence_length, config.vocab_size)`). + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed or when `config.use_cache=True`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model whose shape is `(batch_size, sequence_length, hidden_size)`. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed or when `config.output_attentions=True`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + Args: + loss (`paddle.Tensor` ,optional): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + A Tensor of shape `(1,)`, returned when `labels` is provided. + start_logits (`paddle.Tensor`): + Span-start scores (before SoftMax). Tensor of shape `(batch_size, sequence_length)`). 
+ end_logits (`paddle.Tensor`): + Span-end scores (before SoftMax). Tensor of shape `(batch_size, sequence_length)`). + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed. + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + Tensor of shape `(batch_size, sequence_length, hidden_size)`. + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. 
+ Args: + loss (`paddle.Tensor` optional): + Classification (or regression if config.num_labels==1) loss of shape `(1,)`. Returned when `label` is provided). + logits (`paddle.Tensor`): + Classification (or regression if config.num_labels==1) scores (before SoftMax) of shape `(batch_size, config.num_labels)` + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + Returned when `use_cache=True` is passed. + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed. + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor`, optional): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + Tensor of shape `(batch_size, sequence_length, hidden_size)`. + encoder_hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed. + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Returned when `output_attentions=True` is passed. + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + Args: + loss (`paddle.Tensor`, optional): + Classification (or regression if config.num_labels==1) loss whose shape is `(1,)`. + Returned when `labels` is provided. + logits (`paddle.Tensor`): + Classification (or regression if config.num_labels==1) scores (before SoftMax) + whose shape is `(batch_size, num_labels)` + past_key_values (`tuple(tuple(paddle.Tensor))`, optional): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + Returned when `use_cache=True` is passed or when `config.use_cache=True`). + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`). + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, optional): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Returned when `output_attentions=True` is passed or when `config.output_attentions=True`). + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BackboneOutput(ModelOutput): + """ + Base class for outputs of backbones. + + Args: + feature_maps (`tuple(paddle.Tensor)` of shape `(batch_size, num_channels, height, width)`): + Feature maps of the stages. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, num_channels, height, width)`, + depending on the backbone. + + Hidden-states of the model at the output of each stage plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Only applicable if the backbone uses attention. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + feature_maps: Tuple[paddle.Tensor] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndNoAttention(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state after a pooling operation on the spatial dimensions. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ImageClassifierOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also + called feature maps) of the model at the output of each stage. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class DepthEstimatorOutput(ModelOutput): + """ + Base class for outputs of depth estimation models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`paddle.Tensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + predicted_depth: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SemanticSegmenterOutput(ModelOutput): + """ + Base class for outputs of semantic segmentation models. + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`): + Classification scores for each pixel. + + The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is + to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the + original image size as post-processing. You should always check your logits shape and resize as needed. + + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqSpectrogramOutput(ModelOutput): + """ + Base class for sequence-to-sequence spectrogram outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Spectrogram generation loss. + spectrogram (`paddle.Tensor` of shape `(batch_size, sequence_length, num_bins)`): + The predicted spectrogram. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. 
+ decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + spectrogram: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MoEModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + router_logits (`tuple(paddle.Tensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary + loss for Mixture of Experts models. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + router_logits: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MoECausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) with mixture of experts outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + + aux_loss (`paddle.Tensor`, *optional*, returned when `labels` is provided): + aux_loss for the sparse modules. + + router_logits (`tuple(paddle.Tensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary + loss for Mixture of Experts models. + + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + aux_loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + router_logits: Optional[Tuple[paddle.Tensor]] = None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_utils.py new file mode 100644 index 000000000..c15e1687c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/model_utils.py @@ -0,0 +1,2803 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import contextlib +import copy +import gc +import inspect +import json +import os +import re +import sys +import tempfile +import warnings +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union + +import aistudio_sdk +import numpy as np +import paddle +import paddle.nn as nn +import six +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) +from huggingface_hub.utils import EntryNotFoundError +from paddle import Tensor +from paddle.distributed.fleet.meta_parallel.parallel_layers import ( + PipelineLayer, + SharedLayerDesc, +) +from paddle.nn import Embedding, Layer + +# TODO(fangzeyang) Temporary fix and replace by paddle framework downloader later +from paddle.utils.download import is_url as is_remote_url +from tqdm.auto import tqdm + +from paddlenlp.utils.env import ( + CONFIG_NAME, + LEGACY_CONFIG_NAME, + PADDLE_WEIGHTS_INDEX_NAME, + PADDLE_WEIGHTS_NAME, + PYTORCH_WEIGHTS_INDEX_NAME, + PYTORCH_WEIGHTS_NAME, + SAFE_MASTER_WEIGHTS_INDEX_NAME, + SAFE_PEFT_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, +) +from paddlenlp.utils.log import logger + +from ..generation import GenerationConfig, GenerationMixin +from ..utils import device_guard +from ..utils.download import resolve_file_path +from .configuration_utils import PretrainedConfig +from .conversion_utils import ConversionMixin +from .utils import ( # convert_ndarray_dtype, + ContextManagers, + InitTrackerMeta, + adapt_stale_fwd_patch, + cached_file_for_hf_hub, + convert_file_size_to_int, + dtype_byte_size, + fn_args_to_dict, + get_checkpoint_shard_files, + is_paddle_support_lazy_init, + 
is_safetensors_available, + paddlenlp_load, + weight_name_suffix, +) + +__all__ = [ + "PretrainedModel", + "register_base_model", +] + + +def dy2st_nocheck_guard_context(): + try: + context = paddle.framework._no_check_dy2st_diff() + except: + context = contextlib.nullcontext() + return context + + +def unwrap_optimizer(optimizer, optimizer_instances=()): + if optimizer is None: + return None + while hasattr(optimizer, "_inner_opt") and not isinstance(optimizer, optimizer_instances): + optimizer = optimizer._inner_opt + if isinstance(optimizer, optimizer_instances): + return optimizer + return None + + +if is_safetensors_available(): + from safetensors.numpy import save_file as safe_save_file + + from paddlenlp.utils.safetensors import fast_load_file as safe_load_file + + if sys.platform.startswith("win"): + from safetensors import safe_open + else: + from paddlenlp.utils.safetensors import fast_safe_open as safe_open + + +def prune_linear_layer(layer: nn.Linear, index: paddle.Tensor, dim: int = 0) -> nn.Linear: + """ + Prune a linear layer to keep only entries in index. + Used to remove heads. + Args: + layer (`paddle.nn.Linear`): The layer to prune. + index (`paddle.Tensor`): The indices to keep in the layer. + dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. + Returns: + `paddle.nn.Linear`: The pruned layer as a new layer with `stop_gradient=False`. + """ + index = index.to(layer.weight) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.shape) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias_attr=layer.bias is not None) + new_layer.weight.stop_gradient = True + new_layer.weight.copy_(W) + new_layer.weight.stop_gradient = False + if layer.bias is not None: + new_layer.bias.stop_gradient = True + new_layer.bias.copy_(b) + new_layer.bias.stop_gradient = False + return new_layer + + +def find_pruneable_heads_and_indices( + heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] +) -> Tuple[Set[int], paddle.Tensor]: + """ + Finds the heads and their indices taking `already_pruned_heads` into account. + Args: + heads (`List[int]`): List of the indices of heads to prune. + n_heads (`int`): The number of heads in the model. + head_size (`int`): The size of each head. + already_pruned_heads (`Set[int]`): A set of already pruned heads. + Returns: + `Tuple[Set[int], paddle.Tensor]`: A tuple with the remaining heads and their corresponding indices. + """ + mask = paddle.ones([n_heads, head_size]) + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in already_pruned_heads) + mask[head] = 0 + mask = mask.reshape([-1]).eq(1) + index: paddle.Tensor = paddle.arange(len(mask))[mask].cast("int64") + return heads, index + + +def apply_chunking_to_forward( + forward_fn: Callable[..., paddle.Tensor], chunk_size: int, chunk_dim: int, *input_tensors +) -> paddle.Tensor: + """ + This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension + `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. 
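# Worked example of `find_pruneable_heads_and_indices` defined above, assuming it is
# imported from this module: with 3 heads of size 2 and no prior pruning, pruning head 1
# keeps the flattened positions that belong to heads 0 and 2.
heads, index = find_pruneable_heads_and_indices(
    heads=[1], n_heads=3, head_size=2, already_pruned_heads=set()
)
# heads == {1}
# index.tolist() == [0, 1, 4, 5]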
+ If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly + applying `forward_fn` to `input_tensors`. + Args: + forward_fn (`Callable[..., paddle.Tensor]`): + The forward function of the model. + chunk_size (`int`): + The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. + chunk_dim (`int`): + The dimension over which the `input_tensors` should be chunked. + input_tensors (`Tuple[paddle.Tensor]`): + The input tensors of `forward_fn` which will be chunked + Returns: + `paddle.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`. + Examples: + ```python + # rename the usual forward() fn to forward_chunk() + def forward_chunk(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + # implement a chunked forward function + def forward(self, hidden_states): + return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) + ```""" + + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + + if chunk_size > 0: + tensor_shape = input_tensors[0].shape[chunk_dim] + for input_tensor in input_tensors: + if input_tensor.shape[chunk_dim] != tensor_shape: + raise ValueError( + f"All input tenors have to be of the same shape: {tensor_shape}, " + f"found shape {input_tensor.shape[chunk_dim]}" + ) + + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + + num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size + + # chunk input tensor into tuples + input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, axis=chunk_dim) for input_tensor in input_tensors) + # apply forward fn to every tuple + output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) + # concatenate output at same dimension + return paddle.concat(output_chunks, axis=chunk_dim) + + return forward_fn(*input_tensors) + + +def unwrap_model(model, *args, **kwargs): + raw_model = model + while hasattr(raw_model, "_layers") or hasattr(raw_model, "_layer"): + if hasattr(raw_model, "_layers"): + # Caused by issue https://github.com/PaddlePaddle/PaddleNLP/issues/5295 + # TODO: remove this after we fix the issue + if raw_model._layers is None: + break + raw_model = raw_model._layers + else: + if raw_model._layer is None: + break + raw_model = raw_model._layer + + return raw_model + + +def _add_variant(weights_name: str, variant=None) -> str: + if variant is not None and len(variant) > 0: + splits = weights_name.split(".") + splits = splits[:-1] + [variant] + splits[-1:] + weights_name = ".".join(splits) + + return weights_name + + +@contextmanager +def dtype_guard(dtype="float32"): + origin_dtype = paddle.get_default_dtype() + paddle.set_default_dtype(dtype) + try: + yield + finally: + paddle.set_default_dtype(origin_dtype) + + +_init_weights = True + + +@contextmanager +def no_init_weights(_enable=True): + """ + 
Context manager to globally disable weight initialization to speed up loading large models. + + TODO(Patrick): Delete safety argument `_enable=True` at next major version. . + """ + global _init_weights + old_init_weights = _init_weights + if _enable: + _init_weights = False + try: + yield + finally: + _init_weights = old_init_weights + + +def get_parameter_dtype(parameter: nn.Layer) -> paddle.dtype: + """get dtype of parameter which should be sub-class of nn.Layer + + Args: + parameter (nn.Layer): the instance of layer + + Returns: + paddle.dtype: the dtype of tensor + """ + + last_dtype = None + for t in parameter.parameters(): + last_dtype = t.dtype + if t.is_floating_point(): + return t.dtype + + # TODO(wj-Mcat): get dtype of model when it's in DataParallel Mode. + return last_dtype + + +def load_state_dict( + checkpoint_file: Union[str, os.PathLike], tensor_parallel_split_mapping=None, fliter_dict_keys=None, device="cpu" +): + """ + Reads a PaddlePaddle checkpoint file, returning properly formatted errors if they arise. + """ + if tensor_parallel_split_mapping is None: + tensor_parallel_split_mapping = {} + + if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + # Check format of the archive + with safe_open(checkpoint_file, framework="np") as f: + metadata = f.metadata() + if metadata is None: + metadata = {"format", "np"} + + if metadata.get("format", "np") not in ["pd", "np"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) + if metadata.get("format", "np") == "pd": + raise ValueError("Currently unsupport paddle weights file, use numpy instead.") + if metadata.get("format", "np") == "np": + state_dict = {} + with safe_open(checkpoint_file, framework="np") as f: + for key in f.keys(): + if fliter_dict_keys is not None and key not in fliter_dict_keys: + continue + py_safe_slice_ = f.get_slice(key) + if key in tensor_parallel_split_mapping: + weight = tensor_parallel_split_mapping[key](py_safe_slice_) + else: + weight = py_safe_slice_[:] + if device == "expected": + with device_guard(): + weight = paddle.Tensor(weight, zero_copy=True) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) + state_dict[key] = weight + + if device == "cpu": + for k in list(state_dict.keys()): + with device_guard(): + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) + + return state_dict + + state_dict = paddlenlp_load(checkpoint_file, map_location="cpu") + return state_dict + + +def resolve_weight_file_from_hf_hub( + repo_id: str, cache_dir: str, convert_from_torch: bool, subfolder=None, use_safetensors=False +): + """find the suitable weight file name + + Args: + repo_id (str): repo name of huggingface hub + cache_dir (str): cache dir for hf + convert_from_torch (bool): whether support converting pytorch weight file to paddle weight file + subfolder (str, optional) An optional value corresponding to a folder inside the repo. 
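# Sketch of the lazy-slicing pattern the `load_state_dict` helper above relies on: a
# .safetensors file can be opened and individual tensors (or slices of them, e.g. a
# tensor-parallel shard) read without materializing the rest. The file name is
# hypothetical and this uses the standard `safetensors` API rather than the
# fast_safe_open wrapper.
from safetensors import safe_open

with safe_open("model-00001-of-00002.safetensors", framework="np") as f:
    for key in f.keys():
        lazy = f.get_slice(key)   # handle only, nothing is read yet
        tensor = lazy[:]          # materialize the full tensor as a numpy array
        # a partial read such as lazy[:rows] would fetch only that shard of the tensor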
+ """ + is_sharded = False + + if use_safetensors: + file_name_list = [ + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + ] + else: + file_name_list = [ + PYTORCH_WEIGHTS_INDEX_NAME, + PADDLE_WEIGHTS_INDEX_NAME, + PYTORCH_WEIGHTS_NAME, + PADDLE_WEIGHTS_NAME, + SAFE_WEIGHTS_NAME, # (NOTE,lxl): 兼容极端情况 + ] + resolved_file = None + for fn in file_name_list: + resolved_file = cached_file_for_hf_hub( + repo_id, fn, cache_dir, subfolder, _raise_exceptions_for_missing_entries=False + ) + if resolved_file is not None: + if resolved_file.endswith(".json"): + is_sharded = True + break + + if resolved_file is None: + str_name_list = ", ".join(file_name_list) + raise EnvironmentError( + f"{repo_id} does not appear to have a file named {str_name_list}. Checkout " + f"'https://huggingface.co/{repo_id}' for available files." + ) + + return resolved_file, is_sharded + + +def register_base_model(cls): + """ + A decorator for `PretrainedModel` class. It first retrieves the parent class + of the class being decorated, then sets the `base_model_class` attribute + of that parent class to be the class being decorated. In summary, the decorator registers + the decorated class as the base model class in all derived classes under the same architecture. + + Args: + cls (PretrainedModel): The class (inherited from PretrainedModel) to be decorated . + + Returns: + PretrainedModel: The input class `cls` after decorating. + + Example: + .. code-block:: + + from paddlenlp.transformers import BertModel, register_base_model + + BertModel = register_base_model(BertModel) + assert BertModel.base_model_class == BertModel + """ + base_cls = cls.__bases__[0] + assert issubclass( + base_cls, PretrainedModel + ), "`register_base_model` should be used on subclasses of PretrainedModel." + base_cls.base_model_class = cls + return cls + + +class BackboneMixin: + def forward_with_filtered_kwargs(self, *args, **kwargs): + signature = dict(inspect.signature(self.forward).parameters) + filtered_kwargs = {k: v for k, v in kwargs.items() if k in signature} + + return self(*args, **filtered_kwargs) + + +_re_layer_prefix = re.compile(r"\.(\d+)\.") + + +def _partion_for_pipeline_mode(keys): + # the keys should be sort in networks order + # TODO maybe handle tie_weight ? + def layer_prefix(key): + ret = _re_layer_prefix.search(key) + if ret is not None: + return key[0 : ret.end()] + return "" + + keys = list(keys) + start_idx = -1 + prefix_str = None + parttion_map = {} + for k in keys: + prefix = layer_prefix(k) + if prefix != prefix_str: + prefix_str = prefix + start_idx += 1 + parttion_map[k] = start_idx + + # if only one parttion, we don't parttion it + if start_idx < 1: + return {keys[i]: i for i in range(len(keys))} + + return parttion_map + + +def shard_checkpoint( + state_dict: Dict[str, paddle.Tensor], + max_shard_size: Union[int, str] = "10GB", + weights_name: str = PADDLE_WEIGHTS_NAME, + shard_format="naive", +): + """ + Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a + given size. + + The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no + optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the + limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], + [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB]. 
+ + + + If one of the model's weight is bigger that `max_sahrd_size`, it will end up in its own sub-checkpoint which will + have a size greater than `max_shard_size`. + + + + Args: + state_dict (`Dict[str, paddle.Tensor]`): The state dictionary of a model to save. + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit + (like `"5MB"`). + weights_name (`str`, *optional*, defaults to `"model_state.pdparams"`): + The name of the model save file. + shard_format (`str`, *optional*, defaults to `"naive"`): + support naive or pipeline. + """ + assert shard_format in [ + "naive", + "pipeline", + ], f"Invalid shard_format: {shard_format}, it show be `naive` or `pipeline`." + + max_shard_size = convert_file_size_to_int(max_shard_size) + + sharded_state_dicts = [] + current_block = {} + current_block_size = 0 + total_size = 0 + + if shard_format == "naive": + for key, weight in state_dict.items(): + # _C_ops.numel not yet support paddle.int8 + weight_size = np.prod(weight.shape) * dtype_byte_size(weight.dtype) + # If this weight is going to tip up over the maximal size, we split. + if current_block_size + weight_size > max_shard_size: + # fix if the first param is large than max_shard_size + if len(current_block) > 0: + sharded_state_dicts.append(current_block) + current_block = {} + current_block_size = 0 + + current_block[key] = weight + current_block_size += weight_size + total_size += weight_size + + # Add the last block + sharded_state_dicts.append(current_block) + + if shard_format == "pipeline": + parttion_map = _partion_for_pipeline_mode(state_dict.keys()) + partition_num = max(parttion_map.values()) + + for index in range(partition_num + 1): + weight_names = [k for k, v in parttion_map.items() if v == index] + weight_size = sum( + state_dict[key].numel().item() * dtype_byte_size(state_dict[key].dtype) for key in weight_names + ) + + # try to add new block + if current_block_size + weight_size > max_shard_size: + # fix if the first param is large than max_shard_size + if len(current_block) > 0: + sharded_state_dicts.append(current_block) + current_block = {} + current_block_size = 0 + for key in weight_names: + current_block[key] = state_dict[key] + current_block_size += weight_size + total_size += weight_size + + # Add the last block + sharded_state_dicts.append(current_block) + logger.info(f"The average size of partition is around: {total_size//partition_num}") + + # If we only have one shard, we return it + if len(sharded_state_dicts) == 1: + return {weights_name: sharded_state_dicts[0]}, None + + # Otherwise, let's build the index + weight_map = {} + shards = {} + weights_name_suffix = Path(weights_name).suffix + for idx, shard in enumerate(sharded_state_dicts): + # replace `suffix` -> `-00001-of-00002suffix` + shard_file = weights_name.replace( + weights_name_suffix, f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}{weights_name_suffix}" + ) + shards[shard_file] = shard + for key in shard.keys(): + weight_map[key] = shard_file + + # Add the metadata + metadata = {"total_size": int(total_size)} + index = {"metadata": metadata, "weight_map": weight_map} + return shards, index + + +def load_sharded_checkpoint(model, folder, variant=None, strict=True, prefer_safe=False): + """ + This is the same as [`paddle.nn.Layer.set_state_dict`] + but for a sharded checkpoint. 
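# Sketch of the index file written by shard_checkpoint above and consumed by the sharded
# loader below. The parameter names, file names and total size are invented for illustration.
index = {
    "metadata": {"total_size": 437983232},
    "weight_map": {
        "bert.embeddings.word_embeddings.weight": "model_state-00001-of-00002.pdparams",
        "bert.encoder.layers.0.linear1.weight": "model_state-00001-of-00002.pdparams",
        "cls.predictions.decoder_weight": "model_state-00002-of-00002.pdparams",
    },
}
shard_files = sorted(set(index["weight_map"].values()))  # each shard is loaded and freed in turn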
+ + This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being + loaded in the model. + + Args: + model (`paddle.nn.Module`): The model in which to load the checkpoint. + folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint. + variant (`str`): The model variant. + strict (`bool`, *optional`, defaults to `True`): + Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint. + prefer_safe (`bool`, *optional*, defaults to `False`): + If both safetensors and Paddle save files are present in checkpoint and `prefer_safe` is True, the safetensors + files will be loaded. Otherwise, Paddle files are always loaded when possible. + + Returns: + `NamedTuple`: A named tuple with `missing_keys` and `unexpected_keys` fields + - `missing_keys` is a list of str containing the missing keys + - `unexpected_keys` is a list of str containing the unexpected keys + """ + # Load the index + index_file = os.path.join(folder, _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant)) + safe_index_file = os.path.join(folder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)) + + index_present = os.path.isfile(index_file) + safe_index_present = os.path.isfile(safe_index_file) + + if not index_present and not (safe_index_present and is_safetensors_available()): + filenames = ( + (_add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)) + if is_safetensors_available() + else (_add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant),) + ) + raise ValueError(f"Can't find a checkpoint index ({' or '.join(filenames)}) in {folder}.") + + load_safe = False + if safe_index_present: + if prefer_safe: + if is_safetensors_available(): + load_safe = True # load safe due to preference + else: + logger.warning( + f"Cannot load sharded checkpoint at {folder} safely since safetensors is not installed!" + ) + elif not index_present: + load_safe = True + + load_index = safe_index_file if load_safe else index_file + + with open(load_index, "r", encoding="utf-8") as f: + index = json.load(f) + + shard_files = list(set(index["weight_map"].values())) + + # If strict=True, error before loading any of the state dicts. + loaded_keys = index["weight_map"].keys() + model_keys = model.state_dict().keys() + missing_keys = [key for key in model_keys if key not in loaded_keys] + unexpected_keys = [key for key in loaded_keys if key not in model_keys] + if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0): + error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}" + if len(missing_keys) > 0: + str_missing_keys = ",".join([f'"{k}"' for k in missing_keys]) + error_message += f"\nMissing key(s): {str_missing_keys}." + if len(unexpected_keys) > 0: + str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys]) + error_message += f"\nMissing key(s): {str_unexpected_keys}." + raise RuntimeError(error_message) + + loader = safe_load_file if load_safe else partial(paddlenlp_load, map_location="cpu") + + for shard_file in shard_files: + state_dict = loader(os.path.join(folder, shard_file)) + with warnings.catch_warnings(): + warnings.resetwarnings() + warnings.filterwarnings("ignore", message=r".*is not found in the provided dict.*") + model.set_state_dict(state_dict) + + # Make sure memory is fred before we load the next state dict. + del state_dict + gc.collect() + + # Return the same thing as PaddlePaddle set_state_dict function. 
+ return missing_keys, unexpected_keys + + +def faster_set_state_dict(model, state_dict, strict_dtype=True): + # the state_dict will be destroied. + unused_keys = set(state_dict.keys()) + unset_keys = set(model.state_dict().keys()) + with paddle.no_grad(): + for k, v in model.state_dict().items(): + if k in state_dict: + v_new = state_dict.pop(k) + if not isinstance(v_new, paddle.Tensor): + raise ValueError( + f"faster_set_state_dict need state dict with paddle.Tensor, but got {type(v_new)}" + ) + # 2. cast param / Tensor to dtype + # + if v.dtype != v_new.dtype: + if strict_dtype or (not v.is_floating_point() or not v_new.is_floating_point()): + raise ValueError(f"for key: {k}, expect dtype {v.dtype}, but got {v_new.dtype}") + # check shape + if list(v.shape) != list(v_new.shape): + raise ValueError(f"for key: {k}, expect shape {v.shape}, but got {v_new.shape}") + + dst_tensor = v.value().get_tensor() + place = v.place + + if not v_new.place._equals(place): + # clear dst_tensor for save memory + dst_tensor._clear() + # v_new = v_new._copy_to(paddle.CUDAPinnedPlace(), False) + new_t = v_new._copy_to(place, False) + else: + new_t = v_new + + if not strict_dtype and v.dtype != new_t.dtype: + new_t = new_t.astype(v.dtype) + + # 4. share Tensor to origin param / Tensor + src_tensor = new_t.value().get_tensor() + dst_tensor._share_data_with(src_tensor) + unset_keys.remove(k) + unused_keys.remove(k) + + error_msgs = [] + # if len(unset_keys) > 0: + # error_msgs.append(f"Those weight of model is not initialized: {list(unset_keys)}") + if len(unused_keys) > 0: + error_msgs.append(f"Those state dict keys are not using in model: {list(unused_keys)}") + + return error_msgs + + +def _load_state_dict_into_model(model_to_load, state_dict, start_prefix): + # torch will cast dtype in load_state_dict, but paddle strictly check dtype + _convert_state_dict_dtype_and_shape(state_dict, model_to_load) + + error_msgs = [] + + if len(start_prefix) > 0: + for key in list(state_dict.keys()): + if key.startswith(start_prefix): + state_dict[key.replace(start_prefix, "")] = state_dict.pop(key) + + # TODO: add return status to state_dict + with warnings.catch_warnings(record=True) as w: + warnings.resetwarnings() + # paddlenlp hold missing_keys , just ignore not found warnings. 
+ warnings.filterwarnings("ignore", message=r".*is not found in the provided dict.*") + model_to_load.set_state_dict(state_dict) + error_msgs.extend([str(x.message) for x in w]) + + del state_dict + + return error_msgs + + +def _convert_state_dict_dtype_and_shape(state_dict, model_to_load): + # convert the dtype of state dict + def is_0d_or_1d(tensor): + return len(tensor.shape) == 0 or list(tensor.shape) == [1] + + for key, value in model_to_load.state_dict().items(): + if key in list(state_dict.keys()): + if isinstance(state_dict[key], np.ndarray): + raise ValueError( + "convert_state_dict_dtype expected paddle.Tensor not numpy.ndarray, plase convert numpy.ndarray to paddle.Tensor" + ) + # confirm parameter cast is executed on the same device as model + # TODO: cast(FP32 -> FP16) has diff on different devices, need to fix it + if state_dict[key].is_floating_point() and state_dict[key].dtype != value.dtype: + state_dict[key] = paddle.cast(state_dict.pop(key), value.dtype) + # unified 0d and 1d tensor + if is_0d_or_1d(value) and is_0d_or_1d(state_dict[key]): + if list(value.shape) != list(state_dict[key].shape): + state_dict[key] = paddle.reshape(state_dict.pop(key), value.shape) + + +def _load_state_dict_into_meta_model( + model, + state_dict, + loaded_state_dict_keys, # left for now but could be removed, see below + start_prefix, + expected_keys, + dtype=None, + is_safetensors=False, + keep_in_fp32_modules=None, +): + """ + This is somewhat similar to `_load_state_dict_into_model`, but deals with a model that has some or all of its + params on a `meta` device. It replaces the model params with the data from the `state_dict`, while moving the + params back to the normal device, but only for `loaded_state_dict_keys`. + + `start_prefix` is used for models which insert their name into model keys, e.g. `bert` in + `bert.pooler.dense.weight` + + """ + from paddle.common_ops_import import convert_np_dtype_to_dtype_ + + dtype = convert_np_dtype_to_dtype_(dtype) + error_msgs = [] + model_state_dict = model.state_dict() + for param_name, param in state_dict.items(): + # First part of the test is always true as loaded_state_dict_keys always contains state_dict keys. + if param_name not in loaded_state_dict_keys or param_name not in expected_keys: + continue + + if param_name.startswith(start_prefix): + param_name = param_name[len(start_prefix) :] + + if param.place != paddle.framework._current_expected_place(): + param = param._copy_to(paddle.framework._current_expected_place(), False) + + # # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params + # # in int/uint/bool and not cast them. 
+ if dtype is not None and paddle.is_floating_point(param): + if ( + keep_in_fp32_modules is not None + and any(module_to_keep_in_fp32 in param_name for module_to_keep_in_fp32 in keep_in_fp32_modules) + and (dtype == paddle.float16 or dtype == paddle.bfloat16) + ): + param = param.astype(dtype=paddle.float32) + else: + param = param.astype(dtype=dtype) + + if dtype is None: + old_param = model + splits = param_name.split(".") + for split in splits: + old_param = getattr(old_param, split) + if old_param is None: + break + + if old_param is not None: + param = param.astype(dtype=old_param.dtype) + with paddle.no_grad(): + model_state_dict[param_name].get_tensor()._share_data_with(param.value().get_tensor()) + param.value().get_tensor()._clear() + return error_msgs + + +@six.add_metaclass(InitTrackerMeta) +class PretrainedModel(Layer, GenerationMixin, ConversionMixin): + """ + The base class for all pretrained models. It mainly provides common methods + for loading (construction and loading) and saving pretrained models. Loading + and saving also rely on the following class attributes which should be overridden + by derived classes accordingly: + + - **model_config_file** (str): Represents the file name of model configuration + for configuration saving and loading in local file system. The value is + `model_config.json`. + - **resource_files_names** (dict): Name of local file where the model configuration + can be saved and loaded locally. Currently, resources only include the model state, + thus the dict only includes `'model_state'` as key with corresponding + value `'model_state.pdparams'` for model weights saving and loading. + - **pretrained_init_configuration** (dict): Provides the model configurations + of built-in pretrained models (contrasts to models in local file system). + It has pretrained model names as keys (such as `bert-base-uncased`), and + the values are dict preserving corresponding configuration for model initialization. + - **pretrained_resource_files_map** (dict): Provides resource URLs of built-in + pretrained models (contrasts to models in local file system). + It has the same key as resource_files_names (that is "model_state"), + and the corresponding value is a dict with specific model name to model weights URL mapping + (such as "bert-base-uncased" -> + "https://bj.bcebos.com/paddlenlp/models/transformers/bert-base-uncased.pdparams"). + - **base_model_prefix** (str): Represents the attribute associated to the + base model in derived classes of the same architecture adding layers on + top of the base model. Note: A base model class is pretrained model class + decorated by `register_base_model`, such as `BertModel`; A derived model + class is a pretrained model class adding layers on top of the base model, + and it has a base model as attribute, such as `BertForSequenceClassification`. + + Methods common to models for text generation are defined in `GenerationMixin` + and also inherited here. + + Besides, metaclass `InitTrackerMeta` is used to create `PretrainedModel`, + by which subclasses can track arguments for initialization automatically. 
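# Minimal sketch of the class-attribute contract described above, assuming the packaged
# paddlenlp is importable. The "Toy*" names are invented for illustration; a real model
# would define a richer config, weight initialization and pretrained resource maps.
import paddle
from paddlenlp.transformers import PretrainedConfig, PretrainedModel, register_base_model

class ToyConfig(PretrainedConfig):
    model_type = "toy"

    def __init__(self, hidden_size=8, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size

class ToyPretrainedModel(PretrainedModel):
    config_class = ToyConfig
    base_model_prefix = "toy"

@register_base_model
class ToyModel(ToyPretrainedModel):
    def __init__(self, config: ToyConfig):
        super().__init__(config)
        self.linear = paddle.nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, x):
        return self.linear(x)

# After decoration, every derived class under this architecture can reach the base model class:
assert ToyPretrainedModel.base_model_class is ToyModel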
+ """ + + # Deprecated(wj-Mcat): after 2.6.* version + # save the old-school `LEGACY_CONFIG_NAME`, and will be changed to `CONFIG_NAME` after 2.6.* version + model_config_file = LEGACY_CONFIG_NAME + + pretrained_init_configuration = {} + # TODO: more flexible resource handle, namedtuple with fields as: + # resource_name, saved_file, handle_name_for_load(None for used as __init__ + # arguments), handle_name_for_save + resource_files_names = {"model_state": PADDLE_WEIGHTS_NAME} + pretrained_resource_files_map = {} + base_model_prefix = "" + main_input_name = "input_ids" + config_class = None + _keep_in_fp32_modules = None + + # a list of `re` patterns of `state_dict` keys that should be removed from the list of missing + # keys we find (keys inside the model but not in the checkpoint) and avoid unnecessary warnings. + _keys_to_ignore_on_load_missing = None + # a list of `re` patterns of `state_dict` keys that should be removed from the list of + # unexpected keys we find (keys inside the checkpoint but not the model) and avoid unnecessary + # warnings. + _keys_to_ignore_on_load_unexpected = None + # a list of `state_dict` keys to ignore when saving the model (useful for keys that aren't + # trained, but which are either deterministic or tied variables) + _keys_to_ignore_on_save = None + _tied_weights_keys = None + + def __init__(self, *args, **kwargs): + super(PretrainedModel, self).__init__() + + if not self.constructed_from_pretrained_config(): + return + + # extract config from args + config = None + for arg in args: + if isinstance(arg, PretrainedConfig): + config = arg + break + if config is not None: + self.config: PretrainedConfig = config + self.model_config_file = CONFIG_NAME + self.generation_config = GenerationConfig.from_model_config(self.config) if self.can_generate() else None + return + + # extract config from kwargs + if "config" not in kwargs: + raise ValueError( + "PretrainedConfig instance not found in the arguments, you can set it as args or kwargs with config field" + ) + + config = kwargs["config"] + if not isinstance(config, PretrainedConfig): + raise TypeError("config parameter should be the instance of PretrainedConfig") + + self.config: PretrainedConfig = kwargs["config"] + self.generation_config = GenerationConfig.from_model_config(self.config) if self.can_generate() else None + self.model_config_file = CONFIG_NAME + self.warnings_issued = {} + + def _post_init(self, original_init, *args, **kwargs): + """ + It would be hooked after `__init__` to add a dict including arguments of + `__init__` as a attribute named `config` of the pretrained model instance. + """ + if not self.constructed_from_pretrained_config(): + init_dict = fn_args_to_dict(original_init, *((self,) + args), **kwargs) + self.config = init_dict + + # only execute when it's the base method + if ( + original_init.__module__ != "paddlenlp.transformers.model_utils" + and self.__class__.init_weights is PretrainedModel.init_weights + ): + self.init_weights() + + # Note: + # 1. PipelineLayer will create parameters for each layer and + # call `_synchronize_shared_weights()` to synchronize the shared parameters. + # 2. When setting the model `state_dict`, `_synchronize_shared_weights` will be called to + # synchronize the shared parameters. + # However, `self._init_weights` will re-initialize the parameters without + # synchronizing the shared parameters. If the following step does not load a checkpoint, + # the shared parameters will be different. 
+ + if isinstance(self, PipelineLayer): + self._synchronize_shared_weights() + + def _init_weights(self, layer): + """ + Initialize the weights. This method should be overridden by derived class. + """ + pass + + def _initialize_weights(self, layer): + """ + Initialize the weights if they are not already initialized. + """ + if getattr(layer, "_is_initialized", False): + return + self._init_weights(layer) + layer._is_initialized = True + + def init_weights(self): + """ + If needed prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any + initialization logic in `_init_weights`. + """ + # call pure + if _init_weights: + # Initialize weights + self.apply(self._initialize_weights) + + # Tie weights should be skipped when not initializing all weights + # since from_pretrained(...) calls tie weights anyways + + # TODO(wj-Mcat): enable all tie-weights later + # self.tie_weights() + + @classmethod + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + + Args: + dtype (`paddle.dtype`, *optional*): + Override the default `paddle.dtype` and load the model under this dtype. + """ + dtype = kwargs.pop("dtype", None) + + if dtype is None: + if config.dtype is not None: + dtype = config.dtype + else: + dtype = paddle.get_default_dtype() + + with dtype_guard(dtype): + model = cls(config, **kwargs) + + return model + + @classmethod + def from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + + Args: + dtype (`paddle.dtype`, *optional*): + Override the default `paddle.dtype` and load the model under this dtype. + """ + return cls._from_config(config, **kwargs) + + @property + def base_model(self): + """ + PretrainedModel: The body of the same model architecture. It is the base + model itself for base model or the base model attribute for derived + model. + """ + return getattr(self, self.base_model_prefix, self) + + @property + def model_name_list(self): + """ + list: Contains all supported built-in pretrained model names of the + current PretrainedModel class. + """ + # Todo: return all model name + return list(self.pretrained_init_configuration.keys()) + + def can_generate(self) -> bool: + """ + Returns whether this model can generate sequences with `.generate()`. + Returns: + `bool`: Whether this model can generate sequences with `.generate()`. + """ + # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation + if "GenerationMixin" in str(self.prepare_inputs_for_generation): + return False + return True + + def recompute_enable(self): + r""" + Enable Recompute. + All layers with the `enable_recompute` attribute will be set to `True` + """ + + def fn(layer): + if hasattr(layer, "enable_recompute") and (layer.enable_recompute is False or layer.enable_recompute == 0): + layer.enable_recompute = True + + self.apply(fn) + + def recompute_disable(self): + r""" + Disable Recompute. + All layers with the `enable_recompute` attribute will be set to `False` + """ + + def fn(layer): + if hasattr(layer, "enable_recompute") and (layer.enable_recompute is False or layer.enable_recompute == 0): + layer.enable_recompute = True + + self.apply(fn) + + def get_memory_footprint(self, return_buffers=True): + r""" + Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. 
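# Quick check of the parameter byte counting this method performs (numel x element size),
# on a tiny layer and assuming the default float32 parameter dtype (illustrative only).
import paddle

layer = paddle.nn.Linear(4, 3)  # 4*3 weight entries + 3 bias entries
n_bytes = sum(p.numel().item() * p.element_size() for p in layer.parameters())
assert n_bytes == (4 * 3 + 3) * 4  # 15 float32 parameters -> 60 bytes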
+ Useful to benchmark the memory footprint of the current model and design some tests. + + Arguments: + return_buffers (`bool`, *optional*, defaults to `True`): + Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers + are tensors that do not require gradients and not registered as parameters + """ + mem = sum([param.numel().item() * param.element_size() for param in self.parameters()]) + if return_buffers: + mem_bufs = sum([buf.numel().item() * buf.element_size() for buf in self.buffers()]) + mem = mem + mem_bufs + return mem + + def get_input_embeddings(self) -> nn.Embedding: + """get input embedding of model + + Returns: + nn.Embedding: embedding of model + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.get_input_embeddings() + + raise NotImplementedError( + f"model of {type(base_model)} has not implemented the `get_input_embeddings`" + " or `set_input_embeddings` method" + ) + + def set_input_embeddings(self, value: Embedding): + """set new input embedding for model + + Args: + value (Embedding): the new embedding of model + + Raises: + NotImplementedError: Model has not implement `set_input_embeddings` method + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.set_input_embeddings(value) + raise NotImplementedError( + f"model of {type(base_model)} has not implemented the `get_input_embeddings`" + " or `set_input_embeddings` method" + ) + + def get_output_embeddings(self) -> Optional[Embedding]: + """To be overwrited for models with output embeddings + + Returns: + Optional[Embedding]: the otuput embedding of model + """ + return None + + def tie_weights(self): + """ + Tie the weights between the input embeddings and the output embeddings. + """ + if self.config.tie_word_embeddings: + output_embeddings = self.get_output_embeddings() + input_embeddings = self.get_input_embeddings() + if output_embeddings is not None and input_embeddings is not None: + if input_embeddings.weight.shape != output_embeddings.weight.shape: + logger.warning( + f"The shape of input embeddings is {input_embeddings.weight.shape} and the shape of output embeddings is {output_embeddings.weight.shape}. 
" + "This is only expected if you are calling the `resize_token_embeddings` method" + ) + output_embeddings.weight = input_embeddings.weight + if getattr(output_embeddings, "bias", None) is not None: + # need to pad + if output_embeddings.weight.shape[0] > output_embeddings.bias.shape[0]: + old_bias = output_embeddings.bias + pad_length = output_embeddings.weight.shape[0] - old_bias.shape[0] + output_embeddings.bias = output_embeddings.create_parameter( + shape=[output_embeddings.weight.shape[0]], + attr=output_embeddings._bias_attr, + dtype=output_embeddings._dtype, + is_bias=True, + ) + new_bias = paddle.concat( + [old_bias, paddle.zeros([pad_length], dtype=output_embeddings.bias.dtype)] + ) + output_embeddings.bias.set_value(new_bias) + # need to trim + elif output_embeddings.weight.shape[0] < output_embeddings.bias.shape[0]: + new_bias = output_embeddings.bias[: output_embeddings.weight.shape[0]] + output_embeddings.bias = output_embeddings.create_parameter( + shape=[output_embeddings.weight.shape[0]], + attr=output_embeddings._bias_attr, + dtype=output_embeddings._dtype, + is_bias=True, + ) + output_embeddings.bias.set_value(new_bias) + + def resize_position_embeddings(self, new_num_position_embeddings: int): + """resize position embedding, this method should be overrited overwrited by downstream models + + Args: + new_num_position_embeddings (int): the new position size + + Raises: + NotImplementedError: when called and not be implemented + """ + raise NotImplementedError( + f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `{self.__class__.__module__}.py`" + ) + + @classmethod + def constructed_from_pretrained_config(cls, init_func=None) -> bool: + """check if the model is constructed from `PretrainedConfig` + Returns: + bool: if the model is constructed from `PretrainedConfig` + """ + return cls.config_class is not None and issubclass(cls.config_class, PretrainedConfig) + + def save_model_config(self, save_dir: str): + """ + Deprecated, please use `.config.save_pretrained()` instead. + Saves model configuration to a file named "config.json" under `save_dir`. + + Args: + save_dir (str): Directory to save model_config file into. + """ + logger.warning("The `save_model_config` is deprecated! Please use `.config.save_pretrained()` instead.") + self.config.save_pretrained(save_dir) + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this model to a new HuggingFace Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + private (bool, optional): Whether the model/tokenizer is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. 
+ If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. + """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + def save_to_aistudio( + self, + repo_id, + private=True, + license="Apache License 2.0", + exist_ok=True, + safe_serialization=True, + subfolder=None, + merge_tensor_parallel=False, + **kwargs + ): + """ + Uploads all elements of this model to a new AiStudio Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + token (str): Your token for the Hub. + private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True. + license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0". + exist_ok (bool, optional): Whether to override existing repository. Defaults to: True. + safe_serialization (bool, optional): Whether to save the model in safe serialization way. Defaults to: True. + subfolder (str, optional): Push to a subfolder of the repo instead of the root + merge_tensor_parallel (bool): Whether to merge the tensor parallel weights. Defaults to False. + """ + + res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs) + if "error_code" in res: + if res["error_code"] == 10003 and exist_ok: + logger.info( + f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False" + ) + else: + logger.error( + f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"Successfully created repo {repo_id}") + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained( + save_dir, + shard_format="pipeline", + safe_serialization=(is_safetensors_available() and safe_serialization), + max_shard_size="5GB", + merge_tensor_parallel=merge_tensor_parallel, + ) + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. 
This might take a while") + for filename in os.listdir(save_dir): + res = aistudio_sdk.hub.upload( + repo_id=repo_id, path_or_fileobj=os.path.join(save_dir, filename), path_in_repo=filename, **kwargs + ) + if "error_code" in res: + logger.error( + f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"{filename}: {res['message']}") + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: + """ + Resizes input token embeddings matrix of the model according to new_num_tokens. + + Args: + new_num_tokens (Optional[int]): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or None, just + returns a pointer to the input tokens embedding module of the model without doing anything. + + Returns: + paddle.nn.Embedding: The input tokens Embeddings Module of the model. + """ + old_embeddings: nn.Embedding = self.get_input_embeddings() + if not new_num_tokens or new_num_tokens == old_embeddings.weight.shape[0]: + return old_embeddings + + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.set_input_embeddings(new_embeddings) + + # 2. Update vocab_size + self.base_model.config["vocab_size"] = new_num_tokens + self.vocab_size = new_num_tokens + + # update init_config + self._update_init_config(self.init_config, "vocab_size", new_num_tokens) + + # Tie the weights between the input embeddings and the output embeddings if needed. + self.tie_weights() + + return new_embeddings + + def _update_init_config(self, init_config: dict, key: str, value: Any): + """update init_config by pair + + Args: + init_config (dict): the init_config instance + key (str): the key field + value (Any): the new value of instance + """ + if key in init_config: + init_config[key] = value + return + + for arg in init_config.get("init_args", []): + if not isinstance(arg, PretrainedModel): + continue + self._update_init_config(arg.init_config, key, value) + + def _get_resized_embeddings( + self, old_embeddings: nn.Embedding, new_num_tokens: Optional[int] = None + ) -> nn.Embedding: + """ + Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly + initialized vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_embeddings (nn.Embedding): + Old embeddings to be resized. + new_num_tokens (Optional[int]): + New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. + + Returns: + paddle.nn.Embedding: The resized Embedding Module or the old Embedding Module if new_num_tokens is None. + """ + if new_num_tokens is None: + return old_embeddings + + old_num_tokens, old_embedding_dim = old_embeddings.weight.shape + if old_num_tokens == new_num_tokens: + return old_embeddings + + if not isinstance(old_embeddings, nn.Embedding): + raise TypeError( + f"Old embeddings are of type {type(old_embeddings)}, which is not an instance of {nn.Embedding}. You" + " should either use a different resize function or make sure that old_embeddings are an instance of" + f" {nn.Embedding}." 
+ ) + + # Build new embeddings + new_embeddings = nn.Embedding( + new_num_tokens, + old_embedding_dim, + padding_idx=old_embeddings._padding_idx, + sparse=old_embeddings._sparse, + ) + + # make sure that new_embeddings's dtype is same as the old embeddings' dtype + if new_embeddings.weight.dtype != old_embeddings.weight.dtype: + new_embeddings.to(dtype=old_embeddings.weight.dtype) + + # numbers of tokens to copy + n = min(old_num_tokens, new_num_tokens) + with paddle.no_grad(): + new_embeddings.weight[:n, :] = old_embeddings.weight[:n, :] + + return new_embeddings + + def __setattr__(self, name, value): + value = adapt_stale_fwd_patch(self, name, value) + return super(PretrainedModel, self).__setattr__(name, value) + + @classmethod + def _resolve_model_file_path( + cls: Type[PretrainedModel], + pretrained_model_name_or_path: str, + from_hf_hub: bool = False, + from_aistudio: bool = False, + cache_dir: str | None = None, + subfolder: Optional[str] = "", + config: PretrainedConfig = None, + convert_from_torch: bool = False, + use_safetensors: bool | None = None, + variant=None, + ) -> str: + """resolve model target file path from `` and `cache_dir` + + 1. when it is file path: + return the weight file + + 2. when it is model-name: + 2.1 check default `MODEL_HOME` + `model-mame` + model_state.pdparams + 2.2 get the url from `pretrained_resource_files_map`, and set it to `pretrained_model_name_or_path` + + 3. when it is local dir: + check whether the file exist + + Args: + cls (Type[PretrainedModel]): the inherited PretrainedModel class + pretrained_model_name_or_path (str): the model-name/url/local_dir/local_dir + cache_dir (Optional[str], optional): cache_dir is used when name_or_path is model-name/url. Defaults to None. + convert_from_torch (bool, optional): whether support convert pytorch model to paddle model + + Returns: + str: the model weight file path + """ + is_sharded = False + sharded_metadata = None + + if pretrained_model_name_or_path is not None: + # the following code use a lot of os.path.join, hence setting subfolder to empty str if None + if subfolder is None: + subfolder = "" + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + + def get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, variant): + return os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant)) + + # pretrained_model_name_or_path is file + if os.path.isfile(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + is_local = True + # pretrained_model_name_or_path is dir + elif is_local: + if use_safetensors is not False and os.path.isfile( + get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_INDEX_NAME, variant) + ): + # Load from a sharded safetensors checkpoint + archive_file = get_file_path( + pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_INDEX_NAME, variant + ) + is_sharded = True + elif use_safetensors is not False and os.path.isfile( + get_file_path( + pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_INDEX_NAME, weight_name_suffix() + ) + ): + # Load from a sharded safetensors checkpoint + archive_file = get_file_path( + pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_INDEX_NAME, weight_name_suffix() + ) + is_sharded = True + elif use_safetensors is not False and os.path.isfile( + get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, variant) + ): + # Load from a safetensors checkpoint 
+ archive_file = get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, variant) + elif use_safetensors is not False and os.path.isfile( + get_file_path(pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, weight_name_suffix()) + ): + # Load from a safetensors checkpoint + archive_file = get_file_path( + pretrained_model_name_or_path, subfolder, SAFE_WEIGHTS_NAME, weight_name_suffix() + ) + elif os.path.isfile( + get_file_path(pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_INDEX_NAME, variant) + ): + # Load from a sharded PaddlePaddle checkpoint + archive_file = get_file_path( + pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_INDEX_NAME, variant + ) + is_sharded = True + elif os.path.isfile( + get_file_path( + pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_INDEX_NAME, weight_name_suffix() + ) + ): + # Load from a sharded PaddlePaddle checkpoint for hybrid parallel model + archive_file = get_file_path( + pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_INDEX_NAME, weight_name_suffix() + ) + is_sharded = True + elif os.path.isfile( + get_file_path(pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_NAME, variant) + ): + # Load from a PaddlePaddle checkpoint + archive_file = get_file_path( + pretrained_model_name_or_path, subfolder, PADDLE_WEIGHTS_NAME, variant + ) + elif os.path.isfile( + get_file_path( + pretrained_model_name_or_path, + subfolder, + PADDLE_WEIGHTS_NAME, + weight_name_suffix(), + ) + ): + # Load from a PaddlePaddle checkpoint for hybrid parallel model + archive_file = get_file_path( + pretrained_model_name_or_path, + subfolder, + PADDLE_WEIGHTS_NAME, + weight_name_suffix(), + ) + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(PYTORCH_WEIGHTS_INDEX_NAME, variant) + ) + ): + if from_hf_hub or convert_from_torch: + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(PYTORCH_WEIGHTS_INDEX_NAME, variant) + ) + else: + raise ValueError( + f"Found {_add_variant(PYTORCH_WEIGHTS_INDEX_NAME, variant)} in directory" + f" {pretrained_model_name_or_path}. Please set convert_from_torch=True in from_pretrained. eg, Model.from_pretrained(model_name, convert_from_torch=True) " + ) + elif os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(PYTORCH_WEIGHTS_NAME, variant)) + ): + if from_hf_hub or convert_from_torch: + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(PYTORCH_WEIGHTS_NAME, variant) + ) + else: + raise ValueError( + f"Found {_add_variant(PYTORCH_WEIGHTS_NAME, variant)} in directory" + f" {pretrained_model_name_or_path}. Please set convert_from_torch=True in from_pretrained. eg, Model.from_pretrained(model_name, convert_from_torch=True) " + ) + else: + raise EnvironmentError( + f"Error no file named {_add_variant(PADDLE_WEIGHTS_NAME, variant)}, found in directory" + f" {pretrained_model_name_or_path}." 
+ ) + elif is_remote_url(pretrained_model_name_or_path): + resolved_archive_file = resolve_file_path( + pretrained_model_name_or_path, + pretrained_model_name_or_path, + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + + elif pretrained_model_name_or_path in cls.pretrained_init_configuration: + # fetch the weight url from the `pretrained_resource_files_map` + resource_file_url = cls.pretrained_resource_files_map["model_state"][pretrained_model_name_or_path] + resolved_archive_file = resolve_file_path( + pretrained_model_name_or_path, + [resource_file_url], + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + else: + if use_safetensors is True: + filenames = [ + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + _add_variant(SAFE_WEIGHTS_NAME, variant), + ] + elif use_safetensors is None: + filenames = [ + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), + _add_variant(SAFE_WEIGHTS_NAME, variant), + _add_variant(PADDLE_WEIGHTS_NAME, variant), + _add_variant(PYTORCH_WEIGHTS_INDEX_NAME, variant), + _add_variant(PYTORCH_WEIGHTS_NAME, variant), + ] + else: + filenames = [ + _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant), + _add_variant(PADDLE_WEIGHTS_NAME, variant), + _add_variant(PYTORCH_WEIGHTS_INDEX_NAME, variant), + _add_variant(PYTORCH_WEIGHTS_NAME, variant), + ] + resolved_archive_file = resolve_file_path( + pretrained_model_name_or_path, + filenames, + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + if resolved_archive_file is None: + raise EnvironmentError( + f"Error no files {filenames} found in repo {pretrained_model_name_or_path}." + ) + elif "pytorch_model.bin" in str(resolved_archive_file): + if not from_hf_hub and not convert_from_torch: + raise ValueError( + f"Download pytorch wight in " + f" {resolved_archive_file}. Please set convert_from_torch=True in from_pretrained. eg, Model.from_pretrained(model_name, convert_from_torch=True) " + ) + + if is_local: + logger.info(f"Loading weights file {archive_file}") + resolved_archive_file = archive_file + else: + logger.info(f"Loading weights file from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + resolved_sharded_files = None + if str(resolved_archive_file).endswith(".json"): + is_sharded = True + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. 
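Editor's note: stripped of variants, rank suffixes, and the torch-conversion branches, the local-directory resolution above is a fixed priority walk: sharded safetensors index first, then a single safetensors file, then the Paddle equivalents. A simplified sketch of that order; the filenames are the usual defaults and stand in for the module's `SAFE_WEIGHTS_*` / `PADDLE_WEIGHTS_*` constants:

```python
import os

# assumed default filenames, not the library's constants
CANDIDATES = [
    "model.safetensors.index.json",     # sharded safetensors checkpoint
    "model.safetensors",                # single-file safetensors checkpoint
    "model_state.pdparams.index.json",  # sharded Paddle checkpoint
    "model_state.pdparams",             # single-file Paddle checkpoint
]

def pick_weight_file(model_dir):
    """Return the first candidate that exists, mirroring the priority above."""
    for name in CANDIDATES:
        path = os.path.join(model_dir, name)
        if os.path.isfile(path):
            return path
    raise FileNotFoundError(f"no known weight file in {model_dir}")
```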
+ resolved_sharded_files, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + cache_dir=cache_dir, + subfolder=subfolder, + ) + + return resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded + + @classmethod + def _load_pretrained_model( + cls, + model: PretrainedModel, + state_dict: Dict[str, Tensor], + loaded_keys: List[str], + resolved_archive_file: Union[str, List] = [], + pretrained_model_name_or_path=None, + config=None, + ignore_mismatched_sizes=False, + low_cpu_mem_usage=False, + dtype=None, + keep_in_fp32_modules=None, + quantization_linear_list=None, + ) -> Tuple[List[str]]: + """load the state_dict into model, and do the following things: + + * check the + + Args: + model (PretrainedModel): the pretrained model instance + state_dict (Dict[str, Tensor]): the model state dict data + loaded_keys (List[str]): + ignore_mismatched_sizes (bool, optional): whether ignore error when tensor size mismatched. Defaults to False. + dtype (_type_, optional): the dtype of model state dict. Defaults to None. + + Returns: + Tuple[List[str]]: _description_ + """ + is_safetensors = False + + model_state_dict = model.state_dict() + + expected_keys = list(model_state_dict.keys()) + prefix = model.base_model_prefix + + if len(prefix) > 0: + has_prefix_module = any(s.startswith(prefix) for s in loaded_keys) + expects_prefix_module = any(s.startswith(prefix) for s in expected_keys) + else: + has_prefix_module = False + expects_prefix_module = False + + # key re-naming operations are never done on the keys + # that are loaded, but always on the keys of the newly initialized model + remove_prefix_from_model = not has_prefix_module and expects_prefix_module + add_prefix_to_model = has_prefix_module and not expects_prefix_module + + if remove_prefix_from_model: + _prefix = f"{prefix}." 
+ expected_keys_not_prefixed = [s for s in expected_keys if not s.startswith(_prefix)] + expected_keys = [s[len(_prefix) :] if s.startswith(_prefix) else s for s in expected_keys] + if quantization_linear_list is not None: + quantization_linear_list = [ + s[len(_prefix) :] if s.startswith(_prefix) else s for s in quantization_linear_list + ] + elif add_prefix_to_model: + expected_keys = [".".join([prefix, s]) for s in expected_keys] + if quantization_linear_list is not None: + quantization_linear_list = [".".join([prefix, s]) for s in quantization_linear_list] + + # Weight quantization if not yet quantized & update loaded_keys + if hasattr(config, "quantization_config") and config.quantization_config.is_weight_quantize(): + try: + from ..quantization.quantization_utils import ( + convert_to_quantize_state_dict, + update_loaded_state_dict_keys, + ) + except ImportError: + raise ImportError("Quantization features require `paddlepaddle >= 2.5.2`") + if state_dict is not None: + state_dict = convert_to_quantize_state_dict( + state_dict, + quantization_linear_list, + config.quantization_config, + dtype, + ) + loaded_keys = [k for k in state_dict.keys()] + else: + loaded_keys = update_loaded_state_dict_keys( + loaded_keys, quantization_linear_list, config.quantization_config + ) + if keep_in_fp32_modules is None: + keep_in_fp32_modules = ( + ["quant_scale"] if config.quantization_config.weight_quantize_algo in ["nf4", "fp4"] else None + ) + else: + keep_in_fp32_modules = ( + keep_in_fp32_modules + ["quant_scale"] + if config.quantization_config.weight_quantize_algo in ["nf4", "fp4"] + else keep_in_fp32_modules + ) + + missing_keys = list(set(expected_keys) - set(loaded_keys)) + unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + + # Some models may have keys that are not in the state by design, removing them before needlessly warning + # the user. + if cls._keys_to_ignore_on_load_missing is not None: + for pat in cls._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + # Set some modules to fp32 if any + if keep_in_fp32_modules is not None: + for name, param in model.named_parameters(): + if any(module_to_keep_in_fp32 in name for module_to_keep_in_fp32 in keep_in_fp32_modules): + if param.dtype != paddle.float32: + param = param.to(dtype=paddle.float32) + + # Make sure we are able to load base models as well as derived models (with heads) + start_prefix = "" + model_to_load = model + if len(cls.base_model_prefix) > 0 and not hasattr(model, cls.base_model_prefix) and has_prefix_module: + start_prefix = cls.base_model_prefix + "." + if len(cls.base_model_prefix) > 0 and hasattr(model, cls.base_model_prefix) and not has_prefix_module: + model_to_load = getattr(model, cls.base_model_prefix) + base_model_expected_keys = list(model_to_load.state_dict().keys()) + if any(key in expected_keys_not_prefixed and key not in base_model_expected_keys for key in loaded_keys): + raise ValueError( + "The state dictionary of the model you are trying to load is corrupted. Are you sure it was " + "properly saved?" 
+ ) + + def _find_mismatched_keys( + state_dict, + model_state_dict, + loaded_keys, + add_prefix_to_model, + remove_prefix_from_model, + ignore_mismatched_sizes, + ): + mismatched_keys = [] + if ignore_mismatched_sizes: + for checkpoint_key in loaded_keys: + # If the checkpoint is sharded, we may not have the key here. + if checkpoint_key not in state_dict: + continue + model_key = checkpoint_key + if remove_prefix_from_model: + # The model key starts with `prefix` but `checkpoint_key` doesn't so we add it. + model_key = f"{prefix}.{checkpoint_key}" + elif add_prefix_to_model: + # The model key doesn't start with `prefix` but `checkpoint_key` does so we remove it. + model_key = ".".join(checkpoint_key.split(".")[1:]) + + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) + ) + del state_dict[checkpoint_key] + return mismatched_keys + + def _fuse_or_split_keys( + state_dict, config, loaded_keys, pre_tensor_parallel_split=False, resume_state_dict=None + ): + if resume_state_dict is not None: + state_dict.update(resume_state_dict) + + before_fuse_keys = list(state_dict.keys()) + if pre_tensor_parallel_split: + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys, ignore_error=True) + else: + tp_actions = None + state_dict, resume_state_dict = cls.convert_fuse_and_split(config, state_dict, tp_actions) + after_fuse_keys = list(state_dict.keys()) + + fused_keys = list(set(before_fuse_keys) - set(after_fuse_keys)) + new_keys = list(set(after_fuse_keys) - set(before_fuse_keys)) + + return state_dict, resume_state_dict, fused_keys, new_keys + + if state_dict is not None: + # have loaded all state_dict, no resume state_dict + state_dict, _, fused_keys, new_keys = _fuse_or_split_keys( + state_dict, + config, + loaded_keys, + pre_tensor_parallel_split=True if config is not None and config.tensor_parallel_degree > 1 else False, + ) + missing_keys = list(set(missing_keys) - set(new_keys)) + unexpected_keys = list(set(unexpected_keys) - set(fused_keys)) + + mismatched_keys = _find_mismatched_keys( + state_dict, + model_state_dict, + loaded_keys, + add_prefix_to_model, + remove_prefix_from_model, + ignore_mismatched_sizes, + ) + + if hasattr(config, "quantization_config") and config.quantization_config.is_weight_quantize(): + error_msgs = _load_state_dict_into_meta_model( + model_to_load, + state_dict, + loaded_keys, + start_prefix, + expected_keys, + dtype=dtype, + is_safetensors=is_safetensors, + keep_in_fp32_modules=keep_in_fp32_modules, + ) + else: + error_msgs = _load_state_dict_into_model(model_to_load, state_dict, start_prefix) + else: + # Sharded checkpoint or whole but low_cpu_mem_usage==True + + # This should always be a list but, just to be sure. + if not isinstance(resolved_archive_file, list): + resolved_archive_file = [resolved_archive_file] + + error_msgs = [] + mismatched_keys = [] + resume_state_dict = {} + if len(resolved_archive_file) > 1: + resolved_archive_file = tqdm(resolved_archive_file, desc="Loading checkpoint shards") + + for shard_file in resolved_archive_file: + pre_tensor_parallel_split = False + if ( + shard_file.endswith(".safetensors") + and config.tensor_parallel_degree > 1 + and "tp" not in os.path.split(shard_file)[-1] + ): + pre_tensor_parallel_split = True + assert loaded_keys is not None, "loaded_keys is not None." 
+ tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys, ignore_error=True) + # Here we use expected_keys to optimize weights loading for pipeline model. Only works for safetensors + filter_dict_keys = set(expected_keys) + fuse_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=True) + split_actions, _ = cls.get_fuse_or_split_param_convert_actions(config, loaded_keys, is_fuse=False) + for k in list(fuse_actions.keys()): + need_add_except_key = k[-1] in expected_keys + if need_add_except_key: + filter_dict_keys |= set(k[:-1]) + # remove pre_tensor_parallel_split function from tp_actions + if pre_tensor_parallel_split: + for item in k[:-1]: + if item in tp_actions: + tp_actions.pop(item, None) + + for k in list(split_actions.keys()): + need_add_except_key = False + for item in k[:-1]: + if item in expected_keys: + need_add_except_key = True + break + if need_add_except_key: + filter_dict_keys.add(k[-1]) + # remove pre_tensor_parallel_split function from tp_actions + if pre_tensor_parallel_split: + if k[-1] in tp_actions: + fuse_actions.pop(k[-1], None) + + if config.quantization_config.is_weight_quantize(): + filter_dict_keys = None + + state_dict = load_state_dict( + shard_file, tp_actions if pre_tensor_parallel_split else None, filter_dict_keys + ) + + # convert for fusing or splitting weights + state_dict, resume_state_dict, fused_keys, new_keys = _fuse_or_split_keys( + state_dict, + config, + loaded_keys, + pre_tensor_parallel_split=pre_tensor_parallel_split, + resume_state_dict=resume_state_dict, + ) + missing_keys = list(set(missing_keys) - set(new_keys)) + unexpected_keys = list(set(unexpected_keys) - set(fused_keys)) + + if config.quantization_config.is_weight_quantize(): + state_dict = convert_to_quantize_state_dict( + state_dict, + quantization_linear_list, + config.quantization_config, + dtype, + ) + + # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not + # matching the weights in the model. + mismatched_keys += _find_mismatched_keys( + state_dict, + model_state_dict, + loaded_keys, + add_prefix_to_model, + remove_prefix_from_model, + ignore_mismatched_sizes, + ) + + if config.tensor_parallel_degree > 1 and ".tp" not in shard_file and not pre_tensor_parallel_split: + logger.info("Converting state_dict to Tensor Parallel Format") + # ignore error for multi shard, since only parts of data + state_dict = cls.convert_tensor_parallel( + None, config, state_dict=state_dict, ignore_error=len(resolved_archive_file) > 1 + ) + logger.info("Converted state_dict to Tensor Parallel Format") + + if low_cpu_mem_usage or config.quantization_config.is_weight_quantize(): + new_error_msgs = _load_state_dict_into_meta_model( + model_to_load, + state_dict, + loaded_keys, + start_prefix, + expected_keys, + dtype=dtype, + is_safetensors=is_safetensors, + keep_in_fp32_modules=keep_in_fp32_modules, + ) + error_msgs += new_error_msgs + else: + error_msgs += _load_state_dict_into_model(model_to_load, state_dict, start_prefix) + + # force memory release + del state_dict + gc.collect() + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + if " but the expected shape is" in error_msg: + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." 
+ ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + if len(unexpected_keys) > 0: + if logger.logger.level < 20: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {sorted(unexpected_keys)}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" + " with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" + " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) + else: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing the model, - This IS expected if you are" + f" initializing the model from a checkpoint of a model trained on another task or" + " with another architecture." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" + f" was trained on, you can already use {model.__class__.__name__} for predictions without further" + " training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + Creates an instance of `PretrainedModel`. Model weights are loaded + by specifying name of a built-in pretrained model, a pretrained model from HF Hub, a community contributed model, + or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of a built-in pretrained model + - Name of a pretrained model from HF Hub + - Name of a community-contributed pretrained model. + - Local directory path which contains model weights file("model_state.pdparams") + and model config file ("model_config.json"). + from_hf_hub (bool): load model from huggingface hub. Default to `False`. + subfolder (str, optional) An optional value corresponding to a folder inside the repo. 
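Editor's note: the warnings above are driven by plain set arithmetic between the checkpoint keys and the model's expected keys. A hedged sketch of that bookkeeping, ignoring the base-model prefix remapping and the `_keys_to_ignore_*` patterns handled earlier:

```python
def diff_state_dict_keys(model_sd, ckpt_sd):
    """Return (missing, unexpected, mismatched) for two state dicts."""
    expected, loaded = set(model_sd), set(ckpt_sd)
    missing = sorted(expected - loaded)      # stay randomly initialized
    unexpected = sorted(loaded - expected)   # checkpoint entries that get dropped
    mismatched = [
        (k, tuple(ckpt_sd[k].shape), tuple(model_sd[k].shape))
        for k in sorted(expected & loaded)
        if tuple(ckpt_sd[k].shape) != tuple(model_sd[k].shape)
    ]
    return missing, unexpected, mismatched
```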
+ Only works when loading from Huggingface Hub. + *args (tuple): Position arguments for model `__init__`. If provided, + use these as position argument values for model initialization. + **kwargs (dict): Keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for model + initialization. If the keyword is in `__init__` argument names of + base model, update argument values of the base model; else update + argument values of derived model. + load_state_as_np (bool, optional): The weights read in can be choosed + to place on CPU or GPU though the model is on the default device. + If `True`, load the model weights as `numpy.ndarray` on CPU. + Otherwise, weights would be loaded as tensors on the default + device. Note that if on GPU, the latter would creates extra + temporary tensors in addition to the model weights, which + doubles the memory usage . Thus it is suggested to use `True` + for big models on GPU. Default to `False`. + + Returns: + PretrainedModel: An instance of `PretrainedModel`. + + Example: + .. code-block:: + + from paddlenlp.transformers import BertForSequenceClassification + + # Name of built-in pretrained model + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + + # Name of pretrained model from PaddleHub + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + + # Name of community-contributed pretrained model + model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned', num_labels=3) + + # Load from local directory path + model = BertForSequenceClassification.from_pretrained('./my_bert/') + """ + config = kwargs.pop("config", None) + state_dict = kwargs.pop("state_dict", None) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.get("force_download", False) + ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + dtype = kwargs.pop("dtype", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", None) + if subfolder is None: + subfolder = "" + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) + + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) + convert_from_torch = kwargs.pop("convert_from_torch", None) + load_state_as_np = kwargs.pop("load_state_as_np", None) + if load_state_as_np is not None: + logger.warning("`load_state_as_np` is deprecated, please delete it!") + + model_kwargs = kwargs + + if convert_from_torch is None and os.environ.get("from_modelscope", False): + logger.warning( + "If you are attempting to load weights from ModelScope Hub and want to disable the default behavior of considering torch weights," + " you can set ·convert_from_torch=False·. By default, `convert_from_torch` is set to `True`. " + ) + convert_from_torch = True + + # from_hf_hub defalut enable convert_from_torch + if from_hf_hub and convert_from_torch is None: + logger.warning( + "If you are attempting to load weights from Hugging Face Hub and want to disable the default behavior of considering torch weights," + " you can set ·convert_from_torch=False·. By default, `convert_from_torch` is set to `True`. " + ) + convert_from_torch = True + # convert_from_torch defalut is False + if convert_from_torch is None: + convert_from_torch = False + + # 1. 
get the PretrainedConfig to init model + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + cache_dir=cache_dir, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + subfolder=subfolder, + return_unused_kwargs=True, + **kwargs, + ) + if "from_aistudio" in model_kwargs: + model_kwargs.pop("from_aistudio") + + # if not from_hf_hub and not from_aistudio: + # if not os.path.exists(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder, CONFIG_NAME)): + # config.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) + + # refine options for config + convert_from_torch = cls.support_conversion(config) and convert_from_torch + if dtype is None: + dtype = config.dtype + + if config.quantization_config.is_weight_quantize(): + try: + from ..quantization.quantization_utils import ( + replace_with_quantization_linear, + ) + except ImportError: + raise ImportError("You need to install paddlepaddle >= 2.6.0") + + if dtype != "float16" and dtype != "bfloat16": + dtype = "float16" + logger.warning( + "Overriding dtype='float16' due to quantization method required DataTypes: float16, bfloat16. Pass your own dtype to remove this warning" + ) + config.dtype = dtype + + init_contexts = [] + if low_cpu_mem_usage or config.quantization_config.is_weight_quantize(): + # Instantiate model. + init_contexts.append(no_init_weights(_enable=True)) + if is_paddle_support_lazy_init(): + init_contexts.append(paddle.LazyGuard()) + + if dtype: + init_contexts.append(dtype_guard(dtype)) + + # Quantization method requires empty init to avoid unnecessary GPU allocation + if config.quantization_config.is_weight_quantize(): + quantization_init_contexts = [] + quantization_init_contexts.append(no_init_weights(_enable=True)) + if is_paddle_support_lazy_init(): + quantization_init_contexts.append(paddle.LazyGuard()) + + # Keep in fp32 modules + keep_in_fp32_modules = None + use_keep_in_fp32_modules = False + + # resolve model_weight file + resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded = cls._resolve_model_file_path( + pretrained_model_name_or_path, + cache_dir=cache_dir, + subfolder=subfolder, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + config=config, + convert_from_torch=convert_from_torch, + use_safetensors=use_safetensors, + variant=variant, + ) + + if convert_from_torch and state_dict is None: + if ( + resolved_archive_file.endswith(PYTORCH_WEIGHTS_NAME) + or resolved_archive_file.endswith(PYTORCH_WEIGHTS_INDEX_NAME) + or resolved_archive_file.endswith(SAFE_WEIGHTS_NAME) + or resolved_archive_file.endswith(SAFE_WEIGHTS_INDEX_NAME) + ): + # try to get the name-mapping info + convert_dir = os.path.dirname(resolved_archive_file) + logger.info( + f"Starting to convert pytorch weight file<{resolved_archive_file}> to " + f"paddle weight file<{convert_dir}> ..." 
+ ) + state_dict = cls.convert( + resolved_archive_file, + config, + # cache_dir=os.path.join(cache_dir, pretrained_model_name_or_path, subfolder), + cache_dir=convert_dir, + ) + elif ( + resolved_archive_file.endswith(PADDLE_WEIGHTS_NAME) + or resolved_archive_file.endswith(PADDLE_WEIGHTS_INDEX_NAME) + or resolved_archive_file.endswith(".pdparams") + ): + print(f"file: {resolved_archive_file} is paddle weight.") + else: + raise ValueError(f"Unexpected file: {resolved_archive_file} for weight conversion.") + # load pt weights early so that we know which dtype to init the model under + + if not is_sharded and state_dict is None: + # 4. loading non-sharded ckpt from the state dict + if config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model_state.pdparams"): + state_dict = cls.convert_tensor_parallel(resolved_archive_file, config) + elif config.tensor_parallel_degree > 1 and resolved_archive_file.endswith("model.safetensors"): + with safe_open(resolved_archive_file, framework="np", device="cpu") as f: + loaded_keys = f.keys() + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys) + state_dict = load_state_dict(resolved_archive_file, tp_actions) + else: + state_dict = load_state_dict(resolved_archive_file) + + logger.info("Loaded weights file from disk, setting weights to model.") + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + dtype == "float16" or dtype == "bfloat16" + ) + + if state_dict is not None: + loaded_state_dict_keys = [k for k in state_dict.keys()] + # will only support load paddle.Tensor to model. + for k in list(state_dict.keys()): + if not isinstance(state_dict[k], paddle.Tensor): + with device_guard(): + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) + else: + if is_sharded: + loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] + else: + loaded_state_dict_keys = [k for k in state_dict.keys()] + + if low_cpu_mem_usage: # or use_keep_in_fp32_modules: + state_dict = None + + # will only support load paddle.Tensor to model. + if state_dict is not None: + for k in list(state_dict.keys()): + if not isinstance(state_dict[k], paddle.Tensor): + with device_guard(): + state_dict[k] = paddle.Tensor(state_dict.pop(k), zero_copy=True) + # 3. 
init the model + init_args = config["init_args"] or () + with ContextManagers(init_contexts): + model = cls(config, *init_args, **model_kwargs) + + if use_keep_in_fp32_modules: + # low_cpu_mem_usage = True + keep_in_fp32_modules = model._keep_in_fp32_modules + else: + keep_in_fp32_modules = [] + + quantization_linear_list = None + if config.quantization_config.is_weight_quantize(): + with ContextManagers(quantization_init_contexts): + quantization_linear_list = replace_with_quantization_linear( + model=model, + quantization_config=config.quantization_config, + llm_int8_threshold=config.quantization_config.llm_int8_threshold, + ) + quantization_linear_list = [] + for key in model.state_dict().keys(): + if "quant_weight" in key: + quantization_linear_list.append(key[:-13]) + + model, missing_keys, unexpected_keys, mismatched_keys = cls._load_pretrained_model( + model=model, + state_dict=state_dict, + loaded_keys=loaded_state_dict_keys, + resolved_archive_file=resolved_sharded_files if is_sharded else resolved_archive_file, + pretrained_model_name_or_path=pretrained_model_name_or_path, + config=config, + ignore_mismatched_sizes=ignore_mismatched_sizes, + low_cpu_mem_usage=low_cpu_mem_usage, + dtype=dtype, + keep_in_fp32_modules=keep_in_fp32_modules, + quantization_linear_list=quantization_linear_list, + ) + + # load generation_config.json + if model.can_generate() and pretrained_model_name_or_path is not None: + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + from_hf_hub=from_hf_hub, + from_aistudio=from_aistudio, + subfolder=subfolder, + **kwargs, + ) + except: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + pass + + # Note: + # 1. PipelineLayer will create parameters for each layer and + # call `_synchronize_shared_weights()` to synchronize the shared parameters. + # 2. When setting the model `state_dict`, `_synchronize_shared_weights` will be called to + # synchronize the shared parameters. + # However, when state dict only contains the one piece of shared parameters, the shared parameters + # will be different from the original shared parameters. + + if isinstance(model, PipelineLayer): + model._synchronize_shared_weights() + + if paddle.in_dynamic_mode(): + return model + + return model, state_dict + + def save_pretrained( + self, + save_dir: Union[str, os.PathLike], + is_main_process: bool = True, + state_dict: Optional[dict] = None, + save_function: Callable = paddle.save, + max_shard_size: Union[int, str] = "10GB", + safe_serialization: bool = False, + variant: Optional[str] = None, + *args, + **kwargs, + ): + """ + Saves model configuration and related resources (model state) as files + under `save_dir`. The model configuration would be saved into a file named + "model_config.json", and model state would be saved into a file + named "model_state.pdparams". + + The `save_dir` can be used in `from_pretrained` as argument value + of `pretrained_model_name_or_path` to re-load the trained model. + + Args: + save_dir (str): Directory to save files into. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import BertForSequenceClassification + + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') + model.save_pretrained('./trained_model/') + # reload from save_directory + model = BertForSequenceClassification.from_pretrained('./trained_model/') + """ + + assert not os.path.isfile(save_dir), "Saving directory ({}) should be a directory, not a file".format(save_dir) + os.makedirs(save_dir, exist_ok=True) + + merge_tensor_parallel = kwargs.get("merge_tensor_parallel", False) + config_to_save = kwargs.get("config_to_save", None) + shard_format = kwargs.get("shard_format", "naive") # support naive pipeline + # variant = kwargs.get("variant", None) + # is_main_process = kwargs.get("is_main_process", True) + + save_directory = save_dir + + if safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + # Save model config + + # Only save the model in distributed training setup + model_to_save = unwrap_model(self) + + # save the string version of dtype to the config, e.g. convert paddle.float32 => "float32" + # we currently don't use this setting automatically, but may start to use with v5 + + dtype = get_parameter_dtype(model_to_save) + model_to_save.config.dtype = str(dtype).split(".")[1] + if config_to_save is None: + config_to_save = copy.deepcopy(model_to_save.config) + + # Save the model + if state_dict is None: + state_dict = model_to_save.state_dict() + if config_to_save.tensor_parallel_degree > 1: + if not config_to_save.quantization_config.is_support_merge_tensor_parallel() and merge_tensor_parallel: + logger.warning( + f"Quantization strategy: {config_to_save.quantization_config.weight_quantize_algo} does not support merge tensor parallel, thus we set merge_tensor_parallel to False." + ) + merge_tensor_parallel = False + if merge_tensor_parallel: + state_dict = model_to_save.merge_tensor_parallel(state_dict, config_to_save) + config_to_save.tensor_parallel_degree = 1 + if config_to_save.tensor_parallel_rank != 0: + logger.info("Saving with merge_tensor_parallel, tensor_parallel_rank > 0 don't need save") + return + if variant is not None and "tp" in variant: + variant = "_".join([x for x in variant.split("_") if "tp" not in x]) + else: + variant = weight_name_suffix() if variant is None else variant + + # Attach architecture to the config + config_to_save.architectures = [model_to_save.__class__.__name__] + # Save the config + if is_main_process: + config_to_save.save_pretrained(save_directory) + if self.can_generate(): + model_to_save.generation_config.save_pretrained(save_directory) + + # Handle the case where some state_dict keys shouldn't be saved + if self._keys_to_ignore_on_save is not None: + for ignore_key in self._keys_to_ignore_on_save: + if ignore_key in state_dict.keys(): + del state_dict[ignore_key] + + # Shard the model if it is too big. 
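Editor's note: the `variant` threading above (and the `tp`/rank suffixes used elsewhere in this file) boils down to inserting a tag before the weight file's extension. A stand-in for the `_add_variant` helper, shown only to make the naming convention concrete:

```python
def add_variant(weights_name, variant=None):
    """Insert a variant tag before the extension, e.g. tensor-parallel rank suffixes."""
    if not variant:
        return weights_name
    stem, dot, ext = weights_name.rpartition(".")
    return f"{stem}.{variant}.{ext}" if dot else f"{weights_name}.{variant}"

print(add_variant("model_state.pdparams", "tp00"))  # model_state.tp00.pdparams
print(add_variant("model.safetensors"))             # model.safetensors (unchanged)
```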
+ weights_name = SAFE_WEIGHTS_NAME if safe_serialization else PADDLE_WEIGHTS_NAME + weights_name = _add_variant(weights_name, variant) + + # Save model + shards, index = shard_checkpoint( + state_dict, max_shard_size=max_shard_size, weights_name=weights_name, shard_format=shard_format + ) + + # Clean the folder from a previous save + for filename in os.listdir(save_directory): + full_filename = os.path.join(save_directory, filename) + # If we have a shard file that is not going to be replaced, we delete it, but only from the main process + # in distributed settings to avoid race conditions. + weights_no_suffix = weights_name.replace(".pdparams", "").replace(".safetensors", "") + + # make sure that file to be deleted matches format of sharded file, e.g. paddle_model-00001-of-00005 + filename_no_suffix = filename.replace(".pdparams", "").replace(".safetensors", "") + reg = re.compile("(.*?)-\d{5}-of-\d{5}") + + if ( + filename.startswith(weights_no_suffix) + and os.path.isfile(full_filename) + and filename not in shards.keys() + and is_main_process + and reg.fullmatch(filename_no_suffix) is not None + ): + os.remove(full_filename) + + # Save the model + for shard_file, shard in shards.items(): + if safe_serialization: + # At some point we will need to deal better with save_function (used for TPU and other distributed + # joyfulness), but for now this enough. + for k in list(shard.keys()): + if isinstance(shard[k], paddle.Tensor): + shard[k] = shard.pop(k).cpu().numpy() + safe_save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "np"}) + else: + save_function(shard, os.path.join(save_directory, shard_file)) + + if index is None: + if not safe_serialization: + path_to_weights = os.path.join(save_directory, _add_variant(PADDLE_WEIGHTS_NAME, variant)) + else: + path_to_weights = os.path.join(save_directory, _add_variant(SAFE_WEIGHTS_NAME, variant)) + logger.info(f"Model weights saved in {path_to_weights}") + + else: + save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else PADDLE_WEIGHTS_INDEX_NAME + save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant)) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." 
+ ) + + +class PipelinePretrainedModel(PretrainedModel): + def __init_hook__(self): + if not hasattr(self, "_sequential_layers"): + self._sequential_layers = [] + self._single_to_pp_mapping = None + self._pp_to_single_mapping = None + + def __init__(self, config, *args, **kwargs): + self.__init_hook__() + super().__init__(config, *args, **kwargs) + + def add_sequential_layer(self, layer_desc, name_prefix=""): + self.__init_hook__() + self._sequential_layers.append({"layer": layer_desc, "name_prefix": name_prefix}) + + def get_sequential_layers(self): + self.__init_hook__() + return [x["layer"] for x in self._sequential_layers] + + def get_sequential_name_prefixes(self): + self.__init_hook__() + return {str(index): x["name_prefix"] for index, x in enumerate(self._sequential_layers)} + + def _set_pipeline_name_mapping(self, mappings=None): + if mappings is not None: + self._single_to_pp_mapping = mappings + else: + single_to_pp_mapping = {} + pp_to_single_mapping = {} + + state_dict_keys = list(super().state_dict().keys()) + first_key = "" + for k in state_dict_keys: + if "shared_layers" not in k: + first_key = k + break + first_key = first_key.split(".") + # if use virtual pp_degree, the prefix is like 0.0.xxx + # else it will be like 0.xxx + use_virtual_pp_degree = first_key[0].isdigit() and first_key[1].isdigit() + + prefixes = self.get_sequential_name_prefixes() + for k in state_dict_keys: + name_splited = k.split(".") + if use_virtual_pp_degree: + if name_splited[0].isdigit(): + if name_splited[1].isdigit(): + idx = str(int(name_splited[0]) + int(name_splited[1])) + single_name = [prefixes[idx]] + single_name.extend(name_splited[2:]) + else: + single_name = [prefixes[str(len(prefixes) - 1)]] + single_name.extend(name_splited[2:]) + logger.warning( + f"Please check! we treat this key as last layer, get {k}, set origin name as {'.'.join(single_name)}" + ) + elif name_splited[0] == "shared_layers": + single_name = [self.get_shardlayer_prefix(name_splited)] + single_name.extend(name_splited[2:]) + else: + raise ValueError(f"Unexpected key: {k} for pp layer.") + else: + idx = name_splited[0] + # for normal pp layer + if idx.isdigit(): + # allow empty prefix + single_name = [] if prefixes[idx] == "" else [prefixes[idx]] + single_name.extend(name_splited[1:]) + elif idx == "shared_layers": + single_name = [self.get_shardlayer_prefix(name_splited)] + single_name.extend(name_splited[2:]) + else: + raise ValueError(f"Unexpected key: {k} for pp layer.") + + single_to_pp_mapping[".".join(single_name)] = k + pp_to_single_mapping[k] = ".".join(single_name) + + self._single_to_pp_mapping = single_to_pp_mapping + self._pp_to_single_mapping = pp_to_single_mapping + + return self._single_to_pp_mapping + + def get_shardlayer_prefix(self, name_splited): + """_summary_ + This function retrieves the prefix of a shared layer. The process involves: + 1. Identifying all key names of shared layers, like 'shared_weight01', 'shared_weight02', etc. + 2. For instance, given name_splited = ['shared_layers', 'shared_weight01', 'weight'], + the 'shared_layer_key' would be name_splited[1], which is 'shared_weight01'. + 3. By traversing through all layers, the function checks if the specified + shared_layer is present in the current stage. If found, it returns the corresponding prefix. + + Note: For retrieving all SharedLayer instances in Paddle, you can refer to the following Paddle code. 
+ https://github.com/PaddlePaddle/Paddle/blob/2cf724d055679a1a0e48766dfb1708b920273078/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py#L460-L513 + Args: + name_splited (_type_): _description_ + + Returns: + _type_: _description_ + """ + shared_layer_names = {s.layer_name for s in self._layers_desc if isinstance(s, SharedLayerDesc)} + assert name_splited[1] in shared_layer_names, f"The shared layer name {name_splited[1]} must be in prefixes!" + shared_layer_key = name_splited[1] + for idx, layer in enumerate(self._layers_desc): + if isinstance(layer, SharedLayerDesc) and layer.layer_name == shared_layer_key: + if self.get_stage_from_index(idx) == self._stage_id: + return self.get_sequential_name_prefixes()[str(idx)] + + # the prefix must be in the current stage, else raise error + raise ValueError(f"The shared layer {shared_layer_key} must be in the current stage!") + + def state_dict(self, *args, **kwargs): + state_dict = super().state_dict(*args, **kwargs) + + if self._single_to_pp_mapping is None: + self._set_pipeline_name_mapping() + assert len(self._single_to_pp_mapping) > 0, "The pipeline stage must have parameters!" + + for k in list(state_dict.keys()): + v = state_dict.pop(k) + state_dict[self._pp_to_single_mapping[k]] = v + + return state_dict + + def set_state_dict(self, state_dict, *args, **kwargs): + if self._single_to_pp_mapping is None: + self._set_pipeline_name_mapping() + assert len(self._single_to_pp_mapping) > 0, "The pipeline stage must have parameters!" + + for k in list(state_dict.keys()): + v = state_dict.pop(k) + if k not in self._single_to_pp_mapping: + continue + state_dict[self._single_to_pp_mapping[k]] = v + + ret = super().set_state_dict(state_dict, *args, **kwargs) + return ret + + +def load_sharded_checkpoint_as_one(folder, variant=None, return_numpy=False): + """ + + This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being + loaded in the model. + + Args: + folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint. + variant (`str`): The model variant. + return_numpy (`bool`): Whether to return numpy array instead of paddle tensor. 
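Editor's note: the pipeline name mapping above is essentially a two-way rename between index-keyed pipeline parameters ("0.xxx", "1.xxx", ...) and the original prefixed names, which is what `state_dict`/`set_state_dict` apply. A toy version with invented prefixes, ignoring shared layers and virtual pipeline stages:

```python
# index -> original prefix for each sequential layer (invented for illustration)
prefixes = {"0": "mpnet.embeddings", "1": "mpnet.encoder.layer.0", "2": "lm_head"}

def pp_to_single(pp_key):
    idx, _, rest = pp_key.partition(".")
    prefix = prefixes[idx]
    return f"{prefix}.{rest}" if prefix else rest

def single_to_pp(single_key):
    for idx, prefix in prefixes.items():
        if prefix and single_key.startswith(prefix + "."):
            return f"{idx}.{single_key[len(prefix) + 1:]}"
    raise KeyError(single_key)

pp_key = "1.attention.q.weight"
print(pp_to_single(pp_key))                       # mpnet.encoder.layer.0.attention.q.weight
assert single_to_pp(pp_to_single(pp_key)) == pp_key
```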
+ + """ + # Load the index + pdparams_file = os.path.join(folder, _add_variant("model_state.pdparams", variant)) + lora_pdparams_file = os.path.join(folder, _add_variant("lora_model_state.pdparams", variant)) + safetensors_file = os.path.join(folder, _add_variant("model.safetensors", variant)) + if os.path.isfile(pdparams_file): + return paddle.load(pdparams_file, return_numpy=return_numpy) + if os.path.isfile(lora_pdparams_file): + return paddle.load(lora_pdparams_file, return_numpy=return_numpy) + if os.path.isfile(safetensors_file): + state_dict = safe_load_file(safetensors_file) + if not return_numpy: + for key in list(state_dict.keys()): + if isinstance(state_dict[key], np.ndarray): + state_dict[key] = paddle.Tensor(state_dict.pop(key), zero_copy=True) + return state_dict + + index_file = os.path.join(folder, _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant)) + safe_index_file = os.path.join(folder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)) + safe_master_file = os.path.join(folder, _add_variant(SAFE_MASTER_WEIGHTS_INDEX_NAME, variant)) + safe_peft_file = os.path.join(folder, _add_variant(SAFE_PEFT_WEIGHTS_INDEX_NAME, variant)) + + index_present = os.path.isfile(index_file) + safe_index_present = os.path.isfile(safe_index_file) + safe_master_present = os.path.isfile(safe_master_file) + safe_peft_present = os.path.isfile(safe_peft_file) + + load_safe = False + load_index = None + if safe_index_present: + load_safe = True # load safe due to preference + load_index = safe_index_file + elif safe_master_present: + load_safe = True + load_index = safe_master_file + elif index_present: + load_index = index_file + elif safe_peft_present: + load_safe = True + load_index = safe_peft_file + else: + raise ValueError(f"Could not find {index_file} or {safe_index_file} or {safe_peft_file}") + + with open(load_index, "r", encoding="utf-8") as f: + index = json.load(f) + + shard_files = list(set(index["weight_map"].values())) + loader = safe_load_file if load_safe else partial(paddlenlp_load, map_location="np" if return_numpy else "cpu") + + ret = {} + for shard_file in tqdm(shard_files): + state_dict = loader(os.path.join(folder, shard_file)) + ret.update(state_dict) + + if not return_numpy: + for key in list(ret.keys()): + if isinstance(ret[key], np.ndarray): + ret[key] = paddle.Tensor(ret.pop(key), zero_copy=True) + + return ret + + +def load_tp_checkpoint(folder, cls, config, return_numpy=False): + """ + + This load is performed efficiently: Load tp checkpoint only from cpu, no need to init the model. + + Args: + folder (`str` or `os.PathLike`): A path to a folder containing the model checkpoint. + cls (`str`): The model class. + config (`AutoConfig`): The model config. + return_numpy (bool): Whether load the tp checkpoint as numpy. 
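Editor's note: for reference, the `*.index.json` consumed above carries a `weight_map` from parameter name to shard file; the loader only needs the set of unique shard files and merges their contents. An illustrative, made-up index:

```python
index = {
    "metadata": {"total_size": 4000000000},
    "weight_map": {
        "mpnet.embeddings.word_embeddings.weight": "model_state-00001-of-00002.pdparams",
        "mpnet.encoder.layer.0.attention.q.weight": "model_state-00001-of-00002.pdparams",
        "lm_head.decoder.weight": "model_state-00002-of-00002.pdparams",
    },
}

shard_files = sorted(set(index["weight_map"].values()))
print(shard_files)
# ['model_state-00001-of-00002.pdparams', 'model_state-00002-of-00002.pdparams']
```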
+ """ + if config.tensor_parallel_degree == 1 or config.tensor_parallel_degree == -1: + return load_sharded_checkpoint_as_one(folder, return_numpy=return_numpy) + else: + rank_model_path = os.path.join(folder, f"model_state.tp0{config.tensor_parallel_rank}.pdparams") + model_path = os.path.join(folder, "model_state.pdparams") + safe_model_path = os.path.join(folder, "model.safetensors") + if os.path.exists(rank_model_path): + return paddle.load(rank_model_path, return_numpy=return_numpy) + elif os.path.exists(model_path): + state_dict = cls.convert_tensor_parallel(model_path, config) + elif os.path.exists(safe_model_path): + with safe_open(safe_model_path, framework="np", device="cpu") as f: + loaded_keys = f.keys() + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_keys) + state_dict = load_state_dict(safe_model_path, tp_actions) + else: # shard files safetensors + resolved_archive_file, resolved_sharded_files, sharded_metadata, is_sharded = cls._resolve_model_file_path( + pretrained_model_name_or_path=folder, + use_safetensors=True, + ) + if len(resolved_sharded_files) > 1: + resolved_sharded_files = tqdm(resolved_sharded_files, desc="Loading checkpoint shards") + loaded_state_dict_keys = sharded_metadata["all_checkpoint_keys"] + tp_actions = cls.get_tensor_parallel_convert_actions(config, loaded_state_dict_keys, ignore_error=True) + state_dict = {} + for shard_file in resolved_sharded_files: + shard_state_dict = load_state_dict( + shard_file, + tp_actions, + loaded_state_dict_keys, + ) + state_dict.update(shard_state_dict) + if return_numpy: + for k in list(state_dict.keys()): + if not isinstance(state_dict[k], np.ndarray): + state_dict[k] = state_dict.pop(k).cpu().numpy() + return state_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/__init__.py new file mode 100644 index 000000000..3bd752713 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/configuration.py new file mode 100644 index 000000000..e443a4568 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/configuration.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MPNet model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "MPNET_PRETRAINED_INIT_CONFIGURATION", + "MPNetConfig", +] + +MPNET_PRETRAINED_INIT_CONFIGURATION = {} + + +class MPNetConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MPNetModel`]. It is used to + instantiate a MPNet model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the MPNet. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30527): + Vocabulary size of the MPNet model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MPNetModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import MPNetModel, MPNetConfig + + >>> # Initializing a MPNet mpnet-base style configuration + >>> configuration = MPNetConfig() + + >>> # Initializing a model from the MPNet mpnet-base style configuration + >>> model = MPNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "mpnet" + attribute_map = { + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size: int = 30527, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 514, + initializer_range: float = 0.02, + layer_norm_eps: float = 1e-5, + relative_attention_num_buckets: int = 32, + pad_token_id: int = 1, + bos_token_id: int = 0, + eos_token_id: int = 2, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.relative_attention_num_buckets = relative_attention_num_buckets diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/modeling.py new file mode 100644 index 000000000..b47506fd4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/modeling.py @@ -0,0 +1,731 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The HuggingFace Inc. team, Microsoft Corporation. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from .configuration import MPNET_PRETRAINED_INIT_CONFIGURATION, MPNetConfig + +__all__ = [ + "MPNetModel", + "MPNetPretrainedModel", + "MPNetForMaskedLM", + "MPNetForSequenceClassification", + "MPNetForMultipleChoice", + "MPNetForTokenClassification", + "MPNetForQuestionAnswering", +] + + +def create_position_ids_from_input_ids(input_ids, padding_idx=1): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. 
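Editor's note: the position-id helper documented above numbers real tokens from `padding_idx + 1` and leaves padding positions at `padding_idx`. A quick worked example of the mask-and-cumsum trick it relies on:

```python
import paddle

input_ids = paddle.to_tensor([[0, 31414, 232, 2, 1, 1]])  # the trailing 1s are padding
padding_idx = 1

mask = (input_ids != padding_idx).astype(paddle.int64)
position_ids = paddle.cumsum(mask, axis=1) * mask + padding_idx
print(position_ids.numpy())  # [[2 3 4 5 1 1]]
```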
:param paddle.Tensor x: :return paddle.Tensor: + """ + mask = (input_ids != padding_idx).astype(paddle.int64) + incremental_indices = paddle.cumsum(mask, axis=1).astype(mask.dtype) * mask + return incremental_indices.astype(paddle.int64) + padding_idx + + +class MPNetEmbeddings(nn.Layer): + """ + Include embeddings from word and position embeddings. + """ + + def __init__(self, config: MPNetConfig): + super(MPNetEmbeddings, self).__init__() + self.padding_idx = config.pad_token_id + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, position_ids=None): + + if position_ids is None: + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + + embeddings = words_embeddings + position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class MPNetAttention(nn.Layer): + def __init__(self, config: MPNetConfig): + super(MPNetAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scale = self.attention_head_size**-0.5 + self.q = nn.Linear(config.hidden_size, self.all_head_size) + self.k = nn.Linear(config.hidden_size, self.all_head_size) + self.v = nn.Linear(config.hidden_size, self.all_head_size) + self.o = nn.Linear(config.hidden_size, config.hidden_size) + + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [ + self.num_attention_heads, + self.attention_head_size, + ] + x = x.reshape(new_x_shape) + return x.transpose(perm=(0, 2, 1, 3)) + + def forward(self, hidden_states, attention_mask=None, position_bias=None): + q = self.q(hidden_states) + k = self.k(hidden_states) + v = self.v(hidden_states) + + q = self.transpose_for_scores(q) + k = self.transpose_for_scores(k) + v = self.transpose_for_scores(v) + + attention_scores = paddle.matmul(q, k, transpose_y=True) * self.scale + + if position_bias is not None: + attention_scores += position_bias + + if attention_mask is not None: + attention_scores = attention_scores + attention_mask + + attention_probs = F.softmax(attention_scores, axis=-1) + + attention_probs = self.attention_dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, v) + + context_layer = context_layer.transpose(perm=(0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + projected_context_layer = self.o(context_layer) + projected_context_layer_dropout = 
self.output_dropout(projected_context_layer) + layer_normed_context_layer = self.layer_norm(hidden_states + projected_context_layer_dropout) + + return layer_normed_context_layer, attention_scores + + +class MPNetLayer(nn.Layer): + def __init__(self, config: MPNetConfig): + super(MPNetLayer, self).__init__() + self.attention = MPNetAttention(config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, attention_mask=None, position_bias=None): + attention_output, layer_att = self.attention( + hidden_states, attention_mask=attention_mask, position_bias=position_bias + ) + + ffn_output = self.ffn(attention_output) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + + ffn_output_dropout = self.dropout(ffn_output) + hidden_states = self.layer_norm(ffn_output_dropout + attention_output) + + return hidden_states, layer_att + + +class MPNetEncoder(nn.Layer): + def __init__(self, config: MPNetConfig): + super(MPNetEncoder, self).__init__() + layer = MPNetLayer(config) + self.layer = nn.LayerList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + self.relative_attention_bias = nn.Embedding(config.relative_attention_num_buckets, config.num_attention_heads) + + def forward(self, hidden_states, attention_mask=None): + position_bias = self.compute_position_bias(hidden_states) + all_encoder_layers = [] + all_encoder_att = [] + for i, layer_module in enumerate(self.layer): + all_encoder_layers.append(hidden_states) + hidden_states, layer_att = layer_module(all_encoder_layers[i], attention_mask, position_bias) + all_encoder_att.append(layer_att) + all_encoder_layers.append(hidden_states) + return all_encoder_layers, all_encoder_att + + def compute_position_bias(self, x, position_ids=None, num_buckets=32): + bsz, qlen, klen = x.shape[0], x.shape[1], x.shape[1] + if position_ids is not None: + context_position = position_ids.unsqueeze(2) + memory_position = position_ids.unsqueeze(1) + else: + context_position = paddle.arange(qlen).unsqueeze(1) + memory_position = paddle.arange(klen).unsqueeze(0) + + relative_position = memory_position - context_position + + rp_bucket = self.relative_position_bucket(relative_position, num_buckets=num_buckets) + + values = self.relative_attention_bias(rp_bucket) + values = values.transpose(perm=[2, 0, 1]).unsqueeze(0) + values = values.expand(shape=(bsz, values.shape[1], qlen, klen)) + return values + + @staticmethod + def relative_position_bucket(relative_position, num_buckets=32, max_distance=128): + ret = 0 + n = -relative_position + + num_buckets //= 2 + ret += (n < 0).astype(paddle.int64) * num_buckets + n = paddle.abs(n) + + max_exact = num_buckets // 2 + is_small = n < max_exact + + val_if_large = max_exact + ( + paddle.log(n.astype(paddle.float32) / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).astype(paddle.int64) + + val_if_large = paddle.minimum(val_if_large, paddle.full_like(val_if_large, num_buckets - 1)) + ret += paddle.where(is_small, n, val_if_large) + return ret + + +class MPNetPooler(nn.Layer): + """ + Pool the result of MPNetEncoder. 
+ """ + + def __init__(self, config: MPNetConfig): + super(MPNetPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class MPNetPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained MPNet models. It provides MPNet related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + base_model_prefix = "mpnet" + pretrained_resource_files_map = { + "model_state": { + "mpnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/mpnet/mpnet-base/model_state.pdparams", + } + } + pretrained_init_configuration = MPNET_PRETRAINED_INIT_CONFIGURATION + config_class = MPNetConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +@register_base_model +class MPNetModel(MPNetPretrainedModel): + """ + The bare MPNet Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. + """ + + def __init__(self, config: MPNetConfig): + super(MPNetModel, self).__init__(config) + self.initializer_range = config.initializer_range + self.embeddings = MPNetEmbeddings(config) + self.encoder = MPNetEncoder(config) + self.pooler = MPNetPooler(config) + + def forward(self, input_ids, position_ids=None, attention_mask=None): + r""" + The MPNetModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + If its data type is int, the values should be either 0 or 1. + + - **1** for tokens that **not masked**, + - **0** for tokens that **masked**. 
+ + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (``) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MPNetModel, MPNetTokenizer + + tokenizer = MPNetTokenizer.from_pretrained('mpnet-base') + model = MPNetModel.from_pretrained('mpnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + """ + + if attention_mask is None: + attention_mask = (input_ids != self.embeddings.padding_idx).astype(input_ids.dtype) + + if attention_mask.ndim == 2: + attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + attention_mask = (1.0 - attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, position_ids) + + encoder_outputs, _ = self.encoder(embedding_output, attention_mask) + + sequence_output = encoder_outputs[-1] + pooled_output = self.pooler(sequence_output) + + return sequence_output, pooled_output + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + +class MPNetLMHead(nn.Layer): + """ + MPNet Model with a `language modeling` head on top for CLM fine-tuning. + """ + + def __init__( + self, + config: MPNetConfig, + embedding_weights=None, + ): + super(MPNetLMHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + self.decoder_weight = ( + self.create_parameter( + shape=[config.vocab_size, config.hidden_size], dtype=self.dense.weight.dtype, is_bias=False + ) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + hidden_states = paddle.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + + return hidden_states + + +class MPNetForMaskedLM(MPNetPretrainedModel): + """ + MPNet Model with a `language modeling` head on top. + + Args: + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. + + """ + + def __init__(self, config: MPNetConfig): + super(MPNetForMaskedLM, self).__init__(config) + self.mpnet = MPNetModel(config) + self.lm_head = MPNetLMHead(config, embedding_weights=self.mpnet.embeddings.word_embeddings.weight) + + def forward( + self, + input_ids, + position_ids=None, + attention_mask=None, + labels=None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`MPNetModel`. 
+ position_ids (Tensor, optional): + See :class:`MPNetModel`. + attention_mask (Tensor, optional): + See :class:`MPNetModel`. + labels (Tensor, optional): + The Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., vocab_size]`` Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., vocab_size]`` Its shape is [batch_size, sequence_length]. + + Returns: + tuple: Returns tuple (`masked_lm_loss`, `prediction_scores`, ``sequence_output`). + + With the fields: + + - `masked_lm_loss` (Tensor): + The masked lm loss. Its data type should be float32 and its shape is [1]. + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. Its shape is [batch_size, sequence_length, vocab_size]. + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. Its data type should be float32. Its shape is `[batch_size, sequence_length, hidden_size]`. + + + """ + sequence_output, pooled_output = self.mpnet( + input_ids, position_ids=position_ids, attention_mask=attention_mask + ) + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + prediction_scores.reshape(shape=(-1, self.mpnet.config["vocab_size"])), + labels.reshape(shape=(-1,)), + ) + return masked_lm_loss, prediction_scores, sequence_output + + return prediction_scores, sequence_output + + +class MPNetForSequenceClassification(MPNetPretrainedModel): + """ + MPNet Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. + """ + + def __init__(self, config: MPNetConfig): + super(MPNetForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.mpnet = MPNetModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, position_ids=None, attention_mask=None): + r""" + The MPNetForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MPNetModel`. + position_ids(Tensor, optional): + See :class:`MPNetModel`. + attention_mask (list, optional): + See :class:`MPNetModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MPNetForSequenceClassification, MPNetTokenizer + + tokenizer = MPNetTokenizer.from_pretrained('mpnet-base') + model = MPNetForSequenceClassification.from_pretrained('mpnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + + _, pooled_output = self.mpnet(input_ids, position_ids=position_ids, attention_mask=attention_mask) + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) + + return logits + + +class MPNetForMultipleChoice(MPNetPretrainedModel): + """ + MPNet Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. + num_choices (int, optional): + The number of choices. Defaults to `2`. + """ + + def __init__(self, config: MPNetConfig, num_choices=2): + super(MPNetForMultipleChoice, self).__init__(config) + self.num_choices = num_choices + self.mpnet = MPNetModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids, position_ids=None, attention_mask=None): + r""" + The MPNetForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MPNetModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`MPNetModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`MPNetModel` and shape as [batch_size, num_choice, sequence_length]. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. code-block:: + import paddle + from paddlenlp.transformers import MPNetForMultipleChoice, MPNetTokenizer + + tokenizer = MPNetTokenizer.from_pretrained('mpnet-base') + model = MPNetForMultipleChoice.from_pretrained('mpnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + + """ + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + _, pooled_output = self.mpnet(input_ids, position_ids=position_ids, attention_mask=attention_mask) + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + return reshaped_logits + + +class MPNetForTokenClassification(MPNetPretrainedModel): + """ + MPNet Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. 
+ """ + + def __init__(self, config: MPNetConfig): + super(MPNetForTokenClassification, self).__init__(config) + self.mpnet = MPNetModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, position_ids=None, attention_mask=None): + r""" + The MPNetForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MPNetModel`. + position_ids(Tensor, optional): + See :class:`MPNetModel`. + attention_mask (list, optional): + See :class:`MPNetModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MPNetForTokenClassification, MPNetTokenizer + + tokenizer = MPNetTokenizer.from_pretrained('mpnet-base') + model = MPNetForTokenClassification.from_pretrained('mpnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + """ + sequence_output, _ = self.mpnet(input_ids, position_ids=position_ids, attention_mask=attention_mask) + sequence_output = self.dropout(sequence_output) + + logits = self.classifier(sequence_output) + + return logits + + +class MPNetForQuestionAnswering(MPNetPretrainedModel): + """ + MPNet Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`MPNetConfig`): + An instance of MPNetConfig used to construct MPNetModel. + """ + + def __init__(self, config: MPNetConfig): + super(MPNetForQuestionAnswering, self).__init__(config) + self.mpnet = MPNetModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, position_ids=None, attention_mask=None): + r""" + The MPNetForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`MPNetModel`. + position_ids (Tensor, optional): + See :class:`MPNetModel`. + attention_mask (Tensor, optional): + See :class:`MPNetModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MPNetForQuestionAnswering, MPNetTokenizer + + tokenizer = MPNetTokenizer.from_pretrained('mpnet-base') + model = MPNetForQuestionAnswering.from_pretrained('mpnet-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + + """ + + sequence_output, _ = self.mpnet(input_ids, position_ids=position_ids, attention_mask=attention_mask) + logits = self.qa_outputs(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + return start_logits, end_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/tokenizer.py new file mode 100644 index 000000000..f91bb00dd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mpnet/tokenizer.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The HuggingFace Inc. team, Microsoft Corporation. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .. import AddedToken +from ..bert.tokenizer import BertTokenizer + +__all__ = ["MPNetTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mpnet-base": 514} + + +class MPNetTokenizer(BertTokenizer): + """ + Construct a MPNet tokenizer which is almost identical to `BertTokenizer`. + For more information regarding those methods, please refer to this superclass. 
+ """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "mpnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/mpnet/mpnet-base/vocab.txt", + } + } + pretrained_init_configuration = {"mpnet-base": {"do_lower_case": True}} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + bos_token="", + eos_token="", + unk_token="[UNK]", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + **kwargs + ): + + super().__init__( + vocab_file=vocab_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + ) + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + padding=False, + is_split_into_words=False, + pad_to_max_seq_len=False, + truncation=False, + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=False, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + add_special_tokens=True, + pad_to_multiple_of=None, + return_offsets_mapping=False, + ): + return super().__call__( + text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + padding=padding, + is_split_into_words=is_split_into_words, + pad_to_max_seq_len=pad_to_max_seq_len, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + add_special_tokens=add_special_tokens, + pad_to_multiple_of=pad_to_multiple_of, + return_offsets_mapping=return_offsets_mapping, + ) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A MPNet sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + [(0, 0)] + offset_mapping_1 + [(0, 0)] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/configuration.py new file mode 100644 index 000000000..a02062b3b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/configuration.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" mT5 model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["MT5_PRETRAINED_INIT_CONFIGURATION", "MT5Config"] + +MT5_PRETRAINED_INIT_CONFIGURATION = {} + + +class MT5Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MT5Model`]. It is used to + instantiate a bert model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the mT5 + mt5-small architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 250112): + Vocabulary size of the mT5 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MT5Model`]. + d_model (`int`, *optional*, defaults to 512): + Size of the encoder layers and the pooler layer. + d_kv (`int`, *optional*, defaults to 64): + Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // + num_heads`. + d_ff (`int`, *optional*, defaults to 1024): + Size of the intermediate feed forward layer in each `MT5Block`. + num_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the Transformer encoder. + num_decoder_layers (`int`, *optional*): + Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set. + num_heads (`int`, *optional*, defaults to 6): + Number of attention heads for each attention layer in the Transformer encoder. + relative_attention_num_buckets (`int`, *optional*, defaults to 32): + The number of buckets to use for each attention layer. + relative_attention_max_distance (`int`, *optional*, defaults to 128): + The maximum distance of the longer sequences for the bucket separation. + dropout_rate (`float`, *optional*, defaults to 0.1): + The ratio for all dropout layers. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). 
+ feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`): + he non-linear activation function (function or string) in the feed forward layer in the residual attention block. + If string, `"relu"`, `"gated-gelu"` are supported. Defaults to `"gated-gelu"`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + pad_token_id (int, optional): + The id of the `padding` token. Defaults to `0`. + bos_token_id (int, optional): + The id of the `bos` token. Defaults to `0`. + eos_token_id (int, optional): + The id of the `eos` token. Defaults to `1`. + + """ + model_type = "mt5" + attribute_map: Dict[str, str] = { + "hidden_size": "d_model", + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "num_classes": "num_labels", + } + pretrained_init_configuration = MT5_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 250112, + d_model: int = 512, + d_kv: int = 64, + d_ff: int = 1024, + num_layers: int = 8, + num_decoder_layers: int = None, + num_heads: int = 6, + relative_attention_num_buckets: int = 32, + relative_attention_max_distance: int = 128, + dropout_rate: float = 0.1, + layer_norm_epsilon: float = 1e-6, + initializer_factor: float = 1.0, + feed_forward_proj: str = "gated-gelu", + is_encoder_decoder: bool = True, + use_cache: bool = True, + bos_token_id: int = 0, + pad_token_id: int = 0, + eos_token_id: int = 1, + **kwargs + ): + + super().__init__( + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + self.vocab_size = vocab_size + self.d_model = d_model + self.d_kv = d_kv + self.d_ff = d_ff + self.num_layers = num_layers + self.num_decoder_layers = ( + num_decoder_layers if num_decoder_layers is not None else self.num_layers + ) # default = symmetry + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.relative_attention_max_distance = relative_attention_max_distance + self.dropout_rate = dropout_rate + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_factor = initializer_factor + self.feed_forward_proj = feed_forward_proj + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/converter.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/converter.py new file mode 100644 index 000000000..52649af50 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/converter.py @@ -0,0 +1,68 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
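+
+# Usage sketch (illustrative; assumes PyTorch is installed and an HF mT5
+# checkpoint is available locally; the paths below are the argparse defaults
+# declared at the bottom of this file):
+#
+#     python converter.py \
+#         --pytorch_checkpoint_path google/mt5-small/pytorch_model.bin \
+#         --paddle_dump_path paddle/mt5-small/model_state.pdparams
+#
+# Two-dimensional ``*.weight`` tensors are transposed unless their name matches
+# an entry in ``dont_transpose``, and every key except ``lm_head.weight`` is
+# prefixed with ``mt5.`` to match the Paddle parameter names.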
+ +import argparse +from collections import OrderedDict + +dont_transpose = [ + "shared.weight", + "layer_norm.weight", + ".layer_norm.weight", + "relative_attention_bias.weight", + "embed_tokens.weight", +] + + +def convert_pytorch_checkpoint_to_paddle(pytorch_checkpoint_path, paddle_dump_path): + import paddle + import torch + + pytorch_state_dict = torch.load(pytorch_checkpoint_path, map_location="cpu") + paddle_state_dict = OrderedDict() + for k, v in pytorch_state_dict.items(): + transpose = False + + if k[-7:] == ".weight": + if not any([w in k for w in dont_transpose]): + if v.ndim == 2: + v = v.transpose(0, 1) + transpose = True + + print(f"Converting: {k} | is_transpose {transpose}") + + if k != "lm_head.weight": + k = "mt5." + k + paddle_state_dict[k] = v.data.numpy() + + paddle.save(paddle_state_dict, paddle_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--pytorch_checkpoint_path", + default="google/mt5-small/pytorch_model.bin", + type=str, + required=False, + help="Path to the Pytorch checkpoint path.", + ) + parser.add_argument( + "--paddle_dump_path", + default="paddle/mt5-small/model_state.pdparams", + type=str, + required=False, + help="Path to the output Paddle model.", + ) + args = parser.parse_args() + convert_pytorch_checkpoint_to_paddle(args.pytorch_checkpoint_path, args.paddle_dump_path) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/modeling.py new file mode 100644 index 000000000..a07079746 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/mt5/modeling.py @@ -0,0 +1,1742 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Mesh TensorFlow authors, mT5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
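+
+# Module overview: MT5LayerNorm is an RMS-style norm (no bias and no mean
+# subtraction); the feed-forward block has relu, gated-gelu and gated-silu
+# variants selected by ``config.feed_forward_proj``; attention layers share
+# T5-style relative position buckets computed in MT5Attention.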
+from __future__ import annotations + +import copy +import math +from typing import Optional, Tuple + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.distributed.fleet.utils import recompute + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from ...utils.log import logger +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + convert_encoder_output, +) +from ..model_utils import PretrainedModel, register_base_model +from .configuration import MT5_PRETRAINED_INIT_CONFIGURATION, MT5Config + +__all__ = [ + "MT5Model", + "MT5PretrainedModel", + "MT5ForConditionalGeneration", + "MT5EncoderModel", + "MT5_PRETRAINED_MODEL_ARCHIVE_LIST", +] + +MT5_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/mt5-small", + "google/mt5-base", + "google/mt5-large", + "google/mt5-xl", + "google/mt5-xxl", +] + +DATA_TYPE_MAP = { + paddle.int64: "int64", + paddle.int32: "int32", + paddle.float32: "float32", + paddle.float64: "float64", + paddle.float16: "float16", +} + + +def data_type_converter(tensor): + return DATA_TYPE_MAP[tensor.dtype] + + +def finfo(dtype): + if dtype == paddle.float32: + return np.finfo(np.float32) + if dtype == paddle.float16: + return np.finfo(np.float16) + if dtype == paddle.float64: + return np.finfo(np.float64) + + +class MT5LayerNorm(nn.Layer): + """ + Construct a layernorm module in the MT5 style No bias and no subtraction of mean. + """ + + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + # layer norm should always be calculated in float32 + variance = paddle.pow(hidden_states.astype(paddle.float32), 2).mean(axis=-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + + # convert into float16 if necessary + if self.weight.dtype == paddle.float16: + hidden_states = hidden_states.astype(paddle.float16) + return self.weight * hidden_states + + +class MT5DenseReluDense(nn.Layer): + """ + Construct a dense-relu-dense module. + """ + + def __init__(self, config: MT5Config): + super().__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias_attr=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_states = self.wi(hidden_states) + hidden_states = F.relu(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class MT5DenseGatedGeluDense(nn.Layer): + """ + Construct a dense-gated_gelu-dense module. 
+ """ + + def __init__(self, config: MT5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias_attr=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class MT5DenseGatedSiluDense(nn.Layer): + """ + Construct a dense-gated_gelu-dense module. + """ + + def __init__(self, config: MT5Config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias_attr=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias_attr=False) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + hidden_silu = F.silu(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_silu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + + +class MT5LayerFF(nn.Layer): + def __init__(self, config: MT5Config): + super().__init__() + if config.feed_forward_proj == "relu": + self.DenseReluDense = MT5DenseReluDense(config) + elif config.feed_forward_proj == "gated-gelu": + self.DenseReluDense = MT5DenseGatedGeluDense(config) + elif config.feed_forward_proj == "gated-silu": + self.DenseReluDense = MT5DenseGatedSiluDense(config) + else: + raise ValueError(f"{config.feed_forward_proj} is not supported. 
Choose between `relu` and `gated-gelu`") + + self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + forwarded_states = self.DenseReluDense(forwarded_states) + hidden_states = hidden_states + self.dropout(forwarded_states) + return hidden_states + + +class MT5Attention(nn.Layer): + def __init__(self, config: MT5Config, has_relative_attention_bias: bool = False): + super().__init__() + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.d_model = config.d_model + self.key_value_proj_dim = config.d_kv + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + self.inner_dim = self.n_heads * self.key_value_proj_dim + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias_attr=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias_attr=False) + + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + + @staticmethod + def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. The relative position is defined as + memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for + small absolute relative_position and larger buckets for larger absolute relative_positions. All relative + positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket. 
+ This should allow for more graceful generalization to longer sequences than the model has been trained on + + Args: + relative_position: an int64 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + + Returns: + a Tensor with the same shape as relative_position, containing int64 values in the range [0, num_buckets) + + """ + relative_buckets = 0 + if bidirectional: + num_buckets //= 2 + relative_buckets += (relative_position > 0).astype(paddle.int64) * num_buckets + relative_position = paddle.abs(relative_position) + else: + relative_position = -paddle.minimum(relative_position, paddle.zeros_like(relative_position)) + # now relative_position is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = relative_position < max_exact + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + relative_postion_if_large = max_exact + ( + paddle.log(relative_position.astype(paddle.get_default_dtype()) / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).astype(paddle.int64) + relative_postion_if_large = paddle.minimum( + relative_postion_if_large, + paddle.full_like(relative_postion_if_large, num_buckets - 1), + ) + + relative_buckets += paddle.where(is_small, relative_position, relative_postion_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length): + """Compute binned relative position bias""" + context_position = paddle.arange(query_length).unsqueeze(-1) + memory_position = paddle.arange(key_length).unsqueeze(0) + relative_position = memory_position - context_position # shape (query_length, key_length) + relative_position_bucket = self._relative_position_bucket( + relative_position, # shape (query_length, key_length) + bidirectional=(not self.is_decoder), + num_buckets=self.relative_attention_num_buckets, + ) + values = self.relative_attention_bias(relative_position_bucket) # shape (query_length, key_length, num_heads) + values = values.transpose(perm=[2, 0, 1]).unsqueeze(0) # shape (1, num_heads, query_length, key_length) + return values + + def forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + cache=None, + query_length=None, + use_cache=False, + output_attentions=False, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). + """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, key_length) (non-causal) or (batch_size, key_length, key_length) + # cache[0] is (batch_size, n_heads, q_len - 1, dim_per_head) + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if cache is not None: + assert len(cache) == 2, f"cache should have 2 past states: keys and values. 
Got { len(cache)} past states" + real_seq_length += cache[0].shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.reshape(shape=[batch_size, -1, self.n_heads, self.key_value_proj_dim]).transpose( + perm=[0, 2, 1, 3] + ) + + def unshape(states): + """reshape""" + return states.transpose(perm=[0, 2, 1, 3]).reshape(shape=[batch_size, -1, self.inner_dim]) + + def project(hidden_states, proj_layer, key_value_states, cache): + """projects hidden states correctly to key/query states""" + if key_value_states is None: + # self-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif cache is None: + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + + if cache is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = paddle.concat([cache, hidden_states], axis=2) + else: + # cross-attn + hidden_states = cache + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, + self.k, + key_value_states, + cache[0] if cache is not None else None, + ) + value_states = project( + hidden_states, + self.v, + key_value_states, + cache[1] if cache is not None else None, + ) + + # compute scores + scores = paddle.matmul(query_states, key_states, transpose_y=True) + + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = paddle.zeros( + shape=(1, self.n_heads, real_seq_length, key_length), + dtype=scores.dtype, + ) + if self.training and self.enable_recompute: + position_bias.stop_gradient = False + else: + position_bias = self.compute_bias(real_seq_length, key_length) + + # if key and values are already calculated + # we want only the last query position bias + if cache is not None: + position_bias = position_bias[:, :, -hidden_states.shape[1] :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + scores += position_bias + attn_weights = F.softmax(scores.astype(paddle.float32), axis=-1).astype( + scores.dtype + ) # (batch_size, n_heads, seq_length, key_length) + attn_weights = F.dropout( + attn_weights, p=self.dropout, training=self.training + ) # (batch_size, n_heads, seq_length, key_length) + + attn_output = unshape(paddle.matmul(attn_weights, value_states)) # (batch_size, seq_length, dim) + + attn_output = self.o(attn_output) + + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + + if output_attentions: + outputs = outputs + (attn_weights,) + return outputs + + +class MT5LayerSelfAttention(nn.Layer): + def __init__(self, config: MT5Config, has_relative_attention_bias: bool = False): + super().__init__() + self.SelfAttention = MT5Attention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + cache=None, + use_cache=False, + output_attentions=False, + ): + normed_hidden_states = 
self.layer_norm(hidden_states) + attention_output = self.SelfAttention( + normed_hidden_states, + mask=attention_mask, + position_bias=position_bias, + cache=cache, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + self.dropout(attention_output[0]) + outputs = (hidden_states,) + attention_output[1:] # add attentions if we output them + return outputs + + +class MT5LayerCrossAttention(nn.Layer): + def __init__(self, config: MT5Config): + super().__init__() + self.EncDecAttention = MT5Attention(config, has_relative_attention_bias=False) + self.layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + def forward( + self, + hidden_states, + key_value_states, + attention_mask=None, + position_bias=None, + cache=None, + use_cache=False, + query_length=None, + output_attentions=False, + ): + normed_hidden_states = self.layer_norm(hidden_states) + + attention_output = self.EncDecAttention( + normed_hidden_states, + mask=attention_mask, + key_value_states=key_value_states, + position_bias=position_bias, + cache=cache, + use_cache=use_cache, + query_length=query_length, + output_attentions=output_attentions, + ) + layer_output = hidden_states + self.dropout(attention_output[0]) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class MT5Block(nn.Layer): + def __init__(self, config: MT5Config, has_relative_attention_bias: bool = False): + super().__init__() + self.is_decoder = config.is_decoder + self.layer = nn.LayerList() + self.layer.append(MT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + if self.is_decoder: + self.layer.append(MT5LayerCrossAttention(config)) + + self.layer.append(MT5LayerFF(config)) + + def forward( + self, + hidden_states, + attention_mask=None, + position_bias=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + encoder_decoder_position_bias=None, + cache=None, + use_cache=False, + output_attentions=False, + ): + + if cache is not None: + assert self.is_decoder, "Only decoder can use `caches`" + expected_num_caches = 2 if encoder_hidden_states is None else 4 + + if len(cache) != expected_num_caches: + raise ValueError( + f"There should be {expected_num_caches} past states. " + f"{'2 (past / key) for cross attention. ' if expected_num_caches == 4 else ''}" + f"Got {len(cache)} past key / value states" + ) + + self_attn_cache = cache[:2] + cross_attn_cache = cache[2:] + else: + self_attn_cache, cross_attn_cache = None, None + + self_attention_outputs = self.layer[0]( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + cache=self_attn_cache, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states, present_key_value_state = self_attention_outputs[:2] + + attention_outputs = self_attention_outputs[2:] # Keep self-attention outputs and relative position weights + + # clamp inf values to enable fp16 training + if hidden_states.dtype == paddle.float16 and paddle.isinf(hidden_states).any(): + # TODO finfo + clamp_value = finfo(hidden_states.dtype).max - 1000 + hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value) + + do_cross_attention = self.is_decoder and encoder_hidden_states is not None + if do_cross_attention: + # the actual query length is unknown for cross attention + # if using past key value states. 
Need to inject it here + if present_key_value_state is not None: + query_length = present_key_value_state[0].shape[2] + else: + query_length = None + + cross_attention_outputs = self.layer[1]( + hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + cache=cross_attn_cache, + query_length=query_length, + use_cache=use_cache, + output_attentions=output_attentions, + ) + hidden_states = cross_attention_outputs[0] + + # clamp inf values to enable fp16 training + if hidden_states.dtype == paddle.float16 and paddle.isinf(hidden_states).any(): + clamp_value = finfo(hidden_states.dtype).max - 1000 + hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value) + + # Combine self attn and cross attn key value states + if present_key_value_state is not None: + present_key_value_state = present_key_value_state + cross_attention_outputs[1] + + # Keep cross-attention outputs and relative position weights + attention_outputs = attention_outputs + cross_attention_outputs[2:] + + # Apply Feed Forward layer + hidden_states = self.layer[-1](hidden_states) + + # clamp inf values to enable fp16 training + if hidden_states.dtype == paddle.float16 and paddle.isinf(hidden_states).any(): + clamp_value = finfo(hidden_states.dtype).max - 1000 + hidden_states = paddle.clip(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if use_cache: + outputs = outputs + (present_key_value_state,) + attention_outputs + else: + outputs = outputs + attention_outputs + + return outputs # hidden-states, present_key_value_states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + + +class MT5PretrainedModel(PretrainedModel): + """ + An abstract class for pretrained MT5 models. It provides MT5 related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. 
+ """ + + base_model_prefix = "mt5" + config_class = MT5Config + + pretrained_init_configuration = MT5_PRETRAINED_INIT_CONFIGURATION + + # support AutoConverter after fix load_torch function + @classmethod + def _get_name_mappings(cls, config: MT5Config) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + "shared.weight", + "encoder.embed_tokens.weight", + "encoder.final_layer_norm.weight", + "decoder.embed_tokens.weight", + "decoder.final_layer_norm.weight", + "encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight", + ] + for layer_index in range(config.num_hidden_layers): + for att_head in ["q", "k", "v", "o"]: + model_mappings.extend( + [ + [ + f"encoder.block.{layer_index}.layer.0.SelfAttention.{att_head}.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.0.SelfAttention.{att_head}.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.1.EncDecAttention.{att_head}.weight", + None, + "transpose", + ], + ] + ) + + layer_mappings = [ + [ + f"encoder.block.{layer_index}.layer.1.DenseReluDense.wo.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.2.DenseReluDense.wo.weight", + None, + "transpose", + ], + f"encoder.block.{layer_index}.layer.0.layer_norm.weight", + f"encoder.block.{layer_index}.layer.1.layer_norm.weight", + f"decoder.block.{layer_index}.layer.0.layer_norm.weight", + f"decoder.block.{layer_index}.layer.1.layer_norm.weight", + f"decoder.block.{layer_index}.layer.2.layer_norm.weight", + ] + + if config.feed_forward_proj == "relu": + layer_mappings.extend( + [ + [ + f"encoder.block.{layer_index}.layer.1.DenseReluDense.wi.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.2.DenseReluDense.wi.weight", + None, + "transpose", + ], + ] + ) + elif config.feed_forward_proj == "gated-gelu": + for i in range(2): + layer_mappings.extend( + [ + [ + f"encoder.block.{layer_index}.layer.1.DenseReluDense.wi_{i}.weight", + None, + "transpose", + ], + [ + f"decoder.block.{layer_index}.layer.2.DenseReluDense.wi_{i}.weight", + None, + "transpose", + ], + ] + ) + + model_mappings.extend(layer_mappings) + + init_name_mappings(model_mappings) + + if cls.__name__ != "MT5Model": + for mapping in model_mappings: + mapping[1] = "mt5." 
+ mapping[1] + + if config.architectures is not None and "MT5ForConditionalGeneration" in config.architectures: + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping) for mapping in model_mappings] + return mappings + + @property + def dummy_inputs(self): + DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] + input_ids = paddle.assign(np.asarray(DUMMY_INPUTS, dtype="int64")) + input_mask = paddle.assign(np.asarray(DUMMY_MASK, dtype="int64")) + dummy_inputs = { + "decoder_input_ids": input_ids, + "input_ids": input_ids, + "decoder_attention_mask": input_mask, + } + return dummy_inputs + + def _init_weights(self, layer): + """Initialize the weights""" + # Used for testing weights initialization + factor = self.config.initializer_factor + d_model = self.config.d_model + d_ff = self.config.d_ff + n_heads = self.config.num_heads + key_value_proj_dim = self.config.d_kv + + if isinstance(layer, MT5LayerNorm): + layer.weight.set_value(paddle.ones_like(layer.weight) * factor) + elif isinstance(layer, MT5Model): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + layer.shared.weight.set_value(paddle.normal(mean=0.0, std=factor * 1.0, shape=layer.shared.weight.shape)) + elif isinstance(layer, (MT5ForConditionalGeneration,)): + layer.mt5.shared.weight.set_value( + paddle.normal(mean=0.0, std=factor * 1.0, shape=layer.mt5.shared.weight.shape) + ) + + elif isinstance(layer, MT5DenseReluDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + layer.wi.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.wi.weight.shape) + ) + + if hasattr(layer.wi, "bias") and layer.wi.bias is not None: + layer.wi.bias.set_value(paddle.zeros_like(layer.wi.bias)) + + layer.wo.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_ff) ** -0.5), shape=layer.wo.weight.shape) + ) + + if hasattr(layer.wo, "bias") and layer.wo.bias is not None: + layer.wo.bias.set_value(paddle.zeros_like(layer.wo.bias)) + + elif isinstance(layer, MT5DenseGatedGeluDense): + layer.wi_0.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.wi_0.weight.shape) + ) + if hasattr(layer.wi_0, "bias") and layer.wi_0.bias is not None: + layer.wi_0.bias.set_value(paddle.zeros_like(layer.wi_0.bias)) + + layer.wi_1.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.wi_1.weight.shape) + ) + if hasattr(layer.wi_1, "bias") and layer.wi_1.bias is not None: + layer.wi_1.bias.set_value(paddle.zeros_like(layer.wi_1.bias)) + + layer.wo.weight.set_value( + paddle.normal(mean=0.0, std=factor * ((d_ff) ** -0.5), shape=layer.wo.weight.shape) + ) + + if hasattr(layer.wo, "bias") and layer.wo.bias is not None: + layer.wo.bias.set_value(paddle.zeros_like(layer.wo.bias)) + elif isinstance(layer, MT5Attention): + # Mesh TensorFlow attention initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + + layer.q.weight.set_value( + paddle.normal( + mean=0.0, 
std=factor * ((d_model * key_value_proj_dim) ** -0.5), shape=layer.q.weight.shape + ) + ) + + layer.k.weight.set_value( + paddle.normal(mean=0.0, std=factor * (d_model**-0.5), shape=layer.k.weight.shape) + ) + + layer.v.weight.set_value( + paddle.normal(mean=0.0, std=factor * (d_model**-0.5), shape=layer.v.weight.shape) + ) + + layer.o.weight.set_value( + paddle.normal( + mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5), shape=layer.o.weight.shape + ) + ) + + if layer.has_relative_attention_bias: + layer.relative_attention_bias.weight.set_value( + paddle.normal( + mean=0.0, std=factor * ((d_model) ** -0.5), shape=layer.relative_attention_bias.weight.shape + ) + ) + + def _shift_right(self, input_ids): + bos_token_id = self.config.bos_token_id + pad_token_id = self.config.pad_token_id + + assert ( + bos_token_id is not None + ), "bos_token_id has to be defined. In MT5 it is usually set to the pad_token_id. See MT5 docs for more information" + + # shift inputs to the right + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = bos_token_id + + assert pad_token_id is not None, "pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = paddle.where( + shifted_input_ids == -100, + paddle.assign(np.asarray(pad_token_id, dtype=data_type_converter(shifted_input_ids)).reshape([1])), + shifted_input_ids, + ) + + assert paddle.all(shifted_input_ids >= 0), "Verify that `shifted_input_ids` has only positive values" + + return shifted_input_ids + + +class MT5Stack(nn.Layer): + def __init__(self, config: MT5Config, embed_tokens: Optional[nn.Embedding] = None): + super().__init__() + self.is_decoder = config.is_decoder + self.embed_tokens = embed_tokens + self.block = nn.LayerList( + [MT5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)] + ) + self.final_layer_norm = MT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, new_embeddings): + self.embed_tokens = new_embeddings + + @property + def dtype(self): + return self.embed_tokens.weight.dtype + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + use_cache, + output_attentions, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return tuple(module(*inputs, use_cache, output_attentions)) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + None, + ) + + return layer_outputs + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + inputs_embeds=None, + cache=None, + use_cache=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + **model_kwargs + ): + + if input_ids is not None and inputs_embeds is not None: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError( + f"You cannot specify both 
{err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.shape + # input_ids = input_ids.reshape(shape=[-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + err_msg_prefix = "decoder_" if self.is_decoder else "" + raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds") + + if inputs_embeds is None: + assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings" + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + + # required mask seq length can be calculated via length of past + mask_seq_length = cache[0][0].shape[2] + seq_length if cache is not None else seq_length + + if use_cache is True: + assert self.is_decoder, f"`use_cache` can only be set to `True` if {self.__class__} is used as a decoder" + + if attention_mask is None: + attention_mask = paddle.ones(shape=[batch_size, mask_seq_length]) + if self.is_decoder and encoder_attention_mask is None and encoder_hidden_states is not None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = paddle.ones([batch_size, encoder_seq_length], dtype=paddle.int64) + + # initialize caches with `None` if past does not exist + if cache is None: + cache = [None] * len(self.block) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(shape=encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + present_key_value_states = () if use_cache else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and self.is_decoder) else None + position_bias = None + encoder_decoder_position_bias = None + + hidden_states = self.dropout(inputs_embeds) + + for i, (layer_module, past_key_value) in enumerate(zip(self.block, cache)): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.enable_recompute and self.training: + if use_cache: + logger.warning("`use_cache=True` is incompatible with Recompute. 
Setting " "`use_cache=False`...") + use_cache = False + + layer_outputs = self.recompute_training( + layer_module, + hidden_states, + extended_attention_mask, + position_bias, + encoder_hidden_states, + encoder_extended_attention_mask, + encoder_decoder_position_bias, + use_cache, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + cache=past_key_value, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + # layer_outputs is a tuple with: + # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights) + if not use_cache: + layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:] + + hidden_states, present_key_value_state = layer_outputs[:2] + + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights), + # (cross-attention position bias), (cross-attention weights) + position_bias = layer_outputs[2] + if self.is_decoder and encoder_hidden_states is not None: + encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3] + # append next layer key value states + if use_cache: + present_key_value_states = present_key_value_states + (present_key_value_state,) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[3],) + if self.is_decoder: + all_cross_attentions = all_cross_attentions + (layer_outputs[5],) + + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + present_key_value_states, + all_hidden_states, + all_attentions, + all_cross_attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_value_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def get_extended_attention_mask(self, attention_mask, input_shape): + if attention_mask.ndim == 3: + extended_attention_mask = attention_mask.unsqueeze(1) + elif attention_mask.ndim == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.is_decoder: + batch_size, seq_length = input_shape + seq_ids = paddle.arange(seq_length) + causal_mask = paddle.tile( + seq_ids.unsqueeze(axis=[0, 1]), [batch_size, seq_length, 1] + ) <= seq_ids.unsqueeze(axis=[0, 2]) + causal_mask = causal_mask.astype(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = paddle.concat( + [ + paddle.ones( + [batch_size, seq_length, prefix_seq_len], + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask.unsqueeze(1) * attention_mask.unsqueeze([1, 2]) + else: + 
extended_attention_mask = attention_mask.unsqueeze([1, 2]) + elif attention_mask.ndim == 4: + if self.is_decoder: + batch_size, seq_length = input_shape + seq_ids = paddle.arange(seq_length) + causal_mask = paddle.tile( + seq_ids.unsqueeze(axis=[0, 1]), [batch_size, seq_length, 1] + ) <= seq_ids.unsqueeze(axis=[0, 2]) + # in case cache are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type + causal_mask = causal_mask.astype(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[-1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = paddle.concat( + [ + paddle.ones( + [batch_size, seq_length, prefix_seq_len], + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask.unsqueeze(1) * attention_mask + else: + extended_attention_mask = attention_mask + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + extended_attention_mask = extended_attention_mask.astype(self.dtype) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask): + if encoder_attention_mask.ndim == 4: + encoder_extended_attention_mask = encoder_attention_mask + elif encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask.unsqueeze(1) + elif encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask.unsqueeze([1, 2]) + encoder_extended_attention_mask = encoder_extended_attention_mask.astype(self.dtype) # fp16 compatibility + + if self.dtype == paddle.float16: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + elif self.dtype == paddle.float32: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + else: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + # raise ValueError( + # f"{self.dtype} not recognized. `dtype` should be set to either `paddle.float32` or `paddle.float16`" + # ) + + return encoder_extended_attention_mask + + +@register_base_model +class MT5Model(MT5PretrainedModel): + """ + The bare MT5 Model transformer outputting raw hidden-states without any specific head on top. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (class:`MT5Config`): + Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
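+
+    Example:
+        A minimal sketch of building the model from a configuration rather than from pretrained
+        weights (parameters are randomly initialized; the default :class:`MT5Config` values are
+        assumed here):
+
+        .. code-block::
+
+            from paddlenlp.transformers import MT5Config, MT5Model
+
+            # Initializing a configuration with default values
+            configuration = MT5Config()
+
+            # Initializing a model (with random weights) from that configuration
+            model = MT5Model(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config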
+ """ + + def __init__(self, config: MT5Config): + super().__init__(config) + self.bos_token_id = config.bos_token_id + self.pad_token_id = config.pad_token_id + self.initializer_factor = config.initializer_factor + self.d_model = config.d_model + self.num_heads = config.num_heads + self.d_kv = config.d_kv + self.d_ff = config.d_ff + self.tie_word_embeddings = config.tie_word_embeddings + self.shared = nn.Embedding(config.vocab_size, config.d_model) + encoder_config = copy.deepcopy(config) + encoder_config.is_decoder = False + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.encoder = MT5Stack(encoder_config, self.shared) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + decoder_config.is_encoder_decoder = False + decoder_config.num_layers = config.num_decoder_layers + self.decoder = MT5Stack(decoder_config, self.shared) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + self.decoder.set_input_embeddings(new_embeddings) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + cache=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The MT5Model forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on + to some unwanted positions, usually the paddings or the subsequent positions. + Its data type can be int, float. + When the data type is int, the `masked` tokens have `0` values and the others + have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the + others have `1.0` values. + It is a tensor with shape broadcasted to [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + encoder_output (tuple, optional): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is [batch_size, sequence_length, hidden_size]. + `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. 
+ For all element in the tuple, its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is [batch_size, num_attention_heads, sequence_length, sequence_length]. + cache (Tuple[Tuple[Tensor]], optional): + Contains pre-computed hidden-states (key and values in the attention blocks) + as computed by the model. Can be used to speed up sequential decoding. + The `input_ids` which have their past given to this model should not be + passed as input ids as they have already been computed. + Defaults to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor, optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `cache` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). + This is useful if you want more control over how to convert `decoder_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Default to None. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (bool, optional): + Whether or not to use cache. If set to `True`, `past_buckets_states` states are returned + and can be used to speed up decoding. + Defaults to `False`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the output of all hidden layers. + Defaults to `False`. + return_dict (bool, optional): + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqModelOutput`. + + tuple: Returns tuple (`last_hidden_state`, `cache`, `decoder_hidden_states`, `decoder_attentions`, + `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) + + With the fields: + + - `last_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the decoder of the model. + It's data type should be float32 and + its shape is [batch_size, sequence_length, hidden_size]. + + - `cache` (List[tuple(Tensor, Tensor)], optional): + returned when `use_cache=True` is passed. 
+ List of `tuple(Tensor, Tensor)` of length `config["num_layers"]`, + with the first element being the previous `buckets` of shape + `[batch_size, num_heads, num_hashes, sequence_length]` and the second + being the previous `hidden_states` of shape `[batch_size, sequence_length, hidden_size]`. + + - `decoder_hidden_states` (tuple(Tensor), optional) + returned when ``output_hidden_states=True`` is passed. + Tuple of `Tensor` (one for the output of the embeddings + one for the output of decoder each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + - `decoder_attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data + type of float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + - `cross_attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data + type of float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + - `encoder_last_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the encoder of the model. + It's data type should be float32 and + its shape is [batch_size, sequence_length, hidden_size]. + + - `encoder_hidden_states` (tuple(Tensor), optional): + returned when `output_hidden_states=True` is passed. + tuple of `Tensor` (one for the output of the embeddings + one for the + output of encoder each layer). Each Tensor has a data type of float32 + and its shape is [batch_size, sequence_length, hidden_size]. + + - `encoder_attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data + type of float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import MT5Model, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained('mt5-base') + model = MT5Model.from_pretrained('mt5-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + input_ids = paddle.to_tensor([inputs["input_ids"]], dtype="int64") + decoder_inputs = tokenizer("It means you can") + decoder_input_ids = paddle.to_tensor([decoder_inputs["input_ids"]], dtype="int64") + + outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) + last_hidden_state = outputs[0] + print(last_hidden_state.shape) + # [1, 5, 768] + + """ + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Encode if needed (training, first prediction pass) + if encoder_output is None: + encoder_output = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + elif return_dict and not isinstance(encoder_output, BaseModelOutput): + encoder_output = convert_encoder_output(encoder_output) + hidden_states = encoder_output[0] + + # Decode + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + cache=cache, + encoder_hidden_states=hidden_states, + encoder_attention_mask=attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_output + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + + +class MT5ForConditionalGeneration(MT5PretrainedModel): + """ + The MT5 Model transformer with a language modeling head on top. + + Args: + config (:class:`MT5Config`): + An instance of MT5Config used to construct MT5ForConditionalGeneration. 
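+
+    Example:
+        A minimal generation sketch. The checkpoint name follows the other examples in this
+        file; the call to ``generate`` and its ``(ids, scores)`` return convention come from
+        PaddleNLP's generic generation mixin rather than from this module:
+
+        .. code-block::
+
+            import paddle
+            from paddlenlp.transformers import MT5ForConditionalGeneration, AutoTokenizer
+
+            tokenizer = AutoTokenizer.from_pretrained('mt5-base')
+            model = MT5ForConditionalGeneration.from_pretrained('mt5-base')
+
+            inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+            input_ids = paddle.to_tensor([inputs["input_ids"]], dtype="int64")
+
+            # Decode with the default settings of `generate`
+            generated_ids, scores = model.generate(input_ids=input_ids, max_length=20)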
+ + """ + + def __init__(self, config: MT5Config): + super().__init__(config) + self.mt5 = MT5Model(config) + if not self.mt5.config["tie_word_embeddings"]: + self.lm_head = nn.Linear(self.mt5.config["d_model"], self.mt5.config["vocab_size"], bias_attr=False) + + def get_input_embeddings(self): + return self.mt5.shared + + def set_input_embeddings(self, new_embeddings): + self.mt5.shared = new_embeddings + self.mt5.encoder.set_input_embeddings(new_embeddings) + self.mt5.decoder.set_input_embeddings(new_embeddings) + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_output_embeddings(self): + if self.mt5.config["tie_word_embeddings"]: + return self.mt5.shared + else: + return self.lm_head + + def get_encoder(self): + return self.mt5.encoder + + def get_decoder(self): + return self.mt5.decoder + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + cache=None, + labels=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + + Args: + input_ids (Tensor, optional): + See :class:`MT5Model`. + attention_mask (Tensor, optional): + See :class:`MT5Model`. + decoder_input_ids (Tensor, optional): + See :class:`MT5Model`. + decoder_attention_mask (Tensor, optional): + See :class:`MT5Model`. + encoder_output (tuple(Tensor), optional): + See :class:`MT5Model`. + cache (List[tuple(Tensor, Tensor)], optional): + See :class:`MT5Model`. + labels (Tensor, optional): + Labels for language modeling. Note that the labels **are shifted** + inside the model, i.e. you can set `labels = input_ids` Indices are + selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are + ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]`. + Shape is [batch_size, sequence_length] and dtype is int64. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + decoder_inputs_embeds (Tensor , optional): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation of shape `(batch_size, target_sequence_length, hidden_size)`. If `past_key_values` is used, + optionally only the last `decoder_inputs_embeds` have to be input (see `past_key_values`). This is useful + if you want more control over how to convert `decoder_input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. Default to None. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (bool, optional): + See :class:`MT5Model`. + output_attentions (bool, optional): + See :class:`MT5Model`. + output_hidden_states (bool, optional): + See :class:`MT5Model`. + return_dict (bool, optional): + Whether or not to return a class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput` if `return_dict=True`. 
+ Otherwise it returns a tuple of tensors corresponding to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.Seq2SeqLMOutput`. + + tuple: Returns tuple (`loss`, `logits`, `cache`, `decoder_hidden_states`, `decoder_attentions`, + `cross_attentions`, `encoder_last_hidden_state`, `encoder_hidden_states`, `encoder_attentions`) + + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Language modeling loss. It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Prediction scores of the language modeling head + (scores for each vocabulary token before SoftMax). + It's data type should be float32 and its shape is + [batch_size, sequence_length, vocab_size]. + + - `cache` (List[tuple(Tensor, Tensor)], optional): + See :class:`MT5Model`. + + - `decoder_hidden_states` (tuple(Tensor), optional) + See :class:`MT5Model`. + + - `decoder_attentions` (tuple(Tensor), optional): + See :class:`MT5Model`. + + - `cross_attentions` (tuple(Tensor), optional): + See :class:`MT5Model`. + + - `encoder_last_hidden_state` (Tensor): + See :class:`MT5Model`. + + - `encoder_hidden_states` (tuple(Tensor), optional): + See :class:`MT5Model`. + + - `encoder_attentions` (tuple(Tensor), optional): + See :class:`MT5Model`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import MT5ForConditionalGeneration, AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained('mt5-base') + model = MT5ForConditionalGeneration.from_pretrained('mt5-base') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=inputs["input_ids"]) + + loss = output[0] + logits = output[1] + + """ + + input_type = type(decoder_input_ids) if decoder_input_ids is not None else type(decoder_inputs_embeds) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Encode if needed (training, first prediction pass) + if encoder_output is None: + # Convert encoder inputs in embeddings if needed + encoder_output = self.mt5.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + if isinstance(encoder_output, input_type): + encoder_output = (encoder_output,) + if return_dict and not isinstance(encoder_output, BaseModelOutput): + encoder_output = convert_encoder_output(encoder_output) + + hidden_states = encoder_output[0] + + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + + # If decoding with past key value states, only the last tokens + # should be given as an input + if cache is not None: + assert labels is None, "Decoder should not use cached key value states when training." 
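+            # Incremental decoding: when past key/value states are supplied via `cache`, only the
+            # most recent decoder token is fed below, and 3D/4D attention masks are sliced down to
+            # their last query position before being handed to the decoder.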
+ if decoder_input_ids is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + encoder_attention_mask = attention_mask + if attention_mask is not None: + if attention_mask.ndim == 4: + encoder_attention_mask = attention_mask[:, :, -1:, :] + elif attention_mask.ndim == 3: + encoder_attention_mask = attention_mask[:, -1:, :].unsqueeze([1]) + elif attention_mask.ndim == 2: + encoder_attention_mask = attention_mask.unsqueeze([1, 2]) + else: + raise ValueError("Invalid attention mask shape. ") + + # Decode + decoder_outputs = self.mt5.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + inputs_embeds=decoder_inputs_embeds, + cache=cache, + encoder_hidden_states=hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = decoder_outputs[0] + + if self.mt5.config["tie_word_embeddings"]: + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.mt5.config["d_model"] ** -0.5) + lm_logits = paddle.matmul(sequence_output, self.mt5.shared.weight, transpose_y=True) + else: + lm_logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss(ignore_index=-100) + loss = loss_fct(lm_logits.reshape(shape=[-1, lm_logits.shape[-1]]).astype("float32"), labels.flatten()) + + if not return_dict: + output = (lm_logits,) + decoder_outputs[1:] + encoder_output + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=lm_logits, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_output.last_hidden_state, + encoder_hidden_states=encoder_output.hidden_states, + encoder_attentions=encoder_output.attentions, + ) + + @staticmethod + def prepare_input_ids_for_generation(bos_token_id, encoder_output=None): + batch_size = 1 + if bos_token_id is None: + raise ValueError("`bos_token_id` should be defined when no " "`input_ids` are provided.") + if encoder_output is not None: + if isinstance(encoder_output, tuple): + encoder_output = encoder_output[0] + batch_size = encoder_output.shape[0] + return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id + + def prepare_inputs_for_generation( + self, input_ids, cache=None, attention_mask=None, use_cache=None, encoder_output=None, **kwargs + ): + + # cut decoder_input_ids if past is used + if cache is not None: + input_ids = input_ids[:, -1:] + + return { + "decoder_input_ids": input_ids, + "cache": cache, + "encoder_output": encoder_output, + "attention_mask": attention_mask, + "use_cache": use_cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels: paddle.Tensor): + return self._shift_right(labels) + + @staticmethod + def expand_inputs_for_generation(input_ids, expand_size, attention_mask=None, **model_kwargs): + index = paddle.tile(paddle.arange(input_ids.shape[0]).unsqueeze(-1), [1, expand_size]).reshape([-1]) + + input_ids = paddle.index_select(input_ids, index) + + if attention_mask is not None: + model_kwargs["attention_mask"] = paddle.index_select(attention_mask, index) + + if "token_type_ids" in 
model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = paddle.index_select(token_type_ids, index) + + if "position_ids" in model_kwargs: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.index_select(position_ids, index) + + if "seq_len" in model_kwargs: + seq_len = model_kwargs["seq_len"] + model_kwargs["seq_len"] = paddle.index_select(seq_len, index) + + if "encoder_output" in model_kwargs: + encoder_output = model_kwargs["encoder_output"] + if isinstance(encoder_output, tuple): + model_kwargs["encoder_output"] = (paddle.index_select(encoder_output[0], index),) + encoder_output[1:] + else: + model_kwargs["encoder_output"] = paddle.index_select(encoder_output, index) + return input_ids, model_kwargs + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + return attention_mask + else: + attention_mask = paddle.ones_like(input_ids) + return attention_mask + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +class MT5EncoderModel(MT5PretrainedModel): + base_model_class = None + + def __init__(self, config: MT5Config): + super().__init__(config) + + encoder_config = copy.deepcopy(config) + encoder_config.use_cache = False + encoder_config.is_encoder_decoder = False + self.shared = nn.Embedding(encoder_config.vocab_size, encoder_config.d_model) + self.encoder = MT5Stack(encoder_config, embed_tokens=self.shared) + + def get_input_embeddings(self) -> nn.Embedding: + return self.shared + + def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None: + self.shared = new_embeddings + self.encoder.set_input_embeddings(new_embeddings) + + def get_encoder(self) -> MT5Stack: + return self.encoder + + def forward( + self, + input_ids: Tensor = None, + attention_mask: Optional[Tensor] = None, + encoder_hidden_states: Optional[Tuple[Tensor]] = None, + encoder_attention_mask: Optional[Tensor] = None, + cache=None, + inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + cache=cache, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return encoder_outputs + + +MT5EncoderModel.base_model_class = MT5EncoderModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/__init__.py new file mode 100644 index 000000000..3bd752713 --- /dev/null +++ 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/configuration.py new file mode 100644 index 000000000..5dc02196c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/configuration.py @@ -0,0 +1,190 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" NeZha model configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["NEZHA_PRETRAINED_INIT_CONFIGURATION", "NeZhaConfig", "NEZHA_PRETRAINED_RESOURCE_FILES_MAP"] + +NEZHA_PRETRAINED_INIT_CONFIGURATION = { + "nezha-base-chinese": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "max_relative_position": 64, + "type_vocab_size": 2, + "initializer_range": 0.02, + "use_relative_position": True, + }, + "nezha-large-chinese": { + "vocab_size": 21128, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "max_relative_position": 64, + "type_vocab_size": 2, + "initializer_range": 0.02, + "use_relative_position": True, + }, + "nezha-base-wwm-chinese": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "max_relative_position": 64, + "type_vocab_size": 2, + "initializer_range": 0.02, + "use_relative_position": True, + }, + "nezha-large-wwm-chinese": { + "vocab_size": 21128, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "max_relative_position": 64, + "type_vocab_size": 2, + "initializer_range": 0.02, + "use_relative_position": True, + }, +} +NEZHA_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "nezha-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-base-chinese.pdparams", + "nezha-large-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-large-chinese.pdparams", + "nezha-base-wwm-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-base-wwm-chinese.pdparams", + "nezha-large-wwm-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-large-wwm-chinese.pdparams", + } +} + + +class NeZhaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`NezhaModel`]. It is used to instantiate an Nezha + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Nezha + [sijunhe/nezha-cn-base](https://huggingface.co/sijunhe/nezha-cn-base) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, optional, defaults to 21128): + Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the + *inputs_ids* passed to the forward method of [`NezhaModel`]. + embedding_size (`int`, optional, defaults to 128): + Dimensionality of vocabulary embeddings. + hidden_size (`int`, optional, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. 
+ num_hidden_layers (`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, optional, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, optional, defaults to 3072): + The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + hidden_dropout_prob (`float`, optional, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, optional, defaults to 2): + The vocabulary size of the *token_type_ids* passed into [`NezhaModel`]. + initializer_range (`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + classifier_dropout (`float`, optional, defaults to 0.1): + The dropout ratio for attached classifiers. + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + Example: + ```python + >>> from paddlenlp.transformers import NeZhaConfig, NeZhaModel + >>> # Initializing an Nezha configuration + >>> configuration = NeZhaConfig() + >>> # Initializing a model (with random weights) from the Nezha-base style configuration model + >>> model = NeZhaModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = NEZHA_PRETRAINED_INIT_CONFIGURATION + model_type = "nezha" + + def __init__( + self, + vocab_size=21128, + embedding_size=128, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + max_relative_position=64, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + classifier_dropout=0.1, + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + use_cache=True, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.max_relative_position = max_relative_position + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/modeling.py new file mode 100644 index 000000000..b97336deb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/modeling.py @@ -0,0 +1,1179 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 Huawei Technologies Co., Ltd. +# Copyright 2018 The Google AI Language Team Authors, The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import math +from dataclasses import dataclass +from typing import Optional, Tuple + +import numpy as np +import paddle +import paddle.nn as nn +from paddle import Tensor + +from paddlenlp.transformers import PretrainedModel, register_base_model + +from ...utils.env import CONFIG_NAME +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + NEZHA_PRETRAINED_INIT_CONFIGURATION, + NEZHA_PRETRAINED_RESOURCE_FILES_MAP, + NeZhaConfig, +) + +__all__ = [ + "NeZhaModel", + "NeZhaPretrainedModel", + "NeZhaForPretraining", + "NeZhaForSequenceClassification", + "NeZhaForTokenClassification", + "NeZhaForQuestionAnswering", + "NeZhaForMultipleChoice", +] + + +class NeZhaAttention(nn.Layer): + def __init__(self, config: NeZhaConfig): + super(NeZhaAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + "heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + self.relative_positions_embeddings = self.generate_relative_positions_embeddings( + length=512, depth=self.attention_head_size, max_relative_position=config.max_relative_position + ) + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + + def generate_relative_positions_embeddings(self, length, depth, max_relative_position=127): + vocab_size = max_relative_position * 2 + 1 + range_vec = paddle.arange(length) + range_mat = paddle.tile(range_vec, repeat_times=[length]).reshape((length, length)) + distance_mat = 
range_mat - paddle.t(range_mat) + distance_mat_clipped = paddle.clip( + distance_mat.astype("float32"), -max_relative_position, max_relative_position + ) + final_mat = distance_mat_clipped + max_relative_position + embeddings_table = np.zeros([vocab_size, depth]) + + for pos in range(vocab_size): + for i in range(depth // 2): + embeddings_table[pos, 2 * i] = np.sin(pos / np.power(10000, 2 * i / depth)) + embeddings_table[pos, 2 * i + 1] = np.cos(pos / np.power(10000, 2 * i / depth)) + + embeddings_table_tensor = paddle.to_tensor(embeddings_table, dtype="float32") + flat_relative_positions_matrix = final_mat.reshape((-1,)) + one_hot_relative_positions_matrix = paddle.nn.functional.one_hot( + flat_relative_positions_matrix.astype("int64"), num_classes=vocab_size + ) + embeddings = paddle.matmul(one_hot_relative_positions_matrix, embeddings_table_tensor) + my_shape = final_mat.shape + my_shape.append(depth) + embeddings = embeddings.reshape(my_shape) + return embeddings + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose((0, 2, 1, 3)) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer.transpose((0, 1, 3, 2))) + batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.shape + + relations_keys = self.relative_positions_embeddings.detach().clone()[:to_seq_length, :to_seq_length, :] + + query_layer_t = query_layer.transpose((2, 0, 1, 3)) + query_layer_r = query_layer_t.reshape( + (from_seq_length, batch_size * num_attention_heads, self.attention_head_size) + ) + key_position_scores = paddle.matmul(query_layer_r, relations_keys.transpose((0, 2, 1))) + key_position_scores_r = key_position_scores.reshape( + (from_seq_length, batch_size, num_attention_heads, from_seq_length) + ) + key_position_scores_r_t = key_position_scores_r.transpose((1, 2, 0, 3)) + attention_scores = attention_scores + key_position_scores_r_t + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
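+        # Value-side relative-position scores are computed further below and added to the
+        # attention output, mirroring the key-side relative-position scores already added to
+        # `attention_scores` above.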
+ attention_probs = self.attention_dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + relations_values = self.relative_positions_embeddings.clone()[:to_seq_length, :to_seq_length, :] + attention_probs_t = attention_probs.transpose((2, 0, 1, 3)) + attentions_probs_r = attention_probs_t.reshape( + (from_seq_length, batch_size * num_attention_heads, to_seq_length) + ) + value_position_scores = paddle.matmul(attentions_probs_r, relations_values) + value_position_scores_r = value_position_scores.reshape( + (from_seq_length, batch_size, num_attention_heads, self.attention_head_size) + ) + value_position_scores_r_t = value_position_scores_r.transpose((1, 2, 0, 3)) + context_layer = context_layer + value_position_scores_r_t + + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + projected_context_layer = self.dense(context_layer) + projected_context_layer_dropout = self.output_dropout(projected_context_layer) + layer_normed_context_layer = self.layer_norm(hidden_states + projected_context_layer_dropout) + + return layer_normed_context_layer, attention_scores + + +class NeZhaLayer(nn.Layer): + def __init__(self, config: NeZhaConfig): + super(NeZhaLayer, self).__init__() + self.seq_len_dim = 1 + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.attention = NeZhaAttention(config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, attention_mask=None): + attention_output, layer_att = self.attention(hidden_states, attention_mask) + + ffn_output = self.ffn(attention_output) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + + ffn_output_dropout = self.dropout(ffn_output) + hidden_states = self.layer_norm(ffn_output_dropout + attention_output) + + return hidden_states, layer_att + + +class NeZhaEncoder(nn.Layer): + def __init__(self, config: NeZhaConfig): + super(NeZhaEncoder, self).__init__() + layer = NeZhaLayer(config) + self.layer = nn.LayerList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask): + all_encoder_layers = [] + all_encoder_att = [] + for i, layer_module in enumerate(self.layer): + all_encoder_layers.append(hidden_states) + hidden_states, layer_att = layer_module(all_encoder_layers[i], attention_mask) + all_encoder_att.append(layer_att) + all_encoder_layers.append(hidden_states) + return all_encoder_layers, all_encoder_att + + +class NeZhaEmbeddings(nn.Layer): + def __init__(self, config: NeZhaConfig): + super(NeZhaEmbeddings, self).__init__() + self.use_relative_position = config.use_relative_position + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + + if not self.use_relative_position: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + inputs_embeds: 
Optional[Tensor] = None, + ): + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids) + + input_shape = inputs_embeds.shape[:-1] + + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + position_ids.stop_gradient = True + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + + embeddings = inputs_embeds + + if not self.use_relative_position: + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings += token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + + return embeddings + + +class NeZhaPooler(nn.Layer): + def __init__(self, config: NeZhaConfig): + super(NeZhaPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class NeZhaPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained NeZha models. It provides NeZha related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + config_class = NeZhaConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "nezha" + + pretrained_init_configuration = NEZHA_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = NEZHA_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class NeZhaModel(NeZhaPretrainedModel): + """ + The bare NeZha Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `DistilBertModel`. Defines the number of different tokens that can + be represented by the `inputs_ids` passed when calling `DistilBertModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and the pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. 
+ num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `16`. + initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`NeZhaPretrainedModel.init_weights()` for how weights are initialized in `NeZhaModel`. + + max_relative_embeddings (int, optional): + The maximum value of the dimensionality of relative encoding, which dictates the maximum supported + relative distance of two sentences. + Defaults to `64`. + layer_norm_eps (float, optional): + The small value added to the variance in `LayerNorm` to prevent division by zero. + Defaults to `1e-12`. + use_relative_position (bool, optional): + Whether or not to use relative position embedding. Defaults to `True`. + + """ + + def __init__(self, config: NeZhaConfig): + super(NeZhaModel, self).__init__(config) + self.initializer_range = config.initializer_range + + self.embeddings = NeZhaEmbeddings(config) + + self.encoder = NeZhaEncoder(config) + + self.pooler = NeZhaPooler(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The NeZhaModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. 
+ attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in NeZha, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import NeZhaModel, NeZhaTokenizer + + tokenizer = NeZhaTokenizer.from_pretrained('nezha-base-chinese') + model = NeZhaModel.from_pretrained('nezha-base-chinese') + + inputs = tokenizer("欢迎使用百度飞浆!", return_tensors='pt') + output = model(**inputs) + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + if attention_mask is None: + attention_mask = paddle.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids) + + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings( + input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + encoder_outputs = self.encoder(embedding_output, extended_attention_mask) + encoder_hidden_outputs, encoder_att_outputs = encoder_outputs + + sequence_output = encoder_hidden_outputs[-1] + pooled_output = self.pooler(sequence_output) + + if not return_dict: + outputs = (sequence_output, pooled_output) + if output_hidden_states: + outputs += (encoder_hidden_outputs,) + if output_attentions: + outputs += (encoder_att_outputs,) + return outputs + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_hidden_outputs if output_hidden_states else None, + attentions=encoder_att_outputs if output_attentions else None, + ) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + +class NeZhaLMPredictionHead(nn.Layer): + def __init__(self, config: NeZhaConfig, embedding_weights=None): + super(NeZhaLMPredictionHead, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + self.decoder_weight = embedding_weights + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + hidden_states = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + + return hidden_states + + +class NeZhaPretrainingHeads(nn.Layer): + """ + Perform language modeling task and next sentence classification task. + + Args: + hidden_size (int): + See :class:`NeZhaModel`. + vocab_size (int): + See :class:`NeZhaModel`. + hidden_act (str): + Activation function used in the language modeling task. + embedding_weights (Tensor, optional): + Decoding weights used to map hidden_states to logits of the masked token prediction. + Its data type should be float32 and its shape is [vocab_size, hidden_size]. + Defaults to `None`, which means use the same weights of the embedding layer. 
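Note that `NeZhaLMPredictionHead` above ties its output projection to the word-embedding matrix instead of allocating a second `[vocab_size, hidden_size]` weight; only the decoder bias is a new parameter. A minimal sketch of that tying (toy shapes, hypothetical names):

```python
# Minimal sketch, assumed shapes, not the patch's code: weight tying in the MLM head.
# Logits are hidden_states @ embedding_weight.T plus a separately learned bias.
import paddle

vocab_size, hidden_size, seq_len = 1000, 64, 8
embedding = paddle.nn.Embedding(vocab_size, hidden_size)   # shared with the input embeddings
bias = paddle.zeros([vocab_size])

hidden_states = paddle.randn([2, seq_len, hidden_size])    # stand-in for the encoder output
logits = paddle.matmul(hidden_states, embedding.weight, transpose_y=True) + bias
print(logits.shape)  # [2, 8, 1000]
```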
+ + """ + + def __init__(self, config: NeZhaConfig, embedding_weights=None): + super(NeZhaPretrainingHeads, self).__init__() + self.predictions = NeZhaLMPredictionHead(config=config, embedding_weights=embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + """ + Args: + sequence_output(Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + pooled_output(Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Returns: + tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``). + + With the fields: + + - `prediction_scores` (Tensor): + The scores of masked token prediction. Its data type should be float32. + If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size]. + Otherwise, its shape is [batch_size, mask_token_num, vocab_size]. + + - `seq_relationship_score` (Tensor): + The scores of next sentence prediction. + Its data type should be float32 and its shape is [batch_size, 2]. + + """ + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +@dataclass +class NeZhaForPreTrainingOutput(ModelOutput): + """ + Output type of [`NeZhaForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + prediction_logits: paddle.Tensor = None + seq_relationship_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +class NeZhaForPretraining(NeZhaPretrainedModel): + """ + NeZha Model with pretraining tasks on top. + + Args: + nezha (:class:`NeZhaModel`): + An instance of :class:`NeZhaModel`. 
+ + """ + + def __init__(self, config: NeZhaConfig): + super(NeZhaForPretraining, self).__init__(config) + self.nezha = NeZhaModel(config) + self.cls = NeZhaPretrainingHeads( + config, + self.nezha.embeddings.word_embeddings.weight, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + masked_lm_labels: Optional[Tensor] = None, + next_sentence_label: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`NeZhaModel`. + token_type_ids (Tensor, optional): + See :class:`NeZhaModel`. + attention_mask (Tensor, optional): + See :class:`NeZhaModel`. + inputs_embeds(Tensor, optional): + See :class:`NeZhaModel`. + masked_lm_labels (Tensor, optional): + The labels of the masked language modeling, its dimensionality is equal to `prediction_scores`. + Its data type should be int64 and its shape is [batch_size, sequence_length, 1]. + next_sentence_label (Tensor, optional): + The labels of the next sentence prediction task, the dimensionality of `next_sentence_labels` + is equal to `seq_relation_labels`. Its data type should be int64 and its shape is [batch_size, 1]. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.nezha.NeZhaForPreTrainingOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.nezha.NeZhaForPreTrainingOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.nezha.NeZhaForPreTrainingOutput`. 
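Before the forward body below: the pretraining loss is simply the sum of a masked-LM cross-entropy (computed with `ignore_index=-1`, so positions that were not masked contribute nothing) and a next-sentence-prediction cross-entropy. A toy sketch of that composition (all shapes and values are made up for illustration):

```python
# Toy sketch, not the patch's code: composing the NeZha pretraining loss.
import numpy as np
import paddle
import paddle.nn as nn

vocab_size, seq_len, batch = 1000, 8, 2
prediction_scores = paddle.randn([batch, seq_len, vocab_size])
seq_relationship_score = paddle.randn([batch, 2])

# Labels are -1 everywhere except the masked positions, which hold the true token id.
labels_np = np.full((batch, seq_len), -1, dtype="int64")
labels_np[:, 3] = 42                                   # pretend position 3 was masked
masked_lm_labels = paddle.to_tensor(labels_np)
next_sentence_label = paddle.to_tensor([0, 1])

loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.reshape((-1, vocab_size)),
                          masked_lm_labels.reshape((-1,)))
next_sentence_loss = loss_fct(seq_relationship_score, next_sentence_label)
total_loss = masked_lm_loss + next_sentence_loss
print(float(total_loss))
```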
+ """ + outputs = self.nezha( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output, pooled_output = outputs[0], outputs[1] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, self.nezha.config.vocab_size)), masked_lm_labels.reshape((-1,)) + ) + next_sentence_loss = loss_fct(seq_relationship_score.reshape((-1, 2)), next_sentence_label.reshape((-1,))) + total_loss = masked_lm_loss + next_sentence_loss + elif masked_lm_labels is not None: + loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, self.nezha.config.vocab_size)), masked_lm_labels.reshape((-1,)) + ) + total_loss = masked_lm_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return NeZhaForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NeZhaForQuestionAnswering(NeZhaPretrainedModel): + """ + NeZha with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`NeZhaConfig`): + An instance of NeZhaConfig used to construct NeZhaForQuestionAnswering. + """ + + def __init__(self, config: NeZhaConfig): + super(NeZhaForQuestionAnswering, self).__init__(config) + self.nezha = NeZhaModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The NeZhaForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`NeZhaModel`. + token_type_ids (Tensor, optional): + See :class:`NeZhaModel`. + attention_mask (Tensor, optional): + See :class:`NeZhaModel`. + inputs_embeds(Tensor, optional): + See :class:`NeZhaModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. 
+ Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import NeZhaForQuestionAnswering + from paddlenlp.transformers import NeZhaTokenizer + + tokenizer = NeZhaTokenizer.from_pretrained('nezha-base-chinese') + model = NeZhaForQuestionAnswering.from_pretrained('nezha-base-chinese') + + inputs = tokenizer("欢迎使用百度飞浆!", return_tensors='pt') + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits =outputs[1] + """ + outputs = self.nezha( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if end_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + output = (start_logits, end_logits) + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NeZhaForSequenceClassification(NeZhaPretrainedModel): + """ + NeZha Model with a linear layer on top of the output layer, designed for + sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`NeZhaConfig`): + An instance of NeZhaConfig used to construct NeZhaForSequenceClassification. 
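The forward method further below picks the loss from `problem_type`: a single label means regression (MSE), integer labels with more than one class mean single-label cross-entropy, and anything else falls back to multi-label BCE-with-logits. A compact sketch of that dispatch (toy tensors, hypothetical helper name):

```python
# Compact sketch, not the patch's code: the problem_type loss dispatch used by the
# sequence-classification head.
import paddle

def classification_loss(logits, labels, num_labels):
    if num_labels == 1:                                  # regression
        return paddle.nn.MSELoss()(logits.squeeze(), labels.squeeze())
    if labels.dtype in (paddle.int64, paddle.int32):     # single-label classification
        return paddle.nn.CrossEntropyLoss()(logits.reshape((-1, num_labels)),
                                            labels.reshape((-1,)))
    return paddle.nn.BCEWithLogitsLoss()(logits, labels) # multi-label classification

logits = paddle.randn([4, 3])                            # batch of 4, 3 classes
labels = paddle.to_tensor([0, 2, 1, 1])
print(float(classification_loss(logits, labels, num_labels=3)))
```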
+ """ + + def __init__(self, config: NeZhaConfig): + super(NeZhaForSequenceClassification, self).__init__(config) + self.nezha = NeZhaModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The NeZhaForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`NeZhaModel`. + token_type_ids (Tensor, optional): + See :class:`NeZhaModel`. + attention_mask (Tensor, optional): + See :class:`NeZhaModel`. + inputs_embeds(Tensor, optional): + See :class:`NeZhaModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import NeZhaForSequenceClassification + from paddlenlp.transformers import NeZhaTokenizer + + tokenizer = NeZhaTokenizer.from_pretrained('nezha-base-chinese') + model = NeZhaForSequenceClassification.from_pretrained('nezha-base-chinese') + + inputs = tokenizer("欢迎使用百度飞浆!", return_tensors='pt') + output = model(**inputs) + + logits = outputs[0] + + """ + outputs = self.nezha( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NeZhaForTokenClassification(NeZhaPretrainedModel): + """ + NeZha Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`NeZhaConfig`): + An instance of NeZhaConfig used to construct NeZhaForSequenceClassification. + """ + + def __init__(self, config: NeZhaConfig): + super(NeZhaForTokenClassification, self).__init__(config) + self.nezha = NeZhaModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The NeZhaForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`NeZhaModel`. + token_type_ids (Tensor, optional): + See :class:`NeZhaModel`. + attention_mask (list, optional): + See :class:`NeZhaModel`. + inputs_embeds (Tensor, optional): + See :class:`NeZhaModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. 
Indices should be in `[0, ..., num_labels - 1]`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import NeZhaForTokenClassification + from paddlenlp.transformers import NeZhaTokenizer + + tokenizer = NeZhaTokenizer.from_pretrained('nezha-base-chinese') + model = NeZhaForTokenClassification.from_pretrained('nezha-base-chinese') + + inputs = tokenizer("欢迎使用百度飞浆!", return_tensors='pt') + output = model(**inputs) + + logits = outputs[0] + """ + outputs = self.nezha( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NeZhaForMultipleChoice(NeZhaPretrainedModel): + """ + NeZha Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`BertConfig`): + An instance of BertConfig used to construct BertForMultipleChoice. + """ + + def __init__(self, config: NeZhaConfig): + super(NeZhaForMultipleChoice, self).__init__(config) + self.nezha = NeZhaModel(config) + self.num_choices = config.num_choices + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The NeZhaForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`NeZhaModel`. + token_type_ids (Tensor, optional): + See :class:`NeZhaModel`. + attention_mask (list, optional): + See :class:`NeZhaModel`. + inputs_embeds (Tensor, optional): + See :class:`NeZhaModel`. 
+ labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the input multiple choice classification logits. + Shape as `[batch_size, num_classes]` and dtype as `float32`. + """ + + # input_ids: [bs, num_choice, seq_l] + if input_ids is not None: + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) + if attention_mask is not None: + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) + if inputs_embeds is not None: + inputs_embeds = inputs_embeds.reshape(shape=(-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + + outputs = self.nezha( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape((-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/tokenizer.py new file mode 100644 index 000000000..3de0b2fee --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nezha/tokenizer.py @@ -0,0 +1,304 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
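Stepping back to `NeZhaForMultipleChoice` above: the choices are folded into the batch dimension so the encoder runs once over `[batch * num_choices, seq_len]`, and the per-choice scores are reshaped back to `[batch, num_choices]` before the cross-entropy. A toy sketch of that reshape (stand-in tensors replace the encoder; names are illustrative):

```python
# Toy sketch, not the patch's code: the flatten/unflatten around the encoder in the
# multiple-choice head.
import paddle

batch, num_choices, seq_len, hidden = 2, 4, 16, 8
input_ids = paddle.randint(0, 100, shape=[batch, num_choices, seq_len])

flat_ids = input_ids.reshape((-1, seq_len))              # [batch * num_choices, seq_len]
pooled = paddle.randn([batch * num_choices, hidden])     # stand-in for the pooled encoder output
classifier = paddle.nn.Linear(hidden, 1)

logits = classifier(pooled)                              # [batch * num_choices, 1]
reshaped_logits = logits.reshape((-1, num_choices))      # [batch, num_choices]
print(reshaped_logits.shape)  # [2, 4]
```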
+ +import os + +from paddlenlp.transformers import ( + BasicTokenizer, + PretrainedTokenizer, + WordpieceTokenizer, +) + +__all__ = ["NeZhaTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "nezha-base-chinese": 512, + "nezha-large-chinese": 512, + "nezha-base-wwm-chinese": 512, + "nezha-large-wwm-chinese": 512, +} + + +class NeZhaTokenizer(PretrainedTokenizer): + """ + Constructs a NeZha tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import NeZhaTokenizer + tokenizer = NeZhaTokenizer.from_pretrained('nezha-base-chinese') + + inputs = tokenizer('欢迎使用百度飞桨!') + print(inputs) + + ''' + {'input_ids': [101, 3614, 6816, 886, 4500, 4636, 2428, 7607, 3444, 8013, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "nezha-base-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-chinese-vocab.txt", + "nezha-base-wwm-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-chinese-vocab.txt", + "nezha-large-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-chinese-vocab.txt", + "nezha-large-wwm-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/nezha/nezha-chinese-vocab.txt", + } + } + pretrained_init_configuration = { + "nezha-base-chinese": {"do_lower_case": False}, + "nezha-base-wwm-chinese": {"do_lower_case": False}, + "nezha-large-chinese": {"do_lower_case": False}, + "nezha-large-wwm-chinese": {"do_lower_case": False}, + } + padding_side = "right" + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = NeZhaTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for NeZha models. + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import NeZhaTokenizer + + tokenizer = NeZhaTokenizer.from_pretrained('nezha-base-chinese') + tokens = tokenizer.tokenize('欢迎使用百度飞桨!') + ''' + ['欢', '迎', '使', '用', '百', '度', '飞', '桨', '!'] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + 欢 迎 使 用 百 度 飞 桨 ! + ''' + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A NeZha sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to `None`. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A NeZha offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. 
+ offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to `None`. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A NeZha sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to `None`. + already_has_special_tokens (bool, optional): + Whether or not the token list is already formatted with special tokens for the model. + Defaults to `False`. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/__init__.py new file mode 100644 index 000000000..ac50b5704 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/configuration.py new file mode 100644 index 000000000..bfe10653d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/configuration.py @@ -0,0 +1,161 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Nystromformer Model Configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "NYSTROMFORMER_PRETRAINED_INIT_CONFIGURATION", + "NYSTROMFORMER_PRETRAINED_RESOURCE_FILES_MAP", + "NystromformerConfig", +] + +NYSTROMFORMER_PRETRAINED_INIT_CONFIGURATION = { + "nystromformer-base-zh": { + "model_type": "nystromformer", + "attention_probs_dropout_prob": 0.1, + "bos_token_id": 1, + "conv_kernel_size": 65, + "eos_token_id": 2, + "hidden_act": "gelu_new", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "inv_coeff_init_option": False, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 4096, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "num_landmarks": 64, + "pad_token_id": 0, + "segment_means_seq_len": 64, + "type_vocab_size": 2, + "vocab_size": 40000, + }, +} + +NYSTROMFORMER_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "nystromformer-base-zh": "https://paddlenlp.bj.bcebos.com/models/transformers/nystromformer/nystromformer_base_zh/model_state.pdparams" + } +} + + +class NystromformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`NystromformerModel`]. It is used to instantiate + an Nystromformer model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Nystromformer + [uw-madison/nystromformer-512](https://huggingface.co/uw-madison/nystromformer-512) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 40000): + Vocabulary size of the Nystromformer model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`NystromformerModel`]. 
+ hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`NystromformerModel`]. + segment_means_seq_len (`int`, *optional*, defaults to 64): + Sequence length used in segment-means. + num_landmarks (`int`, *optional*, defaults to 64): + The number of landmark (or Nystrom) points to use in Nystrom approximation of the softmax self-attention + matrix. + conv_kernel_size (`int`, *optional*, defaults to 65): + The kernel size of depthwise convolution used in Nystrom approximation. + inv_coeff_init_option (`bool`, *optional*, defaults to `False`): + Whether or not to use exact coefficient computation for the initial values for the iterative method of + calculating the Moore-Penrose inverse of a matrix. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. 
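For context on `num_landmarks` and `inv_coeff_init_option`: the Nystrom approximation they parameterize factors the full softmax attention matrix through landmark (segment-mean) queries and keys joined by a pseudo-inverse, so cost scales with the number of landmarks rather than the squared sequence length. A rough NumPy sketch under that reading; the actual layer computes the pseudo-inverse iteratively, and `np.linalg.pinv` is used here only to keep the sketch short:

```python
# Rough sketch, not the patch's code: Nystrom approximation of softmax attention
# through num_landmarks segment-mean landmarks.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

seq_len, head_dim, num_landmarks = 64, 16, 8
Q = np.random.randn(seq_len, head_dim)
K = np.random.randn(seq_len, head_dim)
V = np.random.randn(seq_len, head_dim)

# Landmarks as segment means over the sequence
Q_lm = Q.reshape(num_landmarks, -1, head_dim).mean(axis=1)   # (num_landmarks, head_dim)
K_lm = K.reshape(num_landmarks, -1, head_dim).mean(axis=1)

scale = 1.0 / np.sqrt(head_dim)
kernel_1 = softmax(Q @ K_lm.T * scale)                 # (seq_len, num_landmarks)
kernel_2 = softmax(Q_lm @ K_lm.T * scale)              # (num_landmarks, num_landmarks)
kernel_3 = softmax(Q_lm @ K.T * scale)                 # (num_landmarks, seq_len)

attn_out = kernel_1 @ np.linalg.pinv(kernel_2) @ (kernel_3 @ V)
print(attn_out.shape)  # (64, 16)
```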
+ Example: + ```python + >>> from paddlenlp.transformers import NystromformerModel, NystromformerConfig + >>> # Initializing a Nystromformer uw-madison/nystromformer-512 style configuration + >>> configuration = NystromformerConfig() + >>> # Initializing a model from the uw-madison/nystromformer-512 style configuration + >>> model = NystromformerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = NYSTROMFORMER_PRETRAINED_INIT_CONFIGURATION + model_type = "nystromformer" + + def __init__( + self, + vocab_size=40000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=4096, + type_vocab_size=2, + segment_means_seq_len=64, + num_landmarks=64, + conv_kernel_size=65, + inv_coeff_init_option=False, + initializer_range=0.02, + layer_norm_eps=1e-5, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.segment_means_seq_len = segment_means_seq_len + self.num_landmarks = num_landmarks + self.conv_kernel_size = conv_kernel_size + self.inv_coeff_init_option = inv_coeff_init_option + self.layer_norm_eps = layer_norm_eps diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/modeling.py new file mode 100644 index 000000000..f5f5e7d41 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/modeling.py @@ -0,0 +1,1331 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from typing import Callable, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +from paddle import Tensor +from paddle.distributed.fleet.utils import recompute + +from ...utils.env import CONFIG_NAME +from .. 
import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + NYSTROMFORMER_PRETRAINED_INIT_CONFIGURATION, + NYSTROMFORMER_PRETRAINED_RESOURCE_FILES_MAP, + NystromformerConfig, +) + +__all__ = [ + "NystromformerEmbeddings", + "NystromformerModel", + "NystromformerPretrainedModel", + "NystromformerForSequenceClassification", + "NystromformerForMaskedLM", + "NystromformerForTokenClassification", + "NystromformerForMultipleChoice", + "NystromformerForQuestionAnswering", +] + + +class NystromformerEmbeddings(nn.Layer): + """ + Include embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings + 2, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + 2 + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "token_type_ids", + paddle.zeros(self.position_ids.shape, dtype=paddle.int64), + persistable=False, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + ): + + if input_ids is not None: + input_shape = input_ids.shape + else: + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, + # sloves the issue: https://github.com/huggingface/transformers/issues/5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand((input_shape[0], seq_length)) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class 
NystromformerSelfAttention(nn.Layer): + def __init__(self, config: NystromformerConfig, position_embedding_type: Optional[str] = None): + super(NystromformerSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.num_landmarks = config.num_landmarks + self.seq_len = config.segment_means_seq_len + self.conv_kernel_size = config.conv_kernel_size + + if config.inv_coeff_init_option: + self.init_option = config["inv_init_coeff_option"] + else: + self.init_option = "original" + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + + if self.conv_kernel_size is not None: + self.conv = nn.Conv2D( + in_channels=self.num_attention_heads, + out_channels=self.num_attention_heads, + kernel_size=(self.conv_kernel_size, 1), + padding=(self.conv_kernel_size // 2, 0), + bias_attr=False, + groups=self.num_attention_heads, + ) + + # Function to approximate Moore-Penrose inverse via the iterative method + def iterative_inv(self, mat, n_iter=6): + identity = paddle.eye(mat.shape[-1]) + key = mat + + # The entries of key are positive and ||key||_{\infty} = 1 due to softmax + if self.init_option == "original": + # This original implementation is more conservative to compute coefficient of Z_0. + value = 1 / paddle.max(paddle.sum(key, axis=-2)) * key.transpose([0, 1, 3, 2]) + else: + # TODO make sure this way is OK + # This is the exact coefficient computation, 1 / ||key||_1, of initialization of Z_0, leading to faster convergence. 
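+            # Intended to give Z_0 = key^T / ||key||_1 (the largest column sum), per head.
+            # The loop below (shared by both branches) then applies the cubic update
+            # Z_{k+1} = 0.25 * Z_k * (13*I - (K Z_k) * (15*I - (K Z_k) * (7*I - K Z_k))),
+            # which converges to the Moore-Penrose pseudoinverse of `key`.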
+ value = ( + 1 + / paddle.max(paddle.sum(key, axis=-2), axis=-1).values[:, :, None, None] + * key.transpose([0, 1, 3, 2]) + ) + + for _ in range(n_iter): + key_value = paddle.matmul(key, value) + value = paddle.matmul( + 0.25 * value, + 13 * identity + - paddle.matmul(key_value, 15 * identity - paddle.matmul(key_value, 7 * identity - key_value)), + ) + return value + + def transpose_for_scores(self, layer): + new_layer_shape = layer.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + layer = layer.reshape(new_layer_shape) + return layer.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + output_attentions: Optional[Tensor] = False, + ): + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + query_layer = query_layer / math.sqrt(math.sqrt(self.attention_head_size)) + key_layer = key_layer / math.sqrt(math.sqrt(self.attention_head_size)) + + if self.num_landmarks == self.seq_len: + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in NystromformerModel forward() function) + attention_scores = attention_scores + attention_mask + + attention_probs = nn.functional.softmax(attention_scores, axis=-1) + context_layer = paddle.matmul(attention_probs, value_layer) + else: + q_landmarks = query_layer.reshape( + [ + -1, + self.num_attention_heads, + self.num_landmarks, + self.seq_len // self.num_landmarks, + self.attention_head_size, + ] + ).mean(axis=-2) + k_landmarks = key_layer.reshape( + [ + -1, + self.num_attention_heads, + self.num_landmarks, + self.seq_len // self.num_landmarks, + self.attention_head_size, + ] + ).mean(axis=-2) + + kernel_1 = nn.functional.softmax(paddle.matmul(query_layer, k_landmarks, transpose_y=True), axis=-1) + kernel_2 = nn.functional.softmax(paddle.matmul(q_landmarks, k_landmarks, transpose_y=True), axis=-1) + + attention_scores = paddle.matmul(q_landmarks, key_layer, transpose_y=True) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in NystromformerModel forward() function) + attention_scores = attention_scores + attention_mask + + kernel_3 = nn.functional.softmax(attention_scores, axis=-1) + attention_probs = paddle.matmul(kernel_1, self.iterative_inv(kernel_2)) + new_value_layer = paddle.matmul(kernel_3, value_layer) + context_layer = paddle.matmul(attention_probs, new_value_layer) + + if self.conv_kernel_size is not None: + context_layer += self.conv(value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + +class NystromformerSelfOutput(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: Tensor, input_tensor: Tensor): + hidden_states = self.dense(hidden_states) + hidden_states = 
self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class NystromformerAttention(nn.Layer): + def __init__(self, config: NystromformerConfig, position_embedding_type: Optional[str] = None): + super(NystromformerAttention, self).__init__() + self.self = NystromformerSelfAttention(config, position_embedding_type=position_embedding_type) + self.output = NystromformerSelfOutput(config) + self.pruned_heads = set() + + def forward( + self, hidden_states: Tensor, attention_mask: Optional[Tensor] = None, output_attentions: Optional[bool] = False + ): + self_outputs = self.self(hidden_states, attention_mask, output_attentions) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class NystromformerIntermediate(nn.Layer): + def __init__(self, config): + super(NystromformerIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: Tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class NystromformerOutput(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: Tensor, input_tensor: Tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class NystromformerLayer(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerLayer, self).__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = NystromformerAttention(config) + self.add_cross_attention = config.add_cross_attention + self.intermediate = NystromformerIntermediate(config) + self.output = NystromformerOutput(config) + + def apply_chunking_to_forward( + self, forward_fn: Callable[..., Tensor], chunk_size: int, chunk_dim: int, *input_tensors + ): + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + if chunk_size > 0: + tensor_shape = input_tensors[0].shape[chunk_dim] + for input_tensor in input_tensors: + if input_tensor.shape[chunk_dim] != tensor_shape: + raise ValueError( + f"All input tenors have to be of the same shape: {tensor_shape}, " + f"found shape {input_tensor.shape[chunk_dim]}" + ) + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + num_chunks = 
input_tensors[0].shape[chunk_dim] // chunk_size + input_tensors_chunks = tuple( + input_tensor.chunk(num_chunks, axis=chunk_dim) for input_tensor in input_tensors + ) + output_chunks = tuple( + forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks) + ) + return paddle.concat(output_chunks, axis=chunk_dim) + return forward_fn(*input_tensors) + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + output_attentions: Optional[Tensor] = False, + ): + self_attention_outputs = self.attention(hidden_states, attention_mask, output_attentions=output_attentions) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = self.apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + +class NystromformerEncoder(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerEncoder, self).__init__() + self.config = config + self.layer = nn.LayerList([NystromformerLayer(config) for _ in range(config.num_hidden_layers)]) + # The parameter output_attentions in forward shoule set to be False when self.recompute = True. + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + + def forward( + self, + hidden_states: Tensor, + attention_mask: Optional[Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + if self.enable_recompute and self.training: + + def create_cumtom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions)[0] + + return custom_forward + + layer_outputs = (recompute(create_cumtom_forward(layer_module), hidden_states, attention_mask),) + else: + layer_outputs = layer_module(hidden_states, attention_mask, output_attentions) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class NystromformerPredictionHeadTransform(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states: Tensor): + hidden_states = self.dense(hidden_states) + hidden_states = 
self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class NystromformerLMPredictionHead(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerLMPredictionHead, self).__init__() + self.transform = NystromformerPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + self.bias = paddle.create_parameter(shape=(config.vocab_size,), dtype=self.decoder.weight.dtype) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states: Tensor): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class NystromformerOnlyMLMHead(nn.Layer): + def __init__(self, config: NystromformerConfig): + super(NystromformerOnlyMLMHead, self).__init__() + self.predictions = NystromformerLMPredictionHead(config) + + def forward(self, sequence_output: Tensor): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class NystromformerPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Nystromformer models. It provides Nystromformer related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + config_class = NystromformerConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "nystromformer" + support_recompute = True + + # model init configuration + pretrained_init_configuration = NYSTROMFORMER_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = NYSTROMFORMER_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding, nn.Conv2D)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + def _set_recompute(self, module, value=False): + if isinstance(module, NystromformerEncoder): + module.enable_recompute = value + + +@register_base_model +class NystromformerModel(NystromformerPretrainedModel): + """ + The bare Nystromformer Model outputting raw hidden-states. + Nystromformer is a nystrom-based approximation of transformer which reduce the time complexity to O(n). + See the Nystromformer paper at: https://arxiv.org/pdf/2102.03902v3.pdf + + Ref: + Xiong, Yunyang, et al. "Nyströmformer: A Nystöm-based Algorithm for Approximating Self-Attention." AAAI, 2021. + + Args: + config(NystromformerConfig): + An instance of ErnieConfig used to construct NystromformerModel. 
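+
+    A minimal construction sketch (randomly initialized weights, no pretrained checkpoint
+    required; the reduced ``num_hidden_layers`` is only for illustration):
+
+    .. code-block::
+
+        from paddlenlp.transformers import NystromformerConfig, NystromformerModel
+
+        # Build a small, randomly initialized model directly from a configuration.
+        config = NystromformerConfig(num_hidden_layers=2)
+        model = NystromformerModel(config)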
+ """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerModel, self).__init__(config) + self.embeddings = NystromformerEmbeddings(config) + self.encoder = NystromformerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def get_extended_attention_mask(self, attention_mask, input_shape): + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[Tensor], BaseModelOutputWithPastAndCrossAttentions]: + r""" + The NystromformerModel forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type should be int. The `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape `[batch_size, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + Indices of embedded input sequence. They are representations of tokens that build the input sequence. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to 'None', which means the input_ids represents the sequence. + output_attentions (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. 
If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + + Example: + .. code-block:: + import paddle + from paddlenlp.transformers import NystromformerModel, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("model_name") + model = NystromformerModel.from_pretrained("model_name") + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones((batch_size, seq_length)) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand((batch_size, seq_length)) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class NystromformerForMaskedLM(NystromformerPretrainedModel): + """ + Nystromformer Model with a `masked language modeling` head on top. + + Args: + config (:class:`NystromformerConfig`): + An instance of NystromformerConfig used to construct NystromformerForMaskedLM. 
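+
+    A sketch of a training-style call where ``labels`` are supplied ("model_name" is a
+    placeholder checkpoint; the input ids double as labels purely to show the loss path):
+
+    .. code-block::
+
+        import paddle
+        from paddlenlp.transformers import AutoTokenizer, NystromformerForMaskedLM
+
+        tokenizer = AutoTokenizer.from_pretrained("model_name")
+        model = NystromformerForMaskedLM.from_pretrained("model_name")
+
+        inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+        inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+        # Reusing the input ids as labels; the forward pass then also returns the MLM loss.
+        output = model(**inputs, labels=inputs["input_ids"], return_dict=True)
+        print(output.loss)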
+ """ + + _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] + + def __init__(self, config: NystromformerConfig): + super(NystromformerForMaskedLM, self).__init__(config) + + self.nystromformer = NystromformerModel(config) + self.cls = NystromformerOnlyMLMHead(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[Tensor], MaskedLMOutput]: + r""" + The NystromformerForMaskedLM forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type should be int. The `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape `[batch_size, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + Indices of embedded input sequence. They are representations of tokens that build the input sequence. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to 'None', which means the input_ids represents the sequence. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + output_attentions (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput`. + + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import NystromformerForMaskedLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("model_name") + model = NystromformerForMaskedLM.from_pretrained("model_name") + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + print(logits.shape) # [batch_size, seq_len, hidden_size] + """ + + return_dict = return_dict if return_dict is not None else False + + outputs = self.nystromformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NystromformerClassificationHead(nn.Layer): + """ + Classification head of nystromformer used in sequence classification + """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerClassificationHead, self).__init__() + self.config = config + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class NystromformerForSequenceClassification(NystromformerPretrainedModel): + """ + Nystromformer Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config(NystromformerConfig, optional): + An instance of ErnieConfig used to construct NystromformerForSequenceClassification. + """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.nystromformer = NystromformerModel(config) + self.classifier = NystromformerClassificationHead(config) + self.config = config if config is not None else NystromformerConfig() + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[Tensor], SequenceClassifierOutput]: + r""" + The NystromformerForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. 
+ Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type should be int. The `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape `[batch_size, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + Indices of embedded input sequence. They are representations of tokens that build the input sequence. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to 'None', which means the input_ids represents the sequence. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_attentions (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. 
code-block:: + import paddle + from paddlenlp.transformers import AutoTokenizer, NystromformerForSequenceClassification + + tokenizer = AutoTokenizer.from_pretrained("model_name") + model = NystromformerForSequenceClassification.from_pretrained("model_name") + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else False + + outputs = self.nystromformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.flatten()) + elif self.config.problem_type == "multi_label_classification": + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NystromformerForMultipleChoice(NystromformerPretrainedModel): + """ + Nystromformer Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`NystromformerConfig`): + An instance of NystromformerConfig used to construct NystromformerForMultipleChoice. + """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerForMultipleChoice, self).__init__(config) + + self.nystromformer = NystromformerModel(config) + self.pre_classifier = nn.Linear(config.hidden_size, config.hidden_size) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[Tensor], MultipleChoiceModelOutput]: + r""" + The NystromformerForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, num_choice, sequence_length]. 
+ attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type should be int. The `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape `[batch_size, num_choice, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, num_choice, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_choice, sequence_length)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + Indices of embedded input sequence. They are representations of tokens that build the input sequence. + Its data type should be `float32` and it has a shape of [batch_size, num_choice, sequence_length, hidden_size]. + Defaults to 'None', which means the input_ids represents the sequence. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_attentions (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. 
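+
+        Example (a sketch; "model_name" is a placeholder checkpoint and the random ids
+        stand in for two tokenized (context, choice) pairs):
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import NystromformerForMultipleChoice
+
+                model = NystromformerForMultipleChoice.from_pretrained("model_name")
+
+                # Shape [batch_size=1, num_choices=2, sequence_length=16].
+                input_ids = paddle.randint(low=5, high=1000, shape=[1, 2, 16])
+                output = model(input_ids=input_ids, return_dict=True)
+                print(output.logits.shape)  # [1, 2]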
+ """ + + return_dict = return_dict if return_dict is not None else False + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.reshape([-1, input_ids.shape[-1]]) if input_ids is not None else None + attention_mask = attention_mask.reshape([-1, attention_mask.shape[-1]]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape([-1, token_type_ids.shape[-1]]) if token_type_ids is not None else None + position_ids = position_ids.reshape([-1, position_ids.shape[-1]]) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape([-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]]) + if inputs_embeds is not None + else None + ) + + outputs = self.nystromformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_state = outputs[0] # (bs * num_choices, seq_len, hidden_size) + pooled_output = hidden_state[:, 0] # (bs * num_choices, hidden_size) + pooled_output = self.pre_classifier(pooled_output) # (bs * num_choices, hidden_size) + pooled_output = nn.ReLU()(pooled_output) # (bs * num_choices, hidden_size) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape([-1, num_choices]) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NystromformerForTokenClassification(NystromformerPretrainedModel): + r""" + Nystromformer Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + Args: + config (:class:`NystromformerConfig`): + An instance of NystromformerConfig used to construct NystromformerForTokenClassification. + """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.nystromformer = NystromformerModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[Tensor], TokenClassifierOutput]: + r""" + The NystromformerForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. 
+ Its data type should be int. The `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape `[batch_size, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + Indices of embedded input sequence. They are representations of tokens that build the input sequence. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to 'None', which means the input_ids represents the sequence. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + output_attentions (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + """ + + return_dict = return_dict if return_dict is not None else False + + outputs = self.nystromformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class NystromformerForQuestionAnswering(NystromformerPretrainedModel): + """ + Nystromformer Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD. + Args: + config (:class:`NystromformerConfig`): + An instance of NystromformerConfig used to construct NystromformerForQuestionAnswering. 
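+
+    A minimal usage sketch ("model_name" is a placeholder for an actual pretrained
+    checkpoint):
+
+    .. code-block::
+
+        import paddle
+        from paddlenlp.transformers import AutoTokenizer, NystromformerForQuestionAnswering
+
+        tokenizer = AutoTokenizer.from_pretrained("model_name")
+        model = NystromformerForQuestionAnswering.from_pretrained("model_name")
+
+        inputs = tokenizer("Who develops PaddleNLP?", "PaddleNLP is developed by Baidu.")
+        inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+        output = model(**inputs, return_dict=True)
+        # The predicted answer span comes from the argmax of the start and end logits.
+        start_index = paddle.argmax(output.start_logits, axis=-1)
+        end_index = paddle.argmax(output.end_logits, axis=-1)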
+ """ + + def __init__(self, config: NystromformerConfig): + super(NystromformerForQuestionAnswering, self).__init__(config) + + config.num_labels = 2 + self.nystromformer = NystromformerModel(config) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[Tensor], QuestionAnsweringModelOutput]: + r""" + The NystromformerForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type should be int. The `masked` tokens have `0` values and the others have `1` values. + It is a tensor with shape `[batch_size, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + ``[0, max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + inputs_embeds (Tensor, optional): + Indices of embedded input sequence. They are representations of tokens that build the input sequence. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to 'None', which means the input_ids represents the sequence. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_attentions (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `None`, which means get the option from config. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. 
Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput`. + """ + + return_dict = return_dict if return_dict is not None else False + + outputs = self.nystromformer( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if end_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/tokenizer.py new file mode 100644 index 000000000..580e04b85 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/nystromformer/tokenizer.py @@ -0,0 +1,316 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .. import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer + +__all__ = [ + "NystromformerTokenizer", +] + + +class NystromformerTokenizer(PretrainedTokenizer): + """ + Constructs a Nystromformer tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. 
+ + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool, optional): + Whether to lowercase the input when tokenizing. + Defaults to `True`. + do_basic_tokenize (bool, optional): + Whether to use a basic tokenizer before a WordPiece tokenizer. + Defaults to `True`. + never_split (Iterable, optional): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True`. Defaults to `None`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + tokenize_chinese_chars (bool, optional): + Whether to tokenize Chinese characters. + Defaults to `True`. + strip_accents: (bool, optional): + Whether to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase`. + Defaults to `None`. + + Examples: + .. code-block:: + + from paddlenlp.transformers import NystromformerTokenizer + tokenizer = NystromformerTokenizer.from_pretrained("model_name") + + inputs = tokenizer("He was a puppeteer") + print(inputs) + + ''' + {"input_ids": [101, 2002, 2001, 1037, 13997, 11510, 102], "token_type_ids": [0, 0, 0, 0, 0, 0, 0]} + ''' + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "nystromformer-base-zh": "https://paddlenlp.bj.bcebos.com/models/transformers/nystromformer/nystromformer_base_zh/vocab.txt", + } + } + pretrained_init_configuration = { + "nystromformer-base-zh": {"do_lower_case": True}, + } + max_model_input_sizes = { + "nystromformer-base-zh": 4096, + } + padding_side = "right" + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = NystromformerTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. 
+ + Returns: + int: The size of vocabulary. + """ + + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + """ + End-to-end tokenization for Nystromformer models. + + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import NystromformerTokenizer + + tokenizer = NystromformerTokenizer.from_pretrained("model_name") + tokens = tokenizer.tokenize("He was a puppeteer") + ''' + ["he", "was", "a", "puppet", "##eer"] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + he was a puppeteer + ''' + """ + + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A Nystromformer sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A Nystromformer offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. 
+ + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A Nystromformer sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optinal): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ofa_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ofa_utils.py new file mode 100644 index 000000000..e0f7bb240 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ofa_utils.py @@ -0,0 +1,326 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
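+
+# Minimal usage sketch (illustrative only; `model` and `eval_loader` are assumed to be
+# a PaddleNLP sequence-classification model and its `paddle.io.DataLoader`, and the
+# layer/head counts are placeholders):
+#
+#     head_importance, neuron_importance = compute_neuron_head_importance(
+#         model, eval_loader, num_layers=12, num_heads=12)
+#     reorder_neuron_head(model, head_importance, neuron_importance)
+#
+# `reorder_neuron_head` then sorts the attention heads and feed-forward neurons of
+# every encoder layer by the measured importance.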
+ +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +__all__ = [ + "prepare_qkv_ofa", + "mha_ofa_forward", + "encoder_ofa_forward", + "encoder_layer_ofa_forward", + "compute_neuron_head_importance", + "reorder_neuron_head", +] + + +def prepare_qkv_ofa(self, query, key, value, cache=None): + q = self.q_proj(query) + if hasattr(self.q_proj, "fn") and self.q_proj.fn.cur_config["expand_ratio"] is not None: + self.num_heads = int(self.num_heads * self.q_proj.fn.cur_config["expand_ratio"]) + q = paddle.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) + + if isinstance(cache, self.StaticCache): + # for encoder-decoder attention in inference and has cached + k, v = cache.k, cache.v + else: + k, v = self.compute_kv(key, value) + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = paddle.concat([cache.k, k], axis=2) + v = paddle.concat([cache.v, v], axis=2) + cache = self.Cache(k, v) + + return (q, k, v) if cache is None else (q, k, v, cache) + + +def mha_ofa_forward(self, query, key, value, attn_mask=None, cache=None): + """ + monkey patch for MultiHeadAttention forward to accept head_mask + attn_mask[0] = attn_mask, attn_mask[1] = head_mask + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + if cache is None: + q, k, v = self._prepare_qkv(query, key, value, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, cache) + + # scale dot product attention + product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) + if attn_mask[0] is not None: + # TODO(guosheng): support bool mask + product = product + attn_mask[0] + weights = F.softmax(product) + if self.dropout: + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") + + if attn_mask[1] is not None: + weights = weights * attn_mask[1] + + out = paddle.matmul(weights, v) + + # combine heads + out = paddle.transpose(out, perm=[0, 2, 1, 3]) + out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + if hasattr(self.q_proj, "fn") and self.q_proj.fn.cur_config["expand_ratio"] is not None: + self.num_heads = int(float(self.num_heads) / self.q_proj.fn.cur_config["expand_ratio"]) + return out if len(outs) == 1 else tuple(outs) + + +def encoder_ofa_forward( + self, + src, + src_mask=[None, None], + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, +): + """ + monkey patch for TransformerEncoder forward to accept head_mask + attn_mask[0] = attn_mask, attn_mask[1] = head_mask + """ + output = src + if src_mask[1] is not None: + head_mask = src_mask[1] + if len(head_mask.shape) == 1: + head_mask = paddle.unsqueeze(paddle.unsqueeze(paddle.unsqueeze(paddle.unsqueeze(head_mask, 0), 0), -1), -1) + head_mask = paddle.expand(head_mask, shape=[self.num_layers] + head_mask.shape[1:]) + elif len(head_mask.shape) == 2: + head_mask = paddle.unsqueeze(paddle.unsqueeze(paddle.unsqueeze(head_mask, 1), -1), -1) + else: + head_mask = [None] * self.num_layers + for i, mod in enumerate(self.layers): + output = mod(output, src_mask=[src_mask[0], head_mask[i]]) + if self.norm is not None: + output = self.norm(output) + + return output + + +def encoder_layer_ofa_forward(self, src, src_mask=None, cache=None, 
output_attentions=False): + residual = src + if self.normalize_before: + src = self.norm1(src) + # Add cache for encoder for the usage like UniLM + if cache is None: + src = self.self_attn(src, src, src, src_mask) + else: + src, incremental_cache = self.self_attn(src, src, src, src_mask, cache) + + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src if cache is None else (src, incremental_cache) + + +def reorder_head(layer, index): + """ + Reorder head weights according index. + Args: + layer(paddle.nn.Layer): the instance of `paddle.nn.MultiHeadAttention` layer. + index(list): the sort indices of multi-head. + """ + assert isinstance( + layer, nn.MultiHeadAttention + ), "layer in reorder_head must be the instance of `paddle.nn.MultiHeadAttention`." + n, a = layer.num_heads, layer.head_dim + idx = paddle.reshape( + paddle.index_select(paddle.reshape(paddle.arange(0, n * a, dtype="int64"), shape=[n, a]), index=index, axis=0), + shape=[-1], + ) + + def reorder_head_matrix(linearLayer, index, dim=1): + W = paddle.index_select(linearLayer.weight, index, axis=dim).detach() + if linearLayer.bias is not None: + if dim == 0: + b = paddle.assign(linearLayer.bias).detach() + else: + b = paddle.assign(paddle.index_select(linearLayer.bias, index, axis=0)).detach() + + linearLayer.weight.stop_gradient = True + linearLayer.weight.set_value(W) + linearLayer.weight.stop_gradient = False + if linearLayer.bias is not None: + linearLayer.bias.stop_gradient = True + linearLayer.bias.set_value(b) + linearLayer.bias.stop_gradient = False + + reorder_head_matrix(layer.q_proj.fn if hasattr(layer.q_proj, "fn") else layer.q_proj, idx) + reorder_head_matrix(layer.k_proj.fn if hasattr(layer.k_proj, "fn") else layer.k_proj, idx) + reorder_head_matrix(layer.v_proj.fn if hasattr(layer.v_proj, "fn") else layer.v_proj, idx) + reorder_head_matrix(layer.out_proj.fn if hasattr(layer.out_proj, "fn") else layer.out_proj, idx, dim=0) + + +def reorder_neuron(layer, index, dim=0): + """ + Reorder feed-forward weights according index. + Args: + layer(paddle.nn.Layer): the instance of `paddle.nn.Linear` layer. + index(list): the sort indices of feed-forward. + dim(int): select weights according to the dim. 
+ """ + linearLayer = layer.fn if hasattr(layer, "fn") else layer + W = paddle.index_select(linearLayer.weight, index, axis=dim).detach() + if linearLayer.bias is not None: + if dim == 0: + b = paddle.assign(linearLayer.bias).detach() + else: + b = paddle.assign(paddle.index_select(linearLayer.bias, index, axis=0)).detach() + linearLayer.weight.stop_gradient = True + linearLayer.weight.set_value(W) + linearLayer.weight.stop_gradient = False + + if linearLayer.bias is not None: + linearLayer.bias.stop_gradient = True + linearLayer.bias.set_value(b) + linearLayer.bias.stop_gradient = False + + +def reorder_neuron_head(model, head_importance, neuron_importance): + """ + Reorders weights according head importance and neuron importance + """ + # Reorders heads and ffn neurons + for layer, current_importance in enumerate(neuron_importance): + # Reorders heads + idx = paddle.argsort(head_importance[layer], descending=True) + reorder_head(model.base_model.encoder.layers[layer].self_attn, idx) + # Reorders neurons + idx = paddle.argsort(paddle.to_tensor(current_importance), descending=True) + reorder_neuron(model.base_model.encoder.layers[layer].linear1.fn, idx, dim=1) + + reorder_neuron(model.base_model.encoder.layers[layer].linear2.fn, idx, dim=0) + + +def compute_neuron_head_importance( + model, + data_loader, + num_layers, + num_heads, + loss_fct=nn.loss.CrossEntropyLoss(), + intermediate_name="linear1", + output_name="linear2", + label_names=None, +): + """ + Computes the importance of multi-head attention and feed-forward neuron in + each transformer layer. + + Args: + model(paddle.nn.Layer): + The instance of transformer model. + data_loader (DataLoader): + An iterable data loader is used for evaluate. An instance of + `paddle.io.Dataloader`. + num_layers (int): + Number of transformer layers. + num_heads (int): + Number of heads in each multi-head attention. + loss_fct (Loss|optional): + Loss function can be a `paddle.nn.Layer` instance. Default: `nn.loss.CrossEntropyLoss()`. + intermediate_name (str|optional): + The name of intermediate `Linear` layer in feed-forward. + Defaults to `linear1`. + output_name (str|optional): + The name of output `Linear` layer in feed-forward. + Defaults to `linear2`. 
+ """ + head_importance = paddle.zeros(shape=[num_layers, num_heads], dtype="float32") + head_mask = paddle.ones(shape=[num_layers, num_heads], dtype="float32") + head_mask.stop_gradient = False + + intermediate_weight = [] + intermediate_bias = [] + output_weight = [] + + for name, w in model.named_parameters(): + if intermediate_name in name: + if len(w.shape) > 1: + intermediate_weight.append(w) + else: + intermediate_bias.append(w) + + if output_name in name: + if len(w.shape) > 1: + output_weight.append(w) + + neuron_importance = [] + for w in intermediate_weight: + neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype="float32")) + + for i, batch in enumerate(data_loader): + labels = None + if isinstance(batch, list): + input_ids, segment_ids, labels = batch + logits = model(input_ids, segment_ids, attention_mask=[None, head_mask]) + else: + if label_names is not None: + labels = [] + for label in label_names: + labels.append(batch.pop(label)) + labels = tuple(labels) + elif "labels" in batch: + labels = batch.pop("labels") + # For token cls tasks + for key in ("length", "seq_len"): + if key in batch: + batch.pop(key) + elif "start_positions" in batch and "end_positions" in batch: + labels = (batch.pop("start_positions"), batch.pop("end_positions")) + + batch["attention_mask"] = [None, head_mask] + logits = model(**batch) + + if loss_fct is not None: + loss = loss_fct(logits, labels) + else: + raise NotImplementedError( + "Model to be compressed is an instance of a custom class, " + "so function `loss_fct(logits, labels)` should " + "be implemented, and it should return a single float for precision " + "value, such as acc." + ) + + loss.backward() + head_importance += paddle.abs(paddle.to_tensor(head_mask.gradient())) + for w1, b1, w2, current_importance in zip( + intermediate_weight, intermediate_bias, output_weight, neuron_importance + ): + current_importance += np.abs((np.sum(w1.numpy() * w1.gradient(), axis=0) + b1.numpy() * b1.gradient())) + current_importance += np.abs(np.sum(w2.numpy() * w2.gradient(), axis=1)) + return head_importance, neuron_importance diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/__init__.py new file mode 100644 index 000000000..9c9c883a4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/configuration.py new file mode 100644 index 000000000..3f6f23c1c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/configuration.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" OPT Model Configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "OPT_PRETRAINED_INIT_CONFIGURATION", + "OPT_PRETRAINED_RESOURCE_FILES_MAP", + "OPTConfig", +] + +OPT_PRETRAINED_INIT_CONFIGURATION = { + "facebook/opt-1.3b": { + "init_args": [ + { + "intermediate_size": 8192, + "attention_probs_dropout_prob": 0.0, + "hidden_dropout_prob": 0.1, + "normalize_before": True, + "word_embed_proj_dim": 2048, + "num_attention_heads": 32, + "bos_token_id": 2, + "hidden_size": 2048, + "eos_token_id": 2, + "hidden_act": "relu", + "initializer_range": 0.02, + "max_position_embeddings": 2048, + "num_hidden_layers": 24, + "pad_token_id": 1, + "vocab_size": 50272, + "type_vocab_size": 16, + "init_class": "OPTModel", + } + ], + "init_class": "OPTForCausalLM", + }, +} + +OPT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "facebook/opt-1.3b": "https://bj.bcebos.com/paddlenlp/models/community/facebook/opt-1.3b/model_state.pdparams" + } +} + + +class OPTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`OPTModel`]. It is used to instantiate + an OPT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the OPT + [facebook/opt-1.3b](https://huggingface.co/facebook/opt-1.3b) architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50272): + Vocabulary size of the OPT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`OPTModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimensionality of the layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of decoder layers. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + hidden_act (`str` or `function`, *optional*, defaults to `"relu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + normalize_before (`bool`, *optional*, defaults to `True`): + Whether to perform layer normalization before the attention block. 
+ word_embed_proj_dim (`int`, *optional*): + `word_embed_proj_dim` can be set to down-project word embeddings, *e.g.* `opt-1.3b`. Defaults to + `hidden_size`. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids`. Defaults to `16`. + .. note:: + Please NOT using `type_vocab_size`, for it will be obsolete in the future.. + initializer_range (float, optional): + The standard deviation of the normal initializer. Default to `0.02`. + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`OPTPretrainedModel._init_weights()` for how weights are initialized in `OPTModel`. + + Example: + ```python + >>> from paddlenlp.transformers import OPTModel, OPTConfig + >>> # Initializing a OPT facebook/opt-1.3b style configuration + >>> config = OPTConfig() + >>> # Initializing a model from the facebook/opt-1.3b style configuration + >>> model = OPTModel(config) + >>> # Accessing the model config + >>> config = model.config + ```""" + + attribute_map: Dict[str, str] = { + "dropout": "classifier_dropout", + "num_classes": "num_labels", + "ffn_dim": "intermediate_size", + "activation_function": "hidden_act", + } + pretrained_init_configuration = OPT_PRETRAINED_INIT_CONFIGURATION + model_type = "opt" + + def __init__( + self, + vocab_size=50272, + hidden_size=2048, + num_hidden_layers=24, + intermediate_size=8192, + num_attention_heads=32, + hidden_act="relu", + max_position_embeddings=2048, + normalize_before=True, + word_embed_proj_dim=2048, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + type_vocab_size=16, + pad_token_id=1, + bos_token_id=2, + eos_token_id=2, + enable_bias: bool = True, + mp_degree: int = 1, + fuse_attention_qkv=False, + fuse_attention_ffn=False, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.normalize_before = normalize_before + self.word_embed_proj_dim = word_embed_proj_dim if word_embed_proj_dim is not None else hidden_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + + self.enable_bias = enable_bias + self.mp_degree = mp_degree + + self.fuse_attention_qkv = fuse_attention_qkv + self.fuse_attention_ffn = fuse_attention_ffn diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/convert_torch_to_paddle.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/convert_torch_to_paddle.py new file mode 100644 index 000000000..3cd36b293 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/convert_torch_to_paddle.py @@ -0,0 +1,180 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+import os
+
+
+def convert_configs(model_dir: str, output_dir: str | None = None):
+    """convert pytorch config.json to model_config.json
+
+    Args:
+        model_dir (str): the directory of model-related files
+    """
+
+    # 1. load the config file
+    output_dir = output_dir or model_dir
+    target_config_file = os.path.join(output_dir, "model_config.json")
+
+    if os.path.exists(target_config_file):
+        return
+
+    config_file = os.path.join(model_dir, "config.json")
+    assert os.path.exists(config_file), f"config.json not found in <{model_dir}> dir"
+
+    with open(config_file, "r", encoding="utf-8") as f:
+        config = json.load(f)
+
+    # 2. transform the config to opt model file
+    target_config = {
+        "init_args": [
+            {
+                "intermediate_size": config["ffn_dim"],
+                "attention_probs_dropout_prob": config["attention_dropout"],
+                "hidden_dropout_prob": config["dropout"],
+                "normalize_before": config["do_layer_norm_before"],
+                "word_embed_proj_dim": config["word_embed_proj_dim"],
+                "num_attention_heads": config["num_attention_heads"],
+                "bos_token_id": config["bos_token_id"],
+                "hidden_size": config["hidden_size"],
+                "eos_token_id": config["eos_token_id"],
+                "hidden_act": config["activation_function"],
+                "initializer_range": config["init_std"],
+                "max_position_embeddings": config["max_position_embeddings"],
+                "num_hidden_layers": config["num_hidden_layers"],
+                "pad_token_id": config["pad_token_id"],
+                "vocab_size": config["vocab_size"],
+                "init_class": "OPTModel",
+            }
+        ],
+        "init_class": "OPTForCausalLM",
+    }
+
+    with open(target_config_file, "w", encoding="utf-8") as f:
+        json.dump(target_config, f)
+
+    print("convert config successfully ...")
+
+
+def convert_weights(model_dir: str, output_dir: str | None = None):
+    # 1. search the pytorch_model weight files
+    files = [
+        file_name
+        for file_name in os.listdir(model_dir)
+        if file_name.startswith("pytorch_model") and file_name.endswith(".bin")
+    ]
+
+    # 2.
construct name-mapping + mappings = [ + ["decoder.embed_tokens.weight", "embeddings.word_embeddings.weight"], + ["decoder.embed_positions.weight", "embeddings.position_embeddings.weight"], + ["decoder.final_layer_norm.weight", "decoder.final_layer_norm.weight"], + ["decoder.final_layer_norm.bias", "decoder.final_layer_norm.bias"], + ] + + with open(os.path.join(model_dir, "config.json"), "r", encoding="utf-8") as f: + config = json.load(f) + for layer_index in range(config["num_hidden_layers"]): + layer_mappings = [ + [ + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.out_proj.weight", + f"decoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.out_proj.bias", + f"decoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn_layer_norm.weight", + f"decoder.layers.{layer_index}.norm1.weight", + ], + [f"decoder.layers.{layer_index}.self_attn_layer_norm.bias", f"decoder.layers.{layer_index}.norm1.bias"], + [f"decoder.layers.{layer_index}.fc1.weight", f"decoder.layers.{layer_index}.linear1.weight", "transpose"], + [f"decoder.layers.{layer_index}.fc1.bias", f"decoder.layers.{layer_index}.linear1.bias"], + [f"decoder.layers.{layer_index}.fc2.weight", f"decoder.layers.{layer_index}.linear2.weight", "transpose"], + [f"decoder.layers.{layer_index}.fc2.bias", f"decoder.layers.{layer_index}.linear2.bias"], + [f"decoder.layers.{layer_index}.final_layer_norm.weight", f"decoder.layers.{layer_index}.norm2.weight"], + [f"decoder.layers.{layer_index}.final_layer_norm.bias", f"decoder.layers.{layer_index}.norm2.bias"], + ] + mappings.extend(layer_mappings) + + # 3. checking the model keys + import torch + from tqdm import tqdm + + state_dict = {} + for file in files: + file_state_dict = torch.load(file) + for key in list(file_state_dict.keys()): + state_dict[key] = file_state_dict.pop(key).cpu().numpy() + + for mapping in tqdm(mappings): + torch_key, paddle_key = mapping[:2] + assert torch_key in state_dict, f"{torch_key} no in weight file" + + import paddle + + # 4. transform tensor + from tqdm import tqdm + + for mapping in tqdm(mappings): + torch_key, paddle_key = mapping[:2] + value = state_dict.pop(torch_key) + if len(mapping) == 3: + value = value.T + state_dict[paddle_key] = value + + # 5. 
save the model files + paddle.save(state_dict, "model_state.pdparams") + print("convert pytorch model to paddle weight file successfully ...") + + +if __name__ == "__main__": + # update your `model_dir` and `output_dir` here to your pytorch model dir + model_dir = "your pytorch path" + output_dir = None + + convert_configs(model_dir, output_dir) + + convert_weights(model_dir, output_dir) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/modeling.py new file mode 100644 index 000000000..c24bd357d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/opt/modeling.py @@ -0,0 +1,1216 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import collections +from functools import partial +from typing import Any, Dict, List + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.tensor as tensor +from paddle.distributed import fleet +from paddle.nn import Layer +from paddle.nn.layer.transformer import _convert_param_attr_to_list + +from paddlenlp.transformers.conversion_utils import StateDictNameMapping +from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model +from paddlenlp.utils.log import logger + +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from .configuration import ( + OPT_PRETRAINED_INIT_CONFIGURATION, + OPT_PRETRAINED_RESOURCE_FILES_MAP, + OPTConfig, +) + +__all__ = ["OPTModel", "OPTPretrainedModel", "OPTForCausalLM", "OPTForConditionalGeneration"] + + +def finfo(dtype): + if dtype == "float32": + return np.finfo(np.float32) + if dtype == "float16": + return np.finfo(np.float16) + if dtype == "float64": + return np.finfo(np.float64) + + +def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): + """ + Make causal mask used for self-attention. + """ + batch_size, target_length = input_ids_shape + + mask = paddle.full((target_length, target_length), float(finfo(paddle.get_default_dtype()).min)) + + mask_cond = paddle.arange(mask.shape[-1]) + mask_cond = mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]) + mask = paddle.where(mask_cond, paddle.full(mask_cond.shape, 0), mask) + + if past_key_values_length > 0: + mask = paddle.concat([paddle.zeros([target_length, past_key_values_length], dtype=mask.dtype), mask], axis=-1) + + expanded_mask = mask.unsqueeze(0).expand([batch_size, 1, target_length, target_length + past_key_values_length]) + return expanded_mask + + +def _expand_mask(mask, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
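+    Masked (zero) positions are filled with the minimum value of the default dtype so
+    that they are suppressed after softmax; unmasked positions are set to 0.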
+ """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + expanded_mask = ~(paddle.cast(mask[:, None, None, :], "bool")) + expanded_mask = paddle.cast(expanded_mask, dtype=paddle.get_default_dtype()) + + expanded_mask = expanded_mask.expand([batch_size, 1, tgt_length, src_length]) + expanded_mask = expanded_mask * float(finfo(paddle.get_default_dtype()).min) + return expanded_mask + + +class MultiHeadAttention(nn.Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + """ + + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + + def __init__( + self, + config: OPTConfig, + need_weights=False, + ): + super(MultiHeadAttention, self).__init__() + + self.num_heads = config.num_attention_heads + self.head_dim = config.hidden_size // self.num_heads + + # get the `num_heads` + assert self.num_heads % config.tensor_parallel_degree == 0 + if config.tensor_parallel_degree > 0: + self.num_heads = self.num_heads // config.tensor_parallel_degree + assert ( + self.head_dim * self.num_heads * config.tensor_parallel_degree == config.hidden_size + ), "hidden_size must be divisible by num_heads" + + self.dropout = config.attention_probs_dropout_prob + self.need_weights = need_weights + self.fuse_attention_qkv = config.fuse_attention_qkv + + if config.tensor_parallel_degree > 1: + if self.fuse_attention_qkv: + self.qkv_proj = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, + config.hidden_size * 3, + has_bias=True, + input_is_parallel=True, + ) + else: + self.q_proj = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + gather_output=False, + ) + self.k_proj = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + gather_output=False, + ) + self.v_proj = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, + config.hidden_size, + has_bias=True, + gather_output=False, + ) + + self.out_proj = fleet.meta_parallel.RowParallelLinear( + config.hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True + ) + else: + if self.fuse_attention_qkv: + self.qkv_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size) + else: + self.q_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.k_proj = nn.Linear(config.hidden_size, config.hidden_size) + self.v_proj = nn.Linear(config.hidden_size, config.hidden_size) + + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) + + def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): + mix_layer = self.qkv_proj(query) + mix_layer = paddle.reshape_(mix_layer, [0, 0, self.num_heads, 3 * self.head_dim]) + mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) + q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) + + assert not isinstance(cache, self.StaticCache), "cache currently does not support the StaticCache type" + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = paddle.concat([cache.k, k], axis=2) + v = paddle.concat([cache.v, v], axis=2) + if use_cache is True: + cache = self.Cache(k, v) + + return (q, k, v, cache) if use_cache else (q, k, v, None) + + def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): + r""" + Prapares linear 
projected queries, keys and values for usage of subsequnt + multiple parallel attention. If `cache` is not None, using cached results + to reduce redundant calculations. + + """ + q = self.q_proj(query) + q = paddle.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) + + if isinstance(cache, self.StaticCache): + # for encoder-decoder attention in inference and has cached + k, v = cache.k, cache.v + else: + k, v = self.compute_kv(key, value) + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = paddle.concat([cache.k, k], axis=2) + v = paddle.concat([cache.v, v], axis=2) + if use_cache is True: + cache = self.Cache(k, v) + + return (q, k, v, None) if use_cache is False else (q, k, v, cache) + + def compute_kv(self, key, value): + r""" + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces. The results are used as key-values pairs for subsequent multiple + parallel attention. + + It is part of calculations in multi-head attention, and is provided as + a method to pre-compute and prefetch these results, thus we can use them + to construct cache for inference. + + """ + k = self.k_proj(key) + v = self.v_proj(value) + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + def gen_cache(self, key, value=None, type=Cache): + """ + Generates cache for `forward` usage in inference accroding to arguments. + The generated cache is an instance of `MultiHeadAttention.Cache` or an + instance of `MultiHeadAttention.StaticCache`. + """ + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.compute_kv(key, value) + return self.StaticCache(k, v) + elif value is None: # incremental_state + k = paddle.full(shape=[key.shape[0], self.num_heads, 0, self.head_dim], dtype=key.dtype, fill_value=0) + v = paddle.full(shape=[key.shape[0], self.num_heads, 0, self.head_dim], dtype=key.dtype, fill_value=0) + return self.Cache(k, v) + else: + # incremental_state with initial value, mainly for usage like UniLM + return self.Cache(key, value) + + def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): + r""" + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. 
+ """ + key = query if key is None else key + value = query if value is None else value + + if self.fuse_attention_qkv: + q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) + + # scale dot product attention + product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) + + if attn_mask is not None: + product = product + attn_mask + + weights = F.softmax(product) + if self.dropout: + weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") + + out = tensor.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + if use_cache: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerDecoderLayer(nn.Layer): + """ + The transformer decoder layer. + + It contains multiheadattention and some linear layers. + """ + + def __init__(self, config): + + d_model = config.hidden_size + dim_feedforward = config.intermediate_size + dropout = config.hidden_dropout_prob + activation = config.hidden_act + attn_dropout = config.attention_probs_dropout_prob + act_dropout = config.hidden_dropout_prob + normalize_before = getattr(config, "normalize_before", True) + + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range)) + bias_attr = None + + self._config = locals() + self._config.pop("self") + self._config.pop("__class__", None) # py3 + + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + weight_attrs = _convert_param_attr_to_list(weight_attr, 3) + bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + + self.self_attn = MultiHeadAttention(config, need_weights=True) + if config.tensor_parallel_degree > 1: + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + d_model, + dim_feedforward, + gather_output=False, + has_bias=True, + ) + else: + self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) + + if config.tensor_parallel_degree > 1: + self.linear2 = fleet.meta_parallel.RowParallelLinear( + dim_feedforward, + d_model, + input_is_parallel=True, + has_bias=True, + ) + else: + self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) + + self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) + self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") + self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") + + if activation == "gelu": + self.activation = nn.GELU(approximate=True) + else: + self.activation = getattr(F, activation) + + def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None, output_attentions=False): + residual = tgt + + if self.normalize_before: + tgt = self.norm1(tgt) + + # self.self_attn(...) 
--> hidden_states, weights, (cache) + if use_cache is False: + tgt, attn_weights = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) + else: + tgt, attn_weights, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt)))) + tgt = residual + tgt + + if not self.normalize_before: + tgt = self.norm2(tgt) + + if not (output_attentions or use_cache): + return tgt + + temp_list = [tgt, attn_weights if output_attentions else None, incremental_cache if use_cache else None] + + return tuple(v for v in temp_list if v is not None) + + def gen_cache(self, memory): + incremental_cache = self.self_attn.gen_cache(memory, type=self.self_attn.Cache) + return incremental_cache + + +class TransformerDecoder(Layer): + """ + TransformerDecoder is a stack of N decoder layers. + """ + + def __init__(self, config: OPTConfig, decoder_layers: List[Layer]): + super(TransformerDecoder, self).__init__() + + if config.word_embed_proj_dim != config.hidden_size: + if config.tensor_parallel_degree > 1: + self.project_out = fleet.meta_parallel.ColumnParallelLinear( + config.hidden_size, + config.word_embed_proj_dim, + gather_output=True, + has_bias=False, + ) + else: + self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias_attr=False) + else: + self.project_out = None + + self.num_layers = config.num_hidden_layers + self.layers = decoder_layers + + if config.normalize_before: + self.final_layer_norm = nn.LayerNorm(config.hidden_size) + else: + self.final_layer_norm = None + + self.checkpoints = [] + + def forward( + self, + tgt, + memory, + tgt_mask=None, + memory_mask=None, + use_cache: bool = False, + cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. 
+ """ + output = tgt + new_caches = [] if use_cache else None + self.checkpoints = [] + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, mod in enumerate(self.layers): + outputs = mod( + output, + memory, + tgt_mask=tgt_mask, + use_cache=use_cache, + cache=cache[i] if cache is not None else cache, + output_attentions=output_attentions, + ) + + # outputs = hidden_states if both use_cache and output_attentions are False + # Otherwise, outputs = (hidden_states, attention if output_attentions, cache if use_cache) + output = outputs[0] if (use_cache or output_attentions) else outputs + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + if use_cache: + new_caches.append(outputs[-1]) + if output_hidden_states: + all_hidden_states = all_hidden_states + (output,) + self.checkpoints.append(output.name) + + if self.final_layer_norm: + output = self.final_layer_norm(output) + + if self.project_out: + output = self.project_out(output) + + if not return_dict: + temp_list = [output, new_caches, all_hidden_states, all_self_attentions] + + if not (use_cache or output_attentions or output_hidden_states): + return output + + return tuple(v for v in temp_list if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=output, + past_key_values=new_caches, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=None, + ) + + def gen_cache(self, memory, do_zip=False): + r""" + Generates cache for `forward` usage. The generated cache is a list, and + each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) + produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` + for more details. If `do_zip` is True, apply `zip` on these tuples to get + a list with two elements. + """ + cache = [layer.gen_cache(memory) for layer in self.layers] + if do_zip: + cache = list(zip(*cache)) + return cache + + +class OPTLearnedPositionEmbedding(nn.Embedding): + """this module learns postional embeddings up to a fixed maximum size""" + + def __init__(self, num_embeddings: int, embedding_dim: int, initializer_range: float): + """OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 + and adjust num_embeddings appropriately. Other models don't have this hack. + + Args: + num_embeddings (int): the number of embedding size + embedding_dim (int): the dim of embedding + """ + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, attention_mask, past_key_values_length: int = 0): + """get the position embedding with attention mask + + Args: + attention_mask: (paddle.Tensor): # create positions depending on attention_mask + past_key_values_length (int, optional): the past key value which will . Defaults to 0. + + Returns: + paddle.Tensor: the position embedding + """ + # create positions depending on attention_mask + if attention_mask.dtype not in [paddle.bool, paddle.int64]: + attention_mask = attention_mask == 1.0 + + position_ids = paddle.cumsum(paddle.cast(attention_mask, "int64"), axis=-1) - 1 + + # cut positions if `past_key_values_length` is > 0 + position_ids = position_ids[:, past_key_values_length:] + return nn.Embedding.forward(self, position_ids + self.offset) + + +class OPTEmbeddings(Layer): + """ + Include embeddings from word and position embeddings. 
+ """ + + def __init__(self, config: OPTConfig): + super(OPTEmbeddings, self).__init__() + if config.tensor_parallel_degree > 1: + self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.word_embed_proj_dim, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) + else: + self.word_embeddings = nn.Embedding( + config.vocab_size, + config.word_embed_proj_dim, + # padding_idx=config.pad_token_id, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) + ), + ) + + if config.word_embed_proj_dim != config.hidden_size: + if config.tensor_parallel_degree > 1: + self.project_in = fleet.meta_parallel.ColumnParallelLinear( + config.word_embed_proj_dim, + config.hidden_size, + gather_output=True, + has_bias=False, + ) + else: + self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias_attr=False) + else: + self.project_in = None + + self.position_embeddings = OPTLearnedPositionEmbedding( + num_embeddings=config.max_position_embeddings, + embedding_dim=config.hidden_size, + initializer_range=config.initializer_range, + ) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids=None, attention_mask=None, input_embeddings=None, past_key_values_length=None): + if input_ids is not None and input_embeddings is None: + input_embeddings = self.word_embeddings(input_ids) + + if self.project_in: + input_embeddings = self.project_in(input_embeddings) + + position_embeddings = self.position_embeddings(attention_mask, past_key_values_length) + + embeddings = input_embeddings + position_embeddings + embeddings = self.dropout(embeddings) + return embeddings + + +class OPTPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained OPT models. It provides OPT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + config_class = OPTConfig + base_model_prefix = "opt" + + pretrained_init_configuration = OPT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = OPT_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + actions = { + "embeddings.word_embeddings.weight": partial(fn, is_column=False), + } + for layer_index in range(config.num_hidden_layers): + actions.update( + { + # Column Linear + f"decoder.layers.{layer_index}.self_attn.q_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.q_proj.bias": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.k_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.k_proj.bias": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.v_proj.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.self_attn.v_proj.bias": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.linear1.weight": partial(fn, is_column=True), + f"decoder.layers.{layer_index}.linear1.bias": partial(fn, is_column=True), + # Row Linear + f"decoder.layers.{layer_index}.linear2.weight": partial(fn, is_column=False), + f"decoder.layers.{layer_index}.self_attn.out_proj.weight": partial(fn, is_column=False), + } + ) + + if config.word_embed_proj_dim != config.hidden_size: + actions.update( + { + "decoder.project_out.weight": partial(fn, is_column=True), + "decoder.project_in.weight": partial(fn, is_column=True), + } + ) + + if cls.__name__ != "OPTModel": + for key in list(actions.keys()): + actions["opt." + key] = actions.pop(key) + + return actions + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: OPTConfig, is_fuse=False): + # return parameter fuse utils + from paddlenlp.transformers.conversion_utils import split_or_fuse_func + + fn = split_or_fuse_func(is_fuse=is_fuse) + + # last key is fused key, other keys are to be fused. 
+ fuse_qkv_keys = ( + "decoder.layers.0.self_attn.q_proj.weight", + "decoder.layers.0.self_attn.k_proj.weight", + "decoder.layers.0.self_attn.v_proj.weight", + "decoder.layers.0.self_attn.qkv_proj.weight", + ) + fuse_qkv_bias_keys = ( + "decoder.layers.0.self_attn.q_proj.bias", + "decoder.layers.0.self_attn.k_proj.bias", + "decoder.layers.0.self_attn.v_proj.bias", + "decoder.layers.0.self_attn.qkv_proj.bias", + ) + num_heads = config.num_attention_heads + num_key_value_heads = getattr(config, "num_key_value_heads", num_heads) + fuse_attention_qkv = getattr(config, "fuse_attention_qkv", False) + + final_actions = {} + if is_fuse: + if fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + else: + if not fuse_attention_qkv: + for i in range(config.num_hidden_layers): + for keys in [fuse_qkv_keys, fuse_qkv_bias_keys]: + new_keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in keys]) + final_actions[new_keys] = partial( + fn, split_nums=3, is_qkv=True, num_heads=num_heads, num_key_value_heads=num_key_value_heads + ) + return final_actions + + @classmethod + def _get_name_mappings(cls, config: OPTConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["decoder.embed_tokens.weight", "embeddings.word_embeddings.weight"], + ["decoder.embed_positions.weight", "embeddings.position_embeddings.weight"], + ["decoder.final_layer_norm.weight", "decoder.final_layer_norm.weight"], + ["decoder.final_layer_norm.bias", "decoder.final_layer_norm.bias"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + f"decoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + f"decoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + f"decoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + f"decoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + f"decoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + f"decoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn.out_proj.weight", + f"decoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"decoder.layers.{layer_index}.self_attn.out_proj.bias", + f"decoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"decoder.layers.{layer_index}.self_attn_layer_norm.weight", + f"decoder.layers.{layer_index}.norm1.weight", + ], + [ + f"decoder.layers.{layer_index}.self_attn_layer_norm.bias", + f"decoder.layers.{layer_index}.norm1.bias", + ], + [ + f"decoder.layers.{layer_index}.fc1.weight", + f"decoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"decoder.layers.{layer_index}.fc1.bias", f"decoder.layers.{layer_index}.linear1.bias"], + [ + f"decoder.layers.{layer_index}.fc2.weight", + f"decoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"decoder.layers.{layer_index}.fc2.bias", 
f"decoder.layers.{layer_index}.linear2.bias"], + [ + f"decoder.layers.{layer_index}.final_layer_norm.weight", + f"decoder.layers.{layer_index}.norm2.weight", + ], + [f"decoder.layers.{layer_index}.final_layer_norm.bias", f"decoder.layers.{layer_index}.norm2.bias"], + ] + model_mappings.extend(layer_mappings) + + # base-model prefix "OPTModel" + if cls.__name__ != "OPTModel": + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "opt." + mapping[1] + + # downstream mappings + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.opt.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + + +@register_base_model +class OPTModel(OPTPretrainedModel): + r""" + The bare OPT Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`OPTConfig`): + An instance of OPTConfig used to construct OPTModel. + """ + + def __init__(self, config: OPTConfig): + super(OPTModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.hidden_size = config.hidden_size + self.vocab_size = config.vocab_size + self.embeddings = OPTEmbeddings(config) + + config.fuse_attention_qkv = False + decoder_layers = nn.LayerList() + for i in range(config.num_hidden_layers): + decoder_layers.append(TransformerDecoderLayer(config)) + self.decoder = TransformerDecoder(config, decoder_layers) + self.checkpoints = [] + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, past_key_values_length=past_key_values_length, dtype=attention_mask.dtype + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, tgt_length=input_shape[-1]) + if input_shape[-1] > 1: + combined_attention_mask = combined_attention_mask + expanded_attn_mask + else: + combined_attention_mask = expanded_attn_mask + + return combined_attention_mask + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=False, + cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + The OPTModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. 
+ Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in self attention to avoid performing attention to some unwanted positions, + usually the subsequent positions. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Its data type should be float32. + The `masked` tokens have `-1e9` values, and the `unmasked` tokens have `0` values. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + cache (list, optional): + It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. + See `TransformerDecoder.gen_cache `__ for more details. + It is only used for inference and should be None for training. + Default to `None`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `None`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `None`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `None`. + + + Returns: + Tensor: Returns tensor `encoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + Example: + .. 
code-block::

+                import paddle
+                from paddlenlp.transformers import OPTModel, GPTTokenizer
+
+                tokenizer = GPTTokenizer.from_pretrained('facebook/opt-125m')
+
+                model = OPTModel.from_pretrained('facebook/opt-125m')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False)
+                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                output = model(**inputs)
+        """
+        if position_ids is not None:
+            logger.warning("position_ids is not required for OPTModel.")
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.shape
+            input_ids = input_ids.reshape((-1, input_shape[-1]))
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.shape[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        self.checkpoints = []
+        past_key_values_length = cache[0].k.shape[2] if cache is not None else 0
+
+        seq_length_with_past = input_shape[-1] + past_key_values_length
+
+        if attention_mask is None:
+            attention_mask = paddle.ones((input_shape[0], seq_length_with_past), dtype=paddle.bool)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            input_embeddings=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length)
+        attention_mask.stop_gradient = True
+
+        outputs = self.decoder.forward(
+            embedding_output,
+            memory=None,
+            tgt_mask=attention_mask,
+            use_cache=use_cache,
+            cache=cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if output_hidden_states:
+            if return_dict:
+                outputs.hidden_states = (embedding_output,) + outputs.hidden_states
+            else:
+                # [last_hidden_state, caches, all_hidden_states, all_self_attentions]
+                idx = 2 if use_cache else 1
+                all_hidden_states = ((embedding_output,) + outputs[idx],)
+                outputs = outputs[:idx] + all_hidden_states + outputs[idx + 1 :]
+
+        self.checkpoints.extend(self.decoder.checkpoints)
+        return outputs
+
+    def get_input_embeddings(self):
+        """get opt input word embedding
+        Returns:
+            nn.Embedding: the input word embedding of opt model
+        """
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, embedding: nn.Embedding):
+        """set opt input embedding
+        Returns:
+            nn.Embedding: the instance of new word embedding
+        """
+        self.embeddings.word_embeddings = embedding
+
+
+class OPTLMHead(Layer):
+    def __init__(self, config: OPTConfig, embedding_weights=None):
+        super(OPTLMHead, self).__init__()
+        self.config = config
+        self.decoder_weight = (
+            self.create_parameter(
+                default_initializer=paddle.nn.initializer.Uniform(low=-0.1, high=0.1),
+                shape=[config.vocab_size, config.hidden_size],
+                dtype=config.dtype,
+                is_bias=True,
+            )
+            if embedding_weights is None
+            else embedding_weights
+        )
+
+    def forward(self, hidden_states):
+        if isinstance(hidden_states, BaseModelOutputWithPastAndCrossAttentions):
+            hidden_states = hidden_states["last_hidden_state"]
+
logits = paddle.tensor.matmul(hidden_states, self.decoder_weight.cast(hidden_states.dtype), transpose_y=True) + return logits + + +class OPTForCausalLM(OPTPretrainedModel): + """ + The OPT Model with a `language modeling` head on top. + + Args: + config (:class:`OPTConfig`): + An instance of OPTConfig used to construct OPTModel. + + """ + + def __init__(self, config: OPTConfig): + super(OPTForCausalLM, self).__init__(config) + self.opt = OPTModel(config) + self.lm_head = OPTLMHead( + config, + ) + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + def forward( + self, + input_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`OPTModel`. + attention_mask (Tensor, optional): + See :class:`OPTModel`. + inputs_embeds (Tensor, optional): + See :class:`GPTModel`. + use_cache (bool, optional): + See :class:`OPTModel`. + cache (Tensor, optional): + See :class:`OPTModel`. + labels (paddle.Tensor, optional): + A Tensor of shape `(batch_size, sequence_length)`. + Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]` + Defaults to None. + output_attentions (bool, optional): + See :class:`GPTModel`. + output_hidden_states (bool, optional): + See :class:`GPTModel`. + return_dict (bool, optional): + See :class:`GPTModel`. + Returns: + Tensor or tuple: Returns tensor `logits` or tuple `(logits, cached_kvs)`. If `use_cache` is True, + tuple (`logits, cached_kvs`) will be returned. Otherwise, tensor `logits` will be returned. + `logits` is the output of the opt model. + `cache_kvs` is the cache output of opt model if `use_cache` is True. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import OPTForCausalLM, GPTTokenizer + + tokenizer = GPTTokenizer.from_pretrained('facebook/opt-125m') + model = OPTForCausalLM.from_pretrained('facebook/opt-125m') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output_ids, score = model.generate(input_ids=inputs['input_ids']) + print(tokenizer.batch_decode(output_ids[0])) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.opt( + input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache=cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if use_cache: + encoder_outputs, cached_kvs = outputs[:2] + else: + encoder_outputs = outputs + + logits = self.lm_head(encoder_outputs) + + loss = None + if labels is not None: + loss = nn.functional.cross_entropy(logits, labels) + + if not return_dict: + if not use_cache: + return (loss, logits) if loss is not None else logits + + outputs = (logits,) + outputs[1:] + return ((loss,) + outputs) if loss is not None else outputs + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_fast_entry(self, kwargs: Dict[str, Any]): + # import FasterOPT at here to avoid cycling import + from paddlenlp.ops import FasterOPT + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decode_strategy = kwargs.get("decode_strategy") + # decoding_lib can be passed into FasterOPT + decoding_lib = kwargs.get("decoding_lib", None) + + if decode_strategy == "beam_search": + raise AttributeError("'beam_search' is not supported yet in the fast version of OPT") + # Currently, FasterTransformer only support restricted size_per_head. 
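+        # Editor's note (illustrative): size_per_head is hidden_size divided by
+        # num_attention_heads, e.g. a configuration with hidden_size=768 and 12
+        # attention heads gives 768 // 12 = 64, which is in the supported set below.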
+ size_per_head = self.opt.config["hidden_size"] // self.opt.config["num_attention_heads"] + if size_per_head not in [32, 64, 80, 96, 128]: + raise AttributeError( + "'size_per_head = %d' is not supported yet in the fast version of OPT" % size_per_head + ) + if kwargs["forced_bos_token_id"] is not None: + # not support for forced_bos_token_id yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + if kwargs["min_length"] != 0: + # not support for min_length yet in the fast version + raise AttributeError("'min_length != 0' is not supported yet in the fast version") + self._fast_entry = FasterOPT(self, use_fp16_decoding=use_fp16_decoding, decoding_lib=decoding_lib).forward + return self._fast_entry + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, cache=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if cache is not None: + input_ids = input_ids[:, -1:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and cache is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "cache": cache, + "use_cache": True, + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + else: + attention_mask = paddle.ones_like(input_ids, dtype="int64") + return attention_mask + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError as e: + try: + return getattr(getattr(self, self.base_model_prefix), name) + except AttributeError: + try: + return getattr(self, self.base_model_prefix).config[name] + except KeyError: + raise e + + +OPTForConditionalGeneration = OPTForCausalLM diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/optimization.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/optimization.py new file mode 100644 index 000000000..03e3fd2d0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/optimization.py @@ -0,0 +1,304 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
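+#
+# Editor's note (illustrative sketch, not part of the upstream module): every
+# scheduler below combines a linear warmup with a constant, linear, cosine or
+# polynomial phase. For example, LinearDecayWithWarmup(learning_rate=0.1,
+# total_steps=1000, warmup=100) yields lr(50) = 0.05, lr(100) = 0.1 and
+# lr(550) = 0.05: a linear ramp to the base rate over the first 100 steps and
+# a linear decay to 0 at step 1000. Passing `warmup` as a float (e.g. 0.1)
+# means a fraction of `total_steps` rather than an absolute step count.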
+ +import math + +from paddle.optimizer.lr import LambdaDecay, LRScheduler + +__all__ = [ + "LinearDecayWithWarmup", + "ConstScheduleWithWarmup", + "CosineDecayWithWarmup", + "PolyDecayWithWarmup", + "CosineAnnealingWithWarmupDecay", + "LinearAnnealingWithWarmupDecay", +] + + +def is_integer(number): + return isinstance(number, int) + + +class CosineAnnealingWithWarmupDecay(LRScheduler): + def __init__(self, max_lr, min_lr, warmup_step, decay_step, last_epoch=-1, verbose=False): + self.decay_step = decay_step + self.warmup_step = warmup_step + self.max_lr = max_lr + self.min_lr = min_lr + super(CosineAnnealingWithWarmupDecay, self).__init__(max_lr, last_epoch, verbose) + + def get_lr(self): + if self.warmup_step > 0 and self.last_epoch <= self.warmup_step: + return float(self.max_lr) * (self.last_epoch) / self.warmup_step + + if self.last_epoch > self.decay_step: + return self.min_lr + + num_step_ = self.last_epoch - self.warmup_step + decay_step_ = self.decay_step - self.warmup_step + decay_ratio = float(num_step_) / float(decay_step_) + coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + return self.min_lr + coeff * (self.max_lr - self.min_lr) + + +class LinearAnnealingWithWarmupDecay(LRScheduler): + def __init__(self, max_lr, min_lr, warmup_step, decay_step, last_epoch=-1, verbose=False): + + self.decay_step = decay_step + self.warmup_step = warmup_step + self.max_lr = max_lr + self.min_lr = min_lr + super(LinearAnnealingWithWarmupDecay, self).__init__(max_lr, last_epoch, verbose) + + def get_lr(self): + if self.warmup_step > 0 and self.last_epoch <= self.warmup_step: + return float(self.max_lr) * (self.last_epoch) / self.warmup_step + + if self.last_epoch > self.decay_step: + return self.min_lr + + num_step_ = self.last_epoch - self.warmup_step + decay_step_ = self.decay_step - self.warmup_step + decay_ratio = float(num_step_) / float(decay_step_) + coeff = 1.0 - decay_ratio + return self.min_lr + coeff * (self.max_lr - self.min_lr) + + +class LinearDecayWithWarmup(LambdaDecay): + """ + Creates a learning rate scheduler, which increases learning rate linearly + from 0 to given `learning_rate`, after this warmup period learning rate + would be decreased linearly from the base learning rate to 0. + + Args: + learning_rate (float): + The base learning rate. It is a python float number. + total_steps (int): + The number of training steps. + warmup (int or float): + If int, it means the number of steps for warmup. If float, it means + the proportion of warmup in total training steps. + last_epoch (int, optional): + The index of last epoch. It can be set to restart training. If + None, it means initial learning rate. + Defaults to -1. + verbose (bool, optional): + If True, prints a message to stdout for each update. + Defaults to False. + + Examples: + + .. 
code-block:: python

+            from paddlenlp.transformers import LinearDecayWithWarmup
+            lr, warmup_steps, max_steps = 0.1, 100, 1000
+            lr_scheduler = LinearDecayWithWarmup(lr, max_steps, warmup_steps)
+
+    """
+
+    def __init__(self, learning_rate, total_steps, warmup, last_epoch=-1, verbose=False):
+        warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps))
+
+        def lr_lambda(current_step):
+            if current_step < warmup_steps:
+                return float(current_step) / float(max(1, warmup_steps))
+            return max(0.0, float(total_steps - current_step) / float(max(1, total_steps - warmup_steps)))
+
+        super(LinearDecayWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose)
+
+
+class ConstScheduleWithWarmup(LambdaDecay):
+    """
+    Creates a learning rate scheduler, which increases learning rate linearly
+    from 0 to given `learning_rate` during warmup periods and keeps learning
+    rate a constant after that.
+
+    Args:
+        learning_rate (float):
+            The base learning rate. It is a python float number.
+        warmup (int or float):
+            If int, it means the number of steps for warmup. If float, it means
+            the proportion of warmup in total training steps.
+        total_steps (int, optional):
+            The number of training steps. If `warmup` is a float number,
+            `total_steps` must be provided.
+            Defaults to None.
+        last_epoch (int, optional):
+            The index of last epoch. It can be set to restart training. If
+            None, it means initial learning rate.
+            Defaults to -1.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddlenlp.transformers import ConstScheduleWithWarmup
+            lr, warmup_steps = 0.1, 100
+            lr_scheduler = ConstScheduleWithWarmup(lr, warmup_steps)
+
+    """
+
+    def __init__(self, learning_rate, warmup, total_steps=None, last_epoch=-1, verbose=False):
+        if is_integer(warmup):
+            warmup_steps = warmup
+        elif total_steps:
+            warmup_steps = int(math.floor(warmup * total_steps))
+        else:
+            raise ValueError(
+                "Please provide total steps if `warmup` is a float number, or provide integer for argument `warmup`."
+            )
+
+        def lr_lambda(current_step):
+            if current_step < warmup_steps:
+                return float(current_step) / float(max(1.0, warmup_steps))
+            return 1.0
+
+        super(ConstScheduleWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose)
+
+
+class CosineDecayWithWarmup(LambdaDecay):
+    """
+    Creates a learning rate scheduler, which increases learning rate linearly
+    from 0 to given `learning_rate`, after this warmup period learning rate
+    would be decreased following the values of the cosine function. If
+    `with_hard_restarts` is True, the cosine function could have several hard
+    restarts.
+
+    Args:
+        learning_rate (float):
+            The base learning rate. It is a python float number.
+        total_steps (int):
+            The number of training steps.
+        warmup (int or float):
+            If int, it means the number of steps for warmup. If float, it means
+            the proportion of warmup in total training steps.
+        with_hard_restarts (bool):
+            Whether cosine function has several hard restarts.
+            Defaults to False.
+        num_cycles (int or float, optional):
+            If `with_hard_restarts` is False, it means the number of waves in
+            cosine scheduler and should be an integer number and defaults to 1.
+            If `with_hard_restarts` is True, it means the number of hard
+            restarts to use and should be a float number and defaults to be 0.5.
+            Defaults to None.
+        last_epoch (int, optional):
+            The index of last epoch. It can be set to restart training.
If + None, it means initial learning rate. + Defaults to -1. + + Examples: + + .. code-block:: python + + from paddlenlp.transformers import CosineDecayWithWarmup + lr, warmup_steps, max_steps = 0.1, 100, 1000 + lr_scheduler = CosineDecayWithWarmup(lr, max_steps, warmup_steps) + + """ + + def __init__( + self, + learning_rate, + total_steps, + warmup, + with_hard_restarts=False, + num_cycles=None, + last_epoch=-1, + verbose=False, + ): + warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps)) + # Input check + if num_cycles is not None: + assert ( + not with_hard_restarts + and isinstance(num_cycles, int) + or with_hard_restarts + and isinstance(num_cycles, float) + ), "`num_circles` should be an integer while `with_hard_restarts` is False, an float while `with_hard_restarts` is True." + else: + num_cycles = 1 if not with_hard_restarts else 0.5 + + def lr_lambda(current_step): + if current_step < warmup_steps: + return float(current_step) / float(max(1, warmup_steps)) + + progress = float(current_step - warmup_steps) / float(max(1, total_steps - warmup_steps)) + + if with_hard_restarts: + if progress >= 1.0: + return 0.0 + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) + + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) + + super(CosineDecayWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose) + + +class PolyDecayWithWarmup(LambdaDecay): + """ + Creates a learning rate scheduler, which increases learning rate linearly + from 0 to given `lr_init`, after this warmup period learning rate would + be decreased as a polynomial decay from the base learning rate to the end + learning rate `lr_end`. + + Args: + learning_rate (float): + The base learning rate. It is a python float number. + total_steps (int): + The number of training steps. + warmup (int or float): + If int, it means the number of steps for warmup. If float, it means + the proportion of warmup in total training steps. + lr_end (float, optional): + The end learning rate. + Defaults to 1e-7. + power (float, optional): + Power factor. + Defaults to 1.0. + last_epoch (int, optional): + The index of last epoch. It can be set to restart training. If + None, it means initial learning rate. + Defaults to -1. + + Examples: + + .. code-block:: python + + from paddlenlp.transformers import PolyDecayWithWarmup + lr, lr_end, warmup_steps, max_steps = 0.1, 1e-6, 100, 1000 + lr_scheduler = PolyDecayWithWarmup(lr, max_steps, warmup_steps, lr_end) + + """ + + def __init__(self, learning_rate, total_steps, warmup, lr_end=1e-7, power=1.0, last_epoch=-1, verbose=False): + lr_init = learning_rate + assert ( + lr_init > lr_end + ), f"`lr_end` must be be smaller than `learning_rate`. But `lr_end` is {lr_end} while `learning_rate` is {lr_init}." 
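+        # Editor's note (illustrative): with power=1.0 the schedule reduces to a
+        # linear decay from `learning_rate` to `lr_end`; e.g. learning_rate=0.1,
+        # lr_end=1e-6, warmup=100, total_steps=1000 gives lr(550) ~= 0.05. Past
+        # total_steps the lambda returns lr_end / lr_init, so the effective rate
+        # never drops below lr_end.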
+ warmup_steps = warmup if is_integer(warmup) else int(math.floor(warmup * total_steps)) + + def lr_lambda(current_step): + if current_step < warmup_steps: + return float(current_step) / float(max(1, warmup_steps)) + elif current_step > total_steps: + return lr_end / lr_init # it multiplies by lr_init equals to lr_end + else: + lr_range = lr_init - lr_end + decay_steps = total_steps - warmup_steps + pct_remaining = 1 - (current_step - warmup_steps) / decay_steps + decay = lr_range * pct_remaining**power + lr_end + return decay / lr_init # it multiplies by lr_init equals to decay + + super(PolyDecayWithWarmup, self).__init__(lr_init, lr_lambda, last_epoch, verbose) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/configuration.py new file mode 100644 index 000000000..e72efdec4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/configuration.py @@ -0,0 +1,156 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Pegasus model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +from ...utils.log import logger + +__all__ = ["PEGASUS_PRETRAINED_INIT_CONFIGURATION", "PegasusConfig"] + +PEGASUS_PRETRAINED_INIT_CONFIGURATION = {} + + +class PegasusConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to instantiate a PEGASUS + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the PEGASUS pegasus-238M architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + vocab_size (`int`, optional): + Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PegasusModel`]. Default to 50000. + d_model (`int`, optional): + Dimensionality of the layers and the pooler layer. Default to 1024 + encoder_layers (`int`, optional): + Number of encoder layers. Default to 12. + decoder_layers (`int`, optional): + Number of decoder layers. Default to 12. + encoder_attention_heads (`int`, optional): + Number of attention heads for each attention layer in the Transformer encoder. Default to 12. + decoder_attention_heads (`int`, optional): + Number of attention heads for each attention layer in the Transformer decoder. Default to 12. + decoder_ffn_dim (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. Default to 3072. + encoder_ffn_dim (`int`, optional): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. Default to 3072. + activation_function (`str` or `function`, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions are supported. + Default to `"relu"`. + dropout (`float`, optional): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. Default to 0.1. + attention_dropout (`float`, optional): + The dropout ratio for the attention probabilities. Default to 0.1. + activation_dropout (`float`, optional): + The dropout ratio for activations inside the fully connected layer. Default to 0.1. + max_position_embeddings (`int`, optional): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). Default to 1024. + init_std (`float`, optional): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. Default to 0.02. + num_labels (`int`, optional): + The number of labels. Default to 3. + forced_eos_token_id (`int`, optional): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. Default to 1. + scale_embedding (`bool`, optional): + Scale embeddings by diving by sqrt(d_model). Default to `False`. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. 
+ + """ + model_type = "pegasus" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "num_classes": "num_labels", + } + pretrained_init_configuration = PEGASUS_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 50000, + max_position_embeddings: int = 1024, + encoder_layers: int = 12, + encoder_ffn_dim: int = 3072, + encoder_attention_heads: int = 12, + decoder_layers: int = 12, + decoder_ffn_dim: int = 3072, + decoder_attention_heads: int = 12, + activation_function: str = "relu", + d_model: int = 768, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + init_std: float = 0.02, + pad_token_id: int = 0, + bos_token_id: int = 2, + eos_token_id: int = 1, + is_encoder_decoder: bool = True, + decoder_start_token_id: int = 0, + forced_eos_token_id: int = 1, + scale_embedding: bool = True, + use_cache: bool = True, + encoder_layerdrop: float = 0.0, + decoder_layerdrop: float = 0.0, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding + self.use_cache = use_cache + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): + self.forced_bos_token_id = self.bos_token_id + logger.warning( + f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. " + "The config can simply be saved and uploaded again to be fixed." + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/modeling.py new file mode 100644 index 000000000..406f703e0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/modeling.py @@ -0,0 +1,663 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Google Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn as nn +from paddle import Tensor +from paddle.nn import Embedding, MultiHeadAttention + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..model_outputs import ModelOutput +from .configuration import PEGASUS_PRETRAINED_INIT_CONFIGURATION, PegasusConfig + +__all__ = [ + "PegasusModel", + "PegasusPretrainedModel", + "PegasusEncoder", + "PegasusDecoder", + "PegasusForConditionalGeneration", +] + +PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese", + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese", + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese-V1", + "PaddlePaddle/Randeng-Pegasus-238M-Summary-Chinese-SSTIA", + "PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA", +] + +Cache = MultiHeadAttention.Cache +StaticCache = MultiHeadAttention.StaticCache + + +def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + + shifted_input_ids = paddle.where( + shifted_input_ids == -100, paddle.full_like(shifted_input_ids, pad_token_id), shifted_input_ids + ) + return shifted_input_ids + + +class PegasusPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Pegasus models. It provides Pegasus related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = PEGASUS_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = {} + base_model_prefix = "pegasus" + config_class = PegasusConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + if hasattr(layer, "bias"): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, PegasusSinusoidalPositionalEmbedding): + pass + + +class PegasusSinusoidalPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings, embedding_dim): + super().__init__(num_embeddings, embedding_dim) + self.weight = self._init_weight(self.weight) + + @staticmethod + def _init_weight(out): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. 
[dim // 2:] + """ + n_pos, dim = out.shape + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + out.stop_gradient = True + sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 + out[:, 0:sentinel] = np.sin(position_enc[:, 0::2]) + out[:, sentinel:] = np.cos(position_enc[:, 1::2]) + return out + + @paddle.no_grad() + def forward(self, input_ids_shape: Tuple, past_key_values_length: int = 0) -> Tensor: + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = paddle.arange(past_key_values_length, past_key_values_length + seq_len, dtype="int64") + # (gongenlei) For dygraph to static graph + return Embedding.forward(self, positions) + + +class PegasusEncoder(PegasusPretrainedModel): + """ + The Transformer Encoder of PegasusModel. The arguments of PegasusEncoder can see :class:`PegasusModel`. + """ + + def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + + self.encoder_embed_positions = PegasusSinusoidalPositionalEmbedding( + config.max_position_embeddings, config.d_model + ) + + self.encoder_dropout = nn.Dropout(config.dropout) + self.encoder_layernorm = nn.LayerNorm(config.d_model) + encoder_layer = nn.TransformerEncoderLayer( + d_model=config.d_model, + nhead=config.encoder_attention_heads, + dim_feedforward=config.encoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=True, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.encoder_layers) + + def forward(self, input_ids: Optional[Tensor] = None, attention_mask: Optional[Tensor] = None, **kwargs): + """ + The PegasusEncoder forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + See :class:`PegasusModel`. + attention_mask (Tensor, optional): + See :class:`PegasusModel`. + + Returns: + Tensor: Returns tensor `encoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + """ + if input_ids is None: + raise ValueError("Input_ids cannot be None.") + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + inputs_embed_pos = self.encoder_embed_positions(input_ids.shape) + hidden_states = inputs_embeds + inputs_embed_pos + encoder_input = self.encoder_dropout(hidden_states) + + if attention_mask is None: + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + encoder_output = self.encoder(encoder_input, src_mask=attention_mask) + encoder_output = self.encoder_layernorm(encoder_output) + return encoder_output + + +class PegasusDecoder(PegasusPretrainedModel): + """ + The Transformer Decoder of PegasusModel. 
The arguments of PegasusDecoder can see :class:`PegasusModel`. + """ + + def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.init_std = config.init_std + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model) + + self.decoder_embed_positions = PegasusSinusoidalPositionalEmbedding( + config.max_position_embeddings, config.d_model + ) + self.decoder_dropout = nn.Dropout(config.dropout) + self.decoder_layernorm = nn.LayerNorm(config.d_model) + + decoder_layer = nn.TransformerDecoderLayer( + d_model=config.d_model, + nhead=config.decoder_attention_heads, + dim_feedforward=config.decoder_ffn_dim, + dropout=config.dropout, + activation=config.activation_function, + attn_dropout=config.attention_dropout, + act_dropout=config.activation_dropout, + normalize_before=True, + ) + self.decoder = nn.TransformerDecoder(decoder_layer, config.decoder_layers) + + def forward( + self, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + memory_mask: Optional[Tensor] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + x: Optional[Tensor] = None, + mix_ratio: Optional[float] = 0, + ): + """ + The PegasusDecoder forward method, overrides the `__call__()` special method. + + Args: + decoder_input_ids (Tensor, optional): + See :class:`PegasusModel`. + decoder_attention_mask (Tensor, optional): + See :class:`PegasusModel`. + encoder_output (Tensor, optional): + See :class:`PegasusModel`. + memory_mask (Tensor, optional): + See :class:`PegasusModel`. + cache (Tensor, optional): + See :class:`PegasusModel`. + x (Tensor, optional): + The synthetic decoder input embedding of SSTIA strategy. + Its data type should be `float32` and it has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to `None`, which means don't use SSTIA strategy. + mix_ratio (float, optional): + The mixing ratio of synthetic decoder embedding and general deocder input embedding. + If SSTIA strategy is used, this arg should be set in (0,1). + Defaults to `0`, which means don't use synthetic decoder embedding. + + + Returns: + Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. 
+ + """ + if decoder_attention_mask is None: + decoder_length = decoder_input_ids.shape[-1] + decoder_attention_mask = paddle.tensor.triu( + (paddle.full((decoder_length, decoder_length), -np.inf, dtype=paddle.get_default_dtype())), 1 + ) + + if x is None: + decoder_inputs_embeds = self.embed_tokens(decoder_input_ids) * self.embed_scale + else: + decoder_inputs_embeds = self.embed_tokens( + decoder_input_ids + ) * self.embed_scale * mix_ratio + self.embed_scale * x * (1 - mix_ratio) + + past_key_values_length = cache[0][0].k.shape[2] if cache is not None else 0 + decoder_inputs_embed_pos = self.decoder_embed_positions(decoder_input_ids.shape, past_key_values_length) + hidden_states = decoder_inputs_embeds + decoder_inputs_embed_pos + decoder_input = self.decoder_dropout(hidden_states) + + decoder_output = self.decoder( + tgt=decoder_input, + memory=encoder_output, + tgt_mask=decoder_attention_mask, + memory_mask=memory_mask, + cache=cache, + ) + if cache is not None: + new_cache = decoder_output[1] + decoder_output = decoder_output[0] + else: + new_cache = None + decoder_output = self.decoder_layernorm(decoder_output) + return decoder_output, new_cache + + +@register_base_model +class PegasusModel(PegasusPretrainedModel): + r""" + The bare Pegasus Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`PegasusConfig`): + An instance of PegasusConfig used to construct PegasusModel. + """ + + def __init__(self, config: PegasusConfig): + super().__init__(config) + self.init_std = config.init_std + self.pad_token_id = config.pad_token_id + self.decoder_start_token_id = config.decoder_start_token_id + self.shared = nn.Embedding(config.vocab_size, config.d_model) + self.encoder = PegasusEncoder(config, self.shared) + self.decoder = PegasusDecoder(config, self.shared) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + ): + r""" + The PegasusModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. 
+ When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + decoder_input_ids (Tensor, optional): + Indices of decoder input sequence tokens in the vocabulary. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means no `decoder_input_ids` is provided, the model will create the tensor + by shifting the `input_ids` to the right. + decoder_attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions in `decoder_input_ids`. + Its data type and shape is the same as `attention_mask`. Defaults to `None`. + encoder_output (tuple, optional): + The output of the encoder, a tuple consists `last_hidden_state`, `hidden_states`(optional), `attentions`(optional). + The data type of `last_hidden_state` is float32 and its shape is `[batch_size, sequence_length, hidden_size]`. + `hidden_states` is hidden_states of all layers in the Transformer encoder. The length of `hidden_states` is `num_hidden_layers + 1`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, sequence_length, hidden_size`]. + `attentions` is attentions of all layers of in the Transformer encoder. The length of `attentions` is `num_hidden_layers`. + For all element in the tuple, its data type should be float32 and its shape is [`batch_size, num_attention_heads, sequence_length, sequence_length`]. + use_cache (bool, optional): + Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and + can be used to speed up decoding. + cache (list, optional): + It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. + See `TransformerDecoder.gen_cache `__ for more details. + It is only used for inference and should be None for training. + Default to `None`. + + Returns: + Tensor: Returns tensor `decoder_output`, which is the output at the last layer of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, hidden_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import PegasusModel, PegasusTokenizer + + tokenizer = PegasusTokenizer.from_pretrained(pegasus_path) + model = PegasusModel.from_pretrained(pegasus_path) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + if input_ids is None and encoder_output is None: + raise ValueError("You have to specify either input_ids or encoder_output") + if decoder_input_ids is None: + assert input_ids is not None, "input_ids should be " "specified when generating decoder_input_ids" + decoder_input_ids = shift_tokens_right(input_ids, self.pad_token_id, self.decoder_start_token_id) + if attention_mask is None: + assert input_ids is not None, "input_ids should be " "specified when generating attention_mask" + attention_mask = ( + paddle.cast(input_ids == self.pad_token_id, dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4 + ) + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + if encoder_output is None: + encoder_output = self.encoder(input_ids, attention_mask) + if decoder_attention_mask is not None and decoder_attention_mask.ndim == 2: + decoder_attention_mask = paddle.unsqueeze(decoder_attention_mask, axis=[1, 2]).astype( + paddle.get_default_dtype() + ) + decoder_attention_mask = (1.0 - decoder_attention_mask) * -1e4 + decoder_attention_mask.stop_gradient = True + + if use_cache: + if cache is None: + cache = self.decoder.decoder.gen_cache(encoder_output) + else: + cache = None + + memory_mask = attention_mask + if attention_mask is not None: + if attention_mask.ndim == 4: + memory_mask = attention_mask[:, :, -1:, :] + elif attention_mask.ndim == 3: + memory_mask = attention_mask[:, -1:, :].unsqueeze([1]) + elif attention_mask.ndim == 2: + memory_mask = attention_mask.unsqueeze([1, 2]) + else: + raise ValueError("Invalid attention mask shape. ") + + decoder_output, new_cache = self.decoder( + decoder_input_ids, decoder_attention_mask, encoder_output, memory_mask, cache + ) + return decoder_output, new_cache, encoder_output, attention_mask + + +class PegasusForConditionalGeneration(PegasusPretrainedModel): + r""" + Pegasus Model with a `language modeling` head on top. + + Args: + config (:class:`PegasusConfig`): + An instance of PegasusConfig used to construct PegasusForConditionalGeneration. 
+ """ + + def __init__(self, config: PegasusConfig): + super().__init__(config) + self.pegasus = PegasusModel(config) + self.lm_head_weight = self.create_parameter( + shape=[config.vocab_size, config.d_model], + dtype=self.pegasus.shared.weight.dtype, + is_bias=False, + ) + if hasattr(self, "final_logits_bias") and "final_logits_bias" not in self._buffers: + self.final_logits_bias = paddle.zeros((1, config.vocab_size)) + else: + self.register_buffer("final_logits_bias", paddle.zeros((1, config.vocab_size))) + self.use_SSTIA = False + self.mix_ratio = 0 + + def get_encoder(self): + return self.pegasus.get_encoder() + + def get_decoder(self): + return self.pegasus.get_decoder() + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterPegasus + + decode_strategy = kwargs.get("decode_strategy") + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decoding_lib = kwargs.get("decoding_lib", None) + enable_fast_encoder = kwargs.get("enable_fast_encoder", True) + if decode_strategy == "sampling" and kwargs.get("top_k") != 0 and kwargs.get("top_p") != 1: + raise AttributeError( + "Only topk sampling or topp sampling are supported. " + "Topk sampling and topp sampling cannot be both applied in the fast version." + ) + if kwargs["repetition_penalty"] != 1.0: + # not support for repetition_penalty yet in the fast version + raise AttributeError("'repetition_penalty != 1' is not supported yet in the fast version") + self._fast_entry = FasterPegasus( + self, + use_fp16_decoding=use_fp16_decoding, + decoding_lib=decoding_lib, + enable_fast_encoder=enable_fast_encoder, + ).forward + return self._fast_entry + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + decoder_input_ids: Optional[Tensor] = None, + decoder_attention_mask: Optional[Tensor] = None, + encoder_output: Union[Tuple[Tensor], ModelOutput, None] = None, + use_cache: Optional[bool] = None, + cache: Optional[List[Tuple[Cache, StaticCache]]] = None, + labels: Optional[Tensor] = None, + ): + r""" + The PegasusForConditionalGeneration forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`PegasusModel`. + attention_mask (Tensor, optional): + See :class:`PegasusModel`. + decoder_input_ids (Tensor, `optional`): + See :class:`PegasusModel`. + decoder_attention_mask (Tensor, optional): + See :class:`PegasusModel`. + encoder_output (Tensor, optonal): + See :class:`PegasusModel`. + use_cache (bool, optional): + See :class:`PegasusModel`. + cache (Tensor, optional): + See :class:`PegasusModel`. + + Returns: + Tensor or tuple: Returns Tensor `lm_logits` if `use_cache` is `False`, otherwise, returns tuple (`lm_logits`, `cache`). + + With the fields: + + - `lm_logits` (Tensor): + The generated sentence of the model. + Its data type should be float32 and has a shape of [batch_size, sequence_length, vocab_size]. + + - `cache` (Tensor): + See :class:`PegasusModel`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import PegasusForConditionalGeneration, PegasusTokenizer + + tokenizer = PegasusTokenizer.from_pretrained(pegasus_path) + model = PegasusForConditionalGeneration.from_pretrained(pegasus_path) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + """ + output, new_cache, encoder_output, attention_mask = self.pegasus( + input_ids, attention_mask, decoder_input_ids, decoder_attention_mask, encoder_output, use_cache, cache + ) + lm_logits = paddle.tensor.matmul(output, self.lm_head_weight, transpose_y=True) + self.final_logits_bias + + if self.use_SSTIA: + assert 0 < self.mix_ratio < 1 + x = lm_logits.clone() + length = len(x[0]) + for idx in range(length - 1, -1, -1): + x[:, idx] = x[:, idx - 1] + x[:, 0, 0] = 2 * paddle.max(x[:, 0]) + x = paddle.nn.functional.softmax(x, axis=2) + + with paddle.no_grad(): + embed_matrix = self.pegasus.decoder.embed_tokens.weight.clone() + decoder_in = paddle.einsum("blv,ve->ble", x, embed_matrix) + + output_new, _ = self.pegasus.decoder( + decoder_input_ids, + decoder_attention_mask, + encoder_output, + attention_mask, + cache, + x=decoder_in, + mix_ratio=self.mix_ratio, + ) + lm_logits_new = ( + paddle.tensor.matmul(output_new, self.lm_head_weight, transpose_y=True) + self.final_logits_bias + ) + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = (1 - self.mix_ratio) * loss_fct( + lm_logits.reshape((-1, self.pegasus.config["vocab_size"])), labels.reshape((-1,)) + ) + masked_lm_loss += self.mix_ratio * loss_fct( + lm_logits_new.reshape((-1, self.pegasus.config["vocab_size"])), labels.reshape((-1,)) + ) + p = paddle.nn.functional.log_softmax(lm_logits_new, axis=2) + q = paddle.nn.functional.softmax(lm_logits, axis=2) + loss_kl = paddle.nn.functional.kl_div(p, q, reduction="mean") + masked_lm_loss += loss_kl + return lm_logits, new_cache, masked_lm_loss + + else: + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + masked_lm_loss = loss_fct( + lm_logits.reshape((-1, self.pegasus.config["vocab_size"])), labels.reshape((-1,)) + ) + + return lm_logits, new_cache, masked_lm_loss + + def prepare_decoder_input_ids_from_labels(self, labels): + return shift_tokens_right(labels, self.pegasus.pad_token_id, self.pegasus.config["decoder_start_token_id"]) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + cache=None, + use_cache=False, + encoder_output=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) + if decoder_attention_mask is not None: + decoder_attention_mask = decoder_attention_mask[:, :, -1, :].unsqueeze(2) + + return { + "input_ids": None, + "decoder_input_ids": decoder_input_ids, + "encoder_output": encoder_output, + "decoder_attention_mask": decoder_attention_mask, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/tokenizer.py new file mode 100644 index 
000000000..abc4a5bec --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/pegasus/tokenizer.py @@ -0,0 +1,376 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The IDEA-CCNL Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os +import re + +import jieba + +from .. import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer +from ..tokenizer_utils import _is_punctuation + +__all__ = ["PegasusChineseTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese": 1024, + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese": 1024, + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese-V1": 1024, + "PaddlePaddle/Randeng-Pegasus-238M-Summary-Chinese-SSTIA": 1024, + "PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA": 1024, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) + or (cp >= 0x20000 and cp <= 0x2A6DF) + or (cp >= 0x2A700 and cp <= 0x2B73F) + or (cp >= 0x2B740 and cp <= 0x2B81F) + or (cp >= 0x2B820 and cp <= 0x2CEAF) + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) + ): + return True + + return False + + +class PegasusChineseTokenizer(PretrainedTokenizer): + r""" + Construct a Pegasus tokenizer. Based on WordPiece. + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + + Examples: + .. code-block:: + + from paddlenlp.transformers import PegasusChineseTokenizer + + tokenizer = PegasusChineseTokenizer.from_pretrained('IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese') + print(tokenizer('欢迎使用PaddleNLP')) + + ''' + {'input_ids': [22355, 8994, 35941, 48563, 49375, 48877, 1], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1]} + ''' + + """ + resource_files_names = {"vocab_file": "vocab.txt"} + pretrained_resource_files_map = { + "vocab_file": { + "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese": "", + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese": "", + "IDEA-CCNL/Randeng-Pegasus-523M-Summary-Chinese-V1": "", + "PaddlePaddle/Randeng-Pegasus-238M-Summary-Chinese-SSTIA": "", + "PaddlePaddle/Randeng-Pegasus-523M-Summary-Chinese-SSTIA": "", + }, + } + pretrained_init_configuration = {} + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + pad_token="", + eos_token="", + unk_token="", + mask_token="", + mask_token_sent="", + additional_special_tokens=None, + sep_token="[SEP]", + cls_token="[CLS]", + tokenize_chinese_chars=True, + strip_accents=None, + offset=100, + **kwargs + ): + + self.offset = offset + + if additional_special_tokens is not None: + if not isinstance(additional_special_tokens, list): + raise TypeError( + f"additional_special_tokens should be of type {type(list)}, \ + but is {type(additional_special_tokens)}" + ) + + additional_special_tokens_extended = ( + ([mask_token_sent] + additional_special_tokens) + if mask_token_sent not in additional_special_tokens and mask_token_sent is not None + else additional_special_tokens + ) + + # fill additional tokens with ..., in case not all additional tokens are already taken + additional_special_tokens_extended += [ + f"" for i in range(len(additional_special_tokens_extended), self.offset - 1) + ] + + if len(set(additional_special_tokens_extended)) != 
len(additional_special_tokens_extended): + raise ValueError( + f"Please make sure that the provided additional_special_tokens \ + do not contain an incorrectly shifted list of tokens. \ + Found {additional_special_tokens_extended}." + ) + additional_special_tokens = additional_special_tokens_extended + else: + additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. \ + To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + eos_token=eos_token, + tokenize_chinese_chars=tokenize_chinese_chars, + additional_special_tokens=additional_special_tokens, + strip_accents=strip_accents, + **kwargs, + ) + + # Function object isn't serializable + self.pre_tokenizer = lambda x: jieba.cut(x, HMM=False) + self.mask_token_sent = mask_token_sent + self.vocab = load_vocab(vocab_file) + + self.vocab[self.eos_token] = self.vocab.pop("[unused1]") + self.vocab[self.pad_token] = self.vocab.pop("[PAD]") + self.vocab[self.unk_token] = self.vocab.pop("[UNK]") + + if self.mask_token_sent is not None: + self.vocab[self.mask_token] = self.vocab.pop("[unused3]") + self.vocab[self.mask_token_sent] = self.vocab.pop("[unused2]") + + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + for text in self.pre_tokenizer(text): + if text in self.vocab: + split_tokens.append(text) + else: + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + @staticmethod + def _cjk_punctuation(): + return "\uff02\uff03\uff04\uff05\uff06\uff07\uff08\uff09\uff0a\uff0b\uff0c\uff0d\uff0f\uff1a\uff1b\uff1c\uff1d\ + \uff1e\uff20\uff3b\uff3c\uff3d\uff3e\uff3f\uff40\uff5b\uff5c\uff5d\uff5e\uff5f\uff60\uff62\ + \uff63\uff64\u3000\u3001\u3003\u3008\u3009\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011\u3014\ + 
\u3015\u3016\u3017\u3018\u3019\u301a\u301b\u301c\u301d\u301e\u301f\u3030\u303e\u303f\u2013\u2014\ + \u2018\u2019\u201b\u201c\u201d\u201e\u201f\u2026\u2027\ufe4f\ufe51\ufe54\u00b7\uff01\uff1f\uff61\u3002" + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. + Args: + ids (`int` or `List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + Returns: + `str` or `List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids and index != 2: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # for token in + # tokens = tokens or self.ids_to_tokens(ids) + # tokens = [token for token in tokens if not self._is_special(token)] + + text = "" + for i, token in enumerate(tokens): + if token[:2] == "##": + text += token[2:] + elif len(token) == 1 and _is_chinese_char(ord(token)): + text += token + elif len(token) == 1 and _is_punctuation(token): + text += token + text += " " + elif i > 0 and _is_chinese_char(ord(text[-1])): + text += token + elif tokens == "": + continue + else: + text += " " + text += token + + text = re.sub(" +", " ", text) + text = re.sub("' (re|m|s|t|ve|d|ll) ", "'\\1 ", text) + punctuation = re.sub(" +", "", self._cjk_punctuation()).strip() + "+-/={(<[" + punctuation_regex = "|".join([re.escape(p) for p in punctuation]) + punctuation_regex = "(%s) " % punctuation_regex + text = re.sub(punctuation_regex, "\\1", text) + text = re.sub(r"(\d\.) (\d)", "\\1\\2", text) + + return text.strip() + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification + tasks by concatenating and adding special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + def _special_token_mask(self, seq): + all_special_ids = set(self.all_special_ids) # call it once instead of inside list comp + all_special_ids.remove(self.unk_token_id) # is only sometimes special + + return [1 if x in all_special_ids else 0 for x in seq] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is + called when adding special tokens using the tokenizer ``encode`` methods. 
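The `convert_tokens_to_string` rules above (merge `##` word pieces into the previous token, keep Chinese characters unspaced, space-separate other tokens) are easier to follow in a stripped-down form. A simplified illustrative sketch (not part of the patch) that keeps only those rules and checks just the main CJK block; the real method additionally normalizes CJK punctuation with regular expressions:

    def _is_cjk(ch):
        # Reduced check: only the main CJK Unified Ideographs block.
        return 0x4E00 <= ord(ch) <= 0x9FFF

    def join_wordpieces(tokens):
        text = ""
        for token in tokens:
            if token.startswith("##"):                 # word piece: glue to the previous token
                text += token[2:]
            elif len(token) == 1 and _is_cjk(token):   # Chinese characters carry no surrounding spaces
                text += token
            elif text and _is_cjk(text[-1]):           # token following a CJK character: no leading space
                text += token
            else:
                text += " " + token
        return text.strip()

    print(join_wordpieces(["Paddle", "##NLP", "欢", "迎"]))  # -> "PaddleNLP欢迎"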
+ """ + if already_has_special_tokens: + return self._special_token_mask(token_ids_0) + elif token_ids_1 is None: + return self._special_token_mask(token_ids_0) + [self.eos_token_id] + else: + return self._special_token_mask(token_ids_0 + token_ids_1) + [self.eos_token_id] + + def num_special_tokens_to_add(self, pair=False): + """Just EOS""" + return 1 + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return offset_mapping_0 + [(0, 0)] + + return offset_mapping_0 + offset_mapping_1 + [(0, 0)] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/configuration.py new file mode 100644 index 000000000..70330b10e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/configuration.py @@ -0,0 +1,151 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
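Before the PPMiniLM files that follow, one note on the Pegasus tokenizer above: unlike BERT-style tokenizers it appends a single EOS token and nothing else, which is why `num_special_tokens_to_add` returns 1. A small illustrative sketch (not part of the patch) with made-up token ids:

    eos_token_id = 1          # assumed id, for illustration only
    ids_a = [22355, 8994]
    ids_b = [35941, 48563]

    def build_inputs(ids_0, ids_1=None, eos=eos_token_id):
        # Mirrors build_inputs_with_special_tokens: one EOS closes the sequence or pair.
        if ids_1 is None:
            return ids_0 + [eos]
        return ids_0 + ids_1 + [eos]

    print(build_inputs(ids_a))         # [22355, 8994, 1]
    print(build_inputs(ids_a, ids_b))  # [22355, 8994, 35941, 48563, 1]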
+""" PPMiniLM model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["PPMINILM_PRETRAINED_INIT_CONFIGURATION", "PPMiniLMConfig", "PPMINILM_PRETRAINED_RESOURCE_FILES_MAP"] + +PPMINILM_PRETRAINED_INIT_CONFIGURATION = { + "ppminilm-6l-768h": { + "attention_probs_dropout_prob": 0.1, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, +} + +PPMINILM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "ppminilm-6l-768h": "https://bj.bcebos.com/paddlenlp/models/transformers/ppminilm-6l-768h/ppminilm-6l-768h.pdparams", + }, +} + + +class PPMiniLMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PPMiniLMModel`]. It is used to + instantiate a PPMiniLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the PPMiniLM ppminilm-6l-768h architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 21128): + Vocabulary size of the PPMiniLM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PPMiniLMModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`PPMiniLMModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. 
For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from paddlenlp.transformers import PPMiniLMModel, PPMiniLMConfig + + >>> # Initializing a PPMiniLM ppminilm-6l-768h style configuration + >>> configuration = PPMiniLMConfig() + + >>> # Initializing a model from the ppminilm-6l-768h style configuration + >>> model = PPMiniLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "ppminilm" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = PPMINILM_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 21128, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range=0.02, + pad_token_id: int = 0, + do_lower_case: bool = True, + is_split_into_words: bool = False, + max_seq_len: int = 128, + pad_to_max_seq_len: bool = False, + layer_norm_eps: float = 1e-12, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.do_lower_case = do_lower_case + self.max_seq_len = max_seq_len + self.is_split_into_words = is_split_into_words + self.pad_token_id = pad_token_id + self.pad_to_max_seq_len = pad_to_max_seq_len + self.initializer_range = initializer_range + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/modeling.py new file mode 100644 index 000000000..5cb405507 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/modeling.py @@ -0,0 +1,442 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
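Every constructor argument of `PPMiniLMConfig` above maps onto a plain attribute, so a smaller encoder can be described by overriding a few of them. An illustrative sketch (not part of the patch); the sizes below are arbitrary and the resulting model is randomly initialized rather than pretrained:

    from paddlenlp.transformers import PPMiniLMConfig, PPMiniLMModel

    config = PPMiniLMConfig(
        vocab_size=21128,
        hidden_size=384,          # arbitrary example sizes
        num_hidden_layers=6,
        num_attention_heads=12,
        intermediate_size=1536,
        hidden_dropout_prob=0.1,
    )
    model = PPMiniLMModel(config)   # random weights; use from_pretrained(...) for the released checkpoint
    print(config.hidden_size, config.num_hidden_layers)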
+ +import paddle +import paddle.nn as nn + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from .configuration import ( + PPMINILM_PRETRAINED_INIT_CONFIGURATION, + PPMINILM_PRETRAINED_RESOURCE_FILES_MAP, + PPMiniLMConfig, +) + +__all__ = [ + "PPMiniLMModel", + "PPMiniLMPretrainedModel", + "PPMiniLMForSequenceClassification", + "PPMiniLMForQuestionAnswering", + "PPMiniLMForMultipleChoice", +] + + +class PPMiniLMEmbeddings(nn.Layer): + r""" + Include embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config: PPMiniLMConfig): + super(PPMiniLMEmbeddings, self).__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + if position_ids is None: + # maybe need use shape op to unify static graph and dynamic graph + # seq_length = input_ids.shape[1] + ones = paddle.ones_like(input_ids, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + position_ids.stop_gradient = True + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + input_embedings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings = input_embedings + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class PPMiniLMPooler(nn.Layer): + def __init__(self, config: PPMiniLMConfig): + super(PPMiniLMPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class PPMiniLMPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained PPMiniLM models. It provides PPMiniLM related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + Refer to :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
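`PPMiniLMEmbeddings.forward` above derives default position ids with a cumulative sum so that the same code works for static and dynamic shapes. A minimal illustrative sketch (not part of the patch) of that construction:

    import paddle

    input_ids = paddle.to_tensor([[101, 2769, 3221, 102]])

    # Mirrors the embedding layer: ones -> running count per row -> 0-based positions.
    ones = paddle.ones_like(input_ids, dtype="int64")
    position_ids = paddle.cumsum(ones, axis=1) - ones
    print(position_ids.numpy())   # [[0 1 2 3]]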
+ + """ + model_config_file = CONFIG_NAME + config_class = PPMiniLMConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "ppminilm" + + pretrained_init_configuration = PPMINILM_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = PPMINILM_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class PPMiniLMModel(PPMiniLMPretrainedModel): + r""" + The bare PPMiniLM Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`PPMiniLMConfig`): + An instance of PPMiniLMConfig used to construct PPMiniLMModel. + + """ + + def __init__(self, config: PPMiniLMConfig): + super(PPMiniLMModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.embeddings = PPMiniLMEmbeddings(config) + + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0.0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = PPMiniLMPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + If `input_ids` is a Tensor object, it is an indices of input + sequence tokens in the vocabulary. They are numerical + representations of tokens that build the input sequence. It's + data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, string, optional): + If `token_type_ids` is a Tensor object: + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `[batch_size, num_tokens]` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. 
+ Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + We use whole-word-mask in PPMiniLM, so the whole word will have the same value. For example, "使用" as a word, + "使" and "用" will have the same value. + Defaults to `None`, which means nothing needed to be prevented attention to. + + Returns: + tuple: Returns tuple (``sequence_output``, ``pooled_output``). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import PPMiniLMModel, PPMiniLMTokenizer + + tokenizer = PPMiniLMTokenizer.from_pretrained('ppminilm-6l-768h') + model = PPMiniLMModel.from_pretrained('ppminilm-6l-768h') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + sequence_output, pooled_output = model(**inputs) + + """ + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + + encoder_outputs = self.encoder(embedding_output, attention_mask) + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return sequence_output, pooled_output + + +class PPMiniLMForSequenceClassification(PPMiniLMPretrainedModel): + r""" + PPMiniLM Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + ppminilm (PPMiniLMModel): + An instance of `paddlenlp.transformers.PPMiniLMModel`. + num_classes (int, optional): + The number of classes. Default to `2`. + dropout (float, optional): + The dropout probability for output of PPMiniLM. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.PPMiniLMModel` instance. Defaults to `None`. 
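`PPMiniLMForSequenceClassification` returns raw logits of shape `[batch_size, num_classes]`, so probabilities and predicted labels are left to the caller. A short illustrative sketch (not part of the patch) with made-up logit values:

    import paddle
    import paddle.nn.functional as F

    logits = paddle.to_tensor([[1.2, -0.3], [0.1, 0.9]])   # toy values standing in for model output
    probs = F.softmax(logits, axis=-1)
    preds = paddle.argmax(probs, axis=-1)
    print(preds.numpy())   # [0 1]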
+ """ + + def __init__(self, config: PPMiniLMConfig): + super(PPMiniLMForSequenceClassification, self).__init__(config) + self.ppminilm = PPMiniLMModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`PPMiniLMModel`. + token_type_ids (Tensor, optional): + See :class:`PPMiniLMModel`. + position_ids (Tensor, optional): + See :class:`PPMiniLMModel`. + attention_mask (Tensor, optional): + See :class:`MiniLMModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import PPMiniLMForSequenceClassification, PPMiniLMTokenizer + + tokenizer = PPMiniLMTokenizer.from_pretrained('ppminilm-6l-768h') + model = PPMiniLMForSequenceClassification.from_pretrained('ppminilm-6l-768h0') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + _, pooled_output = self.ppminilm( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return logits + + +class PPMiniLMForQuestionAnswering(PPMiniLMPretrainedModel): + """ + PPMiniLM Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD. + + Args: + ppminilm (`PPMiniLMModel`): + An instance of `PPMiniLMModel`. + """ + + def __init__(self, config: PPMiniLMConfig): + super(PPMiniLMForQuestionAnswering, self).__init__(config) + self.ppminilm = PPMiniLMModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`PPMiniLMModel`. + token_type_ids (Tensor, optional): + See :class:`PPMiniLMModel`. + position_ids (Tensor, optional): + See :class:`PPMiniLMModel`. + attention_mask (Tensor, optional): + See :class:`PPMiniLMModel`. + + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import PPMiniLMForQuestionAnswering, PPMiniLMTokenizer + + tokenizer = PPMiniLMTokenizer.from_pretrained('ppminilm-6l-768h') + model = PPMiniLMForQuestionAnswering.from_pretrained('ppminilm-6l-768h') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + """ + + sequence_output, _ = self.ppminilm( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + return start_logits, end_logits + + +class PPMiniLMForMultipleChoice(PPMiniLMPretrainedModel): + """ + PPMiniLM Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + ppminilm (:class:`PPMiniLMModel`): + An instance of PPMiniLMModel. + num_choices (int, optional): + The number of choices. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of PPMiniLM. + If None, use the same value as `hidden_dropout_prob` of `PPMiniLMModel` + instance `ppminilm`. Defaults to None. + """ + + def __init__(self, config: PPMiniLMConfig): + super(PPMiniLMForMultipleChoice, self).__init__(config) + self.num_choices = config.num_choices + self.ppminilm = PPMiniLMModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The PPMiniLMForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`PPMiniLMModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`PPMiniLMModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`PPMiniLMModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`PPMiniLMModel` and shape as [batch_size, num_choice, sequence_length]. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. 
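The multiple-choice forward pass described above flattens the choice dimension before running the encoder and restores it on the logits. A shapes-only illustrative sketch (not part of the patch) with arbitrary sizes:

    import paddle

    batch_size, num_choices, seq_len = 2, 4, 16    # arbitrary sizes

    # [batch_size, num_choices, seq_len] -> [batch_size * num_choices, seq_len]
    input_ids = paddle.randint(0, 21128, shape=[batch_size, num_choices, seq_len])
    flat_input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1]))
    print(flat_input_ids.shape)        # [8, 16]

    # The per-choice scores come back as [batch_size * num_choices, 1] from the
    # Linear(hidden_size, 1) head and are folded back to [batch_size, num_choices].
    logits = paddle.randn([batch_size * num_choices, 1])
    reshaped_logits = logits.reshape(shape=(-1, num_choices))
    print(reshaped_logits.shape)       # [2, 4]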
+ + """ + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + _, pooled_output = self.ppminilm( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + return reshaped_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/tokenizer.py new file mode 100644 index 000000000..8309cc644 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ppminilm/tokenizer.py @@ -0,0 +1,308 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from .. import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer + +__all__ = ["PPMiniLMTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"ppminilm-6l-768h": 512} + + +class PPMiniLMTokenizer(PretrainedTokenizer): + r""" + Constructs an PPMiniLM tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. 
This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import PPMiniLMTokenizer + tokenizer = PPMiniLMTokenizer.from_pretrained('ppminilm-6l-768h') + + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs: + # { 'input_ids': [1, 4444, 4385, 1545, 6712, 10062, 9568, 9756, 9500, 2], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + # } + + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "ppminilm-6l-768h": "https://bj.bcebos.com/paddlenlp/models/transformers/ppminilm-6l-768h/vocab.txt", + } + } + pretrained_init_configuration = { + "ppminilm-6l-768h": {"do_lower_case": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = PPMiniLMTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + r""" + End-to-end tokenization for PPMiniM models. + + Args: + text (str): The text to be tokenized. + + Returns: + List[str]: A list of string representing converted tokens. + """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_string(self, tokens): + r""" + Converts a sequence of tokens (list of string) in a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also remove + `##` when converting. + + Args: + tokens (List[str]): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import PPMiniLMTokenizer + tokenizer = PPMiniLMTokenizer.from_pretrained('ppminilm-6l-768h') + + tokens = tokenizer.tokenize('He was a puppeteer') + strings = tokenizer.convert_tokens_to_string(tokens) + #he was a puppeteer + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + r""" + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. + Do not put this inside your training loop. + + Args: + pair (bool, optional): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + An offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. + Defaults to `None`. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + r""" + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). 
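`create_token_type_ids_from_sequences` above produces the 0/1 segment pattern purely from list lengths. A tiny illustrative sketch (not part of the patch) with made-up ids:

    cls_id, sep_id = 101, 102     # assumed ids, for illustration only
    ids_a = [7, 8, 9]
    ids_b = [4, 5]

    # 0s cover "[CLS] A [SEP]", 1s cover "B [SEP]".
    token_type_ids = len([cls_id] + ids_a + [sep_id]) * [0] + len(ids_b + [sep_id]) * [1]
    print(token_type_ids)   # [0, 0, 0, 0, 0, 1, 1, 1]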
+ """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in self.all_special_ids else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.vocab._idx_to_token.get(index, self.unk_token) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/processing_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/processing_utils.py new file mode 100644 index 000000000..f575b0f6d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/processing_utils.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Processing saving/loading class for common processors. +""" + +import os + +import paddlenlp.transformers + + +class ProcessorMixin(object): + """ + This is a mixin used to provide saving/loading functionality for all processor classes. 
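`ProcessorMixin` is configured through an `attributes` list and matching `<attribute>_class` names, which `from_pretrained` later resolves via `getattr` on `paddlenlp.transformers`. A skeletal illustrative sketch (not part of the patch); the two class names are placeholders and would have to name real `paddlenlp.transformers` classes:

    from paddlenlp.transformers.processing_utils import ProcessorMixin

    class MyProcessor(ProcessorMixin):
        # One "<attribute>_class" string per entry in `attributes`; the mixin stores the
        # matching constructor arguments and delegates save/load to each of them.
        attributes = ["feature_extractor", "tokenizer"]
        feature_extractor_class = "MyFeatureExtractor"   # placeholder name
        tokenizer_class = "MyTokenizer"                  # placeholder name

    # MyProcessor(feature_extractor, tokenizer) would keep both objects as attributes;
    # save_pretrained()/from_pretrained() then call the same methods on each in turn.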
+ """ + + attributes = ["feature_extractor", "tokenizer"] + # Names need to be attr_class for attr in attributes + feature_extractor_class = None + tokenizer_class = None + _auto_class = None + + # args have to match the attributes class attribute + def __init__(self, *args, **kwargs): + # Sanitize args and kwargs + for key in kwargs: + if key not in self.attributes: + raise TypeError(f"Unexepcted keyword argument {key}.") + for arg, attribute_name in zip(args, self.attributes): + if attribute_name in kwargs: + raise TypeError(f"Got multiple values for argument {attribute_name}.") + else: + kwargs[attribute_name] = arg + + if len(kwargs) != len(self.attributes): + raise ValueError( + f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got " + f"{len(args)} arguments instead." + ) + + # Check each arg is of the proper class (this will also catch a user initializing in the wrong order) + for attribute_name, arg in kwargs.items(): + setattr(self, attribute_name, arg) + + def __repr__(self): + attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] + attributes_repr = "\n".join(attributes_repr) + return f"{self.__class__.__name__}:\n{attributes_repr}" + + def save_pretrained(self, save_directory, **kwargs): + """ + Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it + can be reloaded using the [`~ProcessorMixin.from_pretrained`] method. + + + + This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the docstrings of the methods + above for more information. + + + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will + be created if it does not exist). + kwargs: + Additional key word arguments. + """ + os.makedirs(save_directory, exist_ok=True) + + for attribute_name in self.attributes: + attribute = getattr(self, attribute_name) + # Include the processor class in the attribute config so this processor can then be reloaded with the + # `AutoProcessor` API. + if hasattr(attribute, "_set_processor_class"): + attribute._set_processor_class(self.__class__.__name__) + attribute.save_pretrained(save_directory) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate a processor associated with a pretrained model. + + + + This class method is simply calling the feature extractor + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and the tokenizer + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the + methods above for more information. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the name of a community-contributed pretrained or built-in pretrained model. + - a path to a *directory* containing a feature extractor file saved using the + [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + **kwargs + Additional keyword arguments passed along to both + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. 
+ """ + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + return cls(*args) + + @classmethod + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + args = [] + for attribute_name in cls.attributes: + class_name = getattr(cls, f"{attribute_name}_class") + attribute_class = getattr(paddlenlp.transformers, class_name) + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + return args + + @property + def model_input_names(self): + first_attribute = getattr(self, self.attributes[0]) + return getattr(first_attribute, "model_input_names", None) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/__init__.py new file mode 100644 index 000000000..6444835b4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import * +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/configuration.py new file mode 100644 index 000000000..ed10d0898 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/configuration.py @@ -0,0 +1,124 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" MBart model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "PROPHETNET_PRETRAINED_INIT_CONFIGURATION", + "PROPHETNET_PRETRAINED_RESOURCE_FILES_MAP", + "ProphetNetConfig", +] + +PROPHETNET_PRETRAINED_INIT_CONFIGURATION = { + "prophetnet-large-uncased": { + "activation_dropout": 0.1, + "activation_function": "gelu", + "attention_dropout": 0.1, + "bos_token_id": 102, + "decoder_ffn_dim": 4096, + "decoder_layerdrop": 0.0, + "decoder_max_position_embeddings": 514, + "decoder_start_token_id": 102, + "disable_ngram_loss": False, + "dropout": 0.1, + "encoder_ffn_dim": 4096, + "encoder_layerdrop": 0.0, + "encoder_max_position_embeddings": 513, + "eos_token_id": 102, + "eps": 0.1, + "hidden_size": 1024, + "init_std": 0.02, + "max_position_embeddings": 512, + "ngram": 2, + "num_buckets": 32, + "num_decoder_attention_heads": 16, + "num_decoder_layers": 12, + "num_encoder_attention_heads": 16, + "num_encoder_layers": 12, + "pad_token_id": 0, + "relative_max_distance": 128, + "length_penalty": 2.0, + "no_repeat_ngram_size": 3, + "num_beams": 4, + "max_length": 142, + "vocab_size": 30522, + }, +} + +PROPHETNET_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "prophetnet-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/prophetnet/prophetnet-large-uncased.pdparams" + } +} + + +class ProphetNetConfig(PretrainedConfig): + + model_type = "prophetnet" + + def __init__( + self, + vocab_size=30522, + bos_token_id=102, + pad_token_id=0, + eos_token_id=102, + hidden_size=1024, + decoder_start_token_id=102, + max_position_embeddings=512, + activation_function="gelu", + activation_dropout=0.1, + dropout=0.1, + relative_max_distance=128, + ngram=2, + num_buckets=32, + encoder_ffn_dim=4096, + num_encoder_attention_heads=16, + num_encoder_layers=12, + decoder_ffn_dim=4096, + num_decoder_attention_heads=16, + num_decoder_layers=12, + attention_dropout=0.1, + init_std=0.02, + eps=0.1, + add_cross_attention=True, + disable_ngram_loss=False, + **kwargs + ): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + self.hidden_size = hidden_size + self.decoder_start_token_id = decoder_start_token_id + self.max_position_embeddings = max_position_embeddings + self.activation_function = activation_function + self.activation_dropout = activation_dropout + self.dropout = dropout + self.relative_max_distance = relative_max_distance + self.ngram = ngram + self.num_buckets = num_buckets + self.encoder_ffn_dim = encoder_ffn_dim + self.num_encoder_attention_heads = num_encoder_attention_heads + self.num_decoder_attention_heads = num_decoder_attention_heads + self.num_encoder_layers = num_encoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.num_decoder_layers = num_decoder_layers + self.attention_dropout = attention_dropout + self.init_std = init_std + self.eps = eps + self.add_cross_attention = add_cross_attention + self.disable_ngram_loss = disable_ngram_loss diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/modeling.py new file mode 100644 index 000000000..54b85bdc2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/modeling.py @@ -0,0 +1,1247 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.nn import Layer + +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from .configuration import ( + PROPHETNET_PRETRAINED_INIT_CONFIGURATION, + PROPHETNET_PRETRAINED_RESOURCE_FILES_MAP, + ProphetNetConfig, +) + +__all__ = [ + "ProphetNetModel", + "ProphetNetPretrainedModel", + "ProphetNetEncoder", + "ProphetNetDecoder", + "ProphetNetForConditionalGeneration", +] + + +def ngram_attention_bias(sequence_length, ngram, dtype): + """ + This function computes the bias for the predict stream + """ + left_block = paddle.ones((ngram, sequence_length, sequence_length), dtype=dtype) * float("-inf") + right_block = left_block.detach().clone() + # create bias + for stream_idx in range(ngram): + right_block[stream_idx] = right_block[stream_idx].fill_diagonal_(0, wrap=False) + left_block[stream_idx] = paddle.triu(left_block[stream_idx], diagonal=-stream_idx + 1) + + left_block[:, :, 0] = 0 + return paddle.concat([left_block, right_block], axis=2) + + +def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False): + """ + This function computes individual parts of the relative position buckets. For more detail, see paper. 
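+
+    Sketch of the unidirectional case with ``num_buckets=32`` and ``max_distance=128``
+    (the values used by ProphetNet); distances are clamped to be non-negative::
+
+        # distance  0..15  -> bucket == distance            (exact buckets)
+        # distance 16..127 -> bucket grows logarithmically from 16 towards 31
+        # larger distances -> clipped to the last bucket (31)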
+ """ + inv_relative_positions = -relative_positions + rel_positions_bucket = 0 + + if is_bidirectional: + num_buckets = num_buckets // 2 + rel_positions_bucket = ( + rel_positions_bucket + + paddle.cast( + paddle.less_than(inv_relative_positions, paddle.zeros_like(inv_relative_positions)), dtype=paddle.int32 + ) + * num_buckets + ) + inv_relative_positions = paddle.abs(inv_relative_positions) + else: + inv_relative_positions = paddle.cast( + paddle.less_than(paddle.zeros_like(inv_relative_positions), inv_relative_positions), dtype=paddle.int32 + ) * inv_relative_positions.astype(paddle.int32) + + max_exact = num_buckets // 2 + is_small = paddle.less_than(inv_relative_positions, paddle.to_tensor(max_exact).cast(dtype=paddle.int32)) + val_if_large = max_exact + paddle.log( + paddle.cast(inv_relative_positions, dtype=paddle.float32) / max_exact + ) / math.log(max_distance / max_exact) * (num_buckets - max_exact) + val_if_large_num_buckets = paddle.ones_like(val_if_large) * (num_buckets - 1) + val_if_large_lt = paddle.cast(paddle.less_than(val_if_large, val_if_large_num_buckets), dtype=paddle.int32) + val_if_large = val_if_large_lt * val_if_large.astype(val_if_large_lt.dtype) + ( + 1 - val_if_large_lt + ) * val_if_large_num_buckets.astype(val_if_large_lt.dtype) + rel_positions_bucket = rel_positions_bucket + paddle.where( + is_small, paddle.cast(inv_relative_positions, dtype=paddle.int32), val_if_large + ) + return rel_positions_bucket + + +def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids): + """ + This function computes both main and predict relative position buckets. For more detail, see paper. + """ + # main stream + main_stream_relative_positions = paddle.tile( + paddle.unsqueeze(position_ids, axis=1), repeat_times=[1, position_ids.shape[-1], 1] + ) + main_stream_relative_positions = main_stream_relative_positions - paddle.unsqueeze(position_ids, axis=-1) + + # predicting stream + predicting_stream_relative_positions = paddle.unsqueeze( + paddle.concat([position_ids - 1, position_ids], axis=-1), axis=1 + ) + predicting_stream_relative_positions = paddle.tile( + predicting_stream_relative_positions, repeat_times=[1, position_ids.shape[-1], 1] + ) + predicting_stream_relative_positions = predicting_stream_relative_positions - paddle.unsqueeze( + position_ids, axis=-1 + ) + + # get both position buckets + main_relative_position_buckets = compute_relative_buckets( + num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False + ) + predict_relative_position_buckets = compute_relative_buckets( + num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False + ) + return main_relative_position_buckets, predict_relative_position_buckets + + +class ProphetNetPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Prophetnet models. It provides Prophetnet related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. 
+ """ + + pretrained_init_configuration = PROPHETNET_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = PROPHETNET_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "prophetnet" + config_class = ProphetNetConfig + + def _init_weights(self, layer): + if isinstance(layer, nn.Linear): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.init_std, + shape=layer.weight.shape, + ) + ) + if layer.bias is not None: + layer.bias.set_value(paddle.tensor.zeros(layer.bias.shape)) + + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert decoder_start_token_id is not None, ( + "self.config.decoder_start_token_id has to be defined. " + "In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information" + ) + + # shift inputs to the right + shifted_input_ids = paddle.zeros_like(input_ids) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids_mask = paddle.cast(shifted_input_ids == -100, dtype=paddle.int32) + shifted_input_ids = shifted_input_ids_mask * pad_token_id + (1 - shifted_input_ids_mask) * shifted_input_ids + + assert ( + paddle.sum(paddle.cast(shifted_input_ids >= 0, dtype=paddle.int32)).item() == shifted_input_ids.shape[-1] + ), "Verify that `shifted_input_ids` has only positive values" + + return shifted_input_ids + + +class ProphetNetPositionalEmbeddings(nn.Embedding): + """ + ProphetNetPositional Embeddings. + """ + + def __init__(self, config: ProphetNetConfig): + self.max_length = config.max_position_embeddings + super(ProphetNetPositionalEmbeddings, self).__init__( + config.max_position_embeddings, config.hidden_size, config.pad_token_id + ) + + def forward(self, inputs_shape, attention_mask=None, past_key_values=None, position_ids=None): + assert (position_ids is None) or ( + self._padding_idx is None + ), "If position_ids is pre-computed then padding_idx should not be set." + + if position_ids is None: + if past_key_values is not None: + # position_ids is the same for every token when decoding a single step + # Without the int() cast, it doesn't work in some cases when exporting to ONNX + prev_num_input_ids = past_key_values[0][0].shape[2] + num_input_ids = inputs_shape[1] + prev_num_input_ids + position_ids = paddle.ones((1, 1), dtype="int64") * (int(self._padding_idx + num_input_ids)) + else: + if attention_mask is None: + attention_mask = paddle.ones(inputs_shape, dtype="int64") + + # retrieve position_ids from input_ids / attention_mask + position_ids = ( + paddle.cast( + paddle.cast(paddle.cumsum(attention_mask, axis=1), dtype=attention_mask.dtype) + * attention_mask, + dtype=paddle.int64, + ) + + self._padding_idx + ) + + # make sure position_ids are not bigger then max_length + position_ids = paddle.clip(position_ids, min=0, max=self.max_length - 1) + + return super().forward(position_ids), position_ids + + def _forward(self, position_ids): + return super().forward(position_ids) + + +class ProphetNetAttention(Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. 
+ """ + + def __init__(self, hidden_size, attention_dropout, dropout, num_attn_heads: int): + super().__init__() + hidden_size = hidden_size + + self.attention_dropout = attention_dropout + self.dropout = dropout + self.num_attn_heads = num_attn_heads + self.head_dim = hidden_size // num_attn_heads + + assert ( + self.head_dim * num_attn_heads == hidden_size + ), "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`" + + self.key_proj = nn.Linear(hidden_size, hidden_size) + self.value_proj = nn.Linear(hidden_size, hidden_size) + self.query_proj = nn.Linear(hidden_size, hidden_size) + + self.out_proj = nn.Linear(hidden_size, hidden_size) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return paddle.transpose( + paddle.reshape(tensor, [bsz, seq_len, self.num_attn_heads, self.head_dim]), (0, 2, 1, 3) + ) + + def forward( + self, + hidden_states, + key_value_states: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + past_key_value: Optional[Tuple[Tensor]] = None, + ) -> Tuple[Tensor, Optional[Tensor]]: + + batch_size, tgt_len, hidden_size = hidden_states.shape + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + assert hidden_states.shape == [ + batch_size, + tgt_len, + hidden_size, + ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.shape}" + + # previous time steps are cached - no need to recompute key and value if they are static + query_states = self.query_proj(hidden_states) / (self.head_dim**0.5) + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.key_proj(key_value_states), -1, batch_size) + value_states = self._shape(self.value_proj(key_value_states), -1, batch_size) + else: + # self_attention + key_states = self._shape(self.key_proj(hidden_states), -1, batch_size) + value_states = self._shape(self.value_proj(hidden_states), -1, batch_size) + + if is_cross_attention: + # if cross_attention save Tuple(paddle.Tensor, paddle.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + # project states into the correct shape + proj_shape = (batch_size * self.num_attn_heads, -1, self.head_dim) + query_states = paddle.reshape(self._shape(query_states, tgt_len, batch_size), proj_shape) + key_states = paddle.reshape(key_states, proj_shape) + value_states = paddle.reshape(value_states, proj_shape) + + src_len = key_states.shape[1] + attn_weights = paddle.bmm(query_states, key_states.transpose((0, 2, 1))) + assert attn_weights.shape == [ + batch_size * self.num_attn_heads, + tgt_len, + src_len, + ], f"`attn_weights` should be of size {batch_size * self.num_attn_heads, tgt_len, src_len}, but is of size {attn_weights.shape}" + + # This is part of a workaround to get around fork/join parallelism not supporting Optional types. 
+ if attention_mask is not None and len(attention_mask.shape) == 0: + attention_mask = None + assert attention_mask is None or attention_mask.shape == [ + self.num_attn_heads * batch_size, + 1, + src_len, + ], f"`attention_mask` should be `None` or of shape attention_mask.shape == {batch_size * self.num_attn_heads, 1, src_len}, but is {attention_mask.shape}" + + if attention_mask is not None: # don't attend to padding symbols + attn_weights = attn_weights + attention_mask + + attn_weights = F.softmax(attn_weights, axis=-1) + + attn_probs = F.dropout(attn_weights, p=self.attention_dropout, training=self.training) + + attn_output = paddle.bmm(attn_probs, value_states) + assert attn_output.shape == [ + batch_size * self.num_attn_heads, + tgt_len, + self.head_dim, + ], f"`attn_output` should be of shape {batch_size * self.num_attn_heads, tgt_len, self.head_dim}, but is of shape {attn_output.shape}" + + attn_output = paddle.reshape( + paddle.transpose( + paddle.reshape(attn_output, (batch_size, self.num_attn_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ), + (batch_size, tgt_len, hidden_size), + ) + + attn_output = self.out_proj(attn_output) + + attn_output = F.dropout(attn_output, p=self.dropout, training=self.training) + return attn_output, past_key_value + + +class ProphetNetFeedForward(Layer): + """ + This is the residual two feed-forward layer block based on the original Transformer implementation. + """ + + def __init__(self, hidden_size, activation_function, activation_dropout, dropout, ffn_dim: int): + super(ProphetNetFeedForward, self).__init__() + self.activation_fn = ACT2FN[activation_function] + self.intermediate = nn.Linear(hidden_size, ffn_dim) + self.output = nn.Linear(ffn_dim, hidden_size) + self.activation_dropout = activation_dropout + self.dropout = dropout + + def forward(self, hidden_states): + hidden_states = self.intermediate(hidden_states) + hidden_states = self.activation_fn(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.output(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + return hidden_states + + +class ProphetNetNgramSelfAttention(Layer): + def __init__( + self, + hidden_size, + num_buckets, + relative_max_distance, + num_decoder_attention_heads, + dropout, + attention_dropout, + ngram, + ): + super(ProphetNetNgramSelfAttention, self).__init__() + + self.hidden_size = hidden_size + + self.num_buckets = num_buckets + self.relative_max_distance = relative_max_distance + self.num_attn_heads = num_decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.head_dim = hidden_size // self.num_attn_heads + self.ngram = ngram + + assert ( + self.head_dim * self.num_attn_heads == hidden_size + ), "config.hidden_size must be divisible by num_attn_heads" + # key, value, query projection + self.key_proj = nn.Linear(hidden_size, hidden_size) + self.value_proj = nn.Linear(hidden_size, hidden_size) + self.query_proj = nn.Linear(hidden_size, hidden_size) + + # out projection + self.out_proj = nn.Linear(hidden_size, hidden_size) + + # rel position embeddings + self.relative_pos_embeddings = nn.Linear(hidden_size, self.num_buckets * self.num_attn_heads) + + def _shape(self, tensor, seq_len, batch_size): + return paddle.transpose( + paddle.reshape(tensor, (batch_size, seq_len, self.num_attn_heads, self.head_dim)), (0, 2, 1, 3) + ) + + def forward( + self, + hidden_states, + past_key_value: 
Optional[Tuple[Tensor]] = None, + attention_mask=None, + extended_predict_attention_mask=None, + main_relative_position_buckets=None, + predict_relative_position_buckets=None, + position_ids=None, + ): + batch_size, ngram_sequence_length, hidden_size = hidden_states.shape + + assert hidden_states.shape == [ + batch_size, + ngram_sequence_length, + hidden_size, + ], f"`hidden_states` should be of shape {batch_size, ngram_sequence_length, hidden_size}, but is of shape {hidden_states.shape}" + + # project + query_states = self.query_proj(hidden_states) + key_states = self.key_proj(hidden_states) + value_states = self.value_proj(hidden_states) + + # normalize + query_states = query_states / (self.head_dim**0.5) + + # reshape + query_states = self._shape(query_states, ngram_sequence_length, batch_size) + key_states = self._shape(key_states, -1, batch_size) + value_states = self._shape(value_states, -1, batch_size) + + proj_shape = (batch_size * self.num_attn_heads, -1, self.head_dim) + + query_states = paddle.reshape(query_states, proj_shape) + key_states = paddle.reshape(key_states, proj_shape) + value_states = paddle.reshape(value_states, proj_shape) + + # chunk into main stream and predict stream + hidden_states_list = paddle.chunk(hidden_states, 1 + self.ngram, axis=1) + + query_states_list = paddle.chunk(query_states, 1 + self.ngram, axis=1) + key_states_list = paddle.chunk(key_states, 1 + self.ngram, axis=1) + value_states_list = paddle.chunk(value_states, 1 + self.ngram, axis=1) + + main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:] + main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:] + main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:] + main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:] + + # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim) + if past_key_value is not None: + prev_main_key_states = past_key_value[0].reshape([batch_size * self.num_attn_heads, -1, self.head_dim]) + main_key_states = paddle.concat((prev_main_key_states, main_key_states), axis=1) + prev_main_value_states = past_key_value[1].reshape([batch_size * self.num_attn_heads, -1, self.head_dim]) + main_value_states = paddle.concat((prev_main_value_states, main_value_states), axis=1) + + # Update cache + past_key_value = ( + paddle.reshape(main_key_states, (batch_size, self.num_attn_heads, -1, self.head_dim)), + paddle.reshape(main_value_states, (batch_size, self.num_attn_heads, -1, self.head_dim)), + ) + + # get seq_length of main stream only + sequence_length = ngram_sequence_length // (1 + self.ngram) + + # MAIN-STREAM + # main attn weights + main_attn_weights = paddle.bmm(main_query_states, paddle.transpose(main_key_states, (0, 2, 1))) + + # retrieve relative position embeddings for each layer -> see paper for more details + main_relative_pos_embeddings = self.get_main_relative_pos_embeddings( + main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets + ) + + main_attn_weights = main_attn_weights + main_relative_pos_embeddings + + if attention_mask is not None: + main_attn_weights = main_attn_weights + attention_mask + + main_attn_probs = F.softmax(main_attn_weights, axis=-1, dtype=main_attn_weights.dtype) + + main_attn_probs = F.dropout(main_attn_probs, p=self.attention_dropout, training=self.training) + # project to attn_output + main_attn_output = paddle.bmm(main_attn_probs, main_value_states) + + 
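+        # main_attn_output at this point: [batch_size * num_attn_heads, sequence_length, head_dim]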
# reshape so that num_heads dim is merged into last `head_dim` axis + main_attn_output = paddle.reshape( + paddle.transpose( + paddle.reshape(main_attn_output, (batch_size, self.num_attn_heads, sequence_length, self.head_dim)), + (0, 2, 1, 3), + ), + (batch_size, 1, sequence_length, hidden_size), + ) + main_attn_output = self.out_proj(main_attn_output) + + # PREDICT-STREAM + # [ngram, B*head, T, c] + predict_query_states = paddle.reshape( + paddle.concat(predict_query_states_list, axis=0), (self.ngram, -1, sequence_length, self.head_dim) + ) + # [ngram, B*head, 2*T, c] + predict_key_states = paddle.concat( + [ + paddle.unsqueeze(paddle.concat([main_key_states, key], axis=1), axis=0) + for key in predict_key_states_list + ], + axis=0, + ) + + # [ngram, T, B, C] + predict_hidden_states = paddle.reshape( + paddle.concat(hidden_states_predict_list, axis=0), (self.ngram, sequence_length, batch_size, hidden_size) + ) + + # [ngram, B*head, 2*T, c] + predict_value_states = paddle.concat( + [ + paddle.unsqueeze(paddle.concat([main_value_states, v_p], axis=1), axis=0) + for v_p in predict_value_states_list + ], + axis=0, + ) + + # [ngram, B*head, T, 2*T] + predict_attn_weights = paddle.einsum("nbtc,nbsc->nbts", predict_query_states, predict_key_states) + + # [ngram, B*head, T, S] + # retrieve relative position embeddings for each layer -> see paper for more details + predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings( + predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets + ) + + # [ngram, B*head, T, 2*T] + predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings + + if extended_predict_attention_mask is not None: + predict_attn_weights = predict_attn_weights + paddle.cast( + extended_predict_attention_mask, predict_attn_weights.dtype + ) + + predict_attn_probs = F.softmax(predict_attn_weights, axis=-1, dtype=predict_attn_weights.dtype) + + predict_attn_probs = F.dropout(predict_attn_probs, p=self.attention_dropout, training=self.training) + # project to attention output + # [ngram, B*head, T, c] + predict_attn_output = paddle.einsum("nbts,nbsc->nbtc", predict_attn_probs, predict_value_states) + + # reshape so that num_heads dim is merged into last `head_dim` axis + # [ngram, B, T, C] + predict_attn_output = paddle.reshape( + paddle.transpose( + paddle.reshape( + predict_attn_output, (self.ngram, batch_size, self.num_attn_heads, sequence_length, self.head_dim) + ), + (1, 0, 3, 2, 4), + ), + (batch_size, self.ngram, sequence_length, hidden_size), + ) + predict_attn_output = self.out_proj(predict_attn_output) + + # concat to single attn output + # [B, 1+ngram*T, C] + attn_output = paddle.reshape( + paddle.concat([main_attn_output, predict_attn_output], axis=1), (batch_size, -1, hidden_size) + ) + # reshape into better form for `config.output_attentions` + main_attn_probs = paddle.reshape(main_attn_probs, (batch_size, self.num_attn_heads, sequence_length, -1)) + predict_attn_probs = paddle.transpose( + paddle.reshape(predict_attn_probs, (self.ngram, batch_size, self.num_attn_heads, sequence_length, -1)), + (1, 0, 2, 3, 4), + ) + + attn_output = F.dropout(attn_output, p=self.dropout, training=self.training) + + return attn_output, main_attn_probs, predict_attn_probs, past_key_value + + def get_main_relative_pos_embeddings( + self, hidden_states, attn_weights, position_ids, main_relative_position_buckets + ): + # input hidden_states [B,T,C], input attn_weights [T*head,T,S], input position_ids [B,T] or [1,1] + + if 
main_relative_position_buckets is None: + batch_size, sequence_length = hidden_states.shape[:2] + relative_positions = paddle.tile( + paddle.unsqueeze(paddle.unsqueeze(paddle.arange(1, attn_weights.shape[-1] + 1), axis=0), axis=0), + repeat_times=[batch_size, sequence_length, 1], + ) + relative_positions = relative_positions - paddle.tile( + paddle.unsqueeze(position_ids, axis=0), repeat_times=[batch_size, sequence_length, 1] + ) # [B, T, s] + main_relative_position_buckets = compute_relative_buckets( + self.num_buckets, self.relative_max_distance, relative_positions, False + ) + + rel_pos_embeddings = self.relative_pos_embeddings(hidden_states) # [B,T,Buckets*head] + rel_pos_embeddings = paddle.transpose( + paddle.reshape( + rel_pos_embeddings, (rel_pos_embeddings.shape[:2] + [self.num_buckets, self.num_attn_heads]) + ), + (0, 3, 1, 2), + ) # [B,T,Buckets,head] + rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:2] + [-1]) # [B*head,T,Buckets] + + main_relative_position_buckets = paddle.cast( + paddle.reshape( + paddle.tile(main_relative_position_buckets, repeat_times=[1, self.num_attn_heads, 1]), + (-1, main_relative_position_buckets.shape[-1]), + ), + dtype=paddle.int64, + ) # [B*head*T, T] + rel_pos_embeddings = paddle.reshape( + rel_pos_embeddings, (-1, rel_pos_embeddings.shape[-1]) + ) # [B*head*T,Buckets] + + main_relative_position_buckets_index = paddle.tile( + main_relative_position_buckets.unsqueeze(2), repeat_times=[1, 1, 2] + ) + main_relative_position_buckets_index[:, :, 0] = paddle.tile( + paddle.arange(0, main_relative_position_buckets_index.shape[0]).unsqueeze(1), + repeat_times=[1, main_relative_position_buckets_index.shape[1]], + ) + + main_relative_pos_embeddings = paddle.reshape( + paddle.gather_nd(rel_pos_embeddings, index=main_relative_position_buckets_index), + (attn_weights.shape[:2] + [-1]), + ) + return main_relative_pos_embeddings + + def get_predict_relative_pos_embeddings( + self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets + ): + # input hidden_states [ngram, T,B,C], + # input attn_weights [ngram, B*head,T,S], + # input position_ids [B,T] or [1,1], + # input predict_relative_position_buckets [B,T, 2*T] or None + sequence_length, batch_size = hidden_states.shape[1:3] + + if predict_relative_position_buckets is None: + key_sequence_length = attn_weights.shape[-1] + assert ( + position_ids[0][0] == key_sequence_length - 1 + ), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... 
(key_sequence_length - 1)" + relative_positions = paddle.tile( + paddle.unsqueeze(paddle.unsqueeze(paddle.arange(0, key_sequence_length), axis=0), axis=0), + repeat_times=[batch_size, sequence_length, 1], + ) + + relative_positions = relative_positions - paddle.tile( + paddle.unsqueeze(position_ids, axis=0), repeat_times=[batch_size, sequence_length, 1] + ) + predict_relative_position_buckets = compute_relative_buckets( + self.num_buckets, self.relative_max_distance, relative_positions, False + ) + + hidden_states = paddle.transpose(hidden_states, (0, 2, 1, 3)) # [ngram, B, T, C] + rel_pos_embeddings = paddle.reshape( + self.relative_pos_embeddings(hidden_states), + hidden_states.shape[:-1] + [self.num_buckets, self.num_attn_heads], + ) # [ngram, B, T, bucket, head] + rel_pos_embeddings = paddle.reshape( + paddle.transpose(rel_pos_embeddings, (0, 1, 4, 2, 3)), + (self.ngram * batch_size * self.num_attn_heads, sequence_length, -1), + ) # [ngram*B*head, T, bucket] + + predict_relative_position_buckets = paddle.tile( + paddle.unsqueeze(predict_relative_position_buckets, axis=0), + repeat_times=[self.ngram, 1, self.num_attn_heads, 1], + ) # [ngram, B, head*T, S] + + rel_pos_embeddings = paddle.reshape(rel_pos_embeddings, (-1, rel_pos_embeddings.shape[-1])) + predict_relative_position_buckets = paddle.cast( + paddle.reshape(predict_relative_position_buckets, (-1, predict_relative_position_buckets.shape[-1])), + dtype=paddle.int64, + ) # [ngram*B*head*T, S] + + predict_relative_position_buckets_index = paddle.tile( + predict_relative_position_buckets.unsqueeze(2), repeat_times=[1, 1, 2] + ) + predict_relative_position_buckets_index[:, :, 0] = paddle.tile( + paddle.arange(0, predict_relative_position_buckets_index.shape[0]).unsqueeze(1), + repeat_times=[1, predict_relative_position_buckets_index.shape[1]], + ) + + predict_relative_pos_embeddings = paddle.reshape( + paddle.gather_nd(rel_pos_embeddings, index=predict_relative_position_buckets_index), + (self.ngram, batch_size * self.num_attn_heads, sequence_length, -1), + ) # [ngram, B*head, T, S] + + return predict_relative_pos_embeddings + + +class ProphetNetEncoderLayer(Layer): + """ + Encoder block for Prophetnet + """ + + def __init__(self, config: ProphetNetConfig): + super(ProphetNetEncoderLayer, self).__init__() + # 1st residual block + self.self_attn = ProphetNetAttention( + config.hidden_size, config.attention_dropout, config.dropout, config.num_encoder_attention_heads + ) + self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size) + + # 2nd residual block + self.feed_forward = ProphetNetFeedForward( + config.hidden_size, + config.activation_function, + config.activation_dropout, + config.dropout, + config.encoder_ffn_dim, + ) + self.feed_forward_layer_norm = nn.LayerNorm(config.hidden_size) + + def forward(self, hidden_states, attention_mask): + # 1st residual block + attention_output, _ = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask) + hidden_states = self.self_attn_layer_norm(attention_output + hidden_states) + + # 2nd residual block + feed_forward_output = self.feed_forward(hidden_states) + hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states) + return hidden_states + + +class ProphetNetDecoderLayer(Layer): + """ + Decoder block for Prophetnet + """ + + def __init__(self, config: ProphetNetConfig): + super(ProphetNetDecoderLayer, self).__init__() + # 1st residual block + self.self_attn = ProphetNetNgramSelfAttention( + config.hidden_size, + config.num_buckets, + 
config.relative_max_distance, + config.num_decoder_attention_heads, + config.dropout, + config.attention_dropout, + config.ngram, + ) + self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size) + + # 2nd residual block + if config.add_cross_attention: + self.cross_attn = ProphetNetAttention( + config.hidden_size, config.attention_dropout, config.dropout, config.num_decoder_attention_heads + ) + self.cross_attn_layer_norm = nn.LayerNorm(config.hidden_size) + + # 3rd residual block + self.feed_forward = ProphetNetFeedForward( + config.hidden_size, + config.activation_function, + config.activation_dropout, + config.dropout, + config.decoder_ffn_dim, + ) + self.feed_forward_layer_norm = nn.LayerNorm(config.hidden_size) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attn_mask=None, + extended_predict_attention_mask=None, + main_relative_position_buckets=None, + predict_relative_position_buckets=None, + position_ids=None, + past_key_value=None, + use_cache: bool = True, + ): + # 1st residual block + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + extended_predict_attention_mask=extended_predict_attention_mask, + main_relative_position_buckets=main_relative_position_buckets, + predict_relative_position_buckets=predict_relative_position_buckets, + position_ids=position_ids, + ) + hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + if encoder_hidden_states is not None: + # 2nd residual block + attention_output, cross_attn_present_key_value = self.cross_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attn_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # 3rd residual block + feed_forward_output = self.feed_forward(hidden_states) + hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states) + + outputs = (hidden_states,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class ProphetNetEncoder(ProphetNetPretrainedModel): + r""" + word_embeddings (:obj:`paddle.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`): + The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with + pre-defined word embeddings instead of randomly initialized word embeddings. 
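+
+    Example (illustrative; assumes the "prophetnet-large-uncased" weights are available)::
+
+        import paddle
+        from paddlenlp.transformers import ProphetNetModel
+
+        model = ProphetNetModel.from_pretrained("prophetnet-large-uncased")
+        encoder = model.get_encoder()
+        input_ids = paddle.to_tensor([[132, 1903, 102]])
+        encoder_hidden_states = encoder(input_ids=input_ids)  # [1, 3, hidden_size]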
+ """ + + def __init__(self, word_embeddings, config: ProphetNetConfig): + super(ProphetNetEncoder, self).__init__(config) + self.init_std = config.init_std + if word_embeddings is not None: + self.word_embeddings = word_embeddings + else: + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + + self.position_embeddings = ProphetNetPositionalEmbeddings(config) + self.embeddings_layer_norm = nn.LayerNorm(config.hidden_size) + + self.layers = nn.LayerList([ProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)]) + + def forward(self, input_ids=None, attention_mask=None): + if input_ids is None: + raise ValueError("Input_ids cannot be None.") + inputs_embeds = self.word_embeddings(input_ids) + + # prepare attention mask + if attention_mask is not None: + extended_attention_mask = ( + paddle.tile( + 1.0 - attention_mask.unsqueeze(1), repeat_times=[self.config.num_encoder_attention_heads, 1, 1] + ) + ) * -10000.0 + extended_attention_mask = paddle.cast(extended_attention_mask, dtype=inputs_embeds.dtype) + extended_attention_mask.stop_gradient = True + else: + extended_attention_mask = None + + position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2]) + + hidden_states = inputs_embeds + position_embeddings + hidden_states = self.embeddings_layer_norm(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + + for idx, encoder_layer in enumerate(self.layers): + hidden_states = encoder_layer(hidden_states, attention_mask=extended_attention_mask) + return hidden_states + + +class ProphetNetDecoder(ProphetNetPretrainedModel): + def __init__(self, word_embeddings, config: ProphetNetConfig): + super(ProphetNetDecoder, self).__init__(config) + self.init_std = config.init_std + self.ngram = config.ngram + self.num_buckets = config.num_buckets + self.relative_max_distance = config.relative_max_distance + self.dropout = config.dropout + self.max_target_positions = config.max_position_embeddings + self.add_cross_attention = config.add_cross_attention + if word_embeddings is not None: + self.word_embeddings = word_embeddings + else: + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + + self.position_embeddings = ProphetNetPositionalEmbeddings(config) + + self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size) + self.layers = nn.LayerList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)]) + self.embeddings_layer_norm = nn.LayerNorm(config.hidden_size) + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=True, + ): + if input_ids is None: + raise ValueError("Decoder input_ids cannot be None.") + inputs_embeds = self.word_embeddings(input_ids) + batch_size, sequence_length = inputs_embeds.shape[:2] + + main_stream_pos_embed, position_ids = self.position_embeddings( + (batch_size, sequence_length), past_key_values=past_key_values + ) + + if past_key_values is not None: + main_relative_position_buckets, predict_relative_position_buckets = None, None + else: + main_relative_position_buckets, predict_relative_position_buckets = self.compute_buffered_relative_buckets( + position_ids + ) + predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1) + + # add position embeddings + hidden_states = inputs_embeds + main_stream_pos_embed + + 
ngram_embeddings = self.ngram_embeddings.weight + + # prepare attention mask + if past_key_values is not None: + assert ( + hidden_states.shape[1] == 1 + ), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1" + + ngram_hidden_states = [ + paddle.tile( + (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed), repeat_times=[batch_size, 1, 1] + ) + for ngram in range(self.ngram) + ] + extended_attention_mask = None + extended_predict_attention_mask = None + else: + ngram_hidden_states = [ + (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed) for ngram in range(self.ngram) + ] + extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask) + extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask) + extended_attention_mask.stop_gradient = True + extended_predict_attention_mask.stop_gradient = True + + # prepare encoder attention mask + if encoder_attention_mask is not None: + extended_encoder_attention_mask = ( + 1.0 + - paddle.tile( + encoder_attention_mask[:, None, :], repeat_times=[self.config.num_decoder_attention_heads, 1, 1] + ) + ) * -10000.0 + extended_encoder_attention_mask = paddle.cast(extended_encoder_attention_mask, dtype=inputs_embeds.dtype) + else: + extended_encoder_attention_mask = None + + hidden_states = paddle.concat([hidden_states] + ngram_hidden_states, axis=1) + + if self.embeddings_layer_norm: + hidden_states = self.embeddings_layer_norm(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + present_key_values = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=extended_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attn_mask=extended_encoder_attention_mask, + extended_predict_attention_mask=extended_predict_attention_mask, + main_relative_position_buckets=main_relative_position_buckets, + predict_relative_position_buckets=predict_relative_position_buckets, + position_ids=position_ids, + past_key_value=past_key_value, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + present_key_values += (layer_outputs[1],) + + last_hidden_state = hidden_states[:, :sequence_length] # 1-gram + last_hidden_state_ngram = hidden_states[:, sequence_length:] if self.ngram > 0 else None # 2-gram + return tuple(v for v in [last_hidden_state, last_hidden_state_ngram, present_key_values] if v is not None) + + def compute_buffered_relative_buckets(self, position_ids): + batch_size, sequence_length = position_ids.shape + + if not hasattr(self, "_main_relative_buckets") or self._main_relative_buckets is None: + position_ids = paddle.tile(paddle.arange(1, self.max_target_positions + 1), repeat_times=[1, 1]) + self._main_relative_buckets, self._predict_relative_buckets = compute_all_stream_relative_buckets( + self.num_buckets, self.relative_max_distance, position_ids + ) + + # buffer relative buckets + main_relative_buckets = paddle.tile( + self._main_relative_buckets[:, :sequence_length, :sequence_length], repeat_times=[batch_size, 1, 1] + ) + predict_relative_buckets = paddle.tile( + paddle.concat( + [ + self._predict_relative_buckets[:, :sequence_length, :sequence_length], + self._predict_relative_buckets[ + :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length + ], 
+ ], + axis=2, + ), + repeat_times=[batch_size, 1, 1], + ) + + return main_relative_buckets, predict_relative_buckets + + def prepare_attention_mask(self, hidden_states, attention_mask): + batch_size, seq_length = hidden_states.shape[:2] + + # get causal mask + if not hasattr(self, "_causal_mask") or self._causal_mask is None: + causal_mask = paddle.full( + (self.max_target_positions, self.max_target_positions), -float("inf"), dtype=hidden_states.dtype + ) + self._causal_mask = paddle.triu(causal_mask, 1) + extended_causal_mask = paddle.expand( + self._causal_mask[:seq_length, :seq_length].unsqueeze(0), shape=[batch_size, seq_length, seq_length] + ) + + # add usual attention mask + if attention_mask is not None: + extended_attention_mask = (1.0 - attention_mask.unsqueeze(1)) * -10000.0 + extended_attention_mask = extended_causal_mask + extended_attention_mask + else: + extended_attention_mask = extended_causal_mask + return paddle.cast( + paddle.tile(extended_attention_mask, repeat_times=[self.config.num_decoder_attention_heads, 1, 1]), + dtype=hidden_states.dtype, + ) + + def prepare_predict_attention_mask(self, hidden_states, attention_mask): + batch_size, seq_length = hidden_states.shape[:2] + + # get causal mask + if not hasattr(self, "_predict_causal_mask") or self._predict_causal_mask is None: + self._predict_causal_mask = ngram_attention_bias( + self.max_target_positions, self.ngram, hidden_states.dtype + ) + predict_causal_mask = paddle.concat( + [ + self._predict_causal_mask[:, :seq_length, :seq_length], + self._predict_causal_mask[ + :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length + ], + ], + axis=-1, + ) + extended_predict_causal_mask = paddle.expand( + predict_causal_mask[:, None, :, :], + shape=predict_causal_mask.shape[:1] + [batch_size] + predict_causal_mask.shape[1:], + ) + + # add usual attention mask + if attention_mask is not None: + extended_attention_mask = (1.0 - attention_mask[None, :, None, :]) * -10000.0 + extended_attention_mask = extended_attention_mask.expand((self.ngram, batch_size, seq_length, seq_length)) + # predicted stream attention_mask should always be 0 + extended_attention_mask = paddle.concat( + [extended_attention_mask, paddle.zeros_like(extended_attention_mask)], axis=-1 + ) + extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask + else: + extended_predict_attention_mask = extended_predict_causal_mask + return paddle.cast( + extended_predict_attention_mask.tile([1, self.config.num_decoder_attention_heads, 1, 1]), + dtype=hidden_states.dtype, + ) + + +@register_base_model +class ProphetNetModel(ProphetNetPretrainedModel): + def __init__(self, config: ProphetNetConfig): + super(ProphetNetModel, self).__init__(config) + self.init_std = config.init_std + self.eps = config.eps + self.pad_token_id = config.pad_token_id + self.disable_ngram_loss = config.disable_ngram_loss + self.decoder_start_token_id = config.decoder_start_token_id + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + + self.encoder = ProphetNetEncoder(self.word_embeddings, config) + + self.decoder = ProphetNetDecoder(self.word_embeddings, config) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, value): + self.word_embeddings = value + + def forward( + self, + input_ids=None, + attention_mask=None, + 
decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output: Optional[Tuple] = None, + use_cache=True, + past_key_values=None, + ): + if attention_mask is None: + assert input_ids is not None, "input_ids should be " "specified when generating attention_mask" + attention_mask = paddle.cast(input_ids != self.pad_token_id, dtype=paddle.get_default_dtype()) + + if decoder_attention_mask is None: + assert decoder_input_ids is not None, ( + "decoder_input_ids should be " "specified when generating decoder_attention_mask" + ) + decoder_attention_mask = paddle.cast( + decoder_input_ids != self.pad_token_id, dtype=paddle.get_default_dtype() + ) + if encoder_output is None: + encoder_output = self.encoder(input_ids=input_ids, attention_mask=attention_mask) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_output, + encoder_attention_mask=attention_mask, + use_cache=use_cache, + past_key_values=past_key_values, + ) + return decoder_outputs + (encoder_output,) + + +class Linear_wo_bias(Layer): + def __init__(self, in_features, out_features, weight_attr=None, name=None): + super(Linear_wo_bias, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self.weight = self.create_parameter( + shape=[in_features, out_features], attr=self._weight_attr, dtype=self._dtype, is_bias=False + ) + self.name = name + + def forward(self, input): + out = F.linear(x=input, weight=self.weight, name=self.name) + return out + + def extra_repr(self): + name_str = ", name={}".format(self.name) if self.name else "" + return "in_features={}, out_features={}, dtype={}{}".format( + self.weight.shape[0], self.weight.shape[1], self._dtype, name_str + ) + + +class ProphetNetForConditionalGeneration(ProphetNetPretrainedModel): + def __init__(self, config: ProphetNetConfig): + super(ProphetNetForConditionalGeneration, self).__init__(config) + self.prophetnet = ProphetNetModel(config) + self.padding_idx = self.prophetnet.word_embeddings._padding_idx + + self.lm_head = Linear_wo_bias(config.hidden_size, config.vocab_size) + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + labels=None, + use_cache=True, + past_key_values=None, + ): + if labels is not None and decoder_input_ids is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + outputs = self.prophetnet( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_output=encoder_output, + use_cache=use_cache, + past_key_values=past_key_values, + ) + + batch_size, sequence_length = decoder_input_ids.shape + + predicting_streams = paddle.reshape(outputs[1], (batch_size, self.config.ngram, sequence_length, -1)) + predict_logits = self.lm_head(predicting_streams) + + logits = predict_logits[:, 0] + if use_cache: + past_key_values = outputs[2] + return logits, past_key_values, predict_logits + else: + return logits, predict_logits + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + attention_mask=None, + decoder_attention_mask=None, + cache=None, + use_cache=None, + encoder_output=None, + ): + assert encoder_output is not None, "`encoder_output` have to be passed for generation." 
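+        # Incremental decoding: once `cache` holds past key/values, only the most
+        # recently generated token is fed to the decoder; earlier positions are reused
+        # from the cache (forwarded below as `past_key_values`).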
+ if cache is not None: + decoder_input_ids = decoder_input_ids[:, -1].unsqueeze(-1) + + # first step, decoder_cached_states are empty + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "decoder_input_ids": decoder_input_ids, + "encoder_output": encoder_output, + "decoder_attention_mask": decoder_attention_mask, + "attention_mask": attention_mask, + "use_cache": use_cache, + "past_key_values": cache, + } + + def prepare_decoder_input_ids_from_labels(self, labels): + return self._shift_right(labels) + + def get_encoder(self): + return self.prophetnet.encoder + + def get_decoder(self): + return self.prophetnet.decoder + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/tokenizer.py new file mode 100644 index 000000000..303884880 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/prophetnet/tokenizer.py @@ -0,0 +1,316 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import logging +import os +from typing import List + +from .. import PretrainedTokenizer, BasicTokenizer, WordpieceTokenizer +from ..tokenizer_utils import Trie + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"prophetnet-large-uncased": 512} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def create_trie(unique_no_split_tokens): + trie = Trie() + for token in unique_no_split_tokens: + trie.add(token) + return trie + + +class ProphetNetTokenizer(PretrainedTokenizer): + r""" + Construct a ProphetNetTokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. 
two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + x_sep_token (`str`, *optional*, defaults to `"[X_SEP]"`): + Special second separator token, which can be generated by + [`ProphetNetForConditionalGeneration`]. It is used to separate bullet-point like + sentences in summarization, *e.g.*. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + resource_files_names = {"vocab_file": "prophetnet.tokenizer"} + pretrained_resource_files_map = { + "vocab_file": { + "prophetnet-large-uncased": "https://bj.bcebos.com/paddlenlp/models/transformers/prophetnet/prophetnet.tokenizer", + } + } + pretrained_init_configuration = { + "prophetnet-large-uncased": {"do_lower_case": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + unk_token="[UNK]", + sep_token="[SEP]", + bos_token="[SEP]", + eos_token="[SEP]", + cls_token="[CLS]", + x_sep_token="[X_SEP]", + pad_token="[PAD]", + mask_token="[MASK]", + **kwargs + ): + self.unique_no_split_tokens = [ + x_sep_token, + unk_token, + sep_token, + bos_token, + eos_token, + cls_token, + pad_token, + mask_token, + ] + self.tokens_trie = create_trie(self.unique_no_split_tokens) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab) + + def tokenize(self, text): + return self._tokenize(text) + + def _tokenize(self, text): + """ + Converts a string to a list of tokens. + + Args: + text (str): The text to be tokenized. + + Returns: + List[str]: A list of string representing converted tokens. 
+ """ + no_split_token = set(self.unique_no_split_tokens) + tokens = self.tokens_trie.split(text) + for i, token in enumerate(tokens): + if token in no_split_token: + left = tokens[i - 1] if i > 0 else None + right = tokens[i + 1] if i < len(tokens) - 1 else None + # We strip left and right by default + if right: + tokens[i + 1] = right.lstrip() + if left: + tokens[i - 1] = left.rstrip() + # ["This is something", "", "else"] + tokenized_text = [] + for token in tokens: + # Need to skip eventual empty (fully stripped) tokens + if not token: + continue + if token in no_split_token: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize_function(token)) + # ["This", " is", " something", "", "else"] + return tokenized_text + + def _tokenize_function(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text): + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_ids(self, tokens): + """ + Converts a sequence of tokens into ids using the `vocab` attribute (an + instance of `Vocab`). Override it if needed. + + Args: + tokens (list[int]): List of token ids. + + Returns: + list: Converted id list. + """ + if not isinstance(tokens, (list, tuple)): + return self._convert_token_to_id(tokens) + else: + return [self._convert_token_to_id(token) for token in tokens] + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ + Converts a single index or a sequence of indices to a token or + a sequence of tokens, using the vocabulary and added tokens. + + Args: + ids (int or List[int]): + The token id (or token ids) to be converted to token(s). + skip_special_tokens (bool, optional): + Whether or not to remove special tokens in the decoding. + Defaults to `False` and we do not remove special tokens. + + Returns: + str or List[str]: The decoded token(s). + """ + if not isinstance(ids, (list, tuple)): + return self._convert_id_to_token(ids) + tokens = [self._convert_id_to_token(_id) for _id in ids] + if skip_special_tokens: + return [token for token in tokens if token not in self.all_special_tokens] + return tokens + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def convert_ids_to_string(self, ids): + return self.convert_tokens_to_string(self.convert_ids_to_tokens(ids)) + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. 
+ + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given + sequence(s). + """ + sep = [self.sep_token_id] + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + [self.sep_token_id] + sep = [self.sep_token_id] + return token_ids_0 + sep + token_ids_1 + sep + + def save_vocabulary(self, save_directory): + index = 0 + vocab_file = os.path.join(save_directory, self.resource_files_names["vocab_file"]) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logging.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/__init__.py new file mode 100644 index 000000000..d64a428ad --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
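
A toy walk-through of the special-token layout produced by build_inputs_with_special_tokens, create_token_type_ids_from_sequences, and get_special_tokens_mask above; the ids are made up and sep_token_id is assumed to be 102 purely for illustration.

ids_a = [11, 12, 13]
ids_b = [21, 22]
sep = 102

# build_inputs_with_special_tokens: A [SEP] B [SEP]
input_ids = ids_a + [sep] + ids_b + [sep]          # [11, 12, 13, 102, 21, 22, 102]
# create_token_type_ids_from_sequences: 0s over "A [SEP]", 1s over "B [SEP]"
token_type_ids = [0, 0, 0, 0, 1, 1, 1]
# get_special_tokens_mask (special tokens not yet added): 1 marks each appended [SEP]
special_tokens_mask = [0, 0, 0, 1, 0, 0, 1]
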
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import * +from .modeling import * +from .modeling_3D_auto import * +from .modeling_pp import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/configuration.py new file mode 100644 index 000000000..1841622ea --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/configuration.py @@ -0,0 +1,84 @@ +# Copyright (c) 2023 Alibaba Cloud and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlenlp.transformers import PretrainedConfig + +__all__ = ["QWenConfig"] + + +class QWenConfig(PretrainedConfig): + model_type = "qwen" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + num_hidden_layers=32, + num_attention_heads=32, + emb_dropout_prob=0.0, + attn_dropout_prob=0.0, + layer_norm_epsilon=1e-6, + initializer_range=0.02, + max_position_embeddings=8192, + scale_attn_weights=True, + use_cache=True, + kv_channels=128, + rotary_pct=1.0, + rotary_emb_base=10000, + use_dynamic_ntk=True, + use_logn_attn=True, + intermediate_size=22016, + no_bias=True, + tie_word_embeddings=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + long_sequence_strategy_type=None, + long_sequence_strategy_name=None, + long_sequence_init_args=None, + use_long_sequence_strategies=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.emb_dropout_prob = emb_dropout_prob + self.attn_dropout_prob = attn_dropout_prob + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + self.max_position_embeddings = max_position_embeddings + self.kv_channels = kv_channels + + self.rotary_pct = rotary_pct + self.rotary_emb_base = rotary_emb_base + self.use_dynamic_ntk = use_dynamic_ntk + self.use_logn_attn = use_logn_attn + self.no_bias = no_bias + self.long_sequence_strategy_type = long_sequence_strategy_type + self.long_sequence_strategy_name = long_sequence_strategy_name + self.long_sequence_init_args = {} if long_sequence_init_args is None else long_sequence_init_args + self.use_long_sequence_strategies = use_long_sequence_strategies + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling.py new file mode 100644 
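
A minimal sketch of constructing the config above with made-up small sizes; unspecified arguments keep the defaults shown in __init__, and seq_length (used by the modeling code below) is assumed to be stored like any other extra keyword by PretrainedConfig.

cfg = QWenConfig(
    vocab_size=1024,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,
    kv_channels=32,          # per-head channels; attention projects to kv_channels * num_attention_heads
    intermediate_size=512,   # the MLP below splits this in half for its two projections
    seq_length=256,          # assumed extra kwarg, consumed by the modeling code below
)
assert cfg.use_dynamic_ntk and cfg.rotary_emb_base == 10000   # defaults carry over
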
index 000000000..88602e5d8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling.py @@ -0,0 +1,1192 @@ +# Copyright (c) 2023 Alibaba Cloud and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os +import warnings +from functools import partial +from typing import List + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.layers.mpu.random import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute +from paddle.utils import try_import + +try: + from paddle.incubate.nn.functional import swiglu +except ImportError: + + def swiglu(x, y=None): + if y is None: + x, y = paddle.chunk(x, chunks=2, axis=-1) + return F.silu(x) * y + + +from paddlenlp.transformers.long_sequence_strategies import LongSequenceStrategies +from paddlenlp.transformers.model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from paddlenlp.transformers.model_utils import PretrainedModel +from paddlenlp.utils.log import logger + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. 
import linear_utils +from ..linear_utils import Linear +from ..model_outputs import ModelOutput +from .configuration import QWenConfig + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +__all__ = [ + "QWenBlock", + "QWenForCausalLM", + "QWenLMHeadModel", + "QWenPretrainedModel", + "QWenModel", + "QWenLMHead", + "QWenPretrainingCriterion", +] + + +MAX_NTK_SEQ_LENGTH = 32768 + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except: + fused_rotary_position_embedding = None + + +def get_use_casual_mask(): + """Get the value of the 'USE_CASUAL_MASK' environment variable.""" + return os.getenv("USE_CASUAL_MASK", "False") == "True" + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if is_fleet_init and tensor_parallel_degree > 1 and y.is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +class QWenAttention(nn.Layer): + def __init__(self, config): + super().__init__() + + self.config = config + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + self.split_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + + self.scale_attn_weights = True + self.enable_recompute = False + self.recompute_granularity = config.recompute_granularity + + self.projection_size = config.kv_channels * config.num_attention_heads + + assert self.projection_size % config.num_attention_heads == 0 + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + + self.sequence_parallel = config.sequence_parallel + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + if config.num_attention_heads % config.tensor_parallel_degree != 0: + raise ValueError("num_attention_heads has to be divisible by tensor_parallel_degree") + self.num_heads = config.num_attention_heads // config.tensor_parallel_degree + self.c_attn = ColumnParallelLinear( + config.hidden_size, + 3 * 
self.projection_size, + has_bias=True, + gather_output=False, + ) + self.c_proj = RowParallelLinear( + config.hidden_size, + self.projection_size, + has_bias=not config.no_bias, + input_is_parallel=True, + ) + else: + self.c_attn = Linear(config.hidden_size, 3 * self.projection_size, bias_attr=True) + self.c_proj = Linear( + config.hidden_size, + self.projection_size, + bias_attr=not config.no_bias, + ) + + if config.rotary_pct == 1.0: + self.rotary_ndims = None + else: + assert config.rotary_pct < 1 + self.rotary_ndims = int(self.hidden_size_per_attention_head * config.rotary_pct) + dim = self.rotary_ndims if self.rotary_ndims is not None else self.hidden_size_per_attention_head + if config.use_long_sequence_strategies: + self.rotary_emb = LongSequenceStrategies.build_long_sequence_strategy( + config.long_sequence_strategy_type, + config.long_sequence_strategy_name, + **config.long_sequence_init_args, + ) + else: + self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base) + + self.use_dynamic_ntk = config.use_dynamic_ntk + self.use_logn_attn = config.use_logn_attn + + logn_list = [math.log(i, self.seq_length) if i > self.seq_length else 1 for i in range(1, MAX_NTK_SEQ_LENGTH)] + self.logn_tensor = paddle.to_tensor(logn_list)[None, :, None, None] + self._ntk_cached = 1.0 + + self.attn_dropout = nn.Dropout(config.attn_dropout_prob) + + def _attn(self, query, key, value, attention_mask=None): + # Support the flash attention and normal attention + bsz, q_len, num_heads, head_dim = query.shape + _, kv_seq_len, _, _ = value.shape + + if self.config.use_flash_attention and flash_attention is not None: + # Flash Attention now ignore attention mask + # Current Flash Attention doesn't support attn maskt + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query, + key, + value, + causal=query.shape[1] != 1, + dropout=self.config.attn_dropout_prob, + return_softmax=self.config.attn_dropout_prob > 0.0, + ) + else: + attn_output = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + + if self.sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return attn_output, attn_weights + else: + # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] + query = query.transpose([0, 2, 1, 3]) + # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] + key = key.transpose([0, 2, 1, 3]) + # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] + value = value.transpose([0, 2, 1, 3]) + + attn_weights = paddle.matmul(query / math.sqrt(head_dim), key.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + # If the attention mask is None, we need to construct the causal attention mask + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attn_weights = attn_weights + attention_mask + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(value.dtype) + + attn_weights = self.attn_dropout(attn_weights) + attn_output = paddle.matmul(attn_weights, value) + attn_output = attn_output.transpose([0, 2, 1, 3]) 
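
The eager-attention branch above builds its own additive causal mask when none is passed in; a small sketch of what get_triangle_upper_mask() contributes, with made-up shapes.

import paddle
import paddle.nn.functional as F

scores = paddle.zeros([1, 1, 4, 4])        # [bsz, n_head, q_len, kv_seq_len]
mask = get_triangle_upper_mask(scores)     # upper triangle filled with finfo(dtype).min
probs = F.softmax(scores + mask, axis=-1)
# row 0 attends only to position 0 (~[1, 0, 0, 0]); row 3 attends to all four
# positions (~[0.25, 0.25, 0.25, 0.25]), i.e. future positions are masked out
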
+ + if self.sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + new_shape = tensor.shape[:-1] + [num_heads, attn_head_size] + tensor = tensor.reshape(new_shape) + return tensor + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + position_ids=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + use_cache=False, + ): + # [bz, sql, hid] ==> [bz, sql, 3*hid] + mixed_x_layer = self.c_attn(hidden_states) + + if self.sequence_parallel: + target_shape = [-1, self.seq_length, self.num_heads * 3 * self.head_dim] + mixed_x_layer = paddle.reshape_(mixed_x_layer, target_shape) + + # [bz, sql, hid] ==> [bz, sql, nh, hdim] + query, key, value = paddle.split(mixed_x_layer, num_or_sections=3, axis=-1) + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + kv_seq_len = key.shape[-3] + if layer_past: + # layer past[0] shape: bs * seq_len * head_num * dim + kv_seq_len += layer_past[0].shape[1] + if self.use_dynamic_ntk and kv_seq_len == hidden_states.shape[1] and not self.training: + context_value = math.log(kv_seq_len / self.seq_length, 2) + 1 + ntk_alpha = 2 ** math.ceil(context_value) - 1 + ntk_alpha = max(ntk_alpha, 1) + self._ntk_cached = ntk_alpha + else: + ntk_alpha = self._ntk_cached + if self.config.use_long_sequence_strategies: + cos, sin = self.rotary_emb(seq_len=kv_seq_len, ntk_alpha=ntk_alpha) + rotary_pos_emb = (cos[None, :, None, :], sin[None, :, None, :]) + else: + rotary_pos_emb = self.rotary_emb(value, kv_seq_len, ntk_alpha=ntk_alpha) + + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = (rotary_pos_emb,) * 2 + + if rotary_pos_emb is not None: + cos, sin = rotary_pos_emb + if self.config.use_fused_rope: + query, key, _ = fused_rotary_position_embedding( + query, + key, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query, key = apply_rotary_pos_emb(query, key, cos, sin, position_ids=position_ids) + + if layer_past is not None: + past_key, past_value = layer_past[0], layer_past[1] + key = paddle.concat([past_key, key], axis=1) + value = paddle.concat([past_value, value], axis=1) + + if use_cache: + present = (key, value) + else: + present = None + + if self.use_logn_attn and not self.training: + if self.logn_tensor.dtype != query.dtype: + self.logn_tensor = self.logn_tensor.astype(query.dtype) + seq_start = key.shape[1] - query.shape[1] + seq_end = key.shape[1] + logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :] + query = query * logn_tensor.expand(query.shape) + + has_gradient = not (query.stop_gradient and key.stop_gradient and value.stop_gradient) + if self.enable_recompute and self.training and has_gradient and self.recompute_granularity == "core_attn": + attn_output, attn_weight = recompute( + self._attn, + query, + key, + value, + attention_mask, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + attn_output, attn_weight = self._attn(query, key, value, attention_mask) + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, 
num_head * head_dim], n is mp parallelism. + attn_output = self.c_proj(attn_output) + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weight,) + return outputs + + +class QWenMLP(nn.Layer): + def __init__(self, config): + super().__init__() + ff_dim_in = config.intermediate_size // 2 + self.fuse_attention_ffn = config.fuse_attention_ffn + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + if self.fuse_attention_ffn: + self.gate_up_fused_proj = ColumnParallelLinear( + config.hidden_size, + ff_dim_in * 2, + gather_output=False, + has_bias=False, + ) + else: + self.w1 = ColumnParallelLinear( + config.hidden_size, + ff_dim_in, + gather_output=False, + has_bias=False, + ) + self.w2 = ColumnParallelLinear( + config.hidden_size, + ff_dim_in, + gather_output=False, + has_bias=False, + ) + self.c_proj = RowParallelLinear( + ff_dim_in, + config.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + if self.fuse_attention_ffn: + self.gate_up_fused_proj = Linear(config.hidden_size, ff_dim_in * 2, bias_attr=not config.no_bias) + else: + self.w1 = Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) + self.w2 = Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) + self.c_proj = Linear(ff_dim_in, config.hidden_size, bias_attr=not config.no_bias) + + def forward(self, hidden_states): + # up + # a1 = self.w1(hidden_states) + # # gate + # a2 = self.w2(hidden_states) + # intermediate_parallel = a1 * F.silu(a2) + if self.fuse_attention_ffn: + intermediate_parallel = swiglu(self.gate_up_fused_proj(hidden_states)) + else: + intermediate_parallel = swiglu(self.w2(hidden_states), self.w1(hidden_states)) + output = self.c_proj(intermediate_parallel) + return output + + +class QWenBlock(nn.Layer): + def __init__(self, config): + super().__init__() + self.sequence_parallel = config.sequence_parallel + self.ln_1 = QWenRMSNorm(config) + self.attn = QWenAttention(config) + self.ln_2 = QWenRMSNorm(config) + self.mlp = QWenMLP(config) + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + position_ids=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + output_attentions=False, + ): + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + layernorm_output = self.ln_1(hidden_states) + + attn_outputs = self.attn( + layernorm_output, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + layernorm_input = attn_output + residual + + layernorm_output = self.ln_2(layernorm_input) + + residual = layernorm_input + mlp_output = self.mlp(layernorm_output) + hidden_states = residual + mlp_output + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + return outputs + + +class QWenPretrainedModel(PretrainedModel): + config_class = QWenConfig + base_model_prefix = "qwen" + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, 
**kwargs) + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_hidden_layers): + final_actions = {} + base_actions = { + # Column Linear + "lm_head.weight": partial(fn, is_column=True), + "qwen.h.0.attn.c_attn.weight": partial(fn, is_column=True, is_naive_3fuse=True), + "qwen.h.0.attn.c_attn.bias": partial(fn, is_column=True, is_naive_3fuse=True), + # Row Linear + "qwen.wte.weight": partial(fn, is_column=False), + "qwen.h.0.mlp.c_proj.weight": partial(fn, is_column=False), + "qwen.h.0.attn.c_proj.weight": partial(fn, is_column=False), + } + + if config.fuse_attention_ffn: + base_actions["layers.0.mlp.gate_up_fused_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + else: + base_actions["qwen.h.0.mlp.w2.weight"] = partial(fn, is_column=True) + base_actions["qwen.h.0.mlp.w1.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "h.0." in key: + for i in range(num_hidden_layers): + final_actions[key.replace("h.0.", f"h.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_name_mappings(cls, config: QWenConfig) -> List[StateDictNameMapping]: + mappings = [ + "wte.weight", + "ln_f.weight", + ] + + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"h.{layer_index}.ln_1.weight", + f"h.{layer_index}.ln_1.weight", + ], + [ + f"h.{layer_index}.attn.c_attn.weight", + f"h.{layer_index}.attn.c_attn.weight", + "transpose", + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"h.{layer_index}.attn.c_attn.bias", + ], + [ + f"h.{layer_index}.attn.c_proj.weight", + f"h.{layer_index}.attn.c_proj.weight", + "transpose", + ], + [ + f"h.{layer_index}.ln_2.weight", + f"h.{layer_index}.ln_2.weight", + ], + [ + f"h.{layer_index}.mlp.w1.weight", + f"h.{layer_index}.mlp.w1.weight", + "transpose", + ], + [ + f"h.{layer_index}.mlp.w2.weight", + f"h.{layer_index}.mlp.w2.weight", + "transpose", + ], + [ + f"h.{layer_index}.mlp.c_proj.weight", + f"h.{layer_index}.mlp.c_proj.weight", + "transpose", + ], + ] + mappings.extend(layer_mappings) + + init_name_mappings(mappings) + for mapping in mappings: + mapping[0] = "transformer." + mapping[0] + if len(mapping) > 1 and mapping[1] is not None: + mapping[1] = "qwen." 
+ mapping[1] + + if config.architectures is not None: + if "QWenForCausalLM" in config.architectures or "QWenLMHeadModel" in config.architectures: + mappings.extend( + [ + [ + "lm_head.weight", + "lm_head.weight", + "transpose", + ] + ] + ) + + init_name_mappings(mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] + + def _init_weights(self, module): + """Initialize the weights.""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + module, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.RowParallelLinear, + mpu.ColumnParallelLinear, + linear_utils.RowSequenceParallelLinear, + linear_utils.ColumnSequenceParallelLinear, + QWenLMHead, + ), + ): + if isinstance(module.weight, paddle.Tensor): + if module.weight.is_distributed: + with rng_tracker(): + module.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=module.weight.shape, + ) + ) + else: + module.weight.set_value( + paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=module.weight.shape) + ) + + for name, p in module.named_parameters(): + if name == "c_proj.weight": + p.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers), + shape=p.shape, + ) + ) + + +class QWenModel(QWenPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.vocab_size = config.vocab_size + self.num_hidden_layers = config.num_hidden_layers + self.embed_dim = config.hidden_size + self.enable_recompute = False + self.recompute_granularity = config.recompute_granularity + self.sequence_parallel = config.sequence_parallel + + if config.tensor_parallel_degree > 1: + self.wte = mpu.VocabParallelEmbedding( + self.vocab_size, + self.embed_dim, + ) + else: + self.wte = nn.Embedding(self.vocab_size, self.embed_dim) + + self.drop = nn.Dropout(config.emb_dropout_prob) + self.h = nn.LayerList( + [ + QWenBlock( + config, + ) + for i in range(config.num_hidden_layers) + ] + ) + self.ln_f = QWenRMSNorm(config) + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + @paddle.jit.not_to_static + def recompute_training( + self, + block, + hidden_states, + layer_past, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(block), + hidden_states, + layer_past, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states + + def get_masks(self, batch_size, seq_length, past_length, padding_mask=None): + # casual mask + casual_mask = paddle.tril(paddle.ones([batch_size, 1, seq_length, seq_length], dtype="bool")) + if past_length > 0: + casual_mask = paddle.concat( + [paddle.ones([batch_size, 1, seq_length, past_length], dtype="bool"), casual_mask], axis=-1 + ) + + # seq_mask + if padding_mask is None: + padding_mask = paddle.ones((batch_size, 1, seq_length, seq_length + past_length), dtype="bool") + if len(padding_mask.shape) == 2: + # from Tokenizer + padding_mask = ( + padding_mask.unsqueeze(axis=[1, 2]) + .expand([batch_size, 1, seq_length, 
seq_length + past_length]) + .astype("bool") + ) + elif len(padding_mask.shape) == 3: + # [batch_size,tgt_length, src_length] -> [batch_size, 1, tgt_length, src_length] + padding_mask = padding_mask.unsqueeze(1).astype("bool") + elif len(padding_mask.shape) == 4: + padding_mask = padding_mask.astype("bool") + + casual_mask = casual_mask & padding_mask + + return casual_mask + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + if self.sequence_parallel and use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape([-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].shape[1] + + encoder_attention_mask = None + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + hidden_states = inputs_embeds + use_casual_mask = get_use_casual_mask() + # bool 4D mask + if use_casual_mask is None: + attention_mask = None + else: + attention_mask = self.get_masks(input_shape[0], input_shape[1], past_length, padding_mask=attention_mask) + zero = paddle.zeros(attention_mask.shape, dtype=hidden_states.dtype) + neg_inf = paddle.full_like( + attention_mask, paddle.finfo(hidden_states.dtype).min, dtype=hidden_states.dtype + ) + # dtype 4D mask + attention_mask = paddle.where(attention_mask, zero, neg_inf) + + hidden_states = self.drop(hidden_states) + + if self.enable_recompute and self.training: + if use_cache: + logger.warning_once("`use_cache=True` is incompatible with recompute") + use_cache = False + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + has_gradient = not hidden_states.stop_gradient + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.enable_recompute and self.training and has_gradient and self.recompute_granularity == "full": + outputs = self.recompute_training( + block, + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + 
position_ids=position_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if use_cache is True: + presents = presents + (outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + + hidden_states = self.ln_f(hidden_states) + + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class QWenLMHead(nn.Layer): + def __init__(self, config: QWenConfig): + super(QWenLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output and self.config.tensor_parallel_degree > 1 + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class QWenPretrainingCriterion(paddle.nn.Layer): + """ + Criterion for Llama. + It calculates the final loss. 
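
A small usage sketch of the criterion defined here, assuming a made-up tiny config and a non-tensor-parallel setup; positions labelled with ignore_index (-100 by default) contribute no loss.

import paddle

cfg = QWenConfig(vocab_size=64, hidden_size=32, num_hidden_layers=1, num_attention_heads=2)
criterion = QWenPretrainingCriterion(cfg)
prediction_scores = paddle.randn([2, 8, cfg.vocab_size])
masked_lm_labels = paddle.full([2, 8], -100, dtype="int64")   # ignored everywhere ...
masked_lm_labels[:, :4] = 5                                   # ... except the first four positions
loss = criterion(prediction_scores, masked_lm_labels)         # mean over the labelled positions only
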
+ """ + + def __init__(self, config): + + super(QWenPretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0].astype("float32") + loss = paddle.mean(masked_lm_loss) + + return loss + + +class QWenForCausalLM(QWenPretrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] + + def __init__(self, config): + super().__init__(config) + self.qwen = QWenModel(config) + self.lm_head = QWenLMHead(config) + self.criterion = QWenPretrainingCriterion(config) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # Update the model inputs during generation. + # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` + # and they contain pad value, the result vectors updated by this method + # may be different from expected. In this case, you need to rewrite the + # method. 
+ + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["cache"] = outputs[1] + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, ModelOutput) and "past_key_values" in outputs: + model_kwargs["cache"] = outputs.past_key_values + model_kwargs["past_key_values"] = outputs.past_key_values + + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + # update attention_mask + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + if attention_mask is not None and len(attention_mask.shape) == 2: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + else: + model_kwargs["attention_mask"] = None + + return model_kwargs + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs): + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(-1) + if position_ids is not None: + position_ids = position_ids[:, -1].unsqueeze(-1) + + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "position_ids": position_ids, + } + ) + return model_inputs + + @staticmethod + def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype(paddle.int64) + else: + attention_mask = paddle.ones_like(input_ids, dtype=paddle.int64) + return attention_mask + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.qwen( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss = self.criterion(lm_logits, labels) + + # lm_logits = self.lm_head(hidden_states) + + # loss = None + # if labels is not None: + # loss_fct = nn.CrossEntropyLoss() + # loss = loss_fct(lm_logits, labels) + + if not return_dict: + output = 
(lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class RotaryEmbedding(nn.Layer): + def __init__(self, dim, base=10000): + super().__init__() + self.dim = dim + self.base = base + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._seq_len_cached = 0 + self._ntk_alpha_cached = 1.0 + + def update_cos_sin_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): + seqlen = max_seq_len + offset + if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: + base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) + self.inv_freq = 1.0 / (base ** (paddle.arange(0, self.dim, 2, dtype=paddle.float32) / self.dim)) + self._seq_len_cached = max(2 * seqlen, 16) + self._ntk_alpha_cached = ntk_alpha + seq = paddle.arange(self._seq_len_cached) + with paddle.amp.auto_cast(enable=False): + freqs = paddle.outer(seq.astype(self.inv_freq.dtype), self.inv_freq) + emb = paddle.concat([freqs, freqs], axis=-1) + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, max_seq_len, offset=0, ntk_alpha=1.0): + self.update_cos_sin_cache(max_seq_len, offset, ntk_alpha) + cos = self.cos_cached[:, offset : offset + max_seq_len, :, ...] + sin = self.sin_cached[:, offset : offset + max_seq_len, :, ...] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None): + + if position_ids is None: + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +class QWenRMSNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.eps = config.layer_norm_epsilon + self.weight = paddle.create_parameter( + shape=[config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def _norm(self, x): + return x * paddle.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + if self.config.use_fused_rms_norm: + return rms_norm_fused(x, self.weight, self.eps) + + output = self._norm(x.astype(paddle.float32)).astype(x.dtype) + return output * self.weight + + +QWenLMHeadModel = QWenForCausalLM diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_3D_auto.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_3D_auto.py new 
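
RotaryEmbedding above rebuilds its cos/sin cache with a larger effective base when QWenAttention passes a dynamic NTK factor; a small numeric sketch with made-up lengths (dim=128 matches the default kv_channels).

import math
import paddle

seq_length, kv_seq_len = 2048, 8192                        # config.seq_length vs. current context
context_value = math.log(kv_seq_len / seq_length, 2) + 1   # 3.0
ntk_alpha = max(2 ** math.ceil(context_value) - 1, 1)      # 7

rope = RotaryEmbedding(dim=128, base=10000)
value = paddle.zeros([1, kv_seq_len, 1, 128])
cos, sin = rope(value, kv_seq_len, ntk_alpha=ntk_alpha)    # each of shape [1, kv_seq_len, 1, 128]
# internally the base is rescaled to 10000 * ntk_alpha ** (dim / (dim - 2)),
# stretching the rotary period for contexts longer than seq_length
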
file mode 100644 index 000000000..5a4f9d4c5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_3D_auto.py @@ -0,0 +1,962 @@ +# Copyright (c) 2023 Alibaba Cloud and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from functools import partial +from typing import List + +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.utils import recompute +from paddle.utils import try_import + +from paddlenlp.transformers.model_outputs import BaseModelOutputWithPast +from paddlenlp.transformers.model_utils import PretrainedModel +from paddlenlp.utils.log import logger + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .configuration import QWenConfig + +__all__ = [ + "QWenBlockAuto", + "QWenForCausalLM3DAuto", + "QWenPretrainedModelAuto", + "QWenModelAuto", + "QWenLMHeadAuto", + "QWenPretrainingCriterionAuto", +] + + +MAX_NTK_SEQ_LENGTH = 32768 + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except: + fused_rotary_position_embedding = None + + +def get_mesh(pp_idx=0): + mesh = fleet.auto.get_mesh() + if "pp" in mesh.dim_names: + mesh = mesh.get_mesh_with_dim("pp")[pp_idx] + return mesh + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if is_fleet_init and tensor_parallel_degree > 1 and y.is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +attention_cnt = 0 + + +class QWenAttentionAuto(nn.Layer): + def __init__(self, config, ipp=None): + super().__init__() + + self.config = config + self.seq_length = config.seq_length + self.hidden_size = config.hidden_size + 
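
parallel_matmul() above (defined both here and in modeling.py) degrades to a plain matmul when no fleet hybrid-parallel group has been initialised; a quick sketch with made-up shapes, assuming a single-process run.

import paddle

x = paddle.randn([2, 4, 8])
w = paddle.randn([8, 16])
# outside a distributed launch, the try/except leaves is_fleet_init False,
# so this is equivalent to paddle.matmul(x, w)
logits = parallel_matmul(x, w, tensor_parallel_output=True)   # shape [2, 4, 16]
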
self.split_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + + self.scale_attn_weights = True + self.enable_recompute = config.use_recompute + self.recompute_granularity = config.recompute_granularity + + self.projection_size = config.kv_channels * config.num_attention_heads + + assert self.projection_size % config.num_attention_heads == 0 + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + + self.c_attn = nn.Linear(config.hidden_size, 3 * self.projection_size, bias_attr=True) + self.c_proj = nn.Linear(config.hidden_size, self.projection_size, bias_attr=not config.no_bias) + + if config.rotary_pct == 1.0: + self.rotary_ndims = None + else: + assert config.rotary_pct < 1 + self.rotary_ndims = int(self.hidden_size_per_attention_head * config.rotary_pct) + dim = self.rotary_ndims if self.rotary_ndims is not None else self.hidden_size_per_attention_head + self.rotary_emb = RotaryEmbedding(dim, base=config.rotary_emb_base) + + self.use_dynamic_ntk = config.use_dynamic_ntk + self.use_logn_attn = config.use_logn_attn + + logn_list = [math.log(i, self.seq_length) if i > self.seq_length else 1 for i in range(1, MAX_NTK_SEQ_LENGTH)] + self.logn_tensor = paddle.to_tensor(logn_list)[None, :, None, None] + self._ntk_cached = 1.0 + + self.attn_dropout = nn.Dropout(config.attn_dropout_prob) + self.ipp = ipp + global attention_cnt + self.attention_cnt = attention_cnt + attention_cnt += 1 + + def _attn(self, query, key, value, attention_mask=None): + # Support the flash attention and normal attention + bsz, q_len, num_heads, head_dim = query.shape + _, kv_seq_len, _, _ = value.shape + if self.config.use_flash_attention and flash_attention is not None: + # Flash Attention now ignore attention mask + # Current Flash Attention doesn't support attn maskt + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query, + key, + value, + causal=query.shape[1] != 1, + dropout=self.config.attn_dropout_prob, + return_softmax=self.config.attn_dropout_prob > 0.0, + ) + else: + attn_output = F.scaled_dot_product_attention( + query, + key, + value, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + attn_weights = None + return attn_output, attn_weights + else: + # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] + query = query.transpose([0, 2, 1, 3]) + # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] + key = key.transpose([0, 2, 1, 3]) + # [bz, sql, nh, hid] ==> [bz, nh, sql hdim] + value = value.transpose([0, 2, 1, 3]) + + attn_weights = paddle.matmul(query / math.sqrt(head_dim), key.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + # If the attention mask is None, we need to construct the causal attention mask + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attn_weights = attn_weights + attention_mask + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(value.dtype) + + attn_weights = self.attn_dropout(attn_weights) + attn_output = paddle.matmul(attn_weights, 
value) + attn_output = attn_output.transpose([0, 2, 1, 3]) + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + new_shape = tensor.shape[:-1] + [num_heads, attn_head_size] + tensor = tensor.reshape(new_shape) + return tensor + + def _merge_heads(self, tensor, num_heads, attn_head_size): + new_shape = tensor.shape[:-2] + [ + num_heads * attn_head_size, + ] + return tensor.reshape(new_shape) + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + position_ids=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=False, + use_cache=False, + ): + # # [bz, sql, hid] ==> [bz, sql, 3*hid] + mixed_x_layer = self.c_attn(hidden_states) + # [bz, sql, 3*hid] ==> [bz, sql, hid] + query, key, value = paddle.split(mixed_x_layer, num_or_sections=3, axis=-1) + + # [bz, sql, hid] ==> [bz, sql, nh, hdim] + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + kv_seq_len = hidden_states.shape[1] + if layer_past: + # layer past[0] shape: bs * seq_len * head_num * dim + kv_seq_len += layer_past[0].shape[1] + if self.use_dynamic_ntk and kv_seq_len == hidden_states.shape[1] and not self.training: + context_value = math.log(kv_seq_len / self.seq_length, 2) + 1 + ntk_alpha = 2 ** math.ceil(context_value) - 1 + ntk_alpha = max(ntk_alpha, 1) + self._ntk_cached = ntk_alpha + else: + ntk_alpha = self._ntk_cached + rotary_pos_emb = self.rotary_emb(value, kv_seq_len, ntk_alpha=ntk_alpha) + + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = (rotary_pos_emb,) * 2 + + if rotary_pos_emb is not None: + cos, sin = rotary_pos_emb + if self.config.use_fused_rope: + query, key, _ = fused_rotary_position_embedding( + query, + key, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + query, key = apply_rotary_pos_emb(query, key, cos, sin, position_ids=position_ids) + + if layer_past is not None: + past_key, past_value = layer_past[0], layer_past[1] + key = paddle.concat([past_key, key], axis=1) + value = paddle.concat([past_value, value], axis=1) + + if use_cache: + present = (key, value) + else: + present = None + + if self.use_logn_attn and not self.training: + if self.logn_tensor.dtype != query.dtype: + self.logn_tensor = self.logn_tensor.astype(query.dtype) + seq_start = key.shape[1] - query.shape[1] + seq_end = key.shape[1] + logn_tensor = self.logn_tensor[:, seq_start:seq_end, :, :] + query = query * logn_tensor.expand(query.shape) + + has_gradient = not (query.stop_gradient and key.stop_gradient and value.stop_gradient) + if self.enable_recompute and self.training and has_gradient and self.recompute_granularity == "core_attn": + attn_output, attn_weight = recompute( + self._attn, query, key, value, attention_mask, use_reentrant=self.config.recompute_use_reentrant + ) + else: + attn_output, attn_weight = self._attn(query, key, value, attention_mask) + context_layer = self._merge_heads(attn_output, self.num_heads, self.head_dim) + + attn_output = self.c_proj(context_layer) + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weight,) + + return outputs + + +class QWenMLPAuto(nn.Layer): + def __init__(self, config, ipp=None): + super().__init__() + ff_dim_in = config.intermediate_size // 2 + self.w1 = 
nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) + self.w2 = nn.Linear(config.hidden_size, ff_dim_in, bias_attr=not config.no_bias) + self.c_proj = nn.Linear(ff_dim_in, config.hidden_size, bias_attr=not config.no_bias) + self.ipp = ipp + + def forward(self, hidden_states): + # up + a1 = self.w1(hidden_states) + # gate + a2 = self.w2(hidden_states) + intermediate_parallel = a1 * F.silu(a2) + # down + output = self.c_proj(intermediate_parallel) + return output + + +class QWenBlockAuto(nn.Layer): + def __init__(self, config, ipp=None, idx=None): + super().__init__() + self.config = config + self.ln_1 = QWenRMSNormAuto(config) + self.attn = QWenAttentionAuto(config, ipp) + self.ln_2 = QWenRMSNormAuto(config) + self.mlp = QWenMLPAuto(config, ipp) + self.ipp = ipp + self.idx = idx + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + position_ids=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + output_attentions=False, + ): + layernorm_output = self.ln_1(hidden_states) + + attn_outputs = self.attn( + layernorm_output, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + residual = hidden_states + layernorm_input = attn_output + residual + + layernorm_output = self.ln_2(layernorm_input) + + residual = layernorm_input + mlp_output = self.mlp(layernorm_output) + hidden_states = residual + mlp_output + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + return outputs + + +class QWenPretrainedModelAuto(PretrainedModel): + config_class = QWenConfig + base_model_prefix = "qwen" + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_hidden_layers): + final_actions = {} + base_actions = { + # Column Linear + "lm_head.weight": partial(fn, is_column=True), + "qwen.h.0.mlp.w2.weight": partial(fn, is_column=True), + "qwen.h.0.mlp.w1.weight": partial(fn, is_column=True), + "qwen.h.0.attn.c_attn.weight": partial(fn, is_column=True, is_naive_3fuse=True), + "qwen.h.0.attn.c_attn.bias": partial(fn, is_column=True, is_naive_3fuse=True), + # Row Linear + "qwen.wte.weight": partial(fn, is_column=False), + "qwen.h.0.mlp.c_proj.weight": partial(fn, is_column=False), + "qwen.h.0.attn.c_proj.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + if "h.0." 
in key: + for i in range(num_hidden_layers): + final_actions[key.replace("h.0.", f"h.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_name_mappings(cls, config: QWenConfig) -> List[StateDictNameMapping]: + mappings = [ + "wte.weight", + "ln_f.weight", + ] + + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"h.{layer_index}.ln_1.weight", + f"h.{layer_index}.ln_1.weight", + ], + [ + f"h.{layer_index}.attn.c_attn.weight", + f"h.{layer_index}.attn.c_attn.weight", + "transpose", + ], + [ + f"h.{layer_index}.attn.c_attn.bias", + f"h.{layer_index}.attn.c_attn.bias", + ], + [ + f"h.{layer_index}.attn.c_proj.weight", + f"h.{layer_index}.attn.c_proj.weight", + "transpose", + ], + [ + f"h.{layer_index}.ln_2.weight", + f"h.{layer_index}.ln_2.weight", + ], + [ + f"h.{layer_index}.mlp.w1.weight", + f"h.{layer_index}.mlp.w1.weight", + "transpose", + ], + [ + f"h.{layer_index}.mlp.w2.weight", + f"h.{layer_index}.mlp.w2.weight", + "transpose", + ], + [ + f"h.{layer_index}.mlp.c_proj.weight", + f"h.{layer_index}.mlp.c_proj.weight", + "transpose", + ], + ] + mappings.extend(layer_mappings) + + init_name_mappings(mappings) + for mapping in mappings: + mapping[0] = "transformer." + mapping[0] + if len(mapping) > 1 and mapping[1] is not None: + mapping[1] = "qwen." + mapping[1] + + if config.architectures is not None: + if "QWenForCausalLM" in config.architectures or "QWenLMHeadModel" in config.architectures: + mappings.extend( + [ + [ + "lm_head.weight", + "lm_head.weight", + "transpose", + ] + ] + ) + + init_name_mappings(mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance( + module, + ( + nn.Linear, + nn.Embedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + mpu.VocabParallelEmbedding, + QWenLMHeadAuto, + ), + ): + module.weight.set_value( + paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=module.weight.shape) + ) + if getattr(module, "bias", None) is not None: + module.weight.set_value(paddle.zeros(shape=module.weight.shape, dtype=paddle.get_default_dtype())) + + for name, p in module.named_parameters(): + if name == "c_proj.weight": + p.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range / math.sqrt(2 * self.config.num_hidden_layers), + shape=p.shape, + ) + ) + + +class QWenModelAuto(QWenPretrainedModelAuto): + def __init__(self, config): + super().__init__(config) + self.config = config + self.vocab_size = config.vocab_size + self.num_hidden_layers = config.num_hidden_layers + self.embed_dim = config.hidden_size + self.enable_recompute = config.use_recompute + self.recompute_granularity = config.recompute_granularity + + self.wte = nn.Embedding(self.vocab_size, self.embed_dim) + + self.drop = nn.Dropout(config.emb_dropout_prob) + + def get_layer_ipp(layer_index): + mesh = fleet.auto.get_mesh() + if "pp" not in mesh.dim_names: + return None + else: + pp_degree = mesh.get_dim_size("pp") + layer_per_stage = math.ceil(config.num_hidden_layers / pp_degree) + return layer_index // layer_per_stage + + self.h = nn.LayerList( + [ + QWenBlockAuto( + config, + get_layer_ipp(i), + i, + ) + for i in range(config.num_hidden_layers) + ] + ) + self.ln_f = QWenRMSNormAuto(config) + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, 
new_embeddings): + self.wte = new_embeddings + + @paddle.jit.not_to_static + def recompute_training( + self, + block, + hidden_states, + layer_past, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(block), + hidden_states, + layer_past, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + use_cache, + output_attentions, + use_reentrant=self.config.recompute_use_reentrant, + ) + return hidden_states + + def get_masks(self, batch_size, seq_length, past_length, dtype, padding_mask=None): + # casual mask + casual_mask = paddle.tril(paddle.ones([batch_size, 1, seq_length, seq_length], dtype="bool")) + if past_length > 0: + casual_mask = paddle.concat( + [paddle.ones([batch_size, 1, seq_length, past_length], dtype="bool"), casual_mask], axis=-1 + ) + + # seq_mask + if padding_mask is None: + padding_mask = paddle.ones((batch_size, 1, seq_length, seq_length + past_length), dtype="bool") + if len(padding_mask.shape) == 2: + # from Tokenizer + padding_mask = ( + padding_mask.unsqueeze(axis=[1, 2]) + .expand([batch_size, 1, seq_length, seq_length + past_length]) + .astype("bool") + ) + elif len(padding_mask.shape) == 3: + # [batch_size,tgt_length, src_length] -> [batch_size, 1, tgt_length, src_length] + padding_mask = padding_mask.unsqueeze(1).astype("bool") + elif len(padding_mask.shape) == 4: + padding_mask = padding_mask.astype("bool") + + casual_mask = casual_mask & padding_mask + + return casual_mask + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape([-1, input_shape[-1]]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].shape[1] + + encoder_attention_mask = None + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + hidden_states = inputs_embeds + + # bool 4D mask + attention_mask = self.get_masks( + input_shape[0], input_shape[1], past_length, dtype=hidden_states.dtype, padding_mask=attention_mask + ) + # TODO(GhostScreaming): how to fix paddle.finfo? 
+ zero = paddle.zeros(attention_mask.shape, dtype=paddle.bfloat16) + neg_inf = paddle.full_like(attention_mask, paddle.finfo(paddle.bfloat16).min, dtype=paddle.bfloat16) + # dtype 4D mask + attention_mask = paddle.where(attention_mask, zero, neg_inf) + + hidden_states = self.drop(hidden_states) + hidden_states = dist.reshard(hidden_states, get_mesh(), [dist.Shard(0), dist.Replicate()]) + output_shape = input_shape + [ + hidden_states.shape[-1], + ] + + if self.enable_recompute and self.training: + if use_cache: + logger.warning_once("`use_cache=True` is incompatible with recompute") + use_cache = False + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + pre_ipp = 0 + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + has_gradient = not hidden_states.stop_gradient + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if block.ipp is not None and pre_ipp != block.ipp: + hidden_states = dist.reshard( + hidden_states, + get_mesh(block.ipp), + [dist.Shard(0), dist.Replicate()], + ) + if position_ids is not None: + position_ids = dist.reshard( + position_ids, + get_mesh(block.ipp), + [dist.Shard(0), dist.Replicate()], + ) + if attention_mask is not None: + attention_mask = dist.reshard( + attention_mask, + get_mesh(block.ipp), + [dist.Shard(0), dist.Replicate()], + ) + if self.enable_recompute and self.training and has_gradient and self.recompute_granularity == "full": + outputs = self.recompute_training( + block, + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + pre_ipp = block.ipp + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if use_cache is True: + presents = presents + (outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + + hidden_states = self.ln_f(hidden_states) + hidden_states = hidden_states.reshape(output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class QWenLMHeadAuto(nn.Layer): + def __init__(self, config: QWenConfig): + super(QWenLMHeadAuto, self).__init__() + self.config = config + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + + def forward(self, hidden_states, tensor_parallel_output=None): + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = paddle.matmul(hidden_states, self.weight, transpose_y=False) + return logits + + +loss_cnt = 0 + + +class 
QWenPretrainingCriterionAuto(paddle.nn.Layer): + """ + Criterion for Llama. + It calculates the final loss. + """ + + def __init__(self, config): + + super(QWenPretrainingCriterionAuto, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + global loss_cnt + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + # skip ignore_index which loss == 0 + masked_lm_loss = paddle.masked_select(masked_lm_loss, masked_lm_loss > 0).astype("float32") + loss = paddle.mean(masked_lm_loss) + + loss_cnt += 1 + return loss + + +class QWenForCausalLM3DAuto(QWenPretrainedModelAuto): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.rotary_emb\.inv_freq"] + + def __init__(self, config): + super().__init__(config) + self.qwen = QWenModelAuto(config) + self.lm_head = QWenLMHeadAuto(config) + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + position_ids=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.qwen( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + lm_logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + return lm_logits + + +class RotaryEmbedding(nn.Layer): + def __init__(self, dim, base=10000): + super().__init__() + self.dim = dim + self.base = base + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._seq_len_cached = 0 + self._ntk_alpha_cached = 1.0 + + def update_cos_sin_cache(self, max_seq_len, offset=0, ntk_alpha=1.0): + seqlen = max_seq_len + offset + if seqlen > self._seq_len_cached or ntk_alpha != self._ntk_alpha_cached: + base = self.base * ntk_alpha ** (self.dim / (self.dim - 2)) + self.inv_freq = 1.0 / (base ** (paddle.arange(0, self.dim, 2, dtype=paddle.float32) / self.dim)) + self._seq_len_cached = max(2 * seqlen, 16) + self._ntk_alpha_cached = ntk_alpha + seq = paddle.arange(self._seq_len_cached) + with 
paddle.amp.auto_cast(enable=False): + freqs = paddle.outer(seq.astype(self.inv_freq.dtype), self.inv_freq) + emb = paddle.concat([freqs, freqs], axis=-1) + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, max_seq_len, offset=0, ntk_alpha=1.0): + self.update_cos_sin_cache(max_seq_len, offset, ntk_alpha) + cos = self.cos_cached[:, offset : offset + max_seq_len, :, ...] + sin = self.sin_cached[:, offset : offset + max_seq_len, :, ...] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None): + if position_ids is None: + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def rms_norm_fused(x_in, w, eps): + fused_ln = try_import("fused_ln") + return fused_ln.fused_rms_norm(x_in, w, eps)[0] + + +class QWenRMSNormAuto(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.eps = config.layer_norm_epsilon + self.weight = paddle.create_parameter( + shape=[config.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + + def forward(self, x): + if self.config.use_fused_rms_norm: + return rms_norm_fused(x, self.weight, self.eps) + with paddle.amp.auto_cast(False): + variance = x.astype("float32").pow(2).mean(-1, keepdim=True) + output = paddle.rsqrt(variance + self.eps) * x + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + output = paddle.cast(output, self.weight.dtype) + return output * self.weight diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_pp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_pp.py new file mode 100644 index 000000000..47357d692 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/modeling_pp.py @@ -0,0 +1,207 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
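# ---------------------------------------------------------------------------
# Editorial sketch (not part of the patch): a minimal, self-contained
# illustration of the rotary-position-embedding math implemented by the
# RotaryEmbedding / rotate_half / apply_rotary_pos_emb helpers in
# modeling_3D_auto.py above. Shapes follow the comments in that file:
# q is [bs, seq_len, num_heads, head_dim] and the cos/sin caches are
# [1, seq_len, 1, head_dim]. All sizes below are toy values chosen only
# for demonstration.
# ---------------------------------------------------------------------------
import paddle

bs, seq_len, num_heads, head_dim = 2, 8, 4, 16
q = paddle.randn([bs, seq_len, num_heads, head_dim])

# Build the cos/sin cache the same way RotaryEmbedding does (base = 10000).
inv_freq = 1.0 / (10000.0 ** (paddle.arange(0, head_dim, 2, dtype="float32") / head_dim))
t = paddle.arange(seq_len, dtype="float32")
freqs = paddle.outer(t, inv_freq)              # [seq_len, head_dim // 2]
emb = paddle.concat([freqs, freqs], axis=-1)   # [seq_len, head_dim]
cos = emb.cos()[None, :, None, :]              # [1, seq_len, 1, head_dim]
sin = emb.sin()[None, :, None, :]

# rotate_half: map (x1, x2) -> (-x2, x1) along the last dimension.
x1, x2 = q[..., : head_dim // 2], q[..., head_dim // 2 :]
q_rotated = paddle.concat([-x2, x1], axis=-1)

# Same combination apply_rotary_pos_emb performs when position_ids is None.
q_embed = q * cos + q_rotated * sin
assert q_embed.shape == q.shape
# ---------------------------------------------------------------------------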
+ +import paddle +import paddle.distributed.fleet as fleet +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer + +from paddlenlp.transformers.model_utils import PipelinePretrainedModel + +from .modeling import ( + QWenBlock, + QWenConfig, + QWenLMHead, + QWenModel, + QWenPretrainedModel, + QWenPretrainingCriterion, + QWenRMSNorm, +) + +__all__ = [ + "QWenForCausalLMPipe", +] + + +def parse_args(args): + if isinstance(args, tuple): + if len(args) == 3: + hidden_states, attention_mask, position_ids = args + elif len(args) == 2: + hidden_states, attention_mask = args + position_ids = None + elif len(args) == 1: + hidden_states = args + attention_mask, position_ids = None, None + else: + hidden_states = args + attention_mask, position_ids = None, None + + if position_ids is not None: + position_ids.stop_gradient = True + + if attention_mask is not None: + attention_mask.stop_gradient = True + + return hidden_states, attention_mask, position_ids + + +def return_args(hidden_states, attention_mask=None, position_ids=None): + ret = (hidden_states,) + + if attention_mask is not None: + ret += (attention_mask.clone(),) + if position_ids is not None: + ret += (position_ids.clone(),) + if len(ret) == 1: + ret = ret[0] + + return ret + + +class QWenEmbeddingPipe(nn.Layer): + """Extends QWenEmbeddings to forward attention_mask through the pipeline.""" + + def __init__(self, config): + super(QWenEmbeddingPipe, self).__init__() + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + if config.tensor_parallel_degree > 1: + self.wte = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.wte = nn.Embedding(config.vocab_size, config.hidden_size) + + def forward(self, args): + """_summary_ + + Args: + input (_type_): _description_ + + Returns: + _type_: _description_ + """ + input_ids, attention_mask, position_ids = parse_args(args) + input_embeds = self.wte(input_ids) + if self.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) + + batch_size, seq_length = input_ids.shape + if attention_mask is not None: + attention_mask = QWenModel._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + + return return_args(input_embeds, attention_mask, position_ids) + + +class QWenBlockPipe(QWenBlock): + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + hidden_states = super().forward(hidden_states, attention_mask=attention_mask) + return return_args(hidden_states, attention_mask, position_ids) + + +class QWenRMSNormPipe(QWenRMSNorm): + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + return super().forward(hidden_states) + + +class QWenForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): + """QWenForPretraining adapted for pipeline parallelism. + + The largest change is flattening the QWenModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. 
+ """ + + config_class = QWenConfig + + _get_tensor_parallel_mappings = QWenPretrainedModel._get_tensor_parallel_mappings + _init_weights = QWenPretrainedModel._init_weights + _keys_to_ignore_on_load_unexpected = QWenPretrainedModel._keys_to_ignore_on_load_unexpected + + # DONOT Add base_model_prefix !!!! + + def __init__(self, config): + self.config = config + + self.recompute = self.config.recompute + self.recompute_granularity = self.config.recompute_granularity + self.pp_recompute_interval = self.config.pp_recompute_interval + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + if self.recompute_granularity == "full": + assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support" + + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + + def get_hcg(): + return fleet.get_hybrid_communicate_group() + + hcg = get_hcg() + tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) + tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) + + # TODO: fix tensor_parallel_degree rewrite in here + config.tensor_parallel_degree = tensor_parallel_degree + config.tensor_parallel_rank = tensor_parallel_rank + + self.add_sequential_layer(LayerDesc(QWenEmbeddingPipe, config=config), "qwen") + for i in range(config.num_hidden_layers): + self.add_sequential_layer( + LayerDesc(QWenBlockPipe, config=config), + f"qwen.h.{i}", + ) + self.add_sequential_layer(LayerDesc(QWenRMSNormPipe, config=config), "qwen.ln_f") + self.add_sequential_layer(LayerDesc(QWenLMHead, config=config), "lm_head") + + recompute_interval = 0 + if self.recompute and self.recompute_granularity == "full": + assert self.config.pp_recompute_interval <= config.num_hidden_layers // ( + virtual_pp_degree * get_hcg().topology().get_dim_size("pipe") + ), "pp recompute interval should smaller than num layers of each pp chunk" + recompute_interval = self.config.pp_recompute_interval + + seg_method = "layer:QWenBlock" + if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: + seg_method = "uniform" + + PipelineLayer.__init__( + self, + layers=self.get_sequential_layers(), + loss_fn=QWenPretrainingCriterion(config), + topology=get_hcg().topology(), + seg_method=seg_method, + recompute_interval=recompute_interval, + recompute_ctx={ + "mp_group": get_hcg().get_model_parallel_group(), + "offload": False, + "partition": False, + }, + num_virtual_pipeline_stages=virtual_pp_degree, + ) + # You should call init here, since there is a diamond inheritance problem + self.apply(self._init_weights) + # DON'T init PipelinePretrainedModel + # PipelinePretrainedModel.__init__(self.super(), config=config) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/tokenizer.py new file mode 100644 index 000000000..16e881ef7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen/tokenizer.py @@ -0,0 +1,308 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for QWen.""" + +import base64 +import os +import unicodedata +from typing import Collection, Dict, List, Optional, Set, Tuple, Union + +import numpy as np + +from ...utils.import_utils import is_tiktoken_available +from .. import PretrainedTokenizer +from ..tokenizer_utils_base import ( + AddedToken, + BatchEncoding, + EncodedInput, + PaddingStrategy, +) + +__all__ = ["QWenTokenizer"] + + +VOCAB_FILES_NAMES = {"vocab_file": "qwen.tiktoken"} + +PAT_STR = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" +ENDOFTEXT = "<|endoftext|>" +IMSTART = "<|im_start|>" +IMEND = "<|im_end|>" +# as the default behavior is changed to allow special tokens in +# regular texts, the surface forms of special tokens need to be +# as different as possible to minimize the impact +EXTRAS = tuple((f"<|extra_{i}|>" for i in range(205))) +SPECIAL_TOKENS = ( + ENDOFTEXT, + IMSTART, + IMEND, +) + EXTRAS + +tiktoken = None + + +def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]: + with open(tiktoken_bpe_file, "rb") as f: + contents = f.read() + return { + base64.b64decode(token): int(rank) for token, rank in (line.split() for line in contents.splitlines() if line) + } + + +class QWenTokenizer(PretrainedTokenizer): + """QWen tokenizer.""" + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + resource_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + errors="replace", + padding_side="left", + **kwargs, + ): + super().__init__(**kwargs) + if not is_tiktoken_available(): + raise ValueError("tiktoken is not installed, please install it use: pip install tiktoken") + + import tiktoken as tk + + tiktoken = tk + + self.errors = errors # how to handle errors in decoding + + self.mergeable_ranks = _load_tiktoken_bpe(vocab_file) # type: dict[bytes, int] + self.special_tokens = { + token: index for index, token in enumerate(SPECIAL_TOKENS, start=len(self.mergeable_ranks)) + } + + enc = tiktoken.Encoding( + "Qwen", + pat_str=PAT_STR, + mergeable_ranks=self.mergeable_ranks, + special_tokens=self.special_tokens, + ) + assert ( + len(self.mergeable_ranks) + len(self.special_tokens) == enc.n_vocab + ), f"{len(self.mergeable_ranks) + len(self.special_tokens)} != {enc.n_vocab} in encoding" + + self.decoder = {v: k for k, v in self.mergeable_ranks.items()} # type: dict[int, bytes|str] + self.decoder.update({v: k for k, v in self.special_tokens.items()}) + + self.tokenizer = enc # type: tiktoken.Encoding + + self.eod_id = self.tokenizer.eot_token + self.im_start_id = self.special_tokens[IMSTART] + self.im_end_id = self.special_tokens[IMEND] + + if "pad_token_id" in kwargs: + self.pad_token_id = kwargs["pad_token_id"] + if "eos_token_id" in kwargs: + self.eos_token_id = kwargs["eos_token_id"] + + def __len__(self) -> int: + return self.tokenizer.n_vocab + + def get_vocab(self) -> Dict[bytes, int]: + return self.mergeable_ranks + + def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> List[int]: + ids = [] + if isinstance(tokens, (str, bytes)): + if 
tokens in self.special_tokens: + return self.special_tokens[tokens] + else: + return self.mergeable_ranks.get(tokens) + for token in tokens: + if token in self.special_tokens: + ids.append(self.special_tokens[token]) + else: + ids.append(self.mergeable_ranks.get(token)) + return ids + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + if not special_tokens and new_tokens: + raise ValueError("Adding regular tokens is not supported") + for token in new_tokens: + surface_form = token.content if isinstance(token, AddedToken) else token + if surface_form not in SPECIAL_TOKENS: + raise ValueError("Adding unknown special tokens is not supported") + return 0 + + def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary). + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + file_path = os.path.join(save_directory, "qwen.tiktoken") + with open(file_path, "w", encoding="utf8") as w: + for k, v in self.mergeable_ranks.items(): + line = base64.b64encode(k).decode("utf8") + " " + str(v) + "\n" + w.write(line) + return (file_path,) + + def tokenize( + self, + text: str, + allowed_special: Union[Set, str] = "all", + disallowed_special: Union[Collection, str] = (), + **kwargs, + ) -> List[Union[bytes, str]]: + """ + Converts a string in a sequence of tokens. + + Args: + text (`str`): + The sequence to be encoded. + allowed_special (`Literal["all"]` or `set`): + The surface forms of the tokens to be encoded as special tokens in regular texts. + Default to "all". + disallowed_special (`Literal["all"]` or `Collection`): + The surface forms of the tokens that should not be in regular texts and trigger errors. + Default to an empty tuple. + + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific encode method. + + Returns: + `List[bytes|str]`: The list of tokens. + """ + tokens = [] + text = unicodedata.normalize("NFC", text) + + # this implementation takes a detour: text -> token id -> token surface forms + for t in self.tokenizer.encode(text, allowed_special=allowed_special, disallowed_special=disallowed_special): + tokens.append(self.decoder[t]) + return tokens + + def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str: + """ + Converts a sequence of tokens in a single string. + """ + text = "" + temp = b"" + for t in tokens: + if isinstance(t, str): + if temp: + text += temp.decode("utf-8", errors=self.errors) + temp = b"" + text += t + elif isinstance(t, bytes): + temp += t + else: + raise TypeError("token should only be of type types or str") + if temp: + text += temp.decode("utf-8", errors=self.errors) + return text + + @property + def vocab_size(self): + return self.tokenizer.n_vocab + + def _convert_id_to_token(self, index: int) -> Union[bytes, str]: + """Converts an id to a token, special tokens included""" + if index in self.decoder: + return self.decoder[index] + raise ValueError("unknown ids") + + def _convert_token_to_id(self, token: Union[bytes, str]) -> int: + """Converts a token to an id using the vocab, special tokens included""" + if token in self.special_tokens: + return self.special_tokens[token] + if token in self.mergeable_ranks: + return self.mergeable_ranks[token] + raise ValueError("unknown token") + + def _tokenize(self, text: str, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. 
Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: str = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + if skip_special_tokens: + token_ids = [i for i in token_ids if i < self.eod_id] + return self.tokenizer.decode(token_ids, errors=errors or self.errors) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/__init__.py new file mode 100644 index 000000000..b79ab9e6b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .configuration import * +from .modeling import * +from .modeling_pp import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/configuration.py new file mode 100644 index 000000000..e6ab72a29 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/configuration.py @@ -0,0 +1,158 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Qwen2 model configuration""" + +from ..configuration_utils import PretrainedConfig + +__all__ = [ + "Qwen2Config", +] + + +class Qwen2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a + Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22016): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import Qwen2Model, Qwen2Config + + >>> # Initializing a Qwen2 style configuration + >>> configuration = Qwen2Config() + + >>> # Initializing a model from the Qwen2-7B style configuration + >>> model = Qwen2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + seq_length=32768, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + pad_token_id=0, + bos_token_id=151643, + eos_token_id=151643, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=28, + attention_dropout=0.0, + rope_scaling_factor=1.0, + rope_scaling_type=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + self.use_cache = use_cache + self.rope_scaling_factor = rope_scaling_factor + self.rope_scaling_type = rope_scaling_type + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling.py new file mode 100644 index 000000000..81eb6addc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling.py @@ -0,0 +1,1555 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Paddle Qwen2 model.""" +from __future__ import annotations + +import math +import warnings +from functools import partial +from typing import List, Optional, Tuple, Union + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +from .. import linear_utils +from ..activations import ACT2FN +from ..conversion_utils import StateDictNameMapping, init_name_mappings +from ..linear_utils import Linear +from ..model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ..model_utils import PretrainedModel, register_base_model +from .configuration import Qwen2Config + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except: + pass + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + + +__all__ = [ + "Qwen2Model", + "Qwen2PretrainedModel", + "Qwen2ForCausalLM", + "Qwen2PretrainingCriterion", + "Qwen2ForSequenceClassification", + "Qwen2ForTokenClassification", +] + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in range(num_heads_per_card): + assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. 
+ else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def parallel_matmul(x: Tensor, y: Tensor, transpose_y=True, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=transpose_y) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=transpose_y) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + training=True, + sequence_parallel=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=config.attention_dropout if training else 0.0, + training=training, + ) + attn_weights = None + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next transpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and divide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + 
attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + else: + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_weights = F.dropout(attn_weights, p=config.attention_dropout, training=training) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. + """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class Qwen2RMSNorm(nn.Layer): + def __init__(self, config: Qwen2Config): + """ + Qwen2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +class Qwen2RotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + 
self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len) + cos = self.cos_cached[:, :seq_len, :, :] + sin = self.sin_cached[:, :seq_len, :, :] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + if position_ids is None: + # Note: Only for Qwen2MoEForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Qwen2MLP(nn.Layer): + def __init__(self, config: Qwen2Config, is_shared=False): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + + self.tensor_parallel_degree = config.tensor_parallel_degree + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w1 + self.up_proj = Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w3 + self.down_proj = Linear(self.intermediate_size, self.hidden_size, bias_attr=False) # w2 + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +class Qwen2Attention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". + """ + + def __init__(self, config: Qwen2Config, layerwise_recompute: bool = True): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + self.seq_length = config.seq_length + self.sequence_parallel = config.sequence_parallel + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.q_proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, has_bias=True, gather_output=False) + self.k_proj = ColumnParallelLinear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False) # fmt:skip + self.v_proj = ColumnParallelLinear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False) # fmt:skip + self.o_proj = RowParallelLinear(self.hidden_size, self.hidden_size, has_bias=False, input_is_parallel=True) + else: + self.q_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=True) + self.k_proj = Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.v_proj = Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.o_proj = Linear(self.hidden_size, self.hidden_size, bias_attr=False) + + self.rotary_emb = Qwen2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + past_key_value = (key_states, value_states) if use_cache else None + + # TODO(wj-Mcat): use broadcast strategy when 
n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class Qwen2DecoderLayer(nn.Layer): + def __init__(self, config: Qwen2Config, layerwise_recompute: bool = False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.self_attn = Qwen2Attention(config, layerwise_recompute) + + self.mlp = Qwen2MLP(config) + self.input_layernorm = Qwen2RMSNorm(config) + self.post_attention_layernorm = Qwen2RMSNorm(config) + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class Qwen2PretrainedModel(PretrainedModel): + config_class = Qwen2Config + base_model_prefix = "qwen2" + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: Qwen2Config) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.q_proj.bias", None], + [f"layers.{layer_index}.self_attn.k_proj.bias", None], + [f"layers.{layer_index}.self_attn.v_proj.bias", None], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "Qwen2MoEModel" + if "Qwen2Model" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "qwen2." 
+ mapping[1] + if not config.tie_word_embeddings: + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: Qwen2Config, is_split=True): + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } + if config.tie_word_embeddings: + base_actions["lm_head.weight"] = partial(fn, is_column=False) + else: + base_actions["lm_head.weight"] = partial(fn, is_column=True) + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + + # Column Linear + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.down_proj.weight"] = partial(fn, is_column=False) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.RowParallelLinear, + mpu.ColumnParallelLinear, + linear_utils.RowSequenceParallelLinear, + linear_utils.ColumnSequenceParallelLinear, + Qwen2LMHead, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
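+ # Weights are re-drawn from a normal distribution with std = initializer_range;
+ # distributed (tensor-parallel) weights are sampled inside the tracked RNG
+ # state so each rank initializes its own shard.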
+ if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.qwen2.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.qwen2.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if hasattr(layer, "bias") and isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, Qwen2MLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, Qwen2Attention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class Qwen2Model(Qwen2PretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`] + + Args: + config: Qwen2Config + """ + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [ + Qwen2DecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = Qwen2RMSNorm(config) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = 
_make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + past_key_value: Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids: paddle.Tensor = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states # fmt:skip + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + if inputs_embeds is None: + # [bs, seq_len, dim] + inputs_embeds = self.embed_tokens(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + if 
self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual: + attention_mask = None + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + past_key_value, + use_cache, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Qwen2PretrainingCriterion(nn.Layer): + """ + Criterion for Mixtral. + It calculates the final loss. 
+ """ + + def __init__(self, config: Qwen2Config): + super(Qwen2PretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splitted: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class Qwen2LMHead(nn.Layer): + def __init__(self, config: Qwen2Config, embedding_weights=None, transpose_y=False): + super(Qwen2LMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.transpose_y = transpose_y + if transpose_y: + if embedding_weights is not None: + self.weight = embedding_weights + else: + self.weight = self.create_parameter( + shape=[vocab_size, config.hidden_size], + dtype=paddle.get_default_dtype(), + ) + else: + if vocab_size != config.vocab_size: + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + else: + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + + # Must set distributed attr for Tensor Parallel ! 
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + # for tie_word_embeddings + self.weight.split_axis = 0 if self.transpose_y else 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul( + hidden_states, self.weight, transpose_y=self.transpose_y, tensor_parallel_output=tensor_parallel_output + ) + return logits + + +class Qwen2ForCausalLM(Qwen2PretrainedModel): + enable_to_static_method = True + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.qwen2 = Qwen2Model(config) + if config.tie_word_embeddings: + self.lm_head = Qwen2LMHead(config, embedding_weights=self.qwen2.embed_tokens.weight, transpose_y=True) + self.tie_weights() + else: + self.lm_head = Qwen2LMHead(config) + self.criterion = Qwen2PretrainingCriterion(config) + self.vocab_size = config.vocab_size + + def get_input_embeddings(self): + return self.qwen2.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.qwen2 = decoder + + def get_decoder(self): + return self.qwen2 + + def prepare_inputs_for_generation( + self, input_ids, use_cache=False, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, CausalLMOutputWithPast) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if 
not is_encoder_decoder and "attention_mask" in model_kwargs: + # TODO: support attention mask for other models + attention_mask = model_kwargs["attention_mask"] + if len(attention_mask.shape) == 2: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], + axis=-1, + ) + elif len(attention_mask.shape) == 4: + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([*attention_mask.shape[:3], 1], dtype=attention_mask.dtype)], + axis=-1, + )[:, :, -1:, :] + + return model_kwargs + + def forward( + self, + input_ids: paddle.Tensor = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Qwen2ForCausalLM + + >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.qwen2( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is together with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class Qwen2ForSequenceClassification(Qwen2PretrainedModel): + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.num_labels = config.num_labels + self.qwen2 = Qwen2Model(config) + self.score = Linear(config.hidden_size, self.num_labels, bias_attr=False) + + def get_input_embeddings(self): + return self.qwen2.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2.embed_tokens = value + + def forward( + self, + input_ids: paddle.Tensor = None, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.qwen2( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = paddle.equal(input_ids, self.config.pad_token_id).astype("int32").argmax(-1) - 1 + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths + else: + sequence_lengths = -1 + + # pooled_logits = logits[paddle.arange(batch_size), sequence_lengths] + pooled_logits = logits.gather_nd(paddle.stack([paddle.arange(logits.shape[0]), sequence_lengths], axis=-1)) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(pooled_logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + elif self.config.problem_type == "multi_label_classification": + loss_fct = nn.BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->Qwen2, LLAMA->QWEN2 +class Qwen2ForTokenClassification(Qwen2PretrainedModel): + def __init__(self, config: Qwen2Config): + super().__init__(config) + self.num_labels = config.num_labels + self.qwen2 = Qwen2Model(config) + if getattr(config, "classifier_dropout", None) is not None: + classifier_dropout = config.classifier_dropout + elif getattr(config, "hidden_dropout", None) is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.score = Linear(config.hidden_size, config.num_labels) + + def get_input_embeddings(self): + return self.qwen2.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2.embed_tokens = value + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: 
Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`paddle.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.qwen2( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output) + logits = self.score(sequence_output) + + loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_labels]), labels.reshape([-1])) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling_pp.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling_pp.py new file mode 100644 index 000000000..549e9e55b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/modeling_pp.py @@ -0,0 +1,289 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
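+# Pipeline-parallel variant of Qwen2: the decoder stack from modeling.py is
+# flattened into a sequence of pipeline stages (embedding, decoder layers,
+# final RMSNorm, LM head) for paddle.distributed.fleet pipeline parallelism.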
+ +import paddle +import paddle.distributed.fleet as fleet +import paddle.nn as nn +from paddle.distributed.fleet.meta_parallel import ( + LayerDesc, + PipelineLayer, + SharedLayerDesc, +) +from paddle.distributed.fleet.utils import recompute + +from ...utils.tools import get_env_device +from ..model_utils import PipelinePretrainedModel +from .modeling import ( + Qwen2Config, + Qwen2DecoderLayer, + Qwen2LMHead, + Qwen2Model, + Qwen2PretrainedModel, + Qwen2PretrainingCriterion, + Qwen2RMSNorm, +) + +__all__ = [ + "Qwen2ForCausalLMPipe", +] + + +def parse_args(args): + if isinstance(args, tuple): + if len(args) == 3: + hidden_states, attention_mask, position_ids = args + elif len(args) == 2: + hidden_states, attention_mask = args + position_ids = None + elif len(args) == 1: + hidden_states = args + attention_mask, position_ids = None, None + else: + hidden_states = args + attention_mask, position_ids = None, None + + if position_ids is not None: + position_ids.stop_gradient = True + + if attention_mask is not None: + attention_mask.stop_gradient = True + + return hidden_states, attention_mask, position_ids + + +def return_args(hidden_states, attention_mask=None, position_ids=None): + ret = (hidden_states,) + + if attention_mask is not None: + ret += (attention_mask.clone(),) + if position_ids is not None: + ret += (position_ids.clone(),) + if len(ret) == 1: + ret = ret[0] + + return ret + + +def get_attr(layer, name): + if getattr(layer, name, None) is not None: + return getattr(layer, name, None) + else: + return get_attr(layer._layer, name) + + +class Qwen2EmbeddingPipe(nn.Layer): + """Extends QWenEmbeddings to forward attention_mask through the pipeline.""" + + def __init__(self, config: Qwen2Config): + super(Qwen2EmbeddingPipe, self).__init__() + self.config = config + self.sequence_parallel = config.sequence_parallel + self.hidden_size = config.hidden_size + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = fleet.meta_parallel.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + + @property + def embedding_weight(self): + return get_attr(self.embed_tokens, "weight") + + def forward(self, args): + """_summary_ + + Args: + input (_type_): _description_ + + Returns: + _type_: _description_ + """ + input_ids, attention_mask, position_ids = parse_args(args) + input_embeds = self.embed_tokens(input_ids) + if self.config.sequence_parallel: + from paddlenlp.transformers import ScatterOp + + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = input_embeds.shape + input_embeds = paddle.reshape_(input_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + input_embeds = ScatterOp.apply(input_embeds) + + batch_size, seq_length = input_ids.shape + + if attention_mask is not None: + attention_mask = Qwen2Model._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), 0, input_embeds.dtype + ) + attention_mask.stop_gradient = True + if get_env_device() == "npu": + attention_mask = attention_mask.astype("bool") + elif get_env_device() == "npu": + attention_mask = paddle.tril(paddle.ones((seq_length, seq_length), dtype="bool")) + attention_mask.stop_gradient = True + + return return_args(input_embeds, attention_mask, position_ids) + + +class 
Qwen2DecoderLayerPipe(Qwen2DecoderLayer): + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + + has_gradient = not hidden_states.stop_gradient + + if self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient: + if attention_mask is not None: + hidden_states = recompute( + super().forward, + hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + use_reentrant=False, + ) + else: + # for pretrain + hidden_states = recompute( + super().forward, + hidden_states, + position_ids=position_ids, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + hidden_states = super().forward(hidden_states, position_ids=position_ids, attention_mask=attention_mask) + + return return_args(hidden_states, attention_mask, position_ids) + + +class Qwen2RMSNormPipe(nn.Layer): + def __init__(self, config): + super().__init__() + self.norm = Qwen2RMSNorm(config) + + def forward(self, args): + hidden_states, attention_mask, position_ids = parse_args(args) + return self.norm(hidden_states) + + +class Qwen2LMHeadPipe(Qwen2LMHead): + def __init__(self, config, transpose_y=False): + super(Qwen2LMHeadPipe, self).__init__(config, transpose_y=transpose_y) + + @property + def embedding_weight(self): + return get_attr(self, "weight") + + +class Qwen2ForCausalLMPipe(PipelinePretrainedModel, PipelineLayer): + """QWenForPretraining adapted for pipeline parallelism. + + The largest change is flattening the QWenModel class so we can express it as a + sequence of layers including embedding, transformer layers, and output. + """ + + config_class = Qwen2Config + + _get_tensor_parallel_mappings = Qwen2PretrainedModel._get_tensor_parallel_mappings + _init_weights = Qwen2PretrainedModel._init_weights + _keys_to_ignore_on_load_unexpected = Qwen2PretrainedModel._keys_to_ignore_on_load_unexpected + + # DONOT Add base_model_prefix !!!! 
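+ # Note: the stage names registered below via add_sequential_layer already
+ # carry the "qwen2." prefix.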
+ + def __init__(self, config: Qwen2Config): + self.config = config + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.recompute_granularity = self.config.recompute_granularity + self.pp_recompute_interval = self.config.pp_recompute_interval + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + if self.recompute_granularity == "full": + assert len(self.no_recompute_layers) == 0, "for pp with full recompute, no_recompute_layers is not support" + + virtual_pp_degree = getattr(self.config, "virtual_pp_degree", 1) + + def get_hcg(): + return fleet.get_hybrid_communicate_group() + + hcg = get_hcg() + tensor_parallel_degree = max(hcg.get_model_parallel_world_size(), 1) + tensor_parallel_rank = max(hcg.get_model_parallel_rank(), 0) + + # TODO: fix tensor_parallel_degree rewrite in here + config.tensor_parallel_degree = tensor_parallel_degree + config.tensor_parallel_rank = tensor_parallel_rank + + if config.tie_word_embeddings: + self.add_sequential_layer( + SharedLayerDesc( + "qwen2_shared_weight", Qwen2EmbeddingPipe, shared_weight_attr="embedding_weight", config=config + ), + "qwen2", + ) + else: + self.add_sequential_layer(LayerDesc(Qwen2EmbeddingPipe, config=config), "qwen2") + + for i in range(config.num_hidden_layers): + self.add_sequential_layer( + LayerDesc(Qwen2DecoderLayerPipe, config=config, layerwise_recompute=i not in self.no_recompute_layers), + f"qwen2.layers.{i}", + ) + self.add_sequential_layer(LayerDesc(Qwen2RMSNormPipe, config=config), "qwen2") + + if config.tie_word_embeddings: + self.add_sequential_layer( + SharedLayerDesc( + "qwen2_shared_weight", + Qwen2LMHeadPipe, + shared_weight_attr="embedding_weight", + config=config, + **{"transpose_y": True}, + ), + "lm_head", + ) + else: + self.add_sequential_layer(LayerDesc(Qwen2LMHeadPipe, config=config), "lm_head") + + recompute_interval = 0 + if self.enable_recompute and self.recompute_granularity == "full": + assert self.config.pp_recompute_interval <= config.num_hidden_layers // ( + virtual_pp_degree * get_hcg().topology().get_dim_size("pipe") + ), "pp recompute interval should smaller than num layers of each pp chunk" + recompute_interval = self.config.pp_recompute_interval + + seg_method = "layer:Qwen2DecoderLayer" + if config.num_hidden_layers % get_hcg().topology().get_dim_size("pipe") != 0: + seg_method = "uniform" + + PipelineLayer.__init__( + self, + layers=self.get_sequential_layers(), + loss_fn=Qwen2PretrainingCriterion(config), + topology=get_hcg().topology(), + seg_method=seg_method, + recompute_interval=recompute_interval, + recompute_ctx={ + "mp_group": get_hcg().get_model_parallel_group(), + "offload": False, + "partition": False, + }, + num_virtual_pipeline_stages=virtual_pp_degree, + ) + # You should call init here, since there is a diamond inheritance problem + self.apply(self._init_weights) + # DON'T init PipelinePretrainedModel + # PipelinePretrainedModel.__init__(self.super(), config=config) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/tokenizer.py new file mode 100644 index 000000000..83a172045 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2/tokenizer.py @@ -0,0 +1,340 @@ +# Copyright (c) 2024 
PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Qwen2.""" + +import json +import os +import unicodedata +from functools import lru_cache +from typing import Optional, Tuple + +import regex as re + +from ...utils.log import logger +from .. import AddedToken, PretrainedTokenizer + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +__all__ = ["Qwen2Tokenizer"] + +MAX_MODEL_INPUT_SIZES = {"__internal_testing__/tiny-random-qwen2": 32768} + +PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class Qwen2Tokenizer(PretrainedTokenizer): + """ + Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import Qwen2Tokenizer + + >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer") + >>> tokenizer("Hello world")["input_ids"] + [9707, 1879] + + >>> tokenizer(" Hello world")["input_ids"] + [21927, 1879] + ``` + This is expected. + + You should not use GPT2Tokenizer instead, because of the different pretokenization rules. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. 
+ errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*): + The beginning of sequence token. Not applicable for this tokenizer. + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): + The token used for padding, for example when batching sequences of different lengths. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not the model should cleanup the spaces that were added when splitting the input text during the + tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces. + split_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the special tokens should be split during the tokenization process. The default behavior is + to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") = + ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<', + '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment. + """ + + resource_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + max_model_input_sizes = MAX_MODEL_INPUT_SIZES + + pretrained_resource_files_map = { + "vocab_file": { + "__internal_testing__/tiny-random-qwen2": "https://bj.bcebos.com/paddlenlp/models/community/qwen2/vocab.json", + }, + } + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token=None, + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", + clean_up_tokenization_spaces=False, + split_special_tokens=False, + **kwargs, + ): + + if unk_token is None: + logger.info("The `unk_token` parameter needs to be defined: we use `eos_token` by default.") + unk_token = eos_token + + # Qwen vocab does not contain control tokens; added tokens need to be special + bos_token = ( + AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(bos_token, str) + else bos_token + ) + eos_token = ( + AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(eos_token, str) + else eos_token + ) + unk_token = ( + AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(unk_token, str) + else unk_token + ) + pad_token = ( + AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False) + if isinstance(pad_token, str) + else pad_token + ) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_merges = [] + with open(merges_file, encoding="utf-8") as merges_handle: + for i, line in enumerate(merges_handle): + line = line.strip() + if (i == 0 and line.startswith("#version:")) or not line: + continue + 
bpe_merges.append(tuple(line.split()))
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        # NOTE: the cache can grow without bound and will get really large for long running processes
+        # (esp. for texts in languages that do not use spaces between words, e.g. Chinese); technically
+        # not a memory leak but appears as one.
+        # GPT2Tokenizer has the same problem, so let's be consistent.
+        self.cache = {}
+
+        self.pat = re.compile(PRETOKENIZE_REGEX)
+
+        if kwargs.get("add_prefix_space", False):
+            logger.warning_once(
+                f"{self.__class__.__name__} does not support `add_prefix_space`, setting it to True has no effect."
+            )
+
+        super().__init__(
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            unk_token=unk_token,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            split_special_tokens=split_special_tokens,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.encoder.get(token, self.added_tokens_encoder.get(token, len(self.encoder)))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (str) using the vocab."""
+        return self.decoder.get(index, self.added_tokens_decoder.get(index, self.unk_token))
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) into a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    def _decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = False,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
+        # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
+        return super()._decode(
+            token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+            **kwargs,
+        )
+
+    def save_vocabulary(self, save_directory:
str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, **kwargs): + text = unicodedata.normalize("NFC", text) + return (text, kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/__init__.py new file mode 100644 index 000000000..2f2acfa9b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..qwen2.tokenizer import * +from .configuration import * +from .modeling import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/configuration.py new file mode 100644 index 000000000..baf7a2551 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/configuration.py @@ -0,0 +1,186 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Qwen2Moe model configuration""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "Qwen2MoeConfig", +] + + +class Qwen2MoeConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a + Qwen2Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of + Qwen1.5-MoE-A2.7B" [Qwen/Qwen1.5-MoE-A2.7B"](https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B"). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen2Moe model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen2MoeModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 5632): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 16): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + max_window_layers (`int`, *optional*, defaults to 28): + The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention. 
+ attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 1408): + Intermediate size of the routed expert. + shared_expert_intermediate_size (`int`, *optional*, defaults to 5632): + Intermediate size of the shared expert. + num_experts_per_tok (`int`, *optional*, defaults to 4): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 60): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabeling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + + ```python + >>> from paddlenlp.transformers import Qwen2MoeModel, Qwen2MoeConfig + + >>> # Initializing a Qwen2Moe style configuration + >>> configuration = Qwen2MoeConfig() + + >>> # Initializing a model from the Qwen1.5-MoE-A2.7B" style configuration + >>> model = Qwen2MoeModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen2_moe" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=151936, + hidden_size=2048, + intermediate_size=5632, + num_hidden_layers=24, + num_attention_heads=16, + num_key_value_heads=16, + hidden_act="silu", + max_position_embeddings=8192, + seq_length=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + attention_dropout=0.0, + rope_theta=1000000.0, + pad_token_id=0, + bos_token_id=151643, + eos_token_id=151643, + tie_word_embeddings=False, + use_sliding_window=False, + sliding_window=32768, + max_window_layers=28, + decoder_sparse_step=1, + moe_intermediate_size=1408, + shared_expert_intermediate_size=5632, + num_experts_per_tok=4, + num_experts=60, + norm_topk_prob=False, + output_router_logits=False, + router_aux_loss_coef=0.001, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.seq_length = seq_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.shared_expert_intermediate_size = shared_expert_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + super().__init__( + 
pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/modeling.py new file mode 100644 index 000000000..ee6f56b5b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/qwen2_moe/modeling.py @@ -0,0 +1,1556 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Paddle Qwen2Moe model.""" +from __future__ import annotations + +import math +import warnings +from functools import partial +from typing import Optional, Tuple + +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute + +from ...utils.log import logger +from .. import linear_utils +from ..activations import ACT2FN +from ..conversion_utils import StateDictNameMapping, init_name_mappings +from ..model_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPast +from ..model_utils import PretrainedModel, register_base_model +from .configuration import Qwen2MoeConfig + +try: + from paddle.incubate.nn.functional import fused_rotary_position_embedding +except ImportError: + fused_rotary_position_embedding = None + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + GatherOp, + ScatterOp, + mark_as_sequence_parallel_parameter, + ) +except ImportError: + pass + +try: + from paddle.nn.functional.flash_attention import flash_attention +except: + flash_attention = None + +__all__ = [ + "Qwen2MoeModel", + "Qwen2MoePretrainedModel", + "Qwen2MoeForCausalLM", + "Qwen2MoePretrainingCriterion", +] + + +def load_balancing_loss_func(gate_logits, num_experts, top_k=2, attention_mask=None): + """ + Computes auxiliary load balancing loss as in Switch Transformer - implemented in Paddle. + See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss + function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between + experts is too unbalanced. + Args: + gate_logits (Union[`paddle.Tensor`, Tuple[paddle.Tensor]): + Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of + shape [batch_size X sequence_length, num_experts]. + num_experts (`int`): + Number of experts. + top_k (`int`): + Number of top k experts to be considered for the loss computation. + attention_mask (`paddle.Tensor`, None): + The attention_mask used in forward function + shape [batch_size X sequence_length] if not None. 
+ Returns: + The auxiliary loss. + """ + if gate_logits is None or not isinstance(gate_logits, tuple): + return 0 + + if isinstance(gate_logits, tuple): + concatenated_gate_logits = paddle.concat( + gate_logits, axis=0 + ) # [num_hidden_layers X batch_size X sequence_length, num_experts] + + routing_weights = F.softmax(concatenated_gate_logits, axis=-1) + _, selected_experts = paddle.topk(routing_weights, top_k, axis=-1) + expert_mask = F.one_hot( + selected_experts, num_classes=num_experts + ) # [num_hidden_layers X batch_size X sequence_length, top_k, num_experts] + + if attention_mask is None or len(attention_mask.shape) == 4: + # Only intokens strategy has 4-D attention_mask, we currently do not support excluding padding tokens. + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.mean(expert_mask.astype("float32"), axis=0) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.mean(routing_weights, axis=0) + else: + # Exclude the load balancing loss of padding tokens. + if len(attention_mask.shape) == 2: + batch_size, sequence_length = attention_mask.shape + num_hidden_layers = concatenated_gate_logits.shape[0] // (batch_size * sequence_length) + + # Compute the mask that masks all padding tokens as 0 with the same shape of expert_mask + expert_attention_mask = ( + attention_mask[None, :, :, None, None] + .expand((num_hidden_layers, batch_size, sequence_length, top_k, num_experts)) + .reshape([-1, top_k, num_experts]) + ) # [num_hidden_layers * batch_size * sequence_length, top_k, num_experts] + + # Compute the percentage of tokens routed to each experts + tokens_per_expert = paddle.sum(expert_mask.astype("float32") * expert_attention_mask, axis=0) / paddle.sum( + expert_attention_mask, axis=0 + ) + + # Compute the mask that masks all padding tokens as 0 with the same shape of tokens_per_expert + router_per_expert_attention_mask = ( + attention_mask[None, :, :, None] + .expand((num_hidden_layers, batch_size, sequence_length, num_experts)) + .reshape([-1, num_experts]) + ) + + # Compute the average probability of routing to these experts + router_prob_per_expert = paddle.sum( + routing_weights * router_per_expert_attention_mask, axis=0 + ) / paddle.sum(router_per_expert_attention_mask, axis=0) + + overall_loss = paddle.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0)) + return overall_loss * num_experts + + +def get_triangle_upper_mask(x, mask=None): + if mask is not None: + return mask + # [bsz, n_head, q_len, kv_seq_len] + shape = x.shape + # [bsz, 1, q_len, kv_seq_len] + shape[1] = 1 + mask = paddle.full(shape, paddle.finfo(x.dtype).min, dtype=x.dtype) + mask = paddle.triu(mask, diagonal=1) + mask.stop_gradient = True + return mask + + +def assign_kv_heads(num_kv_heads: int, num_gpus: int): + # Initialize the assignment list + """ + Assign kv heads to different GPUs in the Tensor Parallel Setup + + Examples: + assign_kv_heads(num_kv_heads=1, num_gpus=2): [[0], [0]] + assign_kv_heads(num_kv_heads=2, num_gpus=2): [[0], [1]] + assign_kv_heads(num_kv_heads=4, num_gpus=2): [[0,1], [2,3]] + assign_kv_heads(num_kv_heads=1, num_gpus=4): [[0],[0],[0],[0]] + assign_kv_heads(num_kv_heads=2, num_gpus=4): [[0],[0],[1],[1]] + assign_kv_heads(num_kv_heads=4, num_gpus=4): [[0],[1],[2],[3]] + """ + assignment_list = [[] for _ in range(num_gpus)] + # Case 1: more heads than cards + if num_kv_heads > num_gpus: + num_heads_per_card = num_kv_heads // num_gpus + for i in range(num_gpus): + for j in 
range(num_heads_per_card): + assignment_list[i].append(i * num_heads_per_card + j) + # Case 2: more cards than heads. each card get only 1 head. + else: + num_card_per_heads = num_gpus // num_kv_heads + for i in range(num_kv_heads): + for j in range(num_card_per_heads): + assignment_list[i * num_card_per_heads + j].append(i) + return assignment_list + + +def parallel_matmul(x: Tensor, y: Tensor, tensor_parallel_output=True): + is_fleet_init = True + tensor_parallel_degree = 1 + try: + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + tensor_parallel_degree = hcg.get_model_parallel_world_size() + except: + is_fleet_init = False + + if paddle.in_dynamic_mode(): + y_is_distributed = y.is_distributed + else: + y_is_distributed = tensor_parallel_degree > 1 + + if is_fleet_init and tensor_parallel_degree > 1 and y_is_distributed: + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + input_parallel = paddle.distributed.collective._c_identity(x, group=model_parallel_group) + logits = paddle.matmul(input_parallel, y, transpose_y=False) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=model_parallel_group) + + else: + logits = paddle.matmul(x, y, transpose_y=False) + return logits + + +def scaled_dot_product_attention( + query_states, + config, + key_states, + value_states, + attention_mask, + output_attentions, + training=True, + sequence_parallel=False, +): + bsz, q_len, num_heads, head_dim = query_states.shape + _, kv_seq_len, _, _ = value_states.shape + + if config.use_flash_attention and flash_attention: + # Paddle Flash Attention input [ bz, seqlen, nhead, head_dim] + # Torch Flash Attention input [ bz, nhead, seqlen, head_dim] + + version = paddle.version.full_version + if version != "0.0.0" and version <= "2.5.2": + attn_output, attn_weights = flash_attention( + query_states, + key_states, + value_states, + causal=True, + return_softmax=output_attentions, + ) + else: + attn_output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + dropout_p=config.attention_dropout if training else 0.0, + training=training, + ) + attn_weights = None + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + else: + # [ bz, seqlen, nhead, head_dim] -> [bs, nhead, seq_len, head_dim] + query_states = paddle.transpose(query_states, [0, 2, 1, 3]) + # merge with the next tranpose + key_states = paddle.transpose(key_states, [0, 2, 1, 3]) + value_states = paddle.transpose(value_states, [0, 2, 1, 3]) + + # matmul and devide by sqrt(head_dim) + attn_weights = paddle.matmul(query_states / math.sqrt(head_dim), key_states.transpose([0, 1, 3, 2])) + + if attn_weights.shape != [bsz, num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of shape {(bsz, num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + + if attention_mask is None: + attention_mask = get_triangle_upper_mask(attn_weights) + attention_mask = attention_mask.reshape([bsz, 1, q_len, kv_seq_len]) + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of shape {(bsz, 1, q_len, kv_seq_len)}, but is 
{attention_mask.shape}" + ) + + attn_weights = attn_weights + attention_mask + if not paddle.in_dynamic_mode(): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + else: + with paddle.amp.auto_cast(False): + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query_states.dtype) + + attn_weights = F.dropout(attn_weights, p=config.attention_dropout, training=training) + + attn_output = paddle.matmul(attn_weights, value_states) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + if sequence_parallel: + attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads]) + else: + attn_output = attn_output.reshape([bsz, q_len, head_dim * num_heads]) + return (attn_output, attn_weights) if output_attentions else attn_output + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def is_casual_mask(attention_mask): + """ + Upper triangular of attention_mask equals to attention_mask is casual + """ + return (paddle.triu(attention_mask) == attention_mask).all().item() + + +def _make_causal_mask(input_ids_shape, past_key_values_length): + """ + Make causal mask used for self-attention + """ + batch_size, target_length = input_ids_shape # target_length: seq_len + + mask = paddle.tril(paddle.ones((target_length, target_length), dtype="bool")) + + if past_key_values_length > 0: + # [tgt_len, tgt_len + past_len] + mask = paddle.concat([paddle.ones([target_length, past_key_values_length], dtype="bool"), mask], axis=-1) + + # [bs, 1, tgt_len, tgt_len + past_len] + return mask[None, None, :, :].expand([batch_size, 1, target_length, target_length + past_key_values_length]) + + +def _expand_2d_mask(mask, dtype, tgt_length): + """ + Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. 
+ """ + batch_size, src_length = mask.shape[0], mask.shape[-1] + tgt_length = tgt_length if tgt_length is not None else src_length + + mask = mask[:, None, None, :].astype("bool") + mask.stop_gradient = True + expanded_mask = mask.expand([batch_size, 1, tgt_length, src_length]) + + return expanded_mask + + +class Qwen2MoeRMSNorm(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + if paddle.in_dynamic_mode(): + with paddle.amp.auto_cast(False): + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + else: + hidden_states = hidden_states.astype("float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + + if self.weight.dtype in [paddle.float16, paddle.bfloat16]: + hidden_states = paddle.cast(hidden_states, self.weight.dtype) + return hidden_states * self.weight + + +class Qwen2MoeRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype="float32") + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, :, None, :] + self.sin_cached = emb.sin()[None, :, None, :] + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len) + cos = self.cos_cached[:, :seq_len, :, :] + sin = self.sin_cached[:, :seq_len, :, :] + return ( + cos.cast(x.dtype) if cos.dtype != x.dtype else cos, + sin.cast(x.dtype) if sin.dtype != x.dtype else sin, + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat([-x2, x1], axis=-1) # shape is the same as x + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + if position_ids is None: + # Note: Only for Qwen2MoeForCausalLMPipe model pretraining + cos = cos[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + sin = sin[:, : q.shape[1], :, :] # [bs, seq_len, 1, dim] + else: + cos = cos.squeeze(axis=[0, 2]) # [seq_len, dim] + sin = sin.squeeze(axis=[0, 2]) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + sin = sin[position_ids].unsqueeze(2) # [bs, seq_len, 1, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# 
Modified from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Qwen2Moe +class Qwen2MoeMLP(nn.Layer): + def __init__(self, config: Qwen2MoeConfig, is_shared=False): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = ( + config.moe_intermediate_size if not is_shared else config.shared_expert_intermediate_size + ) + self.tensor_parallel_degree = config.tensor_parallel_degree + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w1 + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) # w3 + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) # w2 + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +def repeat_kv(hidden_states: paddle.Tensor, n_rep: int) -> paddle.Tensor: + """ + This is the equivalent of paddle.repeat_interleave(hidden_states, n_rep, axis=1). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + + hidden_states = hidden_states.unsqueeze(-2).tile([1, 1, 1, n_rep, 1]) + return hidden_states.reshape([batch, slen, num_key_value_heads * n_rep, head_dim]) + + +class Qwen2MoeAttention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer + and "Generating Long Sequences with Sparse Transformers". 
+ """ + + def __init__(self, config: Qwen2MoeConfig, layerwise_recompute: bool = True): + super().__init__() + + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + + self.head_dim = self.hidden_size // config.num_attention_heads + + self.num_key_value_heads = config.num_key_value_heads + assert config.num_attention_heads // config.num_key_value_heads + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.gqa_or_mqa = config.num_attention_heads != config.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.attention_dropout = config.attention_dropout + + self.seq_length = config.seq_length + self.sequence_parallel = config.sequence_parallel + + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_key_value_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + + self.use_fused_rope = config.use_fused_rope + if self.use_fused_rope: + if "gpu" not in paddle.device.get_device() or fused_rotary_position_embedding is None: + warnings.warn( + "Enable fuse rope in the config, but fuse rope is not available. " + "Will disable fuse rope. Try using latest gpu version of Paddle." 
+ ) + self.use_fused_rope = False + + if config.sequence_parallel: + ColumnParallelLinear = linear_utils.ColumnSequenceParallelLinear + RowParallelLinear = linear_utils.RowSequenceParallelLinear + else: + ColumnParallelLinear = linear_utils.ColumnParallelLinear + RowParallelLinear = linear_utils.RowParallelLinear + + if config.tensor_parallel_degree > 1: + self.q_proj = ColumnParallelLinear(self.hidden_size, self.hidden_size, has_bias=True, gather_output=False) + self.k_proj = ColumnParallelLinear( + self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, self.config.num_key_value_heads * self.head_dim, has_bias=True, gather_output=False + ) + self.o_proj = RowParallelLinear(self.hidden_size, self.hidden_size, has_bias=False, input_is_parallel=True) + else: + self.q_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=True) + self.k_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=True) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=False) + + self.rotary_emb = Qwen2MoeRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + + def forward( + self, + hidden_states, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + # [bs, seq_len, num_head * head_dim] -> [seq_len / n, bs, num_head * head_dim] (n is model parallelism) + + batch_size, seq_len, _ = hidden_states.shape + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.sequence_parallel: + target_query_shape = [-1, self.seq_length, self.num_heads, self.head_dim] + target_key_value_shape = [-1, self.seq_length, self.num_key_value_heads, self.head_dim] + else: + target_query_shape = [0, 0, self.num_heads, self.head_dim] + target_key_value_shape = [0, 0, self.num_key_value_heads, self.head_dim] + query_states = query_states.reshape(shape=target_query_shape) + key_states = key_states.reshape(shape=target_key_value_shape) + value_states = value_states.reshape(shape=target_key_value_shape) + + kv_seq_len = key_states.shape[-3] + + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-3] + + if self.use_fused_rope: + assert past_key_value is None, "fuse rotary not support cache kv for now" + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states, _ = fused_rotary_position_embedding( + query_states, + key_states, + v=None, + sin=sin, + cos=cos, + position_ids=position_ids, + use_neox_rotary_style=False, + ) + else: + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + # [bs, seq_len, num_head, head_dim] + if past_key_value is not None: + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + past_key_value = (key_states, value_states) if use_cache 
else None + + # TODO(wj-Mcat): use broadcast strategy when n_kv_heads = 1 + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + has_gradient = not (query_states.stop_gradient and key_states.stop_gradient and value_states.stop_gradient) + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "core_attn" + ): + outputs = recompute( + scaled_dot_product_attention, + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = scaled_dot_product_attention( + query_states, + self.config, + key_states, + value_states, + attention_mask, + output_attentions, + self.training, + self.sequence_parallel, + ) + if output_attentions: + attn_output, attn_weights = outputs + else: + attn_output = outputs + + # if sequence_parallel is true, out shape are [q_len / n, bs, num_head * head_dim] + # else their shape are [bs, q_len, num_head * head_dim], n is mp parallelism. + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_weights,) + + if use_cache: + outputs += (past_key_value,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class Qwen2MoeSparseMoEBlock(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): + super().__init__() + self.num_experts = config.num_experts + self.top_k = config.num_experts_per_tok + self.norm_topk_prob = config.norm_topk_prob + + self.gate = nn.Linear(config.hidden_size, self.num_experts, bias_attr=False) + self.experts = nn.LayerList([Qwen2MoeMLP(config) for _ in range(self.num_experts)]) + + self.shared_expert = Qwen2MoeMLP(config, is_shared=True) + self.shared_expert_gate = nn.Linear(config.hidden_size, 1, bias_attr=False) + + def forward(self, hidden_states): + batch_size, seq_len, hidden_dim = hidden_states.shape + hidden_states = hidden_states.reshape([-1, hidden_dim]) + # router_logits: [batch_size * seq_len, num_experts] + router_logits = self.gate(hidden_states) + + with paddle.amp.auto_cast(False): + routing_weights = F.softmax(router_logits.astype("float32"), axis=1) + routing_weights, selected_experts = paddle.topk(routing_weights, self.top_k, axis=-1) + if self.norm_topk_prob: # Note: Mixtral is set norm as default, Qwen2Moe is set to no norm + routing_weights /= routing_weights.sum(axis=-1, keepdim=True) + # we cast back to input dtype + routing_weights = routing_weights.astype(hidden_states.dtype) + + final_hidden_states = paddle.zeros( + [batch_size * seq_len, hidden_dim], + dtype=hidden_states.dtype, + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be sollicitated. + # shape: [num_experts, top_k, batch_size * seq_len] + expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).transpose([2, 1, 0]) + + # Loop over all available experts in the model and perform the computation on each expert. 
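+        # expert_mask[expert_id] is a [top_k, num_tokens] boolean map for this expert: paddle.where
+        # yields the (top_k slot, token index) pairs routed to it, paddle.gather pulls out those token
+        # hidden states, and paddle.index_add_ scatters the routing-weighted expert outputs back into
+        # final_hidden_states at the same token positions along axis 0.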
+ for expert_id in range(self.num_experts): + expert_layer = self.experts[expert_id] + idx, top_x = paddle.where(expert_mask[expert_id]) + + if top_x.shape[0] == 0: + continue + + current_state = paddle.gather(hidden_states, top_x.squeeze()) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx] + + top_x = top_x.squeeze() + if top_x.shape == []: + top_x = paddle.to_tensor([top_x.item()]) + final_hidden_states = paddle.index_add_( + final_hidden_states, top_x, 0, current_hidden_states.astype(hidden_states.dtype) + ) + + shared_expert_output = self.shared_expert(hidden_states) + shared_expert_output = F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_expert_output + + final_hidden_states = final_hidden_states + shared_expert_output + + final_hidden_states = final_hidden_states.reshape([batch_size, seq_len, hidden_dim]) + return final_hidden_states, router_logits + + +class Qwen2MoeDecoderLayer(nn.Layer): + def __init__(self, config: Qwen2MoeConfig, layerwise_recompute: bool = False): + super().__init__() + self.config = config + + self.self_attn = Qwen2MoeAttention(config, layerwise_recompute) + + if config.num_experts > 0: + self.mlp = Qwen2MoeSparseMoEBlock(config) + else: + # num_experts == 0 or this layer is not sparse layer + self.mlp = Qwen2MoeMLP(config) + + self.input_layernorm = Qwen2MoeRMSNorm(config) + self.post_attention_layernorm = Qwen2MoeRMSNorm(config) + + self.sequence_parallel = config.sequence_parallel + # Note that we will actually perform a recompute only if both enable_recompute and layerwise_recompute are set to True + # Enable_recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + self.layerwise_recompute = layerwise_recompute + self.recompute_granularity = config.recompute_granularity + + def forward( + self, + hidden_states: paddle.Tensor, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(paddle.Tensor)`, *optional*): cached past key and value projection states + """ + + # [bs * seq_len, embed_dim] -> [seq_len * bs / n, embed_dim] (sequence_parallel) + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and self.layerwise_recompute + and has_gradient + and self.recompute_granularity == "full_attn" + ): + outputs = recompute( + self.self_attn, + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + outputs = self.self_attn( + hidden_states, + position_ids, + past_key_value, + attention_mask, + output_attentions, + use_cache, + ) + + if type(outputs) is tuple: + hidden_states = outputs[0] + else: + hidden_states = outputs + + if output_attentions: + self_attn_weights = outputs[1] + + if use_cache: + present_key_value = outputs[2 if output_attentions else 1] + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + if isinstance(hidden_states, tuple): + hidden_states, router_logits = hidden_states + else: + router_logits = None + + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + if output_router_logits: + outputs += (router_logits,) + + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + + return outputs + + +class Qwen2MoePretrainedModel(PretrainedModel): + config_class = Qwen2MoeConfig + base_model_prefix = "qwen2_moe" + _keys_to_ignore_on_load_unexpected = [r"self_attn.rotary_emb.inv_freq"] + + @classmethod + def _get_name_mappings(cls, config: Qwen2MoeConfig) -> list[StateDictNameMapping]: + mappings: list[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.q_proj.bias", None], + [f"layers.{layer_index}.self_attn.k_proj.bias", None], + [f"layers.{layer_index}.self_attn.v_proj.bias", None], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + for expert_idx in range(config.num_experts): + expert_mappings = [ + [f"layers.{layer_index}.mlp.experts.{expert_idx}.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.experts.{expert_idx}.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.experts.{expert_idx}.up_proj.weight", None, "transpose"], + ] + model_mappings.extend(expert_mappings) + model_mappings.append([f"layers.{layer_index}.mlp.gate.weight", None, "transpose"]) + + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert.gate_proj.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert.down_proj.weight", None, "transpose"]) + 
model_mappings.append([f"layers.{layer_index}.mlp.shared_expert.up_proj.weight", None, "transpose"]) + model_mappings.append([f"layers.{layer_index}.mlp.shared_expert_gate.weight", None, "transpose"]) + + init_name_mappings(mappings=model_mappings) + # base-model prefix "Qwen2MoeModel" + if "Qwen2MoeModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "qwen2_moe." + mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: Qwen2MoeConfig, is_split=True): + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers, num_experts): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + + # Column Linear + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. + if config.num_key_value_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + # Add tp split for expert params. + base_actions = { + "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), + } + for key, action in base_actions.items(): + for i in range(num_layers): + newkey = key.replace("layers.0.", f"layers.{i}.") + for j in range(num_experts): + newkey2 = newkey.replace("experts.0.", f"experts.{j}.") + final_actions[newkey2] = action + + # Add tp split for shared expert params. + base_actions = { + "layers.0.mlp.shared_expert.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.up_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.shared_expert.down_proj.weight": partial(fn, is_column=False), + } + for key, action in base_actions.items(): + if "layers.0." 
in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, config.num_experts) + + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + Qwen2MoeLMHead, + linear_utils.ColumnSequenceParallelLinear, + linear_utils.RowSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.qwen2_moe.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.qwen2_moe.config.initializer_range, + shape=layer.weight.shape, + ) + ) + if hasattr(layer, "bias") and isinstance(layer.bias, paddle.Tensor): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + # Layer.apply is DFS https://github.com/PaddlePaddle/Paddle/blob/a6f5021fcc58b21f4414bae6bf4731ef6971582c/python/paddle/nn/layer/layers.py#L527-L530 + # sublayer is init first + # scale RowParallelLinear weight + with paddle.no_grad(): + if isinstance(layer, Qwen2MoeMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, Qwen2MoeAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + +@register_base_model +class Qwen2MoeModel(Qwen2MoePretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Qwen2MoeDecoderLayer`] + Args: + config: Qwen2MoeConfig + """ + + def __init__(self, config: Qwen2MoeConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.sequence_parallel = config.sequence_parallel + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + + # Recompute defaults to False and is controlled by Trainer + self.enable_recompute = False + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + self.embed_tokens = mpu.VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [ + Qwen2MoeDecoderLayer(config, layerwise_recompute=layer_idx not in self.no_recompute_layers) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = Qwen2MoeRMSNorm(config) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @staticmethod + def _prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length, dtype): + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if len(attention_mask.shape) == 2: + expanded_attn_mask = _expand_2d_mask(attention_mask, dtype, tgt_length=input_shape[-1]) + # For decoding phase in generation, seq_length = 1, we don't need to add causal mask + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + expanded_attn_mask = expanded_attn_mask & combined_attention_mask + # [bsz, seq_len, seq_len] -> [bsz, 1, seq_len, seq_len] + elif len(attention_mask.shape) == 3: + expanded_attn_mask = attention_mask.unsqueeze(1).astype("bool") + # if attention_mask is already 4-D, do nothing + else: + expanded_attn_mask = attention_mask + else: + expanded_attn_mask = _make_causal_mask( + input_shape, + past_key_values_length=past_key_values_length, + ) + # Convert bool attention_mask to float attention mask, which will be added to attention_scores later + expanded_attn_mask = paddle.where(expanded_attn_mask, 0.0, paddle.finfo(dtype).min).astype(dtype) + return expanded_attn_mask + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + output_router_logits: bool, + past_key_value: Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + output_router_logits: Optional[bool] = None, + return_dict=False, + **kwargs, + ): + if self.sequence_parallel and 
use_cache: + raise ValueError("We currently only support sequence parallel without cache.") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.layers)) + # NOTE: to make cache can be clear in-time + past_key_values = list(past_key_values) + + seq_length_with_past = seq_length + cache_length = 0 + if past_key_values[0] is not None: + cache_length = past_key_values[0][0].shape[1] + seq_length_with_past += cache_length + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self.sequence_parallel: + # [bs, seq_len, num_head * head_dim] -> [bs * seq_len, num_head * head_dim] + bs, seq_len, hidden_size = inputs_embeds.shape + inputs_embeds = paddle.reshape_(inputs_embeds, [bs * seq_len, hidden_size]) + # [seq_len * bs / n, num_head * head_dim] (n is mp parallelism) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + # embed positions + if attention_mask is None: + # [bs, seq_len] + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + + if position_ids is None: + position_ids = paddle.arange(seq_length, dtype="int64").expand((batch_size, seq_length)) + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), cache_length, inputs_embeds.dtype + ) # [bs, 1, seq_len, seq_len] + if self.config.use_flash_attention: + is_casual = is_casual_mask(attention_mask) + if is_casual: + attention_mask = None + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_router_logits = () if output_router_logits else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_ids, + attention_mask, + output_attentions, + output_router_logits, + past_key_value, + use_cache, + ) + + # NOTE: clear outdate cache after it has been used for memory saving + 
past_key_value = past_key_values[idx] = None + if type(layer_outputs) is tuple: + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_router_logits: + all_router_logits += (layer_outputs[-1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits] + if v is not None + ) + return MoEModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + router_logits=all_router_logits, + ) + + +class Qwen2MoePretrainingCriterion(nn.Layer): + """ + Criterion for Mixtral. + It calculates the final loss. + """ + + def __init__(self, config: Qwen2MoeConfig): + super(Qwen2MoePretrainingCriterion, self).__init__() + self.ignore_index = getattr(config, "ignore_index", -100) + self.config = config + self.enable_parallel_cross_entropy = config.tensor_parallel_degree > 1 and config.tensor_parallel_output + + if self.enable_parallel_cross_entropy: # and False: # and lm_head is distributed + self.loss_func = mpu.ParallelCrossEntropy(ignore_index=self.ignore_index) + else: + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + def forward(self, prediction_scores, masked_lm_labels): + if self.enable_parallel_cross_entropy: + if prediction_scores.shape[-1] == self.config.vocab_size: + warnings.warn( + f"enable_parallel_cross_entropy, the vocab_size should be splited: {prediction_scores.shape[-1]}, {self.config.vocab_size}" + ) + self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none", ignore_index=self.ignore_index) + + with paddle.amp.auto_cast(False): + masked_lm_loss = self.loss_func(prediction_scores.astype("float32"), masked_lm_labels.unsqueeze(2)) + + # skip ignore_index which loss == 0 + masked_lm_loss = masked_lm_loss[masked_lm_loss > 0] + loss = paddle.mean(masked_lm_loss) + + return loss + + +class Qwen2MoeLMHead(nn.Layer): + def __init__(self, config: Qwen2MoeConfig): + super(Qwen2MoeLMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1 and config.vocab_size % config.tensor_parallel_degree == 0: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=[config.hidden_size, vocab_size], + dtype=paddle.get_default_dtype(), + ) + # Must set distributed attr for Tensor Parallel ! 
+ self.weight.is_distributed = True if (vocab_size != config.vocab_size) else False + if self.weight.is_distributed: + self.weight.split_axis = 1 + + def forward(self, hidden_states, tensor_parallel_output=None): + if self.config.sequence_parallel: + hidden_states = GatherOp.apply(hidden_states) + seq_length = self.config.seq_length + hidden_states = paddle.reshape_(hidden_states, [-1, seq_length, self.config.hidden_size]) + + if tensor_parallel_output is None: + tensor_parallel_output = self.config.tensor_parallel_output + + logits = parallel_matmul(hidden_states, self.weight, tensor_parallel_output=tensor_parallel_output) + return logits + + +class Qwen2MoeForCausalLM(Qwen2MoePretrainedModel): + enable_to_static_method = True + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: Qwen2MoeConfig): + super().__init__(config) + self.config = config + + self.qwen2_moe = Qwen2MoeModel(config) + self.lm_head = Qwen2MoeLMHead(config) + self.criterion = Qwen2MoePretrainingCriterion(config) + self.router_aux_loss_coef = config.router_aux_loss_coef + self.num_experts = config.num_experts + self.num_experts_per_tok = config.num_experts_per_tok + # Initialize weights and apply final processing + + if config.sliding_window: + self.config.sliding_window = False + logger.warning("We do not support sliding window attention for now.") + + def get_input_embeddings(self): + return self.qwen2_moe.embed_tokens + + def set_input_embeddings(self, value): + self.qwen2_moe.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.qwen2_moe = decoder + + def get_decoder(self): + return self.qwen2_moe + + def prepare_inputs_for_generation( + self, + input_ids, + use_cache=False, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + output_router_logits=False, + **kwargs + ): + batch_size, seq_length = input_ids.shape + position_ids = kwargs.get("position_ids", paddle.arange(seq_length).expand((batch_size, seq_length))) + attention_mask = kwargs.get("attention_mask", None) + if past_key_values: + input_ids = input_ids[:, -1].unsqueeze(axis=-1) + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "output_router_logits": output_router_logits, + } + ) + return model_inputs + + def _get_model_inputs_spec(self, dtype: str): + return { + "input_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "attention_mask": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + "position_ids": paddle.static.InputSpec(shape=[None, None], dtype="int64"), + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # update cache + if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], paddle.Tensor): + model_kwargs["past_key_values"] = outputs[1] + + if isinstance(outputs, MoECausalLMOutputWithPast) and "past_key_values" in outputs: + model_kwargs["past_key_values"] = outputs.past_key_values + + # update position_ids + if "position_ids" in 
model_kwargs and model_kwargs["position_ids"] is not None: + position_ids = model_kwargs["position_ids"] + model_kwargs["position_ids"] = paddle.concat([position_ids, position_ids[..., -1:] + 1], axis=-1) + + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype=attention_mask.dtype)], axis=-1 + ) + + return model_kwargs + + def forward( + self, + input_ids=None, + position_ids=None, + attention_mask=None, + inputs_embeds=None, + labels=None, + use_cache=False, + past_key_values=None, + output_attentions=None, + output_hidden_states=None, + output_router_logits: Optional[bool] = None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_router_logits = ( + output_router_logits if output_router_logits is not None else self.config.output_router_logits + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.qwen2_moe( + input_ids=input_ids, # [bs, seq_len] + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + output_router_logits=output_router_logits, + return_dict=return_dict, + ) + + hidden_states = outputs[0] # [bs, seq_len, dim] + + # if labels is None,means we need full output, instead of tensor_parallel_output + # tensor_parallel_output is togather with ParallelCrossEntropy + tensor_parallel_output = ( + self.config.tensor_parallel_output and labels is not None and self.config.tensor_parallel_degree > 1 + ) + + logits = self.lm_head(hidden_states, tensor_parallel_output=tensor_parallel_output) + + loss = None + if labels is not None: + loss = self.criterion(logits, labels) + + aux_loss = None + if output_router_logits: + aux_loss = load_balancing_loss_func( + outputs.router_logits if return_dict else outputs[-1], + self.num_experts, + self.num_experts_per_tok, + attention_mask, + ) + if labels is not None: + loss += self.router_aux_loss_coef * aux_loss + + if not return_dict: + output = (logits,) + outputs[1:] + if output_router_logits: + output = (aux_loss,) + output + return (loss,) + output if loss is not None else output + + return MoECausalLMOutputWithPast( + loss=loss, + aux_loss=aux_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + router_logits=outputs.router_logits, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/__init__.py new file mode 100644 index 000000000..3b5b28f31 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/configuration.py new file mode 100644 index 000000000..b804c2cef --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/configuration.py @@ -0,0 +1,310 @@ +# coding=utf-8 +# Copyright 2020 The Trax Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Reformer model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["REFORMER_PRETRAINED_INIT_CONFIGURATION", "ReformerConfig", "REFORMER_PRETRAINED_RESOURCE_FILES_MAP"] + +REFORMER_PRETRAINED_INIT_CONFIGURATION = { + "reformer-enwik8": { + "tie_word_embeddings": False, + "is_decoder": True, + "chunk_size_feed_forward": 0, + "pad_token_id": 0, + "hash_seed": None, + "vocab_size": 258, + "attention_head_size": 128, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hashes": 4, + "num_hidden_layers": 12, + "num_buckets": 512, + "lsh_attn_chunk_length": 256, + "local_attn_chunk_length": 128, + "lsh_num_chunks_after": 0, + "lsh_num_chunks_before": 1, + "local_num_chunks_after": 0, + "local_num_chunks_before": 1, + "hidden_act": "relu", + "feed_forward_size": 4096, + "hidden_dropout_prob": 0.2, + "lsh_attention_probs_dropout_prob": 0.1, + "local_attention_probs_dropout_prob": 0.2, + "max_position_embeddings": 65536, + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "axial_pos_embds": True, + "axial_pos_shape": [128, 512], + "axial_pos_embds_dim": [256, 768], + "axial_norm_std": 1.0, + "chunk_size_lm_head": 0, + "attn_layers": [ + "local", + "local", + "lsh", + "local", + "local", + "local", + "lsh", + "local", + "local", + "local", + "lsh", + "local", + ], + }, + "reformer-crime-and-punishment": { + "tie_word_embeddings": False, + "is_decoder": True, + "chunk_size_feed_forward": 0, + "pad_token_id": 0, + "num_hidden_layers": 6, + "hash_seed": None, + "vocab_size": 320, + "attention_head_size": 64, + "hidden_size": 256, + "num_attention_heads": 2, + "num_hashes": 1, + "num_buckets": [64, 128], + "lsh_attn_chunk_length": 64, + "local_attn_chunk_length": 64, + "lsh_num_chunks_after": 0, + "lsh_num_chunks_before": 1, + "local_num_chunks_after": 0, + "local_num_chunks_before": 1, + "hidden_act": "relu", + "feed_forward_size": 512, + "hidden_dropout_prob": 0.05, + 
"lsh_attention_probs_dropout_prob": 0.0, + "local_attention_probs_dropout_prob": 0.05, + "max_position_embeddings": 524288, + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "axial_pos_embds": True, + "axial_pos_shape": [512, 1024], + "axial_pos_embds_dim": [64, 192], + "axial_norm_std": 1.0, + "chunk_size_lm_head": 0, + "attn_layers": ["local", "lsh", "local", "lsh", "local", "lsh"], + }, +} + +REFORMER_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "reformer-enwik8": "http://paddlenlp.bj.bcebos.com/models/transformers/reformer/reformer-enwik8/model_state.pdparams", + "reformer-crime-and-punishment": "http://paddlenlp.bj.bcebos.com/models/transformers/reformer/reformer-crime-and-punishment/model_state.pdparams", + } +} + + +class ReformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ReformerModel`]. It is used to instantiate a + Reformer model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the ReFormer + [google/reformer-crime-and-punishment](https://huggingface.co/google/reformer-crime-and-punishment) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + tie_word_embeddings (bool, optional): + Whether to tie input and output embeddings. Defaults to `False`. + is_decoder (bool, optional): + Whether or not to use a causal mask in addition to the `attention_mask` passed to `ReformerModel`. When using the Reformer for causal language modeling, this argument should be set to `True`. Defaults to `True`. + chunk_size_feed_forward (int, optional): + The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means + that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes + `n` < sequence_length embeddings at a time. Defaults to `0`. + pad_token_id (int, optional): + The id of the `padding` token. Defaults to `0`. + hash_seed (int, optional): + Seed that can be used to make local sensitive hashing in `LSHSelfAttention` deterministic. This should + only be set for testing purposed. For evaluation and training purposes `hash_seed` should be left as + `None` to ensure fully random rotations in local sensitive hashing scheme. Defaults to `None`. + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in `ReformerModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ReformerModel`. Defaults to `258`. + attention_head_size (int, optional): + Dimensionality of the projected key, query and value vectors. Defaults to `128`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layer.Defaults to `1024`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `8`. + num_hashes (int, optional): + Number of hashing rounds (e.g., number of random rotations) in Local Sensitive Hashing scheme. The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes. Defaults to `4`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. 
+ num_buckets (int or List[int], optional): + Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme. + Each query key vector is hashed into a hash in `1, ..., num_buckets`. The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors. The number of buckets (or the product the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` not set, a good value is calculated on the fly. Defaults to `512`. + lsh_attn_chunk_length (int, optional): + Length of chunk which attends to itself in `LSHSelfAttention`. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).Defaults to `256`. + local_attn_chunk_length (int, optional): + Length of chunk which attends to itself in `LocalSelfAttention`. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention).Defaults to `128`. + lsh_num_chunks_after (int, optional): + Number of following neighbouring chunks to attend to in `LSHSelfAttention` layer to itself. Defaults to `0`. + lsh_num_chunks_before (int, optional): + Number of previous neighbouring chunks to attend to in `LSHSelfAttention` layer to itself. Defaults to `1`. + local_num_chunks_after (int, optional): + Number of following neighbouring chunks to attend to in `LocalSelfAttention` layer to itself. Defaults to `0`. + local_num_chunks_before (int, optional): + Number of previous neighbouring chunks to attend to in `LocalSelfAttention` layer to itself. Defaults to `1`. + hidden_act (str, optional): + The non-linear activation function (function or string) in the feed forward layer in the residual attention block. If string, `"gelu"`, `"relu"`, `"tanh"`, `"mish"` and `"gelu_new"` are supported. Defaults to `"relu"`. + feed_forward_size (int, optional): + Dimensionality of the feed_forward layer in the residual attention block. Defaults to `4096`. + hidden_dropout_prob (float, optional): + The dropout ratio for all fully connected layers in the embeddings and encoder. Defaults to `0.2`. + lsh_attention_probs_dropout_prob (float, optional): + The dropout ratio for the attention probabilities in `LSHSelfAttention`. Defaults to `0.1`. + local_attention_probs_dropout_prob (float, optional): + The dropout ratio for the attention probabilities in `LocalSelfAttention`. Defaults to `0.2`. + max_position_embeddings (int, optional): + The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). Defaults to `65536`. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`ReformerPretrainedModel._init_weights()` for how weights are initialized in `ReformerModel`. + + layer_norm_eps (float, optional): + The epsilon used by the layer normalization layers. Defaults to `1e-12`. + + axial_pos_embds (bool, optional): + Whether or not to use axial position embeddings. Defaults to `True`. 
+ axial_pos_shape (List[int], optional): + The position dims of the axial position encodings. During training, the product of the position dims has to be equal to the sequence length. Defaults to `[128, 512]`. + axial_pos_embds_dim (List[int], optional): + The embedding dims of the axial position encodings. The sum of the embedding dims has to be equal to the + hidden size. Defaults to `[256, 768]`. + axial_norm_std (float, optional): + The standard deviation of the normal_initializer for initializing the weight matrices of the axial + positional encodings. Defaults to `1.0`. + chunk_size_lm_head (int, optional): + The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes n < + sequence_length embeddings at a time. Defaults to `0`. + attn_layers (List[str], optional): + List of attention layer types in ascending order. It can be chosen between a LSHSelfAttention layer + (`"lsh"`) and a LocalSelfAttention layer (`"local"`). Defaults to `["local", "local", "lsh", "local", "local", "local", "lsh", "local", "local", "local", "lsh", "local"]`. + + """ + model_type = "reformer" + attribute_map: Dict[str, str] = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "num_classes": "num_labels", + } + pretrained_init_configuration = REFORMER_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + axial_pos_shape=[128, 512], + axial_pos_embds_dim=[256, 768], + hidden_dropout_prob=0.2, + attn_layers=[ + "local", + "local", + "lsh", + "local", + "local", + "local", + "lsh", + "local", + "local", + "local", + "lsh", + "local", + ], + lsh_attn_chunk_length=256, + local_attn_chunk_length=128, + hidden_size=1024, + max_position_embeddings=65536, + axial_pos_embds=True, + vocab_size=258, + num_hashes=4, + num_buckets=512, + lsh_num_chunks_before=1, + lsh_num_chunks_after=0, + hash_seed=None, + is_decoder=True, + lsh_attention_probs_dropout_prob=0.1, + num_attention_heads=8, + attention_head_size=128, + local_num_chunks_before=1, + local_num_chunks_after=0, + pad_token_id=0, + local_attention_probs_dropout_prob=0.2, + layer_norm_eps=1e-12, + hidden_act="relu", + feed_forward_size=4096, + chunk_size_feed_forward=0, + chunk_size_lm_head=0, + tie_word_embeddings=False, + initializer_range=0.02, + axial_norm_std=1.0, + use_cache=True, + classifier_dropout=None, + num_hidden_layers=12, + **kwargs + ): + + self.axial_pos_shape = tuple(axial_pos_shape) + self.axial_pos_embds_dim = tuple(axial_pos_embds_dim) + self.hidden_dropout_prob = hidden_dropout_prob + self.attn_layers = attn_layers + self.lsh_attn_chunk_length = lsh_attn_chunk_length + self.local_attn_chunk_length = local_attn_chunk_length + self.hidden_size = hidden_size + self.max_position_embeddings = max_position_embeddings + self.axial_pos_embds = axial_pos_embds + self.vocab_size = vocab_size + self.num_hashes = num_hashes + self.num_buckets = tuple(num_buckets) if isinstance(num_buckets, list) else num_buckets + self.lsh_num_chunks_before = lsh_num_chunks_before + self.lsh_num_chunks_after = lsh_num_chunks_after + self.hash_seed = hash_seed + self.is_decoder = is_decoder + self.lsh_attention_probs_dropout_prob = lsh_attention_probs_dropout_prob + self.num_attention_heads = num_attention_heads + self.attention_head_size = attention_head_size + self.local_num_chunks_before = local_num_chunks_before + self.local_num_chunks_after = local_num_chunks_after + self.pad_token_id = pad_token_id + 
self.local_attention_probs_dropout_prob = local_attention_probs_dropout_prob + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.feed_forward_size = feed_forward_size + self.chunk_size_lm_head = chunk_size_lm_head + self.tie_word_embeddings = tie_word_embeddings + self.num_hidden_layers = num_hidden_layers + self.initializer_range = initializer_range + self.axial_norm_std = axial_norm_std + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + super().__init__( + pad_token_id=pad_token_id, + is_decoder=is_decoder, + tie_word_embeddings=tie_word_embeddings, + chunk_size_feed_forward=chunk_size_feed_forward, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/modeling.py new file mode 100644 index 000000000..f94d220d5 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/modeling.py @@ -0,0 +1,2987 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The Trax Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import inspect +import sys +from dataclasses import dataclass +from functools import reduce +from operator import mul +from typing import List, Optional, Tuple + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor +from paddle.autograd import PyLayer + +from ...utils.log import logger +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, +) +from .configuration import ( + REFORMER_PRETRAINED_INIT_CONFIGURATION, + REFORMER_PRETRAINED_RESOURCE_FILES_MAP, + ReformerConfig, +) + +__all__ = [ + "ReformerModel", + "ReformerPretrainedModel", + "ReformerForSequenceClassification", + "ReformerForQuestionAnswering", + "ReformerModelWithLMHead", + "ReformerForMaskedLM", + "ReformerLayer", +] + +REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "reformer-crime-and-punishment", + "reformer-enwik8", +] + + +def _logsumexp(x, axis=-1, keepdim=False): + # paddle.logsumexp don't support 5D Tensor + if axis < 0: + axis = x.ndim + axis + if axis > 1: + lse = paddle.logsumexp(x.flatten(0, 1), axis=axis - 1, keepdim=keepdim) + orgshape = x.shape + if keepdim: + orgshape[axis] = 1 + else: + orgshape = orgshape[:axis] + orgshape[axis + 1 :] + + return lse.reshape(shape=orgshape) + else: + raise ValueError("axis must larger equal than 1.") + + +def _stable_argsort(vector, axis=-1): + # this function scales the vector so that paddle.argsort is stable. 
+ # paddle.argsort is not stable on its own + scale_offset = paddle.arange(vector.shape[axis]).reshape(shape=[1, -1]).astype(vector.dtype) + scale_offset = scale_offset.expand_as(vector) + scaled_vector = vector.shape[axis] * vector + (scale_offset % vector.shape[axis]) + return paddle.argsort(scaled_vector, axis=axis) + + +def _apply_chunking_to_forward(forward_fn, chunk_size, chunk_dim, *input_tensors): + """ + This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the + dimension `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. + If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as + directly applying `forward_fn` to `input_tensors`. + """ + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + tensor_shape = input_tensors[0].shape[chunk_dim] + assert all( + input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors + ), "All input tenors have to be of the same shape" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + + if chunk_size > 0: + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + + num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size + + # chunk input tensor into tuples + input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, axis=chunk_dim) for input_tensor in input_tensors) + # apply forward fn to every tuple + output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) + # concatenate output at same dimension + return paddle.concat(output_chunks, axis=chunk_dim) + + return forward_fn(*input_tensors) + + +def _get_least_common_mult_chunk_len(attn_layers, lsh_attn_chunk_length, local_attn_chunk_length): + attn_types_set = set(attn_layers) + if len(attn_types_set) == 1 and attn_layers[0] == "lsh": + return lsh_attn_chunk_length + elif len(attn_types_set) == 1 and attn_layers[0] == "local": + return local_attn_chunk_length + elif len(attn_types_set) == 2 and attn_types_set == set(["lsh", "local"]): + return np.lcm(lsh_attn_chunk_length, local_attn_chunk_length) + else: + raise NotImplementedError( + f"Only attn layer types 'lsh' and 'local' exist, but `attn_layers`: {attn_layers}. Select " + "attn layer types from ['lsh', 'local'] only." + ) + + +def _get_min_chunk_len(attn_layers, lsh_attn_chunk_length, local_attn_chunk_length): + attn_types_set = set(attn_layers) + if len(attn_types_set) == 1 and attn_layers[0] == "lsh": + return lsh_attn_chunk_length + elif len(attn_types_set) == 1 and attn_layers[0] == "local": + return local_attn_chunk_length + elif len(attn_types_set) == 2 and attn_types_set == set(["lsh", "local"]): + return min(lsh_attn_chunk_length, local_attn_chunk_length) + else: + raise NotImplementedError( + f"Only attn layer types 'lsh' and 'local' exist, but `attn_layers`: {attn_layers}. Select " + "attn layer types from ['lsh', 'local'] only." 
+ ) + + +class ReverseSort(PyLayer): + """ + modified from https://github.com/huggingface/transformers/blob/fbf468b0573baddb1b9d1bb088a8b6d5c9303a7e/src/transformers/models/reformer/modeling_reformer.py#L982-L1011 + After chunked attention is applied which sorted clusters, original ordering has to be restored. Since customized + backward function is used for Reformer, the gradients of the output vectors have to be explicitly sorted here. + """ + + @staticmethod + def forward(ctx, out_vectors, logits, sorted_bucket_idx, undo_sorted_bucket_idx): + # save sorted_bucket_idx for backprop + with paddle.no_grad(): + ctx.sorted_bucket_idx = sorted_bucket_idx + # undo sort to have correct order for next layer + raw_shape = out_vectors.shape + out_vectors = out_vectors.transpose(perm=[0, 1, 3, 2]) + expanded_undo_sort_indices = undo_sorted_bucket_idx.unsqueeze(-2).expand_as(out_vectors) + out_vectors = ( + paddle.index_sample( + out_vectors.reshape([-1, raw_shape[2]]), + expanded_undo_sort_indices.reshape([-1, raw_shape[2]]), + ) + .reshape(out_vectors.shape) + .transpose(perm=[0, 1, 3, 2]) + ) + + logits = paddle.index_sample( + logits.reshape([-1, raw_shape[2]]), + undo_sorted_bucket_idx.reshape([-1, raw_shape[2]]), + ).reshape(raw_shape[:3]) + + return out_vectors, logits + + @staticmethod + def backward(ctx, grad_out_vectors, grad_logits): + # get parameters saved in ctx + sorted_bucket_idx = ctx.sorted_bucket_idx + + raw_shape = grad_out_vectors.shape + grad_out_vectors = grad_out_vectors.transpose(perm=[0, 1, 3, 2]) + + expanded_sorted_bucket_idx = sorted_bucket_idx.unsqueeze(-2).expand_as(grad_out_vectors) + grad_out_vectors = ( + paddle.index_sample( + grad_out_vectors.reshape([-1, raw_shape[2]]), + expanded_sorted_bucket_idx.reshape([-1, raw_shape[2]]), + ) + .reshape(grad_out_vectors.shape) + .transpose(perm=[0, 1, 3, 2]) + ) + + grad_logits = paddle.index_sample( + grad_logits.reshape([-1, raw_shape[2]]), + sorted_bucket_idx.reshape([-1, raw_shape[2]]), + ).reshape(raw_shape[:3]) + + return grad_out_vectors, sorted_bucket_idx, None, None + + +class _ReversibleFunction(PyLayer): + """ + modified from https://github.com/huggingface/transformers/blob/c016dbdbdaf79339ae6d275d4651dc9f380be055/src/transformers/models/reformer/modeling_reformer.py#L1568-L1677 + To prevent Paddle from performing the usual backpropagation, a customized backward function is implemented here. + This way it is made sure that no memory expensive activations are saved during the forward pass. 
+ """ + + @staticmethod + def forward( + ctx, + hidden_states, + layers, + attention_mask, + num_hashes, + all_hidden_states, + all_attentions, + cache, + use_cache, + orig_sequence_length, + output_hidden_states, + output_attentions, + ): + all_buckets = () + + # split duplicated tensor + hidden_states, attn_output = paddle.chunk(hidden_states, chunks=2, axis=-1) + + for layer_id, layer in enumerate(layers): + if output_hidden_states is True: + all_hidden_states.append(hidden_states) + + layer_outputs = layer( + prev_attn_output=attn_output, + hidden_states=hidden_states, + attention_mask=attention_mask, + num_hashes=num_hashes, + cache=cache, + use_cache=use_cache, + orig_sequence_length=orig_sequence_length, + output_attentions=output_attentions, + ) + + attn_output = layer_outputs.attn_output + hidden_states = layer_outputs.hidden_states + all_buckets = all_buckets + (layer_outputs.buckets,) + + if output_attentions: + all_attentions.append(layer_outputs.attention_probs) + + # Add last layer + if output_hidden_states is True: + all_hidden_states.append(hidden_states) + + # attach params to ctx for backward + ctx.save_for_backward(attn_output.detach(), hidden_states.detach()) + ctx.layers = layers + ctx.all_buckets = all_buckets + ctx.attention_mask = attention_mask + + # Concatenate 2 RevNet outputs + return paddle.concat([attn_output, hidden_states], axis=-1) + + @staticmethod + def backward(ctx, grad_hidden_states): + + grad_attn_output, grad_hidden_states = paddle.chunk(grad_hidden_states, chunks=2, axis=-1) + + # retrieve params from ctx for backward + (attn_output, hidden_states) = ctx.saved_tensor() + + # create tuple + output = ReformerBackwardOutput( + attn_output=attn_output, + hidden_states=hidden_states, + grad_attn_output=grad_attn_output, + grad_hidden_states=grad_hidden_states, + ) + + # free memory + del grad_attn_output, grad_hidden_states, attn_output, hidden_states + + layers = ctx.layers + all_buckets = ctx.all_buckets + attention_mask = ctx.attention_mask + + for idx, layer in enumerate(layers[::-1]): + # pop last buckets from stack + buckets = all_buckets[-1] + all_buckets = all_buckets[:-1] + + # backprop + output = layer.backward_pass( + next_attn_output=output.attn_output, + hidden_states=output.hidden_states, + grad_attn_output=output.grad_attn_output, + grad_hidden_states=output.grad_hidden_states, + attention_mask=attention_mask, + buckets=buckets, + ) + + assert all_buckets == (), "buckets have to be empty after backpropagation" + grad_hidden_states = paddle.concat([output.grad_attn_output, output.grad_hidden_states], axis=-1) + + # num of return vars has to match num of forward() args + # return gradient for hidden_states arg and None for other args + return grad_hidden_states, None + + +class AxialPositionEmbeddings(nn.Layer): + """ + Constructs axial position embeddings. Useful for very long input sequences to save memory and time. 
+ """ + + def __init__(self, config: ReformerConfig): + super().__init__() + self.axial_pos_shape = config.axial_pos_shape + self.axial_pos_embds_dim = config.axial_pos_embds_dim + self.dropout = config.hidden_dropout_prob + + self.least_common_mult_chunk_length = _get_least_common_mult_chunk_len( + attn_layers=config.attn_layers, + lsh_attn_chunk_length=config.lsh_attn_chunk_length, + local_attn_chunk_length=config.local_attn_chunk_length, + ) + self.weights = nn.ParameterList() + + if sum(self.axial_pos_embds_dim) != config.hidden_size: + raise ValueError( + f"Make sure that axial_pos_embds factors: {self.axial_pos_embds_dim} sum to " + f"hidden_size: {config.hidden_size}" + ) + + # create weights + for axis, axial_pos_embd_dim in enumerate(self.axial_pos_embds_dim): + # create expanded shapes + ax_shape = [1] * len(self.axial_pos_shape) + ax_shape[axis] = self.axial_pos_shape[axis] + ax_shape = ax_shape + [axial_pos_embd_dim] + + self.weights.append( + paddle.create_parameter( + shape=ax_shape, + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(value=1.0), + ) + ) + + def forward(self, position_ids): + # broadcast weights to correct shape + batch_size = position_ids.shape[0] + sequence_length = position_ids.shape[1] + broadcasted_weights = [ + weight.expand(shape=[batch_size] + list(self.axial_pos_shape) + weight.shape[-1:]) + for weight in self.weights + ] + + if self.training is True: + if reduce(mul, self.axial_pos_shape) != sequence_length: + raise ValueError( + f"If training, make sure that axial_pos_shape factors: {self.axial_pos_shape} multiply to " + f"sequence length. Got prod({self.axial_pos_shape}) != sequence_length: {sequence_length}. " + f"You might want to consider padding your sequence length to {reduce(mul, self.axial_pos_shape)} " + "or changing axial_pos_shape." + ) + + if self.dropout > 0: + weights = paddle.concat(broadcasted_weights, axis=-1) + # permute weights so that 2D correctly drops dims 1 and 2 + transposed_weights = weights.transpose(perm=[0, 2, 1, 3]) + # drop entire matrix of last two dims (prev dims 1 and 2) + dropped_transposed_weights = F.dropout2d(transposed_weights, p=self.dropout, training=self.training) + dropped_weights = dropped_transposed_weights.transpose(perm=[0, 2, 1, 3]) + + position_encodings = paddle.reshape(dropped_weights, shape=[batch_size, sequence_length, -1]) + + else: + position_encodings = paddle.concat( + [ + paddle.reshape(weight, shape=[batch_size, sequence_length, -1]) + for weight in broadcasted_weights + ], + axis=-1, + ) + + else: + if reduce(mul, self.axial_pos_shape) < sequence_length: + raise ValueError( + f"Make sure that axial_pos_shape factors: {self.axial_pos_shape} multiply at least to " + f"max(sequence_length, least_common_mult_chunk_length): max({sequence_length}, " + f"{self.least_common_mult_chunk_length})." 
+ ) + + # compute how many columns are needed + max_position_id = position_ids.max().item() + required_pos_encodings_columns = -(-(max_position_id + 1) // self.axial_pos_shape[1]) + # cut to columns that are needed + position_encodings = paddle.concat( + [weight[:, :required_pos_encodings_columns] for weight in broadcasted_weights], + axis=-1, + ) + position_encodings = paddle.reshape( + position_encodings, shape=[batch_size, -1, position_encodings.shape[-1]] + ) + + # select correct position encodings + position_encodings = paddle.concat( + [ + paddle.index_select(position_encodings[i], index=position_ids[i], axis=0).unsqueeze(0) + for i in range(batch_size) + ], + axis=0, + ) + + return position_encodings + + +class PositionEmbeddings(nn.Layer): + """Constructs conventional position embeddings of shape `[max_pos_embeddings, hidden_size]`.""" + + def __init__(self, config: ReformerConfig): + super().__init__() + self.dropout = config.hidden_dropout_prob + self.embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + def forward(self, position_ids): + position_embeddings = self.embedding(position_ids) + position_embeddings = F.dropout(position_embeddings, p=self.dropout, training=self.training) + return position_embeddings + + +class ReformerEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: ReformerConfig): + super().__init__() + self.max_position_embeddings = config.max_position_embeddings + self.dropout = config.hidden_dropout_prob + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = ( + AxialPositionEmbeddings(config) if config.axial_pos_embds else PositionEmbeddings(config) + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + start_idx_pos_encodings=0, + inputs_embeds: Optional[Tensor] = None, + ): + + if input_ids is not None: + input_shape = input_ids.shape + inputs_embeds = self.word_embeddings(input_ids) + else: + input_shape = inputs_embeds.shape[:-1] + + if position_ids is None: + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = start_idx_pos_encodings + seq_length - start_idx_pos_encodings - ones + position_ids.stop_gradient = True + + if position_ids.shape[-1] > self.max_position_embeddings: + raise ValueError( + f"Sequence Length: {position_ids.shape[-1]} has to be larger equal than " + f"max_position_embeddings {self.max_position_embeddings}." + ) + + # add positional embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + # dropout + embeddings = F.dropout(embeddings, p=self.dropout, training=self.training) + return embeddings + + +class EfficientAttentionMixin: + """ + A few utilities for nn.Layers in Reformer, to be used as a mixin. + """ + + def _look_adjacent(self, vectors, num_chunks_before, num_chunks_after): + """ + Used to implement attention between consecutive chunks. + + Args: + vectors: array of shape [batch_size, num_attention_heads, n_chunks, chunk_len, ...] + num_chunks_before: chunks before current chunk to include in attention + num_chunks_after: chunks after current chunk to include in attention + + Returns: + tensor of shape [num_chunks, N * chunk_length, ...], where N = (1 + num_chunks_before + num_chunks_after). 
+ """ + if num_chunks_before == 0 and num_chunks_after == 0: + return vectors + + slices = [] + for i in range(-num_chunks_before, num_chunks_after + 1): + if i == 0: + slices.append(vectors) + else: + slices.append(paddle.concat([vectors[:, :, i:], vectors[:, :, :i]], axis=2)) + return paddle.concat(slices, axis=3) + + def _split_hidden_size_dim(self, x, num_attn_heads, attn_head_size): + """ + splits hidden_size dim into attn_head_size and num_attn_heads + """ + new_x_shape = x.shape[:-1] + [num_attn_heads, attn_head_size] + x = x.reshape(shape=new_x_shape) + return x.transpose(perm=[0, 2, 1, 3]) + + def _merge_hidden_size_dims(self, x, num_attn_heads, attn_head_size): + """ + merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + x = x.transpose(perm=[0, 2, 1, 3]) + return paddle.reshape(x, shape=[x.shape[0], -1, num_attn_heads * attn_head_size]) + + def _split_seq_length_dim_to(self, vectors, dim_factor_1, dim_factor_2, num_attn_heads, attn_head_size=None): + """ + splits sequence length dim of vectors into `dim_factor_1` and `dim_factor_2` dims + """ + batch_size = vectors.shape[0] + + split_dim_shape = [batch_size, num_attn_heads, dim_factor_1, dim_factor_2] + + if vectors.ndim == 4: + return paddle.reshape(vectors, shape=split_dim_shape + [attn_head_size]) + elif vectors.ndim == 3: + return paddle.reshape(vectors, shape=split_dim_shape) + else: + raise ValueError(f"Input vector rank should be one of [3, 4], but is: {vectors.ndim}") + + +class LSHSelfAttention(nn.Layer, EfficientAttentionMixin): + def __init__(self, config: ReformerConfig): + super().__init__() + + self.chunk_length = config.lsh_attn_chunk_length + self.num_hashes = config.num_hashes + self.num_buckets = config.num_buckets + self.num_chunks_before = config.lsh_num_chunks_before + self.num_chunks_after = config.lsh_num_chunks_after + self.hash_seed = config.hash_seed + self.is_decoder = config.is_decoder + self.max_position_embeddings = config.max_position_embeddings + + self.dropout = config.lsh_attention_probs_dropout_prob + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = config.attention_head_size + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.hidden_size = config.hidden_size + + # projection matrices + self.query_key = nn.Linear(self.hidden_size, self.all_head_size, bias_attr=False) + self.value = nn.Linear(self.hidden_size, self.all_head_size, bias_attr=False) + + # save mask value here. Need fp32 and fp16 mask values + self.register_buffer("self_mask_value_float16", paddle.to_tensor(-1e3)) + self.register_buffer("self_mask_value_float32", paddle.to_tensor(-1e5)) + self.register_buffer("mask_value_float16", paddle.to_tensor(-1e4)) + self.register_buffer("mask_value_float32", paddle.to_tensor(-1e9)) + + def forward( + self, + hidden_states, + attention_mask=None, + num_hashes=None, + buckets=None, + cache=None, + use_cache=False, + output_attentions=False, + **kwargs, + ): + batch_size, sequence_length = hidden_states.shape[:2] + + # num hashes can optionally be overwritten by user + num_hashes = num_hashes if num_hashes is not None else self.num_hashes + + do_cached_attention = use_cache and cache[1] is not None + + # check if cache shall be used and that hidden states are already cached + if do_cached_attention: + assert ( + sequence_length == 1 + ), f"At the moment, auto-regressive language generation is only possible one word at a time. 
Make sure that input sequence length {sequence_length} equals 1, when `cache` is passed." + past_buckets = cache[0] + past_states = cache[1] + + # get query vector + query_vectors = self.query_key(hidden_states) + query_vectors = self._split_hidden_size_dim( + query_vectors, self.num_attention_heads, self.attention_head_size + ) + + if past_buckets is not None: + (key_value_hidden_states, sorted_bucket_idx, buckets,) = self._get_relevant_hid_states_and_buckets( + query_vectors=query_vectors, + attention_mask=attention_mask, + num_hashes=num_hashes, + hidden_states=hidden_states, + past_states=past_states, + past_buckets=past_buckets, + ) + + query_key_vectors = self._query_per_attn_head(key_value_hidden_states) + value_vectors = self._value_per_attn_head(key_value_hidden_states) + + # split key & value vectors by num hashes to apply + # self attention on each separately + query_key_vectors = self._split_seq_length_dim_to( + query_key_vectors, + num_hashes, + -1, + self.num_attention_heads, + self.attention_head_size, + ) + value_vectors = self._split_seq_length_dim_to( + value_vectors, + num_hashes, + -1, + self.num_attention_heads, + self.attention_head_size, + ) + # expand query vectors across hash dimension + query_vectors = paddle.tile(query_vectors.unsqueeze(2), repeat_times=[1, 1, num_hashes, 1, 1]) + else: + key_value_hidden_states = paddle.concat([past_states, hidden_states], axis=1) + + query_key_vectors = self.query_key(key_value_hidden_states) + value_vectors = self.value(key_value_hidden_states) + + else: + # project hidden_states to query_key and value + query_vectors = None + query_key_vectors = self.query_key(hidden_states) + value_vectors = self.value(hidden_states) + + # if query key is not already split + if not do_cached_attention or past_buckets is None: + query_key_vectors = self._split_hidden_size_dim( + query_key_vectors, self.num_attention_heads, self.attention_head_size + ) + value_vectors = self._split_hidden_size_dim( + value_vectors, self.num_attention_heads, self.attention_head_size + ) + + # cache buckets for next incremental decoding + if do_cached_attention and past_buckets is None and key_value_hidden_states.shape[1] >= self.chunk_length: + buckets = self._hash_vectors(query_key_vectors, num_hashes, attention_mask) + + # free memory + del hidden_states + + assert ( + query_key_vectors.shape[-1] == self.attention_head_size + ), f"last dim of query_key_vectors is {query_key_vectors.shape[-1]} but should be {self.attention_head_size}." + assert ( + value_vectors.shape[-1] == self.attention_head_size + ), f"last dim of value_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}." 
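# --- Illustrative aside (not part of the vendored PaddleNLP sources) ---
# The `_hash_vectors` helper used by this forward pass buckets the shared
# query/key vectors with random rotations, following the Reformer paper:
# project each (detached) vector onto a few random hyperplanes and take the
# argmax over [Rx, -Rx]. A minimal NumPy sketch of that idea, with
# hypothetical toy shapes and a made-up helper name, could look like this:

import numpy as np

def toy_hash_vectors(vectors, num_buckets, num_hashes, seed=0):
    """vectors: [seq_len, head_dim] -> bucket ids of shape [num_hashes, seq_len]."""
    rng = np.random.default_rng(seed)
    # one random rotation per hash round, num_buckets // 2 hyperplanes each
    rotations = rng.standard_normal((vectors.shape[-1], num_hashes, num_buckets // 2))
    rotated = np.einsum("td,dhr->htr", vectors, rotations)
    rotated = np.concatenate([rotated, -rotated], axis=-1)  # [num_hashes, seq_len, num_buckets]
    return np.argmax(rotated, axis=-1)

# buckets = toy_hash_vectors(np.random.randn(16, 8), num_buckets=4, num_hashes=2)
# Tokens that share a bucket get sorted next to each other and attend only
# within (and adjacent to) their chunk, which is what keeps LSH attention
# sub-quadratic in sequence length.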
+ + do_standard_self_attention = (sequence_length <= self.chunk_length) or (use_cache and cache[1] is not None) + # LSH attention only makes sense if chunked attention should be performed + if not do_standard_self_attention: + # set `num_buckets` on the fly, recommended way to do it + if self.num_buckets is None: + self._set_num_buckets(sequence_length) + + # use cached buckets for backprop only + if buckets is None: + # hash query key vectors into buckets + buckets = self._hash_vectors(query_key_vectors, num_hashes, attention_mask) + else: + # make sure buckets has correct shape for LSH attention + buckets = buckets.reshape( + shape=[ + batch_size, + self.num_attention_heads, + num_hashes * sequence_length, + ] + ) + + assert ( + int(buckets.shape[-1]) == num_hashes * sequence_length + ), f"last dim of buckets is {buckets.shape[-1]}, but should be {num_hashes * sequence_length}" + + ( + sorted_bucket_idx, + undo_sorted_bucket_idx, + ) = self._get_sorted_bucket_idx_and_undo_sorted_bucket_idx(buckets) + + # make sure bucket idx is not longer then sequence length + sorted_bucket_idx_per_hash = sorted_bucket_idx % sequence_length + + # cluster query key value vectors according to hashed buckets + query_key_vectors = self._gather_by_expansion(query_key_vectors, sorted_bucket_idx_per_hash, num_hashes) + + value_vectors = self._gather_by_expansion(value_vectors, sorted_bucket_idx_per_hash, num_hashes) + query_key_vectors = self._split_seq_length_dim_to( + query_key_vectors, + -1, + self.chunk_length, + self.num_attention_heads, + self.attention_head_size, + ) + value_vectors = self._split_seq_length_dim_to( + value_vectors, + -1, + self.chunk_length, + self.num_attention_heads, + self.attention_head_size, + ) + + if self.chunk_length is None: + assert ( + self.num_chunks_before == 0 and self.num_chunks_after == 0 + ), "If `chunk_length` is `None`, make sure `num_chunks_after` and `num_chunks_before` are set to 0." 
+ elif do_cached_attention and past_buckets is not None: + # use max sequence length + sorted_bucket_idx_per_hash = sorted_bucket_idx + else: + # get sequence length indices + sorted_bucket_idx_per_hash = paddle.tile( + paddle.arange(sequence_length), + repeat_times=[batch_size, self.num_attention_heads, 1], + ) + + # scale key vectors + key_vectors = self._len_and_dim_norm(query_key_vectors) + + # set query_vectors to query key vectors if LSH self attention + query_vectors = query_vectors if query_vectors is not None else query_key_vectors + + # free memory + del query_key_vectors + + # get attention probs + out_vectors, logits, attention_probs = self._attend( + query_vectors=query_vectors, + key_vectors=key_vectors, + value_vectors=value_vectors, + sorted_bucket_idx_per_hash=sorted_bucket_idx_per_hash, + attention_mask=attention_mask, + do_standard_self_attention=do_standard_self_attention, + do_cached_attention=do_cached_attention, + ) + + # free memory + del key_vectors, value_vectors + + # re-order out_vectors and logits + if not do_standard_self_attention: + # sort clusters back to correct ordering + out_vectors, logits = ReverseSort.apply(out_vectors, logits, sorted_bucket_idx, undo_sorted_bucket_idx) + + if not do_standard_self_attention or (do_cached_attention and past_buckets is not None): + # sum up all hash rounds + if num_hashes > 1: + out_vectors = self._split_seq_length_dim_to( + out_vectors, + num_hashes, + sequence_length, + self.num_attention_heads, + self.attention_head_size, + ) + logits = self._split_seq_length_dim_to( + logits, + num_hashes, + sequence_length, + self.num_attention_heads, + self.attention_head_size, + ).unsqueeze(-1) + + probs_vectors = paddle.exp(logits - _logsumexp(logits, axis=2, keepdim=True)) + out_vectors = paddle.sum(out_vectors * probs_vectors, axis=2) + # free memory + del probs_vectors + + # free memory + del logits + + assert out_vectors.shape == [ + batch_size, + self.num_attention_heads, + sequence_length, + self.attention_head_size, + ], "out_vectors have be of shape `[batch_size, num_attention_heads, sequence_length, attention_head_size]`." 
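# --- Illustrative aside (not part of the vendored PaddleNLP sources) ---
# When more than one hash round is used, the per-round outputs above are
# blended with weights derived from each round's attention log-normalizers
# (`logits`); exp(logits - logsumexp(logits)) is simply a numerically stable
# softmax over the hash dimension. A NumPy sketch with hypothetical toy
# shapes and a made-up helper name:

import numpy as np

def toy_combine_hash_rounds(out_per_round, logits_per_round):
    """out_per_round: [num_hashes, seq_len, head_dim]; logits_per_round: [num_hashes, seq_len]."""
    m = logits_per_round.max(axis=0, keepdims=True)
    weights = np.exp(logits_per_round - m)
    weights = weights / weights.sum(axis=0, keepdims=True)   # softmax over hash rounds
    return (out_per_round * weights[..., None]).sum(axis=0)  # [seq_len, head_dim]

# combined = toy_combine_hash_rounds(np.random.randn(2, 16, 8), np.random.randn(2, 16))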
+ + out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size) + + if output_attentions is False: + attention_probs = () + + if buckets is not None: + buckets = buckets.reshape(shape=[batch_size, self.num_attention_heads, num_hashes, -1]) + + return LSHSelfAttentionOutput(hidden_states=out_vectors, attention_probs=attention_probs, buckets=buckets) + + def _query_per_attn_head(self, hidden_states): + per_head_query_key = self.query_key.weight.reshape( + shape=[self.num_attention_heads, self.attention_head_size, self.hidden_size] + ).transpose(perm=[0, 2, 1]) + # only relevant for inference and no bias => we can use einsum here + query_key_vectors = paddle.einsum("balh,ahr->balr", hidden_states, per_head_query_key) + return query_key_vectors + + def _value_per_attn_head(self, hidden_states): + per_head_value = self.value.weight.reshape( + shape=[self.num_attention_heads, self.attention_head_size, self.hidden_size] + ).transpose(perm=[0, 2, 1]) + # only relevant for inference and no bias => we can use einsum here + value_vectors = paddle.einsum("balh,ahr->balr", hidden_states, per_head_value) + return value_vectors + + def _hash_vectors(self, vectors, num_hashes, attention_mask, increase_num_buckets=False): + batch_size = vectors.shape[0] + + # See https://arxiv.org/pdf/1509.02897.pdf + # We sample a different random rotation for each round of hashing to + # decrease the probability of hash misses. + if isinstance(self.num_buckets, int): + assert ( + self.num_buckets % 2 == 0 + ), f"There should be an even number of buckets, but `self.num_buckets`: {self.num_buckets}" + rotation_size = self.num_buckets + num_buckets = self.num_buckets + else: + # Factorize the hash if self.num_buckets is a list or tuple + rotation_size, num_buckets = 0, 1 + for bucket_factor in self.num_buckets: + assert ( + bucket_factor % 2 == 0 + ), f"The number of buckets should be even, but `num_bucket`: {bucket_factor}" + rotation_size = rotation_size + bucket_factor + num_buckets = num_buckets * bucket_factor + + # remove gradient + vectors = vectors.detach() + + if self.hash_seed is not None: + # for determinism + paddle.seed(self.hash_seed) + + rotations_shape = [ + self.num_attention_heads, + vectors.shape[-1], + num_hashes, + rotation_size // 2, + ] + + # create a random self.attention_head_size x num_hashes x num_buckets/2 + random_rotations = paddle.randn(shape=rotations_shape, dtype=vectors.dtype) + # Output dim: Batch_Size x Num_Attn_Heads x Num_Hashes x Seq_Len x Num_Buckets/2 + rotated_vectors = paddle.einsum("bmtd,mdhr->bmhtr", vectors, random_rotations) + + if isinstance(self.num_buckets, int) or len(self.num_buckets) == 1: + rotated_vectors = paddle.concat([rotated_vectors, -rotated_vectors], axis=-1) + buckets = paddle.argmax(rotated_vectors, axis=-1) + else: + # Get the buckets for them and combine. 
+ buckets, cur_sum, cur_product = None, 0, 1 + for bucket_factor in self.num_buckets: + # bmhtr + rotated_vectors_factor = rotated_vectors[:, :, :, :, cur_sum : cur_sum + (bucket_factor // 2)] + cur_sum = cur_sum + bucket_factor // 2 + rotated_vectors_factor = paddle.concat([rotated_vectors_factor, -rotated_vectors_factor], axis=-1) + if buckets is None: + buckets = paddle.argmax(rotated_vectors_factor, axis=-1) + else: + buckets = buckets + (cur_product * paddle.argmax(rotated_vectors_factor, axis=-1)) + + cur_product = cur_product * bucket_factor + + if attention_mask is not None and (attention_mask.sum() < batch_size * attention_mask.shape[-1]): + # add an extra bucket for padding tokens only + num_buckets = num_buckets + 1 + # assign padding tokens extra bucket + buckets_mask = attention_mask.unsqueeze(axis=[1, 2]).expand_as(buckets) + buckets = paddle.where( + buckets_mask.astype(paddle.bool), + buckets, + paddle.to_tensor(num_buckets - 1, dtype=buckets.dtype), + ) + elif increase_num_buckets: + num_buckets = num_buckets + 1 + + # buckets is now (Batch_size x Num_Attn_Heads x Num_Hashes x Seq_Len). + # Next we add offsets so that bucket numbers from different hashing rounds don't overlap. + offsets = paddle.arange(num_hashes) + offsets = (offsets * num_buckets).reshape(shape=[1, 1, -1, 1]) + + # expand to batch size and num attention heads + offsets = offsets.expand(shape=[batch_size, self.num_attention_heads] + offsets.shape[-2:]) + offset_buckets = (buckets + offsets).flatten(start_axis=2, stop_axis=3) + + return offset_buckets + + def _get_sorted_bucket_idx_and_undo_sorted_bucket_idx(self, buckets): + # no gradients are needed + # buckets shape [batch_size, self.num_attention_heads, num_hashes * sequence_length] + with paddle.no_grad(): + original_shape = buckets.shape + new_buckets = buckets.flatten(0, 1) + offsets = (paddle.arange(new_buckets.shape[0]) * new_buckets.shape[1]).unsqueeze(-1) + sorted_bucket_idx = _stable_argsort(new_buckets, axis=-1) + new_sorted_bucket_idx = (sorted_bucket_idx + offsets).flatten() + updates = paddle.tile(paddle.arange(new_buckets.shape[1]), repeat_times=[new_buckets.shape[0]]) + + undo_sorted_bucket_idx = paddle.scatter( + paddle.zeros_like(new_sorted_bucket_idx), + new_sorted_bucket_idx, + updates, + overwrite=True, + ) + + return sorted_bucket_idx.reshape(shape=original_shape), undo_sorted_bucket_idx.reshape(shape=original_shape) + + def _set_num_buckets(self, sequence_length): + # `num_buckets` should be set to 2 * sequence_length // chunk_length as recommended in paper + num_buckets_pow_2 = (2 * (sequence_length // self.chunk_length)).bit_length() - 1 + # make sure buckets are power of 2 + num_buckets = 2**num_buckets_pow_2 + + # factorize `num_buckets` if `num_buckets` becomes too large + num_buckets_limit = 2 * max( + int((self.max_position_embeddings // self.chunk_length) ** (0.5)), + self.chunk_length, + ) + if num_buckets > num_buckets_limit: + num_buckets = [ + 2 ** (num_buckets_pow_2 // 2), + 2 ** (num_buckets_pow_2 - num_buckets_pow_2 // 2), + ] + + logger.warning(f"num_buckets is not set. 
Setting num_buckets to {num_buckets}...") + + # set num buckets in config to be properly saved + self.num_buckets = num_buckets + + def _attend( + self, + query_vectors, + key_vectors, + value_vectors, + sorted_bucket_idx_per_hash, + attention_mask, + do_standard_self_attention, + do_cached_attention, + ): + # look at previous and following chunks if chunked attention + if not do_standard_self_attention: + key_vectors = self._look_adjacent(key_vectors, self.num_chunks_before, self.num_chunks_after) + value_vectors = self._look_adjacent(value_vectors, self.num_chunks_before, self.num_chunks_after) + + # get logits and dots + # (BS, NumAttn, NumHash x NumChunk, Chunk_L x Hidden),(BS, NumAttn, NumHash x NumChunk, Chunk_L * (Num_bef + Num_aft + 1) x Hidden) -> (BS, NumAttn, NumHash x NumChunk, Chunk_L, Chunk_L * (1 + Num_bef + Num_aft)) + query_key_dots = paddle.matmul(query_vectors, key_vectors, transpose_y=True) + + # free memory + del query_vectors, key_vectors + + # if chunked attention split bucket idxs to query and key + if not do_standard_self_attention: + query_bucket_idx = self._split_seq_length_dim_to( + sorted_bucket_idx_per_hash, + -1, + self.chunk_length, + self.num_attention_heads, + ) + key_value_bucket_idx = self._look_adjacent(query_bucket_idx, self.num_chunks_before, self.num_chunks_after) + elif do_cached_attention and query_key_dots.ndim > 4: + key_value_bucket_idx = sorted_bucket_idx_per_hash + query_bucket_idx = ( + paddle.ones( + shape=key_value_bucket_idx.shape[:-1] + [1], + dtype=key_value_bucket_idx.dtype, + ) + * key_value_bucket_idx.max() + ) + elif do_cached_attention and query_key_dots.ndim <= 4: + query_bucket_idx = (query_key_dots.shape[-1] - 1) * paddle.ones_like(query_key_dots)[:, :, :, -1] + key_value_bucket_idx = ( + paddle.arange(query_key_dots.shape[-1]) + .unsqueeze(axis=[0, 1]) + .expand( + shape=query_bucket_idx.shape[:2] + + [ + query_key_dots.shape[-1], + ] + ) + ) + else: + query_bucket_idx = key_value_bucket_idx = sorted_bucket_idx_per_hash + + # get correct mask values depending on precision + if query_key_dots.dtype == paddle.float16: + self_mask_value = self.self_mask_value_float16.astype(paddle.float16) + mask_value = self.mask_value_float16.astype(paddle.float16) + else: + self_mask_value = self.self_mask_value_float32 + mask_value = self.mask_value_float32 + + if not do_cached_attention: + mask = self._compute_attn_mask( + query_bucket_idx, + key_value_bucket_idx, + attention_mask, + query_key_dots.shape, + do_standard_self_attention, + ) + + if mask is not None: + query_key_dots = paddle.where(mask.astype(paddle.bool), query_key_dots, mask_value) + + # free memory + del mask + + # Self mask is ALWAYS applied. + # From the reformer paper (https://arxiv.org/pdf/2001.04451.pdf): + # " While attention to the future is not allowed, typical implementations of the + # Transformer do allow a position to attend to itself. + # Such behavior is undesirable in a shared-QK formulation because the dot-product + # of a query vector with itself will almost always be greater than the dot product of a + # query vector with a vector at another position. We therefore modify the masking + # to forbid a token from attending to itself, except in situations + # where a token has no other valid attention targets (e.g. 
the first token in a sequence) " + self_mask = paddle.not_equal( + query_bucket_idx.unsqueeze(-1).astype("int64"), key_value_bucket_idx.unsqueeze(-2).astype("int64") + ) + + # apply self_mask + query_key_dots = paddle.where(self_mask, query_key_dots, self_mask_value) + + # free memory + del self_mask + + logits = _logsumexp(query_key_dots, axis=-1, keepdim=True) + # dots shape is `[batch_size, num_attn_heads, num_hashes * seq_len // chunk_length, chunk_length, chunk_length * (1 + num_chunks_before + num_chunks_after)]` + attention_probs = paddle.exp(query_key_dots - logits) + + # free memory + del query_key_dots + + # dropout + attention_probs = F.dropout(attention_probs, p=self.dropout, training=self.training) + + # attend values + out_vectors = paddle.matmul(attention_probs, value_vectors) + + # free memory + del value_vectors + + # merge chunk length + if out_vectors.ndim > 4: + + logits = logits.flatten(start_axis=2, stop_axis=3).squeeze(-1) + out_vectors = out_vectors.flatten(start_axis=2, stop_axis=3) + + return out_vectors, logits, attention_probs + + def _compute_attn_mask( + self, + query_indices, + key_indices, + attention_mask, + query_key_dot_shape, + do_standard_self_attention, + ): + + # attention mask for LSH + if attention_mask is not None: + # if chunked attention, the attention mask has to correspond to LSH order + attention_mask = attention_mask.astype(paddle.int64).unsqueeze(1) + if not do_standard_self_attention: + # expand attn_mask to fit with key_value_bucket_idx shape + attention_mask = attention_mask.unsqueeze(1) + attention_mask = attention_mask.expand(shape=query_indices.shape[:-1] + [attention_mask.shape[-1]]) + + attention_mask = attention_mask.reshape([-1, attention_mask.shape[-1]]) + new_key_indices = key_indices.reshape([-1, key_indices.shape[-1]]) + attention_mask = paddle.index_sample(attention_mask, new_key_indices).reshape(key_indices.shape) + + attention_mask = attention_mask.unsqueeze(-2).expand(shape=query_key_dot_shape) + + # Causal mask + if self.is_decoder is True: + + causal_mask = paddle.greater_equal(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)).astype( + paddle.int64 + ) + + # add attention mask if not None + if attention_mask is not None: + attention_mask = causal_mask * attention_mask + else: + attention_mask = causal_mask + + return attention_mask + + def _get_relevant_hid_states_and_buckets( + self, + query_vectors, + attention_mask, + num_hashes, + hidden_states, + past_states, + past_buckets, + ): + # concat hidden states + hidden_states = paddle.concat([past_states, hidden_states], axis=1) + + # batch_size hidden + batch_size = hidden_states.shape[0] + sequence_length = hidden_states.shape[1] + + # check if cached buckets include pad bucket + max_bucket = self.num_buckets if isinstance(self.num_buckets, int) else reduce(mul, self.num_buckets) + + # if pad bucket was cached => need to increase num buckets for caching + increase_num_buckets = past_buckets.max() > num_hashes * max_bucket - 1 + + # retrieve query buckets + query_buckets = self._hash_vectors( + query_vectors, + num_hashes, + attention_mask, + increase_num_buckets=increase_num_buckets, + ) + + # concat buckets + concat_buckets = paddle.concat([past_buckets, query_buckets.unsqueeze(-1)], axis=-1) + + # hash-based sort + bucket_idx = paddle.argsort(concat_buckets, axis=-1) + + # bucket_idx has shape: BatchSize x NumAttnHeads x NumHashes x SequenceLength + assert bucket_idx.shape == [ + batch_size, + self.num_attention_heads, + num_hashes, + sequence_length, + ], 
f"bucket_idx should have shape {(batch_size, self.num_attention_heads, num_hashes, sequence_length)}, but has shape {bucket_idx.shape}." + + # find indices of new bucket indices + relevant_bucket_idx = (bucket_idx == (bucket_idx.shape[-1] - 1)).nonzero() + + # expand relevant bucket indices to its chunks + relevant_bucket_idx_chunk = self._expand_to_indices_in_relevant_chunk(relevant_bucket_idx, sequence_length) + + relevant_bucket_idx_chunk = bucket_idx.gather_nd(relevant_bucket_idx_chunk) + + # adapt bucket_idx for batch and hidden states for index select + bucket_idx_batch_offset = sequence_length * ( + batch_size * paddle.arange(relevant_bucket_idx_chunk.shape[-1]) // relevant_bucket_idx_chunk.shape[-1] + ) + + # add batch offset + relevant_bucket_idx_chunk_all_batch = relevant_bucket_idx_chunk + bucket_idx_batch_offset + hidden_states = hidden_states.reshape(shape=[-1, self.hidden_size]) + + # select all relevant hidden states + relevant_hidden_states = hidden_states.index_select(relevant_bucket_idx_chunk_all_batch, axis=0) + + # reshape hidden states and bucket_idx to correct output + relevant_hidden_states = relevant_hidden_states.reshape( + shape=[batch_size, self.num_attention_heads, -1, self.hidden_size] + ) + relevant_bucket_idx_chunk = relevant_bucket_idx_chunk.reshape( + shape=[batch_size, self.num_attention_heads, num_hashes, -1] + ) + + assert ( + relevant_hidden_states.shape[2] + == (self.num_chunks_before + self.num_chunks_after + 1) * self.chunk_length * num_hashes + ), f"There should be {(self.num_chunks_before + self.num_chunks_after + 1) * self.chunk_length * num_hashes} `hidden_states`, there are {relevant_hidden_states.shape[2]} `hidden_states`." + + assert ( + relevant_bucket_idx_chunk.shape[-1] + == (self.num_chunks_before + self.num_chunks_after + 1) * self.chunk_length + ), f"There should be {(self.num_chunks_before + self.num_chunks_after + 1) * self.chunk_length} `hidden_states`, there are {relevant_bucket_idx_chunk.shape[-1]} `bucket_idx`." 
+ + return relevant_hidden_states, relevant_bucket_idx_chunk, query_buckets + + def _expand_to_indices_in_relevant_chunk(self, indices, sequence_length): + # get relevant indices of where chunk starts and its size + start_indices_chunk = ((indices[:, -1] // self.chunk_length) - self.num_chunks_before) * self.chunk_length + total_chunk_size = self.chunk_length * (1 + self.num_chunks_before + self.num_chunks_after) + + # expand start indices and add correct chunk offset via arange + expanded_start_indices = start_indices_chunk.unsqueeze(-1).expand(shape=[indices.shape[0], total_chunk_size]) + chunk_sequence_indices = expanded_start_indices + paddle.arange(total_chunk_size).unsqueeze(0).expand( + shape=[indices.shape[0], total_chunk_size] + ) + + # make sure that circular logic holds via % seq len + chunk_sequence_indices = chunk_sequence_indices.flatten() % sequence_length + + # expand indices and set indices correctly + indices = ( + indices.unsqueeze(1) + .expand(shape=(indices.shape[0], total_chunk_size, indices.shape[-1])) + .flatten(0, 1) + .clone() + ) + indices[:, -1] = chunk_sequence_indices + + return indices + + def _len_and_dim_norm(self, vectors): + """ + length and attention head size dim normalization + """ + vectors = self._len_norm(vectors) + vectors = vectors * paddle.rsqrt(paddle.to_tensor(self.attention_head_size, dtype=vectors.dtype)) + return vectors + + def _len_norm(self, x, epsilon=1e-6): + """ + length normalization + """ + variance = paddle.mean(x**2, axis=-1, keepdim=True) + norm_x = x * paddle.rsqrt(variance + epsilon) + return norm_x + + def _gather_by_expansion(self, vectors, idxs, num_hashes): + """ + expand dims of idxs and vectors for all hashes and gather + """ + expanded_idxs = paddle.tile(idxs.unsqueeze(-2), repeat_times=[1, 1, self.attention_head_size, 1]).reshape( + shape=[-1, idxs.shape[2]] + ) + vectors = ( + paddle.tile(vectors, repeat_times=[1, 1, num_hashes, 1]) + .transpose(perm=[0, 1, 3, 2]) + .reshape(shape=[-1, idxs.shape[2]]) + ) + + return ( + paddle.index_sample(vectors, expanded_idxs) + .reshape(shape=[idxs.shape[0], idxs.shape[1], self.attention_head_size, -1]) + .transpose(perm=[0, 1, 3, 2]) + ) + + +class LocalSelfAttention(nn.Layer, EfficientAttentionMixin): + def __init__(self, config: ReformerConfig): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.chunk_length = config.local_attn_chunk_length + self.num_chunks_before = config.local_num_chunks_before + self.num_chunks_after = config.local_num_chunks_after + self.is_decoder = config.is_decoder + self.pad_token_id = config.pad_token_id + + self.attention_head_size = config.attention_head_size + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.hidden_size = config.hidden_size + + # projection matrices + self.query = nn.Linear(self.hidden_size, self.all_head_size, bias_attr=False) + self.key = nn.Linear(self.hidden_size, self.all_head_size, bias_attr=False) + self.value = nn.Linear(self.hidden_size, self.all_head_size, bias_attr=False) + + self.dropout = config.local_attention_probs_dropout_prob + + # save mask value here + self.register_buffer("mask_value_float16", paddle.to_tensor(-1e4)) + self.register_buffer("mask_value_float32", paddle.to_tensor(-1e9)) + + def forward( + self, + hidden_states, + attention_mask=None, + cache=None, + use_cache=False, + output_attentions=False, + **kwargs, + ): + sequence_length = hidden_states.shape[1] + batch_size = hidden_states.shape[0] + + # check if cache shall be used and that 
hidden states are already cached
+        if use_cache and cache[1] is not None:
+            assert (
+                cache[0] is None
+            ), "LocalSelfAttention should not make use of `buckets`. There seems to be an error when caching hidden_states_and_buckets."
+            key_value_hidden_states = self._retrieve_relevant_hidden_states(
+                cache[1], self.chunk_length, self.num_chunks_before
+            )
+            key_value_hidden_states = paddle.concat([key_value_hidden_states, hidden_states], axis=1)
+
+            # only query vector for last token
+            query_vectors = self.query(hidden_states)
+            # compute key and value for relevant chunk
+            key_vectors = self.key(key_value_hidden_states)
+            value_vectors = self.value(key_value_hidden_states)
+
+            # free memory
+            del key_value_hidden_states
+        else:
+            # project hidden_states to query, key and value
+            query_vectors = self.query(hidden_states)
+            key_vectors = self.key(hidden_states)
+            value_vectors = self.value(hidden_states)
+
+        # split last dim into `num_attention_heads` and `attention_head_size`
+        query_vectors = self._split_hidden_size_dim(query_vectors, self.num_attention_heads, self.attention_head_size)
+        key_vectors = self._split_hidden_size_dim(key_vectors, self.num_attention_heads, self.attention_head_size)
+        value_vectors = self._split_hidden_size_dim(value_vectors, self.num_attention_heads, self.attention_head_size)
+
+        assert (
+            query_vectors.shape[-1] == self.attention_head_size
+        ), f"last dim of query_vectors is {query_vectors.shape[-1]} but should be {self.attention_head_size}."
+        assert (
+            key_vectors.shape[-1] == self.attention_head_size
+        ), f"last dim of key_vectors is {key_vectors.shape[-1]} but should be {self.attention_head_size}."
+        assert (
+            value_vectors.shape[-1] == self.attention_head_size
+        ), f"last dim of value_vectors is {value_vectors.shape[-1]} but should be {self.attention_head_size}."
+
+        if self.chunk_length is None:
+            assert (
+                self.num_chunks_before == 0 and self.num_chunks_after == 0
+            ), "If `chunk_length` is `None`, make sure `num_chunks_after` and `num_chunks_before` are set to 0."
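+
+        # Chunked local attention below: the sequence is split into chunks of `chunk_length`;
+        # every chunk attends to itself plus `num_chunks_before` preceding and
+        # `num_chunks_after` following chunks, so memory grows with
+        # seq_len * chunk_length * (1 + num_chunks_before + num_chunks_after)
+        # instead of seq_len * seq_len.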
+ + # normalize key vectors + key_vectors = key_vectors / paddle.sqrt(paddle.to_tensor(self.attention_head_size, dtype=key_vectors.dtype)) + + # get sequence length indices + indices = paddle.tile( + paddle.arange(sequence_length), + repeat_times=[batch_size, self.num_attention_heads, 1], + ) + + # if one should do normal n^2 self-attention + do_standard_self_attention = sequence_length <= self.chunk_length + + # if input should be chunked + if not do_standard_self_attention: + # chunk vectors + + # B x Num_Attn_Head x Seq_Len // chunk_len x chunk_len x attn_head_size + + query_vectors = self._split_seq_length_dim_to( + query_vectors, + -1, + self.chunk_length, + self.num_attention_heads, + self.attention_head_size, + ) + + key_vectors = self._split_seq_length_dim_to( + key_vectors, + -1, + self.chunk_length, + self.num_attention_heads, + self.attention_head_size, + ) + value_vectors = self._split_seq_length_dim_to( + value_vectors, + -1, + self.chunk_length, + self.num_attention_heads, + self.attention_head_size, + ) + + query_indices = self._split_seq_length_dim_to(indices, -1, self.chunk_length, self.num_attention_heads) + key_indices = self._split_seq_length_dim_to(indices, -1, self.chunk_length, self.num_attention_heads) + + # append chunks before and after + key_vectors = self._look_adjacent(key_vectors, self.num_chunks_before, self.num_chunks_after) + value_vectors = self._look_adjacent(value_vectors, self.num_chunks_before, self.num_chunks_after) + key_indices = self._look_adjacent(key_indices, self.num_chunks_before, self.num_chunks_after) + else: + query_indices = key_indices = indices + + # query-key matmul: QK^T + query_key_dots = paddle.matmul(query_vectors, key_vectors, transpose_y=True) + + # free memory + del query_vectors, key_vectors + + mask = self._compute_attn_mask( + query_indices, + key_indices, + attention_mask, + query_key_dots.shape, + do_standard_self_attention, + ) + + if mask is not None: + # get mask tensor depending on half precision or not + if query_key_dots.dtype == paddle.float16: + mask_value = self.mask_value_float16.astype(paddle.float16) + else: + mask_value = self.mask_value_float32 + + query_key_dots = paddle.where(mask.astype(paddle.bool), query_key_dots, mask_value) + + # free memory + del mask + + # softmax + logits = _logsumexp(query_key_dots, axis=-1, keepdim=True) + attention_probs = paddle.exp(query_key_dots - logits) + + # free memory + del logits + + # dropout + attention_probs = F.dropout(attention_probs, p=self.dropout, training=self.training) + + # attend values + out_vectors = paddle.matmul(attention_probs, value_vectors) + + # free memory + del value_vectors + + # merge chunk length + if not do_standard_self_attention: + out_vectors = out_vectors.flatten(start_axis=2, stop_axis=3) + + assert out_vectors.shape == [ + batch_size, + self.num_attention_heads, + sequence_length, + self.attention_head_size, + ] + out_vectors = self._merge_hidden_size_dims(out_vectors, self.num_attention_heads, self.attention_head_size) + + if output_attentions is False: + attention_probs = () + + return LocalSelfAttentionOutput(hidden_states=out_vectors, attention_probs=attention_probs) + + def _compute_attn_mask( + self, + query_indices, + key_indices, + attention_mask, + query_key_dots_shape, + do_standard_self_attention, + ): + + # chunk attention mask and look before and after + + if attention_mask is not None: + + attention_mask = attention_mask.astype(paddle.int64).unsqueeze(1) + + if not do_standard_self_attention: + attention_mask = 
self._split_seq_length_dim_to(attention_mask, -1, self.chunk_length, 1) + attention_mask = self._look_adjacent(attention_mask, self.num_chunks_before, self.num_chunks_after) + # create attn_mask + + attention_mask = attention_mask.unsqueeze(-2).expand(shape=query_key_dots_shape) + + # Causal mask + if self.is_decoder is True: + causal_mask = paddle.greater_equal(query_indices.unsqueeze(-1), key_indices.unsqueeze(-2)).astype( + paddle.int64 + ) + + # add attention mask if not None + if attention_mask is not None: + attention_mask = causal_mask * attention_mask + else: + attention_mask = causal_mask + + return attention_mask + + @staticmethod + def _retrieve_relevant_hidden_states(previous_hidden_states, chunk_length, num_chunks_before): + start_position = ((previous_hidden_states.shape[1] // chunk_length) - num_chunks_before) * chunk_length + return previous_hidden_states[:, start_position:] + + +class ReformerSelfOutput(nn.Layer): + def __init__(self, config: ReformerConfig): + super().__init__() + all_head_size = config.num_attention_heads * config.attention_head_size + self.dropout = config.hidden_dropout_prob + + self.dense = nn.Linear(all_head_size, config.hidden_size, bias_attr=False) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + return hidden_states + + +class ReformerAttention(nn.Layer): + def __init__(self, config: ReformerConfig, layer_id=0): + super().__init__() + self.layer_id = layer_id + self.attn_layers = config.attn_layers + + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + if len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "lsh": + self.self_attention = LSHSelfAttention(config) + elif len(set(self.attn_layers)) == 1 and self.attn_layers[0] == "local": + self.self_attention = LocalSelfAttention(config) + + elif len(set(self.attn_layers)) == 2 and set(self.attn_layers) == set(["lsh", "local"]): + # get correct attn layers + if self.attn_layers[self.layer_id] == "lsh": + self.self_attention = LSHSelfAttention(config) + else: + self.self_attention = LocalSelfAttention(config) + else: + raise NotImplementedError( + f"Only attn layer types 'lsh' and 'local' exist, but got `attn_layers`: {self.attn_layers}. " + "Select attn layer types from ['lsh', 'local'] only." 
+ ) + self.output = ReformerSelfOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + num_hashes=None, + cache=None, + use_cache=False, + orig_sequence_length=None, + output_attentions=False, + buckets=None, + ): + hidden_states = self.layer_norm(hidden_states) + + # make sure cached hidden states is set to None for backward pass + if cache is not None: + cache_layer = cache[self.layer_id] + else: + cache_layer = None + + # use cached buckets for backprob if buckets not None for LSHSelfAttention + self_attention_outputs = self.self_attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + num_hashes=num_hashes, + cache=cache_layer, + use_cache=use_cache, + output_attentions=output_attentions, + buckets=buckets, + ) + + # add buckets if necessary + if hasattr(self_attention_outputs, "buckets"): + buckets = self_attention_outputs.buckets + else: + buckets = None + + # cache hidden states for future use + if use_cache: + if cache[self.layer_id][0] is None: + # padded input should not be cached + past_buckets = ( + buckets[:, :, :, :orig_sequence_length] + if (buckets is not None and orig_sequence_length > 1) + else buckets + ) + else: + past_buckets = paddle.concat([cache[self.layer_id][0], buckets], axis=-1) + + if cache[self.layer_id][1] is None: + # padded input should not be cached + past_states = hidden_states[:, :orig_sequence_length] + else: + past_states = paddle.concat([cache[self.layer_id][1], hidden_states], axis=1) + + cache[self.layer_id] = (past_buckets, past_states) + # compute attention feed forward output + attention_output = self.output(self_attention_outputs.hidden_states) + + return AttentionOutput( + hidden_states=attention_output, + attention_probs=self_attention_outputs.attention_probs, + buckets=buckets, + ) + + +class ReformerFeedForwardDense(nn.Layer): + def __init__(self, config: ReformerConfig): + super().__init__() + self.dropout = config.hidden_dropout_prob + + if isinstance(config.hidden_act, str): + self.act_fn = ACT2FN[config.hidden_act] + else: + self.act_fn = config.hidden_act + + self.dense = nn.Linear(config.hidden_size, config.feed_forward_size) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.act_fn(hidden_states) + return hidden_states + + +class ReformerFeedForwardOutput(nn.Layer): + def __init__(self, config: ReformerConfig): + super().__init__() + self.dropout = config.hidden_dropout_prob + + self.dense = nn.Linear(config.feed_forward_size, config.hidden_size) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + return hidden_states + + +class ChunkReformerFeedForward(nn.Layer): + def __init__(self, config: ReformerConfig): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dense = ReformerFeedForwardDense(config) + self.output = ReformerFeedForwardOutput(config) + + def forward(self, attention_output): + return _apply_chunking_to_forward( + self.forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + + def forward_chunk(self, hidden_states): + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dense(hidden_states) + return self.output(hidden_states) + + 
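+# Reversible (RevNet-style) coupling used by ReformerLayer below:
+#   Y_1 = X_1 + Attention(X_2)     (see `forward`)
+#   Y_2 = X_2 + FeedForward(Y_1)
+# During `backward_pass` the inputs are reconstructed as X_2 = Y_2 - FeedForward(Y_1) and
+# X_1 = Y_1 - Attention(X_2), re-seeding dropout with the saved seeds, so intermediate
+# activations do not have to be stored for every layer.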
+class ReformerLayer(nn.Layer): + def __init__(self, config: ReformerConfig, layer_id=0): + super().__init__() + self.attention = ReformerAttention(config, layer_id) + # dropout requires to have the same + # seed for forward and backward pass + self.attention_seed = None + self.feed_forward_seed = None + + self.feed_forward = ChunkReformerFeedForward(config) + + def _init_attention_seed(self): + """ + This function sets a new seed for the attention layer to make dropout + deterministic for both forward calls: 1 normal forward call and 1 forward + call in backward to recalculate activations. + """ + + # randomize seeds + # use cuda generator if available + if paddle.get_device() != "cpu": + # GPU + device_idx = int(paddle.get_device().split(":")[1]) + sts = paddle.get_cuda_rng_state() + self.attention_seed = sts[device_idx].current_seed() + else: + # CPU + self.attention_seed = np.random.randint(0, sys.maxsize, size=(1,), dtype="int64").item() + + paddle.seed(self.attention_seed) + + def _init_feed_forward_seed(self): + """ + This function sets a new seed for the feed forward layer to make dropout deterministic for both forward calls: + 1 normal forward call and 1 forward call in backward to recalculate activations. + """ + # randomize seeds + # use cuda generator if available + if paddle.get_device() != "cpu": + # GPU + device_idx = int(paddle.get_device().split(":")[1]) + sts = paddle.get_cuda_rng_state() + self.feed_forward_seed = sts[device_idx].current_seed() + else: + # CPU + self.feed_forward_seed = np.random.randint(0, sys.maxsize, size=(1,), dtype="int64").item() + + paddle.seed(self.feed_forward_seed) + + def forward( + self, + prev_attn_output, + hidden_states, + attention_mask=None, + num_hashes=None, + cache=None, + use_cache=False, + orig_sequence_length=None, + output_attentions=False, + ): + with paddle.no_grad(): + # every forward pass we sample a different seed + # for dropout and save for forward fn in backward pass + # to have correct dropout + if self.training: + self._init_attention_seed() + + attn_outputs = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + num_hashes=num_hashes, + cache=cache, + use_cache=use_cache, + orig_sequence_length=orig_sequence_length, + output_attentions=output_attentions, + ) + attn_output = attn_outputs.hidden_states + + # Implementation of RevNet (see Fig. 6 in https://towardsdatascience.com/illustrating-the-reformer-393575ac6ba0) + # Y_1 = X_1 + f(X_2) + attn_output = prev_attn_output + attn_output + + # free memory + del prev_attn_output + + # every forward pass we sample a different seed + # for dropout and save seed for forward fn in backward + # to have correct dropout + if self.training: + self._init_feed_forward_seed() + # Y_2 = X_2 + g(Y_1) + hidden_states = hidden_states + self.feed_forward(attn_output) + + return ReformerOutput( + attn_output=attn_output, + hidden_states=hidden_states, + attention_probs=attn_outputs.attention_probs, + buckets=attn_outputs.buckets, + ) + + def backward_pass( + self, + next_attn_output, + hidden_states, + grad_attn_output, + grad_hidden_states, + attention_mask=None, + buckets=None, + ): + # Implements the backward pass for reversible ResNets. + # A good blog post on how this works can be found here: + # Implementation of RevNet (see Fig. 
6 in https://towardsdatascience.com/illustrating-the-reformer-393575ac6ba0) + # This code is heavily inspired by https://github.com/lucidrains/reformer-pytorch/blob/master/reformer_pytorch/reversible.py + + assert ( + self.training + ), "If you want to train `ReformerModel` and its variations, make sure to use `model.train()` to put the model into training mode." + + with paddle.set_grad_enabled(True): + next_attn_output.stop_gradient = False + # set seed to have correct dropout + paddle.seed(self.feed_forward_seed) + # g(Y_1) + res_hidden_states = self.feed_forward(next_attn_output) + res_hidden_states.backward(grad_hidden_states, retain_graph=True) + + with paddle.no_grad(): + # X_2 = Y_2 - g(Y_1) + hidden_states = hidden_states - res_hidden_states + del res_hidden_states + grad_attn_output = grad_attn_output + next_attn_output.grad + + next_attn_output.stop_gradient = True + + with paddle.set_grad_enabled(True): + hidden_states.stop_gradient = False + + # set seed to have correct dropout + paddle.seed(self.attention_seed) + # f(X_2) + # use cached buckets for backprob if buckets not None for LSHSelfAttention + output = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + buckets=buckets, + ).hidden_states + output.backward(grad_attn_output, retain_graph=True) + + with paddle.no_grad(): + # X_1 = Y_1 - f(X_2) + attn_output = next_attn_output - output + del output, next_attn_output + + grad_hidden_states = grad_hidden_states + hidden_states.grad + hidden_states.stop_gradient = True + hidden_states = hidden_states.detach() + + return ReformerBackwardOutput( + attn_output=attn_output, + hidden_states=hidden_states, + grad_attn_output=grad_attn_output, + grad_hidden_states=grad_hidden_states, + ) + + +class ReformerEncoder(nn.Layer): + def __init__(self, config: ReformerConfig): + super().__init__() + self.dropout = config.hidden_dropout_prob + + self.layers = nn.LayerList([ReformerLayer(config, i) for i in range(config.num_hidden_layers)]) + # Reformer is using Rev Nets, thus last layer outputs are concatenated and + # Layer Norm is done over 2 * hidden_size + self.layer_norm = nn.LayerNorm(2 * config.hidden_size, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states, + attention_mask=None, + num_hashes=None, + cache=None, + use_cache=False, + orig_sequence_length=None, + output_hidden_states=False, + output_attentions=False, + ): + # hidden_states and attention lists to be filled if wished + all_hidden_states = [] + all_attentions = [] + + # init cached hidden states if necessary + if cache is None: + cache = [((None), (None)) for i in range(len(self.layers))] + + # concat same tensor for reversible ResNet + hidden_states = paddle.concat([hidden_states, hidden_states], axis=-1) + hidden_states = _ReversibleFunction.apply( + hidden_states, + self.layers, + attention_mask, + num_hashes, + all_hidden_states, + all_attentions, + cache, + use_cache, + orig_sequence_length, + output_hidden_states, + output_attentions, + ) + + # Apply layer norm to concatenated hidden states + hidden_states = self.layer_norm(hidden_states) + + # Apply dropout + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + return ReformerEncoderOutput( + hidden_states=hidden_states, + all_hidden_states=all_hidden_states, + all_attentions=all_attentions, + cache=cache, + ) + + +class ReformerOnlyLMHead(nn.Layer): + def __init__(self, config: ReformerConfig): + super().__init__() + # Reformer is using Rev Nets, thus last layer outputs are 
concatenated and + # Layer Norm is done over 2 * hidden_size + self.seq_len_dim = 1 + self.chunk_size_lm_head = config.chunk_size_lm_head + self.decoder = nn.Linear(2 * config.hidden_size, config.vocab_size, bias_attr=False) + self.bias = self.create_parameter(shape=(config.vocab_size,), is_bias=True) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + return _apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) + + def forward_chunk(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class ReformerClassificationHead(nn.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(2 * config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_classes) + + def forward(self, hidden_states): + hidden_states = hidden_states[:, 0] # take token (equiv. to [CLS]) + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = F.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +@dataclass +class LSHSelfAttentionOutput(ModelOutput): + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attention_probs: Optional[Tuple[paddle.Tensor]] = None + buckets: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class LocalSelfAttentionOutput(ModelOutput): + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attention_probs: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class AttentionOutput(ModelOutput): + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attention_probs: Optional[Tuple[paddle.Tensor]] = None + buckets: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ReformerOutput(ModelOutput): + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attn_output: Optional[Tuple[paddle.Tensor]] = None + attention_probs: Optional[Tuple[paddle.Tensor]] = None + buckets: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ReformerBackwardOutput(ModelOutput): + attn_output: Optional[Tuple[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + grad_attn_output: Optional[Tuple[paddle.Tensor]] = None + grad_hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ReformerEncoderOutput(ModelOutput): + hidden_states: Optional[Tuple[paddle.Tensor]] = None + all_hidden_states: Optional[Tuple[paddle.Tensor]] = None + all_attentions: Optional[Tuple[paddle.Tensor]] = None + cache: Optional[Tuple[paddle.Tensor]] = None + + +class ReformerPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained Reformer models. It provides Reformer related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. 
+    """
+
+    base_model_prefix = "reformer"
+    config_class = ReformerConfig
+
+    pretrained_init_configuration = REFORMER_PRETRAINED_INIT_CONFIGURATION
+    pretrained_resource_files_map = REFORMER_PRETRAINED_RESOURCE_FILES_MAP
+
+    def _init_weights(self, layer):
+        """Initialize the weights"""
+        if isinstance(layer, AxialPositionEmbeddings):
+            for weight in layer.weights:
+                weight.set_value(
+                    paddle.tensor.normal(
+                        mean=0.0,
+                        std=self.config.axial_norm_std,
+                        shape=weight.shape,
+                    )
+                )
+
+        elif isinstance(layer, nn.Embedding):
+            layer.weight.set_value(
+                paddle.tensor.normal(
+                    mean=0.0,
+                    std=self.config.initializer_range,
+                    shape=layer.weight.shape,
+                )
+            )
+
+        elif isinstance(layer, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            layer.weight.set_value(
+                paddle.tensor.normal(
+                    mean=0.0,
+                    std=self.config.axial_norm_std,
+                    shape=layer.weight.shape,
+                )
+            )
+
+            if layer.bias is not None:
+                layer.bias.set_value(paddle.zeros_like(layer.bias))
+
+        elif isinstance(layer, nn.LayerNorm):
+            layer.bias.set_value(paddle.zeros_like(layer.bias))
+            layer.weight.set_value(paddle.full_like(layer.weight, 1.0))
+
+
+@register_base_model
+class ReformerModel(ReformerPretrainedModel):
+    """
+    The bare Reformer Model transformer outputting raw hidden-states without any specific head on top.
+
+    This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`.
+    Refer to the superclass documentation for the generic methods.
+
+    This model is also a Paddle `paddle.nn.Layer` subclass. Use it as a regular Paddle Layer
+    and refer to the Paddle documentation for all matters related to general usage and behavior.
+
+    Args:
+        tie_word_embeddings (bool, optional):
+            Whether to tie input and output embeddings. Defaults to `False`.
+        is_decoder (bool, optional):
+            Whether or not to use a causal mask in addition to the `attention_mask` passed to `ReformerModel`. When using the Reformer for causal language modeling, this argument should be set to `True`. Defaults to `True`.
+        chunk_size_feed_forward (int, optional):
+            The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means
+            that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes
+            `n` < sequence_length embeddings at a time. Defaults to `0`.
+        pad_token_id (int, optional):
+            The id of the `padding` token. Defaults to `0`.
+        hash_seed (int, optional):
+            Seed that can be used to make locality sensitive hashing in `LSHSelfAttention` deterministic. This should
+            only be set for testing purposes. For evaluation and training purposes `hash_seed` should be left as
+            `None` to ensure fully random rotations in the locality sensitive hashing scheme. Defaults to `None`.
+        vocab_size (int, optional):
+            Vocabulary size of `inputs_ids` in `ReformerModel`. It is also the vocab size of the token embedding matrix.
+            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `ReformerModel`. Defaults to `258`.
+        attention_head_size (int, optional):
+            Dimensionality of the projected key, query and value vectors. Defaults to `128`.
+        hidden_size (int, optional):
+            Dimensionality of the embedding layer and encoder layers. Defaults to `1024`.
+        num_attention_heads (int, optional):
+            Number of attention heads for each attention layer in the Transformer encoder.
+            Defaults to `8`.
+        num_hashes (int, optional):
+            Number of hashing rounds (e.g., number of random rotations) in the Locality Sensitive Hashing scheme. The higher `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time intensive the hashing becomes. Defaults to `4`.
+        num_hidden_layers (int, optional):
+            Number of hidden layers in the Transformer encoder. Defaults to `12`.
+        num_buckets (int or List[int], optional):
+            Number of buckets the key query vectors can be "hashed into" using the locality sensitive hashing scheme.
+            Each query key vector is hashed into a hash in `1, ..., num_buckets`. The number of buckets can also be factorized into a list for improved memory complexity. In this case, each query key vector is hashed into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if `num_buckets` is factorized into two factors. The number of buckets (or the product of the factors) should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is not set, a good value is calculated on the fly. Defaults to `512`.
+        lsh_attn_chunk_length (int, optional):
+            Length of chunk which attends to itself in `LSHSelfAttention`. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention). Defaults to `256`.
+        local_attn_chunk_length (int, optional):
+            Length of chunk which attends to itself in `LocalSelfAttention`. Chunking reduces memory complexity from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk length (chunked self attention). Defaults to `128`.
+        lsh_num_chunks_after (int, optional):
+            Number of following neighbouring chunks each chunk attends to in the `LSHSelfAttention` layer. Defaults to `0`.
+        lsh_num_chunks_before (int, optional):
+            Number of previous neighbouring chunks each chunk attends to in the `LSHSelfAttention` layer. Defaults to `1`.
+        local_num_chunks_after (int, optional):
+            Number of following neighbouring chunks each chunk attends to in the `LocalSelfAttention` layer. Defaults to `0`.
+        local_num_chunks_before (int, optional):
+            Number of previous neighbouring chunks each chunk attends to in the `LocalSelfAttention` layer. Defaults to `1`.
+        hidden_act (str, optional):
+            The non-linear activation function (function or string) in the feed forward layer in the residual attention block. If string, `"gelu"`, `"relu"`, `"tanh"`, `"mish"` and `"gelu_new"` are supported. Defaults to `"relu"`.
+        feed_forward_size (int, optional):
+            Dimensionality of the feed_forward layer in the residual attention block. Defaults to `4096`.
+        hidden_dropout_prob (float, optional):
+            The dropout ratio for all fully connected layers in the embeddings and encoder. Defaults to `0.2`.
+        lsh_attention_probs_dropout_prob (float, optional):
+            The dropout ratio for the attention probabilities in `LSHSelfAttention`. Defaults to `0.1`.
+        local_attention_probs_dropout_prob (float, optional):
+            The dropout ratio for the attention probabilities in `LocalSelfAttention`. Defaults to `0.2`.
+        max_position_embeddings (int, optional):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). Defaults to `65536`.
+        initializer_range (float, optional):
+            The standard deviation of the normal initializer. Defaults to `0.02`.
+
+        .. note::
+            A normal_initializer initializes weight matrices as normal distributions.
+ See :meth:`ReformerPretrainedModel._init_weights()` for how weights are initialized in `ReformerModel`. + + layer_norm_eps (float, optional): + The epsilon used by the layer normalization layers. Defaults to `1e-12`. + + axial_pos_embds (bool, optional): + Whether or not to use axial position embeddings. Defaults to `True`. + axial_pos_shape (List[int], optional): + The position dims of the axial position encodings. During training, the product of the position dims has to be equal to the sequence length. Defaults to `[128, 512]`. + axial_pos_embds_dim (List[int], optional): + The embedding dims of the axial position encodings. The sum of the embedding dims has to be equal to the + hidden size. Defaults to `[256, 768]`. + axial_norm_std (float, optional): + The standard deviation of the normal_initializer for initializing the weight matrices of the axial + positional encodings. Defaults to `1.0`. + chunk_size_lm_head (int, optional): + The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes n < + sequence_length embeddings at a time. Defaults to `0`. + attn_layers (List[str], optional): + List of attention layer types in ascending order. It can be chosen between a LSHSelfAttention layer + (`"lsh"`) and a LocalSelfAttention layer (`"local"`). Defaults to `["local", "local", "lsh", "local", "local", "local", "lsh", "local", "local", "local", "lsh", "local"]`. + + """ + + def __init__(self, config: ReformerConfig): + super().__init__(config) + assert ( + self.config.num_hidden_layers > 0 + ), "`config.attn_layers` is empty. Select at least one attn layer form ['lsh', 'local']" + + self.embeddings = ReformerEmbeddings(config) + self.encoder = ReformerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + num_hashes: Optional[int] = None, + cache: Optional[List[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = False, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The ReformerModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on + to some unwanted positions, usually the paddings or the subsequent positions. + Its data type can be int, float. + When the data type is int, the `masked` tokens have `0` values and the others + have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the + others have `1.0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. 
+ Selected in the range `[0, max_position_embeddings - 1]`. + Shape as [batch_size, num_tokens] and dtype as int64. Defaults to `None`. + num_hashes (int, optional): + The number of hashing rounds that should be performed during bucketing. Setting + this argument overwrites the default defined in `config["num_hashes"]`. + Defaults to `None`. + cache (List[tuple(Tensor, Tensor)], optional): + List of `tuple(Tensor, Tensor)` of length `config["num_hidden_layers"]`, with + the first element being the previous `buckets` of shape `[batch_size, num_heads, num_hashes, sequence_length]` and the second being the previous `hidden_states` of shape + `[batch_size, sequence_length, hidden_size]`. + Contains precomputed hidden-states and buckets (only relevant for LSH Self-Attention). Can + be used to speed up sequential decoding. + Defaults to `None`. + use_cache (bool, optional): + Whether or not to use cache. If set to `True`, `cache` states are returned + and can be used to speed up decoding. + Defaults to `False`. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the output of all hidden layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ReformerModel, ReformerTokenizer + + tokenizer = ReformerTokenizer.from_pretrained('reformer-crime-and-punishment') + model = ReformerModel.from_pretrained('reformer-crime-and-punishment') + model.eval() + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + outputs = model(**inputs) + last_hidden_state = outputs[0] + + """ + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.shape # noqa: F841 + elif inputs_embeds is not None: + input_shape = inputs_embeds.shape[:-1] # noqa: F841 + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + assert ( + len(input_shape) == 2 + ), f"`input_ids` have be of shape `[batch_size, sequence_length]`, but got shape: {input_shape}" + + if cache is not None: + assert not self.training, "`cache` can only be used for inference, not for training`." 
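+
+        # Inputs whose length is not a multiple of the chunk lengths are right-padded below
+        # (inference only; training raises instead). For example, assuming
+        # local_attn_chunk_length = 64 and lsh_attn_chunk_length = 64, a batch of length 100
+        # is padded to 128 for the chunked attention and cut back to 100 afterwards.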
+ + # original sequence length for padding + orig_sequence_length = input_shape[-1] + + # if needs padding + least_common_mult_chunk_length = _get_least_common_mult_chunk_len( + self.config.attn_layers, self.config.lsh_attn_chunk_length, self.config.local_attn_chunk_length + ) + min_chunk_length = _get_min_chunk_len( + self.config.attn_layers, self.config.lsh_attn_chunk_length, self.config.local_attn_chunk_length + ) + + must_pad_to_match_chunk_length = ( + input_shape[-1] % least_common_mult_chunk_length != 0 + and input_shape[-1] > min_chunk_length + and cache is None + ) + + if must_pad_to_match_chunk_length: + padding_length = least_common_mult_chunk_length - input_shape[-1] % least_common_mult_chunk_length + + if self.training is True: + raise ValueError( + f"If training, sequence length {input_shape[-1]} has to be a multiple of least common multiple " + f"chunk_length {least_common_mult_chunk_length}. Please consider padding the input to a length " + f"of {input_shape[-1] + padding_length}." + ) + + # pad input + input_ids, inputs_embeds, attention_mask, position_ids, input_shape = self._pad_to_mult_of_chunk_length( + input_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + input_shape=input_shape, + padding_length=padding_length, + padded_seq_length=least_common_mult_chunk_length, + ) + + # start index for position encoding depends on incremental decoding + if cache is not None: + start_idx_pos_encodings = cache[0][1].shape[1] + else: + start_idx_pos_encodings = 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + start_idx_pos_encodings=start_idx_pos_encodings, + inputs_embeds=inputs_embeds, + ) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=attention_mask, + num_hashes=num_hashes, + cache=cache, + use_cache=use_cache, + orig_sequence_length=orig_sequence_length, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + sequence_output = encoder_outputs.hidden_states + + # if padding was applied + if must_pad_to_match_chunk_length: + sequence_output = sequence_output[:, :orig_sequence_length] + + cache = encoder_outputs.cache if use_cache else None + hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None + attentions = encoder_outputs.all_attentions if output_attentions else None + + if not return_dict: + return tuple( + v + for v in [ + sequence_output, + cache, + hidden_states, + attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + past_key_values=cache, + hidden_states=hidden_states, + attentions=attentions, + ) + + def _pad_to_mult_of_chunk_length( + self, + input_ids, + inputs_embeds=None, + attention_mask=None, + position_ids=None, + input_shape=None, + padding_length=None, + padded_seq_length=None, + ): + logger.info( + f"Input ids are automatically padded from {input_shape[-1]} to {input_shape[-1] + padding_length} to be a " + f"multiple of `config.chunk_length`: {padded_seq_length}" + ) + + padded_input_ids = paddle.full( + (input_shape[0], padding_length), + self.config.pad_token_id, + dtype=paddle.int64, + ) + + # Extend `attention_mask` + if attention_mask is not None: + pad_attention_mask = paddle.zeros(shape=[input_shape[0], padding_length], dtype=attention_mask.dtype) + + attention_mask = paddle.concat([attention_mask, pad_attention_mask], axis=-1) + else: + attention_mask = paddle.concat( + [ + 
paddle.ones(input_shape, dtype=paddle.int64),
+                    paddle.zeros((input_shape[0], padding_length), dtype=paddle.int64),
+                ],
+                axis=-1,
+            )
+
+        # Extend `input_ids` with padding to match least common multiple chunk_length
+        if input_ids is not None:
+            input_ids = paddle.concat([paddle.cast(input_ids, dtype="int64"), padded_input_ids], axis=-1)
+            input_shape = input_ids.shape
+
+            # Pad position ids if given
+            if position_ids is not None:
+                padded_position_ids = paddle.arange(input_shape[-1], padded_seq_length, dtype=paddle.int64)
+                padded_position_ids = position_ids.unsqueeze(0).expand(input_shape[0], padding_length)
+                position_ids = paddle.concat([position_ids, padded_position_ids], axis=-1)
+
+        # Extend `inputs_embeds` with padding to match least common multiple chunk_length
+        if inputs_embeds is not None:
+            padded_inputs_embeds = self.embeddings(padded_input_ids, position_ids)
+            inputs_embeds = paddle.concat([inputs_embeds, padded_inputs_embeds], axis=-2)
+            input_shape = inputs_embeds.shape
+        return input_ids, inputs_embeds, attention_mask, position_ids, input_shape
+
+
+class ReformerModelWithLMHead(ReformerPretrainedModel):
+    """
+    The Reformer Model transformer with a language modeling head on top.
+
+    Args:
+        config (:class:`ReformerConfig`):
+            An instance of :class:`ReformerConfig` used to construct `ReformerModelWithLMHead`.
+
+    """
+
+    def __init__(self, config: ReformerConfig):
+        super().__init__(config)
+        self.reformer = ReformerModel(config)
+        local_num_chunks_after = self.config.local_num_chunks_after
+        lsh_num_chunks_after = self.config.lsh_num_chunks_after
+        assert self.config[
+            "is_decoder"
+        ], "If you want to use `ReformerModelWithLMHead` make sure that `is_decoder=True`."
+        assert (
+            "local" not in self.config.attn_layers or local_num_chunks_after == 0
+        ), f"If causal mask is enabled, make sure that `local_num_chunks_after` is set to 0 and not {local_num_chunks_after}."
+        assert (
+            "lsh" not in self.config.attn_layers or lsh_num_chunks_after == 0
+        ), f"If causal mask is enabled, make sure that `lsh_num_chunks_after` is set to 0 and not {lsh_num_chunks_after}."
+
+        self.lm_head = ReformerOnlyLMHead(config)
+
+    def get_output_embeddings(self):
+        return self.lm_head.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head.decoder = new_embeddings
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        num_hashes: Optional[int] = None,
+        cache: Optional[List[Tuple[Tensor]]] = None,
+        use_cache: Optional[bool] = False,
+        inputs_embeds: Optional[Tensor] = None,
+        labels: Optional[Tensor] = None,
+        output_hidden_states: Optional[Tensor] = None,
+        output_attentions: Optional[Tensor] = None,
+    ):
+        r"""
+
+        Args:
+            input_ids (Tensor):
+                See :class:`ReformerModel`.
+            position_ids (Tensor, optional):
+                See :class:`ReformerModel`.
+            attention_mask (Tensor, optional):
+                See :class:`ReformerModel`.
+            num_hashes (int, optional):
+                See :class:`ReformerModel`.
+            cache (List[tuple(Tensor, Tensor)], optional):
+                See :class:`ReformerModel`.
+            use_cache (bool, optional):
+                See :class:`ReformerModel`.
+            inputs_embeds (Tensor, optional):
+                See :class:`ReformerModel`.
+            labels (Tensor, optional):
+                Labels for language modeling. Note that the labels **are shifted**
+                inside the model, i.e.
you can set `labels = input_ids` Indices are + selected in `[-100, 0, ..., vocab_size]` All labels set to `-100` are + ignored (masked), the loss is only computed for labels in `[0, ..., vocab_size]`. + Shape is [batch_size, sequence_length] and dtype is int64. + output_attentions (bool, optional): + See :class:`ReformerModel`. + output_hidden_states (bool, optional): + See :class:`ReformerModel`. + + Returns: + tuple: Returns tuple `(loss, logits, cache, hidden_states, attentions)`. + + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Language modeling loss (for next-token prediction). + It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Prediction scores of the language modeling head + (scores for each vocabulary token before SoftMax). + It's data type should be float32 and its shape is + [batch_size, sequence_length, vocab_size]. + + - `cache` (List[tuple(Tensor, Tensor)]): + See :class:`ReformerModel`. + + - `hidden_states` (tuple(Tensor)): + See :class:`ReformerModel`. + + - `attentions` (tuple(Tensor)): + See :class:`ReformerModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ReformerModelWithLMHead, ReformerTokenizer + + tokenizer = ReformerTokenizer.from_pretrained('reformer-crime-and-punishment') + model = ReformerModelWithLMHead.from_pretrained('reformer-crime-and-punishment') + model.eval() + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=inputs["input_ids"]) + + loss = output[0] + logits = output[1] + + """ + + reformer_outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + num_hashes=num_hashes, + cache=cache, + use_cache=use_cache, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + + sequence_output = reformer_outputs[0] + logits = self.lm_head(sequence_output) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[:, :-1] + shift_labels = labels[:, 1:] + + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.reshape(shape=[-1, self.config.vocab_size]), + shift_labels.flatten(), + ) + + output = (logits,) + reformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + def prepare_inputs_for_generation(self, input_ids, cache=None, use_cache=None, num_hashes=None, **kwargs): + # only last token for inputs_ids if cache is defined in kwargs + if cache is not None: + input_ids = input_ids[:, -1:] + + inputs_dict = { + "input_ids": input_ids, + "cache": cache, + "use_cache": use_cache, + "num_hashes": num_hashes, + } + + return inputs_dict + + +class ReformerForMaskedLM(ReformerPretrainedModel): + """ + The Reformer Model transformer with a masked language modeling head on top. + + Args: + reformer (:class:`ReformerModel`): + An instance of :class:`ReformerModel`. + + """ + + def __init__(self, config: ReformerConfig): + super().__init__(config) + self.reformer = ReformerModel(config) + assert not self.config[ + "is_decoder" + ], "If you want to use `ReformerForMaskedLM` make sure `is_decoder=False` for bi-directional self-attention." 
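+        # Prediction head that maps the encoder's sequence output to vocabulary logits
+        # (the loss below reshapes logits to [-1, vocab_size]).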
+ self.lm_head = ReformerOnlyLMHead(config) + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + num_hashes: Optional[int] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ReformerModel`. + position_ids (Tensor, optional): + See :class:`ReformerModel`. + attention_mask (Tensor, optional): + See :class:`ReformerModel`. + num_hashes (int, optional): + See :class:`ReformerModel`. + inputs_embeds (Tensor, optional): + See :class:`ReformerModel`. + labels (Tensor, optional): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., vocab_size]`` + (see ``input_ids`` docstring) Tokens with indices set + to ``-100`` are ignored(masked), the loss is only computed + for the tokens with labels in ``[0, ..., vocab_size]``. + Shape is [batch_size, sequence_length] and dtype is int64. + output_attentions (bool, optional): + See :class:`ReformerModel`. + output_hidden_states (bool, optional): + See :class:`ReformerModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple `(loss, logits, hidden_states, attentions)`. + + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Masked Language modeling loss. + It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Prediction scores of the masked language modeling head + (scores for each vocabulary token before SoftMax). + It's data type should be float32 and its shape is + [batch_size, sequence_length, vocab_size]. + + - `hidden_states` (tuple(Tensor)): + See :class:`ReformerModel`. + + - `attentions` (tuple(Tensor)): + See :class:`ReformerModel`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ReformerForMaskedLM, ReformerTokenizer + + tokenizer = ReformerTokenizer.from_pretrained('reformer-crime-and-punishment') + model = ReformerForMaskedLM.from_pretrained('reformer-crime-and-punishment', is_decoder=False) + model.eval() + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=inputs["input_ids"]) + + loss = output[0] + logits = output[1] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + reformer_outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + num_hashes=num_hashes, + use_cache=False, # no causal mask + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = reformer_outputs[0] + logits = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + logits.reshape(shape=[-1, self.config.vocab_size]), + labels.flatten(), + ) + + if not return_dict: + output = (logits,) + reformer_outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=logits, + hidden_states=reformer_outputs.hidden_states, + attentions=reformer_outputs.attentions, + ) + + +class ReformerForSequenceClassification(ReformerPretrainedModel): + """ + The Reformer Model transformer with a sequence classification head on top (linear layer). + + Args: + reformer (:class:`ReformerModel`): + An instance of :class:`ReformerModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of Reformer. + If None, use the same value as `hidden_dropout_prob` of `ReformerModel` + instance `reformer`. Defaults to None. + + """ + + def __init__(self, config: ReformerConfig): + super().__init__(config) + self.reformer = ReformerModel(config) + self.num_classes = config.num_classes + self.classifier = ReformerClassificationHead(config) + if self.config.is_decoder: + logger.warning("You might want to disable causal masking for sequence classification") + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + num_hashes: Optional[int] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ReformerModel`. + position_ids (Tensor, optional): + See :class:`ReformerModel`. + attention_mask (Tensor, optional): + See :class:`ReformerModel`. + num_hashes (int, optional): + See :class:`ReformerModel`. + inputs_embeds (Tensor, optional): + See :class:`ReformerModel`. + labels (Tensor, optional): + Labels for computing the sequence classification/regression loss. Indices + should be in `[0, ...,num_classes - 1]`. If `num_classes == 1` a regression + loss is computed (Mean-Square loss), If `num_classes > 1` a classification + loss is computed (Cross-Entropy). + Shape is [batch_size,] and dtype is int64. + output_attentions (bool, optional): + See :class:`ReformerModel`. 
+ output_hidden_states (bool, optional): + See :class:`ReformerModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple `(loss, logits, hidden_states, attentions)`. + + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Classification (or regression if num_classes==1) loss. + It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Classification (or regression if num_classes==1) scores (before SoftMax). + It's data type should be float32 and its shape is [batch_size, num_classes]. + + - `hidden_states` (tuple(Tensor)): + See :class:`ReformerModel`. + + - `attentions` (tuple(Tensor)): + See :class:`ReformerModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import ReformerForSequenceClassification, ReformerTokenizer + + tokenizer = ReformerTokenizer.from_pretrained('reformer-crime-and-punishment') + model = ReformerForSequenceClassification.from_pretrained('reformer-crime-and-punishment', is_decoder=False) + model.eval() + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs, labels=paddle.to_tensor([0])) + + loss = output[0] + logits = output[1] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + num_hashes=num_hashes, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.num_classes == 1: + # We are doing regression + loss_fct = nn.MSELoss() + loss = loss_fct(logits.flatten(), labels.astype(logits.dtype).flatten()) + else: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.num_classes]), labels.flatten()) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class ReformerForQuestionAnswering(ReformerPretrainedModel): + """ + Reformer Model with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and + `span end logits`). + + Args: + reformer (:class:`ReformerModel`): + An instance of ReformerModel. + dropout (float, optional): + The dropout probability for output of Reformer. + If None, use the same value as `hidden_dropout_prob` of `ReformerModel` instance `reformer`. Defaults to `None`. 
+ + """ + + def __init__(self, config: ReformerConfig): + super().__init__(config) + self.reformer = ReformerModel(config) + # 2 * hidden_size because we use reversible residual layers + self.qa_outputs = nn.Linear(2 * self.config.hidden_size, 2) + if self.config.is_decoder: + logger.warning("You might want to disable causal masking for question answering task.") + + def forward( + self, + input_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + num_hashes: Optional[int] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`ReformerModel`. + position_ids (Tensor, optional): + See :class:`ReformerModel`. + attention_mask (Tensor, optional): + See :class:`ReformerModel`. + num_hashes (int, optional): + See :class:`ReformerModel`. + start_positions (Tensor, optional): + Labels for position (index) of the start of the labelled + span for computing the token classification loss. + Positions are clamped to the length of the sequence + (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + Shape is [batch_size,] and dtype is int64. + end_positions (Tensor, optional): + Labels for position (index) of the end of the labelled + span for computing the token classification loss. + Positions are clamped to the length of the sequence + (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + Shape is [batch_size,] and dtype is int64. + inputs_embeds (Tensor, optional): + See :class:`ReformerModel`. + output_attentions (bool, optional): + See :class:`ReformerModel`. + output_hidden_states (bool, optional): + See :class:`ReformerModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + + Returns: + tuple: Returns tuple `(loss, logits, hidden_states, attentions)`. + + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Classification (or regression if num_classes==1) loss. + It's data type should be float32 and its shape is [1,]. + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates + the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates + the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `hidden_states` (tuple(Tensor)): + See :class:`ReformerModel`. + + - `attentions` (tuple(Tensor)): + See :class:`ReformerModel`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import ReformerForQuestionAnswering, ReformerTokenizer + + tokenizer = ReformerTokenizer.from_pretrained('reformer-crime-and-punishment') + model = ReformerForQuestionAnswering.from_pretrained('reformer-crime-and-punishment', is_decoder=False) + model.eval() + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + reformer_outputs = self.reformer( + input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + num_hashes=num_hashes, + use_cache=False, # no causal mask + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + return_dict=return_dict, + ) + + sequence_output = reformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(2, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + reformer_outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=reformer_outputs.hidden_states, + attentions=reformer_outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/tokenizer.py new file mode 100644 index 000000000..6944bc258 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/reformer/tokenizer.py @@ -0,0 +1,292 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +import warnings + +import sentencepiece as spm + +from ..albert.tokenizer import AlbertEnglishTokenizer + +__all__ = ["ReformerTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"reformer-enwik8": 65536, "reformer-crime-and-punishment": 524288} + + +class ReformerTokenizer(AlbertEnglishTokenizer): + """ + Constructs a Reformer tokenizer based on SentencePiece . 
+ This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + sentencepiece_model_file (str): + The vocabulary file (ends with '.spm') required to instantiate + a `SentencePiece `__ tokenizer. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. Defaults to `False`. + remove_space (bool): + Whether or note to remove space when tokenizing. Defaults to `True`. + keep_accents (bool): + Whether or note to keep accents when tokenizing. Defaults to `False`. + eos_token (str): + A special token representing the *eos (end-of-sentence)* token. + Defaults to "". + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "". + + """ + + resource_files_names = { + "sentencepiece_model_file": "spiece.model", + } + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "reformer-crime-and-punishment": "http://paddlenlp.bj.bcebos.com/models/transformers/reformer/reformer-crime-and-punishment/spiece.model", + }, + } + + pretrained_init_configuration = { + "reformer-crime-and-punishment": {"do_lower_case": False}, + } + + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=False, + remove_space=True, + keep_accents=True, + eos_token="", + unk_token="", + pad_token="", + extra_ids=100, + additional_special_tokens=[], + sp_model_kwargs=None, + **kwargs + ): + + # Add extra_ids to the special token list + if extra_ids > 0 and len(additional_special_tokens) == 0: + self._additional_special_tokens = [f"" for i in range(extra_ids)] + elif extra_ids > 0 and len(additional_special_tokens) != 0: + # Check that we have the right number of extra_id special tokens + extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) + if extra_tokens != extra_ids: + raise ValueError( + f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are provided to ReformerTokenizer. 
" + "In this case the additional_special_tokens must include the extra_ids tokens" + ) + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.extra_ids = extra_ids + self.sentencepiece_model_file = sentencepiece_model_file + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_file) + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + is_split_into_words=False, + padding=None, + truncation="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + if "pad_to_max_seq_len" in kwargs and padding is None: + pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") + padding = "max_length" if pad_to_max_seq_len else False + elif padding is None: + padding = False + + if "max_seq_len" in kwargs and max_length is None: + max_length = kwargs["max_seq_len"] + + if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": + truncation = kwargs["truncation_strategy"] + + return super(ReformerTokenizer, self).__call__( + text=text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.sp_model) + self.extra_ids + + def _add_eos_if_not_present(self, token_ids): + """Do not add eos again if user already added it.""" + if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: + warnings.warn( + f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added." + ) + return token_ids + else: + return token_ids + [self.eos_token_id] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1): + """ + Build model inputs from a sequence or a pair of sequence. + + An Reformer sequence has the following format: + + - single sequence: ``X `` + - pair of sequences: ``A B `` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + + """ + token_ids_0 = self._add_eos_if_not_present(token_ids_0) + if token_ids_1 is None: + return token_ids_0 + else: + token_ids_1 = self._add_eos_if_not_present(token_ids_1) + return token_ids_0 + token_ids_1 + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. 
+ + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + [(0, 0)] + + return offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences. + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + + """ + eos = [self.eos_token_id] + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + # normal case: some special tokens + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " " + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode_pieces(current_sub_tokens) + return out_string.strip() + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token.startswith("", token) + num = int(match.group(1)) + return self.vocab_size - num - 1 + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index < self.sp_model.get_piece_size(): + token = self.sp_model.IdToPiece(index) + else: + token = f"" + return token diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/configuration.py new file mode 100644 index 000000000..2a5d84576 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/configuration.py @@ -0,0 +1,135 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MBart model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "REMBERT_PRETRAINED_INIT_CONFIGURATION", + "REMBERT_PRETRAINED_RESOURCE_FILES_MAP", + "RemBertConfig", +] + +REMBERT_PRETRAINED_INIT_CONFIGURATION = { + "rembert": { + "attention_probs_dropout_prob": 0, + "input_embedding_size": 256, + "hidden_act": "gelu", + "hidden_dropout_prob": 0, + "hidden_size": 1152, + "initializer_range": 0.02, + "intermediate_size": 4608, + "max_position_embeddings": 512, + "num_attention_heads": 18, + "num_hidden_layers": 32, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 250300, + "layer_norm_eps": 1e-12, + } +} + +REMBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "rembert": "https://bj.bcebos.com/paddlenlp/models/transformers/rembert/model_state.pdparams", + } +} + + +class RemBertConfig(PretrainedConfig): + r""" + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `RemBertModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `RemBertModel`. + input_embedding_size (int, optional): + Dimensionality of the embedding layer. Defaults to `256`. + hidden_size (int, optional): + Dimensionality of the encoder layer and pooler layer. Defaults to `1152`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `32`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `18`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. 
+ ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"gelu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `16`. + + initializer_range (float, optional): + The standard deviation of the normal initializer. + Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`BertPretrainedModel.init_weights()` for how weights are initialized in `BertModel`. + + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + """ + + model_type = "rembert" + + def __init__( + self, + vocab_size=250300, + input_embedding_size=256, + hidden_size=1152, + num_hidden_layers=32, + num_attention_heads=18, + intermediate_size=4608, + hidden_act="gelu", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + pad_token_id=0, + layer_norm_eps=1e-12, + **kwargs + ): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.input_embedding_size = input_embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pad_token_id = pad_token_id + self.layer_norm_eps = layer_norm_eps diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/modeling.py new file mode 100644 index 000000000..c4697253e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/modeling.py @@ -0,0 +1,781 @@ +# encoding=utf8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
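+
+# RemBERT decouples the input embedding size from the Transformer hidden size: the word, position and
+# token-type embeddings use `input_embedding_size`, and `RemBertEncoder.embedding_hidden_mapping_in`
+# projects the embeddings up to `hidden_size` before the first attention layer.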
+ +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddlenlp.transformers import PretrainedModel, register_base_model + +from ..activations import get_activation +from .configuration import ( + REMBERT_PRETRAINED_INIT_CONFIGURATION, + REMBERT_PRETRAINED_RESOURCE_FILES_MAP, + RemBertConfig, +) + +__all__ = [ + "RemBertModel", + "RemBertForMaskedLM", + "RemBertForQuestionAnswering", + "RemBertForSequenceClassification", + "RemBertForMultipleChoice", + "RemBertPretrainedModel", + "RemBertForTokenClassification", +] + + +class RemBertPretrainedModel(PretrainedModel): + pretrained_init_configuration = REMBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = REMBERT_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "rembert" + config_class = RemBertConfig + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +class RemBertEmbeddings(nn.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: RemBertConfig): + super(RemBertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.input_embedding_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.input_embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.input_embedding_size) + + self.layer_norm = nn.LayerNorm(config.input_embedding_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", paddle.arange(end=config.max_position_embeddings).expand((1, -1))) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + ): + input_shape = input_ids.shape + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class RemBertPooler(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
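+        # hidden_states: [batch_size, seq_len, hidden_size] -> pooled_output: [batch_size, hidden_size]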
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class RemBertSelfAttention(nn.Layer):
+    def __init__(self, config: RemBertConfig):
+        super(RemBertSelfAttention, self).__init__()
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size]
+        x = x.reshape(new_x_shape)
+        return x.transpose((0, 2, 1, 3))
+
+    def forward(self, hidden_states, attention_mask=None):
+        mixed_query_layer = self.query(hidden_states)
+
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = paddle.matmul(query_layer, key_layer.transpose((0, 1, 3, 2)))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in the RemBertModel forward() function).
+            attention_scores = attention_scores + attention_mask.astype(attention_scores.dtype)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = F.softmax(attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
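+        # attention_probs: [batch_size, num_attention_heads, seq_len, seq_len]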
+ attention_probs = self.dropout(attention_probs) + + context_layer = paddle.matmul(attention_probs, value_layer) + + context_layer = context_layer.transpose((0, 2, 1, 3)) + new_context_layer_shape = context_layer.shape[:-2] + [self.all_head_size] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) + return outputs + + +class RemBertSelfOutput(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.layer_norm(hidden_states + input_tensor) + return hidden_states + + +class RemBertAttention(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertAttention, self).__init__() + self.self = RemBertSelfAttention(config) + self.output = RemBertSelfOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + ): + self_outputs = self.self(hidden_states, attention_mask) + attention_output = self.output(self_outputs, hidden_states) + return attention_output + + +class RemBertIntermediate(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = get_activation(config.hidden_act) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class RemBertOutput(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.layer_norm(hidden_states + input_tensor) + return hidden_states + + +class RemBertLayer(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertLayer, self).__init__() + self.attention = RemBertAttention(config) + + self.intermediate = RemBertIntermediate(config) + self.output = RemBertOutput(config) + + def forward(self, hidden_states, attention_mask=None): + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + ) + + layer_output = self.feed_forward_chunk(self_attention_outputs) + + return layer_output + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class RemBertEncoder(nn.Layer): + def __init__(self, config: RemBertConfig): + super(RemBertEncoder, self).__init__() + self.embedding_hidden_mapping_in = nn.Linear(config.input_embedding_size, config.hidden_size) + self.layer = nn.LayerList([RemBertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None): + hidden_states = self.embedding_hidden_mapping_in(hidden_states) + + for i, layer_module in enumerate(self.layer): + layer_outputs = 
layer_module(hidden_states, attention_mask) + + hidden_states = layer_outputs + + return hidden_states + + +@register_base_model +class RemBertModel(RemBertPretrainedModel): + """ + The bare RemBERT Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + + """ + + def __init__(self, config: RemBertConfig): + super(RemBertModel, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.num_hidden_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + self.layer_norm_eps = config.layer_norm_eps + self.embeddings = RemBertEmbeddings(config) + self.encoder = RemBertEncoder(config) + + self.pooler = RemBertPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The RemBertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`) + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. 
+ We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RemBertModel, RemBertTokenizer + + tokenizer = RemBertTokenizer.from_pretrained('rembert') + model = RemBertModel.from_pretrained('rembert') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + input_shape = input_ids.shape + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=attention_mask, + ) + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + + return sequence_output, pooled_output + + +class RemBertForSequenceClassification(RemBertPretrainedModel): + """ + RemBert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`RemBertConfig`): + An instance of RemBertConfig used to construct RemBertForSequenceClassification. + """ + + def __init__(self, config: RemBertConfig): + super(RemBertForSequenceClassification, self).__init__(config) + self.rembert = RemBertModel(config) + self.dense = nn.Linear(config.hidden_size, config.num_classes) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The RemBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RemBertModel`. + token_type_ids (Tensor, optional): + See :class:`RemBertModel`. + position_ids (Tensor, optional): + See :class:`RemBertModel`. + attention_mask (Tensor, optional): + See :class:`RemBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RemBertForSequenceClassification + from paddlenlp.transformers import RemBertTokenizer + + tokenizer = RemBertTokenizer.from_pretrained('rembert') + model = RemBertForQuestionAnswering.from_pretrained('rembert', num_classes=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + """ + + pool_output = self.rembert( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + )[1] + + pool_output = self.dropout(pool_output) + logits = self.dense(pool_output) + return logits + + +class RemBertForQuestionAnswering(RemBertPretrainedModel): + """ + RemBert Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. 
+ + Args: + config (:class:`RemBertConfig`): + An instance of RemBertConfig used to construct RemBertForQuestionAnswering. + """ + + def __init__(self, config: RemBertConfig): + super(RemBertForQuestionAnswering, self).__init__(config) + self.rembert = RemBertModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids=None, + token_type_ids=None, + position_ids=None, + attention_mask=None, + ): + r""" + The RemBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RemBertModel`. + token_type_ids (Tensor, optional): + See :class:`RemBertModel`. + position_ids (Tensor, optional): + See :class:`RemBertModel`. + attention_mask (Tensor, optional): + See :class:`RemBertModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RemBertForQuestionAnswering + from paddlenlp.transformers import RemBertTokenizer + + tokenizer = RemBertTokenizer.from_pretrained('rembert') + model = RemBertForQuestionAnswering.from_pretrained('rembert') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ + + outputs = self.rembert( + input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = paddle.split(logits, num_or_sections=2, axis=-1) + + return start_logits, end_logits + + +class RemBertLMPredictionHead(nn.Layer): + """ + RemBert Model with a `language modeling` head on top for CLM fine-tuning. 
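+    Used as the prediction head inside :class:`RemBertOnlyMLMHead` by :class:`RemBertForMaskedLM`.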
+ """ + + def __init__(self, config: RemBertConfig, embedding_weights=None): + super(RemBertLMPredictionHead, self).__init__() + self.transform = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = get_activation(config.hidden_act) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.decoder = nn.Linear(config.hidden_size, config.hidden_size) + + def forward(self, hidden_states, masked_positions=None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + # gather masked tokens might be more quick + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class RemBertOnlyMLMHead(nn.Layer): + def __init__(self, config: RemBertConfig, embedding_weights): + super(RemBertOnlyMLMHead, self).__init__() + self.predictions = RemBertLMPredictionHead(config, embedding_weights=embedding_weights) + + def forward(self, sequence_output, masked_positions=None): + prediction_scores = self.predictions(sequence_output, masked_positions) + return prediction_scores + + +class RemBertForMaskedLM(RemBertPretrainedModel): + """ + RemBert Model with a `masked language modeling` head on top. + + Args: + config (:class:`RemBertConfig`): + An instance of RemBertConfig used to construct RemBertForMaskedLM. + + """ + + def __init__(self, config: RemBertConfig): + super(RemBertForMaskedLM, self).__init__(config) + self.rembert = RemBertModel(config) + self.cls = RemBertOnlyMLMHead( + config=config, + embedding_weights=self.rembert.embeddings.word_embeddings.weight, + ) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + + Args: + input_ids (Tensor): + See :class:`RemBertModel`. + token_type_ids (Tensor, optional): + See :class:`RemBertModel`. + position_ids (Tensor, optional): + See :class:`RemBertModel`. + attention_mask (Tensor, optional): + See :class:`RemBertModel`. + + Returns: + Tensor: Returns tensor `prediction_scores`, The scores of masked token prediction. + Its data type should be float32 and shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RemBertForMaskedLM, RemBertTokenizer + + tokenizer = RemBertTokenizer.from_pretrained('rembert') + model = RemBertForMaskedLM.from_pretrained('rembert') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + """ + + outputs = self.rembert( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output, masked_positions=None) + return prediction_scores + + +class RemBertForTokenClassification(RemBertPretrainedModel): + """ + RemBert Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`RemBertConfig`): + An instance of RemBertConfig used to construct RemBertForTokenClassification. 
+ """ + + def __init__(self, config: RemBertConfig): + super(RemBertForTokenClassification, self).__init__(config) + self.num_classes = config.num_classes + self.rembert = RemBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The RemBertForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RemBertModel`. + token_type_ids (Tensor, optional): + See :class:`RemBertModel`. + position_ids(Tensor, optional): + See :class:`RemBertModel`. + attention_mask (list, optional): + See :class:`RemBertModel`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RemBertForTokenClassification + from paddlenlp.transformers import RemBertTokenizer + + tokenizer = RemBertTokenizer.from_pretrained('rembert') + model = RemBertForTokenClassification.from_pretrained('rembert') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + """ + sequence_output, _ = self.rembert( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits + + +class RemBertForMultipleChoice(RemBertPretrainedModel): + """ + RemBert Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`RemBertConfig`): + An instance of RemBertConfig used to construct RemBertForMultipleChoice. + """ + + def __init__(self, config: RemBertConfig): + super(RemBertForMultipleChoice, self).__init__(config) + self.num_choices = config.num_choices + self.rembert = RemBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The BertForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RemBertModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`RemBertModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`RemBertModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`RemBertModel` and shape as [batch_size, num_choice, sequence_length]. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RemBertForMultipleChoice, RemBertTokenizer + from paddlenlp.data import Pad, Dict + + tokenizer = RemBertTokenizer.from_pretrained('rembert') + model = RemBertForMultipleChoice.from_pretrained('rembert', num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + """ + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + _, pooled_output = self.rembert( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + return reshaped_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/tokenizer.py new file mode 100644 index 000000000..a975be240 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rembert/tokenizer.py @@ -0,0 +1,240 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile +from typing import List, Optional + +import sentencepiece as spm + +from .. import PretrainedTokenizer + +__all__ = ["RemBertTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"rembert": 512} + + +class RemBertTokenizer(PretrainedTokenizer): + """ + Construct a RemBertTokenizer. + For more information regarding those methods, please refer to this superclass. 
+ + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to `False`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + + Examples: + .. code-block:: + + from paddlenlp.transformers import RemBertTokenizer + tokenizer = RemBertTokenizer.from_pretrained('rembert') + + inputs = tokenizer('欢迎使用飞桨!') + print(inputs) + + ''' + {'input_ids': [312, 573, 36203, 3916, 9744, 242391, 646, 313], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + """ + + resource_files_names = {"vocab_file": "sentencepiece.model"} + pretrained_resource_files_map = { + "vocab_file": { + "rembert": "https://bj.bcebos.com/paddlenlp/models/transformers/rembert/sentencepiece.model", + }, + } + pretrained_init_configuration = { + "rembert": {"do_lower_case": False}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=False, + remove_space=True, + keep_accents=True, + cls_token="[CLS]", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + mask_token="[MASK]", + **kwargs + ): + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(vocab_file) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text, sample=False): + """Tokenize a string.""" + pieces = self.sp_model.EncodeAsPieces(text) + return pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + out_string = self.sp_model.decode_pieces(tokens) + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. 
A REMBERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT + sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). 
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None): + if not os.path.isdir(save_directory): + raise ValueError("Vocabulary path ({}) should be a directory".format(save_directory)) + return None + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + "sentencepiece.model" + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ring_flash_attention.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ring_flash_attention.py new file mode 100644 index 000000000..9fa8ea52b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/ring_flash_attention.py @@ -0,0 +1,354 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# paddlenlp/transformers/ring_attention.py + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import _C_ops +from paddle.autograd.py_layer import PyLayer + +try: + from paddlenlp_ops import flash_attn_bwd +except (ImportError, ModuleNotFoundError): + from paddlenlp.utils.log import logger + + logger.warning( + "if you run ring_flash_attention.py, please ensure you install " + "the paddlenlp_ops by following the instructions " + "provided at https://github.com/PaddlePaddle/PaddleNLP/blob/develop/csrc/README.md" + ) + + +class RingCommunicator: + def __init__(self, group, local_key, local_value): + self._k_buffer = [paddle.zeros_like(local_key) for _ in range(2)] + self._v_buffer = [paddle.zeros_like(local_value) for _ in range(2)] + + self._k_buffer[0] = local_key.clone() + self._v_buffer[0] = local_value.clone() + + self._next_buffer_idx = 0 + + self.group = group + self.group_rank = group.rank + self.send_rank = self.group.ranks[(self.group_rank + 1) % self.group.world_size] + self.recv_rank = self.group.ranks[(self.group_rank - 1) % self.group.world_size] + + self._reqs = [] + + def wait(self): + # TODO(zhangyuqin1998):batch_isend_irecv异步流下,无法wait,需要修复。对性能有影响。 + paddle.device.synchronize() + + def add_to_buffers(self, key, value): + if key.shape != self._k_buffer[self._next_buffer_idx].shape: + self._k_buffer[self._next_buffer_idx][:, : key.shape[1], :, :].add_(key) + self._v_buffer[self._next_buffer_idx][:, : key.shape[1], :, :].add_(value) + else: + self._k_buffer[self._next_buffer_idx].add_(key) + self._v_buffer[self._next_buffer_idx].add_(value) + + def get_buffers(self): + return self._k_buffer[self._next_buffer_idx], self._v_buffer[self._next_buffer_idx] + + def send_recv(self): + send_k_op = dist.P2POp(dist.isend, 
self._k_buffer[self._next_buffer_idx], self.send_rank, self.group) + send_v_op = dist.P2POp(dist.isend, self._v_buffer[self._next_buffer_idx], self.send_rank, self.group) + recv_k_op = dist.P2POp(dist.irecv, self._k_buffer[(self._next_buffer_idx + 1) % 2], self.recv_rank, self.group) + recv_v_op = dist.P2POp(dist.irecv, self._v_buffer[(self._next_buffer_idx + 1) % 2], self.recv_rank, self.group) + + self._next_buffer_idx = (self._next_buffer_idx + 1) % 2 + + ops = [send_k_op, send_v_op, recv_k_op, recv_v_op] + + self._reqs = dist.batch_isend_irecv(ops) + + +def update_out_and_lse(old_out, old_lse, block_out, block_lse, second_chunk_only=False): + if second_chunk_only: + second_chunk_out = old_out[:, old_out.shape[1] // 2 :, :, :] + second_chunk_lse = old_lse[:, old_lse.shape[1] // 2 :, :, :] + second_chunk_out, second_chunk_lse = update_out_and_lse( + second_chunk_out, second_chunk_lse, block_out, block_lse + ) + old_out[:, old_out.shape[1] // 2 :, :, :] = second_chunk_out + old_lse[:, old_lse.shape[1] // 2 :, :, :] = second_chunk_lse + return old_out, old_lse + else: + block_out, block_lse = paddle.cast(block_out, "float32"), paddle.cast(block_lse, "float32") + with paddle.amp.auto_cast(enable=False): + return old_out - (old_out - block_out) * F.sigmoid(block_lse - old_lse), old_lse - F.log_sigmoid( + old_lse - block_lse + ) + + +def get_chunk_id(rank, cp_size): + return rank, (2 * cp_size - 1 - rank) + + +def concat_masks(attn_masks_list, rank, cp_size): + assert len(attn_masks_list) == 2 * cp_size + first_chunk_id, second_chunk_id = get_chunk_id(rank, cp_size) + return paddle.concat([attn_masks_list[first_chunk_id], attn_masks_list[second_chunk_id]], axis=3) + + +def balanced_ring_flash_attention_fwd_func( + group, + local_query, + local_key, + local_value, + fixed_seed_offset=None, + attn_mask=None, + dropout=0.0, + is_causal=False, + training=True, +): + cp_size = group.world_size + rank = group.rank + + comm_buffer = RingCommunicator(group, local_key, local_value) + local_q_seq_len = local_query.shape[1] + + if attn_mask is not None: + attn_masks_list = paddle.split(attn_mask, num_or_sections=cp_size * 2, axis=3) + if is_causal: + local_query_second_chunk = local_query[:, local_q_seq_len // 2 :, :, :] + for step in range(cp_size): + block_k, block_v = comm_buffer.get_buffers() + + if step != cp_size - 1: + comm_buffer.send_recv() + + if not is_causal: + # out [bs, seq, nhead, headdim] + # lse [bs, nhead, seq] + block_out, _, block_lse, _ = _C_ops.flash_attn( + local_query, + block_k, + block_v, + fixed_seed_offset, + None if attn_mask is None else concat_masks(attn_masks_list, (group.rank - step) % cp_size, cp_size), + dropout, + False, + False, + not training, + "", + ) + paddle.unsqueeze_(paddle.transpose_(block_lse, [0, 2, 1]), axis=-1) + + if step == 0: + out, lse = block_out, block_lse + else: + out, lse = update_out_and_lse(out, lse, block_out, block_lse) + else: + if step == 0: + block_out, _, block_lse, _ = _C_ops.flash_attn( + local_query, block_k, block_v, fixed_seed_offset, None, dropout, True, False, not training, "" + ) + paddle.unsqueeze_(paddle.transpose_(block_lse, [0, 2, 1]), axis=-1) + out, lse = block_out, block_lse + elif step > rank: + block_out, _, block_lse, _ = _C_ops.flash_attn( + local_query_second_chunk, + block_k, + block_v, + fixed_seed_offset, + None, + dropout, + False, + False, + not training, + "", + ) + block_lse = block_lse[:, :, 0 : (local_q_seq_len // 2)] + paddle.unsqueeze_(paddle.transpose_(block_lse, [0, 2, 1]), axis=-1) + out, lse = 
update_out_and_lse(out, lse, block_out, block_lse, True) + else: + block_out, _, block_lse, _ = _C_ops.flash_attn( + local_query, + block_k[:, : local_q_seq_len // 2, :, :], + block_v[:, : local_q_seq_len // 2, :, :], + fixed_seed_offset, + None, + dropout, + False, + False, + not training, + "", + ) + paddle.unsqueeze_(paddle.transpose_(block_lse, [0, 2, 1]), axis=-1) + out, lse = update_out_and_lse(out, lse, block_out, block_lse) + + # TODO(zhangyuqin1998):batch_isend_irecv异步流下,无法wait,需要修复。对性能有影响。 + # if step != cp_size - 1: + # comm_buffer.wait() + paddle.device.synchronize() + + return paddle.cast(out, local_query.dtype), paddle.transpose_(paddle.squeeze(lse, axis=-1), [0, 2, 1]) + + +def balanced_ring_flash_attention_bwd_func( + group, + out_grad, + local_query, + local_key, + local_value, + local_out, + lse, + fixed_seed_offset, + attn_mask, + dropout=0.0, + is_causal=False, +): + cp_size = group.world_size + rank = group.rank + local_q_seq_len = local_query.shape[1] + + query_grad_buffer = paddle.zeros_like(local_query) + key_grad_buffer = paddle.zeros_like(local_key) + value_grad_buffer = paddle.zeros_like(local_value) + + kv_comm_buffer = RingCommunicator(group, local_key, local_value) + grad_comm_buffer = RingCommunicator(group, key_grad_buffer, value_grad_buffer) + + if is_causal: + local_query_second_chunk = local_query[:, local_q_seq_len // 2 :, :, :] + local_out_second_chunk = local_out[:, local_q_seq_len // 2 :, :, :] + lse_second_chunk = lse[:, :, local_q_seq_len // 2 :] + out_grad_second_chunk = out_grad[:, local_q_seq_len // 2 :, :, :] + + if attn_mask is not None: + attn_masks_list = paddle.split(attn_mask, num_or_sections=cp_size * 2, axis=3) + + for step in range(cp_size): + block_k, block_v = kv_comm_buffer.get_buffers() + + if step != cp_size - 1: + kv_comm_buffer.send_recv() + + if not is_causal: + block_q_grad, block_k_grad, block_v_grad = flash_attn_bwd( + local_query, + block_k, + block_v, + local_out, + lse, + fixed_seed_offset, + None if attn_mask is None else concat_masks(attn_masks_list, (group.rank - step) % cp_size, cp_size), + out_grad, + dropout, + False, + ) + query_grad_buffer.add_(block_q_grad) + else: + if step == 0: + block_q_grad, block_k_grad, block_v_grad = flash_attn_bwd( + local_query, block_k, block_v, local_out, lse, fixed_seed_offset, None, out_grad, dropout, True + ) + query_grad_buffer.add_(block_q_grad) + elif step > rank: + block_q_grad, block_k_grad, block_v_grad = flash_attn_bwd( + local_query_second_chunk, + block_k, + block_v, + local_out_second_chunk, + lse_second_chunk, + fixed_seed_offset, + None, + out_grad_second_chunk, + dropout, + False, + ) + query_grad_buffer[:, local_q_seq_len // 2 :, :, :].add_(block_q_grad) + else: + block_q_grad, block_k_grad, block_v_grad = flash_attn_bwd( + local_query, + block_k[:, : local_q_seq_len // 2, :, :], + block_v[:, : local_q_seq_len // 2, :, :], + local_out, + lse, + fixed_seed_offset, + None, + out_grad, + dropout, + False, + ) + query_grad_buffer.add_(block_q_grad) + + # if step != cp_size - 1: + # kv_comm_buffer.wait() + # if step != 0: + # grad_comm_buffer.wait() + paddle.device.synchronize() + + grad_comm_buffer.add_to_buffers(block_k_grad, block_v_grad) + grad_comm_buffer.send_recv() + + grad_comm_buffer.wait() + key_grad_buffer, value_grad_buffer = grad_comm_buffer.get_buffers() + + return query_grad_buffer, key_grad_buffer, value_grad_buffer + + +class RingFlashAttention(PyLayer): + @staticmethod + def forward( + ctx, + query, + key, + value, + group=None, + fixed_seed_offset=None, + 
attn_mask=None, + dropout=0.0, + is_causal=False, + training=True, + ): + if dropout > 0.0: + raise NotImplementedError("Dropout is not supported in ring attention yet.") + if group is None: + group = dist.fleet.get_hybrid_communicate_group().get_sep_parallel_group() + if attn_mask is not None: + is_causal = False + + out, lse = balanced_ring_flash_attention_fwd_func( + group, query, key, value, fixed_seed_offset, attn_mask, dropout, is_causal, training + ) + ctx.save_for_backward(query, key, value, out, lse, attn_mask) + ctx.group = group + ctx.fixed_seed_offset = fixed_seed_offset + ctx.dropout = dropout + ctx.is_causal = is_causal + return out + + @staticmethod + def backward(ctx, out_grad): + query, key, value, out, lse, attn_mask = ctx.saved_tensor() + group = ctx.group + fixed_seed_offset = ctx.fixed_seed_offset + dropout = ctx.dropout + is_causal = ctx.is_causal + + if fixed_seed_offset is None: + fixed_seed_offset = paddle.to_tensor([0, 0], place=paddle.CPUPlace(), dtype=paddle.int64) + + query_grad, key_grad, value_grad = balanced_ring_flash_attention_bwd_func( + group, out_grad, query, key, value, out, lse, fixed_seed_offset, attn_mask, dropout, is_causal + ) + if attn_mask is not None and not attn_mask.stop_gradient: + return query_grad, key_grad, value_grad, None + else: + return query_grad, key_grad, value_grad diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/README.md b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/README.md new file mode 100644 index 000000000..97ed2fbad --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/README.md @@ -0,0 +1 @@ +# RoBERTa diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/configuration.py new file mode 100644 index 000000000..8ce536777 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/configuration.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
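For reference, the sigmoid / log-sigmoid update in `update_out_and_lse` above is the usual streaming log-sum-exp merge of two partial attention results. A small numerical check of that identity, using toy values and plain NumPy purely for illustration:

```python
import numpy as np

o1, l1 = np.array([0.3, 0.7]), np.array([1.2])  # running output and log-sum-exp
o2, l2 = np.array([0.9, 0.1]), np.array([0.4])  # new block's output and log-sum-exp

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Reference merge: renormalise both partial outputs by the combined log-sum-exp.
l_ref = np.log(np.exp(l1) + np.exp(l2))
o_ref = np.exp(l1 - l_ref) * o1 + np.exp(l2 - l_ref) * o2

# Form used by update_out_and_lse (F.sigmoid / F.log_sigmoid in the Paddle code).
o_new = o1 - (o1 - o2) * sigmoid(l2 - l1)
l_new = l1 - np.log(sigmoid(l1 - l2))

assert np.allclose(o_ref, o_new) and np.allclose(l_ref, l_new)
```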
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Albert model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["PRETRAINED_INIT_CONFIGURATION", "RobertaConfig"] + +PRETRAINED_INIT_CONFIGURATION = { + "hfl/roberta-wwm-ext": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "hfl/roberta-wwm-ext-large": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "hfl/rbt6": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 6, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "hfl/rbt4": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 4, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "hfl/rbt3": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "num_attention_heads": 12, + "num_hidden_layers": 3, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, + "hfl/rbtl3": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 3, + "type_vocab_size": 2, + "vocab_size": 21128, + "pad_token_id": 0, + }, +} + + +class RobertaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RobertaModel`]. It is used to + instantiate a ALBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the ALBERT + albert-base-v1 architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `RobertaModel`. Also is the vocab size of token embedding matrix. 
+ Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `RobertaModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids` passed when calling `~transformers.RobertaModel`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`RobertaPretrainedModel._init_weights()` for how weights are initialized in `RobertaModel`. + + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + cls_token_id(int, optional): + The index of cls token in the token vocabulary. + Defaults to `101`. 
+
+    Examples:
+
+    ```python
+    >>> from paddlenlp.transformers import RobertaModel, RobertaConfig
+
+    >>> # Initializing a RoBERTa style configuration
+    >>> configuration = RobertaConfig()
+
+    >>> # Initializing a model from that configuration
+    >>> model = RobertaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "roberta"
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+    def __init__(
+        self,
+        vocab_size: int = 21128,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.1,
+        attention_probs_dropout_prob: float = 0.1,
+        max_position_embeddings: int = 512,
+        type_vocab_size: int = 16,
+        initializer_range: float = 0.02,
+        pad_token_id: int = 0,
+        layer_norm_eps: float = 1e-12,
+        cls_token_id: int = 101,
+        **kwargs
+    ):
+        super().__init__(pad_token_id=pad_token_id, cls_token_id=cls_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.pad_token_id = pad_token_id
+        self.layer_norm_eps = layer_norm_eps
+        self.cls_token_id = cls_token_id
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/converter.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/converter.py
new file mode 100644
index 000000000..a86f59320
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/converter.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import annotations +from typing import List, Union, Dict, Type + +from paddlenlp.transformers import PretrainedModel, RobertaModel +from paddlenlp.utils.converter import StateDictNameMapping, Converter + +__all__ = ["RobertaConverter"] + + +class RobertaConverter(Converter): + _ignore_state_dict_keys = ["embeddings.position_ids"] + architectures: Dict[str, Type[PretrainedModel]] = {"RobertaModel": RobertaModel} + + def get_paddle_pytorch_model_classes(self): + from paddlenlp.transformers import RobertaModel as PaddleRobertaModel + from transformers import RobertaModel as PytorchRobertaModel + + return PaddleRobertaModel, PytorchRobertaModel + + def get_name_mapping(self, config_or_num_layers: Union[dict, int] = None) -> List[StateDictNameMapping]: + num_layer = self.resolve_num_layer(config_or_num_layers) + + mappings = [ + ["embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight"], + ["embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight"], + ["embeddings.token_type_embeddings.weight", "embeddings.token_type_embeddings.weight"], + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ["pooler.dense.weight", "pooler.dense.weight", "transpose"], + ["pooler.dense.bias", "pooler.dense.bias"], + ] + + for layer_index in range(num_layer): + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], + [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], + [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], + ] + mappings.extend(layer_mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/modeling.py new file mode 100644 index 000000000..e6f42c582 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/modeling.py @@ -0,0 +1,1387 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor + +from ...layers import Linear as TransposedLinear +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import PRETRAINED_INIT_CONFIGURATION, RobertaConfig + +__all__ = [ + "RobertaModel", + "RobertaPretrainedModel", + "RobertaForSequenceClassification", + "RobertaForTokenClassification", + "RobertaForQuestionAnswering", + "RobertaForMaskedLM", + "RobertaForMultipleChoice", + "RobertaForCausalLM", +] + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + Args: + x: paddle.Tensor x: + Returns: paddle.Tensor + """ + if past_key_values_length is None: + past_key_values_length = 0 + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = (input_ids != padding_idx).cast("int64") + incremental_indices = (paddle.cumsum(mask, axis=1) + past_key_values_length) * mask + return incremental_indices + padding_idx + + +class RobertaEmbeddings(nn.Layer): + r""" + Include embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config: RobertaConfig): + super(RobertaEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.padding_idx = config.pad_token_id + self.cls_token_id = config.cls_token_id + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values_length: Optional[int] = None, + ): + + if input_ids is not None: + inputs_embeds = self.word_embeddings(input_ids) + + if position_ids is None: + if input_ids is not None: + position_ids = create_position_ids_from_input_ids( + input_ids, padding_idx=self.padding_idx, past_key_values_length=past_key_values_length + ) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + position_ids.stop_gradient = True + + if token_type_ids is None: + input_shape = inputs_embeds.shape[:-1] + token_type_ids = paddle.zeros(input_shape, dtype="int64") + + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + Args: + input_shape: paddle.Tensor + Returns: paddle.Tensor + """ + input_shape = inputs_embeds.shape[:-1] + sequence_length = input_shape[1] + + position_ids = paddle.arange(self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype="int64") + return position_ids.unsqueeze(0).expand(input_shape) + + +class RobertaPooler(nn.Layer): + def __init__(self, hidden_size): + super(RobertaPooler, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class RobertaPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained RoBerta models. It provides RoBerta related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
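A quick sketch of what `create_position_ids_from_input_ids` above produces: positions count up from `padding_idx + 1` and padded slots stay at `padding_idx`. The token ids below are toy values assumed for illustration, with `past_key_values_length` taken as 0.

```python
import paddle

padding_idx = 0
input_ids = paddle.to_tensor([[101, 7, 9, 0, 0]])  # two trailing pad tokens
mask = (input_ids != padding_idx).cast("int64")
# (cumsum(mask) + past_key_values_length) * mask + padding_idx, with past length 0
position_ids = paddle.cumsum(mask, axis=1) * mask + padding_idx
print(position_ids)  # [[1, 2, 3, 0, 0]]
```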
+ + """ + + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + config_class = RobertaConfig + + pretrained_resource_files_map = { + "model_state": { + "hfl/roberta-wwm-ext": "https://bj.bcebos.com/paddlenlp/models/transformers/roberta_base/roberta_chn_base.pdparams", + "hfl/roberta-wwm-ext-large": "https://bj.bcebos.com/paddlenlp/models/transformers/roberta_large/roberta_chn_large.pdparams", + "hfl/rbt6": "https://bj.bcebos.com/paddlenlp/models/transformers/rbt6/rbt6_chn_large.pdparams", + "hfl/rbt4": "https://bj.bcebos.com/paddlenlp/models/transformers/rbt4/rbt4_chn_large.pdparams", + "hfl/rbt3": "https://bj.bcebos.com/paddlenlp/models/transformers/rbt3/rbt3_chn_large.pdparams", + "hfl/rbtl3": "https://bj.bcebos.com/paddlenlp/models/transformers/rbtl3/rbtl3_chn_large.pdparams", + } + } + base_model_prefix = "roberta" + + @classmethod + def _get_name_mappings(cls, config: RobertaConfig) -> list[StateDictNameMapping]: + mappings = [ + "embeddings.word_embeddings.weight", + "embeddings.position_embeddings.weight", + "embeddings.token_type_embeddings.weight", + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ] + + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], + [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], + [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], + ] + mappings.extend(layer_mappings) + + init_name_mappings(mappings) + # Other than RobertaModel, other architectures will prepend model prefix + if config.architectures is not None and "RobertaModel" not in config.architectures: + for mapping 
in mappings: + mapping[0] = "roberta." + mapping[0] + + if cls.__name__ != "RobertaModel": + for mapping in mappings: + mapping[1] = "roberta." + mapping[1] + + mappings.extend( + [ + ["pooler.dense.weight", "roberta.pooler.dense.weight", "transpose"], + ["pooler.dense.bias", "roberta.pooler.dense.bias"], + ] + ) + + if config.architectures is not None: + if "RobertaForSequenceClassification" in config.architectures: + mappings.extend( + [ + ["classifier.out_proj.weight", None, "transpose"], + "classifier.out_proj.bias", + ["classifier.dense.weight", None, "transpose"], + "classifier.dense.bias", + ] + ) + if "RobertaForMaskedLM" in config.architectures: + mappings.extend( + [ + "lm_head.bias", + "lm_head.dense.weight", + "lm_head.dense.bias", + "lm_head.layer_norm.weight", + "lm_head.layer_norm.bias", + ] + ) + if ( + "RobertaForTokenClassification" in config.architectures + or "RobertaForMultipleChoice" in config.architectures + ): + mappings.extend( + [ + ["classifier.weight", None, "transpose"], + "classifier.bias", + ] + ) + if "RobertaForQuestionAnswering" in config.architectures: + mappings.extend( + [ + ["qa_outputs.weight", "classifier.weight", "transpose"], + ["qa_outputs.bias", "classifier.bias"], + ] + ) + init_name_mappings(mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class RobertaModel(RobertaPretrainedModel): + r""" + The bare Roberta Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `RobertaModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `RobertaModel`. + hidden_size (int, optional): + Dimensionality of the embedding layer, encoder layers and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. 
+ attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids` passed when calling `~transformers.RobertaModel`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`RobertaPretrainedModel._init_weights()` for how weights are initialized in `RobertaModel`. + + pad_token_id(int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + cls_token_id(int, optional): + The index of cls token in the token vocabulary. + Defaults to `101`. + """ + + def __init__(self, config: RobertaConfig, add_pooling_layer=True): + super(RobertaModel, self).__init__(config) + + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + self.layer_norm_eps = config.layer_norm_eps + self.embeddings = RobertaEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = RobertaPooler(config.hidden_size) if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + inputs_embeds: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate first and second portions of the inputs. + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings is added to token embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, max_position_embeddings - 1]``. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. 
+ Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RobertaModel, RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + model = RobertaModel.from_pretrained('roberta-wwm-ext') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + sequence_output, pooled_output = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + past_key_values_length = None + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + self.encoder._use_cache = use_cache # To be consistent with HF + + encoder_outputs = self.encoder( + embedding_output, + src_mask=attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class RobertaForQuestionAnswering(RobertaPretrainedModel): + r""" + Roberta Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + roberta (:class:`RobertaModel`): + An instance of RobertaModel. 
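`RobertaModel.forward` above converts a user-supplied 2-D `attention_mask` into the additive form the encoder consumes (and derives one from the padding positions of `input_ids` when no mask is given). A minimal sketch of the 2-D conversion, with a toy mask assumed:

```python
import paddle

attention_mask = paddle.to_tensor([[1, 1, 1, 0]])  # 1 = real token, 0 = padding
mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())
additive_mask = (1.0 - mask) * -1e4  # 0 where attention is allowed, -10000 at padding
print(additive_mask.shape)  # [1, 1, 1, 4]
```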
+ """ + + def __init__(self, config: RobertaConfig): + super(RobertaForQuestionAnswering, self).__init__(config) + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`RobertaModel`. + token_type_ids (Tensor, optional): + See :class:`RobertaModel`. + position_ids (Tensor, optional): + See :class:`RobertaModel`. + attention_mask (Tensor, optional): + See :class:`RobertaModel`. + inputs_embeds (Tensor, optional): + See :class:`RobertaModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RobertaForSequenceClassification, RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + model = RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.roberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaClassificationHead(nn.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = paddle.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class RobertaForSequenceClassification(RobertaPretrainedModel): + r""" + Roberta Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + roberta (:class:`RobertaModel`): + An instance of `RobertaModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of Roberta. + If None, use the same value as `hidden_dropout_prob` + of `RobertaModel` instance `roberta`. Defaults to `None`. 
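A training-time sketch with `labels` and `return_dict=True`; the label value is arbitrary and the checkpoint is assumed to be the same `roberta-wwm-ext` used elsewhere in this file:

.. code-block::

    import paddle
    from paddlenlp.transformers import RobertaForSequenceClassification, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    model = RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext')

    inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
    inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
    labels = paddle.to_tensor([1])  # arbitrary class index for illustration

    # Integer labels trigger the cross-entropy branch in forward(); with
    # return_dict=True a SequenceClassifierOutput is returned.
    outputs = model(**inputs, labels=labels, return_dict=True)
    print(outputs.loss)            # scalar training loss
    print(outputs.logits.shape)    # [batch_size, num_classes]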
+ """ + + def __init__(self, config: RobertaConfig): + super(RobertaForSequenceClassification, self).__init__(config) + self.roberta = RobertaModel(config, add_pooling_layer=False) + + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = RobertaClassificationHead(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`RobertaModel`. + token_type_ids (Tensor, optional): + See :class:`RobertaModel`. + position_ids (Tensor, optional): + See :class:`RobertaModel`. + attention_mask (Tensor, optional): + See :class:`RobertaModel`. + inputs_embeds (Tensor, optional): + See :class:`RobertaModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_classes - 1]`. If `num_classes == 1` + a regression loss is computed (Mean-Square loss), If `num_classes > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RobertaForSequenceClassification, RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + model = RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.roberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + loss_fct = paddle.nn.MSELoss() + loss = loss_fct(logits, labels) + elif labels.dtype == paddle.int64 or labels.dtype == paddle.int32: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.config.num_labels)), labels.reshape((-1,))) + else: + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaForTokenClassification(RobertaPretrainedModel): + r""" + Roberta Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + roberta (:class:`RobertaModel`): + An instance of `RobertaModel`. + num_classes (int, optional): + The number of classes. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of Roberta. + If None, use the same value as `hidden_dropout_prob` + of `RobertaModel` instance `roberta`. Defaults to `None`. + """ + + def __init__(self, config: RobertaConfig): + super(RobertaForTokenClassification, self).__init__(config) + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`RobertaModel`. + token_type_ids (Tensor, optional): + See :class:`RobertaModel`. + position_ids (Tensor, optional): + See :class:`RobertaModel`. + attention_mask (Tensor, optional): + See :class:`RobertaModel`. + inputs_embeds (Tensor, optional): + See :class:`RobertaModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_classes - 1]`. 
+ output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RobertaForTokenClassification, RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + model = RobertaForTokenClassification.from_pretrained('roberta-wwm-ext') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.roberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.config.num_labels)), labels.reshape((-1,))) + if not return_dict: + + output = (logits,) + outputs[2:] + if loss is not None: + return (loss,) + output + if len(output) == 1: + return output[0] + return output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaForMultipleChoice(RobertaPretrainedModel): + """ + RoBerta Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + bert (:class:`RobertaModel`): + An instance of RobertaModel. + num_choices (int, optional): + The number of choices. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of Bert. + If None, use the same value as `hidden_dropout_prob` of `RobertaModel` + instance `bert`. Defaults to None. 
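The forward pass below flattens the choice dimension before calling `RobertaModel` and regroups the scores afterwards; a shape-only sketch, with random ids standing in for real tokenized data:

.. code-block::

    import paddle

    # Hypothetical sizes: 2 questions, 2 candidate answers each, 16 tokens.
    batch_size, num_choices, seq_len = 2, 2, 16
    input_ids = paddle.randint(low=1, high=100, shape=[batch_size, num_choices, seq_len])

    # forward() collapses the choice dimension so the encoder sees a plain batch ...
    flat_input_ids = input_ids.reshape((-1, input_ids.shape[-1]))    # [4, 16]

    # ... scores each (question, choice) pair with a Linear(hidden_size, 1) head
    # (random values stand in for the classifier output here) ...
    logits = paddle.rand([batch_size * num_choices, 1])

    # ... and regroups the scores into one logit per choice.
    reshaped_logits = logits.reshape((-1, num_choices))              # [2, 2]
    print(reshaped_logits.shape)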
+ """ + + def __init__(self, config: RobertaConfig): + super(RobertaForMultipleChoice, self).__init__(config) + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RobertaForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RobertaModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`RobertaModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids(Tensor, optional): + See :class:`RobertaModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`RobertaModel` and shape as [batch_size, num_choice, sequence_length]. + inputs_embeds (list, optional): + See :class:`RobertaModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import BertForMultipleChoice, BertTokenizer + from paddlenlp.data import Pad, Dict + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMultipleChoice.from_pretrained('bert-base-uncased', num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if input_ids is not None: + num_choices = input_ids.shape[1] + elif inputs_embeds is not None: + num_choices = inputs_embeds.shape[1] + + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None + inputs_embeds = ( + inputs_embeds.reshape((-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + if inputs_embeds is not None + else None + ) + position_ids = position_ids.reshape((-1, position_ids.shape[-1])) if position_ids is not None else None + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) if token_type_ids is not None else None + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) if attention_mask is not None else None + + outputs = self.roberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape((-1, num_choices)) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaForMaskedLM(RobertaPretrainedModel): + """ + Roberta Model with a `masked language modeling` head on top. + + Args: + bert (:class:RobertaModel`): + An instance of :class:`RobertaModel`. 
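A fill-in-the-blank sketch that goes one step beyond the shape-printing example further down: one position is replaced with the mask token and the LM head's top prediction is decoded; the masked index is arbitrary:

.. code-block::

    import paddle
    from paddlenlp.transformers import RobertaForMaskedLM, RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')
    model = RobertaForMaskedLM.from_pretrained('roberta-wwm-ext')

    inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
    masked_position = 4  # arbitrary position chosen for illustration
    inputs["input_ids"][masked_position] = tokenizer.mask_token_id
    inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}

    # Without labels and with return_dict left at False, forward() returns
    # the prediction scores of shape [batch_size, seq_len, vocab_size].
    logits = model(**inputs)
    predicted_id = paddle.argmax(logits[0, masked_position]).item()
    print(tokenizer.convert_ids_to_tokens(predicted_id))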
+ + """ + + def __init__(self, config: RobertaConfig): + super(RobertaForMaskedLM, self).__init__(config) + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + self.tie_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + + Args: + input_ids (Tensor): + See :class:`RobertaModel`. + token_type_ids (Tensor, optional): + See :class:`RobertaModel`. + position_ids (Tensor, optional): + See :class:`RobertaModel`. + attention_mask (Tensor, optional): + See :class:`RobertaModel`. + inputs_embeds (Tensor, optional): + See :class:`RobertaModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., vocab_size]` + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RobertaForMaskedLM, RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + model = RobertaForMaskedLM.from_pretrained('roberta-wwm-ext') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 13, 30522] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.roberta( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ( + ((masked_lm_loss,) + output) + if masked_lm_loss is not None + else (output[0] if len(output) == 1 else output) + ) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaLMHead(nn.Layer): + """Roberta Head for masked language modeling.""" + + def __init__(self, config: RobertaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + self.decoder = TransposedLinear(config.hidden_size, config.vocab_size) + # link bias to load pretrained weights + self.bias = self.decoder.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = F.gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + +class RobertaForCausalLM(RobertaPretrainedModel): + """ + Roberta Model with a `Causal language modeling` head on top. + + Args: + bert (:class:RobertaModel`): + An instance of :class:`RobertaModel`. + + """ + + def __init__(self, config: RobertaConfig): + super().__init__(config) + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + self.tie_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`RobertaModel`. + token_type_ids (Tensor, optional): + See :class:`RobertaModel`. + position_ids (Tensor, optional): + See :class:`RobertaModel`. + attention_mask (Tensor, optional): + See :class:`RobertaModel`. + inputs_embeds (Tensor, optional): + See :class:`RobertaModel`. 
+ past_key_values (tuple(tuple(Tensor)), optional): + See :class:`RobertaModel`. + use_cache (Tensor, optional): + See :class:`RobertaModel`. + attention_mask (Tensor, optional): + See :class:`RobertaModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RobertaForCausalLM, RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + model = RobertaForCausalLM.from_pretrained('roberta-wwm-ext') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 13, 30522] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + labels = labels[:, 1:] + loss_fct = paddle.nn.CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else (output[0] if len(output) == 1 else output) + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if 
past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/tokenizer.py new file mode 100644 index 000000000..4ef53d5c6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roberta/tokenizer.py @@ -0,0 +1,628 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +import json +import os + +from paddle.utils import try_import + +from paddlenlp.utils.download import resolve_file_path + +from .. import ( + AddedToken, + BasicTokenizer, + GPTTokenizer, + PretrainedTokenizer, + WordpieceTokenizer, +) +from ..gpt.tokenizer import bytes_to_unicode + +__all__ = ["RobertaTokenizer", "RobertaChineseTokenizer", "RobertaBPETokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "hfl/roberta-wwm-ext": 512, + "hfl/roberta-wwm-ext-large": 512, + "hfl/rbt6": 512, + "hfl/rbt4": 512, + "hfl/rbt3": 512, + "hfl/rbtl3": 512, +} + + +class RobertaChineseTokenizer(PretrainedTokenizer): + """ + Constructs a RoBerta tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import RobertaTokenizer + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + + tokens = tokenizer('He was a puppeteer') + #{'input_ids': [101, 9245, 9947, 143, 11227, 9586, 8418, 8854, 8180, 102], + #'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}、 + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "hfl/roberta-wwm-ext": "https://bj.bcebos.com/paddlenlp/models/transformers/roberta_base/vocab.txt", + "hfl/roberta-wwm-ext-large": "https://bj.bcebos.com/paddlenlp/models/transformers/roberta_large/vocab.txt", + "hfl/rbt6": "https://bj.bcebos.com/paddlenlp/models/transformers/rbt6/vocab.txt", + "hfl/rbt4": "https://bj.bcebos.com/paddlenlp/models/transformers/rbt4/vocab.txt", + "hfl/rbt3": "https://bj.bcebos.com/paddlenlp/models/transformers/rbt3/vocab.txt", + "hfl/rbtl3": "https://bj.bcebos.com/paddlenlp/models/transformers/rbtl3/vocab.txt", + } + } + pretrained_init_configuration = { + "hfl/roberta-wwm-ext": {"do_lower_case": True}, + "hfl/roberta-wwm-ext-large": {"do_lower_case": True}, + "hfl/rbt6": {"do_lower_case": True}, + "hfl/rbt4": {"do_lower_case": True}, + "hfl/rbt3": {"do_lower_case": True}, + "hfl/rbtl3": {"do_lower_case": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab._token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + """ + End-to-end tokenization for Roberta models. + + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. 
+ + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RobertaTokenizer + + tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext') + tokens = tokenizer.tokenize('He was a puppeteer') + ''' + ['he', 'was', 'a', 'puppet', '##eer'] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + he was a puppeteer + ''' + """ + + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A RoBERTa sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A RoBERTa offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A RoBERTa sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). 
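A sketch of how the helpers above compose for a sequence pair; the two sentences are arbitrary and only the id bookkeeping matters:

.. code-block::

    from paddlenlp.transformers import RobertaTokenizer

    tokenizer = RobertaTokenizer.from_pretrained('roberta-wwm-ext')

    ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("he was a puppeteer"))
    ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("she was not"))

    # [CLS] A [SEP] B [SEP]
    input_ids = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)
    # 0s cover [CLS] A [SEP]; 1s cover B [SEP].
    token_type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)

    assert len(input_ids) == len(token_type_ids)
    print(token_type_ids)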
+ """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + +class RobertaBPETokenizer(GPTTokenizer): + """ + Constructs a Roberta tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.GPTTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocab file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import RobertaBPETokenizer + tokenizer = RobertaBPETokenizer.from_pretrained('roberta-base') + + tokens = tokenizer('This is a simple Paddle') + #{'input_ids': [0, 713, 16, 10, 2007, 221, 33151, 2], + #'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0]} + """ + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} # for save_pretrained + + pretrained_resource_files_map = { + "vocab_file": { + "roberta-base": "https://bj.bcebos.com/paddlenlp/models/community/roberta-base/vocab.json", + "roberta-large": "https://bj.bcebos.com/paddlenlp/models/community/roberta-large/vocab.json", + }, + "merges_file": { + "roberta-base": "https://bj.bcebos.com/paddlenlp/models/community/roberta-base/merges.txt", + "roberta-large": "https://bj.bcebos.com/paddlenlp/models/community/roberta-large/merges.txt", + }, + } + pretrained_init_configuration = { + "roberta-base": {}, + "roberta-large": {}, + } + max_model_input_sizes = { + "roberta-base": 512, + "roberta-large": 512, + } + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + **kwargs + ): + + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. 
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self._build_special_tokens_map_extended( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + ) + + self._vocab_file = vocab_file + self._merges_file = merges_file + self.num_command_tokens = 2 + self.num_type_tokens = 2 + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + + self.num_tokens = len(self.encoder) + self.num_text_tokens = self.num_tokens - 1 + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_data = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_data] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + re = try_import("regex") + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification + tasks by concatenating and adding special tokens. + """ + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + if token_ids_1 is None: + return _cls + token_ids_0 + _sep + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def get_offset_mapping(self, text): + tokens = self.tokenize(text) + offset_mapping = [] + offset = 0 + for token in tokens: + if token[0] == "Ġ": + offset_mapping.append((offset + 1, offset + len(token))) + else: + offset_mapping.append((offset, offset + len(token))) + offset += len(token) + + return offset_mapping + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A Roberta offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) (0,0) B (0,0)`` + + Args: + offset_mapping_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. 
+ already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def convert_tokens_to_string(self, tokens): + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + +class RobertaTokenizer: + """ + RobertaTokenizer is a generic tokenizer class that will be instantiated as either + RobertaChineseTokenizer or RobertaBPETokenizer when created with the RobertaTokenizer.from_pretrained() class method. 
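A dispatch sketch matching the `from_pretrained` logic below; both names are built-in checkpoints, so network access or a local model cache is assumed:

.. code-block::

    from paddlenlp.transformers import RobertaTokenizer

    # Built-in Chinese checkpoints resolve to the WordPiece tokenizer ...
    zh_tokenizer = RobertaTokenizer.from_pretrained('hfl/roberta-wwm-ext')
    # ... while the English checkpoints resolve to the byte-level BPE tokenizer.
    en_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    print(type(zh_tokenizer).__name__)  # RobertaChineseTokenizer
    print(type(en_tokenizer).__name__)  # RobertaBPETokenizer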
+ """ + + chinese_model_names = RobertaChineseTokenizer.pretrained_init_configuration.keys() + english_model_names = RobertaBPETokenizer.pretrained_init_configuration.keys() + tokenizer_config_file = "tokenizer_config.json" + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + # From built-in pretrained models + if pretrained_model_name_or_path in cls.chinese_model_names: + return RobertaChineseTokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif pretrained_model_name_or_path in cls.english_model_names: + return RobertaBPETokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + # From local dir path + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, cls.tokenizer_config_file) + if os.path.exists(config_file): + with io.open(config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + # class name corresponds to this configuration + init_class = init_kwargs.pop("init_class", None) + if init_class == "RobertaBPETokenizer": + return RobertaBPETokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + if init_class == "RobertaChineseTokenizer" or init_class == "BertTokenizer": + return RobertaChineseTokenizer.from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + return RobertaBPETokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + # Assuming from community-contributed pretrained models + + subfolder = kwargs.pop("subfolder", None) + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + from_aistudio = kwargs.pop("from_aistudio", False) + from_hf_hub = kwargs.pop("from_hf_hub", False) + + resolved_config_file = resolve_file_path( + pretrained_model_name_or_path, + [cls.tokenizer_config_file], + subfolder, + cache_dir=cache_dir, + force_download=force_download, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + assert ( + resolved_config_file is not None + ), f"please make sure {cls.tokenizer_config_file} under {pretrained_model_name_or_path}" + + with io.open(resolved_config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + + init_class = init_kwargs.pop("init_class", None) + if init_class == "RobertaBPETokenizer": + return RobertaBPETokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif init_class == "RobertaChineseTokenizer" or init_class == "BertTokenizer": + return RobertaChineseTokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + return RobertaBPETokenizer.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/__init__.py new file mode 100644 index 000000000..3bd752713 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/configuration.py new file mode 100644 index 000000000..d34adfce0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/configuration.py @@ -0,0 +1,325 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ROFORMER model configuration""" +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["ROFORMER_PRETRAINED_INIT_CONFIGURATION", "RoFormerConfig", "ROFORMER_PRETRAINED_RESOURCE_FILES_MAP"] + +ROFORMER_PRETRAINED_INIT_CONFIGURATION = { + "roformer-chinese-small": { + "vocab_size": 50000, + "embedding_size": 384, + "hidden_size": 384, + "num_hidden_layers": 6, + "num_attention_heads": 6, + "intermediate_size": 1536, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "rotary_value": False, + }, + "roformer-chinese-base": { + "vocab_size": 50000, + "embedding_size": 768, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 1536, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "rotary_value": False, + }, + "roformer-chinese-char-small": { + "vocab_size": 12000, + "embedding_size": 384, + "hidden_size": 384, + "num_hidden_layers": 6, + "num_attention_heads": 6, + "intermediate_size": 1536, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "rotary_value": False, + }, + "roformer-chinese-char-base": { + "vocab_size": 12000, + "embedding_size": 768, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + 
"pad_token_id": 0, + "rotary_value": False, + }, + "roformer-chinese-sim-char-ft-small": { + "vocab_size": 12000, + "embedding_size": 384, + "hidden_size": 384, + "num_hidden_layers": 6, + "num_attention_heads": 6, + "intermediate_size": 1536, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "eos_token_id": 102, + "rotary_value": False, + "pool_act": "linear", + }, + "roformer-chinese-sim-char-ft-base": { + "vocab_size": 12000, + "embedding_size": 768, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "eos_token_id": 102, + "rotary_value": False, + "pool_act": "linear", + }, + "roformer-chinese-sim-char-small": { + "vocab_size": 12000, + "embedding_size": 384, + "hidden_size": 384, + "num_hidden_layers": 6, + "num_attention_heads": 6, + "intermediate_size": 1536, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "eos_token_id": 102, + "rotary_value": False, + "pool_act": "linear", + }, + "roformer-chinese-sim-char-base": { + "vocab_size": 12000, + "embedding_size": 768, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "eos_token_id": 102, + "rotary_value": False, + "pool_act": "linear", + }, + "roformer-english-small-discriminator": { + "vocab_size": 30522, + "embedding_size": 128, + "hidden_size": 256, + "num_hidden_layers": 12, + "num_attention_heads": 4, + "intermediate_size": 1024, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 128, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "rotary_value": True, + }, + "roformer-english-small-generator": { + "vocab_size": 30522, + "embedding_size": 128, + "hidden_size": 64, + "num_hidden_layers": 12, + "num_attention_heads": 1, + "intermediate_size": 256, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 128, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + "rotary_value": True, + }, +} + +ROFORMER_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "roformer-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-small/model_state.pdparams", + "roformer-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-base/model_state.pdparams", + "roformer-chinese-char-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-char-small/model_state.pdparams", + "roformer-chinese-char-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-char-base/model_state.pdparams", + "roformer-chinese-sim-char-ft-small": 
"https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-ft-small/model_state.pdparams", + "roformer-chinese-sim-char-ft-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-ft-base/model_state.pdparams", + "roformer-chinese-sim-char-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-small/model_state.pdparams", + "roformer-chinese-sim-char-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-base/model_state.pdparams", + "roformer-english-small-discriminator": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-english-small-discriminator/model_state.pdparams", + "roformer-english-small-generator": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-english-small-generator/model_state.pdparams", + } +} + + +class RoFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RoFormerModel`]. It is used to + instantiate a RoFormer model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the RoFormer + roformer-chinese-base architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the RoFormer model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`RoFormer`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 1536): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 1536). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`RoFormerModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + pad_token_id (`int`, *optional*): + The index of padding token in the token vocabulary. + Defaults to `0`. + eos_token_id (`int`, *optional*): + The id of the `eos` token. Defaults to `102`. + pool_act (`str`, *optional*): + The non-linear activation function in the pooler. + Defaults to `"tanh"`. + rotary_value (`bool`, *optional*): + Whether or not apply rotay position embeddings to value. + Defaults to `False`. + + Examples: + + ```python + >>> from paddlenlp.transformers import RoFormerModel, RoFormerConfig + + >>> # Initializing a RoFormer roformer-chinese-base style configuration + >>> configuration = RoFormerConfig() + + >>> # Initializing a model from the roformer-chinese-base style configuration + >>> model = RoFormerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "roformer" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = ROFORMER_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + embedding_size: int = 768, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 1536, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + pad_token_id: int = 0, + pool_act: str = "tanh", + layer_norm_eps: float = 1e-12, + rotary_value: bool = False, + eos_token_id: int = 102, + use_cache=False, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + if embedding_size is None: + embedding_size = hidden_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pool_act = pool_act + self.rotary_value = rotary_value + + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/modeling.py new file mode 100644 index 000000000..95c6cbbd4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/modeling.py @@ -0,0 +1,1380 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Tuple + +import paddle +import paddle.nn as nn +from paddle import Tensor +from paddle.common_ops_import import convert_dtype + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. import PretrainedModel, register_base_model +from ..activations import get_activation +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, + tuple_output, +) + +__all__ = [ + "RoFormerModel", + "RoFormerPretrainedModel", + "RoFormerForSequenceClassification", + "RoFormerForTokenClassification", + "RoFormerForQuestionAnswering", + "RoFormerForMaskedLM", + "RoFormerForMultipleChoice", + "RoFormerForCausalLM", +] +from .configuration import ( + ROFORMER_PRETRAINED_INIT_CONFIGURATION, + ROFORMER_PRETRAINED_RESOURCE_FILES_MAP, + RoFormerConfig, +) + + +class RoFormerEmbeddings(nn.Layer): + """ + Include embeddings from word and token_type embeddings + """ + + def __init__( + self, + vocab_size, + embedding_size=768, + hidden_dropout_prob=0.1, + type_vocab_size=2, + ): + super().__init__() + self.word_embeddings = nn.Embedding(vocab_size, embedding_size) + self.token_type_embeddings = nn.Embedding(type_vocab_size, embedding_size) + self.layer_norm = nn.LayerNorm(embedding_size) + self.dropout = nn.Dropout(hidden_dropout_prob) + + def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None): + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if token_type_ids is None: + token_type_ids_shape = inputs_embeds.shape[:-1] + token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class RotaryPositionEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=512): + super().__init__() + inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2, dtype=paddle.get_default_dtype()) / dim)) + t = paddle.arange(max_position_embeddings, dtype=paddle.get_default_dtype()) + freqs = paddle.matmul(t.unsqueeze(1), inv_freq.unsqueeze(0)) + self.register_buffer("sin", freqs.sin(), persistable=False) + self.register_buffer("cos", freqs.cos(), persistable=False) + + def forward(self, x, offset=0): + # x shape [batch_size, num_heads, seqlen, head_dim] + seqlen = x.shape[-2] + sin, cos = ( + self.sin[offset : offset + seqlen, :], + self.cos[offset : offset + seqlen, :], + ) + x1, x2 = x[..., 0::2], x[..., 1::2] + # [cos_nθ, -sin_nθ] [x1] + # [sin_nθ, cos_nθ] [x2] + # => [x1 * cos_nθ - x2 * sin_nθ, x1 * sin_nθ + x2 * cos_nθ] + return paddle.stack([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1).flatten(-2, -1) + + +class MultiHeadAttentionWithRotary(nn.MultiHeadAttention): + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + rotary_value=False, + max_position_embeddings=512, + ): + super().__init__(embed_dim, num_heads, dropout, kdim, vdim, need_weights) + self.rotary_value = rotary_value + self.rotary = RotaryPositionEmbedding(self.head_dim, max_position_embeddings) + + def _prepare_qkv(self, query, key, value, cache=None): + 
q = self.q_proj(query) + q = paddle.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = paddle.transpose(x=q, perm=[0, 2, 1, 3]) + + k, v = self.compute_kv(key, value) + + offset = 0 if cache is None else cache.k.shape[2] + + # rotary q,k,v + q = self.rotary(q, offset=offset) + k = self.rotary(k, offset=offset) + if self.rotary_value: + v = self.rotary(v, offset=offset) + + if isinstance(cache, self.Cache): + # for decoder self-attention in inference + k = paddle.concat([cache.k, k], axis=2) + v = paddle.concat([cache.v, v], axis=2) + cache = self.Cache(k, v) + + return (q, k, v) if cache is None else (q, k, v, cache) + + +class TransformerEncoderLayerWithRotary(nn.TransformerEncoderLayer): + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + rotary_value=False, + max_position_embeddings=512, + **kwargs + ): + super().__init__( + d_model, + nhead, + dim_feedforward, + dropout=dropout, + activation=activation, + attn_dropout=attn_dropout, + act_dropout=act_dropout, + normalize_before=normalize_before, + ) + self.self_attn = MultiHeadAttentionWithRotary( + d_model, + nhead, + dropout=attn_dropout, + rotary_value=rotary_value, + max_position_embeddings=max_position_embeddings, + ) + self._config.update({"rotary_value": rotary_value, "max_position_embeddings": max_position_embeddings}) + + +class RoFormerPooler(nn.Layer): + def __init__(self, hidden_size, pool_act="tanh"): + super().__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = get_activation(pool_act) + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class RoFormerLMPredictionHead(nn.Layer): + def __init__(self, embedding_size, hidden_size, vocab_size, activation, embedding_weights=None): + super().__init__() + self.transform = nn.Linear(hidden_size, embedding_size) + self.activation = get_activation(activation) + self.layer_norm = nn.LayerNorm(embedding_size) + self.decoder_weight = ( + self.create_parameter( + shape=[vocab_size, embedding_size], + dtype=self.transform.weight.dtype, + is_bias=False, + ) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = paddle.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return hidden_states + + +class RoFormerOnlyMLMHead(nn.Layer): + def __init__(self, embedding_size, hidden_size, vocab_size, activation, embedding_weights): + super().__init__() + self.predictions = RoFormerLMPredictionHead( + embedding_size, + hidden_size=hidden_size, + vocab_size=vocab_size, + activation=activation, + embedding_weights=embedding_weights, + ) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class RoFormerPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained RoFormer models. 
It provides RoFormer related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + + """ + config_class = RoFormerConfig + resource_files_names = {"model_state": "model_state.pdparams"} + base_model_prefix = "roformer" + pretrained_init_configuration = ROFORMER_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ROFORMER_PRETRAINED_RESOURCE_FILES_MAP + + @classmethod + def _get_name_mappings(cls, config: RoFormerConfig) -> List[StateDictNameMapping]: + mappings: List[StateDictNameMapping] = [] + model_mappings = [ + "embeddings.word_embeddings.weight", + "embeddings.token_type_embeddings.weight", + ["embeddings.LayerNorm.weight", "embeddings.layer_norm.weight"], + ["embeddings.LayerNorm.bias", "embeddings.layer_norm.bias"], + ["pooler.dense.weight", None, "transpose"], + "pooler.dense.bias", + # for TokenClassification + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"encoder.layer.{layer_index}.attention.self.query.weight", + f"encoder.layers.{layer_index}.self_attn.q_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.query.bias", + f"encoder.layers.{layer_index}.self_attn.q_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.weight", + f"encoder.layers.{layer_index}.self_attn.k_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.key.bias", + f"encoder.layers.{layer_index}.self_attn.k_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.weight", + f"encoder.layers.{layer_index}.self_attn.v_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.self.value.bias", + f"encoder.layers.{layer_index}.self_attn.v_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.weight", + f"encoder.layers.{layer_index}.self_attn.out_proj.weight", + "transpose", + ], + [ + f"encoder.layer.{layer_index}.attention.output.dense.bias", + f"encoder.layers.{layer_index}.self_attn.out_proj.bias", + ], + [ + f"encoder.layer.{layer_index}.intermediate.dense.weight", + f"encoder.layers.{layer_index}.linear1.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.intermediate.dense.bias", f"encoder.layers.{layer_index}.linear1.bias"], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.weight", + f"encoder.layers.{layer_index}.norm1.weight", + ], + [ + f"encoder.layer.{layer_index}.attention.output.LayerNorm.bias", + f"encoder.layers.{layer_index}.norm1.bias", + ], + [ + f"encoder.layer.{layer_index}.output.dense.weight", + f"encoder.layers.{layer_index}.linear2.weight", + "transpose", + ], + [f"encoder.layer.{layer_index}.output.dense.bias", f"encoder.layers.{layer_index}.linear2.bias"], + [f"encoder.layer.{layer_index}.output.LayerNorm.weight", f"encoder.layers.{layer_index}.norm2.weight"], + [f"encoder.layer.{layer_index}.output.LayerNorm.bias", f"encoder.layers.{layer_index}.norm2.bias"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(model_mappings) + + # base-model prefix "RoFormerModel" + if "RoFormerModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "roformer." + mapping[0] + mapping[1] = "roformer." 
+ mapping[1] + + if "RoFormerForMaskedLM" in config.architectures: + model_mappings.extend( + [ + ["cls.predictions.transform.dense.weight", "cls.predictions.transform.weight", "transpose"], + ["cls.predictions.transform.dense.bias", "cls.predictions.transform.bias"], + ["cls.predictions.transform.LayerNorm.weight", "cls.predictions.layer_norm.weight"], + ["cls.predictions.transform.LayerNorm.bias", "cls.predictions.layer_norm.bias"], + ["cls.predictions.decoder.bias", "cls.predictions.decoder_bias"], + ] + ) + # downstream mappings + if "RoFormerForQuestionAnswering" in config.architectures: + model_mappings.extend( + [["qa_outputs.weight", "classifier.weight", "transpose"], ["qa_outputs.bias", "classifier.bias"]] + ) + if ( + "RoFormerForMultipleChoice" in config.architectures + or "RoFormerForSequenceClassification" in config.architectures + or "RoFormerForTokenClassification" in config.architectures + ): + model_mappings.extend([["classifier.weight", None, "transpose"]]) + + init_name_mappings(model_mappings) + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class RoFormerModel(RoFormerPretrainedModel): + """ + The bare RoFormerModel outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerModel. 
+ """ + + def __init__( + self, + config: RoFormerConfig, + ): + super().__init__(config) + self.pad_token_id = config.pad_token_id + self.eos_token_id = config.eos_token_id + self.initializer_range = config.initializer_range + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + self.embeddings = RoFormerEmbeddings( + config.vocab_size, + config.embedding_size, + config.hidden_dropout_prob, + config.type_vocab_size, + ) + encoder_layer = TransformerEncoderLayerWithRotary( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + rotary_value=config.rotary_value, + max_position_embeddings=config.max_position_embeddings, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = RoFormerPooler(config.hidden_size, config.pool_act) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate first and second portions of the inputs. + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings is added to token embeddings. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. 
+ past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerModel, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-char-base') + model = RoFormerModel.from_pretrained('roformer-chinese-char-base') + + tokenized_inputs = tokenizer("欢迎使用百度飞桨!", return_tensors="pd") + output = model(**tokenized_inputs) + + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + use_cache = use_cache if use_cache is not None else self.config.use_cache + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + embedding_output = self.embeddings( + input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + if hasattr(self, "embeddings_project"): + embedding_output = self.embeddings_project(embedding_output) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = 
self.encoder( + embedding_output, + src_mask=attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class RoFormerForQuestionAnswering(RoFormerPretrainedModel): + r""" + RoFormer Model with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerForQuestionAnswering. + """ + + def __init__(self, config: RoFormerConfig): + super().__init__(config) + self.roformer = RoFormerModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + inputs_embeds(Tensor, optional): + See :class:`RoFormerModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` if `return_dict=True`. 
+ Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForQuestionAnswering, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-char-base') + model = RoFormerForQuestionAnswering.from_pretrained('roformer-chinese-char-base') + + tokenized_inputs = tokenizer("欢迎使用百度飞桨!", return_tensors="pd") + outputs = model(**tokenized_inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + start_logits, end_logits = paddle.unstack(x=logits, axis=-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return tuple_output(output, total_loss) + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RoFormerForSequenceClassification(RoFormerPretrainedModel): + r""" + RoFormer Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerForSequenceClassification. + """ + + def __init__(self, config: RoFormerConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.roformer = RoFormerModel(config) + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. 
+ inputs_embeds(Tensor, optional): + See :class:`RoFormerModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForSequenceClassification, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-char-base') + model = RoFormerForSequenceClassification.from_pretrained('roformer-chinese-char-base') + + tokenized_inputs = tokenizer("欢迎使用百度飞桨!", return_tensors="pd") + logits = model(**tokenized_inputs) + + """ + outputs = self.roformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RoFormerForTokenClassification(RoFormerPretrainedModel): + r""" + RoFormer Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerForTokenClassification. 
+ """ + + def __init__(self, config: RoFormerConfig): + super().__init__(config) + self.roformer = RoFormerModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + inputs_embeds(Tensor, optional): + See :class:`RoFormerModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForTokenClassification, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-char-base') + model = RoFormerForTokenClassification.from_pretrained('roformer-chinese-char-base') + + tokenized_inputs = tokenizer("欢迎使用百度飞桨!", return_tensors="pd") + logits = model(**tokenized_inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.roformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RoFormerForMultipleChoice(RoFormerPretrainedModel): + """ + RoFormerModel with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. 
+ + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerForMultipleChoice. + """ + + def __init__(self, config: RoFormerConfig): + super().__init__(config) + self.roformer = RoFormerModel(config) + self.num_choices = config.num_choices + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (Tensor, optional): + See :class:`RoFormerModel` and shape as [batch_size, num_choice, sequence_length]. + inputs_embeds(Tensor, optional): + See :class:`RoFormerModel`. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForMultipleChoice, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-char-base') + model = RoFormerForMultipleChoice.from_pretrained('roformer-chinese-char-base') + + data = [ + { + "question": "如何打开ipad屏幕?", + "answer1": "按音量按钮。", + "answer2": "按下锁定按钮。", + "label": 1, + }, + { + "question": "如何缩进一些文本?", + "answer1": "在开始写之前留一些空格。", + "answer2": "按空格键。", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + tokenized_inputs = tokenizer(text, text_pair, padding=True, return_tensors="pd") + reshaped_logits = model(**tokenized_inputs) + print(reshaped_logits.shape) + # [2, 2] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_ids = input_ids.reshape((-1, input_ids.shape[-1])) if input_ids is not None else None + token_type_ids = token_type_ids.reshape((-1, token_type_ids.shape[-1])) if token_type_ids is not None else None + attention_mask = attention_mask.reshape((-1, attention_mask.shape[-1])) if attention_mask is not None else None + + outputs = self.roformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.reshape((-1, self.num_choices)) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return tuple_output(output, loss) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RoFormerForMaskedLM(RoFormerPretrainedModel): + """ + RoFormer Model with a `masked language modeling` head on top. + + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerForMaskedLM. + + """ + + def __init__(self, config: RoFormerConfig): + super().__init__(config) + self.roformer = RoFormerModel(config) + self.cls = RoFormerOnlyMLMHead( + config.embedding_size, + config.hidden_size, + config.vocab_size, + config.hidden_act, + embedding_weights=self.roformer.embeddings.word_embeddings.weight, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerForMaskedLM forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. + attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + inputs_embeds(Tensor, optional): + See :class:`RoFormerModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., + vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., vocab_size]` + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.MaskedLMOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForMaskedLM, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-char-base') + model = RoFormerForMaskedLM.from_pretrained('roformer-chinese-char-base') + + tokenized_inputs = tokenizer("欢迎使用百度飞桨!", return_tensors="pd") + logits = model(**tokenized_inputs) + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct( + prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return tuple_output(output, masked_lm_loss) + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RoFormerForCausalLM(RoFormerPretrainedModel): + """ + RoFormer Model with a `Causal language modeling` head on top. + + Args: + config (:class:`RoFormerConfig`): + An instance of RoFormerConfig used to construct RoFormerForCausalLM. + + """ + + def __init__(self, config: RoFormerConfig): + super().__init__(config) + self.roformer = RoFormerModel(config) + self.cls = RoFormerOnlyMLMHead( + config.embedding_size, + config.hidden_size, + config.vocab_size, + config.hidden_act, + embedding_weights=self.roformer.embeddings.word_embeddings.weight, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The RoFormerForCausalLM forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerModel`. + token_type_ids (Tensor, optional): + See :class:`RoFormerModel`. 
+ attention_mask (Tensor, optional): + See :class:`RoFormerModel`. + inputs_embeds(Tensor, optional): + See :class:`RoFormerModel`. + labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., vocab_size]`. + past_key_values (tuple(tuple(Tensor)), optional): + See :class:`RoFormerModel`. + use_cache (Tensor, optional): + See :class:`RoFormerModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerForCausalLM, RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-sim-char-ft-base') + model = RoFormerForCausalLM.from_pretrained('roformer-chinese-sim-char-ft-base') + + tokenized_inputs = tokenizer("欢迎使用百度飞桨!", return_tensors="pd") + logits = model(**tokenized_inputs) + print(logits.shape) + # [1, 11, 12000] + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + labels = labels[:, 1:] + loss_fct = paddle.nn.CrossEntropyLoss() + lm_loss = loss_fct( + shifted_prediction_scores.reshape((-1, prediction_scores.shape[-1])), labels.reshape((-1,)) + ) + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return tuple_output(output, lm_loss) + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, use_cache=False, cache=None, **kwargs): + # only last token for inputs_ids if past is defined in kwargs + token_type_ids = kwargs.get("token_type_ids", None) + attention_mask = kwargs.get("attention_mask", None) + + if attention_mask is not None: + if "int" in convert_dtype(attention_mask.dtype): + attention_mask = (1.0 - attention_mask) * -1e4 + + if cache is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + 
token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + if attention_mask.ndim == 4: + attention_mask = attention_mask[:, -1, -1, :].unsqueeze([1, 2]) + + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "past_key_values": cache, + "use_cache": use_cache, + } + + @staticmethod + def update_model_kwargs_for_generation(outputs, model_kwargs, is_encoder_decoder=False): + # Update the model inputs during generation. + # Note that If `token_type_ids` and `attention_mask` in `model_kwargs` + # and they contain pad value, the result vectors updated by this method + # may be different from expected. In this case, you need to rewrite the + # method. + + # update cache + if isinstance(outputs, tuple): + model_kwargs["cache"] = outputs[1] + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None: + token_type_ids = model_kwargs["token_type_ids"] + # token type id = 1 + model_kwargs["token_type_ids"] = paddle.concat( + [token_type_ids, paddle.ones_like(token_type_ids[:, -1:])], axis=-1 + ) + + # update attention_mask + if not is_encoder_decoder and "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + # nn.Pad2D don't support the data type `bool` + if convert_dtype(attention_mask.dtype) == "bool": + attention_mask = paddle.cast(attention_mask, "int64") + if len(attention_mask.shape) == 4: + attention_mask = attention_mask.expand((-1, -1, attention_mask.shape[-1], -1)) + attention_mask = nn.Pad2D([0, 0, 0, 1], mode="replicate")(attention_mask) + attention_mask = nn.Pad2D([0, 1, 0, 0], value=-1e4)(attention_mask) + dtype = convert_dtype(attention_mask.dtype) + if "int" in dtype: + attention_mask[:, :, -1, -1] = 1 + elif "float" in dtype: + attention_mask[:, :, -1, -1] = 0.0 + else: + raise ValueError("The data type of input `attention_mask` must " "be bool, int or float") + else: + # convert to 4D attention_mask + attention_mask = paddle.concat( + [attention_mask, paddle.ones([attention_mask.shape[0], 1], dtype="int64")], axis=-1 + ) + if "int" in convert_dtype(attention_mask.dtype): + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask = attention_mask.unsqueeze([1, 2]).expand((-1, -1, attention_mask.shape[-1], -1)) + + token_type_ids = model_kwargs["token_type_ids"] + mask = token_type_ids[:, None, :] > token_type_ids[:, :, None] + # we need expand attention_mask + attention_mask = paddle.where(mask.unsqueeze(1), paddle.to_tensor(-1e4), attention_mask) + model_kwargs["attention_mask"] = attention_mask + + return model_kwargs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/tokenizer.py new file mode 100644 index 000000000..a7fef8037 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformer/tokenizer.py @@ -0,0 +1,381 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import jieba +from ..bert.tokenizer import BasicTokenizer, WordpieceTokenizer +from ..tokenizer_utils import PretrainedTokenizer + +__all__ = ["RoFormerTokenizer", "JiebaBasicTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "roformer-chinese-small": 512, + "roformer-chinese-base": 1536, + "roformer-chinese-char-small": 512, + "roformer-chinese-char-base": 512, + "roformer-chinese-sim-char-ft-small": 512, + "roformer-chinese-sim-char-ft-base": 512, + "roformer-chinese-sim-char-small": 512, + "roformer-chinese-sim-char-base": 512, + "roformer-english-small-discriminator": 128, + "roformer-english-small-generator": 128, +} + + +class JiebaBasicTokenizer(BasicTokenizer): + """ + Runs basic tokenization with jieba (punctuation splitting, lower casing, jieba pretokenizer etc). + + Args: + vocab (:class:`paddlenlp.data.Vocab`): An instance of paddlenlp.data.Vocab. + do_lower_case (bool): + Whether the text strips accents and converts to lower case. + If you use the RoFormer Pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to `True`. + """ + + def __init__(self, vocab, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + """Constructs a JiebaBasicTokenizer.""" + super().__init__( + never_split=never_split, + do_lower_case=do_lower_case, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.vocab = vocab + + def _tokenize_chinese_chars(self, text): + output = [] + for wholeword in jieba.cut(text, HMM=False): + if wholeword in self.vocab: + output.append(" ") + output.append(wholeword) + output.append(" ") + else: + for char in wholeword: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class RoFormerTokenizer(PretrainedTokenizer): + """ + Constructs a RoFormer tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing, jieba pretokenizer and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool,optional): + Whether or not to lowercase the input when tokenizing. + If you use the RoFormer pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to`True`. + use_jieba (bool,optional): + Whether or not to tokenize the text with jieba. Default: False. + unk_token (str,optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str,optional): + A special token separating two different sentences in the same input. 
+ Defaults to "[SEP]". + pad_token (str,optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str,optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str,optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + + tokens = tokenizer('欢迎使用百度飞桨') + ''' + {'input_ids': [101, 22355, 8994, 25854, 5438, 2473, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + # chinese word level model + "roformer-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-small/vocab.txt", + "roformer-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-base/vocab.txt", + # chinese char level model + "roformer-chinese-char-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-char-small/vocab.txt", + "roformer-chinese-char-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-char-base/vocab.txt", + "roformer-chinese-sim-char-ft-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-ft-small/vocab.txt", + "roformer-chinese-sim-char-ft-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-ft-base/vocab.txt", + "roformer-chinese-sim-char-small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-small/vocab.txt", + "roformer-chinese-sim-char-base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-chinese-sim-char-base/vocab.txt", + # english + "roformer-english-small-discriminator": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-english-small-discriminator/vocab.txt", + "roformer-english-small-generator": "https://bj.bcebos.com/paddlenlp/models/transformers/roformer/roformer-english-small-generator/vocab.txt", + } + } + max_model_input_sizes = { + "roformer-chinese-small": 512, + "roformer-chinese-base": 1536, + "roformer-chinese-char-small": 512, + "roformer-chinese-char-base": 512, + "roformer-chinese-sim-char-ft-small": 512, + "roformer-chinese-sim-char-ft-base": 512, + "roformer-chinese-sim-char-small": 512, + "roformer-chinese-sim-char-base": 512, + "roformer-english-small-discriminator": 128, + "roformer-english-small-generator": 128, + } + pretrained_init_configuration = { + "roformer-chinese-small": {"do_lower_case": True, "use_jieba": True}, + "roformer-chinese-base": {"do_lower_case": True, "use_jieba": True}, + "roformer-chinese-char-small": {"do_lower_case": True, "use_jieba": False}, + "roformer-chinese-char-base": {"do_lower_case": True, "use_jieba": False}, + "roformer-chinese-sim-char-ft-small": {"do_lower_case": True, "use_jieba": False}, + "roformer-chinese-sim-char-ft-base": {"do_lower_case": True, "use_jieba": False}, + "roformer-chinese-sim-char-small": {"do_lower_case": True, "use_jieba": False}, + "roformer-chinese-sim-char-base": {"do_lower_case": 
True, "use_jieba": False}, + "roformer-english-small-discriminator": {"do_lower_case": True, "use_jieba": False}, + "roformer-english-small-generator": {"do_lower_case": True, "use_jieba": False}, + } + padding_side = "right" + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + use_jieba=False, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = RoFormerTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + if use_jieba: + self.basic_tokenizer = JiebaBasicTokenizer(vocab=self.vocab, do_lower_case=do_lower_case) + else: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for RoFormer models. + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) in a single string. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨') + #['欢迎', '使用', '百度', '飞', '桨'] + strings = tokenizer.convert_tokens_to_string(tokens) + #'欢迎 使用 百度 飞 桨' + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A Roformer sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. 
+ """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A RoFormer offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: `(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: List of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A RoFormer sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + return list( + map( + lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, + token_ids_0, + ) + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/__init__.py new file mode 100644 index 000000000..3bd752713 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/configuration.py new file mode 100644 index 000000000..5b150b215 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/configuration.py @@ -0,0 +1,122 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
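# --- Editor's sketch (illustration only, not part of the patch) ----------------------
# How the RoFormer tokenizer methods above lay out a sentence pair. The IDs below are
# hypothetical placeholders; only the [CLS] A [SEP] B [SEP] structure, the 0/1 segment
# split, and the special-tokens mask mirror build_inputs_with_special_tokens,
# create_token_type_ids_from_sequences, and get_special_tokens_mask.
token_ids_0 = [11, 12, 13]      # sentence A (hypothetical ids)
token_ids_1 = [21, 22]          # sentence B (hypothetical ids)
cls_id, sep_id = 101, 102       # hypothetical [CLS] / [SEP] ids

input_ids = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]
token_type_ids = [0] * (len(token_ids_0) + 2) + [1] * (len(token_ids_1) + 1)
special_tokens_mask = [1] + [0] * len(token_ids_0) + [1] + [0] * len(token_ids_1) + [1]
assert len(input_ids) == len(token_type_ids) == len(special_tokens_mask) == 8
# --------------------------------------------------------------------------------------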
+ +""" RoFormerv2 model configuration """ +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["RoFormerv2Config", "ROFORMERV2_PRETRAINED_INIT_CONFIGURATION", "ROFORMERV2_PRETRAINED_RESOURCE_FILES_MAP"] + +ROFORMERV2_PRETRAINED_INIT_CONFIGURATION = { + "roformer_v2_chinese_char_small": { + "vocab_size": 12000, + "hidden_size": 384, + "num_hidden_layers": 6, + "num_attention_heads": 6, + "intermediate_size": 1536, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "pad_token_id": 0, + "rotary_value": False, + "use_bias": False, + }, + "roformer_v2_chinese_char_base": { + "vocab_size": 12000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "pad_token_id": 0, + "rotary_value": False, + "use_bias": False, + }, + "roformer_v2_chinese_char_large": { + "vocab_size": 12000, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "pad_token_id": 0, + "rotary_value": False, + "use_bias": False, + }, +} + +ROFORMERV2_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "roformer_v2_chinese_char_small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformerv2/roformer_v2_chinese_char_small/model_state.pdparams", + "roformer_v2_chinese_char_base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformerv2/roformer_v2_chinese_char_base/model_state.pdparams", + "roformer_v2_chinese_char_large": "https://bj.bcebos.com/paddlenlp/models/transformers/roformerv2/roformer_v2_chinese_char_large/model_state.pdparams", + } +} + + +class RoFormerv2Config(PretrainedConfig): + model_type = "roformerv2" + pretrained_init_configuration = ROFORMERV2_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 12000, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "relu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + act_dropout: float = 0, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + pad_token_id: int = 0, + rotary_value: bool = False, + use_bias: bool = False, + epsilon: float = 1e-12, + normalize_before: bool = False, + num_choices: int = 2, + **kwargs + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.act_dropout = act_dropout + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.pad_token_id = pad_token_id + self.rotary_value = rotary_value + self.use_bias = use_bias + self.epsilon = epsilon + self.normalize_before = normalize_before + self.num_choices = num_choices diff --git 
a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/modeling.py new file mode 100644 index 000000000..e727109ab --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/modeling.py @@ -0,0 +1,802 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import tensor +from paddle.nn import Layer + +from .. import PretrainedModel, register_base_model +from .configuration import ( + ROFORMERV2_PRETRAINED_INIT_CONFIGURATION, + ROFORMERV2_PRETRAINED_RESOURCE_FILES_MAP, + RoFormerv2Config, +) + +__all__ = [ + "RoFormerv2Model", + "RoFormerv2ForMaskedLM", + "RoFormerv2PretrainedModel", + "RoFormerv2ForSequenceClassification", + "RoFormerv2ForTokenClassification", + "RoFormerv2ForQuestionAnswering", + "RoFormerv2ForMultipleChoice", +] + + +class Norm(Layer): + def __init__(self, epsilon=1e-12): + super().__init__() + self._epsilon = epsilon + + def forward(self, x): + variance = paddle.mean(paddle.square(x), axis=-1, keepdim=True) + return x / paddle.sqrt(variance + self._epsilon) + + +def initializer(tensor, num_hidden_layers=12, order=2, gain=1.0): + """ + https://github.com/bojone/bert4keras/blob/5572ed481a14f5a62be7107e3846c88a5d6b617d/bert4keras/models.py#L1226-L1235 + """ + shape = tensor.shape + if shape[0] > 10000 or shape[0] < 10: + hidden_size = shape[1] + else: + hidden_size = shape[0] + gain *= num_hidden_layers ** (-1.0 / order) + std = 1.13684723 / hidden_size**0.5 * gain + + return nn.initializer.TruncatedNormal(std=std) + + +def _convert_attention_mask(attn_mask, dtype): + if attn_mask is not None and attn_mask.dtype != dtype: + attn_mask_dtype = attn_mask.dtype + if attn_mask_dtype in [paddle.bool, paddle.int8, paddle.int16, paddle.int32, paddle.int64]: + attn_mask = (paddle.cast(attn_mask, dtype) - 1.0) * 1e4 + else: + attn_mask = paddle.cast(attn_mask, dtype) + return attn_mask + + +class RotaryPositionEmbedding(Layer): + def __init__(self, dim, max_position_embeddings=512): + super().__init__() + inv_freq = 1.0 / (10000 ** (paddle.arange(0, dim, 2, dtype=paddle.get_default_dtype()) / dim)) + t = paddle.arange(max_position_embeddings, dtype=paddle.get_default_dtype()) + freqs = paddle.matmul(t.unsqueeze(1), inv_freq.unsqueeze(0)) + self.register_buffer("sin", freqs.sin(), persistable=False) + self.register_buffer("cos", freqs.cos(), persistable=False) + + def forward(self, x, offset=0): + # x shape [batch_size, num_heads, seqlen, head_dim] + seqlen = x.shape[-2] + sin, cos = ( + self.sin[offset : offset + seqlen, :], + self.cos[offset : offset + seqlen, :], + ) + x1, x2 = x[..., 0::2], x[..., 1::2] + # [cos_nθ, -sin_nθ] [x1] + # [sin_nθ, cos_nθ] [x2] + # => [x1 * cos_nθ - x2 * sin_nθ, x1 * sin_nθ + x2 * cos_nθ] + return paddle.stack([x1 * cos - x2 * sin, x1 * sin + x2 
* cos], axis=-1).flatten(-2, -1) + + +class MultiHeadAttentionWithRotary(Layer): + def __init__( + self, + embed_dim, + num_heads, + dropout=0.0, + kdim=None, + vdim=None, + need_weights=False, + rotary_value=False, + max_position_embeddings=512, + ): + super(MultiHeadAttentionWithRotary, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.num_heads = num_heads + self.need_weights = need_weights + self.rotary_value = rotary_value + + self.head_dim = embed_dim // num_heads + self.scale = self.head_dim**-0.5 + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.dropout = nn.Dropout(dropout) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.k_proj = nn.Linear(self.kdim, embed_dim) + self.v_proj = nn.Linear(self.vdim, embed_dim) + self.out_proj = nn.Linear(embed_dim, embed_dim) + self.rotary = RotaryPositionEmbedding(self.head_dim, max_position_embeddings) + + def forward(self, query, key=None, value=None, attn_mask=None, cache=None): + key = query if key is None else key + value = query if value is None else value + + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + + q, k = self.rotary(q), self.rotary(k) + if self.rotary_value: + v = self.rotary(v) + + product = tensor.matmul(x=q, y=k, transpose_y=True) * self.scale + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, product.dtype) + product = product + attn_mask + + weights = F.softmax(product) + weights = self.dropout(weights) + out = tensor.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + return out if len(outs) == 1 else tuple(outs) + + +class TransformerEncoderLayerWithRotary(nn.TransformerEncoderLayer): + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + rotary_value=False, + max_position_embeddings=512, + **kwargs + ): + super().__init__( + d_model, + nhead, + dim_feedforward, + dropout=dropout, + activation=activation, + attn_dropout=attn_dropout, + act_dropout=act_dropout, + normalize_before=normalize_before, + ) + self.self_attn = MultiHeadAttentionWithRotary( + d_model, + nhead, + dropout=attn_dropout, + rotary_value=rotary_value, + max_position_embeddings=max_position_embeddings, + ) + self.norm1 = Norm() + self.norm2 = Norm() + self._config.update({"rotary_value": rotary_value, "max_position_embeddings": max_position_embeddings}) + + +class RoFormerv2Embeddings(Layer): + """ + Include embeddings from word and token_type embeddings + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2Embeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, 
config.hidden_size) + self.norm = Norm(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids) + + input_embedings = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embedings + token_type_embeddings + embeddings = self.norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class RoFormerv2PretrainedModel(PretrainedModel): + """ + An abstract class for pretrained RoFormerv2 models. It provides RoFormerv2 related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + pretrained_init_configuration = ROFORMERV2_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = ROFORMERV2_PRETRAINED_RESOURCE_FILES_MAP + + base_model_prefix = "roformerv2" + config_class = RoFormerv2Config + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + num_hidden_layers = self.config.num_hidden_layers + initializer(layer.weight, num_hidden_layers, order=2, gain=1.0) + if isinstance(layer, nn.Linear): + use_bias = self.config.use_bias + if layer.bias is not None and not use_bias: + layer.bias = None + elif isinstance(layer, Norm): + layer._epsilon = 1e-12 + + +@register_base_model +class RoFormerv2Model(RoFormerv2PretrainedModel): + """ + The bare RoFormerv2 Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + vocab_size (int): + Vocabulary size of `inputs_ids` in `RoFormerv2Model`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `RoFormerv2Model`. + hidden_size (int, optional): + Dimensionality of the, encoder layers and pooler layer. Defaults to `768`. + num_hidden_layers (int, optional): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to `"relu"`. + hidden_dropout_prob (float, optional): + The dropout probability for all fully connected layers in the embeddings and encoder. 
+ Defaults to `0.1`. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + rotary_value (`bool`, optional): + Whether or not apply rotay position embeddings to value. + Defaults to `False`. + use_bias (`bool`, optional): + Whether or not use bias. + Defaults to `False`. + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2Model, self).__init__(config) + self.pad_token_id = config.pad_token_id + self.num_hidden_layers = config.num_hidden_layers + self.use_bias = config.use_bias + self.embeddings = RoFormerv2Embeddings(config) + encoder_layer = TransformerEncoderLayerWithRotary( + d_model=config.hidden_size, + nhead=config.num_attention_heads, + dim_feedforward=config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + rotary_value=config.rotary_value, + max_position_embeddings=config.max_position_embeddings, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_hidden_states=False): + r""" + The RoFormerv2Model forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Currently, we only support 2D attention_mask. + Defaults to `None`, which means `pad_token_id` will be ignored. + output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `False`. + + Returns: + tuple: Returns `sequence_output` or `encoder_outputs`. 
+ + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `encoder_outputs` (List(Tensor)): + A list of Tensor containing hidden-states of the model at each hidden layer in the Transformer encoder. + The length of the list is `num_hidden_layers`. + Each Tensor has a data type of float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerv2Model, RoFormerv2Tokenizer + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + model = RoFormerv2Model.from_pretrained('roformer_v2_chinese_char_base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + output = model(**inputs) + """ + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4, axis=[1, 2] + ) + else: + if attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + else: + raise ValueError("Currently we only support 2D attention_mask.") + + attention_mask.stop_gradient = True + + embedding_output = self.embeddings(input_ids=input_ids, token_type_ids=token_type_ids) + + if output_hidden_states: + output = embedding_output + encoder_outputs = [] + for mod in self.encoder.layers: + output = mod(output, src_mask=attention_mask) + encoder_outputs.append(output) + if self.encoder.norm is not None: + encoder_outputs[-1] = self.encoder.norm(encoder_outputs[-1]) + else: + sequence_output = self.encoder(embedding_output, attention_mask) + + outputs = encoder_outputs if output_hidden_states else sequence_output + + return outputs + + def get_input_embeddings(self) -> nn.Embedding: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, embedding: nn.Embedding): + self.embeddings.word_embeddings = embedding + + +class RoFormerv2ForQuestionAnswering(RoFormerv2PretrainedModel): + """ + RoFormerv2 with a linear layer on top of the hidden-states output to compute `span_start_logits` + and `span_end_logits`, designed for question-answering tasks like SQuAD. + + Args: + roformerv2 (:class:`RoFormerv2Model`): + An instance of RoFormerv2Model. + dropout (float, optional): + The dropout probability for output of RoFormerv2. + If None, use the same value as `hidden_dropout_prob` of `RoFormerv2Model` + instance `roformerv2`. Defaults to `None`. + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2ForQuestionAnswering, self).__init__(config) + self.roformerv2 = RoFormerv2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + The RoFormerv2ForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerv2Model`. + token_type_ids (Tensor, optional): + See :class:`RoFormerv2Model`. + attention_mask (Tensor, optional): + See :class:`RoFormerv2Model`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). 
+ + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerv2ForQuestionAnswering, RoFormerv2Tokenizer + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + model = RoFormerv2ForQuestionAnswering.from_pretrained('roformer_v2_chinese_char_base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + """ + sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + logits = self.classifier(sequence_output) + start_logits, end_logits = paddle.unstack(logits, axis=-1) + + return start_logits, end_logits + + +class RoFormerv2ForSequenceClassification(RoFormerv2PretrainedModel): + """ + RoFormerv2 Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + roformerv2 (`RoFormerv2Model`): + An instance of `paddlenlp.transformers.RoFormerv2Model`. + num_labels (int, optional): + The number of classes. Default to `2`. + dropout (float, optional): + The dropout probability for output of RoFormerv2. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.RoFormerv2Model` instance. Defaults to `None`. + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2ForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + self.roformerv2 = RoFormerv2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`RoFormerv2Model`. + token_type_ids (Tensor, optional): + See :class:`RoFormerv2Model`. + attention_mask (Tensor, optional): + See :class:`RoFormerv2Model`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_labels]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerv2ForSequenceClassification, RoFormerv2Tokenizer + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + model = RoFormerv2ForSequenceClassification.from_pretrained('roformer_v2_chinese_char_base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + pooled_output = sequence_output[:, 0] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return logits + + +class RoFormerv2ForTokenClassification(RoFormerv2PretrainedModel): + """ + RoFormerv2 Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. 
+ + Args: + roformerv2 (`RoFormerv2Model`): + An instance of `paddlenlp.transformers.RoFormerv2Model`. + num_labels (int, optional): + The number of classes. Default to `2`. + dropout (float, optional): + The dropout probability for output of RoFormerv2. + If None, use the same value as `hidden_dropout_prob` + of `paddlenlp.transformers.RoFormerv2Model` instance. Defaults to `None`. + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2ForTokenClassification, self).__init__(config) + self.num_labels = config.num_labels + self.roformerv2 = RoFormerv2Model(config) # allow roformerv2 to be config + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + Args: + input_ids (Tensor): + See :class:`RoFormerv2Model`. + token_type_ids (Tensor, optional): + See :class:`RoFormerv2Model`. + attention_mask (Tensor, optional): + See :class:`RoFormerv2Model`. + + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_labels]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerv2ForTokenClassification, RoFormerv2Tokenizer + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + model = RoFormerv2ForTokenClassification.from_pretrained('roformer_v2_chinese_char_base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits + + +class RoFormerv2ForMultipleChoice(RoFormerv2PretrainedModel): + """ + RoFormerv2 Model with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + roformerv2 (:class:`RoFormerv2Model`): + An instance of RoFormerv2Model. + num_choices (int, optional): + The number of choices. Defaults to `2`. + dropout (float, optional): + The dropout probability for output of RoFormerv2. + If None, use the same value as `hidden_dropout_prob` of `RoFormerv2Model` + instance `roformerv2`. Defaults to None. + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2ForMultipleChoice, self).__init__(config) + self.num_choices = config.num_choices + self.roformerv2 = RoFormerv2Model(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + The RoFormerv2ForMultipleChoice forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`RoFormerv2Model` and shape as [batch_size, num_choice, sequence_length]. + token_type_ids(Tensor, optional): + See :class:`RoFormerv2Model` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (list, optional): + See :class:`RoFormerv2Model` and shape as [batch_size, num_choice, sequence_length]. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import RoFormerv2ForMultipleChoice, RoFormerv2Tokenizer + from paddlenlp.data import Pad + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + model = RoFormerv2ForMultipleChoice.from_pretrained('roformer_v2_chinese_char_base', num_choices=2) + + data = [ + { + "question": "如何打开ipad屏幕?", + "answer1": "按音量按钮。", + "answer2": "按下锁定按钮。", + "label": 1, + }, + { + "question": "如何缩进一些文本?", + "answer1": "在开始写之前留一些空格。", + "answer2": "按空格键。", + "label": 0, + }, + ] + + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair) + input_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(inputs["input_ids"]) + token_type_ids = Pad(axis=0, pad_val=tokenizer.pad_token_type_id)(inputs["token_type_ids"]) + + reshaped_logits = model( + input_ids=paddle.to_tensor(input_ids, dtype="int64"), + token_type_ids=paddle.to_tensor(token_type_ids, dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + + """ + # input_ids: [bs, num_choice, seq_l] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + pooled_output = sequence_output[:, 0] + pooled_output = self.dropout(pooled_output) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + return reshaped_logits + + +class RoFormerv2LMPredictionHead(Layer): + def __init__(self, config: RoFormerv2Config, embedding_weights=None): + super(RoFormerv2LMPredictionHead, self).__init__() + self.use_bias = config.use_bias + self.decoder_weight = ( + self.create_parameter(shape=[config.vocab_size, config.hidden_size], dtype=self.transform.weight.dtype) + if embedding_weights is None + else embedding_weights + ) + if config.use_bias: + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward(self, hidden_states): + hidden_states = paddle.matmul(hidden_states, self.decoder_weight, transpose_y=True) + if self.use_bias: + hidden_states = hidden_states + self.decoder_bias + + return hidden_states + + +class RoFormerv2ForMaskedLM(RoFormerv2PretrainedModel): + """ + RoFormerv2 Model with a `masked language modeling` head on top. + + Args: + roformerv2 (:class:`RoFormerv2Model`): + An instance of :class:`RoFormerv2Model`. + + """ + + def __init__(self, config: RoFormerv2Config): + super(RoFormerv2ForMaskedLM, self).__init__(config) + self.roformerv2 = RoFormerv2Model(config) + self.cls = RoFormerv2LMPredictionHead( + config, embedding_weights=self.roformerv2.embeddings.word_embeddings.weight + ) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None): + r""" + + Args: + input_ids (Tensor): + See :class:`RoFormerv2Model`. + token_type_ids (Tensor, optional): + See :class:`RoFormerv2Model`. + attention_mask (Tensor, optional): + See :class:`RoFormerv2Model`. 
+ + Returns: + Tensor: Returns tensor `prediction_scores`, The scores of masked token prediction. + Its data type should be float32 and shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import RoFormerv2ForMaskedLM, RoFormerv2Tokenizer + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + model = RoFormerv2ForMaskedLM.from_pretrained('roformer_v2_chinese_char_base') + + inputs = tokenizer("欢迎使用百度飞桨!") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + + logits = model(**inputs) + print(logits.shape) + # [1, 11, 12000] + + """ + sequence_output = self.roformerv2(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) + + prediction_scores = self.cls(sequence_output) + return prediction_scores diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/tokenizer.py new file mode 100644 index 000000000..92266df53 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/roformerv2/tokenizer.py @@ -0,0 +1,306 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from ..bert.tokenizer import BasicTokenizer, WordpieceTokenizer +from ..tokenizer_utils import PretrainedTokenizer + +__all__ = ["RoFormerv2Tokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "roformer_v2_chinese_char_small": 512, + "roformer_v2_chinese_char_base": 512, + "roformer_v2_chinese_char_large": 512, +} + + +class RoFormerv2Tokenizer(PretrainedTokenizer): + """ + Constructs a RoFormerv2 tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (bool,optional): + Whether or not to lowercase the input when tokenizing. + If you use the RoFormerv2 pretrained model, lower is set to + False when using the cased model, otherwise it is set to True. + Defaults to`True`. + unk_token (str,optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str,optional): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". 
+ pad_token (str,optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str,optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str,optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerv2Tokenizer + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + + tokens = tokenizer('欢迎使用百度飞桨') + ''' + {'input_ids': [101, 3223, 6500, 421, 4179, 4331, 2008, 7263, 3055, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "roformer_v2_chinese_char_small": "https://bj.bcebos.com/paddlenlp/models/transformers/roformerv2/roformer_v2_chinese_char_small/vocab.txt", + "roformer_v2_chinese_char_base": "https://bj.bcebos.com/paddlenlp/models/transformers/roformerv2/roformer_v2_chinese_char_base/vocab.txt", + "roformer_v2_chinese_char_large": "https://bj.bcebos.com/paddlenlp/models/transformers/roformerv2/roformer_v2_chinese_char_large/vocab.txt", + } + } + pretrained_init_configuration = { + "roformer_v2_chinese_char_small": {"do_lower_case": True}, + "roformer_v2_chinese_char_base": {"do_lower_case": True}, + "roformer_v2_chinese_char_large": {"do_lower_case": True}, + } + + # TODO(wj-Mcat): to be confirmed + max_model_input_sizes = { + "roformer_v2_chinese_char_small": 1024, + "roformer_v2_chinese_char_base": 1024, + "roformer_v2_chinese_char_large": 1024, + } + padding_side = "right" + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = RoFormerv2Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.do_lower_case = do_lower_case + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + + return len(self.vocab) + + def _tokenize(self, text): + """ + End-to-end tokenization for RoFormerv2 models. + + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also removes + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. 
+ + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerv2Tokenizer + + tokenizer = RoFormerv2Tokenizer.from_pretrained('roformer_v2_chinese_char_base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨!') + ''' + ['欢', '迎', '使', '用', '百', '度', '飞', '桨', '!'] + ''' + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + '欢 迎 使 用 百 度 飞 桨 !' + ''' + """ + + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A RoFormerv2 sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + A RoFormerv2 offset_mapping has the following format: + + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: ``(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of wordpiece offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None. + + Returns: + List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + A RoFormerv2 sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). 
+ """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. Defaults to None. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self): + return dict(self.vocab._token_to_idx, **self.added_tokens_encoder) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/__init__.py new file mode 100644 index 000000000..fb888a96e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 Technology Innovation Institute (TII) and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/configuration.py new file mode 100644 index 000000000..7c6e91ec9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/configuration.py @@ -0,0 +1,84 @@ +# Copyright (c) 2023 Technology Innovation Institute (TII) and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from ..configuration_utils import PretrainedConfig + +RW_PRETRAINED_INIT_CONFIGURATION = { + "model_state": { + "tiiuae/falcon-7b": "https://bj.bcebos.com/paddlenlp/models/community/tiiuae/falcon-7b/model_state.pdparams", + "tiiuae/falcon-7b-instruct": "https://bj.bcebos.com/paddlenlp/models/community/tiiuae/falcon-7b-instruct/model_state.pdparams", + "OpenBuddy/openbuddy-falcon-7b-v5-fp16": "https://bj.bcebos.com/paddlenlp/models/community/OpenBuddy/openbuddy-falcon-7b-v5-fp16/model_state.pdparams", + }, +} + + +class RWConfig(PretrainedConfig): + model_type = "RefinedWeb" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + } + + pretrained_init_configuration = RW_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size=250880, + hidden_size=64, + n_layer=2, + n_head=8, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + multi_query=False, + n_head_kv=None, + bias=False, + alibi=False, + parallel_attn=False, + **kwargs, + ): + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.multi_query = multi_query + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.n_head_kv = n_head if n_head_kv is None else n_head_kv + self.alibi = alibi + self.bias = bias + self.parallel_attn = parallel_attn + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + @property + def head_dim(self): + return self.hidden_size // self.n_head + + @property + def rotary(self): + return not self.alibi diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/modeling.py new file mode 100644 index 000000000..264533078 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/modeling.py @@ -0,0 +1,894 @@ +# Copyright (c) 2023 Technology Innovation Institute (TII) and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import paddle +from paddle import Tensor, nn + +from ...utils.converter import StateDictNameMapping, init_name_mappings +from .. 
import PretrainedModel +from ..model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from .configuration import RW_PRETRAINED_INIT_CONFIGURATION, RWConfig + + +# rotary pos emb helpers (paddle.jit.script does not seem to support staticmethod...) +def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + return paddle.concat((-x2, x1), axis=x1.ndim - 1) # dim=-1 triggers a bug in paddle < 1.8.0 + + +class RotaryEmbedding(paddle.nn.Layer): + """Implementation of RotaryEmbedding from GPT-NeoX. + This implementation is design to operate on queries and keys that are compatible with + [batch_size, n_heads_per_partition, seq_len, head_dim] (e.g. MinGPTAttention format). + """ + + def __init__( + self, + head_dim: int, + base=10000, + ): + super().__init__() + # head_dim must be an even number + inv_freq = 1.0 / (base ** (paddle.arange(0, head_dim, 2).astype("float32") / head_dim)) + self.register_buffer("inv_freq", inv_freq, persistable=False) + self.head_dim = head_dim + self.seq_len_cached = None + self.batch_size_cached = None + self.cos_cached: Tensor | None = None + self.sin_cached: Tensor | None = None + + def cos_sin( + self, + seq_len: int, + dtype=paddle.bfloat16, + ) -> Tensor: + if seq_len != self.seq_len_cached: + self.seq_len_cached = seq_len + t = paddle.arange(seq_len, dtype=self.inv_freq.dtype) + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + emb = paddle.concat((freqs, freqs), axis=-1) + + if dtype in [paddle.float16, paddle.bfloat16]: + emb = paddle.cast(emb, dtype) + + self.cos_cached = emb.cos()[None, :, :] + self.sin_cached = emb.sin()[None, :, :] + + self.cos_cached = paddle.cast(self.cos_cached, dtype) + self.sin_cached = paddle.cast(self.sin_cached, dtype) + + return self.cos_cached, self.sin_cached + + def forward(self, q, k): + batch, seq_len, head_dim = q.shape + cos, sin = self.cos_sin(seq_len, q.dtype) + return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) + + +def _make_causal_mask(input_ids_shape: paddle.shape, past_key_values_length: int): + batch_size, target_length = input_ids_shape + mask = paddle.empty((target_length, target_length + past_key_values_length), dtype=paddle.bool) + # ONNX doesn't support `Tensor.triu` properly, thus we use this workaround + seq_ids = paddle.arange(target_length) + mask[:, past_key_values_length:] = seq_ids[:, None] < seq_ids[None, :] + + if past_key_values_length > 0: + mask[:, :past_key_values_length] = False + + expanded_mask = mask[None, None, :, :].expand( + shape=(batch_size, 1, target_length, target_length + past_key_values_length) + ) + return expanded_mask + + +def _expand_mask(mask: Tensor, tgt_length: int): + batch_size, src_length = mask.shape + tgt_length = tgt_length if tgt_length is not None else src_length + + expanded_mask = ~(paddle.cast(mask[:, None, None, :], "bool")) + return expanded_mask.expand(shape=(batch_size, 1, tgt_length, src_length)) + + +def build_alibi_tensor(attention_mask: Tensor, num_heads: int, dtype: paddle.dtype) -> Tensor: + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + + base = paddle.to_tensor(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=paddle.float32) + powers = paddle.arange(1, 1 + closest_power_of_2, dtype=paddle.float32) + slopes = paddle.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = Tensor(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=paddle.float32) + 
num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = paddle.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=paddle.int32) + slopes = paddle.concat([slopes, paddle.pow(extra_base, extra_powers)], axis=0) + + arange_tensor = ((attention_mask.cumsum(axis=-1) - 1) * attention_mask)[:, None, :] + alibi = paddle.cast(slopes[..., None], "bfloat16") * arange_tensor + return paddle.cast(alibi.reshape([batch_size * num_heads, 1, seq_length]), dtype) + + +def dropout_add(x: Tensor, residual: Tensor, prob: float, training: bool) -> Tensor: + out = nn.functional.dropout(x, p=prob, training=training) + out = residual + out + return out + + +class Attention(nn.Layer): + def __init__(self, config: RWConfig): + super().__init__() + + self.hidden_size = config.hidden_size + self.num_heads = config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." + ) + + self.maybe_rotary = RotaryEmbedding(config.head_dim) if config.rotary else lambda q, k: (q, k) + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = self.inv_norm_factor + + self.query_key_value = nn.Linear( + self.hidden_size, + 3 * self.hidden_size if not config.multi_query else (self.hidden_size + 2 * self.head_dim), + bias_attr=config.bias, + ) + self.multi_query = config.multi_query + self.dense = nn.Linear(self.hidden_size, self.hidden_size, bias_attr=config.bias) + self.attention_dropout = nn.Dropout(config.attention_dropout) + self.num_kv = config.n_head if not self.multi_query else 1 + + def _split_heads(self, fused_qkv: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + """ + Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory + storage as `fused_qkv` + + Args: + fused_qkv (`Tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] + value: [batch_size, seq_length, num_heads, head_dim] + """ + if not self.multi_query: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.reshape([batch_size, seq_length, self.num_heads, 3, self.head_dim]) + return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + else: + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.reshape([batch_size, seq_length, self.num_heads + 2, self.head_dim]) + return fused_qkv[..., :-2, :], fused_qkv[..., -2, :].unsqueeze(-2), fused_qkv[..., -1, :].unsqueeze(-2) + + def _merge_heads(self, x: Tensor) -> Tensor: + """ + Merge heads together over the last dimenstion + + Args: + x: (`Tensor`, *required*): [batch_size * num_heads, seq_length, head_dim] + + Returns: + Tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // self.num_heads + + # First reshape to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.reshape([batch_size, 
self.num_heads, seq_length, self.head_dim]) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.transpose([0, 2, 1, 3]) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape([batch_size, seq_length, self.num_heads * self.head_dim]) + + def forward( + self, + hidden_states: Tensor, + alibi: Tensor, + attention_mask: Tensor, + layer_past: Optional[Tuple[Tensor, Tensor]] = None, + head_mask: Optional[Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + i: int = 0, + ): + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + + batch_size, q_length, _, _ = query_layer.shape + + # [batch_size, seq_length, num_heads, head_dim] + query_layer = query_layer.transpose([0, 2, 1, 3]).reshape( + [batch_size * self.num_heads, q_length, self.head_dim] + ) + key_layer = key_layer.transpose([0, 2, 1, 3]).reshape( + [ + batch_size * self.num_kv, + q_length, + self.head_dim, + ] + ) + value_layer = value_layer.transpose([0, 2, 1, 3]).reshape([batch_size * self.num_kv, q_length, self.head_dim]) + + query_layer, key_layer = self.maybe_rotary(query_layer, key_layer) + + if layer_past is not None: + past_key, past_value = layer_past + # concatenate along seq_length dimension: + # - key: [batch_size * self.num_heads, head_dim, kv_length] + # - value: [batch_size * self.num_heads, kv_length, head_dim] + key_layer = paddle.concat((past_key, key_layer), axis=1) + value_layer = paddle.concat((past_value, value_layer), axis=1) + + # if use layer_past, kv_length != q_length + _, kv_length, _ = key_layer.shape + + if use_cache is True: + present = (key_layer, value_layer) + else: + present = None + + if alibi is None: + query_layer_ = query_layer.reshape([batch_size, self.num_heads, q_length, self.head_dim]) + key_layer_ = key_layer.reshape([batch_size, self.num_kv, kv_length, self.head_dim]) + value_layer_ = value_layer.reshape([batch_size, self.num_kv, kv_length, self.head_dim]) + + attn_output = query_layer_ @ key_layer_.transpose([0, 1, 3, 2]) + attention_scores = attn_output.reshape([batch_size, self.num_heads, q_length, kv_length]) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == paddle.float16 or input_dtype == paddle.bfloat16: + attention_scores = paddle.cast(attention_scores, paddle.float32) + attention_scores = paddle.where( + attention_mask > 0, + paddle.full_like(attention_scores, paddle.finfo(attention_scores.dtype).min), + attention_scores, + ) + attention_probs = nn.functional.softmax( + attention_scores * self.inv_norm_factor, + axis=-1, + dtype=hidden_states.dtype, + ) + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # matmul: [batch_size, num_heads, q_length, head_dim] + context_layer = attention_probs @ value_layer_ + + # change reshape [batch_size , q_length, num_heads * head_dim] + context_layer = context_layer.transpose([0, 2, 1, 3]) + context_layer = context_layer.reshape([batch_size, q_length, 
-1]) + + output_tensor = self.dense(context_layer) + + outputs = (output_tensor, present) + if output_attentions: + outputs += (attention_probs,) + + return outputs + else: + query_layer_ = query_layer.reshape([batch_size, self.num_heads, q_length, self.head_dim]) + key_layer_ = key_layer.reshape([batch_size, self.num_kv, kv_length, self.head_dim]) + value_layer_ = value_layer.reshape([batch_size, self.num_kv, kv_length, self.head_dim]) + + alibi = alibi.reshape([batch_size, self.num_heads, 1, -1]) + + attention_scores = query_layer_ @ key_layer_.transpose([0, 1, 3, 2]) + + attention_mask_float = paddle.zeros_like(attention_mask, dtype=attention_scores.dtype) + attention_mask_float = paddle.where( + attention_mask > 0, + paddle.full_like(attention_scores, paddle.finfo(attention_scores.dtype).min), + attention_mask_float, + ) + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] + input_dtype = attention_scores.dtype + # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` + if input_dtype == paddle.float16 or input_dtype == paddle.bfloat16: + attention_scores = paddle.cast(attention_scores, paddle.float32) + # attn_weights = paddle.masked_fill(attention_scores, attention_mask, paddle.finfo(attention_scores.dtype).min) + attention_probs = nn.functional.softmax( + (attention_scores + alibi) * self.inv_norm_factor + attention_mask_float, + axis=-1, + dtype=hidden_states.dtype, + ) + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # matmul: [batch_size, num_heads, q_length, kv_length] * [batch_size, num_kv, kv_length, head_dim] + context_layer = attention_probs @ value_layer_ + + # change reshape [batch_size x num_heads, q_length, head_dim] + context_layer = context_layer.reshape([batch_size * self.num_heads, q_length, self.head_dim]) + + # change reshape [batch_size, num_heads, q_length, head_dim] + context_layer = self._merge_heads(context_layer) + + output_tensor = self.dense(context_layer) + + outputs = (output_tensor, present) + if output_attentions: + outputs += (attention_probs,) + + return outputs + + +class MLP(nn.Layer): + def __init__(self, config: RWConfig): + super().__init__() + hidden_size = config.hidden_size + + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size, bias_attr=config.bias) + self.act = nn.GELU() + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size, bias_attr=config.bias) + self.hidden_dropout = config.hidden_dropout + + def forward(self, x: Tensor) -> Tensor: + x = self.act(self.dense_h_to_4h(x)) + x = self.dense_4h_to_h(x) + return x + + +class DecoderLayer(nn.Layer): + def __init__(self, config: RWConfig): + super().__init__() + hidden_size = config.hidden_size + + self.input_layernorm = nn.LayerNorm(hidden_size, epsilon=config.layer_norm_epsilon) + self.num_heads = config.n_head + self.self_attention = Attention(config) + + if not config.parallel_attn: + # unused if parallel attn + self.post_attention_layernorm = nn.LayerNorm(hidden_size, epsilon=config.layer_norm_epsilon) + + self.mlp = MLP(config) + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.hidden_dropout = config.hidden_dropout + + self.config = config + + def forward( + self, + hidden_states: Tensor = None, + alibi: Tensor = None, + attention_mask: 
Tensor = None, + layer_past: Optional[Tuple[Tensor, Tensor]] = None, + head_mask: Optional[Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + i: int = 0, + ): + + layernorm_output = self.input_layernorm(hidden_states) + residual = hidden_states + + # Self attention. + attn_outputs = self.self_attention( + layernorm_output, + layer_past=layer_past, + attention_mask=attention_mask, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + i=i, + ) + + attention_output = attn_outputs[0] + + if not self.config.parallel_attn: + residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training) + layernorm_output = self.post_attention_layernorm(residual) + + outputs = attn_outputs[1:] + + # MLP. + mlp_output = self.mlp(layernorm_output) + + if self.config.parallel_attn: + mlp_output += attention_output + + output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, present, attentions + + +class RWPreTrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = RWConfig + base_model_prefix = "transformer" + + pretrained_init_configuration = RW_PRETRAINED_INIT_CONFIGURATION + + @classmethod + def _get_name_mappings(cls, config: RWConfig) -> List[StateDictNameMapping]: + mappings = [ + "word_embeddings.weight", + "ln_f.weight", + "ln_f.bias", + ] + + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [ + f"h.{layer_index}.input_layernorm.weight", + f"h.{layer_index}.input_layernorm.weight", + ], + [ + f"h.{layer_index}.input_layernorm.bias", + f"h.{layer_index}.input_layernorm.bias", + ], + [ + f"h.{layer_index}.self_attention.query_key_value.weight", + f"h.{layer_index}.self_attention.query_key_value.weight", + "transpose", + ], + [ + f"h.{layer_index}.self_attention.query_key_value.bias", + f"h.{layer_index}.self_attention.query_key_value.bias", + ], + [ + f"h.{layer_index}.self_attention.dense.weight", + f"h.{layer_index}.self_attention.dense.weight", + "transpose", + ], + [ + f"h.{layer_index}.self_attention.dense.bias", + f"h.{layer_index}.self_attention.dense.bias", + ], + [ + f"h.{layer_index}.mlp.dense_h_to_4h.weight", + f"h.{layer_index}.mlp.dense_h_to_4h.weight", + "transpose", + ], + [ + f"h.{layer_index}.mlp.dense_h_to_4h.bias", + f"h.{layer_index}.mlp.dense_h_to_4h.bias", + ], + [ + f"h.{layer_index}.mlp.dense_4h_to_h.weight", + f"h.{layer_index}.mlp.dense_4h_to_h.weight", + "transpose", + ], + [ + f"h.{layer_index}.mlp.dense_4h_to_h.bias", + f"h.{layer_index}.mlp.dense_4h_to_h.bias", + ], + ] + mappings.extend(layer_mappings) + + init_name_mappings(mappings) + # Other than RWModel, other architectures will prepend model prefix + if config.architectures is not None and "RWModel" not in config.architectures: + for mapping in mappings: + mapping[0] = "transformer." + mapping[0] + if len(mapping) > 1 and mapping[1] is not None: + mapping[1] = "transformer." 
+ mapping[1] + + if config.architectures is not None: + if "RWForCausalLM" in config.architectures: + mappings.extend( + [ + "lm_head.weight", + "lm_head.bias", + ] + ) + + init_name_mappings(mappings) + return [StateDictNameMapping(*mapping) for mapping in mappings] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, layer: nn.Layer): + """Initialize the weights.""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + layer.weight.set_value( + paddle.tensor.normal(mean=0.0, std=self.config.initializer_range, shape=layer.weight.shape) + ) + if getattr(layer, "bias", None) is not None: + layer.weight.set_value(paddle.zeros(shape=layer.weight.shape, dtype=paddle.get_default_dtype())) + + def _set_gradient_checkpointing(self, module: nn.Layer, value: bool = False): + if isinstance(module, RWModel): + module.gradient_checkpointing = value + + @staticmethod + def _convert_to_standard_cache( + past_key_value: Tuple[Tuple[Tensor, Tensor]], batch_size: int + ) -> Tuple[Tuple[Tensor, Tensor]]: + """ + Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, + num_heads, ...])) + """ + batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape + num_heads = batch_size_times_num_heads // batch_size + # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] + # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].reshape([batch_size, num_heads, head_dim, seq_length]), + layer_past[1].reshape([batch_size, num_heads, seq_length, head_dim]), + ) + for layer_past in past_key_value + ) + + @staticmethod + def _convert_to_rw_cache(past_key_value: Tuple[Tuple[Tensor, Tensor]]) -> Tuple[Tuple[Tensor, Tensor]]: + batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape + batch_size_times_num_heads = batch_size * num_heads + # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] + # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] + return tuple( + ( + layer_past[0].reshape([batch_size_times_num_heads, head_dim, seq_length]), + layer_past[1].reshape([batch_size_times_num_heads, seq_length, head_dim]), + ) + for layer_past in past_key_value + ) + + +class RWModel(RWPreTrainedModel): + def __init__(self, config: RWConfig): + super().__init__(config) + + self.embed_dim = config.hidden_size + self.num_heads = config.n_head + self.alibi = config.alibi + + # Embedding + LN Embedding + self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + + # Transformer blocks + self.h = nn.LayerList([DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + + # Final Layer Norm + self.ln_f = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.word_embeddings + + def _prepare_attn_mask(self, attention_mask: Tensor, input_shape: Tuple[int, int], past_key_values_length: int): + # create causal mask + # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] + combined_attention_mask = None + # device = attention_mask.device + _, src_length = input_shape + + if src_length > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + + # [batch_size, seq_length] -> [batch_size, 1, 
tgt_length, src_length] + expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask + ) + + return combined_attention_mask + + def set_input_embeddings(self, new_embeddings: Tensor): + self.word_embeddings = new_embeddings + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + axis = paddle.to_tensor([0, 1, 3, 4]) + head_mask = paddle.unsqueeze(head_mask, axis=axis) + head_mask = head_mask.expand(shape=(num_hidden_layers, -1, -1, -1, -1)) + elif head_mask.dim() == 2: + axis = paddle.to_tensor([1, 3, 4]) + head_mask = paddle.unsqueeze(head_mask, axis=axis) + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + + head_mask = paddle.cast(head_mask, dtype=self.config.dtype) + return head_mask + + def get_head_mask( + self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **deprecated_arguments, + ) -> Union[Tuple[Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. 
You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if past_key_values is None: + past_key_values = tuple([None] * len(self.h)) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + hidden_states = inputs_embeds + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # Compute alibi tensor: check build_alibi_tensor documentation + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values[0] is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + if attention_mask is None: + attention_mask = paddle.ones((batch_size, seq_length_with_past)) + + if self.alibi: + alibi = build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) + else: + alibi = None + + causal_mask = self._prepare_attn_mask( + attention_mask, + input_shape=(batch_size, seq_length), + past_key_values_length=past_key_values_length, + ) + + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + i=i, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class CausalLMHead(nn.Linear): + def forward(self, input: Tensor) -> Tensor: + ret = input @ self.weight.T + return ret + + +class RWForCausalLM(RWPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + 
def __init__(self, config: RWConfig): + super().__init__(config) + self.transformer = RWModel(config) + self.lm_head = CausalLMHead(config.vocab_size, config.hidden_size, bias_attr=False) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings: Tensor): + self.lm_head = new_embeddings + + def prepare_attention_mask_for_generation(self, input_ids, pad_token_id, eos_token_id): + is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(input_ids == pad_token_id).item() + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + attention_mask = (input_ids != pad_token_id).astype("int64") + else: + attention_mask = paddle.ones_like(input_ids, dtype="int64") + return attention_mask + + def prepare_inputs_for_generation( + self, + input_ids, + past: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + **kwargs, + ) -> dict: + # only last token for input_ids if past is not None + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + + # the cache may be in the stardard format (e.g. in contrastive search), convert to our's format if needed + if past[0][0].shape[0] == input_ids.shape[0]: + past = self._convert_to_rw_cache(past) + + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + + def forward( + self, + input_ids=None, + past_key_values: Optional[Tuple[Tuple[Tensor, Tensor], ...]] = None, + attention_mask: Optional[Tensor] = None, + head_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`paddle.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + loss = nn.functional.cross_entropy(lm_logits, labels) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple[Tensor]], beam_idx: Tensor) -> Tuple[Tuple[Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + return tuple(tuple(past_state.index_select(0, beam_idx) for past_state in layer_past) for layer_past in past) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/tokenizer.py new file mode 100644 index 000000000..dadf52a89 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/rw/tokenizer.py @@ -0,0 +1,96 @@ +# Copyright (c) 2023 Technology Innovation Institute (TII) and PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddlenlp.transformers.gpt.tokenizer import GPTTokenizer + +__all__ = ["RWTokenizer"] + + +class RWTokenizer(GPTTokenizer): + """ + Constructs a RWModel tokenizer based on byte-level Byte-Pair-Encoding. + + This tokenizer inherits from :class:`~paddlenlp.transformers.GPTTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + Path to the vocab file. + The vocab file contains a mapping from vocabulary strings to indices. + merges_file (str): + Path to the merge file. + The merge file is used to split the input sentence into "subword" units. + The vocab file is then used to encode those units as intices. + errors (str): + Paradigm to follow when decoding bytes to UTF-8. + Defaults to `'replace'`. 
+ max_len (int, optional): + The maximum value of the input sequence length. + Defaults to `None`. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RWTokenizer + + tokenizer = RWTokenizer.from_pretrained('tiiuae/falcon-7b') + print(tokenizer('Welcome to use PaddlePaddle and PaddleNLP')) + + ''' + {'input_ids': [11302, 271, 745, 337, 18849, 59, 18849, 273, 337, 18849, 57, 15549]} + ''' + + """ + + resource_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} # for save_pretrained + model_input_names = ["input_ids"] + + pretrained_resource_files_map = { + "vocab_file": { + "falcon-7b": "https://bj.bcebos.com/paddlenlp/models/community/tiiuae/falcon-7b/vocab.json", + "falcon-7b-instruct": "https://bj.bcebos.com/paddlenlp/models/community/tiiuae/falcon-7b-instruct/vocab.json", + "OpenBuddy/openbuddy-falcon-7b-v5-fp16": "https://bj.bcebos.com/paddlenlp/models/community/OpenBuddy/openbuddy-falcon-7b-v5-fp16/vocab.json", + }, + "merges_file": { + "falcon-7b": "https://bj.bcebos.com/paddlenlp/models/community/tiiuae/falcon-7b/merges.txt", + "falcon-7b-instruct": "https://bj.bcebos.com/paddlenlp/models/community/tiiuae/falcon-7b-instruct/merges.txt", + "OpenBuddy/openbuddy-falcon-7b-v5-fp16": "https://bj.bcebos.com/paddlenlp/models/community/OpenBuddy/openbuddy-falcon-7b-v5-fp16/merges.txt", + }, + } + padding_side = "right" + + def __init__(self, vocab_file, merges_file, **kwargs): # The token of newline. + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = RWTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + + self.spaces_between_special_tokens = kwargs.get("spaces_between_special_tokens", True) + super().__init__(vocab_file, merges_file, **kwargs) + + def decode( + self, token_ids, skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True, **kwargs + ) -> str: + return super(RWTokenizer, self).decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + spaces_between_special_tokens=self.spaces_between_special_tokens, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/segment_parallel_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/segment_parallel_utils.py new file mode 100644 index 000000000..67ce493b3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/segment_parallel_utils.py @@ -0,0 +1,137 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import paddle +import paddle.distributed as dist +from paddle.autograd import PyLayer +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.fleet import fleet + + +def split_inputs_sequence_dim(inputs, sep_rank=None, sep_degree=None): + if sep_degree is None and sep_rank is None: + _hcg = fleet.get_hybrid_communicate_group() + sep_degree = _hcg.get_sep_parallel_world_size() + sep_rank = _hcg.get_sep_parallel_rank() + assert isinstance(sep_degree, int) and isinstance( + sep_rank, int + ), f"sep_degree:{type(sep_degree)} and sep_rank:{type(sep_rank)} must be int" + if sep_degree <= 1: + return inputs + + def do_split_sequence_dim(data, sep_rank, sep_degree): + if data is None: + return None + assert isinstance(data, paddle.Tensor), f"data should be paddle.Tensor, but is type:{type(data)}" + assert len(data.shape) == 2, f"data dims should be 2, but shaped: {data.shape}" + sliced_data = paddle.split(data, num_or_sections=sep_degree, axis=-1)[sep_rank] + return sliced_data + + if isinstance(inputs, paddle.Tensor): + return do_split_sequence_dim(inputs, sep_rank, sep_degree) + elif isinstance(inputs, dict): + res = {} + for k, tensor in inputs.items(): + res[k] = do_split_sequence_dim(tensor, sep_rank, sep_degree) + elif isinstance(inputs, list): + res = [] + for tensor in inputs: + res.append(do_split_sequence_dim(tensor, sep_rank, sep_degree)) + raise ValueError(f"the inputs should be a list or a dict, but is type: {type(inputs)}") + return res + + +@paddle.no_grad() +def _reshard_qkv(x, group, split_axis=2, concat_axis=0): + # [s/sep, b, h] -> [s, b, h/sep] + # [s, b, h/sep] -> [s/sep, b, h] + group = _get_global_group() if group is None else group + nranks = dist.get_world_size(group=group) + shape = x.shape + + assert len(shape) == 3, "Only support 3D tensor, but got {}".format(len(shape)) + assert shape[split_axis] % nranks == 0, "Only support evenly split, but got {} % {} != 0".format(shape[2], nranks) + + comm_tensor_list = paddle.split(x, nranks, axis=split_axis) + output_list = [paddle.empty_like(comm_tensor_list[0]) for _ in comm_tensor_list] + dist.alltoall(comm_tensor_list, output_list, group=group) + reshard_tensor = paddle.concat(output_list, axis=concat_axis) + + return reshard_tensor + + +class ReshardQKV(PyLayer): + @staticmethod + def forward(ctx, x, group=None, split_axis=2, concat_axis=0): + ctx.group = _get_global_group() if group is None else group + ctx.split_axis = split_axis + ctx.concat_axis = concat_axis + res = _reshard_qkv(x, group, split_axis=ctx.split_axis, concat_axis=ctx.concat_axis) + return res + + @staticmethod + def backward(ctx, dy): + res = _reshard_qkv(dy, ctx.group, split_axis=ctx.concat_axis, concat_axis=ctx.split_axis) + return res + + +class ReshardLayer(paddle.nn.Layer): + def __init__(self, sep_group=None) -> None: + if sep_group is None: + _hcg = fleet.get_hybrid_communicate_group() + sep_group = _hcg.get_sep_parallel_group() if sep_group is None else sep_group + self.sep_group = sep_group + self.sep_degree = dist.get_world_size(group=self.sep_group) + super(ReshardLayer, self).__init__() + + 
def forward( + self, + x, + split_axis=1, + concat_axis=2, + ): + # if x dims==3, its shape can be [s/sep, b, h] or [b, s/sep, h], the output shape can be [s, b, h/sep] or [b, s, h/sep] + # if x dims==4, its shape can be [s, b, num_head/sep, head_dim] or [b, s, num_head/sep, head_dim], the output shape can be [s/sep, b, num_head, head_dim] or [b, s/sep, num_head, head_dim] + shape = x.shape + assert len(shape) == 3 or len(shape) == 4, "Only support 3D or 4D tensor" + if len(shape) == 4: + assert shape[split_axis] % self.sep_degree == 0 + shape[split_axis] = shape[split_axis] // self.sep_degree + shape[concat_axis] = shape[concat_axis] * self.sep_degree + + input_data = x + if len(shape) == 3: + reshard_tensor = ReshardQKV.apply( + input_data, self.sep_group, split_axis=split_axis, concat_axis=concat_axis + ) + else: + input_data = input_data.reshape([0, 0, -1]) + reshard_tensor = ReshardQKV.apply( + input_data, self.sep_group, split_axis=split_axis, concat_axis=concat_axis + ) + reshard_tensor.reshape_(shape) + return reshard_tensor diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/modeling.py new file mode 100644 index 000000000..c16808e21 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/semantic_search/modeling.py @@ -0,0 +1,311 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
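+
+"""
+Dual-encoder and cross-encoder wrappers around ERNIE for semantic search.
+
+A minimal usage sketch of ``ErnieDualEncoder.cosine_sim`` (the model names follow
+the class docstring below; the input sentences are only illustrations):
+
+.. code-block::
+
+    import paddle
+    from paddlenlp.transformers import ErnieDualEncoder, ErnieTokenizer
+
+    model = ErnieDualEncoder(
+        "rocketqa-zh-dureader-query-encoder", "rocketqa-zh-dureader-para-encoder"
+    )
+    tokenizer = ErnieTokenizer.from_pretrained("rocketqa-zh-dureader-query-encoder")
+
+    query = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+    title = tokenizer("PaddleNLP is an NLP library built on PaddlePaddle.")
+
+    # Cosine-style relevance score between the pooled query and title embeddings.
+    score = model.cosine_sim(
+        query_input_ids=paddle.to_tensor([query["input_ids"]]),
+        query_token_type_ids=paddle.to_tensor([query["token_type_ids"]]),
+        title_input_ids=paddle.to_tensor([title["input_ids"]]),
+        title_token_type_ids=paddle.to_tensor([title["token_type_ids"]]),
+    )
+"""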
+ +from __future__ import annotations + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ..ernie.configuration import ErnieConfig +from ..ernie.modeling import ErnieModel, ErniePretrainedModel + +__all__ = ["ErnieDualEncoder", "ErnieCrossEncoder", "ErnieEncoder"] + + +class ErnieEncoder(ErniePretrainedModel): + def __init__(self, config: ErnieConfig, output_emb_size: int | None = None): + super(ErnieEncoder, self).__init__(config) + + self.ernie = ErnieModel(config) + dropout = config.classifier_dropout if config.classifier_dropout is not None else 0.1 + + self.dropout = nn.Dropout(dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + # Compatible to ERNIE-Search for adding extra linear layer + if output_emb_size is not None and output_emb_size > 0: + weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02)) + self.emb_reduce_linear = paddle.nn.Linear(config.hidden_size, output_emb_size, weight_attr=weight_attr) + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + sequence_output, pool_output = self.ernie( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + return sequence_output, pool_output + + +class ErnieDualEncoder(nn.Layer): + """ + This class encapsulates two ErnieEncoder models into one model, so query + embedding and title embedding could be obtained using one model. And this + class allows two ErnieEncoder models to be trained at the same time. + + Example: + + .. code-block:: + + import paddle + from paddlenlp.transformers import ErnieDualEncoder, ErnieTokenizer + + model = ErnieDualEncoder("rocketqa-zh-dureader-query-encoder", "rocketqa-zh-dureader-para-encoder") + tokenizer = ErnieTokenizer.from_pretrained("rocketqa-zh-dureader-query-encoder") + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + + # Get query embedding + query_embedding = model.get_pooled_embedding(**inputs) + + # Get title embedding + title_embedding = model.get_pooled_embedding(**inputs, is_query=False) + + """ + + def __init__( + self, + query_model_name_or_path=None, + title_model_name_or_path=None, + share_parameters=False, + output_emb_size=None, + dropout=None, + reinitialize=False, + use_cross_batch=False, + ): + + super().__init__() + self.query_ernie, self.title_ernie = None, None + self.use_cross_batch = use_cross_batch + self.output_emb_size = output_emb_size + if query_model_name_or_path is not None: + self.query_ernie = ErnieEncoder.from_pretrained(query_model_name_or_path, output_emb_size=output_emb_size) + if share_parameters: + self.title_ernie = self.query_ernie + elif title_model_name_or_path is not None: + self.title_ernie = ErnieEncoder.from_pretrained(title_model_name_or_path, output_emb_size=output_emb_size) + assert (self.query_ernie is not None) or ( + self.title_ernie is not None + ), "At least one of query_ernie and title_ernie should not be None" + + # Compatible to rocketv2 initialization for setting layer._epsilon to 1e-5 + if reinitialize: + self.apply(self.init_epsilon_weights) + + def init_epsilon_weights(self, layer): + """Initialization hook""" + if isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-5 + + def get_semantic_embedding(self, data_loader): + self.eval() + with 
paddle.no_grad(): + for batch_data in data_loader: + input_ids, token_type_ids = batch_data + input_ids = paddle.to_tensor(input_ids) + token_type_ids = paddle.to_tensor(token_type_ids) + + text_embeddings = self.get_pooled_embedding(input_ids, token_type_ids=token_type_ids) + + yield text_embeddings + + def get_pooled_embedding( + self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, is_query=True + ): + """Get the first feature of each sequence for classification""" + assert (is_query and self.query_ernie is not None) or ( + not is_query and self.title_ernie + ), "Please check whether your parameter for `is_query` are consistent with DualEncoder initialization." + if is_query: + sequence_output, _ = self.query_ernie(input_ids, token_type_ids, position_ids, attention_mask) + if self.output_emb_size is not None and self.output_emb_size > 0: + cls_embedding = self.query_ernie.emb_reduce_linear(sequence_output[:, 0]) + else: + cls_embedding = sequence_output[:, 0] + + else: + sequence_output, _ = self.title_ernie(input_ids, token_type_ids, position_ids, attention_mask) + if self.output_emb_size is not None and self.output_emb_size > 0: + cls_embedding = self.title_ernie.emb_reduce_linear(sequence_output[:, 0]) + else: + cls_embedding = sequence_output[:, 0] + return cls_embedding + + def cosine_sim( + self, + query_input_ids, + title_input_ids, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + title_token_type_ids=None, + title_position_ids=None, + title_attention_mask=None, + ): + query_cls_embedding = self.get_pooled_embedding( + query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask + ) + + title_cls_embedding = self.get_pooled_embedding( + title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask, is_query=False + ) + + cosine_sim = paddle.sum(query_cls_embedding * title_cls_embedding, axis=-1) + return cosine_sim + + def forward( + self, + query_input_ids, + pos_title_input_ids, + neg_title_input_ids, + is_prediction=False, + query_token_type_ids=None, + query_position_ids=None, + query_attention_mask=None, + pos_title_token_type_ids=None, + pos_title_position_ids=None, + pos_title_attention_mask=None, + neg_title_token_type_ids=None, + neg_title_position_ids=None, + neg_title_attention_mask=None, + ): + query_cls_embedding = self.get_pooled_embedding( + query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask + ) + + pos_title_cls_embedding = self.get_pooled_embedding( + pos_title_input_ids, + pos_title_token_type_ids, + pos_title_position_ids, + pos_title_attention_mask, + is_query=False, + ) + + neg_title_cls_embedding = self.get_pooled_embedding( + neg_title_input_ids, + neg_title_token_type_ids, + neg_title_position_ids, + neg_title_attention_mask, + is_query=False, + ) + + all_title_cls_embedding = paddle.concat(x=[pos_title_cls_embedding, neg_title_cls_embedding], axis=0) + + if is_prediction: + logits = paddle.dot(query_cls_embedding, pos_title_cls_embedding) + outputs = {"probs": logits, "q_rep": query_cls_embedding, "p_rep": pos_title_cls_embedding} + return outputs + + if self.use_cross_batch: + tensor_list = [] + paddle.distributed.all_gather(tensor_list, all_title_cls_embedding) + all_title_cls_embedding = paddle.concat(x=tensor_list, axis=0) + + logits = paddle.matmul(query_cls_embedding, all_title_cls_embedding, transpose_y=True) + + batch_size = query_cls_embedding.shape[0] + + labels = paddle.arange(batch_size * self.rank * 2, batch_size * 
+        labels = paddle.reshape(labels, shape=[-1, 1])
+
+        accuracy = paddle.metric.accuracy(input=logits, label=labels)
+        loss = F.cross_entropy(input=logits, label=labels)
+        outputs = {"loss": loss, "accuracy": accuracy}
+
+        return outputs
+
+
+class ErnieCrossEncoder(nn.Layer):
+    """
+    Ernie-based cross-encoder that scores a query-title text pair with a single encoder.
+
+    Example:
+
+        .. code-block::
+
+            import paddle
+            from paddlenlp.transformers import ErnieCrossEncoder, ErnieTokenizer
+
+            model = ErnieCrossEncoder("rocketqa-zh-dureader-cross-encoder")
+            tokenizer = ErnieTokenizer.from_pretrained("rocketqa-zh-dureader-cross-encoder")
+
+            inputs = tokenizer("你们好", text_pair="你好")
+            inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+            # Get embedding of text pair.
+            embedding = model.matching(**inputs)
+
+    """
+
+    def __init__(self, pretrain_model_name_or_path, num_classes=2, reinitialize=False, dropout=None):
+        super().__init__()
+
+        self.ernie = ErnieEncoder.from_pretrained(pretrain_model_name_or_path, num_classes=num_classes)
+        # Compatible with RocketQAv2 initialization: set layer._epsilon to 1e-5
+        if reinitialize:
+            self.apply(self.init_epsilon_weights)
+
+    def init_epsilon_weights(self, layer):
+        """Initialization hook"""
+        if isinstance(layer, nn.LayerNorm):
+            layer._epsilon = 1e-5
+
+    def matching(
+        self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, return_prob_distributation=False
+    ):
+        """Use the pooled_output as the feature for pointwise prediction, e.g. RocketQAv1"""
+        _, pooled_output = self.ernie(
+            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask
+        )
+        pooled_output = self.ernie.dropout(pooled_output)
+        cls_embedding = self.ernie.classifier(pooled_output)
+        probs = F.softmax(cls_embedding, axis=1)
+        if return_prob_distributation:
+            return probs
+        return probs[:, 1]
+
+    def matching_v2(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
+        """Use the cls token embedding as the feature for listwise prediction, e.g. RocketQAv2"""
+        sequence_output, _ = self.ernie(
+            input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask
+        )
+        pooled_output = self.ernie.dropout(sequence_output[:, 0])
+        probs = self.ernie.classifier(pooled_output)
+        return probs
+
+    def matching_v3(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
+        """Use the pooled_output as the feature for listwise prediction, e.g. 
ERNIE-Search""" + sequence_output, pooled_output = self.ernie( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + pooled_output = self.ernie.dropout(pooled_output) + probs = self.ernie.classifier(pooled_output) + return probs + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, labels=None): + probs = self.matching( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + return_prob_distributation=True, + ) + if labels is not None: + accuracy = paddle.metric.accuracy(input=probs, label=labels) + loss = F.cross_entropy(input=probs, label=labels) + outputs = {"loss": loss, "accuracy": accuracy} + return outputs + else: + return probs[:, 1] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/sentencepiece_model_pb2.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/sentencepiece_model_pb2.py new file mode 100644 index 000000000..2502772a9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/sentencepiece_model_pb2.py @@ -0,0 +1,1534 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: sentencepiece_model.proto +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + +DESCRIPTOR = _descriptor.FileDescriptor( + name="sentencepiece_model.proto", + package="sentencepiece", + syntax="proto2", + serialized_options=b"H\003", + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece"\xdb\x0b\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18 \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03\x12\x17\n\teos_piece\x18/ \x01(\t:\x04\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03', +) + +_TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor( + name="ModelType", + full_name="sentencepiece.TrainerSpec.ModelType", + filename=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + values=[ + _descriptor.EnumValueDescriptor( + name="UNIGRAM", + index=0, + number=1, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="BPE", + index=1, + number=2, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="WORD", + index=2, + number=3, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="CHAR", + index=3, + number=4, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=1480, + serialized_end=1533, +) +_sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE) + +_MODELPROTO_SENTENCEPIECE_TYPE = _descriptor.EnumDescriptor( + name="Type", + full_name="sentencepiece.ModelProto.SentencePiece.Type", + filename=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + values=[ + _descriptor.EnumValueDescriptor( + name="NORMAL", + index=0, + number=1, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="UNKNOWN", + index=1, + number=2, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + 
_descriptor.EnumValueDescriptor( + name="CONTROL", + index=2, + number=3, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="USER_DEFINED", + index=3, + number=4, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="BYTE", + index=4, + number=6, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.EnumValueDescriptor( + name="UNUSED", + index=5, + number=5, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key, + ), + ], + containing_type=None, + serialized_options=None, + serialized_start=2286, + serialized_end=2370, +) +_sym_db.RegisterEnumDescriptor(_MODELPROTO_SENTENCEPIECE_TYPE) + +_TRAINERSPEC = _descriptor.Descriptor( + name="TrainerSpec", + full_name="sentencepiece.TrainerSpec", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="input", + full_name="sentencepiece.TrainerSpec.input", + index=0, + number=1, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="input_format", + full_name="sentencepiece.TrainerSpec.input_format", + index=1, + number=7, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="model_prefix", + full_name="sentencepiece.TrainerSpec.model_prefix", + index=2, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="model_type", + full_name="sentencepiece.TrainerSpec.model_type", + index=3, + number=3, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="vocab_size", + full_name="sentencepiece.TrainerSpec.vocab_size", + index=4, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=8000, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="accept_language", + full_name="sentencepiece.TrainerSpec.accept_language", + index=5, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + 
create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="self_test_sample_size", + full_name="sentencepiece.TrainerSpec.self_test_sample_size", + index=6, + number=6, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="enable_differential_privacy", + full_name="sentencepiece.TrainerSpec.enable_differential_privacy", + index=7, + number=50, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="differential_privacy_noise_level", + full_name="sentencepiece.TrainerSpec.differential_privacy_noise_level", + index=8, + number=51, + type=2, + cpp_type=6, + label=1, + has_default_value=True, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="differential_privacy_clipping_threshold", + full_name="sentencepiece.TrainerSpec.differential_privacy_clipping_threshold", + index=9, + number=52, + type=4, + cpp_type=4, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="character_coverage", + full_name="sentencepiece.TrainerSpec.character_coverage", + index=10, + number=10, + type=2, + cpp_type=6, + label=1, + has_default_value=True, + default_value=float(0.9995), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="input_sentence_size", + full_name="sentencepiece.TrainerSpec.input_sentence_size", + index=11, + number=11, + type=4, + cpp_type=4, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="shuffle_input_sentence", + full_name="sentencepiece.TrainerSpec.shuffle_input_sentence", + index=12, + number=19, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="mining_sentence_size", + full_name="sentencepiece.TrainerSpec.mining_sentence_size", + index=13, + number=12, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + 
serialized_options=b"\030\001", + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="training_sentence_size", + full_name="sentencepiece.TrainerSpec.training_sentence_size", + index=14, + number=13, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=b"\030\001", + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="seed_sentencepiece_size", + full_name="sentencepiece.TrainerSpec.seed_sentencepiece_size", + index=15, + number=14, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=1000000, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="shrinking_factor", + full_name="sentencepiece.TrainerSpec.shrinking_factor", + index=16, + number=15, + type=2, + cpp_type=6, + label=1, + has_default_value=True, + default_value=float(0.75), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="max_sentence_length", + full_name="sentencepiece.TrainerSpec.max_sentence_length", + index=17, + number=18, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=4192, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="num_threads", + full_name="sentencepiece.TrainerSpec.num_threads", + index=18, + number=16, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=16, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="num_sub_iterations", + full_name="sentencepiece.TrainerSpec.num_sub_iterations", + index=19, + number=17, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=2, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="max_sentencepiece_length", + full_name="sentencepiece.TrainerSpec.max_sentencepiece_length", + index=20, + number=20, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=16, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="split_by_unicode_script", + full_name="sentencepiece.TrainerSpec.split_by_unicode_script", + index=21, + number=21, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + 
file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="split_by_number", + full_name="sentencepiece.TrainerSpec.split_by_number", + index=22, + number=23, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="split_by_whitespace", + full_name="sentencepiece.TrainerSpec.split_by_whitespace", + index=23, + number=22, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="treat_whitespace_as_suffix", + full_name="sentencepiece.TrainerSpec.treat_whitespace_as_suffix", + index=24, + number=24, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="allow_whitespace_only_pieces", + full_name="sentencepiece.TrainerSpec.allow_whitespace_only_pieces", + index=25, + number=26, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="split_digits", + full_name="sentencepiece.TrainerSpec.split_digits", + index=26, + number=25, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="control_symbols", + full_name="sentencepiece.TrainerSpec.control_symbols", + index=27, + number=30, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="user_defined_symbols", + full_name="sentencepiece.TrainerSpec.user_defined_symbols", + index=28, + number=31, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="required_chars", + full_name="sentencepiece.TrainerSpec.required_chars", + index=29, + number=36, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + 
create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="byte_fallback", + full_name="sentencepiece.TrainerSpec.byte_fallback", + index=30, + number=35, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="vocabulary_output_piece_score", + full_name="sentencepiece.TrainerSpec.vocabulary_output_piece_score", + index=31, + number=32, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="hard_vocab_limit", + full_name="sentencepiece.TrainerSpec.hard_vocab_limit", + index=32, + number=33, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="use_all_vocab", + full_name="sentencepiece.TrainerSpec.use_all_vocab", + index=33, + number=34, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="unk_id", + full_name="sentencepiece.TrainerSpec.unk_id", + index=34, + number=40, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="bos_id", + full_name="sentencepiece.TrainerSpec.bos_id", + index=35, + number=41, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="eos_id", + full_name="sentencepiece.TrainerSpec.eos_id", + index=36, + number=42, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=2, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="pad_id", + full_name="sentencepiece.TrainerSpec.pad_id", + index=37, + number=43, + type=5, + cpp_type=1, + label=1, + has_default_value=True, + default_value=-1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="unk_piece", + full_name="sentencepiece.TrainerSpec.unk_piece", + index=38, + number=45, + 
type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="bos_piece", + full_name="sentencepiece.TrainerSpec.bos_piece", + index=39, + number=46, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="eos_piece", + full_name="sentencepiece.TrainerSpec.eos_piece", + index=40, + number=47, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="pad_piece", + full_name="sentencepiece.TrainerSpec.pad_piece", + index=41, + number=48, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="unk_surface", + full_name="sentencepiece.TrainerSpec.unk_surface", + index=42, + number=44, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=b" \342\201\207 ".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="train_extremely_large_corpus", + full_name="sentencepiece.TrainerSpec.train_extremely_large_corpus", + index=43, + number=49, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[ + _TRAINERSPEC_MODELTYPE, + ], + serialized_options=None, + is_extendable=True, + syntax="proto2", + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=45, + serialized_end=1544, +) + +_NORMALIZERSPEC = _descriptor.Descriptor( + name="NormalizerSpec", + full_name="sentencepiece.NormalizerSpec", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="name", + full_name="sentencepiece.NormalizerSpec.name", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="precompiled_charsmap", + full_name="sentencepiece.NormalizerSpec.precompiled_charsmap", + 
index=1, + number=2, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"", + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="add_dummy_prefix", + full_name="sentencepiece.NormalizerSpec.add_dummy_prefix", + index=2, + number=3, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="remove_extra_whitespaces", + full_name="sentencepiece.NormalizerSpec.remove_extra_whitespaces", + index=3, + number=4, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="escape_whitespaces", + full_name="sentencepiece.NormalizerSpec.escape_whitespaces", + index=4, + number=5, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=True, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="normalization_rule_tsv", + full_name="sentencepiece.NormalizerSpec.normalization_rule_tsv", + index=5, + number=6, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=True, + syntax="proto2", + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=1547, + serialized_end=1756, +) + +_SELFTESTDATA_SAMPLE = _descriptor.Descriptor( + name="Sample", + full_name="sentencepiece.SelfTestData.Sample", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="input", + full_name="sentencepiece.SelfTestData.Sample.input", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="expected", + full_name="sentencepiece.SelfTestData.Sample.expected", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto2", + 
extension_ranges=[], + oneofs=[], + serialized_start=1827, + serialized_end=1868, +) + +_SELFTESTDATA = _descriptor.Descriptor( + name="SelfTestData", + full_name="sentencepiece.SelfTestData", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="samples", + full_name="sentencepiece.SelfTestData.samples", + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[ + _SELFTESTDATA_SAMPLE, + ], + enum_types=[], + serialized_options=None, + is_extendable=True, + syntax="proto2", + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=1758, + serialized_end=1879, +) + +_MODELPROTO_SENTENCEPIECE = _descriptor.Descriptor( + name="SentencePiece", + full_name="sentencepiece.ModelProto.SentencePiece", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="piece", + full_name="sentencepiece.ModelProto.SentencePiece.piece", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="score", + full_name="sentencepiece.ModelProto.SentencePiece.score", + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="type", + full_name="sentencepiece.ModelProto.SentencePiece.type", + index=2, + number=3, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=1, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[ + _MODELPROTO_SENTENCEPIECE_TYPE, + ], + serialized_options=None, + is_extendable=True, + syntax="proto2", + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=2171, + serialized_end=2381, +) + +_MODELPROTO = _descriptor.Descriptor( + name="ModelProto", + full_name="sentencepiece.ModelProto", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="pieces", + full_name="sentencepiece.ModelProto.pieces", + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="trainer_spec", + full_name="sentencepiece.ModelProto.trainer_spec", + index=1, + number=2, + 
type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="normalizer_spec", + full_name="sentencepiece.ModelProto.normalizer_spec", + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="self_test_data", + full_name="sentencepiece.ModelProto.self_test_data", + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="denormalizer_spec", + full_name="sentencepiece.ModelProto.denormalizer_spec", + index=4, + number=5, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[ + _MODELPROTO_SENTENCEPIECE, + ], + enum_types=[], + serialized_options=None, + is_extendable=True, + syntax="proto2", + extension_ranges=[ + (200, 536870912), + ], + oneofs=[], + serialized_start=1882, + serialized_end=2392, +) + +_TRAINERSPEC.fields_by_name["model_type"].enum_type = _TRAINERSPEC_MODELTYPE +_TRAINERSPEC_MODELTYPE.containing_type = _TRAINERSPEC +_SELFTESTDATA_SAMPLE.containing_type = _SELFTESTDATA +_SELFTESTDATA.fields_by_name["samples"].message_type = _SELFTESTDATA_SAMPLE +_MODELPROTO_SENTENCEPIECE.fields_by_name["type"].enum_type = _MODELPROTO_SENTENCEPIECE_TYPE +_MODELPROTO_SENTENCEPIECE.containing_type = _MODELPROTO +_MODELPROTO_SENTENCEPIECE_TYPE.containing_type = _MODELPROTO_SENTENCEPIECE +_MODELPROTO.fields_by_name["pieces"].message_type = _MODELPROTO_SENTENCEPIECE +_MODELPROTO.fields_by_name["trainer_spec"].message_type = _TRAINERSPEC +_MODELPROTO.fields_by_name["normalizer_spec"].message_type = _NORMALIZERSPEC +_MODELPROTO.fields_by_name["self_test_data"].message_type = _SELFTESTDATA +_MODELPROTO.fields_by_name["denormalizer_spec"].message_type = _NORMALIZERSPEC +DESCRIPTOR.message_types_by_name["TrainerSpec"] = _TRAINERSPEC +DESCRIPTOR.message_types_by_name["NormalizerSpec"] = _NORMALIZERSPEC +DESCRIPTOR.message_types_by_name["SelfTestData"] = _SELFTESTDATA +DESCRIPTOR.message_types_by_name["ModelProto"] = _MODELPROTO +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +TrainerSpec = _reflection.GeneratedProtocolMessageType( + "TrainerSpec", + (_message.Message,), + { + "DESCRIPTOR": _TRAINERSPEC, + "__module__": "sentencepiece_model_pb2" + # @@protoc_insertion_point(class_scope:sentencepiece.TrainerSpec) + }, +) +_sym_db.RegisterMessage(TrainerSpec) + +NormalizerSpec = _reflection.GeneratedProtocolMessageType( + "NormalizerSpec", + (_message.Message,), + { + "DESCRIPTOR": _NORMALIZERSPEC, + "__module__": "sentencepiece_model_pb2" + # 
@@protoc_insertion_point(class_scope:sentencepiece.NormalizerSpec) + }, +) +_sym_db.RegisterMessage(NormalizerSpec) + +SelfTestData = _reflection.GeneratedProtocolMessageType( + "SelfTestData", + (_message.Message,), + { + "Sample": _reflection.GeneratedProtocolMessageType( + "Sample", + (_message.Message,), + { + "DESCRIPTOR": _SELFTESTDATA_SAMPLE, + "__module__": "sentencepiece_model_pb2" + # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData.Sample) + }, + ), + "DESCRIPTOR": _SELFTESTDATA, + "__module__": "sentencepiece_model_pb2" + # @@protoc_insertion_point(class_scope:sentencepiece.SelfTestData) + }, +) +_sym_db.RegisterMessage(SelfTestData) +_sym_db.RegisterMessage(SelfTestData.Sample) + +ModelProto = _reflection.GeneratedProtocolMessageType( + "ModelProto", + (_message.Message,), + { + "SentencePiece": _reflection.GeneratedProtocolMessageType( + "SentencePiece", + (_message.Message,), + { + "DESCRIPTOR": _MODELPROTO_SENTENCEPIECE, + "__module__": "sentencepiece_model_pb2" + # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto.SentencePiece) + }, + ), + "DESCRIPTOR": _MODELPROTO, + "__module__": "sentencepiece_model_pb2" + # @@protoc_insertion_point(class_scope:sentencepiece.ModelProto) + }, +) +_sym_db.RegisterMessage(ModelProto) +_sym_db.RegisterMessage(ModelProto.SentencePiece) + +DESCRIPTOR._options = None +_TRAINERSPEC.fields_by_name["mining_sentence_size"]._options = None +_TRAINERSPEC.fields_by_name["training_sentence_size"]._options = None +# @@protoc_insertion_point(module_scope) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/configuration.py new file mode 100644 index 000000000..f2cf0bec9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/configuration.py @@ -0,0 +1,143 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" SKEP model configuration """ +from __future__ import annotations + +from typing import Dict + +from ..configuration_utils import PretrainedConfig + +__all__ = ["SKEP_PRETRAINED_INIT_CONFIGURATION", "SKEP_PRETRAINED_RESOURCE_FILES_MAP", "SkepConfig"] + +SKEP_PRETRAINED_INIT_CONFIGURATION = { + "skep_ernie_1.0_large_ch": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 12800, + "pad_token_id": 0, + }, + "skep_ernie_2.0_large_en": { + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 4, + "vocab_size": 30522, + "pad_token_id": 0, + }, + "skep_roberta_large_en": { + "attention_probs_dropout_prob": 0.1, + "intermediate_size": 4096, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "max_position_embeddings": 514, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 0, + "vocab_size": 50265, + "pad_token_id": 1, + }, +} + +SKEP_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "skep_ernie_1.0_large_ch": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_ernie_1.0_large_ch.pdparams", + "skep_ernie_2.0_large_en": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_ernie_2.0_large_en.pdparams", + "skep_roberta_large_en": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_roberta_large_en.pdparams", + } +} + + +class SkepConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`SKEPModel`]. It is used to instantiate an SKEP Model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the SKEP skep_ernie_1.0_large_ch architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, optional, defaults to 12800): Vocabulary size of the SKEP model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`SKEPModel`]. + hidden_size (`int`, optional, defaults to 768): Dimensionality of the embedding layer, encoder layers and the pooler layer. + num_hidden_layers (int, optional, defaults to 12): Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, optional, defaults to 3072): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + hidden_act (`str`, optional, defaults to "relu"):The non-linear activation function in the encoder and pooler. "gelu", "relu" and any other paddle supported activation functions are supported. 
+ hidden_dropout_prob (`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, optional, defaults to 0.1): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + max_position_embeddings (`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, optional, defaults to 4): The vocabulary size of the *token_type_ids* passed into [`SKEPModel`]. + initializer_range (`float`, optional, defaults to 0.02): The standard deviation of the normal initializer. + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`SkepPretrainedModel.init_weights()` for how weights are initialized in [`SkepModel`]. + pad_token_id(int, optional, defaults to 0): The index of padding token in the token vocabulary. + Examples: + ```python + >>> from paddlenlp.transformers import SKEPModel, SkepConfig + >>> # Initializing an SKEP configuration + >>> configuration = SkepConfig() + >>> # Initializing a model (with random weights) from the SKEP-base style configuration model + >>> model = SKEPModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = SKEP_PRETRAINED_INIT_CONFIGURATION + model_type = "skep" + + def __init__( + self, + vocab_size=12800, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="relu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=4, + initializer_range=0.02, + pad_token_id=0, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.pad_token_id = pad_token_id diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/modeling.py new file mode 100644 index 000000000..ce4da4bff --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/modeling.py @@ -0,0 +1,760 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
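+
+# Descriptive module comment (added for readability): this file implements the
+# SKEP (Sentiment Knowledge Enhanced Pre-training) model family on PaddlePaddle,
+# i.e. the base SkepModel encoder plus heads for sequence classification, token
+# classification, and CRF-based token classification (see `__all__` below).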
+ +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +from paddle import Tensor + +from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss +from paddlenlp.utils.log import logger +from paddlenlp.utils.tools import compare_version + +if compare_version(paddle.version.full_version, "2.2.0") >= 0: + # paddle.text.ViterbiDecoder is supported by paddle after version 2.2.0 + from paddle.text import ViterbiDecoder +else: + from paddlenlp.layers.crf import ViterbiDecoder + +from .. import PretrainedModel, register_base_model +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from .configuration import ( + SKEP_PRETRAINED_INIT_CONFIGURATION, + SKEP_PRETRAINED_RESOURCE_FILES_MAP, + SkepConfig, +) + +__all__ = [ + "SkepModel", + "SkepPretrainedModel", + "SkepForSequenceClassification", + "SkepForTokenClassification", + "SkepCrfForTokenClassification", +] + + +class SkepEmbeddings(nn.Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config: SkepConfig): + super(SkepEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.type_vocab_size = config.type_vocab_size + if self.type_vocab_size != 0: + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values_length: Optional[int] = 0, + ): + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if position_ids is None: + input_shape = inputs_embeds.shape[:-1] + # maybe need use shape op to unify static graph and dynamic graph + ones = paddle.ones(input_shape, dtype="int64") + seq_length = paddle.cumsum(ones, axis=1) + position_ids = seq_length - ones + + if past_key_values_length > 0: + position_ids = position_ids + past_key_values_length + + position_ids.stop_gradient = True + + position_embeddings = self.position_embeddings(position_ids) + embeddings = inputs_embeds + position_embeddings + if self.type_vocab_size != 0: + if token_type_ids is None: + token_type_ids_shape = inputs_embeds.shape[:-1] + token_type_ids = paddle.zeros(token_type_ids_shape, dtype="int64") + token_type_embeddings = self.token_type_embeddings(token_type_ids) + embeddings += token_type_embeddings + elif token_type_ids is not None: + logger.warning( + "There is no need to pass the token type ids to SKEP based on RoBERTa model." + "The input token type ids will be ignored." + ) + + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class SkepPooler(nn.Layer): + """ + The pooling layer on skep model. + """ + + def __init__(self, config: SkepConfig): + super(SkepPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: Tensor): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
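+        # A linear projection followed by a tanh activation then produces the pooled output.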
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SkepPretrainedModel(PretrainedModel): + r""" + An abstract class for pretrained Skep models. It provides Skep related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + + """ + + config_class = SkepConfig + base_model_prefix = "skep" + + pretrained_init_configuration = SKEP_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = SKEP_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # only support dygraph, use truncated_normal and make it inplace + # and configurable later + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-5 + + +@register_base_model +class SkepModel(SkepPretrainedModel): + r""" + The bare SKEP Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + More details refer to `SKEP `. + + Args: + vocab_size (`int`, optional, defaults to 12800): Vocabulary size of the SKEP model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`SKEPModel`]. + hidden_size (`int`, optional, defaults to 768): Dimensionality of the embedding layer, encoder layers and the pooler layer. + num_hidden_layers (int, optional, defaults to 12): Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, optional, defaults to 3072): Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to ff layers are firstly projected from `hidden_size` to `intermediate_size`, and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + hidden_act (`str`, optional, defaults to "relu"):The non-linear activation function in the encoder and pooler. "gelu", "relu" and any other paddle supported activation functions are supported. + hidden_dropout_prob (`float`, optional, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings and encoder. + attention_probs_dropout_prob (`float`, optional, defaults to 0.1): The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + max_position_embeddings (`int`, optional, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, optional, defaults to 4): The vocabulary size of the *token_type_ids* passed into [`SKEPModel`]. 
+ initializer_range (`float`, optional, defaults to 0.02): The standard deviation of the normal initializer. + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`SkepPretrainedModel.init_weights()` for how weights are initialized in [`SkepModel`]. + pad_token_id(int, optional, defaults to 0): The index of padding token in the token vocabulary. + + """ + + def __init__(self, config: SkepConfig): + super(SkepModel, self).__init__(config) + self.initializer_range = config.initializer_range + self.embeddings = SkepEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + ) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + self.pooler = SkepPooler(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The SkepModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. 
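# Sketch of the additive attention mask described above (and built later in SkepModel.forward)
# for a 2D tokenizer-style mask where 1 = real token and 0 = padding; the -1e4 entries
# effectively block attention to padded positions after the softmax. Values are made up.
import paddle

attention_mask = paddle.to_tensor([[1, 1, 1, 0, 0]], dtype="float32")
attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2])   # [batch, 1, 1, seq_len]
attention_mask = (1.0 - attention_mask) * -1e4
attention_mask.stop_gradient = True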
+ inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. + past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (bool, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + if the result is tuple: Returns tuple (`sequence_output`, `pooled_output`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import SkepModel, SkepTokenizer + + tokenizer = SkepTokenizer.from_pretrained('skep_ernie_2.0_large_en') + model = SkepModel.from_pretrained('skep_ernie_2.0_large_en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP! 
") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + use_cache = use_cache if use_cache is not None else False + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids.astype("int64") == self.config.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, + axis=[1, 2], + ) + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + + # For 2D attention_mask from tokenizer + elif attention_mask.ndim == 2: + attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + else: + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self) -> nn.Embedding: + """get skep input word embedding + + Returns: + nn.Embedding: the input word embedding of skep mdoel + """ + return self.embeddings.word_embeddings + + def set_input_embeddings(self, embedding: nn.Embedding) -> nn.Embedding: + """set skep input embedding + + Returns: + nn.Embedding: the instance of new word embedding + """ + self.embeddings.word_embeddings = embedding + + +class SkepForSequenceClassification(SkepPretrainedModel): + r""" + SKEP Model with a linear layer on top of the pooled output, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`SkepConfig`): An instance of SkepConfig used to contruct SkepForSequenceClassification. 
+ """ + + def __init__(self, config: SkepConfig): + super(SkepForSequenceClassification, self).__init__(config) + self.skep = SkepModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The SkepForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`SkepModel`. + token_type_ids (Tensor, optional): + See :class:`SkepModel`. + position_ids (Tensor, `optional`): + See :class:`SkepModel`. + attention_mask (Tensor, optional): + See :class:`SkepModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + inputs_embeds(Tensor, optional): + See :class:`SkepModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import SkepForSequenceClassification, SkepTokenizer + + tokenizer = SkepTokenizer.from_pretrained('skep_ernie_2.0_large_en') + model = SkepForSequenceClassification.from_pretrained('skep_ernie_2.0_large_en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + outputs = self.skep( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class SkepForTokenClassification(SkepPretrainedModel): + r""" + SKEP Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`SkepConfig`): An instance of SkepConfig used to construct SkepForTokenClassification. + + """ + + def __init__(self, config: SkepConfig): + super(SkepForTokenClassification, self).__init__(config) + self.skep = SkepModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, self.num_labels) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The SkepForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`SkepModel`. + token_type_ids (Tensor, optional): + See :class:`SkepModel`. + position_ids (Tensor, optional): + See :class:`SkepModel`. + attention_mask (Tensor, optional): + See :class:`SkepModel`. 
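# Sketch of the loss selection above on hypothetical tensors: integer labels with num_labels > 1
# fall into the "single_label_classification" branch (cross entropy); float labels of shape
# [batch, num_labels] would use BCEWithLogitsLoss instead, and num_labels == 1 uses MSELoss.
import paddle

num_labels = 3
logits = paddle.randn([4, num_labels])
labels = paddle.to_tensor([0, 2, 1, 1], dtype="int64")
loss = paddle.nn.CrossEntropyLoss()(logits.reshape((-1, num_labels)), labels.reshape((-1,)))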
+ labels (Tensor of shape `(batch_size, sequence_length)`, optional): + Labels for computing the token classification loss. Indices should be in `[0, ..., num_labels - 1]`. + inputs_embeds(Tensor, optional): + See :class:`SkepModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import SkepForTokenClassification, SkepTokenizer + + tokenizer = SkepTokenizer.from_pretrained('skep_ernie_2.0_large_en') + model = SkepForTokenClassification.from_pretrained('skep_ernie_2.0_large_en') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + logits = model(**inputs) + + """ + outputs = self.skep( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else (output[0] if len(output) == 1 else output) + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class SkepCrfForTokenClassification(SkepPretrainedModel): + r""" + SKEPCRF Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`SkepConfig`): An instance of SkepConfig used to construct SkepCrfForTokenClassification. 
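# Hypothetical post-processing of the token-classification logits produced above: an argmax over
# the label dimension gives one predicted tag id per token (shapes here are assumptions).
import paddle

logits = paddle.randn([1, 6, 5])                 # [batch, seq_len, num_labels]
pred_tag_ids = paddle.argmax(logits, axis=-1)    # [1, 6]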
+ """ + + def __init__(self, config: SkepConfig): + super(SkepCrfForTokenClassification, self).__init__(config) + self.skep = SkepModel(config) + self.num_labels = config.num_labels + gru_hidden_size = 128 + + self.gru = nn.GRU(config.hidden_size, gru_hidden_size, num_layers=2, direction="bidirect") + self.fc = nn.Linear( + gru_hidden_size * 2, + self.num_labels, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(low=-0.1, high=0.1), + regularizer=paddle.regularizer.L2Decay(coeff=1e-4), + ), + ) + self.crf = LinearChainCrf(self.num_labels, crf_lr=0.2, with_start_stop_tag=False) + self.crf_loss = LinearChainCrfLoss(self.crf) + self.viterbi_decoder = ViterbiDecoder(self.crf.transitions, False) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + seq_lens: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The SkepCrfForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`SkepModel`. + token_type_ids (Tensor, optional): + See :class:`SkepModel`. + position_ids (Tensor, optional): + See :class:`SkepModel`. + attention_mask (Tensor, optional): + See :class:`SkepModel`. + seq_lens (Tensor, optional): + The input length tensor storing real length of each sequence for correctness. + Its data type should be int64 and its shape is `[batch_size]`. + Defaults to `None`. + labels (Tensor, optional): + The input label tensor. + Its data type should be int64 and its shape is `[batch_size, sequence_length]`. + inputs_embeds(Tensor, optional): + See :class:`SkepModel`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.TokenClassifierOutput`. + + if return_dict is False, Returns tensor `loss` if `labels` is not None. Otherwise, returns tensor `prediction`. + + - `loss` (Tensor): + The crf loss. Its data type is float32 and its shape is `[batch_size]`. + + - `prediction` (Tensor): + The prediction tensor containing the highest scoring tag indices. + Its data type is int64 and its shape is `[batch_size, sequence_length]`. 
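# Standalone sketch of the BiGRU -> linear emission head assembled in __init__ above, assuming
# hidden_size == 768 and num_labels == 7; the emission scores feed the CRF loss / Viterbi decoder.
import paddle
import paddle.nn as nn

gru = nn.GRU(768, 128, num_layers=2, direction="bidirect")
fc = nn.Linear(128 * 2, 7)

sequence_output = paddle.randn([2, 10, 768])     # assumed encoder output
bigru_output, _ = gru(sequence_output)           # [2, 10, 256]
emission = fc(bigru_output)                      # [2, 10, 7] emission scores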
+ + """ + outputs = self.skep( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + bigru_output, _ = self.gru(outputs[0]) + emission = self.fc(bigru_output) + + if seq_lens is None: + # compute seq length according to the attention mask + if attention_mask is not None: + seq_lens = paddle.sum(attention_mask, axis=1, dtype="int64") + else: + input_ids_shape = input_ids.shape + seq_lens = paddle.ones(shape=[input_ids_shape[0]], dtype="int64") * input_ids_shape[1] + + loss, prediction = None, None + if labels is not None: + loss = self.crf_loss(emission, seq_lens, labels) + else: + _, prediction = self.viterbi_decoder(emission, seq_lens) + + if not return_dict: + # when loss is None, return prediction + if labels is not None: + return loss if len(outputs[2:]) == 0 else (loss,) + outputs[2:] + return prediction if len(outputs[2:]) == 0 else (prediction,) + outputs[2:] + + return TokenClassifierOutput( + loss=loss, + logits=prediction, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/tokenizer.py new file mode 100644 index 000000000..8146670b9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/skep/tokenizer.py @@ -0,0 +1,588 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import shutil +from typing import Dict, List, Optional + +from paddle.utils import try_import + +from paddlenlp.transformers import ( + BasicTokenizer, + PretrainedTokenizer, + WordpieceTokenizer, +) + +__all__ = [ + "SkepTokenizer", +] + + +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(33, 126 + 1)) + list(range(161, 172 + 1)) + list(range(174, 255 + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. 
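# Sketch of the sequence-length fallback used in SkepCrfForTokenClassification.forward above:
# summing a 0/1 attention mask over the time axis recovers the real length of each sequence.
import paddle

attention_mask = paddle.to_tensor([[1, 1, 1, 0], [1, 1, 1, 1]])
seq_lens = paddle.sum(attention_mask, axis=1, dtype="int64")   # [3, 4]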
+ Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BpeEncoder(object): + """BpeEncoder""" + + def __init__(self, encoder_json_file, vocab_bpe_file, errors="replace", unk_token="<|endoftext|>", **kwargs): + """ + Constructs a BpeEncoder. + + Args: + encoder_json_file (`str`): The path to bpe encode json file. + vocab_bpe_file (`str`): The path to bpe vocab file. + errors (`str`): the error handler + unk_token (`str`): the unk token + """ + self.encoder = self.__get_encoder(encoder_json_file) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + self.bpe_ranks = self.__get_bpe_ranks(vocab_bpe_file) + self.unk_token = unk_token + self.cache = {} + re = try_import("regex") + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + def __get_encoder(self, encoder_json_file): + with open(encoder_json_file, "r") as f: + encoder = json.load(f) + return encoder + + def __get_bpe_ranks(self, vocab_bpe_file): + with open(vocab_bpe_file, "r", encoding="utf-8") as f: + bpe_data = f.read() + bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]] + bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + return bpe_ranks + + def bpe(self, token): + """ + bpe + """ + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except Exception: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def encode(self, text: str) -> List[int]: + """ + encode the text to token_ids + TODO(wj-Mcat): to be deprecated + """ + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def decode(self, tokens: List[str]) -> str: + """ + decode + TODO(wj-Mcat): to be deprecated + """ + text = "".join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def _tokenize(self, text: str) -> List[str]: + """tokenize text into tokens with bpe algo + + Args: + text (str): the content of text + + Returns: + List[str]: the sub token of text + """ + bpe_tokens = [] + re = try_import("regex") + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token: 
str) -> int: + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + +class SkepTokenizer(PretrainedTokenizer): + r""" + Constructs a Skep tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + bpe_vocab_file (str, optional): + The vocabulary file path of a `BpeTokenizer`. Defaults to `None`. + bpe_json_file (str, optional): + The json file path of a `BpeTokenizer`. Defaults to `None`. + use_bpe_encoder (bool, optional): + Whether or not to use BPE Encoder. Defaults to `False`. + need_token_type_id (bool, optional): + Whether or not to use token type id. Defaults to `True`. + add_two_sep_token_inter (bool, optional): + Whether or not to add two different `sep_token`. Defaults to `False`. + unk_token (str, optional): + The special token for unknown words. + Defaults to "[UNK]". + sep_token (str, optional): + The special token for separator token. + Defaults to "[SEP]". + pad_token (str, optional): + The special token for padding. + Defaults to "[PAD]". + cls_token (str, optional): + The special token for cls. + Defaults to "[CLS]". + mask_token (str, optional): + The special token for mask. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import SkepTokenizer + tokenizer = SkepTokenizer.from_pretrained('skep_ernie_2.0_large_en') + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs: + # { + # 'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0] + # } + """ + resource_files_names = { + "vocab_file": "vocab.txt", + "bpe_vocab_file": "vocab.bpe", + "bpe_json_file": "encoder.json", + } # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "skep_ernie_1.0_large_ch": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_ernie_1.0_large_ch.vocab.txt", + "skep_ernie_2.0_large_en": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_ernie_2.0_large_en.vocab.txt", + "skep_roberta_large_en": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_roberta_large_en.vocab.txt", + }, + "bpe_vocab_file": { + "skep_ernie_1.0_large_ch": None, + "skep_ernie_2.0_large_en": None, + "skep_roberta_large_en": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_roberta_large_en.vocab.bpe", + }, + "bpe_json_file": { + "skep_ernie_1.0_large_ch": None, + "skep_ernie_2.0_large_en": None, + "skep_roberta_large_en": "https://bj.bcebos.com/paddlenlp/models/transformers/skep/skep_roberta_large_en.encoder.json", + }, + } + max_model_input_sizes = { + "skep_ernie_1.0_large_ch": 512, + "skep_ernie_2.0_large_en": 512, + "skep_roberta_large_en": 514, + } + + pretrained_init_configuration = { + "skep_ernie_1.0_large_ch": { + "do_lower_case": True, + "use_bpe_encoder": False, + "need_token_type_id": True, + "add_two_sep_token_inter": False, + }, + "skep_ernie_2.0_large_en": { + "do_lower_case": True, + "use_bpe_encoder": False, + "need_token_type_id": True, + "add_two_sep_token_inter": False, + }, + "skep_roberta_large_en": { + "do_lower_case": True, + "use_bpe_encoder": True, + "need_token_type_id": False, + "add_two_sep_token_inter": True, + }, + } + + def __init__( + self, + vocab_file, + bpe_vocab_file=None, + bpe_json_file=None, + do_lower_case=True, + use_bpe_encoder=False, + need_token_type_id=True, + add_two_sep_token_inter=False, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = SkepTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab_file = vocab_file + self.bpe_vocab_file = bpe_vocab_file + self.bpe_json_file = bpe_json_file + self.vocab = self.load_vocabulary( + vocab_file, + unk_token=unk_token, + pad_token=pad_token, + bos_token=cls_token, + eos_token=sep_token, + mask_token=mask_token, + ) + + self.use_bpe_encoder = use_bpe_encoder + self.need_token_type_id = need_token_type_id + self.add_two_sep_token_inter = add_two_sep_token_inter + + if not self.use_bpe_encoder: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + else: + assert (bpe_vocab_file and bpe_json_file) is not None, "bpe_vocab_file and bpe_json_file must be not None." + if os.path.isfile(bpe_vocab_file) and os.path.isfile(bpe_json_file): + self.bpe_tokenizer = BpeEncoder(bpe_json_file, bpe_vocab_file, unk_token=unk_token) + + @property + def vocab_size(self): + r""" + Return the size of vocabulary. 
+ + Returns: + int: the size of vocabulary. + """ + return len(self.vocab) + + def _tokenize(self, text): + r""" + End-to-end tokenization for Skep models. + + Args: + text (str): The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + if not self.use_bpe_encoder: + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + for token in self.bpe_tokenizer._tokenize(text): + split_tokens.append(str(token)) + + return split_tokens + + def num_special_tokens_to_add(self, pair=False): + r""" + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair (bool, optional): + Returns the number of added tokens in the case of a sequence + pair if set to True, returns the number of added tokens in the case of a single sequence if set to False. + Defaults to False. + + Returns: + int: Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + A skep_ernie_1.0_large_ch/skep_ernie_2.0_large_en sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + A skep_roberta_large_en sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + list[int]: List of input_id with the appropriate special tokens. + """ + if not self.add_two_sep_token_inter: + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + else: + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + r""" + Create a mask from the two sequences passed to be used in a sequence-pair classification task. 
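# Hypothetical illustration of build_inputs_with_special_tokens above for the skep_ernie_*
# format, with made-up token ids and assumed cls/sep ids (101 and 102).
cls_id, sep_id = 101, 102
token_ids_0 = [7592, 2088]
token_ids_1 = [2023, 2003]
single = [cls_id] + token_ids_0 + [sep_id]                          # [CLS] A [SEP]
pair = [cls_id] + token_ids_0 + [sep_id] + token_ids_1 + [sep_id]   # [CLS] A [SEP] B [SEP]
# skep_roberta_large_en would insert an extra [SEP] between the two sequences.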
+ + A skep_ernie_1.0_large_ch/skep_ernie_2.0_large_en sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + note: There is no need token type ids for skep_roberta_large_ch model. + + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + if self.need_token_type_id: + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + else: + # For model skep-roberta-large-en, token type ids is no need. + return None + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to files under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + source_file = getattr(self, name, None) + if not source_file: + continue + + if os.path.abspath(source_file) != os.path.abspath(save_path): + shutil.copyfile(source_file, save_path) + + def convert_tokens_to_string(self, tokens: List[str]): + """ + Converts a sequence of tokens (list of string) in a single string. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import RoFormerTokenizer + + tokenizer = RoFormerTokenizer.from_pretrained('roformer-chinese-base') + tokens = tokenizer.tokenize('欢迎使用百度飞桨') + #['欢迎', '使用', '百度', '飞', '桨'] + strings = tokenizer.convert_tokens_to_string(tokens) + #'欢迎 使用 百度 飞 桨' + + """ + # to handle the bpe and wordpiece case + if hasattr(self, "wordpiece_tokenizer"): + return " ".join(tokens).replace(" ##", "").strip() + else: + return self.bpe_tokenizer.convert_tokens_to_string(tokens) + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (str) in an id using the vocab.""" + if self.use_bpe_encoder: + return self.bpe_tokenizer._convert_token_to_id(token) + + return super()._convert_token_to_id(token) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + if self.use_bpe_encoder: + return self.bpe_tokenizer._convert_id_to_token(index) + + return super()._convert_id_to_token(index) + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
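# Worked example of create_token_type_ids_from_sequences above with made-up lengths: the first
# sequence (plus [CLS] and its [SEP]) is segment 0, the second sequence (plus its [SEP]) is 1.
token_ids_0 = [11, 12, 13]
token_ids_1 = [21, 22]
token_type_ids = [0] * (1 + len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 1)
# -> [0, 0, 0, 0, 0, 1, 1, 1]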
+ """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def get_vocab(self) -> Dict[str, int]: + """ + Returns the vocabulary as a dictionary of token to index. + + `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the + vocab. + + Returns: + `Dict[str, int]`: The vocabulary. + """ + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/configuration.py new file mode 100644 index 000000000..a5f9033a1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/configuration.py @@ -0,0 +1,419 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" SpeechT5 model configuration""" + +import functools +import operator + +from ..configuration_utils import PretrainedConfig + + +class SpeechT5Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SpeechT5Model`]. It is used to instantiate a + SpeechT5 model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the SpeechT5 + [microsoft/speecht5_asr](https://huggingface.co/microsoft/speecht5_asr) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. 
Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 81): + Vocabulary size of the SpeechT5 model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed to the forward method of [`SpeechT5Model`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + encoder_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + encoder_ffn_dim (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer decoder. + decoder_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder. + decoder_layerdrop (`float`, *optional*, defaults to 0.1): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + positional_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the text position encoding layers. + hidden_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout ratio for activations inside the fully connected layer. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + scale_embedding (`bool`, *optional*, defaults to `False`): + Scale embeddings by diving by sqrt(d_model). + feat_extract_norm (`str`, *optional*, defaults to `"group"`): + The norm to be applied to 1D convolutional layers in the speech encoder pre-net. One of `"group"` for group + normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D + convolutional layers. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the speech encoder pre-net. + feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + The non-linear activation function (function or string) in the 1D convolutional layers of the feature + extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. 
+ conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): + A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the + speech encoder pre-net. The length of *conv_dim* defines the number of 1D convolutional layers. + conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): + A tuple of integers defining the stride of each 1D convolutional layer in the speech encoder pre-net. The + length of *conv_stride* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the speech encoder pre-net. + The length of *conv_kernel* defines the number of convolutional layers and has to match the length of + *conv_dim*. + conv_bias (`bool`, *optional*, defaults to `False`): + Whether the 1D convolutional layers have a bias. + num_conv_pos_embeddings (`int`, *optional*, defaults to 128): + Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional + embeddings layer. + num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): + Number of groups of 1D convolutional positional embeddings layer. + apply_spec_augment (`bool`, *optional*, defaults to `True`): + Whether to apply *SpecAugment* data augmentation to the outputs of the speech encoder pre-net. For + reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech + Recognition](https://arxiv.org/abs/1904.08779). + mask_time_prob (`float`, *optional*, defaults to 0.05): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If + reasoning from the propability of each feature vector to be chosen as the start of the vector span to be + masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the + actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. + mask_time_length (`int`, *optional*, defaults to 10): + Length of vector span along the time axis. + mask_time_min_masks (`int`, *optional*, defaults to 2),: + The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, + irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < + mask_time_min_masks'' + mask_feature_prob (`float`, *optional*, defaults to 0.0): + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over + the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap + may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is + True`. + mask_feature_length (`int`, *optional*, defaults to 10): + Length of vector span along the feature axis. 
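# Rough arithmetic for the SpecAugment description above, assuming mask_time_prob == 0.05,
# mask_time_length == 10 and a 1000-frame input: about 0.05 * 1000 / 10 = 5 independent
# time masks are generated (overlap can reduce the effective masked percentage).
mask_time_prob, mask_time_length, num_frames = 0.05, 10, 1000
approx_num_time_masks = int(mask_time_prob * num_frames / mask_time_length)   # 5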
+ mask_feature_min_masks (`int`, *optional*, defaults to 0),: + The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time + step, irrespectively of `mask_feature_prob`. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' + num_mel_bins (`int`, *optional*, defaults to 80): + Number of mel features used per input features. Used by the speech decoder pre-net. Should correspond to + the value used in the [`SpeechT5Processor`] class. + speech_decoder_prenet_layers (`int`, *optional*, defaults to 2): + Number of layers in the speech decoder pre-net. + speech_decoder_prenet_units (`int`, *optional*, defaults to 256): + Dimensionality of the layers in the speech decoder pre-net. + speech_decoder_prenet_dropout (`float`, *optional*, defaults to 0.5): + The dropout probability for the speech decoder pre-net layers. + speaker_embedding_dim (`int`, *optional*, defaults to 512): + Dimensionality of the *XVector* embedding vectors. + speech_decoder_postnet_layers (`int`, *optional*, defaults to 5): + Number of layers in the speech decoder post-net. + speech_decoder_postnet_units (`int`, *optional*, defaults to 256): + Dimensionality of the layers in the speech decoder post-net. + speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5): + Number of convolutional filter channels in the speech decoder post-net. + speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5): + The dropout probability for the speech decoder post-net layers. + reduction_factor (`int`, *optional*, defaults to 2): + Spectrogram length reduction factor for the speech decoder inputs. + max_speech_positions (`int`, *optional*, defaults to 4000): + The maximum sequence length of speech features that this model might ever be used with. + max_text_positions (`int`, *optional*, defaults to 450): + The maximum sequence length of text features that this model might ever be used with. + encoder_max_relative_position (`int`, *optional*, defaults to 160): + Maximum distance for relative position embedding in the encoder. + use_guided_attention_loss (`bool`, *optional*, defaults to `True`): + Whether to apply guided attention loss while training the TTS model. + guided_attention_loss_num_heads (`int`, *optional*, defaults to 2): + Number of attention heads the guided attention loss will be applied to. Use -1 to apply this loss to all + attention heads. + guided_attention_loss_sigma (`float`, *optional*, defaults to 0.4): + Standard deviation for guided attention loss. + guided_attention_loss_scale (`float`, *optional*, defaults to 10.0): + Scaling coefficient for guided attention loss (also known as lambda). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
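# Illustration of reduction_factor above under assumed values: with reduction_factor == 2 the
# speech decoder runs over spectrogram_length / 2 steps, emitting 2 mel frames per step.
num_mel_bins, reduction_factor, spectrogram_length = 80, 2, 500
decoder_steps = spectrogram_length // reduction_factor    # 250
values_per_step = num_mel_bins * reduction_factor         # 160 mel values predicted per step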
+ + Example: + + ```python + >>> from paddlenlp.transformers import SpeechT5Model, SpeechT5Config + + >>> # Initializing a "microsoft/speecht5_asr" style configuration + >>> configuration = SpeechT5Config() + + >>> # Initializing a model (with random weights) from the "microsoft/speecht5_asr" style configuration + >>> model = SpeechT5Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "speecht5" + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size=81, + hidden_size=768, + encoder_layers=12, + encoder_attention_heads=12, + encoder_ffn_dim=3072, + encoder_layerdrop=0.1, + decoder_layers=6, + decoder_ffn_dim=3072, + decoder_attention_heads=12, + decoder_layerdrop=0.1, + hidden_act="gelu", + positional_dropout=0.1, + hidden_dropout=0.1, + attention_dropout=0.1, + activation_dropout=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + scale_embedding=False, + feat_extract_norm="group", + feat_proj_dropout=0.0, + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=False, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + decoder_start_token_id=2, + num_mel_bins=80, + speech_decoder_prenet_layers=2, + speech_decoder_prenet_units=256, + speech_decoder_prenet_dropout=0.5, + speaker_embedding_dim=512, + speech_decoder_postnet_layers=5, + speech_decoder_postnet_units=256, + speech_decoder_postnet_kernel=5, + speech_decoder_postnet_dropout=0.5, + reduction_factor=2, + max_speech_positions=4000, + max_text_positions=450, + encoder_max_relative_position=160, + use_guided_attention_loss=True, + guided_attention_loss_num_heads=2, + guided_attention_loss_sigma=0.4, + guided_attention_loss_scale=10.0, + use_cache=True, + is_encoder_decoder=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_layers = encoder_layers + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_attention_heads = encoder_attention_heads + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layers = decoder_layers + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_attention_heads = decoder_attention_heads + self.decoder_layerdrop = decoder_layerdrop + self.hidden_act = hidden_act + self.positional_dropout = positional_dropout + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.scale_embedding = scale_embedding + + self.feat_extract_norm = feat_extract_norm + self.feat_proj_dropout = feat_proj_dropout + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + + if ( + 
(len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + self.num_mel_bins = num_mel_bins + self.speech_decoder_prenet_layers = speech_decoder_prenet_layers + self.speech_decoder_prenet_units = speech_decoder_prenet_units + self.speech_decoder_prenet_dropout = speech_decoder_prenet_dropout + self.speaker_embedding_dim = speaker_embedding_dim + + self.speech_decoder_postnet_layers = speech_decoder_postnet_layers + self.speech_decoder_postnet_units = speech_decoder_postnet_units + self.speech_decoder_postnet_kernel = speech_decoder_postnet_kernel + self.speech_decoder_postnet_dropout = speech_decoder_postnet_dropout + self.reduction_factor = reduction_factor + + self.max_speech_positions = max_speech_positions + self.max_text_positions = max_text_positions + self.encoder_max_relative_position = encoder_max_relative_position + + self.use_guided_attention_loss = use_guided_attention_loss + self.guided_attention_loss_num_heads = guided_attention_loss_num_heads + self.guided_attention_loss_sigma = guided_attention_loss_sigma + self.guided_attention_loss_scale = guided_attention_loss_scale + + self.use_cache = use_cache + self.is_encoder_decoder = is_encoder_decoder + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +class SpeechT5HifiGanConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SpeechT5HifiGanModel`]. It is used to instantiate + a SpeechT5 HiFi-GAN vocoder model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the SpeechT5 + [microsoft/speecht5_hifigan](https://huggingface.co/microsoft/speecht5_hifigan) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + model_in_dim (`int`, *optional*, defaults to 80): + The number of frequency bins in the input log-mel spectrogram. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the output audio will be generated, expressed in hertz (Hz). + upsample_initial_channel (`int`, *optional*, defaults to 512): + The number of input channels into the upsampling network. 
+ upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[4, 4, 4, 4]`): + A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The + length of *upsample_rates* defines the number of convolutional layers and has to match the length of + *upsample_kernel_sizes*. + upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 8, 8]`): + A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The + length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of + *upsample_rates*. + resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`): + A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field + fusion (MRF) module. + resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`): + A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the + multi-receptive field fusion (MRF) module. + initializer_range (`float`, *optional*, defaults to 0.01): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + leaky_relu_slope (`float`, *optional*, defaults to 0.1): + The angle of the negative slope used by the leaky ReLU activation. + normalize_before (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance. + + Example: + + ```python + >>> from paddlenlp.transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig + + >>> # Initializing a "microsoft/speecht5_hifigan" style configuration + >>> configuration = SpeechT5HifiGanConfig() + + >>> # Initializing a model (with random weights) from the "microsoft/speecht5_hifigan" style configuration + >>> model = SpeechT5HifiGan(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "hifigan" + + def __init__( + self, + model_in_dim=80, + sampling_rate=16000, + upsample_initial_channel=512, + upsample_rates=[4, 4, 4, 4], + upsample_kernel_sizes=[8, 8, 8, 8], + resblock_kernel_sizes=[3, 7, 11], + resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]], + initializer_range=0.01, + leaky_relu_slope=0.1, + normalize_before=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + self.model_in_dim = model_in_dim + self.sampling_rate = sampling_rate + self.upsample_initial_channel = upsample_initial_channel + self.upsample_rates = upsample_rates + self.upsample_kernel_sizes = upsample_kernel_sizes + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.initializer_range = initializer_range + self.leaky_relu_slope = leaky_relu_slope + self.normalize_before = normalize_before + super().__init__(**kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/feature_extraction.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/feature_extraction.py new file mode 100644 index 000000000..422c8c464 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/feature_extraction.py @@ -0,0 +1,394 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from typing import Any, Dict, List, Optional, Union + +import numpy as np +import paddle + +from ...utils.log import logger +from ..audio_utils import mel_filter_bank, optimal_fft_length, spectrogram +from ..feature_extraction_sequence_utils import SequenceFeatureExtractor +from ..feature_extraction_utils import BatchFeature +from ..tokenizer_utils_base import PaddingStrategy + +__all__ = ["SpeechT5FeatureExtractor"] + + +class SpeechT5FeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a SpeechT5 feature extractor. + + This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by + the SpeechT5 speech encoder prenet. + + This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder + prenet. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + padding_value (`float`, *optional*, defaults to 0.0): + The value that is used to fill the padding values. + do_normalize (`bool`, *optional*, defaults to `False`): + Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly + improve the performance for some models. + num_mel_bins (`int`, *optional*, defaults to 80): + The number of mel-frequency bins in the extracted spectrogram features. + hop_length (`int`, *optional*, defaults to 16): + Number of ms between windows. Otherwise referred to as "shift" in many papers. + win_length (`int`, *optional*, defaults to 64): + Number of ms per window. + win_function (`str`, *optional*, defaults to `"hann_window"`): + Name for the window function used for windowing, must be accessible via `paddle.{win_function}` + frame_signal_scale (`float`, *optional*, defaults to 1.0): + Constant multiplied in creating the frames before applying DFT. This argument is deprecated. + fmin (`float`, *optional*, defaults to 80): + Minimum mel frequency in Hz. + fmax (`float`, *optional*, defaults to 7600): + Maximum mel frequency in Hz. + mel_floor (`float`, *optional*, defaults to 1e-10): + Minimum value of mel frequency banks. + reduction_factor (`int`, *optional*, defaults to 2): + Spectrogram length reduction factor. This argument is deprecated. + return_attention_mask (`bool`, *optional*, defaults to `True`): + Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`. 
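As a quick orientation for the two input paths described above (waveform features via `audio`, log-mel targets via `audio_target`), here is a minimal usage sketch. It assumes the class is importable from `paddlenlp.transformers` as in the configuration examples above; the shapes shown are indicative and depend on the padding settings:

```python
import numpy as np

from paddlenlp.transformers import SpeechT5FeatureExtractor  # assumed import path

extractor = SpeechT5FeatureExtractor()  # 16 kHz sampling rate, 80 mel bins by default
waveform = np.random.randn(16000).astype(np.float32)  # 1 second of mono audio

# Waveform path: raw samples for the speech encoder prenet (normalized only if do_normalize=True).
inputs = extractor(audio=waveform, sampling_rate=16000, return_tensors="np")
print(inputs["input_values"][0].shape)  # (16000,)

# Target path: log-mel filterbank features for the speech decoder prenet.
targets = extractor(audio_target=waveform, sampling_rate=16000, return_tensors="np")
print(targets["input_values"][0].shape)  # (num_frames, 80)
```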
+ """ + + model_input_names = ["input_values", "attention_mask"] + + def __init__( + self, + feature_size: int = 1, + sampling_rate: int = 16000, + padding_value: float = 0.0, + do_normalize: bool = False, + num_mel_bins: int = 80, + hop_length: int = 16, + win_length: int = 64, + win_function: str = "hann_window", + frame_signal_scale: float = 1.0, + fmin: float = 80, + fmax: float = 7600, + mel_floor: float = 1e-10, + reduction_factor: int = 2, + return_attention_mask: bool = True, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.do_normalize = do_normalize + self.return_attention_mask = return_attention_mask + + self.num_mel_bins = num_mel_bins + self.hop_length = hop_length + self.win_length = win_length + self.win_function = win_function + self.frame_signal_scale = frame_signal_scale + self.fmin = fmin + self.fmax = fmax + self.mel_floor = mel_floor + self.reduction_factor = reduction_factor + + self.sample_size = win_length * sampling_rate // 1000 + self.sample_stride = hop_length * sampling_rate // 1000 + self.n_fft = optimal_fft_length(self.sample_size) + self.n_freqs = (self.n_fft // 2) + 1 + + window = paddle.audio.functional.get_window( + win_function.split("_")[0], win_length=self.sample_size, fftbins=True + ) + self.window = window.numpy().astype(np.float64) + + self.mel_filters = mel_filter_bank( + num_frequency_bins=self.n_freqs, + num_mel_filters=self.num_mel_bins, + min_frequency=self.fmin, + max_frequency=self.fmax, + sampling_rate=self.sampling_rate, + norm="slaney", + mel_scale="slaney", + ) + + if frame_signal_scale != 1.0: + warnings.warn( + "The argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformers", + FutureWarning, + ) + if reduction_factor != 2.0: + warnings.warn( + "The argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers", + FutureWarning, + ) + + @staticmethod + # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm + def zero_mean_unit_var_norm( + input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0 + ) -> List[np.ndarray]: + """ + Every array in the list is normalized to have zero mean and unit variance + """ + if attention_mask is not None: + attention_mask = np.array(attention_mask, np.int32) + normed_input_values = [] + + for vector, length in zip(input_values, attention_mask.sum(-1)): + normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7) + if length < normed_slice.shape[0]: + normed_slice[length:] = padding_value + + normed_input_values.append(normed_slice) + else: + normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values] + + return normed_input_values + + def _extract_mel_features( + self, + one_waveform: np.ndarray, + ) -> np.ndarray: + """ + Extracts log-mel filterbank features for one waveform array (unbatched). 
+ """ + log_mel_spec = spectrogram( + one_waveform, + window=self.window, + frame_length=self.sample_size, + hop_length=self.sample_stride, + fft_length=self.n_fft, + mel_filters=self.mel_filters, + mel_floor=self.mel_floor, + log_mel="log10", + ) + return log_mel_spec.T + + def __call__( + self, + audio: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None, + audio_target: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None, + padding: Union[bool, str, PaddingStrategy] = False, + max_length: Optional[int] = None, + truncation: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[str] = None, + sampling_rate: Optional[int] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel + spectrogram features. + + Args: + audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*): + The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must + be mono channel audio, not stereo, i.e. single float per timestep. + audio_target (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*): + The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a + list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel + spectrogram features. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific feature_extractor's default. + + [What are attention masks?](../glossary#attention-mask) + + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + - `'pd'`: Return PaddlePaddle `paddle.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + sampling_rate (`int`, *optional*): + The sampling rate at which the `audio` or `audio_target` input was sampled. 
It is strongly recommended + to pass `sampling_rate` at the forward call to prevent silent errors. + """ + if audio is None and audio_target is None: + raise ValueError("You must provide either `audio` or `audio_target` values.") + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + "It is strongly recommended to pass the ``sampling_rate`` argument to this function. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + if audio is not None: + inputs = self._process_audio( + audio, + False, + padding, + max_length, + truncation, + pad_to_multiple_of, + return_attention_mask, + return_tensors, + **kwargs, + ) + else: + inputs = None + + if audio_target is not None: + inputs_target = self._process_audio( + audio_target, + True, + padding, + max_length, + truncation, + pad_to_multiple_of, + return_attention_mask, + return_tensors, + **kwargs, + ) + + if inputs is None: + return inputs_target + else: + inputs["labels"] = inputs_target["input_values"] + decoder_attention_mask = inputs_target.get("attention_mask") + if decoder_attention_mask is not None: + inputs["decoder_attention_mask"] = decoder_attention_mask + + return inputs + + def _process_audio( + self, + speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + is_target: bool = False, + padding: Union[bool, str, PaddingStrategy] = False, + max_length: Optional[int] = None, + truncation: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[str] = None, + **kwargs, + ) -> BatchFeature: + is_batched_numpy = isinstance(speech, np.ndarray) and len(speech.shape) > 1 + if is_batched_numpy and len(speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(speech, (list, tuple)) and (isinstance(speech[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + speech = [np.asarray(speech, dtype=np.float32) for speech in speech] + elif not is_batched and not isinstance(speech, np.ndarray): + speech = np.asarray(speech, dtype=np.float32) + elif isinstance(speech, np.ndarray) and speech.dtype is np.dtype(np.float64): + speech = speech.astype(np.float32) + + # always return batch + if not is_batched: + speech = [speech] + + # needed to make pad() work on spectrogram inputs + feature_size_hack = self.feature_size + + # convert into correct format for padding + if is_target: + features = [self._extract_mel_features(waveform) for waveform in speech] + encoded_inputs = BatchFeature({"input_values": features}) + self.feature_size = self.num_mel_bins + else: + encoded_inputs = BatchFeature({"input_values": speech}) + + padded_inputs = self.pad( + encoded_inputs, + padding=padding, + max_length=max_length, + truncation=truncation, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + **kwargs, + ) + + self.feature_size = feature_size_hack + + # convert input values to correct format + input_values = padded_inputs["input_values"] + if not isinstance(input_values[0], np.ndarray): + padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values] + elif ( + not 
isinstance(input_values, np.ndarray) + and isinstance(input_values[0], np.ndarray) + and input_values[0].dtype is np.dtype(np.float64) + ): + padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values] + elif isinstance(input_values, np.ndarray) and input_values.dtype is np.dtype(np.float64): + padded_inputs["input_values"] = input_values.astype(np.float32) + + # convert attention_mask to correct format + attention_mask = padded_inputs.get("attention_mask") + if attention_mask is not None: + padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask] + + # zero-mean and unit-variance normalization + if not is_target and self.do_normalize: + attention_mask = ( + attention_mask + if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD + else None + ) + padded_inputs["input_values"] = self.zero_mean_unit_var_norm( + padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value + ) + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + def to_dict(self, *args, **kwargs) -> Dict[str, Any]: + output = super().to_dict(*args, **kwargs) + + # Don't serialize these as they are derived from the other properties. + names = ["window", "mel_filters", "sample_size", "sample_stride", "n_fft", "n_freqs"] + for name in names: + if name in output: + del output[name] + + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/modeling.py new file mode 100644 index 000000000..a94dabca1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/modeling.py @@ -0,0 +1,3112 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
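Before moving on to the modeling code below, a brief standalone sketch of the zero-mean/unit-variance normalization that the feature extractor's `zero_mean_unit_var_norm` helper applies when `do_normalize=True`: only the non-padded prefix of each sequence contributes to the statistics, and the padded tail is reset to `padding_value`. This is illustrative NumPy only, not the class method itself:

```python
import numpy as np


def normalize_with_padding(x: np.ndarray, valid_length: int, padding_value: float = 0.0) -> np.ndarray:
    """Normalize only the first `valid_length` samples, as the feature extractor does per batch entry."""
    out = (x - x[:valid_length].mean()) / np.sqrt(x[:valid_length].var() + 1e-7)
    out[valid_length:] = padding_value  # padded tail is re-filled, not normalized
    return out


padded = np.concatenate([np.random.randn(100).astype(np.float32), np.zeros(28, dtype=np.float32)])
normed = normalize_with_padding(padded, valid_length=100)
print(normed[:100].mean(), normed[:100].std())  # approximately 0.0 and 1.0
```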
+ +""" PyTorch SpeechT5 model.""" + +import math +import random +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn.functional as F +from paddle import nn +from paddle.distributed.fleet.utils import recompute +from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss + +from ...utils.initializer import ( + constant_, + kaiming_normal_, + normal_, + ones_, + uniform_, + zeros_, +) +from ...utils.log import logger +from ..activations import ACT2FN +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqSpectrogramOutput, +) +from ..model_utils import PretrainedModel + +# from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration import SpeechT5Config, SpeechT5HifiGanConfig + +_HIDDEN_STATES_START_POSITION = 1 + +# General docstring +_CONFIG_FOR_DOC = "SpeechT5Config" + + +SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/speecht5_asr", + "microsoft/speecht5_tts", + "microsoft/speecht5_vc", + # See all SpeechT5 models at https://huggingface.co/models?filter=speecht5 +] + + +def masked_fill(x, mask, value): + y = paddle.full(x.shape, value, x.dtype) + return paddle.where(mask, y, x) + + +def finfo(dtype: paddle.dtype = None): + if dtype is None: + dtype = paddle.get_default_dtype() + + if dtype == paddle.bfloat16: + # Numpy do not support `np.finfo(np.uint16)`, so try to construct a finfo object to fetch min value + class BFloatFInfo: + min = -3.3895313892515355e38 + + return BFloatFInfo + if dtype == paddle.float32: + return np.finfo(np.float32) + if dtype == paddle.float16: + return np.finfo(np.float16) + if dtype == paddle.float64: + return np.finfo(np.float64) + + +def Parameter(tensor): + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +# Copied from paddlenlp.transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: paddle.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = paddle.zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + masked_fill(shifted_input_ids, shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def shift_spectrograms_right(input_values: paddle.Tensor, reduction_factor: int = 1): + """ + Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length. + """ + # thin out frames for reduction factor + if reduction_factor > 1: + input_values = input_values[:, reduction_factor - 1 :: reduction_factor] + + shifted_input_values = paddle.zeros(input_values.shape) + shifted_input_values[:, 1:] = input_values[:, :-1].clone() + + # replace possible -100 values in labels by zeros + masked_fill(shifted_input_values, shifted_input_values == -100.0, 0.0) + + return shifted_input_values + + +# Copied from paddlenlp.transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: paddle.shape, dtype: paddle.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. 
+ """ + bsz, tgt_len = input_ids_shape + mask = paddle.full((tgt_len, tgt_len), float(finfo(dtype).min)) + mask_cond = paddle.arange(mask.shape[-1]) + masked_fill(mask, mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]), 0) + mask = mask.cast(dtype) + + if past_key_values_length > 0: + mask = paddle.concat([paddle.zeros([tgt_len, past_key_values_length], dtype=dtype), mask], axis=-1) + return mask[None, None, :, :].expand([bsz, 1, tgt_len, tgt_len + past_key_values_length]) + + +# Copied from paddlenlp.transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand([bsz, 1, tgt_len, src_len]).cast(dtype) + + inverted_mask = 1.0 - expanded_mask + + return masked_fill(inverted_mask, inverted_mask.cast("bool"), finfo(dtype).min) + + +# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[paddle.Tensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. + + Args: + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. + mask_length: size of the mask + min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
+ """ + batch_size, sequence_length = shape + + if mask_length < 1: + raise ValueError("`mask_length` has to be bigger than 0.") + + if mask_length > sequence_length: + raise ValueError( + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" + ) + + # epsilon is used for probabilistic rounding + epsilon = np.random.rand(1).item() + + def compute_num_masked_span(input_length): + """Given input length, compute how many spans should be masked""" + num_masked_span = int(mask_prob * input_length / mask_length + epsilon) + num_masked_span = max(num_masked_span, min_masks) + + # make sure num masked span <= sequence_length + if num_masked_span * mask_length > sequence_length: + num_masked_span = sequence_length // mask_length + + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + + return num_masked_span + + # compute number of masked spans in batch + input_lengths = ( + attention_mask.sum(-1).detach().tolist() + if attention_mask is not None + else [sequence_length for _ in range(batch_size)] + ) + + # SpecAugment mask to fill + spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool) + spec_aug_mask_idxs = [] + + max_num_masked_span = compute_num_masked_span(sequence_length) + + if max_num_masked_span == 0: + return spec_aug_mask + + for input_length in input_lengths: + # compute num of masked spans for this input + num_masked_span = compute_num_masked_span(input_length) + + # get random indices to mask + spec_aug_mask_idx = np.random.choice( + np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False + ) + + # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
+ if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] + + spec_aug_mask_idx = np.concatenate( + [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] + ) + spec_aug_mask_idxs.append(spec_aug_mask_idx) + + spec_aug_mask_idxs = np.array(spec_aug_mask_idxs) + + # expand masked indices to masked spans + spec_aug_mask_idxs = np.broadcast_to( + spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length) + ) + spec_aug_mask_idxs = spec_aug_mask_idxs.reshape([batch_size, max_num_masked_span * mask_length]) + + # add offset to the starting indexes so that indexes now create a span + offsets = np.arange(mask_length)[None, None, :] + offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( + [batch_size, max_num_masked_span * mask_length] + ) + spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + + # scatter indices to mask + np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) + + return spec_aug_mask + + +# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->SpeechT5 +class SpeechT5NoLayerNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->SpeechT5 +class SpeechT5LayerNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + + hidden_states = hidden_states.transpose(-2, -1) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose(-2, -1) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->SpeechT5 +class SpeechT5GroupNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + 
self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from paddlenlp.transformers.models.speech_to_text.modeling_speech_to_text.Speech2TextSinusoidalPositionalEmbedding with Speech2Text->SpeechT5 +class SpeechT5SinusoidalPositionalEmbedding(nn.Layer): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None): + super().__init__() + self.offset = 2 + self.embedding_dim = embedding_dim + self.padding_idx = padding_idx + self.make_weights(num_positions + self.offset, embedding_dim, padding_idx) + + def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx) + if hasattr(self, "weights"): + # in forward put the weights on the correct dtype and device of the param + emb_weights = emb_weights.cast(dtype=self.weights.dtype) + + self.weights = Parameter(emb_weights) + self.weights.stop_gradient = True + self.weights.detach() + + @staticmethod + def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): + """ + Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the + description in Section 3.5 of "Attention Is All You Need". + """ + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = paddle.exp(paddle.arange(half_dim, dtype="float32") * -emb) + emb = paddle.arange(num_embeddings, dtype="float32").unsqueeze(1) * emb.unsqueeze(0) + emb = paddle.concat([paddle.sin(emb), paddle.cos(emb)], axis=1).reshape([num_embeddings, -1]) + if embedding_dim % 2 == 1: + # zero pad + emb = paddle.concat([emb, paddle.zeros([num_embeddings, 1])], axis=1) + if padding_idx is not None: + emb[padding_idx, :] = 0 + return emb.cast(paddle.get_default_dtype()) + + @paddle.no_grad() + def forward(self, input_ids: paddle.Tensor, past_key_values_length: int = 0): + bsz, seq_len = input_ids.shape + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + + # expand embeddings if needed + max_pos = self.padding_idx + 1 + seq_len + if max_pos > self.weights.shape[0]: + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + + return self.weights.index_select(axis=0, index=position_ids.reshape([-1])).reshape([bsz, seq_len, -1]).detach() + + def create_position_ids_from_input_ids( + self, input_ids: paddle.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0 + ): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. 
+
+        Args:
+            input_ids: paddle.Tensor
+        Returns: paddle.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        # mask = input_ids.ne(padding_idx).cast('int32')
+        mask = input_ids.cast("int64").not_equal(paddle.to_tensor([padding_idx], dtype="int64")).cast("int32")
+        incremental_indices = (paddle.cumsum(mask, axis=1).cast(mask.dtype) + past_key_values_length) * mask
+        return incremental_indices.cast("int64") + padding_idx
+
+
+# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->SpeechT5
+class SpeechT5PositionalConvEmbedding(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1D(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,
+            groups=config.num_conv_pos_embedding_groups,
+        )
+        # self.conv = nn.utils.weight_norm(self.conv, name="weight")
+        self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose([0, 2, 1])
+
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = hidden_states.transpose([0, 2, 1])
+        return hidden_states
+
+
+class SpeechT5ScaledPositionalEncoding(nn.Layer):
+    """
+    Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
+    """
+
+    def __init__(self, dropout, dim, max_len=5000):
+        pe = paddle.zeros([max_len, dim])
+        position = paddle.arange(0, max_len).unsqueeze(1)
+        div_term = paddle.exp((paddle.arange(0, dim, 2, dtype="float32") * -(math.log(10000.0) / dim)))
+        pe[:, 0::2] = paddle.sin(position.cast("float32") * div_term)
+        pe[:, 1::2] = paddle.cos(position.cast("float32") * div_term)
+        pe = pe.unsqueeze(0)
+        super().__init__()
+        self.register_buffer("pe", pe)
+        self.dropout = nn.Dropout(p=dropout)
+        self.dim = dim
+        self.alpha = Parameter(paddle.to_tensor([1.0]))
+
+    def forward(self, emb):
+        emb = emb + self.alpha * self.pe[:, : emb.shape[1]]
+        emb = self.dropout(emb)
+        return emb
+
+
+class SpeechT5RelativePositionalEncoding(nn.Layer):
+    def __init__(self, dim, max_length=1000):
+        super().__init__()
+        self.dim = dim
+        self.max_length = max_length
+        self.pe_k = paddle.nn.Embedding(2 * max_length, dim)
+
+    def forward(self, hidden_states):
+        seq_len = hidden_states.shape[1]
+        pos_seq = paddle.arange(0, seq_len).cast("int64")
+        pos_seq = pos_seq[:, None] - pos_seq[None, :]
+
+        pos_seq[pos_seq < -self.max_length] = -self.max_length
+        pos_seq[pos_seq >= self.max_length] = self.max_length - 1
+        pos_seq = pos_seq + self.max_length
+
+        return self.pe_k(pos_seq)
+
+
+# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->SpeechT5
+class SpeechT5SamePadLayer(nn.Layer):
+    def __init__(self, num_conv_pos_embeddings):
+        super().__init__()
+        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
+
+    def forward(self, hidden_states):
+        if self.num_pad_remove > 0:
+            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
+        return hidden_states
+
+
+# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SpeechT5
+class SpeechT5FeatureEncoder(nn.Layer):
+    """Construct the features from raw audio waveform"""
+
+    def __init__(self, config):
+        super().__init__()
+
+        if config.feat_extract_norm == "group":
+            conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
+                SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
+            ]
+        elif config.feat_extract_norm == "layer":
+            conv_layers = [
+                SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
+            ]
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+        self.conv_layers = nn.LayerList(conv_layers)
+        self.gradient_checkpointing = False
+        self._requires_grad = True
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.stop_gradient = True
+        self._requires_grad = False
+
+    def forward(self, input_values):
+        hidden_states = input_values[:, None]
+
+        # make sure hidden_states require grad for gradient_checkpointing
+        if self._requires_grad and self.training:
+            hidden_states.stop_gradient = False
+
+        for conv_layer in self.conv_layers:
+            if self._requires_grad and self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                hidden_states = recompute(
+                    create_custom_forward(conv_layer),
+                    hidden_states,
+                )
+            else:
+                hidden_states = conv_layer(hidden_states.cast("float32"))
+
+        return hidden_states
+
+
+# Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->SpeechT5
+class SpeechT5FeatureProjection(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], epsilon=config.layer_norm_eps)
+        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # non-projected hidden states are needed for quantization
+        norm_hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states, norm_hidden_states
+
+
+class SpeechT5SpeechEncoderPrenet(nn.Layer):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.feature_encoder = SpeechT5FeatureEncoder(config)
+        self.feature_projection = SpeechT5FeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = Parameter(uniform_(paddle.to_tensor([config.hidden_size], dtype="float32"), 0, 1))
+
+        self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
+        self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
+            config.max_speech_positions + config.pad_token_id + 1,
+            config.hidden_size,
+            config.pad_token_id,
+        )
+
+    def freeze_feature_encoder(self):
+        self.feature_encoder._freeze_parameters()
+
+    def forward(
+        self,
+        input_values: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor] = None,
+        mask_time_indices: Optional[paddle.Tensor] = None,
+    ):
+        extract_features = self.feature_encoder(input_values)
+        extract_features = extract_features.transpose([0, 2, 1])
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1],
+                attention_mask,
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states
= self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + positional_conv_embedding = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + positional_conv_embedding + + if attention_mask is not None: + padding_mask = attention_mask.not_equal(paddle.to_tensor([1], dtype="bool")).cast("int64") + else: + padding_mask = paddle.zeros(hidden_states.shape[:2], dtype="int64") + + positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask) + hidden_states = hidden_states + positional_sinusoidal_embeddings + return hidden_states, attention_mask + + # Copied from paddlenlp.transformers.models.unispeech.modeling_unispeech.UniSpeechPretrainedModel._get_feature_vector_attention_mask + def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: paddle.Tensor): + # Effectively attention_mask.sum(-1), but not inplace to be able to run + # on inference mode. + non_padded_lengths = attention_mask.cumsum(axis=-1)[:, -1] + output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).cast("int64") + batch_size = attention_mask.shape[0] + + attention_mask = paddle.zeros((batch_size, feature_vector_length), dtype=attention_mask.dtype) + # these two operations makes sure that all values before the output lengths idxs are attended to + attention_mask[(paddle.arange(attention_mask.shape[0]), output_lengths - 1)] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).cast("bool") + return attention_mask + + # Copied from paddlenlp.transformers.models.unispeech.modeling_unispeech.UniSpeechPretrainedModel._get_feat_extract_output_lengths + def _get_feat_extract_output_lengths(self, input_lengths: Union[paddle.Tensor, int]): + """ + Computes the output length of the convolutional layers + """ + + def _conv_out_length(input_length, kernel_size, stride): + # 1D convolutional layer output length formula taken + # from https://pytorch.org/docs/stable/generated/paddle.nn.Conv1D.html + # return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1 + return (input_length - kernel_size) // stride + 1 + + for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride): + if isinstance(input_lengths, paddle.Tensor): + input_lengths = input_lengths.cast("int64") + input_lengths = _conv_out_length(input_lengths, kernel_size, stride) + + return input_lengths + + # Copied from paddlenlp.transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states + def _mask_hidden_states( + self, + hidden_states: paddle.Tensor, + mask_time_indices: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). 
+ """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.shape + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.cast(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = paddle.to_tensor(mask_time_indices, dtype="bool") + hidden_states[mask_time_indices] = self.masked_spec_embed.cast(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = paddle.to_tensor(mask_feature_indices, dtype="bool") + mask_feature_indices = mask_feature_indices[:, None].expand([-1, sequence_length, -1]) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + +class SpeechT5SpeechDecoderPrenet(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + + self.layers = nn.LayerList( + [ + nn.Linear( + config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units, + config.speech_decoder_prenet_units, + ) + for i in range(config.speech_decoder_prenet_layers) + ] + ) + + self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size) + + self.encode_positions = SpeechT5ScaledPositionalEncoding( + config.positional_dropout, + config.hidden_size, + config.max_speech_positions, + ) + + self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size) + + def forward( + self, + input_values: paddle.Tensor, + speaker_embeddings: Optional[paddle.Tensor] = None, + ): + # Dropout is always applied, even when evaluating. See §2.2 in https://arxiv.org/abs/1712.05884. 
+ + inputs_embeds = input_values + for layer in self.layers: + inputs_embeds = nn.functional.relu(layer(inputs_embeds)) + inputs_embeds = nn.functional.dropout( + inputs_embeds, self.config.speech_decoder_prenet_dropout, training=True + ) + + inputs_embeds = self.final_layer(inputs_embeds) + inputs_embeds = self.encode_positions(inputs_embeds) + + if speaker_embeddings is not None: + speaker_embeddings = nn.functional.normalize(speaker_embeddings) + speaker_embeddings = speaker_embeddings.unsqueeze(1) + speaker_embeddings = speaker_embeddings.expand([-1, inputs_embeds.shape[1], -1]) + inputs_embeds = paddle.concat([inputs_embeds, speaker_embeddings], axis=-1) + inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds)) + + return inputs_embeds + + +class SpeechT5BatchNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + + if layer_id == 0: + in_conv_dim = config.num_mel_bins + else: + in_conv_dim = config.speech_decoder_postnet_units + + if layer_id == config.speech_decoder_postnet_layers - 1: + out_conv_dim = config.num_mel_bins + else: + out_conv_dim = config.speech_decoder_postnet_units + + self.conv = nn.Conv1D( + in_conv_dim, + out_conv_dim, + kernel_size=config.speech_decoder_postnet_kernel, + stride=1, + padding=(config.speech_decoder_postnet_kernel - 1) // 2, + bias_attr=False, + ) + self.batch_norm = nn.BatchNorm1D(out_conv_dim) + + if layer_id < config.speech_decoder_postnet_layers - 1: + self.activation = nn.Tanh() + else: + self.activation = None + + self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.batch_norm(hidden_states) + if self.activation is not None: + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class SpeechT5SpeechDecoderPostnet(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + + self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor) + self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor) + + self.layers = nn.LayerList( + [SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)] + ) + + def forward(self, hidden_states: paddle.Tensor): + outputs_before_postnet = self.feat_out(hidden_states).reshape( + [hidden_states.shape[0], -1, self.config.num_mel_bins] + ) + outputs_after_postnet = self.postnet(outputs_before_postnet) + logits = self.prob_out(hidden_states).reshape([hidden_states.shape[0], -1]) + return outputs_before_postnet, outputs_after_postnet, logits + + def postnet(self, hidden_states: paddle.Tensor): + layer_output = hidden_states.transpose([0, 2, 1]) + for layer in self.layers: + layer_output = layer(layer_output) + return hidden_states + layer_output.transpose([0, 2, 1]) + + +class SpeechT5TextEncoderPrenet(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size) + self.encode_positions = SpeechT5ScaledPositionalEncoding( + config.positional_dropout, + config.hidden_size, + config.max_text_positions, + ) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward(self, input_ids: paddle.Tensor): + inputs_embeds = self.embed_tokens(input_ids) + inputs_embeds = self.encode_positions(inputs_embeds) + 
return inputs_embeds + + +class SpeechT5TextDecoderPrenet(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.dropout = nn.Dropout(config.positional_dropout) + self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0 + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) + + self.embed_positions = SpeechT5SinusoidalPositionalEmbedding( + config.max_text_positions + config.pad_token_id + 1, + config.hidden_size, + config.pad_token_id, + ) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + ): + if input_ids is not None: + input_shape = input_ids.shape + input_ids = input_ids.reshape([-1, input_shape[-1]]) + else: + raise ValueError("You have to specify `decoder_input_ids`") + + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + positions = self.embed_positions(input_ids, past_key_values_length) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + inputs_embeds += positions + inputs_embeds = self.dropout(inputs_embeds) + + return inputs_embeds, attention_mask + + +class SpeechT5TextDecoderPostnet(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + + def forward(self, hidden_states: paddle.Tensor): + return self.lm_head(hidden_states) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + +class SpeechT5Attention(nn.Layer): + """ + Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see + https://aclanthology.org/N18-2074.pdf) + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + key_value_states: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + layer_head_mask: Optional[paddle.Tensor] = None, + position_bias: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.shape + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(paddle.Tensor, paddle.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(paddle.Tensor, paddle.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape) + key_states = key_states.reshape(proj_shape) + value_states = value_states.reshape(proj_shape) + + src_len = key_states.shape[1] + attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1])) + + if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]: + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.shape}" + ) + + # relative attention bias + if position_bias is not None: + reshape_q = query_states.reshape([bsz * self.num_heads, -1, self.head_dim]).transpose([1, 0, 2]) + rel_pos_bias = paddle.matmul(reshape_q, position_bias.transpose([0, 2, 1])) + rel_pos_bias = rel_pos_bias.transpose([1, 0, 2]).reshape( + [bsz * self.num_heads, position_bias.shape[0], position_bias.shape[1]] + ) + attn_weights += rel_pos_bias + + if attention_mask is not None: + if attention_mask.shape != [bsz, 1, tgt_len, src_len]: + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.shape}" + ) + attn_weights = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attention_mask + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) + + attn_weights = nn.functional.softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + if layer_head_mask.shape != [ + self.num_heads, + ]: + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.shape}" + ) + attn_weights = layer_head_mask.reshape([1, -1, 1, 1]) * attn_weights.reshape( + [bsz, self.num_heads, tgt_len, src_len] + ) + attn_weights = attn_weights.reshape([bsz * self.num_heads, tgt_len, src_len]) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.reshape([bsz, self.num_heads, tgt_len, src_len]) + attn_weights = attn_weights_reshaped.reshape([bsz * self.num_heads, tgt_len, src_len]) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = paddle.bmm(attn_probs, value_states) + if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]: + raise ValueError( + f"`attn_output` should be of size {[bsz, self.num_heads, tgt_len, self.head_dim]}, but is" + f" {attn_output.shape}" + ) + + attn_output = attn_output.reshape([bsz, self.num_heads, tgt_len, self.head_dim]) + attn_output = attn_output.transpose([0, 2, 1, 3]) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned aross GPUs when using tensor-parallelism. 
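A minimal shape check for the attention module above, assuming the `SpeechT5Attention` class from this file is in scope; the batch, length, and width values are illustrative only:

```python
import paddle

bsz, tgt_len, embed_dim, num_heads = 2, 6, 768, 12  # illustrative sizes

attn = SpeechT5Attention(embed_dim=embed_dim, num_heads=num_heads, dropout=0.1, is_decoder=True)
hidden_states = paddle.randn([bsz, tgt_len, embed_dim])

attn_output, attn_weights, past_key_value = attn(
    hidden_states=hidden_states,
    output_attentions=True,
)

print(attn_output.shape)        # [2, 6, 768]   -- same width as the input
print(attn_weights.shape)       # [2, 12, 6, 6] -- (bsz, num_heads, tgt_len, src_len)
print(past_key_value[0].shape)  # [2, 12, 6, 64] -- cached keys, returned because is_decoder=True
```

Feeding `past_key_value` back into the next call appends the new key/value states along the time axis, which is what the decoder below relies on for incremental generation.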
+ attn_output = attn_output.reshape([bsz, tgt_len, self.embed_dim]) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class SpeechT5FeedForward(nn.Layer): + def __init__(self, config, intermediate_size): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class SpeechT5EncoderLayer(nn.Layer): + def __init__(self, config: SpeechT5Config): + super().__init__() + self.attention = SpeechT5Attention( + embed_dim=config.hidden_size, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + layer_head_mask: Optional[paddle.Tensor] = None, + position_bias: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`paddle.Tensor`): + input to the layer of shape `(batch, seq_len, hidden_size)` + attention_mask (`paddle.Tensor`): + attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very + large negative values. + layer_head_mask (`paddle.Tensor`): mask for attention heads in a given layer of size + `(config.encoder_attention_heads,)`. + position_bias (`paddle.Tensor`): + relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)` + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + position_bias=position_bias, + output_attentions=output_attentions, + ) + + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class SpeechT5DecoderLayer(nn.Layer): + def __init__(self, config: SpeechT5Config): + super().__init__() + self.self_attn = SpeechT5Attention( + embed_dim=config.hidden_size, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + self.encoder_attn = SpeechT5Attention( + config.hidden_size, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + layer_head_mask: Optional[paddle.Tensor] = None, + cross_attn_layer_head_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, hidden_size)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`paddle.Tensor`): + cross attention input to the layer of shape `(batch, seq_len, hidden_size)` + encoder_attention_mask (`paddle.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`paddle.Tensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`paddle.Tensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(paddle.Tensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = self.dropout(hidden_states) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class SpeechT5PretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = SpeechT5Config + base_model_prefix = "speecht5" + main_input_name = "input_values" + supports_gradient_checkpointing = True + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, SpeechT5PositionalConvEmbedding): + + normal_( + module.conv.weight, + mean=0, + std=2 * math.sqrt(1 / (module.conv._kernel_size[0] * module.conv._in_channels)), + ) + constant_(module.conv.bias, 0) + elif isinstance(module, SpeechT5FeatureProjection): + # module.projection.weight.shape[0] == module.projection.in_features + k = math.sqrt(1 / module.projection.weight.shape[0]) + uniform_(module.projection.weight, a=-k, b=k) + uniform_(module.projection.bias, a=-k, b=k) + elif isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Conv1D): + kaiming_normal_(module.weight) + if module.bias is not None: + k = math.sqrt(module._groups / (module._in_channels * module._kernel_size[0])) + uniform_(module.bias, a=-k, b=k) + elif isinstance(module, nn.Embedding): + normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module._padding_idx is not None: + zeros_(module.weight[module._padding_idx]) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (SpeechT5Encoder, SpeechT5Decoder, SpeechT5FeatureEncoder)): + module.gradient_checkpointing = value + + +class SpeechT5Encoder(SpeechT5PretrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`]. + """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layerdrop = config.encoder_layerdrop + + self.layers = nn.LayerList([SpeechT5EncoderLayer(config) for _ in range(config.encoder_layers)]) + + self.embed_positions = SpeechT5RelativePositionalEncoding( + config.hidden_size // config.encoder_attention_heads, config.encoder_max_relative_position + ) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + """ + Args: + hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, feature_size)`): + Features extracted from the speech or text input by the encoder prenet. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing convolution and attention on padding token indices. Mask values selected in + `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. 
+ head_mask (`paddle.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + position_bias = self.embed_positions(hidden_states) + + # deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.shape[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + # print(dropout_probability) + + skip_the_layer = self.training and (dropout_probability < self.layerdrop) + if not skip_the_layer: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + position_bias, + ) + else: + + layer_outputs = encoder_layer( + hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class SpeechT5EncoderWithSpeechPrenet(SpeechT5PretrainedModel): + """ + Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to + hidden features. 
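The encoder loop above applies LayerDrop: during training each whole layer is skipped with probability `config.encoder_layerdrop`, and the hidden states simply pass through unchanged. A stripped-down version of that control flow, with masks, attention outputs, and gradient checkpointing omitted:

```python
import numpy as np

def run_with_layerdrop(layers, hidden_states, layerdrop, training):
    # Each layer is a callable taking and returning hidden states (simplified).
    for layer in layers:
        if training and np.random.uniform(0, 1) < layerdrop:
            continue  # skip the layer entirely; the residual stream is left untouched
        hidden_states = layer(hidden_states)
    return hidden_states
```

At inference time (`training=False`) every layer always runs, matching the `skip_the_layer` condition above.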
+ """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.prenet = SpeechT5SpeechEncoderPrenet(config) + self.wrapped_encoder = SpeechT5Encoder(config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def forward( + self, + input_values: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + hidden_states, attention_mask = self.prenet(input_values, attention_mask) + + outputs = self.wrapped_encoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs + + +class SpeechT5EncoderWithTextPrenet(SpeechT5PretrainedModel): + """ + Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features. + """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.prenet = SpeechT5TextEncoderPrenet(config) + self.wrapped_encoder = SpeechT5Encoder(config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self): + return self.prenet.get_input_embeddings() + + def set_input_embeddings(self, value): + self.prenet.set_input_embeddings(value) + + def forward( + self, + input_values: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + hidden_states = self.prenet(input_values) + + outputs = self.wrapped_encoder( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return outputs + + +class SpeechT5EncoderWithoutPrenet(SpeechT5PretrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with + [`SpeechT5Model`]. + """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.wrapped_encoder = SpeechT5Encoder(config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def forward( + self, + input_values: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + return self.wrapped_encoder( + hidden_states=input_values, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class SpeechT5Decoder(SpeechT5PretrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a [`SpeechT5DecoderLayer`] + """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.layerdrop = config.decoder_layerdrop + + self.layers = nn.LayerList([SpeechT5DecoderLayer(config) for _ in range(config.decoder_layers)]) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + # Copied from paddlenlp.transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + hidden_states: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + Args: + hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, feature_size)`): + Features extracted from the speech or text input by the decoder prenet. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`paddle.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`paddle.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. 
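`_prepare_decoder_attention_mask` above adds two additive masks together: a causal mask built by `_make_causal_mask` (defined elsewhere in this file, not in this hunk) and the expanded padding mask. The sketch below is an illustrative stand-in that shows the shapes involved, not the real helpers:

```python
import paddle

def toy_causal_mask(tgt_len, past_len=0):
    # 0.0 where a query may attend, -1e9 on future positions; cached (past) positions stay visible.
    mask = paddle.full([tgt_len, tgt_len], -1e9, dtype="float32")
    mask = paddle.triu(mask, diagonal=1)
    if past_len > 0:
        mask = paddle.concat([paddle.zeros([tgt_len, past_len], dtype="float32"), mask], axis=-1)
    return mask.unsqueeze([0, 1])  # [1, 1, tgt_len, past_len + tgt_len]

causal = toy_causal_mask(tgt_len=4, past_len=2)
padding = paddle.zeros([1, 1, 4, 6], dtype="float32")  # what an `_expand_mask`-style helper would return
combined = causal + padding                            # same additive combination as above
print(combined.shape)                                  # [1, 1, 4, 6]
```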
Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`paddle.Tensor` of + shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing + `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more + control over how to convert `input_ids` indices into associated vectors than the model's internal + embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_shape = hidden_states.shape[:-1] + + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, hidden_states, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]) + + # deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.shape[0]}." + ) + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + + skip_the_layer = self.training and (dropout_probability < self.layerdrop) + if skip_the_layer: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class SpeechT5DecoderWithSpeechPrenet(SpeechT5PretrainedModel): + """ + Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden + features. 
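The cache and attention collection above depends on the ordering of each `SpeechT5DecoderLayer` output tuple: attention weights occupy slots 1-2 only when `output_attentions=True`, so the cached key/values end up at index 3 or index 1. A small helper restating that indexing:

```python
def split_decoder_layer_outputs(layer_outputs, output_attentions, use_cache):
    # (hidden_states, self_attn?, cross_attn?, present_key_value?) -- optional slots shift the indices.
    hidden_states = layer_outputs[0]
    self_attn = layer_outputs[1] if output_attentions else None
    cross_attn = layer_outputs[2] if output_attentions else None
    present_key_value = layer_outputs[3 if output_attentions else 1] if use_cache else None
    return hidden_states, self_attn, cross_attn, present_key_value
```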
+ """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.prenet = SpeechT5SpeechDecoderPrenet(config) + self.wrapped_decoder = SpeechT5Decoder(config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def forward( + self, + input_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + speaker_embeddings: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + decoder_hidden_states = self.prenet(input_values, speaker_embeddings) + + outputs = self.wrapped_decoder( + hidden_states=decoder_hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return outputs + + +class SpeechT5DecoderWithTextPrenet(SpeechT5PretrainedModel): + """ + Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features. + """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.prenet = SpeechT5TextDecoderPrenet(config) + self.wrapped_decoder = SpeechT5Decoder(config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self): + return self.prenet.get_input_embeddings() + + def set_input_embeddings(self, value): + self.prenet.set_input_embeddings(value) + + def forward( + self, + input_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values) + + outputs = self.wrapped_decoder( + hidden_states=decoder_hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return outputs + + +class SpeechT5DecoderWithoutPrenet(SpeechT5PretrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with + [`SpeechT5Model`]. 
+ """ + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + self.wrapped_decoder = SpeechT5Decoder(config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.init_weights() + + def forward( + self, + input_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + outputs = self.wrapped_decoder( + hidden_states=input_values, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs + + +class SpeechT5GuidedMultiheadAttentionLoss(nn.Layer): + """ + Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional + Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention. + """ + + def __init__(self, config: SpeechT5Config): + super().__init__() + self.sigma = config.guided_attention_loss_sigma + self.scale = config.guided_attention_loss_scale + + def forward( + self, attentions: paddle.Tensor, input_masks: paddle.Tensor, output_masks: paddle.Tensor + ) -> paddle.Tensor: + """ + Compute the attention loss. + + Args: + attentions (`paddle.Tensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`): + Batch of multi-head attention weights + input_masks (`paddle.Tensor` of shape `(batch_size, input_sequence_length)`): + Input attention mask as booleans. + output_masks (`paddle.Tensor` of shape `(batch_size, output_sequence_length)`): + Target attention mask as booleans. 
+ + Returns: + `paddle.Tensor` with the loss value + """ + guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks) + masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2) + masks = masks.unsqueeze(1) + + losses = guided_attn_masks * attentions + loss = paddle.mean(losses.masked_select(masks)) + return self.scale * loss + + def _make_guided_attention_masks(self, input_masks, output_masks): + input_lengths = input_masks.sum(-1) + output_lengths = output_masks.sum(-1) + + guided_attn_masks = paddle.zeros((len(input_masks), output_masks.shape[1], input_masks.shape[1])) + + for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)): + guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma) + + return guided_attn_masks.unsqueeze(1) + + @staticmethod + def _make_guided_attention_mask(input_length, output_length, sigma): + grid_y, grid_x = paddle.meshgrid( + paddle.arange(input_length), + paddle.arange(output_length), + indexing="xy", + ) + grid_x = grid_x.cast("float32") / output_length + grid_y = grid_y.cast("float32") / input_length + return 1.0 - paddle.exp(-((grid_y - grid_x) ** 2) / (2 * (sigma**2))) + + +class SpeechT5SpectrogramLoss(nn.Layer): + """ + Loss computation used by SpeechT5ForTextToSpeech. + """ + + def __init__(self, config: SpeechT5Config): + super().__init__() + self.use_guided_attention_loss = config.use_guided_attention_loss + self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads + self.reduction_factor = config.reduction_factor + + self.l1_criterion = L1Loss() + self.bce_criterion = BCEWithLogitsLoss(pos_weight=paddle.to_tensor([5.0])) + + if self.use_guided_attention_loss: + self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config) + + def forward( + self, + attention_mask: paddle.Tensor, + outputs_before_postnet: paddle.Tensor, + outputs_after_postnet: paddle.Tensor, + logits: paddle.Tensor, + labels: paddle.Tensor, + cross_attentions: Optional[paddle.Tensor] = None, + ) -> paddle.Tensor: + padding_mask = labels != -100.0 + + # mask out the padded portions + labels = labels.masked_select(padding_mask) + outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask) + outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask) + + # spectrogram loss + l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels) + + # construct stop labels from the padding mask + masks = padding_mask[:, :, 0] + stop_labels = paddle.concat([~masks * 1.0, paddle.ones(masks.shape[0], 1)], axis=1) + stop_labels = stop_labels[:, 1:].masked_select(masks) + logits = logits.masked_select(masks) + + # stop token loss + bce_loss = self.bce_criterion(logits, stop_labels) + + # combined loss + loss = l1_loss + bce_loss + + # guided attention loss + if self.use_guided_attention_loss: + attn = paddle.concat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], axis=1) + input_masks = attention_mask == 1 + output_masks = padding_mask[:, :, 0] + if self.reduction_factor > 1: + output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor] + attn_loss = self.attn_criterion(attn, input_masks, output_masks) + loss += attn_loss + + return loss + + +class SpeechT5Model(SpeechT5PretrainedModel): + def __init__( + self, + config: SpeechT5Config, + encoder: Optional[nn.Layer] = None, + decoder: Optional[nn.Layer] = None, + ): + super().__init__(config) + self.config = config + 
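`_make_guided_attention_mask` above penalizes attention that strays from the rough diagonal between decoder and encoder positions. The same surface can be written with plain broadcasting; the sigma value here is illustrative:

```python
import paddle

def toy_guided_attention_mask(input_length, output_length, sigma=0.4):
    # Entry [o, i] is near 0 where o / output_length ~= i / input_length and approaches 1 off the diagonal.
    in_pos = paddle.arange(input_length).cast("float32") / input_length
    out_pos = paddle.arange(output_length).cast("float32") / output_length
    diff = in_pos.unsqueeze(0) - out_pos.unsqueeze(1)         # [output_length, input_length]
    return 1.0 - paddle.exp(-(diff ** 2) / (2 * sigma ** 2))

mask = toy_guided_attention_mask(input_length=6, output_length=4)
print(mask.shape)  # [4, 6]
```

Multiplying this surface with the per-head cross-attention weights and averaging over non-padded positions gives the penalty that `SpeechT5SpectrogramLoss` adds on top of its L1 and stop-token terms.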
self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder + self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder + + # Initialize weights and apply final processing + self.init_weights() + + def get_input_embeddings(self): + if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet): + return self.encoder.get_input_embeddings() + if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet): + return self.decoder.get_input_embeddings() + return None + + def set_input_embeddings(self, value): + if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet): + self.encoder.set_input_embeddings(value) + if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet): + self.decoder.set_input_embeddings(value) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet): + self.encoder.prenet.freeze_feature_encoder() + + def forward( + self, + input_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_values: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + decoder_head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + speaker_embeddings: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[paddle.Tensor], Seq2SeqModelOutput]: + r""" + input_values (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Depending on which encoder is being used, the `input_values` are either: float values of the input raw + speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states. + + decoder_input_values (`paddle.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel + filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in + the vocabulary, or hidden states. + + speaker_embeddings (`paddle.Tensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*): + Tensor containing the speaker embeddings. 
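`SpeechT5Model` itself is prenet-agnostic; the task heads defined further below choose the encoder/decoder wrappers. The pairings in this sketch come from those constructors, while the helper function and task keys are hypothetical:

```python
def build_speecht5_backbone(config, task):
    # "s2t": speech-to-text, "t2s": text-to-speech, "s2s": speech-to-speech (hypothetical keys).
    encoders = {
        "s2t": SpeechT5EncoderWithSpeechPrenet,
        "t2s": SpeechT5EncoderWithTextPrenet,
        "s2s": SpeechT5EncoderWithSpeechPrenet,
    }
    decoders = {
        "s2t": SpeechT5DecoderWithTextPrenet,
        "t2s": SpeechT5DecoderWithSpeechPrenet,
        "s2s": SpeechT5DecoderWithSpeechPrenet,
    }
    return SpeechT5Model(config, encoders[task](config), decoders[task](config))
```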
+ + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # Encode if needed (training, first prediction pass) + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_values=input_values, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # downsample encoder attention mask (only for encoders with speech input) + if attention_mask is not None and isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet): + encoder_attention_mask = self.encoder.prenet._get_feature_vector_attention_mask( + encoder_outputs[0].shape[1], attention_mask + ) + else: + encoder_attention_mask = attention_mask + + if isinstance(self.decoder, SpeechT5DecoderWithSpeechPrenet): + decoder_args = {"speaker_embeddings": speaker_embeddings} + else: + decoder_args = {} + + decoder_outputs = self.decoder( + input_values=decoder_input_values, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=encoder_attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **decoder_args, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class SpeechT5ForSpeechToText(SpeechT5PretrainedModel): + _keys_to_ignore_on_load_missing = [ + r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", + r"text_decoder_postnet.lm_head.weight", + ] + _keys_to_ignore_on_save = [ + r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", + ] + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that does not define the" + " vocabulary size of the language model head. Please instantiate the model as follows:" + " `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of" + " your model's configuration." 
+ ) + + speech_encoder = SpeechT5EncoderWithSpeechPrenet(config) + text_decoder = SpeechT5DecoderWithTextPrenet(config) + self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder) + + self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config) + + # Initialize weights and apply final processing + self.init_weights() + + def get_encoder(self): + return self.speecht5.get_encoder() + + def get_decoder(self): + return self.speecht5.get_decoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.get_encoder().prenet.freeze_feature_encoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens) + return new_embeddings + + def get_output_embeddings(self): + return self.text_decoder_postnet.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.text_decoder_postnet.set_output_embeddings(new_embeddings) + + def forward( + self, + input_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_ids: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + decoder_head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + ) -> Union[Tuple, Seq2SeqLMOutput]: + r""" + input_values (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file + into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install + soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding + and conversion into a tensor of type `paddle.Tensor`. See [`SpeechT5Processor.__call__`] for details. + + decoder_input_ids (`paddle.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + labels (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` + or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is + only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
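When `labels` are given and `decoder_input_ids` are not, the forward body below derives them with `shift_tokens_right`, which is imported elsewhere in this file and not shown in this hunk. The following is only an assumed sketch of what such a helper conventionally does (prepend the start token, drop the last position, replace the `-100` ignore index with padding):

```python
import paddle

def toy_shift_tokens_right(labels, pad_token_id, decoder_start_token_id):
    shifted = paddle.full_like(labels, pad_token_id)
    shifted[:, 1:] = labels[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    # -100 marks ignored label positions; they must not be fed back in as input ids.
    return paddle.where(shifted == -100, paddle.full_like(shifted, pad_token_id), shifted)

labels = paddle.to_tensor([[14, 27, 3, -100], [9, 5, -100, -100]])
print(toy_shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2).tolist())
# [[2, 14, 27, 3], [2, 9, 5, 1]]
```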
+ + Returns: + + Example: + + ```python + >>> from paddlenlp.transformers import SpeechT5Processor, SpeechT5ForSpeechToText + >>> from datasets import load_dataset + + >>> dataset = load_dataset( + ... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation" + ... ) # doctest: +IGNORE_RESULT + >>> dataset = dataset.sort("id") + >>> sampling_rate = dataset.features["audio"].sampling_rate + + >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr") + >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr") + + >>> # audio file is decoded on the fly + >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pd") + >>> predicted_ids = model.generate(**inputs, max_length=100) + + >>> # transcribe speech + >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) + >>> transcription[0] + 'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel' + ``` + + ```python + >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pd").input_ids + + >>> # compute loss + >>> loss = model(**inputs).loss + >>> round(loss.item(), 2) + 19.68 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + outputs = self.speecht5( + input_values=input_values, + attention_mask=attention_mask, + decoder_input_values=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + logits = self.text_decoder_postnet(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape([-1, self.config.vocab_size]), labels.reshape([-1])) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqLMOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past is used + if past_key_values is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + 
reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +def _generate_speech( + model: SpeechT5PretrainedModel, + input_values: paddle.Tensor, + speaker_embeddings: Optional[paddle.Tensor] = None, + threshold: float = 0.5, + minlenratio: float = 0.0, + maxlenratio: float = 20.0, + vocoder: Optional[nn.Layer] = None, + output_cross_attentions: bool = False, +) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: + encoder_attention_mask = paddle.ones_like(input_values) + + encoder_out = model.speecht5.encoder( + input_values=input_values, + attention_mask=encoder_attention_mask, + return_dict=True, + ) + + encoder_last_hidden_state = encoder_out.last_hidden_state + + # downsample encoder attention mask + if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet): + encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask( + encoder_out[0].shape[1], encoder_attention_mask + ) + + maxlen = int(encoder_last_hidden_state.shape[1] * maxlenratio / model.config.reduction_factor) + minlen = int(encoder_last_hidden_state.shape[1] * minlenratio / model.config.reduction_factor) + + # Start the output sequence with a mel spectrum that is all zeros. + output_sequence = paddle.zeros([1, 1, model.config.num_mel_bins], dtype=encoder_last_hidden_state.dtype) + + spectrogram = [] + cross_attentions = [] + past_key_values = None + idx = 0 + + while True: + idx += 1 + + # Run the decoder prenet on the entire output sequence. + decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings) + + # Run the decoder layers on the last element of the prenet output. + decoder_out = model.speecht5.decoder.wrapped_decoder( + hidden_states=decoder_hidden_states[:, -1:], + attention_mask=None, + encoder_hidden_states=encoder_last_hidden_state, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=True, + output_attentions=output_cross_attentions, + return_dict=True, + ) + + if output_cross_attentions: + cross_attentions.append(paddle.concat(decoder_out.cross_attentions, axis=0)) + + last_decoder_output = decoder_out.last_hidden_state[0, -1] + past_key_values = decoder_out.past_key_values + + # Predict the new mel spectrum for this step in the sequence. + spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output) + spectrum = spectrum.reshape([model.config.reduction_factor, model.config.num_mel_bins]) + spectrogram.append(spectrum) + + # Extend the output sequence with the new mel spectrum. + output_sequence = paddle.concat( + (output_sequence, spectrum[-1].reshape([1, 1, model.config.num_mel_bins])), axis=1 + ) + + # Predict the probability that this is the stop token. + prob = F.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output)) + + # Finished when stop token or maximum length is reached. 
+ if idx >= minlen and (int(sum(prob.numpy() >= threshold)) > 0 or idx >= maxlen): + spectrogram = paddle.concat(spectrogram, axis=0).unsqueeze(0) + spectrogram = model.speech_decoder_postnet.postnet(spectrogram) + spectrogram = spectrogram.squeeze(0) + break + + if vocoder is not None: + outputs = vocoder(spectrogram) + else: + outputs = spectrogram + + if output_cross_attentions: + cross_attentions = paddle.concat(cross_attentions, axis=2) + outputs = (outputs, cross_attentions) + + return outputs + + +class SpeechT5ForTextToSpeech(SpeechT5PretrainedModel): + _keys_to_ignore_on_load_missing = [] + _keys_to_ignore_on_save = [] + + main_input_name = "input_ids" + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + + if config.vocab_size is None: + raise ValueError( + f"You are trying to instantiate {self.__class__} with a configuration that does not define the" + " vocabulary size of the language model head. Please instantiate the model as follows:" + " `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of" + " your model's configuration." + ) + + text_encoder = SpeechT5EncoderWithTextPrenet(config) + speech_decoder = SpeechT5DecoderWithSpeechPrenet(config) + self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder) + + self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config) + + # Initialize weights and apply final processing + self.init_weights() + + def get_encoder(self): + return self.speecht5.get_encoder() + + def get_decoder(self): + return self.speecht5.get_decoder() + + def forward( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_values: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + decoder_head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + speaker_embeddings: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + stop_labels: Optional[paddle.Tensor] = None, + ) -> Union[Tuple, Seq2SeqSpectrogramOutput]: + r""" + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. The `batch_size` should be 1 currently. + + Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and + [`~PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + decoder_input_values (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`): + Float values of input mel spectrogram. + + SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If + `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see + `past_key_values`). + speaker_embeddings (`paddle.Tensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*): + Tensor containing the speaker embeddings. + labels (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*): + Float values of target mel spectrogram. 
Timesteps set to `-100.0` are ignored (masked) for the loss + computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`] + for details. + + Returns: + + Example: + + ```python + >>> from paddlenlp.transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed + >>> import paddle + + >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts") + >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts") + >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") + + >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pd") + >>> speaker_embeddings = paddle.zeros((1, 512)) # or load xvectors from a file + + >>> set_seed(555) # make deterministic + + >>> # generate speech + >>> speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder) + >>> speech.shape + [15872] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if stop_labels is not None: + warnings.warn( + "The argument `stop_labels` is deprecated and will be removed in version 4.30.0 of Transformers", + FutureWarning, + ) + + if labels is not None: + if decoder_input_values is None: + decoder_input_values = shift_spectrograms_right(labels, self.config.reduction_factor) + if self.config.use_guided_attention_loss: + output_attentions = True + + outputs = self.speecht5( + input_values=input_ids, + attention_mask=attention_mask, + decoder_input_values=decoder_input_values, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + use_cache=use_cache, + speaker_embeddings=speaker_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + outputs_before_postnet, outputs_after_postnet, logits = self.speech_decoder_postnet(outputs[0]) + + loss = None + if labels is not None: + criterion = SpeechT5SpectrogramLoss(self.config) + loss = criterion( + attention_mask, + outputs_before_postnet, + outputs_after_postnet, + logits, + labels, + outputs.cross_attentions, + ) + + if not return_dict: + output = (outputs_after_postnet,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSpectrogramOutput( + loss=loss, + spectrogram=outputs_after_postnet, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + @paddle.no_grad() + def generate_speech( + self, + input_ids: paddle.Tensor, + speaker_embeddings: Optional[paddle.Tensor] = None, + threshold: float = 0.5, + minlenratio: float = 0.0, + maxlenratio: float = 20.0, + vocoder: Optional[nn.Layer] = None, + output_cross_attentions: bool = False, + ) -> Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: + r""" + Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a + speech waveform using a vocoder. + + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. 
The `batch_size` should be 1 currently. + + Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and + [`~PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + speaker_embeddings (`paddle.Tensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*): + Tensor containing the speaker embeddings. + threshold (`float`, *optional*, defaults to 0.5): + The generated sequence ends when the predicted stop token probability exceeds this value. + minlenratio (`float`, *optional*, defaults to 0.0): + Used to calculate the minimum required length for the output sequence. + maxlenratio (`float`, *optional*, defaults to 20.0): + Used to calculate the maximum allowed length for the output sequence. + vocoder (`nn.Layer`, *optional*, defaults to `None`): + The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel + spectrogram. + output_cross_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of the decoder's cross-attention layers. + + Returns: + `tuple(paddle.Tensor)` comprising various elements depending on the inputs: + - **spectrogram** (*optional*, returned when no `vocoder` is provided) `paddle.Tensor` of shape + `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram. + - **waveform** (*optional*, returned when a `vocoder` is provided) `paddle.Tensor` of shape + `(num_frames,)` -- The predicted speech waveform. + - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`) `paddle.Tensor` + of shape `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, + input_sequence_length)` -- The outputs of the decoder's cross-attention layers. + """ + return _generate_speech( + self, + input_ids, + speaker_embeddings, + threshold, + minlenratio, + maxlenratio, + vocoder, + output_cross_attentions, + ) + + +class SpeechT5ForSpeechToSpeech(SpeechT5PretrainedModel): + _keys_to_ignore_on_load_missing = [ + r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", + ] + _keys_to_ignore_on_save = [ + r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", + ] + + def __init__(self, config: SpeechT5Config): + super().__init__(config) + + speech_encoder = SpeechT5EncoderWithSpeechPrenet(config) + speech_decoder = SpeechT5DecoderWithSpeechPrenet(config) + self.speecht5 = SpeechT5Model(config, speech_encoder, speech_decoder) + + self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config) + + # Initialize weights and apply final processing + self.init_weights() + + def get_encoder(self): + return self.speecht5.get_encoder() + + def get_decoder(self): + return self.speecht5.get_decoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
+ """ + self.get_encoder().prenet.freeze_feature_encoder() + + def forward( + self, + input_values: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + decoder_input_values: Optional[paddle.Tensor] = None, + decoder_attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + decoder_head_mask: Optional[paddle.Tensor] = None, + cross_attn_head_mask: Optional[paddle.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + speaker_embeddings: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + stop_labels: Optional[paddle.Tensor] = None, + ) -> Union[Tuple, Seq2SeqSpectrogramOutput]: + r""" + input_values (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file + into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install + soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding + and conversion into a tensor of type `paddle.Tensor`. See [`SpeechT5Processor.__call__`] for details. + decoder_input_values (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`): + Float values of input mel spectrogram. + + SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If + `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see + `past_key_values`). + speaker_embeddings (`paddle.Tensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*): + Tensor containing the speaker embeddings. + labels (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*): + Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See + [`SpeechT5Processor.__call__`] for details. + + Returns: + + Example: + + ```python + >>> from paddlenlp.transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed + >>> from datasets import load_dataset + >>> import paddle + + >>> dataset = load_dataset( + ... "hf-internal-testing/librispeech_asr_demo", "clean", split="validation" + ... 
) # doctest: +IGNORE_RESULT + >>> dataset = dataset.sort("id") + >>> sampling_rate = dataset.features["audio"].sampling_rate + + >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc") + >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc") + >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") + + >>> # audio file is decoded on the fly + >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pd") + + >>> speaker_embeddings = paddle.zeros((1, 512)) # or load xvectors from a file + + >>> set_seed(555) # make deterministic + + >>> # generate speech + >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder) + >>> speech.shape + [77824] + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if stop_labels is not None: + warnings.warn( + "The argument `stop_labels` is deprecated and will be removed in version 4.30.0 of Transformers", + FutureWarning, + ) + + if labels is not None: + if decoder_input_values is None: + decoder_input_values = shift_spectrograms_right(labels, self.config.reduction_factor) + + outputs = self.speecht5( + input_values=input_values, + attention_mask=attention_mask, + decoder_input_values=decoder_input_values, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + use_cache=use_cache, + speaker_embeddings=speaker_embeddings, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + _, spectrogram, logits = self.speech_decoder_postnet(outputs[0]) + + loss = None + + if not return_dict: + output = (spectrogram,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSpectrogramOutput( + loss=loss, + spectrogram=spectrogram, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + @paddle.no_grad() + def generate_speech( + self, + input_values: paddle.Tensor, + speaker_embeddings: Optional[paddle.Tensor] = None, + threshold: float = 0.5, + minlenratio: float = 0.0, + maxlenratio: float = 20.0, + vocoder: Optional[nn.Layer] = None, + output_cross_attentions: bool = False, + ) -> paddle.Tensor: + r""" + Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a + speech waveform using a vocoder. + + Args: + input_values (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Float values of input raw speech waveform. The `batch_size` should be 1 currently. + + Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `List[float]` or + a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array + into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor + of type `paddle.Tensor`. See [`SpeechT5Processor.__call__`] for details. 
+ speaker_embeddings (`paddle.Tensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*): + Tensor containing the speaker embeddings. + threshold (`float`, *optional*, defaults to 0.5): + The generated sequence ends when the predicted stop token probability exceeds this value. + minlenratio (`float`, *optional*, defaults to 0.0): + Used to calculate the minimum required length for the output sequence. + maxlenratio (`float`, *optional*, defaults to 20.0): + Used to calculate the maximum allowed length for the output sequence. + vocoder (`nn.Layer`, *optional*, defaults to `None`): + The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel + spectrogram. + output_cross_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of the decoder's cross-attention layers. + + Returns: + `tuple(paddle.Tensor)` comprising various elements depending on the inputs: + - **spectrogram** (*optional*, returned when no `vocoder` is provided) `paddle.Tensor` of shape + `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram. + - **waveform** (*optional*, returned when a `vocoder` is provided) `paddle.Tensor` of shape + `(num_frames,)` -- The predicted speech waveform. + - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`) `paddle.Tensor` + of shape `(config.decoder_layers, config.decoder_attention_heads, output_sequence_length, + input_sequence_length)` -- The outputs of the decoder's cross-attention layers. + """ + if speaker_embeddings is None: + speaker_embeddings = paddle.zeros((1, 512)) + + return _generate_speech( + self, + input_values, + speaker_embeddings, + threshold, + minlenratio, + maxlenratio, + vocoder, + output_cross_attentions, + ) + + +class HifiGanResidualBlock(nn.Layer): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1): + super().__init__() + self.leaky_relu_slope = leaky_relu_slope + + self.convs1 = nn.LayerList( + [ + nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + dilation=dilation[i], + padding=self.get_padding(kernel_size, dilation[i]), + ) + for i in range(len(dilation)) + ] + ) + self.convs2 = nn.LayerList( + [ + nn.Conv1D( + channels, + channels, + kernel_size, + stride=1, + dilation=1, + padding=self.get_padding(kernel_size, 1), + ) + for _ in range(len(dilation)) + ] + ) + + def get_padding(self, kernel_size, dilation=1): + return (kernel_size * dilation - dilation) // 2 + + def apply_weight_norm(self): + for layer in self.convs1: + nn.utils.weight_norm(layer) + for layer in self.convs2: + nn.utils.weight_norm(layer) + + def remove_weight_norm(self): + for layer in self.convs1: + nn.utils.remove_weight_norm(layer) + for layer in self.convs2: + nn.utils.remove_weight_norm(layer) + + def forward(self, hidden_states): + for conv1, conv2 in zip(self.convs1, self.convs2): + residual = hidden_states + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv1(hidden_states) + hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope) + hidden_states = conv2(hidden_states) + hidden_states = hidden_states + residual + return hidden_states + + +class SpeechT5HifiGan(PretrainedModel): + config_class = SpeechT5HifiGanConfig + main_input_name = "spectrogram" + + def __init__(self, config: SpeechT5HifiGanConfig): + super().__init__(config) + self.num_kernels = len(config.resblock_kernel_sizes) + 
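+        # `num_kernels` counts the residual blocks run per upsampling stage; `forward` averages
+        # their outputs (res_state / self.num_kernels) before moving to the next stage.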
self.num_upsamples = len(config.upsample_rates) + self.conv_pre = nn.Conv1D( + config.model_in_dim, + config.upsample_initial_channel, + kernel_size=7, + stride=1, + padding=3, + ) + + self.upsampler = nn.LayerList() + for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)): + self.upsampler.append( + nn.Conv1DTranspose( + config.upsample_initial_channel // (2**i), + config.upsample_initial_channel // (2 ** (i + 1)), + kernel_size=kernel_size, + stride=upsample_rate, + padding=(kernel_size - upsample_rate) // 2, + ) + ) + + self.resblocks = nn.LayerList() + for i in range(len(self.upsampler)): + channels = config.upsample_initial_channel // (2 ** (i + 1)) + for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes): + self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope)) + + self.conv_post = nn.Conv1D(channels, 1, kernel_size=7, stride=1, padding=3) + + self.register_buffer("mean", paddle.zeros([config.model_in_dim])) + self.register_buffer("scale", paddle.ones([config.model_in_dim])) + + # Initialize weights and apply final processing + self.init_weights() + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, nn.Conv1D)): + # module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + # module.bias.data.zero_() + zeros_(module.bias) + + def apply_weight_norm(self): + nn.utils.weight_norm(self.conv_pre) + for layer in self.upsampler: + nn.utils.weight_norm(layer) + for layer in self.resblocks: + layer.apply_weight_norm() + nn.utils.weight_norm(self.conv_post) + + def remove_weight_norm(self): + nn.utils.remove_weight_norm(self.conv_pre) + for layer in self.upsampler: + nn.utils.remove_weight_norm(layer) + for layer in self.resblocks: + layer.remove_weight_norm() + nn.utils.remove_weight_norm(self.conv_post) + + def forward(self, spectrogram: paddle.Tensor) -> paddle.Tensor: + r""" + Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch + of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech + waveform. + + Args: + spectrogram (`paddle.Tensor`): + Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length, + config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`. + + Returns: + `paddle.Tensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of + shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`. 
+ """ + if self.config.normalize_before: + spectrogram = (spectrogram - self.mean) / self.scale + + is_batched = spectrogram.dim() == 3 + if not is_batched: + spectrogram = spectrogram.unsqueeze(0) + hidden_states = spectrogram.transpose([0, 2, 1]) + + hidden_states = self.conv_pre(hidden_states) + for i in range(self.num_upsamples): + hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope) + hidden_states = self.upsampler[i](hidden_states) + + res_state = self.resblocks[i * self.num_kernels](hidden_states) + for j in range(1, self.num_kernels): + res_state += self.resblocks[i * self.num_kernels + j](hidden_states) + hidden_states = res_state / self.num_kernels + + hidden_states = nn.functional.leaky_relu(hidden_states) + hidden_states = self.conv_post(hidden_states) + hidden_states = paddle.tanh(hidden_states) + + if not is_batched: + # remove batch dim and collapse tensor to 1-d audio waveform + waveform = hidden_states.squeeze(0).transpose([1, 0]).reshape([-1]) + else: + # remove seq-len dim since this collapses to 1 + waveform = hidden_states.squeeze(1) + + return waveform diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/processing.py new file mode 100644 index 000000000..4c5430185 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/processing.py @@ -0,0 +1,192 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Speech processor class for SpeechT5.""" + +from ..processing_utils import ProcessorMixin + +__all__ = [ + "SpeechT5Processor", +] + + +class SpeechT5Processor(ProcessorMixin): + r""" + Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor. + + [`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See + the docstring of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information. + + Args: + feature_extractor (`SpeechT5FeatureExtractor`): + An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input. + tokenizer (`SpeechT5Tokenizer`): + An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input. + """ + feature_extractor_class = "SpeechT5FeatureExtractor" + tokenizer_class = "SpeechT5Tokenizer" + + pretrained_init_configuration = { + "microsoft/speecht5_hifigan": {"do_lower_case": True}, + "microsoft/speecht5_vc": {"do_lower_case": True}, + } + + def __init__(self, feature_extractor, tokenizer): + super().__init__(feature_extractor, tokenizer) + + def __call__(self, *args, **kwargs): + """ + Processes audio and text input, as well as audio and text targets. 
+ + You can process audio by using the argument `audio`, or process audio targets by using the argument + `audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's + [`~SpeechT5FeatureExtractor.__call__`]. + + You can process text by using the argument `text`, or process text labels by using the argument `text_target`. + This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`]. + + Valid input combinations are: + + - `text` only + - `audio` only + - `text_target` only + - `audio_target` only + - `text` and `audio_target` + - `audio` and `audio_target` + - `text` and `text_target` + - `audio` and `text_target` + + Please refer to the docstring of the above two methods for more information. + """ + audio = kwargs.pop("audio", None) + text = kwargs.pop("text", None) + text_target = kwargs.pop("text_target", None) + audio_target = kwargs.pop("audio_target", None) + sampling_rate = kwargs.pop("sampling_rate", None) + + if audio is not None and text is not None: + raise ValueError( + "Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?" + ) + if audio_target is not None and text_target is not None: + raise ValueError( + "Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?" + ) + if audio is None and audio_target is None and text is None and text_target is None: + raise ValueError( + "You need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process." + ) + + if audio is not None: + inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs) + elif text is not None: + inputs = self.tokenizer(text, **kwargs) + else: + inputs = None + + if audio_target is not None: + targets = self.feature_extractor(audio_target=audio_target, *args, sampling_rate=sampling_rate, **kwargs) + labels = targets["input_values"] + elif text_target is not None: + targets = self.tokenizer(text_target, **kwargs) + labels = targets["input_ids"] + else: + targets = None + + if inputs is None: + return targets + + if targets is not None: + inputs["labels"] = labels + + decoder_attention_mask = targets.get("attention_mask") + if decoder_attention_mask is not None: + inputs["decoder_attention_mask"] = decoder_attention_mask + + return inputs + + def pad(self, *args, **kwargs): + """ + Collates the audio and text inputs, as well as their targets, into a padded batch. + + Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded + by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`]. + + Valid input combinations are: + + - `input_ids` only + - `input_values` only + - `labels` only, either log-mel spectrograms or text tokens + - `input_ids` and log-mel spectrogram `labels` + - `input_values` and text `labels` + + Please refer to the docstring of the above two methods for more information. + """ + input_values = kwargs.pop("input_values", None) + input_ids = kwargs.pop("input_ids", None) + labels = kwargs.pop("labels", None) + + if input_values is not None and input_ids is not None: + raise ValueError("Cannot process both `input_values` and `input_ids` inputs.") + if input_values is None and input_ids is None and labels is None: + raise ValueError( + "You need to specify either an `input_values`, `input_ids`, or `labels` input to be padded." 
+ ) + + if input_values is not None: + inputs = self.feature_extractor.pad(input_values, *args, **kwargs) + elif input_ids is not None: + inputs = self.tokenizer.pad(input_ids, **kwargs) + else: + inputs = None + + if labels is not None: + if "input_ids" in labels or (isinstance(labels, list) and "input_ids" in labels[0]): + targets = self.tokenizer.pad(labels, **kwargs) + labels = targets["input_ids"] + else: + feature_size_hack = self.feature_extractor.feature_size + self.feature_extractor.feature_size = self.feature_extractor.num_mel_bins + targets = self.feature_extractor.pad(labels, *args, **kwargs) + self.feature_extractor.feature_size = feature_size_hack + labels = targets["input_values"] + else: + targets = None + + if inputs is None: + return targets + + if targets is not None: + inputs["labels"] = labels + + decoder_attention_mask = targets.get("attention_mask") + if decoder_attention_mask is not None: + inputs["decoder_attention_mask"] = decoder_attention_mask + + return inputs + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/tokenizer.py new file mode 100644 index 000000000..fa0e0d959 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/speecht5/tokenizer.py @@ -0,0 +1,217 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Fairseq Authors, Microsoft Research, and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
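The `SpeechT5Processor.__call__` and `pad` methods above cover the usual text-to-speech preprocessing path: the input text is tokenized while a target waveform becomes log-mel `labels` (plus `decoder_attention_mask` when the feature extractor returns one). A minimal sketch of that flow; the `microsoft/speecht5_tts` checkpoint and the random 16 kHz placeholder waveform are illustrative assumptions, not part of this patch:

```python
import numpy as np

from paddlenlp.transformers import SpeechT5Processor

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Placeholder target: one second of random samples standing in for a real 16 kHz recording.
target_waveform = np.random.randn(16000).astype("float32")

# `text` is routed to the tokenizer, `audio_target` to the feature extractor (see `__call__` above).
inputs = processor(
    text="Hello, my dog is cute",
    audio_target=target_waveform,
    sampling_rate=16000,
    return_tensors="pd",
)

print(inputs["input_ids"].shape)  # token ids for the text prompt
print(inputs["labels"].shape)  # log-mel spectrogram target
print("decoder_attention_mask" in inputs)  # only present when the target carries a mask
```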
+ +"""Tokenization class for SpeechT5.""" + + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from paddlenlp.transformers import PretrainedTokenizer + +from ...utils.log import logger + +VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "microsoft/speecht5_asr": "https://huggingface.co/microsoft/speecht5_asr/resolve/main/spm_char.model", + "microsoft/speecht5_tts": "https://huggingface.co/microsoft/speecht5_tts/resolve/main/spm_char.model", + "microsoft/speecht5_vc": "https://huggingface.co/microsoft/speecht5_vc/resolve/main/spm_char.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "microsoft/speecht5_asr": 1024, + "microsoft/speecht5_tts": 1024, + "microsoft/speecht5_vc": 1024, +} + + +__all__ = ["SpeechT5Tokenizer"] + + +# Define type aliases and NamedTuples +TextInput = str +PreTokenizedInput = List[str] +EncodedInput = List[int] +TextInputPair = Tuple[str, str] +PreTokenizedInputPair = Tuple[List[str], List[str]] +EncodedInputPair = Tuple[List[int], List[int]] + + +class SpeechT5Tokenizer(PretrainedTokenizer): + """ + Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + bos_token (`str`, *optional*, defaults to `""`): + The begin of sequence token. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). 
+ """ + + resource_files_names = VOCAB_FILES_NAMES + pretrained_resource_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + unk_token="", + pad_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._in_target_context_manager = False + + @property + def vocab_size(self): + return self.sp_model.get_piece_size() + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + out_string += self.sp_model.decode(current_sub_tokens) + token + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]: + """Build model inputs from a sequence by appending eos_token_id.""" + if token_ids_1 is None: + return token_ids_0 + [self.eos_token_id] + # We don't expect to process pairs, but leave the pair logic for API consistency + return token_ids_0 + token_ids_1 + [self.eos_token_id] + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + suffix_ones = [1] + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + suffix_ones + return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return 
+ out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/configuration.py new file mode 100644 index 000000000..64d00b9a6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/configuration.py @@ -0,0 +1,233 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
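Since `SpeechT5Tokenizer` above only appends an end-of-sequence id in `build_inputs_with_special_tokens` and marks just that trailing position in `get_special_tokens_mask`, its behaviour can be checked in a few lines. A minimal sketch, assuming the `microsoft/speecht5_tts` SentencePiece vocabulary listed above can be resolved:

```python
from paddlenlp.transformers import SpeechT5Tokenizer

tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")

tokens = tokenizer.tokenize("Hello world")  # character-level SentencePiece pieces
ids = tokenizer.convert_tokens_to_ids(tokens)

# Only an eos id is appended; there is no bos token and pairs are simply concatenated.
assert tokenizer.build_inputs_with_special_tokens(ids) == ids + [tokenizer.eos_token_id]

# The special-tokens mask flags just that trailing eos position.
assert tokenizer.get_special_tokens_mask(ids) == [0] * len(ids) + [1]
```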
+""" SqueezeBERT model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "SQUEEZEBERT_PRETRAINED_INIT_CONFIGURATION", + "SqueezeBertConfig", + "SQUEEZEBERT_PRETRAINED_RESOURCE_FILES_MAP", +] + +SQUEEZEBERT_PRETRAINED_INIT_CONFIGURATION = { + "squeezebert-uncased": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "model_type": "squeezebert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30528, + "q_groups": 4, + "k_groups": 4, + "v_groups": 4, + "post_attention_groups": 1, + "intermediate_groups": 4, + "output_groups": 4, + "pad_token_id": 0, + "layer_norm_eps": 1e-12, + }, + "squeezebert-mnli": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "model_type": "squeezebert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30528, + "q_groups": 4, + "k_groups": 4, + "v_groups": 4, + "post_attention_groups": 1, + "intermediate_groups": 4, + "output_groups": 4, + "num_labels": 3, + "pad_token_id": 0, + "layer_norm_eps": 1e-12, + }, + "squeezebert-mnli-headless": { + "attention_probs_dropout_prob": 0.1, + "embedding_size": 768, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "max_position_embeddings": 512, + "model_type": "squeezebert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "type_vocab_size": 2, + "vocab_size": 30528, + "q_groups": 4, + "k_groups": 4, + "v_groups": 4, + "post_attention_groups": 1, + "intermediate_groups": 4, + "output_groups": 4, + "pad_token_id": 0, + "layer_norm_eps": 1e-12, + }, +} + +SQUEEZEBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "squeezebert-uncased": "http://bj.bcebos.com/paddlenlp/models/transformers/squeezebert/squeezebert-uncased/model_state.pdparams", + "squeezebert-mnli": "http://bj.bcebos.com/paddlenlp/models/transformers/squeezebert/squeezebert-mnli/model_state.pdparams", + "squeezebert-mnli-headless": "http://bj.bcebos.com/paddlenlp/models/transformers/squeezebert/squeezebert-mnli-headless/model_state.pdparams", + } +} + + +class SqueezeBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`SqueezeBertModel`]. It is used to instantiate a + SqueezeBERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the SqueezeBERT. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the SqueezeBERT model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`SqueezeBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. 
+ num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + + pad_token_id (`int`, *optional*, defaults to 0): + The ID of the token in the word embedding to use as padding. + embedding_size (`int`, *optional*, defaults to 768): + The dimension of the word embedding vectors. + + q_groups (`int`, *optional*, defaults to 4): + The number of groups in Q layer. + k_groups (`int`, *optional*, defaults to 4): + The number of groups in K layer. + v_groups (`int`, *optional*, defaults to 4): + The number of groups in V layer. + post_attention_groups (`int`, *optional*, defaults to 1): + The number of groups in the first feed forward network layer. + intermediate_groups (`int`, *optional*, defaults to 4): + The number of groups in the second feed forward network layer. + output_groups (`int`, *optional*, defaults to 4): + The number of groups in the third feed forward network layer. + + Examples: + + ```python + >>> from transformers import SqueezeBertConfig, SqueezeBertModel + + >>> # Initializing a SqueezeBERT configuration + >>> configuration = SqueezeBertConfig() + + >>> # Initializing a model (with random weights) from the configuration above + >>> model = SqueezeBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + + Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained + checkpoints. 
+ """ + pretrained_init_configuration = SQUEEZEBERT_PRETRAINED_INIT_CONFIGURATION + model_type = "squeezebert" + attribute_map: Dict[str, str] = { + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + embedding_size=768, + q_groups=4, + k_groups=4, + v_groups=4, + post_attention_groups=1, + intermediate_groups=4, + output_groups=4, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.embedding_size = embedding_size + self.q_groups = q_groups + self.k_groups = k_groups + self.v_groups = v_groups + self.post_attention_groups = post_attention_groups + self.intermediate_groups = intermediate_groups + self.output_groups = output_groups diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/modeling.py new file mode 100644 index 000000000..60b88144a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/modeling.py @@ -0,0 +1,623 @@ +# coding=utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +from paddle import nn + +from .. 
import PretrainedModel, register_base_model +from ..activations import ACT2FN +from .configuration import ( + SQUEEZEBERT_PRETRAINED_INIT_CONFIGURATION, + SQUEEZEBERT_PRETRAINED_RESOURCE_FILES_MAP, + SqueezeBertConfig, +) + +__all__ = [ + "SqueezeBertModel", + "SqueezeBertPreTrainedModel", + "SqueezeBertForSequenceClassification", + "SqueezeBertForTokenClassification", + "SqueezeBertForQuestionAnswering", +] + + +def _convert_attention_mask(attention_mask, inputs): + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask.unsqueeze(1) + elif attention_mask.dim() == 2: + # extended_attention_mask = attention_mask[:, None, None, :] + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1) + extended_attention_mask = paddle.cast(extended_attention_mask, inputs.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + +class SqueezeBertEmbeddings(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=None) + + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.register_buffer( + "position_ids", paddle.arange(config.max_position_embeddings, dtype="int64").expand((1, -1)) + ) + + def forward(self, input_ids=None, token_type_ids=None, position_ids=None): + input_shape = input_ids.shape + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = paddle.zeros( + input_shape, + dtype=paddle.int64, + ) + + inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class MatMulWrapper(nn.Layer): + def __init__(self): + super().__init__() + + def forward(self, mat1, mat2): + """ + :param inputs: two paddle tensors :return: matmul of these tensors + Here are the typical dimensions found in BERT (the B is optional) mat1.shape: [B, , M, K] + mat2.shape: [B, , K, N] output shape: [B, , M, N] + """ + return paddle.matmul(mat1, mat2) + + +class SqueezeBertLayerNorm(nn.LayerNorm): + def __init__(self, hidden_size, epsilon=1e-12): + nn.LayerNorm.__init__( + self, normalized_shape=hidden_size, epsilon=epsilon + ) # instantiates self.{weight, bias, eps} + + def forward(self, x): + x = x.transpose((0, 2, 1)) + x = nn.LayerNorm.forward(self, x) + return x.transpose((0, 2, 1)) + + +class ConvDropoutLayerNorm(nn.Layer): + def __init__(self, cin, cout, groups, dropout_prob): + super().__init__() + + self.conv1d = nn.Conv1D(in_channels=cin, out_channels=cout, kernel_size=1, groups=groups) + self.layernorm = SqueezeBertLayerNorm(cout) + self.dropout = nn.Dropout(dropout_prob) + + def forward(self, hidden_states, input_tensor): + x = self.conv1d(hidden_states) + x = self.dropout(x) + x = x + input_tensor + x = self.layernorm(x) + return x + + +class ConvActivation(nn.Layer): + def __init__(self, cin, cout, groups, act): + 
super().__init__() + self.conv1d = nn.Conv1D(in_channels=cin, out_channels=cout, kernel_size=1, groups=groups) + self.act = ACT2FN[act] + + def forward(self, x): + output = self.conv1d(x) + return self.act(output) + + +class SqueezeBertSelfAttention(nn.Layer): + def __init__(self, config: SqueezeBertConfig, cin, q_groups=1, k_groups=1, v_groups=1): + super().__init__() + if cin % config.num_attention_heads != 0: + raise ValueError( + f"cin ({cin}) is not a multiple of the number of attention heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(cin / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Conv1D(in_channels=cin, out_channels=cin, kernel_size=1, groups=q_groups) + self.key = nn.Conv1D(in_channels=cin, out_channels=cin, kernel_size=1, groups=k_groups) + self.value = nn.Conv1D(in_channels=cin, out_channels=cin, kernel_size=1, groups=v_groups) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.softmax = nn.Softmax(axis=-1) + + self.matmul_qk = MatMulWrapper() + self.matmul_qkv = MatMulWrapper() + + def transpose_for_scores(self, x): + """ + - input: [N, C, W] + - output: [N, C1, W, C2] where C1 is the head index, and C2 is one head's contents + """ + new_x_shape = (x.shape[0], self.num_attention_heads, self.attention_head_size, x.shape[-1]) # [N, C1, C2, W] + x = x.reshape(new_x_shape) + return x.transpose((0, 1, 3, 2)) # [N, C1, C2, W] --> [N, C1, W, C2] + + def transpose_key_for_scores(self, x): + """ + - input: [N, C, W] + - output: [N, C1, C2, W] where C1 is the head index, and C2 is one head's contents + """ + new_x_shape = (x.shape[0], self.num_attention_heads, self.attention_head_size, x.shape[-1]) # [N, C1, C2, W] + x = x.reshape(new_x_shape) + return x + + def transpose_output(self, x): + """ + - input: [N, C1, W, C2] + - output: [N, C, W] + """ + x = x.transpose((0, 1, 3, 2)) # [N, C1, C2, W] + new_x_shape = (x.shape[0], self.all_head_size, x.shape[3]) # [N, C, W] + x = x.reshape(new_x_shape) + return x + + def forward(self, hidden_states, attention_mask, output_attentions): + """ + expects hidden_states in [N, C, W] data layout. + The attention_mask data layout is [N, W], and it does not need to be transposed. + """ + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_key_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_score = self.matmul_qk(query_layer, key_layer) + attention_score = attention_score / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_score = attention_score + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = self.softmax(attention_score) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + context_layer = self.matmul_qkv(attention_probs, value_layer) + context_layer = self.transpose_output(context_layer) + + result = {"context_layer": context_layer} + if output_attentions: + result["attention_score"] = attention_score + return result + + +class SqueezeBertLayer(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + """ + - hidden_size = input chans = output chans for Q, K, V (they are all the same ... for now) = output chans for + the module + - intermediate_size = output chans for intermediate layer + - groups = number of groups for all layers in the BertLayer. (eventually we could change the interface to + allow different groups for different layers) + """ + super().__init__() + + c0 = config.hidden_size + c1 = config.hidden_size + c2 = config.intermediate_size + c3 = config.hidden_size + + self.attention = SqueezeBertSelfAttention( + config, + cin=c0, + q_groups=config.q_groups, + k_groups=config.k_groups, + v_groups=config.v_groups, + ) + self.post_attention = ConvDropoutLayerNorm( + cin=c0, cout=c1, groups=config.post_attention_groups, dropout_prob=config.hidden_dropout_prob + ) + self.intermediate = ConvActivation(cin=c1, cout=c2, groups=config.intermediate_groups, act=config.hidden_act) + self.output = ConvDropoutLayerNorm( + cin=c2, cout=c3, groups=config.output_groups, dropout_prob=config.hidden_dropout_prob + ) + + def forward(self, hidden_states, attention_mask, output_attentions): + att = self.attention(hidden_states, attention_mask, output_attentions) + attention_output = att["context_layer"] + + post_attention_output = self.post_attention(attention_output, hidden_states) + intermediate_output = self.intermediate(post_attention_output) + layer_output = self.output(intermediate_output, post_attention_output) + + output_dict = {"feature_map": layer_output} + if output_attentions: + output_dict["attention_score"] = att["attention_score"] + + return output_dict + + +class SqueezeBertEncoder(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + super().__init__() + assert config.embedding_size == config.hidden_size, ( + "If you want embedding_size != intermediate hidden_size," + "please insert a Conv1D layer to adjust the number of channels " + "before the first SqueezeBertLayer." 
+ ) + self.layers = nn.LayerList(SqueezeBertLayer(config) for _ in range(config.num_hidden_layers)) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False): + + hidden_states = hidden_states.transpose((0, 2, 1)) + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + for layer in self.layers: + if output_hidden_states: + hidden_states = hidden_states.transpose((0, 2, 1)) + all_hidden_states += (hidden_states,) + hidden_states = hidden_states.transpose((0, 2, 1)) + + layer_output = layer.forward(hidden_states, attention_mask, output_attentions) + + hidden_states = layer_output["feature_map"] + + if output_attentions: + all_attentions += (layer_output["attention_score"],) + + # [batch_size, hidden_size, sequence_length] --> [batch_size, sequence_length, hidden_size] + hidden_states = hidden_states.transpose((0, 2, 1)) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + +class SqueezeBertPooler(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class SqueezeBertPredictionHeadTransform(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class SqueezeBertLMPredictionHead(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + super().__init__() + self.transform = SqueezeBertPredictionHeadTransform( + config.hidden_size, config.hidden_act, config.layer_norm_eps + ) + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + self.bias = paddle.create_parameter([config.vocab_size], dtype="float32", is_bias=True) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class SqueezeBertPreTrainingHeads(nn.Layer): + def __init__(self, config: SqueezeBertConfig): + super().__init__() + self.predictions = SqueezeBertLMPredictionHead( + config.hidden_size, config.hidden_act, config.layer_norm_eps, config.vocab_size + ) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class SqueezeBertPreTrainedModel(PretrainedModel): + """ + An abstract class for pretrained SqueezBert models. 
It provides SqueezBert related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. See `PretrainedModel` for more details. + """ + + config_class = SqueezeBertConfig + base_model_prefix = "squeezebert" + + pretrained_init_configuration = SQUEEZEBERT_PRETRAINED_INIT_CONFIGURATION + + pretrained_resource_files_map = SQUEEZEBERT_PRETRAINED_RESOURCE_FILES_MAP + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.squeezebert.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = 1e-12 + + +@register_base_model +class SqueezeBertModel(SqueezeBertPreTrainedModel): + def __init__(self, config: SqueezeBertConfig): + super().__init__(config) + self.initializer_range = config.initializer_range + self.embeddings = SqueezeBertEmbeddings(config) + self.encoder = SqueezeBertEncoder(config) + self.pooler = SqueezeBertPooler(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + output_attentions=None, + output_hidden_states=None, + ): + r""" + The forward method, overrides the `__call__()` special method. + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + If its data type is int, the values should be either 0 or 1. + - **1** for tokens that **not masked**, + - **0** for tokens that **masked**. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + + output_attentions (bool, optional): + Whether to return the attention_weight of each hidden layers. + Defaults to `False`. 
+ output_hidden_states (bool, optional): + Whether to return the output of each hidden layers. + Defaults to `False`. + Returns: + tuple: Returns tuple (`sequence_output`, `pooled_output`) with (`encoder_outputs`, `encoder_attentions`) by + optional. + With the fields: + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + - `encoder_outputs` (List(Tensor)): + A list of Tensor containing hidden-states of the model at each hidden layer in the Transformer encoder. + The length of the list is `num_hidden_layers` + 1 (Embedding Layer output). + Each Tensor has a data type of float32 and its shape is [batch_size, sequence_length, hidden_size]. + """ + input_shape = input_ids.shape + if attention_mask is None: + attention_mask = paddle.ones(input_shape) + if token_type_ids is None: + token_type_ids = paddle.zeros(input_shape, dtype=paddle.int64) + + embedding_output = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids + ) + extended_attention_mask = _convert_attention_mask(attention_mask, embedding_output) + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + return (sequence_output, pooled_output) + encoder_outputs[1:] + + +class SqueezeBertForSequenceClassification(SqueezeBertPreTrainedModel): + """ + SqueezeBert Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. + for GLUE tasks. + Args: + config (:class:`SqueezeBertConfig`): + An instance of SqueezeBertConfig. + """ + + def __init__(self, config: SqueezeBertConfig): + super().__init__(config) + self.num_classes = config.num_labels + self.squeezebert = SqueezeBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The SqueezeBertForSequenceClassification forward method, overrides the __call__() special method. + Args: + input_ids (Tensor): + See :class:`SqueezeBertModel`. + token_type_ids (Tensor, optional): + See :class:`SqueezeBertModel`. + position_ids(Tensor, optional): + See :class:`SqueezeBertModel`. + attention_mask (list, optional): + See :class:`SqueezeBertModel`. + Returns: + Tensor: Returns tensor `logits`, a tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. 
+ """ + + _, pooled_output = self.squeezebert( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + return logits + + +class SqueezeBertForQuestionAnswering(SqueezeBertPreTrainedModel): + """ + SqueezeBert Model with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and + `span end logits`). + Args: + config (:class:`SqueezeBertConfig`): + An instance of SqueezeBertConfig. + """ + + def __init__(self, config: SqueezeBertConfig): + super().__init__(config) + self.squeezebert = SqueezeBertModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids, token_type_ids=None): + r""" + The SqueezeBertForQuestionAnswering forward method, overrides the __call__() special method. + Args: + input_ids (Tensor): + See :class:`SqueezeBertModel`. + token_type_ids (Tensor, optional): + See :class:`SqueezeBertModel`. + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + With the fields: + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + """ + sequence_output, _ = self.squeezebert( + input_ids, token_type_ids=token_type_ids, position_ids=None, attention_mask=None + ) + logits = self.classifier(sequence_output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + return start_logits, end_logits + + +class SqueezeBertForTokenClassification(SqueezeBertPreTrainedModel): + """ + SqueezeBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + Args: + config (:class:`SqueezeBertConfig`): + An instance of SqueezeBertConfig. + """ + + def __init__(self, config: SqueezeBertConfig): + super().__init__(config) + self.num_classes = config.num_labels + self.squeezebert = SqueezeBertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.num_classes) + + def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None): + r""" + The SqueezeBertForTokenClassification forward method, overrides the __call__() special method. + Args: + input_ids (Tensor): + See :class:`SqueezeBertModel`. + token_type_ids (Tensor, optional): + See :class:`SqueezeBertModel`. + position_ids(Tensor, optional): + See :class:`SqueezeBertModel`. + attention_mask (list, optional): + See :class:`SqueezeBertModel`. + Returns: + Tensor: Returns tensor `logits`, a tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. 
+ """ + + sequence_output, _ = self.squeezebert( + input_ids, token_type_ids=token_type_ids, position_ids=position_ids, attention_mask=attention_mask + ) + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + return logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/tokenizer.py new file mode 100644 index 000000000..13e51b3b2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/squeezebert/tokenizer.py @@ -0,0 +1,234 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddlenlp.transformers import ( + BasicTokenizer, + PretrainedTokenizer, + WordpieceTokenizer, +) + +__all__ = [ + "SqueezeBertTokenizer", +] + + +class SqueezeBertTokenizer(PretrainedTokenizer): + """ + Constructs a SqueezeBert tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + Args: + vocab_file (str): file path of the vocabulary + do_lower_case (bool): Whether the text strips accents and convert to + lower case. Default: `True`. + Default: True. + unk_token (str): The special token for unkown words. Default: "[UNK]". + sep_token (str): The special token for separator token . Default: "[SEP]". + pad_token (str): The special token for padding. Default: "[PAD]". + cls_token (str): The special token for cls. Default: "[CLS]". + mask_token (str): The special token for mask. Default: "[MASK]". + + Examples: + .. 
code-block:: python + from paddlenlp.transformers import SqueezeBertTokenizer + tokenizer = SqueezeBertTokenizer.from_pretrained('squeezebert-uncased') + # the following line get: ['he', 'was', 'a', 'puppet', '##eer'] + tokens = tokenizer('He was a puppeteer') + # the following line get: 'he was a puppeteer' + tokenizer.convert_tokens_to_string(tokens) + """ + + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "squeezebert-uncased": "http://bj.bcebos.com/paddlenlp/models/transformers/squeezebert/squeezebert-uncased/vocab.txt", + "squeezebert-mnli": "http://bj.bcebos.com/paddlenlp/models/transformers/squeezebert/squeezebert-mnli/vocab.txt", + "squeezebert-mnli-headless": "http://bj.bcebos.com/paddlenlp/models/transformers/squeezebert/squeezebert-mnli-headless/vocab.txt", + } + } + pretrained_init_configuration = { + "squeezebert-uncased": {"do_lower_case": True}, + "squeezebert-mnli": {"do_lower_case": True}, + "squeezebert-mnli-headless": {"do_lower_case": True}, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = SqueezeBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + return the size of vocabulary. + Returns: + int: the size of vocabulary. + """ + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab.token_to_idx, **self.added_tokens_encoder) + + def _tokenize(self, text): + """ + End-to-end tokenization for SqueezeBert models. + Args: + text (str): The text to be tokenized. + Returns: + list: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) in a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also remove + `##` when converting. + Args: + tokens (list): A list of string representing tokens to be converted. + Returns: + str: Converted string from tokens. + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this + inside your training loop. + Args: + pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. 
+ Returns: + Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + A SqueezeBert sequence has the following format: + :: + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + :obj:`List[int]`: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + A SqueezeBert offset_mapping has the following format: + :: + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: `(0,0) A (0,0) B (0,0)`` + Args: + offset_mapping_ids_0 (:obj:`List[tuple]`): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (:obj:`List[tuple]`, `optional`): + Optional second list of char offsets for offset mapping pairs. + Returns: + :obj:`List[tuple]`: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + A SqueezeBert sequence pair mask has the following format: + :: + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + :obj:`List[int]`: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + Returns: + results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
+ """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/configuration.py new file mode 100644 index 000000000..620b4dce9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/configuration.py @@ -0,0 +1,227 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" TinyBERT model configuration""" +from __future__ import annotations + +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["TINYBERT_PRETRAINED_INIT_CONFIGURATION", "TinyBertConfig", "TINYBERT_PRETRAINED_RESOURCE_FILES_MAP"] + +TINYBERT_PRETRAINED_INIT_CONFIGURATION = { + "tinybert-4l-312d": { + "vocab_size": 30522, + "hidden_size": 312, + "num_hidden_layers": 4, + "num_attention_heads": 12, + "intermediate_size": 1200, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "tinybert-6l-768d": { + "vocab_size": 30522, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "tinybert-4l-312d-v2": { + "vocab_size": 30522, + "hidden_size": 312, + "num_hidden_layers": 4, + "num_attention_heads": 12, + "intermediate_size": 1200, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "tinybert-6l-768d-v2": { + "vocab_size": 30522, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "tinybert-4l-312d-zh": { + "vocab_size": 21128, + "hidden_size": 312, + "num_hidden_layers": 4, + "num_attention_heads": 12, + "intermediate_size": 1200, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, + "tinybert-6l-768d-zh": { + "vocab_size": 21128, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "pad_token_id": 0, + }, +} + +TINYBERT_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "tinybert-4l-312d": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-4l-312d.pdparams", + "tinybert-6l-768d": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-6l-768d.pdparams", + "tinybert-4l-312d-v2": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-4l-312d-v2.pdparams", + "tinybert-6l-768d-v2": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-6l-768d-v2.pdparams", + "tinybert-4l-312d-zh": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-4l-312d-zh.pdparams", + "tinybert-6l-768d-zh": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-6l-768d-zh.pdparams", + } +} + + +class TinyBertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`TinyBertModel`]. It is used to + instantiate a TinyBERT model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the TinyBERT + tinybert-6l-768d-v2 architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + pad_token_id (int, optional): + The index of padding token in the token vocabulary. + Defaults to `0`. + fit_size (int, optional): + Dimensionality of the output layer of `fit_dense(s)`, which is the hidden size of the teacher model. + `fit_dense(s)` means a hidden states' transformation from student to teacher. + `fit_dense(s)` will be generated when bert model is distilled during the training, and will not be generated + during the prediction process. + `fit_denses` is used in v2 models and it has `num_hidden_layers+1` layers. + `fit_dense` is used in other pretraining models and it has one linear layer. + Defaults to `768`. 
+ + Examples: + + ```python + >>> from paddlenlp.transformers import TinyBertModel, TinyBertConfig + + >>> # Initializing a TinyBERT tinybert-6l-768d-v2 style configuration + >>> configuration = TinyBertConfig() + + >>> # Initializing a model from the tinybert-6l-768d-v2 style configuration + >>> model = TinyBertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "tinybert" + attribute_map: Dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"} + pretrained_init_configuration = TINYBERT_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30522, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + pool_act="tanh", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + max_position_embeddings: int = 512, + type_vocab_size: int = 16, + layer_norm_eps=1e-12, + initializer_range: float = 0.02, + pad_token_id: int = 0, + fit_size: int = 768, + **kwargs + ): + + super().__init__(pad_token_id=pad_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.pool_act = pool_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.fit_size = fit_size diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/modeling.py new file mode 100644 index 000000000..b85374079 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/modeling.py @@ -0,0 +1,754 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2020 Huawei Technologies Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +from paddle import Tensor + +from ...utils.env import CONFIG_NAME +from .. 
import PretrainedModel, register_base_model +from ..bert.modeling import BertEmbeddings, BertPooler +from ..model_outputs import ( + BaseModelOutputWithPoolingAndCrossAttentions, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + tuple_output, +) +from .configuration import ( + TINYBERT_PRETRAINED_INIT_CONFIGURATION, + TINYBERT_PRETRAINED_RESOURCE_FILES_MAP, + TinyBertConfig, +) + +__all__ = [ + "TinyBertModel", + "TinyBertPretrainedModel", + "TinyBertForPretraining", + "TinyBertForSequenceClassification", + "TinyBertForQuestionAnswering", + "TinyBertForMultipleChoice", +] + + +class TinyBertPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained TinyBERT models. It provides TinyBERT related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading + and loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + config_class = TinyBertConfig + resource_files_names = {"model_state": "model_state.pdparams"} + + pretrained_init_configuration = TINYBERT_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = TINYBERT_PRETRAINED_RESOURCE_FILES_MAP + + base_model_prefix = "tinybert" + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + elif isinstance(layer, nn.LayerNorm): + layer._epsilon = self.config.layer_norm_eps + + +@register_base_model +class TinyBertModel(TinyBertPretrainedModel): + """ + The bare TinyBERT Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`TinyBertConfig`): + An instance of TinyBertConfig used to construct TinyBertModel. + + """ + + def __init__(self, config: TinyBertConfig): + super(TinyBertModel, self).__init__(config) + + self.pad_token_id = config.pad_token_id + self.initializer_range = config.initializer_range + + self.embeddings = BertEmbeddings(config) + + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0.0, + ) + + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers) + + self.pooler = BertPooler(config) + # fit_dense(s) means a hidden states' transformation from student to teacher. + # `fit_denses` is used in v2 model, and `fit_dense` is used in other pretraining models. 
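+        # Shape sketch, assuming the `tinybert-4l-312d` defaults (hidden_size=312, fit_size=768):
+        # each of these linear layers maps student hidden states of shape
+        # [batch_size, seq_len, 312] to the teacher width [batch_size, seq_len, 768], so that
+        # layer-wise distillation losses can be computed against a BERT-base-sized teacher.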
+ self.fit_denses = nn.LayerList( + [nn.Linear(config.hidden_size, config.fit_size) for i in range(config.num_hidden_layers + 1)] + ) + self.fit_dense = nn.Linear(config.hidden_size, config.fit_size) + + def get_input_embeddings(self) -> nn.Embedding: + """get input embedding of TinyBert Pretrained Model + + Returns: + nn.Embedding: the input embedding of tiny bert + """ + return self.embeddings.word_embeddings + + def set_input_embeddings(self, embedding: nn.Embedding) -> None: + """set the input embedding with the new embedding value + + Args: + embedding (nn.Embedding): the new embedding value + """ + self.embeddings.word_embeddings = embedding + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + past_key_values: Optional[Tuple[Tuple[Tensor]]] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The TinyBertModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + If `type_vocab_size` is 2, which means the inputs have two portions. + Indices can either be 0 or 1: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + Defaults to `None`, which means we don't add segment embeddings. + position_ids(Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. + Defaults to `None`, which means nothing needed to be prevented attention to. + inputs_embeds (Tensor, optional): + If you want to control how to convert `inputs_ids` indices into associated vectors, you can + pass an embedded representation directly instead of passing `inputs_ids`. 
+ past_key_values (tuple(tuple(Tensor)), optional): + The length of tuple equals to the number of layers, and each inner + tuple haves 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`) + which contains precomputed key and value hidden states of the attention blocks. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, optional): + If set to `True`, `past_key_values` key value states are returned. + Defaults to `None`. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.ModelOutput` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPoolingAndCrossAttentions`. + + tuple: Returns tuple (`encoder_output`, `pooled_output`). + + With the fields: + + - `encoder_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `pooled_output` (Tensor): + The output of first token (`[CLS]`) in sequence. + We "pool" the model by simply taking the hidden state corresponding to the first token. + Its data type should be float32 and its shape is [batch_size, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import TinyBertModel, TinyBertTokenizer + + tokenizer = TinyBertTokenizer.from_pretrained('tinybert-4l-312d') + model = TinyBertModel.from_pretrained('tinybert-4l-312d') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP! 
") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + output = model(**inputs) + """ + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + # init the default bool value + output_attentions = output_attentions if output_attentions is not None else False + output_hidden_states = output_hidden_states if output_hidden_states is not None else False + return_dict = return_dict if return_dict is not None else False + use_cache = use_cache if use_cache is not None else False + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if attention_mask is None: + attention_mask = paddle.unsqueeze( + (input_ids == self.pad_token_id).astype(self.pooler.dense.weight.dtype) * -1e4, axis=[1, 2] + ) + + if past_key_values is not None: + batch_size = past_key_values[0][0].shape[0] + past_mask = paddle.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype) + attention_mask = paddle.concat([past_mask, attention_mask], axis=-1) + elif attention_mask.ndim == 2: + # attention_mask [batch_size, sequence_length] -> [batch_size, 1, 1, sequence_length] + attention_mask = attention_mask.unsqueeze(axis=[1, 2]).astype(paddle.get_default_dtype()) + attention_mask = (1.0 - attention_mask) * -1e4 + + # TODO(wj-Mcat): in current branch, not support `inputs_embeds` + embedding_output = self.embeddings( + input_ids, token_type_ids, position_ids, past_key_values_length=past_key_values_length + ) + + self.encoder._use_cache = use_cache # To be consistent with HF + encoder_outputs = self.encoder( + embedding_output, + attention_mask, + cache=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if isinstance(encoder_outputs, type(embedding_output)): + sequence_output = encoder_outputs + pooled_output = self.pooler(sequence_output) + return (sequence_output, pooled_output) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TinyBertForPretraining(TinyBertPretrainedModel): + """ + TinyBert Model with pretraining tasks on top. + + Args: + config (:class:`TinyBertConfig`): + An instance of TinyBertConfig used to construct TinyBertForPretraining. + + """ + + def __init__(self, config: TinyBertConfig): + super(TinyBertForPretraining, self).__init__(config) + self.tinybert = TinyBertModel(config) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The TinyBertForPretraining forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`TinyBertModel`. + token_type_ids (Tensor, optional): + See :class:`TinyBertModel`. + position_ids (Tensor, optional): + See :class:`TinyBertModel`. 
+ attention_mask (Tensor, optional): + See :class:`TinyBertModel`. + + Returns: + Tensor: Returns tensor `sequence_output`, sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.tinybert.modeling import TinyBertForPretraining + from paddlenlp.transformers.tinybert.tokenizer import TinyBertTokenizer + + tokenizer = TinyBertTokenizer.from_pretrained('tinybert-4l-312d') + model = TinyBertForPretraining.from_pretrained('tinybert-4l-312d') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP! ") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + + + """ + outputs = self.tinybert( + input_ids, + token_type_ids, + position_ids, + attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # return the sequence presentation + if not return_dict: + return outputs[0] + return outputs + + +class TinyBertForSequenceClassification(TinyBertPretrainedModel): + """ + TinyBert Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`TinyBertConfig`): + An instance of TinyBertConfig used to construct TinyBertForSequenceClassification. + """ + + def __init__(self, config: TinyBertConfig): + super(TinyBertForSequenceClassification, self).__init__(config) + self.tinybert = TinyBertModel(config) + self.num_labels = config.num_labels + self.dropout = nn.Dropout( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.activation = nn.ReLU() + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The TinyBertForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`TinyBertModel`. + token_type_ids (Tensor, optional): + See :class:`TinyBertModel`. + position_ids (Tensor, optional): + See :class:`TinyBertModel`. + attention_mask_list (list, optional): + See :class:`TinyBertModel`. + labels (Tensor of shape `(batch_size,)`, optional): + Labels for computing the sequence classification/regression loss. + Indices should be in `[0, ..., num_labels - 1]`. If `num_labels == 1` + a regression loss is computed (Mean-Square loss), If `num_labels > 1` + a classification loss is computed (Cross-Entropy). + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. 
+ + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput` if `return_dict=True`. + Otherwise it returns a tuple of tensors corresponding to ordered and + not None (depending on the input arguments) fields of :class:`~paddlenlp.transformers.model_outputs.SequenceClassifierOutput`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.tinybert.modeling import TinyBertForSequenceClassification + from paddlenlp.transformers.tinybert.tokenizer import TinyBertTokenizer + + tokenizer = TinyBertTokenizer.from_pretrained('tinybert-4l-312d') + model = TinyBertForSequenceClassification.from_pretrained('tinybert-4l-312d') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP! ") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + + outputs = self.tinybert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.classifier(self.activation(outputs[1])) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == paddle.int64 or labels.dtype == paddle.int32): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = paddle.nn.MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(logits.reshape((-1, self.num_labels)), labels.reshape((-1,))) + elif self.config.problem_type == "multi_label_classification": + loss_fct = paddle.nn.BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return tuple_output(output, loss) + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class TinyBertForQuestionAnswering(TinyBertPretrainedModel): + """ + TinyBert Model with a linear layer on top of the hidden-states + output to compute `span_start_logits` and `span_end_logits`, + designed for question-answering tasks like SQuAD. + + Args: + Args: + config (:class:`TinyBertConfig`): + An instance of TinyBertConfig used to construct TinyBertForQuestionAnswering. + """ + + def __init__(self, config: TinyBertConfig): + super(TinyBertForQuestionAnswering, self).__init__(config) + self.tinybert = TinyBertModel(config) + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + start_positions: Optional[Tensor] = None, + end_positions: Optional[Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (Tensor): + See :class:`TinyBertModel`. + token_type_ids (Tensor, optional): + See :class:`TinyBertModel`. 
+ position_ids (Tensor, optional): + See :class:`TinyBertModel`. + attention_mask (Tensor, optional): + See :class:`TinyBertModel`. + start_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (Tensor of shape `(batch_size,)`, optional): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.QuestionAnsweringModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block::
+
+                import paddle
+                from paddlenlp.transformers import TinyBertForQuestionAnswering, TinyBertTokenizer
+
+                tokenizer = TinyBertTokenizer.from_pretrained('tinybert-6l-768d-zh')
+                model = TinyBertForQuestionAnswering.from_pretrained('tinybert-6l-768d-zh')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                start_logits, end_logits = model(**inputs)
+        """
+
+        outputs = self.tinybert(
+            input_ids,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        logits = self.classifier(outputs[0])
+        logits = paddle.transpose(logits, perm=[2, 0, 1])
+        start_logits, end_logits = paddle.unstack(x=logits, axis=0)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, the position labels may carry an extra dimension; squeeze it away.
+            if start_positions.ndim > 1:
+                start_positions = start_positions.squeeze(-1)
+            if end_positions.ndim > 1:
+                end_positions = end_positions.squeeze(-1)
+            # Positions outside the model inputs are clipped to `ignored_index` and ignored by the loss.
+            ignored_index = start_logits.shape[1]
+            start_positions = start_positions.clip(0, ignored_index)
+            end_positions = end_positions.clip(0, ignored_index)
+
+            loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return tuple_output(output, total_loss)
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class TinyBertForMultipleChoice(TinyBertPretrainedModel):
+    """
+    TinyBERT Model with a linear layer on top of the hidden-states output layer,
+    designed for multiple choice tasks like RocStories/SWAG tasks.
+
+    Args:
+        config (:class:`TinyBertConfig`):
+            An instance of TinyBertConfig used to construct TinyBertForMultipleChoice.
+    """
+
+    def __init__(self, config: TinyBertConfig):
+        super(TinyBertForMultipleChoice, self).__init__(config)
+        self.num_choices = config.num_choices
+        self.tinybert = TinyBertModel(config)
+        self.dropout = nn.Dropout(
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        token_type_ids: Optional[Tensor] = None,
+        position_ids: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        labels: Optional[Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        The TinyBertForMultipleChoice forward method, overrides the __call__() special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`TinyBertModel` and shape as [batch_size, num_choice, sequence_length].
+            token_type_ids(Tensor, optional):
+                See :class:`TinyBertModel` and shape as [batch_size, num_choice, sequence_length].
+            position_ids(Tensor, optional):
+                See :class:`TinyBertModel` and shape as [batch_size, num_choice, sequence_length].
+ attention_mask (list, optional): + See :class:`TinyBertModel` and shape as [batch_size, num_choice, sequence_length]. + labels (Tensor of shape `(batch_size, )`, optional): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + output_hidden_states (bool, optional): + Whether to return the hidden states of all layers. + Defaults to `False`. + output_attentions (bool, optional): + Whether to return the attentions tensors of all attention layers. + Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.MultipleChoiceModelOutput` object. If + `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + Tensor: Returns tensor `reshaped_logits`, a tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + """ + # input_ids: [bs, num_choice, seq_l] + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.") + + if input_ids is None and inputs_embeds is None: + raise ValueError("input_ids and inputs_embeds should not be None at the same time.") + if inputs_embeds is not None: + inputs_embeds = inputs_embeds.reshape([-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1]]) + else: + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + outputs = self.tinybert( + input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = self.dropout(outputs[1]) + + logits = self.classifier(pooled_output) # logits: (bs*num_choice,1) + reshaped_logits = logits.reshape(shape=(-1, self.num_choices)) # logits: (bs, num_choice) + + loss = None + if labels is not None: + loss_fct = paddle.nn.CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return tuple_output(output, loss) + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/tokenizer.py new file mode 100644 index 000000000..c4384e461 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tinybert/tokenizer.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..bert.tokenizer import BertTokenizer + +__all__ = ["TinyBertTokenizer"] + + +class TinyBertTokenizer(BertTokenizer): + """ + Constructs a TinyBert tokenizer. The usage of TinyBertTokenizer is the same as + `BertTokenizer `__. + For more information regarding those methods, please refer to this superclass. + """ + + pretrained_resource_files_map = { + "vocab_file": { + "tinybert-4l-312d": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-4l-312d-vocab.txt", + "tinybert-6l-768d": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-6l-768d-vocab.txt", + "tinybert-4l-312d-v2": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-4l-312d-v2-vocab.txt", + "tinybert-6l-768d-v2": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-6l-768d-v2-vocab.txt", + "tinybert-4l-312d-zh": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-4l-312d-zh-vocab.txt", + "tinybert-6l-768d-zh": "http://bj.bcebos.com/paddlenlp/models/transformers/tinybert/tinybert-6l-768d-zh-vocab.txt", + } + } + pretrained_init_configuration = { + "tinybert-4l-312d": {"do_lower_case": True}, + "tinybert-6l-768d": {"do_lower_case": True}, + "tinybert-4l-312d-v2": {"do_lower_case": True}, + "tinybert-6l-768d-v2": {"do_lower_case": True}, + "tinybert-4l-312d-zh": {"do_lower_case": True}, + "tinybert-6l-768d-zh": {"do_lower_case": True}, + } + max_model_input_sizes = { + "tinybert-4l-312d": 512, + "tinybert-6l-768d": 512, + "tinybert-4l-312d-v2": 512, + "tinybert-6l-768d-v2": 512, + "tinybert-4l-312d-zh": 512, + "tinybert-6l-768d-zh": 512, + } diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils.py new file mode 100644 index 000000000..4870a3e9b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils.py @@ -0,0 +1,2132 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
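`TinyBertTokenizer` only adds vocab URLs, lowercasing flags and max input sizes on top of `BertTokenizer`, so its usage is the plain BERT pattern. A hedged sketch, assuming the `paddlenlp` package is installed and the hosted vocab files are reachable:

```python
from paddlenlp.transformers import TinyBertTokenizer

# downloads the vocab file listed in pretrained_resource_files_map on first use
tokenizer = TinyBertTokenizer.from_pretrained("tinybert-4l-312d")

encoded = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
print(encoded["input_ids"])    # ids wrapped with [CLS] ... [SEP]
print(tokenizer.tokenize("Welcome to use PaddlePaddle and PaddleNLP!"))
```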
+from __future__ import annotations + +import bisect +import io +import itertools +import json +import os +import re +import unicodedata +from collections import OrderedDict +from dataclasses import asdict, dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy +import numpy as np +import paddle +import six +from jinja2 import Template +from jinja2.exceptions import TemplateError, TemplateSyntaxError +from jinja2.sandbox import ImmutableSandboxedEnvironment +from paddle.utils import try_import + +from paddlenlp.utils.env import CHAT_TEMPLATE_CONFIG_NAME +from paddlenlp.utils.log import logger + +try: + from functools import lru_cache +except ImportError: + from backports.functools_lru_cache import lru_cache + +from ..data.vocab import Vocab +from ..utils.import_utils import is_tokenizers_available +from .tokenizer_utils_base import ( + BatchEncoding, + EncodedInput, + EncodedInputPair, + PaddingStrategy, + PreTokenizedInput, + PreTokenizedInputPair, + PretrainedTokenizerBase, + TensorType, + TextInput, + TextInputPair, + TruncationStrategy, +) +from .utils import InitTrackerMeta, convert_to_dict_message, fn_args_to_dict + +if is_tokenizers_available(): + from tokenizers import AddedToken +else: + from .tokenizer_utils_base import AddedToken + +__all__ = [ + "PretrainedTokenizer", + "BPETokenizer", + "tokenize_chinese_chars", + "is_chinese_char", + "normalize_chars", + "tokenize_special_chars", + "convert_to_unicode", +] + + +def convert_to_unicode(text): + """ + Converts `text` to Unicode (if it's not already), assuming utf-8 input. + Args: + text (str|bytes): Text to be converted to unicode. + Returns: + str: converted text. + """ + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + + +def whitespace_tokenize(text): + """ + Runs basic whitespace cleaning and splitting on a peice of text. + Args: + text (str): Text to be tokenized. + Returns: + list(str): Token list. + """ + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +def _is_whitespace(char): + """ + Checks whether `chars` is a whitespace character. + """ + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. 
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def _is_end_of_word(text): + """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" + last_char = text[-1] + return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) + + +def _is_start_of_word(text): + """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" + first_char = text[0] + return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) + + +def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str): + """ + Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted. + """ + insertion_idx = bisect.bisect_left(token_list, new_token) + # Checks if new_token is already in the ordered token_list + if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token: + # new_token is in token_list, don't add + return + else: + token_list.insert(insertion_idx, new_token) + + +def is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + +def _is_nonnormalized_char(char): + """Check whther `chars` is a non-normalized character.""" + cp = ord(char) + if ( + (0xFF00 <= cp <= 0xFFEF) + or (0xFE50 <= cp <= 0xFE6B) # Halfwidth and Fullwidth Forms + or (0x3358 <= cp <= 0x33FF) # Small Form Variants + or (0x249C <= cp <= 0x24E9) # CJK Compatibility + or (0x3200 <= cp <= 0x32FF) # Enclosed Alphanumerics: Ⓛ ⒰ + ): # Enclosed CJK Letters and Months + return True + + return False + + +def _is_nonnormalized_numeric(char): + """Check whether `chars` is a non-normalized numeric character.""" + cp = ord(char) + if ( + (0x2460 <= cp <= 0x249B) + or (0x24EA <= cp <= 0x24FF) # + or (0x2776 <= cp <= 0x2793) # + or (0x2160 <= cp <= 0x217F) # Enclosed Alphanumerics + ): # Number Forms + return True + + return False + + +def normalize_chars(text): + """ + Normalize the text for multiligual and chinese models. 
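The character helpers above combine explicit codepoint ranges with `unicodedata` general categories. A standalone check that mirrors (rather than imports) the private helpers, to show what each test classifies:

```python
import unicodedata

def looks_like_punct(ch):
    # ASCII symbol ranges are forced to punctuation, otherwise category P* decides
    cp = ord(ch)
    return (33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126
            or unicodedata.category(ch).startswith("P"))

def looks_like_cjk(ch):
    # only the two main CJK blocks, for brevity; is_chinese_char checks several more
    cp = ord(ch)
    return 0x4E00 <= cp <= 0x9FFF or 0x3400 <= cp <= 0x4DBF

for ch in ["!", "$", "好", "A", " "]:
    labels = []
    if looks_like_punct(ch):
        labels.append("punct")
    if looks_like_cjk(ch):
        labels.append("cjk")
    if ch in " \t\n\r" or unicodedata.category(ch) == "Zs":
        labels.append("whitespace")
    print(repr(ch), labels)
```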
Unicode range: + https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html + """ + output = [] + for char in text: + if _is_nonnormalized_char(char): + for c in unicodedata.normalize("NFKC", char): + output.append(c) + elif _is_nonnormalized_numeric(char): + output.append(" ") + for c in str(int(unicodedata.numeric(char))): + output.append(c) + output.append(" ") + elif ord(char) == 0xF979: # https://www.zhihu.com/question/20697984 + output.append("凉") + else: + output.append(char) + return "".join(output) + + +def _is_symbol(char): + """Check whether CP is the codepoint of a Symbol character.""" + cp = ord(char) + if unicodedata.category(char).startswith("S") or ( + cp in [0x00AD, 0x00B2, 0x00BA, 0x3007, 0x00B5, 0x00D8, 0x014B, 0x01B1] + ): + return True + return False + + +def tokenize_special_chars(text): + """Adds whitespace around any special character.""" + output = [] + for char in text: + cp = ord(char) + if ( + (0x3040 <= cp <= 0x30FF) + or (0x0370 <= cp <= 0x04FF) # Japanese + or (0x0250 <= cp <= 0x02AF) # Greek/Coptic & Cyrillic + or _is_symbol(char) # IPA + ): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class Trie: + """ + Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass + Loose reference https://en.wikipedia.org/wiki/Trie + """ + + def __init__(self): + self.data = {} + + def add(self, word: str): + """ + Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. + The special key `""` is used to represent termination. + + This function is idempotent, adding twice the same word will leave the trie unchanged + + Example: + + ```python + >>> trie = Trie() + >>> trie.add("Hello 友達") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} + + >>> trie.add("Hello") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} + ``` + """ + if not word: + # Prevent empty string + return + ref = self.data + for char in word: + ref[char] = char in ref and ref[char] or {} + ref = ref[char] + ref[""] = 1 + + def split(self, text: str) -> List[str]: + """ + Will look for the words added to the trie within `text`. Output is the original string splitted along the + boundaries of the words found. + + This trie will match the longest possible word first ! + + Example: + + ```python + >>> trie = Trie() + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS] This is a extra_id_100"] + + >>> trie.add("[CLS]") + >>> trie.add("extra_id_1") + >>> trie.add("extra_id_100") + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS]", " This is a ", "extra_id_100"] + ``` + """ + # indexes are counted left of the chars index. + # "hello", index 0, is left of h, index 1 is between h and e. + # index 5 is right of the "o". + + # States are going to capture every possible start (indexes as above) + # as keys, and have as values, a pointer to the position in the trie + # where we're at. This is a partial match for now. + # This enables to keep track of multiple matches while we're iterating + # the string + # If the trie contains, "blowing", and "lower" and we encounter the + # string "blower", we need to split into ["b", "lower"]. + # This is where we need to keep track of multiple possible starts. + states = OrderedDict() + + # This will contain every indices where we need + # to cut. 
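`normalize_chars` folds fullwidth and enclosed forms to plain ASCII letters and digits, while `tokenize_special_chars` pads kana, Greek/Cyrillic and symbol codepoints with spaces. A short sketch, assuming this module is importable as `paddlenlp.transformers.tokenizer_utils` (outputs are approximate):

```python
from paddlenlp.transformers.tokenizer_utils import (
    normalize_chars,
    tokenize_special_chars,
)

print(normalize_chars("ＰａｄｄｌｅＮＬＰ ①"))   # roughly "PaddleNLP  1 "
print(tokenize_special_chars("x≈5"))               # roughly "x ≈ 5"
```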
+ # We force to cut at offset 0 and len(text) (added later) + offsets = [0] + + # This is used by the lookahead which needs to skip over + # some text where the full match exceeded the place in the initial + # for loop + skip = 0 + # Main loop, Giving this algorithm O(n) complexity + for current, current_char in enumerate(text): + if skip and current < skip: + # Prevents the lookahead for matching twice + # like extra_id_100 and id_100 + continue + + # This will track every state + # that stop matching, we need to stop tracking them. + # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then + # fail on "b", we need to remove 0 from the valid states. + to_remove = set() + # Whenever we found a match, we need to drop everything + # this is a greedy algorithm, it will match on the first found token + reset = False + + # In this case, we already have partial matches (But unfinished) + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + + # Lookahead to match longest first + # Important in case of extra_id_1 vs extra_id_100 + # Here we are also actively looking for other earlier partial + # matches + # "[CLS]", "L", we need to match CLS even if L is special + for lookstart, looktrie_pointer in states.items(): + if lookstart > start: + # This partial match is later, we can stop looking + break + elif lookstart < start: + # This partial match is earlier, the trie pointer + # was already updated, so index is + 1 + lookahead_index = current + 1 + end = current + 1 + else: + # Here lookstart == start and + # looktrie_pointer == trie_pointer + # It wasn't updated yet so indices are current ones + lookahead_index = current + end = current + next_char = text[lookahead_index] if lookahead_index < len(text) else None + if "" in looktrie_pointer: + start = lookstart + end = lookahead_index + skip = lookahead_index + + while next_char in looktrie_pointer: + looktrie_pointer = looktrie_pointer[next_char] + lookahead_index += 1 + if "" in looktrie_pointer: + start = lookstart + end = lookahead_index + skip = lookahead_index + + if lookahead_index == len(text): + # End of string + break + next_char = text[lookahead_index] + # End lookahead + + # Storing and resetting + offsets.append(start) + offsets.append(end) + reset = True + break + elif current_char in trie_pointer: + # The current character being looked at has a match within the trie + # update the pointer (it will be stored back into states later). + trie_pointer = trie_pointer[current_char] + + # Storing back the new pointer into the states. + # Partial matches got longer by one. + states[start] = trie_pointer + else: + # The new character has not match in the trie, we need + # to stop keeping track of this partial match. + # We can't do it directly within the loop because of how + # python iteration works + to_remove.add(start) + + # Either clearing the full start (we found a real match) + # Or clearing only the partial matches that didn't work. + if reset: + states = {} + else: + for start in to_remove: + del states[start] + + # If this character is a starting character within the trie + # start keeping track of this partial match. + if current >= skip and current_char in self.data: + states[current] = self.data[current_char] + + # We have a cut at the end with states. 
+ for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + end = len(text) + offsets.append(start) + offsets.append(end) + # Longest cut is always the one with lower start so the first + # item so we need to break. + break + + return self.cut_text(text, offsets) + + def cut_text(self, text, offsets): + # We have all the offsets now, we just need to do the actual splitting. + # We need to eventually add the first part of the string and the eventual + # last part. + offsets.append(len(text)) + tokens = [] + start = 0 + for end in offsets: + if start > end: + logger.error( + "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it anyway." + ) + continue + elif start == end: + # This might happen if there's a match at index 0 + # we're also preventing zero-width cuts in case of two + # consecutive matches + continue + tokens.append(text[start:end]) + start = end + + return tokens + + +def tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + output = [] + buff = "" + for char in text: + cp = ord(char) + if is_chinese_char(cp): + if buff != "": + output.append(buff) + buff = "" + output.append(char) + else: + buff += char + + if buff != "": + output.append(buff) + + return output + + +@dataclass +class ChatTemplate: + conversation: list[str] | None = None + system: str | None = None + query: str = None + + @staticmethod + @lru_cache() + def _compile_jinja_template(chat_template) -> Template: + def raise_exception(message): + raise TemplateError(message) + + jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, keep_trailing_newline=True) + jinja_env.globals["raise_exception"] = raise_exception + return jinja_env.from_string(chat_template) + + def render_conversation( + self, conversation_data: list[str] | dict[str, str], index: int = 0, context_data: Dict[str, Any] = {} + ) -> list[str]: + """ + Args: + conversation_data (list[str]): the conversation data which must be two parts + index (int): the index of current conversation + + Returns: + list[str]: the rendered conversation data + """ + if self.conversation is None: + raise ValueError( + "The template for multi-turns is invalid, please check `conversation` filed in your chat-template." 
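The Trie is what keeps added tokens from being split by the downstream sub-word tokenizer; its `split` is a greedy, longest-match, left-to-right scan. A sketch of the two behaviours called out in the comments above (overlapping words, and preferring the longer of `extra_id_1` / `extra_id_100`), assuming `Trie` can be imported from this module:

```python
from paddlenlp.transformers.tokenizer_utils import Trie

trie = Trie()
trie.add("blowing")
trie.add("lower")
# "blowing" fails at the "e" of "blower", so the later match "lower" wins
print(trie.split("blower"))             # ['b', 'lower']

trie.add("extra_id_1")
trie.add("extra_id_100")
# the lookahead keeps extending the match, so the longer token is preferred
print(trie.split("a extra_id_100 b"))   # ['a ', 'extra_id_100', ' b']
```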
+ ) + + if isinstance(conversation_data, (list, tuple)): + assert ( + len(conversation_data) == 2 + ), "Each round/turn of conversation must be two participants, eg: [user-query, bot-query]" + + conversation_data = {"user": conversation_data[0], "bot": conversation_data[1], "index": index} + conversation_data.update(context_data) + + one_turn_conversation = [] + for conversation in self.conversation: + template = self._compile_jinja_template(conversation) + result = template.render(conversation_data) + one_turn_conversation.append(result) + return one_turn_conversation + + def render_query(self, query: str, index: int = 0, context_data: Dict[str, Union[int, str]] = {}): + if self.query is None: + return query + + template = self._compile_jinja_template(self.query) + return template.render(query=query, index=index, **context_data) + + def _init_context_data(self, context_data: Dict[str, Union[int, str]] = {}) -> Dict[str, Union[int, str]]: + """init the context data for chat-template""" + context_data["is_training"] = context_data.get("is_training", False) + return context_data + + def render_system(self, context_data: Dict[str, Union[int, str]] = {}) -> str: + if self.system is None: + return "" + + template = self._compile_jinja_template(self.system) + return template.render(**context_data) + + def __call__(self, conversations: list[list[str]] | str, context_data: Dict[str, Union[int, str]] = {}) -> str: + """render the conversations by chat-template + + Args: + conversations (list[list[str]]): the conversations of use and bot + + Returns: + str: the result of conversation + """ + if isinstance(conversations, str): + conversations = [[conversations]] + + # [1 ... n-1] conversation + final_query = self.render_system(context_data=context_data) + context_data["length"] = len(conversations) + for index, conversation in enumerate(conversations[:-1]): + context_data["is_first"] = index == 0 + context_data["is_last"] = False + final_query += "".join(self.render_conversation(conversation, index=index, context_data=context_data)) + + if not isinstance(conversations[-1], list) and not len(conversations[-1]) != 1: + raise ValueError( + "The length of last conversation must be one, eg: [[user-query, bot-answer], [user-query, bot-answer], ..., [user-query]]" + ) + if len(conversations[-1]) > 1: + logger.warning( + f"The last conversation is not a single-round, chat-template will skip the conversation: {conversations[-1][1:]}" + ) + + final_query += self.render_query(conversations[-1][0], index=len(conversations) - 1, context_data=context_data) + return final_query + + @classmethod + def from_dict(cls, config: dict): + return cls(**config) + + @classmethod + def from_file(cls, file: str): + with open(file, "r", encoding="utf-8") as f: + config = json.load(f) + return cls.from_dict(config) + + +class ChatTemplateMixin: + chat_template: Optional[ChatTemplate] = None + + def apply_chat_template( + self, + conversation: List[List[str, str] | Dict[str, str]] | str, + tokenize: bool = True, + context_data: Dict[str, Any] = {}, + **tokenizer_kwargs + ) -> str | dict[str, numpy.ndarray | paddle.Tensor]: + """apply chat_template rules to conversation which should not be batched data + + Args: + conversation (List[List[str, str]] | str): the conversation messages between user and bot + context_data (Dict[str, Any]): the context data for chat_template.json + tokenize (bool, optional): whether do tokenization. Defaults to True. 
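`ChatTemplate` is a thin wrapper over three Jinja snippets (`system`, a two-element `conversation` pair, and `query`). A hedged, hypothetical configuration to show how `__call__` stitches the history plus the trailing user query together (real templates ship as `chat_template.json` next to the model):

```python
from paddlenlp.transformers.tokenizer_utils import ChatTemplate

template = ChatTemplate.from_dict({
    "system": "You are a helpful assistant.\n",
    "conversation": ["User: {{user}}\n", "Bot: {{bot}}\n"],
    "query": "User: {{query}}\nBot: ",
})

history = [["Hi there", "Hello! How can I help?"], ["Tell me a joke"]]
print(template(history))
# You are a helpful assistant.
# User: Hi there
# Bot: Hello! How can I help?
# User: Tell me a joke
# Bot:
```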
+ + Returns: + str | dict[str, numpy.ndarray | paddle.Tensor]: return the result of applied data + """ + if not self.chat_template: + raise ValueError("chat_template is not set, please set chat_template first.") + elif isinstance(self.chat_template, Template): + add_generation_prompt = tokenizer_kwargs.pop("add_generation_prompt", True) + query = self._apply_chat_template(conversation, add_generation_prompt=add_generation_prompt) + elif isinstance(self.chat_template, ChatTemplate): + query = self._apply_chat_template_paddle(conversation, context_data) + + if not tokenize: + return query + + # chat_template should not add special tokens + tokenizer_kwargs["add_special_tokens"] = False + return self(query, **tokenizer_kwargs) + + def _apply_chat_template_paddle( + self, + conversation: List[List[str, str]] | str, + context_data: Dict[str, Any] = {}, + ) -> str | dict[str, numpy.ndarray | paddle.Tensor]: + context_data = self.chat_template._init_context_data(context_data) + + if isinstance(conversation, str): + conversation = [[conversation]] + elif isinstance(conversation, list) and isinstance(conversation[0], str): + raise ValueError( + "apply_chat_template do not support appling batch conversations, " + "so you should apply the conversation one by one." + ) + + query = self.chat_template(conversation, context_data=context_data) + return query + + def _apply_chat_template( + self, + conversation: List[List[str, str] | Dict[str, str]] | str, + add_generation_prompt=True, + ) -> str | dict[str, numpy.ndarray | paddle.Tensor]: + if isinstance(conversation, str): + conversations = [{"role": "user", "content": conversation}] + elif isinstance(conversation, list): + assert len(conversation) > 0, "empty conversation is not allowed" + if isinstance(conversation[0], list): + conversations = convert_to_dict_message(conversation) + elif isinstance(conversation[0], dict): + conversations = conversation + else: + raise ValueError( + "apply_chat_template do not support appling batch conversations, " + "so you should apply the conversation one by one." + ) + query = self.chat_template.render( + messages=conversations, **self.special_tokens_map, add_generation_prompt=add_generation_prompt + ) + return query + + def encode_chat_inputs(self, conversations: List[List[str, str]], context_data: Dict[str, Any] = {}, **kwargs): + """Encodes conversation to pairs of token ids. 
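From the caller's side, `apply_chat_template` takes one (unbatched) conversation and either returns the rendered prompt or tokenizes it without re-adding special tokens. A rough usage sketch; `tokenizer` is a placeholder for any chat-capable `PretrainedTokenizer` loaded via `from_pretrained`:

```python
conv = [["Hi there", "Hello! How can I help?"], ["Tell me a joke"]]

prompt = tokenizer.apply_chat_template(conv, tokenize=False)  # rendered string only
features = tokenizer.apply_chat_template(conv)                # tokenized, add_special_tokens=False
```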
+ Turn 0: bos + system + sep + user bot + eos + Turn t: sep + bot + query bot + eos + + Args: + conversation (List[List[str, str]]): the conversation of data + context_data (Dict[str, Any]): the context data of conversation + + Returns: + List[list[int], list[int]]: the pair of input_ids and target_ids + """ + if not self.chat_template: + raise ValueError("chat_template is not set, please set chat_template first.") + elif isinstance(self.chat_template, Template): + add_generation_prompt = kwargs.pop("add_generation_prompt", True) + query = self._encode_chat_inputs(conversations, context_data, add_generation_prompt=add_generation_prompt) + elif isinstance(self.chat_template, ChatTemplate): + query = self._encode_chat_inputs_paddle(conversations, context_data) + return query + + def _encode_chat_inputs_paddle(self, conversations: List[List[str, str]], context_data: Dict[str, Any] = {}): + context_data = self.chat_template._init_context_data(context_data) + # encode system + result = {} + if self.chat_template.system: + system = self.chat_template.render_system(context_data) + result["system"] = self.encode(system, add_special_tokens=False)["input_ids"] + + # encode conversation + conversation_ids = [] + for index, conversation in enumerate(conversations): + # give more control to chat_template + context_data["is_first"] = index == 0 + context_data["is_last"] = index == len(conversations) - 1 + + user_input, bot_output = self.chat_template.render_conversation( + conversation, index=index, context_data=context_data + ) + user_ids = self.encode(user_input, add_special_tokens=False)["input_ids"] + bot_ids = self.encode(bot_output, add_special_tokens=False)["input_ids"] + conversation_ids.append([user_ids, bot_ids]) + + result["conversations"] = conversation_ids + return result + + def _encode_chat_inputs( + self, + conversations: List[List[str, str]], + context_data: Dict[str, Any] = {}, + system: str = None, + add_generation_prompt=True, + ): + result = {} + + # Some template do not support system msg, so we need to check it first. + if system: + try: + self.chat_template.render(messages={"role": "system", "content": system}) + except Exception as e: + raise ValueError("System is not supported in this tokenizer.", e) + + # convert list msg to role dict msg + conversation_dict = [] + origin_msg = [] + for round in conversations: + round_role = [ + {"role": "user", "content": round[0]}, + {"role": "assistant", "content": round[1]}, + ] + origin_msg.extend(round_role) + conversation_dict.append(round_role) + ans = [] + + # get answer in single round, then compile the chat entirely and split by single round ans + # attention: answer should include end token! 
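`encode_chat_inputs` returns ids rather than text: one optional system segment plus one `[user_ids, bot_ids]` pair per round. A hedged sketch of the shape of the result (`tokenizer` is the same placeholder as above, and here every round must be a complete user/bot pair):

```python
encoded = tokenizer.encode_chat_inputs(
    [["Hi there", "Hello! How can I help?"],
     ["Tell me a joke", "Why did the tensor cross the graph?"]]
)

# roughly:
# {
#   "system": [...],                  # present only if the template defines a system prompt
#   "conversations": [
#       [user_ids_round0, bot_ids_round0],
#       [user_ids_round1, bot_ids_round1],
#   ],
# }
```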
+ for conv in conversation_dict: + roundi = [system] + conv if system else conv + roundi_str = self.chat_template.render( + messages=roundi, add_generation_prompt=False, **self.special_tokens_map + ) + roundi_no_ans = [system] + [conv[0]] if system else [conv[0]] + roundi_no_ans_str = self.chat_template.render( + messages=roundi_no_ans, add_generation_prompt=add_generation_prompt, **self.special_tokens_map + ) + ans_roundi = roundi_str[len(roundi_no_ans_str) :] + ans.append(ans_roundi) + + non_learnable_parts = self._extract_non_learnable_parts(origin_msg, ans) + assert len(non_learnable_parts) == len(ans) + + conversation_ids = [] + for i in range(len(non_learnable_parts)): + conversation_ids.append( + self.batch_encode( + [non_learnable_parts[i], ans[i]], + add_special_tokens=False, + padding=False, + )["input_ids"] + ) + + result["conversations"] = conversation_ids + return result + + def _extract_non_learnable_parts(self, origin_msg: List[Dict[str, str]], split_s: List[str]): + """Split the entire chat by specified words. Extract the non-learnable parts.""" + # distingish and replace the special words in original string to an uncompiled form: Like | -> \| + regex_pattern = "|".join(map(re.escape, split_s)) + # splited by replaced specified words + non_learnable_parts = re.split( + r"(?:%s)" % regex_pattern, + self.chat_template.render(messages=origin_msg, add_generation_prompt=False, **self.special_tokens_map), + ) + if non_learnable_parts[-1] == "": + non_learnable_parts.pop() + return non_learnable_parts + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + if subfolder is None: + subfolder = "" + + kwargs["subfolder"] = subfolder + kwargs["cache_dir"] = cache_dir + kwargs["from_hf_hub"] = from_hf_hub + kwargs["from_aistudio"] = from_aistudio + kwargs["return_tokenizer_file_dir"] = True + tokenizer, tokenizer_config_file_dir = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) + + # load chat-template + chat_template_file = os.path.join(tokenizer_config_file_dir, CHAT_TEMPLATE_CONFIG_NAME) + if not os.path.exists(chat_template_file): + return tokenizer + + if tokenizer.chat_template is not None: + logger.warning( + "Chat-template already exists in config file, it will be overwritten by chat_template.json file." + ) + logger.warning( + "`chat_template.json` will be deprecated in the future! Please set it in `tokenizer_config.json`." 
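`_extract_non_learnable_parts` relies on a small regex trick: escape every per-round answer, join them into one alternation, and split the fully rendered chat on it, leaving only the prompt fragments. A standalone sketch of that technique with made-up strings:

```python
import re

rendered = "<sys>Be nice</sys>User: hi\nBot: hello\nUser: joke?\nBot: knock knock"
answers = ["hello\n", "knock knock"]          # per-round assistant outputs

pattern = "|".join(map(re.escape, answers))   # escape, then build one alternation
non_learnable = re.split(r"(?:%s)" % pattern, rendered)
if non_learnable and non_learnable[-1] == "":
    non_learnable.pop()

print(non_learnable)
# ['<sys>Be nice</sys>User: hi\nBot: ', 'User: joke?\nBot: ']
```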
+ ) + tokenizer.init_chat_template(chat_template_file) + return tokenizer + + def init_chat_template(self, chat_template: str | dict): + """init chat_tempalte by file_path or template dict data + + Args: + chat_template (str | dict): file_path or template dict data + """ + if isinstance(chat_template, str): + if not os.path.exists(chat_template): + try: + self.chat_template: Template = ChatTemplate._compile_jinja_template(chat_template) + except TemplateSyntaxError: + # It is neither jinjia string nor path string + raise TemplateSyntaxError( + "The chat-template in json is not valid jinja string: {}".format(chat_template), + lineno=0, # fake lineno, useless required msg + ) + else: + self.chat_template = ChatTemplate.from_file(chat_template) + elif isinstance(chat_template, dict): + self.chat_template = ChatTemplate.from_dict(chat_template) + elif isinstance(chat_template, ChatTemplate): + self.chat_template = chat_template + else: + raise ValueError("Receive error chat_template data: ", chat_template) + + def save_resources(self, save_directory): + super().save_resources(save_directory) + + if isinstance(self.chat_template, ChatTemplate): # Future remove if ChatTemplate is deprecated + chat_template_file = os.path.join(save_directory, CHAT_TEMPLATE_CONFIG_NAME) + with open(chat_template_file, "w", encoding="utf-8") as f: + json.dump(asdict(self.chat_template), f, ensure_ascii=False, indent=4) + logger.info("Chat-template config file saved in " + chat_template_file) + + +@six.add_metaclass(InitTrackerMeta) +class PretrainedTokenizer(ChatTemplateMixin, PretrainedTokenizerBase): + """ + Base class for all tokenizers. + + Inherits from [`~tokenizer_utils_base.PretrainedTokenizerBase`]. + + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading + pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the + specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + + - **resource_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each + vocabulary file required by the model, and as associated values, the filename for saving the associated file + (string). + - **pretrained_resource_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the + high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the + low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the + associated pretrained vocabulary file. + - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names` + of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, + or `None` if the model has no maximum input size. + - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the + `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to + pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer + with the [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`] method. + - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model. 
+ - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied. + Should be `'right'` or `'left'`. + - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation + applied. Should be `'right'` or `'left'`. + + Moreover, methods common to tokenizers for tokenization, token/id conversion + and encoding as model inputs are also provided here. + + Besides, metaclass `InitTrackerMeta` is used to create `PretrainedTokenizer`, + by which subclasses can track arguments for initialization automatically + and expose special tokens initialization used as attributes. + """ + + added_tokens_encoder: Dict[str, int] = {} + added_tokens_decoder: Dict[int, str] = {} + unique_no_split_tokens: List[str] = [] + tokens_trie = Trie() + + _decode_use_source_tokenizer = False + + def _pre_init(self, original_init, *args, **kwargs): + """ + It would be hooked before `__init__` to add specials tokens (arguments of + `__init__` whose name ends with `_token`) as attributes of the tokenizer + instance. + """ + init_dict = fn_args_to_dict(original_init, *((self,) + args), **kwargs) + init_dict.pop("self", None) + super(PretrainedTokenizer, self).__init__(**init_dict) + + self.added_tokens_encoder: Dict[str, int] = {} + self.added_tokens_decoder: Dict[int, str] = {} + self.unique_no_split_tokens: List[str] = [] + self.tokens_trie = Trie() + + self._decode_use_source_tokenizer = False + + def _build_special_tokens_map_extended(self, **kwargs): + for key, value in kwargs.items(): + if value is None: + continue + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" + assert all( + isinstance(t, (str, AddedToken)) for t in value + ), "One of the tokens is not a string or an AddedToken" + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") + + @property + def vocab_size(self) -> int: + """ + `int`: Size of the base vocabulary (without the added tokens). + """ + raise NotImplementedError + + @property + def is_fast(self) -> bool: + return False + + def get_added_vocab(self) -> Dict[str, int]: + """ + Returns the added tokens in the vocabulary as a dictionary of token to index. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return self.added_tokens_encoder + + def __len__(self): + """ + Size of the full vocabulary with the added tokens. + """ + return self.vocab_size + len(self.added_tokens_encoder) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. + + Args: + new_tokens (`List[str]`or `List[AddedToken]`): + Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by + checking if the tokenizer assign the index of the `unk_token` to them). + special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the tokens should be added as special tokens. + + Returns: + `int`: The number of tokens actually added to the vocabulary. 
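The class-attribute contract listed above is what a concrete tokenizer has to fill in; everything else (special-token bookkeeping, the added-token Trie, encoding) comes from the base classes. A rough, untested sketch of a minimal subclass, with made-up names (`ToyTokenizer`, `vocab.txt`):

```python
from paddlenlp.transformers import PretrainedTokenizer

class ToyTokenizer(PretrainedTokenizer):
    """Illustrative whitespace tokenizer -- not part of PaddleNLP."""

    resource_files_names = {"vocab_file": "vocab.txt"}   # __init__ kwarg -> saved filename
    pretrained_resource_files_map = {"vocab_file": {}}   # no hosted resources for this toy
    pretrained_init_configuration = {}

    def __init__(self, vocab_file, unk_token="[UNK]", **kwargs):
        # InitTrackerMeta records these arguments and exposes *_token kwargs as attributes
        self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def _tokenize(self, text):
        # the base class handles added/special tokens; only plain text reaches here
        return text.lower().split()
```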
+ + Examples: + + ```python + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertModel.from_pretrained("bert-base-uncased") + + num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) + print("We have added", num_added_toks, "tokens") + ```""" + new_tokens = [str(tok) for tok in new_tokens] + + tokens_to_add = [] + for token in new_tokens: + if not isinstance(token, str): + raise TypeError(f"Token {token} is not a string but a {type(token)}.") + if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in tokens_to_add + and token not in self.added_tokens_encoder.keys() + ): + tokens_to_add.append(token) + if self.verbose: + logger.info(f"Adding {token} to the vocabulary") + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.added_tokens_decoder.update(added_tok_decoder) + + # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) + if special_tokens: + if len(new_tokens) == 1: + _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) + else: + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) + else: + # Or on the newly added tokens + if len(tokens_to_add) == 1: + _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) + else: + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) + self._create_trie(self.unique_no_split_tokens) + + return len(tokens_to_add) + + def _create_trie(self, unique_no_split_tokens): + trie = Trie() + for token in unique_no_split_tokens: + if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens: + trie.add(token.lower()) + else: + trie.add(token) + self.tokens_trie = trie + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + """ + Performs any necessary transformations before tokenization. + + This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the + `kwargs` at the end of the encoding process to be sure all the arguments have been used. + + Args: + text (`str`): + The text to prepare. + is_split_into_words (`bool`, *optional*, defaults to `False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. + kwargs: + Keyword arguments to use for the tokenization. + + Returns: + `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. + """ + + return (text, kwargs) + + def tokenize(self, text: TextInput, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, using the tokenizer. + + Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies + (BPE/SentencePieces/WordPieces). Takes care of added tokens. + + Args: + text (`str`): + The sequence to be encoded. 
+ **kwargs (additional keyword arguments): + Passed along to the model-specific `prepare_for_tokenization` preprocessing method. + + Returns: + `List[str]`: The list of tokens. + """ + + split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens) + + # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors + all_special_tokens_extended = dict( + (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) + ) + + text, kwargs = self.prepare_for_tokenization(text, **kwargs) + + # TODO: should this be in the base class? + if hasattr(self, "do_lower_case") and self.do_lower_case: + # convert non-special tokens to lowercase + escaped_special_toks = [ + re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) + ] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) + + if split_special_tokens: + no_split_token = [] + tokens = [text] + else: + no_split_token = set(self.unique_no_split_tokens) # don't split on any of the added tokens + # "This is something else" + tokens = self.tokens_trie.split(text) + + # ["This is something", "", " else"] + for i, token in enumerate(tokens): + if token in no_split_token: + tok_extended = all_special_tokens_extended.get(token, None) + left = tokens[i - 1] if i > 0 else None + right = tokens[i + 1] if i < len(tokens) - 1 else None + if isinstance(tok_extended, AddedToken): + if tok_extended.rstrip and right: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + tokens[i + 1] = right.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and left: + tokens[i - 1] = left.rstrip() # Opposite here + else: + # We strip left and right by default + if right: + tokens[i + 1] = right.lstrip() + if left: + tokens[i - 1] = left.rstrip() + # ["This is something", "", "else"] + tokenized_text = [] + for token in tokens: + # Need to skip eventual empty (fully stripped) tokens + if not token: + continue + if token in no_split_token: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + # ["This", " is", " something", "", "else"] + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens): + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + + return self.vocab.to_indices(token) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (list of string) to a single string by + using ``' '.join(tokens)`` . + + Args: + tokens (list[str]): A sequence of tokens. + + Returns: + str: Converted string. 
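`tokenize` first cuts the text with the added-token Trie, strips the whitespace around those hits, and only then runs `_tokenize` on the remaining pieces, so added tokens survive intact. A hedged sketch using the same `bert-base-uncased` example as the `_add_tokens` docstring:

```python
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[MARK]"], special_tokens=True)

# "[MARK]" is matched by tokens_trie and never reaches the WordPiece step
print(tokenizer.tokenize("hello [MARK] world"))
# roughly: ['hello', '[MARK]', 'world']
```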
+ """ + return " ".join(tokens) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index): + + return self.vocab.to_tokens(index) + + @staticmethod + def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs): + """ + Instantiate an instance of `Vocab` from a file reserving all tokens + by using `Vocab.from_dict`. The file contains a token per line, and the + line number would be the index of corresponding token. + + Args: + filepath (str): path of file to construct vocabulary. + unk_token (str): special token for unknown token. If no need, it also + could be `None`. Defaults to `None`. + pad_token (str): special token for padding token. If no need, it also + could be `None`. Defaults to `None`. + bos_token (str): special token for bos token. If no need, it also + could be `None`. Defaults to `None`. + eos_token (str): special token for eos token. If no need, it also + could be `None`. Defaults to `None`. + **kwargs (dict): keyword arguments for `Vocab.from_dict`. + + Returns: + Vocab: An instance of `Vocab`. + """ + token_to_idx = {} + with io.open(filepath, "r", encoding="utf-8") as f: + for index, line in enumerate(f): + token = line.rstrip("\n") + token_to_idx[token] = int(index) + vocab = Vocab.from_dict( + token_to_idx, unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs + ) + return vocab + + @staticmethod + def save_vocabulary(filepath, vocab): + """ + Save all tokens to a vocabulary file. The file contains a token per line, + and the line number would be the index of corresponding token. + + Args: + filepath (str): File path to be saved to. + vocab (Vocab|dict): The `Vocab` or `dict` instance to be saved. + """ + if isinstance(vocab, Vocab): + tokens = vocab.idx_to_token + else: + tokens = sorted(vocab.keys(), key=lambda token: vocab[token]) + with io.open(filepath, "w", encoding="utf-8") as f: + for token in tokens: + f.write(token + "\n") + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``encode`` methods. + + Args: + token_ids_0 (List[int]): List of ids of the first sequence. + token_ids_1 (List[int], optional): List of ids of the second sequence. + already_has_special_tokens (bool, optional): Whether or not the token list is already + formatted with special tokens for the model. Defaults to None. + + Returns: + results (List[int]): The list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
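`load_vocabulary` / `save_vocabulary` treat the vocab file as one token per line with the line number as the id. A small round-trip sketch (file names are made up):

```python
import io

from paddlenlp.transformers import PretrainedTokenizer

with io.open("toy_vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(["[PAD]", "[UNK]", "hello", "world"]) + "\n")

vocab = PretrainedTokenizer.load_vocabulary(
    "toy_vocab.txt", unk_token="[UNK]", pad_token="[PAD]"
)
print(vocab.to_indices("hello"))                     # 2, i.e. its line number
PretrainedTokenizer.save_vocabulary("toy_vocab_copy.txt", vocab)
```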
+ ) + + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + def num_special_tokens_to_add(self, pair): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair (bool, optional): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. Defaults to `False`. + Returns: + int: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_position_ids: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + if is_split_into_words: + raise ValueError( + f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`." + ) + else: + raise ValueError( + f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
+ ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + if return_offsets_mapping: + kwargs["text"] = text + kwargs["text_pair"] = text_pair + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_position_ids=return_position_ids, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_position_ids: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_dict: bool = True, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." 
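Both `_encode_plus` and `_batch_encode_plus` accept three input forms through `get_input_ids`: a raw string, a list of words (only with `is_split_into_words=True`), or a list that is already tokens or ids. A rough illustration, where `tokenizer` is any concrete subclass such as the `BertTokenizer` above:

```python
# 1) raw string: tokenized, then converted to ids
tokenizer("paddlenlp is great")

# 2) pre-tokenized words: each word is sub-tokenized
tokenizer(["paddlenlp", "is", "great"], is_split_into_words=True)

# 3) a list of tokens is converted directly; a list of ints is passed through
tokenizer.convert_tokens_to_ids(["hello", "world"])
```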
+ ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if not isinstance(ids_or_pair_ids, (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + else: + ids, pair_ids = ids_or_pair_ids + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + if stride > 0 and second_ids is not None: + kwargs["batch_text_or_text_pairs"] = batch_text_or_text_pairs + else: + if return_offsets_mapping: + has_pair = False + if len(batch_text_or_text_pairs) > 0: + if isinstance(batch_text_or_text_pairs[0], (list, tuple)): + has_pair = True + kwargs["texts"] = None + kwargs["text_pairs"] = None + if has_pair: + kwargs["texts"] = [text[0] for text in batch_text_or_text_pairs] + kwargs["text_pairs"] = [text[1] for text in batch_text_or_text_pairs] + else: + kwargs["texts"] = [text for text in batch_text_or_text_pairs] + + batch_outputs = self._batch_prepare_for_model( + input_ids, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_position_ids=return_position_ids, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_dict=return_dict, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + **kwargs, + ) + + return batch_outputs + + def _batch_prepare_for_model( + self, + batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_position_ids: Optional[bool] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_dict: bool = True, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + """ + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." 
+ ) + + batch_outputs = {} + batch_outputs_list = [] + for example_id, (first_ids, second_ids) in enumerate(batch_ids_pairs): + if stride > 0 and second_ids is not None: + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + max_len_for_pair = ( + max_length + - len(first_ids) + - (self.num_special_tokens_to_add(pair=True) if add_special_tokens else 0) + ) + + text, text_pair = kwargs["batch_text_or_text_pairs"][example_id] + token_offset_mapping = self.get_offset_mapping(text) + token_pair_offset_mapping = self.get_offset_mapping(text_pair) + + offset = 0 + while offset < len(second_ids): + encoded_inputs = {} + length = len(second_ids) - offset + if length > max_len_for_pair: + length = max_len_for_pair + + ids = first_ids + pair_ids = second_ids[offset : offset + length] + pair = bool(pair_ids is not None) + mapping = token_offset_mapping + pair_mapping = token_pair_offset_mapping[offset : offset + length] + if add_special_tokens: + offset_mapping = self.build_offset_mapping_with_special_tokens(mapping, pair_mapping) + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + offset_mapping = mapping + pair_mapping + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + encoded_inputs["offset_mapping"] = offset_mapping + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + if return_position_ids: + encoded_inputs["position_ids"] = list(range(len(encoded_inputs["input_ids"]))) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + encoded_inputs["seq_len"] = encoded_inputs["length"] + + encoded_inputs["overflow_to_sample"] = example_id + + for key, value in encoded_inputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + if offset + length == len(second_ids): + break + offset += min(length, stride) + else: + if return_offsets_mapping: + kwargs["text"] = kwargs["texts"][example_id] + kwargs["text_pair"] = None + if kwargs["text_pairs"] is not None: + kwargs["text_pair"] = kwargs["text_pairs"][example_id] + + encoded_inputs = self.prepare_for_model( + first_ids, + second_ids, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + return_position_ids=return_position_ids, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + 
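When `stride > 0` and a second sequence is present, `_batch_prepare_for_model` slides a window over the second sequence: each window holds at most `max_len_for_pair` ids and the start advances by `min(length, stride)`. A standalone sketch of just that arithmetic, with toy numbers:

```python
def pair_windows(second_len, max_len_for_pair, stride):
    """Yield (start, end) slices of the second sequence, mirroring the loop above."""
    offset = 0
    while offset < second_len:
        length = min(second_len - offset, max_len_for_pair)
        yield offset, offset + length
        if offset + length == second_len:
            break
        offset += min(length, stride)

# a 300-token context, 128 ids of room per window, stride 64
print(list(pair_windows(300, 128, 64)))
# [(0, 128), (64, 192), (128, 256), (192, 300)]
```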
prepend_batch_axis=False, + verbose=verbose, + **kwargs, + ) + for key, value in encoded_inputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + if return_dict: + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + return batch_outputs + else: + for k, v in batch_outputs.items(): + for i in range(len(v)): + if i >= len(batch_outputs_list): + batch_outputs_list.append({k: v[i]}) + else: + batch_outputs_list[i][k] = v[i] + return batch_outputs_list + + def _get_bert_like_offset_mapping(self, text: str): + """ + Returns the map of tokens and the start and end index of their start and end character. + Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372 + Args: + text (str): + Input text. + Returns: + list: The offset map of input text. + + """ + if text is None: + return None + split_tokens = self.tokenize(text) + + normalized_text, char_mapping = "", [] + + for i, ch in enumerate(text): + if hasattr(self, "do_lower_case") and self.do_lower_case: + ch = ch.lower() + if self.basic_tokenizer.strip_accents is not False: + ch = unicodedata.normalize("NFD", ch) + ch = "".join([c for c in ch if unicodedata.category(c) != "Mn"]) + elif self.basic_tokenizer.strip_accents: + ch = unicodedata.normalize("NFD", ch) + ch = "".join([c for c in ch if unicodedata.category(c) != "Mn"]) + + ch = "".join([c for c in ch if not (ord(c) == 0 or ord(c) == 0xFFFD or _is_control(c))]) + normalized_text += ch + + char_mapping.extend([i] * len(ch)) + text, token_mapping, offset = normalized_text, [], 0 + + char_mapping_indexes = [] + for index, token in enumerate(split_tokens): + if token[:2] == "##": + token = token[2:] + if token in self.all_special_tokens: + token = token.lower() if hasattr(self, "do_lower_case") and self.do_lower_case else token + # The greek letter "sigma" has 2 forms of lowercase, σ and ς respectively. + # When used as a final letter of a word, the final form (ς) is used. Otherwise, the form (σ) is used. 
+ # https://latin.stackexchange.com/questions/6168/how-and-when-did-we-get-two-forms-of-sigma + if "σ" in token or "ς" in token: + start = text[offset:].replace("ς", "σ").index(token.replace("ς", "σ")) + offset + else: + + # try to fix: https://github.com/PaddlePaddle/PaddleNLP/issues/3985 + if token not in text[offset:]: + # check whether there are consecutive UNK tokens, eg: ['好', '[UNK]', '[UNK]', 'good'] + if index < len(split_tokens) - 1 and split_tokens[index + 1] in self.all_special_tokens: + start = offset + token = " " # only contains one char + else: + start = -1 + else: + start = text[offset:].index(token) + offset + + end = start + len(token) + char_mapping_indexes.append([start, end]) + + if start != -1: + offset = end + + token_mapping = [] + for index, (start, end) in enumerate(char_mapping_indexes): + if start == -1: + # init start + if index == 0: + start = 0 + else: + start = char_mapping_indexes[index - 1][1] + + # init end + if index == len(char_mapping_indexes) - 1: + end = len(char_mapping) + else: + # next start + end = char_mapping_indexes[index + 1][0] + + token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1)) + + return token_mapping + + def get_offset_mapping(self, text: str, split_tokens: Optional[List[str]] = None): + """ + Returns the map of tokens and the start and end index of their start and end character. + Modified from https://github.com/bojone/bert4keras/blob/master/bert4keras/tokenizers.py#L372 + Args: + text (str): + Input text. + split_tokens (Optional[List[str]]): + the tokens which has been split which can accelerate the operation. + + Returns: + list: The offset map of input text. + + """ + if text is None: + return None + split_tokens = self.tokenize(text) + + # bert-like tokenizer use the old-school code block + if hasattr(self, "basic_tokenizer") or hasattr(self, "wordpiece_tokenizer"): + return self._get_bert_like_offset_mapping(text) + + if not split_tokens: + split_tokens = self.tokenize(text) + + normalized_text, char_mapping = "", [] + + for i, ch in enumerate(text): + normalized_text += normalize_chars(ch) + char_mapping.extend([i] * len(ch)) + + text, token_mapping, offset = normalized_text, [], 0 + do_lower_case = getattr(self, "do_lower_case", False) + + # lower the text if the token is lower-cased + # keep align with token + if do_lower_case: + text = text.lower() + + char_mapping_indexes = [] + for token in split_tokens: + + # convert tokens into original string + token: str = self.convert_tokens_to_string(token).strip() + + if token in self.all_special_tokens: + if do_lower_case: + token = token.lower() + + # The greek letter "sigma" has 2 forms of lowercase, σ and ς respectively. + # When used as a final letter of a word, the final form (ς) is used. Otherwise, the form (σ) is used. 
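+            # (Illustrative note) Replacing "ς" with "σ" on both the normalized text and the
+            # token below makes the offset search insensitive to which sigma form appears, so
+            # a token ending in "ς" can still be located in a text that stores "σ".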
+ # https://latin.stackexchange.com/questions/6168/how-and-when-did-we-get-two-forms-of-sigma + if "σ" in token or "ς" in token: + start = text[offset:].replace("ς", "σ").index(token.replace("ς", "σ")) + offset + else: + + # try to fix: https://github.com/PaddlePaddle/PaddleNLP/issues/3985 + if token not in text[offset:]: + start = -1 + else: + start = text[offset:].index(token) + offset + + end = start + len(token) + char_mapping_indexes.append([start, end]) + + if start != -1: + offset = end + + token_mapping = [] + for index, (start, end) in enumerate(char_mapping_indexes): + if start == -1: + # init start + if index == 0: + start = 0 + else: + start = char_mapping_indexes[index - 1][1] + + # init end + if index == len(char_mapping_indexes) - 1: + end = len(char_mapping) + else: + # next start + end = char_mapping_indexes[index + 1][0] + + token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1)) + + return token_mapping + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + spaces_between_special_tokens: bool = True, + **kwargs + ) -> str: + if isinstance(token_ids, np.ndarray): + token_ids = token_ids.tolist() + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + if spaces_between_special_tokens: + text = " ".join(sub_texts) + else: + text = "".join(sub_texts) + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def decode_token( + self, + all_input_ids: List[int], + prefix_offset: int = 0, + read_offset: int = 0, + ) -> Tuple[str, int, int]: + """tokenizer decoding for the streaming generation use case. This method can be overrided for tokenizer that doesn't follow this API""" + # The prefix text is necessary only to defeat cleanup algorithms in the decode + # which decide to add a space or not depending on the surrounding ids. + prefix_text = self.decode(all_input_ids[prefix_offset:read_offset], skip_special_tokens=False) + new_text = self.decode(all_input_ids[prefix_offset:], skip_special_tokens=False) + + if len(new_text) > len(prefix_text) and not new_text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. + # If it's in the middle, it's probably a real invalid id generated + # by the model + prefix_index = new_text.index(prefix_text) + new_text = new_text[prefix_index + len(prefix_text) :] + return new_text, read_offset, len(all_input_ids) + else: + return "", prefix_offset, read_offset + + +class BPETokenizer(PretrainedTokenizer): + """ + The base class for all bpe tokenizers. It mainly provides common tokenize + methods for bpe type tokenizer. 
+ + Args: + vocab_file (str): + file path of the vocabulary. + encoder_json_path (str, optional): + file path of the id to vocab. + vocab_bpe_path (str, optional): + file path of word merge text. + unk_token (str, optional): + The special token for unknown words. + Defaults to "[UNK]". + sep_token (str, optional): + The special token for separator token. + Defaults to "[SEP]". + pad_token (str, optional): + The special token for padding. + Defaults to "[PAD]". + cls_token (str, optional): + The special token for cls. + Defaults to "[CLS]". + mask_token (str, optional): + The special token for mask. + Defaults to "[MASK]". + + """ + + class Encoder(object): + def __init__(self, encoder, bpe_merges, errors="replace", special_tokens=["[SEP]", "[p]", "[q]", "[/q]"]): + self.encoder = encoder + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = self._bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.re = try_import("regex") + self.special_tokens = special_tokens + + # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = self.re.compile( + r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" + ) + + @lru_cache() + def _bytes_to_unicode(self): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + + cs = [chr(n) for n in cs] + + return dict(zip(bs, cs)) + + def _get_pairs(self, word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
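+            For example (an illustrative case), ``_get_pairs(("l", "o", "w"))`` returns
+            ``{("l", "o"), ("o", "w")}``.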
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = self._get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa: E722 + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = self._get_pairs(word) + word = " ".join(word) + self.cache[token] = word + + return word + + def tokenize(self, text): + tokens = text.split(" ") + sub_tokens = [] + for token_i, token in enumerate(tokens): + if self.is_special_token(token): + if token_i == 0: + sub_tokens.extend([token]) + else: + sub_tokens.extend([" " + token]) + else: + if token_i == 0: + sub_tokens.extend(self.re.findall(self.pat, token)) + else: + sub_tokens.extend(self.re.findall(self.pat, " " + token)) + return sub_tokens + + def tokenize_old(self, text): + return self.re.findall(self.pat, text) + + def is_special_token(self, tok): + if isinstance(tok, int): + return False + res = False + for t in self.special_tokens: + # if tok.find(t) != -1: + if tok.strip() == t: + res = True + break + return res + + def tokenize_bpe(self, token): + + if self.is_special_token(token): + return [token.strip()] # remove space for convert_to_ids + else: + + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + return [self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")] + + def encode(self, text): + bpe_tokens = [] + for token in self.tokenize(text): + bpe_tokens.extend(self.tokenize_bpe(token)) + return bpe_tokens + + def decode(self, tokens): + pre_token_i = 0 + texts = [] + for token_i, token in enumerate(tokens): + if self.is_special_token(token): + # proprecess tokens before token_i + if token_i - pre_token_i > 0: + text = "".join([self.decoder[int(tok)] for tok in tokens[pre_token_i:token_i]]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + texts.append(text) + # texts.append(token) + if token_i == 0: + texts.append(token) # in the beginning, there is no space before special tokens + else: + texts.extend([" ", token]) # in middle sentence, there must be a space before special tokens + pre_token_i = token_i + 1 + + if pre_token_i < len(tokens): + text = "".join([self.decoder[int(tok)] for tok in tokens[pre_token_i:]]) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + texts.append(text) + + return "".join(texts) + + def __init__( + self, + vocab_file, + encoder_json_path="./configs/encoder.json", + vocab_bpe_path="./configs/vocab.bpe", + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + ): + self.vocab = self.load_vocabulary( + vocab_file, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, mask_token=mask_token + ) + self.encoder_json_path = encoder_json_path + self.vocab_bpe_path = vocab_bpe_path + self.encoder = self._get_encoder(encoder_json_path, 
vocab_bpe_path) + self.nltk = try_import("nltk") + + def _tokenize(self, text, is_sentencepiece=True): + text = convert_to_unicode(text) + text = " ".join(text.split()) # remove duplicate whitespace + if is_sentencepiece: + sents = self.nltk.tokenize.sent_tokenize(text) + bpe_ids = sum([self.encoder.encode(sent) for sent in sents], []) + else: + bpe_ids = self.encoder.encode(text) + tokens = [str(bpe_id) for bpe_id in bpe_ids] + return tokens + + def _get_encoder(self, encoder_json_path, vocab_bpe_path): + with open(encoder_json_path, "r") as f: + encoder = json.load(f) + with open(vocab_bpe_path, "r", encoding="utf-8") as f: + bpe_data = f.read() + bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]] + + return self.Encoder( + encoder=encoder, + bpe_merges=bpe_merges, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_base.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_base.py new file mode 100644 index 000000000..5c7909d7a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_base.py @@ -0,0 +1,3363 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import io +import json +import os +import shutil +import tempfile +import warnings +from collections import UserDict +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union + +import aistudio_sdk +import numpy as np +import paddle +from huggingface_hub import ( + create_repo, + get_hf_file_metadata, + hf_hub_url, + repo_type_and_id_from_hf_id, + upload_folder, +) +from huggingface_hub.utils import EntryNotFoundError + +from ..utils.download import resolve_file_path +from ..utils.env import CHAT_TEMPLATE_CONFIG_NAME, TOKENIZER_CONFIG_NAME +from ..utils.import_utils import is_tokenizers_available +from ..utils.log import logger + +if is_tokenizers_available(): + from tokenizers import AddedToken + from tokenizers import Encoding as EncodingFast +else: + + @dataclass(frozen=False, eq=True) + class AddedToken: + """ + AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the + way it should behave. + The `normalized` will default to `not special` if it is not specified, similarly to the definition in + `tokenizers`. 
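+
+        A minimal illustrative sketch of the default:
+
+        ```python
+        mask = AddedToken("[MASK]", lstrip=True, special=True)
+        assert mask.normalized is False  # `normalized` defaults to `not special`
+        ```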
+ """ + + def __init__( + self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None + ): + self.content = content + self.single_word = single_word + self.lstrip = lstrip + self.rstrip = rstrip + self.special = special + self.normalized = normalized if normalized is not None else not special + + def __getstate__(self): + return self.__dict__ + + def __str__(self): + return self.content + + def __repr__(self) -> str: + return f"AddedToken(content={self.content}, single_word={self.single_word}, lstrip={self.lstrip}, rstrip={self.rstrip}, special={self.special}, normalized={self.normalized})" + + @dataclass + class EncodingFast: + """This is dummy class reserved for fast tokenizer""" + + pass + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) + + +class PaddingStrategy(ExplicitEnum): + """ + Possible values for the `padding` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in an + IDE. + """ + + LONGEST = "longest" + MAX_LENGTH = "max_length" + DO_NOT_PAD = "do_not_pad" + + +class TensorType(ExplicitEnum): + """ + Possible values for the `return_tensors` argument in [`PretrainedTokenizerBase.__call__`]. Useful for + tab-completion in an IDE. + """ + + PADDLE = "pd" + NUMPY = "np" + + +VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input +LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER + +# Define type aliases and NamedTuples +TextInput = str +PreTokenizedInput = List[str] +EncodedInput = List[int] +TextInputPair = Tuple[str, str] +PreTokenizedInputPair = Tuple[List[str], List[str]] +EncodedInputPair = Tuple[List[int], List[int]] + +# Slow tokenizers used to be saved in three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +def to_py_obj(obj): + """ + Convert a Paddle tensor, Numpy array or python list to a python list. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return [to_py_obj(o) for o in obj] + elif isinstance(obj, paddle.Tensor): + return obj.numpy().tolist() + elif isinstance(obj, (np.ndarray, np.number)): # tolist also works on 0d np arrays + return obj.tolist() + else: + return obj + + +def _is_numpy(x): + return isinstance(x, np.ndarray) + + +class TruncationStrategy(ExplicitEnum): + """ + Possible values for the `truncation` argument in [`PretrainedTokenizerBase.__call__`]. Useful for tab-completion in + an IDE. + """ + + ONLY_FIRST = "only_first" + ONLY_SECOND = "only_second" + LONGEST_FIRST = "longest_first" + DO_NOT_TRUNCATE = "do_not_truncate" + + +class CharSpan(NamedTuple): + """ + Character span in the original string. + + Args: + start (`int`): Index of the first character in the original string. + end (`int`): Index of the character following the last character in the original string. + """ + + start: int + end: int + + +class TokenSpan(NamedTuple): + """ + Token span in an encoded string (list of tokens). + + Args: + start (`int`): Index of the first token in the span. + end (`int`): Index of the token following the last token in the span. 
+ """ + + start: int + end: int + + +class BatchEncoding(UserDict): + """ + Holds the output of the [`PretrainedTokenizerBase.__call__`], + [`PretrainedTokenizerBase.encode_plus`] and + [`PretrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc). + + This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes + utility methods to map from word/character space to token space. + + Args: + data (`dict`): + Dictionary of lists/arrays/tensors returned by the `__call__`/`encode`/`batch_encode` methods + ('input_ids', 'attention_mask', etc.). + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in Paddle/Numpy Tensors at + initialization. + prepend_batch_axis (`bool`, *optional*, defaults to `False`): + Whether or not to add a batch axis when converting to tensors (see `tensor_type` above). + """ + + def __init__( + self, + data: Optional[Dict[str, Any]] = None, + encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, + tensor_type: Union[None, str] = None, + prepend_batch_axis: bool = False, + n_sequences: Optional[int] = None, + ): + super().__init__(data) + + if isinstance(encoding, EncodingFast): + encoding = [encoding] + + self._encodings = encoding + + if n_sequences is None and encoding is not None and len(encoding): + n_sequences = encoding[0].n_sequences + + self._n_sequences = n_sequences + + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + + @property + def n_sequences(self) -> Optional[int]: + """ + `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this + [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of + sentences) + """ + return self._n_sequences + + @property + def is_fast(self) -> bool: + """ + `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PretrainedFastTokenizer`] + or not. + """ + return self._encodings is not None + + def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: + """ + If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', + etc.). + + If the key is an integer, get the `Encoding` for batch item with index `key`. + """ + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError( + "Indexing with integers is not available when using tokenizer.__call__()" + " with return_dict=True. Please set return_dict to False to use integer indexing." + ) + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data, "encodings": self._encodings} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + if "encodings" in state: + self._encodings = state["encodings"] + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + # After this point: + # Extended properties and methods only available for fast tokenizers + # not yet supported + + @property + def encodings(self) -> Optional[List[EncodingFast]]: + """ + `Optional[List[EncodingFast]]`: The list all encodings from the tokenization process. 
Returns `None` if + the input was tokenized through Python (i.e., not a fast) tokenizer. + """ + return self._encodings + + def tokens(self, batch_index: int = 0) -> List[str]: + """ + Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to + integer indices) at a given batch index (only works for the output of a fast tokenizer). + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[str]`: The list of tokens at that index. + """ + if not self._encodings: + raise ValueError("tokens() is not available when using Python-based tokenizers") + return self._encodings[batch_index].tokens + + def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to the id of their original sentences: + + - `None` for special tokens added around or between sequences, + - `0` for tokens corresponding to words in the first sequence, + - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly + encoded. + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added + by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding + sequence. + """ + if not self._encodings: + raise ValueError("sequence_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].sequence_ids + + def words(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the + tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word + (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError("words() is not available when using Python-based tokenizers") + warnings.warn( + "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.word_ids()` property.", + FutureWarning, + ) + return self.word_ids(batch_index) + + def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the + tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word + (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError("word_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].word_ids + + def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the sequence represented by the given token. 
In the general use case, this method returns `0` + for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair + + Can be called as: + + - `self.token_to_sequence(token_index)` if batch size is 1 + - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the token in the sequence. + token_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the + sequence. + + Returns: + `int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_sequence() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_sequence(token_index) + + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. + + Can be called as: + + - `self.token_to_word(token_index)` if batch size is 1 + - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the + sequence. + + Returns: + `int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_word() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_word(token_index) + + def word_to_tokens( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> Optional[TokenSpan]: + """ + Get the encoded token span corresponding to a word in a sequence of the batch. + + Token spans are returned as a [`TokenSpan`] with: + + - **start** -- Index of the first token. + - **end** -- Index of the token following the last token. 
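+
+        For example (illustrative), a word split into three sub-tokens starting at token
+        index 5 maps to ``TokenSpan(start=5, end=8)``.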
+ + Can be called as: + + - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1 + - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to + 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_word_index (`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the word in the sequence. + word_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the + sequence. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + Optional [`TokenSpan`] Span of tokens in the encoded sequence. Returns `None` if + no tokens correspond to the word. + """ + + if not self._encodings: + raise ValueError("word_to_tokens() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if word_index < 0: + word_index = self._seq_len + word_index + span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) + return TokenSpan(*span) if span is not None else None + + def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: + """ + Get the character span corresponding to an encoded token in a sequence of the batch. + + Character spans are returned as a [`CharSpan`] with: + + - **start** -- Index of the first character in the original string associated to the token. + - **end** -- Index of the character following the last character in the original string associated to the + token. + + Can be called as: + + - `self.token_to_chars(token_index)` if batch size is 1 + - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1 + + Args: + batch_or_token_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in + the sequence. + + Returns: + [`CharSpan`]: Span of characters in the original string. + """ + + if not self._encodings: + raise ValueError("token_to_chars() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) + + def char_to_token( + self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 + ) -> int: + """ + Get the index of the token in the encoded output comprising a character in the original string for a sequence + of the batch. + + Can be called as: + + - `self.char_to_token(char_index)` if batch size is 1 + - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. 
words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + char_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the + sequence. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + `int`: Index of the token. + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_token(char_index, sequence_index) + + def word_to_chars( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> CharSpan: + """ + Get the character span in the original string corresponding to given word in a sequence of the batch. + + Character spans are returned as a CharSpan NamedTuple with: + + - start: index of the first character in the original string + - end: index of the character following the last character in the original string + + Can be called as: + + - `self.word_to_chars(word_index)` if batch size is 1 + - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1 + + Args: + batch_or_word_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + word_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the + sequence. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan + are NamedTuple with: + + - start: index of the first character associated to the token in the original string + - end: index of the character following the last character associated to the token in the original + string + """ + + if not self._encodings: + raise ValueError("word_to_chars() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) + + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: + """ + Get the word in the original string corresponding to a character in the original string of a sequence of the + batch. + + Can be called as: + + - `self.char_to_word(char_index)` if batch size is 1 + - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. 
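+
+        For example (illustrative), ``encoding.char_to_word(0, 7)`` returns the index of the
+        word containing character 7 of the first sequence in the batch.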
+ + Args: + batch_or_char_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the character in the original string. + char_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the + original string. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + `int` or `List[int]`: Index or indices of the associated encoded token(s). + """ + + if not self._encodings: + raise ValueError("char_to_word() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_word(char_index, sequence_index) + + def convert_to_tensors( + self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False + ): + """ + Convert the inner content to tensors. + + Args: + tensor_type (`str` or [`TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`TensorType`]. If + `None`, no modification is done. + prepend_batch_axis (`int`, *optional*, defaults to `False`): + Whether or not to add the batch dimension during the conversion. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + # Get a function reference for the correct framework + if tensor_type == TensorType.PADDLE: + as_tensor = paddle.to_tensor + is_tensor = paddle.is_tensor + else: + as_tensor = np.asarray + is_tensor = _is_numpy + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if prepend_batch_axis: + value = [value] + + if not is_tensor(value): + tensor = as_tensor(value) + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_tokens": + raise ValueError( + "Unable to create tensor returning overflowing tokens of different lengths. " + "Please see if a fast version of this tokenizer is available to have this feature available." + ) + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding " + "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + ) + + return self + + +class SpecialTokensMixin: + """ + A mixin derived by [`PretrainedTokenizer`] to handle specific behaviors related to + special tokens. In particular, this class hold the attributes which can be used to directly access these special + tokens in a model-independent manner and allow to set and update the special tokens. + + Args: + bos_token (`str` or `AddedToken`, *optional*): + A special token representing the beginning of a sentence. + eos_token (`str` or `AddedToken`, *optional*): + A special token representing the end of a sentence. + unk_token (`str` or `AddedToken`, *optional*): + A special token representing an out-of-vocabulary token. + sep_token (`str` or `AddedToken`, *optional*): + A special token separating two different sentences in the same input (used by BERT for instance). + pad_token (`str` or `AddedToken`, *optional*): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. 
+ cls_token (`str` or `AddedToken`, *optional*): + A special token representing the class of the input (used by BERT for instance). + mask_token (`str` or `AddedToken`, *optional*): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). + additional_special_tokens (tuple or list of `str` or `AddedToken`, *optional*): + A tuple or a list of additional special tokens. + """ + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, verbose=True, **kwargs): + # note(guosheng): Since `__init__` might be called multiple times which + # is hooked before `PretrainedTokenizer` init, we do not set to None as + # HF to avoid unintentional overriding. + self._bos_token = getattr(self, "_bos_token", None) + self._eos_token = getattr(self, "_eos_token", None) + self._unk_token = getattr(self, "_unk_token", None) + self._sep_token = getattr(self, "_sep_token", None) + self._pad_token = getattr(self, "_pad_token", None) + self._cls_token = getattr(self, "_cls_token", None) + self._mask_token = getattr(self, "_mask_token", None) + self._pad_token_type_id = getattr(self, "_pad_token_type_id", 0) + self._additional_special_tokens = getattr(self, "_additional_special_tokens", []) + self.verbose = verbose + + # We directly set the hidden value to allow initialization with special tokens + # which are not yet in the vocabulary. Necessary for serialization/de-serialization + # TODO clean this up at some point (probably by switching to fast tokenizers) + for key, value in kwargs.items(): + if value is None: + continue + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" + assert all( + isinstance(t, (str, AddedToken)) for t in value + ), "One of the tokens is not a string or an AddedToken" + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") + + def sanitize_special_tokens(self) -> int: + """ + Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`, + `tokenizer.cls_token`, etc.) are in the vocabulary. + + Add the missing ones to the vocabulary if needed. + + Return: + `int`: The number of tokens added in the vocabulary during the operation. + """ + return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) + + def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: + """ + Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If + special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the + current vocabulary). + + Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding + matrix of the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - Special tokens are carefully handled by the tokenizer (they are never split). 
+ - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This + makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (for instance + [`BertTokenizer`] `cls_token` is already registered to be :obj*'[CLS]'* and XLM's one is also registered to be + `''`). + + Args: + special_tokens_dict (dictionary *str* to *str* or `AddedToken`): + Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`, + `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer + assign the index of the `unk_token` to them). + + Returns: + `int`: Number of tokens added to the vocabulary. + + Examples: + + ```python + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + model = GPT2Model.from_pretrained("gpt2") + + special_tokens_dict = {"cls_token": ""} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print("We have added", num_added_toks, "tokens") + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + + assert tokenizer.cls_token == "" + ```""" + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" + + if self.verbose: + logger.info(f"Assigning {value} to the {key} key of the tokenizer") + setattr(self, key, value) + + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, (str, AddedToken)) for t in value + ), f"Tokens {value} for key {key} should all be str or AddedToken instances" + added_tokens += self.add_tokens(value, special_tokens=True) + else: + assert isinstance( + value, (str, AddedToken) + ), f"Token {value} for key {key} should be a str or an AddedToken instance" + added_tokens += self.add_tokens([value], special_tokens=True) + + return added_tokens + + def add_tokens( + self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False + ) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. + + Note,None When adding new tokens to the vocabulary, you should make sure to also resize the token embedding + matrix of the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. + + Args: + new_tokens (`str`, `AddedToken` or a list of *str* or `AddedToken`): + Tokens are only added if they are not already in the vocabulary. `AddedToken` wraps a string + token to let you personalize its behavior: whether this token should only match against a single word, + whether this token should strip all potential whitespaces on the left side, whether this token should + strip all potential whitespaces on the right side, etc. + special_tokens (`bool`, *optional*, defaults to `False`): + Can be used to specify if the token is a special token. 
This mostly change the normalization behavior + (special tokens like CLS or [MASK] are usually not lower-cased for instance). + + Returns: + `int`: Number of tokens added to the vocabulary. + + Examples: + + ```python + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + model = BertModel.from_pretrained("bert-base-uncased") + + num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) + print("We have added", num_added_toks, "tokens") + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + ```""" + if not new_tokens: + return 0 + + if not isinstance(new_tokens, (list, tuple)): + new_tokens = [new_tokens] + + return self._add_tokens(new_tokens, special_tokens=special_tokens) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + raise NotImplementedError + + @property + def bos_token(self) -> str: + """ + `str`: Beginning of sentence token. Log an error if used while not having been set. + """ + if self._bos_token is None and self.verbose: + logger.error("Using bos_token, but it is not set yet.") + return None + return str(self._bos_token) + + @property + def eos_token(self) -> str: + """ + `str`: End of sentence token. Log an error if used while not having been set. + """ + if self._eos_token is None and self.verbose: + logger.error("Using eos_token, but it is not set yet.") + return None + return str(self._eos_token) + + @property + def unk_token(self) -> str: + """ + `str`: Unknown token. Log an error if used while not having been set. + """ + if self._unk_token is None and self.verbose: + logger.error("Using unk_token, but it is not set yet.") + return None + return str(self._unk_token) + + @property + def sep_token(self) -> str: + """ + `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not + having been set. + """ + if self._sep_token is None and self.verbose: + logger.error("Using sep_token, but it is not set yet.") + return None + return str(self._sep_token) + + @property + def pad_token(self) -> str: + """ + `str`: Padding token. Log an error if used while not having been set. + """ + if self._pad_token is None and self.verbose: + logger.error("Using pad_token, but it is not set yet.") + return None + return str(self._pad_token) + + @property + def cls_token(self) -> str: + """ + `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full + depth of the model. Log an error if used while not having been set. + """ + if self._cls_token is None and self.verbose: + logger.error("Using cls_token, but it is not set yet.") + return None + return str(self._cls_token) + + @property + def mask_token(self) -> str: + """ + `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not + having been set. + """ + if self._mask_token is None and self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @property + def additional_special_tokens(self) -> List[str]: + """ + `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been + set. 
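+
+        For example (illustrative), a tokenizer created with
+        ``additional_special_tokens=["<sep>", "<cls>"]`` returns ``["<sep>", "<cls>"]`` here.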
+ """ + if self._additional_special_tokens is None and self.verbose: + logger.error("Using additional_special_tokens, but it is not set yet.") + return None + return [str(tok) for tok in self._additional_special_tokens] + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token has not + been set. + """ + if self._bos_token is None: + return None + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been + set. + """ + if self._eos_token is None: + return None + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been set. + """ + if self._unk_token is None: + return None + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input + sequence. Returns `None` if the token has not been set. + """ + if self._sep_token is None: + return None + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set. + """ + if self._pad_token is None: + return None + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self) -> int: + """ + `int`: Id of the padding token type in the vocabulary. + """ + return self._pad_token_type_id + + @property + def cls_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input sequence + leveraging self-attention along the full depth of the model. + + Returns `None` if the token has not been set. + """ + if self._cls_token is None: + return None + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self) -> Optional[int]: + """ + `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language + modeling. Returns `None` if the token has not been set. + """ + if self._mask_token is None: + return None + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self) -> List[int]: + """ + `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not having + been set. 
+ """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + @bos_token_id.setter + def bos_token_id(self, value): + self._bos_token = self.convert_ids_to_tokens(value) if value is not None else None + + @eos_token_id.setter + def eos_token_id(self, value): + self._eos_token = self.convert_ids_to_tokens(value) if value is not None else None + + @unk_token_id.setter + def unk_token_id(self, value): + self._unk_token = self.convert_ids_to_tokens(value) if value is not None else None + + @sep_token_id.setter + def sep_token_id(self, value): + self._sep_token = self.convert_ids_to_tokens(value) if value is not None else None + + @pad_token_id.setter + def pad_token_id(self, value): + self._pad_token = self.convert_ids_to_tokens(value) if value is not None else None + + @cls_token_id.setter + def cls_token_id(self, value): + self._cls_token = self.convert_ids_to_tokens(value) if value is not None else None + + @mask_token_id.setter + def mask_token_id(self, value): + self._mask_token = self.convert_ids_to_tokens(value) if value is not None else None + + @additional_special_tokens_ids.setter + def additional_special_tokens_ids(self, values): + self._additional_special_tokens = [self.convert_ids_to_tokens(value) for value in values] + + @property + def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: + """ + `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`, + `unk_token`, etc.) to their values (`''`, `''`, etc.). + + Convert potential tokens of `AddedToken` type to string. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = ( + type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value) + if isinstance(attr_value, (list, tuple)) + else str(attr_value) + ) + return set_attr + + @property + def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]: + """ + `Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]`: A dictionary mapping + special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.). + + Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how + special tokens are tokenized. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self) -> List[str]: + """ + `List[str]`: All the special tokens (`''`, `''`, etc.) mapped to class attributes. + + Convert tokens of `AddedToken` type to string. + """ + all_toks = [str(s) for s in self.all_special_tokens_extended] + return all_toks + + @property + def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: + """ + `List[Union[str, AddedToken]]`: All the special tokens (`''`, `''`, etc.) mapped to class + attributes. + + Don't convert tokens of `AddedToken` type to string so they can be used to control more finely how + special tokens are tokenized. 
+ """ + all_tokens = [] + seen = set() + for value in self.special_tokens_map_extended.values(): + if isinstance(value, (list, tuple)): + tokens_to_add = [token for token in value if str(token) not in seen] + else: + tokens_to_add = [value] if str(value) not in seen else [] + seen.update(map(str, tokens_to_add)) + all_tokens.extend(tokens_to_add) + return all_tokens + + @property + def all_special_ids(self) -> List[int]: + """ + `List[int]`: List the ids of the special tokens(`''`, `''`, etc.) mapped to class attributes. + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + +class PretrainedTokenizerBase(SpecialTokensMixin): + """ + Base class for [`PretrainedTokenizer`]. + + Class attributes (overridden by derived classes) + + - **resource_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each + vocabulary file required by the model, and as associated values, the filename for saving the associated file + (string). + - **pretrained_resource_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the + high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the + low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the + associated pretrained vocabulary file. + - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the `short-cut-names` + of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, + or `None` if the model has no maximum input size. + - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the + `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments to + pass to the `__init__` method of the tokenizer class for this pretrained model when loading the tokenizer + with the [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`] method. + - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model. + - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied. + Should be `'right'` or `'left'`. + - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation + applied. Should be `'right'` or `'left'`. + + Args: + model_max_length (`int`, *optional*): + The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is + loaded with [`~tokenizer_utils_base.PretrainedTokenizerBase.from_pretrained`], this will be set to the + value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will + default to VERY_LARGE_INTEGER (`int(1e30)`). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + truncation_side (`str`, *optional*): + The side on which the model should have truncation applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + model_input_names (`List[string]`, *optional*): + The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or + `"attention_mask"`). Default value is picked from the class attribute of the same name. 
+ bos_token (`str` or `AddedToken`, *optional*): + A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and + `self.bos_token_id`. + eos_token (`str` or `AddedToken`, *optional*): + A special token representing the end of a sentence. Will be associated to `self.eos_token` and + `self.eos_token_id`. + unk_token (`str` or `AddedToken`, *optional*): + A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and + `self.unk_token_id`. + sep_token (`str` or `AddedToken`, *optional*): + A special token separating two different sentences in the same input (used by BERT for instance). Will be + associated to `self.sep_token` and `self.sep_token_id`. + pad_token (`str` or `AddedToken`, *optional*): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`. + cls_token (`str` or `AddedToken`, *optional*): + A special token representing the class of the input (used by BERT for instance). Will be associated to + `self.cls_token` and `self.cls_token_id`. + mask_token (`str` or `AddedToken`, *optional*): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). Will be associated to `self.mask_token` and `self.mask_token_id`. + additional_special_tokens (tuple or list of `str` or `AddedToken`, *optional*): + A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the + tokenization process. Will be associated to `self.additional_special_tokens` and + `self.additional_special_tokens_ids`. + """ + + resource_files_names: Dict[str, str] = {} + pretrained_resource_files_map: Dict[str, Dict[str, str]] = {} + pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} + max_model_input_sizes: Dict[str, Optional[int]] = {} + _auto_class: Optional[str] = None + tokenizer_config_file = TOKENIZER_CONFIG_NAME + + # first name has to correspond to main model input name + # to make sure `tokenizer.pad(...)` works correctly + model_input_names: List[str] = ["input_ids", "token_type_ids"] + padding_side: str = "right" + truncation_side: str = "right" + slow_tokenizer_class = None + + def __init__(self, **kwargs): + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + + self.init_kwargs = getattr(self, "init_kwargs", None) or copy.deepcopy(kwargs) + self.name_or_path = kwargs.pop("name_or_path", "") + self._processor_class = kwargs.pop("processor_class", None) + + # For backward compatibility we fallback to set model_max_length from max_len if provided + model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) + self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER + + # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it + # is changed. 
+        self.padding_side = kwargs.pop("padding_side", self.padding_side)
+        if self.padding_side not in ["right", "left"]:
+            raise ValueError(
+                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
+            )
+
+        self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
+        if self.truncation_side not in ["right", "left"]:
+            raise ValueError(
+                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
+            )
+
+        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
+
+        # By default, do not split special tokens for both fast and slow tokenizers
+        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
+
+        self.deprecation_warnings = (
+            {}
+        )  # Used to store when we have already noticed a deprecation warning (avoid overlogging).
+
+        super().__init__(**kwargs)
+
+    @property
+    def max_len_single_sentence(self) -> int:
+        """
+        `int`: The maximum length of a sentence that can be fed to the model.
+        """
+        return self.model_max_length - self.num_special_tokens_to_add(pair=False)
+
+    @property
+    def max_len_sentences_pair(self) -> int:
+        """
+        `int`: The maximum combined length of a pair of sentences that can be fed to the model.
+        """
+        return self.model_max_length - self.num_special_tokens_to_add(pair=True)
+
+    @max_len_single_sentence.setter
+    def max_len_single_sentence(self, value):
+        # For backward compatibility, allow attempts to set 'max_len_single_sentence'.
+        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
+            if not self.deprecation_warnings.get("max_len_single_sentence", False):
+                warnings.warn(
+                    "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
+                )
+            self.deprecation_warnings["max_len_single_sentence"] = True
+        else:
+            raise ValueError(
+                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
+            )
+
+    def _switch_to_input_mode(self):
+        """
+        Private method to put the tokenizer in input mode (when it has different modes for input/output).
+        """
+        pass
+
+    @max_len_sentences_pair.setter
+    def max_len_sentences_pair(self, value):
+        # For backward compatibility, allow attempts to set 'max_len_sentences_pair'.
+        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
+            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
+                warnings.warn(
+                    "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
+                )
+            self.deprecation_warnings["max_len_sentences_pair"] = True
+        else:
+            raise ValueError(
+                "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
+            )
+
+    def _set_processor_class(self, processor_class: str):
+        """Sets processor class as an attribute."""
+        self._processor_class = processor_class
+
+    def __repr__(self) -> str:
+        return (
+            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}', "
+            f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, "
+            f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})"
+        )
+
+    def get_vocab(self) -> Dict[str, int]:
+        """
+        Returns the vocabulary as a dictionary of token to index.
+
+        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
+        vocab.
+
+        Returns:
+            `Dict[str, int]`: The vocabulary.
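+
+        Example (a minimal sketch; the tokenizer name is illustrative and concrete ids depend on the vocabulary):
+            .. code-block::
+
+                from paddlenlp.transformers import BertTokenizer
+
+                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                vocab = tokenizer.get_vocab()
+                assert vocab['hello'] == tokenizer.convert_tokens_to_ids('hello')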
+ """ + raise NotImplementedError() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): + """ + Creates an instance of `PretrainedTokenizer`. Related resources are loaded + by specifying name of a built-in pretrained model, or a community-contributed + pretrained model, or a local file directory path. + + Args: + pretrained_model_name_or_path (str): Name of pretrained model or dir path + to load from. The string can be: + + - Name of built-in pretrained model + - Name of a community-contributed pretrained model. + - Local directory path which contains tokenizer related resources + and tokenizer config file ("tokenizer_config.json"). + from_hf_hub (bool, optional): whether to load from Huggingface Hub + subfolder (str, optional) An optional value corresponding to a folder inside the repo. + Only works when loading from Huggingface Hub. + *args (tuple): position arguments for model `__init__`. If provided, + use these as position argument values for tokenizer initialization. + **kwargs (dict): keyword arguments for model `__init__`. If provided, + use these to update pre-defined keyword argument values for tokenizer + initialization. + + Returns: + PretrainedTokenizer: An instance of `PretrainedTokenizer`. + + Example: + .. code-block:: + + from paddlenlp.transformers import BertTokenizer + + # Name of built-in pretrained model + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Name of community-contributed pretrained model + tokenizer = BertTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned') + + # Load from local directory path + tokenizer = BertTokenizer.from_pretrained('./my_bert/') + """ + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + cache_dir = kwargs.pop("cache_dir", None) + from_hf_hub = kwargs.pop("from_hf_hub", False) + from_aistudio = kwargs.pop("from_aistudio", False) + subfolder = kwargs.pop("subfolder", "") + return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False) + + if subfolder is None: + subfolder = "" + + vocab_files = {} + init_configuration = {} + + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, + } + + vocab_files_target = {**cls.resource_files_names, **additional_files_names} + + # From HF Hub or AI Studio + if from_hf_hub or from_aistudio: + # Only include the necessary resource files specified by the tokenizer cls + # Deep copy to avoid modifiying the class attributes + vocab_files = copy.deepcopy(cls.resource_files_names) + vocab_files["tokenizer_config_file"] = cls.tokenizer_config_file + + # From built-in pretrained models + elif pretrained_model_name_or_path in cls.pretrained_init_configuration: + for file_id, map_list in cls.pretrained_resource_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + init_configuration = copy.deepcopy(cls.pretrained_init_configuration[pretrained_model_name_or_path]) + # From local dir path + elif os.path.isdir(pretrained_model_name_or_path): + vocab_files_target["tokenizer_config_file"] = cls.tokenizer_config_file + for file_id, file_name in vocab_files_target.items(): + full_file_name = os.path.join(pretrained_model_name_or_path, subfolder, file_name) + if os.path.isfile(full_file_name): + vocab_files[file_id] = full_file_name + else: + # Assuming from community-contributed pretrained models + for 
file_id, file_name in vocab_files_target.items(): + vocab_files[file_id] = file_name + + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None or os.path.isfile(file_path): + resolved_vocab_files[file_id] = file_path + continue + resolved_vocab_files[file_id] = resolve_file_path( + pretrained_model_name_or_path, + [file_path], + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + + for file_id, file_path in resolved_vocab_files.items(): + if resolved_vocab_files[file_id] is not None: + cache_dir = os.path.dirname(resolved_vocab_files[file_id]) + break + + tokenizer_config_file_dir_list = set() + for k, v in resolved_vocab_files.items(): + if v is not None and os.path.isfile(v): + tokenizer_config_file_dir_list.add(os.path.dirname(v)) + tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list) + # TODO: check this + assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory." + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with io.open(tokenizer_config_file, encoding="utf-8") as f: + init_kwargs = json.load(f) + else: + init_kwargs = init_configuration + + # Handle tokenizer serialization of added and special tokens + added_tokens_decoder: Dict[int, AddedToken] = {} + # if we have info on the slow added tokens + if "added_tokens_decoder" in init_kwargs: + for idx, token in init_kwargs["added_tokens_decoder"].items(): + if isinstance(token, dict): + token = AddedToken(**token) + if isinstance(token, AddedToken): + added_tokens_decoder[int(idx)] = token + else: + raise ValueError( + f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance" + ) + init_kwargs["added_tokens_decoder"] = added_tokens_decoder + + # position args are stored in kwargs, maybe better not include + init_args = init_kwargs.pop("init_args", ()) + init_kwargs.pop("init_class", None) + + # Update with newly provided args and kwargs + init_args = init_args if not args else args + init_kwargs.update(kwargs) + + def convert_added_tokens(obj): + if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": + obj.pop("__type") + return AddedToken(**obj) + elif isinstance(obj, (list, tuple)): + return list(convert_added_tokens(o) for o in obj) + elif isinstance(obj, dict): + return {k: convert_added_tokens(v) for k, v in obj.items()} + return obj + + init_kwargs = convert_added_tokens(init_kwargs) + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] + if model_max_length is not None and isinstance(model_max_length, (int, float)): + init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) + + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + # Merge resolved_vocab_files arguments in init_kwargs if not including. + # Maybe need more ways to load resources. 
+ for args_name, file_path in resolved_vocab_files.items(): + # when `pretrained_model_name_or_path` is a pretrained model name, + # use pretrained_init_configuration as `init_kwargs` to init which + # does not include the vocab file in it, thus add vocab file into + # args. + if args_name not in init_kwargs or init_kwargs[args_name] is None: + init_kwargs[args_name] = file_path + # when `pretrained_model_name_or_path` is a pretrained model dir, + # use tokenizer_config_file.json as `init_kwargs` to init which + # does include a vocab file path in it. However, if the vocab file + # path included in json does not exist, such as was deleted, to make + # it still work, use the vocab file under this dir. + elif not os.path.isfile(init_kwargs[args_name] or "") and os.path.isfile(file_path): + init_kwargs[args_name] = file_path + + # TODO(zhoushunjie): It's not supportted to load tokenizer.json of hf so far. + if from_hf_hub and "tokenizer_file" in init_kwargs: + init_kwargs.pop("tokenizer_file") + + # TODO(guosheng): avoid reduplication of position args and key word args + tokenizer = cls(*init_args, **init_kwargs) + chat_template = init_kwargs.pop("chat_template", None) + if chat_template is not None: + tokenizer.init_chat_template(chat_template) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key in kwargs and kwargs[key]: + # This value has already been redefined by the kwargs + # We keep this new value and ignore the one stored in the special_tokens_map_file + + continue + + if isinstance(value, dict): + value = AddedToken(**value) + elif isinstance(value, list): + value = [AddedToken(**token) if isinstance(token, dict) else token for token in value] + setattr(tokenizer, key, value) + # Add supplementary tokens. + special_tokens = tokenizer.all_special_tokens + if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + + # Sort added tokens by index + added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) + for token, index in added_tok_encoder_sorted: + if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index: + # index is the current length of the tokenizer (not in vocabulary) + raise ValueError( + f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " + f"{index}." + ) + elif not has_tokenizer_file and index != len(tokenizer): + # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the + # current length of the tokenizer. + raise ValueError( + f"Non-consecutive added token '{token}' found. " + f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." + ) + + tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) + # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab + added_tokens = tokenizer.sanitize_special_tokens() + if added_tokens: + logger.info( + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained." 
+ ) + # save all of related things into default root dir + if pretrained_model_name_or_path in cls.pretrained_init_configuration: + # tokenizer.save_pretrained(os.path.join(cache_dir, pretrained_model_name_or_path, subfolder)) + tokenizer.save_pretrained(cache_dir) + + if return_tokenizer_file_dir: + return tokenizer, list(tokenizer_config_file_dir_list)[0] + return tokenizer + + def save_pretrained(self, save_directory, filename_prefix: Optional[str] = None, **kwargs): + """ + Save tokenizer configuration and related resources to files under + `save_directory`. The tokenizer configuration would be saved into + `tokenizer_config_file` indicating file (thus `tokenizer_config.json`), + and resources would be saved into `resource_files_names` indicating files + by using `self.save_resources(save_directory)`. + + The `save_directory` can be used in `from_pretrained` as argument value + of `pretrained_model_name_or_path` to re-load the tokenizer. + + Args: + save_directory (str): Directory to save files into. + filename_prefix: (str, optional): + A prefix to add to the names of the files saved by the tokenizer. + + Example: + .. code-block:: + + from paddlenlp.transformers import BertTokenizer + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer.save_pretrained('trained_model') + # reload from save_directory + tokenizer = BertTokenizer.from_pretrained('trained_model') + """ + assert not os.path.isfile(save_directory), "Saving directory ({}) should be a directory, not a file".format( + save_directory + ) + os.makedirs(save_directory, exist_ok=True) + + special_tokens_map_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE + ) + tokenizer_config_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + self.tokenizer_config_file + ) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.resource_files_names.keys(): + tokenizer_config.pop(file_id, None) + + # Sanitize AddedTokens + def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True): + if isinstance(obj, AddedToken): + out = obj.__getstate__() + if add_type_field: + out["__type"] = "AddedToken" + return out + elif isinstance(obj, (list, tuple)): + return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj) + elif isinstance(obj, dict): + return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()} + return obj + + # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization + tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True) + + # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained + tokenizer_class = self.__class__.__name__ + tokenizer_config["tokenizer_class"] = tokenizer_class + + with io.open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + logger.info(f"tokenizer config file saved in {tokenizer_config_file}") + + # Sanitize AddedTokens in special_tokens_map + write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False) + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + f.write(json.dumps(write_dict, ensure_ascii=False)) + logger.info(f"Special tokens file saved in {special_tokens_map_file}") + + file_names = 
(tokenizer_config_file, special_tokens_map_file) + + save_files = self._save_pretrained( + save_directory=save_directory, + file_names=file_names, + filename_prefix=filename_prefix, + ) + + return save_files + + def _save_pretrained( + self, save_directory: Union[str, os.PathLike], file_names: Tuple[str], filename_prefix: Optional[str] = None + ) -> Tuple[str]: + """ + Save a tokenizer using the tokenizer format: vocabulary + added tokens. + + """ + save_directory = str(save_directory) + + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) + added_vocab = self.get_added_vocab() + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, ensure_ascii=False) + f.write(out_str) + logger.info(f"added tokens file saved in {added_tokens_file}") + + self.save_resources(save_directory) + + return file_names + (added_tokens_file,) + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to `resource_files_names` indicating + files under `save_directory` by copying directly. Override it if necessary. + + Args: + save_directory (str): Directory to save files into. + """ + for name, file_name in self.resource_files_names.items(): + src_path = self.init_kwargs[name] + dst_path = os.path.join(save_directory, file_name) + if os.path.abspath(src_path) != os.path.abspath(dst_path): + shutil.copyfile(src_path, dst_path) + + def save_to_hf_hub( + self, + repo_id: str, + private: Optional[bool] = None, + subfolder: Optional[str] = None, + commit_message: Optional[str] = None, + revision: Optional[str] = None, + create_pr: bool = False, + ): + """ + Uploads all elements of this tokenizer to a new HuggingFace Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + private (bool, optional): Whether the model/tokenizer is set to private + subfolder (str, optional): Push to a subfolder of the repo instead of the root + commit_message (str, optional) — The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub" + revision (str, optional) — The git revision to commit from. Defaults to the head of the "main" branch. + create_pr (boolean, optional) — Whether or not to create a Pull Request with that commit. Defaults to False. + If revision is not set, PR is opened against the "main" branch. If revision is set and is a branch, PR is opened against this branch. + If revision is set and is not a branch name (example: a commit oid), an RevisionNotFoundError is returned by the server. + + Returns: The url of the commit of your model in the given repository. 
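+
+        Example (a sketch only; the repo id is an assumption and pushing requires valid Hugging Face credentials):
+            .. code-block::
+
+                from paddlenlp.transformers import BertTokenizer
+
+                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                commit_url = tokenizer.save_to_hf_hub(repo_id='my-user/my-bert-tokenizer')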
+ """ + repo_url = create_repo(repo_id, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) + has_readme = True + except EntryNotFoundError: + has_readme = False + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + # Add readme if does not exist + logger.info("README.md not found, adding the default README.md") + if not has_readme: + with open(os.path.join(root_dir, "README.md"), "w") as f: + f.write(f"---\nlibrary_name: paddlenlp\n---\n# {repo_id}") + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + return upload_folder( + repo_id=repo_id, + repo_type="model", + folder_path=root_dir, + commit_message=commit_message, + revision=revision, + create_pr=create_pr, + ) + + def save_to_aistudio( + self, repo_id, private=True, license="Apache License 2.0", exist_ok=True, subfolder=None, **kwargs + ): + """ + Uploads all elements of this model to a new AiStudio Hub repository. + Args: + repo_id (str): Repository name for your model/tokenizer in the Hub. + token (str): Your token for the Hub. + private (bool, optional): Whether the model/tokenizer is set to private. Defaults to True. + license (str): The license of your model/tokenizer. Defaults to: "Apache License 2.0". + exist_ok (bool, optional): Whether to override existing repository. Defaults to: True. + subfolder (str, optional): Push to a subfolder of the repo instead of the root + """ + + res = aistudio_sdk.hub.create_repo(repo_id=repo_id, private=private, license=license, **kwargs) + if "error_code" in res: + if res["error_code"] == 10003 and exist_ok: + logger.info( + f"Repo {repo_id} already exists, it will override files with the same name. To avoid this, please set exist_ok=False" + ) + else: + logger.error( + f"Failed to create repo {repo_id}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"Successfully created repo {repo_id}") + + with tempfile.TemporaryDirectory() as root_dir: + if subfolder is not None: + save_dir = os.path.join(root_dir, subfolder) + else: + save_dir = root_dir + # save model + self.save_pretrained(save_dir) + + # Upload model and return + logger.info(f"Pushing to the {repo_id}. This might take a while") + for filename in os.listdir(save_dir): + res = aistudio_sdk.hub.upload( + repo_id=repo_id, path_or_fileobj=os.path.join(save_dir, filename), path_in_repo=filename, **kwargs + ) + if "error_code" in res: + logger.error( + f"Failed to upload {filename}, error_code: {res['error_code']}, error_msg: {res['error_msg']}" + ) + else: + logger.info(f"{filename}: {res['message']}") + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`. + + Args: + text (`str`): + The sequence to be encoded. + pair (`str`, *optional*): + A second sequence to be encoded with the first. 
+ add_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to add the special tokens associated with the corresponding model. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific encode method. See details in + [`~PretrainedTokenizerBase.__call__`] + + Returns: + `List[str]`: The list of tokens. + """ + raise NotImplementedError + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + raise NotImplementedError + + def _get_padding_truncation_strategies( + self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs + ): + """ + Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy + and pad_to_max_length) and behaviors. + """ + old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") + old_pad_to_max_length = kwargs.pop("pad_to_max_seq_len", False) + + # Backward compatibility for previous behavior, maybe we should deprecate it: + # If you only set max_length, it activates truncation for max_length + if max_length is not None and padding is False and truncation is False: + if verbose: + if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False): + warnings.warn( + "Truncation was not explicitly activated but `max_length` is provided a specific value, " + "please use `truncation=True` to explicitly truncate examples to max length. " + "Defaulting to 'longest_first' truncation strategy. " + "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy " + "more precisely by providing a specific strategy to `truncation`." + ) + self.deprecation_warnings["Truncation-not-explicitly-activated"] = True + truncation = "longest_first" + + # Get padding strategy + if padding is False and old_pad_to_max_length: + if verbose: + warnings.warn( + "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " + "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " + "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " + "maximal input size of the model (e.g. 512 for Bert).", + FutureWarning, + ) + if max_length is None: + padding_strategy = PaddingStrategy.LONGEST + else: + padding_strategy = PaddingStrategy.MAX_LENGTH + elif padding is not False: + if padding is True: + if verbose: + if max_length is not None and (truncation is False or truncation == "do_not_truncate"): + warnings.warn( + "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " + "To pad to max length, use `padding='max_length'`." 
+ ) + if old_pad_to_max_length is not False: + warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.") + # Default to pad to the longest sequence in the batch + padding_strategy = PaddingStrategy.LONGEST + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Get truncation strategy + if truncation is False and old_truncation_strategy != "do_not_truncate": + if verbose: + warnings.warn( + "The `truncation_strategy` argument is deprecated and will be removed in a future version, " + "use `truncation=True` to truncate examples to a max length. You can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the " + "maximal input size of the model (e.g. 512 for Bert). " + " If you have pairs of inputs, you can give a specific truncation strategy selected among " + "`truncation='only_first'` (will only truncate the first sentence in the pairs) " + "`truncation='only_second'` (will only truncate the second sentence in the pairs) " + "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).", + FutureWarning, + ) + truncation_strategy = TruncationStrategy(old_truncation_strategy) + elif truncation is not False and truncation is not None: + if truncation is True: + truncation_strategy = ( + TruncationStrategy.LONGEST_FIRST + ) # Default to truncate the longest sequences in pairs of inputs + elif not isinstance(truncation, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation) + elif isinstance(truncation, TruncationStrategy): + truncation_strategy = truncation + else: + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False): + warnings.warn( + "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no padding." + ) + self.deprecation_warnings["Asking-to-pad-to-max_length"] = True + padding_strategy = PaddingStrategy.DO_NOT_PAD + else: + max_length = self.model_max_length + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False): + warnings.warn( + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no truncation." + ) + self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + else: + max_length = self.model_max_length + + # Test if we have a padding token + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0): + raise ValueError( + "Asking to pad but the tokenizer does not have a padding token. " + "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " + "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." 
+ ) + + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + f"Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." + ) + + return padding_strategy, truncation_strategy, max_length, kwargs + + def __call__( + self, + text: Union[str, List[str], List[List[str]]], + text_pair: Optional[Union[str, List[str], List[List[str]]]] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: Union[bool, str] = False, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + return_position_ids: bool = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_length: bool = False, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_dict: bool = True, + return_offsets_mapping: bool = False, + add_special_tokens: bool = True, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + **kwargs + ): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is allowed. `self.encode()` or `self.batch_encode()` would be called + separately for single or batch input depending on input format and + `is_split_into_words` argument. + + Args: + text (str, List[str] or List[List[str]]): + The sequence or batch of sequences to be processed. One sequence + is a string or a list of strings depending on whether it has been + pretokenized. If each sequence is provided as a list of strings + (pretokenized), you must set `is_split_into_words` as `True` to + disambiguate with a batch of sequences. + text_pair (str, List[str] or List[List[str]], optional): + Same as `text` argument, while it represents for the latter + sequence of the sequence pair. + max_length (int, optional): + If set to a number, will limit the total sequence returned so + that it has a maximum length. If there are overflowing tokens, + those overflowing tokens will be added to the returned dictionary + when `return_overflowing_tokens` is `True`. Defaults to `None`. + stride (int, optional): + Only available for batch input of sequence pair and mainly for + question answering usage. When for QA, `text` represents questions + and `text_pair` represents contexts. If `stride` is set to a + positive number, the context will be split into multiple spans + where `stride` defines the number of (tokenized) tokens to skip + from the start of one span to get the next span, thus will produce + a bigger batch than inputs to include all spans. Moreover, 'overflow_to_sample' + and 'offset_mapping' preserving the original example and position + information will be added to the returned dictionary. Defaults to 0. + is_split_into_words (Union[bool, str], optional): + when the text is words or tokens, `is_split_into_words` should be True or `token`. + `True`: means that the text should be words which should be tokenized. + `token`: means that the text should be tokens which already be tokenized, so it should not be tokenized again. 
+ padding (bool, str or [PaddingStrategy], optional): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + Defaults to `False`. + truncation (bool, str or [TruncationStrategy], optional): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + Defaults to `False`. + return_position_ids (bool, optional): + Whether to include tokens position ids in the returned dictionary. + Defaults to `False`. + return_token_type_ids (bool, optional): + Whether to include token type ids in the returned dictionary. + Defaults to `True`. + return_attention_mask (bool, optional): + Whether to include the attention mask in the returned dictionary. + Defaults to `False`. + return_length (bool, optional): + Whether to include the length of each encoded inputs in the + returned dictionary. Defaults to `False`. + return_overflowing_tokens (bool, optional): + Whether to include overflowing token information in the returned + dictionary. Defaults to `False`. + return_special_tokens_mask (bool, optional): + Whether to include special tokens mask information in the returned + dictionary. Defaults to `False`. + return_dict (bool, optional): + Decide the format for returned encoded batch inputs. Only works when + input is a batch of data. + :: + - If True, encoded inputs would be a dictionary like: + {'input_ids': [[1, 4444, 4385, 1545, 6712],[1, 4444, 4385]], + 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0]]} + - If False, encoded inputs would be a list like: + [{'input_ids': [1, 4444, 4385, 1545, 6712], + 'token_type_ids': [0, 0, 0, 0, 0]}, + {'input_ids': [1, 4444, 4385], 'token_type_ids': [0, 0, 0]}] + + Defaults to `True`. + return_offsets_mapping (bool, optional): + Whether to include the list of pair preserving the index of start + and end char in original input for each token in the returned + dictionary. Would be automatically set to `True` when `stride` > 0. + Defaults to `False`. 
+            add_special_tokens (bool, optional):
+                Whether to add the special tokens associated with the corresponding model
+                to the encoded inputs. Defaults to `True`.
+            pad_to_multiple_of (int, optional):
+                If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
+                Defaults to `None`.
+            return_tensors (str or [TensorType], optional):
+                If set, will return tensors instead of a list of Python integers. Acceptable values are:
+
+                - `'pd'`: Return Paddle `paddle.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+                Defaults to `None`.
+            verbose (bool, optional):
+                Whether or not to print more information and warnings. Defaults to `True`.
+
+        Returns:
+            dict or list[dict] (for batch input):
+                The dict has the following optional items:
+
+                - **input_ids** (list[int] or list[list[int]]): List of token ids to be fed to a model.
+                - **position_ids** (list[int] or list[list[int]], optional): List of token position ids to be
+                  fed to a model. Included when `return_position_ids` is `True`.
+                - **token_type_ids** (list[int] or list[list[int]], optional): List of token type ids to be
+                  fed to a model. Included when `return_token_type_ids` is `True`.
+                - **attention_mask** (list[int] or list[list[int]], optional): List of integers valued 0 or 1,
+                  where 0 specifies paddings and should not be attended to by the
+                  model. Included when `return_attention_mask` is `True`.
+                - **seq_len** (int or list[int], optional): The input_ids length. Included when `return_length`
+                  is `True`.
+                - **overflowing_tokens** (list[int] or list[list[int]], optional): List of overflowing tokens.
+                  Included when `max_length` is specified and `return_overflowing_tokens` is `True`.
+                - **num_truncated_tokens** (int or list[int], optional): The number of overflowing tokens.
+                  Included when `max_length` is specified and `return_overflowing_tokens` is `True`.
+                - **special_tokens_mask** (list[int] or list[list[int]], optional): List of integers valued 0 or 1,
+                  with 0 specifying special added tokens and 1 specifying sequence tokens.
+                  Included when `return_special_tokens_mask` is `True`.
+                - **offset_mapping** (list[int], optional): List of pairs preserving the index of the start and end
+                  chars in the original input for each token. For a special token, the index pair is `(0, 0)`.
+                  Included when `return_offsets_mapping` is `True` or `stride` > 0.
+                - **overflow_to_sample** (int or list[int], optional): Index of the example from which this
+                  feature is generated. Included when `stride` > 0.
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # Lists are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... a list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... a list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if not _is_valid_text_input(text):
+            raise ValueError(
+                "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+                "or `List[List[str]]` (batch of pretokenized examples)."
+            )
+
+        if text_pair is not None and not _is_valid_text_input(text_pair):
+            raise ValueError(
+                "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+                "or `List[List[str]]` (batch of pretokenized examples)."
+            )
+
+        # check the `is_split_into_words` value
+        if isinstance(is_split_into_words, str) and is_split_into_words != "token":
+            raise ValueError(
+                "the value of `is_split_into_words` should be one of: {True, False, 'token'} "
+                f"but received: {is_split_into_words!r}"
+            )
+
+        if is_split_into_words:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple))
+
+        if is_batched:
+            if isinstance(text_pair, str):
+                raise TypeError(
+                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`."
+                )
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            return self.batch_encode(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                padding=padding,
+                truncation=truncation,
+                return_position_ids=return_position_ids,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_length=return_length,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_dict=return_dict,
+                return_offsets_mapping=return_offsets_mapping,
+                add_special_tokens=add_special_tokens,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode(
+                text=text,
+                text_pair=text_pair,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                padding=padding,
+                truncation=truncation,
+                return_position_ids=return_position_ids,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_length=return_length,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                add_special_tokens=add_special_tokens,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def encode(
+        self,
+        text,
+        text_pair=None,
+        add_special_tokens=True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = False,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_position_ids=None,
+        **kwargs
+    ) -> BatchEncoding:
+        """
+        Tokenize and prepare for the model a sequence or a pair of sequences.
+
+        Args:
+            text (`str`, `List[str]` or `List[int]`):
+                The first sequence to be encoded.
This can be a string, a list of strings (tokenized string using the + `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + text_pair (`str`, `List[str]` or `List[int]`, *optional*): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + """ + # Backward compatibility for 'max_seq_len' + old_max_seq_len = kwargs.get("max_seq_len", None) + if max_length is None and old_max_seq_len: + if verbose: + warnings.warn( + "The `max_seq_len` argument is deprecated and will be removed in a future version, " + "please use `max_length` instead.", + FutureWarning, + ) + max_length = old_max_seq_len + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Tokenize and prepare for the model a sequence or a pair of sequences. + + + + This method is deprecated, `__call__` should be used instead. + + + + Args: + text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + text_pair (`str`, `List[str]` or `List[int]`, *optional*): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). 
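+
+        Example (an illustrative sketch; since this method is deprecated, the same inputs can be passed to the
+        tokenizer's `__call__` instead):
+            .. code-block::
+
+                from paddlenlp.transformers import BertTokenizer
+
+                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                # Preferred over tokenizer.encode_plus("What is PaddleNLP?", "An NLP library.")
+                encoded = tokenizer("What is PaddleNLP?", "An NLP library.")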
+ """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_position_ids: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + def batch_encode( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + max_length=None, + stride: int = 0, + is_split_into_words: bool = False, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + return_position_ids=None, + # TODO(wj-mcat): keep align with `encode` method + return_token_type_ids=None, + return_attention_mask=None, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_dict=True, + return_offsets_mapping=False, + add_special_tokens=True, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports batch inputs of sequence or sequence pair. + + Args: + batch_text_or_text_pairs (list): + The element of list can be sequence or sequence pair, and the + sequence is a string or a list of strings depending on whether + it has been pretokenized. If each sequence is provided as a list + of strings (pretokenized), you must set `is_split_into_words` as + `True` to disambiguate with a sequence pair. 
+ + Returns: + dict or list[dict]: + The dict has the following optional items: + + """ + # Backward compatibility for 'max_seq_len' + old_max_seq_len = kwargs.get("max_seq_len", None) + if max_length is None and old_max_seq_len: + if verbose: + warnings.warn( + "The `max_seq_len` argument is deprecated and will be removed in a future version, " + "please use `max_length` instead.", + FutureWarning, + ) + max_length = old_max_seq_len + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_dict=return_dict, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_position_ids: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_dict: bool = True, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + def pad( + self, + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + ) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, + `self.pad_token_id` and `self.pad_token_type_id`) + + + + If the `encoded_inputs` passed are dictionary of numpy arrays, Paddle tensors, the + result will use the same type unless you provide a different tensor type with `return_tensors`. 
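# --- Usage sketch for `batch_encode` (hedged; setup names are assumptions):
# a list of sequence pairs is padded to the longest member of the batch and
# the call is dispatched to `_batch_encode_plus` as shown above.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model name
pairs = [
    ("How is the weather today?", "It is sunny."),
    ("Where is the library?", "Two blocks north of the station."),
]
batch = tokenizer.batch_encode(
    pairs,
    padding="longest",
    truncation=True,
    max_length=64,
    return_attention_mask=True,
)
print([len(ids) for ids in batch["input_ids"]])  # equal lengths after padding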
+ + + Args: + encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of + tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, + List[int]]]*) so you can use this method during preprocessing as well as in a Paddle Dataloader + collate function. + + Instead of `List[int]` you can have tensors (numpy arrays, Paddle tensors), see + the note above for the return type. + padding (`bool`, `str` or [`PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_tensors (`str` or [`TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'pd'`: Return Paddle `paddle.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + """ + # If we have a list of dicts, let's convert it in a dict of lists + if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)): + encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} + + # The model's main input name, usually `input_ids`, has be passed for padding + if self.model_input_names[0] not in encoded_inputs: + raise ValueError( + "You should supply an encoding or a list of encodings to this method " + f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" + ) + + required_input = encoded_inputs[self.model_input_names[0]] + + if not required_input: + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have Paddle/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + for item in required_input: + if len(item) != 0: + first_element = item[0] + break + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. 
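# --- Usage sketch for `pad` as a collate function (hedged; setup names are
# assumptions): a list of per-example dicts is first converted to a dict of
# lists, then every key is padded to the longest example in the batch.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model name
features = [
    {"input_ids": [101, 2023, 2003, 102], "token_type_ids": [0, 0, 0, 0]},
    {"input_ids": [101, 2460, 102], "token_type_ids": [0, 0, 0]},
]
batch = tokenizer.pad(features, padding="longest", return_attention_mask=True)
print(batch["input_ids"])       # the shorter example is right-padded with pad_token_id
print(batch["attention_mask"])  # padded positions are masked with 0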
+ if not isinstance(first_element, (int, list, tuple)): + if isinstance(first_element, paddle.Tensor): + return_tensors = "pd" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + f"Should be either python or paddle object." + ) + + for key, value in encoded_inputs.items(): + encoded_inputs[key] = to_py_obj(value) + + # Convert padding_strategy in PaddingStrategy + padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + padding=padding, max_length=max_length, verbose=verbose + ) + + required_input = encoded_inputs[self.model_input_names[0]] + if required_input and not isinstance(required_input[0], (list, tuple)): + encoded_inputs = self._pad( + encoded_inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(required_input) + assert all( + len(v) == batch_size for v in encoded_inputs.values() + ), "Some items in the output dictionary have a different batch size than others." + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = max(len(inputs) for inputs in required_input) + padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) + outputs = self._pad( + inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create the token type IDs corresponding to the sequences passed. [What are token type + IDs?](../glossary#token-type-ids) + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + token_ids_0 (`List[int]`): The first tokenized sequence. + token_ids_1 (`List[int]`, *optional*): The second tokenized sequence. + + Returns: + `List[int]`: The token type ids. + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + This implementation does not add special tokens and this method should be overridden in a subclass. + + Args: + token_ids_0 (`List[int]`): The first tokenized sequence. + token_ids_1 (`List[int]`, *optional*): The second tokenized sequence. + + Returns: + `List[int]`: The model input with special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. 
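# --- Standalone sketch of the base-class defaults shown above: no special
# tokens are inserted and token type ids mark the first segment with 0 and the
# second segment with 1. Replicated here with plain lists for illustration.
def build_inputs_with_special_tokens(ids_0, ids_1=None):
    return ids_0 if ids_1 is None else ids_0 + ids_1

def create_token_type_ids_from_sequences(ids_0, ids_1=None):
    if ids_1 is None:
        return [0] * len(ids_0)
    return [0] * len(ids_0) + [1] * len(ids_1)

first, second = [11, 12, 13], [21, 22]
assert build_inputs_with_special_tokens(first, second) == [11, 12, 13, 21, 22]
assert create_token_type_ids_from_sequences(first, second) == [0, 0, 0, 1, 1]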
+ + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + + return offset_mapping_0 + offset_mapping_1 + + def prepare_for_model( + self, + ids, + pair_ids=None, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_position_ids=None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + return_offsets_mapping=False, + add_special_tokens=True, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs + ): + """ + Performs tokenization and uses the tokenized tokens to prepare model + inputs. It supports sequence or sequence pair as input, and batch input + is not allowed. + """ + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." 
+ ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + if return_position_ids is None: + return_position_ids = "position_ids" in self.model_input_names + encoded_inputs = {} + # Truncation: Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + overflowing_tokens = [] + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + + # Build output dictionnary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + if return_offsets_mapping and "text" in kwargs and "text_pair" in kwargs: + text = kwargs.pop("text") + text_pair = kwargs.pop("text_pair") + + token_offset_mapping = self.get_offset_mapping(text) + token_pair_offset_mapping = self.get_offset_mapping(text_pair) if text_pair is not None else None + if max_length and total_len > max_length: + token_offset_mapping, token_pair_offset_mapping, _ = self.truncate_sequences( + token_offset_mapping, + pair_ids=token_pair_offset_mapping, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + if add_special_tokens: + offset_mapping = self.build_offset_mapping_with_special_tokens( + token_offset_mapping, token_pair_offset_mapping + ) + else: + offset_mapping = ( + token_offset_mapping + token_pair_offset_mapping + if token_pair_offset_mapping + else token_offset_mapping + ) + encoded_inputs["offset_mapping"] = offset_mapping + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + if return_position_ids: + encoded_inputs["position_ids"] = list(range(len(encoded_inputs["input_ids"]))) + + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + # for compatibility + encoded_inputs["seq_len"] = encoded_inputs["length"] + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + 
num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ + Truncates a sequence pair in-place following the strategy. + + Args: + ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_tokens_to_ids` methods. + pair_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_tokens_to_ids` methods. + num_tokens_to_remove (`int`, *optional*, defaults to 0): + Number of tokens to remove using the truncation strategy. + truncation_strategy (`str` or [`TruncationStrategy`], *optional*, defaults to `False`): + The strategy to follow for truncation. Can be: + + - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will truncate + token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a + batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater + than the model maximum admissible input size). + stride (`int`, *optional*, defaults to 0): + If set to a positive number, the overflowing tokens returned will contain some tokens from the main + sequence returned. The value of this argument defines the number of additional tokens. + + Returns: + `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of + overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair + of sequences (or a batch of pairs) is provided. + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( + truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None + ): + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + if self.truncation_side == "left": + overflowing_tokens = ids[:window_len] + ids = ids[num_tokens_to_remove:] + elif self.truncation_side == "right": + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + else: + raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.") + + else: + error_msg = ( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the first sequence has a length {len(ids)}. 
" + ) + if truncation_strategy == TruncationStrategy.ONLY_FIRST: + error_msg = ( + error_msg + "Please select another truncation strategy than " + f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." + ) + logger.error(error_msg) + elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: + warnings.warn( + f"Be aware, overflowing tokens are not returned for the setting you have chosen," + f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " + f"truncation strategy. So the returned list will always be empty even if some " + f"tokens have been removed." + ) + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + if self.truncation_side == "right": + ids = ids[:-1] + elif self.truncation_side == "left": + ids = ids[1:] + else: + raise ValueError("invalid truncation strategy:" + str(self.truncation_side)) + else: + if self.truncation_side == "right": + pair_ids = pair_ids[:-1] + elif self.truncation_side == "left": + pair_ids = pair_ids[1:] + else: + raise ValueError("invalid truncation strategy:" + str(self.truncation_side)) + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + if self.truncation_side == "right": + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif self.truncation_side == "left": + overflowing_tokens = pair_ids[:window_len] + pair_ids = pair_ids[num_tokens_to_remove:] + else: + raise ValueError("invalid truncation strategy:" + str(self.truncation_side)) + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_first'." + ) + + return (ids, pair_ids, overflowing_tokens) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names or "attention_mask" in encoded_inputs + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if self.padding_side == "right": + if return_attention_mask: + + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + if "offset_mapping" in encoded_inputs: + encoded_inputs["offset_mapping"] = encoded_inputs["offset_mapping"] + [(0, 0)] * difference + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = encoded_inputs["position_ids"] + [0] * difference + # NOTE: In ernie3.0-qa, the type of `*_positions` is int. 
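# --- Standalone, simplified sketch of the right-padding branch above: the
# target length is rounded up to a multiple of `pad_to_multiple_of`, then
# `input_ids`, `attention_mask` and `token_type_ids` are extended on the right.
def pad_right(inputs, max_length, pad_to_multiple_of=None, pad_id=0, pad_type_id=0):
    if pad_to_multiple_of and max_length % pad_to_multiple_of != 0:
        max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
    diff = max_length - len(inputs["input_ids"])
    inputs["attention_mask"] = [1] * len(inputs["input_ids"]) + [0] * diff
    inputs["token_type_ids"] = inputs["token_type_ids"] + [pad_type_id] * diff
    inputs["input_ids"] = inputs["input_ids"] + [pad_id] * diff
    return inputs

example = {"input_ids": [101, 7, 102], "token_type_ids": [0, 0, 0]}
print(pad_right(example, max_length=3, pad_to_multiple_of=4))
# -> length 4: input_ids [101, 7, 102, 0], attention_mask [1, 1, 1, 0]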
+ if "start_positions" in encoded_inputs and isinstance(encoded_inputs["start_positions"], list): + encoded_inputs["start_positions"] = encoded_inputs["start_positions"] + [0] * difference + if "end_positions" in encoded_inputs and isinstance(encoded_inputs["end_positions"], list): + encoded_inputs["end_positions"] = encoded_inputs["end_positions"] + [0] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + if "offset_mapping" in encoded_inputs: + encoded_inputs["offset_mapping"] = [(0, 0)] * difference + encoded_inputs["offset_mapping"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] + if "start_positions" in encoded_inputs and isinstance(encoded_inputs["start_positions"], list): + encoded_inputs["start_positions"] = [0] * difference + encoded_inputs["start_positions"] + if "end_positions" in encoded_inputs and isinstance(encoded_inputs["end_positions"], list): + encoded_inputs["end_positions"] = [0] * difference + encoded_inputs["end_positions"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + return encoded_inputs + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ + Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we + often want to remove sub-word tokenization artifacts at the same time. + + Args: + tokens (`List[str]`): The token to join in a string. + + Returns: + `str`: The joined tokens. + """ + raise NotImplementedError + + def batch_decode( + self, + sequences: Union[List[int], List[List[int]], "np.ndarray", "paddle.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> List[str]: + """ + Convert a list of lists of token ids into a list of strings by calling decode. + + Args: + sequences (`Union[List[int], List[List[int]], np.ndarray, paddle.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `List[str]`: The list of decoded sentences. 
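# --- Usage sketch for `batch_decode` (hedged; setup names are assumptions and
# the ids below are placeholders): each sequence is decoded independently by
# calling `decode`, so one string is returned per input row.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model name
sequences = [
    [101, 7592, 2088, 102],
    [101, 2129, 2024, 2017, 102],
]
print(tokenizer.batch_decode(sequences, skip_special_tokens=True))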
+ """ + return [ + self.decode( + seq, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + for seq in sequences + ] + + def decode( + self, + token_ids: Union[int, List[int], "np.ndarray", "paddle.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + + Args: + token_ids (`Union[int, List[int], np.ndarray, paddle.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `str`: The decoded sentence. + """ + # Convert inputs to python lists + token_ids = to_py_obj(token_ids) + + return self._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + raise NotImplementedError + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of ids of the first sequence. + token_ids_1 (`List[int]`, *optional*): + List of ids of the second sequence. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + assert already_has_special_tokens and token_ids_1 is None, ( + "You cannot use ``already_has_special_tokens=False`` with this tokenizer. " + "Please use a slow (full python) tokenizer to activate this argument. " + "Or set `return_special_tokens_mask=True` when calling the encoding method " + "to get the special tokens mask in any tokenizer. " + ) + + all_special_ids = self.all_special_ids # cache the property + + special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] + + return special_tokens_mask + + @staticmethod + def clean_up_tokenization(out_string: str) -> str: + """ + Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. + + Args: + out_string (`str`): The text to clean up. + + Returns: + `str`: The cleaned-up string. 
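# --- Usage sketch for `decode` (hedged; setup names are assumptions): ids are
# converted back to text, optionally dropping special tokens and cleaning up
# tokenization artifacts such as spaces before punctuation.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model name
ids = tokenizer.encode_plus("Fast tokenizers are convenient.")["input_ids"]
print(tokenizer.decode(ids, skip_special_tokens=True,
                       clean_up_tokenization_spaces=True))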
+ """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool): + """ + Depending on the input and internal state we might trigger a warning about a sequence that is too long for its + corresponding model + + Args: + ids (`List[str]`): The ids produced by the tokenization + max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set) + verbose (`bool`): Whether or not to print more information and warnings. + + """ + if max_length is None and len(ids) > self.model_max_length and verbose: + if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False): + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " + "will result in indexing errors" + ) + self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_fast.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_fast.py new file mode 100644 index 000000000..d6a854fdd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/tokenizer_utils_fast.py @@ -0,0 +1,869 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tokenizer classes for fast tokenizers (provided by HuggingFace's tokenizers library). 
For slow (python) tokenizers +see tokenizer_utils.py +""" + +import copy +import json +import os +from collections import defaultdict +from typing import Any, Dict, List, Optional, Tuple, Union + +import tokenizers.pre_tokenizers as pre_tokenizers_fast +from tokenizers import Encoding as EncodingFast +from tokenizers import Tokenizer as TokenizerFast +from tokenizers.decoders import Decoder as DecoderFast +from tokenizers.trainers import ( + BpeTrainer, + UnigramTrainer, + WordLevelTrainer, + WordPieceTrainer, +) + +from ..utils.env import ADDED_TOKENS_NAME, FULL_TOKENIZER_NAME +from .convert_slow_tokenizer import convert_slow_tokenizer +from .tokenizer_utils import ChatTemplateMixin, PretrainedTokenizer +from .tokenizer_utils_base import ( + AddedToken, + BatchEncoding, + EncodedInput, + EncodedInputPair, + PaddingStrategy, + PreTokenizedInput, + PreTokenizedInputPair, + PretrainedTokenizerBase, + SpecialTokensMixin, + TextInput, + TextInputPair, + TruncationStrategy, +) + +MODEL_TO_TRAINER_MAPPING = { + "BPE": BpeTrainer, + "Unigram": UnigramTrainer, + "WordLevel": WordLevelTrainer, + "WordPiece": WordPieceTrainer, +} + +VOCAB_FILES_NAMES = {"tokenizer_file": FULL_TOKENIZER_NAME} + + +class PretrainedTokenizerFast(ChatTemplateMixin, PretrainedTokenizerBase): + """ + Base class for all fast tokenizers (wrapping HuggingFace tokenizers library). + + Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`]. + + Handles all the shared methods for tokenization and special tokens, as well as methods for + downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary. + + This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the + specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + """ + + resource_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class: PretrainedTokenizer = None + + def __init__(self, *args, **kwargs): + tokenizer_object = kwargs.pop("tokenizer_object", None) + slow_tokenizer = kwargs.pop("__slow_tokenizer", None) + fast_tokenizer_file = kwargs.pop("tokenizer_file", None) + from_slow = kwargs.pop("from_slow", False) + added_tokens_decoder = kwargs.pop("added_tokens_decoder", {}) + + if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None: + raise ValueError( + "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you " + "have sentencepiece installed." + ) + + if tokenizer_object is not None: + fast_tokenizer = copy.deepcopy(tokenizer_object) + elif fast_tokenizer_file is not None and not from_slow: + # We have a serialization from tokenizers which let us directly build the backend + fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) + elif slow_tokenizer is not None: + # We need to convert a slow tokenizer to build the backend + fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif self.slow_tokenizer_class is not None: + # We need to create and convert a slow tokenizer to build the backend + slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) + fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + else: + raise ValueError( + "Couldn't instantiate the backend tokenizer from one of: \n" + "(1) a `tokenizers` library serialization file, \n" + "(2) a slow tokenizer instance to convert or \n" + "(3) an equivalent slow tokenizer class to instantiate and convert. 
\n" + "You need to have sentencepiece installed to convert a slow tokenizer to a fast one." + ) + + self._tokenizer = fast_tokenizer + + if slow_tokenizer is not None: + kwargs.update(slow_tokenizer.init_kwargs) + + self._decode_use_source_tokenizer = False + + _truncation = self._tokenizer.truncation + + if _truncation is not None: + self._tokenizer.enable_truncation(**_truncation) + kwargs.setdefault("max_length", _truncation["max_length"]) + kwargs.setdefault("truncation_side", _truncation["direction"]) + kwargs.setdefault("stride", _truncation["stride"]) + kwargs.setdefault("truncation_strategy", _truncation["strategy"]) + else: + self._tokenizer.no_truncation() + + _padding = self._tokenizer.padding + if _padding is not None: + self._tokenizer.enable_padding(**_padding) + kwargs.setdefault("pad_token", _padding["pad_token"]) + kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"]) + kwargs.setdefault("padding_side", _padding["direction"]) + kwargs.setdefault("max_length", _padding["length"]) + kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"]) + + # We call this after having initialized the backend tokenizer because we update it. + super().__init__(**kwargs) + + # Set the splitting mode for special tokens for the tokenizer to be used throughout the class. + self._tokenizer.encode_special_tokens = self.split_special_tokens + + # The following logic will be replace with a single add_tokens once a fix is pushed to tokenizers + # allows converting a slow -> fast, non-legacy: if the `tokenizer.json` does not have all the added tokens + # uses the information stored in `added_tokens_decoder`. + # this is costly for fast tokenizers as we re-compute the regex again. But not all tokens are added tokens + # Use hash to speed up the very slow operation `token not in added_tokens_decoder`. + added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder} + tokens_to_add = [ + token + for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]) + if hash(repr(token)) not in added_tokens_decoder_hash + ] + encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add] + # if some of the special tokens are strings, we check if we don't already have a token + tokens_to_add += [ + token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add + ] + + if len(tokens_to_add) > 0: + # super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ + # Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for + # individual tokens would repeatedly rebuild a trie, which can be slow. + is_last_special = None + tokens = [] + special_tokens = self.all_special_tokens + for token in tokens_to_add: + is_special = ( + (token.special or str(token) in special_tokens) + if isinstance(token, AddedToken) + else str(token) in special_tokens + ) + if is_last_special is None or is_last_special == is_special: + tokens.append(token) + else: + self._add_tokens(tokens, special_tokens=is_last_special) + tokens = [token] + is_last_special = is_special + if tokens: + self._add_tokens(tokens, special_tokens=is_last_special) + + @property + def is_fast(self) -> bool: + return True + + @property + def can_save_slow_tokenizer(self) -> bool: + """ + `bool`: Whether or not the slow tokenizer can be saved. 
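# --- Construction sketch (hedged): `__init__` above accepts a ready-made
# `tokenizers.Tokenizer` via the `tokenizer_object` keyword. The tiny WordLevel
# vocab and the direct instantiation of the base class defined in this module
# are assumptions made purely for illustration.
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

backend = Tokenizer(WordLevel({"[UNK]": 0, "hello": 1, "world": 2}, unk_token="[UNK]"))
backend.pre_tokenizer = Whitespace()
fast_tok = PretrainedTokenizerFast(tokenizer_object=backend, unk_token="[UNK]")
print(fast_tok.vocab_size, fast_tok.convert_tokens_to_ids(["hello", "world"]))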
Usually for sentencepiece based slow tokenizer, this + can only be `True` if the original `"sentencepiece.model"` was not deleted. + """ + return True + + @property + def vocab_size(self) -> int: + """ + `int`: Size of the base vocabulary (without the added tokens). + """ + return self._tokenizer.get_vocab_size(with_added_tokens=False) + + def get_vocab(self) -> Dict[str, int]: + return self._tokenizer.get_vocab(with_added_tokens=True) + + @property + def vocab(self) -> Dict[str, int]: + return self.get_vocab() + + @property + def added_tokens_encoder(self) -> Dict[str, int]: + """ + Returns the sorted mapping from string to index. The added tokens encoder is cached for performance + optimization in `self._added_tokens_encoder` for the slow tokenizers. + """ + return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])} + + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return self._tokenizer.get_added_tokens_decoder() + + def get_added_vocab(self) -> Dict[str, int]: + """ + Returns the added tokens in the vocabulary as a dictionary of token to index. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])} + + def __len__(self) -> int: + """ + Size of the full vocabulary with the added tokens. + """ + return self._tokenizer.get_vocab_size(with_added_tokens=True) + + @property + def backend_tokenizer(self) -> TokenizerFast: + """ + `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend. + """ + return self._tokenizer + + @property + def decoder(self) -> DecoderFast: + """ + `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer. + """ + return self._tokenizer.decoder + + def _convert_encoding( + self, + encoding: EncodingFast, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + return_position_ids: bool = False, + verbose: bool = True, + ) -> Tuple[Dict[str, Any], List[EncodingFast]]: + """ + Convert the encoding representation (from low-level PaddleNLP TokenizerFast output) to a python Dict and a list + of encodings, take care of building a batch from overflowing tokens. + + Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are + lists (overflows) of lists (tokens). 
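# --- Usage sketch for the vocabulary accessors above (hedged; setup names are
# assumptions): `vocab_size` excludes added tokens, `len(tokenizer)` includes
# them, and `get_added_vocab` maps each added token to its id.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model name
base_size = tokenizer.vocab_size
tokenizer.add_tokens(["<new_tok>"])
print(base_size, len(tokenizer))     # the full size grows by the added tokens
print(tokenizer.get_added_vocab())   # includes '<new_tok>' with its new id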
+ + Output shape: (overflows, sequence length) + """ + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + if return_overflowing_tokens and encoding.overflowing is not None: + encodings = [encoding] + encoding.overflowing + else: + encodings = [encoding] + + encoding_dict = defaultdict(list) + for e in encodings: + encoding_dict["input_ids"].append(e.ids) + + if return_token_type_ids: + encoding_dict["token_type_ids"].append(e.type_ids) + if return_attention_mask: + encoding_dict["attention_mask"].append(e.attention_mask) + if return_special_tokens_mask: + encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) + if return_offsets_mapping: + encoding_dict["offset_mapping"].append(e.offsets) + if return_length: + encoding_dict["length"].append(len(e.ids)) + if return_position_ids: + encoding_dict["position_ids"].append(list(range(len(e.ids)))) + return encoding_dict, encodings + + def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + """ + Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the + vocabulary. + + Args: + tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). + + Returns: + `int` or `List[int]`: The token id or list of token ids. + """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + return [self._convert_token_to_id_with_added_voc(token) for token in tokens] + + def _convert_token_to_id_with_added_voc(self, token: str) -> int: + index = self._tokenizer.token_to_id(token) + if index is None: + return self.unk_token_id + return index + + def _convert_id_to_token(self, index: int) -> Optional[str]: + return self._tokenizer.id_to_token(int(index)) + + def _add_tokens(self, new_tokens: List[Union[str, AddedToken]], special_tokens=False) -> int: + if special_tokens: + return self._tokenizer.add_special_tokens(new_tokens) + + return self._tokenizer.add_tokens(new_tokens) + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + + + This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put + this inside your training loop. + + + + Args: + pair (`bool`, *optional*, defaults to `False`): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. + + Returns: + `int`: Number of special tokens added to sequences. + """ + return self._tokenizer.num_special_tokens_to_add(pair) + + def convert_ids_to_tokens( + self, ids: Union[int, List[int]], skip_special_tokens: bool = False + ) -> Union[str, List[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. + + Args: + ids (`int` or `List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + + Returns: + `str` or `List[str]`: The decoded token(s). 
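# --- Round-trip sketch for the id/token helpers above (hedged; setup names are
# assumptions): tokens are mapped to ids through the backend vocabulary, with
# unknown tokens falling back to `unk_token_id`.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed model name
tokens = tokenizer.tokenize("fast tokenizers")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
print(tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=False))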
+ """ + if isinstance(ids, int): + return self._tokenizer.id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + tokens.append(self._tokenizer.id_to_token(index)) + return tokens + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: + return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens() + + def set_truncation_and_padding( + self, + padding_strategy: PaddingStrategy, + truncation_strategy: TruncationStrategy, + max_length: int, + stride: int, + pad_to_multiple_of: Optional[int], + ): + """ + Define the truncation and the padding strategies for fast tokenizers (provided by PaddleNLP's fast_tokenizer + library) and restore the tokenizer settings afterwards. + + The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a + padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed + section. + + Args: + padding_strategy ([`~utils.PaddingStrategy`]): + The kind of padding that will be applied to the input + truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]): + The kind of truncation that will be applied to the input + max_length (`int`): + The maximum size of a sequence. + stride (`int`): + The stride to use when handling overflow. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + """ + _truncation = self._tokenizer.truncation + _padding = self._tokenizer.padding + # Set truncation and padding on the backend tokenizer + if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE: + if _truncation is not None: + self._tokenizer.no_truncation() + else: + target = { + "max_length": max_length, + "stride": stride, + "strategy": truncation_strategy.value, + "direction": self.truncation_side, + } + + # _truncation might contain more keys that the target `transformers` + # supports. Use only the target keys to trigger `enable_truncation`. + # This should enable this code to works on various `tokenizers` + # targets. 
+ if _truncation is None: + current = None + else: + current = {k: _truncation.get(k, None) for k in target} + + if current != target: + self._tokenizer.enable_truncation(**target) + + if padding_strategy == PaddingStrategy.DO_NOT_PAD: + if _padding is not None: + self._tokenizer.no_padding() + else: + length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None + target = { + "length": length, + "direction": self.padding_side, + "pad_id": self.pad_token_id, + "pad_token": self.pad_token, + "pad_type_id": self.pad_token_type_id, + "pad_to_multiple_of": pad_to_multiple_of, + } + if _padding != target: + self._tokenizer.enable_padding(**target) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_position_ids: Optional[bool] = None, + return_dict: bool = True, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs + ) -> BatchEncoding: + if not isinstance(batch_text_or_text_pairs, (tuple, list)): + raise TypeError( + f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})" + ) + + # Set the truncation and padding strategy and restore the initial configuration + self.set_truncation_and_padding( + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + ) + + if self._tokenizer.encode_special_tokens != split_special_tokens: + self._tokenizer.encode_special_tokens = split_special_tokens + + encodings = self._tokenizer.encode_batch( + batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + is_pretokenized=is_split_into_words, + ) + + # Convert encoding to dict + # `Tokens` has type: Tuple[ + # List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]], + # List[EncodingFast] + # ] + # with nested dimensions corresponding to batch, overflows, sequence length + tokens_and_encodings = [ + self._convert_encoding( + encoding=encoding, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + return_position_ids=return_position_ids, + verbose=verbose, + ) + for encoding in encodings + ] + + # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension + # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length) + # (we say ~ because the number of overflow varies with the example in the batch) + # + # To match each overflowing sample with the original sample in the batch + # we add an overflow_to_sample_mapping array (see 
below) + sanitized_tokens = {} + for key in tokens_and_encodings[0][0].keys(): + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + sanitized_encodings = [e for _, item in tokens_and_encodings for e in item] + + # If returning overflowing tokens, we need to return a mapping + # from the batch idx to the original sample + if return_overflowing_tokens: + overflow_to_sample_mapping = [] + for i, (toks, _) in enumerate(tokens_and_encodings): + overflow_to_sample_mapping += [i] * len(toks["input_ids"]) + sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping + + for input_ids in sanitized_tokens["input_ids"]: + self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose) + return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_position_ids: Optional[bool] = None, + return_tensors: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + batched_input = [(text, text_pair)] if text_pair else [text] + batched_output = self._batch_encode_plus( + batched_input, + is_split_into_words=is_split_into_words, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_position_ids=return_position_ids, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + split_special_tokens=split_special_tokens, + **kwargs, + ) + + # Return tensor is None, then we can remove the leading batch axis + # Overflowing tokens are returned as a batch of output so we keep them in this case + if return_tensors is None and not return_overflowing_tokens: + batched_output = BatchEncoding( + { + key: value[0] if len(value) > 0 and isinstance(value[0], list) else value + for key, value in batched_output.items() + }, + batched_output.encodings, + ) + + self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) + + return batched_output + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ + Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we + often want to remove sub-word tokenization artifacts at the same time. + + Args: + tokens (`List[str]`): The token to join in a string. + + Returns: + `str`: The joined tokens. 
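# --- Usage sketch for overflow handling (hedged): this relies on the fast
# `_batch_encode_plus` shown above, so it assumes a Rust-backed tokenizer; the
# `use_fast` flag and model name are assumptions for illustration.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)  # assumed flag
out = tokenizer.batch_encode(
    ["a fairly long sentence that will not fit into eight tokens at all"],
    max_length=8,
    truncation=True,
    stride=2,
    return_overflowing_tokens=True,
)
# One input row can expand into several output rows; the mapping points each
# row back to the index of its original sample.
print(len(out["input_ids"]), out["overflow_to_sample_mapping"])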
+ """ + return self.backend_tokenizer.decoder.decode(tokens) + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = None, + **kwargs, + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + if isinstance(token_ids, int): + token_ids = [token_ids] + text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def _save_pretrained( + self, + save_directory: Union[str, os.PathLike], + file_names: Tuple[str], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + ) -> Tuple[str]: + """ + Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON + file containing {config + vocab + added-tokens}. + """ + save_directory = str(save_directory) + + if self.slow_tokenizer_class is None and legacy_format is True: + raise ValueError( + "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You" + " might consider leaving the legacy_format at `None` or setting it to `False`." + ) + + save_slow = ( + (legacy_format is None or legacy_format is True) + and self.slow_tokenizer_class is not None + and self.can_save_slow_tokenizer + ) + save_fast = legacy_format is None or legacy_format is False + + if save_slow: + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_NAME + ) + # make sure to be forward compatible + added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size} + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) + file_names = file_names + vocab_files + (added_tokens_file,) + + if save_fast: + tokenizer_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + FULL_TOKENIZER_NAME + ) + self.backend_tokenizer.save(tokenizer_file) + file_names = file_names + (tokenizer_file,) + + return file_names + + def train_new_from_iterator( + self, + text_iterator, + vocab_size, + length=None, + new_special_tokens=None, + special_tokens_map=None, + **kwargs, + ): + """ + Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline) + as the current one. + + Args: + text_iterator (generator of `List[str]`): + The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts + if you have everything in memory. + vocab_size (`int`): + The size of the vocabulary you want for your tokenizer. + length (`int`, *optional*): + The total number of sequences in the iterator. This is used to provide meaningful progress tracking + new_special_tokens (list of `str` or `AddedToken`, *optional*): + A list of new special tokens to add to the tokenizer you are training. 
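# --- Usage sketch for `train_new_from_iterator` (hedged; setup names, corpus
# and vocabulary size are assumptions): a generator of text batches is enough
# to retrain a tokenizer of the same type with a fresh vocabulary.
from paddlenlp.transformers import AutoTokenizer  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)  # assumed flag
corpus = [
    "fast tokenizers can be retrained from raw text",
    "the new tokenizer keeps the original pipeline and special tokens",
] * 200

def text_batches(batch_size=64):
    for start in range(0, len(corpus), batch_size):
        yield corpus[start : start + batch_size]

new_tokenizer = tokenizer.train_new_from_iterator(text_batches(), vocab_size=2000)
print(len(new_tokenizer))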
+ special_tokens_map (`Dict[str, str]`, *optional*): + If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special + token name to new special token name in this argument. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library. + + Returns: + [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on + `text_iterator`. + + """ + tokenizer_json = json.loads(self._tokenizer.to_str()) + # Remove added tokens for now (uses IDs of tokens) + added_tokens = tokenizer_json.pop("added_tokens") + # Remove post processor for now (uses IDs of tokens) + post_processor = tokenizer_json.pop("post_processor") + + unk_token = None + # Remove vocab + if tokenizer_json["model"]["type"] == "BPE": + tokenizer_json["model"]["vocab"] = {} + tokenizer_json["model"]["merges"] = [] + elif tokenizer_json["model"]["type"] == "Unigram": + if tokenizer_json["model"]["unk_id"] is not None: + unk_id = tokenizer_json["model"]["unk_id"] + unk_token = tokenizer_json["model"]["vocab"][unk_id][0] + if special_tokens_map is not None and unk_token in special_tokens_map: + unk_token = special_tokens_map[unk_token] + tokenizer_json["model"]["unk_id"] = 0 + tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]] + elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]: + tokenizer_json["model"]["vocab"] = {} + else: + raise ValueError( + f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) " + "only BPE, Unigram, WordLevel and WordPiece." + ) + + if ( + special_tokens_map is not None + and "unk_token" in tokenizer_json["model"] + and tokenizer_json["model"]["unk_token"] in special_tokens_map + ): + tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]] + + tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json)) + + # Get the special tokens from the current tokenizer if none are specified. 
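+        # Only keep added tokens that were flagged as special (Unigram keeps all
+        # of them), remap their content through `special_tokens_map` when one is
+        # given, and rebuild them as AddedToken objects for the trainer below.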
+ special_tokens = [] + for added_token in added_tokens: + special = added_token.pop("special", None) + _ = added_token.pop("id", None) + if tokenizer_json["model"]["type"] != "Unigram" and not special: + continue + if special_tokens_map is not None and added_token["content"] in special_tokens_map: + added_token["content"] = special_tokens_map[added_token["content"]] + special_tokens.append(AddedToken(**added_token)) + + if new_special_tokens is not None: + special_tokens.extend(new_special_tokens) + + # Trainer needs to know the end of word / continuing subword thingies in BPE + if ( + tokenizer_json["model"]["type"] == "BPE" + and "continuing_subword_prefix" not in kwargs + and tokenizer_json["model"]["continuing_subword_prefix"] is not None + ): + kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"] + if ( + tokenizer_json["model"]["type"] == "BPE" + and "end_of_word_suffix" not in kwargs + and tokenizer_json["model"]["end_of_word_suffix"] is not None + ): + kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"] + if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None: + kwargs["unk_token"] = unk_token + if tokenizer_json["pre_tokenizer"] is not None and tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel": + kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet() + + trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]] + trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs) + tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer) + + if post_processor is not None: + trained_tokenizer_json = json.loads(tokenizer.to_str()) + # Almost done, we just have to adjust the token IDs in the post processor + if "special_tokens" in post_processor: + for key in post_processor["special_tokens"]: + tokens = post_processor["special_tokens"][key]["tokens"] + if special_tokens_map is not None: + tokens = [special_tokens_map.get(token, token) for token in tokens] + post_processor["special_tokens"][key]["tokens"] = tokens + post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens] + + for special_token in ["cls", "sep"]: + if special_token in post_processor: + token, _ = post_processor[special_token] + if special_tokens_map is not None and token in special_tokens_map: + token = special_tokens_map[token] + token_id = tokenizer.token_to_id(token) + post_processor[special_token] = [token, token_id] + + trained_tokenizer_json["post_processor"] = post_processor + tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json)) + + kwargs = self.init_kwargs.copy() + # Map pad/cls/mask token at the Transformers level + special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() + special_tokens_list.remove("additional_special_tokens") + for token in special_tokens_list: + # Get the private one to avoid unnecessary warnings. 
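+            # Rebuild it as an AddedToken when possible so its lstrip/rstrip/
+            # normalized flags carry over into the new tokenizer's kwargs.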
+ if getattr(self, f"_{token}") is not None: + special_token = getattr(self, token) + if special_tokens_map is not None and special_token in special_tokens_map: + special_token = special_tokens_map[special_token] + + special_token_full = getattr(self, f"_{token}") + if isinstance(special_token_full, AddedToken): + # Create an added token with the same parameters except the content + kwargs[token] = AddedToken( + special_token, + single_word=special_token_full.single_word, + lstrip=special_token_full.lstrip, + rstrip=special_token_full.rstrip, + normalized=special_token_full.normalized, + special=True, + ) + else: + kwargs[token] = special_token + + additional_special_tokens = self.additional_special_tokens + if new_special_tokens is not None: + additional_special_tokens.extend(new_special_tokens) + if len(additional_special_tokens) > 0: + kwargs["additional_special_tokens"] = additional_special_tokens + + return self.__class__(tokenizer_object=tokenizer, **kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/configuration.py new file mode 100644 index 000000000..c629459f7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/configuration.py @@ -0,0 +1,222 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""UNIFIED_TRANSFORMER model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = [ + "UNIFIED_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION", + "UnifiedTransformerConfig", + "UNIFIED_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP", +] + +UNIFIED_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION = { + "unified_transformer-12L-cn": { + "vocab_size": 30004, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": True, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "unk_token_id": 0, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 2, + "mask_token_id": 30000, + }, + "unified_transformer-12L-cn-luge": { + "vocab_size": 30004, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": True, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "unk_token_id": 0, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 2, + "mask_token_id": 30000, + }, + "plato-mini": { + "vocab_size": 30001, + "hidden_size": 768, + "num_hidden_layers": 6, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": True, + "max_position_embeddings": 512, + "type_vocab_size": 2, + "initializer_range": 0.02, + "unk_token_id": 0, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 2, + "mask_token_id": 30000, + }, + "plato-xl": { + "vocab_size": 8001, + "hidden_size": 3072, + "num_hidden_layers": 72, + "num_attention_heads": 32, + "intermediate_size": 18432, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": True, + "max_position_embeddings": 1024, + "type_vocab_size": 3, + "role_type_size": 128, + "initializer_range": 0.02, + "unk_token_id": 0, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 2, + "mask_token_id": 8000, + }, +} + + +UNIFIED_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "unified_transformer-12L-cn": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn.pdparams", + "unified_transformer-12L-cn-luge": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn-luge.pdparams", + "plato-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-mini.pdparams", + "plato-xl": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-xl.pdparams", + } +} + + +class UnifiedTransformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UnifiedTransformerModel`]. It is used to + instantiate a Unified TransformerModel model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Unified TransformerModel + unified_transformer-12L-cn architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in :class:`UnifiedTransformerModel`. + Also is the vocab size of token embedding matrix. Defaults to 30004. + hidden_size (int, optional): + Dimensionality of the embedding layers, encoder layers and pooler + layer. Defaults to 768. + num_hidden_layers (int, optional): + The number of hidden layers in the encoder. Defaults to 12. + num_attention_heads (int, optional): + The number of heads in multi-head attention(MHA). Defaults to 12. + intermediate_size (int, optional): + Dimensionality of the feed-forward layer in the encoder. Input + tensors to feed-forward layers are firstly projected from + `hidden_size` to `intermediate_size`, and then projected back to + `hidden_size`. Typically `intermediate_size` is larger than + `hidden_size`. Defaults to 3072. + hidden_act (str, optional): + The activation function in the feedforward network. Defaults to + "gelu". + hidden_dropout_prob(float, optional): + The dropout probability used in pre-process and post-precess of MHA + and FFN sub-layer. Defaults to 0.1. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MHA to drop some attention target. + Defaults to 0.1. + normalize_before (bool, optional): + Indicate whether to put layer normalization into preprocessing of + MHA and FFN sub-layers. If True, pre-process is layer normalization + and post-precess includes dropout, residual connection. Otherwise, + no pre-process and post-precess includes dropout, residual + connection, layer normalization. Defaults to True. + max_position_embeddings (int, optional): + The maximum length of input `position_ids`. Defaults to 512. + type_vocab_size (int, optional): + The size of the input `token_type_ids`. Defaults to 2. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to 0.02. + + .. note:: + A normal_initializer initializes weight matrices as normal + distributions. See + :meth:`UnifiedTransformerPretrainedModel.init_weights` method + for how weights are initialized in + :class:`UnifiedTransformerModel`. + unk_token_id (int, optional): + The id of special token `unk_token`. Defaults to 0. + pad_token_id (int, optional): + The id of special token `pad_token`. Defaults to 0. + bos_token_id (int, optional): + The id of special token `bos_token`. Defaults to 1. + eos_token_id (int, optional): + The id of special token `eos_token`. Defaults to 2. + mask_token_id (int, optional): + The id of special token `mask_token`. Defaults to 30000. 
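+
+    Examples:
+        A minimal usage sketch, assuming the public `paddlenlp.transformers`
+        import path used in the other docstrings of this package:
+
+    ```python
+    >>> from paddlenlp.transformers import UnifiedTransformerConfig, UnifiedTransformerModel
+
+    >>> # Build a config with the unified_transformer-12L-cn style defaults
+    >>> configuration = UnifiedTransformerConfig()
+
+    >>> # Instantiate a randomly initialized model from that configuration
+    >>> model = UnifiedTransformerModel(configuration)
+
+    >>> # Access the model configuration
+    >>> configuration = model.config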
+ ```""" + model_type = "unified_transformer" + pretrained_init_configuration = UNIFIED_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 30004, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "gelu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + normalize_before: bool = True, + max_position_embeddings: int = 512, + type_vocab_size: int = 2, + initializer_range: float = 0.02, + unk_token_id: int = 0, + pad_token_id: int = 0, + bos_token_id: int = 1, + eos_token_id: int = 2, + mask_token_id: int = 30000, + role_type_size: int = None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.normalize_before = normalize_before + self.unk_token_id = unk_token_id + self.mask_token_id = mask_token_id + self.role_type_size = role_type_size diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/convert.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/convert.py new file mode 100644 index 000000000..6d6b31d56 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/convert.py @@ -0,0 +1,120 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import pickle +import re + +import paddle + + +def setup_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--param_path", type=str, required=True) + parser.add_argument("--save_path", type=str, required=True) + return parser.parse_args() + + +def convert(args): + paddle.enable_static() + prog_state = paddle.static.load_program_state(args.param_path) + new_state = {} + for k in prog_state: + if k.endswith("_embedding"): + prefix = "unified_transformer." + if k == "word_embedding": + suffix = "word_embeddings.weight" + elif k == "pos_embedding": + suffix = "position_embeddings.weight" + elif k == "sent_embedding": + suffix = "token_type_embeddings.weight" + elif k == "role_embedding": + suffix = "role_embeddings.weight" + elif k.startswith("encoder_layer"): + p = "encoder_layer_(\d+)_([^_]+)_([^_]+)_" + m = re.match(p, k) + layer_idx = m.group(1) + sub_layer = m.group(2) + prefix = "unified_transformer.encoder.layers." + layer_idx + "." 
+ if sub_layer == "pre": + if m.group(3) == "att": + if k.endswith("layer_norm_scale"): + suffix = "norm1.weight" + elif k.endswith("layer_norm_bias"): + suffix = "norm1.bias" + elif m.group(3) == "ffn": + if k.endswith("layer_norm_scale"): + suffix = "norm2.weight" + elif k.endswith("layer_norm_bias"): + suffix = "norm2.bias" + elif sub_layer == "multi": + prefix += "self_attn." + m = re.match("encoder_layer_(\d+)_multi_head_att_(\w+)\.(.+)", k) + if m.group(2) == "query_fc": + if m.group(3) == "w_0": + suffix = "q_proj.weight" + elif m.group(3) == "b_0": + suffix = "q_proj.bias" + elif m.group(2) == "key_fc": + if m.group(3) == "w_0": + suffix = "k_proj.weight" + elif m.group(3) == "b_0": + suffix = "k_proj.bias" + elif m.group(2) == "value_fc": + if m.group(3) == "w_0": + suffix = "v_proj.weight" + elif m.group(3) == "b_0": + suffix = "v_proj.bias" + elif m.group(2) == "output_fc": + if m.group(3) == "w_0": + suffix = "out_proj.weight" + elif m.group(3) == "b_0": + suffix = "out_proj.bias" + elif sub_layer == "ffn": + if k.endswith("fc_0.w_0"): + suffix = "linear1.weight" + elif k.endswith("fc_0.b_0"): + suffix = "linear1.bias" + elif k.endswith("fc_1.w_0"): + suffix = "linear2.weight" + elif k.endswith("fc_1.b_0"): + suffix = "linear2.bias" + elif k.startswith("post_encoder"): + prefix = "unified_transformer.encoder." + if k.endswith("_scale"): + suffix = "norm.weight" + elif k.endswith("_bias"): + suffix = "norm.bias" + elif k.startswith("mask_lm"): + prefix = "lm_head." + if k.endswith("layer_norm_scale"): + suffix = "layer_norm.weight" + elif k.endswith("layer_norm_bias"): + suffix = "layer_norm.bias" + elif k.endswith("trans_fc.w_0"): + suffix = "transform.weight" + elif k.endswith("trans_fc.b_0"): + suffix = "transform.bias" + elif k.endswith("out_fc.w_0"): + suffix = "decoder_weight" + elif k.endswith("out_fc.b_0"): + suffix = "decoder_bias" + new_state[prefix + suffix] = prog_state[k] + with open(args.save_path, "wb") as f: + pickle.dump(new_state, f) + + +if __name__ == "__main__": + args = setup_args() + convert(args) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/modeling.py new file mode 100644 index 000000000..fb85fc9c8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/modeling.py @@ -0,0 +1,577 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Modeling classes for UnifiedTransformer model.""" + +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor + +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .. 
import PretrainedModel, register_base_model +from ..model_outputs import CausalLMOutputWithCrossAttentions +from .configuration import ( + UNIFIED_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION, + UNIFIED_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP, + UnifiedTransformerConfig, +) + +__all__ = [ + "UnifiedTransformerPretrainedModel", + "UnifiedTransformerModel", + "UnifiedTransformerLMHeadModel", + "UnifiedTransformerForMaskedLM", +] + + +class UnifiedTransformerPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained UnifiedTransformer models. It provides UnifiedTransformer + related `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading + and loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = UNIFIED_TRANSFORMER_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = UNIFIED_TRANSFORMER_PRETRAINED_RESOURCE_FILES_MAP + config_class = UnifiedTransformerConfig + base_model_prefix = "unified_transformer" + + def _init_weights(self, layer): + # Initialization hook + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor) and paddle.get_default_dtype() == "float32": + layer.weight.set_value( + # TODO(guosheng): `normal` does not support float16, and + # need to handle this when using fp16 as default dtype for + # big models. + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +class UnifiedTransformerEmbeddings(nn.Layer): + # Include embeddings from word, position and token_type. + + def __init__(self, config: UnifiedTransformerConfig): + super(UnifiedTransformerEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.role_embeddings = ( + None if config.role_type_size is None else nn.Embedding(config.role_type_size, config.hidden_size) + ) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.pad_token_id = config.pad_token_id + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + role_ids: Optional[Tensor] = None, + input_embeddings: Optional[Tensor] = None, + ): + if input_ids is None and input_embeddings is None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + inputs_shape = input_ids.shape + elif input_embeddings is not None: + inputs_shape = input_embeddings.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if input_embeddings is None: + input_embeddings = self.word_embeddings(input_ids) + + if position_ids is None: + if self.pad_token_id is None: + position_ids = paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) + else: + if input_ids is not None: + # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. + # In that case, the position_ids must be provided. 
+ # And this is for left padding input_ids. + num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) + position_ids = F.relu( + paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) - num_pad + ).astype("int64") + else: + logger.warning( + "Position_ids or pad_token_ids should be provided when input_embeds is specified, " + "otherwise an unexpected result may be returned since `[0, 1, ..., sequence length - 1]` will be generated as a default position_ids." + ) + position_ids = paddle.expand(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) + position_ids.stop_gradient = True + + position_embeddings = self.position_embeddings(position_ids) + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + token_type_ids.stop_gradient = True + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embeddings + position_embeddings + token_type_embeddings + # A model with role_embeddings can generate without role_ids. + if role_ids is not None: + embeddings += self.role_embeddings(role_ids) + embeddings = self.dropout(embeddings) + return embeddings + + +@register_base_model +class UnifiedTransformerModel(UnifiedTransformerPretrainedModel): + """ + The bare UnifiedTransformer Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a `paddle.nn.Layer `__ + subclass. Use it as a regular Paddle Layer and refer to the Paddle + documentation for all matter related to general usage and behavior. + + + """ + + def __init__(self, config: UnifiedTransformerConfig): + super(UnifiedTransformerModel, self).__init__(config) + self.unk_token_id = config.unk_token_id + self.pad_token_id = config.pad_token_id + self.bos_token_id = config.bos_token_id + self.eos_token_id = config.eos_token_id + self.mask_token_id = config.mask_token_id + self.initializer_range = config.initializer_range + + self.embeddings = UnifiedTransformerEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + normalize_before=config.normalize_before, + ) + encoder_norm = nn.LayerNorm(config.hidden_size) + self.encoder = nn.TransformerEncoder(encoder_layer, config.num_hidden_layers, encoder_norm) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[Tuple[Tensor]] = None, + role_ids: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The UnifiedTransformerModel forward method, overrides the special + :meth:`__call__` method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input + sequence. 
It's data type should be `int64` and has a shape of + [batch_size, sequence_length]. + token_type_ids (Tensor): + Segment token indices to indicate first and second portions of + the inputs. Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of + [batch_size, sequence_length]. + position_ids (Tensor): + The position indices of input sequence tokens. It's data type + should be `int64` and has a shape of [batch_size, sequence_length]. + attention_mask (Tensor): + A tensor used in multi-head attention to prevents attention to + some unwanted positions, usually the paddings or the subsequent + positions. It is a tensor with shape broadcasted to + [batch_size, n_head, sequence_length, sequence_length]. + + - When the data type is bool, the unwanted positions have + `False` values and the others have `True` values. + - When the data type is int, the unwanted positions have 0 + values and the others have 1 values. + - When the data type is float, the unwanted positions have + `-INF` values and the others have 0 values. + + use_cache: (bool, optional): + Whether or not use the model cache to speed up decoding. Defaults + to False. + cache (list, optional): + It is a list, and each element in the list is `incremental_cache` + produced by :meth:`paddle.nn.TransformerEncoderLayer.gen_cache` + method. See :meth:`paddle.nn.TransformerEncoder.gen_cache` + method for more details. It is only used for inference and + should be None for training. Defaults to None. + role_ids (Tensor, optional): + Indices of role ids indicated different roles. + It's data type should be `int64` and has a shape of + [batch_size, sequence_length]. Defaults to None. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. + If `False`, the output will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=None`, + returns a tensor representing the output of :class:`UnifiedTransformerModel`, with + shape [batch_size, sequence_length, hidden_size]. The data type is + float32 or float64. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import UnifiedTransformerModel + from paddlenlp.transformers import UnifiedTransformerTokenizer + + model = UnifiedTransformerModel.from_pretrained('plato-mini') + tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') + + history = '我爱祖国' + inputs = tokenizer.dialogue_encode( + history, + return_tensors=True, + is_split_into_words=False) + outputs = model(**inputs) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if attention_mask is None: + if input_ids is not None: + attention_mask = ( + (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4 + ).unsqueeze([1, 2]) + else: + logger.warning( + "Provided inputs_embeds while attention_mask is None, attention weights will not be masked during forwarding." + ) + if attention_mask is not None: + attention_mask.stop_gradient = True + + embedding_output = self.embeddings( + input_ids, token_type_ids, position_ids, role_ids=role_ids, input_embeddings=inputs_embeds + ) + if use_cache and cache is None: + cache = self.encoder.gen_cache(embedding_output) + + sequence_output = self.encoder( + embedding_output, + attention_mask, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return sequence_output + + +class UnifiedTransformerLMHead(nn.Layer): + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(UnifiedTransformerLMHead, self).__init__() + self.transform = nn.Linear(hidden_size, hidden_size) + self.activation = getattr(nn.functional, activation) + self.layer_norm = nn.LayerNorm(hidden_size) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, is_bias=False) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward( + self, + hidden_states: Tensor, + masked_positions: Optional[Tensor] = None, + ): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + logits = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return logits + + +class UnifiedTransformerLMHeadModel(UnifiedTransformerPretrainedModel): + """ + The UnifiedTransformer Model with a language modeling head on top + for generation tasks. + + Args: + unified_transformer (:class:`UnifiedTransformerModel`): + An instance of :class:`UnifiedTransformerModel`. 
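+
+    .. note::
+        In this configuration-based implementation the constructor takes a
+        :class:`UnifiedTransformerConfig` and builds the underlying
+        :class:`UnifiedTransformerModel` itself; see :meth:`__init__` below.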
+ """ + + def __init__(self, config: UnifiedTransformerConfig): + super(UnifiedTransformerLMHeadModel, self).__init__(config) + self.unified_transformer = UnifiedTransformerModel(config) + self.lm_head = UnifiedTransformerLMHead( + config.hidden_size, + config.vocab_size, + config.hidden_act, + self.unified_transformer.embeddings.word_embeddings.weight, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[Tuple[Tensor]] = None, + role_ids: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The UnifiedTransformerLMHeadModel forward method, overrides the special + :meth:`__call__` method. + + Args: + input_ids (Tensor, optional): + See :class:`UnifiedTransformerModel`. + token_type_ids (Tensor): + See :class:`UnifiedTransformerModel`. + position_ids (Tensor): + See :class:`UnifiedTransformerModel`. + attention_mask (Tensor): + See :class:`UnifiedTransformerModel`. + use_cache: (bool, optional): + See :class:`UnifiedTransformerModel`. + cache (list, optional): + See :class:`UnifiedTransformerModel`. + role_ids: (Tensor, optional): + See :class:`UnifiedTransformerModel`. + labels: (Tensor, optional): + Labels for computing the left-to-right language modeling loss. Indices should be in + `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., vocab_size]` + inputs_embeds (Tensor, optional): + See :class:`UnifiedTransformerModel`. + output_attentions (bool, optional): + See :class: `UnifiedTransformerModel` + output_hidden_states (bool, optional): + See :class: `UnifiedTransformerModel` + return_dict (bool, optional): + See :class: `UnifiedTransformerModel` + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=labels=None`, + returns a tensor representing the output of :class:`UnifiedTransformerLMHeadModel`, + with shape [batch_size, sequence_length, vocab_size]. The data type + is float32 or float64. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import UnifiedTransformerLMHeadModel + from paddlenlp.transformers import UnifiedTransformerTokenizer + + model = UnifiedTransformerLMHeadModel.from_pretrained('plato-mini') + tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') + + history = '我爱祖国' + inputs = tokenizer.dialogue_encode( + history, + return_tensors=True, + is_split_into_words=False) + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.unified_transformer( + input_ids, + token_type_ids, + position_ids, + attention_mask, + use_cache, + cache, + role_ids=role_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + input_type = type(input_ids) if input_ids is not None else type(inputs_embeds) + sequence_output = outputs if isinstance(outputs, input_type) else outputs[0] + logits = self.lm_head(sequence_output, masked_positions) + + lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + lm_loss = loss_fct(logits.reshape((-1, logits.shape[-1])), labels.reshape([-1])) + if not return_dict: + if isinstance(outputs, input_type): + return (lm_loss, logits) if lm_loss is not None else logits + else: + outputs = (logits,) + outputs[1:] + return ((lm_loss,) + outputs) if lm_loss is not None else outputs + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterUnifiedTransformer + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decode_strategy = kwargs.get("decode_strategy") + if decode_strategy == "sampling" and kwargs.get("top_k") != 0 and kwargs.get("top_p") != 1: + raise AttributeError( + "Only topk sampling or topp sampling are supported. " + "Topk sampling and topp sampling cannot be both applied in the fast version." + ) + if kwargs["repetition_penalty"] != 1.0: + # not support for repetition_penalty yet in the fast version + raise AttributeError("'repetition_penalty != 1' is not supported yet in the fast version") + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") + self._fast_entry = FasterUnifiedTransformer(self, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def adjust_logits_during_generation(self, logits): + # pre-process distribution + logits[:, self.unified_transformer.unk_token_id] = -1e4 + logits[:, self.unified_transformer.bos_token_id] = -1e4 + logits[:, self.unified_transformer.mask_token_id] = -1e4 + return logits + + def prepare_inputs_for_generation( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + use_cache=False, + cache=None, + **kwargs + ): + + role_ids = kwargs.get("role_ids", None) + + if position_ids is None: + if self.pad_token_id is None: + position_ids = paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="int64"), input_ids) + else: + # NOTE: If there is a unk_token_id in input_ids, the following logic is wrong. + # In that case, the position_ids must be provided. + # And this is for left padding input_ids. 
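+                # e.g. pad_token_id=0 and input_ids=[[0, 0, 5, 7]] gives num_pad=2,
+                # so position_ids become [[0, 0, 0, 1]]: relu() clamps the negative
+                # offsets of the left-padding slots to 0.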
+ num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) + position_ids = F.relu( + paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="float32"), input_ids) - num_pad + ).astype("int64") + position_ids.stop_gradient = True + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + token_type_ids.stop_gradient = True + + if attention_mask is None: + attention_mask = ((input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4).unsqueeze( + [1, 2] + ) + attention_mask.stop_gradient = True + + # only last token for inputs_ids if cache is defined in kwargs + if cache is not None: + input_ids = input_ids[:, -1:] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1:] + if position_ids is not None: + position_ids = position_ids[:, -1:] + if role_ids is not None: + role_ids = role_ids[:, -1:] + if attention_mask is not None: + attention_mask = attention_mask[:, :, -1:, :] + + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + "role_ids": role_ids, + } + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +UnifiedTransformerForMaskedLM = UnifiedTransformerLMHeadModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/tokenizer.py new file mode 100644 index 000000000..ddb8a6a36 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unified_transformer/tokenizer.py @@ -0,0 +1,711 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for UnifiedTransformer model.""" + +import os +import re +import unicodedata +from shutil import copyfile + +import jieba +import numpy as np +import paddle +import sentencepiece as spm + +from ...data.vocab import Vocab +from .. import PretrainedTokenizer +from ..tokenizer_utils import _is_control, _is_whitespace, convert_to_unicode + +__all__ = ["UnifiedTransformerTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "unified_transformer-12L-cn": 512, + "unified_transformer-12L-cn-luge": 512, + "plato-mini": 512, + "plato-xl": 1024, +} + + +class UnifiedTransformerTokenizer(PretrainedTokenizer): + """ + Constructs an UnifiedTransformer tokenizer based on `SentencePiece `__. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The path of file to construct vocabulary. 
+ sentencepiece_model_file (str): + The sentencepiece model file (ends with '.spm') required to instantiate a + `SentencePiece `__. + do_lower_case (bool, optional): + Whether or not to lowercase the input when tokenizing. Defaults to + False and **does not** lowercase the input. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted + to an ID. Defaults to "[UNK]". + pad_token (str, optional): + A special token used to make arrays of tokens the same size for + batching purposes. Defaults to "[PAD]". + cls_token (str, optional): + A special token representing the beginning of a sequence. Defaults + to "[CLS]". + sep_token (str, optional): + A special token representing the end of a sequence or separating + two different sentences in the same input. Defaults to "[SEP]". + mask_token (str, optional): + A special token representing a masked token. Defaults to "[MASK]". + special_tokens_file (str, optional): + The path of file that contains additional special tokens to be used + by the tokenizer. Defaults to "". + """ + + resource_files_names = { + "vocab_file": "vocab.txt", + "sentencepiece_model_file": "spm.model", + } # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "unified_transformer-12L-cn": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn-vocab.txt", + "unified_transformer-12L-cn-luge": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn-vocab.txt", + "plato-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-mini-vocab.txt", + "plato-xl": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-xl-vocab.txt", + }, + "sentencepiece_model_file": { + "unified_transformer-12L-cn": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn-spm.model", + "unified_transformer-12L-cn-luge": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/unified_transformer-12L-cn-spm.model", + "plato-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-mini-spm.model", + "plato-xl": "https://bj.bcebos.com/paddlenlp/models/transformers/unified_transformer/plato-xl-spm.model", + }, + } + pretrained_init_configuration = { + "unified_transformer-12L-cn": {"do_lower_case": False}, + "unified_transformer-12L-cn-luge": {"do_lower_case": False}, + "plato-mini": {"do_lower_case": False}, + "plato-xl": {"do_lower_case": False}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + TASK_TO_SPECIAL_TOKEN = { + "chitchat": "[CHAT]", + "knowledge": "[KNOW]", + "recommend": "[RECO]", + } + padding_side = "left" + + def __init__( + self, + vocab_file, + sentencepiece_model_file, + do_lower_case=False, + unk_token="[UNK]", + pad_token="[UNK]", + cls_token="[CLS]", + sep_token="[SEP]", + mask_token="[MASK]", + special_tokens_file="", + **kwargs + ): + self.spm_model = spm.SentencePieceProcessor() + + self.do_lower_case = do_lower_case + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = ErnieTinyTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = self.load_vocabulary( + vocab_file, unk_token, pad_token, cls_token, sep_token, mask_token=mask_token + ) + + # if the sentencepiece_model_file is not exists, just the default sentence-piece model + if os.path.isfile(sentencepiece_model_file): + self.spm_model.Load(sentencepiece_model_file) + + pat_str = "" + if os.path.isfile(special_tokens_file): + self.specials = self.read_file(special_tokens_file) + for special in self.specials: + pat_str += "(" + re.escape(special) + ")|" + else: + self.specials = {} + + pat_str += r"([a-zA-Z0-9\S]+)" + self.pat = re.compile(pat_str) + + self.vocab_file = vocab_file + self.sentencepiece_model_file = sentencepiece_model_file + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Example: + .. code-block:: + + from paddlenlp.transformers import UnifiedTransformerTokenizer + + tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') + print(tokenizer.vocab_size) + # 30001 + """ + return len(self.vocab) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + return vocab + + def preprocess_text(self, inputs, remove_space=True, lower=False, is_split_into_words=True): + # preprocess data by removing extra space and normalize data. + if not is_split_into_words: + inputs = " ".join(jieba.lcut(inputs)) + outputs = inputs + if remove_space: + outputs = " ".join(inputs.strip().split()) + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if lower: + outputs = outputs.lower() + return outputs + + def clean_text(self, text): + # Performs invalid character removal and whitespace cleanup on text. + text = text.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'").replace("—", "-") + output = [] + for char in text: + if _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + def encode_pieces(self, spm_model, text, return_unicode=True, sample=False): + # turn sentences into word pieces. + # liujiaxiang: add for ernie-albert, mainly consider for “/”/‘/’/— causing too many unk + text = self.clean_text(text) + if not sample: + pieces = spm_model.EncodeAsPieces(text) + else: + pieces = spm_model.SampleEncodeAsPieces(text, 64, 0.1) + return pieces + + def _tokenize(self, text, is_split_into_words=True): + """ + End-to-end tokenization for UnifiedTransformer models. + + Args: + text (str): + The text to be tokenized. + + Returns: + list: A list of string representing converted tokens. + """ + text = self.preprocess_text(text, lower=self.do_lower_case, is_split_into_words=is_split_into_words) + tokens = [] + for match in self.pat.finditer(text): + part_text = match.group(0) + if part_text in self.specials: + tokens.append(part_text) + continue + part_tokens = self.encode_pieces(self.spm_model, part_text) + tokens.extend(part_tokens) + return tokens + + def merge_subword(self, tokens): + # Merge subword. 
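+        # e.g. ['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'] is merged into
+        # ['欢迎', '使用', '百度', '飞桨', '!']: a leading '▁' starts a new word,
+        # any other piece is appended to the previous one.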
+ ret = [] + for token in tokens: + if token.startswith("▁"): + ret.append(token[1:]) + else: + if len(ret): + ret[-1] += token + else: + ret.append(token) + + ret = [token for token in ret if token] + return ret + + def convert_tokens_to_string(self, tokens, keep_space=True): + """ + Converts a sequence of tokens (list of string) in a single string. Since + the usage of WordPiece introducing `__` to concat subwords, also remove + `__` when converting. + + Args: + tokens (list[str]): + A list of string representing tokens to be converted. + keep_space (bool, optional): + Whether or not to keep the segmentation with space. Defaults to + True. + + Returns: + str: Converted string from tokens. + + Example: + .. code-block:: + + from paddlenlp.transformers import UnifiedTransformerTokenizer + + tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') + print(tokenizer.convert_tokens_to_string(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'])) + # 欢迎 使用 百度 飞桨 ! + print(tokenizer.convert_tokens_to_string(['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!'], keep_space=False)) + # 欢迎使用百度飞桨! + """ + tokens = self.merge_subword(tokens) + if keep_space: + out_string = " ".join(tokens).replace("", "") + else: + out_string = "".join(tokens).replace("", "") + out_string = out_string.replace("", "\n").replace("\n ", "\n").strip() + return out_string + + def convert_ids_to_string(self, ids, keep_space=True): + """ + Converts a single index or a sequence of indices to a token or a + sequence of tokens. + + Args: + ids (int|list[int]): + The token id (or token ids) to be converted to token(s). + keep_space (bool, optional): + Whether or not to keep the segmentation with space. Defaults to + True. + + Returns: + str|list[str]: The decoded token(s). + + Example: + .. code-block:: + + from paddlenlp.transformers import UnifiedTransformerTokenizer + + tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') + tokens = tokenizer.tokenize('欢迎使用百度飞桨!', is_split_into_words=False) + ids = tokenizer.convert_tokens_to_ids(tokens) + print(ids) + # [6, 121, 26907, 25475] + + print(tokenizer.convert_ids_to_string(ids)) + # 我 爱祖国 + print(tokenizer.convert_ids_to_string(ids, keep_space=False)) + # 我爱祖国 + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = self.convert_tokens_to_string(tokens, keep_space) + return out_string + + def num_special_tokens_to_add(self, pair=False): + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + if token_ids_1 is None: + return _cls + token_ids_0 + _sep + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + if token_ids_1 is None: + return [0] * len(_cls + token_ids_0 + _sep) + return [0] * len(_cls + token_ids_0 + _sep) + [1] * len(token_ids_1 + _sep) + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second 
sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_resources(self, save_directory): + # Rewrite the :meth:`save_resources` method of superclass to save + # related resources under `save_directory`. + for name, file_name in self.resource_files_names.items(): + src_path = getattr(self, name) + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(src_path) != os.path.abspath(save_path): + copyfile(src_path, save_path) + + @staticmethod + def read_file(filepath): + token_to_idx = {} + with open(filepath, "r", encoding="utf-8") as f: + for num, line in enumerate(f): + items = convert_to_unicode(line.rstrip()).split("\t") + if len(items) > 2: + break + token = items[0] + index = int(items[1]) if len(items) == 2 else num + token = token.strip() + token_to_idx[token] = index + return token_to_idx + + @staticmethod + def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs): + # Rewrite the :meth:`load_vocabulary` method of superclass to deal with + # the inconsistency of the vocabulary format. + token_to_idx = UnifiedTransformerTokenizer.read_file(filepath) + vocab = Vocab.from_dict( + token_to_idx, unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs + ) + # Filtered the tokens that are mapped to the same id + idx_to_token = {v: k for k, v in vocab._token_to_idx.items()} + vocab._idx_to_token = [idx_to_token[idx] for idx in sorted(idx_to_token.keys())] + return vocab + + def dialogue_encode( + self, + history, + response=None, + knowledge=None, + task_type=None, + max_seq_len=512, + max_response_len=128, + max_knowledge_len=128, + return_position_ids=True, + return_token_type_ids=True, + return_role_ids=False, + return_attention_mask=True, + return_length=False, + add_start_token_as_response=False, + pad_to_max_seq_len=False, + return_tensors=False, + is_split_into_words=True, + position_style="continuous", + ): + """ + Main method to encode the single-turn or multi-turn dialogue conversation. + It will return a dictionary containing the encoded sequence and other + relative informations which meets the input format requirements of the + UnifiedTransformer model. + See detail at + https://github.com/PaddlePaddle/Knover/tree/luge-dialogue/luge-dialogue + + Args: + history (str|list|tuple): The history of dialogue conversation. It + is an utterance or list of utterances to be encoded. Each + utterance is a string. + response (str, optional): The response of dialogue conversation. + It should be set when training the model. It should not be set + when running inference. Defaults to None. + knowledge (str, optional): The knowledge information of dialogue + conversation. It should be set if the `task_type` is "knowledge" + or "recommend". Defaults to None. + task_type (str, optional): The type of dialogue conversation. It is + one of "chitchat", "knowledge" and "recommend". They represent + the chitchat dialogue, knowledge grounded dialogue and + conversational recommendation respectively. Defaults to None, + which means there is no `special_token` added in output sequence + for identifying different conversation types. 
+ max_seq_len (int, optional): The maximum encoded sequence length. + Defaults to 512. + max_response_len (int, optional): The maximum encoded sequence + length of the input `response`. Defaults to 128. + max_knowledge_len (int, optional): The maximum encoded sequence + length of the input `knowledge`. Defaults to 128. + return_position_ids (bool, optional): Whether to return the + position_ids. Defaults to True. + return_token_type_ids (bool, optional): Whether to return the + token_type_ids. Defaults to True. + return_role_ids (bool, optional): Whether to return the role_ids. + Defaults to False. + return_attention_mask (bool, optional): Whether to return the + attention_mask. Defaults to True. + return_length (bool, optional): Whether to return the length of the + encoded sequence. Defaults to False. + add_start_token_as_response (bool, optional): Whether to add the + special token "[CLS]" at the end of sequence as the beginning of + the response when running inference to force the model to start + generating response sequence. Defaults to False. + pad_to_max_seq_len (bool, optional): Whether to pad the returned + sequences to the `max_seq_len`. Note that, in this method, + returned sequences will be padded on the left. Defaults to False. + return_tensors (bool, optional): Whether to convert the returned + sequences to Tensor. Defaults to False. + is_split_into_words (bool, optional): Whether or not the input text + (`history`, `response` and `knowledge`) has been pretokenized. + Defaults to True. + position_style (str, optional): Specify the involved positional style + which must be one of [relative, continuous]. Defaults to continuous + which means start from 0 to maximum length of history. + + Returns: + dict: A dictionary containing the encoded sequence and other + relative informations. + + With the corresponding fields: + + - input_ids (list[int]|Tensor): + A list of indices of input tokens to be feed to UnifiedTransformer + model. If `return_tensors` is True, it is a Tensor with shape + [1, sequence_length] and data type 'int64'. + - role_ids (list[int]|Tensor, optional): + A list of indices of role indices. If `return_role_ids` is True, + it is a Tensor with shape [1, sequence_length] and data type 'int64'. + - token_type_ids (list[int]|Tensor, optional): + A list of segment token indices to indicate whether the token + belongs to the dialogue response. If `return_tensors` is True, + it is a Tensor with shape [1, sequence_length] and data type + 'int64'. + Being returned when `return_token_type_ids` is set to True. + - position_ids (list[int]|Tensor, optional): + A list of The position indices. If `return_tensors` is True, + it is a Tensor with shape [1, sequence_length] and data type + 'int64'. + Being returned when `return_position_ids` is set to True. + - attention_mask (numpy.ndarray|Tensor, optional): + A numpy.ndarray to prevents attention to some unwanted positions, + with shape [sequence_length, sequence_length] and data type + 'float32'. If `return_tensors` is True, it is a Tensor with shape + [1, 1, sequence_length, sequence_length] and data type 'float32'. + Being returned when `return_attention_mask` is set to True. + - seq_len (int, optional): + The actual length of the `input_ids`, excluding the pad token. + Being returned when `return_length` is set to True. + + Example: + .. 
code-block:: + + from paddlenlp.transformers import UnifiedTransformerTokenizer + + tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini') + + inputs = tokenizer.dialogue_encode('我爱祖国') + for key in inputs: + print(key + ':') + print(inputs[key]) + # input_ids: [1, 6, 25445, 26907, 25475, 2] + # token_type_ids: [0, 0, 0, 0, 0, 0] + # position_ids: [0, 1, 2, 3, 4, 5] + # attention_mask: [[0. 0. 0. 0. 0. 0.] + # [0. 0. 0. 0. 0. 0.] + # [0. 0. 0. 0. 0. 0.] + # [0. 0. 0. 0. 0. 0.] + # [0. 0. 0. 0. 0. 0.] + # [0. 0. 0. 0. 0. 0.]] + """ + + # Input type checking for clearer error + assert isinstance(history, str) or ( + isinstance(history, (list, tuple)) + and (len(history) == 0 or len(history) != 0 and isinstance(history[0], str)) + ), ( + "The input `history` must be with type `str` (single context) " + "or `List[str]` (multi-turn context). But received: {}".format(history) + ) + assert response is None or isinstance( + response, str + ), "The input `response` must of be with type `str`. But received: {}".format(response) + assert knowledge is None or isinstance( + knowledge, str + ), "The input `knowledge` must of be with type `str`. But received: {}".format(knowledge) + assert ( + task_type is None or task_type in self.TASK_TO_SPECIAL_TOKEN + ), "The input `task_type` must be None or one of {}.".format(", ".join(self.TASK_TO_SPECIAL_TOKEN.keys())) + assert max_seq_len > max_response_len + max_knowledge_len, ( + "`max_seq_len` must be greater than the sum of `max_response_len` " + "and `max_knowledge_len`. But received `max_seq_len` is {}, " + "`max_response_len` is {}, `max_knowledge_len` is {}.".format( + max_seq_len, max_response_len, max_knowledge_len + ) + ) + assert response is None or not add_start_token_as_response, ( + "`add_start_token_as_response` only works when `response` is " + "`None`. 
But received `add_start_token_as_response`: `{}`, " + "`response`: {}.".format(add_start_token_as_response, response) + ) + + knowledge_ids = [] + if knowledge is not None: + tokens = self._tokenize(knowledge, is_split_into_words) + knowledge_ids = self.convert_tokens_to_ids(tokens) + if len(knowledge_ids) > max_knowledge_len - 1: + knowledge_ids = knowledge_ids[: max_knowledge_len - 1] + knowledge_ids += [self.sep_token_id] + + if return_role_ids: + response_role_ids = [] + + response_ids = [] + if response is not None: + if return_role_ids: + if "\1" in response: + response, role_id = response.split("\1") + role_id = int(role_id) + else: + role_id = 0 + + tokens = self._tokenize(response, is_split_into_words) + response_ids = [self.cls_token_id] + self.convert_tokens_to_ids(tokens) + if len(response_ids) > max_response_len - 1: + response_ids = response_ids[: max_response_len - 1] + response_ids += [self.sep_token_id] + + if return_role_ids: + response_role_ids = [role_id] * len(response_ids) + + elif add_start_token_as_response: + response_ids = [self.cls_token_id] + + if return_role_ids: + response_role_ids = [0] + + if task_type is not None: + special_token = self.TASK_TO_SPECIAL_TOKEN[task_type] + assert ( + special_token in self.vocab._token_to_idx + ), "The vocab file should contain the special token corresponding " "to the task: {}.".format(task_type) + special_token_id = self.vocab._token_to_idx[special_token] + knowledge_ids = [self.cls_token_id, special_token_id] + knowledge_ids + else: + knowledge_ids = [self.cls_token_id] + knowledge_ids + + if return_role_ids: + history_role_ids = [] + individual_history_length = [] + knowledge_role_ids = [0] * len(knowledge_ids) + + max_history_len = max_seq_len - len(knowledge_ids) - len(response_ids) + if isinstance(history, str): + history = [history] + history_ids = [] + for i in range(len(history) - 1, -1, -1): + role_id = None + + if return_role_ids and "\1" in history[i]: + history[i], role_id = history[i].split("\1") + role_id = int(role_id) + + tokens = self._tokenize(history[i], is_split_into_words) + if len(history_ids) + len(tokens) + 1 > max_history_len: + if i == len(history) - 1: + tokens = tokens[1 - max_history_len :] + history_ids = self.convert_tokens_to_ids(tokens) + [self.sep_token_id] + + if role_id is not None: + history_role_ids = [role_id] * len(history_ids) + elif return_role_ids: + individual_history_length = [len(history_ids)] + break + + if role_id is not None: + # 1 stands for [SEP] + history_role_ids = [role_id] * (len(tokens) + 1) + history_role_ids + elif return_role_ids: + individual_history_length = [len(tokens) + 1] + individual_history_length + + history_ids = (self.convert_tokens_to_ids(tokens) + [self.sep_token_id]) + history_ids + + if return_role_ids and len(history_role_ids) == 0: + for i in range(len(individual_history_length)): + history_role_ids = ( + history_role_ids + [(len(individual_history_length) - i) % 2] * individual_history_length[i] + ) + + history_ids = knowledge_ids + history_ids + + if return_role_ids: + history_role_ids = knowledge_role_ids + history_role_ids + + # Build output dictionnary + encoded_inputs = {} + encoded_inputs["input_ids"] = history_ids + response_ids + + if return_role_ids: + encoded_inputs["role_ids"] = history_role_ids + response_role_ids + + # Check lengths + sequence_length = len(encoded_inputs["input_ids"]) + assert sequence_length <= max_seq_len + + # Considering that the logits at the last time step in the API of + # generative task are taken to generate 
the next token. In order to + # avoid the last time step being a pad, so take padding on the left. + pad_length = max_seq_len - sequence_length if pad_to_max_seq_len else 0 + if pad_length > 0: + encoded_inputs["input_ids"] = [self.pad_token_id] * pad_length + encoded_inputs["input_ids"] + if return_tensors: + # Add dimention for batch_size + encoded_inputs["input_ids"] = paddle.to_tensor(encoded_inputs["input_ids"]).unsqueeze(0) + + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [0] * len(history_ids) + [1] * len(response_ids) + if pad_length > 0: + encoded_inputs["token_type_ids"] = [self.pad_token_id] * pad_length + encoded_inputs["token_type_ids"] + if return_tensors: + # Add dimention for batch_size + encoded_inputs["token_type_ids"] = paddle.to_tensor(encoded_inputs["token_type_ids"]).unsqueeze(0) + + if return_length: + encoded_inputs["seq_len"] = sequence_length + + if return_position_ids: + if position_style == "continuous": + encoded_inputs["position_ids"] = list(range(sequence_length)) + elif position_style == "relative": + encoded_inputs["position_ids"] = [ + max_response_len + (len(history_ids)) - i - 1 for i in range(len(history_ids)) + ] + list(range(len(response_ids))) + else: + raise ValueError( + "Expected position_style is one of [continuous, relative], but received {}".format(position_style) + ) + if pad_length > 0: + encoded_inputs["position_ids"] = [self.pad_token_id] * pad_length + encoded_inputs["position_ids"] + if return_tensors: + # Add dimention for batch_size + encoded_inputs["position_ids"] = paddle.to_tensor(encoded_inputs["position_ids"]).unsqueeze(0) + + if return_attention_mask: + attention_mask = np.ones((sequence_length, sequence_length), dtype="float32") * -1e4 + start = len(history_ids) + end = sequence_length + attention_mask[:end, :start] = 0.0 + # Generate the lower triangular matrix using the slice of matrix + tmp = np.triu(np.ones([end - start, end - start], dtype="float32") * -1e4, 1) + attention_mask[start:end, start:end] = tmp + encoded_inputs["attention_mask"] = attention_mask + if pad_length > 0: + new_mask = np.ones((max_seq_len, max_seq_len), dtype="float32") * -1e4 + new_mask[-sequence_length:, -sequence_length:] = attention_mask + encoded_inputs["attention_mask"] = new_mask + if return_tensors: + # Add dimentions for batch_size and num_heads + encoded_inputs["attention_mask"] = paddle.to_tensor(encoded_inputs["attention_mask"]).unsqueeze((0, 1)) + + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
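Editor's note: the `dialogue_encode` method added to `unified_transformer/tokenizer.py` above supports two `position_style` values whose effect is easy to miss in the list comprehension. The sketch below is illustrative only — it is not part of the patch and the helper names are hypothetical — and simply mirrors that logic with plain Python, assuming the history and response ids have already been built:

.. code-block::

    # Illustrative sketch of the two position_style options in dialogue_encode().
    # Helper names are hypothetical; the logic mirrors the list comprehension above.

    def continuous_position_ids(num_history, num_response):
        # Positions simply count up over the whole [history + response] sequence.
        return list(range(num_history + num_response))

    def relative_position_ids(num_history, num_response, max_response_len):
        # History positions count down so that the last history token always
        # sits at position max_response_len; response positions restart at 0.
        history = [max_response_len + num_history - i - 1 for i in range(num_history)]
        return history + list(range(num_response))

    print(continuous_position_ids(4, 2))      # [0, 1, 2, 3, 4, 5]
    print(relative_position_ids(4, 2, 128))   # [131, 130, 129, 128, 0, 1]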
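Editor's note: the additive attention mask built at the end of `dialogue_encode` (and again, with the same shape of logic, in `gen_encode` of the UNIMO tokenizer later in this patch) follows a prefix-LM pattern: every position may attend to the whole history/knowledge prefix, response positions are causally masked, and any padding is applied on the left so that the final time step is never a pad token. Below is a minimal numpy sketch of that mask construction, assuming the hypothetical helper name `build_prefix_lm_mask` (not part of the patch):

.. code-block::

    # Illustrative numpy sketch of the additive attention mask used above.
    # 0.0 keeps a connection; -1e4 effectively removes it after softmax.
    import numpy as np

    def build_prefix_lm_mask(num_history, num_response):
        seq_len = num_history + num_response
        mask = np.ones((seq_len, seq_len), dtype="float32") * -1e4
        # Every token may attend to the whole history/knowledge prefix.
        mask[:, :num_history] = 0.0
        # Response tokens may additionally attend to themselves and to earlier
        # response tokens (the strictly upper triangle stays masked).
        mask[num_history:, num_history:] = np.triu(
            np.ones((num_response, num_response), dtype="float32") * -1e4, 1
        )
        return mask

    print(build_prefix_lm_mask(3, 2))
    # rows 0-2 (history):  [0, 0, 0, -1e4, -1e4]
    # row  3  (response):  [0, 0, 0,    0, -1e4]
    # row  4  (response):  [0, 0, 0,    0,    0]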
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/configuration.py new file mode 100644 index 000000000..8d76afda2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/configuration.py @@ -0,0 +1,303 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" UNIMO model configuration""" +from __future__ import annotations + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +__all__ = ["UNIMO_PRETRAINED_INIT_CONFIGURATION", "UNIMOConfig", "UNIMO_PRETRAINED_RESOURCE_FILES_MAP"] + +UNIMO_PRETRAINED_INIT_CONFIGURATION = { + "unimo-text-1.0": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-lcsts-new": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-summary": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-large": { + "vocab_size": 12800, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 4096, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 512, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 12088, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-dureader_qg": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + 
"type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-question-generation": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-question-generation-full_domain": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, + "unimo-text-1.0-question-generation-dureader_qg": { + "vocab_size": 18000, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + "hidden_act": "relu", + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "normalize_before": False, + "max_position_embeddings": 513, + "type_vocab_size": 4, + "initializer_range": 0.02, + "unk_token_id": 17963, + "pad_token_id": 0, + "bos_token_id": 1, + "eos_token_id": 3, + "mask_token_id": 3, + }, +} + +UNIMO_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "unimo-text-1.0": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0.pdparams", + "unimo-text-1.0-lcsts-new": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-lcsts-new.pdparams", + "unimo-text-1.0-large": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-large.pdparams", + "unimo-text-1.0-summary": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-summary.pdparams", + "unimo-text-1.0-dureader_qg": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-dureader_qg.pdparams", + "unimo-text-1.0-question-generation": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-question-generation.pdparams", + "unimo-text-1.0-question-generation-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-question-generation-full_domain.pdparams", + "unimo-text-1.0-question-generation-dureader_qg": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-question-generation-dureader_qg.pdparams", + } +} + + +class UNIMOConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`UNIMOModel`]. It is used to + instantiate a UNIMO model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the UNIMO + unimo-text-1.0 architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (int, optional): + Vocabulary size of `inputs_ids` in `UNIMOModel`. Also is the vocab size of token embedding matrix. 
+ Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `UNIMOModel`. + Defaults to `18000`. + hidden_size (int, optional): + Dimensionality of the embedding layers and encoder layers. Defaults to `768`. + num_hidden_layers (int, optional): + The number of hidden layers in the Transformer encoder. Defaults to `12`. + num_attention_heads (int, optional): + Number of attention heads for each attention layer in the Transformer encoder. + Defaults to `12`. + intermediate_size (int, optional): + Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors + to ff layers are firstly projected from `hidden_size` to `intermediate_size`, + and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`. + Defaults to `3072`. + hidden_act (str, optional): + The non-linear activation function in the feed-forward layer. + ``"gelu"``, ``"relu"`` and any other paddle supported activation functions + are supported. Defaults to ``"gelu"``. + hidden_dropout_prob(float, optional): + The dropout probability used in pre-process and post-precess of MHA + and FFN sub-layer. Defaults to 0.1. + attention_probs_dropout_prob (float, optional): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + normalize_before (bool, optional): + Indicate whether to put layer normalization into preprocessing of + MHA and FFN sub-layers. If True, pre-process is layer normalization + and post-precess includes dropout, residual connection. Otherwise, + no pre-process and post-precess includes dropout, residual + connection, layer normalization. Defaults to `True`. + max_position_embeddings (int, optional): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + type_vocab_size (int, optional): + The vocabulary size of the `token_type_ids` passed when calling `~transformers.UNIMOModel`. + Defaults to `2`. + initializer_range (float, optional): + The standard deviation of the normal initializer. Defaults to `0.02`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`UNIMOPretrainedModel._init_weights()` for how weights are initialized in `UNIMOModel`. + + unk_token_id (int, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` in order to be converted to an ID. + Defaults to `17963`. + pad_token_id (int, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to `0`. + bos_token_id (int, optional): + A special token representing the beginning of a sequence that was used during pretraining. + Defaults to `1`. + eos_token_id (int, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `3`. + mask_token_id (int, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to `3`. 
+ ```""" + model_type = "unimo" + pretrained_init_configuration = UNIMO_PRETRAINED_INIT_CONFIGURATION + + def __init__( + self, + vocab_size: int = 18000, + hidden_size: int = 768, + num_hidden_layers: int = 12, + num_attention_heads: int = 12, + intermediate_size: int = 3072, + hidden_act: str = "relu", + hidden_dropout_prob: float = 0.1, + attention_probs_dropout_prob: float = 0.1, + normalize_before: int = False, + max_position_embeddings: int = 513, + type_vocab_size: int = 4, + initializer_range: float = 0.02, + unk_token_id: int = 17963, + pad_token_id: int = 0, + bos_token_id: int = 1, + eos_token_id: int = 3, + mask_token_id: int = 3, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.normalize_before = normalize_before + self.unk_token_id = unk_token_id + self.mask_token_id = mask_token_id diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/modeling.py new file mode 100644 index 000000000..fc5b0389d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/modeling.py @@ -0,0 +1,553 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Modeling classes for UNIMO model.""" + +from typing import Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle import Tensor + +from ...utils.env import CONFIG_NAME +from ...utils.log import logger +from .. import PretrainedModel, register_base_model +from ..model_outputs import CausalLMOutputWithCrossAttentions +from .configuration import ( + UNIMO_PRETRAINED_INIT_CONFIGURATION, + UNIMO_PRETRAINED_RESOURCE_FILES_MAP, + UNIMOConfig, +) + +__all__ = [ + "UNIMOPretrainedModel", + "UNIMOModel", + "UNIMOLMHeadModel", + "UNIMOForMaskedLM", + "UNIMOForConditionalGeneration", +] + + +class UNIMOPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained UNIMO models. It provides UNIMO related + `model_config_file`, `pretrained_init_configuration`, `resource_files_names`, + `pretrained_resource_files_map`, `base_model_prefix` for downloading + and loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
+ """ + + model_config_file = CONFIG_NAME + pretrained_init_configuration = UNIMO_PRETRAINED_INIT_CONFIGURATION + pretrained_resource_files_map = UNIMO_PRETRAINED_RESOURCE_FILES_MAP + base_model_prefix = "unimo" + config_class = UNIMOConfig + + def _init_weights(self, layer): + # Initialization hook + if isinstance(layer, (nn.Linear, nn.Embedding)): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + +class UNIMOEmbeddings(nn.Layer): + # Include embeddings from word, position and token_type. + + def __init__(self, config: UNIMOConfig): + super(UNIMOEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + self.pad_token_id = config.pad_token_id + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + input_embeddings: Optional[Tensor] = None, + ): + if input_ids is None and input_embeddings is None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + inputs_shape = input_ids.shape + elif input_embeddings is not None: + inputs_shape = input_embeddings.shape[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + if input_embeddings is None: + input_embeddings = self.word_embeddings(input_ids) + + if position_ids is None: + if self.pad_token_id is None: + position_ids = paddle.expand_as(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) + else: + if input_ids is not None: + num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) + position_ids = F.relu( + paddle.expand_as(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) - num_pad + ).astype("int64") + else: + logger.warning( + "Position_ids or pad_token_ids should be provided when input_embeds is specified, " + "otherwise an unexpected result may be returned since `[0, 1, ..., sequence length - 1]` will be generated as a default position_ids." + ) + position_ids = paddle.expand_as(paddle.arange(end=inputs_shape[1], dtype="int64"), inputs_shape) + position_ids.stop_gradient = True + position_embeddings = self.position_embeddings(position_ids) + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + token_type_ids.stop_gradient = True + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = input_embeddings + position_embeddings + token_type_embeddings + return embeddings + + +@register_base_model +class UNIMOModel(UNIMOPretrainedModel): + """ + The bare UNIMO Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a `paddle.nn.Layer `__ subclass. + Use it as a regular Paddle Layer and refer to the Paddle + documentation for all matter related to general usage and behavior. 
+ + Args: + config (:class:`UNIMOConfig`): + An instance of UNIMOConfig used to construct UNIMOModel. + """ + + def __init__(self, config: UNIMOConfig): + super(UNIMOModel, self).__init__(config) + self.unk_token_id = config.unk_token_id + self.pad_token_id = config.pad_token_id + self.bos_token_id = config.bos_token_id + self.eos_token_id = config.eos_token_id + self.mask_token_id = config.mask_token_id + self.initializer_range = config.initializer_range + + self.embeddings = UNIMOEmbeddings(config) + encoder_layer = nn.TransformerEncoderLayer( + config.hidden_size, + config.num_attention_heads, + config.intermediate_size, + dropout=config.hidden_dropout_prob, + activation=config.hidden_act, + attn_dropout=config.attention_probs_dropout_prob, + act_dropout=0, + normalize_before=config.normalize_before, + ) + + self.encoder_norm = nn.LayerNorm(config.hidden_size) + # post_encoder_norm = nn.LayerNorm(config.hidden_size) + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.encoder = nn.TransformerEncoder( + encoder_layer, + config.num_hidden_layers, + # post_encoder_norm, + ) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[Tuple[Tensor]] = None, + inputs_embeds: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The UNIMOModel forward method, overrides the special :meth:`__call__` method. + + Args: + input_ids (Tensor, optional): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor): + Segment token indices to indicate first and second portions of + the inputs. Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. + + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings is added to token embeddings. + position_ids (Tensor): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, max_position_embeddings - 1]``. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + attention_mask (Tensor): + Mask used in multi-head attention to avoid performing attention to some unwanted positions, + usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. + When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], + [batch_size, num_attention_heads, sequence_length, sequence_length]. 
+ Defaults to `None`, which means nothing needed to be prevented attention to. + use_cache: (bool, optional): + Whether or not use the model cache to speed up decoding. + Defaults to `False`. + cache (list, optional): + It is a list, and each element in the list is `incremental_cache` + produced by :meth:`paddle.nn.TransformerEncoderLayer.gen_cache` + method. See :meth:`paddle.nn.TransformerEncoder.gen_cache` + method for more details. It is only used for inference and + should be None for training. Defaults to `None`. + inputs_embeds (Tensor, optional): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation + of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over + how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + Default to None. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. Defaults to `False`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=None`, + returns tensor `Sequence_output` of shape [batch_size, sequence_length, hidden_size], + which is the output at the last layer of the model. + + Example: + .. code-block:: + + from paddlenlp.transformers import UNIMOModel + from paddlenlp.transformers import UNIMOTokenizer + + model = UNIMOModel.from_pretrained('unimo-text-1.0') + tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0') + + inputs = tokenizer.gen_encode("Welcome to use PaddlePaddle and PaddleNLP!", return_tensors=True) + outputs = model(**inputs) + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if attention_mask is None: + if input_ids is not None: + attention_mask = ( + (input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4 + ).unsqueeze([1, 2]) + else: + logger.warning( + "Provided inputs_embeds while attention_mask is None, attention weights will not be masked during forwarding." 
+ ) + + if attention_mask is not None: + attention_mask.stop_gradient = True + + embedding_output = self.embeddings(input_ids, token_type_ids, position_ids, inputs_embeds) + + embedding_output = self.encoder_norm(embedding_output) + embedding_output = self.dropout(embedding_output) + + if use_cache and cache is None: + cache = self.encoder.gen_cache(embedding_output) + + outputs = self.encoder( + embedding_output, + attention_mask, + cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs + + +class UNIMOLMHead(nn.Layer): + def __init__(self, hidden_size, vocab_size, activation, embedding_weights=None): + super(UNIMOLMHead, self).__init__() + self.transform = nn.Linear(hidden_size, hidden_size) + self.activation = getattr(nn.functional, activation) + self.layer_norm = nn.LayerNorm(hidden_size) + self.decoder_weight = ( + self.create_parameter(shape=[vocab_size, hidden_size], dtype=self.transform.weight.dtype, is_bias=False) + if embedding_weights is None + else embedding_weights + ) + self.decoder_bias = self.create_parameter(shape=[vocab_size], dtype=self.decoder_weight.dtype, is_bias=True) + + def forward(self, hidden_states: Tensor, masked_positions: Optional[Tensor] = None): + if masked_positions is not None: + hidden_states = paddle.reshape(hidden_states, [-1, hidden_states.shape[-1]]) + hidden_states = paddle.tensor.gather(hidden_states, masked_positions) + hidden_states = self.transform(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.layer_norm(hidden_states) + logits = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True) + self.decoder_bias + return logits + + +class UNIMOLMHeadModel(UNIMOPretrainedModel): + """ + The UNIMO Model with a `language modeling` head on top designed for generation tasks. + + Args: + unimo (:class:`UNIMOModel`): + An instance of :class:`UNIMOModel`. + """ + + def __init__(self, config: UNIMOConfig): + super(UNIMOLMHeadModel, self).__init__(config) + self.unimo = UNIMOModel(config) + self.lm_head = UNIMOLMHead( + config.hidden_size, + config.vocab_size, + config.hidden_act, + self.unimo.embeddings.word_embeddings.weight, + ) + + def forward( + self, + input_ids: Optional[Tensor] = None, + token_type_ids: Optional[Tensor] = None, + position_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + masked_positions: Optional[Tensor] = None, + use_cache: Optional[bool] = None, + cache: Optional[Tuple[Tensor]] = None, + inputs_embeds: Optional[Tensor] = None, + labels: Optional[Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + The UNIMOLMHeadModel forward method, overrides the special + :meth:`__call__` method. + + Args: + input_ids (Tensor, optional): + See :class:`UNIMOModel`. + token_type_ids (Tensor): + See :class:`UNIMOModel`. + position_ids (Tensor): + See :class:`UNIMOModel`. + attention_mask (Tensor): + See :class:`UNIMOModel`. + use_cache: (bool, optional): + See :class:`UNIMOModel`. + cache (list, optional): + See :class:`UNIMOModel`. + inputs_embeds (Tensor, optional): + See :class:`UNIMOModel`. + labels (Tensor, optional): + Labels for computing the left-to-right language modeling loss. 
Indices should be in + `[-100, 0, ..., vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., vocab_size]` + output_attentions (bool, optional): + See :class:`UNIMOModel`. + output_hidden_states (bool, optional): + See :class:`UNIMOModel`. + return_dict (bool, optional): + Whether to return a :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithPastAndCrossAttentions` object. If `False`, the output + will be a tuple of tensors. Defaults to `False`. + + Returns: + An instance of :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithPastAndCrossAttentions` if + `return_dict=True`. Otherwise it returns a tuple of tensors corresponding + to ordered and not None (depending on the input arguments) fields of + :class:`~paddlenlp.transformers.model_outputs.CausalLMOutputWithPastAndCrossAttentions`. + Especially, When `return_dict=output_hidden_states=output_attentions=False` and `cache=labels=None`, + returns tensor `logits` of shape [batch_size, sequence_length, hidden_size], + which is the output at the last layer of the model. + + Example: + .. code-block:: + + from paddlenlp.transformers import UNIMOLMHeadModel + from paddlenlp.transformers import UNIMOTokenizer + + model = UNIMOLMHeadModel.from_pretrained('unimo-text-1.0') + tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0') + + inputs = tokenizer.gen_encode( + "Welcome to use PaddlePaddle and PaddleNLP!", + return_tensors=True, + is_split_into_words=False) + logits = model(**inputs) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.unimo( + input_ids, + token_type_ids, + position_ids, + attention_mask, + use_cache, + cache, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + input_type = type(input_ids) if input_ids is not None else type(inputs_embeds) + sequence_output = outputs if isinstance(outputs, input_type) else outputs[0] + + logits = self.lm_head(sequence_output, masked_positions) + + lm_loss = None + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + lm_loss = loss_fct(logits.reshape((-1, self.unimo.config.vocab_size)), labels.reshape((-1,))) + + if not return_dict: + if isinstance(outputs, input_type): + return (lm_loss, logits) if lm_loss is not None else logits + else: + outputs = (logits,) + outputs[1:] + return ((lm_loss,) + outputs) if lm_loss is not None else outputs + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_fast_entry(self, kwargs): + from paddlenlp.ops import FasterMIRO, FasterUNIMOText + + use_fp16_decoding = kwargs.get("use_fp16_decoding", False) + decode_strategy = kwargs.get("decode_strategy") + if decode_strategy == "sampling" and kwargs.get("top_k") != 0 and kwargs.get("top_p") != 1: + raise AttributeError( + "Only topk sampling or topp sampling are supported. " + "Topk sampling and topp sampling cannot be both applied in the fast version." 
+ ) + if kwargs["repetition_penalty"] != 1.0: + # not support for repetition_penalty yet in the fast version + raise AttributeError("'repetition_penalty != 1' is not supported yet in the fast version") + if kwargs["forced_bos_token_id"] is not None: + # not support for min_length yet in the fast version + raise AttributeError( + "Only topk sampling or topp sampling are supported. " + "Topk sampling and topp sampling cannot be both applied in the fast version." + ) + + if getattr(self.encoder, "norm", None) is None: + self._fast_entry = FasterUNIMOText(self, use_fp16_decoding=use_fp16_decoding).forward + else: + self._fast_entry = FasterMIRO(self, use_fp16_decoding=use_fp16_decoding).forward + return self._fast_entry + + def adjust_logits_during_generation(self, logits): + # pre-process distribution + logits[:, self.unimo.unk_token_id] = -1e9 + logits[:, self.unimo.pad_token_id] = -1e9 + logits[:, self.unimo.bos_token_id] = -1e9 + return logits + + def prepare_inputs_for_generation( + self, + input_ids, + token_type_ids=None, + position_ids=None, + attention_mask=None, + use_cache=False, + cache=None, + **kwargs + ): + + if position_ids is None: + if self.pad_token_id is None: + position_ids = paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="int64"), input_ids) + else: + num_pad = paddle.sum((input_ids == self.pad_token_id).astype("float32"), axis=-1, keepdim=True) + position_ids = F.relu( + paddle.expand_as(paddle.arange(end=input_ids.shape[1], dtype="float32"), input_ids) - num_pad + ).astype("int64") + position_ids.stop_gradient = True + + if token_type_ids is None: + token_type_ids = paddle.zeros_like(input_ids, dtype="int64") + token_type_ids.stop_gradient = True + + if attention_mask is None: + attention_mask = ((input_ids == self.pad_token_id).astype(paddle.get_default_dtype()) * -1e4).unsqueeze( + [1, 2] + ) + attention_mask.stop_gradient = True + + # only last token for inputs_ids if cache is defined in kwargs + if cache is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + if position_ids is not None: + position_ids = position_ids[:, -1].unsqueeze(-1) + if attention_mask is not None: + attention_mask = attention_mask[:, :, -1:, :] + + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, + "use_cache": use_cache, + "cache": cache, + } + + def __getattr__(self, name): + try: + return super().__getattr__(name) + except AttributeError: + return getattr(getattr(self, self.base_model_prefix), name) + + +UNIMOForMaskedLM = UNIMOLMHeadModel +UNIMOForConditionalGeneration = UNIMOLMHeadModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/tokenizer.py new file mode 100644 index 000000000..e25b65ac4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/unimo/tokenizer.py @@ -0,0 +1,540 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import paddle + +from ...data.vocab import Vocab +from .. import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer + +__all__ = ["UNIMOTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "unimo-text-1.0": 513, + "unimo-text-1.0-lcsts-new": 513, + "unimo-text-1.0-large": 512, +} + + +class UNIMOTokenizer(PretrainedTokenizer): + r""" + Constructs an UNIMO tokenizer. It uses a basic tokenizer to do punctuation + splitting, lower casing and so on, and follows a WordPiece tokenizer to + tokenize as subwords. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file path (ends with '.txt') required to instantiate + a `WordpieceTokenizer`. + do_lower_case (str, optional): + Whether or not to lowercase the input when tokenizing. + Defaults to`True`. + unk_token (str): + A special token representing the *unknown (out-of-vocabulary)* token. + An unknown token is set to be `unk_token` inorder to be converted to an ID. + Defaults to "[UNK]". + sep_token (str): + A special token separating two different sentences in the same input. + Defaults to "[SEP]". + pad_token (str): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to "[PAD]". + cls_token (str): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to "[CLS]". + mask_token (str): + A special token representing a masked token. This is the token used + in the masked language modeling task which the model tries to predict the original unmasked ones. + Defaults to "[MASK]". + + Examples: + .. 
code-block:: + + from paddlenlp.transformers import UNIMOTokenizer + tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0') + encoded_inputs = tokenizer('He was a puppeteer') + # encoded_inputs + #{ + # 'input_ids': [1, 4444, 4385, 1545, 6712, 10062, 9568, 9756, 9500, 2], + # 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + #} + + """ + resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained + pretrained_resource_files_map = { + "vocab_file": { + "unimo-text-1.0": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + "unimo-text-1.0-lcsts-new": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + "unimo-text-1.0-large": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-large-vocab.txt", + "unimo-text-1.0-summary": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + "unimo-text-1.0-dureader_qg": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + "unimo-text-1.0-question-generation": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + "unimo-text-1.0-question-generation-full_domain": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + "unimo-text-1.0-question-generation-dureader_qg": "https://bj.bcebos.com/paddlenlp/models/transformers/unimo/unimo-text-1.0-vocab.txt", + } + } + pretrained_init_configuration = { + "unimo-text-1.0": {"do_lower_case": True}, + "unimo-text-1.0-lcsts-new": {"do_lower_case": True}, + "unimo-text-1.0-large": {"do_lower_case": True}, + "unimo-text-1.0-summary": {"do_lower_case": True}, + "unimo-text-1.0-dureader_qg": {"do_lower_case": True}, + "unimo-text-1.0-question-generation": {"do_lower_case": True}, + "unimo-text-1.0-question-generation-full_domain": {"do_lower_case": True}, + "unimo-text-1.0-question-generation-dureader_qg": {"do_lower_case": True}, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs + ): + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the " + "vocabulary from a pretrained model please use " + "`tokenizer = UNIMOTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token) + + @property + def vocab_size(self): + """ + Return the size of vocabulary. + + Returns: + int: The size of vocabulary. + """ + return len(self.vocab) + + @staticmethod + def load_vocabulary(filepath, unk_token=None, pad_token=None, bos_token=None, eos_token=None, **kwargs): + token_to_idx = {} + with open(filepath, "r", encoding="utf-8") as f: + for line in f: + token, index = line.rstrip("\n").split("\t") + token_to_idx[token] = int(index) + vocab = Vocab.from_dict( + token_to_idx, unk_token=unk_token, pad_token=pad_token, bos_token=bos_token, eos_token=eos_token, **kwargs + ) + return vocab + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + return vocab + + def _tokenize(self, text): + r""" + End-to-end tokenization for UNIMO models. 
+ + Args: + text (str): The text to be tokenized. + + Returns: + List[str]: A list of string representing converted tokens. + """ + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_string(self, tokens): + r""" + Converts a sequence of tokens (list of string) in a single string. Since + the usage of WordPiece introducing `##` to concat subwords, also remove + `##` when converting. + + Args: + tokens (list): A list of string representing tokens to be converted. + + Returns: + str: Converted string from tokens. + + Examples: + .. code-block:: + + from paddlenlp.transformers import UNIMOTokenizer + + tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0') + tokens = tokenizer.tokenize('He was a puppeteer') + + strings = tokenizer.convert_tokens_to_string(tokens) + ''' + he was a puppeteer + ''' + + """ + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + r""" + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair(bool): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + r""" + Build model inputs from a sequence or a pair of sequence for sequence + classification tasks by concatenating and adding special tokens. + + A UNIMO sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (List[int]): + List of IDs to which the special tokens will be added. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of input_id with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + _cls = [self.cls_token_id] + _sep = [self.sep_token_id] + return _cls + token_ids_0 + _sep + token_ids_1 + _sep + + def merge_subword(self, tokens): + r""" + Converts the subwords in a sequence of tokens (list of string) to whole + words, also remove `##` when converting. + + Args: + tokens (List[str]): A list of string representing tokens to be converted. + + Returns: + List[str]: Converted sequence of whole words. + """ + ret = [] + for token in tokens: + if token.startswith("##"): + real_token = token[2:] + if len(ret): + ret[-1] += real_token + else: + ret.append(real_token) + else: + ret.append(token) + + return ret + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + r""" + Build offset map from a pair of offset map by concatenating and adding + offsets of special tokens. + + A UNIMO offset_mapping has the following format: + :: + - single sequence: ``(0,0) X (0,0)`` + - pair of sequences: `(0,0) A (0,0) B (0,0)`` + + Args: + offset_mapping_ids_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_ids_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + Defaults to `None`. 
+ + Returns: + List[tuple]: List of char offsets with the appropriate offsets + of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + r""" + Create a mask from the two sequences passed to be used in a sequence-pair + classification task. + + A UNIMO sequence pair mask has the following format: + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If `token_ids_1` is `None`, this method only returns the first portion + of the mask (0s). + + Args: + token_ids_0 (List[int]): + List of IDs. + token_ids_1 (List[int], optional): + Optional second list of IDs for sequence pairs. + Defaults to `None`. + + Returns: + List[int]: List of token_type_id according to the given sequence(s). + """ + _sep = [self.sep_token_id] + _cls = [self.cls_token_id] + if token_ids_1 is None: + return len(_cls + token_ids_0 + _sep) * [0] + return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1] + + def gen_encode( + self, + source, + title=None, + target=None, + max_seq_len=512, + max_title_len=128, + max_target_len=128, + return_position_ids=True, + return_token_type_ids=True, + return_attention_mask=True, + return_length=False, + add_start_token_for_decoding=False, + pad_to_max_seq_len=False, + return_tensors=False, + is_split_into_words=False, + continuous_position=False, + ): + """ + Main method for encoding the source for generation. It will return a + dictionary containing the encoded sequence and other relative informations + which meets the input format requirements of the UNIMO-text model. + + Args: + source (str): The source text of generation. It should be a string. + target (str, optional): The target text of generation. It should be + set when training the model and should be None when running + inference. Defaults to None. + title (str, optional): The additional information of some of the + generation tasks such as summary. Defaults to None. + max_seq_len (int, optional): The maximum encoded sequence length. + Defaults to 512. + max_target_len (int, optional): The maximum encoded sequence + length of the input `target`. Defaults to 128. + max_title_len (int, optional): The maximum encoded sequence + length of the input `title`. Defaults to 128. + return_position_ids (bool, optional): Whether to return the + position_ids. Defaults to True. + return_token_type_ids (bool, optional): Whether to return the + token_type_ids. Defaults to True. + return_attention_mask (bool, optional): Whether to return the + attention_mask. Defaults to True. + return_length (bool, optional): Whether to return the length of the + encoded sequence. Defaults to False. + add_start_token_for_decoding (bool, optional): Whether to add the + special token "[CLS]" at the end of sequence as the beginning of + the target when running inference to force the model to start + generating target sequence. Defaults to False. + pad_to_max_seq_len (bool, optional): Whether to pad the returned + sequences to the `max_seq_len`. Note that, in this method, + returned sequences will be padded on the left. Defaults to False. + return_tensors (bool, optional): Whether to convert the returned + sequences to Tensor. Defaults to False. + is_split_into_words(bool, optional): Whether or not the input text + (`source`, `target` and `title`) has been pretokenized. + Defaults to False. 
+ continuous_position(bool, optional): Whether the position ids is + continuous between source ids and target ids. Defaults to False. + + Returns: + dict: A dictionary containing the encoded sequence and other + relative informations. + + With the corresponding fields: + + - input_ids (list[int]|Tensor): + A list of indices of input tokens to be feed to UNIMO-text + model. If `return_tensors` is True, it is a Tensor with shape + [1, sequence_length] and data type 'int64'. + - token_type_ids (list[int]|Tensor, optional): + A list of segment token indices to indicate whether the token + belongs to the dialogue target. If `return_tensors` is True, + it is a Tensor with shape [1, sequence_length] and data type + 'int64'. + Being returned when `return_token_type_ids` is set to True. + - position_ids (list[int]|Tensor, optional): + A list of The position indices. If `return_tensors` is True, + it is a Tensor with shape [1, sequence_length] and data type + 'int64'. + Being returned when `return_position_ids` is set to True. + - attention_mask (numpy.ndarray|Tensor, optional): + A numpy.ndarray to prevents attention to some unwanted positions, + with shape [sequence_length, sequence_length] and data type + 'float32'. If `return_tensors` is True, it is a Tensor with shape + [1, 1, sequence_length, sequence_length] and data type 'float32'. + Being returned when `return_attention_mask` is set to True. + - seq_len (int, optional): + The actual length of the `input_ids`, excluding the pad token. + Being returned when `return_length` is set to True. + + Example: + .. code-block:: + + from paddlenlp.transformers import UNIMOTokenizer + tokenizer = UNIMOTokenizer.from_pretrained('unimo-text-1.0') + inputs = tokenizer.gen_encode('He was a puppeteer') + #{'input_ids': [1, 4444, 4385, 1545, 6712, 10062, 9568, 9756, 9500, 2], + #'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + #'position_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + #'attention_mask': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], + #[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)} + """ + + # Input type checking for clearer error + assert isinstance( + source, str + ), "The input `source` must be with type `str` (single context). " " But received: {}".format(source) + assert target is None or isinstance( + target, str + ), "The input `target` must of be with type `str`. But received: {}".format(target) + assert title is None or isinstance( + title, str + ), "The input `title` must of be with type `str`. But received: {}".format(title) + assert max_seq_len > max_title_len + max_target_len, ( + "`max_seq_len` must be greater than the sum of `max_target_len` " + "and `max_title_len`. But received `max_seq_len` is {}, " + "`max_target_len` is {}, `max_title_len` is {}.".format(max_seq_len, max_title_len, max_target_len) + ) + assert target is None or not add_start_token_for_decoding, ( + "`add_start_token_for_decoding` only works when `target` is " + "`None`. 
But received `add_start_token_for_decoding`: `{}`, " + "`target`: {}.".format(add_start_token_for_decoding, target) + ) + + title_ids = [] + if title is not None: + tokens = self._tokenize(title) + title_ids = self.convert_tokens_to_ids(tokens) + if len(title_ids) > max_title_len - 1: + title_ids = title_ids[: max_title_len - 1] + title_ids += [self.sep_token_id] + + target_ids = [] + if target is not None: + tokens = self._tokenize(target) + target_ids = [self.cls_token_id] + self.convert_tokens_to_ids(tokens) + if len(target_ids) > max_target_len - 1: + target_ids = target_ids[: max_target_len - 1] + target_ids += [self.mask_token_id] + elif add_start_token_for_decoding: + target_ids = [self.cls_token_id] + + title_ids = [self.cls_token_id] + title_ids + + max_source_len = max_seq_len - len(title_ids) - len(target_ids) + source_ids = [] + tokens = self._tokenize(source) + source_ids = self.convert_tokens_to_ids(tokens) + + if len(source_ids) > max_source_len - 1: + source_ids = source_ids[: max_source_len - 1] + + source_ids += [self.sep_token_id] + source_ids = title_ids + source_ids + # Build output dictionnary + + encoded_inputs = {} + encoded_inputs["input_ids"] = source_ids + target_ids + # Check lengths + sequence_length = len(encoded_inputs["input_ids"]) + assert sequence_length <= max_seq_len + + # Considering that the logits at the last time step in the API of + # generative task are taken to generate the next token. In order to + # avoid the last time step being a pad, so take padding on the left. + pad_length = max_seq_len - sequence_length if pad_to_max_seq_len else 0 + if pad_length > 0: + encoded_inputs["input_ids"] = [self.pad_token_id] * pad_length + encoded_inputs["input_ids"] + if return_tensors: + # Add dimention for batch_size + encoded_inputs["input_ids"] = paddle.to_tensor(encoded_inputs["input_ids"]).unsqueeze(0) + + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [0] * len(source_ids) + [1] * len(target_ids) + if pad_length > 0: + encoded_inputs["token_type_ids"] = [self.pad_token_id] * pad_length + encoded_inputs["token_type_ids"] + if return_tensors: + # Add dimention for batch_size + encoded_inputs["token_type_ids"] = paddle.to_tensor(encoded_inputs["token_type_ids"]).unsqueeze(0) + + if return_length: + encoded_inputs["seq_len"] = sequence_length + + if return_position_ids: + if continuous_position: + encoded_inputs["position_ids"] = list(range(sequence_length)) + else: + encoded_inputs["position_ids"] = list(range(len(source_ids))) + list(range(len(target_ids))) + if pad_length > 0: + encoded_inputs["position_ids"] = [self.pad_token_id] * pad_length + encoded_inputs["position_ids"] + if return_tensors: + # Add dimention for batch_size + encoded_inputs["position_ids"] = paddle.to_tensor(encoded_inputs["position_ids"]).unsqueeze(0) + + if return_attention_mask: + attention_mask = np.ones((sequence_length, sequence_length), dtype="float32") * -1e4 + start = len(source_ids) + end = sequence_length + attention_mask[:end, :start] = 0.0 + # Generate the lower triangular matrix using the slice of matrix + tmp = np.triu(np.ones([end - start, end - start], dtype="float32") * -1e4, 1) + attention_mask[start:end, start:end] = tmp + encoded_inputs["attention_mask"] = attention_mask + if pad_length > 0: + new_mask = np.ones((max_seq_len, max_seq_len), dtype="float32") * -1e4 + new_mask[-sequence_length:, -sequence_length:] = attention_mask + encoded_inputs["attention_mask"] = new_mask + if return_tensors: + # Add dimentions for batch_size and num_heads + 
encoded_inputs["attention_mask"] = paddle.to_tensor(encoded_inputs["attention_mask"]).unsqueeze((0, 1)) + + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/utils.py new file mode 100644 index 000000000..f785a5358 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/utils.py @@ -0,0 +1,948 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import contextlib +import functools +import hashlib +import importlib +import inspect +import os +import re +import shutil +import sys +import warnings +from contextlib import ExitStack +from io import StringIO +from pathlib import Path +from typing import TYPE_CHECKING, ContextManager, List, Optional, Type, Union + +from filelock import FileLock + +from paddlenlp import __version__ +from paddlenlp.utils.downloader import ( + COMMUNITY_MODEL_PREFIX, + download_check, + get_path_from_url_with_filelock, + is_url, + url_file_exists, +) + +if TYPE_CHECKING: + from paddlenlp.transformers import PretrainedModel + +import numpy as np +import paddle +import tqdm +from huggingface_hub import hf_hub_download, try_to_load_from_cache +from huggingface_hub.utils import EntryNotFoundError +from paddle.common_ops_import import convert_dtype +from paddle.nn import Layer +from requests.exceptions import HTTPError + +from paddlenlp.utils.env import HF_CACHE_HOME, MODEL_HOME +from paddlenlp.utils.import_utils import import_module +from paddlenlp.utils.log import logger + +from ..utils.download import resolve_file_path +from .aistudio_utils import aistudio_download + +HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co" + + +def convert_ndarray_dtype(np_array: np.ndarray, target_dtype: str) -> np.ndarray: + """convert ndarray + + Args: + np_array (np.ndarray): numpy ndarray instance + target_dtype (str): the target dtype + + Returns: + np.ndarray: converted numpy ndarray instance + """ + source_dtype = convert_dtype(np_array.dtype) + if source_dtype == "uint16" or target_dtype == "bfloat16": + tensor = paddle.to_tensor(np_array) + tensor = paddle.cast(tensor, target_dtype) + return tensor.cpu().numpy() + + # TODO(wj-Mcat): device_guard will slow the converting + # with device_guard("cpu"): + # tensor = paddle.to_tensor(np_array) + # tensor = paddle.cast(tensor, target_dtype) + # return tensor.cpu().numpy() + + if target_dtype == "bfloat16": + target_dtype = "uint16" + + return np_array.astype(target_dtype) + + +def convert_to_dict_message(conversation: List[List[str]]): + """Convert the list of chat messages to a role dictionary chat messages.""" + conversations = [] + for index, item in enumerate(conversation): + assert 1 <= len(item) <= 2, "Each Rounds in conversation should have 1 or 2 elements." 
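+        # item[0] is taken as the user turn; item[1], when present, is the assistant reply.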
+ if isinstance(item[0], str): + conversations.append({"role": "user", "content": item[0]}) + if len(item) == 2 and isinstance(item[1], str): + conversations.append({"role": "assistant", "content": item[1]}) + else: + # If there is only one element in item, it must be the last round. + # If it is not the last round, it must be an error. + if index != len(conversation) - 1: + raise ValueError(f"Round {index} has error round") + else: + raise ValueError("Each round in list should be string") + return conversations + + +def get_scale_by_dtype(dtype: str = None, return_positive: bool = True) -> float: + """get scale value by dtype + + Args: + dtype (str): the string dtype value + + Returns: + float: the scale value + """ + if dtype is None: + dtype = paddle.get_default_dtype() + + dtype = convert_dtype(dtype) + scale_value = 1e6 + + # TODO(wj-Mcaf): support int8, int4 dtypes later + if dtype == "float16": + scale_value = 1e4 + + if return_positive: + return scale_value + return -1 * scale_value + + +def fn_args_to_dict(func, *args, **kwargs): + """ + Inspect function `func` and its arguments for running, and extract a + dict mapping between argument names and keys. + """ + if hasattr(inspect, "getfullargspec"): + (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(func) + else: + (spec_args, spec_varargs, spec_varkw, spec_defaults) = inspect.getargspec(func) + # add positional argument values + init_dict = dict(zip(spec_args, args)) + # add default argument values + kwargs_dict = dict(zip(spec_args[-len(spec_defaults) :], spec_defaults)) if spec_defaults else {} + for k in list(kwargs_dict.keys()): + if k in init_dict: + kwargs_dict.pop(k) + kwargs_dict.update(kwargs) + init_dict.update(kwargs_dict) + return init_dict + + +def adapt_stale_fwd_patch(self, name, value): + """ + Since there are some monkey patches for forward of PretrainedModel, such as + model compression, we make these patches compatible with the latest forward + method. + """ + if name == "forward": + # NOTE(guosheng): In dygraph to static, `layer.forward` would be patched + # by an instance of `StaticFunction`. And use string compare to avoid to + # import fluid. + if type(value).__name__.endswith("StaticFunction") or self.forward.__class__.__name__.endswith( + "StaticFunction" + ): + return value + if hasattr(inspect, "getfullargspec"): + ( + patch_spec_args, + patch_spec_varargs, + patch_spec_varkw, + patch_spec_defaults, + _, + _, + _, + ) = inspect.getfullargspec(value) + (spec_args, spec_varargs, spec_varkw, spec_defaults, _, _, _) = inspect.getfullargspec(self.forward) + else: + (patch_spec_args, patch_spec_varargs, patch_spec_varkw, patch_spec_defaults) = inspect.getargspec(value) + (spec_args, spec_varargs, spec_varkw, spec_defaults) = inspect.getargspec(self.forward) + new_args = [ + arg + for arg in ("output_hidden_states", "output_attentions", "return_dict") + if arg not in patch_spec_args and arg in spec_args + ] + + if new_args: + if self.__module__.startswith("paddlenlp"): + warnings.warn( + f"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} is patched and the patch " + "might be based on an old oversion which missing some " + f"arguments compared with the latest, such as {new_args}. " + "We automatically add compatibility on the patch for " + "these arguemnts, and maybe the patch should be updated." 
+ ) + else: + warnings.warn( + f"The `forward` method of {self.__class__ if isinstance(self, Layer) else self} " + "is patched and the patch might be conflict with patches made " + f"by paddlenlp which seems have more arguments such as {new_args}. " + "We automatically add compatibility on the patch for " + "these arguemnts, and maybe the patch should be updated." + ) + if isinstance(self, Layer) and inspect.isfunction(value): + + @functools.wraps(value) + def wrap_fwd(*args, **kwargs): + for arg in new_args: + kwargs.pop(arg, None) + return value(self, *args, **kwargs) + + else: + + @functools.wraps(value) + def wrap_fwd(*args, **kwargs): + for arg in new_args: + kwargs.pop(arg, None) + return value(*args, **kwargs) + + return wrap_fwd + return value + + +class InitTrackerMeta(type(Layer)): + """ + This metaclass wraps the `__init__` method of a class to add `init_config` + attribute for instances of that class, and `init_config` use a dict to track + the initial configuration. If the class has `_pre_init` or `_post_init` + method, it would be hooked before or after `__init__` and called as + `_pre_init(self, init_fn, init_args)` or `_post_init(self, init_fn, init_args)`. + Since InitTrackerMeta would be used as metaclass for pretrained model classes, + which always are Layer and `type(Layer)` is not `type`, thus use `type(Layer)` + rather than `type` as base class for it to avoid inheritance metaclass + conflicts. + """ + + def __init__(cls, name, bases, attrs): + init_func = cls.__init__ + # If attrs has `__init__`, wrap it using accessable `_pre_init, _post_init`. + # Otherwise, no need to wrap again since the super cls has been wraped. + # TODO: remove reduplicated tracker if using super cls `__init__` + pre_init_func = getattr(cls, "_pre_init", None) if "__init__" in attrs else None + post_init_func = getattr(cls, "_post_init", None) if "__init__" in attrs else None + cls.__init__ = InitTrackerMeta.init_and_track_conf(init_func, pre_init_func, post_init_func) + super(InitTrackerMeta, cls).__init__(name, bases, attrs) + + @staticmethod + def init_and_track_conf(init_func, pre_init_func=None, post_init_func=None): + """ + wraps `init_func` which is `__init__` method of a class to add `init_config` + attribute for instances of that class. + Args: + init_func (callable): It should be the `__init__` method of a class. + warning: `self` always is the class type of down-stream model, eg: BertForTokenClassification + pre_init_func (callable, optional): If provided, it would be hooked after + `init_func` and called as `pre_init_func(self, init_func, *init_args, **init_args)`. + Default None. + post_init_func (callable, optional): If provided, it would be hooked after + `init_func` and called as `post_init_func(self, init_func, *init_args, **init_args)`. + Default None. 
+ + Returns: + function: the wrapped function + """ + + @functools.wraps(init_func) + def __impl__(self, *args, **kwargs): + # registed helper by `pre_init_func` + if pre_init_func: + pre_init_func(self, init_func, *args, **kwargs) + # keep full configuration + init_func(self, *args, **kwargs) + # registed helper by `post_init_func` + if post_init_func: + post_init_func(self, init_func, *args, **kwargs) + self.init_config = kwargs + if args: + kwargs["init_args"] = args + kwargs["init_class"] = self.__class__.__name__ + + return __impl__ + + def __setattr__(self, name, value): + value = adapt_stale_fwd_patch(self, name, value) + return super(InitTrackerMeta, self).__setattr__(name, value) + + +def param_in_func(func, param_field: str) -> bool: + """check if the param_field is in `func` method, eg: if the `bert` param is in `__init__` method + + Args: + cls (type): the class of PretrainedModel + param_field (str): the name of field + + Returns: + bool: the result of existence + """ + + if hasattr(inspect, "getfullargspec"): + result = inspect.getfullargspec(func) + else: + result = inspect.getargspec(func) + + return param_field in result[0] + + +def resolve_cache_dir(from_hf_hub: bool, from_aistudio: bool, cache_dir: Optional[str] = None) -> str: + """resolve cache dir for PretrainedModel and PretrainedConfig + + Args: + from_hf_hub (bool): if load from huggingface hub + cache_dir (str): cache_dir for models + """ + if cache_dir is not None: + return cache_dir + if from_aistudio: + return None + if from_hf_hub: + return HF_CACHE_HOME + return MODEL_HOME + + +def find_transformer_model_type(model_class: Type) -> str: + """get the model type from module name, + eg: + BertModel -> bert, + RobertaForTokenClassification -> roberta + + Args: + model_class (Type): the class of model + + Returns: + str: the type string + """ + from paddlenlp.transformers import PretrainedModel + + default_model_type = "" + + if not issubclass(model_class, PretrainedModel): + return default_model_type + + module_name: str = model_class.__module__ + if not module_name.startswith("paddlenlp.transformers."): + return default_model_type + + tokens = module_name.split(".") + if len(tokens) < 3: + return default_model_type + + return tokens[2] + + +def find_transformer_model_class_by_name(model_name: str) -> Optional[Type[PretrainedModel]]: + """find transformer model_class by name + + Args: + model_name (str): the string of class name + + Returns: + Optional[Type[PretrainedModel]]: optional pretrained-model class + """ + transformer_module = import_module("paddlenlp.transformers") + + for obj_name in dir(transformer_module): + if obj_name.startswith("_"): + continue + obj = getattr(transformer_module, obj_name, None) + if obj is None: + continue + + name = getattr(obj, "__name__", None) + if name is None: + continue + + if name == model_name: + return obj + logger.debug(f"can not find model_class<{model_name}>") + return None + + +def convert_file_size_to_int(size: Union[int, str]): + """ + Converts a size expressed as a string with digits an unit (like `"5MB"`) to an integer (in bytes). + Args: + size (`int` or `str`): The size to convert. Will be directly returned if an `int`. 
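+            Supported suffixes are `"KB"`/`"MB"`/`"GB"` (decimal) and `"KiB"`/`"MiB"`/`"GiB"` (binary); a trailing
+            lowercase `b` (e.g. `"5Mb"`) is interpreted as bits and divided by 8.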
+ Example: + ```py + >>> convert_file_size_to_int("1MiB") + 1048576 + ``` + """ + if isinstance(size, int): + return size + if size.upper().endswith("GIB"): + return int(size[:-3]) * (2**30) + if size.upper().endswith("MIB"): + return int(size[:-3]) * (2**20) + if size.upper().endswith("KIB"): + return int(size[:-3]) * (2**10) + if size.upper().endswith("GB"): + int_size = int(size[:-2]) * (10**9) + return int_size // 8 if size.endswith("b") else int_size + if size.upper().endswith("MB"): + int_size = int(size[:-2]) * (10**6) + return int_size // 8 if size.endswith("b") else int_size + if size.upper().endswith("KB"): + int_size = int(size[:-2]) * (10**3) + return int_size // 8 if size.endswith("b") else int_size + raise ValueError("`size` is not in a valid format. Use an integer followed by the unit, e.g., '5GB'.") + + +def paddlenlp_hub_download( + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + pretrained_model_name_or_path: str = None, +) -> str: + if subfolder is None: + subfolder = "" + if pretrained_model_name_or_path is not None and is_url(repo_id): + cache_dir = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) + else: + cache_dir = os.path.join(cache_dir, repo_id, subfolder) + + # check in cache_dir + weight_file_path = os.path.join(cache_dir, filename) + + if os.path.exists(weight_file_path): + logger.info(f"Already cached {weight_file_path}") + return weight_file_path + + # Download from custom model url + if is_url(repo_id): + # check wether the target file exist in the comunity bos server + if url_file_exists(repo_id): + logger.info(f"Downloading {repo_id}") + weight_file_path = get_path_from_url_with_filelock(repo_id, cache_dir) + # # check the downloaded weight file and registered weight file name + download_check(repo_id, "paddlenlp_hub_download") + + # make sure that model states names: model_states.pdparams + new_weight_file_path = os.path.join(os.path.split(weight_file_path)[0], filename) + + if weight_file_path != new_weight_file_path: + # create lock file, which is empty, under the `LOCK_FILE_HOME` directory. + lock_file_name = hashlib.md5((repo_id + cache_dir).encode("utf-8")).hexdigest() + # create `.lock` private directory in the cache dir + lock_file_path = os.path.join(cache_dir, ".lock", lock_file_name) + + with FileLock(lock_file_path): + if not os.path.exists(new_weight_file_path): + shutil.move(weight_file_path, new_weight_file_path) + + weight_file_path = new_weight_file_path + + return weight_file_path + + return None + + # find in community repo + url_list = [COMMUNITY_MODEL_PREFIX, repo_id, filename] + if subfolder != "": + url_list.insert(2, subfolder) + community_model_file_path = "/".join(url_list) + assert is_url(community_model_file_path) + + # check wether the target file exist in the comunity bos server + if url_file_exists(community_model_file_path): + logger.info(f"Downloading {community_model_file_path}") + weight_file_path = get_path_from_url_with_filelock(community_model_file_path, cache_dir) + # # check the downloaded weight file and registered weight file name + download_check(community_model_file_path, "paddlenlp_hub_download") + return weight_file_path + + return None + + +# Return value when trying to load a file from cache but the file does not exist in the distant repo. 
+_CACHED_NO_EXIST = object() + + +def cached_file( + path_or_repo_id: Union[str, os.PathLike], + filename: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + subfolder: str = "", + from_aistudio: bool = False, + _raise_exceptions_for_missing_entries: bool = True, + _raise_exceptions_for_connection_errors: bool = True, + pretrained_model_name_or_path=None, +) -> str: + """ + Tries to locate a file in a local folder and repo, downloads and cache it if necessary. + Args: + path_or_repo_id (`str` or `os.PathLike`): + This can be either: + - a string, the *model id* of a model repo on huggingface.co. + - a path to a *directory* potentially containing the file. + filename (`str`): + The name of the file to locate in `path_or_repo`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + + Returns: + `Optional[str]`: Returns the resolved file (to the cache folder if downloaded from a repo). + Examples: + ```python + # Download a model weight from the Hub and cache it. + model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin") + ``` + """ + + if subfolder is None: + subfolder = "" + + path_or_repo_id = str(path_or_repo_id) + full_filename = os.path.join(subfolder, filename) + if os.path.isdir(path_or_repo_id): + resolved_file = os.path.join(os.path.join(path_or_repo_id, subfolder), filename) + if not os.path.isfile(resolved_file): + if _raise_exceptions_for_missing_entries: + raise EnvironmentError( + f"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout " + f"'https://huggingface.co/{path_or_repo_id}/' for available files." 
+ ) + else: + return None + return resolved_file + + if cache_dir is not None and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if from_aistudio: + try: + resolved_file = aistudio_download( + repo_id=path_or_repo_id, filename=filename, subfolder=subfolder, cache_dir=cache_dir + ) + except: + resolved_file = None + else: + # if cache_dir is None: + # cache_dir = os.path.join(MODEL_HOME, ".cache") + try: + # Load from URL or cache if already cached + resolved_file = paddlenlp_hub_download( + path_or_repo_id, + filename, + subfolder=None if len(subfolder) == 0 else subfolder, + # revision=revision, + cache_dir=cache_dir, + pretrained_model_name_or_path=pretrained_model_name_or_path, + ) + except HTTPError as err: + # First we try to see if we have a cached version (not up to date): + resolved_file = try_to_load_from_cache(path_or_repo_id, full_filename, cache_dir=cache_dir) + if resolved_file is not None and resolved_file != _CACHED_NO_EXIST: + return resolved_file + if not _raise_exceptions_for_connection_errors: + return None + + raise EnvironmentError( + f"There was a specific connection error when trying to load {path_or_repo_id}:\n{err}" + ) + + return resolved_file + + +def cached_file_for_hf_hub( + path_or_repo_id: Union[str, os.PathLike], + filename: str, + cache_dir: Optional[Union[str, os.PathLike]] = None, + subfolder: str = "", + _raise_exceptions_for_missing_entries: bool = True, +): + + if subfolder is None: + subfolder = "" + + path_or_repo_id = str(path_or_repo_id) + full_filename = os.path.join(subfolder, filename) + if os.path.isdir(path_or_repo_id): + resolved_file = os.path.join(os.path.join(path_or_repo_id, subfolder), filename) + if not os.path.isfile(resolved_file): + if _raise_exceptions_for_missing_entries: + raise EnvironmentError( + f"{path_or_repo_id} does not appear to have a file named {full_filename}. Checkout " + f"'https://huggingface.co/{path_or_repo_id}' for available files." + ) + else: + return None + return resolved_file + + if cache_dir is None: + cache_dir = os.path.join(MODEL_HOME, ".cache") + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + try: + # Load from URL or cache if already cached + download_check(path_or_repo_id, full_filename, addition="from_hf_hub") + resolved_file = hf_hub_download( + repo_id=path_or_repo_id, + filename=filename, + cache_dir=cache_dir, + subfolder=subfolder, + library_name="PaddleNLP", + library_version=__version__, + ) + return resolved_file + except Exception as e: + print(e) + msg = f""" + {path_or_repo_id} is not a local folder and is not a valid model identifier " + "listed on 'https://huggingface.co/models' If this is a private repository, make sure to " + "pass a token having permission to this repo with `use_auth_token` or log in with " + "`huggingface-cli login` and pass `use_auth_token=True`. + """ + if _raise_exceptions_for_missing_entries: + raise EnvironmentError(msg) + else: + logger.info(msg) + return None + + +def get_checkpoint_shard_files( + pretrained_model_name_or_path, + index_filename, + cache_dir=None, + subfolder="", + from_aistudio=False, + from_hf_hub=False, +): + """ + For a given model: + - download and cache all the shards of a sharded checkpoint if `pretrained_model_name_or_path` is a model ID on the + Hub + - returns the list of paths to all the shards, as well as some metadata. + For the description of each arg, see [`PretrainedModel.from_pretrained`]. 
`index_filename` is the full path to the + index (downloaded and cached if `pretrained_model_name_or_path` is a model ID on the Hub). + """ + + import json + + if not os.path.isfile(index_filename): + raise ValueError(f"Can't find a checkpoint index ({index_filename}) in {pretrained_model_name_or_path}.") + + with open(index_filename, "r") as f: + index = json.loads(f.read()) + + shard_filenames = sorted(set(index["weight_map"].values())) + sharded_metadata = index["metadata"] + sharded_metadata["all_checkpoint_keys"] = list(index["weight_map"].keys()) + sharded_metadata["weight_map"] = index["weight_map"].copy() + + file_map = {file: set() for file in shard_filenames} + for weight, file in index["weight_map"].items(): + file_map[file].add(weight) + + sharded_metadata["file_map"] = file_map + + # First, let's deal with local folder. + if os.path.isdir(pretrained_model_name_or_path): + shard_filenames = [os.path.join(pretrained_model_name_or_path, subfolder, f) for f in shard_filenames] + return shard_filenames, sharded_metadata + + # At this stage pretrained_model_name_or_path is a model identifier on the Hub + cached_filenames = [] + # Check if the model is already cached or not. We only try the last checkpoint, this should cover most cases of + # downloaded (if interrupted). + last_shard = try_to_load_from_cache( + pretrained_model_name_or_path, + shard_filenames[-1], + cache_dir=cache_dir, + ) + + show_progress_bar = last_shard is None + for shard_filename in tqdm.tqdm(shard_filenames, desc="Downloading shards", disable=not show_progress_bar): + try: + cached_filename = resolve_file_path( + pretrained_model_name_or_path, + [shard_filename], + subfolder, + cache_dir=cache_dir, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + assert ( + cached_filename is not None + ), f"please make sure {shard_filename} under {pretrained_model_name_or_path}" + # We have already dealt with RepositoryNotFoundError and RevisionNotFoundError when getting the index, so + # we don't have to catch them here. + except EntryNotFoundError: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {shard_filename} which is " + "required according to the checkpoint index." + ) + except HTTPError: + raise EnvironmentError( + f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load {shard_filename}. You should try" + " again after checking your internet connection." + ) + + cached_filenames.append(cached_filename) + + return cached_filenames, sharded_metadata + + +def is_safetensors_available(): + return importlib.util.find_spec("safetensors") is not None + + +@contextlib.contextmanager +def device_guard(device="cpu", dev_id=0): + origin_device = paddle.device.get_device() + if device == "cpu": + paddle.set_device(device) + elif device in ["gpu", "xpu", "npu"]: + paddle.set_device("{}:{}".format(device, dev_id)) + try: + yield + finally: + paddle.set_device(origin_device) + + +def paddlenlp_load(path, map_location="cpu"): + assert map_location in ["cpu", "gpu", "xpu", "npu", "numpy", "np"] + if map_location in ["numpy", "np"]: + return paddle.load(path, return_numpy=True) + else: + with device_guard(map_location): + return paddle.load(path) + # TODO(zhonghui03): the following code has problems when hot start optimizer checkpoint. 
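+    # NOTE: both branches above already return, so the block below is unreachable at present;
+    # it is kept for reference until the TODO above is resolved.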
+ if map_location == "cpu": + from paddle.framework.io import ( + _parse_every_object, + _to_LodTensor, + _transformed_from_lodtensor, + ) + + def _ndarray_to_tensor(obj, return_numpy=False): + if return_numpy: + return obj + if paddle.in_dynamic_mode(): + return paddle.Tensor(obj, zero_copy=True) + else: + return _to_LodTensor(obj) + + state_dict = paddle.load(path, return_numpy=True) + # Hack for zero copy for saving loading time. for paddle.load there need copy to create paddle.Tensor + return _parse_every_object(state_dict, _transformed_from_lodtensor, _ndarray_to_tensor) + + else: + return paddle.load(path) + + +def is_paddle_support_lazy_init(): + return hasattr(paddle, "LazyGuard") + + +class ContextManagers: + """ + Wrapper for `contextlib.ExitStack` which enters a collection of context managers. Adaptation of `ContextManagers` + in the `fastcore` library. + """ + + def __init__(self, context_managers: List[ContextManager]): + self.context_managers = context_managers + self.stack = ExitStack() + + def __enter__(self): + for context_manager in self.context_managers: + self.stack.enter_context(context_manager) + + def __exit__(self, *args, **kwargs): + self.stack.__exit__(*args, **kwargs) + + +def use_hybrid_parallel(): + try: + from paddle.distributed import fleet + + hcg = fleet.get_hybrid_communicate_group() + return hcg + except: + return None + + +def optimizer_name_suffix(): + hcg = use_hybrid_parallel() + if hcg is not None: + name = [] + if hcg.get_model_parallel_world_size() > 1: + name.append(f"tp{hcg.get_model_parallel_rank():0>2d}") + if hcg.get_pipe_parallel_world_size() > 1: + name.append(f"pp{hcg.get_stage_id():0>2d}") + if hcg.get_sharding_parallel_world_size() > 1: + name.append(f"shard{hcg.get_sharding_parallel_rank():0>2d}") + + return "_".join(name) + else: + return None + + +def weight_name_suffix(): + hcg = use_hybrid_parallel() + if hcg is not None: + name = [] + if hcg.get_model_parallel_world_size() > 1: + name.append(f"tp{hcg.get_model_parallel_rank():0>2d}") + if hcg.get_pipe_parallel_world_size() > 1: + name.append(f"pp{hcg.get_stage_id():0>2d}") + return "_".join(name) + else: + return None + + +def dtype_byte_size(dtype): + """ + Returns the size (in bytes) occupied by one parameter of type `dtype`. + + Example: + + ```py + >>> dtype_byte_size(paddle.float32) + 4 + ``` + """ + if dtype == paddle.bool: + return 1 / 8 + bit_search = re.search(r"[^\d](\d+)$", str(dtype)) + if bit_search is None: + raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") + bit_size = int(bit_search.groups()[0]) + return bit_size // 8 + + +def apply_print_resets(buf): + return re.sub(r"^.*\r", "", buf, 0, re.M) + + +class CaptureStd: + """ + Context manager to capture: + + - stdout: replay it, clean it up and make it available via `obj.out` + - stderr: replay it and make it available via `obj.err` + + Args: + out (`bool`, *optional*, defaults to `True`): Whether to capture stdout or not. + err (`bool`, *optional*, defaults to `True`): Whether to capture stderr or not. + replay (`bool`, *optional*, defaults to `True`): Whether to replay or not. + By default each captured stream gets replayed back on context's exit, so that one can see what the test was + doing. If this is a not wanted behavior and the captured data shouldn't be replayed, pass `replay=False` to + disable this feature. 
+ + Examples: + + ```python + # to capture stdout only with auto-replay + with CaptureStdout() as cs: + print("Secret message") + assert "message" in cs.out + + # to capture stderr only with auto-replay + import sys + + with CaptureStderr() as cs: + print("Warning: ", file=sys.stderr) + assert "Warning" in cs.err + + # to capture both streams with auto-replay + with CaptureStd() as cs: + print("Secret message") + print("Warning: ", file=sys.stderr) + assert "message" in cs.out + assert "Warning" in cs.err + + # to capture just one of the streams, and not the other, with auto-replay + with CaptureStd(err=False) as cs: + print("Secret message") + assert "message" in cs.out + # but best use the stream-specific subclasses + + # to capture without auto-replay + with CaptureStd(replay=False) as cs: + print("Secret message") + assert "message" in cs.out + ```""" + + def __init__(self, out=True, err=True, replay=True): + self.replay = replay + + if out: + self.out_buf = StringIO() + self.out = "error: CaptureStd context is unfinished yet, called too early" + else: + self.out_buf = None + self.out = "not capturing stdout" + + if err: + self.err_buf = StringIO() + self.err = "error: CaptureStd context is unfinished yet, called too early" + else: + self.err_buf = None + self.err = "not capturing stderr" + + def __enter__(self): + if self.out_buf: + self.out_old = sys.stdout + sys.stdout = self.out_buf + + if self.err_buf: + self.err_old = sys.stderr + sys.stderr = self.err_buf + + return self + + def __exit__(self, *exc): + if self.out_buf: + sys.stdout = self.out_old + captured = self.out_buf.getvalue() + if self.replay: + sys.stdout.write(captured) + self.out = apply_print_resets(captured) + + if self.err_buf: + sys.stderr = self.err_old + captured = self.err_buf.getvalue() + if self.replay: + sys.stderr.write(captured) + self.err = captured + + def __repr__(self): + msg = "" + if self.out_buf: + msg += f"stdout: {self.out}\n" + if self.err_buf: + msg += f"stderr: {self.err}\n" + return msg diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/__init__.py new file mode 100644 index 000000000..595add0ae --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/configuration.py new file mode 100644 index 000000000..ac9312a76 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/configuration.py @@ -0,0 +1,338 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" VisualGLM model configuration """
+import copy
+import os
+from typing import Union
+
+from ...utils.log import logger
+from ..chatglm.configuration import ChatGLMConfig
+from ..configuration_utils import PretrainedConfig
+
+__all__ = ["VisualGLMVisionConfig", "VisualGLMQFormerConfig", "VisualGLMConfig"]
+
+
+class VisualGLMVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`VisualGLMVisionModel`]. It is used to instantiate a
+    VisualGLM vision encoder according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        hidden_size (`int`, *optional*, defaults to 1408):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 39):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 14):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_factor (`float`, *optional*, defaults to 1):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries and values in the self-attention layers.
+    Example:
+    ```python
+    >>> from paddlenlp.transformers import VisualGLMVisionConfig, VisualGLMVisionModel
+    >>> # Initializing a VisualGLMVisionConfig
+    >>> configuration = VisualGLMVisionConfig()
+    >>> # Initializing a VisualGLMVisionModel (with random weights) from the configuration above.
+ >>> model = VisualGLMVisionModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "visualglm_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + dropout=0.1, + attention_dropout=0.1, + initializer_range=1e-10, + initializer_factor=1.0, + qkv_bias=True, + **kwargs, + ): + kwargs["return_dict"] = kwargs.pop("return_dict", True) + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + # get the vision config dict if we are loading from VisualGLMConfig + if config_dict.get("model_type") == "visualglm": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class VisualGLMQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VisualGLMQFormerModel`]. It is used to instantiate a + VisualGLM Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + Note that [`VisualGLMQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. 
+ max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + Examples: + ```python + >>> from paddlenlp.transformers import VisualGLMQFormerConfig, VisualGLMQFormerModel + >>> # Initializing a VisualGLM configuration + >>> configuration = VisualGLMQFormerConfig() + >>> # Initializing a model (with random weights) from the configuration above + >>> model = VisualGLMQFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "visualglm_qformer_model" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + classifier_dropout=None, + cross_attention_frequency=2, + encoder_hidden_size=1408, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.classifier_dropout = classifier_dropout + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from VisualGLMConfig + if config_dict.get("model_type") == "visualglm": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != 
cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class VisualGLMConfig(PretrainedConfig): + r""" + [`VisualGLMConfig`] is the configuration class to store the configuration of a [`VisualGLMForConditionalGeneration`]. It is + used to instantiate a VisualGLM model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`VisualGLMVisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`VisualGLMQFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + kwargs (*optional*): + Dictionary of keyword arguments. + Example: + ```python + >>> from paddlenlp.transformers import ( + ... VisualGLMVisionConfig, + ... VisualGLMQFormerConfig, + ... ChatGLMConfig, + ... VisualGLMConfig, + ... VisualGLMForConditionalGeneration, + ... ) + >>> # Initializing a VisualGLMConfig configuration + >>> configuration = VisualGLMConfig() + >>> # Initializing a VisualGLMForConditionalGeneration (with random weights) from the configuration above + >>> model = VisualGLMForConditionalGeneration(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + >>> # We can also initialize a VisualGLMConfig from a VisualGLMVisionConfig, VisualGLMQFormerConfig and any PretrainedConfig + >>> # Initializing VisualGLM vision, VisualGLM Q-Former and language model configurations + >>> vision_config = VisualGLMVisionConfig() + >>> qformer_config = VisualGLMQFormerConfig() + >>> text_config = ChatGLMConfig() + >>> config = VisualGLMConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "visualglm" + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the VisualGLMVisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the VisualGLMQFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`ChatGLMConfig`).") + self.vision_config = VisualGLMVisionConfig(**vision_config) + self.qformer_config = VisualGLMQFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "chatglm" + + if text_model_type == "chatglm": + self.text_config = ChatGLMConfig(**text_config) + else: + raise ValueError("Only chatglm accepted for model_type, but accepted {}.".format(text_model_type)) + + self.num_query_tokens = num_query_tokens + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: VisualGLMVisionConfig, + qformer_config: VisualGLMQFormerConfig, + text_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`VisualGLMConfig`] (or a derived class) from a vision model, Q-Former and language model + configurations. + Returns: + [`VisualGLM`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + **kwargs, + ) + + def to_dict(self, *args, **kwargs): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/image_processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/image_processing.py new file mode 100644 index 000000000..9d02c5597 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/image_processing.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for VisualGLM.""" + +from typing import Dict, List, Optional, Union + +import numpy as np +import PIL + +from ..image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ..image_transforms import ( + convert_to_rgb, + normalize, + rescale, + resize, + to_channel_dimension_format, +) +from ..image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + is_batched, + to_numpy_array, + valid_images, +) +from ..tokenizer_utils_base import TensorType + +__all__ = [ + "VisualGLMImageProcessor", +] + + +class VisualGLMImageProcessor(BaseImageProcessor): + r""" + Constructs a VisualGLM image processor. 
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
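+
+    Examples:
+    ```python
+    >>> # Minimal usage sketch; assumes `image` already holds a PIL.Image.Image or np.ndarray.
+    >>> from paddlenlp.transformers import VisualGLMImageProcessor
+    >>> image_processor = VisualGLMImageProcessor()
+    >>> inputs = image_processor(images=image, return_tensors="pd")
+    ```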
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs + ) -> None: + super().__init__(**kwargs) + default_image_mean = [0.48145466, 0.4578275, 0.40821073] + default_image_std = [0.26862954, 0.26130258, 0.27577711] + size = size if size is not None else {"height": 224, "width": 224} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else default_image_mean + self.image_std = image_std if image_std is not None else default_image_std + self.do_convert_rgb = do_convert_rgb + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + size = get_size_dict(size, default_to_square=True) + output_size = (size["width"], size["height"]) + return resize(image, size=output_size, resample=resample, data_format=data_format, **kwargs) + + def rescale( + self, + image: np.ndarray, + scale: Union[int, float], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ): + """ + Rescale an image by a scale factor. image = image * scale. + + Args: + image (`np.ndarray`): + Image to rescale. + scale (`int` or `float`): + Scale to apply to the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + """ + return rescale(image, scale=scale, data_format=data_format, **kwargs) + + def normalize( + self, + image: np.ndarray, + mean: Union[float, List[float]], + std: Union[float, List[float]], + data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs + ) -> np.ndarray: + """ + Normalize an image. image = (image - image_mean) / image_std. + + Args: + image (`np.ndarray`): + Image to normalize. + mean (`float` or `List[float]`): + Image mean. + std (`float` or `List[float]`): + Image standard deviation. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. 
+ """ + return normalize(image, mean=mean, std=std, data_format=data_format, **kwargs) + + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: bool = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + **kwargs, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` while preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: defaults to the channel dimension format of the input image. 
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + + if not is_batched(images): + images = [images] + + if not valid_images(images): + raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.") + + if do_resize and size is None or resample is None: + raise ValueError("Size and resample must be specified if do_resize is True.") + + if do_rescale and rescale_factor is None: + raise ValueError("Rescale factor must be specified if do_rescale is True.") + + if do_normalize and (image_mean is None or image_std is None): + raise ValueError("Image mean and std must be specified if do_normalize is True.") + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_resize: + images = [self.resize(image=image, size=size, resample=resample) for image in images] + + if do_rescale: + images = [self.rescale(image=image, scale=rescale_factor) for image in images] + + if do_normalize: + images = [self.normalize(image=image, mean=image_mean, std=image_std) for image in images] + + images = [to_channel_dimension_format(image, data_format) for image in images] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/modeling.py new file mode 100644 index 000000000..34c8590e3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/modeling.py @@ -0,0 +1,1550 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from paddlenlp.utils.log import logger + +from ...utils.initializer import normal_, ones_, zeros_ +from ..activations import ACT2FN +from ..chatglm.configuration import ChatGLMConfig +from ..chatglm.modeling import ChatGLMForCausalLM +from ..model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + ModelOutput, +) +from ..model_utils import ( + PretrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) + +VisualGLM_PRETRAINED_MODEL_ARCHIVE_LIST = [] + +from .configuration import ( + VisualGLMConfig, + VisualGLMQFormerConfig, + VisualGLMVisionConfig, +) + +__all__ = [ + "VisualGLMModel", + "VisualGLMPretrainedModel", + "VisualGLMQFormerModel", + "VisualGLMVisionModel", + "VisualGLMForConditionalGeneration", +] + + +def Parameter(tensor, dtype="float16"): + tensor = paddle.cast(tensor, dtype) + return paddle.create_parameter(tensor.shape, dtype=tensor.dtype, default_initializer=nn.initializer.Assign(tensor)) + + +@dataclass +class VisualGLMForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`VisualGLMForConditionalGeneration`]. + Args: + loss (`paddle.Tensor`, *optional*, returned when `labels` is provided, `paddle.Tensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[paddle.Tensor]] = None + logits: Optional[Tuple[paddle.Tensor]] = None + vision_outputs: Optional[paddle.Tensor] = None + qformer_outputs: Optional[Tuple[paddle.Tensor]] = None + language_model_outputs: Optional[Tuple[paddle.Tensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class VisualGLMPretrainedModel(PretrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
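A small, standalone illustration of the initialization scheme applied in `_init_weights` below; the shapes and the `std` value are placeholders for `config.initializer_range`, and the callable-initializer style mirrors the `TruncatedNormal` usage in the method itself.

```python
import paddle
import paddle.nn as nn

std = 0.02  # stands in for config.initializer_range

# Linear / Conv2D / Embedding weights: Normal(0, std), biases zeroed.
linear = nn.Linear(8, 8)
nn.initializer.Normal(mean=0.0, std=std)(linear.weight)
nn.initializer.Constant(0.0)(linear.bias)

# Vision class/position embeddings: TruncatedNormal(0, std).
pos_embedding = paddle.create_parameter([1, 197, 8], dtype="float32")
nn.initializer.TruncatedNormal(mean=0.0, std=std)(pos_embedding)
```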
+ """ + + config_class = VisualGLMConfig + base_model_prefix = "visualglm" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [ + r"position_ids", + ] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2D) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + normal_(module.weight, mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + zeros_(module.bias) + + if isinstance(module, VisualGLMVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + trunc_normal_ = nn.initializer.TruncatedNormal(mean=0.0, std=factor) + trunc_normal_(module.position_embedding) + trunc_normal_( + module.class_embedding, + ) + elif isinstance(module, nn.LayerNorm): + zeros_(module.bias) + ones_(module.weight) + elif isinstance(module, nn.Linear) and module.bias is not None: + zeros_(module.bias) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, VisualGLMEncoder): + module.gradient_checkpointing = value + + +class VisualGLMVisionEmbeddings(nn.Layer): + def __init__(self, config: VisualGLMVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + self.in_channels = config.num_channels + + self.patch_embedding = nn.Conv2D( + in_channels=self.in_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.class_embedding = Parameter(paddle.randn([1, 1, self.embed_dim]), dtype=self.patch_embedding.weight.dtype) + self.position_embedding = Parameter( + paddle.randn([1, self.num_positions, self.embed_dim]), dtype=self.patch_embedding.weight.dtype + ) + + def forward(self, pixel_values: paddle.Tensor) -> paddle.Tensor: + batch_size = pixel_values.shape[0] + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose([0, 2, 1]) + + class_embeds = self.class_embedding.expand([batch_size, 1, -1]).cast(target_dtype) + embeddings = paddle.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : embeddings.shape[1], :].cast(target_dtype) + return embeddings + + +class VisualGLMAttention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias_attr=False) + + if config.qkv_bias: + q_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + v_bias = Parameter(paddle.zeros([self.embed_dim], dtype=self.qkv.weight.dtype)) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = paddle.concat((q_bias, paddle.zeros_like(v_bias), v_bias)) + self.qkv.bias = Parameter(qkv_bias, dtype=self.qkv.weight.dtype) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + head_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, embed_dim = hidden_states.shape + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape([bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads]).transpose( + [2, 0, 3, 1, 4] + ) + query_states, key_states, value_states = ( + mixed_qkv[0], + mixed_qkv[1], + mixed_qkv[2], + ) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_states, key_states, transpose_y=True) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = F.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
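+        # Shape bookkeeping at this point: query/key/value_states are
+        # [batch, num_heads, seq_len, head_dim], attention_scores/attention_probs are
+        # [batch, num_heads, seq_len, seq_len], and the scores were scaled by
+        # head_dim ** -0.5 before the softmax above.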
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = paddle.matmul(attention_probs, value_states).transpose([0, 2, 1, 3]) + + new_context_layer_shape = context_layer.shape[:-2] + [ + self.embed_dim, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +class VisualGLMMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class VisualGLMEncoderLayer(nn.Layer): + def __init__(self, config: VisualGLMConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = VisualGLMAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + self.mlp = VisualGLMMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + """ + Args: + hidden_states (`paddle.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`paddle.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class VisualGLMEncoder(nn.Layer): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`VisualGLMEncoderLayer`]. + Args: + config (`VisualGLMConfig`): + The corresponding vision configuration for the `VisualGLMEncoder`. 
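`VisualGLMEncoderLayer` above uses the pre-LayerNorm residual wiring (normalize, attend, add; normalize, MLP, add). As a self-contained illustration of that wiring only, here is a toy Paddle block; `nn.MultiHeadAttention` and the sizes are stand-ins, not the classes defined in this file.

```python
import paddle
import paddle.nn as nn

class ToyPreLNBlock(nn.Layer):
    """Pre-LayerNorm transformer block with the same residual wiring as VisualGLMEncoderLayer."""

    def __init__(self, dim=64, num_heads=4, mlp_ratio=4):
        super().__init__()
        self.ln1 = nn.LayerNorm(dim)
        self.attn = nn.MultiHeadAttention(dim, num_heads)
        self.ln2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, dim * mlp_ratio), nn.GELU(), nn.Linear(dim * mlp_ratio, dim)
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))  # self-attention sub-layer + residual
        x = x + self.mlp(self.ln2(x))   # feed-forward sub-layer + residual
        return x

out = ToyPreLNBlock()(paddle.randn([2, 16, 64]))  # [batch, seq_len, dim]
```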
+ """ + + def __init__(self, config: VisualGLMConfig): + super().__init__() + self.config = config + self.layers = nn.LayerList([VisualGLMEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class VisualGLMVisionModel(VisualGLMPretrainedModel): + main_input_name = "pixel_values" + config_class = VisualGLMVisionConfig + + def __init__(self, config: VisualGLMVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = VisualGLMVisionEmbeddings(config) + self.encoder = VisualGLMEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, 
epsilon=config.layer_norm_eps) + + def forward( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class VisualGLMQFormerMultiHeadAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.shape[:-1] + [self.num_attention_heads, self.attention_head_size] + x = x.reshape(new_x_shape) + return x.transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + 
encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = paddle.concat([past_key_value[0], key_layer], axis=2) + value_layer = paddle.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = paddle.matmul(query_layer, key_layer, transpose_y=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.shape[1] + position_ids_l = paddle.arange(seq_length, dtype="int64").reshape([-1, 1]) + position_ids_r = paddle.arange(seq_length, dtype="int64").reshape([1, -1]) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.cast(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = paddle.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = paddle.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(axis=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
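+        # In cross-attention layers key_layer/value_layer above come from
+        # encoder_hidden_states (the ViT image features), so attention_probs is
+        # [batch, num_heads, query_len, num_image_tokens]; the additive attention_mask
+        # holds large negative values at masked positions, which the softmax maps to ~0.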
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = paddle.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.transpose([0, 2, 1, 3]) + new_context_layer_shape = context_layer.shape[:-2] + [ + self.all_head_size, + ] + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class VisualGLMQFormerSelfOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class VisualGLMQFormerAttention(nn.Layer): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = VisualGLMQFormerMultiHeadAttention(config, is_cross_attention) + self.output = VisualGLMQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, axis=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + head_mask: Optional[paddle.Tensor] = None, + encoder_hidden_states: Optional[paddle.Tensor] = None, + encoder_attention_mask: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[paddle.Tensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[paddle.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class VisualGLMQFormerIntermediate(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return 
hidden_states + + +class VisualGLMQFormerOutput(nn.Layer): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + # self.LayerNorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: paddle.Tensor, input_tensor: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + input_tensor + # hidden_states = self.LayerNorm() + return hidden_states + + +class VisualGLMQFormerLayer(nn.Layer): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.input_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.attention = VisualGLMQFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = VisualGLMQFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate_query = VisualGLMQFormerIntermediate(config) + self.output_query = VisualGLMQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + hidden_states = self.input_layernorm(hidden_states) + self_attention_outputs = self.attention( + hidden_states, # 1, 32, 768 + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = paddle.concat([layer_output, layer_output_text], axis=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = 
self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class VisualGLMQFormerEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.LayerList( + [VisualGLMQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class VisualGLMQFormerModel(VisualGLMPretrainedModel): + """ + Querying Transformer (Q-Former), used in VisualGLM. 
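As orientation for the implementation that follows: the Q-Former holds a small set of learned query vectors that cross-attend to the image features and emit a fixed-length visual prefix for the language model. A conceptual sketch under illustrative dimensions, with a single cross-attention standing in for the stacked `VisualGLMQFormerLayer` blocks:

```python
import paddle
import paddle.nn as nn

batch, num_image_tokens, vis_dim = 2, 257, 64   # illustrative sizes
num_query, q_dim = 32, 64

image_feats = paddle.randn([batch, num_image_tokens, vis_dim])             # ViT output
query_tokens = paddle.create_parameter([1, num_query, q_dim], "float32")   # learned queries
cross_attn = nn.MultiHeadAttention(q_dim, num_heads=4, kdim=vis_dim, vdim=vis_dim)

queries = query_tokens.expand([batch, -1, -1])
visual_prefix = cross_attn(queries, image_feats, image_feats)              # [batch, 32, q_dim]
```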
+ """ + + def __init__(self, config: VisualGLMQFormerConfig): + super().__init__(config) + self.config = config + + self.final_layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = VisualGLMQFormerEncoder(config) + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: paddle.Tensor, + input_shape: Tuple[int], + has_query: bool = False, + ) -> paddle.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + Arguments: + attention_mask (`paddle.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + Returns: + `paddle.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.cast(dtype=self.config.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def invert_attention_mask(self, encoder_attention_mask: paddle.Tensor) -> paddle.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + Args: + encoder_attention_mask (`paddle.Tensor`): An attention mask. + Returns: + `paddle.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.ndim == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.ndim == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.cast( + dtype=self.config.dtype + ) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + + return encoder_extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[paddle.Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> paddle.Tensor: + """ + Prepare the head mask if needed. + Args: + head_mask (`paddle.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked: (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + Returns: + `paddle.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.ndim == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand([num_hidden_layers, -1, -1, -1, -1]) + elif head_mask.ndim == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.ndim == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.cast(dtype=self.config.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def forward( + self, + query_embeds, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(paddle.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. 
If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.dropout(query_embeds) + + input_shape = embedding_output.shape[:-1] + batch_size, seq_length = input_shape + + if attention_mask is None: + attention_mask = paddle.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].shape + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.shape + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = paddle.ones(encoder_hidden_shape) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.final_layernorm(sequence_output) + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + 
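+        # sequence_output: [batch, query_length, hidden_size] after final_layernorm;
+        # pooled_output takes the first query position as a CLS-style summary.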
return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class VisualGLMModel(VisualGLMPretrainedModel): + config_class = VisualGLMConfig + main_input_name = "pixel_values" + + def __init__(self, config: VisualGLMConfig): + super().__init__(config) + + self.vision_model = VisualGLMVisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), dtype=self.config.dtype + ) + self.qformer = VisualGLMQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForCausalLM(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def get_text_features( + self, + input_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(paddle.Tensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from paddlenlp.transformers import ChatGLMTokenizer, VisualGLMModel + >>> tokenizer = ChatGLMTokenizer.from_pretrained("model_name") + >>> tokenizer.pad_token = tokenizer.eos_token + >>> model = VisualGLMModel.from_pretrained("model_name") + >>> model.eval() + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pd", return_token_type_ids=False) + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return text_outputs + + def get_image_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. 
+ Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, VisualGLMModel + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = VisualGLMModel.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return vision_outputs + + def get_qformer_features( + self, + pixel_values: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `paddle.Tensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import paddle + >>> from PIL import Image + >>> import requests + >>> from paddlenlp.transformers import MinitGPT4Processor, VisualGLMModel + >>> processor = MinitGPT4Processor.from_pretrained("model_name") + >>> model = VisualGLMModel.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor.process_images(images=image, return_tensors="pd") + >>> qformer_outputs = model.get_qformer_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + 
encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + return query_outputs + + def forward( + self, + pixel_values: paddle.Tensor, # processed image + first_input_ids: paddle.Tensor, + second_input_ids: paddle.Tensor, + first_attention_mask: Optional[paddle.Tensor] = None, + second_attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[paddle.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, VisualGLMForConditionalGenerationModelOutput]: + r""" + Returns: + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMModel + >>> processor = VisualGLMProcessor.from_pretrained("model_name") + >>> model = VisualGLMModel.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: use the language model, conditioned on the text and image + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = paddle.ones(language_model_inputs.shape[:-1], dtype="int64") + + first_embeds = self.language_model.chatglm.transformer.word_embeddings(first_input_ids) + second_embeds = self.language_model.chatglm.transformer.word_embeddings(second_input_ids) + language_model_inputs = paddle.cast(language_model_inputs, dtype=first_embeds.dtype) + inputs_embeds = paddle.concat([first_embeds, language_model_inputs, second_embeds], axis=1) + + if first_attention_mask is None: + first_attention_mask = paddle.ones(first_embeds.shape[:-1], dtype="int64") + if second_attention_mask is None: + second_attention_mask = paddle.ones(second_embeds.shape[:-1], dtype="int64") + attention_mask = paddle.concat( + [first_attention_mask, language_model_attention_mask, second_attention_mask], axis=1 + ) + + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else
outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + logits = logits[:, -labels.shape[1] :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :] + shift_labels = labels[..., 1:] + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.reshape([-1, self.config.text_config.vocab_size]), shift_labels.reshape([-1])) + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return VisualGLMForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +class ChatGLMForConditionalGenerationWithImage(ChatGLMForCausalLM): + def __init__(self, config: ChatGLMConfig): + super(ChatGLMForConditionalGenerationWithImage, self).__init__(config) + self.config = config + + def forward( + self, + image_features: paddle.Tensor, + input_ids: paddle.Tensor, + position_ids: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + pre_image_length: Optional[int] = None, + cache: Optional[Tuple[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None and cache is None and image_features is not None: + pre_ids, pad_ids, post_ids = paddle.split(input_ids, num_or_sections=[pre_image_length, 32, -1], axis=1) + pre_txt_emb = self.chatglm.transformer.word_embeddings(pre_ids) + post_txt_emb = self.chatglm.transformer.word_embeddings(post_ids) + inputs_embeds = paddle.concat([pre_txt_emb, image_features, post_txt_emb], axis=1) + + outputs = super().forward( + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + cache=cache, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + return_dict=return_dict, + ) + + return outputs + + +class VisualGLMForConditionalGeneration(VisualGLMPretrainedModel): + config_class = VisualGLMConfig + main_input_name = "pixel_values" + + def __init__(self, config: VisualGLMConfig): + super().__init__(config) + self.config = config + self.vision_model = VisualGLMVisionModel(config.vision_config) + self.query_tokens = Parameter( + paddle.zeros([1, config.num_query_tokens, config.qformer_config.hidden_size]), dtype=self.config.dtype + ) + self.qformer = VisualGLMQFormerModel(config.qformer_config) + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + self.language_model = ChatGLMForConditionalGenerationWithImage(config.text_config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def encode_images( + self, + pixel_values: paddle.Tensor, # processed image + ): + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + pixel_values = paddle.cast(pixel_values, self.vision_model.embeddings.patch_embedding.weight.dtype) + vision_outputs = self.vision_model(pixel_values, return_dict=True) + image_embeds = vision_outputs.last_hidden_state + image_attention_mask = paddle.ones(image_embeds.shape[:-1], dtype="int64") + + # 
step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + query_tokens = self.query_tokens.expand([image_embeds.shape[0], -1, -1]) + query_tokens = paddle.cast(query_tokens, self.qformer.final_layernorm.weight.dtype) + image_embeds = paddle.cast(image_embeds, self.qformer.final_layernorm.weight.dtype) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # step 3: mapping query_output into language_model space + language_model_inputs = self.language_projection(query_output) + + return language_model_inputs + + @paddle.no_grad() + def generate( + self, + pixel_values: paddle.Tensor, + input_ids: paddle.Tensor, + pre_image_length: int, + attention_mask: Optional[paddle.Tensor] = None, + **generate_kwargs, + ) -> paddle.Tensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + Args: + pixel_values (`paddle.Tensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`paddle.Tensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + Returns: + captions (list): A list of strings of length batch_size * num_captions. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor, VisualGLMForConditionalGeneration + >>> processor = VisualGLMProcessor.from_pretrained("model_name") + >>> model = VisualGLMForConditionalGeneration.from_pretrained("model_name") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "###Human: ###Assistant:" + >>> inputs = processor(images=image, texts=text, prompts=prompt, return_tensors="pd") + >>> generated_ids, scores= model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + """ + + image_features = self.encode_images(pixel_values) + + outputs = self.language_model.generate( + input_ids=input_ids, + image_features=image_features, + pre_image_length=pre_image_length, + attention_mask=attention_mask, + **generate_kwargs, + ) + + return outputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/processing.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/processing.py new file mode 100644 index 000000000..dd1a17397 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/visualglm/processing.py @@ -0,0 +1,223 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Processor class for VisualGLM. +""" + +import re +from typing import List, Optional, Union + +import numpy as np +import paddle +from PIL import Image + +from ..image_processing_utils import BatchFeature +from ..image_utils import ImageInput +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import BatchEncoding, TensorType, TextInput + +__all__ = [ + "VisualGLMProcessor", +] + + +class VisualGLMProcessor(ProcessorMixin): + r""" + Constructs a VisualGLM processor which wraps a VisualGLM image processor and a ChatGLM tokenizer into a single processor. + [`VisualGLMProcessor`] offers all the functionalities of [`VisualGLMImageProcessor`] and [`ChatGLMTokenizer`]. See the docstring + of [`~VisualGLMImageProcessor.__call__`] and [`~ChatGLMTokenizer.decode`] for more information. + + Args: + image_processor (`VisualGLMImageProcessor`): + An instance of [`VisualGLMImageProcessor`]. The image processor is a required input. + tokenizer (`ChatGLMTokenizer`): + An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input. + + Examples: + ```python + >>> import requests + >>> from PIL import Image + + >>> import paddle + >>> from paddlenlp.transformers import VisualGLMProcessor + + >>> # load processor + >>> visualglm_path = "model_name" + >>> processor = VisualGLMProcessor.from_pretrained(visualglm_path) + >>> print("load processor done!") + + >>> # prepare model inputs for VisualGLM + >>> url = "https://paddlenlp.bj.bcebos.com/data/images/mugs.png" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "describe this image" + >>> prompt = "Give the following image: ImageContent. You will be able to see the image once I provide it to you. Please answer my questions.###Human: ###Assistant:" + >>> res = processor([image], text, prompt) + ```""" + attributes = ["image_processor", "tokenizer"] + image_processor_class = "VisualGLMImageProcessor" + tokenizer_class = "ChatGLMTokenizer" + + def __init__(self, image_processor, tokenizer): + tokenizer.return_token_type_ids = False + tokenizer.model_input_names = ["input_ids", "attention_mask"] + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + self.default_prompt = "" + self.image_tag = "" + self.num_query_tokens = 32 + + def process_images( + self, + images: ImageInput, + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchFeature: + """ + This method uses the [`VisualGLMImageProcessor.__call__`] method to prepare image(s) for the model. + Please refer to the docstring of the method for more information.
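+        A single image given as a `PIL.Image.Image`, `numpy.ndarray` or `paddle.Tensor` is automatically wrapped into a list before being passed to the image processor.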
+ """ + if not images: + raise ValueError("You have to input correct images.") + + if isinstance(images, (Image.Image, np.ndarray, paddle.Tensor)): + images = [images] + + processed_images = self.image_processor(images, return_tensors=return_tensors) + + return processed_images + + def process_texts( + self, + texts: Union[TextInput, List[TextInput]], + return_tensors: Optional[Union[str, TensorType]] = TensorType.PADDLE, + **kwargs, + ) -> BatchEncoding: + if not texts: + raise ValueError("You have to input correct texts.") + + if isinstance(texts, TextInput): + texts = [texts] + + processed_texts = self.tokenizer(text=texts, return_tensors=return_tensors, **kwargs) + return BatchEncoding(processed_texts) + + def build_inputs_with_image( + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[List] = None, + ): + # construct prompt with inputs + if history is None: + history = [] + if image is not None: + prompt = self.default_prompt + else: + prompt = "" + for old_query, response in history: + prompt += "问:{}\n答:{}\n".format(old_query, response) + prompt += "问:{}\n答:".format(query) + + if image is not None: + image_start_position = prompt.rfind(self.image_tag) + image_end_position = image_start_position + len(self.image_tag) + first_text_input = self.tokenizer.encode(prompt[:image_start_position], add_special_tokens=False) + image_input = [self.tokenizer.unk_token_id] * self.num_query_tokens + second_text_input = self.tokenizer.encode(prompt[image_end_position:], add_special_tokens=False) + all_input_ids = first_text_input["input_ids"] + image_input + second_text_input["input_ids"] + all_input_ids = self.tokenizer.build_inputs_with_special_tokens(all_input_ids) + + # processing image + processed_image = self.process_images(image) + + inputs = { + "input_ids": paddle.to_tensor(all_input_ids, dtype="int64").unsqueeze(0), + "pre_image_length": len(first_text_input["input_ids"]), + "pixel_values": processed_image["pixel_values"], + } + else: + inputs = self.tokenizer([prompt], return_tensors="pd") + inputs["pre_image_length"] = 0 + + return inputs + + def __call__( + self, + image: Union[Image.Image, np.ndarray, paddle.Tensor], + query: str, + history: Optional[List] = [], + **kwargs, + ): + if image is None: + raise ValueError("Image should not be None.") + if query is None: + raise ValueError("Query should not be None.") + if not isinstance(query, str): + raise TypeError("A string type of query is expected, but received {}.".format(type(query))) + if not isinstance(history, list): + raise TypeError( + "A list type of history is expected with each item [query, response] in it, but received {}.".format( + type(history) + ) + ) + + inputs = self.build_inputs_with_image(image, query, history=history) + + return inputs + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information.
+ """ + return self.tokenizer.decode(*args, **kwargs) + + def process_response(self, response): + response = response.strip() + response = response.replace("[[训练时间]]", "2023年") + punkts = [ + [",", ","], + ["!", "!"], + [":", ":"], + [";", ";"], + ["\?", "?"], + ] + for item in punkts: + response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) + response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) + return response + + def get_responses(self, *args, **kwargs): + processed_responses = [] + responses = self.batch_decode(*args, **kwargs) + + for response in responses: + response = self.process_response(response) + processed_responses.append(response) + + return processed_responses + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/__init__.py new file mode 100644 index 000000000..b0d15c218 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/configuration.py new file mode 100644 index 000000000..fb8a8f236 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/configuration.py @@ -0,0 +1,609 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" XLM configuration""" +from __future__ import annotations + +# from .onnx import OnnxConfig +import logging +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +__all__ = ["XLM_PRETRAINED_INIT_CONFIGURATION", "XLM_PRETRAINED_RESOURCE_FILES_MAP", "XLMConfig"] + + +XLM_PRETRAINED_INIT_CONFIGURATION = { + "xlm-mlm-en-2048": { + "is_encoder": True, + "causal": False, + "n_langs": 1, + "use_lang_embeddings": True, + "vocab_size": 30145, + "pad_token_id": 2, + "hidden_size": 2048, + "num_attention_heads": 16, + "num_hidden_layers": 12, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.015625, + "init_std": 0.02, + "lang_id": 0, + "lang2id": None, + }, + "xlm-mlm-ende-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 2, + "use_lang_embeddings": True, + "vocab_size": 64699, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 6, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 1, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-enfr-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 2, + "use_lang_embeddings": True, + "vocab_size": 64139, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 6, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 0, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-mlm-enro-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 2, + "use_lang_embeddings": True, + "vocab_size": 64592, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 6, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 0, + "lang2id": {"en": 0, "ro": 1}, + }, + "xlm-mlm-tlm-xnli15-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 15, + "use_lang_embeddings": True, + "vocab_size": 95000, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 12, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 4, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-mlm-xnli15-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 15, + "use_lang_embeddings": True, + "vocab_size": 95000, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 12, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + 
"max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 4, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-clm-enfr-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 2, + "use_lang_embeddings": True, + "vocab_size": 64139, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 6, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 0, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-clm-ende-1024": { + "is_encoder": True, + "causal": False, + "n_langs": 2, + "use_lang_embeddings": True, + "vocab_size": 64699, + "pad_token_id": 2, + "hidden_size": 1024, + "num_attention_heads": 8, + "num_hidden_layers": 6, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.02209708691207961, + "init_std": 0.02, + "lang_id": 1, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-17-1280": { + "is_encoder": True, + "causal": False, + "n_langs": 17, + "use_lang_embeddings": False, + "vocab_size": 200000, + "pad_token_id": 2, + "hidden_size": 1280, + "num_attention_heads": 16, + "num_hidden_layers": 16, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.01976423537605237, + "init_std": 0.02, + "lang_id": 2, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16, + }, + }, + "xlm-mlm-100-1280": { + "is_encoder": True, + "causal": False, + "n_langs": 100, + "use_lang_embeddings": False, + "vocab_size": 200000, + "pad_token_id": 2, + "hidden_size": 1280, + "num_attention_heads": 16, + "num_hidden_layers": 16, + "hidden_dropout_prob": 0.1, + "attention_probs_dropout_prob": 0.1, + "max_position_embeddings": 512, + "use_sinusoidal_embeddings": False, + "layer_norm_eps": 1e-12, + "hidden_act": "gelu", + "embed_init_std": 0.01976423537605237, + "init_std": 0.02, + "lang_id": 23, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + 
"pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99, + }, + }, +} + +XLM_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "xlm-mlm-en-2048": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-en-2048/model_state.pdparams", + "xlm-mlm-ende-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-ende-1024/model_state.pdparams", + "xlm-mlm-enfr-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-enfr-1024/model_state.pdparams", + "xlm-mlm-enro-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-enro-1024/model_state.pdparams", + "xlm-mlm-tlm-xnli15-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-tlm-xnli15-1024/model_state.pdparams", + "xlm-mlm-xnli15-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-xnli15-1024/model_state.pdparams", + "xlm-clm-enfr-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-clm-enfr-1024/model_state.pdparams", + "xlm-clm-ende-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-clm-ende-1024/model_state.pdparams", + "xlm-mlm-17-1280": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-17-1280/model_state.pdparams", + "xlm-mlm-100-1280": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-100-1280/model_state.pdparams", + } +} + + +class XLMConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`XLMModel`]. It is used to + instantiate a XLM model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [xlm-mlm-en-2048] architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30145): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`XLMModel`] . + emb_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the encoder layers and the pooler layer. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention mechanism + gelu_activation (`bool`, *optional*, defaults to `True`): + Whether or not to use *gelu* for the activations instead of *relu*. + sinusoidal_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings. + causal (`bool`, *optional*, defaults to `False`): + Whether or not the model should behave in a causal manner. 
Causal models use a triangular attention mask in + order to only attend to the left-side context instead of a bidirectional context. + asm (`bool`, *optional*, defaults to `False`): + Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction + layer. + n_langs (`int`, *optional*, defaults to 1): + The number of languages the model handles. Set to 1 for monolingual models. + use_lang_emb (`bool`, *optional*, defaults to `True`): + Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual + models page] + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + embed_init_std (`float`, *optional*, defaults to 2048^-0.5): + The standard deviation of the truncated_normal_initializer for initializing the embedding matrices. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the + embedding matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + bos_index (`int`, *optional*, defaults to 0): + The index of the beginning of sentence token in the vocabulary. + eos_index (`int`, *optional*, defaults to 1): + The index of the end of sentence token in the vocabulary. + pad_index (`int`, *optional*, defaults to 2): + The index of the padding token in the vocabulary. + unk_index (`int`, *optional*, defaults to 3): + The index of the unknown token in the vocabulary. + mask_index (`int`, *optional*, defaults to 5): + The index of the masking token in the vocabulary. + is_encoder (`bool`, *optional*, defaults to `True`): + Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al. + summary_type (`string`, *optional*, defaults to "first"): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`bool`, *optional*, defaults to `True`): + Used in the sequence classification and multiple choice models. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_first_dropout (`float`, *optional*, defaults to 0.1): + Used in the sequence classification and multiple choice models. + + The dropout ratio to be used after the projection and activation. + start_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script.
+ end_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + mask_token_id (`int`, *optional*, defaults to 0): + Model agnostic parameter to identify masked tokens when generating text in an MLM context. + lang_id (`int`, *optional*, defaults to 1): + The ID of the language used by the model. This parameter is used when generating text in a given language. + + Examples: + + ```python + >>> from transformers import XLMConfig, XLMModel + + >>> # Initializing a XLM configuration + >>> configuration = XLMConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = XLMModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "xlm" + pretrained_init_configuration = XLM_PRETRAINED_INIT_CONFIGURATION + attribute_map: Dict[str, str] = { + "dropout_prob": "dropout", + "attention_probs_dropout_prob": "attention_dropout", + "hidden_size": "emb_dim", + "num_attention_heads": "n_heads", + "num_hidden_layers": "n_layers", + "n_words": "vocab_size", # For backward compatibility + "use_lang_embeddings": "use_lang_emb", + "use_sinusoidal_embeddings": "sinusoidal_embeddings", + "hidden_dropout_prob": "dropout", + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size=30145, + emb_dim=2048, + n_layers=12, + n_heads=16, + dropout=0.1, + attention_dropout=0.1, + gelu_activation=True, + hidden_act="gelu", + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=1, + use_lang_emb=True, + max_position_embeddings=512, + embed_init_std=2048**-0.5, + layer_norm_eps=1e-12, + init_std=0.02, + bos_index=0, + eos_index=1, + pad_index=2, + unk_index=3, + mask_index=5, + is_encoder=True, + summary_type="first", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + start_n_top=5, + end_n_top=5, + mask_token_id=0, + lang_id=0, + pad_token_id=2, + bos_token_id=0, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, **kwargs) + + """Constructs XLMConfig.""" + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.hidden_act = hidden_act + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + self.mask_token_id = mask_token_id + self.lang_id = lang_id + + if "n_words" in kwargs: + self.n_words = kwargs["n_words"] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/modeling.py new file mode 100644 index 000000000..4f8fb0b85 --- /dev/null +++ 
b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/modeling.py @@ -0,0 +1,890 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN +from .configuration import ( + XLM_PRETRAINED_INIT_CONFIGURATION, + XLM_PRETRAINED_RESOURCE_FILES_MAP, + XLMConfig, +) + +__all__ = [ + "XLMModel", + "XLMPretrainedModel", + "XLMWithLMHeadModel", + "XLMForSequenceClassification", + "XLMForTokenClassification", + "XLMForQuestionAnsweringSimple", + "XLMForMultipleChoice", +] + +INF = 1e4 + + +class SinusoidalPositionalEmbedding(nn.Embedding): + def __init__(self, num_embeddings, embedding_dim): + super().__init__(num_embeddings, embedding_dim) + self.weight = self._init_weight(self.weight) + + @staticmethod + def _init_weight(out): + n_pos, dim = out.shape + out.stop_gradient = True + position_ids = paddle.arange(0, n_pos, dtype=out.dtype).unsqueeze(1) + indices = paddle.arange(0, dim // 2, dtype=out.dtype).unsqueeze(0) + indices = 10000.0 ** (-2 * indices / dim) + embeddings = paddle.matmul(position_ids, indices) + out[:, 0::2] = paddle.sin(embeddings) + out[:, 1::2] = paddle.cos(embeddings) + return out + + @paddle.no_grad() + def forward(self, position_ids): + return super().forward(position_ids) + + +def get_masks(seqlen, lengths, causal, padding_mask=None): + """ + Generate hidden states mask, and optionally an attention mask. 
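+    When `causal` is True, the attention mask is lower-triangular so that each position can only attend to itself and earlier positions; otherwise it is identical to the padding mask derived from `lengths` (or to `padding_mask` when it is given).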
+ """ + alen = paddle.arange(0, seqlen, dtype="int64") + if padding_mask is not None: + mask = padding_mask + else: + mask = alen < lengths[:, None] + + # attention mask is the same as mask, or triangular inferior attention (causal) + bs = lengths.shape[0] + if causal: + attn_mask = paddle.tile(alen[None, None, :], (bs, seqlen, 1)) <= alen[None, :, None] + else: + attn_mask = mask + + return mask, attn_mask + + +class MultiHeadAttention(nn.Layer): + + NEW_ID = itertools.count() + + def __init__(self, n_heads, dim, config: XLMConfig): + super().__init__() + self.layer_id = next(MultiHeadAttention.NEW_ID) + self.dim = dim + self.n_heads = n_heads + assert self.dim % self.n_heads == 0 + self.q_lin = nn.Linear(dim, dim) + self.k_lin = nn.Linear(dim, dim) + self.v_lin = nn.Linear(dim, dim) + self.out_lin = nn.Linear(dim, dim) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.dim_per_head = self.dim // self.n_heads + + def shape(self, x): + """projection""" + return x.reshape([0, 0, self.n_heads, self.dim_per_head]).transpose([0, 2, 1, 3]) + + def unshape(self, x): + """compute context""" + return x.transpose([0, 2, 1, 3]).reshape([0, 0, self.n_heads * self.dim_per_head]) + + def forward(self, input, mask, kv=None, cache=None, output_attentions=False): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). + """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = input.shape + if kv is None: + klen = qlen if cache is None else cache["seqlen"] + qlen + else: + klen = kv.shape[1] + + mask_reshape = (bs, 1, qlen, klen) if mask.ndim == 3 else (bs, 1, 1, klen) + + q = self.shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = self.shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = self.shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = self.shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = self.shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = paddle.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = paddle.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + q = q / math.sqrt(self.dim_per_head) # (bs, n_heads, qlen, dim_per_head) + + scores = paddle.matmul(q, k, transpose_y=True) # (bs, n_heads, qlen, klen) + + mask = mask.reshape(mask_reshape) # (bs, n_heads, qlen, klen) + + scores = scores + (mask.astype(scores.dtype) - 1) * INF + + weights = F.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights) # (bs, n_heads, qlen, klen) + + context = paddle.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = self.unshape(context) # (bs, qlen, dim) + + outputs = (self.out_lin(context),) + if output_attentions: + outputs = outputs + (weights,) + return outputs + + +class TransformerFFN(nn.Layer): + def __init__(self, in_dim, dim_hidden, out_dim, config: XLMConfig): + super().__init__() + self.lin1 = nn.Linear(in_dim, dim_hidden) + self.lin2 = nn.Linear(dim_hidden, out_dim) + self.dropout = nn.Dropout(config.dropout_prob) + self.act = ACT2FN[config.hidden_act] + + def forward(self, x): + x = self.lin1(x) + x = self.act(x) + x = self.lin2(x) + x = self.dropout(x) + return x + + +class 
XLMPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained XLM models. It provides XLM related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading and + loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + pretrained_init_configuration = XLM_PRETRAINED_INIT_CONFIGURATION + resource_files_names = {"model_state": "model_state.pdparams"} + pretrained_resource_files_map = XLM_PRETRAINED_RESOURCE_FILES_MAP + model_config_file = CONFIG_NAME + config_class = XLMConfig + base_model_prefix = "xlm" + + def _init_weights(self, layer): + """Initialization hook""" + if isinstance(layer, nn.Embedding): + new_weight = paddle.normal( + mean=0.0, + std=self.embed_init_std if hasattr(self, "embed_init_std") else self.xlm.config["embed_init_std"], + shape=layer.weight.shape, + ) + if layer._padding_idx is not None: + new_weight[layer._padding_idx] = paddle.zeros_like(new_weight[layer._padding_idx]) + layer.weight.set_value(new_weight) + elif isinstance(layer, nn.Linear): + layer.weight.set_value( + paddle.normal( + mean=0.0, + std=self.init_std if hasattr(self, "init_std") else self.xlm.config["init_std"], + shape=layer.weight.shape, + ) + ) + if layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.full_like(layer.weight, 1.0)) + + +@register_base_model +class XLMModel(XLMPretrainedModel): + """ + The bare XLM Model transformer outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`XLMConfig`): + An instance of :class:`XLMConfig`. 
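+    Note: each of the `num_hidden_layers` encoder blocks applies multi-head self-attention followed by a two-layer feed-forward network, with a residual connection and a LayerNorm after each sub-layer.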
+ """ + + def __init__(self, config: XLMConfig): + super().__init__(config) + self.causal = config.causal + self.num_hidden_layers = config.num_hidden_layers + self.pad_token_id = config.pad_token_id + self.hidden_size = config.hidden_size + self.embed_init_std = config.embed_init_std + self.init_std = config.init_std + self.use_lang_embeddings = config.use_lang_embeddings + self.n_langs = config.n_langs + if not config.is_encoder: + raise NotImplementedError("Currently XLM can only be used as an encoder") + assert ( + config.hidden_size % config.num_attention_heads == 0 + ), "xlm model's hidden_size must be a multiple of num_attention_heads" + + # embeddings + if config.use_sinusoidal_embeddings: + self.position_embeddings = SinusoidalPositionalEmbedding( + config.max_position_embeddings, config.hidden_size + ) + else: + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + if config.n_langs > 1 and config.use_lang_embeddings: + self.lang_embeddings = nn.Embedding(config.n_langs, config.hidden_size) + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layer_norm_emb = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + self.attentions = nn.LayerList() + self.layer_norm1 = nn.LayerList() + self.ffns = nn.LayerList() + self.layer_norm2 = nn.LayerList() + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + for _ in range(self.num_hidden_layers): + self.attentions.append(MultiHeadAttention(config.num_attention_heads, config.hidden_size, config)) + self.layer_norm1.append(nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)) + + self.ffns.append( + TransformerFFN( + config.hidden_size, + config.hidden_size * 4, + config.hidden_size, + config, + ) + ) + self.layer_norm2.append(nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)) + + self.register_buffer( + "position_ids", + paddle.arange(0, config.max_position_embeddings).reshape((1, -1)), + persistable=False, + ) + + def forward( + self, + input_ids=None, + langs=None, + attention_mask=None, + position_ids=None, + lengths=None, + cache=None, + output_attentions=False, + output_hidden_states=False, + ): + r""" + The XLMModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. + langs (Tensor, optional): + A parallel sequence of tokens to be used to indicate the language of each token in the input. Indices are + languages ids which can be obtained from the language names by using two conversion mappings provided in + the configuration of the model (only provided for multilingual models). More precisely, the *language name + to language id* mapping is in `model.config['lang2id']` (which is a dictionary string to int). + Shape as [batch_size, sequence_length] and dtype as int64. Defaults to `None`. + attention_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some + unwanted positions, usually the paddings or the subsequent positions. + Its data type can be int, float and bool. + When the data type is bool, the `masked` tokens have `False` values and the others + have `True` values. + When the data type is int, the `masked` tokens have `0` values and the others have `1` values. 
+ When the data type is float, the `masked` tokens have `0.0` values and the others have `1.0` values. + It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. + Defaults to `None`, which means nothing needed to be prevented attention to. + position_ids (Tensor, optional): + Indices of positions of each input sequence tokens in the position embeddings. Selected + in the range `[0, max_position_embeddings - 1]`. + Shape as [batch_size, sequence_length] and dtype as int64. Defaults to `None`. + lengths (Tensor, optional): + Length of each sentence that can be used to avoid performing attention on padding token indices. You can + also use *attention_mask* for the same result (see above), kept here for compatibility. Indices selected in + `[0, ..., sequence_length]`. + Shape as [batch_size] and dtype as int64. Defaults to `None`. + cache (Tuple[Tuple[Tensor]], optional): + Contains pre-computed hidden-states (key and values in the attention blocks) + as computed by the model. Can be used to speed up sequential decoding. + The `input_ids` which have their past given to this model should not be + passed as input ids as they have already been computed. + Defaults to `None`. + output_attentions (bool, optional): + Whether or not to return the attentions tensors of all attention layers. + Defaults to `False`. + output_hidden_states (bool, optional): + Whether or not to return the output of all hidden layers. + Defaults to `False`. + + Returns: + tuple: Returns tuple (`last_hidden_state`, `hidden_states`, `attentions`) + + With the fields: + + - `last_hidden_state` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. + + - `hidden_states` (tuple(Tensor), optional): + returned when `output_hidden_states=True` is passed. + Tuple of `Tensor` (one for the output of the embeddings + one for the output of + each layer). Each Tensor has a data type of float32 and its shape is + [batch_size, sequence_length, hidden_size]. + + - `attentions` (tuple(Tensor), optional): + returned when `output_attentions=True` is passed. + Tuple of `Tensor` (one for each layer) of shape. Each Tensor has a data type of + float32 and its shape is [batch_size, num_heads, sequence_length, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import XLMModel, XLMTokenizer + + tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024") + model = XLMModel.from_pretrained("xlm-mlm-tlm-xnli15-1024") + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", lang="en") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + inputs["langs"] = paddle.ones_like(inputs["input_ids"]) * tokenizer.lang2id["en"] + + last_hidden_state = model(**inputs)[0] + + """ + bs, seqlen = input_ids.shape + + if lengths is None: + if input_ids is not None: + lengths = (input_ids != self.pad_token_id).sum(axis=1).astype("int64") + else: + lengths = paddle.to_tensor([seqlen] * bs, dtype="int64") + + # generate masks + mask, attn_mask = get_masks(seqlen, lengths, self.causal, padding_mask=attention_mask) + + # position_ids + if position_ids is None: + position_ids = self.position_ids[:, :seqlen] + + # do not recompute cached elements + if cache is not None and input_ids is not None: + _seqlen = seqlen - cache["seqlen"] + input_ids = input_ids[:, -_seqlen:] + position_ids = position_ids[:, -_seqlen:] + if langs is not None: + langs = langs[:, -_seqlen:] + mask = mask[:, -_seqlen:] + attn_mask = attn_mask[:, -_seqlen:] + + # embeddings + tensor = self.embeddings(input_ids) + self.position_embeddings(position_ids) + if langs is not None and self.use_lang_embeddings and self.n_langs > 1: + tensor = tensor + self.lang_embeddings(langs) + + tensor = self.layer_norm_emb(tensor) + tensor = self.dropout(tensor) + tensor = tensor * mask.unsqueeze(-1).astype(tensor.dtype) + + # transformer layers + hidden_states = () if output_hidden_states else None + attentions = () if output_attentions else None + for i in range(self.num_hidden_layers): + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + # self attention + attn_outputs = self.attentions[i]( + tensor, + attn_mask, + cache=cache, + output_attentions=output_attentions, + ) + attn = attn_outputs[0] + if output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = self.dropout(attn) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + # FFN + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + tensor = tensor * mask.unsqueeze(-1).astype(tensor.dtype) + + # Add last hidden state + if output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # update cache length + if cache is not None: + cache["seqlen"] += tensor.shape[1] + + return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + +class XLMPredLayer(nn.Layer): + """ + Prediction layer with cross_entropy. 
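+    If `embedding_weights` is given, the output projection reuses (ties) the input embedding matrix and only a bias parameter is learned; otherwise a standalone `hidden_size`-to-`vocab_size` linear layer is created.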
+ """ + + def __init__( + self, + config: XLMConfig, + embedding_weights=None, + ): + super().__init__() + self.vocab_size = config.vocab_size + if embedding_weights is None: + self.proj = nn.Linear(config.hidden_size, config.vocab_size) + else: + self.bias = self.create_parameter(shape=[config.vocab_size], is_bias=True) + self.proj = lambda x: paddle.matmul(x, embedding_weights, transpose_y=True) + self.bias + + def forward(self, x, y=None): + """Compute the loss, and optionally the scores.""" + outputs = () + scores = self.proj(x) + outputs = (scores,) + outputs + if y is not None: + loss = F.cross_entropy(scores.reshape([-1, self.vocab_size]), y.flatten(), reduction="mean") + outputs = (loss,) + outputs + return outputs + + +class XLMWithLMHeadModel(XLMPretrainedModel): + """ + The XLM Model transformer with a masked language modeling head on top (linear + layer with weights tied to the input embeddings). + + Args: + config (:class:`XLMConfig`): + An instance of :class:`XLMConfig`. + + """ + + def __init__(self, config: XLMConfig): + super().__init__(config) + self.xlm = XLMModel(config) + self.pred_layer = XLMPredLayer( + config, + embedding_weights=self.xlm.embeddings.weight, + ) + + def forward( + self, input_ids=None, langs=None, attention_mask=None, position_ids=None, lengths=None, cache=None, labels=None + ): + r""" + The XLMWithLMHeadModel forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`XLMModel`. + langs (Tensor, optional): + See :class:`XLMModel`. + attention_mask (Tensor, optional): + See :class:`XLMModel`. + position_ids (Tensor, optional): + See :class:`XLMModel`. + lengths (Tensor, optional): + See :class:`XLMModel`. + cache (Dict[str, Tensor], optional): + See :class:`XLMModel`. + labels (Tensor, optional): + The Labels for computing the masked language modeling loss. Indices are selected in + `[-100, 0, ..., vocab_size-1]` All labels set to `-100` are ignored (masked), the loss is + only computed for labels in `[0, ..., vocab_size-1]` + Shape as [batch_size, sequence_length] and dtype as int64. Defaults to `None`. + + Returns: + tuple: Returns tuple `(loss, logits)`. + With the fields: + + - `loss` (Tensor): + returned when `labels` is provided. + Language modeling loss (for next-token prediction). + It's data type should be float32 and its shape is [1,]. + + - `logits` (Tensor): + Prediction scores of the language modeling head (scores for each vocabulary + token before SoftMax). + It's data type should be float32 and + its shape is [batch_size, sequence_length, vocab_size]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import XLMWithLMHeadModel, XLMTokenizer + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-tlm-xnli15-1024') + model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-tlm-xnli15-1024') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", lang="en") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + inputs["langs"] = paddle.ones_like(inputs["input_ids"]) * tokenizer.lang2id["en"] + inputs["labels"] = inputs["input_ids"] + + loss, logits = model(**inputs) + + + """ + xlm_outputs = self.xlm( + input_ids, + langs=langs, + attention_mask=attention_mask, + position_ids=position_ids, + lengths=lengths, + cache=cache, + ) + + output = xlm_outputs[0] + outputs = self.pred_layer(output, labels) + return outputs + xlm_outputs[1:] + + +class XLMForSequenceClassification(XLMPretrainedModel): + """ + The XLMModel with a sequence classification head on top (linear layer). + `XLMForSequenceClassification` uses the first token in order to do the classification. + + Args: + config (:class:`XLMConfig`): + An instance of :class:`XLMConfig`. + + """ + + def __init__(self, config: XLMConfig): + super().__init__(config) + self.num_classes = config.num_classes + self.xlm = XLMModel(config) + dropout_prob = config.dropout if config.dropout is not None else config.hidden_dropout_prob + self.dropout = nn.Dropout(dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_classes) + + def forward(self, input_ids=None, langs=None, attention_mask=None, position_ids=None, lengths=None): + r""" + The XLMForSequenceClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`XLMModel`. + langs (Tensor, optional): + See :class:`XLMModel`. + attention_mask (Tensor, optional): + See :class:`XLMModel`. + position_ids (Tensor, optional): + See :class:`XLMModel`. + lengths (Tensor, optional): + See :class:`XLMModel`. + + Returns: + logits (Tensor): + A tensor of the input text classification logits. + Shape as `[batch_size, num_classes]` and dtype as float32. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import XLMForSequenceClassification, XLMTokenizer + + tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024") + model = XLMForSequenceClassification.from_pretrained("xlm-mlm-tlm-xnli15-1024", num_classes=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", lang="en") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + inputs["langs"] = paddle.ones_like(inputs["input_ids"]) * tokenizer.lang2id["en"] + + logits = model(**inputs) + + """ + + sequence_output = self.xlm( + input_ids, langs=langs, attention_mask=attention_mask, position_ids=position_ids, lengths=lengths + )[0] + sequence_output = self.dropout(sequence_output) + pooled_output = sequence_output[:, 0] + logits = self.classifier(pooled_output) + + return logits + + +class XLMForTokenClassification(XLMPretrainedModel): + """ + XLMModel with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`XLMConfig`): + An instance of :class:`XLMConfig`. 
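+    Note: the classification head applies dropout to every token's hidden state and then a single linear layer that maps it to `num_classes` logits.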
+ """ + + def __init__(self, config: XLMConfig): + super(XLMForTokenClassification, self).__init__(config) + self.num_classes = config.num_classes + self.xlm = XLMModel(config) # allow xlm to be config + self.dropout = nn.Dropout(config.dropout if config.dropout is not None else config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_classes) + + def forward(self, input_ids=None, langs=None, attention_mask=None, position_ids=None, lengths=None): + r""" + The XLMForTokenClassification forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`XLMModel`. + langs (Tensor, optional): + See :class:`XLMModel`. + attention_mask (Tensor, optional): + See :class:`XLMModel`. + position_ids (Tensor, optional): + See :class:`XLMModel`. + lengths (Tensor, optional): + See :class:`XLMModel`. + + Returns: + logits (Tensor): + A tensor of the input token classification logits. + Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import XLMForTokenClassification, XLMTokenizer + + tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024") + model = XLMForTokenClassification.from_pretrained("xlm-mlm-tlm-xnli15-1024", num_classes=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", lang="en") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + inputs["langs"] = paddle.ones_like(inputs["input_ids"]) * tokenizer.lang2id["en"] + + logits = model(**inputs) + + """ + + sequence_output = self.xlm( + input_ids, langs=langs, attention_mask=attention_mask, position_ids=position_ids, lengths=lengths + )[0] + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + return logits + + +class XLMForQuestionAnsweringSimple(XLMPretrainedModel): + """ + XLMModel with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + + Args: + config (:class:`XLMConfig`): + An instance of :class:`XLMConfig`. + """ + + def __init__(self, config: XLMConfig): + super(XLMForQuestionAnsweringSimple, self).__init__(config) + self.xlm = XLMModel(config) # allow xlm to be config + self.classifier = nn.Linear(config.hidden_size, 2) + + def forward(self, input_ids=None, langs=None, attention_mask=None, position_ids=None, lengths=None): + r""" + The XLMForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + input_ids (Tensor): + See :class:`XLMModel`. + langs (Tensor, optional): + See :class:`XLMModel`. + attention_mask (Tensor, optional): + See :class:`XLMModel`. + position_ids (Tensor, optional): + See :class:`XLMModel`. + lengths (Tensor, optional): + See :class:`XLMModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import XLMForQuestionAnswering, XLMTokenizer + + tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024") + model = XLMForQuestionAnswering.from_pretrained("xlm-mlm-tlm-xnli15-1024", num_classes=2) + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", lang="en") + inputs = {k:paddle.to_tensor([v], dtype="int64") for (k, v) in inputs.items()} + inputs["langs"] = paddle.ones_like(inputs["input_ids"]) * tokenizer.lang2id["en"] + + outputs = model(**inputs) + + start_logits = outputs[0] + end_logits = outputs[1] + + """ + + sequence_output = self.xlm( + input_ids, langs=langs, attention_mask=attention_mask, position_ids=position_ids, lengths=lengths + )[0] + logits = self.classifier(sequence_output) + start_logits, end_logits = paddle.unstack(x=logits, axis=-1) + + return start_logits, end_logits + + +class XLMForMultipleChoice(XLMPretrainedModel): + """ + XLMModel with a linear layer on top of the hidden-states output layer, + designed for multiple choice tasks like RocStories/SWAG tasks. + + Args: + config (:class:`XLMConfig`): + An instance of :class:`XLMConfig`. + """ + + def __init__(self, config: XLMConfig): + super(XLMForMultipleChoice, self).__init__(config) + # self.num_choices = num_choices + self.xlm = XLMModel(config) + self.dropout = nn.Dropout(config.dropout if config.dropout is not None else config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + def forward(self, input_ids=None, langs=None, attention_mask=None, position_ids=None, lengths=None): + r""" + The XLMForMultipleChoice forward method, overrides the __call__() special method. + Args: + input_ids (Tensor): + See :class:`XLMModel` and shape as [batch_size, num_choice, sequence_length]. + langs(Tensor, optional): + See :class:`XLMModel` and shape as [batch_size, num_choice, sequence_length]. + attention_mask (Tensor, optional): + See :class:`XLMModel` and shape as [batch_size, num_choice, sequence_length]. + position_ids (Tensor, optional): + See :class:`XLMModel` and shape as [batch_size, num_choice, sequence_length]. + lengths (Tensor, optional): + See :class:`XLMModel` and shape as [batch_size, num_choice]. + + Returns: + reshaped_logits (Tensor): + A tensor of the multiple choice classification logits. + Shape as `[batch_size, num_choice]` and dtype as `float32`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers import XLMForMultipleChoice, XLMTokenizer + from paddlenlp.data import Pad + + tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-tlm-xnli15-1024") + model = XLMForMultipleChoice.from_pretrained("xlm-mlm-tlm-xnli15-1024", num_choices=2) + + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + + inputs = tokenizer(text, text_pair, lang="en") + input_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(inputs["input_ids"]) + input_ids = paddle.to_tensor(input_ids, dtype="int64") + langs = paddle.ones_like(input_ids) * tokenizer.lang2id["en"] + + reshaped_logits = model( + input_ids=input_ids, + langs=langs, + ) + """ + num_choices = input_ids.shape[1] + # input_ids: [bs, num_choice, seqlen] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice, seqlen] + + if langs is not None: + langs = langs.reshape(shape=(-1, langs.shape[-1])) + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + if position_ids is not None: + position_ids = position_ids.reshape(shape=(-1, position_ids.shape[-1])) + + if lengths is not None: + lengths = lengths.reshape(shape=(-1,)) + + sequence_output = self.xlm( + input_ids, langs=langs, attention_mask=attention_mask, position_ids=position_ids, lengths=lengths + )[0] + sequence_output = self.dropout(sequence_output) + pooled_output = sequence_output[:, 0] + + logits = self.classifier(pooled_output) # logits: [bs*num_choice, 1] + reshaped_logits = logits.reshape(shape=(-1, num_choices)) # logits: [bs, num_choice] + + return reshaped_logits diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/tokenizer.py new file mode 100644 index 000000000..34064dec3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlm/tokenizer.py @@ -0,0 +1,1023 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import re +import shutil +import sys +import unicodedata +from typing import List, Optional + +from paddle.utils import try_import + +from ...utils.log import logger +from .. 
import PretrainedTokenizer +from ..tokenizer_utils import AddedToken, TextInput + +__all__ = ["XLMTokenizer"] + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "xlm-mlm-en-2048": 512, + "xlm-mlm-ende-1024": 512, + "xlm-mlm-enfr-1024": 512, + "xlm-mlm-enro-1024": 512, + "xlm-mlm-tlm-xnli15-1024": 512, + "xlm-mlm-xnli15-1024": 512, + "xlm-clm-enfr-1024": 512, + "xlm-clm-ende-1024": 512, + "xlm-mlm-17-1280": 512, + "xlm-mlm-100-1280": 512, +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +def lowercase_and_remove_accent(text): + """ + Lowercase and strips accents from a piece of text based on + https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py + """ + text = " ".join(text) + text = text.lower() + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output).lower().split(" ") + + +def replace_unicode_punct(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + """ + text = text.replace(",", ",") + text = re.sub(r"。\s*", ". ", text) + text = text.replace("、", ",") + text = text.replace("”", '"') + text = text.replace("“", '"') + text = text.replace("∶", ":") + text = text.replace(":", ":") + text = text.replace("?", "?") + text = text.replace("《", '"') + text = text.replace("》", '"') + text = text.replace(")", ")") + text = text.replace("!", "!") + text = text.replace("(", "(") + text = text.replace(";", ";") + text = text.replace("1", "1") + text = text.replace("」", '"') + text = text.replace("「", '"') + text = text.replace("0", "0") + text = text.replace("3", "3") + text = text.replace("2", "2") + text = text.replace("5", "5") + text = text.replace("6", "6") + text = text.replace("9", "9") + text = text.replace("7", "7") + text = text.replace("8", "8") + text = text.replace("4", "4") + text = re.sub(r".\s*", ". 
", text) + text = text.replace("~", "~") + text = text.replace("’", "'") + text = text.replace("…", "...") + text = text.replace("━", "-") + text = text.replace("〈", "<") + text = text.replace("〉", ">") + text = text.replace("【", "[") + text = text.replace("】", "]") + text = text.replace("%", "%") + return text + + +def remove_non_printing_char(text): + """ + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl + """ + output = [] + for char in text: + cat = unicodedata.category(char) + if cat.startswith("C"): + continue + output.append(char) + return "".join(output) + + +def romanian_preprocessing(text): + """Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`""" + # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py + text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") + text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") + # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py + text = text.replace("\u0218", "S").replace("\u0219", "s") # s-comma + text = text.replace("\u021a", "T").replace("\u021b", "t") # t-comma + text = text.replace("\u0102", "A").replace("\u0103", "a") + text = text.replace("\u00C2", "A").replace("\u00E2", "a") + text = text.replace("\u00CE", "I").replace("\u00EE", "i") + return text + + +class XLMTokenizer(PretrainedTokenizer): + """ + Construct an XLM tokenizer. Based on Byte-Pair Encoding. The tokenization process is the following: + - Moses preprocessing and tokenization for most supported languages. + - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP). + - Optionally lowercases and normalizes all inputs text. + - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols (like + "__classify__") to a vocabulary. + - The `lang2id` attribute maps the languages supported by the model with their IDs if provided (automatically set + for pretrained vocabularies). + - The `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies). + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer`. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (str): + Vocabulary file. + merges_file (str): + Merges file. + unk_token (str, optional): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + Defaults to `""`. + sep_token (str, optional): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + Defaults to `""`. + pad_token (str, optional): + The token used for padding, for example when batching sequences of different lengths. + Defaults to `""`. 
+ cls_token (str, optional): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + Defaults to `""`. + mask_token (str, optional): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + Defaults to `""`. + additional_special_tokens (List[str], optional): + List of additional special tokens. + Defaults to `["","","","","","","","","",""]`. + lang2id (Dict[str, int], optional): + Dictionary mapping languages string identifiers to their IDs. + id2lang (Dict[int, str], optional): + Dictionary mapping language IDs to their string identifiers. + do_lowercase_and_remove_accent (bool, optional): + Whether to lowercase and remove accents when tokenizing. + Defaults to `True`. + """ + + resource_files_names = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + } + + pretrained_resource_files_map = { + "vocab_file": { + "xlm-mlm-en-2048": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-en-2048/vocab.json", + "xlm-mlm-ende-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-ende-1024/vocab.json", + "xlm-mlm-enfr-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-enfr-1024/vocab.json", + "xlm-mlm-enro-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-enro-1024/vocab.json", + "xlm-mlm-tlm-xnli15-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-tlm-xnli15-1024/vocab.json", + "xlm-mlm-xnli15-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-xnli15-1024/vocab.json", + "xlm-clm-enfr-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-clm-enfr-1024/vocab.json", + "xlm-clm-ende-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-clm-ende-1024/vocab.json", + "xlm-mlm-17-1280": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-17-1280/vocab.json", + "xlm-mlm-100-1280": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-100-1280/vocab.json", + }, + "merges_file": { + "xlm-mlm-en-2048": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-en-2048/merges.txt", + "xlm-mlm-ende-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-ende-1024/merges.txt", + "xlm-mlm-enfr-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-enfr-1024/merges.txt", + "xlm-mlm-enro-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-enro-1024/merges.txt", + "xlm-mlm-tlm-xnli15-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-tlm-xnli15-1024/merges.txt", + "xlm-mlm-xnli15-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-xnli15-1024/merges.txt", + "xlm-clm-enfr-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-clm-enfr-1024/merges.txt", + "xlm-clm-ende-1024": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-clm-ende-1024/merges.txt", + "xlm-mlm-17-1280": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-17-1280/merges.txt", + "xlm-mlm-100-1280": "https://bj.bcebos.com/paddlenlp/models/transformers/xlm/xlm-mlm-100-1280/merges.txt", + }, + } + pretrained_init_configuration = { + "xlm-mlm-en-2048": {"do_lowercase_and_remove_accent": True}, + "xlm-mlm-ende-1024": { + "do_lowercase_and_remove_accent": True, + 
"id2lang": {0: "de", 1: "en"}, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-enfr-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {0: "en", 1: "fr"}, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-mlm-enro-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {0: "en", 1: "ro"}, + "lang2id": {"en": 0, "ro": 1}, + }, + "xlm-mlm-tlm-xnli15-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": { + 0: "ar", + 1: "bg", + 2: "de", + 3: "el", + 4: "en", + 5: "es", + 6: "fr", + 7: "hi", + 8: "ru", + 9: "sw", + 10: "th", + 11: "tr", + 12: "ur", + 13: "vi", + 14: "zh", + }, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-mlm-xnli15-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": { + 0: "ar", + 1: "bg", + 2: "de", + 3: "el", + 4: "en", + 5: "es", + 6: "fr", + 7: "hi", + 8: "ru", + 9: "sw", + 10: "th", + 11: "tr", + 12: "ur", + 13: "vi", + 14: "zh", + }, + "lang2id": { + "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14, + }, + }, + "xlm-clm-enfr-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {0: "en", 1: "fr"}, + "lang2id": {"en": 0, "fr": 1}, + }, + "xlm-clm-ende-1024": { + "do_lowercase_and_remove_accent": True, + "id2lang": {0: "de", 1: "en"}, + "lang2id": {"de": 0, "en": 1}, + }, + "xlm-mlm-17-1280": { + "do_lowercase_and_remove_accent": False, + "id2lang": { + 0: "ar", + 1: "de", + 2: "en", + 3: "es", + 4: "fr", + 5: "hi", + 6: "it", + 7: "ja", + 8: "ko", + 9: "nl", + 10: "pl", + 11: "pt", + 12: "ru", + 13: "sv", + 14: "tr", + 15: "vi", + 16: "zh", + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16, + }, + }, + "xlm-mlm-100-1280": { + "do_lowercase_and_remove_accent": False, + "id2lang": { + 0: "af", + 1: "als", + 2: "am", + 3: "an", + 4: "ang", + 5: "ar", + 6: "arz", + 7: "ast", + 8: "az", + 9: "bar", + 10: "be", + 11: "bg", + 12: "bn", + 13: "br", + 14: "bs", + 15: "ca", + 16: "ceb", + 17: "ckb", + 18: "cs", + 19: "cy", + 20: "da", + 21: "de", + 22: "el", + 23: "en", + 24: "eo", + 25: "es", + 26: "et", + 27: "eu", + 28: "fa", + 29: "fi", + 30: "fr", + 31: "fy", + 32: "ga", + 33: "gan", + 34: "gl", + 35: "gu", + 36: "he", + 37: "hi", + 38: "hr", + 39: "hu", + 40: "hy", + 41: "ia", + 42: "id", + 43: "is", + 44: "it", + 45: "ja", + 46: "jv", + 47: "ka", + 48: "kk", + 49: "kn", + 50: "ko", + 51: "ku", + 52: "la", + 53: "lb", + 54: "lt", + 55: "lv", + 56: "mk", + 57: "ml", + 58: "mn", + 59: "mr", + 60: "ms", + 61: "my", + 62: "nds", + 63: "ne", + 64: "nl", + 65: "nn", + 66: "no", + 67: "oc", + 68: "pl", + 69: "pt", + 70: "ro", + 71: "ru", + 72: "scn", + 73: "sco", + 74: "sh", + 75: "si", + 76: "simple", + 77: "sk", + 78: "sl", + 79: "sq", + 80: "sr", + 81: "sv", + 82: "sw", + 83: "ta", + 84: "te", + 85: "th", + 86: "tl", + 87: "tr", + 88: "tt", + 89: "uk", + 90: "ur", + 91: "uz", + 92: "vi", + 93: "war", + 94: "wuu", + 95: "yi", + 96: "zh", + 97: "zh_classical", + 98: "zh_min_nan", + 99: "zh_yue", + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + 
"ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, + "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99, + }, + }, + } + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + merges_file, + unk_token="", + bos_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=[ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ], + lang2id=None, + id2lang=None, + do_lowercase_and_remove_accent=True, + **kwargs, + ): + super().__init__( + unk_token=unk_token, + bos_token=bos_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + lang2id=lang2id, + id2lang=id2lang, + do_lowercase_and_remove_accent=do_lowercase_and_remove_accent, + **kwargs, + ) + self._vocab_file = vocab_file + self._merges_file = merges_file + self.sm = try_import("sacremoses") + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = dict() + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = dict() + self.lang_with_custom_tokenizer = set(["zh", "th", "ja"]) + # True for current supported model (v1.2.0), False for XLM-17 & 100 + self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + + self.ja_word_tokenizer = None + self.zh_word_tokenizer = None + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + merges = [tuple(merge.split()[:2]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + @property + def do_lower_case(self): + return self.do_lowercase_and_remove_accent + + def moses_punct_norm(self, text, lang): + if lang not in self.cache_moses_punct_normalizer: + punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang) + self.cache_moses_punct_normalizer[lang] = punct_normalizer + else: + punct_normalizer = self.cache_moses_punct_normalizer[lang] + return punct_normalizer.normalize(text) + + def moses_tokenize(self, text, lang): + if lang not in self.cache_moses_tokenizer: + moses_tokenizer = self.sm.MosesTokenizer(lang=lang) + self.cache_moses_tokenizer[lang] = moses_tokenizer + else: + moses_tokenizer = self.cache_moses_tokenizer[lang] + return 
moses_tokenizer.tokenize(text, return_str=False, escape=False) + + def moses_pipeline(self, text, lang): + text = replace_unicode_punct(text) + text = self.moses_punct_norm(text, lang) + text = remove_non_printing_char(text) + return text + + def ja_tokenize(self, text): + """Tokenize a Japanese string.""" + if self.ja_word_tokenizer is None: + try: + import Mykytea + + self.ja_word_tokenizer = Mykytea.Mykytea( + f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin" + ) + except (AttributeError, ImportError): + logger.error( + "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper (https://github.com/chezou/Mykytea-python) with the following steps" + ) + logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea") + logger.error("2. autoreconf -i") + logger.error("3. ./configure --prefix=$HOME/local") + logger.error("4. make && make install") + logger.error("5. pip install kytea") + raise + return list(self.ja_word_tokenizer.getWS(text)) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + # def __len__(self): + # return len(self.encoder) + + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + "",) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + if word == "\n ": + word = "\n" + self.cache[token] = word + return word + + def tokenize(self, text: TextInput, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, using the tokenizer. + + Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies + (BPE/SentencePieces/WordPieces). Takes care of added tokens. + + Args: + text (`str`): + The sequence to be encoded. + **kwargs (additional keyword arguments): + Passed along to the model-specific `prepare_for_tokenization` preprocessing method. + + Returns: + `List[str]`: The list of tokens. + """ + # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors + all_special_tokens_extended = dict( + (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) + ) + + text, kwargs = self.prepare_for_tokenization(text, **kwargs) + + # TODO: should this be in the base class? 
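+        # Note on the block below: when `do_lower_case` is set, plain text is lower-cased
+        # while registered special tokens are preserved -- the escaped special tokens are
+        # matched ahead of the catch-all "(.+?)" group, so only that plain-text group is
+        # lower-cased. `tokens_trie.split()` then separates no-split special tokens from
+        # ordinary chunks, and only the ordinary chunks go through `_tokenize()`
+        # (Moses/jieba/kytea/pythainlp plus BPE).
+        # Illustration (using "<special>" as a placeholder for any registered special token):
+        # with lower-casing on, "Hello <special> WORLD" splits into ["hello", "<special>", "world"],
+        # and BPE is then applied only to the plain-text chunks.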
+ if hasattr(self, "do_lower_case") and self.do_lower_case: + # convert non-special tokens to lowercase + escaped_special_toks = [ + re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) + ] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) + + no_split_token = set(self.unique_no_split_tokens) + tokens = self.tokens_trie.split(text) + # ["This is something", "", " else"] + for i, token in enumerate(tokens): + if token in no_split_token: + tok_extended = all_special_tokens_extended.get(token, None) + left = tokens[i - 1] if i > 0 else None + right = tokens[i + 1] if i < len(tokens) - 1 else None + if isinstance(tok_extended, AddedToken): + if tok_extended.rstrip and right: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + tokens[i + 1] = right.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and left: + tokens[i - 1] = left.rstrip() # Opposite here + else: + # We strip left and right by default + if right: + tokens[i + 1] = right.lstrip() + if left: + tokens[i - 1] = left.rstrip() + # ["This is something", "", "else"] + tokenized_text = [] + lang = kwargs.pop("lang", "en") + bypass_tokenizer = kwargs.pop("bypass_tokenizer", False) + for token in tokens: + # Need to skip eventual empty (fully stripped) tokens + if not token: + continue + if token in no_split_token: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token, lang=lang, bypass_tokenizer=bypass_tokenizer)) + # ["This", " is", " something", "", "else"] + return tokenized_text + + def _tokenize(self, text, lang="en", bypass_tokenizer=False): + """ + Tokenize a string given language code. For Chinese, Japanese and Thai, we use a language specific tokenizer. + Otherwise, we use Moses. + Details of tokenization: + - [sacremoses](https://github.com/alvations/sacremoses): port of Moses + - Install with `pip install sacremoses` + - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer + - Install with `pip install pythainlp` + - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of + [KyTea](https://github.com/neubig/kytea) + - Install with the following steps: + :: + git clone git@github.com:neubig/kytea.git && cd kytea autoreconf -i ./configure --prefix=$HOME/local + make && make install pip install kytea + - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer (*) + - Install with `pip install jieba` + (*) The original XLM used [Stanford + Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip). However, the wrapper + (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated. Jieba is a lot + faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine if you + fine-tune the model with Chinese supervisionself. If you want the same exact behaviour, use the original XLM + [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence + externally, and set `bypass_tokenizer=True` to bypass the tokenizer. + + Args: + - lang: ISO language code (default = 'en') (string). Languages should belong of the model supported + languages. However, we don't enforce it. 
+ - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) + (bool). If True, we only apply BPE. + + Returns: + List of tokens. + + """ + if lang and self.lang2id and lang not in self.lang2id: + logger.error( + "Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model." + ) + if bypass_tokenizer: + text = text.split() + elif lang not in self.lang_with_custom_tokenizer: + text = self.moses_pipeline(text, lang=lang) + # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step + if lang == "ro": + text = romanian_preprocessing(text) + text = self.moses_tokenize(text, lang=lang) + elif lang == "th": + text = self.moses_pipeline(text, lang=lang) + try: + if "pythainlp" not in sys.modules: + from pythainlp.tokenize import word_tokenize as th_word_tokenize + else: + th_word_tokenize = sys.modules["pythainlp"].word_tokenize + except (AttributeError, ImportError): + logger.error( + "Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps" + ) + logger.error("1. pip install pythainlp") + raise + text = th_word_tokenize(text) + elif lang == "zh": + try: + if "jieba" not in sys.modules: + import jieba + else: + jieba = sys.modules["jieba"] + except (AttributeError, ImportError): + logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps") + logger.error("1. pip install jieba") + raise + text = " ".join(jieba.cut(text)) + text = self.moses_pipeline(text, lang=lang) + text = text.split() + elif lang == "ja": + text = self.moses_pipeline(text, lang=lang) + text = self.ja_tokenize(text) + else: + raise ValueError("It should not reach here") + + if self.do_lowercase_and_remove_accent and not bypass_tokenizer: + text = lowercase_and_remove_accent(text) + + split_tokens = [] + for token in text: + if token: + split_tokens.extend([t for t in self.bpe(token).split(" ")]) + + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = "".join(tokens).replace("", " ").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An XLM sequence has the following format: + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: The model input with special tokens. 
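+
+        Example:
+            .. code-block::
+
+                # A minimal sketch: `tokenizer` is an already loaded XLMTokenizer and the
+                # token IDs below are made up for illustration.
+                token_ids_0 = [55, 66, 77]
+                token_ids_1 = [88, 99]
+
+                tokenizer.build_inputs_with_special_tokens(token_ids_0)
+                # -> [bos_token_id, 55, 66, 77, sep_token_id]
+
+                tokenizer.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
+                # -> [bos_token_id, 55, 66, 77, sep_token_id, 88, 99, sep_token_id]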
+ """ + bos = [self.bos_token_id] + sep = [self.sep_token_id] + + if token_ids_1 is None: + return bos + token_ids_0 + sep + return bos + token_ids_0 + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence + pair mask has the following format: + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of [token type IDs] according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def save_resources(self, save_directory): + """ + Save tokenizer related resources to files under `save_directory`. + + Args: + save_directory (str): Directory to save files into. + + """ + for name, file_name in self.resource_files_names.items(): + source_path = getattr(self, "_%s" % name) + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(source_path) != os.path.abspath(save_path): + shutil.copyfile(source_path, save_path) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/__init__.py new file mode 100644 index 000000000..b0d15c218 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/configuration.py new file mode 100644 index 000000000..f5a93b6cc --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/configuration.py @@ -0,0 +1,337 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" XLNet configuration""" +from __future__ import annotations + +import logging +import warnings +from typing import Dict + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) +__all__ = ["XLNET_PRETRAINED_INIT_CONFIGURATION", "XLNetConfig", "XLNET_PRETRAINED_RESOURCE_FILES_MAP"] +XLNET_PRETRAINED_RESOURCE_FILES_MAP = { + "model_state": { + "xlnet-base-cased": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-base-cased.pdparams", + "xlnet-large-cased": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-large-cased.pdparams", + "chinese-xlnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/chinese-xlnet-base.pdparams", + "chinese-xlnet-mid": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/chinese-xlnet-mid.pdparams", + "chinese-xlnet-large": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/chinese-xlnet-large.pdparams", + } +} + +XLNET_PRETRAINED_INIT_CONFIGURATION = { + "xlnet-base-cased": { + "attn_type": "bi", + "bi_data": False, + "clamp_len": -1, + "d_head": 64, + "d_inner": 3072, + "d_model": 768, + "dropout": 0.1, + "classifier_dropout": 0.1, + "ff_activation": "gelu", + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "mem_len": None, + "n_head": 12, + "n_layer": 12, + "reuse_len": None, + "same_length": False, + "vocab_size": 32000, + }, + "xlnet-large-cased": { + "attn_type": "bi", + "bi_data": False, + "clamp_len": -1, + "d_head": 64, + "d_inner": 4096, + "d_model": 1024, + "dropout": 0.1, + "classifier_dropout": 0.1, + "ff_activation": "gelu", + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "mem_len": None, + "n_head": 16, + "n_layer": 24, + "reuse_len": None, + "same_length": False, + "vocab_size": 32000, + }, + "chinese-xlnet-base": { + "attn_type": "bi", + "bi_data": False, + "clamp_len": -1, + "d_head": 64, + "d_inner": 3072, + "d_model": 768, + "dropout": 0.1, + "classifier_dropout": 0.1, + "ff_activation": "relu", + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "mem_len": None, + "n_head": 12, + "n_layer": 12, + "reuse_len": None, + "same_length": False, + "vocab_size": 32000, + }, + "chinese-xlnet-mid": { + "attn_type": "bi", + "bi_data": False, + "clamp_len": -1, + "d_head": 64, + "d_inner": 3072, + "d_model": 768, + "dropout": 0.1, + "classifier_dropout": 0.1, + "ff_activation": 
"relu", + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "mem_len": None, + "n_head": 12, + "n_layer": 24, + "reuse_len": None, + "same_length": False, + "vocab_size": 32000, + }, + "chinese-xlnet-large": { + "attn_type": "bi", + "bi_data": False, + "clamp_len": -1, + "d_head": 64, + "d_inner": 4096, + "d_model": 1024, + "dropout": 0.1, + "classifier_dropout": 0.1, + "ff_activation": "relu", + "initializer_range": 0.02, + "layer_norm_eps": 1e-12, + "mem_len": None, + "n_head": 16, + "n_layer": 24, + "reuse_len": None, + "same_length": False, + "vocab_size": 32000, + }, +} + + +class XLNetConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`XLNetModel`]. It is used to + instantiate a XLNet model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [xlnet-large-cased] architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the XLNet model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`XLNetModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + n_layer (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + d_inner (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + ff_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the If string, `"gelu"`, `"relu"`, `"silu"` and + `"gelu_new"` are supported. + untie_r (`bool`, *optional*, defaults to `True`): + Whether or not to untie relative position biases + attn_type (`str`, *optional*, defaults to `"bi"`): + The attention type used by the model. Set `"bi"` for XLNet, `"uni"` for Transformer-XL. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + mem_len (`int` or `None`, *optional*): + The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous + forward pass won't be re-computed. + reuse_len (`int`, *optional*): + The number of tokens in the current batch to be cached and reused in the future. + bi_data (`bool`, *optional*, defaults to `False`): + Whether or not to use bidirectional input pipeline. Usually set to `True` during pretraining and `False` + during finetuning. + clamp_len (`int`, *optional*, defaults to -1): + Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping. + same_length (`bool`, *optional*, defaults to `False`): + Whether or not to use the same attention length for each token. 
+ summary_type (`str`, *optional*, defaults to "last"): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Has to be one of the following options: + + - `"last"`: Take the last token hidden state (like XLNet). + - `"first"`: Take the first token hidden state (like BERT). + - `"mean"`: Take the mean of all tokens hidden states. + - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - `"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (`bool`, *optional*, defaults to `True`): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Whether or not to add a projection after the vector extraction. + summary_activation (`str`, *optional*): + Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. + + Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (`boo`, *optional*, defaults to `True`): + Used in the sequence classification and multiple choice models. + + Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes. + summary_last_dropout (`float`, *optional*, defaults to 0.1): + Used in the sequence classification and multiple choice models. + The dropout ratio to be used after the projection and activation. + start_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + end_n_top (`int`, *optional*, defaults to 5): + Used in the SQuAD evaluation script. + use_mems_eval (`bool`, *optional*, defaults to `True`): + Whether or not the model should make use of the recurrent memory mechanism in evaluation mode. + use_mems_train (`bool`, *optional*, defaults to `False`): + Whether or not the model should make use of the recurrent memory mechanism in train mode. 
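+        d_head (`int`, *optional*):
+            Dimensionality of each attention head. If passed it must equal `d_model // n_head`
+            (the value the constructor derives in any case), so `d_model` has to be divisible
+            by `n_head`; e.g. `d_model=1024` with `n_head=16` yields `d_head=64`.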
+ + Examples: + + ```python + >>> from transformers import XLNetConfig, XLNetModel + + >>> # Initializing a XLNet configuration + >>> configuration = XLNetConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = XLNetModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "xlnet" + keys_to_ignore_at_inference = ["mems"] + pretrained_init_configuration = XLNET_PRETRAINED_INIT_CONFIGURATION + # attribute_map: Dict[str, str] = {"hidden_size": "classifier_dropout", "num_classes": "num_labels"} + attribute_map: Dict[str, str] = { + "n_token": "vocab_size", # Backward compatibility + "hidden_size": "d_model", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + "num_classes": "num_labels", + } + + def __init__( + self, + vocab_size=32000, + d_model=1024, + n_layer=24, + n_head=16, + d_inner=4096, + ff_activation="gelu", + untie_r=True, + attn_type="bi", + initializer_range=0.02, + layer_norm_eps=1e-12, + dropout=0.1, + classfier_dropout=0.1, + mem_len=512, + reuse_len=None, + use_mems_eval=True, + use_mems_train=False, + bi_data=False, + clamp_len=-1, + same_length=False, + summary_type="last", + summary_use_proj=True, + summary_activation="tanh", + summary_last_dropout=0.1, + start_n_top=5, + end_n_top=5, + pad_token_id=5, + bos_token_id=1, + eos_token_id=2, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + """Constructs XLNetConfig.""" + self.vocab_size = vocab_size + self.d_model = d_model + self.n_layer = n_layer + self.n_head = n_head + if d_model % n_head != 0: + raise ValueError(f"'d_model % n_head' ({d_model % n_head}) should be equal to 0") + if "d_head" in kwargs: + if kwargs["d_head"] != d_model // n_head: + raise ValueError( + f"`d_head` ({kwargs['d_head']}) should be equal to `d_model // n_head` ({d_model // n_head})" + ) + self.d_head = d_model // n_head + self.ff_activation = ff_activation + self.d_inner = d_inner + self.untie_r = untie_r + self.attn_type = attn_type + + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.dropout = dropout + self.classfier_dropout = classfier_dropout + self.mem_len = mem_len + self.reuse_len = reuse_len + self.bi_data = bi_data + self.clamp_len = clamp_len + self.same_length = same_length + + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_last_dropout = summary_last_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + + self.bos_token_id = bos_token_id + self.pad_token_id = pad_token_id + self.eos_token_id = eos_token_id + + if "use_cache" in kwargs: + warnings.warn( + "The `use_cache` argument is deprecated and will be removed in a future version, use `use_mems_eval`" + " instead.", + FutureWarning, + ) + use_mems_eval = kwargs["use_cache"] + + self.use_mems_eval = use_mems_eval + self.use_mems_train = use_mems_train + + @property + def max_position_embeddings(self): + logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.") + return -1 + + @max_position_embeddings.setter + def max_position_embeddings(self, value): + # Message copied from Transformer-XL documentation + raise NotImplementedError( + f"The model {self.model_type} is one of the few models that has no sequence length limit." 
+ ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/converter.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/converter.py new file mode 100644 index 000000000..ea2a6e015 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/converter.py @@ -0,0 +1,64 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations +from typing import List, Union, Dict, Type + +from paddlenlp.transformers import PretrainedModel, XLNetModel +from paddlenlp.utils.converter import StateDictNameMapping, Converter + +__all__ = ["XLNetConverter"] + + +class XLNetConverter(Converter): + _ignore_state_dict_keys = ["embeddings.position_ids"] + num_layer_key = "n_layer" + architectures: Dict[str, Type[PretrainedModel]] = {"XLNetModel": XLNetModel} + + def get_paddle_pytorch_model_classes(self): + from paddlenlp.transformers import XLNetModel as PaddleModel + from transformers import XLNetModel as PytorchModel + + return PaddleModel, PytorchModel + + def get_name_mapping(self, config_or_num_layers: Union[dict, int] = None) -> List[StateDictNameMapping]: + num_layer = self.resolve_num_layer(config_or_num_layers) + + hard_mapping = [ + ["mask_emb", "mask_emb"], + ["word_embedding.weight", "word_embedding.weight"], + ] + + for layer_index in range(num_layer): + layer_mappings = [ + [f"layer.{layer_index}.rel_attn.q", f"layer.{layer_index}.rel_attn.q", "merge_last_two_dim"], + [f"layer.{layer_index}.rel_attn.k", f"layer.{layer_index}.rel_attn.k", "merge_last_two_dim"], + [f"layer.{layer_index}.rel_attn.v", f"layer.{layer_index}.rel_attn.v", "merge_last_two_dim"], + [f"layer.{layer_index}.rel_attn.o", f"layer.{layer_index}.rel_attn.o", "merge_last_two_dim"], + [f"layer.{layer_index}.rel_attn.r", f"layer.{layer_index}.rel_attn.r", "merge_last_two_dim"], + [f"layer.{layer_index}.rel_attn.r_r_bias", f"layer.{layer_index}.rel_attn.r_r_bias"], + [f"layer.{layer_index}.rel_attn.r_s_bias", f"layer.{layer_index}.rel_attn.r_s_bias"], + [f"layer.{layer_index}.rel_attn.r_w_bias", f"layer.{layer_index}.rel_attn.r_w_bias"], + [f"layer.{layer_index}.rel_attn.seg_embed", f"layer.{layer_index}.rel_attn.seg_embed"], + [f"layer.{layer_index}.rel_attn.layer_norm.weight", f"layer.{layer_index}.rel_attn.layer_norm.weight"], + [f"layer.{layer_index}.rel_attn.layer_norm.bias", f"layer.{layer_index}.rel_attn.layer_norm.bias"], + [f"layer.{layer_index}.ff.layer_norm.weight", f"layer.{layer_index}.ff.layer_norm.weight"], + [f"layer.{layer_index}.ff.layer_norm.bias", f"layer.{layer_index}.ff.layer_norm.bias"], + [f"layer.{layer_index}.ff.layer_1.weight", f"layer.{layer_index}.ff.layer_1.weight", "transpose"], + [f"layer.{layer_index}.ff.layer_2.weight", f"layer.{layer_index}.ff.layer_2.weight", "transpose"], + [f"layer.{layer_index}.ff.layer_1.bias", f"layer.{layer_index}.ff.layer_1.bias"], + [f"layer.{layer_index}.ff.layer_2.bias", 
f"layer.{layer_index}.ff.layer_2.bias"], + ] + hard_mapping.extend(layer_mappings) + return [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(hard_mapping)] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/modeling.py new file mode 100644 index 000000000..608f300db --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/modeling.py @@ -0,0 +1,1931 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Modeling classes for XLNet model.""" +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn import BCEWithLogitsLoss, CrossEntropyLoss, Layer, MSELoss + +from ...utils.env import CONFIG_NAME +from .. import PretrainedModel, register_base_model +from ..activations import ACT2FN, get_activation +from ..model_outputs import ModelOutput, tuple_output +from .configuration import ( + XLNET_PRETRAINED_INIT_CONFIGURATION, + XLNET_PRETRAINED_RESOURCE_FILES_MAP, + XLNetConfig, +) + +__all__ = [ + "XLNetPretrainedModel", + "XLNetModel", + "XLNetForSequenceClassification", + "XLNetForTokenClassification", + "XLNetLMHeadModel", + "XLNetForMultipleChoice", + "XLNetForQuestionAnswering", + "XLNetForCausalLM", +] + +dtype_float = paddle.get_default_dtype() + + +class XLNetRelativeAttention(Layer): + def __init__(self, config: XLNetConfig): + super(XLNetRelativeAttention, self).__init__() + + self.n_head = config.n_head + self.d_head = config.d_head + self.d_model = config.d_model + self.scale = 1 / (config.d_head**0.5) + + self.q = self.create_parameter([self.d_model, self.n_head * self.d_head]) + self.k = self.create_parameter([self.d_model, self.n_head * self.d_head]) + self.v = self.create_parameter([self.d_model, self.n_head * self.d_head]) + self.o = self.create_parameter([self.d_model, self.n_head * self.d_head]) + self.r = self.create_parameter([self.d_model, self.n_head * self.d_head]) + + self.r_r_bias = self.create_parameter([self.n_head, self.d_head], is_bias=True) + self.r_s_bias = self.create_parameter([self.n_head, self.d_head], is_bias=True) + self.r_w_bias = self.create_parameter([self.n_head, self.d_head], is_bias=True) + self.seg_embed = self.create_parameter([2, self.n_head, self.d_head], is_bias=False) + + self.layer_norm = nn.LayerNorm(config.d_model, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.dropout) + + def prune_heads(self, heads): + raise NotImplementedError + + @staticmethod + def rel_shift_bnij(x, klen=-1): + # Relative shift of the attention matrix from bd~ to bd (refer to Appendix B in the Transformer-XL paper) + x_size = x.shape + + x = paddle.reshape(x, [x_size[0], x_size[1], 
x_size[3], x_size[2]]) + x = x[:, :, 1:, :] + x = paddle.reshape(x, [x_size[0], x_size[1], x_size[2], x_size[3] - 1]) + x = paddle.index_select(x, index=paddle.arange(klen, dtype="int64"), axis=3) + return x + + def rel_attn_core( + self, + q_head, + k_head_h, + v_head_h, + k_head_r, + seg_mat=None, + attn_mask=None, + head_mask=None, + output_attentions=False, + ): + """Core relative positional attention operations.""" + + # Content based attention score (refer to the Transformer-XL paper) + # q_head = Exi * Wq; self.r_w_bias = u; k_head_h = Wke * Exj + # a = Exi * Wq * Wke * Exj; c = u * Wke * Exj; ac = a + c + ac = paddle.einsum("ibnd,jbnd->bnij", q_head + self.r_w_bias, k_head_h) + + # Position based attention score (refer to the Transformer-XL paper) + # q_head = Exi * Wq; self.r_r_bias = v; k_head_r = Wkr * Rij + # b = Exi * Wq * Wkr * Rij; d = v * Wkr * Rij; bd = b + d + bd = paddle.einsum("ibnd,jbnd->bnij", q_head + self.r_r_bias, k_head_r) + bd = self.rel_shift_bnij(bd, klen=ac.shape[3]) + + # Segment based attention score + if seg_mat is None: + ef = 0 + else: + ef = paddle.einsum("ibnd,snd->ibns", q_head + self.r_s_bias, self.seg_embed) + ef = paddle.einsum("ijbs,ibns->bnij", seg_mat, ef) + + # Merge attention scores and perform masking + attn_score = (ac + bd + ef) * self.scale + + if attn_mask is not None: + attn_mask = attn_mask.transpose([2, 3, 0, 1]) + attn_score = attn_score - 1e30 * attn_mask + + # Attention probability + attn_prob = F.softmax(attn_score, axis=3) + attn_prob = self.dropout(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask.transpose([2, 3, 0, 1]) + + # Attention output + attn_vec = paddle.einsum("bnij,jbnd->ibnd", attn_prob, v_head_h) + + if output_attentions: + return attn_vec, attn_prob.transpose([2, 3, 0, 1]) + return attn_vec + + def post_attention(self, h, attn_vec, residual=True): + """Post-attention processing.""" + # Post-attention projection (back to 'd_model') + # Compute einsum4x4("ibnd,hnd->ibh", attn_vec, self.o) + shape = attn_vec.shape + attn_vec = attn_vec.reshape([shape[0], shape[1], attn_vec.shape[2] * attn_vec.shape[3]]) + attn_out = paddle.einsum("ibm,hm->ibh", attn_vec, self.o) + + attn_out = self.dropout(attn_out) + if residual: + attn_out = attn_out + h + + output = self.layer_norm(attn_out) + return output + + def forward( + self, + h, + g, + attn_mask_h, + attn_mask_g, + r, + seg_mat, + mems=None, + target_mapping=None, + head_mask=None, + output_attentions=False, + ): + if g is not None: + # Two-stream attention with relative positional encoding. 
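+            # The content stream `h` queries with its own token representation, while the
+            # query stream `g` queries with a position-only representation (`mask_emb`,
+            # routed through `target_mapping` for partial prediction), so a predicted
+            # position never sees its own token identity. Both streams share the same
+            # q/k/v/o projections and the key/value heads computed from `cat` below;
+            # only the query head differs (q_head_h vs. q_head_g).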
+ # Content based attention score + if mems is not None and mems.dim() > 1: + cat = paddle.concat([mems, h], axis=0) + else: + cat = h + + # Content-based key head + # Compute k_head_h = einsum4x4("ibh,h(n*d)->ibnd", cat, self.k) + k_head_h = paddle.matmul(cat, self.k) + k_head_h = paddle.reshape(k_head_h, shape=[cat.shape[0], cat.shape[1], self.n_head, self.d_head]) + + # Content-based value head + # Compute v_head_h = einsum4x4("ibh,h(n*d)->ibnd", cat, self.v) + v_head_h = paddle.matmul(cat, self.v) + v_head_h = paddle.reshape(v_head_h, shape=[cat.shape[0], cat.shape[1], self.n_head, self.d_head]) + + # Position-based key head + # Compute k_head_r = einsum4x4("ibh,h(n*d)->ibnd", r, self.r) + k_head_r = paddle.matmul(r, self.r) + k_head_r = paddle.reshape(k_head_r, shape=[r.shape[0], r.shape[1], self.n_head, self.d_head]) + + # H-stream + # Content-stream query head + # Compute q_head_h = einsum4x4("ibh,h(n*d)->ibnd", h, self.q) + q_head_h = paddle.matmul(h, self.q) # shape + q_head_h = paddle.reshape(q_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head]) + + # Core attention ops + attn_vec_h = self.rel_attn_core( + q_head_h, + k_head_h, + v_head_h, + k_head_r, + seg_mat=seg_mat, + attn_mask=attn_mask_h, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + if output_attentions: + attn_vec_h, attn_prob_h = attn_vec_h + + # Post processing + output_h = self.post_attention(h, attn_vec_h) + + # G-stream + # Query-stream query head + # Compute q_head_g = einsum4x4("ibh,hnd->ibnd", g, self.q) + shape = g.shape + q_head_g = paddle.matmul(g, self.q).reshape([shape[0], shape[1], self.n_head, self.d_head]) + + # Core attention ops + if target_mapping is not None: + # Compute q_head_g = einsum4x4("mbnd,mlb->lbnd", q_head_g, target_mapping) + q_head_g = paddle.einsum("mbnd,mlb->lbnd", q_head_g, target_mapping) + attn_vec_g = self.rel_attn_core( + q_head_g, + k_head_h, + v_head_h, + k_head_r, + seg_mat=seg_mat, + attn_mask=attn_mask_g, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + if output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + # Compute attn_vec_g = einsum4x4("lbnd,mlb->mbnd", attn_vec_g, target_mapping) + attn_vec_g = paddle.einsum("lbnd,mlb->mbnd", attn_vec_g, target_mapping) + + else: + attn_vec_g = self.rel_attn_core( + q_head_g, + k_head_h, + v_head_h, + k_head_r, + seg_mat=seg_mat, + attn_mask=attn_mask_g, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + if output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + # Post processing + output_g = self.post_attention(g, attn_vec_g) + + if output_attentions: + attn_prob = attn_prob_h, attn_prob_g + + else: + # Multi-head attention with relative positional encoding + if mems is not None and mems.dim() > 1: + cat = paddle.concat([mems, h], axis=0) + else: + cat = h + + # Content heads + # Compute q_head_h = einsum4x4("ibh,hnd->ibnd", h, self.q) + q_head_h = paddle.matmul(h, self.q) + q_head_h = paddle.reshape(q_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head]) + + # Compute k_head_h = einsum4x4("ibh,hnd->ibnd", cat, self.k) + k_head_h = paddle.matmul(cat, self.k) + k_head_h = paddle.reshape(k_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head]) + + # Compute v_head_h = einsum4x4("ibh,hnd->ibnd", cat, self.v) + v_head_h = paddle.matmul(cat, self.v) + v_head_h = paddle.reshape(v_head_h, shape=[h.shape[0], h.shape[1], self.n_head, self.d_head]) + + # Position-based key head + # Compute k_head_r = einsum4x4("ibh,hnd->ibnd", r, 
self.r) + k_head_r = paddle.matmul(r, self.r) + k_head_r = paddle.reshape(k_head_r, shape=[k_head_r.shape[0], -1, self.n_head, self.d_head]) + + # Core attention ops + attn_vec = self.rel_attn_core( + q_head_h, + k_head_h, + v_head_h, + k_head_r, + seg_mat=seg_mat, + attn_mask=attn_mask_h, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + if output_attentions: + attn_vec, attn_prob = attn_vec + + # Post processing + output_h = self.post_attention(h, attn_vec) + output_g = None + + outputs = (output_h, output_g) + + if output_attentions: + outputs = outputs + (attn_prob,) + return outputs + + +class XLNetFeedForward(Layer): + def __init__(self, config: XLNetConfig): + super(XLNetFeedForward, self).__init__() + + self.layer_norm = nn.LayerNorm(config.d_model, epsilon=config.layer_norm_eps) + self.layer_1 = nn.Linear(config.d_model, config.d_inner) + self.layer_2 = nn.Linear(config.d_inner, config.d_model) + self.dropout = nn.Dropout(config.dropout) + if isinstance(config.ff_activation, str): + self.activation_function = ACT2FN[config.ff_activation] + else: + self.activation_function = config.ff_activation + + def forward(self, inp): + output = inp + output = self.layer_1(output) + output = self.activation_function(output) + output = self.dropout(output) + output = self.layer_2(output) + output = self.dropout(output) + output = self.layer_norm(output + inp) + return output + + +class XLNetLayer(Layer): + def __init__(self, config: XLNetConfig): + super(XLNetLayer, self).__init__() + + self.rel_attn = XLNetRelativeAttention(config) + self.ff = XLNetFeedForward(config) + self.seq_len_dim = 1 + + def forward( + self, + output_h, + output_g, + attn_mask_h, + attn_mask_g, + r, + seg_mat, + mems=None, + target_mapping=None, + head_mask=None, + output_attentions=False, + ): + outputs = self.rel_attn( + output_h, + output_g, + attn_mask_h, + attn_mask_g, + r, + seg_mat, + mems=mems, + target_mapping=target_mapping, + head_mask=head_mask, + output_attentions=output_attentions, + ) + + output_h, output_g = outputs[:2] + + if output_g is not None: + output_g = self.ff(output_g) + output_h = self.ff(output_h) + + outputs = (output_h, output_g) + outputs[2:] # Add again attentions if they are there + return outputs + + +class XLNetPretrainedModel(PretrainedModel): + """ + An abstract class for pretrained XLNet models. It provides XLNet related + `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, + `pretrained_init_configuration`, `base_model_prefix` for downloading + and loading pretrained models. + See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. + """ + + pretrained_init_configuration = XLNET_PRETRAINED_INIT_CONFIGURATION + resource_files_names = {"model_state": "model_state.pdparams"} + pretrained_resource_files_map = XLNET_PRETRAINED_RESOURCE_FILES_MAP + model_config_file = CONFIG_NAME + config_class = XLNetConfig + base_model_prefix = "transformer" + + def _init_weights(self, layer): + # Initialize the weights. 
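+        # Initialization scheme implemented below: Linear/Embedding weights are drawn from
+        # N(0, initializer_range); Linear biases are zeroed; LayerNorm is reset to weight=1,
+        # bias=0; the relative-attention parameters (q, k, v, o, r, the three relative
+        # biases, seg_embed) and the model-level mask_emb are sampled from the same normal
+        # distribution.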
+ if isinstance(layer, (nn.Linear, nn.Embedding)): + if isinstance(layer.weight, paddle.Tensor): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.transformer.config["initializer_range"], + shape=layer.weight.shape, + ) + ) + if isinstance(layer, nn.Linear) and layer.bias is not None: + layer.bias.set_value(paddle.zeros_like(layer.bias)) + elif isinstance(layer, nn.LayerNorm): + layer.bias.set_value(paddle.zeros_like(layer.bias)) + layer.weight.set_value(paddle.full_like(layer.weight, 1.0)) + elif isinstance(layer, XLNetRelativeAttention): + for param in [ + layer.q, + layer.k, + layer.v, + layer.o, + layer.r, + layer.r_r_bias, + layer.r_s_bias, + layer.r_w_bias, + layer.seg_embed, + ]: + param.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.transformer.config["initializer_range"], + shape=param.shape, + ) + ) + elif isinstance(layer, XLNetModel): + layer.mask_emb.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.initializer_range + if hasattr(self, "initializer_range") + else self.transformer.config["initializer_range"], + shape=layer.mask_emb.shape, + ) + ) + + +@dataclass +class XLNetModelOutput(ModelOutput): + """ + Output type of [`XLNetModel`]. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_predict, hidden_size)`): + Sequence of hidden-states at the last layer of the model. + + `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` + corresponds to `sequence_length`. + mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XLNetLMHeadModelOutput(ModelOutput): + """ + Output type of [`XLNetLMHeadModel`]. + + Args: + loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided) + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, num_predict, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + + `num_predict` corresponds to `target_mapping.shape[1]`. If `target_mapping` is `None`, then `num_predict` + corresponds to `sequence_length`. 
+ mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XLNetForSequenceClassificationOutput(ModelOutput): + """ + Output type of [`XLNetForSequenceClassification`]. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XLNetForTokenClassificationOutput(ModelOutput): + """ + Output type of [`XLNetForTokenClassificationOutput`]. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. 
Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XLNetForMultipleChoiceOutput(ModelOutput): + """ + Output type of [`XLNetForMultipleChoice`]. + + Args: + loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided): + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): + """ + Output type of [`XLNetForQuestionAnsweringSimple`]. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). 
+ mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XLNetForQuestionAnsweringOutput(ModelOutput): + """ + Output type of [`XLNetForQuestionAnswering`]. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): + Classification loss as the sum of start token, end token (and is_impossible if provided) classification + losses. + start_top_log_probs (`paddle.Tensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + start_top_index (`paddle.Tensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top config.start_n_top start token possibilities (beam-search). + end_top_log_probs (`paddle.Tensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities + (beam-search). + end_top_index (`paddle.Tensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). + cls_logits (`paddle.Tensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the `is_impossible` label of the answers. + mems (`List[paddle.Tensor]` of length `config.n_layers`): + Contains pre-computed hidden-states. Can be used (see `mems` input) to speed up sequential decoding. The + token ids which have their past given to this model should not be passed as `input_ids` as they have + already been computed. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + start_top_log_probs: Optional[paddle.Tensor] = None + start_top_index: Optional[paddle.Tensor] = None + end_top_log_probs: Optional[paddle.Tensor] = None + end_top_index: Optional[paddle.Tensor] = None + cls_logits: Optional[paddle.Tensor] = None + mems: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@register_base_model +class XLNetModel(XLNetPretrainedModel): + """ + The bare XLNet Model outputting raw hidden-states. + + This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. + Refer to the superclass documentation for the generic methods. + + This model is also a `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer + and refer to the Paddle documentation for all matter related to general usage and behavior. + + Args: + config (:class:`XLNetConfig`): + An instance of :class:`XLNetConfig`. + + .. note:: + A normal_initializer initializes weight matrices as normal distributions. + See :meth:`XLNetPretrainedModel._init_weights()` for how weights are initialized in `XLNetModel`. + """ + + def __init__(self, config: XLNetConfig): + super(XLNetModel, self).__init__(config) + self.initializer_range = config.initializer_range + self.mem_len = config.mem_len + self.reuse_len = config.reuse_len + self.d_model = config.d_model + self.same_length = config.same_length + self.attn_type = config.attn_type + self.bi_data = config.bi_data + self.clamp_len = config.clamp_len + self.n_layer = config.n_layer + self.dropout = nn.Dropout(config.dropout) + self.word_embedding = nn.Embedding(config.vocab_size, config.d_model) + self.mask_emb = self.create_parameter([1, 1, config.d_model]) + self.layer = nn.LayerList([XLNetLayer(config) for _ in range(config.n_layer)]) + + def get_input_embeddings(self): + return self.word_embedding + + def set_input_embeddings(self, new_embeddings): + self.word_embedding = new_embeddings + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError + + def create_mask(self, qlen, mlen): + # Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked. + attn_mask = paddle.ones([qlen, qlen]) + mask_up = paddle.triu(attn_mask, diagonal=1) + attn_mask_pad = paddle.zeros([qlen, mlen]) + ret = paddle.concat([attn_mask_pad, mask_up], axis=1) + if self.same_length: + mask_lo = paddle.tril(attn_mask, diagonal=-1) + ret = paddle.concat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], axis=1) + + return ret + + def cache_mem(self, curr_out, prev_mem): + # Cache hidden states into memory. 
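+        # `reuse_len` optionally truncates the current output before caching, and `mem_len`
+        # bounds how much history survives. Illustrative example (hypothetical sizes): with
+        # mem_len=3, a cached memory of 3 steps concatenated with 2 new steps keeps only the
+        # last 3, i.e. new_mem = concat([prev_mem, curr_out])[-3:]. With mem_len unset or 0,
+        # nothing is trimmed and the full history is returned. The result is detached so the
+        # cache never receives gradients.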
+ if self.reuse_len is not None and self.reuse_len > 0: + curr_out = curr_out[: self.reuse_len] + + if self.mem_len is None or self.mem_len == 0: + # If `use_mems` is active but no `mem_len` is defined, the model behaves like GPT-2 at inference time + # and returns all of the past and current hidden states. + cutoff = 0 + else: + # If :obj:`use_mems` is active and `mem_len` is defined, the model returns the last `mem_len` hidden + # states. This is the preferred setting for training and long-form generation. + cutoff = -self.mem_len + if prev_mem is None: + # If :obj:`use_mems` is active and `mem_len` is defined, the model + new_mem = curr_out[cutoff:] + else: + new_mem = paddle.concat([prev_mem, curr_out], axis=0)[cutoff:] + + return new_mem.detach() + + @staticmethod + def positional_embedding(pos_seq, inv_freq, bsz=None): + # Compute sinusoid_inp = einsum4x4("i,d->id", pos_seq, inv_freq) + sinusoid_inp = paddle.einsum("i,d->id", pos_seq, inv_freq) + pos_emb = paddle.concat([paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1) + pos_emb = paddle.unsqueeze(pos_emb, axis=1) + if bsz is not None: + pos_emb = pos_emb.expand([-1, bsz, -1]) + pos_emb.stop_gradient = True + pos_emb.stop_gradient = True + return pos_emb + + def relative_positional_encoding(self, qlen, klen, bsz=None): + # Create relative positional encoding. + freq_seq = paddle.arange(0, self.d_model, 2.0, dtype=dtype_float) + inv_freq = 1 / 10000 ** (freq_seq / self.d_model) + + if self.attn_type == "bi": + beg, end = klen, -qlen + elif self.attn_type == "uni": + beg, end = klen, -1 + else: + raise ValueError("Unknown `attn_type` {}.".format(self.attn_type)) + + if self.bi_data: + fwd_pos_seq = paddle.arange(beg, end, -1.0, dtype=dtype_float) + bwd_pos_seq = paddle.arange(-beg, -end, 1.0, dtype=dtype_float) + + if self.clamp_len > 0: + fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) + bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) + + if bsz is not None: + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz // 2) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz // 2) + else: + fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) + bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) + pos_emb = paddle.concat([fwd_pos_emb, bwd_pos_emb], axis=1) + else: + fwd_pos_seq = paddle.arange(beg, end, -1.0, dtype=dtype_float) + if self.clamp_len > 0: + fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len) + pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz) + return pos_emb + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_mems_train=False, + use_mems_eval=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The XLNetModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + token_type_ids (Tensor, optional): + Segment token indices to indicate first and second portions of the inputs. + Indices can be either 0 or 1: + + - 0 corresponds to a **sentence A** token, + - 1 corresponds to a **sentence B** token. 
+ + It's data type should be `int64` and has a shape of [batch_size, sequence_length]. + Defaults to None, which means no segment embeddings is added to token embeddings. + attention_mask (Tensor, optional): + Mask to indicate whether to perform attention on each input token or not. + The values should be either 0 or 1. The attention scores will be set + to **-infinity** for any positions in the mask that are **0**, and will be + **unchanged** for positions that are **1**. + + - **1** for tokens that are **not masked**, + - **0** for tokens that are **masked**. + + It's data type should be `float32` and has a shape of [batch_size, sequence_length]. + Defaults to `None`. + mems (List[Tensor], optional): + A list of length `n_layers` with each Tensor being a pre-computed hidden-state for each layer. + Each Tensor has a dtype `float32` and a shape of [batch_size, sequence_length, hidden_size]. + Defaults to None, and we don't use mems. + + .. note:: + `use_mems` has to be set to `True` in order to make use of `mems`. + perm_mask (Tensor, optional): + Mask to indicate the permutation pattern of the input sequence with values being either 0 or 1. + + - if ``perm_mask[k, i, j] = 0``, i **attend** to j in batch k; + - if ``perm_mask[k, i, j] = 1``, i **does not attend** to j in batch k. + + Only used during pretraining (to define factorization order) or + for sequential decoding (generation). It's data type should be `float32` and + has a shape of [batch_size, sequence_length, sequence_length]. + Defaults to `None`, then each token attends to all the other tokens (full bidirectional attention). + target_mapping (Tensor, optional): + Mask to indicate the output tokens to use with values being either 0 or 1. + If ``target_mapping[k, i, j] = 1``, the i-th predict in batch k is on the j-th token. + It's data type should be `float32` and has a shape of [batch_size, num_predict, sequence_length]. + Only used during pretraining for partial prediction or for sequential decoding (generation). + Defaults to `None`. + input_mask (Tensor, optional): + Mask to avoid performing attention on padding token with values being either 0 or 1. + It's data type should be `float32` and it has a shape of [batch_size, sequence_length]. + This mask is negative of `attention_mask`: + + - 1 for tokens that are **masked**, + - 0 for tokens that are **not masked**. + + You should use only one of `input_mask` and `attention_mask`. Defaults to `None`. + head_mask (Tensor, optional): + Mask to nullify selected heads of the self-attention layers with values being either 0 or 1. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + It's data type should be `float32` and has a shape of [num_heads] or [num_layers, num_heads]. + Defaults to `None`, which means we keep all heads. + inputs_embeds (Tensor, optional): + An embedded representation tensor which is an alternative of `input_ids`. + You should specify only either one of them to avoid contradiction. + It's data type should be `float32` and has a shape of [batch_size, sequence_length, hidden_size]. + Defaults to `None`, which means we only specify `input_ids`. + use_mems_train (bool, optional): + Whether or not to use recurrent memory mechanism during training. + Defaults to `False` and we don't use recurrent memory mechanism in training mode. + use_mems_eval (bool, optional): + Whether or not to use recurrent memory mechanism during evaluation. + Defaults to `False` and we don't use recurrent memory mechanism in evaluation mode. 
+ return_dict (bool, optional): + Whether or not to return additional information other than the output tensor. + If True, then returns information about `output`, `new_mems`, `hidden_states` and `attentions` + which will also be formatted as a dict. Else only returns the output tensor. + Defaults to False. + + Returns: + Tensor or dict: Returns tensor `output` or a dict with key-value pairs: + {"last_hidden_state": `output`, "mems": `mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. + + With the corresponding fields: + + - `output` (Tensor): + Output of the final layer of the model. + It's a Tensor of dtype `float32` and has a shape of [batch_size, num_predict, hidden_size]. + + .. note:: + `num_predict` corresponds to `target_mapping.shape[1]`. + If `target_mapping` is `None`, then `num_predict` equals to `sequence_length`. + - `mems` (List[Tensor]): + A list of pre-computed hidden-states. The length of the list is `n_layers`. + Each element in the list is a Tensor with dtype `float32` and has a shape of + [batch_size, sequence_length, hidden_size]. + - `hidden_states` (List[Tensor], optional): + A list of Tensor containing hidden-states of the model at the output of each layer + plus the initial embedding outputs. Each Tensor has a data type of `float32` and + has a shape of [batch_size, sequence_length, hidden_size]. + Being returned when `output_hidden_states` is set to `True`. + - `attentions` (List[Tensor], optional): + A list of Tensor containing attentions weights of each hidden layer. + Each Tensor (one for each layer) has a data type of `float32` and + has a shape of [batch_size, num_heads, sequence_length, sequence_length]. + Being returned when `output_attentions` is set to `True`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.xlnet.modeling import XLNetModel + from paddlenlp.transformers.xlnet.tokenizer import XLNetTokenizer + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetModel.from_pretrained('xlnet-base-cased') + + inputs = tokenizer("Hey, Paddle-paddle is awesome !") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + last_hidden_states = outputs[0] + """ + + if self.training: + use_mems = use_mems_train + else: + use_mems = use_mems_eval + + # The original code for XLNet uses shapes [len, bsz] with the batch dimension at the end + # but we want a unified interface in the library with the batch size on the first dimension + # so we move here the first dimension (batch) to the end + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_ids = paddle.transpose(input_ids, perm=[1, 0]) + qlen, bsz = input_ids.shape[0], input_ids.shape[1] + elif inputs_embeds is not None: + inputs_embeds = paddle.transpose(inputs_embeds, perm=[1, 0]) + qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + token_type_ids = token_type_ids.transpose([1, 0]) if token_type_ids is not None else None + input_mask = input_mask.transpose([1, 0]) if input_mask is not None else None + attention_mask = attention_mask.transpose([1, 0]) if attention_mask is not None else None + perm_mask = perm_mask.transpose([1, 2, 0]) if perm_mask is not None else None + target_mapping = target_mapping.transpose([1, 2, 0]) if target_mapping is not None else None + + mlen 
= mems[0].shape[0] if mems is not None and mems[0] is not None else 0 + klen = mlen + qlen + + # Attention mask + # Causal attention mask + if self.attn_type == "uni": + attn_mask = self.create_mask(qlen, mlen) + attn_mask = paddle.unsqueeze(attn_mask, axis=[2, 3]) + elif self.attn_type == "bi": + attn_mask = None + else: + raise ValueError("Unsupported attention type: {}".format(self.attn_type)) + + # Data mask: input mask & perm mask + assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " + "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one." + if input_mask is None and attention_mask is not None: + input_mask = 1.0 - attention_mask + if input_mask is not None and perm_mask is not None: + data_mask = paddle.unsqueeze(input_mask, axis=0) + perm_mask + elif input_mask is not None and perm_mask is None: + data_mask = paddle.unsqueeze(input_mask, axis=0) + elif input_mask is None and perm_mask is not None: + data_mask = perm_mask + else: + data_mask = None + + if data_mask is not None: + # All mems can be attended to + if mlen > 0: + mems_mask = paddle.cast(paddle.zeros([data_mask.shape[0], mlen, bsz]), dtype=dtype_float) + data_mask = paddle.concat([mems_mask, data_mask], axis=1) + if attn_mask is None: + attn_mask = paddle.unsqueeze(data_mask, axis=-1) + else: + attn_mask += paddle.unsqueeze(data_mask, axis=-1) + + if attn_mask is not None: + attn_mask = paddle.cast((attn_mask > 0), dtype=dtype_float) + + if attn_mask is not None: + fill_val = paddle.ones(qlen) + non_tgt_mask = paddle.cast(-paddle.diag(fill_val), dtype=dtype_float) + if mlen > 0: + non_tgt_mask = paddle.concat( + [paddle.cast(paddle.zeros([qlen, mlen]), dtype=dtype_float), non_tgt_mask], axis=-1 + ) + non_tgt_mask = paddle.cast( + ((attn_mask + paddle.unsqueeze(non_tgt_mask, axis=[2, 3])) > 0), dtype=dtype_float + ) + else: + non_tgt_mask = None + + # Word embeddings and prepare h & g hidden states + if inputs_embeds is not None: + word_emb_k = inputs_embeds + else: + word_emb_k = self.word_embedding(input_ids) + + output_h = self.dropout(word_emb_k) + if target_mapping is not None: + word_emb_q = self.mask_emb.expand([target_mapping.shape[0], bsz, -1]) + output_g = self.dropout(word_emb_q) + else: + output_g = None + + # Segment embedding + if token_type_ids is not None: + # Convert `token_type_ids` to one-hot `seg_mat` + if mlen > 0: + mem_pad = paddle.zeros(shape=[mlen, bsz], dtype="int64") + cat_ids = paddle.concat(x=[mem_pad, token_type_ids], axis=0) + else: + cat_ids = token_type_ids + + # `1` indicates not in the same segment [qlen x klen x bsz] + seg_mat = paddle.cast( + paddle.unsqueeze(token_type_ids, axis=1) != paddle.unsqueeze(cat_ids, axis=0), dtype="int64" + ) + seg_mat = paddle.cast(F.one_hot(seg_mat, num_classes=2), dtype=dtype_float) + else: + seg_mat = None + + # Positional encoding + pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz) + pos_emb = self.dropout(pos_emb) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # Attention_probs has shape bsz x n_heads x N x N + # Input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # And head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) + head_mask = head_mask.expand([self.n_layer, -1, -1, -1, -1]) + elif 
head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) + else: + head_mask = [None] * self.n_layer + + new_mems = () + if mems is None: + mems = [None] * len(self.layer) + + attentions = [] if output_attentions else None + hidden_states = [] if output_hidden_states else None + + for i, layer_module in enumerate(self.layer): + if use_mems: + # Cache new mems + new_mems = new_mems + (self.cache_mem(output_h, mems[i]),) + if output_hidden_states: + hidden_states.append((output_h, output_g) if output_g is not None else output_h) + + outputs = layer_module( + output_h, + output_g, + attn_mask_h=non_tgt_mask, + attn_mask_g=attn_mask, + r=pos_emb, + seg_mat=seg_mat, + mems=mems[i], + target_mapping=target_mapping, + head_mask=head_mask[i], + output_attentions=output_attentions, + ) + output_h, output_g = outputs[:2] + + if output_attentions: + attentions.append(outputs[2]) + + # Add last hidden state + if output_hidden_states: + hidden_states.append((output_h, output_g) if output_g is not None else output_h) + + output = self.dropout(output_g if output_g is not None else output_h) + + # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method) + output = paddle.transpose(output, perm=[1, 0, 2]) + + if not use_mems: + new_mems = None + + if output_hidden_states: + if output_g is not None: + hidden_states = tuple(paddle.transpose(h, perm=[1, 0, 2]) for hs in hidden_states for h in hs) + else: + hidden_states = tuple(paddle.transpose(hs, perm=[1, 0, 2]) for hs in hidden_states) + + if output_attentions: + if target_mapping is not None: + # When target_mapping is provided, there are 2-tuple of attentions + attentions = tuple( + tuple(paddle.transpose(att_stream, perm=[2, 3, 0, 1]) for att_stream in t) for t in attentions + ) + else: + attentions = tuple(paddle.transpose(t, perm=[2, 3, 0, 1]) for t in attentions) + + if not return_dict: + return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None) + + return XLNetModelOutput( + last_hidden_state=output, mems=new_mems, hidden_states=hidden_states, attentions=attentions + ) + + +class XLNetClassificationHead(Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config: XLNetConfig): + super(XLNetClassificationHead, self).__init__() + self.dense = nn.Linear(config.d_model, config.d_model) + self.dropout = nn.Dropout(config.classfier_dropout) + self.out_proj = nn.Linear(config.d_model, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, -1, :] # Take token + x = self.dropout(x) + x = self.dense(x) + x = get_activation("tanh")(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class XLNetForSequenceClassification(XLNetPretrainedModel): + """ + XLNet Model with a linear layer on top of the output layer, + designed for sequence classification/regression tasks like GLUE tasks. + + Args: + config (:class:`XLNetConfig`): + An instance of :class:`XLNetConfig`. 
+ """ + + def __init__(self, config: XLNetConfig): + super(XLNetForSequenceClassification, self).__init__(config) + self.num_classes = config.num_classes + self.transformer = XLNetModel(config) + self.classifier = XLNetClassificationHead(config) + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_mems_train=False, + use_mems_eval=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + problem_type: str = "single_label_classification", + ): + r""" + The XLNetForSequenceClassification forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`XLNetModel`. + token_type_ids (Tensor, optional): + See :class:`XLNetModel`. + attention_mask (Tensor, optional): + See :class:`XLNetModel`. + mems (Tensor, optional): + See :class:`XLNetModel`. + perm_mask (Tensor, optional): + See :class:`XLNetModel`. + target_mapping (Tensor, optional): + See :class:`XLNetModel`. + input_mask (Tensor, optional): + See :class:`XLNetModel`. + head_mask (Tensor, optional): + See :class:`XLNetModel`. + inputs_embeds (Tensor, optional): + See :class:`XLNetModel`. + use_mems_train (bool, optional): + See :class:`XLNetModel`. + use_mems_eval (bool, optional): + See :class:`XLNetModel`. + return_dict (bool, optional): + See :class:`XLNetModel`. + + Returns: + Tensor or dict: Returns tensor `logits` or a dict with key-value pairs: + {"logits": `logits`, "mems": `mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. + + With the corresponding fields: + + - `logits` (Tensor): + Classification scores before SoftMax (also called logits). It's data type should be `float32` + and has a shape of [batch_size, num_classes]. + - `mems` (List[Tensor]): + See :class:`XLNetModel`. + - `hidden_states` (List[Tensor], optional): + See :class:`XLNetModel`. + - `attentions` (List[Tensor], optional): + See :class:`XLNetModel`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.xlnet.modeling import XLNetForSequenceClassification + from paddlenlp.transformers.xlnet.tokenizer import XLNetTokenizer + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased') + + inputs = tokenizer("Hey, Paddle-paddle is awesome !") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + + transformer_outputs = self.transformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_mems_train=use_mems_train, + use_mems_eval=use_mems_eval, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + output = transformer_outputs[0] + + logits = self.classifier(output) + + loss = None + if labels is not None: + + if problem_type == "regression": + loss_fct = MSELoss() + if self.num_classes == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape(shape=[-1, self.num_classes]), labels.reshape(shape=[-1])) + elif problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return tuple_output(output, loss) + + return XLNetForSequenceClassificationOutput( + loss=loss, + logits=logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class XLNetForTokenClassification(XLNetPretrainedModel): + """ + XLNet Model with a linear layer on top of the hidden-states output layer, + designed for token classification tasks like NER tasks. + + Args: + config (:class:`XLNetConfig`): + An instance of :class:`XLNetConfig`. + """ + + def __init__(self, config: XLNetConfig): + super(XLNetForTokenClassification, self).__init__(config) + self.num_classes = config.num_classes + + self.transformer = XLNetModel(config) + self.classifier = nn.Linear(self.transformer.d_model, config.num_classes) + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_mems_train=False, + use_mems_eval=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The XLNetForTokenClassification forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`XLNetModel`. + token_type_ids (Tensor, optional): + See :class:`XLNetModel`. + attention_mask (Tensor, optional): + See :class:`XLNetModel`. + mems (Tensor, optional): + See :class:`XLNetModel`. + perm_mask (Tensor, optional): + See :class:`XLNetModel`. + target_mapping (Tensor, optional): + See :class:`XLNetModel`. + input_mask (Tensor, optional): + See :class:`XLNetModel`. + head_mask (Tensor, optional): + See :class:`XLNetModel`. + inputs_embeds (Tensor, optional): + See :class:`XLNetModel`. + use_mems_train (bool, optional): + See :class:`XLNetModel`. + use_mems_eval (bool, optional): + See :class:`XLNetModel`. 
+ return_dict (bool, optional): + See :class:`XLNetModel`. + + Returns: + Tensor or dict: Returns tensor `logits` or a dict with key-value pairs: + {"logits": `logits`, "mems": `mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. + + With the corresponding fields: + + - `logits` (Tensor): + Classification scores before SoftMax (also called logits). It's data type should be `float32` + and has a shape of [batch_size, sequence_length, num_classes]. + - `mems` (List[Tensor]): + See :class:`XLNetModel`. + - `hidden_states` (List[Tensor], optional): + See :class:`XLNetModel`. + - `attentions` (List[Tensor], optional): + See :class:`XLNetModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.xlnet.modeling import XLNetForTokenClassification + from paddlenlp.transformers.xlnet.tokenizer import XLNetTokenizer + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetForTokenClassification.from_pretrained('xlnet-base-cased') + + inputs = tokenizer("Hey, Paddle-paddle is awesome !") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + + logits = outputs[0] + """ + outputs = self.transformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_mems_train=use_mems_train, + use_mems_eval=use_mems_eval, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape(shape=[-1, self.num_classes]), labels.reshape(shape=[-1])) + + if not return_dict: + output = (logits,) + outputs[1:] + return tuple_output(output, loss) + + return XLNetForTokenClassificationOutput( + loss=loss, + logits=logits, + mems=outputs.mems, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class XLNetLMHeadModel(XLNetPretrainedModel): + """ + XLNet Model with a language modeling head on top (linear layer with weights tied to the input embeddings). + + Args: + config (:class:`XLNetConfig`): + An instance of :class:`XLNetConfig`. + """ + + def __init__(self, config: XLNetConfig): + super(XLNetLMHeadModel, self).__init__(config) + self.transformer = XLNetModel(config) + self.decoder_weight = self.transformer.word_embedding.weight + self.decoder_bias = self.create_parameter( + shape=[config.vocab_size], dtype=self.decoder_weight.dtype, is_bias=True + ) + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_mems_train=False, + use_mems_eval=False, + output_attentions=False, + output_hidden_states=False, + return_dict=False, + ): + r""" + The XLNetLMHeadModel forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`XLNetModel`. + token_type_ids (Tensor, optional): + See :class:`XLNetModel`. + attention_mask (Tensor, optional): + See :class:`XLNetModel`. + mems (Tensor, optional): + See :class:`XLNetModel`. + perm_mask (Tensor, optional): + See :class:`XLNetModel`. + target_mapping (Tensor, optional): + See :class:`XLNetModel`. 
+ input_mask (Tensor, optional): + See :class:`XLNetModel`. + head_mask (Tensor, optional): + See :class:`XLNetModel`. + inputs_embeds (Tensor, optional): + See :class:`XLNetModel`. + use_mems_train (bool, optional): + See :class:`XLNetModel`. + use_mems_eval (bool, optional): + See :class:`XLNetModel`. + return_dict (bool, optional): + See :class:`XLNetModel`. + + Returns: + Tensor or dict: Returns tensor `logits` or a dict with key-value pairs: + {"logits": `logits`, "mems": `mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`}. + + With the corresponding fields: + + - `logits` (Tensor): + Classification scores before SoftMax (also called logits). It's data type should be `float32` + and has a shape of [batch_size, sequence_length, num_classes]. + - `mems` (List[Tensor]): + See :class:`XLNetModel`. + - `hidden_states` (List[Tensor], optional): + See :class:`XLNetModel`. + - `attentions` (List[Tensor], optional): + See :class:`XLNetModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers.xlnet.modeling import XLNetLMHeadModel + from paddlenlp.transformers.xlnet.tokenizer import XLNetTokenizer + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased') + + inputs = tokenizer("Hey, Paddle-paddle is awesome !") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + logits = outputs + """ + transformer_outputs = self.transformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_mems_train=use_mems_train, + use_mems_eval=use_mems_eval, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = paddle.matmul(transformer_outputs[0], self.decoder_weight, transpose_y=True) + self.decoder_bias + loss = None + if labels is not None: + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.reshape(shape=[-1, logits.shape[-1]]), labels.reshape(shape=[-1])) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return tuple_output(output, loss) + + return XLNetLMHeadModelOutput( + loss=loss, + logits=logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class XLNetForMultipleChoice(XLNetPretrainedModel): + """ + XLNet Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RACE/SWAG tasks. + + Args: + config (:class:`XLNetConfig`): + An instance of :class:`XLNetConfig`. + """ + + def __init__(self, config: XLNetConfig): + super(XLNetForMultipleChoice, self).__init__(config) + self.transformer = XLNetModel(config) + self.classifier = XLNetClassificationHead(config) + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_mems_train=False, + use_mems_eval=False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict=False, + ): + r""" + The XLNetForMultipleChoice forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`XLNetModel`. 
+ token_type_ids (Tensor, optional): + See :class:`XLNetModel`. + attention_mask (Tensor, optional): + See :class:`XLNetModel`. + mems (Tensor, optional): + See :class:`XLNetModel`. + perm_mask (Tensor, optional): + See :class:`XLNetModel`. + target_mapping (Tensor, optional): + See :class:`XLNetModel`. + input_mask (Tensor, optional): + See :class:`XLNetModel`. + head_mask (Tensor, optional): + See :class:`XLNetModel`. + inputs_embeds (Tensor, optional): + See :class:`XLNetModel`. + use_mems_train (bool, optional): + See :class:`XLNetModel`. + use_mems_eval (bool, optional): + See :class:`XLNetModel`. + return_dict (bool, optional): + See :class:`XLNetModel`. + + Returns: + tensor or dict: Returns tensor `logtis` or a dict with key-value pairs: + {"logits": `logits`, "mems": `mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`} + + With the corresponding fields: + - `logits` (Tensor): + Classification scores before SoftMax (also called logits). It's data type should be `float32` + and has a shape of [batch_size, sequence_length, num_classes]. + - `mems` (List[Tensor]): + See :class:`XLNetModel`. + - `hidden_states` (List[Tensor], optional): + See :class:`XLNetModel`. + - `attentions` (List[Tensor], optional): + See :class:`XLNetModel`. + + Example: + .. code-block:: + + import paddle + from paddlenlp.transformers import XLNetForMultipleChoice, XLNetTokenizer + from paddlenlp.data import Pad, Dict + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased') + data = [ + { + "question": "how do you turn on an ipad screen?", + "answer1": "press the volume button.", + "answer2": "press the lock button.", + "label": 1, + }, + { + "question": "how do you indent something?", + "answer1": "leave a space before starting the writing", + "answer2": "press the spacebar", + "label": 0, + }, + ] + text = [] + text_pair = [] + for d in data: + text.append(d["question"]) + text_pair.append(d["answer1"]) + text.append(d["question"]) + text_pair.append(d["answer2"]) + inputs = tokenizer(text, text_pair) + batchify_fn = lambda samples, fn=Dict( + { + "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), # input_ids + "token_type_ids": Pad( + axis=0, pad_val=tokenizer.pad_token_type_id + ), # token_type_ids + } + ): fn(samples) + inputs = batchify_fn(inputs) + reshaped_logits = model( + input_ids=paddle.to_tensor(inputs[0], dtype="int64"), + token_type_ids=paddle.to_tensor(inputs[1], dtype="int64"), + ) + print(reshaped_logits.shape) + # [2, 2] + """ + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + input_ids = input_ids.reshape(shape=(-1, input_ids.shape[-1])) # flat_input_ids: [bs*num_choice,seq_l] + + if attention_mask is not None: + attention_mask = attention_mask.reshape(shape=(-1, attention_mask.shape[-1])) + + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape(shape=(-1, token_type_ids.shape[-1])) + + if inputs_embeds is not None: + inputs_embeds = inputs_embeds.reshape(shape=(inputs_embeds.shape[0], -1, inputs_embeds.shape[-1])) + + transformer_outputs = self.transformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + input_mask=input_mask, + head_mask=head_mask, + use_mems_train=use_mems_train, + use_mems_eval=use_mems_eval, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + 
return_dict=return_dict, + ) + + output = transformer_outputs[0] + logits = self.classifier(output) + reshaped_logits = logits.reshape([-1, num_choices]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels.reshape(shape=[-1])) + + if not return_dict: + output = (logits,) + transformer_outputs[1:] + return tuple_output(output, loss) + + return XLNetForMultipleChoiceOutput( + loss=loss, + logits=reshaped_logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +class XLNetForQuestionAnswering(XLNetPretrainedModel): + """ + XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + + Args: + config (:class:`XLNetConfig`): + An instance of :class:`XLNetConfig`. + """ + + def __init__(self, config: XLNetConfig): + super(XLNetForQuestionAnswering, self).__init__(config) + self.transformer = XLNetModel(config) + self.qa_outputs = nn.Linear(config.d_model, 2) + + def forward( + self, + input_ids, + token_type_ids=None, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + start_positions=None, + end_positions=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_mems_train=False, + use_mems_eval=False, + return_dict=False, + ): + r""" + The XLNetForQuestionAnswering forward method, overrides the `__call__()` special method. + + Args: + input_ids (Tensor): + See :class:`XLNetModel`. + token_type_ids (Tensor, optional): + See :class:`XLNetModel`. + attention_mask (Tensor, optional): + See :class:`XLNetModel`. + mems (Tensor, optional): + See :class:`XLNetModel`. + perm_mask (Tensor, optional): + See :class:`XLNetModel`. + target_mapping (Tensor, optional): + See :class:`XLNetModel`. + input_mask (Tensor, optional): + See :class:`XLNetModel`. + head_mask (Tensor, optional): + See :class:`XLNetModel`. + inputs_embeds (Tensor, optional): + See :class:`XLNetModel`. + use_mems_train (bool, optional): + See :class:`XLNetModel`. + use_mems_eval (bool, optional): + See :class:`XLNetModel`. + return_dict (bool, optional): + See :class:`XLNetModel`. + + Returns: + tuple or dict: Returns tensor (`start_logits`, `end_logits`) or a dict with key-value pairs: + {"start_logits": `start_logits`, "end_logits": `end_logits`, "mems": `mems`, + "hidden_states": `hidden_states`, "attentions": `attentions`} + + With the corresponding fields: + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + - `mems` (List[Tensor]): + See :class:`XLNetModel`. + - `hidden_states` (List[Tensor], optional): + See :class:`XLNetModel`. + - `attentions` (List[Tensor], optional): + See :class:`XLNetModel`. + + Example: + .. 
code-block:: + + import paddle + from paddlenlp.transformers.xlnet.modeling import XLNetForQuestionAnswering + from paddlenlp.transformers.xlnet.tokenizer import XLNetTokenizer + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') + + inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!") + inputs = {k:paddle.to_tensor([v]) for (k, v) in inputs.items()} + outputs = model(**inputs) + start_logits = outputs[0] + end_logits = outputs[1] + """ + transformer_outputs = self.transformer( + input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + input_mask=input_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_mems_train=use_mems_train, + use_mems_eval=use_mems_eval, + return_dict=return_dict, + ) + output = transformer_outputs[0] + + logits = self.qa_outputs(output) + logits = paddle.transpose(logits, perm=[2, 0, 1]) + start_logits, end_logits = paddle.unstack(x=logits, axis=0) + + loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if start_positions.ndim > 1: + start_positions = start_positions.squeeze(-1) + if start_positions.ndim > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = start_positions.clip(0, ignored_index) + end_positions = end_positions.clip(0, ignored_index) + + loss_fct = paddle.nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + transformer_outputs[1:] + # the length of output must be larger than 1 + return tuple_output(output, loss) + + return XLNetForQuestionAnsweringSimpleOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + mems=transformer_outputs.mems, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +XLNetForCausalLM = XLNetLMHeadModel diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/tokenizer.py new file mode 100644 index 000000000..196537a28 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/xlnet/tokenizer.py @@ -0,0 +1,366 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for XLNet model.""" + +import os +import unicodedata +from shutil import copyfile + +import sentencepiece as spm + +from .. 
+ import AddedToken, PretrainedTokenizer + +__all__ = ["XLNetTokenizer"] + +SENTENCEPIECE_UNDERLINE = "▁" +SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE # Kept for backward compatibility + +# Segments (not really needed) +SEG_ID_A = 0 +SEG_ID_B = 1 +SEG_ID_CLS = 2 +SEG_ID_SEP = 3 +SEG_ID_PAD = 4 + + +class XLNetTokenizer(PretrainedTokenizer): + """ + Constructs an XLNet tokenizer based on `SentencePiece <https://github.com/google/sentencepiece>`__. + + This tokenizer inherits from :class:`~paddlenlp.transformers.tokenizer_utils.PretrainedTokenizer` + which contains most of the main methods. For more information regarding those methods, + please refer to this superclass. + + Args: + vocab_file (str): + The vocabulary file (ends with '.spm') required to instantiate + a `SentencePiece <https://github.com/google/sentencepiece>`__ tokenizer. + do_lower_case (bool, optional): + Whether or not to lowercase the input when tokenizing. Defaults to `False` and + **does not** lowercase the input. + remove_space (bool, optional): + Whether or not to strip the text when tokenizing. Defaults to `True` and + removes excess spaces before and after the string. + keep_accents (bool, optional): + Whether or not to keep accents when tokenizing. Defaults to `False` and **does not** keep accents. + bos_token (str, optional): + A special token representing the beginning of a sequence that was used during pretraining. + Defaults to `"<s>"`. + eos_token (str, optional): + A special token representing the end of a sequence that was used during pretraining. + Defaults to `"</s>"`. + unk_token (str, optional): + A special token representing the *unknown (out-of-vocabulary)* token. + A token that is not in the vocabulary is set to `unk_token` in order to be converted to an ID. + Defaults to `"<unk>"`. + sep_token (str, optional): + A special token separating two different sentences in the same input. + Defaults to `"<sep>"`. + pad_token (str, optional): + A special token used to make arrays of tokens the same size for batching purposes. + Defaults to `"<pad>"`. + cls_token (str, optional): + A special token used for sequence classification. It is the last token + of the sequence when built with special tokens. Defaults to `"<cls>"`. + mask_token (str, optional): + A special token representing a masked token. This is the token used + in the masked language modeling task, for which the model tries to predict the original unmasked token. + Defaults to `"<mask>"`. + additional_special_tokens (List[str], optional): + A list of additional special tokens to be used by the tokenizer. + Defaults to `["<eop>", "<eod>"]`. + + Attributes: + sp_model (SentencePieceProcessor): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+ """ + + resource_files_names = {"vocab_file": "spiece.model"} + pretrained_resource_files_map = { + "vocab_file": { + "xlnet-base-cased": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-base-cased-spiece.model", + "xlnet-large-cased": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/xlnet-large-cased-spiece.model", + "chinese-xlnet-base": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/chinese-xlnet-base-spiece.model", + "chinese-xlnet-mid": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/chinese-xlnet-mid-spiece.model", + "chinese-xlnet-large": "https://bj.bcebos.com/paddlenlp/models/transformers/xlnet/chinese-xlnet-large-spiece.model", + } + } + pretrained_init_configuration = { + "xlnet-base-cased": {"do_lower_case": False}, + "xlnet-large-cased": {"do_lower_case": False}, + "chinese-xlnet-base": {"do_lower_case": False}, + "chinese-xlnet-mid": {"do_lower_case": False}, + "chinese-xlnet-large": {"do_lower_case": False}, + } + pretrained_positional_embedding_sizes = { + "xlnet-base-cased": None, + "xlnet-large-cased": None, + "chinese-xlnet-base": None, + "chinese-xlnet-mid": None, + "chinese-xlnet-large": None, + } + max_model_input_sizes = pretrained_positional_embedding_sizes + padding_side = "left" + pad_token_type_id = 3 + + def __init__( + self, + vocab_file, + do_lower_case=False, + remove_space=True, + keep_accents=False, + bos_token="", + eos_token="", + unk_token="", + sep_token="", + pad_token="", + cls_token="", + mask_token="", + additional_special_tokens=["", ""], + sp_model_kwargs=None, + **kwargs + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended(mask_token=mask_token) + + self._pad_token_type_id = 3 + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def _tokenize(self, text): + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) 
+ if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) to an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + # Converts a sequence of tokens (strings for sub-words) in a single string. + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Args: + pair (bool, optional): + Whether the input is a sequence pair or a single sequence. + Defaults to `False` and the input is a single sequence. + + Returns: + int: Number of tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Builds model inputs from a sequence or a pair of sequence + for sequence classification tasks by concatenating and + adding special tokens. An XLNet sequence has the following format: + + - single sequence: ``X `` + - pair of sequences: ``A B `` + + Args: + token_ids_0 (List[int]): + List of IDs for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of IDs for the second sequenze. Defaults to `None`. + + Returns: + List[int]: List of input IDs with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return token_ids_0 + sep + cls + return token_ids_0 + sep + token_ids_1 + sep + cls + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Builds offset map from a pair of offset map by concatenating + and adding offsets of special tokens. + + An XLNet offset_mapping has the following format: + + - single sequence: ``X (0,0) (0,0)`` + - pair of sequences: ``A (0,0) B (0,0) (0,0)`` + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + Defaults to `None`. + + Returns: + List[tuple]: A list of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return offset_mapping_0 + [(0, 0)] + [(0, 0)] + + return offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Creates a special tokens mask from the input sequences. + This method is called when adding special tokens using the tokenizer `encode` method. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of `inputs_ids` for the second sequence. + Defaults to `None`. + already_has_special_tokens (bool, optional): + Whether or not the token list already contains special tokens for the model. + Defaults to `False`. 
+ + Returns: + List[int]: A list of integers which is either 0 or 1: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1] + return ([0] * len(token_ids_0)) + [1, 1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a token_type mask from the input sequences. + If `token_ids_1` is not `None`, then a sequence pair + token_type mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 + | first sequence | second sequence | + + Else if `token_ids_1` is `None`, then a single sequence + token_type mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 + | first sequence | + + - 0 stands for the segment id of **first segment tokens**, + - 1 stands for the segment id of **second segment tokens**, + - 2 stands for the segment id of **cls_token**. + + Args: + token_ids_0 (List[int]): + A list of `inputs_ids` for the first sequence. + token_ids_1 (List[int], optional): + Optional second list of `inputs_ids` for the second sequence. + Defaults to `None`. + + Returns: + List[int]: List of token type IDs according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls_segment_id = [2] + + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + cls_segment_id + return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, save_path) + elif not os.path.isfile(self.vocab_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/__init__.py new file mode 100644 index 000000000..b3f0962cd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
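# A minimal sketch of the sequence-pair layout produced by the XLNetTokenizer methods above
# (build_inputs_with_special_tokens / create_token_type_ids_from_sequences). The integer ids below
# are hypothetical; only the positions of <sep>/<cls> and the 0/1/2 segment ids follow the code.
ids_a = [10, 11, 12]   # hypothetical ids for sentence A
ids_b = [20, 21]       # hypothetical ids for sentence B
sep_id, cls_id = 4, 3  # hypothetical <sep> and <cls> ids
pair_ids = ids_a + [sep_id] + ids_b + [sep_id] + [cls_id]               # A <sep> B <sep> <cls>
token_type_ids = [0] * (len(ids_a) + 1) + [1] * (len(ids_b) + 1) + [2]  # 2 marks the final <cls>
assert len(pair_ids) == len(token_type_ids)  # 8 positions each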
+ +""" Yuan model """ + +from .configuration import * +from .modeling import * +from .tokenizer import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/configuration.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/configuration.py new file mode 100644 index 000000000..2aab87010 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/configuration.py @@ -0,0 +1,67 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration class for Yuan2.0 model""" + +from paddlenlp.transformers.configuration_utils import PretrainedConfig + + +class YuanConfig(PretrainedConfig): + model_type = "yuan" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=135040, + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=24, + num_attention_heads=32, + hidden_act="silu", + model_max_length=8192, + initializer_range=0.02, + tensor_parallel_output=True, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=77185, + bos_token_id=77185, + eos_token_id=77185, + num_key_value_heads=None, + tie_word_embeddings=True, + sequence_parallel=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.model_max_length = model_max_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.tensor_parallel_output = tensor_parallel_output + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.sequence_parallel = sequence_parallel + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tensor_parallel_output=tensor_parallel_output, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/modeling.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/modeling.py new file mode 100644 index 000000000..531fab643 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/modeling.py @@ -0,0 +1,1296 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Modeling class for Yuan2.0 model""" + +import copy +import math +from functools import partial +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.distributed.fleet.meta_parallel as mpu +import paddle.nn.functional as F +from paddle import Tensor, nn +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker +from paddle.distributed.fleet.utils import recompute +from paddle.nn import CrossEntropyLoss + +from ...transformers.conversion_utils import StateDictNameMapping, init_name_mappings +from ...transformers.model_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from ...transformers.model_utils import PretrainedModel +from ...utils.log import logger +from ..activations import ACT2FN +from .configuration import YuanConfig + +try: + from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ) +except: + pass + +__all__ = [ + "YuanModel", + "YuanPretrainedModel", + "YuanForCausalLM", +] + + +class YuanRMSNorm(nn.Layer): + def __init__(self, hidden_size, eps=1e-6): + """ + YuanRMSNorm is equivalent to LlamaRMSNorm + """ + super().__init__() + self.weight = paddle.create_parameter( + shape=[hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=paddle.nn.initializer.Assign(paddle.ones([hidden_size])), + ) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = paddle.cast(hidden_states, "float32") + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * paddle.rsqrt(variance + self.variance_epsilon) + return self.weight * paddle.cast(hidden_states, input_dtype) + + +class LocalizedFiltering(paddle.nn.Layer): + """ + Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of + variable names and moving away from the stateful representation of incremental decoding state. See + "https://arxiv.org/abs/2209.10655" for more details. 
+ """ + + def __init__(self, hidden_size): + super().__init__() + + self.embed_dim = hidden_size + self.lf_conv2d_group = 1 + self.lf_conv2d_num_pad = 1 + + self.conv1 = paddle.nn.Conv2D( + self.embed_dim, + self.embed_dim // 2, + (2, 1), + stride=(1, 1), + padding=(self.lf_conv2d_num_pad, 0), + groups=self.lf_conv2d_group, + ) + self.conv2 = paddle.nn.Conv2D( + self.embed_dim // 2, + self.embed_dim, + (2, 1), + stride=(1, 1), + padding=(self.lf_conv2d_num_pad, 0), + groups=self.lf_conv2d_group, + ) + self.output_layernorm = YuanRMSNorm(self.embed_dim) + + def _train_forward(self, inputs): + inputs = paddle.transpose(inputs, perm=[1, 0, *range(2, len(inputs.shape))]) + seq_len, bsz, embed_dim = inputs.shape + if embed_dim != self.embed_dim: + raise ValueError( + f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}" + ) + residual = inputs + + inputs = paddle.transpose(paddle.reshape(inputs, [seq_len, 1, bsz, embed_dim]), [2, 3, 0, 1]) + output1 = self.conv1(inputs) + output1 = output1[:, :, :seq_len, :] + + output2 = self.conv2(output1) + output2 = paddle.transpose(output2[:, :, :seq_len, :], [2, 3, 0, 1]) + output2 = paddle.reshape(output2, [seq_len, bsz, embed_dim]) + assert output2.shape == residual.shape + + lf_output = self.output_layernorm(output2 + residual) + lf_output = paddle.transpose(lf_output, [1, 0, *range(2, len(lf_output.shape))]) + return lf_output + + def _inference_forward(self, inputs, before_hidden_states): + + if before_hidden_states is None: + inputs = inputs.transpose([1, 0, *range(2, len(inputs.shape))]) + seq_len, bsz, embed_dim = inputs.shape + if embed_dim != self.embed_dim: + raise ValueError( + f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}" + ) + residual = inputs + + inputs = paddle.transpose(paddle.reshape(inputs, [seq_len, 1, bsz, embed_dim]), [2, 3, 0, 1]) + output1 = self.conv1(inputs) + output1 = output1[:, :, :seq_len, :] + + output2 = self.conv2(output1) + output2 = paddle.transpose(output2[:, :, :seq_len, :], [2, 3, 0, 1]) + output2 = paddle.reshape(output2, [seq_len, bsz, embed_dim]) + assert output2.shape == residual.shape + + lf_output = self.output_layernorm(output2 + residual) + lf_output = paddle.transpose(lf_output, [1, 0, *range(2, len(lf_output.shape))]) + return lf_output + else: + inputs = paddle.transpose(inputs, [1, 0, *range(2, len(inputs.shape))]) + before_hidden_states = paddle.transpose( + before_hidden_states, [1, 0, *range(2, len(before_hidden_states.shape))] + ) + residual = inputs + + seq_len, bsz, embed_dim = inputs.shape + seq_len_before, _, _ = before_hidden_states.shape + + assert seq_len == 1 and seq_len_before == 2 + + inputs = paddle.concat((before_hidden_states, inputs), axis=0) + inputs = paddle.transpose(paddle.reshape(inputs, [3, 1, bsz, embed_dim]), [2, 3, 0, 1]) + + output1 = self.conv1(inputs) + output2 = self.conv2(output1[:, :, 1:-1, :]) + output2 = output2[:, :, 1:-1, :] + output2 = paddle.reshape(output2, [1, bsz, embed_dim]) + assert output2.shape == residual.shape + + lf_output = self.output_layernorm(output2 + residual) + lf_output = paddle.transpose(lf_output, [1, 0, *range(2, len(lf_output.shape))]) + + return lf_output + + def forward(self, inputs, before_hidden_states) -> paddle.Tensor: + assert self.lf_conv2d_num_pad == 1 + if self.training: + lf_output = self._train_forward(inputs) + else: + lf_output = self._inference_forward(inputs, before_hidden_states) + + return lf_output + + +# Copied from 
transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: paddle.shape, dtype: paddle.dtype, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = paddle.full((tgt_len, tgt_len), paddle.to_tensor(paddle.finfo(dtype).min)) + mask_cond = paddle.arange(mask.size(-1)) + mask_cond = paddle.add(mask_cond, 1) + mask_cond_reshaped = paddle.reshape(mask_cond, [mask.size(-1), 1]) + mask = paddle.where(mask_cond < mask_cond_reshaped, paddle.zeros_like(mask), mask) + mask = paddle.cast(mask, dtype) + if past_key_values_length > 0: + mask = paddle.concat([paddle.zeros([tgt_len, past_key_values_length], dtype=dtype), mask], zeros=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: paddle.Tensor, dtype: paddle.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.shape + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len) + expanded_mask = paddle.to_tensor(expanded_mask, dtype=dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(paddle.cast(inverted_mask, paddle.bool), paddle.finfo(dtype).min) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat((-x2, x1), axis=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
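# With the half-split pairing used by rotate_half() (feature i is paired with feature i + dim//2,
# and cos/sin repeat the same frequencies in both halves), the expression below expands to
#   q_embed[..., i]          = q[..., i] * cos_i - q[..., i + dim//2] * sin_i
#   q_embed[..., i + dim//2] = q[..., i + dim//2] * cos_i + q[..., i] * sin_i
# i.e. each pair is rotated by the angle position * inv_freq[i], the standard RoPE rotation.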
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class YuanPretrainedModel(PretrainedModel): + config_class = YuanConfig + base_model_prefix = "yuan" + supports_gradient_checkpointing = True + _no_split_modules = ["YuanDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + @classmethod + def _get_name_mappings(cls, config: YuanConfig) -> List[StateDictNameMapping]: + mappings: List[StateDictNameMapping] = [] + model_mappings = [ + ["embed_tokens.weight"], + ["norm.weight"], + ] + for layer_index in range(config.num_hidden_layers): + layer_mappings = [ + [f"layers.{layer_index}.self_attn.q_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.k_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.v_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.o_proj.weight", None, "transpose"], + [f"layers.{layer_index}.self_attn.rotary_emb.inv_freq"], + [f"layers.{layer_index}.mlp.gate_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.down_proj.weight", None, "transpose"], + [f"layers.{layer_index}.mlp.up_proj.weight", None, "transpose"], + [f"layers.{layer_index}.input_layernorm.weight"], + [f"layers.{layer_index}.post_attention_layernorm.weight"], + [f"layers.{layer_index}.self_attn.lf_gate.conv1.bias"], + [f"layers.{layer_index}.self_attn.lf_gate.conv1.weight"], + [f"layers.{layer_index}.self_attn.lf_gate.conv2.bias"], + [f"layers.{layer_index}.self_attn.lf_gate.conv2.weight"], + [f"layers.{layer_index}.self_attn.lf_gate.output_layernorm.weight"], + ] + model_mappings.extend(layer_mappings) + + init_name_mappings(mappings=model_mappings) + + if "YuanModel" not in config.architectures: + for mapping in model_mappings: + mapping[0] = "model." + mapping[0] + mapping[1] = "yuan." + mapping[1] + model_mappings.append(["lm_head.weight", "lm_head.weight", "transpose"]) + + mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] + return mappings + + @classmethod + def _get_tensor_parallel_mappings(cls, config: YuanConfig, is_split=True): + + from paddlenlp.transformers.conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def get_tensor_parallel_split_mappings(num_layers): + final_actions = {} + + base_actions = { + "lm_head.weight": partial(fn, is_column=True), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + if not config.vocab_size % config.tensor_parallel_degree == 0: + base_actions.pop("lm_head.weight") + base_actions.pop("embed_tokens.weight") + # Column Linear + + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + # if we have enough num_key_value_heads to split, then split it. 
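# As a reading aid: is_column=True marks Megatron-style column-parallel weights split along the
# output dimension (q/k/v, gate/up projections, lm_head), while is_column=False marks row-parallel
# or vocab-parallel weights split along the input/vocab dimension (o_proj, down_proj, embed_tokens).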
+ if config.num_attention_heads % config.tensor_parallel_degree == 0: + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + final_actions[key] = action + + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + + return mappings + + @classmethod + def _get_fuse_or_split_param_mappings(cls, config: YuanConfig, is_fuse=False): + def convert_qk_keys_fn(fused_params, tensor_parallel_degree): + concat_fn = np.concatenate + split_fn = np.split + if isinstance(fused_params, paddle.Tensor): + concat_fn = paddle.concat + split_fn = paddle.split + + q_weight, k_weight = split_fn(fused_params, 2, axis=-1) + + hidden_size = q_weight.shape[-1] + step = 1 + if tensor_parallel_degree > 1: + assert hidden_size // tensor_parallel_degree, "hidden_size must be divisible by tensor_parallel_degree" + step = hidden_size // tensor_parallel_degree + + q_slices = [q_weight[:, i : i + step] for i in range(0, hidden_size, step)] + k_slices = [k_weight[:, i : i + step] for i in range(0, hidden_size, step)] + q1 = concat_fn(q_slices[0::2], -1) + q2 = concat_fn(k_slices[0::2], -1) + k1 = concat_fn(q_slices[1::2], -1) + k2 = concat_fn(k_slices[1::2], -1) + + return concat_fn([q1, q2], -1), concat_fn([k1, k2], -1) + + def fuse_qk_keys_fn(fuse_params): + concat_fn = np.concatenate + if isinstance(fuse_params[0], paddle.Tensor): + concat_fn = paddle.concat + return concat_fn(fuse_params, axis=-1) + + # last key is fused key, other keys are to be fused. + + final_actions = {} + if config.tensor_parallel_degree <= 1: + return final_actions + + if is_fuse: + fuse_qk_keys = ( + "layers.0.self_attn.q_proj.weight", # base param key + "layers.0.self_attn.k_proj.weight", # base param key + "layers.0.self_attn.qk_proj.weight", # new param key + ) + + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in fuse_qk_keys]) + final_actions[keys] = partial(fuse_qk_keys_fn) + else: + split_qk_keys = ( + "layers.0.self_attn.q_proj.weight", # new param key + "layers.0.self_attn.k_proj.weight", # new param key + "layers.0.self_attn.qk_proj.weight", # base param key + ) + + for i in range(config.num_hidden_layers): + keys = tuple([key.replace("layers.0.", f"layers.{i}.") for key in split_qk_keys]) + final_actions[keys] = partial(convert_qk_keys_fn, tensor_parallel_degree=config.tensor_parallel_degree) + + return final_actions + + def _init_weights(self, layer): + """Initialization hook""" + if self.config.tensor_parallel_degree > 1: + rng_tracker = get_rng_state_tracker().rng_state + if isinstance( + layer, + ( + nn.Linear, + nn.Embedding, + mpu.VocabParallelEmbedding, + mpu.ColumnParallelLinear, + mpu.RowParallelLinear, + ColumnSequenceParallelLinear, + RowSequenceParallelLinear, + ), + ): + # In the dygraph mode, use the `set_value` to reset the parameter directly, + # and reset the `state_dict` to update parameter in static mode. 
+ if isinstance(layer.weight, paddle.Tensor): + if layer.weight.is_distributed: + with rng_tracker(): + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + else: + layer.weight.set_value( + paddle.tensor.normal( + mean=0.0, + std=self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.llama.config.initializer_range, + shape=layer.weight.shape, + ) + ) + + with paddle.no_grad(): + if isinstance(layer, YuanMLP): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.down_proj.weight.scale_(factor) + if isinstance(layer, YuanAttention): + factor = 1 / math.sqrt(2 * self.config.num_hidden_layers) + layer.o_proj.weight.scale_(factor) + + def _post_init(self, *args, **kwargs): + with paddle.no_grad(): + self.init_weights() + + +class YuanRotaryEmbedding(nn.Layer): + def __init__(self, dim, max_position_embeddings=2048, base=10000): + + """ + YuanRotaryEmbedding is equivalent to LlamaRotaryEmbedding in transformers v4.36 + """ + + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + # [dim / 2] + self.inv_freq = 1.0 / (self.base ** (paddle.cast(paddle.arange(0, self.dim, 2), dtype="float32") / self.dim)) + self._set_cos_sin_cache(seq_len=max_position_embeddings) + + def _set_cos_sin_cache(self, seq_len): + self.max_seq_len_cached = seq_len + # [seq_len] + t = paddle.arange(seq_len, dtype=self.inv_freq.dtype) + # [seq_len, dim/2] + freqs = paddle.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + # [seq_len, dim] + emb = paddle.concat([freqs, freqs], axis=-1) + # [1, seqlen, 1, dim] + self.cos_cached = emb.cos()[None, None, :, :] + self.sin_cached = emb.sin()[None, None, :, :] + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + cos = self.cos_cached[:, :, :seq_len, ...] + sin = self.sin_cached[:, :, :seq_len, ...] 
+ return ( + paddle.cast(cos, x.dtype), + paddle.cast(sin, x.dtype), + ) + + +class YuanMLP(nn.Layer): + def __init__(self, config): + super().__init__() + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + RowParallelLinear = RowSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + RowParallelLinear = fleet.meta_parallel.RowParallelLinear + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.hidden_act = config.hidden_act + if config.tensor_parallel_degree > 1: + + self.gate_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + self.up_proj = ColumnParallelLinear( + self.hidden_size, + self.intermediate_size, + gather_output=False, + has_bias=False, + ) + + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=False, + ) + else: + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias_attr=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias_attr=False) + self.act_fn = ACT2FN[self.hidden_act] + + def forward(self, x): + return self.down_proj(self.gate_proj(x) * self.act_fn(self.up_proj(x))) + + +class YuanAttention(nn.Layer): + + """Localized Filtering-based Attention 'YUAN 2.0: A Large Language Model with Localized Filtering-based Attention' paper""" + + def __init__(self, config: YuanConfig): + super().__init__() + self.config = config + self.num_key_value_heads = config.num_key_value_heads + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + self.causal_mask = config.causal_mask + self.softmax_scale = 1.0 / math.sqrt(self.head_dim) + self.use_flash_attention = config.use_flash_attention + self.kv_indices = None + self.tp_degree = config.tensor_parallel_degree + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + + if self.num_key_value_heads % config.tensor_parallel_degree == 0: + self.num_key_value_heads = self.num_key_value_heads // config.tensor_parallel_degree + else: + assert False + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + RowParallelLinear = RowSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + RowParallelLinear = fleet.meta_parallel.RowParallelLinear + self.dropout = 0.0 + + if config.tensor_parallel_degree > 1: + self.o_proj = RowParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + input_is_parallel=True, + ) + self.v_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + self.q_proj = ColumnParallelLinear( + self.hidden_size, + self.hidden_size, + has_bias=False, + gather_output=False, + ) + self.k_proj = ColumnParallelLinear( + self.hidden_size, + self.config.num_key_value_heads * self.head_dim, + has_bias=False, + gather_output=False, + ) + else: + self.o_proj = nn.Linear( + self.hidden_size, + self.hidden_size, + 
bias_attr=False, + ) + self.v_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias_attr=False) + self.k_proj = nn.Linear(self.hidden_size, self.config.num_key_value_heads * self.head_dim, bias_attr=False) + self.rotary_emb = YuanRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + self.lf_gate = LocalizedFiltering(self.hidden_size) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return tensor.reshape([bsz, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + bsz, q_len, _ = hidden_states.shape + before_hidden_states = None + is_first_step = False + if use_cache: + if past_key_value is None: + inference_hidden_states_memory = paddle.empty( + [bsz, 2, hidden_states.shape[2]], dtype=hidden_states.dtype + ) + is_first_step = True + else: + before_hidden_states = past_key_value[2] + + if use_cache: + if is_first_step: + if q_len >= 2: + inference_hidden_states_memory = hidden_states[:, -2:, :] + else: + inference_hidden_states_memory[:, :, :] = 0 + inference_hidden_states_memory[:, -1:, :] = hidden_states[:, -1:, :] + else: + hidden_states_tmp = before_hidden_states[:, -1:, :] + inference_hidden_states_memory = copy.deepcopy( + paddle.concat((hidden_states_tmp, hidden_states), axis=1) + ) + + value_states = ( + self.v_proj(hidden_states).reshape([bsz, q_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) + ) + hidden_states = self.lf_gate(hidden_states, before_hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + qk_states = paddle.concat([query_states, key_states], axis=-1) + qk_states = qk_states.reshape([bsz, q_len, self.num_heads, int(qk_states.shape[-1] // self.num_heads)]) + (query_states, key_states) = paddle.chunk(qk_states, 2, axis=-1) + query_states = query_states.transpose([0, 2, 1, *range(3, len(query_states.shape))]) + key_states = key_states.transpose([0, 2, 1, *range(3, len(key_states.shape))]) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + + past_key_value = (key_states, value_states, inference_hidden_states_memory) if use_cache else None + + if self.use_flash_attention: + attn_weights = None + query_states = query_states.transpose([0, 2, 1, *range(3, len(query_states.shape))]) + key_states = key_states.transpose([0, 2, 1, *range(3, len(key_states.shape))]) + value_states = value_states.transpose([0, 2, 1, *range(3, len(value_states.shape))]) + + batch_size = query_states.shape[0] + + output = F.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + is_causal=attention_mask is None, + ) + 
# attn_output = rearrange(output[0], '(b s) ... -> b s ...', b=batch_size) + seq_length = output[0].shape[0] // batch_size + new_shape = (batch_size, seq_length) + tuple(output[0].shape[1:]) + attn_output = paddle.reshape(output[0], new_shape) + else: + attn_weights = paddle.matmul( + query_states, key_states.transpose([0, 1, 3, 2, *range(4, len(key_states.shape))]) + ) / math.sqrt(self.head_dim) + + if attn_weights.shape != [bsz, self.num_heads, q_len, kv_seq_len]: + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.shape}" + ) + if attention_mask is not None: + if attention_mask.shape != [bsz, 1, q_len, kv_seq_len]: + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.shape}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = paddle.maximum( + attn_weights, paddle.to_tensor(paddle.finfo(attn_weights.dtype).min, attn_weights.dtype) + ) + + # upcast attention to fp32 + attn_weights = paddle.cast( + nn.functional.softmax(attn_weights, axis=-1, dtype=paddle.float32), query_states.dtype + ) + attn_output = paddle.matmul(attn_weights, value_states) + + if attn_output.shape != [bsz, self.num_heads, q_len, self.head_dim]: + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.shape}" + ) + + attn_output = attn_output.transpose([0, 2, 1, *range(3, len(attn_output.shape))]) + + # attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = attn_output.reshape([bsz, q_len, -1]) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + +class YuanDecoderLayer(nn.Layer): + def __init__(self, config: YuanConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = YuanAttention(config=config) + self.mlp = YuanMLP(config) + self.input_layernorm = YuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = YuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class YuanModel(YuanPretrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`YuanDecoderLayer`] + + Args: + config: YuanConfig + """ + + def __init__(self, config: YuanConfig): + super().__init__(config) + self.config = config + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # TODO: control it by config + self.eod_token = config.eod_token + self.reset_attention_mask = config.reset_attention_mask + self.reset_position_ids = config.reset_position_ids + self.enable_recompute = False + self.recompute_granularity = config.recompute_granularity + self.no_recompute_layers = config.no_recompute_layers if config.no_recompute_layers is not None else [] + if config.tensor_parallel_degree > 1: + self.embed_tokens = mpu.VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + weight_attr=paddle.ParamAttr(initializer=nn.initializer.XavierNormal()), + ) + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.LayerList([YuanDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = YuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self._post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + expanded_attn_mask = paddle.to_tensor(expanded_attn_mask) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def _prepare_decoder_attention_mask_training( + self, input_id, inputs_embeds, eod_token, reset_mask_flag, reset_attention_mask=True, reset_position_ids=True + ): + + micro_batch_size, seq_length = input_id.shape + attention_mask = paddle.tril(paddle.ones((micro_batch_size, seq_length, 
seq_length), dtype=self.config.dtype)) + attention_mask = paddle.reshape(attention_mask, (micro_batch_size, 1, seq_length, seq_length)) + + position_ids = paddle.arange(seq_length, dtype=paddle.int64) + position_ids = position_ids.unsqueeze(0).expand_as(input_id) + + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + + # Find indecies where EOD token is. + eod_index = position_ids[b, input_id[b] == eod_token] + + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.detach() + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.shape[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1) :] -= i + 1 - prev_index + prev_index = i + 1 + + inverted_mask = 1 - attention_mask + output_attn_mask = inverted_mask.masked_fill( + paddle.cast(inverted_mask, "bool"), paddle.finfo(inputs_embeds.dtype).min + ) + if reset_mask_flag: + output_attn_mask = output_attn_mask[:, :, -1:, :] + return output_attn_mask, position_ids + + @paddle.jit.not_to_static + def recompute_training_full( + self, + layer_module: nn.Layer, + hidden_states: Tensor, + position_ids: Optional[Tensor], + attention_mask: Tensor, + output_attentions: bool, + past_key_value: Tensor, + use_cache: bool, + ): + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + + return hidden_states + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + input_ids1 = copy.deepcopy(input_ids) + reset_mask_flag = False + if past_key_values: + input_ids = input_ids[:, -1:] + if use_cache: + reset_mask_flag = True + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = 
past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + position_ids = paddle.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=paddle.int64 + ) + position_ids = paddle.unsqueeze(position_ids, axis=0).reshape([-1, seq_length]) + else: + position_ids = paddle.reshape(position_ids, [-1, seq_length]) + position_ids = paddle.cast(position_ids, dtype="int64") + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + if self.training or self.reset_position_ids: + attention_mask, _ = self._prepare_decoder_attention_mask_training( + input_ids1, + inputs_embeds, + self.eod_token, + reset_mask_flag, + self.reset_attention_mask, + self.reset_position_ids, + ) + + else: + if attention_mask is None: + attention_mask = paddle.ones((batch_size, seq_length_with_past), dtype=paddle.bool) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + has_gradient = not hidden_states.stop_gradient + if ( + self.enable_recompute + and idx not in self.no_recompute_layers + and has_gradient + and self.recompute_granularity == "full" + ): + layer_outputs = self.recompute_training_full( + decoder_layer, + hidden_states, + position_ids=position_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + past_key_value=past_key_value, + use_cache=use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class YuanForCausalLM(YuanPretrainedModel): + def __init__(self, config): + super().__init__(config) + self.eod_token = config.eod_token + self.sep_token = config.sep_token + self.use_loss_mask = config.use_loss_mask + self.yuan = YuanModel(config) + if config.sequence_parallel: + ColumnParallelLinear = ColumnSequenceParallelLinear + else: + ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear + + if config.tensor_parallel_degree > 1: + self.lm_head = ColumnParallelLinear( + config.hidden_size, + 
config.vocab_size, + has_bias=False, + gather_output=True, + ) + else: + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + # Initialize weights and apply final processing + self._post_init() + + def get_input_embeddings(self): + return self.yuan.embed_tokens + + def set_input_embeddings(self, value): + self.yuan.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.yuan = decoder + + def get_decoder(self): + return self.yuan + + def get_loss_mask(self, input_ids, labels, eod_token, sep_token): + micro_batch_size, seq_length = input_ids.shape + + loss_mask = paddle.ones(input_ids.shape, dtype=paddle.float32) + + position_ids = paddle.arange(seq_length, dtype=paddle.int64) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + """modify loss_mask to only calculate the loss of the answer (separated with [SEP])""" + + for b in range(micro_batch_size): + eod_indexs = position_ids[b, input_ids[b] == eod_token] + sep_indexs = position_ids[b, input_ids[b] == sep_token] + + if len(eod_indexs) == 0 or len(sep_indexs) == 0: + loss_mask[b] = 1.0 + else: + if eod_indexs[0] > sep_indexs[0]: + loss_mask[b, 0 : sep_indexs[0]] = 0 + + if len(eod_indexs) == len(sep_indexs): + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + if ii == (len(sep_indexs) - 1): + stop_index = seq_length + else: + stop_index = sep_indexs[ii + 1] + loss_mask[b, start_index:stop_index] = 0.0 + else: + if len(eod_indexs) > len(sep_indexs): + loss_mask[b, :] = 1.0 + else: + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + stop_index = sep_indexs[ii + 1] + + loss_mask[b, start_index:stop_index] = 0.0 + + elif eod_indexs[0] < sep_indexs[0]: + + if len(eod_indexs) == len(sep_indexs): + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + stop_index = sep_indexs[ii] + loss_mask[b, start_index:stop_index] = 0.0 + + else: + if len(eod_indexs) < len(sep_indexs): + loss_mask[b, :] = 1.0 + else: + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + if ii >= len(sep_indexs): + stop_index = seq_length + else: + stop_index = sep_indexs[ii] + loss_mask[b, start_index:stop_index] = 0.0 + + loss_mask[input_ids == eod_token] = 1.0 + return loss_mask + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, YuanForCausalLM + + >>> model = YuanForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + return_dict = True + outputs = self.yuan( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + loss = None + if labels is not None: + if self.use_loss_mask: + loss_mask = self.get_loss_mask(input_ids, labels, self.eod_token, self.sep_token) + # Shift so that tokens < n predict n + shift_logits = logits[..., :, :].contiguous() + shift_labels = labels[..., :].contiguous() + # Flatten the tokens + if self.use_loss_mask: + loss_fct = CrossEntropyLoss(reduction="none") + shift_logits = paddle.reshape(shift_logits, [-1, self.config.vocab_size]) + shift_labels = paddle.reshape(shift_labels, [-1]) + # Enable model parallelism + shift_labels = paddle.to_tensor(shift_labels) + loss = loss_fct(shift_logits, shift_labels) + + loss = paddle.sum(loss * loss_mask) / loss_mask.sum() + else: + loss_fct = CrossEntropyLoss() + shift_logits = paddle.reshape(shift_logits, [-1, self.config.vocab_size]) + shift_labels = paddle.reshape(shift_labels, [-1]) + # Enable model parallelism + shift_labels = paddle.to_tensor(shift_labels) + loss = loss_fct(shift_logits, shift_labels) + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + position_ids = None + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = paddle.cast(attention_mask, dtype="int64").cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def 
_reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/tokenizer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/tokenizer.py new file mode 100644 index 000000000..d1ce819f2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/transformers/yuan/tokenizer.py @@ -0,0 +1,262 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization class for Yuan2.0 model""" + +import os +from shutil import copyfile +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +import sentencepiece as spm + +from ...utils.log import logger +from .. import PretrainedTokenizer +from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy + +__all__ = ["YuanTokenizer"] + + +class YuanTokenizer(PretrainedTokenizer): + """ + YuanTokenizer is equivalent to LlamaTokenizer + """ + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + resource_files_names = { + "vocab_file": "tokenizer.model", + } + + padding_side = "left" + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + add_bos_token=True, + add_eos_token=False, + sp_model_kwargs=None, + decode_with_prefix_space=False, + **kwargs + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) + + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self.pad_token_id = self.eos_token_id + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def 
convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + Returns: + `Tuple(str)`: Paths to the files saved. + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of zeros. 
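+
+            For instance (hypothetical ids), with `token_ids_0 = [10, 11]` and
+            `token_ids_1 = [12]` this returns `[0, 0, 0, 0, 0]`, i.e. one zero per
+            token of `token_ids_0 + eos + token_ids_1 + eos`.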
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + + # attention_mask shape [1,seq_len,seq_len] + if "attention_mask" in encoded_inputs and len(np.shape(encoded_inputs["attention_mask"])) > 2: + attention_mask = encoded_inputs["attention_mask"] + encoded_inputs.pop("attention_mask") + else: + attention_mask = None + + required_input = encoded_inputs[self.model_input_names[0]] + encoded_inputs = super()._pad( + encoded_inputs, max_length, padding_strategy, pad_to_multiple_of, return_attention_mask + ) + if attention_mask is not None and len(np.shape(attention_mask)) > 2: + encoded_inputs["attention_mask"] = attention_mask + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if needs_to_be_padded: + difference = max_length - len(required_input) + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = np.pad( + encoded_inputs["attention_mask"], + pad_width=[(0, 0), (difference, 0), (difference, 0)], + mode="constant", + constant_values=0, + ) + return encoded_inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/__init__.py new file mode 100644 index 000000000..ff5182c8f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you smay not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .dpo_trainer import DPOTrainer +from .trl_data import * +from .trl_utils import * diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/dpo_trainer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/dpo_trainer.py new file mode 100644 index 000000000..0429c9faa --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/dpo_trainer.py @@ -0,0 +1,676 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" DPO Trainer """ +import types +from collections import OrderedDict, defaultdict + +import paddle +import paddle.nn.functional as F +from paddle import framework +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import ParallelCrossEntropy + +from paddlenlp.trainer import Trainer +from paddlenlp.transformers.model_utils import unwrap_model + + +def disable_dropout_in_model(model: paddle.nn.Layer) -> None: + """ "disable dropout""" + for module in model.children(): + if isinstance(module, paddle.nn.Dropout): + module.p = 0 + + +class DPOTrainer(Trainer): + """ + Initialize DPOTrainer. + """ + + def __init__(self, model, data_collator, ref_model=None, disable_dropout: bool = True, **kwargs): + super().__init__(model, data_collator=data_collator, **kwargs) + + self.reference_free = kwargs.pop("reference_free", False) + if ref_model: + self.ref_model = ref_model + self.ref_model = self._wrap_ref_model(self.ref_model) + self.ref_model.eval() + elif not self.reference_free: + raise ValueError("Please provide a reference model.") + if self.reference_free and self.args.dpo_loss_type not in ["sigmoid", "hinge", "ipo"]: + raise ValueError(f"{self.args.dpo_loss_type} is not a valid loss type for DPO reference_free.") + if disable_dropout: + disable_dropout_in_model(model) + if self.ref_model is not None: + disable_dropout_in_model(self.ref_model) + if self.model.config.tensor_parallel_output and self.model.config.tensor_parallel_degree > 1: + self.logprobs = ParallelCrossEntropy() + else: + self.logprobs = paddle.nn.CrossEntropyLoss(reduction="none") + self._stored_metrics = defaultdict(lambda: defaultdict(list)) + + def dpo_loss( + self, + policy_chosen_logps, + policy_rejected_logps, + reference_chosen_logps=None, + reference_rejected_logps=None, + ): + """ + Compute the DPO loss for a batch of policy and reference model log probabilities. 
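+
+        For the default "sigmoid" loss the code below reduces to the following sketch
+        (a reading aid only, written with the same names as the implementation; when
+        `reference_free` is True the reference log-ratio term is 0):
+
+            logits = (policy_chosen_logps - policy_rejected_logps) - (
+                reference_chosen_logps - reference_rejected_logps
+            )
+            loss = (
+                -F.log_sigmoid(self.args.dpo_beta * logits) * (1 - self.args.dpo_label_smoothing)
+                - F.log_sigmoid(-self.args.dpo_beta * logits) * self.args.dpo_label_smoothing
+            ).mean()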
+ """ + pi_logratios = policy_chosen_logps - policy_rejected_logps + if self.reference_free: + ref_logratios = 0 + else: + ref_logratios = reference_chosen_logps - reference_rejected_logps + logits = pi_logratios - ref_logratios + if self.args.dpo_loss_type == "sigmoid": + loss = ( + -F.log_sigmoid(self.args.dpo_beta * logits) * (1 - self.args.dpo_label_smoothing) + - F.log_sigmoid(-self.args.dpo_beta * logits) * self.args.dpo_label_smoothing + ) + elif self.args.dpo_loss_type == "hinge": + loss = F.relu(1 - self.args.dpo_beta * logits) + elif self.args.dpo_loss_type == "ipo": + # parameter for the IPO loss, denoted by tau in the paper. + loss = (logits - 1 / (2 * self.args.dpo_beta)) ** 2 + elif self.args.dpo_loss_type == "kto_pair": + # eqn (7) of the HALOs paper + chosen_KL = (policy_chosen_logps - reference_chosen_logps).mean().clip(min=0) + rejected_KL = (policy_rejected_logps - reference_rejected_logps).mean().clip(min=0) + + chosen_logratios = policy_chosen_logps - reference_chosen_logps + rejected_logratios = policy_rejected_logps - reference_rejected_logps + # As described in the KTO report, the KL term for + # chosen (rejected) is estimated using the rejected (chosen) half. + loss = paddle.concat( + ( + 1 - F.sigmoid(self.args.dpo_beta * (chosen_logratios - rejected_KL)), + 1 - F.sigmoid(self.args.dpo_beta * (chosen_KL - rejected_logratios)), + ), + 0, + ) + elif self.args.dpo_loss_type == "sppo_hard": + # In the paper (https://arxiv.org/pdf/2405.00675), SPPO employs a soft probability + # approach, estimated using the PairRM score. The probability calculation is + # conducted outside of the trainer class. The version described here is the hard + # probability version, where P in Equation (4.7) of Algorithm 1 is set to 1 for + # the winner and 0 for the loser. + a = policy_chosen_logps - reference_chosen_logps + b = policy_rejected_logps - reference_rejected_logps + + loss = (a - 0.5 / self.args.dpo_beta) ** 2 + (b + 0.5 / self.args.dpo_beta) ** 2 + elif self.args.dpo_loss_type == "nca_pair": + chosen_rewards = (policy_chosen_logps - reference_chosen_logps) * self.args.dpo_beta + rejected_rewards = (policy_rejected_logps - reference_rejected_logps) * self.args.dpo_beta + loss = ( + -F.log_sigmoid(chosen_rewards) + - 0.5 * F.log_sigmoid(-chosen_rewards) + - 0.5 * F.log_sigmoid(-rejected_rewards) + ) + else: + raise ValueError( + f"Unknown loss type: {self.args.dpo_loss_type}. 
" + "Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'sppo_hard', 'nca_pair']" + ) + return loss.mean() + + def get_batch_logps( + self, + batch, + logits, + average_log_prob=False, + ): + """DPO logprobs""" + labels = batch["chosen_labels"] + batch["rejected_labels"] + logits = logits.astype("float32") + if logits.shape[:-1] != labels.shape: + raise ValueError("Logits (batch and sequence length dim) and labels must have the same shape.") + per_token_logps = -self.logprobs(logits, labels.unsqueeze(2)).squeeze(2) + chosen_logps = paddle.stack( + [ + (per_token_logps[response_index[0]][response_index[1] : response_index[2]]).sum() + if response_index[3] != 0 + else paddle.zeros([]) + for response_index in batch["response_indexs"] + ], + axis=0, + ) + rejected_logps = paddle.stack( + [ + (per_token_logps[response_index[0]][response_index[2] + 1 : response_index[3]]).sum() + if response_index[3] != 0 + else paddle.zeros([]) + for response_index in batch["response_indexs"] + ], + axis=0, + ) + if average_log_prob: + chosen_response_length = batch["response_indexs"][:, 2] - batch["response_indexs"][:, 1] + rejected_response_length = batch["response_indexs"][:, 3] - batch["response_indexs"][:, 2] + chosen_logps /= chosen_response_length + rejected_logps /= rejected_response_length + return chosen_logps, rejected_logps + + def get_batch_metrics(self, model, batch, train_eval="train"): + """Compute the DPO loss and other metrics for the given batch of inputs for train or test.""" + metrics = {} + if hasattr(self.model.config, "dpo") and self.model.config.dpo: + dpo_inputs = { + "input_ids": batch["input_ids"], + "position_ids": batch["position_ids"], + "chosen_labels": batch["chosen_labels"], + "rejected_labels": batch["rejected_labels"], + "response_indexs": batch["response_indexs"], + } + if "attention_mask" in batch: + dpo_inputs["attention_mask"] = batch["attention_mask"] + if "attn_mask_startend_row_indices" in batch: + dpo_inputs["attn_mask_startend_row_indices"] = batch["attn_mask_startend_row_indices"] + if self.reference_free: + reference_chosen_logps, reference_rejected_logps = None, None + else: + with paddle.no_grad(): + reference_chosen_logps, reference_rejected_logps = self.ref_model(**dpo_inputs) + dpo_inputs["reference_chosen_logps"] = reference_chosen_logps + dpo_inputs["reference_rejected_logps"] = reference_rejected_logps + loss, policy_chosen_logps, policy_rejected_logps = model(**dpo_inputs) + else: + dpo_inputs = { + "input_ids": batch["input_ids"], + "position_ids": batch["position_ids"], + } + if "attention_mask" in batch: + dpo_inputs["attention_mask"] = batch["attention_mask"] + if "attn_mask_startend_row_indices" in batch: + dpo_inputs["attn_mask_startend_row_indices"] = batch["attn_mask_startend_row_indices"] + if self.reference_free: + reference_chosen_logps, reference_rejected_logps = None, None + else: + with paddle.no_grad(): + ref_logits = self.ref_model(**dpo_inputs)[0] + reference_chosen_logps, reference_rejected_logps = self.get_batch_logps( + batch, + ref_logits, + average_log_prob=self.args.dpo_loss_type == "ipo", + ) + policy_logits = model(**dpo_inputs)[0] + policy_chosen_logps, policy_rejected_logps = self.get_batch_logps( + batch, + policy_logits, + average_log_prob=self.args.dpo_loss_type == "ipo", + ) + + loss = self.dpo_loss( + policy_chosen_logps, + policy_rejected_logps, + reference_chosen_logps, + reference_rejected_logps, + ) + + policy_chosen_logps, policy_rejected_logps = policy_chosen_logps.detach(), policy_rejected_logps.detach() + if 
self.reference_free: + chosen_rewards = self.args.dpo_beta * (policy_chosen_logps) + rejected_rewards = self.args.dpo_beta * (policy_rejected_logps) + reward_accuracies = (chosen_rewards > rejected_rewards).astype(paddle.float32) + else: + chosen_rewards = self.args.dpo_beta * (policy_chosen_logps - reference_chosen_logps) + rejected_rewards = self.args.dpo_beta * (policy_rejected_logps - reference_rejected_logps) + reward_accuracies = (chosen_rewards > rejected_rewards).astype(paddle.float32) + + prefix = "eval_" if train_eval == "eval" else "" + metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean() + metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean() + metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean() + metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean() + metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.mean() + metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.mean() + + for key in metrics: + metrics[key] = self._nested_gather(paddle.tile(metrics[key], repeat_times=[1, 1])).mean().cpu() + return loss, metrics + + def compute_loss(self, model, inputs, return_outputs=False): + """Compute the DPO loss for the given batch of inputs.""" + loss, metrics = self.get_batch_metrics(model, inputs, train_eval="train") + if self.args.should_save: + self.store_metrics(metrics, train_eval="train") + if return_outputs: + return (loss, metrics) + + return loss + + def _wrap_model(self, model, training=True): + """Wrap model.""" + model = super()._wrap_model(model, training) + if self.args.pipeline_parallel_degree > 1: + model._prepare_pipeline_inputs_func = prepare_pipeline_dpo_inputs_func + model.eval_dpo_batch = types.MethodType(eval_dpo_batch, model) + model._forward_step = types.MethodType(_forward_step, model) + model.broadcast_pp_final_output = types.MethodType(broadcast_pp_final_output, model) + return model + + def _wrap_ref_model(self, model): + """Wrap reference model.""" + if unwrap_model(model) is not model: + return model + self.amp_dtype = "float16" if self.args.fp16 else "bfloat16" + model = paddle.amp.decorate( + models=model, + level=self.args.fp16_opt_level, + dtype=self.amp_dtype, + ) + model = fleet.distributed_model(model) + if self.args.pipeline_parallel_degree > 1: + model._prepare_pipeline_inputs_func = prepare_pipeline_dpo_inputs_func + model.eval_dpo_batch = types.MethodType(eval_dpo_batch, model) + model._forward_step = types.MethodType(_forward_step, model) + model.broadcast_pp_final_output = types.MethodType(broadcast_pp_final_output, model) + + return model + + def evaluate(self, eval_dataset=None, ignore_keys=None, metric_key_prefix="eval"): + """evaluate""" + self.model_wrapped = self._wrap_ref_model(self.model_wrapped) + return super().evaluate(eval_dataset, ignore_keys, metric_key_prefix) + + def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None): + + """prediction_step""" + if self.args.pipeline_parallel_degree > 1: + # hack for pipeline mode + inputs = self._prepare_inputs(inputs) + return self.prediction_pipeline_step(model, inputs) + if ignore_keys is None: + if hasattr(model, "config"): + ignore_keys = getattr(model.config, "keys_to_ignore_at_inference", []) + else: + ignore_keys = [] + + with paddle.no_grad(): + loss, metrics = self.get_batch_metrics(model, inputs, train_eval="eval") + + if self.args.should_save: + self.store_metrics(metrics, train_eval="eval") + if prediction_loss_only: + return (loss.detach(), None, None) + + logits_dict = { + 
"eval_logits/chosen": metrics["eval_logits/chosen"], + "eval_logits/rejected": metrics["eval_logits/rejected"], + } + + logits = tuple(v for k, v in logits_dict.items() if k not in ignore_keys) + logits = paddle.to_tensor(logits) + labels = paddle.zeros(logits.shape[0]) + return (loss.detach(), logits, labels) + + def store_metrics(self, metrics, train_eval="train"): + """store_metrics""" + for key, value in metrics.items(): + self._stored_metrics[train_eval][key].append(value) + + def log(self, logs, **kwargs): + """ + Log `logs` on the various objects watching training, including stored metrics. + + Args: + logs (`Dict[str, float]`): + The values to log. + """ + # logs either has 'loss' or 'eval_loss' + train_eval = "train" if "loss" in logs else "eval" + # Add averaged stored metrics to logs + for key, metrics in self._stored_metrics[train_eval].items(): + logs[key] = paddle.to_tensor(metrics).mean().item() + del self._stored_metrics[train_eval] + if self.state.epoch is not None and train_eval == "train": + self.state.epoch *= self.args.num_train_epochs + return super().log(logs, **kwargs) + + def split_response_indexs_for_pipeline(self, batch): + """ + split response indexs for pipeline parallel mode. + """ + batch_response_indexs = [] + response_indexs = None + response_num = [0] * batch["input_ids"].shape[0] + last_batch = -1 + if batch["response_indexs"][0][1] == 0: + use_sparse_head_and_loss_fn = True + else: + use_sparse_head_and_loss_fn = False + last_batch_response_length = 0 + + for response_index in batch["response_indexs"]: + if response_index[0] == last_batch: + response_index -= last_batch_response_length + response_index[0] = 0 + response_indexs.append(response_index) + else: + last_batch += 1 + if use_sparse_head_and_loss_fn: + last_batch_response_length = response_index[1] + if response_indexs is not None: + batch_response_indexs.append(response_indexs) + response_index -= last_batch_response_length + response_index[0] = 0 + response_indexs = [response_index] + response_num[last_batch] += 1 + batch_response_indexs.append(response_indexs) + max_response_num = max(response_num) + for i in range(len(response_num)): + for _ in range(max_response_num - response_num[i]): + batch_response_indexs[i].append(paddle.to_tensor([0, 0, -1, 0], dtype="int64")) + + return paddle.to_tensor(batch_response_indexs) + + def prediction_pipeline_step( + self, + model, + batch, + ): + """ + prediction_step function for pipeline parallel mode. 
+ """ + config_backup = model.micro_batch_size, model.accumulate_steps + model.accumulate_steps = batch["input_ids"].shape[0] + model.micro_batch_size = 1 + if not self.reference_free: + self.ref_model.accumulate_steps = model.accumulate_steps + self.ref_model.micro_batch_size = model.micro_batch_size + # [1, total_response_indexs] -> [bs, response_indexs] + batch["response_indexs"] = self.split_response_indexs_for_pipeline(batch) + batch["reference_chosen_logps"] = None + batch["reference_rejected_logps"] = None + total_response_num = batch["response_indexs"].shape[0] * batch["response_indexs"].shape[1] + + inputs, labels = model._prepare_pipeline_inputs_func(batch) + with paddle.no_grad(): + with self.autocast_smart_context_manager(): + policy_chosen_logps, policy_rejected_logps = model.eval_dpo_batch( + data=[inputs, labels], total_response_num=total_response_num + ) + policy_chosen_logps = paddle.masked_select(policy_chosen_logps, policy_chosen_logps != 0) + policy_rejected_logps = paddle.masked_select(policy_rejected_logps, policy_rejected_logps != 0) + if not self.reference_free: + reference_chosen_logps, reference_rejected_logps = self.ref_model.eval_dpo_batch( + [inputs, labels], total_response_num=total_response_num + ) + reference_chosen_logps = paddle.masked_select(reference_chosen_logps, reference_chosen_logps != 0) + reference_rejected_logps = paddle.masked_select( + reference_rejected_logps, reference_rejected_logps != 0 + ) + else: + reference_chosen_logps, reference_rejected_logps = None, None + + loss = self.dpo_loss( + policy_chosen_logps, + policy_rejected_logps, + reference_chosen_logps, + reference_rejected_logps, + ) + policy_chosen_logps, policy_rejected_logps = policy_chosen_logps.detach(), policy_rejected_logps.detach() + if not self.reference_free: + chosen_rewards = self.args.dpo_beta * (policy_chosen_logps - reference_chosen_logps) + rejected_rewards = self.args.dpo_beta * (policy_rejected_logps - reference_rejected_logps) + else: + chosen_rewards = self.args.dpo_beta * (policy_chosen_logps) + rejected_rewards = self.args.dpo_beta * (policy_rejected_logps) + + reward_accuracies = (chosen_rewards > rejected_rewards).astype(paddle.float32) + metrics = {} + metrics["eval_rewards/chosen"] = chosen_rewards.mean() + metrics["eval_rewards/rejected"] = rejected_rewards.mean() + metrics["eval_rewards/accuracies"] = reward_accuracies.mean() + metrics["eval_rewards/margins"] = (chosen_rewards - rejected_rewards).mean() + metrics["eval_logps/rejected"] = policy_rejected_logps.mean() + metrics["eval_logps/chosen"] = policy_chosen_logps.mean() + for key in metrics: + metrics[key] = self._nested_gather(paddle.tile(metrics[key], repeat_times=[1, 1])).mean().cpu() + if self.args.should_save: + self.store_metrics(metrics, train_eval="eval") + model.micro_batch_size, model.accumulate_steps = config_backup + if not self.reference_free: + self.ref_model.micro_batch_size, self.ref_model.accumulate_steps = config_backup + return (loss, None, None) + + def training_pipeline_step(self, model, inputs): + """ + Perform a training step on a batch of inputs. 
+ """ + # accumulation data + if not hasattr(self, "_pp_data_buffer"): + self._pp_data_buffer = [] + self._pp_data_buffer.append(inputs) + if len(self._pp_data_buffer) != self.args.gradient_accumulation_steps: + return paddle.zeros([]) + response_num = [ + len(self._pp_data_buffer[i]["response_indexs"]) for i in range(self.args.gradient_accumulation_steps) + ] + max_response_num = max(response_num) + for i in range(self.args.gradient_accumulation_steps): + self._pp_data_buffer[i]["response_indexs"] = paddle.concat( + [ + self._pp_data_buffer[i]["response_indexs"], + paddle.to_tensor((max_response_num - response_num[i]) * [[0, 0, -1, 0]], dtype="int64"), + ], + axis=0, + ) + total_response_num = self.args.gradient_accumulation_steps * max_response_num + concatenated_inputs = {} + for key in self._pp_data_buffer[i].keys(): + concatenated_inputs[key] = [ + self._pp_data_buffer[i][key] for i in range(self.args.gradient_accumulation_steps) + ] + concatenated_inputs["reference_chosen_logps"] = None + concatenated_inputs["reference_rejected_logps"] = None + + self._pp_data_buffer = [] + inputs, labels = model._prepare_pipeline_inputs_func(concatenated_inputs) + model_config_backup = model.micro_batch_size, model.accumulate_steps + model.micro_batch_size = self.args.per_device_train_batch_size + model.accumulate_steps = self.args.gradient_accumulation_steps + if not self.reference_free: + ref_model_config_backup = self.ref_model.micro_batch_size, self.ref_model.accumulate_steps + self.ref_model.accumulate_steps = model.accumulate_steps + self.ref_model.micro_batch_size = model.micro_batch_size + with paddle.no_grad(): + with self.autocast_smart_context_manager(): + reference_chosen_logps, reference_rejected_logps = self.ref_model.eval_dpo_batch( + data=[inputs, labels], total_response_num=total_response_num + ) + labels = ( + labels[0], + labels[1], + labels[2], + reference_chosen_logps.split(num_or_sections=model.accumulate_steps, axis=0), + reference_rejected_logps.split(num_or_sections=model.accumulate_steps, axis=0), + ) + train_inputs = [inputs, labels] + train_inputs = model._prepare_training(train_inputs, self.optimizer, self.lr_scheduler) + model.optimizer = None # we do not use `PipelineParallel` to handler optimizer step + model.lr_scheduler = None + with self.autocast_smart_context_manager(): + loss = model.forward_backward_pipeline(train_inputs, self.scaler if self.do_grad_scaling else None) + model.micro_batch_size, model.accumulate_steps = model_config_backup + if not self.reference_free: + self.ref_model.micro_batch_size, self.ref_model.accumulate_steps = ref_model_config_backup + return loss.detach() + + +def prepare_pipeline_dpo_inputs_func(inputs): + """Prepare pipeline inputs""" + if "attention_mask" in inputs: + first_stage_keys = [ + "input_ids", + "attention_mask", + "position_ids", + ] + else: + first_stage_keys = [ + "input_ids", + "attn_mask_startend_row_indices", + "position_ids", + ] + + last_stage_keys = [ + "chosen_labels", + "rejected_labels", + "response_indexs", + "reference_chosen_logps", + "reference_rejected_logps", + ] + + def get_expected_keys(inputs, keys): + ret = tuple([inputs.pop(k) for k in keys if k in inputs]) + if len(ret) == 1: + ret = ret[0] + return ret + + if type(inputs) is dict or type(inputs) is OrderedDict: + return [ + get_expected_keys(inputs, first_stage_keys), + get_expected_keys(inputs, last_stage_keys), + ] + + keys = list(inputs[0].keys()) + inputs_batch = {key: [data.pop(key) for data in inputs] for key in keys} + return [ + 
get_expected_keys(inputs_batch, first_stage_keys), + get_expected_keys(inputs_batch, last_stage_keys), + ] + + +def eval_dpo_batch(self, data, total_response_num): + """eval_dpo_batch""" + # reset the virtual pp rank for each run + self.set_virtual_pipeline_rank(0) + + self._layers.eval() + + # store data id for micro_batch + self.micro_batch_id = 0 + + # store total loss of entire batch + self.total_loss = None + + startup_steps = self.num_stages - self.stage_id - 1 + startup_steps = min(startup_steps, self.accumulate_steps) + steady_steps = self.accumulate_steps - startup_steps + + input_buffers = [] + output_buffers = [] + + # convert to micro dataset + micro_dataset = self._wrap_data(data) + + for step_id in range(startup_steps): + input_tensor = self._p2p_helper.recv_forward(self.is_pipeline_first_stage()) + + output_tensor = self._forward_step(input_tensor, micro_dataset) + self._p2p_helper.send_forward(output_tensor, self.is_pipeline_last_stage(), skip_check_meta=True) + + input_buffers.append(input_tensor) + output_buffers.append(output_tensor) + + if steady_steps > 0: + input_tensor = self._p2p_helper.recv_forward(self.is_pipeline_first_stage()) + + for i in range(steady_steps): + last_iter = i == (steady_steps - 1) + + output_tensor = self._forward_step(input_tensor, micro_dataset) + self._p2p_helper.send_forward(output_tensor, self.is_pipeline_last_stage(), skip_check_meta=True) + + input_buffers.append(input_tensor) + output_buffers.append(output_tensor) + + if not last_iter: + input_tensor = self._p2p_helper.recv_forward(self.is_pipeline_first_stage()) + return self.broadcast_pp_final_output(output_buffers, total_response_num) + + +def _forward_step(self, input_tensor, micro_dataset, chunk_id=None): + if self._enable_timer: + self.timers("forward_step").start() + if self.is_pipeline_first_stage(): + input_tensor = next(micro_dataset)[0] + self._check_micro_batch_data_valid(input_tensor) + + assert chunk_id is None or isinstance(chunk_id, int) + + output_tensor = self._layers.forward(input_tensor, chunk_id=chunk_id) + + if self.is_pipeline_last_stage(): + assert self._layers._loss_fn is not None, "loss function should exist to compute loss" + labels = next(micro_dataset)[1] + self._check_micro_batch_data_valid(labels) + for idx, loss_fn in enumerate(self._layers._loss_fn): + output_tensor = loss_fn(output_tensor, labels[0], labels[1], labels[2], labels[3], labels[4]) + if labels[3] is not None and labels[4] is not None: + assert isinstance( + output_tensor, (paddle.Tensor, framework.core.eager.Tensor) + ), "Currently, loss_fn should obtain Paddle.Tensor dtype" + + with paddle.amp.auto_cast(enable=False): + if self.accumulate_steps > 1 and not self._delay_scale_loss: + output_tensor = output_tensor / self.accumulate_steps + + if self.total_loss is None: + self.total_loss = [] + if len(self.total_loss) <= idx: + self.total_loss.append(paddle.zeros_like(output_tensor)) + self.total_loss[idx] += output_tensor.detach() + if idx == self.loss_fn_idx: + loss_tensor = output_tensor + + if self.is_pipeline_first_stage() or self.is_pipeline_last_stage(): + # Only increase micro batch id at virtual first/last pp stage. + # The micro batch id is used to load data, therefore, only increase it when load data. 
+        self.micro_batch_id += 1
+    if self._enable_timer:
+        self.timers("forward_step").stop()
+    if self.is_pipeline_last_stage() and labels[3] is not None and labels[4] is not None:
+        return loss_tensor
+    else:
+        return output_tensor
+
+
+def broadcast_pp_final_output(self, output_buffers, total_response_num):
+    """broadcast_pp_final_output"""
+    # Since the last backward run in interleave will set the virtual rank to 0,
+    # here we need to check last stage ignoring virtual stage.
+    if self.is_pipeline_last_stage(ignore_virtual=True):
+        chosen_logps = paddle.concat([buffer[0] for buffer in output_buffers], axis=0)
+        rejected_logps = paddle.concat([buffer[1] for buffer in output_buffers], axis=0)
+        paddle.distributed.broadcast(chosen_logps, src=self.global_rank, sync_op=True, group=self.pp_group)
+        paddle.distributed.broadcast(rejected_logps, src=self.global_rank, sync_op=True, group=self.pp_group)
+    else:
+        chosen_logps = paddle.zeros(shape=[total_response_num], dtype="float32")
+        rejected_logps = paddle.zeros(shape=[total_response_num], dtype="float32")
+        paddle.distributed.broadcast(
+            chosen_logps,
+            src=self._hcg.get_rank_from_stage(self.num_stages - 1),
+            sync_op=True,
+            group=self.pp_group,
+        )
+        paddle.distributed.broadcast(
+            rejected_logps,
+            src=self._hcg.get_rank_from_stage(self.num_stages - 1),
+            sync_op=True,
+            group=self.pp_group,
+        )
+    return chosen_logps, rejected_logps
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_data.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_data.py
new file mode 100644
index 000000000..7afc3cb52
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_data.py
@@ -0,0 +1,235 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+
+def check_preference_data(data):
+    """Validate one raw preference example (src/tgt turns plus a chosen/rejected response pair)."""
+    if isinstance(data["src"], str):
+        data["src"] = [data["src"]]
+    if isinstance(data["tgt"], str):
+        data["tgt"] = [data["tgt"]]
+    if len(data["src"]) != len(data["tgt"]) + 1:
+        raise ValueError(
+            "The number of src and tgt should differ by 1, but got {} and {}".format(
+                len(data["src"]), len(data["tgt"])
+            )
+        )
+    if (len(data["response"]) != 2) or (len(data["response"]) != len(data["sort"])):
+        raise ValueError(
+            "The number of response and sort should be 2, but got {} and {}".format(
+                len(data["response"]), len(data["sort"])
+            )
+        )
+    if len(data["response"][0]) == 0 or len(data["response"][1]) == 0:
+        raise ValueError(f"The response should not be empty, but got {data}.")
+    if data["sort"][0] == data["sort"][1]:
+        raise ValueError("The two sort values should be different.")
+
+    return data
+
+
+def preprocess_preference_data(data, tokenizer, data_args, model_args):
+    """Convert raw format example to Example."""
+    # 1.
Check data format + data = check_preference_data(data) + + if data["sort"][0] > data["sort"][1]: + chosen = data["response"][0] + rejected = data["response"][1] + else: + chosen = data["response"][1] + rejected = data["response"][0] + chosen_encode_tokens = [] + for idx in range(len(data["src"])): + if idx < len(data["tgt"]): + if tokenizer.chat_template is not None: + chosen_encode_tokens.append( + [ + data["src"][idx].strip(), + data["tgt"][idx].strip(), + ] + ) + else: + chosen_encode_tokens.append( + [ + tokenizer.encode(data["src"][idx].strip(), add_special_tokens=True)["input_ids"], + tokenizer.encode(data["tgt"][idx].strip(), add_special_tokens=False)["input_ids"] + + [tokenizer.eos_token_id], + ] + ) + else: + if tokenizer.chat_template is not None: + chosen_encode_tokens.append( + [ + data["src"][idx].strip(), + chosen.strip(), + ] + ) + else: + chosen_encode_tokens.append( + [ + tokenizer.encode(data["src"][idx].strip(), add_special_tokens=True)["input_ids"], + tokenizer.encode(chosen.strip(), add_special_tokens=False)["input_ids"] + + [tokenizer.eos_token_id], + ] + ) + if tokenizer.chat_template is not None: + chat_input_list = chosen_encode_tokens + chosen_encode_tokens = tokenizer.encode_chat_inputs(chat_input_list)["conversations"] + # convert to rejected chosen_encode_tokens + chat_input_list[-1][-1] = rejected.strip() + rejected_encode_tokens = tokenizer.encode_chat_inputs(chat_input_list)["conversations"] + + """Post process sequence: tokenization & truncation.""" + tokens_prompt = chosen_encode_tokens[-1][0][:-1] + eos_token_id = chosen_encode_tokens[-1][-1][-1] + tokens_chosen = chosen_encode_tokens[-1][0][-1:] + chosen_encode_tokens[-1][-1][:-1] + tokens_rejected = chosen_encode_tokens[-1][0][-1:] + rejected_encode_tokens[-1][-1][:-1] + else: + eos_token_id = tokenizer.eos_token_id + tokens_prompt = chosen_encode_tokens[-1][0][:-1] + tokens_chosen = ( + chosen_encode_tokens[-1][0][-1:] + tokenizer.encode(chosen.strip(), add_special_tokens=False)["input_ids"] + ) + tokens_rejected = ( + chosen_encode_tokens[-1][0][-1:] + + tokenizer.encode(rejected.strip(), add_special_tokens=False)["input_ids"] + ) + + if len(tokens_prompt) + len(tokens_chosen) + len(tokens_rejected) > data_args.max_seq_len: + # truncate prompt + tokens_prompt = tokens_prompt[-data_args.max_prompt_len :] + if (len(tokens_prompt) + len(tokens_chosen) + len(tokens_rejected)) > data_args.max_seq_len: + max_response_len = data_args.max_seq_len - len(tokens_prompt) + # 按比例截断 + max_chosen_len = int(len(tokens_chosen) / (len(tokens_chosen) + len(tokens_rejected)) * max_response_len) + max_rejected_len = max_response_len - max_chosen_len + tokens_chosen = tokens_chosen[:max_chosen_len] + tokens_rejected = tokens_rejected[:max_rejected_len] + + cur_len = len(tokens_prompt) + len(tokens_chosen) + len(tokens_rejected) + turn_index = len(chosen_encode_tokens) - 2 + + # append former dialog contents + while turn_index >= 0: + tokens_src = chosen_encode_tokens[turn_index][0] + tokens_target = chosen_encode_tokens[turn_index][1] + turn_index -= 1 + + if len(tokens_src) + len(tokens_target) > data_args.max_seq_len - cur_len: + break + tokens_prompt = tokens_src + tokens_target + tokens_prompt + cur_len += len(tokens_src) + len(tokens_target) + + input_ids = tokens_prompt + tokens_chosen + tokens_rejected + prompt_len = len(tokens_prompt) + chosen_len = len(tokens_chosen) + rejected_len = len(tokens_rejected) + seq_len = len(input_ids) + # make position ids & labels + + position_ids = ( + list(range(prompt_len)) # 
prompt + + list(range(prompt_len, prompt_len + chosen_len)) # chosen + + list(range(prompt_len, prompt_len + rejected_len)) # rejected + ) + chosen_labels = [0] * prompt_len + tokens_chosen[1:] + [eos_token_id] + [0] * rejected_len + rejected_labels = [0] * prompt_len + [0] * chosen_len + tokens_rejected[1:] + [eos_token_id] + + # response index + response_indexs = [prompt_len, prompt_len + chosen_len, seq_len] + output_dict = { + "input_ids": input_ids, + "position_ids": position_ids, + "chosen_labels": chosen_labels, + "rejected_labels": rejected_labels, + "response_indexs": response_indexs, + } + + # attention mask + if model_args.flash_mask: + output_dict["attn_mask_startend_row_indices"] = ( + [seq_len] * prompt_len + [prompt_len + chosen_len] * chosen_len + [seq_len] * rejected_len + ) + else: + attention_mask = np.tri(seq_len, seq_len, dtype=bool) + attention_mask[(prompt_len + chosen_len) :, prompt_len : (prompt_len + chosen_len)] = False + output_dict["attention_mask"] = attention_mask + return output_dict + + +def preference_collate_fn(batch, max_seq_len=None): + """Convert batch data into tensor.""" + if max_seq_len is None: + raise ValueError("max_seq_len is None.") + + input_dict = { + "input_ids": [], + "position_ids": [], + "chosen_labels": [], + "rejected_labels": [], + "response_indexs": [], + } + sequence = batch[0] + if "attn_mask_startend_row_indices" in sequence: + input_dict["attn_mask_startend_row_indices"] = [] + use_attn_mask_startend_row_indices = True + elif "attention_mask" in sequence: + input_dict["attention_mask"] = [] + use_attn_mask_startend_row_indices = False + else: + raise ValueError("attention_mask and attn_mask_startend_row_indices are both None.") + + for i, sequence in enumerate(batch): + difference = max_seq_len - len(sequence["input_ids"]) + + input_dict["input_ids"].append(sequence["input_ids"] + [0] * difference) + input_dict["position_ids"].append(sequence["position_ids"] + [0] * difference) + input_dict["chosen_labels"].append(sequence["chosen_labels"] + [0] * difference) + input_dict["rejected_labels"].append(sequence["rejected_labels"] + [0] * difference) + if use_attn_mask_startend_row_indices: + input_dict["attn_mask_startend_row_indices"].append( + [ + sequence["attn_mask_startend_row_indices"] + + [sequence["attn_mask_startend_row_indices"][-1]] * difference + ] + ) + else: + input_dict["attention_mask"].append( + np.pad( + sequence["attention_mask"], + pad_width=((0, 0), (0, difference), (0, difference)), + mode="constant", + constant_values=False, + ) + ) + + for ri in sequence["response_indexs"]: + input_dict["response_indexs"].append( + [ + i, # bs + ri[0], # chosen_response_start_index + ri[1], # rejeted_response_start_index + ri[2], # rejeted_response_end_index + 1 + ] + ) + for key in input_dict: + if key == "attention_mask": + input_dict[key] = np.array(input_dict[key], dtype=bool) + elif key == "attn_mask_startend_row_indices": + input_dict[key] = np.array(input_dict[key], dtype=np.int32) + else: + input_dict[key] = np.array(input_dict[key]) + return input_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_utils.py new file mode 100644 index 000000000..541238807 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/trl/trl_utils.py @@ -0,0 +1,49 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def calculate_effective_tokens(training_args, train_dataset, max_seq_len):
+    """
+    Calculate the effective tokens during training.
+    """
+    total_effective_tokens = 0
+
+    # Fall back to a degree of 1 when data parallelism is not configured.
+    try:
+        data_parallel_degree = training_args.data_parallel_degree
+    except AttributeError:
+        data_parallel_degree = 1
+    if training_args.sharding_parallel_degree > 1:
+        sharding_parallel_degree = training_args.sharding_parallel_degree
+    else:
+        sharding_parallel_degree = 1
+    if training_args.max_steps > 0:
+        total_batch = (
+            training_args.max_steps
+            * training_args.per_device_train_batch_size
+            * training_args.gradient_accumulation_steps
+            * sharding_parallel_degree
+            * data_parallel_degree
+        )
+        for i, data in enumerate(train_dataset):
+            if i == total_batch:
+                break
+            total_effective_tokens += len(data["input_ids"])
+        total_tokens = total_batch * max_seq_len
+    else:
+        for i, data in enumerate(train_dataset):
+            total_effective_tokens += len(data["input_ids"])
+        total_tokens = (i + 1) * max_seq_len
+    total_effective_tokens *= training_args.num_train_epochs
+    total_tokens *= training_args.num_train_epochs
+    return total_effective_tokens, total_tokens
diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/__init__.py
new file mode 100644
index 000000000..5e04e7f0f
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/__init__.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import contextlib + +import paddle + +from .batch_sampler import * +from .env import CONFIG_NAME, GENERATION_CONFIG_NAME, LEGACY_CONFIG_NAME +from .import_utils import install_package, uninstall_package +from .initializer import to +from .serialization import load_torch + +# hack impl for EagerParamBase to function +# https://github.com/PaddlePaddle/Paddle/blob/fa44ea5cf2988cd28605aedfb5f2002a63018df7/python/paddle/nn/layer/layers.py#L2077 +paddle.framework.io.EagerParamBase.to = to + + +@contextlib.contextmanager +def device_guard(device="cpu", dev_id=0): + origin_device = paddle.device.get_device() + if device == "cpu": + paddle.set_device(device) + elif device in ["gpu", "xpu", "npu"]: + paddle.set_device("{}:{}".format(device, dev_id)) + try: + yield + finally: + paddle.set_device(origin_device) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/batch_sampler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/batch_sampler.py new file mode 100644 index 000000000..619904a6d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/batch_sampler.py @@ -0,0 +1,182 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division, print_function + +import paddle + +__all__ = ["DistributedBatchSampler"] + + +class DistributedBatchSampler(paddle.io.BatchSampler): + """Sampler that restricts data loading to a subset of the dataset. + + In such case, each process can pass a DistributedBatchSampler instance + as a DataLoader sampler, and load a subset of the original dataset that + is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Args: + dataset(paddle.io.Dataset): this could be a `paddle.io.Dataset` implement + or other python object which implemented + `__len__` for BatchSampler to get sample + number of data source. + batch_size(int): sample indice number in a mini-batch indices. + num_replicas(int, optional): porcess number in distributed training. + If :attr:`num_replicas` is None, :attr:`num_replicas` will be + retrieved from :code:`paddle.distributed.ParallenEnv`. + Default None. + rank(int, optional): the rank of the current process among :attr:`num_replicas` + processes. If :attr:`rank` is None, :attr:`rank` is retrieved from + :code:`paddle.distributed.ParallenEnv`. Default None. + shuffle(bool): whther to shuffle indices order before genrating + batch indices. Default False. + drop_last(bool): whether drop the last incomplete batch dataset size + is not divisible by the batch size. Default False + + Examples: + .. 
code-block:: python + + import numpy as np + + from paddle.io import Dataset, DistributedBatchSampler + + # init with dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([784]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = RandomDataset(100) + sampler = DistributedBatchSampler(dataset, batch_size=64) + + for data in sampler: + # do something + break + """ + + def __init__( + self, dataset, batch_size, num_replicas=None, rank=None, shuffle=False, drop_last=False, consumed_samples=0 + ): + self.dataset = dataset + + assert isinstance(batch_size, int) and batch_size > 0, "batch_size should be a positive integer" + self.batch_size = batch_size + assert isinstance(shuffle, bool), "shuffle should be a boolean value" + self.shuffle = shuffle + assert isinstance(drop_last, bool), "drop_last should be a boolean number" + + from paddle.distributed import ParallelEnv + + if num_replicas is not None: + assert isinstance(num_replicas, int) and num_replicas > 0, "num_replicas should be a positive integer" + self.nranks = num_replicas + else: + self.nranks = ParallelEnv().nranks + + if rank is not None: + assert isinstance(rank, int) and rank >= 0, "rank should be a non-negative integer" + self.local_rank = rank + else: + self.local_rank = ParallelEnv().local_rank + + self.drop_last = drop_last + self.epoch = 0 + + self.consumed_samples = consumed_samples + if self.dataset is None: + # In pre-training mode when using distributed dataloader, the input dataset can be None. We should handle this situation. + self.num_samples = 0 + else: + self.num_samples = int(len(self.dataset) * 1.0 / self.nranks) + self.total_size = self.num_samples * self.nranks + + def get_start_end_idx(self): + start_idx = self.local_rank * self.batch_size + end_idx = start_idx + self.batch_size + return start_idx, end_idx + + def __iter__(self): + assert ( + self.consumed_samples % self.nranks == 0 + ), "The consumed_samples should be divided by nranks. consumed_samples=%d, nranks=%s" % ( + self.consumed_samples, + self.nranks, + ) + self.remain_num_samples = int((len(self.dataset) - self.consumed_samples) * 1.0 / self.nranks) + self.remain_total_size = self.remain_num_samples * self.nranks + self.batch_size_times_rank_size = self.batch_size * self.nranks + + batch_indices = [] + for idx in range(self.consumed_samples, self.total_size): + batch_indices.append(idx) + if len(batch_indices) == self.batch_size_times_rank_size: + start_idx, end_idx = self.get_start_end_idx() + yield batch_indices[start_idx:end_idx] + batch_indices = [] + if not self.drop_last and len(batch_indices) > 0: + yield batch_indices + + def __len__(self): + num_samples = self.num_samples + num_samples += int(not self.drop_last) * (self.batch_size - 1) + return num_samples // self.batch_size + + def set_epoch(self, epoch=0, consumed_samples=0): + """ + Sets the epoch number. When :attr:`shuffle=True`, this number is used + as seeds of random numbers. By default, users may not set this, all + replicas (workers) use a different random ordering for each epoch. + If set same number at each epoch, this sampler will yield the same + ordering at all epoches. + + Arguments: + epoch (int): Epoch number. + + Examples: + .. 
code-block:: python + + from paddle.io import Dataset, DistributedBatchSampler + + # init with dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([784]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = RandomDataset(100) + sampler = DistributedBatchSampler(dataset, batch_size=64) + + for epoch in range(10): + sampler.set_epoch(epoch) + """ + self.epoch = epoch + # if we reset the epoch, the consumed_samples should be set to 0. + self.consumed_samples = consumed_samples diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/converter.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/converter.py new file mode 100644 index 000000000..c4f2891c7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/converter.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +# FIXME(wj-Mcat): this converter will be deprecated after V2.5.2 +from paddlenlp.transformers.conversion_utils import * # noqa: F401, F403 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/distributed.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/distributed.py new file mode 100644 index 000000000..9ccc7fbd6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/distributed.py @@ -0,0 +1,222 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Any, Union + +import numpy as np +import paddle +import paddle.distributed as distributed + +from . import device_guard + +world_size = distributed.get_world_size() + + +def convert_file_size_to_int(size: Union[int, str]): + """ + Converts a size expressed as a string with digits an unit (like `"5MB"`) to an integer (in bytes). + Args: + size (`int` or `str`): The size to convert. Will be directly returned if an `int`. 
+ Example: + ```py + >>> convert_file_size_to_int("1MiB") + 1048576 + ``` + """ + if isinstance(size, int): + return size + if size.upper().endswith("GIB"): + return int(size[:-3]) * (2**30) + if size.upper().endswith("MIB"): + return int(size[:-3]) * (2**20) + if size.upper().endswith("KIB"): + return int(size[:-3]) * (2**10) + if size.upper().endswith("GB"): + int_size = int(size[:-2]) * (10**9) + return int_size // 8 if size.endswith("b") else int_size + if size.upper().endswith("MB"): + int_size = int(size[:-2]) * (10**6) + return int_size // 8 if size.endswith("b") else int_size + if size.upper().endswith("KB"): + int_size = int(size[:-2]) * (10**3) + return int_size // 8 if size.endswith("b") else int_size + raise ValueError("`size` is not in a valid format. Use an integer followed by the unit, e.g., '5GB'.") + + +def reduce_tensor(tensor, buffer_size="32MiB"): + if tensor.dtype == paddle.int8: + numel = np.prod(tensor.shape) + else: + numel = int(paddle.numel(tensor).item()) + # dtype = str(tensor.dtype) + # numel_bits = numel * dtype_byte_size(tensor.dtype) + buffer_size = convert_file_size_to_int(buffer_size) + tensor.reshape_([-1]) + + send_size = buffer_size // dtype_byte_size(tensor.dtype) + + for x in range(0, numel, send_size): + part_tensor = tensor[x : min(numel, x + send_size)] + yield part_tensor, (x, min(numel, x + send_size)) + + +def dtype_byte_size(dtype): + """ + Returns the size (in bytes) occupied by one parameter of type `dtype`. + Example: + ```py + >>> dtype_byte_size(torch.float32) + 4 + ``` + """ + if dtype == paddle.bool: + return 1 / 8 + bit_search = re.search(r"[^\d](\d+)$", str(dtype)) + if bit_search is None: + raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") + bit_size = int(bit_search.groups()[0]) + return bit_size // 8 + + +@paddle.no_grad() +def distributed_gather(tensor: Any, dst: int = 0, group=None, offload=False) -> Any: + try: + if isinstance(tensor, (tuple, list)): + return type(tensor)(distributed_gather(t, dst, group, offload) for t in tensor) + if isinstance(tensor, dict): + return {k: distributed_gather(v, dst, group, offload) for k, v in tensor.items()} + + output_tensors = None + + is_dst = dst == distributed.get_rank(group=group) + if is_dst: + if offload: + output_tensors = [[] for _ in range(distributed.get_world_size(group=group))] + # with device_guard("cpu"): + # output_tensors = [paddle.empty_like(tensor) for _ in range(distributed.get_world_size())] + else: + output_tensors = [paddle.empty_like(tensor) for _ in range(distributed.get_world_size(group=group))] + # for scalar tensor ? 
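+                # Editor's annotation (descriptive, added for clarity): 0-D scalar
+                # tensors are expanded to shape [1] via `t[None]` so every receive
+                # buffer handed to the gather collective has at least one dimension;
+                # the offload branch below instead streams the flattened tensor in
+                # buffer-sized chunks from `reduce_tensor` and stores them as numpy
+                # arrays on the destination rank, keeping device memory bounded.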
+ output_tensors = [t if len(t.shape) > 0 else t[None] for t in output_tensors] + + if offload: + origin_shape = tensor.shape + tensor.reshape_([-1]) + + for slice_tensor, index in reduce_tensor(tensor): + slice_output_tensors = None + if distributed.get_rank(group=group) == dst: + slice_output_tensors = [ + paddle.empty_like(slice_tensor) for _ in range(distributed.get_world_size(group=group)) + ] + paddle.distributed.communication.stream.gather( + slice_tensor, + slice_output_tensors, + dst=group.ranks[dst] if group else dst, + group=group, + sync_op=True, + use_calc_stream=False, + ) + + if is_dst: + for i in range(len(output_tensors)): + output_tensors[i].append(slice_output_tensors[i].cpu().numpy()) + + tensor.reshape_(origin_shape) + if is_dst: + with device_guard("cpu"): + new_output_tensors = [] + for x in output_tensors: + t = np.concatenate(x) + t = t.reshape(origin_shape) + new_output_tensors.append(t) + output_tensors = new_output_tensors + + else: + paddle.distributed.communication.stream.gather( + tensor, + output_tensors, + dst=group.ranks[dst] if group else dst, + group=group, + sync_op=True, + use_calc_stream=False, + ) + + return output_tensors + + except AssertionError: + raise AssertionError("Not currently using distributed training") + + +@paddle.no_grad() +def distributed_allgather(tensor: Any, group=None, offload=False): + """nested all gather function with offload + + Args: + tensor (Any): the desired tensor, list of tensor, dict of tensor to allgather. + group (_type_, optional): the communication group. Defaults to None. + offload (bool, optional): If True, we offload the received tensor to cpu/(numpy). Defaults to False. + + Raises: + AssertionError: Unexpected errors. + + Returns: + tensor list: list of all gathered tensors + """ + try: + if isinstance(tensor, (tuple, list)): + return type(tensor)(distributed_allgather(t, group, offload) for t in tensor) + if isinstance(tensor, dict): + return {k: distributed_allgather(v, group, offload) for k, v in tensor.items()} + + output_tensors = [] + + if offload: + with device_guard("cpu"): + output_tensors = [paddle.empty_like(tensor) for _ in range(distributed.get_world_size(group))] + else: + output_tensors = [paddle.empty_like(tensor) for _ in range(distributed.get_world_size(group))] + + # for scalar tensor ? 
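+        # Editor's annotation (descriptive, added for clarity): as in distributed_gather,
+        # 0-D scalars are lifted to shape [1] before the collective. When `offload` is set,
+        # the code below flattens both the source tensor and the CPU-resident output
+        # buffers, all-gathers one `reduce_tensor` chunk at a time, and copies each received
+        # chunk into the matching slice `y[index[0]:index[1]]` under `device_guard("cpu")`.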
+ output_tensors = [t if len(t.shape) > 0 else t[None] for t in output_tensors] + + if offload: + origin_shape = tensor.shape + tensor.reshape_([-1]) + for x in output_tensors: + x.reshape_([-1]) + + for slice_tensor, index in reduce_tensor(tensor): + # paddle.empty_like(slice_tensor) + slice_output_tensors = [ + paddle.empty_like(slice_tensor) for _ in range(distributed.get_world_size(group)) + ] + distributed.all_gather(slice_output_tensors, slice_tensor, group=group) + for x, y in zip(slice_output_tensors, output_tensors): + with device_guard("cpu"): + # x.cpu() + y[index[0] : index[1]] = x.cpu() + + tensor.reshape_(origin_shape) + for x in output_tensors: + x.reshape_(origin_shape) + + else: + distributed.all_gather(output_tensors, tensor, group=group) + + return output_tensors + + except AssertionError: + raise AssertionError("Not currently using distributed training") diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/doc_parser.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/doc_parser.py new file mode 100644 index 000000000..6d9552f72 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/doc_parser.py @@ -0,0 +1,432 @@ +# coding=utf-8 +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
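+
+# Editor's sketch (not part of the original patch): the helpers in utils/distributed.py
+# above are typically called after a step to collect tensors across ranks. A hedged
+# example, assuming an initialized paddle.distributed process group and using
+# distributed_gather from the module above; the tensor name and shape are hypothetical:
+#
+#     import paddle
+#
+#     local_stats = paddle.randn([1024])
+#     gathered = distributed_gather(local_stats, dst=0, offload=True)
+#     # rank 0 receives a list of numpy arrays (offload=True); other ranks get None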
+ +import base64 +import mimetypes +import os +import random +import re +from io import BytesIO + +import numpy as np +import requests +from packaging.version import Version +from PIL import Image, ImageDraw, ImageOps + +from .image_utils import np2base64 +from .log import logger + + +class DocParser(object): + """DocParser""" + + def __init__(self, ocr_lang="ch", layout_analysis=False, pdf_parser_config=None, use_gpu=None, device_id=None): + self.ocr_lang = ocr_lang + self.use_angle_cls = False + self.layout_analysis = layout_analysis + self.pdf_parser_config = pdf_parser_config + self.ocr_infer_model = None + self.use_gpu = use_gpu + self.device_id = device_id + + def parse(self, doc, expand_to_a4_size=False, do_ocr=True): + """ + parse + """ + doc_type = mimetypes.guess_type(doc["doc"])[0] + + if not doc_type or doc_type.startswith("image"): + image = self.read_image(doc["doc"]) + elif doc_type == "application/pdf": + image = self.read_pdf(doc["doc"]) + offset_x, offset_y = 0, 0 + if expand_to_a4_size: + image, offset_x, offset_y = self.expand_image_to_a4_size(image, center=True) + img_h, img_w = image.shape[:2] + doc["image"] = np2base64(image) + doc["offset_x"] = offset_x + doc["offset_y"] = offset_y + doc["img_w"] = img_w + doc["img_h"] = img_h + if do_ocr: + ocr_result = self.ocr(image) + if expand_to_a4_size: + layout = [] + for segment in ocr_result: + box = segment[0] + org_box = [ + max(box[0] - offset_x, 0), + max(box[1] - offset_y, 0), + max(box[2] - offset_x, 0), + max(box[3] - offset_y, 0), + ] + if len(segment) == 2: + layout.append((org_box, segment[1])) + elif len(segment) == 3: + layout.append((org_box, segment[1], segment[2])) + doc["layout"] = layout + else: + doc["layout"] = ocr_result + return doc + + def __call__(self, *args, **kwargs): + """ + Call parse + """ + return self.parse(*args, **kwargs) + + def ocr(self, image, det=True, rec=True, cls=None): + """ + Call ocr for an image + """ + + def _get_box(box): + box = [ + min(box[0][0], box[3][0]), # x1 + min(box[0][1], box[1][1]), # y1 + max(box[1][0], box[2][0]), # x2 + max(box[2][1], box[3][1]), # y2 + ] + return box + + def _normal_box(box): + # Ensure the height and width of bbox are greater than zero + if box[3] - box[1] < 0 or box[2] - box[0] < 0: + return False + return True + + def _is_ch(s): + for ch in s: + if "\u4e00" <= ch <= "\u9fff": + return True + return False + + if self.ocr_infer_model is None: + self.init_ocr_inference() + if cls is None: + cls = self.use_angle_cls + remove = False if self.ppocr_version <= Version("2.6.0.1") else True + + layout = [] + if not self.layout_analysis: + ocr_result = self.ocr_infer_model.ocr(image, det, rec, cls) + ocr_result = ocr_result[0] if remove else ocr_result + for segment in ocr_result: + box = segment[0] + box = _get_box(box) + if not _normal_box(box): + continue + text = segment[1][0] + layout.append((box, text)) + else: + layout_result = self.layout_analysis_engine(image) + for region in layout_result: + if region["type"] != "table": + ocr_result = region["res"] + for segment in ocr_result: + box = segment["text_region"] + box = _get_box(box) + if not _normal_box(box): + continue + text = segment["text"] + layout.append((box, text, region["type"])) + else: + bbox = region["bbox"] + table_result = region["res"] + html = table_result["html"] + cell_bbox = table_result["cell_bbox"] + table_list = [] + lines = re.findall("(.*?)", html) + for line in lines: + table_list.extend(re.findall("(.*?)", line)) + for cell_box, text in zip(cell_bbox, table_list): + if 
self.ocr_lang == "ch": + box = [ + bbox[0] + cell_box[0], + bbox[1] + cell_box[1], + bbox[0] + cell_box[4], + bbox[1] + cell_box[5], + ] + else: + box = [ + bbox[0] + cell_box[0], + bbox[1] + cell_box[1], + bbox[0] + cell_box[2], + bbox[1] + cell_box[3], + ] + if not _normal_box(box): + continue + if _is_ch(text): + text = text.replace(" ", "") + layout.append((box, text, region["type"])) + return layout + + @classmethod + def _get_buffer(self, data, file_like=False): + buff = None + if len(data) < 1024: + if os.path.exists(data): + buff = open(data, "rb").read() + elif data.startswith("http://") or data.startswith("https://"): + resp = requests.get(data, stream=True) + if not resp.ok: + raise RuntimeError("Failed to download the file from {}".format(data)) + buff = resp.raw.read() + else: + raise FileNotFoundError("Image file {} not found!".format(data)) + if buff is None: + buff = base64.b64decode(data) + if buff and file_like: + return BytesIO(buff) + return buff + + @classmethod + def read_image(self, image): + """ + read image to np.ndarray + """ + image_buff = self._get_buffer(image) + + # Use exif_transpose to correct orientation + _image = np.array(ImageOps.exif_transpose(Image.open(BytesIO(image_buff)).convert("RGB"))) + return _image + + @classmethod + def read_pdf(self, pdf, password=None): + """ + read pdf + """ + try: + import fitz + except ImportError: + raise RuntimeError( + "Need PyMuPDF to process pdf input. " "Please install module by: python3 -m pip install pymupdf" + ) + if isinstance(pdf, fitz.Document): + return pdf + pdf_buff = self._get_buffer(pdf) + if not pdf_buff: + logger.warning("Failed to read pdf: %s...", pdf[:32]) + return None + pdf_doc = fitz.Document(stream=pdf_buff, filetype="pdf") + if pdf_doc.needs_pass: + if pdf_doc.authenticate(password) == 0: + raise ValueError("The password of pdf is incorrect.") + + if pdf_doc.page_count > 1: + logger.warning("Currently only parse the first page for PDF input with more than one page.") + + page = pdf_doc.load_page(0) + # The original image is shrunk when convertd from PDF by fitz, so we scale the image size by 10 times + matrix = fitz.Matrix(10, 10) + image = np.array(self.get_page_image(page, matrix).convert("RGB")) + return image + + @classmethod + def get_page_image(self, page, matrix): + """ + get page image + """ + pix = page.get_pixmap(matrix=matrix) + image_buff = pix.pil_tobytes("jpeg") + return Image.open(BytesIO(image_buff)) + + def init_ocr_inference(self): + """ + init ocr inference + """ + if self.ocr_infer_model is not None: + logger.warning("ocr model has already been initialized") + return + + try: + import paddleocr + except ImportError: + raise RuntimeError( + "Need paddleocr to process image input. 
Please install module by: python3 -m pip install paddleocr" + ) + self.ppocr_version = Version(paddleocr.__version__) + + if not self.layout_analysis: + from paddleocr import PaddleOCR + + self.ocr_infer_model = PaddleOCR(show_log=False, lang=self.ocr_lang) + else: + from paddleocr import PPStructure + + self.layout_analysis_engine = PPStructure(table=True, ocr=True, show_log=False, lang=self.ocr_lang) + + @classmethod + def _normalize_box(self, box, old_size, new_size, offset_x=0, offset_y=0): + """normalize box""" + return [ + int((box[0] + offset_x) * new_size[0] / old_size[0]), + int((box[1] + offset_y) * new_size[1] / old_size[1]), + int((box[2] + offset_x) * new_size[0] / old_size[0]), + int((box[3] + offset_y) * new_size[1] / old_size[1]), + ] + + @classmethod + def expand_image_to_a4_size(self, image, center=False): + """expand image to a4 size""" + h, w = image.shape[:2] + offset_x, offset_y = 0, 0 + if h * 1.0 / w >= 1.42: + exp_w = int(h / 1.414 - w) + if center: + offset_x = int(exp_w / 2) + exp_img = np.zeros((h, offset_x, 3), dtype="uint8") + exp_img.fill(255) + image = np.hstack([exp_img, image, exp_img]) + else: + exp_img = np.zeros((h, exp_w, 3), dtype="uint8") + exp_img.fill(255) + image = np.hstack([image, exp_img]) + elif h * 1.0 / w <= 1.40: + exp_h = int(w * 1.414 - h) + if center: + offset_y = int(exp_h / 2) + exp_img = np.zeros((offset_y, w, 3), dtype="uint8") + exp_img.fill(255) + image = np.vstack([exp_img, image, exp_img]) + else: + exp_img = np.zeros((exp_h, w, 3), dtype="uint8") + exp_img.fill(255) + image = np.vstack([image, exp_img]) + return image, offset_x, offset_y + + @classmethod + def write_image_with_results( + self, image, layout=None, result=None, save_path=None, return_image=False, format=None, max_size=None + ): + """ + write image with boxes and results + """ + + def _flatten_results(results): + """flatten results""" + is_single = False + if not isinstance(results, list): + results = [results] + is_single = True + flat_results = [] + + def _flatten(result): + flat_result = [] + for key, vals in result.items(): + for val in vals: + new_val = val.copy() + if val.get("relations"): + new_val["relations"] = _flatten(val["relations"]) + new_val["label"] = key + flat_result.append(new_val) + return flat_result + + for result in results: + flat_results.append(_flatten(result)) + if is_single: + return flat_results[0] + return flat_results + + def _write_results(results, color=None, root=True, parent_centers=None): + for segment in results: + if "bbox" not in segment.keys(): + continue + boxes = segment["bbox"] + if not isinstance(boxes[0], list): + boxes = [boxes] + centers = [] + plot_boxes = [] + for box in boxes: + x1, y1, x2, y2 = box + plot_box = [ + (x1, y1), + (x2, y1), + (x2, y2), + (x1, y2), + ] + plot_boxes.append(plot_box) + centers.append(((x2 - x1) / 2 + x1, (y2 - y1) / 2 + y1)) + if root: + while True: + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + if sum(color) < 480: + break + for box in plot_boxes: + draw_render.polygon(box, fill=color) + if parent_centers: + for p_c in parent_centers: + for c in centers: + draw_render.line((p_c[0], p_c[1], c[0], c[1]), fill=125, width=3) + if isinstance(segment, dict) and segment.get("relations"): + _write_results(segment["relations"], color, root=False, parent_centers=centers) + + random.seed(0) + _image = self.read_image(image) + _image = Image.fromarray(np.uint8(_image)) + h, w = _image.height, _image.width + img_render = _image.copy() + draw_render = 
ImageDraw.Draw(img_render) + + if layout: + for segment in layout: + if isinstance(segment, dict): + box = segment["bbox"] + else: + box = segment[0] + box = [ + (box[0], box[1]), + (box[2], box[1]), + (box[2], box[3]), + (box[0], box[3]), + ] + while True: + color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) + if sum(color) < 480: + break + draw_render.polygon(box, fill=color) + + elif result: + flatten_results = _flatten_results(result) + _write_results(flatten_results, color=None, root=True) + + img_render = Image.blend(_image, img_render, 0.3) + img_show = Image.new("RGB", (w, h), (255, 255, 255)) + img_show.paste(img_render, (0, 0, w, h)) + w, h = img_show.width, img_show.height + if max_size and max(w, h) > max_size: + if max(w, h) == h: + new_size = (int(w * max_size / h), max_size) + else: + new_size = (max_size, int(h * max_size / w)) + img_show = img_show.resize(new_size) + + if save_path: + dir_path = os.path.dirname(save_path) + if dir_path and not os.path.isdir(dir_path): + os.makedirs(dir_path) + img_show.save(save_path) + if return_image: + return np.array(img_show) + elif return_image: + return np.array(img_show) + else: + buff = BytesIO() + if format is None: + format = "jpeg" + if format.lower() == "jpg": + format = "jpeg" + img_show.save(buff, format=format, quality=90) + return buff diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/__init__.py new file mode 100644 index 000000000..6f5dad5c8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/__init__.py @@ -0,0 +1,367 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from argparse import ArgumentTypeError +from pathlib import Path +from typing import Dict, Literal, Optional, Union + +from huggingface_hub import _CACHED_NO_EXIST +from huggingface_hub import file_exists as hf_hub_file_exists +from huggingface_hub import hf_hub_download +from huggingface_hub import try_to_load_from_cache as hf_hub_try_to_load_from_cache +from huggingface_hub.utils import ( + EntryNotFoundError, + LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) +from paddle import __version__ +from requests import HTTPError + +from .aistudio_hub_download import ( + aistudio_hub_download, + aistudio_hub_file_exists, + aistudio_hub_try_to_load_from_cache, +) +from .bos_download import bos_download, bos_file_exists, bos_try_to_load_from_cache + + +def strtobool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ArgumentTypeError( + f"Truthy value expected: got {v} but expected one of yes/no, true/false, t/f, y/n, 1/0 (case insensitive)." 
+ ) + + +def resolve_file_path( + repo_id: str = None, + filenames: Union[str, list] = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = "PaddleNLP", + library_version: Optional[str] = __version__, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + etag_timeout: float = 10, + resume_download: bool = False, + token: Union[bool, str, None] = None, + local_files_only: bool = False, + endpoint: Optional[str] = None, + url: Optional[str] = None, + from_aistudio: bool = False, + from_hf_hub: bool = False, + from_bos: bool = True, +) -> str: + """ + This is a general download function, mainly called by the from_pretrained function. + + It supports downloading files from four different download sources, including BOS, AiStudio, + HuggingFace Hub and ModelScope. + + If you want to download a file from ModelScope, you need to set os.environ["from_modelscope"] = "True" + + Args: + repo_id('str'): A path to a folder containing the file, a path of the file, a url or repo name. + filenames('str' or list): Name of the file to be downloaded. If it is a str, the file will be downloaded directly, + if it is a list, it will try to download the file in turn, and when one exists, it will be returned directly. + subfolder('str'): Some repos will exist subfolder. + repo_type('str'): The default is model. + cache_dir('str' or Path): Where to save or load the file after downloading. + url('str'): If it is not None, then it will be downloaded from BOS. + from_aistudio('bool'): If this value is true, it will be downloaded from aistudio. + from_hf_hub('bool'): If this value is true, it will be downloaded from hf hub. + from_bos('bool'): If this value is true, it will be downloaded from bos (default). + + + Returns: + cached_file('str'): The path of file or None. 
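+
+        Resolution order (summary added by the editor): an existing local file or
+        directory is returned directly, then the local cache is probed, and only then
+        is a download attempted from ModelScope / AiStudio / HuggingFace Hub / BOS
+        according to the flags above.
+
+        Example (illustrative; the repo id is only an example):
+            config_path = resolve_file_path("bert-base-uncased", "config.json", from_hf_hub=True)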
+ """ + assert repo_id is not None, "repo_id cannot be None" + assert filenames is not None, "filenames cannot be None" + + if isinstance(filenames, str): + filenames = [filenames] + + download_kwargs = dict( + repo_id=repo_id, + filename=filenames[0], + subfolder=subfolder if subfolder is not None else "", + repo_type=repo_type, + revision=revision, + library_name=library_name, + library_version=library_version, + cache_dir=cache_dir, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + user_agent=user_agent, + force_download=force_download, + proxies=proxies, + etag_timeout=etag_timeout, + resume_download=resume_download, + token=token, + local_files_only=local_files_only, + endpoint=endpoint, + ) + cached_file = None + log_endpoint = "N/A" + # log_filename = os.path.join(download_kwargs["subfolder"], filename) + + # return file path from local file, eg: /cache/path/model_config.json + if os.path.isfile(repo_id): + return repo_id + # return the file path from local dir with filename, eg: /local/path + elif os.path.isdir(repo_id): + for index, filename in enumerate(filenames): + if os.path.exists(os.path.join(repo_id, download_kwargs["subfolder"], filename)): + if not os.path.isfile(os.path.join(repo_id, download_kwargs["subfolder"], filename)): + raise EnvironmentError(f"{repo_id} does not appear to have file named {filename}.") + return os.path.join(repo_id, download_kwargs["subfolder"], filename) + elif index < len(filenames) - 1: + continue + else: + raise FileNotFoundError(f"please make sure one of the {filenames} under the dir {repo_id}") + + # check cache + for filename in filenames: + cache_file_name = bos_aistudio_hf_try_to_load_from_cache( + repo_id, filename, cache_dir, subfolder, revision, repo_type, from_bos, from_aistudio, from_hf_hub + ) + if from_hf_hub and cache_file_name is _CACHED_NO_EXIST: + cache_file_name = None + if cache_file_name is not None: + return cache_file_name + + from_modelscope = strtobool(os.environ.get("from_modelscope", False)) + + # download file from different origins + try: + if filenames[0].startswith("http://") or filenames[0].startswith("https://"): + log_endpoint = "BOS" + download_kwargs["url"] = filenames[0] + download_kwargs["repo_id"] = repo_id + if filenames[0].split("/")[-1].endswith("pdparams"): + download_kwargs["filename"] = "model_state.pdparams" + else: + download_kwargs["filename"] = None + cached_file = bos_download( + **download_kwargs, + ) + return cached_file + + elif from_modelscope: + for index, filename in enumerate(filenames): + try: + from modelscope.hub.file_download import ( + model_file_download as modelscope_download, + ) + + return modelscope_download(repo_id, filename, revision, cache_dir, user_agent, local_files_only) + except Exception: + if index < len(filenames) - 1: + continue + else: + print(f"please make sure one of the {filenames} under the repo {repo_id}") + return None + + elif from_aistudio: + log_endpoint = "Aistudio Hub" + for filename in filenames: + download_kwargs["filename"] = filename + is_available = bos_aistudio_hf_file_exist( + repo_id, + filename, + subfolder=subfolder, + repo_type=repo_type, + revision=revision, + token=token, + endpoint=endpoint, + from_bos=from_bos, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + if is_available: + cached_file = aistudio_hub_download( + **download_kwargs, + ) + if cached_file is not None: + return cached_file + elif from_hf_hub: + log_endpoint = "Huggingface Hub" + for filename in filenames: + download_kwargs["filename"] = 
filename + is_available = bos_aistudio_hf_file_exist( + repo_id, + filename, + subfolder=subfolder, + repo_type=repo_type, + revision=revision, + token=token, + endpoint=endpoint, + from_bos=from_bos, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + if is_available: + cached_file = hf_hub_download( + **download_kwargs, + ) + if cached_file is not None: + return cached_file + else: + log_endpoint = "BOS" + download_kwargs["url"] = url + for filename in filenames: + download_kwargs["filename"] = filename + is_available = bos_aistudio_hf_file_exist( + repo_id, + filename, + subfolder=subfolder, + repo_type=repo_type, + revision=revision, + token=token, + endpoint=endpoint, + from_bos=from_bos, + from_aistudio=from_aistudio, + from_hf_hub=from_hf_hub, + ) + if is_available: + cached_file = bos_download( + **download_kwargs, + ) + if cached_file is not None: + return cached_file + except LocalEntryNotFoundError: + raise EnvironmentError( + "Cannot find the requested files in the cached path and" + " outgoing traffic has been disabled. To enable model look-ups" + " and downloads online, set 'local_files_only' to False." + ) + except RepositoryNotFoundError: + raise EnvironmentError( + f"{repo_id} is not a local folder and is not a valid model identifier " + f"listed on '{log_endpoint}'\nIf this is a private repository, make sure to pass a " + "token having permission to this repo." + ) + except RevisionNotFoundError: + raise EnvironmentError( + f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " + "this model name. Check the model page at " + f"'{log_endpoint}' for available revisions." + ) + except EntryNotFoundError: + raise EnvironmentError(f"Does not appear one of the {filenames} in {repo_id}.") + except HTTPError as err: + raise EnvironmentError(f"There was a specific connection error when trying to load {repo_id}:\n{err}") + except ValueError: + raise EnvironmentError( + f"We couldn't connect to '{log_endpoint}' to load this model, couldn't find it" + f" in the cached files and it looks like {repo_id} is not the path to a" + f" directory containing one of the {filenames} or" + " \nCheckout your internet connection or see how to run the library in offline mode." + ) + except EnvironmentError: + raise EnvironmentError( + f"Can't load the model for '{repo_id}'. If you were trying to load it from " + f"'{log_endpoint}', make sure you don't have a local directory with the same name. 
" + f"Otherwise, make sure '{repo_id}' is the correct path to a directory " + f"containing one of the {filenames}" + ) + + +def bos_aistudio_hf_file_exist( + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Optional[str] = None, + endpoint: Optional[str] = None, + from_bos: bool = True, + from_aistudio: bool = False, + from_hf_hub: bool = False, +): + assert repo_id is not None, "repo_id cannot be None" + assert filename is not None, "filename cannot be None" + + if subfolder is None: + subfolder = "" + filename = os.path.join(subfolder, filename) + if from_aistudio: + out = aistudio_hub_file_exists( + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + token=token, + endpoint=endpoint, + ) + elif from_hf_hub: + out = hf_hub_file_exists( + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + token=token, + ) + else: + out = bos_file_exists( + repo_id=repo_id, + filename=filename, + repo_type=repo_type, + revision=revision, + token=token, # donot need token + endpoint=endpoint, + ) + return out + + +def bos_aistudio_hf_try_to_load_from_cache( + repo_id: str, + filename: str, + cache_dir: Union[str, Path, None] = None, + subfolder: str = None, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + from_bos: bool = True, + from_aistudio: bool = False, + from_hf_hub: bool = False, +): + if subfolder is None: + subfolder = "" + load_kwargs = dict( + repo_id=repo_id, + filename=os.path.join(subfolder, filename), + cache_dir=cache_dir, + revision=revision, + repo_type=repo_type, + ) + if from_aistudio: + return aistudio_hub_try_to_load_from_cache(**load_kwargs) + elif from_hf_hub: + return hf_hub_try_to_load_from_cache(**load_kwargs) + else: + return bos_try_to_load_from_cache(**load_kwargs) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/aistudio_hub_download.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/aistudio_hub_download.py new file mode 100644 index 000000000..df6ee635a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/aistudio_hub_download.py @@ -0,0 +1,728 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import io +import logging +import os +import re +import shutil +import tempfile +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Dict, Generator, Literal, Optional, Union +from urllib.parse import quote + +import requests +from filelock import FileLock +from huggingface_hub.utils import ( + EntryNotFoundError, + FileMetadataError, + GatedRepoError, + HfHubHTTPError, + LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError, +) + +logger = logging.getLogger(__name__) + +from .common import ( + DEFALUT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD, + DEFAULT_ETAG_TIMEOUT, + DEFAULT_REQUEST_TIMEOUT, + AistudioBosFileMetadata, + OfflineModeIsEnabled, + _cache_commit_hash_for_specific_revision, + _check_disk_space, + _chmod_and_replace, + _create_symlink, + _get_pointer_path, + _is_true, + _normalize_etag, + _request_wrapper, + _to_local_dir, + http_get, + raise_for_status, + repo_folder_name, +) + +VERSION = "0.1.5" +ENDPOINT = os.getenv("AISTUDIO_ENDPOINT", "http://git.aistudio.baidu.com") + +AISTUDIO_URL_TEMPLATE = ENDPOINT + "/api/v1/repos/{user_name}/{repo_name}/contents/{filename}" + + +default_home = os.path.join(os.path.expanduser("~"), ".cache") +AISTUDIO_HOME = os.path.expanduser( + os.getenv( + "AISTUDIO_HOME", + os.path.join(os.getenv("XDG_CACHE_HOME", default_home), "paddle"), + ) +) +default_cache_path = os.path.join(AISTUDIO_HOME, "aistudio") +AISTUDIO_HUB_CACHE = os.getenv("AISTUDIO_HUB_CACHE", default_cache_path) + + +DEFAULT_REVISION = "master" +REPO_TYPE_MODEL = "model" +REPO_TYPES = [None, REPO_TYPE_MODEL] + + +REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") + + +# TOKEN +AISTUDIO_TOKEN_PATH = os.path.join(AISTUDIO_HOME, "token") +AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN: bool = _is_true(os.environ.get("AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN")) + + +class LocalTokenNotFoundError(EnvironmentError): + """Raised if local token is required but not found.""" + + +def _clean_token(token: Optional[str]) -> Optional[str]: + """Clean token by removing trailing and leading spaces and newlines. + + If token is an empty string, return None. + """ + if token is None: + return None + return token.replace("\r", "").replace("\n", "").strip() or None + + +def _get_token_from_environment() -> Optional[str]: + return _clean_token(os.environ.get("AISTUDIO_ACCESS_TOKEN") or os.environ.get("AISTUDIO_TOKEN")) + + +def _get_token_from_file() -> Optional[str]: + try: + return _clean_token(Path(AISTUDIO_TOKEN_PATH).read_text()) + except FileNotFoundError: + return None + + +def get_token() -> Optional[str]: + """ + Get token if user is logged in. + + Note: in most cases, you should use [`build_aistudio_headers`] instead. This method is only useful + if you want to retrieve the token for other purposes than sending an HTTP request. + + Token is retrieved in priority from the `AISTUDIO_ACCESS_TOKEN` environment variable. Otherwise, we read the token file located + in the Aistudio home folder. Returns None if user is not logged in. + + Returns: + `str` or `None`: The token, `None` if it doesn't exist. 
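+
+    Example (illustrative, added by the editor):
+        >>> os.environ["AISTUDIO_ACCESS_TOKEN"] = "my-token"  # takes priority over the token file
+        >>> get_token()
+        'my-token'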
+ """ + return _get_token_from_environment() or _get_token_from_file() + + +def get_token_to_send(token: Optional[Union[bool, str]]) -> Optional[str]: + """Select the token to send from either `token` or the cache.""" + # Case token is explicitly provided + if isinstance(token, str): + return token + + # Case token is explicitly forbidden + if token is False: + return None + + # Token is not provided: we get it from local cache + cached_token = get_token() + + # Case token is explicitly required + if token is True: + if cached_token is None: + raise LocalTokenNotFoundError( + "Token is required (`token=True`), but no token found. You" + " to provide a token or be logged in to Aistudio Hub . See" + "https://ai.baidu.com/ai-doc/AISTUDIO/slmkadt9z#2-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8%E8%AE%BF%E9%97%AE%E4%BB%A4%E7%89%8C." + ) + return cached_token + + # Case implicit use of the token is forbidden by env variable + if AISTUDIO_HUB_DISABLE_IMPLICIT_TOKEN: + return None + + # Otherwise: we use the cached token as the user has not explicitly forbidden it + return cached_token + + +def _validate_token_to_send(token: Optional[str], is_write_action: bool) -> None: + if is_write_action: + if token is None: + raise ValueError( + "Token is required (write-access action) but no token found. You need" + " to provide a token or be logged in to Aistudio Hub . See" + "https://ai.baidu.com/ai-doc/AISTUDIO/slmkadt9z#2-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8%E8%AE%BF%E9%97%AE%E4%BB%A4%E7%89%8C." + ) + + +def build_aistudio_headers( + *, + token: Optional[Union[bool, str]] = None, + is_write_action: bool = False, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, +) -> Dict[str, str]: + # Get auth token to send + token_to_send = get_token_to_send(token) + _validate_token_to_send(token_to_send, is_write_action=is_write_action) + + # Combine headers + headers = {"Content-Type": "application/json", "SDK-Version": str(VERSION)} + if token_to_send is not None: + headers["Authorization"] = f"token {token_to_send}" + return headers + + +def get_aistudio_file_metadata( + url: str, + token: Union[bool, str, None] = None, + proxies: Optional[Dict] = None, + timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, +): + """Fetch metadata of a file versioned on the Hub for a given url. + + Args: + url (`str`): + File url, for example returned by [`aistudio_hub_url`]. + token (`str` or `bool`, *optional*): + A token to be used for the download. + - If `True`, the token is read from the Aistudio config + folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + proxies (`dict`, *optional*): + Dictionary mapping protocol to the URL of the proxy passed to + `requests.request`. + timeout (`float`, *optional*, defaults to 10): + How many seconds to wait for the server to send metadata before giving up. + library_name (`str`, *optional*): + The name of the library to which the object corresponds. + library_version (`str`, *optional*): + The version of the library. + user_agent (`dict`, `str`, *optional*): + The user-agent info in the form of a dictionary or a string. + + Returns: + A [`AistudioBosFileMetadata`] object containing metadata such as location, etag, size and + commit_hash. 
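+
+    Example (illustrative sketch added by the editor; the repo id is hypothetical and
+    the call needs network access to the configured endpoint):
+        url = aistudio_hub_url("some-user/some-repo", "config.json")
+        meta = get_aistudio_file_metadata(url)
+        print(meta.etag, meta.size)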
+ """ + headers = build_aistudio_headers( + token=token, library_name=library_name, library_version=library_version, user_agent=user_agent + ) + headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file + + # Retrieve metadata + r = _request_wrapper( + method="GET", + url=url, + headers=headers, + allow_redirects=False, + follow_relative_redirects=True, + proxies=proxies, + timeout=timeout, + ) + raise_for_status(r) + res = r.json() + + # Return + return AistudioBosFileMetadata( + commit_hash=res["last_commit_sha"], + etag=_normalize_etag(res["sha"]), + location=res["git_url"], + size=res["size"], + ) + + +def aistudio_hub_url( + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + endpoint: Optional[str] = None, +) -> str: + if subfolder == "": + subfolder = None + if subfolder is not None: + filename = f"{subfolder}/{filename}" + + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError("Invalid repo type") + if revision is None: + revision = DEFAULT_REVISION + + # NEW ADD + if "/" not in repo_id: + raise ValueError("repo_id must be in the format of 'namespace/name'") + user_name, repo_name = repo_id.split("/") + user_name = user_name.strip() + repo_name = repo_name.strip() + + url = AISTUDIO_URL_TEMPLATE.format( + user_name=quote(user_name, safe=""), repo_name=quote(repo_name, safe=""), filename=quote(filename) + ) + # Update endpoint if provided + if endpoint is not None and url.startswith(ENDPOINT): + url = endpoint + url[len(ENDPOINT) :] + + if revision != "master": + url += f"?ref={quote(revision, safe='')}" + return url + + +def aistudio_hub_download( + repo_id: str = None, + filename: str = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + # TODO + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + etag_timeout: float = DEFAULT_ETAG_TIMEOUT, + resume_download: bool = False, + token: Optional[str] = None, + local_files_only: bool = False, + endpoint: Optional[str] = None, + **kwargs, +): + + if cache_dir is None: + cache_dir = AISTUDIO_HUB_CACHE + if revision is None: + revision = DEFAULT_REVISION + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + if isinstance(local_dir, Path): + local_dir = str(local_dir) + locks_dir = os.path.join(cache_dir, ".locks") + + if subfolder == "": + subfolder = None + if subfolder is not None: + # This is used to create a URL, and not a local path, hence the forward slash. + filename = f"{subfolder}/{filename}" + + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}") + + storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)) + os.makedirs(storage_folder, exist_ok=True) + + # cross platform transcription of filename, to be used as a local file path. 
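+    # Editor's annotation (added for clarity): a hub-style path such as
+    # "subfolder/model_state.pdparams" is rebuilt with os.path.join so it uses the
+    # platform's separator; the Windows branch below then rejects any "..\" component
+    # so the relative filename cannot escape the cache directory.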
+ relative_filename = os.path.join(*filename.split("/")) + if os.name == "nt": + if relative_filename.startswith("..\\") or "\\..\\" in relative_filename: + raise ValueError( + f"Invalid filename: cannot handle filename '{relative_filename}' on Windows. Please ask the repository" + " owner to rename this file." + ) + + # if user provides a commit_hash and they already have the file on disk, + # shortcut everything. + # TODO, 当前不支持commit id下载,因此这个肯定跑的。 + if not force_download: # REGEX_COMMIT_HASH.match(revision) + pointer_path = _get_pointer_path(storage_folder, revision, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + url = aistudio_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint) + + headers = build_aistudio_headers( + token=token, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + ) + url_to_download = url.replace("/contents/", "/media/") + + etag = None + commit_hash = None + expected_size = None + head_call_error: Optional[Exception] = None + if not local_files_only: + try: + try: + metadata = get_aistudio_file_metadata( + url=url, + token=token, + proxies=proxies, + timeout=etag_timeout, + library_name=library_name, + library_version=library_version, + user_agent=user_agent, + ) + except EntryNotFoundError as http_error: # noqa: F841 + raise + # Commit hash must exist + # TODO,这里修改了commit hash,强迫为revision了。 + commit_hash = revision # metadata.commit_hash + if commit_hash is None: + raise FileMetadataError( + "Distant resource does not seem to be on aistudio hub. It is possible that a configuration issue" + " prevents you from downloading resources from aistudio hub. Please check your firewall" + " and proxy settings and make sure your SSL certificates are updated." + ) + + # Etag must exist + etag = metadata.etag + # We favor a custom header indicating the etag of the linked resource, and + # we fallback to the regular etag header. + # If we don't have any of those, raise an error. + if etag is None: + raise FileMetadataError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + + # Expected (uncompressed) size + expected_size = metadata.size + + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError + raise + except ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + OfflineModeIsEnabled, + ) as error: + # Otherwise, our Internet connection is down. + # etag is None + head_call_error = error + pass + except (RevisionNotFoundError, EntryNotFoundError): + # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted) + raise + except requests.HTTPError as error: + # Multiple reasons for an http error: + # - Repository is private and invalid/missing token sent + # - Repository is gated and invalid/missing token sent + # - Hub is down (error 500 or 504) + # => let's switch to 'local_files_only=True' to check if the files are already cached. 
+ # (if it's not the case, the error will be re-raised) + head_call_error = error + pass + except FileMetadataError as error: + # Multiple reasons for a FileMetadataError: + # - Wrong network configuration (proxy, firewall, SSL certificates) + # - Inconsistency on the Hub + # => let's switch to 'local_files_only=True' to check if the files are already cached. + # (if it's not the case, the error will be re-raised) + head_call_error = error + pass + + # etag can be None for several reasons: + # 1. we passed local_files_only. + # 2. we don't have a connection + # 3. Hub is down (HTTP 500 or 504) + # 4. repo is not found -for example private or gated- and invalid/missing token sent + # 5. Hub is blocked by a firewall or proxy is not set correctly. + # => Try to get the last downloaded one from the specified revision. + # + # If the specified revision is a commit hash, look inside "snapshots". + # If the specified revision is a branch or tag, look inside "refs". + if etag is None: + # In those cases, we cannot force download. + if force_download: + raise ValueError( + "We have no connection or you passed local_files_only, so force_download is not an accepted option." + ) + + # Try to get "commit_hash" from "revision" + commit_hash = None + if REGEX_COMMIT_HASH.match(revision): + commit_hash = revision + else: + ref_path = os.path.join(storage_folder, "refs", revision) + if os.path.isfile(ref_path): + with open(ref_path) as f: + commit_hash = f.read() + + # Return pointer file if exists + if commit_hash is not None: + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + if os.path.exists(pointer_path): + if local_dir is not None: + return _to_local_dir( + pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks + ) + return pointer_path + + # If we couldn't find an appropriate file on disk, raise an error. + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise LocalEntryNotFoundError( + "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable" + " aistudio hub look-ups and downloads online, set 'local_files_only' to False." + ) + elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError): + # Repo not found => let's raise the actual error + raise head_call_error + else: + # Otherwise: most likely a connection issue or Hub downtime => let's warn the user + raise LocalEntryNotFoundError( + "An error happened while trying to locate the file on the Hub and we cannot find the requested files" + " in the local cache. Please check your connection and try again or make sure your Internet connection" + " is on." + ) from head_call_error + + # From now on, etag and commit_hash are not None. + assert etag is not None, "etag must have been retrieved from server" + assert commit_hash is not None, "commit_hash must have been retrieved from server" + blob_path = os.path.join(storage_folder, "blobs", etag) + pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename) + + os.makedirs(os.path.dirname(blob_path), exist_ok=True) + os.makedirs(os.path.dirname(pointer_path), exist_ok=True) + # if passed revision is not identical to commit_hash + # then revision has to be a branch name or tag name. + # In that case store a ref. 
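+    # Editor's annotation (added for clarity): the cache follows the huggingface_hub
+    # layout -- "blobs/<etag>" stores the file content once, "snapshots/<commit>/<path>"
+    # points at that blob, and "refs/<revision>" records the commit hash so a branch
+    # name such as "master" can be resolved from the cache on later (offline) calls.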
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash) + + if os.path.exists(pointer_path) and not force_download: + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if os.path.exists(blob_path) and not force_download: + # we have the blob already, but not the pointer + if local_dir is not None: # to local dir + return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + else: # or in snapshot cache + _create_symlink(blob_path, pointer_path, new_blob=False) + return pointer_path + + # Prevent parallel downloads of the same file with a lock. + # etag could be duplicated across repos, + lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock") + + # Some Windows versions do not allow for paths longer than 255 characters. + # In this case, we must specify it is an extended path by using the "\\?\" prefix. + if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: + lock_path = "\\\\?\\" + os.path.abspath(lock_path) + + if os.name == "nt" and len(os.path.abspath(blob_path)) > 255: + blob_path = "\\\\?\\" + os.path.abspath(blob_path) + + Path(lock_path).parent.mkdir(parents=True, exist_ok=True) + with FileLock(lock_path): + # If the download just completed while the lock was activated. + if os.path.exists(pointer_path) and not force_download: + # Even if returning early like here, the lock will be released. + if local_dir is not None: + return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks) + return pointer_path + + if resume_download: + incomplete_path = blob_path + ".incomplete" + + @contextmanager + def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]: + with open(incomplete_path, "ab") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial( # type: ignore + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info("downloading %s to %s", url, temp_file.name) + + if expected_size is not None: # might be None if HTTP header not set correctly + # Check tmp path + _check_disk_space(expected_size, os.path.dirname(temp_file.name)) + + # Check destination + _check_disk_space(expected_size, os.path.dirname(blob_path)) + if local_dir is not None: + _check_disk_space(expected_size, local_dir) + + http_get( + url_to_download, + temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + expected_size=expected_size, + ) + if local_dir is None: + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + _create_symlink(blob_path, pointer_path, new_blob=True) + else: + local_dir_filepath = os.path.join(local_dir, relative_filename) + os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True) + + # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk + # In both cases, blob file is cached. 
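+                # Note: the size threshold used below, DEFALUT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD, is defined in
+                # common.py as 5 * 1024 * 1024 bytes, so only files larger than ~5 MB are symlinked under "auto".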
+ is_big_file = os.stat(temp_file.name).st_size > DEFALUT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD + if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file): + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + logger.debug("Create symlink to local dir") + _create_symlink(blob_path, local_dir_filepath, new_blob=False) + elif local_dir_use_symlinks == "auto" and not is_big_file: + logger.debug(f"Storing {url} in cache at {blob_path}") + _chmod_and_replace(temp_file.name, blob_path) + logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')") + shutil.copyfile(blob_path, local_dir_filepath) + else: + logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).") + _chmod_and_replace(temp_file.name, local_dir_filepath) + pointer_path = local_dir_filepath # for return value + + return pointer_path + + +def aistudio_hub_file_exists( + repo_id: str, + filename: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Optional[str] = None, + endpoint: Optional[str] = None, +) -> bool: + """ + Checks if a file exists in a repository on the Aistudio Hub. + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + filename (`str`): + The name of the file to check, for example: + `"config.json"` + repo_type (`str`, *optional*): + Set to `"dataset"` or `"space"` if getting repository info from a dataset or a space, + `None` or `"model"` if getting repository info from a model. Default is `None`. + revision (`str`, *optional*): + The revision of the repository from which to get the information. Defaults to `"main"` branch. + token (`bool` or `str`, *optional*): + A valid authentication token (see https://huggingface.co/settings/token). + If `None` or `True` and machine is logged in (through `huggingface-cli login` + or [`~login`]), token will be retrieved from the cache. + If `False`, token is not sent in the request header. + + Returns: + True if the file exists, False otherwise. + + + + Examples: + ```py + >>> from huggingface_hub import file_exists + >>> file_exists("bigcode/starcoder", "config.json") + True + >>> file_exists("bigcode/starcoder", "not-a-file") + False + >>> file_exists("bigcode/not-a-repo", "config.json") + False + ``` + + + """ + url = aistudio_hub_url( + repo_id=repo_id, repo_type=repo_type, revision=revision, filename=filename, endpoint=endpoint + ) + try: + if token is None: + token = get_token() + get_aistudio_file_metadata(url, token=token) + return True + except GatedRepoError: # raise specifically on gated repo + raise + except (RepositoryNotFoundError, EntryNotFoundError, RevisionNotFoundError, HfHubHTTPError): + return False + + +def aistudio_hub_try_to_load_from_cache( + repo_id: str, + filename: str, + cache_dir: Union[str, Path, None] = None, + revision: Optional[str] = None, + repo_type: Optional[str] = None, +): + if revision is None: + revision = DEFAULT_REVISION + if repo_type is None: + repo_type = REPO_TYPES[-1] + if repo_type not in REPO_TYPES: + raise ValueError(f"Invalid repo type: {repo_type}. 
Accepted repo types are: {str(REPO_TYPES)}") + if cache_dir is None: + cache_dir = AISTUDIO_HUB_CACHE + + object_id = repo_id.replace("/", "--") + repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}") + if not os.path.isdir(repo_cache): + # No cache for this model + return None + + refs_dir = os.path.join(repo_cache, "refs") + snapshots_dir = os.path.join(repo_cache, "snapshots") + no_exist_dir = os.path.join(repo_cache, ".no_exist") + + # Resolve refs (for instance to convert main to the associated commit sha) + if os.path.isdir(refs_dir): + revision_file = os.path.join(refs_dir, revision) + if os.path.isfile(revision_file): + with open(revision_file) as f: + revision = f.read() + + # Check if file is cached as "no_exist" + if os.path.isfile(os.path.join(no_exist_dir, revision, filename)): + return None + + # Check if revision folder exists + if not os.path.exists(snapshots_dir): + return None + cached_shas = os.listdir(snapshots_dir) + if revision not in cached_shas: + # No cache for this revision and we won't try to return a random revision + return None + + # Check if file exists in cache + cached_file = os.path.join(snapshots_dir, revision, filename) + return cached_file if os.path.isfile(cached_file) else None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/bos_download.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/bos_download.py new file mode 100644 index 000000000..71da52ad9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/bos_download.py @@ -0,0 +1,285 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
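For orientation, here is a minimal usage sketch of the AI Studio Hub helpers defined in the preceding file (aistudio_hub_file_exists and aistudio_hub_try_to_load_from_cache). The repo id, filename, and import path are illustrative assumptions, not part of the patch.

# Illustrative only: module path, repo id and filename are assumed placeholders.
from utils.download.aistudio_hub_download import (  # assumed module name/location
    aistudio_hub_file_exists,
    aistudio_hub_try_to_load_from_cache,
)

repo_id = "aistudio/example-model"  # placeholder
filename = "config.json"            # placeholder

if aistudio_hub_file_exists(repo_id, filename):
    # Look for a previously downloaded copy in the local snapshot cache.
    cached = aistudio_hub_try_to_load_from_cache(repo_id, filename)
    print(cached if cached is not None else "exists on the Hub but not cached yet")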
+ +import logging +import os +import re +import tempfile +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Dict, Literal, Optional, Union + +from filelock import FileLock +from huggingface_hub.utils import ( + EntryNotFoundError, + GatedRepoError, + HfHubHTTPError, + RepositoryNotFoundError, + RevisionNotFoundError, +) + +logger = logging.getLogger(__name__) + +from paddlenlp.utils.env import MODEL_HOME + +from .common import ( + DEFAULT_ETAG_TIMEOUT, + DEFAULT_REQUEST_TIMEOUT, + AistudioBosFileMetadata, + _as_int, + _chmod_and_replace, + _normalize_etag, + _request_wrapper, + http_get, + raise_for_status, +) + +ENDPOINT = os.getenv("PPNLP_ENDPOINT", "https://bj.bcebos.com/paddlenlp") +ENDPOINT_v2 = "https://paddlenlp.bj.bcebos.com" + +BOS_URL_TEMPLATE = ENDPOINT + "/{repo_type}/community/{repo_id}/{revision}/{filename}" +BOS_URL_TEMPLATE_WITHOUT_REVISION = ENDPOINT + "/{repo_type}/community/{repo_id}/{filename}" + + +REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$") +REPO_TYPE = "models" + + +def get_bos_file_metadata( + url: str, + token: Union[bool, str, None] = None, + proxies: Optional[Dict] = None, + timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, +): + """Fetch metadata of a file versioned on the Hub for a given url. + + Args: + url (`str`): + File url, for example returned by [`bos_url`]. + token (`str` or `bool`, *optional*): + A token to be used for the download. + - If `True`, the token is read from the BOS config + folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + proxies (`dict`, *optional*): + Dictionary mapping protocol to the URL of the proxy passed to + `requests.request`. + timeout (`float`, *optional*, defaults to 10): + How many seconds to wait for the server to send metadata before giving up. + library_name (`str`, *optional*): + The name of the library to which the object corresponds. + library_version (`str`, *optional*): + The version of the library. + user_agent (`dict`, `str`, *optional*): + The user-agent info in the form of a dictionary or a string. + + Returns: + A [`AistudioBosFileMetadata`] object containing metadata such as location, etag, size and + commit_hash. 
+ """ + headers = {} + headers["Accept-Encoding"] = "identity" # prevent any compression => we want to know the real size of the file + + # Retrieve metadata + r = _request_wrapper( + method="HEAD", + url=url, + headers=headers, + allow_redirects=False, + follow_relative_redirects=True, + proxies=proxies, + timeout=timeout, + ) + raise_for_status(r) + + # Return + return AistudioBosFileMetadata( + commit_hash=None, + etag=_normalize_etag(r.headers.get("ETag")), + location=url, + size=_as_int(r.headers.get("Content-Length")), + ) + + +def bos_url( + repo_id: str, + filename: str, + *, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + endpoint: Optional[str] = None, +) -> str: + if subfolder == "": + subfolder = None + if subfolder is not None: + filename = f"{subfolder}/{filename}" + + url = BOS_URL_TEMPLATE_WITHOUT_REVISION.format( + repo_type=REPO_TYPE, + repo_id=repo_id, + filename=filename, + ) + + # Update endpoint if provided + if endpoint is not None and url.startswith(ENDPOINT): + url = endpoint + url[len(ENDPOINT) :] + return url + + +def bos_download( + repo_id: str = None, + filename: str = None, + subfolder: Optional[str] = None, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + library_name: Optional[str] = None, + library_version: Optional[str] = None, + cache_dir: Union[str, Path, None] = None, + local_dir: Union[str, Path, None] = None, + local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", + user_agent: Union[Dict, str, None] = None, + force_download: bool = False, + proxies: Optional[Dict] = None, + etag_timeout: float = DEFAULT_ETAG_TIMEOUT, + resume_download: bool = False, + token: Optional[str] = None, + local_files_only: bool = False, + endpoint: Optional[str] = None, + url: Optional[str] = None, + **kwargs, +): + if url is not None: + if repo_id is None: + if url.startswith(ENDPOINT): + repo_id = "/".join(url[len(ENDPOINT) + 1 :].split("/")[:-1]) + else: + repo_id = "/".join(url[len(ENDPOINT_v2) + 1 :].split("/")[:-1]) + if filename is None: + filename = url.split("/")[-1] + subfolder = None + + if cache_dir is None: + cache_dir = MODEL_HOME + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if subfolder == "": + subfolder = None + if subfolder is not None: + # This is used to create a URL, and not a local path, hence the forward slash. + filename = f"{subfolder}/{filename}" + + locks_dir = os.path.join(cache_dir, ".locks") + + storage_folder = os.path.join(cache_dir, repo_id) + os.makedirs(storage_folder, exist_ok=True) + + if url is None: + url = bos_url(repo_id, filename, repo_type=REPO_TYPE, endpoint=endpoint) + headers = None + url_to_download = url + lock_path = os.path.join(locks_dir, repo_id, f"{filename}.lock") + file_path = os.path.join(cache_dir, repo_id, filename) + + if os.name == "nt" and len(os.path.abspath(lock_path)) > 255: + lock_path = "\\\\?\\" + os.path.abspath(lock_path) + + if os.name == "nt" and len(os.path.abspath(file_path)) > 255: + file_path = "\\\\?\\" + os.path.abspath(file_path) + + Path(lock_path).parent.mkdir(parents=True, exist_ok=True) + with FileLock(lock_path): + # If the download just completed while the lock was activated. + if os.path.exists(file_path) and not force_download: + # Even if returning early like here, the lock will be released. 
+ return file_path + + if resume_download: + incomplete_path = file_path + ".incomplete" + + @contextmanager + def _resumable_file_manager(): + with open(incomplete_path, "ab") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial( # type: ignore + tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False + ) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info("downloading %s to %s", url_to_download, temp_file.name) + + http_get( + url_to_download, + temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + ) + + logger.info("storing %s in cache at %s", url_to_download, file_path) + _chmod_and_replace(temp_file.name, file_path) + try: + os.remove(lock_path) + except OSError: + pass + return file_path + + +def bos_file_exists( + repo_id: str, + filename: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = None, + token: Optional[str] = None, + endpoint: Optional[str] = None, +) -> bool: + url = bos_url(repo_id=repo_id, repo_type=REPO_TYPE, filename=filename, endpoint=endpoint) + try: + get_bos_file_metadata(url, token=token) + return True + except GatedRepoError: # raise specifically on gated repo + raise + except (RepositoryNotFoundError, EntryNotFoundError, RevisionNotFoundError, HfHubHTTPError): + return False + + +def bos_try_to_load_from_cache( + repo_id: str, + filename: str, + cache_dir: Union[str, Path, None] = None, + revision: Optional[str] = None, + repo_type: Optional[str] = None, +): + if cache_dir is None: + cache_dir = MODEL_HOME + + cached_file = os.path.join(cache_dir, repo_id, filename) + return cached_file if os.path.isfile(cached_file) else None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/common.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/common.py new file mode 100644 index 000000000..ef391aa0d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/download/common.py @@ -0,0 +1,662 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
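As a rough usage sketch of the BOS helpers defined above (bos_download, bos_file_exists, bos_try_to_load_from_cache): the repo id, file name, and import path below are placeholders assumed for illustration.

# Illustrative only: import path, repo id and file name are assumed placeholders.
from utils.download.bos_download import (  # assumed module location
    bos_download,
    bos_file_exists,
    bos_try_to_load_from_cache,
)

repo_id = "example-org/example-bert-base"  # placeholder
filename = "model_config.json"             # placeholder

if bos_file_exists(repo_id, filename):
    # Reuse a cached copy under MODEL_HOME when available, otherwise download it.
    local_path = bos_try_to_load_from_cache(repo_id, filename) or bos_download(
        repo_id=repo_id, filename=filename
    )
    print(local_path)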
+ +import copy +import logging +import os +import re +import shutil +import stat +import tempfile +import threading +import time +import uuid +import warnings +from contextlib import contextmanager +from dataclasses import dataclass +from functools import lru_cache +from pathlib import Path +from typing import BinaryIO, Callable, Dict, Generator, Literal, Optional, Union +from urllib.parse import urlparse + +import requests +from huggingface_hub.utils import ( + BadRequestError, + EntryNotFoundError, + HfHubHTTPError, + tqdm, +) +from requests import HTTPError, Response +from requests.adapters import HTTPAdapter +from requests.models import PreparedRequest + +logger = logging.getLogger(__name__) + +ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} + + +def _is_true(value: Optional[str]) -> bool: + if value is None: + return False + return value.upper() in ENV_VARS_TRUE_VALUES + + +def _as_int(value: Optional[str]) -> Optional[int]: + if value is None: + return None + return int(value) + + +DISABLE_SYMLINKS_WARNING = False +# Regex to get filename from a "Content-Disposition" header for CDN-served files +HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P.*?)"') +DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024 +REPO_ID_SEPARATOR = "--" + +DEFAULT_DOWNLOAD_TIMEOUT = 10 +DEFAULT_REQUEST_TIMEOUT = 10 +DEFAULT_ETAG_TIMEOUT = 10 +DEFALUT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = 5 * 1024 * 1024 + +OFFLINE = _is_true(os.environ.get("AISTUDIO_BOS_OFFLINE")) +_CACHED_NO_EXIST = object() + + +def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None: + """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash. + + Does nothing if `revision` is already a proper `commit_hash` or reference is already cached. + """ + # if revision != commit_hash: + ref_path = Path(storage_folder) / "refs" / revision + ref_path.parent.mkdir(parents=True, exist_ok=True) + if not ref_path.exists() or commit_hash != ref_path.read_text(): + # Update ref only if has been updated. Could cause useless error in case + # repo is already cached and user doesn't have write access to cache folder. + # See https://github.com/huggingface/huggingface_hub/issues/1216. + ref_path.write_text(commit_hash) + + +def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None: + """Check disk usage and log a warning if there is not enough disk space to download the file. + + Args: + expected_size (`int`): + The expected size of the file in bytes. + target_dir (`str`): + The directory where the file will be stored after downloading. + """ + + target_dir = Path(target_dir) # format as `Path` + for path in [target_dir] + list(target_dir.parents): # first check target_dir, then each parents one by one + try: + target_dir_free = shutil.disk_usage(path).free + if target_dir_free < expected_size: + warnings.warn( + "Not enough free disk space to download the file. " + f"The expected file size is: {expected_size / 1e6:.2f} MB. " + f"The target location {target_dir} only has {target_dir_free / 1e6:.2f} MB free disk space." + ) + return + except OSError: # raise on anything: file does not exist or space disk cannot be checked + pass + + +def http_get( + url: str, + temp_file: BinaryIO, + *, + proxies=None, + resume_size: float = 0, + headers: Optional[Dict[str, str]] = None, + expected_size: Optional[int] = None, + _nb_retries: int = 5, +): + """ + Download a remote file. 
Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
+
+    If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
+    transient error (network outage?). We log a warning message and try to resume the download a few times before
+    giving up. The method gives up after 5 attempts if no new data has been received from the server.
+    """
+    initial_headers = headers
+    headers = copy.deepcopy(headers) or {}
+    if resume_size > 0:
+        headers["Range"] = "bytes=%d-" % (resume_size,)
+
+    r = _request_wrapper(
+        method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=DEFAULT_DOWNLOAD_TIMEOUT
+    )
+    raise_for_status(r)
+    content_length = r.headers.get("Content-Length")
+
+    # NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
+    # If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
+    total = resume_size + int(content_length) if content_length is not None else None
+
+    displayed_name = url
+    content_disposition = r.headers.get("Content-Disposition")
+    if content_disposition is not None:
+        match = HEADER_FILENAME_PATTERN.search(content_disposition)
+        if match is not None:
+            # Means file is on CDN
+            displayed_name = match.groupdict()["filename"]
+
+    # Truncate filename if too long to display
+    if len(displayed_name) > 40:
+        displayed_name = f"(…){displayed_name[-40:]}"
+
+    consistency_error_message = (
+        f"Consistency check failed: file should be of size {expected_size} but has size"
+        f" {{actual_size}} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
+        " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
+        " know by opening an issue on https://github.com/huggingface/huggingface_hub."
+    )
+
+    # Stream file to buffer
+    with tqdm(
+        unit="B",
+        unit_scale=True,
+        total=total,
+        initial=resume_size,
+        desc=displayed_name,
+        disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
+    ) as progress:
+        new_resume_size = resume_size
+        try:
+            for chunk in r.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
+                if chunk:  # filter out keep-alive new chunks
+                    progress.update(len(chunk))
+                    temp_file.write(chunk)
+                    new_resume_size += len(chunk)
+                    # Some data has been downloaded from the server so we reset the number of retries.
+                    _nb_retries = 5
+        except (requests.ConnectionError, requests.ReadTimeout) as e:
+            # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+            # a transient error (network outage?). We log a warning message and try to resume the download a few times
+            # before giving up. The retry mechanism is basic but should be enough in most cases.
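+            # A fixed one-second pause (time.sleep(1) below) is used between retries; there is no exponential backoff.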
+ if _nb_retries <= 0: + logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e)) + raise + logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e)) + time.sleep(1) + reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects + return http_get( + url=url, + temp_file=temp_file, + proxies=proxies, + resume_size=new_resume_size, + headers=initial_headers, + expected_size=expected_size, + _nb_retries=_nb_retries - 1, + ) + + if expected_size is not None and expected_size != temp_file.tell(): + raise EnvironmentError( + consistency_error_message.format( + actual_size=temp_file.tell(), + ) + ) + + +def _chmod_and_replace(src: str, dst: str) -> None: + """Set correct permission before moving a blob from tmp directory to cache dir. + + Do not take into account the `umask` from the process as there is no convenient way + to get it that is thread-safe. + + See: + - About umask: https://docs.python.org/3/library/os.html#os.umask + - Thread-safety: https://stackoverflow.com/a/70343066 + - About solution: https://github.com/huggingface/huggingface_hub/pull/1220#issuecomment-1326211591 + - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1141 + - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215 + """ + # Get umask by creating a temporary file in the cached repo folder. + tmp_file = Path(dst).parent.parent / f"tmp_{uuid.uuid4()}" + try: + tmp_file.touch() + cache_dir_mode = Path(tmp_file).stat().st_mode + os.chmod(src, stat.S_IMODE(cache_dir_mode)) + finally: + tmp_file.unlink() + + shutil.move(src, dst) + + +def repo_folder_name(*, repo_id: str, repo_type: str) -> str: + """Return a serialized version of a aistudio repo name and type, safe for disk storage + as a single non-nested folder. + + Example: models--julien-c--EsperBERTo-small + """ + # remove all `/` occurrences to correctly convert repo to directory name + parts = [f"{repo_type}s", *repo_id.split("/")] + return REPO_ID_SEPARATOR.join(parts) + + +class OfflineModeIsEnabled(ConnectionError): + """Raised when a request is made but `AISTUDIO_HUB_OFFLINE=1` is set as environment variable.""" + + +class OfflineAdapter(HTTPAdapter): + def send(self, request: PreparedRequest, *args, **kwargs) -> Response: + raise OfflineModeIsEnabled( + f"Cannot reach {request.url}: offline mode is enabled. To disable it, please unset the `AISTUDIO_HUB_OFFLINE` environment variable." + ) + + +BACKEND_FACTORY_T = Callable[[], requests.Session] + + +def _default_backend_factory() -> requests.Session: + session = requests.Session() + if OFFLINE: + session.mount("http://", OfflineAdapter()) + session.mount("https://", OfflineAdapter()) + + return session + + +_GLOBAL_BACKEND_FACTORY: BACKEND_FACTORY_T = _default_backend_factory +HTTP_METHOD_T = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"] + + +@lru_cache +def _get_session_from_cache(process_id: int, thread_id: int) -> requests.Session: + """ + Create a new session per thread using global factory. Using LRU cache (maxsize 128) to avoid memory leaks when + using thousands of threads. Cache is cleared when `configure_http_backend` is called. + """ + return _GLOBAL_BACKEND_FACTORY() + + +def reset_sessions() -> None: + """Reset the cache of sessions. + + Mostly used internally when sessions are reconfigured or an SSLError is raised. + See [`configure_http_backend`] for more details. 
+ """ + _get_session_from_cache.cache_clear() + + +def get_session() -> requests.Session: + """ + Get a `requests.Session` object, using the session factory from the user. + + Use [`get_session`] to get a configured Session. Since `requests.Session` is not guaranteed to be thread-safe, + `huggingface_hub` creates 1 Session instance per thread. They are all instantiated using the same `backend_factory` + set in [`configure_http_backend`]. A LRU cache is used to cache the created sessions (and connections) between + calls. Max size is 128 to avoid memory leaks if thousands of threads are spawned. + + See [this issue](https://github.com/psf/requests/issues/2766) to know more about thread-safety in `requests`. + + Example: + ```py + import requests + from huggingface_hub import configure_http_backend, get_session + + # Create a factory function that returns a Session with configured proxies + def backend_factory() -> requests.Session: + session = requests.Session() + session.proxies = {"http": "http://10.10.1.10:3128", "https": "https://10.10.1.11:1080"} + return session + + # Set it as the default session factory + configure_http_backend(backend_factory=backend_factory) + + # In practice, this is mostly done internally in `huggingface_hub` + session = get_session() + ``` + """ + return _get_session_from_cache(process_id=os.getpid(), thread_id=threading.get_ident()) + + +def _request_wrapper( + method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params +) -> requests.Response: + """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when + `allow_redirection=False`. + + Args: + method (`str`): + HTTP method, such as 'GET' or 'HEAD'. + url (`str`): + The URL of the resource to fetch. + follow_relative_redirects (`bool`, *optional*, defaults to `False`) + If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection` + kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without + following redirection to a CDN. + **params (`dict`, *optional*): + Params to pass to `requests.request`. + """ + # Recursively follow relative redirects + if follow_relative_redirects: + response = _request_wrapper( + method=method, + url=url, + follow_relative_redirects=False, + **params, + ) + + # If redirection, we redirect only relative paths. + # This is useful in case of a renamed repository. + if 300 <= response.status_code <= 399: + parsed_target = urlparse(response.headers["Location"]) + if parsed_target.netloc == "": + # This means it is a relative 'location' headers, as allowed by RFC 7231. + # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource') + # We want to follow this relative redirect ! + # + # Highly inspired by `resolve_redirects` from requests library. + # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159 + next_url = urlparse(url)._replace(path=parsed_target.path).geturl() + return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params) + return response + # Perform request and return if status_code is not in the retry list. 
+ response = get_session().request(method=method, url=url, **params) + raise_for_status(response) + return response + + +def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str: + # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks + snapshot_path = os.path.join(storage_folder, "snapshots") + pointer_path = os.path.join(snapshot_path, revision, relative_filename) + if Path(os.path.abspath(snapshot_path)) not in Path(os.path.abspath(pointer_path)).parents: + raise ValueError( + "Invalid pointer path: cannot create pointer path in snapshot folder if" + f" `storage_folder='{storage_folder}'`, `revision='{revision}'` and" + f" `relative_filename='{relative_filename}'`." + ) + return pointer_path + + +def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None: + """Create a symbolic link named dst pointing to src. + + By default, it will try to create a symlink using a relative path. Relative paths have 2 advantages: + - If the cache_folder is moved (example: back-up on a shared drive), relative paths within the cache folder will + not brake. + - Relative paths seems to be better handled on Windows. Issue was reported 3 times in less than a week when + changing from relative to absolute paths. See https://github.com/huggingface/huggingface_hub/issues/1398, + https://github.com/huggingface/diffusers/issues/2729 and https://github.com/huggingface/transformers/pull/22228. + NOTE: The issue with absolute paths doesn't happen on admin mode. + When creating a symlink from the cache to a local folder, it is possible that a relative path cannot be created. + This happens when paths are not on the same volume. In that case, we use absolute paths. + + + The result layout looks something like + └── [ 128] snapshots + ├── [ 128] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f + │ ├── [ 52] README.md -> ../../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812 + │ └── [ 76] pytorch_model.bin -> ../../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd + + If symlinks cannot be created on this platform (most likely to be Windows), the workaround is to avoid symlinks by + having the actual file in `dst`. If it is a new file (`new_blob=True`), we move it to `dst`. If it is not a new file + (`new_blob=False`), we don't know if the blob file is already referenced elsewhere. To avoid breaking existing + cache, the file is duplicated on the disk. + + In case symlinks are not supported, a warning message is displayed to the user once when loading `huggingface_hub`. + The warning message can be disable with the `DISABLE_SYMLINKS_WARNING` environment variable. + """ + try: + os.remove(dst) + except OSError: + pass + + abs_src = os.path.abspath(os.path.expanduser(src)) + abs_dst = os.path.abspath(os.path.expanduser(dst)) + abs_dst_folder = os.path.dirname(abs_dst) + + # Use relative_dst in priority + try: + relative_src = os.path.relpath(abs_src, abs_dst_folder) + except ValueError: + # Raised on Windows if src and dst are not on the same volume. This is the case when creating a symlink to a + # local_dir instead of within the cache directory. + # See https://docs.python.org/3/library/os.path.html#os.path.relpath + relative_src = None + + try: + commonpath = os.path.commonpath([abs_src, abs_dst]) + _support_symlinks = are_symlinks_supported(commonpath) + except ValueError: + # Raised if src and dst are not on the same volume. Symlinks will still work on Linux/Macos. 
+ # See https://docs.python.org/3/library/os.path.html#os.path.commonpath + _support_symlinks = os.name != "nt" + except PermissionError: + # Permission error means src and dst are not in the same volume (e.g. destination path has been provided + # by the user via `local_dir`. Let's test symlink support there) + _support_symlinks = are_symlinks_supported(abs_dst_folder) + + # Symlinks are supported => let's create a symlink. + if _support_symlinks: + src_rel_or_abs = relative_src or abs_src + logger.debug(f"Creating pointer from {src_rel_or_abs} to {abs_dst}") + try: + os.symlink(src_rel_or_abs, abs_dst) + return + except FileExistsError: + if os.path.islink(abs_dst) and os.path.realpath(abs_dst) == os.path.realpath(abs_src): + # `abs_dst` already exists and is a symlink to the `abs_src` blob. It is most likely that the file has + # been cached twice concurrently (exactly between `os.remove` and `os.symlink`). Do nothing. + return + else: + # Very unlikely to happen. Means a file `dst` has been created exactly between `os.remove` and + # `os.symlink` and is not a symlink to the `abs_src` blob file. Raise exception. + raise + except PermissionError: + # Permission error means src and dst are not in the same volume (e.g. download to local dir) and symlink + # is supported on both volumes but not between them. Let's just make a hard copy in that case. + pass + + # Symlinks are not supported => let's move or copy the file. + if new_blob: + logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}") + shutil.move(abs_src, abs_dst) + else: + logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}") + shutil.copyfile(abs_src, abs_dst) + + +_are_symlinks_supported_in_dir: Dict[str, bool] = {} + + +def _set_write_permission_and_retry(func, path, excinfo): + os.chmod(path, stat.S_IWRITE) + func(path) + + +@contextmanager +def SoftTemporaryDirectory( + suffix: Optional[str] = None, + prefix: Optional[str] = None, + dir: Optional[Union[Path, str]] = None, + **kwargs, +) -> Generator[str, None, None]: + """ + Context manager to create a temporary directory and safely delete it. + + If tmp directory cannot be deleted normally, we set the WRITE permission and retry. + If cleanup still fails, we give up but don't raise an exception. This is equivalent + to `tempfile.TemporaryDirectory(..., ignore_cleanup_errors=True)` introduced in + Python 3.10. + + See https://www.scivision.dev/python-tempfile-permission-error-windows/. + """ + tmpdir = tempfile.TemporaryDirectory(prefix=prefix, suffix=suffix, dir=dir, **kwargs) + yield tmpdir.name + + try: + # First once with normal cleanup + shutil.rmtree(tmpdir.name) + except Exception: + # If failed, try to set write permission and retry + try: + shutil.rmtree(tmpdir.name, onerror=_set_write_permission_and_retry) + except Exception: + pass + + # And finally, cleanup the tmpdir. + # If it fails again, give up but do not throw error + try: + tmpdir.cleanup() + except Exception: + pass + + +def _to_local_dir( + path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]] +) -> str: + """Place a file in a local dir (different than cache_dir). + + Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size. 
+ """ + # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks + local_dir_filepath = os.path.join(local_dir, relative_filename) + if Path(os.path.abspath(local_dir)) not in Path(os.path.abspath(local_dir_filepath)).parents: + raise ValueError( + f"Cannot copy file '{relative_filename}' to local dir '{local_dir}': file would not be in the local" + " directory." + ) + + os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True) + real_blob_path = os.path.realpath(path) + + # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk + if use_symlinks == "auto": + use_symlinks = os.stat(real_blob_path).st_size > DEFALUT_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD + + if use_symlinks: + _create_symlink(real_blob_path, local_dir_filepath, new_blob=False) + else: + shutil.copyfile(real_blob_path, local_dir_filepath) + return local_dir_filepath + + +def _normalize_etag(etag: Optional[str]) -> Optional[str]: + """Normalize ETag HTTP header, so it can be used to create nice filepaths. + + The HTTP spec allows two forms of ETag: + ETag: W/"" + ETag: "" + + For now, we only expect the second form from the server, but we want to be future-proof so we support both. For + more context, see `TestNormalizeEtag` tests and https://github.com/huggingface/huggingface_hub/pull/1428. + + Args: + etag (`str`, *optional*): HTTP header + + Returns: + `str` or `None`: string that can be used as a nice directory name. + Returns `None` if input is None. + """ + if etag is None: + return None + return etag.lstrip("W/").strip('"') + + +@dataclass(frozen=True) +class AistudioBosFileMetadata: + """Data structure containing information about a file versioned on the Aistudio Hub. + + Returned by [`get_aistudio_file_metadata`] based on a URL. + + Args: + commit_hash (`str`, *optional*): + The commit_hash related to the file. + etag (`str`, *optional*): + Etag of the file on the server. + location (`str`): + Location where to download the file. Can be a Hub url or not (CDN). + size (`size`): + Size of the file. In case of an LFS file, contains the size of the actual + LFS file, not the pointer. + """ + + commit_hash: Optional[str] + etag: Optional[str] + location: str + size: Optional[int] + + +def raise_for_status(response: Response, endpoint_name: Optional[str] = None) -> None: + try: + response.raise_for_status() + except HTTPError as e: + if response.status_code == 404: + message = f"{response.status_code} Client Error." + "\n\n" + f"Entry Not Found for url: {response.url}." + raise EntryNotFoundError(message, None) from e + elif response.status_code == 400: + message = ( + f"\n\nBad request for {endpoint_name} endpoint:" if endpoint_name is not None else "\n\nBad request:" + ) + raise BadRequestError(message, response=None) from e + raise HfHubHTTPError(str(e), response=None) from e + + +def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool: + """Return whether the symlinks are supported on the machine. + + Since symlinks support can change depending on the mounted disk, we need to check + on the precise cache folder. + + Args: + cache_dir (`str`, `Path`, *optional*): + Path to the folder where cached files are stored. + + Returns: [bool] Whether symlinks are supported in the directory. 
+ """ + assert cache_dir is not None + cache_dir = str(Path(cache_dir).expanduser().resolve()) # make it unique + + # Check symlink compatibility only once (per cache directory) at first time use + if cache_dir not in _are_symlinks_supported_in_dir: + _are_symlinks_supported_in_dir[cache_dir] = True + + os.makedirs(cache_dir, exist_ok=True) + with SoftTemporaryDirectory(dir=cache_dir) as tmpdir: + src_path = Path(tmpdir) / "dummy_file_src" + src_path.touch() + dst_path = Path(tmpdir) / "dummy_file_dst" + + # Relative source path as in `_create_symlink`` + relative_src = os.path.relpath(src_path, start=os.path.dirname(dst_path)) + try: + os.symlink(relative_src, dst_path) + except OSError: + # Likely running on Windows + _are_symlinks_supported_in_dir[cache_dir] = False + + if not DISABLE_SYMLINKS_WARNING: + message = ( + "cache-system uses symlinks by default to" + " efficiently store duplicated files but your machine does not" + f" support them in {cache_dir}. Caching files will still work" + " but in a degraded version that might require more space on" + " your disk. This warning can be disabled by setting the" + " `DISABLE_SYMLINKS_WARNING` environment variable." + ) + if os.name == "nt": + message += ( + "\nTo support symlinks on Windows, you either need to" + " activate Developer Mode or to run Python as an" + " administrator. In order to see activate developer mode," + " see this article:" + " https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development" + ) + warnings.warn(message) + + return _are_symlinks_supported_in_dir[cache_dir] diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/downloader.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/downloader.py new file mode 100644 index 000000000..62f7e9c5e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/downloader.py @@ -0,0 +1,471 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import hashlib +import json +import os +import os.path as osp +import shutil +import tarfile +import threading +import time +import uuid +import zipfile +from collections import OrderedDict +from typing import Optional, Union + +import requests +from filelock import FileLock +from huggingface_hub import get_hf_file_metadata, hf_hub_url +from huggingface_hub.utils import EntryNotFoundError +from tqdm.auto import tqdm + +from .env import DOWNLOAD_SERVER, FAILED_STATUS, SUCCESS_STATUS +from .log import logger + +__all__ = ["get_weights_path_from_url"] + + +COMMUNITY_MODEL_PREFIX = os.getenv("COMMUNITY_MODEL_PREFIX", "https://bj.bcebos.com/paddlenlp/models/community") +WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") +DOWNLOAD_RETRY_LIMIT = 3 +DOWNLOAD_CHECK = False + +nlp_models = OrderedDict( + ( + ("RoBERTa-zh-base", "https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz"), + ( + "RoBERTa-zh-large", + "https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz", + ), + ("ERNIE-v2-en-base", "https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz"), + ("ERNIE-v2-en-large", "https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz"), + ("XLNet-cased-base", "https://xlnet.bj.bcebos.com/xlnet_cased_L-12_H-768_A-12.tgz"), + ("XLNet-cased-large", "https://xlnet.bj.bcebos.com/xlnet_cased_L-24_H-1024_A-16.tgz"), + ("ERNIE-v1-zh-base", "https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz"), + ("ERNIE-v1-zh-base-max-len-512", "https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz"), + ( + "BERT-en-uncased-large-whole-word-masking", + "https://bert-models.bj.bcebos.com/wwm_uncased_L-24_H-1024_A-16.tar.gz", + ), + ( + "BERT-en-cased-large-whole-word-masking", + "https://bert-models.bj.bcebos.com/wwm_cased_L-24_H-1024_A-16.tar.gz", + ), + ("BERT-en-uncased-base", "https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz"), + ("BERT-en-uncased-large", "https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz"), + ("BERT-en-cased-base", "https://bert-models.bj.bcebos.com/cased_L-12_H-768_A-12.tar.gz"), + ("BERT-en-cased-large", "https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz"), + ("BERT-multilingual-uncased-base", "https://bert-models.bj.bcebos.com/multilingual_L-12_H-768_A-12.tar.gz"), + ("BERT-multilingual-cased-base", "https://bert-models.bj.bcebos.com/multi_cased_L-12_H-768_A-12.tar.gz"), + ("BERT-zh-base", "https://bert-models.bj.bcebos.com/chinese_L-12_H-768_A-12.tar.gz"), + ) +) + + +def is_url(path): + """ + Whether path is URL. + Args: + path (string): URL string or not. + """ + return path.startswith("http://") or path.startswith("https://") + + +def get_weights_path_from_url(url, md5sum=None): + """Get weights path from WEIGHT_HOME, if not exists, + download it from url. + Args: + url (str): download url + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded weights. + Examples: + .. 
code-block:: python + from paddle.utils.download import get_weights_path_from_url + resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams' + local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url) + """ + path = get_path_from_url(url, WEIGHTS_HOME, md5sum) + return path + + +def _map_path(url, root_dir): + # parse path after download under root_dir + fname = osp.split(url)[-1] + fpath = fname + return osp.join(root_dir, fpath) + + +def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): + """Download from given url to root_dir. + if file or directory specified by url is exists under + root_dir, return the path directly, otherwise download + from url and decompress it, return the path. + Args: + url (str): download url + root_dir (str): root dir for downloading, it should be + WEIGHTS_HOME or DATASET_HOME + md5sum (str): md5 sum of download package + + Returns: + str: a local path to save downloaded models & weights & datasets. + """ + + assert is_url(url), "downloading from {} not a url".format(url) + # parse path after download to decompress under root_dir + fullpath = _map_path(url, root_dir) + + if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum): + logger.info("Found {}".format(fullpath)) + else: + fullpath = _download(url, root_dir, md5sum) + + if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath): + fullpath = _decompress(fullpath) + + # model tokenizer config, [file-lock] + return fullpath + + +def get_path_from_url_with_filelock( + url: str, root_dir: str, md5sum: Optional[str] = None, check_exist: bool = True, timeout: float = -1 +) -> str: + """construct `get_path_from_url` for `model_utils` to enable downloading multiprocess-safe + + Args: + url (str): the url of resource file + root_dir (str): the local download path + md5sum (str, optional): md5sum string for file. Defaults to None. + check_exist (bool, optional): whether check the file is exist. Defaults to True. + timeout (int, optional): the timeout for downloading. Defaults to -1. + + Returns: + str: the path of downloaded file + """ + + os.makedirs(root_dir, exist_ok=True) + + # create lock file, which is empty, under the `LOCK_FILE_HOME` directory. + lock_file_name = hashlib.md5((url + root_dir).encode("utf-8")).hexdigest() + + # create `.lock` private directory in the cache dir + lock_file_path = os.path.join(root_dir, ".lock", lock_file_name) + + os.makedirs(os.path.dirname(lock_file_path), exist_ok=True) + + with FileLock(lock_file_path, timeout=timeout): + result = get_path_from_url(url=url, root_dir=root_dir, md5sum=md5sum, check_exist=check_exist) + return result + + +def _download(url, path, md5sum=None): + """ + Download from url, save to path. + url (str): download url + path (str): download to given path + """ + os.makedirs(path, exist_ok=True) + + fname = osp.split(url)[-1] + fullname = osp.join(path, fname) + retry_cnt = 0 + + while not (osp.exists(fullname) and _md5check(fullname, md5sum)): + if retry_cnt < DOWNLOAD_RETRY_LIMIT: + retry_cnt += 1 + else: + raise RuntimeError("Download from {} failed. 
" "Retry limit reached".format(url)) + + logger.info("Downloading {} from {}".format(fname, url)) + + req = requests.get(url, stream=True) + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get("content-length") + with open(tmp_fullname, "wb") as f: + if total_size: + with tqdm(total=int(total_size), unit="B", unit_scale=True, unit_divisor=1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(len(chunk)) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _md5check(fullname, md5sum=None): + if md5sum is None: + return True + + logger.info("File {} md5 checking...".format(fullname)) + md5 = hashlib.md5() + with open(fullname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + md5.update(chunk) + calc_md5sum = md5.hexdigest() + + if calc_md5sum != md5sum: + logger.info("File {} md5 check failed, {}(calc) != " "{}(base)".format(fullname, calc_md5sum, md5sum)) + return False + return True + + +def _md5(text): + """ + Calculate the md5 value of the input text. + """ + + md5code = hashlib.md5(text.encode()) + return md5code.hexdigest() + + +def _decompress(fname): + """ + Decompress for zip and tar file + """ + logger.info("Decompressing {}...".format(fname)) + + # For protecting decompressing interupted, + # decompress to fpath_tmp directory firstly, if decompress + # successed, move decompress files to fpath and delete + # fpath_tmp and remove download compress file. 
+
+    if tarfile.is_tarfile(fname):
+        uncompressed_path = _uncompress_file_tar(fname)
+    elif zipfile.is_zipfile(fname):
+        uncompressed_path = _uncompress_file_zip(fname)
+    else:
+        raise TypeError("Unsupported compress file type {}".format(fname))
+
+    return uncompressed_path
+
+
+def _uncompress_file_zip(filepath):
+    files = zipfile.ZipFile(filepath, "r")
+    file_list = files.namelist()
+
+    file_dir = os.path.dirname(filepath)
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+
+        for item in file_list:
+            files.extract(item, file_dir)
+
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+
+        for item in file_list:
+            files.extract(item, file_dir)
+
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        if not os.path.exists(uncompressed_path):
+            os.makedirs(uncompressed_path)
+        for item in file_list:
+            files.extract(item, os.path.join(file_dir, rootpath))
+
+    files.close()
+
+    return uncompressed_path
+
+
+def _uncompress_file_tar(filepath, mode="r:*"):
+    files = tarfile.open(filepath, mode)
+    file_list = files.getnames()
+    file_dir = os.path.dirname(filepath)
+
+    if _is_a_single_file(file_list):
+        rootpath = file_list[0]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        files.extractall(file_dir, files.getmembers())
+    elif _is_a_single_dir(file_list):
+        rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        files.extractall(file_dir, files.getmembers())
+    else:
+        rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
+        uncompressed_path = os.path.join(file_dir, rootpath)
+        if not os.path.exists(uncompressed_path):
+            os.makedirs(uncompressed_path)
+
+        files.extractall(os.path.join(file_dir, rootpath), files.getmembers())
+
+    files.close()
+
+    return uncompressed_path
+
+
+def _is_a_single_file(file_list):
+    # A single entry with no path separator means the archive contains just one file.
+    # (`str.find` returns -1 when the separator is absent, hence the `< 0` check.)
+    if len(file_list) == 1 and file_list[0].find(os.sep) < 0:
+        return True
+    return False
+
+
+def _is_a_single_dir(file_list):
+    new_file_list = []
+    for file_path in file_list:
+        if "/" in file_path:
+            file_path = file_path.replace("/", os.sep)
+        elif "\\" in file_path:
+            file_path = file_path.replace("\\", os.sep)
+        new_file_list.append(file_path)
+
+    file_name = new_file_list[0].split(os.sep)[0]
+    for i in range(1, len(new_file_list)):
+        if file_name != new_file_list[i].split(os.sep)[0]:
+            return False
+    return True
+
+
+class DownloaderCheck(threading.Thread):
+    """
+    Check the resource applicability when downloading the models.
+ """ + + def __init__(self, task, command="taskflow", addition=None): + threading.Thread.__init__(self) + self.command = command + self.task = task + self.addition = addition + self._initialize() + + def uri_path(self, server_url, api): + srv = server_url + if server_url.endswith("/"): + srv = server_url[:-1] + if api.startswith("/"): + srv += api + else: + api = "/" + api + srv += api + return srv + + def _initialize(self): + etime = str(int(time.time())) + self.full_hash_flag = _md5(str(uuid.uuid1())[-12:]) + self.hash_flag = _md5(str(uuid.uuid1())[9:18]) + "-" + etime + + def request_check(self, task, command, addition): + if task is None: + return SUCCESS_STATUS + payload = {"word": self.task} + api_url = self.uri_path(DOWNLOAD_SERVER, "stat") + cache_path = os.path.join("~") + if os.path.exists(cache_path): + extra = { + "command": self.command, + "mtime": os.stat(cache_path).st_mtime, + "hub_name": self.hash_flag, + "cache_info": self.full_hash_flag, + } + else: + extra = { + "command": self.command, + "mtime": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), + "hub_name": self.hash_flag, + "cache_info": self.full_hash_flag, + } + if addition is not None: + extra.update({"addition": addition}) + try: + import paddle + + import paddlenlp + + payload["hub_version"] = " " + payload["ppnlp_version"] = paddlenlp.__version__ + payload["paddle_version"] = paddle.__version__.split("-")[0] + payload["from"] = "ppnlp" + payload["extra"] = json.dumps(extra) + r = requests.get(api_url, payload, timeout=1).json() + if r.get("update_cache", 0) == 1: + return SUCCESS_STATUS + else: + return FAILED_STATUS + except Exception: + return FAILED_STATUS + + def run(self): + self.request_check(self.task, self.command, self.addition) + + +def download_check(model_id, model_class, addition=None): + logger.disable() + global DOWNLOAD_CHECK + if not DOWNLOAD_CHECK: + DOWNLOAD_CHECK = True + checker = DownloaderCheck(model_id, model_class, addition) + checker.start() + checker.join() + logger.enable() + + +def url_file_exists(url: str) -> bool: + """check whether the url file exists + + refer to: https://stackoverflow.com/questions/2486145/python-check-if-url-to-jpg-exists + + Args: + url (str): the url of target file + + Returns: + bool: whether the url file exists + """ + if not is_url(url): + return False + + result = requests.head(url) + return result.status_code == requests.codes.ok + + +def hf_file_exists( + repo_id: str, filename: str, token: Union[bool, str, None] = None, subfolder: Optional[str] = None +) -> bool: + """Check whether the HF file exists + + Args: + repo_id (`str`): A namespace (user or an organization) name and a repo name separated by a `/`. + filename (`str`): The name of the file in the repo. + token (`str` or `bool`, *optional*): A token to be used for the download. + - If `True`, the token is read from the HuggingFace config folder. + - If `False` or `None`, no token is provided. + - If a string, it's used as the authentication token. + subfolder (str, optional) An optional value corresponding to a folder inside the repo. 
+ Returns: + bool: whether the HF file exists + """ + + url = hf_hub_url(repo_id=repo_id, filename=filename, subfolder=subfolder) + try: + _ = get_hf_file_metadata( + url=url, + token=token, + ) + return True + except EntryNotFoundError: + return False diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/env.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/env.py new file mode 100644 index 000000000..f57380fb4 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/env.py @@ -0,0 +1,113 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module is used to store environmental variables in PaddleNLP. +PPNLP_HOME --> the root directory for storing PaddleNLP related data. Default to ~/.paddlenlp. Users can change the +├ default value through the PPNLP_HOME environment variable. +├─ MODEL_HOME --> Store model files. +└─ DATA_HOME --> Store automatically downloaded datasets. +""" +import os + + +def _get_user_home(): + return os.path.expanduser("~") + + +def _get_ppnlp_home(): + if "PPNLP_HOME" in os.environ: + home_path = os.environ["PPNLP_HOME"] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError("The environment variable PPNLP_HOME {} is not a directory.".format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), ".paddlenlp") + + +def _get_sub_home(directory, parent_home=_get_ppnlp_home()): + home = os.path.join(parent_home, directory) + if not os.path.exists(home): + os.makedirs(home, exist_ok=True) + return home + + +def _get_bool_env(env_key: str, default_value: str) -> bool: + """get boolean environment variable, which can be "true", "True", "1" + + Args: + env_key (str): key of env variable + """ + value = os.getenv(env_key, default_value).lower() + return value in ["true", "1"] + + +USER_HOME = _get_user_home() +PPNLP_HOME = _get_ppnlp_home() +MODEL_HOME = _get_sub_home("models") +HF_CACHE_HOME = os.environ.get("HUGGINGFACE_HUB_CACHE", MODEL_HOME) +DATA_HOME = _get_sub_home("datasets") +PACKAGE_HOME = _get_sub_home("packages") +DOWNLOAD_SERVER = "http://paddlepaddle.org.cn/paddlehub" +FAILED_STATUS = -1 +SUCCESS_STATUS = 0 + +SPECIAL_TOKENS_MAP_NAME = "special_tokens_map.json" +ADDED_TOKENS_NAME = "added_tokens.json" +LEGACY_CONFIG_NAME = "model_config.json" +CONFIG_NAME = "config.json" +TOKENIZER_CONFIG_NAME = "tokenizer_config.json" +CHAT_TEMPLATE_CONFIG_NAME = "chat_template.json" +GENERATION_CONFIG_NAME = "generation_config.json" +# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file +FULL_TOKENIZER_NAME = "tokenizer.json" + + +LORA_CONFIG_NAME = "lora_config.json" +LORA_WEIGHTS_NAME = "lora_model_state.pdparams" + +VERA_CONFIG_NAME = "vera_config.json" +VERA_WEIGHTS_NAME = "vera_model_state.pdparams" + +PREFIX_CONFIG_NAME = "prefix_config.json" +PREFIX_WEIGHTS_NAME = "prefix_model_state.pdparams" 
+PADDLE_PEFT_WEIGHTS_INDEX_NAME = "peft_model.pdparams.index.json" + +PAST_KEY_VALUES_FILE_NAME = "pre_caches.npy" + +PADDLE_WEIGHTS_NAME = "model_state.pdparams" +PADDLE_WEIGHTS_INDEX_NAME = "model_state.pdparams.index.json" + +PYTORCH_WEIGHTS_NAME = "pytorch_model.bin" +PYTORCH_WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json" + +SAFE_WEIGHTS_NAME = "model.safetensors" +SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json" + +PADDLE_OPTIMIZER_NAME = "optimizer.pdopt" +PADDLE_OPTIMIZER_INDEX_NAME = "optimizer.pdopt.index.json" + +SAFE_OPTIMIZER_NAME = "optimizer.safetensors" +SAFE_OPTIMIZER_INDEX_NAME = "optimizer.safetensors.index.json" + +PADDLE_MASTER_WEIGHTS_NAME = "master_weights.pdparams" +PADDLE_MASTER_WEIGHTS_INDEX_NAME = "master_weights.pdparams.index.json" + +SAFE_MASTER_WEIGHTS_NAME = "master_weights.safetensors" +SAFE_MASTER_WEIGHTS_INDEX_NAME = "master_weights.safetensors.index.json" + +SAFE_PEFT_WEIGHTS_NAME = "peft_model.safetensors" +SAFE_PEFT_WEIGHTS_INDEX_NAME = "peft_model.safetensors.index.json" diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/ie_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/ie_utils.py new file mode 100644 index 000000000..0a8969bad --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/ie_utils.py @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from io import BytesIO + +import numpy as np +import paddle +from PIL import Image + +from ..metrics import SpanEvaluator +from .image_utils import NormalizeImage, Permute, ResizeImage + +resize_func = ResizeImage(target_size=224, interp=1) +norm_func = NormalizeImage(is_channel_first=False, mean=[123.675, 116.280, 103.530], std=[58.395, 57.120, 57.375]) +permute_func = Permute(to_bgr=False) + + +def map_offset(ori_offset, offset_mapping): + """ + map ori offset to token offset + """ + for index, span in enumerate(offset_mapping): + if span[0] <= ori_offset < span[1]: + return index + return -1 + + +def pad_image_data(image_data): + if not image_data: + image = np.zeros([3, 224, 224]) + return image + # decode image + data = np.frombuffer(bytearray(image_data), dtype="uint8") + image = np.array(Image.open(BytesIO(data)).convert("RGB")) + sample = {"image": image} + # resize image + sample = resize_func(sample) + # norm image + sample = norm_func(sample) + # permute + sample = permute_func(sample) + return sample["image"] + + +def unify_prompt_name(prompt): + # The classification labels are shuffled during finetuning, so they need + # to be unified during evaluation. 
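+    # Illustrative example: "sentiment[positive,negative]" and
+    # "sentiment[negative,positive]" both normalize to
+    # "sentiment[negative,positive]", because the class options inside the
+    # brackets are de-duplicated and sorted before being re-joined.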
+ if re.search(r"\[.*?\]$", prompt): + prompt_prefix = prompt[: prompt.find("[", 1)] + cls_options = re.search(r"\[.*?\]$", prompt).group()[1:-1].split(",") + cls_options = sorted(list(set(cls_options))) + cls_options = ",".join(cls_options) + prompt = prompt_prefix + "[" + cls_options + "]" + return prompt + return prompt + + +def get_relation_type_dict(relation_data, schema_lang="ch"): + def compare(a, b, schema_lang="ch"): + if schema_lang == "ch": + a = a[::-1] + b = b[::-1] + + res = "" + for i in range(min(len(a), len(b))): + if a[i] == b[i]: + res += a[i] + else: + break + if res == "": + return res + if schema_lang == "ch" and res[::-1][0] == "的": + return res[::-1][1:] + elif schema_lang == "en" and res[-3:] == " of": + return res[:-3] + return "" + + relation_type_dict = {} + added_list = [] + for i in range(len(relation_data)): + added = False + if relation_data[i][0] not in added_list: + for j in range(i + 1, len(relation_data)): + match = compare(relation_data[i][0], relation_data[j][0], schema_lang=schema_lang) + if match != "": + match = unify_prompt_name(match) + if relation_data[i][0] not in added_list: + added_list.append(relation_data[i][0]) + relation_type_dict.setdefault(match, []).append(relation_data[i][1]) + added_list.append(relation_data[j][0]) + relation_type_dict.setdefault(match, []).append(relation_data[j][1]) + added = True + if not added: + added_list.append(relation_data[i][0]) + if schema_lang == "ch": + suffix = relation_data[i][0].rsplit("的", 1)[1] + suffix = unify_prompt_name(suffix) + relation_type = suffix + else: + prefix = relation_data[i][0].split(" of ", 1)[0] + prefix = unify_prompt_name(prefix) + relation_type = prefix + relation_type_dict.setdefault(relation_type, []).append(relation_data[i][1]) + return relation_type_dict + + +def uie_loss_func(outputs, labels): + criterion = paddle.nn.BCELoss() + start_ids, end_ids = labels + start_prob, end_prob = outputs + start_ids = paddle.cast(start_ids, "float32") + end_ids = paddle.cast(end_ids, "float32") + loss_start = criterion(start_prob, start_ids) + loss_end = criterion(end_prob, end_ids) + loss = (loss_start + loss_end) / 2.0 + return loss + + +def compute_metrics(p): + metric = SpanEvaluator() + start_prob, end_prob = p.predictions + start_ids, end_ids = p.label_ids + metric.reset() + + num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids) + metric.update(num_correct, num_infer, num_label) + precision, recall, f1 = metric.accumulate() + metric.reset() + + return {"precision": precision, "recall": recall, "f1": f1} diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/image_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/image_utils.py new file mode 100644 index 000000000..708eb15cb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/image_utils.py @@ -0,0 +1,734 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
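+
+# Illustrative preprocessing sketch (not part of the upstream module): the
+# operators defined below are typically chained the way ie_utils.py does it,
+# e.g. for a dummy HWC uint8 image:
+#
+#   sample = {"image": np.zeros((224, 224, 3), dtype="uint8")}
+#   sample = ResizeImage(target_size=224, interp=1)(sample)
+#   sample = NormalizeImage(is_channel_first=False,
+#                           mean=[123.675, 116.280, 103.530],
+#                           std=[58.395, 57.120, 57.375])(sample)
+#   chw_image = Permute(to_bgr=False)(sample)["image"]  # HWC -> CHW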
+ +import base64 +import random +import re +import uuid +from collections.abc import Sequence +from functools import cmp_to_key +from io import BytesIO + +import numpy as np +from PIL import Image + + +class BaseOperator(object): + def __init__(self, name=None): + if name is None: + name = self.__class__.__name__ + self._id = name + "_" + str(uuid.uuid4())[-6:] + + def __call__(self, sample, context=None): + """Process a sample. + Args: + sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} + context (dict): info about this sample processing + Returns: + result (dict): a processed sample + """ + return sample + + def __str__(self): + return str(self._id) + + +class DecodeImage(BaseOperator): + def __init__(self): + """Transform the image data to numpy format.""" + super(DecodeImage, self).__init__() + + def __call__(self, sample, context=None): + """load image if 'im_file' field is not empty but 'image' is""" + if "image" not in sample: + sample["image"] = base64.b64decode(sample["im_base64"].encode("utf-8")) + + im = sample["image"] + data = np.frombuffer(bytearray(im), dtype="uint8") + im = np.array(Image.open(BytesIO(data)).convert("RGB")) # RGB format + sample["image"] = im + + if "h" not in sample: + sample["h"] = im.shape[0] + elif sample["h"] != im.shape[0]: + sample["h"] = im.shape[0] + if "w" not in sample: + sample["w"] = im.shape[1] + elif sample["w"] != im.shape[1]: + sample["w"] = im.shape[1] + + # make default im_info with [h, w, 1] + sample["im_info"] = np.array([im.shape[0], im.shape[1], 1.0], dtype=np.float32) + return sample + + +class ResizeImage(BaseOperator): + def __init__(self, target_size=0, interp=1): + """ + Rescale image to the specified target size, and capped at max_size + if max_size != 0. + If target_size is list, selected a scale randomly as the specified + target size. + Args: + target_size (int|list): the target size of image's short side, + multi-scale training is adopted when type is list. + interp (int): the interpolation method + """ + super(ResizeImage, self).__init__() + self.interp = int(interp) + if not (isinstance(target_size, int) or isinstance(target_size, list)): + raise TypeError( + "Type of target_size is invalid. Must be Integer or List, now is {}".format(type(target_size)) + ) + self.target_size = target_size + + def __call__(self, sample, context=None, save_real_img=False): + """Resize the image numpy.""" + im = sample["image"] + if not isinstance(im, np.ndarray): + raise TypeError("{}: image type is not numpy.".format(self)) + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + if isinstance(self.target_size, list): + # Case for multi-scale training + selected_size = random.choice(self.target_size) + else: + selected_size = self.target_size + if float(im_size_min) == 0: + raise ZeroDivisionError("{}: min size of image is 0".format(self)) + + resize_w = selected_size + resize_h = selected_size + + im = Image.fromarray(im.astype("uint8")) + im = im.resize((int(resize_w), int(resize_h)), self.interp) + sample["image"] = np.array(im) + return sample + + +class Permute(BaseOperator): + def __init__(self, to_bgr=True): + """ + Change the channel. 
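+        For an HWC input the output is CHW; when `to_bgr` is True the RGB
+        channels are additionally reversed to BGR.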
+ Args: + to_bgr (bool): confirm whether to convert RGB to BGR + """ + super(Permute, self).__init__() + self.to_bgr = to_bgr + + def __call__(self, sample, context=None): + samples = sample + batch_input = True + if not isinstance(samples, Sequence): + batch_input = False + samples = [samples] + for sample in samples: + assert "image" in sample, "image data not found" + for k in sample.keys(): + # hard code + if k.startswith("image"): + im = sample[k] + im = np.swapaxes(im, 1, 2) + im = np.swapaxes(im, 1, 0) + if self.to_bgr: + im = im[[2, 1, 0], :, :] + sample[k] = im + if not batch_input: + samples = samples[0] + return samples + + +class NormalizeImage(BaseOperator): + def __init__(self, mean=[0.485, 0.456, 0.406], std=[1, 1, 1], is_channel_first=True, is_scale=False): + """ + Args: + mean (list): the pixel mean + std (list): the pixel variance + channel_first (bool): confirm whether to change channel + """ + super(NormalizeImage, self).__init__() + self.mean = mean + self.std = std + self.is_channel_first = is_channel_first + self.is_scale = is_scale + from functools import reduce + + if reduce(lambda x, y: x * y, self.std) == 0: + raise ValueError("{}: std is invalid!".format(self)) + + def __call__(self, sample, context=None): + """Normalize the image. + Operators: + 1.(optional) Scale the image to [0,1] + 2. Each pixel minus mean and is divided by std + """ + samples = sample + batch_input = True + if not isinstance(samples, Sequence): + batch_input = False + samples = [samples] + for sample in samples: + for k in sample.keys(): + if k.startswith("image"): + im = sample[k] + im = im.astype(np.float32, copy=False) + if self.is_channel_first: + mean = np.array(self.mean)[:, np.newaxis, np.newaxis] + std = np.array(self.std)[:, np.newaxis, np.newaxis] + else: + mean = np.array(self.mean)[np.newaxis, np.newaxis, :] + std = np.array(self.std)[np.newaxis, np.newaxis, :] + if self.is_scale: + im = im / 255.0 + im -= mean + im /= std + sample[k] = im + if not batch_input: + samples = samples[0] + return samples + + +class PadBatch(BaseOperator): + """ + Pad a batch of samples so they can be divisible by a stride. + The layout of each image should be 'CHW'. + Args: + pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure + height and width is divisible by `pad_to_stride`. + """ + + def __init__(self, pad_to_stride=0, use_padded_im_info=True): + super(PadBatch, self).__init__() + self.pad_to_stride = pad_to_stride + self.use_padded_im_info = use_padded_im_info + + def __call__(self, samples, context=None): + """ + Args: + samples (list): a batch of sample, each is dict. 
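+        Returns:
+            samples (list): the same batch, with each image zero-padded so its
+                height and width are divisible by `pad_to_stride`.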
+ """ + coarsest_stride = self.pad_to_stride + if coarsest_stride == 0: + return samples + max_shape = np.array([data["image"].shape for data in samples]).max(axis=0) + + if coarsest_stride > 0: + max_shape[1] = int(np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) + max_shape[2] = int(np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) + + for data in samples: + im = data["image"] + im_c, im_h, im_w = im.shape[:] + padding_im = np.zeros((im_c, max_shape[1], max_shape[2]), dtype=np.float32) + padding_im[:, :im_h, :im_w] = im + data["image"] = padding_im + if self.use_padded_im_info: + data["im_info"][:2] = max_shape[1:3] + return samples + + +def check(s): + """Check whether is English""" + my_re = re.compile(r"[A-Za-z0-9]", re.S) + res = re.findall(my_re, s) + if len(res): + return True + return False + + +def img2base64(img_path): + """get base64""" + with open(img_path, "rb") as f: + base64_str = base64.b64encode(f.read()).decode("utf-8") + return base64_str + + +def np2base64(image_np): + img = Image.fromarray(image_np) + base64_str = pil2base64(img) + return base64_str + + +def pil2base64(image, image_type=None, size=False): + if not image_type: + image_type = "JPEG" + img_buffer = BytesIO() + image.save(img_buffer, format=image_type) + + byte_data = img_buffer.getvalue() + base64_str = base64.b64encode(byte_data) + + base64_string = base64_str.decode("utf-8") + + if size: + return base64_string, image.size + else: + return base64_string + + +class Bbox(object): + """ + The inner store format of `Bbox` is (left, top, width, height). + + The user may instance plenty of `Bbox`, thats why we insist the `Bbox` only contains four variables. + """ + + __slots__ = ["_c_left", "_c_top", "_c_width", "_c_height"] + + def __init__(self, left=0, top=0, width=0, height=0): + """ + Constructor of `Bbox`. + + >> left: The most left position of bounding box. + >> right: The most right position of bounding box. + >> width: The width of bounding box. + >> height: The height of bounding box. + + ^^ AssertionError: width and height must larger than 0. + """ + assert width >= 0, "width {} must no less than 0".format(width) + assert height >= 0, "height {} must no less than 0".format(height) + self._c_left, self._c_top, self._c_width, self._c_height = left, top, width, height + + def __str__(self): + """ + Reload the `str` operator. + """ + return repr(self) + + def __repr__(self): + """ + Reload the `repr` operator. + """ + return "(x={}, y={}, w={}, h={})".format(self.left, self.top, self.width, self.height) + + def __eq__(self, other): + """ + if `self` is equal with given `other` box. + + >> other: The comparing box instance. + + << True if two box is equal else False. + """ + return ( + self.left == other.left + and self.top == other.top + and self.width == other.width + and self.height == other.height + ) + + def tuple(self, precision=3): + """ + Return the tuple format box. + """ + return tuple(round(one, precision) for one in (self.left, self.top, self.width, self.height)) + + def list_int(self): + """ + Return the list(int) format box. + """ + return list(int(one) for one in (self.left, self.top, self.width, self.height)) + + def points_tuple(self, precision=3): + """ + Return the coordinate of box + """ + return tuple(round(one, precision) for one in (self.left, self.top, self.right, self.bottom)) + + @property + def left(self): + """ + Visit the most left position of bounding box. 
+ """ + return self._c_left + + @left.setter + def left(self, left): + """ + Set the most left position of bounding box. + """ + self._c_left = left + + @property + def right(self): + """ + Visit the most right position of bounding box. + """ + return self._c_left + self._c_width + + @right.setter + def right(self, right): + """ + Set the most right position of bounding box. + + ^^ AssertionError: when right is less than left. + """ + assert right >= self._c_left, "right {} < left {} is forbidden.".format(right, self._c_left) + self._c_width = right - self._c_left + + @property + def top(self): + """ + Visit the most top position of bounding box. + """ + return self._c_top + + @top.setter + def top(self, top): + """ + Set the most top position of bounding box. + """ + self._c_top = top + + @property + def bottom(self): + """ + Visit the most bottom position of bounding box. + """ + return self._c_top + self._c_height + + @bottom.setter + def bottom(self, bottom): + """ + Set the most bottom position of bounding box. + + ^^ AssertionError: when bottom is less than top. + """ + assert bottom >= self._c_top, "top {} > bottom {} is forbidden.".format(self._c_top, bottom) + self._c_height = bottom - self._c_top + + @property + def width(self): + """ + Visit the width of bounding box. + """ + return self._c_width + + @width.setter + def width(self, width): + """ + Set the width of bounding box. + + ^^ AssertionError: when width is less than 0. + """ + assert width >= 0, "width {} < 0 is forbidden.".format(width) + self._c_width = width + + @property + def height(self): + """ + Visit the height of bounding box. + """ + return self._c_height + + @height.setter + def height(self, height): + """ + Set the height of bounding box. + + ^^ AssertionError: when height is less than 0. + """ + assert height >= 0, "height {} < 0 is forbidden.".format(height) + self._c_height = height + + def is_cross_boundary(self, width, height, top=0, left=0): + """ + If this box is cross boundary of given boundary. The boundary is start at (0, 0) by default. + + >> width: The width of boundary. + >> height: The height of boundary. + >> top: The top-left point location. Default at (0, 0) + >> left: The top-left point location. Default at (0, 0) + """ + boundary = Bbox(top, left, width, height) + return boundary.contain(self) + + def is_vertical(self): + """ + If this box is vertical. + """ + return self.width < self.height + + def is_horizontal(self): + """ + If this box is horizontal. + """ + return self.width > self.height + + def is_square(self): + """ + If this box is square. + """ + return self.width == self.height + + def center(self): + """ + Return the center point of this box. + """ + return (self.left + self.width / 2.0, self.top + self.height / 2.0) + + def points(self): + """ + Convert bounding box to main corner points (left, top) + (right, bottom). + + << Two tuple of points, left-top and right-bottom respectively. + """ + return (self.left, self.top), (self.right, self.bottom) + + def contain(self, box): + """ + If given `box` is contained by `self`. + + >> box: The box supposed to be contained. + + << True if `self` contains `box` else False + """ + return self.left <= box.left and self.top <= box.top and self.right >= box.right and self.bottom >= box.bottom + + def overlap_vertically(self, box): + """ + If given `box` is overlap with `self` vertically. + + >> box: The comparing box. + + << True if overlap with each others vertically else False. 
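+        e.g. Bbox(0, 0, 10, 10).overlap_vertically(Bbox(100, 5, 10, 10)) is True,
+        since only the vertical extents are compared.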
+ """ + return not (self.top >= box.bottom or self.bottom <= box.top) + + def overlap_horizontally(self, box): + """ + If given `box` is overlap with `self` horizontally. + + >> box: The comparing box. + + << True if overlap with each others horizontally else False. + """ + return not (self.left >= box.right or self.right <= box.left) + + def overlap(self, box): + """ + If given `box` is overlap with `self`. + + >> box: The comparing box. + + << True if overlap with each others else False. + """ + return self.overlap_horizontally(box) and self.overlap_vertically(box) + + def hoverlap(self, box): + """ + The value of overlapped horizontally. + + >> box: The calculating box. + """ + if not self.overlap_horizontally(box): + return 0 + + return min(self.right, box.right) - max(self.left, box.left) + + def voverlap(self, box): + """ + The value of overlap vertically. + + >> box: The calculating box. + """ + if not self.overlap_vertically(box): + return 0 + + return min(self.bottom, box.bottom) - max(self.top, box.top) + + def hdistance(self, box): + """ + The distance of two boxes horizontally. + + >> box: The calculating box. + """ + if self.overlap_horizontally(box): + return 0 + + return max(self.left, box.left) - min(self.right, box.right) + + def vdistance(self, box): + """ + The distance of two boxes vertically. + + >> box: The calculating box. + """ + if self.overlap_vertically(box): + return 0 + + return max(self.top, box.top) - min(self.bottom, box.bottom) + + def area(self): + """ + Calculate the area within the bounding box. + """ + return self.width * self.height + + def translate(self, vector): + """ + Translate box in the direction of vector + """ + return Bbox(self.left + vector[0], self.top + vector[1], self.width, self.height) + + @staticmethod + def union(*boxes): + """ + Calculate the union bounding box of given `boxes`. + + >> boxes: The boxes to calculate with. + + << The union `Bbox` of `boxes`. + """ + left, top = min([box.left for box in boxes]), min([box.top for box in boxes]) + right, bottom = max([box.right for box in boxes]), max([box.bottom for box in boxes]) + + return Bbox.from_points((left, top), (right, bottom)) + + @staticmethod + def adjacency(boxa, boxb): + """ + Calculate the adjacent bounding box of given boxes. + + >> boxa: The box to calculate with. + >> boxb: The box to calculate with. + + << The adjacent `Bbox` of boxes. + """ + horizon = [min(boxa.right, boxb.right), max(boxa.left, boxb.left)] + vertical = [min(boxa.bottom, boxb.bottom), max(boxa.top, boxb.top)] + + left, right = min(horizon), max(horizon) + top, bottom = min(vertical), max(vertical) + + return Bbox.from_points((left, top), (right, bottom)) + + @staticmethod + def intersection(*boxes): + """ + Calculate the intersection bounding box of given `boxes`. + + >> boxes: The boxes to calculate with. + + << The intersection `Bbox` of `boxes`. + """ + left, top = max(box.left for box in boxes), max(box.top for box in boxes) + right, bottom = min(box.right for box in boxes), min(box.bottom for box in boxes) + + if left > right or top > bottom: + return Bbox() + + return Bbox.from_points((left, top), (right, bottom)) + + @staticmethod + def iou(boxa, boxb): + """ + Calculate the union area divided by intersection area. + + >> boxa: The box to calculate with. + >> boxb: The box to calculate with. + """ + return Bbox.intersection(boxa, boxb).area() / Bbox.union(boxa, boxb).area() + + @staticmethod + def from_points(p0, p1): + """ + Convert main corner points to bounding box. 
+ + >> p0: The left-top points in (x, y). + >> p1: The right-bottom points in (x, y). + + << The instance of `Bbox`. + + ^^ AssertionError: if width or height is less than 0. + """ + assert p1[0] >= p0[0], "width {} must larger than 0.".format(p1[0] - p0[0]) + assert p1[1] >= p0[1], "height {} must larger than 0.".format(p1[1] - p0[1]) + + return Bbox(p0[0], p0[1], p1[0] - p0[0], p1[1] - p0[1]) + + +def two_dimension_sort_box(box1: Bbox, box2: Bbox, vratio=0.5): + """bbox sort 2D + + Args: + box1 (Bbox): [bbox1] + box2 (Bbox): [bbox2] + vratio (float, optional): [description]. Defaults to 0.5. + + Returns: + [type]: [description] + """ + kernel = [box1.left - box2.left, box1.top - box2.top] + if box1.voverlap(box2) < vratio * min(box1.height, box2.height): + kernel = [box1.top - box2.top, box1.left - box2.left] + return kernel[0] if kernel[0] != 0 else kernel[1] + + +def two_dimension_sort_layout(layout1, layout2, vratio=0.54): + """Layout sort""" + return two_dimension_sort_box(layout1["bbox"], layout2["bbox"]) + + +def ppocr2example(ocr_res, img_path): + """Transfer paddleocr result to example""" + segments = [] + for rst in ocr_res: + left = min(rst[0][0][0], rst[0][3][0]) + top = min(rst[0][0][-1], rst[0][1][-1]) + width = max(rst[0][1][0], rst[0][2][0]) - min(rst[0][0][0], rst[0][3][0]) + height = max(rst[0][2][-1], rst[0][3][-1]) - min(rst[0][0][-1], rst[0][1][-1]) + segments.append({"bbox": Bbox(*[left, top, width, height]), "text": rst[-1][0]}) + segments.sort(key=cmp_to_key(two_dimension_sort_layout)) + img_base64 = img2base64(img_path) + doc_tokens = [] + doc_boxes = [] + + im_w_box = max([seg["bbox"].left + seg["bbox"].width for seg in segments]) + 20 if segments else 0 + im_h_box = max([seg["bbox"].top + seg["bbox"].height for seg in segments]) + 20 if segments else 0 + img = Image.open(img_path) + im_w, im_h = img.size + im_w, im_h = max(im_w, im_w_box), max(im_h, im_h_box) + + for segment in segments: + bbox = segment["bbox"] + x1, y1, w, h = bbox.left, bbox.top, bbox.width, bbox.height + bbox = Bbox(*[x1, y1, w, h]) + text = segment["text"] + char_num = 0 + eng_word = "" + for char in text: + if not check(char) and not eng_word: + doc_tokens.append(char) + char_num += 1 + elif not check(char) and eng_word: + doc_tokens.append(eng_word) + eng_word = "" + doc_tokens.append(char) + char_num += 2 + else: + eng_word += char + if eng_word: + doc_tokens.append(eng_word) + char_num += 1 + char_width = int(w / char_num) + for char_idx in range(char_num): + doc_boxes.append([Bbox(*[bbox.left + (char_width * char_idx), bbox.top, char_width, bbox.height])]) + new_doc_boxes = [] + for doc_box in doc_boxes: + bbox = doc_box[0] + new_doc_boxes.append([bbox.left, bbox.top, bbox.right, bbox.bottom]) + doc_boxes = new_doc_boxes + example = {"text": doc_tokens, "bbox": doc_boxes, "width": im_w, "height": im_h, "image": img_base64} + return example diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/import_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/import_utils.py new file mode 100644 index 000000000..3da810b7b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/import_utils.py @@ -0,0 +1,202 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import importlib.util +import os +import shutil +import site +import sys +from typing import Optional, Type + +import pip + +from paddlenlp.utils.log import logger + + +def is_datasets_available(): + import importlib + + return importlib.util.find_spec("datasets") is not None + + +def is_paddle_cuda_available() -> bool: + if is_paddle_available(): + import paddle + + return paddle.device.cuda.device_count() > 0 + else: + return False + + +def is_paddle_available() -> bool: + """check if `torch` package is installed + Returns: + bool: if `torch` is available + """ + return is_package_available("paddle") + + +def is_psutil_available(): + return importlib.util.find_spec("psutil") is not None + + +def is_tiktoken_available(): + return importlib.util.find_spec("tiktoken") is not None + + +def is_torch_available() -> bool: + """check if `torch` package is installed + Returns: + bool: if `torch` is available + """ + return is_package_available("torch") + + +def is_package_available(package_name: str) -> bool: + """check if the package is avaliable + Args: + package_name (str): the installed package name + Returns: + bool: the existence of installed package + """ + package_spec = importlib.util.find_spec(package_name) + return package_spec is not None and package_spec.has_location + + +def is_fast_tokenizer_available() -> bool: + """check if `fast_tokenizer` ia avaliable + Returns: + bool: if `fast_tokenizer` is avaliable + """ + return is_package_available("fast_tokenizer") + + +def is_tokenizers_available() -> bool: + """check if `tokenizers` ia available + Returns: + bool: if `tokenizers` is available + """ + return is_package_available("tokenizers") + + +def is_paddlenlp_ops_available() -> bool: + """check if `paddlenlp_ops` ia avaliable + Returns: + bool: if `paddlenlp_ops` is avaliable + """ + return is_package_available("paddlenlp_ops") + + +def is_transformers_available() -> bool: + """check if `transformers` package is installed + Returns: + bool: if `transformers` is available + """ + return is_package_available("transformers") + + +def install_package( + package_name: str, + version: Optional[str] = None, + module_name: Optional[str] = None, + cache_dir: Optional[str] = None, +): + """install the specific version of package + + Args: + package_name (str): the name of package + version (str): the version of package + module_name (str): the imported name of package + cache_dir (str): cache dir + """ + module_name = module_name or package_name + + # 1. remove the existing version of package + uninstall_package(package_name, module_name) + + # 2. install the package + if version: + package_name += f"=={version}" + + arguments = ["install"] + if cache_dir: + arguments += ["-t", cache_dir] + sys.path.insert(0, cache_dir) + + # 3. 
load the pypi mirror to speedup of installing packages + mirror_key = "PYPI_MIRROR" + mirror_source = os.environ.get(mirror_key, None) + if mirror_source is not None: + logger.info(f"loading <{mirror_source}> from as the final mirror source to install package.") + arguments += ["-i", mirror_source] + + arguments += [package_name] + pip.main(arguments) + + # 4. add site-package to the top of package + for site_package_dir in site.getsitepackages(): + sys.path.insert(0, site_package_dir) + + +def uninstall_package(package_name: str, module_name: Optional[str] = None): + """uninstall the pacakge from site-packages. + + To remove the cache of source package module & class & method, it should: + 1. remove the source files of packages under the `site-packages` dir. + 2. remove the cache under the `locals()` + 3. remove the cache under the `sys.modules` + + Args: + package_name (str): the name of package + """ + module_name = module_name or package_name + for site_package_dir in site.getsitepackages(): + if os.path.exists(site_package_dir): + for file in os.listdir(site_package_dir): + package_dir = os.path.join(site_package_dir, file) + if file.startswith(package_name) and os.path.isdir(package_dir): + shutil.rmtree(package_dir) + + for site_package_dir in site.getsitepackages(): + while sys.path[0] == site_package_dir: + sys.path.pop(0) + + for key in list(locals().keys()): + if module_name in key: + del locals()[key] + + for key in list(sys.modules.keys()): + if module_name in key: + del sys.modules[key] + + +def import_module(module_name: str) -> Optional[Type]: + """import moudle base on the model + Args: + module_name (str): the name of target module + """ + # 1. prepare the name + assert "." in module_name, "`.` must be in the module_name" + index = module_name.rindex(".") + module = module_name[:index] + target_module_name = module_name[index + 1 :] + + # 2. get the target module name + try: + module = importlib.import_module(module) + target_module = getattr(module, target_module_name, None) + return target_module + except ModuleNotFoundError: + return None diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/initializer.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/initializer.py new file mode 100644 index 000000000..da90ed81e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/initializer.py @@ -0,0 +1,337 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
+""" + +import math + +import numpy as np +import paddle +import paddle.nn as nn + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + nn.initializer.Uniform(a, b)(tensor) + # tensor.uniform_(min=a, max=b) # NOTE uniform_ ops donot suprort on cpu + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.fill_(value) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.0): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. + Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError("Fan in and fan out can not be computed for tensor with fewer than 2 dimensions") + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format(mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == "fan_in" else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = ["linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", "conv_transpose2d", "conv_transpose3d"] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError("negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return 3.0 / 4 + else: + raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, a=0, mode="fan_in", nonlinearity="leaky_relu", reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) + + +def to( + self, + device=None, + dtype=None, + blocking=None, + floating_only=True, +): + """ + Cast the parameters and buffers of Layer by the give device, dtype and blocking. + + Parameters: + device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored. + If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. Default: None. + + dtype(str|numpy.dtype|paddle.dtype|None, optional): The type of the data. If None, the dtype is the same with the original Tensor. Default: None. + + blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be + asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. + + floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking. + + Returns: + self + + """ + + if floating_only and (not paddle.is_floating_point(self)): + return self + paddle.Tensor._to(self, device, dtype, blocking) + return self diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/llm_utils.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/llm_utils.py new file mode 100644 index 000000000..6ef5aae9d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/llm_utils.py @@ -0,0 +1,875 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import glob +import math +import os +import struct +from typing import Dict, List, Optional + +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.distributed.fleet.base.topology as tp +import paddle.incubate.multiprocessing as mp +from paddle.distributed import fleet +from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler +from sklearn.metrics import accuracy_score + +from paddlenlp.datasets import ZeroPaddingIterableDataset +from paddlenlp.generation import GenerationConfig +from paddlenlp.trainer import Trainer, TrainerCallback +from paddlenlp.trainer.trainer_utils import IterableDatasetShard, has_length +from paddlenlp.transformers import ( + AutoTokenizer, + ChatGLMv2Tokenizer, + LlamaForCausalLMPipe, + PretrainedConfig, + Qwen2ForCausalLMPipe, +) +from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer +from paddlenlp.utils.log import logger + + +def compute_metrics(eval_preds): + flattened_preds = np.array(eval_preds.predictions).flatten() + flattened_labels = np.array(eval_preds.label_ids).flatten() + filtered_preds = flattened_preds[flattened_labels != -100] + filtered_labels = flattened_labels[flattened_labels != -100] + accuracy = accuracy_score(y_true=filtered_labels, y_pred=filtered_preds) + return { + "accuracy": accuracy, + } + + +def get_prefix_tuning_params(model): + if model.base_model_prefix == "chatglm": + from paddlenlp.peft.prefix import chatglm_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_hidden_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = chatglm_postprocess_past_key_value + multi_query_group_num = None + elif model.base_model_prefix == "chatglm_v2": + from paddlenlp.peft.prefix import chatglm_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = chatglm_postprocess_past_key_value + multi_query_group_num = model.config.multi_query_group_num # num_key_value_heads + elif model.base_model_prefix == "bloom": + from paddlenlp.peft.prefix import bloom_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.n_layer + hidden_size = model.config.n_embed + postprocess_past_key_value = bloom_postprocess_past_key_value + multi_query_group_num = None + elif model.base_model_prefix == "llama": + from paddlenlp.peft.prefix import llama_postprocess_past_key_value + + num_attention_heads = model.config.n_head + num_hidden_layers = model.config.n_layer + hidden_size = model.config.hidden_size + postprocess_past_key_value = llama_postprocess_past_key_value + multi_query_group_num = None + elif model.base_model_prefix == "mistral": + from paddlenlp.peft.prefix import mistral_postprocess_past_key_value + + 
num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_hidden_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = mistral_postprocess_past_key_value + multi_query_group_num = model.config.num_key_value_heads + elif model.base_model_prefix == "qwen": + from paddlenlp.peft.prefix import qwen_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_hidden_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = qwen_postprocess_past_key_value + multi_query_group_num = None + elif model.base_model_prefix == "qwen2": + from paddlenlp.peft.prefix import qwen_postprocess_past_key_value + + num_attention_heads = model.config.num_attention_heads + num_hidden_layers = model.config.num_hidden_layers + hidden_size = model.config.hidden_size + postprocess_past_key_value = qwen_postprocess_past_key_value + multi_query_group_num = model.config.num_key_value_heads # num_key_value_heads + else: + raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}. ") + return dict( + num_attention_heads=num_attention_heads, + num_hidden_layers=num_hidden_layers, + hidden_size=hidden_size, + postprocess_past_key_value=postprocess_past_key_value, + multi_query_group_num=multi_query_group_num, + ) + + +def get_lora_target_modules(model): + # Not yet support RowParallelLinear + if model.base_model_prefix == "chatglm": + target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] + elif model.base_model_prefix == "chatglm_v2": + target_modules = [ + ".*query.*", + ".*key.*", + ".*value.*", + ".*dense.*", + ".*dense_h_to_4h.*", + ".*dense_4h_to_h.*", + ] + elif model.base_model_prefix == "gpt": + target_modules = [ + ".*qkv_proj.*", + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*linear1.*", + ".*linear2.*", + ".*out_proj.*", + ] + elif model.base_model_prefix == "bloom": + target_modules = [".*query_key_value.*", ".*dense.*", ".*dense_h_to_4h.*", ".*dense_4h_to_h.*"] + elif model.base_model_prefix in ["llama", "jamba"] or isinstance(model, LlamaForCausalLMPipe): + target_modules = [ + ".*q_proj.*", + ".*v_proj.*", + ".*k_proj.*", + ".*o_proj.*", + ".*qkv_proj.*", + ".*gate_proj.*", + ".*down_proj.*", + ".*up_proj.*", + ".*gate_up_fused_proj.*", + ] + elif model.base_model_prefix == "opt": + target_modules = [ + ".*project_in.*", + ".*project_out.*", + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*qkv_proj.*", + ".*out_proj.*", + ".*linear1.*", + ".*linear2.*", + ] + elif model.base_model_prefix == "qwen": + target_modules = [ + ".*attn.c_attn.*", + ".*attn.c_proj.*", + ".*mlp.w1.*", + ".*mlp.w2.*", + ".*mlp.c_proj.*", + ] + elif model.base_model_prefix == "qwen2" or isinstance(model, Qwen2ForCausalLMPipe): + target_modules = [ + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*o_proj.*", + ".*gate_proj.*", + ".*down_proj.*", + ".*up_proj.*", + ] + elif model.base_model_prefix == "mixtral": + target_modules = [ + ".*q_proj.*", + ".*k_proj.*", + ".*v_proj.*", + ".*o_proj.*", + # ".*gate.*", # TODO(DrownFish19): Does the gate weight require training? 
+            ".*w1.*",
+            ".*w2.*",
+            ".*w3.*",
+        ]
+    elif model.base_model_prefix == "mistral":
+        target_modules = [
+            ".*q_proj.*",
+            ".*k_proj.*",
+            ".*v_proj.*",
+            ".*o_proj.*",
+            ".*gate.*",
+            ".*w1.*",
+            ".*w2.*",
+            ".*w3.*",
+        ]
+    elif model.base_model_prefix == "qwen2_moe":
+        target_modules = [
+            ".*q_proj.*",
+            ".*k_proj.*",
+            ".*v_proj.*",
+            ".*o_proj.*",
+            # ".*gate.*", # TODO(DrownFish19): Does the gate weight require training?
+            ".*gate_proj.*",
+            ".*up_proj.*",
+            ".*down_proj.*",
+        ]
+    elif model.base_model_prefix == "yuan":
+        target_modules = [
+            ".*q_proj.*",
+            ".*k_proj.*",
+            ".*v_proj.*",
+            ".*o_proj.*",
+            ".*gate_proj.*",
+            ".*up_proj.*",
+            ".*down_proj.*",
+        ]
+    else:
+        raise ValueError(f"Unknown base_model_prefix: {model.base_model_prefix}.")
+    return target_modules
+
+
+class ZeroPaddingIterDatasetCallback(TrainerCallback):
+    """
+    A [`TrainerCallback`] that tracks a `ZeroPaddingIterableDataset`: after each
+    training step it records the dataset's `zero_padding_global_step` in
+    `state.trial_params`.
+    """
+
+    def on_step_end(self, args, state, control, **kwargs):
+        train_dataloader = kwargs["train_dataloader"]
+        if isinstance(train_dataloader.dataset, ZeroPaddingIterableDataset):
+            dataset = train_dataloader.dataset
+        elif isinstance(train_dataloader.dataset, IterableDatasetShard) and isinstance(
+            train_dataloader.dataset.dataset, ZeroPaddingIterableDataset
+        ):
+            dataset = train_dataloader.dataset.dataset
+        else:
+            raise ValueError(
+                "Unexpected dataset format: ZeroPaddingIterDatasetCallback expects `paddlenlp.datasets.ZeroPaddingIterableDataset`"
+            )
+        if state.trial_params is None:
+            state.trial_params = {}
+        state.trial_params["zero_padding_global_step"] = dataset.zero_padding_global_step
+
+
+class CausalLMTrainer(Trainer):
+    def __init__(self, do_generation: bool, gen_args, data_args, **kwargs):
+        super().__init__(**kwargs)
+        self.do_generation = do_generation
+        self.gen_args = gen_args
+        self.data_args = data_args
+
+    def prediction_step(
+        self,
+        model,
+        inputs,
+        prediction_loss_only: bool,
+        ignore_keys=None,
+    ):
+        if prediction_loss_only or self.args.pipeline_parallel_degree > 1:
+            return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
+        elif not self.do_generation:
+            loss, logits, labels = super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
+            # argmax here to avoid gather all logits, which is too memory-consuming.
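+            # (only the predicted token ids are needed by accuracy-style metrics
+            # such as compute_metrics above, so gathering the full logits is avoided)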
+ # keepdim in order to maintain the same shape as logits + if isinstance(logits, (list, tuple)): + logits = logits[0] + # all gather logits when enabling tensor_parallel_output + if self.args.tensor_parallel_degree > 1 and getattr(self.args, "tensor_parallel_output", False): + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + gathered_logits = [] + dist.all_gather(gathered_logits, logits, group=model_parallel_group) + logits = paddle.concat(gathered_logits, axis=-1) + return (loss, logits.argmax(axis=-1, keepdim=True), labels) + + loss = None + + model.eval() + with paddle.no_grad(): + generated_tokens = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"] if "attention_mask" in inputs else None, + position_ids=inputs["position_ids"] if "position_ids" in inputs else None, + max_length=max(self.data_args.max_length - inputs["input_ids"].shape[-1], 1), + decode_strategy="sampling", + top_k=self.gen_args.top_k, + top_p=self.gen_args.top_p, + bos_token_id=self.tokenizer.bos_token_id, + eos_token_id=self.tokenizer.eos_token_id, + pad_token_id=self.tokenizer.pad_token_id, + use_cache=True, + )[0] + all_preds = [] + for pred_tokens in generated_tokens: + pred_tokens = pred_tokens.numpy() + pred_tokens = pred_tokens[pred_tokens != self.tokenizer.pad_token_id].tolist() + all_preds.append(pred_tokens) + max_pred_length = max([len(x) for x in all_preds]) + for index, preds in enumerate(all_preds): + all_preds[index] = preds + [-100] * (max_pred_length - len(preds)) + all_preds = paddle.to_tensor(all_preds) + + if "labels" in inputs: + all_labels = paddle.to_tensor(inputs["labels"]) + else: + all_labels = None + + return (loss, all_preds, all_labels) + + def log(self, logs: Dict[str, float], **kwargs) -> None: + if "loss" in logs: + logs["ppl"] = np.exp(logs["loss"]) + if "eval_loss" in logs: + logs["eval_ppl"] = np.exp(logs["eval_loss"]) + + super(CausalLMTrainer, self).log(logs, **kwargs) + + def get_ptq_dataloader(self, ptq_ds): + if self.args.world_size <= 1: + ptq_sampler = BatchSampler( + dataset=ptq_ds, + shuffle=True, + batch_size=self.args.per_device_train_batch_size, + drop_last=self.args.dataloader_drop_last, + ) + else: + ptq_sampler = DistributedBatchSampler( + self.train_dataset, + batch_size=self.args.per_device_train_batch_size, + shuffle=True, + num_replicas=self.args.dataset_world_size, + rank=self.args.dataset_rank, + drop_last=self.args.dataloader_drop_last, + ) + ptq_dataloader = DataLoader( + ptq_ds, + batch_sampler=ptq_sampler, + collate_fn=self.data_collator, + num_workers=self.args.dataloader_num_workers, + ) + return ptq_dataloader + + def ptq_loop( + self, + dataloader: DataLoader, + description: str, + max_eval_iters: Optional[int] = -1, + ): + if isinstance(dataloader, paddle.io.DataLoader): + batch_size = dataloader.batch_sampler.batch_size + else: + raise ValueError("Only support for paddle.io.DataLoader") + + if has_length(dataloader): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + if max_eval_iters > 0: + logger.info(f" Total {description} steps = {max_eval_iters}") + else: + logger.info(f" Total {description} steps = {len(dataloader)}") + else: + logger.info(" Num examples: Unknown") + if max_eval_iters > 0: + logger.info(f" Total {description} steps = {max_eval_iters}") + + logger.info(f" Pre device batch size = {batch_size}") + logger.info(f" Total Batch size = {batch_size * self.args.dataset_world_size}") + self.model.eval() + with paddle.no_grad(): + for 
step, inputs in enumerate(dataloader):
+                self.prediction_step(model=self.model, inputs=inputs, prediction_loss_only=True, ignore_keys=None)
+                if max_eval_iters > 0 and step >= max_eval_iters - 1:
+                    break
+
+
+def get_infer_model_path(input_dir, model_prefix):
+    if dist.get_world_size() > 1:
+        local_rank = dist.get_rank()
+        return os.path.join(input_dir, "rank_{}".format(local_rank), model_prefix)
+    else:
+        return os.path.join(input_dir, model_prefix)
+
+
+def generate_rank_mapping(output_filename):
+    ring_id = -1
+    try:
+        hcg = fleet.get_hybrid_communicate_group()
+        model_parallel_group = hcg.get_model_parallel_group()
+        ring_id = model_parallel_group.id
+    except Exception:
+        pass
+
+    if ring_id == -1:
+        return
+
+    world_size = dist.get_world_size()
+    with open(output_filename, "w") as f:
+        f.write("[ring_id -> ranks]\n")
+        f.write(",".join(map(str, [0] + list(range(world_size)))) + "\n")
+        f.write(",".join(map(str, [ring_id] + list(range(world_size)))) + "\n")
+
+        f.write("[rank -> ring_ids]\n")
+        for i in range(world_size):
+            f.write("{},0,{}\n".format(i, ring_id))
+
+
+def deserialize_from_file(fp):
+    x_type = fp.read(1)
+    x_type_out = struct.unpack("c", x_type)[0]
+    # data
+    data_list = []
+    if x_type_out == b"0":
+        data = fp.read(4)
+        data_out = struct.unpack("f", data)[0]
+        while data:
+            data_out = struct.unpack("f", data)[0]
+            data_list.append(data_out)
+            data = fp.read(4)
+    elif x_type_out == b"1":
+        data = fp.read(8)
+        while data:
+            data_out = struct.unpack("l", data)[0]
+            data_list.append(data_out)
+            data = fp.read(8)
+    elif x_type_out == b"2":
+        data = fp.read(4)
+        while data:
+            data_out = struct.unpack("i", data)[0]
+            data_list.append(data_out)
+            data = fp.read(4)
+    else:
+        print("type error")
+    data_arr = np.array(data_list)
+    return data_arr
+
+
+def get_alibi_slopes(num_heads):
+    closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
+    base = 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3)))
+    powers = np.arange(1, 1 + closest_power_of_2)
+    slopes = np.power(base, powers)
+
+    if closest_power_of_2 != num_heads:
+        extra_base = 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3)))
+        num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
+        extra_powers = np.arange(1, 1 + 2 * num_remaining_heads, 2)
+        slopes = np.concatenate([slopes, np.power(extra_base, extra_powers)], axis=0)
+
+    return slopes.astype("float32")
+
+
+def pad_batch_data(insts, pad_id=0, return_seq_len=False, pad_style="right"):
+    """Pad sequences to the max sequence length in batch."""
+    max_len = max(map(len, insts))
+    if pad_style == "left":
+        inst_data = np.array([[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts])
+    else:
+        inst_data = np.array([list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts])
+
+    if return_seq_len:
+        seq_len = np.array([len(inst) for inst in insts])
+        return inst_data.astype("int64").reshape([-1, max_len]), seq_len
+    else:
+        return inst_data.astype("int64").reshape([-1, max_len])
+
+
+def dybatch_preprocess(
+    tokenizer,
+    texts: list[str],
+    src_length: int,
+    max_length: int,
+    architectures: str,
+    top_p: float,
+    temperature: float,
+    eos_token_id: int | list[list[int]],
+    pre_caches_length: int = 0,
+    benchmark: bool = False,
+):
+    """Pre-process generation inputs."""
+    inputs = {}
+    if "chatglmforcausallm" == architectures.lower():
+        input_ids = []
+        position_ids = []
+
+        for text in texts:
+            tokens = tokenizer(
+                text,
+                return_tensors="np",
+                padding=True,
+                max_length=src_length,
+                # if use chat_template, it will not add
special_tokens + add_special_tokens=tokenizer.chat_template is None or isinstance(tokenizer, ChatGLMv2Tokenizer), + ) + input_ids.append(tokens["input_ids"][0]) + position_ids.append(tokens["position_ids"][0]) + + pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][0] + inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True) + bs = inputs["input_ids"].shape[0] + max_len = max(map(len, input_ids)) + + inst_data_pos = [] + for i in range(len(position_ids)): + inst_data_pos.append(np.array([list(inst) + [0] * (max_len - len(inst)) for inst in position_ids[i]])) + inputs["position_ids"] = paddle.to_tensor(np.array(inst_data_pos)) + elif "gpt" in architectures: + input_ids = [] + if isinstance(texts, str): + texts = [texts] + + for text in texts: + tokens = tokenizer( + text, + return_tensors="np", + padding=False, + max_length=src_length, + return_attention_mask=False, + return_token_type_ids=False, + ) + input_ids.append(tokens["input_ids"][0]) + + pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][-1] + inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True) + bs = inputs["input_ids"].shape[0] + max_len = max(map(len, input_ids)) + + position_ids = paddle.arange(sum(seq_len), dtype="int64") + pre_len = seq_len[0] + for length in seq_len[1:]: + position_ids[pre_len : length + pre_len] = position_ids[pre_len : length + pre_len] - pre_len + pre_len += length + inputs["position_ids"] = position_ids + else: + input_ids = [] + if isinstance(texts, str): + texts = [texts] + + for text in texts: + tokens = tokenizer( + text, + return_tensors="np", + padding=False, + max_length=src_length, + return_attention_mask=False, + return_token_type_ids=False, + add_special_tokens=tokenizer.chat_template is None or isinstance(tokenizer, ChatGLMv2Tokenizer), + ) + input_ids.append(tokens["input_ids"][0]) + + pad_token_id = tokenizer([tokenizer.pad_token], return_tensors="np")["input_ids"][0][-1] + inputs["input_ids"], seq_len = pad_batch_data(input_ids, pad_id=pad_token_id, return_seq_len=True) + bs = inputs["input_ids"].shape[0] + max_len = max(map(len, input_ids)) + + position_ids = paddle.zeros(shape=[bs, max_length + src_length], dtype="int64") + + for i in range(bs): + position_ids[i, pre_caches_length : pre_caches_length + seq_len[i]] = paddle.arange(seq_len[i]) + inputs["position_ids"] = position_ids + + tgt_ids = [input[-1:] for input in input_ids] + tgt_pos = [] + for i, valid_len in enumerate(map(len, input_ids)): + tgt_pos.append(valid_len - 1) + + step_idx = [ + 0, + ] * bs + tgt_pos = np.array(tgt_pos).astype("int64") + + if isinstance(eos_token_id, int): + eos_token_id = [eos_token_id] + + inputs["eos_token_id"] = np.array(eos_token_id * bs).reshape(-1, 1).astype("int64") + + inputs["top_p"] = ( + np.array( + [ + top_p, + ] + * bs + ) + .reshape(-1, 1) + .astype("float32") + ) + inputs["temperature"] = ( + np.array( + [ + temperature, + ] + * bs + ) + .reshape(-1, 1) + .astype("float32") + ) + inputs["seq_len_encoder"] = seq_len.astype("int32").reshape(-1, 1) + inputs["seq_len_decoder"] = (seq_len + pre_caches_length).astype("int32").reshape(-1, 1) + inputs["step_idx"] = np.array(step_idx).astype("int64").reshape(-1, 1) + inputs["tgt_ids"] = np.array(tgt_ids).astype("int64").reshape(-1, 1) + inputs["tgt_pos"] = tgt_pos.reshape(-1, 1) + inputs["max_length"] = np.array(max_length - pre_caches_length).astype("int64").reshape((-1, 1)) + 
inputs["min_length"] = ( + np.array( + [ + 1 + if not benchmark + else max_length + - pre_caches_length, # Note(Zhengzekang): When in benchmark mode, we need to set a fixed decode length. + ] + * bs + ) + .astype("int64") + .reshape((-1, 1)) + ) + inputs["penalty_score"] = ( + np.array( + [ + 1.0, + ] + * bs + ) + .astype("float32") + .reshape((-1, 1)) + ) + inputs["frequency_score"] = ( + np.array( + [ + 0.0, + ] + * bs + ) + .astype("float32") + .reshape((-1, 1)) + ) + inputs["presence_score"] = ( + np.array( + [ + 0.0, + ] + * bs + ) + .astype("float32") + .reshape((-1, 1)) + ) + inputs["stop_flags"] = ( + np.array( + [ + 0, + ] + * bs + ) + .astype("bool") + .reshape((-1, 1)) + ) + inputs["stop_nums"] = np.array([bs]).astype("int64") + return inputs + + +def load_real_time_tokens(): + tokens = [] + files = glob.glob(os.path.join("./real_time_save.*")) + for j in range(1, len(files) + 1): + filename = "./real_time_save.temp_ids_rank_0_step_{}".format(j) + if not os.path.exists(filename): + break + fp = open(filename, "rb+") + fp.read(1) + data_list = deserialize_from_file(fp) + fp.close() + tokens.append(np.array(data_list).reshape(-1, 1)) + os.system("rm -f ./real_time_save.temp_ids_rank_*") + tokens = np.concatenate(tokens, axis=1) + return tokens + + +def init_chat_template( + tokenizer: PretrainedTokenizer, model_name_or_path: str, chat_template_file: Optional[str] = None +): + """init chat template for the given tokenizer. + + If is None, it will not use `chat_template.json`; + If is equal with `model_name_or_path`, it will use the default loading; + If is directory, it will find the `chat_template.json` under the directory; + If is file, it will load it. + + Args: + tokenizer (PretrainedTokenizer): the instance of tokenizer + model_name_or_path (str): _description_ + chat_template_file (Optional[str], optional): _description_. Defaults to None. + """ + # 1. use the default chat_template file + if chat_template_file is None: + return + + if str(chat_template_file).lower() == "none": + # delete the chat_template from tokenizer if not use chat_template. 
+ # why do this: it will load the `chat_template.json` file by default + tokenizer.chat_template = None + return + + # it will load the `chat_template.json` file by default, so do nothing + if chat_template_file == model_name_or_path: + if tokenizer.chat_template is None: + logger.warning(f"there is not `chat_template.json` file in the `{model_name_or_path}`") + return + + if os.path.isdir(chat_template_file): + local_chat_template_file_path = os.path.join(chat_template_file, "chat_template.json") + if os.path.exists(local_chat_template_file_path): + chat_template_file = local_chat_template_file_path + else: + logger.warning(f"there is not `chat_template.json` file in the `{model_name_or_path}`") + return + + if not os.path.exists(chat_template_file): + logger.warning(f"there is not `chat_template.json` file from path<`{model_name_or_path}`>") + return + + logger.info(f"loading `chat_template.json` from `{chat_template_file}`") + tokenizer.init_chat_template(chat_template_file) + + +def get_model_max_position_embeddings(config: PretrainedConfig) -> Optional[int]: + names = [ + "max_position_embeddings", # most of models + "max_sequence_length", # GLM model + "seq_length", # llama model + ] + for name in names: + max_length = config.get(name, None) + if max_length is not None: + return max_length + return None + + +def read_res(model_name_or_path: str, tensor_queue: mp.Queue, result_queue: mp.Queue): + tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) + + paddle.device.set_device("cpu") + paddle.disable_static() + outputs = [] + output_tensor = tensor_queue.get(timeout=1) + + logger.info("Start read result message") + logger.info(f"Current path is {os.getcwd()}") + + from paddlenlp_ops import get_output + + while True: + get_output(output_tensor, 0, True) + if int(output_tensor[0, 0]) == -2: # read none + continue + bsz = int(output_tensor[1, 0]) + output_numpy = output_tensor[2 : bsz + 2].numpy() + output_numpy[output_numpy == -1] = tokenizer.eos_token_id + outputs.append(output_numpy) + if int(output_tensor[0, 0]) == -1: + break + output = np.concatenate(outputs, axis=1).tolist() + seqs = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False) + for i, (out, seq) in enumerate(zip(output, seqs)): + result_queue.put([i, out, seq]) + + logger.info("Finish read result message") + + +def get_rotary_position_embedding(position_ids, head_dim, rope_theta=10000.0, rope_scaling: dict = None): + """ + Pre-calculate rotary position embedding for position_ids. 
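+    If rope_scaling is given with rope_type "llama3", long-wavelength inverse frequencies
+    are divided by factor, short ones are kept, and those in between are smoothly
+    interpolated before the cos/sin tables are built (see the loop below).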
+ + Args: + position_ids: [1, S] + head_dim: D + + Returns: + rot_emb: [2, 1, S, 1, D], cos + sin + """ + bsz, max_seq_len = position_ids.shape[:2] + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, head_dim), dtype="float32") + inv_freq = rope_theta ** (-paddle.arange(0, head_dim, 2, dtype="float32") / head_dim) + + if rope_scaling is not None: + rope_type = rope_scaling.get("rope_type", None) + if rope_type is not None and rope_type == "llama3": + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + original_max_position_embeddings = rope_scaling.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = original_max_position_embeddings / low_freq_factor + high_freq_wavelen = original_max_position_embeddings / high_freq_factor + new_freqs = [] + for freq in inv_freq: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / factor + smooth * freq) + inv_freq = paddle.to_tensor(new_freqs, dtype=inv_freq.dtype) + + # shape: [B, S, D/2] + freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) + # shape: [B, S, 1, D] + emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, 1, head_dim)) + + rot_emb[0] = paddle.cos(emb) + rot_emb[1] = paddle.sin(emb) + return rot_emb + + +def init_dist_env(): + tensor_parallel_degree = paddle.distributed.get_world_size() + tensor_parallel_rank = paddle.distributed.get_rank() + + if tensor_parallel_degree > 1: + # refer to: https://github.com/PaddlePaddle/Paddle/blob/4abea956ee852ce52791a1e08fa92ed4d3be150d/python/paddle/distributed/fleet/fleet.py#L298C23-L298C45 + hcg = tp._HYBRID_PARALLEL_GROUP + if hcg is None: + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": tensor_parallel_degree, + "pp_degree": 1, + "sharding_degree": 1, + } + fleet.init(is_collective=True, strategy=strategy) + hcg = fleet.get_hybrid_communicate_group() + + tensor_parallel_rank = hcg.get_model_parallel_rank() + return tensor_parallel_rank, tensor_parallel_degree + + +def get_eos_token_id( + tokenizer: PretrainedTokenizer, generation_config: Optional[GenerationConfig] = None +) -> List[List[int]]: + """get eos_token_id from generation_config or tokenizer + + Returns: + List[int]: eos_token_id to stop the generation + """ + eos_token_ids = [] + if tokenizer.eos_token_id is not None: + eos_token_ids.append(tokenizer.eos_token_id) + + if generation_config is not None and generation_config.eos_token_id is not None: + if isinstance(generation_config.eos_token_id, int): + eos_token_ids.append(generation_config.eos_token_id) + else: + eos_token_ids.extend(generation_config.eos_token_id) + + eos_token_ids_dict = {str(item): item for item in eos_token_ids} + return list(eos_token_ids_dict.values()) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/log.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/log.py new file mode 100644 index 000000000..c887ead64 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/log.py @@ -0,0 +1,133 @@ +# coding:utf-8 +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import functools +import logging +import threading +import time + +import colorlog + +loggers = {} + +log_config = { + "DEBUG": {"level": 10, "color": "purple"}, + "INFO": {"level": 20, "color": "green"}, + "TRAIN": {"level": 21, "color": "cyan"}, + "EVAL": {"level": 22, "color": "blue"}, + "WARNING": {"level": 30, "color": "yellow"}, + "ERROR": {"level": 40, "color": "red"}, + "CRITICAL": {"level": 50, "color": "bold_red"}, +} + + +class Logger(object): + """ + Deafult logger in PaddleNLP + + Args: + name(str) : Logger name, default is 'PaddleNLP' + """ + + def __init__(self, name: str = None): + name = "PaddleNLP" if not name else name + self.logger = logging.getLogger(name) + + for key, conf in log_config.items(): + logging.addLevelName(conf["level"], key) + self.__dict__[key] = functools.partial(self.__call__, conf["level"]) + self.__dict__[key.lower()] = functools.partial(self.__call__, conf["level"]) + + self.format = colorlog.ColoredFormatter( + "%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s", + log_colors={key: conf["color"] for key, conf in log_config.items()}, + ) + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logLevel = "DEBUG" + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + self._is_enable = True + + def disable(self): + self._is_enable = False + + def enable(self): + self._is_enable = True + + def set_level(self, log_level: str): + assert log_level in log_config, f"Invalid log level. Choose among {log_config.keys()}" + self.logger.setLevel(log_level) + + @property + def is_enable(self) -> bool: + return self._is_enable + + def __call__(self, log_level: str, msg: str): + if not self.is_enable: + return + + self.logger.log(log_level, msg) + + @contextlib.contextmanager + def use_terminator(self, terminator: str): + old_terminator = self.handler.terminator + self.handler.terminator = terminator + yield + self.handler.terminator = old_terminator + + @contextlib.contextmanager + def processing(self, msg: str, interval: float = 0.1): + """ + Continuously print a progress bar with rotating special effects. + + Args: + msg(str): Message to be printed. + interval(float): Rotation interval. Default to 0.1. + """ + end = False + + def _printer(): + index = 0 + flags = ["\\", "|", "/", "-"] + while not end: + flag = flags[index % len(flags)] + with self.use_terminator("\r"): + self.info("{}: {}".format(msg, flag)) + time.sleep(interval) + index += 1 + + t = threading.Thread(target=_printer) + t.start() + yield + end = True + + @functools.lru_cache(None) + def warning_once(self, *args, **kwargs): + """ + This method is identical to `logger.warning()`, but will emit the warning with the same message only once + + Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the cache. 
+ The assumption here is that all warning messages are unique across the code. If they aren't then need to switch to + another type of cache that includes the caller frame information in the hashing function. + """ + self.warning(*args, **kwargs) + + +logger = Logger() diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/nested.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/nested.py new file mode 100644 index 000000000..4e8002318 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/nested.py @@ -0,0 +1,118 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import copy + +import paddle + +from paddlenlp.utils.log import logger + +TensorHolder = collections.namedtuple("TensorHolder", ["shape", "dtype", "name"]) + + +def nested_reduce_tensor(tensor): + if isinstance(tensor, dict): + # copy tensor since it will be inplace modified dict + tensor = copy.copy(tensor) + for key in list(tensor.keys()): + tensor[key] = nested_reduce_tensor(tensor[key]) + if isinstance(tensor, (tuple, list)): + return type(tensor)(nested_reduce_tensor(t) for t in tensor) + + if isinstance(tensor, paddle.Tensor): + return TensorHolder(tensor.shape, tensor.dtype, tensor.name) + + return tensor + + +def nested_empty_tensor(tensor): + if isinstance(tensor, dict): + for key in list(tensor.keys()): + tensor[key] = nested_empty_tensor(tensor[key]) + if isinstance(tensor, list): + return type(tensor)(nested_empty_tensor(t) for t in tensor) + + # TensorHolder is tuple + if isinstance(tensor, TensorHolder): + t = paddle.empty(tensor.shape, dtype=tensor.dtype, name=tensor.name) + t.name = tensor.name + return t + + return tensor + + +def nested_broadcast_tensor(tensor, src=0, group=None): + if isinstance(tensor, dict): + for key in list(tensor.keys()): + tensor[key] = nested_broadcast_tensor(tensor[key], src=src, group=group) + if isinstance(tensor, list): + return type(tensor)(nested_broadcast_tensor(t, src=src, group=group) for t in tensor) + + if isinstance(tensor, paddle.Tensor): + paddle.distributed.broadcast(tensor, src=src, group=group, sync_op=True) + return tensor + + +def nested_broadcast_tensor_with_empty(tensor, src=0, group=None): + # src should src rank in the group, not global rank. + process_rank = paddle.distributed.get_rank() + + if group is not None: + src_rank = group.ranks[src] + if process_rank == src_rank: + if tensor is None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} must have a state_dict. dp_rank:{process_rank}, src_rank:{src_rank}" + ) + fake_tensor = [nested_reduce_tensor(tensor)] + else: + if tensor is not None: + logger.warning( + f"Your local rank {paddle.distributed.get_rank()} are forbidden to have a state_dict. 
dp_rank:{process_rank}, src_rank:{src_rank}" + ) + fake_tensor = [None] + + paddle.distributed.broadcast_object_list( + fake_tensor, + src=src_rank, + group=group, + ) + fake_tensor = fake_tensor[0] + + if process_rank != src_rank: + tensor = nested_empty_tensor(fake_tensor) + + tensor = nested_broadcast_tensor(tensor, src=src_rank, group=group) + return tensor + + +def nested_copy(inputs): + if isinstance(inputs, dict): + outputs = {} + for key in list(inputs.keys()): + outputs[key] = nested_copy(inputs[key]) + return outputs + return inputs + + +def nested_copy_place(inputs, place=None, blocking=False): + if isinstance(inputs, dict): + outputs = {} + for key in list(inputs.keys()): + outputs[key] = nested_copy_place(inputs[key], place, blocking) + return outputs + if isinstance(inputs, paddle.Tensor): + inputs = inputs if inputs.place == place else inputs._copy_to(place, blocking) + return inputs diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/profiler.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/profiler.py new file mode 100644 index 000000000..9a6fa25b9 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/profiler.py @@ -0,0 +1,130 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys + +import paddle.profiler as profiler + +# A global variable to record the number of calling times for profiler +# functions. It is used to specify the tracing range of training steps. +_profiler_step_id = 0 + +# A global variable to avoid parsing from string every time. +_profiler_options = None +_prof = None + + +class ProfilerOptions(object): + """ + Use a string to initialize a ProfilerOptions. + The string should be in the format: "key1=value1;key2=value;key3=value3". + For example: + "profile_path=model.profile" + "batch_range=[50, 60]; profile_path=model.profile" + "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile" + + ProfilerOptions supports following key-value pair: + batch_range - a integer list, e.g. [100, 110]. + state - a string, the optional values are 'CPU', 'GPU' or 'All'. + sorted_key - a string, the optional values are 'calls', 'total', + 'max', 'min' or 'ave. + tracer_option - a string, the optional values are 'Default', 'OpDetail', + 'AllOpDetail'. + profile_path - a string, the path to save the serialized profile data, + which can be used to generate a timeline. + exit_on_finished - a boolean. + record_shapes - a boolean. 
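+      timer_only - a boolean. If True, only the model's throughput and time overhead
+                   are recorded; if False, detailed performance data is collected as well.
+      For example: "batch_range=[10, 20]; timer_only=False; record_shapes=True; profile_path=/tmp/profile"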
+ """ + + def __init__(self, options_str): + assert isinstance(options_str, str) + + self._options = { + "batch_range": [10, 20], + "state": "All", + "sorted_key": "total", + "tracer_option": "Default", + "profile_path": "/tmp/profile", + "exit_on_finished": True, + "timer_only": True, + "record_shapes": False, + } + self._parse_from_string(options_str) + + def _parse_from_string(self, options_str): + for kv in options_str.replace(" ", "").split(";"): + key, value = kv.split("=") + if key == "batch_range": + value_list = value.replace("[", "").replace("]", "").split(",") + value_list = list(map(int, value_list)) + if len(value_list) >= 2 and value_list[0] >= 0 and value_list[1] > value_list[0]: + self._options[key] = value_list + elif key == "exit_on_finished": + self._options[key] = value.lower() in ("yes", "true", "t", "1") + elif key in ["state", "sorted_key", "tracer_option", "profile_path"]: + self._options[key] = value + elif key == "timer_only": + self._options[key] = value + elif key == "record_shapes": + self._options[key] = value + + def __getitem__(self, name): + if self._options.get(name, None) is None: + raise ValueError("ProfilerOptions does not have an option named %s." % name) + return self._options[name] + + +def add_profiler_step(options_str=None): + """ + Enable the operator-level timing using PaddlePaddle's profiler. + The profiler uses a independent variable to count the profiler steps. + One call of this function is treated as a profiler step. + Args: + profiler_options - a string to initialize the ProfilerOptions. + Default is None, and the profiler is disabled. + """ + if options_str is None: + return + + global _prof + global _profiler_step_id + global _profiler_options + + if _profiler_options is None: + _profiler_options = ProfilerOptions(options_str) + # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan + # timer_only = True only the model's throughput and time overhead are displayed + # timer_only = False calling summary can print a statistical form that presents performance data from different perspectives. + # timer_only = False the output Timeline information can be found in the profiler_log directory + if _prof is None: + _timer_only = str(_profiler_options["timer_only"]) == str(True) + _record_shapes = str(_profiler_options["record_shapes"]) == str(True) + _prof = profiler.Profiler( + scheduler=(_profiler_options["batch_range"][0], _profiler_options["batch_range"][1]), + on_trace_ready=profiler.export_chrome_tracing(_profiler_options["profile_path"]), + timer_only=_timer_only, + record_shapes=_record_shapes, + ) + _prof.start() + else: + _prof.step() + + if _profiler_step_id == _profiler_options["batch_range"][1]: + _prof.stop() + _prof.summary(op_detail=True, thread_sep=False, time_unit="ms") + _prof = None + if _profiler_options["exit_on_finished"]: + sys.exit(0) + + _profiler_step_id += 1 diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/safetensors.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/safetensors.py new file mode 100644 index 000000000..54256023d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/safetensors.py @@ -0,0 +1,312 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import json +import mmap +from collections import OrderedDict + +import numpy as np + +__all__ = [ + "fast_safe_open", + "fast_load_file", +] + + +MAX_HEADER_SIZE = 100 * 1000 * 1000 + +dtype_size = { + "BOOL": 1, + "U8": 1, + "I8": 1, + "F8_E5M2": 1, + "F8_E4M3": 1, + "I16": 2, + "U16": 2, + "I32": 4, + "U32": 4, + "I64": 8, + "U64": 8, + "F16": 2, + "BF16": 2, + "F32": 4, + "F64": 8, +} + +numpy_dtype = { + "BOOL": np.bool_, + "U8": np.uint8, + "I8": np.int8, + "F8_E5M2": 1, # no fp8 + "F8_E4M3": 1, # no fp8 + "I16": np.int16, + "U16": np.uint16, + "I32": np.int32, + "U32": np.uint32, + "I64": np.int64, + "U64": np.uint64, + "F16": np.float16, + "BF16": 2, # no bf16 + "F32": np.float32, + "F64": np.float64, +} + + +def getSize(fileobject): + fileobject.seek(0, 2) # move the cursor to the end of the file + size = fileobject.tell() + fileobject.seek(0) # move the cursor to the start of the file + return size + + +def metadata_validate(metadata): + start = 0 + for key, info in metadata.items(): + s, e = info["data_offsets"] + if s != start or e < s: + raise ValueError(f"SafeTensorError::InvalidOffset({key})") + start = e + nelements = np.prod(info["shape"]) + nbytes = nelements * dtype_size[info["dtype"]] + if (e - s) != nbytes: + raise ValueError("SafeTensorError::TensorInvalidInfo") + return start + + +def read_metadata(buffer): + buffer_len = getSize(buffer) + if buffer_len < 8: + raise ValueError("SafeTensorError::HeaderTooSmall") + + n = np.frombuffer(buffer.read(8), dtype=np.uint64).item() + if n > MAX_HEADER_SIZE: + raise ValueError("SafeTensorError::HeaderTooLarge") + + stop = n + 8 + if stop > buffer_len: + raise ValueError("SafeTensorError::InvalidHeaderLength") + + tensors = json.loads(buffer.read(n), object_pairs_hook=OrderedDict) + metadata = tensors.pop("__metadata__", None) + buffer_end = metadata_validate(tensors) + + if buffer_end + 8 + n != buffer_len: + raise ValueError("SafeTensorError::MetadataIncompleteBuffer") + + return stop, tensors, metadata + + +def readinto_numpy(meta, buffer, base_ptr): + def create_empty(info): + return np.empty(shape=info["shape"], dtype=numpy_dtype[info["dtype"]]) + + ret = {} + for k, v in meta.items(): + t = create_empty(v) + buffer.seek(base_ptr + v["data_offsets"][0]) + buffer.readinto(memoryview(t)) + ret[k] = t + return ret + + +class PySafeSlice: + def __init__(self, info, bufferfile, base_ptr, buffermmap): + self.info = info + self.bufferfile = bufferfile + self.buffermmap = buffermmap + self.base_ptr = base_ptr + + self.start = [0 for dim in self.shape] + self.stop = [dim for dim in self.shape] + self.step = [1 for dim in self.shape] + + @property + def ndim(self): + return len(self.shape) + + def __getitem__(self, index): + # https://github.com/numpy/numpy/blob/4d652465cea38e9504f954ac708d91e4954bd13a/numpy/lib/_arrayterator_impl.py#L96-L126 + # Fix index, handling ellipsis and incomplete slices. 
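+        # For example, on a tensor of shape [4, 8], t[1] is normalized to
+        # (slice(1, 2, 1), slice(None)) and t[..., 2:5] to (slice(None), slice(2, 5, None))
+        # before the byte ranges to read from the file/mmap are computed.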
+ if not isinstance(index, tuple): + index = (index,) + fixed = [] + length, dims = len(index), self.ndim + for slice_ in index: + if slice_ is Ellipsis: + fixed.extend([slice(None)] * (dims - length + 1)) + length = len(fixed) + elif isinstance(slice_, int): + fixed.append(slice(slice_, slice_ + 1, 1)) + else: + fixed.append(slice_) + index = tuple(fixed) + if len(index) < dims: + index += (slice(None),) * (dims - len(index)) + + out_start, out_stop, out_step = copy.deepcopy((self.start, self.stop, self.step)) + for i, (start, stop, step, slice_) in enumerate(zip(self.start, self.stop, self.step, index)): + out_start[i] = slice_.start if slice_.start is not None else 0 + out_step[i] = slice_.step if slice_.step is not None else 1 + out_stop[i] = slice_.stop if slice_.stop is not None else stop - start + out_stop[i] = min(stop, out_stop[i]) + + target_shape = [] + for x, y, z, sli in zip(out_start, out_stop, out_step, index): + assert z == 1, "only support step = 1" + if y - x > 1 or sli.step is None: + target_shape.append(max(int(y - x), 0)) + + if len(target_shape) == 0: + if self.shape == [1]: + target_shape = self.shape + + # https://github.com/huggingface/safetensors/blob/b947b59079a6197d7930dfb535818ac4896113e8/safetensors/src/slice.rs#L297-L315 + indices = [] + span = self.bits + for i, (start, stop, step) in enumerate(zip(out_start[::-1], out_stop[::-1], out_step[::-1])): + if len(indices) == 0: + if start == 0 and stop == self.shape[::-1][i]: + pass + # We haven't started to slice yet, just increase the span + else: + offset = start * span + small_span = stop * span - offset + indices.append((offset, offset + small_span)) + + else: + capacity = (stop - start) * len(indices) + newindices = [] + for n in range(start, stop): + offset = n * span + for (old_start, old_stop) in indices: + newindices.append((old_start + offset, old_stop + offset)) + indices = newindices + assert len(indices) == capacity, f"error {capacity} {len(indices)}" + span *= self.shape[::-1][i] + + if len(indices) == 0: + indices.append((0, self.nbytes)) + + merge_indices = [] + last_end = -1 + last_start = -1 + for start, end in indices: + if start == last_end: + last_end = end + continue + else: + if last_start != -1: + merge_indices.append((last_start, last_end)) + last_start = start + last_end = end + if last_start != -1: + merge_indices.append((last_start, last_end)) + tensor = np.empty(shape=[1] if len(target_shape) == 0 else np.prod(target_shape), dtype=self.dtype) + + tensor_view = memoryview(tensor.view(np.uint8).reshape(-1)) + curr_data_ptr = 0 + # if to many slice and each slice < 1M + if len(merge_indices) > 128 and (merge_indices[0][1] - merge_indices[0][0] < 1024 * 1024): + # Use mmap for random access + for start, end in merge_indices: + data_len = end - start + tensor_view[curr_data_ptr : curr_data_ptr + data_len] = self.buffermmap[ + self.start_offset + start : self.start_offset + end + ] + curr_data_ptr += data_len + else: + # Use file read for sequence access + for start, end in merge_indices: + data_len = end - start + self.bufferfile.seek(self.start_offset + start) + view = tensor_view[curr_data_ptr : curr_data_ptr + data_len] + self.bufferfile.readinto(view) + curr_data_ptr += data_len + + return tensor.reshape(target_shape) + + def get(self, *args, **kwargs): + tensor = np.empty(shape=self.shape, dtype=self.dtype) + self.bufferfile.seek(self.start_offset) + self.bufferfile.readinto(memoryview(tensor)) + return tensor + + @property + def start_offset(self): + return self.base_ptr + 
self.info["data_offsets"][0] + + def get_shape(self): + return self.shape + + @property + def shape(self): + return self.info["shape"] + + @property + def dtype(self): + return numpy_dtype[self.info["dtype"]] + + @property + def nelements(self): + return np.prod(self.info["shape"]) + + @property + def bits(self): + return dtype_size[self.info["dtype"]] + + @property + def nbytes(self): + return self.nelements * dtype_size[self.info["dtype"]] + + +# a simple file writer object +class fast_safe_open: + def __init__(self, filename, framework=None, device="cpu"): + self.filename = filename + self.framework = framework + self.file = open(self.filename, "rb") + self.file_mmap = mmap.mmap(self.file.fileno(), 0, flags=mmap.MAP_PRIVATE) + self.base, self.tensors_decs, self.__metadata__ = read_metadata(self.file) + self.tensors = OrderedDict() + for key, info in self.tensors_decs.items(): + self.tensors[key] = PySafeSlice(info, self.file, self.base, self.file_mmap) + self.tensors[key].key = key + + def __enter__(self): + return self + + def __exit__(self, *args): + self.file_mmap.close() + self.file.close() + + def metadata(self): + return self.__metadata__ + + def keys(self): + return list(self.tensors.keys()) + + def get_tensor(self, name): + return self.tensors[name].get() + + def get_slice(self, name): + return self.tensors[name] + + +def fast_load_file(filename): + result = {} + with fast_safe_open(filename, framework="np") as f: + for k in f.keys(): + result[k] = f.get_tensor(k) + return result diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/serialization.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/serialization.py new file mode 100644 index 000000000..9b467ec14 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/serialization.py @@ -0,0 +1,253 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import io +import os +import pickle +from functools import lru_cache +from typing import Union +from zipfile import ZipFile + +import numpy as np +import paddle +from _io import BufferedReader +from safetensors import deserialize + +from paddlenlp.utils.env import PYTORCH_WEIGHTS_NAME, SAFE_WEIGHTS_NAME + +MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30 + +_TYPES = { + "F64": np.float64, + "F32": np.float32, + "F16": np.float16, + "I64": np.int64, + "U64": np.uint64, + "I32": np.int32, + "U32": np.uint32, + "I16": np.int16, + "U16": np.uint16, + "BF16": np.uint16, + "I8": np.int8, + "U8": np.uint8, + "BOOL": bool, +} + + +class SerializationError(Exception): + """Exception for serialization""" + + pass + + +def seek_by_string(file_handler: BufferedReader, string: str, file_size: int) -> int: + """seek the index of file-handler with target words + Args: + file_handler (BufferedReader): file handler + string (str): the specific string in the file + file_size (int): size of file + Returns: + int: end index of target string + """ + word_index = 0 + word_bytes = string.encode("latin") + empty_byte = "".encode("latin") + + while word_index < len(string) and file_handler.tell() < file_size: + content = file_handler.read(1) + if content == empty_byte: + break + + if word_bytes[word_index] == content[0]: + word_index += 1 + else: + word_index = 0 + + if file_handler.tell() >= file_size - 1: + raise SerializationError(f"can't find the find the target string<{string}> in the file") + return file_handler.tell() + + +def read_prefix_key(path): + file_size = os.stat(path).st_size + with open(path, "rb") as file_handler: + end_index = seek_by_string(file_handler, "data.pkl", file_size) + file_handler.seek(MZ_ZIP_LOCAL_DIR_HEADER_SIZE) + prefix_key = file_handler.read(end_index - MZ_ZIP_LOCAL_DIR_HEADER_SIZE - len("/data.pkl")) + return prefix_key.decode("latin") + + +def _maybe_decode_ascii(bytes_str: Union[bytes, str]) -> str: + if isinstance(bytes_str, bytes): + return bytes_str.decode("ascii") + return bytes_str + + +@lru_cache(maxsize=None) +def _storage_type_to_dtype_to_map(): + """convert storage type to numpy dtype""" + return { + "DoubleStorage": np.double, + "FloatStorage": np.float32, + "HalfStorage": np.half, + "LongStorage": np.int64, + "IntStorage": np.int32, + "ShortStorage": np.int16, + "CharStorage": np.int8, + "ByteStorage": np.uint8, + "BoolStorage": np.bool_, + "ComplexDoubleStorage": np.cdouble, + "ComplexFloatStorage": np.cfloat, + "BFloat16Storage": np.uint16, # support bf16 + } + + +class StorageType: + """Temp Class for Storage Type""" + + def __init__(self, name): + self.dtype = _storage_type_to_dtype_to_map()[name] + + def __str__(self): + return f"StorageType(dtype={self.dtype})" + + +def _element_size(dtype: str) -> int: + """ + Returns the element size for a dtype, in bytes + """ + if dtype in [np.float16, np.float32, np.float64]: + return np.finfo(dtype).bits >> 3 + elif dtype == np.bool_: + return 1 + else: + return np.iinfo(dtype).bits >> 3 + + +class UnpicklerWrapperStage(pickle.Unpickler): + def find_class(self, mod_name, name): + if type(name) is str and "Storage" in name: + try: + return StorageType(name) + except KeyError: + pass + + if mod_name == "torch._utils": + # rebuild torch.nn.Papameter + if name == "_rebuild_parameter": + return _rebuild_parameter + # rebuild torch.nn.Papameter with state + if name == "_rebuild_parameter_with_state": + return _rebuild_parameter_with_state + # rebuild torch.Tensor + return _rebuild_tensor_stage + + # 
pytorch_lightning tensor builder + if "pytorch_lightning" in mod_name: + return dumpy + return super().find_class(mod_name, name) + + +def _rebuild_tensor_stage(storage, storage_offset, size, stride, requires_grad, backward_hooks): + # if a tensor has shape [M, N] and stride is [1, N], it's column-wise / fortran-style + # if a tensor has shape [M, N] and stride is [M, 1], it's row-wise / C-style + # defautls to C-style + if stride is not None and len(stride) > 1 and stride[0] == 1 and stride[1] > 1: + order = "F" + else: + order = "C" + + # fix bug when load https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth + numel = int(np.prod(size)) + return storage[storage_offset : storage_offset + numel].reshape(size, order=order) + + +def _rebuild_parameter(data, requires_grad, backward_hooks): + return data + + +def _rebuild_parameter_with_state(data, requires_grad, backward_hooks, state): + return data + + +def dumpy(*args, **kwarsg): + return None + + +def load_torch(path: str, **pickle_load_args): + """ + load torch weight file with the following steps: + 1. load the structure of pytorch weight file + 2. read the tensor data and re-construct the state-dict + Args: + path: the path of pytorch weight file + **pickle_load_args: args of pickle module + Returns: + """ + + if path.endswith(PYTORCH_WEIGHTS_NAME) or os.path.split(path)[-1].startswith("pytorch_model-"): + pickle_load_args.update({"encoding": "utf-8"}) + + prefix_key = read_prefix_key(path) + + torch_zip = ZipFile(path, "r") + loaded_storages = {} + + def load_tensor(dtype, numel, key, location): + name = f"{prefix_key}/data/{key}" + typed_storage = np.frombuffer(torch_zip.open(name).read()[:numel], dtype=dtype) + return typed_storage + + def persistent_load(saved_id): + assert isinstance(saved_id, tuple) + typename = _maybe_decode_ascii(saved_id[0]) + data = saved_id[1:] + + assert ( + typename == "storage" + ), f"Unknown typename for persistent_load, expected 'storage' but got '{typename}'" + storage_type, key, location, numel = data + dtype = storage_type.dtype + + if key in loaded_storages: + typed_storage = loaded_storages[key] + else: + nbytes = numel * _element_size(dtype) + typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location)) + loaded_storages[key] = typed_storage + + return typed_storage + + data_iostream = torch_zip.open(f"{prefix_key}/data.pkl").read() + unpickler_stage = UnpicklerWrapperStage(io.BytesIO(data_iostream), **pickle_load_args) + unpickler_stage.persistent_load = persistent_load + state_dict = unpickler_stage.load() + torch_zip.close() + elif path.endswith(SAFE_WEIGHTS_NAME) or os.path.split(path)[-1].startswith("model-"): + # torch safetensors -> numpy -> paddle.Tensor + with open(path, "rb") as f: + data = f.read() + + flat = deserialize(data) + state_dict = {} + for k, v in flat: + dtype = _TYPES[v["dtype"]] + if v["dtype"] == "BF16": + arr = paddle.to_tensor(np.frombuffer(v["data"], dtype=dtype).reshape(v["shape"]), dtype="bfloat16") + else: + arr = paddle.to_tensor(np.frombuffer(v["data"], dtype=dtype).reshape(v["shape"])) + state_dict[k] = arr + + return state_dict diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/tools.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/tools.py new file mode 100644 index 000000000..8f7b90f15 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/utils/tools.py @@ -0,0 +1,839 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os +import random + +import numpy as np +import paddle +from tqdm import tqdm + +from .doc_parser import DocParser +from .log import logger + + +def static_params_to_dygraph(model, static_tensor_dict): + """Simple tool for convert static paramters to dygraph paramters dict. + + **NOTE** The model must both support static graph and dygraph mode. + + Args: + model (nn.Layer): the model of a neural network. + static_tensor_dict (string): path of which locate the saved paramters in static mode. + Usualy load by `paddle.static.load_program_state`. + + Returns: + [tensor dict]: a state dict the same as the dygraph mode. + """ + state_dict = model.state_dict() + # static_tensor_dict = paddle.static.load_program_state(static_params_path) + + ret_dict = dict() + for n, p in state_dict.items(): + if p.name not in static_tensor_dict: + logger.info("%s paramter is missing from you state dict." % n) + continue + ret_dict[n] = static_tensor_dict[p.name] + + return ret_dict + + +def dygraph_params_to_static(model, dygraph_tensor_dict, topo=None): + """Simple tool for convert dygraph paramters to static paramters dict. + + **NOTE** The model must both support static graph and dygraph mode. + + Args: + model (nn.Layer): the model of a neural network. + dygraph_tensor_dict (string): path of which locate the saved paramters in static mode. + + Returns: + [tensor dict]: a state dict the same as the dygraph mode. + """ + state_dict = model.state_dict() + + ret_dict = dict() + for name, parm in state_dict.items(): + if name not in dygraph_tensor_dict: + logger.info("%s paramter is missing from you state dict." % name) + continue + + tensor = dygraph_tensor_dict[name] + if parm.is_distributed: + assert topo is not None + for dim, v in enumerate(tensor.shape): + if parm.shape[dim] != v: + break + + splited = np.split(tensor, topo.mp_info.size, axis=dim)[topo.mp_info.rank] + ret_dict[parm.name] = splited + else: + ret_dict[parm.name] = tensor + + return ret_dict + + +class TimeCostAverage(object): + """ + Simple tool for calcluating time average cost in the process of training and inferencing. + """ + + def __init__(self): + self.reset() + + def reset(self): + """ + Reset the recoder state, and reset the `cnt` to zero. + """ + self.cnt = 0 + self.total_time = 0 + + def record(self, usetime): + """ + Recoding the time cost in current step and accumulating the `cnt`. + """ + self.cnt += 1 + self.total_time += usetime + + def get_average(self): + """ + Returning the average time cost after the start of training. + """ + if self.cnt == 0: + return 0 + return self.total_time / self.cnt + + +def get_env_device(): + """ + Return the device name of running environment. 
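+    One of "gpu", "npu", "gcu", "rocm", "xpu" or "cpu", checked in that order.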
+ """ + if paddle.is_compiled_with_cuda(): + return "gpu" + elif "npu" in paddle.device.get_all_custom_device_type(): + return "npu" + elif "gcu" in paddle.device.get_all_custom_device_type(): + return "gcu" + elif paddle.is_compiled_with_rocm(): + return "rocm" + elif paddle.is_compiled_with_xpu(): + return "xpu" + return "cpu" + + +def compare_version(version, pair_version): + """ + Args: + version (str): The first version string needed to be compared. + The format of version string should be as follow : "xxx.yyy.zzz". + pair_version (str): The second version string needed to be compared. + The format of version string should be as follow : "xxx.yyy.zzz". + Returns: + int: The result of comparasion. 1 means version > pair_version; 0 means + version = pair_version; -1 means version < pair_version. + + Examples: + >>> compare_version("2.2.1", "2.2.0") + >>> 1 + >>> compare_version("2.2.0", "2.2.0") + >>> 0 + >>> compare_version("2.2.0-rc0", "2.2.0") + >>> -1 + >>> compare_version("2.3.0-rc0", "2.2.0") + >>> 1 + """ + version = version.strip() + pair_version = pair_version.strip() + if version == pair_version: + return 0 + version_list = version.split(".") + pair_version_list = pair_version.split(".") + for version_code, pair_version_code in zip(version_list, pair_version_list): + if not version_code.isnumeric(): + return -1 + if not pair_version_code.isnumeric(): + return 1 + if int(version_code) > int(pair_version_code): + return 1 + elif int(version_code) < int(pair_version_code): + return -1 + return 0 + + +def get_bool_ids_greater_than(probs, limit=0.5, return_prob=False): + """ + Get idx of the last dimension in probability arrays, which is greater than a limitation. + + Args: + probs (List[List[float]]): The input probability arrays. + limit (float): The limitation for probability. + return_prob (bool): Whether to return the probability + Returns: + List[List[int]]: The index of the last dimension meet the conditions. + """ + probs = np.array(probs) + dim_len = len(probs.shape) + if dim_len > 1: + result = [] + for p in probs: + result.append(get_bool_ids_greater_than(p, limit, return_prob)) + return result + else: + result = [] + for i, p in enumerate(probs): + if p > limit: + if return_prob: + result.append((i, p)) + else: + result.append(i) + return result + + +def get_span(start_ids, end_ids, with_prob=False): + """ + Get span set from position start and end list. + + Args: + start_ids (List[int]/List[tuple]): The start index list. + end_ids (List[int]/List[tuple]): The end index list. + with_prob (bool): If True, each element for start_ids and end_ids is a tuple aslike: (index, probability). + Returns: + set: The span set without overlapping, every id can only be used once . 
+ """ + if with_prob: + start_ids = sorted(start_ids, key=lambda x: x[0]) + end_ids = sorted(end_ids, key=lambda x: x[0]) + else: + start_ids = sorted(start_ids) + end_ids = sorted(end_ids) + + start_pointer = 0 + end_pointer = 0 + len_start = len(start_ids) + len_end = len(end_ids) + couple_dict = {} + while start_pointer < len_start and end_pointer < len_end: + if with_prob: + start_id = start_ids[start_pointer][0] + end_id = end_ids[end_pointer][0] + else: + start_id = start_ids[start_pointer] + end_id = end_ids[end_pointer] + + if start_id == end_id: + couple_dict[end_ids[end_pointer]] = start_ids[start_pointer] + start_pointer += 1 + end_pointer += 1 + continue + if start_id < end_id: + couple_dict[end_ids[end_pointer]] = start_ids[start_pointer] + start_pointer += 1 + continue + if start_id > end_id: + end_pointer += 1 + continue + result = [(couple_dict[end], end) for end in couple_dict] + result = set(result) + return result + + +class DataConverter(object): + """DataConverter to convert data export from annotation platform""" + + def __init__( + self, + label_studio_file, + negative_ratio=5, + prompt_prefix="情感倾向", + options=["正向", "负向"], + separator="##", + layout_analysis=False, + expand_to_a4_size=True, + schema_lang="ch", + ocr_lang="en", + anno_type="text", + ): + """Init Data Converter""" + self.negative_ratio = negative_ratio + self.prompt_prefix = prompt_prefix + self.options = options + self.separator = separator + self.layout_analysis = layout_analysis + self.expand_to_a4_size = expand_to_a4_size + self.schema_lang = schema_lang + self.ocr_lang = ocr_lang + self.anno_type = anno_type + self.label_studio_file = label_studio_file + self.ignore_list = ["属性值", "object"] + + def process_text_tag(self, line, task_type="ext"): + items = {} + items["text"] = line["data"]["text"] + if task_type == "ext": + items["entities"] = [] + items["relations"] = [] + result_list = line["annotations"][0]["result"] + for a in result_list: + if a["type"] == "labels": + items["entities"].append( + { + "id": a["id"], + "start_offset": a["value"]["start"], + "end_offset": a["value"]["end"], + "label": a["value"]["labels"][0], + } + ) + else: + items["relations"].append( + { + "id": a["from_id"] + "-" + a["to_id"], + "from_id": a["from_id"], + "to_id": a["to_id"], + "type": a["labels"][0], + } + ) + elif task_type == "cls": + items["label"] = line["annotations"][0]["result"][0]["value"]["choices"] + return items + + def process_image_tag(self, line, task_type="ext"): + def _io1(box1, box2): + """calc intersection over box1 area""" + x1 = max(box1[0], box2[0]) + y1 = max(box1[1], box2[1]) + x2 = min(box1[2], box2[2]) + y2 = min(box1[3], box2[3]) + if x2 <= x1 or y2 <= y1: + return 0.0 + box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1]) + return (x2 - x1) * (y2 - y1) * 1.0 / box1_area + + def _find_segment_in_box(layouts, box, threshold=0.7): + positions = [] + global_offset = 0 + for segment in layouts: + sbox = segment[0] + text_len = len(segment[1]) + if text_len == 0: + continue + if len(segment) == 2 or (len(segment) == 3 and segment[2] != "table"): + char_w = (sbox[2] - sbox[0]) * 1.0 / text_len + for i in range(text_len): + cbox = [sbox[0] + i * char_w, sbox[1], sbox[0] + (i + 1) * char_w, sbox[3]] + c_covered = _io1(cbox, box) + if c_covered >= threshold: + positions.append(global_offset) + elif ( + cbox[2] == min(cbox[2], box[2]) + and cbox[0] == max(cbox[0], box[0]) + and cbox[1] < box[1] + and cbox[3] > box[3] + ): + if c_covered > 0.5: + positions.append(global_offset) + 
global_offset += 1 + else: + cell_covered = _io1(box, sbox) + if cell_covered >= threshold: + for i in range(text_len): + positions.append(global_offset) + global_offset += 1 + else: + global_offset += text_len + + offsets = [] + if not positions: + return offsets + spos = positions[0] + for i in range(1, len(positions)): + if positions[i] != positions[i - 1] + 1: + offsets.append((spos, positions[i - 1] + 1)) + spos = positions[i] + offsets.append((spos, positions[-1] + 1)) + return offsets + + items = {} + img_file = os.path.basename(line["data"]["image"]) + p = img_file.find("-") + img_file = img_file[p + 1 :] + + # Get file path for adapting to windows + file_dir = os.path.dirname(self.label_studio_file) + # Get image file path + img_path = os.path.join(file_dir, "images", img_file) + + if not os.path.exists(img_path): + logger.warning("Image file %s not exist in %s" % (img_file, os.path.join(file_dir, "images"))) + return None + logger.info("Parsing image file %s ..." % (img_file)) + doc_parser = DocParser(layout_analysis=self.layout_analysis, ocr_lang=self.ocr_lang) + + parsed_doc = doc_parser.parse({"doc": img_path}) + img_w, img_h = parsed_doc["img_w"], parsed_doc["img_h"] + + text = "" + bbox = [] + for segment in parsed_doc["layout"]: + box = doc_parser._normalize_box(segment[0], [img_w, img_h], [1000, 1000]) + text += segment[1] + bbox.extend([box] * len(segment[1])) + assert len(text) == len(bbox), "len of text is not equal to len of bbox" + items["text"] = text + items["bbox"] = bbox + items["image"] = parsed_doc["image"] + if task_type == "ext": + items["entities"] = [] + items["relations"] = [] + + result_list = line["annotations"][0]["result"] + ent_ids = [] + for e in result_list: + if e["type"] != "rectanglelabels": + continue + assert img_w == e["original_width"] and img_h == e["original_height"], "Image size not match" + box = [ + e["value"]["x"] * 0.01 * img_w, + e["value"]["y"] * 0.01 * img_h, + (e["value"]["x"] + e["value"]["width"]) * 0.01 * img_w, + (e["value"]["y"] + e["value"]["height"]) * 0.01 * img_h, + ] + offsets = _find_segment_in_box(parsed_doc["layout"], box) + if len(offsets) > 0: + items["entities"].append( + { + "id": e["id"], + "start_offset": offsets[0][0], + "end_offset": offsets[0][1], + "label": e["value"]["rectanglelabels"][0], + } + ) + ent_ids.append(e["id"]) + for r in result_list: + if r["type"] != "relation": + continue + if r["from_id"] in ent_ids and r["to_id"] in ent_ids: + items["relations"].append( + { + "id": r["from_id"] + "-" + r["to_id"], + "from_id": r["from_id"], + "to_id": r["to_id"], + "type": r["labels"][0], + } + ) + else: + items["label"] = line["annotations"][0]["result"][0]["value"]["choices"] + return items + + def convert_cls_examples(self, raw_examples): + """ + Convert labeled data for classification task. 
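+        Each returned example is a dict with "content", "result_list" and "prompt" keys
+        (plus "image" and "bbox" for image annotations), as built by generate_cls_example.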
+ """ + examples = [] + logger.info("Converting annotation data...") + with tqdm(total=len(raw_examples)): + for line in raw_examples: + if self.anno_type == "text": + items = self.process_text_tag(line, task_type="cls") + image, bbox = None, None + elif self.anno_type == "image": + items = self.process_image_tag(line, task_type="cls") + if items is None: + continue + image, bbox = items["image"], items["bbox"] + else: + raise ValueError("The type of annotation should be text or image") + text, labels = items["text"], items["label"] + example = self.generate_cls_example(text, labels, self.prompt_prefix, self.options, image, bbox) + examples.append(example) + return examples + + def convert_ext_examples(self, raw_examples, is_train=True): + """ + Convert labeled data for extraction task. + """ + + def _sep_cls_label(label, separator): + label_list = label.split(separator) + if len(label_list) == 1: + return label_list[0], None + return label_list[0], label_list[1:] + + texts = [] + # {"content": "", "result_list": [], "prompt": "X"} + entity_examples = [] + # {"content": "", "result_list": [], "prompt": "X的Y"} + relation_examples = [] + # {"content": "", "result_list": [], "prompt": "X的情感倾向[正向,负向]"} + entity_cls_examples = [] + + # Entity label set: ["时间", "地点", ... ] + entity_label_set = [] + # Entity name set: ["2月8日上午", "北京", ... ] + entity_name_set = [] + # Predicate set: ["歌手", "所属专辑", ... ] + predicate_set = [] + + # List[List[str]] + # List of entity prompt for each example + entity_prompt_list = [] + # List of relation prompt for each example + relation_prompt_list = [] + # Golden subject label for each example + subject_golden_list = [] + # List of inverse relation for each example + inverse_relation_list = [] + # List of predicate for each example + predicate_list = [] + + if self.anno_type == "text": + images, bbox_list = None, None + else: + images, bbox_list = [], [] + + logger.info("Converting annotation data...") + with tqdm(total=len(raw_examples)) as pbar: + for line in raw_examples: + + if self.anno_type == "text": + items = self.process_text_tag(line, task_type="ext") + image, bbox = None, None + elif self.anno_type == "image": + items = self.process_image_tag(line, task_type="ext") + if items is None: + continue + image, bbox = items["image"], items["bbox"] + images.append(image) + bbox_list.append(bbox) + else: + raise ValueError("The type of annotation should be text or image") + + text, relations, entities = items["text"], items["relations"], items["entities"] + texts.append(text) + + entity_example = [] + entity_prompt = [] + entity_example_map = {} + entity_map = {} # id to entity name + for entity in entities: + entity_name = text[entity["start_offset"] : entity["end_offset"]] + entity_map[entity["id"]] = { + "name": entity_name, + "start": entity["start_offset"], + "end": entity["end_offset"], + } + if entity["label"] in self.ignore_list: + continue + + entity_label, entity_cls_label = _sep_cls_label(entity["label"], self.separator) + + # Define the prompt prefix for entity-level classification + # xxx + "的" + 情感倾向 -> Chinese + # Sentiment classification + " of " + xxx -> English + if self.schema_lang == "ch": + entity_cls_prompt_prefix = entity_name + "的" + self.prompt_prefix + else: + entity_cls_prompt_prefix = self.prompt_prefix + " of " + entity_name + if entity_cls_label is not None: + entity_cls_example = self.generate_cls_example( + text, entity_cls_label, entity_cls_prompt_prefix, self.options, image, bbox + ) + + 
entity_cls_examples.append(entity_cls_example) + + result = {"text": entity_name, "start": entity["start_offset"], "end": entity["end_offset"]} + if entity_label not in entity_example_map.keys(): + entity_example_map[entity_label] = { + "content": text, + "result_list": [result], + "prompt": entity_label, + } + if self.anno_type == "image": + entity_example_map[entity_label]["image"] = image + entity_example_map[entity_label]["bbox"] = bbox + else: + entity_example_map[entity_label]["result_list"].append(result) + + if entity_label not in entity_label_set and entity_label != "观点词": + entity_label_set.append(entity_label) + if entity_name not in entity_name_set: + entity_name_set.append(entity_name) + entity_prompt.append(entity_label) + + for v in entity_example_map.values(): + entity_example.append(v) + + entity_examples.append(entity_example) + entity_prompt_list.append(entity_prompt) + + subject_golden = [] # Golden entity inputs + relation_example = [] + relation_prompt = [] + relation_example_map = {} + inverse_relation = [] + predicates = [] + for relation in relations: + predicate = relation["type"] + subject_id = relation["from_id"] + object_id = relation["to_id"] + # The relation prompt is constructed as follows: + # subject + "的" + predicate -> Chinese + # predicate + " of " + subject -> English + if self.schema_lang == "ch": + prompt = entity_map[subject_id]["name"] + "的" + predicate + inverse_negative = entity_map[object_id]["name"] + "的" + predicate + else: + prompt = predicate + " of " + entity_map[subject_id]["name"] + inverse_negative = predicate + " of " + entity_map[object_id]["name"] + + if entity_map[subject_id]["name"] not in subject_golden: + subject_golden.append(entity_map[subject_id]["name"]) + result = { + "text": entity_map[object_id]["name"], + "start": entity_map[object_id]["start"], + "end": entity_map[object_id]["end"], + } + + inverse_relation.append(inverse_negative) + predicates.append(predicate) + + if prompt not in relation_example_map.keys(): + relation_example_map[prompt] = {"content": text, "result_list": [result], "prompt": prompt} + if self.anno_type == "image": + relation_example_map[prompt]["image"] = image + relation_example_map[prompt]["bbox"] = bbox + else: + relation_example_map[prompt]["result_list"].append(result) + + if predicate not in predicate_set: + predicate_set.append(predicate) + relation_prompt.append(prompt) + + for v in relation_example_map.values(): + relation_example.append(v) + + relation_examples.append(relation_example) + relation_prompt_list.append(relation_prompt) + subject_golden_list.append(subject_golden) + inverse_relation_list.append(inverse_relation) + predicate_list.append(predicates) + pbar.update(1) + + logger.info("Adding negative samples for first stage prompt...") + positive_examples, negative_examples = self.add_entity_negative_example( + entity_examples, texts, entity_prompt_list, entity_label_set, images, bbox_list + ) + if len(positive_examples) == 0: + all_entity_examples = [] + else: + all_entity_examples = positive_examples + negative_examples + + all_relation_examples = [] + if len(predicate_set) != 0: + logger.info("Adding negative samples for second stage prompt...") + if is_train: + + positive_examples = [] + negative_examples = [] + per_n_ratio = self.negative_ratio // 3 + + with tqdm(total=len(texts)) as pbar: + for i, text in enumerate(texts): + negative_example = [] + collects = [] + num_positive = len(relation_examples[i]) + + # 1. 
inverse_relation_list + redundants1 = inverse_relation_list[i] + + # 2. entity_name_set ^ subject_golden_list[i] + redundants2 = [] + if len(predicate_list[i]) != 0: + nonentity_list = list(set(entity_name_set) ^ set(subject_golden_list[i])) + nonentity_list.sort() + + if self.schema_lang == "ch": + redundants2 = [ + nonentity + "的" + predicate_list[i][random.randrange(len(predicate_list[i]))] + for nonentity in nonentity_list + ] + else: + redundants2 = [ + predicate_list[i][random.randrange(len(predicate_list[i]))] + " of " + nonentity + for nonentity in nonentity_list + ] + + # 3. entity_label_set ^ entity_prompt_list[i] + redundants3 = [] + if len(subject_golden_list[i]) != 0: + non_ent_label_list = list(set(entity_label_set) ^ set(entity_prompt_list[i])) + non_ent_label_list.sort() + + if self.schema_lang == "ch": + redundants3 = [ + subject_golden_list[i][random.randrange(len(subject_golden_list[i]))] + + "的" + + non_ent_label + for non_ent_label in non_ent_label_list + ] + else: + redundants3 = [ + non_ent_label + + " of " + + subject_golden_list[i][random.randrange(len(subject_golden_list[i]))] + for non_ent_label in non_ent_label_list + ] + + redundants_list = [redundants1, redundants2, redundants3] + + for redundants in redundants_list: + if self.anno_type == "text": + added, rest = self.add_relation_negative_example( + redundants, + texts[i], + num_positive, + per_n_ratio, + ) + else: + added, rest = self.add_relation_negative_example( + redundants, texts[i], num_positive, per_n_ratio, images[i], bbox_list[i] + ) + negative_example.extend(added) + collects.extend(rest) + + num_sup = num_positive * self.negative_ratio - len(negative_example) + if num_sup > 0 and collects: + if num_sup > len(collects): + idxs = [k for k in range(len(collects))] + else: + idxs = random.sample(range(0, len(collects)), num_sup) + for idx in idxs: + negative_example.append(collects[idx]) + + positive_examples.extend(relation_examples[i]) + negative_examples.extend(negative_example) + pbar.update(1) + all_relation_examples = positive_examples + negative_examples + else: + relation_examples = self.add_full_negative_example( + relation_examples, texts, relation_prompt_list, predicate_set, subject_golden_list + ) + all_relation_examples = [r for relation_example in relation_examples for r in relation_example] + return all_entity_examples + all_relation_examples + entity_cls_examples + + def generate_cls_example(self, text, labels, prompt_prefix, options, image=None, bbox=None): + random.shuffle(self.options) + cls_options = ",".join(self.options) + prompt = prompt_prefix + "[" + cls_options + "]" + + result_list = [] + example = {"content": text, "result_list": result_list, "prompt": prompt} + if image and bbox: + example["image"] = image + example["bbox"] = bbox + for label in labels: + start = prompt.rfind(label) - len(prompt) - 1 + end = start + len(label) + result = {"text": label, "start": start, "end": end} + example["result_list"].append(result) + return example + + def add_full_negative_example( + self, examples, texts, relation_prompt_list, predicate_set, subject_golden_list, images=None, bbox_list=None + ): + with tqdm(total=len(relation_prompt_list)) as pbar: + for i, relation_prompt in enumerate(relation_prompt_list): + negative_sample = [] + for subject in subject_golden_list[i]: + for predicate in predicate_set: + # The relation prompt is constructed as follows: + # subject + "的" + predicate -> Chinese + # predicate + " of " + subject -> English + if self.schema_lang == "ch": + prompt = 
subject + "的" + predicate + else: + prompt = predicate + " of " + subject + if prompt not in relation_prompt: + negative_result = {"content": texts[i], "result_list": [], "prompt": prompt} + if images and bbox_list: + negative_result["image"] = images[i] + negative_result["bbox"] = bbox_list[i] + negative_sample.append(negative_result) + examples[i].extend(negative_sample) + pbar.update(1) + return examples + + def add_entity_negative_example(self, examples, texts, prompts, label_set, images=None, bbox_list=None): + negative_examples = [] + positive_examples = [] + with tqdm(total=len(prompts)) as pbar: + for i, prompt in enumerate(prompts): + redundants = list(set(label_set) ^ set(prompt)) + redundants.sort() + + num_positive = len(examples[i]) + if num_positive != 0: + actual_ratio = math.ceil(len(redundants) / num_positive) + else: + # Set num_positive to 1 for text without positive example + num_positive, actual_ratio = 1, 0 + + if actual_ratio <= self.negative_ratio or self.negative_ratio == -1: + idxs = [k for k in range(len(redundants))] + else: + idxs = random.sample(range(0, len(redundants)), self.negative_ratio * num_positive) + + for idx in idxs: + negative_result = {"content": texts[i], "result_list": [], "prompt": redundants[idx]} + if images and bbox_list: + negative_result["image"] = images[i] + negative_result["bbox"] = bbox_list[i] + negative_examples.append(negative_result) + positive_examples.extend(examples[i]) + pbar.update(1) + return positive_examples, negative_examples + + def add_relation_negative_example(self, redundants, text, num_positive, ratio, image=None, bbox=None): + added_example = [] + rest_example = [] + + if num_positive != 0: + actual_ratio = math.ceil(len(redundants) / num_positive) + else: + # Set num_positive to 1 for text without positive example + num_positive, actual_ratio = 1, 0 + + all_idxs = [k for k in range(len(redundants))] + if actual_ratio <= ratio or ratio == -1: + idxs = all_idxs + rest_idxs = [] + else: + idxs = random.sample(range(0, len(redundants)), ratio * num_positive) + rest_idxs = list(set(all_idxs) ^ set(idxs)) + + for idx in idxs: + negative_result = {"content": text, "result_list": [], "prompt": redundants[idx]} + if image and bbox: + negative_result["image"] = image + negative_result["bbox"] = bbox + added_example.append(negative_result) + + for rest_idx in rest_idxs: + negative_result = {"content": text, "result_list": [], "prompt": redundants[rest_idx]} + if image and bbox: + negative_result["image"] = image + negative_result["bbox"] = bbox + rest_example.append(negative_result) + + return added_example, rest_example diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/__init__.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/__init__.py new file mode 100644 index 000000000..bff37dc2e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/__init__.py @@ -0,0 +1,51 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +import os + +from paddlenlp.version import git + +commit = "unknown" + +paddlenlp_dir = os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) +if commit.endswith("unknown") and git.is_git_repo(paddlenlp_dir) and git.have_git(): + commit = git.git_revision(paddlenlp_dir).decode("utf-8") + if git.is_dirty(paddlenlp_dir): + commit += ".dirty" +del paddlenlp_dir + + +__all__ = ["show"] + + +def show(): + """Get the corresponding commit id of paddlenlp. + + Returns: + The commit-id of paddlenlp will be output. + + full_version: version of paddlenlp + + + Examples: + .. code-block:: python + + import paddlenlp + + paddlenlp.version.show() + # commit: 1ef5b94a18773bb0b1bba1651526e5f5fc5b16fa + + """ + print("commit:", commit) diff --git a/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/git.py b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/git.py new file mode 100644 index 000000000..e042220e3 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/paddlenlp_3.0.0/version/git.py @@ -0,0 +1,48 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Git utilities.""" +# https://github.com/python/mypy/blob/2c2d126cc742f2467045d36780c33bb8fb77a614/mypy/git.py#L1-L34 +# Used also from setup.py, so don't pull in anything additional here (like mypy or typing): +from __future__ import annotations + +import os +import subprocess + + +def is_git_repo(dir: str) -> bool: + """Is the given directory version-controlled with git?""" + return os.path.exists(os.path.join(dir, ".git")) + + +def have_git() -> bool: + """Can we run the git executable?""" + try: + subprocess.check_output(["git", "--help"]) + return True + except subprocess.CalledProcessError: + return False + except OSError: + return False + + +def git_revision(dir: str) -> bytes: + """Get the SHA-1 of the HEAD of a git repository.""" + return subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=dir).strip() + + +def is_dirty(dir: str) -> bool: + """Check whether a git repository has uncommitted changes.""" + output = subprocess.check_output(["git", "status", "-uno", "--porcelain"], cwd=dir) + return output.strip() != b"" diff --git a/nlp/text_classification/bert/paddlepaddle/predict.py b/nlp/text_classification/bert/paddlepaddle/predict.py new file mode 100644 index 000000000..d3bccc907 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/predict.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import argparse +import numpy as np +from scipy.special import softmax + +import paddle +from paddle import inference +from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.transformers import BertTokenizer + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_path", + default=None, + type=str, + required=True, + help="The path prefix of inference model to be used.", ) + parser.add_argument( + "--device", + default="gpu", + type=str, + choices=["cpu", "gpu", "xpu"], + help="The device to select to train the model, is must be cpu/gpu/xpu.") + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + args = parser.parse_args() + return args + + +def convert_example(example, tokenizer, label_list, max_seq_length=128): + text = example + encoded_inputs = tokenizer(text=text, max_seq_len=max_seq_length) + input_ids = encoded_inputs["input_ids"] + segment_ids = encoded_inputs["token_type_ids"] + + return input_ids, segment_ids + + +class Predictor(object): + def __init__(self, predictor, input_handles, output_handle, tokenizer, + max_seq_length): + self.predictor = predictor + self.input_handles = input_handles + self.output_handle = output_handle + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + + @classmethod + def create_predictor(cls, args): + max_seq_length = args.max_seq_length + config = paddle.inference.Config(args.model_path + ".pdmodel", + args.model_path + ".pdiparams") + if args.device == "gpu": + # Set GPU configs accordingly + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + # Set CPU configs accordingly, + # such as enable_mkldnn, set_cpu_math_library_num_threads + config.disable_gpu() + elif args.device == "xpu": + # Set XPU configs accordingly + config.enable_xpu(100) + config.switch_use_feed_fetch_ops(False) + predictor = paddle.inference.create_predictor(config) + input_handles = [ + predictor.get_input_handle(name) + for name in predictor.get_input_names() + ] + output_handle = predictor.get_output_handle(predictor.get_output_names() + [0]) + tokenizer = BertTokenizer.from_pretrained( + os.path.dirname(args.model_path)) + + return cls(predictor, input_handles, output_handle, tokenizer, + max_seq_length) + + def predict(self, data, label_map, batch_size=1): + examples = [] + for text in data: + input_ids, segment_ids = convert_example( + text, + self.tokenizer, + label_list=label_map.values(), + max_seq_length=self.max_seq_length) + examples.append((input_ids, segment_ids)) + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # input + Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"), # segment + ): fn(samples) + + # Seperates data into some batches. 
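Side note on the `batchify_fn` defined just above in `predict.py`: it is a `Tuple` of two `Pad` collators from `paddlenlp.data`. A minimal, standalone sketch of what that collate function does to a toy batch is below (the token ids are made up; only the padding behavior matters).

```python
# Illustrative sketch, not part of the patch. Assumes paddlenlp is installed.
from paddlenlp.data import Pad, Tuple

samples = [
    ([101, 2023, 102], [0, 0, 0]),           # (input_ids, segment_ids), made-up ids
    ([101, 2307, 3185, 102], [0, 0, 0, 0]),
]
batchify_fn = Tuple(
    Pad(axis=0, pad_val=0, dtype="int64"),   # pad input_ids to the longest sample
    Pad(axis=0, pad_val=0, dtype="int64"),   # pad segment_ids the same way
)
input_ids, segment_ids = batchify_fn(samples)
print(input_ids.shape)  # (2, 4): the shorter sequence is right-padded with pad_val
```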
+ batches = [ + examples[idx:idx + batch_size] + for idx in range(0, len(examples), batch_size) + ] + + outputs = [] + results = [] + for batch in batches: + input_ids, segment_ids = batchify_fn(batch) + self.input_handles[0].copy_from_cpu(input_ids) + self.input_handles[1].copy_from_cpu(segment_ids) + self.predictor.run() + logits = self.output_handle.copy_to_cpu() + probs = softmax(logits, axis=1) + idx = np.argmax(probs, axis=1) + idx = idx.tolist() + labels = [label_map[i] for i in idx] + outputs.extend(probs) + results.extend(labels) + return outputs, results + + +def main(): + args = parse_args() + predictor = Predictor.create_predictor(args) + + data = [ + 'against shimmering cinematography that lends the setting the ethereal beauty of an asian landscape painting', + 'the situation in a well-balanced fashion', + 'at achieving the modest , crowd-pleasing goals it sets for itself', + 'so pat it makes your teeth hurt', + 'this new jangle of noise , mayhem and stupidity must be a serious contender for the title .' + ] + label_map = {0: 'negative', 1: 'positive'} + + outputs, results = predictor.predict(data, label_map) + for idx, text in enumerate(data): + print( + 'Data: {} \n Label: {} \n Negative prob: {} \n Positive prob: {} \n '. + format(text, results[idx], outputs[idx][0], outputs[idx][1])) + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/predict_glue.py b/nlp/text_classification/bert/paddlepaddle/predict_glue.py new file mode 100644 index 000000000..e9593129e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/predict_glue.py @@ -0,0 +1,163 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from functools import partial + +import paddle +from paddle import inference +from datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad, Dict + +from run_glue import METRIC_CLASSES, MODEL_CLASSES, task_to_keys + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to perform predict, selected in the list: " + + ", ".join(METRIC_CLASSES.keys()), ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_path", + default=None, + type=str, + required=True, + help="The path prefix of inference model to be used.", ) + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu", "xpu"], + help="Device selected for inference.", ) + parser.add_argument( + "--batch_size", + default=32, + type=int, + help="Batch size for predict.", ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. 
Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + args = parser.parse_args() + return args + + +class Predictor(object): + def __init__(self, predictor, input_handles, output_handles): + self.predictor = predictor + self.input_handles = input_handles + self.output_handles = output_handles + + @classmethod + def create_predictor(cls, args): + config = paddle.inference.Config(args.model_path + ".pdmodel", + args.model_path + ".pdiparams") + if args.device == "gpu": + # set GPU configs accordingly + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + # set CPU configs accordingly, + # such as enable_mkldnn, set_cpu_math_library_num_threads + config.disable_gpu() + elif args.device == "xpu": + # set XPU configs accordingly + config.enable_xpu(100) + config.switch_use_feed_fetch_ops(False) + predictor = paddle.inference.create_predictor(config) + input_handles = [ + predictor.get_input_handle(name) + for name in predictor.get_input_names() + ] + output_handles = [ + predictor.get_output_handle(name) + for name in predictor.get_output_names() + ] + return cls(predictor, input_handles, output_handles) + + def predict_batch(self, data): + for input_field, input_handle in zip(data, self.input_handles): + input_handle.copy_from_cpu(input_field.numpy() if isinstance( + input_field, paddle.Tensor) else input_field) + self.predictor.run() + output = [ + output_handle.copy_to_cpu() for output_handle in self.output_handles + ] + return output + + def predict(self, dataset, collate_fn, batch_size=1): + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=False) + data_loader = paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn, + num_workers=0, + return_list=True) + outputs = [] + for data in data_loader: + output = self.predict_batch(data) + outputs.append(output) + return outputs + + +def main(): + args = parse_args() + + predictor = Predictor.create_predictor(args) + + args.task_name = args.task_name.lower() + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + sentence1_key, sentence2_key = task_to_keys[args.task_name] + + test_ds = load_dataset('glue', args.task_name, split="test") + tokenizer = tokenizer_class.from_pretrained( + os.path.dirname(args.model_path)) + + def preprocess_function(examples): + # Tokenize the texts + texts = ((examples[sentence1_key], ) if sentence2_key is None else + (examples[sentence1_key], examples[sentence2_key])) + result = tokenizer(*texts, max_seq_len=args.max_seq_length) + if "label" in examples: + # In all cases, rename the column to labels because the model will expect that. 
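A brief note on the tokenizer call used in `preprocess_function` above: depending on whether the task has one or two sentence keys, the tokenizer is fed a single text or a text pair. The sketch below shows the difference; it assumes the `bert-base-uncased` vocabulary can be downloaded or is already cached, and the sample sentence is taken from `predict.py` above.

```python
# Illustrative sketch, not part of the patch.
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

single = tokenizer("so pat it makes your teeth hurt", max_seq_len=128)
pair = tokenizer("the first sentence", "and the second one", max_seq_len=128)

print(single["input_ids"][:5])        # [CLS] followed by the first word pieces
print(set(single["token_type_ids"]))  # {0}: only one segment
print(set(pair["token_type_ids"]))    # {0, 1}: the second sentence gets segment id 1
```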
+ result["labels"] = examples["label"] + return result + + test_ds = test_ds.map(preprocess_function) + batchify_fn = lambda samples, fn=Dict({ + 'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input + 'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # segment + }): fn(samples) + predictor.predict( + test_ds, batch_size=args.batch_size, collate_fn=batchify_fn) + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/requirements.txt b/nlp/text_classification/bert/paddlepaddle/requirements.txt new file mode 100644 index 000000000..419616471 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/requirements.txt @@ -0,0 +1,30 @@ +jieba +colorlog +colorama +seqeval +dill<0.3.5 +multiprocess<=0.70.12.2 +datasets >= 2.0.0 +tqdm +paddlefsl +sentencepiece +huggingface_hub>=0.19.2 +onnx>=1.10.0 +protobuf>=3.20.2 ; platform_system != "Windows" +protobuf==3.20.2 ; platform_system == "Windows" # onnx require: protobuf<4,>=3.20.2, paddle require different version on platforms, refer to: https://github.com/PaddlePaddle/Paddle/blob/cd88156a369bbfb83d6306f89e0ae6ebd78b8040/python/requirements.txt#L3 +paddle2onnx +Flask-Babel +visualdl +fastapi +uvicorn +typer +rich +safetensors +tool_helpers==0.1.1 ; platform_system == "Linux" +aistudio-sdk>=0.1.3 +jinja2 +regex +numpy<=1.26.4 +tiktoken +tokenizers +h5py \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/run_glue.py b/nlp/text_classification/bert/paddlepaddle/run_glue.py new file mode 100644 index 000000000..71c5b1820 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/run_glue.py @@ -0,0 +1,435 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
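Before the `run_glue.py` body: the script selects which dataset columns feed the tokenizer through the `task_to_keys` table defined below. A small usage sketch of that lookup follows; it assumes network access (or a local cache) for the HuggingFace `glue` dataset, whereas `run_training.sh` points the script at a local copy instead.

```python
# Illustrative sketch, not part of the patch.
from datasets import load_dataset

# Mirrors two entries of the task_to_keys table defined in run_glue.py below.
task_to_keys = {"sst2": ("sentence", None), "qnli": ("question", "sentence")}

train_ds = load_dataset("glue", "sst2", split="train")
sentence1_key, sentence2_key = task_to_keys["sst2"]

example = train_ds[0]
texts = ((example[sentence1_key],) if sentence2_key is None
         else (example[sentence1_key], example[sentence2_key]))
print(texts, "->", example["label"])
```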
+ +import argparse +import os +import sys +import random +import time +import math +import distutils.util +from functools import partial + +from paddlenlp.data import default_data_collator, DataCollatorWithPadding +from datasets import load_dataset +from paddlenlp.data.sampler import SamplerHelper +from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer +from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer +from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman + +import numpy as np +import paddle +from paddle.io import DataLoader +from paddle.metric import Metric, Accuracy, Precision, Recall +import paddlenlp + + +METRIC_CLASSES = { + "cola": Mcc, + "sst2": Accuracy, + "mrpc": AccuracyAndF1, + "stsb": PearsonAndSpearman, + "qqp": AccuracyAndF1, + "mnli": Accuracy, + "qnli": Accuracy, + "rte": Accuracy, +} + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +MODEL_CLASSES = { + "bert": (BertForSequenceClassification, BertTokenizer), + "ernie": (ErnieForSequenceClassification, ErnieTokenizer), +} + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + + ", ".join(METRIC_CLASSES.keys()), ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum([ + list(classes[-1].pretrained_init_configuration.keys()) + for classes in MODEL_CLASSES.values() + ], [])), ) + parser.add_argument( + "--dataset_path", + default=None, + type=str, + required=True, + help="The local path of the dataset.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. 
Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument( + "--learning_rate", + default=1e-4, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument( + "--num_train_epochs", + default=3, + type=int, + help="Total number of training epochs to perform.", ) + parser.add_argument( + "--logging_steps", + type=int, + default=100, + help="Log every X updates steps.") + parser.add_argument( + "--save_steps", + type=int, + default=100, + help="Save checkpoint every X updates steps.") + parser.add_argument( + "--batch_size", + default=32, + type=int, + help="Batch size per GPU/CPU for training.", ) + parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + parser.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps. If > 0: Override warmup_proportion" + ) + parser.add_argument( + "--warmup_proportion", + default=0.1, + type=float, + help="Linear warmup proportion over total steps.") + parser.add_argument( + "--adam_epsilon", + default=1e-6, + type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument( + "--seed", default=42, type=int, help="random seed for initialization") + parser.add_argument( + "--device", + default="gpu", + type=str, + choices=["cpu", "gpu", "xpu", "npu"], + help="The device to select to train the model, is must be cpu/gpu/xpu/npu." + ) + parser.add_argument( + "--use_amp", + type=distutils.util.strtobool, + default=False, + help="Enable mixed precision training.") + parser.add_argument( + "--scale_loss", + type=float, + default=2**15, + help="The value of scale_loss for fp16.") + args = parser.parse_args() + return args + + +def set_seed(args): + # Use the same data seed(for data shuffle) for all procs to guarantee data + # consistency after sharding. + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. 
By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) + + +@paddle.no_grad() +def evaluate(model, loss_fct, metric, data_loader): + model.eval() + metric.reset() + for batch in data_loader: + logits = model(batch['input_ids'], batch['token_type_ids']) + loss = loss_fct(logits, batch['labels']) + correct = metric.compute(logits, batch['labels']) + metric.update(correct) + res = metric.accumulate() + if isinstance(metric, AccuracyAndF1): + print( + "eval loss: %f, acc: %s, precision: %s, recall: %s, f1: %s, acc and f1: %s, " + % ( + loss.numpy(), + res[0], + res[1], + res[2], + res[3], + res[4], ), + end='') + elif isinstance(metric, Mcc): + print("eval loss: %f, mcc: %s, " % (loss.numpy(), res[0]), end='') + elif isinstance(metric, PearsonAndSpearman): + print( + "eval loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s, " + % (loss.numpy(), res[0], res[1], res[2]), + end='') + else: + print("eval loss: %f, acc: %s, " % (loss.numpy(), res), end='') + model.train() + + +def do_train(args): + paddle.set_device(args.device) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + + set_seed(args) + + args.task_name = args.task_name.lower() + + sentence1_key, sentence2_key = task_to_keys[args.task_name] + + metric_class = METRIC_CLASSES[args.task_name] + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + + train_ds = load_dataset(path=args.dataset_path, split="train") + columns = train_ds.column_names + is_regression = args.task_name == "stsb" + label_list = None + if not is_regression: + label_list = train_ds.features["label"].names + num_classes = len(label_list) + else: + num_classes = 1 + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + + def preprocess_function(examples): + # Tokenize the texts + texts = ((examples[sentence1_key], ) if sentence2_key is None else + (examples[sentence1_key], examples[sentence2_key])) + result = tokenizer(*texts, max_seq_len=args.max_seq_length) + if "label" in examples: + # In all cases, rename the column to labels because the model will expect that. 
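Further down, `do_train()` wraps the learning rate in `LinearDecayWithWarmup`. For reference, a standalone sketch of the schedule it produces is shown here (100 steps and 10% warmup are arbitrary; the peak learning rate 2e-5 matches `run_training.sh`).

```python
# Illustrative sketch, not part of the patch. Assumes paddlenlp is installed.
from paddlenlp.transformers import LinearDecayWithWarmup

scheduler = LinearDecayWithWarmup(2e-5, total_steps=100, warmup=0.1)

lrs = []
for _ in range(100):
    lrs.append(scheduler.get_lr())
    scheduler.step()

# Starts at 0, ramps linearly to 2e-5 over the warmup steps, then decays toward 0.
print(lrs[0], max(lrs), lrs[-1])
```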
+ result["labels"] = examples["label"] + return result + + train_ds = train_ds.map(preprocess_function, + batched=True, + remove_columns=columns) + train_batch_sampler = paddle.io.DistributedBatchSampler( + train_ds, batch_size=args.batch_size, shuffle=True) + batchify_fn = DataCollatorWithPadding(tokenizer) + train_data_loader = DataLoader( + dataset=train_ds, + batch_sampler=train_batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + train_data_loader.prefetch_factor=1 + if args.task_name == "mnli": + dev_ds_matched, dev_ds_mismatched = load_dataset( + path=args.dataset_path, + split=["validation_matched", "validation_mismatched"]) + + dev_ds_matched = dev_ds_matched.map(preprocess_function, + batched=True, + remove_columns=columns) + dev_ds_mismatched = dev_ds_mismatched.map(preprocess_function, + batched=True, + remove_columns=columns) + dev_batch_sampler_matched = paddle.io.BatchSampler( + dev_ds_matched, batch_size=args.batch_size, shuffle=False) + dev_data_loader_matched = DataLoader( + dataset=dev_ds_matched, + batch_sampler=dev_batch_sampler_matched, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + dev_batch_sampler_mismatched = paddle.io.BatchSampler( + dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) + dev_data_loader_mismatched = DataLoader( + dataset=dev_ds_mismatched, + batch_sampler=dev_batch_sampler_mismatched, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + else: + dev_ds = load_dataset(path=args.dataset_path, split='validation') + dev_ds = dev_ds.map(preprocess_function, + batched=True, + remove_columns=columns) + dev_batch_sampler = paddle.io.BatchSampler( + dev_ds, batch_size=args.batch_size, shuffle=False) + dev_data_loader = DataLoader( + dataset=dev_ds, + batch_sampler=dev_batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=True) + dev_data_loader.prefetch_factor=1 + model = model_class.from_pretrained( + args.model_name_or_path, num_classes=num_classes) + if paddle.distributed.get_world_size() > 1: + model = paddle.DataParallel(model) + + num_training_steps = args.max_steps if args.max_steps > 0 else ( + len(train_data_loader) * args.num_train_epochs) + warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + warmup) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. 
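The comment above describes the weight-decay exclusion rule: any parameter whose structured name contains "bias" or "norm" is left out of decay, and the surviving names are handed to AdamW through `apply_decay_param_fun`. Below is a self-contained sketch of the same wiring on a toy two-layer model; the layer names are illustrative, but the mechanism matches the call that follows in the patch.

```python
# Illustrative sketch, not part of the patch.
import paddle
from paddle import nn

class Toy(nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)
        self.layer_norm = nn.LayerNorm(4)

    def forward(self, x):
        return self.layer_norm(self.linear(x))

model = Toy()
# Keep only parameters whose names contain neither "bias" nor "norm".
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
opt = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    parameters=model.parameters(),
    weight_decay=0.01,
    apply_decay_param_fun=lambda name: name in decay_params,  # decay only listed names
)
print(decay_params)  # only the plain weight matrix of the Linear layer remains
```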
+ decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_scheduler, + beta1=0.9, + beta2=0.999, + epsilon=args.adam_epsilon, + parameters=model.parameters(), + weight_decay=args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params) + + loss_fct = paddle.nn.loss.CrossEntropyLoss( + ) if not is_regression else paddle.nn.loss.MSELoss() + + metric = metric_class() + if args.use_amp: + scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) + + global_step = 0 + tic_train = time.time() + for epoch in range(args.num_train_epochs): + for step, batch in enumerate(train_data_loader): + global_step += 1 + with paddle.amp.auto_cast( + args.use_amp, + custom_white_list=["layer_norm", "softmax", "gelu"]): + logits = model(batch['input_ids'], batch['token_type_ids']) + loss = loss_fct(logits, batch['labels']) + if args.use_amp: + scaler.scale(loss).backward() + scaler.minimize(optimizer, loss) + else: + loss.backward() + optimizer.step() + lr_scheduler.step() + optimizer.clear_grad() + if global_step % args.logging_steps == 0: + print( + "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" + % (global_step, num_training_steps, epoch, step, + paddle.distributed.get_rank(), loss, optimizer.get_lr(), + args.logging_steps / (time.time() - tic_train))) + tic_train = time.time() + if global_step % args.save_steps == 0 or global_step == num_training_steps: + tic_eval = time.time() + if args.task_name == "mnli": + evaluate(model, loss_fct, metric, dev_data_loader_matched) + evaluate(model, loss_fct, metric, + dev_data_loader_mismatched) + print("eval done total : %s s" % (time.time() - tic_eval)) + else: + evaluate(model, loss_fct, metric, dev_data_loader) + print("eval done total : %s s" % (time.time() - tic_eval)) + if paddle.distributed.get_rank() == 0: + output_dir = os.path.join(args.output_dir, + "%s_ft_model_%d.pdparams" % + (args.task_name, global_step)) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Need better way to get inner model of DataParallel + model_to_save = model._layers if isinstance( + model, paddle.DataParallel) else model + model_to_save.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + loss_value = loss.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + if global_step >= num_training_steps: + return + + +def print_arguments(args): + """print arguments""" + print('----------- Configuration Arguments -----------') + for arg, value in sorted(vars(args).items()): + print('%s: %s' % (arg, value)) + print('------------------------------------------------') + + +if __name__ == "__main__": + args = parse_args() + try: + from dltest import show_training_arguments + show_training_arguments(args) + except: + pass + do_train(args) + try: + # WARN: Fix hang + os._exit(0) + except: + exit(0) diff --git a/nlp/text_classification/bert/paddlepaddle/run_pretrain.py b/nlp/text_classification/bert/paddlepaddle/run_pretrain.py new file mode 100644 index 000000000..216f915bd --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/run_pretrain.py @@ -0,0 +1,496 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import collections +import itertools +import logging +import os +import random +import time +import h5py +import yaml +import distutils.util +from functools import partial +from concurrent.futures import ThreadPoolExecutor + +import numpy as np + +import paddle +import paddle.distributed as dist +from paddle.io import DataLoader, Dataset + +from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.utils import profiler +from paddlenlp.utils.tools import TimeCostAverage +from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion +from paddlenlp.transformers import ErnieForPretraining, ErnieModel, ErniePretrainingCriterion +from paddlenlp.transformers import BertTokenizer, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup + +FORMAT = '%(asctime)s-%(levelname)s: %(message)s' +logging.basicConfig(level=logging.INFO, format=FORMAT) +logger = logging.getLogger(__name__) + +MODEL_CLASSES = { + "bert": + (BertModel, BertForPretraining, BertPretrainingCriterion, BertTokenizer), + "ernie": + (ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieTokenizer) +} + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum([ + list(classes[-1].pretrained_init_configuration.keys()) + for classes in MODEL_CLASSES.values() + ], [])), ) + parser.add_argument( + "--input_dir", + default=None, + type=str, + required=True, + help="The input directory where the data will be read from.", ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + + parser.add_argument( + "--max_predictions_per_seq", + default=80, + type=int, + help="The maximum total of masked tokens in input sequence") + + parser.add_argument( + "--batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", ) + parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + parser.add_argument( + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument( + "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", + default=3, + type=int, + help="Total number of training epochs to perform.", ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps.") + + parser.add_argument( + "--logging_steps", + type=int, + default=500, + help="Log every X updates steps.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save checkpoint every X updates steps.") + parser.add_argument( + "--seed", type=int, default=42, help="random seed for initialization") + parser.add_argument( + "--device", + type=str, + default="gpu", + choices=["cpu", "gpu", "xpu"], + help="Device for selecting for the training.") + parser.add_argument( + "--use_amp", + type=distutils.util.strtobool, + default=False, + help="Enable mixed precision training.") + parser.add_argument( + "--scale_loss", + type=float, + default=2**15, + help="The value of scale_loss for fp16.") + parser.add_argument( + "--to_static", + type=distutils.util.strtobool, + default=False, + help="Enable training under @to_static.") + + # For benchmark. + parser.add_argument( + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + args = parser.parse_args() + return args + + +def set_seed(args): + random.seed(args.seed + paddle.distributed.get_rank()) + np.random.seed(args.seed + paddle.distributed.get_rank()) + paddle.seed(args.seed + paddle.distributed.get_rank()) + + +class WorkerInitObj(object): + def __init__(self, seed): + self.seed = seed + + def __call__(self, id): + np.random.seed(seed=self.seed + id) + random.seed(self.seed + id) + + +def create_pretraining_dataset(input_file, max_pred_length, shared_list, args, + worker_init): + train_data = PretrainingDataset( + input_file=input_file, max_pred_length=max_pred_length) + # files have been sharded, no need to dispatch again + train_batch_sampler = paddle.io.BatchSampler( + train_data, batch_size=args.batch_size, shuffle=True) + + # DataLoader cannot be pickled because of its place. + # If it can be pickled, use global function instead of lambda and use + # ProcessPoolExecutor instead of ThreadPoolExecutor to prefetch. 
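The comment above is the reason `run_pretrain.py` prefetches the next HDF5 shard on a single worker thread rather than a process pool: the Paddle `DataLoader` cannot be pickled, so only a thread can build it in the background. A stripped-down sketch of that prefetch pattern follows; the shard names and the sleep are stand-ins for the real dataset construction.

```python
# Illustrative sketch, not part of the patch.
from concurrent.futures import ThreadPoolExecutor
import time

def build_loader(shard_path):
    time.sleep(0.1)  # stands in for create_pretraining_dataset(...)
    return f"loader for {shard_path}", shard_path

pool = ThreadPoolExecutor(1)
shards = ["train_shard_0.hdf5", "train_shard_1.hdf5", "train_shard_2.hdf5"]  # made-up names

loader, _ = build_loader(shards[0])
for nxt in shards[1:]:
    future = pool.submit(build_loader, nxt)  # prefetch the next shard in the background
    print("training on", loader)             # stands in for the inner step loop
    loader, _ = future.result(timeout=None)  # swap in the prefetched loader at the boundary
print("training on", loader)
```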
+ def _collate_data(data, stack_fn=Stack()): + num_fields = len(data[0]) + out = [None] * num_fields + # input_ids, segment_ids, input_mask, masked_lm_positions, + # masked_lm_labels, next_sentence_labels, mask_token_num + for i in (0, 1, 2, 5): + out[i] = stack_fn([x[i] for x in data]) + batch_size, seq_length = out[0].shape + size = num_mask = sum(len(x[3]) for x in data) + # Padding for divisibility by 8 for fp16 or int8 usage + if size % 8 != 0: + size += 8 - (size % 8) + # masked_lm_positions + # Organize as a 1D tensor for gather or use gather_nd + out[3] = np.full(size, 0, dtype=np.int32) + # masked_lm_labels + out[4] = np.full([size, 1], -1, dtype=np.int64) + mask_token_num = 0 + for i, x in enumerate(data): + for j, pos in enumerate(x[3]): + out[3][mask_token_num] = i * seq_length + pos + out[4][mask_token_num] = x[4][j] + mask_token_num += 1 + # mask_token_num + out.append(np.asarray([mask_token_num], dtype=np.float32)) + return out + + train_data_loader = DataLoader( + dataset=train_data, + batch_sampler=train_batch_sampler, + collate_fn=_collate_data, + num_workers=0, + worker_init_fn=worker_init, + return_list=True) + return train_data_loader, input_file + + +def create_input_specs(): + input_ids = paddle.static.InputSpec( + name="input_ids", shape=[-1, -1], dtype="int64") + segment_ids = paddle.static.InputSpec( + name="segment_ids", shape=[-1, -1], dtype="int64") + position_ids = None + input_mask = paddle.static.InputSpec( + name="input_mask", shape=[-1, 1, 1, -1], dtype="float32") + masked_lm_positions = paddle.static.InputSpec( + name="masked_lm_positions", shape=[-1], dtype="int32") + return [ + input_ids, segment_ids, position_ids, input_mask, masked_lm_positions + ] + + +class PretrainingDataset(Dataset): + def __init__(self, input_file, max_pred_length): + self.input_file = input_file + self.max_pred_length = max_pred_length + f = h5py.File(input_file, "r") + keys = [ + 'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', + 'masked_lm_ids', 'next_sentence_labels' + ] + self.inputs = [np.asarray(f[key][:]) for key in keys] + f.close() + + def __len__(self): + 'Denotes the total number of samples' + return len(self.inputs[0]) + + def __getitem__(self, index): + + [ + input_ids, input_mask, segment_ids, masked_lm_positions, + masked_lm_ids, next_sentence_labels + ] = [ + input[index].astype(np.int64) + if indice < 5 else np.asarray(input[index].astype(np.int64)) + for indice, input in enumerate(self.inputs) + ] + # TODO: whether to use reversed mask by changing 1s and 0s to be + # consistent with nv bert + input_mask = (1 - np.reshape( + input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])) * -1e9 + + index = self.max_pred_length + # store number of masked tokens in index + # outputs of torch.nonzero diff with that of numpy.nonzero by zip + padded_mask_indices = (masked_lm_positions == 0).nonzero()[0] + if len(padded_mask_indices) != 0: + index = padded_mask_indices[0].item() + mask_token_num = index + else: + index = self.max_pred_length + mask_token_num = self.max_pred_length + # masked_lm_labels = np.full(input_ids.shape, -1, dtype=np.int64) + # masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index] + masked_lm_labels = masked_lm_ids[:index] + masked_lm_positions = masked_lm_positions[:index] + # softmax_with_cross_entropy enforce last dim size equal 1 + masked_lm_labels = np.expand_dims(masked_lm_labels, axis=-1) + next_sentence_labels = np.expand_dims(next_sentence_labels, axis=-1) + + return [ + input_ids, segment_ids, input_mask, 
masked_lm_positions, + masked_lm_labels, next_sentence_labels + ] + + +def do_train(args): + paddle.set_device(args.device) + if paddle.distributed.get_world_size() > 1: + paddle.distributed.init_parallel_env() + + set_seed(args) + worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank()) + + args.model_type = args.model_type.lower() + base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[ + args.model_type] + + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + + pretrained_models_list = list( + model_class.pretrained_init_configuration.keys()) + if args.model_name_or_path in pretrained_models_list: + model = model_class( + base_class(**model_class.pretrained_init_configuration[ + args.model_name_or_path])) + else: + model = model_class.from_pretrained(args.model_name_or_path) + criterion = criterion_class( + getattr(model, model_class.base_model_prefix).config["vocab_size"]) + # decorate @to_static for benchmark, skip it by default. + if args.to_static: + specs = create_input_specs() + model = paddle.jit.to_static(model, input_spec=specs) + logger.info("Successfully to apply @to_static with specs: {}".format( + specs)) + + if paddle.distributed.get_world_size() > 1: + model = paddle.DataParallel(model) + + # If use default last_epoch, lr of the first iteration is 0. + # Use `last_epoch = 0` to be consistent with nv bert. + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup( + args.learning_rate, num_training_steps, args.warmup_steps, last_epoch=0) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_scheduler, + epsilon=args.adam_epsilon, + parameters=model.parameters(), + weight_decay=args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params) + if args.use_amp: + scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) + + pool = ThreadPoolExecutor(1) + global_step = 0 + tic_train = time.time() + for epoch in range(args.num_train_epochs): + files = [ + os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) + if os.path.isfile(os.path.join(args.input_dir, f)) and "train" in f + ] + files.sort() + num_files = len(files) + random.Random(args.seed + epoch).shuffle(files) + f_start_id = 0 + + shared_file_list = {} + + if paddle.distributed.get_world_size() > num_files: + remainder = paddle.distributed.get_world_size() % num_files + data_file = files[( + f_start_id * paddle.distributed.get_world_size() + + paddle.distributed.get_rank() + remainder * f_start_id) % + num_files] + else: + data_file = files[(f_start_id * paddle.distributed.get_world_size() + + paddle.distributed.get_rank()) % num_files] + + previous_file = data_file + + train_data_loader, _ = create_pretraining_dataset( + data_file, args.max_predictions_per_seq, shared_file_list, args, + worker_init) + + # TODO(guosheng): better way to process single file + single_file = True if f_start_id + 1 == len(files) else False + + for f_id in range(f_start_id, len(files)): + if not single_file and f_id == f_start_id: + continue + if paddle.distributed.get_world_size() > num_files: + data_file = files[( + f_id * paddle.distributed.get_world_size() + + paddle.distributed.get_rank() + remainder * f_id) % + num_files] + 
else: + data_file = files[(f_id * paddle.distributed.get_world_size() + + paddle.distributed.get_rank()) % num_files] + + previous_file = data_file + dataset_future = pool.submit(create_pretraining_dataset, data_file, + args.max_predictions_per_seq, + shared_file_list, args, worker_init) + train_cost_avg = TimeCostAverage() + reader_cost_avg = TimeCostAverage() + total_samples = 0 + batch_start = time.time() + for step, batch in enumerate(train_data_loader): + train_reader_cost = time.time() - batch_start + reader_cost_avg.record(train_reader_cost) + global_step += 1 + (input_ids, segment_ids, input_mask, masked_lm_positions, + masked_lm_labels, next_sentence_labels, + masked_lm_scale) = batch + with paddle.amp.auto_cast( + args.use_amp, + custom_white_list=["layer_norm", "softmax", "gelu"]): + prediction_scores, seq_relationship_score = model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + masked_positions=masked_lm_positions) + loss = criterion(prediction_scores, seq_relationship_score, + masked_lm_labels, next_sentence_labels, + masked_lm_scale) + if args.use_amp: + scaler.scale(loss).backward() + scaler.minimize(optimizer, loss) + else: + loss.backward() + optimizer.step() + lr_scheduler.step() + optimizer.clear_grad() + total_samples += args.batch_size + train_run_cost = time.time() - batch_start + train_cost_avg.record(train_run_cost) + + # Profile for model benchmark + if args.profiler_options is not None: + profiler.add_profiler_step(args.profiler_options) + + if global_step % args.logging_steps == 0: + if paddle.distributed.get_rank() == 0: + logger.info( + "global step: %d, epoch: %d, batch: %d, loss: %f, " + "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" + % (global_step, epoch, step, loss, + reader_cost_avg.get_average(), + train_cost_avg.get_average(), total_samples / + args.logging_steps, total_samples / ( + args.logging_steps * + train_cost_avg.get_average()))) + total_samples = 0 + train_cost_avg.reset() + reader_cost_avg.reset() + if global_step % args.save_steps == 0: + if paddle.distributed.get_rank() == 0: + output_dir = os.path.join(args.output_dir, + "model_%d" % global_step) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # need better way to get inner model of DataParallel + model_to_save = model._layers if isinstance( + model, paddle.DataParallel) else model + model_to_save.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + paddle.save( + optimizer.state_dict(), + os.path.join(output_dir, "model_state.pdopt")) + if global_step >= args.max_steps: + del train_data_loader + return + batch_start = time.time() + + del train_data_loader + train_data_loader, data_file = dataset_future.result(timeout=None) + + +if __name__ == "__main__": + args = parse_args() + print(args) + do_train(args) diff --git a/nlp/text_classification/bert/paddlepaddle/run_training.sh b/nlp/text_classification/bert/paddlepaddle/run_training.sh new file mode 100644 index 000000000..465c4b9d7 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/run_training.sh @@ -0,0 +1,26 @@ +: ${EPOCH_ARG:="--num_train_epochs 3"} +: ${BATCH_SIZE:=32} +read py_major py_minor <<< $(python3 -V 2>&1 | awk -F '[ .]' '{print $2, $3}') +if [[ $py_major -eq 3 ]] && (( 9 <= py_minor && py_minor <= 12 )); then + pip3 install numpy~=1.26.4 +else + pip3 install numpy==1.21.6 +fi + +DATASET=${ROOT_DIR}/data/datasets/glue/sst2 + +python3 run_glue.py \ + --model_type bert \ + --model_name_or_path 
bert-base-uncased \ + --task_name SST2 \ + --dataset_path ${DATASET} \ + --max_seq_length 128 \ + --batch_size ${BATCH_SIZE} \ + --learning_rate 2e-5 \ + --logging_steps 100 \ + --save_steps 1000 \ + --output_dir ./tmp/ \ + --device gpu \ + --use_amp False ${EPOCH_ARG} "$@" + +exit $? diff --git a/nlp/text_classification/bert/paddlepaddle/static/README.md b/nlp/text_classification/bert/paddlepaddle/static/README.md new file mode 100644 index 000000000..fa3d5f62b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/README.md @@ -0,0 +1,153 @@ +# BERT Benchmark with Fleet API +## 模型简介 + +[BERT](https://arxiv.org/abs/1810.04805) (Bidirectional Encoder Representations from Transformers)以[Transformer](https://arxiv.org/abs/1706.03762) 编码器为网络基本组件,使用掩码语言模型(Masked Language Model)和邻接句子预测(Next Sentence Prediction)两个任务在大规模无标注文本语料上进行预训练(pre-train),得到融合了双向内容的通用语义表示模型。以预训练产生的通用语义表示模型为基础,结合任务适配的简单输出层,微调(fine-tune)后即可应用到下游的NLP任务,效果通常也较直接在下游的任务上训练的模型更优。此前BERT即在[GLUE评测任务](https://gluebenchmark.com/tasks)上取得了SOTA的结果。 + +本项目是BERT在 Paddle 2.0上的开源实现,包含了预训练和[GLUE评测任务](https://gluebenchmark.com/tasks)上的微调代码。 + +## 快速开始 + +### 数据准备 + +#### Pre-training数据准备 + +`create_pretraining_data.py` 是创建预训练程序所需数据的脚本。其以文本文件(使用换行符换行和空白符分隔,data目录下提供了部分示例数据)为输入,经由BERT tokenizer进行tokenize后再做生成sentence pair正负样本、掩码token等处理,最后输出hdf5格式的数据文件。使用方式如下: + +```shell +python create_pretraining_data.py \ + --input_file=data/sample_text.txt \ + --output_file=data/training_data.hdf5 \ + --bert_model=bert-base-uncased \ + --max_seq_length=128 \ + --max_predictions_per_seq=20 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=5 +``` + +其中参数释义如下: +- `input_file` 指定输入文件,可以使用目录,指定目录时将包括目录中的所有`.txt`文件。 +- `output_file` 指定输出文件。 +- `bert_model` 指定使用特定BERT模型对应的tokenizer进行tokenize处理。 +- `max_seq_length` 指定最大句子长度,超过该长度将被截断,不足该长度的将会进行padding。 +- `max_predictions_per_seq` 表示每个句子中会被mask的token的最大数目。 +- `masked_lm_prob` 表示每个token被mask的概率。 +- `random_seed` 指定随机种子。 +- `dupe_factor` 指定输入数据被重复处理的次数,每次处理将重新产生随机mask。 + +使用以上预训练数据生成程序可以用于处理领域垂类数据后进行二次预训练。若需要使用BERT论文中预训练使用的英文Wiki和BookCorpus数据,可以参考[这里](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)进行处理,得到的数据可以直接接入本项目中的预训练程序使用。 + +#### Fine-tuning数据准备 +Fine-tuning的数据集已经被PaddleNLP框架集成,只需要填写相应的数据集的名称,PaddleNLP会自动下载数据集,具体的使用方法可以参考 `run_glue.py` 脚本。 + +##### GLUE评测任务数据 + +GLUE评测任务所含数据集已在paddlenlp中以API形式提供,无需预先准备,使用`run_glue.py`执行微调时将会自动下载。 + +### 执行Pre-training + +#### GPU训练 +```shell +unset CUDA_VISIBLE_DEVICES +python -m paddle.distributed.launch --gpus "0" run_pretrain.py \ + --model_type bert \ + --model_name_or_path bert-base-uncased \ + --max_predictions_per_seq 20 \ + --batch_size 32 \ + --learning_rate 1e-4 \ + --weight_decay 1e-2 \ + --adam_epsilon 1e-6 \ + --warmup_steps 10000 \ + --input_dir data/ \ + --output_dir pretrained_models/ \ + --logging_steps 1 \ + --save_steps 20000 \ + --max_steps 1000000 \ + --device gpu \ + --use_amp False +``` +其中参数释义如下: +- `model_type` 指示了模型类型,使用BERT模型时设置为bert即可。 +- `model_name_or_path` 指示了某种特定配置的模型,对应有其预训练模型和预训练时使用的 tokenizer。若模型相关内容保存在本地,这里也可以提供相应目录地址。 +- `max_predictions_per_seq` 表示每个句子中会被mask的token的最大数目,与创建预训练数据时的设置一致。 +- `batch_size` 表示每次迭代**每张卡**上的样本数目。 +- `learning_rate` 表示基础学习率大小,将于learning rate scheduler产生的值相乘作为当前学习率。 +- `weight_decay` 表示AdamW优化器中使用的weight_decay的系数。 +- `adam_epsilon` 表示AdamW优化器中使用的epsilon值。 +- `warmup_steps` 表示动态学习率热启的step数。 +- `num_train_epochs` 表示训练轮数。 +- `input_dir` 表示输入数据的目录,该目录下所有文件名中包含training的文件将被作为训练数据。 +- `output_dir` 表示模型的保存目录。 +- `logging_steps` 表示日志打印间隔。 +- `save_steps` 
+### Running Pre-training
+
+#### GPU training
+```shell
+unset CUDA_VISIBLE_DEVICES
+python -m paddle.distributed.launch --gpus "0" run_pretrain.py \
+ --model_type bert \
+ --model_name_or_path bert-base-uncased \
+ --max_predictions_per_seq 20 \
+ --batch_size 32 \
+ --learning_rate 1e-4 \
+ --weight_decay 1e-2 \
+ --adam_epsilon 1e-6 \
+ --warmup_steps 10000 \
+ --input_dir data/ \
+ --output_dir pretrained_models/ \
+ --logging_steps 1 \
+ --save_steps 20000 \
+ --max_steps 1000000 \
+ --device gpu \
+ --use_amp False
+```
+The parameters are as follows:
+- `model_type`: the model type; set it to bert when using a BERT model.
+- `model_name_or_path`: a model with a particular configuration, together with its pre-trained weights and the tokenizer used during pre-training. If the model files are stored locally, a directory path can be given here instead.
+- `max_predictions_per_seq`: maximum number of masked tokens per sequence; must match the value used when creating the pre-training data.
+- `batch_size`: number of samples **per card** per iteration.
+- `learning_rate`: the base learning rate; it is multiplied by the value produced by the learning rate scheduler to obtain the current learning rate.
+- `weight_decay`: the weight_decay coefficient used by the AdamW optimizer.
+- `adam_epsilon`: the epsilon value used by the AdamW optimizer.
+- `warmup_steps`: number of warm-up steps of the learning rate schedule.
+- `num_train_epochs`: number of training epochs.
+- `input_dir`: directory of the input data; every file in this directory whose name contains "training" is used as training data.
+- `output_dir`: directory where models are saved.
+- `logging_steps`: logging interval.
+- `save_steps`: interval (in steps) between model saving and evaluation.
+- `max_steps`: maximum number of training steps. If training for `num_train_epochs` epochs would take more steps than this value, training stops early once `max_steps` is reached.
+- `device`: device used for training; 'gpu' for GPU, 'xpu' for Baidu Kunlun cards, 'cpu' for CPU.
+- `use_amp`: whether to enable automatic mixed precision training.
+**NOTICE**: During pre-training, the data directory must contain data already processed by `create_pretraining_data.py`, so run that preprocessing script first; otherwise pre-training will fail with an error.
+
+### Running Fine-tuning
+
+Taking the SST-2 task from GLUE as an example, fine-tuning is launched as follows:
+
+```shell
+unset CUDA_VISIBLE_DEVICES
+python -m paddle.distributed.launch --gpus "0" run_glue.py \
+ --model_type bert \
+ --model_name_or_path bert-base-uncased \
+ --task_name SST-2 \
+ --max_seq_length 128 \
+ --batch_size 32 \
+ --learning_rate 2e-5 \
+ --num_train_epochs 3 \
+ --logging_steps 1 \
+ --save_steps 500 \
+ --output_dir ./tmp/ \
+ --device gpu
+```
+
+The parameters are as follows:
+- `model_type`: the model type; set it to bert when using a BERT model.
+- `model_name_or_path`: a model with a particular configuration, together with its pre-trained weights and the tokenizer used during pre-training. If the model files are stored locally, a directory path can be given here instead. Note: the pre-trained models corresponding to `bert-base-uncased` and the like are converted from [huggingface/transformers](https://github.com/huggingface/transformers); see the converter contents in the current directory for details.
+- `task_name`: the fine-tuning task.
+- `max_seq_length`: maximum sequence length; longer sequences are truncated.
+- `batch_size`: number of samples **per card** per iteration.
+- `learning_rate`: the base learning rate; it is multiplied by the value produced by the learning rate scheduler to obtain the current learning rate.
+- `num_train_epochs`: number of training epochs.
+- `logging_steps`: logging interval.
+- `save_steps`: interval (in steps) between model saving and evaluation.
+- `output_dir`: path where models are saved.
+- `device`: device used for training; 'gpu' for GPU, 'xpu' for Baidu Kunlun cards, 'cpu' for CPU.
+
+After fine-tuning `bert-base-uncased` on the GLUE tasks, the results on the dev sets are as follows:
+
+| Task  | Metric                       | Result      |
+|-------|------------------------------|-------------|
+| CoLA  | Matthews corr                | 59.90       |
+| SST-2 | Accuracy                     | 92.76       |
+| STS-B | Pearson/Spearman corr        | 89.12       |
+| MNLI  | matched acc./mismatched acc. | 84.45/84.62 |
+| QNLI  | acc.                         | 91.73       |
+| RTE   | acc.                         | 67.15       |
+
+### Prediction
+
+After fine-tuning is complete, the model intended for prediction can be exported, and prediction on the GLUE tasks can then be run as follows (based on Paddle's [Python inference API](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/05_inference_deployment/inference/python_infer_cn.html)):
+
+```shell
+python -u ./predict_glue.py \
+ --task_name SST-2 \
+ --model_type bert \
+ --model_path ./tmp/model_20/infer_model \
+ --batch_size 32 \
+ --max_seq_length 128
+```
+
+The parameters are as follows:
+- `task_name`: the fine-tuning task.
+- `model_type`: the model type; set it to bert when using a BERT model.
+- `model_path`: path prefix of the inference model files, identical to the `output_path` used when exporting the inference model in the previous step.
+- `batch_size`: number of samples per prediction batch.
+- `max_seq_length`: maximum sequence length; longer sequences are truncated.
+
+**NOTICE**: './tmp/model_20/infer_model' in the prediction script refers to a model saved by run_glue.py; set the model path according to where your model is actually saved.
diff --git a/nlp/text_classification/bert/paddlepaddle/static/create_pretraining_data.py b/nlp/text_classification/bert/paddlepaddle/static/create_pretraining_data.py
new file mode 100644
index 000000000..ccf28f855
--- /dev/null
+++ b/nlp/text_classification/bert/paddlepaddle/static/create_pretraining_data.py
@@ -0,0 +1,499 @@
+# coding=utf-8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Create masked LM/next sentence masked_lm examples for BERT.""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import argparse +import logging +import os +import random +from io import open +import h5py +import numpy as np +from tqdm import tqdm + +from paddlenlp.transformers import BertTokenizer +from paddlenlp.transformers.tokenizer_utils import convert_to_unicode + +import random +import collections + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, + masked_lm_labels, is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + +def write_instance_to_example_file(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_file): + """Create example files from `TrainingInstance`s.""" + + total_written = 0 + features = collections.OrderedDict() + + num_instances = len(instances) + features["input_ids"] = np.zeros( + [num_instances, max_seq_length], dtype="int32") + features["input_mask"] = np.zeros( + [num_instances, max_seq_length], dtype="int32") + features["segment_ids"] = np.zeros( + [num_instances, max_seq_length], dtype="int32") + features["masked_lm_positions"] = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + features["masked_lm_ids"] = np.zeros( + [num_instances, max_predictions_per_seq], dtype="int32") + features["next_sentence_labels"] = np.zeros(num_instances, dtype="int32") + + for inst_index, instance in enumerate(tqdm(instances)): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids( + instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features["input_ids"][inst_index] = input_ids + features["input_mask"][inst_index] = input_mask + features["segment_ids"][inst_index] = segment_ids + features["masked_lm_positions"][inst_index] = masked_lm_positions + features["masked_lm_ids"][inst_index] = masked_lm_ids + features["next_sentence_labels"][inst_index] = next_sentence_label + + total_written += 1 + + print("saving data") + f = h5py.File(output_file, 'w') + f.create_dataset( + "input_ids", data=features["input_ids"], dtype='i4', compression='gzip') + f.create_dataset( + "input_mask", + data=features["input_mask"], + dtype='i1', + compression='gzip') + f.create_dataset( + "segment_ids", + data=features["segment_ids"], + dtype='i1', + compression='gzip') + f.create_dataset( + "masked_lm_positions", + data=features["masked_lm_positions"], + dtype='i4', + compression='gzip') + f.create_dataset( + "masked_lm_ids", + data=features["masked_lm_ids"], + dtype='i4', + compression='gzip') + f.create_dataset( + "next_sentence_labels", + 
data=features["next_sentence_labels"], + dtype='i1', + compression='gzip') + f.flush() + f.close() + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + print("creating instance from {}".format(input_file)) + with open(input_file, "r", encoding="UTF-8") as reader: + while True: + line = convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + # vocab_words = list(tokenizer.vocab.keys()) + vocab_words = list(tokenizer.vocab.token_to_idx.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, + short_seq_prob, masked_lm_prob, max_predictions_per_seq, + vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. 
+ a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint( + 0, len(all_documents) - 1) + if random_document_index != document_index: + break + + #If picked random document is the same as the current document + if random_document_index == document_index: + is_random_next = False + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, + vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + cand_indexes.append(i) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + if index in covered_indexes: + continue + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + + masked_lms = sorted(masked_lms, 
key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(): + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--input_file", + default=None, + type=str, + required=True, + help="The input train corpus. can be directory with .txt files or a path to a single file" + ) + parser.add_argument( + "--output_file", + default=None, + type=str, + required=True, + help="The output file where created hdf5 formatted data will be written." + ) + parser.add_argument( + "--vocab_file", + default=None, + type=str, + required=False, + help="The vocabulary the BERT model will train on. " + "Use bert_model argument would ignore this. " + "The bert_model argument is recommended.") + parser.add_argument( + "--do_lower_case", + action='store_true', + default=True, + help="Whether to lower case the input text. True for uncased models, False for cased models. " + "Use bert_model argument would ignore this. The bert_model argument is recommended." + ) + parser.add_argument( + "--bert_model", + default="bert-base-uncased", + type=str, + required=False, + help="Bert pre-trained model selected in the list: bert-base-uncased, " + "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." + "If provided, use the pre-trained model used tokenizer to create data " + "and ignore vocab_file and do_lower_case.") + + ## Other parameters + #int + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. \n" + "Sequences longer than this will be truncated, and sequences shorter \n" + "than this will be padded.") + parser.add_argument( + "--dupe_factor", + default=10, + type=int, + help="Number of times to duplicate the input data (with different masks)." 
+ ) + parser.add_argument( + "--max_predictions_per_seq", + default=20, + type=int, + help="Maximum number of masked LM predictions per sequence.") + + # floats + parser.add_argument( + "--masked_lm_prob", + default=0.15, + type=float, + help="Masked LM probability.") + parser.add_argument( + "--short_seq_prob", + default=0.1, + type=float, + help="Probability to create a sequence shorter than maximum sequence length" + ) + + parser.add_argument( + '--random_seed', + type=int, + default=12345, + help="random seed for initialization") + + args = parser.parse_args() + print(args) + + if args.bert_model: + tokenizer = BertTokenizer.from_pretrained(args.bert_model) + else: + assert args.vocab_file, ( + "vocab_file must be set If bert_model is not provided.") + tokenizer = BertTokenizer( + args.vocab_file, do_lower_case=args.do_lower_case) + + input_files = [] + if os.path.isfile(args.input_file): + input_files.append(args.input_file) + elif os.path.isdir(args.input_file): + input_files = [ + os.path.join(args.input_file, f) + for f in os.listdir(args.input_file) + if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith( + '.txt')) + ] + else: + raise ValueError("{} is not a valid path".format(args.input_file)) + + rng = random.Random(args.random_seed) + instances = create_training_instances( + input_files, tokenizer, args.max_seq_length, args.dupe_factor, + args.short_seq_prob, args.masked_lm_prob, args.max_predictions_per_seq, + rng) + + output_file = args.output_file + + write_instance_to_example_file(instances, tokenizer, args.max_seq_length, + args.max_predictions_per_seq, output_file) + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/static/data/sample_text.txt b/nlp/text_classification/bert/paddlepaddle/static/data/sample_text.txt new file mode 100644 index 000000000..75ec60cdb --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/data/sample_text.txt @@ -0,0 +1,100 @@ +Zulfiqar A. Bhutta trained as a physician in Pakistan in the early stages of his career. +He holds titles across various organizations in diverse geographies. +Professor Bhutta is the Founding Director of the Center of Excellence in Women and Child Health & Institute for Global Child Health & Development, at the Aga Khan University South-Central Asia, East Africa & United Kingdom. +He is currently the Co-Director at the Centre for Global Child Health, at the Hospital for Sick Children and leads many projects as a Senior Scientist at the Research Institute in the Centre for Global Child Health at Sick Kids. +He holds a Professorship at the University of Toronto in the Department of Nutritional Sciences and the Division of Epidemiology, Dalla Lana School of Public Health. +Additionally, he holds concurrent professorship at the Department of Paediatrics, Aga Khan University in Karachi, Pakistan and at the Schools of Public Health of Johns Hopkins University, Tufts University, Boston University, University of Alberta and the London School of Hygiene & Tropical Medicine. +He is a designated Distinguished National Professor of the Government of Pakistan and was the Founding Chair of the National Research Ethics Committee of the Government of Pakistan from 2003-2014. +Dr. Bhutta received his MBBS from Khyber Medical College in Peshawar, Pakistan in 1977 at which time he was names "Best Graduate of the Year" and awarded the University Gold Medal for overall distinction. +His PhD work was completed at Karolinska Institute in Stockholm, Sweden in 1996. 
+He is a Fellow of the Royal College of Physicians (Edinburgh & London), the Royal College of Paediatrics and Child Health (London), American Academy of Paediatrics and the Pakistan Academy of Sciences. +Following the completion of his PhD Dr. Bhutta began working as House Surgeon in Obstetrics & Gynecology at the Khyber Teaching Hospital, Peshawar (April-November 1978). +He began work in paediatrics as a physician in November of 1978 in the Professorial Unit at the Institute of Child Health, Jinnah Postgraduate Medical Centre, Karachi (Pakistan). +Through 1980's he continued his work as a surgeon and paediatrician. +He undertook his first professor position in the Department of Paediatrics, The Aga Khan University Hospital, Karachi (Pakistan), from November 1987 to June 1992. +In 2005, Dr. Bhutta became the Chairman of the Department of Paediatrics & Child Health at the Aga Khan University & Medical Center, a position held until 2008. +Following his term as Chairman he became The Noordin Noormahomed Sheriff Professor & Founding Chair, Division of Women & Child Health, The Aga Khan University, a position he held for four years. +Dr. Bhutta currently holds the titles of co-director of the Centre for Global Child Health at the Hospital for Sick Children in Toronto, and founding director of the Centre of Excellence in Women and Child Health at the Aga Khan University. +In 2020, he was appointed founding director of the Institute for Global child Health & Development at the Aga Khan University and elected Fellow to the Royal Society, United Kingdom. +Outside of his professional responsibilities Dr. Bhutta serves on various local and international boards and committees, including a series of editorial boards. +In his various capacities Dr. Bhutta has produced a large collection of publications working with his teams at Sick Kids, AKU and international partners. +These include book reviews, chapters, 1. +"Haematological disorders" "Neonatal Jaundice" in Neonatal Vade‑Mecum, Fleming PJ, Speidel BD, Dunn PM Eds, Lloyd‑Luke Publishers, UK, 1986. +Revised 2nd Edition 1991. +2. +"Nutritional management of acute and persistent diarrhoea". +A M Molla, Bhutta Z A and  A Molla. +In McNeish A S, Mittal S K and Walker-Smith J A (eds). +Recent trends in diarrhoea and malnutrition, MAMC, Delhi, 1991, pp 37-51. +3. +"Paediatric Prescribing” in "Text book of Paediatrics for developing countries"            Arif MA, Hanif SM, Wasti SMK Eds, 1989, 2nd Edition 1996,  PPA, Karachi. +& Lahore 4. +"Innovations in neonatal care : Impact on neonatal survival in the developing world:. +Bhutta Z A  Zaidi S (Editor) 1992. +TWEL Publisher. +Karachi pp 121-131 5. +"Short course therapy in Pediatrics" Bhutta Z A& Teele D.  In Tice A D, Waldvogel F (Eds), Contemporary issues in Infectious Disease Epidemiology and Management, 1993 Gardiner Caldwell, Cheshire, pp 52 - 60. +6. +"Dietary management of persistent diarrhoea". +Bhutta Z A, Molla A M, Issani Z. +In Reflections on  Diarrhoeal Disease & Nutrition  of Children". +1993 Karachi, pp 97 - 103. +7. +"Prescribing practices amongst general practitioners (GPs) and consultant paediatricians in childhood diarrhoea.”  S.Q. +Nizami, I.A. +Khan, Bhutta Z A. +In "Reflections on Diarrhoeal Disease and Nutrition of Children". +1993 Karachi, pp  88-90. +8. +"The challenge of multidrug-resistant typhoid". +Bhutta Z A. +In Puri R K, Sachdev H P S, Choudhry P, Verma I C (Eds), Current concepts in Paediatrics, 1994. +Jaypee Publishers, New Delhi, pp 403.8. +9. 
+"Perinatal Care in Pakistan: Current status and trends". +In Proceedings of the Workshop in Reproductive Health. +College of Physicians and Surgeons, Pakistan, Karachi, 1995, pp 95-103. +10. +“A study of whole body protein kinetics in malnourished children with persistent diarrhoea” Bhutta Z A, Nizami SQ, Isani Z, Hardy S, Hendricks K, Young V.   Report of the second RCM coordinated Research Programme for application of stable isotope tracer methods to studies of energy metabolism in malnourished populations of developing countries. +NAHRES-30 1996 IAEA Vienna. +11. +"Pneumococcal infections in Pakistan: a country report". +In Adult Immunization in Asia, Fondation Mercel Merieux, Lyon, 1998. pp 79-82. +12. +“Factors affecting protein and aminoacid metabolism in childhood from developing countries". +In Child Nutrition: an international perspective. +Editors Solomons NW, Caballero B, Brown KH. +CRC Press 1998. +13. +"Protein Digestion and Bioavailability". +In Encyclopedia of Human Nutrition. +Editors: Sadler M, Strain JJ, Caballero B. +Academic Press (London), 1998 pp.1646-54. +14. +"Perinatal Care in Pakistan. +Reproductive Health: A manual for family practice and primary health care. +Bhutta Z A, Maqbool S.  College of Physicians and Surgeons, Pakistan, Karachi, 1999, pp 69-78. +15. +“Effective interventions to reduce neonatal mortality and morbidity from perinatal infection. +Bhutta ZA. +In Costello A, Manandhar D (eds). +"Improving Newborn Infant Health in Developing Countries’ 1999. +Imperial College Press, London pp.289-308. +16. +“Ambulatory management of typhoid fever”            “Risk factors and management of micronutrient deficiencies”            “Management of persistent diarrhoea in developing countries”. +In Manual of International Child Health, British Medical Journal, 2000 (in press). +17. +“The role of Cefixime in typhoid fever during childhood” in Cefixime, Adam D, Quintiliani R (Eds), Torre-Lazur-McCann, Tokyo, 2000; pp.107-112. +18. +"Micronutrients and Child Health in the Commonwealth”, Commonwealth Foundation" (UK) (2001). +19. +"Isotopic evaluation of breast milk intake, energy metabolism growth and body composition of exclusively breastfed infants in Pakistan". +Bhutta ZA, Nizami SQ, Weaver LT, Preston T. In Application of Stable Isotopes to evaluate Growth and Body Composition of Exclusively Breastfed infants, IAEA and WHO, NAHRES Report. +2000. +20. +“Typhoid Fever in Childhood: the south Asian experience”. +Ahmad K &Bhutta ZA. +In "Recent Advances in Paediatrics", Gupte S (Ed), 2000, India . +21. +“Neonatal Infections in developing countries” in  Carrera JM, Cabero L, Baraibar R (Eds). +The Perinatal Medicine of the new Millennium. \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/static/dataset.py b/nlp/text_classification/bert/paddlepaddle/static/dataset.py new file mode 100644 index 000000000..a7c00562f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/dataset.py @@ -0,0 +1,151 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import h5py +import numpy as np + +import paddle +from paddle.io import DataLoader, Dataset +from paddlenlp.data import Tuple, Stack + + +def create_pretraining_dataset(input_file, + max_pred_length, + args, + data_holders, + worker_init=None, + places=None): + train_data = PretrainingDataset( + input_file=input_file, max_pred_length=max_pred_length) + train_batch_sampler = paddle.io.BatchSampler( + train_data, batch_size=args.batch_size, shuffle=True) + + def _collate_data(data, stack_fn=Stack()): + num_fields = len(data[0]) + out = [None] * num_fields + # input_ids, segment_ids, input_mask, masked_lm_positions, + # masked_lm_labels, next_sentence_labels, mask_token_num + for i in (0, 1, 2, 5): + out[i] = stack_fn([x[i] for x in data]) + batch_size, seq_length = out[0].shape + size = num_mask = sum(len(x[3]) for x in data) + # Padding for divisibility by 8 for fp16 or int8 usage + if size % 8 != 0: + size += 8 - (size % 8) + # masked_lm_positions + # Organize as a 1D tensor for gather or use gather_nd + out[3] = np.full(size, 0, dtype=np.int32) + # masked_lm_labels + out[4] = np.full([size, 1], -1, dtype=np.int64) + mask_token_num = 0 + for i, x in enumerate(data): + for j, pos in enumerate(x[3]): + out[3][mask_token_num] = i * seq_length + pos + out[4][mask_token_num] = x[4][j] + mask_token_num += 1 + # mask_token_num + out.append(np.asarray([mask_token_num], dtype=np.float32)) + if args.use_amp and args.use_pure_fp16: + # cast input_mask to fp16 + out[2] = out[2].astype(np.float16) + # cast masked_lm_scale to fp16 + out[-1] = out[-1].astype(np.float16) + return out + + train_data_loader = DataLoader( + dataset=train_data, + places=places, + feed_list=data_holders, + batch_sampler=train_batch_sampler, + collate_fn=_collate_data, + num_workers=0, + worker_init_fn=worker_init, + return_list=False) + return train_data_loader, input_file + + +def create_data_holder(args): + input_ids = paddle.static.data( + name="input_ids", shape=[-1, -1], dtype="int64") + segment_ids = paddle.static.data( + name="segment_ids", shape=[-1, -1], dtype="int64") + input_mask = paddle.static.data( + name="input_mask", shape=[-1, 1, 1, -1], dtype="float32") + masked_lm_positions = paddle.static.data( + name="masked_lm_positions", shape=[-1], dtype="int32") + masked_lm_labels = paddle.static.data( + name="masked_lm_labels", shape=[-1, 1], dtype="int64") + next_sentence_labels = paddle.static.data( + name="next_sentence_labels", shape=[-1, 1], dtype="int64") + masked_lm_scale = paddle.static.data( + name="masked_lm_scale", shape=[-1, 1], dtype="float32") + return [ + input_ids, segment_ids, input_mask, masked_lm_positions, + masked_lm_labels, next_sentence_labels, masked_lm_scale + ] + + +class PretrainingDataset(Dataset): + def __init__(self, input_file, max_pred_length): + self.input_file = input_file + self.max_pred_length = max_pred_length + f = h5py.File(input_file, "r") + keys = [ + 'input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', + 'masked_lm_ids', 'next_sentence_labels' + ] + self.inputs = [np.asarray(f[key][:]) for key in keys] + f.close() + + def __len__(self): + 'Denotes the total number of samples' + return len(self.inputs[0]) + + def __getitem__(self, index): + + [ + input_ids, input_mask, segment_ids, masked_lm_positions, + masked_lm_ids, next_sentence_labels + ] = [ + input[index].astype(np.int64) + if indice < 5 else np.asarray(input[index].astype(np.int64)) + for indice, input 
in enumerate(self.inputs) + ] + # TODO: whether to use reversed mask by changing 1s and 0s to be + # consistent with nv bert + input_mask = (1 - np.reshape( + input_mask.astype(np.float32), [1, 1, input_mask.shape[0]])) * -1e4 + + index = self.max_pred_length + # store number of masked tokens in index + # outputs of torch.nonzero diff with that of numpy.nonzero by zip + padded_mask_indices = (masked_lm_positions == 0).nonzero()[0] + if len(padded_mask_indices) != 0: + index = padded_mask_indices[0].item() + mask_token_num = index + else: + index = self.max_pred_length + mask_token_num = self.max_pred_length + # masked_lm_labels = np.full(input_ids.shape, -1, dtype=np.int64) + # masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index] + masked_lm_labels = masked_lm_ids[:index] + masked_lm_positions = masked_lm_positions[:index] + # softmax_with_cross_entropy enforce last dim size equal 1 + masked_lm_labels = np.expand_dims(masked_lm_labels, axis=-1) + next_sentence_labels = np.expand_dims(next_sentence_labels, axis=-1) + + return [ + input_ids, segment_ids, input_mask, masked_lm_positions, + masked_lm_labels, next_sentence_labels + ] diff --git a/nlp/text_classification/bert/paddlepaddle/static/predict_glue.py b/nlp/text_classification/bert/paddlepaddle/static/predict_glue.py new file mode 100644 index 000000000..bea832aa0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/predict_glue.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from functools import partial + +import paddle +from paddle import inference +from paddlenlp.datasets import load_dataset +from paddlenlp.data import Stack, Tuple, Pad + +from run_glue import convert_example, METRIC_CLASSES, MODEL_CLASSES + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to perform predict, selected in the list: " + + ", ".join(METRIC_CLASSES.keys()), ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_path", + default=None, + type=str, + required=True, + help="The path prefix of inference model to be used.", ) + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu", "xpu"], + help="Device selected for inference.", ) + parser.add_argument( + "--batch_size", + default=32, + type=int, + help="Batch size for predict.", ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. 
Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + args = parser.parse_args() + return args + + +class Predictor(object): + def __init__(self, predictor, input_handles, output_handles): + self.predictor = predictor + self.input_handles = input_handles + self.output_handles = output_handles + + @classmethod + def create_predictor(cls, args): + config = paddle.inference.Config(args.model_path + ".pdmodel", + args.model_path + ".pdiparams") + if args.device == "gpu": + # set GPU configs accordingly + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + # set CPU configs accordingly, + # such as enable_mkldnn, set_cpu_math_library_num_threads + config.disable_gpu() + elif args.device == "xpu": + # set XPU configs accordingly + config.enable_xpu(100) + config.switch_use_feed_fetch_ops(False) + predictor = paddle.inference.create_predictor(config) + input_handles = [ + predictor.get_input_handle(name) + for name in predictor.get_input_names() + ] + output_handles = [ + predictor.get_output_handle(name) + for name in predictor.get_output_names() + ] + return cls(predictor, input_handles, output_handles) + + def predict_batch(self, data): + for input_field, input_handle in zip(data, self.input_handles): + input_handle.copy_from_cpu(input_field.numpy() if isinstance( + input_field, paddle.Tensor) else input_field) + self.predictor.run() + output = [ + output_handle.copy_to_cpu() for output_handle in self.output_handles + ] + return output + + def predict(self, dataset, collate_fn, batch_size=1): + batch_sampler = paddle.io.BatchSampler( + dataset, batch_size=batch_size, shuffle=False) + data_loader = paddle.io.DataLoader( + dataset=dataset, + batch_sampler=batch_sampler, + collate_fn=collate_fn, + num_workers=0, + return_list=True) + outputs = [] + for data in data_loader: + output = self.predict_batch(data) + outputs.append(output) + return outputs + + +def main(): + args = parse_args() + + predictor = Predictor.create_predictor(args) + + args.task_name = args.task_name.lower() + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + + test_ds = load_dataset('glue', args.task_name, splits="test") + tokenizer = tokenizer_class.from_pretrained( + os.path.dirname(args.model_path)) + + trans_func = partial( + convert_example, + tokenizer=tokenizer, + label_list=test_ds.label_list, + max_seq_length=args.max_seq_length, + is_test=True) + test_ds = test_ds.map(trans_func) + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"), # segment + ): fn(samples) + predictor.predict( + test_ds, batch_size=args.batch_size, collate_fn=batchify_fn) + + +if __name__ == "__main__": + main() diff --git a/nlp/text_classification/bert/paddlepaddle/static/run_glue.py b/nlp/text_classification/bert/paddlepaddle/static/run_glue.py new file mode 100644 index 000000000..0b6dfc11a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/run_glue.py @@ -0,0 +1,446 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import os +import random +import time +from functools import partial + +import numpy as np +import paddle +import paddle.distributed.fleet as fleet +from paddle.io import DataLoader +from paddlenlp.datasets import load_dataset + +from paddle.metric import Accuracy +from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.data.sampler import SamplerHelper +from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer +from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.metrics import Mcc, PearsonAndSpearman +from paddlenlp.utils.log import logger + +METRIC_CLASSES = { + "cola": Mcc, + "sst-2": Accuracy, + "sts-b": PearsonAndSpearman, + "mnli": Accuracy, + "qnli": Accuracy, + "rte": Accuracy, +} + +MODEL_CLASSES = { + "bert": (BertForSequenceClassification, BertTokenizer), + "ernie": (ErnieForSequenceClassification, ErnieTokenizer), +} + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + + ", ".join(METRIC_CLASSES.keys()), ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum([ + list(classes[-1].pretrained_init_configuration.keys()) + for classes in MODEL_CLASSES.values() + ], [])), ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument( + "--batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", ) + parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + parser.add_argument( + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument( + "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", + default=3, + type=int, + help="Total number of training epochs to perform.", ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps.") + parser.add_argument( + "--logging_steps", + type=int, + default=500, + help="Log every X updates steps.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save checkpoint every X updates steps.") + parser.add_argument( + "--seed", type=int, default=42, help="Random seed for initialization") + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Device for selecting for the training.") + args = parser.parse_args() + return args + + +def create_data_holder(task_name): + """ + Define the input data holder for the glue task. + """ + input_ids = paddle.static.data( + name="input_ids", shape=[-1, -1], dtype="int64") + token_type_ids = paddle.static.data( + name="token_type_ids", shape=[-1, -1], dtype="int64") + if task_name == "sts-b": + label = paddle.static.data(name="label", shape=[-1, 1], dtype="float32") + else: + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + + return [input_ids, token_type_ids, label] + + +def reset_program_state_dict(args, model, state_dict, pretrained_state_dict): + """ + Initialize the parameter from the bert config, and set the parameter by + reseting the state dict." + """ + reset_state_dict = {} + scale = model.initializer_range if hasattr(model, "initializer_range")\ + else getattr(model, args.model_type).config["initializer_range"] + reset_parameter_names = [] + for n, p in state_dict.items(): + if n in pretrained_state_dict: + reset_state_dict[p.name] = np.array(pretrained_state_dict[n]) + reset_parameter_names.append(n) + elif p.name in pretrained_state_dict and "bert" in n: + reset_state_dict[p.name] = np.array(pretrained_state_dict[p.name]) + reset_parameter_names.append(n) + else: + dtype_str = "float32" + if str(p.dtype) == "VarType.FP64": + dtype_str = "float64" + reset_state_dict[p.name] = np.random.normal( + loc=0.0, scale=scale, size=p.shape).astype(dtype_str) + logger.info("the following parameter had reset, please check. {}".format( + reset_parameter_names)) + return reset_state_dict + + +def set_seed(args): + """ + Use the same data seed(for data shuffle) for all procs to guarantee data + consistency after sharding. + """ + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) + + +def evaluate(exe, metric, loss, correct, dev_program, data_loader, + phase="eval"): + """ + The evaluate process, calcluate the eval loss and metric. 
+ """ + metric.reset() + returns = [loss] + if isinstance(correct, list) or isinstance(correct, tuple): + returns.extend(list(correct)) + else: + returns.append(correct) + for batch in data_loader: + exe.run(dev_program, feed=batch, \ + fetch_list=returns) + return_numpys = exe.run(dev_program, feed=batch, \ + fetch_list=returns) + metric_numpy = return_numpys[1] if len(return_numpys[ + 1:]) == 1 else return_numpys[1:] + metric.update(metric_numpy) + res = metric.accumulate() + if isinstance(metric, Mcc): + print("%s loss: %f, mcc: %s" % (phase, return_numpys[0], res[0])) + elif isinstance(metric, PearsonAndSpearman): + print("%s loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s" + % (phase, return_numpys[0], res[0], res[1], res[2])) + else: + print("%s loss: %f, acc: %s, " % (phase, return_numpys[0], res)) + + +def convert_example(example, + tokenizer, + label_list, + max_seq_length=512, + is_test=False): + """ + Convert a glue example into necessary features. + """ + if not is_test: + # `label_list == None` is for regression task + label_dtype = "int64" if label_list else "float32" + # Get the label + label = example['labels'] + label = np.array([label], dtype=label_dtype) + # Convert raw text to feature + if (int(is_test) + len(example)) == 2: + example = tokenizer(example['sentence'], max_seq_len=max_seq_length) + else: + example = tokenizer( + example['sentence1'], + text_pair=example['sentence2'], + max_seq_len=max_seq_length) + + if not is_test: + return example['input_ids'], example['token_type_ids'], label + else: + return example['input_ids'], example['token_type_ids'] + + +def do_train(args): + # Set the paddle execute enviroment + paddle.enable_static() + place = paddle.set_device(args.device) + fleet.init(is_collective=True) + set_seed(args) + + # Create the main_program for the training and dev_program for the validation + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + dev_program = paddle.static.Program() + + # Get the configuration of tokenizer and model + args.task_name = args.task_name.lower() + args.model_type = args.model_type.lower() + metric_class = METRIC_CLASSES[args.task_name] + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + + # Create the tokenizer and dataset + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + train_ds = load_dataset('glue', args.task_name, splits="train") + + trans_func = partial( + convert_example, + tokenizer=tokenizer, + label_list=train_ds.label_list, + max_seq_length=args.max_seq_length) + + train_ds = train_ds.map(trans_func, lazy=True) + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type + Stack(dtype="int64" if train_ds.label_list else "float32") # label + ): fn(samples) + + train_batch_sampler = paddle.io.BatchSampler( + train_ds, batch_size=args.batch_size, shuffle=True) + + feed_list_name = [] + + # Define the input data and create the train/dev data_loader + with paddle.static.program_guard(main_program, startup_program): + [input_ids, token_type_ids, labels] = create_data_holder(args.task_name) + + train_data_loader = DataLoader( + dataset=train_ds, + feed_list=[input_ids, token_type_ids, labels], + batch_sampler=train_batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=False) + + if args.task_name == "mnli": + dev_ds_matched, dev_ds_mismatched = load_dataset( + 'glue', args.task_name, 
splits=["dev_matched", "dev_mismatched"]) + + dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) + dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) + dev_batch_sampler_matched = paddle.io.BatchSampler( + dev_ds_matched, batch_size=args.batch_size, shuffle=False) + dev_data_loader_matched = DataLoader( + dataset=dev_ds_matched, + batch_sampler=dev_batch_sampler_matched, + collate_fn=batchify_fn, + feed_list=[input_ids, token_type_ids, labels], + num_workers=0, + return_list=False) + dev_batch_sampler_mismatched = paddle.io.BatchSampler( + dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) + dev_data_loader_mismatched = DataLoader( + dataset=dev_ds_mismatched, + batch_sampler=dev_batch_sampler_mismatched, + collate_fn=batchify_fn, + num_workers=0, + feed_list=[input_ids, token_type_ids, labels], + return_list=False) + else: + dev_ds = load_dataset('glue', args.task_name, splits='dev') + dev_ds = dev_ds.map(trans_func, lazy=True) + dev_batch_sampler = paddle.io.BatchSampler( + dev_ds, batch_size=args.batch_size, shuffle=False) + dev_data_loader = DataLoader( + dataset=dev_ds, + batch_sampler=dev_batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + feed_list=[input_ids, token_type_ids, labels], + return_list=False) + + # Create the training-forward program, and clone it for the validation + with paddle.static.program_guard(main_program, startup_program): + num_class = 1 if train_ds.label_list is None else len( + train_ds.label_list) + model, pretrained_state_dict = model_class.from_pretrained( + args.model_name_or_path, num_classes=num_class) + loss_fct = paddle.nn.loss.CrossEntropyLoss( + ) if train_ds.label_list else paddle.nn.loss.MSELoss() + logits = model(input_ids, token_type_ids) + loss = loss_fct(logits, labels) + dev_program = main_program.clone(for_test=True) + + # Create the training-backward program, this pass will not be + # executed in the validation + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + with paddle.static.program_guard(main_program, startup_program): + lr_scheduler = LinearDecayWithWarmup( + args.learning_rate, num_training_steps, args.warmup_steps) + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_scheduler, + epsilon=args.adam_epsilon, + parameters=model.parameters(), + weight_decay=args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + # Create the metric pass for the validation + with paddle.static.program_guard(dev_program, startup_program): + metric = metric_class() + correct = metric.compute(logits, labels) + + # Initialize the fine-tuning parameter, we will load the parameters in + # pre-training model. And initialize the parameter which not in pre-training model + # by the normal distribution. 
+ exe = paddle.static.Executor(place) + exe.run(startup_program) + state_dict = model.state_dict() + reset_state_dict = reset_program_state_dict(args, model, state_dict, + pretrained_state_dict) + paddle.static.set_program_state(main_program, reset_state_dict) + + global_step = 0 + tic_train = time.time() + for epoch in range(args.num_train_epochs): + for step, batch in enumerate(train_data_loader): + global_step += 1 + loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) + if global_step % args.logging_steps == 0: + logger.info( + "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" + % (global_step, epoch, step, loss_return[0], + args.logging_steps / (time.time() - tic_train))) + tic_train = time.time() + lr_scheduler.step() + if global_step % args.save_steps == 0: + # Validation pass, record the loss and metric + if args.task_name == "mnli": + evaluate(exe, metric, loss, correct, dev_program, + dev_data_loader_matched, "matched eval") + evaluate(exe, metric, loss, correct, dev_program, + dev_data_loader_mismatched, "mismatched eval") + else: + evaluate(exe, metric, loss, correct, dev_program, + dev_data_loader) + output_dir = os.path.join(args.output_dir, + "model_%d" % global_step) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + paddle.static.save_inference_model( + os.path.join(output_dir, "model"), + [input_ids, token_type_ids], [logits], exe) + tokenizer.save_pretrained(output_dir) + if global_step >= num_training_steps: + return + + +if __name__ == "__main__": + args = parse_args() + assert args.device in [ + "cpu", "gpu", "xpu" + ], "Invalid device! Available device should be cpu, gpu, or xpu." + + do_train(args) diff --git a/nlp/text_classification/bert/paddlepaddle/static/run_glue_with_sparaity.py b/nlp/text_classification/bert/paddlepaddle/static/run_glue_with_sparaity.py new file mode 100644 index 000000000..f57e53631 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/run_glue_with_sparaity.py @@ -0,0 +1,458 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import logging +import os +import random +import time +from functools import partial + +import numpy as np +import paddle +from paddle.io import DataLoader +from paddlenlp.datasets import load_dataset + +from paddle.metric import Accuracy +from paddlenlp.data import Stack, Tuple, Pad +from paddlenlp.data.sampler import SamplerHelper +from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer +from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup +from paddlenlp.metrics import Mcc, PearsonAndSpearman +from paddlenlp.utils.log import logger + +from paddle.fluid.contrib import sparsity + +METRIC_CLASSES = { + "cola": Mcc, + "sst-2": Accuracy, + "sts-b": PearsonAndSpearman, + "mnli": Accuracy, + "qnli": Accuracy, + "rte": Accuracy, +} + +MODEL_CLASSES = { + "bert": (BertForSequenceClassification, BertTokenizer), + "ernie": (ErnieForSequenceClassification, ErnieTokenizer), +} + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--task_name", + default=None, + type=str, + required=True, + help="The name of the task to train selected in the list: " + + ", ".join(METRIC_CLASSES.keys()), ) + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum([ + list(classes[-1].pretrained_init_configuration.keys()) + for classes in MODEL_CLASSES.values() + ], [])), ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--max_seq_length", + default=128, + type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.", ) + parser.add_argument( + "--batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", ) + parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + parser.add_argument( + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument( + "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--num_train_epochs", + default=3, + type=int, + help="Total number of training epochs to perform.", ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps.") + parser.add_argument( + "--logging_steps", + type=int, + default=500, + help="Log every X updates steps.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save checkpoint every X updates steps.") + parser.add_argument( + "--seed", type=int, default=42, help="Random seed for initialization") + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Device for selecting for the training.") + args = parser.parse_args() + return args + + +def create_data_holder(task_name): + """ + Define the input data holder for the glue task. + """ + input_ids = paddle.static.data( + name="input_ids", shape=[-1, -1], dtype="int64") + token_type_ids = paddle.static.data( + name="token_type_ids", shape=[-1, -1], dtype="int64") + if task_name == "sts-b": + label = paddle.static.data(name="label", shape=[-1, 1], dtype="float32") + else: + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + + return [input_ids, token_type_ids, label] + + +def reset_program_state_dict(args, model, state_dict, pretrained_state_dict): + """ + Initialize the parameter from the bert config, and set the parameter by + reseting the state dict." + """ + reset_state_dict = {} + scale = model.initializer_range if hasattr(model, "initializer_range")\ + else getattr(model, args.model_type).config["initializer_range"] + reset_parameter_names = [] + for n, p in state_dict.items(): + if n in pretrained_state_dict: + reset_state_dict[p.name] = np.array(pretrained_state_dict[n]) + reset_parameter_names.append(n) + elif p.name in pretrained_state_dict and "bert" in n: + reset_state_dict[p.name] = np.array(pretrained_state_dict[p.name]) + reset_parameter_names.append(n) + else: + dtype_str = "float32" + if str(p.dtype) == "VarType.FP64": + dtype_str = "float64" + reset_state_dict[p.name] = np.random.normal( + loc=0.0, scale=scale, size=p.shape).astype(dtype_str) + logger.info("the following parameter had reset, please check. {}".format( + reset_parameter_names)) + return reset_state_dict + + +def set_seed(args): + """ + Use the same data seed(for data shuffle) for all procs to guarantee data + consistency after sharding. + """ + random.seed(args.seed) + np.random.seed(args.seed) + # Maybe different op seeds(for dropout) for different procs is better. By: + # `paddle.seed(args.seed + paddle.distributed.get_rank())` + paddle.seed(args.seed) + + +def evaluate(exe, metric, loss, correct, dev_program, data_loader, + phase="eval"): + """ + The evaluate process, calcluate the eval loss and metric. 
+ """ + metric.reset() + returns = [loss] + if isinstance(correct, list) or isinstance(correct, tuple): + returns.extend(list(correct)) + else: + returns.append(correct) + for batch in data_loader: + exe.run(dev_program, feed=batch, \ + fetch_list=returns) + return_numpys = exe.run(dev_program, feed=batch, \ + fetch_list=returns) + metric_numpy = return_numpys[1] if len(return_numpys[ + 1:]) == 1 else return_numpys[1:] + metric.update(metric_numpy) + res = metric.accumulate() + if isinstance(metric, Mcc): + print("%s loss: %f, mcc: %s" % (phase, return_numpys[0], res[0])) + elif isinstance(metric, PearsonAndSpearman): + print("%s loss: %f, pearson: %s, spearman: %s, pearson and spearman: %s" + % (phase, return_numpys[0], res[0], res[1], res[2])) + else: + print("%s loss: %f, acc: %s, " % (phase, return_numpys[0], res)) + + +def convert_example(example, + tokenizer, + label_list, + max_seq_length=512, + is_test=False): + """ + Convert a glue example into necessary features. + """ + if not is_test: + # `label_list == None` is for regression task + label_dtype = "int64" if label_list else "float32" + # Get the label + label = example['labels'] + label = np.array([label], dtype=label_dtype) + # Convert raw text to feature + if (int(is_test) + len(example)) == 2: + example = tokenizer(example['sentence'], max_seq_len=max_seq_length) + else: + example = tokenizer( + example['sentence1'], + text_pair=example['sentence2'], + max_seq_len=max_seq_length) + + if not is_test: + return example['input_ids'], example['token_type_ids'], label + else: + return example['input_ids'], example['token_type_ids'] + + +def do_train(args): + # Set the paddle execute enviroment + paddle.enable_static() + place = paddle.set_device(args.device) + set_seed(args) + + # Create the main_program for the training and dev_program for the validation + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + dev_program = paddle.static.Program() + + # Get the configuration of tokenizer and model + args.task_name = args.task_name.lower() + args.model_type = args.model_type.lower() + metric_class = METRIC_CLASSES[args.task_name] + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + + # Create the tokenizer and dataset + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + train_ds = load_dataset('glue', args.task_name, splits="train") + + trans_func = partial( + convert_example, + tokenizer=tokenizer, + label_list=train_ds.label_list, + max_seq_length=args.max_seq_length) + + train_ds = train_ds.map(trans_func, lazy=True) + + batchify_fn = lambda samples, fn=Tuple( + Pad(axis=0, pad_val=tokenizer.pad_token_id), # input + Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # token_type + Stack(dtype="int64" if train_ds.label_list else "float32") # label + ): fn(samples) + + train_batch_sampler = paddle.io.BatchSampler( + train_ds, batch_size=args.batch_size, shuffle=True) + + feed_list_name = [] + + # Define the input data and create the train/dev data_loader + with paddle.static.program_guard(main_program, startup_program): + [input_ids, token_type_ids, labels] = create_data_holder(args.task_name) + + train_data_loader = DataLoader( + dataset=train_ds, + feed_list=[input_ids, token_type_ids, labels], + batch_sampler=train_batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + return_list=False) + + if args.task_name == "mnli": + dev_ds_matched, dev_ds_mismatched = load_dataset( + 'glue', args.task_name, splits=["dev_matched", 
"dev_mismatched"]) + + dev_ds_matched = dev_ds_matched.map(trans_func, lazy=True) + dev_ds_mismatched = dev_ds_mismatched.map(trans_func, lazy=True) + dev_batch_sampler_matched = paddle.io.BatchSampler( + dev_ds_matched, batch_size=args.batch_size, shuffle=False) + dev_data_loader_matched = DataLoader( + dataset=dev_ds_matched, + batch_sampler=dev_batch_sampler_matched, + collate_fn=batchify_fn, + feed_list=[input_ids, token_type_ids, labels], + num_workers=0, + return_list=False) + dev_batch_sampler_mismatched = paddle.io.BatchSampler( + dev_ds_mismatched, batch_size=args.batch_size, shuffle=False) + dev_data_loader_mismatched = DataLoader( + dataset=dev_ds_mismatched, + batch_sampler=dev_batch_sampler_mismatched, + collate_fn=batchify_fn, + num_workers=0, + feed_list=[input_ids, token_type_ids, labels], + return_list=False) + else: + dev_ds = load_dataset('glue', args.task_name, splits='dev') + dev_ds = dev_ds.map(trans_func, lazy=True) + dev_batch_sampler = paddle.io.BatchSampler( + dev_ds, batch_size=args.batch_size, shuffle=False) + dev_data_loader = DataLoader( + dataset=dev_ds, + batch_sampler=dev_batch_sampler, + collate_fn=batchify_fn, + num_workers=0, + feed_list=[input_ids, token_type_ids, labels], + return_list=False) + + # Create the training-forward program, and clone it for the validation + with paddle.static.program_guard(main_program, startup_program): + num_class = 1 if train_ds.label_list is None else len( + train_ds.label_list) + model, pretrained_state_dict = model_class.from_pretrained( + args.model_name_or_path, num_classes=num_class) + loss_fct = paddle.nn.loss.CrossEntropyLoss( + ) if train_ds.label_list else paddle.nn.loss.MSELoss() + logits = model(input_ids, token_type_ids) + loss = loss_fct(logits, labels) + dev_program = main_program.clone(for_test=True) + + # Create the training-backward program, this pass will not be + # executed in the validation + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + with paddle.static.program_guard(main_program, startup_program): + lr_scheduler = LinearDecayWithWarmup( + args.learning_rate, num_training_steps, args.warmup_steps) + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. + decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_scheduler, + epsilon=args.adam_epsilon, + parameters=model.parameters(), + weight_decay=args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params) + + # Keep Pooler and task-specific layer dense. + # Please note, excluded_layers must be set before calling `optimizer.minimize()`. + sparsity.set_excluded_layers(main_program, [ + model.bert.pooler.dense.full_name(), model.classifier.full_name() + ]) + # Calling sparsity.decorate() to wrap minimize() in optimizer, which + # will insert necessary masking operations for ASP workflow. + optimizer = sparsity.decorate(optimizer) + optimizer.minimize(loss) + + # Create the metric pass for the validation + with paddle.static.program_guard(dev_program, startup_program): + metric = metric_class() + correct = metric.compute(logits, labels) + + # Initialize the fine-tuning parameter, we will load the parameters in + # pre-training model. And initialize the parameter which not in pre-training model + # by the normal distribution. 
+ exe = paddle.static.Executor(place) + exe.run(startup_program) + state_dict = model.state_dict() + reset_state_dict = reset_program_state_dict(args, model, state_dict, + pretrained_state_dict) + paddle.static.set_program_state(main_program, reset_state_dict) + + # Pruning model to be 2:4 sparse pattern + # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` + sparsity.prune_model(place, main_program) + + global_step = 0 + tic_train = time.time() + for epoch in range(args.num_train_epochs): + for step, batch in enumerate(train_data_loader): + global_step += 1 + loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) + if global_step % args.logging_steps == 0: + logger.info( + "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" + % (global_step, epoch, step, loss_return[0], + args.logging_steps / (time.time() - tic_train))) + tic_train = time.time() + lr_scheduler.step() + if global_step % args.save_steps == 0: + # Validation pass, record the loss and metric + if args.task_name == "mnli": + evaluate(exe, metric, loss, correct, dev_program, + dev_data_loader_matched, "matched eval") + evaluate(exe, metric, loss, correct, dev_program, + dev_data_loader_mismatched, "mismatched eval") + else: + evaluate(exe, metric, loss, correct, dev_program, + dev_data_loader) + output_dir = os.path.join(args.output_dir, + "model_%d" % global_step) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + paddle.static.save_inference_model( + os.path.join(output_dir, "model"), + [input_ids, token_type_ids], [logits], exe) + tokenizer.save_pretrained(output_dir) + if global_step >= num_training_steps: + return + + +if __name__ == "__main__": + args = parse_args() + assert args.device in [ + "cpu", "gpu", "xpu" + ], "Invalid device! Available device should be cpu, gpu, or xpu." + + do_train(args) diff --git a/nlp/text_classification/bert/paddlepaddle/static/run_pretrain.py b/nlp/text_classification/bert/paddlepaddle/static/run_pretrain.py new file mode 100644 index 000000000..fa04034b6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static/run_pretrain.py @@ -0,0 +1,439 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
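
Before moving on to the pretraining script below, it helps to condense the ASP (2:4 structured sparsity) flow that run_glue.py wires into its static training program: exclude the layers that must stay dense, decorate the optimizer before `minimize()`, and prune the initialized weights after running the startup program. The sketch below shows that ordering on a toy static-mode network; it assumes a Paddle release that still exposes `paddle.fluid.contrib.sparsity` (as the script itself does), and the layer names and sizes are illustrative, not taken from the BERT model above.

```python
# Minimal ASP sketch (assumed API: paddle.fluid.contrib.sparsity, mirroring run_glue.py).
import paddle
from paddle.fluid.contrib import sparsity

paddle.enable_static()
main_program = paddle.static.default_main_program()
startup_program = paddle.static.default_startup_program()

with paddle.static.program_guard(main_program, startup_program):
    x = paddle.static.data(name="x", shape=[-1, 128], dtype="float32")
    label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64")
    hidden = paddle.static.nn.fc(x, size=128)  # prunable dense layer
    logits = paddle.static.nn.fc(
        hidden, size=2, weight_attr=paddle.ParamAttr(name="cls_head_w"))
    loss = paddle.nn.functional.cross_entropy(logits, label)

    optimizer = paddle.optimizer.AdamW(learning_rate=5e-5)
    # 1) Keep the task head dense (run_glue.py excludes the pooler and classifier);
    #    this must be called before optimizer.minimize().
    sparsity.set_excluded_layers(main_program, ["cls_head_w"])
    # 2) Wrap the optimizer so ASP inserts its masking ops into the backward pass.
    optimizer = sparsity.decorate(optimizer)
    optimizer.minimize(loss)

place = paddle.set_device("cpu")
exe = paddle.static.Executor(place)
exe.run(startup_program)
# 3) Prune the initialized weights into the 2:4 pattern; as noted in run_glue.py,
#    this must come after exe.run(startup_program).
sparsity.prune_model(place, main_program)
```

The same three-step ordering (exclude, decorate, prune) is what the GLUE script applies to the full BERT program; only the excluded layer names differ.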
+ +import argparse +import collections +import itertools +import os +import random +import time +import h5py +from functools import partial +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import distutils.util + +import paddle +import paddle.distributed.fleet as fleet +from paddle.io import DataLoader, Dataset + +from paddlenlp.utils import profiler +from paddlenlp.utils.tools import TimeCostAverage +from paddlenlp.transformers import BertForPretraining, BertModel, BertPretrainingCriterion +from paddlenlp.transformers import BertTokenizer +from paddlenlp.transformers import LinearDecayWithWarmup +from dataset import create_data_holder, create_pretraining_dataset + +MODEL_CLASSES = {"bert": (BertForPretraining, BertTokenizer)} + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + + ", ".join(MODEL_CLASSES.keys()), ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + + ", ".join( + sum([ + list(classes[-1].pretrained_init_configuration.keys()) + for classes in MODEL_CLASSES.values() + ], [])), ) + parser.add_argument( + "--input_dir", + default=None, + type=str, + required=True, + help="The input directory where the data will be read from.", ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--max_predictions_per_seq", + default=80, + type=int, + help="The maximum total of masked tokens in input sequence") + + parser.add_argument( + "--batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", ) + parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.") + parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + parser.add_argument( + "--adam_epsilon", + default=1e-8, + type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument( + "--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument( + "--warmup_steps", + default=0, + type=int, + help="Linear warmup over warmup_steps.") + parser.add_argument( + "--logging_steps", + type=int, + default=500, + help="Log every X updates steps.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save checkpoint every X updates steps.") + parser.add_argument( + "--seed", type=int, default=42, help="Random seed for initialization") + parser.add_argument( + "--use_amp", + type=distutils.util.strtobool, + default=False, + help="Enable mixed precision training.") + parser.add_argument( + "--enable_addto", + type=distutils.util.strtobool, + default=False, + help="Whether to enable the addto strategy for gradient accumulation or not. This is only used for AMP training." 
+ ) + parser.add_argument( + "--scale_loss", + type=float, + default=2**15, + help="The value of scale_loss for fp16.") + parser.add_argument( + "--use_pure_fp16", + type=distutils.util.strtobool, + default=False, + help="Whether to use pure fp16 training.") + parser.add_argument( + "--device", + type=str, + default="gpu", + help="Device for selecting for the training.") + parser.add_argument( + "--gradient_merge_steps", + type=int, + default=1, + help="Number of merge steps before gradient update." + "global_batch_size = gradient_merge_steps * batch_size.") + + # For benchmark. + parser.add_argument( + '--profiler_options', + type=str, + default=None, + help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".' + ) + args = parser.parse_args() + return args + + +def select_dataset_file_for_each_worker(files, f_start_id, worker_num, + worker_index): + """ + Spliting the train file according to the worker index. + """ + num_files = len(files) + if worker_num > num_files: + remainder = worker_num % num_files + data_file = files[( + f_start_id * worker_num + worker_index + remainder * f_start_id) % + num_files] + else: + data_file = files[(f_start_id * worker_num + worker_index) % num_files] + return data_file + + +def reset_program_state_dict(model, state_dict): + """ + Initialize the parameter from the bert config, and set the parameter by + reseting the state dict." + """ + scale = model.initializer_range if hasattr(model, "initializer_range")\ + else model.bert.config["initializer_range"] + + new_state_dict = dict() + for n, p in state_dict.items(): + if "layer_norm" not in p.name: + dtype_str = "float32" + if str(p.dtype) == "VarType.FP64": + dtype_str = "float64" + new_state_dict[p.name] = np.random.normal( + loc=0.0, scale=scale, size=p.shape).astype(dtype_str) + return new_state_dict + + +def create_strategy(args): + """ + Create build strategy and exec strategy. + """ + build_strategy = paddle.static.BuildStrategy() + exec_strategy = paddle.static.ExecutionStrategy() + + build_strategy.enable_addto = args.enable_addto + + exec_strategy.num_threads = 1 + exec_strategy.num_iteration_per_drop_scope = 10000 + return build_strategy, exec_strategy + + +def dist_optimizer(args, optimizer): + """ + Create a distributed optimizer based on a normal optimizer + """ + build_strategy, exec_strategy = create_strategy(args) + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.execution_strategy = exec_strategy + dist_strategy.build_strategy = build_strategy + + dist_strategy.fuse_grad_size_in_MB = 16 + if args.use_amp: + dist_strategy.amp = True + + custom_black_list = ['lookup_table', + 'lookup_table_v2'] if args.use_pure_fp16 else None + dist_strategy.amp_configs = { + 'custom_white_list': ['softmax', 'layer_norm', 'gelu'], + 'init_loss_scaling': args.scale_loss, + 'custom_black_list': custom_black_list, + 'use_pure_fp16': args.use_pure_fp16 + } + if args.gradient_merge_steps > 1: + dist_strategy.gradient_merge = True + dist_strategy.gradient_merge_configs = { + 'k_steps': args.gradient_merge_steps + } + + optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) + return optimizer + + +def set_seed(seed): + """ + Use the same data seed(for data shuffle) for all procs to guarantee data + consistency after sharding. 
+ """ + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + +class WorkerInitObj(object): + "Construct the object with different seed, and the Dataloader will generate the data " + "with different seed in each worker." + + def __init__(self, seed): + self.seed = seed + + def __call__(self, id): + np.random.seed(seed=self.seed + id) + random.seed(self.seed + id) + + +def do_train(args): + # Initialize the paddle and paddle fleet execute enviroment + paddle.enable_static() + place = paddle.set_device(args.device) + fleet.init(is_collective=True) + + worker_num = fleet.worker_num() + worker_index = fleet.worker_index() + + # Create the random seed for the worker + set_seed(args.seed) + worker_init = WorkerInitObj(args.seed + worker_index) + + # Define the input data in the static mode + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + + data_holders = create_data_holder(args) + + [ + input_ids, segment_ids, input_mask, masked_lm_positions, + masked_lm_labels, next_sentence_labels, masked_lm_scale + ] = data_holders + + # Define the model structure in static mode + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) + config = model_class.pretrained_init_configuration[args.model_name_or_path] + if config["vocab_size"] % 8 != 0: + config["vocab_size"] += 8 - (config["vocab_size"] % 8) + model = BertForPretraining(BertModel(**config)) + criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) + prediction_scores, seq_relationship_score = model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + masked_positions=masked_lm_positions) + loss = criterion(prediction_scores, seq_relationship_score, + masked_lm_labels, next_sentence_labels, masked_lm_scale) + + # Define the dynamic learing_reate scheduler and optimizer + num_training_steps = args.max_steps if args.max_steps > 0 else len( + train_data_loader) * args.num_train_epochs + + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, + args.warmup_steps) + + # Generate parameter names needed to perform weight decay. + # All bias and LayerNorm parameters are excluded. 
+ decay_params = [ + p.name for n, p in model.named_parameters() + if not any(nd in n for nd in ["bias", "norm"]) + ] + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_scheduler, + epsilon=args.adam_epsilon, + parameters=model.parameters(), + weight_decay=args.weight_decay, + apply_decay_param_fun=lambda x: x in decay_params, + multi_precision=args.use_pure_fp16) + + # Use the fleet api to compile the distributed optimizer + optimizer = dist_optimizer(args, optimizer) + optimizer.minimize(loss) + + # Define the Executor for running the static model + exe = paddle.static.Executor(place) + exe.run(startup_program) + state_dict = model.state_dict() + + # Use the state dict to update the parameter + reset_state_dict = reset_program_state_dict(model, state_dict) + paddle.static.set_program_state(main_program, reset_state_dict) + if args.use_amp: + optimizer.amp_init(place) + + pool = ThreadPoolExecutor(1) + global_step = 0 + tic_train = time.time() + epoch = 0 + while True: + files = [ + os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) + if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in + f + ] + files.sort() + num_files = len(files) + random.Random(args.seed + epoch).shuffle(files) + f_start_id = 0 + + # Select one file for each worker and create the DataLoader for the file + data_file = select_dataset_file_for_each_worker( + files, f_start_id, worker_num, worker_index) + train_data_loader, _ = create_pretraining_dataset( + data_file, args.max_predictions_per_seq, args, data_holders, + worker_init, paddle.static.cuda_places()) + + for f_id in range(f_start_id + 1, len(files)): + data_file = select_dataset_file_for_each_worker( + files, f_id, worker_num, worker_index) + dataset_future = pool.submit(create_pretraining_dataset, data_file, + args.max_predictions_per_seq, args, + data_holders, worker_init, + paddle.static.cuda_places()) + + train_cost_avg = TimeCostAverage() + reader_cost_avg = TimeCostAverage() + total_samples = 0 + batch_start = time.time() + for step, batch in enumerate(train_data_loader): + train_reader_cost = time.time() - batch_start + reader_cost_avg.record(train_reader_cost) + global_step += 1 + train_start = time.time() + loss_return = exe.run(main_program, + feed=batch, + fetch_list=[loss]) + total_samples += args.batch_size + # In the new 2.0 api, must call this function to change the learning_rate + lr_scheduler.step() + train_run_cost = time.time() - batch_start + train_cost_avg.record(train_run_cost) + + # Profile for model benchmark + if args.profiler_options is not None: + profiler.add_profiler_step(args.profiler_options) + + if global_step % args.logging_steps == 0: + print( + "tobal step: %d, epoch: %d, batch: %d, loss: %f, " + "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" + % (global_step, epoch, step, loss_return[0], + reader_cost_avg.get_average(), + train_cost_avg.get_average(), total_samples / + args.logging_steps, args.batch_size / ( + reader_cost_avg.get_average() + + train_cost_avg.get_average()))) + total_samples = 0 + train_cost_avg.reset() + reader_cost_avg.reset() + if global_step % args.save_steps == 0: + if worker_index == 0: + output_dir = os.path.join(args.output_dir, + "model_%d" % global_step) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + model.save_model_config(output_dir) + paddle.static.save(main_program, + os.path.join(output_dir, + "model_state")) + tokenizer.save_pretrained(output_dir) + if global_step >= args.max_steps: + 
reader_start = time.time() + del train_data_loader + return + batch_start = time.time() + del train_data_loader + train_data_loader, data_file = dataset_future.result(timeout=None) + epoch += 1 + + +if __name__ == "__main__": + args = parse_args() + print(args) + do_train(args) diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/README.md b/nlp/text_classification/bert/paddlepaddle/static_ipu/README.md new file mode 100644 index 000000000..f18e209a0 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/README.md @@ -0,0 +1,223 @@ +# Paddle-BERT with Graphcore IPUs + +## Overview + +This project enabled BERT-Base pre-training and SQuAD fine-tuning task using [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) on Graphcore [IPU-POD16](https://www.graphcore.ai/products/mk2/ipu-pod16). + +## File Structure + +| File | Description | +| ----------------- | ------------------------------------------------------------------ | +| `README.md` | How to run the model. | +| `run_pretrain.py` | The algorithm script to run pretraining tasks (phase1 and phase2). | +| `run_squad.py` | The algorithm script to run SQuAD finetune and validation task. | +| `modeling.py` | The algorithm script to build the Bert-Base model. | +| `dataset_ipu.py` | The algorithm script to load input data in pretraining. | +| `custom_ops/` | The folder contains custom ops that will be used. | +| `scripts/` | The folder contains scripts for model running. | + +## Dataset + +- Pretraining dataset + + Wikipedia dataset is used to do pretraining. Please refer to the Wikipedia dataset generator provided by [Nvidia](https://github.com/NVIDIA/DeepLearningExamples.git) to generate pretraining dataset. + + The sequence length used in pretraining phase1 and phase2 are: 128 and 384. Following steps are provided for dataset generation. + + ```bash + # Here we use a specific commmit, the latest commit should also be fine. + git clone https://github.com/NVIDIA/DeepLearningExamples.git + git checkout 88eb3cff2f03dad85035621d041e23a14345999e + + cd DeepLearningExamples/PyTorch/LanguageModeling/BERT + + # Modified the parameters `--max_seq_length 512` to `--max_seq_length 384` at line 50 and + # `--max_predictions_per_seq 80` to `--max_predictions_per_seq 56` at line 51. + vim data/create_datasets_from_start.sh + + # Build docker image + bash scripts/docker/build.sh + + # Use NV's docker to download and generate hdf5 file. This may requires GPU available. + # You can Remove `--gpus $NV_VISIBLE_DEVICES` to avoid GPU requirements. + bash scripts/docker/launch.sh + + # generate dataset with wiki_only + bash data/create_datasets_from_start.sh wiki_only + ``` + +- SQuAD v1.1 dataset + + SQuAD v1.1 dataset will be downloaded automatically. You don't have to download manually. + + +## Quick Start Guide + +### Prepare Project Environment + +- Create docker image + +```bash +# clone paddle repo +git clone https://github.com/paddlepaddle/Paddle.git -b release/2.3 +cd Paddle + +# build docker image +docker build -t paddlepaddle/paddle:latest-dev-ipu -f tools/dockerfile/Dockerfile.ipu . 
+``` + +- Create docker container + +```bash +# clone paddlenlp repo +git clone https://github.com/paddlepaddle/paddlenlp.git +cd paddlenlp/examples/language_model/bert/static_ipu + +# create docker container +# the ipuof configuration file need to be pre-generated and mounted to docker container +# the environment variable IPUOF_CONFIG_PATH should point to the ipuof configuration file +# more information on ipuof configuration is available at https://docs.graphcore.ai/projects/vipu-admin/en/latest/cli_reference.html?highlight=ipuof#ipuof-configuration-file +docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK \ +--device=/dev/infiniband/ --ipc=host \ +--name paddle-bert-base \ +-v ${IPUOF_CONFIG_PATH}:/ipu.conf \ +-e IPUOF_CONFIG_PATH=/ipu.conf \ +-v ${PWD}:/workdir \ +-w /home -it paddlepaddle/paddle:latest-dev-ipu bash +``` + +All of later processes are required to be executed in the container. + +- Compile and installation + +```bash +# clone paddle repo +git clone https://github.com/paddlepaddle/Paddle.git -b release/2.3 +cd Paddle + +mkdir build && cd build + +# run cmake +cmake .. -DWITH_IPU=ON -DWITH_PYTHON=ON -DPY_VERSION=3.7 -DWITH_MKL=ON \ + -DPOPLAR_DIR=/opt/poplar -DPOPART_DIR=/opt/popart -DCMAKE_BUILD_TYPE=Release + +# compile +make paddle_python -j$(nproc) + +# install paddle package +pip install -U python/dist/paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl + +# go to workdir +cd /workdir +``` + +### Execution + +- Run pretraining phase1 (sequence_length = 128) + +```bash +# pod16 +# takes about 11.3 hours +bash scripts/pod16/run_pretrain.sh + +# pod4 +# takes about 11.3 * 4 hours +bash scripts/pod4/run_pretrain.sh +``` + +- Run pretraining phase2 (sequence_length = 384) + +```bash +# pod16 +# takes about 3 hours +bash scripts/pod16/run_pretrain_phase2.sh + +# pod4 +# takes about 3 * 4 hours +bash scripts/pod4/run_pretrain_phase2.sh +``` + +- Run SQuAD finetune task + +```bash +# pod16 +bash scripts/pod16/run_squad.sh + +# pod4 +bash scripts/pod4/run_squad.sh +``` + +- Run SQuAD validation + +```bash +# pod16 +bash scripts/pod16/run_squad_infer.sh + +# pod4 +bash scripts/pod4/run_squad_infer.sh +``` + +#### Parameters + +- `task` The type of the NLP model. +- `input_files` The directory of the input data. +- `output_dir` The directory of the trained models. +- `is_training` Training or inference. +- `seq_len` The sequence length. +- `vocab_size` Size of the vocabulary. +- `max_predictions_per_seq` The max number of the masked token each sentence. +- `max_position_embeddings` The length of the input mask. +- `num_hidden_layers` The number of encoder layers. +- `hidden_size` The size of the hidden state of the transformer layers size. +- `ignore_index` The ignore index for the masked position. +- `hidden_dropout_prob` The dropout probability for fully connected layer in embedding and encoder +- `attention_probs_dropout_prob` The dropout probability for attention layer in encoder. +- `learning_rate` The learning rate for training. +- `weight_decay` The weight decay. +- `beta1` The Adam/Lamb beta1 value +- `beta2` The Adam/Lamb beta2 value +- `adam_epsilon` Epsilon for Adam optimizer. +- `max_steps` The max training steps. +- `warmup_steps` The warmup steps used to update learning rate with lr_schedule. +- `scale_loss` The loss scaling. 
+- `accl1_type` set accl1 type to FLOAT or FLOAT16 +- `accl2_type` set accl2 type to FLOAT or FLOAT16 +- `weight_decay_mode` decay or l2 regularization +- `optimizer_state_offchip` The store location of the optimizer tensors +- `logging_steps` The gap steps of logging. +- `save_steps` Save the paddle model every n steps. +- `epochs` the iteration of the whole dataset. +- `batch_size` total batch size (= batches_per_step \* num_replica \* grad_acc_factor \* micro_batch_size). +- `micro_batch_size` The batch size of the IPU graph. +- `batches_per_step` The number of batches per step with pipelining. +- `seed` The random seed. +- `num_ipus` The number of IPUs. +- `ipu_enable_fp16` Enable FP16 or not. +- `num_replica` The number of the graph replication. +- `enable_grad_acc` Enable gradiant accumulation or not. +- `grad_acc_factor` Update the weights every n batches. +- `available_mem_proportion` The available proportion of memory used by conv or matmul. +- `shuffle` Shuffle Dataset. +- `wandb` Enable logging to Weights and Biases. +- `enable_engine_caching` Enable engine caching or not. +- `enable_load_params` Load paddle params or not. +- `tf_checkpoint` Path to Tensorflow Checkpoint to initialise the model. + +## Result + +For a POD16 platform: + +| Task | Metric | Result | +| ------ | -------- | ------- | +| Phase1 | MLM Loss | 1.6064 | +| | NSP Loss | 0.0272 | +| | MLM Acc | 0.6689 | +| | NSP Acc | 0.9897 | +| | tput | 11700 | +| Phase2 | MLM Loss | 1.5029 | +| | NSP Loss | 0.02444 | +| | MLM Acc | 0.68555 | +| | NSP Acc | 0.99121 | +| | tput | 3470 | +| SQuAD | EM | 79.9053 | +| | F1 | 87.6396 | diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_checkpointoutput.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_checkpointoutput.cc new file mode 100644 index 000000000..edc7eec8f --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_checkpointoutput.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/extension.h" + +namespace { +std::vector> InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} + +PD_BUILD_OP(checkpointoutput) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(checkpointoutput) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_detach.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_detach.cc new file mode 100644 index 000000000..2796fd07d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_detach.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/extension.h" + +namespace { +std::vector> +InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} + +PD_BUILD_OP(detach) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(detach) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_identity.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_identity.cc new file mode 100644 index 000000000..1997d0e89 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_identity.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/extension.h" + +namespace { +std::vector> InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} + +PD_BUILD_OP(identity) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(identity) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_nll_loss.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_nll_loss.cc new file mode 100644 index 000000000..88112a26b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_nll_loss.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/extension.h" + +namespace { +std::vector> +InferShape(std::vector x_shape, std::vector y_shape, + const int &reduction, const std::string &ignoreIndex, + const bool &inputIsLogProbability) { + // 0: sum, 1: mean, 2: none + if (reduction == 2) { + return {y_shape}; + } else { + return {{1}}; + } +} + +std::vector InferDtype(paddle::DataType x_dtype, + paddle::DataType y_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x, + const paddle::Tensor &y) { + return {x}; +} + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} + +PD_BUILD_OP(custom_nll_loss) + .Inputs({"X", "Y"}) + .Outputs({"Out"}) + .Attrs({"reduction: int", "ignoreIndex: std::string", + "inputIsLogProbability: bool"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(custom_nll_loss) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_shape_infer.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_shape_infer.cc new file mode 100644 index 000000000..74e144d8d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/custom_shape_infer.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +auto splitShapeInferenceFun = [](popart::ShapeInferenceContext &ctx) { + auto numOutputs = ctx.getNumOutputs(); + auto type = ctx.inType(0); + auto shape = ctx.inShape(0); + auto axis = ctx.getAttribute("axis"); + auto split = ctx.getAttribute>("split"); + + for (int i = 0; i < numOutputs; i++) { + shape[axis] = split.at(i); + ctx.outInfo(i) = {type, shape}; + } +}; + +#if POPART_VERSION_MAJOR == 2 +#if POPART_VERSION_MINOR == 3 +// for version 2.3, need to register a shape inference function for Split op +static popart::RegisterShapeInferenceFunction + splitRegister11(popart::Onnx::Operators::Split_11, splitShapeInferenceFun); +#endif +#endif \ No newline at end of file diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/disable_attn_dropout_bwd_pattern.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/disable_attn_dropout_bwd_pattern.cc new file mode 100644 index 000000000..803ae20c6 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/disable_attn_dropout_bwd_pattern.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.cc" + +// Tests have found that disabling dropout in the backwards pass of the attention, just before the softmax, +// can improve SQuAD fine-tuning. This pattern finds that op replaces it with an identity op. +class DisableAttnDropoutBwdPattern : public popart::PreAliasPattern { +public: + bool matches(popart::Op *op) const override { + int check_levels = 2; + + if (!op->isConvertibleTo()) { + return false; + } + + // Is dropout enabled? If ratio is 0, we don't need to apply the pattern. + auto dropoutGradOp = dynamic_cast(op); + if (dropoutGradOp->getRatio() == 0.f) { + return false; + } + + // The specific attention DropoutGradOp we want to cull sits between a matmul and a softmax, + // so we'll look through producers and consumers and see if we can find them. + auto grad = op->input->tensor(popart::DropoutGradOp::getGradInIndex()); + + // The MatMulPattern converts the MatMulLhsGradOp to a MatMulOp + // There doesn't seem to be a way to check if a pattern is enabled from inside another pattern. + // The IR holds the patterns object, but it’s inaccessible for checking the status of individual patterns. + // Check both, with the most likely first. 
+ bool hasMatMulProducer = search_producers_for(grad, check_levels) != nullptr; + if (!hasMatMulProducer) { + hasMatMulProducer |= search_producers_for(grad, check_levels) != nullptr; + } + + return hasMatMulProducer && search_consumers_for(grad) != nullptr; + } + + std::vector touches(popart::Op *) const override { return {}; } + + bool apply(popart::Op *op) const override { + if (!op->isConvertibleTo()) { + return false; + } + + auto dropoutGradOp = dynamic_cast(op); + auto identityOp = makeReplacementOpInIr(popart::Onnx::Operators::Identity_1, + dropoutGradOp, + ""); + + auto inputId = dropoutGradOp->inId(popart::DropoutGradOp::getGradInIndex()); + auto outputId = dropoutGradOp->outId(popart::DropoutGradOp::getOutIndex()); + dropoutGradOp->disconnectAllInputs(); + dropoutGradOp->disconnectAllOutputs(); + dropoutGradOp->getGraph().eraseOp(dropoutGradOp->id); + + identityOp->connectInTensor(popart::IdentityOp::getInIndex(), inputId); + identityOp->connectOutTensor(popart::IdentityOp::getOutIndex(), outputId); + identityOp->setup(); + + return true; + } +}; + + +static popart::PatternCreator disableAttnDropoutBwdPatternCreator("DisableAttnDropoutBwdPattern", false); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather.cc new file mode 100644 index 000000000..2350ffd24 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace CustomOperators { + const popart::OperatorIdentifier TiedGather = {"ai.graphcore", "TiedGather", 1}; +} // namespace CustomOperators + +class TiedGatherOp; +class TiedGatherGradOp; + +class TiedGatherGradOp : public popart::GatherGradOp { +public: + TiedGatherGradOp(const popart::GatherOp &op, int64_t axis_) + : popart::GatherGradOp(op, axis_), + fwd_op(&op) {} + const popart::GatherOp *fwd_op; +}; + +class TiedGatherOp : public popart::GatherOp { +public: + TiedGatherOp(int64_t axis_, const popart::Op::Settings &settings_) + : popart::GatherOp(CustomOperators::TiedGather, axis_, settings_) {} + bool check_indices = true; + + std::unique_ptr clone() const override { + return std::make_unique(*this); + } + + std::vector> getGradOps() { + std::vector> result; + result.push_back(std::make_unique(*this, getAxis())); + result[0]->pruneable = false; + return result; + } +}; + +class TiedGatherOpx : public popart::popx::Opx { +public: + TiedGatherOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) { + verifyOp(op, CustomOperators::TiedGather); + // We always want this to layout its inputs + inputCreatorPriority = std::numeric_limits::max(); + } + + bool createsEquiv(int, const popart::popx::Opx *, int) const final { return false; } + + std::set mustExistBeforeCreate(int) const final { return {}; } + + popart::popx::InputCreatorType getInputCreatorType(int index0) const final { + return index0 == TiedGatherOp::dataInIndex() ? popart::popx::InputCreatorType::CanCreate + : popart::popx::Opx::getInputCreatorType(index0); + } + + poplar::Tensor createInput(popart::InIndex index, + const poplar::DebugNameAndId &dnai) const final { + popart::logging::debug("TiedGather asked to create index {}: name {}", index, dnai); + if (index != TiedGatherOp::dataInIndex()) { + throw popart::error("CustomOps Error: GatherOpx::createInput Cannot create input {}", index); + } + + auto inputInfo = inInfo(TiedGatherOp::indicesInIndex()); + auto weightInfo = inInfo(TiedGatherOp::dataInIndex()); + + unsigned inputSize = inputInfo.nelms(); + unsigned inChannels = weightInfo.dim(getOp().getAxis()); + unsigned outChannels = weightInfo.nelms() / inChannels; + + std::vector lhsShape = {inputSize, inChannels}; + std::vector rhsShape = {inChannels, outChannels}; + + return poplin::createMatMulInputRHS(graph(), + popart::popx::popType(weightInfo), + lhsShape, + rhsShape, + dnai, + {}, + &dv_p->matmulCache); + } + + // Identical to popart::opx::GatherOpx::grow however: + // 1) uses popops::gather instead of popops::multislice + // 2) range checks the indices and masks those out of range + void grow(poplar::program::Sequence &prog) const final { + const auto indicesShape = inShape(TiedGatherOp::indicesInIndex()); + const auto outputShape = + popart::vXtoY(outShape(TiedGatherOp::outIndex())); + + auto op = getOp(); + unsigned axis = op.getAxis(); + auto indices = getInTensor(TiedGatherOp::indicesInIndex()); + auto data = getInTensor(TiedGatherOp::dataInIndex()); + + // If there are no indices, return an empty tensor of the appropriate + // shape + if (indices.numElements() == 0) { + auto result = graph().addVariable( + data.elementType(), outputShape, debugContext("result")); + + setOutTensor(TiedGatherOp::outIndex(), result); + } else { + // Flatten the scalar indices. 
+ auto offsets = indices.flatten(); + // reinterpret the indices as unsigned int. This assumes negative indices. + // are impossible. + offsets = offsets.reinterpret(poplar::UNSIGNED_INT); + + // Place the gather axis at the front. + data = data.dimShufflePartial({0}, {axis}); + // Store the shape for later. + auto tmp_shape = data.shape(); + // Flatten the other dimensions. + data = data.flatten(1, data.rank()); + + // Change (2) + poplar::Tensor mask; + if (op.check_indices) { + auto gather_size = data.shape()[0]; + mask = popops::lt(graph(), offsets, static_cast(gather_size), prog, debugContext("mask + tiedGatherOpxCreator(CustomOperators::TiedGather); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather_pattern.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather_pattern.cc new file mode 100644 index 000000000..ddbe4bd15 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/tied_gather_pattern.cc @@ -0,0 +1,504 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tied_gather.cc" +#include "utils.cc" + +using SerialiseSettings = popart::MatMulBaseOp::SerialiseSettings; + +// This pattern matches for graphs of the shape. 
+// +// Weight +// / \ +// Transpose MatMul +// | +// Indices --Gather +// +// And performs the following transformations: +// 1) Disable FullyConnectedPass on MatMul +// 2) Add Detach between the Gather and the Weight so no SGD ops are created (they will be added later by TiedGatherAccumulatePattern) +// 3) Replace Gather with TiedGather +// Resulting in: +// Weight +// / \ +// Transpose MatMul +// | +// Detach +// | +// Indices --TiedGather +// +// Conditionally, if MatMul is annotated with serialisation it will: +// 4) Replace Gather with N x TiedGather to match the serialisation on the MatMul +// Resulting in: +// For serialisation factor: 2 +// +// Weight +// / \ +// Transpose MatMul +// | +// Indices Detach +// | | | | +// | | | Slice--\ +// | Sub -|------TiedGather +// | | | +// | Slice--\ | +// Sub ---------TiedGather | +// \ | +// Add +// +namespace { +bool produced_by_transpose(popart::Tensor *t) { + return t->hasProducer() && t->getProducer()->isConvertibleTo(); +} +} + +class TiedGatherPattern : public popart::PreAliasPattern { + mutable std::map tied_op_map; +public: + bool matches(popart::Op *op) const override { + auto &ir = op->getIr(); + // Only run in the fwd pass + if (op->getIr().hasConstructedBackwards()) { + return false; + } + if (op->getIr().isTraining() && !op->getIr().getSessionOptions().enableGradientAccumulation) { + return false; + } + if (op->isConvertibleTo() && !op->isConvertibleTo()) { + if (produced_by_transpose(op->input->tensor(popart::GatherOp::dataInIndex()))) { + auto matmul = weight_consumed_by(op->input->tensor(popart::GatherOp::dataInIndex())); + if (matmul) { + tied_op_map.insert({op, matmul}); + return true; + } + } + } + return false; + } + + std::vector touches(popart::Op *) const override { return {}; } + + bool apply(popart::Op *op) const override { + auto &graph = op->getGraph(); + + auto gather = dynamic_cast(op); + auto matmul = tied_op_map[gather]; + + // (1) + matmul->setUseFullyConnectedPass(false); + + auto axis = gather->getAxis(); + auto serialisation = matmul->getSerialiseSettings(); + + auto data = gather->input->tensor(popart::GatherOp::dataInIndex()); + auto indices = gather->input->tensor(popart::GatherOp::indicesInIndex()); + auto out = gather->output->tensor(popart::GatherOp::outIndex()); + + // Disconnect "out" so it can be connected to the replacing ops. 
+ gather->disconnectAllOutputs(); + + // (2) + auto detach_up = std::make_unique( + popart::Onnx::CustomOperators::Detach_1, + popart::Op::Settings(graph, "TiedGatherDetach") + ); + auto detach = detach_up.get(); + transferBaseProperties(gather, detach); + graph.moveIntoGraph(std::move(detach_up)); + detach->connectInTensor(0, data->id); + auto detached_data_id = data->id + "/detached"; + detach->createAndConnectOutTensor(0, detached_data_id); + detach->setup(); + data = graph.getTensors().get(detached_data_id); + + std::string name = gather->name(); + if (name.empty()) { + name = std::to_string(gather->id); + } + + auto replace_with_tied_gather = [&](popart::TensorId dict, popart::TensorId ind, int64_t i, const std::string &debugContext) { + auto tied_gather_up = std::make_unique( + axis, + popart::Op::Settings(graph, debugContext)); + auto tied_gather = tied_gather_up.get(); + transferBaseProperties(gather, tied_gather); + graph.moveIntoGraph(std::move(tied_gather_up)); + + tied_gather->connectInTensor(TiedGatherOp::dataInIndex(), dict); + tied_gather->connectInTensor(TiedGatherOp::indicesInIndex(), ind); + + auto out_id = out->id; + if (i >= 0) { + out_id = debugContext + ":0"; + tied_gather->createAndConnectOutTensor(TiedGatherOp::outIndex(), out_id); + } else { + tied_gather->connectOutTensor(TiedGatherOp::outIndex(), out_id); + } + + graph.topoCons->transfer(gather, tied_gather); + + tied_gather->setup(); + + return out_id; + }; + + if (serialisation.factor <= 1 || serialisation.mode == SerialiseSettings::Mode::None) { + // (3) + replace_with_tied_gather(data->id, indices->id, -1, name); + } else { + // (4) + if (serialisation.mode != SerialiseSettings::Mode::OutputChannels) { + throw popart::error("CustomOps Error: Tied Gather Pattern only supports Serialisation::Mode::OutputChannels"); + } + + auto slice_op = [&](int64_t starts, int64_t ends, const std::string &debugContext) { + auto slice_up = std::make_unique( + popart::Onnx::AiOnnx::OpSet9::Slice, + std::vector({starts}), + std::vector({ends}), + std::vector({axis}), + popart::Op::Settings(graph, debugContext + "/slice")); + auto slice = slice_up.get(); + transferBaseProperties(gather, slice); + graph.moveIntoGraph(std::move(slice_up)); + slice->connectInTensor(popart::SliceOp::getInIndex(), data->id); + auto data_slice = debugContext + "/slice:0"; + slice->createAndConnectOutTensor(popart::SliceOp::getOutIndex(), data_slice); + slice->setup(); + return data_slice; + }; + + auto subtract_with_constant = [&](popart::Tensor *a, int64_t c, const std::string &debugContext) { + auto sub_up = std::make_unique( + popart::Onnx::Operators::Sub_7, + popart::Op::Settings(graph, debugContext + "/sub")); + auto sub = sub_up.get(); + transferBaseProperties(gather, sub); + graph.moveIntoGraph(std::move(sub_up)); + sub->connectInTensor(popart::SubtractOp::getArg0InIndex(), a->id); + // Create constant to subtract from + static unsigned i = 0; + auto sub_const_id = a->id + "_sub_const_" + std::to_string(i++); + popart::TensorInfo subInfo(a->info.dataType(), {1}); + std::vector d(1, c); + graph.getTensors().addConstInit(sub_const_id, subInfo, d.data()); + sub->connectInTensor(popart::SubtractOp::getArg1InIndex(), sub_const_id); + auto indices_sub = debugContext + "/sub:0"; + sub->createAndConnectOutTensor(popart::SubtractOp::getOutIndex(), indices_sub); + sub->setup(); + return indices_sub; + }; + + auto add_op = [&](popart::TensorId a, popart::TensorId b, popart::TensorId out, const std::string &debugContext) { + auto add_up = std::make_unique( 
+ popart::Onnx::Operators::Add_6, + popart::Op::Settings(graph, debugContext + "/add")); + auto add = add_up.get(); + transferBaseProperties(gather, add); + graph.moveIntoGraph(std::move(add_up)); + add->connectInTensor(popart::AddOp::getArg0InIndex(), a); + add->connectInTensor(popart::AddOp::getArg1InIndex(), b); + if (graph.getTensors().contains(out)) { + add->connectOutTensor(popart::AddOp::getOutIndex(), out); + } else { + add->createAndConnectOutTensor(popart::AddOp::getOutIndex(), out); + } + add->setup(); + return out; + }; + + popart::TensorId tmp_id; + for (int64_t i = 0; i < serialisation.factor; i++) { + int64_t slice_size = data->info.dim(axis) / serialisation.factor; + auto serial_name = name + "/" + std::to_string(i); + // Slice the Dictionary + auto data_slice = slice_op(i * slice_size, (i + 1) * slice_size, serial_name); + // Subtract the indicies + auto indices_sub = subtract_with_constant(indices, i * slice_size, serial_name); + // Add the tied gather to the graph + auto next_id = replace_with_tied_gather(data_slice, indices_sub, i, serial_name); + + // Add the results + if (i == 0) { + tmp_id = next_id; + } else { + auto out_id = out->id; + if (i < serialisation.factor - 1) { + out_id += "_tmp" + std::to_string(i); + } + tmp_id = add_op(tmp_id, next_id, out_id, serial_name); + + // Tie the add to happen directly after the gather + graph.topoCons->insert( + graph.getTensors().get(next_id)->getProducer(), + graph.getTensors().get(tmp_id)->getProducer(), + true); + } + } + } + + gather->disconnectAllInputs(); + graph.eraseOp(gather->id); + + return true; + } +}; + +// This pattern matches for graphs of the shape. +// +// Weight +// | \ +// TiedGatherGrad MatMul +// | +// Accl - Accumulate +// +// And will perform the following transformation +// 1) Replace TiedGatherGrad with SparseAccumulate +// +// Resulting in: +// +// Weight +// | \ +// | MatMul +// | | +// | Accl - Accumulate +// | | | +// SparseAccumulate - Optimizer +// +// (--> is a topocon) + +class TiedGatherAccumulatePattern : public popart::PreAliasPattern { +public: + bool matches(popart::Op *op) const override { + // Only works with gradient accumulation + if (!op->getIr().getSessionOptions().enableGradientAccumulation) { + return false; + } + // Only run after the optimizers have been created + if (!op->getIr().hasDecomposedOptimizers()) { + return false; + } + return op->isConvertibleTo(); + } + + std::vector touches(popart::Op *) const override { return {}; } + + bool apply(popart::Op *op) const override { + auto gather_grad = dynamic_cast(op); + auto gather = gather_grad->fwd_op; + auto root_weight = get_variable(gather->input->tensor(popart::GatherOp::dataInIndex())); + + auto gather_ops = find_all_consumers(root_weight); + + auto &ir = op->getIr(); + + // Get all the Accumulate ops in the normal context + std::vector accumulate_ops; + + auto update_ops = find_all_consumers(root_weight); + if (update_ops.size() < 1) { + // OptimizerDecomposePattern has not run. + throw popart::error("CustomOps Error: Could not find update ops for weight {}", root_weight->id); + } + + for (size_t i = 0; i < update_ops.size(); i++) { + auto var_update = update_ops[i]; + + auto accum = var_update->inTensor(popart::VarUpdateWithUpdaterOp::getUpdaterInIndex()); + // Accumulate Ops in the normal fragment are Gradient Accumulation. 
+ auto accl_op = search_producers_for(accum, 10); + + if (accl_op) { + auto exists = std::find_if(accumulate_ops.begin(), accumulate_ops.end(), [&accl_op](popart::Op* op){ return op->id == accl_op->id; }); + if (exists == accumulate_ops.end()) { + accumulate_ops.push_back(accl_op); + } + } else { + popart::logging::info("CustomOps Warning: Could not find outer AccumulateOp gradient accumulation via accumulator {}.", accum->id); + } + } + + if (accumulate_ops.size() != gather_ops.size()) { + throw popart::error("CustomOps Error: The number of gather ops ({}) does not match the number of accumulate ops ({}).", gather_ops.size(), accumulate_ops.size()); + } + + // Match up gather serial index to Accumulator's matmul index. + // TODO: Find a more robust way than sorting input ids + std::sort(accumulate_ops.begin(), accumulate_ops.end(), + [](const popart::Op *l, const popart::Op *r) { + return l->input->tensor(popart::AccumulateOp::getVarToUpdateInIndex())->id.compare( + r->input->tensor(popart::AccumulateOp::getVarToUpdateInIndex())->id) < 0; + }); + std::sort(gather_ops.begin(), gather_ops.end(), + [](const popart::Op *l, const popart::Op *r) { + return l->name().compare(r->name()) < 0; + }); + + auto itr = std::find(gather_ops.begin(), gather_ops.end(), gather); + if (itr == gather_ops.end()) { + throw popart::error("CustomOps Error: Could not find {} in the consumers of {}.", gather->name(), root_weight->id); + } + + unsigned serial_index = std::distance(gather_ops.begin(), itr); + + auto dense_accl = accumulate_ops[serial_index]; + + auto accl_id = dense_accl->inId(popart::AccumulateOp::getVarToUpdateInIndex()); + auto weight_id = gather->inId(popart::GatherOp::dataInIndex()); + popart::logging::pattern::info("Using tied accumulator {} for {}", accl_id, gather->name()); + + // Transpose must be inplace so the accumulator is actually updated + accl_id = transpose_inplace(accl_id, gather_grad); + + auto &graph = op->getGraph(); + + auto accum_type = dense_accl->getAccumulationType(); + popart::Tensor *factor = dense_accl->getFactor().isConst() ? nullptr : dense_accl->inTensor(popart::SparseAccumulateOp::getFactorInIndex()); + + if (factor != nullptr && accum_type == popart::AccumulationType::Mean) { + auto inv_counter = factor->id + "_inverse"; + if (!graph.getTensors().contains(inv_counter)) { + popart::TensorInfo one_info(factor->info.dataType(), {}); + std::vector one_data(one_info.nelms(), 1); + const auto &one_id = graph.getIr().createIntermediateTensorId("one"); + graph.getTensors().addConstInit(one_id, one_info, one_data.data()); + auto inv_op = graph.createConnectedOp( + {{popart::DivOp::getArg0InIndex(), one_id}, + {popart::DivOp::getArg1InIndex(), factor->id}}, + {{popart::DivOp::getOutIndex(), inv_counter}}, + popart::Onnx::Operators::Div_7, + popart::Op::Settings(graph, "mean_accumulate_inverse")); + transferBaseProperties(gather_grad, inv_op); + + for (auto cons : factor->consumers.getOps()) { + if (cons->isConvertibleTo() && + cons->inId(popart::AccumulateOp::getVarToUpdateInIndex()) == factor->id) { + graph.topoCons->insert(cons, inv_op); + } + } + } + accum_type = popart::AccumulationType::DampenedAdd; + factor = graph.getTensor(inv_counter); + } + + // Add sparseAccumulateOp. 
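+    // The SparseAccumulateOp scatters the gather gradient rows directly into the
+    // (in-place transposed) dense accumulator found above, so the tied embedding
+    // and projection weights share a single accumulation buffer instead of
+    // materialising a dense GatherGrad output.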
+ auto sparse_accl_up = std::make_unique( + accum_type, + dense_accl->getFactor(), + gather_grad->getAxis(), + popart::Op::Settings(graph, "_tiedAccumulate/" + std::to_string(serial_index))); + + auto sparse_accl = sparse_accl_up.get(); + transferBaseProperties(gather_grad, sparse_accl); + graph.moveIntoGraph(std::move(sparse_accl_up)); + + // Inputs + // Accumulator + sparse_accl->connectInTensor(popart::SparseAccumulateOp::getVarToUpdateInIndex(), + accl_id); + // Gradients + sparse_accl->connectInTensor( + popart::SparseAccumulateOp::getUpdaterInIndex(), + gather_grad->inId(popart::GatherGradOp::gradInIndex())); + // Scale + if (!dense_accl->getFactor().isConst()) { + sparse_accl->connectInTensor( + // the index at which the dampening scale factor is received, + popart::SparseAccumulateOp::getFactorInIndex(), + // the name of the dampening scale factor + factor->id); + } + // Indices + sparse_accl->connectInTensor( + popart::SparseAccumulateOp::getIndicesInIndex(), + gather_grad->inId(popart::GatherGradOp::indicesInIndex())); + + // Original weight to be cloned + sparse_accl->connectInTensor( + popart::SparseAccumulateOp::getOriginalVarToUpdateInIndex(), + weight_id); + + // Transfer TopoCons + graph.topoCons->transfer(gather_grad, sparse_accl); + + // gatherGrad output that will be isolated + auto grad_Id = gather_grad->outId(TiedGatherGradOp::gradOutIndex()); + + // Remove TiedGatherGrad + gather_grad->disconnectAllInputs(); + gather_grad->disconnectAllOutputs(); + graph.eraseOp(gather_grad->id); + + // Outputs + sparse_accl->createAndConnectOutTensor( + popart::SparseAccumulateOp::getUpdatedVarOutIndex(), + sparse_accl->name() + ":0"); + + // remove the gatherGrad output + graph.getTensors().remove(grad_Id); + + // Finalise sparse op + sparse_accl->setup(); + + return true; + } + + popart::TensorId transpose_inplace(popart::TensorId tid, popart::Op *op) const { + auto &graph = op->getGraph(); + + // TransposeInplaceOp's constructor requires a transposeOp + auto outplace_up = std::make_unique( + popart::Onnx::AiOnnx::OpSet9::Transpose, + std::vector{1, 0}, + popart::Op::Settings(graph, tid + "_Transpose")); + auto transpose_up = outplace_up->getInplaceVariant(popart::Onnx::CustomOperators::TransposeInplace); + + auto transpose = transpose_up.get(); + transferBaseProperties(op, transpose); + graph.moveIntoGraph(std::move(transpose_up)); + + transpose->connectInTensor(popart::TransposeOp::getInIndex(), tid); + popart::TensorId out_id = tid + "/transposed"; + transpose->createAndConnectOutTensor(popart::TransposeOp::getOutIndex(), out_id); + + transpose->setup(); + return out_id; + } +}; + +static popart::PatternCreator TiedGatherPatternCreator("TiedGatherPattern", true); +static popart::PatternCreator TiedGatherAccumulatePatternCreator("TiedGatherAccumulatePattern", true); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/utils.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/utils.cc new file mode 100644 index 000000000..b6c6570f8 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/utils.cc @@ -0,0 +1,173 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template +static T *search_producers_for(popart::Tensor *t, int max_depth=-1) { + + // Searched as far as we can without success + if (t->tensorType() == popart::TensorType::Variable || !t->hasProducer()) { + return nullptr; + } + auto op = t->getProducer(); + if (op->isConvertibleTo() && op->settings.executionContext == Ctx) { + return dynamic_cast(op); + } + + if (op->input->n() < 1) { + return nullptr; + } + + unsigned producer_index = 0; + if (op->input->n() > 1) { + if (op->isConvertibleTo()) { + producer_index = popart::AdamUpdaterOp::getAccl1InIndex(); + } else if (op->isConvertibleTo()) { + producer_index = popart::AdamVarUpdateOp::getUpdaterInIndex(); + } else if (op->isConvertibleTo()) { + producer_index = popart::AccumulateBaseOp::getUpdaterInIndex(); + } else if (op->isConvertibleTo()) { + producer_index = popart::DropoutGradOp::getGradInIndex(); + } else if (op->isConvertibleTo()) { + // Grad Unscaling for Adam-based optimizers + producer_index = popart::MulOp::getArg0InIndex(); + } else if (op->isConvertibleTo()) { + // Replicated Tensor Sharding + producer_index = popart::ReplicatedReduceScatterOp::getInIndex(); + } else if (op->isConvertibleTo()) { + // Replicated Tensor Sharding + producer_index = popart::ReplicatedAllGatherOp::getInIndex(); + } else { + return nullptr; + } + } + + // Providing a max-search depth of -1 will remove the depth limit at the cost of potentially + // unnecessary checks. + if (max_depth > 0) { + max_depth -= 1; + if (max_depth == 0) { + return nullptr; + } + } + + return search_producers_for(op->input->tensor(producer_index), max_depth); +} + +// Finds the underlying variable by searching through producers. +static popart::Tensor *get_variable(popart::Tensor *t) { + if (t->tensorType() == popart::TensorType::Variable || t->tensorType() == popart::TensorType::Const) { + return t; + } else if (!t->hasProducer()) { + return nullptr; + } + auto op = t->getProducer(); + if (op->input->n() != 1) { + return nullptr; + } + return get_variable(op->input->tensors().front()); +} + +// Attempts to find T by searching through consumers. +template +static T *search_consumers_for(popart::Tensor *w, std::queue &q) { + for (auto consumer : w->consumers.getOps()) { + if (consumer->isConvertibleTo() && consumer->settings.executionContext == Ctx) { + return dynamic_cast(consumer); + } + + if (consumer->isConvertibleTo()) { + q.push(consumer->output->tensor(popart::DropoutGradOp::getGradInIndex())); + } + if (consumer->isConvertibleTo()) { + q.push(consumer->output->tensor( + popart::ReplicatedReduceScatterOp::getOutIndex())); + } + + // TODO: Improve this as it's too general. Most ops that have one input and one output are view changing. 
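+    // In practice this lets the search follow view-changing ops such as Transpose,
+    // Reshape or Detach, but it will also walk through genuine compute ops that
+    // happen to have one input and one output - hence the TODO above.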
+ if (consumer->input->n() == 1 && consumer->output->n() == 1) { + q.push(consumer->output->tensor(0)); + } + } + if (q.size() < 1) { + return nullptr; + } + w = q.front(); + q.pop(); + return search_consumers_for(w, q); +} +template +static T *search_consumers_for(popart::Tensor *w) { + std::queue q; + return search_consumers_for(w, q); +} + +template +static T *weight_consumed_by(popart::Tensor *w) { + w = get_variable(w); + if (w) { + return search_consumers_for(w); + } + return nullptr; +} + +template +static void find_all_consumers(popart::Tensor *w,std::queue &q, std::vector &result) { + for (auto consumer : w->consumers.getOps()) { + if (std::find(result.begin(), result.end(), consumer) == result.end()) { + if (consumer->isConvertibleTo() && consumer->settings.executionContext == Ctx) { + result.push_back(dynamic_cast(consumer)); + } + if (consumer->isConvertibleTo()) { + q.push(consumer->output->tensor(popart::MatMulOp::getOutIndex())); + } + if (consumer->isConvertibleTo()) { + q.push(consumer->output->tensor( + popart::ReplicatedReduceScatterOp::getOutIndex())); + } + // Most ops that have one input and one output are view changing. + if (consumer->input->n() == 1 && consumer->output->n() == 1) { + q.push(consumer->output->tensor(0)); + } + } + } + if (q.size() < 1) { + return; + } + w = q.front(); + q.pop(); + return find_all_consumers(w, q, result); +} +template +static std::vector find_all_consumers(popart::Tensor *w) { + std::queue q; + std::vector result; + find_all_consumers(w, q, result); + return result; +} diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/workarounds/prevent_const_expr_folding_op.cc b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/workarounds/prevent_const_expr_folding_op.cc new file mode 100644 index 000000000..d6482ad4e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/custom_ops/workarounds/prevent_const_expr_folding_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include + +namespace CustomOperators +{ + const popart::OperatorIdentifier PreventConstFolding = {"ai.graphcore", "PreventConstFolding", 1}; +} // namespace CustomOperators +namespace CustomGradOperators { + const popart::OperatorIdentifier PreventConstFoldingGrad = {"ai.graphcore", "PreventConstFoldingGrad", 1}; +} // namespace CustomGradOperators + +class PreventConstFoldingOp; +class PreventConstFoldingGradOp; +class PreventConstFoldingOpx; +class PreventConstFoldingGradOpx; + +// By default, const expressions ops get folded to optimise the graph and remove unnessary ops +// at the start. However, in this case, it causes the word embedding to exist in both its +// original and transposed form. By adding this op, the constant expression folding transform +// can't fold through it, so we prevent folding after this point. 
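+// At runtime the op is an identity: the Opx simply forwards its input tensor and
+// unwinds the tensor layout, so its only effect is to stop the constant-folding
+// transform from looking through this point in the graph.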
+ +class PreventConstFoldingOp : public popart::Op +{ +public: + PreventConstFoldingOp(const popart::OperatorIdentifier &_opid, const Op::Settings &settings_) + : Op(_opid, settings_) {} + + void setup() final { outInfo(0) = inInfo(0); } + + std::unique_ptr clone() const { + return std::make_unique(*this); + } + + std::vector> getGradOps() { + std::vector> upops; + upops.emplace_back(std::make_unique(*this)); + return upops; + } + + float getSubgraphValue() const final { return getLowSubgraphValue(); } +}; + +static popart::OpDefinition PreventConstFoldingOpDef({}); + +static popart::OpCreator PreventConstFoldingOpCreator( + popart::OpDefinitions({{CustomOperators::PreventConstFolding, + PreventConstFoldingOpDef}}), + [](const popart::OpCreatorInfo &oci) -> std::unique_ptr { + return std::unique_ptr( + new PreventConstFoldingOp(oci.opid, oci.settings)); + }, + true); + +class PreventConstFoldingOpx : public popart::popx::Opx { +public: + PreventConstFoldingOpx(popart::Op *op, popart::popx::Devicex *devicex) : popart::popx::Opx(op, devicex) + { verifyOp(op, CustomOperators::PreventConstFolding); } + + popart::popx::InputCreatorType getInputCreatorType(popart::InIndex) const { + return popart::popx::InputCreatorType::CanUnwind; + } + + poplar::Tensor unwindTensorLayout(poplar::Tensor tensor, popart::InIndex, popart::OutIndex) const { + return tensor; + } + + popart::view::RegMap unwindRegion(popart::InIndex, popart::OutIndex) const { + return [this](const popart::view::Region &r) { + return popart::view::Regions(1, r); + }; + } + + void grow(poplar::program::Sequence &prog) const final { + insert(outId(0), getInTensor(0)); + } +}; + +class PreventConstFoldingGradOp : public PreventConstFoldingOp +{ +public: + PreventConstFoldingGradOp(const PreventConstFoldingOp &fwdOp) + : PreventConstFoldingOp(CustomGradOperators::PreventConstFoldingGrad, fwdOp.getSettings()) {} + + PreventConstFoldingGradOp(const popart::Op::Settings &settings) + : PreventConstFoldingOp(CustomGradOperators::PreventConstFoldingGrad, settings) {} + + std::unique_ptr clone() const final { + return std::make_unique(*this); + } + + const std::vector &gradInputInfo() const { + static const std::vector inInfo = { + {0, 0, popart::GradOpInType::GradOut}}; + + return inInfo; + } + const std::map &gradOutToNonGradIn() const { + static const std::map outInfo = {{0, 0}}; + return outInfo; + } +}; + +class PreventConstFoldingGradOpx : public popart::popx::Opx { +public: + PreventConstFoldingGradOpx(popart::Op *op, popart::popx::Devicex *devicex) + : popart::popx::Opx(op, devicex) { + verifyOp(op, CustomGradOperators::PreventConstFoldingGrad); + } + + void grow(poplar::program::Sequence &prog) const final { + setOutTensor(0, getInTensor(0)); + } +}; + +static popart::popx::OpxCreator + preventConstFoldingOpxCreator(CustomOperators::PreventConstFolding); +static popart::popx::OpxCreator + preventConstFoldingGradOpxCreator(CustomGradOperators::PreventConstFoldingGrad); diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/dataset_ipu.py b/nlp/text_classification/bert/paddlepaddle/static_ipu/dataset_ipu.py new file mode 100644 index 000000000..3703064b1 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/dataset_ipu.py @@ -0,0 +1,283 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import multiprocessing +import threading +from queue import Queue + +import h5py +import numpy as np +import paddle + +KEYS = ('input_ids', 'input_mask', 'segment_ids', 'masked_lm_positions', + 'masked_lm_ids', 'next_sentence_labels') + + +def shuffle_dict(dic, len): + idxs = np.arange(len) + np.random.shuffle(idxs) + for k, v in dic.items(): + dic[k] = v[idxs] + + +class PretrainingHDF5DataLoader: + def __init__(self, + input_files, + max_seq_length=128, + max_mask_tokens=20, + batch_size=1, + dtype=np.int32, + shuffle=False, + pad_position_value=511, + num_workers=3): + self.files = input_files + self.batch_size = batch_size + self.max_seq_length = max_seq_length + self.max_mask_tokens = max_mask_tokens + self.dtype = dtype + self.shuffle = shuffle + self.pad_position_value = pad_position_value + if shuffle: + np.random.shuffle(self.files) + + self.counter = 0 + + # get total number of samples + pool = multiprocessing.Pool(min(multiprocessing.cpu_count(), 32)) + num_samples = pool.map(self.samples_in_file, self.files) + pool.close() + pool.join() + self.total_samples = sum(num_samples) + self.len = self.total_samples // self.batch_size + assert self.len > 1, f"Batch size {self.batch_size} larger than number of samples {self.total_samples}" + + # notify feed and fetch processes/thread to stop + self.event_queue = multiprocessing.Manager().Queue(10) + + # buffer to store final data + self.feed_buffer = Queue(20) + + # number of processes to do remask + self.num_workers = num_workers + # each feed_worker has one process_buffer to use + self.process_buffers = [ + multiprocessing.Manager().Queue(10) for _ in range(num_workers) + ] + self.split_files = np.array_split(self.files, self.num_workers) + # feed_worker will load data from h5py files, and do remask process + self.feed_workers = [ + multiprocessing.Process( + target=self.fill_buffer_loop, + args=(self.split_files[idx], self.process_buffers[idx])) + for idx in range(self.num_workers) + ] + for p in self.feed_workers: + p.start() + + # index for which process_buffer is used each time + self.post_fetch_idx = 0 + # load final data from process_buffers + self.fetch_worker = threading.Thread(target=self.post_fetch) + self.fetch_worker.start() + + def samples_in_file(self, filename): + with h5py.File(filename, "r") as f: + data_len = f[KEYS[0]].shape[0] + return data_len + + def release(self): + self.event_queue.put('END') + while not self.feed_buffer.empty(): + self.feed_buffer.get() + for process_buffer in self.process_buffers: + while not process_buffer.empty(): + process_buffer.get() + self.fetch_worker.join() + for p in self.feed_workers: + p.join() + return + + def __len__(self): + return self.len + + def __iter__(self): + self.counter = 0 + return self + + def __next__(self): + result = self.feed_buffer.get() + self.counter += 1 + return result + + def post_fetch(self): + while True: + if not self.event_queue.empty(): + return + if not self.process_buffers[self.post_fetch_idx].empty(): + logging.debug(f"self.post_fetch_idx: {self.post_fetch_idx}") + np_feed_list = 
self.process_buffers[self.post_fetch_idx].get() + self.post_fetch_idx += 1 + if self.post_fetch_idx == self.num_workers: + self.post_fetch_idx = 0 + elif self.post_fetch_idx > self.num_workers: + raise Exception('post_fetch_idx must < num_workers') + + lod_feed_list = [] + for data in np_feed_list: + tensor = paddle.fluid.core.LoDTensor() + place = paddle.CPUPlace() + tensor.set(data, place) + lod_feed_list.append(tensor) + self.feed_buffer.put(lod_feed_list) + + def fill_buffer_loop(self, files, process_buffer): + data = None + data_index = 0 + file_index = 0 + + def multiprocess_fill_buffer(data, file_index, data_index): + if data is None: + data = self.load_one_file(files[file_index]) + file_index += 1 + data_index = 0 + + curr_batch = [] + still_required = self.batch_size + while still_required > 0: + data_batch = { + k: data[k][data_index:data_index + still_required] + for k in KEYS + } + data_batch_len = len(data_batch[KEYS[0]]) + data_index += data_batch_len + curr_batch.append(data_batch) + curr_batch_len = sum(len(x[KEYS[0]]) for x in curr_batch) + still_required = self.batch_size - curr_batch_len + if still_required > 0: + if file_index >= len(files): + np.random.shuffle(files) + file_index = 0 + + data = self.load_one_file(files[file_index]) + file_index += 1 + data_index = 0 + if not curr_batch_len == self.batch_size: + raise Exception("data length should equal to batch_size") + + result = {} + for k in KEYS: + result[k] = np.concatenate( + [item[k] for item in curr_batch], axis=0) + process_buffer.put(self.do_remask(result)) + + return data, file_index, data_index + + while True: + if self.event_queue.empty(): + data, file_index, data_index = multiprocess_fill_buffer( + data, file_index, data_index) + else: + return + + def do_remask(self, samples): + input_ids = samples['input_ids'] + segment_ids = samples['segment_ids'] + masked_lm_positions = samples['masked_lm_positions'] + masked_lm_ids = samples['masked_lm_ids'] + next_sentence_labels = samples['next_sentence_labels'] + masked_lm_weights = np.ones_like(masked_lm_ids, dtype=np.int32) + masked_lm_weights[masked_lm_ids == 0] = 0 + + # post process + batch_size, seq_len = input_ids.shape + formatted_pos = self.pad_position_value * np.ones_like(samples[ + 'input_ids']) + formatted_input = np.zeros_like(input_ids) + formatted_seg = np.zeros_like(segment_ids) + formatted_mask_labels = np.zeros( + (batch_size, self.max_mask_tokens), dtype=masked_lm_ids.dtype) + + valid_seq_positions = [] + valid_mask_positions = masked_lm_weights == 1 + valid_mask_len = np.sum(valid_mask_positions, axis=1).reshape(-1, 1) + for i, mask_pos in enumerate(masked_lm_positions): + pos = [True] * seq_len + for mask_index, m in enumerate(mask_pos): + if mask_index < valid_mask_len[i]: + pos[m] = False + valid_seq_positions.append(np.logical_and(pos, input_ids[i] != 0)) + valid_seq_len = np.minimum( + np.sum(valid_seq_positions, axis=1) + self.max_mask_tokens, + self.max_seq_length).reshape(-1, 1) + unmasked_len = np.minimum( + np.sum(valid_seq_positions, axis=1), + self.max_seq_length - self.max_mask_tokens) + for i in range(batch_size): + target_mask_indices = np.arange(valid_mask_len[i]) + target_seq_indices = self.max_mask_tokens + np.arange(unmasked_len[ + i]) + source_mask_indices = masked_lm_positions[i][valid_mask_positions[ + i]] + source_seq_indices = np.arange(seq_len)[valid_seq_positions[ + i]][:unmasked_len[i]] + + target_indices = np.hstack( + [target_mask_indices, target_seq_indices]) + source_indices = np.hstack( + [source_mask_indices, 
source_seq_indices]) + + formatted_pos[i, target_indices] = source_indices + formatted_input[i, target_indices] = input_ids[i, source_indices] + formatted_seg[i, target_indices] = segment_ids[i, source_indices] + formatted_mask_labels[i] = masked_lm_ids[i, :self.max_mask_tokens] + + return [ + formatted_input.astype(np.int32), formatted_seg.astype(np.int32), + formatted_pos.astype(np.int32), valid_mask_len.astype(np.int32), + valid_seq_len.astype(np.int32), + formatted_mask_labels.astype(np.int32), + next_sentence_labels.astype(np.int32) + ] + + def load_one_file(self, file_path): + data = self.load_hdf5(file_path) + + if self.shuffle: + shuffle_dict(data, len(data[KEYS[0]])) + + return data + + def load_hdf5(self, filename): + with h5py.File(filename, "r") as f: + data = {key: np.asarray(f[key][:]) for key in KEYS} + return data + + +if __name__ == "__main__": + import glob + base_dir = 'data_path/wikicorpus_en/' + input_files = glob.glob(f"{base_dir}/*training*.hdf5") + input_files.sort() + # print(input_files) + + seed = 1984 + np.random.seed(seed) + paddle.seed(seed) + + data_loader = PretrainingHDF5DataLoader( + input_files, batch_size=65536, shuffle=True) + + for idx, batch in enumerate(data_loader): + print(f"{idx}: {batch[0].shape()}") diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/load_tf_ckpt.py b/nlp/text_classification/bert/paddlepaddle/static_ipu/load_tf_ckpt.py new file mode 100644 index 000000000..4bad63fe2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/load_tf_ckpt.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
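+# This module maps variable names from a TensorFlow BERT checkpoint onto the
+# parameter names of the Paddle static-graph model in modeling.py: the per-layer
+# Q/K/V biases are concatenated into a single tensor, the word embedding is
+# padded (or cropped) to the configured vocab size and transposed, and the
+# positional embeddings are tiled if the checkpoint provides fewer positions
+# than max_position_embeddings.
+#
+# Hypothetical usage sketch (args must provide task, num_hidden_layers,
+# hidden_size, vocab_size and max_position_embeddings):
+#   params, opt_state = load_initializers_from_tf("bert_model.ckpt", args)
+#   paddle.static.set_program_state(main_program, params)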
+ +import os +import numpy as np +from logging import getLogger + +logger = getLogger(__name__) + + +def get_tf_mapping(args): + squad_mapping = { + "cls/squad/output_weights": "linear_72.w_0", + "cls/squad/output_bias": "linear_72.b_0" + } + + tf_to_pdmodel = { + "bert/embeddings/word_embeddings": "ipu_bert_embeddings_0.w_0", + "bert/embeddings/position_embeddings": "embedding_0.w_0", + "bert/embeddings/token_type_embeddings": "ipu_bert_embeddings_0.w_1", + "bert/embeddings/LayerNorm/gamma": "layer_norm_0.w_0", + "bert/embeddings/LayerNorm/beta": "layer_norm_0.b_0" + } + for i in range(args.num_hidden_layers): + layer = { + f"bert/encoder/layer_{i}/attention/self/query/bias": + f"bert_model_0.b_{i}", + f"bert/encoder/layer_{i}/attention/self/key/bias": + f"bert_model_0.b_{i}", + f"bert/encoder/layer_{i}/attention/self/value/bias": + f"bert_model_0.b_{i}", + f"bert/encoder/layer_{i}/attention/output/dense/kernel": + f"linear_{i*6}.w_0", + f"bert/encoder/layer_{i}/attention/output/dense/bias": + f"linear_{i*6}.b_0", + f"bert/encoder/layer_{i}/attention/output/LayerNorm/gamma": + f"layer_norm_{i*4+2}.w_0", + f"bert/encoder/layer_{i}/attention/output/LayerNorm/beta": + f"layer_norm_{i*4+2}.b_0", + f"bert/encoder/layer_{i}/intermediate/dense/kernel": + f"linear_{i*6+2}.w_0", + f"bert/encoder/layer_{i}/intermediate/dense/bias": + f"linear_{i*6+2}.b_0", + f"bert/encoder/layer_{i}/output/dense/kernel": + f"linear_{i*6+3}.w_0", + f"bert/encoder/layer_{i}/output/dense/bias": f"linear_{i*6+3}.b_0", + f"bert/encoder/layer_{i}/output/LayerNorm/gamma": + f"layer_norm_{(i+1)*4}.w_0", + f"bert/encoder/layer_{i}/output/LayerNorm/beta": + f"layer_norm_{(i+1)*4}.b_0", + } + layer[ + f"bert/encoder/layer_{i}/attention/self/query/kernel"] = f"bert_model_0.w_{i*3+0}" + layer[ + f"bert/encoder/layer_{i}/attention/self/key/kernel"] = f"bert_model_0.w_{i*3+1}" + layer[ + f"bert/encoder/layer_{i}/attention/self/value/kernel"] = f"bert_model_0.w_{i*3+2}" + tf_to_pdmodel.update(**layer) + + if args.task == "PRETRAINING": + logger.error("Mapping ckpt weights is only supported in SQUAD task.") + elif args.task == "SQUAD": + tf_to_pdmodel.update(**squad_mapping) + + return tf_to_pdmodel + + +def generate_initializers(args, map_names, load_data, mapping, transform={}): + initializers = {} + initializers_param = {} + initializers_opt = {} + + qkv_tensor_range = { + "query": (0, args.hidden_size), + "key": (args.hidden_size, args.hidden_size * 2), + "value": (args.hidden_size * 2, args.hidden_size * 3), + } + + for name, array in zip(map_names, load_data): + logger.debug( + f"Initialising tensor from checkpoint {name} -> {mapping[name]}") + + # config["lamb_m_dtype"] is for setting the data type for accl1 of lamb + # BERT can use FP16 for accl1 without lossing accuracy + # accl2 is always in FP32 + lamb_m_dtype = np.float32 + dtype = np.float32 + + if "moment1" in mapping[name]: + if array.dtype != lamb_m_dtype: + array = array.astype(lamb_m_dtype) + elif "moment2" in mapping[name]: + if array.dtype != np.float32: + array = array.astype(np.float32) + elif array.dtype != dtype: + array = array.astype(dtype) + + # If it's part of QKV biases, we need to handle separately as those 3 + # tensors need concatenating into one + if "bert_model_0.b" in mapping[name]: + qkv_part = name.split("/")[5] + if mapping[name] not in initializers.keys(): + qkv_shape = (array.shape[0] * 3) + initializers[mapping[name]] = np.empty( + qkv_shape, dtype=array.dtype) + + start_idx = qkv_tensor_range[qkv_part][0] + end_idx = 
qkv_tensor_range[qkv_part][1] + initializers[mapping[name]][start_idx:end_idx] = array + logger.debug( + f"Initialising QKV_bias component {name}[{start_idx}:{end_idx}] from checkpoint" + ) + continue + + if name in transform: + array = transform[name](array) + + padded_vocab_length = args.vocab_size + if "bert_embeddings_0.w_0" in mapping[name]: + tf_vocab_length = array.shape[0] + diff = padded_vocab_length - tf_vocab_length + # Pad or Crop the vocab. + if diff > 0: + logger.info( + f"Padding the vocabulary. From {tf_vocab_length} to {padded_vocab_length}" + ) + pad = np.zeros((diff, args.hidden_size)).astype(array.dtype) + array = np.concatenate((array, pad), axis=0) + else: + logger.warning( + f"Cropping the vocabulary may negatively effect performance. From {tf_vocab_length} to {padded_vocab_length}" + ) + array = np.array(array[:padded_vocab_length, :]) + # if args.task == "PRETRAINING": + # We use transposed weight in both pretraining and squad + array = np.transpose(array, [1, 0]) + + if "embedding_0.w_0" in mapping[name]: + max_pos, hidden_len = array.shape + if max_pos > args.max_position_embeddings: + array = array[:args.max_position_embeddings, :] + + # Otherwise just copy the positional embeddings over and over again as is done in longformer + elif max_pos < args.max_position_embeddings: + logger.warning( + f"Not enough positional embeddings in checkpoint, copying to match length..." + ) + array = array[np.mod( + np.arange(args.max_position_embeddings), max_pos)] + + initializers[mapping[name]] = array.copy() + for k in initializers: + if "moment" in k: + initializers_opt[k] = initializers[k] + else: + initializers_param[k] = initializers[k] + return initializers_param, initializers_opt + + +# util function for load tf pretrained weight +def load_initializers_from_tf(file_path, args): + """ + Loads weights, etc. from Tensorflow files into a dictionary of Numpy Arrays. + + Can read either checkpoint files, or frozen graphs, according to the + `is_checkpoint` flag, passed in as the second argument. + """ + try: + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model requires TensorFlow to be installed. " + "Please see https://www.tensorflow.org/install/ for installation " + "instructions.") + raise + + tf_path = os.path.abspath(file_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + + mapping = get_tf_mapping(args) + map_names = [name for name, shape in init_vars if name in mapping.keys()] + for name in (n for n, _ in init_vars if n not in mapping.keys()): + logger.debug(f"Skipping load of {name} - Not in mapping") + + load_data = [tf.train.load_variable(tf_path, name) for name in map_names] + initializers, opt_params = generate_initializers(args, map_names, load_data, + mapping) + return initializers, opt_params diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/modeling.py b/nlp/text_classification/bert/paddlepaddle/static_ipu/modeling.py new file mode 100644 index 000000000..7f106336a --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/modeling.py @@ -0,0 +1,705 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.static +import paddle.fluid +from paddle.nn import Layer +from typing import List, NamedTuple, Optional +from contextlib import ExitStack + + +class DeviceScope(object): + def __init__(self, index, stage, name_scope=None): + self.index = index + self.stage = stage + self.name_scope = name_scope + + def __enter__(self): + self.stack = ExitStack() + self.stack.enter_context( + paddle.static.ipu_shard_guard( + index=self.index, stage=self.stage)) + if self.name_scope is not None: + self.stack.enter_context(paddle.static.name_scope(self.name_scope)) + return self + + def __exit__(self, *exp): + self.stack.close() + return False + + +class IpuBertConfig(NamedTuple): + """ + The configuration for BERT Model. + Args: + seq_len (int): + The sequence length. Default to `128`. + max_position_embeddings (int): + The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input + sequence. Defaults to `512`. + max_predictions_per_seq (int): + The max number of the masked token each sentence. Default to `20`. + hidden_size (int): + Dimensionality of the embedding layer, encoder layer and pooler layer. Defaults to `768`. + vocab_size (int): + Vocabulary size of `inputs_ids` in `BertModel`. Also is the vocab size of token embedding matrix. + Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `BertModel`. + num_hidden_layers (int): + Number of hidden layers in the Transformer encoder. Defaults to `12`. + available_mem_proportion (float): + The available proportion of memory used by conv or matmul. Default to `0.28`. + type_vocab_size (int): + The vocabulary size of `token_type_ids`. + Defaults to `2`. + hidden_dropout_prob (float): + The dropout probability for all fully connected layers in the embeddings and encoder. + Defaults to `0.1`. + attention_probs_dropout_prob (float): + The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target. + Defaults to `0.1`. + task (str): + The type of the NLP model. + layers_per_ipu (list): + Number of attention layers executed on each IPU. 
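+            For example, ``[4, 4, 4]`` runs four encoder layers on each of three
+            IPUs (the layout configured in ``run_pretrain.py``).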
+ """ + micro_batch_size: int = 1 + seq_len: int = 128 + max_position_embeddings: int = 512 + max_predictions_per_seq: int = 20 + hidden_size: int = 768 + vocab_size: int = 30400 + num_hidden_layers: int = 12 + available_mem_proportion: float = 0.28 + type_vocab_size: int = 2 + + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + + # Choices: PRETRAINING (MLM + NSP), SQUAD + task: str = "PRETRAINING" + layers_per_ipu: List = None + + embeddings_scope: DeviceScope = None + attn_scopes: DeviceScope = None + ff_scopes: DeviceScope = None + mlm_scope: DeviceScope = None + nsp_scope: DeviceScope = None + + +class IpuBertEmbeddings(Layer): + """ + Include embeddings from word, position and token_type embeddings + """ + + def __init__(self, config, custom_ops=None): + super(IpuBertEmbeddings, self).__init__() + self.config = config + self.word_embeddings_weights = self.create_parameter( + shape=[config.hidden_size, config.vocab_size], dtype="float32") + self.token_embeddings_weights = self.create_parameter( + shape=[config.type_vocab_size, config.hidden_size], dtype="float32") + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=0.001) + self.dropout = nn.Dropout(self.config.hidden_dropout_prob) + self.custom_ops = custom_ops + + def forward(self, indices, segments, positions): + # word embeddings + word_embeddings_weights = paddle.transpose(self.word_embeddings_weights, + [1, 0]) + input_embeddings = paddle.gather( + word_embeddings_weights, indices, axis=0) + + # position_embeddings + position_embeddings = self.position_embeddings(positions) + + # token_type_embeddings + token_type_embeddings = paddle.fluid.input.one_hot(segments, depth=2) + token_type_embeddings = paddle.matmul(token_type_embeddings, + self.token_embeddings_weights) + + embeddings = paddle.add(input_embeddings, position_embeddings) + embeddings = paddle.add(embeddings, token_type_embeddings) + embeddings = self.layer_norm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings, self.word_embeddings_weights + + +class BertModel(Layer): + """ + The bare BERT Model transformer outputting raw hidden-states. + + This model refers to :class:`~paddlenlp.transformers.bert.BertModel`. + + Args: + config (IpuBertConfig): + configuration of bert. + custom_ops: + custom defined operators which can be found in directory `custom_ops`. 
+ """ + + def __init__(self, config, custom_ops=None): + super(BertModel, self).__init__() + self.config = config + self.custom_ops = custom_ops + + qk_scale = 1 / np.sqrt(self.config.hidden_size / + self.config.num_hidden_layers) + self.qk_scale_attrs = { + 'name': 'QK_scale', + 'shape': [1], + 'dtype': 'float32', + 'value': qk_scale, + } + self.qkv_shape = [-1, self.config.seq_len, 12, 64] + self.masks = {} + + self.embedding = IpuBertEmbeddings(self.config, custom_ops) + + def _encoder_layer_ipu_offset(self, layer_index): + encoder_index = 0 + if len(self.config.layers_per_ipu) == 1: + encoder_index = layer_index // self.config.layers_per_ipu[0] + else: + for ipu, num_layers in enumerate(self.config.layers_per_ipu): + layer_index -= num_layers + if layer_index < 0: + encoder_index = ipu + break + return encoder_index + + def should_checkpoint(self, layer_index): + encoder_index = self._encoder_layer_ipu_offset(layer_index) + if len(self.config.layers_per_ipu) == 1: + layers = self.config.layers_per_ipu[0] + layer_index -= encoder_index * layers + else: + layers = self.config.layers_per_ipu[encoder_index] + layer_index -= sum(self.config.layers_per_ipu[:encoder_index]) + return layer_index < (layers - 1) + + def forward(self, indices, segments, positions, input_mask): + r''' + The BertModel forward method, overrides the `__call__()` special method. + + Args: + indices (Tensor): + Indices of input sequence tokens in the vocabulary. They are + numerical representations of tokens that build the input sequence. + Its data type should be `int32` and it has a shape of [batch_size * sequence_length]. + segments (Tensor): + Segment token indices to indicate different portions of the inputs. + Selected in the range ``[0, type_vocab_size - 1]``. + Its data type should be `int32` and it has a shape of [batch_size * sequence_length]. + positions(Tensor): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + max_position_embeddings - 1]``. + Shape as `[batch_size * sequence_length]` and dtype as int32. + input_mask (Tensor, optional): + Mask used in multi-head attention to avoid performing attention on to some unwanted positions, + usually the paddings or the subsequent positions. + If the task is PRETRAINING: + input_mask[0] is the index that masking starts in the mask_tokens + input_mask[1] is the index that masking starts in the rest of the sequence + Otherwise + input_mask is the mask tensor that has -1000 in positions to be masked and 0 otherwise. + + Returns: + tuple: Returns tuple (`sequence_output`, `word_embeddings_weights`). + + With the fields: + + - `sequence_output` (Tensor): + Sequence of hidden-states at the last layer of the model. + It's data type should be float32 and its shape is [batch_size, sequence_length, hidden_size]. 
+ ''' + + with self.config.embeddings_scope: + sequence_output, word_embeddings_weights = self.embedding( + indices, segments, positions) + + if self.config.task == "PRETRAINING": + with paddle.static.ipu_shard_guard(index=0, stage=0): + input_mask[0] = self.custom_ops.detach(input_mask[0]) + input_mask[1] = self.custom_ops.detach(input_mask[1]) + + for i in range(self.config.num_hidden_layers): + # Attention + attn_scope = self.config.attn_scopes[i] + with attn_scope: + with paddle.static.name_scope(f"Layer{i}/Attention"): + layer_input = sequence_output + q = self.create_parameter( + shape=[ + self.config.hidden_size, self.config.hidden_size + ], + dtype="float32") + k = self.create_parameter( + shape=[ + self.config.hidden_size, self.config.hidden_size + ], + dtype="float32") + v = self.create_parameter( + shape=[ + self.config.hidden_size, self.config.hidden_size + ], + dtype="float32") + qkv = paddle.concat([q, k, v], axis=1) + qkv = paddle.matmul(sequence_output, qkv) + qkv.block.ops[-1]._set_attr( + '__available_memory', + self.config.available_mem_proportion) + q, k, v = paddle.split( + qkv, + num_or_sections=[ + self.config.hidden_size, self.config.hidden_size, + self.config.hidden_size + ], + axis=1) + q = paddle.reshape(q, self.qkv_shape) + q = paddle.transpose(q, [0, 2, 1, 3]) + k = paddle.reshape(k, self.qkv_shape) + k = paddle.transpose(k, [0, 2, 3, 1]) + v = paddle.reshape(v, self.qkv_shape) + v = paddle.transpose(v, [0, 2, 1, 3]) + + # Attention calculation + with paddle.static.name_scope(f"Z"): + if self.config.task == "PRETRAINING": + if attn_scope.index in self.masks: + final_mask = self.masks[attn_scope.index] + else: + with paddle.static.name_scope("Mask"): + base_value = np.arange( + self.config.seq_len).astype('int32') + base = paddle.fluid.layers.assign( + base_value) + mmask = paddle.less_than(base, + input_mask[0]) + mask_value = np.greater_equal( + base_value, + self.config.max_predictions_per_seq) + mask = paddle.fluid.layers.assign( + mask_value) + mmask = paddle.logical_or(mmask, mask) + smask = paddle.less_than(base, + input_mask[1]) + final_mask = paddle.logical_and(mmask, + smask) + final_mask = paddle.cast(final_mask, + "float16") + sub_attrs = { + 'name': 'constant_sub', + 'shape': [1], + 'dtype': 'float32', + 'value': 1, + } + mul_attrs = { + 'name': 'constant_mul', + 'shape': [1], + 'dtype': 'float32', + 'value': 1000, + } + final_mask = paddle.fluid.layers.elementwise_sub( + final_mask, + paddle.fluid.layers.fill_constant( + **sub_attrs)) + final_mask = paddle.fluid.layers.elementwise_mul( + final_mask, + paddle.fluid.layers.fill_constant( + **mul_attrs)) + final_mask = paddle.reshape( + final_mask, + [-1, 1, 1, self.config.seq_len]) + final_mask = self.custom_ops.detach( + final_mask) + self.masks[attn_scope.index] = final_mask + + qk = paddle.matmul(q, k) + qk.block.ops[-1]._set_attr( + '__available_memory', + self.config.available_mem_proportion) + qk_scale = paddle.fluid.layers.fill_constant( + **self.qk_scale_attrs) + qk = paddle.fluid.layers.elementwise_mul(qk, qk_scale) + + if self.config.task == "PRETRAINING": + qk = paddle.fluid.layers.elementwise_add(qk, + final_mask) + else: + # for SQUAD task, input_mask is calculated in data preprocessing + qk = paddle.fluid.layers.elementwise_add(qk, + input_mask) + + qk = paddle.fluid.layers.softmax(qk) + if self.config.task == "SQUAD": + qk = paddle.fluid.layers.dropout( + qk, + self.config.attention_probs_dropout_prob, + dropout_implementation='upscale_in_train') + qkv = paddle.matmul(qk, v) + 
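+                        # '__available_memory' caps the proportion of IPU tile memory
+                        # the preceding matmul may use for partial results
+                        # (see IpuBertConfig.available_mem_proportion).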
qkv.block.ops[-1]._set_attr( + '__available_memory', + self.config.available_mem_proportion) + qkv = paddle.transpose(qkv, [0, 2, 1, 3]) + qkv = paddle.reshape(qkv, [-1, self.config.hidden_size]) + + qkv_linear = nn.Linear( + self.config.hidden_size, + self.config.hidden_size, + bias_attr=False) + qkv = qkv_linear(qkv) + qkv.block.ops[-1]._set_attr( + '__available_memory', + self.config.available_mem_proportion) + qkv = paddle.fluid.layers.dropout( + qkv, + self.config.attention_probs_dropout_prob, + dropout_implementation='upscale_in_train') + attention = paddle.add(layer_input, qkv) + layer_norm1 = nn.LayerNorm( + self.config.hidden_size, epsilon=0.001) + attention = layer_norm1(attention) + + # FF + with self.config.ff_scopes[i]: + with paddle.static.name_scope(f"Layer{i}/FF"): + ff_linear1 = nn.Linear(self.config.hidden_size, + 4 * self.config.hidden_size) + ff_linear2 = nn.Linear(4 * self.config.hidden_size, + self.config.hidden_size) + with paddle.static.name_scope(f"1"): + ff = ff_linear1(attention) + ff.block.ops[-2]._set_attr( + '__available_memory', + self.config.available_mem_proportion) + ff = paddle.fluid.layers.gelu(ff, approximate=True) + with paddle.static.name_scope(f"2"): + ff = ff_linear2(ff) + ff.block.ops[-2]._set_attr( + '__available_memory', + self.config.available_mem_proportion) + ff = paddle.fluid.layers.dropout( + ff, + self.config.attention_probs_dropout_prob, + dropout_implementation='upscale_in_train') + ff = paddle.add(attention, ff) + layer_norm2 = nn.LayerNorm( + self.config.hidden_size, epsilon=0.001) + sequence_output = layer_norm2(ff) + + if self.should_checkpoint(i): + with paddle.static.name_scope(f"Layer{i}"): + logging.info(f'add checkpointoutput for ff_{i}') + sequence_output = self.custom_ops.checkpointoutput( + sequence_output) + return sequence_output, word_embeddings_weights + + +class IpuBertForQuestionAnswering(Layer): + """ + Bert Model with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and + `span end logits`). + + Args: + hidden_size (int): + Dimensionality of the embedding layer, encoder layer and pooler layer. Defaults to `768`. + seq_len (int): + See :class:`IpuBertConfig`. + """ + + def __init__(self, hidden_size, seq_len): + super(IpuBertForQuestionAnswering, self).__init__() + self.hidden_size = hidden_size + self.seq_len = seq_len + self.classifier = nn.Linear(hidden_size, 2) + + def forward(self, sequence_output): + r""" + The IpuBertForQuestionAnswering forward method, overrides the __call__() special method. + + Args: + sequence_output (Tensor): + See :class:`BertModel`. + + Returns: + tuple: Returns tuple (`start_logits`, `end_logits`). + + With the fields: + + - `start_logits` (Tensor): + A tensor of the input token classification logits, indicates the start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + - `end_logits` (Tensor): + A tensor of the input token classification logits, indicates the end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. 
+ """ + logits = self.classifier(sequence_output) + + start_logits = paddle.slice( + input=logits, axes=[1], starts=[0], ends=[1]) + end_logits = paddle.slice(input=logits, axes=[1], starts=[1], ends=[2]) + + start_logits = paddle.reshape(start_logits, [-1, self.seq_len]) + end_logits = paddle.reshape(end_logits, [-1, self.seq_len]) + return start_logits, end_logits + + +class IpuBertQAAccAndLoss(paddle.nn.Layer): + """ + Criterion for Question and Answering. + """ + + def __init__(self, custom_ops=None): + super(IpuBertQAAccAndLoss, self).__init__() + self.custom_ops = custom_ops + + def forward(self, start_logits, end_logits, start_labels, end_labels): + r""" + The IpuBertQAAccAndLoss forward method, overrides the __call__() special method. + + Args: + start_logits (Tensor): + See :class:`IpuBertForQuestionAnswering`. + end_logits (Tensor): + See :class:`IpuBertForQuestionAnswering`. + start_labels (Tensor): + Labels for start position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + end_labels (Tensor): + Labels for end position of the labelled span. + Its data type should be float32 and its shape is [batch_size, sequence_length]. + + """ + with paddle.static.name_scope("loss"): + start_loss = paddle.fluid.layers.softmax(start_logits) + start_loss = self.custom_ops.custom_nll_loss( + start_loss, start_labels, 1, "None", False) + end_loss = paddle.fluid.layers.softmax(end_logits) + end_loss = self.custom_ops.custom_nll_loss(end_loss, end_labels, 1, + "None", False) + loss = paddle.add(start_loss, end_loss) + + with paddle.static.name_scope("acc"): + start_logits = paddle.fluid.layers.argmax(start_logits, axis=1) + end_logits = paddle.fluid.layers.argmax(end_logits, axis=1) + start_equal = paddle.fluid.layers.equal(start_logits, start_labels) + end_equal = paddle.fluid.layers.equal(end_logits, end_labels) + start_equal = paddle.fluid.layers.cast(start_equal, 'float32') + end_equal = paddle.fluid.layers.cast(end_equal, 'float32') + start_acc = paddle.mean(start_equal) + end_acc = paddle.mean(end_equal) + + return start_acc, end_acc, loss + + +class IpuBertPretrainingMLMHeads(Layer): + """ + Perform language modeling task. + + Args: + hidden_size (int): + See :class:`IpuBertConfig`. + vocab_size (int): + See :class:`IpuBertConfig`. + max_position_embeddings (int): + See :class:`IpuBertConfig`. + max_predictions_per_seq (int): + See :class:`IpuBertConfig`. + seq_len (int): + See :class:`IpuBertConfig`. 
+ """ + + def __init__(self, hidden_size, vocab_size, max_position_embeddings, + max_predictions_per_seq, seq_len): + super(IpuBertPretrainingMLMHeads, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.max_predictions_per_seq = max_predictions_per_seq + self.sequence_length = seq_len + self.transform = nn.Linear(hidden_size, hidden_size) + self.layer_norm = nn.LayerNorm(hidden_size, epsilon=0.001) + + def forward(self, encoders_output, word_embeddings_weights): + # cls + out = self.transform(encoders_output) + out = paddle.fluid.layers.gelu(out, approximate=True) + out = self.layer_norm(out) + + # mlm + out = paddle.reshape(out, [-1, self.sequence_length, self.hidden_size]) + out = paddle.slice(out, [1], [0], [self.max_predictions_per_seq]) + out = paddle.reshape(out, [-1, self.hidden_size]) + + # serialized matmul + out = paddle.matmul(out, word_embeddings_weights) + out.block.ops[-1]._set_attr('serialize_factor', 5) + mlm_out = paddle.reshape( + out, [-1, self.max_predictions_per_seq, self.vocab_size]) + + return mlm_out + + +class IpuBertPretrainingNSPHeads(Layer): + """ + Perform next sequence classification task. + + Args: + hidden_size (int): + See :class:`IpuBertConfig`. + max_predictions_per_seq (int): + See :class:`IpuBertConfig`. + seq_len (int): + See :class:`IpuBertConfig`. + """ + + def __init__(self, hidden_size, max_predictions_per_seq, seq_len): + super(IpuBertPretrainingNSPHeads, self).__init__() + self.hidden_size = hidden_size + self.max_predictions_per_seq = max_predictions_per_seq + self.seq_len = seq_len + self.seq_relationship = nn.Linear(hidden_size, 2) + self.pooler = IpuBertPooler(hidden_size, self.seq_len, + self.max_predictions_per_seq) + + def forward(self, encoders_output): + pooled_output = self.pooler(encoders_output) + nsp_out = self.seq_relationship(pooled_output) + return nsp_out + + +class IpuBertPooler(Layer): + """ + Pool the result of BertEncoder. + """ + + def __init__(self, + hidden_size, + sequence_length, + max_predictions_per_seq, + pool_act="tanh"): + super(IpuBertPooler, self).__init__() + self.dense = nn.Linear(hidden_size, hidden_size) + self.activation = nn.Tanh() + self.pool_act = pool_act + self.sequence_length = sequence_length + self.max_predictions_per_seq = max_predictions_per_seq + self.hidden_size = hidden_size + + def forward(self, hidden_states): + hidden_states = paddle.reshape( + hidden_states, [-1, self.sequence_length, self.hidden_size]) + first_token_tensor = paddle.slice( + input=hidden_states, + axes=[1], + starts=[self.max_predictions_per_seq], + ends=[self.max_predictions_per_seq + 1]) + first_token_tensor = paddle.reshape(first_token_tensor, + [-1, self.hidden_size]) + pooled_output = self.dense(first_token_tensor) + if self.pool_act == "tanh": + pooled_output = self.activation(pooled_output) + return pooled_output + + +class IpuBertPretrainingMLMAccAndLoss(Layer): + """ + Criterion for masked language modeling. 
+ """ + + def __init__(self, micro_batch, ignore_index, custom_ops): + super(IpuBertPretrainingMLMAccAndLoss, self).__init__() + self.micro_batch = micro_batch + self.ignore_index = ignore_index + self.custom_ops = custom_ops + + def forward(self, mlm, masked_lm_ids): + mlm_pred = paddle.fluid.layers.argmax(mlm, axis=-1) + mlm_pred = paddle.cast(mlm_pred, "int32") + with paddle.static.name_scope("Accuracy"): + mlm_label = paddle.cast(masked_lm_ids, "int32") + mlm_correct = paddle.fluid.layers.equal(mlm_pred, mlm_label) + attrs = { + 'name': 'mlm_mask_val', + 'shape': [1], + 'dtype': 'int32', + 'value': self.ignore_index, + } + mlm_mask_val = paddle.fluid.layers.fill_constant(**attrs) + mlm_unmask = paddle.fluid.layers.equal(mlm_label, mlm_mask_val) + mlm_mask = paddle.logical_not(mlm_unmask) + mlm_mask = paddle.cast(mlm_mask, "float32") + mlm_correct = paddle.cast(mlm_correct, "float32") + masked_mlm_correct = paddle.fluid.layers.elementwise_mul( + mlm_correct, mlm_mask) + total_correct_tokens = paddle.fluid.layers.reduce_sum( + masked_mlm_correct) + total_tokens = paddle.fluid.layers.reduce_sum(mlm_mask) + total_correct_tokens = paddle.cast(total_correct_tokens, "float32") + total_tokens = paddle.cast(total_tokens, "float32") + mlm_acc = paddle.fluid.layers.elementwise_div(total_correct_tokens, + total_tokens) + + masked_lm_softmax = paddle.fluid.layers.softmax(mlm) + mlm_loss = self.custom_ops.custom_nll_loss( + masked_lm_softmax, masked_lm_ids, 1, str(self.ignore_index), False) + + return mlm_acc, mlm_loss + + +class IpuBertPretrainingNSPAccAndLoss(Layer): + """ + Criterion for next sequence classification. + """ + + def __init__(self, micro_batch, ignore_index, custom_ops): + super(IpuBertPretrainingNSPAccAndLoss, self).__init__() + self.micro_batch = micro_batch + self.ignore_index = ignore_index + self.custom_ops = custom_ops + + def forward(self, nsp, nsp_label): + nsp_pred = paddle.fluid.layers.argmax(nsp, axis=-1) + nsp_pred = paddle.cast(nsp_pred, "int32") + with paddle.static.name_scope("Accuracy"): + nsp_label = paddle.cast(nsp_label, "int32") + nsp_correct = paddle.fluid.layers.equal(nsp_pred, nsp_label) + nsp_correct = paddle.cast(nsp_correct, "int32") + nsp_correct = paddle.fluid.layers.reduce_sum(nsp_correct) + nsp_correct = paddle.cast(nsp_correct, "float32") + attrs = { + 'name': 'mlm_mask_val', + 'shape': [1], + 'dtype': 'int32', + 'value': self.micro_batch, + } + nsp_total = paddle.fluid.layers.fill_constant(**attrs) + nsp_total = paddle.cast(nsp_total, "float32") + nsp_acc = paddle.fluid.layers.elementwise_div(nsp_correct, + nsp_total) + + next_sentence_softmax = paddle.fluid.layers.softmax(nsp) + nsp_loss = self.custom_ops.custom_nll_loss(next_sentence_softmax, + nsp_label, 1, "None", False) + + return nsp_acc, nsp_loss diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/requirements.txt b/nlp/text_classification/bert/paddlepaddle/static_ipu/requirements.txt new file mode 100644 index 000000000..43dee67df --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/requirements.txt @@ -0,0 +1,8 @@ +datasets +h5py +multiprocess +numpy +paddlenlp +scipy +wandb +tqdm diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/run_pretrain.py b/nlp/text_classification/bert/paddlepaddle/static_ipu/run_pretrain.py new file mode 100644 index 000000000..107247c68 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/run_pretrain.py @@ -0,0 +1,410 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import pickle +import random +import time + +import numpy as np +import paddle +import paddle.optimizer +import paddle.static +from paddlenlp.transformers import LinearDecayWithWarmup +from scipy.stats import truncnorm + +from dataset_ipu import PretrainingHDF5DataLoader +from modeling import ( + BertModel, DeviceScope, IpuBertConfig, IpuBertPretrainingMLMAccAndLoss, + IpuBertPretrainingMLMHeads, IpuBertPretrainingNSPAccAndLoss, + IpuBertPretrainingNSPHeads) +from utils import load_custom_ops, parse_args, ProgressBar, ProgressFunc + + +def set_seed(seed): + """ + Use the same data seed(for data shuffle) for all procs to guarantee data + consistency after sharding. + """ + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + +def create_data_holder(args): + bs = args.micro_batch_size + indices = paddle.static.data( + name="indices", shape=[bs * args.seq_len], dtype="int32") + segments = paddle.static.data( + name="segments", shape=[bs * args.seq_len], dtype="int32") + positions = paddle.static.data( + name="positions", shape=[bs * args.seq_len], dtype="int32") + mask_tokens_mask_idx = paddle.static.data( + name="mask_tokens_mask_idx", shape=[bs, 1], dtype="int32") + sequence_mask_idx = paddle.static.data( + name="sequence_mask_idx", shape=[bs, 1], dtype="int32") + masked_lm_ids = paddle.static.data( + name="masked_lm_ids", + shape=[bs, args.max_predictions_per_seq], + dtype="int32") + next_sentence_labels = paddle.static.data( + name="next_sentence_labels", shape=[bs], dtype="int32") + return [ + indices, segments, positions, mask_tokens_mask_idx, sequence_mask_idx, + masked_lm_ids, next_sentence_labels + ] + + +def reset_program_state_dict(state_dict, mean=0, scale=0.02): + """ + Initialize the parameter from the bert config, and set the parameter by + reseting the state dict." 
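+    LayerNorm scales are set to ones and biases to zeros; all other weights are
+    drawn from a truncated normal with the given mean and scale. Optimizer
+    moments/accumulators and the learning rate variable are left untouched.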
+ """ + new_state_dict = dict() + for n, p in state_dict.items(): + if n.endswith('_moment1_0') or n.endswith('_moment2_0') \ + or n.endswith('_beta2_pow_acc_0') or n.endswith('_beta1_pow_acc_0'): + continue + if 'learning_rate' in n: + continue + + dtype_str = "float32" + if p._dtype == paddle.float64: + dtype_str = "float64" + + if "layer_norm" in n and n.endswith('.w_0'): + new_state_dict[n] = np.ones(p.shape()).astype(dtype_str) + continue + + if n.endswith('.b_0'): + new_state_dict[n] = np.zeros(p.shape()).astype(dtype_str) + else: + new_state_dict[n] = truncnorm.rvs(-2, + 2, + loc=mean, + scale=scale, + size=p.shape()).astype(dtype_str) + return new_state_dict + + +def create_ipu_strategy(args): + ipu_strategy = paddle.static.IpuStrategy() + options = { + 'is_training': args.is_training, + 'enable_manual_shard': True, + 'enable_pipelining': True, + 'batches_per_step': args.batches_per_step, + 'micro_batch_size': args.micro_batch_size, + 'loss_scaling': args.scale_loss, + 'enable_replicated_graphs': True, + 'replicated_graph_count': args.num_replica, + 'num_ipus': args.num_ipus * args.num_replica, + 'enable_gradient_accumulation': args.enable_grad_acc, + 'accumulation_factor': args.grad_acc_factor, + 'auto_recomputation': 3, + 'enable_half_partial': True, + 'available_memory_proportion': args.available_mem_proportion, + 'enable_stochastic_rounding': True, + 'max_weight_norm': 65504.0, + 'default_prefetch_buffering_depth': 3, + 'rearrange_anchors_on_host': False, + 'enable_fp16': args.ipu_enable_fp16, + 'random_seed': args.seed, + 'use_no_bias_optimizer': True, + 'enable_prefetch_datastreams': True, + 'enable_outlining': True, + 'subgraph_copying_strategy': 1, # JustInTime + 'outline_threshold': 10.0, + 'disable_grad_accumulation_tensor_streams': True, + 'schedule_non_weight_update_gradient_consumers_early': True, + 'cache_path': 'paddle_cache', + 'enable_floating_point_checks': False, + 'accl1_type': args.accl1_type, + 'accl2_type': args.accl2_type, + 'weight_decay_mode': args.weight_decay_mode, + } + + if not args.optimizer_state_offchip: + options['location_optimizer'] = { + 'on_chip': 1, # popart::TensorStorage::OnChip + 'use_replicated_tensor_sharding': + 1, # popart::ReplicatedTensorSharding::On + } + + # use popart::AccumulateOuterFragmentSchedule::OverlapMemoryOptimized + # excludedVirtualGraphs = [0] + options['accumulate_outer_fragment'] = {3: [0]} + + options['convolution_options'] = {"partialsType": "half"} + options['engine_options'] = { + "opt.useAutoloader": "true", + "target.syncReplicasIndependently": "true", + "exchange.streamBufferOverlap": "hostRearrangeOnly", + } + + options['enable_engine_caching'] = args.enable_engine_caching + + options['compilation_progress_logger'] = ProgressFunc + + ipu_strategy.set_options(options) + + # enable custom patterns + ipu_strategy.enable_pattern('DisableAttnDropoutBwdPattern') + + return ipu_strategy + + +def main(args): + paddle.enable_static() + place = paddle.set_device('ipu') + set_seed(args.seed) + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + + # The sharding of encoder layers + if args.num_hidden_layers == 12: + attn_index = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] + ff_index = [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3] + else: + raise Exception("Only support num_hidden_layers = 12") + + bert_config = { + k: getattr(args, k) + for k in IpuBertConfig._fields if hasattr(args, k) + } + bert_config['embeddings_scope'] = DeviceScope(0, 0, "Embedding") + 
bert_config['attn_scopes'] = [ + DeviceScope(attn_index[i], attn_index[i]) + for i in range(args.num_hidden_layers) + ] + bert_config['ff_scopes'] = [ + DeviceScope(ff_index[i], ff_index[i]) + for i in range(args.num_hidden_layers) + ] + bert_config['mlm_scope'] = DeviceScope(0, args.num_ipus, "MLM") + bert_config['nsp_scope'] = DeviceScope(0, args.num_ipus, "NSP") + bert_config['layers_per_ipu'] = [4, 4, 4] + + config = IpuBertConfig(**bert_config) + + # custom_ops + custom_ops = load_custom_ops() + + # Load the training dataset + logging.info("Loading dataset") + input_files = [ + os.path.join(args.input_files, f) for f in os.listdir(args.input_files) + if os.path.isfile(os.path.join(args.input_files, f)) and "training" in f + ] + input_files.sort() + + dataset = PretrainingHDF5DataLoader( + input_files=input_files, + max_seq_length=args.seq_len, + max_mask_tokens=args.max_predictions_per_seq, + batch_size=args.batch_size, + shuffle=args.shuffle) + logging.info(f"dataset length: {len(dataset)}") + total_samples = dataset.total_samples + logging.info("total samples: %d, total batch_size: %d, max steps: %d" % + (total_samples, args.batch_size, args.max_steps)) + + logging.info("Building Model") + + [ + indices, segments, positions, mask_tokens_mask_idx, sequence_mask_idx, + masked_lm_ids, next_sentence_labels + ] = create_data_holder(args) + + # Encoder Layers + bert_model = BertModel(config, custom_ops) + encoders, word_embedding = bert_model( + indices, segments, positions, + [mask_tokens_mask_idx, sequence_mask_idx]) + + # PretrainingHeads + mlm_heads = IpuBertPretrainingMLMHeads( + args.hidden_size, args.vocab_size, args.max_position_embeddings, + args.max_predictions_per_seq, args.seq_len) + nsp_heads = IpuBertPretrainingNSPHeads( + args.hidden_size, args.max_predictions_per_seq, args.seq_len) + + # AccAndLoss + nsp_criterion = IpuBertPretrainingNSPAccAndLoss( + args.micro_batch_size, args.ignore_index, custom_ops) + mlm_criterion = IpuBertPretrainingMLMAccAndLoss( + args.micro_batch_size, args.ignore_index, custom_ops) + + with config.nsp_scope: + nsp_out = nsp_heads(encoders) + nsp_acc, nsp_loss = nsp_criterion(nsp_out, next_sentence_labels) + + with config.mlm_scope: + mlm_out = mlm_heads(encoders, word_embedding) + mlm_acc, mlm_loss, = mlm_criterion(mlm_out, masked_lm_ids) + total_loss = mlm_loss + nsp_loss + + # lr_scheduler + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, args.max_steps, + args.warmup_steps) + # optimizer + optimizer = paddle.optimizer.Lamb( + learning_rate=lr_scheduler, + lamb_weight_decay=args.weight_decay, + beta1=args.beta1, + beta2=args.beta2, + epsilon=args.adam_epsilon) + optimizer.minimize(total_loss) + + # Static executor + exe = paddle.static.Executor(place) + exe.run(startup_program) + + # Set initial weights + state_dict = main_program.state_dict() + reset_state_dict = reset_program_state_dict(state_dict) + paddle.static.set_program_state(main_program, reset_state_dict) + + if args.enable_load_params: + logging.info(f'loading weights from: {args.load_params_path}') + if not args.load_params_path.endswith('pdparams'): + raise Exception('need pdparams file') + with open(args.load_params_path, 'rb') as file: + params = pickle.load(file) + paddle.static.set_program_state(main_program, params) + + # Create ipu_strategy + ipu_strategy = create_ipu_strategy(args) + + feed_list = [ + "indices", + "segments", + "positions", + "mask_tokens_mask_idx", + "sequence_mask_idx", + "masked_lm_ids", + "next_sentence_labels", + ] + fetch_list = [mlm_acc.name, 
mlm_loss.name, nsp_acc.name, nsp_loss.name] + + # Compile program for IPU + ipu_compiler = paddle.static.IpuCompiledProgram( + main_program, ipu_strategy=ipu_strategy) + logging.info(f'start compiling, please wait some minutes') + cur_time = time.time() + main_program = ipu_compiler.compile(feed_list, fetch_list) + time_cost = time.time() - cur_time + logging.info(f'finish compiling! time cost: {time_cost}') + + batch_start = time.time() + global_step = 0 + for batch in dataset: + global_step += 1 + epoch = global_step * args.batch_size // total_samples + read_cost = time.time() - batch_start + + feed = { + "indices": batch[0], + "segments": batch[1], + "positions": batch[2], + "mask_tokens_mask_idx": batch[3], + "sequence_mask_idx": batch[4], + "masked_lm_ids": batch[5], + "next_sentence_labels": batch[6], + } + lr_scheduler.step() + + train_start = time.time() + loss_return = exe.run(main_program, + feed=feed, + fetch_list=fetch_list, + use_program_cache=True) + train_cost = time.time() - train_start + total_cost = time.time() - batch_start + tput = args.batch_size / total_cost + + if args.wandb: + wandb.log({ + "epoch": epoch, + "global_step": global_step, + "loss/MLM": np.mean(loss_return[1]), + "loss/NSP": np.mean(loss_return[3]), + "accuracy/MLM": np.mean(loss_return[0]), + "accuracy/NSP": np.mean(loss_return[2]), + "latency/read": read_cost, + "latency/train": train_cost, + "latency/e2e": total_cost, + "throughput": tput, + "learning_rate": lr_scheduler(), + }) + + if global_step % args.logging_steps == 0: + logging.info({ + "epoch": epoch, + "global_step": global_step, + "loss/MLM": np.mean(loss_return[1]), + "loss/NSP": np.mean(loss_return[3]), + "accuracy/MLM": np.mean(loss_return[0]), + "accuracy/NSP": np.mean(loss_return[2]), + "latency/read": read_cost, + "latency/train": train_cost, + "latency/e2e": total_cost, + "throughput": tput, + "learning_rate": lr_scheduler(), + }) + + if global_step % args.save_steps == 0: + ipu_compiler._backend.weights_to_host() + paddle.static.save(main_program.org_program, + os.path.join(args.output_dir, + 'step_{}'.format(global_step))) + + if global_step >= args.max_steps: + ipu_compiler._backend.weights_to_host() + paddle.static.save( + main_program.org_program, + os.path.join(args.output_dir, + 'final_step_{}'.format(global_step))) + dataset.release() + del dataset + return + + batch_start = time.time() + + +if __name__ == "__main__": + args = parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt='%Y-%m-%d %H:%M:%S %a') + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + + if args.wandb: + import wandb + wandb.init( + project="paddle-base-bert", + settings=wandb.Settings(console='off'), + name='paddle-base-bert') + wandb_config = vars(args) + wandb_config["global_batch_size"] = args.batch_size + wandb.config.update(args) + + logging.info(args) + main(args) + logging.info("program finished") diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/run_squad.py b/nlp/text_classification/bert/paddlepaddle/static_ipu/run_squad.py new file mode 100644 index 000000000..e4f516e92 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/run_squad.py @@ -0,0 +1,516 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import os +import pickle +import time +from functools import partial + +import numpy as np +import paddle +import paddle.optimizer +import paddle.static +from datasets import load_dataset +from paddle.io import BatchSampler, DataLoader +from paddlenlp.data import Dict, Stack +from paddlenlp.metrics.squad import compute_prediction, squad_evaluate +from paddlenlp.transformers import BertTokenizer, LinearDecayWithWarmup + +from modeling import (BertModel, DeviceScope, IpuBertConfig, + IpuBertForQuestionAnswering, IpuBertQAAccAndLoss) +from run_pretrain import (create_ipu_strategy, reset_program_state_dict, + set_seed) +from utils import load_custom_ops, parse_args + + +def create_data_holder(args): + bs = args.micro_batch_size + indices = paddle.static.data( + name="indices", shape=[bs * args.seq_len], dtype="int32") + segments = paddle.static.data( + name="segments", shape=[bs * args.seq_len], dtype="int32") + positions = paddle.static.data( + name="positions", shape=[bs * args.seq_len], dtype="int32") + input_mask = paddle.static.data( + name="input_mask", shape=[bs, 1, 1, args.seq_len], dtype="float32") + if not args.is_training: + return [indices, segments, positions, input_mask] + else: + start_labels = paddle.static.data( + name="start_labels", shape=[bs], dtype="int32") + end_labels = paddle.static.data( + name="end_labels", shape=[bs], dtype="int32") + return [ + indices, segments, positions, input_mask, start_labels, end_labels + ] + + +def prepare_train_features(examples, tokenizer, args): + # Some of the questions have lots of whitespace on the left, which is not useful and will make the + # truncation of the context fail (the tokenized question will take a lots of space). So we remove that + # left whitespace + contexts = examples['context'] + questions = examples['question'] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + tokenized_examples = tokenizer( + questions, + contexts, + stride=128, + max_seq_len=args.seq_len, + pad_to_max_seq_len=True, + return_position_ids=True, + return_token_type_ids=True, + return_attention_mask=True, + return_length=True) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample") + # The offset mappings will give us a map from token to character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + tokenized_examples["input_mask"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. 
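+        # The block below: (1) locates the CLS token as the fallback label for
+        # unanswerable spans, (2) converts the 0/1 attention_mask into an
+        # additive mask (0 for real tokens, -1e3 for padding) shaped
+        # [1, 1, seq_len] for the encoder, and (3) maps the character-level
+        # answer span onto token-level start/end positions, falling back to the
+        # CLS index when the answer is missing or lies outside this feature.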
+ input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(tokenizer.cls_token_id) + + sequence_ids = tokenized_examples['token_type_ids'][i] + + # attention_mask to input_mask + input_mask = ( + np.asarray(tokenized_examples["attention_mask"][i]) - 1) * 1e3 + input_mask = np.expand_dims(input_mask, axis=(0, 1)) + if args.ipu_enable_fp16: + input_mask = input_mask.astype(np.float16) + else: + input_mask = input_mask.astype(np.float32) + tokenized_examples["input_mask"].append(input_mask) + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples['answers'][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != 1: + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != 1: + token_end_index -= 1 + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). + if not (offsets[token_start_index][0] <= start_char and + offsets[token_end_index][1] >= end_char): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). + while token_start_index < len(offsets) and offsets[ + token_start_index][0] <= start_char: + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - + 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + +def prepare_validation_features(examples, tokenizer, args): + # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results + # in one example possible giving several features when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is + # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. + contexts = examples['context'] + questions = examples['question'] + tokenized_examples = tokenizer( + questions, + contexts, + stride=128, + max_seq_len=args.seq_len, + pad_to_max_seq_len=True, + return_position_ids=True, + return_token_type_ids=True, + return_attention_mask=True, + return_length=True) + + # Since one example might give us several features if it has a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. 
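+    # Unlike prepare_train_features, no start/end labels are produced here:
+    # example ids and (context-masked) offset mappings are kept instead, so the
+    # predicted logits can later be mapped back to answer text in the original
+    # context by compute_prediction.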
+ sample_mapping = tokenized_examples.pop("overflow_to_sample") + + # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the + # corresponding example_id and we will store the offset mappings. + tokenized_examples["example_id"] = [] + tokenized_examples["input_mask"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the context and what is the question). + input_ids = tokenized_examples["input_ids"][i] + sequence_A_lengths = input_ids.index(tokenizer.sep_token_id) + 2 + sequence_B_lengths = len(input_ids) - sequence_A_lengths + sequence_ids = [0] * sequence_A_lengths + [1] * sequence_B_lengths + context_index = 1 + + # One example can give several spans, this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token + # position is part of the context or not. + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + # attention_mask to input_mask + input_mask = ( + np.asarray(tokenized_examples["attention_mask"][i]) - 1) * 1e3 + input_mask = np.expand_dims(input_mask, axis=(0, 1)) + if args.ipu_enable_fp16: + input_mask = input_mask.astype(np.float16) + else: + input_mask = input_mask.astype(np.float32) + tokenized_examples["input_mask"].append(input_mask) + + return tokenized_examples + + +def load_squad_dataset(args): + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + features_fn = prepare_train_features if args.is_training else prepare_validation_features + if args.is_training: + raw_dataset = load_dataset('squad', split='train') + else: + raw_dataset = load_dataset('squad', split='validation') + column_names = raw_dataset.column_names + dataset = raw_dataset.map(partial( + features_fn, tokenizer=tokenizer, args=args), + batched=True, + remove_columns=column_names, + num_proc=4) + + bs = args.micro_batch_size * args.grad_acc_factor * args.batches_per_step * args.num_replica + args.batch_size = bs + if args.is_training: + train_batch_sampler = BatchSampler( + dataset, batch_size=bs, shuffle=args.shuffle, drop_last=True) + else: + train_batch_sampler = BatchSampler( + dataset, batch_size=bs, shuffle=args.shuffle, drop_last=False) + + if args.is_training: + collate_fn = lambda samples, fn=Dict({ + "input_ids": Stack(), + "token_type_ids": Stack(), + "position_ids": Stack(), + "input_mask": Stack(), + "start_positions": Stack(), + "end_positions": Stack() + }): fn(samples) + else: + collate_fn = lambda samples, fn=Dict({ + "input_ids": Stack(), + "token_type_ids": Stack(), + "position_ids": Stack(), + "input_mask": Stack()}): fn(samples) + + data_loader = DataLoader( + dataset=dataset, + batch_sampler=train_batch_sampler, + collate_fn=collate_fn, + return_list=True) + return raw_dataset, data_loader + + +def main(args): + paddle.enable_static() + place = paddle.set_device('ipu') + set_seed(args.seed) + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + + # The sharding of encoder layers + if args.num_hidden_layers == 12: + attn_ipu_index = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + ff_ipu_index = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + else: + raise Exception("Only 
support num_hidden_layers = 12") + + bert_config = { + k: getattr(args, k) + for k in IpuBertConfig._fields if hasattr(args, k) + } + bert_config['embeddings_scope'] = DeviceScope(0, 0, "Embedding") + bert_config['attn_scopes'] = [ + DeviceScope(attn_ipu_index[i], attn_ipu_index[i]) + for i in range(args.num_hidden_layers) + ] + bert_config['ff_scopes'] = [ + DeviceScope(ff_ipu_index[i], ff_ipu_index[i]) + for i in range(args.num_hidden_layers) + ] + bert_config['layers_per_ipu'] = [6, 6] + + config = IpuBertConfig(**bert_config) + + # custom_ops + custom_ops = load_custom_ops() + + logging.info("building model") + + if args.is_training: + [indices, segments, positions, input_mask, start_labels, + end_labels] = create_data_holder(args) + else: + [indices, segments, positions, input_mask] = create_data_holder(args) + + # Encoder Layers + bert_model = BertModel(config, custom_ops) + encoders, _ = bert_model(indices, segments, positions, input_mask) + + squad_scope = DeviceScope(args.num_ipus - 1, args.num_ipus - 1, "squad") + with squad_scope: + qa_cls = IpuBertForQuestionAnswering(args.hidden_size, args.seq_len) + start_logits, end_logits = qa_cls(encoders) + + if args.is_training: + acc_loss = IpuBertQAAccAndLoss(custom_ops) + acc0, acc1, loss = acc_loss(start_logits, end_logits, start_labels, + end_labels) + + # load squad dataset + raw_dataset, data_loader = load_squad_dataset(args) + + total_samples = len(data_loader.dataset) + max_steps = total_samples // args.batch_size * args.epochs + logging.info("total samples: %d, total batch_size: %d, max steps: %d" % + (total_samples, args.batch_size, max_steps)) + + if args.is_training: + lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, + args.warmup_steps) + optimizer = paddle.optimizer.Adam( + learning_rate=lr_scheduler, + weight_decay=args.weight_decay, + beta1=args.beta1, + beta2=args.beta2, + epsilon=args.adam_epsilon) + optimizer.minimize(loss) + + # Static executor + exe = paddle.static.Executor(place) + exe.run(startup_program) + + # Set initial weights + state_dict = main_program.state_dict() + reset_state_dict = reset_program_state_dict(state_dict) + paddle.static.set_program_state(main_program, reset_state_dict) + + if args.enable_load_params: + logging.info(f'loading weights from: {args.load_params_path}') + if not args.load_params_path.endswith('pdparams'): + raise Exception('need pdparams file') + with open(args.load_params_path, 'rb') as file: + params = pickle.load(file) + # Delete mlm and nsp weights + if args.is_training and 'linear_72.w_0' in params: + params.pop("linear_72.w_0") + params.pop("linear_72.b_0") + paddle.static.set_program_state(main_program, params) + + if args.tf_checkpoint: + from load_tf_ckpt import load_initializers_from_tf + logging.info(f'loading weights from: {args.tf_checkpoint}') + initializers, _ = load_initializers_from_tf(args.tf_checkpoint, args) + paddle.static.set_program_state(main_program, initializers) + + # Create ipu_strategy + ipu_strategy = create_ipu_strategy(args) + + if args.is_training: + feed_list = [ + "indices", "segments", "positions", "input_mask", "start_labels", + "end_labels" + ] + fetch_list = [loss.name, acc0.name, acc1.name] + else: + feed_list = ["indices", "segments", "positions", "input_mask"] + fetch_list = [start_logits.name, end_logits.name] + + ipu_compiler = paddle.static.IpuCompiledProgram( + main_program, ipu_strategy=ipu_strategy) + logging.info(f'start compiling, please wait some minutes') + cur_time = time.time() + main_program = 
ipu_compiler.compile(feed_list, fetch_list) + time_cost = time.time() - cur_time + logging.info(f'finish compiling! time cost: {time_cost}') + + if args.is_training: + global_step = 0 + batch_start = time.time() + for epoch in range(args.epochs): + for batch in data_loader: + global_step += 1 + + feed = { + "indices": batch[0], + "segments": batch[1], + "positions": batch[2], + "input_mask": batch[3], + "start_labels": batch[4], + "end_labels": batch[5], + } + lr_scheduler.step() + + train_start = time.time() + outputs = exe.run(main_program, + feed=feed, + fetch_list=fetch_list, + use_program_cache=True) + train_cost = time.time() - train_start + total_cost = time.time() - batch_start + + tput = args.batch_size / total_cost + if args.wandb: + wandb.log({ + "epoch": epoch, + "global_step": global_step, + "loss": np.mean(outputs[0]), + "accuracy": np.mean(outputs[1:]), + "train_cost": train_cost, + "total_cost": total_cost, + "throughput": tput, + "learning_rate": lr_scheduler(), + }) + + if global_step % args.logging_steps == 0: + logging.info({ + "epoch": epoch, + "global_step": global_step, + "loss": np.mean(outputs[0]), + "accuracy": np.mean(outputs[1:]), + "train_cost": train_cost, + "total_cost": total_cost, + "throughput": tput, + "learning_rate": lr_scheduler(), + }) + + batch_start = time.time() + + # save final state + ipu_compiler._backend.weights_to_host() + paddle.static.save(main_program.org_program, + os.path.join(args.output_dir, 'Final_model')) + + if not args.is_training: + all_start_logits = [] + all_end_logits = [] + for step, batch in enumerate(data_loader): + if step % args.logging_steps == 0: + logging.info(f'running step: {step}') + + real_len = np.array(batch[0]).shape[0] + # padding zeros if needed + if real_len < args.batch_size: + batch = [np.asarray(x) for x in batch] + pad0 = np.zeros([args.batch_size - real_len, + args.seq_len]).astype(batch[0].dtype) + batch[0] = np.vstack((batch[0], pad0)) + batch[1] = np.vstack((batch[1], pad0)) + batch[2] = np.vstack((batch[2], pad0)) + pad1 = np.zeros( + [args.batch_size - real_len, 1, 1, args.seq_len]) - 1e3 + pad1 = pad1.astype(batch[3].dtype) + batch[3] = np.vstack((batch[3], pad1)) + + feed = { + "indices": batch[0], + "segments": batch[1], + "positions": batch[2], + "input_mask": batch[3], + } + start_logits, end_logits = exe.run(main_program, + feed=feed, + fetch_list=fetch_list) + + start_logits = start_logits.reshape([-1, args.seq_len]) + end_logits = end_logits.reshape([-1, args.seq_len]) + for idx in range(real_len): + all_start_logits.append(start_logits[idx]) + all_end_logits.append(end_logits[idx]) + + # evaluate results + all_predictions, all_nbest_json, scores_diff_json = compute_prediction( + raw_dataset, data_loader.dataset, + (all_start_logits, all_end_logits)) + squad_evaluate( + examples=[raw_data for raw_data in raw_dataset], + preds=all_predictions, + na_probs=scores_diff_json) + # write results to file + with open('squad_prediction.json', "w", encoding='utf-8') as writer: + writer.write( + json.dumps( + all_predictions, ensure_ascii=False, indent=4) + "\n") + + +if __name__ == "__main__": + args = parse_args() + + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt='%Y-%m-%d %H:%M:%S %a') + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + + if args.wandb: + import wandb + wandb.init( + project="paddle-squad", + settings=wandb.Settings(console='off'), + name='paddle-squad') + wandb_config = vars(args) 
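+        # args.batch_size at this point is still the raw CLI value; the
+        # effective global batch size is recomputed inside load_squad_dataset()
+        # as micro_batch_size * grad_acc_factor * batches_per_step * num_replica.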
+ wandb_config["global_batch_size"] = args.batch_size + wandb.config.update(args) + + logging.info(args) + main(args) + logging.info("program finished") diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain.sh new file mode 100644 index 000000000..cd1c5bb00 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +export RDMAV_FORK_SAFE=1 +python3 run_pretrain.py \ + --input_files "path_to_phase1_hdf5_dataset" \ + --output_dir pretrain_128_model \ + --seq_len 128 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 20 \ + --max_position_embeddings 512 \ + --learning_rate 0.006 \ + --weight_decay 1e-2 \ + --max_steps 7038 \ + --warmup_steps 2000 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 32 \ + --ipu_enable_fp16 True \ + --scale_loss 512 \ + --batches_per_step 1 \ + --num_replica 4 \ + --enable_grad_acc True \ + --grad_acc_factor 512 \ + --batch_size 65536 \ + --available_mem_proportion 0.28 \ + --ignore_index 0 \ + --enable_load_params False \ + --hidden_dropout_prob 0.1 \ + --attention_probs_dropout_prob 0.1 \ + --shuffle True \ + --wandb False \ + --save_steps 1000 diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain_phase2.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain_phase2.sh new file mode 100644 index 000000000..8458ed48b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_pretrain_phase2.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +export RDMAV_FORK_SAFE=1 +python3 run_pretrain.py \ + --input_files "path_to_phase2_hdf5_dataset" \ + --output_dir pretrain_384_model \ + --seq_len 384 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 56 \ + --max_position_embeddings 512 \ + --learning_rate 0.002828427125 \ + --weight_decay 1e-2 \ + --max_steps 2137 \ + --warmup_steps 274 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 8 \ + --ipu_enable_fp16 True \ + --scale_loss 128 \ + --batches_per_step 1 \ + --num_replica 4 \ + --enable_grad_acc True \ + --grad_acc_factor 512 \ + --batch_size 16384 \ + --available_mem_proportion 0.28 \ + --ignore_index 0 \ + --enable_load_params True \ + --load_params_path "./pretrain_128_model/final_step_7038.pdparams" \ + --hidden_dropout_prob 0.1 \ + --attention_probs_dropout_prob 0.1 \ + --shuffle True \ + --wandb False \ + --enable_engine_caching False \ + --save_steps 500 diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad.sh new file mode 100644 index 000000000..4c36ef69d --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +python3 run_squad.py \ + --output_dir squad_model \ + --task "SQUAD" \ + --is_training True \ + --seq_len 384 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 56 \ + --max_position_embeddings 512 \ + --learning_rate 5.6e-05 \ + --weight_decay 0 \ + --epochs 4 \ + --warmup_steps 52 \ + --logging_steps 10 \ + --seed 42 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + 
--micro_batch_size 2 \ + --ipu_enable_fp16 True \ + --accl1_type "FLOAT" \ + --accl2_type "FLOAT" \ + --weight_decay_mode "decay" \ + --scale_loss 256 \ + --optimizer_state_offchip False \ + --batches_per_step 4 \ + --num_replica 4 \ + --num_ipus 2 \ + --enable_grad_acc True \ + --grad_acc_factor 16 \ + --available_mem_proportion 0.40 \ + --ignore_index 0 \ + --hidden_dropout_prob 0.1 \ + --attention_probs_dropout_prob 0.1 \ + --shuffle True \ + --wandb False \ + --enable_engine_caching False \ + --enable_load_params True \ + --load_params_path "pretrain_384_model/final_step_2137.pdparams" diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad_infer.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad_infer.sh new file mode 100644 index 000000000..28ffa7285 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod16/run_squad_infer.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +python3 run_squad.py \ + --output_dir squad_model \ + --task "SQUAD" \ + --is_training False \ + --seq_len 384 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 56 \ + --max_position_embeddings 512 \ + --learning_rate 5.6e-05 \ + --weight_decay 1e-2 \ + --epochs 4 \ + --warmup_steps 52 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 2 \ + --ipu_enable_fp16 True \ + --scale_loss 256 \ + --optimizer_state_offchip False \ + --batches_per_step 4 \ + --num_replica 4 \ + --num_ipus 2 \ + --enable_grad_acc False \ + --grad_acc_factor 1 \ + --available_mem_proportion 0.40 \ + --ignore_index 0 \ + --hidden_dropout_prob 0.0 \ + --attention_probs_dropout_prob 0.0 \ + --shuffle False \ + --wandb False \ + --enable_engine_caching False \ + --enable_load_params True \ + --load_params_path "squad_model/Final_model.pdparams" diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain.sh new file mode 100644 index 000000000..299e0dc25 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +export RDMAV_FORK_SAFE=1 +python3 run_pretrain.py \ + --input_files "path_to_phase1_hdf5_dataset" \ + --output_dir pretrain_128_model \ + --seq_len 128 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 20 \ + --max_position_embeddings 512 \ + --learning_rate 0.006 \ + --weight_decay 1e-2 \ + --max_steps 7038 \ + --warmup_steps 2000 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 32 \ + --ipu_enable_fp16 True \ + --scale_loss 512 \ + --batches_per_step 1 \ + --num_replica 1 \ + --enable_grad_acc True \ + --grad_acc_factor 2048 \ + --batch_size 65536 \ + --available_mem_proportion 0.28 \ + --ignore_index 0 \ + --enable_load_params False \ + --hidden_dropout_prob 0.1 \ + --attention_probs_dropout_prob 0.1 \ + --shuffle True \ + --wandb False \ + --save_steps 1000 diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain_phase2.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain_phase2.sh new file mode 100644 index 000000000..89ec3ec4b --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_pretrain_phase2.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +export 
RDMAV_FORK_SAFE=1 +python3 run_pretrain.py \ + --input_files "path_to_phase2_hdf5_dataset" \ + --output_dir pretrain_384_model \ + --seq_len 384 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 56 \ + --max_position_embeddings 512 \ + --learning_rate 0.002828427125 \ + --weight_decay 1e-2 \ + --max_steps 2137 \ + --warmup_steps 274 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 8 \ + --ipu_enable_fp16 True \ + --scale_loss 128 \ + --batches_per_step 1 \ + --num_replica 1 \ + --enable_grad_acc True \ + --grad_acc_factor 2048 \ + --batch_size 16384 \ + --available_mem_proportion 0.28 \ + --ignore_index 0 \ + --enable_load_params True \ + --load_params_path "./pretrain_128_model/final_step_7038.pdparams" \ + --hidden_dropout_prob 0.1 \ + --attention_probs_dropout_prob 0.1 \ + --shuffle True \ + --wandb False \ + --enable_engine_caching False \ + --save_steps 500 diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad.sh new file mode 100644 index 000000000..81302949c --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +python3 run_squad.py \ + --output_dir squad_model \ + --task "SQUAD" \ + --is_training True \ + --seq_len 384 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 56 \ + --max_position_embeddings 512 \ + --learning_rate 5.6e-05 \ + --weight_decay 1e-2 \ + --epochs 4 \ + --warmup_steps 30 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 2 \ + --ipu_enable_fp16 True \ + --accl1_type "FLOAT" \ + --accl2_type "FLOAT" \ + --weight_decay_mode "decay" \ + --scale_loss 256 \ + --optimizer_state_offchip True \ + --batches_per_step 4 \ + --num_replica 2 \ + --num_ipus 2 \ + --enable_grad_acc True \ + --grad_acc_factor 64 \ + --available_mem_proportion 0.40 \ + --ignore_index 0 \ + --hidden_dropout_prob 0.1 \ + --attention_probs_dropout_prob 0.1 \ + --shuffle True \ + --wandb False \ + --enable_engine_caching False \ + --enable_load_params True \ + --load_params_path "pretrain_384_model/final_step_2137.pdparams" diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad_infer.sh b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad_infer.sh new file mode 100644 index 000000000..ae400c59e --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/scripts/pod4/run_squad_infer.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +python3 run_squad.py \ + --output_dir squad_model \ + --task "SQUAD" \ + --is_training False \ + --seq_len 384 \ + --hidden_size 768 \ + --vocab_size 30400 \ + --max_predictions_per_seq 56 \ + --max_position_embeddings 512 \ + --learning_rate 5.6e-05 \ + --weight_decay 1e-2 \ + --epochs 4 \ + --warmup_steps 52 \ + --logging_steps 10 \ + --seed 1984 \ + --beta1 0.9 \ + --beta2 0.999 \ + --num_hidden_layers 12 \ + --micro_batch_size 2 \ + --ipu_enable_fp16 True \ + --scale_loss 256 \ + --optimizer_state_offchip False \ + --batches_per_step 4 \ + --num_replica 2 \ + --num_ipus 2 \ + --enable_grad_acc False \ + --grad_acc_factor 1 \ + --available_mem_proportion 0.40 \ + --ignore_index 0 \ + --hidden_dropout_prob 0.0 \ + --attention_probs_dropout_prob 0.0 \ + --shuffle False \ + --wandb False \ + --enable_engine_caching 
False \ + --enable_load_params True \ + --load_params_path "squad_model/Final_model.pdparams" diff --git a/nlp/text_classification/bert/paddlepaddle/static_ipu/utils.py b/nlp/text_classification/bert/paddlepaddle/static_ipu/utils.py new file mode 100644 index 000000000..9e9d9eeb2 --- /dev/null +++ b/nlp/text_classification/bert/paddlepaddle/static_ipu/utils.py @@ -0,0 +1,282 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from distutils.util import strtobool + +import tqdm +from paddle.utils.cpp_extension import load + + +def load_custom_ops(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_dir = cur_dir + "/custom_ops" + sources = [ + f"{custom_dir}/custom_shape_infer.cc", + f"{custom_dir}/custom_checkpointoutput.cc", + f"{custom_dir}/custom_detach.cc", f"{custom_dir}/custom_identity.cc", + f"{custom_dir}/custom_nll_loss.cc", + f"{custom_dir}/tied_gather_pattern.cc", f"{custom_dir}/tied_gather.cc", + f"{custom_dir}/disable_attn_dropout_bwd_pattern.cc", + f"{custom_dir}/workarounds/prevent_const_expr_folding_op.cc", + f"{custom_dir}/utils.cc" + ] + custom_ops = load( + name="custom_ops", + sources=sources, + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + build_directory=custom_dir, ) + return custom_ops + + +class ProgressBar: + def __init__(self): + self._bar = None + self._last = 0 + + def __call__(self, progress: int, total: int): + if self._bar is None: + bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} " + bar_format += "[{elapsed}<{remaining}]" + self._bar = tqdm.tqdm( + desc="Graph compilation", total=total, bar_format=bar_format) + self._bar.update(progress - self._last) + self._last = progress + if progress == total: + self._bar.close() + self._bar = None + + +# need to set to 0 when start a new compilation +g_current_progress = 0 + + +def ProgressFunc(progress, total): + global g_current_progress + if progress != g_current_progress: + g_current_progress = progress + print(f"Graph compilation: {progress}/{total}") + + +def str_to_bool(val): + return bool(strtobool(val)) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task", + type=str, + default="PRETRAINING", + help="task", ) + parser.add_argument( + "--input_files", + type=str, + default="", + help="Files to load data from. 
" + "For Pretraining: Path to tfrecord files" + "For SQuAD: Path to train-v1.1.json") + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=False, + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--is_training", + type=str_to_bool, + default=True, + help="training or inference") + # graph + parser.add_argument( + "--seq_len", default=128, type=int, help="The sequence length") + parser.add_argument( + "--vocab_size", + default=30912, + type=int, + help="Set the size of the vocabulary") + parser.add_argument( + "--max_predictions_per_seq", + default=20, + type=int, + help="The maximum total of masked tokens in input sequence") + parser.add_argument( + "--max_position_embeddings", + default=512, + type=int, + help="the length of the input mask") + parser.add_argument( + "--num_hidden_layers", + type=int, + default=None, + help="Override config file if not None") + parser.add_argument( + "--hidden_size", + default=768, + type=int, + help="Set the size of the hidden state of the transformer layers size") + parser.add_argument( + "--ignore_index", type=int, default=-1, help="ignore mlm index") + parser.add_argument( + "--hidden_dropout_prob", + type=float, + default=0.1, + help="Set the layer dropout probability for fully connected layer in embedding and encoder", + ) + parser.add_argument( + "--attention_probs_dropout_prob", + type=float, + default=0.0, + help="Set the layer dropout probability for attention layer in encoder", + ) + # optimizer + parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate.") + parser.add_argument( + "--weight_decay", + default=0.0, + type=float, + help="Weight decay if we apply some.") + parser.add_argument( + "--beta1", + type=float, + default=0.9, + help="Set the Adam/Lamb beta1 value") + parser.add_argument( + "--beta2", + type=float, + default=0.999, + help="Set the Adam/Lamb beta2 value") + parser.add_argument( + "--adam_epsilon", + default=1e-6, + type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.", + ) + parser.add_argument( + "--warmup_steps", + default=10, + type=int, + help="Linear warmup over warmup_steps.") + parser.add_argument( + "--scale_loss", + type=float, + default=1.0, + help="The value of scale_loss for fp16.") + parser.add_argument( + "--accl1_type", type=str, default='FLOAT', help="FLOAT or FLOAT16") + parser.add_argument( + "--accl2_type", type=str, default='FLOAT', help="FLOAT or FLOAT16") + parser.add_argument( + "--weight_decay_mode", + type=str, + default='', + help="decay or l2_regularization") + parser.add_argument( + "--optimizer_state_offchip", + type=str_to_bool, + default=True, + help="Set the store location of the optimizer tensors") + parser.add_argument( + "--logging_steps", + type=int, + default=500, + help="Log every X updates steps.") + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save checkpoint every X updates steps.") + # ipu + parser.add_argument( + "--epochs", + type=int, + default=1, + help="the iteration of the whole dataset", ) + parser.add_argument( + "--batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", ) + parser.add_argument( + "--micro_batch_size", type=int, default=1, help="micro batch size") + parser.add_argument( + "--batches_per_step", type=int, default=1, help="batches per step") + parser.add_argument( + "--seed", type=int, default=42, help="Random seed for initialization") + parser.add_argument( + "--num_ipus", type=int, default=4, help="Number of IPUs to use") + parser.add_argument( + "--ipu_enable_fp16", + type=str_to_bool, + default=False, + help="ipu enable fp16 or not.") + parser.add_argument( + "--num_replica", type=int, default=1, help="number of replica") + parser.add_argument( + "--enable_grad_acc", + type=str_to_bool, + default=False, + help="enable gradient accumulation") + parser.add_argument( + "--grad_acc_factor", + type=int, + default=1, + help="factor of gradient accumulation") + parser.add_argument( + "--available_mem_proportion", + type=float, + default=0.0, + help="set the available memory proportion for matmul/conv") + parser.add_argument( + "--shuffle", + type=str_to_bool, + nargs="?", + const=True, + default=False, + help="Shuffle Dataset") + parser.add_argument( + "--wandb", + type=str_to_bool, + nargs="?", + const=True, + default=False, + help="Enable logging to Weights and Biases.") + parser.add_argument( + "--enable_load_params", + type=str_to_bool, + default=False, + help="load params or not") + parser.add_argument("--load_params_path", type=str, help="load params path") + parser.add_argument( + "--tf_checkpoint", + type=str, + help="Path to Tensorflow Checkpoint to initialise the model.") + parser.add_argument( + "--enable_engine_caching", + type=str_to_bool, + default=True, + help="enable engine caching or not") + args = parser.parse_args() + return args diff --git a/nlp/text_classification/bert/tensorflow2.0/README.md b/nlp/text_classification/bert/tensorflow2.0/README.md new file mode 100644 index 000000000..e7784b1c9 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/README.md @@ -0,0 +1,395 @@ +# BERT (Bidirectional Encoder Representations from Transformers) + +**WARNING**: We are on the way to deprecate most of the code in this directory. +Please see +[this link](https://github.com/tensorflow/models/blob/master/official/nlp/docs/train.md) +for the new tutorial and use the new code in `nlp/modeling`. This README is +still correct for this legacy implementation. 
+ +The academic paper which describes BERT in detail and provides full results on a +number of tasks can be found here: https://arxiv.org/abs/1810.04805. + +This repository contains TensorFlow 2.x implementation for BERT. + +## Contents + * [Contents](#contents) + * [Pre-trained Models](#pre-trained-models) + * [Restoring from Checkpoints](#restoring-from-checkpoints) + * [Set Up](#set-up) + * [Process Datasets](#process-datasets) + * [Fine-tuning with BERT](#fine-tuning-with-bert) + * [Cloud GPUs and TPUs](#cloud-gpus-and-tpus) + * [Sentence and Sentence-pair Classification Tasks](#sentence-and-sentence-pair-classification-tasks) + * [SQuAD 1.1](#squad-1.1) + + +## Pre-trained Models + +We released both checkpoints and tf.hub modules as the pretrained models for +fine-tuning. They are TF 2.x compatible and are converted from the checkpoints +released in TF 1.x official BERT repository +[google-research/bert](https://github.com/google-research/bert) +in order to keep consistent with BERT paper. + + +### Access to Pretrained Checkpoints + +Pretrained checkpoints can be found in the following links: + +**Note: We have switched BERT implementation +to use Keras functional-style networks in [nlp/modeling](../modeling). +The new checkpoints are:** + +* **[`BERT-Large, Uncased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_uncased_L-24_H-1024_A-16.tar.gz)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Large, Cased (Whole Word Masking)`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/wwm_cased_L-24_H-1024_A-16.tar.gz)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Base, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz)**: + 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Large, Uncased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Base, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz)**: + 12-layer, 768-hidden, 12-heads , 110M parameters +* **[`BERT-Large, Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Base, Multilingual Cased`](https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/multi_cased_L-12_H-768_A-12.tar.gz)**: + 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters + +We recommend to host checkpoints on Google Cloud storage buckets when you use +Cloud GPU/TPU. + +### Restoring from Checkpoints + +`tf.train.Checkpoint` is used to manage model checkpoints in TF 2. To restore +weights from provided pre-trained checkpoints, you can use the following code: + +```python +init_checkpoint='the pretrained model checkpoint path.' +model=tf.keras.Model() # Bert pre-trained model as feature extractor. +checkpoint = tf.train.Checkpoint(model=model) +checkpoint.restore(init_checkpoint) +``` + +Checkpoints featuring native serialized Keras models +(i.e. model.load()/load_weights()) will be available soon. + +### Access to Pretrained hub modules. 
+ +Pretrained tf.hub modules in TF 2.x SavedModel format can be found in the +following links: + +* **[`BERT-Large, Uncased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Large, Cased (Whole Word Masking)`](https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Base, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/)**: + 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Large, Uncased`](https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Base, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/)**: + 12-layer, 768-hidden, 12-heads , 110M parameters +* **[`BERT-Large, Cased`](https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/)**: + 24-layer, 1024-hidden, 16-heads, 340M parameters +* **[`BERT-Base, Multilingual Cased`](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/)**: + 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Base, Chinese`](https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/)**: + Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, + 110M parameters + +## Set Up + +```shell +export PYTHONPATH="$PYTHONPATH:/path/to/models" +``` + +Install `tf-nightly` to get latest updates: + +```shell +pip install tf-nightly-gpu +``` + +With TPU, GPU support is not necessary. First, you need to create a `tf-nightly` +TPU with [ctpu tool](https://github.com/tensorflow/tpu/tree/master/tools/ctpu): + +```shell +ctpu up -name --tf-version=”nightly” +``` + +Second, you need to install TF 2 `tf-nightly` on your VM: + +```shell +pip install tf-nightly +``` + +## Process Datasets + +### Pre-training + +There is no change to generate pre-training data. Please use the script +[`../data/create_pretraining_data.py`](../data/create_pretraining_data.py) +which is essentially branched from [BERT research repo](https://github.com/google-research/bert) +to get processed pre-training data and it adapts to TF2 symbols and python3 +compatibility. + +Running the pre-training script requires an input and output directory, as well as a vocab file. Note that max_seq_length will need to match the sequence length parameter you specify when you run pre-training. + +Example shell script to call create_pretraining_data.py +``` +export WORKING_DIR='local disk or cloud location' +export BERT_DIR='local disk or cloud location' +python models/official/nlp/data/create_pretraining_data.py \ + --input_file=$WORKING_DIR/input/input.txt \ + --output_file=$WORKING_DIR/output/tf_examples.tfrecord \ + --vocab_file=$BERT_DIR/wwm_uncased_L-24_H-1024_A-16/vocab.txt \ + --do_lower_case=True \ + --max_seq_length=512 \ + --max_predictions_per_seq=76 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=5 +``` + +### Fine-tuning + +To prepare the fine-tuning data for final model training, use the +[`../data/create_finetuning_data.py`](../data/create_finetuning_data.py) script. +Resulting datasets in `tf_record` format and training meta data should be later +passed to training or evaluation scripts. 
The task-specific arguments are +described in following sections: + +* GLUE + +Users can download the +[GLUE data](https://gluebenchmark.com/tasks) by running +[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e) +and unpack it to some directory `$GLUE_DIR`. +Also, users can download [Pretrained Checkpoint](#access-to-pretrained-checkpoints) and locate on some directory `$BERT_DIR` instead of using checkpoints on Google Cloud Storage. + +```shell +export GLUE_DIR=~/glue +export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16 + +export TASK_NAME=MNLI +export OUTPUT_DIR=gs://some_bucket/datasets +python ../data/create_finetuning_data.py \ + --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \ + --vocab_file=${BERT_DIR}/vocab.txt \ + --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \ + --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \ + --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \ + --fine_tuning_task_type=classification --max_seq_length=128 \ + --classification_task_name=${TASK_NAME} +``` + +* SQUAD + +The [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/) contains +detailed information about the SQuAD datasets and evaluation. + +The necessary files can be found here: + +* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json) +* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json) +* [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py) +* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json) +* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json) +* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) + +```shell +export SQUAD_DIR=~/squad +export SQUAD_VERSION=v1.1 +export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16 +export OUTPUT_DIR=gs://some_bucket/datasets + +python ../data/create_finetuning_data.py \ + --squad_data_file=${SQUAD_DIR}/train-${SQUAD_VERSION}.json \ + --vocab_file=${BERT_DIR}/vocab.txt \ + --train_data_output_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_train.tf_record \ + --meta_data_file_path=${OUTPUT_DIR}/squad_${SQUAD_VERSION}_meta_data \ + --fine_tuning_task_type=squad --max_seq_length=384 +``` + +Note: To create fine-tuning data with SQUAD 2.0, you need to add flag `--version_2_with_negative=True`. + +## Fine-tuning with BERT + +### Cloud GPUs and TPUs + +* Cloud Storage + +The unzipped pre-trained model files can also be found in the Google Cloud +Storage folder `gs://cloud-tpu-checkpoints/bert/keras_bert`. For example: + +```shell +export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16 +export MODEL_DIR=gs://some_bucket/my_output_dir +``` + +Currently, users are able to access to `tf-nightly` TPUs and the following TPU +script should run with `tf-nightly`. + +* GPU -> TPU + +Just add the following flags to `run_classifier.py` or `run_squad.py`: + +```shell + --distribution_strategy=tpu + --tpu=grpc://${TPU_IP_ADDRESS}:8470 +``` + +### Sentence and Sentence-pair Classification Tasks + +This example code fine-tunes `BERT-Large` on the Microsoft Research Paraphrase +Corpus (MRPC) corpus, which only contains 3,600 examples and can fine-tune in a +few minutes on most GPUs. + +We use the `BERT-Large` (uncased_L-24_H-1024_A-16) as an example throughout the +workflow. 
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC

python run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
  --train_batch_size=4 \
  --eval_batch_size=4 \
  --steps_per_loop=1 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

Alternatively, instead of specifying `init_checkpoint`, you can specify
`hub_module_url` to employ a pretrained BERT hub module, e.g.,
`--hub_module_url=https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1`.

After training a model, to get predictions from the classifier, set
`--mode=predict` and pass the test set tfrecords to `--eval_data_path`.
The output is written to a file called `test_results.tsv` in the output folder;
each line contains the predicted class probabilities for one sample.

```shell
python run_classifier.py \
  --mode='predict' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --eval_batch_size=4 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

To use TPU, you only need to switch the distribution strategy type to `tpu`
with the TPU information and use remote storage for model checkpoints.

```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export GLUE_DIR=gs://some_bucket/datasets
export TASK=MRPC

python run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
  --train_batch_size=32 \
  --eval_batch_size=32 \
  --steps_per_loop=1000 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

Note that we specify `steps_per_loop=1000` for TPU because running a loop of
training steps inside a `tf.function` can significantly increase TPU
utilization; callbacks will not be called inside the loop.

### SQuAD 1.1

The Stanford Question Answering Dataset (SQuAD) is a popular question answering
benchmark dataset. See the [SQuAD website](https://rajpurkar.github.io/SQuAD-explorer/)
for more details.

We use `BERT-Large` (uncased_L-24_H-1024_A-16) as the example throughout the
workflow. For GPU memory of 16GB or smaller, you may try `BERT-Base`
(uncased_L-12_H-768_A-12) instead.
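As with the classifier, the `--distribution_strategy` and `--tpu` flags select
the `tf.distribute` strategy; the `common/distribute_utils.get_distribution_strategy`
helper added in this change provides the mapping from a strategy name to a
strategy object. A minimal sketch of calling it directly (the values are
illustrative; the training scripts normally handle this for you):

```python
# Resolve a tf.distribute strategy the same way a training script would.
from common import distribute_utils

# Two local GPUs with NCCL all-reduce.
strategy = distribute_utils.get_distribution_strategy(
    distribution_strategy="mirrored", num_gpus=2, all_reduce_alg="nccl")

# For a Cloud TPU, pass the address given to --tpu instead:
# strategy = distribute_utils.get_distribution_strategy(
#     distribution_strategy="tpu", tpu_address="grpc://10.0.0.2:8470")

with distribute_utils.get_strategy_scope(strategy):
  pass  # Model and optimizer creation go inside the strategy scope.
```

To fine-tune on SQuAD 1.1 with GPUs: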
```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export SQUAD_DIR=gs://some_bucket/datasets
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_VERSION=v1.1

python run_squad.py \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
  --train_batch_size=4 \
  --predict_batch_size=4 \
  --learning_rate=8e-5 \
  --num_train_epochs=2 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=mirrored
```

Similarly, you can replace the `init_checkpoint` flag with `hub_module_url` to
specify a hub module path.

`run_squad.py` writes the predictions for `--predict_file` by default. If you
set `--mode=predict` and provide the SQuAD test data, the script will generate
the prediction JSON file.

To use TPU, you need to switch the distribution strategy type to `tpu` with the
TPU information.

```shell
export BERT_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
export TPU_IP_ADDRESS='???'
export MODEL_DIR=gs://some_bucket/my_output_dir
export SQUAD_DIR=gs://some_bucket/datasets
export SQUAD_VERSION=v1.1

python run_squad.py \
  --input_meta_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_meta_data \
  --train_data_path=${SQUAD_DIR}/squad_${SQUAD_VERSION}_train.tf_record \
  --predict_file=${SQUAD_DIR}/dev-v1.1.json \
  --vocab_file=${BERT_DIR}/vocab.txt \
  --bert_config_file=${BERT_DIR}/bert_config.json \
  --init_checkpoint=${BERT_DIR}/bert_model.ckpt \
  --train_batch_size=32 \
  --learning_rate=8e-5 \
  --num_train_epochs=2 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=grpc://${TPU_IP_ADDRESS}:8470
```

The dev set predictions are saved to a file called `predictions.json` under
`model_dir` and can be scored with the official evaluation script:

```shell
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ./squad/predictions.json
```

diff --git a/nlp/text_classification/bert/tensorflow2.0/__init__.py b/nlp/text_classification/bert/tensorflow2.0/__init__.py
new file mode 100644
index 000000000..a25710c22
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
diff --git a/nlp/text_classification/bert/tensorflow2.0/albert/configs.py b/nlp/text_classification/bert/tensorflow2.0/albert/configs.py
new file mode 100644
index 000000000..abcae824d
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/albert/configs.py
@@ -0,0 +1,50 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The ALBERT configurations.""" + +import six + +import configs + + +class AlbertConfig(configs.BertConfig): + """Configuration for `ALBERT`.""" + + def __init__(self, num_hidden_groups=1, inner_group_num=1, **kwargs): + """Constructs AlbertConfig. + + Args: + num_hidden_groups: Number of group for the hidden layers, parameters in + the same group are shared. Note that this value and also the following + 'inner_group_num' has to be 1 for now, because all released ALBERT + models set them to 1. We may support arbitary valid values in future. + inner_group_num: Number of inner repetition of attention and ffn. + **kwargs: The remaining arguments are the same as above 'BertConfig'. + """ + super(AlbertConfig, self).__init__(**kwargs) + + # TODO(chendouble): 'inner_group_num' and 'num_hidden_groups' are always 1 + # in the released ALBERT. Support other values in AlbertEncoder if needed. + if inner_group_num != 1 or num_hidden_groups != 1: + raise ValueError("We only support 'inner_group_num' and " + "'num_hidden_groups' as 1.") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `AlbertConfig` from a Python dictionary of parameters.""" + config = AlbertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config diff --git a/nlp/text_classification/bert/tensorflow2.0/bert_cloud_tpu.md b/nlp/text_classification/bert/tensorflow2.0/bert_cloud_tpu.md new file mode 100644 index 000000000..baf6f9bdc --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/bert_cloud_tpu.md @@ -0,0 +1,110 @@ +# BERT FineTuning with Cloud TPU: Sentence and Sentence-Pair Classification Tasks (TF 2.1) +This tutorial shows you how to train the Bidirectional Encoder Representations from Transformers (BERT) model on Cloud TPU. + + +## Set up Cloud Storage and Compute Engine VM +1. [Open a cloud shell window](https://console.cloud.google.com/?cloudshell=true&_ga=2.11844148.-1612541229.1552429951) +2. Create a variable for the project's id: +``` +export PROJECT_ID=your-project_id +``` +3. Configure `gcloud` command-line tool to use the project where you want to create Cloud TPU. +``` +gcloud config set project ${PROJECT_ID} +``` +4. Create a Cloud Storage bucket using the following command: +``` +gsutil mb -p ${PROJECT_ID} -c standard -l europe-west4 -b on gs://your-bucket-name +``` +This Cloud Storage bucket stores the data you use to train your model and the training results. +5. Launch a Compute Engine VM and Cloud TPU using the ctpu up command. +``` +ctpu up --tpu-size=v3-8 \ + --machine-type=n1-standard-8 \ + --zone=europe-west4-a \ + --tf-version=2.1 [optional flags: --project, --name] +``` +6. The configuration you specified appears. Enter y to approve or n to cancel. +7. When the ctpu up command has finished executing, verify that your shell prompt has changed from username@project to username@tpuname. This change shows that you are now logged into your Compute Engine VM. 
```
gcloud compute ssh vm-name --zone=europe-west4-a
(vm)$ export TPU_NAME=vm-name
```
As you continue these instructions, run each command that begins with `(vm)$` in your VM session window.

## Prepare the Dataset
1. From your Compute Engine virtual machine (VM), install the packages listed in `requirements.txt`.
```
(vm)$ cd /usr/share/models
(vm)$ sudo pip3 install -r official/requirements.txt
```
2. Optional: download `download_glue_data.py`.

This tutorial uses the General Language Understanding Evaluation (GLUE) benchmark to evaluate and analyze the performance of the model. The GLUE data is provided for this tutorial at gs://cloud-tpu-checkpoints/bert/classification.

## Define parameter values
Next, define several parameter values that are required when you train and evaluate your model:

```
(vm)$ export PYTHONPATH="$PYTHONPATH:/usr/share/tpu/models"
(vm)$ export STORAGE_BUCKET=gs://your-bucket-name
(vm)$ export BERT_BASE_DIR=gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16
(vm)$ export MODEL_DIR=${STORAGE_BUCKET}/bert-output
(vm)$ export GLUE_DIR=gs://cloud-tpu-checkpoints/bert/classification
(vm)$ export TASK=mnli
```

## Train the model
From your Compute Engine VM, run the following command.

```
(vm)$ python3 official/nlp/bert/run_classifier.py \
  --mode='train_and_eval' \
  --input_meta_data_path=${GLUE_DIR}/${TASK}_meta_data \
  --train_data_path=${GLUE_DIR}/${TASK}_train.tf_record \
  --eval_data_path=${GLUE_DIR}/${TASK}_eval.tf_record \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
  --train_batch_size=32 \
  --eval_batch_size=32 \
  --learning_rate=2e-5 \
  --num_train_epochs=3 \
  --model_dir=${MODEL_DIR} \
  --distribution_strategy=tpu \
  --tpu=${TPU_NAME}
```

## Verify your results
The training takes approximately 1 hour on a v3-8 TPU. When the script completes, you should see results similar to the following:
```
Training Summary:
{'train_loss': 0.28142181038856506,
'last_train_metrics': 0.9467429518699646,
'eval_metrics': 0.8599063158035278,
'total_training_steps': 36813}
```

## Clean up
To avoid incurring charges to your GCP account for the resources used in this topic:
1. Disconnect from the Compute Engine VM:
```
(vm)$ exit
```
2. In your Cloud Shell, run `ctpu delete` with the `--zone` flag you used when you set up the Cloud TPU to delete your Compute Engine VM and your Cloud TPU:
```
$ ctpu delete --zone=your-zone
```
3. Run `ctpu status`, specifying your zone, to make sure you have no instances allocated and avoid unnecessary charges for TPU usage. The deletion might take several minutes. A response like the one below indicates there are no more allocated instances:
```
$ ctpu status --zone=your-zone
```
4. Run `gsutil` as shown, replacing `your-bucket` with the name of the Cloud Storage bucket you created for this tutorial:
```
$ gsutil rm -r gs://your-bucket
```


diff --git a/nlp/text_classification/bert/tensorflow2.0/bert_models.py b/nlp/text_classification/bert/tensorflow2.0/bert_models.py
new file mode 100644
index 000000000..f512828e7
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/bert_models.py
@@ -0,0 +1,366 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT models that are compatible with TF 2.0.""" + +import gin +import tensorflow as tf +import tensorflow_hub as hub + +from modeling import tf_utils +from albert import configs as albert_configs +import configs +from nlp_modeling import models +from nlp_modeling import networks + + +class BertPretrainLossAndMetricLayer(tf.keras.layers.Layer): + """Returns layer that computes custom loss and metrics for pretraining.""" + + def __init__(self, vocab_size, **kwargs): + super(BertPretrainLossAndMetricLayer, self).__init__(**kwargs) + self._vocab_size = vocab_size + self.config = { + 'vocab_size': vocab_size, + } + + def _add_metrics(self, lm_output, lm_labels, lm_label_weights, + lm_example_loss, sentence_output, sentence_labels, + next_sentence_loss): + """Adds metrics.""" + masked_lm_accuracy = tf.keras.metrics.sparse_categorical_accuracy( + lm_labels, lm_output) + numerator = tf.reduce_sum(masked_lm_accuracy * lm_label_weights) + denominator = tf.reduce_sum(lm_label_weights) + 1e-5 + masked_lm_accuracy = numerator / denominator + self.add_metric( + masked_lm_accuracy, name='masked_lm_accuracy', aggregation='mean') + + self.add_metric(lm_example_loss, name='lm_example_loss', aggregation='mean') + + if sentence_labels is not None: + next_sentence_accuracy = tf.keras.metrics.sparse_categorical_accuracy( + sentence_labels, sentence_output) + self.add_metric( + next_sentence_accuracy, + name='next_sentence_accuracy', + aggregation='mean') + + if next_sentence_loss is not None: + self.add_metric( + next_sentence_loss, name='next_sentence_loss', aggregation='mean') + + def call(self, + lm_output_logits, + sentence_output_logits, + lm_label_ids, + lm_label_weights, + sentence_labels=None): + """Implements call() for the layer.""" + lm_label_weights = tf.cast(lm_label_weights, tf.float32) + lm_output_logits = tf.cast(lm_output_logits, tf.float32) + + lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy( + lm_label_ids, lm_output_logits, from_logits=True) + lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights) + lm_denominator_loss = tf.reduce_sum(lm_label_weights) + mask_label_loss = tf.math.divide_no_nan(lm_numerator_loss, + lm_denominator_loss) + + if sentence_labels is not None: + sentence_output_logits = tf.cast(sentence_output_logits, tf.float32) + sentence_loss = tf.keras.losses.sparse_categorical_crossentropy( + sentence_labels, sentence_output_logits, from_logits=True) + sentence_loss = tf.reduce_mean(sentence_loss) + loss = mask_label_loss + sentence_loss + else: + sentence_loss = None + loss = mask_label_loss + + batch_shape = tf.slice(tf.shape(lm_label_ids), [0], [1]) + # TODO(hongkuny): Avoids the hack and switches add_loss. + final_loss = tf.fill(batch_shape, loss) + + self._add_metrics(lm_output_logits, lm_label_ids, lm_label_weights, + mask_label_loss, sentence_output_logits, sentence_labels, + sentence_loss) + return final_loss + + +@gin.configurable +def get_transformer_encoder(bert_config, + sequence_length=None, + transformer_encoder_cls=None, + output_range=None): + """Gets a 'TransformerEncoder' object. 
+ + Args: + bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object. + sequence_length: [Deprecated]. + transformer_encoder_cls: A EncoderScaffold class. If it is None, uses the + default BERT encoder implementation. + output_range: the sequence output range, [0, output_range). Default setting + is to return the entire sequence output. + + Returns: + A encoder object. + """ + del sequence_length + if transformer_encoder_cls is not None: + # TODO(hongkuny): evaluate if it is better to put cfg definition in gin. + embedding_cfg = dict( + vocab_size=bert_config.vocab_size, + type_vocab_size=bert_config.type_vocab_size, + hidden_size=bert_config.hidden_size, + max_seq_length=bert_config.max_position_embeddings, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range), + dropout_rate=bert_config.hidden_dropout_prob, + ) + hidden_cfg = dict( + num_attention_heads=bert_config.num_attention_heads, + intermediate_size=bert_config.intermediate_size, + intermediate_activation=tf_utils.get_activation(bert_config.hidden_act), + dropout_rate=bert_config.hidden_dropout_prob, + attention_dropout_rate=bert_config.attention_probs_dropout_prob, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range), + ) + kwargs = dict( + embedding_cfg=embedding_cfg, + hidden_cfg=hidden_cfg, + num_hidden_instances=bert_config.num_hidden_layers, + pooled_output_dim=bert_config.hidden_size, + pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range)) + + # Relies on gin configuration to define the Transformer encoder arguments. + return transformer_encoder_cls(**kwargs) + + kwargs = dict( + vocab_size=bert_config.vocab_size, + hidden_size=bert_config.hidden_size, + num_layers=bert_config.num_hidden_layers, + num_attention_heads=bert_config.num_attention_heads, + intermediate_size=bert_config.intermediate_size, + activation=tf_utils.get_activation(bert_config.hidden_act), + dropout_rate=bert_config.hidden_dropout_prob, + attention_dropout_rate=bert_config.attention_probs_dropout_prob, + max_sequence_length=bert_config.max_position_embeddings, + type_vocab_size=bert_config.type_vocab_size, + embedding_width=bert_config.embedding_size, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range)) + if isinstance(bert_config, albert_configs.AlbertConfig): + return networks.AlbertEncoder(**kwargs) + else: + assert isinstance(bert_config, configs.BertConfig) + kwargs['output_range'] = output_range + return networks.BertEncoder(**kwargs) + + +def pretrain_model(bert_config, + seq_length, + max_predictions_per_seq, + initializer=None, + use_next_sentence_label=True, + return_core_pretrainer_model=False): + """Returns model to be used for pre-training. + + Args: + bert_config: Configuration that defines the core BERT model. + seq_length: Maximum sequence length of the training data. + max_predictions_per_seq: Maximum number of tokens in sequence to mask out + and use for pretraining. + initializer: Initializer for weights in BertPretrainer. + use_next_sentence_label: Whether to use the next sentence label. + return_core_pretrainer_model: Whether to also return the `BertPretrainer` + object. + + Returns: + A Tuple of (1) Pretraining model, (2) core BERT submodel from which to + save weights after pretraining, and (3) optional core `BertPretrainer` + object if argument `return_core_pretrainer_model` is True. 
+ """ + input_word_ids = tf.keras.layers.Input( + shape=(seq_length,), name='input_word_ids', dtype=tf.int32) + input_mask = tf.keras.layers.Input( + shape=(seq_length,), name='input_mask', dtype=tf.int32) + input_type_ids = tf.keras.layers.Input( + shape=(seq_length,), name='input_type_ids', dtype=tf.int32) + masked_lm_positions = tf.keras.layers.Input( + shape=(max_predictions_per_seq,), + name='masked_lm_positions', + dtype=tf.int32) + masked_lm_ids = tf.keras.layers.Input( + shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32) + masked_lm_weights = tf.keras.layers.Input( + shape=(max_predictions_per_seq,), + name='masked_lm_weights', + dtype=tf.int32) + + if use_next_sentence_label: + next_sentence_labels = tf.keras.layers.Input( + shape=(1,), name='next_sentence_labels', dtype=tf.int32) + else: + next_sentence_labels = None + + transformer_encoder = get_transformer_encoder(bert_config, seq_length) + if initializer is None: + initializer = tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range) + pretrainer_model = models.BertPretrainer( + network=transformer_encoder, + embedding_table=transformer_encoder.get_embedding_table(), + num_classes=2, # The next sentence prediction label has two classes. + activation=tf_utils.get_activation(bert_config.hidden_act), + num_token_predictions=max_predictions_per_seq, + initializer=initializer, + output='logits') + + outputs = pretrainer_model( + [input_word_ids, input_mask, input_type_ids, masked_lm_positions]) + lm_output = outputs['masked_lm'] + sentence_output = outputs['classification'] + pretrain_loss_layer = BertPretrainLossAndMetricLayer( + vocab_size=bert_config.vocab_size) + output_loss = pretrain_loss_layer(lm_output, sentence_output, masked_lm_ids, + masked_lm_weights, next_sentence_labels) + inputs = { + 'input_word_ids': input_word_ids, + 'input_mask': input_mask, + 'input_type_ids': input_type_ids, + 'masked_lm_positions': masked_lm_positions, + 'masked_lm_ids': masked_lm_ids, + 'masked_lm_weights': masked_lm_weights, + } + if use_next_sentence_label: + inputs['next_sentence_labels'] = next_sentence_labels + + keras_model = tf.keras.Model(inputs=inputs, outputs=output_loss) + if return_core_pretrainer_model: + return keras_model, transformer_encoder, pretrainer_model + else: + return keras_model, transformer_encoder + + +def squad_model(bert_config, + max_seq_length, + initializer=None, + hub_module_url=None, + hub_module_trainable=True): + """Returns BERT Squad model along with core BERT model to import weights. + + Args: + bert_config: BertConfig, the config defines the core Bert model. + max_seq_length: integer, the maximum input sequence length. + initializer: Initializer for the final dense layer in the span labeler. + Defaulted to TruncatedNormal initializer. + hub_module_url: TF-Hub path/url to Bert module. + hub_module_trainable: True to finetune layers in the hub module. + + Returns: + A tuple of (1) keras model that outputs start logits and end logits and + (2) the core BERT transformer encoder. 
+ """ + if initializer is None: + initializer = tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range) + if not hub_module_url: + bert_encoder = get_transformer_encoder(bert_config, max_seq_length) + return models.BertSpanLabeler( + network=bert_encoder, initializer=initializer), bert_encoder + + input_word_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids') + input_mask = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_mask') + input_type_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids') + core_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable) + pooled_output, sequence_output = core_model( + [input_word_ids, input_mask, input_type_ids]) + bert_encoder = tf.keras.Model( + inputs={ + 'input_word_ids': input_word_ids, + 'input_mask': input_mask, + 'input_type_ids': input_type_ids, + }, + outputs=[sequence_output, pooled_output], + name='core_model') + return models.BertSpanLabeler( + network=bert_encoder, initializer=initializer), bert_encoder + + +def classifier_model(bert_config, + num_labels, + max_seq_length=None, + final_layer_initializer=None, + hub_module_url=None, + hub_module_trainable=True): + """BERT classifier model in functional API style. + + Construct a Keras model for predicting `num_labels` outputs from an input with + maximum sequence length `max_seq_length`. + + Args: + bert_config: BertConfig or AlbertConfig, the config defines the core BERT or + ALBERT model. + num_labels: integer, the number of classes. + max_seq_length: integer, the maximum input sequence length. + final_layer_initializer: Initializer for final dense layer. Defaulted + TruncatedNormal initializer. + hub_module_url: TF-Hub path/url to Bert module. + hub_module_trainable: True to finetune layers in the hub module. 
+ + Returns: + Combined prediction model (words, mask, type) -> (one-hot labels) + BERT sub-model (words, mask, type) -> (bert_outputs) + """ + if final_layer_initializer is not None: + initializer = final_layer_initializer + else: + initializer = tf.keras.initializers.TruncatedNormal( + stddev=bert_config.initializer_range) + + if not hub_module_url: + bert_encoder = get_transformer_encoder( + bert_config, max_seq_length, output_range=1) + return models.BertClassifier( + bert_encoder, + num_classes=num_labels, + dropout_rate=bert_config.hidden_dropout_prob, + initializer=initializer), bert_encoder + + input_word_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids') + input_mask = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_mask') + input_type_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids') + bert_model = hub.KerasLayer(hub_module_url, trainable=hub_module_trainable) + pooled_output, _ = bert_model([input_word_ids, input_mask, input_type_ids]) + output = tf.keras.layers.Dropout(rate=bert_config.hidden_dropout_prob)( + pooled_output) + + output = tf.keras.layers.Dense( + num_labels, kernel_initializer=initializer, name='output')( + output) + return tf.keras.Model( + inputs={ + 'input_word_ids': input_word_ids, + 'input_mask': input_mask, + 'input_type_ids': input_type_ids + }, + outputs=output), bert_model diff --git a/nlp/text_classification/bert/tensorflow2.0/common/__init__.py b/nlp/text_classification/bert/tensorflow2.0/common/__init__.py new file mode 100644 index 000000000..a25710c22 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/common/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + diff --git a/nlp/text_classification/bert/tensorflow2.0/common/dataset_fn.py b/nlp/text_classification/bert/tensorflow2.0/common/dataset_fn.py new file mode 100644 index 000000000..4ac16a31b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/common/dataset_fn.py @@ -0,0 +1,42 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility library for picking an appropriate dataset function.""" + +from typing import Any, Callable, Union, Type + +import tensorflow as tf + +PossibleDatasetType = Union[Type[tf.data.Dataset], Callable[[tf.Tensor], Any]] + + +def pick_dataset_fn(file_type: str) -> PossibleDatasetType: + if file_type == 'tfrecord': + return tf.data.TFRecordDataset + + raise ValueError('Unrecognized file_type: {}'.format(file_type)) diff --git a/nlp/text_classification/bert/tensorflow2.0/common/distribute_utils.py b/nlp/text_classification/bert/tensorflow2.0/common/distribute_utils.py new file mode 100644 index 000000000..e2e05df9f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/common/distribute_utils.py @@ -0,0 +1,233 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper functions for running models in a distributed setting.""" + +import json +import os +import tensorflow as tf + + +def _collective_communication(all_reduce_alg): + """Return a CollectiveCommunication based on all_reduce_alg. + + Args: + all_reduce_alg: a string specifying which collective communication to pick, + or None. + + Returns: + tf.distribute.experimental.CollectiveCommunication object + + Raises: + ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"] + """ + collective_communication_options = { + None: tf.distribute.experimental.CollectiveCommunication.AUTO, + "ring": tf.distribute.experimental.CollectiveCommunication.RING, + "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL + } + if all_reduce_alg not in collective_communication_options: + raise ValueError( + "When used with `multi_worker_mirrored`, valid values for " + "all_reduce_alg are [`ring`, `nccl`]. Supplied value: {}".format( + all_reduce_alg)) + return collective_communication_options[all_reduce_alg] + + +def _mirrored_cross_device_ops(all_reduce_alg, num_packs): + """Return a CrossDeviceOps based on all_reduce_alg and num_packs. + + Args: + all_reduce_alg: a string specifying which cross device op to pick, or None. + num_packs: an integer specifying number of packs for the cross device op. + + Returns: + tf.distribute.CrossDeviceOps object or None. + + Raises: + ValueError: if `all_reduce_alg` not in [None, "nccl", "hierarchical_copy"]. 
+ """ + if all_reduce_alg is None: + return None + mirrored_all_reduce_options = { + "nccl": tf.distribute.NcclAllReduce, + "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce + } + if all_reduce_alg not in mirrored_all_reduce_options: + raise ValueError( + "When used with `mirrored`, valid values for all_reduce_alg are " + "[`nccl`, `hierarchical_copy`]. Supplied value: {}".format( + all_reduce_alg)) + cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg] + return cross_device_ops_class(num_packs=num_packs) + + +def tpu_initialize(tpu_address): + """Initializes TPU for TF 2.x training. + + Args: + tpu_address: string, bns address of master TPU worker. + + Returns: + A TPUClusterResolver. + """ + cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver( + tpu=tpu_address) + if tpu_address not in ("", "local"): + tf.config.experimental_connect_to_cluster(cluster_resolver) + tf.tpu.experimental.initialize_tpu_system(cluster_resolver) + return cluster_resolver + + +def get_distribution_strategy(distribution_strategy="mirrored", + num_gpus=0, + all_reduce_alg=None, + num_packs=1, + tpu_address=None, + **kwargs): + """Return a DistributionStrategy for running the model. + + Args: + distribution_strategy: a string specifying which distribution strategy to + use. Accepted values are "off", "one_device", "mirrored", + "parameter_server", "multi_worker_mirrored", and "tpu" -- case + insensitive. "tpu" means to use TPUStrategy using `tpu_address`. + "off" means to use the default strategy which is obtained from + tf.distribute.get_strategy (for details on the default strategy, see + https://www.tensorflow.org/guide/distributed_training#default_strategy). + num_gpus: Number of GPUs to run this model. + all_reduce_alg: Optional. Specifies which algorithm to use when performing + all-reduce. For `MirroredStrategy`, valid values are "nccl" and + "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are + "ring" and "nccl". If None, DistributionStrategy will choose based on + device topology. + num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce` + or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`. + tpu_address: Optional. String that represents TPU to connect to. Must not be + None if `distribution_strategy` is set to `tpu`. + **kwargs: Additional kwargs for internal usages. + + Returns: + tf.distribute.DistibutionStrategy object. + Raises: + ValueError: if `distribution_strategy` is "off" or "one_device" and + `num_gpus` is larger than 1; or `num_gpus` is negative or if + `distribution_strategy` is `tpu` but `tpu_address` is not specified. + """ + del kwargs + if num_gpus < 0: + raise ValueError("`num_gpus` can not be negative.") + + if not isinstance(distribution_strategy, str): + msg = ("distribution_strategy must be a string but got: %s." % + (distribution_strategy,)) + if distribution_strategy == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison + msg += (" If you meant to pass the string 'off', make sure you add " + "quotes around 'off' so that yaml interprets it as a string " + "instead of a bool.") + raise ValueError(msg) + + distribution_strategy = distribution_strategy.lower() + if distribution_strategy == "off": + if num_gpus > 1: + raise ValueError("When {} GPUs are specified, distribution_strategy " + "flag cannot be set to `off`.".format(num_gpus)) + # Return the default distribution strategy. 
+ return tf.distribute.get_strategy() + + if distribution_strategy == "tpu": + # When tpu_address is an empty string, we communicate with local TPUs. + cluster_resolver = tpu_initialize(tpu_address) + return tf.distribute.TPUStrategy(cluster_resolver) + + if distribution_strategy == "multi_worker_mirrored": + return tf.distribute.experimental.MultiWorkerMirroredStrategy( + communication=_collective_communication(all_reduce_alg)) + + if distribution_strategy == "one_device": + if num_gpus == 0: + return tf.distribute.OneDeviceStrategy("device:CPU:0") + if num_gpus > 1: + raise ValueError("`OneDeviceStrategy` can not be used for more than " + "one device.") + return tf.distribute.OneDeviceStrategy("device:GPU:0") + + if distribution_strategy == "mirrored": + if num_gpus == 0: + devices = ["device:CPU:0"] + else: + devices = ["device:GPU:%d" % i for i in range(num_gpus)] + return tf.distribute.MirroredStrategy( + devices=devices, + cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs)) + + if distribution_strategy == "parameter_server": + cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() + return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) + + raise ValueError("Unrecognized Distribution Strategy: %r" % + distribution_strategy) + + +def configure_cluster(worker_hosts=None, task_index=-1): + """Set multi-worker cluster spec in TF_CONFIG environment variable. + + Args: + worker_hosts: comma-separated list of worker ip:port pairs. + task_index: index of the worker. + + Returns: + Number of workers in the cluster. + """ + tf_config = json.loads(os.environ.get("TF_CONFIG", "{}")) + if tf_config: + num_workers = ( + len(tf_config["cluster"].get("chief", [])) + + len(tf_config["cluster"].get("worker", []))) + elif worker_hosts: + workers = worker_hosts.split(",") + num_workers = len(workers) + if num_workers > 1 and task_index < 0: + raise ValueError("Must specify task_index when number of workers > 1") + task_index = 0 if num_workers == 1 else task_index + os.environ["TF_CONFIG"] = json.dumps({ + "cluster": { + "worker": workers + }, + "task": { + "type": "worker", + "index": task_index + } + }) + else: + num_workers = 1 + return num_workers + + +def get_strategy_scope(strategy): + if strategy: + strategy_scope = strategy.scope() + else: + strategy_scope = DummyContextManager() + + return strategy_scope + + +class DummyContextManager(object): + + def __enter__(self): + pass + + def __exit__(self, *args): + pass diff --git a/nlp/text_classification/bert/tensorflow2.0/common/flags.py b/nlp/text_classification/bert/tensorflow2.0/common/flags.py new file mode 100644 index 000000000..7c2b87c66 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/common/flags.py @@ -0,0 +1,110 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""The central place to define flags.""" + +from absl import flags + + +def define_flags(): + """Defines flags. 
+ + All flags are defined as optional, but in practice most models use some of + these flags and so mark_flags_as_required() should be called after calling + this function. Typically, 'experiment', 'mode', and 'model_dir' are required. + For example: + + ``` + from absl import flags + from common import flags as tfm_flags # pylint: disable=line-too-long + ... + tfm_flags.define_flags() + flags.mark_flags_as_required(['experiment', 'mode', 'model_dir']) + ``` + + The reason all flags are optional is because unit tests often do not set or + use any of the flags. + """ + flags.DEFINE_string( + 'experiment', default=None, help= + 'The experiment type registered, specifying an ExperimentConfig.') + + flags.DEFINE_enum( + 'mode', + default=None, + enum_values=[ + 'train', 'eval', 'train_and_eval', 'continuous_eval', + 'continuous_train_and_eval', 'train_and_validate' + ], + help='Mode to run: `train`, `eval`, `train_and_eval`, ' + '`continuous_eval`, `continuous_train_and_eval` and ' + '`train_and_validate` (which is not implemented in ' + 'the open source version).') + + flags.DEFINE_string( + 'model_dir', + default=None, + help='The directory where the model and training/evaluation summaries' + 'are stored.') + + flags.DEFINE_multi_string( + 'config_file', + default=None, + help='YAML/JSON files which specifies overrides. The override order ' + 'follows the order of args. Note that each file ' + 'can be used as an override template to override the default parameters ' + 'specified in Python. If the same parameter is specified in both ' + '`--config_file` and `--params_override`, `config_file` will be used ' + 'first, followed by params_override.') + + flags.DEFINE_string( + 'params_override', + default=None, + help='a YAML/JSON string or a YAML file which specifies additional ' + 'overrides over the default parameters and those specified in ' + '`--config_file`. Note that this is supposed to be used only to override ' + 'the model parameters, but not the parameters like TPU specific flags. ' + 'One canonical use case of `--config_file` and `--params_override` is ' + 'users first define a template config file using `--config_file`, then ' + 'use `--params_override` to adjust the minimal set of tuning parameters, ' + 'for example setting up different `train_batch_size`. The final override ' + 'order of parameters: default_model_params --> params from config_file ' + '--> params in params_override. See also the help message of ' + '`--config_file`.') + + # The libraries rely on gin often make mistakes that include flags inside + # the library files which causes conflicts. + try: + flags.DEFINE_multi_string( + 'gin_file', default=None, help='List of paths to the config files.') + except flags.DuplicateFlagError: + pass + + try: + flags.DEFINE_multi_string( + 'gin_params', + default=None, + help='Newline separated list of Gin parameter bindings.') + except flags.DuplicateFlagError: + pass + + flags.DEFINE_string( + 'tpu', + default=None, + help='The Cloud TPU to use for training. 
This should be either the name ' + 'used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 ' + 'url.') + + flags.DEFINE_string( + 'tf_data_service', default=None, help='The tf.data service address') diff --git a/nlp/text_classification/bert/tensorflow2.0/common/registry_imports.py b/nlp/text_classification/bert/tensorflow2.0/common/registry_imports.py new file mode 100644 index 000000000..88d59a422 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/common/registry_imports.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""All necessary imports for registration.""" +# pylint: disable=unused-import +import tasks +from nlp_configs import experiment_configs +from utils.testing import mock_task + \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/common_flags.py b/nlp/text_classification/bert/tensorflow2.0/common_flags.py new file mode 100644 index 000000000..47bf32003 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/common_flags.py @@ -0,0 +1,129 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defining common flags used across all BERT models/applications.""" + +from absl import flags +import tensorflow as tf + +from utils import hyperparams_flags +from utils.flags import core as flags_core + + +def define_common_bert_flags(): + """Define common flags for BERT tasks.""" + flags_core.define_base( + data_dir=False, + model_dir=True, + clean=False, + train_epochs=False, + epochs_between_evals=False, + stop_threshold=False, + batch_size=False, + num_gpu=True, + export_dir=False, + distribution_strategy=True, + run_eagerly=True) + flags_core.define_distribution() + flags.DEFINE_string('bert_config_file', None, + 'Bert configuration file to define core bert layers.') + flags.DEFINE_string( + 'model_export_path', None, + 'Path to the directory, where trainined model will be ' + 'exported.') + flags.DEFINE_string('tpu', '', 'TPU address to connect to.') + flags.DEFINE_string( + 'init_checkpoint', None, + 'Initial checkpoint (usually from a pre-trained BERT model).') + flags.DEFINE_integer('num_train_epochs', 3, + 'Total number of training epochs to perform.') + flags.DEFINE_integer( + 'steps_per_loop', None, + 'Number of steps per graph-mode loop. Only training step ' + 'happens inside the loop. Callbacks will not be called ' + 'inside. 
If not set the value will be configured depending on the ' + 'devices available.') + flags.DEFINE_float('learning_rate', 5e-5, + 'The initial learning rate for Adam.') + flags.DEFINE_float('end_lr', 0.0, + 'The end learning rate for learning rate decay.') + flags.DEFINE_string('optimizer_type', 'adamw', + 'The type of optimizer to use for training (adamw|lamb)') + flags.DEFINE_boolean( + 'scale_loss', False, + 'Whether to divide the loss by number of replica inside the per-replica ' + 'loss function.') + flags.DEFINE_boolean( + 'use_keras_compile_fit', False, + 'If True, uses Keras compile/fit() API for training logic. Otherwise ' + 'use custom training loop.') + flags.DEFINE_string( + 'hub_module_url', None, 'TF-Hub path/url to Bert module. ' + 'If specified, init_checkpoint flag should not be used.') + flags.DEFINE_bool('hub_module_trainable', True, + 'True to make keras layers in the hub module trainable.') + flags.DEFINE_string( + 'sub_model_export_name', None, + 'If set, `sub_model` checkpoints are exported into ' + 'FLAGS.model_dir/FLAGS.sub_model_export_name.') + flags.DEFINE_bool('explicit_allreduce', False, + 'True to use explicit allreduce instead of the implicit ' + 'allreduce in optimizer.apply_gradients(). If fp16 mixed ' + 'precision training is used, this also enables allreduce ' + 'gradients in fp16.') + flags.DEFINE_integer('allreduce_bytes_per_pack', 0, + 'Number of bytes of a gradient pack for allreduce. ' + 'Should be positive integer, if set to 0, all ' + 'gradients are in one pack. Breaking gradient into ' + 'packs could enable overlap between allreduce and ' + 'backprop computation. This flag only takes effect ' + 'when explicit_allreduce is set to True.') + + flags_core.define_log_steps() + + # Adds flags for mixed precision and multi-worker training. + flags_core.define_performance( + num_parallel_calls=False, + inter_op=False, + intra_op=False, + synthetic_data=False, + max_train_steps=False, + dtype=True, + loss_scale=True, + all_reduce_alg=True, + num_packs=False, + tf_gpu_thread_mode=True, + datasets_num_private_threads=True, + enable_xla=True, + fp16_implementation=True, + ) + + # Adds gin configuration flags. + hyperparams_flags.define_gin_flags() + + +def dtype(): + return flags_core.get_tf_dtype(flags.FLAGS) + + +def use_float16(): + return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16 + + +def use_graph_rewrite(): + return flags.FLAGS.fp16_implementation == 'graph_rewrite' + + +def get_loss_scale(): + return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic') diff --git a/nlp/text_classification/bert/tensorflow2.0/configs.py b/nlp/text_classification/bert/tensorflow2.0/configs.py new file mode 100644 index 000000000..950c32d0b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/configs.py @@ -0,0 +1,104 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""The main BERT model and related functions.""" + +import copy +import json + +import six +import tensorflow as tf + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + embedding_size=None, + backward_compatible=True): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + embedding_size: (Optional) width of the factorized word embeddings. + backward_compatible: Boolean, whether the variables shape are compatible + with checkpoints converted from TF 1.x BERT. 
+ """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.embedding_size = embedding_size + self.backward_compatible = backward_compatible + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.io.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" diff --git a/nlp/text_classification/bert/tensorflow2.0/core/config_definitions.py b/nlp/text_classification/bert/tensorflow2.0/core/config_definitions.py new file mode 100644 index 000000000..434058edd --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/core/config_definitions.py @@ -0,0 +1,252 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common configuration settings.""" + +from typing import Optional, Sequence, Union + +import dataclasses + +from modeling.hyperparams import base_config +from modeling.optimization.configs import optimization_config + +OptimizationConfig = optimization_config.OptimizationConfig + + +@dataclasses.dataclass +class DataConfig(base_config.Config): + """The base configuration for building datasets. + + Attributes: + input_path: The path to the input. It can be either (1) a str indicating a + file path/pattern, or (2) a str indicating multiple file paths/patterns + separated by comma (e.g "a, b, c" or no spaces "a,b,c"), or (3) a list of + str, each of which is a file path/pattern or multiple file paths/patterns + separated by comma, or (4) a dictionary of the previous three approaches + for more advanced data mixing using named access. It should not be + specified when the following `tfds_name` is specified. + tfds_name: The name of the tensorflow dataset (TFDS). It should not be + specified when the above `input_path` is specified. + tfds_split: A str indicating which split of the data to load from TFDS. It + is required when above `tfds_name` is specified. + global_batch_size: The global batch size across all replicas. 
+ is_training: Whether this data is used for training or not. + drop_remainder: Whether the last batch should be dropped in the case it has + fewer than `global_batch_size` elements. + shuffle_buffer_size: The buffer size used for shuffling training data. + cache: Whether to cache dataset examples. If `True`, we will cache the + dataset after applying the decode_fn and parse_fn. It can be used to avoid + re-reading from disk, re-decoding and re-parsing the example on the second + epoch, but it requires significant memory overhead. + cycle_length: The number of files that will be processed concurrently when + interleaving files. + block_length: The number of consecutive elements to produce from each input + element before cycling to another input element when interleaving files. + deterministic: A boolean controlling whether determinism should be enforced. + sharding: Whether sharding is used in the input pipeline. + enable_tf_data_service: A boolean indicating whether to enable tf.data + service for the input pipeline. + tf_data_service_address: The URI of a tf.data service to offload + preprocessing onto during training. The URI should be in the format + "protocol://address", e.g. "grpc://tf-data-service:5050". It can be + overridden by `FLAGS.tf_data_service` flag in the binary. + tf_data_service_job_name: The name of the tf.data service job. This argument + makes it possible for multiple datasets to share the same job. The default + behavior is that the dataset creates anonymous, exclusively owned jobs. + tfds_data_dir: A str specifying the directory to read/write TFDS data. + tfds_as_supervised: A bool. When loading dataset from TFDS, if True, the + returned tf.data.Dataset will have a 2-tuple structure (input, label) + according to builder.info.supervised_keys; if False, the default, the + returned tf.data.Dataset will have a dictionary with all the features. + tfds_skip_decoding_feature: A str to indicate which features are skipped for + decoding when loading dataset from TFDS. Use comma to separate multiple + features. The main use case is to skip the image/video decoding for better + performance. + seed: An optional seed to use for deterministic shuffling/preprocessing. + """ + input_path: Union[Sequence[str], str, base_config.Config] = "" + tfds_name: str = "" + tfds_split: str = "" + global_batch_size: int = 0 + is_training: bool = None + drop_remainder: bool = True + shuffle_buffer_size: int = 100 + cache: bool = False + cycle_length: Optional[int] = None + block_length: int = 1 + deterministic: Optional[bool] = None + sharding: bool = True + enable_tf_data_service: bool = False + tf_data_service_address: Optional[str] = None + tf_data_service_job_name: Optional[str] = None + tfds_data_dir: str = "" + tfds_as_supervised: bool = False + tfds_skip_decoding_feature: str = "" + seed: Optional[int] = None + + +@dataclasses.dataclass +class RuntimeConfig(base_config.Config): + """High-level configurations for Runtime. + + These include parameters that are not directly related to the experiment, + e.g. directories, accelerator type, etc. + + Attributes: + distribution_strategy: e.g. 'mirrored', 'tpu', etc. + enable_xla: Whether or not to enable XLA. + per_gpu_thread_count: thread count per GPU. + gpu_thread_mode: Whether and how the GPU device uses its own threadpool. + dataset_num_private_threads: Number of threads for a private threadpool + created for all datasets computation. + tpu: The address of the TPU to use, if any. + num_gpus: The number of GPUs to use, if any. 
+ worker_hosts: comma-separated list of worker ip:port pairs for running + multi-worker models with DistributionStrategy. + task_index: If multi-worker training, the task index of this worker. + all_reduce_alg: Defines the algorithm for performing all-reduce. + num_packs: Sets `num_packs` in the cross device ops used in + MirroredStrategy. For details, see tf.distribute.NcclAllReduce. + mixed_precision_dtype: dtype of mixed precision policy. It can be 'float32', + 'float16', or 'bfloat16'. + loss_scale: The type of loss scale, or 'float' value. This is used when + setting the mixed precision policy. + run_eagerly: Whether or not to run the experiment eagerly. + batchnorm_spatial_persistent: Whether or not to enable the spatial + persistent mode for CuDNN batch norm kernel for improved GPU performance. + """ + distribution_strategy: str = "mirrored" + enable_xla: bool = False + gpu_thread_mode: Optional[str] = None + dataset_num_private_threads: Optional[int] = None + per_gpu_thread_count: int = 0 + tpu: Optional[str] = None + num_gpus: int = 0 + worker_hosts: Optional[str] = None + task_index: int = -1 + all_reduce_alg: Optional[str] = None + num_packs: int = 1 + mixed_precision_dtype: Optional[str] = None + loss_scale: Optional[Union[str, float]] = None + run_eagerly: bool = False + batchnorm_spatial_persistent: bool = False + + # XLA runtime params. + # XLA params are only applied to the train_step. + # These augments can improve training speed. They can also improve eval, but + # may reduce usability and users would need to make changes to code. + + # Whether to enable XLA dynamic padder + # infrastructure to handle dynamic shapes inputs inside XLA. True by + # default. Disabling this may cause correctness issues with dynamic shapes + # inputs, as XLA will just assume the inputs are with padded shapes. However + # users can optionally set it to False to improve device time if masking is + # already handled in the user side. + # If None, will respect XLA default. + tpu_enable_xla_dynamic_padder: Optional[bool] = None + + # Global model parallelism configurations. + num_cores_per_replica: int = 1 + default_shard_dim: int = -1 + + def model_parallelism(self): + return dict( + num_cores_per_replica=self.num_cores_per_replica, + default_shard_dim=self.default_shard_dim) + + +@dataclasses.dataclass +class TrainerConfig(base_config.Config): + """Configuration for trainer. + + Attributes: + optimizer_config: optimizer config, it includes optimizer, learning rate, + and warmup schedule configs. + train_tf_while_loop: whether or not to use tf while loop. + train_tf_function: whether or not to use tf_function for training loop. + eval_tf_function: whether or not to use tf_function for eval. + allow_tpu_summary: Whether to allow summary happen inside the XLA program + runs on TPU through automatic outside compilation. + steps_per_loop: number of steps per loop. + summary_interval: number of steps between each summary. + checkpoint_interval: number of steps between checkpoints. + max_to_keep: max checkpoints to keep. + continuous_eval_timeout: maximum number of seconds to wait between + checkpoints, if set to None, continuous eval will wait indefinitely. This + is only used continuous_train_and_eval and continuous_eval modes. Default + value is 1 hrs. + train_steps: number of train steps. + validation_steps: number of eval steps. If `None`, the entire eval dataset + is used. + validation_interval: number of training steps to run between evaluations. 
+ best_checkpoint_export_subdir: if set, the trainer will keep track of the + best evaluation metric, and export the corresponding best checkpoint under + `model_dir/best_checkpoint_export_subdir`. Note that this only works if + mode contains eval (such as `train_and_eval`, `continuous_eval`, and + `continuous_train_and_eval`). + best_checkpoint_eval_metric: for exporting the best checkpoint, which + evaluation metric the trainer should monitor. This can be any evaluation + metric appears on tensorboard. + best_checkpoint_metric_comp: for exporting the best checkpoint, how the + trainer should compare the evaluation metrics. This can be either `higher` + (higher the better) or `lower` (lower the better). + validation_summary_subdir: A 'str', sub directory for saving eval summary. + """ + optimizer_config: OptimizationConfig = OptimizationConfig() + # Orbit settings. + train_tf_while_loop: bool = True + train_tf_function: bool = True + eval_tf_function: bool = True + eval_tf_while_loop: bool = False + allow_tpu_summary: bool = False + # Trainer intervals. + steps_per_loop: int = 1000 + summary_interval: int = 1000 + checkpoint_interval: int = 1000 + # Checkpoint manager. + max_to_keep: int = 5 + continuous_eval_timeout: int = 60 * 60 + # Train/Eval routines. + train_steps: int = 0 + # Sets validation steps to be -1 to evaluate the entire dataset. + validation_steps: int = -1 + validation_interval: int = 1000 + # Best checkpoint export. + best_checkpoint_export_subdir: str = "" + best_checkpoint_eval_metric: str = "" + best_checkpoint_metric_comp: str = "higher" + # Blowup recovery. + loss_upper_bound: float = 1e6 + recovery_begin_steps: int = 0 # Enforcing the loss bound after these steps. + # When max trials < 0, no recovery module; max trials = 0, we will check + # the condition and fail the job if the condition happens; max trials > 0, + # we will retore the model states. + recovery_max_trials: int = 0 + validation_summary_subdir: str = "validation" + + +@dataclasses.dataclass +class TaskConfig(base_config.Config): + init_checkpoint: str = "" + model: base_config.Config = None + train_data: DataConfig = DataConfig() + validation_data: DataConfig = DataConfig() + + +@dataclasses.dataclass +class ExperimentConfig(base_config.Config): + """Top-level configuration.""" + task: TaskConfig = TaskConfig() + trainer: TrainerConfig = TrainerConfig() + runtime: RuntimeConfig = RuntimeConfig() diff --git a/nlp/text_classification/bert/tensorflow2.0/data/__init__.py b/nlp/text_classification/bert/tensorflow2.0/data/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/data/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
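The config dataclasses above are mostly plumbing, but the nesting (experiment -> task/trainer/runtime -> data) is easy to lose track of. Below is a minimal, self-contained sketch of that shape using plain `dataclasses`; the underscore-prefixed classes and the toy field values are stand-ins of my own, not the repo's `base_config.Config`, which adds its own override machinery on top of this structure.

```python
# Minimal sketch of how the nested configs fit together (plain dataclasses,
# hypothetical _-prefixed names; only a few representative fields are kept).
import dataclasses

@dataclasses.dataclass
class _Data:
    input_path: str = ""
    global_batch_size: int = 0
    is_training: bool = None

@dataclasses.dataclass
class _Trainer:
    train_steps: int = 0
    steps_per_loop: int = 1000
    checkpoint_interval: int = 1000

@dataclasses.dataclass
class _Runtime:
    distribution_strategy: str = "mirrored"
    num_gpus: int = 0
    mixed_precision_dtype: str = None

@dataclasses.dataclass
class _Experiment:
    train_data: _Data = dataclasses.field(default_factory=_Data)
    trainer: _Trainer = dataclasses.field(default_factory=_Trainer)
    runtime: _Runtime = dataclasses.field(default_factory=_Runtime)

if __name__ == "__main__":
    cfg = _Experiment(
        train_data=_Data(input_path="train.tf_record",
                         global_batch_size=32, is_training=True),
        trainer=_Trainer(train_steps=1000),
        runtime=_Runtime(num_gpus=1, mixed_precision_dtype="float16"),
    )
    print(dataclasses.asdict(cfg))  # nested dict mirroring the config tree
```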
+ diff --git a/nlp/text_classification/bert/tensorflow2.0/data/squad_lib.py b/nlp/text_classification/bert/tensorflow2.0/data/squad_lib.py new file mode 100644 index 000000000..d529080fe --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/data/squad_lib.py @@ -0,0 +1,975 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library to process data for SQuAD 1.1 and SQuAD 2.0.""" +# pylint: disable=g-bad-import-order +import collections +import copy +import json +import math +import os + +import six + +from absl import logging +import tensorflow as tf + +import tokenization + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + + Attributes: + qas_id: ID of the question-answer pair. + question_text: Original text for the question. + doc_tokens: The list of tokens in the context obtained by splitting on + whitespace only. + orig_answer_text: Original text for the answer. + start_position: Starting index of the answer in `doc_tokens`. + end_position: Ending index of the answer in `doc_tokens`. + is_impossible: Whether the question is impossible to answer given the + context. Only used in SQuAD 2.0. 
+ """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + paragraph_mask=None, + class_index=None, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + self.paragraph_mask = paragraph_mask + self.class_index = class_index + + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + tf.io.gfile.makedirs(os.path.dirname(filename)) + self._writer = tf.io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + + if feature.paragraph_mask is not None: + features["paragraph_mask"] = create_int_feature(feature.paragraph_mask) + if feature.class_index is not None: + features["class_index"] = create_int_feature([feature.class_index]) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + +def read_squad_examples(input_file, is_training, + version_2_with_negative, + translated_input_folder=None): + """Read a SQuAD json file into a list of 
SquadExample.""" + with tf.io.gfile.GFile(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + if translated_input_folder is not None: + translated_files = tf.io.gfile.glob( + os.path.join(translated_input_folder, "*.json")) + for file in translated_files: + with tf.io.gfile.GFile(file, "r") as reader: + input_data.extend(json.load(reader)["data"]) + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join(doc_tokens[start_position:(end_position + + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logging.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + output_fn, + xlnet_format=False, + batch_size=None): + """Loads a data file into a list of `InputBatch`s.""" + + base_id = 1000000000 + unique_id = base_id + feature = None + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + + # Paragraph mask used in XLNet. + # 1 represents paragraph and class tokens. + # 0 represents query and other special tokens. 
+ paragraph_mask = [] + + # pylint: disable=cell-var-from-loop + def process_query(seg_q): + for token in query_tokens: + tokens.append(token) + segment_ids.append(seg_q) + paragraph_mask.append(0) + tokens.append("[SEP]") + segment_ids.append(seg_q) + paragraph_mask.append(0) + + def process_paragraph(seg_p): + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(seg_p) + paragraph_mask.append(1) + tokens.append("[SEP]") + segment_ids.append(seg_p) + paragraph_mask.append(0) + + def process_class(seg_class): + class_index = len(segment_ids) + tokens.append("[CLS]") + segment_ids.append(seg_class) + paragraph_mask.append(1) + return class_index + + if xlnet_format: + seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3 + process_paragraph(seg_p) + process_query(seg_q) + class_index = process_class(seg_class) + else: + seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0 + class_index = process_class(seg_class) + process_query(seg_q) + process_paragraph(seg_p) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(seg_pad) + paragraph_mask.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(paragraph_mask) == max_seq_length + + start_position = 0 + end_position = 0 + span_contains_answer = False + + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
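A toy packing for the non-XLNet branch just defined ([CLS] query [SEP] paragraph [SEP]): the tokens and lengths below are invented, but the segment ids, `paragraph_mask`, and `doc_offset` follow the helpers above.

```python
# Toy packing for the standard BERT ordering: [CLS] query [SEP] paragraph [SEP].
query = ["what", "year", "?"]
paragraph = ["the", "year", "was", "1895", "."]

tokens = ["[CLS]"] + query + ["[SEP]"] + paragraph + ["[SEP]"]
# Segment 0 for the [CLS]/query half, segment 1 for the paragraph half.
segment_ids = [0] * (len(query) + 2) + [1] * (len(paragraph) + 1)
# [CLS] counts as an answerable position (the "no answer" class token);
# query tokens and [SEP] tokens do not; paragraph tokens do.
paragraph_mask = [1] + [0] * (len(query) + 1) + [1] * len(paragraph) + [0]

doc_offset = len(query) + 2          # index where paragraph token 0 lands
assert tokens[doc_offset] == "the"
assert len(tokens) == len(segment_ids) == len(paragraph_mask)
```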
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + span_contains_answer = (tok_start_position >= doc_start and + tok_end_position <= doc_end) + if span_contains_answer: + doc_offset = 0 if xlnet_format else len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if example_index < 20: + logging.info("*** Example ***") + logging.info("unique_id: %s", (unique_id)) + logging.info("example_index: %s", (example_index)) + logging.info("doc_span_index: %s", (doc_span_index)) + logging.info("tokens: %s", + " ".join([tokenization.printable_text(x) for x in tokens])) + logging.info( + "token_to_orig_map: %s", " ".join([ + "%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map) + ])) + logging.info( + "token_is_max_context: %s", " ".join([ + "%d:%s" % (x, y) + for (x, y) in six.iteritems(token_is_max_context) + ])) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("paragraph_mask: %s", " ".join( + [str(x) for x in paragraph_mask])) + logging.info("class_index: %d", class_index) + if is_training: + if span_contains_answer: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + logging.info("start_position: %d", (start_position)) + logging.info("end_position: %d", (end_position)) + logging.info("answer: %s", tokenization.printable_text(answer_text)) + else: + logging.info("document span doesn't contain answer") + + feature = InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + paragraph_mask=paragraph_mask, + class_index=class_index, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=not span_contains_answer) + + # Run callback + if is_training: + output_fn(feature) + else: + output_fn(feature, is_padding=False) + + unique_id += 1 + + if not is_training and feature: + assert batch_size + num_padding = 0 + num_examples = unique_id - base_id + if unique_id % batch_size != 0: + num_padding = batch_size - (num_examples % batch_size) + logging.info("Adding padding examples to make sure no partial batch.") + logging.info("Adds %d padding examples for inference.", num_padding) + dummy_feature = copy.deepcopy(feature) + for _ in range(num_padding): + dummy_feature.unique_id = unique_id + + # Run callback + output_fn(feature, is_padding=True) + unique_id += 1 + return unique_id - base_id + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. 
Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. + # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def write_predictions(all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + version_2_with_negative=False, + null_score_diff_threshold=0.0, + verbose=False): + """Write final predictions to the json file and log-odds of null if needed.""" + logging.info("Writing predictions to: %s", (output_prediction_file)) + logging.info("Writing nbest to: %s", (output_nbest_file)) + + all_predictions, all_nbest_json, scores_diff_json = ( + postprocess_output( + all_examples=all_examples, + all_features=all_features, + all_results=all_results, + n_best_size=n_best_size, + max_answer_length=max_answer_length, + do_lower_case=do_lower_case, + version_2_with_negative=version_2_with_negative, + null_score_diff_threshold=null_score_diff_threshold, + verbose=verbose)) + + write_to_json_files(all_predictions, output_prediction_file) + write_to_json_files(all_nbest_json, output_nbest_file) + if version_2_with_negative: + write_to_json_files(scores_diff_json, output_null_log_odds_file) + + +def postprocess_output(all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + version_2_with_negative=False, + null_score_diff_threshold=0.0, + xlnet_format=False, + verbose=False): + 
"""Postprocess model output, to form predicton results.""" + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + if feature.unique_id not in unique_id_to_result: + logging.info("Skip eval example %s, not in pred.", feature.unique_id) + continue + result = unique_id_to_result[feature.unique_id] + + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + if xlnet_format: + feature_null_score = result.class_logits + else: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for (start_index, start_logit, + end_index, end_logit) in _get_best_indexes_and_logits( + result=result, + n_best_size=n_best_size, + xlnet_format=xlnet_format): + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=start_logit, + end_logit=end_logit)) + + if version_2_with_negative and not xlnet_format: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0 or xlnet_format: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text( + tok_text, orig_text, do_lower_case, verbose=verbose) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # if we didn't inlude the empty option in the n-best, inlcude it + if version_2_with_negative and not xlnet_format: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
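The de-tokenization step in the non-null branch above is easy to sanity-check by hand; a short standalone example with invented WordPiece tokens:

```python
# WordPiece continuation pieces are marked with "##", so joining on spaces and
# deleting " ##" glues sub-tokens back into whole words before alignment with
# the original text.
tok_tokens = ["john", "smith", "(", "18", "##95", "-", "1943", ")"]
tok_text = " ".join(tok_tokens)
tok_text = tok_text.replace(" ##", "").replace("##", "")
tok_text = " ".join(tok_text.strip().split())
print(tok_text)  # john smith ( 1895 - 1943 )
```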
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # pytype: disable=attribute-error + # predict "" iff the null score - the score of best non-null > threshold + if best_non_null_entry is not None: + if xlnet_format: + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + all_predictions[example.qas_id] = best_non_null_entry.text + else: + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + else: + logging.warning("best_non_null_entry is None") + scores_diff_json[example.qas_id] = score_null + all_predictions[example.qas_id] = "" + # pytype: enable=attribute-error + + all_nbest_json[example.qas_id] = nbest_json + + return all_predictions, all_nbest_json, scores_diff_json + + +def write_to_json_files(json_records, json_file): + with tf.io.gfile.GFile(json_file, "w") as writer: + writer.write(json.dumps(json_records, indent=4) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heruistic between + # `pred_text` and `orig_text` to get a character-to-charcter alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. 
If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose: + logging.info("Unable to find text: '%s' in '%s'", pred_text, orig_text) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose: + logging.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose: + logging.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose: + logging.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes_and_logits(result, + n_best_size, + xlnet_format=False): + """Generates the n-best indexes and logits from a list.""" + if xlnet_format: + for i in range(n_best_size): + for j in range(n_best_size): + j_index = i * n_best_size + j + yield (result.start_indexes[i], result.start_logits[i], + result.end_indexes[j_index], result.end_logits[j_index]) + else: + start_index_and_score = sorted(enumerate(result.start_logits), + key=lambda x: x[1], reverse=True) + end_index_and_score = sorted(enumerate(result.end_logits), + key=lambda x: x[1], reverse=True) + for i in range(len(start_index_and_score)): + if i >= n_best_size: + break + for j in range(len(end_index_and_score)): + if j >= n_best_size: + break + yield (start_index_and_score[i][0], start_index_and_score[i][1], + end_index_and_score[j][0], end_index_and_score[j][1]) + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def generate_tf_record_from_json_file(input_file_path, + vocab_file_path, + output_path, + translated_input_folder=None, + max_seq_length=384, + do_lower_case=True, + max_query_length=64, + doc_stride=128, + version_2_with_negative=False, + xlnet_format=False): + """Generates and saves training data into a tf record file.""" + train_examples = read_squad_examples( + input_file=input_file_path, + is_training=True, + version_2_with_negative=version_2_with_negative, + 
translated_input_folder=translated_input_folder) + tokenizer = tokenization.FullTokenizer( + vocab_file=vocab_file_path, do_lower_case=do_lower_case) + train_writer = FeatureWriter(filename=output_path, is_training=True) + number_of_examples = convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=True, + output_fn=train_writer.process_feature, + xlnet_format=xlnet_format) + train_writer.close() + + meta_data = { + "task_type": "bert_squad", + "train_data_size": number_of_examples, + "max_seq_length": max_seq_length, + "max_query_length": max_query_length, + "doc_stride": doc_stride, + "version_2_with_negative": version_2_with_negative, + } + + return meta_data diff --git a/nlp/text_classification/bert/tensorflow2.0/data/squad_lib_sp.py b/nlp/text_classification/bert/tensorflow2.0/data/squad_lib_sp.py new file mode 100644 index 000000000..61f49a361 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/data/squad_lib_sp.py @@ -0,0 +1,976 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run ALBERT on SQuAD 1.1 and SQuAD 2.0 using sentence piece tokenization. + +The file is forked from: + +https://github.com/google-research/ALBERT/blob/master/run_squad_sp.py +""" +import collections +import copy +import json +import math +import os + +from absl import logging +import numpy as np +import tensorflow as tf + +import tokenization + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. 
+ """ + + def __init__(self, + qas_id, + question_text, + paragraph_text, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.paragraph_text = paragraph_text + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tok_start_to_orig_index, + tok_end_to_orig_index, + token_is_max_context, + tokens, + input_ids, + input_mask, + segment_ids, + paragraph_len, + class_index=None, + paragraph_mask=None, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tok_start_to_orig_index = tok_start_to_orig_index + self.tok_end_to_orig_index = tok_end_to_orig_index + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.paragraph_mask = paragraph_mask + self.segment_ids = segment_ids + self.paragraph_len = paragraph_len + self.class_index = class_index + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, + is_training, + version_2_with_negative, + translated_input_folder=None): + """Read a SQuAD json file into a list of SquadExample.""" + del version_2_with_negative + with tf.io.gfile.GFile(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + if translated_input_folder is not None: + translated_files = tf.io.gfile.glob( + os.path.join(translated_input_folder, "*.json")) + for file in translated_files: + with tf.io.gfile.GFile(file, "r") as reader: + input_data.extend(json.load(reader)["data"]) + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + orig_answer_text = None + is_impossible = False + + if is_training: + is_impossible = qa.get("is_impossible", False) + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + start_position = answer["answer_start"] + else: + start_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + paragraph_text=paragraph_text, + orig_answer_text=orig_answer_text, + start_position=start_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def _convert_index(index, pos, m=None, is_start=True): + """Converts index.""" + if 
index[pos] is not None: + return index[pos] + n = len(index) + rear = pos + while rear < n - 1 and index[rear] is None: + rear += 1 + front = pos + while front > 0 and index[front] is None: + front -= 1 + assert index[front] is not None or index[rear] is not None + if index[front] is None: + if index[rear] >= 1: + if is_start: + return 0 + else: + return index[rear] - 1 + return index[rear] + if index[rear] is None: + if m is not None and index[front] < m - 1: + if is_start: + return index[front] + 1 + else: + return m - 1 + return index[front] + if is_start: + if index[rear] > index[front] + 1: + return index[front] + 1 + else: + return index[rear] + else: + if index[rear] > index[front] + 1: + return index[rear] - 1 + else: + return index[front] + + +def convert_examples_to_features(examples, + tokenizer, + max_seq_length, + doc_stride, + max_query_length, + is_training, + output_fn, + do_lower_case, + xlnet_format=False, + batch_size=None): + """Loads a data file into a list of `InputBatch`s.""" + cnt_pos, cnt_neg = 0, 0 + base_id = 1000000000 + unique_id = base_id + max_n, max_m = 1024, 1024 + f = np.zeros((max_n, max_m), dtype=np.float32) + + for (example_index, example) in enumerate(examples): + + if example_index % 100 == 0: + logging.info("Converting %d/%d pos %d neg %d", example_index, + len(examples), cnt_pos, cnt_neg) + + query_tokens = tokenization.encode_ids( + tokenizer.sp_model, + tokenization.preprocess_text( + example.question_text, lower=do_lower_case)) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + paragraph_text = example.paragraph_text + para_tokens = tokenization.encode_pieces( + tokenizer.sp_model, + tokenization.preprocess_text( + example.paragraph_text, lower=do_lower_case)) + + chartok_to_tok_index = [] + tok_start_to_chartok_index = [] + tok_end_to_chartok_index = [] + char_cnt = 0 + for i, token in enumerate(para_tokens): + new_token = token.replace(tokenization.SPIECE_UNDERLINE, " ") + chartok_to_tok_index.extend([i] * len(new_token)) + tok_start_to_chartok_index.append(char_cnt) + char_cnt += len(new_token) + tok_end_to_chartok_index.append(char_cnt - 1) + + tok_cat_text = "".join(para_tokens).replace(tokenization.SPIECE_UNDERLINE, + " ") + n, m = len(paragraph_text), len(tok_cat_text) + + if n > max_n or m > max_m: + max_n = max(n, max_n) + max_m = max(m, max_m) + f = np.zeros((max_n, max_m), dtype=np.float32) + + g = {} + + # pylint: disable=cell-var-from-loop + def _lcs_match(max_dist, n=n, m=m): + """Longest-common-substring algorithm.""" + f.fill(0) + g.clear() + + ### longest common sub sequence + # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j)) + for i in range(n): + + # unlike standard LCS, this is specifically optimized for the setting + # because the mismatch between sentence pieces and original text will + # be small + for j in range(i - max_dist, i + max_dist): + if j >= m or j < 0: + continue + + if i > 0: + g[(i, j)] = 0 + f[i, j] = f[i - 1, j] + + if j > 0 and f[i, j - 1] > f[i, j]: + g[(i, j)] = 1 + f[i, j] = f[i, j - 1] + + f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0 + if (tokenization.preprocess_text( + paragraph_text[i], lower=do_lower_case, + remove_space=False) == tok_cat_text[j] and f_prev + 1 > f[i, j]): + g[(i, j)] = 2 + f[i, j] = f_prev + 1 + + # pylint: enable=cell-var-from-loop + + max_dist = abs(n - m) + 5 + for _ in range(2): + _lcs_match(max_dist) + if f[n - 1, m - 1] > 0.8 * n: + break + max_dist *= 2 + + orig_to_chartok_index = [None] * n + 
chartok_to_orig_index = [None] * m + i, j = n - 1, m - 1 + while i >= 0 and j >= 0: + if (i, j) not in g: + break + if g[(i, j)] == 2: + orig_to_chartok_index[i] = j + chartok_to_orig_index[j] = i + i, j = i - 1, j - 1 + elif g[(i, j)] == 1: + j = j - 1 + else: + i = i - 1 + + if (all(v is None for v in orig_to_chartok_index) or + f[n - 1, m - 1] < 0.8 * n): + logging.info("MISMATCH DETECTED!") + continue + + tok_start_to_orig_index = [] + tok_end_to_orig_index = [] + for i in range(len(para_tokens)): + start_chartok_pos = tok_start_to_chartok_index[i] + end_chartok_pos = tok_end_to_chartok_index[i] + start_orig_pos = _convert_index( + chartok_to_orig_index, start_chartok_pos, n, is_start=True) + end_orig_pos = _convert_index( + chartok_to_orig_index, end_chartok_pos, n, is_start=False) + + tok_start_to_orig_index.append(start_orig_pos) + tok_end_to_orig_index.append(end_orig_pos) + + if not is_training: + tok_start_position = tok_end_position = None + + if is_training and example.is_impossible: + tok_start_position = 0 + tok_end_position = 0 + + if is_training and not example.is_impossible: + start_position = example.start_position + end_position = start_position + len(example.orig_answer_text) - 1 + + start_chartok_pos = _convert_index( + orig_to_chartok_index, start_position, is_start=True) + tok_start_position = chartok_to_tok_index[start_chartok_pos] + + end_chartok_pos = _convert_index( + orig_to_chartok_index, end_position, is_start=False) + tok_end_position = chartok_to_tok_index[end_chartok_pos] + assert tok_start_position <= tok_end_position + + def _piece_to_id(x): + return tokenizer.sp_model.PieceToId(x) + + all_doc_tokens = list(map(_piece_to_id, para_tokens)) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_is_max_context = {} + segment_ids = [] + + # Paragraph mask used in XLNet. + # 1 represents paragraph and class tokens. + # 0 represents query and other special tokens. 
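The character-offset bookkeeping used above (building `chartok_to_tok_index` and the per-token start/end character indices) can be reproduced standalone. The SentencePiece pieces below are invented and `SPIECE_UNDERLINE` is redefined locally so the snippet runs on its own.

```python
# Map each SentencePiece token to the character range it covers in the
# detokenized text (the "▁" marker becomes a space), so token-level spans can
# later be projected back to character positions.
SPIECE_UNDERLINE = "▁"
para_tokens = ["▁the", "▁year", "▁was", "▁18", "95", "."]

chartok_to_tok_index = []
tok_start_to_chartok_index = []
tok_end_to_chartok_index = []
char_cnt = 0
for i, token in enumerate(para_tokens):
    new_token = token.replace(SPIECE_UNDERLINE, " ")
    chartok_to_tok_index.extend([i] * len(new_token))
    tok_start_to_chartok_index.append(char_cnt)
    char_cnt += len(new_token)
    tok_end_to_chartok_index.append(char_cnt - 1)

tok_cat_text = "".join(para_tokens).replace(SPIECE_UNDERLINE, " ")
print(tok_cat_text)                # " the year was 1895."
print(tok_start_to_chartok_index)  # [0, 4, 9, 13, 16, 18]
```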
+ paragraph_mask = [] + + cur_tok_start_to_orig_index = [] + cur_tok_end_to_orig_index = [] + + # pylint: disable=cell-var-from-loop + def process_query(seg_q): + for token in query_tokens: + tokens.append(token) + segment_ids.append(seg_q) + paragraph_mask.append(0) + tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) + segment_ids.append(seg_q) + paragraph_mask.append(0) + + def process_paragraph(seg_p): + for i in range(doc_span.length): + split_token_index = doc_span.start + i + + cur_tok_start_to_orig_index.append( + tok_start_to_orig_index[split_token_index]) + cur_tok_end_to_orig_index.append( + tok_end_to_orig_index[split_token_index]) + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(seg_p) + paragraph_mask.append(1) + tokens.append(tokenizer.sp_model.PieceToId("[SEP]")) + segment_ids.append(seg_p) + paragraph_mask.append(0) + return len(tokens) + + def process_class(seg_class): + class_index = len(segment_ids) + tokens.append(tokenizer.sp_model.PieceToId("[CLS]")) + segment_ids.append(seg_class) + paragraph_mask.append(1) + return class_index + + if xlnet_format: + seg_p, seg_q, seg_class, seg_pad = 0, 1, 2, 3 + paragraph_len = process_paragraph(seg_p) + process_query(seg_q) + class_index = process_class(seg_class) + else: + seg_p, seg_q, seg_class, seg_pad = 1, 0, 0, 0 + class_index = process_class(seg_class) + process_query(seg_q) + paragraph_len = process_paragraph(seg_p) + + input_ids = tokens + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(seg_pad) + paragraph_mask.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(paragraph_mask) == max_seq_length + + span_is_impossible = example.is_impossible + start_position = None + end_position = None + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
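Zero-padding keeps the four parallel lists aligned at `max_seq_length`; a tiny standalone version of the padding loop above with invented ids:

```python
# Pad all four parallel lists to max_seq_length; input_mask distinguishes real
# tokens (1) from padding (0). Toy ids and a short max_seq_length for brevity.
max_seq_length = 8
input_ids = [2, 45, 9, 17, 3]        # e.g. [CLS] query [SEP] paragraph [SEP]
input_mask = [1] * len(input_ids)
segment_ids = [0, 0, 0, 1, 1]
paragraph_mask = [1, 0, 0, 1, 0]
seg_pad = 0
while len(input_ids) < max_seq_length:
    input_ids.append(0)
    input_mask.append(0)
    segment_ids.append(seg_pad)
    paragraph_mask.append(0)
assert (len(input_ids) == len(input_mask) == len(segment_ids)
        == len(paragraph_mask) == max_seq_length)
print(input_mask)  # [1, 1, 1, 1, 1, 0, 0, 0]
```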
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + # continue + start_position = 0 + end_position = 0 + span_is_impossible = True + else: + doc_offset = 0 if xlnet_format else len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and span_is_impossible: + start_position = class_index + end_position = class_index + + if example_index < 20: + logging.info("*** Example ***") + logging.info("unique_id: %s", (unique_id)) + logging.info("example_index: %s", (example_index)) + logging.info("doc_span_index: %s", (doc_span_index)) + logging.info("tok_start_to_orig_index: %s", + " ".join([str(x) for x in cur_tok_start_to_orig_index])) + logging.info("tok_end_to_orig_index: %s", + " ".join([str(x) for x in cur_tok_end_to_orig_index])) + logging.info( + "token_is_max_context: %s", " ".join( + ["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()])) + logging.info( + "input_pieces: %s", + " ".join([tokenizer.sp_model.IdToPiece(x) for x in tokens])) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("paragraph_mask: %s", " ".join( + [str(x) for x in paragraph_mask])) + logging.info("class_index: %d", class_index) + + if is_training and span_is_impossible: + logging.info("impossible example span") + + if is_training and not span_is_impossible: + pieces = [ + tokenizer.sp_model.IdToPiece(token) + for token in tokens[start_position:(end_position + 1)] + ] + answer_text = tokenizer.sp_model.DecodePieces(pieces) + logging.info("start_position: %d", (start_position)) + logging.info("end_position: %d", (end_position)) + logging.info("answer: %s", (tokenization.printable_text(answer_text))) + + # With multi processing, the example_index is actually the index + # within the current process therefore we use example_index=None + # to avoid being used in the future. + # The current code does not use example_index of training data. 
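The span relabeling just performed is pure index arithmetic; here is a hedged standalone sketch (the helper name `relabel` and the toy numbers are illustrative, not part of the library):

```python
# Answer token positions are global over the whole paragraph, so within a
# chunk they are shifted by the chunk start and by the number of
# non-paragraph tokens that precede the paragraph in the packed sequence.
def relabel(tok_start, tok_end, doc_start, doc_len, doc_offset, class_index):
    doc_end = doc_start + doc_len - 1
    if tok_start >= doc_start and tok_end <= doc_end:
        return tok_start - doc_start + doc_offset, tok_end - doc_start + doc_offset
    # Answer not inside this sliding-window chunk: point at the class token.
    return class_index, class_index

# Answer covers paragraph tokens 130..132, the chunk covers tokens 64..163,
# and 6 tokens ([CLS] + 4 query tokens + [SEP]) precede the paragraph.
print(relabel(130, 132, doc_start=64, doc_len=100, doc_offset=6, class_index=0))
# (72, 74)
print(relabel(130, 132, doc_start=0, doc_len=100, doc_offset=6, class_index=0))
# (0, 0)  -> span does not contain the answer
```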
+ if is_training: + feat_example_index = None + else: + feat_example_index = example_index + + feature = InputFeatures( + unique_id=unique_id, + example_index=feat_example_index, + doc_span_index=doc_span_index, + tok_start_to_orig_index=cur_tok_start_to_orig_index, + tok_end_to_orig_index=cur_tok_end_to_orig_index, + token_is_max_context=token_is_max_context, + tokens=[tokenizer.sp_model.IdToPiece(x) for x in tokens], + input_ids=input_ids, + input_mask=input_mask, + paragraph_mask=paragraph_mask, + segment_ids=segment_ids, + paragraph_len=paragraph_len, + class_index=class_index, + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible) + + # Run callback + if is_training: + output_fn(feature) + else: + output_fn(feature, is_padding=False) + + unique_id += 1 + if span_is_impossible: + cnt_neg += 1 + else: + cnt_pos += 1 + + if not is_training and feature: + assert batch_size + num_padding = 0 + num_examples = unique_id - base_id + if unique_id % batch_size != 0: + num_padding = batch_size - (num_examples % batch_size) + dummy_feature = copy.deepcopy(feature) + for _ in range(num_padding): + dummy_feature.unique_id = unique_id + + # Run callback + output_fn(feature, is_padding=True) + unique_id += 1 + + logging.info("Total number of instances: %d = pos %d neg %d", + cnt_pos + cnt_neg, cnt_pos, cnt_neg) + return unique_id - base_id + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. 
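The comment above describes the "maximum context" rule; here it is evaluated numerically on the very example from that comment (toy spans A, B, C over the 12-token document):

```python
# score = min(left context, right context) + 0.01 * span length; the span with
# the highest score "owns" the token.
import collections

DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
spans = [DocSpan(0, 5), DocSpan(3, 5), DocSpan(6, 5)]  # spans A, B, C
position = 7                                           # the token "bought"

for idx, span in enumerate(spans):
    end = span.start + span.length - 1
    if not (span.start <= position <= end):
        continue
    left = position - span.start
    right = end - position
    score = min(left, right) + 0.01 * span.length
    print(idx, score)
# 1 0.05   (span B: 4 left, 0 right)
# 2 1.05   (span C: 1 left, 3 right)  -> span C wins, as the comment says.
```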
+ best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def write_predictions(all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + version_2_with_negative=False, + null_score_diff_threshold=0.0, + verbose=False): + """Write final predictions to the json file and log-odds of null if needed.""" + logging.info("Writing predictions to: %s", (output_prediction_file)) + logging.info("Writing nbest to: %s", (output_nbest_file)) + + all_predictions, all_nbest_json, scores_diff_json = ( + postprocess_output( + all_examples=all_examples, + all_features=all_features, + all_results=all_results, + n_best_size=n_best_size, + max_answer_length=max_answer_length, + do_lower_case=do_lower_case, + version_2_with_negative=version_2_with_negative, + null_score_diff_threshold=null_score_diff_threshold, + verbose=verbose)) + + write_to_json_files(all_predictions, output_prediction_file) + write_to_json_files(all_nbest_json, output_nbest_file) + if version_2_with_negative: + write_to_json_files(scores_diff_json, output_null_log_odds_file) + + +def postprocess_output(all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + version_2_with_negative=False, + null_score_diff_threshold=0.0, + xlnet_format=False, + verbose=False): + """Postprocess model output, to form predicton results.""" + + del do_lower_case, verbose + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + if feature.unique_id not in unique_id_to_result: + logging.info("Skip eval example %s, not in pred.", feature.unique_id) + continue + result = unique_id_to_result[feature.unique_id] + + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + if xlnet_format: + feature_null_score = result.class_logits + else: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = 
feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + + doc_offset = 0 if xlnet_format else feature.tokens.index("[SEP]") + 1 + + for (start_index, start_logit, + end_index, end_logit) in _get_best_indexes_and_logits( + result=result, + n_best_size=n_best_size, + xlnet_format=xlnet_format): + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index - doc_offset >= len(feature.tok_start_to_orig_index): + continue + if end_index - doc_offset >= len(feature.tok_end_to_orig_index): + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index - doc_offset, + end_index=end_index - doc_offset, + start_logit=start_logit, + end_logit=end_logit)) + + if version_2_with_negative and not xlnet_format: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=-1, + end_index=-1, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index >= 0 or xlnet_format: # this is a non-null prediction + tok_start_to_orig_index = feature.tok_start_to_orig_index + tok_end_to_orig_index = feature.tok_end_to_orig_index + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example.paragraph_text + final_text = paragraph_text[start_orig_pos:end_orig_pos + 1].strip() + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # if we didn't inlude the empty option in the n-best, include it + if version_2_with_negative and not xlnet_format: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
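The no-answer decision that this function applies a few lines below is easiest to see with a small numeric sketch: the prediction becomes the empty string only when the null score exceeds the best non-null span score by more than `null_score_diff_threshold`. All logits here are hypothetical.

```python
# Hypothetical logits for a single SQuAD v2.0 example.
score_null = 2.5                       # start_logits[0] + end_logits[0]
best_start_logit, best_end_logit = 1.2, 0.9
null_score_diff_threshold = 0.0

score_diff = score_null - best_start_logit - best_end_logit   # 0.4
prediction = "" if score_diff > null_score_diff_threshold else "best span text"
print(score_diff, repr(prediction))    # 0.4 '' -> treated as unanswerable
```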
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + assert best_non_null_entry is not None + if xlnet_format: + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + all_predictions[example.qas_id] = best_non_null_entry.text + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + return all_predictions, all_nbest_json, scores_diff_json + + +def write_to_json_files(json_records, json_file): + with tf.io.gfile.GFile(json_file, "w") as writer: + writer.write(json.dumps(json_records, indent=4) + "\n") + + +def _get_best_indexes_and_logits(result, + n_best_size, + xlnet_format=False): + """Generates the n-best indexes and logits from a list.""" + if xlnet_format: + for i in range(n_best_size): + for j in range(n_best_size): + j_index = i * n_best_size + j + yield (result.start_indexes[i], result.start_logits[i], + result.end_indexes[j_index], result.end_logits[j_index]) + else: + start_index_and_score = sorted(enumerate(result.start_logits), + key=lambda x: x[1], reverse=True) + end_index_and_score = sorted(enumerate(result.end_logits), + key=lambda x: x[1], reverse=True) + for i in range(len(start_index_and_score)): + if i >= n_best_size: + break + for j in range(len(end_index_and_score)): + if j >= n_best_size: + break + yield (start_index_and_score[i][0], start_index_and_score[i][1], + end_index_and_score[j][0], end_index_and_score[j][1]) + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + tf.io.gfile.makedirs(os.path.dirname(filename)) + self._writer = tf.io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = 
collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + if feature.paragraph_mask is not None: + features["paragraph_mask"] = create_int_feature(feature.paragraph_mask) + if feature.class_index is not None: + features["class_index"] = create_int_feature([feature.class_index]) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + +def generate_tf_record_from_json_file(input_file_path, + sp_model_file, + output_path, + translated_input_folder=None, + max_seq_length=384, + do_lower_case=True, + max_query_length=64, + doc_stride=128, + xlnet_format=False, + version_2_with_negative=False): + """Generates and saves training data into a tf record file.""" + train_examples = read_squad_examples( + input_file=input_file_path, + is_training=True, + version_2_with_negative=version_2_with_negative, + translated_input_folder=translated_input_folder) + tokenizer = tokenization.FullSentencePieceTokenizer( + sp_model_file=sp_model_file) + train_writer = FeatureWriter( + filename=output_path, is_training=True) + number_of_examples = convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=True, + output_fn=train_writer.process_feature, + xlnet_format=xlnet_format, + do_lower_case=do_lower_case) + train_writer.close() + + meta_data = { + "task_type": "bert_squad", + "train_data_size": number_of_examples, + "max_seq_length": max_seq_length, + "max_query_length": max_query_length, + "doc_stride": doc_stride, + "version_2_with_negative": version_2_with_negative, + } + + return meta_data diff --git a/nlp/text_classification/bert/tensorflow2.0/download_glue_data.py b/nlp/text_classification/bert/tensorflow2.0/download_glue_data.py new file mode 100644 index 000000000..872715135 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/download_glue_data.py @@ -0,0 +1,150 @@ +''' Script for downloading all GLUE data. +Note: for legal reasons, we are unable to host MRPC. +You can either use the version hosted by the SentEval team, which is already tokenized, +or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. +For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). +You should then rename and place specific files in a folder (see below for an example). 
+mkdir MRPC +cabextract MSRParaphraseCorpus.msi -d MRPC +cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt +cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt +rm MRPC/_* +rm MSRParaphraseCorpus.msi +1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. +2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! +''' + +import os +import sys +import shutil +import argparse +import tempfile +import urllib.request +import zipfile +import io + +URLLIB = urllib.request +TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "QNLI", "RTE", "WNLI", "diagnostic"] +TASK2PATH = {"CoLA":'https://dl.fbaipublicfiles.com/glue/data/CoLA.zip', + "SST":'https://dl.fbaipublicfiles.com/glue/data/SST-2.zip', + "QQP":'https://dl.fbaipublicfiles.com/glue/data/QQP-clean.zip', + "STS":'https://dl.fbaipublicfiles.com/glue/data/STS-B.zip', + "MNLI":'https://dl.fbaipublicfiles.com/glue/data/MNLI.zip', + "QNLI":'https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip', + "RTE":'https://dl.fbaipublicfiles.com/glue/data/RTE.zip', + "WNLI":'https://dl.fbaipublicfiles.com/glue/data/WNLI.zip', + "diagnostic":'https://dl.fbaipublicfiles.com/glue/data/AX.tsv', + "MRPC":"https://raw.githubusercontent.com/MegEngine/Models/master/official/nlp/bert/glue_data/MRPC/dev_ids.tsv"} + +MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' +MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' + +def download_and_extract(task, data_dir): + print("Downloading and extracting %s..." % task) + if task == "MNLI": + print("\tNote (12/10/20): This script no longer downloads SNLI. You will need to manually download and format the data to use SNLI.") + data_file = "%s.zip" % task + urllib.request.urlretrieve(TASK2PATH[task], data_file) + with zipfile.ZipFile(data_file) as zip_ref: + zip_ref.extractall(data_dir) + os.remove(data_file) + print("\tCompleted!") + +def format_mrpc(data_dir, path_to_data): + print("Processing MRPC...") + mrpc_dir = os.path.join(data_dir, "MRPC") + if not os.path.isdir(mrpc_dir): + os.mkdir(mrpc_dir) + if path_to_data: + mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt") + else: + try: + mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") + URLLIB.urlretrieve(MRPC_TRAIN, mrpc_train_file) + URLLIB.urlretrieve(MRPC_TEST, mrpc_test_file) + except urllib.error.HTTPError: + print("Error downloading MRPC") + return + assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file + assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file + + with io.open(mrpc_test_file, encoding='utf-8') as data_fh, \ + io.open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding='utf-8') as test_fh: + header = data_fh.readline() + test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") + for idx, row in enumerate(data_fh): + label, id1, id2, s1, s2 = row.strip().split('\t') + test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) + + try: + URLLIB.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv")) + except KeyError or urllib.error.HTTPError: + print("\tError downloading standard development IDs for MRPC. 
You will need to manually split your data.") + return + + dev_ids = [] + with io.open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding='utf-8') as ids_fh: + for row in ids_fh: + dev_ids.append(row.strip().split('\t')) + + with io.open(mrpc_train_file, encoding='utf-8') as data_fh, \ + io.open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding='utf-8') as train_fh, \ + io.open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding='utf-8') as dev_fh: + header = data_fh.readline() + train_fh.write(header) + dev_fh.write(header) + for row in data_fh: + label, id1, id2, s1, s2 = row.strip().split('\t') + if [id1, id2] in dev_ids: + dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + else: + train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + + print("\tCompleted!") + +def download_diagnostic(data_dir): + print("Downloading and extracting diagnostic...") + if not os.path.isdir(os.path.join(data_dir, "diagnostic")): + os.mkdir(os.path.join(data_dir, "diagnostic")) + data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv") + urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file) + print("\tCompleted!") + return + +def get_tasks(task_names): + task_names = task_names.split(',') + if "all" in task_names: + tasks = TASKS + else: + tasks = [] + for task_name in task_names: + assert task_name in TASKS, "Task %s not found!" % task_name + tasks.append(task_name) + return tasks + +def main(arguments): + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') + parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', + type=str, default='all') + parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', + type=str, default='') + args = parser.parse_args(arguments) + + if not os.path.isdir(args.data_dir): + os.mkdir(args.data_dir) + tasks = get_tasks(args.tasks) + + for task in tasks: + if task == 'MRPC': + format_mrpc(args.data_dir, args.path_to_mrpc) + elif task == 'diagnostic': + download_diagnostic(args.data_dir) + else: + download_and_extract(task, args.data_dir) + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) diff --git a/nlp/text_classification/bert/tensorflow2.0/download_script.sh b/nlp/text_classification/bert/tensorflow2.0/download_script.sh new file mode 100644 index 000000000..1409697f2 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/download_script.sh @@ -0,0 +1,6 @@ +#!/bin/bash +WORK_PATH=$(dirname $(readlink -f $0)) + +set -e +cp ${WORK_PATH}/../../../../data/datasets/bert_scripts/* . +exit 0 \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/export_tfhub.py b/nlp/text_classification/bert/tensorflow2.0/export_tfhub.py new file mode 100644 index 000000000..ef9b56f6a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/export_tfhub.py @@ -0,0 +1,139 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""A script to export BERT as a TF-Hub SavedModel. + +This script is **DEPRECATED** for exporting BERT encoder models; +see the error message in by main() for details. +""" + +from typing import Text + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf +import bert_models +import configs + +FLAGS = flags.FLAGS + +flags.DEFINE_string("bert_config_file", None, + "Bert configuration file to define core bert layers.") +flags.DEFINE_string("model_checkpoint_path", None, + "File path to TF model checkpoint.") +flags.DEFINE_string("export_path", None, "TF-Hub SavedModel destination path.") +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") +flags.DEFINE_bool( + "do_lower_case", None, "Whether to lowercase. If None, " + "do_lower_case will be enabled if 'uncased' appears in the " + "name of --vocab_file") +flags.DEFINE_enum("model_type", "encoder", ["encoder", "squad"], + "What kind of BERT model to export.") + + +def create_bert_model(bert_config: configs.BertConfig) -> tf.keras.Model: + """Creates a BERT keras core model from BERT configuration. + + Args: + bert_config: A `BertConfig` to create the core model. + + Returns: + A keras model. + """ + # Adds input layers just as placeholders. + input_word_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name="input_word_ids") + input_mask = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name="input_mask") + input_type_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name="input_type_ids") + transformer_encoder = bert_models.get_transformer_encoder( + bert_config, sequence_length=None) + sequence_output, pooled_output = transformer_encoder( + [input_word_ids, input_mask, input_type_ids]) + # To keep consistent with legacy hub modules, the outputs are + # "pooled_output" and "sequence_output". + return tf.keras.Model( + inputs=[input_word_ids, input_mask, input_type_ids], + outputs=[pooled_output, sequence_output]), transformer_encoder + + +def export_bert_tfhub(bert_config: configs.BertConfig, + model_checkpoint_path: Text, + hub_destination: Text, + vocab_file: Text, + do_lower_case: bool = None): + """Restores a tf.keras.Model and saves for TF-Hub.""" + # If do_lower_case is not explicit, default to checking whether "uncased" is + # in the vocab file name + if do_lower_case is None: + do_lower_case = "uncased" in vocab_file + logging.info("Using do_lower_case=%s based on name of vocab_file=%s", + do_lower_case, vocab_file) + core_model, encoder = create_bert_model(bert_config) + checkpoint = tf.train.Checkpoint( + model=encoder, # Legacy checkpoints. 
+ encoder=encoder) + checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched() + core_model.vocab_file = tf.saved_model.Asset(vocab_file) + core_model.do_lower_case = tf.Variable(do_lower_case, trainable=False) + core_model.save(hub_destination, include_optimizer=False, save_format="tf") + + +def export_bert_squad_tfhub(bert_config: configs.BertConfig, + model_checkpoint_path: Text, + hub_destination: Text, + vocab_file: Text, + do_lower_case: bool = None): + """Restores a tf.keras.Model for BERT with SQuAD and saves for TF-Hub.""" + # If do_lower_case is not explicit, default to checking whether "uncased" is + # in the vocab file name + if do_lower_case is None: + do_lower_case = "uncased" in vocab_file + logging.info("Using do_lower_case=%s based on name of vocab_file=%s", + do_lower_case, vocab_file) + span_labeling, _ = bert_models.squad_model(bert_config, max_seq_length=None) + checkpoint = tf.train.Checkpoint(model=span_labeling) + checkpoint.restore(model_checkpoint_path).assert_existing_objects_matched() + span_labeling.vocab_file = tf.saved_model.Asset(vocab_file) + span_labeling.do_lower_case = tf.Variable(do_lower_case, trainable=False) + span_labeling.save(hub_destination, include_optimizer=False, save_format="tf") + + +def main(_): + bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) + if FLAGS.model_type == "encoder": + deprecation_note = ( + "nlp/bert/export_tfhub is **DEPRECATED** for exporting BERT encoder " + "models. Please switch to nlp/tools/export_tfhub for exporting BERT " + "(and other) encoders with dict inputs/outputs conforming to " + "https://www.tensorflow.org/hub/common_saved_model_apis/text#transformer-encoders" + ) + logging.error(deprecation_note) + print("\n\nNOTICE:", deprecation_note, "\n") + export_bert_tfhub(bert_config, FLAGS.model_checkpoint_path, + FLAGS.export_path, FLAGS.vocab_file, FLAGS.do_lower_case) + elif FLAGS.model_type == "squad": + export_bert_squad_tfhub(bert_config, FLAGS.model_checkpoint_path, + FLAGS.export_path, FLAGS.vocab_file, + FLAGS.do_lower_case) + else: + raise ValueError("Unsupported model_type %s." % FLAGS.model_type) + + +if __name__ == "__main__": + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/input_pipeline.py b/nlp/text_classification/bert/tensorflow2.0/input_pipeline.py new file mode 100644 index 000000000..0c0f7615c --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/input_pipeline.py @@ -0,0 +1,302 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT model input pipelines.""" + +import tensorflow as tf + + +def decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + +def single_file_dataset(input_file, name_to_features, num_samples=None): + """Creates a single-file dataset to be passed for BERT custom training.""" + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if num_samples: + d = d.take(num_samples) + d = d.map( + lambda record: decode_record(record, name_to_features), + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. + if isinstance(input_file, str) or len(input_file) == 1: + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = ( + tf.data.experimental.AutoShardPolicy.OFF) + d = d.with_options(options) + return d + + +def create_pretrain_dataset(input_patterns, + seq_length, + max_predictions_per_seq, + batch_size, + is_training=True, + input_pipeline_context=None, + use_next_sentence_label=True, + use_position_id=False, + output_fake_labels=True): + """Creates input dataset from (tf)records files for pretraining.""" + name_to_features = { + 'input_ids': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'input_mask': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'segment_ids': + tf.io.FixedLenFeature([seq_length], tf.int64), + 'masked_lm_positions': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_ids': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_weights': + tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32), + } + if use_next_sentence_label: + name_to_features['next_sentence_labels'] = tf.io.FixedLenFeature([1], + tf.int64) + if use_position_id: + name_to_features['position_ids'] = tf.io.FixedLenFeature([seq_length], + tf.int64) + for input_pattern in input_patterns: + if not tf.io.gfile.glob(input_pattern): + raise ValueError('%s does not match any files.' % input_pattern) + + dataset = tf.data.Dataset.list_files(input_patterns, shuffle=is_training) + + if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard(input_pipeline_context.num_input_pipelines, + input_pipeline_context.input_pipeline_id) + if is_training: + dataset = dataset.repeat() + + # We set shuffle buffer to exactly match total number of + # training files to ensure that training data is well shuffled. + input_files = [] + for input_pattern in input_patterns: + input_files.extend(tf.io.gfile.glob(input_pattern)) + dataset = dataset.shuffle(len(input_files)) + + # In parallel, create tf record dataset for each train files. + # cycle_length = 8 means that up to 8 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. 
+ dataset = dataset.interleave( + tf.data.TFRecordDataset, + cycle_length=8, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training: + dataset = dataset.shuffle(100) + + decode_fn = lambda record: decode_record(record, name_to_features) + dataset = dataset.map( + decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + def _select_data_from_record(record): + """Filter out features to use for pretraining.""" + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'], + 'masked_lm_positions': record['masked_lm_positions'], + 'masked_lm_ids': record['masked_lm_ids'], + 'masked_lm_weights': record['masked_lm_weights'], + } + if use_next_sentence_label: + x['next_sentence_labels'] = record['next_sentence_labels'] + if use_position_id: + x['position_ids'] = record['position_ids'] + + # TODO(hongkuny): Remove the fake labels after migrating bert pretraining. + if output_fake_labels: + return (x, record['masked_lm_weights']) + else: + return x + + dataset = dataset.map( + _select_data_from_record, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=is_training) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset + + +def create_classifier_dataset(file_path, + seq_length, + batch_size, + is_training=True, + input_pipeline_context=None, + label_type=tf.int64, + include_sample_weights=False, + num_samples=None): + """Creates input dataset from (tf)records files for train/eval.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64), + 'label_ids': tf.io.FixedLenFeature([], label_type), + } + if include_sample_weights: + name_to_features['weight'] = tf.io.FixedLenFeature([], tf.float32) + dataset = single_file_dataset(file_path, name_to_features, + num_samples=num_samples) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. 
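The `input_pipeline_context` mentioned in the comment above is a `tf.distribute.InputContext`; it is normally injected by the distribution strategy rather than built by hand. Below is a minimal wiring sketch, assuming a TF 2.x release that exposes `Strategy.distribute_datasets_from_function` (TF 2.4+); the file path and sizes are placeholders, not values required by this module.

```python
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def dataset_fn(ctx):
  # ctx is a tf.distribute.InputContext; passing it through lets
  # create_classifier_dataset shard input files across hosts as described above.
  return create_classifier_dataset(
      file_path="/tmp/train.tf_record",                 # hypothetical path
      seq_length=128,
      batch_size=ctx.get_per_replica_batch_size(32),    # 32 = global batch size
      is_training=True,
      input_pipeline_context=ctx)

dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)
```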
+ if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard(input_pipeline_context.num_input_pipelines, + input_pipeline_context.input_pipeline_id) + + def _select_data_from_record(record): + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'] + } + y = record['label_ids'] + if include_sample_weights: + w = record['weight'] + return (x, y, w) + return (x, y) + + if is_training: + dataset = dataset.shuffle(100) + dataset = dataset.repeat() + + dataset = dataset.map( + _select_data_from_record, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=is_training) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset + + +def create_squad_dataset(file_path, + seq_length, + batch_size, + is_training=True, + input_pipeline_context=None): + """Creates input dataset from (tf)records files for train/eval.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64), + } + if is_training: + name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64) + name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64) + else: + name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64) + + dataset = single_file_dataset(file_path, name_to_features) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. + if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard(input_pipeline_context.num_input_pipelines, + input_pipeline_context.input_pipeline_id) + + def _select_data_from_record(record): + """Dispatches record to features and labels.""" + x, y = {}, {} + for name, tensor in record.items(): + if name in ('start_positions', 'end_positions'): + y[name] = tensor + elif name == 'input_ids': + x['input_word_ids'] = tensor + elif name == 'segment_ids': + x['input_type_ids'] = tensor + else: + x[name] = tensor + return (x, y) + + if is_training: + dataset = dataset.shuffle(100) + dataset = dataset.repeat() + + dataset = dataset.map( + _select_data_from_record, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset + + +def create_retrieval_dataset(file_path, + seq_length, + batch_size, + input_pipeline_context=None): + """Creates input dataset from (tf)records files for scoring.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64), + 'example_id': tf.io.FixedLenFeature([1], tf.int64), + } + dataset = single_file_dataset(file_path, name_to_features) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. 
+ if input_pipeline_context and input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard(input_pipeline_context.num_input_pipelines, + input_pipeline_context.input_pipeline_id) + + def _select_data_from_record(record): + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'] + } + y = record['example_id'] + return (x, y) + + dataset = dataset.map( + _select_data_from_record, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=False) + + def _pad_to_batch(x, y): + cur_size = tf.shape(y)[0] + pad_size = batch_size - cur_size + + pad_ids = tf.zeros(shape=[pad_size, seq_length], dtype=tf.int32) + for key in ('input_word_ids', 'input_mask', 'input_type_ids'): + x[key] = tf.concat([x[key], pad_ids], axis=0) + + pad_labels = -tf.ones(shape=[pad_size, 1], dtype=tf.int32) + y = tf.concat([y, pad_labels], axis=0) + return x, y + + dataset = dataset.map( + _pad_to_batch, + num_parallel_calls=tf.data.experimental.AUTOTUNE) + + dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + return dataset diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/README.md b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/README.md new file mode 100644 index 000000000..1c5bbb131 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/README.md @@ -0,0 +1,37 @@ +# keras-nlp + +## Layers + +Layers are the fundamental building blocks for NLP models. They can be used to +assemble new layers, networks, or models. + +* [TransformerEncoderBlock](layers/transformer_encoder_block.py) implements + an optionally masked transformer as described in + ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). + +* [OnDeviceEmbedding](layers/on_device_embedding.py) implements efficient + embedding lookups designed for TPU-based models. + +* [PositionalEmbedding](layers/position_embedding.py) creates a positional + embedding as described in ["BERT: Pre-training of Deep Bidirectional + Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). + +* [SelfAttentionMask](layers/self_attention_mask.py) creates a 3D attention + mask from a 2D tensor mask. + +* [MaskedLM](layers/masked_lm.py) implements a masked language model. It + assumes the embedding table variable is passed to it. + + +## Encoders + +Encoders are combinations of layers (and possibly other encoders). They are +sub-units of models that would not be trained alone. It encapsulates common +network structures like a classification head or a transformer encoder into an +easily handled object with a standardized configuration. + +* [BertEncoder](encoders/bert_encoder.py) implements a bi-directional + Transformer-based encoder as described in + ["BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding"](https://arxiv.org/abs/1810.04805). It includes the embedding + lookups, transformer layers and pooling layer. diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/__init__.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/__init__.py new file mode 100644 index 000000000..585d1ccf2 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-NLP package definition.""" +# pylint: disable=wildcard-import +from keras_nlp import encoders +from keras_nlp import layers diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/contributing.md b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/contributing.md new file mode 100644 index 000000000..b9ec1716d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/contributing.md @@ -0,0 +1,21 @@ +## Contributing to KerasNLP + +Patches to KerasNLP are welcome! + +The source-of-truth repository lives under +[TF Model Garden NLP](https://github.com/tensorflow/models/tree/master/official/nlp/keras_nlp), +and is mirrored as a read-only repository under +[keras-team/keras-nlp](https://github.com/keras-team/keras-nlp). +Contributions should be made as PRs to the TF Model Garden repository. +This is to ensure the codebase is rigorously tested with state-of-art models +on different accelerators. +In the long run, we will move development to the current repository `keras-team/keras-nlp`. + +## :heavy_check_mark: Contributor checklist + +1. Ensure you have signed the [Contributor License Agreement](https://cla.developers.google.com/about/google-individual?csw=1). + * All code contributors are required to sign a Contributor License Agreement. + * Please read this [troubleshooting guide](Contributor-License-Agreements#troubleshooting-clas) + if you encounter an issue. +2. Please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute). +3. Check if your changes are consistent with the [TensorFlow coding style](https://www.tensorflow.org/community/contribute/code_style). diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/__init__.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/__init__.py new file mode 100644 index 000000000..e83e5a3bf --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-NLP layers package definition.""" +from keras_nlp.encoders.bert_encoder import BertEncoder diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder.py new file mode 100644 index 000000000..695431f6a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder.py @@ -0,0 +1,262 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Bert encoder network.""" +# pylint: disable=g-classes-have-attributes + +import collections +from absl import logging +import tensorflow as tf + +from keras_nlp import layers + + +@tf.keras.utils.register_keras_serializable(package='keras_nlp') +class BertEncoder(tf.keras.Model): + """Bi-directional Transformer-based encoder network. + + This network implements a bi-directional Transformer-based encoder as + described in "BERT: Pre-training of Deep Bidirectional Transformers for + Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the + embedding lookups and transformer layers, but not the masked language model + or classification task networks. + + The default values for this object are taken from the BERT-Base implementation + in "BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding". + + *Note* that the network is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + vocab_size: The size of the token vocabulary. + hidden_size: The size of the transformer hidden layers. + num_layers: The number of transformer layers. + num_attention_heads: The number of attention heads for each transformer. The + hidden size must be divisible by the number of attention heads. + max_sequence_length: The maximum sequence length that this encoder can + consume. If None, max_sequence_length uses the value from sequence length. + This determines the variable shape for positional embeddings. + type_vocab_size: The number of types that the 'type_ids' input can take. + inner_dim: The output dimension of the first Dense layer in a two-layer + feedforward network for each transformer. + inner_activation: The activation for the first Dense layer in a two-layer + feedforward network for each transformer. + output_dropout: Dropout probability for the post-attention and output + dropout. + attention_dropout: The dropout rate to use for the attention layers + within the transformer layers. + initializer: The initialzer to use for all weights in this encoder. + output_range: The sequence output range, [0, output_range), by slicing the + target sequence of the last transformer layer. `None` means the entire + target sequence will attend to the source sequence, which yields the full + output. + embedding_width: The width of the word embeddings. If the embedding width is + not equal to hidden size, embedding parameters will be factorized into two + matrices in the shape of ['vocab_size', 'embedding_width'] and + ['embedding_width', 'hidden_size'] ('embedding_width' is usually much + smaller than 'hidden_size'). + embedding_layer: An optional Layer instance which will be called to + generate embeddings for the input word IDs. 
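Example (a minimal usage sketch in the style of the other layer docstrings in this package; the vocabulary size, layer counts and sequence length below are illustrative only, and the import path follows this package's layout):

```python
import numpy as np
from keras_nlp.encoders import bert_encoder

encoder = bert_encoder.BertEncoder(
    vocab_size=100, hidden_size=32, num_layers=2, num_attention_heads=2)

batch_size, seq_len = 2, 16
word_ids = np.random.randint(100, size=(batch_size, seq_len)).astype(np.int32)
mask = np.ones((batch_size, seq_len), dtype=np.int32)
type_ids = np.zeros((batch_size, seq_len), dtype=np.int32)

outputs = encoder([word_ids, mask, type_ids])
# outputs["sequence_output"]: [batch_size, seq_len, hidden_size]
# outputs["pooled_output"]:   [batch_size, hidden_size]
# outputs["encoder_outputs"]: one tensor per transformer layer
```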
+ """ + + def __init__( + self, + vocab_size, + hidden_size=768, + num_layers=12, + num_attention_heads=12, + max_sequence_length=512, + type_vocab_size=16, + inner_dim=3072, + inner_activation=lambda x: tf.keras.activations.gelu(x, approximate=True), + output_dropout=0.1, + attention_dropout=0.1, + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), + output_range=None, + embedding_width=None, + embedding_layer=None, + **kwargs): + activation = tf.keras.activations.get(inner_activation) + initializer = tf.keras.initializers.get(initializer) + + word_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_word_ids') + mask = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_mask') + type_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_type_ids') + + if embedding_width is None: + embedding_width = hidden_size + + if embedding_layer is None: + embedding_layer_inst = layers.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + initializer=initializer, + name='word_embeddings') + else: + embedding_layer_inst = embedding_layer + word_embeddings = embedding_layer_inst(word_ids) + + # Always uses dynamic slicing for simplicity. + position_embedding_layer = layers.PositionEmbedding( + initializer=initializer, + max_length=max_sequence_length, + name='position_embedding') + position_embeddings = position_embedding_layer(word_embeddings) + type_embedding_layer = layers.OnDeviceEmbedding( + vocab_size=type_vocab_size, + embedding_width=embedding_width, + initializer=initializer, + use_one_hot=True, + name='type_embeddings') + type_embeddings = type_embedding_layer(type_ids) + + embeddings = tf.keras.layers.Add()( + [word_embeddings, position_embeddings, type_embeddings]) + + embedding_norm_layer = tf.keras.layers.LayerNormalization( + name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32) + + embeddings = embedding_norm_layer(embeddings) + embeddings = (tf.keras.layers.Dropout(rate=output_dropout)(embeddings)) + + # We project the 'embedding' output to 'hidden_size' if it is not already + # 'hidden_size'. + if embedding_width != hidden_size: + embedding_projection = tf.keras.layers.experimental.EinsumDense( + '...x,xy->...y', + output_shape=hidden_size, + bias_axes='y', + kernel_initializer=initializer, + name='embedding_projection') + embeddings = embedding_projection(embeddings) + else: + embedding_projection = None + + transformer_layers = [] + data = embeddings + attention_mask = layers.SelfAttentionMask()(data, mask) + encoder_outputs = [] + for i in range(num_layers): + if i == num_layers - 1 and output_range is not None: + transformer_output_range = output_range + else: + transformer_output_range = None + layer = layers.TransformerEncoderBlock( + num_attention_heads=num_attention_heads, + inner_dim=inner_dim, + inner_activation=inner_activation, + output_dropout=output_dropout, + attention_dropout=attention_dropout, + output_range=transformer_output_range, + kernel_initializer=initializer, + name='transformer/layer_%d' % i) + transformer_layers.append(layer) + data = layer([data, attention_mask]) + encoder_outputs.append(data) + + last_encoder_output = encoder_outputs[-1] + # Applying a tf.slice op (through subscript notation) to a Keras tensor + # like this will create a SliceOpLambda layer. This is better than a Lambda + # layer with Python code, because that is fundamentally less portable. 
+ first_token_tensor = last_encoder_output[:, 0, :] + pooler_layer = tf.keras.layers.Dense( + units=hidden_size, + activation='tanh', + kernel_initializer=initializer, + name='pooler_transform') + cls_output = pooler_layer(first_token_tensor) + + outputs = dict( + sequence_output=encoder_outputs[-1], + pooled_output=cls_output, + encoder_outputs=encoder_outputs, + ) + + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(BertEncoder, self).__init__( + inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs) + + config_dict = { + 'vocab_size': vocab_size, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'num_attention_heads': num_attention_heads, + 'max_sequence_length': max_sequence_length, + 'type_vocab_size': type_vocab_size, + 'inner_dim': inner_dim, + 'inner_activation': tf.keras.activations.serialize(activation), + 'output_dropout': output_dropout, + 'attention_dropout': attention_dropout, + 'initializer': tf.keras.initializers.serialize(initializer), + 'output_range': output_range, + 'embedding_width': embedding_width, + 'embedding_layer': embedding_layer, + } + + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self._pooler_layer = pooler_layer + self._transformer_layers = transformer_layers + self._embedding_norm_layer = embedding_norm_layer + self._embedding_layer = embedding_layer_inst + self._position_embedding_layer = position_embedding_layer + self._type_embedding_layer = type_embedding_layer + if embedding_projection is not None: + self._embedding_projection = embedding_projection + + def get_embedding_table(self): + return self._embedding_layer.embeddings + + def get_embedding_layer(self): + return self._embedding_layer + + def get_config(self): + return dict(self._config._asdict()) + + @property + def transformer_layers(self): + """List of Transformer layers in the encoder.""" + return self._transformer_layers + + @property + def pooler_layer(self): + """The pooler dense layer after the transformer layers.""" + return self._pooler_layer + + @classmethod + def from_config(cls, config, custom_objects=None): + if 'embedding_layer' in config and config['embedding_layer'] is not None: + warn_string = ( + 'You are reloading a model that was saved with a ' + 'potentially-shared embedding layer object. If you contine to ' + 'train this model, the embedding layer will no longer be shared. 
' + 'To work around this, load the model outside of the Keras API.') + print('WARNING: ' + warn_string) + logging.warn(warn_string) + + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder_test.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder_test.py new file mode 100644 index 000000000..07333029e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/encoders/bert_encoder_test.py @@ -0,0 +1,232 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for transformer-based bert encoder network.""" + +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from keras_nlp.encoders import bert_encoder + + +# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It +# guarantees forward compatibility of this code for the V2 switchover. +@keras_parameterized.run_all_keras_modes +class BertEncoderTest(keras_parameterized.TestCase): + + def tearDown(self): + super(BertEncoderTest, self).tearDown() + tf.keras.mixed_precision.set_global_policy("float32") + + def test_network_creation(self): + hidden_size = 32 + sequence_length = 21 + # Create a small BertEncoder for testing. + test_network = bert_encoder.BertEncoder( + vocab_size=100, + hidden_size=hidden_size, + num_attention_heads=2, + num_layers=3) + # Create the inputs (note that the first dimension is implicit). + word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + dict_outputs = test_network([word_ids, mask, type_ids]) + data = dict_outputs["sequence_output"] + pooled = dict_outputs["pooled_output"] + + self.assertIsInstance(test_network.transformer_layers, list) + self.assertLen(test_network.transformer_layers, 3) + self.assertIsInstance(test_network.pooler_layer, tf.keras.layers.Dense) + + expected_data_shape = [None, sequence_length, hidden_size] + expected_pooled_shape = [None, hidden_size] + self.assertAllEqual(expected_data_shape, data.shape.as_list()) + self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) + + # The default output dtype is float32. + self.assertAllEqual(tf.float32, data.dtype) + self.assertAllEqual(tf.float32, pooled.dtype) + + def test_all_encoder_outputs_network_creation(self): + hidden_size = 32 + sequence_length = 21 + # Create a small BertEncoder for testing. + test_network = bert_encoder.BertEncoder( + vocab_size=100, + hidden_size=hidden_size, + num_attention_heads=2, + num_layers=3) + # Create the inputs (note that the first dimension is implicit). 
+ word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + dict_outputs = test_network([word_ids, mask, type_ids]) + all_encoder_outputs = dict_outputs["encoder_outputs"] + pooled = dict_outputs["pooled_output"] + + expected_data_shape = [None, sequence_length, hidden_size] + expected_pooled_shape = [None, hidden_size] + self.assertLen(all_encoder_outputs, 3) + for data in all_encoder_outputs: + self.assertAllEqual(expected_data_shape, data.shape.as_list()) + self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) + + # The default output dtype is float32. + self.assertAllEqual(tf.float32, all_encoder_outputs[-1].dtype) + self.assertAllEqual(tf.float32, pooled.dtype) + + def test_network_creation_with_float16_dtype(self): + hidden_size = 32 + sequence_length = 21 + tf.keras.mixed_precision.set_global_policy("mixed_float16") + # Create a small BertEncoder for testing. + test_network = bert_encoder.BertEncoder( + vocab_size=100, + hidden_size=hidden_size, + num_attention_heads=2, + num_layers=3) + # Create the inputs (note that the first dimension is implicit). + word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + dict_outputs = test_network([word_ids, mask, type_ids]) + data = dict_outputs["sequence_output"] + pooled = dict_outputs["pooled_output"] + + expected_data_shape = [None, sequence_length, hidden_size] + expected_pooled_shape = [None, hidden_size] + self.assertAllEqual(expected_data_shape, data.shape.as_list()) + self.assertAllEqual(expected_pooled_shape, pooled.shape.as_list()) + + # If float_dtype is set to float16, the data output is float32 (from a layer + # norm) and pool output should be float16. + self.assertAllEqual(tf.float32, data.dtype) + self.assertAllEqual(tf.float16, pooled.dtype) + + @parameterized.named_parameters( + ("all_sequence", None, 21), + ("output_range", 1, 1), + ) + def test_network_invocation(self, output_range, out_seq_len): + hidden_size = 32 + sequence_length = 21 + vocab_size = 57 + num_types = 7 + # Create a small BertEncoder for testing. + test_network = bert_encoder.BertEncoder( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_attention_heads=2, + num_layers=3, + type_vocab_size=num_types, + output_range=output_range) + # Create the inputs (note that the first dimension is implicit). + word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + dict_outputs = test_network([word_ids, mask, type_ids]) + data = dict_outputs["sequence_output"] + pooled = dict_outputs["pooled_output"] + + # Create a model based off of this network: + model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) + + # Invoke the model. We can't validate the output data here (the model is too + # complex) but this will catch structural runtime errors. 
+ batch_size = 3 + word_id_data = np.random.randint( + vocab_size, size=(batch_size, sequence_length)) + mask_data = np.random.randint(2, size=(batch_size, sequence_length)) + type_id_data = np.random.randint( + num_types, size=(batch_size, sequence_length)) + outputs = model.predict([word_id_data, mask_data, type_id_data]) + self.assertEqual(outputs[0].shape[1], out_seq_len) + + # Creates a BertEncoder with max_sequence_length != sequence_length + max_sequence_length = 128 + test_network = bert_encoder.BertEncoder( + vocab_size=vocab_size, + hidden_size=hidden_size, + max_sequence_length=max_sequence_length, + num_attention_heads=2, + num_layers=3, + type_vocab_size=num_types) + dict_outputs = test_network([word_ids, mask, type_ids]) + data = dict_outputs["sequence_output"] + pooled = dict_outputs["pooled_output"] + model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) + outputs = model.predict([word_id_data, mask_data, type_id_data]) + self.assertEqual(outputs[0].shape[1], sequence_length) + + # Creates a BertEncoder with embedding_width != hidden_size + test_network = bert_encoder.BertEncoder( + vocab_size=vocab_size, + hidden_size=hidden_size, + max_sequence_length=max_sequence_length, + num_attention_heads=2, + num_layers=3, + type_vocab_size=num_types, + embedding_width=16) + dict_outputs = test_network([word_ids, mask, type_ids]) + data = dict_outputs["sequence_output"] + pooled = dict_outputs["pooled_output"] + model = tf.keras.Model([word_ids, mask, type_ids], [data, pooled]) + outputs = model.predict([word_id_data, mask_data, type_id_data]) + self.assertEqual(outputs[0].shape[-1], hidden_size) + self.assertTrue(hasattr(test_network, "_embedding_projection")) + + def test_serialize_deserialize(self): + # Create a network object that sets all of its config options. + kwargs = dict( + vocab_size=100, + hidden_size=32, + num_layers=3, + num_attention_heads=2, + max_sequence_length=21, + type_vocab_size=12, + inner_dim=1223, + inner_activation="relu", + output_dropout=0.05, + attention_dropout=0.22, + initializer="glorot_uniform", + output_range=-1, + embedding_width=16, + embedding_layer=None) + network = bert_encoder.BertEncoder(**kwargs) + expected_config = dict(kwargs) + expected_config["inner_activation"] = tf.keras.activations.serialize( + tf.keras.activations.get(expected_config["inner_activation"])) + expected_config["initializer"] = tf.keras.initializers.serialize( + tf.keras.initializers.get(expected_config["initializer"])) + self.assertEqual(network.get_config(), expected_config) + # Create another network object from the first object's config. + new_network = bert_encoder.BertEncoder.from_config(network.get_config()) + + # Validate that the config can be forced to JSON. + _ = network.to_json() + + # If the serialization was successful, the new config should match the old. + self.assertAllEqual(network.get_config(), new_network.get_config()) + + # Tests model saving/loading. + model_path = self.get_temp_dir() + "/model" + network.save(model_path) + _ = tf.keras.models.load_model(model_path) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/__init__.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/__init__.py new file mode 100644 index 000000000..7e974238a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-NLP layers package definition.""" +from keras_nlp.layers.masked_lm import MaskedLM +from keras_nlp.layers.on_device_embedding import OnDeviceEmbedding +from keras_nlp.layers.position_embedding import PositionEmbedding +from keras_nlp.layers.self_attention_mask import SelfAttentionMask +from keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/masked_lm.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/masked_lm.py new file mode 100644 index 000000000..a624169d4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/masked_lm.py @@ -0,0 +1,123 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Masked language model network.""" +# pylint: disable=g-classes-have-attributes +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='keras_nlp') +class MaskedLM(tf.keras.layers.Layer): + """Masked language model network head for BERT modeling. + + This layer implements a masked language model based on the provided + transformer based encoder. It assumes that the encoder network being passed + has a "get_embedding_table()" method. + + Example: + ```python + encoder=keras_nlp.BertEncoder(...) + lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table()) + ``` + + Args: + embedding_table: The embedding table from encoder network. + activation: The activation, if any, for the dense layer. + initializer: The initializer for the dense layer. Defaults to a Glorot + uniform initializer. + output: The output style for this layer. Can be either 'logits' or + 'predictions'. + """ + + def __init__(self, + embedding_table, + activation=None, + initializer='glorot_uniform', + output='logits', + name=None, + **kwargs): + super(MaskedLM, self).__init__(name=name, **kwargs) + self.embedding_table = embedding_table + self.activation = activation + self.initializer = tf.keras.initializers.get(initializer) + + if output not in ('predictions', 'logits'): + raise ValueError( + ('Unknown `output` value "%s". 
`output` can be either "logits" or ' + '"predictions"') % output) + self._output_type = output + + def build(self, input_shape): + self._vocab_size, hidden_size = self.embedding_table.shape + self.dense = tf.keras.layers.Dense( + hidden_size, + activation=self.activation, + kernel_initializer=self.initializer, + name='transform/dense') + self.layer_norm = tf.keras.layers.LayerNormalization( + axis=-1, epsilon=1e-12, name='transform/LayerNorm') + self.bias = self.add_weight( + 'output_bias/bias', + shape=(self._vocab_size,), + initializer='zeros', + trainable=True) + + super(MaskedLM, self).build(input_shape) + + def call(self, sequence_data, masked_positions): + masked_lm_input = self._gather_indexes(sequence_data, masked_positions) + lm_data = self.dense(masked_lm_input) + lm_data = self.layer_norm(lm_data) + lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True) + logits = tf.nn.bias_add(lm_data, self.bias) + masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape( + masked_positions)[1] + logits = tf.reshape(logits, + [-1, masked_positions_length, self._vocab_size]) + if self._output_type == 'logits': + return logits + return tf.nn.log_softmax(logits) + + def get_config(self): + raise NotImplementedError('MaskedLM cannot be directly serialized because ' + 'it has variable sharing logic.') + + def _gather_indexes(self, sequence_tensor, positions): + """Gathers the vectors at the specific positions, for performance. + + Args: + sequence_tensor: Sequence output of shape + (`batch_size`, `seq_length`, num_hidden) where num_hidden is number of + hidden units. + positions: Positions ids of tokens in sequence to mask for pretraining + of with dimension (batch_size, num_predictions) where + `num_predictions` is maximum number of tokens to mask out and predict + per each sequence. + + Returns: + Masked out sequence tensor of shape (batch_size * num_predictions, + num_hidden). + """ + sequence_shape = tf.shape(sequence_tensor) + batch_size, seq_length = sequence_shape[0], sequence_shape[1] + width = sequence_tensor.shape.as_list()[2] or sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + + return output_tensor diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding.py new file mode 100644 index 000000000..51dab628f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding.py @@ -0,0 +1,106 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
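The `_gather_indexes` helper in `masked_lm.py` above flattens the batch so a single `tf.gather` can pull out every masked position at once. Below is a minimal, self-contained sketch of that offset trick; the tensor sizes and position values are made up for illustration and are not taken from the patch.

```python
import tensorflow as tf

batch_size, seq_length, width = 2, 4, 3
# Fake "sequence output" of shape (batch_size, seq_length, width).
sequence_tensor = tf.reshape(
    tf.range(batch_size * seq_length * width, dtype=tf.float32),
    [batch_size, seq_length, width])
# Two masked positions per example (num_predictions = 2).
positions = tf.constant([[0, 2], [1, 3]])

# Offset each example's positions by its start index in the flattened batch.
flat_offsets = tf.reshape(tf.range(0, batch_size) * seq_length, [-1, 1])
flat_positions = tf.reshape(positions + flat_offsets, [-1])   # [0, 2, 5, 7]
flat_sequence = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
gathered = tf.gather(flat_sequence, flat_positions)
print(gathered.shape)  # (4, 3) == (batch_size * num_predictions, width)
```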
+ +"""Keras-based one-hot embedding layer.""" +# pylint: disable=g-classes-have-attributes + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package="keras_nlp") +class OnDeviceEmbedding(tf.keras.layers.Layer): + """Performs an embedding lookup suitable for accelerator devices. + + This layer uses either tf.gather or tf.one_hot to translate integer indices to + float embeddings. + + Args: + vocab_size: Number of elements in the vocabulary. + embedding_width: Output size of the embedding layer. + initializer: The initializer to use for the embedding weights. Defaults to + "glorot_uniform". + use_one_hot: Whether to use tf.one_hot over tf.gather for the embedding + lookup. Defaults to False (that is, using tf.gather). Setting this option + to True may improve performance, especially on small vocabulary sizes, but + will generally require more memory. + scale_factor: Whether to scale the output embeddings. Defaults to None (that + is, not to scale). Setting this option to a float will let values in + output embeddings multiplied by scale_factor. + """ + + def __init__(self, + vocab_size, + embedding_width, + initializer="glorot_uniform", + use_one_hot=False, + scale_factor=None, + **kwargs): + + super(OnDeviceEmbedding, self).__init__(**kwargs) + self._vocab_size = vocab_size + self._embedding_width = embedding_width + self._initializer = initializer + self._use_one_hot = use_one_hot + self._scale_factor = scale_factor + + def get_config(self): + config = { + "vocab_size": self._vocab_size, + "embedding_width": self._embedding_width, + "initializer": self._initializer, + "use_one_hot": self._use_one_hot, + "scale_factor": self._scale_factor, + } + base_config = super(OnDeviceEmbedding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def build(self, input_shape): + self.embeddings = self.add_weight( + "embeddings", + shape=[self._vocab_size, self._embedding_width], + initializer=self._initializer, + dtype=tf.float32) + + super(OnDeviceEmbedding, self).build(input_shape) + + def call(self, inputs): + flat_inputs = tf.reshape(inputs, [-1]) + if self._use_one_hot: + dtype = self._compute_dtype + if not tf.dtypes.as_dtype(dtype).is_floating: + # TensorFlow 1 compatibility. In TF1, self._compute_dtype is int32 + # instead of a floating-point dtype, as the dtype is inferred from the + # dtype of the inputs + dtype = tf.float32 + one_hot_data = tf.one_hot( + flat_inputs, depth=self._vocab_size, dtype=dtype) + embeddings = tf.matmul(one_hot_data, self.embeddings) + else: + embeddings = tf.gather(self.embeddings, flat_inputs) + embeddings = tf.reshape( + embeddings, + # Work around b/142213824: prefer concat to shape over a Python list. + tf.concat([tf.shape(inputs), [self._embedding_width]], axis=0)) + embeddings.set_shape(inputs.shape.as_list() + [self._embedding_width]) + if self._scale_factor: + embeddings *= self._scale_factor + return embeddings + + @property + def vocab_size(self): + return self._vocab_size + + @property + def embedding_width(self): + return self._embedding_width diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding_test.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding_test.py new file mode 100644 index 000000000..39d5884b5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/on_device_embedding_test.py @@ -0,0 +1,213 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Keras-based one-hot embedding layer.""" + +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from keras_nlp.layers import on_device_embedding + + +# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It +# guarantees forward compatibility of this code for the V2 switchover. +@keras_parameterized.run_all_keras_modes +class OnDeviceEmbeddingTest(keras_parameterized.TestCase): + + def test_layer_creation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float32) + + def test_layer_creation_with_mixed_precision(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, + dtype="mixed_float16") + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float16) + + def test_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. 
+ batch_size = 3 + input_data = np.random.randint( + vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + def test_layer_invocation_with_mixed_precision(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, + dtype="mixed_float16") + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint( + vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float16, output.dtype) + + def test_one_hot_layer_creation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + use_one_hot=True) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float32) + + def test_one_hot_layer_creation_with_mixed_precision(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + dtype="mixed_float16", + use_one_hot=True) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float16) + + def test_one_hot_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + use_one_hot=True) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. 
+ batch_size = 3 + input_data = np.random.randint( + vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + def test_one_hot_layer_invocation_with_mixed_precision(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + dtype="mixed_float16", + use_one_hot=True) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint( + vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float16, output.dtype) + + def test_use_scale_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, + scale_factor=embedding_width**0.5) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint( + vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding.py new file mode 100644 index 000000000..440559ae7 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding.py @@ -0,0 +1,93 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based positional embedding layer.""" +# pylint: disable=g-classes-have-attributes +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package="keras_nlp") +class PositionEmbedding(tf.keras.layers.Layer): + """Creates a positional embedding. + + Example: + ```python + position_embedding = PositionEmbedding(max_length=100) + inputs = tf.keras.Input((100, 32), dtype=tf.float32) + outputs = position_embedding(inputs) + ``` + + + Args: + max_length: The maximum size of the dynamic sequence. 
+ initializer: The initializer to use for the embedding weights. Defaults to + "glorot_uniform". + seq_axis: The axis of the input tensor where we add the embeddings. + + Reference: This layer creates a positional embedding as described in + [BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding](https://arxiv.org/abs/1810.04805). + """ + + def __init__(self, + max_length, + initializer="glorot_uniform", + seq_axis=1, + **kwargs): + + super(PositionEmbedding, self).__init__(**kwargs) + if max_length is None: + raise ValueError( + "`max_length` must be an Integer, not `None`." + ) + self._max_length = max_length + self._initializer = tf.keras.initializers.get(initializer) + self._seq_axis = seq_axis + + def get_config(self): + config = { + "max_length": self._max_length, + "initializer": tf.keras.initializers.serialize(self._initializer), + "seq_axis": self._seq_axis, + } + base_config = super(PositionEmbedding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def build(self, input_shape): + dimension_list = input_shape.as_list() + + seq_length = dimension_list[self._seq_axis] + width = dimension_list[-1] + + if self._max_length is not None: + weight_sequence_length = self._max_length + else: + weight_sequence_length = seq_length + + self._position_embeddings = self.add_weight( + "embeddings", + shape=[weight_sequence_length, width], + initializer=self._initializer) + + super(PositionEmbedding, self).build(input_shape) + + def call(self, inputs): + input_shape = tf.shape(inputs) + actual_seq_len = input_shape[self._seq_axis] + position_embeddings = self._position_embeddings[:actual_seq_len, :] + new_shape = [1 for _ in inputs.get_shape().as_list()] + new_shape[self._seq_axis] = actual_seq_len + new_shape[-1] = position_embeddings.get_shape().as_list()[-1] + position_embeddings = tf.reshape(position_embeddings, new_shape) + return tf.broadcast_to(position_embeddings, input_shape) diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding_test.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding_test.py new file mode 100644 index 000000000..5197b1c7a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/position_embedding_test.py @@ -0,0 +1,132 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Keras-based positional embedding layer.""" + +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from keras_nlp.layers import position_embedding + + +# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It +# guarantees forward compatibility of this code for the V2 switchover. 
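A quick usage sketch of the `PositionEmbedding` layer under test (the shapes below are illustrative assumptions, not values from the patch): the layer owns a `(max_length, width)` table, slices it down to the runtime sequence length, and broadcasts it over the batch dimension, which is what the dynamic-slicing test further down exercises.

```python
import tensorflow as tf
from keras_nlp.layers import position_embedding

layer = position_embedding.PositionEmbedding(max_length=40)
# batch=2, seq_len=17 (< max_length), width=30.
inputs = tf.zeros([2, 17, 30])
outputs = layer(inputs)  # only the first 17 rows of the table are used
assert outputs.shape == (2, 17, 30)
```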
+@keras_parameterized.run_all_keras_modes +class PositionEmbeddingLayerTest(keras_parameterized.TestCase): + + def test_static_layer_output_shape(self): + # Create a 3-dimensional input (the first dimension is implicit). + sequence_length = 21 + test_layer = position_embedding.PositionEmbedding( + max_length=sequence_length) + width = 30 + input_tensor = tf.keras.Input(shape=(sequence_length, width)) + output_tensor = test_layer(input_tensor) + + # When using static positional embedding shapes, the output is expected + # to be the same as the input shape in all dimensions save batch. + expected_output_shape = [None, sequence_length, width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + # The default output dtype for this layer should be tf.float32. + self.assertEqual(tf.float32, output_tensor.dtype) + + def test_non_default_axis_static(self): + # Create a 3-dimensional input (the first dimension is implicit). + sequence_length = 21 + test_layer = position_embedding.PositionEmbedding( + max_length=sequence_length, seq_axis=2) + width = 30 + input_tensor = tf.keras.Input(shape=(width, sequence_length, width)) + output_tensor = test_layer(input_tensor) + + # When using static positional embedding shapes, the output is expected + # to be the same as the input shape in all dimensions save batch. + expected_output_shape = [None, width, sequence_length, width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + # The default output dtype for this layer should be tf.float32. + self.assertEqual(tf.float32, output_tensor.dtype) + + def test_float16_dtype(self): + # Create a 3-dimensional input (the first dimension is implicit). + sequence_length = 21 + test_layer = position_embedding.PositionEmbedding( + max_length=sequence_length, dtype="float16") + width = 30 + input_tensor = tf.keras.Input(shape=(sequence_length, width)) + output_tensor = test_layer(input_tensor) + + # When using static positional embedding shapes, the output is expected + # to be the same as the input shape in all dimensions save batch. + expected_output_shape = [None, sequence_length, width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + # The default output dtype for this layer should be tf.float32. + self.assertEqual(tf.float16, output_tensor.dtype) + + def test_dynamic_layer_output_shape(self): + max_sequence_length = 40 + test_layer = position_embedding.PositionEmbedding( + max_length=max_sequence_length) + # Create a 3-dimensional input (the first dimension is implicit). + width = 30 + input_tensor = tf.keras.Input(shape=(None, width)) + output_tensor = test_layer(input_tensor) + + # When using dynamic positional embedding shapes, the output is expected + # to be the same as the input shape in all dimensions - but may be None if + # the input shape is None there. + expected_output_shape = [None, None, width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + + def test_non_default_axis_dynamic(self): + max_sequence_length = 60 + test_layer = position_embedding.PositionEmbedding( + max_length=max_sequence_length, seq_axis=2) + # Create a 3-dimensional input (the first dimension is implicit). + width = 30 + input_tensor = tf.keras.Input(shape=(None, None, width)) + output_tensor = test_layer(input_tensor) + + # When using dynamic positional embedding shapes, the output is expected + # to be the same as the input shape in all dimensions - but may be None if + # the input shape is None there. 
+ expected_output_shape = [None, None, None, width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + + def test_dynamic_layer_slicing(self): + max_sequence_length = 40 + test_layer = position_embedding.PositionEmbedding( + max_length=max_sequence_length) + # Create a 3-dimensional input (the first dimension is implicit). + width = 30 + input_tensor = tf.keras.Input(shape=(None, width)) + output_tensor = test_layer(input_tensor) + + model = tf.keras.Model(input_tensor, output_tensor) + + # Create input data that is shorter than max_sequence_length, which should + # trigger a down-slice. + input_length = 17 + # Note: This test explicitly uses a batch size of 1. This is to get around + # Keras' restriction on Model invocations: inputs are expected to have the + # same batch cardinality as outputs. In practice, this layer should be used + # inside a model, where it can be projected when added to another tensor. + input_data = np.ones((1, input_length, width)) + output_data = model.predict(input_data) + + self.assertAllEqual([1, input_length, width], output_data.shape) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/self_attention_mask.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/self_attention_mask.py new file mode 100644 index 000000000..1c0033f71 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/self_attention_mask.py @@ -0,0 +1,55 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras layer that creates a self-attention mask.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='keras_nlp') +class SelfAttentionMask(tf.keras.layers.Layer): + """Create 3D attention mask from a 2D tensor mask. + + inputs[0]: from_tensor: 2D or 3D Tensor of shape + [batch_size, from_seq_length, ...]. + inputs[1]: to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. + """ + + def call(self, inputs, to_mask): + from_shape = tf.shape(inputs) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = tf.shape(to_mask) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), + dtype=inputs.dtype) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding + # tokens), so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=inputs.dtype) + + # Here we broadcast along two dimensions to create the mask.
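+    # [batch_size, from_seq_length, 1] * [batch_size, 1, to_seq_length]
+    #   -> [batch_size, from_seq_length, to_seq_length]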
+ mask = broadcast_ones * to_mask + + return mask diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block.py new file mode 100644 index 000000000..db7a43a93 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block.py @@ -0,0 +1,308 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based TransformerEncoder block layer.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package="keras_nlp") +class TransformerEncoderBlock(tf.keras.layers.Layer): + """TransformerEncoderBlock layer. + + This layer implements the Transformer Encoder from + "Attention Is All You Need". (https://arxiv.org/abs/1706.03762), + which combines a `tf.keras.layers.MultiHeadAttention` layer with a + two-layer feedforward network. + + References: + [Attention Is All You Need](https://arxiv.org/abs/1706.03762) + [BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding](https://arxiv.org/abs/1810.04805) + """ + + def __init__(self, + num_attention_heads, + inner_dim, + inner_activation, + output_range=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_bias=True, + norm_first=False, + norm_epsilon=1e-12, + output_dropout=0.0, + attention_dropout=0.0, + inner_dropout=0.0, + attention_initializer=None, + attention_axes=None, + **kwargs): + """Initializes `TransformerEncoderBlock`. + + Args: + num_attention_heads: Number of attention heads. + inner_dim: The output dimension of the first Dense layer in a two-layer + feedforward network. + inner_activation: The activation for the first Dense layer in a two-layer + feedforward network. + output_range: the sequence output range, [0, output_range) for slicing the + target sequence. `None` means the target sequence is not sliced. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + use_bias: Whether to enable use_bias in attention layer. If set False, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate + dense layers. If set False, output of attention and intermediate dense + layers is normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + output_dropout: Dropout probability for the post-attention and output + dropout. + attention_dropout: Dropout probability for within the attention layer. 
+ inner_dropout: Dropout probability for the first Dense layer in a + two-layer feedforward network. + attention_initializer: Initializer for kernels of attention layers. If set + `None`, attention layers use kernel_initializer as initializer for + kernel. + attention_axes: axes over which the attention is applied. `None` means + attention over all axes, but batch, heads, and features. + **kwargs: keyword arguments/ + """ + super().__init__(**kwargs) + + self._num_heads = num_attention_heads + self._inner_dim = inner_dim + self._inner_activation = inner_activation + self._attention_dropout = attention_dropout + self._attention_dropout_rate = attention_dropout + self._output_dropout = output_dropout + self._output_dropout_rate = output_dropout + self._output_range = output_range + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._use_bias = use_bias + self._norm_first = norm_first + self._norm_epsilon = norm_epsilon + self._inner_dropout = inner_dropout + if attention_initializer: + self._attention_initializer = tf.keras.initializers.get( + attention_initializer) + else: + self._attention_initializer = self._kernel_initializer + self._attention_axes = attention_axes + + def build(self, input_shape): + if isinstance(input_shape, tf.TensorShape): + input_tensor_shape = input_shape + elif isinstance(input_shape, (list, tuple)): + input_tensor_shape = tf.TensorShape(input_shape[0]) + else: + raise ValueError( + "The type of input shape argument is not supported, got: %s" % + type(input_shape)) + einsum_equation = "abc,cd->abd" + if len(input_tensor_shape.as_list()) > 3: + einsum_equation = "...bc,cd->...bd" + hidden_size = input_tensor_shape[-1] + if hidden_size % self._num_heads != 0: + raise ValueError( + "The input size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self._num_heads)) + self._attention_head_size = int(hidden_size // self._num_heads) + common_kwargs = dict( + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + self._attention_layer = tf.keras.layers.MultiHeadAttention( + num_heads=self._num_heads, + key_dim=self._attention_head_size, + dropout=self._attention_dropout, + use_bias=self._use_bias, + kernel_initializer=self._attention_initializer, + attention_axes=self._attention_axes, + name="self_attention", + **common_kwargs) + self._attention_dropout = tf.keras.layers.Dropout(rate=self._output_dropout) + # Use float32 in layernorm for numeric stability. + # It is probably safe in mixed_float16, but we haven't validated this yet. 
+ self._attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32)) + self._intermediate_dense = tf.keras.layers.experimental.EinsumDense( + einsum_equation, + output_shape=(None, self._inner_dim), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="intermediate", + **common_kwargs) + policy = tf.keras.mixed_precision.global_policy() + if policy.name == "mixed_bfloat16": + # bfloat16 causes BERT with the LAMB optimizer to not converge + # as well, so we use float32. + # TODO(b/154538392): Investigate this. + policy = tf.float32 + self._intermediate_activation_layer = tf.keras.layers.Activation( + self._inner_activation, dtype=policy) + self._inner_dropout_layer = tf.keras.layers.Dropout( + rate=self._inner_dropout) + self._output_dense = tf.keras.layers.experimental.EinsumDense( + einsum_equation, + output_shape=(None, hidden_size), + bias_axes="d", + name="output", + kernel_initializer=self._kernel_initializer, + **common_kwargs) + self._output_dropout = tf.keras.layers.Dropout(rate=self._output_dropout) + # Use float32 in layernorm for numeric stability. + self._output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32) + + super(TransformerEncoderBlock, self).build(input_shape) + + def get_config(self): + config = { + "num_attention_heads": + self._num_heads, + "inner_dim": + self._inner_dim, + "inner_activation": + self._inner_activation, + "output_dropout": + self._output_dropout_rate, + "attention_dropout": + self._attention_dropout_rate, + "output_range": + self._output_range, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + "use_bias": + self._use_bias, + "norm_first": + self._norm_first, + "norm_epsilon": + self._norm_epsilon, + "inner_dropout": + self._inner_dropout, + "attention_initializer": + tf.keras.initializers.serialize(self._attention_initializer), + "attention_axes": self._attention_axes, + } + base_config = super(TransformerEncoderBlock, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + """Transformer self-attention encoder block call. + + Args: + inputs: a single tensor or a list of tensors. + `input tensor` as the single sequence of embeddings. + [`input tensor`, `attention mask`] to have the additional attention + mask. + [`query tensor`, `key value tensor`, `attention mask`] to have separate + input streams for the query, and key/value to the multi-head + attention. + + Returns: + An ouput tensor with the same dimensions as input/query tensor. 
+ """ + if isinstance(inputs, (list, tuple)): + if len(inputs) == 2: + input_tensor, attention_mask = inputs + key_value = None + elif len(inputs) == 3: + input_tensor, key_value, attention_mask = inputs + else: + raise ValueError("Unexpected inputs to %s with length at %d" % + (self.__class__, len(inputs))) + else: + input_tensor, key_value, attention_mask = (inputs, None, None) + + if self._output_range: + if self._norm_first: + source_tensor = input_tensor[:, 0:self._output_range, :] + input_tensor = self._attention_layer_norm(input_tensor) + if key_value is not None: + key_value = self._attention_layer_norm(key_value) + target_tensor = input_tensor[:, 0:self._output_range, :] + if attention_mask is not None: + attention_mask = attention_mask[:, 0:self._output_range, :] + else: + if self._norm_first: + source_tensor = input_tensor + input_tensor = self._attention_layer_norm(input_tensor) + if key_value is not None: + key_value = self._attention_layer_norm(key_value) + target_tensor = input_tensor + + if key_value is None: + key_value = input_tensor + attention_output = self._attention_layer( + query=target_tensor, value=key_value, attention_mask=attention_mask) + attention_output = self._attention_dropout(attention_output) + if self._norm_first: + attention_output = source_tensor + attention_output + else: + attention_output = self._attention_layer_norm(target_tensor + + attention_output) + if self._norm_first: + source_attention_output = attention_output + attention_output = self._output_layer_norm(attention_output) + inner_output = self._intermediate_dense(attention_output) + inner_output = self._intermediate_activation_layer(inner_output) + inner_output = self._inner_dropout_layer(inner_output) + layer_output = self._output_dense(inner_output) + layer_output = self._output_dropout(layer_output) + + if self._norm_first: + return source_attention_output + layer_output + + # During mixed precision training, layer norm output is always fp32 for now. + # Casts fp32 for the subsequent add. + layer_output = tf.cast(layer_output, tf.float32) + return self._output_layer_norm(layer_output + attention_output) diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block_test.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block_test.py new file mode 100644 index 000000000..ccbb7247f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/layers/transformer_encoder_block_test.py @@ -0,0 +1,324 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Keras-based transformer block layer.""" + +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from keras_nlp.layers.transformer_encoder_block import TransformerEncoderBlock + + +@keras_parameterized.run_all_keras_modes +@parameterized.named_parameters( + ('base', TransformerEncoderBlock)) +class TransformerEncoderBlockLayerTest(keras_parameterized.TestCase): + + def tearDown(self): + super(TransformerEncoderBlockLayerTest, self).tearDown() + tf.keras.mixed_precision.set_global_policy('float32') + + def test_layer_creation(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, inner_activation='relu') + sequence_length = 21 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(sequence_length, width)) + output_tensor = test_layer(data_tensor) + # The default output of a transformer layer should be the same as the input. + self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list()) + + def test_layer_creation_with_mask(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, inner_activation='relu') + sequence_length = 21 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(sequence_length, width)) + # Create a 2-dimensional input (the first dimension is implicit). + mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length)) + output_tensor = test_layer([data_tensor, mask_tensor]) + # The default output of a transformer layer should be the same as the input. + self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list()) + + def test_layer_invocation(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, inner_activation='relu') + sequence_length = 21 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(sequence_length, width)) + output_tensor = test_layer(data_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(data_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 6 + input_data = 10 * np.random.random_sample( + (batch_size, sequence_length, width)) + _ = model.predict(input_data) + + def test_layer_invocation_with_mask(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, inner_activation='relu') + sequence_length = 21 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(sequence_length, width)) + # Create a 2-dimensional input (the first dimension is implicit). + mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length)) + output_tensor = test_layer([data_tensor, mask_tensor]) + + # Create a model from the test layer. + model = tf.keras.Model([data_tensor, mask_tensor], output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. 
+ batch_size = 6 + input_data = 10 * np.random.random_sample( + (batch_size, sequence_length, width)) + # The attention mask should be of shape (batch, from_seq_len, to_seq_len), + # which here is (batch, sequence_length, sequence_length) + mask_data = np.random.randint( + 2, size=(batch_size, sequence_length, sequence_length)) + _ = model.predict([input_data, mask_data]) + + def test_layer_output_range(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, inner_activation='relu') + sequence_length = 21 + width = 80 + + batch_size = 6 + input_data = 10 * np.random.random_sample( + (batch_size, sequence_length, width)) + mask_data = np.random.randint( + 2, size=(batch_size, sequence_length, sequence_length)) + output_tensor = test_layer([input_data, mask_data]) + + # The layer only attends to the first token and outputs the first token + # embedding. + new_layer = transformer_cls( + num_attention_heads=10, + inner_dim=2048, + inner_activation='relu', + output_range=1) + _ = new_layer([input_data, mask_data]) + new_layer.set_weights(test_layer.get_weights()) + new_output_tensor = new_layer([input_data, mask_data]) + self.assertAllClose( + new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) + + def test_layer_output_range_without_mask(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, + inner_activation='relu', norm_first=True) + sequence_length = 21 + width = 80 + + batch_size = 6 + input_data = 10 * np.random.random_sample( + (batch_size, sequence_length, width)) + output_tensor = test_layer(input_data) + + # The layer only attends to the first token and outputs the first token + # embedding. + new_layer = transformer_cls( + num_attention_heads=10, + inner_dim=2048, + inner_activation='relu', + output_range=1, + norm_first=True) + _ = new_layer(input_data) + new_layer.set_weights(test_layer.get_weights()) + new_output_tensor = new_layer(input_data) + self.assertAllClose( + new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) + + def test_layer_output_range_with_pre_norm(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, + inner_activation='relu', norm_first=True) + sequence_length = 21 + width = 80 + + batch_size = 6 + input_data = 10 * np.random.random_sample( + (batch_size, sequence_length, width)) + mask_data = np.random.randint( + 2, size=(batch_size, sequence_length, sequence_length)) + output_tensor = test_layer([input_data, mask_data]) + + # The layer only attends to the first token and outputs the first token + # embedding. + new_layer = transformer_cls( + num_attention_heads=10, + inner_dim=2048, + inner_activation='relu', + output_range=1, + norm_first=True) + _ = new_layer([input_data, mask_data]) + new_layer.set_weights(test_layer.get_weights()) + new_output_tensor = new_layer([input_data, mask_data]) + self.assertAllClose( + new_output_tensor, output_tensor[:, 0:1, :], atol=5e-5, rtol=0.003) + + def test_layer_invocation_with_float16_dtype(self, transformer_cls): + tf.keras.mixed_precision.set_global_policy('mixed_float16') + test_layer = transformer_cls( + num_attention_heads=10, inner_dim=2048, inner_activation='relu') + sequence_length = 21 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(sequence_length, width)) + # Create a 2-dimensional input (the first dimension is implicit). 
+ mask_tensor = tf.keras.Input(shape=(sequence_length, sequence_length)) + output_tensor = test_layer([data_tensor, mask_tensor]) + + # Create a model from the test layer. + model = tf.keras.Model([data_tensor, mask_tensor], output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 6 + input_data = (10 * np.random.random_sample( + (batch_size, sequence_length, width))) + # The attention mask should be of shape (batch, from_seq_len, to_seq_len), + # which here is (batch, sequence_length, sequence_length) + mask_data = np.random.randint( + 2, size=(batch_size, sequence_length, sequence_length)) + _ = model.predict([input_data, mask_data]) + + def test_transform_with_initializer(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, + inner_dim=2048, + inner_activation='relu', + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)) + sequence_length = 21 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(sequence_length, width)) + output = test_layer(data_tensor) + # The default output of a transformer layer should be the same as the input. + self.assertEqual(data_tensor.shape.as_list(), output.shape.as_list()) + + def test_dynamic_layer_sequence(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=10, + inner_dim=2048, + inner_activation='relu', + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)) + # Create a 3-dimensional input (the first dimension is implicit). + width = 30 + input_tensor = tf.keras.Input(shape=(None, width)) + output_tensor = test_layer(input_tensor) + model = tf.keras.Model(input_tensor, output_tensor) + + input_length = 17 + input_data = np.ones((1, input_length, width)) + output_data = model.predict(input_data) + + self.assertAllEqual([1, input_length, width], output_data.shape) + + def test_separate_qkv(self, transformer_cls): + test_layer = transformer_cls( + num_attention_heads=2, + inner_dim=128, + inner_activation='relu', + kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)) + # Forward path. + q_tensor = tf.zeros([2, 4, 16], dtype=tf.float32) + kv_tensor = tf.zeros([2, 8, 16], dtype=tf.float32) + dummy_mask = tf.zeros([2, 4, 8], dtype=tf.float32) + inputs = [q_tensor, kv_tensor, dummy_mask] + output = test_layer(inputs) + self.assertEqual(output.shape, q_tensor.shape) + + +@keras_parameterized.run_all_keras_modes +class TransformerArgumentTest(keras_parameterized.TestCase): + + def test_use_bias_norm_first(self): + num_attention_heads = 2 + hidden_size = 16 + encoder_block = TransformerEncoderBlock( + num_attention_heads=num_attention_heads, + inner_dim=32, + inner_activation='relu', + output_dropout=0.1, + attention_dropout=0.1, + use_bias=False, + norm_first=True, + norm_epsilon=1e-6, + inner_dropout=0.1, + attention_initializer=tf.keras.initializers.RandomUniform( + minval=0., maxval=1.)) + # Forward path. 
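+    # Batch of 2, sequence length 4, hidden size 16; the zero-valued mask is
+    # only there to exercise the masked code path.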
+ dummy_tensor = tf.zeros([2, 4, 16], dtype=tf.float32) + dummy_mask = tf.zeros([2, 4, 4], dtype=tf.float32) + inputs = [dummy_tensor, dummy_mask] + output = encoder_block(inputs) + self.assertEqual(output.shape, (2, 4, hidden_size)) + + def test_get_config(self): + num_attention_heads = 2 + encoder_block = TransformerEncoderBlock( + num_attention_heads=num_attention_heads, + inner_dim=32, + inner_activation='relu', + output_dropout=0.1, + attention_dropout=0.1, + use_bias=False, + norm_first=True, + norm_epsilon=1e-6, + inner_dropout=0.1, + attention_initializer=tf.keras.initializers.RandomUniform( + minval=0., maxval=1.)) + encoder_block_config = encoder_block.get_config() + new_encoder_block = TransformerEncoderBlock.from_config( + encoder_block_config) + self.assertEqual(encoder_block_config, new_encoder_block.get_config()) + + @parameterized.parameters({'attention_axes': None}, {'attention_axes': [1]}, + {'attention_axes': [2]}, {'attention_axes': [1, 2]}) + def test_several_attention_axes(self, attention_axes): + test_layer = TransformerEncoderBlock( + inner_dim=32, + inner_activation='relu', + output_dropout=0.1, + attention_dropout=0.1, + use_bias=False, + norm_first=True, + norm_epsilon=1e-6, + inner_dropout=0.1, + num_attention_heads=10, + attention_axes=attention_axes) + num_rows = 21 + num_cols = 13 + width = 80 + # Create a 3-dimensional input (the first dimension is implicit). + data_tensor = tf.keras.Input(shape=(num_rows, num_cols, width)) + output_tensor = test_layer(data_tensor) + # The default output of a transformer layer should be the same as the input. + self.assertEqual(data_tensor.shape.as_list(), output_tensor.shape.as_list()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/requirements.txt b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/requirements.txt new file mode 100644 index 000000000..c765b1ead --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/requirements.txt @@ -0,0 +1 @@ +numpy>=1.15.4 diff --git a/nlp/text_classification/bert/tensorflow2.0/keras_nlp/setup.py b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/setup.py new file mode 100644 index 000000000..0611a450e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/keras_nlp/setup.py @@ -0,0 +1,69 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Setup script.""" + +import os + +from setuptools import find_packages +from setuptools import setup + +version = '0.0.1' + + +def _get_requirements(): + """Parses requirements.txt file.""" + install_requires_tmp = [] + dependency_links_tmp = [] + with open( + os.path.join(os.path.dirname(__file__), './requirements.txt'), 'r') as f: + for line in f: + package_name = line.strip() + # Skip empty line or comments starting with "#". 
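+      # Illustrative requirements.txt lines and how they are handled below:
+      #   "numpy>=1.15.4"      -> appended to install_requires
+      #   "-e <editable-url>"  -> URL appended to dependency_links
+      #   "" or "# comment"    -> skipped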
+ if not package_name or package_name[0] == '#': + continue + if package_name.startswith('-e '): + dependency_links_tmp.append(package_name[3:].strip()) + else: + install_requires_tmp.append(package_name) + return install_requires_tmp, dependency_links_tmp + +install_requires, dependency_links = _get_requirements() + +install_requires.append('tf-nightly') + +setup( + name='keras-nlp', + version=version, + description='Keras Natural Language Processing Library', + url='https://github.com/keras-team/keras-nlp', + author='The Keras authors', + author_email='keras-team@google.com', + license='Apache License 2.0', + install_requires=install_requires, + classifiers=[ + 'Programming Language :: Python', + 'Programming Language :: Python :: 3.6', + 'Operating System :: Unix', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: MacOS', + 'Intended Audience :: Science/Research', + 'Topic :: Scientific/Engineering', + 'Topic :: Software Development' + ], + packages=find_packages(exclude=('tests',)), + exclude_package_data={'': ['*_test.py',],}, + dependency_links=dependency_links, + python_requires='>=3.6', +) diff --git a/nlp/text_classification/bert/tensorflow2.0/model_saving_utils.py b/nlp/text_classification/bert/tensorflow2.0/model_saving_utils.py new file mode 100644 index 000000000..1d6975087 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/model_saving_utils.py @@ -0,0 +1,68 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities to save models.""" + +import os + +from absl import logging +import tensorflow as tf +import typing + + +def export_bert_model(model_export_path: typing.Text, + model: tf.keras.Model, + checkpoint_dir: typing.Optional[typing.Text] = None, + restore_model_using_load_weights: bool = False) -> None: + """Export BERT model for serving which does not include the optimizer. + + Args: + model_export_path: Path to which exported model will be saved. + model: Keras model object to export. + checkpoint_dir: Path from which model weights will be loaded, if + specified. + restore_model_using_load_weights: Whether to use checkpoint.restore() API + for custom checkpoint or to use model.load_weights() API. There are 2 + different ways to save checkpoints. One is using tf.train.Checkpoint and + another is using Keras model.save_weights(). Custom training loop + implementation uses tf.train.Checkpoint API and Keras ModelCheckpoint + callback internally uses model.save_weights() API. Since these two API's + cannot be used toghether, model loading logic must be take into account + how model checkpoint was saved. + + Raises: + ValueError when either model_export_path or model is not specified. 
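+
+  Example (an illustrative sketch; the model builder and the paths below are
+  assumptions, not part of this module):
+
+    model = build_classifier_model()  # any tf.keras.Model
+    export_bert_model('/tmp/bert_export', model,
+                      checkpoint_dir='/tmp/bert_ckpts',
+                      restore_model_using_load_weights=False)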
+ """ + if not model_export_path: + raise ValueError('model_export_path must be specified.') + if not isinstance(model, tf.keras.Model): + raise ValueError('model must be a tf.keras.Model object.') + + if checkpoint_dir: + if restore_model_using_load_weights: + model_weight_path = os.path.join(checkpoint_dir, 'checkpoint') + assert tf.io.gfile.exists(model_weight_path) + model.load_weights(model_weight_path) + else: + checkpoint = tf.train.Checkpoint(model=model) + + # Restores the model from latest checkpoint. + latest_checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir) + assert latest_checkpoint_file + logging.info('Checkpoint file %s found and restoring from ' + 'checkpoint', latest_checkpoint_file) + checkpoint.restore( + latest_checkpoint_file).assert_existing_objects_matched() + + model.save(model_export_path, include_optimizer=False, save_format='tf') diff --git a/nlp/text_classification/bert/tensorflow2.0/model_training_utils.py b/nlp/text_classification/bert/tensorflow2.0/model_training_utils.py new file mode 100644 index 000000000..762fee690 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/model_training_utils.py @@ -0,0 +1,590 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A light weight utilities to train NLP models.""" + +import json +import os +import tempfile + +from absl import logging +import tensorflow as tf +from tensorflow.python.util import deprecation +from common import distribute_utils +from staging.training import grad_utils + +_SUMMARY_TXT = 'training_summary.txt' +_MIN_SUMMARY_STEPS = 10 + + +def _should_export_checkpoint(strategy): + return (not strategy) or strategy.extended.should_checkpoint + + +def _should_export_summary(strategy): + return (not strategy) or strategy.extended.should_save_summary + + +def _save_checkpoint(strategy, checkpoint, model_dir, checkpoint_prefix): + """Saves model to with provided checkpoint prefix.""" + + if _should_export_checkpoint(strategy): + checkpoint_path = os.path.join(model_dir, checkpoint_prefix) + saved_path = checkpoint.save(checkpoint_path) + logging.info('Saving model as TF checkpoint: %s', saved_path) + else: + # In multi worker training we need every worker to save checkpoint, because + # variables can trigger synchronization on read and synchronization needs + # all workers to participate. To avoid workers overriding each other we save + # to a temporary directory on non-chief workers. + tmp_dir = tempfile.mkdtemp() + checkpoint.save(os.path.join(tmp_dir, 'ckpt')) + tf.io.gfile.rmtree(tmp_dir) + return + + +def _get_input_iterator(input_fn, strategy): + """Returns distributed dataset iterator.""" + # When training with TPU pods, datasets needs to be cloned across + # workers. Since Dataset instance cannot be cloned in eager mode, we instead + # pass callable that returns a dataset. 
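+  # An illustrative `input_fn` closure (the dataset below is an assumption):
+  #   def train_input_fn(input_context=None):
+  #     dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+  #     return dataset.shuffle(1024).batch(32, drop_remainder=True)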
+ if not callable(input_fn): + raise ValueError('`input_fn` should be a closure that returns a dataset.') + iterator = iter(strategy.distribute_datasets_from_function(input_fn)) + return iterator + + +def _float_metric_value(metric): + """Gets the value of a float-value keras metric.""" + return metric.result().numpy().astype(float) + + +def clip_by_global_norm_callback(grads_and_vars): + """Performs gradient clipping.""" + grads, variables = zip(*grads_and_vars) + (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + return zip(clipped_grads, variables) + + +def steps_to_run(current_step, steps_per_epoch, steps_per_loop): + """Calculates steps to run on device.""" + if steps_per_loop <= 0: + raise ValueError('steps_per_loop should be positive integer.') + if steps_per_loop == 1: + return steps_per_loop + remainder_in_epoch = current_step % steps_per_epoch + if remainder_in_epoch != 0: + return min(steps_per_epoch - remainder_in_epoch, steps_per_loop) + else: + return steps_per_loop + + +def write_txt_summary(training_summary, summary_dir): + """Writes a summary text file to record stats.""" + if not tf.io.gfile.exists(summary_dir): + tf.io.gfile.mkdir(summary_dir) + summary_path = os.path.join(summary_dir, _SUMMARY_TXT) + with tf.io.gfile.GFile(summary_path, 'wb') as f: + logging.info('Training Summary: \n%s', str(training_summary)) + f.write(json.dumps(training_summary, indent=4)) + + +@deprecation.deprecated( + None, 'This function is deprecated and we do not expect adding new ' + 'functionalities. Please do not have your code depending ' + 'on this library.') +def run_customized_training_loop( + # pylint: disable=invalid-name + _sentinel=None, + # pylint: enable=invalid-name + strategy=None, + model_fn=None, + loss_fn=None, + scale_loss=True, + model_dir=None, + train_input_fn=None, + steps_per_epoch=None, + num_eval_per_epoch=1, + steps_per_loop=None, + epochs=1, + eval_input_fn=None, + eval_steps=None, + metric_fn=None, + init_checkpoint=None, + custom_callbacks=None, + run_eagerly=False, + sub_model_export_name=None, + explicit_allreduce=False, + pre_allreduce_callbacks=None, + post_allreduce_callbacks=None, + train_summary_interval=0, + allreduce_bytes_per_pack=0): + """Run BERT pretrain model training using low-level API. + + Args: + _sentinel: Used to prevent positional parameters. Internal, do not use. + strategy: Distribution strategy on which to run low level training loop. + model_fn: Function that returns a tuple (model, sub_model). Caller of this + function should add optimizer to the `model` via calling + `model.compile()` API or manually setting `model.optimizer` attribute. + Second element of the returned tuple(sub_model) is an optional sub model + to be used for initial checkpoint -- if provided. + loss_fn: Function with signature func(labels, logits) and returns a loss + tensor. + scale_loss: Whether to divide the raw loss by number of replicas before + gradients calculation. + model_dir: Model directory used during training for restoring/saving model + weights. + train_input_fn: Function that returns a tf.data.Dataset used for training. + steps_per_epoch: Number of steps to run per epoch. At the end of each + epoch, model checkpoint will be saved and evaluation will be conducted + if evaluation dataset is provided. + num_eval_per_epoch: Number of evaluations per epoch. + steps_per_loop: Number of steps per graph-mode loop. In order to reduce + communication in eager context, training logs are printed every + steps_per_loop. 
+    epochs: Number of epochs to train.
+    eval_input_fn: Function that returns evaluation dataset. If None,
+      evaluation is skipped.
+    eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
+      is not None.
+    metric_fn: A metrics function that returns either a Keras Metric object or
+      a list of Keras Metric objects to record evaluation results using the
+      evaluation dataset or the training dataset after every epoch.
+    init_checkpoint: Optional checkpoint to load to `sub_model` returned by
+      `model_fn`.
+    custom_callbacks: A list of Keras Callback objects to run during
+      training. More specifically, `on_train_begin(), on_train_end(),
+      on_batch_begin()`, `on_batch_end()`, `on_epoch_begin()`,
+      `on_epoch_end()` methods are invoked during training. Note that some
+      metrics may be missing from `logs`.
+    run_eagerly: Whether to run model training in pure eager execution. This
+      should be disabled for TPUStrategy.
+    sub_model_export_name: If not None, will export `sub_model` returned by
+      `model_fn` into checkpoint files. The name of an intermediate checkpoint
+      file is {sub_model_export_name}_step_{step}.ckpt and the last
+      checkpoint's name is {sub_model_export_name}.ckpt; if None, `sub_model`
+      will not be exported as a checkpoint.
+    explicit_allreduce: Whether to explicitly perform gradient allreduce,
+      instead of relying on implicit allreduce in optimizer.apply_gradients().
+      Default is False. For now, if training using FP16 mixed precision,
+      explicit allreduce will aggregate gradients in FP16 format. For TPU and
+      GPU training using FP32, explicit allreduce will aggregate gradients in
+      FP32 format.
+    pre_allreduce_callbacks: A list of callback functions that take gradient
+      and model variable pairs as input, manipulate them, and return new
+      gradient and model variable pairs. The callback functions will be
+      invoked in the list order and before gradients are allreduced. With
+      mixed precision training, the pre_allreduce_callbacks will be applied on
+      scaled_gradients. Default is no callbacks. Only used when
+      explicit_allreduce=True.
+    post_allreduce_callbacks: A list of callback functions that take gradient
+      and model variable pairs as input, manipulate them, and return new
+      gradient and model variable pairs. The callback functions will be
+      invoked in the list order and right before gradients are applied to
+      variables for updates. Default is no callbacks. Only used when
+      explicit_allreduce=True.
+    train_summary_interval: Step interval for training summaries. If the value
+      is a negative number, then training summaries are not enabled.
+    allreduce_bytes_per_pack: A non-negative integer. Breaks collective
+      operations into packs of a certain size. If it's zero, all gradients are
+      in one pack. Breaking gradients into packs could enable overlap between
+      allreduce and backprop computation. This flag only takes effect when
+      explicit_allreduce is set to True.
+
+  Returns:
+    Trained model.
+
+  Raises:
+    ValueError: (1) When model returned by `model_fn` does not have optimizer
+      attribute or when required parameters are set to None. (2) eval args are
+      not specified correctly. (3) metric_fn must be a callable if specified.
+      (4) sub_model_checkpoint_name is specified, but `sub_model` returned
+      by `model_fn` is None.
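+
+  Example (illustrative only; `build_model_fn` and `train_input_fn` are
+  assumptions, not defined in this module):
+
+    strategy = tf.distribute.MirroredStrategy()
+    trained_model = run_customized_training_loop(
+        strategy=strategy,
+        model_fn=build_model_fn,        # returns (compiled_model, sub_model)
+        loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(
+            from_logits=True),
+        model_dir='/tmp/model_dir',
+        train_input_fn=train_input_fn,  # closure returning a tf.data.Dataset
+        steps_per_epoch=1000,
+        steps_per_loop=100,
+        epochs=3)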
+ """ + + if _sentinel is not None: + raise ValueError('only call `run_customized_training_loop()` ' + 'with named arguments.') + + required_arguments = [ + strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn + ] + + steps_between_evals = int(steps_per_epoch / num_eval_per_epoch) + if [arg for arg in required_arguments if arg is None]: + raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, ' + '`steps_per_epoch` and `train_input_fn` are required ' + 'parameters.') + if not steps_per_loop: + if tf.config.list_logical_devices('TPU'): + # One can't fully utilize a TPU with steps_per_loop=1, so in this case + # default users to a more useful value. + steps_per_loop = min(1000, steps_between_evals) + else: + steps_per_loop = 1 + logging.info('steps_per_loop not specified. Using steps_per_loop=%d', + steps_per_loop) + if steps_per_loop > steps_between_evals: + logging.warning( + 'steps_per_loop: %d is specified to be greater than ' + ' steps_between_evals: %d, we will use steps_between_evals as' + ' steps_per_loop.', steps_per_loop, steps_between_evals) + steps_per_loop = steps_between_evals + assert tf.executing_eagerly() + + if run_eagerly: + if isinstance( + strategy, + (tf.distribute.TPUStrategy, tf.distribute.experimental.TPUStrategy)): + raise ValueError( + 'TPUStrategy should not run eagerly as it heavily relies on graph' + ' optimization for the distributed system.') + + if eval_input_fn and eval_steps is None: + raise ValueError( + '`eval_step` is required when `eval_input_fn ` is not none.') + if metric_fn and not callable(metric_fn): + raise ValueError( + 'if `metric_fn` is specified, metric_fn must be a callable.') + + total_training_steps = steps_per_epoch * epochs + train_iterator = _get_input_iterator(train_input_fn, strategy) + eval_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32) + + with distribute_utils.get_strategy_scope(strategy): + # To correctly place the model weights on accelerators, + # model and optimizer should be created in scope. + model, sub_model = model_fn() + if not hasattr(model, 'optimizer'): + raise ValueError('User should set optimizer attribute to model ' + 'inside `model_fn`.') + if sub_model_export_name and sub_model is None: + raise ValueError('sub_model_export_name is specified as %s, but ' + 'sub_model is None.' % sub_model_export_name) + + callback_list = tf.keras.callbacks.CallbackList( + callbacks=custom_callbacks, model=model) + + optimizer = model.optimizer + + if init_checkpoint: + logging.info( + 'Checkpoint file %s found and restoring from ' + 'initial checkpoint for core model.', init_checkpoint) + checkpoint = tf.train.Checkpoint(model=sub_model, encoder=sub_model) + checkpoint.read(init_checkpoint).assert_existing_objects_matched() + logging.info('Loading from checkpoint file completed') + + train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32) + eval_metrics = metric_fn() if metric_fn else [] + if not isinstance(eval_metrics, list): + eval_metrics = [eval_metrics] + # If evaluation is required, make a copy of metric as it will be used by + # both train and evaluation. 
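+    # (Cloning each metric via metric.__class__.from_config(metric.get_config())
+    # yields an independent instance with the same configuration, so the train
+    # and eval accumulators never share state.)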
+ train_metrics = [ + metric.__class__.from_config(metric.get_config()) + for metric in eval_metrics + ] + + # Create summary writers + if _should_export_summary(strategy): + summary_dir = os.path.join(model_dir, 'summaries') + else: + # In multi worker training we need every worker to write summary, because + # variables can trigger synchronization on read and synchronization needs + # all workers to participate. + summary_dir = tempfile.mkdtemp() + eval_summary_writer = tf.summary.create_file_writer( + os.path.join(summary_dir, 'eval')) + last_summary_step = 0 + if steps_per_loop >= _MIN_SUMMARY_STEPS and train_summary_interval >= 0: + # Only writes summary when the stats are collected sufficiently over + # enough steps. + train_summary_writer = tf.summary.create_file_writer( + os.path.join(summary_dir, 'train')) + else: + train_summary_writer = tf.summary.create_noop_writer() + + # Collects training variables. + training_vars = model.trainable_variables + + def _replicated_step(inputs): + """Replicated training step.""" + + inputs, labels = inputs + with tf.GradientTape() as tape: + model_outputs = model(inputs, training=True) + loss = loss_fn(labels, model_outputs) + # Raw loss is used for reporting in metrics/logs. + raw_loss = loss + if scale_loss: + # Scales down the loss for gradients to be invariant from replicas. + loss = loss / strategy.num_replicas_in_sync + + if explicit_allreduce: + grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss, + training_vars, + pre_allreduce_callbacks, + post_allreduce_callbacks, + allreduce_bytes_per_pack) + else: + if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): + with tape: + scaled_loss = optimizer.get_scaled_loss(loss) + scaled_grads = tape.gradient(scaled_loss, training_vars) + grads = optimizer.get_unscaled_gradients(scaled_grads) + else: + grads = tape.gradient(loss, training_vars) + optimizer.apply_gradients(zip(grads, training_vars)) + # For reporting, the metric takes the mean of losses. + train_loss_metric.update_state(raw_loss) + for metric in train_metrics: + metric.update_state(labels, model_outputs) + + @tf.function + def train_steps(iterator, steps): + """Performs distributed training steps in a loop. + + Args: + iterator: the distributed iterator of training datasets. + steps: an tf.int32 integer tensor to specify number of steps to run + inside host training loop. + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + if not isinstance(steps, tf.Tensor): + raise ValueError('steps should be an Tensor. Python object may cause ' + 'retracing.') + + for _ in tf.range(steps): + strategy.run(_replicated_step, args=(next(iterator),)) + + def train_single_step(iterator): + """Performs a distributed training step. + + Args: + iterator: the distributed iterator of training datasets. + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. 
+ """ + strategy.run(_replicated_step, args=(next(iterator),)) + + def test_step(iterator): + """Calculates evaluation metrics on distributed devices.""" + + def _test_step_fn(inputs): + """Replicated accuracy calculation.""" + + inputs, labels = inputs + model_outputs = model(inputs, training=False) + for metric in eval_metrics: + metric.update_state(labels, model_outputs) + return model_outputs, labels + + outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),)) + outputs = tf.nest.map_structure(strategy.experimental_local_results, + outputs) + labels = tf.nest.map_structure(strategy.experimental_local_results, + labels) + return outputs, labels + + if not run_eagerly: + train_single_step = tf.function(train_single_step) + test_step = tf.function(test_step) + + def _run_evaluation(current_training_step, test_iterator): + """Runs validation steps and aggregate metrics. + + Args: + current_training_step: tf.int32 tensor containing the current step. + test_iterator: distributed iterator of test datasets. + + Returns: + A dict of metic names and values. + """ + # The last batch of the evaluation is often smaller than previous ones. + # Moreover, in some distributed pieces it might even be empty. Therefore, + # different from the way training_loss is calculated, it is needed to + # gather all the logits and labels here to calculate the evaluation loss + # outside. + loss_list, loss_weights = list(), list() + for _ in range(eval_steps): + outputs, labels = test_step(test_iterator) + for cur_logits, cur_labels in zip(outputs, labels): + # This is to handle cases when cur_labels is not a single tensor, + # but a dict of tensors. + cur_weight = tf.shape(tf.nest.flatten(cur_labels)[0])[0] + if cur_weight != 0: + loss_list.append(loss_fn(cur_labels, cur_logits).numpy()) + loss_weights.append(cur_weight) + # The sample_weights are the actual number of examples in each batch, + # a summation of numbers of examples in each replica if using + # distributed training. + eval_loss_metric.update_state(loss_list, sample_weight=loss_weights) + + logs = {} + with eval_summary_writer.as_default(): + for metric in [eval_loss_metric] + eval_metrics + model.metrics: + metric_value = _float_metric_value(metric) + logs[metric.name] = metric_value + logging.info('Step: [%d] Validation %s = %f', current_training_step, + metric.name, metric_value) + tf.summary.scalar( + metric.name, metric_value, step=current_training_step) + eval_summary_writer.flush() + + return logs + + # Training loop starts here. + checkpoint = tf.train.Checkpoint( + model=model, optimizer=optimizer, global_step=optimizer.iterations) + sub_model_checkpoint = tf.train.Checkpoint( + model=sub_model, + global_step=optimizer.iterations) if sub_model_export_name else None + + latest_checkpoint_file = tf.train.latest_checkpoint(model_dir) + if latest_checkpoint_file: + logging.info('Checkpoint file %s found and restoring from ' + 'checkpoint', latest_checkpoint_file) + checkpoint.restore(latest_checkpoint_file) + logging.info('Loading from checkpoint file completed') + + current_step = optimizer.iterations.numpy() + checkpoint_name = 'ctl_step_{step}.ckpt' + + logs = {} + callback_list.on_train_begin() + while current_step < total_training_steps and not model.stop_training: + if current_step % steps_per_epoch == 0: + callback_list.on_epoch_begin(int(current_step / steps_per_epoch) + 1) + + # Training loss/metric are taking average over steps inside micro + # training loop. We reset the their values before each round. 
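+    # (tf.keras.metrics.Mean keeps a running total and count across
+    # update_state() calls; reset_states() zeroes both, so the next logging
+    # window reports only its own steps.)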
+ train_loss_metric.reset_states() + for metric in train_metrics + model.metrics: + metric.reset_states() + + callback_list.on_batch_begin(current_step) + # Runs several steps in the host while loop. + steps = steps_to_run(current_step, steps_between_evals, steps_per_loop) + + if tf.config.list_physical_devices('GPU'): + # TODO(zongweiz): merge with train_steps once tf.while_loop + # GPU performance bugs are fixed. + for _ in range(steps): + train_single_step(train_iterator) + else: + # Converts steps to a Tensor to avoid tf.function retracing. + train_steps(train_iterator, tf.convert_to_tensor(steps, dtype=tf.int32)) + train_loss = _float_metric_value(train_loss_metric) + current_step += steps + + # Updates training logging. + training_status = 'Train Step: %d/%d / loss = %s' % ( + current_step, total_training_steps, train_loss) + + if current_step >= last_summary_step + train_summary_interval: + summary_writer = train_summary_writer + last_summary_step = current_step + else: + summary_writer = tf.summary.create_noop_writer() + + with summary_writer.as_default(): + if callable(optimizer.learning_rate): + tf.summary.scalar( + 'learning_rate', + optimizer.learning_rate(current_step), + step=current_step) + tf.summary.scalar(train_loss_metric.name, train_loss, step=current_step) + for metric in train_metrics + model.metrics: + metric_value = _float_metric_value(metric) + training_status += ' %s = %f' % (metric.name, metric_value) + tf.summary.scalar(metric.name, metric_value, step=current_step) + summary_writer.flush() + logging.info(training_status) + + # If no need for evaluation, we only call on_batch_end with train_loss, + # this is to ensure we get granular global_step/sec on Tensorboard. + if current_step % steps_between_evals: + callback_list.on_batch_end(current_step - 1, {'loss': train_loss}) + else: + # Save a submodel with the step in the file name after each epoch. + if sub_model_export_name: + _save_checkpoint( + strategy, sub_model_checkpoint, model_dir, + '%s_step_%d.ckpt' % (sub_model_export_name, current_step)) + + # Save model checkpoints and run validation steps after each epoch + # (with the exception of the final epoch which is handled after the + # training loop). + if current_step < total_training_steps: + _save_checkpoint(strategy, checkpoint, model_dir, + checkpoint_name.format(step=current_step)) + if eval_input_fn: + # Re-initialize evaluation metric. + eval_loss_metric.reset_states() + for metric in eval_metrics + model.metrics: + metric.reset_states() + + logging.info('Running evaluation after step: %s.', current_step) + logs = _run_evaluation(current_step, + _get_input_iterator(eval_input_fn, strategy)) + # We add train_loss here rather than call on_batch_end twice to make + # sure that no duplicated values are generated. + logs['loss'] = train_loss + callback_list.on_batch_end(current_step - 1, logs) + + # Calls on_epoch_end after each real epoch ends to prevent mis-calculation + # of training steps. + if current_step % steps_per_epoch == 0: + callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs) + + if sub_model_export_name: + _save_checkpoint(strategy, sub_model_checkpoint, model_dir, + '%s.ckpt' % sub_model_export_name) + + _save_checkpoint(strategy, checkpoint, model_dir, + checkpoint_name.format(step=current_step)) + if eval_input_fn: + # Re-initialize evaluation metric. 
+ eval_loss_metric.reset_states() + for metric in eval_metrics + model.metrics: + metric.reset_states() + + logging.info('Running final evaluation after training is complete.') + logs = _run_evaluation(current_step, + _get_input_iterator(eval_input_fn, strategy)) + callback_list.on_epoch_end(int(current_step / steps_per_epoch), logs) + training_summary = { + 'total_training_steps': total_training_steps, + 'train_loss': _float_metric_value(train_loss_metric), + } + for metric in model.metrics: + training_summary[metric.name] = _float_metric_value(metric) + if eval_metrics: + training_summary['last_train_metrics'] = _float_metric_value( + train_metrics[0]) + training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0]) + + write_txt_summary(training_summary, summary_dir) + + if not _should_export_summary(strategy): + tf.io.gfile.rmtree(summary_dir) + + callback_list.on_train_end() + + return model diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/__init__.py b/nlp/text_classification/bert/tensorflow2.0/modeling/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/activations/__init__.py b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/__init__.py new file mode 100644 index 000000000..3237bbe6f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Activations package definition.""" +from modeling.activations.gelu import gelu +from modeling.activations.relu import relu6 +from modeling.activations.sigmoid import hard_sigmoid +from modeling.activations.swish import hard_swish +from modeling.activations.swish import identity +from modeling.activations.swish import simple_swish diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/activations/gelu.py b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/gelu.py new file mode 100644 index 000000000..a73294aa5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/gelu.py @@ -0,0 +1,32 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Gaussian error linear unit.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + return tf.keras.activations.gelu(x, approximate=True) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/activations/relu.py b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/relu.py new file mode 100644 index 000000000..b3941b2f3 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/relu.py @@ -0,0 +1,31 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Customized Relu activation.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def relu6(features): + """Computes the Relu6 activation function. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return tf.nn.relu6(features) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/activations/sigmoid.py b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/sigmoid.py new file mode 100644 index 000000000..277463040 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/sigmoid.py @@ -0,0 +1,31 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Customized Sigmoid activation.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def hard_sigmoid(features): + """Computes the hard sigmoid activation function. + + Args: + features: A `Tensor` representing preactivation values. 
+ + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return tf.nn.relu6(features + tf.cast(3., features.dtype)) * 0.16667 diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/activations/swish.py b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/swish.py new file mode 100644 index 000000000..ea79985e3 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/activations/swish.py @@ -0,0 +1,72 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Customized Swish activation.""" + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +def simple_swish(features): + """Computes the Swish activation function. + + The tf.nn.swish operation uses a custom gradient to reduce memory usage. + Since saving custom gradients in SavedModel is currently not supported, and + one would not be able to use an exported TF-Hub module for fine-tuning, we + provide this wrapper that can allow to select whether to use the native + TensorFlow swish operation, or whether to use a customized operation that + has uses default TensorFlow gradient computation. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return features * tf.nn.sigmoid(features) + + +@tf.keras.utils.register_keras_serializable(package='Text') +def hard_swish(features): + """Computes a hard version of the swish function. + + This operation can be used to reduce computational cost and improve + quantization for edge devices. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + fdtype = features.dtype + return features * tf.nn.relu6(features + tf.cast(3., fdtype)) * (1. / 6.) + + +@tf.keras.utils.register_keras_serializable(package='Text') +def identity(features): + """Computes the identity function. + + Useful for helping in quantization. + + Args: + features: A `Tensor` representing preactivation values. + + Returns: + The activation value. + """ + features = tf.convert_to_tensor(features) + return tf.identity(features) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/__init__.py b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/__init__.py new file mode 100644 index 000000000..e47d28b1d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Hyperparams package definition.""" +# pylint: disable=g-multiple-import +from modeling.hyperparams.base_config import * +from modeling.hyperparams.oneof import * +from modeling.hyperparams.params_dict import * + diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/base_config.py b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/base_config.py new file mode 100644 index 000000000..07dcf4d0c --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/base_config.py @@ -0,0 +1,270 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base configurations to standardize experiments.""" + +import copy +import functools +from typing import Any, List, Mapping, Optional, Type +from absl import logging + +import dataclasses +import tensorflow as tf +import yaml + +from modeling.hyperparams import params_dict + + +@dataclasses.dataclass +class Config(params_dict.ParamsDict): + """The base configuration class that supports YAML/JSON based overrides. + + Because of YAML/JSON serialization limitations, some semantics of dataclass + are not supported: + * It recursively enforces a allowlist of basic types and container types, so + it avoids surprises with copy and reuse caused by unanticipated types. + * Warning: it converts Dict to `Config` even within sequences, + e.g. for config = Config({'key': [([{'a': 42}],)]), + type(config.key[0][0][0]) is Config rather than dict. + If you define/annotate some field as Dict, the field will convert to a + `Config` instance and lose the dictionary type. + """ + + # It's safe to add bytes and other immutable types here. + IMMUTABLE_TYPES = (str, int, float, bool, type(None)) + # It's safe to add set, frozenset and other collections here. + SEQUENCE_TYPES = (list, tuple) + + default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None + restrictions: dataclasses.InitVar[Optional[List[str]]] = None + + @classmethod + def _isvalidsequence(cls, v): + """Check if the input values are valid sequences. + + Args: + v: Input sequence. + + Returns: + True if the sequence is valid. Valid sequence includes the sequence + type in cls.SEQUENCE_TYPES and element type is in cls.IMMUTABLE_TYPES or + is dict or ParamsDict. 
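+
+    Example (illustrative):
+      Config._isvalidsequence([1, 2, 3])      # True: all immutable types
+      Config._isvalidsequence([{'a': 1}])     # True: all dict elements
+      Config._isvalidsequence([1, {'a': 1}])  # False: mixed element types
+      Config._isvalidsequence({'a': 1})       # False: not a list or tuple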
+ """ + if not isinstance(v, cls.SEQUENCE_TYPES): + return False + return (all(isinstance(e, cls.IMMUTABLE_TYPES) for e in v) or + all(isinstance(e, dict) for e in v) or + all(isinstance(e, params_dict.ParamsDict) for e in v)) + + @classmethod + def _import_config(cls, v, subconfig_type): + """Returns v with dicts converted to Configs, recursively.""" + if not issubclass(subconfig_type, params_dict.ParamsDict): + raise TypeError( + 'Subconfig_type should be subclass of ParamsDict, found {!r}'.format( + subconfig_type)) + if isinstance(v, cls.IMMUTABLE_TYPES): + return v + elif isinstance(v, cls.SEQUENCE_TYPES): + # Only support one layer of sequence. + if not cls._isvalidsequence(v): + raise TypeError( + 'Invalid sequence: only supports single level {!r} of {!r} or ' + 'dict or ParamsDict found: {!r}'.format(cls.SEQUENCE_TYPES, + cls.IMMUTABLE_TYPES, v)) + import_fn = functools.partial( + cls._import_config, subconfig_type=subconfig_type) + return type(v)(map(import_fn, v)) + elif isinstance(v, params_dict.ParamsDict): + # Deepcopy here is a temporary solution for preserving type in nested + # Config object. + return copy.deepcopy(v) + elif isinstance(v, dict): + return subconfig_type(v) + else: + raise TypeError('Unknown type: {!r}'.format(type(v))) + + @classmethod + def _export_config(cls, v): + """Returns v with Configs converted to dicts, recursively.""" + if isinstance(v, cls.IMMUTABLE_TYPES): + return v + elif isinstance(v, cls.SEQUENCE_TYPES): + return type(v)(map(cls._export_config, v)) + elif isinstance(v, params_dict.ParamsDict): + return v.as_dict() + elif isinstance(v, dict): + raise TypeError('dict value not supported in converting.') + else: + raise TypeError('Unknown type: {!r}'.format(type(v))) + + @classmethod + def _get_subconfig_type(cls, k) -> Type[params_dict.ParamsDict]: + """Get element type by the field name. + + Args: + k: the key/name of the field. + + Returns: + Config as default. If a type annotation is found for `k`, + 1) returns the type of the annotation if it is subtype of ParamsDict; + 2) returns the element type if the annotation of `k` is List[SubType] + or Tuple[SubType]. + """ + subconfig_type = Config + if k in cls.__annotations__: + # Directly Config subtype. + type_annotation = cls.__annotations__[k] # pytype: disable=invalid-annotation + if (isinstance(type_annotation, type) and + issubclass(type_annotation, Config)): + subconfig_type = cls.__annotations__[k] # pytype: disable=invalid-annotation + else: + # Check if the field is a sequence of subtypes. + field_type = getattr(type_annotation, '__origin__', type(None)) + if (isinstance(field_type, type) and + issubclass(field_type, cls.SEQUENCE_TYPES)): + element_type = getattr(type_annotation, '__args__', [type(None)])[0] + subconfig_type = ( + element_type if issubclass(element_type, params_dict.ParamsDict) + else subconfig_type) + return subconfig_type + + def __post_init__(self, default_params, restrictions, *args, **kwargs): + super().__init__( + default_params=default_params, + restrictions=restrictions, + *args, + **kwargs) + + def _set(self, k, v): + """Overrides same method in ParamsDict. + + Also called by ParamsDict methods. + + Args: + k: key to set. + v: value. 
+ + Raises: + RuntimeError + """ + subconfig_type = self._get_subconfig_type(k) + + def is_null(k): + if k not in self.__dict__ or not self.__dict__[k]: + return True + return False + + if isinstance(v, dict): + if is_null(k): + # If the key not exist or the value is None, a new Config-family object + # sould be created for the key. + self.__dict__[k] = subconfig_type(v) + else: + self.__dict__[k].override(v) + elif not is_null(k) and isinstance(v, self.SEQUENCE_TYPES) and all( + [not isinstance(e, self.IMMUTABLE_TYPES) for e in v]): + if len(self.__dict__[k]) == len(v): + for i in range(len(v)): + self.__dict__[k][i].override(v[i]) + elif not all([isinstance(e, self.IMMUTABLE_TYPES) for e in v]): + logging.warning( + "The list/tuple don't match the value dictionaries provided. Thus, " + 'the list/tuple is determined by the type annotation and ' + 'values provided. This is error-prone.') + self.__dict__[k] = self._import_config(v, subconfig_type) + else: + self.__dict__[k] = self._import_config(v, subconfig_type) + else: + self.__dict__[k] = self._import_config(v, subconfig_type) + + def __setattr__(self, k, v): + if k not in self.RESERVED_ATTR: + if getattr(self, '_locked', False): + raise ValueError('The Config has been locked. ' 'No change is allowed.') + self._set(k, v) + + def _override(self, override_dict, is_strict=True): + """Overrides same method in ParamsDict. + + Also called by ParamsDict methods. + + Args: + override_dict: dictionary to write to . + is_strict: If True, not allows to add new keys. + + Raises: + KeyError: overriding reserved keys or keys not exist (is_strict=True). + """ + for k, v in sorted(override_dict.items()): + if k in self.RESERVED_ATTR: + raise KeyError('The key {!r} is internally reserved. ' + 'Can not be overridden.'.format(k)) + if k not in self.__dict__: + if is_strict: + raise KeyError('The key {!r} does not exist in {!r}. ' + 'To extend the existing keys, use ' + '`override` with `is_strict` = False.'.format( + k, type(self))) + else: + self._set(k, v) + else: + if isinstance(v, dict) and self.__dict__[k]: + self.__dict__[k]._override(v, is_strict) # pylint: disable=protected-access + elif isinstance(v, params_dict.ParamsDict) and self.__dict__[k]: + self.__dict__[k]._override(v.as_dict(), is_strict) # pylint: disable=protected-access + else: + self._set(k, v) + + def as_dict(self): + """Returns a dict representation of params_dict.ParamsDict. + + For the nested params_dict.ParamsDict, a nested dict will be returned. + """ + return { + k: self._export_config(v) + for k, v in self.__dict__.items() + if k not in self.RESERVED_ATTR + } + + def replace(self, **kwargs): + """Overrides/returns a unlocked copy with the current config unchanged.""" + # pylint: disable=protected-access + params = copy.deepcopy(self) + params._locked = False + params._override(kwargs, is_strict=True) + # pylint: enable=protected-access + return params + + @classmethod + def from_yaml(cls, file_path: str): + # Note: This only works if the Config has all default values. 
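+    # Illustrative use (the subclass and file name are assumptions):
+    #   cfg = MyExperimentConfig.from_yaml('experiment.yaml')
+    #   cfg.override({'some_field': 42}, is_strict=True)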
+ with tf.io.gfile.GFile(file_path, 'r') as f: + loaded = yaml.load(f, Loader=yaml.FullLoader) + config = cls() + config.override(loaded) + return config + + @classmethod + def from_json(cls, file_path: str): + """Wrapper for `from_yaml`.""" + return cls.from_yaml(file_path) + + @classmethod + def from_args(cls, *args, **kwargs): + """Builds a config from the given list of arguments.""" + attributes = list(cls.__annotations__.keys()) + default_params = {a: p for a, p in zip(attributes, args)} + default_params.update(kwargs) + return cls(default_params) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/config_definitions.py b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/config_definitions.py new file mode 100644 index 000000000..3bd950084 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/config_definitions.py @@ -0,0 +1,57 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common configuration settings.""" +# pylint:disable=wildcard-import +import dataclasses + +from core.config_definitions import * +from modeling.hyperparams import base_config + + +# TODO(hongkuny): These configs are used in models that are going to deprecate. +# Once those models are removed, we should delete this file to avoid confusion. +# Users should not use this file anymore. +@dataclasses.dataclass +class TensorboardConfig(base_config.Config): + """Configuration for Tensorboard. + + Attributes: + track_lr: Whether or not to track the learning rate in Tensorboard. Defaults + to True. + write_model_weights: Whether or not to write the model weights as images in + Tensorboard. Defaults to False. + """ + track_lr: bool = True + write_model_weights: bool = False + + +@dataclasses.dataclass +class CallbacksConfig(base_config.Config): + """Configuration for Callbacks. + + Attributes: + enable_checkpoint_and_export: Whether or not to enable checkpoints as a + Callback. Defaults to True. + enable_backup_and_restore: Whether or not to add BackupAndRestore + callback. Defaults to True. + enable_tensorboard: Whether or not to enable Tensorboard as a Callback. + Defaults to True. + enable_time_history: Whether or not to enable TimeHistory Callbacks. + Defaults to True. + """ + enable_checkpoint_and_export: bool = True + enable_backup_and_restore: bool = False + enable_tensorboard: bool = True + enable_time_history: bool = True diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/oneof.py b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/oneof.py new file mode 100644 index 000000000..8879c02c1 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/oneof.py @@ -0,0 +1,57 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Config class that supports oneof functionality.""" + +from typing import Optional + +import dataclasses +from modeling.hyperparams import base_config + + +@dataclasses.dataclass +class OneOfConfig(base_config.Config): + """Configuration for configs with one of feature. + + Attributes: + type: 'str', name of the field to select. + """ + type: Optional[str] = None + + def as_dict(self): + """Returns a dict representation of OneOfConfig. + + For the nested base_config.Config, a nested dict will be returned. + """ + if self.type is None: + return {'type': None} + elif self.__dict__['type'] not in self.__dict__: + raise ValueError('type: {!r} is not a valid key!'.format( + self.__dict__['type'])) + else: + chosen_type = self.type + chosen_value = self.__dict__[chosen_type] + return {'type': self.type, chosen_type: self._export_config(chosen_value)} + + def get(self): + """Returns selected config based on the value of type. + + If type is not set (None), None is returned. + """ + chosen_type = self.type + if chosen_type is None: + return None + if chosen_type not in self.__dict__: + raise ValueError('type: {!r} is not a valid key!'.format(self.type)) + return self.__dict__[chosen_type] diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/params_dict.py b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/params_dict.py new file mode 100644 index 000000000..76b0446f0 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/hyperparams/params_dict.py @@ -0,0 +1,464 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A parameter dictionary class which supports the nest structure.""" + +import collections +import copy +import re + +import six +import tensorflow as tf +import yaml + +# regex pattern that matches on key-value pairs in a comma-separated +# key-value pair string. It splits each k-v pair on the = sign, and +# matches on values that are within single quotes, double quotes, single +# values (e.g. floats, ints, etc.), and a lists within brackets. +_PARAM_RE = re.compile( + r""" + (?P[a-zA-Z][\w\.]*) # variable name: "var" or "x" + \s*=\s* + ((?P\'(.*?)\' # single quote + | + \"(.*?)\" # double quote + | + [^,\[]* # single value + | + \[[^\]]*\])) # list of values + ($|,\s*)""", re.VERBOSE) + +_CONST_VALUE_RE = re.compile(r'(\d.*|-\d.*|None)') + +# Yaml loader with an implicit resolver to parse float decimal and exponential +# format. The regular experission parse the following cases: +# 1- Decimal number with an optional exponential term. 
+# 2- Integer number with an exponential term. +# 3- Decimal number with an optional exponential term. +# 4- Decimal number. + +LOADER = yaml.SafeLoader +LOADER.add_implicit_resolver( + 'tag:yaml.org,2002:float', + re.compile(r''' + ^(?:[-+]?(?:[0-9][0-9_]*)\\.[0-9_]*(?:[eE][-+]?[0-9]+)? + | + [-+]?(?:[0-9][0-9_]*)(?:[eE][-+]?[0-9]+) + | + \\.[0-9_]+(?:[eE][-+][0-9]+)? + | + [-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\\.[0-9_]*)$''', re.X), + list('-+0123456789.')) + + +class ParamsDict(object): + """A hyperparameter container class.""" + + RESERVED_ATTR = ['_locked', '_restrictions'] + + def __init__(self, default_params=None, restrictions=None): + """Instantiate a ParamsDict. + + Instantiate a ParamsDict given a set of default parameters and a list of + restrictions. Upon initialization, it validates itself by checking all the + defined restrictions, and raise error if it finds inconsistency. + + Args: + default_params: a Python dict or another ParamsDict object including the + default parameters to initialize. + restrictions: a list of strings, which define a list of restrictions to + ensure the consistency of different parameters internally. Each + restriction string is defined as a binary relation with a set of + operators, including {'==', '!=', '<', '<=', '>', '>='}. + """ + self._locked = False + self._restrictions = [] + if restrictions: + self._restrictions = restrictions + if default_params is None: + default_params = {} + self.override(default_params, is_strict=False) + + def _set(self, k, v): + if isinstance(v, dict): + self.__dict__[k] = ParamsDict(v) + else: + self.__dict__[k] = copy.deepcopy(v) + + def __setattr__(self, k, v): + """Sets the value of the existing key. + + Note that this does not allow directly defining a new key. Use the + `override` method with `is_strict=False` instead. + + Args: + k: the key string. + v: the value to be used to set the key `k`. + + Raises: + KeyError: if k is not defined in the ParamsDict. + """ + if k not in ParamsDict.RESERVED_ATTR: + if k not in self.__dict__.keys(): + raise KeyError('The key `%{}` does not exist. ' + 'To extend the existing keys, use ' + '`override` with `is_strict` = True.'.format(k)) + if self._locked: + raise ValueError('The ParamsDict has been locked. ' + 'No change is allowed.') + self._set(k, v) + + def __getattr__(self, k): + """Gets the value of the existing key. + + Args: + k: the key string. + + Returns: + the value of the key. + + Raises: + AttributeError: if k is not defined in the ParamsDict. + """ + if k not in self.__dict__.keys(): + raise AttributeError('The key `{}` does not exist. '.format(k)) + return self.__dict__[k] + + def __contains__(self, key): + """Implements the membership test operator.""" + return key in self.__dict__ + + def get(self, key, value=None): + """Accesses through built-in dictionary get method.""" + return self.__dict__.get(key, value) + + def __delattr__(self, k): + """Deletes the key and removes its values. + + Args: + k: the key string. + + Raises: + AttributeError: if k is reserverd or not defined in the ParamsDict. + ValueError: if the ParamsDict instance has been locked. + """ + if k in ParamsDict.RESERVED_ATTR: + raise AttributeError( + 'The key `{}` is reserved. No change is allowes. '.format(k)) + if k not in self.__dict__.keys(): + raise AttributeError('The key `{}` does not exist. '.format(k)) + if self._locked: + raise ValueError('The ParamsDict has been locked. 
No change is allowed.') + del self.__dict__[k] + + def override(self, override_params, is_strict=True): + """Override the ParamsDict with a set of given params. + + Args: + override_params: a dict or a ParamsDict specifying the parameters to be + overridden. + is_strict: a boolean specifying whether override is strict or not. If + True, keys in `override_params` must be present in the ParamsDict. If + False, keys in `override_params` can be different from what is currently + defined in the ParamsDict. In this case, the ParamsDict will be extended + to include the new keys. + """ + if self._locked: + raise ValueError('The ParamsDict has been locked. No change is allowed.') + if isinstance(override_params, ParamsDict): + override_params = override_params.as_dict() + self._override(override_params, is_strict) # pylint: disable=protected-access + + def _override(self, override_dict, is_strict=True): + """The implementation of `override`.""" + for k, v in six.iteritems(override_dict): + if k in ParamsDict.RESERVED_ATTR: + raise KeyError('The key `%{}` is internally reserved. ' + 'Can not be overridden.') + if k not in self.__dict__.keys(): + if is_strict: + raise KeyError('The key `{}` does not exist. ' + 'To extend the existing keys, use ' + '`override` with `is_strict` = False.'.format(k)) + else: + self._set(k, v) + else: + if isinstance(v, dict): + self.__dict__[k]._override(v, is_strict) # pylint: disable=protected-access + elif isinstance(v, ParamsDict): + self.__dict__[k]._override(v.as_dict(), is_strict) # pylint: disable=protected-access + else: + self.__dict__[k] = copy.deepcopy(v) + + def lock(self): + """Makes the ParamsDict immutable.""" + self._locked = True + + def as_dict(self): + """Returns a dict representation of ParamsDict. + + For the nested ParamsDict, a nested dict will be returned. + """ + params_dict = {} + for k, v in six.iteritems(self.__dict__): + if k not in ParamsDict.RESERVED_ATTR: + if isinstance(v, ParamsDict): + params_dict[k] = v.as_dict() + else: + params_dict[k] = copy.deepcopy(v) + return params_dict + + def validate(self): + """Validate the parameters consistency based on the restrictions. + + This method validates the internal consistency using the pre-defined list of + restrictions. A restriction is defined as a string which specfiies a binary + operation. The supported binary operations are {'==', '!=', '<', '<=', '>', + '>='}. Note that the meaning of these operators are consistent with the + underlying Python immplementation. Users should make sure the define + restrictions on their type make sense. + + For example, for a ParamsDict like the following + ``` + a: + a1: 1 + a2: 2 + b: + bb: + bb1: 10 + bb2: 20 + ccc: + a1: 1 + a3: 3 + ``` + one can define two restrictions like this + ['a.a1 == b.ccc.a1', 'a.a2 <= b.bb.bb2'] + + What it enforces are: + - a.a1 = 1 == b.ccc.a1 = 1 + - a.a2 = 2 <= b.bb.bb2 = 20 + + Raises: + KeyError: if any of the following happens + (1) any of parameters in any of restrictions is not defined in + ParamsDict, + (2) any inconsistency violating the restriction is found. + ValueError: if the restriction defined in the string is not supported. 
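As a quick illustration of the restriction mechanism described above, the sketch below builds a small ParamsDict and exercises `override`, `validate`, and `lock`. The keys and the restriction string are made up for the example, and the import path assumes the package layout introduced by this patch.

```python
# Illustrative only: hypothetical keys; import path as laid out in this patch.
from modeling.hyperparams.params_dict import ParamsDict

params = ParamsDict(
    default_params={
        'train': {'batch_size': 64, 'steps': 1000},
        'eval': {'batch_size': 64},
    },
    restrictions=['train.batch_size == eval.batch_size'])

params.validate()                                         # passes: 64 == 64
params.override({'train': {'steps': 2000}})               # existing key, strict by default
params.override({'optimizer': 'adam'}, is_strict=False)   # extends with a new key
params.lock()                                             # further changes now raise ValueError
print(params.as_dict())
```

A restriction refers to values by their dotted path into the nested dict, which is exactly how `_get_kv` below resolves them.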
+ """ + + def _get_kv(dotted_string, params_dict): + """Get keys and values indicated by dotted_string.""" + if _CONST_VALUE_RE.match(dotted_string) is not None: + const_str = dotted_string + if const_str == 'None': + constant = None + else: + constant = float(const_str) + return None, constant + else: + tokenized_params = dotted_string.split('.') + v = params_dict + for t in tokenized_params: + v = v[t] + return tokenized_params[-1], v + + def _get_kvs(tokens, params_dict): + if len(tokens) != 2: + raise ValueError('Only support binary relation in restriction.') + stripped_tokens = [t.strip() for t in tokens] + left_k, left_v = _get_kv(stripped_tokens[0], params_dict) + right_k, right_v = _get_kv(stripped_tokens[1], params_dict) + return left_k, left_v, right_k, right_v + + params_dict = self.as_dict() + for restriction in self._restrictions: + if '==' in restriction: + tokens = restriction.split('==') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v != right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '!=' in restriction: + tokens = restriction.split('!=') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v == right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '<' in restriction: + tokens = restriction.split('<') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v >= right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '<=' in restriction: + tokens = restriction.split('<=') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v > right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '>' in restriction: + tokens = restriction.split('>') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v <= right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + elif '>=' in restriction: + tokens = restriction.split('>=') + _, left_v, _, right_v = _get_kvs(tokens, params_dict) + if left_v < right_v: + raise KeyError( + 'Found inconsistncy between key `{}` and key `{}`.'.format( + tokens[0], tokens[1])) + else: + raise ValueError('Unsupported relation in restriction.') + + +def read_yaml_to_params_dict(file_path: str): + """Reads a YAML file to a ParamsDict.""" + with tf.io.gfile.GFile(file_path, 'r') as f: + params_dict = yaml.load(f, Loader=LOADER) + return ParamsDict(params_dict) + + +def save_params_dict_to_yaml(params, file_path): + """Saves the input ParamsDict to a YAML file.""" + with tf.io.gfile.GFile(file_path, 'w') as f: + + def _my_list_rep(dumper, data): + # u'tag:yaml.org,2002:seq' is the YAML internal tag for sequence. + return dumper.represent_sequence( + u'tag:yaml.org,2002:seq', data, flow_style=True) + + yaml.add_representer(list, _my_list_rep) + yaml.dump(params.as_dict(), f, default_flow_style=False) + + +def nested_csv_str_to_json_str(csv_str): + """Converts a nested (using '.') comma-separated k=v string to a JSON string. + + Converts a comma-separated string of key/value pairs that supports + nesting of keys to a JSON string. Nesting is implemented using + '.' between levels for a given key. + + Spacing between commas and = is supported (e.g. 
there is no difference between + "a=1,b=2", "a = 1, b = 2", or "a=1, b=2") but there should be no spaces before + keys or after values (e.g. " a=1,b=2" and "a=1,b=2 " are not supported). + + Note that this will only support values supported by CSV, meaning + values such as nested lists (e.g. "a=[[1,2,3],[4,5,6]]") are not + supported. Strings are supported as well, e.g. "a='hello'". + + An example conversion would be: + + "a=1, b=2, c.a=2, c.b=3, d.a.a=5" + + to + + "{ a: 1, b : 2, c: {a : 2, b : 3}, d: {a: {a : 5}}}" + + Args: + csv_str: the comma separated string. + + Returns: + the converted JSON string. + + Raises: + ValueError: If csv_str is not in a comma separated string or + if the string is formatted incorrectly. + """ + if not csv_str: + return '' + + formatted_entries = [] + nested_map = collections.defaultdict(list) + pos = 0 + while pos < len(csv_str): + m = _PARAM_RE.match(csv_str, pos) + if not m: + raise ValueError('Malformed hyperparameter value while parsing ' + 'CSV string: %s' % csv_str[pos:]) + pos = m.end() + # Parse the values. + m_dict = m.groupdict() + name = m_dict['name'] + v = m_dict['val'] + + # If a GCS path (e.g. gs://...) is provided, wrap this in quotes + # as yaml.load would otherwise throw an exception + if re.match(r'(?=[^\"\'])(?=[gs://])', v): + v = '\'{}\''.format(v) + + name_nested = name.split('.') + if len(name_nested) > 1: + grouping = name_nested[0] + value = '.'.join(name_nested[1:]) + '=' + v + nested_map[grouping].append(value) + else: + formatted_entries.append('%s : %s' % (name, v)) + + for grouping, value in nested_map.items(): + value = ','.join(value) + value = nested_csv_str_to_json_str(value) + formatted_entries.append('%s : %s' % (grouping, value)) + return '{' + ', '.join(formatted_entries) + '}' + + +def override_params_dict(params, dict_or_string_or_yaml_file, is_strict): + """Override a given ParamsDict using a dict, JSON/YAML/CSV string or YAML file. + + The logic of the function is outlined below: + 1. Test that the input is a dict. If not, proceed to 2. + 2. Tests that the input is a string. If not, raise unknown ValueError + 2.1. Test if the string is in a CSV format. If so, parse. + If not, proceed to 2.2. + 2.2. Try loading the string as a YAML/JSON. If successful, parse to + dict and use it to override. If not, proceed to 2.3. + 2.3. Try using the string as a file path and load the YAML file. + + Args: + params: a ParamsDict object to be overridden. + dict_or_string_or_yaml_file: a Python dict, JSON/YAML/CSV string or path to + a YAML file specifying the parameters to be overridden. + is_strict: a boolean specifying whether override is strict or not. + + Returns: + params: the overridden ParamsDict object. + + Raises: + ValueError: if failed to override the parameters. 
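To make the override order described above concrete, here is a small sketch that feeds a plain dict, a nested CSV string, and a YAML/JSON string through `override_params_dict`. The parameter names are hypothetical; the module path follows this patch's layout.

```python
# Hypothetical parameters; module path assumed from this patch's layout.
from modeling.hyperparams import params_dict

params = params_dict.ParamsDict(
    {'model': {'hidden_size': 768}, 'train_steps': 1000})

# 1) A plain Python dict.
params = params_dict.override_params_dict(
    params, {'train_steps': 2000}, is_strict=True)

# 2) A nested CSV string: parsed by nested_csv_str_to_json_str, then YAML-loaded.
params = params_dict.override_params_dict(
    params, 'model.hidden_size=512, train_steps=3000', is_strict=True)

# 3) A YAML/JSON string: the CSV parse fails, so it is YAML-loaded directly.
params = params_dict.override_params_dict(
    params, '{model: {hidden_size: 256}}', is_strict=True)

assert params.model.hidden_size == 256
assert params.train_steps == 3000
```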
+ """ + if not dict_or_string_or_yaml_file: + return params + if isinstance(dict_or_string_or_yaml_file, dict): + params.override(dict_or_string_or_yaml_file, is_strict) + elif isinstance(dict_or_string_or_yaml_file, six.string_types): + try: + dict_or_string_or_yaml_file = ( + nested_csv_str_to_json_str(dict_or_string_or_yaml_file)) + except ValueError: + pass + params_dict = yaml.load(dict_or_string_or_yaml_file, Loader=LOADER) + if isinstance(params_dict, dict): + params.override(params_dict, is_strict) + else: + with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f: + params.override(yaml.load(f, Loader=yaml.FullLoader), is_strict) + else: + raise ValueError('Unknown input type to parse.') + return params diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/__init__.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_model.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_model.py new file mode 100644 index 000000000..976b0d8e3 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_model.py @@ -0,0 +1,60 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Abstraction of multi-task model.""" +from typing import Text, Dict + +import tensorflow as tf + + +class MultiTaskBaseModel(tf.Module): + """Base class that holds multi-task model computation.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self._sub_tasks = self._instantiate_sub_tasks() + + def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]: + """Abstract function that sets up the computation for each sub-task. + + Returns: + A map from task name (as string) to a tf.keras.Model object that + represents the sub-task in the multi-task pool. + """ + raise NotImplementedError( + "_instantiate_sub_task_models() is not implemented.") + + @property + def sub_tasks(self): + """Fetch a map of task name (string) to task model (tf.keras.Model).""" + return self._sub_tasks + + def initialize(self): + """Optional function that loads a pre-train checkpoint.""" + return diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_trainer.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_trainer.py new file mode 100644 index 000000000..7f975be84 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/base_trainer.py @@ -0,0 +1,176 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Multitask base trainer implementation. + +The trainer derives from the Orbit `StandardTrainer` class. 
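For orientation, a rough sketch of how this trainer is typically driven: build it from an already constructed `MultiTask` and multi-task model, then hand it to an Orbit controller. The `multi_task` and `multi_task_model` arguments are assumed to be built elsewhere (see `multitask.py` and `base_model.py` in this patch); `train_lib.py` below does the production wiring.

```python
# Sketch only: multi_task and multi_task_model are assumed to be built elsewhere
# (see multitask.MultiTask and base_model.MultiTaskBaseModel in this patch).
import orbit
import tensorflow as tf
from modeling.multitask import base_trainer

def build_controller(multi_task, multi_task_model, steps_per_loop=100):
  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
  trainer = base_trainer.MultiTaskBaseTrainer(
      multi_task=multi_task,
      multi_task_model=multi_task_model,
      optimizer=optimizer)
  return orbit.Controller(
      trainer=trainer,
      global_step=trainer.global_step,
      steps_per_loop=steps_per_loop)

# controller = build_controller(my_multi_task, my_model)
# controller.train(steps=1000)
```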
+""" +from typing import Union +import gin +import orbit +import tensorflow as tf + +from modeling.multitask import base_model +from modeling.multitask import multitask + + +@gin.configurable +class MultiTaskBaseTrainer(orbit.StandardTrainer): + """Multitask base trainer.""" + + def __init__(self, + multi_task: multitask.MultiTask, + multi_task_model: Union[tf.keras.Model, + base_model.MultiTaskBaseModel], + optimizer: tf.optimizers.Optimizer, + trainer_options=None): + self._strategy = tf.distribute.get_strategy() + self._multi_task = multi_task + self._multi_task_model = multi_task_model + self._optimizer = optimizer + + self._training_losses = None + self._training_metrics = None + self._global_step = orbit.utils.create_global_step() + + if hasattr(self.multi_task_model, "checkpoint_items"): + checkpoint_items = self.multi_task_model.checkpoint_items + else: + checkpoint_items = {} + + self._checkpoint = tf.train.Checkpoint( + model=self.multi_task_model, + optimizer=self.optimizer, + global_step=self.global_step, + **checkpoint_items) + + train_datasets = {} + for name, task in self.multi_task.tasks.items(): + train_datasets[name] = orbit.utils.make_distributed_dataset( + self.strategy, task.build_inputs, task.task_config.train_data) + + super().__init__( + train_dataset=train_datasets, + options=trainer_options or orbit.StandardTrainerOptions()) + + def train_loop_begin(self): + """Clean up states that hold losses and metrics.""" + for _, train_loss_metric in self.training_losses.items(): + train_loss_metric.reset_states() + + for _, metrics in self.training_metrics.items(): + for metric in metrics: + metric.reset_states() + + def train_loop_end(self): + """Record loss and metric values per task.""" + result = {} + for task_name, loss in self.training_losses.items(): + result[task_name] = {loss.name: loss.result()} + for task_name, task_metrics in self.training_metrics.items(): + result[task_name].update( + {metric.name: metric.result() for metric in task_metrics}) + # Note that, the learning rate schedule is managed by the keras optimizer + # internally, which respects the number of backward pass as `iterations`. + # The learning rate schedule does not follow the trainer logical global + # step of multiple tasks. + if callable(self.optimizer.learning_rate): + result["learning_rate"] = self.optimizer.learning_rate( + self.optimizer.iterations) + else: + result["learning_rate"] = self.optimizer.learning_rate + return result + + @property + def checkpoint(self): + """Accesses the training checkpoint.""" + return self._checkpoint + + @property + def training_losses(self): + """Access training loss metric objects for all tasks.""" + if self._training_losses is None: + # Builds the per-task metrics and losses. + # This the total summed training loss of tasks in the joint training. + self._training_losses = dict( + total_loss=tf.keras.metrics.Mean("training_loss", dtype=tf.float32)) + for name in self.multi_task.tasks: + self._training_losses[name] = tf.keras.metrics.Mean( + "training_loss", dtype=tf.float32) + return self._training_losses + + @property + def training_metrics(self): + """Access training metric metric objects for all tasks.""" + if self._training_metrics is None: + # Builds the per-task metrics and losses. 
+ self._training_metrics = {} + for name, task in self.multi_task.tasks.items(): + self._training_metrics[name] = task.build_metrics(training=True) + return self._training_metrics + + @property + def strategy(self): + return self._strategy + + @property + def multi_task(self): + return self._multi_task + + @property + def multi_task_model(self): + return self._multi_task_model + + @property + def optimizer(self): + return self._optimizer + + @property + def global_step(self): + return self._global_step + + def train_step(self, iterator_map): + """The default train step calling the multi-task train step. + + Args: + iterator_map: a dictionary of task names and per-task dataset iterators. + """ + + def step_fn(inputs): + losses = self.multi_task.joint_train_step( + inputs, + multi_task_model=self.multi_task_model, + optimizer=self.optimizer, + task_metrics=self.training_metrics) + for key, loss in losses.items(): + self.training_losses[key].update_state(loss) + + self.strategy.run( + step_fn, args=(tf.nest.map_structure(next, iterator_map),)) + self.global_step.assign_add(1) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/configs.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/configs.py new file mode 100644 index 000000000..70e98682a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/configs.py @@ -0,0 +1,79 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration definitions for multi-task training.""" +from typing import Optional, Tuple + +import dataclasses + +from core import config_definitions as cfg +from modeling import hyperparams + + +@dataclasses.dataclass +class TaskRoutine(hyperparams.Config): + task_name: str = "" + task_config: cfg.TaskConfig = None + eval_steps: Optional[int] = None + task_weight: Optional[float] = 1.0 + + +@dataclasses.dataclass +class MultiTaskConfig(hyperparams.Config): + init_checkpoint: str = "" + model: hyperparams.Config = None + task_routines: Tuple[TaskRoutine, ...] 
= () + + +@dataclasses.dataclass +class ProportionalSampleConfig(hyperparams.Config): + alpha: float = 1.0 + + +@dataclasses.dataclass +class AnnealingSampleConfig(hyperparams.Config): + steps_per_epoch: int = 5 + total_steps: int = 20 + + +@dataclasses.dataclass +class TaskSamplingConfig(hyperparams.OneOfConfig): + type: str = "" + uniform: hyperparams.Config = hyperparams.Config() + proportional: ProportionalSampleConfig = ProportionalSampleConfig() + annealing: AnnealingSampleConfig = AnnealingSampleConfig() + + +@dataclasses.dataclass +class MultiTaskTrainerConfig(cfg.TrainerConfig): + trainer_type: str = "interleaving" + task_sampler: TaskSamplingConfig = TaskSamplingConfig(type="proportional") + + +@dataclasses.dataclass +class MultiTaskExperimentConfig(hyperparams.Config): + """An experiment config for multi-task training and multi-task evaluation.""" + task: MultiTaskConfig = MultiTaskConfig() + trainer: MultiTaskTrainerConfig = MultiTaskTrainerConfig() + runtime: cfg.RuntimeConfig = cfg.RuntimeConfig() + + +@dataclasses.dataclass +class MultiEvalExperimentConfig(cfg.ExperimentConfig): + """An experiment config for single-task training and multi-task evaluation. + + Attributes: + eval_tasks: individual evaluation tasks. + """ + eval_tasks: MultiTaskConfig = MultiTaskConfig() diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/evaluator.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/evaluator.py new file mode 100644 index 000000000..cb28f784b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/evaluator.py @@ -0,0 +1,172 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multitask Evaluator implementation. + +The evaluator implements the Orbit `AbstractEvaluator` interface. +""" +from typing import Optional, Union +import gin +import orbit +import tensorflow as tf + +from core import train_utils +from modeling.multitask import base_model +from modeling.multitask import multitask + + +@gin.configurable +class MultiTaskEvaluator(orbit.AbstractEvaluator): + """Implements the common trainer shared for TensorFlow models.""" + + def __init__( + self, + task: multitask.MultiTask, + model: Union[tf.keras.Model, base_model.MultiTaskBaseModel], + global_step: Optional[tf.Variable] = None, + checkpoint_exporter: Optional[train_utils.BestCheckpointExporter] = None): + """Initialize common trainer for TensorFlow models. + + Args: + task: A multitask.MultiTask instance. + model: tf.keras.Model instance. + global_step: the global step variable. + checkpoint_exporter: an object that has the `maybe_export_checkpoint` + interface. + """ + # Gets the current distribution strategy. If not inside any strategy scope, + # it gets a single-replica no-op strategy. 
+ self._strategy = tf.distribute.get_strategy() + self._task = task + self._model = model + self._global_step = global_step or orbit.utils.create_global_step() + self._checkpoint_exporter = checkpoint_exporter + self._checkpoint = tf.train.Checkpoint( + global_step=self.global_step, + model=self.model) + + self._validation_losses = None + self._validation_metrics = None + + # Builds per-task datasets. + self.eval_datasets = {} + for name, task in self.task.tasks.items(): + self.eval_datasets[name] = orbit.utils.make_distributed_dataset( + self.strategy, task.build_inputs, task.task_config.validation_data) + + # Builds per-task validation loops. + def get_function(task_name, task): + + task_metrics = self.validation_metrics[task_name] + task_loss = self.validation_losses[task_name] + if isinstance(self.model, base_model.MultiTaskBaseModel): + model = self.model.sub_tasks[task_name] + else: + model = self.model + + def step_fn(inputs): + logs = task.validation_step(inputs, model=model, metrics=task_metrics) + task_loss.update_state(logs[task.loss]) + return logs + + @tf.function + def eval_step_fn(iterator): + distributed_outputs = self.strategy.run(step_fn, args=(next(iterator),)) + return tf.nest.map_structure(self.strategy.experimental_local_results, + distributed_outputs) + + return orbit.utils.create_loop_fn(eval_step_fn) + + self.task_fns = { + name: get_function(name, task) + for name, task in self.task.tasks.items() + } + + @property + def strategy(self): + return self._strategy + + @property + def task(self): + return self._task + + @property + def model(self): + return self._model + + @property + def global_step(self): + return self._global_step + + @property + def validation_losses(self): + """Accesses the validation loss metric object.""" + if self._validation_losses is None: + # Builds the per-task metrics and losses. + self._validation_losses = {} + for name in self.task.tasks: + self._validation_losses[name] = tf.keras.metrics.Mean( + "validation_loss", dtype=tf.float32) + return self._validation_losses + + @property + def validation_metrics(self): + """Accesses all validation metric metric objects.""" + if self._validation_metrics is None: + # Builds the per-task metrics and losses. 
+ self._validation_metrics = {} + for name, task in self.task.tasks.items(): + self._validation_metrics[name] = task.build_metrics(training=False) + return self._validation_metrics + + @property + def checkpoint(self): + """Accesses the training checkpoint.""" + return self._checkpoint + + def evaluate(self, num_steps: tf.Tensor): + """Performs evaluation for each `EvalTask`.""" + for metric in self.validation_losses.values(): + metric.reset_states() + for metrics in self.validation_metrics.values(): + for metric in metrics: + metric.reset_states() + results = {} + eval_iters = tf.nest.map_structure(iter, self.eval_datasets) + + for name, task_eval_loop in self.task_fns.items(): + outputs = None + eval_iter = eval_iters[name] + task = self.task.tasks[name] + task_eval_steps = self.task.task_eval_steps(name) or num_steps + outputs = task_eval_loop( + eval_iter, + task_eval_steps, + state=outputs, + reduce_fn=task.aggregate_logs) + task_metrics = self.validation_metrics[name] + task_loss = self.validation_losses[name] + logs = {} + for metric in task_metrics + [task_loss]: + logs[metric.name] = metric.result() + if outputs: + metrics = task.reduce_aggregated_logs( + outputs, global_step=self.global_step) + logs.update(metrics) + results[name] = logs + + if self._checkpoint_exporter: + self._checkpoint_exporter.maybe_export_checkpoint( + self.checkpoint, results, self.global_step.numpy()) + return results diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/interleaving_trainer.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/interleaving_trainer.py new file mode 100644 index 000000000..935351af5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/interleaving_trainer.py @@ -0,0 +1,92 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multitask trainer that interleaves each task's train step.""" +from typing import Union +import gin +import orbit +import tensorflow as tf +from modeling.multitask import base_model +from modeling.multitask import base_trainer +from modeling.multitask import multitask +from modeling.multitask import task_sampler as sampler + + +@gin.configurable +class MultiTaskInterleavingTrainer(base_trainer.MultiTaskBaseTrainer): + """MultiTask trainer that interleaves task update.""" + + def __init__(self, + multi_task: multitask.MultiTask, + multi_task_model: Union[tf.keras.Model, + base_model.MultiTaskBaseModel], + optimizer: tf.optimizers.Optimizer, + task_sampler: sampler.TaskSampler, + trainer_options=None): + super(MultiTaskInterleavingTrainer, self).__init__( + multi_task=multi_task, + multi_task_model=multi_task_model, + optimizer=optimizer, + trainer_options=trainer_options) + self._task_sampler = task_sampler + + # Build per task train step. 
+ def _get_task_step(task_name, task): + + def step_fn(inputs): + if isinstance(self.multi_task_model, base_model.MultiTaskBaseModel): + task_model = self.multi_task_model.sub_tasks[task_name] + else: + task_model = self.multi_task_model + task_logs = task.train_step( + inputs, + model=task_model, + optimizer=self.optimizer, + metrics=self.training_metrics[task_name]) + self.training_losses[task_name].update_state(task_logs[task.loss]) + + return step_fn + + self._task_train_step_map = { + name: _get_task_step(name, task) + for name, task in self.multi_task.tasks.items() + } + + # TODO(haozhangthu): Add taskwise step counter to train_loop_end for logging + # on TensorBoard. + self._task_step_counters = { + name: orbit.utils.create_global_step() for name in self.multi_task.tasks + } + + def task_step_counter(self, name): + return self._task_step_counters[name] + + def train_step(self, iterator_map): + # Sample one task to train according to a multinomial distribution + rn = tf.random.stateless_uniform(shape=[], seed=(0, self.global_step)) + cumulative_sample_distribution = self._task_sampler.task_cumulative_distribution( + self.global_step) + # Prepend a [0.0] for indexing convenience. + cumulative_sample_distribution = tf.concat( + [tf.constant([0.0], dtype=tf.float32), cumulative_sample_distribution], + axis=0) + + for idx, (name, _) in enumerate(self.multi_task.tasks.items()): + begin = cumulative_sample_distribution[idx] + end = cumulative_sample_distribution[idx + 1] + if rn >= begin and rn < end: + self._strategy.run( + self._task_train_step_map[name], args=(next(iterator_map[name]),)) + self.global_step.assign_add(1) + self.task_step_counter(name).assign_add(1) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/multitask.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/multitask.py new file mode 100644 index 000000000..d32b897e9 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/multitask.py @@ -0,0 +1,148 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experimental MultiTask base class for multi-task training/evaluation.""" +import abc +from typing import Dict, List, Optional, Text, Union + +import tensorflow as tf +from core import base_task +from core import config_definitions +from core import task_factory +from modeling import optimization +from modeling.multitask import base_model +from modeling.multitask import configs + +OptimizationConfig = optimization.OptimizationConfig +RuntimeConfig = config_definitions.RuntimeConfig + + +class MultiTask(tf.Module, metaclass=abc.ABCMeta): + """A multi-task class to manage multiple tasks.""" + + def __init__(self, + tasks: Union[Dict[Text, base_task.Task], List[base_task.Task]], + task_weights: Optional[Dict[str, Union[float, int]]] = None, + task_eval_steps: Optional[Dict[str, int]] = None, + name: Optional[str] = None): + """MultiTask initialization. 
+ + Args: + tasks: a list or a flat dict of Task. + task_weights: a dict of (task, task weight), task weight can be applied + directly during loss summation in a joint backward step, or it can be + used to sample task among interleaved backward step. + task_eval_steps: a dict of (task, eval steps). + name: the instance name of a MultiTask object. + """ + super().__init__(name=name) + if isinstance(tasks, list): + self._tasks = {} + for task in tasks: + if task.name in self._tasks: + raise ValueError("Duplicated tasks found, task.name is %s" % + task.name) + self._tasks[task.name] = task + elif isinstance(tasks, dict): + self._tasks = tasks + else: + raise ValueError("The tasks argument has an invalid type: %s" % + type(tasks)) + self._task_eval_steps = task_eval_steps or {} + self._task_eval_steps = dict([ + (name, self._task_eval_steps.get(name, None)) for name in self.tasks + ]) + self._task_weights = task_weights or {} + self._task_weights = dict([ + (name, self._task_weights.get(name, 1.0)) for name in self.tasks + ]) + + @classmethod + def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None): + tasks = {} + task_eval_steps = {} + task_weights = {} + for task_routine in config.task_routines: + task_name = task_routine.task_name + tasks[task_name] = task_factory.get_task( + task_routine.task_config, logging_dir=logging_dir) + task_eval_steps[task_name] = task_routine.eval_steps + task_weights[task_name] = task_routine.task_weight + return cls( + tasks, task_eval_steps=task_eval_steps, task_weights=task_weights) + + @property + def tasks(self): + return self._tasks + + def task_eval_steps(self, task_name): + return self._task_eval_steps[task_name] + + def task_weight(self, task_name): + return self._task_weights[task_name] + + @property + def task_weights(self): + return self._task_weights + + @classmethod + def create_optimizer(cls, + optimizer_config: OptimizationConfig, + runtime_config: Optional[RuntimeConfig] = None): + return base_task.Task.create_optimizer( + optimizer_config=optimizer_config, runtime_config=runtime_config) + + def joint_train_step(self, task_inputs, + multi_task_model: base_model.MultiTaskBaseModel, + optimizer: tf.keras.optimizers.Optimizer, task_metrics): + """The joint train step. + + Args: + task_inputs: a dictionary of task names and per-task features. + multi_task_model: a MultiTaskBaseModel instance. + optimizer: a tf.optimizers.Optimizer. + task_metrics: a dictionary of task names and per-task metrics. + + Returns: + A dictionary of losses, inculding per-task losses and their weighted sum. + """ + losses = {} + with tf.GradientTape() as tape: + total_loss = 0.0 + for name, model in multi_task_model.sub_tasks.items(): + inputs = task_inputs[name] + if isinstance(inputs, tuple) and len(inputs) == 2: + features, labels = inputs + elif isinstance(inputs, dict): + features, labels = inputs, inputs + else: + raise ValueError("The iterator output is neither a tuple nor a " + "dictionary. It is not implemented to support " + "such outputs.") + outputs = model(features, training=True) + task_loss = self.tasks[name].build_losses(labels, outputs) + task_weight = self.task_weight(name) + total_loss += task_weight * task_loss + losses[name] = task_loss + self.tasks[name].process_metrics(task_metrics[name], labels, outputs) + + # Scales loss as the default gradients allreduce performs sum inside + # the optimizer. 
+ scaled_loss = total_loss / tf.distribute.get_strategy( + ).num_replicas_in_sync + tvars = multi_task_model.trainable_variables + grads = tape.gradient(scaled_loss, tvars) + optimizer.apply_gradients(list(zip(grads, tvars))) + losses["total_loss"] = total_loss + return losses diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/task_sampler.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/task_sampler.py new file mode 100644 index 000000000..78ba84341 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/task_sampler.py @@ -0,0 +1,121 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utils to sample tasks for interleaved optimization.""" +import abc +from typing import Union, Dict, Text +import tensorflow as tf + +from modeling.multitask import configs + + +class TaskSampler(tf.Module, metaclass=abc.ABCMeta): + """An abstract class defining task sampling API for interleaving trainer.""" + + def __init__(self, task_weights: Dict[Text, Union[float, int]]): + self._task_weights = task_weights + + @abc.abstractmethod + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + """Compute cumulative distribution to sample tasks. + + It calculates the cumulative distribution of the multinomial task + distribution with respect to which to be sampled against. + + Args: + global_step: A tensor indicating current progess of training. + + Returns: + A float tensor with shape (#(task), 1) that represents the cumulative + sampling distribution. 
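For intuition, the standalone sketch below reproduces how such a cumulative distribution is consumed: draw a uniform number in [0, 1) and pick the task whose interval contains it, mirroring the logic in `MultiTaskInterleavingTrainer.train_step` above. The task names and weights are made up.

```python
# Standalone illustration with made-up weights; not part of the library code.
import tensorflow as tf

task_weights = {'sentiment': 1.0, 'nli': 3.0}
sizes = tf.constant(list(task_weights.values()), dtype=tf.float32)
cumulative = tf.math.cumsum(sizes / tf.reduce_sum(sizes))        # [0.25, 1.0]

rn = tf.random.uniform(shape=[])                                 # uniform in [0, 1)
boundaries = tf.concat([tf.constant([0.0]), cumulative], axis=0)
for idx, name in enumerate(task_weights):
  if boundaries[idx] <= rn and rn < boundaries[idx + 1]:
    print('sampled task:', name)   # 'nli' is chosen roughly 75% of the time
```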
+ """ + pass + + +class UniformTaskSampler(TaskSampler): + """Sample all tasks uniformly.""" + + def __init__(self, task_weights: Dict[Text, Union[float, int]]): + super(UniformTaskSampler, self).__init__(task_weights=task_weights) + self._uniform_cumulative = tf.math.cumsum( + tf.constant( + [1.0 / len(self._task_weights)] * len(self._task_weights), + dtype=tf.float32)) + + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + del global_step + return self._uniform_cumulative + + +class ProportionalTaskSampler(TaskSampler): + """Sample tasks proportional to task weights.""" + + def __init__(self, + task_weights: Dict[Text, Union[float, int]], + alpha: float = 1.0): + super(ProportionalTaskSampler, self).__init__(task_weights=task_weights) + self._alpha = tf.cast(alpha, dtype=tf.float32) + task_weight_dict_ordered_list = tf.constant( + [weight for _, weight in self._task_weights.items()], dtype=tf.float32) + task_sizes = tf.math.pow(task_weight_dict_ordered_list, self._alpha) + task_distribution = task_sizes / tf.reduce_sum(task_sizes) + self._porportional_cumulative = tf.math.cumsum(task_distribution) + + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + del global_step + return self._porportional_cumulative + + +class AnnealingTaskSampler(TaskSampler): + """Sample tasks according to task weights as well as training progress.""" + + def __init__(self, + task_weights: Dict[Text, Union[float, int]], + steps_per_epoch: int, + total_steps: int): + super(AnnealingTaskSampler, self).__init__(task_weights=task_weights) + self._steps_per_epoch = tf.cast(steps_per_epoch, dtype=tf.float32) + self._total_epochs = tf.cast( + total_steps / self._steps_per_epoch, dtype=tf.float32) + + def task_cumulative_distribution(self, global_step: tf.Tensor) -> tf.Tensor: + cur_epoch = tf.math.floor( + tf.cast(global_step, dtype=tf.float32) / self._steps_per_epoch) + alpha = 1.0 - 0.8 * (cur_epoch - 1) / (self._total_epochs - 1 + 1e-10) + task_weight_dict_ordered_list = [ + weight for _, weight in self._task_weights.items() + ] + task_sizes = tf.math.pow( + tf.constant(task_weight_dict_ordered_list, dtype=tf.float32), + tf.cast(alpha, dtype=tf.float32)) + dynamic_task_distribution = task_sizes / tf.reduce_sum(task_sizes) + return tf.math.cumsum(dynamic_task_distribution) + + +def get_task_sampler(config: configs.TaskSamplingConfig, + task_weights: Dict[Text, float]) -> TaskSampler: + """Utils to create task sampler with configuration and task weights.""" + oneof_config = config.get() + if config.type == 'uniform': + return UniformTaskSampler(task_weights=task_weights) + elif config.type == 'proportional': + return ProportionalTaskSampler( + task_weights=task_weights, alpha=oneof_config.alpha) + elif config.type == 'annealing': + return AnnealingTaskSampler( + task_weights=task_weights, + steps_per_epoch=oneof_config.steps_per_epoch, + total_steps=oneof_config.total_steps) + else: + raise RuntimeError('Task sampler type not supported') diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/test_utils.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/test_utils.py new file mode 100644 index 000000000..9ef43ee01 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/test_utils.py @@ -0,0 +1,125 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Testing utils for mock models and tasks.""" +from typing import Dict, Text +import tensorflow as tf +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling.multitask import base_model + + +class MockFooModel(tf.keras.Model): + """A mock model can consume 'foo' and 'bar' inputs.""" + + def __init__(self, shared_layer, *args, **kwargs): + super().__init__(*args, **kwargs) + self._share_layer = shared_layer + self._foo_specific_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + self.add_loss(tf.zeros((1,), dtype=tf.float32)) + if "foo" in inputs: + input_tensor = inputs["foo"] + else: + input_tensor = inputs["bar"] + return self._foo_specific_layer(self._share_layer(input_tensor)) + + +class MockBarModel(tf.keras.Model): + + def __init__(self, shared_layer, *args, **kwargs): + super().__init__(*args, **kwargs) + self._share_layer = shared_layer + self._bar_specific_layer = tf.keras.layers.Dense(1) + + def call(self, inputs): + self.add_loss(tf.zeros((2,), dtype=tf.float32)) + return self._bar_specific_layer(self._share_layer(inputs["bar"])) + + +class MockMultiTaskModel(base_model.MultiTaskBaseModel): + + def __init__(self, *args, **kwargs): + self._shared_dense = tf.keras.layers.Dense(1) + super().__init__(*args, **kwargs) + + def _instantiate_sub_tasks(self) -> Dict[Text, tf.keras.Model]: + return { + "foo": MockFooModel(self._shared_dense), + "bar": MockBarModel(self._shared_dense) + } + + +def mock_data(feature_name): + """Mock dataset function.""" + + def _generate_data(_): + x = tf.zeros(shape=(2,), dtype=tf.float32) + label = tf.zeros([1], dtype=tf.int32) + return {feature_name: x}, label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + _generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) + + +class FooConfig(cfg.TaskConfig): + pass + + +class BarConfig(cfg.TaskConfig): + pass + + +@task_factory.register_task_cls(FooConfig) +class MockFooTask(base_task.Task): + """Mock foo task object for testing.""" + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="foo_acc")] + + def build_inputs(self, params): + return mock_data("foo") + + def build_model(self) -> tf.keras.Model: + return MockFooModel(shared_layer=tf.keras.layers.Dense(1)) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + loss = tf.keras.losses.mean_squared_error(labels, model_outputs) + if aux_losses: + loss += tf.add_n(aux_losses) + return tf.reduce_mean(loss) + + +@task_factory.register_task_cls(BarConfig) +class MockBarTask(base_task.Task): + """Mock bar task object for testing.""" + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="bar_acc")] + + def build_inputs(self, params): + return mock_data("bar") + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + loss = tf.keras.losses.mean_squared_error(labels, model_outputs) + if aux_losses: + 
loss += tf.add_n(aux_losses) + return tf.reduce_mean(loss) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/train_lib.py b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/train_lib.py new file mode 100644 index 000000000..bc8f508b5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/multitask/train_lib.py @@ -0,0 +1,251 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multitask training driver library.""" +# pytype: disable=attribute-error +import os +from typing import Optional +from absl import logging +import orbit +import tensorflow as tf +from core import base_task +from core import base_trainer as core_lib +from core import train_utils +from modeling.multitask import base_model +from modeling.multitask import base_trainer +from modeling.multitask import configs +from modeling.multitask import evaluator as evaluator_lib +from modeling.multitask import interleaving_trainer +from modeling.multitask import multitask +from modeling.multitask import task_sampler + +TRAINERS = { + 'interleaving': interleaving_trainer.MultiTaskInterleavingTrainer, + 'joint': base_trainer.MultiTaskBaseTrainer +} + + +def run_experiment(*, distribution_strategy: tf.distribute.Strategy, + task: multitask.MultiTask, + model: base_model.MultiTaskBaseModel, mode: str, + params: configs.MultiTaskExperimentConfig, + model_dir: str) -> base_model.MultiTaskBaseModel: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + task: A MultiTaskTask instance. + model: A MultiTaskBaseModel instance. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: ExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + + Returns: + model: `base_model.MultiTaskBaseModel` instance. + """ + + is_training = 'train' in mode + is_eval = 'eval' in mode + with distribution_strategy.scope(): + optimizer = task.create_optimizer(params.trainer.optimizer_config, + params.runtime) + kwargs = dict(multi_task=task, multi_task_model=model, optimizer=optimizer) + if params.trainer.trainer_type == 'interleaving': + sampler = task_sampler.get_task_sampler(params.trainer.task_sampler, + task.task_weights) + kwargs.update(dict(task_sampler=sampler)) + trainer = TRAINERS[params.trainer.trainer_type]( + **kwargs) if is_training else None + if is_eval: + evaluator = evaluator_lib.MultiTaskEvaluator( + task=task, + model=model, + global_step=trainer.global_step if is_training else None, + checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter( + params, model_dir)) + else: + evaluator = None + + if trainer: + checkpoint = trainer.checkpoint + global_step = trainer.global_step + else: + checkpoint = evaluator.checkpoint + global_step = evaluator.global_step + + # TODO(hongkuny,haozhangthu): Revisit initialization method. 
+ checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=model.initialize) + + controller = orbit.Controller( + strategy=distribution_strategy, + trainer=trainer, + evaluator=evaluator, + global_step=global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train'), + eval_summary_dir=os.path.join(model_dir, 'validation'), + summary_interval=params.trainer.summary_interval) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if evaluator.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + return model + + +def run_experiment_with_multitask_eval( + *, + distribution_strategy: tf.distribute.Strategy, + train_task: base_task.Task, + eval_tasks: multitask.MultiTask, + mode: str, + params: configs.MultiEvalExperimentConfig, + model_dir: str, + run_post_eval: bool = False, + save_summary: bool = True, + trainer: Optional[core_lib.Trainer] = None) -> tf.keras.Model: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + train_task: A base_task.Task instance. + eval_tasks: A multitask.MultiTask with evaluation tasks. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: MultiEvalExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + run_post_eval: Whether to run post eval once after training, metrics logs + are returned. + save_summary: Whether to save train and validation summary. + trainer: the core_lib.Trainer instance. It should be created within the + strategy.scope(). If not provided, an instance will be created by default + if `mode` contains 'train'. + + Returns: + model: `tf.keras.Model` instance. 
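For orientation, a hedged sketch of invoking this driver; the `train_task`, `eval_tasks`, and `params` objects are assumed to come from the config and task factories elsewhere in this patch and are not constructed here.

```python
# Sketch only: train_task, eval_tasks and params come from the config/task
# factories in this patch; nothing here is constructed from scratch.
import tensorflow as tf
from modeling.multitask import train_lib

def run(train_task, eval_tasks, params, model_dir):
  strategy = tf.distribute.MirroredStrategy()
  model, eval_logs = train_lib.run_experiment_with_multitask_eval(
      distribution_strategy=strategy,
      train_task=train_task,
      eval_tasks=eval_tasks,
      mode='train_and_eval',
      params=params,
      model_dir=model_dir,
      run_post_eval=True)
  return model, eval_logs
```

Note that, as implemented below, the function always returns a `(model, logs)` tuple; the logs dict is empty unless `run_post_eval=True`.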
+ """ + + is_training = 'train' in mode + is_eval = 'eval' in mode + with distribution_strategy.scope(): + if is_training: + trainer = trainer or core_lib.Trainer( + config=params, + task=train_task, + model=train_task.build_model(), + optimizer=train_task.create_optimizer( + params.trainer.optimizer_config, params.runtime), + train=True, + evaluate=False) + else: + trainer = None + model = trainer.model if trainer else train_task.build_model() + + if is_eval: + evaluator = evaluator_lib.MultiTaskEvaluator( + task=eval_tasks, + model=model, + global_step=trainer.global_step if is_training else None, + checkpoint_exporter=train_utils.maybe_create_best_ckpt_exporter( + params, model_dir)) + else: + evaluator = None + + if trainer: + checkpoint = trainer.checkpoint + global_step = trainer.global_step + else: + checkpoint = evaluator.checkpoint + global_step = evaluator.global_step + + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=trainer.initialize if trainer else None) + + controller = orbit.Controller( + strategy=distribution_strategy, + trainer=trainer, + evaluator=evaluator, + global_step=global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train') if save_summary else None, + eval_summary_dir=os.path.join(model_dir, 'validation') if + (save_summary) else None, + summary_interval=params.trainer.summary_interval if + (save_summary) else None) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if evaluator.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + if run_post_eval: + return model, evaluator.evaluate( + tf.convert_to_tensor(params.trainer.validation_steps)) + else: + return model, {} diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/__init__.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/__init__.py new file mode 100644 index 000000000..e6f22c6a8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimization package definition.""" + +# pylint: disable=wildcard-import +from modeling.optimization.configs.learning_rate_config import * +from modeling.optimization.configs.optimization_config import * +from modeling.optimization.configs.optimizer_config import * +from modeling.optimization.ema_optimizer import ExponentialMovingAverage +from modeling.optimization.lr_schedule import * +from modeling.optimization.optimizer_factory import OptimizerFactory diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/__init__.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py new file mode 100644 index 000000000..72b3da508 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/learning_rate_config.py @@ -0,0 +1,250 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataclasses for learning rate schedule config.""" +from typing import List, Optional + +import dataclasses +from modeling.hyperparams import base_config + + +@dataclasses.dataclass +class ConstantLrConfig(base_config.Config): + """Configuration for constant learning rate. + + This class is a containers for the constant learning rate decay configs. + + Attributes: + name: The name of the learning rate schedule. Defaults to Constant. + learning_rate: A float. The learning rate. Defaults to 0.1. + """ + name: str = 'Constant' + learning_rate: float = 0.1 + + +@dataclasses.dataclass +class StepwiseLrConfig(base_config.Config): + """Configuration for stepwise learning rate decay. + + This class is a container for the piecewise constant learning rate scheduling + configs. It will configure an instance of PiecewiseConstantDecay keras + learning rate schedule. 
+ + An example (from keras docs): use a learning rate that's 1.0 for the first + 100001 steps, 0.5 for the next 10000 steps, and 0.1 for any additional steps. + ```python + boundaries: [100000, 110000] + values: [1.0, 0.5, 0.1] + + Attributes: + name: The name of the learning rate schedule. Defaults to PiecewiseConstant. + boundaries: A list of ints of strictly increasing entries. Defaults to None. + values: A list of floats that specifies the values for the intervals defined + by `boundaries`. It should have one more element than `boundaries`. + The learning rate is computed as follows: [0, boundaries[0]] -> + values[0] [boundaries[0], boundaries[1]] -> values[1] + [boundaries[n-1], boundaries[n]] -> values[n] [boundaries[n], + end] -> values[n+1] Defaults to None. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'PiecewiseConstantDecay' + boundaries: Optional[List[int]] = None + values: Optional[List[float]] = None + offset: int = 0 + + +@dataclasses.dataclass +class ExponentialLrConfig(base_config.Config): + """Configuration for exponential learning rate decay. + + This class is a containers for the exponential learning rate decay configs. + + Attributes: + name: The name of the learning rate schedule. Defaults to ExponentialDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + decay_steps: A positive integer that is used for decay computation. Defaults + to None. + decay_rate: A float. Defaults to None. + staircase: A boolean, if true, learning rate is decreased at discreate + intervals. Defaults to False. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'ExponentialDecay' + initial_learning_rate: Optional[float] = None + decay_steps: Optional[int] = None + decay_rate: Optional[float] = None + staircase: Optional[bool] = None + offset: int = 0 + + +@dataclasses.dataclass +class PolynomialLrConfig(base_config.Config): + """Configuration for polynomial learning rate decay. + + This class is a containers for the polynomial learning rate decay configs. + + Attributes: + name: The name of the learning rate schedule. Defaults to PolynomialDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + decay_steps: A positive integer that is used for decay computation. Defaults + to None. + end_learning_rate: A float. The minimal end learning rate. + power: A float. The power of the polynomial. Defaults to linear, 1.0. + cycle: A boolean, whether or not it should cycle beyond decay_steps. + Defaults to False. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'PolynomialDecay' + initial_learning_rate: Optional[float] = None + decay_steps: Optional[int] = None + end_learning_rate: float = 0.0001 + power: float = 1.0 + cycle: bool = False + offset: int = 0 + + +@dataclasses.dataclass +class CosineLrConfig(base_config.Config): + """Configuration for Cosine learning rate decay. + + This class is a containers for the cosine learning rate decay configs, + tf.keras.experimental.CosineDecay. + + Attributes: + name: The name of the learning rate schedule. Defaults to CosineDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + decay_steps: A positive integer that is used for decay computation. Defaults + to None. + alpha: A float. Minimum learning rate value as a fraction of + initial_learning_rate. + offset: An int. The offset applied to steps. Defaults to 0. 
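For intuition, here is a rough sketch (editor's illustration, not part of the patch) of the schedule this config ultimately selects: the factory in `optimizer_factory.py` maps it onto `tf.keras.experimental.CosineDecay` through the `CosineDecayWithOffset` wrapper defined in `lr_schedule.py`.

```python
import math

def cosine_lr(step, initial_learning_rate, decay_steps, alpha=0.0, offset=0):
  # Rough sketch of tf.keras.experimental.CosineDecay with the offset applied
  # by the wrapper in lr_schedule.py; steps past decay_steps stay at the floor.
  step = min(max(step - offset, 0), decay_steps)
  cosine = 0.5 * (1.0 + math.cos(math.pi * step / decay_steps))
  return initial_learning_rate * ((1.0 - alpha) * cosine + alpha)

cosine_lr(0, 0.1, 1000)     # 0.1 at the start
cosine_lr(1000, 0.1, 1000)  # 0.0 once fully decayed (alpha * 0.1 otherwise)
```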
+ """ + name: str = 'CosineDecay' + initial_learning_rate: Optional[float] = None + decay_steps: Optional[int] = None + alpha: float = 0.0 + offset: int = 0 + + +@dataclasses.dataclass +class DirectPowerLrConfig(base_config.Config): + """Configuration for DirectPower learning rate decay. + + This class configures a schedule following follows lr * (step)^power. + + Attributes: + name: The name of the learning rate schedule. Defaults to DirectPowerDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + power: A float. Defaults to -0.5, for sqrt decay. + """ + name: str = 'DirectPowerDecay' + initial_learning_rate: Optional[float] = None + power: float = -0.5 + + +@dataclasses.dataclass +class PowerAndLinearDecayLrConfig(base_config.Config): + """Configuration for DirectPower learning rate decay. + + The schedule has the following behavoir. + Let offset_step = step - offset. + 1) offset_step < 0, the actual learning rate equals initial_learning_rate. + 2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the + actual learning rate equals lr * offset_step^power. + 3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step < + total_decay_steps, the actual learning rate equals lr * offset_step^power * + (total_decay_steps - offset_step) / (total_decay_steps * + linear_decay_fraction). + 4) offset_step >= total_decay_steps, the actual learning rate equals zero. + + Attributes: + name: The name of the learning rate schedule. Defaults to + PowerAndLinearDecay. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + total_decay_steps: An int. The total number of steps for power + linear + decay. Defaults to None. + power: A float. The order of the polynomial. Defaults to -0.5, for sqrt + decay. + linear_decay_fraction: A float. In the last `linear_decay_fraction` steps, + the learning rate will be multiplied by a linear decay. Defaults to 0.1. + offset: An int. The offset applied to steps. Defaults to 0. + """ + name: str = 'PowerAndLinearDecay' + initial_learning_rate: Optional[float] = None + total_decay_steps: Optional[int] = None + power: float = -0.5 + linear_decay_fraction: float = 0.1 + offset: int = 0 + + +@dataclasses.dataclass +class PowerDecayWithOffsetLrConfig(base_config.Config): + """Configuration for power learning rate decay with step offset. + + Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`. + Otherwise, learning rate equals to lr * (step - offset)^power. + + Attributes: + name: The name of the learning rate schedule. Defaults to + PowerDecayWithOffset. + initial_learning_rate: A float. The initial learning rate. Defaults to None. + power: A float. Defaults to -0.5, for sqrt decay. + offset: An integer. Power decay happens after `offset` steps. + pre_offset_learning_rate: A float. The constant learning rate before + `offset` steps. + """ + name: str = 'PowerDecayWithOffset' + initial_learning_rate: Optional[float] = None + power: float = -0.5 + offset: int = 0 + pre_offset_learning_rate: float = 1.0e6 + + +@dataclasses.dataclass +class LinearWarmupConfig(base_config.Config): + """Configuration for linear warmup schedule config. + + This class is a container for the linear warmup schedule configs. + Warmup_learning_rate is the initial learning rate, the final learning rate of + the warmup period is the learning_rate of the optimizer in use. 
The learning + rate at each step linearly increased according to the following formula: + warmup_learning_rate = warmup_learning_rate + + step / warmup_steps * (final_learning_rate - warmup_learning_rate). + Using warmup overrides the learning rate schedule by the number of warmup + steps. + + Attributes: + name: The name of warmup schedule. Defaults to linear. + warmup_learning_rate: Initial learning rate for the warmup. Defaults to 0. + warmup_steps: Warmup steps. Defaults to None. + """ + name: str = 'linear' + warmup_learning_rate: float = 0 + warmup_steps: Optional[int] = None + + +@dataclasses.dataclass +class PolynomialWarmupConfig(base_config.Config): + """Configuration for linear warmup schedule config. + + This class is a container for the polynomial warmup schedule configs. + + Attributes: + name: The name of warmup schedule. Defaults to Polynomial. + power: Polynomial power. Defaults to 1. + warmup_steps: Warmup steps. Defaults to None. + """ + name: str = 'polynomial' + power: float = 1 + warmup_steps: Optional[int] = None diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimization_config.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimization_config.py new file mode 100644 index 000000000..e1809f67f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimization_config.py @@ -0,0 +1,114 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataclasses for optimization configs. + +This file define the dataclass for optimization configs (OptimizationConfig). +It also has two helper functions get_optimizer_config, and get_lr_config from +an OptimizationConfig class. +""" +from typing import Optional + +import dataclasses + +from modeling.hyperparams import base_config +from modeling.hyperparams import oneof +from modeling.optimization.configs import learning_rate_config as lr_cfg +from modeling.optimization.configs import optimizer_config as opt_cfg + + +@dataclasses.dataclass +class OptimizerConfig(oneof.OneOfConfig): + """Configuration for optimizer. + + Attributes: + type: 'str', type of optimizer to be used, on the of fields below. + sgd: sgd optimizer config. + adam: adam optimizer config. + adamw: adam with weight decay. + lamb: lamb optimizer. + rmsprop: rmsprop optimizer. + lars: lars optimizer. + adagrad: adagrad optimizer. + slide: slide optimizer. 
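To make the oneof semantics concrete, a minimal sketch (editor's illustration; construction from a nested dict follows the usage shown in `OptimizerFactory`'s docstring later in this patch):

```python
params = {
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'learning_rate': {'type': 'constant', 'constant': {'learning_rate': 0.1}},
}
opt_config = OptimizationConfig(params)  # defined at the end of this file
assert opt_config.optimizer.type == 'sgd'
sgd_config = opt_config.optimizer.get()  # returns only the SGDConfig branch
```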
+ """ + type: Optional[str] = None + sgd: opt_cfg.SGDConfig = opt_cfg.SGDConfig() + adam: opt_cfg.AdamConfig = opt_cfg.AdamConfig() + adamw: opt_cfg.AdamWeightDecayConfig = opt_cfg.AdamWeightDecayConfig() + lamb: opt_cfg.LAMBConfig = opt_cfg.LAMBConfig() + rmsprop: opt_cfg.RMSPropConfig = opt_cfg.RMSPropConfig() + lars: opt_cfg.LARSConfig = opt_cfg.LARSConfig() + adagrad: opt_cfg.AdagradConfig = opt_cfg.AdagradConfig() + slide: opt_cfg.SLIDEConfig = opt_cfg.SLIDEConfig() + + +@dataclasses.dataclass +class LrConfig(oneof.OneOfConfig): + """Configuration for lr schedule. + + Attributes: + type: 'str', type of lr schedule to be used, one of the fields below. + constant: constant learning rate config. + stepwise: stepwise learning rate config. + exponential: exponential learning rate config. + polynomial: polynomial learning rate config. + cosine: cosine learning rate config. + power: step^power learning rate config. + power_linear: learning rate config of step^power followed by + step^power*linear. + power_with_offset: power decay with a step offset. + """ + type: Optional[str] = None + constant: lr_cfg.ConstantLrConfig = lr_cfg.ConstantLrConfig() + stepwise: lr_cfg.StepwiseLrConfig = lr_cfg.StepwiseLrConfig() + exponential: lr_cfg.ExponentialLrConfig = lr_cfg.ExponentialLrConfig() + polynomial: lr_cfg.PolynomialLrConfig = lr_cfg.PolynomialLrConfig() + cosine: lr_cfg.CosineLrConfig = lr_cfg.CosineLrConfig() + power: lr_cfg.DirectPowerLrConfig = lr_cfg.DirectPowerLrConfig() + power_linear: lr_cfg.PowerAndLinearDecayLrConfig = ( + lr_cfg.PowerAndLinearDecayLrConfig()) + power_with_offset: lr_cfg.PowerDecayWithOffsetLrConfig = ( + lr_cfg.PowerDecayWithOffsetLrConfig()) + + +@dataclasses.dataclass +class WarmupConfig(oneof.OneOfConfig): + """Configuration for lr schedule. + + Attributes: + type: 'str', type of warmup schedule to be used, one of the fields below. + linear: linear warmup config. + polynomial: polynomial warmup config. + """ + type: Optional[str] = None + linear: lr_cfg.LinearWarmupConfig = lr_cfg.LinearWarmupConfig() + polynomial: lr_cfg.PolynomialWarmupConfig = lr_cfg.PolynomialWarmupConfig() + + +@dataclasses.dataclass +class OptimizationConfig(base_config.Config): + """Configuration for optimizer and learning rate schedule. + + Attributes: + optimizer: optimizer oneof config. + ema: optional exponential moving average optimizer config, if specified, ema + optimizer will be used. + learning_rate: learning rate oneof config. + warmup: warmup oneof config. + """ + optimizer: OptimizerConfig = OptimizerConfig() + ema: Optional[opt_cfg.EMAConfig] = None + learning_rate: LrConfig = LrConfig() + warmup: WarmupConfig = WarmupConfig() diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimizer_config.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimizer_config.py new file mode 100644 index 000000000..b267fde43 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/configs/optimizer_config.py @@ -0,0 +1,249 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dataclasses for optimizer configs.""" +from typing import List, Optional + +import dataclasses +from modeling.hyperparams import base_config + + +@dataclasses.dataclass +class BaseOptimizerConfig(base_config.Config): + """Base optimizer config. + + Attributes: + clipnorm: float >= 0 or None. If not None, Gradients will be clipped when + their L2 norm exceeds this value. + clipvalue: float >= 0 or None. If not None, Gradients will be clipped when + their absolute value exceeds this value. + global_clipnorm: float >= 0 or None. If not None, gradient of all weights is + clipped so that their global norm is no higher than this value + """ + clipnorm: Optional[float] = None + clipvalue: Optional[float] = None + global_clipnorm: Optional[float] = None + + +@dataclasses.dataclass +class SGDConfig(BaseOptimizerConfig): + """Configuration for SGD optimizer. + + The attributes for this class matches the arguments of tf.keras.optimizer.SGD. + + Attributes: + name: name of the optimizer. + decay: decay rate for SGD optimizer. + nesterov: nesterov for SGD optimizer. + momentum: momentum for SGD optimizer. + """ + name: str = "SGD" + decay: float = 0.0 + nesterov: bool = False + momentum: float = 0.0 + + +@dataclasses.dataclass +class RMSPropConfig(BaseOptimizerConfig): + """Configuration for RMSProp optimizer. + + The attributes for this class matches the arguments of + tf.keras.optimizers.RMSprop. + + Attributes: + name: name of the optimizer. + rho: discounting factor for RMSprop optimizer. + momentum: momentum for RMSprop optimizer. + epsilon: epsilon value for RMSprop optimizer, help with numerical stability. + centered: Whether to normalize gradients or not. + """ + name: str = "RMSprop" + rho: float = 0.9 + momentum: float = 0.0 + epsilon: float = 1e-7 + centered: bool = False + + +@dataclasses.dataclass +class AdagradConfig(BaseOptimizerConfig): + """Configuration for Adagrad optimizer. + + The attributes of this class match the arguments of + tf.keras.optimizer.Adagrad. + + Attributes: + name: name of the optimizer. + initial_accumulator_value: A floating point value. Starting value for the + accumulators, must be non-negative. + epsilon: A small floating point value to avoid zero denominator. + """ + name: str = "Adagrad" + initial_accumulator_value: float = 0.1 + epsilon: float = 1e-07 + + +@dataclasses.dataclass +class AdamConfig(BaseOptimizerConfig): + """Configuration for Adam optimizer. + + The attributes for this class matches the arguments of + tf.keras.optimizer.Adam. + + Attributes: + name: name of the optimizer. + beta_1: decay rate for 1st order moments. + beta_2: decay rate for 2st order moments. + epsilon: epsilon value used for numerical stability in Adam optimizer. + amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from + the paper "On the Convergence of Adam and beyond". + """ + name: str = "Adam" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-07 + amsgrad: bool = False + + +@dataclasses.dataclass +class AdamWeightDecayConfig(BaseOptimizerConfig): + """Configuration for Adam optimizer with weight decay. 
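As a hypothetical illustration only (the exclusion patterns below are common BERT conventions, not defaults of this class), the nested-dict form accepted by `OptimizationConfig` might look like:

```python
params = {
    'optimizer': {
        'type': 'adamw',
        'adamw': {
            'weight_decay_rate': 0.01,
            'exclude_from_weight_decay': ['LayerNorm', 'layer_norm', 'bias'],
            'gradient_clip_norm': 1.0,
        },
    },
}
```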
+ + Attributes: + name: name of the optimizer. + beta_1: decay rate for 1st order moments. + beta_2: decay rate for 2st order moments. + epsilon: epsilon value used for numerical stability in the optimizer. + amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from + the paper "On the Convergence of Adam and beyond". + weight_decay_rate: float. Weight decay rate. Default to 0. + include_in_weight_decay: list[str], or None. List of weight names to include + in weight decay. + exclude_from_weight_decay: list[str], or None. List of weight names to not + include in weight decay. + gradient_clip_norm: A positive float. Clips the gradients to this maximum + L2-norm. Default to 1.0. + """ + name: str = "AdamWeightDecay" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-07 + amsgrad: bool = False + weight_decay_rate: float = 0.0 + include_in_weight_decay: Optional[List[str]] = None + exclude_from_weight_decay: Optional[List[str]] = None + gradient_clip_norm: float = 1.0 + + +@dataclasses.dataclass +class LAMBConfig(BaseOptimizerConfig): + """Configuration for LAMB optimizer. + + The attributes for this class matches the arguments of + tensorflow_addons.optimizers.LAMB. + + Attributes: + name: name of the optimizer. + beta_1: decay rate for 1st order moments. + beta_2: decay rate for 2st order moments. + epsilon: epsilon value used for numerical stability in LAMB optimizer. + weight_decay_rate: float. Weight decay rate. Default to 0. + exclude_from_weight_decay: List of regex patterns of variables excluded from + weight decay. Variables whose name contain a substring matching the + pattern will be excluded. + exclude_from_layer_adaptation: List of regex patterns of variables excluded + from layer adaptation. Variables whose name contain a substring matching + the pattern will be excluded. + """ + name: str = "LAMB" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-6 + weight_decay_rate: float = 0.0 + exclude_from_weight_decay: Optional[List[str]] = None + exclude_from_layer_adaptation: Optional[List[str]] = None + + +@dataclasses.dataclass +class EMAConfig(BaseOptimizerConfig): + """Exponential moving average optimizer config. + + Attributes: + name: 'str', name of the optimizer. + trainable_weights_only: 'bool', if True, only model trainable weights will + be updated. Otherwise, all model weights will be updated. This mainly + affects batch normalization parameters. + average_decay: 'float', average decay value. + start_step: 'int', start step to apply moving average. + dynamic_decay: 'bool', whether to apply dynamic decay or not. + """ + name: str = "ExponentialMovingAverage" + trainable_weights_only: bool = True + average_decay: float = 0.99 + start_step: int = 0 + dynamic_decay: bool = True + + +@dataclasses.dataclass +class LARSConfig(BaseOptimizerConfig): + """Layer-wise adaptive rate scaling config. + + Attributes: + name: 'str', name of the optimizer. + momentum: `float` hyperparameter >= 0 that accelerates gradient descent in + the relevant direction and dampens oscillations. Defaults to 0.9. + eeta: `float` LARS coefficient as used in the paper. Default set to LARS + coefficient from the paper. (eeta / weight_decay) determines the highest + scaling factor in LARS.. + weight_decay_rate: `float` for weight decay. + nesterov: 'boolean' for whether to use nesterov momentum. + classic_momentum: `boolean` for whether to use classic (or popular) + momentum. 
The learning rate is applied during momentum update in classic + momentum, but after momentum for popular momentum. + exclude_from_weight_decay: A list of `string` for variable screening, if any + of the string appears in a variable's name, the variable will be excluded + for computing weight decay. For example, one could specify the list like + ['batch_normalization', 'bias'] to exclude BN and bias from weight decay. + exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but for + layer adaptation. If it is None, it will be defaulted the same as + exclude_from_weight_decay. + """ + name: str = "LARS" + momentum: float = 0.9 + eeta: float = 0.001 + weight_decay_rate: float = 0.0 + nesterov: bool = False + classic_momentum: bool = True + exclude_from_weight_decay: Optional[List[str]] = None + exclude_from_layer_adaptation: Optional[List[str]] = None + + +@dataclasses.dataclass +class SLIDEConfig(BaseOptimizerConfig): + """Configuration for SLIDE optimizer. + + Details coming soon. + """ + name: str = "SLIDE" + beta_1: float = 0.9 + beta_2: float = 0.999 + epsilon: float = 1e-6 + weight_decay_rate: float = 0.0 + weight_decay_type: str = "inner" + exclude_from_weight_decay: Optional[List[str]] = None + exclude_from_layer_adaptation: Optional[List[str]] = None + include_in_sparse_layer_adaptation: Optional[List[str]] = None + sparse_layer_learning_rate: float = 0.1 + do_gradient_rescaling: bool = True + norm_type: str = "layer" + ratio_clip_norm: float = 1e5 diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/ema_optimizer.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/ema_optimizer.py new file mode 100644 index 000000000..c4f44d712 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/ema_optimizer.py @@ -0,0 +1,255 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Exponential moving average optimizer.""" + +from typing import List, Optional, Text + +import tensorflow as tf + +# pylint: disable=protected-access + + +class ExponentialMovingAverage(tf.keras.optimizers.Optimizer): + """Optimizer that computes an exponential moving average of the variables. + + Empirically it has been found that using the moving average of the trained + parameters of a deep network is better than using its trained parameters + directly. This optimizer allows you to compute this moving average and swap + the variables at save time so that any code outside of the training loop + will use by default the average values instead of the original ones. 
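With `dynamic_decay=True`, the effective decay ramps up from 0.1 instead of starting at `average_decay`; a small sketch (editor's illustration) of the rule implemented in `update_average` below:

```python
def effective_decay(step, start_step=0, average_decay=0.99):
  """Mirrors the decay rule in ExponentialMovingAverage.update_average."""
  if step < start_step:
    return 0.0
  delta = step - start_step
  return min(average_decay, (1.0 + delta) / (10.0 + delta))

effective_decay(0)     # 0.1  -> shadow weights track the model closely early on
effective_decay(1000)  # 0.99 -> capped at average_decay later in training
```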
+ + Example of usage for training: + ```python + opt = tf.keras.optimizers.SGD(learning_rate) + opt = ExponentialMovingAverage(opt) + + opt.shadow_copy(model) + ``` + + At test time, swap the shadow variables to evaluate on the averaged weights: + ```python + opt.swap_weights() + # Test eval the model here + opt.swap_weights() + ``` + """ + + def __init__(self, + optimizer: tf.keras.optimizers.Optimizer, + trainable_weights_only: bool = True, + average_decay: float = 0.99, + start_step: int = 0, + dynamic_decay: bool = True, + name: Text = 'ExponentialMovingAverage', + **kwargs): + """Construct a new ExponentialMovingAverage optimizer. + + Args: + optimizer: `tf.keras.optimizers.Optimizer` that will be + used to compute and apply gradients. + trainable_weights_only: 'bool', if True, only model trainable weights will + be updated. Otherwise, all model weights will be updated. This mainly + affects batch normalization parameters. + average_decay: float. Decay to use to maintain the moving averages + of trained variables. + start_step: int. What step to start the moving average. + dynamic_decay: bool. Whether to change the decay based on the number + of optimizer updates. Decay will start at 0.1 and gradually increase + up to `average_decay` after each optimizer update. This behavior is + similar to `tf.train.ExponentialMovingAverage` in TF 1.x. + name: Optional name for the operations created when applying + gradients. Defaults to "moving_average". + **kwargs: keyword arguments. Allowed to be {`clipnorm`, + `clipvalue`, `lr`, `decay`}. + """ + super().__init__(name, **kwargs) + self._average_decay = average_decay + self._trainable_weights_only = trainable_weights_only + self._start_step = tf.constant(start_step, tf.float32) + self._dynamic_decay = dynamic_decay + self._optimizer = optimizer + self._track_trackable(self._optimizer, 'base_optimizer') + self._average_weights = None + self._model_weights = None + + def shadow_copy(self, model: tf.keras.Model): + """Creates shadow variables for the given model weights.""" + + if self._trainable_weights_only: + self._model_weights = model.trainable_variables + else: + self._model_weights = model.variables + for var in self._model_weights: + self.add_slot(var, 'average', initializer='zeros') + + self._average_weights = [ + self.get_slot(var, 'average') for var in self._model_weights + ] + + @property + def has_shadow_copy(self): + """Whether this optimizer has created shadow variables.""" + return self._model_weights is not None and self._average_weights is not None + + def _create_slots(self, var_list): + self._optimizer._create_slots(var_list=var_list) # pylint: disable=protected-access + + def apply_gradients(self, grads_and_vars, name: Optional[Text] = None): + result = self._optimizer.apply_gradients(grads_and_vars, name) + self.update_average(self.iterations) + return result + + @tf.function + def update_average(self, step: tf.Tensor): + step = tf.cast(step, tf.float32) + if step < self._start_step: + decay = tf.constant(0., tf.float32) + elif self._dynamic_decay: + decay = step - self._start_step + decay = tf.minimum(self._average_decay, (1. + decay) / (10. + decay)) + else: + decay = self._average_decay + + def _apply_moving(v_moving, v_normal): + diff = v_moving - v_normal + v_moving.assign_sub(tf.cast(1. 
- decay, v_moving.dtype) * diff) + return v_moving + + def _update(strategy, v_moving_and_v_normal): + for v_moving, v_normal in v_moving_and_v_normal: + strategy.extended.update(v_moving, _apply_moving, args=(v_normal,)) + + ctx = tf.distribute.get_replica_context() + return ctx.merge_call(_update, args=(zip(self._average_weights, + self._model_weights),)) + + def swap_weights(self): + """Swap the average and moving weights. + + This is a convenience method to allow one to evaluate the averaged weights + at test time. Loads the weights stored in `self._average` into the model, + keeping a copy of the original model weights. Swapping twice will return + the original weights. + """ + if tf.distribute.in_cross_replica_context(): + strategy = tf.distribute.get_strategy() + strategy.run(self._swap_weights, args=()) + else: + raise ValueError('Swapping weights must occur under a ' + 'tf.distribute.Strategy') + + @tf.function + def _swap_weights(self): + def fn_0(a, b): + a.assign_add(b) + return a + def fn_1(b, a): + b.assign(a - b) + return b + def fn_2(a, b): + a.assign_sub(b) + return a + + def swap(strategy, a_and_b): + """Swap `a` and `b` and mirror to all devices.""" + for a, b in a_and_b: + strategy.extended.update(a, fn_0, args=(b,)) # a = a + b + strategy.extended.update(b, fn_1, args=(a,)) # b = a - b + strategy.extended.update(a, fn_2, args=(b,)) # a = a - b + + ctx = tf.distribute.get_replica_context() + return ctx.merge_call( + swap, args=(zip(self._average_weights, self._model_weights),)) + + def assign_average_vars(self, var_list: List[tf.Variable]): + """Assign variables in var_list with their respective averages. + + Args: + var_list: List of model variables to be assigned to their average. + Returns: + assign_op: The op corresponding to the assignment operation of + variables to their average. 
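A short sketch of the intended use once training finishes (editor's illustration; the optimizer/model names and the path are hypothetical):

```python
# Overwrite the live weights with their EMA shadow values before exporting.
ema_opt.assign_average_vars(model.trainable_variables)
model.save_weights('/tmp/ema_checkpoint')  # hypothetical export path
```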
+ """ + assign_op = tf.group([ + var.assign(self.get_slot(var, 'average')) for var in var_list + if var.trainable + ]) + return assign_op + + def _create_hypers(self): + self._optimizer._create_hypers() # pylint: disable=protected-access + + def _prepare(self, var_list): + return self._optimizer._prepare(var_list=var_list) # pylint: disable=protected-access + + @property + def iterations(self): + return self._optimizer.iterations + + @iterations.setter + def iterations(self, variable): + self._optimizer.iterations = variable + + @property + def weights(self): + # return self._weights + self._optimizer.weights + return self._optimizer.weights + + def variables(self): + return self._weights + [self.iterations] + + @property + def lr(self): + return self._optimizer._get_hyper('learning_rate') + + @lr.setter + def lr(self, lr): + self._optimizer._set_hyper('learning_rate', lr) + + @property + def learning_rate(self): + return self._optimizer._get_hyper('learning_rate') + + @learning_rate.setter + def learning_rate(self, learning_rate): # pylint: disable=redefined-outer-name + self._optimizer._set_hyper('learning_rate', learning_rate) + + def _resource_apply_dense(self, grad, var): + return self._optimizer._resource_apply_dense(grad, var) + + def _resource_apply_sparse(self, grad, var, indices): + return self._optimizer._resource_apply_sparse(grad, var, indices) + + def _resource_apply_sparse_duplicate_indices(self, grad, var, indices): + return self._optimizer._resource_apply_sparse_duplicate_indices( + grad, var, indices) + + def get_config(self): + config = { + 'optimizer': tf.keras.optimizers.serialize(self._optimizer), + 'average_decay': self._average_decay, + 'start_step': self._start_step, + 'dynamic_decay': self._dynamic_decay, + } + base_config = super(ExponentialMovingAverage, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + @classmethod + def from_config(cls, config, custom_objects=None): + optimizer = tf.keras.optimizers.deserialize( + config.pop('optimizer'), + custom_objects=custom_objects, + ) + return cls(optimizer, **config) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lars_optimizer.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lars_optimizer.py new file mode 100644 index 000000000..ac1504275 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lars_optimizer.py @@ -0,0 +1,186 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Layer-wise adaptive rate scaling optimizer.""" +import re +from typing import Text, List, Optional + +import tensorflow as tf + + +# pylint: disable=protected-access + + +class LARS(tf.keras.optimizers.Optimizer): + """Layer-wise Adaptive Rate Scaling for large batch training. + + Introduced by "Large Batch Training of Convolutional Networks" by Y. You, + I. Gitman, and B. Ginsburg. 
(https://arxiv.org/abs/1708.03888) + """ + + def __init__(self, + learning_rate: float = 0.01, + momentum: float = 0.9, + weight_decay_rate: float = 0.0, + eeta: float = 0.001, + nesterov: bool = False, + classic_momentum: bool = True, + exclude_from_weight_decay: Optional[List[Text]] = None, + exclude_from_layer_adaptation: Optional[List[Text]] = None, + name: Text = "LARS", + **kwargs): + """Constructs a LARSOptimizer. + + Args: + learning_rate: `float` for learning rate. Defaults to 0.01. + momentum: `float` hyperparameter >= 0 that accelerates gradient descent + in the relevant direction and dampens oscillations. Defaults to 0.9. + weight_decay_rate: `float` for weight decay. + eeta: `float` LARS coefficient as used in the paper. Default set to LARS + coefficient from the paper. (eeta / weight_decay) determines the + highest scaling factor in LARS.. + nesterov: 'boolean' for whether to use nesterov momentum. + classic_momentum: `boolean` for whether to use classic (or popular) + momentum. The learning rate is applied during momentum update in + classic momentum, but after momentum for popular momentum. + exclude_from_weight_decay: A list of `string` for variable screening, if + any of the string appears in a variable's name, the variable will be + excluded for computing weight decay. For example, one could specify + the list like ['batch_normalization', 'bias'] to exclude BN and bias + from weight decay. + exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but + for layer adaptation. If it is None, it will be defaulted the same as + exclude_from_weight_decay. + name: `Text` as optional name for the operations created when applying + gradients. Defaults to "LARS". + **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, + `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip + gradients by value, `decay` is included for backward compatibility to + allow time inverse decay of learning rate. `lr` is included for + backward compatibility, recommended to use `learning_rate` instead. + """ + super(LARS, self).__init__(name, **kwargs) + + self._set_hyper("learning_rate", learning_rate) + self._set_hyper("decay", self._initial_decay) + self.momentum = momentum + self.weight_decay_rate = weight_decay_rate + self.eeta = eeta + self.nesterov = nesterov + self.classic_momentum = classic_momentum + self.exclude_from_weight_decay = exclude_from_weight_decay + # exclude_from_layer_adaptation is set to exclude_from_weight_decay if the + # arg is None. 
+ if exclude_from_layer_adaptation: + self.exclude_from_layer_adaptation = exclude_from_layer_adaptation + else: + self.exclude_from_layer_adaptation = exclude_from_weight_decay + + def _create_slots(self, var_list): + for v in var_list: + self.add_slot(v, "momentum") + + def _resource_apply_dense(self, grad, param, apply_state=None): + if grad is None or param is None: + return tf.no_op() + + var_device, var_dtype = param.device, param.dtype.base_dtype + coefficients = ((apply_state or {}).get((var_device, var_dtype)) or + self._fallback_apply_state(var_device, var_dtype)) + learning_rate = coefficients["lr_t"] + + param_name = param.name + + v = self.get_slot(param, "momentum") + + if self._use_weight_decay(param_name): + grad += self.weight_decay_rate * param + + if self.classic_momentum: + trust_ratio = 1.0 + if self._do_layer_adaptation(param_name): + w_norm = tf.norm(param, ord=2) + g_norm = tf.norm(grad, ord=2) + trust_ratio = tf.where( + tf.greater(w_norm, 0), + tf.where(tf.greater(g_norm, 0), (self.eeta * w_norm / g_norm), 1.0), + 1.0) + scaled_lr = learning_rate * trust_ratio + + next_v = tf.multiply(self.momentum, v) + scaled_lr * grad + if self.nesterov: + update = tf.multiply(self.momentum, next_v) + scaled_lr * grad + else: + update = next_v + next_param = param - update + else: + next_v = tf.multiply(self.momentum, v) + grad + if self.nesterov: + update = tf.multiply(self.momentum, next_v) + grad + else: + update = next_v + + trust_ratio = 1.0 + if self._do_layer_adaptation(param_name): + w_norm = tf.norm(param, ord=2) + v_norm = tf.norm(update, ord=2) + trust_ratio = tf.where( + tf.greater(w_norm, 0), + tf.where(tf.greater(v_norm, 0), (self.eeta * w_norm / v_norm), 1.0), + 1.0) + scaled_lr = trust_ratio * learning_rate + next_param = param - scaled_lr * update + + return tf.group(*[ + param.assign(next_param, use_locking=False), + v.assign(next_v, use_locking=False) + ]) + + def _resource_apply_sparse(self, grad, handle, indices, apply_state): + raise NotImplementedError("Applying sparse gradients is not implemented.") + + def _use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _do_layer_adaptation(self, param_name): + """Whether to do layer-wise learning rate adaptation for `param_name`.""" + if self.exclude_from_layer_adaptation: + for r in self.exclude_from_layer_adaptation: + if re.search(r, param_name) is not None: + return False + return True + + def get_config(self): + config = super(LARS, self).get_config() + config.update({ + "learning_rate": self._serialize_hyperparameter("learning_rate"), + "decay": self._serialize_hyperparameter("decay"), + "momentum": self.momentum, + "classic_momentum": self.classic_momentum, + "weight_decay_rate": self.weight_decay_rate, + "eeta": self.eeta, + "nesterov": self.nesterov, + }) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lr_schedule.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lr_schedule.py new file mode 100644 index 000000000..09f082bbb --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/lr_schedule.py @@ -0,0 +1,385 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Learning rate schedule classes."""
+
+from typing import Mapping, Any, Union, Optional
+
+import tensorflow as tf
+
+
+def _make_offset_wrapper(new_class_name: str, base_lr_class):
+  """Generates an offset wrapper of a learning rate schedule.
+
+  It returns a subclass of `base_lr_class` that takes an `offset` argument in
+  its constructor. When the new class instance is called, the behavior is:
+    new_class_object(step) = base_lr_class_object(step - offset)
+
+  Example:
+  CosineDecayWithOffset = _make_offset_wrapper(
+      'CosineDecayWithOffset', tf.keras.experimental.CosineDecay)
+  # Use the lr:
+  lr = CosineDecayWithOffset(offset=100, initial_learning_rate=0.1,
+                             decay_steps=1000)
+  lr(101)  # equals tf.keras.experimental.CosineDecay(...)(101-100)
+
+  Args:
+    new_class_name: the name of the new class.
+    base_lr_class: the base learning rate schedule class. Should be a subclass
+      of tf.keras.optimizers.schedules.LearningRateSchedule.
+
+  Returns:
+    A new class (subclass of the base_lr_class) that can take an offset.
+  """
+  assert issubclass(base_lr_class,
+                    tf.keras.optimizers.schedules.LearningRateSchedule), (
+                        "base_lr_class should be subclass of keras "
+                        f"LearningRateSchedule, got {base_lr_class}")
+
+  # pylint: disable=protected-access,pointless-statement
+  def offset_learning_rate_init(self, offset=0, **kwargs):
+    """Construct learning rate schedule object.
+
+    When this object is called, its behavior is
+      self.__call__(step) == base_lr_class.__call__(step - offset)
+
+    Args:
+      self: this object.
+      offset: The offset when computing the learning rate schedule.
+      **kwargs: Pass through to base learning rate class constructor.
+ """ + base_lr_class.__init__(self, **kwargs) + self._offset = offset + + def offset_learning_rate_call(self, step): + step = tf.cast(step - self._offset, tf.float32) + return base_lr_class.__call__(self, step) + + # pylint: enable=protected-access,pointless-statement + + return type( + new_class_name, (base_lr_class,), { + "base_lr_class": base_lr_class, + "__init__": offset_learning_rate_init, + "__call__": offset_learning_rate_call + }) + + +PiecewiseConstantDecayWithOffset = _make_offset_wrapper( + "PiecewiseConstantDecayWithOffset", + tf.keras.optimizers.schedules.PiecewiseConstantDecay) +PolynomialDecayWithOffset = _make_offset_wrapper( + "PolynomialDecayWithOffset", tf.keras.optimizers.schedules.PolynomialDecay) +ExponentialDecayWithOffset = _make_offset_wrapper( + "ExponentialDecayWithOffset", + tf.keras.optimizers.schedules.ExponentialDecay) +CosineDecayWithOffset = _make_offset_wrapper("CosineDecayWithOffset", + tf.keras.experimental.CosineDecay) + + +class LinearWarmup(tf.keras.optimizers.schedules.LearningRateSchedule): + """Linear warmup schedule.""" + + def __init__(self, + after_warmup_lr_sched: Union[ + tf.keras.optimizers.schedules.LearningRateSchedule, float], + warmup_steps: int, + warmup_learning_rate: float, + name: Optional[str] = None): + """Add linear warmup schedule to a learning rate schedule. + + warmup_lr is the initial learning rate, the final learning rate of the + init_warmup period is the initial learning rate of lr_schedule in use. + The learning rate at each step linearly increased according to the following + formula: + learning_rate = warmup_lr + step / warmup_steps + * (final_warmup_lr - warmup_lr). + Using warmup overrides the learning rate schedule by the number of warmup + steps. + + Args: + after_warmup_lr_sched: tf.keras.optimizers.schedules .LearningRateSchedule + or a constant. + warmup_steps: Number of the warmup steps. + warmup_learning_rate: Initial learning rate for the warmup. + name: Optional, name of warmup schedule. 
+ """ + super().__init__() + self._name = name + self._after_warmup_lr_sched = after_warmup_lr_sched + self._warmup_steps = warmup_steps + self._init_warmup_lr = warmup_learning_rate + if isinstance(after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + self._final_warmup_lr = after_warmup_lr_sched(warmup_steps) + else: + self._final_warmup_lr = tf.cast(after_warmup_lr_sched, dtype=tf.float32) + + def __call__(self, step: int): + + global_step = tf.cast(step, dtype=tf.float32) + + linear_warmup_lr = ( + self._init_warmup_lr + global_step / self._warmup_steps * + (self._final_warmup_lr - self._init_warmup_lr)) + + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + after_warmup_lr = self._after_warmup_lr_sched(step) + else: + after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32) + + lr = tf.cond(global_step < self._warmup_steps, + lambda: linear_warmup_lr, + lambda: after_warmup_lr) + return lr + + def get_config(self) -> Mapping[str, Any]: + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + config = { + "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()} # pytype: disable=attribute-error + else: + config = {"after_warmup_lr_sched": self._after_warmup_lr_sched} # pytype: disable=attribute-error + + config.update({ + "warmup_steps": self._warmup_steps, + "warmup_learning_rate": self._init_warmup_lr, + "name": self._name + }) + return config + + +class PolynomialWarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): + """Applies polynomial warmup schedule on a given learning rate decay schedule.""" + + def __init__(self, + after_warmup_lr_sched: Union[ + tf.keras.optimizers.schedules.LearningRateSchedule, float], + warmup_steps: int, + power: float = 1.0, + name: str = "PolynomialWarmup"): + super().__init__() + if isinstance(after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + self._initial_learning_rate = after_warmup_lr_sched(warmup_steps) + else: + self._initial_learning_rate = tf.cast( + after_warmup_lr_sched, dtype=tf.float32) + + self._warmup_steps = warmup_steps + self._power = power + self._after_warmup_lr_sched = after_warmup_lr_sched + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "PolynomialWarmUp") as name: + # Implements polynomial warmup. i.e., if global_step < warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self._warmup_steps, tf.float32) + + if self._warmup_steps <= 0: + warmup_percent_done = 1.0 + else: + # A zero `step` may cause Inf. So make `step` positive. 
+ step_non_zero = tf.math.maximum(global_step_float, 1.0) + warmup_percent_done = step_non_zero / warmup_steps_float + + warmup_learning_rate = ( + self._initial_learning_rate * + tf.math.pow(warmup_percent_done, self._power)) + + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + after_warmup_lr = self._after_warmup_lr_sched(step) + else: + after_warmup_lr = tf.cast(self._after_warmup_lr_sched, dtype=tf.float32) + + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: after_warmup_lr, + name=name) + + def get_config(self) -> Mapping[str, Any]: + if isinstance(self._after_warmup_lr_sched, + tf.keras.optimizers.schedules.LearningRateSchedule): + config = { + "after_warmup_lr_sched": self._after_warmup_lr_sched.get_config()} # pytype: disable=attribute-error + else: + config = {"after_warmup_lr_sched": self._after_warmup_lr_sched} # pytype: disable=attribute-error + + config.update({ + "warmup_steps": self._warmup_steps, + "power": self._power, + "name": self._name + }) + return config + + +class DirectPowerDecay(tf.keras.optimizers.schedules.LearningRateSchedule): + """Learning rate schedule follows lr * (step)^power.""" + + def __init__(self, + initial_learning_rate: float, + power: float = 1.0, + name: str = "DirectPowerDecay"): + """Initialize configuration of the learning rate schedule. + + Args: + initial_learning_rate: The initial learning rate. + power: The order of the polynomial. + name: Optional, name of learning rate schedule. + """ + super().__init__() + self._initial_learning_rate = initial_learning_rate + self._power = power + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "DirectPowerDecay"): + step = tf.cast(step, tf.float32) + learning_rate = self._initial_learning_rate + # A zero `step` may cause Inf. So make `step` positive. + step_non_zero = tf.math.maximum(step, 1.0) + learning_rate *= tf.math.pow(step_non_zero, self._power) + return learning_rate + + def get_config(self): + """Get the configuration of the learning rate schedule.""" + return { + "initial_learning_rate": self._initial_learning_rate, + "power": self._power, + "name": self._name, + } + + +class PowerAndLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule): + """Learning rate schedule with multiplied by linear decay at the end. + + The schedule has the following behavoir. + Let offset_step = step - offset. + 1) offset_step < 0, the actual learning rate equals initial_learning_rate. + 2) offset_step <= total_decay_steps * (1 - linear_decay_fraction), the + actual learning rate equals lr * offset_step^power. + 3) total_decay_steps * (1 - linear_decay_fraction) <= offset_step < + total_decay_steps, the actual learning rate equals lr * offset_step^power * + (total_decay_steps - offset_step) / (total_decay_steps * + linear_decay_fraction). + 4) offset_step >= total_decay_steps, the actual learning rate equals zero. + """ + + def __init__(self, + initial_learning_rate: float, + total_decay_steps: int, + power: float = 1.0, + linear_decay_fraction: float = 0.1, + offset: int = 0, + name: str = "PowerAndLinearDecay"): + """Initialize configuration of the learning rate schedule. + + Args: + initial_learning_rate: The initial learning rate. + total_decay_steps: The total number of steps for power + linear decay. + power: The order of the polynomial. + linear_decay_fraction: In the last `linear_decay_fraction` steps, the + learning rate will be multiplied by a linear decay. 
+ offset: The offset applied to steps. + name: Optional, name of learning rate schedule. + """ + super().__init__() + self._initial_learning_rate = initial_learning_rate + self._total_decay_steps = total_decay_steps + self._power = power + self._linear_decay_fraction = linear_decay_fraction + self._offset = offset + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "PowerAndLinearDecay"): + step = tf.cast(step - self._offset, tf.float32) + learning_rate = self._initial_learning_rate + # A zero `step` may cause Inf. So make `step` positive. + step_non_zero = tf.math.maximum(step, 1.0) + learning_rate *= tf.math.pow(step_non_zero, self._power) + if self._total_decay_steps * self._linear_decay_fraction > 0: + learning_rate *= tf.minimum( + 1.0, (self._total_decay_steps - step) / + (self._total_decay_steps * self._linear_decay_fraction)) + learning_rate = tf.maximum(0.0, learning_rate) + return learning_rate + + def get_config(self): + """Get the configuration of the learning rate schedule.""" + return { + "initial_learning_rate": self._initial_learning_rate, + "total_decay_steps": self._total_decay_steps, + "power": self._power, + "linear_decay_fraction": self._linear_decay_fraction, + "offset": self._offset, + "name": self._name, + } + + +class PowerDecayWithOffset(tf.keras.optimizers.schedules.LearningRateSchedule): + """Power learning rate decay with offset. + + Learning rate equals to `pre_offset_learning_rate` if `step` < `offset`. + Otherwise, learning rate equals to lr * (step - offset)^power. + """ + + def __init__(self, + initial_learning_rate: float, + power: float = 1.0, + offset: int = 0, + pre_offset_learning_rate: float = 1.0e6, + name: str = "PowerDecayWithOffset"): + """Initialize configuration of the learning rate schedule. + + Args: + initial_learning_rate: The initial learning rate. + power: The order of the polynomial. + offset: The offset when computing the power decay. + pre_offset_learning_rate: The maximum learning rate we'll use. + name: Optional, name of learning rate schedule. + """ + super().__init__() + self._initial_learning_rate = initial_learning_rate + self._power = power + self._offset = offset + self._pre_offset_lr = pre_offset_learning_rate + self._name = name + + def __call__(self, step): + with tf.name_scope(self._name or "PowerDecayWithOffset"): + step = tf.cast(step, tf.float32) + lr_after_offset = tf.math.pow( + tf.math.maximum(step - self._offset, 1.0), self._power) * ( + self._initial_learning_rate) + + sign = tf.cast(step > self._offset, tf.float32) + lr_combined = (1.0 - sign) * self._pre_offset_lr + sign * lr_after_offset + # Power may give infinitely large LR. So cap it with pre_offset_lr. + return tf.math.minimum(lr_combined, self._pre_offset_lr) + + def get_config(self): + """Get the configuration of the learning rate schedule.""" + return { + "initial_learning_rate": self._initial_learning_rate, + "power": self._power, + "offset": self._offset, + "pre_offset_learning_rate": self._pre_offset_lr, + "name": self._name, + } diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/optimizer_factory.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/optimizer_factory.py new file mode 100644 index 000000000..694a87bc1 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/optimizer_factory.py @@ -0,0 +1,177 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Optimizer factory class.""" +from typing import Callable, Optional, Union + +import gin +import tensorflow as tf +import tensorflow_addons.optimizers as tfa_optimizers + +from modeling.optimization import slide_optimizer +from modeling.optimization import ema_optimizer +from modeling.optimization import lars_optimizer +from modeling.optimization import lr_schedule +from modeling.optimization.configs import optimization_config as opt_cfg +from nlp import optimization as nlp_optimization + +OPTIMIZERS_CLS = { + 'sgd': tf.keras.optimizers.SGD, + 'adam': tf.keras.optimizers.Adam, + 'adamw': nlp_optimization.AdamWeightDecay, + 'lamb': tfa_optimizers.LAMB, + 'rmsprop': tf.keras.optimizers.RMSprop, + 'lars': lars_optimizer.LARS, + 'adagrad': tf.keras.optimizers.Adagrad, + 'slide': slide_optimizer.SLIDE +} + +LR_CLS = { + 'stepwise': lr_schedule.PiecewiseConstantDecayWithOffset, + 'polynomial': lr_schedule.PolynomialDecayWithOffset, + 'exponential': lr_schedule.ExponentialDecayWithOffset, + 'cosine': lr_schedule.CosineDecayWithOffset, + 'power': lr_schedule.DirectPowerDecay, + 'power_linear': lr_schedule.PowerAndLinearDecay, + 'power_with_offset': lr_schedule.PowerDecayWithOffset, +} + +WARMUP_CLS = { + 'linear': lr_schedule.LinearWarmup, + 'polynomial': lr_schedule.PolynomialWarmUp +} + + +class OptimizerFactory: + """Optimizer factory class. + + This class builds learning rate and optimizer based on an optimization config. + To use this class, you need to do the following: + (1) Define optimization config, this includes optimizer, and learning rate + schedule. + (2) Initialize the class using the optimization config. + (3) Build learning rate. + (4) Build optimizer. + + This is a typical example for using this class: + params = { + 'optimizer': { + 'type': 'sgd', + 'sgd': {'momentum': 0.9} + }, + 'learning_rate': { + 'type': 'stepwise', + 'stepwise': {'boundaries': [10000, 20000], + 'values': [0.1, 0.01, 0.001]} + }, + 'warmup': { + 'type': 'linear', + 'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.01} + } + } + opt_config = OptimizationConfig(params) + opt_factory = OptimizerFactory(opt_config) + lr = opt_factory.build_learning_rate() + optimizer = opt_factory.build_optimizer(lr) + """ + + def __init__(self, config: opt_cfg.OptimizationConfig): + """Initializing OptimizerFactory. + + Args: + config: OptimizationConfig instance contain optimization config. 
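Extending the dict from the class docstring above with `ema` and `warmup` sections (editor's sketch; values are illustrative) shows how the factory wraps the result:

```python
params = {
    'optimizer': {'type': 'sgd', 'sgd': {'momentum': 0.9}},
    'ema': {'average_decay': 0.9999},
    'learning_rate': {'type': 'cosine',
                      'cosine': {'initial_learning_rate': 0.1,
                                 'decay_steps': 10000}},
    'warmup': {'type': 'linear',
               'linear': {'warmup_steps': 500, 'warmup_learning_rate': 0.0}},
}
opt_factory = OptimizerFactory(OptimizationConfig(params))
lr = opt_factory.build_learning_rate()       # CosineDecayWithOffset inside LinearWarmup
optimizer = opt_factory.build_optimizer(lr)  # SGD wrapped by ExponentialMovingAverage
```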
+ """ + self._config = config + self._optimizer_config = config.optimizer.get() + self._optimizer_type = config.optimizer.type + + self._use_ema = config.ema is not None + self._ema_config = config.ema + + if self._optimizer_config is None: + raise ValueError('Optimizer type must be specified') + + self._lr_config = config.learning_rate.get() + self._lr_type = config.learning_rate.type + + if self._lr_type is None: + raise ValueError('Learning rate type must be specified') + + self._warmup_config = config.warmup.get() + self._warmup_type = config.warmup.type + + def build_learning_rate(self): + """Build learning rate. + + Builds learning rate from config. Learning rate schedule is built according + to the learning rate config. If learning rate type is consant, + lr_config.learning_rate is returned. + + Returns: + tf.keras.optimizers.schedules.LearningRateSchedule instance. If + learning rate type is consant, lr_config.learning_rate is returned. + """ + if self._lr_type == 'constant': + lr = self._lr_config.learning_rate + else: + lr = LR_CLS[self._lr_type](**self._lr_config.as_dict()) + + if self._warmup_config: + lr = WARMUP_CLS[self._warmup_type](lr, **self._warmup_config.as_dict()) + + return lr + + @gin.configurable + def build_optimizer( + self, + lr: Union[tf.keras.optimizers.schedules.LearningRateSchedule, float], + postprocessor: Optional[Callable[[tf.keras.optimizers.Optimizer], + tf.keras.optimizers.Optimizer]] = None): + """Build optimizer. + + Builds optimizer from config. It takes learning rate as input, and builds + the optimizer according to the optimizer config. Typically, the learning + rate built using self.build_lr() is passed as an argument to this method. + + Args: + lr: A floating point value, or a + tf.keras.optimizers.schedules.LearningRateSchedule instance. + postprocessor: An optional function for postprocessing the optimizer. It + takes an optimizer and returns an optimizer. + + Returns: + tf.keras.optimizers.Optimizer instance. + """ + + optimizer_dict = self._optimizer_config.as_dict() + ## Delete clipnorm and clipvalue if None + if optimizer_dict['clipnorm'] is None: + del optimizer_dict['clipnorm'] + if optimizer_dict['clipvalue'] is None: + del optimizer_dict['clipvalue'] + + optimizer_dict['learning_rate'] = lr + + optimizer = OPTIMIZERS_CLS[self._optimizer_type](**optimizer_dict) + + if self._use_ema: + optimizer = ema_optimizer.ExponentialMovingAverage( + optimizer, **self._ema_config.as_dict()) + if postprocessor: + optimizer = postprocessor(optimizer) + assert isinstance(optimizer, tf.keras.optimizers.Optimizer), ( + 'OptimizerFactory.build_optimizer returning a non-optimizer object: ' + '{}'.format(optimizer)) + + return optimizer diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/slide_optimizer.py b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/slide_optimizer.py new file mode 100644 index 000000000..c1975a311 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/optimization/slide_optimizer.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""SLIDE optimizer. + +A new optimizer that will be open sourced soon. +""" + +SLIDE = "Unimplemented" diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/performance.py b/nlp/text_classification/bert/tensorflow2.0/modeling/performance.py new file mode 100644 index 000000000..9dd2438f4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/performance.py @@ -0,0 +1,55 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions and classes related to training performance.""" + +import tensorflow as tf + + +def configure_optimizer(optimizer, + use_float16=False, + use_graph_rewrite=False, + loss_scale='dynamic'): + """Configures optimizer object with performance options.""" + if use_float16: + if loss_scale == 'dynamic': + optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer) + else: + # loss_scale is a number. We interpret that as a fixed loss scale. + optimizer = tf.keras.mixed_precision.LossScaleOptimizer( + optimizer, dynamic=False, initial_scale=loss_scale) + if use_graph_rewrite: + # Note: the model dtype must be 'float32', which will ensure + # tf.keras.mixed_precision and enable_mixed_precision_graph_rewrite do not + # double up. + optimizer = ( + tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite( + optimizer)) + return optimizer + + +def set_mixed_precision_policy(dtype, loss_scale=None): + """Sets the global `tf.keras.mixed_precision.Policy`.""" + # TODO(b/191894773): Remove loss_scale argument + assert loss_scale is None, ( + 'The loss_scale argument must be None. The argument exists for ' + 'historical reasons and will be removed soon.') + if dtype == tf.float16: + tf.keras.mixed_precision.set_global_policy('mixed_float16') + elif dtype == tf.bfloat16: + tf.keras.mixed_precision.set_global_policy('mixed_bfloat16') + elif dtype == tf.float32: + tf.keras.mixed_precision.set_global_policy('float32') + else: + raise ValueError('Unexpected dtype: %s' % dtype) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/policies.py b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/policies.py new file mode 100644 index 000000000..14155214d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/policies.py @@ -0,0 +1,173 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
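# Editorial sketch (not part of this patch): typical use of the helpers in
# performance.py -- set the global policy first, then wrap the optimizer so
# float16 gradients are loss-scaled.  The SGD optimizer here is illustrative.
import tensorflow as tf
from modeling import performance

performance.set_mixed_precision_policy(tf.float16)        # -> 'mixed_float16'
opt = performance.configure_optimizer(
    tf.keras.optimizers.SGD(0.1, momentum=0.9), use_float16=True)
assert isinstance(opt, tf.keras.mixed_precision.LossScaleOptimizer)

# A fixed loss scale is also supported:
fixed_opt = performance.configure_optimizer(
    tf.keras.optimizers.SGD(0.1), use_float16=True, loss_scale=128.0)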
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base ProgressivePolicy definition for progressive training. + +To write a progressive model, subclass ProgressivePolicy and implement its +abstract methods to handle each training stage. +""" + +import abc +from typing import Any, Mapping +from absl import logging +import dataclasses +import six +import tensorflow as tf +from modeling.hyperparams import base_config +from modeling.progressive import utils + + +@dataclasses.dataclass +class ProgressiveConfig(base_config.Config): + pass + + +@six.add_metaclass(abc.ABCMeta) +class ProgressivePolicy: + """The APIs for handling progressive training stages. + + Attributes: + cur_model: The model for the current progressive training stage. + cur_train_dataset: The train dataset function for the current stage. + cur_eval_dataset: The eval dataset function for the current stage. + cur_optimizer: The optimizer for the current stage. + cur_checkpoint_items: Items to be saved in and restored from checkpoints, + for the progressive trainer. + is_last_stage: Whether it is currently in the last stage. + + Interfaces: + is_stage_advancing: Returns if progressive training is advancing to the + next stage. + update_pt_stage: Update progressive training stage. + """ + + def __init__(self): + """Initialize stage policy.""" + self._cur_train_dataset = None + self._cur_eval_dataset = None + self._volatiles = utils.VolatileTrackable(optimizer=None, model=None) + + stage_id = 0 + self._stage_id = tf.Variable( + stage_id, + trainable=False, + dtype=tf.int64, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + shape=[]) + self._volatiles.reassign_trackable( + optimizer=self.get_optimizer(stage_id), + model=self.get_model(stage_id, old_model=None)) + + def compute_stage_id(self, global_step: int) -> int: + for stage_id in range(self.num_stages()): + global_step -= self.num_steps(stage_id) + if global_step < 0: + return stage_id + logging.error('Global step %d found no matching progressive stages. ' + 'Default to the last stage.', global_step) + return self.num_stages() - 1 + + @abc.abstractmethod + def num_stages(self) -> int: + """Return the total number of progressive stages.""" + pass + + @abc.abstractmethod + def num_steps(self, stage_id: int) -> int: + """Return the total number of steps in this stage.""" + pass + + @abc.abstractmethod + def get_model(self, + stage_id: int, + old_model: tf.keras.Model = None) -> tf.keras.Model: + """Return model for this stage. 
For initialization, `old_model` = None.""" + pass + + @abc.abstractmethod + def get_optimizer(self, stage_id: int) -> tf.keras.optimizers.Optimizer: + """Return optimizer for this stage.""" + pass + + @abc.abstractmethod + def get_train_dataset(self, stage_id: int) -> tf.data.Dataset: + """Return training Dataset for this stage.""" + pass + + @abc.abstractmethod + def get_eval_dataset(self, stage_id: int) -> tf.data.Dataset: + """Return evaluation Dataset for this stage.""" + pass + + @property + def cur_model(self) -> tf.keras.Model: + return self._volatiles.model + + @property + def cur_train_dataset(self) -> tf.data.Dataset: + if self._cur_train_dataset is None: + self._cur_train_dataset = self.get_train_dataset(self._stage_id.numpy()) + return self._cur_train_dataset + + @property + def cur_eval_dataset(self) -> tf.data.Dataset: + if self._cur_eval_dataset is None: + self._cur_eval_dataset = self.get_eval_dataset(self._stage_id.numpy()) + return self._cur_eval_dataset + + @property + def cur_optimizer(self) -> tf.keras.optimizers.Optimizer: + return self._volatiles.optimizer + + @property + def is_last_stage(self) -> bool: + stage_id = self._stage_id.numpy() + return stage_id >= self.num_stages() - 1 + + @property + def cur_checkpoint_items(self) -> Mapping[str, Any]: + return dict(stage_id=self._stage_id, volatiles=self._volatiles) + + def is_stage_advancing(self, global_step: int) -> bool: + old_stage_id = self._stage_id.numpy() + new_stage_id = self.compute_stage_id(global_step) + return old_stage_id != new_stage_id + + def update_pt_stage(self, global_step: int, pass_old_model=True) -> None: + """Update progressive training internal status. + + Call this after a training loop ends. + + Args: + global_step: an integer scalar of the current global step. + pass_old_model: whether to pass the old_model to get_model() function. + This is set to False if the old_model is irrelevant (e.g, just a default + model from stage 0). + """ + old_stage_id = self._stage_id.numpy() + new_stage_id = self.compute_stage_id(global_step) + logging.info('Switching stage from %d to %d', old_stage_id, new_stage_id) + + # Update stage id. + self._stage_id.assign(new_stage_id) + # Update dataset function. + self._cur_train_dataset = None + self._cur_eval_dataset = None + + # Update optimizer and model. + new_optimizer = self.get_optimizer(new_stage_id) + self._volatiles.reassign_trackable(optimizer=new_optimizer) + new_model = self.get_model( + new_stage_id, old_model=self.cur_model if pass_old_model else None) + self._volatiles.reassign_trackable(model=new_model) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train.py b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train.py new file mode 100644 index 000000000..5c106687d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train.py @@ -0,0 +1,69 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
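# Editorial sketch (not part of this patch): a minimal two-stage
# ProgressivePolicy.  Real policies also implement base_task.Task so the
# progressive trainer can drive them; the toy model and dataset here only
# show which hooks a subclass must provide.
import tensorflow as tf
from modeling.progressive import policies


class TwoStagePolicy(policies.ProgressivePolicy):

  def num_stages(self):
    return 2

  def num_steps(self, stage_id):
    return 1000 if stage_id == 0 else 4000

  def get_model(self, stage_id, old_model=None):
    width = 64 if stage_id == 0 else 256   # grow the model in stage 1
    return tf.keras.Sequential(
        [tf.keras.layers.Dense(width), tf.keras.layers.Dense(1)])

  def get_optimizer(self, stage_id):
    return tf.keras.optimizers.Adam(1e-3 if stage_id == 0 else 1e-4)

  def get_train_dataset(self, stage_id):
    data = (tf.zeros([8, 4]), tf.zeros([8, 1]))
    return tf.data.Dataset.from_tensor_slices(data).batch(4)

  def get_eval_dataset(self, stage_id):
    return self.get_train_dataset(stage_id)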
+ +"""TFM binary for the progressive trainer.""" + +from absl import app +from absl import flags +import gin + +from common import distribute_utils +# pylint: disable=unused-import +from common import registry_imports +# pylint: enable=unused-import +from common import flags as tfm_flags +from core import task_factory +from core import train_utils +from modeling import performance +from modeling.progressive import train_lib + +FLAGS = flags.FLAGS + + +def main(_): + gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params) + params = train_utils.parse_configuration(FLAGS) + model_dir = FLAGS.model_dir + if 'train' in FLAGS.mode: + # Pure eval modes do not output yaml files. Otherwise continuous eval job + # may race against the train job for writing the same file. + train_utils.serialize_config(params, model_dir) + + # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' + # can have significant impact on model speeds by utilizing float16 in case of + # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when + # dtype is float16 + if params.runtime.mixed_precision_dtype: + performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype) + distribution_strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=params.runtime.distribution_strategy, + all_reduce_alg=params.runtime.all_reduce_alg, + num_gpus=params.runtime.num_gpus, + tpu_address=params.runtime.tpu, + **params.runtime.model_parallelism()) + with distribution_strategy.scope(): + task = task_factory.get_task(params.task, logging_dir=model_dir) + + train_lib.run_experiment( + distribution_strategy=distribution_strategy, + task=task, + mode=FLAGS.mode, + params=params, + model_dir=model_dir) + + train_utils.save_gin_config(FLAGS.mode, model_dir) + +if __name__ == '__main__': + tfm_flags.define_flags() + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train_lib.py b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train_lib.py new file mode 100644 index 000000000..409c2108f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/train_lib.py @@ -0,0 +1,126 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TFM progressive training driver library. + +Compared to the common training driver, the only difference is that we use +prog_trainer_lib.ProgressiveTrainer instead of the base trainer. 
+""" + +# pytype: disable=attribute-error +import os +from typing import Any, Mapping, Tuple + +# Import libraries +from absl import logging +import orbit +import tensorflow as tf +from core import base_task +from core import config_definitions +from core import train_lib as base_train_lib +from modeling.progressive import trainer as prog_trainer_lib + + +def run_experiment(distribution_strategy: tf.distribute.Strategy, + task: base_task.Task, + mode: str, + params: config_definitions.ExperimentConfig, + model_dir: str, + run_post_eval: bool = False, + save_summary: bool = True) \ +-> Tuple[tf.keras.Model, Mapping[str, Any]]: + """Runs train/eval configured by the experiment params. + + Args: + distribution_strategy: A distribution distribution_strategy. + task: A Task instance. + mode: A 'str', specifying the mode. Can be 'train', 'eval', 'train_and_eval' + or 'continuous_eval'. + params: ExperimentConfig instance. + model_dir: A 'str', a path to store model checkpoints and summaries. + run_post_eval: Whether to run post eval once after training, metrics logs + are returned. + save_summary: Whether to save train and validation summary. + + Returns: + A 2-tuple of (model, eval_logs). + model: `tf.keras.Model` instance. + eval_logs: returns eval metrics logs when run_post_eval is set to True, + otherwise, returns {}. + """ + + with distribution_strategy.scope(): + logging.info('Running progressive trainer.') + trainer = prog_trainer_lib.ProgressiveTrainer( + params, task, ckpt_dir=model_dir, + train='train' in mode, + evaluate=('eval' in mode) or run_post_eval, + checkpoint_exporter=base_train_lib.maybe_create_best_ckpt_exporter( + params, model_dir)) + + if trainer.checkpoint: + checkpoint_manager = tf.train.CheckpointManager( + trainer.checkpoint, + directory=model_dir, + max_to_keep=params.trainer.max_to_keep, + step_counter=trainer.global_step, + checkpoint_interval=params.trainer.checkpoint_interval, + init_fn=trainer.initialize) + else: + checkpoint_manager = None + + controller = orbit.Controller( + strategy=distribution_strategy, + trainer=trainer if 'train' in mode else None, + evaluator=trainer, + global_step=trainer.global_step, + steps_per_loop=params.trainer.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(model_dir, 'train') if (save_summary) else None, + eval_summary_dir=os.path.join(model_dir, 'validation') if + (save_summary) else None, + summary_interval=params.trainer.summary_interval if + (save_summary) else None) + + logging.info('Starts to execute mode: %s', mode) + with distribution_strategy.scope(): + if mode == 'train': + controller.train(steps=params.trainer.train_steps) + elif mode == 'train_and_eval': + controller.train_and_evaluate( + train_steps=params.trainer.train_steps, + eval_steps=params.trainer.validation_steps, + eval_interval=params.trainer.validation_interval) + elif mode == 'eval': + controller.evaluate(steps=params.trainer.validation_steps) + elif mode == 'continuous_eval': + + def timeout_fn(): + if trainer.global_step.numpy() >= params.trainer.train_steps: + return True + return False + + controller.evaluate_continuously( + steps=params.trainer.validation_steps, + timeout=params.trainer.continuous_eval_timeout, + timeout_fn=timeout_fn) + else: + raise NotImplementedError('The mode is not implemented: %s' % mode) + + if run_post_eval: + with distribution_strategy.scope(): + return trainer.model, trainer.evaluate( + tf.convert_to_tensor(params.trainer.validation_steps)) + else: + return trainer.model, {} diff --git 
a/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/trainer.py b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/trainer.py new file mode 100644 index 000000000..bc94c1632 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/trainer.py @@ -0,0 +1,294 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Progressive Trainer implementation. + +The trainer implements the Orbit `StandardTrainable` and +`StandardEvaluable` interfaces. Trainers inside this project should be +interchangable and independent on model architectures and tasks. +""" +import os +from typing import Any, Optional + +# Import libraries +from absl import logging + +import dataclasses +import gin +import orbit +import tensorflow as tf +from core import base_task +from core import base_trainer as trainer_lib +from core import config_definitions +from modeling.progressive import policies +from modeling.progressive import utils + +ExperimentConfig = config_definitions.ExperimentConfig + + +@dataclasses.dataclass +class ProgressiveTrainerConfig(config_definitions.TrainerConfig): + """Configuration for progressive trainer. + + Attributes: + progressive: A task-specific config. Users can subclass ProgressiveConfig + and define any task-specific settings in their subclass. + export_checkpoint: A bool. Whether to export checkpoints in non-progressive + manner (without the volatiles wrapper) such that your down-stream tasks + can load checkpoints from a progressive trainer as if it is a regular + checkpoint. + export_checkpoint_interval: A bool. The number of steps between exporting + checkpoints. If None (by default), will use the same value as + TrainerConfig.checkpoint_interval. + export_max_to_keep: The maximum number of exported checkpoints to keep. + If None (by default), will use the same value as + TrainerConfig.max_to_keep. + export_only_final_stage_ckpt: A bool. Whether to just export checkpoints + during the final progressive training stage. In other words, whether to + not export small, partial models. In many cases, it is not meaningful to + finetune a small, partial model in down-stream tasks. + """ + progressive: Optional[policies.ProgressiveConfig] = None + export_checkpoint: bool = True + export_checkpoint_interval: Optional[int] = None + export_max_to_keep: Optional[int] = None + export_only_final_stage_ckpt: bool = True + + +@gin.configurable +class ProgressiveTrainer(trainer_lib.Trainer): + """Implements the progressive trainer shared for TensorFlow models.""" + + def __init__( + self, + config: ExperimentConfig, + prog_task: base_task.Task, # also implemented ProgressivePolicy. + ckpt_dir: str = '', + train: bool = True, + evaluate: bool = True, + checkpoint_exporter: Any = None): + """Initialize common trainer for TensorFlow models. + + Args: + config: An `ExperimentConfig` instance specifying experiment config. 
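# Editorial sketch (not part of this patch): a ProgressiveTrainerConfig that
# exports plain checkpoints every 2000 steps, but only in the final stage.
# Values are illustrative; train_steps and checkpoint_interval come from the
# base TrainerConfig that this trainer reads elsewhere.
from modeling.progressive import trainer as prog_trainer_lib

trainer_cfg = prog_trainer_lib.ProgressiveTrainerConfig(
    train_steps=10000,
    checkpoint_interval=1000,
    export_checkpoint=True,
    export_checkpoint_interval=2000,
    export_only_final_stage_ckpt=True)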
+ prog_task: An instance both implemented policies.ProgressivePolicy and + base_task.Task. + ckpt_dir: Checkpoint directory. + train: bool, whether or not this trainer will be used for training. + default to True. + evaluate: bool, whether or not this trainer will be used for evaluation. + default to True. + checkpoint_exporter: an object that has the `maybe_export_checkpoint` + interface. + """ + # Gets the current distribution strategy. If not inside any strategy scope, + # it gets a single-replica no-op strategy. + self._strategy = tf.distribute.get_strategy() + self._config = config + self._runtime_options = trainer_lib.get_runtime_options(config) + self._task = prog_task + + # Directory for non-progressive checkpoint + self._export_ckpt_dir = os.path.join(ckpt_dir, 'exported_ckpts') + tf.io.gfile.makedirs(self._export_ckpt_dir) + self._export_ckpt_manager = None + + # Receive other checkpoint export, e.g, best checkpoint exporter. + # TODO(lehou): unify the checkpoint exporting logic, although the default + # setting does not use checkpoint_exporter. + self._checkpoint_exporter = checkpoint_exporter + + self._global_step = orbit.utils.create_global_step() + + self._checkpoint = utils.CheckpointWithHooks( + before_load_hook=self._update_pt_stage_from_ckpt, + global_step=self.global_step, + **self._task.cur_checkpoint_items) + + self._train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32) + self._validation_loss = tf.keras.metrics.Mean( + 'validation_loss', dtype=tf.float32) + self._train_metrics = self.task.build_metrics( + training=True) + self.model.metrics + self._validation_metrics = self.task.build_metrics( + training=False) + self.model.metrics + + if train: + orbit.StandardTrainer.__init__( + self, + None, # Manage train_dataset by ourselves, not by StandardTrainer. + options=orbit.StandardTrainerOptions( + use_tf_while_loop=config.trainer.train_tf_while_loop, + use_tf_function=config.trainer.train_tf_function)) + + if evaluate: + orbit.StandardEvaluator.__init__( + self, + None, # Manage train_dataset by ourselves, not by StandardEvaluator. + options=orbit.StandardEvaluatorOptions( + use_tf_function=config.trainer.eval_tf_function)) + + @property + def model(self): + return self._task.cur_model + + @property + def optimizer(self): + return self._task.cur_optimizer + + # override + @property + def train_dataset(self): + """Overriding StandardTrainer.train_dataset.""" + return self._task.cur_train_dataset + + # override + @train_dataset.setter + def train_dataset(self, _): + raise SyntaxError('Please do not set train_dataset. Progressive training ' + 'relies on progressive policy to manager train dataset.') + + # override + @property + def eval_dataset(self): + """Overriding StandardEvaluator.eval_dataset.""" + return self._task.cur_eval_dataset + + # override + @eval_dataset.setter + def eval_dataset(self, _): + raise SyntaxError('Please do not set eval_dataset. 
Progressive training ' + 'relies on progressive policy to manager eval dataset.') + + def train_loop_end(self): + """See base class.""" + logs = {} + for metric in self.train_metrics + [self.train_loss]: + logs[metric.name] = metric.result() + metric.reset_states() + if callable(self.optimizer.learning_rate): + logs['learning_rate'] = self.optimizer.learning_rate( + self.optimizer.iterations) + else: + logs['learning_rate'] = self.optimizer.learning_rate + + self._maybe_export_non_progressive_checkpoint(self._export_ckpt_dir) + if self._task.is_stage_advancing(self.global_step.numpy()): + old_train_dataset = self.train_dataset + + # Update progressive properties + self._task.update_pt_stage(self.global_step.numpy()) + + # Setting `self._train_loop_fn` and `self._eval_loop_fn` to None will + # rebuild the train and eval functions with the updated model. + self._train_loop_fn = None + self._eval_loop_fn = None + + if self.train_dataset != old_train_dataset: + # Setting `self._train_iter` to None will rebuild the dataset iterator. + self._train_iter = None + + # Setting `self._export_ckpt_manager` to None will rebuild the checkpoint + # for exporting. + self._export_ckpt_manager = None + + return logs + + def _update_pt_stage_from_ckpt(self, ckpt_file): + """Update stage properties based on the global_step variable in a ckpt file. + + Before loading variables from a checkpoint file, we need to go to the + correct stage and build corresponding model and optimizer, to make sure that + we retore variables of the right model and optimizer. + + Args: + ckpt_file: Checkpoint file that will be restored/read from. + """ + if not ckpt_file: + return + ckpt = tf.train.Checkpoint(global_step=self.global_step) + ckpt.read(ckpt_file).expect_partial().assert_existing_objects_matched() + + if self._task.is_stage_advancing(self.global_step.numpy()): + old_train_dataset = self.train_dataset + + # Update progressive properties + self._task.update_pt_stage(self.global_step.numpy(), pass_old_model=False) + + # Setting `self._train_loop_fn` and `self._eval_loop_fn` to None will + # rebuild the train and eval functions with the updated model. + self._train_loop_fn = None + self._eval_loop_fn = None + + if self.train_dataset != old_train_dataset: + # Setting `self._train_iter` to None will rebuild the dataset iterator. + self._train_iter = None + + # Setting `self._export_ckpt_manager` to None will rebuild the checkpoint + # for exporting. + self._export_ckpt_manager = None + + def _maybe_export_non_progressive_checkpoint(self, export_ckpt_dir): + """Export checkpoints in non-progressive format. + + This basically removes the wrapping of self._task.cur_checkpoint_items + -- just save the model, optimizer, etc., directly. + The purpose is to let your down-stream tasks to use these checkpoints. + + Args: + export_ckpt_dir: A str. folder of exported checkpoints. + """ + if not self.config.trainer.export_checkpoint: + logging.info('Not exporting checkpoints.') + return + if not self._task.is_last_stage and ( + self.config.trainer.export_only_final_stage_ckpt): + logging.info('Not exporting checkpoints until the last stage.') + return + + if self._export_ckpt_manager is None: + # Create a checkpoint object just now, to make sure we use + # progressive_policy.cur_model and progressive_policy.cur_optimizer of the + # current stage. 
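# (Editorial note, not part of this patch.)  The exported checkpoint built below
# has the ordinary layout -- global_step, model, optimizer, plus any
# model-specific checkpoint_items -- so a downstream task can restore it with a
# plain tf.train.Checkpoint, for example:
#   ckpt = tf.train.Checkpoint(model=downstream_model)
#   ckpt.read(tf.train.latest_checkpoint(export_ckpt_dir)).expect_partial()
# where `downstream_model` is whatever model that task builds (hypothetical name).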
+ if hasattr(self.model, 'checkpoint_items'): + checkpoint_items = self.model.checkpoint_items + else: + checkpoint_items = {} + checkpoint = tf.train.Checkpoint( + global_step=self.global_step, + model=self.model, + optimizer=self.optimizer, + **checkpoint_items) + + max_to_keep = self.config.trainer.export_max_to_keep or ( + self.config.trainer.max_to_keep) + checkpoint_interval = self.config.trainer.export_checkpoint_interval or ( + self.config.trainer.checkpoint_interval) + self._export_ckpt_manager = tf.train.CheckpointManager( + checkpoint, + directory=export_ckpt_dir, + checkpoint_name='ckpt', + step_counter=self.global_step, + max_to_keep=max_to_keep, + checkpoint_interval=checkpoint_interval, + ) + + # Make sure we export the last checkpoint. + last_checkpoint = ( + self.global_step.numpy() == self._config.trainer.train_steps) + checkpoint_path = self._export_ckpt_manager.save( + checkpoint_number=self.global_step.numpy(), + check_interval=not last_checkpoint) + if checkpoint_path: + logging.info('Checkpoints exported: %s.', checkpoint_path) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/utils.py b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/utils.py new file mode 100644 index 000000000..192170cb8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/progressive/utils.py @@ -0,0 +1,56 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Util classes and functions.""" + +from absl import logging +import tensorflow as tf + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.python.training.tracking import tracking + + +class VolatileTrackable(tracking.AutoTrackable): + """A util class to keep Trackables that might change instances.""" + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + def reassign_trackable(self, **kwargs): + for k, v in kwargs.items(): + delattr(self, k) # untrack this object + setattr(self, k, v) # track the new object + + +class CheckpointWithHooks(tf.train.Checkpoint): + """Same as tf.train.Checkpoint but supports hooks. + + In progressive training, use this class instead of tf.train.Checkpoint. + + Since the network architecture changes during progressive training, we need to + prepare something (like switch to the correct architecture) before loading the + checkpoint. This class supports a hook that will be executed before checkpoint + loading. 
+ """ + + def __init__(self, before_load_hook, **kwargs): + self._before_load_hook = before_load_hook + super(CheckpointWithHooks, self).__init__(**kwargs) + + # override + def read(self, save_path, options=None): + self._before_load_hook(save_path) + logging.info('Ran before_load_hook.') + super(CheckpointWithHooks, self).read(save_path=save_path, options=options) diff --git a/nlp/text_classification/bert/tensorflow2.0/modeling/tf_utils.py b/nlp/text_classification/bert/tensorflow2.0/modeling/tf_utils.py new file mode 100644 index 000000000..199662f74 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/modeling/tf_utils.py @@ -0,0 +1,200 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common TF utilities.""" + +import six +import tensorflow as tf + +from tensorflow.python.util import deprecation +from modeling import activations + + +@deprecation.deprecated( + None, + "tf.keras.layers.Layer supports multiple positional args and kwargs as " + "input tensors. pack/unpack inputs to override __call__ is no longer " + "needed.") +def pack_inputs(inputs): + """Pack a list of `inputs` tensors to a tuple. + + Args: + inputs: a list of tensors. + + Returns: + a tuple of tensors. if any input is None, replace it with a special constant + tensor. + """ + inputs = tf.nest.flatten(inputs) + outputs = [] + for x in inputs: + if x is None: + outputs.append(tf.constant(0, shape=[], dtype=tf.int32)) + else: + outputs.append(x) + return tuple(outputs) + + +@deprecation.deprecated( + None, + "tf.keras.layers.Layer supports multiple positional args and kwargs as " + "input tensors. pack/unpack inputs to override __call__ is no longer " + "needed.") +def unpack_inputs(inputs): + """unpack a tuple of `inputs` tensors to a tuple. + + Args: + inputs: a list of tensors. + + Returns: + a tuple of tensors. if any input is a special constant tensor, replace it + with None. + """ + inputs = tf.nest.flatten(inputs) + outputs = [] + for x in inputs: + if is_special_none_tensor(x): + outputs.append(None) + else: + outputs.append(x) + x = tuple(outputs) + + # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check + # from triggering. + if len(x) == 1: + return x[0] + return tuple(outputs) + + +def is_special_none_tensor(tensor): + """Checks if a tensor is a special None Tensor.""" + return tensor.shape.ndims == 0 and tensor.dtype == tf.int32 + + +def get_activation(identifier, use_keras_layer=False): + """Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`. + + It checks string first and if it is one of customized activation not in TF, + the corresponding activation will be returned. For non-customized activation + names and callable identifiers, always fallback to tf.keras.activations.get. + + Prefers using keras layers when use_keras_layer=True. Now it only supports + 'relu', 'linear', 'identity', 'swish'. + + Args: + identifier: String name of the activation function or callable. 
+ use_keras_layer: If True, use keras layer if identifier is allow-listed. + + Returns: + A Python function corresponding to the activation function or a keras + activation layer when use_keras_layer=True. + """ + if isinstance(identifier, six.string_types): + identifier = str(identifier).lower() + if use_keras_layer: + keras_layer_allowlist = { + "relu": "relu", + "linear": "linear", + "identity": "linear", + "swish": "swish", + "relu6": tf.nn.relu6, + } + if identifier in keras_layer_allowlist: + return tf.keras.layers.Activation(keras_layer_allowlist[identifier]) + name_to_fn = { + "gelu": activations.gelu, + "simple_swish": activations.simple_swish, + "hard_swish": activations.hard_swish, + "relu6": activations.relu6, + "hard_sigmoid": activations.hard_sigmoid, + "identity": activations.identity, + } + if identifier in name_to_fn: + return tf.keras.activations.get(name_to_fn[identifier]) + return tf.keras.activations.get(identifier) + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + raise ValueError( + "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not " + "equal to the expected tensor rank `%s`" % + (name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def safe_mean(losses): + """Computes a safe mean of the losses. + + Args: + losses: `Tensor` whose elements contain individual loss measurements. + + Returns: + A scalar representing the mean of `losses`. If `num_present` is zero, + then zero is returned. 
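# Editorial sketch (not part of this patch): get_shape_list mixes static Python
# ints with dynamic scalar tensors, and safe_mean returns 0 for empty losses.
import tensorflow as tf
from modeling import tf_utils

@tf.function(input_signature=[tf.TensorSpec([None, 128, 768], tf.float32)])
def head_of_batch(x):
  batch, seq_len, width = tf_utils.get_shape_list(x, expected_rank=3)
  # seq_len and width are the ints 128 and 768; batch is a scalar tf.Tensor.
  return tf.zeros([batch, width])

empty_mean = tf_utils.safe_mean(tf.constant([], dtype=tf.float32))  # -> 0.0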
+ """ + total = tf.reduce_sum(losses) + num_elements = tf.cast(tf.size(losses), dtype=losses.dtype) + return tf.math.divide_no_nan(total, num_elements) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/bert.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/bert.py new file mode 100644 index 000000000..66c951ff4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/bert.py @@ -0,0 +1,43 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multi-head BERT encoder network with classification heads. + +Includes configurations and instantiation methods. +""" +from typing import List, Optional, Text + +import dataclasses + +from modeling.hyperparams import base_config +from nlp_configs import encoders + + +@dataclasses.dataclass +class ClsHeadConfig(base_config.Config): + inner_dim: int = 0 + num_classes: int = 2 + activation: Optional[Text] = "tanh" + dropout_rate: float = 0.0 + cls_token_idx: int = 0 + name: Optional[Text] = None + + +@dataclasses.dataclass +class PretrainerConfig(base_config.Config): + """Pretrainer configuration.""" + encoder: encoders.EncoderConfig = encoders.EncoderConfig() + cls_heads: List[ClsHeadConfig] = dataclasses.field(default_factory=list) + mlm_activation: str = "gelu" + mlm_initializer_range: float = 0.02 diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/electra.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/electra.py new file mode 100644 index 000000000..8da8fe794 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/electra.py @@ -0,0 +1,36 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ELECTRA model configurations and instantiation methods.""" +from typing import List + +import dataclasses + +from modeling.hyperparams import base_config +from nlp_configs import bert +from nlp_configs import encoders + + +@dataclasses.dataclass +class ElectraPretrainerConfig(base_config.Config): + """ELECTRA pretrainer configuration.""" + num_masked_tokens: int = 76 + sequence_length: int = 512 + num_classes: int = 2 + discriminator_loss_weight: float = 50.0 + tie_embeddings: bool = True + disallow_correct: bool = False + generator_encoder: encoders.EncoderConfig = encoders.EncoderConfig() + discriminator_encoder: encoders.EncoderConfig = encoders.EncoderConfig() + cls_heads: List[bert.ClsHeadConfig] = dataclasses.field(default_factory=list) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/encoders.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/encoders.py new file mode 100644 index 000000000..0b0288838 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/encoders.py @@ -0,0 +1,448 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Transformer Encoders. + +Includes configurations and factory methods. +""" +from typing import Optional + +import dataclasses +import gin +import tensorflow as tf + +from modeling import hyperparams +from modeling import tf_utils +from nlp_modeling import layers +from nlp_modeling import networks +from nlp.projects.bigbird import encoder as bigbird_encoder + + +@dataclasses.dataclass +class BertEncoderConfig(hyperparams.Config): + """BERT encoder configuration.""" + vocab_size: int = 30522 + hidden_size: int = 768 + num_layers: int = 12 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + intermediate_size: int = 3072 + dropout_rate: float = 0.1 + attention_dropout_rate: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + embedding_size: Optional[int] = None + output_range: Optional[int] = None + return_all_encoder_outputs: bool = False + + +@dataclasses.dataclass +class MobileBertEncoderConfig(hyperparams.Config): + """MobileBERT encoder configuration. + + Attributes: + word_vocab_size: number of words in the vocabulary. + word_embed_size: word embedding size. + type_vocab_size: number of word types. + max_sequence_length: maximum length of input sequence. + num_blocks: number of transformer block in the encoder model. + hidden_size: the hidden size for the transformer block. + num_attention_heads: number of attention heads in the transformer block. 
+ intermediate_size: the size of the "intermediate" (a.k.a., feed forward) + layer. + hidden_activation: the non-linear activation function to apply to the + output of the intermediate/feed-forward layer. + hidden_dropout_prob: dropout probability for the hidden layers. + attention_probs_dropout_prob: dropout probability of the attention + probabilities. + intra_bottleneck_size: the size of bottleneck. + initializer_range: The stddev of the truncated_normal_initializer for + initializing all weight matrices. + use_bottleneck_attention: Use attention inputs from the bottleneck + transformation. If true, the following `key_query_shared_bottleneck` + will be ignored. + key_query_shared_bottleneck: whether to share linear transformation for keys + and queries. + num_feedforward_networks: number of stacked feed-forward networks. + normalization_type: the type of normalization_type, only 'no_norm' and + 'layer_norm' are supported. 'no_norm' represents the element-wise linear + transformation for the student model, as suggested by the original + MobileBERT paper. 'layer_norm' is used for the teacher model. + classifier_activation: if using the tanh activation for the final + representation of the [CLS] token in fine-tuning. + """ + word_vocab_size: int = 30522 + word_embed_size: int = 128 + type_vocab_size: int = 2 + max_sequence_length: int = 512 + num_blocks: int = 24 + hidden_size: int = 512 + num_attention_heads: int = 4 + intermediate_size: int = 4096 + hidden_activation: str = "gelu" + hidden_dropout_prob: float = 0.1 + attention_probs_dropout_prob: float = 0.1 + intra_bottleneck_size: int = 1024 + initializer_range: float = 0.02 + use_bottleneck_attention: bool = False + key_query_shared_bottleneck: bool = False + num_feedforward_networks: int = 1 + normalization_type: str = "layer_norm" + classifier_activation: bool = True + input_mask_dtype: str = "int32" + + +@dataclasses.dataclass +class AlbertEncoderConfig(hyperparams.Config): + """ALBERT encoder configuration.""" + vocab_size: int = 30000 + embedding_width: int = 128 + hidden_size: int = 768 + num_layers: int = 12 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + intermediate_size: int = 3072 + dropout_rate: float = 0.0 + attention_dropout_rate: float = 0.0 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + + +@dataclasses.dataclass +class BigBirdEncoderConfig(hyperparams.Config): + """BigBird encoder configuration.""" + vocab_size: int = 50358 + hidden_size: int = 768 + num_layers: int = 12 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + intermediate_size: int = 3072 + dropout_rate: float = 0.1 + attention_dropout_rate: float = 0.1 + max_position_embeddings: int = 4096 + num_rand_blocks: int = 3 + block_size: int = 64 + type_vocab_size: int = 16 + initializer_range: float = 0.02 + embedding_width: Optional[int] = None + use_gradient_checkpointing: bool = False + + +@dataclasses.dataclass +class KernelEncoderConfig(hyperparams.Config): + """Linear encoder configuration.""" + vocab_size: int = 30522 + hidden_size: int = 768 + num_layers: int = 12 + num_attention_heads: int = 12 + hidden_activation: str = "gelu" + intermediate_size: int = 3072 + dropout_rate: float = 0.1 + attention_dropout_rate: float = 0.1 + max_position_embeddings: int = 512 + type_vocab_size: int = 2 + initializer_range: float = 0.02 + embedding_size: Optional[int] = None + feature_transform: str = "exp" + num_random_features: int = 256 + redraw: bool = False + is_short_seq: bool 
= False + begin_kernel: int = 0 + + +@dataclasses.dataclass +class XLNetEncoderConfig(hyperparams.Config): + """XLNet encoder configuration.""" + vocab_size: int = 32000 + num_layers: int = 24 + hidden_size: int = 1024 + num_attention_heads: int = 16 + head_size: int = 64 + inner_size: int = 4096 + inner_activation: str = "gelu" + dropout_rate: float = 0.1 + attention_dropout_rate: float = 0.1 + attention_type: str = "bi" + bi_data: bool = False + tie_attention_biases: bool = False + memory_length: int = 0 + same_length: bool = False + clamp_length: int = -1 + reuse_length: int = 0 + use_cls_mask: bool = False + embedding_width: int = 1024 + initializer_range: float = 0.02 + two_stream: bool = False + + +@dataclasses.dataclass +class EncoderConfig(hyperparams.OneOfConfig): + """Encoder configuration.""" + type: Optional[str] = "bert" + albert: AlbertEncoderConfig = AlbertEncoderConfig() + bert: BertEncoderConfig = BertEncoderConfig() + bigbird: BigBirdEncoderConfig = BigBirdEncoderConfig() + kernel: KernelEncoderConfig = KernelEncoderConfig() + mobilebert: MobileBertEncoderConfig = MobileBertEncoderConfig() + xlnet: XLNetEncoderConfig = XLNetEncoderConfig() + + +@gin.configurable +def build_encoder(config: EncoderConfig, + embedding_layer: Optional[tf.keras.layers.Layer] = None, + encoder_cls=None, + bypass_config: bool = False): + """Instantiate a Transformer encoder network from EncoderConfig. + + Args: + config: the one-of encoder config, which provides encoder parameters of a + chosen encoder. + embedding_layer: an external embedding layer passed to the encoder. + encoder_cls: an external encoder cls not included in the supported encoders, + usually used by gin.configurable. + bypass_config: whether to ignore config instance to create the object with + `encoder_cls`. + + Returns: + An encoder instance. 
+ """ + if bypass_config: + return encoder_cls() + encoder_type = config.type + encoder_cfg = config.get() + if encoder_cls and encoder_cls.__name__ == "EncoderScaffold": + embedding_cfg = dict( + vocab_size=encoder_cfg.vocab_size, + type_vocab_size=encoder_cfg.type_vocab_size, + hidden_size=encoder_cfg.hidden_size, + max_seq_length=encoder_cfg.max_position_embeddings, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + dropout_rate=encoder_cfg.dropout_rate, + ) + hidden_cfg = dict( + num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + intermediate_activation=tf_utils.get_activation( + encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + ) + kwargs = dict( + embedding_cfg=embedding_cfg, + hidden_cfg=hidden_cfg, + num_hidden_instances=encoder_cfg.num_layers, + pooled_output_dim=encoder_cfg.hidden_size, + pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + return_all_layer_outputs=encoder_cfg.return_all_encoder_outputs, + dict_outputs=True) + return encoder_cls(**kwargs) + + if encoder_type == "mobilebert": + return networks.MobileBERTEncoder( + word_vocab_size=encoder_cfg.word_vocab_size, + word_embed_size=encoder_cfg.word_embed_size, + type_vocab_size=encoder_cfg.type_vocab_size, + max_sequence_length=encoder_cfg.max_sequence_length, + num_blocks=encoder_cfg.num_blocks, + hidden_size=encoder_cfg.hidden_size, + num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + intermediate_act_fn=encoder_cfg.hidden_activation, + hidden_dropout_prob=encoder_cfg.hidden_dropout_prob, + attention_probs_dropout_prob=encoder_cfg.attention_probs_dropout_prob, + intra_bottleneck_size=encoder_cfg.intra_bottleneck_size, + initializer_range=encoder_cfg.initializer_range, + use_bottleneck_attention=encoder_cfg.use_bottleneck_attention, + key_query_shared_bottleneck=encoder_cfg.key_query_shared_bottleneck, + num_feedforward_networks=encoder_cfg.num_feedforward_networks, + normalization_type=encoder_cfg.normalization_type, + classifier_activation=encoder_cfg.classifier_activation, + input_mask_dtype=encoder_cfg.input_mask_dtype) + + if encoder_type == "albert": + return networks.AlbertEncoder( + vocab_size=encoder_cfg.vocab_size, + embedding_width=encoder_cfg.embedding_width, + hidden_size=encoder_cfg.hidden_size, + num_layers=encoder_cfg.num_layers, + num_attention_heads=encoder_cfg.num_attention_heads, + max_sequence_length=encoder_cfg.max_position_embeddings, + type_vocab_size=encoder_cfg.type_vocab_size, + intermediate_size=encoder_cfg.intermediate_size, + activation=tf_utils.get_activation(encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + dict_outputs=True) + + if encoder_type == "bigbird": + # TODO(frederickliu): Support use_gradient_checkpointing and update + # experiments to use the EncoderScaffold only. 
+ if encoder_cfg.use_gradient_checkpointing: + return bigbird_encoder.BigBirdEncoder( + vocab_size=encoder_cfg.vocab_size, + hidden_size=encoder_cfg.hidden_size, + num_layers=encoder_cfg.num_layers, + num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + activation=tf_utils.get_activation(encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + num_rand_blocks=encoder_cfg.num_rand_blocks, + block_size=encoder_cfg.block_size, + max_position_embeddings=encoder_cfg.max_position_embeddings, + type_vocab_size=encoder_cfg.type_vocab_size, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + embedding_width=encoder_cfg.embedding_width, + use_gradient_checkpointing=encoder_cfg.use_gradient_checkpointing) + embedding_cfg = dict( + vocab_size=encoder_cfg.vocab_size, + type_vocab_size=encoder_cfg.type_vocab_size, + hidden_size=encoder_cfg.hidden_size, + max_seq_length=encoder_cfg.max_position_embeddings, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + dropout_rate=encoder_cfg.dropout_rate) + attention_cfg = dict( + num_heads=encoder_cfg.num_attention_heads, + key_dim=int(encoder_cfg.hidden_size // encoder_cfg.num_attention_heads), + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + max_rand_mask_length=encoder_cfg.max_position_embeddings, + num_rand_blocks=encoder_cfg.num_rand_blocks, + from_block_size=encoder_cfg.block_size, + to_block_size=encoder_cfg.block_size, + ) + hidden_cfg = dict( + num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + intermediate_activation=tf_utils.get_activation( + encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + attention_cls=layers.BigBirdAttention, + attention_cfg=attention_cfg) + kwargs = dict( + embedding_cfg=embedding_cfg, + hidden_cls=layers.TransformerScaffold, + hidden_cfg=hidden_cfg, + num_hidden_instances=encoder_cfg.num_layers, + mask_cls=layers.BigBirdMasks, + mask_cfg=dict(block_size=encoder_cfg.block_size), + pooled_output_dim=encoder_cfg.hidden_size, + pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + return_all_layer_outputs=False, + dict_outputs=True, + layer_idx_as_attention_seed=True) + return networks.EncoderScaffold(**kwargs) + + if encoder_type == "kernel": + embedding_cfg = dict( + vocab_size=encoder_cfg.vocab_size, + type_vocab_size=encoder_cfg.type_vocab_size, + hidden_size=encoder_cfg.hidden_size, + max_seq_length=encoder_cfg.max_position_embeddings, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + dropout_rate=encoder_cfg.dropout_rate) + attention_cfg = dict( + num_heads=encoder_cfg.num_attention_heads, + key_dim=int(encoder_cfg.hidden_size // encoder_cfg.num_attention_heads), + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + feature_transform=encoder_cfg.feature_transform, + num_random_features=encoder_cfg.num_random_features, + redraw=encoder_cfg.redraw, + is_short_seq=encoder_cfg.is_short_seq, + begin_kernel=encoder_cfg.begin_kernel, + ) + hidden_cfg = dict( + 
num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + intermediate_activation=tf_utils.get_activation( + encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + kernel_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + attention_cls=layers.KernelAttention, + attention_cfg=attention_cfg) + kwargs = dict( + embedding_cfg=embedding_cfg, + hidden_cls=layers.TransformerScaffold, + hidden_cfg=hidden_cfg, + num_hidden_instances=encoder_cfg.num_layers, + mask_cls=layers.KernelMask, + pooled_output_dim=encoder_cfg.hidden_size, + pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + return_all_layer_outputs=False, + dict_outputs=True, + layer_idx_as_attention_seed=True) + return networks.EncoderScaffold(**kwargs) + + if encoder_type == "xlnet": + return networks.XLNetBase( + vocab_size=encoder_cfg.vocab_size, + num_layers=encoder_cfg.num_layers, + hidden_size=encoder_cfg.hidden_size, + num_attention_heads=encoder_cfg.num_attention_heads, + head_size=encoder_cfg.head_size, + inner_size=encoder_cfg.inner_size, + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + attention_type=encoder_cfg.attention_type, + bi_data=encoder_cfg.bi_data, + two_stream=encoder_cfg.two_stream, + tie_attention_biases=encoder_cfg.tie_attention_biases, + memory_length=encoder_cfg.memory_length, + clamp_length=encoder_cfg.clamp_length, + reuse_length=encoder_cfg.reuse_length, + inner_activation=encoder_cfg.inner_activation, + use_cls_mask=encoder_cfg.use_cls_mask, + embedding_width=encoder_cfg.embedding_width, + initializer=tf.keras.initializers.RandomNormal( + stddev=encoder_cfg.initializer_range)) + + # Uses the default BERTEncoder configuration schema to create the encoder. + # If it does not match, please add a switch branch by the encoder type. + return networks.BertEncoder( + vocab_size=encoder_cfg.vocab_size, + hidden_size=encoder_cfg.hidden_size, + num_layers=encoder_cfg.num_layers, + num_attention_heads=encoder_cfg.num_attention_heads, + intermediate_size=encoder_cfg.intermediate_size, + activation=tf_utils.get_activation(encoder_cfg.hidden_activation), + dropout_rate=encoder_cfg.dropout_rate, + attention_dropout_rate=encoder_cfg.attention_dropout_rate, + max_sequence_length=encoder_cfg.max_position_embeddings, + type_vocab_size=encoder_cfg.type_vocab_size, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + output_range=encoder_cfg.output_range, + embedding_width=encoder_cfg.embedding_size, + embedding_layer=embedding_layer, + return_all_encoder_outputs=encoder_cfg.return_all_encoder_outputs, + dict_outputs=True) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiment_configs.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiment_configs.py new file mode 100644 index 000000000..b9f80b26b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiment_configs.py @@ -0,0 +1,19 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
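# Editorial sketch (not part of this patch): building the default BERT branch
# from the one-of config and running it on toy inputs.  The small sizes are
# illustrative, and the input/output keys follow the dict conventions used by
# networks.BertEncoder with dict_outputs=True.
import tensorflow as tf
from nlp_configs import encoders

config = encoders.EncoderConfig(
    type='bert',
    bert=encoders.BertEncoderConfig(
        vocab_size=100, hidden_size=64, num_layers=2,
        num_attention_heads=2, intermediate_size=128))
encoder = encoders.build_encoder(config)

batch, seq_len = 2, 16
outputs = encoder(dict(
    input_word_ids=tf.zeros([batch, seq_len], tf.int32),
    input_mask=tf.ones([batch, seq_len], tf.int32),
    input_type_ids=tf.zeros([batch, seq_len], tf.int32)))
pooled = outputs['pooled_output']   # shape [batch, hidden_size]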
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Experiments definition.""" +# pylint: disable=unused-import +from nlp_configs import finetuning_experiments +from nlp_configs import pretraining_experiments +from nlp_configs import wmt_transformer_experiments diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/glue_mnli_matched.yaml b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/glue_mnli_matched.yaml new file mode 100644 index 000000000..29dfcb68b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/glue_mnli_matched.yaml @@ -0,0 +1,49 @@ +task: + hub_module_url: '' + model: + num_classes: 3 + init_checkpoint: '' + metric_type: 'accuracy' + train_data: + drop_remainder: true + global_batch_size: 32 + input_path: '' + is_training: true + seq_length: 128 + label_type: 'int' + validation_data: + drop_remainder: false + global_batch_size: 32 + input_path: '' + is_training: false + seq_length: 128 + label_type: 'int' +trainer: + checkpoint_interval: 3000 + optimizer_config: + learning_rate: + polynomial: + # 100% of train_steps. + decay_steps: 36813 + end_learning_rate: 0.0 + initial_learning_rate: 3.0e-05 + power: 1.0 + type: polynomial + optimizer: + type: adamw + warmup: + polynomial: + power: 1 + # ~10% of train_steps. + warmup_steps: 3681 + type: polynomial + steps_per_loop: 1000 + summary_interval: 1000 + # Training data size 392,702 examples, 3 epochs. + train_steps: 36813 + validation_interval: 6135 + # Eval data size = 9815 examples. 
+ validation_steps: 307 + best_checkpoint_export_subdir: 'best_ckpt' + best_checkpoint_eval_metric: 'cls_accuracy' + best_checkpoint_metric_comp: 'higher' diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/squad_v1.yaml b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/squad_v1.yaml new file mode 100644 index 000000000..a69710a58 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/experiments/squad_v1.yaml @@ -0,0 +1,50 @@ +task: + hub_module_url: '' + max_answer_length: 30 + n_best_size: 20 + null_score_diff_threshold: 0.0 + init_checkpoint: '' + train_data: + drop_remainder: true + global_batch_size: 48 + input_path: '' + is_training: true + seq_length: 384 + validation_data: + do_lower_case: true + doc_stride: 128 + drop_remainder: false + global_batch_size: 48 + input_path: '' + is_training: false + query_length: 64 + seq_length: 384 + tokenization: WordPiece + version_2_with_negative: false + vocab_file: '' +trainer: + checkpoint_interval: 1000 + max_to_keep: 5 + optimizer_config: + learning_rate: + polynomial: + decay_steps: 3699 + end_learning_rate: 0.0 + initial_learning_rate: 8.0e-05 + power: 1.0 + type: polynomial + optimizer: + type: adamw + warmup: + polynomial: + power: 1 + warmup_steps: 370 + type: polynomial + steps_per_loop: 1000 + summary_interval: 1000 + train_steps: 3699 + validation_interval: 1000 + validation_steps: 226 + best_checkpoint_export_subdir: 'best_ckpt' + best_checkpoint_eval_metric: 'final_f1' + best_checkpoint_metric_comp: 'higher' diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/finetuning_experiments.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/finetuning_experiments.py new file mode 100644 index 000000000..86afc03e9 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/finetuning_experiments.py @@ -0,0 +1,139 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
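> Note: the YAML files above only override fields of experiment configurations that are registered in Python; the factories for the GLUE and SQuAD tasks are defined just below in `finetuning_experiments.py`. A rough usage sketch follows; the `get_exp_config` helper, the file paths, and the override values are illustrative assumptions and are not part of this patch.

```python
# Hypothetical sketch: fetch the registered 'bert/squad' experiment and apply
# the same overrides that squad_v1.yaml expresses, directly in Python.
from core import exp_factory  # assumes the vendored exp_factory keeps get_exp_config

config = exp_factory.get_exp_config('bert/squad')
config.task.train_data.input_path = '/path/to/squad_train.tf_record'      # placeholder path
config.task.validation_data.input_path = '/path/to/squad_eval.tf_record'  # placeholder path
config.trainer.train_steps = 3699
config.trainer.optimizer_config.learning_rate.polynomial.initial_learning_rate = 8.0e-05
# The `restrictions` declared on the experiment require is_training to be
# non-None on both data configs before training starts.
```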
+ +"""Finetuning experiment configurations.""" +# pylint: disable=g-doc-return-or-yield,line-too-long +from core import config_definitions as cfg +from core import exp_factory +from modeling import optimization +from nlp.data import question_answering_dataloader +from nlp.data import sentence_prediction_dataloader +from nlp.data import tagging_dataloader +from tasks import question_answering +from tasks import sentence_prediction +from tasks import tagging + + +@exp_factory.register_config_factory('bert/sentence_prediction') +def bert_sentence_prediction() -> cfg.ExperimentConfig: + r"""BERT GLUE.""" + config = cfg.ExperimentConfig( + task=sentence_prediction.SentencePredictionConfig( + train_data=sentence_prediction_dataloader + .SentencePredictionDataConfig(), + validation_data=sentence_prediction_dataloader + .SentencePredictionDataConfig( + is_training=False, drop_remainder=False)), + trainer=cfg.TrainerConfig( + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + 'adamw': { + 'weight_decay_rate': + 0.01, + 'exclude_from_weight_decay': + ['LayerNorm', 'layer_norm', 'bias'], + } + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 3e-5, + 'end_learning_rate': 0.0, + } + }, + 'warmup': { + 'type': 'polynomial' + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + config.task.model.encoder.type = 'bert' + return config + + +@exp_factory.register_config_factory('bert/squad') +def bert_squad() -> cfg.ExperimentConfig: + """BERT Squad V1/V2.""" + config = cfg.ExperimentConfig( + task=question_answering.QuestionAnsweringConfig( + train_data=question_answering_dataloader.QADataConfig(), + validation_data=question_answering_dataloader.QADataConfig()), + trainer=cfg.TrainerConfig( + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + 'adamw': { + 'weight_decay_rate': + 0.01, + 'exclude_from_weight_decay': + ['LayerNorm', 'layer_norm', 'bias'], + } + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 8e-5, + 'end_learning_rate': 0.0, + } + }, + 'warmup': { + 'type': 'polynomial' + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + config.task.model.encoder.type = 'bert' + return config + + +@exp_factory.register_config_factory('bert/tagging') +def bert_tagging() -> cfg.ExperimentConfig: + """BERT tagging task.""" + config = cfg.ExperimentConfig( + task=tagging.TaggingConfig( + train_data=tagging_dataloader.TaggingDataConfig(), + validation_data=tagging_dataloader.TaggingDataConfig( + is_training=False, drop_remainder=False)), + trainer=cfg.TrainerConfig( + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + 'adamw': { + 'weight_decay_rate': + 0.01, + 'exclude_from_weight_decay': + ['LayerNorm', 'layer_norm', 'bias'], + } + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 8e-5, + 'end_learning_rate': 0.0, + } + }, + 'warmup': { + 'type': 'polynomial' + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None', + ]) + return config diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/models/bert_en_uncased_base.yaml b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/models/bert_en_uncased_base.yaml new file mode 100644 index 000000000..1e49bc543 --- 
/dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/models/bert_en_uncased_base.yaml @@ -0,0 +1,16 @@ +task: + model: + encoder: + type: bert + bert: + attention_dropout_rate: 0.1 + dropout_rate: 0.1 + hidden_activation: gelu + hidden_size: 768 + initializer_range: 0.02 + intermediate_size: 3072 + max_position_embeddings: 512 + num_attention_heads: 12 + num_layers: 12 + type_vocab_size: 2 + vocab_size: 30522 diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/pretraining_experiments.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/pretraining_experiments.py new file mode 100644 index 000000000..bb105f1a8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/pretraining_experiments.py @@ -0,0 +1,82 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pretraining experiment configurations.""" +# pylint: disable=g-doc-return-or-yield,line-too-long +from core import config_definitions as cfg +from core import exp_factory +from modeling import optimization +from nlp.data import pretrain_dataloader +from nlp.data import pretrain_dynamic_dataloader +from tasks import masked_lm + +_TRAINER = cfg.TrainerConfig( + train_steps=1000000, + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adamw', + 'adamw': { + 'weight_decay_rate': + 0.01, + 'exclude_from_weight_decay': [ + 'LayerNorm', 'layer_norm', 'bias' + ], + } + }, + 'learning_rate': { + 'type': 'polynomial', + 'polynomial': { + 'initial_learning_rate': 1e-4, + 'end_learning_rate': 0.0, + } + }, + 'warmup': { + 'type': 'polynomial' + } + })) + + +@exp_factory.register_config_factory('bert/pretraining') +def bert_pretraining() -> cfg.ExperimentConfig: + """BERT pretraining experiment.""" + config = cfg.ExperimentConfig( + task=masked_lm.MaskedLMConfig( + train_data=pretrain_dataloader.BertPretrainDataConfig(), + validation_data=pretrain_dataloader.BertPretrainDataConfig( + is_training=False)), + trainer=_TRAINER, + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + return config + + +@exp_factory.register_config_factory('bert/pretraining_dynamic') +def bert_dynamic() -> cfg.ExperimentConfig: + """BERT base with dynamic input sequences. + + TPU needs to run with tf.data service with round-robin behavior. 
+ """ + config = cfg.ExperimentConfig( + task=masked_lm.MaskedLMConfig( + train_data=pretrain_dynamic_dataloader.BertPretrainDataConfig(), + validation_data=pretrain_dataloader.BertPretrainDataConfig( + is_training=False)), + trainer=_TRAINER, + restrictions=[ + 'task.train_data.is_training != None', + 'task.validation_data.is_training != None' + ]) + return config diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_configs/wmt_transformer_experiments.py b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/wmt_transformer_experiments.py new file mode 100644 index 000000000..fe2c0fc57 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_configs/wmt_transformer_experiments.py @@ -0,0 +1,110 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +# pylint: disable=g-doc-return-or-yield,line-too-long +"""WMT translation configurations.""" + +from core import config_definitions as cfg +from core import exp_factory +from modeling import optimization +from nlp.data import wmt_dataloader +from tasks import translation + + +@exp_factory.register_config_factory('wmt_transformer/large') +def wmt_transformer_large() -> cfg.ExperimentConfig: + """WMT Transformer Large. + + Please refer to + tensorflow_models/official/nlp/data/train_sentencepiece.py + to generate sentencepiece_model + and pass + --params_override=task.sentencepiece_model_path='YOUR_PATH' + to the train script. 
+ """ + learning_rate = 2.0 + hidden_size = 1024 + learning_rate *= (hidden_size**-0.5) + warmup_steps = 16000 + train_steps = 300000 + token_batch_size = 24576 + encdecoder = translation.EncDecoder( + num_attention_heads=16, intermediate_size=hidden_size * 4) + config = cfg.ExperimentConfig( + task=translation.TranslationConfig( + model=translation.ModelConfig( + encoder=encdecoder, + decoder=encdecoder, + embedding_width=hidden_size, + padded_decode=True, + decode_max_length=100), + train_data=wmt_dataloader.WMTDataConfig( + tfds_name='wmt14_translate/de-en', + tfds_split='train', + src_lang='en', + tgt_lang='de', + is_training=True, + global_batch_size=token_batch_size, + static_batch=True, + max_seq_length=64 + ), + validation_data=wmt_dataloader.WMTDataConfig( + tfds_name='wmt14_translate/de-en', + tfds_split='test', + src_lang='en', + tgt_lang='de', + is_training=False, + global_batch_size=32, + static_batch=True, + max_seq_length=100, + ), + sentencepiece_model_path=None, + ), + trainer=cfg.TrainerConfig( + train_steps=train_steps, + validation_steps=-1, + steps_per_loop=1000, + summary_interval=1000, + checkpoint_interval=5000, + validation_interval=5000, + max_to_keep=1, + optimizer_config=optimization.OptimizationConfig({ + 'optimizer': { + 'type': 'adam', + 'adam': { + 'beta_2': 0.997, + 'epsilon': 1e-9, + }, + }, + 'learning_rate': { + 'type': 'power', + 'power': { + 'initial_learning_rate': learning_rate, + 'power': -0.5, + } + }, + 'warmup': { + 'type': 'linear', + 'linear': { + 'warmup_steps': warmup_steps, + 'warmup_learning_rate': 0.0 + } + } + })), + restrictions=[ + 'task.train_data.is_training != None', + 'task.sentencepiece_model_path != None', + ]) + return config diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/README.md b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/README.md new file mode 100644 index 000000000..99c7c361f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/README.md @@ -0,0 +1,52 @@ +# NLP Modeling Library + +This library provides a set of Keras primitives (`tf.keras.Layer` and +`tf.keras.Model`) that can be assembled into transformer-based models. +They are flexible, validated, interoperable, and both TF1 and TF2 compatible. + +* [`layers`](layers) are the fundamental building blocks for NLP models. +They can be used to assemble new `tf.keras` layers or models. + +* [`networks`](networks) are combinations of `tf.keras` layers (and possibly +other networks). They are `tf.keras` models that would not be trained alone. +It encapsulates common network structures like a transformer encoder into an +easily handled object with a standardized configuration. + +* [`models`](models) are combinations of `tf.keras` layers and models that can +be trained. Several pre-built canned models are provided to train encoder +networks. These models are intended as both convenience functions and canonical +examples. + +* [`losses`](losses) contains common loss computation used in NLP tasks. + +Please see the colab +[nlp_modeling_library_intro.ipynb] +(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/nlp_modeling_library_intro.ipynb) +for how to build transformer-based NLP models using above primitives. + +Besides the pre-defined primitives, it also provides scaffold classes to allow +easy experimentation with noval achitectures, e.g., you don’t need to fork a +whole Transformer object to try a different kind of attention primitive, +for instance. 
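> Note: as a concrete sketch of that flexibility, the snippet below plugs a non-default attention layer into a standard encoder through the two scaffold classes described next. Every hyperparameter here is illustrative rather than taken from this patch.

```python
import tensorflow as tf

from nlp_modeling import layers
from nlp_modeling import networks

# A TransformerScaffold layer instantiates `attention_cls` with `attention_cfg`,
# so swapping the attention primitive does not require forking the Transformer.
hidden_cfg = dict(
    num_attention_heads=8,
    intermediate_size=2048,
    intermediate_activation=tf.keras.activations.gelu,
    dropout_rate=0.1,
    attention_dropout_rate=0.1,
    kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    attention_cls=layers.TalkingHeadsAttention,   # any compatible attention class
    attention_cfg=dict(num_heads=8, key_dim=64),
)
encoder = networks.EncoderScaffold(
    pooled_output_dim=512,
    pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    embedding_cfg=dict(
        vocab_size=30522,
        type_vocab_size=2,
        hidden_size=512,
        max_seq_length=128,
        initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        dropout_rate=0.1),
    num_hidden_instances=6,
    hidden_cls=layers.TransformerScaffold,
    hidden_cfg=hidden_cfg,
    dict_outputs=True)
```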
+
+* [`TransformerScaffold`](layers/transformer_scaffold.py) implements the
+Transformer from ["Attention Is All You Need"]
+(https://arxiv.org/abs/1706.03762), with a customizable attention layer
+option. Users can pass a class to `attention_cls` and associated config to
+`attention_cfg`, in which case the scaffold will instantiate the class with
+the config, or pass a class instance to `attention_cls`.
+
+* [`EncoderScaffold`](networks/encoder_scaffold.py) implements the transformer
+encoder from ["BERT: Pre-training of Deep Bidirectional Transformers for
+Language Understanding"](https://arxiv.org/abs/1810.04805), with a customizable
+embedding subnetwork (which will replace the standard embedding logic) and/or a
+custom hidden layer (which will replace the Transformer instantiation in the
+encoder).
+
+Please see the colab
+[customize_encoder.ipynb]
+(https://colab.sandbox.google.com/github/tensorflow/models/blob/master/official/colab/nlp/customize_encoder.ipynb)
+for how to use scaffold classes to build novel architectures.
+
+BERT and ALBERT models in this repo are implemented using this library.
+Code examples can be found in the corresponding model folder.
diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/__init__.py
new file mode 100644
index 000000000..34d39223a
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""NLP Modeling Library.
+
+This library provides a set of Keras primitives (`tf.keras.Layer` and
+`tf.keras.Model`) that can be assembled into transformer-based models.
+They are flexible, validated, interoperable, and both TF1 and TF2 compatible.
+"""
+from nlp_modeling import layers
+from nlp_modeling import losses
+from nlp_modeling import models
+from nlp_modeling import networks
diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/README.md b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/README.md
new file mode 100644
index 000000000..79e142a08
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/README.md
@@ -0,0 +1,123 @@
+# Layers
+
+Layers are the fundamental building blocks for NLP models. They can be used to
+assemble new `tf.keras` layers or models.
+
+* [MultiHeadAttention](attention.py) implements an optionally masked attention
+  between query, key, value tensors as described in
+  ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762). If
+  `from_tensor` and `to_tensor` are the same, then this is self-attention.
+
+* [BigBirdAttention](bigbird_attention.py) implements a sparse attention
+  mechanism that reduces the quadratic dependency on sequence length to linear,
+  as described in
+  ["Big Bird: Transformers for Longer Sequences"](https://arxiv.org/abs/2007.14062).
+
+* [CachedAttention](attention.py) implements an attention layer with a cache
+  used for auto-regressive decoding.
+
+* [KernelAttention](kernel_attention.py) implements a group of attention
+  mechanisms that express the self-attention as a linear dot-product of
+  kernel feature maps and make use of the associativity property of
+  matrix products to reduce the complexity from quadratic to linear. The
+  implementation includes methods described in ["Transformers are RNNs:
+  Fast Autoregressive Transformers with Linear Attention"](https://arxiv.org/abs/2006.16236),
+  ["Rethinking Attention with Performers"](https://arxiv.org/abs/2009.14794), and
+  ["Random Feature Attention"](https://openreview.net/pdf?id=QtTKTdVrFBB).
+
+* [MatMulWithMargin](mat_mul_with_margin.py) implements a matrix
+  multiplication with margin layer used for training retrieval / ranking
+  tasks, as described in ["Improving Multilingual Sentence Embedding using
+  Bi-directional Dual Encoder with Additive Margin
+  Softmax"](https://www.ijcai.org/Proceedings/2019/0746.pdf).
+
+* [MultiChannelAttention](multi_channel_attention.py) implements a variant of
+  multi-head attention which can be used to merge multiple streams for
+  cross-attentions.
+
+* [TalkingHeadsAttention](talking_heads_attention.py) implements the talking
+  heads attention, as described in
+  ["Talking-Heads Attention"](https://arxiv.org/abs/2003.02436).
+
+* [Transformer](transformer.py) implements an optionally masked transformer as
+  described in
+  ["Attention Is All You Need"](https://arxiv.org/abs/1706.03762).
+
+* [TransformerDecoderBlock](transformer.py) is made up of self multi-head
+  attention, cross multi-head attention and a feedforward network.
+
+* [RandomFeatureGaussianProcess](gaussian_process.py) implements a random
+  feature-based Gaussian process as described in ["Random Features for
+  Large-Scale Kernel Machines"](https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf).
+
+* [ReZeroTransformer](rezero_transformer.py) implements a Transformer with
+  ReZero as described in
+  ["ReZero is All You Need: Fast Convergence at Large Depth"](https://arxiv.org/abs/2003.04887).
+
+* [OnDeviceEmbedding](on_device_embedding.py) implements efficient embedding
+  lookups designed for TPU-based models.
+
+* [PositionalEmbedding](position_embedding.py) creates a positional embedding
+  as described in ["BERT: Pre-training of Deep Bidirectional Transformers for
+  Language Understanding"](https://arxiv.org/abs/1810.04805).
+
+* [SelfAttentionMask](self_attention_mask.py) creates a 3D attention mask from
+  a 2D tensor mask.
+
+* [SpectralNormalization](spectral_normalization.py) implements a tf.Wrapper
+  that applies spectral normalization regularization to a given layer. See
+  [Spectral Norm Regularization for Improving the Generalizability of
+  Deep Learning](https://arxiv.org/abs/1705.10941).
+
+* [MaskedSoftmax](masked_softmax.py) implements a softmax with an optional
+  masking input. If no mask is provided to this layer, it performs a standard
+  softmax; however, if a mask tensor is applied (which should be 1 in
+  positions where the data should be allowed through, and 0 where the data
+  should be masked), the output will have masked positions set to
+  approximately zero.
+
+* [`MaskedLM`](masked_lm.py) implements a masked language model. It assumes
+  the embedding table variable is passed to it.
+
+* [ClassificationHead](cls_head.py) A pooling head over a sequence of
+  embeddings, commonly used by classification tasks.
+ +* [GaussianProcessClassificationHead](cls_head.py) A spectral-normalized + neural Gaussian process (SNGP)-based classification head as described in + ["Simple and Principled Uncertainty Estimation with Deterministic Deep + Learning via Distance Awareness"](https://arxiv.org/abs/2006.10108). + +* [GatedFeedforward](gated_feedforward.py) implements the gated linear layer + feedforward as described in + ["GLU Variants Improve Transformer"](https://arxiv.org/abs/2002.05202). + +* [MultiHeadRelativeAttention](relative_attention.py) implements a variant + of multi-head attention with support for relative position encodings as + described in ["Transformer-XL: Attentive Language Models Beyond a + Fixed-Length Context"](https://arxiv.org/abs/1901.02860). This also has + extended support for segment-based attention, a re-parameterization + introduced in ["XLNet: Generalized Autoregressive Pretraining for Language + Understanding"](https://arxiv.org/abs/1906.08237). + +* [TwoStreamRelativeAttention](relative_attention.py) implements a variant + of multi-head relative attention as described in ["XLNet: Generalized + Autoregressive Pretraining for Language Understanding"] + (https://arxiv.org/abs/1906.08237). This takes in a query and content + stream and applies self attention. + +* [TransformerXL](transformer_xl.py) implements Transformer XL introduced in + ["Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"] + (https://arxiv.org/abs/1901.02860). This contains `TransformerXLBlock`, a + block containing either one or two stream relative self-attention as well as + subsequent feedforward networks. It also contains `TransformerXL`, which + contains attention biases as well as multiple `TransformerXLBlocks`. + +* [MobileBertEmbedding](mobile_bert_layers.py) and + [MobileBertTransformer](mobile_bert_layers.py) implement the embedding layer + and also transformer layer proposed in the + [MobileBERT paper](https://arxiv.org/pdf/2004.02984.pdf). + +* [BertPackInputs](text_layers.py) and + [BertTokenizer](text_layers.py) and [SentencepieceTokenizer](text_layers.py) + implements the layer to tokenize raw text and pack them into the inputs for + BERT models. diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/__init__.py new file mode 100644 index 000000000..7bdfb7dbd --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/__init__.py @@ -0,0 +1,52 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Layers are the fundamental building blocks for NLP models. + +They can be used to assemble new `tf.keras` layers or models. 
+""" +# pylint: disable=wildcard-import +from nlp_modeling.layers.attention import * +from nlp_modeling.layers.bigbird_attention import BigBirdAttention +from nlp_modeling.layers.bigbird_attention import BigBirdMasks +from nlp_modeling.layers.cls_head import * +from nlp_modeling.layers.dense_einsum import DenseEinsum +from nlp_modeling.layers.gated_feedforward import GatedFeedforward +from nlp_modeling.layers.gaussian_process import RandomFeatureGaussianProcess +from nlp_modeling.layers.kernel_attention import KernelAttention +from nlp_modeling.layers.kernel_attention import KernelMask +from nlp_modeling.layers.masked_lm import MaskedLM +from nlp_modeling.layers.masked_softmax import MaskedSoftmax +from nlp_modeling.layers.mat_mul_with_margin import MatMulWithMargin +from nlp_modeling.layers.mobile_bert_layers import MobileBertEmbedding +from nlp_modeling.layers.mobile_bert_layers import MobileBertMaskedLM +from nlp_modeling.layers.mobile_bert_layers import MobileBertTransformer +from nlp_modeling.layers.multi_channel_attention import * +from nlp_modeling.layers.on_device_embedding import OnDeviceEmbedding +from nlp_modeling.layers.position_embedding import RelativePositionBias +from nlp_modeling.layers.position_embedding import RelativePositionEmbedding +from nlp_modeling.layers.relative_attention import MultiHeadRelativeAttention +from nlp_modeling.layers.relative_attention import TwoStreamRelativeAttention +from nlp_modeling.layers.rezero_transformer import ReZeroTransformer +from nlp_modeling.layers.self_attention_mask import SelfAttentionMask +from nlp_modeling.layers.spectral_normalization import * +from nlp_modeling.layers.talking_heads_attention import TalkingHeadsAttention +from nlp_modeling.layers.text_layers import BertPackInputs +from nlp_modeling.layers.text_layers import BertTokenizer +from nlp_modeling.layers.text_layers import SentencepieceTokenizer +from nlp_modeling.layers.tn_transformer_expand_condense import TNTransformerExpandCondense +from nlp_modeling.layers.transformer import * +from nlp_modeling.layers.transformer_scaffold import TransformerScaffold +from nlp_modeling.layers.transformer_xl import TransformerXL +from nlp_modeling.layers.transformer_xl import TransformerXLBlock diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/attention.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/attention.py new file mode 100644 index 000000000..9b13b8969 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/attention.py @@ -0,0 +1,107 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
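> Note: everything listed above is re-exported from `nlp_modeling.layers`, so the blocks compose with stock Keras layers. A small, self-contained sketch follows; shapes and hyperparameters are made up for illustration and are not part of this patch.

```python
import tensorflow as tf

from nlp_modeling import layers

word_ids = tf.constant([[5, 9, 2, 0]])                         # [batch, seq_len]

# Embedding lookup designed to be efficient on TPU-based models (see README above).
embedding = layers.OnDeviceEmbedding(vocab_size=30522, embedding_width=64)
x = embedding(word_ids)                                        # [batch, seq_len, 64]

# Self-attention using the talking-heads variant; the interface mirrors
# tf.keras.layers.MultiHeadAttention.
attention = layers.TalkingHeadsAttention(num_heads=4, key_dim=16)
y = attention(query=x, value=x)                                # [batch, seq_len, 64]
```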
+ +"""Keras-based attention layer.""" +# pylint: disable=g-classes-have-attributes +import math + +import tensorflow as tf + +EinsumDense = tf.keras.layers.experimental.EinsumDense +MultiHeadAttention = tf.keras.layers.MultiHeadAttention + + +@tf.keras.utils.register_keras_serializable(package="Text") +class CachedAttention(tf.keras.layers.MultiHeadAttention): + """Attention layer with cache used for auto-agressive decoding. + + Arguments are the same as `tf.keras.layers.MultiHeadAttention` layer. + """ + + def _update_cache(self, key, value, cache, decode_loop_step): + """Updates cache states and gets full-length key/value tensors.""" + # Combines cached keys and values with new keys and values. + if decode_loop_step is not None: + # TPU special case. + key_seq_dim = cache["key"].shape.as_list()[1] + indices = tf.reshape( + tf.one_hot(decode_loop_step, key_seq_dim, dtype=key.dtype), + [1, key_seq_dim, 1, 1]) + key = cache["key"] + key * indices + value_seq_dim = cache["value"].shape.as_list()[1] + indices = tf.reshape( + tf.one_hot(decode_loop_step, value_seq_dim, dtype=value.dtype), + [1, value_seq_dim, 1, 1]) + value = cache["value"] + value * indices + else: + key = tf.concat([tf.cast(cache["key"], key.dtype), key], axis=1) + value = tf.concat([tf.cast(cache["value"], value.dtype), value], axis=1) + + # Update cache + cache["key"] = key + cache["value"] = value + + return key, value + + def call(self, + query, + value, + key=None, + attention_mask=None, + cache=None, + decode_loop_step=None, + return_attention_scores=False): + if not self._built_from_signature: + self._build_from_signature(query=query, value=value, key=key) + if key is None: + key = value + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + # `query` = [B, F, N ,H] + query = self._query_dense(query) + + # `key` = [B, T, N, H] + key = self._key_dense(key) + + # `value` = [B, T, N, H] + value = self._value_dense(value) + + if cache: + key, value = self._update_cache(key, value, cache, decode_loop_step) + + query = tf.multiply(query, 1.0 / math.sqrt(float(self._key_dim))) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + attention_scores = tf.einsum(self._dot_product_equation, key, query) + + # Normalize the attention scores to probabilities. + # `attention_scores` = [B, N, F, T] + attention_scores = self._masked_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_scores = self._dropout_layer(attention_scores) + # `context_layer` = [B, F, N, H] + attention_output = tf.einsum(self._combine_equation, attention_scores, + value) + attention_output = self._output_dense(attention_output) + if return_attention_scores: + return attention_output, attention_scores, cache + return attention_output, cache diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/bigbird_attention.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/bigbird_attention.py new file mode 100644 index 000000000..4d3c66244 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/bigbird_attention.py @@ -0,0 +1,492 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
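> Note: a hypothetical incremental-decoding sketch for the `CachedAttention` layer defined above (GPU/CPU path, i.e. `decode_loop_step=None`). The cache starts out empty and grows by one step per call; all names and shapes are illustrative.

```python
import tensorflow as tf

from nlp_modeling import layers

batch, num_heads, head_dim, hidden = 2, 4, 16, 64
attention = layers.CachedAttention(num_heads=num_heads, key_dim=head_dim)

# Keys/values of previously decoded positions live in the cache; an empty cache
# can be represented with zero-length tensors along the sequence axis.
cache = {
    "key": tf.zeros([batch, 0, num_heads, head_dim]),
    "value": tf.zeros([batch, 0, num_heads, head_dim]),
}
for _ in range(3):  # decode three tokens, one at a time
  step_input = tf.random.uniform([batch, 1, hidden])
  output, cache = attention(step_input, step_input, cache=cache)
```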
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based bigbird attention layer.""" + +import numpy as np +import tensorflow as tf + +MAX_SEQ_LEN = 4096 + + +def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, + from_block_size, 3*to_block_size]. + """ + exp_blocked_to_pad = tf.concat([ + to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, + 3:-1] + ], 2) + band_mask = tf.einsum("BLQ,BLK->BLQK", from_blocked_mask[:, 2:-2], + exp_blocked_to_pad) + band_mask = tf.expand_dims(band_mask, 1) + return band_mask + + +def bigbird_block_rand_mask(from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_rand_blocks, + last_idx=-1): + """Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks choosen only upto last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + assert from_seq_length//from_block_size == to_seq_length//to_block_size, \ + "Error the number of blocks needs to be same!" 
+ + rand_attn = np.zeros( + (from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] + elif i == 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] + elif i == from_seq_length // from_block_size - 3: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + elif (end + 1) == last: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + else: + rand_attn[i - 1, :] = np.random.permutation( + np.concatenate((middle_seq[:start], middle_seq[end + 1:last])))[:r] + return rand_attn + + +def create_rand_mask_from_inputs(from_blocked_mask, to_blocked_mask, rand_attn, + num_attention_heads, num_rand_blocks, + batch_size, from_seq_length, from_block_size): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + + Returns: + float Tensor of shape [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = tf.reshape( + tf.gather(to_blocked_mask, rand_attn, batch_dims=1), [ + batch_size, num_attention_heads, num_windows, + num_rand_blocks * from_block_size + ]) + rand_mask = tf.einsum("BLQ,BHLK->BHLQK", from_blocked_mask[:, 1:-1], + rand_mask) + return rand_mask + + +def bigbird_block_sparse_attention( + query_layer, key_layer, value_layer, band_mask, from_mask, to_mask, + from_blocked_mask, to_blocked_mask, rand_attn, num_attention_heads, + num_rand_blocks, size_per_head, batch_size, from_seq_length, to_seq_length, + from_block_size, to_block_size): + """BigBird attention sparse calculation using blocks in linear time. + + Assumes from_seq_length//from_block_size == to_seq_length//to_block_size. + + + Args: + query_layer: float Tensor of shape [batch_size, num_attention_heads, + from_seq_length, size_per_head] + key_layer: float Tensor of shape [batch_size, num_attention_heads, + to_seq_length, size_per_head] + value_layer: float Tensor of shape [batch_size, num_attention_heads, + to_seq_length, size_per_head] + band_mask: (optional) int32 Tensor of shape [batch_size, 1, + from_seq_length//from_block_size-4, from_block_size, 3*to_block_size]. The + values should be 1 or 0. 
The attention scores will effectively be set to + -infinity for any positions in the mask that are 0, and will be unchanged + for positions that are 1. + from_mask: (optional) int32 Tensor of shape [batch_size, 1, from_seq_length, + 1]. The values should be 1 or 0. The attention scores will effectively be + set to -infinity for any positions in the mask that are 0, and will be + unchanged for positions that are 1. + to_mask: (optional) int32 Tensor of shape [batch_size, 1, 1, to_seq_length]. + The values should be 1 or 0. The attention scores will effectively be set + to -infinity for any positions in the mask that are 0, and will be + unchanged for positions that are 1. + from_blocked_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. Same as from_mask, + just reshaped. + to_blocked_mask: (optional) int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. Same as to_mask, just + reshaped. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + size_per_head: int. Size of each attention head. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + + Returns: + float Tensor of shape [batch_size, from_seq_length, num_attention_heads, + size_per_head]. + """ + rand_attn = tf.expand_dims(rand_attn, 0) + rand_attn = tf.repeat(rand_attn, batch_size, 0) + + rand_mask = create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + num_attention_heads, + num_rand_blocks, + batch_size, + from_seq_length, + from_block_size, + ) + + # Define shorthands + h = num_attention_heads + r = num_rand_blocks + d = size_per_head + b = batch_size + m = from_seq_length + n = to_seq_length + wm = from_block_size + wn = to_block_size + dtype = query_layer.dtype + query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3]) + key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3]) + value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3]) + blocked_query_matrix = tf.reshape(query_layer, (b, h, m // wm, wm, -1)) + blocked_key_matrix = tf.reshape(key_layer, (b, h, n // wn, wn, -1)) + blocked_value_matrix = tf.reshape(value_layer, (b, h, n // wn, wn, -1)) + gathered_key = tf.reshape( + tf.gather(blocked_key_matrix, rand_attn, batch_dims=2, name="gather_key"), + (b, h, m // wm - 2, r * wn, -1)) # [b, h, n//wn-2, r, wn, -1] + gathered_value = tf.reshape( + tf.gather( + blocked_value_matrix, rand_attn, batch_dims=2, name="gather_value"), + (b, h, m // wm - 2, r * wn, -1)) # [b, h, n//wn-2, r, wn, -1] + first_product = tf.einsum( + "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, 0], + key_layer) # [b, h, wm, -1] x [b, h, n, -1] ==> [b, h, wm, n] + first_product = tf.multiply(first_product, 1.0 / np.sqrt(d)) + first_product += (1.0 - tf.cast(to_mask, dtype=dtype)) * -10000.0 + first_attn_weights = tf.nn.softmax(first_product) # [b, h, wm, n] + first_context_layer = tf.einsum( + "BHQK,BHKD->BHQD", first_attn_weights, + value_layer) # [b, h, wm, n] x [b, h, n, -1] ==> [b, h, wm, -1] + first_context_layer = tf.expand_dims(first_context_layer, 2) + + second_key_mat = tf.concat([ + blocked_key_matrix[:, :, 0], blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], 
blocked_key_matrix[:, :, + -1], gathered_key[:, :, 0] + ], 2) # [b, h, (4+r)*wn, -1] + second_value_mat = tf.concat([ + blocked_value_matrix[:, :, 0], blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0] + ], 2) # [b, h, (4+r)*wn, -1] + second_product = tf.einsum( + "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, 1], second_key_mat + ) # [b, h, wm, -1] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, (4+r)*wn] + second_seq_pad = tf.concat([ + to_mask[:, :, :, :3 * wn], to_mask[:, :, :, -wn:], + tf.ones([b, 1, 1, r * wn], dtype=dtype) + ], 3) + second_rand_pad = tf.concat([ + tf.ones([b, h, wm, 4 * wn], dtype=dtype), rand_mask[:, :, 0] + ], 3) + second_product = tf.multiply(second_product, 1.0 / np.sqrt(d)) + second_product += (1.0 - + tf.minimum(second_seq_pad, second_rand_pad)) * -10000.0 + second_attn_weights = tf.nn.softmax(second_product) # [b , h, wm, (4+r)*wn] + second_context_layer = tf.einsum( + "BHQK,BHKD->BHQD", second_attn_weights, second_value_mat + ) # [b, h, wm, (4+r)*wn] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, -1] + second_context_layer = tf.expand_dims(second_context_layer, 2) + + exp_blocked_key_matrix = tf.concat([ + blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], + blocked_key_matrix[:, :, 3:-1] + ], 3) # [b, h, m//wm-4, 3*wn, -1] + exp_blocked_value_matrix = tf.concat([ + blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], + blocked_value_matrix[:, :, 3:-1] + ], 3) # [b, h, m//wm-4, 3*wn, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + inner_band_product = tf.einsum( + "BHLQD,BHLKD->BHLQK", middle_query_matrix, exp_blocked_key_matrix + ) # [b, h, m//wm-4, wm, -1] x [b, h, m//wm-4, 3*wn, -1] + # ==> [b, h, m//wm-4, wm, 3*wn] + inner_band_product = tf.multiply(inner_band_product, 1.0 / np.sqrt(d)) + rand_band_product = tf.einsum( + "BHLQD,BHLKD->BHLQK", middle_query_matrix, + gathered_key[:, :, + 1:-1]) # [b, h, m//wm-4, wm, -1] x [b, h, m//wm-4, r*wn, -1] + # ==> [b, h, m//wm-4, wm, r*wn] + rand_band_product = tf.multiply(rand_band_product, 1.0 / np.sqrt(d)) + first_band_product = tf.einsum( + "BHLQD,BHKD->BHLQK", middle_query_matrix, blocked_key_matrix[:, :, 0] + ) # [b, h, m//wm-4, wm, -1] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, wn] + first_band_product = tf.multiply(first_band_product, 1.0 / np.sqrt(d)) + last_band_product = tf.einsum( + "BHLQD,BHKD->BHLQK", middle_query_matrix, blocked_key_matrix[:, :, -1] + ) # [b, h, m//wm-4, wm, -1] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, wn] + last_band_product = tf.multiply(last_band_product, 1.0 / np.sqrt(d)) + inner_band_product += (1.0 - band_mask) * -10000.0 + first_band_product += (1.0 - + tf.expand_dims(to_mask[:, :, :, :wn], 3)) * -10000.0 + last_band_product += (1.0 - + tf.expand_dims(to_mask[:, :, :, -wn:], 3)) * -10000.0 + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * -10000.0 + band_product = tf.concat([ + first_band_product, inner_band_product, rand_band_product, + last_band_product + ], -1) # [b, h, m//wm-4, wm, (5+r)*wn] + attn_weights = tf.nn.softmax(band_product) # [b, h, m//wm-4, wm, (5+r)*wn] + context_layer = tf.einsum( + "BHLQK,BHLKD->BHLQD", attn_weights[:, :, :, :, + wn:4 * wn], exp_blocked_value_matrix + ) # [b, h, m//wm-4, wm, 3*wn] x [b, h, m//wm-4, 3*wn, -1] + # ==> [b, h, m//wm-4, wm, -1] + context_layer += tf.einsum( + "BHLQK,BHLKD->BHLQD", attn_weights[:, :, :, :, + 4 * wn:-wn], gathered_value[:, :, 1:-1] + ) # [b, h, m//wm-4, wm, r*wn] x [b, h, m//wm-4, r*wn, -1] + # ==> [b, h, m//wm-4, wm, 
-1] + context_layer += tf.einsum( + "BHLQK,BHKD->BHLQD", attn_weights[:, :, :, :, :wn], + blocked_value_matrix[:, :, 0] + ) # [b, h, m//wm-4, wm, wn] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, -1] + context_layer += tf.einsum( + "BHLQK,BHKD->BHLQD", attn_weights[:, :, :, :, + -wn:], blocked_value_matrix[:, :, -1] + ) # [b, h, m//wm-4, wm, wn] x [b, h, wn, -1] ==> [b, h, m//wm-4, wm, -1] + + second_last_key_mat = tf.concat([ + blocked_key_matrix[:, :, 0], blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1] + ], 2) # [b, h, (4+r)*wn, -1] + second_last_value_mat = tf.concat([ + blocked_value_matrix[:, :, 0], blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1] + ], 2) # [b, h, (4+r)*wn, -1] + second_last_product = tf.einsum( + "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, -2], second_last_key_mat + ) # [b, h, wm, -1] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, (4+r)*wn] + second_last_seq_pad = tf.concat([ + to_mask[:, :, :, :wn], to_mask[:, :, :, -3 * wn:], + tf.ones([b, 1, 1, r * wn], dtype=dtype) + ], 3) + second_last_rand_pad = tf.concat( + [tf.ones([b, h, wm, 4 * wn], dtype=dtype), rand_mask[:, :, -1]], 3) + second_last_product = tf.multiply(second_last_product, 1.0 / np.sqrt(d)) + second_last_product += ( + 1.0 - tf.minimum(second_last_seq_pad, second_last_rand_pad)) * -10000.0 + second_last_attn_weights = tf.nn.softmax( + second_last_product) # [b, h, wm, (4+r)*wn] + second_last_context_layer = tf.einsum( + "BHQK,BHKD->BHQD", second_last_attn_weights, second_last_value_mat + ) # [b, h, wm, (4+r)*wn] x [b, h, (4+r)*wn, -1] ==> [b, h, wm, -1] + second_last_context_layer = tf.expand_dims(second_last_context_layer, 2) + + last_product = tf.einsum( + "BHQD,BHKD->BHQK", blocked_query_matrix[:, :, -1], + key_layer) # [b, h, wm, -1] x [b, h, n, -1] ==> [b, h, wm, n] + last_product = tf.multiply(last_product, 1.0 / np.sqrt(d)) + last_product += (1.0 - to_mask) * -10000.0 + last_attn_weights = tf.nn.softmax(last_product) # [b, h, wm, n] + last_context_layer = tf.einsum( + "BHQK,BHKD->BHQD", last_attn_weights, + value_layer) # [b, h, wm, n] x [b, h, n, -1] ==> [b, h, wm, -1] + last_context_layer = tf.expand_dims(last_context_layer, 2) + + context_layer = tf.concat([ + first_context_layer, second_context_layer, context_layer, + second_last_context_layer, last_context_layer + ], 2) + context_layer = tf.reshape(context_layer, (b, h, m, -1)) * from_mask + context_layer = tf.transpose(context_layer, (0, 2, 1, 3)) + return context_layer + + +class BigBirdMasks(tf.keras.layers.Layer): + """Creates bigbird attention masks.""" + + def __init__(self, block_size, **kwargs): + super().__init__(**kwargs) + self._block_size = block_size + + def call(self, inputs, mask): + encoder_shape = tf.shape(mask) + mask = tf.cast(mask, inputs.dtype) + batch_size, seq_length = encoder_shape[0], encoder_shape[1] + # reshape for blocking + blocked_encoder_mask = tf.reshape( + mask, (batch_size, seq_length // self._block_size, self._block_size)) + encoder_from_mask = tf.reshape(mask, (batch_size, 1, seq_length, 1)) + encoder_to_mask = tf.reshape(mask, (batch_size, 1, 1, seq_length)) + + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, + blocked_encoder_mask) + return [band_mask, encoder_from_mask, encoder_to_mask, blocked_encoder_mask] + + +@tf.keras.utils.register_keras_serializable(package="Text") +class BigBirdAttention(tf.keras.layers.MultiHeadAttention): + """BigBird, a sparse 
attention mechanism. + + This layer follows the paper "Big Bird: Transformers for Longer Sequences" + (https://arxiv.org/abs/2007.14062). + It reduces this quadratic dependency of attention + computation to linear. + + Arguments are the same as `MultiHeadAttention` layer. + """ + + def __init__(self, + num_rand_blocks=3, + from_block_size=64, + to_block_size=64, + max_rand_mask_length=MAX_SEQ_LEN, + seed=None, + **kwargs): + super().__init__(**kwargs) + self._num_rand_blocks = num_rand_blocks + self._from_block_size = from_block_size + self._to_block_size = to_block_size + self._seed = seed + + # Generates random attention. + np.random.seed(self._seed) + # pylint: disable=g-complex-comprehension + rand_attn = [ + bigbird_block_rand_mask( + max_rand_mask_length, + max_rand_mask_length, + from_block_size, + to_block_size, + num_rand_blocks, + last_idx=1024) for _ in range(self._num_heads) + ] + # pylint: enable=g-complex-comprehension + rand_attn = np.stack(rand_attn, axis=0) + self.rand_attn = tf.constant(rand_attn, dtype=tf.int32) + + def _compute_attention(self, query, key, value, attention_mask=None): + (band_mask, encoder_from_mask, encoder_to_mask, + blocked_encoder_mask) = attention_mask + query_shape = tf.shape(query) + from_seq_length = query_shape[1] + to_seq_length = tf.shape(key)[1] + rand_attn = self.rand_attn[:, :(from_seq_length // self._from_block_size - + 2)] + return bigbird_block_sparse_attention( + query, + key, + value, + band_mask, + encoder_from_mask, + encoder_to_mask, + blocked_encoder_mask, + blocked_encoder_mask, + num_attention_heads=self._num_heads, + num_rand_blocks=self._num_rand_blocks, + size_per_head=self._key_dim, + batch_size=query_shape[0], + from_seq_length=from_seq_length, + to_seq_length=to_seq_length, + from_block_size=self._from_block_size, + to_block_size=self._to_block_size, + rand_attn=rand_attn) + + def call(self, query, value, key=None, attention_mask=None, **kwargs): + if not self._built_from_signature: + self._build_from_signature(query=query, value=value, key=key) + if key is None: + key = value + + # N = `num_attention_heads` + # H = `size_per_head` + # `query` = [B, T, N ,H] + query = self._query_dense(query) + + # `key` = [B, S, N, H] + key = self._key_dense(key) + + # `value` = [B, S, N, H] + value = self._value_dense(value) + + attention_output = self._compute_attention(query, key, value, + attention_mask) + attention_output.set_shape([None, None, self._num_heads, self._value_dim]) + attention_output = self._output_dense(attention_output) + return attention_output + + def get_config(self): + config = { + "num_rand_blocks": self._num_rand_blocks, + "from_block_size": self._from_block_size, + "to_block_size": self._to_block_size, + "seed": self._seed + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/cls_head.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/cls_head.py new file mode 100644 index 000000000..f2b6ad223 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/cls_head.py @@ -0,0 +1,334 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
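> Note: a hypothetical smoke test of the block-sparse attention defined above. The sequence length is chosen so it splits evenly into blocks; none of the numbers come from this patch.

```python
import tensorflow as tf

from nlp_modeling import layers

batch, seq_len, hidden, block = 2, 512, 128, 64
inputs = tf.random.uniform((batch, seq_len, hidden))
padding_mask = tf.ones((batch, seq_len), dtype=tf.int32)

# BigBirdMasks turns a 2D padding mask into the band/from/to/blocked masks
# expected by BigBirdAttention.
masks = layers.BigBirdMasks(block_size=block)(inputs, padding_mask)
attention = layers.BigBirdAttention(
    num_heads=4,
    key_dim=hidden // 4,
    num_rand_blocks=3,
    from_block_size=block,
    to_block_size=block)
outputs = attention(query=inputs, value=inputs, attention_mask=masks)  # [batch, seq_len, hidden]
```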
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A Classification head layer which is common used with sequence encoders.""" + +import tensorflow as tf + +from modeling import tf_utils + +from nlp_modeling.layers import gaussian_process +from nlp_modeling.layers import spectral_normalization + + +class ClassificationHead(tf.keras.layers.Layer): + """Pooling head for sentence-level classification tasks.""" + + def __init__(self, + inner_dim, + num_classes, + cls_token_idx=0, + activation="tanh", + dropout_rate=0.0, + initializer="glorot_uniform", + **kwargs): + """Initializes the `ClassificationHead`. + + Args: + inner_dim: The dimensionality of inner projection layer. If 0 or `None` + then only the output projection layer is created. + num_classes: Number of output classes. + cls_token_idx: The index inside the sequence to pool. + activation: Dense layer activation. + dropout_rate: Dropout probability. + initializer: Initializer for dense layer kernels. + **kwargs: Keyword arguments. + """ + super().__init__(**kwargs) + self.dropout_rate = dropout_rate + self.inner_dim = inner_dim + self.num_classes = num_classes + self.activation = tf_utils.get_activation(activation) + self.initializer = tf.keras.initializers.get(initializer) + self.cls_token_idx = cls_token_idx + + if self.inner_dim: + self.dense = tf.keras.layers.Dense( + units=self.inner_dim, + activation=self.activation, + kernel_initializer=self.initializer, + name="pooler_dense") + self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate) + + self.out_proj = tf.keras.layers.Dense( + units=num_classes, kernel_initializer=self.initializer, name="logits") + + def call(self, features): + if not self.inner_dim: + x = features + else: + x = features[:, self.cls_token_idx, :] # take token. + x = self.dense(x) + x = self.dropout(x) + + x = self.out_proj(x) + return x + + def get_config(self): + config = { + "cls_token_idx": self.cls_token_idx, + "dropout_rate": self.dropout_rate, + "num_classes": self.num_classes, + "inner_dim": self.inner_dim, + "activation": tf.keras.activations.serialize(self.activation), + "initializer": tf.keras.initializers.serialize(self.initializer), + } + config.update(super(ClassificationHead, self).get_config()) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + @property + def checkpoint_items(self): + return {self.dense.name: self.dense} + + +class MultiClsHeads(tf.keras.layers.Layer): + """Pooling heads sharing the same pooling stem.""" + + def __init__(self, + inner_dim, + cls_list, + cls_token_idx=0, + activation="tanh", + dropout_rate=0.0, + initializer="glorot_uniform", + **kwargs): + """Initializes the `MultiClsHeads`. + + Args: + inner_dim: The dimensionality of inner projection layer. If 0 or `None` + then only the output projection layer is created. + cls_list: a list of pairs of (classification problem name and the numbers + of classes. + cls_token_idx: The index inside the sequence to pool. + activation: Dense layer activation. + dropout_rate: Dropout probability. + initializer: Initializer for dense layer kernels. + **kwargs: Keyword arguments. 
+ """ + super().__init__(**kwargs) + self.dropout_rate = dropout_rate + self.inner_dim = inner_dim + self.cls_list = cls_list + self.activation = tf_utils.get_activation(activation) + self.initializer = tf.keras.initializers.get(initializer) + self.cls_token_idx = cls_token_idx + + if self.inner_dim: + self.dense = tf.keras.layers.Dense( + units=inner_dim, + activation=self.activation, + kernel_initializer=self.initializer, + name="pooler_dense") + self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate) + self.out_projs = [] + for name, num_classes in cls_list: + self.out_projs.append( + tf.keras.layers.Dense( + units=num_classes, kernel_initializer=self.initializer, + name=name)) + + def call(self, features): + if not self.inner_dim: + x = features + else: + x = features[:, self.cls_token_idx, :] # take token. + x = self.dense(x) + x = self.dropout(x) + + outputs = {} + for proj_layer in self.out_projs: + outputs[proj_layer.name] = proj_layer(x) + return outputs + + def get_config(self): + config = { + "dropout_rate": self.dropout_rate, + "cls_token_idx": self.cls_token_idx, + "cls_list": self.cls_list, + "inner_dim": self.inner_dim, + "activation": tf.keras.activations.serialize(self.activation), + "initializer": tf.keras.initializers.serialize(self.initializer), + } + config.update(super().get_config()) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + @property + def checkpoint_items(self): + items = {self.dense.name: self.dense} + items.update({v.name: v for v in self.out_projs}) + return items + + +class GaussianProcessClassificationHead(ClassificationHead): + """Gaussian process-based pooling head for sentence classification. + + This class implements a classifier head for BERT encoder that is based on the + spectral-normalized neural Gaussian process (SNGP) [1]. SNGP is a simple + method to improve a neural network's uncertainty quantification ability + without sacrificing accuracy or lantency. It applies spectral normalization to + the hidden pooler layer, and then replaces the dense output layer with a + Gaussian process. + + + [1]: Jeremiah Liu et al. Simple and Principled Uncertainty Estimation with + Deterministic Deep Learning via Distance Awareness. + In _Neural Information Processing Systems_, 2020. + https://arxiv.org/abs/2006.10108 + """ + + def __init__(self, + inner_dim, + num_classes, + cls_token_idx=0, + activation="tanh", + dropout_rate=0.0, + initializer="glorot_uniform", + use_spec_norm=True, + use_gp_layer=True, + temperature=None, + **kwargs): + """Initializes the `GaussianProcessClassificationHead`. + + Args: + inner_dim: The dimensionality of inner projection layer. If 0 or `None` + then only the output projection layer is created. + num_classes: Number of output classes. + cls_token_idx: The index inside the sequence to pool. + activation: Dense layer activation. + dropout_rate: Dropout probability. + initializer: Initializer for dense layer kernels. + use_spec_norm: Whether to apply spectral normalization to pooler layer. + use_gp_layer: Whether to use Gaussian process as the output layer. + temperature: The temperature parameter to be used for mean-field + approximation during inference. If None then no mean-field adjustment is + applied. + **kwargs: Additional keyword arguments. + """ + # Collects spectral normalization and Gaussian process args from kwargs. 
+ self.use_spec_norm = use_spec_norm + self.use_gp_layer = use_gp_layer + self.spec_norm_kwargs = extract_spec_norm_kwargs(kwargs) + self.gp_layer_kwargs = extract_gp_layer_kwargs(kwargs) + self.temperature = temperature + + super().__init__( + inner_dim=inner_dim, + num_classes=num_classes, + cls_token_idx=cls_token_idx, + activation=activation, + dropout_rate=dropout_rate, + initializer=initializer, + **kwargs) + + # Applies spectral normalization to the dense pooler layer. + if self.use_spec_norm and hasattr(self, "dense"): + self.dense = spectral_normalization.SpectralNormalization( + self.dense, inhere_layer_name=True, **self.spec_norm_kwargs) + + # Replace Dense output layer with the Gaussian process layer. + if use_gp_layer: + self.out_proj = gaussian_process.RandomFeatureGaussianProcess( + self.num_classes, + kernel_initializer=self.initializer, + name="logits", + **self.gp_layer_kwargs) + + def call(self, features, training=False, return_covmat=False): + """Returns model output. + + Dring training, the model returns raw logits. During evaluation, the model + returns uncertainty adjusted logits, and (optionally) the covariance matrix. + + Arguments: + features: A tensor of input features, shape (batch_size, feature_dim). + training: Whether the model is in training mode. + return_covmat: Whether the model should also return covariance matrix if + `use_gp_layer=True`. During training, it is recommended to set + `return_covmat=False` to be compatible with the standard Keras pipelines + (e.g., `model.fit()`). + + Returns: + logits: Uncertainty-adjusted predictive logits, shape + (batch_size, num_classes). + covmat: (Optional) Covariance matrix, shape (batch_size, batch_size). + Returned only when return_covmat=True. + """ + logits = super().call(features) + + # Extracts logits and covariance matrix from model output. + if self.use_gp_layer: + logits, covmat = logits + else: + covmat = None + + # Computes the uncertainty-adjusted logits during evaluation. 
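# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# End-to-end use of `GaussianProcessClassificationHead`: raw logits during
# training, mean-field-adjusted logits (and optionally the covariance matrix)
# at evaluation time. Import path, sizes and the temperature value are
# assumptions.
import tensorflow as tf
from nlp_modeling.layers import cls_head

gp_head = cls_head.GaussianProcessClassificationHead(
    inner_dim=768, num_classes=3, temperature=1.0, num_inducing=512)
features = tf.random.normal([8, 128, 768])
train_logits = gp_head(features, training=True)               # raw MAP logits
eval_logits, covmat = gp_head(features, return_covmat=True)   # adjusted logits
gp_head.reset_covariance_matrix()   # typically called at the start of an epoch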
+ if not training: + logits = gaussian_process.mean_field_logits( + logits, covmat, mean_field_factor=self.temperature) + + if return_covmat and covmat is not None: + return logits, covmat + return logits + + def reset_covariance_matrix(self): + """Resets covariance matrix of the Gaussian process layer.""" + if hasattr(self.out_proj, "reset_covariance_matrix"): + self.out_proj.reset_covariance_matrix() + + def get_config(self): + config = dict( + use_spec_norm=self.use_spec_norm, use_gp_layer=self.use_gp_layer) + + config.update(self.spec_norm_kwargs) + config.update(self.gp_layer_kwargs) + config["temperature"] = self.temperature + + config.update(super(GaussianProcessClassificationHead, self).get_config()) + return config + + +def extract_gp_layer_kwargs(kwargs): + """Extracts Gaussian process layer configs from a given kwarg.""" + + return dict( + num_inducing=kwargs.pop("num_inducing", 1024), + normalize_input=kwargs.pop("normalize_input", True), + gp_cov_momentum=kwargs.pop("gp_cov_momentum", 0.999), + gp_cov_ridge_penalty=kwargs.pop("gp_cov_ridge_penalty", 1.), + scale_random_features=kwargs.pop("scale_random_features", False), + l2_regularization=kwargs.pop("l2_regularization", 1e-6), + gp_cov_likelihood=kwargs.pop("gp_cov_likelihood", "gaussian"), + return_gp_cov=kwargs.pop("return_gp_cov", True), + return_random_features=kwargs.pop("return_random_features", False), + use_custom_random_features=kwargs.pop("use_custom_random_features", True), + custom_random_features_initializer=kwargs.pop( + "custom_random_features_initializer", "random_normal"), + custom_random_features_activation=kwargs.pop( + "custom_random_features_activation", None)) + + +def extract_spec_norm_kwargs(kwargs): + """Extracts spectral normalization configs from a given kwarg.""" + + return dict( + iteration=kwargs.pop("iteration", 1), + norm_multiplier=kwargs.pop("norm_multiplier", .99)) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/dense_einsum.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/dense_einsum.py new file mode 100644 index 000000000..f54c14b72 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/dense_einsum.py @@ -0,0 +1,180 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based einsum layer.""" +# pylint: disable=g-classes-have-attributes + +import tensorflow as tf + +from tensorflow.python.util import deprecation + +_CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"] + + +@tf.keras.utils.register_keras_serializable(package="Text") +class DenseEinsum(tf.keras.layers.Layer): + """A densely connected layer that uses `tf.einsum` as the backing computation. + + This layer can perform einsum calculations of arbitrary dimensionality. + + Args: + output_shape: Positive integer or tuple, dimensionality of the output space. + num_summed_dimensions: The number of dimensions to sum over. 
Standard 2D + matmul should use 1, 3D matmul should use 2, and so forth. + activation: Activation function to use. If you don't specify anything, no + activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. + bias_initializer: Initializer for the bias vector. + kernel_regularizer: Regularizer function applied to the `kernel` weights + matrix. + bias_regularizer: Regularizer function applied to the bias vector. + activity_regularizer: Regularizer function applied to the output of the + layer (its "activation").. + kernel_constraint: Constraint function applied to the `kernel` weights + matrix. + bias_constraint: Constraint function applied to the bias vector. + Input shape: + N-D tensor with shape: `(batch_size, ..., input_dim)`. The most common + situation would be a 2D input with shape `(batch_size, input_dim)`. + Output shape: + N-D tensor with shape: `(batch_size, ..., units)`. For instance, for a 2D + input with shape `(batch_size, input_dim)`, the output would have shape + `(batch_size, units)`. + """ + + @deprecation.deprecated(None, "DenseEinsum is deprecated. Please use " + "tf.keras.experimental.EinsumDense layer instead.") + def __init__(self, + output_shape, + num_summed_dimensions=1, + activation=None, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + super(DenseEinsum, self).__init__(**kwargs) + self._output_shape = output_shape if isinstance( + output_shape, (list, tuple)) else (output_shape,) + self._activation = tf.keras.activations.get(activation) + self._use_bias = use_bias + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._num_summed_dimensions = num_summed_dimensions + self._einsum_string = None + + def _build_einsum_string(self, free_input_dims, bound_dims, output_dims): + input_str = "" + kernel_str = "" + output_str = "" + letter_offset = 0 + for i in range(free_input_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + output_str += char + + letter_offset += free_input_dims + for i in range(bound_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + kernel_str += char + + letter_offset += bound_dims + for i in range(output_dims): + char = _CHR_IDX[i + letter_offset] + kernel_str += char + output_str += char + + return input_str + "," + kernel_str + "->" + output_str + + def build(self, input_shape): + input_shape = tf.TensorShape(input_shape) + input_rank = input_shape.rank + free_input_dims = input_rank - self._num_summed_dimensions + output_dims = len(self._output_shape) + + self._einsum_string = self._build_einsum_string(free_input_dims, + self._num_summed_dimensions, + output_dims) + + # This is only saved for testing purposes. 
+ self._kernel_shape = ( + input_shape[free_input_dims:].concatenate(self._output_shape)) + + self._kernel = self.add_weight( + "kernel", + shape=self._kernel_shape, + initializer=self._kernel_initializer, + regularizer=self._kernel_regularizer, + constraint=self._kernel_constraint, + dtype=self.dtype, + trainable=True) + if self._use_bias: + self._bias = self.add_weight( + "bias", + shape=self._output_shape, + initializer=self._bias_initializer, + regularizer=self._bias_regularizer, + constraint=self._bias_constraint, + dtype=self.dtype, + trainable=True) + else: + self._bias = None + super(DenseEinsum, self).build(input_shape) + + def get_config(self): + config = { + "output_shape": + self._output_shape, + "num_summed_dimensions": + self._num_summed_dimensions, + "activation": + tf.keras.activations.serialize(self._activation), + "use_bias": + self._use_bias, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint) + } + base_config = super(DenseEinsum, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + ret = tf.einsum(self._einsum_string, inputs, self._kernel) + if self._use_bias: + ret += self._bias + if self._activation is not None: + ret = self._activation(ret) + return ret diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gated_feedforward.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gated_feedforward.py new file mode 100644 index 000000000..2de294065 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gated_feedforward.py @@ -0,0 +1,210 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based gated feedforward layer.""" +# pylint: disable=g-classes-have-attributes + +import gin +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package="Text") +@gin.configurable +class GatedFeedforward(tf.keras.layers.Layer): + """Gated linear feedforward layer. + + This layer follows the paper "GLU Variants Improve Transformer" + (https://arxiv.org/abs/2002.05202). In additional, it allows to stack + multiple feedforward blocks and specify the position of dropout layer. + + Args: + intermediate_size: Size of the intermediate layer. + intermediate_activation: Activation for the intermediate layer. + dropout: Dropout probability for the output dropout. + use_gate: Whether to use gated linear units. 
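# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# For the (deprecated) `DenseEinsum` above, a rank-3 input with one summed
# dimension builds the einsum string "abc,cd->abd", i.e. an ordinary dense
# projection of the last axis. Import path and sizes are assumptions.
import tensorflow as tf
from nlp_modeling.layers import dense_einsum

layer = dense_einsum.DenseEinsum(output_shape=512, num_summed_dimensions=1)
out = layer(tf.random.normal([8, 128, 256]))   # kernel (256, 512) -> [8, 128, 512]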
If True, assuming `GELU` as the + activation and omitting bias, will apply + `GEGLU(x, W, V, W_2) = (GEGLU(xW) * xV)W2`; if False, will follow + "Attention Is All You Need" (https://arxiv.org/abs/1706.03762) paper and + apply `FFN(x, W, W_2) = GELU(xW_1)W_2.` + num_blocks: The number of feedforward blocks to stack. Each block contains a + (gated) linear layer and a fully connected layer followed by dropout, + layer norm and residual. + dropout_position: Where to apply the dropout, the value can be either + `before_residual` or `after_residual`. If `before_residual`, will apply + `layer_output = layer_norm(dropout(layer_output) + layer_input)`; if + `after residual`, will apply + `layer_output = dropout(layer_norm(layer_output + layer_input))`. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + """ + + def __init__(self, + intermediate_size, + intermediate_activation, + dropout, + use_gate=True, + apply_output_layer_norm=True, + num_blocks=1, + dropout_position="before_residual", + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + super(GatedFeedforward, self).__init__(**kwargs) + self._intermediate_size = intermediate_size + self._intermediate_activation = intermediate_activation + self._dropout = dropout + self._use_gate = use_gate + self._num_blocks = num_blocks + self._apply_output_layer_norm = apply_output_layer_norm + self._dropout_position = dropout_position + if self._dropout_position not in ("before_residual", "after_residual"): + raise ValueError( + "The dropout_position should be either `before_residual` or" + "`after_residual`, got: %s" % self._dropout_position) + + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + + def build(self, input_shape): + hidden_size = input_shape.as_list()[-1] + + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + self._intermediate_dense = [] + self._intermediate_activation_layers = [] + self._gate_dense = [] + self._output_dense = [] + self._output_dropout = [] + self._output_layer_norm = [] + activation_policy = tf.keras.mixed_precision.global_policy() + if activation_policy.name == "mixed_bfloat16": + # bfloat16 causes BERT with the LAMB optimizer to not converge + # as well, so we use float32. + # TODO(b/154538392): Investigate this. 
+ activation_policy = tf.float32 + for i in range(self._num_blocks): + self._intermediate_dense.append( + tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, self._intermediate_size), + bias_axes="d", + name="intermediate_%d" % i, + **common_kwargs)) + self._intermediate_activation_layers.append( + tf.keras.layers.Activation( + self._intermediate_activation, dtype=activation_policy)) + if self._use_gate: + self._gate_dense.append( + tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, self._intermediate_size), + bias_axes="d", + name="gate_%d" % i, + **common_kwargs)) + self._output_dense.append( + tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, hidden_size), + bias_axes="d", + name="output_%d" % i, + **common_kwargs)) + self._output_dropout.append(tf.keras.layers.Dropout(rate=self._dropout)) + # Use float32 in layernorm for numeric stability. + if self._apply_output_layer_norm: + self._output_layer_norm.append( + tf.keras.layers.LayerNormalization( + name="output_layer_norm_%d" % i, + axis=-1, + epsilon=1e-12, + dtype=tf.float32)) + + def get_config(self): + config = { + "intermediate_size": + self._intermediate_size, + "intermediate_activation": + self._intermediate_activation, + "dropout": + self._dropout, + "use_gate": + self._use_gate, + "num_blocks": + self._num_blocks, + "dropout_position": + self._dropout_position, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint) + } + base_config = super(GatedFeedforward, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + layer_output = inputs + for i in range(self._num_blocks): + layer_input = layer_output + intermediate_output = self._intermediate_dense[i](layer_input) + intermediate_output = self._intermediate_activation_layers[i]( + intermediate_output) + if self._use_gate: + gated_linear = self._gate_dense[i](layer_input) + intermediate_output = intermediate_output * gated_linear + + layer_output = self._output_dense[i](intermediate_output) + if self._dropout_position == "before_residual": + layer_output = self._output_dropout[i](layer_output) + + # During mixed precision training, `layer_input` may be from layer norm. + # If so, it is always fp32. Cast layer_output to fp32 for the subsequent + # add. 
+ if layer_input.dtype == tf.float32: + layer_output = tf.cast(layer_output, tf.float32) + if self._apply_output_layer_norm: + layer_output = self._output_layer_norm[i](layer_output + layer_input) + if self._dropout_position == "after_residual": + layer_output = self._output_dropout[i](layer_output) + + return layer_output diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gaussian_process.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gaussian_process.py new file mode 100644 index 000000000..3729d8ee6 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/gaussian_process.py @@ -0,0 +1,495 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Definitions for random feature Gaussian process layer.""" +import math +import tensorflow as tf + + +_SUPPORTED_LIKELIHOOD = ('binary_logistic', 'poisson', 'gaussian') + + +class RandomFeatureGaussianProcess(tf.keras.layers.Layer): + """Gaussian process layer with random feature approximation [1]. + + During training, the model updates the maximum a posteriori (MAP) logits + estimates and posterior precision matrix using minibatch statistics. During + inference, the model divides the MAP logit estimates by the predictive + standard deviation, which is equivalent to approximating the posterior mean + of the predictive probability via the mean-field approximation. + + User can specify different types of random features by setting + `use_custom_random_features=True`, and change the initializer and activations + of the custom random features. For example: + + MLP Kernel: initializer='random_normal', activation=tf.nn.relu + RBF Kernel: initializer='random_normal', activation=tf.math.cos + + A linear kernel can also be specified by setting gp_kernel_type='linear' and + `use_custom_random_features=True`. + + [1]: Ali Rahimi and Benjamin Recht. Random Features for Large-Scale Kernel + Machines. In _Neural Information Processing Systems_, 2007. + https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf + + Attributes: + units: (int) The dimensionality of layer. + num_inducing: (int) The number of random features for the approximation. + is_training: (tf.bool) Whether the layer is set in training mode. If so the + layer updates the Gaussian process' variance estimate using statistics + computed from the incoming minibatches. 
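# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# The `GatedFeedforward` layer defined earlier keeps the hidden size of its
# input and, with `use_gate=True`, multiplies the activated projection by a
# second linear "gate" projection (the GEGLU variant). Import path and the
# sizes below are assumptions.
import tensorflow as tf
from nlp_modeling.layers import gated_feedforward

ffn = gated_feedforward.GatedFeedforward(
    intermediate_size=3072, intermediate_activation="gelu", dropout=0.1,
    use_gate=True, num_blocks=1, dropout_position="before_residual")
out = ffn(tf.random.normal([8, 128, 768]))   # output shape stays [8, 128, 768]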
+ """ + + def __init__(self, + units, + num_inducing=1024, + gp_kernel_type='gaussian', + gp_kernel_scale=1., + gp_output_bias=0., + normalize_input=False, + gp_kernel_scale_trainable=False, + gp_output_bias_trainable=False, + gp_cov_momentum=0.999, + gp_cov_ridge_penalty=1., + scale_random_features=True, + use_custom_random_features=True, + custom_random_features_initializer=None, + custom_random_features_activation=None, + l2_regularization=1e-6, + gp_cov_likelihood='gaussian', + return_gp_cov=True, + return_random_features=False, + dtype=None, + name='random_feature_gaussian_process', + **gp_output_kwargs): + """Initializes a random-feature Gaussian process layer instance. + + Args: + units: (int) Number of output units. + num_inducing: (int) Number of random Fourier features used for + approximating the Gaussian process. + gp_kernel_type: (string) The type of kernel function to use for Gaussian + process. Currently default to 'gaussian' which is the Gaussian RBF + kernel. + gp_kernel_scale: (float) The length-scale parameter of the a + shift-invariant kernel function, i.e., for RBF kernel: + exp(-|x1 - x2|**2 / gp_kernel_scale). + gp_output_bias: (float) Scalar initial value for the bias vector. + normalize_input: (bool) Whether to normalize the input to Gaussian + process. + gp_kernel_scale_trainable: (bool) Whether the length scale variable is + trainable. + gp_output_bias_trainable: (bool) Whether the bias is trainable. + gp_cov_momentum: (float) A discount factor used to compute the moving + average for posterior covariance matrix. + gp_cov_ridge_penalty: (float) Initial Ridge penalty to posterior + covariance matrix. + scale_random_features: (bool) Whether to scale the random feature + by sqrt(2. / num_inducing). + use_custom_random_features: (bool) Whether to use custom random + features implemented using tf.keras.layers.Dense. + custom_random_features_initializer: (str or callable) Initializer for + the random features. Default to random normal which approximates a RBF + kernel function if activation function is cos. + custom_random_features_activation: (callable) Activation function for the + random feature layer. Default to cosine which approximates a RBF + kernel function. + l2_regularization: (float) The strength of l2 regularization on the output + weights. + gp_cov_likelihood: (string) Likelihood to use for computing Laplace + approximation for covariance matrix. Default to `gaussian`. + return_gp_cov: (bool) Whether to also return GP covariance matrix. + If False then no covariance learning is performed. + return_random_features: (bool) Whether to also return random features. + dtype: (tf.DType) Input data type. + name: (string) Layer name. + **gp_output_kwargs: Additional keyword arguments to dense output layer. + """ + super(RandomFeatureGaussianProcess, self).__init__(name=name, dtype=dtype) + self.units = units + self.num_inducing = num_inducing + + self.normalize_input = normalize_input + self.gp_input_scale = 1. / tf.sqrt(gp_kernel_scale) + self.gp_feature_scale = tf.sqrt(2. 
/ float(num_inducing)) + + self.scale_random_features = scale_random_features + self.return_random_features = return_random_features + self.return_gp_cov = return_gp_cov + + self.gp_kernel_type = gp_kernel_type + self.gp_kernel_scale = gp_kernel_scale + self.gp_output_bias = gp_output_bias + self.gp_kernel_scale_trainable = gp_kernel_scale_trainable + self.gp_output_bias_trainable = gp_output_bias_trainable + + self.use_custom_random_features = use_custom_random_features + self.custom_random_features_initializer = custom_random_features_initializer + self.custom_random_features_activation = custom_random_features_activation + + self.l2_regularization = l2_regularization + self.gp_output_kwargs = gp_output_kwargs + + self.gp_cov_momentum = gp_cov_momentum + self.gp_cov_ridge_penalty = gp_cov_ridge_penalty + self.gp_cov_likelihood = gp_cov_likelihood + + if self.use_custom_random_features: + # Default to Gaussian RBF kernel. + self.random_features_bias_initializer = tf.random_uniform_initializer( + minval=0., maxval=2. * math.pi) + if self.custom_random_features_initializer is None: + self.custom_random_features_initializer = ( + tf.keras.initializers.RandomNormal(stddev=1.)) + if self.custom_random_features_activation is None: + self.custom_random_features_activation = tf.math.cos + + def build(self, input_shape): + # Defines model layers. + if self.normalize_input: + self._input_norm_layer = tf.keras.layers.LayerNormalization( + name='gp_input_normalization') + self._input_norm_layer.build(input_shape) + input_shape = self._input_norm_layer.compute_output_shape(input_shape) + + self._random_feature = self._make_random_feature_layer( + name='gp_random_feature') + self._random_feature.build(input_shape) + input_shape = self._random_feature.compute_output_shape(input_shape) + + if self.return_gp_cov: + self._gp_cov_layer = LaplaceRandomFeatureCovariance( + momentum=self.gp_cov_momentum, + ridge_penalty=self.gp_cov_ridge_penalty, + likelihood=self.gp_cov_likelihood, + dtype=self.dtype, + name='gp_covariance') + self._gp_cov_layer.build(input_shape) + + self._gp_output_layer = tf.keras.layers.Dense( + units=self.units, + use_bias=False, + kernel_regularizer=tf.keras.regularizers.l2(self.l2_regularization), + dtype=self.dtype, + name='gp_output_weights', + **self.gp_output_kwargs) + self._gp_output_layer.build(input_shape) + + self._gp_output_bias = tf.Variable( + initial_value=[self.gp_output_bias] * self.units, + dtype=self.dtype, + trainable=self.gp_output_bias_trainable, + name='gp_output_bias') + + self.built = True + + def _make_random_feature_layer(self, name): + """Defines random feature layer depending on kernel type.""" + if not self.use_custom_random_features: + # Use default RandomFourierFeatures layer from tf.keras. + return tf.keras.layers.experimental.RandomFourierFeatures( + output_dim=self.num_inducing, + kernel_initializer=self.gp_kernel_type, + scale=self.gp_kernel_scale, + trainable=self.gp_kernel_scale_trainable, + dtype=self.dtype, + name=name) + + if self.gp_kernel_type.lower() == 'linear': + custom_random_feature_layer = tf.keras.layers.Lambda( + lambda x: x, name=name) + else: + # Use user-supplied configurations. 
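# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# Stand-alone use of `RandomFeatureGaussianProcess`: with the default settings
# it returns the MAP logits plus a per-batch predictive covariance matrix.
# Import path and sizes are assumptions.
import tensorflow as tf
from nlp_modeling.layers import gaussian_process

gp_layer = gaussian_process.RandomFeatureGaussianProcess(
    units=10, num_inducing=1024, normalize_input=True)
logits, covmat = gp_layer(tf.random.normal([32, 768]), training=False)
# logits: [32, 10]; covmat: [32, 32] posterior predictive covariance.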
+ custom_random_feature_layer = tf.keras.layers.Dense( + units=self.num_inducing, + use_bias=True, + activation=self.custom_random_features_activation, + kernel_initializer=self.custom_random_features_initializer, + bias_initializer=self.random_features_bias_initializer, + trainable=False, + name=name) + + return custom_random_feature_layer + + def reset_covariance_matrix(self): + """Resets covariance matrix of the GP layer. + + This function is useful for reseting the model's covariance matrix at the + begining of a new epoch. + """ + self._gp_cov_layer.reset_precision_matrix() + + def call(self, inputs, global_step=None, training=None): + # Computes random features. + gp_inputs = inputs + if self.normalize_input: + gp_inputs = self._input_norm_layer(gp_inputs) + elif self.use_custom_random_features: + # Supports lengthscale for custom random feature layer by directly + # rescaling the input. + gp_input_scale = tf.cast(self.gp_input_scale, inputs.dtype) + gp_inputs = gp_inputs * gp_input_scale + + gp_feature = self._random_feature(gp_inputs) + + if self.scale_random_features: + # Scale random feature by 2. / sqrt(num_inducing) following [1]. + # When using GP layer as the output layer of a nerual network, + # it is recommended to turn this scaling off to prevent it from changing + # the learning rate to the hidden layers. + gp_feature_scale = tf.cast(self.gp_feature_scale, inputs.dtype) + gp_feature = gp_feature * gp_feature_scale + + # Computes posterior center (i.e., MAP estimate) and variance. + gp_output = self._gp_output_layer(gp_feature) + self._gp_output_bias + + if self.return_gp_cov: + gp_covmat = self._gp_cov_layer(gp_feature, gp_output, training) + + # Assembles model output. + model_output = [gp_output,] + if self.return_gp_cov: + model_output.append(gp_covmat) + if self.return_random_features: + model_output.append(gp_feature) + + return model_output + + +class LaplaceRandomFeatureCovariance(tf.keras.layers.Layer): + """Computes the Gaussian Process covariance using Laplace method. + + At training time, this layer updates the Gaussian process posterior using + model features in minibatches. + + Attributes: + momentum: (float) A discount factor used to compute the moving average for + posterior precision matrix. Analogous to the momentum factor in batch + normalization. If -1 then update covariance matrix using a naive sum + without momentum, which is desirable if the goal is to compute the exact + covariance matrix by passing through data once (say in the final epoch). + ridge_penalty: (float) Initial Ridge penalty to weight covariance matrix. + This value is used to stablize the eigenvalues of weight covariance + estimate so that the matrix inverse can be computed for Cov = inv(t(X) * X + + s * I). The ridge factor s cannot be too large since otherwise it will + dominate the t(X) * X term and make covariance estimate not meaningful. + likelihood: (str) The likelihood to use for computing Laplace approximation + for the covariance matrix. Can be one of ('binary_logistic', 'poisson', + 'gaussian'). + """ + + def __init__(self, + momentum=0.999, + ridge_penalty=1., + likelihood='gaussian', + dtype=None, + name='laplace_covariance'): + if likelihood not in _SUPPORTED_LIKELIHOOD: + raise ValueError( + f'"likelihood" must be one of {_SUPPORTED_LIKELIHOOD}, got {likelihood}.' 
+ ) + self.ridge_penalty = ridge_penalty + self.momentum = momentum + self.likelihood = likelihood + super(LaplaceRandomFeatureCovariance, self).__init__(dtype=dtype, name=name) + + def compute_output_shape(self, input_shape): + gp_feature_dim = input_shape[-1] + return tf.TensorShape([gp_feature_dim, gp_feature_dim]) + + def build(self, input_shape): + gp_feature_dim = input_shape[-1] + + # Convert gp_feature_dim to int value for TF1 compatibility. + if isinstance(gp_feature_dim, tf.compat.v1.Dimension): + gp_feature_dim = gp_feature_dim.value + + # Posterior precision matrix for the GP's random feature coefficients. + self.initial_precision_matrix = ( + self.ridge_penalty * tf.eye(gp_feature_dim, dtype=self.dtype)) + + self.precision_matrix = ( + self.add_weight( + name='gp_precision_matrix', + shape=(gp_feature_dim, gp_feature_dim), + dtype=self.dtype, + initializer=tf.keras.initializers.Identity(self.ridge_penalty), + trainable=False, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA)) + self.built = True + + def make_precision_matrix_update_op(self, + gp_feature, + logits, + precision_matrix): + """Defines update op for the precision matrix of feature weights.""" + if self.likelihood != 'gaussian': + if logits is None: + raise ValueError( + f'"logits" cannot be None when likelihood={self.likelihood}') + + if logits.shape[-1] != 1: + raise ValueError( + f'likelihood={self.likelihood} only support univariate logits.' + f'Got logits dimension: {logits.shape[-1]}') + + batch_size = tf.shape(gp_feature)[0] + batch_size = tf.cast(batch_size, dtype=gp_feature.dtype) + + # Computes batch-specific normalized precision matrix. + if self.likelihood == 'binary_logistic': + prob = tf.sigmoid(logits) + prob_multiplier = prob * (1. - prob) + elif self.likelihood == 'poisson': + prob_multiplier = tf.exp(logits) + else: + prob_multiplier = 1. + + gp_feature_adjusted = tf.sqrt(prob_multiplier) * gp_feature + precision_matrix_minibatch = tf.matmul( + gp_feature_adjusted, gp_feature_adjusted, transpose_a=True) + + # Updates the population-wise precision matrix. + if self.momentum > 0: + # Use moving-average updates to accumulate batch-specific precision + # matrices. + precision_matrix_minibatch = precision_matrix_minibatch / batch_size + precision_matrix_new = ( + self.momentum * precision_matrix + + (1. - self.momentum) * precision_matrix_minibatch) + else: + # Compute exact population-wise covariance without momentum. + # If use this option, make sure to pass through data only once. + precision_matrix_new = precision_matrix + precision_matrix_minibatch + + # Returns the update op. + return precision_matrix.assign(precision_matrix_new) + + def reset_precision_matrix(self): + """Resets precision matrix to its initial value. + + This function is useful for reseting the model's covariance matrix at the + begining of a new epoch. + """ + precision_matrix_reset_op = self.precision_matrix.assign( + self.initial_precision_matrix) + self.add_update(precision_matrix_reset_op) + + def compute_predictive_covariance(self, gp_feature): + """Computes posterior predictive variance. + + Approximates the Gaussian process posterior using random features. + Given training random feature Phi_tr (num_train, num_hidden) and testing + random feature Phi_ts (batch_size, num_hidden). 
The predictive covariance + matrix is computed as (assuming Gaussian likelihood): + + s * Phi_ts @ inv(t(Phi_tr) * Phi_tr + s * I) @ t(Phi_ts), + + where s is the ridge factor to be used for stablizing the inverse, and I is + the identity matrix with shape (num_hidden, num_hidden). + + Args: + gp_feature: (tf.Tensor) The random feature of testing data to be used for + computing the covariance matrix. Shape (batch_size, gp_hidden_size). + + Returns: + (tf.Tensor) Predictive covariance matrix, shape (batch_size, batch_size). + """ + # Computes the covariance matrix of the feature coefficient. + feature_cov_matrix = tf.linalg.inv(self.precision_matrix) + + # Computes the covariance matrix of the gp prediction. + cov_feature_product = tf.matmul( + feature_cov_matrix, gp_feature, transpose_b=True) * self.ridge_penalty + gp_cov_matrix = tf.matmul(gp_feature, cov_feature_product) + return gp_cov_matrix + + def _get_training_value(self, training=None): + if training is None: + training = tf.keras.backend.learning_phase() + + if isinstance(training, int): + training = bool(training) + + return training + + def call(self, inputs, logits=None, training=None): + """Minibatch updates the GP's posterior precision matrix estimate. + + Args: + inputs: (tf.Tensor) GP random features, shape (batch_size, + gp_hidden_size). + logits: (tf.Tensor) Pre-activation output from the model. Needed + for Laplace approximation under a non-Gaussian likelihood. + training: (tf.bool) whether or not the layer is in training mode. If in + training mode, the gp_weight covariance is updated using gp_feature. + + Returns: + gp_stddev (tf.Tensor): GP posterior predictive variance, + shape (batch_size, batch_size). + """ + batch_size = tf.shape(inputs)[0] + training = self._get_training_value(training) + + if training: + # Define and register the update op for feature precision matrix. + precision_matrix_update_op = self.make_precision_matrix_update_op( + gp_feature=inputs, + logits=logits, + precision_matrix=self.precision_matrix) + self.add_update(precision_matrix_update_op) + # Return null estimate during training. + return tf.eye(batch_size, dtype=self.dtype) + else: + # Return covariance estimate during inference. + return self.compute_predictive_covariance(gp_feature=inputs) + + +def mean_field_logits(logits, covariance_matrix=None, mean_field_factor=1.): + """Adjust the model logits so its softmax approximates the posterior mean [1]. + + [1]: Zhiyun Lu, Eugene Ie, Fei Sha. Uncertainty Estimation with Infinitesimal + Jackknife. _arXiv preprint arXiv:2006.07584_, 2020. + https://arxiv.org/abs/2006.07584 + + Arguments: + logits: A float tensor of shape (batch_size, num_classes). + covariance_matrix: The covariance matrix of shape (batch_size, batch_size). + If None then it assumes the covariance_matrix is an identity matrix. + mean_field_factor: The scale factor for mean-field approximation, used to + adjust the influence of posterior variance in posterior mean + approximation. If covariance_matrix=None then it is used as the + temperature parameter for temperature scaling. + + Returns: + Tensor of adjusted logits, shape (batch_size, num_classes). + """ + if mean_field_factor is None or mean_field_factor < 0: + return logits + + # Compute standard deviation. + if covariance_matrix is None: + variances = 1. + else: + variances = tf.linalg.diag_part(covariance_matrix) + + # Compute scaling coefficient for mean-field approximation. + logits_scale = tf.sqrt(1. 
+ variances * mean_field_factor) + + if len(logits.shape) > 1: + # Cast logits_scale to compatible dimension. + logits_scale = tf.expand_dims(logits_scale, axis=-1) + + return logits / logits_scale diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/kernel_attention.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/kernel_attention.py new file mode 100644 index 000000000..d68149391 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/kernel_attention.py @@ -0,0 +1,396 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based kernel attention layer.""" + +import functools +import math +import tensorflow as tf + +_NUMERIC_STABLER = 1e-6 + + +class KernelMask(tf.keras.layers.Layer): + """Creates kernel attention mask. + + inputs: from_tensor: 2D or 3D Tensor of shape + [batch_size, from_seq_length, ...]. + mask: a Tensor of shape [batch_size, from_seq_length] which indicates + which part of the inputs we should not attend. + + Returns: + float Tensor of shape [batch_size, from_seq_length] that KernelAttention + takes as mask. + """ + + def call(self, inputs, mask): + mask = tf.cast(mask, inputs.dtype) + return mask + + +def create_projection_matrix(m, d, seed=None): + r"""Constructs the matrix of random projections. + + Constructs a matrix of random orthogonal projections. Each projection vector + has direction chosen uniformly at random length taken from the + \chi(d) distribution.). + + Args: + m: number of random projections. + d: dimensionality of each random projection. + seed: random seed used to construct projections. If not, we use the stateful + api. + + Returns: + The matrix of random projections of the shape [m, d]. + """ + nb_full_blocks = math.ceil(m / d) + block_list = tf.TensorArray(tf.float32, + size=tf.cast(nb_full_blocks, dtype=tf.int32)) + stateful = False + if seed is None: + stateful = True + # dummy seed to make sure the graph compiles though the path is not taken. + seed = tf.constant([0, 1]) + current_seed = seed + for i in range(nb_full_blocks): + if stateful: + unstructured_block = tf.random.normal((d, d)) + else: + unstructured_block = tf.random.stateless_normal((d, d), seed=current_seed) + current_seed = tf.random.stateless_uniform([2], + seed=current_seed, + minval=None, + dtype=tf.int32) + q, _ = tf.linalg.qr(unstructured_block) + q = tf.transpose(q) + block_list = block_list.write(i, q) + final_matrix = block_list.concat()[:m] + if stateful is None: + multiplier = tf.norm(tf.random.normal((m, d)), axis=1) + else: + multiplier = tf.norm( + tf.random.stateless_normal((m, d), seed=current_seed), axis=1) + return tf.linalg.matmul(tf.linalg.diag(multiplier), final_matrix) + + +def _generalized_kernel(x, projection_matrix, is_query, f, h, + data_normalizer_fn=None): + """Generalized kernel in RETHINKING ATTENTION WITH PERFORMERS. + + Args: + x: The feature being transformed with shape [B, T, N ,H]. 
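# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# `mean_field_logits` above divides each row of logits by
# sqrt(1 + variance * mean_field_factor), so examples with larger predictive
# variance are pushed toward a flatter distribution. The variances and the
# pi/8 factor below are illustrative values, not prescribed by this file.
import math
import tensorflow as tf
from nlp_modeling.layers import gaussian_process

logits = tf.constant([[2.0, -1.0], [0.5, 0.5]])
covmat = tf.linalg.diag([4.0, 1.0])          # per-example predictive variances
adjusted = gaussian_process.mean_field_logits(
    logits, covmat, mean_field_factor=math.pi / 8.)
# Row 0 is scaled by 1/sqrt(1 + 4*pi/8), row 1 by 1/sqrt(1 + pi/8).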
+ projection_matrix: The matrix with shape [M, H] that we projecct x to, where + M is the number of projections. + is_query: Whether the transform is a query or key. This transform is + symmetric is the argument is not used. + f: A non-linear function applied on x or projected x. + h: A muliplier which is a function of x applied after projected and + transformed. Only applied if projection_matrix is not None. + data_normalizer_fn: A function which takes x and returns a scalar that + normalize data. + + Returns: + Transformed feature. + """ + # No asymmetric operations. + del is_query + + if data_normalizer_fn is not None: + x = data_normalizer_fn(x) + + if projection_matrix is None: + return h(x) * f(x) + else: + x_projected = tf.einsum("BTNH,MH->BTNM", x, projection_matrix) + return h(x) * f(x_projected) / tf.math.sqrt( + tf.cast(tf.shape(projection_matrix)[0], tf.float32)) + + +# pylint: disable=g-long-lambda +_TRANSFORM_MAP = { + "elu": + functools.partial( + _generalized_kernel, + f=lambda x: tf.keras.activations.elu(x) + 1, + h=lambda x: 1), + "relu": + functools.partial( + _generalized_kernel, f=tf.keras.activations.relu, h=lambda x: 1), + "square": + functools.partial( + _generalized_kernel, f=tf.math.square, h=lambda x: 1), + "exp": + functools.partial( + _generalized_kernel, + # Avoid exp explosion by shifting. + f=lambda x: tf.math.exp( + x - tf.math.reduce_max(x, axis=[1, 2, 3], keepdims=True)), + h=lambda x: tf.math.exp( + -0.5 * tf.math.reduce_sum( + tf.math.square(x), axis=-1, keepdims=True)), + data_normalizer_fn=lambda x: x / + (tf.math.sqrt(tf.math.sqrt(tf.cast(tf.shape(x)[-1], tf.float32))))), + "expmod": + functools.partial( + _generalized_kernel, + # Avoid exp explosion by shifting. + f=lambda x: tf.math.exp( + x - tf.math.reduce_max(x, axis=[1, 2, 3], keepdims=True)), + h=lambda x: tf.math.exp( + -0.5 * tf.math.sqrt(tf.cast(tf.shape(x)[-1], tf.float32))), + data_normalizer_fn=lambda x: x / + (tf.math.sqrt(tf.math.sqrt(tf.cast(tf.shape(x)[-1], tf.float32))))), + "l2": + functools.partial( + _generalized_kernel, + f=lambda x: x, + h=lambda x: tf.math.sqrt(tf.cast(tf.shape(x)[-1], tf.float32)), + data_normalizer_fn=lambda x: x), + "identity": lambda x, projection_matrix, is_query: x +} +# pylint: enable=g-long-lambda + + +class KernelAttention(tf.keras.layers.MultiHeadAttention): + """A variant of efficient transformers which replaces softmax with kernels. + + This module combines ideas from the two following papers: + + Rethinking Attention with Performers + (https://arxiv.org/abs/2009.14794) + - exp (Lemma 1, positive), relu, l2 + - random/deterministic projection + + Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention + (https://arxiv.org/abs/2006.16236) + - elu + + with the theory of approximating angular Performer kernels from go/performer. + + The module enables computing efficient attention in both: long sequence and + shorter sequence regimes. In the former setting, the attention matrix is never + explicitly computed and instead its low-rank decomposition obtained with given + kernel feature maps is leveraged to conduct attention module calculations + (see: https://arxiv.org/abs/2006.16236). In the latter setting, attention + matrix is constructed, but kernel features providing dimensionality reduction + are applied, resulting in more efficient computation of the attention matrix. 
+ """ + + def __init__(self, + feature_transform="exp", + num_random_features=256, + seed=0, + redraw=False, + is_short_seq=False, + begin_kernel=0, + **kwargs): + r"""Constructor of KernelAttention. + + Args: + feature_transform: A non-linear transform of the keys and quries. + Possible transforms are "elu", "relu", "square", "exp", "expmod", + "l2", "identity". If = True, it is recommended to choose + feature_transform as "l2". + num_random_features: Number of random features to be used for projection. + if num_random_features <= 0, no production is used before transform. + seed: The seed to begin drawing random features. Once the seed is set, the + psedo number generation is determinisitc. Users should pass different + seed for different layers. For multi-worker, each layer will use the + same projection at each step. + redraw: Whether to redraw projection every forward pass during training. + The argument is only effective when num_random_features > 0. + is_short_seq: boolean predicate indicating whether input data consists of + very short sequences or not; in most cases this should be False + (default option). + begin_kernel: Apply kernel_attention after this sequence id and apply + softmax attention before this. + **kwargs: The same arguments `MultiHeadAttention` layer. + """ + if feature_transform not in _TRANSFORM_MAP: + raise ValueError("Unsupported feature_transform. The supported " + "feature_transform are %s. " + "Got '%s'." % (_TRANSFORM_MAP.keys(), feature_transform)) + if num_random_features <= 0 and redraw: + raise ValueError( + "There is nothing to redraw when num_random_features <= 0.") + self._feature_transform = feature_transform + self._num_random_features = num_random_features + self._redraw = redraw + self._is_short_seq = is_short_seq + self._begin_kernel = begin_kernel + # We use the seed for two scenarios: + # 1. inference + # 2. no redraw + self._seed = seed + + super().__init__(**kwargs) + self._projection_matrix = None + if num_random_features > 0: + self._projection_matrix = create_projection_matrix( + self._num_random_features, self._key_dim, + tf.constant([self._seed, self._seed + 1])) + + def _compute_attention(self, + query, + key, + value, + feature_transform, + is_short_seq, + attention_mask=None, + training=False, + numeric_stabler=_NUMERIC_STABLER): + """Applies kernel attention with query, key, value tensors. + + This function defines the computation inside `call` with projected + multi-head Q, K, V inputs. Users can override this function for customized + attention implementation. + + Args: + query: Projected query `Tensor` of shape `[B, T, N, key_dim]`. + key: Projected key `Tensor` of shape `[B, S, N, key_dim]`. + value: Projected value `Tensor` of shape `[B, S, N, value_dim]`. + feature_transform: A non-linear transform of the keys and quries. + is_short_seq: boolean predicate indicating whether input data consists of + short or long sequences; usually short sequence is defined as having + length L <= 1024. + attention_mask: a boolean mask of shape `[B, S]`, that prevents + attenting to masked positions. Note that the mask is only appied to + the keys. User may want to mask the output if query contains pads. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + numeric_stabler: A scalar value added to avoid divide by 0. + + Returns: + attention_output: Multi-headed outputs of attention computation. 
+ """ + + projection_matrix = None + if self._num_random_features > 0: + if self._redraw and training: + projection_matrix = create_projection_matrix(self._num_random_features, + self._key_dim) + else: + projection_matrix = self._projection_matrix + + key = _TRANSFORM_MAP[feature_transform](key, projection_matrix, False) + query = _TRANSFORM_MAP[feature_transform](query, projection_matrix, True) + + if attention_mask is not None: + key = tf.einsum("BSNH,BS->BSNH", key, attention_mask) + + if is_short_seq: + attention_scores = tf.einsum("BTNH,BSNH->BTSN", query, key) + attention_scores = tf.nn.softmax(attention_scores, axis=2) + attention_output = tf.einsum("BTSN,BSNH->BTNH", attention_scores, value) + return attention_output + else: + kv = tf.einsum("BSNH,BSND->BNDH", key, value) + denominator = 1.0 / ( + tf.einsum("BTNH,BNH->BTN", query, tf.reduce_sum(key, axis=1)) + + _NUMERIC_STABLER) + return tf.einsum("BTNH,BNDH,BTN->BTND", query, kv, denominator) + + def _build_from_signature(self, query, value, key=None): + super()._build_from_signature(query=query, value=value, key=key) + if self._begin_kernel > 0: + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + self._output_dense_softmax = self._make_output_dense( + self._query_shape.rank - 1, common_kwargs, + name="attention_output_softmax") + self._dropout_softmax = tf.keras.layers.Dropout(rate=self._dropout) + + def call(self, + query, + value, + key=None, + attention_mask=None, + training=False): + """Compute attention with kernel mechanism. + + Args: + query: Query `Tensor` of shape `[B, T, dim]`. + value: Value `Tensor` of shape `[B, S, dim]`. + key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will use + `value` for both `key` and `value`, which is the most common case. + attention_mask: a boolean mask of shape `[B, S]`, that prevents + attenting to masked positions. Note that the mask is only appied to + the keys. User may want to mask the output if query contains pads. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + + Returns: + Multi-headed outputs of attention computation. 
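# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# Typical use of `KernelAttention` on a long sequence: it is called like
# `tf.keras.layers.MultiHeadAttention`, with the [batch, seq] mask applied to
# the keys only. Import path, seed and shapes are assumptions.
import tensorflow as tf
from nlp_modeling.layers import kernel_attention

attn = kernel_attention.KernelAttention(
    num_heads=8, key_dim=64, feature_transform="exp",
    num_random_features=256, seed=0)
x = tf.random.normal([2, 1024, 512])
mask = tf.ones([2, 1024])                    # 1 = attend to this key position
out = attn(query=x, value=x, attention_mask=mask)   # [2, 1024, 512]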
+ """ + if not self._built_from_signature: + self._build_from_signature(query=query, value=value, key=key) + if key is None: + key = value + + # N = `num_attention_heads` + # H = `size_per_head` + # `query` = [B, T, N ,H] + query = self._query_dense(query) + + # `key` = [B, S, N, H] + key = self._key_dense(key) + + # `value` = [B, S, N, D] + value = self._value_dense(value) + + if self._begin_kernel > 0: + attention_output_softmax = self._compute_attention( + query[:, :self._begin_kernel], + key, value, "identity", True, attention_mask, training) + attention_output_softmax = self._dropout_softmax(attention_output_softmax) + attention_output_softmax = self._output_dense_softmax( + attention_output_softmax) + + attention_output_kernel = self._compute_attention( + query[:, self._begin_kernel:], + key, value, self._feature_transform, self._is_short_seq, + attention_mask, training) + attention_output_kernel = self._dropout_layer(attention_output_kernel) + attention_output_kernel = self._output_dense( + attention_output_kernel) + attention_output = tf.concat( + [attention_output_softmax, attention_output_kernel], axis=1) + else: + attention_output = self._compute_attention( + query, key, value, self._feature_transform, + self._is_short_seq, attention_mask, training) + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_output = self._dropout_layer(attention_output) + attention_output = self._output_dense(attention_output) + return attention_output + + def get_config(self): + config = { + "feature_transform": self._feature_transform, + "num_random_features": self._num_random_features, + "seed": self._seed, + "redraw": self._redraw, + "is_short_seq": self._is_short_seq, + "begin_kernel": self._begin_kernel, + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_lm.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_lm.py new file mode 100644 index 000000000..4bdea56d4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_lm.py @@ -0,0 +1,20 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Masked language model network.""" +# pylint: disable=g-classes-have-attributes +import keras_nlp + + +MaskedLM = keras_nlp.layers.MaskedLM diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_softmax.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_softmax.py new file mode 100644 index 000000000..06b1994c7 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/masked_softmax.py @@ -0,0 +1,85 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based softmax layer with optional masking.""" +# pylint: disable=g-classes-have-attributes + +import tensorflow as tf + + +def _large_compatible_negative(tensor_type): + """Large negative number as Tensor. + + This function is necessary because the standard value for epsilon + in this module (-1e9) cannot be represented using `tf.float16`. + + Args: + tensor_type: A dtype to determine the type. + + Returns: + A large negative number. + """ + if tensor_type == tf.float16: + return tf.float16.min + return -1e9 + + +@tf.keras.utils.register_keras_serializable(package='Text') +class MaskedSoftmax(tf.keras.layers.Layer): + """Performs a softmax with optional masking on a tensor. + + Args: + mask_expansion_axes: Any axes that should be padded on the mask tensor. + normalization_axes: On which axes the softmax should perform. + """ + + def __init__(self, + mask_expansion_axes=None, + normalization_axes=None, + **kwargs): + self._mask_expansion_axes = mask_expansion_axes + if normalization_axes is None: + self._normalization_axes = (-1,) + else: + self._normalization_axes = normalization_axes + super(MaskedSoftmax, self).__init__(**kwargs) + + def call(self, scores, mask=None): + + if mask is not None: + for _ in range(len(scores.shape) - len(mask.shape)): + mask = tf.expand_dims(mask, axis=self._mask_expansion_axes) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -1.e9 for masked positions. + adder = (1.0 - tf.cast(mask, scores.dtype)) * _large_compatible_negative( + scores.dtype) + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + scores += adder + + if len(self._normalization_axes) == 1: + return tf.nn.softmax(scores, axis=self._normalization_axes[0]) + else: + return tf.math.exp(scores - tf.math.reduce_logsumexp( + scores, axis=self._normalization_axes, keepdims=True)) + + def get_config(self): + config = { + 'mask_expansion_axes': self._mask_expansion_axes, + 'normalization_axes': self._normalization_axes + } + base_config = super(MaskedSoftmax, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mat_mul_with_margin.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mat_mul_with_margin.py new file mode 100644 index 000000000..330a02046 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mat_mul_with_margin.py @@ -0,0 +1,69 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
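# --- Editor's note: illustrative sketch, not part of the upstream diff. ---
# `MaskedSoftmax` above adds -1e9 (or the float16 minimum) to masked score
# positions before the softmax, driving their probabilities toward zero.
# Import path and the shapes below are assumptions.
import tensorflow as tf
from nlp_modeling.layers import masked_softmax

layer = masked_softmax.MaskedSoftmax(mask_expansion_axes=1)
scores = tf.random.normal([2, 4, 8, 8])            # [batch, heads, from, to]
mask = tf.concat([tf.ones([2, 8, 6]), tf.zeros([2, 8, 2])], axis=-1)
probs = layer(scores, mask)   # last two "to" positions get ~0 attention weight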
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Dot product with margin layer."""
+# pylint: disable=g-classes-have-attributes
+
+from typing import Tuple
+# Import libraries
+import tensorflow as tf
+
+from modeling import tf_utils
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+class MatMulWithMargin(tf.keras.layers.Layer):
+  """This layer computes a dot product matrix given two encoded inputs.
+
+  Args:
+    logit_scale: The scaling factor of dot products when doing training.
+    logit_margin: The margin value between the positive and negative examples
+      when doing training.
+  """
+
+  def __init__(self,
+               logit_scale=1.0,
+               logit_margin=0.0,
+               **kwargs):
+    super(MatMulWithMargin, self).__init__(**kwargs)
+    self.logit_scale = logit_scale
+    self.logit_margin = logit_margin
+
+  def call(self, left_encoded: tf.Tensor,
+           right_encoded: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
+    batch_size = tf_utils.get_shape_list(
+        left_encoded, name='sequence_output_tensor')[0]
+
+    # Left -> Right dot product.
+    left_dot_products = tf.matmul(
+        left_encoded, right_encoded, transpose_b=True)
+
+    self.left_logits = self.logit_scale * (
+        left_dot_products - self.logit_margin * tf.eye(batch_size))
+
+    # Right -> Left dot product.
+    self.right_logits = tf.transpose(self.left_logits)
+
+    return (self.left_logits, self.right_logits)
+
+  def get_config(self):
+    config = {
+        'logit_scale': self.logit_scale,
+        'logit_margin': self.logit_margin}
+    config.update(super(MatMulWithMargin, self).get_config())
+    return config
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    return cls(**config)
diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mobile_bert_layers.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mobile_bert_layers.py
new file mode 100644
index 000000000..585a1e413
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/mobile_bert_layers.py
@@ -0,0 +1,554 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +"""MobileBERT embedding and transformer layers.""" +import tensorflow as tf + +import keras_nlp + + +@tf.keras.utils.register_keras_serializable(package='Text') +class NoNorm(tf.keras.layers.Layer): + """Apply element-wise linear transformation to the last dimension.""" + + def __init__(self, name=None): + super(NoNorm, self).__init__(name=name) + + def build(self, shape): + kernal_size = shape[-1] + self.bias = self.add_weight('beta', + shape=[kernal_size], + initializer='zeros') + self.scale = self.add_weight('gamma', + shape=[kernal_size], + initializer='ones') + + def call(self, feature): + output = feature * self.scale + self.bias + return output + + +def _get_norm_layer(normalization_type='no_norm', name=None): + """Get normlization layer. + + Args: + normalization_type: String. The type of normalization_type, only + `no_norm` and `layer_norm` are supported. + name: Name for the norm layer. + + Returns: + layer norm class. + """ + if normalization_type == 'no_norm': + layer = NoNorm(name=name) + elif normalization_type == 'layer_norm': + layer = tf.keras.layers.LayerNormalization( + name=name, + axis=-1, + epsilon=1e-12, + dtype=tf.float32) + else: + raise NotImplementedError('Only "no_norm" and "layer_norm" and supported.') + return layer + + +@tf.keras.utils.register_keras_serializable(package='Text') +class MobileBertEmbedding(tf.keras.layers.Layer): + """Performs an embedding lookup for MobileBERT. + + This layer includes word embedding, token type embedding, position embedding. + """ + + def __init__(self, + word_vocab_size, + word_embed_size, + type_vocab_size, + output_embed_size, + max_sequence_length=512, + normalization_type='no_norm', + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), + dropout_rate=0.1, + **kwargs): + """Class initialization. + + Args: + word_vocab_size: Number of words in the vocabulary. + word_embed_size: Word embedding size. + type_vocab_size: Number of word types. + output_embed_size: Embedding size for the final embedding output. + max_sequence_length: Maximum length of input sequence. + normalization_type: String. The type of normalization_type, only + `no_norm` and `layer_norm` are supported. + initializer: The initializer to use for the embedding weights and + linear projection weights. + dropout_rate: Dropout rate. + **kwargs: keyword arguments. 
+ """ + super(MobileBertEmbedding, self).__init__(**kwargs) + self.word_vocab_size = word_vocab_size + self.word_embed_size = word_embed_size + self.type_vocab_size = type_vocab_size + self.output_embed_size = output_embed_size + self.max_sequence_length = max_sequence_length + self.normalization_type = normalization_type + self.initializer = tf.keras.initializers.get(initializer) + self.dropout_rate = dropout_rate + + self.word_embedding = keras_nlp.layers.OnDeviceEmbedding( + self.word_vocab_size, + self.word_embed_size, + initializer=initializer, + name='word_embedding') + self.type_embedding = keras_nlp.layers.OnDeviceEmbedding( + self.type_vocab_size, + self.output_embed_size, + initializer=initializer, + name='type_embedding') + self.pos_embedding = keras_nlp.layers.PositionEmbedding( + max_length=max_sequence_length, + initializer=initializer, + name='position_embedding') + self.word_embedding_proj = tf.keras.layers.experimental.EinsumDense( + 'abc,cd->abd', + output_shape=[None, self.output_embed_size], + kernel_initializer=initializer, + bias_axes='d', + name='embedding_projection') + self.layer_norm = _get_norm_layer(normalization_type, 'embedding_norm') + self.dropout_layer = tf.keras.layers.Dropout( + self.dropout_rate, + name='embedding_dropout') + + def get_config(self): + config = { + 'word_vocab_size': self.word_vocab_size, + 'word_embed_size': self.word_embed_size, + 'type_vocab_size': self.type_vocab_size, + 'output_embed_size': self.output_embed_size, + 'max_sequence_length': self.max_sequence_length, + 'normalization_type': self.normalization_type, + 'initializer': tf.keras.initializers.serialize(self.initializer), + 'dropout_rate': self.dropout_rate + } + base_config = super(MobileBertEmbedding, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, input_ids, token_type_ids=None): + word_embedding_out = self.word_embedding(input_ids) + word_embedding_out = tf.concat( + [tf.pad(word_embedding_out[:, 1:], ((0, 0), (0, 1), (0, 0))), + word_embedding_out, + tf.pad(word_embedding_out[:, :-1], ((0, 0), (1, 0), (0, 0)))], + axis=2) + word_embedding_out = self.word_embedding_proj(word_embedding_out) + + pos_embedding_out = self.pos_embedding(word_embedding_out) + embedding_out = word_embedding_out + pos_embedding_out + if token_type_ids is not None: + type_embedding_out = self.type_embedding(token_type_ids) + embedding_out += type_embedding_out + embedding_out = self.layer_norm(embedding_out) + embedding_out = self.dropout_layer(embedding_out) + + return embedding_out + + +@tf.keras.utils.register_keras_serializable(package='Text') +class MobileBertTransformer(tf.keras.layers.Layer): + """Transformer block for MobileBERT. + + An implementation of one layer (block) of Transformer with bottleneck and + inverted-bottleneck for MobilerBERT. + + Original paper for MobileBERT: + https://arxiv.org/pdf/2004.02984.pdf + """ + + def __init__(self, + hidden_size=512, + num_attention_heads=4, + intermediate_size=512, + intermediate_act_fn='relu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + intra_bottleneck_size=128, + use_bottleneck_attention=False, + key_query_shared_bottleneck=True, + num_feedforward_networks=4, + normalization_type='no_norm', + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), + **kwargs): + """Class initialization. + + Args: + hidden_size: Hidden size for the Transformer input and output tensor. + num_attention_heads: Number of attention heads in the Transformer. 
+ intermediate_size: The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: Dropout probability for the hidden layers. + attention_probs_dropout_prob: Dropout probability of the attention + probabilities. + intra_bottleneck_size: Size of bottleneck. + use_bottleneck_attention: Use attention inputs from the bottleneck + transformation. If true, the following `key_query_shared_bottleneck` + will be ignored. + key_query_shared_bottleneck: Whether to share linear transformation for + keys and queries. + num_feedforward_networks: Number of stacked feed-forward networks. + normalization_type: The type of normalization_type, only `no_norm` and + `layer_norm` are supported. `no_norm` represents the element-wise + linear transformation for the student model, as suggested by the + original MobileBERT paper. `layer_norm` is used for the teacher model. + initializer: The initializer to use for the embedding weights and + linear projection weights. + **kwargs: keyword arguments. + + Raises: + ValueError: A Tensor shape or parameter is invalid. + """ + super(MobileBertTransformer, self).__init__(**kwargs) + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.intermediate_act_fn = intermediate_act_fn + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.intra_bottleneck_size = intra_bottleneck_size + self.use_bottleneck_attention = use_bottleneck_attention + self.key_query_shared_bottleneck = key_query_shared_bottleneck + self.num_feedforward_networks = num_feedforward_networks + self.normalization_type = normalization_type + self.initializer = tf.keras.initializers.get(initializer) + + if intra_bottleneck_size % num_attention_heads != 0: + raise ValueError( + (f'The bottleneck size {intra_bottleneck_size} is not a multiple ' + f'of the number of attention heads {num_attention_heads}.')) + attention_head_size = int(intra_bottleneck_size / num_attention_heads) + + self.block_layers = {} + # add input bottleneck + dense_layer_2d = tf.keras.layers.experimental.EinsumDense( + 'abc,cd->abd', + output_shape=[None, self.intra_bottleneck_size], + bias_axes='d', + kernel_initializer=initializer, + name='bottleneck_input/dense') + layer_norm = _get_norm_layer(self.normalization_type, + name='bottleneck_input/norm') + self.block_layers['bottleneck_input'] = [dense_layer_2d, + layer_norm] + + if self.key_query_shared_bottleneck: + dense_layer_2d = tf.keras.layers.experimental.EinsumDense( + 'abc,cd->abd', + output_shape=[None, self.intra_bottleneck_size], + bias_axes='d', + kernel_initializer=initializer, + name='kq_shared_bottleneck/dense') + layer_norm = _get_norm_layer(self.normalization_type, + name='kq_shared_bottleneck/norm') + self.block_layers['kq_shared_bottleneck'] = [dense_layer_2d, + layer_norm] + + # add attention layer + attention_layer = tf.keras.layers.MultiHeadAttention( + num_heads=self.num_attention_heads, + key_dim=attention_head_size, + value_dim=attention_head_size, + dropout=self.attention_probs_dropout_prob, + output_shape=self.intra_bottleneck_size, + kernel_initializer=initializer, + name='attention') + layer_norm = _get_norm_layer(self.normalization_type, + name='attention/norm') + self.block_layers['attention'] = [attention_layer, + layer_norm] + + # add stacked feed-forward networks + 
self.block_layers['ffn'] = []
+    for ffn_layer_idx in range(self.num_feedforward_networks):
+      layer_prefix = f'ffn_layer_{ffn_layer_idx}'
+      layer_name = layer_prefix + '/intermediate_dense'
+      intermediate_layer = tf.keras.layers.experimental.EinsumDense(
+          'abc,cd->abd',
+          activation=self.intermediate_act_fn,
+          output_shape=[None, self.intermediate_size],
+          bias_axes='d',
+          kernel_initializer=initializer,
+          name=layer_name)
+      layer_name = layer_prefix + '/output_dense'
+      output_layer = tf.keras.layers.experimental.EinsumDense(
+          'abc,cd->abd',
+          output_shape=[None, self.intra_bottleneck_size],
+          bias_axes='d',
+          kernel_initializer=initializer,
+          name=layer_name)
+      layer_name = layer_prefix + '/norm'
+      layer_norm = _get_norm_layer(self.normalization_type,
+                                   name=layer_name)
+      self.block_layers['ffn'].append([intermediate_layer,
+                                       output_layer,
+                                       layer_norm])
+
+    # add output bottleneck
+    bottleneck = tf.keras.layers.experimental.EinsumDense(
+        'abc,cd->abd',
+        output_shape=[None, self.hidden_size],
+        activation=None,
+        bias_axes='d',
+        kernel_initializer=initializer,
+        name='bottleneck_output/dense')
+    dropout_layer = tf.keras.layers.Dropout(
+        self.hidden_dropout_prob,
+        name='bottleneck_output/dropout')
+    layer_norm = _get_norm_layer(self.normalization_type,
+                                 name='bottleneck_output/norm')
+    self.block_layers['bottleneck_output'] = [bottleneck,
+                                              dropout_layer,
+                                              layer_norm]
+
+  def get_config(self):
+    config = {
+        'hidden_size': self.hidden_size,
+        'num_attention_heads': self.num_attention_heads,
+        'intermediate_size': self.intermediate_size,
+        'intermediate_act_fn': self.intermediate_act_fn,
+        'hidden_dropout_prob': self.hidden_dropout_prob,
+        'attention_probs_dropout_prob': self.attention_probs_dropout_prob,
+        'intra_bottleneck_size': self.intra_bottleneck_size,
+        'use_bottleneck_attention': self.use_bottleneck_attention,
+        'key_query_shared_bottleneck': self.key_query_shared_bottleneck,
+        'num_feedforward_networks': self.num_feedforward_networks,
+        'normalization_type': self.normalization_type,
+        'initializer': tf.keras.initializers.serialize(self.initializer),
+    }
+    base_config = super(MobileBertTransformer, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self,
+           input_tensor,
+           attention_mask=None,
+           return_attention_scores=False):
+    """Implements the forward pass.
+
+    Args:
+      input_tensor: Float tensor of shape
+        `(batch_size, seq_length, hidden_size)`.
+      attention_mask: (optional) int32 tensor of shape
+        `(batch_size, seq_length, seq_length)`, with 1 for positions that can
+        be attended to and 0 in positions that should not be.
+      return_attention_scores: Whether to also return the attention scores.
+
+    Returns:
+      layer_output: Float tensor of shape
+        `(batch_size, seq_length, hidden_size)`.
+      attention_scores (Optional): Only when return_attention_scores is True.
+
+    Raises:
+      ValueError: A Tensor shape or parameter is invalid.
+ """ + input_width = input_tensor.shape.as_list()[-1] + if input_width != self.hidden_size: + raise ValueError( + (f'The width of the input tensor {input_width} != ' + f'hidden size {self.hidden_size}')) + + prev_output = input_tensor + # input bottleneck + dense_layer = self.block_layers['bottleneck_input'][0] + layer_norm = self.block_layers['bottleneck_input'][1] + layer_input = dense_layer(prev_output) + layer_input = layer_norm(layer_input) + + if self.use_bottleneck_attention: + key_tensor = layer_input + query_tensor = layer_input + value_tensor = layer_input + elif self.key_query_shared_bottleneck: + dense_layer = self.block_layers['kq_shared_bottleneck'][0] + layer_norm = self.block_layers['kq_shared_bottleneck'][1] + shared_attention_input = dense_layer(prev_output) + shared_attention_input = layer_norm(shared_attention_input) + key_tensor = shared_attention_input + query_tensor = shared_attention_input + value_tensor = prev_output + else: + key_tensor = prev_output + query_tensor = prev_output + value_tensor = prev_output + + # attention layer + attention_layer = self.block_layers['attention'][0] + layer_norm = self.block_layers['attention'][1] + attention_output, attention_scores = attention_layer( + query_tensor, + value_tensor, + key_tensor, + attention_mask, + return_attention_scores=True, + ) + attention_output = layer_norm(attention_output + layer_input) + + # stacked feed-forward networks + layer_input = attention_output + for ffn_idx in range(self.num_feedforward_networks): + intermediate_layer = self.block_layers['ffn'][ffn_idx][0] + output_layer = self.block_layers['ffn'][ffn_idx][1] + layer_norm = self.block_layers['ffn'][ffn_idx][2] + intermediate_output = intermediate_layer(layer_input) + layer_output = output_layer(intermediate_output) + layer_output = layer_norm(layer_output + layer_input) + layer_input = layer_output + + # output bottleneck + bottleneck = self.block_layers['bottleneck_output'][0] + dropout_layer = self.block_layers['bottleneck_output'][1] + layer_norm = self.block_layers['bottleneck_output'][2] + layer_output = bottleneck(layer_output) + layer_output = dropout_layer(layer_output) + layer_output = layer_norm(layer_output + prev_output) + + if return_attention_scores: + return layer_output, attention_scores + else: + return layer_output + + +@tf.keras.utils.register_keras_serializable(package='Text') +class MobileBertMaskedLM(tf.keras.layers.Layer): + """Masked language model network head for BERT modeling. + + This layer implements a masked language model based on the provided + transformer based encoder. It assumes that the encoder network being passed + has a "get_embedding_table()" method. Different from canonical BERT's masked + LM layer, when the embedding width is smaller than hidden_size, it adds an + extra output weights in shape [vocab_size, (hidden_size - embedding_width)]. + """ + + def __init__(self, + embedding_table, + activation=None, + initializer='glorot_uniform', + output='logits', + **kwargs): + """Class initialization. + + Args: + embedding_table: The embedding table from encoder network. + activation: The activation, if any, for the dense layer. + initializer: The initializer for the dense layer. Defaults to a Glorot + uniform initializer. + output: The output style for this layer. Can be either `logits` or + `predictions`. + **kwargs: keyword arguments. 
+ """ + super(MobileBertMaskedLM, self).__init__(**kwargs) + self.embedding_table = embedding_table + self.activation = activation + self.initializer = tf.keras.initializers.get(initializer) + + if output not in ('predictions', 'logits'): + raise ValueError( + ('Unknown `output` value "%s". `output` can be either "logits" or ' + '"predictions"') % output) + self._output_type = output + + def build(self, input_shape): + self._vocab_size, embedding_width = self.embedding_table.shape + hidden_size = input_shape[-1] + self.dense = tf.keras.layers.Dense( + hidden_size, + activation=self.activation, + kernel_initializer=self.initializer, + name='transform/dense') + + if hidden_size > embedding_width: + self.extra_output_weights = self.add_weight( + 'extra_output_weights', + shape=(self._vocab_size, hidden_size - embedding_width), + initializer=self.initializer, + trainable=True) + elif hidden_size == embedding_width: + self.extra_output_weights = None + else: + raise ValueError( + 'hidden size %d cannot be smaller than embedding width %d.' % + (hidden_size, embedding_width)) + + self.layer_norm = tf.keras.layers.LayerNormalization( + axis=-1, epsilon=1e-12, name='transform/LayerNorm') + self.bias = self.add_weight( + 'output_bias/bias', + shape=(self._vocab_size,), + initializer='zeros', + trainable=True) + + super(MobileBertMaskedLM, self).build(input_shape) + + def call(self, sequence_data, masked_positions): + masked_lm_input = self._gather_indexes(sequence_data, masked_positions) + lm_data = self.dense(masked_lm_input) + lm_data = self.layer_norm(lm_data) + if self.extra_output_weights is None: + lm_data = tf.matmul(lm_data, self.embedding_table, transpose_b=True) + else: + lm_data = tf.matmul( + lm_data, + tf.concat([self.embedding_table, self.extra_output_weights], axis=1), + transpose_b=True) + + logits = tf.nn.bias_add(lm_data, self.bias) + masked_positions_length = masked_positions.shape.as_list()[1] or tf.shape( + masked_positions)[1] + logits = tf.reshape(logits, + [-1, masked_positions_length, self._vocab_size]) + if self._output_type == 'logits': + return logits + return tf.nn.log_softmax(logits) + + def get_config(self): + raise NotImplementedError('MaskedLM cannot be directly serialized because ' + 'it has variable sharing logic.') + + def _gather_indexes(self, sequence_tensor, positions): + """Gathers the vectors at the specific positions. + + Args: + sequence_tensor: Sequence output of `BertModel` layer of shape + `(batch_size, seq_length, num_hidden)` where `num_hidden` is number of + hidden units of `BertModel` layer. + positions: Positions ids of tokens in sequence to mask for pretraining + of with dimension `(batch_size, num_predictions)` where + `num_predictions` is maximum number of tokens to mask out and predict + per each sequence. + + Returns: + Masked out sequence tensor of shape + `(batch_size * num_predictions, num_hidden)`. 
+ """ + sequence_shape = tf.shape(sequence_tensor) + batch_size, seq_length = sequence_shape[0], sequence_shape[1] + width = sequence_tensor.shape.as_list()[2] or sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + + return output_tensor diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/multi_channel_attention.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/multi_channel_attention.py new file mode 100644 index 000000000..227428535 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/multi_channel_attention.py @@ -0,0 +1,173 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Multi-channel Attention.""" +# pylint: disable=g-classes-have-attributes + +import math + +import tensorflow as tf +from modeling import tf_utils +from nlp_modeling.layers import masked_softmax + + +class VotingAttention(tf.keras.layers.Layer): + """Voting Attention layer. + + Args: + num_heads: The number of attention heads. + head_size: Per-head hidden size. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. 
+ """ + + def __init__(self, + num_heads, + head_size, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + super(VotingAttention, self).__init__(**kwargs) + self._num_heads = num_heads + self._head_size = head_size + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + + def build(self, unused_input_shapes): + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + self._query_dense = tf.keras.layers.experimental.EinsumDense( + "BAE,ENH->BANH", + output_shape=(None, self._num_heads, self._head_size), + bias_axes="NH", + name="query", + **common_kwargs) + self._key_dense = tf.keras.layers.experimental.EinsumDense( + "BAE,ENH->BANH", + output_shape=(None, self._num_heads, self._head_size), + bias_axes="NH", + name="key", + **common_kwargs) + super(VotingAttention, self).build(unused_input_shapes) + + def call(self, encoder_outputs, doc_attention_mask): + num_docs = tf_utils.get_shape_list(encoder_outputs, expected_rank=[4])[1] + cls_embeddings = encoder_outputs[:, :, 0, :] + key = self._key_dense(cls_embeddings) + query = self._query_dense(cls_embeddings) + doc_attention_mask = tf.cast(doc_attention_mask, tf.float32) + + key = tf.einsum("BANH,BA->BANH", key, doc_attention_mask) + query = tf.einsum("BANH,BA->BANH", query, doc_attention_mask) + attention_matrix = tf.einsum("BXNH,BYNH->BNXY", query, key) + mask = tf.ones([num_docs, num_docs]) + mask = tf.linalg.set_diag(mask, tf.zeros(num_docs)) + attention_matrix = tf.einsum("BNXY,XY->BNXY", attention_matrix, mask) + doc_attention_probs = tf.einsum("BNAY->BNA", attention_matrix) + doc_attention_probs = tf.einsum("BNA->BA", doc_attention_probs) + infadder = (1.0 - doc_attention_mask) * -100000.0 + return tf.nn.softmax(doc_attention_probs + infadder) + + +class MultiChannelAttention(tf.keras.layers.MultiHeadAttention): + """Multi-channel Attention layer. + + Introduced in, [Generating Representative Headlines for News Stories + ](https://arxiv.org/abs/2001.09386). Expects multiple cross-attention + target sequences. + + Call args: + query: Query `Tensor` of shape `[B, T, dim]`. + value: Value `Tensor` of shape `[B, A, S, dim]`, where A denotes the + context_attention_weights: Context weights of shape `[B, N, T, A]`, where N + is the number of attention heads. Combines multi-channel sources + context tensors according to the distribution among channels. + key: Optional key `Tensor` of shape `[B, A, S, dim]`. If not given, will use + `value` for both `key` and `value`, which is the most common case. + attention_mask: A boolean mask of shape `[B, T, S]`, that prevents attention + to certain positions. 
+ """ + + def _build_attention(self, rank): + super(MultiChannelAttention, self)._build_attention(rank) + self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[2]) + + def call(self, + query, + value, + key=None, + context_attention_weights=None, + attention_mask=None): + if not self._built_from_signature: + self._build_from_signature(query, value, key=key) + if key is None: + key = value + + # Scalar dimensions referenced here: + # B = batch size (number of stories) + # A = num_docs (number of docs) + # F = target sequence length + # T = source sequence length + # N = `num_attention_heads` + # H = `size_per_head` + # `query_tensor` = [B, F, N ,H] + query_tensor = self._query_dense(query) + + # `key_tensor` = [B, A, T, N, H] + key_tensor = self._key_dense(key) + + # `value_tensor` = [B, A, T, N, H] + value_tensor = self._value_dense(value) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + attention_scores = tf.einsum("BATNH,BFNH->BANFT", key_tensor, query_tensor) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(self._key_dim))) + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, A, N, F, T] + attention_probs = self._masked_softmax(attention_scores, attention_mask) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self._dropout_layer(attention_probs) + + # `context_layer` = [B, F, N, H] + context_layer = tf.einsum("BANFT,BATNH->BAFNH", attention_probs, + value_tensor) + attention_output = tf.einsum("BNFA,BAFNH->BFNH", context_attention_weights, + context_layer) + attention_output = self._output_dense(attention_output) + return attention_output diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/on_device_embedding.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/on_device_embedding.py new file mode 100644 index 000000000..c589f95f4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/on_device_embedding.py @@ -0,0 +1,21 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based one-hot embedding layer.""" +# pylint: disable=g-classes-have-attributes + +import keras_nlp + + +OnDeviceEmbedding = keras_nlp.layers.OnDeviceEmbedding diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/position_embedding.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/position_embedding.py new file mode 100644 index 000000000..5ff97c497 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/position_embedding.py @@ -0,0 +1,237 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Keras-based positional embedding layer."""
+# pylint: disable=g-classes-have-attributes
+import math
+from typing import Optional
+
+import tensorflow as tf
+
+from modeling import tf_utils
+
+Initializer = tf.keras.initializers.Initializer
+
+
+@tf.keras.utils.register_keras_serializable(package="Text")
+class RelativePositionEmbedding(tf.keras.layers.Layer):
+  """Creates a positional embedding.
+
+  This layer calculates the position encoding as a mix of sine and cosine
+  functions with geometrically increasing wavelengths. Defined and formalized in
+  "Attention is All You Need", section 3.5.
+  (https://arxiv.org/abs/1706.03762).
+
+  Args:
+    hidden_size: Size of the hidden layer.
+    min_timescale: Minimum scale that will be applied at each position.
+    max_timescale: Maximum scale that will be applied at each position.
+  """
+
+  def __init__(self,
+               hidden_size: int,
+               min_timescale: float = 1.0,
+               max_timescale: float = 1.0e4,
+               **kwargs):
+    # We need to have a default dtype of float32, since the inputs (which Keras
+    # usually uses to infer the dtype) will always be int32.
+    # We compute the positional encoding in float32 even if the model uses
+    # float16, as many of the ops used, like log and exp, are numerically
+    # unstable in float16.
+    if "dtype" not in kwargs:
+      kwargs["dtype"] = "float32"
+
+    super().__init__(**kwargs)
+    self._hidden_size = hidden_size
+    self._min_timescale = min_timescale
+    self._max_timescale = max_timescale
+
+  def get_config(self):
+    config = {
+        "hidden_size": self._hidden_size,
+        "min_timescale": self._min_timescale,
+        "max_timescale": self._max_timescale,
+    }
+    base_config = super(RelativePositionEmbedding, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+  def call(self, inputs, length=None):
+    """Implements call() for the layer.
+
+    Args:
+      inputs: A tensor whose second dimension will be used as `length`. If
+        `None`, the other `length` argument must be specified.
+      length: An optional integer specifying the number of positions. If both
+        `inputs` and `length` are specified, `length` must be equal to the second
+        dimension of `inputs`.
+
+    Returns:
+      A tensor in shape of `(length, hidden_size)`.
+ """ + if inputs is None and length is None: + raise ValueError("If inputs is None, `length` must be set in " + "RelativePositionEmbedding().") + if inputs is not None: + input_shape = tf_utils.get_shape_list(inputs) + if length is not None and length != input_shape[1]: + raise ValueError( + "If inputs is not None, `length` must equal to input_shape[1].") + length = input_shape[1] + position = tf.cast(tf.range(length), tf.float32) + num_timescales = self._hidden_size // 2 + min_timescale, max_timescale = self._min_timescale, self._max_timescale + log_timescale_increment = ( + math.log(float(max_timescale) / float(min_timescale)) / + (tf.cast(num_timescales, tf.float32) - 1)) + inv_timescales = min_timescale * tf.exp( + tf.cast(tf.range(num_timescales), tf.float32) * + -log_timescale_increment) + scaled_time = tf.expand_dims(position, 1) * tf.expand_dims( + inv_timescales, 0) + position_embeddings = tf.concat( + [tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) + return position_embeddings + + +def _relative_position_bucket(relative_position, + bidirectional=True, + num_buckets=32, + max_distance=128): + """Translate relative position to a bucket number for relative attention. + + The relative position is defined as memory_position - query_position, i.e. + the distance in tokens from the attending position to the attended-to + position. + + If `bidirectional=False`, then positive relative positions are invalid. + + We use smaller buckets for small absolute relative_position and larger + buckets for larger absolute relative_positions. + + All relative positions >=max_distance map to the same bucket. + + All relative positions <=-max_distance map to the same bucket. + + This should allow for more graceful generalization to longer sequences + than the model has been trained on. + + Args: + relative_position: An int32 Tensor + bidirectional: A boolean - whether the attention is bidirectional + num_buckets: An integer + max_distance: An integer + + Returns: + A Tensor with the same shape as relative_position, containing int32 + values in the range [0, num_buckets) + """ + ret = 0 + n = -relative_position + if bidirectional: + num_buckets //= 2 + ret += tf.cast(tf.math.less(n, 0), tf.int32) * num_buckets + n = tf.math.abs(n) + else: + n = tf.math.maximum(n, 0) + # now n is in the range [0, inf) + max_exact = num_buckets // 2 + is_small = tf.math.less(n, max_exact) + val_if_large = max_exact + tf.dtypes.cast( + tf.math.log(tf.cast(n, tf.float32) / max_exact) / + math.log(max_distance / max_exact) * (num_buckets - max_exact), + tf.int32, + ) + val_if_large = tf.math.minimum(val_if_large, num_buckets - 1) + ret += tf.where(is_small, n, val_if_large) + return ret + + +@tf.keras.utils.register_keras_serializable(package="Text") +class RelativePositionBias(tf.keras.layers.Layer): + """Relative position embedding via per-head bias in T5 style. 
+ + Reference implementation in MeshTF: + https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L1000 + + This layer implements the relative position bias used in "Exploring the Limits + of Transfer Learning with a Unified Text-to-Text Transformer" + (https://arxiv.org/abs/1910.10683) + """ + + def __init__(self, + num_heads: int, + relative_attention_num_buckets: int = 32, + relative_attention_max_distance: int = 128, + bidirectional: bool = True, + embeddings_initializer: Optional[Initializer] = None, + **kwargs): + super().__init__(**kwargs) + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.bidirectional = bidirectional + self.relative_attention_max_distance = relative_attention_max_distance + if embeddings_initializer: + self._embed_init = embeddings_initializer + else: + self._embed_init = tf.keras.initializers.TruncatedNormal(stddev=1.0) + with tf.name_scope(self.name): + self._relative_attention_bias = self.add_weight( + "rel_embedding", + shape=[self.relative_attention_num_buckets, self.num_heads], + initializer=self._embed_init, + dtype=self.dtype, + trainable=True) + + def get_config(self): + config = { + "num_heads": + self.num_heads, + "relative_attention_num_buckets": + self.relative_attention_num_buckets, + "relative_attention_max_distance": + self.relative_attention_max_distance, + "bidirectional": + self.bidirectional, + "embeddings_initializer": + tf.keras.initializers.serialize(self._embed_init), + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, query: tf.Tensor, key: tf.Tensor): + """Implements the forward pass. + + Args: + query: query input tensor shape [batch, query length, hidden size]. + key: key input tensor shape [batch, key length, hidden size]. + + Returns: + A tensor in shape of [batch, heads, query length, key length]. + """ + batch_size, qlen = tf_utils.get_shape_list(query)[:2] + klen = tf_utils.get_shape_list(key)[1] + context_position = tf.range(qlen)[:, None] + memory_position = tf.range(klen)[None, :] + relative_position = memory_position - context_position + rp_bucket = _relative_position_bucket( + relative_position, + bidirectional=self.bidirectional, + num_buckets=self.relative_attention_num_buckets, + max_distance=self.relative_attention_max_distance) + values = tf.nn.embedding_lookup(self._relative_attention_bias, rp_bucket) + values = tf.expand_dims( + tf.transpose(values, [2, 0, 1]), + axis=0) # shape (1, num_heads, qlen, klen) + values = tf.tile(values, [batch_size, 1, 1, 1]) + return values diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/relative_attention.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/relative_attention.py new file mode 100644 index 000000000..be18c9d1e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/relative_attention.py @@ -0,0 +1,499 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based relative attention layers.""" +import math +import string +import tensorflow as tf + +_CHR_IDX = string.ascii_lowercase + + +def _build_proj_equation(free_dims, bound_dims, output_dims): + """Builds an einsum equation for projections inside multi-head attention.""" + input_str = "" + kernel_str = "" + output_str = "" + bias_axes = "" + letter_offset = 0 + for i in range(free_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + output_str += char + + letter_offset += free_dims + for i in range(bound_dims): + char = _CHR_IDX[i + letter_offset] + input_str += char + kernel_str += char + + letter_offset += bound_dims + for i in range(output_dims): + char = _CHR_IDX[i + letter_offset] + kernel_str += char + output_str += char + bias_axes += char + equation = "%s,%s->%s" % (input_str, kernel_str, output_str) + + return equation, bias_axes, len(output_str) + + +def _get_output_shape(output_rank, known_last_dims): + return [None] * (output_rank - len(known_last_dims)) + list(known_last_dims) + + +def _rel_shift(x, klen=-1): + """Performs relative shift to form the relative attention score.""" + + x = tf.transpose(x, perm=[2, 3, 0, 1]) + x_size = tf.shape(x) + + x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]]) + x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) + x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]]) + x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1]) + + x = tf.transpose(x, perm=[2, 3, 0, 1]) + + return x + + +@tf.keras.utils.register_keras_serializable(package="Text") +class MultiHeadRelativeAttention(tf.keras.layers.MultiHeadAttention): + """A multi-head attention layer with relative attention + position encoding. + + This layer shares the same input/output projections as the common + `tf.keras.layers.MultiHeadAttention` layer. + + When it calculates attention logits, position encoding is projected to form + relative keys. The logits are composed by shifted relative logits and content + logits. + + **Note: This layer is currently experimental. + + Attributes: + kernel_initializer: The kernel initializer. Defaults to variance_scaling. + + Call args: + query: Query `Tensor` of shape `[B, T, dim]`. + value: Value `Tensor` of shape `[B, S, dim]`. + content_attention_bias: Bias `Tensor` for content based attention of shape + `[num_heads, dim]`. + positional_attention_bias: Bias `Tensor` for position based attention of + shape `[num_heads, dim]`. + key: Optional key `Tensor` of shape `[B, S, dim]`. If not given, will use + `value` for both `key` and `value`, which is the most common case. + relative_position_encoding: Relative positional encoding `Tensor` of shape + `[B, L, dim]`. + segment_matrix: Optional `Tensor` representing segmentation IDs used in + XLNet of shape `[B, S, S + M]`. + segment_encoding: Optional `Tensor` representing the segmentation + encoding as used in XLNet of shape `[2, num_heads, dim]`. + segment_attention_bias: Optional trainable bias parameter added to the + query had when calculating the segment-based attention score used in + XLNet of shape `[num_heads, dim]`. + state: Optional `Tensor` of shape `[B, M, E]` where M is the length of the + state or memory. + If passed, this is also attended over as in Transformer XL. + attention_mask: A boolean mask of shape `[B, T, S]` that prevents attention + to certain positions. 
+ """ + + def __init__(self, + kernel_initializer="variance_scaling", + **kwargs): + super().__init__(kernel_initializer=kernel_initializer, + **kwargs) + + def _build_from_signature(self, query, value, key=None): + super(MultiHeadRelativeAttention, self)._build_from_signature( + query=query, + value=value, + key=key) + if hasattr(value, "shape"): + value_shape = tf.TensorShape(value.shape) + else: + value_shape = value + if key is None: + key_shape = value_shape + elif hasattr(key, "shape"): + key_shape = tf.TensorShape(key.shape) + else: + key_shape = key + + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + + with tf.init_scope(): + einsum_equation, _, output_rank = _build_proj_equation( + key_shape.rank - 1, bound_dims=1, output_dims=2) + self._encoding_dense = tf.keras.layers.experimental.EinsumDense( + einsum_equation, + output_shape=_get_output_shape(output_rank - 1, + [self._num_heads, self._key_dim]), + bias_axes=None, + name="encoding", + **common_kwargs) + + def compute_attention(self, + query, + key, + value, + position, + content_attention_bias, + positional_attention_bias, + segment_matrix=None, + segment_encoding=None, + segment_attention_bias=None, + attention_mask=None): + """Computes the attention. + + This function defines the computation inside `call` with projected + multihead Q, K, V, R inputs. + + Args: + query: Projected query `Tensor` of shape `[B, T, N, key_dim]`. + key: Projected key `Tensor` of shape `[B, S + M, N, key_dim]`. + value: Projected value `Tensor` of shape `[B, S + M, N, key_dim]`. + position: Projected position `Tensor` of shape `[B, L, N, key_dim]`. + content_attention_bias: Trainable bias parameter added to the query head + when calculating the content-based attention score. + positional_attention_bias: Trainable bias parameter added to the query + head when calculating the position-based attention score. + segment_matrix: Optional `Tensor` representing segmentation IDs used in + XLNet. + segment_encoding: Optional trainable `Tensor` representing the + segmentation encoding as used in XLNet. + segment_attention_bias: Optional trainable bias parameter added to the + query had when calculating the segment-based attention score used in + XLNet. + attention_mask: (default None) Optional mask that is added to attention + logits. If state is not None, the mask source sequence dimension should + extend M. + + Returns: + attention_output: Multi-headed output of attention computation of shape + `[B, S, N, key_dim]`. 
+ + """ + content_attention = tf.einsum(self._dot_product_equation, + key, + query + content_attention_bias) + positional_attention = tf.einsum(self._dot_product_equation, + position, + query + positional_attention_bias) + positional_attention = _rel_shift( + positional_attention, klen=tf.shape(content_attention)[3]) + + if segment_matrix is not None: + segment_attention = tf.einsum("bind,snd->bnis", + query + segment_attention_bias, + segment_encoding) + target_shape = tf.shape(positional_attention) + segment_attention = tf.where( + tf.broadcast_to(tf.expand_dims(segment_matrix, 1), target_shape), + tf.broadcast_to(segment_attention[:, :, :, 1:], target_shape), + tf.broadcast_to(segment_attention[:, :, :, :1], target_shape)) + attention_sum = ( + content_attention + positional_attention + segment_attention) + else: + attention_sum = content_attention + positional_attention + + attention_scores = tf.multiply( + attention_sum, 1.0 / math.sqrt(float(self._key_dim))) + + attention_scores = self._masked_softmax(attention_scores, attention_mask) + + attention_output = self._dropout_layer(attention_scores) + + attention_output = tf.einsum(self._combine_equation, + attention_output, + value) + return attention_output + + def call(self, + query, + value, + content_attention_bias, + positional_attention_bias, + key=None, + relative_position_encoding=None, + segment_matrix=None, + segment_encoding=None, + segment_attention_bias=None, + state=None, + attention_mask=None): + """Compute multi-head relative attention over inputs. + + Size glossary: + * Number of heads (H): the number of attention heads. + * Value size (V): the size of each value embedding per head. + * Key size (K): the size of each key embedding per head. Equally, the size + of each query embedding per head. Typically K <= V. + * Batch dimensions (B). + * Query (target) attention axes shape (T). + * Value (source) attention axes shape (S), the rank must match the target. + * Encoding length (L): The relative positional encoding length. + + Args: + query: attention input. + value: attention input. + content_attention_bias: A trainable bias parameter added to the query + head when calculating the content-based attention score. + positional_attention_bias: A trainable bias parameter added to the query + head when calculating the position-based attention score. + key: attention input. + relative_position_encoding: relative positional encoding for key and + value. + segment_matrix: Optional `Tensor` representing segmentation IDs used in + XLNet. + segment_encoding: Optional `Tensor` representing the segmentation + encoding as used in XLNet. + segment_attention_bias: Optional trainable bias parameter added to the + query had when calculating the segment-based attention score used in + XLNet. + state: (default None) optional state. If passed, this is also attended + over as in TransformerXL. + attention_mask: (default None) Optional mask that is added to attention + logits. If state is not None, the mask source sequence dimension should + extend M. + + Returns: + attention_output: The result of the computation, of shape [B, T, E], + where `T` is for target sequence shapes and `E` is the query input last + dimension if `output_shape` is `None`. Otherwise, the multi-head outputs + are projected to the shape specified by `output_shape`. 
+ """ + if not self._built_from_signature: + self._build_from_signature(query, value, key=key) + if key is None: + key = value + if state is not None and state.shape.ndims > 1: + value = tf.concat([state, value], 1) + key = tf.concat([state, key], 1) + + # `query` = [B, T, N ,H] + query = self._query_dense(query) + + # `key` = [B, S + M, N, H] + key = self._key_dense(key) + + # `value` = [B, S + M, N, H] + value = self._value_dense(value) + + # `position` = [B, L, N, H] + position = self._encoding_dense(relative_position_encoding) + + attention_output = self.compute_attention( + query=query, + key=key, + value=value, + position=position, + content_attention_bias=content_attention_bias, + positional_attention_bias=positional_attention_bias, + segment_matrix=segment_matrix, + segment_encoding=segment_encoding, + segment_attention_bias=segment_attention_bias, + attention_mask=attention_mask) + + # `attention_output` = [B, S, N, H] + attention_output = self._output_dense(attention_output) + + return attention_output + + +@tf.keras.utils.register_keras_serializable(package="Text") +class TwoStreamRelativeAttention(MultiHeadRelativeAttention): + """Two-stream relative self-attention for XLNet. + + In XLNet, each token has two associated vectors at each self-attention layer, + the content stream (h) and the query stream (g). + + The content stream is the self-attention stream as in Transformer XL and + represents the context and content (the token itself). + + The query stream only has access to contextual information and the position, + but not the content. + + This layer shares the same build signature as + `tf.keras.layers.MultiHeadAttention` but has different input/output + projections. + + **Note: This layer is currently experimental. + + Call args: + content_stream: `Tensor` of shape `[B, T, dim]`. + content_attention_bias: Bias `Tensor` for content based attention of shape + `[num_heads, dim]`. + positional_attention_bias: Bias `Tensor` for position based attention of + shape `[num_heads, dim]`. + query_stream: `Tensor` of shape `[B, P, dim]`. + target_mapping: `Tensor` of shape `[B, P, S]`. + relative_position_encoding: Relative positional encoding `Tensor` of shape + `[B, L, dim]`. + segment_matrix: Optional `Tensor` representing segmentation IDs used in + XLNet of shape `[B, S, S + M]`. + segment_encoding: Optional `Tensor` representing the segmentation + encoding as used in XLNet of shape `[2, num_heads, dim]`. + segment_attention_bias: Optional trainable bias parameter added to the + query had when calculating the segment-based attention score used in + XLNet of shape `[num_heads, dim]`. + state: Optional `Tensor` of shape [B, M, E] where M is the length of the + state or memory. + If passed, this is also attended over as in Transformer XL. + content_attention_mask: a boolean mask of shape `[B, T, S]` that + prevents attention to certain positions for content attention computation. + query_attention_mask: a boolean mask of shape `[B, T, S]` that + prevents attention to certain position for query attention computation. + """ + + def call(self, + content_stream, + content_attention_bias, + positional_attention_bias, + query_stream, + relative_position_encoding, + target_mapping=None, + segment_matrix=None, + segment_encoding=None, + segment_attention_bias=None, + state=None, + content_attention_mask=None, + query_attention_mask=None): + """Compute multi-head relative attention over inputs. + + Size glossary: + * Number of heads (H): the number of attention heads. 
+ * Value size (V): the size of each value embedding per head. + * Key size (K): the size of each key embedding per head. Equally, the size + of each query embedding per head. Typically K <= V. + * Number of predictions (P): the number of predictions. + * Batch dimensions (B). + * Query (target) attention axes shape (T). + * Value (source) attention axes shape (S), the rank must match the target. + * Encoding length (L): The relative positional encoding length. + + Args: + content_stream: The content representation, commonly referred to as h. + This serves a similar role to the standard hidden states in + Transformer-XL. + content_attention_bias: A trainable bias parameter added to the query + head when calculating the content-based attention score. + positional_attention_bias: A trainable bias parameter added to the query + head when calculating the position-based attention score. + query_stream: The query representation, commonly referred to as g. + This only has access to contextual information and position, but not + content. If not provided, then this is MultiHeadRelativeAttention with + self-attention. + relative_position_encoding: relative positional encoding for key and + value. + target_mapping: Optional `Tensor` representing the target mapping used + in partial prediction. + segment_matrix: Optional `Tensor` representing segmentation IDs used in + XLNet. + segment_encoding: Optional `Tensor` representing the segmentation + encoding as used in XLNet. + segment_attention_bias: Optional trainable bias parameter added to the + query head when calculating the segment-based attention score. + state: (default None) optional state. If passed, this is also attended + over as in TransformerXL and XLNet. + content_attention_mask: (default None) Optional mask that is added to + content attention logits. If state is not None, the mask source sequence + dimension should extend M. + query_attention_mask: (default None) Optional mask that is added to + query attention logits. If state is not None, the mask source sequence + dimension should extend M. + + Returns: + content_attention_output, query_attention_output: the results of the + computation, both of shape [B, T, E]. `T` is for target sequence shapes, + `E` is the query input last dimension if `output_shape` is `None`. + Otherwise, the multi-head outputs are projected to the shape specified + by `output_shape`. 
+ """ + if not self._built_from_signature: + self._build_from_signature(content_stream, content_stream, content_stream) + if state is not None and state.shape.ndims > 1: + content_and_memory_stream = tf.concat([state, content_stream], 1) + else: + content_and_memory_stream = content_stream + + # `query` = [B, T, N, H] + query = self._query_dense(content_stream) + + # `key` = [B, S + M, N, H] + key = self._key_dense(content_and_memory_stream) + + # `value` = [B, S + M, N, H] + value = self._value_dense(content_and_memory_stream) + + # `position` = [B, L, N, H] + position = self._encoding_dense(relative_position_encoding) + + content_attention_output = self.compute_attention( + query=query, + key=key, + value=value, + position=position, + content_attention_bias=content_attention_bias, + positional_attention_bias=positional_attention_bias, + segment_matrix=segment_matrix, + segment_encoding=segment_encoding, + segment_attention_bias=segment_attention_bias, + attention_mask=content_attention_mask) + + # `content_attention_output` = [B, S, N, H] + content_attention_output = self._output_dense(content_attention_output) + + query_attention_output = None + if query_stream is not None: + query = self._query_dense(query_stream) + if target_mapping is not None: + query = tf.einsum("bmnd,bml->blnd", query, target_mapping) + query_attention_output = self.compute_attention( + query=query, + key=key, + value=value, + position=position, + content_attention_bias=content_attention_bias, + positional_attention_bias=positional_attention_bias, + segment_matrix=segment_matrix, + segment_encoding=segment_encoding, + segment_attention_bias=segment_attention_bias, + attention_mask=query_attention_mask) + query_attention_output = tf.einsum("blnd,bml->bmnd", + query_attention_output, + target_mapping) + else: + query_attention_output = self.compute_attention( + query=query, + key=key, + value=value, + position=position, + content_attention_bias=content_attention_bias, + positional_attention_bias=positional_attention_bias, + segment_matrix=segment_matrix, + segment_encoding=segment_encoding, + segment_attention_bias=segment_attention_bias, + attention_mask=query_attention_mask) + query_attention_output = self._output_dense(query_attention_output) + + return content_attention_output, query_attention_output + diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/rezero_transformer.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/rezero_transformer.py new file mode 100644 index 000000000..0bcc78189 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/rezero_transformer.py @@ -0,0 +1,233 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Keras-based rezero-transformer block layer (Transformer with ReZero).""" +# pylint: disable=g-classes-have-attributes + +import gin +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package="Text") +@gin.configurable +class ReZeroTransformer(tf.keras.layers.Layer): + """Transformer layer with ReZero. + + This layer implements the Transformer from "Attention Is All You Need". + (https://arxiv.org/abs/1706.03762). + The residual connection implements the ReZero method. + (https://arxiv.org/abs/2003.04887) + + Args: + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate layer. + intermediate_activation: Activation for the intermediate layer. + dropout_rate: Dropout probability for the post-attention and output dropout. + attention_dropout_rate: Dropout probability for within the attention layer. + output_range: the sequence output range, [0, output_range) by slicing the + target sequence. `None` means the target sequence is not sliced. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + use_layer_norm: If add layer_norm on top of the ReZero. + """ + + def __init__(self, + num_attention_heads, + intermediate_size, + intermediate_activation, + dropout_rate=0.0, + attention_dropout_rate=0.0, + output_range=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_layer_norm=False, + **kwargs): + super(ReZeroTransformer, self).__init__(**kwargs) + + self._num_heads = num_attention_heads + self._intermediate_size = intermediate_size + self._intermediate_activation = intermediate_activation + self._attention_dropout_rate = attention_dropout_rate + self._dropout_rate = dropout_rate + self._output_range = output_range + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._use_layer_norm = use_layer_norm + + def build(self, input_shape): + input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape + input_tensor_shape = tf.TensorShape(input_tensor) + if len(input_tensor_shape.as_list()) != 3: + raise ValueError("TransformerLayer expects a three-dimensional input of " + "shape [batch, sequence, width].") + batch_size, sequence_length, hidden_size = input_tensor_shape + + if len(input_shape) == 2: + mask_tensor_shape = tf.TensorShape(input_shape[1]) + expected_mask_tensor_shape = tf.TensorShape( + [batch_size, sequence_length, sequence_length]) + if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape): + raise ValueError("When passing a mask tensor to TransformerLayer, the " + "mask tensor must be of shape [batch, " + "sequence_length, sequence_length] (here %s). Got a " + "mask tensor of shape %s." 
% + (expected_mask_tensor_shape, mask_tensor_shape)) + if hidden_size % self._num_heads != 0: + raise ValueError( + "The input size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self._num_heads)) + self._attention_head_size = int(hidden_size // self._num_heads) + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + self._attention_layer = tf.keras.layers.MultiHeadAttention( + num_heads=self._num_heads, + key_dim=self._attention_head_size, + dropout=self._attention_dropout_rate, + name="self_attention", + **common_kwargs) + self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + if self._use_layer_norm: + # Use float32 in layernorm for numeric stability. + # It is probably safe in mixed_float16, but we haven't validated this yet. + self._attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=1e-12, + dtype=tf.float32)) + self._intermediate_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, self._intermediate_size), + bias_axes="d", + name="intermediate", + **common_kwargs) + policy = tf.keras.mixed_precision.global_policy() + if policy.name == "mixed_bfloat16": + # bfloat16 causes BERT with the LAMB optimizer to not converge + # as well, so we use float32. + # TODO(b/154538392): Investigate this. + policy = tf.float32 + self._intermediate_activation_layer = tf.keras.layers.Activation( + self._intermediate_activation, dtype=policy) + self._output_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, hidden_size), + bias_axes="d", + name="output", + **common_kwargs) + self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + if self._use_layer_norm: + # Use float32 in layernorm for numeric stability. 
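+ # This norm sits on the feed-forward branch; in call() it is applied after
+ # the ReZero residual add, mirroring self_attention_layer_norm above.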
+ self._output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32) + + self._rezero_a = self.add_weight( + name="rezero_alpha", + initializer=tf.keras.initializers.Zeros(), + trainable=True, + dtype=tf.float32) + + super(ReZeroTransformer, self).build(input_shape) + + def get_config(self): + config = { + "num_attention_heads": + self._num_heads, + "intermediate_size": + self._intermediate_size, + "intermediate_activation": + self._intermediate_activation, + "dropout_rate": + self._dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "output_range": + self._output_range, + "use_layer_norm": + self._use_layer_norm, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + } + base_config = super(ReZeroTransformer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def reset_rezero(self): + self._rezero_a.assign(0.) + + def call(self, inputs): + if isinstance(inputs, (list, tuple)) and len(inputs) == 2: + input_tensor, attention_mask = inputs + else: + input_tensor, attention_mask = (inputs, None) + + if self._output_range: + target_tensor = input_tensor[:, 0:self._output_range, :] + attention_mask = attention_mask[:, 0:self._output_range, :] + else: + target_tensor = input_tensor + + attention_output = self._attention_layer( + query=target_tensor, value=input_tensor, attention_mask=attention_mask) + attention_output = self._attention_dropout(attention_output) + attention_output = target_tensor + self._rezero_a * attention_output + if self._use_layer_norm: + attention_output = self._attention_layer_norm(attention_output) + else: + attention_output = tf.cast(attention_output, tf.float32) + + intermediate_output = self._intermediate_dense(attention_output) + intermediate_output = self._intermediate_activation_layer( + intermediate_output) + layer_output = self._output_dense(intermediate_output) + layer_output = self._output_dropout(layer_output) + # During mixed precision training, attention_output is from layer norm and + # is always fp32 for now. Cast layer_output to fp32 for the subsequent add. + layer_output = attention_output + tf.cast(self._rezero_a * layer_output, + tf.float32) + if self._use_layer_norm: + layer_output = self._output_layer_norm(layer_output) + + return layer_output diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/self_attention_mask.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/self_attention_mask.py new file mode 100644 index 000000000..7f42692ea --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/self_attention_mask.py @@ -0,0 +1,39 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras layer that creates a self-attention mask.""" + +import tensorflow as tf + +from keras_nlp import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class SelfAttentionMask(layers.SelfAttentionMask): + """Creates 3D attention mask from a 2D tensor mask. + + **Warning: Please use the `keras_nlp.layers.SelfAttentionMask`.** + inputs[0]: from_tensor: 2D or 3D Tensor of shape + `(batch_size, from_seq_length, ...)`. + inputs[1]: to_mask: int32 Tensor of shape `(batch_size, to_seq_length)`. + + Returns: + Float Tensor of shape `(batch_size, from_seq_length, to_seq_length)`. + """ + + def call(self, inputs): + if isinstance(inputs, list): + return super().call(inputs[0], inputs[1]) + else: + return super().call(inputs) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/spectral_normalization.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/spectral_normalization.py new file mode 100644 index 000000000..dbc851b40 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/spectral_normalization.py @@ -0,0 +1,295 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Normalization layers. + +## References: + +[1] Yuichi Yoshida, Takeru Miyato. Spectral Norm Regularization for Improving + the Generalizability of Deep Learning. + _arXiv preprint arXiv:1705.10941_, 2017. https://arxiv.org/abs/1705.10941 + +[2] Takeru Miyato, Toshiki Kataoka, Masanori Koyama, Yuichi Yoshida. + Spectral normalization for generative adversarial networks. + In _International Conference on Learning Representations_, 2018. + +[3] Henry Gouk, Eibe Frank, Bernhard Pfahringer, Michael Cree. + Regularisation of neural networks by enforcing lipschitz continuity. + _arXiv preprint arXiv:1804.04368_, 2018. https://arxiv.org/abs/1804.04368 +""" + +import numpy as np +import tensorflow as tf + + +class SpectralNormalization(tf.keras.layers.Wrapper): + """Implements spectral normalization for Dense layer.""" + + def __init__(self, + layer, + iteration=1, + norm_multiplier=0.95, + training=True, + aggregation=tf.VariableAggregation.MEAN, + inhere_layer_name=False, + **kwargs): + """Initializer. + + Args: + layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to. + iteration: (int) The number of power iteration to perform to estimate + weight matrix's singular value. + norm_multiplier: (float) Multiplicative constant to threshold the + normalization. Usually under normalization, the singular value will + converge to this value. 
+ training: (bool) Whether to perform power iteration to update the singular + value estimate. + aggregation: (tf.VariableAggregation) Indicates how a distributed variable + will be aggregated. Accepted values are constants defined in the class + tf.VariableAggregation. + inhere_layer_name: (bool) Whether to inhere the name of the input layer. + **kwargs: (dict) Other keyword arguments for the layers.Wrapper class. + """ + self.iteration = iteration + self.do_power_iteration = training + self.aggregation = aggregation + self.norm_multiplier = norm_multiplier + + # Set layer name. + wrapper_name = kwargs.pop('name', None) + if inhere_layer_name: + wrapper_name = layer.name + + if not isinstance(layer, tf.keras.layers.Layer): + raise ValueError('`layer` must be a `tf.keras.layer.Layer`. ' + 'Observed `{}`'.format(layer)) + super(SpectralNormalization, self).__init__( + layer, name=wrapper_name, **kwargs) + + def build(self, input_shape): + super(SpectralNormalization, self).build(input_shape) + self.layer.kernel._aggregation = self.aggregation # pylint: disable=protected-access + self._dtype = self.layer.kernel.dtype + + self.w = self.layer.kernel + self.w_shape = self.w.shape.as_list() + self.uv_initializer = tf.initializers.random_normal() + + self.v = self.add_weight( + shape=(1, np.prod(self.w_shape[:-1])), + initializer=self.uv_initializer, + trainable=False, + name='v', + dtype=self.dtype, + aggregation=self.aggregation) + + self.u = self.add_weight( + shape=(1, self.w_shape[-1]), + initializer=self.uv_initializer, + trainable=False, + name='u', + dtype=self.dtype, + aggregation=self.aggregation) + + self.update_weights() + + def call(self, inputs, *, training=None): + training = self.do_power_iteration if training is None else training + u_update_op, v_update_op, w_update_op = self.update_weights( + training=training) + output = self.layer(inputs) + w_restore_op = self.restore_weights() + + # Register update ops. + self.add_update(u_update_op) + self.add_update(v_update_op) + self.add_update(w_update_op) + self.add_update(w_restore_op) + + return output + + def update_weights(self, *, training=True): + w_reshaped = tf.reshape(self.w, [-1, self.w_shape[-1]]) + + u_hat = self.u + v_hat = self.v + + if training: + for _ in range(self.iteration): + v_hat = tf.nn.l2_normalize(tf.matmul(u_hat, tf.transpose(w_reshaped))) + u_hat = tf.nn.l2_normalize(tf.matmul(v_hat, w_reshaped)) + + sigma = tf.matmul(tf.matmul(v_hat, w_reshaped), tf.transpose(u_hat)) + # Convert sigma from a 1x1 matrix to a scalar. + sigma = tf.reshape(sigma, []) + u_update_op = self.u.assign(u_hat) + v_update_op = self.v.assign(v_hat) + + # Bound spectral norm to be not larger than self.norm_multiplier. + w_norm = tf.cond((self.norm_multiplier / sigma) < 1, lambda: # pylint:disable=g-long-lambda + (self.norm_multiplier / sigma) * self.w, lambda: self.w) + + w_update_op = self.layer.kernel.assign(w_norm) + return u_update_op, v_update_op, w_update_op + + def restore_weights(self): + """Restores layer weights to maintain gradient update (See Alg 1 of [1]).""" + return self.layer.kernel.assign(self.w) + + +class SpectralNormalizationConv2D(tf.keras.layers.Wrapper): + """Implements spectral normalization for Conv2D layer based on [3].""" + + def __init__(self, + layer, + iteration=1, + norm_multiplier=0.95, + training=True, + aggregation=tf.VariableAggregation.MEAN, + legacy_mode=False, + **kwargs): + """Initializer. + + Args: + layer: (tf.keras.layers.Layer) A TF Keras layer to apply normalization to. 
+ iteration: (int) The number of power iteration to perform to estimate + weight matrix's singular value. + norm_multiplier: (float) Multiplicative constant to threshold the + normalization. Usually under normalization, the singular value will + converge to this value. + training: (bool) Whether to perform power iteration to update the singular + value estimate. + aggregation: (tf.VariableAggregation) Indicates how a distributed variable + will be aggregated. Accepted values are constants defined in the class + tf.VariableAggregation. + legacy_mode: (bool) Whether to use the legacy implementation where the + dimension of the u and v vectors are set to the batch size. It should + not be enabled unless for backward compatibility reasons. + **kwargs: (dict) Other keyword arguments for the layers.Wrapper class. + """ + self.iteration = iteration + self.do_power_iteration = training + self.aggregation = aggregation + self.norm_multiplier = norm_multiplier + self.legacy_mode = legacy_mode + + # Set layer attributes. + layer._name += '_spec_norm' + + if not isinstance(layer, tf.keras.layers.Conv2D): + raise ValueError( + 'layer must be a `tf.keras.layer.Conv2D` instance. You passed: {input}' + .format(input=layer)) + super(SpectralNormalizationConv2D, self).__init__(layer, **kwargs) + + def build(self, input_shape): + self.layer.build(input_shape) + self.layer.kernel._aggregation = self.aggregation # pylint: disable=protected-access + self._dtype = self.layer.kernel.dtype + + # Shape (kernel_size_1, kernel_size_2, in_channel, out_channel). + self.w = self.layer.kernel + self.w_shape = self.w.shape.as_list() + self.strides = self.layer.strides + + # Set the dimensions of u and v vectors. + batch_size = input_shape[0] + uv_dim = batch_size if self.legacy_mode else 1 + + # Resolve shapes. + in_height = input_shape[1] + in_width = input_shape[2] + in_channel = self.w_shape[2] + + out_height = in_height // self.strides[0] + out_width = in_width // self.strides[1] + out_channel = self.w_shape[3] + + self.in_shape = (uv_dim, in_height, in_width, in_channel) + self.out_shape = (uv_dim, out_height, out_width, out_channel) + self.uv_initializer = tf.initializers.random_normal() + + self.v = self.add_weight( + shape=self.in_shape, + initializer=self.uv_initializer, + trainable=False, + name='v', + dtype=self.dtype, + aggregation=self.aggregation) + + self.u = self.add_weight( + shape=self.out_shape, + initializer=self.uv_initializer, + trainable=False, + name='u', + dtype=self.dtype, + aggregation=self.aggregation) + + super(SpectralNormalizationConv2D, self).build() + + def call(self, inputs): + u_update_op, v_update_op, w_update_op = self.update_weights() + output = self.layer(inputs) + w_restore_op = self.restore_weights() + + # Register update ops. + self.add_update(u_update_op) + self.add_update(v_update_op) + self.add_update(w_update_op) + self.add_update(w_restore_op) + + return output + + def update_weights(self): + """Computes power iteration for convolutional filters based on [3].""" + # Initialize u, v vectors. + u_hat = self.u + v_hat = self.v + + if self.do_power_iteration: + for _ in range(self.iteration): + # Updates v. + v_ = tf.nn.conv2d_transpose( + u_hat, + self.w, + output_shape=self.in_shape, + strides=self.strides, + padding='SAME') + v_hat = tf.nn.l2_normalize(tf.reshape(v_, [1, -1])) + v_hat = tf.reshape(v_hat, v_.shape) + + # Updates u. 
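+ # A forward convolution with the same kernel maps the normalized v estimate
+ # back to the output space; normalizing the result gives the next u estimate.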
+ u_ = tf.nn.conv2d(v_hat, self.w, strides=self.strides, padding='SAME') + u_hat = tf.nn.l2_normalize(tf.reshape(u_, [1, -1])) + u_hat = tf.reshape(u_hat, u_.shape) + + v_w_hat = tf.nn.conv2d(v_hat, self.w, strides=self.strides, padding='SAME') + + sigma = tf.matmul(tf.reshape(v_w_hat, [1, -1]), tf.reshape(u_hat, [-1, 1])) + # Convert sigma from a 1x1 matrix to a scalar. + sigma = tf.reshape(sigma, []) + + u_update_op = self.u.assign(u_hat) + v_update_op = self.v.assign(v_hat) + + w_norm = tf.cond((self.norm_multiplier / sigma) < 1, lambda: # pylint:disable=g-long-lambda + (self.norm_multiplier / sigma) * self.w, lambda: self.w) + + w_update_op = self.layer.kernel.assign(w_norm) + + return u_update_op, v_update_op, w_update_op + + def restore_weights(self): + """Restores layer weights to maintain gradient update (See Alg 1 of [1]).""" + return self.layer.kernel.assign(self.w) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/talking_heads_attention.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/talking_heads_attention.py new file mode 100644 index 000000000..bddfacaa8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/talking_heads_attention.py @@ -0,0 +1,155 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Talking Head Attention layer.""" +# pylint: disable=g-classes-have-attributes +import math +import string + +import gin +import tensorflow as tf + +_CHR_IDX = string.ascii_lowercase + + +@tf.keras.utils.register_keras_serializable(package="Text") +@gin.configurable +class TalkingHeadsAttention(tf.keras.layers.MultiHeadAttention): + """Implements Talking-Heads Attention. + + This is an implementation of Talking-Heads Attention based on the paper + Talking-Heads Attention (https://arxiv.org/abs/2003.02436): it enhanced + multi-head attention by including linearprojections across the attention-heads + dimension, immediately before and after the softmax operation. + + See the base class `tf.keras.layers.MultiHeadAttention` for more details. + + Args: + num_heads: Number of attention heads. + key_dim: Size of each attention head for query and key. + value_dim: Size of each attention head for value. + dropout: Dropout probability. + use_bias: Boolean, whether the dense layers use bias vectors/matrices. + output_shape: The expected shape of an output tensor, besides the batch and + sequence dims. If not specified, projects back to the key feature dim. + attention_axes: axes over which the attention is applied. `None` means + attention over all axes, but batch, heads, and features. + return_attention_scores: bool, if `True`, returns the multi-head attention + scores as an additional output argument. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. 
+ activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + """ + + def _build_attention(self, qkv_rank): + """Builds multi-head dot-product attention computations. + + This function overrides base class to create additional linear projection + that will be applied on attention scores before and after softmax. + + Args: + qkv_rank: The rank of query, key, value tensors after projection. + """ + super(TalkingHeadsAttention, self)._build_attention(qkv_rank) + + # Build an equation: + # (, num_heads_a, ...),(num_heads_a, num_heads_b) -> + # (, num_heads_b, ...) + # qkv_ranks has `batch_dims`, `attention_dims`, `num_heads` and `channels`. + num_batch_dims = qkv_rank - len(self._attention_axes) - 2 + + # The shape of attn_scores is: + # (, num_heads, , ) + attn_scores_rank = num_batch_dims + 1 + len(self._attention_axes) * 2 + scores_notation = _CHR_IDX[:attn_scores_rank] + projection_notation = scores_notation[num_batch_dims] + ( + _CHR_IDX[attn_scores_rank]) + projected_scores_notation = scores_notation[:num_batch_dims] + ( + _CHR_IDX[attn_scores_rank] + scores_notation[num_batch_dims + 1:]) + self._talking_heads_equation = "%s,%s->%s" % ( + scores_notation, projection_notation, projected_scores_notation) + + self._pre_softmax_weight = self.add_weight( + "pre_softmax_weight", + shape=(self._num_heads, self._num_heads), + initializer=self._kernel_initializer, + regularizer=self._kernel_regularizer, + constraint=self._kernel_constraint, + dtype=self.dtype, + trainable=True) + self._post_softmax_weight = self.add_weight( + "post_softmax_weight", + shape=(self._num_heads, self._num_heads), + initializer=self._kernel_initializer, + regularizer=self._kernel_regularizer, + constraint=self._kernel_constraint, + dtype=self.dtype, + trainable=True) + + def _compute_attention(self, + query_tensor, + key_tensor, + value_tensor, + attention_mask=None, + training=None): + """Applies Dot-product attention with query, key, value tensors. + + This function overrides base class to apply additional linear projection + on attention scores before and after softmax. + + Args: + query_tensor: Projected query `Tensor` of shape `[B, T, N, key_dim]`. + key_tensor: Projected key `Tensor` of shape `[B, T, N, key_dim]`. + value_tensor: Projected value `Tensor` of shape `[B, T, N, value_dim]`. + attention_mask: a boolean mask of shape `[B, T, S]`, that prevents + attention to certain positions. + training: Python boolean indicating whether the layer should behave in + training mode (adding dropout) or in inference mode (doing nothing). + + Returns: + attention_output: Multi-headed outputs of attention computation. + attention_scores: Multi-headed attention weights. + """ + # Take the dot product between "query" and "key" to get the raw + # attention scores. + attention_scores = tf.einsum(self._dot_product_equation, key_tensor, + query_tensor) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(self._key_dim))) + + # Apply linear projection before softmax + attention_scores = tf.einsum(self._talking_heads_equation, attention_scores, + self._pre_softmax_weight) + + # Normalize the attention scores to probabilities. 
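+ # (The einsum above mixed the scores across heads before the softmax; the
+ # matching post-softmax projection below mixes them again. This cross-head
+ # mixing is the "talking heads" change relative to standard attention.)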
+ # `attention_scores` = [B, N, T, S] + attention_scores = self._masked_softmax(attention_scores, attention_mask) + + # Apply linear projection after softmax + attention_scores = tf.einsum(self._talking_heads_equation, attention_scores, + self._post_softmax_weight) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_scores_dropout = self._dropout_layer( + attention_scores, training=training) + + # `context_layer` = [B, T, N, H] + attention_output = tf.einsum(self._combine_equation, + attention_scores_dropout, value_tensor) + return attention_output, attention_scores diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/text_layers.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/text_layers.py new file mode 100644 index 000000000..df7049e6f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/text_layers.py @@ -0,0 +1,704 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras Layers for BERT-specific preprocessing.""" +from typing import Any, Dict, List, Optional, Union + +from absl import logging +import tensorflow as tf + +try: + import tensorflow_text as text # pylint: disable=g-import-not-at-top +except ImportError: + text = None +except tf.errors.NotFoundError as e: + logging.warn("Encountered error when importing tensorflow_text: %s", e) + text = None + + +def _check_if_tf_text_installed(): + if text is None: + raise ImportError("import tensorflow_text failed, please install " + "'tensorflow-text-nightly'.") + + +def _iterative_vectorized_fair_share(capacity: tf.Tensor, + limit: Union[int, tf.Tensor]): + """Iterative algorithm for max min fairness algorithm. + + Reference: https://en.wikipedia.org/wiki/Max-min_fairness + + The idea is for each example with some number of segments and a limit of + total segment length allowed, we grant each segment a fair share of the + limit. For example, if every segment has the same length, no work to do. + If one segment has below average length, its share will be spilt to others + fairly. In this way, the longest segment will be the shortest among all + potential capacity assignments. + + Args: + capacity: A rank-2 Tensor of #Segments x Batch. + limit: The largest permissible number of tokens in total across one example. + + Returns: + A rank-2 Tensor with new segment capacity assignment such that + the total number of tokens in each example does not exceed the `limit`. + """ + # Firstly, we calculate the lower bound of the capacity assignment. + per_seg_limit = limit // capacity.shape[0] + limit_mask = tf.ones(capacity.shape, dtype=tf.int64) * per_seg_limit + lower_bound = tf.minimum(capacity, limit_mask) + + # This step makes up the capacity that already statisfy the capacity limit. 
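+ # Concretely: any example whose total remaining demand fits into its
+ # leftover budget is granted all of its remaining capacity at once, so the
+ # loop below only needs to arbitrate the contended examples.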
+ remaining_cap_sum = limit - tf.math.reduce_sum(lower_bound, axis=0) + remaining_cap_mat = capacity - lower_bound + new_cap = lower_bound + remaining_cap_mat * tf.cast( + tf.math.reduce_sum(remaining_cap_mat, axis=0) <= remaining_cap_sum, + tf.int64) + + # Process iteratively. This step is O(#segments), see analysis below. + while True: + remaining_limit = limit - tf.math.reduce_sum(new_cap, axis=0) + remaining_cap = capacity - new_cap + masked_remaining_slots = tf.cast(remaining_cap > 0, tf.int64) + remaining_cap_col_slots = tf.reduce_sum(masked_remaining_slots, axis=0) + masked_remaining_limit = tf.cast(remaining_cap_col_slots > 0, + tf.int64) * remaining_limit + # Total remaining segment limit is different for each example. + per_seg_limit = masked_remaining_limit // ( + tf.cast(remaining_cap_col_slots <= 0, tf.int64) + + remaining_cap_col_slots) # +1 to make sure 0/0 = 0 + + # Note that for each step, there is at least one more segment being + # fulfilled or the loop is finished. + # The idea is, if remaining per example limit > smallest among segments, + # the smallest segment ask is fullfilled. Otherwise, all remaining segments + # are truncated, the assignment is finished. + if tf.math.reduce_sum(per_seg_limit) > 0: + remaining_slots_mat = tf.cast(remaining_cap > 0, tf.int64) + new_cap = new_cap + remaining_slots_mat * per_seg_limit + else: + # Leftover assignment of limit that is smaller than #slots. + new_remained_assignment_mask = tf.cast( + (tf.cumsum(masked_remaining_slots, axis=0) <= masked_remaining_limit) + & (masked_remaining_slots > 0), tf.int64) + new_cap = new_cap + new_remained_assignment_mask + break + return new_cap + + +def round_robin_truncate_inputs( + inputs: Union[tf.RaggedTensor, List[tf.RaggedTensor]], + limit: Union[int, tf.Tensor], +) -> Union[tf.RaggedTensor, List[tf.RaggedTensor]]: + """Truncates a list of batched segments to fit a per-example length limit. + + Available space is assigned one token at a time in a round-robin fashion + to the inputs that still need some, until the limit is reached. + (Or equivalently: the longest input is truncated by one token until the total + length of inputs fits the limit.) Examples that fit the limit as passed in + remain unchanged. + + Args: + inputs: A list of rank-2 RaggedTensors. The i-th example is given by + the i-th row in each list element, that is, `inputs[:][i, :]`. + limit: The largest permissible number of tokens in total across one example. + + Returns: + A list of rank-2 RaggedTensors at corresponding indices with the inputs, + in which the rows of each RaggedTensor have been truncated such that + the total number of tokens in each example does not exceed the `limit`. + """ + if not isinstance(inputs, (list, tuple)): + return round_robin_truncate_inputs([inputs], limit)[0] + limit = tf.cast(limit, tf.int64) + if not all(rt.shape.rank == 2 for rt in inputs): + raise ValueError("All inputs must have shape [batch_size, (items)]") + if len(inputs) == 1: + return [_truncate_row_lengths(inputs[0], limit)] + elif len(inputs) == 2: + size_a, size_b = [rt.row_lengths() for rt in inputs] + # Here's a brain-twister: This does round-robin assignment of quota + # to both inputs until the limit is reached. Hint: consider separately + # the cases of zero, one, or two inputs exceeding half the limit. 
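+ # A worked example with assumed numbers (not from the original code): for
+ # limit=10 and row lengths size_a=8, size_b=3, floor_half=5 and ceil_half=5,
+ # so quota_a = min(8, 5 + relu(5 - 3)) = 7 and
+ # quota_b = min(3, 5 + relu(5 - 8)) = 3, which together fill the limit
+ # exactly, matching a one-token-at-a-time round-robin assignment.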
+ floor_half = limit // 2 + ceil_half = limit - floor_half + quota_a = tf.minimum(size_a, ceil_half + tf.nn.relu(floor_half - size_b)) + quota_b = tf.minimum(size_b, floor_half + tf.nn.relu(ceil_half - size_a)) + return [_truncate_row_lengths(inputs[0], quota_a), + _truncate_row_lengths(inputs[1], quota_b)] + else: + # Note that we don't merge with the 2 input case because the full algorithm + # is more expensive. + capacity = tf.stack([rt.row_lengths() for rt in inputs]) # #Segments x B + new_capacity = _iterative_vectorized_fair_share(capacity, limit) + return [ + _truncate_row_lengths(inputs[i], new_capacity[i]) + for i in range(capacity.shape[0]) + ] + + +def _truncate_row_lengths(ragged_tensor: tf.RaggedTensor, + new_lengths: tf.Tensor) -> tf.RaggedTensor: + """Truncates the rows of `ragged_tensor` to the given row lengths.""" + new_lengths = tf.broadcast_to(new_lengths, + ragged_tensor.bounding_shape()[0:1]) + def fn(x): + row, new_length = x + return row[0:new_length] + fn_dtype = tf.RaggedTensorSpec(dtype=ragged_tensor.dtype, + ragged_rank=ragged_tensor.ragged_rank - 1) + result = tf.map_fn(fn, (ragged_tensor, new_lengths), dtype=fn_dtype) + # Work around broken shape propagation: without this, result has unknown rank. + flat_values_shape = [None] * ragged_tensor.flat_values.shape.rank + result = result.with_flat_values( + tf.ensure_shape(result.flat_values, flat_values_shape)) + + return result + + +class BertTokenizer(tf.keras.layers.Layer): + """Wraps BertTokenizer with pre-defined vocab as a Keras Layer. + + Attributes: + tokenize_with_offsets: If true, calls + `text.BertTokenizer.tokenize_with_offsets()` instead of plain + `text.BertTokenizer.tokenize()` and outputs a triple of + `(tokens, start_offsets, limit_offsets)`. + raw_table_access: An object with methods `.lookup(keys) and `.size()` + that operate on the raw lookup table of tokens. It can be used to + look up special token synbols like `[MASK]`. + """ + + def __init__(self, *, + vocab_file: str, + lower_case: bool, + tokenize_with_offsets: bool = False, + **kwargs): + """Initialize a `BertTokenizer` layer. + + Args: + vocab_file: A Python string with the path of the vocabulary file. + This is a text file with newline-separated wordpiece tokens. + This layer initializes a lookup table from it that gets used with + `text.BertTokenizer`. + lower_case: A Python boolean forwarded to `text.BertTokenizer`. + If true, input text is converted to lower case (where applicable) + before tokenization. This must be set to match the way in which + the `vocab_file` was created. + tokenize_with_offsets: A Python boolean. If true, this layer calls + `text.BertTokenizer.tokenize_with_offsets()` instead of plain + `text.BertTokenizer.tokenize()` and outputs a triple of + `(tokens, start_offsets, limit_offsets)` + insead of just tokens. + **kwargs: Standard arguments to `Layer()`. + + Raises: + ImportError: If importing `tensorflow_text` failed. + """ + _check_if_tf_text_installed() + + self.tokenize_with_offsets = tokenize_with_offsets + # TODO(b/177326279): Stop storing the vocab table initializer as an + # attribute when https://github.com/tensorflow/tensorflow/issues/46456 + # has been fixed in the TensorFlow versions of the TF Hub users that load + # a SavedModel created from this layer. 
Due to that issue, loading such a + # SavedModel forgets to add .vocab_table._initializer as a trackable + # dependency of .vocab_table, so that saving it again to a second SavedModel + # (e.g., the final model built using TF Hub) does not properly track + # the ._vocab_table._initializer._filename as an Asset. + self._vocab_table, self._vocab_initializer_donotuse = ( + self._create_vocab_table_and_initializer(vocab_file)) + self._special_tokens_dict = self._create_special_tokens_dict( + self._vocab_table, vocab_file) + super().__init__(**kwargs) + self._bert_tokenizer = text.BertTokenizer( + self._vocab_table, lower_case=lower_case) + + @property + def vocab_size(self): + return self._vocab_table.size() + + def _create_vocab_table_and_initializer(self, vocab_file): + vocab_initializer = tf.lookup.TextFileInitializer( + vocab_file, + key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE, + value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER) + vocab_table = tf.lookup.StaticHashTable(vocab_initializer, default_value=-1) + return vocab_table, vocab_initializer + + def call(self, inputs: tf.Tensor): + """Calls `text.BertTokenizer` on inputs. + + Args: + inputs: A string Tensor of shape `(batch_size,)`. + + Returns: + One or three of `RaggedTensors` if `tokenize_with_offsets` is False or + True, respectively. These are + tokens: A `RaggedTensor` of shape + `[batch_size, (words), (pieces_per_word)]` + and type int32. `tokens[i,j,k]` contains the k-th wordpiece of the + j-th word in the i-th input. + start_offsets, limit_offsets: If `tokenize_with_offsets` is True, + RaggedTensors of type int64 with the same indices as tokens. + Element `[i,j,k]` contains the byte offset at the start, or past the + end, resp., for the k-th wordpiece of the j-th word in the i-th input. + """ + # Prepare to reshape the result to work around broken shape inference. + batch_size = tf.shape(inputs)[0] + def _reshape(rt): + values = rt.values + row_splits = rt.row_splits + row_splits = tf.reshape(row_splits, [batch_size + 1]) + return tf.RaggedTensor.from_row_splits(values, row_splits) + + # Call the tokenizer. + if self.tokenize_with_offsets: + tokens, start_offsets, limit_offsets = ( + self._bert_tokenizer.tokenize_with_offsets(inputs)) + tokens = tf.cast(tokens, dtype=tf.int32) + return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets) + else: + tokens = self._bert_tokenizer.tokenize(inputs) + tokens = tf.cast(tokens, dtype=tf.int32) + return _reshape(tokens) + + def get_config(self): + # Skip in tf.saved_model.save(); fail if called direcly. + raise NotImplementedError("TODO(b/170480226): implement") + + def get_special_tokens_dict(self): + """Returns dict of token ids, keyed by standard names for their purpose. + + Returns: + A dict from Python strings to Python integers. Each key is a standard + name for a special token describing its use. (For example, "padding_id" + is what BERT traditionally calls "[PAD]" but others may call "".) + The corresponding value is the integer token id. If a special token + is not found, its entry is omitted from the dict. 
+ + The supported keys and tokens are: + * start_of_sequence_id: looked up from "[CLS]" + * end_of_segment_id: looked up from "[SEP]" + * padding_id: looked up form "[PAD]" + * mask_id: looked up from "[MASK]" + * vocab_size: one past the largest token id used + """ + return self._special_tokens_dict + + def _create_special_tokens_dict(self, vocab_table, vocab_file): + special_tokens = dict(start_of_sequence_id="[CLS]", + end_of_segment_id="[SEP]", + padding_id="[PAD]", + mask_id="[MASK]") + with tf.init_scope(): + if tf.executing_eagerly(): + special_token_ids = vocab_table.lookup( + tf.constant(list(special_tokens.values()), tf.string)) + vocab_size = vocab_table.size() + else: + # A blast from the past: non-eager init context while building Model. + # This can happen with Estimator or tf.compat.v1.disable_v2_behavior(). + logging.warning( + "Non-eager init context; computing " + "BertTokenizer's special_tokens_dict in tf.compat.v1.Session") + with tf.Graph().as_default(): + local_vocab_table, _ = self._create_vocab_table_and_initializer( + vocab_file) + special_token_ids_tensor = local_vocab_table.lookup( + tf.constant(list(special_tokens.values()), tf.string)) + vocab_size_tensor = local_vocab_table.size() + init_ops = [tf.compat.v1.initialize_all_tables()] + with tf.compat.v1.Session() as sess: + sess.run(init_ops) + special_token_ids, vocab_size = sess.run( + [special_token_ids_tensor, vocab_size_tensor]) + result = dict( + vocab_size=int(vocab_size) # Numpy to Python. + ) + for k, v in zip(special_tokens, special_token_ids): + v = int(v) + if v >= 0: + result[k] = v + else: + logging.warning("Could not find %s as token \"%s\" in vocab file %s", + k, special_tokens[k], vocab_file) + return result + + +class SentencepieceTokenizer(tf.keras.layers.Layer): + """Wraps `tf_text.SentencepieceTokenizer` as a Keras Layer. + + Attributes: + tokenize_with_offsets: If true, calls + `SentencepieceTokenizer.tokenize_with_offsets()` + instead of plain `.tokenize()` and outputs a triple of + `(tokens, start_offsets, limit_offsets)`. + """ + + def __init__(self, + *, + lower_case: bool, + model_file_path: Optional[str] = None, + model_serialized_proto: Optional[str] = None, + tokenize_with_offsets: bool = False, + nbest_size: int = 0, + alpha: float = 1.0, + strip_diacritics: bool = False, + **kwargs): + """Initializes a SentencepieceTokenizer layer. + + Args: + lower_case: A Python boolean indicating whether to lowercase the string + before tokenization. NOTE: New models are encouraged to build `*_cf` + (case folding) normalization into the Sentencepiece model itself and + avoid this extra step. + model_file_path: A Python string with the path of the sentencepiece model. + Exactly one of `model_file_path` and `model_serialized_proto` can be + specified. In either case, the Keras model config for this layer will + store the actual proto (not a filename passed here). + model_serialized_proto: The sentencepiece model serialized proto string. + tokenize_with_offsets: A Python boolean. If true, this layer calls + `SentencepieceTokenizer.tokenize_with_offsets()` instead of + plain `.tokenize()` and outputs a triple of + `(tokens, start_offsets, limit_offsets)` insead of just tokens. + Note that when following `strip_diacritics` is set to True, returning + offsets is not supported now. + nbest_size: A scalar for sampling: + nbest_size = {0,1}: No sampling is performed. (default) + nbest_size > 1: samples from the nbest_size results. 
+ nbest_size < 0: assuming that nbest_size is infinite and samples + from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: A scalar for a smoothing parameter. Inverse temperature for + probability rescaling. + strip_diacritics: Whether to strip diacritics or not. Note that stripping + diacritics requires additional text normalization and dropping bytes, + which makes it impossible to keep track of the offsets now. Hence + when `strip_diacritics` is set to True, we don't yet support + `tokenize_with_offsets`. NOTE: New models are encouraged to put this + into custom normalization rules for the Sentencepiece model itself to + avoid this extra step and the limitation regarding offsets. + **kwargs: standard arguments to `Layer()`. + + Raises: + ImportError: if importing tensorflow_text failed. + """ + _check_if_tf_text_installed() + super().__init__(**kwargs) + if bool(model_file_path) == bool(model_serialized_proto): + raise ValueError("Exact one of `model_file_path` and " + "`model_serialized_proto` can be specified.") + # TODO(b/181866850): Support tokenize_with_offsets for strip_diacritics=True + if tokenize_with_offsets and strip_diacritics: + raise ValueError("`tokenize_with_offsets` is not supported when " + "`strip_diacritics` is set to True.") + if model_file_path: + self._model_serialized_proto = tf.io.gfile.GFile(model_file_path, + "rb").read() + else: + self._model_serialized_proto = model_serialized_proto + + self._lower_case = lower_case + self.tokenize_with_offsets = tokenize_with_offsets + self._nbest_size = nbest_size + self._alpha = alpha + self._strip_diacritics = strip_diacritics + self._tokenizer = self._create_tokenizer() + self._special_tokens_dict = self._create_special_tokens_dict() + + def _create_tokenizer(self): + return text.SentencepieceTokenizer( + model=self._model_serialized_proto, + out_type=tf.int32, + nbest_size=self._nbest_size, + alpha=self._alpha) + + @property + def vocab_size(self): + return self._tokenizer.vocab_size() + + def call(self, inputs: tf.Tensor): + """Calls `text.SentencepieceTokenizer` on inputs. + + Args: + inputs: A string Tensor of shape `(batch_size,)`. + + Returns: + One or three of RaggedTensors if tokenize_with_offsets is False or True, + respectively. These are + tokens: A RaggedTensor of shape `[batch_size, (pieces)]` and type `int32`. + `tokens[i,j]` contains the j-th piece in the i-th input. + start_offsets, limit_offsets: If `tokenize_with_offsets` is True, + RaggedTensors of type `int64` with the same indices as tokens. + Element `[i,j]` contains the byte offset at the start, or past the + end, resp., for the j-th piece in the i-th input. + """ + if self._strip_diacritics: + if self.tokenize_with_offsets: + raise ValueError("`tokenize_with_offsets` is not supported yet when " + "`strip_diacritics` is set to True (b/181866850).") + inputs = text.normalize_utf8(inputs, "NFD") + inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "") + + if self._lower_case: + inputs = text.case_fold_utf8(inputs) + + # Prepare to reshape the result to work around broken shape inference. + batch_size = tf.shape(inputs)[0] + def _reshape(rt): + values = rt.values + row_splits = rt.row_splits + row_splits = tf.reshape(row_splits, [batch_size + 1]) + return tf.RaggedTensor.from_row_splits(values, row_splits) + + # Call the tokenizer. 
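+ # Subword sampling, if enabled, is governed by the `nbest_size` and `alpha`
+ # values that were passed to text.SentencepieceTokenizer at construction.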
+ if self.tokenize_with_offsets: + tokens, start_offsets, limit_offsets = ( + self._tokenizer.tokenize_with_offsets(inputs)) + return _reshape(tokens), _reshape(start_offsets), _reshape(limit_offsets) + else: + tokens = self._tokenizer.tokenize(inputs) + return _reshape(tokens) + + def get_config(self): + # Skip in tf.saved_model.save(); fail if called direcly. + raise NotImplementedError("TODO(b/170480226): implement") + + def get_special_tokens_dict(self): + """Returns dict of token ids, keyed by standard names for their purpose. + + Returns: + A dict from Python strings to Python integers. Each key is a standard + name for a special token describing its use. (For example, "padding_id" + is what Sentencepiece calls "" but others may call "[PAD]".) + The corresponding value is the integer token id. If a special token + is not found, its entry is omitted from the dict. + + The supported keys and tokens are: + * start_of_sequence_id: looked up from "[CLS]" + * end_of_segment_id: looked up from "[SEP]" + * padding_id: looked up from "" + * mask_id: looked up from "[MASK]" + * vocab_size: one past the largest token id used + """ + return self._special_tokens_dict + + def _create_special_tokens_dict(self): + special_tokens = dict( + start_of_sequence_id=b"[CLS]", + end_of_segment_id=b"[SEP]", + padding_id=b"", + mask_id=b"[MASK]") + with tf.init_scope(): + if tf.executing_eagerly(): + special_token_ids = self._tokenizer.string_to_id( + tf.constant(list(special_tokens.values()), tf.string)) + inverse_tokens = self._tokenizer.id_to_string(special_token_ids) + vocab_size = self._tokenizer.vocab_size() + else: + # A blast from the past: non-eager init context while building Model. + # This can happen with Estimator or tf.compat.v1.disable_v2_behavior(). + logging.warning( + "Non-eager init context; computing SentencepieceTokenizer's " + "special_tokens_dict in tf.compat.v1.Session") + with tf.Graph().as_default(): + local_tokenizer = self._create_tokenizer() + special_token_ids_tensor = local_tokenizer.string_to_id( + tf.constant(list(special_tokens.values()), tf.string)) + inverse_tokens_tensor = local_tokenizer.id_to_string( + special_token_ids_tensor) + vocab_size_tensor = local_tokenizer.vocab_size() + with tf.compat.v1.Session() as sess: + special_token_ids, inverse_tokens, vocab_size = sess.run( + [special_token_ids_tensor, inverse_tokens_tensor, + vocab_size_tensor]) + result = dict( + vocab_size=int(vocab_size) # Numpy to Python. + ) + for name, token_id, inverse_token in zip(special_tokens, + special_token_ids, + inverse_tokens): + if special_tokens[name] == inverse_token: + result[name] = int(token_id) + else: + logging.warning( + "Could not find %s as token \"%s\" in sentencepiece model, " + "got \"%s\"", name, special_tokens[name], inverse_token) + return result + + +class BertPackInputs(tf.keras.layers.Layer): + """Packs tokens into model inputs for BERT.""" + + def __init__(self, + seq_length, + *, + start_of_sequence_id=None, + end_of_segment_id=None, + padding_id=None, + special_tokens_dict=None, + truncator="round_robin", + **kwargs): + """Initializes with a target `seq_length`, relevant token ids and truncator. + + Args: + seq_length: The desired output length. Must not exceed the max_seq_length + that was fixed at training time for the BERT model receiving the inputs. + start_of_sequence_id: The numeric id of the token that is to be placed + at the start of each sequence (called "[CLS]" for BERT). 
+ end_of_segment_id: The numeric id of the token that is to be placed
+ at the end of each input segment (called "[SEP]" for BERT).
+ padding_id: The numeric id of the token that is to be placed into the
+ unused positions after the last segment in the sequence
+ (called "[PAD]" for BERT).
+ special_tokens_dict: Optionally, a dict from Python strings to Python
+ integers that contains values for `start_of_sequence_id`,
+ `end_of_segment_id` and `padding_id`. (Further values in the dict are
+ silently ignored.) If this is passed, separate *_id arguments must be
+ omitted.
+ truncator: The algorithm to truncate a list of batched segments to fit a
+ per-example length limit. The value can be either `round_robin` or
+ `waterfall`:
+ (1) For the "round_robin" algorithm, available space is assigned
+ one token at a time in a round-robin fashion to the inputs that still
+ need some, until the limit is reached. It currently only supports
+ one or two segments.
+ (2) For the "waterfall" algorithm, the allocation of the budget is done
+ using a "waterfall" algorithm that allocates quota in a
+ left-to-right manner and fills up the buckets until we run out of
+ budget. It supports an arbitrary number of segments.
+
+ **kwargs: standard arguments to `Layer()`.
+
+ Raises:
+ ImportError: if importing `tensorflow_text` failed.
+ """
+ _check_if_tf_text_installed()
+ super().__init__(**kwargs)
+ self.seq_length = seq_length
+ if truncator not in ("round_robin", "waterfall"):
+ raise ValueError("Only 'round_robin' and 'waterfall' algorithms are "
+ "supported, but got %s" % truncator)
+ self.truncator = truncator
+ self._init_token_ids(
+ start_of_sequence_id=start_of_sequence_id,
+ end_of_segment_id=end_of_segment_id,
+ padding_id=padding_id,
+ special_tokens_dict=special_tokens_dict)
+
+ def _init_token_ids(
+ self, *,
+ start_of_sequence_id,
+ end_of_segment_id,
+ padding_id,
+ special_tokens_dict):
+ usage = ("Must pass either all of start_of_sequence_id, end_of_segment_id, "
+ "padding_id as arguments, or else a special_tokens_dict "
+ "with those keys.")
+ special_tokens_args = [start_of_sequence_id, end_of_segment_id, padding_id]
+ if special_tokens_dict is None:
+ if any(x is None for x in special_tokens_args):
+ raise ValueError(usage)
+ self.start_of_sequence_id = int(start_of_sequence_id)
+ self.end_of_segment_id = int(end_of_segment_id)
+ self.padding_id = int(padding_id)
+ else:
+ if any(x is not None for x in special_tokens_args):
+ raise ValueError(usage)
+ self.start_of_sequence_id = int(
+ special_tokens_dict["start_of_sequence_id"])
+ self.end_of_segment_id = int(special_tokens_dict["end_of_segment_id"])
+ self.padding_id = int(special_tokens_dict["padding_id"])
+
+ def get_config(self) -> Dict[str, Any]:
+ config = super().get_config()
+ config["seq_length"] = self.seq_length
+ config["start_of_sequence_id"] = self.start_of_sequence_id
+ config["end_of_segment_id"] = self.end_of_segment_id
+ config["padding_id"] = self.padding_id
+ config["truncator"] = self.truncator
+ return config
+
+ def call(self, inputs: Union[tf.RaggedTensor, List[tf.RaggedTensor]]):
+ """Adds special tokens to pack a list of segments into BERT input Tensors.
+
+ Args:
+ inputs: A Python list of one or two RaggedTensors, each with the batched
+ values of one input segment. The j-th segment of the i-th input example
+ consists of slice `inputs[j][i, ...]`.
+
+ Returns:
+ A nest of Tensors for use as input to the BERT TransformerEncoder.
+ """ + # BertPackInputsSavedModelWrapper relies on only calling bert_pack_inputs() + return BertPackInputs.bert_pack_inputs( + inputs, self.seq_length, + start_of_sequence_id=self.start_of_sequence_id, + end_of_segment_id=self.end_of_segment_id, + padding_id=self.padding_id, + truncator=self.truncator) + + @staticmethod + def bert_pack_inputs(inputs: Union[tf.RaggedTensor, List[tf.RaggedTensor]], + seq_length: Union[int, tf.Tensor], + start_of_sequence_id: Union[int, tf.Tensor], + end_of_segment_id: Union[int, tf.Tensor], + padding_id: Union[int, tf.Tensor], + truncator="round_robin"): + """Freestanding equivalent of the BertPackInputs layer.""" + _check_if_tf_text_installed() + # Sanitize inputs. + if not isinstance(inputs, (list, tuple)): + inputs = [inputs] + if not inputs: + raise ValueError("At least one input is required for packing") + input_ranks = [rt.shape.rank for rt in inputs] + if None in input_ranks or len(set(input_ranks)) > 1: + raise ValueError("All inputs for packing must have the same known rank, " + "found ranks " + ",".join(input_ranks)) + # Flatten inputs to [batch_size, (tokens)]. + if input_ranks[0] > 2: + inputs = [rt.merge_dims(1, -1) for rt in inputs] + # In case inputs weren't truncated (as they should have been), + # fall back to some ad-hoc truncation. + num_special_tokens = len(inputs) + 1 + if truncator == "round_robin": + trimmed_segments = round_robin_truncate_inputs( + inputs, seq_length - num_special_tokens) + elif truncator == "waterfall": + trimmed_segments = text.WaterfallTrimmer( + seq_length - num_special_tokens).trim(inputs) + else: + raise ValueError("Unsupported truncator: %s" % truncator) + # Combine segments. + segments_combined, segment_ids = text.combine_segments( + trimmed_segments, + start_of_sequence_id=start_of_sequence_id, + end_of_segment_id=end_of_segment_id) + # Pad to dense Tensors. + input_word_ids, _ = text.pad_model_inputs(segments_combined, seq_length, + pad_value=padding_id) + input_type_ids, input_mask = text.pad_model_inputs(segment_ids, seq_length, + pad_value=0) + # Work around broken shape inference. + output_shape = tf.stack([ + inputs[0].nrows(out_type=tf.int32), # batch_size + tf.cast(seq_length, dtype=tf.int32)]) + def _reshape(t): + return tf.reshape(t, output_shape) + # Assemble nest of input tensors as expected by BERT TransformerEncoder. + return dict(input_word_ids=_reshape(input_word_ids), + input_mask=_reshape(input_mask), + input_type_ids=_reshape(input_type_ids)) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_expand_condense.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_expand_condense.py new file mode 100644 index 000000000..c4bd08c5d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_expand_condense.py @@ -0,0 +1,180 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ExpandCondense tensor network layer used in TN-BERT.""" +# pylint: disable=g-classes-have-attributes +from typing import List, Optional, Text, Any, Dict +import tensorflow as tf + +Layer = tf.keras.layers.Layer +activations = tf.keras.activations +initializers = tf.keras.initializers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class TNExpandCondense(Layer): + """A TPU-optimized TensorNetwork layer. + + Designed for use in models that currently use Dense layers to achieve + up projection followed by down projection. + + This layer is a TPU-optimized combination of 3 operations: + Expand, Apply Activation, and Condense. The layer projects up from + `input_shape[-1]` to `input_shape[-1] * proj_multiplier`, applies + `self.activation`, and then condenses back to `input_shape[-1]`. + + Note the input shape and output shape will be identical. + + Args: + proj_multiplier: Positive integer, multiple of `input_shape[-1]` to project + up to. Must be one of `[2, 4, 6, 8]`. + use_bias: Boolean, whether the layer uses a bias vector. + activation: Activation function to use between Expand and Condense. If you + don't specify anything, no activation is applied + (ie. "linear" activation: `a(x) = x`). + kernel_initializer: Initializer for the weight matrices. + bias_initializer: Initializer for the bias vector. + Input shape: + N-D tensor with shape: `(batch_size, ..., input_shape[-1])`. + Output shape: + N-D tensor with shape: `(batch_size, ..., input_shape[-1])`. + """ + + def __init__(self, + proj_multiplier: int, + use_bias: Optional[bool] = True, + activation: Optional[Text] = 'relu', + kernel_initializer: Optional[Text] = 'glorot_uniform', + bias_initializer: Optional[Text] = 'zeros', + **kwargs) -> None: + + # Allow specification of input_dim instead of input_shape, + # for compatability with Keras layers that support this + if 'input_shape' not in kwargs and 'input_dim' in kwargs: + kwargs['input_shape'] = (kwargs.pop('input_dim'),) + + super(TNExpandCondense, self).__init__(**kwargs) + + assert proj_multiplier in [ + 2, 4, 6, 8, 10, 12 + ], 'proj_multiplier needs to be one of [2, 4, 6, 8, 10, 12]' + self.proj_multiplier = proj_multiplier + + self.use_bias = use_bias + self.activation = activations.get(activation) + self.kernel_initializer = initializers.get(kernel_initializer) + self.bias_initializer = initializers.get(bias_initializer) + + def build(self, input_shape: List[int]) -> None: + # Disable the attribute-defined-outside-init violations in this function + # pylint: disable=attribute-defined-outside-init + if input_shape[-1] is None: + raise ValueError( + 'The last dimension of the inputs to `TNExpandCondense` ' + 'should be defined. 
Found `None`.') + + super(TNExpandCondense, self).build(input_shape) + + self.proj_size = self.proj_multiplier * input_shape[-1] + + assert (self.proj_size // input_shape[-1]) * input_shape[ + -1] == self.proj_size, (f'{self.proj_size} / {input_shape[-1]} must be ' + f'round') + assert (input_shape[-1] // 128 + ) * 128 == input_shape[-1], f'{input_shape[-1]} / 128 must be round' + + self.w1 = self.add_weight( + name='w1', + shape=(input_shape[-1], input_shape[-1]), + trainable=True, + initializer=self.kernel_initializer) + + self.w2 = self.add_weight( + name='w2', + shape=(128, (128 * (self.proj_size // input_shape[-1]))), + trainable=True, + initializer=self.kernel_initializer) + + self.w3 = self.add_weight( + name='w3', + shape=(128 * (self.proj_size // input_shape[-1]), 128), + trainable=True, + initializer=self.kernel_initializer) + self.w4 = self.add_weight( + name='w4', + shape=(input_shape[-1] // 128, 128, input_shape[-1]), + trainable=True, + initializer=self.kernel_initializer) + + if self.use_bias: + self.bias = self.add_weight( + name='b', + shape=(input_shape[-1] // 128, 1, + 128 * (self.proj_size // input_shape[-1])), + trainable=True, + initializer=self.bias_initializer) + else: + self.bias = None + + def call(self, inputs: tf.Tensor, **kwargs): + orig_shape = tf.shape(inputs) + input_dim = inputs.shape[-1] + tmp = tf.reshape(inputs, (-1, input_dim)) + # Shape is (BatchSeq, input_dim) + + # Expansion network + tmp = tf.einsum('ab,Qb->aQ', self.w1, tmp) + # Note: Letter Q will always represent the BatchSeq axis. + tmp = tf.reshape(tmp, (input_dim // 128, 128, -1)) + tmp = tf.einsum('abQ,bd->aQd', tmp, self.w2) + + # Apply activation and then Condense + tmp = self.activation(tmp + self.bias) + tmp = tf.einsum('aQd,db->aQb', tmp, self.w3) + tmp = tf.einsum('aQb,abd->Qd', tmp, self.w4) + + out = tf.reshape(tmp, orig_shape) + return out + + def compute_output_shape(self, input_shape: List[int]) -> List[int]: + return input_shape + + def get_config(self) -> Dict[Any, Any]: + """Returns the config of the layer. + + The same layer can be reinstantiated later + (without its trained weights) from this configuration. + + Returns: + Python dictionary containing the configuration of the layer. + """ + config = {} + + # Include the layer-specific arguments + args = ['proj_multiplier', 'use_bias'] + for arg in args: + config[arg] = getattr(self, arg) + + # Serialize the activation + config['activation'] = activations.serialize(getattr(self, 'activation')) + + # Serialize the initializers + decomp_initializers = ['kernel_initializer', 'bias_initializer'] + for initializer_arg in decomp_initializers: + config[initializer_arg] = initializers.serialize( + getattr(self, initializer_arg)) + + # Get base config + base_config = super(TNExpandCondense, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_transformer_expand_condense.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_transformer_expand_condense.py new file mode 100644 index 000000000..b563457ce --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/tn_transformer_expand_condense.py @@ -0,0 +1,253 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TN-BERT TNTransformerExpandCondense employing Expand-Condense layer instead of Dense.""" +# pylint: disable=g-classes-have-attributes +# Import libraries + +import gin +import tensorflow as tf + +from nlp_modeling.layers.tn_expand_condense import TNExpandCondense + + +@tf.keras.utils.register_keras_serializable(package="Text") +@gin.configurable +class TNTransformerExpandCondense(tf.keras.layers.Layer): + """Transformer layer using tensor network Expand-Condense layer. + + This layer implements the Transformer from transformer.py, with a single + tensor network layer replacing the usual intermediate and output Dense + layers. + + Args: + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate layer. + intermediate_activation: Activation for the intermediate layer. + dropout_rate: Dropout probability for the post-attention and output dropout. + attention_dropout_rate: Dropout probability for within the attention layer. + output_range: the sequence output range, [0, output_range) by slicing the + target sequence. `None` means the target sequence is not sliced. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + use_bias: Whether to enable use_bias in attention layer. If set to False, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate dense + layers. If set False, output of attention and intermediate dense layers is + normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + intermediate_dropout: Dropout probability for intermediate_dropout_layer. + attention_initializer: Initializer for kernels of attention layers. If set + `None`, attention layers use kernel_initializer as initializer for kernel. 
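  Example (an illustrative sketch; the BERT-base style sizes are assumptions, and the
  input width must be a multiple of 128 to satisfy the underlying TNExpandCondense layer):

    import tensorflow as tf

    block = TNTransformerExpandCondense(
        num_attention_heads=12,
        intermediate_size=3072,
        intermediate_activation="gelu")
    x = tf.random.uniform((2, 16, 768))  # width 768 is a multiple of 128
    y = block(x)                         # output keeps the input shape: (2, 16, 768)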
+ """ + + def __init__(self, + num_attention_heads, + intermediate_size, + intermediate_activation, + dropout_rate=0.0, + attention_dropout_rate=0.0, + output_range=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_bias=True, + norm_first=False, + norm_epsilon=1e-12, + intermediate_dropout=0.0, + attention_initializer=None, + **kwargs): + super(TNTransformerExpandCondense, self).__init__(**kwargs) + + self._num_heads = num_attention_heads + self._intermediate_size = intermediate_size + self._intermediate_activation = intermediate_activation + self._attention_dropout_rate = attention_dropout_rate + self._dropout_rate = dropout_rate + self._output_range = output_range + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._use_bias = use_bias + self._norm_first = norm_first + self._norm_epsilon = norm_epsilon + self._intermediate_dropout = intermediate_dropout + if attention_initializer: + self._attention_initializer = tf.keras.initializers.get( + attention_initializer) + else: + self._attention_initializer = self._kernel_initializer + + def build(self, input_shape): + input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape + input_tensor_shape = tf.TensorShape(input_tensor) + if len(input_tensor_shape.as_list()) != 3: + raise ValueError( + "TNTransformerExpandCondense expects a three-dimensional input of " + "shape [batch, sequence, width].") + batch_size, sequence_length, hidden_size = input_tensor_shape + + if len(input_shape) == 2: + mask_tensor_shape = tf.TensorShape(input_shape[1]) + expected_mask_tensor_shape = tf.TensorShape( + [batch_size, sequence_length, sequence_length]) + if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape): + raise ValueError( + "When passing a mask tensor to TNTransformerExpandCondense, the " + "mask tensor must be of shape [batch, " + "sequence_length, sequence_length] (here %s). Got a " + "mask tensor of shape %s." % + (expected_mask_tensor_shape, mask_tensor_shape)) + if hidden_size % self._num_heads != 0: + raise ValueError( + "The input size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self._num_heads)) + self._attention_head_size = int(hidden_size // self._num_heads) + common_kwargs = dict( + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + self._attention_layer = tf.keras.layers.MultiHeadAttention( + num_heads=self._num_heads, + key_dim=self._attention_head_size, + dropout=self._attention_dropout_rate, + use_bias=self._use_bias, + kernel_initializer=self._attention_initializer, + name="self_attention", + **common_kwargs) + self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + # Use float32 in layernorm for numeric stability. 
+ # It is probably safe in mixed_float16, but we haven't validated this yet. + self._attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32)) + + # Substitute Dense layers with a single Expand-Condense layer. + self._output_dense = TNExpandCondense( + 4, + use_bias=True, + activation=self._intermediate_activation, + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer) + + self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + # Use float32 in layernorm for numeric stability. + self._output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32) + + super(TNTransformerExpandCondense, self).build(input_shape) + + def get_config(self): + config = { + "num_attention_heads": + self._num_heads, + "intermediate_size": + self._intermediate_size, + "intermediate_activation": + self._intermediate_activation, + "dropout_rate": + self._dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "output_range": + self._output_range, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + "use_bias": + self._use_bias, + "norm_first": + self._norm_first, + "norm_epsilon": + self._norm_epsilon, + "intermediate_dropout": + self._intermediate_dropout, + "attention_initializer": + tf.keras.initializers.serialize(self._attention_initializer) + } + base_config = super(TNTransformerExpandCondense, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs): + if isinstance(inputs, (list, tuple)) and len(inputs) == 2: + input_tensor, attention_mask = inputs + else: + input_tensor, attention_mask = (inputs, None) + + if self._output_range: + target_tensor = input_tensor[:, 0:self._output_range, :] + attention_mask = attention_mask[:, 0:self._output_range, :] + else: + if self._norm_first: + source_tensor = input_tensor + input_tensor = self._attention_layer_norm(input_tensor) + target_tensor = input_tensor + + attention_output = self._attention_layer( + query=target_tensor, value=input_tensor, attention_mask=attention_mask) + attention_output = self._attention_dropout(attention_output) + if self._norm_first: + attention_output = source_tensor + attention_output + else: + attention_output = self._attention_layer_norm(target_tensor + + attention_output) + if self._norm_first: + source_attention_output = attention_output + attention_output = self._output_layer_norm(attention_output) + + layer_output = self._output_dense(attention_output) + layer_output = self._output_dropout(layer_output) + # During mixed precision training, attention_output is from layer norm and + # is always fp32 for now. Cast layer_output to fp32 for the subsequent + # add. 
+ layer_output = tf.cast(layer_output, tf.float32) + if self._norm_first: + layer_output = source_attention_output + layer_output + else: + layer_output = self._output_layer_norm(layer_output + attention_output) + + return layer_output diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer.py new file mode 100644 index 000000000..274951205 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer.py @@ -0,0 +1,431 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based transformer block layer.""" +# pylint: disable=g-classes-have-attributes + +import gin +import tensorflow as tf + +import keras_nlp +from nlp_modeling.layers import attention +from nlp_modeling.layers import multi_channel_attention +from nlp_modeling.layers.util import tf_function_if_eager + + +@tf.keras.utils.register_keras_serializable(package="Text") +class Transformer(keras_nlp.layers.TransformerEncoderBlock): + """Transformer layer. + + This layer implements the Transformer from "Attention Is All You Need". + (https://arxiv.org/abs/1706.03762). + + Args: + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate layer. + intermediate_activation: Activation for the intermediate layer. + dropout_rate: Dropout probability for the post-attention and output dropout. + attention_dropout_rate: Dropout probability for within the attention layer. + output_range: the sequence output range, [0, output_range) by slicing the + target sequence. `None` means the target sequence is not sliced. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + use_bias: Whether to enable use_bias in attention layer. If set False, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate dense + layers. If set False, output of attention and intermediate dense layers is + normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + intermediate_dropout: Dropout probability for intermediate_dropout_layer. + attention_initializer: Initializer for kernels of attention layers. If set + `None`, attention layers use kernel_initializer as initializer for kernel. 
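  Example (an illustrative sketch with assumed BERT-base style sizes; the layer simply
  forwards these arguments to `keras_nlp.layers.TransformerEncoderBlock`):

    import tensorflow as tf

    block = Transformer(
        num_attention_heads=12,
        intermediate_size=3072,
        intermediate_activation="gelu",
        dropout_rate=0.1,
        attention_dropout_rate=0.1)
    y = block(tf.random.uniform((2, 16, 768)))  # output keeps the input shape: (2, 16, 768)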
+ """ + + def __init__(self, + num_attention_heads, + intermediate_size, + intermediate_activation, + dropout_rate=0.0, + attention_dropout_rate=0.0, + output_range=None, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_bias=True, + norm_first=False, + norm_epsilon=1e-12, + intermediate_dropout=0.0, + attention_initializer=None, + **kwargs): + super().__init__( + num_attention_heads=num_attention_heads, + inner_dim=intermediate_size, + inner_activation=intermediate_activation, + output_dropout=dropout_rate, + attention_dropout=attention_dropout_rate, + output_range=output_range, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + activity_regularizer=activity_regularizer, + kernel_constraint=kernel_constraint, + bias_constraint=bias_constraint, + use_bias=use_bias, + norm_first=norm_first, + norm_epsilon=norm_epsilon, + inner_dropout=intermediate_dropout, + attention_initializer=attention_initializer, + **kwargs) + + def get_config(self): + return { + "num_attention_heads": + self._num_heads, + "intermediate_size": + self._inner_dim, + "intermediate_activation": + self._inner_activation, + "dropout_rate": + self._output_dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "output_range": + self._output_range, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + "use_bias": + self._use_bias, + "norm_first": + self._norm_first, + "norm_epsilon": + self._norm_epsilon, + "intermediate_dropout": + self._inner_dropout, + "attention_initializer": + tf.keras.initializers.serialize(self._attention_initializer) + } + + +@tf.keras.utils.register_keras_serializable(package="Text") +@gin.configurable +class CompiledTransformer(Transformer): + + @tf_function_if_eager(experimental_compile=True) + def call(self, inputs): + return super().call(inputs) + + +@tf.keras.utils.register_keras_serializable(package="Text") +class TransformerDecoderBlock(tf.keras.layers.Layer): + """Single transformer layer for decoder. + + It has three sub-layers: + (1) a multi-head self-attention mechanism. + (2) a encoder-decoder attention. + (3) a positionwise fully connected feed-forward network. + + Args: + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate layer. + intermediate_activation: Activation for the intermediate layer. + dropout_rate: Dropout probability for the post-attention and output dropout. + attention_dropout_rate: Dropout probability for within the attention layer. + multi_channel_cross_attention: Whether to use `MultiChannelAttention` for + cross-attention between target sequences and source sequences. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. 
+ kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. + use_bias: Whether to enable use_bias in attention layer. If set False, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate dense + layers. If set False, output of attention and intermediate dense layers is + normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + intermediate_dropout: Dropout probability for intermediate_dropout_layer. + attention_initializer: Initializer for kernels of attention layers. If set + `None`, attention layers use kernel_initializer as initializer for kernel. + """ + + def __init__(self, + num_attention_heads, + intermediate_size, + intermediate_activation, + dropout_rate=0.0, + attention_dropout_rate=0.0, + multi_channel_cross_attention=False, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + use_bias=True, + norm_first=False, + norm_epsilon=1e-12, + intermediate_dropout=0.0, + attention_initializer=None, + **kwargs): + super().__init__(**kwargs) + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.intermediate_activation = tf.keras.activations.get( + intermediate_activation) + self.dropout_rate = dropout_rate + self.attention_dropout_rate = attention_dropout_rate + self.multi_channel_cross_attention = multi_channel_cross_attention + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._activity_regularizer = tf.keras.regularizers.get(activity_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._use_bias = use_bias + self._norm_first = norm_first + self._norm_epsilon = norm_epsilon + self._intermediate_dropout = intermediate_dropout + if attention_initializer: + self._attention_initializer = tf.keras.initializers.get( + attention_initializer) + else: + self._attention_initializer = self._kernel_initializer + if self.multi_channel_cross_attention: + self._cross_attention_cls = multi_channel_attention.MultiChannelAttention + else: + self._cross_attention_cls = attention.MultiHeadAttention + + def build(self, input_shape): + target_tensor_shape = tf.TensorShape(input_shape[0]) + if len(target_tensor_shape.as_list()) != 3: + raise ValueError("TransformerLayer expects a three-dimensional input of " + "shape [batch, sequence, width].") + hidden_size = target_tensor_shape[2] + if hidden_size % self.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self.num_attention_heads)) + self.attention_head_size = int(hidden_size) // self.num_attention_heads + common_kwargs = dict( + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + 
kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + # Self attention. + self.self_attention = attention.CachedAttention( + num_heads=self.num_attention_heads, + key_dim=self.attention_head_size, + dropout=self.attention_dropout_rate, + use_bias=self._use_bias, + kernel_initializer=self._attention_initializer, + name="self_attention", + **common_kwargs) + self.self_attention_output_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, hidden_size), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="output", + **common_kwargs) + self.self_attention_dropout = tf.keras.layers.Dropout( + rate=self.dropout_rate) + self.self_attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype="float32")) + # Encoder-decoder attention. + self.encdec_attention = self._cross_attention_cls( + num_heads=self.num_attention_heads, + key_dim=self.attention_head_size, + dropout=self.attention_dropout_rate, + output_shape=hidden_size, + use_bias=self._use_bias, + kernel_initializer=self._attention_initializer, + name="attention/encdec", + **common_kwargs) + + self.encdec_attention_dropout = tf.keras.layers.Dropout( + rate=self.dropout_rate) + self.encdec_attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="attention/encdec_output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype="float32")) + + # Feed-forward projection. + self.intermediate_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, self.intermediate_size), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="intermediate", + **common_kwargs) + self.intermediate_activation_layer = tf.keras.layers.Activation( + self.intermediate_activation) + self._intermediate_dropout_layer = tf.keras.layers.Dropout( + rate=self._intermediate_dropout) + self.output_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, hidden_size), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="output", + **common_kwargs) + self.output_dropout = tf.keras.layers.Dropout(rate=self.dropout_rate) + self.output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype="float32") + super().build(input_shape) + + def get_config(self): + config = { + "num_attention_heads": + self.num_attention_heads, + "intermediate_size": + self.intermediate_size, + "intermediate_activation": + self.intermediate_activation, + "dropout_rate": + self.dropout_rate, + "attention_dropout_rate": + self.attention_dropout_rate, + "multi_channel_cross_attention": + self.multi_channel_cross_attention, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint), + "use_bias": + self._use_bias, + "norm_first": + self._norm_first, + "norm_epsilon": + self._norm_epsilon, + "intermediate_dropout": + self._intermediate_dropout, + 
"attention_initializer": + tf.keras.initializers.serialize(self._attention_initializer) + } + base_config = super().get_config() + return dict(list(base_config.items()) + list(config.items())) + + def common_layers_with_encoder(self): + """Gets layer objects that can make a Transformer encoder block.""" + return [ + self.self_attention, self.self_attention_layer_norm, + self.intermediate_dense, self.output_dense, self.output_layer_norm + ] + + def call(self, inputs, cache=None, decode_loop_step=None): + if self.multi_channel_cross_attention: + if len(inputs) != 5: + raise ValueError( + "TransformerDecoderBlock must have 5 inputs, when it uses " + "multi_channel_cross_attention. But it got: %d" % len(inputs)) + elif len(inputs) != 4: + raise ValueError( + "TransformerDecoderBlock must have 4 inputs, but it got: %d" % + len(inputs)) + input_tensor, memory, attention_mask, self_attention_mask = inputs[:4] + source_tensor = input_tensor + if self._norm_first: + input_tensor = self.self_attention_layer_norm(input_tensor) + self_attention_output, cache = self.self_attention( + query=input_tensor, + value=input_tensor, + attention_mask=self_attention_mask, + cache=cache, + decode_loop_step=decode_loop_step) + self_attention_output = self.self_attention_dropout(self_attention_output) + if self._norm_first: + self_attention_output = source_tensor + self_attention_output + else: + self_attention_output = self.self_attention_layer_norm( + input_tensor + self_attention_output) + if self._norm_first: + source_self_attention_output = self_attention_output + self_attention_output = self.encdec_attention_layer_norm( + self_attention_output) + cross_attn_inputs = dict( + query=self_attention_output, + value=memory, + attention_mask=attention_mask) + if self.multi_channel_cross_attention: + # Accesses the 5-th input tensor for the doc-attention probabilities. + cross_attn_inputs["context_attention_weights"] = inputs[-1] + attention_output = self.encdec_attention(**cross_attn_inputs) + attention_output = self.encdec_attention_dropout(attention_output) + if self._norm_first: + attention_output = source_self_attention_output + attention_output + else: + attention_output = self.encdec_attention_layer_norm( + self_attention_output + attention_output) + if self._norm_first: + source_attention_output = attention_output + attention_output = self.output_layer_norm(attention_output) + + intermediate_output = self.intermediate_dense(attention_output) + intermediate_output = self.intermediate_activation_layer( + intermediate_output) + intermediate_output = self._intermediate_dropout_layer(intermediate_output) + layer_output = self.output_dense(intermediate_output) + layer_output = self.output_dropout(layer_output) + if self._norm_first: + layer_output = source_attention_output + layer_output + else: + layer_output = self.output_layer_norm(layer_output + attention_output) + return layer_output, cache diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_scaffold.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_scaffold.py new file mode 100644 index 000000000..3e371cae9 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_scaffold.py @@ -0,0 +1,305 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based transformer scaffold layer.""" +# pylint: disable=g-classes-have-attributes + +from absl import logging +import gin +import tensorflow as tf + +from nlp_modeling.layers import attention + + +@tf.keras.utils.register_keras_serializable(package="Text") +@gin.configurable +class TransformerScaffold(tf.keras.layers.Layer): + """Transformer scaffold layer. + + This layer implements the Transformer from "Attention Is All You Need". + (https://arxiv.org/abs/1706.03762), with a customizable attention layer and + feedforward layer option. Users can pass a class to + `attention_cls`/`feedforward_cls` and associated config to + `attention_cfg`/`feedforward_cfg`, in which case the scaffold will + instantiate the class with the config, or pass a class instance to + `attention_cls`/`feedforward_cls`. + + Args: + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate layer. + intermediate_activation: Activation for the intermediate layer. + attention_cls: A class to instantiate attention layer, or a layer instance. + attention_cfg: The config with which to instantiate `attention_cls`. Ignored + if attention_cls is a layer instance or None. If `attention_cls` is a + class, but `attention_cfg` is None, following kwargs will be used to + instantiate the attention instance: { + "num_heads": num_attention_heads, + "key_dim": int(hidden_size // num_attention_heads), + "dropout": attention_dropout_rate, + "name": "self_attention" }, where `hidden_size` is the input tensor's + last dimension. + feedforward_cls: A class to instantiate feedforward layer, or a layer + instance. If None, will use the standard feedforward layer as described in + "Attention Is All You Need" paper. If not None, the instantiated + feedforward layer is expected to take the output of attention as input and + its output is this transformer layer's output. + feedforward_cfg: The config with which to instantiate `feedforward_cls`. + Ignored if feedforward_cls is a layer instance or is None. If + `feedforward_cls` is a class, but `feedforward_cfg` is None, following + kwargs will be used to instantiate the feedforward instance: { + "intermediate_size": intermediate_size, + "intermediate_activation": intermediate_activation, + "dropout": dropout_rate, + "name": "feedforward" }. + dropout_rate: Dropout probability for the post-attention and output dropout. + attention_dropout_rate: Dropout probability for within the attention layer. + kernel_initializer: Initializer for dense layer kernels. + bias_initializer: Initializer for dense layer biases. + kernel_regularizer: Regularizer for dense layer kernels. + bias_regularizer: Regularizer for dense layer biases. + activity_regularizer: Regularizer for dense layer activity. + kernel_constraint: Constraint for dense layer kernels. + bias_constraint: Constraint for dense layer kernels. 
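  Example (an illustrative sketch; the sizes are assumptions, and the default attention
  class is passed explicitly only to show the scaffold plumbing):

    import tensorflow as tf

    scaffold = TransformerScaffold(
        num_attention_heads=12,
        intermediate_size=3072,
        intermediate_activation="gelu",
        attention_cls=attention.MultiHeadAttention,  # a class; a layer instance also works
        attention_cfg=None)  # None -> the default num_heads/key_dim/dropout config listed above
    y = scaffold(tf.random.uniform((2, 16, 768)))    # output keeps the input shape: (2, 16, 768)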
+ """ + + def __init__(self, + num_attention_heads, + intermediate_size, + intermediate_activation, + attention_cls=attention.MultiHeadAttention, + attention_cfg=None, + feedforward_cls=None, + feedforward_cfg=None, + dropout_rate=0.0, + attention_dropout_rate=0.0, + norm_first=False, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + activity_regularizer=None, + kernel_constraint=None, + bias_constraint=None, + **kwargs): + super(TransformerScaffold, self).__init__(**kwargs) + + self._attention_cfg = attention_cfg + self._attention_cls = attention_cls + self._feedforward_cls = feedforward_cls + self._feedforward_cfg = feedforward_cfg + self._norm_first = norm_first + self._num_heads = num_attention_heads + self._intermediate_size = intermediate_size + self._intermediate_activation = intermediate_activation + self._attention_dropout_rate = attention_dropout_rate + self._dropout_rate = dropout_rate + self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self._bias_initializer = tf.keras.initializers.get(bias_initializer) + self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self._bias_constraint = tf.keras.constraints.get(bias_constraint) + + def build(self, input_shape): + input_tensor_shape = input_shape[0] if ( + len(input_shape) == 2) else input_shape + input_tensor_shape = tf.TensorShape(input_tensor_shape) + if len(input_tensor_shape.as_list()) != 3: + raise ValueError( + "TransformerScaffold expects a three-dimensional input of " + "shape [batch, sequence, width].") + hidden_size = input_tensor_shape[-1] + if hidden_size % self._num_heads != 0: + raise ValueError( + "The input size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self._num_heads)) + self._attention_head_size = int(hidden_size // self._num_heads) + + common_kwargs = dict( + kernel_initializer=self._kernel_initializer, + bias_initializer=self._bias_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer, + activity_regularizer=self._activity_regularizer, + kernel_constraint=self._kernel_constraint, + bias_constraint=self._bias_constraint) + + def get_layer_instance(instance_or_cls, config, default_config): + if isinstance(instance_or_cls, tf.keras.layers.Layer): + return instance_or_cls + else: + if config is None: + return instance_or_cls(**default_config) + else: + return instance_or_cls(**config) + + default_attention_cfg = { + "num_heads": self._num_heads, + "key_dim": self._attention_head_size, + "dropout": self._attention_dropout_rate, + "name": "self_attention" + } + default_attention_cfg.update(common_kwargs) + self._attention_layer = get_layer_instance( + self._attention_cls, + config=self._attention_cfg, + default_config=default_attention_cfg) + + if self._feedforward_cls is not None: + default_feedforward_cfg = { + "intermediate_size": self._intermediate_size, + "intermediate_activation": self._intermediate_activation, + "dropout": self._dropout_rate, + "name": "feedforward", + } + default_feedforward_cfg.update(common_kwargs) + self._feedforward_block = get_layer_instance( + self._feedforward_cls, + config=self._feedforward_cfg, + default_config=default_feedforward_cfg) + else: + self._feedforward_block = None + + # self._dropout_rate controls dropout rates at two places: + # 
after attention, and after FFN. + self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + # Use float32 in layernorm for numeric stability. + # It is probably safe in mixed_float16, but we haven't validated this yet. + self._attention_layer_norm = ( + tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=1e-12, + dtype=tf.float32)) + + if self._feedforward_block is None: + self._intermediate_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, self._intermediate_size), + bias_axes="d", + name="intermediate", + **common_kwargs) + policy = tf.keras.mixed_precision.global_policy() + if policy.name == "mixed_bfloat16": + # bfloat16 causes BERT with the LAMB optimizer to not converge + # as well, so we use float32. + # TODO(b/154538392): Investigate this. + policy = tf.float32 + self._intermediate_activation_layer = tf.keras.layers.Activation( + self._intermediate_activation, dtype=policy) + self._output_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, hidden_size), + bias_axes="d", + name="output", + **common_kwargs) + + self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + # Use float32 in layernorm for numeric stability. + self._output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32) + + super(TransformerScaffold, self).build(input_shape) + logging.info("%s configs: %s", self.__class__.__name__, self.get_config()) + + def get_config(self): + config = { + "attention_cls": + self._attention_layer, + "feedforward_cls": + self._feedforward_block, + "num_attention_heads": + self._num_heads, + "intermediate_size": + self._intermediate_size, + "intermediate_activation": + self._intermediate_activation, + "dropout_rate": + self._dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "norm_first": + self._norm_first, + "kernel_initializer": + tf.keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": + tf.keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": + tf.keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": + tf.keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": + tf.keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": + tf.keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": + tf.keras.constraints.serialize(self._bias_constraint) + } + base_config = super(TransformerScaffold, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs, training=None): + if isinstance(inputs, (list, tuple)) and len(inputs) == 2: + input_tensor, attention_mask = inputs + else: + input_tensor, attention_mask = (inputs, None) + + if self._norm_first: + source_tensor = input_tensor + input_tensor = self._attention_layer_norm(input_tensor, training=training) + + attention_output = self._attention_layer( + query=input_tensor, value=input_tensor, attention_mask=attention_mask, + training=training) + attention_output = self._attention_dropout(attention_output, + training=training) + + if self._norm_first: + attention_output = source_tensor + attention_output + else: + attention_output = self._attention_layer_norm(input_tensor + + attention_output, + training=training) + if self._norm_first: + source_attention_output = attention_output + attention_output = 
self._output_layer_norm(attention_output, + training=training) + + if self._feedforward_block is None: + intermediate_output = self._intermediate_dense(attention_output) + intermediate_output = self._intermediate_activation_layer( + intermediate_output) + layer_output = self._output_dense(intermediate_output, training=training) + layer_output = self._output_dropout(layer_output, training=training) + # During mixed precision training, attention_output is from layer norm + # and is always fp32 for now. Cast layer_output to fp32 for the subsequent + # add. + layer_output = tf.cast(layer_output, tf.float32) + if self._norm_first: + layer_output = source_attention_output + layer_output + else: + layer_output = self._output_layer_norm(layer_output + attention_output, + training=training) + else: + if self._norm_first: + # if norm_first, assume the feedforward block will not apply layer norm + layer_output = self._feedforward_block(attention_output, + training=training) + layer_output += source_attention_output + else: + # if not norm_first, assume that the feedforwad does apply layer norm + layer_output = self._feedforward_block(attention_output, + training=training) + + return layer_output diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_xl.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_xl.py new file mode 100644 index 000000000..96b0322cc --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/transformer_xl.py @@ -0,0 +1,559 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based Transformer XL layer.""" + +from absl import logging + +import tensorflow as tf + +from nlp_modeling.layers import relative_attention + + +def _cache_memory(current_state, previous_state, memory_length, reuse_length=0): + """Caches hidden states into memory. + + Args: + current_state: `Tensor`, the current state. + previous_state: `Tensor`, the previous state. + memory_length: `int`, the number of tokens to cache. + reuse_length: `int`, the number of tokens in the current batch to be cached + and reused in the future. + + Returns: + A `Tensor`, representing the cached state with stopped gradients. + + """ + if memory_length is None or memory_length == 0: + return None + else: + if reuse_length > 0: + current_state = current_state[:, :reuse_length, :] + + if previous_state is None: + new_mem = current_state[:, -memory_length:, :] + else: + new_mem = tf.concat( + [previous_state, current_state], 1)[:, -memory_length:, :] + + return tf.stop_gradient(new_mem) + + +@tf.keras.utils.register_keras_serializable(package="Text") +class TransformerXLBlock(tf.keras.layers.Layer): + """Transformer XL block. + + This implements a Transformer XL block from "Transformer-XL: Attentive + Language Models Beyond a Fixed-Length Context" + (https://arxiv.org/abs/1901.02860). 
+ + This block is further extended to allow for the Transformer-XL + re-parameterization in "XLNet: Generalized Autoregressive Pretraining for + Language Understanding" (https://arxiv.org/abs/1906.08237). + + Given an input stream, this block computes attention, applies dropouts and + layer norms and feeds into the FFN network. + + **Note: This layer is currently experimental. + + Attributes: + vocab_size: The size of the token vocabulary. + hidden_size: The size of the transformer hidden layers. + num_attention_heads: The number of attention heads. + head_size: The dimension size of each attention head. + inner_size: The inner size for the transformer layers. + dropout_rate: Dropout rate for the output of this layer. + attention_dropout_rate: Dropout rate on attention probabilities. + two_stream: Whether or not to use `TwoStreamRelativeAttention` used in the + XLNet pretrainer. If `False`, then it will use + `MultiHeadRelativeAttention` as in Transformer XL. + norm_epsilon: Epsilon value to initialize normalization layers. + inner_activation: The activation to use for the inner + FFN layers. + kernel_initializer: Initializer for dense layer kernels. + inner_dropout: Dropout probability for the inner dropout + layer. + """ + + def __init__(self, + vocab_size, + hidden_size, + num_attention_heads, + head_size, + inner_size, + dropout_rate, + attention_dropout_rate, + two_stream=False, + norm_epsilon=1e-12, + inner_activation="relu", + kernel_initializer="variance_scaling", + inner_dropout=0.0, + **kwargs): + """Initializes TransformerXLBlock layer.""" + + super(TransformerXLBlock, self).__init__(**kwargs) + self._vocab_size = vocab_size + self._num_heads = num_attention_heads + self._head_size = head_size + self._hidden_size = hidden_size + self._inner_size = inner_size + self._dropout_rate = dropout_rate + self._attention_dropout_rate = attention_dropout_rate + self._inner_activation = inner_activation + self._norm_epsilon = norm_epsilon + self._kernel_initializer = kernel_initializer + self._inner_dropout = inner_dropout + self._two_stream = two_stream + if two_stream: + self._attention_layer_type = relative_attention.TwoStreamRelativeAttention + else: + self._attention_layer_type = relative_attention.MultiHeadRelativeAttention + + def build(self, input_shape): + input_tensor = input_shape[0] if len(input_shape) == 2 else input_shape + input_tensor_shape = tf.TensorShape(input_tensor) + if len(input_tensor_shape.as_list()) != 3: + raise ValueError("TransformerLayer expects a three-dimensional input of " + "shape [batch, sequence, width].") + batch_size, sequence_length, hidden_size = input_tensor_shape + + if len(input_shape) == 2: + mask_tensor_shape = tf.TensorShape(input_shape[1]) + expected_mask_tensor_shape = tf.TensorShape( + [batch_size, sequence_length, sequence_length]) + if not expected_mask_tensor_shape.is_compatible_with(mask_tensor_shape): + raise ValueError("When passing a mask tensor to TransformerXLBlock, " + "the mask tensor must be of shape [batch, " + "sequence_length, sequence_length] (here %s). Got a " + "mask tensor of shape %s." 
% + (expected_mask_tensor_shape, mask_tensor_shape)) + if hidden_size % self._num_heads != 0: + raise ValueError( + "The input size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, self._num_heads)) + self._attention_layer = self._attention_layer_type( + num_heads=self._num_heads, + key_dim=self._head_size, + value_dim=self._head_size, + dropout=self._attention_dropout_rate, + use_bias=False, + kernel_initializer=self._kernel_initializer, + name="rel_attn") + self._attention_dropout = tf.keras.layers.Dropout( + rate=self._attention_dropout_rate) + self._attention_layer_norm = tf.keras.layers.LayerNormalization( + name="self_attention_layer_norm", + axis=-1, + epsilon=self._norm_epsilon, + dtype=tf.float32) + self._inner_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, self._inner_size), + bias_axes="d", + kernel_initializer=self._kernel_initializer, + name="inner") + + self._inner_activation_layer = tf.keras.layers.Activation( + self._inner_activation) + self._inner_dropout_layer = tf.keras.layers.Dropout( + rate=self._inner_dropout) + self._output_dense = tf.keras.layers.experimental.EinsumDense( + "abc,cd->abd", + output_shape=(None, hidden_size), + bias_axes="d", + name="output", + kernel_initializer=self._kernel_initializer) + self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + self._output_layer_norm = tf.keras.layers.LayerNormalization( + name="output_layer_norm", + axis=-1, + epsilon=self._norm_epsilon) + + super(TransformerXLBlock, self).build(input_shape) + + def get_config(self): + config = { + "vocab_size": + self._vocab_size, + "hidden_size": + self._hidden_size, + "num_attention_heads": + self._num_heads, + "head_size": + self._head_size, + "inner_size": + self._inner_size, + "dropout_rate": + self._dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "two_stream": + self._two_stream, + "norm_epsilon": + self._norm_epsilon, + "inner_activation": + self._inner_activation, + "kernel_initializer": + self._kernel_initializer, + "inner_dropout": + self._inner_dropout, + } + base_config = super(TransformerXLBlock, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, + content_stream, + content_attention_bias, + positional_attention_bias, + relative_position_encoding=None, + segment_matrix=None, + segment_encoding=None, + segment_attention_bias=None, + state=None, + content_attention_mask=None, + query_stream=None, + query_attention_mask=None, + target_mapping=None): + """Implements `call` for the Layer. + + Args: + content_stream: `Tensor`, the input content stream. This is the standard + input to Transformer XL and is commonly referred to as `h` in XLNet. + content_attention_bias: Bias `Tensor` for content based attention of shape + `[num_heads, dim]`. + positional_attention_bias: Bias `Tensor` for position based attention of + shape `[num_heads, dim]`. + relative_position_encoding: Relative positional encoding `Tensor` of shape + `[B, L, dim]`. + segment_matrix: Optional `Tensor` of shape `[B, S, S + M]`. Used in XLNet, + but not in Transformer XL. + segment_encoding: Optional `Tensor` of shape `[2, num_heads, dim]`. Used + in XLNet, but not in Transformer XL. + segment_attention_bias: Optional bias `Tensor` for segment based attention + of shape `[num_heads, dim]`. + state: Optional `Tensor` of shape `[B, M, E]`, where M is the length of + the state or memory. 
If passed, this is also attended over as in + Transformer XL. + content_attention_mask: Optional `Tensor` representing the mask that is + added to content attention logits. If state is not None, the mask source + sequence dimension should extend M. + query_stream: Optional `Tensor`, the query stream. This is introduced in + `TwoStreamRelativeAttention`/XLNet pretrainer. This is ignored if + `two_stream` is `False`. + query_attention_mask: Optional `Tensor` representing the mask that is + added to query attention logits. If state is not None, the mask source + sequence dimension should extend M. + target_mapping: Optional `Tensor` representing the target mapping when + calculating query attention. + + Returns: + A `dict` object, containing the key value pairs for `content_attention` + and (if `two_stream` is `True`) `query_attention`. + + """ + if not self._two_stream and query_stream is not None: + logging.warning("`query_stream` was provided but two stream attention is " + "disabled. `query_stream` will be ignored.") + if self._two_stream: + attention_kwargs = dict( + content_stream=content_stream, + query_stream=query_stream, + query_attention_mask=query_attention_mask, + target_mapping=target_mapping, + content_attention_mask=content_attention_mask) + else: + attention_kwargs = dict( + query=content_stream, + value=content_stream, + key=content_stream, + attention_mask=content_attention_mask) + + common_attention_kwargs = dict( + content_attention_bias=content_attention_bias, + relative_position_encoding=relative_position_encoding, + positional_attention_bias=positional_attention_bias, + segment_matrix=segment_matrix, + segment_encoding=segment_encoding, + segment_attention_bias=segment_attention_bias, + state=state) + + attention_kwargs.update(common_attention_kwargs) + attention_output = self._attention_layer(**attention_kwargs) + + if self._two_stream: + attention_streams = attention_output + input_streams = [content_stream, query_stream] + else: + attention_streams = [attention_output] + input_streams = [content_stream] + + attention_keys = ["content_attention", "query_attention"] + attention_output = {} + for attention_stream, input_stream, attention_key in zip( + attention_streams, input_streams, attention_keys): + attention_stream = self._attention_dropout(attention_stream) + attention_stream = self._attention_layer_norm( + attention_stream + input_stream) + inner_output = self._inner_dense(attention_stream) + inner_output = self._inner_activation_layer( + inner_output) + inner_output = self._inner_dropout_layer( + inner_output) + layer_output = self._output_dense(inner_output) + layer_output = self._output_dropout(layer_output) + layer_output = self._output_layer_norm(layer_output + attention_stream) + attention_output[attention_key] = layer_output + + return attention_output + + +class TransformerXL(tf.keras.layers.Layer): + """Transformer XL. + + This layer combines multiple Transformer XL blocks from "Transformer-XL: + Attentive Language Models Beyond a Fixed-Length Context" + (https://arxiv.org/abs/1901.02860). + + This layer handles the attention biases as well as memory caching and reuse + as in Transformer XL and XLNet. + + + Attributes: + vocab_size: The number of tokens in vocabulary. + num_layers: The number of layers. + hidden_size: The hidden size. + num_attention_heads: The number of attention heads. + head_size: The dimension size of each attention head. + inner_size: The hidden size in feed-forward layers. 
+ dropout_rate: Dropout rate used in each Transformer XL block. + attention_dropout_rate: Dropout rate on attention probabilities. + two_stream: Whether or not to use `TwoStreamRelativeAttention` used + in the XLNet pretrainer. If `False`, then it will use + `MultiHeadRelativeAttention` as in Transformer XL. + initializer: The initializer to use for attention biases. + tie_attention_biases: Whether or not to tie biases together. If `True`, then + each Transformer XL block shares the same trainable attention bias. If + `False`, then each block has its own attention bias. This is usually set + to `True`. + memory_length: The number of tokens to cache. + reuse_length: The number of tokens in the current batch to be cached + and reused in the future. + inner_activation: The activation to use in the inner layers + for Transformer XL blocks. Typically "relu" or "gelu". + """ + + def __init__(self, + vocab_size, + num_layers, + hidden_size, + num_attention_heads, + head_size, + inner_size, + dropout_rate, + attention_dropout_rate, + initializer, + two_stream=False, + tie_attention_biases=True, + memory_length=None, + reuse_length=None, + inner_activation="relu", + **kwargs): + """Initializes TransformerXL.""" + super(TransformerXL, self).__init__(**kwargs) + + self._vocab_size = vocab_size + self._initializer = initializer + self._num_layers = num_layers + self._hidden_size = hidden_size + self._num_attention_heads = num_attention_heads + self._head_size = head_size + self._inner_size = inner_size + self._inner_activation = inner_activation + self._dropout_rate = dropout_rate + self._attention_dropout_rate = attention_dropout_rate + self._tie_attention_biases = tie_attention_biases + self._two_stream = two_stream + + self._memory_length = memory_length + self._reuse_length = reuse_length + + if self._tie_attention_biases: + attention_bias_shape = [self._num_attention_heads, self._head_size] + else: + attention_bias_shape = [self._num_layers, self._num_attention_heads, + self._head_size] + + self.content_attention_bias = self.add_weight( + "content_attention_bias", + shape=attention_bias_shape, + dtype=tf.float32, + initializer=self._initializer) + self.positional_attention_bias = self.add_weight( + "positional_attention_bias", + shape=attention_bias_shape, + dtype=tf.float32, + initializer=self._initializer) + self.segment_attention_bias = self.add_weight( + "segment_attention_bias", + shape=attention_bias_shape, + dtype=tf.float32, + initializer=self._initializer) + + self.transformer_xl_layers = [] + for i in range(self._num_layers): + self.transformer_xl_layers.append( + TransformerXLBlock( + vocab_size=self._vocab_size, + hidden_size=self._head_size * self._num_attention_heads, + num_attention_heads=self._num_attention_heads, + head_size=self._head_size, + inner_size=self._inner_size, + dropout_rate=self._dropout_rate, + attention_dropout_rate=self._attention_dropout_rate, + norm_epsilon=1e-12, + inner_activation=self._inner_activation, + two_stream=self._two_stream, + kernel_initializer="variance_scaling", + name="layer_%d" % i)) + + self.output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + + def get_config(self): + config = { + "vocab_size": + self._vocab_size, + "num_layers": + self._num_layers, + "hidden_size": + self._hidden_size, + "num_attention_heads": + self._num_attention_heads, + "head_size": + self._head_size, + "inner_size": + self._inner_size, + "dropout_rate": + self._dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "initializer": + 
self._initializer, + "two_stream": + self._two_stream, + "tie_attention_biases": + self._tie_attention_biases, + "memory_length": + self._memory_length, + "reuse_length": + self._reuse_length, + "inner_activation": + self._inner_activation, + } + base_config = super(TransformerXL, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, + content_stream, + relative_position_encoding, + segment_matrix=None, + segment_embedding=None, + state=None, + content_attention_mask=None, + query_stream=None, + query_attention_mask=None, + target_mapping=None): + """Implements call() for the layer. + + Args: + content_stream: `Tensor`, the input content stream. This is the standard + input to Transformer XL and is commonly referred to as `h` in XLNet. + relative_position_encoding: Relative positional encoding `Tensor` of shape + `[B, L, dim]`. + segment_matrix: Optional `Tensor` of shape `[B, S, S + M]`. Used in XLNet, + but not in Transformer XL. + segment_embedding: Optional `Tensor` of shape `[2, num_heads, dim]`. Used + in XLNet, but not in Transformer XL. + state: Optional `Tensor` of shape `[B, M, E]`, where M is the length of + the state or memory. If passed, this is also attended over as in + Transformer XL. + content_attention_mask: Optional `Tensor` representing the mask that is + added to content attention logits. If state is not None, the mask source + sequence dimension should extend M. + query_stream: Optional `Tensor`, the query stream. This is introduced in + `TwoStreamRelativeAttention`/XLNet pretrainer. This is ignored if + `two_stream` is `False`. + query_attention_mask: Optional `Tensor` representing the mask that is + added to query attention logits. If state is not None, the mask source + sequence dimension should extend M. + target_mapping: Optional `Tensor` representing the target mapping when + calculating query attention. + + Returns: + A tuple consisting of the attention output and the list of cached memory + states. + The attention output is `content_attention` if `two_stream` is `False`, + otherwise it is `query_attention`. 
+ """ + new_mems = [] + + if state is None: + state = [None] * self._num_layers + for i in range(self._num_layers): + # cache new mems + new_mems.append( + _cache_memory(content_stream, state[i], + self._memory_length, self._reuse_length)) + + # segment bias + if segment_matrix is None: + segment_attention_bias = None + segment_encoding = None + else: + segment_attention_bias = (self.segment_attention_bias + if self._tie_attention_biases + else self.segment_attention_bias[i]) + segment_encoding = segment_embedding[i] + + content_attention_bias = (self.content_attention_bias + if self._tie_attention_biases + else self.content_attention_bias[i]) + positional_attention_bias = (self.positional_attention_bias + if self._tie_attention_biases + else self.positional_attention_bias[i]) + transformer_xl_layer = self.transformer_xl_layers[i] + transformer_xl_output = transformer_xl_layer( + content_stream=content_stream, + content_attention_bias=content_attention_bias, + positional_attention_bias=positional_attention_bias, + relative_position_encoding=relative_position_encoding, + segment_matrix=segment_matrix, + segment_encoding=segment_encoding, + segment_attention_bias=segment_attention_bias, + state=state[i], + content_attention_mask=content_attention_mask, + query_attention_mask=query_attention_mask, + query_stream=query_stream, + target_mapping=target_mapping) + content_stream = transformer_xl_output["content_attention"] + if self._two_stream: + query_stream = transformer_xl_output["query_attention"] + else: + query_stream = None + + if self._two_stream: + output_stream = query_stream + else: + output_stream = content_stream + + return output_stream, new_mems diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/util.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/util.py new file mode 100644 index 000000000..280e2d2f6 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/layers/util.py @@ -0,0 +1,46 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based transformer block layer.""" + +import functools + +import tensorflow as tf + + +class TfFunctionIfEagerDecorator(object): + """Helper decorator function to optionally apply the @tf.function annotation.""" + + def __init__(self, **kwargs): + self.func_kwargs = kwargs + + def __call__(self, func): + + @functools.wraps(func) + def wrapped_func(*args): + # TODO(b/150147476, b/150024785): Fix tf.function in TF1 crash. + if not hasattr(tf.compat.v1, "executing_eagerly_outside_functions" + ) or tf.compat.v1.executing_eagerly_outside_functions(): + return tf.function(func=func, **self.func_kwargs)(*args) + return func(*args) + + # Cache the created function in self._call_impl. 
+ if not hasattr(self, "_call_impl"): + self._call_impl = wrapped_func + return self._call_impl + + +def tf_function_if_eager(**kwargs): + """Applies the @tf.function decorator only if running in eager mode.""" + return TfFunctionIfEagerDecorator(**kwargs) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/README.md b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/README.md new file mode 100644 index 000000000..a2607b1da --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/README.md @@ -0,0 +1,6 @@ +# Losses + +Losses contains common loss computation used in NLP tasks. + +* `weighted_sparse_categorical_crossentropy_loss` computes per-batch sparse +categorical crossentropy loss. diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/__init__.py new file mode 100644 index 000000000..ae19825e0 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/__init__.py @@ -0,0 +1,16 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Losses contains common loss computation used in NLP (subject to change).""" +from nlp_modeling.losses.weighted_sparse_categorical_crossentropy import loss as weighted_sparse_categorical_crossentropy_loss diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/weighted_sparse_categorical_crossentropy.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/weighted_sparse_categorical_crossentropy.py new file mode 100644 index 000000000..d777800c6 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/losses/weighted_sparse_categorical_crossentropy.py @@ -0,0 +1,71 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Weighted sparse categorical cross-entropy losses.""" + +import tensorflow as tf + + +def _adjust_labels(labels, predictions): + """Adjust the 'labels' tensor by squeezing it if needed.""" + labels = tf.cast(labels, tf.int32) + if len(predictions.shape) == len(labels.shape): + labels = tf.squeeze(labels, [-1]) + return labels, predictions + + +def _validate_rank(labels, predictions, weights): + if weights is not None and len(weights.shape) != len(labels.shape): + raise RuntimeError( + ("Weight and label tensors were not of the same rank. 
weights.shape " + "was %s, and labels.shape was %s.") % + (predictions.shape, labels.shape)) + if (len(predictions.shape) - 1) != len(labels.shape): + raise RuntimeError( + ("Weighted sparse categorical crossentropy expects `labels` to have a " + "rank of one less than `predictions`. labels.shape was %s, and " + "predictions.shape was %s.") % (labels.shape, predictions.shape)) + + +def loss(labels, predictions, weights=None, from_logits=False): + """Calculate a per-batch sparse categorical crossentropy loss. + + This loss function assumes that the predictions are post-softmax. + Args: + labels: The labels to evaluate against. Should be a set of integer indices + ranging from 0 to (vocab_size-1). + predictions: The network predictions. Should have softmax already applied. + weights: An optional weight array of the same shape as the 'labels' array. + If None, all examples will be used. + from_logits: Whether the input predictions are logits. + + Returns: + A loss scalar. + + Raises: + RuntimeError if the passed tensors do not have the same rank. + """ + # When using these functions with the Keras core API, we will need to squeeze + # the labels tensor - Keras adds a spurious inner dimension. + labels, predictions = _adjust_labels(labels, predictions) + _validate_rank(labels, predictions, weights) + + example_losses = tf.keras.losses.sparse_categorical_crossentropy( + labels, predictions, from_logits=from_logits) + + if weights is None: + return tf.reduce_mean(example_losses) + weights = tf.cast(weights, predictions.dtype) + return tf.math.divide_no_nan( + tf.reduce_sum(example_losses * weights), tf.reduce_sum(weights)) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/README.md b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/README.md new file mode 100644 index 000000000..22fd8193c --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/README.md @@ -0,0 +1,25 @@ +# Models + +Models are combinations of `tf.keras` layers and models that can be trained. + +Several pre-built canned models are provided to train encoder networks. +These models are intended as both convenience functions and canonical examples. + +* [`BertClassifier`](bert_classifier.py) implements a simple classification +model containing a single classification head using the Classification network. +It can be used as a regression model as well. + +* [`BertTokenClassifier`](bert_token_classifier.py) implements a simple token +classification model containing a single classification head over the sequence +output embeddings. + +* [`BertSpanLabeler`](bert_span_labeler.py) implementats a simple single-span +start-end predictor (that is, a model that predicts two values: a start token +index and an end token index), suitable for SQuAD-style tasks. + +* [`BertPretrainer`](bert_pretrainer.py) implements a masked LM and a +classification head using the Masked LM and Classification networks, +respectively. + +* [`DualEncoder`](dual_encoder.py) implements a dual encoder model, suitbale for +retrieval tasks. diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/__init__.py new file mode 100644 index 000000000..a7e896c43 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Models are combinations of `tf.keras` layers and models that can be trained. + +Several pre-built canned models are provided to train encoder networks. +These models are intended as both convenience functions and canonical examples. +""" +from nlp_modeling.models.bert_classifier import BertClassifier +from nlp_modeling.models.bert_pretrainer import * +from nlp_modeling.models.bert_span_labeler import BertSpanLabeler +from nlp_modeling.models.bert_token_classifier import BertTokenClassifier +from nlp_modeling.models.dual_encoder import DualEncoder +from nlp_modeling.models.electra_pretrainer import ElectraPretrainer +from nlp_modeling.models.seq2seq_transformer import * +from nlp_modeling.models.xlnet import XLNetClassifier +from nlp_modeling.models.xlnet import XLNetPretrainer +from nlp_modeling.models.xlnet import XLNetSpanLabeler diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_classifier.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_classifier.py new file mode 100644 index 000000000..7c11bf286 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_classifier.py @@ -0,0 +1,143 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT cls-token classifier.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + +from nlp_modeling import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class BertClassifier(tf.keras.Model): + """Classifier model based on a BERT-style transformer-based encoder. + + This is an implementation of the network structure surrounding a transformer + encoder as described in "BERT: Pre-training of Deep Bidirectional Transformers + for Language Understanding" (https://arxiv.org/abs/1810.04805). + + The BertClassifier allows a user to pass in a transformer stack, and + instantiates a classification network based on the passed `num_classes` + argument. If `num_classes` is set to 1, a regression network is instantiated. + + *Note* that the model is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + network: A transformer network. This network should output a sequence output + and a classification output. Furthermore, it should expose its embedding + table via a "get_embedding_table" method. + num_classes: Number of classes to predict from the classification network. 
+ initializer: The initializer (if any) to use in the classification networks. + Defaults to a Glorot uniform initializer. + dropout_rate: The dropout probability of the cls head. + use_encoder_pooler: Whether to use the pooler layer pre-defined inside the + encoder. + cls_head: (Optional) The layer instance to use for the classifier head. + It should take in the output from network and produce the final logits. + If set, the arguments ('num_classes', 'initializer', 'dropout_rate', + 'use_encoder_pooler') will be ignored. + """ + + def __init__(self, + network, + num_classes, + initializer='glorot_uniform', + dropout_rate=0.1, + use_encoder_pooler=True, + cls_head=None, + **kwargs): + self.num_classes = num_classes + self.initializer = initializer + self.use_encoder_pooler = use_encoder_pooler + + # We want to use the inputs of the passed network as the inputs to this + # Model. To do this, we need to keep a handle to the network inputs for use + # when we construct the Model object at the end of init. + inputs = network.inputs + + if use_encoder_pooler: + # Because we have a copy of inputs to create this Model object, we can + # invoke the Network object with its own input tensors to start the Model. + outputs = network(inputs) + if isinstance(outputs, list): + cls_inputs = outputs[1] + else: + cls_inputs = outputs['pooled_output'] + cls_inputs = tf.keras.layers.Dropout(rate=dropout_rate)(cls_inputs) + else: + outputs = network(inputs) + if isinstance(outputs, list): + cls_inputs = outputs[0] + else: + cls_inputs = outputs['sequence_output'] + + if cls_head: + classifier = cls_head + else: + classifier = layers.ClassificationHead( + inner_dim=0 if use_encoder_pooler else cls_inputs.shape[-1], + num_classes=num_classes, + initializer=initializer, + dropout_rate=dropout_rate, + name='sentence_prediction') + + predictions = classifier(cls_inputs) + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(BertClassifier, self).__init__( + inputs=inputs, outputs=predictions, **kwargs) + self._network = network + self._cls_head = cls_head + + config_dict = self._make_config_dict() + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. 
+ config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self.classifier = classifier + + @property + def checkpoint_items(self): + items = dict(encoder=self._network) + if hasattr(self.classifier, 'checkpoint_items'): + for key, item in self.classifier.checkpoint_items.items(): + items['.'.join([self.classifier.name, key])] = item + return items + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + def _make_config_dict(self): + return { + 'network': self._network, + 'num_classes': self.num_classes, + 'initializer': self.initializer, + 'use_encoder_pooler': self.use_encoder_pooler, + 'cls_head': self._cls_head, + } diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_pretrainer.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_pretrainer.py new file mode 100644 index 000000000..61185d92a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_pretrainer.py @@ -0,0 +1,274 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT Pre-training model.""" +# pylint: disable=g-classes-have-attributes +import collections +import copy +from typing import List, Optional + +from absl import logging +import gin +import tensorflow as tf + +from nlp_modeling import layers +from nlp_modeling import networks + + +@tf.keras.utils.register_keras_serializable(package='Text') +class BertPretrainer(tf.keras.Model): + """BERT pretraining model. + + [Note] Please use the new `BertPretrainerV2` for your projects. + + The BertPretrainer allows a user to pass in a transformer stack, and + instantiates the masked language model and classification networks that are + used to create the training objectives. + + *Note* that the model is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + network: A transformer network. This network should output a sequence output + and a classification output. + num_classes: Number of classes to predict from the classification network. + num_token_predictions: Number of tokens to predict from the masked LM. + embedding_table: Embedding table of a network. If None, the + "network.get_embedding_table()" is used. + activation: The activation (if any) to use in the masked LM network. If + None, no activation will be used. + initializer: The initializer (if any) to use in the masked LM and + classification networks. Defaults to a Glorot uniform initializer. + output: The output style for this network. Can be either `logits` or + `predictions`. + """ + + def __init__(self, + network, + num_classes, + num_token_predictions, + embedding_table=None, + activation=None, + initializer='glorot_uniform', + output='logits', + **kwargs): + + # We want to use the inputs of the passed network as the inputs to this + # Model. 
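A usage sketch of `BertClassifier`. It assumes the package also ships a BERT-style encoder under `nlp_modeling.networks` (here called `BertEncoder`, as in the upstream TF Model Garden); the encoder class name, its constructor arguments, and the list-style input format are assumptions for illustration.

```python
import tensorflow as tf

# Encoder class name and input ordering are assumptions; see lead-in above.
from nlp_modeling import models, networks

encoder = networks.BertEncoder(
    vocab_size=30522, num_layers=2, hidden_size=128, num_attention_heads=2)

classifier = models.BertClassifier(
    network=encoder,
    num_classes=3,            # num_classes=1 would give a regression head
    dropout_rate=0.1,
    use_encoder_pooler=True)

batch_size, seq_len = 2, 16
word_ids = tf.zeros([batch_size, seq_len], tf.int32)
mask = tf.ones([batch_size, seq_len], tf.int32)
type_ids = tf.zeros([batch_size, seq_len], tf.int32)

logits = classifier([word_ids, mask, type_ids])  # [batch_size, num_classes]
```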
To do this, we need to keep a copy of the network inputs for use + # when we construct the Model object at the end of init. (We keep a copy + # because we'll be adding another tensor to the copy later.) + network_inputs = network.inputs + inputs = copy.copy(network_inputs) + + # Because we have a copy of inputs to create this Model object, we can + # invoke the Network object with its own input tensors to start the Model. + # Note that, because of how deferred construction happens, we can't use + # the copy of the list here - by the time the network is invoked, the list + # object contains the additional input added below. + sequence_output, cls_output = network(network_inputs) + + # The encoder network may get outputs from all layers. + if isinstance(sequence_output, list): + sequence_output = sequence_output[-1] + if isinstance(cls_output, list): + cls_output = cls_output[-1] + sequence_output_length = sequence_output.shape.as_list()[1] + if sequence_output_length is not None and (sequence_output_length < + num_token_predictions): + raise ValueError( + "The passed network's output length is %s, which is less than the " + 'requested num_token_predictions %s.' % + (sequence_output_length, num_token_predictions)) + + masked_lm_positions = tf.keras.layers.Input( + shape=(num_token_predictions,), + name='masked_lm_positions', + dtype=tf.int32) + inputs.append(masked_lm_positions) + + if embedding_table is None: + embedding_table = network.get_embedding_table() + masked_lm = layers.MaskedLM( + embedding_table=embedding_table, + activation=activation, + initializer=initializer, + output=output, + name='cls/predictions') + lm_outputs = masked_lm( + sequence_output, masked_positions=masked_lm_positions) + + classification = networks.Classification( + input_width=cls_output.shape[-1], + num_classes=num_classes, + initializer=initializer, + output=output, + name='classification') + sentence_outputs = classification(cls_output) + + super(BertPretrainer, self).__init__( + inputs=inputs, + outputs=dict(masked_lm=lm_outputs, classification=sentence_outputs), + **kwargs) + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + config_dict = { + 'network': network, + 'num_classes': num_classes, + 'num_token_predictions': num_token_predictions, + 'activation': activation, + 'initializer': initializer, + 'output': output, + } + + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. 
+ config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + + self.encoder = network + self.classification = classification + self.masked_lm = masked_lm + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + +@tf.keras.utils.register_keras_serializable(package='Text') +@gin.configurable +class BertPretrainerV2(tf.keras.Model): + """BERT pretraining model V2. + + Adds the masked language model head and optional classification heads upon the + transformer encoder. + + Args: + encoder_network: A transformer network. This network should output a + sequence output and a classification output. + mlm_activation: The activation (if any) to use in the masked LM network. If + None, no activation will be used. + mlm_initializer: The initializer (if any) to use in the masked LM. Default + to a Glorot uniform initializer. + classification_heads: A list of optional head layers to transform on encoder + sequence outputs. + customized_masked_lm: A customized masked_lm layer. If None, will create + a standard layer from `layers.MaskedLM`; if not None, will use the + specified masked_lm layer. Above arguments `mlm_activation` and + `mlm_initializer` will be ignored. + name: The name of the model. + Inputs: Inputs defined by the encoder network, plus `masked_lm_positions` as a + dictionary. + Outputs: A dictionary of `lm_output`, classification head outputs keyed by + head names, and also outputs from `encoder_network`, keyed by + `sequence_output` and `encoder_outputs` (if any). + """ + + def __init__( + self, + encoder_network: tf.keras.Model, + mlm_activation=None, + mlm_initializer='glorot_uniform', + classification_heads: Optional[List[tf.keras.layers.Layer]] = None, + customized_masked_lm: Optional[tf.keras.layers.Layer] = None, + name: str = 'bert', + **kwargs): + super().__init__(self, name=name, **kwargs) + self._config = { + 'encoder_network': encoder_network, + 'mlm_initializer': mlm_initializer, + 'classification_heads': classification_heads, + 'name': name, + } + self.encoder_network = encoder_network + inputs = copy.copy(self.encoder_network.inputs) + self.classification_heads = classification_heads or [] + if len(set([cls.name for cls in self.classification_heads])) != len( + self.classification_heads): + raise ValueError('Classification heads should have unique names.') + + self.masked_lm = customized_masked_lm or layers.MaskedLM( + embedding_table=self.encoder_network.get_embedding_table(), + activation=mlm_activation, + initializer=mlm_initializer, + name='cls/predictions') + masked_lm_positions = tf.keras.layers.Input( + shape=(None,), name='masked_lm_positions', dtype=tf.int32) + inputs.append(masked_lm_positions) + self.inputs = inputs + + def call(self, inputs): + if isinstance(inputs, list): + logging.warning('List inputs to BertPretrainer are discouraged.') + inputs = dict([ + (ref.name, tensor) for ref, tensor in zip(self.inputs, inputs) + ]) + + outputs = dict() + encoder_network_outputs = self.encoder_network(inputs) + if isinstance(encoder_network_outputs, list): + outputs['pooled_output'] = encoder_network_outputs[1] + # When `encoder_network` was instantiated with return_all_encoder_outputs + # set to True, `encoder_network_outputs[0]` is a list containing + # all transformer layers' output. 
+ if isinstance(encoder_network_outputs[0], list): + outputs['encoder_outputs'] = encoder_network_outputs[0] + outputs['sequence_output'] = encoder_network_outputs[0][-1] + else: + outputs['sequence_output'] = encoder_network_outputs[0] + elif isinstance(encoder_network_outputs, dict): + outputs = encoder_network_outputs + else: + raise ValueError('encoder_network\'s output should be either a list ' + 'or a dict, but got %s' % encoder_network_outputs) + sequence_output = outputs['sequence_output'] + # Inference may not have masked_lm_positions and mlm_logits is not needed. + if 'masked_lm_positions' in inputs: + masked_lm_positions = inputs['masked_lm_positions'] + outputs['mlm_logits'] = self.masked_lm( + sequence_output, masked_positions=masked_lm_positions) + for cls_head in self.classification_heads: + cls_outputs = cls_head(sequence_output) + if isinstance(cls_outputs, dict): + outputs.update(cls_outputs) + else: + outputs[cls_head.name] = cls_outputs + return outputs + + @property + def checkpoint_items(self): + """Returns a dictionary of items to be additionally checkpointed.""" + items = dict(encoder=self.encoder_network, masked_lm=self.masked_lm) + for head in self.classification_heads: + for key, item in head.checkpoint_items.items(): + items['.'.join([head.name, key])] = item + return items + + def get_config(self): + return self._config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_span_labeler.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_span_labeler.py new file mode 100644 index 000000000..9a474a95b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_span_labeler.py @@ -0,0 +1,125 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT Question Answering model.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + +from nlp_modeling import networks + + +@tf.keras.utils.register_keras_serializable(package='Text') +class BertSpanLabeler(tf.keras.Model): + """Span labeler model based on a BERT-style transformer-based encoder. + + This is an implementation of the network structure surrounding a transformer + encoder as described in "BERT: Pre-training of Deep Bidirectional Transformers + for Language Understanding" (https://arxiv.org/abs/1810.04805). + + The BertSpanLabeler allows a user to pass in a transformer encoder, and + instantiates a span labeling network based on a single dense layer. + + *Note* that the model is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + network: A transformer network. This network should output a sequence output + and a classification output. Furthermore, it should expose its embedding + table via a `get_embedding_table` method. + initializer: The initializer (if any) to use in the span labeling network. 
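A usage sketch of `BertPretrainerV2` with one optional classification head. As above, the encoder class and the exact input-dict keys are assumed to mirror the upstream TF Model Garden; the head name determines its key in the output dict.

```python
import tensorflow as tf

# Import paths assumed from this package layout.
from nlp_modeling import layers, models, networks

encoder = networks.BertEncoder(
    vocab_size=30522, num_layers=2, hidden_size=128, num_attention_heads=2)

pretrainer = models.BertPretrainerV2(
    encoder_network=encoder,
    classification_heads=[
        layers.ClassificationHead(inner_dim=128, num_classes=2,
                                  name='next_sentence')
    ])

batch_size, seq_len, num_masked = 2, 16, 4
outputs = pretrainer({
    'input_word_ids': tf.zeros([batch_size, seq_len], tf.int32),
    'input_mask': tf.ones([batch_size, seq_len], tf.int32),
    'input_type_ids': tf.zeros([batch_size, seq_len], tf.int32),
    'masked_lm_positions': tf.zeros([batch_size, num_masked], tf.int32),
})
# outputs['mlm_logits']      -> [batch_size, num_masked, vocab_size]
# outputs['next_sentence']   -> [batch_size, 2] (keyed by the head's name)
# outputs['sequence_output'] -> [batch_size, seq_len, hidden_size]
```

If `masked_lm_positions` is omitted at inference time, the masked-LM logits are simply not computed, per the `call()` implementation above.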
+ Defaults to a Glorot uniform initializer. + output: The output style for this network. Can be either `logit`' or + `predictions`. + """ + + def __init__(self, + network, + initializer='glorot_uniform', + output='logits', + **kwargs): + + # We want to use the inputs of the passed network as the inputs to this + # Model. To do this, we need to keep a handle to the network inputs for use + # when we construct the Model object at the end of init. + inputs = network.inputs + + # Because we have a copy of inputs to create this Model object, we can + # invoke the Network object with its own input tensors to start the Model. + outputs = network(inputs) + if isinstance(outputs, list): + sequence_output = outputs[0] + else: + sequence_output = outputs['sequence_output'] + + # The input network (typically a transformer model) may get outputs from all + # layers. When this case happens, we retrieve the last layer output. + if isinstance(sequence_output, list): + sequence_output = sequence_output[-1] + + # This is an instance variable for ease of access to the underlying task + # network. + span_labeling = networks.SpanLabeling( + input_width=sequence_output.shape[-1], + initializer=initializer, + output=output, + name='span_labeling') + start_logits, end_logits = span_labeling(sequence_output) + + # Use identity layers wrapped in lambdas to explicitly name the output + # tensors. This allows us to use string-keyed dicts in Keras fit/predict/ + # evaluate calls. + start_logits = tf.keras.layers.Lambda( + tf.identity, name='start_positions')( + start_logits) + end_logits = tf.keras.layers.Lambda( + tf.identity, name='end_positions')( + end_logits) + + logits = [start_logits, end_logits] + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(BertSpanLabeler, self).__init__( + inputs=inputs, outputs=logits, **kwargs) + self._network = network + config_dict = { + 'network': network, + 'initializer': initializer, + 'output': output, + } + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self.span_labeling = span_labeling + + @property + def checkpoint_items(self): + return dict(encoder=self._network) + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_token_classifier.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_token_classifier.py new file mode 100644 index 000000000..340d92fd6 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/bert_token_classifier.py @@ -0,0 +1,133 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
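A usage sketch of `BertSpanLabeler` for a SQuAD-style task; the encoder construction is an assumption as in the earlier sketches.

```python
import tensorflow as tf

# Encoder class name assumed from the upstream TF Model Garden.
from nlp_modeling import models, networks

encoder = networks.BertEncoder(
    vocab_size=30522, num_layers=2, hidden_size=128, num_attention_heads=2)
span_labeler = models.BertSpanLabeler(network=encoder)

batch_size, seq_len = 2, 16
word_ids = tf.zeros([batch_size, seq_len], tf.int32)
mask = tf.ones([batch_size, seq_len], tf.int32)
type_ids = tf.zeros([batch_size, seq_len], tf.int32)

start_logits, end_logits = span_labeler([word_ids, mask, type_ids])
# Both tensors are [batch_size, seq_len]. The Lambda-wrapped identity layers
# name them 'start_positions' and 'end_positions', so a dict-keyed loss in
# Keras fit/evaluate (e.g. sparse categorical crossentropy against the true
# start/end indices) can be attached directly.
```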
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BERT token classifier.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +class BertTokenClassifier(tf.keras.Model): + """Token classifier model based on a BERT-style transformer-based encoder. + + This is an implementation of the network structure surrounding a transformer + encoder as described in "BERT: Pre-training of Deep Bidirectional Transformers + for Language Understanding" (https://arxiv.org/abs/1810.04805). + + The BertTokenClassifier allows a user to pass in a transformer stack, and + instantiates a token classification network based on the passed `num_classes` + argument. + + *Note* that the model is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + network: A transformer network. This network should output a sequence output + and a classification output. Furthermore, it should expose its embedding + table via a `get_embedding_table` method. + num_classes: Number of classes to predict from the classification network. + initializer: The initializer (if any) to use in the classification networks. + Defaults to a Glorot uniform initializer. + output: The output style for this network. Can be either `logits` or + `predictions`. + dropout_rate: The dropout probability of the token classification head. + output_encoder_outputs: Whether to include intermediate sequence output + in the final output. + """ + + def __init__(self, + network, + num_classes, + initializer='glorot_uniform', + output='logits', + dropout_rate=0.1, + output_encoder_outputs=False, + **kwargs): + + # We want to use the inputs of the passed network as the inputs to this + # Model. To do this, we need to keep a handle to the network inputs for use + # when we construct the Model object at the end of init. + inputs = network.inputs + + # Because we have a copy of inputs to create this Model object, we can + # invoke the Network object with its own input tensors to start the Model. + outputs = network(inputs) + if isinstance(outputs, list): + sequence_output = outputs[0] + else: + sequence_output = outputs['sequence_output'] + sequence_output = tf.keras.layers.Dropout(rate=dropout_rate)( + sequence_output) + + classifier = tf.keras.layers.Dense( + num_classes, + activation=None, + kernel_initializer=initializer, + name='predictions/transform/logits') + logits = classifier(sequence_output) + if output == 'logits': + output_tensors = {'logits': logits} + elif output == 'predictions': + output_tensors = { + 'predictions': tf.keras.layers.Activation(tf.nn.log_softmax)(logits) + } + else: + raise ValueError( + ('Unknown `output` value "%s". 
`output` can be either "logits" or ' + '"predictions"') % output) + + if output_encoder_outputs: + output_tensors['encoder_outputs'] = sequence_output + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(BertTokenClassifier, self).__init__( + inputs=inputs, outputs=output_tensors, **kwargs) + + self._network = network + config_dict = { + 'network': network, + 'num_classes': num_classes, + 'initializer': initializer, + 'output': output, + 'output_encoder_outputs': output_encoder_outputs + } + + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + + self.classifier = classifier + self.logits = logits + + @property + def checkpoint_items(self): + return dict(encoder=self._network) + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/dual_encoder.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/dual_encoder.py new file mode 100644 index 000000000..31382ce6e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/dual_encoder.py @@ -0,0 +1,162 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Trainer network for dual encoder style models.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + +from nlp_modeling import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class DualEncoder(tf.keras.Model): + """A dual encoder model based on a transformer-based encoder. + + This is an implementation of the dual encoder network structure based on the + transfomer stack, as described in ["Language-agnostic BERT Sentence + Embedding"](https://arxiv.org/abs/2007.01852) + + The DualEncoder allows a user to pass in a transformer stack, and build a dual + encoder model based on the transformer stack. + + Args: + network: A transformer network which should output an encoding output. + max_seq_length: The maximum allowed sequence length for transformer. + normalize: If set to True, normalize the encoding produced by transfomer. + logit_scale: The scaling factor of dot products when doing training. 
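A usage sketch of `BertTokenClassifier`, for example as an NER-style tagger producing one prediction per token; the encoder is assumed as in the previous sketches.

```python
import tensorflow as tf

# Encoder class name assumed from the upstream TF Model Garden.
from nlp_modeling import models, networks

encoder = networks.BertEncoder(
    vocab_size=30522, num_layers=2, hidden_size=128, num_attention_heads=2)

# e.g. 9 tag classes for a BIO-style labeling scheme (an illustrative choice).
tagger = models.BertTokenClassifier(
    network=encoder, num_classes=9, output='logits', dropout_rate=0.1)

batch_size, seq_len = 2, 16
outputs = tagger([
    tf.zeros([batch_size, seq_len], tf.int32),  # input_word_ids
    tf.ones([batch_size, seq_len], tf.int32),   # input_mask
    tf.zeros([batch_size, seq_len], tf.int32),  # input_type_ids
])
logits = outputs['logits']  # [batch_size, seq_len, num_classes]
```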
+ logit_margin: The margin between positive and negative when doing training. + output: The output style for this network. Can be either `logits` or + `predictions`. If set to `predictions`, it will output the embedding + producted by transformer network. + """ + + def __init__(self, + network: tf.keras.Model, + max_seq_length: int = 32, + normalize: bool = True, + logit_scale: float = 1.0, + logit_margin: float = 0.0, + output: str = 'logits', + **kwargs) -> None: + + if output == 'logits': + left_word_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='left_word_ids') + left_mask = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='left_mask') + left_type_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='left_type_ids') + else: + # Keep the consistant with legacy BERT hub module input names. + left_word_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_word_ids') + left_mask = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_mask') + left_type_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='input_type_ids') + + left_inputs = [left_word_ids, left_mask, left_type_ids] + left_outputs = network(left_inputs) + if isinstance(left_outputs, list): + left_sequence_output, left_encoded = left_outputs + else: + left_sequence_output = left_outputs['sequence_output'] + left_encoded = left_outputs['pooled_output'] + if normalize: + left_encoded = tf.keras.layers.Lambda( + lambda x: tf.nn.l2_normalize(x, axis=1))( + left_encoded) + + if output == 'logits': + right_word_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='right_word_ids') + right_mask = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='right_mask') + right_type_ids = tf.keras.layers.Input( + shape=(max_seq_length,), dtype=tf.int32, name='right_type_ids') + + right_inputs = [right_word_ids, right_mask, right_type_ids] + right_outputs = network(right_inputs) + if isinstance(right_outputs, list): + _, right_encoded = right_outputs + else: + right_encoded = right_outputs['pooled_output'] + if normalize: + right_encoded = tf.keras.layers.Lambda( + lambda x: tf.nn.l2_normalize(x, axis=1))( + right_encoded) + + dot_products = layers.MatMulWithMargin( + logit_scale=logit_scale, + logit_margin=logit_margin, + name='dot_product') + + inputs = [ + left_word_ids, left_mask, left_type_ids, right_word_ids, right_mask, + right_type_ids + ] + left_logits, right_logits = dot_products(left_encoded, right_encoded) + + outputs = dict(left_logits=left_logits, right_logits=right_logits) + + elif output == 'predictions': + inputs = [left_word_ids, left_mask, left_type_ids] + + # To keep consistent with legacy BERT hub modules, the outputs are + # "pooled_output" and "sequence_output". + outputs = dict( + sequence_output=left_sequence_output, pooled_output=left_encoded) + else: + raise ValueError('output type %s is not supported' % output) + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. 
+ super(DualEncoder, self).__init__(inputs=inputs, outputs=outputs, **kwargs) + + config_dict = { + 'network': network, + 'max_seq_length': max_seq_length, + 'normalize': normalize, + 'logit_scale': logit_scale, + 'logit_margin': logit_margin, + 'output': output, + } + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + + self.network = network + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + @property + def checkpoint_items(self): + """Returns a dictionary of items to be additionally checkpointed.""" + items = dict(encoder=self.network) + return items diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/electra_pretrainer.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/electra_pretrainer.py new file mode 100644 index 000000000..41a2366d6 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/electra_pretrainer.py @@ -0,0 +1,333 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Trainer network for ELECTRA models.""" +# pylint: disable=g-classes-have-attributes + +import copy + +import tensorflow as tf + +from modeling import tf_utils +from nlp_modeling import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class ElectraPretrainer(tf.keras.Model): + """ELECTRA network training model. + + This is an implementation of the network structure described in "ELECTRA: + Pre-training Text Encoders as Discriminators Rather Than Generators" ( + https://arxiv.org/abs/2003.10555). + + The ElectraPretrainer allows a user to pass in two transformer models, one for + generator, the other for discriminator, and instantiates the masked language + model (at generator side) and classification networks (at discriminator side) + that are used to create the training objectives. + + *Note* that the model is constructed by Keras Subclass API, where layers are + defined inside `__init__` and `call()` implements the computation. + + Args: + generator_network: A transformer network for generator, this network should + output a sequence output and an optional classification output. + discriminator_network: A transformer network for discriminator, this network + should output a sequence output + vocab_size: Size of generator output vocabulary + num_classes: Number of classes to predict from the classification network + for the generator network (not used now) + num_token_predictions: Number of tokens to predict from the masked LM. 
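A usage sketch of `DualEncoder` in its training configuration (`output='logits'`), where left and right towers share the same encoder and the batch provides in-batch negatives; the encoder class and output shapes are assumptions in the sense described above.

```python
import tensorflow as tf

# Encoder class name assumed from the upstream TF Model Garden.
from nlp_modeling import models, networks

max_seq_length = 32
encoder = networks.BertEncoder(
    vocab_size=30522, num_layers=2, hidden_size=128, num_attention_heads=2)

dual_encoder = models.DualEncoder(
    network=encoder,
    max_seq_length=max_seq_length,
    normalize=True,        # L2-normalize the pooled encodings
    logit_scale=5.0,       # scales the dot products during training
    logit_margin=0.0,
    output='logits')

batch_size = 4
zeros = tf.zeros([batch_size, max_seq_length], tf.int32)
ones = tf.ones([batch_size, max_seq_length], tf.int32)
outputs = dual_encoder([zeros, ones, zeros,    # left word ids / mask / type ids
                        zeros, ones, zeros])   # right word ids / mask / type ids
# outputs['left_logits'] and outputs['right_logits'] are batch-to-batch
# similarity scores; a softmax ranking loss over each row treats the matching
# pair as the positive and the rest of the batch as negatives.
```

With `output='predictions'` the model instead takes a single tower's inputs and returns `pooled_output`/`sequence_output`, matching the legacy BERT hub module layout noted in the code.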
+ mlm_activation: The activation (if any) to use in the masked LM and + classification networks. If None, no activation will be used. + mlm_initializer: The initializer (if any) to use in the masked LM and + classification networks. Defaults to a Glorot uniform initializer. + output_type: The output style for this network. Can be either `logits` or + `predictions`. + disallow_correct: Whether to disallow the generator to generate the exact + same token in the original sentence + """ + + def __init__(self, + generator_network, + discriminator_network, + vocab_size, + num_classes, + num_token_predictions, + mlm_activation=None, + mlm_initializer='glorot_uniform', + output_type='logits', + disallow_correct=False, + **kwargs): + super(ElectraPretrainer, self).__init__() + self._config = { + 'generator_network': generator_network, + 'discriminator_network': discriminator_network, + 'vocab_size': vocab_size, + 'num_classes': num_classes, + 'num_token_predictions': num_token_predictions, + 'mlm_activation': mlm_activation, + 'mlm_initializer': mlm_initializer, + 'output_type': output_type, + 'disallow_correct': disallow_correct, + } + for k, v in kwargs.items(): + self._config[k] = v + + self.generator_network = generator_network + self.discriminator_network = discriminator_network + self.vocab_size = vocab_size + self.num_classes = num_classes + self.num_token_predictions = num_token_predictions + self.mlm_activation = mlm_activation + self.mlm_initializer = mlm_initializer + self.output_type = output_type + self.disallow_correct = disallow_correct + self.masked_lm = layers.MaskedLM( + embedding_table=generator_network.get_embedding_table(), + activation=mlm_activation, + initializer=mlm_initializer, + output=output_type, + name='generator_masked_lm') + self.classification = layers.ClassificationHead( + inner_dim=generator_network.get_config()['hidden_size'], + num_classes=num_classes, + initializer=mlm_initializer, + name='generator_classification_head') + self.discriminator_projection = tf.keras.layers.Dense( + units=discriminator_network.get_config()['hidden_size'], + activation=mlm_activation, + kernel_initializer=mlm_initializer, + name='discriminator_projection_head') + self.discriminator_head = tf.keras.layers.Dense( + units=1, kernel_initializer=mlm_initializer) + + def call(self, inputs): + """ELECTRA forward pass. + + Args: + inputs: A dict of all inputs, same as the standard BERT model. + + Returns: + outputs: A dict of pretrainer model outputs, including + (1) lm_outputs: A `[batch_size, num_token_predictions, vocab_size]` + tensor indicating logits on masked positions. + (2) sentence_outputs: A `[batch_size, num_classes]` tensor indicating + logits for nsp task. + (3) disc_logits: A `[batch_size, sequence_length]` tensor indicating + logits for discriminator replaced token detection task. + (4) disc_label: A `[batch_size, sequence_length]` tensor indicating + target labels for discriminator replaced token detection task. + """ + input_word_ids = inputs['input_word_ids'] + input_mask = inputs['input_mask'] + input_type_ids = inputs['input_type_ids'] + masked_lm_positions = inputs['masked_lm_positions'] + + ### Generator ### + sequence_output = self.generator_network( + [input_word_ids, input_mask, input_type_ids])['sequence_output'] + # The generator encoder network may get outputs from all layers. 
+ if isinstance(sequence_output, list): + sequence_output = sequence_output[-1] + + lm_outputs = self.masked_lm(sequence_output, masked_lm_positions) + sentence_outputs = self.classification(sequence_output) + + ### Sampling from generator ### + fake_data = self._get_fake_data(inputs, lm_outputs, duplicate=True) + + ### Discriminator ### + disc_input = fake_data['inputs'] + disc_label = fake_data['is_fake_tokens'] + disc_sequence_output = self.discriminator_network([ + disc_input['input_word_ids'], disc_input['input_mask'], + disc_input['input_type_ids'] + ])['sequence_output'] + + # The discriminator encoder network may get outputs from all layers. + if isinstance(disc_sequence_output, list): + disc_sequence_output = disc_sequence_output[-1] + + disc_logits = self.discriminator_head( + self.discriminator_projection(disc_sequence_output)) + disc_logits = tf.squeeze(disc_logits, axis=-1) + + outputs = { + 'lm_outputs': lm_outputs, + 'sentence_outputs': sentence_outputs, + 'disc_logits': disc_logits, + 'disc_label': disc_label, + } + + return outputs + + def _get_fake_data(self, inputs, mlm_logits, duplicate=True): + """Generate corrupted data for discriminator. + + Args: + inputs: A dict of all inputs, same as the input of `call()` function + mlm_logits: The generator's output logits + duplicate: Whether to copy the original inputs dict during modifications + + Returns: + A dict of generated fake data + """ + inputs = unmask(inputs, duplicate) + + if self.disallow_correct: + disallow = tf.one_hot( + inputs['masked_lm_ids'], depth=self.vocab_size, dtype=tf.float32) + else: + disallow = None + + sampled_tokens = tf.stop_gradient( + sample_from_softmax(mlm_logits, disallow=disallow)) + sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32) + updated_input_ids, masked = scatter_update(inputs['input_word_ids'], + sampled_tokids, + inputs['masked_lm_positions']) + labels = masked * (1 - tf.cast( + tf.equal(updated_input_ids, inputs['input_word_ids']), tf.int32)) + + updated_inputs = get_updated_inputs( + inputs, duplicate, input_word_ids=updated_input_ids) + + return { + 'inputs': updated_inputs, + 'is_fake_tokens': labels, + 'sampled_tokens': sampled_tokens + } + + @property + def checkpoint_items(self): + """Returns a dictionary of items to be additionally checkpointed.""" + items = dict(encoder=self.discriminator_network) + return items + + def get_config(self): + return self._config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + +def scatter_update(sequence, updates, positions): + """Scatter-update a sequence. + + Args: + sequence: A `[batch_size, seq_len]` or `[batch_size, seq_len, depth]` + tensor. + updates: A tensor of size `batch_size*seq_len(*depth)`. + positions: A `[batch_size, n_positions]` tensor. + + Returns: + updated_sequence: A `[batch_size, seq_len]` or + `[batch_size, seq_len, depth]` tensor of "sequence" with elements at + "positions" replaced by the values at "updates". Updates to index 0 are + ignored. If there are duplicated positions the update is only + applied once. + updates_mask: A `[batch_size, seq_len]` mask tensor of which inputs were + updated. 
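The four outputs returned by `ElectraPretrainer.call()` above are typically combined into a single pretraining objective: a masked-LM loss on `lm_outputs` plus a weighted replaced-token-detection loss on `disc_logits` versus `disc_label`. The sketch below shows one plausible combination under the default `output_type='logits'`; the helper name and the 50x discriminator weight (from the ELECTRA paper) are assumptions, not something this module prescribes.

```python
import tensorflow as tf


def electra_pretrain_loss(outputs, masked_lm_ids, masked_lm_weights,
                          input_mask, disc_loss_weight=50.0):
  """Sketch of combining ElectraPretrainer outputs into a training loss."""
  # Generator masked-LM loss, averaged over the (weighted) masked positions.
  mlm_loss = tf.keras.losses.sparse_categorical_crossentropy(
      masked_lm_ids, outputs['lm_outputs'], from_logits=True)
  masked_lm_weights = tf.cast(masked_lm_weights, mlm_loss.dtype)
  mlm_loss = tf.math.divide_no_nan(
      tf.reduce_sum(mlm_loss * masked_lm_weights),
      tf.reduce_sum(masked_lm_weights))

  # Discriminator replaced-token-detection loss over real (non-padding) tokens.
  disc_loss = tf.nn.sigmoid_cross_entropy_with_logits(
      labels=tf.cast(outputs['disc_label'], tf.float32),
      logits=outputs['disc_logits'])
  token_weights = tf.cast(input_mask, tf.float32)
  disc_loss = tf.math.divide_no_nan(
      tf.reduce_sum(disc_loss * token_weights),
      tf.reduce_sum(token_weights))

  return mlm_loss + disc_loss_weight * disc_loss
```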
+ """ + shape = tf_utils.get_shape_list(sequence, expected_rank=[2, 3]) + depth_dimension = (len(shape) == 3) + if depth_dimension: + batch_size, seq_len, depth = shape + else: + batch_size, seq_len = shape + depth = 1 + sequence = tf.expand_dims(sequence, -1) + n_positions = tf_utils.get_shape_list(positions)[1] + + shift = tf.expand_dims(seq_len * tf.range(batch_size), -1) + flat_positions = tf.reshape(positions + shift, [-1, 1]) + flat_updates = tf.reshape(updates, [-1, depth]) + updates = tf.scatter_nd(flat_positions, flat_updates, + [batch_size * seq_len, depth]) + updates = tf.reshape(updates, [batch_size, seq_len, depth]) + + flat_updates_mask = tf.ones([batch_size * n_positions], tf.int32) + updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, + [batch_size * seq_len]) + updates_mask = tf.reshape(updates_mask, [batch_size, seq_len]) + not_first_token = tf.concat([ + tf.zeros((batch_size, 1), tf.int32), + tf.ones((batch_size, seq_len - 1), tf.int32) + ], -1) + updates_mask *= not_first_token + updates_mask_3d = tf.expand_dims(updates_mask, -1) + + # account for duplicate positions + if sequence.dtype == tf.float32: + updates_mask_3d = tf.cast(updates_mask_3d, tf.float32) + updates /= tf.maximum(1.0, updates_mask_3d) + else: + assert sequence.dtype == tf.int32 + updates = tf.math.floordiv(updates, tf.maximum(1, updates_mask_3d)) + updates_mask = tf.minimum(updates_mask, 1) + updates_mask_3d = tf.minimum(updates_mask_3d, 1) + + updated_sequence = (((1 - updates_mask_3d) * sequence) + + (updates_mask_3d * updates)) + if not depth_dimension: + updated_sequence = tf.squeeze(updated_sequence, -1) + + return updated_sequence, updates_mask + + +def sample_from_softmax(logits, disallow=None): + """Implement softmax sampling using gumbel softmax trick. + + Args: + logits: A `[batch_size, num_token_predictions, vocab_size]` tensor + indicating the generator output logits for each masked position. + disallow: If `None`, we directly sample tokens from the logits. Otherwise, + this is a tensor of size `[batch_size, num_token_predictions, vocab_size]` + indicating the true word id in each masked position. + + Returns: + sampled_tokens: A `[batch_size, num_token_predictions, vocab_size]` one hot + tensor indicating the sampled word id in each masked position. + """ + if disallow is not None: + logits -= 1000.0 * disallow + uniform_noise = tf.random.uniform( + tf_utils.get_shape_list(logits), minval=0, maxval=1) + gumbel_noise = -tf.math.log(-tf.math.log(uniform_noise + 1e-9) + 1e-9) + + # Here we essentially follow the original paper and use temperature 1.0 for + # generator output logits. 
+ sampled_tokens = tf.one_hot( + tf.argmax(tf.nn.softmax(logits + gumbel_noise), -1, output_type=tf.int32), + logits.shape[-1]) + return sampled_tokens + + +def unmask(inputs, duplicate): + unmasked_input_word_ids, _ = scatter_update(inputs['input_word_ids'], + inputs['masked_lm_ids'], + inputs['masked_lm_positions']) + return get_updated_inputs( + inputs, duplicate, input_word_ids=unmasked_input_word_ids) + + +def get_updated_inputs(inputs, duplicate, **kwargs): + if duplicate: + new_inputs = copy.copy(inputs) + else: + new_inputs = inputs + for k, v in kwargs.items(): + new_inputs[k] = v + return new_inputs diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/seq2seq_transformer.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/seq2seq_transformer.py new file mode 100644 index 000000000..8c219c48f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/seq2seq_transformer.py @@ -0,0 +1,591 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Implement Seq2Seq Transformer model by TF official NLP library. + +Model paper: https://arxiv.org/pdf/1706.03762.pdf +""" +import math + +import tensorflow as tf +from modeling import tf_utils +import keras_nlp +from nlp_modeling import layers +from nlp_modeling.ops import beam_search + +EOS_ID = 1 + + +@tf.keras.utils.register_keras_serializable(package="Text") +class Seq2SeqTransformer(tf.keras.Model): + """Transformer model with Keras. + + Implemented as described in: https://arxiv.org/pdf/1706.03762.pdf + + The Transformer model consists of an encoder and decoder. The input is an int + sequence (or a batch of sequences). The encoder produces a continuous + representation, and the decoder uses the encoder output to generate + probabilities for the output sequence. + """ + + def __init__(self, + vocab_size=33708, + embedding_width=512, + dropout_rate=0.0, + padded_decode=False, + decode_max_length=None, + extra_decode_length=0, + beam_size=4, + alpha=0.6, + encoder_layer=None, + decoder_layer=None, + eos_id=EOS_ID, + **kwargs): + """Initialize layers to build Transformer model. + + Args: + vocab_size: Size of vocabulary. + embedding_width: Size of hidden layer for embedding. + dropout_rate: Dropout probability. + padded_decode: Whether to max_sequence_length padding is used. If set + False, max_sequence_length padding is not used. + decode_max_length: maximum number of steps to decode a sequence. + extra_decode_length: Beam search will run extra steps to decode. + beam_size: Number of beams for beam search + alpha: The strength of length normalization for beam search. + encoder_layer: An initialized encoder layer. + decoder_layer: An initialized decoder layer. + eos_id: Id of end of sentence token. + **kwargs: other keyword arguments. 
+ """ + super().__init__(**kwargs) + self._vocab_size = vocab_size + self._embedding_width = embedding_width + self._dropout_rate = dropout_rate + self._padded_decode = padded_decode + self._decode_max_length = decode_max_length + self._extra_decode_length = extra_decode_length + self._beam_size = beam_size + self._alpha = alpha + self._eos_id = eos_id + self.embedding_lookup = keras_nlp.layers.OnDeviceEmbedding( + vocab_size=self._vocab_size, + embedding_width=self._embedding_width, + initializer=tf.random_normal_initializer( + mean=0., stddev=self._embedding_width**-0.5), + scale_factor=self._embedding_width**0.5) + self.encoder_layer = encoder_layer + self.decoder_layer = decoder_layer + self.position_embedding = layers.RelativePositionEmbedding( + hidden_size=self._embedding_width) + self.encoder_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + self.decoder_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + + def get_config(self): + config = { + "vocab_size": self._vocab_size, + "hidden_size": self._embedding_width, + "dropout_rate": self._dropout_rate, + "padded_decode": self._padded_decode, + "decode_max_length": self._decode_max_length, + "eos_id": self._eos_id, + "extra_decode_length": self._extra_decode_length, + "beam_size": self._beam_size, + "alpha": self._alpha, + "encoder_layer": self.encoder_layer, + "decoder_layer": self.decoder_layer + } + base_config = super(Seq2SeqTransformer, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def _embedding_linear(self, embedding_matrix, x): + """Uses embeddings as linear transformation weights.""" + embedding_matrix = tf.cast(embedding_matrix, dtype=self.compute_dtype) + x = tf.cast(x, dtype=self.compute_dtype) + batch_size = tf.shape(x)[0] + length = tf.shape(x)[1] + hidden_size = tf.shape(x)[2] + vocab_size = tf.shape(embedding_matrix)[0] + + x = tf.reshape(x, [-1, hidden_size]) + logits = tf.matmul(x, embedding_matrix, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, vocab_size]) + + def call(self, inputs): + """Calculate target logits or inferred target sequences. + + Args: + inputs: a dictionary of tensors. + Feature `inputs`: int tensor with shape `[batch_size, input_length]`. + Feature `targets` (optional): None or int tensor with shape + `[batch_size, target_length]`. + + Returns: + If targets is defined, then return logits for each word in the target + sequence, which is a float tensor with shape + `(batch_size, target_length, vocab_size)`. If target is `None`, then + generate output sequence one token at a time and + returns a dictionary { + outputs: `(batch_size, decoded_length)` + scores: `(batch_size, 1)`} + Even when `float16` is used, the output tensor(s) are always `float32`. + + Raises: + NotImplementedError: If try to use padded decode method on CPU/GPUs. + """ + sources = inputs["inputs"] + targets = inputs.get("targets", None) + # Prepare inputs to the layer stack by adding positional encodings and + # applying dropout. + embedded_inputs = self.embedding_lookup(sources) + embedding_mask = tf.cast(tf.not_equal(sources, 0), embedded_inputs.dtype) + embedded_inputs *= tf.expand_dims(embedding_mask, -1) + # Attention_mask generation. 
+ input_shape = tf_utils.get_shape_list(sources, expected_rank=2) + attention_mask = tf.cast( + tf.reshape( + tf.not_equal(sources, 0), [input_shape[0], 1, input_shape[1]]), + dtype=sources.dtype) + broadcast_ones = tf.ones( + shape=[input_shape[0], input_shape[1], 1], dtype=sources.dtype) + attention_mask = broadcast_ones * attention_mask + + pos_encoding = self.position_embedding(embedded_inputs) + pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype) + encoder_inputs = embedded_inputs + pos_encoding + + encoder_inputs = self.encoder_dropout(encoder_inputs) + + encoder_outputs = self.encoder_layer( + encoder_inputs, attention_mask=attention_mask) + + if targets is None: + if self._padded_decode: + max_decode_length = self._decode_max_length + else: + max_decode_length = self._decode_max_length or ( + tf.shape(encoder_outputs)[1] + self._extra_decode_length) + symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length) + + batch_size = tf.shape(encoder_outputs)[0] + # Create initial set of IDs that will be passed to symbols_to_logits_fn. + initial_ids = tf.zeros([batch_size], dtype=tf.int32) + + # Create cache storing decoder attention values for each layer. + init_decode_length = (max_decode_length if self._padded_decode else 0) + num_heads = self.decoder_layer.num_attention_heads + dim_per_head = self._embedding_width // num_heads + + # Cache dtype needs to match beam_search dtype. + # pylint: disable=g-complex-comprehension + cache = { + str(layer): { + "key": + tf.zeros( + [batch_size, init_decode_length, num_heads, dim_per_head], + dtype=self.compute_dtype), + "value": + tf.zeros( + [batch_size, init_decode_length, num_heads, dim_per_head], + dtype=self.compute_dtype) + } for layer in range(self.decoder_layer.num_layers) + } + # pylint: enable=g-complex-comprehension + + # Add encoder output and attention bias to the cache. + encoder_outputs = tf.cast(encoder_outputs, dtype=self.compute_dtype) + attention_mask = tf.cast( + tf.reshape( + tf.not_equal(sources, 0), [input_shape[0], 1, input_shape[1]]), + dtype=self.compute_dtype) + cache["encoder_outputs"] = encoder_outputs + cache["encoder_decoder_attention_mask"] = attention_mask + + # Use beam search to find the top beam_size sequences and scores. 
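+      # `sequence_beam_search` is expected to return decoded ids with shape
+      # `(batch_size, beam_size, max_decode_length + 1)` (including the
+      # initial id) and scores with shape `(batch_size, beam_size)`; only the
+      # top beam of each batch element is kept below.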
+ decoded_ids, scores = beam_search.sequence_beam_search( + symbols_to_logits_fn=symbols_to_logits_fn, + initial_ids=initial_ids, + initial_cache=cache, + vocab_size=self._vocab_size, + beam_size=self._beam_size, + alpha=self._alpha, + max_decode_length=max_decode_length, + eos_id=self._eos_id, + padded_decode=self._padded_decode, + dtype=self.compute_dtype) + + # Get the top sequence for each batch element + top_decoded_ids = decoded_ids[:, 0, 1:] + top_scores = scores[:, 0] + + return {"outputs": top_decoded_ids, "scores": top_scores} + + decoder_inputs = self.embedding_lookup(targets) + embedding_mask = tf.cast(tf.not_equal(targets, 0), decoder_inputs.dtype) + decoder_inputs *= tf.expand_dims(embedding_mask, -1) + # Shift targets to the right, and remove the last element + decoder_inputs = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :] + length = tf.shape(decoder_inputs)[1] + pos_encoding = self.position_embedding(decoder_inputs) + pos_encoding = tf.cast(pos_encoding, embedded_inputs.dtype) + decoder_inputs += pos_encoding + + decoder_inputs = self.decoder_dropout(decoder_inputs) + + decoder_shape = tf_utils.get_shape_list(decoder_inputs, expected_rank=3) + batch_size = decoder_shape[0] + decoder_length = decoder_shape[1] + + self_attention_mask = tf.linalg.band_part(tf.ones([length, length]), -1, 0) + self_attention_mask = tf.reshape(self_attention_mask, [1, length, length]) + self_attention_mask = tf.tile(self_attention_mask, [batch_size, 1, 1]) + + attention_mask = tf.cast( + tf.expand_dims(tf.not_equal(sources, 0), axis=1), dtype=sources.dtype) + attention_mask = tf.tile(attention_mask, [1, decoder_length, 1]) + + outputs = self.decoder_layer( + decoder_inputs, + encoder_outputs, + self_attention_mask=self_attention_mask, + cross_attention_mask=attention_mask) + logits = self._embedding_linear(self.embedding_lookup.embeddings, outputs) + # Model outputs should be float32 to avoid numeric issues. + # https://www.tensorflow.org/guide/mixed_precision#building_the_model + logits = tf.cast(logits, tf.float32) + return logits + + def _get_symbols_to_logits_fn(self, max_decode_length): + """Returns a decoding function that calculates logits of the next tokens.""" + timing_signal = self.position_embedding( + inputs=None, length=max_decode_length + 1) + timing_signal = tf.cast(timing_signal, dtype=self.compute_dtype) + decoder_self_attention_mask = tf.linalg.band_part( + tf.ones([max_decode_length, max_decode_length], + dtype=self.compute_dtype), -1, 0) + decoder_self_attention_mask = tf.reshape( + decoder_self_attention_mask, [1, max_decode_length, max_decode_length]) + + def symbols_to_logits_fn(ids, i, cache): + """Generate logits for next potential IDs. + + Args: + ids: Current decoded sequences. int tensor with shape `(batch_size * + beam_size, i + 1)`. + i: Loop index. + cache: Dictionary of values storing the encoder output, encoder-decoder + attention bias, and previous decoder attention values. + + Returns: + Tuple of + (logits with shape `(batch_size * beam_size, vocab_size)`, + updated cache values) + """ + # Set decoder input to the last generated IDs + decoder_input = ids[:, -1:] + + # Preprocess decoder input by getting embeddings and adding timing signal. 
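+      # Only the most recently generated token is embedded here; keys and
+      # values for earlier positions are reused from `cache`, which the
+      # decoder layer updates at every decoding step.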
+ # decoder_input = self.embedding_softmax_layer(decoder_input) + source_decoder_input = decoder_input + decoder_input = self.embedding_lookup(decoder_input) + embedding_mask = tf.cast( + tf.not_equal(source_decoder_input, 0), decoder_input.dtype) + decoder_input *= tf.expand_dims(embedding_mask, -1) + decoder_input += timing_signal[i] + if self._padded_decode: + # indexing does not work on TPU. + bias_shape = decoder_self_attention_mask.shape.as_list() + self_attention_mask = tf.slice(decoder_self_attention_mask, [0, i, 0], + [bias_shape[0], 1, bias_shape[2]]) + else: + self_attention_mask = decoder_self_attention_mask[:, i:i + 1, :i + 1] + decoder_shape = tf_utils.get_shape_list(decoder_input, expected_rank=3) + batch_size = decoder_shape[0] + decoder_length = decoder_shape[1] + + self_attention_mask = tf.tile(self_attention_mask, [batch_size, 1, 1]) + attention_mask = cache.get("encoder_decoder_attention_mask") + attention_mask = tf.tile(attention_mask, [1, decoder_length, 1]) + + decoder_outputs = self.decoder_layer( + decoder_input, + cache.get("encoder_outputs"), + self_attention_mask=self_attention_mask, + cross_attention_mask=attention_mask, + cache=cache, + decode_loop_step=i if self._padded_decode else None) + + decoder_outputs = tf.cast(decoder_outputs, dtype=self.compute_dtype) + logits = self._embedding_linear(self.embedding_lookup.embeddings, + decoder_outputs) + logits = tf.squeeze(logits, axis=[1]) + return logits, cache + + return symbols_to_logits_fn + + +class TransformerEncoder(tf.keras.layers.Layer): + """Transformer encoder. + + Transformer encoder is made up of N identical layers. Each layer is composed + of the sublayers: + 1. Self-attention layer + 2. Feedforward network (which is 2 fully-connected layers) + """ + + def __init__(self, + num_layers=6, + num_attention_heads=8, + intermediate_size=2048, + activation="relu", + dropout_rate=0.0, + attention_dropout_rate=0.0, + use_bias=False, + norm_first=True, + norm_epsilon=1e-6, + intermediate_dropout=0.0, + **kwargs): + """Initialize a Transformer encoder. + + Args: + num_layers: Number of layers. + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate (Feedforward) layer. + activation: Activation for the intermediate layer. + dropout_rate: Dropout probability. + attention_dropout_rate: Dropout probability for attention layers. + use_bias: Whether to enable use_bias in attention layer. If set False, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate + dense layers. If set False, output of attention and intermediate dense + layers is normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + intermediate_dropout: Dropout probability for intermediate_dropout_layer. + **kwargs: key word arguemnts passed to tf.keras.layers.Layer. 
+ """ + + super(TransformerEncoder, self).__init__(**kwargs) + self.num_layers = num_layers + self.num_attention_heads = num_attention_heads + self._intermediate_size = intermediate_size + self._activation = activation + self._dropout_rate = dropout_rate + self._attention_dropout_rate = attention_dropout_rate + self._use_bias = use_bias + self._norm_first = norm_first + self._norm_epsilon = norm_epsilon + self._intermediate_dropout = intermediate_dropout + + def build(self, input_shape): + """Implements build() for the layer.""" + self.encoder_layers = [] + for i in range(self.num_layers): + self.encoder_layers.append( + keras_nlp.layers.TransformerEncoderBlock( + num_attention_heads=self.num_attention_heads, + inner_dim=self._intermediate_size, + inner_activation=self._activation, + output_dropout=self._dropout_rate, + attention_dropout=self._attention_dropout_rate, + use_bias=self._use_bias, + norm_first=self._norm_first, + norm_epsilon=self._norm_epsilon, + inner_dropout=self._intermediate_dropout, + attention_initializer=attention_initializer(input_shape[2]), + name=("layer_%d" % i))) + self.output_normalization = tf.keras.layers.LayerNormalization( + epsilon=self._norm_epsilon, dtype="float32") + super(TransformerEncoder, self).build(input_shape) + + def get_config(self): + config = { + "num_layers": self.num_layers, + "num_attention_heads": self.num_attention_heads, + "intermediate_size": self._intermediate_size, + "activation": self._activation, + "dropout_rate": self._dropout_rate, + "attention_dropout_rate": self._attention_dropout_rate, + "use_bias": self._use_bias, + "norm_first": self._norm_first, + "norm_epsilon": self._norm_epsilon, + "intermediate_dropout": self._intermediate_dropout + } + base_config = super(TransformerEncoder, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, encoder_inputs, attention_mask=None): + """Return the output of the encoder. + + Args: + encoder_inputs: A tensor with shape `(batch_size, input_length, + hidden_size)`. + attention_mask: A mask for the encoder self-attention layer with shape + `(batch_size, input_length, input_length)`. + + Returns: + Output of encoder which is a `float32` tensor with shape + `(batch_size, input_length, hidden_size)`. + """ + for layer_idx in range(self.num_layers): + encoder_inputs = self.encoder_layers[layer_idx]( + [encoder_inputs, attention_mask]) + + output_tensor = encoder_inputs + output_tensor = self.output_normalization(output_tensor) + + return output_tensor + + +class TransformerDecoder(tf.keras.layers.Layer): + """Transformer decoder. + + Like the encoder, the decoder is made up of N identical layers. + Each layer is composed of the sublayers: + 1. Self-attention layer + 2. Multi-headed attention layer combining encoder outputs with results from + the previous self-attention layer. + 3. Feedforward network (2 fully-connected layers) + """ + + def __init__(self, + num_layers=6, + num_attention_heads=8, + intermediate_size=2048, + activation="relu", + dropout_rate=0.0, + attention_dropout_rate=0.0, + use_bias=False, + norm_first=True, + norm_epsilon=1e-6, + intermediate_dropout=0.0, + **kwargs): + """Initialize a Transformer decoder. + + Args: + num_layers: Number of layers. + num_attention_heads: Number of attention heads. + intermediate_size: Size of the intermediate (Feedforward) layer. + activation: Activation for the intermediate layer. + dropout_rate: Dropout probability. + attention_dropout_rate: Dropout probability for attention layers. 
+ use_bias: Whether to enable use_bias in attention layer. If set `False`, + use_bias in attention layer is disabled. + norm_first: Whether to normalize inputs to attention and intermediate + dense layers. If set `False`, output of attention and intermediate dense + layers is normalized. + norm_epsilon: Epsilon value to initialize normalization layers. + intermediate_dropout: Dropout probability for intermediate_dropout_layer. + **kwargs: key word arguemnts passed to tf.keras.layers.Layer. + """ + super(TransformerDecoder, self).__init__(**kwargs) + self.num_layers = num_layers + self.num_attention_heads = num_attention_heads + self._intermediate_size = intermediate_size + self._activation = activation + self._dropout_rate = dropout_rate + self._attention_dropout_rate = attention_dropout_rate + self._use_bias = use_bias + self._norm_first = norm_first + self._norm_epsilon = norm_epsilon + self._intermediate_dropout = intermediate_dropout + + def build(self, input_shape): + """Implements build() for the layer.""" + self.decoder_layers = [] + for i in range(self.num_layers): + self.decoder_layers.append( + layers.TransformerDecoderBlock( + num_attention_heads=self.num_attention_heads, + intermediate_size=self._intermediate_size, + intermediate_activation=self._activation, + dropout_rate=self._dropout_rate, + attention_dropout_rate=self._attention_dropout_rate, + use_bias=self._use_bias, + norm_first=self._norm_first, + norm_epsilon=self._norm_epsilon, + intermediate_dropout=self._intermediate_dropout, + attention_initializer=attention_initializer(input_shape[2]), + name=("layer_%d" % i))) + self.output_normalization = tf.keras.layers.LayerNormalization( + epsilon=1e-6, dtype="float32") + super(TransformerDecoder, self).build(input_shape) + + def get_config(self): + config = { + "num_layers": self.num_layers, + "num_attention_heads": self.num_attention_heads, + "intermediate_size": self._intermediate_size, + "activation": self._activation, + "dropout_rate": self._dropout_rate, + "attention_dropout_rate": self._attention_dropout_rate, + "use_bias": self._use_bias, + "norm_first": self._norm_first, + "norm_epsilon": self._norm_epsilon, + "intermediate_dropout": self._intermediate_dropout + } + base_config = super(TransformerDecoder, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def call(self, + target, + memory, + self_attention_mask=None, + cross_attention_mask=None, + cache=None, + decode_loop_step=None): + """Return the output of the decoder layer stacks. + + Args: + target: A tensor with shape `(batch_size, target_length, hidden_size)`. + memory: A tensor with shape `(batch_size, input_length, hidden_size)`. + self_attention_mask: A tensor with shape `(batch_size, target_len, + target_length)`, the mask for decoder self-attention layer. + cross_attention_mask: A tensor with shape `(batch_size, target_length, + input_length)` which is the mask for encoder-decoder attention layer. + cache: (Used for fast decoding) A nested dictionary storing previous + decoder self-attention values. The items are: + {layer_n: {"k": A tensor with shape `(batch_size, i, key_channels)`, + "v": A tensor with shape `(batch_size, i, value_channels)`}, + ...} + decode_loop_step: An integer, the step number of the decoding loop. Used + only for autoregressive inference on TPU. + + Returns: + Output of decoder. + float32 tensor with shape `(batch_size, target_length, hidden_size`). 
+ """ + + output_tensor = target + for layer_idx in range(self.num_layers): + transformer_inputs = [ + output_tensor, memory, cross_attention_mask, self_attention_mask + ] + # Gets the cache for decoding. + if cache is None: + output_tensor, _ = self.decoder_layers[layer_idx](transformer_inputs) + else: + cache_layer_idx = str(layer_idx) + output_tensor, cache[cache_layer_idx] = self.decoder_layers[layer_idx]( + transformer_inputs, + cache=cache[cache_layer_idx], + decode_loop_step=decode_loop_step) + return self.output_normalization(output_tensor) + + +def attention_initializer(hidden_size): + """Initializer for attention layers in Seq2SeqTransformer.""" + hidden_size = int(hidden_size) + limit = math.sqrt(6.0 / (hidden_size + hidden_size)) + return tf.keras.initializers.RandomUniform(minval=-limit, maxval=limit) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/xlnet.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/xlnet.py new file mode 100644 index 000000000..f0e793625 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/models/xlnet.py @@ -0,0 +1,342 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""XLNet models.""" +# pylint: disable=g-classes-have-attributes + +from typing import Any, Mapping, Optional, Union + +import tensorflow as tf + +from nlp_modeling import layers +from nlp_modeling import networks + + +class XLNetMaskedLM(tf.keras.layers.Layer): + """XLNet pretraining head.""" + + def __init__(self, + vocab_size: int, + hidden_size: int, + initializer: str = 'glorot_uniform', + activation: str = 'gelu', + name=None, + **kwargs): + super().__init__(name=name, **kwargs) + self._vocab_size = vocab_size + self._hidden_size = hidden_size + self._initializer = initializer + self._activation = activation + + def build(self, input_shape): + self.dense = tf.keras.layers.Dense( + units=self._hidden_size, + activation=self._activation, + kernel_initializer=self._initializer, + name='transform/dense') + self.layer_norm = tf.keras.layers.LayerNormalization( + axis=-1, epsilon=1e-12, name='transform/LayerNorm') + self.bias = self.add_weight( + 'output_bias/bias', + shape=(self._vocab_size,), + initializer='zeros', + trainable=True) + super().build(input_shape) + + def call(self, + sequence_data: tf.Tensor, + embedding_table: tf.Tensor): + lm_data = self.dense(sequence_data) + lm_data = self.layer_norm(lm_data) + lm_data = tf.matmul(lm_data, embedding_table, transpose_b=True) + logits = tf.nn.bias_add(lm_data, self.bias) + return logits + + def get_config(self) -> Mapping[str, Any]: + config = { + 'vocab_size': + self._vocab_size, + 'hidden_size': + self._hidden_size, + 'initializer': + self._initializer + } + base_config = super(XLNetMaskedLM, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + +@tf.keras.utils.register_keras_serializable(package='Text') +class XLNetPretrainer(tf.keras.Model): + """XLNet-based pretrainer. 
+ + This is an implementation of the network structure surrounding a + Transformer-XL encoder as described in "XLNet: Generalized Autoregressive + Pretraining for Language Understanding" (https://arxiv.org/abs/1906.08237). + + Args: + network: An XLNet/Transformer-XL based network. This network should output a + sequence output and list of `state` tensors. + mlm_activation: The activation (if any) to use in the Masked LM network. If + None, then no activation will be used. + mlm_initializer: The initializer (if any) to use in the masked LM. Defaults + to a Glorot uniform initializer. + + """ + + def __init__( + self, + network: Union[tf.keras.layers.Layer, tf.keras.Model], + mlm_activation=None, + mlm_initializer='glorot_uniform', + name: Optional[str] = None, + **kwargs): + super().__init__(name=name, **kwargs) + self._config = { + 'network': network, + 'mlm_activation': mlm_activation, + 'mlm_initializer': mlm_initializer, + } + self._network = network + self._hidden_size = network.get_config()['hidden_size'] + self._vocab_size = network.get_config()['vocab_size'] + self._activation = mlm_activation + self._initializer = mlm_initializer + self._masked_lm = XLNetMaskedLM( + vocab_size=self._vocab_size, + hidden_size=self._hidden_size, + initializer=self._initializer) + + def call(self, inputs: Mapping[str, Any]): + input_word_ids = inputs['input_word_ids'] + input_type_ids = inputs['input_type_ids'] + masked_tokens = inputs['masked_tokens'] + permutation_mask = inputs['permutation_mask'] + target_mapping = inputs['target_mapping'] + state = inputs.get('state', None) + + attention_output, state = self._network( + input_ids=input_word_ids, + segment_ids=input_type_ids, + input_mask=None, + state=state, + permutation_mask=permutation_mask, + target_mapping=target_mapping, + masked_tokens=masked_tokens) + + embedding_table = self._network.get_embedding_lookup_table() + mlm_outputs = self._masked_lm( + sequence_data=attention_output, + embedding_table=embedding_table) + return mlm_outputs, state + + def get_config(self) -> Mapping[str, Any]: + return self._config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + @property + def checkpoint_items(self): + return dict(encoder=self._network) + + +@tf.keras.utils.register_keras_serializable(package='Text') +class XLNetClassifier(tf.keras.Model): + """Classifier model based on XLNet. + + This is an implementation of the network structure surrounding a + Transformer-XL encoder as described in "XLNet: Generalized Autoregressive + Pretraining for Language Understanding" (https://arxiv.org/abs/1906.08237). + + Note: This model does not use utilize the memory mechanism used in the + original XLNet Classifier. + + Args: + network: An XLNet/Transformer-XL based network. This network should output a + sequence output and list of `state` tensors. + num_classes: Number of classes to predict from the classification network. + initializer: The initializer (if any) to use in the classification networks. + Defaults to a RandomNormal initializer. + summary_type: Method used to summarize a sequence into a compact vector. + dropout_rate: The dropout probability of the cls head. 
+ """ + + def __init__( + self, + network: Union[tf.keras.layers.Layer, tf.keras.Model], + num_classes: int, + initializer: tf.keras.initializers.Initializer = 'random_normal', + summary_type: str = 'last', + dropout_rate: float = 0.1, + **kwargs): + super().__init__(**kwargs) + self._network = network + self._initializer = initializer + self._summary_type = summary_type + self._num_classes = num_classes + self._config = { + 'network': network, + 'initializer': initializer, + 'num_classes': num_classes, + 'summary_type': summary_type, + 'dropout_rate': dropout_rate, + } + + if summary_type == 'last': + cls_token_idx = -1 + elif summary_type == 'first': + cls_token_idx = 0 + else: + raise ValueError('Invalid summary type provided: %s.' % summary_type) + + self.classifier = layers.ClassificationHead( + inner_dim=network.get_config()['hidden_size'], + num_classes=num_classes, + initializer=initializer, + dropout_rate=dropout_rate, + cls_token_idx=cls_token_idx, + name='sentence_prediction') + + def call(self, inputs: Mapping[str, Any]): + input_ids = inputs['input_word_ids'] + segment_ids = inputs['input_type_ids'] + input_mask = tf.cast(inputs['input_mask'], tf.float32) + state = inputs.get('mems', None) + + attention_output, _ = self._network( + input_ids=input_ids, + segment_ids=segment_ids, + input_mask=input_mask, + state=state) + + logits = self.classifier(attention_output) + + return logits + + def get_config(self): + return self._config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + @property + def checkpoint_items(self): + items = dict(encoder=self._network) + if hasattr(self.classifier, 'checkpoint_items'): + for key, item in self.classifier.checkpoint_items.items(): + items['.'.join([self.classifier.name, key])] = item + return items + + +@tf.keras.utils.register_keras_serializable(package='Text') +class XLNetSpanLabeler(tf.keras.Model): + """Span labeler model based on XLNet. + + This is an implementation of the network structure surrounding a + Transformer-XL encoder as described in "XLNet: Generalized Autoregressive + Pretraining for Language Understanding" (https://arxiv.org/abs/1906.08237). + + Args: + network: A transformer network. This network should output a sequence output + and a classification output. Furthermore, it should expose its embedding + table via a "get_embedding_table" method. + start_n_top: Beam size for span start. + end_n_top: Beam size for span end. + dropout_rate: The dropout rate for the span labeling layer. + span_labeling_activation: The activation for the span labeling head. + initializer: The initializer (if any) to use in the span labeling network. + Defaults to a Glorot uniform initializer. + """ + + def __init__( + self, + network: Union[tf.keras.layers.Layer, tf.keras.Model], + start_n_top: int = 5, + end_n_top: int = 5, + dropout_rate: float = 0.1, + span_labeling_activation: tf.keras.initializers.Initializer = 'tanh', + initializer: tf.keras.initializers.Initializer = 'glorot_uniform', + **kwargs): + super().__init__(**kwargs) + self._config = { + 'network': network, + 'start_n_top': start_n_top, + 'end_n_top': end_n_top, + 'dropout_rate': dropout_rate, + 'span_labeling_activation': span_labeling_activation, + 'initializer': initializer, + } + network_config = network.get_config() + try: + input_width = network_config['inner_size'] + self._xlnet_base = True + except KeyError: + # BertEncoder uses 'intermediate_size' due to legacy naming. 
+      input_width = network_config['intermediate_size']
+      self._xlnet_base = False
+
+    self._network = network
+    self._initializer = initializer
+    self._start_n_top = start_n_top
+    self._end_n_top = end_n_top
+    self._dropout_rate = dropout_rate
+    self._activation = span_labeling_activation
+    self.span_labeling = networks.XLNetSpanLabeling(
+        input_width=input_width,
+        start_n_top=self._start_n_top,
+        end_n_top=self._end_n_top,
+        activation=self._activation,
+        dropout_rate=self._dropout_rate,
+        initializer=self._initializer)
+
+  def call(self, inputs: Mapping[str, Any]):
+    input_word_ids = inputs['input_word_ids']
+    input_type_ids = inputs['input_type_ids']
+    input_mask = inputs['input_mask']
+    class_index = inputs['class_index']
+    paragraph_mask = inputs['paragraph_mask']
+    start_positions = inputs.get('start_positions', None)
+
+    if self._xlnet_base:
+      attention_output, _ = self._network(
+          input_ids=input_word_ids,
+          segment_ids=input_type_ids,
+          input_mask=input_mask)
+    else:
+      network_output_dict = self._network(dict(
+          input_word_ids=input_word_ids,
+          input_type_ids=input_type_ids,
+          input_mask=input_mask))
+      attention_output = network_output_dict['sequence_output']
+
+    outputs = self.span_labeling(
+        sequence_data=attention_output,
+        class_index=class_index,
+        paragraph_mask=paragraph_mask,
+        start_positions=start_positions)
+    return outputs
+
+  @property
+  def checkpoint_items(self):
+    return dict(encoder=self._network)
+
+  def get_config(self):
+    return self._config
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    return cls(**config)
+
diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/README.md b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/README.md
new file mode 100644
index 000000000..b192399a7
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/README.md
@@ -0,0 +1,39 @@
+# Networks
+
+Networks are combinations of `tf.keras` layers (and possibly other networks).
+They are `tf.keras` models that would not be trained alone. Each encapsulates
+common network structures like a transformer encoder into an easily
+handled object with a standardized configuration.
+
+* [`BertEncoder`](bert_encoder.py) implements a bi-directional
+Transformer-based encoder as described in ["BERT: Pre-training of Deep
+Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805).
+It includes the embedding lookups, transformer layers and pooling layer.
+
+* [`AlbertEncoder`](albert_encoder.py) implements a
+Transformer-encoder described in the paper ["ALBERT: A Lite BERT for
+Self-supervised Learning of Language Representations"](https://arxiv.org/abs/1909.11942).
+Compared with [BERT](https://arxiv.org/abs/1810.04805),
+ALBERT refactorizes embedding parameters into two smaller matrices and shares
+parameters across layers.
+
+* [`MobileBERTEncoder`](mobile_bert_encoder.py) implements the
+MobileBERT network described in the paper ["MobileBERT: a Compact Task-Agnostic
+BERT for Resource-Limited Devices"](https://arxiv.org/abs/2004.02984).
+
+* [`Classification`](classification.py) contains a single hidden layer, and is
+intended for use as a classification or regression (if number of classes is set
+to 1) head.
+
+* [`PackedSequenceEmbedding`](packed_sequence_embedding.py) implements an
+embedding network that supports packed sequences and position ids.
+ +* [`SpanLabeling`](span_labeling.py) implements a single-span labeler +(that is, a prediction head that can predict one start and end index per batch +item) based on a single dense hidden layer. It can be used in the SQuAD task. + +* [`XLNetBase`](xlnet_base.py) implements the base network used in "XLNet: +Generalized Autoregressive Pretraining for Language Understanding" +(https://arxiv.org/abs/1906.08237). It includes embedding lookups, +relative position encodings, mask computations, segment matrix computations and +Transformer XL layers using one or two stream relative self-attention. diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/__init__.py new file mode 100644 index 000000000..716de7d64 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Networks are combinations of `tf.keras` layers (and possibly other networks). + +They are `tf.keras` models that would not be trained alone. It encapsulates +common network structures like a transformer encoder into an easily +handled object with a standardized configuration. +""" +from nlp_modeling.networks.albert_encoder import AlbertEncoder +from nlp_modeling.networks.bert_encoder import BertEncoder +from nlp_modeling.networks.classification import Classification +from nlp_modeling.networks.encoder_scaffold import EncoderScaffold +from nlp_modeling.networks.mobile_bert_encoder import MobileBERTEncoder +from nlp_modeling.networks.packed_sequence_embedding import PackedSequenceEmbedding +from nlp_modeling.networks.span_labeling import SpanLabeling +from nlp_modeling.networks.span_labeling import XLNetSpanLabeling +from nlp_modeling.networks.xlnet_base import XLNetBase +# Backward compatibility. The modules are deprecated. +TransformerEncoder = BertEncoder diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/albert_encoder.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/albert_encoder.py new file mode 100644 index 000000000..75074987f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/albert_encoder.py @@ -0,0 +1,211 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ALBERT (https://arxiv.org/abs/1810.04805) text encoder network.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + +from modeling import activations +import keras_nlp +from nlp_modeling import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class AlbertEncoder(tf.keras.Model): + """ALBERT (https://arxiv.org/abs/1810.04805) text encoder network. + + This network implements the encoder described in the paper "ALBERT: A Lite + BERT for Self-supervised Learning of Language Representations" + (https://arxiv.org/abs/1909.11942). + + Compared with BERT (https://arxiv.org/abs/1810.04805), ALBERT refactorizes + embedding parameters into two smaller matrices and shares parameters + across layers. + + The default values for this object are taken from the ALBERT-Base + implementation described in the paper. + + *Note* that the network is constructed by Keras Functional API. + + Args: + vocab_size: The size of the token vocabulary. + embedding_width: The width of the word embeddings. If the embedding width is + not equal to hidden size, embedding parameters will be factorized into two + matrices in the shape of `(vocab_size, embedding_width)` and + `(embedding_width, hidden_size)`, where `embedding_width` is usually much + smaller than `hidden_size`. + hidden_size: The size of the transformer hidden layers. + num_layers: The number of transformer layers. + num_attention_heads: The number of attention heads for each transformer. The + hidden size must be divisible by the number of attention heads. + max_sequence_length: The maximum sequence length that this encoder can + consume. If None, max_sequence_length uses the value from sequence length. + This determines the variable shape for positional embeddings. + type_vocab_size: The number of types that the 'type_ids' input can take. + intermediate_size: The intermediate size for the transformer layers. + activation: The activation to use for the transformer layers. + dropout_rate: The dropout rate to use for the transformer layers. + attention_dropout_rate: The dropout rate to use for the attention layers + within the transformer layers. + initializer: The initialzer to use for all weights in this encoder. + dict_outputs: Whether to use a dictionary as the model outputs. + """ + + def __init__(self, + vocab_size, + embedding_width=128, + hidden_size=768, + num_layers=12, + num_attention_heads=12, + max_sequence_length=512, + type_vocab_size=16, + intermediate_size=3072, + activation=activations.gelu, + dropout_rate=0.1, + attention_dropout_rate=0.1, + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), + dict_outputs=False, + **kwargs): + activation = tf.keras.activations.get(activation) + initializer = tf.keras.initializers.get(initializer) + + word_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_word_ids') + mask = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_mask') + type_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_type_ids') + + if embedding_width is None: + embedding_width = hidden_size + embedding_layer = layers.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + initializer=initializer, + name='word_embeddings') + word_embeddings = embedding_layer(word_ids) + + # Always uses dynamic slicing for simplicity. 
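+    # `PositionEmbedding` keeps a learnable table of shape
+    # `(max_sequence_length, embedding_width)` and slices it to the runtime
+    # sequence length of `word_embeddings`.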
+ position_embedding_layer = keras_nlp.layers.PositionEmbedding( + initializer=initializer, + max_length=max_sequence_length, + name='position_embedding') + position_embeddings = position_embedding_layer(word_embeddings) + + type_embeddings = ( + layers.OnDeviceEmbedding( + vocab_size=type_vocab_size, + embedding_width=embedding_width, + initializer=initializer, + use_one_hot=True, + name='type_embeddings')(type_ids)) + + embeddings = tf.keras.layers.Add()( + [word_embeddings, position_embeddings, type_embeddings]) + embeddings = ( + tf.keras.layers.LayerNormalization( + name='embeddings/layer_norm', + axis=-1, + epsilon=1e-12, + dtype=tf.float32)(embeddings)) + embeddings = (tf.keras.layers.Dropout(rate=dropout_rate)(embeddings)) + # We project the 'embedding' output to 'hidden_size' if it is not already + # 'hidden_size'. + if embedding_width != hidden_size: + embeddings = tf.keras.layers.experimental.EinsumDense( + '...x,xy->...y', + output_shape=hidden_size, + bias_axes='y', + kernel_initializer=initializer, + name='embedding_projection')( + embeddings) + + data = embeddings + attention_mask = keras_nlp.layers.SelfAttentionMask()(data, mask) + shared_layer = keras_nlp.layers.TransformerEncoderBlock( + num_attention_heads=num_attention_heads, + inner_dim=intermediate_size, + inner_activation=activation, + output_dropout=dropout_rate, + attention_dropout=attention_dropout_rate, + kernel_initializer=initializer, + name='transformer') + encoder_outputs = [] + for _ in range(num_layers): + data = shared_layer([data, attention_mask]) + encoder_outputs.append(data) + + # Applying a tf.slice op (through subscript notation) to a Keras tensor + # like this will create a SliceOpLambda layer. This is better than a Lambda + # layer with Python code, because that is fundamentally less portable. + first_token_tensor = data[:, 0, :] + cls_output = tf.keras.layers.Dense( + units=hidden_size, + activation='tanh', + kernel_initializer=initializer, + name='pooler_transform')( + first_token_tensor) + if dict_outputs: + outputs = dict( + sequence_output=data, + encoder_outputs=encoder_outputs, + pooled_output=cls_output, + ) + else: + outputs = [data, cls_output] + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(AlbertEncoder, self).__init__( + inputs=[word_ids, mask, type_ids], outputs=outputs, **kwargs) + config_dict = { + 'vocab_size': vocab_size, + 'embedding_width': embedding_width, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'num_attention_heads': num_attention_heads, + 'max_sequence_length': max_sequence_length, + 'type_vocab_size': type_vocab_size, + 'intermediate_size': intermediate_size, + 'activation': tf.keras.activations.serialize(activation), + 'dropout_rate': dropout_rate, + 'attention_dropout_rate': attention_dropout_rate, + 'initializer': tf.keras.initializers.serialize(initializer), + } + + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. 
TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self._embedding_layer = embedding_layer + self._position_embedding_layer = position_embedding_layer + + def get_embedding_table(self): + return self._embedding_layer.embeddings + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/bert_encoder.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/bert_encoder.py new file mode 100644 index 000000000..9594ca0ad --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/bert_encoder.py @@ -0,0 +1,150 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Transformer-based text encoder network.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + +from modeling import activations +import keras_nlp + + +# This class is being replaced by keras_nlp.encoders.BertEncoder and merely +# acts as a wrapper if you need: 1) list outputs instead of dict outputs, +# 2) shared embedding layer. +@tf.keras.utils.register_keras_serializable(package='Text') +class BertEncoder(keras_nlp.encoders.BertEncoder): + """Bi-directional Transformer-based encoder network. + + This network implements a bi-directional Transformer-based encoder as + described in "BERT: Pre-training of Deep Bidirectional Transformers for + Language Understanding" (https://arxiv.org/abs/1810.04805). It includes the + embedding lookups and transformer layers, but not the masked language model + or classification task networks. + + The default values for this object are taken from the BERT-Base implementation + in "BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding". + + *Note* that the network is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + vocab_size: The size of the token vocabulary. + hidden_size: The size of the transformer hidden layers. + num_layers: The number of transformer layers. + num_attention_heads: The number of attention heads for each transformer. The + hidden size must be divisible by the number of attention heads. + sequence_length: [Deprecated]. TODO(hongkuny): remove this argument once no + user is using it. + max_sequence_length: The maximum sequence length that this encoder can + consume. If None, max_sequence_length uses the value from sequence length. + This determines the variable shape for positional embeddings. + type_vocab_size: The number of types that the 'type_ids' input can take. + intermediate_size: The intermediate size for the transformer layers. + activation: The activation to use for the transformer layers. 
+ dropout_rate: The dropout rate to use for the transformer layers. + attention_dropout_rate: The dropout rate to use for the attention layers + within the transformer layers. + initializer: The initialzer to use for all weights in this encoder. + return_all_encoder_outputs: Whether to output sequence embedding outputs of + all encoder transformer layers. Note: when the following `dict_outputs` + argument is True, all encoder outputs are always returned in the dict, + keyed by `encoder_outputs`. + output_range: The sequence output range, [0, output_range), by slicing the + target sequence of the last transformer layer. `None` means the entire + target sequence will attend to the source sequence, which yields the full + output. + embedding_width: The width of the word embeddings. If the embedding width is + not equal to hidden size, embedding parameters will be factorized into two + matrices in the shape of `(vocab_size, embedding_width)` and + `(embedding_width, hidden_size)`, where `embedding_width` is usually much + smaller than `hidden_size`. + embedding_layer: The word embedding layer. `None` means we will create a new + embedding layer. Otherwise, we will reuse the given embedding layer. This + parameter is originally added for ELECTRA model which needs to tie the + generator embeddings with the discriminator embeddings. + dict_outputs: Whether to use a dictionary as the model outputs. + """ + + def __init__(self, + vocab_size, + hidden_size=768, + num_layers=12, + num_attention_heads=12, + sequence_length=None, + max_sequence_length=512, + type_vocab_size=16, + intermediate_size=3072, + activation=activations.gelu, + dropout_rate=0.1, + attention_dropout_rate=0.1, + initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), + return_all_encoder_outputs=False, + output_range=None, + embedding_width=None, + embedding_layer=None, + dict_outputs=False, + **kwargs): + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(BertEncoder, self).__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_layers=num_layers, + num_attention_heads=num_attention_heads, + max_sequence_length=max_sequence_length, + type_vocab_size=type_vocab_size, + inner_dim=intermediate_size, + inner_activation=activation, + output_dropout=dropout_rate, + attention_dropout=attention_dropout_rate, + initializer=initializer, + output_range=output_range, + embedding_width=embedding_width, + embedding_layer=embedding_layer) + + self._embedding_layer_instance = embedding_layer + + # Replace arguments from keras_nlp.encoders.BertEncoder. 
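+    # The inherited config keys are renamed back to this wrapper's legacy
+    # argument names so that the `get_config()`/`from_config()` round-trip
+    # matches the `__init__` signature defined above.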
+ config_dict = self._config._asdict() + config_dict['activation'] = config_dict.pop('inner_activation') + config_dict['intermediate_size'] = config_dict.pop('inner_dim') + config_dict['dropout_rate'] = config_dict.pop('output_dropout') + config_dict['attention_dropout_rate'] = config_dict.pop('attention_dropout') + config_dict['dict_outputs'] = dict_outputs + config_dict['return_all_encoder_outputs'] = return_all_encoder_outputs + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + + if dict_outputs: + return + else: + nested_output = self._nested_outputs + cls_output = nested_output['pooled_output'] + if return_all_encoder_outputs: + encoder_outputs = nested_output['encoder_outputs'] + outputs = [encoder_outputs, cls_output] + else: + sequence_output = nested_output['sequence_output'] + outputs = [sequence_output, cls_output] + super(keras_nlp.encoders.BertEncoder, self).__init__( + inputs=self.inputs, outputs=outputs, **kwargs) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/classification.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/classification.py new file mode 100644 index 000000000..79a8a1b01 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/classification.py @@ -0,0 +1,104 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Classification and regression network.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Text') +class Classification(tf.keras.Model): + """Classification network head for BERT modeling. + + This network implements a simple classifier head based on a dense layer. If + num_classes is one, it can be considered as a regression problem. + + *Note* that the network is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + input_width: The innermost dimension of the input tensor to this network. + num_classes: The number of classes that this network should classify to. If + equal to 1, a regression problem is assumed. + activation: The activation, if any, for the dense layer in this network. + initializer: The initializer for the dense layer in this network. Defaults + to a Glorot uniform initializer. + output: The output style for this network. Can be either `logits` or + `predictions`. 
+ """ + + def __init__(self, + input_width, + num_classes, + initializer='glorot_uniform', + output='logits', + **kwargs): + + cls_output = tf.keras.layers.Input( + shape=(input_width,), name='cls_output', dtype=tf.float32) + + logits = tf.keras.layers.Dense( + num_classes, + activation=None, + kernel_initializer=initializer, + name='predictions/transform/logits')( + cls_output) + + if output == 'logits': + output_tensors = logits + elif output == 'predictions': + policy = tf.keras.mixed_precision.global_policy() + if policy.name == 'mixed_bfloat16': + # b/158514794: bf16 is not stable with post-softmax cross-entropy. + policy = tf.float32 + output_tensors = tf.keras.layers.Activation( + tf.nn.log_softmax, dtype=policy)( + logits) + else: + raise ValueError( + ('Unknown `output` value "%s". `output` can be either "logits" or ' + '"predictions"') % output) + + super(Classification, self).__init__( + inputs=[cls_output], outputs=output_tensors, **kwargs) + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + config_dict = { + 'input_width': input_width, + 'num_classes': num_classes, + 'initializer': initializer, + 'output': output, + } + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self.logits = logits + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/encoder_scaffold.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/encoder_scaffold.py new file mode 100644 index 000000000..f01eb31ae --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/encoder_scaffold.py @@ -0,0 +1,358 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Transformer-based text encoder network.""" +# pylint: disable=g-classes-have-attributes +import copy +import inspect + +from absl import logging +import gin +import tensorflow as tf + +import keras_nlp +from nlp_modeling import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +@gin.configurable +class EncoderScaffold(tf.keras.Model): + """Bi-directional Transformer-based encoder network scaffold. 
+
+  This network allows users to flexibly implement an encoder similar to the one
+  described in "BERT: Pre-training of Deep Bidirectional Transformers for
+  Language Understanding" (https://arxiv.org/abs/1810.04805).
+
+  In this network, users can choose to provide a custom embedding subnetwork
+  (which will replace the standard embedding logic) and/or a custom hidden layer
+  class (which will replace the Transformer instantiation in the encoder). For
+  each of these custom injection points, users can pass either a class or a
+  class instance. If a class is passed, that class will be instantiated using
+  the `embedding_cfg` or `hidden_cfg` argument, respectively; if an instance
+  is passed, that instance will be invoked. (In the case of hidden_cls, the
+  instance will be invoked 'num_hidden_instances' times.)
+
+  If the hidden_cls is not overridden, a default transformer layer will be
+  instantiated.
+
+  *Note* that the network is constructed by
+  [Keras Functional API](https://keras.io/guides/functional_api/).
+
+  Args:
+    pooled_output_dim: The dimension of pooled output.
+    pooler_layer_initializer: The initializer for the classification layer.
+    embedding_cls: The class or instance to use to embed the input data. This
+      class or instance defines the inputs to this encoder and outputs (1)
+      embeddings tensor with shape `(batch_size, seq_length, hidden_size)` and
+      (2) attention masking with tensor `(batch_size, seq_length, seq_length)`.
+      If `embedding_cls` is not set, a default embedding network (from the
+      original BERT paper) will be created.
+    embedding_cfg: A dict of kwargs to pass to the embedding_cls, if it needs to
+      be instantiated. If `embedding_cls` is not set, a config dict must be
+      passed to `embedding_cfg` with the following values:
+      `vocab_size`: The size of the token vocabulary.
+      `type_vocab_size`: The size of the type vocabulary.
+      `hidden_size`: The hidden size for this encoder.
+      `max_seq_length`: The maximum sequence length for this encoder.
+      `seq_length`: The sequence length for this encoder.
+      `initializer`: The initializer for the embedding portion of this encoder.
+      `dropout_rate`: The dropout rate to apply before the encoding layers.
+    embedding_data: A reference to the embedding weights that will be used to
+      train the masked language model, if necessary. This is optional, and only
+      needed if (1) you are overriding `embedding_cls` and (2) are doing
+      standard pretraining.
+    num_hidden_instances: The number of times to instantiate and/or invoke the
+      hidden_cls.
+    hidden_cls: The class or instance to encode the input data. If `hidden_cls`
+      is not set, a KerasBERT transformer layer will be used as the encoder
+      class.
+    hidden_cfg: A dict of kwargs to pass to the hidden_cls, if it needs to be
+      instantiated. If hidden_cls is not set, a config dict must be passed to
+      `hidden_cfg` with the following values:
+      `num_attention_heads`: The number of attention heads. The hidden size
+        must be divisible by `num_attention_heads`.
+      `intermediate_size`: The intermediate size of the transformer.
+      `intermediate_activation`: The activation to apply in the transformer.
+      `dropout_rate`: The overall dropout rate for the transformer layers.
+      `attention_dropout_rate`: The dropout rate for the attention layers.
+      `kernel_initializer`: The initializer for the transformer layers.
+    mask_cls: The class to generate masks passed into hidden_cls() from inputs
+      and a 2D mask indicating positions we can attend to.
It is the caller's job + to make sure the output of the mask_layer can be used by hidden_layer. + A mask_cls is usually mapped to a hidden_cls. + mask_cfg: A dict of kwargs pass to mask_cls. + layer_norm_before_pooling: Whether to add a layer norm before the pooling + layer. You probably want to turn this on if you set `norm_first=True` in + transformer layers. + return_all_layer_outputs: Whether to output sequence embedding outputs of + all encoder transformer layers. + dict_outputs: Whether to use a dictionary as the model outputs. + layer_idx_as_attention_seed: Whether to include layer_idx in + attention_cfg in hidden_cfg. + """ + + def __init__(self, + pooled_output_dim, + pooler_layer_initializer=tf.keras.initializers.TruncatedNormal( + stddev=0.02), + embedding_cls=None, + embedding_cfg=None, + embedding_data=None, + num_hidden_instances=1, + hidden_cls=layers.Transformer, + hidden_cfg=None, + mask_cls=keras_nlp.layers.SelfAttentionMask, + mask_cfg=None, + layer_norm_before_pooling=False, + return_all_layer_outputs=False, + dict_outputs=False, + layer_idx_as_attention_seed=False, + **kwargs): + + if embedding_cls: + if inspect.isclass(embedding_cls): + embedding_network = embedding_cls( + **embedding_cfg) if embedding_cfg else embedding_cls() + else: + embedding_network = embedding_cls + inputs = embedding_network.inputs + embeddings, attention_mask = embedding_network(inputs) + embedding_layer = None + position_embedding_layer = None + type_embedding_layer = None + embedding_norm_layer = None + else: + embedding_network = None + seq_length = embedding_cfg.get('seq_length', None) + word_ids = tf.keras.layers.Input( + shape=(seq_length,), dtype=tf.int32, name='input_word_ids') + mask = tf.keras.layers.Input( + shape=(seq_length,), dtype=tf.int32, name='input_mask') + type_ids = tf.keras.layers.Input( + shape=(seq_length,), dtype=tf.int32, name='input_type_ids') + inputs = [word_ids, mask, type_ids] + + embedding_layer = keras_nlp.layers.OnDeviceEmbedding( + vocab_size=embedding_cfg['vocab_size'], + embedding_width=embedding_cfg['hidden_size'], + initializer=embedding_cfg['initializer'], + name='word_embeddings') + + word_embeddings = embedding_layer(word_ids) + + # Always uses dynamic slicing for simplicity. 
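+      # The position embedding table below is created with `max_seq_length`
+      # rows and is sliced down to the running sequence length of
+      # `word_embeddings` at call time, so shorter inputs reuse the same
+      # weights.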
+ position_embedding_layer = keras_nlp.layers.PositionEmbedding( + initializer=embedding_cfg['initializer'], + max_length=embedding_cfg['max_seq_length'], + name='position_embedding') + position_embeddings = position_embedding_layer(word_embeddings) + + type_embedding_layer = keras_nlp.layers.OnDeviceEmbedding( + vocab_size=embedding_cfg['type_vocab_size'], + embedding_width=embedding_cfg['hidden_size'], + initializer=embedding_cfg['initializer'], + use_one_hot=True, + name='type_embeddings') + type_embeddings = type_embedding_layer(type_ids) + + embeddings = tf.keras.layers.Add()( + [word_embeddings, position_embeddings, type_embeddings]) + + embedding_norm_layer = tf.keras.layers.LayerNormalization( + name='embeddings/layer_norm', + axis=-1, + epsilon=1e-12, + dtype=tf.float32) + embeddings = embedding_norm_layer(embeddings) + + embeddings = ( + tf.keras.layers.Dropout( + rate=embedding_cfg['dropout_rate'])(embeddings)) + + mask_cfg = {} if mask_cfg is None else mask_cfg + if inspect.isclass(mask_cls): + mask_layer = mask_cls(**mask_cfg) + else: + mask_layer = mask_cls + attention_mask = mask_layer(embeddings, mask) + + data = embeddings + + layer_output_data = [] + hidden_layers = [] + hidden_cfg = hidden_cfg if hidden_cfg else {} + for i in range(num_hidden_instances): + if inspect.isclass(hidden_cls): + if hidden_cfg and 'attention_cfg' in hidden_cfg and ( + layer_idx_as_attention_seed): + hidden_cfg = copy.deepcopy(hidden_cfg) + hidden_cfg['attention_cfg']['seed'] = i + layer = hidden_cls(**hidden_cfg) + else: + layer = hidden_cls + data = layer([data, attention_mask]) + layer_output_data.append(data) + hidden_layers.append(layer) + + if layer_norm_before_pooling: + # Normalize the final output. + output_layer_norm = tf.keras.layers.LayerNormalization( + name='final_layer_norm', + axis=-1, + epsilon=1e-12) + layer_output_data[-1] = output_layer_norm(layer_output_data[-1]) + + last_layer_output = layer_output_data[-1] + # Applying a tf.slice op (through subscript notation) to a Keras tensor + # like this will create a SliceOpLambda layer. This is better than a Lambda + # layer with Python code, because that is fundamentally less portable. + first_token_tensor = last_layer_output[:, 0, :] + pooler_layer = tf.keras.layers.Dense( + units=pooled_output_dim, + activation='tanh', + kernel_initializer=pooler_layer_initializer, + name='cls_transform') + cls_output = pooler_layer(first_token_tensor) + + if dict_outputs: + outputs = dict( + sequence_output=layer_output_data[-1], + pooled_output=cls_output, + encoder_outputs=layer_output_data, + ) + elif return_all_layer_outputs: + outputs = [layer_output_data, cls_output] + else: + outputs = [layer_output_data[-1], cls_output] + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. 
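+    # At this point `outputs` is one of: a dict with 'sequence_output',
+    # 'pooled_output' and 'encoder_outputs' (when `dict_outputs=True`), the
+    # pair [all layer outputs, cls_output] (when `return_all_layer_outputs`
+    # is True), or the pair [last layer output, cls_output] by default.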
+ super(EncoderScaffold, self).__init__( + inputs=inputs, outputs=outputs, **kwargs) + + self._hidden_cls = hidden_cls + self._hidden_cfg = hidden_cfg + self._mask_cls = mask_cls + self._mask_cfg = mask_cfg + self._num_hidden_instances = num_hidden_instances + self._pooled_output_dim = pooled_output_dim + self._pooler_layer_initializer = pooler_layer_initializer + self._embedding_cls = embedding_cls + self._embedding_cfg = embedding_cfg + self._embedding_data = embedding_data + self._layer_norm_before_pooling = layer_norm_before_pooling + self._return_all_layer_outputs = return_all_layer_outputs + self._dict_outputs = dict_outputs + self._kwargs = kwargs + + self._embedding_layer = embedding_layer + self._embedding_network = embedding_network + self._position_embedding_layer = position_embedding_layer + self._type_embedding_layer = type_embedding_layer + self._embedding_norm_layer = embedding_norm_layer + self._hidden_layers = hidden_layers + if self._layer_norm_before_pooling: + self._output_layer_norm = output_layer_norm + self._pooler_layer = pooler_layer + self._layer_idx_as_attention_seed = layer_idx_as_attention_seed + + logging.info('EncoderScaffold configs: %s', self.get_config()) + + def get_config(self): + config_dict = { + 'num_hidden_instances': self._num_hidden_instances, + 'pooled_output_dim': self._pooled_output_dim, + 'pooler_layer_initializer': self._pooler_layer_initializer, + 'embedding_cls': self._embedding_network, + 'embedding_cfg': self._embedding_cfg, + 'layer_norm_before_pooling': self._layer_norm_before_pooling, + 'return_all_layer_outputs': self._return_all_layer_outputs, + 'dict_outputs': self._dict_outputs, + 'layer_idx_as_attention_seed': self._layer_idx_as_attention_seed + } + cfgs = { + 'hidden_cfg': self._hidden_cfg, + 'mask_cfg': self._mask_cfg + } + + for cfg_name, cfg in cfgs.items(): + if cfg: + config_dict[cfg_name] = {} + for k, v in cfg.items(): + # `self._hidden_cfg` may contain `class`, e.g., when `hidden_cfg` is + # `TransformerScaffold`, `attention_cls` argument can be a `class`. + if inspect.isclass(v): + config_dict[cfg_name][k] = tf.keras.utils.get_registered_name(v) + else: + config_dict[cfg_name][k] = v + + clss = { + 'hidden_cls': self._hidden_cls, + 'mask_cls': self._mask_cls + } + + for cls_name, cls in clss.items(): + if inspect.isclass(cls): + key = '{}_string'.format(cls_name) + config_dict[key] = tf.keras.utils.get_registered_name(cls) + else: + config_dict[cls_name] = cls + + config_dict.update(self._kwargs) + return config_dict + + @classmethod + def from_config(cls, config, custom_objects=None): + cls_names = ['hidden_cls', 'mask_cls'] + for cls_name in cls_names: + cls_string = '{}_string'.format(cls_name) + if cls_string in config: + config[cls_name] = tf.keras.utils.get_registered_object( + config[cls_string], custom_objects=custom_objects) + del config[cls_string] + return cls(**config) + + def get_embedding_table(self): + if self._embedding_network is None: + # In this case, we don't have a custom embedding network and can return + # the standard embedding data. + return self._embedding_layer.embeddings + + if self._embedding_data is None: + raise RuntimeError(('The EncoderScaffold %s does not have a reference ' + 'to the embedding data. This is required when you ' + 'pass a custom embedding network to the scaffold. ' + 'It is also possible that you are trying to get ' + 'embedding data from an embedding scaffold with a ' + 'custom embedding network where the scaffold has ' + 'been serialized and deserialized. 
Unfortunately, ' + 'accessing custom embedding references after ' + 'serialization is not yet supported.') % self.name) + else: + return self._embedding_data + + @property + def hidden_layers(self): + """List of hidden layers in the encoder.""" + return self._hidden_layers + + @property + def pooler_layer(self): + """The pooler dense layer after the transformer layers.""" + return self._pooler_layer diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/mobile_bert_encoder.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/mobile_bert_encoder.py new file mode 100644 index 000000000..7fd780f74 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/mobile_bert_encoder.py @@ -0,0 +1,185 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MobileBERT text encoder network.""" +import gin +import tensorflow as tf + +import keras_nlp +from nlp_modeling import layers + + +@gin.configurable +class MobileBERTEncoder(tf.keras.Model): + """A Keras functional API implementation for MobileBERT encoder.""" + + def __init__(self, + word_vocab_size=30522, + word_embed_size=128, + type_vocab_size=2, + max_sequence_length=512, + num_blocks=24, + hidden_size=512, + num_attention_heads=4, + intermediate_size=512, + intermediate_act_fn='relu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + intra_bottleneck_size=128, + initializer_range=0.02, + use_bottleneck_attention=False, + key_query_shared_bottleneck=True, + num_feedforward_networks=4, + normalization_type='no_norm', + classifier_activation=False, + input_mask_dtype='int32', + **kwargs): + """Class initialization. + + Args: + word_vocab_size: Number of words in the vocabulary. + word_embed_size: Word embedding size. + type_vocab_size: Number of word types. + max_sequence_length: Maximum length of input sequence. + num_blocks: Number of transformer block in the encoder model. + hidden_size: Hidden size for the transformer block. + num_attention_heads: Number of attention heads in the transformer block. + intermediate_size: The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: Dropout probability for the hidden layers. + attention_probs_dropout_prob: Dropout probability of the attention + probabilities. + intra_bottleneck_size: Size of bottleneck. + initializer_range: The stddev of the `truncated_normal_initializer` for + initializing all weight matrices. + use_bottleneck_attention: Use attention inputs from the bottleneck + transformation. If true, the following `key_query_shared_bottleneck` + will be ignored. + key_query_shared_bottleneck: Whether to share linear transformation for + keys and queries. + num_feedforward_networks: Number of stacked feed-forward networks. 
+      normalization_type: The type of normalization, only `no_norm` and
+        `layer_norm` are supported. `no_norm` represents the element-wise
+        linear transformation for the student model, as suggested by the
+        original MobileBERT paper. `layer_norm` is used for the teacher model.
+      classifier_activation: Whether to use the tanh activation for the final
+        representation of the `[CLS]` token in fine-tuning.
+      input_mask_dtype: The dtype of `input_mask` tensor, which is one of the
+        input tensors of this encoder. Defaults to `int32`. If you want
+        to use `tf.lite` quantization, which does not support `Cast` op,
+        please set this argument to `tf.float32` and feed `input_mask`
+        tensor with values in `float32` to avoid `tf.cast` in the computation.
+      **kwargs: Other keyword arguments.
+    """
+    self._self_setattr_tracking = False
+    initializer = tf.keras.initializers.TruncatedNormal(
+        stddev=initializer_range)
+
+    # layer instantiation
+    self.embedding_layer = layers.MobileBertEmbedding(
+        word_vocab_size=word_vocab_size,
+        word_embed_size=word_embed_size,
+        type_vocab_size=type_vocab_size,
+        output_embed_size=hidden_size,
+        max_sequence_length=max_sequence_length,
+        normalization_type=normalization_type,
+        initializer=initializer,
+        dropout_rate=hidden_dropout_prob)
+
+    self._transformer_layers = []
+    for layer_idx in range(num_blocks):
+      transformer = layers.MobileBertTransformer(
+          hidden_size=hidden_size,
+          num_attention_heads=num_attention_heads,
+          intermediate_size=intermediate_size,
+          intermediate_act_fn=intermediate_act_fn,
+          hidden_dropout_prob=hidden_dropout_prob,
+          attention_probs_dropout_prob=attention_probs_dropout_prob,
+          intra_bottleneck_size=intra_bottleneck_size,
+          use_bottleneck_attention=use_bottleneck_attention,
+          key_query_shared_bottleneck=key_query_shared_bottleneck,
+          num_feedforward_networks=num_feedforward_networks,
+          normalization_type=normalization_type,
+          initializer=initializer,
+          name=f'transformer_layer_{layer_idx}')
+      self._transformer_layers.append(transformer)
+
+    # input tensor
+    input_ids = tf.keras.layers.Input(
+        shape=(None,), dtype=tf.int32, name='input_word_ids')
+    input_mask = tf.keras.layers.Input(
+        shape=(None,), dtype=input_mask_dtype, name='input_mask')
+    type_ids = tf.keras.layers.Input(
+        shape=(None,), dtype=tf.int32, name='input_type_ids')
+    self.inputs = [input_ids, input_mask, type_ids]
+
+    # The dtype of `attention_mask` will be the same as the dtype of `input_mask`.
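+    # Rough illustration (assumed semantics of `SelfAttentionMask`): for an
+    # input_mask row [1, 1, 0], the call below yields a [batch, seq, seq]
+    # mask whose last column is all zeros, i.e. no position attends to the
+    # padding token.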
+ attention_mask = keras_nlp.layers.SelfAttentionMask()(input_mask, + input_mask) + + # build the computation graph + all_layer_outputs = [] + all_attention_scores = [] + embedding_output = self.embedding_layer(input_ids, type_ids) + all_layer_outputs.append(embedding_output) + prev_output = embedding_output + + for layer_idx in range(num_blocks): + layer_output, attention_score = self._transformer_layers[layer_idx]( + prev_output, + attention_mask, + return_attention_scores=True) + all_layer_outputs.append(layer_output) + all_attention_scores.append(attention_score) + prev_output = layer_output + first_token = tf.squeeze(prev_output[:, 0:1, :], axis=1) + + if classifier_activation: + self._pooler_layer = tf.keras.layers.experimental.EinsumDense( + 'ab,bc->ac', + output_shape=hidden_size, + activation=tf.tanh, + bias_axes='c', + kernel_initializer=initializer, + name='pooler') + first_token = self._pooler_layer(first_token) + else: + self._pooler_layer = None + + outputs = dict( + sequence_output=prev_output, + pooled_output=first_token, + encoder_outputs=all_layer_outputs, + attention_scores=all_attention_scores) + + super(MobileBERTEncoder, self).__init__( + inputs=self.inputs, outputs=outputs, **kwargs) + + def get_embedding_table(self): + return self.embedding_layer.word_embedding.embeddings + + def get_embedding_layer(self): + return self.embedding_layer.word_embedding + + @property + def transformer_layers(self): + """List of Transformer layers in the encoder.""" + return self._transformer_layers + + @property + def pooler_layer(self): + """The pooler dense layer after the transformer layers.""" + return self._pooler_layer diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/packed_sequence_embedding.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/packed_sequence_embedding.py new file mode 100644 index 000000000..1f1dd429b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/packed_sequence_embedding.py @@ -0,0 +1,319 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""An embedding network supporting packed sequences and position ids.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + +from modeling import tf_utils +import keras_nlp +from nlp_modeling import layers + + +@tf.keras.utils.register_keras_serializable(package='Text') +class PackedSequenceEmbedding(tf.keras.Model): + """An embedding network supporting packed sequences and position ids. + + This network implements an embedding layer similar to the one described in + "BERT: Pre-training of Deep Bidirectional Transformers for Language + Understanding" (https://arxiv.org/abs/1810.04805). On top of it, it supports + to (1) pack multiple sequences into one sequence and (2) allow additional + "position_ids" as input. + + Args: + vocab_size: The size of the token vocabulary. + type_vocab_size: The size of the type vocabulary. 
+ embedding_width: Width of token embeddings. + hidden_size: The output size for this encoder. + max_seq_length: The maximum sequence length for this encoder. + initializer: The initializer for the embedding portion of this encoder. + dropout_rate: The dropout rate to apply before the encoding layers. + pack_multiple_sequences: If `True`, we can feed multiple sequences into one + sequence for training and inference (they don't impact each other). + use_position_id: Whether to expect `position_ids` as an input to the + network. If False, the `position_ids` will be inferred: (1) when + pack_multiple_sequences is False, we assume the position ids are `0, 1, + 2, ..., seq_length - 1`; (2) when `pack_multiple_sequences` is `True`, + there may be multiple sub sequences, and for each sub sequence, its + position ids start from 0, 1, 2, ... + """ + + def __init__(self, + vocab_size, + type_vocab_size, + embedding_width, + hidden_size, + max_seq_length, + initializer, + dropout_rate, + use_position_id=False, + pack_multiple_sequences=False, + **kwargs): + initializer = tf.keras.initializers.get(initializer) + config_dict = { + 'vocab_size': vocab_size, + 'type_vocab_size': type_vocab_size, + 'embedding_width': embedding_width, + 'hidden_size': hidden_size, + 'max_seq_length': max_seq_length, + 'initializer': tf.keras.initializers.serialize(initializer), + 'dropout_rate': dropout_rate, + 'use_position_id': use_position_id, + 'pack_multiple_sequences': pack_multiple_sequences, + } + + word_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_word_ids') + mask = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_mask') + type_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_type_ids') + inputs = { + 'input_word_ids': word_ids, + 'input_mask': mask, + 'input_type_ids': type_ids, + } + if use_position_id: + position_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='position_ids') + inputs['position_ids'] = position_ids + else: + position_ids = None + + if pack_multiple_sequences: + sub_seq_mask = PackedSequenceMask()(word_ids) + else: + sub_seq_mask = None + + embedding_layer = layers.OnDeviceEmbedding( + vocab_size=vocab_size, + embedding_width=embedding_width, + initializer=initializer, + name='word_embeddings') + word_embeddings = embedding_layer(word_ids) + + # Always uses dynamic slicing for simplicity. 
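+    # Illustrative note: with packing enabled and word ids such as
+    # [CLS] A B [CLS] C, the sub-sequence mask lets the position embedding
+    # below infer per-sub-sequence positions [0, 1, 2, 0, 1] rather than
+    # the global positions [0, 1, 2, 3, 4].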
+ position_embedding_layer = PositionEmbeddingWithSubSeqMask( + initializer=initializer, + use_dynamic_slicing=True, + max_sequence_length=max_seq_length, + name='position_embedding') + position_embeddings = position_embedding_layer( + word_embeddings, position_ids, sub_seq_mask) + + type_embeddings = ( + layers.OnDeviceEmbedding( + vocab_size=type_vocab_size, + embedding_width=embedding_width, + initializer=initializer, + use_one_hot=True, + name='type_embeddings')(type_ids)) + + embeddings = tf.keras.layers.Add()( + [word_embeddings, position_embeddings, type_embeddings]) + embeddings = tf.keras.layers.LayerNormalization( + name='embeddings/layer_norm', axis=-1, epsilon=1e-12, dtype=tf.float32)( + embeddings) + embeddings = tf.keras.layers.Dropout( + rate=dropout_rate, dtype=tf.float32)( + embeddings) + + if embedding_width != hidden_size: + embeddings = tf.keras.layers.experimental.EinsumDense( + '...x,xy->...y', + output_shape=hidden_size, + bias_axes=None, + kernel_initializer=initializer, + name='embedding_projection')( + embeddings) + + attention_mask = keras_nlp.layers.SelfAttentionMask()(embeddings, mask) + if sub_seq_mask is not None: + attention_mask = tf.keras.layers.Lambda( + lambda x: x[0] * tf.cast(x[1], x[0].dtype))( + [attention_mask, sub_seq_mask]) + + outputs = [embeddings, attention_mask] + super(PackedSequenceEmbedding, self).__init__( + inputs=inputs, outputs=outputs, **kwargs) + # TF does not track immutable attrs which do not contain Trackables, + # so by creating a config namedtuple instead of a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self._embedding_layer = embedding_layer + self._position_embedding_layer = position_embedding_layer + + def get_embedding_table(self): + return self._embedding_layer.embeddings + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + +@tf.keras.utils.register_keras_serializable(package='Text') +class PackedSequenceMask(tf.keras.layers.Layer): + """A layer to create a mask to indicate multiple sub sequences.""" + + def call(self, input_ids): + """Implements call() for the layer. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length]. + + Returns: + boolean Tensor of shape [batch_size, seq_length, seq_length]. [x, y, z] + is True if for x'th instance in a batch, y'th token and z'th token are + from the same sub sequence. + """ + # Suppose + # - the first token in the parent sequence is [CLS]. + # - every sequence starts from [CLS]. + # - every sequence only contains one [CLS]. + seq_start_token = input_ids[:, 0:1] + seq_start_loc = tf.cast(tf.equal(input_ids, seq_start_token), tf.int32) + # Set different ids for different sub sequences. + seq_ids = tf.expand_dims(tf.cumsum(seq_start_loc, -1), -1) + return tf.equal(seq_ids, tf.transpose(seq_ids, [0, 2, 1])) + + +@tf.keras.utils.register_keras_serializable(package='Text') +class PositionEmbeddingWithSubSeqMask(tf.keras.layers.Layer): + """Creates a positional embedding with sub-sequence masking. + + This layer creates a positional embedding as described in "BERT: Pre-training + of Deep Bidirectional Transformers for Language Understanding" + (https://arxiv.org/abs/1810.04805). On top of it, it supports + `position_ids` and `sub_sequence_mask` tensors. + + This layer can be set up to either create a statically shaped slice or a + dynamically shaped slice. 
If `use_dynamic_slicing` is True, the input tensor + can have a dynamic 1st dimension, while if `use_dynamic_slicing` is False the + input size must be fixed. + + Args: + initializer: The initializer to use for the embedding weights. Defaults to + "glorot_uniform". + use_dynamic_slicing: Whether to use the dynamic slicing path. + max_sequence_length: The maximum size of the dynamic sequence. Only + applicable if `use_dynamic_slicing` is True. + """ + + def __init__(self, + initializer='glorot_uniform', + use_dynamic_slicing=False, + max_sequence_length=None, + **kwargs): + # We need to have a default dtype of float32, since the inputs (which Keras + # usually uses to infer the dtype) will always be int32. + if 'dtype' not in kwargs: + kwargs['dtype'] = 'float32' + + super(PositionEmbeddingWithSubSeqMask, self).__init__(**kwargs) + if use_dynamic_slicing and max_sequence_length is None: + raise ValueError( + 'If `use_dynamic_slicing` is True, `max_sequence_length` must be set.' + ) + self._max_sequence_length = max_sequence_length + self._initializer = tf.keras.initializers.get(initializer) + self._use_dynamic_slicing = use_dynamic_slicing + + def get_config(self): + config = { + 'max_sequence_length': self._max_sequence_length, + 'initializer': tf.keras.initializers.serialize(self._initializer), + 'use_dynamic_slicing': self._use_dynamic_slicing, + } + base_config = super(PositionEmbeddingWithSubSeqMask, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def build(self, input_shape): + """Implements build() for the layer.""" + dimension_list = input_shape.as_list() + + if len(dimension_list) != 3: + raise ValueError('PositionEmbedding expects a 3-dimensional input tensor ' + 'of shape [batch, sequence, width]') + seq_length = dimension_list[1] + width = dimension_list[2] + + # If we are not using dynamic slicing, we must assume that the sequence + # length is fixed and max_sequence_length should not be specified. + if not self._use_dynamic_slicing: + if seq_length is None: + raise ValueError( + 'PositionEmbedding must have `use_dynamic_slicing` set ' + 'to True (and max_sequence_length set) when the ' + 'sequence (1st) dimension of the input is None.') + if self._max_sequence_length is not None: + raise ValueError( + 'When `use_dynamic_slicing` is False, max_sequence_length should ' + 'not be specified and we ought to use seq_length to get the ' + 'variable shape.') + + if self._max_sequence_length is not None: + weight_sequence_length = self._max_sequence_length + else: + weight_sequence_length = seq_length + + self._position_embeddings = self.add_weight( + 'embeddings', + shape=[weight_sequence_length, width], + initializer=self._initializer) + + super(PositionEmbeddingWithSubSeqMask, self).build(input_shape) + + def call(self, inputs, position_ids=None, sub_sequence_mask=None): + """Implements call() for the layer. + + When `position_ids` is specified, it will return the position embeddings + corresponding to this `position_ids`; otherwise, `position_ids` will be + inferred in the following way: + + (1) When `sub_sequence_mask` is None, we assume the position ids are + 0, 1, 2, ..., seq_length - 1. + (2) When `sub_sequence_mask` is specified, there may be multiple sub + sequences, and for each sub sequence, its position ids start from + 0, 1, 2, ... + + Args: + inputs: Word embeddings in shape [batch, seq_length, embedding_dim]. + position_ids: An optional int32 tensor in shape [batch, seq_length]. 
+ sub_sequence_mask: An optional bool tensor in shape [batch, seq_length, + seq_length]. [x, y, z] is True if for x'th instance in a batch, y'th + token and z'th token are from the same sub sequence. + + Returns: + The position embeddings in shape [batch, seq_length, embedding_dim]. + """ + input_shape = tf_utils.get_shape_list(inputs, expected_rank=3) + if self._use_dynamic_slicing: + position_embeddings = self._position_embeddings[:input_shape[1], :] + else: + position_embeddings = self._position_embeddings + + if position_ids is not None: + return tf.gather(position_embeddings, position_ids) + + if sub_sequence_mask is None: + return tf.broadcast_to(position_embeddings, input_shape) + else: + sub_sequence_mask = tf.cast(sub_sequence_mask, tf.int32) + # For each sub sequence, its position ids start from 0, 1, 2, ... + position_ids = tf.linalg.diag_part(tf.cumsum(sub_sequence_mask, -1)) - 1 + return tf.gather(position_embeddings, position_ids) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/span_labeling.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/span_labeling.py new file mode 100644 index 000000000..efbf69d19 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/span_labeling.py @@ -0,0 +1,338 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Span labeling network.""" +# pylint: disable=g-classes-have-attributes +import collections +import tensorflow as tf + + +def _apply_paragraph_mask(logits, paragraph_mask): + """Applies a position mask to calculated logits.""" + masked_logits = logits * (paragraph_mask) - 1e30 * (1 - paragraph_mask) + return tf.nn.log_softmax(masked_logits, -1), masked_logits + + +@tf.keras.utils.register_keras_serializable(package='Text') +class SpanLabeling(tf.keras.Model): + """Span labeling network head for BERT modeling. + + This network implements a simple single-span labeler based on a dense layer. + *Note* that the network is constructed by + [Keras Functional API](https://keras.io/guides/functional_api/). + + Args: + input_width: The innermost dimension of the input tensor to this network. + activation: The activation, if any, for the dense layer in this network. + initializer: The initializer for the dense layer in this network. Defaults + to a Glorot uniform initializer. + output: The output style for this network. Can be either `logits` or + `predictions`. + """ + + def __init__(self, + input_width, + activation=None, + initializer='glorot_uniform', + output='logits', + **kwargs): + + sequence_data = tf.keras.layers.Input( + shape=(None, input_width), name='sequence_data', dtype=tf.float32) + + intermediate_logits = tf.keras.layers.Dense( + 2, # This layer predicts start location and end location. 
+ activation=activation, + kernel_initializer=initializer, + name='predictions/transform/logits')( + sequence_data) + start_logits, end_logits = self._split_output_tensor(intermediate_logits) + + start_predictions = tf.keras.layers.Activation(tf.nn.log_softmax)( + start_logits) + end_predictions = tf.keras.layers.Activation(tf.nn.log_softmax)(end_logits) + + if output == 'logits': + output_tensors = [start_logits, end_logits] + elif output == 'predictions': + output_tensors = [start_predictions, end_predictions] + else: + raise ValueError( + ('Unknown `output` value "%s". `output` can be either "logits" or ' + '"predictions"') % output) + + # b/164516224 + # Once we've created the network using the Functional API, we call + # super().__init__ as though we were invoking the Functional API Model + # constructor, resulting in this object having all the properties of a model + # created using the Functional API. Once super().__init__ is called, we + # can assign attributes to `self` - note that all `self` assignments are + # below this line. + super(SpanLabeling, self).__init__( + inputs=[sequence_data], outputs=output_tensors, **kwargs) + config_dict = { + 'input_width': input_width, + 'activation': activation, + 'initializer': initializer, + 'output': output, + } + # We are storing the config dict as a namedtuple here to ensure checkpoint + # compatibility with an earlier version of this model which did not track + # the config dict attribute. TF does not track immutable attrs which + # do not contain Trackables, so by creating a config namedtuple instead of + # a dict we avoid tracking it. + config_cls = collections.namedtuple('Config', config_dict.keys()) + self._config = config_cls(**config_dict) + self.start_logits = start_logits + self.end_logits = end_logits + + def _split_output_tensor(self, tensor): + transposed_tensor = tf.transpose(tensor, [2, 0, 1]) + return tf.unstack(transposed_tensor) + + def get_config(self): + return dict(self._config._asdict()) + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) + + +class XLNetSpanLabeling(tf.keras.layers.Layer): + """Span labeling network head for XLNet on SQuAD2.0. + + This networks implements a span-labeler based on dense layers and question + possibility classification. This is the complex version seen in the original + XLNet implementation. + + This applies a dense layer to the input sequence data to predict the start + positions, and then uses either the true start positions (if training) or + beam search to predict the end positions. + + **Note: `compute_with_beam_search` will not work with the Functional API + (https://www.tensorflow.org/guide/keras/functional). + + Args: + input_width: The innermost dimension of the input tensor to this network. + start_n_top: Beam size for span start. + end_n_top: Beam size for span end. + activation: The activation, if any, for the dense layer in this network. + dropout_rate: The dropout rate used for answer classification. + initializer: The initializer for the dense layer in this network. Defaults + to a Glorot uniform initializer. 
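+
+  A minimal training-time sketch (illustrative only; the shapes and values
+  below are assumptions, not taken from this change):
+
+    span_head = XLNetSpanLabeling(input_width=768)
+    outputs = span_head(sequence_data=tf.ones((2, 128, 768)),
+                        class_index=tf.zeros((2,), dtype=tf.int32),
+                        paragraph_mask=tf.ones((2, 128)),
+                        start_positions=tf.zeros((2,), dtype=tf.int32),
+                        training=True)
+    # `outputs` maps 'start_logits', 'end_logits', 'start_predictions',
+    # 'end_predictions' and 'class_logits' to tensors.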
+ """ + + def __init__(self, + input_width, + start_n_top=5, + end_n_top=5, + activation='tanh', + dropout_rate=0., + initializer='glorot_uniform', + **kwargs): + super().__init__(**kwargs) + self._config = { + 'input_width': input_width, + 'activation': activation, + 'initializer': initializer, + 'start_n_top': start_n_top, + 'end_n_top': end_n_top, + 'dropout_rate': dropout_rate, + } + if start_n_top <= 1: + raise ValueError('`start_n_top` must be greater than 1.') + self._start_n_top = start_n_top + self._end_n_top = end_n_top + self.start_logits_dense = tf.keras.layers.Dense( + units=1, + kernel_initializer=initializer, + name='predictions/transform/start_logits') + + self.end_logits_inner_dense = tf.keras.layers.Dense( + units=input_width, + kernel_initializer=initializer, + activation=activation, + name='predictions/transform/end_logits/inner') + self.end_logits_layer_norm = tf.keras.layers.LayerNormalization( + axis=-1, epsilon=1e-12, + name='predictions/transform/end_logits/layernorm') + self.end_logits_output_dense = tf.keras.layers.Dense( + units=1, + kernel_initializer=initializer, + name='predictions/transform/end_logits/output') + + self.answer_logits_inner = tf.keras.layers.Dense( + units=input_width, + kernel_initializer=initializer, + activation=activation, + name='predictions/transform/answer_logits/inner') + self.answer_logits_dropout = tf.keras.layers.Dropout(rate=dropout_rate) + self.answer_logits_output = tf.keras.layers.Dense( + units=1, + kernel_initializer=initializer, + use_bias=False, + name='predictions/transform/answer_logits/output') + + def end_logits(self, inputs): + """Computes the end logits. + + Input shapes into the inner, layer norm, output layers should match. + + During training, inputs shape should be + [batch_size, seq_length, input_width]. + + During inference, input shapes should be + [batch_size, seq_length, start_n_top, input_width]. + + Args: + inputs: The input for end logits. + + Returns: + Calculated end logits. + + """ + if len(tf.shape(inputs)) == 3: + # inputs: [B, S, H] -> [B, S, 1, H] + inputs = tf.expand_dims(inputs, axis=2) + + end_logits = self.end_logits_inner_dense(inputs) + end_logits = self.end_logits_layer_norm(end_logits) + end_logits = self.end_logits_output_dense(end_logits) + end_logits = tf.squeeze(end_logits) + return end_logits + + def call(self, + sequence_data, + class_index, + paragraph_mask=None, + start_positions=None, + training=False): + """Implements call(). + + Einsum glossary: + - b: the batch size. + - l: the sequence length. + - h: the hidden size, or input width. + - k: the start/end top n. + + Args: + sequence_data: The input sequence data of shape + `(batch_size, seq_length, input_width)`. + class_index: The class indices of the inputs of shape `(batch_size,)`. + paragraph_mask: Invalid position mask such as query and special symbols + (e.g. PAD, SEP, CLS) of shape `(batch_size,)`. + start_positions: The start positions of each example of shape + `(batch_size,)`. + training: Whether or not this is the training phase. + + Returns: + A dictionary with the keys `start_predictions`, `end_predictions`, + `start_logits`, `end_logits`. + + If inference, then `start_top_predictions`, `start_top_index`, + `end_top_predictions`, `end_top_index` are also included. 
+ + """ + paragraph_mask = tf.cast(paragraph_mask, dtype=sequence_data.dtype) + class_index = tf.reshape(class_index, [-1]) + + seq_length = tf.shape(sequence_data)[1] + start_logits = self.start_logits_dense(sequence_data) + start_logits = tf.squeeze(start_logits, -1) + start_predictions, masked_start_logits = _apply_paragraph_mask( + start_logits, paragraph_mask) + + compute_with_beam_search = not training or start_positions is None + + if compute_with_beam_search: + # Compute end logits using beam search. + start_top_predictions, start_top_index = tf.nn.top_k( + start_predictions, k=self._start_n_top) + start_index = tf.one_hot( + start_top_index, depth=seq_length, axis=-1, dtype=tf.float32) + # start_index: [batch_size, end_n_top, seq_length] + + start_features = tf.einsum('blh,bkl->bkh', sequence_data, start_index) + start_features = tf.tile(start_features[:, None, :, :], + [1, seq_length, 1, 1]) + # start_features: [batch_size, seq_length, end_n_top, input_width] + + end_input = tf.tile(sequence_data[:, :, None], + [1, 1, self._start_n_top, 1]) + end_input = tf.concat([end_input, start_features], axis=-1) + # end_input: [batch_size, seq_length, end_n_top, 2*input_width] + paragraph_mask = paragraph_mask[:, None, :] + end_logits = self.end_logits(end_input) + + # Note: this will fail if start_n_top is not >= 1. + end_logits = tf.transpose(end_logits, [0, 2, 1]) + else: + start_positions = tf.reshape(start_positions, [-1]) + start_index = tf.one_hot( + start_positions, depth=seq_length, axis=-1, dtype=tf.float32) + # start_index: [batch_size, seq_length] + + start_features = tf.einsum('blh,bl->bh', sequence_data, start_index) + start_features = tf.tile(start_features[:, None, :], [1, seq_length, 1]) + # start_features: [batch_size, seq_length, input_width] + + end_input = tf.concat([sequence_data, start_features], + axis=-1) + # end_input: [batch_size, seq_length, 2*input_width] + end_logits = self.end_logits(end_input) + end_predictions, masked_end_logits = _apply_paragraph_mask( + end_logits, paragraph_mask) + + output_dict = dict( + start_predictions=start_predictions, + end_predictions=end_predictions, + start_logits=masked_start_logits, + end_logits=masked_end_logits) + + if not training: + end_top_predictions, end_top_index = tf.nn.top_k( + end_predictions, k=self._end_n_top) + end_top_predictions = tf.reshape( + end_top_predictions, + [-1, self._start_n_top * self._end_n_top]) + end_top_index = tf.reshape( + end_top_index, + [-1, self._start_n_top * self._end_n_top]) + output_dict['start_top_predictions'] = start_top_predictions + output_dict['start_top_index'] = start_top_index + output_dict['end_top_predictions'] = end_top_predictions + output_dict['end_top_index'] = end_top_index + + # get the representation of CLS + class_index = tf.one_hot(class_index, seq_length, axis=-1, dtype=tf.float32) + class_feature = tf.einsum('blh,bl->bh', sequence_data, class_index) + + # get the representation of START + start_p = tf.nn.softmax(masked_start_logits, axis=-1) + start_feature = tf.einsum('blh,bl->bh', sequence_data, start_p) + + answer_feature = tf.concat([start_feature, class_feature], -1) + answer_feature = self.answer_logits_inner(answer_feature) + answer_feature = self.answer_logits_dropout(answer_feature) + class_logits = self.answer_logits_output(answer_feature) + class_logits = tf.squeeze(class_logits, -1) + output_dict['class_logits'] = class_logits + return output_dict + + def get_config(self): + return self._config + + @classmethod + def from_config(cls, config, 
custom_objects=None): + return cls(**config) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/xlnet_base.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/xlnet_base.py new file mode 100644 index 000000000..2f6c4a61a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/networks/xlnet_base.py @@ -0,0 +1,709 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Keras-based XLNet Model.""" + +from absl import logging + +import tensorflow as tf + +from nlp_modeling import layers +from nlp_modeling.layers import transformer_xl + +_SEG_ID_CLS = 2 + + +def _create_causal_attention_mask( + seq_length, + memory_length, + dtype=tf.float32, + same_length=False): + """Creates a causal attention mask with a single-sided context. + + When applying the attention mask in `MultiHeadRelativeAttention`, the + attention scores are of shape `[(batch dimensions), S, S + M]`, where: + - S = sequence length. + - M = memory length. + + In a simple case where S = 2, M = 1, here is a simple illustration of the + `attention_scores` matrix, where `a` represents an attention function: + + token_0 [[a(token_0, mem_0) a(token_0, token_0) a(token_0, token_1)], + token_1 [a(token_1, mem_0) a(token_1, token_0) a(token_1, token_1)]] + mem_0 token_0 token_1 + + For uni-directional attention, we want to mask out values in the attention + scores that represent a(token_i, token_j) where j > i. We can achieve this by + concatenating 0s (representing memory positions) with a strictly upper + triangular matrix of 1s. + + We then flip the matrix values in order to match the representation where + real values are 1s. + + Args: + seq_length: int, The length of each sequence. + memory_length: int, The length of memory blocks. + dtype: dtype of the mask. + same_length: bool, whether to use the same attention length for each token. + + Returns: + A unidirectional attention mask of shape + `[seq_length, seq_length + memory_length]`. E.g.: + + [[1. 1. 1. 0. 0. 0.] + [1. 1. 1. 1. 0. 0.] + [1. 1. 1. 1. 1. 0.] + [1. 1. 1. 1. 1. 1.]] + """ + ones_matrix = tf.ones([seq_length, seq_length], dtype=dtype) + upper_triangular = tf.linalg.band_part(ones_matrix, 0, -1) + diagonal = tf.linalg.band_part(ones_matrix, 0, 0) + + padding = tf.zeros([seq_length, memory_length], dtype=dtype) + causal_attention_mask = tf.concat( + [padding, upper_triangular - diagonal], 1) + if same_length: + lower_triangular = tf.linalg.band_part(ones_matrix, -1, 0) + strictly_lower_triangular = lower_triangular - diagonal + causal_attention_mask = tf.concat( + [causal_attention_mask[:, :seq_length] + strictly_lower_triangular, + causal_attention_mask[:, seq_length:]], 1) + + return 1 - causal_attention_mask + + +def _combine_masks(mask1, mask2, dtype, how="and"): + """Combines two masks. + + Use "and" if trying to combine two existing masks. + Use "or" if trying to flip a few positions to "real". 
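+
+  For example (illustrative): with mask1 = [0., 1.] and mask2 = [1., 1.],
+  "and" yields [0., 1.] while "or" yields [1., 1.].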
+ + Args: + mask1: tf.Tensor, input mask 1 + mask2: tf.Tensor, input mask 2 + dtype: tf.dtype + how: Which logical operation should run. + + Returns: + The combined input masks. + + """ + if how == "and": + operator = tf.math.logical_and + else: + operator = tf.math.logical_or + return tf.cast(operator( + tf.cast(mask1, tf.bool), + tf.cast(mask2, tf.bool)), dtype=dtype) + + +def _compute_attention_mask( + input_mask, + permutation_mask, + attention_type, + seq_length, + memory_length, + batch_size, + dtype=tf.float32): + """Combines all input attention masks for XLNet. + + In XLNet modeling, `0` represents tokens that can be attended, and `1` + represents tokens that cannot be attended. + + For XLNet pre-training and fine tuning, there are a few masks used: + - Causal attention mask: If the attention type is unidirectional, then all + tokens after the current position cannot be attended to. + - Input mask: when generating data, padding is added to a max sequence length + to make all sequences the same length. This masks out real tokens (`0`) from + padding tokens (`1`). + - Permutation mask: during XLNet pretraining, the input sequence is factorized + into a factorization sequence `z`. During partial prediction, `z` is split + at a cutting point `c` (an index of the factorization sequence) and + prediction is only applied to all tokens after `c`. Therefore, tokens at + factorization positions `i` > `c` can be attended to and tokens at + factorization positions `i` <= `c` cannot be attended to. + + This function broadcasts and combines all attention masks to produce the + query attention mask and the content attention mask. + + Args: + input_mask: Tensor, the input mask related to padding. Input shape: + `(B, S)`. + permutation_mask: Tensor, the permutation mask used in partial prediction. + Input shape: `(B, S, S)`. + attention_type: str, the attention type. Can be "uni" (directional) or + "bi" (directional). + seq_length: int, the length of each sequence. + memory_length: int the length of memory blocks. + batch_size: int, the batch size. + dtype: The dtype of the masks. + + Returns: + attention_mask, content_attention_mask: The position and context-based + attention masks and content attention masks, respectively. + + """ + attention_mask = None + # `1` values mean do not attend to this position. + if attention_type == "uni": + causal_attention_mask = _create_causal_attention_mask( + seq_length=seq_length, + memory_length=memory_length, + dtype=dtype) + causal_attention_mask = causal_attention_mask[None, None, :, :] + # `causal_attention_mask`: [1, 1, S, S + M] + + # input_mask: [B, S] + # permutation_mask: [B, S, S] + if input_mask is not None and permutation_mask is not None: + data_mask = _combine_masks(input_mask[:, None, :], permutation_mask, dtype) + elif input_mask is not None and permutation_mask is None: + data_mask = input_mask[:, None, :] + elif input_mask is None and permutation_mask is not None: + data_mask = permutation_mask + else: + data_mask = None + + # data_mask: [B, S, S] or [B, 1, S] + + if data_mask is not None: + # All positions within state can be attended to. 
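+    # Prepending the memory adds `memory_length` extra key positions, widening
+    # the data mask from [..., S] to [..., S + M] so it lines up with attention
+    # scores computed over both memory and current tokens.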
+ state_mask = tf.ones([batch_size, tf.shape(data_mask)[1], memory_length], + dtype=dtype) + # state_mask: [B, 1, M] or [B, S, M] + data_mask = tf.concat([state_mask, data_mask], 2) + # data_mask: [B, 1, S + M] or [B, S, S + M] + + if attention_type == "uni": + attention_mask = _combine_masks(causal_attention_mask, + data_mask[:, None, :, :], + dtype=dtype) + else: + attention_mask = data_mask[:, None, :, :] + + if attention_mask is not None: + # Construct the content attention mask. + # This ensures that the mask allows the model to attend to positions in + # content positions (e.g. the content diagonal). + non_target_mask = tf.concat( + [tf.zeros([seq_length, memory_length], dtype=dtype), + tf.eye(seq_length, dtype=dtype)], axis=-1) + content_attention_mask = _combine_masks( + attention_mask, non_target_mask, how="or", dtype=dtype) + else: + content_attention_mask = None + + return attention_mask, content_attention_mask + + +def _compute_segment_matrix( + segment_ids, + memory_length, + batch_size, + use_cls_mask): + """Computes the segment embedding matrix. + + XLNet introduced segment-based attention for attention calculations. This + extends the idea of relative encodings in Transformer XL by considering + whether or not two positions are within the same segment, rather than + which segments they come from. + + This function generates a segment matrix by broadcasting provided segment IDs + in two different dimensions and checking where values are equal. This output + matrix shows `True` whenever two tokens are NOT in the same segment and + `False` whenever they are. + + Args: + segment_ids: A Tensor of size `[B, S]` that represents which segment + each token belongs to. + memory_length: int, the length of memory blocks. + batch_size: int, the batch size. + use_cls_mask: bool, whether or not to introduce cls mask in + input sequences. + + Returns: + A boolean Tensor of size `[B, S, S + M]`, where `True` means that two + tokens are NOT in the same segment, and `False` means they are in the same + segment. + + """ + if segment_ids is None: + return None + + memory_padding = tf.zeros([batch_size, memory_length], + dtype=segment_ids.dtype) + padded_segment_ids = tf.concat([memory_padding, segment_ids], 1) + # segment_ids: [B, S] + # padded_segment_ids: [B, S + M] + + if use_cls_mask: + # `1` indicates not in the same segment. + # Target result: [B, S, S + M] + + # segment_ids: [B, S] + # padded_segment_ids: [B, S + M] + broadcasted_segment_class_indices = ( + tf.equal(segment_ids, + tf.constant([_SEG_ID_CLS]))[:, :, None]) + + broadcasted_padded_class_indices = ( + tf.equal( + padded_segment_ids, + tf.constant([_SEG_ID_CLS]))[:, None, :]) + + class_index_matrix = tf.logical_or(broadcasted_segment_class_indices, + broadcasted_padded_class_indices) + + segment_matrix = tf.equal(segment_ids[:, :, None], + padded_segment_ids[:, None, :]) + segment_matrix = tf.logical_or(class_index_matrix, segment_matrix) + else: + # TODO(allencwang) - address this legacy mismatch from `use_cls_mask`. + segment_matrix = tf.logical_not( + tf.equal(segment_ids[:, :, None], padded_segment_ids[:, None, :])) + return segment_matrix + + +def _compute_positional_encoding( + attention_type, + position_encoding_layer, + hidden_size, + batch_size, + total_length, + seq_length, + clamp_length, + bi_data, + dtype=tf.float32): + """Computes the relative position encoding. + + Args: + attention_type: str, the attention type. Can be "uni" (directional) or + "bi" (directional). 
+ position_encoding_layer: An instance of `RelativePositionEncoding`. + hidden_size: int, the hidden size. + batch_size: int, the batch size. + total_length: int, the sequence length added to the memory length. + seq_length: int, the length of each sequence. + clamp_length: int, clamp all relative distances larger than clamp_length. -1 + means no clamping. + bi_data: bool, whether to use bidirectional input pipeline. Usually set to + True during pretraining and False during finetuning. + dtype: the dtype of the encoding. + + Returns: + A Tensor, representing the position encoding. + + """ + freq_seq = tf.range(0, hidden_size, 2.0) + if dtype is not None and dtype != tf.float32: + freq_seq = tf.cast(freq_seq, dtype=dtype) + + if attention_type == "bi": + beg, end = total_length, -seq_length + elif attention_type == "uni": + beg, end = total_length, -1 + else: + raise ValueError("Unknown `attention_type` {}.".format(attention_type)) + + if bi_data: + forward_position_sequence = tf.range(beg, end, -1.0) + backward_position_sequence = tf.range(-beg, -end, 1.0) + + if dtype is not None and dtype != tf.float32: + forward_position_sequence = tf.cast(forward_position_sequence, + dtype=dtype) + backward_position_sequence = tf.cast(backward_position_sequence, + dtype=dtype) + + if clamp_length > 0: + forward_position_sequence = tf.clip_by_value( + forward_position_sequence, + -clamp_length, + clamp_length) + backward_position_sequence = tf.clip_by_value( + backward_position_sequence, + -clamp_length, + clamp_length) + + if batch_size is not None: + forward_positional_encoding = position_encoding_layer( + forward_position_sequence, batch_size // 2) + backward_positional_encoding = position_encoding_layer( + backward_position_sequence, batch_size // 2) + else: + forward_positional_encoding = position_encoding_layer( + forward_position_sequence, None) + backward_positional_encoding = position_encoding_layer( + backward_position_sequence, None) + + relative_position_encoding = tf.concat( + [forward_positional_encoding, backward_positional_encoding], axis=0) + else: + forward_position_sequence = tf.range(beg, end, -1.0) + if dtype is not None and dtype != tf.float32: + forward_position_sequence = tf.cast( + forward_position_sequence, dtype=dtype) + if clamp_length > 0: + forward_position_sequence = tf.clip_by_value( + forward_position_sequence, + -clamp_length, + clamp_length) + + relative_position_encoding = position_encoding_layer( + forward_position_sequence, batch_size) + return relative_position_encoding + + +class RelativePositionEncoding(tf.keras.layers.Layer): + """Creates a relative positional encoding. + + This layer creates a relative positional encoding as described in + "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context" + (https://arxiv.org/abs/1901.02860). + + Rather than an absolute position embedding as in Transformer, this + formulation represents position as the relative distance between tokens using + sinusoidal positional embeddings. + + Note: This layer is currently experimental. + + Attributes: + hidden_size: The dimensionality of the input embeddings. + """ + + def __init__(self, hidden_size, **kwargs): + super(RelativePositionEncoding, self).__init__(**kwargs) + self._hidden_size = hidden_size + self._inv_freq = 1.0 / (10000.0**( + tf.range(0, self._hidden_size, 2.0) / self._hidden_size)) + + def call(self, pos_seq, batch_size=None): + """Implements call() for the layer. 
+ + Args: + pos_seq: A 1-D `Tensor` + batch_size: The optionally provided batch size that tiles the relative + positional encoding. + + Returns: + The relative positional encoding of shape: + [batch_size, len(pos_seq), hidden_size] if batch_size is provided, else + [1, len(pos_seq), hidden_size]. + """ + sinusoid_input = tf.einsum("i,d->id", pos_seq, self._inv_freq) + relative_position_encoding = tf.concat([tf.sin(sinusoid_input), + tf.cos(sinusoid_input)], -1) + relative_position_encoding = relative_position_encoding[None, :, :] + if batch_size is not None: + relative_position_encoding = tf.tile(relative_position_encoding, + [batch_size, 1, 1]) + return relative_position_encoding + + +@tf.keras.utils.register_keras_serializable(package="Text") +class XLNetBase(tf.keras.layers.Layer): + """Base XLNet model. + + Attributes: + vocab_size: int, the number of tokens in vocabulary. + num_layers: int, the number of layers. + hidden_size: int, the hidden size. + num_attention_heads: int, the number of attention heads. + head_size: int, the dimension size of each attention head. + inner_size: int, the hidden size in feed-forward layers. + dropout_rate: float, dropout rate. + attention_dropout_rate: float, dropout rate on attention probabilities. + attention_type: str, "uni" or "bi". + bi_data: bool, whether to use bidirectional input pipeline. Usually set to + True during pretraining and False during finetuning. + initializer: A tf initializer. + two_stream: bool, whether or not to use `TwoStreamRelativeAttention` used + in the XLNet pretrainer. If `False`, then it will use + `MultiHeadRelativeAttention` as in Transformer XL. + tie_attention_biases: bool, whether or not to tie the biases together. + Usually set to `True`. Used for backwards compatibility. + memory_length: int, the number of tokens to cache. + same_length: bool, whether to use the same attention length for each + token. + clamp_length: int, clamp all relative distances larger than clamp_length. -1 + means no clamping. + reuse_length: int, the number of tokens in the currect batch to be cached + and reused in the future. + inner_activation: str, "relu" or "gelu". + use_cls_mask: bool, whether or not cls mask is included in the + input sequences. + embedding_width: The width of the word embeddings. If the embedding width + is not equal to hidden size, embedding parameters will be factorized + into two matrices in the shape of ["vocab_size", "embedding_width"] and + ["embedding_width", "hidden_size"] ("embedding_width" is usually much + smaller than "hidden_size"). + embedding_layer: The word embedding layer. `None` means we will create a + new embedding layer. Otherwise, we will reuse the given embedding layer. + This parameter is originally added for ELECTRA model which needs to tie + the generator embeddings with the discriminator embeddings. 
+ """ + + def __init__(self, + vocab_size, + num_layers, + hidden_size, + num_attention_heads, + head_size, + inner_size, + dropout_rate, + attention_dropout_rate, + attention_type, + bi_data, + initializer, + two_stream=False, + tie_attention_biases=True, + memory_length=None, + clamp_length=-1, + reuse_length=None, + inner_activation="relu", + use_cls_mask=False, + embedding_width=None, + **kwargs): + super(XLNetBase, self).__init__(**kwargs) + + self._vocab_size = vocab_size + self._initializer = initializer + self._attention_type = attention_type + self._num_layers = num_layers + self._hidden_size = hidden_size + self._num_attention_heads = num_attention_heads + self._head_size = head_size + self._inner_size = inner_size + self._inner_activation = inner_activation + self._dropout_rate = dropout_rate + self._attention_dropout_rate = attention_dropout_rate + self._tie_attention_biases = tie_attention_biases + self._two_stream = two_stream + + self._memory_length = memory_length + self._reuse_length = reuse_length + self._bi_data = bi_data + self._clamp_length = clamp_length + self._use_cls_mask = use_cls_mask + + self._segment_embedding = None + self._mask_embedding = None + self._embedding_width = embedding_width + + if embedding_width is None: + embedding_width = hidden_size + + self._embedding_layer = layers.OnDeviceEmbedding( + vocab_size=self._vocab_size, + embedding_width=embedding_width, + initializer=self._initializer, + dtype=tf.float32, + name="word_embedding") + self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + + self.embedding_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + self.position_encoding = RelativePositionEncoding(self._hidden_size) + + self._transformer_xl = transformer_xl.TransformerXL( + vocab_size=vocab_size, + num_layers=num_layers, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + head_size=head_size, + inner_size=inner_size, + dropout_rate=dropout_rate, + attention_dropout_rate=attention_dropout_rate, + initializer=initializer, + two_stream=two_stream, + tie_attention_biases=tie_attention_biases, + memory_length=memory_length, + reuse_length=reuse_length, + inner_activation=inner_activation, + name="transformer_xl") + + def get_config(self): + config = { + "vocab_size": + self._vocab_size, + "num_layers": + self._num_layers, + "hidden_size": + self._hidden_size, + "num_attention_heads": + self._num_attention_heads, + "head_size": + self._head_size, + "inner_size": + self._inner_size, + "dropout_rate": + self._dropout_rate, + "attention_dropout_rate": + self._attention_dropout_rate, + "attention_type": + self._attention_type, + "bi_data": + self._bi_data, + "initializer": + self._initializer, + "two_stream": + self._two_stream, + "tie_attention_biases": + self._tie_attention_biases, + "memory_length": + self._memory_length, + "clamp_length": + self._clamp_length, + "reuse_length": + self._reuse_length, + "inner_activation": + self._inner_activation, + "use_cls_mask": + self._use_cls_mask, + "embedding_width": + self._embedding_width, + } + base_config = super(XLNetBase, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def get_embedding_lookup_table(self): + """Returns the embedding layer weights.""" + return self._embedding_layer.embeddings + + def __call__(self, + input_ids, + segment_ids=None, + input_mask=None, + state=None, + permutation_mask=None, + target_mapping=None, + masked_tokens=None, + **kwargs): + # Uses dict to feed inputs into call() in order to keep state 
as a python + # list. + inputs = { + "input_ids": input_ids, + "segment_ids": segment_ids, + "input_mask": input_mask, + "state": state, + "permutation_mask": permutation_mask, + "target_mapping": target_mapping, + "masked_tokens": masked_tokens + } + return super(XLNetBase, self).__call__(inputs, **kwargs) + + def call(self, inputs): + """Implements call() for the layer.""" + input_ids = inputs["input_ids"] + segment_ids = inputs["segment_ids"] + input_mask = inputs["input_mask"] + state = inputs["state"] + permutation_mask = inputs["permutation_mask"] + target_mapping = inputs["target_mapping"] + masked_tokens = inputs["masked_tokens"] + + batch_size = tf.shape(input_ids)[0] + seq_length = tf.shape(input_ids)[1] + if state is not None: + memory_length = tf.shape(state[0])[1] + else: + memory_length = 0 + total_length = memory_length + seq_length + + if self._two_stream and masked_tokens is None: + raise ValueError("`masked_tokens` must be provided in order to " + "initialize the query stream in " + "`TwoStreamRelativeAttention`.") + if masked_tokens is not None and not self._two_stream: + logging.warning("`masked_tokens` is provided but `two_stream` is not " + "enabled. Please enable `two_stream` to enable two " + "stream attention.") + + if input_mask is not None: + dtype = input_mask.dtype + elif permutation_mask is not None: + dtype = permutation_mask.dtype + else: + dtype = tf.int32 + query_attention_mask, content_attention_mask = _compute_attention_mask( + input_mask=input_mask, + permutation_mask=permutation_mask, + attention_type=self._attention_type, + seq_length=seq_length, + memory_length=memory_length, + batch_size=batch_size, + dtype=dtype) + relative_position_encoding = _compute_positional_encoding( + attention_type=self._attention_type, + position_encoding_layer=self.position_encoding, + hidden_size=self._hidden_size, + batch_size=batch_size, + total_length=total_length, + seq_length=seq_length, + clamp_length=self._clamp_length, + bi_data=self._bi_data, + dtype=tf.float32) + relative_position_encoding = self.embedding_dropout( + relative_position_encoding) + + if segment_ids is None: + segment_embedding = None + segment_matrix = None + else: + if self._segment_embedding is None: + self._segment_embedding = self.add_weight( + "seg_embed", + shape=[self._num_layers, 2, self._num_attention_heads, + self._head_size], + dtype=tf.float32, + initializer=self._initializer) + + segment_embedding = self._segment_embedding + segment_matrix = _compute_segment_matrix( + segment_ids=segment_ids, + memory_length=memory_length, + batch_size=batch_size, + use_cls_mask=self._use_cls_mask) + + word_embeddings = self._embedding_layer(input_ids) + content_stream = self._dropout(word_embeddings) + + if self._two_stream: + if self._mask_embedding is None: + self._mask_embedding = self.add_weight( + "mask_emb/mask_emb", + shape=[1, 1, self._hidden_size], + dtype=tf.float32) + if target_mapping is None: + masked_tokens = masked_tokens[:, :, None] + masked_token_embedding = ( + masked_tokens * self._mask_embedding + + (1 - masked_tokens) * word_embeddings) + else: + masked_token_embedding = tf.tile( + self._mask_embedding, + [batch_size, tf.shape(target_mapping)[1], 1]) + query_stream = self._dropout(masked_token_embedding) + else: + query_stream = None + + return self._transformer_xl( + content_stream=content_stream, + query_stream=query_stream, + target_mapping=target_mapping, + state=state, + relative_position_encoding=relative_position_encoding, + segment_matrix=segment_matrix, + 
segment_embedding=segment_embedding, + content_attention_mask=content_attention_mask, + query_attention_mask=query_attention_mask) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/__init__.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/__init__.py new file mode 100644 index 000000000..8cd69a0a6 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Ops package definition.""" +from nlp_modeling.ops.beam_search import sequence_beam_search +from nlp_modeling.ops.segment_extractor import get_next_sentence_labels +from nlp_modeling.ops.segment_extractor import get_sentence_order_labels diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/beam_search.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/beam_search.py new file mode 100644 index 000000000..eddb31212 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/beam_search.py @@ -0,0 +1,704 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Beam search to find the translated sequence with the highest probability.""" + +import numpy as np +import tensorflow as tf + + +def inf(dtype): + """Returns a value close to infinity, but is still finite in `dtype`. + + This is useful to get a very large value that is still zero when multiplied by + zero. The floating-point "Inf" value is NaN when multiplied by zero. + + Args: + dtype: A dtype. The returned value will be finite when casted to this dtype. + + Returns: + A very large value. + """ + if dtype == "float32" or dtype == "bfloat16": + return 1e7 + elif dtype == "float16": + # Disable no-member lint error, as the linter thinks np.float16 does not + # exist for some reason. + return np.finfo(np.float16).max # pylint: disable=no-member + else: + raise AssertionError("Invalid dtype: %s" % dtype) + + +class _StateKeys(object): + """Keys to dictionary storing the state of the beam search loop.""" + + # Variable storing the loop index. + CUR_INDEX = "CUR_INDEX" + + # Top sequences that are alive for each batch item. Alive sequences are ones + # that have not generated an EOS token. Sequences that reach EOS are marked as + # finished and moved to the FINISHED_SEQ tensor. 
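
A small illustration (not part of the patch) of why inf() returns a large but finite value: multiplying a zero mask by a true infinity produces NaN, while the finite stand-in stays zero.

    import numpy as np

    mask = np.array([0.0, 1.0], dtype=np.float32)
    print(mask * np.float32("inf"))   # [nan inf]
    print(mask * np.float32(1e7))     # [0.e+00 1.e+07]
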
+ # Has shape [batch_size, beam_size, CUR_INDEX + 1] + ALIVE_SEQ = "ALIVE_SEQ" + # Log probabilities of each alive sequence. Shape [batch_size, beam_size] + ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS" + # Dictionary of cached values for each alive sequence. The cache stores + # the encoder output, attention bias, and the decoder attention output from + # the previous iteration. + ALIVE_CACHE = "ALIVE_CACHE" + + # Top finished sequences for each batch item. + # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are + # shorter than CUR_INDEX + 1 are padded with 0s. + FINISHED_SEQ = "FINISHED_SEQ" + # Scores for each finished sequence. Score = log probability / length norm + # Shape [batch_size, beam_size] + FINISHED_SCORES = "FINISHED_SCORES" + # Flags indicating which sequences in the finished sequences are finished. + # At the beginning, all of the sequences in FINISHED_SEQ are filler values. + # True -> finished sequence, False -> filler. Shape [batch_size, beam_size] + FINISHED_FLAGS = "FINISHED_FLAGS" + + +def _expand_to_same_rank(tensor, target): + """Expands a given tensor to target's rank to be broadcastable. + + Args: + tensor: input tensor to tile. Shape: [b, d1, ..., da] + target: target tensor. Shape: [b, d1, ..., da, ..., dn] + + Returns: + Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target. + + Raises: + ValueError, if the shape rank of rank tensor/target is None. + """ + if tensor.shape.rank is None: + raise ValueError("Expect rank for tensor shape, but got None.") + if target.shape.rank is None: + raise ValueError("Expect rank for target shape, but got None.") + + with tf.name_scope("expand_rank"): + diff_rank = target.shape.rank - tensor.shape.rank + for _ in range(diff_rank): + tensor = tf.expand_dims(tensor, -1) + return tensor + + +class SequenceBeamSearch(tf.Module): + """Implementation of beam search loop.""" + + def __init__(self, + symbols_to_logits_fn, + vocab_size, + beam_size, + alpha, + max_decode_length, + eos_id, + padded_decode, + dtype=tf.float32): + """Initialize sequence beam search. + + Args: + symbols_to_logits_fn: A function to provide logits, which is the + interface to the Transformer model. The passed in arguments are: ids -> + A tensor with shape [batch_size * beam_size, index]. index -> A + scalar. cache -> A nested dictionary of tensors [batch_size * + beam_size, ...]. + The function must return a tuple of logits and the updated cache: logits + -> A tensor with shape [batch * beam_size, vocab_size]. updated cache + -> A nested dictionary with the same structure as the input cache. + vocab_size: An integer, the size of the vocabulary, used for topk + computation. + beam_size: An integer, number of beams for beam search. + alpha: A float, defining the strength of length normalization. + max_decode_length: An integer, the maximum number of steps to decode a + sequence. + eos_id: An integer. ID of end of sentence token. + padded_decode: A bool, indicating if max_sequence_length padding is used + for beam search. + dtype: A tensorflow data type used for score computation. The default is + tf.float32. + """ + self.symbols_to_logits_fn = symbols_to_logits_fn + self.vocab_size = vocab_size + self.beam_size = beam_size + self.alpha = alpha + self.max_decode_length = max_decode_length + self.eos_id = eos_id + self.padded_decode = padded_decode + self.dtype = tf.as_dtype(dtype) + + def search(self, initial_ids, initial_cache): + """Beam search for sequences with highest scores. 
+ + Args: + initial_ids: initial ids to pass into the symbols_to_logits_fn. int tensor + with shape [batch_size, 1] + initial_cache: dictionary storing values to be passed into the + symbols_to_logits_fn. + + Returns: + finished_seq and finished_scores. + """ + batch_size = ( + initial_ids.shape.as_list()[0] + if self.padded_decode else tf.shape(initial_ids)[0]) + state, state_shapes = self._create_initial_state(initial_ids, initial_cache, + batch_size) + + def _grow_alive_seq(state): + """Grow alive sequences by one token, collect top 2*beam_size sequences. + + 2*beam_size sequences are collected because some sequences may have + reached the EOS token. 2*beam_size ensures that at least beam_size + sequences are still alive. + + Args: + state: A dictionary with the current loop state. + + Returns: + Tuple of + (Top 2*beam_size sequences [batch_size, 2 * beam_size, cur_index + 1], + Scores of returned sequences [batch_size, 2 * beam_size], + New alive cache, for each of the 2 * beam_size sequences) + """ + i = state[_StateKeys.CUR_INDEX] + alive_seq = state[_StateKeys.ALIVE_SEQ] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + alive_cache = state[_StateKeys.ALIVE_CACHE] + + beams_to_keep = 2 * self.beam_size + + # Get logits for the next candidate IDs for the alive sequences. Get the + # new cache values at the same time. + if self.padded_decode: + flat_ids = tf.reshape( + tf.slice(alive_seq, [0, 0, i], [batch_size, self.beam_size, 1]), + [batch_size * self.beam_size, -1]) + else: + flat_ids = flatten_beam_dim(alive_seq) # [batch_size * beam_size] + flat_cache = tf.nest.map_structure(flatten_beam_dim, alive_cache) + + flat_logits, flat_cache = self.symbols_to_logits_fn( + flat_ids, i, flat_cache) + + # Unflatten logits to shape [batch_size, beam_size, vocab_size] + logits = _unflatten_beam_dim(flat_logits, batch_size, self.beam_size) + new_cache = tf.nest.map_structure( + lambda t: _unflatten_beam_dim(t, batch_size, self.beam_size), + flat_cache) + + # Convert logits to normalized log probs + candidate_log_probs = _log_prob_from_logits(logits) + + # Calculate new log probabilities if each of the alive sequences were + # extended # by the the candidate IDs. + # Shape [batch_size, beam_size, vocab_size] + log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs, axis=2) + + # Each batch item has beam_size * vocab_size candidate sequences. For each + # batch item, get the k candidates with the highest log probabilities. + flat_log_probs = tf.reshape(log_probs, + [-1, self.beam_size * self.vocab_size]) + topk_log_probs, topk_indices = tf.nn.top_k( + flat_log_probs, k=beams_to_keep) + + # Extract the alive sequences that generate the highest log probabilities + # after being extended. + topk_beam_indices = topk_indices // self.vocab_size + topk_seq, new_cache = self._gather_beams([alive_seq, new_cache], + topk_beam_indices, batch_size, + beams_to_keep) + + # Append the most probable IDs to the topk sequences + topk_ids = topk_indices % self.vocab_size + if self.padded_decode: + topk_seq = tf.transpose(topk_seq, perm=[2, 0, 1]) + # TODO(b/145533236, hongkuny): Reverts once TF fix the validation. 
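
A hedged sketch (standalone, made-up sizes) of the candidate selection step in _grow_alive_seq: flatten the (beam, vocab) axes, keep the 2 * beam_size best joint scores, then recover which beam each winner extends and which token id it appends.

    import tensorflow as tf

    batch_size, beam_size, vocab_size = 1, 2, 5
    log_probs = tf.random.normal([batch_size, beam_size, vocab_size])
    flat_log_probs = tf.reshape(log_probs, [batch_size, beam_size * vocab_size])
    topk_log_probs, topk_indices = tf.nn.top_k(flat_log_probs, k=2 * beam_size)
    topk_beam_indices = topk_indices // vocab_size   # beam to extend
    topk_ids = topk_indices % vocab_size             # token id to append
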
+ topk_seq = tf.tensor_scatter_nd_update(topk_seq, [[i + 1]], + tf.expand_dims(topk_ids, axis=0)) + topk_seq = tf.transpose(topk_seq, perm=[1, 2, 0]) + else: + topk_seq = tf.concat( + [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2) + return topk_seq, topk_log_probs, topk_ids, new_cache + + def _get_new_alive_state(new_seq, new_log_probs, new_finished_flags, + new_cache): + """Gather the top k sequences that are still alive. + + Args: + new_seq: New sequences generated by growing the current alive sequences + int32 tensor with shape [batch_size, 2 * beam_size, cur_index + 1] + new_log_probs: Log probabilities of new sequences float32 tensor with + shape [batch_size, beam_size] + new_finished_flags: A boolean Tensor indicates which sequences are live + inside the beam. + new_cache: Dict of cached values for each sequence. + + Returns: + Dictionary with alive keys from _StateKeys: + {Top beam_size sequences that are still alive (don't end with eos_id) + Log probabilities of top alive sequences + Dict cache storing decoder states for top alive sequences} + """ + # To prevent finished sequences from being considered, set log probs to + # -inf. + new_log_probs += tf.cast(new_finished_flags, + self.dtype) * -inf(self.dtype) + + _, topk_indexes = tf.nn.top_k(new_log_probs, k=self.beam_size) + top_alive_seq, top_alive_log_probs, top_alive_cache = ( + self._gather_beams([new_seq, new_log_probs, new_cache], + topk_indexes, batch_size, self.beam_size)) + + return { + _StateKeys.ALIVE_SEQ: top_alive_seq, + _StateKeys.ALIVE_LOG_PROBS: top_alive_log_probs, + _StateKeys.ALIVE_CACHE: top_alive_cache + } + + def _get_new_finished_state(state, new_seq, new_log_probs, + new_finished_flags): + """Combine new and old finished sequences, and gather the top k sequences. + + Args: + state: A dictionary with the current loop state. + new_seq: New sequences generated by growing the current alive sequences + int32 tensor with shape [batch_size, beam_size, i + 1] + new_log_probs: Log probabilities of new sequences float32 tensor with + shape [batch_size, beam_size] + new_finished_flags: A boolean Tensor indicates which sequences are live + inside the beam. + + Returns: + Dictionary with finished keys from _StateKeys: + {Top beam_size finished sequences based on score, + Scores of finished sequences, + Finished flags of finished sequences} + """ + i = state[_StateKeys.CUR_INDEX] + finished_seq = state[_StateKeys.FINISHED_SEQ] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + # First append a column of 0-ids to finished_seq to increment the length. + # New shape of finished_seq: [batch_size, beam_size, i + 1] + if not self.padded_decode: + finished_seq = tf.concat( + [finished_seq, + tf.zeros([batch_size, self.beam_size, 1], tf.int32)], + axis=2) + + # Calculate new seq scores from log probabilities. + length_norm = _length_normalization(self.alpha, i + 1, dtype=self.dtype) + new_scores = new_log_probs / length_norm + + # Set the scores of the still-alive seq in new_seq to large negative + # values. + new_scores += ((1. - tf.cast(new_finished_flags, self.dtype)) * + -inf(self.dtype)) + + # Combine sequences, scores, and flags. + finished_seq = tf.concat([finished_seq, new_seq], axis=1) + finished_scores = tf.concat([finished_scores, new_scores], axis=1) + finished_flags = tf.concat([finished_flags, new_finished_flags], axis=1) + + # Return the finished sequences with the best scores. 
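
A quick numeric check (not part of the patch) of the length normalization applied to finished scores, assuming the ((5 + length) / 6) ** alpha penalty defined later in this file; longer hypotheses receive a larger divisor, so they are not unfairly dominated by short ones.

    alpha = 0.6
    for length in (1, 10, 50):
      print(length, round(((5.0 + length) / 6.0) ** alpha, 3))
    # 1 -> 1.0, 10 -> ~1.733, 50 -> ~3.778
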
+ _, topk_indexes = tf.nn.top_k(finished_scores, k=self.beam_size) + top_finished_seq, top_finished_scores, top_finished_flags = ( + self._gather_beams([finished_seq, finished_scores, finished_flags], + topk_indexes, batch_size, self.beam_size)) + + return { + _StateKeys.FINISHED_SEQ: top_finished_seq, + _StateKeys.FINISHED_SCORES: top_finished_scores, + _StateKeys.FINISHED_FLAGS: top_finished_flags + } + + def _search_step(state): + """Beam search loop body. + + Grow alive sequences by a single ID. Sequences that have reached the EOS + token are marked as finished. The alive and finished sequences with the + highest log probabilities and scores are returned. + + A sequence's finished score is calculating by dividing the log probability + by the length normalization factor. Without length normalization, the + search is more likely to return shorter sequences. + + Args: + state: A dictionary with the current loop state. + + Returns: + new state dictionary. + """ + # Grow alive sequences by one token. + new_seq, new_log_probs, topk_ids, new_cache = _grow_alive_seq(state) + new_finished_flags = tf.equal(topk_ids, self.eos_id) + # Collect top beam_size alive sequences + alive_state = _get_new_alive_state(new_seq, new_log_probs, + new_finished_flags, new_cache) + + # Combine newly finished sequences with existing finished sequences, and + # collect the top k scoring sequences. + finished_state = _get_new_finished_state(state, new_seq, new_log_probs, + new_finished_flags) + + # Increment loop index and create new state dictionary + new_state = {_StateKeys.CUR_INDEX: state[_StateKeys.CUR_INDEX] + 1} + new_state.update(alive_state) + new_state.update(finished_state) + return [new_state] + + finished_state = tf.nest.map_structure( + tf.stop_gradient, + tf.while_loop( + self._continue_search, + _search_step, + loop_vars=[state], + shape_invariants=[state_shapes], + parallel_iterations=1)) + finished_state = finished_state[0] + return self._process_finished_state(finished_state) + + def _process_finished_state(self, finished_state): + alive_seq = finished_state[_StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[_StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[_StateKeys.FINISHED_SEQ] + finished_scores = finished_state[_StateKeys.FINISHED_SCORES] + finished_flags = finished_state[_StateKeys.FINISHED_FLAGS] + # TF2 changes tf.where behavior. Should make parameters broadcastable. + finished_cond = tf.reduce_any(finished_flags, 1, name="finished_cond") + seq_cond = _expand_to_same_rank(finished_cond, finished_seq) + score_cond = _expand_to_same_rank(finished_cond, finished_scores) + + # Account for corner case where there are no finished sequences for a + # particular batch item. In that case, return alive sequences for that batch + # item. + finished_seq = tf.where(seq_cond, finished_seq, alive_seq) + finished_scores = tf.where(score_cond, finished_scores, alive_log_probs) + return finished_seq, finished_scores + + def _create_initial_state(self, initial_ids, initial_cache, batch_size): + """Return initial state dictionary and its shape invariants.""" + for key, value in initial_cache.items(): + for inner_value in tf.nest.flatten(value): + if inner_value.dtype != self.dtype: + raise TypeError( + "initial_cache element for key '%s' has dtype %s that does not " + "match SequenceBeamSearch's dtype of %s. 
Value: %s" % + (key, inner_value.dtype.name, self.dtype.name, inner_value)) + + # Current loop index (starts at 0) + cur_index = tf.constant(0) + + # Create alive sequence with shape [batch_size, beam_size, 1] + alive_seq = expand_to_beam_size(initial_ids, self.beam_size) + alive_seq = tf.expand_dims(alive_seq, axis=2) + if self.padded_decode: + alive_seq = tf.tile(alive_seq, [1, 1, self.max_decode_length + 1]) + + # Create tensor for storing initial log probabilities. + # Assume initial_ids are prob 1.0 + initial_log_probs = tf.constant([[0.] + [-float("inf")] * + (self.beam_size - 1)], + dtype=self.dtype) + alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1]) + + # Expand all values stored in the dictionary to the beam size, so that each + # beam has a separate cache. + alive_cache = tf.nest.map_structure( + lambda t: expand_to_beam_size(t, self.beam_size), initial_cache) + + # Initialize tensor storing finished sequences with filler values. + finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32) + + # Set scores of the initial finished seqs to negative infinity. + finished_scores = tf.ones([batch_size, self.beam_size], + dtype=self.dtype) * -inf(self.dtype) + + # Initialize finished flags with all False values. + finished_flags = tf.zeros([batch_size, self.beam_size], tf.bool) + + # Create state dictionary + state = { + _StateKeys.CUR_INDEX: cur_index, + _StateKeys.ALIVE_SEQ: alive_seq, + _StateKeys.ALIVE_LOG_PROBS: alive_log_probs, + _StateKeys.ALIVE_CACHE: alive_cache, + _StateKeys.FINISHED_SEQ: finished_seq, + _StateKeys.FINISHED_SCORES: finished_scores, + _StateKeys.FINISHED_FLAGS: finished_flags + } + + # Create state invariants for each value in the state dictionary. Each + # dimension must be a constant or None. A None dimension means either: + # 1) the dimension's value is a tensor that remains the same but may + # depend on the input sequence to the model (e.g. batch size). + # 2) the dimension may have different values on different iterations. + if self.padded_decode: + state_shape_invariants = { + _StateKeys.CUR_INDEX: + tf.TensorShape([]), + _StateKeys.ALIVE_SEQ: + tf.TensorShape( + [batch_size, self.beam_size, self.max_decode_length + 1]), + _StateKeys.ALIVE_LOG_PROBS: + tf.TensorShape([batch_size, self.beam_size]), + _StateKeys.ALIVE_CACHE: + tf.nest.map_structure(lambda state: state.get_shape(), + alive_cache), + _StateKeys.FINISHED_SEQ: + tf.TensorShape( + [batch_size, self.beam_size, self.max_decode_length + 1]), + _StateKeys.FINISHED_SCORES: + tf.TensorShape([batch_size, self.beam_size]), + _StateKeys.FINISHED_FLAGS: + tf.TensorShape([batch_size, self.beam_size]) + } + else: + state_shape_invariants = { + _StateKeys.CUR_INDEX: + tf.TensorShape([]), + _StateKeys.ALIVE_SEQ: + tf.TensorShape([None, self.beam_size, None]), + _StateKeys.ALIVE_LOG_PROBS: + tf.TensorShape([None, self.beam_size]), + _StateKeys.ALIVE_CACHE: + tf.nest.map_structure(_get_shape_keep_last_dim, alive_cache), + _StateKeys.FINISHED_SEQ: + tf.TensorShape([None, self.beam_size, None]), + _StateKeys.FINISHED_SCORES: + tf.TensorShape([None, self.beam_size]), + _StateKeys.FINISHED_FLAGS: + tf.TensorShape([None, self.beam_size]) + } + + return state, state_shape_invariants + + def _continue_search(self, state): + """Return whether to continue the search loop. + + The loops should terminate when + 1) when decode length has been reached, or + 2) when the worst score in the finished sequences is better than the best + score in the alive sequences (i.e. 
the finished sequences are provably + unchanging) + + Args: + state: A dictionary with the current loop state. + + Returns: + Bool tensor with value True if loop should continue, False if loop should + terminate. + """ + i = state[_StateKeys.CUR_INDEX] + alive_log_probs = state[_StateKeys.ALIVE_LOG_PROBS] + finished_scores = state[_StateKeys.FINISHED_SCORES] + finished_flags = state[_StateKeys.FINISHED_FLAGS] + + not_at_max_decode_length = tf.less(i, self.max_decode_length) + + # Calculate largest length penalty (the larger penalty, the better score). + max_length_norm = _length_normalization( + self.alpha, self.max_decode_length, dtype=self.dtype) + # Get the best possible scores from alive sequences. + # This tf.slice/tf.squeeze is equivalent to alive_log_probs[:, 0] which + # emits a tf.strided_slice. tf.slice is easier to reason about as we aren't + # actually taking a non trivial stride. + best_alive_scores = tf.squeeze(tf.slice(alive_log_probs, [0, 0], [-1, 1]), + axis=1) / max_length_norm + + # Compute worst score in finished sequences for each batch element + finished_scores *= tf.cast(finished_flags, + self.dtype) # set filler scores to zero + lowest_finished_scores = tf.reduce_min(finished_scores, axis=1) + + # If there are no finished sequences in a batch element, then set the lowest + # finished score to -INF for that element. + finished_batches = tf.reduce_any(finished_flags, 1) + lowest_finished_scores += ((1.0 - tf.cast(finished_batches, self.dtype)) * + -inf(self.dtype)) + + worst_finished_score_better_than_best_alive_score = tf.reduce_all( + tf.greater(lowest_finished_scores, best_alive_scores)) + + return tf.logical_and( + not_at_max_decode_length, + tf.logical_not(worst_finished_score_better_than_best_alive_score)) + + @staticmethod + def _gather_beams(nested, beam_indices, batch_size, new_beam_size): + """Gather beams from nested structure of tensors. + + Each tensor in nested represents a batch of beams, where beam refers to a + single search state (beam search involves searching through multiple states + in parallel). + + This function is used to gather the top beams, specified by + beam_indices, from the nested tensors. + + Args: + nested: Nested structure (tensor, list, tuple or dict) containing tensors + with shape [batch_size, beam_size, ...]. + beam_indices: int32 tensor with shape [batch_size, new_beam_size]. Each + value in beam_indices must be between [0, beam_size), and are not + necessarily unique. + batch_size: int size of batch + new_beam_size: int number of beams to be pulled from the nested tensors. + + Returns: + Nested structure containing tensors with shape + [batch_size, new_beam_size, ...] + """ + # Computes the i'th coodinate that contains the batch index for gather_nd. + # Batch pos is a tensor like [[0,0,0,0,],[1,1,1,1],..]. + batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size + batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size]) + + # Create coordinates to be passed to tf.gather_nd. Stacking creates a tensor + # with shape [batch_size, beam_size, 2], where the last dimension contains + # the (i, j) gathering coordinates. 
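
A hedged, standalone sketch (made-up values) of the coordinate construction used by _gather_beams: pair each row's batch index with a chosen beam index and pull those beams out with tf.gather_nd.

    import tensorflow as tf

    batch_size, beam_size, new_beam_size = 2, 3, 2
    data = tf.reshape(tf.range(batch_size * beam_size), [batch_size, beam_size])
    beam_indices = tf.constant([[2, 0], [1, 1]])
    batch_pos = tf.range(batch_size * new_beam_size) // new_beam_size
    batch_pos = tf.reshape(batch_pos, [batch_size, new_beam_size])  # [[0, 0], [1, 1]]
    coordinates = tf.stack([batch_pos, beam_indices], axis=2)
    print(tf.gather_nd(data, coordinates).numpy())  # [[2 0] [4 4]]
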
+ coordinates = tf.stack([batch_pos, beam_indices], axis=2) + + return tf.nest.map_structure(lambda state: tf.gather_nd(state, coordinates), + nested) + + +def sequence_beam_search(symbols_to_logits_fn, + initial_ids, + initial_cache, + vocab_size, + beam_size, + alpha, + max_decode_length, + eos_id, + padded_decode=False, + dtype="float32"): + """Search for sequence of subtoken ids with the largest probability. + + Args: + symbols_to_logits_fn: A function that takes in ids, index, and cache as + arguments. The passed in arguments will have shape: ids -> A tensor with + shape [batch_size * beam_size, index]. index -> A scalar. cache -> A + nested dictionary of tensors [batch_size * beam_size, ...]. + The function must return a tuple of logits and new cache: logits -> A + tensor with shape [batch * beam_size, vocab_size]. new cache -> A nested + dictionary with the same shape/structure as the inputted cache. + initial_ids: An int32 tensor with shape [batch_size]. Starting ids for each + batch item. + initial_cache: A dictionary, containing starting decoder variables + information. + vocab_size: An integer, the size of tokens. + beam_size: An integer, the number of beams. + alpha: A float, defining the strength of length normalization. + max_decode_length: An integer, the maximum length to decoded a sequence. + eos_id: An integer, ID of eos token, used to determine when a sequence has + finished. + padded_decode: A bool, indicating if max_sequence_length padding is used for + beam search. + dtype: A tensorflow data type used for score computation. The default is + tf.float32. + + Returns: + Top decoded sequences [batch_size, beam_size, max_decode_length] + sequence scores [batch_size, beam_size] + """ + sbs = SequenceBeamSearch(symbols_to_logits_fn, vocab_size, beam_size, alpha, + max_decode_length, eos_id, padded_decode, dtype) + return sbs.search(initial_ids, initial_cache) + + +def _log_prob_from_logits(logits): + return logits - tf.reduce_logsumexp(logits, axis=2, keepdims=True) + + +def _length_normalization(alpha, length, dtype=tf.float32): + """Return length normalization factor.""" + return tf.pow(((5. + tf.cast(length, dtype)) / 6.), alpha) + + +def expand_to_beam_size(tensor, beam_size): + """Tiles a given tensor by beam_size. + + Args: + tensor: tensor to tile [batch_size, ...] + beam_size: How much to tile the tensor by. + + Returns: + Tiled tensor [batch_size, beam_size, ...] + """ + tensor = tf.expand_dims(tensor, axis=1) + tile_dims = [1] * tensor.shape.ndims + tile_dims[1] = beam_size + + return tf.tile(tensor, tile_dims) + + +def flatten_beam_dim(tensor): + """Reshapes first two dimensions into a single dimension. + + Args: + tensor: Tensor to reshape of shape [A, B, ...] + + Returns: + Reshaped tensor of shape [A*B, ...] 
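
A hedged usage sketch for sequence_beam_search with a dummy decoder: the symbols_to_logits_fn below ignores the cache and returns uniform logits, so the decoded ids are not meaningful; it only exercises the search loop. The import path follows the ops package __init__ added in this patch.

    import tensorflow as tf
    from nlp_modeling.ops.beam_search import sequence_beam_search

    def dummy_symbols_to_logits_fn(ids, index, cache):
      logits = tf.zeros([tf.shape(ids)[0], 7])  # [batch * beam, vocab]
      return logits, cache

    decoded_ids, scores = sequence_beam_search(
        symbols_to_logits_fn=dummy_symbols_to_logits_fn,
        initial_ids=tf.zeros([2], tf.int32),
        initial_cache={},
        vocab_size=7,
        beam_size=3,
        alpha=0.6,
        max_decode_length=4,
        eos_id=1)
    print(decoded_ids.shape, scores.shape)  # expected: (2, 3, 5) (2, 3)
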
+ """ + shape = _shape_list(tensor) + shape[0] *= shape[1] + shape.pop(1) # Remove beam dim + return tf.reshape(tensor, shape) + + +def _shape_list(tensor): + """Return a list of the tensor's shape, and ensure no None values in list.""" + # Get statically known shape (may contain None's for unknown dimensions) + shape = tensor.get_shape().as_list() + + # Ensure that the shape values are not None + dynamic_shape = tf.shape(tensor) + for i in range(len(shape)): # pylint: disable=consider-using-enumerate + if shape[i] is None: + shape[i] = dynamic_shape[i] + return shape + + +def _get_shape_keep_last_dim(tensor): + shape_list = _shape_list(tensor) + + # Only the last + for i in range(len(shape_list) - 1): + shape_list[i] = None + + if isinstance(shape_list[-1], tf.Tensor): + shape_list[-1] = None + return tf.TensorShape(shape_list) + + +def _unflatten_beam_dim(tensor, batch_size, beam_size): + """Reshapes first dimension back to [batch_size, beam_size]. + + Args: + tensor: Tensor to reshape of shape [batch_size*beam_size, ...] + batch_size: Tensor, original batch size. + beam_size: int, original beam size. + + Returns: + Reshaped tensor of shape [batch_size, beam_size, ...] + """ + shape = _shape_list(tensor) + new_shape = [batch_size, beam_size] + shape[1:] + return tf.reshape(tensor, new_shape) diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/decoding_module.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/decoding_module.py new file mode 100644 index 000000000..c9eb70703 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/decoding_module.py @@ -0,0 +1,282 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base class for Decoding Strategies (beam_search, top_k, top_p and greedy).""" + +import abc +from typing import Any, Callable, Dict, Tuple + +import tensorflow as tf + +from tensorflow.python.framework import dtypes +from modeling import tf_utils + +Output = Tuple[tf.Tensor, tf.Tensor] +InternalState = Tuple[tf.Tensor, tf.Tensor, tf.Tensor, Dict] +InitialState = Tuple[Dict[str, Any], Dict[str, Any]] + + +class StateKeys: + """Keys to dictionary storing the state of Decoding loop.""" + + # Variable storing the loop index. + CUR_INDEX = "CUR_INDEX" + + # Top sequences that are alive for each batch item. Alive sequences are ones + # that have not generated an EOS token. Sequences that reach EOS are marked as + # finished and moved to the FINISHED_SEQ tensor. + # Has shape [batch_size, beam_size, CUR_INDEX + 1] for SequenceBeamSearch and + # [batch_size, CUR_INDEX + 1] otherwise. + ALIVE_SEQ = "ALIVE_SEQ" + # Log probabilities of each alive sequence. Shape [batch_size, beam_size] + ALIVE_LOG_PROBS = "ALIVE_LOG_PROBS" + # Dictionary of cached values for each alive sequence. The cache stores + # the encoder output, attention bias, and the decoder attention output from + # the previous iteration. 
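
A hedged round-trip sketch (not part of the patch): flatten_beam_dim merges the (batch, beam) axes so the model sees a plain batch, and _unflatten_beam_dim restores them afterwards.

    import tensorflow as tf

    x = tf.random.normal([2, 3, 5])          # [batch, beam, length]
    flat = tf.reshape(x, [2 * 3, 5])         # what flatten_beam_dim computes here
    restored = tf.reshape(flat, [2, 3, 5])   # what _unflatten_beam_dim computes here
    print(bool(tf.reduce_all(tf.equal(x, restored))))  # True
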
+ ALIVE_CACHE = "ALIVE_CACHE" + + # Top finished sequences for each batch item. + # Has shape [batch_size, beam_size, CUR_INDEX + 1]. Sequences that are + # shorter than CUR_INDEX + 1 are padded with 0s. + FINISHED_SEQ = "FINISHED_SEQ" + # Scores for each finished sequence. Score = log probability / length norm + # Shape [batch_size, beam_size] + FINISHED_SCORES = "FINISHED_SCORES" + # Flags indicating which sequences in the finished sequences are finished. + # At the beginning, all of the sequences in FINISHED_SEQ are filler values. + # True -> finished sequence, False -> filler. Shape [batch_size, beam_size] + FINISHED_FLAGS = "FINISHED_FLAGS" + + +def log_prob_from_logits(logits): + return logits - tf.reduce_logsumexp(logits, axis=-1, keepdims=True) + + +def shape_list(tensor): + """Return a list of the tensor's shape, and ensure no None values in list.""" + return tf_utils.get_shape_list(tensor) + + +def get_shape_keep_last_dim(tensor): + shape_list_obj = shape_list(tensor) + for i in range(len(shape_list_obj) - 1): + shape_list_obj[i] = None + + if isinstance(shape_list_obj[-1], tf.Tensor): + shape_list_obj[-1] = None + return tf.TensorShape(shape_list_obj) + + +def expand_to_same_rank(tensor, target): + """Expands a given tensor to target's rank to be broadcastable. + + Args: + tensor: input tensor to tile. Shape: [b, d1, ..., da] + target: target tensor. Shape: [b, d1, ..., da, ..., dn] + + Returns: + Tiled tensor of shape [b, d1, ..., da, 1, ..., 1] with same rank of target + + Raises: + ValueError, if the shape rank of rank tensor/target is None. + """ + if tensor.shape.rank is None: + raise ValueError("Expect rank for tensor shape, but got None.") + if target.shape.rank is None: + raise ValueError("Expect rank for target shape, but got None.") + + with tf.name_scope("expand_rank"): + diff_rank = target.shape.rank - tensor.shape.rank + for _ in range(diff_rank): + tensor = tf.expand_dims(tensor, -1) + return tensor + + +class DecodingModule(tf.Module, metaclass=abc.ABCMeta): + """A base class for the API required for decoding (go/decoding-tf-nlp).""" + + def __init__(self, + length_normalization_fn: Callable[[int, tf.DType], float], + dtype: tf.DType = tf.float32): + """Initialize the Decoding Module. + + Args: + length_normalization_fn: Closure for returning length normalization + parameter. Function accepts input as length, dtype and returns float. + dtype: A tensorflow data type used for score computation. The default is + tf.float32. + """ + self.length_normalization_fn = length_normalization_fn + self.dtype = tf.as_dtype(dtype) + + def generate(self, + initial_ids: tf.Tensor, + initial_cache: Dict[str, tf.Tensor]) -> Output: + """Implements the decoding strategy (beam_search or sampling). + + Args: + initial_ids: initial ids to pass into the symbols_to_logits_fn. + int tensor with shape [batch_size, 1] + initial_cache: dictionary for caching model outputs from previous step. 
+ Returns: + Tuple of tensors representing + finished_sequence: shape [batch, max_seq_length] + finished_scores: [batch] + """ + batch_size = ( + initial_ids.shape.as_list()[0] + if self.padded_decode else tf.shape(initial_ids)[0]) + + state, state_shapes = self._create_initial_state(initial_ids, + initial_cache, + batch_size) + + def _generate_step(state): + topk_seq, topk_log_probs, topk_ids, new_cache = self._grow_alive_seq( + state, batch_size) + new_finished_flags = self._finished_flags(topk_ids, state) + alive_state = self._get_new_alive_state(topk_seq, + topk_log_probs, + new_finished_flags, + new_cache) + finished_state = self._get_new_finished_state(state, + topk_seq, + topk_log_probs, + new_finished_flags, + batch_size) + new_state = { + StateKeys.CUR_INDEX: state[StateKeys.CUR_INDEX] + 1 + } + new_state.update(alive_state) + new_state.update(finished_state) + return [new_state] + + finished_state = tf.nest.map_structure( + tf.stop_gradient, + tf.while_loop( + self._continue_search, + _generate_step, + loop_vars=[state], + shape_invariants=[state_shapes], + parallel_iterations=1)) + final_state = self._process_finished_state(finished_state[0]) + return final_state + + @abc.abstractmethod + def _create_initial_state(self, + initial_ids: tf.Tensor, + initial_cache: Dict[str, tf.Tensor], + batch_size: int) -> InitialState: + """Return initial state dictionary and its shape invariants.""" + pass + + @abc.abstractmethod + def _grow_alive_seq(self, + state: Dict[str, Any], + batch_size: int) -> InternalState: + """Grow alive sequences by one token. + + Args: + state: A dictionary with the current loop state. + batch_size: The given batch size + + Returns: + Tuple of + (Top sequences, + Scores of returned sequences, + New ids, + New alive cache) + """ + pass + + @abc.abstractmethod + def _get_new_alive_state( + self, + new_seq: tf.Tensor, + new_log_probs: tf.Tensor, + new_finished_flags: tf.Tensor, + new_cache: Dict[str, tf.Tensor]) -> Dict[str, Any]: + """Gather the sequences that are still alive. + + Args: + new_seq: New sequences generated by growing the current alive sequences + int32 tensor with shape + new_log_probs: Log probabilities of new sequences float32 tensor with + shape + new_finished_flags: A boolean Tensor indicates which sequences are live. + new_cache: Dict of cached values for each sequence. + + Returns: + Dictionary with alive keys from StateKeys. + """ + pass + + @abc.abstractmethod + def _get_new_finished_state(self, + state: Dict[str, Any], + new_seq: tf.Tensor, + new_log_probs: tf.Tensor, + new_finished_flags: tf.Tensor, + batch_size: int) -> Dict[str, tf.Tensor]: + """Combine new and old finished sequences. + + Args: + state: A dictionary with the current loop state. + new_seq: New sequences generated by growing the current alive sequences + int32 tensor. + new_log_probs: Log probabilities of new sequences float32 tensor with + shape. + new_finished_flags: A boolean Tensor indicates which sequences are live. + batch_size: The given batch size. + + Returns: + Dictionary with finished keys from StateKeys. 
+ """ + pass + + @abc.abstractmethod + def _process_finished_state(self, finished_state: Dict[str, Any]) -> Output: + """Process the alive/finished state to return final sequences and scores.""" + pass + + @abc.abstractmethod + def _continue_search(self, state: Dict[str, Any]) -> tf.Tensor: + """Returns a bool tensor if the decoding loop should continue.""" + pass + + @abc.abstractmethod + def _finished_flags(self, + topk_ids: tf.Tensor, + state: Dict[str, Any]) -> tf.Tensor: + """Calculate the finished flags.""" + pass + + def inf(self): + """Returns a value close to infinity, but is still finite in `dtype`. + + This is useful to get a very large value that is still zero when multiplied + by zero. The floating-point "Inf" value is NaN when multiplied by zero. + + Returns: + A very large value. + """ + if self.dtype == dtypes.float32 or self.dtype == dtypes.bfloat16: + return 1e7 + elif self.dtype == dtypes.float16: + return dtypes.float16.max + else: + raise AssertionError("Invalid dtype: %s" % self.dtype) + + + diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/sampling_module.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/sampling_module.py new file mode 100644 index 000000000..93fdcad44 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/sampling_module.py @@ -0,0 +1,447 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sampling module for top_k, top_p and greedy decoding.""" + +import abc +from typing import Any, Callable, Dict + +import numpy as np +import tensorflow as tf + +from nlp_modeling.ops import decoding_module + + +def greedy(log_probs): + """Returns the top ids and scores based on greedy decoding.""" + log_probs, ids = tf.math.top_k(log_probs, k=1) + return log_probs, ids + + +def sample_logits_with_temperature(logits, temperature): + """Applies a sampling temperature. + + Temperature skews the distribution towards high probability + tokens and lowers the mass in tail distribution. + + Args: + logits: Input logits for next token. + temperature: Tensor for specifying the sampling temperature. + + Returns: + Logits with applied temperature. + """ + return logits / temperature + + +def sample_top_k(logits, top_k): + """Chooses top_k logits and sets the others to negative infinity. + + Args: + logits: Input logits for next token. + top_k: Tensor to specify the top_k values. + + Returns: + Logits with top_k filtering applied. + """ + top_k_logits = tf.math.top_k(logits, k=top_k) + indices_to_remove = logits < tf.expand_dims(top_k_logits[0][..., -1], -1) + top_k_logits = set_tensor_by_indices_to_value(logits, indices_to_remove, + np.NINF) + return top_k_logits + + +def sample_top_p(logits, top_p): + """Chooses most probable logits with cumulative probabilities upto top_p. + + Sets the remaining logits to negative infinity. + + Args: + logits: Input logits for next token. 
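
A small standalone illustration (made-up logits) of the effect of sample_top_k: everything outside the k best logits is pushed to negative infinity so it can never be sampled.

    import tensorflow as tf

    logits = tf.constant([[2.0, 0.5, 1.0, -1.0]])
    top_k = tf.math.top_k(logits, k=2)
    filtered = tf.where(logits < top_k.values[..., -1:], float("-inf"), logits)
    print(filtered.numpy())  # [[  2. -inf   1. -inf]]
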
+ top_p: Float tensor with a value >=0 and < 1.0 + + Returns: + Logits with top_p filtering applied. + """ + sorted_indices = tf.argsort(logits, direction="DESCENDING") + # Flatten logits as tf.gather on TPU needs axis to be compile time constant. + logits_shape = decoding_module.shape_list(logits) + range_for_gather = tf.expand_dims(tf.range(0, logits_shape[0]), axis=1) + range_for_gather = tf.tile(range_for_gather * logits_shape[1], + [1, logits_shape[1]]) + sorted_indices + flattened_logits = tf.reshape(logits, [-1]) + flattened_sorted_indices = tf.reshape(range_for_gather, [-1]) + sorted_logits = tf.reshape( + tf.gather(flattened_logits, flattened_sorted_indices), + [logits_shape[0], logits_shape[1]]) + cumulative_probs = tf.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) + + # Remove tokens with cumulative probability above the threshold. + sorted_indices_to_remove = cumulative_probs > top_p + + # Shift the indices to the right to keep the first token above threshold. + sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) + sorted_indices_to_remove = tf.concat([ + tf.zeros_like(sorted_indices_to_remove[:, :1]), + sorted_indices_to_remove[:, 1:] + ], -1) + + # Scatter sorted indices to original indexes. + indices_to_remove = scatter_values_on_batch_indices( + sorted_indices_to_remove, sorted_indices) + top_p_logits = set_tensor_by_indices_to_value( + logits, indices_to_remove, np.NINF) + return top_p_logits + + +def scatter_values_on_batch_indices(values, batch_indices): + """Scatter `values` into a tensor using `batch_indices`. + + Args: + values: tensor of shape [batch_size, vocab_size] containing the values to + scatter + batch_indices: tensor of shape [batch_size, vocab_size] containing the + indices to insert (should be a permutation in range(0, n)) + + Returns: + Tensor of shape [batch_size, vocab_size] with values inserted at + batch_indices + """ + tensor_shape = decoding_module.shape_list(batch_indices) + broad_casted_batch_dims = tf.reshape( + tf.broadcast_to( + tf.expand_dims(tf.range(tensor_shape[0]), axis=-1), + tensor_shape), [1, -1]) + pair_indices = tf.transpose( + tf.concat([broad_casted_batch_dims, + tf.reshape(batch_indices, [1, -1])], 0)) + return tf.scatter_nd(pair_indices, + tf.reshape(values, [-1]), tensor_shape) + + +def set_tensor_by_indices_to_value(input_tensor, indices, value): + """Where indices is True, set the value in input_tensor to value. + + Args: + input_tensor: float (batch_size, dim) + indices: bool (batch_size, dim) + value: float scalar + Returns: + output_tensor: same shape as input_tensor. 
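
A hedged numeric sketch of nucleus (top-p) filtering as in sample_top_p, using an exclusive cumulative sum in place of the roll-by-one trick: the smallest set of tokens whose cumulative probability reaches top_p is kept (including the first token past the threshold), and the rest are masked.

    import tensorflow as tf

    logits = tf.constant([[2.0, 1.0, 0.0, -1.0]])     # already sorted descending
    probs = tf.nn.softmax(logits)                     # ~[0.64, 0.24, 0.09, 0.03]
    exclusive_cumulative = tf.cumsum(probs, axis=-1, exclusive=True)
    keep = exclusive_cumulative < 0.9                 # equivalent to the shift-right step
    filtered = tf.where(keep, logits, float("-inf"))
    print(filtered.numpy())                           # [[  2.   1.   0. -inf]]
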
+ """ + value_tensor = tf.zeros_like(input_tensor) + value + output_tensor = tf.where(indices, value_tensor, input_tensor) + return output_tensor + + +class SamplingModule(decoding_module.DecodingModule, metaclass=abc.ABCMeta): + """Implementation for sampling strategies (go/decoding-tf-nlp).""" + + def __init__(self, + symbols_to_logits_fn, + length_normalization_fn: Callable[[int, tf.DType], float], + vocab_size: int, + max_decode_length: int, + eos_id: int, + padded_decode: bool, + top_k=0, + top_p=1.0, + sample_temperature=0.0, + enable_greedy: bool = True, + dtype: tf.DType = tf.float32): + """Initialize sampling module.""" + self.symbols_to_logits_fn = symbols_to_logits_fn + self.length_normalization_fn = length_normalization_fn + self.eos_id = eos_id + self.padded_decode = padded_decode + self.dtype = tf.as_dtype(dtype) + self.vocab_size = tf.convert_to_tensor(vocab_size, dtype=tf.int32) + self.max_decode_length = max_decode_length + self.top_k = tf.convert_to_tensor(top_k, dtype=tf.int32) + self.top_p = tf.convert_to_tensor(top_p, dtype=tf.float32) + self.sample_temperature = tf.convert_to_tensor(sample_temperature, + dtype=tf.float32) + self.enable_greedy = enable_greedy + super(SamplingModule, self).__init__( + length_normalization_fn=length_normalization_fn, dtype=dtype) + + def _grow_alive_seq(self, + state: Dict[str, Any], + batch_size: int) -> decoding_module.InternalState: + """Grow alive sequences by one token. + + This function will implement the decoding strategies like top_p, top_k + and greedy for the choosing the next logit. + + Args: + state: A dictionary with the current loop state. + batch_size: The given batch size + + Returns: + Tuple of + (Top sequences [batch, curr_index + 1] or [batch, max_decode_length + 1], + Scores of returned sequences [batch, 1], + New ids [batch, 1], + New alive cache) + """ + i = state[decoding_module.StateKeys.CUR_INDEX] + alive_seq = state[decoding_module.StateKeys.ALIVE_SEQ] + alive_log_probs = state[decoding_module.StateKeys.ALIVE_LOG_PROBS] + alive_cache = state[decoding_module.StateKeys.ALIVE_CACHE] + + if self.padded_decode: + ids = tf.slice(alive_seq, [0, i], [batch_size, 1]) + else: + ids = alive_seq + + new_logits, new_cache = self.symbols_to_logits_fn(ids, i, alive_cache) + candidate_log_probs = decoding_module.log_prob_from_logits( + new_logits) + original_log_probs = candidate_log_probs + alive_log_probs + + topk_log_probs, topk_ids = None, None + if self.enable_greedy: + topk_log_probs, topk_ids = greedy(original_log_probs) + else: + temperature_fn = sample_logits_with_temperature + sampled_logits = tf.cond( + self.sample_temperature > 0.0, + lambda: temperature_fn(new_logits, self.sample_temperature), + lambda: new_logits) + sampled_logits = tf.cond( + self.top_k > 0, + lambda: sample_top_k(sampled_logits, self.top_k), + lambda: sampled_logits) + sampled_logits = tf.cond( + self.top_p < 1, + lambda: sample_top_p(sampled_logits, self.top_p), + lambda: sampled_logits) + topk_ids = tf.random.categorical( + sampled_logits, dtype=tf.int32, num_samples=1) + topk_log_probs = tf.gather( + original_log_probs, topk_ids, axis=1, batch_dims=1) + if self.padded_decode: + topk_seq = tf.transpose(alive_seq, perm=[1, 0]) + topk_seq = tf.tensor_scatter_nd_update( + topk_seq, [[i + 1]], tf.expand_dims(tf.squeeze(topk_ids, -1), 0)) + topk_seq = tf.transpose(topk_seq, perm=[1, 0]) + else: + topk_seq = tf.concat([alive_seq, topk_ids], axis=-1) + return topk_seq, topk_log_probs, topk_ids, new_cache + + def _create_initial_state(self, + 
initial_ids: tf.Tensor, + initial_cache: Dict[str, tf.Tensor], + batch_size: int) -> decoding_module.InitialState: + """Return initial state dictionary and its shape invariants.""" + for key, value in initial_cache.items(): + for inner_value in tf.nest.flatten(value): + if inner_value.dtype != self.dtype: + raise TypeError( + "initial_cache element for key '%s' has dtype %s that does not " + "match sampling_module's dtype of %s. Value: %s" % + (key, value.dtype.name, self.dtype.name, inner_value)) + + # Current loop index (starts at 0) + cur_index = tf.constant(0) + + # Alive sequence with shape [batch_size, 1] + alive_seq = initial_ids + alive_seq = tf.expand_dims(alive_seq, axis=-1) + if self.padded_decode: + alive_seq = tf.tile(alive_seq, [1, self.max_decode_length + 1]) + + # Initial log probabilities with shape [batch_size, 1]. + initial_log_probs = tf.constant([[0.]], dtype=self.dtype) + alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1]) + + alive_cache = initial_cache + + # Initialize tensor storing finished sequences [batch_size, 1, 1]. + finished_seq = tf.zeros(tf.shape(alive_seq), tf.int32) + + # Set scores of the initial finished seqs to negative infinity. + finished_scores = tf.zeros([batch_size, 1], dtype=self.dtype) + + # Initialize finished flags with all False values. + finished_flags = tf.zeros([batch_size, 1], tf.bool) + + # Create state dictionary and state shapes. + state = { + decoding_module.StateKeys.CUR_INDEX: cur_index, + decoding_module.StateKeys.ALIVE_SEQ: alive_seq, + decoding_module.StateKeys.ALIVE_LOG_PROBS: alive_log_probs, + decoding_module.StateKeys.ALIVE_CACHE: alive_cache, + decoding_module.StateKeys.FINISHED_SEQ: finished_seq, + decoding_module.StateKeys.FINISHED_SCORES: finished_scores, + decoding_module.StateKeys.FINISHED_FLAGS: finished_flags + } + + if self.padded_decode: + state_shape_invariants = { + decoding_module.StateKeys.CUR_INDEX: + tf.TensorShape([]), + decoding_module.StateKeys.ALIVE_SEQ: + tf.TensorShape( + [batch_size, self.max_decode_length + 1]), + decoding_module.StateKeys.ALIVE_LOG_PROBS: + tf.TensorShape([batch_size, 1]), + decoding_module.StateKeys.ALIVE_CACHE: + tf.nest.map_structure(lambda state: state.get_shape(), + alive_cache), + decoding_module.StateKeys.FINISHED_SEQ: + tf.TensorShape( + [batch_size, self.max_decode_length + 1]), + decoding_module.StateKeys.FINISHED_SCORES: + tf.TensorShape([batch_size, 1]), + decoding_module.StateKeys.FINISHED_FLAGS: + tf.TensorShape([batch_size, 1]) + } + else: + state_shape_invariants = { + decoding_module.StateKeys.CUR_INDEX: + tf.TensorShape([]), + decoding_module.StateKeys.ALIVE_SEQ: + tf.TensorShape([None, None]), + decoding_module.StateKeys.ALIVE_LOG_PROBS: + tf.TensorShape([None, 1]), + decoding_module.StateKeys.ALIVE_CACHE: + tf.nest.map_structure( + decoding_module.get_shape_keep_last_dim, + alive_cache), + decoding_module.StateKeys.FINISHED_SEQ: + tf.TensorShape([None, None]), + decoding_module.StateKeys.FINISHED_SCORES: + tf.TensorShape([None, 1]), + decoding_module.StateKeys.FINISHED_FLAGS: + tf.TensorShape([None, 1]) + } + + return state, state_shape_invariants + + def _get_new_alive_state( + self, + new_seq: tf.Tensor, + new_log_probs: tf.Tensor, + new_finished_flags: tf.Tensor, + new_cache: Dict[str, tf.Tensor]) -> Dict[str, Any]: + """Gather the sequences that are still alive. + + This function resets the sequences in the alive_state that are finished. 
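
A hedged sketch (made-up values, top-k/top-p filtering omitted) of the non-greedy path in SamplingModule._grow_alive_seq: temperature-scale the logits, then draw one token id per row with tf.random.categorical.

    import tensorflow as tf

    logits = tf.constant([[2.0, 1.0, 0.0]])
    sample_temperature = 0.7
    scaled = logits / sample_temperature              # sample_logits_with_temperature
    sampled_ids = tf.random.categorical(scaled, num_samples=1, dtype=tf.int32)
    print(sampled_ids.shape)                          # (1, 1)
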
+ + Args: + new_seq: New sequences generated by growing the current alive sequences + int32 tensor with shape [batch_size, cur_index + 1] + new_log_probs: Log probabilities of new sequences float32 tensor with + shape [batch_size, 1] + new_finished_flags: A boolean Tensor indicates which sequences are live + inside the beam. + new_cache: Dict of cached values for each sequence. + + Returns: + Dictionary with alive keys. + """ + new_seq = tf.multiply( + new_seq, tf.cast(tf.logical_not(new_finished_flags), new_seq.dtype)) + return { + decoding_module.StateKeys.ALIVE_SEQ: new_seq, + decoding_module.StateKeys.ALIVE_LOG_PROBS: new_log_probs, + decoding_module.StateKeys.ALIVE_CACHE: new_cache + } + + def _get_new_finished_state(self, + state: Dict[str, Any], + new_seq: tf.Tensor, + new_log_probs: tf.Tensor, + new_finished_flags: tf.Tensor, + batch_size: int) -> Dict[str, tf.Tensor]: + """Combine new and old finished sequences. + + Args: + state: A dictionary with the current loop state. + new_seq: New sequences generated by growing the current alive sequences + int32 tensor [batch, curr_index + 1] or [batch, max_decode_length + 1]. + new_log_probs: Log probabilities of new sequences float32 tensor with + shape [batch, 1]. + new_finished_flags: A boolean Tensor indicates which sequences are live. + batch_size: The given batch size. + + Returns: + Dictionary with finished keys from StateKeys. + """ + i = state[decoding_module.StateKeys.CUR_INDEX] + finished_seq = state[decoding_module.StateKeys.FINISHED_SEQ] + finished_scores = state[decoding_module.StateKeys.FINISHED_SCORES] + finished_flags = state[decoding_module.StateKeys.FINISHED_FLAGS] + + if not self.padded_decode: + finished_seq = tf.concat( + [finished_seq, tf.zeros([batch_size, 1], tf.int32)], axis=-1) + new_scores = new_log_probs + if self.length_normalization_fn is not None: + length_norm = self.length_normalization_fn(i + 1, self.dtype) + new_scores = new_log_probs / length_norm + new_seq = tf.multiply( + new_seq, tf.cast(tf.logical_not(finished_flags), new_seq.dtype)) + new_scores = tf.multiply( + new_scores, tf.cast(tf.logical_not(finished_flags), new_scores.dtype)) + + finished_seq += tf.multiply(new_seq, + tf.cast(new_finished_flags, new_seq.dtype)) + finished_scores += tf.multiply( + new_scores, tf.cast(new_finished_flags, new_scores.dtype)) + new_finished_flags = tf.logical_or(new_finished_flags, finished_flags) + return { + decoding_module.StateKeys.FINISHED_SEQ: finished_seq, + decoding_module.StateKeys.FINISHED_SCORES: finished_scores, + decoding_module.StateKeys.FINISHED_FLAGS: new_finished_flags + } + + def _process_finished_state( + self, finished_state: Dict[str, Any]) -> decoding_module.Output: + """Process the alive/finished state to return final sequences and scores.""" + alive_seq = finished_state[decoding_module.StateKeys.ALIVE_SEQ] + alive_log_probs = finished_state[decoding_module.StateKeys.ALIVE_LOG_PROBS] + finished_seq = finished_state[decoding_module.StateKeys.FINISHED_SEQ] + finished_scores = finished_state[decoding_module.StateKeys.FINISHED_SCORES] + finished_flags = finished_state[decoding_module.StateKeys.FINISHED_FLAGS] + finished_cond = tf.reduce_any(finished_flags, 1, name="finished_cond") + if self.length_normalization_fn is not None: + length_norm = self.length_normalization_fn(self.max_decode_length + 1, + self.dtype) + alive_log_probs = alive_log_probs / length_norm + seq_cond = decoding_module.expand_to_same_rank( + finished_cond, finished_seq) + score_cond = decoding_module.expand_to_same_rank( + 
finished_cond, finished_scores) + finished_seq = tf.where(seq_cond, finished_seq, alive_seq) + finished_scores = tf.where(score_cond, finished_scores, alive_log_probs) + return finished_seq, finished_scores + + def _continue_search(self, state) -> tf.Tensor: + i = state[decoding_module.StateKeys.CUR_INDEX] + # Have we reached max decoding length? + not_at_end = tf.less(i, self.max_decode_length) + # Have all sampled sequences reached an EOS? + all_has_eos = tf.reduce_all( + state[decoding_module.StateKeys.FINISHED_FLAGS], + axis=None, + name="search_finish_cond") + return tf.logical_and(not_at_end, tf.logical_not(all_has_eos)) + + def _finished_flags(self, topk_ids, state) -> tf.Tensor: + new_finished_flags = tf.equal(topk_ids, self.eos_id) + new_finished_flags = tf.logical_or( + new_finished_flags, state[decoding_module.StateKeys.FINISHED_FLAGS]) + return new_finished_flags diff --git a/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/segment_extractor.py b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/segment_extractor.py new file mode 100644 index 000000000..e01649e4a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/nlp_modeling/ops/segment_extractor.py @@ -0,0 +1,210 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module for extracting segments from sentences in documents.""" + +import tensorflow as tf + + +# Get a random tensor like `positions` and make some decisions +def _get_random(positions, random_fn): + flat_random = random_fn( + shape=tf.shape(positions.flat_values), + minval=0, + maxval=1, + dtype=tf.float32) + return positions.with_flat_values(flat_random) + + +# For every position j in a row, sample a position preceeding j or +# a position which is [0, j-1] +def _random_int_up_to(maxval, random_fn): + # Need to cast because the int kernel for uniform doesn't support bcast. + # We add one because maxval is exclusive, and this will get rounded down + # when we cast back to int. + float_maxval = tf.cast(maxval, tf.float32) + return tf.cast( + random_fn( + shape=tf.shape(maxval), + minval=tf.zeros_like(float_maxval), + maxval=float_maxval), + dtype=maxval.dtype) + + +def _random_int_from_range(minval, maxval, random_fn): + # Need to cast because the int kernel for uniform doesn't support bcast. + # We add one because maxval is exclusive, and this will get rounded down + # when we cast back to int. + float_minval = tf.cast(minval, tf.float32) + float_maxval = tf.cast(maxval, tf.float32) + return tf.cast( + random_fn(tf.shape(maxval), minval=float_minval, maxval=float_maxval), + maxval.dtype) + + +def _sample_from_other_batch(sentences, random_fn): + """Samples sentences from other batches.""" + # other_batch: [num_sentences]: The batch to sample from for each + # sentence. 
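As a side note on the sampling path in `_grow_alive_seq` above: when `enable_greedy` is False, the logits are optionally rescaled by `sample_temperature`, then masked by top-k and top-p before `tf.random.categorical` draws the next id. The snippet below is a minimal, self-contained sketch of that chain using plain TensorFlow ops on a toy batch; the helper `_mask_below_top_k` and all values are illustrative and are not part of this module.

import tensorflow as tf

def _mask_below_top_k(logits, k):
  # Keep the k largest logits per row; push everything else to the dtype minimum
  # so tf.random.categorical effectively never samples it.
  kth_largest = tf.math.top_k(logits, k=k).values[:, -1:]
  neg_inf = tf.fill(tf.shape(logits), logits.dtype.min)
  return tf.where(logits < kth_largest, neg_inf, logits)

logits = tf.constant([[4.0, 3.0, 1.0, 0.5]])  # [batch=1, vocab=4]
scaled = logits / 0.7                         # temperature > 0: sharpen or flatten the distribution
filtered = _mask_below_top_k(scaled, k=2)     # top-k masking (top-p masking omitted for brevity)
next_id = tf.random.categorical(filtered, num_samples=1, dtype=tf.int32)  # shape [batch, 1]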
+ other_batch = random_fn( + shape=[tf.size(sentences)], + minval=0, + maxval=sentences.nrows() - 1, + dtype=tf.int64) + + other_batch += tf.cast(other_batch >= sentences.value_rowids(), tf.int64) + + # other_sentence: [num_sentences]: The sentence within each batch + # that we sampled. + other_sentence = _random_int_up_to( + tf.gather(sentences.row_lengths(), other_batch), random_fn) + return sentences.with_values(tf.stack([other_batch, other_sentence], axis=1)) + + +def get_sentence_order_labels(sentences, + random_threshold=0.5, + random_next_threshold=0.5, + random_fn=tf.random.uniform): + """Extract segments and labels for sentence order prediction (SOP) task. + + Extracts the segment and labels for the sentence order prediction task + defined in "ALBERT: A Lite BERT for Self-Supervised Learning of Language + Representations" (https://arxiv.org/pdf/1909.11942.pdf) + + Args: + sentences: a `RaggedTensor` of shape [batch, (num_sentences)] with string + dtype. + random_threshold: (optional) A float threshold between 0 and 1, used to + determine whether to extract a random, out-of-batch sentence or a + suceeding sentence. Higher value favors succeeding sentence. + random_next_threshold: (optional) A float threshold between 0 and 1, used to + determine whether to extract either a random, out-of-batch, or succeeding + sentence or a preceeding sentence. Higher value favors preceeding + sentences. + random_fn: (optional) An op used to generate random float values. + + Returns: + a tuple of (preceeding_or_random_next, is_suceeding_or_random) where: + preceeding_or_random_next: a `RaggedTensor` of strings with the same shape + as `sentences` and contains either a preceeding, suceeding, or random + out-of-batch sentence respective to its counterpart in `sentences` and + dependent on its label in `is_preceeding_or_random_next`. + is_suceeding_or_random: a `RaggedTensor` of bool values with the + same shape as `sentences` and is True if it's corresponding sentence in + `preceeding_or_random_next` is a random or suceeding sentence, False + otherwise. + """ + # Create a RaggedTensor in the same shape as sentences ([doc, (sentences)]) + # whose values are index positions. + positions = tf.ragged.range(sentences.row_lengths()) + + row_lengths_broadcasted = tf.expand_dims(positions.row_lengths(), + -1) + 0 * positions + row_lengths_broadcasted_flat = row_lengths_broadcasted.flat_values + + # Generate indices for all preceeding, succeeding and random. + # For every position j in a row, sample a position preceeding j or + # a position which is [0, j-1] + all_preceding = tf.ragged.map_flat_values(_random_int_up_to, positions, + random_fn) + + # For every position j, sample a position following j, or a position + # which is [j, row_max] + all_succeeding = positions.with_flat_values( + tf.ragged.map_flat_values(_random_int_from_range, + positions.flat_values + 1, + row_lengths_broadcasted_flat, random_fn)) + + # Convert to format that is convenient for `gather_nd` + rows_broadcasted = tf.expand_dims(tf.range(sentences.nrows()), + -1) + 0 * positions + all_preceding_nd = tf.stack([rows_broadcasted, all_preceding], -1) + all_succeeding_nd = tf.stack([rows_broadcasted, all_succeeding], -1) + all_random_nd = _sample_from_other_batch(positions, random_fn) + + # There's a few spots where there is no "preceding" or "succeeding" item (e.g. + # first and last sentences in a document). Mark where these are and we will + # patch them up to grab a random sentence from another document later. 
+ all_zeros = tf.zeros_like(positions) + all_ones = tf.ones_like(positions) + valid_preceding_mask = tf.cast( + tf.concat([all_zeros[:, :1], all_ones[:, 1:]], -1), tf.bool) + valid_succeeding_mask = tf.cast( + tf.concat([all_ones[:, :-1], all_zeros[:, -1:]], -1), tf.bool) + + # Decide what to use for the segment: (1) random, out-of-batch, (2) preceeding + # item, or (3) succeeding. + # Should get out-of-batch instead of succeeding item + should_get_random = ((_get_random(positions, random_fn) > random_threshold) + | tf.logical_not(valid_succeeding_mask)) + random_or_succeeding_nd = tf.compat.v1.where(should_get_random, all_random_nd, + all_succeeding_nd) + # Choose which items should get a random succeeding item. Force positions that + # don't have a valid preceeding items to get a random succeeding item. + should_get_random_or_succeeding = ( + (_get_random(positions, random_fn) > random_next_threshold) + | tf.logical_not(valid_preceding_mask)) + gather_indices = tf.compat.v1.where(should_get_random_or_succeeding, + random_or_succeeding_nd, all_preceding_nd) + return (tf.gather_nd(sentences, + gather_indices), should_get_random_or_succeeding) + + +def get_next_sentence_labels(sentences, + random_threshold=0.5, + random_fn=tf.random.uniform): + """Extracts the next sentence label from sentences. + + Args: + sentences: A `RaggedTensor` of strings w/ shape [batch, (num_sentences)]. + random_threshold: (optional) A float threshold between 0 and 1, used to + determine whether to extract a random sentence or the immediate next + sentence. Higher value favors next sentence. + random_fn: (optional) An op used to generate random float values. + + Returns: + A tuple of (next_sentence_or_random, is_next_sentence) where: + + next_sentence_or_random: A `Tensor` with shape [num_sentences] that + contains either the subsequent sentence of `segment_a` or a randomly + injected sentence. + is_next_sentence: A `Tensor` of bool w/ shape [num_sentences] + that contains whether or not `next_sentence_or_random` is truly a + subsequent sentence or not. + """ + # shift everyone to get the next sentence predictions positions + positions = tf.ragged.range(sentences.row_lengths()) + + # Shift every position down to the right. + next_sentences_pos = (positions + 1) % tf.expand_dims(sentences.row_lengths(), + 1) + rows_broadcasted = tf.expand_dims(tf.range(sentences.nrows()), + -1) + 0 * positions + next_sentences_pos_nd = tf.stack([rows_broadcasted, next_sentences_pos], -1) + all_random_nd = _sample_from_other_batch(positions, random_fn) + + # Mark the items that don't have a next sentence (e.g. the last + # sentences in the document). We will patch these up and force them to grab a + # random sentence from a random document. + valid_next_sentences = tf.cast( + tf.concat([ + tf.ones_like(positions)[:, :-1], + tf.zeros([positions.nrows(), 1], dtype=tf.int64) + ], -1), tf.bool) + + is_random = ((_get_random(positions, random_fn) > random_threshold) + | tf.logical_not(valid_next_sentences)) + gather_indices = tf.compat.v1.where(is_random, all_random_nd, + next_sentences_pos_nd) + return tf.gather_nd(sentences, gather_indices), tf.logical_not(is_random) diff --git a/nlp/text_classification/bert/tensorflow2.0/optimization.py b/nlp/text_classification/bert/tensorflow2.0/optimization.py new file mode 100644 index 000000000..e975d6808 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/optimization.py @@ -0,0 +1,231 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
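The index plumbing in `segment_extractor.py` above leans on two RaggedTensor idioms: broadcasting a per-row value to every position via `+ 0 * positions`, and looking up sentences with `tf.gather_nd` on stacked `[row, position]` indices. Here is a small, self-contained sketch of both; the toy `sentences` batch is made up for illustration.

import tensorflow as tf

sentences = tf.ragged.constant([["a1", "a2", "a3"], ["b1", "b2"]])
positions = tf.ragged.range(sentences.row_lengths())  # [[0, 1, 2], [0, 1]]

# `+ 0 * positions` broadcasts a [num_rows, 1] tensor to the ragged shape, so every
# position in a row gets a copy of that row's value (here, the row index itself).
rows = tf.expand_dims(tf.range(sentences.nrows()), -1) + 0 * positions  # [[0, 0, 0], [1, 1]]

# Position of each sentence's successor, wrapping around at the end of the document.
next_pos = (positions + 1) % tf.expand_dims(sentences.row_lengths(), 1)
indices = tf.stack([rows, next_pos], -1)
print(tf.gather_nd(sentences, indices))  # [["a2", "a3", "a1"], ["b2", "b1"]]

This mirrors what `get_next_sentence_labels` does before some of the "next" positions are swapped out for random out-of-batch sentences.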
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Functions and classes related to optimization (weight updates).""" + +import re + +from absl import logging +import gin +import tensorflow as tf +# import tensorflow_addons.optimizers as tfa_optimizers + + +class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule): + """Applies a warmup schedule on a given learning rate decay schedule.""" + + def __init__(self, + initial_learning_rate, + decay_schedule_fn, + warmup_steps, + power=1.0, + name=None): + super(WarmUp, self).__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or 'WarmUp') as name: + # Implements polynomial warmup. i.e., if global_step < warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = ( + self.initial_learning_rate * + tf.math.pow(warmup_percent_done, self.power)) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: self.decay_schedule_fn(step), + name=name) + + def get_config(self): + return { + 'initial_learning_rate': self.initial_learning_rate, + 'decay_schedule_fn': self.decay_schedule_fn, + 'warmup_steps': self.warmup_steps, + 'power': self.power, + 'name': self.name + } + + +@gin.configurable +def create_optimizer(init_lr, + num_train_steps, + num_warmup_steps, + end_lr=0.0, + optimizer_type='adamw', + beta_1=0.9): + """Creates an optimizer with learning rate schedule.""" + # Implements linear decay of the learning rate. + lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( + initial_learning_rate=init_lr, + decay_steps=num_train_steps, + end_learning_rate=end_lr) + if num_warmup_steps: + lr_schedule = WarmUp( + initial_learning_rate=init_lr, + decay_schedule_fn=lr_schedule, + warmup_steps=num_warmup_steps) + + if optimizer_type == 'adamw': + logging.info('using Adamw optimizer') + optimizer = AdamWeightDecay( + learning_rate=lr_schedule, + weight_decay_rate=0.01, + beta_1=beta_1, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias'], + jit_compile=False) +# elif optimizer_type == 'lamb': +# logging.info('using Lamb optimizer') +# optimizer = tfa_optimizers.LAMB( +# learning_rate=lr_schedule, +# weight_decay_rate=0.01, +# beta_1=beta_1, +# beta_2=0.999, +# epsilon=1e-6, +# exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) + else: + raise ValueError('Unsupported optimizer type: ', optimizer_type) + + return optimizer + + +class AdamWeightDecay(tf.keras.optimizers.Adam): + """Adam enables L2 weight decay and clip_by_global_norm on gradients. 
+ + Just adding the square of the weights to the loss function is *not* the + correct way of using L2 regularization/weight decay with Adam, since that will + interact with the m and v parameters in strange ways. + + Instead we want to decay the weights in a manner that doesn't interact with + the m/v parameters. This is equivalent to adding the square of the weights to + the loss with plain (non-momentum) SGD. + """ + + def __init__(self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + weight_decay_rate=0.0, + include_in_weight_decay=None, + exclude_from_weight_decay=None, + gradient_clip_norm=1.0, + name='AdamWeightDecay', + **kwargs): + super(AdamWeightDecay, self).__init__(learning_rate, beta_1, beta_2, + epsilon, amsgrad, name=name, **kwargs) + self.weight_decay_rate = weight_decay_rate + self.gradient_clip_norm = gradient_clip_norm + self._include_in_weight_decay = include_in_weight_decay + self._exclude_from_weight_decay = exclude_from_weight_decay + logging.info('gradient_clip_norm=%f', gradient_clip_norm) + + @classmethod + def from_config(cls, config): + """Creates an optimizer from its config with WarmUp custom object.""" + custom_objects = {'WarmUp': WarmUp} + return super(AdamWeightDecay, cls).from_config( + config, custom_objects=custom_objects) + + def _prepare_local(self, var_device, var_dtype, apply_state): + super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, + apply_state) + apply_state[(var_device, var_dtype)]['weight_decay_rate'] = tf.constant( + self.weight_decay_rate, name='adam_weight_decay_rate') + + def _decay_weights_op(self, var, learning_rate, apply_state): + do_decay = self._do_use_weight_decay(var.name) + if do_decay: + return var.assign_sub( + learning_rate * var * + apply_state[(var.device, var.dtype.base_dtype)]['weight_decay_rate'], + use_locking=self._use_locking) + return tf.no_op() + + def apply_gradients(self, + grads_and_vars, + name=None, + experimental_aggregate_gradients=True): + grads, tvars = list(zip(*grads_and_vars)) + if experimental_aggregate_gradients and self.gradient_clip_norm > 0.0: + # when experimental_aggregate_gradients = False, apply_gradients() no + # longer implicitly allreduce gradients, users manually allreduce gradient + # and passed the allreduced grads_and_vars. For now, the + # clip_by_global_norm will be moved to before the explicit allreduce to + # keep the math the same as TF 1 and pre TF 2.2 implementation. 
+ (grads, _) = tf.clip_by_global_norm( + grads, clip_norm=self.gradient_clip_norm) + return super(AdamWeightDecay, self).apply_gradients( + zip(grads, tvars), + name=name, + experimental_aggregate_gradients=experimental_aggregate_gradients) + + def _get_lr(self, var_device, var_dtype, apply_state): + """Retrieves the learning rate with the given state.""" + if apply_state is None: + return self._decayed_lr_t[var_dtype], {} + + apply_state = apply_state or {} + coefficients = apply_state.get((var_device, var_dtype)) + if coefficients is None: + coefficients = self._fallback_apply_state(var_device, var_dtype) + apply_state[(var_device, var_dtype)] = coefficients + + return coefficients['lr_t'], dict(apply_state=apply_state) + + def _resource_apply_dense(self, grad, var, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super(AdamWeightDecay, + self)._resource_apply_dense(grad, var, **kwargs) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super(AdamWeightDecay, + self)._resource_apply_sparse(grad, var, indices, **kwargs) + + def get_config(self): + config = super(AdamWeightDecay, self).get_config() + config.update({ + 'weight_decay_rate': self.weight_decay_rate, + }) + return config + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if self.weight_decay_rate == 0: + return False + + if self._include_in_weight_decay: + for r in self._include_in_weight_decay: + if re.search(r, param_name) is not None: + return True + + if self._exclude_from_weight_decay: + for r in self._exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/LICENSE b/nlp/text_classification/bert/tensorflow2.0/orbit/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
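Returning briefly to `optimization.py` above: `create_optimizer` chains a linear `PolynomialDecay` schedule behind the `WarmUp` wrapper and hands the result to `AdamWeightDecay`. A minimal usage sketch, assuming the file is importable as `optimization`; the step counts and learning rate are illustrative.

import tensorflow as tf
import optimization

optimizer = optimization.create_optimizer(
    init_lr=3e-5,           # peak learning rate reached at the end of warmup
    num_train_steps=10000,  # total steps over which the rate decays to end_lr
    num_warmup_steps=1000,  # linear warmup from 0 to init_lr
    optimizer_type='adamw')

# The warmup schedule can also be probed directly.
schedule = optimization.WarmUp(
    initial_learning_rate=3e-5,
    decay_schedule_fn=tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=3e-5, decay_steps=10000, end_learning_rate=0.0),
    warmup_steps=1000)
print(schedule(500))  # inside warmup with power=1.0: 3e-5 * 500 / 1000 = 1.5e-5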
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/README.md b/nlp/text_classification/bert/tensorflow2.0/orbit/README.md new file mode 100644 index 000000000..941286003 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/README.md @@ -0,0 +1,11 @@ +# Orbit + +Orbit is a flexible, lightweight library designed to make it easy to write +[custom training loops][custom_training] in TensorFlow 2. Orbit handles common +model training tasks such as saving checkpoints, running model evaluations, and +setting up summary writing, while giving users full control over implementing +the inner training loop. It integrates with `tf.distribute` seamlessly and +supports running on different device types (CPU, GPU, and TPU). The core code is +intended to be easy to read and fork. 
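The README above describes the division of labor: Orbit's `Controller` handles checkpointing, evaluation cadence, and summary writing, while user code owns the inner training loop. As a rough sketch of that pattern (the toy trainer, its metric, and the step counts are invented for illustration, and the `AbstractTrainer.train(num_steps)` interface is assumed from the `orbit.runner` exports listed just below):

import tensorflow as tf
import orbit

class ToyTrainer(orbit.AbstractTrainer):
  """A do-nothing trainer that just advances `global_step` and reports a metric."""

  def __init__(self):
    super().__init__()
    self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
    self.loss = tf.keras.metrics.Mean('loss')

  def train(self, num_steps):
    # The inner loop is entirely user-owned; Orbit only decides how often to call it.
    for _ in tf.range(num_steps):
      self.global_step.assign_add(1)
      self.loss.update_state(1.0 / tf.cast(self.global_step, tf.float32))
    return {'loss': self.loss.result()}

trainer = ToyTrainer()
controller = orbit.Controller(
    trainer=trainer, global_step=trainer.global_step, steps_per_loop=10)
controller.train(steps=100)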
+ +[custom_training]: https://www.tensorflow.org/tutorials/distribute/custom_training diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/__init__.py b/nlp/text_classification/bert/tensorflow2.0/orbit/__init__.py new file mode 100644 index 000000000..01442a565 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines exported symbols for the `orbit` package.""" + +from orbit import actions +from orbit import utils + +from orbit.controller import Action +from orbit.controller import Controller + +from orbit.runner import AbstractEvaluator +from orbit.runner import AbstractTrainer + +from orbit.standard_runner import StandardEvaluator +from orbit.standard_runner import StandardEvaluatorOptions +from orbit.standard_runner import StandardTrainer +from orbit.standard_runner import StandardTrainerOptions diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/__init__.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/__init__.py new file mode 100644 index 000000000..5c3eab2d8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines an "action" abstraction for use with `orbit.Controller`. + +"Actions" are simply arbitrary callables that are applied by the `Controller` +to the output of train steps (after each inner loop of `steps_per_loop` steps) +or an evaluation. This provides a hook mechanism, enabling things like reporting +metrics to Vizier, model exporting, additional logging, etc. + +The basic `Action` abstraction (just a type alias) is defined in the +`controller` module. This `actions` module adds a `ConditionalAction` utility +class to make it easy to trigger actions conditionally based on reusable +predicates, as well as a small handful of predefined conditions/actions (in +particular, a `NewBestMetric` condition and an `ExportSavedModel` action). 
+ +One example of using actions to do metric-conditional export: + + new_best_metric = orbit.actions.NewBestMetric('accuracy') + export_action = orbit.actions.ConditionalAction( + condition=lambda x: x['accuracy'] > 0.9 and new_best_metric(x), + action=orbit.actions.ExportSavedModel( + model, + orbit.actions.ExportFileManager( + base_name=f'{FLAGS.model_dir}/saved_model', + next_id_fn=trainer.global_step.numpy), + signatures=model.infer)) + + controller = orbit.Controller( + strategy=strategy, + trainer=trainer, + evaluator=evaluator, + eval_actions=[export_action], + global_step=trainer.global_step, + steps_per_loop=FLAGS.steps_per_loop, + checkpoint_manager=checkpoint_manager, + summary_interval=1000) + +Note: In multi-client settings where each client runs its own `Controller` +instance, some care should be taken in deciding which clients should run certain +actions. Isolating actions to an individual client (say client 0) can be +achieved using `ConditionalAction` as follows: + + client_0_actions = orbit.actions.ConditionalAction( + condition=lambda _: client_id() == 0, + action=[ + ... + ]) + +In particular, the `NewBestMetric` condition may be used in multi-client +settings if all clients are guaranteed to compute the same metric (ensuring this +is up to client code, not Orbit). However, when saving metrics it may be helpful +to avoid unnecessary writes by setting the `write_value` parameter to `False` +for most clients. +""" + +from orbit.actions.conditional_action import ConditionalAction + +from orbit.actions.export_saved_model import ExportFileManager +from orbit.actions.export_saved_model import ExportSavedModel + +from orbit.actions.new_best_metric import JSONPersistedValue +from orbit.actions.new_best_metric import NewBestMetric diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action.py new file mode 100644 index 000000000..e4b812227 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action.py @@ -0,0 +1,60 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides a `ConditionalAction` abstraction.""" + +from typing import Any, Callable, Sequence, Union + +from orbit import controller +from orbit import runner + +import tensorflow as tf + +Condition = Callable[[runner.Output], Union[bool, tf.Tensor]] + + +def _as_sequence(maybe_sequence: Union[Any, Sequence[Any]]) -> Sequence[Any]: + if isinstance(maybe_sequence, Sequence): + return maybe_sequence + return [maybe_sequence] + + +class ConditionalAction: + """Represents an action that is only taken when a given condition is met. + + This class is itself an `Action` (a callable that can be applied to train or + eval outputs), but is intended to make it easier to write modular and reusable + conditions by decoupling "when" something whappens (the condition) from "what" + happens (the action). 
+ """ + + def __init__( + self, + condition: Condition, + action: Union[controller.Action, Sequence[controller.Action]], + ): + """Initializes the instance. + + Args: + condition: A callable accepting train or eval outputs and returing a bool. + action: The action (or optionally sequence of actions) to perform when + `condition` is met. + """ + self.condition = condition + self.action = action + + def __call__(self, output: runner.Output) -> None: + if self.condition(output): + for action in _as_sequence(self.action): + action(output) diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action_test.py new file mode 100644 index 000000000..cfcfd0f54 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/conditional_action_test.py @@ -0,0 +1,39 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for orbit.actions.conditional_action.""" + +from orbit import actions + +import tensorflow as tf + + +class ConditionalActionTest(tf.test.TestCase): + + def test_conditional_action(self): + # Define a function to raise an AssertionError, since we can't in a lambda. + def raise_assertion(arg): + raise AssertionError(str(arg)) + + conditional_action = actions.ConditionalAction( + condition=lambda x: x['value'], action=raise_assertion) + + conditional_action({'value': False}) # Nothing is raised. + with self.assertRaises(AssertionError) as ctx: + conditional_action({'value': True}) + self.assertEqual(ctx.exception.message, "{'value': True}") + + +if __name__ == '__main__': + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model.py new file mode 100644 index 000000000..e53c40c38 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model.py @@ -0,0 +1,137 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Provides the `ExportSavedModel` action and associated helper classes.""" + +import re + +from typing import Callable, Optional + +import tensorflow as tf + + +def _id_key(filename): + _, id_num = filename.rsplit('-', maxsplit=1) + return int(id_num) + + +def _find_managed_files(base_name): + r"""Returns all files matching '{base_name}-\d+', in sorted order.""" + managed_file_regex = re.compile(rf'{re.escape(base_name)}-\d+$') + filenames = tf.io.gfile.glob(f'{base_name}-*') + filenames = filter(managed_file_regex.match, filenames) + return sorted(filenames, key=_id_key) + + +class _CounterIdFn: + """Implements a counter-based ID function for `ExportFileManager`.""" + + def __init__(self, base_name: str): + managed_files = _find_managed_files(base_name) + self.value = _id_key(managed_files[-1]) + 1 if managed_files else 0 + + def __call__(self): + output = self.value + self.value += 1 + return output + + +class ExportFileManager: + """Utility class that manages a group of files with a shared base name. + + For actions like SavedModel exporting, there are potentially many different + file naming and cleanup strategies that may be desirable. This class provides + a basic interface allowing SavedModel export to be decoupled from these + details, and a default implementation that should work for many basic + scenarios. Users may subclass this class to alter behavior and define more + customized naming and cleanup strategies. + """ + + def __init__(self, + base_name: str, + max_to_keep: int = 5, + next_id_fn: Optional[Callable[[], int]] = None): + """Initializes the instance. + + Args: + base_name: A shared base name for file names generated by this class. + max_to_keep: The maximum number of files matching `base_name` to keep + after each call to `cleanup`. The most recent (as determined by file + modification time) `max_to_keep` files are preserved; the rest are + deleted. If < 0, all files are preserved. + next_id_fn: An optional callable that returns integer IDs to append to + base name (formatted as `'{base_name}-{id}'`). The order of integers is + used to sort files to determine the oldest ones deleted by `clean_up`. + If not supplied, a default ID based on an incrementing counter is used. + One common alternative maybe be to use the current global step count, + for instance passing `next_id_fn=global_step.numpy`. + """ + self._base_name = base_name + self._max_to_keep = max_to_keep + self._next_id_fn = next_id_fn or _CounterIdFn(base_name) + + @property + def managed_files(self): + """Returns all files managed by this instance, in sorted order. + + Returns: + The list of files matching the `base_name` provided when constructing this + `ExportFileManager` instance, sorted in increasing integer order of the + IDs returned by `next_id_fn`. + """ + return _find_managed_files(self._base_name) + + def clean_up(self): + """Cleans up old files matching `{base_name}-*`. + + The most recent `max_to_keep` files are preserved. + """ + if self._max_to_keep < 0: + return + + for filename in self.managed_files[:-self._max_to_keep]: + tf.io.gfile.rmtree(filename) + + def next_name(self) -> str: + """Returns a new file name based on `base_name` and `next_id_fn()`.""" + return f'{self._base_name}-{self._next_id_fn()}' + + +class ExportSavedModel: + """Action that exports the given model as a SavedModel.""" + + def __init__(self, + model: tf.Module, + file_manager: ExportFileManager, + signatures, + options: Optional[tf.saved_model.SaveOptions] = None): + """Initializes the instance. 
+ + Args: + model: The model to export. + file_manager: An instance of `ExportFileManager` (or a subclass), that + provides file naming and cleanup functionality. + signatures: The signatures to forward to `tf.saved_model.save()`. + options: Optional options to forward to `tf.saved_model.save()`. + """ + self.model = model + self.file_manager = file_manager + self.signatures = signatures + self.options = options + + def __call__(self, _): + """Exports the SavedModel.""" + export_dir = self.file_manager.next_name() + tf.saved_model.save(self.model, export_dir, self.signatures, self.options) + self.file_manager.clean_up() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model_test.py new file mode 100644 index 000000000..191f6fdb5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/export_saved_model_test.py @@ -0,0 +1,157 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for orbit.actions.export_saved_model.""" + +import os + +from orbit import actions + +import tensorflow as tf + + +def _id_key(name): + _, id_num = name.rsplit('-', maxsplit=1) + return int(id_num) + + +def _id_sorted_file_base_names(dir_path): + return sorted(tf.io.gfile.listdir(dir_path), key=_id_key) + + +class TestModel(tf.Module): + + def __init__(self): + self.value = tf.Variable(0) + + @tf.function(input_signature=[]) + def __call__(self): + return self.value + + +class ExportSavedModelTest(tf.test.TestCase): + + def test_export_file_manager_default_ids(self): + directory = self.create_tempdir() + base_name = os.path.join(directory.full_path, 'basename') + manager = actions.ExportFileManager(base_name, max_to_keep=3) + self.assertLen(tf.io.gfile.listdir(directory.full_path), 0) + directory.create_file(manager.next_name()) + manager.clean_up() # Shouldn't do anything... + self.assertLen(tf.io.gfile.listdir(directory.full_path), 1) + directory.create_file(manager.next_name()) + manager.clean_up() # Shouldn't do anything... + self.assertLen(tf.io.gfile.listdir(directory.full_path), 2) + directory.create_file(manager.next_name()) + manager.clean_up() # Shouldn't do anything... + self.assertLen(tf.io.gfile.listdir(directory.full_path), 3) + directory.create_file(manager.next_name()) + self.assertLen(tf.io.gfile.listdir(directory.full_path), 4) + self.assertEqual( + _id_sorted_file_base_names(directory.full_path), + ['basename-0', 'basename-1', 'basename-2', 'basename-3']) + manager.clean_up() # Should delete file with lowest ID. 
+ self.assertEqual( + _id_sorted_file_base_names(directory.full_path), + ['basename-1', 'basename-2', 'basename-3']) + manager = actions.ExportFileManager(base_name, max_to_keep=3) + self.assertEqual(os.path.basename(manager.next_name()), 'basename-4') + + def test_export_file_manager_custom_ids(self): + directory = self.create_tempdir() + base_name = os.path.join(directory.full_path, 'basename') + + id_num = 0 + + def next_id(): + return id_num + + manager = actions.ExportFileManager( + base_name, max_to_keep=2, next_id_fn=next_id) + self.assertLen(tf.io.gfile.listdir(directory.full_path), 0) + id_num = 30 + directory.create_file(manager.next_name()) + self.assertLen(tf.io.gfile.listdir(directory.full_path), 1) + manager.clean_up() # Shouldn't do anything... + self.assertEqual( + _id_sorted_file_base_names(directory.full_path), ['basename-30']) + id_num = 200 + directory.create_file(manager.next_name()) + self.assertLen(tf.io.gfile.listdir(directory.full_path), 2) + manager.clean_up() # Shouldn't do anything... + self.assertEqual( + _id_sorted_file_base_names(directory.full_path), + ['basename-30', 'basename-200']) + id_num = 1000 + directory.create_file(manager.next_name()) + self.assertLen(tf.io.gfile.listdir(directory.full_path), 3) + self.assertEqual( + _id_sorted_file_base_names(directory.full_path), + ['basename-30', 'basename-200', 'basename-1000']) + manager.clean_up() # Should delete file with lowest ID. + self.assertLen(tf.io.gfile.listdir(directory.full_path), 2) + self.assertEqual( + _id_sorted_file_base_names(directory.full_path), + ['basename-200', 'basename-1000']) + + def test_export_file_manager_managed_files(self): + directory = self.create_tempdir() + directory.create_file('basename-5') + directory.create_file('basename-10') + directory.create_file('basename-50') + directory.create_file('basename-1000') + directory.create_file('basename-9') + directory.create_file('basename-10-suffix') + base_name = os.path.join(directory.full_path, 'basename') + manager = actions.ExportFileManager(base_name, max_to_keep=3) + self.assertLen(manager.managed_files, 5) + self.assertEqual(manager.next_name(), f'{base_name}-1001') + manager.clean_up() + self.assertEqual( + manager.managed_files, + [f'{base_name}-10', f'{base_name}-50', f'{base_name}-1000']) + + def test_export_saved_model(self): + directory = self.create_tempdir() + base_name = os.path.join(directory.full_path, 'basename') + file_manager = actions.ExportFileManager(base_name, max_to_keep=2) + model = TestModel() + export_action = actions.ExportSavedModel( + model, file_manager=file_manager, signatures=model.__call__) + + model.value.assign(3) + self.assertEqual(model(), 3) + self.assertEmpty(file_manager.managed_files) + export_action({}) + self.assertLen(file_manager.managed_files, 1) + reloaded_model = tf.saved_model.load(file_manager.managed_files[-1]) + self.assertEqual(reloaded_model(), 3) + + model.value.assign(5) + self.assertEqual(model(), 5) + export_action({}) + self.assertLen(file_manager.managed_files, 2) + reloaded_model = tf.saved_model.load(file_manager.managed_files[-1]) + self.assertEqual(reloaded_model(), 5) + + model.value.assign(7) + self.assertEqual(model(), 7) + export_action({}) + self.assertLen(file_manager.managed_files, 2) # Still 2, due to clean up. 
+ reloaded_model = tf.saved_model.load(file_manager.managed_files[-1]) + self.assertEqual(reloaded_model(), 7) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric.py new file mode 100644 index 000000000..f2a01c80f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric.py @@ -0,0 +1,222 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides the `NewBestMetric` condition and associated helper classes.""" + +import json +import os +import sys +from typing import Any, Callable, Optional, Union +import uuid + +from orbit import runner +from orbit import utils + +import tensorflow as tf + +MetricFn = Callable[[runner.Output], Union[float, tf.Tensor]] + + +class NewBestMetric: + """Condition that is satisfied when a new best metric is achieved. + + This class keeps track of the best metric value seen so far, optionally in a + persistent (preemption-safe) way. + + Two methods are provided, which each satisfy the `Action` protocol: `test` for + only testing whether a new best metric is achieved by a given train/eval + output, and `commit`, which both tests and records the new best metric value + if it is achieved. These separate methods enable the same `NewBestMetric` + instance to be reused as a condition multiple times, and can also provide + additional preemption/failure safety. For example, to avoid updating the best + metric if a model export fails or is pre-empted: + + new_best_metric = orbit.actions.NewBestMetric( + 'accuracy', filename='/model/dir/best_metric') + action = orbit.actions.ConditionalAction( + condition=new_best_metric.test, + action=[ + orbit.actions.ExportSavedModel(...), + new_best_metric.commit + ]) + + The default `__call__` implementation is equivalent to `commit`. + + This class is safe to use in multi-client settings if all clients can be + guaranteed to compute the same metric. However when saving metrics it may be + helpful to avoid unnecessary writes by setting the `write_value` parameter to + `False` for most clients. + + Attributes: + metric: The metric passed to __init__ (may be a string key or a callable + that can be applied to train/eval output). + higher_is_better: Whether higher metric values are better. + """ + + def __init__(self, + metric: Union[str, MetricFn], + higher_is_better: bool = True, + filename: Optional[str] = None, + write_metric=True): + """Initializes the instance. + + Args: + metric: Either a string key name to use to look up a metric (assuming the + train/eval output is a dictionary), or a callable that accepts the + train/eval output and returns a metric value. + higher_is_better: Whether higher metric values are better. If `True`, a + new best metric is achieved when the metric value is strictly greater + than the previous best metric. 
If `False`, a new best metric is achieved + when the metric value is strictly less than the previous best metric. + filename: A filename to use for storage of the best metric value seen so + far, to allow peristence of the value across preemptions. If `None` + (default), values aren't persisted. + write_metric: If `filename` is set, this controls whether this instance + will write new best metric values to the file, or just read from the + file to obtain the initial value. Setting this to `False` for most + clients in some multi-client setups can avoid unnecessary file writes. + Has no effect if `filename` is `None`. + """ + self.metric = metric + self.higher_is_better = higher_is_better + float_max = sys.float_info.max + self._best_value = JSONPersistedValue( + initial_value=-float_max if higher_is_better else float_max, + filename=filename, + write_value=write_metric) + + def __call__(self, output: runner.Output) -> bool: + """Tests `output` and updates the current best value if necessary. + + This is equivalent to `commit` below. + + Args: + output: The train or eval output to test. + + Returns: + `True` if `output` contains a new best metric value, `False` otherwise. + """ + return self.commit(output) + + def metric_value(self, output: runner.Output) -> float: + """Computes the metric value for the given `output`.""" + if callable(self.metric): + value = self.metric(output) + else: + value = output[self.metric] + return float(utils.get_value(value)) + + @property + def best_value(self) -> float: + """Returns the best metric value seen so far.""" + return self._best_value.read() + + def test(self, output: runner.Output) -> bool: + """Tests `output` to see if it contains a new best metric value. + + If `output` does contain a new best metric value, this method does *not* + save it (i.e., calling this method multiple times in a row with the same + `output` will continue to return `True`). + + Args: + output: The train or eval output to test. + + Returns: + `True` if `output` contains a new best metric value, `False` otherwise. + """ + metric_value = self.metric_value(output) + if self.higher_is_better: + if metric_value > self.best_value: + return True + else: # Lower is better. + if metric_value < self.best_value: + return True + return False + + def commit(self, output: runner.Output) -> bool: + """Tests `output` and updates the current best value if necessary. + + Unlike `test` above, if `output` does contain a new best metric value, this + method *does* save it (i.e., subsequent calls to this method with the same + `output` will return `False`). + + Args: + output: The train or eval output to test. + + Returns: + `True` if `output` contains a new best metric value, `False` otherwise. + """ + + if self.test(output): + self._best_value.write(self.metric_value(output)) + return True + return False + + +class JSONPersistedValue: + """Represents a value that is persisted via a file-based backing store. + + The value must be JSON-serializable. Each time the value is updated, it will + be written to the backing file. It is only read from the file at + initialization. + """ + + def __init__(self, + initial_value: Any, + filename: str, + write_value: bool = True): + """Initializes the instance. + + Args: + initial_value: The initial value to use if no backing file exists or was + given. This must be a JSON-serializable value (possibly nested + combination of lists, dicts, and primitive values). + filename: The path to use for persistent storage of the value. 
This may be + `None`, in which case the value is not stable across preemptions. + write_value: If `True`, new values will be written to `filename` on calls + to `write()`. If `False`, `filename` is only read once to restore any + persisted value, and new values will not be written to it. This can be + useful in certain multi-client settings to avoid race conditions or + excessive file writes. If `filename` is `None`, this parameter has no + effect. + """ + self._value = None + self._filename = filename + self._write_value = write_value + + if self._filename is not None: + if tf.io.gfile.exists(self._filename): + if tf.io.gfile.stat(self._filename).length > 0: + with tf.io.gfile.GFile(self._filename, 'r') as f: + self._value = json.load(f) + elif self._write_value: + tf.io.gfile.makedirs(os.path.dirname(self._filename)) + + if self._value is None: + self.write(initial_value) + + def read(self): + """Returns the value.""" + return self._value + + def write(self, value): + """Writes the value, updating the backing store if one was provided.""" + self._value = value + if self._filename is not None and self._write_value: + # To achieve atomic writes, we first write to a temporary file, and then + # rename it to `self._filename`. + tmp_filename = f'{self._filename}.tmp.{uuid.uuid4().hex}' + with tf.io.gfile.GFile(tmp_filename, 'w') as f: + json.dump(self._value, f) + tf.io.gfile.rename(tmp_filename, self._filename, overwrite=True) diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric_test.py new file mode 100644 index 000000000..aff21fda2 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/actions/new_best_metric_test.py @@ -0,0 +1,94 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
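Putting the classes above together, a `NewBestMetric` is typically wired up as an eval action that gates a SavedModel export: `test` is used as the condition, and `commit` runs only after the export succeeds. A minimal sketch under assumed placeholders (a `model` whose `__call__` is a `tf.function`, a `model_dir` path, and an `'eval_loss'` key in the eval output):

import os

import orbit
import tensorflow as tf


def export_on_new_best(model, model_dir):
  # Keep only the two most recent "best" exports under <model_dir>/best-<id>.
  file_manager = orbit.actions.ExportFileManager(
      base_name=os.path.join(model_dir, 'best'), max_to_keep=2)
  # Track the lowest eval loss seen so far, persisted across preemptions.
  new_best_metric = orbit.actions.NewBestMetric(
      'eval_loss',
      higher_is_better=False,
      filename=os.path.join(model_dir, 'best_metric.json'))
  # Export first, then commit the new best value, so a failed or preempted
  # export does not advance the recorded best metric.
  return orbit.actions.ConditionalAction(
      condition=new_best_metric.test,
      action=[
          orbit.actions.ExportSavedModel(
              model, file_manager=file_manager, signatures=model.__call__),
          new_best_metric.commit,
      ])

An action built this way is passed to the `Controller` below via its `eval_actions` argument, so it runs on every evaluation output.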
+ +"""Tests for orbit.actions.new_best_metric.""" + +import os + +from orbit import actions + +import tensorflow as tf + + +class NewBestMetricTest(tf.test.TestCase): + + def test_new_best_metric_higher_is_better(self): + new_best_metric = actions.NewBestMetric( + lambda x: x['value'], higher_is_better=True) + self.assertTrue(new_best_metric.test({'value': 0.0})) + self.assertTrue(new_best_metric.commit({'value': 0.0})) + self.assertFalse(new_best_metric.test({'value': 0.0})) + self.assertTrue(new_best_metric.test({'value': 1.0})) + + def test_new_best_metric_lower_is_better(self): + new_best_metric = actions.NewBestMetric('value', higher_is_better=False) + self.assertTrue(new_best_metric.test({'value': 0.0})) + self.assertTrue(new_best_metric.commit({'value': 0.0})) + self.assertFalse(new_best_metric.test({'value': 0.0})) + self.assertTrue(new_best_metric.test({'value': -1.0})) + + def test_new_best_metric_persistence(self): + backing_file = self.create_tempfile() + new_best_metric = actions.NewBestMetric( + 'value', + higher_is_better=True, + filename=backing_file.full_path, + write_metric=False) + self.assertTrue(new_best_metric.test({'value': 0.0})) + self.assertTrue(new_best_metric.commit({'value': 0.0})) + self.assertFalse(new_best_metric.test({'value': 0.0})) + new_best_metric = actions.NewBestMetric( + 'value', higher_is_better=True, filename=backing_file.full_path) + self.assertLess(new_best_metric.best_value, 0.0) + self.assertTrue(new_best_metric.commit({'value': 5.0})) + self.assertEqual(new_best_metric.best_value, 5.0) + new_best_metric = actions.NewBestMetric( + 'value', higher_is_better=True, filename=backing_file.full_path) + self.assertEqual(new_best_metric.best_value, 5.0) + + def test_json_persisted_value(self): + tempfile = self.create_tempfile().full_path + value = {'a': 1, 'b': 2} + persisted_value = actions.JSONPersistedValue(value, tempfile) + # The inital value is used since tempfile is empty. + self.assertEqual(persisted_value.read(), value) + persisted_value = actions.JSONPersistedValue('ignored', tempfile) + # Initial value of 'ignored' is ignored, since there's a value in tempfile. + self.assertEqual(persisted_value.read(), value) + value = [1, 2, 3] + persisted_value.write(value) + # Now that a new value is written, it gets read on initialization. + persisted_value = actions.JSONPersistedValue(['also ignored'], tempfile) + self.assertEqual(persisted_value.read(), value) + # Writes can be disabled. + persisted_value = actions.JSONPersistedValue( + 'ignored', tempfile, write_value=False) + self.assertEqual(persisted_value.read(), value) + persisted_value.write("won't get persisted") + persisted_value = actions.JSONPersistedValue( + 'ignored', tempfile, write_value=False) + self.assertEqual(persisted_value.read(), value) + + def test_json_persisted_value_create_dirs(self): + tempfile = os.path.join(self.create_tempdir().full_path, 'subdir/value') + value = {'a': 1, 'b': 2} + # The directory is not created if write_value=False. 
+ actions.JSONPersistedValue(value, tempfile, write_value=False) + self.assertFalse(tf.io.gfile.exists(os.path.dirname(tempfile))) + actions.JSONPersistedValue(value, tempfile) + self.assertTrue(tf.io.gfile.exists(tempfile)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/controller.py b/nlp/text_classification/bert/tensorflow2.0/orbit/controller.py new file mode 100644 index 000000000..525331c7e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/controller.py @@ -0,0 +1,515 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides a `Controller` class for managing the outer training loop.""" + +import pprint +import time + +from typing import Callable, List, Optional, Union + +from absl import logging + +from orbit import runner +from orbit import utils + +import tensorflow as tf + + +def _log(message: str): + """Logs `message` to the `info` log, and also prints to stdout.""" + logging.info(message) + print(message) + + +logging.ABSLLogger.register_frame_to_skip(__file__, _log.__name__) + + +def _format_output(output, indent=4): + """Formats `output`, either on one line, or indented across multiple lines.""" + formatted = pprint.pformat(output) + lines = formatted.splitlines() + if len(lines) == 1: + return formatted + lines = [" " * indent + line for line in lines] + return "\n" + "\n".join(lines) + + +Action = Callable[[runner.Output], None] + + +class Controller: + """Class that controls the outer loop of model training and evaluation. + + Orbit divides training and evaluation into "inner" and "outer" loops. Inner + loops are implemented by users in the form of `AbstractTrainer` and + `AbstractEvaluator` subclasses, and define how to run a given number of + training or evaluation steps. The outer loop is provided by this `Controller`, + and interleaves calls to the user-provided inner loops with additional actions + such as saving checkpoints, running evaluations, writing summaries, as well as + (optionally) user provided `Action`s (see below). + + There are four top-level "outer loops" provided: + + - `train`, which trains until a specified number of global steps is reached; + - `evaluate`, for one-off model evaluation; + - `train_and_evaluate`, for interleaved training and evaluation; + - `evaluate_continuously`, for monitoring a given directory and running + evaluations on new model checkpoints. + + While this class attempts to provide out-of-the-box solutions for common + training and evaluation use cases, the internal details and method + implementations are also intended to be simple enough to make subclassing or + other custom outer loop implementations easy to achieve. + + Some additional customization can be achieved by supplying `train_actions` or + `eval_actions` when constructing the `Controller`. 
These are just lists of + arbitrary callables that are applied by the `Controller` to the output of + train steps (after each inner loop of `steps_per_loop` steps) or an + evaluation. This provides a hook mechanism, enabling things like reporting + metrics to Vizier, model exporting, additional logging, etc. See the + `orbit.actions` package for a small handful of predefined actions and some + utility classes that may be useful in defining your own. + """ + + def __init__( + self, + *, # Makes all args keyword only. + global_step: tf.Variable, + trainer: Optional[runner.AbstractTrainer] = None, + evaluator: Optional[runner.AbstractEvaluator] = None, + strategy: Optional[tf.distribute.Strategy] = None, + # Actions + train_actions: Optional[List[Action]] = None, + eval_actions: Optional[List[Action]] = None, + # Train related + steps_per_loop: Optional[int] = None, + checkpoint_manager: Optional[tf.train.CheckpointManager] = None, + # Summary related + summary_interval: Optional[int] = None, + summary_dir: Optional[str] = None, + # Evaluation related + eval_summary_dir: Optional[str] = None, + ): + """Initializes a `Controller` instance. + + Note that if `checkpoint_manager` is provided and there are checkpoints in + the associated model directory, the model will be restored from the most + recent checkpoint during this `__init__` method. + + Args: + global_step: An integer `tf.Variable` storing the global training step + number. Usually this can be obtained from the `iterations` property of + the model's optimizer (e.g. `trainer.optimizer.iterations`). In cases + where multiple optimizers are used, or if one model "step" corresponds + to more than one update to model parameters, users can create and + increment their own global step variable as well. In this case it is + recommended to create the `tf.Variable` inside the distribution strategy + scope, with `aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA` (see + also `orbit.utils.create_global_step()`). + trainer: An instance of `orbit.AbstractTrainer`, which implements the + inner training loop. + evaluator: An instance of `orbit.AbstractEvaluator`, which implements + evaluation. + strategy: An instance of `tf.distribute.Strategy`. If not provided, the + strategy will be initialized from the current in-scope strategy using + `tf.distribute.get_strategy()`. + train_actions: An optional list of `orbit.Action`s to call after each + block of `steps_per_loop` training steps are run. These will be called + with the output of `trainer.train`. + eval_actions: An optional list of `orbit.Action`s to call after each + evaluation. These will be called with the output of + `evaluator.evaluate`. + steps_per_loop: The number of steps to run in each inner loop of training + (passed as the `num_steps` parameter of `trainer.train`). + checkpoint_manager: An instance of `tf.train.CheckpointManager`. If + provided and there are checkpoints in the associated model directory, + the model will be restored from the most recent checkpoint inside this + `__init__` method. If not provided, the `Controller` will not + automatically save to or restore from checkpoints. + summary_interval: Step interval for training summaries. Note that this + argument only applies to `tf.summary` calls inside the `trainer.train` + function. Summaries written by the `Controller` (specifically + "steps_per_second" and output from the `trainer.train` method) will + always be enabled unless the `summary_dir` parameter is `None`. 
If set, + the value must be divisible by `steps_per_loop`. + summary_dir: The directory to write summaries to. To use the same + directory as for checkpointing, pass `checkpoint_manager.directory`. If + `None`, no training summaries will be written. + eval_summary_dir: The directory to write eval summaries to. If `None`, it + will be set to `summary_dir`. If both `summary_dir` and + `eval_summary_dir` are `None`, no eval summaries will be written. + + Raises: + ValueError: If both `trainer` and `evaluator` are `None`. + ValueError: If `steps_per_loop` is not a positive integer. + ValueError: If `summary_interval` is not a positive integer or is not + divisible by `steps_per_loop`. + """ + if trainer is None and evaluator is None: + raise ValueError("`trainer` and `evaluator` should not both be `None`.") + + if trainer is not None: + if steps_per_loop is None: + raise ValueError( + "`steps_per_loop` is required when `trainer` is provided.") + elif not isinstance(steps_per_loop, int) or steps_per_loop < 1: + raise ValueError( + f"`steps_per_loop` ({steps_per_loop}) must be a positive integer.") + + if summary_interval is not None: + if summary_interval <= 0: + raise ValueError( + f"`summary_interval` ({summary_interval}) must be larger than 0.") + elif summary_interval % steps_per_loop != 0: + raise ValueError( + f"`summary interval` ({summary_interval}) must be a multiple " + f"of `steps_per_loop` ({steps_per_loop}).") + + if not isinstance(global_step, tf.Variable): + raise ValueError("`global_step` must be a `tf.Variable`.") + + self.trainer = trainer + self.evaluator = evaluator + + self.strategy = strategy or tf.distribute.get_strategy() + + self.train_actions = train_actions or [] + self.eval_actions = eval_actions or [] + + self.global_step = global_step + self.checkpoint_manager = checkpoint_manager + + if self.trainer is not None: + self.step_timer = None + self.steps_per_loop = steps_per_loop + self.summary_interval = summary_interval + self.summary_manager = utils.SummaryManager( + summary_dir, tf.summary.scalar, global_step=self.global_step) + + if self.evaluator is not None: + eval_summary_dir = eval_summary_dir or summary_dir + if eval_summary_dir == summary_dir and self.trainer is not None: + # Reuse the summary writer if train and evaluation summary directory + # are the same. + self.eval_summary_manager = self.summary_manager + else: + self.eval_summary_manager = utils.SummaryManager( + eval_summary_dir, tf.summary.scalar, global_step=self.global_step) + + tf.summary.experimental.set_step(self.global_step) + + # Restores the model if needed. + if self.checkpoint_manager is not None: + restored_path = self.restore_checkpoint() + if restored_path: + _log(f"restored from checkpoint: {restored_path}") + + def train(self, steps: int, checkpoint_at_completion: bool = True): + """Runs training until the specified global step count has been reached. + + This method makes calls to `self.trainer.train()` until the global step + count is equal to `steps`. It will additionally save checkpoints (if a + `CheckpointManager` was passed to `Controller.__init__`) and summarize + training output (if `summary_dir` is set). + + Args: + steps: The global step count to train up to. + checkpoint_at_completion: Whether to save a checkpoint when this method + returns (regardless of the checkpointing interval). Defaults to `True`. + """ + self._require("trainer", for_method="train") + + # TODO(momernick): Support steps=None or -1 (training to exhaustion). 
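+    # The loop below advances training in chunks of at most `steps_per_loop`
+    # steps: each chunk is run via `_train_n_steps` (which calls
+    # `trainer.train()`, applies `train_actions`, and writes summaries), and a
+    # checkpoint is saved whenever the checkpoint interval has elapsed.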
+ current_step = self.global_step.numpy() # Cache, since this is expensive. + _log(f"train | step: {current_step: 6d} | training until step {steps}...") + while current_step < steps: + # Calculates steps to run for the next train loop. + num_steps = min(steps - current_step, self.steps_per_loop) + self._train_n_steps(num_steps) + self._maybe_save_checkpoint() + current_step = self.global_step.numpy() + + if checkpoint_at_completion: + self._maybe_save_checkpoint(check_interval=False) + + def evaluate(self, steps: int = -1) -> Optional[runner.Output]: + """Runs evaluation for the given number of steps. + + This method calls `self.evaluator.evaluate(steps)`, then writes the returned + summaries (if any). + + Args: + steps: The number of evaluation steps to run. The value `-1` is reserved + as a special sentinel to indicate a "complete" evaluation that runs + until the underlying dataset is exhausted. Support for this is dependent + on the specific `evaluator` being used. + + Returns: + The evaluation results as a dictionary mapping names to NumPy values. + + Raises: + ValueError: If `evaluator` was not provided to `Controller.__init__`. + ValueError: If no checkpoint is present in `checkpoint_manager.directory`. + ValueError: If `steps` is not a positive value or -1. + """ + self._require("evaluator", for_method="evaluate") + + if steps > 0: + steps_msg = f"running {steps} steps of evaluation..." + elif steps == -1: + steps_msg = "running complete evaluation..." + else: + raise ValueError(f"`steps` ({steps}) should be > 0, or == -1.") + + current_step = self.global_step.numpy() + _log(f" eval | step: {current_step: 6d} | {steps_msg}") + + start = time.time() + with self.eval_summary_manager.summary_writer().as_default(): + steps_tensor = tf.convert_to_tensor(steps, dtype=tf.int32) + eval_output = self.evaluator.evaluate(steps_tensor) + elapsed = time.time() - start + + eval_output = eval_output or {} + for action in self.eval_actions: + action(eval_output) + eval_output = tf.nest.map_structure(utils.get_value, eval_output) + + _log(f" eval | step: {current_step: 6d} | " + f"eval time: {elapsed: 6.1f} sec | " + f"output: {_format_output(eval_output)}") + + self.eval_summary_manager.write_summaries(eval_output) + self.eval_summary_manager.flush() + + return eval_output + + def train_and_evaluate(self, + train_steps: int, + eval_steps: int = -1, + eval_interval: Optional[int] = None) -> None: + """Runs interleaved training and evaluation. + + This method interleaves calls to `self.train()` and `self.evaluate()`, + training the model until the global step count equals `train_steps`, and + running an evaluation for `eval_steps` every `eval_interval` training steps. + In addition, this method will run a final evaluation at the end of the + training sequence. + + Args: + train_steps: The global step count to train up to. + eval_steps: The number of steps to run during an evaluation. If -1, this + method will evaluate over the entire evaluation dataset. + eval_interval: The number of training steps to run between evaluations. If + set, training will always stop every `eval_interval` steps, even if this + results in a shorter inner loop than specified by `steps_per_loop` + setting. If None, evaluation will only be performed after training is + complete. + + Raises: + ValueError: If eval_interval is not a multiple of self.steps_per_loop. 
+ """ + self._require("trainer", for_method="train_and_evaluate") + self._require("evaluator", for_method="train_and_evaluate") + + current_step = self.global_step.numpy() # Cache, since this is expensive. + eval_interval = eval_interval or (train_steps - current_step) + while current_step < train_steps: + interval = min(train_steps - current_step, eval_interval) + num_steps = current_step + interval + self.train(steps=num_steps, checkpoint_at_completion=False) + self.evaluate(steps=eval_steps) + current_step = self.global_step.numpy() + self._maybe_save_checkpoint(check_interval=False) + + def evaluate_continuously(self, + steps: int = -1, + timeout: Optional[Union[int, float]] = None, + timeout_fn: Optional[Callable[[], bool]] = None): + """Continuously monitors a directory and evaluates new checkpoints in it. + + This method continuously monitors a directory as specified by this + Controller's CheckpointManager init arg and runs evaluation on the + checkpoints found there. + + Args: + steps: The number of steps to run when evaluating. If -1, this method will + evaluate over the entire evaluation dataset. + timeout: The maximum number of seconds to wait between checkpoints. See + tf.train.checkpoints_iterator documentation. + timeout_fn: Optional callable to call after a timeout. If the function + returns True, then it means that no new checkpoints will be generated + and the iterator will exit. + + Raises: + ValueError: If no checkpoint found in `self.checkpoint_manager.directory`. + ValueError: If `evaluator` was not provided as a controller init arg. + """ + self._require("evaluator", for_method="evaluate_continuously") + self._require("checkpoint_manager", for_method="evaluate_continuously") + + for checkpoint_path in tf.train.checkpoints_iterator( + self.checkpoint_manager.directory, + timeout=timeout, + timeout_fn=timeout_fn): + self.restore_checkpoint(checkpoint_path) + self.evaluate(steps) + + def restore_checkpoint(self, checkpoint_path: Optional[str] = None): + """Restores the model from a checkpoint. + + Args: + checkpoint_path: An optional string specifying the checkpoint path to + restore from. If `None`, will restore from the most recent checkpoint + (or initialize the model using a custom `init_fn` if no checkpoints can + be found) using `self.checkpoint_manager.restore_or_initialize()`. + + Returns: + The path to the restored checkpoint if a restore happened, or `None` if no + restore occurred. + """ + self._require("checkpoint_manager", for_method="restore_checkpoint") + + with self.strategy.scope(): + # Checkpoint restoring should be inside scope (b/139450638). + if checkpoint_path is not None: + _log(f"restoring model from {checkpoint_path}...") + self.checkpoint_manager.checkpoint.restore(checkpoint_path) + else: + _log("restoring or initializing model...") + checkpoint_path = self.checkpoint_manager.restore_or_initialize() + + if checkpoint_path is not None: + _log(f"restored model from {checkpoint_path}.") + else: + _log("initialized model.") + + return checkpoint_path + + def save_checkpoint(self): + """Saves the model to a checkpoint. + + This method will save a checkpoint containing the current state of the + model. + + Raises: + ValueError: If no `checkpoint_manager` was provided to + `Controller.__init__`. + """ + self._require("checkpoint_manager", for_method="save_checkpoint") + self._maybe_save_checkpoint(check_interval=False) + + def _train_n_steps(self, num_steps: int): + """Runs training for `num_steps` steps. 
+ + Also prints/logs updates about training progress, and summarizes training + output (if output is returned from `self.trainer.train()`, and if + `self.summary_dir` is set). + + Args: + num_steps: An integer specifying how many steps of training to run. + + Raises: + RuntimeError: If `global_step` is not properly incremented by `num_steps` + after calling `self.trainer.train(num_steps)`. + """ + if not self.step_timer: + self.step_timer = StepTimer(self.global_step) + current_step = self.global_step.numpy() + + with self.summary_manager.summary_writer().as_default(): + should_record = False # Allows static optimization in no-summary cases. + if self.summary_interval: + # Create a predicate to determine when summaries should be written. + should_record = lambda: (self.global_step % self.summary_interval == 0) + with tf.summary.record_if(should_record): + num_steps_tensor = tf.convert_to_tensor(num_steps, dtype=tf.int32) + train_output = self.trainer.train(num_steps_tensor) + + # Verify that global_step was updated properly, then update current_step. + expected_step = current_step + num_steps + if self.global_step.numpy() != expected_step: + message = ( + f"`trainer.train({num_steps})` did not update `global_step` by " + f"{num_steps}. Old value was {current_step}, expected updated value " + f"to be {expected_step}, but it was {self.global_step.numpy()}.") + logging.warning(message) + return + + train_output = train_output or {} + for action in self.train_actions: + action(train_output) + train_output = tf.nest.map_structure(utils.get_value, train_output) + + current_step = expected_step + steps_per_second = self.step_timer.steps_per_second() + _log(f"train | step: {current_step: 6d} | " + f"steps/sec: {steps_per_second: 6.1f} | " + f"output: {_format_output(train_output)}") + + train_output["steps_per_second"] = steps_per_second + self.summary_manager.write_summaries(train_output) + self.summary_manager.flush() + + def _maybe_save_checkpoint(self, check_interval: bool = True): + """Conditionally saves a checkpoint. + + A checkpoint is saved if a `CheckpointManager` is available, and if the + required number of steps has elapsed since the last checkpoint was saved + (although this condition can be disabled by setting `check_interval=False`). + + Args: + check_interval: Whether to check if the checkpoint interval has fully + elapsed. If `False`, a checkpoint is saved regardless of the elapsed + steps since the most recent checkpoint, unless no `checkpoint_manager` + was provided to `Controller.__init__`. + + Returns: + A boolean indicating whether a checkpoint was saved. + """ + if self.checkpoint_manager and self.checkpoint_manager.checkpoint_interval: + ckpt_path = self.checkpoint_manager.save( + checkpoint_number=self.global_step.numpy(), + check_interval=check_interval) + if ckpt_path is not None: + _log(f"saved checkpoint to {ckpt_path}.") + return True + return False + + def _require(self, attribute, for_method): + """Utility method to raise an error if the given `attribute` is not set.""" + if getattr(self, attribute, None) is None: + raise ValueError( + f"`{attribute}` is not set. 
Pass `{attribute}` to " + f"`Controller.__init__` before calling `{for_method}()`.") + + +class StepTimer: + """Utility class for measuring steps/second.""" + + def __init__(self, step): + self.step = step + self.start() + + def start(self): + self.last_iteration = self.step.numpy() + self.last_time = time.time() + + def steps_per_second(self, restart=True): + value = ((self.step.numpy() - self.last_iteration) / + (time.time() - self.last_time)) + if restart: + self.start() + return value diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/controller_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/controller_test.py new file mode 100644 index 000000000..fd1d1b8b8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/controller_test.py @@ -0,0 +1,775 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for orbit.controller.""" + +import os + +from absl import logging +from absl.testing import parameterized + +import numpy as np + +from orbit import controller +from orbit import runner +from orbit import standard_runner + +import tensorflow as tf + + +def create_model(): + x = tf.keras.layers.Input(shape=(3,), name="input") + y = tf.keras.layers.Dense(4, name="dense")(x) + model = tf.keras.Model(x, y) + return model + + +def summaries_with_matching_keyword(keyword, summary_dir): + """Returns summary protos matching given keyword from event file.""" + matches = [] + event_paths = tf.io.gfile.glob(os.path.join(summary_dir, "events*")) + for event in tf.compat.v1.train.summary_iterator(event_paths[-1]): + if event.summary is not None: + for value in event.summary.value: + if keyword in value.tag: + matches.append(event.summary) + return matches + + +def dataset_fn(ctx): + del ctx + inputs = np.zeros((10, 3), dtype=np.float32) + targets = np.ones((10, 4), dtype=np.float32) + dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.repeat(100) + dataset = dataset.batch(10, drop_remainder=True) + return dataset + + +class TestRunner(standard_runner.StandardTrainer, + standard_runner.StandardEvaluator): + """Implements the training and evaluation APIs for the test model.""" + + def __init__(self, return_numpy=False): + self.strategy = tf.distribute.get_strategy() + self.model = create_model() + self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) + self.global_step = self.optimizer.iterations + self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32) + self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32) + self.return_numpy = return_numpy + train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + standard_runner.StandardTrainer.__init__(self, train_dataset) + standard_runner.StandardEvaluator.__init__(self, eval_dataset) + + def train_step(self, iterator): + + def _replicated_step(inputs): + """Replicated training step.""" + inputs, 
targets = inputs + with tf.GradientTape() as tape: + outputs = self.model(inputs) + loss = tf.reduce_mean(tf.keras.losses.MSE(targets, outputs)) + grads = tape.gradient(loss, self.model.variables) + self.optimizer.apply_gradients(zip(grads, self.model.variables)) + self.train_loss.update_state(loss) + + self.strategy.run(_replicated_step, args=(next(iterator),)) + + def train_loop_end(self): + train_loss = self.train_loss.result() + return { + "loss": train_loss.numpy() if self.return_numpy else train_loss, + } + + def build_eval_dataset(self): + return self.strategy.distribute_datasets_from_function(dataset_fn) + + def eval_begin(self): + self.eval_loss.reset_states() + + def eval_step(self, iterator): + + def _replicated_step(inputs): + """Replicated evaluation step.""" + inputs, targets = inputs + outputs = self.model(inputs) + loss = tf.reduce_mean(tf.keras.losses.MSE(targets, outputs)) + self.eval_loss.update_state(loss) + + self.strategy.run(_replicated_step, args=(next(iterator),)) + + def eval_end(self): + eval_loss = self.eval_loss.result() + return { + "eval_loss": eval_loss.numpy() if self.return_numpy else eval_loss, + } + + +class TestEvaluator(standard_runner.StandardEvaluator): + """Implements the training and evaluation APIs for the test model.""" + + def __init__(self): + self.strategy = tf.distribute.get_strategy() + self.model = create_model() + eval_dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + standard_runner.StandardEvaluator.__init__(self, eval_dataset) + + def eval_reduce(self, state, output): + state.append(output) + return state + + def eval_begin(self): + return [] + + def eval_step(self, iterator): + + def _replicated_step(inputs): + """Replicated evaluation step.""" + inputs, targets = inputs + outputs = self.model(inputs) + loss = tf.reduce_mean(tf.keras.losses.MSE(targets, outputs)) + return loss + + per_replica_losses = self.strategy.run( + _replicated_step, args=(next(iterator),)) + mean_loss = self.strategy.reduce( + tf.distribute.ReduceOp.MEAN, per_replica_losses, axis=None) + return mean_loss + + def eval_end(self, outputs): + return { + "eval_loss": tf.reduce_mean(outputs), + } + + +class TestEvaluatorNoOutput(runner.AbstractEvaluator): + + def evaluate(self, num_steps): + pass + + +class TestEvaluatorWithNestedSummary(standard_runner.StandardEvaluator): + """Implements the training and evaluation APIs for the test model.""" + + def __init__(self): + self.strategy = tf.distribute.get_strategy() + self.model = create_model() + dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + dataset2 = self.strategy.distribute_datasets_from_function(dataset_fn) + self.loss = tf.keras.metrics.Mean("loss", dtype=tf.float32) + self.accuracy = tf.keras.metrics.CategoricalAccuracy( + "accuracy", dtype=tf.float32) + self.loss2 = tf.keras.metrics.Mean("loss", dtype=tf.float32) + self.accuracy2 = tf.keras.metrics.CategoricalAccuracy( + "accuracy", dtype=tf.float32) + standard_runner.StandardEvaluator.__init__( + self, eval_dataset={ + "dataset": dataset, + "dataset2": dataset2 + }) + + def eval_step(self, iterator): + + def _replicated_step(loss, accuracy, inputs): + """Replicated evaluation step.""" + inputs, targets = inputs + outputs = self.model(inputs) + loss.update_state(tf.keras.losses.MSE(targets, outputs)) + accuracy.update_state(targets, outputs) + + self.strategy.run( + lambda inputs: _replicated_step(self.loss, self.accuracy, inputs), + args=(next(iterator["dataset"]),)) + self.strategy.run( + lambda inputs: 
_replicated_step(self.loss2, self.accuracy2, inputs), + args=(next(iterator["dataset2"]),)) + + def eval_end(self): + return { + "dataset": { + "loss": self.loss.result(), + "accuracy": self.accuracy.result() + }, + "dataset2": { + "loss": self.loss2.result(), + "accuracy": self.accuracy2.result() + }, + } + + +class TestTrainerWithSummaries(standard_runner.StandardTrainer): + """A Trainer model with summaries for testing purposes.""" + + def __init__(self): + self.strategy = tf.distribute.get_strategy() + self.model = create_model() + self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.1) + self.global_step = self.optimizer.iterations + self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32) + train_dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + standard_runner.StandardTrainer.__init__( + self, + train_dataset, + options=standard_runner.StandardTrainerOptions( + use_tpu_summary_optimization=True)) + + def build_train_dataset(self): + return self.strategy.distribute_datasets_from_function(dataset_fn) + + def train_step(self, iterator): + + def _replicated_step(inputs): + """Replicated training step.""" + inputs, targets = inputs + with tf.GradientTape() as tape: + outputs = self.model(inputs) + loss = tf.reduce_mean(tf.keras.losses.MSE(targets, outputs)) + tf.summary.scalar("loss", loss) + grads = tape.gradient(loss, self.model.variables) + self.optimizer.apply_gradients(zip(grads, self.model.variables)) + self.train_loss.update_state(loss) + + self.strategy.run(_replicated_step, args=(next(iterator),)) + + +class ControllerTest(tf.test.TestCase, parameterized.TestCase): + + def setUp(self): + super().setUp() + self.model_dir = self.get_temp_dir() + + def test_no_checkpoint(self): + test_runner = TestRunner() + # No checkpoint manager and no strategy. + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + summary_dir=os.path.join(self.model_dir, "summaries/train"), + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + self.assertEqual(test_runner.global_step, 10) + # Loss and accuracy values should be written into summaries. + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "eval_loss", os.path.join(self.model_dir, "summaries/eval"))) + # No checkpoint, so global step starts from 0. + test_runner.global_step.assign(0) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + self.assertEqual(test_runner.global_step, 10) + + def test_no_checkpoint_and_summaries(self): + test_runner = TestRunner() + # No checkpoint + summary directories. + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + self.assertEqual(test_runner.global_step, 10) + + def test_has_checkpoint_no_summaries(self): + test_runner = TestRunner() + # Has checkpoint, but no summary directories. 
+ checkpoint = tf.train.Checkpoint(model=test_runner.model) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step) + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + checkpoint_manager=checkpoint_manager, + steps_per_loop=2) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + self.assertEqual(test_runner.global_step, 10) + + # No summaries are saved. + self.assertEmpty(tf.io.gfile.glob( + os.path.join(checkpoint_manager.directory, "events.*"))) + + def test_has_checkpoint_eval_summary_only(self): + test_runner = TestRunner() + # Has checkpoint, but no summary directories. + checkpoint = tf.train.Checkpoint(model=test_runner.model) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step) + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"), + steps_per_loop=2) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + self.assertEqual(test_runner.global_step, 10) + + # Training summaries are not saved. + self.assertEmpty(tf.io.gfile.glob( + os.path.join(checkpoint_manager.directory, "events.*"))) + # Evaluation summaries are saved. + self.assertNotEmpty(tf.io.gfile.glob( + os.path.join(self.model_dir, "summaries/eval/events.*"))) + + def test_restore_from_most_recent_checkpoint(self): + test_runner = TestRunner() + checkpoint = tf.train.Checkpoint(model=test_runner.model) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=5) + test_controller = controller.Controller( + trainer=test_runner, + global_step=test_runner.global_step, + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"), + steps_per_loop=5) + test_controller.train(20) + self.assertLen(checkpoint_manager.checkpoints, 4) + restored_path = test_controller.restore_checkpoint() + self.assertEqual(restored_path, checkpoint_manager.checkpoints[-1]) + + @parameterized.named_parameters(("return_numpy", True), + ("return_tensor", False)) + def test_train_and_evaluate(self, return_numpy): + test_runner = TestRunner(return_numpy=return_numpy) + + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=10) + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + summary_dir=os.path.join(self.model_dir, "summaries/train"), + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + + # Checkpoints are saved. + self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*"))) + + # Loss and accuracy values should be written into summaries. 
+ self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "eval_loss", os.path.join(self.model_dir, "summaries/eval"))) + + def test_train_only(self): + test_runner = TestRunner() + + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=10) + test_controller = controller.Controller( + trainer=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + summary_dir=os.path.join(self.model_dir, "summaries/train"), + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"), + ) + test_controller.train(steps=10) + + # Checkpoints are saved. + self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*"))) + + # Only train summaries are written. + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "summaries/train"))) + self.assertFalse( + tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval"))) + + def test_evaluate_only(self): + test_runner = TestRunner() + + checkpoint = tf.train.Checkpoint(model=test_runner.model) + checkpoint.save(os.path.join(self.model_dir, "ckpt")) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step) + test_controller = controller.Controller( + evaluator=test_runner, + global_step=test_runner.global_step, + checkpoint_manager=checkpoint_manager, + summary_dir=os.path.join(self.model_dir, "summaries/train"), + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + eval_results = test_controller.evaluate(steps=2) + + # Only eval summaries are written + self.assertFalse( + tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "eval_loss", os.path.join(self.model_dir, "summaries/eval"))) + self.assertIn("eval_loss", eval_results) + + # Tests continuous eval with timeout and timeout_fn. 
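+    # `timeout_fn` is called after the eval loop has waited `timeout` seconds
+    # without seeing a new checkpoint; returning True ends the underlying
+    # `tf.train.checkpoints_iterator`, so writing the sentinel file lets the
+    # test both stop `evaluate_continuously` and verify the timeout path ran.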
+ done_file = os.path.join(self.model_dir, "summaries/eval/Done") + + def timeout_fn(): + with tf.io.gfile.GFile(done_file, "w") as f: + f.write("DONE") + return True + + test_controller = controller.Controller( + evaluator=test_runner, + global_step=test_runner.global_step, + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + test_controller.evaluate_continuously( + timeout=1, timeout_fn=timeout_fn, steps=2) + self.assertNotEmpty(tf.io.gfile.glob(done_file)) + + def test_no_eval_steps(self): + test_runner = TestRunner() + + checkpoint = tf.train.Checkpoint(model=test_runner.model) + checkpoint.save(os.path.join(self.model_dir, "ckpt")) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step) + test_controller = controller.Controller( + evaluator=test_runner, + global_step=test_runner.global_step, + checkpoint_manager=checkpoint_manager) + test_controller.evaluate() + + def test_already_trained_model(self): + test_runner = TestRunner() + test_runner.global_step.assign(10) + + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=10) + test_controller = controller.Controller( + trainer=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + checkpoint_manager=checkpoint_manager) + # `global_step` is already `train_steps`. + test_controller.train(steps=10) + + def test_summaries_inside_train_fn(self): + test_runner = TestTrainerWithSummaries() + + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step) + test_controller = controller.Controller( + trainer=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + summary_dir=os.path.join(self.model_dir, "summaries/train"), + summary_interval=2, + checkpoint_manager=checkpoint_manager, + ) + test_controller.train(steps=10) + + # Checkpoints are saved. + self.assertEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*"))) + + # Only train summaries are written. + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "summaries/train"))) + self.assertFalse( + tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval"))) + + def test_train_and_evaluate_with_same_summary_dir(self): + test_runner = TestRunner() + + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step) + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + summary_dir=os.path.join(self.model_dir, "summaries"), + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries")) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + + # Loss and accuracy values should be written into summaries. 
+ self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "summaries"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "eval_loss", os.path.join(self.model_dir, "summaries"))) + + def test_early_stop_on_eval_loss(self): + test_runner = TestRunner() + + class EarlyStopController(controller.Controller): + """A subclass of Controller that supports early stopping.""" + + def train_and_evaluate(self, + train_steps: int = None, + eval_steps: int = None, + eval_interval: int = None): + while self.global_step.numpy() < train_steps: + interval = min(train_steps - self.global_step.numpy(), eval_interval) + num_steps = self.global_step.numpy() + interval + self.train(steps=num_steps, checkpoint_at_completion=False) + self.evaluate(steps=eval_steps) + # Early stop condition. + if test_runner.eval_loss.result() < 0.1: + logging.info( + "Training early stopped as eval_loss %s is less than 0.1", + test_runner.eval_loss.result()) + return + + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=10) + test_controller = EarlyStopController( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2, + checkpoint_manager=checkpoint_manager) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=6, eval_interval=2) + + self.assertLess(test_runner.global_step, 10) + + def test_evaluate_with_loss_output(self): + test_evaluator = TestEvaluator() + + checkpoint = tf.train.Checkpoint(model=test_evaluator.model) + checkpoint.save(os.path.join(self.model_dir, "ckpt")) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, self.model_dir, max_to_keep=None) + test_controller = controller.Controller( + evaluator=test_evaluator, + global_step=tf.Variable(0, dtype=tf.int64), + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + test_controller.evaluate(steps=5) + + # Only eval summaries are written + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "eval_loss", os.path.join(self.model_dir, "summaries/eval"))) + + def test_evaluate_with_no_output(self): + test_controller = controller.Controller( + evaluator=TestEvaluatorNoOutput(), + global_step=tf.Variable(0, dtype=tf.int64), + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + self.assertEqual(test_controller.evaluate(steps=5), {}) + + def test_train_and_evaluate_reset_datasets(self): + test_runner = TestRunner() + + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=2) + + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + + train_dataset = ( + test_runner.strategy.distribute_datasets_from_function(dataset_fn)) + eval_dataset = ( + test_runner.strategy.distribute_datasets_from_function(dataset_fn)) + test_runner.train_dataset = train_dataset + test_runner.eval_dataset = eval_dataset + + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + + def test_eval_and_checkpoint_interval(self): + test_runner = TestRunner() + + checkpoint 
= tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=5) + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + global_step=test_runner.global_step, + steps_per_loop=10, + checkpoint_manager=checkpoint_manager, + summary_dir=self.model_dir) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=5) + + # Expect 3 checkpoints to be saved at step: 5, 10. + self.assertLen( + tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt-*.data*")), 2) + # Expect evaluation is performed 2 times at step: 5, 10. + self.assertLen( + summaries_with_matching_keyword("eval_loss", self.model_dir), 2) + + def test_evaluate_with_nested_summaries(self): + test_evaluator = TestEvaluatorWithNestedSummary() + test_controller = controller.Controller( + evaluator=test_evaluator, + global_step=tf.Variable(0, dtype=tf.int64), + eval_summary_dir=self.model_dir) + test_controller.evaluate(steps=5) + + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "dataset"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "dataset"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "accuracy", os.path.join(self.model_dir, "dataset"))) + + self.assertNotEmpty( + tf.io.gfile.listdir(os.path.join(self.model_dir, "dataset2"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "loss", os.path.join(self.model_dir, "dataset2"))) + self.assertNotEmpty( + summaries_with_matching_keyword( + "accuracy", os.path.join(self.model_dir, "dataset2"))) + + def test_actions(self): + test_runner = TestRunner() + checkpoint = tf.train.Checkpoint( + model=test_runner.model, optimizer=test_runner.optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + self.model_dir, + max_to_keep=None, + step_counter=test_runner.global_step, + checkpoint_interval=10) + + class OutputRecorderAction: + """Simple `Action` that just saves the outputs passed to `__call__`.""" + + def __init__(self): + self.outputs = [] + + def __call__(self, output): + self.outputs.append(output) + + train_output_recorder = OutputRecorderAction() + eval_output_recorder = OutputRecorderAction() + + test_controller = controller.Controller( + trainer=test_runner, + evaluator=test_runner, + train_actions=[train_output_recorder], + eval_actions=[eval_output_recorder], + global_step=test_runner.global_step, + steps_per_loop=2, + summary_dir=os.path.join(self.model_dir, "summaries/train"), + checkpoint_manager=checkpoint_manager, + eval_summary_dir=os.path.join(self.model_dir, "summaries/eval")) + test_controller.train_and_evaluate( + train_steps=10, eval_steps=2, eval_interval=6) + + self.assertLen(train_output_recorder.outputs, 5) + for output in train_output_recorder.outputs: + self.assertIn("loss", output) + self.assertGreaterEqual(output["loss"], 0) + + self.assertLen(eval_output_recorder.outputs, 2) + for output in eval_output_recorder.outputs: + self.assertIn("eval_loss", output) + self.assertGreaterEqual(output["eval_loss"], 0) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/examples/__init__.py b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/__init__.py new file mode 100644 index 000000000..a4d9cc3a1 --- /dev/null +++ 
b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/__init__.py b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/__init__.py new file mode 100644 index 000000000..a4d9cc3a1 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator.py b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator.py new file mode 100644 index 000000000..0dcbae063 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator.py @@ -0,0 +1,86 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""An evaluator object that can evaluate models with a single output.""" +import orbit +import tensorflow as tf + + +class SingleTaskEvaluator(orbit.StandardEvaluator): + """Evaluates a single-output model on a given dataset. + + This evaluator will handle running a model with one output on a single + dataset, and will apply the output of that model to one or more + `tf.keras.metrics.Metric` objects. + """ + + def __init__(self, + eval_dataset, + label_key, + model, + metrics, + evaluator_options=None): + """Initializes a `SingleTaskEvaluator` instance. + + If the `SingleTaskEvaluator` should run its model under a distribution + strategy, it should be created within that strategy's scope. + + Arguments: + eval_dataset: A `tf.data.Dataset` or `DistributedDataset` that contains a + string-keyed dict of `Tensor`s. 
+ label_key: The key corresponding to the label value in feature + dictionaries dequeued from `eval_dataset`. This key will be removed from + the dictionary before it is passed to the model. + model: A `tf.Module` or Keras `Model` object to evaluate. + metrics: A single `tf.keras.metrics.Metric` object, or a list of + `tf.keras.metrics.Metric` objects. + evaluator_options: An optional `orbit.StandardEvaluatorOptions` object. + """ + + self.label_key = label_key + self.model = model + self.metrics = metrics if isinstance(metrics, list) else [metrics] + + # Capture the strategy from the containing scope. + self.strategy = tf.distribute.get_strategy() + + super(SingleTaskEvaluator, self).__init__( + eval_dataset=eval_dataset, options=evaluator_options) + + def eval_begin(self): + """Actions to take once before every eval loop.""" + for metric in self.metrics: + metric.reset_states() + + def eval_step(self, iterator): + """One eval step. Called multiple times per eval loop by the superclass.""" + + def step_fn(inputs): + # Extract the target value and delete it from the input dict, so that + # the model never sees it. + target = inputs.pop(self.label_key) + output = self.model(inputs) + for metric in self.metrics: + metric.update_state(target, output) + + # This is needed to handle distributed computation. + self.strategy.run(step_fn, args=(next(iterator),)) + + def eval_end(self): + """Actions to take once after an eval loop.""" + with self.strategy.scope(): + # Export the metrics. + metrics = {metric.name: metric.result() for metric in self.metrics} + + return metrics diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator_test.py new file mode 100644 index 000000000..c074da0fb --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_evaluator_test.py @@ -0,0 +1,65 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
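+
+# The test below wires `SingleTaskEvaluator` together with a trainer and an
+# `orbit.Controller`. As a quick orientation, a minimal sketch of that wiring
+# looks like the following (the `eval_ds`, `model`, and `step` names are
+# illustrative, not part of this module):
+#
+#   evaluator = single_task_evaluator.SingleTaskEvaluator(
+#       eval_ds,
+#       label_key='label',
+#       model=model,
+#       metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
+#   controller = orbit.Controller(evaluator=evaluator, global_step=step)
+#   controller.evaluate(steps=5)
+#   results = {m.name: m.result() for m in evaluator.metrics}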
+ +"""Tests for the single_task_evaluator.""" +import orbit +from orbit.examples.single_task import single_task_evaluator +from orbit.examples.single_task import single_task_trainer + +import tensorflow as tf +import tensorflow_datasets as tfds + + +class SingleTaskEvaluatorTest(tf.test.TestCase): + + def test_single_task_evaluation(self): + + iris = tfds.load('iris') + train_ds = iris['train'].batch(32) + + model = tf.keras.Sequential([ + tf.keras.Input(shape=(4,), name='features'), + tf.keras.layers.Dense(10, activation=tf.nn.relu), + tf.keras.layers.Dense(10, activation=tf.nn.relu), + tf.keras.layers.Dense(3) + ]) + + trainer = single_task_trainer.SingleTaskTrainer( + train_ds, + label_key='label', + model=model, + loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01)) + + evaluator = single_task_evaluator.SingleTaskEvaluator( + train_ds, + label_key='label', + model=model, + metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]) + + controller = orbit.Controller( + trainer=trainer, + evaluator=evaluator, + steps_per_loop=100, + global_step=trainer.optimizer.iterations) + + controller.train(train_ds.cardinality().numpy()) + controller.evaluate() + accuracy = evaluator.metrics[0].result().numpy() + + self.assertGreater(0.925, accuracy) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer.py b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer.py new file mode 100644 index 000000000..f9b29185a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer.py @@ -0,0 +1,140 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A trainer object that can train models with a single output.""" + +import orbit +import tensorflow as tf + + +class SingleTaskTrainer(orbit.StandardTrainer): + """Trains a single-output model on a given dataset. + + This trainer will handle running a model with one output on a single + dataset. It will apply the provided loss function to the model's output + to calculate gradients and will apply them via the provided optimizer. It will + also supply the output of that model to one or more `tf.keras.metrics.Metric` + objects. + """ + + def __init__(self, + train_dataset, + label_key, + model, + loss_fn, + optimizer, + metrics=None, + trainer_options=None): + """Initializes a `SingleTaskTrainer` instance. + + If the `SingleTaskTrainer` should run its model under a distribution + strategy, it should be created within that strategy's scope. + + This trainer will also calculate metrics during training. The loss metric + is calculated by default, but other metrics can be passed to the `metrics` + arg. + + Arguments: + train_dataset: A `tf.data.Dataset` or `DistributedDataset` that contains a + string-keyed dict of `Tensor`s. 
+      label_key: The key corresponding to the label value in feature
+        dictionaries dequeued from `train_dataset`. This key will be removed
+        from the dictionary before it is passed to the model.
+      model: A `tf.Module` or Keras `Model` object to train. It must accept a
+        `training` kwarg.
+      loss_fn: A per-element loss function of the form (target, output). The
+        output of this loss function will be reduced via `tf.reduce_mean` to
+        create the final loss. We recommend using the functions in the
+        `tf.keras.losses` package or `tf.keras.losses.Loss` objects with
+        `reduction=tf.keras.losses.Reduction.NONE`.
+      optimizer: A `tf.keras.optimizers.Optimizer` instance.
+      metrics: A single `tf.keras.metrics.Metric` object, or a list of
+        `tf.keras.metrics.Metric` objects.
+      trainer_options: An optional `orbit.utils.StandardTrainerOptions` object.
+    """
+    self.label_key = label_key
+    self.model = model
+    self.loss_fn = loss_fn
+    self.optimizer = optimizer
+
+    # Capture the strategy from the containing scope.
+    self.strategy = tf.distribute.get_strategy()
+
+    # We always want to report training loss.
+    self.train_loss = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
+
+    # We need self.metrics to be an iterable later, so we handle that here.
+    if metrics is None:
+      self.metrics = []
+    elif isinstance(metrics, list):
+      self.metrics = metrics
+    else:
+      self.metrics = [metrics]
+
+    super(SingleTaskTrainer, self).__init__(
+        train_dataset=train_dataset, options=trainer_options)
+
+  def train_loop_begin(self):
+    """Actions to take once, at the beginning of each train loop."""
+    self.train_loss.reset_states()
+    for metric in self.metrics:
+      metric.reset_states()
+
+  def train_step(self, iterator):
+    """A train step. Called multiple times per train loop by the superclass."""
+
+    def train_fn(inputs):
+      with tf.GradientTape() as tape:
+        # Extract the target value and delete it from the input dict, so that
+        # the model never sees it.
+        target = inputs.pop(self.label_key)
+
+        # Get the outputs of the model.
+        output = self.model(inputs, training=True)
+
+        # Get the average per-batch loss and scale it down by the number of
+        # replicas. This ensures that we don't end up multiplying our loss by
+        # the number of workers - gradients are summed, not averaged, across
+        # replicas during the apply_gradients call.
+        # Note: the loss reduction is handled explicitly here and scaled by
+        # num_replicas_in_sync, so we recommend using a plain (unreduced) loss
+        # function. If you use a `tf.keras.losses.Loss` object, you may need
+        # to set its `reduction` argument explicitly.
+        loss = tf.reduce_mean(self.loss_fn(target, output))
+        scaled_loss = loss / self.strategy.num_replicas_in_sync
+
+      # Get the gradients by applying the loss to the model's trainable
+      # variables.
+      gradients = tape.gradient(scaled_loss, self.model.trainable_variables)
+
+      # Apply the gradients via the optimizer.
+      self.optimizer.apply_gradients(
+          list(zip(gradients, self.model.trainable_variables)))
+
+      # Update metrics.
+      self.train_loss.update_state(loss)
+      for metric in self.metrics:
+        metric.update_state(target, output)
+
+    # This is needed to handle distributed computation.
+    self.strategy.run(train_fn, args=(next(iterator),))
+
+  def train_loop_end(self):
+    """Actions to take once after a training loop."""
+    with self.strategy.scope():
+      # Export the metrics.
+ metrics = {metric.name: metric.result() for metric in self.metrics} + metrics[self.train_loss.name] = self.train_loss.result() + + return metrics diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer_test.py new file mode 100644 index 000000000..cba34f7b0 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/examples/single_task/single_task_trainer_test.py @@ -0,0 +1,60 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for the single_task_trainer.""" +import orbit +from orbit.examples.single_task import single_task_trainer + +import tensorflow as tf +import tensorflow_datasets as tfds + + +class SingleTaskTrainerTest(tf.test.TestCase): + + def test_single_task_training(self): + iris = tfds.load('iris') + train_ds = iris['train'].batch(32).repeat() + + model = tf.keras.Sequential([ + tf.keras.Input(shape=(4,), name='features'), + tf.keras.layers.Dense(10, activation=tf.nn.relu), + tf.keras.layers.Dense(10, activation=tf.nn.relu), + tf.keras.layers.Dense(3), + tf.keras.layers.Softmax(), + ]) + + trainer = single_task_trainer.SingleTaskTrainer( + train_ds, + label_key='label', + model=model, + loss_fn=tf.keras.losses.sparse_categorical_crossentropy, + optimizer=tf.keras.optimizers.SGD(learning_rate=0.01)) + + controller = orbit.Controller( + trainer=trainer, + steps_per_loop=100, + global_step=trainer.optimizer.iterations) + + controller.train(1) + start_loss = trainer.train_loss.result().numpy() + controller.train(500) + end_loss = trainer.train_loss.result().numpy() + + # Assert that the model has trained 'significantly' - that the loss + # has dropped by over 50%. + self.assertLess(end_loss, start_loss / 2) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/runner.py b/nlp/text_classification/bert/tensorflow2.0/orbit/runner.py new file mode 100644 index 000000000..b0377c521 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/runner.py @@ -0,0 +1,83 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
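+
+# As a quick orientation for the abstract API defined below: a minimal, purely
+# illustrative `AbstractTrainer` might look like the sketch in this comment.
+# Real trainers typically wrap a `tf.distribute.Strategy` and a
+# `tf.function`-compiled step; none of the names below are part of this module.
+#
+#   class CountingTrainer(AbstractTrainer):
+#
+#     def __init__(self):
+#       super().__init__()
+#       self.step = tf.Variable(0, dtype=tf.int64, trainable=False)
+#
+#     def train(self, num_steps):
+#       for _ in tf.range(num_steps):
+#         self.step.assign_add(1)
+#       return {"steps_completed": self.step.numpy()}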
+ +"""Provides AbstractTrainer/Evaluator base classes, defining train/eval APIs.""" + +import abc + +from typing import Dict, Optional, Union + +import numpy as np +import tensorflow as tf + + +Output = Dict[str, Union[tf.Tensor, float, np.number, np.ndarray, 'Output']] # pytype: disable=not-supported-yet + + +class AbstractTrainer(tf.Module, metaclass=abc.ABCMeta): + """An abstract class defining the API required for training.""" + + @abc.abstractmethod + def train(self, num_steps: tf.Tensor) -> Optional[Output]: + """Implements `num_steps` steps of training. + + This method will be called by the `Controller` to perform the "inner loop" + of training. This inner loop amortizes the cost of bookkeeping associated + with checkpointing, evaluation, and writing summaries. Additionally, the + inner loop can be implemented (if desired) using TensorFlow's looping + constructs (e.g. a `for` loop over a `tf.range` inside a `tf.function`), + which can be necessary for getting optimal performance when running on TPU. + For cases that don't require peak performance, a simple Python loop can be + used instead for simplicity. + + Args: + num_steps: The number of training steps to run. Note that it is up to the + model what constitutes a "step", which may involve more than one update + to model parameters (e.g., if training a GAN). + + Returns: + Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values. + If a dictionary is returned, it will be written to logs and as TensorBoard + summaries. The dictionary may also be nested, which will generate a + hierarchy of summary directories. + """ + pass + + +class AbstractEvaluator(tf.Module, metaclass=abc.ABCMeta): + """An abstract class defining the API required for evaluation.""" + + @abc.abstractmethod + def evaluate(self, num_steps: tf.Tensor) -> Optional[Output]: + """Implements `num_steps` steps of evaluation. + + This method will by called the `Controller` to perform an evaluation. The + `num_steps` parameter specifies the number of steps of evaluation to run, + which is specified by the user when calling one of the `Controller`'s + evaluation methods. A special sentinel value of `-1` is reserved to indicate + evaluation should run until the underlying data source is exhausted. + + Args: + num_steps: The number of evaluation steps to run. Note that it is up to + the model what constitutes a "step". Evaluations may also want to + support "complete" evaluations when `num_steps == -1`, running until a + given data source is exhausted. + + Returns: + Either `None`, or a dictionary mapping names to `Tensor`s or NumPy values. + If a dictionary is returned, it will be written to logs and as TensorBoard + summaries. The dictionary may also be nested, which will generate a + hierarchy of summary directories. + """ + pass diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner.py b/nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner.py new file mode 100644 index 000000000..d6ea757af --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner.py @@ -0,0 +1,447 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""AbstractTrainer/Evaluator subclasses with added functionality. + +The classes in this module provide some additional structure to the bare +`AbstractTrainer`/`AbstractEvaluator` APIs. + +Both `StandardTrainer` and `StandardEvaluator` split the train/eval loops into +"begin", "step", and "end" methods, and provide an implementation of the loop +itself that makes calls to the relevant step method. + +`StandardTrainer` supports running the loop using the TF while loop construct +for added performance (particularly on TPUs). It additionally provides some +functionality to make writing summaries from inside a model more performant when +running on TPUs. + +These classes are intended to work well in common settings, however there may +be use cases these classes don't support (for instance, `StandardEvaluator` in +particular doesn't support running full evaluations over multiple different eval +datasets). Users are encouraged to simply fall back to custom `AbstractTrainer` +and `AbstractEvaluator` subclasses in these cases. +""" + +import abc + +from typing import Any, Optional + +import dataclasses + +from orbit import runner +from orbit.utils import loop_fns + +import tensorflow as tf + + +@dataclasses.dataclass(frozen=True) +class StandardTrainerOptions: + """Advanced options for `orbit.StandardTrainer`. + + Attributes: + use_tf_function: A boolean indicating whether to apply `tf.function` to the + training loop. This will only affect the body of the loop (involving + `train_step`); `train_loop_begin` and `train_loop_end` will always be run + in eager mode. + use_tf_while_loop: A boolean indicating whether to run the training loop + using a `tf.while_loop`. If `True`, `use_tf_function` must also be `True`. + use_tpu_summary_optimization: A boolean indicating whether to enable a + performance optimization for summaries in TPUs. Writing summaries + conditionally with outside compilation on TPUs can be extremely slow. If + `True`, this optimization creates two `tf.function`s with two XLA programs + (one with summary calls, and one without). The program with summaries runs + only for one step when summaries should be recorded. + """ + use_tf_function: bool = True + use_tf_while_loop: bool = True + use_tpu_summary_optimization: bool = False + + +class StandardTrainer(runner.AbstractTrainer, metaclass=abc.ABCMeta): + """Implements standard functionality on top of the AbstractTrainer API. + + This class structures the training "inner loop" roughly as follows: + + train_loop_begin() + for _ in range(num_steps): + train_step(train_iterator) + return train_loop_end() + + Calls to `train_loop_begin` and `train_loop_end` are always done in eager + mode, while the loop/`train_step` may be implemented using `tf.while` and/or + `tf.function`, as determined by the `options` passed to `__init__`. + """ + + def __init__(self, + train_dataset, + options: Optional[StandardTrainerOptions] = None): + """Initializes the `StandardTrainer` instance. + + Args: + train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or + `DistributedDataset`. 
+ options: An `orbit.StandardTrainerOptions` instance. + """ + options = options or StandardTrainerOptions() + if options.use_tf_while_loop and not options.use_tf_function: + raise ValueError("`use_tf_while_loop=True` and `use_tf_function=False` " + "is not supported") + if options.use_tpu_summary_optimization and not options.use_tf_while_loop: + raise ValueError("`use_tpu_summary_optimization=True` and " + "`use_tf_while_loop=False` is not supported") + + self._train_options = options + self._train_dataset = train_dataset + self._train_iter = None + self._train_loop_fn = None + + def create_train_loop_fn(self): + """Creates a training loop from the current step function and options. + + Returns: + The train loop function, i.e. wrapper of multiple train steps. + """ + train_step_fn = self.train_step + if self._train_options.use_tf_while_loop: + loop_fn = loop_fns.create_tf_while_loop_fn(train_step_fn) + if self._train_options.use_tpu_summary_optimization: + loop_fn = loop_fns.LoopFnWithSummaries(loop_fn) + else: + loop_fn = tf.function(loop_fn) + else: + if self._train_options.use_tf_function: + train_step_fn = tf.function(train_step_fn) + loop_fn = loop_fns.create_loop_fn(train_step_fn) + return loop_fn + + def train(self, num_steps: tf.Tensor) -> Optional[runner.Output]: + """Implements `num_steps` steps of training. + + Args: + num_steps: The number of training steps to run. This corresponds directly + to the number of calls made to `train_step`. + + Returns: + The output of `train_loop_end`. + """ + self.train_loop_begin() + + if self._train_loop_fn is None: + self._train_loop_fn = self.create_train_loop_fn() + + if self._train_iter is None: + self._train_iter = tf.nest.map_structure(iter, self.train_dataset) + + self._train_loop_fn(self._train_iter, num_steps) + return self.train_loop_end() + + def train_loop_begin(self): + """Called once at the beginning of the training loop. + + This method is always called in eager mode, and is a good place to reset + metrics that accumulate values over multiple steps of training. + + Note that this method is called before dataset iterator creation. + """ + pass + + @abc.abstractmethod + def train_step(self, iterator): + """Implements one step of training. + + What a "step" consists of is up to the implementer. When using distribution + strategies, the call to this method takes place in the "cross-replica + context" for generality, to allow e.g. multiple iterator dequeues and calls + to `strategy.run`. + + Note that if `use_tf_function=True`, all the code inside `train_step` should + be compatible with `tf.function` tracing (and in particular, any state + modifications involving `self` should be avoided). In some cases, non- + `tf.function` compatible code can be moved to `train_loop_begin` or + `train_loop_end`, which always execute eagerly. + + Args: + iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or + `DistributedIterator`. The structure of this input matches the structure + of `train_dataset` as passed to `__init__`. + """ + pass + + def train_loop_end(self) -> Optional[runner.Output]: + """Called once at the end of the training loop. + + This method is always called in eager mode, and is a good place to get + metric results. The value returned from this function will be returned as-is + from the `train` method implementation provided by `StandardTrainer`. + + Returns: + The function may return a dictionary of `Tensors`, which will be + written to logs and as TensorBoard summaries. 
It can also be a + nested dictionary, yielding a hierarchy of summary directories. + """ + pass + + @property + def train_dataset(self): + """The current training dataset.""" + return self._train_dataset + + @train_dataset.setter + def train_dataset(self, train_dataset): + """Sets a new training dataset, replacing the current one. + + Any unprocessed examples in the current dataset are discarded. + + Args: + train_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or + `DistributedDataset`. + """ + self._train_dataset = train_dataset + self._train_iter = None + + +@dataclasses.dataclass(frozen=True) +class StandardEvaluatorOptions: + """Advanced options for the `orbit.StandardEvaluator`. + + Attributes: + use_tf_function: A boolean indicating whether to apply `tf.function` to the + evaluation loop. This will only affect the body of the loop (involving + `eval_step`); `eval_loop_begin` and `eval_loop_end` will always be run + in eager mode. + use_tf_while_loop: A boolean indicating whether to run the evaluation loop + using a `tf.while_loop`. If `True`, `use_tf_function` must also be `True`. + recreate_iterator_for_each_eval: A boolean indicating whether to recreate a + new iterator for the evaluation dataset before each round of evaluation, + which implies each round of evaluation starts from the beginning of + the evaluation dataset. For example, the evaluation dataset is + `[1, 2, 3, 4]`, batch size is 1 and evaluation steps is 2. If `True`, the + data to be evaluated is [1, 2] every time. If `False`, the iterator + state is maintained between calls to `StandardEvaluator.evaluate()`. + """ + use_tf_function: bool = True + use_tf_while_loop: bool = False + recreate_iterator_for_each_eval: bool = True + + +class StandardEvaluator(runner.AbstractEvaluator, metaclass=abc.ABCMeta): + """Implements the standard functionality of AbstractEvaluator APIs. + + This class structures evaluation roughly as follows: + + state = eval_begin() + for _ in range(num_steps): + step_outputs = eval_step(eval_iterator) + state = eval_reduce(state, step_outputs) + return eval_end(state) + + Calls to `eval_begin` and `eval_end` are always done in eager + mode, while `eval_step` may be compiled with `tf.function` as determined by + the `options` passed to `__init__`. `eval_reduce` is in eager mode if + `use_tf_while_loop=False` in `StandardEvaluatorOptions`, but in graph mode if + `use_tf_while_loop=True`. + + This class does not support completely evaluating multiple different datasets + (i.e., where every example of each dataset should be processed, as opposed to + running for a fixed number of evaluation steps). A custom `AbstractEvaluator` + is recommended in this case. + """ + + def __init__(self, + eval_dataset, + options: Optional[StandardEvaluatorOptions] = None): + """Initializes the `StandardEvaluator` instance. + + Args: + eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or + `DistributedDataset`. + options: An `orbit.StandardEvaluatorOptions` instance. + """ + options = options or StandardEvaluatorOptions() + if options.use_tf_while_loop and not options.use_tf_function: + raise ValueError("`use_tf_while_loop=True` and `use_tf_function=False` " + "is not supported") + + self._eval_options = options + self._eval_dataset = eval_dataset + self._eval_iter = None + self._eval_loop_fn = None + + def create_eval_loop_fn(self, has_state: bool): + """Creates an eval loop from the current step function and options. 
+ + Args: + has_state: If the step function has state, state will be kept in the loop. + + Returns: + The eval loop function, i.e. wrapper of multiple eval steps. + """ + eval_step_fn = self.eval_step + if self._eval_options.use_tf_while_loop: + # TODO(b/176126742): tf.while_loop doesn't support `None` as a loop input + # even when it is not used inside the loop. To workaround this limitation, + # we have to build two tf.functions for it. + if has_state: + loop_fn = loop_fns.create_tf_while_loop_fn_with_state(eval_step_fn) + else: + loop_fn = loop_fns.create_tf_while_loop_fn(eval_step_fn) + loop_fn = tf.function(loop_fn) + else: + if self._eval_options.use_tf_function: + eval_step_fn = tf.function(eval_step_fn) + loop_fn = loop_fns.create_loop_fn(eval_step_fn) + return loop_fn + + def evaluate(self, num_steps: tf.Tensor) -> Optional[runner.Output]: + """Implements `num_steps` steps of evaluation. + + Args: + num_steps: The number of evaluation steps to run. When this is -1, + evaluation proceeds until a call to `eval_step` raises a `StopIteration` + or `tf.errors.OutOfRangeError`. + + Returns: + The output of `self.eval_end()`. + + Raises: + ValueError: If `options.use_tf_while_loop` is `True` and `num_steps` is + unspecified. + """ + if self._eval_options.use_tf_while_loop and num_steps == -1: + raise ValueError("Looping until exhausted is not supported if " + "`options.use_tf_while_loop` is `True`") + + outputs = self.eval_begin() # pylint: disable=assignment-from-no-return + + has_state = outputs is not None + if self._eval_loop_fn is None: + self._eval_loop_fn = self.create_eval_loop_fn(has_state) + + # If `recreate_iterator_for_each_eval` is `True`, `self._eval_iter` is + # always None. + if self._eval_iter is None: + eval_iter = tf.nest.map_structure(iter, self.eval_dataset) + if not self._eval_options.recreate_iterator_for_each_eval: + self._eval_iter = eval_iter + else: + eval_iter = self._eval_iter + + if self._eval_options.use_tf_while_loop and not has_state: + self._eval_loop_fn(eval_iter, num_steps) + else: + outputs = self._eval_loop_fn( + eval_iter, num_steps, state=outputs, reduce_fn=self.eval_reduce) + + if outputs is None: + return self.eval_end() + else: + return self.eval_end(outputs) + + def eval_begin(self) -> Any: + """Called once at the beginning of the evaluation. + + This method is always called in eager mode, and is a good place to reset + metrics that accumulate values over the course of evaluation. + + Note that this method is called before dataset iterator creation. + + Returns: + An value to pass as the `state` argument to `eval_reduce`. + """ + pass + + @abc.abstractmethod + def eval_step(self, iterator) -> Any: + """Implements one step of evaluation. + + What a "step" consists of is up to the implementer. When using distribution + strategies, the call to this method takes place in the "cross-replica + context" for generality, to allow e.g. multiple iterator dequeues and calls + to `strategy.run`. + + Note that if `use_tf_function=True`, all the code inside `eval_step` should + be compatible with `tf.function` tracing (and in particular, any state + modifications involving `self` should be avoided). In some cases, non- + `tf.function` compatible code can be moved to `eval_loop_begin`, + `eval_reduce`, or `eval_loop_end`, which always execute eagerly. + + Args: + iterator: A `tf.nest`-compatible structure of `tf.data.Iterator` or + `DistributedIterator`. + + Returns: + An output which is passed as `step_outputs` argument into `eval_reduce` + function. 
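+
+    A typical implementation (purely illustrative; `replica_fn`, the `label`
+    key, and the metric attribute are not part of this class) dequeues one
+    batch and runs it under the current strategy:
+
+        def eval_step(self, iterator):
+          def replica_fn(inputs):
+            labels = inputs.pop("label")
+            self.accuracy.update_state(labels, self.model(inputs))
+          self.strategy.run(replica_fn, args=(next(iterator),))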
+ """ + pass + + def eval_end(self, *args) -> Optional[runner.Output]: + """Called at the end of the evaluation. + + Called once at the end of evaluation. + + This method is always called in eager mode, and is a good place to get + metric results. The value returned from this function will be returned as-is + from the `evaluate` method implementation provided by `StandardEvaluator`. + + Args: + *args: The outputs from `eval_reduce` for the last eval step, if they are + non-`None` (if they are `None`, nothing is passed). + + Returns: + The function may return a dictionary of `Tensors`, which will be + written to logs and as TensorBoard summaries. It can also be a + nested dictionary, yielding a hierarchy of summary directories. + """ + pass + + def eval_reduce(self, + state: Optional[Any] = None, + step_outputs: Optional[runner.Output] = None) -> Any: + """A function to perform per-step reduction on the evaluation outputs. + + This is useful for passing state throughout evaluation, especially in cases + where maintaining or accumulating state is hard to accomplish using + `tf.metrics.Metric` or other `tf.Variable`-based approaches. For instance, + it can be used to easily accumulate all per-example losses from the full + evaluation for subsequent processing in `eval_end()`. + + Args: + state: A state being mainted throughout the evaluation. + step_outputs: Outputs from the current evaluation step. + + Returns: + An output which is passed as the `state` argument to this function for the + next step. After evaluation is finished, the output from last step will be + passed to `eval_end`. + """ + pass + + @property + def eval_dataset(self): + """The current evaluation dataset.""" + return self._eval_dataset + + @eval_dataset.setter + def eval_dataset(self, eval_dataset): + """Sets a new eval dataset, replacing the current one. + + Any unprocessed examples in the current dataset are discarded. + + Args: + eval_dataset: A `tf.nest`-compatible structure of `tf.data.Dataset` or + `DistributedDataset`. + """ + self._eval_dataset = eval_dataset + self._eval_iter = None diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner_test.py new file mode 100644 index 000000000..ef1335e79 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/standard_runner_test.py @@ -0,0 +1,152 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for orbit.standard_runner.""" + +from absl.testing import parameterized + +from orbit import standard_runner +from orbit import utils + +import tensorflow as tf + + +def dataset_fn(input_context=None): + del input_context + + def dummy_data(_): + return tf.zeros((1, 1), dtype=tf.float32) + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + +class TestTrainer(standard_runner.StandardTrainer): + """A StandardTrainer subclass for tests.""" + + def __init__(self, options=None): + self.strategy = tf.distribute.get_strategy() + self.global_step = utils.create_global_step() + dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + super().__init__(train_dataset=dataset, options=options) + + def train_loop_begin(self): + self.global_step.assign(0) + + def train_step(self, iterator): + + def replica_step(_): + self.global_step.assign_add(1) + + self.strategy.run(replica_step, args=(next(iterator),)) + + def train_loop_end(self): + return self.global_step.numpy() + + +class TestEvaluator(standard_runner.StandardEvaluator): + """A StandardEvaluator subclass for tests.""" + + def __init__(self, options=None): + self.strategy = tf.distribute.get_strategy() + self.global_step = utils.create_global_step() + dataset = self.strategy.distribute_datasets_from_function(dataset_fn) + super().__init__(eval_dataset=dataset, options=options) + + def eval_begin(self): + self.global_step.assign(0) + + def eval_step(self, iterator): + + def replica_step(_): + self.global_step.assign_add(1) + + self.strategy.run(replica_step, args=(next(iterator),)) + + def eval_end(self): + return self.global_step.numpy() + + +class TestEvaluatorWithOutputsAggregation(standard_runner.StandardEvaluator): + """A StandardEvaluator subclass for tests.""" + + def __init__(self, options=None): + self.strategy = tf.distribute.get_strategy() + dataset = self.strategy.distribute_datasets_from_function( + lambda _: tf.data.Dataset.range(10)) + super().__init__(eval_dataset=dataset, options=options) + + def eval_begin(self): + return tf.constant((0.0,)) + + def eval_reduce(self, state, step_outputs): + state = tf.concat([state, step_outputs], 0) + return state + + def eval_step(self, iterator): + + def replica_step(x): + x = tf.cast(x, tf.float32) + return tf.reduce_sum(x) + + return self.strategy.experimental_local_results( + self.strategy.run(replica_step, args=(next(iterator),))) + + def eval_end(self, outputs): + return tf.reduce_sum(outputs) + + +class StandardRunnerTest(parameterized.TestCase): + + def test_default_trainer(self): + trainer = TestTrainer() + self.assertEqual(trainer.train(tf.constant(10)), 10) + + def test_trainer_with_tpu_summary_optimization(self): + options = standard_runner.StandardTrainerOptions( + use_tpu_summary_optimization=True) + trainer = TestTrainer(options) + self.assertEqual(trainer.train(tf.constant(10)), 10) + + @parameterized.named_parameters(("use_tf_while_loop", True), ("", False)) + def test_default_evaluator(self, use_tf_while_loop): + options = standard_runner.StandardEvaluatorOptions( + use_tf_while_loop=use_tf_while_loop) + evaluator = TestEvaluator(options) + self.assertEqual(evaluator.evaluate(tf.constant(10)), 10) + + @parameterized.named_parameters(("use_tf_while_loop", True), ("", False)) + def test_evaluator_with_outputs_aggregation(self, use_tf_while_loop): + options = standard_runner.StandardEvaluatorOptions( + 
use_tf_while_loop=use_tf_while_loop) + evaluator = TestEvaluatorWithOutputsAggregation(options) + self.assertEqual(evaluator.evaluate(tf.constant(10)), 45) + + @parameterized.named_parameters( + ("recreate_iterator_for_each_eval", True, 10, 10), + ("not_recreate_iterator_for_each_eval", False, 10, 35)) + def test_evaluator_with_repeat_dataset(self, recreate_iterator_for_each_eval, + sum_for_1st_time, sum_for_2nd_time): + options = standard_runner.StandardEvaluatorOptions( + recreate_iterator_for_each_eval=recreate_iterator_for_each_eval) + evaluator = TestEvaluatorWithOutputsAggregation(options) + self.assertEqual(evaluator.evaluate(tf.constant(5)), sum_for_1st_time) + self.assertEqual(evaluator.evaluate(tf.constant(5)), sum_for_2nd_time) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/__init__.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/__init__.py new file mode 100644 index 000000000..3eeb67c4a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines exported symbols for the `orbit.utils` package.""" + +from orbit.utils.common import create_global_step +from orbit.utils.common import get_value +from orbit.utils.common import make_distributed_dataset + +from orbit.utils.epoch_helper import EpochHelper + +from orbit.utils.loop_fns import create_loop_fn +from orbit.utils.loop_fns import create_tf_while_loop_fn +from orbit.utils.loop_fns import LoopFnWithSummaries + +from orbit.utils.summary_manager import SummaryManager + +from orbit.utils.tpu_summaries import OptionalSummariesFunction diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/common.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/common.py new file mode 100644 index 000000000..63ee020af --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/common.py @@ -0,0 +1,100 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Some layered modules/functions to help users writing custom training loop.""" + +import inspect + +import tensorflow as tf + + +def create_global_step() -> tf.Variable: + """Creates a `tf.Variable` suitable for use as a global step counter. 
+ + Creating and managing a global step variable may be necessary for + `AbstractTrainer` subclasses that perform multiple parameter updates per + `Controller` "step", or use different optimizers on different steps. + + In these cases, an `optimizer.iterations` property generally can't be used + directly, since it would correspond to parameter updates instead of iterations + in the `Controller`'s training loop. Such use cases should simply call + `step.assign_add(1)` at the end of each step. + + Returns: + A non-trainable scalar `tf.Variable` of dtype `tf.int64`, with only the + first replica's value retained when synchronizing across replicas in + a distributed setting. + """ + return tf.Variable( + 0, + dtype=tf.int64, + name="global_step", + trainable=False, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA) + + +def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs): + """A utility function to help create a `tf.distribute.DistributedDataset`. + + Args: + strategy: An instance of `tf.distribute.Strategy`. + dataset_or_fn: A instance of `tf.data.Dataset`, or a "dataset function" + returning a `tf.data.Dataset`. If it is a function, it may optionally have + an argument named `input_context` which will be passed a + `tf.distribute.InputContext` instance. + *args: Any positional arguments to pass through to `dataset_or_fn`. + **kwargs: Any keyword arguments to pass through to `dataset_or_fn`. + + Returns: + A distributed Dataset. + """ + if strategy is None: + strategy = tf.distribute.get_strategy() + + if isinstance(dataset_or_fn, tf.data.Dataset): + return strategy.experimental_distribute_dataset(dataset_or_fn) + + if not callable(dataset_or_fn): + raise ValueError("`dataset_or_fn` should be either callable or an instance " + "of `tf.data.Dataset`.") + + def dataset_fn(input_context): + """Wraps `dataset_or_fn` for strategy.distribute_datasets_from_function.""" + + # If `dataset_or_fn` is a function and has an argument named + # `input_context`, pass through the given `input_context`. Otherwise + # `input_context` will be ignored. + argspec = inspect.getfullargspec(dataset_or_fn) + arg_names = argspec.args + + if "input_context" in arg_names: + kwargs["input_context"] = input_context + return dataset_or_fn(*args, **kwargs) + + return strategy.distribute_datasets_from_function(dataset_fn) + + +def get_value(x): + """Returns input values, converting any TensorFlow values to NumPy values. + + Args: + x: The input. May be a `tf.Tensor` or `tf.Variable`. + + Returns: + If the input is a TensorFlow `Tensor`, returns the `Tensor`'s equivalent + NumPy value. Otherwise, just returns the input. + """ + if not tf.is_tensor(x): + return x + return x.numpy() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/common_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/common_test.py new file mode 100644 index 000000000..1a68e7c66 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/common_test.py @@ -0,0 +1,34 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for orbit.utils.common.""" + +from orbit.utils import common + +import tensorflow as tf + + +class UtilsTest(tf.test.TestCase): + + def test_create_global_step(self): + step = common.create_global_step() + self.assertEqual(step.name, "global_step:0") + self.assertEqual(step.dtype, tf.int64) + self.assertEqual(step, 0) + step.assign_add(1) + self.assertEqual(step, 1) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/epoch_helper.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/epoch_helper.py new file mode 100644 index 000000000..10c11324a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/epoch_helper.py @@ -0,0 +1,65 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides a utility class for training in epochs.""" + +import tensorflow as tf + + +class EpochHelper: + """A helper class handle bookkeeping of epochs in custom training loops.""" + + def __init__(self, epoch_steps: int, global_step: tf.Variable): + """Initializes the `EpochHelper` instance. + + Args: + epoch_steps: An integer indicating how many steps are in an epoch. + global_step: A `tf.Variable` providing the current global step. + """ + self._epoch_steps = epoch_steps + self._global_step = global_step + self._current_epoch = None + self._epoch_start_step = None + self._in_epoch = False + + def epoch_begin(self): + """Returns whether a new epoch should begin.""" + if self._in_epoch: + return False + current_step = self._global_step.numpy() + self._epoch_start_step = current_step + self._current_epoch = current_step // self._epoch_steps + self._in_epoch = True + return True + + def epoch_end(self): + """Returns whether the current epoch should end.""" + if not self._in_epoch: + raise ValueError("`epoch_end` can only be called inside an epoch.") + current_step = self._global_step.numpy() + epoch = current_step // self._epoch_steps + + if epoch > self._current_epoch: + self._in_epoch = False + return True + return False + + @property + def batch_index(self): + """Index of the next batch within the current epoch.""" + return self._global_step.numpy() - self._epoch_start_step + + @property + def current_epoch(self): + return self._current_epoch diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/loop_fns.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/loop_fns.py new file mode 100644 index 000000000..6e3262469 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/loop_fns.py @@ -0,0 +1,192 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utilities for creating loop functions."""
+
+from orbit.utils import tpu_summaries
+
+import tensorflow as tf
+
+
+def create_loop_fn(step_fn):
+  """Creates a loop function driven by a Python `while` loop.
+
+  Args:
+    step_fn: A function taking a nested structure of `tf.data.Iterator` or
+      `DistributedIterator`. There are no constraints on the return value of the
+      function (except that it must be compatible with any `reduce_fn` provided
+      to the returned `loop_fn`).
+
+  Returns:
+    A loop function taking required `iterator` and `num_steps` parameters, as
+    well as optional `state` and `reduce_fn` parameters for accumulating state
+    over multiple iterations of the loop. See the `loop_fn` definition below for
+    additional details.
+  """
+
+  def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
+    """Makes `num_steps` calls to `step_fn(iterator)`.
+
+    Additionally, state may be accumulated across iterations of the loop.
+    Conceptually, state accumulation is handled roughly as follows:
+
+        for _ in range(num_steps):
+          step_outputs = step_fn(iterator)
+          state = reduce_fn(state, step_outputs)
+        return state
+
+    However, the implementation is slightly more complicated in order to support
+    looping until the iterator is exhausted (when `num_steps == -1`) and to
+    properly catch exceptions when running under async remote eager (as is the
+    case in TPU training setups involving separate coordinator/worker machines).
+
+    Args:
+      iterator: A nested structure of `tf.data.Iterator` or
+        `DistributedIterator`.
+      num_steps: The number of steps in the loop. If `num_steps == -1`, will
+        iterate until exhausting the iterator.
+      state: An optional initial state before running the loop.
+      reduce_fn: A callable taking two inputs, `state` and `value`, where
+        `state` is the previous output from `reduce_fn`, and `value` is the
+        output from `step_fn`.
+
+    Returns:
+      The final state returned by `reduce_fn`, or `None` if `state` and
+      `reduce_fn` are not provided.
+    """
+    try:
+      step = 0
+      # To make sure the OutOfRangeError exception can be handled well under
+      # async remote eager, we need to wrap the loop body in `async_scope`.
+      with tf.experimental.async_scope():
+        while num_steps == -1 or step < num_steps:
+          outputs = step_fn(iterator)
+          if reduce_fn is not None:
+            state = reduce_fn(state, outputs)
+          step += 1
+        return state
+    except (StopIteration, tf.errors.OutOfRangeError):
+      tf.experimental.async_clear_error()
+      return state
+
+  return loop_fn
+
+
+def create_tf_while_loop_fn(step_fn):
+  """Creates a loop function compatible with TF's AutoGraph loop conversion.
+
+  Args:
+    step_fn: A function taking a nested structure of `tf.data.Iterator` or
+      `DistributedIterator`. Currently, any return values are ignored.
+
+  Returns:
+    A loop function taking required `iterator` and `num_steps` parameters. If
+    called inside a `tf.function`, the loop will be converted by AutoGraph into
+    a `tf.while_loop` construct. See the `loop_fn` definition below for
+    additional details.
+  """
+
+  def loop_fn(iterator, num_steps):
+    """Makes `num_steps` calls to `step_fn(iterator)`.
+ + Args: + iterator: A nested structure of `tf.data.Iterator` or + `DistributedIterator`. + num_steps: The number of steps in the loop. Should be passed as a + `tf.Tensor`. Iterating until iterator exhaustion is not supported. + """ + if not isinstance(num_steps, tf.Tensor): + raise ValueError( + "`num_steps` should be a `tf.Tensor`. Passing a Python value can " + "cause unnecessary retracing when wrapped by `tf.function`.") + + for _ in tf.range(num_steps): + # Clear out the outer name scope so the ops created inside `tf.while_loop` + # don't get "while/" as name prefix. + with tf.name_scope(""): + step_fn(iterator) + + return loop_fn + + +def create_tf_while_loop_fn_with_state(step_fn): + """Creates a TF while loop function with state. + + This function is similar to `create_tf_while_loop_fn`, but allowing a `state` + to be accumulated over multiple iterations of the loop. Note that the + structure of the `state` cannot be changed across iterations. + + Args: + step_fn: A function taking a nested structure of `tf.data.Iterator` or + `DistributedIterator`. Currently, any return values are ignored. + + Returns: + A loop function taking required `iterator`, `num_steps`, `state` and + `reduce_fn` parameters. If called inside a `tf.function`, the loop will be + converted by AutoGraph into a `tf.while_loop` construct. See the `loop_fn` + definition below for additional details. + """ + + def loop_fn_with_state(iterator, num_steps, state, reduce_fn): + """Makes `num_steps` calls to `step_fn(iterator)`. + + Args: + iterator: A nested structure of `tf.data.Iterator` or + `DistributedIterator`. + num_steps: The number of steps in the loop. Should be passed as a + `tf.Tensor`. Iterating until iterator exhaustion is not supported. + state: An initial state before running the loop. + reduce_fn: A callable taking two inputs, `state` and `value`, where + `state` is the previous output from `reduce_fn`, and `value` is the + output from `step_fn`. + + Returns: + The final state returned by `reduce_fn`. + """ + if not isinstance(num_steps, tf.Tensor): + raise ValueError( + "`num_steps` should be a `tf.Tensor`. Passing a Python value can " + "cause unnecessary retracing when wrapped by `tf.function`.") + + for _ in tf.range(num_steps): + # Clear out the outer name scope so the ops created inside `tf.while_loop` + # don't get "while/" as name prefix. + with tf.name_scope(""): + # Relax the shapes within the loop, so the shape of `state` can change + # across iterations. This is useful to aggregate outputs from each step + # and concat to `state`. + tf.autograph.experimental.set_loop_options( + shape_invariants=[(t, tf.TensorShape([None] * t.shape.rank)) + for t in tf.nest.flatten(state) + if tf.is_tensor(t)]) + outputs = step_fn(iterator) + state = reduce_fn(state, outputs) + return state + + return loop_fn_with_state + + +class LoopFnWithSummaries(tpu_summaries.OptionalSummariesFunction): + """Implements a two-program approach for optimizing summaries on TPU. + + This version works with the result of `create_tf_while_loop_fn`. 
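+
+  A usage sketch, mirroring how `orbit.StandardTrainer` applies it when the
+  TPU summary optimization is enabled (the `train_step` and `train_iterator`
+  names are illustrative):
+
+      loop_fn = create_tf_while_loop_fn(train_step)
+      loop_fn = LoopFnWithSummaries(loop_fn)
+      loop_fn(train_iterator, tf.constant(100, dtype=tf.int32))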
+ """ + + def __call__(self, iterator, num_steps): + if tf.summary.should_record_summaries(): + output = self.with_summaries(iterator, tf.constant(1)) + num_steps -= 1 + if num_steps >= 1: + output = self.without_summaries(iterator, num_steps) + return output diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/summary_manager.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/summary_manager.py new file mode 100644 index 000000000..63a44940f --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/summary_manager.py @@ -0,0 +1,110 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provides a utility class for managing summary writing.""" + +import os + +import tensorflow as tf + + +class SummaryManager: + """A utility class for managing summary writing.""" + + def __init__(self, summary_dir, summary_fn, global_step=None): + """Initializes the `SummaryManager` instance. + + Args: + summary_dir: The directory in which to write summaries. If `None`, all + summary writing operations provided by this class are no-ops. + summary_fn: A callable defined accepting `name`, `value`, and `step` + parameters, making calls to `tf.summary` functions to write summaries. + global_step: A `tf.Variable` containing the global step value. + """ + self._enabled = summary_dir is not None + self._summary_dir = summary_dir + self._summary_fn = summary_fn + self._summary_writers = {} + + if global_step is None: + self._global_step = tf.summary.experimental.get_step() + else: + self._global_step = global_step + + def summary_writer(self, relative_path=""): + """Returns the underlying summary writer for a specific subdirectory. + + Args: + relative_path: The current path in which to write summaries, relative to + the summary directory. By default it is empty, which corresponds to the + root directory. + """ + if self._summary_writers and relative_path in self._summary_writers: + return self._summary_writers[relative_path] + if self._enabled: + self._summary_writers[relative_path] = tf.summary.create_file_writer( + os.path.join(self._summary_dir, relative_path)) + else: + self._summary_writers[relative_path] = tf.summary.create_noop_writer() + return self._summary_writers[relative_path] + + def flush(self): + """Flushes the underlying summary writers.""" + if self._enabled: + tf.nest.map_structure(tf.summary.flush, self._summary_writers) + + def write_summaries(self, summary_dict): + """Writes summaries for the given dictionary of values. + + This recursively creates subdirectories for any nested dictionaries + provided in `summary_dict`, yielding a hierarchy of directories which will + then be reflected in the TensorBoard UI as different colored curves. 
+ + For example, users may evaluate on multiple datasets and return + `summary_dict` as a nested dictionary: + + { + "dataset1": { + "loss": loss1, + "accuracy": accuracy1 + }, + "dataset2": { + "loss": loss2, + "accuracy": accuracy2 + }, + } + + This will create two subdirectories, "dataset1" and "dataset2", inside the + summary root directory. Each directory will contain event files including + both "loss" and "accuracy" summaries. + + Args: + summary_dict: A dictionary of values. If any value in `summary_dict` is + itself a dictionary, then the function will create a subdirectory with + name given by the corresponding key. This is performed recursively. Leaf + values are then summarized using the summary writer instance specific to + the parent relative path. + """ + if not self._enabled: + return + self._write_summaries(summary_dict) + + def _write_summaries(self, summary_dict, relative_path=""): + for name, value in summary_dict.items(): + if isinstance(value, dict): + self._write_summaries( + value, relative_path=os.path.join(relative_path, name)) + else: + with self.summary_writer(relative_path).as_default(): + self._summary_fn(name, value, step=self._global_step) diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries.py new file mode 100644 index 000000000..3501c7aa8 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries.py @@ -0,0 +1,145 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Contains utilities for TPU summary optimization.""" + +import contextlib +import functools + +import tensorflow as tf + + +@contextlib.contextmanager +def _soft_device_placement(): + """Context manager for soft device placement, allowing summaries on CPU.""" + original_setting = tf.config.get_soft_device_placement() + try: + tf.config.set_soft_device_placement(True) + yield + finally: + tf.config.set_soft_device_placement(original_setting) + + +class OptionalSummariesFunction: + """Wrapper that provides versions of a function with and without summaries. + + This is a utility class for implementing optimized summary recording via a + two-function approach, specifically important for TPUs. Two `tf.function` + versions of a given `function` are created: one with soft device placement + enabled (for use on steps that require summary writing), and one with summary + writing and soft device placement entirely disabled (for use on all other + steps). This removes any performance impact of summaries on steps where they + aren't recorded (b/148418718). + + This class can be used as a base class to implement summary optimizations for + a function with a specific signature. 
For example, to implement efficient TPU + summaries for a standard `train()` method (as in `orbit.AbstractTrainer`): + + class TrainFunctionWithSummaries(orbit.utils.OptionalSummariesFunction): + '''Implements a two-program approach for summaries on TPU.''' + + def __call__(self, num_steps): + if tf.summary.should_record_summaries(): + output = self.with_summaries(tf.constant(1)) + num_steps -= 1 + if num_steps >= 1: + output = self.without_summaries(num_steps) + return output + + This can be used directly or to implement a decorator: + + def train_function_with_summaries(function=None, **kwargs): + if function is not None: + return TrainFunctionWithSummaries(function, **kwargs) + return functools.partial(TrainFunctionWithSummaries, **kwargs) + + The decorator can be applied directly to `train()` methods: + + @train_function_with_summaries + def train(self, num_steps): + ... + + A similar approach approach can be implemented for functions with different + signatures. + + Note: The above approach assumes that the frequency of summary writing is + based on a step interval that is divisible by the number of steps executed + in each call to the `train()` function. This is enforced by the + `orbit.Controller`. + + This wrapper properly handles instance methods (see `__get__`). + + Attributes: + with_summaries: A wrapped version of the underlying function with summaries + enabled (using whatever the active predicate is for + `tf.summary.record_if`), and placed inside a "soft device placement" + context to enable summary recording on TPU. + without_summaries: A wrapped version of the underlying function with all + summary recording disabled. + """ + + def __init__(self, function, **tf_function_kwargs): + """Constructs an instance wrapping the given `function`. + + The given `function` is wrapped twice: Once in a "soft device placement" + context (allowing summaries to also run on TPU), and once with summary + recording entirely disabled. + + Both of these versions are compiled via `tf.function` (optionally using any + supplied `tf.function` settings), and made available as attributes. + + Args: + function: The underlying function to wrap. + **tf_function_kwargs: Additional arguments to pass to `tf.function`. + """ + + @tf.function(**tf_function_kwargs) + @functools.wraps(function) + def with_summaries(*args, **kwargs): + with _soft_device_placement(): + return function(*args, **kwargs) + + @tf.function(**tf_function_kwargs) + @functools.wraps(function) + def without_summaries(*args, **kwargs): + with tf.summary.record_if(False): + return function(*args, **kwargs) + + self.with_summaries = with_summaries + self.without_summaries = without_summaries + + def __get__(self, instance, owner): + """Allows this class to be used to wrap methods as well as free functions. + + For `tf.function` to work properly in all cases (e.g., when an + input_signature is specified), any `tf.function`-converted methods must be + properly bound to an instance if they are called as an instance method. + + This is done by implementing this `__get__` method of the descriptor + protocol, and forwarding to the `__get__` method on the underlying + `tf.function`s. + + Args: + instance: The instance to bind to. + owner: The class type of the instance. + + Returns: + A new bound instance of `TpuDiscretionarySummariesFunctions`. + """ + new = object.__new__(self.__class__) + # pytype: disable=attribute-error # See b/162476201. 
+ new.with_summaries = self.with_summaries.__get__(instance, owner) + new.without_summaries = self.without_summaries.__get__(instance, owner) + # pytype: enable=attribute-error + return new diff --git a/nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries_test.py b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries_test.py new file mode 100644 index 000000000..4aa0d0820 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/orbit/utils/tpu_summaries_test.py @@ -0,0 +1,120 @@ +# Copyright 2021 The Orbit Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for orbit.utils.tpu_summaries.""" + +import functools +import os + +from orbit.utils import common +from orbit.utils import tpu_summaries + +import tensorflow as tf + + +class TrainFunctionWithSummaries(tpu_summaries.OptionalSummariesFunction): + """Implements a two-program approach for summaries on TPU.""" + + def __call__(self, num_steps): + if tf.summary.should_record_summaries(): + output = self.with_summaries(tf.constant(1)) + num_steps -= 1 + if num_steps >= 1: + output = self.without_summaries(num_steps) + return output + + +def train_function_with_summaries(function=None, **kwargs): + if function is not None: + return TrainFunctionWithSummaries(function, **kwargs) + return functools.partial(TrainFunctionWithSummaries, **kwargs) + + +class DummyTrainer(tf.Module): + + def __init__(self): + self.step_counter = common.create_global_step() + + @train_function_with_summaries + def train_with_tpu_summary_optimization(self, num_steps): + for _ in tf.range(num_steps): + tf.summary.scalar("step", self.step_counter, step=self.step_counter) + self.step_counter.assign_add(1) + return self.step_counter + + @train_function_with_summaries( + input_signature=[tf.TensorSpec((), dtype=tf.int32)]) + def train_with_tpu_summary_optimization_and_input_signature(self, num_steps): + for _ in tf.range(num_steps): + tf.summary.scalar("step", self.step_counter, step=self.step_counter) + self.step_counter.assign_add(1) + return self.step_counter + + def train_with_tpu_summary_optimization_no_decorator(self, num_steps): + for _ in tf.range(num_steps): + tf.summary.scalar("step", self.step_counter, step=self.step_counter) + self.step_counter.assign_add(1) + return self.step_counter + + +class TpuSummariesTest(tf.test.TestCase): + + def setUp(self): + super().setUp() + self.trainer = DummyTrainer() + + def _get_events_from_logdir(self, logdir): + event_files = tf.io.gfile.listdir(logdir) + self.assertLen(event_files, 1) + path = os.path.join(logdir, event_files[0]) + events = list(tf.compat.v1.train.summary_iterator(path)) + return [event for event in events if event.WhichOneof("what") == "summary"] + + def _validate_tpu_summary_optimization(self, function, *args, **kwargs): + logdir = self.get_temp_dir() + with tf.summary.create_file_writer(logdir).as_default(): + with tf.summary.record_if(lambda: self.trainer.step_counter % 20 == 0): + for _ in range(4): + output = 
function(tf.constant(10), *args, **kwargs) + events = self._get_events_from_logdir(logdir) + self.assertLen(events, 2) + self.assertEqual(events[0].step, 0) + self.assertEqual(events[1].step, 20) + return output + + def test_train_with_tpu_summary_optimization(self): + output = self._validate_tpu_summary_optimization( + self.trainer.train_with_tpu_summary_optimization) + self.assertEqual(output, self.trainer.step_counter.numpy()) + + def test_train_with_tpu_summary_optimization_no_decorator(self): + optimized = train_function_with_summaries( + self.trainer.train_with_tpu_summary_optimization_no_decorator) + output = self._validate_tpu_summary_optimization(optimized) + self.assertEqual(output, self.trainer.step_counter.numpy()) + + def test_train_with_tpu_summary_optimization_and_input_signature(self): + output = self._validate_tpu_summary_optimization( + self.trainer.train_with_tpu_summary_optimization_and_input_signature) + self.assertEqual(output, self.trainer.step_counter.numpy()) + function = self.trainer.train_with_tpu_summary_optimization_and_input_signature + expected = (tf.TensorSpec((), dtype=tf.int32),) + input_signature = function.with_summaries.input_signature + self.assertEqual(input_signature, expected) + input_signature = function.without_summaries.input_signature + self.assertEqual(input_signature, expected) + + +if __name__ == "__main__": + tf.test.main() diff --git a/nlp/text_classification/bert/tensorflow2.0/process_data.sh b/nlp/text_classification/bert/tensorflow2.0/process_data.sh new file mode 100644 index 000000000..4483620f1 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/process_data.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +export GLUE_DIR=./data +export BERT_DIR=./pretrained_model/uncased_L-12_H-768_A-12 +export TASK_NAME=MRPC +export OUTPUT_DIR=./datasets +export PYTHONPATH=/home/jingliang.chen/models-2.6.0/:$PYTHONPATH + +python3 ../data/create_finetuning_data.py \ + --input_data_dir=${GLUE_DIR}/${TASK_NAME}/ \ + --vocab_file=${BERT_DIR}/vocab.txt \ + --train_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_train.tf_record \ + --eval_data_output_path=${OUTPUT_DIR}/${TASK_NAME}_eval.tf_record \ + --meta_data_file_path=${OUTPUT_DIR}/${TASK_NAME}_meta_data \ + --fine_tuning_task_type=classification --max_seq_length=128 \ + --classification_task_name=${TASK_NAME} diff --git a/nlp/text_classification/bert/tensorflow2.0/run_classifier.py b/nlp/text_classification/bert/tensorflow2.0/run_classifier.py new file mode 100644 index 000000000..a6e4b10cf --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/run_classifier.py @@ -0,0 +1,561 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""BERT classification or regression finetuning runner in TF 2.x.""" + +import functools +import json +import platform +import re + +import math +import os +import sys +import random +import numpy as np +# Import libraries +from absl import app +from absl import flags +from absl import logging +import gin +import tensorflow as tf +from common import distribute_utils +from modeling import performance +import optimization +import bert_models +import common_flags +import configs as bert_configs +import input_pipeline +import model_saving_utils +from utils.misc import keras_utils + +flags.DEFINE_enum( + 'mode', 'train_and_eval', ['train_and_eval', 'export_only', 'predict'], + 'One of {"train_and_eval", "export_only", "predict"}. `train_and_eval`: ' + 'trains the model and evaluates in the meantime. ' + '`export_only`: will take the latest checkpoint inside ' + 'model_dir and export a `SavedModel`. `predict`: takes a checkpoint and ' + 'restores the model to output predictions on the test set.') +flags.DEFINE_string('train_data_path', None, + 'Path to training data for BERT classifier.') +flags.DEFINE_string('eval_data_path', None, + 'Path to evaluation data for BERT classifier.') +flags.DEFINE_string( + 'input_meta_data_path', None, + 'Path to file that contains meta data about input ' + 'to be used for training and evaluation.') +flags.DEFINE_integer('train_data_size', None, 'Number of training samples ' + 'to use. If None, uses the full train data. ' + '(default: None).') +flags.DEFINE_string('predict_checkpoint_path', None, + 'Path to the checkpoint for predictions.') +flags.DEFINE_integer( + 'num_eval_per_epoch', 1, + 'Number of evaluations per epoch. The purpose of this flag is to provide ' + 'more granular evaluation scores and checkpoints. For example, if original ' + 'data has N samples and num_eval_per_epoch is n, then each epoch will be ' + 'evaluated every N/n samples.') +flags.DEFINE_integer('train_batch_size', 32, 'Batch size for training.') +flags.DEFINE_integer('eval_batch_size', 32, 'Batch size for evaluation.') +flags.DEFINE_float('acc_target', 0.82, 'target accuravy of evaluation.') + +common_flags.define_common_bert_flags() + +FLAGS = flags.FLAGS + +LABEL_TYPES_MAP = {'int': tf.int64, 'float': tf.float32} + + +def get_loss_fn(num_classes): + """Gets the classification loss function.""" + + def classification_loss_fn(labels, logits): + """Classification loss.""" + labels = tf.squeeze(labels) + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot( + tf.cast(labels, dtype=tf.int32), depth=num_classes, dtype=tf.float32) + per_example_loss = -tf.reduce_sum( + tf.cast(one_hot_labels, dtype=tf.float32) * log_probs, axis=-1) + return tf.reduce_mean(per_example_loss) + + return classification_loss_fn + + +def get_dataset_fn(input_file_pattern, + max_seq_length, + global_batch_size, + is_training, + label_type=tf.int64, + include_sample_weights=False, + num_samples=None): + """Gets a closure to create a dataset.""" + + def _dataset_fn(ctx=None): + """Returns tf.data.Dataset for distributed BERT pretraining.""" + batch_size = ctx.get_per_replica_batch_size( + global_batch_size) if ctx else global_batch_size + dataset = input_pipeline.create_classifier_dataset( + tf.io.gfile.glob(input_file_pattern), + max_seq_length, + batch_size, + is_training=is_training, + input_pipeline_context=ctx, + label_type=label_type, + include_sample_weights=include_sample_weights, + num_samples=num_samples) + return dataset + + return _dataset_fn + + +def 
run_bert_classifier(strategy, + bert_config, + input_meta_data, + model_dir, + epochs, + steps_per_epoch, + steps_per_loop, + eval_steps, + warmup_steps, + initial_lr, + init_checkpoint, + train_input_fn, + eval_input_fn, + training_callbacks=True, + custom_callbacks=None, + custom_metrics=None): + """Run BERT classifier training using low-level API.""" + max_seq_length = input_meta_data['max_seq_length'] + num_classes = input_meta_data.get('num_labels', 1) + is_regression = num_classes == 1 + + def _get_classifier_model(): + """Gets a classifier model.""" + classifier_model, core_model = ( + bert_models.classifier_model( + bert_config, + num_classes, + max_seq_length, + hub_module_url=FLAGS.hub_module_url, + hub_module_trainable=FLAGS.hub_module_trainable)) + optimizer = optimization.create_optimizer(initial_lr, + steps_per_epoch * epochs, + warmup_steps, FLAGS.end_lr, + FLAGS.optimizer_type) + classifier_model.optimizer = performance.configure_optimizer( + optimizer, + use_float16=common_flags.use_float16(), + use_graph_rewrite=common_flags.use_graph_rewrite()) + return classifier_model, core_model + + # tf.keras.losses objects accept optional sample_weight arguments (eg. coming + # from the dataset) to compute weighted loss, as used for the regression + # tasks. The classification tasks, using the custom get_loss_fn don't accept + # sample weights though. + loss_fn = (tf.keras.losses.MeanSquaredError() if is_regression + else get_loss_fn(num_classes)) + + # Defines evaluation metrics function, which will create metrics in the + # correct device and strategy scope. + if custom_metrics: + metric_fn = custom_metrics + elif is_regression: + metric_fn = functools.partial( + tf.keras.metrics.MeanSquaredError, + 'mean_squared_error', + dtype=tf.float32) + else: + metric_fn = functools.partial( + tf.keras.metrics.SparseCategoricalAccuracy, + 'accuracy', + dtype=tf.float32) + + # Start training using Keras compile/fit API. 
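+  # (`steps_per_loop` is handed to `run_keras_compile_fit`, which passes it to
+  # `model.compile` as `steps_per_execution`, so several training steps run per
+  # `tf.function` call and per-step host overhead is amortized.)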
+ logging.info('Training using TF 2.x Keras compile/fit API with ' + 'distribution strategy.') + return run_keras_compile_fit( + model_dir, + strategy, + _get_classifier_model, + train_input_fn, + eval_input_fn, + loss_fn, + metric_fn, + init_checkpoint, + epochs, + steps_per_epoch, + steps_per_loop, + eval_steps, + training_callbacks=training_callbacks, + custom_callbacks=custom_callbacks) + + +def run_keras_compile_fit(model_dir, + strategy, + model_fn, + train_input_fn, + eval_input_fn, + loss_fn, + metric_fn, + init_checkpoint, + epochs, + steps_per_epoch, + steps_per_loop, + eval_steps, + training_callbacks=True, + custom_callbacks=None): + """Runs BERT classifier model using Keras compile/fit API.""" + + with strategy.scope(): + training_dataset = train_input_fn() + evaluation_dataset = eval_input_fn() if eval_input_fn else None + bert_model, sub_model = model_fn() + optimizer = bert_model.optimizer + + if init_checkpoint: + checkpoint = tf.train.Checkpoint(model=sub_model, encoder=sub_model) + checkpoint.read(init_checkpoint).assert_existing_objects_matched() + + if not isinstance(metric_fn, (list, tuple)): + metric_fn = [metric_fn] + bert_model.compile( + optimizer=optimizer, + loss=loss_fn, + metrics=[fn() for fn in metric_fn], + steps_per_execution=steps_per_loop) + + summary_dir = os.path.join(model_dir, 'summaries') + summary_callback = tf.keras.callbacks.TensorBoard(summary_dir) + checkpoint = tf.train.Checkpoint(model=bert_model, optimizer=optimizer) + checkpoint_manager = tf.train.CheckpointManager( + checkpoint, + directory=model_dir, + max_to_keep=None, + step_counter=optimizer.iterations, + checkpoint_interval=0) + using_py_ver = platform.python_version() + dest_py_ver = "3.7" + need_bypass = re.match(dest_py_ver, using_py_ver) + if need_bypass: + checkpoint_callback = [] + else: + checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager) + + if training_callbacks: + if custom_callbacks is not None: + custom_callbacks += [summary_callback, checkpoint_callback] + else: + custom_callbacks = [summary_callback, checkpoint_callback] + + history = bert_model.fit( + x=training_dataset, + validation_data=evaluation_dataset, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + validation_steps=eval_steps, + callbacks=custom_callbacks, + verbose=2) + stats = {'total_training_steps': steps_per_epoch * epochs} + if 'loss' in history.history: + stats['train_loss'] = history.history['loss'][-1] + if 'val_accuracy' in history.history: + stats['eval_metrics'] = history.history['val_accuracy'][-1] + return bert_model, stats + + +def get_predictions_and_labels(strategy, + trained_model, + eval_input_fn, + is_regression=False, + return_probs=False): + """Obtains predictions of trained model on evaluation data. + + Note that list of labels is returned along with the predictions because the + order changes on distributing dataset over TPU pods. + + Args: + strategy: Distribution strategy. + trained_model: Trained model with preloaded weights. + eval_input_fn: Input function for evaluation data. + is_regression: Whether it is a regression task. + return_probs: Whether to return probabilities of classes. + + Returns: + predictions: List of predictions. + labels: List of gold labels corresponding to predictions. 
+ """ + + @tf.function + def test_step(iterator): + """Computes predictions on distributed devices.""" + + def _test_step_fn(inputs): + """Replicated predictions.""" + inputs, labels = inputs + logits = trained_model(inputs, training=False) + if not is_regression: + probabilities = tf.nn.softmax(logits) + return probabilities, labels + else: + return logits, labels + + outputs, labels = strategy.run(_test_step_fn, args=(next(iterator),)) + # outputs: current batch logits as a tuple of shard logits + outputs = tf.nest.map_structure(strategy.experimental_local_results, + outputs) + labels = tf.nest.map_structure(strategy.experimental_local_results, labels) + return outputs, labels + + def _run_evaluation(test_iterator): + """Runs evaluation steps.""" + preds, golds = list(), list() + try: + with tf.experimental.async_scope(): + while True: + probabilities, labels = test_step(test_iterator) + for cur_probs, cur_labels in zip(probabilities, labels): + if return_probs: + preds.extend(cur_probs.numpy().tolist()) + else: + preds.extend(tf.math.argmax(cur_probs, axis=1).numpy()) + golds.extend(cur_labels.numpy().tolist()) + except (StopIteration, tf.errors.OutOfRangeError): + tf.experimental.async_clear_error() + return preds, golds + + test_iter = iter(strategy.distribute_datasets_from_function(eval_input_fn)) + predictions, labels = _run_evaluation(test_iter) + + return predictions, labels + + +def export_classifier(model_export_path, input_meta_data, bert_config, + model_dir): + """Exports a trained model as a `SavedModel` for inference. + + Args: + model_export_path: a string specifying the path to the SavedModel directory. + input_meta_data: dictionary containing meta data about input and model. + bert_config: Bert configuration file to define core bert layers. + model_dir: The directory where the model weights and training/evaluation + summaries are stored. + + Raises: + Export path is not specified, got an empty string or None. + """ + if not model_export_path: + raise ValueError('Export path is not specified: %s' % model_export_path) + if not model_dir: + raise ValueError('Export path is not specified: %s' % model_dir) + + # Export uses float32 for now, even if training uses mixed precision. + tf.keras.mixed_precision.set_global_policy('float32') + classifier_model = bert_models.classifier_model( + bert_config, + input_meta_data.get('num_labels', 1), + hub_module_url=FLAGS.hub_module_url, + hub_module_trainable=False)[0] + + model_saving_utils.export_bert_model( + model_export_path, model=classifier_model, checkpoint_dir=model_dir) + + +def run_bert(strategy, + input_meta_data, + model_config, + train_input_fn=None, + eval_input_fn=None, + init_checkpoint=None, + custom_callbacks=None, + custom_metrics=None): + """Run BERT training.""" + # Enables XLA in Session Config. Should not be set for TPU. 
+ keras_utils.set_session_config(FLAGS.enable_xla) + performance.set_mixed_precision_policy(common_flags.dtype()) + + epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch + train_data_size = ( + input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch) + if FLAGS.train_data_size: + train_data_size = min(train_data_size, FLAGS.train_data_size) + logging.info('Updated train_data_size: %s', train_data_size) + steps_per_epoch = int(train_data_size / FLAGS.train_batch_size) + warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size) + eval_steps = int( + math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size)) + + if not strategy: + raise ValueError('Distribution strategy has not been specified.') + + if not custom_callbacks: + custom_callbacks = [] + + if FLAGS.log_steps: + custom_callbacks.append( + keras_utils.TimeHistory( + batch_size=FLAGS.train_batch_size, + log_steps=FLAGS.log_steps, + logdir=FLAGS.model_dir)) + + trained_model, status = run_bert_classifier( + strategy, + model_config, + input_meta_data, + FLAGS.model_dir, + epochs, + steps_per_epoch, + FLAGS.steps_per_loop, + eval_steps, + warmup_steps, + FLAGS.learning_rate, + init_checkpoint or FLAGS.init_checkpoint, + train_input_fn, + eval_input_fn, + custom_callbacks=custom_callbacks, + custom_metrics=custom_metrics) + + + if FLAGS.model_export_path: + model_saving_utils.export_bert_model( + FLAGS.model_export_path, model=trained_model) + return status['eval_metrics'] + + +def custom_main(custom_callbacks=None, custom_metrics=None): + """Run classification or regression. + + Args: + custom_callbacks: list of tf.keras.Callbacks passed to training loop. + custom_metrics: list of metrics passed to the training loop. + """ + gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) + + if FLAGS.distribution_strategy == "multi_worker_mirrored": + FLAGS.worker_hosts = [worker_host for worker_host in FLAGS.worker_hosts.split(",")] + os.environ['TF_CONFIG'] = json.dumps({ + 'cluster': { + 'worker': FLAGS.worker_hosts + }, + 'task': {'type': 'worker', 'index': FLAGS.task_index} + }) + + with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader: + input_meta_data = json.loads(reader.read().decode('utf-8')) + label_type = LABEL_TYPES_MAP[input_meta_data.get('label_type', 'int')] + include_sample_weights = input_meta_data.get('has_sample_weights', False) + + if not FLAGS.model_dir: + FLAGS.model_dir = '/tmp/bert20/' + + bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.mode == 'export_only': + export_classifier(FLAGS.model_export_path, input_meta_data, bert_config, + FLAGS.model_dir) + return + + strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=FLAGS.distribution_strategy, + num_gpus=FLAGS.num_gpus, + tpu_address=FLAGS.tpu, + all_reduce_alg=FLAGS.all_reduce_alg) + eval_input_fn = get_dataset_fn( + FLAGS.eval_data_path, + input_meta_data['max_seq_length'], + FLAGS.eval_batch_size, + is_training=False, + label_type=label_type, + include_sample_weights=include_sample_weights) + + if FLAGS.mode == 'predict': + num_labels = input_meta_data.get('num_labels', 1) + with strategy.scope(): + classifier_model = bert_models.classifier_model( + bert_config, num_labels)[0] + checkpoint = tf.train.Checkpoint(model=classifier_model) + latest_checkpoint_file = ( + FLAGS.predict_checkpoint_path or + tf.train.latest_checkpoint(FLAGS.model_dir)) + assert latest_checkpoint_file + logging.info('Checkpoint file %s found and restoring 
from ' + 'checkpoint', latest_checkpoint_file) + checkpoint.restore( + latest_checkpoint_file).assert_existing_objects_matched() + preds, _ = get_predictions_and_labels( + strategy, + classifier_model, + eval_input_fn, + is_regression=(num_labels == 1), + return_probs=True) + output_predict_file = os.path.join(FLAGS.model_dir, 'test_results.tsv') + with tf.io.gfile.GFile(output_predict_file, 'w') as writer: + logging.info('***** Predict results *****') + for probabilities in preds: + output_line = '\t'.join( + str(class_probability) + for class_probability in probabilities) + '\n' + writer.write(output_line) + return + + if FLAGS.mode != 'train_and_eval': + raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode) + train_input_fn = get_dataset_fn( + FLAGS.train_data_path, + input_meta_data['max_seq_length'], + FLAGS.train_batch_size, + is_training=True, + label_type=label_type, + include_sample_weights=include_sample_weights, + num_samples=FLAGS.train_data_size) + val_acc = run_bert( + strategy, + input_meta_data, + bert_config, + train_input_fn, + eval_input_fn, + custom_callbacks=custom_callbacks, + custom_metrics=custom_metrics) + + if FLAGS.acc_target == 0 or val_acc >= FLAGS.acc_target: + print('task success!') + sys.exit(0) + else: + print('task failed!') + sys.exit(1) + + +def seed_tensorflow(seed=17645): + random.seed(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + np.random.seed(seed) + tf.random.set_seed(seed) + + +def main(_): + try: + from dltest import show_training_arguments + seed_tensorflow(seed=6879) + show_training_arguments(FLAGS) + except: + pass + custom_main(custom_callbacks=None, custom_metrics=None) + + + +if __name__ == '__main__': + flags.mark_flag_as_required('bert_config_file') + flags.mark_flag_as_required('input_meta_data_path') + flags.mark_flag_as_required('model_dir') + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/run_pretraining.py b/nlp/text_classification/bert/tensorflow2.0/run_pretraining.py new file mode 100644 index 000000000..3dfbb8dad --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/run_pretraining.py @@ -0,0 +1,218 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Run masked LM/next sentence pre-training for BERT in TF 2.x.""" + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import gin +import tensorflow as tf +from common import distribute_utils +from modeling import performance +import optimization +import bert_models +import common_flags +import configs +import input_pipeline +import model_training_utils + + +flags.DEFINE_string('input_files', None, + 'File path to retrieve training data for pre-training.') +# Model training specific flags. +flags.DEFINE_integer( + 'max_seq_length', 128, + 'The maximum total input sequence length after WordPiece tokenization. 
' + 'Sequences longer than this will be truncated, and sequences shorter ' + 'than this will be padded.') +flags.DEFINE_integer('max_predictions_per_seq', 20, + 'Maximum predictions per sequence_output.') +flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.') +flags.DEFINE_integer('num_steps_per_epoch', 1000, + 'Total number of training steps to run per epoch.') +flags.DEFINE_float('warmup_steps', 10000, + 'Warmup steps for Adam weight decay optimizer.') +flags.DEFINE_bool('use_next_sentence_label', True, + 'Whether to use next sentence label to compute final loss.') +flags.DEFINE_bool('train_summary_interval', 0, 'Step interval for training ' + 'summaries. If the value is a negative number, ' + 'then training summaries are not enabled.') + +common_flags.define_common_bert_flags() + +FLAGS = flags.FLAGS + + +def get_pretrain_dataset_fn(input_file_pattern, seq_length, + max_predictions_per_seq, global_batch_size, + use_next_sentence_label=True): + """Returns input dataset from input file string.""" + def _dataset_fn(ctx=None): + """Returns tf.data.Dataset for distributed BERT pretraining.""" + input_patterns = input_file_pattern.split(',') + batch_size = ctx.get_per_replica_batch_size(global_batch_size) + train_dataset = input_pipeline.create_pretrain_dataset( + input_patterns, + seq_length, + max_predictions_per_seq, + batch_size, + is_training=True, + input_pipeline_context=ctx, + use_next_sentence_label=use_next_sentence_label) + return train_dataset + + return _dataset_fn + + +def get_loss_fn(): + """Returns loss function for BERT pretraining.""" + + def _bert_pretrain_loss_fn(unused_labels, losses, **unused_args): + return tf.reduce_mean(losses) + + return _bert_pretrain_loss_fn + + +def run_customized_training(strategy, + bert_config, + init_checkpoint, + max_seq_length, + max_predictions_per_seq, + model_dir, + steps_per_epoch, + steps_per_loop, + epochs, + initial_lr, + warmup_steps, + end_lr, + optimizer_type, + input_files, + train_batch_size, + use_next_sentence_label=True, + train_summary_interval=0, + custom_callbacks=None, + explicit_allreduce=False, + pre_allreduce_callbacks=None, + post_allreduce_callbacks=None, + allreduce_bytes_per_pack=0): + """Run BERT pretrain model training using low-level API.""" + + train_input_fn = get_pretrain_dataset_fn(input_files, max_seq_length, + max_predictions_per_seq, + train_batch_size, + use_next_sentence_label) + + def _get_pretrain_model(): + """Gets a pretraining model.""" + pretrain_model, core_model = bert_models.pretrain_model( + bert_config, max_seq_length, max_predictions_per_seq, + use_next_sentence_label=use_next_sentence_label) + optimizer = optimization.create_optimizer( + initial_lr, steps_per_epoch * epochs, warmup_steps, + end_lr, optimizer_type) + pretrain_model.optimizer = performance.configure_optimizer( + optimizer, + use_float16=common_flags.use_float16(), + use_graph_rewrite=common_flags.use_graph_rewrite()) + return pretrain_model, core_model + + trained_model = model_training_utils.run_customized_training_loop( + strategy=strategy, + model_fn=_get_pretrain_model, + loss_fn=get_loss_fn(), + scale_loss=FLAGS.scale_loss, + model_dir=model_dir, + init_checkpoint=init_checkpoint, + train_input_fn=train_input_fn, + steps_per_epoch=steps_per_epoch, + steps_per_loop=steps_per_loop, + epochs=epochs, + sub_model_export_name='pretrained/bert_model', + explicit_allreduce=explicit_allreduce, + pre_allreduce_callbacks=pre_allreduce_callbacks, + post_allreduce_callbacks=post_allreduce_callbacks, + 
allreduce_bytes_per_pack=allreduce_bytes_per_pack, + train_summary_interval=train_summary_interval, + custom_callbacks=custom_callbacks) + + return trained_model + + +def run_bert_pretrain(strategy, custom_callbacks=None): + """Runs BERT pre-training.""" + + bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) + if not strategy: + raise ValueError('Distribution strategy is not specified.') + + # Runs customized training loop. + logging.info('Training using customized training loop TF 2.0 with distributed' + 'strategy.') + + performance.set_mixed_precision_policy(common_flags.dtype()) + + # Only when explicit_allreduce = True, post_allreduce_callbacks and + # allreduce_bytes_per_pack will take effect. optimizer.apply_gradients() no + # longer implicitly allreduce gradients, users manually allreduce gradient and + # pass the allreduced grads_and_vars to apply_gradients(). + # With explicit_allreduce = True, clip_by_global_norm is moved to after + # allreduce. + return run_customized_training( + strategy, + bert_config, + FLAGS.init_checkpoint, # Used to initialize only the BERT submodel. + FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, + FLAGS.model_dir, + FLAGS.num_steps_per_epoch, + FLAGS.steps_per_loop, + FLAGS.num_train_epochs, + FLAGS.learning_rate, + FLAGS.warmup_steps, + FLAGS.end_lr, + FLAGS.optimizer_type, + FLAGS.input_files, + FLAGS.train_batch_size, + FLAGS.use_next_sentence_label, + FLAGS.train_summary_interval, + custom_callbacks=custom_callbacks, + explicit_allreduce=FLAGS.explicit_allreduce, + pre_allreduce_callbacks=[ + model_training_utils.clip_by_global_norm_callback + ], + allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack) + + +def main(_): + gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) + if not FLAGS.model_dir: + FLAGS.model_dir = '/tmp/bert20/' + # Configures cluster spec for multi-worker distribution strategy. + if FLAGS.num_gpus > 0: + _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index) + strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=FLAGS.distribution_strategy, + num_gpus=FLAGS.num_gpus, + all_reduce_alg=FLAGS.all_reduce_alg, + tpu_address=FLAGS.tpu) + if strategy: + print('***** Number of cores used : ', strategy.num_replicas_in_sync) + + run_bert_pretrain(strategy) + + +if __name__ == '__main__': + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/run_squad.py b/nlp/text_classification/bert/tensorflow2.0/run_squad.py new file mode 100644 index 000000000..8494f899d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/run_squad.py @@ -0,0 +1,148 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Run BERT on SQuAD 1.1 and SQuAD 2.0 in TF 2.x.""" + +import json +import os +import time + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import gin +import tensorflow as tf +from common import distribute_utils +import configs as bert_configs +import run_squad_helper +import tokenization +from data import squad_lib as squad_lib_wp +from utils.misc import keras_utils + + +flags.DEFINE_string('vocab_file', None, + 'The vocabulary file that the BERT model was trained on.') + +# More flags can be found in run_squad_helper. +run_squad_helper.define_common_squad_flags() + +FLAGS = flags.FLAGS + + +def train_squad(strategy, + input_meta_data, + custom_callbacks=None, + run_eagerly=False, + init_checkpoint=None, + sub_model_export_name=None): + """Run bert squad training.""" + bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) + init_checkpoint = init_checkpoint or FLAGS.init_checkpoint + run_squad_helper.train_squad(strategy, input_meta_data, bert_config, + custom_callbacks, run_eagerly, init_checkpoint, + sub_model_export_name=sub_model_export_name) + + +def predict_squad(strategy, input_meta_data): + """Makes predictions for the squad dataset.""" + bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + run_squad_helper.predict_squad( + strategy, input_meta_data, tokenizer, bert_config, squad_lib_wp) + + +def eval_squad(strategy, input_meta_data): + """Evaluate on the squad dataset.""" + bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + eval_metrics = run_squad_helper.eval_squad( + strategy, input_meta_data, tokenizer, bert_config, squad_lib_wp) + return eval_metrics + + +def export_squad(model_export_path, input_meta_data): + """Exports a trained model as a `SavedModel` for inference. + + Args: + model_export_path: a string specifying the path to the SavedModel directory. + input_meta_data: dictionary containing meta data about input and model. + + Raises: + Export path is not specified, got an empty string or None. + """ + bert_config = bert_configs.BertConfig.from_json_file(FLAGS.bert_config_file) + run_squad_helper.export_squad(model_export_path, input_meta_data, bert_config) + + +def main(_): + gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param) + + with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader: + input_meta_data = json.loads(reader.read().decode('utf-8')) + + if FLAGS.mode == 'export_only': + export_squad(FLAGS.model_export_path, input_meta_data) + return + + # Configures cluster spec for multi-worker distribution strategy. 
+ if FLAGS.num_gpus > 0: + _ = distribute_utils.configure_cluster(FLAGS.worker_hosts, FLAGS.task_index) + strategy = distribute_utils.get_distribution_strategy( + distribution_strategy=FLAGS.distribution_strategy, + num_gpus=FLAGS.num_gpus, + all_reduce_alg=FLAGS.all_reduce_alg, + tpu_address=FLAGS.tpu) + + if 'train' in FLAGS.mode: + if FLAGS.log_steps: + custom_callbacks = [keras_utils.TimeHistory( + batch_size=FLAGS.train_batch_size, + log_steps=FLAGS.log_steps, + logdir=FLAGS.model_dir, + )] + else: + custom_callbacks = None + + train_squad( + strategy, + input_meta_data, + custom_callbacks=custom_callbacks, + run_eagerly=FLAGS.run_eagerly, + sub_model_export_name=FLAGS.sub_model_export_name, + ) + if 'predict' in FLAGS.mode: + predict_squad(strategy, input_meta_data) + if 'eval' in FLAGS.mode: + eval_metrics = eval_squad(strategy, input_meta_data) + f1_score = eval_metrics['final_f1'] + logging.info('SQuAD eval F1-score: %f', f1_score) + summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval') + summary_writer = tf.summary.create_file_writer(summary_dir) + with summary_writer.as_default(): + # TODO(lehou): write to the correct step number. + tf.summary.scalar('F1-score', f1_score, step=0) + summary_writer.flush() + # Also write eval_metrics to json file. + squad_lib_wp.write_to_json_files( + eval_metrics, os.path.join(summary_dir, 'eval_metrics.json')) + time.sleep(60) + + +if __name__ == '__main__': + flags.mark_flag_as_required('bert_config_file') + flags.mark_flag_as_required('model_dir') + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/run_squad_helper.py b/nlp/text_classification/bert/tensorflow2.0/run_squad_helper.py new file mode 100644 index 000000000..d498babd7 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/run_squad_helper.py @@ -0,0 +1,472 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library for running BERT family models on SQuAD 1.1/2.0 in TF 2.x.""" + +import collections +import json +import os + +from absl import flags +from absl import logging +import tensorflow as tf +from modeling import performance +import optimization +import bert_models +import common_flags +import input_pipeline +import model_saving_utils +import model_training_utils +import squad_evaluate_v1_1 +import squad_evaluate_v2_0 +from data import squad_lib_sp +from utils.misc import keras_utils + + +def define_common_squad_flags(): + """Defines common flags used by SQuAD tasks.""" + flags.DEFINE_enum( + 'mode', 'train_and_eval', [ + 'train_and_eval', 'train_and_predict', 'train', 'eval', 'predict', + 'export_only' + ], 'One of {"train_and_eval", "train_and_predict", ' + '"train", "eval", "predict", "export_only"}. ' + '`train_and_eval`: train & predict to json files & compute eval metrics. ' + '`train_and_predict`: train & predict to json files. ' + '`train`: only trains the model. ' + '`eval`: predict answers from squad json file & compute eval metrics. 
' + '`predict`: predict answers from the squad json file. ' + '`export_only`: will take the latest checkpoint inside ' + 'model_dir and export a `SavedModel`.') + flags.DEFINE_string('train_data_path', '', + 'Training data path with train tfrecords.') + flags.DEFINE_string( + 'input_meta_data_path', None, + 'Path to file that contains meta data about input ' + 'to be used for training and evaluation.') + # Model training specific flags. + flags.DEFINE_integer('train_batch_size', 32, 'Total batch size for training.') + # Predict processing related. + flags.DEFINE_string( + 'predict_file', None, 'SQuAD prediction json file path. ' + '`predict` mode supports multiple files: one can use ' + 'wildcard to specify multiple files and it can also be ' + 'multiple file patterns separated by comma. Note that ' + '`eval` mode only supports a single predict file.') + flags.DEFINE_bool( + 'do_lower_case', True, + 'Whether to lower case the input text. Should be True for uncased ' + 'models and False for cased models.') + flags.DEFINE_float( + 'null_score_diff_threshold', 0.0, + 'If null_score - best_non_null is greater than the threshold, ' + 'predict null. This is only used for SQuAD v2.') + flags.DEFINE_bool( + 'verbose_logging', False, + 'If true, all of the warnings related to data processing will be ' + 'printed. A number of warnings are expected for a normal SQuAD ' + 'evaluation.') + flags.DEFINE_integer('predict_batch_size', 8, + 'Total batch size for prediction.') + flags.DEFINE_integer( + 'n_best_size', 20, + 'The total number of n-best predictions to generate in the ' + 'nbest_predictions.json output file.') + flags.DEFINE_integer( + 'max_answer_length', 30, + 'The maximum length of an answer that can be generated. This is needed ' + 'because the start and end predictions are not conditioned on one ' + 'another.') + + common_flags.define_common_bert_flags() + + +FLAGS = flags.FLAGS + + +def squad_loss_fn(start_positions, end_positions, start_logits, end_logits): + """Returns sparse categorical crossentropy for start/end logits.""" + start_loss = tf.keras.losses.sparse_categorical_crossentropy( + start_positions, start_logits, from_logits=True) + end_loss = tf.keras.losses.sparse_categorical_crossentropy( + end_positions, end_logits, from_logits=True) + + total_loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2 + return total_loss + + +def get_loss_fn(): + """Gets a loss function for squad task.""" + + def _loss_fn(labels, model_outputs): + start_positions = labels['start_positions'] + end_positions = labels['end_positions'] + start_logits, end_logits = model_outputs + return squad_loss_fn(start_positions, end_positions, start_logits, + end_logits) + + return _loss_fn + + +RawResult = collections.namedtuple('RawResult', + ['unique_id', 'start_logits', 'end_logits']) + + +def get_raw_results(predictions): + """Converts multi-replica predictions to RawResult.""" + for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'], + predictions['start_logits'], + predictions['end_logits']): + for values in zip(unique_ids.numpy(), start_logits.numpy(), + end_logits.numpy()): + yield RawResult( + unique_id=values[0], + start_logits=values[1].tolist(), + end_logits=values[2].tolist()) + + +def get_dataset_fn(input_file_pattern, max_seq_length, global_batch_size, + is_training): + """Gets a closure to create a dataset..""" + + def _dataset_fn(ctx=None): + """Returns tf.data.Dataset for distributed BERT pretraining.""" + batch_size = ctx.get_per_replica_batch_size( + 
global_batch_size) if ctx else global_batch_size + dataset = input_pipeline.create_squad_dataset( + input_file_pattern, + max_seq_length, + batch_size, + is_training=is_training, + input_pipeline_context=ctx) + return dataset + + return _dataset_fn + + +def get_squad_model_to_predict(strategy, bert_config, checkpoint_path, + input_meta_data): + """Gets a squad model to make predictions.""" + with strategy.scope(): + # Prediction always uses float32, even if training uses mixed precision. + tf.keras.mixed_precision.set_global_policy('float32') + squad_model, _ = bert_models.squad_model( + bert_config, + input_meta_data['max_seq_length'], + hub_module_url=FLAGS.hub_module_url) + + if checkpoint_path is None: + checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir) + logging.info('Restoring checkpoints from %s', checkpoint_path) + checkpoint = tf.train.Checkpoint(model=squad_model) + checkpoint.restore(checkpoint_path).expect_partial() + return squad_model + + +def predict_squad_customized(strategy, input_meta_data, predict_tfrecord_path, + num_steps, squad_model): + """Make predictions using a Bert-based squad model.""" + predict_dataset_fn = get_dataset_fn( + predict_tfrecord_path, + input_meta_data['max_seq_length'], + FLAGS.predict_batch_size, + is_training=False) + predict_iterator = iter( + strategy.distribute_datasets_from_function(predict_dataset_fn)) + + @tf.function + def predict_step(iterator): + """Predicts on distributed devices.""" + + def _replicated_step(inputs): + """Replicated prediction calculation.""" + x, _ = inputs + unique_ids = x.pop('unique_ids') + start_logits, end_logits = squad_model(x, training=False) + return dict( + unique_ids=unique_ids, + start_logits=start_logits, + end_logits=end_logits) + + outputs = strategy.run(_replicated_step, args=(next(iterator),)) + return tf.nest.map_structure(strategy.experimental_local_results, outputs) + + all_results = [] + for _ in range(num_steps): + predictions = predict_step(predict_iterator) + for result in get_raw_results(predictions): + all_results.append(result) + if len(all_results) % 100 == 0: + logging.info('Made predictions for %d records.', len(all_results)) + return all_results + + +def train_squad(strategy, + input_meta_data, + bert_config, + custom_callbacks=None, + run_eagerly=False, + init_checkpoint=None, + sub_model_export_name=None): + """Run bert squad training.""" + if strategy: + logging.info('Training using customized training loop with distribution' + ' strategy.') + # Enables XLA in Session Config. Should not be set for TPU. 
+ keras_utils.set_session_config(FLAGS.enable_xla) + performance.set_mixed_precision_policy(common_flags.dtype()) + + epochs = FLAGS.num_train_epochs + num_train_examples = input_meta_data['train_data_size'] + max_seq_length = input_meta_data['max_seq_length'] + steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size) + warmup_steps = int(epochs * num_train_examples * 0.1 / FLAGS.train_batch_size) + train_input_fn = get_dataset_fn( + FLAGS.train_data_path, + max_seq_length, + FLAGS.train_batch_size, + is_training=True) + + def _get_squad_model(): + """Get Squad model and optimizer.""" + squad_model, core_model = bert_models.squad_model( + bert_config, + max_seq_length, + hub_module_url=FLAGS.hub_module_url, + hub_module_trainable=FLAGS.hub_module_trainable) + optimizer = optimization.create_optimizer(FLAGS.learning_rate, + steps_per_epoch * epochs, + warmup_steps, FLAGS.end_lr, + FLAGS.optimizer_type) + + squad_model.optimizer = performance.configure_optimizer( + optimizer, + use_float16=common_flags.use_float16(), + use_graph_rewrite=common_flags.use_graph_rewrite()) + return squad_model, core_model + + # Only when explicit_allreduce = True, post_allreduce_callbacks and + # allreduce_bytes_per_pack will take effect. optimizer.apply_gradients() no + # longer implicitly allreduce gradients, users manually allreduce gradient and + # pass the allreduced grads_and_vars to apply_gradients(). + # With explicit_allreduce = True, clip_by_global_norm is moved to after + # allreduce. + model_training_utils.run_customized_training_loop( + strategy=strategy, + model_fn=_get_squad_model, + loss_fn=get_loss_fn(), + model_dir=FLAGS.model_dir, + steps_per_epoch=steps_per_epoch, + steps_per_loop=FLAGS.steps_per_loop, + epochs=epochs, + train_input_fn=train_input_fn, + init_checkpoint=init_checkpoint or FLAGS.init_checkpoint, + sub_model_export_name=sub_model_export_name, + run_eagerly=run_eagerly, + custom_callbacks=custom_callbacks, + explicit_allreduce=FLAGS.explicit_allreduce, + pre_allreduce_callbacks=[ + model_training_utils.clip_by_global_norm_callback + ], + allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack) + + +def prediction_output_squad(strategy, input_meta_data, tokenizer, squad_lib, + predict_file, squad_model): + """Makes predictions for a squad dataset.""" + doc_stride = input_meta_data['doc_stride'] + max_query_length = input_meta_data['max_query_length'] + # Whether data should be in Ver 2.0 format. + version_2_with_negative = input_meta_data.get('version_2_with_negative', + False) + eval_examples = squad_lib.read_squad_examples( + input_file=predict_file, + is_training=False, + version_2_with_negative=version_2_with_negative) + + eval_writer = squad_lib.FeatureWriter( + filename=os.path.join(FLAGS.model_dir, 'eval.tf_record'), + is_training=False) + eval_features = [] + + def _append_feature(feature, is_padding): + if not is_padding: + eval_features.append(feature) + eval_writer.process_feature(feature) + + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. + kwargs = dict( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=input_meta_data['max_seq_length'], + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=False, + output_fn=_append_feature, + batch_size=FLAGS.predict_batch_size) + + # squad_lib_sp requires one more argument 'do_lower_case'. 
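+  # (The SentencePiece pipeline tokenizes inside
+  # `convert_examples_to_features`, so the casing choice must be forwarded
+  # here; the WordPiece pipeline already received it through `tokenizer`.)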
+ if squad_lib == squad_lib_sp: + kwargs['do_lower_case'] = FLAGS.do_lower_case + dataset_size = squad_lib.convert_examples_to_features(**kwargs) + eval_writer.close() + + logging.info('***** Running predictions *****') + logging.info(' Num orig examples = %d', len(eval_examples)) + logging.info(' Num split examples = %d', len(eval_features)) + logging.info(' Batch size = %d', FLAGS.predict_batch_size) + + num_steps = int(dataset_size / FLAGS.predict_batch_size) + all_results = predict_squad_customized(strategy, input_meta_data, + eval_writer.filename, num_steps, + squad_model) + + all_predictions, all_nbest_json, scores_diff_json = ( + squad_lib.postprocess_output( + eval_examples, + eval_features, + all_results, + FLAGS.n_best_size, + FLAGS.max_answer_length, + FLAGS.do_lower_case, + version_2_with_negative=version_2_with_negative, + null_score_diff_threshold=FLAGS.null_score_diff_threshold, + verbose=FLAGS.verbose_logging)) + + return all_predictions, all_nbest_json, scores_diff_json + + +def dump_to_files(all_predictions, + all_nbest_json, + scores_diff_json, + squad_lib, + version_2_with_negative, + file_prefix=''): + """Save output to json files.""" + output_prediction_file = os.path.join(FLAGS.model_dir, + '%spredictions.json' % file_prefix) + output_nbest_file = os.path.join(FLAGS.model_dir, + '%snbest_predictions.json' % file_prefix) + output_null_log_odds_file = os.path.join(FLAGS.model_dir, file_prefix, + '%snull_odds.json' % file_prefix) + logging.info('Writing predictions to: %s', (output_prediction_file)) + logging.info('Writing nbest to: %s', (output_nbest_file)) + + squad_lib.write_to_json_files(all_predictions, output_prediction_file) + squad_lib.write_to_json_files(all_nbest_json, output_nbest_file) + if version_2_with_negative: + squad_lib.write_to_json_files(scores_diff_json, output_null_log_odds_file) + + +def _get_matched_files(input_path): + """Returns all files that matches the input_path.""" + input_patterns = input_path.strip().split(',') + all_matched_files = [] + for input_pattern in input_patterns: + input_pattern = input_pattern.strip() + if not input_pattern: + continue + matched_files = tf.io.gfile.glob(input_pattern) + if not matched_files: + raise ValueError('%s does not match any files.' 
% input_pattern) + else: + all_matched_files.extend(matched_files) + return sorted(all_matched_files) + + +def predict_squad(strategy, + input_meta_data, + tokenizer, + bert_config, + squad_lib, + init_checkpoint=None): + """Get prediction results and evaluate them to hard drive.""" + if init_checkpoint is None: + init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) + + all_predict_files = _get_matched_files(FLAGS.predict_file) + squad_model = get_squad_model_to_predict(strategy, bert_config, + init_checkpoint, input_meta_data) + for idx, predict_file in enumerate(all_predict_files): + all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad( + strategy, input_meta_data, tokenizer, squad_lib, predict_file, + squad_model) + if len(all_predict_files) == 1: + file_prefix = '' + else: + # if predict_file is /path/xquad.ar.json, the `file_prefix` may be + # "xquad.ar-0-" + file_prefix = '%s-' % os.path.splitext( + os.path.basename(all_predict_files[idx]))[0] + dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib, + input_meta_data.get('version_2_with_negative', False), + file_prefix) + + +def eval_squad(strategy, + input_meta_data, + tokenizer, + bert_config, + squad_lib, + init_checkpoint=None): + """Get prediction results and evaluate them against ground truth.""" + if init_checkpoint is None: + init_checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir) + + all_predict_files = _get_matched_files(FLAGS.predict_file) + if len(all_predict_files) != 1: + raise ValueError('`eval_squad` only supports one predict file, ' + 'but got %s' % all_predict_files) + + squad_model = get_squad_model_to_predict(strategy, bert_config, + init_checkpoint, input_meta_data) + all_predictions, all_nbest_json, scores_diff_json = prediction_output_squad( + strategy, input_meta_data, tokenizer, squad_lib, all_predict_files[0], + squad_model) + dump_to_files(all_predictions, all_nbest_json, scores_diff_json, squad_lib, + input_meta_data.get('version_2_with_negative', False)) + + with tf.io.gfile.GFile(FLAGS.predict_file, 'r') as reader: + dataset_json = json.load(reader) + pred_dataset = dataset_json['data'] + if input_meta_data.get('version_2_with_negative', False): + eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions, + scores_diff_json) + else: + eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions) + return eval_metrics + + +def export_squad(model_export_path, input_meta_data, bert_config): + """Exports a trained model as a `SavedModel` for inference. + + Args: + model_export_path: a string specifying the path to the SavedModel directory. + input_meta_data: dictionary containing meta data about input and model. + bert_config: Bert configuration file to define core bert layers. + + Raises: + Export path is not specified, got an empty string or None. + """ + if not model_export_path: + raise ValueError('Export path is not specified: %s' % model_export_path) + # Export uses float32 for now, even if training uses mixed precision. 
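+  # Resetting the global Keras policy here means the SavedModel below is built
+  # with float32 variables and activations regardless of the training dtype.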
+ tf.keras.mixed_precision.set_global_policy('float32') + squad_model, _ = bert_models.squad_model(bert_config, + input_meta_data['max_seq_length']) + model_saving_utils.export_bert_model( + model_export_path, model=squad_model, checkpoint_dir=FLAGS.model_dir) diff --git a/nlp/text_classification/bert/tensorflow2.0/run_train_mirrored.sh b/nlp/text_classification/bert/tensorflow2.0/run_train_mirrored.sh new file mode 100644 index 000000000..f82b2d4d5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/run_train_mirrored.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# Sentence paraphrase classification tasks +# Dataset: GLUE/MRPC +# Model: bert-base + +MODEL="bert-base_uncased_L-12_H-768_A-12" +DATE=`date +%Y%m%d%H%M%S` +WORK_PATH=$(dirname $(readlink -f $0)) +OFFICALPATH=$WORK_PATH/../../../ +LOG_DIR="logs/bert" +BERT_DIR=pretrained_model/uncased_L-12_H-768_A-12 +MODEL_DIR=output_dir +GLUE_DIR=./datasets +TASK=MRPC +BATCH=32 +NUM_GPUS=2 + +export PYTHONPATH=$OFFICALPATH:$PYTHONPATH + +mkdir -p ${LOG_DIR} +rm -rf ${MODEL_DIR} +mkdir -p ${MODEL_DIR} + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +# Download model +if [ ! -d ${BERT_DIR} ]; then + mkdir -p ${BERT_DIR} + cp ${WORK_PATH}/../../../../data/model_zoo/bert/${MODEL}/* ${BERT_DIR} +fi + +# Download data +if [ ! -d ${GLUE_DIR}/${TASK} ]; then + mkdir -p ${GLUE_DIR}/${TASK} + cp ${WORK_PATH}/../../../../data/datasets/MRPC_tf_record/* ${GLUE_DIR}/${TASK} +fi + +pip3 install tensorflow_hub tensorflow_addons gin-config + +time python3 run_classifier.py \ + --mode='train_and_eval' \ + --input_meta_data_path=${GLUE_DIR}/${TASK}/${TASK}_meta_data \ + --train_data_path=${GLUE_DIR}/${TASK}/${TASK}_train.tf_record \ + --eval_data_path=${GLUE_DIR}/${TASK}/${TASK}_eval.tf_record \ + --bert_config_file=${BERT_DIR}/bert_config.json \ + --init_checkpoint=${BERT_DIR}/bert_model.ckpt \ + --train_batch_size=${BATCH} \ + --eval_batch_size=${BATCH} \ + --steps_per_loop=1 \ + --learning_rate=2e-5 \ + --num_train_epochs=3 \ + --model_dir=${MODEL_DIR} \ + --num_gpus=${NUM_GPUS} \ + --all_reduce_alg='nccl' \ + --distribution_strategy=mirrored 2>&1 | tee ${LOG_DIR}/${BATCH}_${DATE}.log; [[ ${PIPESTATUS[0]} == 0 ]] || exit + + if [ ! -f "compare_kv.py" -o ! -f "get_key_value.py" ]; then + bash download_script.sh + if [[ $? 
!= 0 ]]; then + echo "ERROR: download scripts failed" + exit 1 + fi +fi + +python3 get_key_value.py -i ${LOG_DIR}/${BATCH}_${DATE}.log -k 'loss: ' 'accuracy: ' 'val_loss: ' 'val_accuracy: ' -o train_mirrored_bi.json +python3 compare_kv.py -b train_mirrored_bi.json -n train_mirrored_nv.json -i 'val_accuracy: '; check_status +exit ${EXIT_STATUS} diff --git a/nlp/text_classification/bert/tensorflow2.0/run_train_worker_mirrored.sh b/nlp/text_classification/bert/tensorflow2.0/run_train_worker_mirrored.sh new file mode 100644 index 000000000..bb869db76 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/run_train_worker_mirrored.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +# Sentence paraphrase classification tasks +# Dataset: GLUE/MRPC +# Model: bert-base + +MODEL="bert-base_uncased_L-12_H-768_A-12" +DATE=`date +%Y%m%d%H%M%S` +WORK_PATH=$(dirname $(readlink -f $0)) +OFFICALPATH=$WORK_PATH/../../../ +LOG_DIR="logs/bert" +BERT_DIR=pretrained_model/uncased_L-12_H-768_A-12 +MODEL_DIR=output_dir_ +GLUE_DIR=./datasets +TASK=MRPC +BATCH=32 +GPUS=${1:-'0,1'} +WORKER_HOSTS=${2:-'localhost:20001','localhost:20002'} +# GPUS=${1:-'0,1,2,3,4,5,6,7'} +# WORKER_HOSTS=${2:-'localhost:20001','localhost:20002','localhost:20003','localhost:20004','localhost:20005','localhost:20006','localhost:20007','localhost:20008'} + +export PYTHONPATH=$OFFICALPATH:$PYTHONPATH +export TF_CPP_MIN_LOG_LEVEL=0 +export TF_CPP_MAX_VLOG_LEVEL=0 + +mkdir -p ${LOG_DIR} +rm -rf ${MODEL_DIR}* + +EXIT_STATUS=0 +check_status() +{ + if ((${PIPESTATUS[0]} != 0)); then + EXIT_STATUS=1 + fi +} + +# Download model +if [ ! -d ${BERT_DIR} ]; then + mkdir -p ${BERT_DIR} + cp ${WORK_PATH}/../../../../data/model_zoo/bert/${MODEL}/* ${BERT_DIR} +fi + +# Download data +if [ ! -d ${GLUE_DIR}/${TASK} ]; then + mkdir -p ${GLUE_DIR}/${TASK} + cp ${WORK_PATH}/../../../../data/datasets/MRPC_tf_record/* ${GLUE_DIR}/${TASK} +fi + +pip3 install tensorflow_hub tensorflow_addons gin-config + +gpus_array=(`echo $GPUS | tr ',' ' '` ) + +for ((index=0; index<${#gpus_array[@]}; index++)) +do + export CUDA_VISIBLE_DEVICES=${gpus_array[index]} + time python3 run_classifier.py \ + --mode='train_and_eval' \ + --input_meta_data_path=${GLUE_DIR}/${TASK}/${TASK}_meta_data \ + --train_data_path=${GLUE_DIR}/${TASK}/${TASK}_train.tf_record \ + --eval_data_path=${GLUE_DIR}/${TASK}/${TASK}_eval.tf_record \ + --bert_config_file=${BERT_DIR}/bert_config.json \ + --init_checkpoint=${BERT_DIR}/bert_model.ckpt \ + --train_batch_size=${BATCH} \ + --eval_batch_size=${BATCH} \ + --steps_per_loop=1 \ + --learning_rate=2e-5 \ + --num_train_epochs=3 \ + --model_dir=${MODEL_DIR}${index} \ + --distribution_strategy=multi_worker_mirrored \ + --task_index=${index} \ + --worker_hosts=${WORKER_HOSTS} 2>&1 | tee ${LOG_DIR}/${BATCH}_${DATE}_${index}.log || exit & +done + +wait +if [ ! -f "compare_kv.py" -o ! -f "get_key_value.py" ]; then + ./download_script.sh + if [[ $? 
!= 0 ]]; then + echo "ERROR: download scripts failed" + exit 1 + fi +fi + +python3 get_key_value.py -i ${LOG_DIR}/${BATCH}_${DATE}_0.log -k 'loss: ' 'accuracy: ' 'val_loss: ' 'val_accuracy: ' -o train_worker_mirrored_bi.json +python3 compare_kv.py -b train_worker_mirrored_bi.json -n train_worker_mirrored_nv.json -i 'val_accuracy: '; check_status +exit ${EXIT_STATUS} \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/serving.py b/nlp/text_classification/bert/tensorflow2.0/serving.py new file mode 100644 index 000000000..82c102817 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/serving.py @@ -0,0 +1,133 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Examples of SavedModel export for tf-serving.""" + +from absl import app +from absl import flags +import tensorflow as tf + +import bert_models +import configs + +flags.DEFINE_integer( + "sequence_length", None, "Sequence length to parse the tf.Example. If " + "sequence_length > 0, add a signature for serialized " + "tf.Example and define the parsing specification by the " + "sequence_length.") +flags.DEFINE_string("bert_config_file", None, + "Bert configuration file to define core bert layers.") +flags.DEFINE_string("model_checkpoint_path", None, + "File path to TF model checkpoint.") +flags.DEFINE_string("export_path", None, + "Destination folder to export the serving SavedModel.") + +FLAGS = flags.FLAGS + + +class BertServing(tf.keras.Model): + """Bert transformer encoder model for serving.""" + + def __init__(self, bert_config, name_to_features=None, name="serving_model"): + super(BertServing, self).__init__(name=name) + self.encoder = bert_models.get_transformer_encoder( + bert_config, sequence_length=None) + self.name_to_features = name_to_features + + def call(self, inputs): + input_word_ids = inputs["input_ids"] + input_mask = inputs["input_mask"] + input_type_ids = inputs["segment_ids"] + + encoder_outputs, _ = self.encoder( + [input_word_ids, input_mask, input_type_ids]) + return encoder_outputs + + def serve_body(self, input_ids, input_mask=None, segment_ids=None): + if segment_ids is None: + # Requires CLS token is the first token of inputs. + segment_ids = tf.zeros_like(input_ids) + if input_mask is None: + # The mask has 1 for real tokens and 0 for padding tokens. + input_mask = tf.where( + tf.equal(input_ids, 0), tf.zeros_like(input_ids), + tf.ones_like(input_ids)) + + inputs = dict( + input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids) + return self.call(inputs) + + @tf.function + def serve(self, input_ids, input_mask=None, segment_ids=None): + outputs = self.serve_body(input_ids, input_mask, segment_ids) + # Returns a dictionary to control SignatureDef output signature. 
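+    # Only the last encoder output is exposed, keyed as "outputs" in the
+    # exported serving signature.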
+ return {"outputs": outputs[-1]} + + @tf.function + def serve_examples(self, inputs): + features = tf.io.parse_example(inputs, self.name_to_features) + for key in list(features.keys()): + t = features[key] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + features[key] = t + return self.serve( + features["input_ids"], + input_mask=features["input_mask"] if "input_mask" in features else None, + segment_ids=features["segment_ids"] + if "segment_ids" in features else None) + + @classmethod + def export(cls, model, export_dir): + if not isinstance(model, cls): + raise ValueError("Invalid model instance: %s, it should be a %s" % + (model, cls)) + + signatures = { + "serving_default": + model.serve.get_concrete_function( + input_ids=tf.TensorSpec( + shape=[None, None], dtype=tf.int32, name="inputs")), + } + if model.name_to_features: + signatures[ + "serving_examples"] = model.serve_examples.get_concrete_function( + tf.TensorSpec(shape=[None], dtype=tf.string, name="examples")) + tf.saved_model.save(model, export_dir=export_dir, signatures=signatures) + + +def main(_): + sequence_length = FLAGS.sequence_length + if sequence_length is not None and sequence_length > 0: + name_to_features = { + "input_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([sequence_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([sequence_length], tf.int64), + } + else: + name_to_features = None + bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) + serving_model = BertServing( + bert_config=bert_config, name_to_features=name_to_features) + checkpoint = tf.train.Checkpoint(model=serving_model.encoder) + checkpoint.restore(FLAGS.model_checkpoint_path + ).assert_existing_objects_matched().run_restore_ops() + BertServing.export(serving_model, FLAGS.export_path) + + +if __name__ == "__main__": + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("model_checkpoint_path") + flags.mark_flag_as_required("export_path") + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v1_1.py b/nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v1_1.py new file mode 100644 index 000000000..a39f571c3 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v1_1.py @@ -0,0 +1,106 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluation of SQuAD predictions (version 1.1). + +The functions are copied from +https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/. 
+ +The SQuAD dataset is described in this paper: +SQuAD: 100,000+ Questions for Machine Comprehension of Text +Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, Percy Liang +https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf +""" + +import collections +import re +import string + +# pylint: disable=g-bad-import-order + +from absl import logging +# pylint: enable=g-bad-import-order + + +def _normalize_answer(s): + """Lowers text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def _f1_score(prediction, ground_truth): + """Computes F1 score by comparing prediction to ground truth.""" + prediction_tokens = _normalize_answer(prediction).split() + ground_truth_tokens = _normalize_answer(ground_truth).split() + prediction_counter = collections.Counter(prediction_tokens) + ground_truth_counter = collections.Counter(ground_truth_tokens) + common = prediction_counter & ground_truth_counter + num_same = sum(common.values()) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def _exact_match_score(prediction, ground_truth): + """Checks if predicted answer exactly matches ground truth answer.""" + return _normalize_answer(prediction) == _normalize_answer(ground_truth) + + +def _metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + """Computes the max over all metric scores.""" + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def evaluate(dataset, predictions): + """Evaluates predictions for a dataset.""" + f1 = exact_match = total = 0 + for article in dataset: + for paragraph in article["paragraphs"]: + for qa in paragraph["qas"]: + total += 1 + if qa["id"] not in predictions: + message = "Unanswered question " + qa["id"] + " will receive score 0." + logging.error(message) + continue + ground_truths = [entry["text"] for entry in qa["answers"]] + prediction = predictions[qa["id"]] + exact_match += _metric_max_over_ground_truths(_exact_match_score, + prediction, ground_truths) + f1 += _metric_max_over_ground_truths(_f1_score, prediction, + ground_truths) + + exact_match = exact_match / total + f1 = f1 / total + + return {"exact_match": exact_match, "final_f1": f1} diff --git a/nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v2_0.py b/nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v2_0.py new file mode 100644 index 000000000..12c5a7e3d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/squad_evaluate_v2_0.py @@ -0,0 +1,249 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluation script for SQuAD version 2.0. + +The functions are copied and modified from +https://raw.githubusercontent.com/white127/SQUAD-2.0-bidaf/master/evaluate-v2.0.py + +In addition to basic functionality, we also compute additional statistics and +plot precision-recall curves if an additional na_prob.json file is provided. +This file is expected to map question ID's to the model's predicted probability +that a question is unanswerable. +""" + +import collections +import re +import string + +from absl import logging + + +def _make_qid_to_has_ans(dataset): + qid_to_has_ans = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid_to_has_ans[qa['id']] = bool(qa['answers']) + return qid_to_has_ans + + +def _normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + def white_space_fix(text): + return ' '.join(text.split()) + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def _get_tokens(s): + if not s: return [] + return _normalize_answer(s).split() + + +def _compute_exact(a_gold, a_pred): + return int(_normalize_answer(a_gold) == _normalize_answer(a_pred)) + + +def _compute_f1(a_gold, a_pred): + """Compute F1-score.""" + gold_toks = _get_tokens(a_gold) + pred_toks = _get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if not gold_toks or not pred_toks: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def _get_raw_scores(dataset, predictions): + """Compute raw scores.""" + exact_scores = {} + f1_scores = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid = qa['id'] + gold_answers = [a['text'] for a in qa['answers'] + if _normalize_answer(a['text'])] + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + if qid not in predictions: + logging.error('Missing prediction for %s', qid) + continue + a_pred = predictions[qid] + # Take max over all gold answers + exact_scores[qid] = max(_compute_exact(a, a_pred) for a in gold_answers) + f1_scores[qid] = max(_compute_f1(a, a_pred) for a in gold_answers) + return exact_scores, f1_scores + + +def _apply_no_ans_threshold( + scores, na_probs, qid_to_has_ans, na_prob_thresh=1.0): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def _make_eval_dict(exact_scores, f1_scores, qid_list=None): + """Make 
evaluation result dictionary.""" + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ('total', total), + ]) + + +def _merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + + +def _make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans): + """Make evaluation dictionary containing average recision recall.""" + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + true_pos = 0.0 + cur_p = 1.0 + cur_r = 0.0 + precisions = [1.0] + recalls = [0.0] + avg_prec = 0.0 + for i, qid in enumerate(qid_list): + if qid_to_has_ans[qid]: + true_pos += scores[qid] + cur_p = true_pos / float(i+1) + cur_r = true_pos / float(num_true_pos) + if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: + # i.e., if we can put a threshold after this point + avg_prec += cur_p * (cur_r - recalls[-1]) + precisions.append(cur_p) + recalls.append(cur_r) + return {'ap': 100.0 * avg_prec} + + +def _run_precision_recall_analysis( + main_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans): + """Run precision recall analysis and return result dictionary.""" + num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) + if num_true_pos == 0: + return + pr_exact = _make_precision_recall_eval( + exact_raw, na_probs, num_true_pos, qid_to_has_ans) + pr_f1 = _make_precision_recall_eval( + f1_raw, na_probs, num_true_pos, qid_to_has_ans) + oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} + pr_oracle = _make_precision_recall_eval( + oracle_scores, na_probs, num_true_pos, qid_to_has_ans) + _merge_eval(main_eval, pr_exact, 'pr_exact') + _merge_eval(main_eval, pr_f1, 'pr_f1') + _merge_eval(main_eval, pr_oracle, 'pr_oracle') + + +def _find_best_thresh(predictions, scores, na_probs, qid_to_has_ans): + """Find the best threshold for no answer probability.""" + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for qid in qid_list: + if qid not in scores: continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if predictions[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def _find_all_best_thresh( + main_eval, predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = _find_best_thresh( + predictions, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = _find_best_thresh( + predictions, f1_raw, na_probs, qid_to_has_ans) + main_eval['final_exact'] = best_exact + main_eval['final_exact_thresh'] = exact_thresh + main_eval['final_f1'] = best_f1 + main_eval['final_f1_thresh'] = f1_thresh + + +def evaluate(dataset, predictions, na_probs=None): + """Evaluate prediction results.""" + new_orig_data = [] + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + if qa['id'] in predictions: + new_para = {'qas': [qa]} + new_article = {'paragraphs': [new_para]} + new_orig_data.append(new_article) + dataset = new_orig_data + + 
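+  # When no na_prob.json is supplied, treat every prediction as answerable
+  # (unanswerable probability 0.0 for all question ids).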
if na_probs is None: + na_probs = {k: 0.0 for k in predictions} + qid_to_has_ans = _make_qid_to_has_ans(dataset) # maps qid to True/False + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = _get_raw_scores(dataset, predictions) + exact_thresh = _apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans) + f1_thresh = _apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans) + out_eval = _make_eval_dict(exact_thresh, f1_thresh) + if has_ans_qids: + has_ans_eval = _make_eval_dict( + exact_thresh, f1_thresh, qid_list=has_ans_qids) + _merge_eval(out_eval, has_ans_eval, 'HasAns') + if no_ans_qids: + no_ans_eval = _make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) + _merge_eval(out_eval, no_ans_eval, 'NoAns') + + _find_all_best_thresh( + out_eval, predictions, exact_raw, f1_raw, na_probs, qid_to_has_ans) + _run_precision_recall_analysis( + out_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans) + return out_eval diff --git a/nlp/text_classification/bert/tensorflow2.0/staging/__init__.py b/nlp/text_classification/bert/tensorflow2.0/staging/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/staging/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/staging/training/__init__.py b/nlp/text_classification/bert/tensorflow2.0/staging/training/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/staging/training/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/staging/training/grad_utils.py b/nlp/text_classification/bert/tensorflow2.0/staging/training/grad_utils.py new file mode 100644 index 000000000..1113d39d5 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/staging/training/grad_utils.py @@ -0,0 +1,151 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Some gradient util functions to help users writing custom training loop.""" + +from absl import logging + +import tensorflow as tf + + +def _filter_grads(grads_and_vars): + """Filter out iterable with grad equal to None.""" + grads_and_vars = tuple(grads_and_vars) + if not grads_and_vars: + return grads_and_vars + filtered = [] + vars_with_empty_grads = [] + for grad, var in grads_and_vars: + if grad is None: + vars_with_empty_grads.append(var) + else: + filtered.append((grad, var)) + filtered = tuple(filtered) + if not filtered: + raise ValueError("No gradients provided for any variable: %s." % + ([v.name for _, v in grads_and_vars],)) + if vars_with_empty_grads: + logging.warning( + ("Gradients do not exist for variables %s when minimizing the loss."), + ([v.name for v in vars_with_empty_grads])) + return filtered + + +def _filter_and_allreduce_gradients(grads_and_vars, + allreduce_precision="float32", + bytes_per_pack=0): + """Filter None grads and then allreduce gradients in specified precision. + + This utils function is used when users intent to explicitly allreduce + gradients and customize gradients operations before and after allreduce. + The allreduced gradients are then passed to optimizer.apply_gradients( + experimental_aggregate_gradients=False). + + Args: + grads_and_vars: gradients and variables pairs. + allreduce_precision: Whether to allreduce gradients in float32 or float16. + bytes_per_pack: A non-negative integer. Breaks collective operations into + packs of certain size. If it's zero, all gradients are in one pack. + + Returns: + pairs of allreduced non-None gradients and variables. + """ + filtered_grads_and_vars = _filter_grads(grads_and_vars) + (grads, variables) = zip(*filtered_grads_and_vars) + if allreduce_precision == "float16": + grads = [tf.cast(grad, "float16") for grad in grads] + hints = tf.distribute.experimental.CommunicationOptions( + bytes_per_pack=bytes_per_pack) + allreduced_grads = tf.distribute.get_strategy( # pylint: disable=protected-access + ).extended._replica_ctx_all_reduce(tf.distribute.ReduceOp.SUM, grads, hints) + if allreduce_precision == "float16": + allreduced_grads = [tf.cast(grad, "float32") for grad in allreduced_grads] + return allreduced_grads, variables + + +def _run_callbacks(callbacks, grads_and_vars): + for callback in callbacks: + grads_and_vars = callback(grads_and_vars) + return grads_and_vars + + +def minimize_using_explicit_allreduce(tape, + optimizer, + loss, + trainable_variables, + pre_allreduce_callbacks=None, + post_allreduce_callbacks=None, + allreduce_bytes_per_pack=0): + """Minimizes loss for one step by updating `trainable_variables`. + + Minimizes loss for one step by updating `trainable_variables`. + This explicitly performs gradient allreduce, instead of relying on implicit + allreduce in optimizer.apply_gradients(). If training using FP16 mixed + precision, explicit allreduce will aggregate gradients in FP16 format. + For TPU and GPU training using FP32, explicit allreduce will aggregate + gradients in FP32 format. + + Args: + tape: An instance of `tf.GradientTape`. 
+ optimizer: An instance of `tf.keras.optimizers.Optimizer`. + loss: the loss tensor. + trainable_variables: A list of model Variables. + pre_allreduce_callbacks: A list of callback functions that takes gradients + and model variables pairs as input, manipulate them, and returns a new + gradients and model variables pairs. The callback functions will be + invoked in the list order and before gradients are allreduced. With + mixed precision training, the pre_allreduce_allbacks will be applied on + scaled_gradients. Default is no callbacks. + post_allreduce_callbacks: A list of callback functions that takes + gradients and model variables pairs as input, manipulate them, and + returns a new gradients and model variables paris. The callback + functions will be invoked in the list order and right before gradients + are applied to variables for updates. Default is no callbacks. + allreduce_bytes_per_pack: A non-negative integer. Breaks collective + operations into packs of certain size. If it's zero, all gradients are + in one pack. + """ + if isinstance(optimizer, + tf.keras.mixed_precision.LossScaleOptimizer): + # FP16 GPU code path + with tape: + scaled_loss = optimizer.get_scaled_loss(loss) + scaled_grads = tape.gradient(scaled_loss, trainable_variables) + grads_and_vars = zip(scaled_grads, trainable_variables) + if pre_allreduce_callbacks: + grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars) + (allreduced_scaled_grads, + filtered_training_vars) = _filter_and_allreduce_gradients( + grads_and_vars, + allreduce_precision="float16", + bytes_per_pack=allreduce_bytes_per_pack) + allreduced_unscaled_grads = optimizer.get_unscaled_gradients( + allreduced_scaled_grads) + grads_and_vars = zip(allreduced_unscaled_grads, filtered_training_vars) + else: + # TPU or FP32 GPU code path + grads = tape.gradient(loss, trainable_variables) + grads_and_vars = zip(grads, trainable_variables) + if pre_allreduce_callbacks: + grads_and_vars = _run_callbacks(pre_allreduce_callbacks, grads_and_vars) + (allreduced_grads, + filtered_training_vars) = _filter_and_allreduce_gradients( + grads_and_vars, + allreduce_precision="float32", + bytes_per_pack=allreduce_bytes_per_pack) + grads_and_vars = zip(allreduced_grads, filtered_training_vars) + if post_allreduce_callbacks: + grads_and_vars = _run_callbacks(post_allreduce_callbacks, grads_and_vars) + optimizer.apply_gradients( + grads_and_vars, experimental_aggregate_gradients=False) diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/__init__.py b/nlp/text_classification/bert/tensorflow2.0/tasks/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
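For reference, a minimal sketch (not part of this patch) of how `minimize_using_explicit_allreduce` from the `grad_utils.py` file above is typically driven from a custom distributed train step. The `model`, `optimizer`, and `loss_fn` names are placeholders, and the import path is only assumed from the directory layout added here:

    import tensorflow as tf
    from staging.training import grad_utils  # import path assumed from this patch's layout

    def train_step(model, optimizer, loss_fn, inputs, labels):
      # model, optimizer, loss_fn are placeholder arguments, not names from this patch.
      with tf.GradientTape() as tape:
        outputs = model(inputs, training=True)
        loss = loss_fn(labels, outputs)
      # The helper filters None gradients, allreduces them explicitly, and then
      # calls apply_gradients(..., experimental_aggregate_gradients=False), so
      # gradients are only reduced once.
      grad_utils.minimize_using_explicit_allreduce(
          tape, optimizer, loss, model.trainable_variables,
          pre_allreduce_callbacks=None,
          post_allreduce_callbacks=None,
          allreduce_bytes_per_pack=0)
      return loss

Because the allreduce runs in a replica context, such a step would be invoked through `strategy.run(...)` under the chosen `tf.distribute` strategy.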
+ diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/electra_task.py b/nlp/text_classification/bert/tensorflow2.0/tasks/electra_task.py new file mode 100644 index 000000000..208f725a4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/electra_task.py @@ -0,0 +1,242 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ELECTRA pretraining task (Joint Masked LM and Replaced Token Detection).""" + +import dataclasses +import tensorflow as tf + +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling import tf_utils +from nlp_configs import bert +from nlp_configs import electra +from nlp_configs import encoders +from nlp.data import pretrain_dataloader +from nlp_modeling import layers +from nlp_modeling import models + + +@dataclasses.dataclass +class ElectraPretrainConfig(cfg.TaskConfig): + """The model config.""" + model: electra.ElectraPretrainerConfig = electra.ElectraPretrainerConfig( + cls_heads=[ + bert.ClsHeadConfig( + inner_dim=768, + num_classes=2, + dropout_rate=0.1, + name='next_sentence') + ]) + train_data: cfg.DataConfig = cfg.DataConfig() + validation_data: cfg.DataConfig = cfg.DataConfig() + + +def _build_pretrainer( + config: electra.ElectraPretrainerConfig) -> models.ElectraPretrainer: + """Instantiates ElectraPretrainer from the config.""" + generator_encoder_cfg = config.generator_encoder + discriminator_encoder_cfg = config.discriminator_encoder + # Copy discriminator's embeddings to generator for easier model serialization. 
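+  # (The actual weight tying below only happens when config.tie_embeddings is
+  # set; otherwise the generator gets its own embedding table.)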
+ discriminator_network = encoders.build_encoder(discriminator_encoder_cfg) + if config.tie_embeddings: + embedding_layer = discriminator_network.get_embedding_layer() + generator_network = encoders.build_encoder( + generator_encoder_cfg, embedding_layer=embedding_layer) + else: + generator_network = encoders.build_encoder(generator_encoder_cfg) + + generator_encoder_cfg = generator_encoder_cfg.get() + return models.ElectraPretrainer( + generator_network=generator_network, + discriminator_network=discriminator_network, + vocab_size=generator_encoder_cfg.vocab_size, + num_classes=config.num_classes, + sequence_length=config.sequence_length, + num_token_predictions=config.num_masked_tokens, + mlm_activation=tf_utils.get_activation( + generator_encoder_cfg.hidden_activation), + mlm_initializer=tf.keras.initializers.TruncatedNormal( + stddev=generator_encoder_cfg.initializer_range), + classification_heads=[ + layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads + ], + disallow_correct=config.disallow_correct) + + +@task_factory.register_task_cls(ElectraPretrainConfig) +class ElectraPretrainTask(base_task.Task): + """ELECTRA Pretrain Task (Masked LM + Replaced Token Detection).""" + + def build_model(self): + return _build_pretrainer(self.task_config.model) + + def build_losses(self, + labels, + model_outputs, + metrics, + aux_losses=None) -> tf.Tensor: + metrics = dict([(metric.name, metric) for metric in metrics]) + + # generator lm and (optional) nsp loss. + lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy( + labels['masked_lm_ids'], + tf.cast(model_outputs['lm_outputs'], tf.float32), + from_logits=True) + lm_label_weights = labels['masked_lm_weights'] + lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights) + lm_denominator_loss = tf.reduce_sum(lm_label_weights) + mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss) + metrics['lm_example_loss'].update_state(mlm_loss) + if 'next_sentence_labels' in labels: + sentence_labels = labels['next_sentence_labels'] + sentence_outputs = tf.cast( + model_outputs['sentence_outputs'], dtype=tf.float32) + sentence_loss = tf.keras.losses.sparse_categorical_crossentropy( + sentence_labels, sentence_outputs, from_logits=True) + metrics['next_sentence_loss'].update_state(sentence_loss) + total_loss = mlm_loss + sentence_loss + else: + total_loss = mlm_loss + + # discriminator replaced token detection (rtd) loss. 
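+    # Per-token sigmoid cross-entropy, masked with input_mask and averaged
+    # over the real (non-padding) tokens only.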
+ rtd_logits = model_outputs['disc_logits'] + rtd_labels = tf.cast(model_outputs['disc_label'], tf.float32) + input_mask = tf.cast(labels['input_mask'], tf.float32) + rtd_ind_loss = tf.nn.sigmoid_cross_entropy_with_logits( + logits=rtd_logits, labels=rtd_labels) + rtd_numerator = tf.reduce_sum(input_mask * rtd_ind_loss) + rtd_denominator = tf.reduce_sum(input_mask) + rtd_loss = tf.math.divide_no_nan(rtd_numerator, rtd_denominator) + metrics['discriminator_loss'].update_state(rtd_loss) + total_loss = total_loss + \ + self.task_config.model.discriminator_loss_weight * rtd_loss + + if aux_losses: + total_loss += tf.add_n(aux_losses) + + metrics['total_loss'].update_state(total_loss) + return total_loss + + def build_inputs(self, params, input_context=None): + """Returns tf.data.Dataset for pretraining.""" + if params.input_path == 'dummy': + + def dummy_data(_): + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + dummy_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32) + return dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids, + masked_lm_positions=dummy_lm, + masked_lm_ids=dummy_lm, + masked_lm_weights=tf.cast(dummy_lm, dtype=tf.float32), + next_sentence_labels=tf.zeros((1, 1), dtype=tf.int32)) + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + return pretrain_dataloader.BertPretrainDataLoader(params).load( + input_context) + + def build_metrics(self, training=None): + del training + metrics = [ + tf.keras.metrics.SparseCategoricalAccuracy(name='masked_lm_accuracy'), + tf.keras.metrics.Mean(name='lm_example_loss'), + tf.keras.metrics.SparseCategoricalAccuracy( + name='discriminator_accuracy'), + ] + if self.task_config.train_data.use_next_sentence_label: + metrics.append( + tf.keras.metrics.SparseCategoricalAccuracy( + name='next_sentence_accuracy')) + metrics.append(tf.keras.metrics.Mean(name='next_sentence_loss')) + + metrics.append(tf.keras.metrics.Mean(name='discriminator_loss')) + metrics.append(tf.keras.metrics.Mean(name='total_loss')) + + return metrics + + def process_metrics(self, metrics, labels, model_outputs): + metrics = dict([(metric.name, metric) for metric in metrics]) + if 'masked_lm_accuracy' in metrics: + metrics['masked_lm_accuracy'].update_state(labels['masked_lm_ids'], + model_outputs['lm_outputs'], + labels['masked_lm_weights']) + if 'next_sentence_accuracy' in metrics: + metrics['next_sentence_accuracy'].update_state( + labels['next_sentence_labels'], model_outputs['sentence_outputs']) + if 'discriminator_accuracy' in metrics: + disc_logits_expanded = tf.expand_dims(model_outputs['disc_logits'], -1) + discrim_full_logits = tf.concat( + [-1.0 * disc_logits_expanded, disc_logits_expanded], -1) + metrics['discriminator_accuracy'].update_state( + model_outputs['disc_label'], discrim_full_logits, + labels['input_mask']) + + def train_step(self, inputs, model: tf.keras.Model, + optimizer: tf.keras.optimizers.Optimizer, metrics): + """Does forward and backward. + + Args: + inputs: a dictionary of input tensors. + model: the model, forward pass definition. + optimizer: the optimizer for this training step. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + with tf.GradientTape() as tape: + outputs = model(inputs, training=True) + # Computes per-replica loss. 
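+      # The unscaled loss is what gets reported in the logs; only the copy fed
+      # to tape.gradient() below is divided by the number of replicas.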
+ loss = self.build_losses( + labels=inputs, + model_outputs=outputs, + metrics=metrics, + aux_losses=model.losses) + # Scales loss as the default gradients allreduce performs sum inside the + # optimizer. + scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync + tvars = model.trainable_variables + grads = tape.gradient(scaled_loss, tvars) + optimizer.apply_gradients(list(zip(grads, tvars))) + self.process_metrics(metrics, inputs, outputs) + return {self.loss: loss} + + def validation_step(self, inputs, model: tf.keras.Model, metrics): + """Validatation step. + + Args: + inputs: a dictionary of input tensors. + model: the keras.Model. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + outputs = model(inputs, training=False) + loss = self.build_losses( + labels=inputs, + model_outputs=outputs, + metrics=metrics, + aux_losses=model.losses) + self.process_metrics(metrics, inputs, outputs) + return {self.loss: loss} diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/masked_lm.py b/nlp/text_classification/bert/tensorflow2.0/tasks/masked_lm.py new file mode 100644 index 000000000..7a3d23497 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/masked_lm.py @@ -0,0 +1,200 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Masked language task.""" + +import dataclasses +import tensorflow as tf + +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling import tf_utils +from nlp_configs import bert +from nlp_configs import encoders +from nlp.data import data_loader_factory +from nlp_modeling import layers +from nlp_modeling import models + + +@dataclasses.dataclass +class MaskedLMConfig(cfg.TaskConfig): + """The model config.""" + model: bert.PretrainerConfig = bert.PretrainerConfig(cls_heads=[ + bert.ClsHeadConfig( + inner_dim=768, num_classes=2, dropout_rate=0.1, name='next_sentence') + ]) + # TODO(b/154564893): Mathematically, scale_loss should be True. + # However, it works better with scale_loss being False. 
+ scale_loss: bool = False + train_data: cfg.DataConfig = cfg.DataConfig() + validation_data: cfg.DataConfig = cfg.DataConfig() + + +@task_factory.register_task_cls(MaskedLMConfig) +class MaskedLMTask(base_task.Task): + """Task object for Mask language modeling.""" + + def _build_encoder(self, encoder_cfg): + return encoders.build_encoder(encoder_cfg) + + def build_model(self, params=None): + config = params or self.task_config.model + encoder_cfg = config.encoder + encoder_network = self._build_encoder(encoder_cfg) + cls_heads = [ + layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads + ] if config.cls_heads else [] + return models.BertPretrainerV2( + mlm_activation=tf_utils.get_activation(config.mlm_activation), + mlm_initializer=tf.keras.initializers.TruncatedNormal( + stddev=config.mlm_initializer_range), + encoder_network=encoder_network, + classification_heads=cls_heads) + + def build_losses(self, + labels, + model_outputs, + metrics, + aux_losses=None) -> tf.Tensor: + with tf.name_scope('MaskedLMTask/losses'): + metrics = dict([(metric.name, metric) for metric in metrics]) + lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy( + labels['masked_lm_ids'], + tf.cast(model_outputs['mlm_logits'], tf.float32), + from_logits=True) + lm_label_weights = labels['masked_lm_weights'] + lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * + lm_label_weights) + lm_denominator_loss = tf.reduce_sum(lm_label_weights) + mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss) + metrics['lm_example_loss'].update_state(mlm_loss) + if 'next_sentence_labels' in labels: + sentence_labels = labels['next_sentence_labels'] + sentence_outputs = tf.cast( + model_outputs['next_sentence'], dtype=tf.float32) + sentence_loss = tf.reduce_mean( + tf.keras.losses.sparse_categorical_crossentropy( + sentence_labels, sentence_outputs, from_logits=True)) + metrics['next_sentence_loss'].update_state(sentence_loss) + total_loss = mlm_loss + sentence_loss + else: + total_loss = mlm_loss + + if aux_losses: + total_loss += tf.add_n(aux_losses) + return total_loss + + def build_inputs(self, params, input_context=None): + """Returns tf.data.Dataset for pretraining.""" + if params.input_path == 'dummy': + + def dummy_data(_): + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + dummy_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32) + return dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids, + masked_lm_positions=dummy_lm, + masked_lm_ids=dummy_lm, + masked_lm_weights=tf.cast(dummy_lm, dtype=tf.float32), + next_sentence_labels=tf.zeros((1, 1), dtype=tf.int32)) + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + return data_loader_factory.get_data_loader(params).load(input_context) + + def build_metrics(self, training=None): + del training + metrics = [ + tf.keras.metrics.SparseCategoricalAccuracy(name='masked_lm_accuracy'), + tf.keras.metrics.Mean(name='lm_example_loss') + ] + # TODO(hongkuny): rethink how to manage metrics creation with heads. 
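+    # Next-sentence metrics are only created when the training data actually
+    # carries next_sentence_labels.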
+ if self.task_config.train_data.use_next_sentence_label: + metrics.append( + tf.keras.metrics.SparseCategoricalAccuracy( + name='next_sentence_accuracy')) + metrics.append(tf.keras.metrics.Mean(name='next_sentence_loss')) + return metrics + + def process_metrics(self, metrics, labels, model_outputs): + with tf.name_scope('MaskedLMTask/process_metrics'): + metrics = dict([(metric.name, metric) for metric in metrics]) + if 'masked_lm_accuracy' in metrics: + metrics['masked_lm_accuracy'].update_state( + labels['masked_lm_ids'], model_outputs['mlm_logits'], + labels['masked_lm_weights']) + if 'next_sentence_accuracy' in metrics: + metrics['next_sentence_accuracy'].update_state( + labels['next_sentence_labels'], model_outputs['next_sentence']) + + def train_step(self, inputs, model: tf.keras.Model, + optimizer: tf.keras.optimizers.Optimizer, metrics): + """Does forward and backward. + + Args: + inputs: a dictionary of input tensors. + model: the model, forward pass definition. + optimizer: the optimizer for this training step. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + with tf.GradientTape() as tape: + outputs = model(inputs, training=True) + # Computes per-replica loss. + loss = self.build_losses( + labels=inputs, + model_outputs=outputs, + metrics=metrics, + aux_losses=model.losses) + if self.task_config.scale_loss: + # Scales loss as the default gradients allreduce performs sum inside the + # optimizer. + scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync + tvars = model.trainable_variables + if self.task_config.scale_loss: + grads = tape.gradient(scaled_loss, tvars) + else: + grads = tape.gradient(loss, tvars) + optimizer.apply_gradients(list(zip(grads, tvars))) + self.process_metrics(metrics, inputs, outputs) + return {self.loss: loss} + + def validation_step(self, inputs, model: tf.keras.Model, metrics): + """Validatation step. + + Args: + inputs: a dictionary of input tensors. + model: the keras.Model. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + outputs = self.inference_step(inputs, model) + loss = self.build_losses( + labels=inputs, + model_outputs=outputs, + metrics=metrics, + aux_losses=model.losses) + self.process_metrics(metrics, inputs, outputs) + return {self.loss: loss} diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/question_answering.py b/nlp/text_classification/bert/tensorflow2.0/tasks/question_answering.py new file mode 100644 index 000000000..6035d6b1b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/question_answering.py @@ -0,0 +1,498 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Question answering task.""" +import functools +import json +import os +from typing import List, Optional + +from absl import logging +import dataclasses +import orbit +import tensorflow as tf + +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling.hyperparams import base_config +from nlp.bert import squad_evaluate_v1_1 +from nlp.bert import squad_evaluate_v2_0 +from nlp.bert import tokenization +from nlp_configs import encoders +from nlp.data import data_loader_factory +from nlp.data import squad_lib as squad_lib_wp +from nlp.data import squad_lib_sp +from nlp_modeling import models +from tasks import utils + + +@dataclasses.dataclass +class ModelConfig(base_config.Config): + """A base span labeler configuration.""" + encoder: encoders.EncoderConfig = encoders.EncoderConfig() + + +@dataclasses.dataclass +class QuestionAnsweringConfig(cfg.TaskConfig): + """The model config.""" + # At most one of `init_checkpoint` and `hub_module_url` can be specified. + init_checkpoint: str = '' + hub_module_url: str = '' + n_best_size: int = 20 + max_answer_length: int = 30 + null_score_diff_threshold: float = 0.0 + model: ModelConfig = ModelConfig() + train_data: cfg.DataConfig = cfg.DataConfig() + validation_data: cfg.DataConfig = cfg.DataConfig() + + +@dataclasses.dataclass +class RawAggregatedResult: + """Raw representation for SQuAD predictions.""" + unique_id: int + start_logits: List[float] + end_logits: List[float] + start_indexes: Optional[List[int]] = None + end_indexes: Optional[List[int]] = None + class_logits: Optional[float] = None + + +@task_factory.register_task_cls(QuestionAnsweringConfig) +class QuestionAnsweringTask(base_task.Task): + """Task object for question answering.""" + + def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None): + super().__init__(params, logging_dir, name=name) + + if params.validation_data is None: + return + + if params.validation_data.tokenization == 'WordPiece': + self.squad_lib = squad_lib_wp + elif params.validation_data.tokenization == 'SentencePiece': + self.squad_lib = squad_lib_sp + else: + raise ValueError('Unsupported tokenization method: {}'.format( + params.validation_data.tokenization)) + + if params.validation_data.input_path: + self._tf_record_input_path, self._eval_examples, self._eval_features = ( + self._preprocess_eval_data(params.validation_data)) + + def set_preprocessed_eval_input_path(self, eval_input_path): + """Sets the path to the preprocessed eval data.""" + self._tf_record_input_path = eval_input_path + + def build_model(self): + if self.task_config.hub_module_url and self.task_config.init_checkpoint: + raise ValueError('At most one of `hub_module_url` and ' + '`init_checkpoint` can be specified.') + if self.task_config.hub_module_url: + encoder_network = utils.get_encoder_from_hub( + self.task_config.hub_module_url) + else: + encoder_network = encoders.build_encoder(self.task_config.model.encoder) + encoder_cfg = self.task_config.model.encoder.get() + return models.BertSpanLabeler( + network=encoder_network, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range)) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + start_positions = labels['start_positions'] + end_positions = labels['end_positions'] + start_logits, end_logits = model_outputs + + start_loss = tf.keras.losses.sparse_categorical_crossentropy( + start_positions, + tf.cast(start_logits, dtype=tf.float32), + 
from_logits=True) + end_loss = tf.keras.losses.sparse_categorical_crossentropy( + end_positions, tf.cast(end_logits, dtype=tf.float32), from_logits=True) + + loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2 + return loss + + def _preprocess_eval_data(self, params): + eval_examples = self.squad_lib.read_squad_examples( + input_file=params.input_path, + is_training=False, + version_2_with_negative=params.version_2_with_negative) + + temp_file_path = params.input_preprocessed_data_path or self.logging_dir + if not temp_file_path: + raise ValueError('You must specify a temporary directory, either in ' + 'params.input_preprocessed_data_path or logging_dir to ' + 'store intermediate evaluation TFRecord data.') + eval_writer = self.squad_lib.FeatureWriter( + filename=os.path.join(temp_file_path, 'eval.tf_record'), + is_training=False) + eval_features = [] + + def _append_feature(feature, is_padding): + if not is_padding: + eval_features.append(feature) + eval_writer.process_feature(feature) + + # XLNet preprocesses SQuAD examples in a P, Q, class order whereas + # BERT preprocesses in a class, Q, P order. + xlnet_ordering = self.task_config.model.encoder.type == 'xlnet' + kwargs = dict( + examples=eval_examples, + max_seq_length=params.seq_length, + doc_stride=params.doc_stride, + max_query_length=params.query_length, + is_training=False, + output_fn=_append_feature, + batch_size=params.global_batch_size, + xlnet_format=xlnet_ordering) + + if params.tokenization == 'SentencePiece': + # squad_lib_sp requires one more argument 'do_lower_case'. + kwargs['do_lower_case'] = params.do_lower_case + kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer( + sp_model_file=params.vocab_file) + elif params.tokenization == 'WordPiece': + kwargs['tokenizer'] = tokenization.FullTokenizer( + vocab_file=params.vocab_file, do_lower_case=params.do_lower_case) + else: + raise ValueError('Unexpected tokenization: %s' % params.tokenization) + + eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs) + eval_writer.close() + + logging.info('***** Evaluation input stats *****') + logging.info(' Num orig examples = %d', len(eval_examples)) + logging.info(' Num split examples = %d', len(eval_features)) + logging.info(' Batch size = %d', params.global_batch_size) + logging.info(' Dataset size = %d', eval_dataset_size) + + return eval_writer.filename, eval_examples, eval_features + + def _dummy_data(self, params, _): + """Returns dummy data.""" + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + x = dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids) + y = dict( + start_positions=tf.constant(0, dtype=tf.int32), + end_positions=tf.constant(1, dtype=tf.int32), + is_impossible=tf.constant(0, dtype=tf.int32)) + return x, y + + def build_inputs(self, params, input_context=None): + """Returns tf.data.Dataset for sentence_prediction task.""" + if params.input_path == 'dummy': + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dummy_data = functools.partial(self._dummy_data, params) + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + if params.is_training: + dataloader_params = params + else: + input_path = self._tf_record_input_path + dataloader_params = params.replace(input_path=input_path) + + return data_loader_factory.get_data_loader(dataloader_params).load( + input_context) + + def build_metrics(self, training=None): + if not training: + # We cannot compute 
start/end_position_accuracy because start/end_position + # labels are not available in the validation dataset (b/173794928). + return [] + # TODO(lehou): a list of metrics doesn't work the same as in compile/fit. + metrics = [ + tf.keras.metrics.SparseCategoricalAccuracy( + name='start_position_accuracy'), + tf.keras.metrics.SparseCategoricalAccuracy( + name='end_position_accuracy'), + ] + return metrics + + def process_metrics(self, metrics, labels, model_outputs): + metrics = dict([(metric.name, metric) for metric in metrics]) + start_logits, end_logits = model_outputs + metrics['start_position_accuracy'].update_state(labels['start_positions'], + start_logits) + metrics['end_position_accuracy'].update_state(labels['end_positions'], + end_logits) + + def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): + start_logits, end_logits = model_outputs + compiled_metrics.update_state( + y_true=labels, # labels has keys 'start_positions' and 'end_positions'. + y_pred={ + 'start_positions': start_logits, + 'end_positions': end_logits + }) + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + features, _ = inputs + unique_ids = features.pop('unique_ids') + model_outputs = self.inference_step(features, model) + start_logits, end_logits = model_outputs + # We cannot compute validation_loss here, because start/end_position + # labels are not available in the validation dataset (b/173794928). + logs = { + 'unique_ids': unique_ids, + 'start_logits': start_logits, + 'end_logits': end_logits, + } + return logs + + def aggregate_logs(self, state=None, step_outputs=None): + assert step_outputs is not None, 'Got no logs from self.validation_step.' + if state is None: + state = [] + + for outputs in zip(step_outputs['unique_ids'], + step_outputs['start_logits'], + step_outputs['end_logits']): + numpy_values = [ + output.numpy() for output in outputs if output is not None] + + for values in zip(*numpy_values): + state.append(RawAggregatedResult( + unique_id=values[0], + start_logits=values[1], + end_logits=values[2])) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + all_predictions, _, scores_diff = ( + self.squad_lib.postprocess_output( + self._eval_examples, + self._eval_features, + aggregated_logs, + self.task_config.n_best_size, + self.task_config.max_answer_length, + self.task_config.validation_data.do_lower_case, + version_2_with_negative=( + self.task_config.validation_data.version_2_with_negative), + null_score_diff_threshold=( + self.task_config.null_score_diff_threshold), + xlnet_format=self.task_config.validation_data.xlnet_format, + verbose=False)) + + with tf.io.gfile.GFile(self.task_config.validation_data.input_path, + 'r') as reader: + dataset_json = json.load(reader) + pred_dataset = dataset_json['data'] + if self.task_config.validation_data.version_2_with_negative: + eval_metrics = squad_evaluate_v2_0.evaluate(pred_dataset, all_predictions, + scores_diff) + eval_metrics = { + 'exact_match': eval_metrics['final_exact'], + 'exact_match_threshold': eval_metrics['final_exact_thresh'], + 'final_f1': eval_metrics['final_f1'] / 100.0, # scale back to [0, 1]. 
+ 'f1_threshold': eval_metrics['final_f1_thresh'], + 'has_answer_exact_match': eval_metrics['HasAns_exact'], + 'has_answer_f1': eval_metrics['HasAns_f1'] + } + else: + eval_metrics = squad_evaluate_v1_1.evaluate(pred_dataset, all_predictions) + eval_metrics = { + 'exact_match': eval_metrics['exact_match'], + 'final_f1': eval_metrics['final_f1'] + } + return eval_metrics + + +@dataclasses.dataclass +class XLNetQuestionAnsweringConfig(QuestionAnsweringConfig): + """The config for the XLNet variation of QuestionAnswering.""" + pass + + +@task_factory.register_task_cls(XLNetQuestionAnsweringConfig) +class XLNetQuestionAnsweringTask(QuestionAnsweringTask): + """XLNet variant of the Question Answering Task. + + The main differences include: + - The encoder is an `XLNetBase` class. + - The `SpanLabeling` head is an instance of `XLNetSpanLabeling` which + predicts start/end positions and impossibility score. During inference, + it predicts the top N scores and indexes. + """ + + def build_model(self): + if self.task_config.hub_module_url and self.task_config.init_checkpoint: + raise ValueError('At most one of `hub_module_url` and ' + '`init_checkpoint` can be specified.') + if self.task_config.hub_module_url: + encoder_network = utils.get_encoder_from_hub( + self.task_config.hub_module_url) + else: + encoder_network = encoders.build_encoder(self.task_config.model.encoder) + encoder_cfg = self.task_config.model.encoder.get() + return models.XLNetSpanLabeler( + network=encoder_network, + start_n_top=self.task_config.n_best_size, + end_n_top=self.task_config.n_best_size, + initializer=tf.keras.initializers.RandomNormal( + stddev=encoder_cfg.initializer_range)) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + start_positions = labels['start_positions'] + end_positions = labels['end_positions'] + is_impossible = labels['is_impossible'] + is_impossible = tf.cast(tf.reshape(is_impossible, [-1]), tf.float32) + + start_logits = model_outputs['start_logits'] + end_logits = model_outputs['end_logits'] + class_logits = model_outputs['class_logits'] + + start_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + start_positions, start_logits) + end_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( + end_positions, end_logits) + is_impossible_loss = tf.keras.losses.binary_crossentropy( + is_impossible, class_logits, from_logits=True) + + loss = (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2 + loss += tf.reduce_mean(is_impossible_loss) / 2 + return loss + + def process_metrics(self, metrics, labels, model_outputs): + metrics = dict([(metric.name, metric) for metric in metrics]) + start_logits = model_outputs['start_logits'] + end_logits = model_outputs['end_logits'] + metrics['start_position_accuracy'].update_state(labels['start_positions'], + start_logits) + metrics['end_position_accuracy'].update_state(labels['end_positions'], + end_logits) + + def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): + start_logits = model_outputs['start_logits'] + end_logits = model_outputs['end_logits'] + compiled_metrics.update_state( + y_true=labels, # labels has keys 'start_positions' and 'end_positions'. 
+ y_pred={ + 'start_positions': start_logits, + 'end_positions': end_logits, + }) + + def _dummy_data(self, params, _): + """Returns dummy data.""" + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + zero = tf.constant(0, dtype=tf.int32) + x = dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids, + class_index=zero, + is_impossible=zero, + paragraph_mask=dummy_ids, + start_positions=tf.zeros((1), dtype=tf.int32)) + y = dict( + start_positions=tf.zeros((1), dtype=tf.int32), + end_positions=tf.ones((1), dtype=tf.int32), + is_impossible=zero) + return x, y + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + features, _ = inputs + unique_ids = features.pop('unique_ids') + model_outputs = self.inference_step(features, model) + start_top_predictions = model_outputs['start_top_predictions'] + end_top_predictions = model_outputs['end_top_predictions'] + start_indexes = model_outputs['start_top_index'] + end_indexes = model_outputs['end_top_index'] + class_logits = model_outputs['class_logits'] + + logs = { + 'unique_ids': unique_ids, + 'start_top_predictions': start_top_predictions, + 'end_top_predictions': end_top_predictions, + 'start_indexes': start_indexes, + 'end_indexes': end_indexes, + 'class_logits': class_logits, + } + return logs + + def aggregate_logs(self, state=None, step_outputs=None): + assert step_outputs is not None, 'Got no logs from self.validation_step.' + if state is None: + state = [] + + for outputs in zip(step_outputs['unique_ids'], + step_outputs['start_top_predictions'], + step_outputs['end_top_predictions'], + step_outputs['start_indexes'], + step_outputs['end_indexes'], + step_outputs['class_logits']): + numpy_values = [ + output.numpy() for output in outputs] + + for (unique_id, start_top_predictions, end_top_predictions, start_indexes, + end_indexes, class_logits) in zip(*numpy_values): + state.append(RawAggregatedResult( + unique_id=unique_id, + start_logits=start_top_predictions.tolist(), + end_logits=end_top_predictions.tolist(), + start_indexes=start_indexes.tolist(), + end_indexes=end_indexes.tolist(), + class_logits=class_logits)) + return state + + +def predict(task: QuestionAnsweringTask, params: cfg.DataConfig, + model: tf.keras.Model): + """Predicts on the input data. + + Args: + task: A `QuestionAnsweringTask` object. + params: A `cfg.DataConfig` object. + model: A keras.Model. + + Returns: + A tuple of `all_predictions`, `all_nbest` and `scores_diff`, which + are dict and can be written to json files including prediction json file, + nbest json file and null_odds json file. + """ + tf_record_input_path, eval_examples, eval_features = ( + task._preprocess_eval_data(params)) # pylint: disable=protected-access + + # `tf_record_input_path` will overwrite `params.input_path`, + # when `task.buid_inputs()` is called. 
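+  # The eval TFRecord written by `_preprocess_eval_data` above is what the
+  # non-training branch of `build_inputs` ends up reading, so prediction
+  # reuses the preprocessed features instead of the raw SQuAD json.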
+ task.set_preprocessed_eval_input_path(tf_record_input_path) + + def predict_step(inputs): + """Replicated prediction calculation.""" + return task.validation_step(inputs, model) + + dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), + task.build_inputs, params) + aggregated_outputs = utils.predict(predict_step, task.aggregate_logs, dataset) + + all_predictions, all_nbest, scores_diff = ( + task.squad_lib.postprocess_output( + eval_examples, + eval_features, + aggregated_outputs, + task.task_config.n_best_size, + task.task_config.max_answer_length, + task.task_config.validation_data.do_lower_case, + version_2_with_negative=(params.version_2_with_negative), + null_score_diff_threshold=task.task_config.null_score_diff_threshold, + xlnet_format=task.task_config.validation_data.xlnet_format, + verbose=False)) + return all_predictions, all_nbest, scores_diff diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/sentence_prediction.py b/nlp/text_classification/bert/tensorflow2.0/tasks/sentence_prediction.py new file mode 100644 index 000000000..c493bb43a --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/sentence_prediction.py @@ -0,0 +1,299 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Sentence prediction (classification) task.""" +from typing import List, Union, Optional + +from absl import logging +import dataclasses +import numpy as np +import orbit +from scipy import stats +from sklearn import metrics as sklearn_metrics +import tensorflow as tf + +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling import tf_utils +from modeling.hyperparams import base_config +from nlp_configs import encoders +from nlp.data import data_loader_factory +from nlp_modeling import models +from tasks import utils + +METRIC_TYPES = frozenset( + ['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr']) + + +@dataclasses.dataclass +class ModelConfig(base_config.Config): + """A classifier/regressor configuration.""" + num_classes: int = 0 + use_encoder_pooler: bool = False + encoder: encoders.EncoderConfig = encoders.EncoderConfig() + + +@dataclasses.dataclass +class SentencePredictionConfig(cfg.TaskConfig): + """The model config.""" + # At most one of `init_checkpoint` and `hub_module_url` can + # be specified. + init_checkpoint: str = '' + init_cls_pooler: bool = False + hub_module_url: str = '' + metric_type: str = 'accuracy' + # Defines the concrete model config at instantiation time. 
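+  # `num_classes == 1` switches the task to regression (mean squared error
+  # loss); any larger value selects sparse categorical cross-entropy
+  # classification.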
+ model: ModelConfig = ModelConfig() + train_data: cfg.DataConfig = cfg.DataConfig() + validation_data: cfg.DataConfig = cfg.DataConfig() + + +@task_factory.register_task_cls(SentencePredictionConfig) +class SentencePredictionTask(base_task.Task): + """Task object for sentence_prediction.""" + + def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None): + super().__init__(params, logging_dir, name=name) + if params.metric_type not in METRIC_TYPES: + raise ValueError('Invalid metric_type: {}'.format(params.metric_type)) + self.metric_type = params.metric_type + if hasattr(params.train_data, 'label_field'): + self.label_field = params.train_data.label_field + else: + self.label_field = 'label_ids' + + def build_model(self): + if self.task_config.hub_module_url and self.task_config.init_checkpoint: + raise ValueError('At most one of `hub_module_url` and ' + '`init_checkpoint` can be specified.') + if self.task_config.hub_module_url: + encoder_network = utils.get_encoder_from_hub( + self.task_config.hub_module_url) + else: + encoder_network = encoders.build_encoder(self.task_config.model.encoder) + encoder_cfg = self.task_config.model.encoder.get() + if self.task_config.model.encoder.type == 'xlnet': + return models.XLNetClassifier( + network=encoder_network, + num_classes=self.task_config.model.num_classes, + initializer=tf.keras.initializers.RandomNormal( + stddev=encoder_cfg.initializer_range)) + else: + return models.BertClassifier( + network=encoder_network, + num_classes=self.task_config.model.num_classes, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=encoder_cfg.initializer_range), + use_encoder_pooler=self.task_config.model.use_encoder_pooler) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + label_ids = labels[self.label_field] + if self.task_config.model.num_classes == 1: + loss = tf.keras.losses.mean_squared_error(label_ids, model_outputs) + else: + loss = tf.keras.losses.sparse_categorical_crossentropy( + label_ids, tf.cast(model_outputs, tf.float32), from_logits=True) + + if aux_losses: + loss += tf.add_n(aux_losses) + return tf_utils.safe_mean(loss) + + def build_inputs(self, params, input_context=None): + """Returns tf.data.Dataset for sentence_prediction task.""" + if params.input_path == 'dummy': + + def dummy_data(_): + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + x = dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids) + + if self.task_config.model.num_classes == 1: + y = tf.zeros((1,), dtype=tf.float32) + else: + y = tf.zeros((1, 1), dtype=tf.int32) + x[self.label_field] = y + return x + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + return data_loader_factory.get_data_loader(params).load(input_context) + + def build_metrics(self, training=None): + del training + if self.task_config.model.num_classes == 1: + metrics = [tf.keras.metrics.MeanSquaredError()] + else: + metrics = [ + tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy') + ] + return metrics + + def process_metrics(self, metrics, labels, model_outputs): + for metric in metrics: + metric.update_state(labels[self.label_field], model_outputs) + + def process_compiled_metrics(self, compiled_metrics, labels, model_outputs): + compiled_metrics.update_state(labels[self.label_field], model_outputs) + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): 
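+    # 'accuracy' is handled by the default validation loop with Keras metrics;
+    # the correlation/matthews metrics below need the raw predictions, which
+    # are collected in aggregate_logs and reduced in reduce_aggregated_logs.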
+ if self.metric_type == 'accuracy': + return super(SentencePredictionTask, + self).validation_step(inputs, model, metrics) + features, labels = inputs, inputs + outputs = self.inference_step(features, model) + loss = self.build_losses( + labels=labels, model_outputs=outputs, aux_losses=model.losses) + logs = {self.loss: loss} + if self.metric_type == 'matthews_corrcoef': + logs.update({ + 'sentence_prediction': # Ensure one prediction along batch dimension. + tf.expand_dims(tf.math.argmax(outputs, axis=1), axis=1), + 'labels': + labels[self.label_field], + }) + if self.metric_type == 'pearson_spearman_corr': + logs.update({ + 'sentence_prediction': outputs, + 'labels': labels[self.label_field], + }) + return logs + + def aggregate_logs(self, state=None, step_outputs=None): + if self.metric_type == 'accuracy': + return None + if state is None: + state = {'sentence_prediction': [], 'labels': []} + state['sentence_prediction'].append( + np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']], + axis=0)) + state['labels'].append( + np.concatenate([v.numpy() for v in step_outputs['labels']], axis=0)) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + if self.metric_type == 'accuracy': + return None + elif self.metric_type == 'matthews_corrcoef': + preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0) + preds = np.reshape(preds, -1) + labels = np.concatenate(aggregated_logs['labels'], axis=0) + labels = np.reshape(labels, -1) + return { + self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels) + } + elif self.metric_type == 'pearson_spearman_corr': + preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0) + preds = np.reshape(preds, -1) + labels = np.concatenate(aggregated_logs['labels'], axis=0) + labels = np.reshape(labels, -1) + pearson_corr = stats.pearsonr(preds, labels)[0] + spearman_corr = stats.spearmanr(preds, labels)[0] + corr_metric = (pearson_corr + spearman_corr) / 2 + return {self.metric_type: corr_metric} + + def initialize(self, model): + """Load a pretrained checkpoint (if exists) and then train from iter 0.""" + ckpt_dir_or_file = self.task_config.init_checkpoint + if not ckpt_dir_or_file: + return + if tf.io.gfile.isdir(ckpt_dir_or_file): + ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file) + + pretrain2finetune_mapping = { + 'encoder': model.checkpoint_items['encoder'], + } + if self.task_config.init_cls_pooler: + # This option is valid when use_encoder_pooler is false. + pretrain2finetune_mapping[ + 'next_sentence.pooler_dense'] = model.checkpoint_items[ + 'sentence_prediction.pooler_dense'] + ckpt = tf.train.Checkpoint(**pretrain2finetune_mapping) + status = ckpt.read(ckpt_dir_or_file) + status.expect_partial().assert_existing_objects_matched() + logging.info('Finished loading pretrained checkpoint from %s', + ckpt_dir_or_file) + + +def predict(task: SentencePredictionTask, + params: cfg.DataConfig, + model: tf.keras.Model, + params_aug: Optional[cfg.DataConfig] = None, + test_time_aug_wgt: float = 0.3) -> List[Union[int, float]]: + """Predicts on the input data. + + Args: + task: A `SentencePredictionTask` object. + params: A `cfg.DataConfig` object. + model: A keras.Model. + params_aug: A `cfg.DataConfig` object for augmented data. + test_time_aug_wgt: Test time augmentation weight. The prediction score will + use (1. - test_time_aug_wgt) original prediction plus test_time_aug_wgt + augmented prediction. 
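+
+  Example (an illustrative sketch only; `task_config` is an assumed,
+  caller-built `SentencePredictionConfig` whose validation data provides an
+  `example_id` feature):
+    task = SentencePredictionTask(params=task_config)
+    model = task.build_model()
+    predictions = predict(task, task_config.validation_data, model)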
+ + Returns: + A list of predictions with length of `num_examples`. For regression task, + each element in the list is the predicted score; for classification task, + each element is the predicted class id. + """ + + def predict_step(inputs): + """Replicated prediction calculation.""" + x = inputs + example_id = x.pop('example_id') + outputs = task.inference_step(x, model) + return dict(example_id=example_id, predictions=outputs) + + def aggregate_fn(state, outputs): + """Concatenates model's outputs.""" + if state is None: + state = [] + + for per_replica_example_id, per_replica_batch_predictions in zip( + outputs['example_id'], outputs['predictions']): + state.extend(zip(per_replica_example_id, per_replica_batch_predictions)) + return state + + dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), + task.build_inputs, params) + outputs = utils.predict(predict_step, aggregate_fn, dataset) + + # When running on TPU POD, the order of output cannot be maintained, + # so we need to sort by example_id. + outputs = sorted(outputs, key=lambda x: x[0]) + is_regression = task.task_config.model.num_classes == 1 + if params_aug is not None: + dataset_aug = orbit.utils.make_distributed_dataset( + tf.distribute.get_strategy(), task.build_inputs, params_aug) + outputs_aug = utils.predict(predict_step, aggregate_fn, dataset_aug) + outputs_aug = sorted(outputs_aug, key=lambda x: x[0]) + if is_regression: + return [(1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1] + for x, y in zip(outputs, outputs_aug)] + else: + return [ + tf.argmax( + (1. - test_time_aug_wgt) * x[1] + test_time_aug_wgt * y[1], + axis=-1) for x, y in zip(outputs, outputs_aug) + ] + if is_regression: + return [x[1] for x in outputs] + else: + return [tf.argmax(x[1], axis=-1) for x in outputs] diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/tagging.py b/nlp/text_classification/bert/tensorflow2.0/tasks/tagging.py new file mode 100644 index 000000000..39e68f6fe --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/tagging.py @@ -0,0 +1,265 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+"""Tagging (e.g., NER/POS) task."""
+from typing import List, Optional, Tuple
+
+import dataclasses
+import orbit
+
+from seqeval import metrics as seqeval_metrics
+
+import tensorflow as tf
+
+from core import base_task
+from core import config_definitions as cfg
+from core import task_factory
+from modeling.hyperparams import base_config
+from nlp_configs import encoders
+from nlp.data import data_loader_factory
+from nlp_modeling import models
+from tasks import utils
+
+
+@dataclasses.dataclass
+class ModelConfig(base_config.Config):
+  """A base span labeler configuration."""
+  encoder: encoders.EncoderConfig = encoders.EncoderConfig()
+  head_dropout: float = 0.1
+  head_initializer_range: float = 0.02
+
+
+@dataclasses.dataclass
+class TaggingConfig(cfg.TaskConfig):
+  """The model config."""
+  # At most one of `init_checkpoint` and `hub_module_url` can be specified.
+  init_checkpoint: str = ''
+  hub_module_url: str = ''
+  model: ModelConfig = ModelConfig()
+
+  # The real class names, the order of which should match real label id.
+  # Note that a word may be tokenized into multiple word-piece tokens, and
+  # we assume the real label id (non-negative) is assigned to the first token
+  # of the word, and a negative label id is assigned to the remaining tokens.
+  # The negative label id will not contribute to loss and metrics.
+  class_names: Optional[List[str]] = None
+  train_data: cfg.DataConfig = cfg.DataConfig()
+  validation_data: cfg.DataConfig = cfg.DataConfig()
+
+
+def _masked_labels_and_weights(y_true):
+  """Masks negative values from token level labels.
+
+  Args:
+    y_true: Token labels, typically shape (batch_size, seq_len), where tokens
+      with negative labels should be ignored during loss/accuracy calculation.
+
+  Returns:
+    (masked_y_true, masked_weights) where `masked_y_true` is the input
+    with each negative label replaced with zero and `masked_weights` is 0.0
+    where negative labels were replaced and 1.0 for original labels.
+  """
+  # Ignore the classes of tokens with negative values.
+  mask = tf.greater_equal(y_true, 0)
+  # Replace negative labels, which are out of bounds for some loss functions,
+  # with zero.
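+  # For example (illustrative): y_true=[[2, -1, 0]] yields
+  # masked_y_true=[[2, 0, 0]] and weights [[1., 0., 1.]].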
+ masked_y_true = tf.where(mask, y_true, 0) + return masked_y_true, tf.cast(mask, tf.float32) + + +@task_factory.register_task_cls(TaggingConfig) +class TaggingTask(base_task.Task): + """Task object for tagging (e.g., NER or POS).""" + + def build_model(self): + if self.task_config.hub_module_url and self.task_config.init_checkpoint: + raise ValueError('At most one of `hub_module_url` and ' + '`init_checkpoint` can be specified.') + if self.task_config.hub_module_url: + encoder_network = utils.get_encoder_from_hub( + self.task_config.hub_module_url) + else: + encoder_network = encoders.build_encoder(self.task_config.model.encoder) + + return models.BertTokenClassifier( + network=encoder_network, + num_classes=len(self.task_config.class_names), + initializer=tf.keras.initializers.TruncatedNormal( + stddev=self.task_config.model.head_initializer_range), + dropout_rate=self.task_config.model.head_dropout, + output='logits', + output_encoder_outputs=True) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + logits = tf.cast(model_outputs['logits'], tf.float32) + masked_labels, masked_weights = _masked_labels_and_weights(labels) + loss = tf.keras.losses.sparse_categorical_crossentropy( + masked_labels, logits, from_logits=True) + numerator_loss = tf.reduce_sum(loss * masked_weights) + denominator_loss = tf.reduce_sum(masked_weights) + loss = tf.math.divide_no_nan(numerator_loss, denominator_loss) + return loss + + def build_inputs(self, params: cfg.DataConfig, input_context=None): + """Returns tf.data.Dataset for sentence_prediction task.""" + if params.input_path == 'dummy': + + def dummy_data(_): + dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32) + x = dict( + input_word_ids=dummy_ids, + input_mask=dummy_ids, + input_type_ids=dummy_ids) + + # Include some label_id as -1, which will be ignored in loss/metrics. + y = tf.random.uniform( + shape=(1, params.seq_length), + minval=-1, + maxval=len(self.task_config.class_names), + dtype=tf.dtypes.int32) + return (x, y) + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset + + return data_loader_factory.get_data_loader(params).load(input_context) + + def inference_step(self, inputs, model: tf.keras.Model): + """Performs the forward step.""" + logits = model(inputs, training=False)['logits'] + return {'logits': logits, + 'predict_ids': tf.argmax(logits, axis=-1, output_type=tf.int32)} + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + """Validatation step. + + Args: + inputs: a dictionary of input tensors. + model: the keras.Model. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + features, labels = inputs + outputs = self.inference_step(features, model) + loss = self.build_losses(labels=labels, model_outputs=outputs) + + # Negative label ids are padding labels which should be ignored. 
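+    # `tf.where` on the boolean mask returns the (batch, position) indices of
+    # real labels; `gather_nd` then flattens predictions and labels to just
+    # those tokens.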
+ real_label_index = tf.where(tf.greater_equal(labels, 0)) + predict_ids = tf.gather_nd(outputs['predict_ids'], real_label_index) + label_ids = tf.gather_nd(labels, real_label_index) + return { + self.loss: loss, + 'predict_ids': predict_ids, + 'label_ids': label_ids, + } + + def aggregate_logs(self, state=None, step_outputs=None): + """Aggregates over logs returned from a validation step.""" + if state is None: + state = {'predict_class': [], 'label_class': []} + + def id_to_class_name(batched_ids): + class_names = [] + for per_example_ids in batched_ids: + class_names.append([]) + for per_token_id in per_example_ids.numpy().tolist(): + class_names[-1].append(self.task_config.class_names[per_token_id]) + + return class_names + + # Convert id to class names, because `seqeval_metrics` relies on the class + # name to decide IOB tags. + state['predict_class'].extend(id_to_class_name(step_outputs['predict_ids'])) + state['label_class'].extend(id_to_class_name(step_outputs['label_ids'])) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + """Reduces aggregated logs over validation steps.""" + label_class = aggregated_logs['label_class'] + predict_class = aggregated_logs['predict_class'] + return { + 'f1': + seqeval_metrics.f1_score(label_class, predict_class), + 'precision': + seqeval_metrics.precision_score(label_class, predict_class), + 'recall': + seqeval_metrics.recall_score(label_class, predict_class), + 'accuracy': + seqeval_metrics.accuracy_score(label_class, predict_class), + } + + +def predict(task: TaggingTask, + params: cfg.DataConfig, + model: tf.keras.Model) -> List[Tuple[int, int, List[int]]]: + """Predicts on the input data. + + Args: + task: A `TaggingTask` object. + params: A `cfg.DataConfig` object. + model: A keras.Model. + + Returns: + A list of tuple. Each tuple contains `sentence_id`, `sub_sentence_id` and + a list of predicted ids. + """ + + def predict_step(inputs): + """Replicated prediction calculation.""" + x, y = inputs + sentence_ids = x.pop('sentence_id') + sub_sentence_ids = x.pop('sub_sentence_id') + outputs = task.inference_step(x, model) + predict_ids = outputs['predict_ids'] + label_mask = tf.greater_equal(y, 0) + return dict( + predict_ids=predict_ids, + label_mask=label_mask, + sentence_ids=sentence_ids, + sub_sentence_ids=sub_sentence_ids) + + def aggregate_fn(state, outputs): + """Concatenates model's outputs.""" + if state is None: + state = [] + + for (batch_predict_ids, batch_label_mask, batch_sentence_ids, + batch_sub_sentence_ids) in zip(outputs['predict_ids'], + outputs['label_mask'], + outputs['sentence_ids'], + outputs['sub_sentence_ids']): + for (tmp_predict_ids, tmp_label_mask, tmp_sentence_id, + tmp_sub_sentence_id) in zip(batch_predict_ids.numpy(), + batch_label_mask.numpy(), + batch_sentence_ids.numpy(), + batch_sub_sentence_ids.numpy()): + real_predict_ids = [] + assert len(tmp_predict_ids) == len(tmp_label_mask) + for i in range(len(tmp_predict_ids)): + # Skip the padding label. 
+ if tmp_label_mask[i]: + real_predict_ids.append(tmp_predict_ids[i]) + state.append((tmp_sentence_id, tmp_sub_sentence_id, real_predict_ids)) + + return state + + dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(), + task.build_inputs, params) + outputs = utils.predict(predict_step, aggregate_fn, dataset) + return sorted(outputs, key=lambda x: (x[0], x[1])) diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/translation.py b/nlp/text_classification/bert/tensorflow2.0/tasks/translation.py new file mode 100644 index 000000000..9ab3311f9 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/translation.py @@ -0,0 +1,367 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Defines the translation task.""" +import os +from typing import Optional + +from absl import logging +import dataclasses +import sacrebleu +import tensorflow as tf +import tensorflow_text as tftxt + +from core import base_task +from core import config_definitions as cfg +from core import task_factory +from modeling.hyperparams import base_config +from nlp.data import data_loader_factory +from nlp.metrics import bleu +from nlp_modeling import models + + +def _pad_tensors_to_same_length(x, y): + """Pad x and y so that the results have the same length (second dimension).""" + x_length = tf.shape(x)[1] + y_length = tf.shape(y)[1] + + max_length = tf.maximum(x_length, y_length) + + x = tf.pad(x, [[0, 0], [0, max_length - x_length], [0, 0]]) + y = tf.pad(y, [[0, 0], [0, max_length - y_length]]) + return x, y + + +def _padded_cross_entropy_loss(logits, labels, smoothing, vocab_size): + """Calculate cross entropy loss while ignoring padding. + + Args: + logits: Tensor of size [batch_size, length_logits, vocab_size] + labels: Tensor of size [batch_size, length_labels] + smoothing: Label smoothing constant, used to determine the on and off values + vocab_size: int size of the vocabulary + + Returns: + Returns the cross entropy loss and weight tensors: float32 tensors with + shape [batch_size, max(length_logits, length_labels)] + """ + logits, labels = _pad_tensors_to_same_length(logits, labels) + + # Calculate smoothing cross entropy + confidence = 1.0 - smoothing + low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32) + soft_targets = tf.one_hot( + tf.cast(labels, tf.int32), + depth=vocab_size, + on_value=confidence, + off_value=low_confidence) + xentropy = tf.nn.softmax_cross_entropy_with_logits( + logits=logits, labels=soft_targets) + + # Calculate the best (lowest) possible value of cross entropy, and + # subtract from the cross entropy loss. 
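+  # With label smoothing even a perfect prediction has nonzero cross entropy,
+  # so the minimum attainable value is subtracted to make a perfect
+  # prediction score approximately zero.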
+ normalizing_constant = -( + confidence * tf.math.log(confidence) + tf.cast(vocab_size - 1, tf.float32) + * low_confidence * tf.math.log(low_confidence + 1e-20)) + xentropy -= normalizing_constant + + weights = tf.cast(tf.not_equal(labels, 0), tf.float32) + return xentropy * weights, weights + + +@dataclasses.dataclass +class EncDecoder(base_config.Config): + """Configurations for Encoder/Decoder.""" + num_layers: int = 6 + num_attention_heads: int = 8 + intermediate_size: int = 2048 + activation: str = "relu" + dropout_rate: float = 0.1 + attention_dropout_rate: float = 0.1 + intermediate_dropout: float = 0.1 + use_bias: bool = False + norm_first: bool = True + norm_epsilon: float = 1e-6 + + +@dataclasses.dataclass +class ModelConfig(base_config.Config): + """A base Seq2Seq model configuration.""" + encoder: EncDecoder = EncDecoder() + decoder: EncDecoder = EncDecoder() + + embedding_width: int = 512 + dropout_rate: float = 0.1 + + # Decoding. + padded_decode: bool = False + decode_max_length: Optional[int] = None + beam_size: int = 4 + alpha: float = 0.6 + + # Training. + label_smoothing: float = 0.1 + + +@dataclasses.dataclass +class TranslationConfig(cfg.TaskConfig): + """The translation task config.""" + model: ModelConfig = ModelConfig() + train_data: cfg.DataConfig = cfg.DataConfig() + validation_data: cfg.DataConfig = cfg.DataConfig() + # Tokenization + sentencepiece_model_path: str = "" + # Evaluation. + print_translations: Optional[bool] = None + + +def write_test_record(params, model_dir): + """Writes the test input to a tfrecord.""" + # Get raw data from tfds. + params = params.replace(transform_and_batch=False) + dataset = data_loader_factory.get_data_loader(params).load() + references = [] + total_samples = 0 + output_file = os.path.join(model_dir, "eval.tf_record") + writer = tf.io.TFRecordWriter(output_file) + for d in dataset: + references.append(d[params.tgt_lang].numpy().decode()) + example = tf.train.Example( + features=tf.train.Features( + feature={ + "unique_id": tf.train.Feature( + int64_list=tf.train.Int64List(value=[total_samples])), + params.src_lang: tf.train.Feature( + bytes_list=tf.train.BytesList( + value=[d[params.src_lang].numpy()])), + params.tgt_lang: tf.train.Feature( + bytes_list=tf.train.BytesList( + value=[d[params.tgt_lang].numpy()])), + })) + writer.write(example.SerializeToString()) + total_samples += 1 + batch_size = params.global_batch_size + num_dummy_example = batch_size - total_samples % batch_size + for i in range(num_dummy_example): + example = tf.train.Example( + features=tf.train.Features( + feature={ + "unique_id": tf.train.Feature( + int64_list=tf.train.Int64List(value=[total_samples + i])), + params.src_lang: tf.train.Feature( + bytes_list=tf.train.BytesList(value=[b""])), + params.tgt_lang: tf.train.Feature( + bytes_list=tf.train.BytesList(value=[b""])), + })) + writer.write(example.SerializeToString()) + writer.close() + return references, output_file + + +@task_factory.register_task_cls(TranslationConfig) +class TranslationTask(base_task.Task): + """A single-replica view of training procedure. + + Tasks provide artifacts for training/evalution procedures, including + loading/iterating over Datasets, initializing the model, calculating the loss + and customized metrics with reduction. 
+ """ + + def __init__(self, params: cfg.TaskConfig, logging_dir=None, name=None): + super().__init__(params, logging_dir, name=name) + self._sentencepiece_model_path = params.sentencepiece_model_path + if params.sentencepiece_model_path: + self._sp_tokenizer = tftxt.SentencepieceTokenizer( + model=tf.io.gfile.GFile(params.sentencepiece_model_path, "rb").read(), + add_eos=True) + try: + empty_str_tokenized = self._sp_tokenizer.tokenize("").numpy() + except tf.errors.InternalError: + raise ValueError( + "EOS token not in tokenizer vocab." + "Please make sure the tokenizer generates a single token for an " + "empty string.") + self._eos_id = empty_str_tokenized.item() + self._vocab_size = self._sp_tokenizer.vocab_size().numpy() + else: + raise ValueError("Setencepiece model path not provided.") + if (params.validation_data.input_path or + params.validation_data.tfds_name) and self._logging_dir: + self._references, self._tf_record_input_path = write_test_record( + params.validation_data, self.logging_dir) + + def build_model(self) -> tf.keras.Model: + """Creates model architecture. + + Returns: + A model instance. + """ + model_cfg = self.task_config.model + encoder_kwargs = model_cfg.encoder.as_dict() + encoder_layer = models.TransformerEncoder(**encoder_kwargs) + decoder_kwargs = model_cfg.decoder.as_dict() + decoder_layer = models.TransformerDecoder(**decoder_kwargs) + + return models.Seq2SeqTransformer( + vocab_size=self._vocab_size, + embedding_width=model_cfg.embedding_width, + dropout_rate=model_cfg.dropout_rate, + padded_decode=model_cfg.padded_decode, + decode_max_length=model_cfg.decode_max_length, + beam_size=model_cfg.beam_size, + alpha=model_cfg.alpha, + encoder_layer=encoder_layer, + decoder_layer=decoder_layer, + eos_id=self._eos_id) + + def build_inputs(self, + params: cfg.DataConfig, + input_context: Optional[tf.distribute.InputContext] = None): + """Returns a dataset.""" + if params.is_training: + dataloader_params = params + else: + input_path = self._tf_record_input_path + # Read from padded tf records instead. + dataloader_params = params.replace( + input_path=input_path, + tfds_name="", + tfds_split="", + has_unique_id=True) + dataloader_params = dataloader_params.replace( + sentencepiece_model_path=self._sentencepiece_model_path) + return data_loader_factory.get_data_loader(dataloader_params).load( + input_context) + + def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor: + """Standard interface to compute losses. + + Args: + labels: optional label tensors. + model_outputs: a nested structure of output tensors. + aux_losses: auxiliary loss tensors, i.e. `losses` in keras.Model. + + Returns: + The total loss tensor. + """ + del aux_losses + + smoothing = self.task_config.model.label_smoothing + xentropy, weights = _padded_cross_entropy_loss(model_outputs, labels, + smoothing, self._vocab_size) + return tf.reduce_sum(xentropy) / tf.reduce_sum(weights) + + def train_step(self, + inputs, + model: tf.keras.Model, + optimizer: tf.keras.optimizers.Optimizer, + metrics=None): + """Does forward and backward. + + With distribution strategies, this method runs on devices. + + Args: + inputs: a dictionary of input tensors. + model: the model, forward pass definition. + optimizer: the optimizer for this training step. + metrics: a nested structure of metrics objects. + + Returns: + A dictionary of logs. + """ + with tf.GradientTape() as tape: + outputs = model(inputs, training=True) + # Computes per-replica loss. 
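+      # With `targets` present the model runs in teacher-forcing mode and
+      # returns per-token logits over the vocabulary, which the smoothed
+      # cross-entropy below compares against the padded targets.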
+ loss = self.build_losses(labels=inputs["targets"], model_outputs=outputs) + # Scales loss as the default gradients allreduce performs sum inside the + # optimizer. + scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync + + # For mixed precision, when a LossScaleOptimizer is used, the loss is + # scaled to avoid numeric underflow. + if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): + scaled_loss = optimizer.get_scaled_loss(scaled_loss) + + tvars = model.trainable_variables + grads = tape.gradient(scaled_loss, tvars) + + if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer): + grads = optimizer.get_unscaled_gradients(grads) + optimizer.apply_gradients(list(zip(grads, tvars))) + logs = {self.loss: loss} + if metrics: + self.process_metrics(metrics, inputs["targets"], outputs) + return logs + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + unique_ids = inputs.pop("unique_id") + # Validation loss + outputs = model(inputs, training=False) + # Computes per-replica loss to help understand if we are overfitting. + loss = self.build_losses(labels=inputs["targets"], model_outputs=outputs) + inputs.pop("targets") + # Beam search to calculate metrics. + model_outputs = model(inputs, training=False) + outputs = model_outputs + logs = { + self.loss: loss, + "inputs": inputs["inputs"], + "unique_ids": unique_ids, + } + logs.update(outputs) + return logs + + def aggregate_logs(self, state=None, step_outputs=None): + """Aggregates over logs returned from a validation step.""" + if state is None: + state = {} + + for in_token_ids, out_token_ids, unique_ids in zip( + step_outputs["inputs"], + step_outputs["outputs"], + step_outputs["unique_ids"]): + for in_ids, out_ids, u_id in zip( + in_token_ids.numpy(), out_token_ids.numpy(), unique_ids.numpy()): + state[u_id] = (in_ids, out_ids) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + + def _decode(ids): + return self._sp_tokenizer.detokenize(ids).numpy().decode() + + def _trim_and_decode(ids): + """Trim EOS and PAD tokens from ids, and decode to return a string.""" + try: + index = list(ids).index(self._eos_id) + return _decode(ids[:index]) + except ValueError: # No EOS found in sequence + return _decode(ids) + + translations = [] + for u_id in sorted(aggregated_logs): + if u_id >= len(self._references): + continue + src = _trim_and_decode(aggregated_logs[u_id][0]) + translation = _trim_and_decode(aggregated_logs[u_id][1]) + translations.append(translation) + if self.task_config.print_translations: + # Deccoding the in_ids to reflect what the model sees. + logging.info("Translating:\n\tInput: %s\n\tOutput: %s\n\tReference: %s", + src, translation, self._references[u_id]) + sacrebleu_score = sacrebleu.corpus_bleu( + translations, [self._references]).score + bleu_score = bleu.bleu_on_list(self._references, translations) + return {"sacrebleu_score": sacrebleu_score, + "bleu_score": bleu_score} diff --git a/nlp/text_classification/bert/tensorflow2.0/tasks/utils.py b/nlp/text_classification/bert/tensorflow2.0/tasks/utils.py new file mode 100644 index 000000000..35be4e3d4 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tasks/utils.py @@ -0,0 +1,76 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common utils for tasks.""" +from typing import Any, Callable + +import orbit +import tensorflow as tf +import tensorflow_hub as hub + + +def get_encoder_from_hub(hub_model_path: str) -> tf.keras.Model: + """Gets an encoder from hub. + + Args: + hub_model_path: The path to the tfhub model. + + Returns: + A tf.keras.Model. + """ + input_word_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_word_ids') + input_mask = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_mask') + input_type_ids = tf.keras.layers.Input( + shape=(None,), dtype=tf.int32, name='input_type_ids') + hub_layer = hub.KerasLayer(hub_model_path, trainable=True) + output_dict = {} + dict_input = dict( + input_word_ids=input_word_ids, + input_mask=input_mask, + input_type_ids=input_type_ids) + output_dict = hub_layer(dict_input) + + return tf.keras.Model(inputs=dict_input, outputs=output_dict) + + +def predict(predict_step_fn: Callable[[Any], Any], + aggregate_fn: Callable[[Any, Any], Any], dataset: tf.data.Dataset): + """Runs prediction. + + Args: + predict_step_fn: A callable such as `def predict_step(inputs)`, where + `inputs` are input tensors. + aggregate_fn: A callable such as `def aggregate_fn(state, value)`, where + `value` is the outputs from `predict_step_fn`. + dataset: A `tf.data.Dataset` object. + + Returns: + The aggregated predictions. + """ + + @tf.function + def predict_step(iterator): + """Predicts on distributed devices.""" + outputs = tf.distribute.get_strategy().run( + predict_step_fn, args=(next(iterator),)) + return tf.nest.map_structure( + tf.distribute.get_strategy().experimental_local_results, outputs) + + loop_fn = orbit.utils.create_loop_fn(predict_step) + # Set `num_steps` to -1 to exhaust the dataset. + outputs = loop_fn( + iter(dataset), num_steps=-1, state=None, reduce_fn=aggregate_fn) # pytype: disable=wrong-arg-types + return outputs diff --git a/nlp/text_classification/bert/tensorflow2.0/tf1_checkpoint_converter_lib.py b/nlp/text_classification/bert/tensorflow2.0/tf1_checkpoint_converter_lib.py new file mode 100644 index 000000000..035a69438 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tf1_checkpoint_converter_lib.py @@ -0,0 +1,201 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Convert checkpoints created by Estimator (tf1) to be Keras compatible.""" + +import numpy as np +import tensorflow.compat.v1 as tf # TF 1.x + +# Mapping between old <=> new names. The source pattern in original variable +# name will be replaced by destination pattern. 
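+# Each rule below is a plain substring substitution; the rules are applied in
+# order, so a single variable name may be rewritten by several of them.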
+BERT_NAME_REPLACEMENTS = ( + ("bert", "bert_model"), + ("embeddings/word_embeddings", "word_embeddings/embeddings"), + ("embeddings/token_type_embeddings", + "embedding_postprocessor/type_embeddings"), + ("embeddings/position_embeddings", + "embedding_postprocessor/position_embeddings"), + ("embeddings/LayerNorm", "embedding_postprocessor/layer_norm"), + ("attention/self", "self_attention"), + ("attention/output/dense", "self_attention_output"), + ("attention/output/LayerNorm", "self_attention_layer_norm"), + ("intermediate/dense", "intermediate"), + ("output/dense", "output"), + ("output/LayerNorm", "output_layer_norm"), + ("pooler/dense", "pooler_transform"), +) + +BERT_V2_NAME_REPLACEMENTS = ( + ("bert/", ""), + ("encoder", "transformer"), + ("embeddings/word_embeddings", "word_embeddings/embeddings"), + ("embeddings/token_type_embeddings", "type_embeddings/embeddings"), + ("embeddings/position_embeddings", "position_embedding/embeddings"), + ("embeddings/LayerNorm", "embeddings/layer_norm"), + ("attention/self", "self_attention"), + ("attention/output/dense", "self_attention/attention_output"), + ("attention/output/LayerNorm", "self_attention_layer_norm"), + ("intermediate/dense", "intermediate"), + ("output/dense", "output"), + ("output/LayerNorm", "output_layer_norm"), + ("pooler/dense", "pooler_transform"), + ("cls/predictions", "bert/cls/predictions"), + ("cls/predictions/output_bias", "cls/predictions/output_bias/bias"), + ("cls/seq_relationship/output_bias", "predictions/transform/logits/bias"), + ("cls/seq_relationship/output_weights", + "predictions/transform/logits/kernel"), +) + +BERT_PERMUTATIONS = () + +BERT_V2_PERMUTATIONS = (("cls/seq_relationship/output_weights", (1, 0)),) + + +def _bert_name_replacement(var_name, name_replacements): + """Gets the variable name replacement.""" + for src_pattern, tgt_pattern in name_replacements: + if src_pattern in var_name: + old_var_name = var_name + var_name = var_name.replace(src_pattern, tgt_pattern) + tf.logging.info("Converted: %s --> %s", old_var_name, var_name) + return var_name + + +def _has_exclude_patterns(name, exclude_patterns): + """Checks if a string contains substrings that match patterns to exclude.""" + for p in exclude_patterns: + if p in name: + return True + return False + + +def _get_permutation(name, permutations): + """Checks whether a variable requires transposition by pattern matching.""" + for src_pattern, permutation in permutations: + if src_pattern in name: + tf.logging.info("Permuted: %s --> %s", name, permutation) + return permutation + + return None + + +def _get_new_shape(name, shape, num_heads): + """Checks whether a variable requires reshape by pattern matching.""" + if "self_attention/attention_output/kernel" in name: + return tuple([num_heads, shape[0] // num_heads, shape[1]]) + if "self_attention/attention_output/bias" in name: + return shape + + patterns = [ + "self_attention/query", "self_attention/value", "self_attention/key" + ] + for pattern in patterns: + if pattern in name: + if "kernel" in name: + return tuple([shape[0], num_heads, shape[1] // num_heads]) + if "bias" in name: + return tuple([num_heads, shape[0] // num_heads]) + return None + + +def create_v2_checkpoint(model, + src_checkpoint, + output_path, + checkpoint_model_name="model"): + """Converts a name-based matched TF V1 checkpoint to TF V2 checkpoint.""" + # Uses streaming-restore in eager model to read V1 name-based checkpoints. 
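+  # `assert_existing_objects_matched` fails fast if some of the model's
+  # variables were not found in the converted V1 checkpoint (for example,
+  # because a name replacement rule was missed).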
+ model.load_weights(src_checkpoint).assert_existing_objects_matched() + if hasattr(model, "checkpoint_items"): + checkpoint_items = model.checkpoint_items + else: + checkpoint_items = {} + + checkpoint_items[checkpoint_model_name] = model + checkpoint = tf.train.Checkpoint(**checkpoint_items) + checkpoint.save(output_path) + + +def convert(checkpoint_from_path, + checkpoint_to_path, + num_heads, + name_replacements, + permutations, + exclude_patterns=None): + """Migrates the names of variables within a checkpoint. + + Args: + checkpoint_from_path: Path to source checkpoint to be read in. + checkpoint_to_path: Path to checkpoint to be written out. + num_heads: The number of heads of the model. + name_replacements: A list of tuples of the form (match_str, replace_str) + describing variable names to adjust. + permutations: A list of tuples of the form (match_str, permutation) + describing permutations to apply to given variables. Note that match_str + should match the original variable name, not the replaced one. + exclude_patterns: A list of string patterns to exclude variables from + checkpoint conversion. + + Returns: + A dictionary that maps the new variable names to the Variable objects. + A dictionary that maps the old variable names to the new variable names. + """ + with tf.Graph().as_default(): + tf.logging.info("Reading checkpoint_from_path %s", checkpoint_from_path) + reader = tf.train.NewCheckpointReader(checkpoint_from_path) + name_shape_map = reader.get_variable_to_shape_map() + new_variable_map = {} + conversion_map = {} + for var_name in name_shape_map: + if exclude_patterns and _has_exclude_patterns(var_name, exclude_patterns): + continue + # Get the original tensor data. + tensor = reader.get_tensor(var_name) + + # Look up the new variable name, if any. + new_var_name = _bert_name_replacement(var_name, name_replacements) + + # See if we need to reshape the underlying tensor. + new_shape = None + if num_heads > 0: + new_shape = _get_new_shape(new_var_name, tensor.shape, num_heads) + if new_shape: + tf.logging.info("Veriable %s has a shape change from %s to %s", + var_name, tensor.shape, new_shape) + tensor = np.reshape(tensor, new_shape) + + # See if we need to permute the underlying tensor. + permutation = _get_permutation(var_name, permutations) + if permutation: + tensor = np.transpose(tensor, permutation) + + # Create a new variable with the possibly-reshaped or transposed tensor. + var = tf.Variable(tensor, name=var_name) + + # Save the variable into the new variable map. + new_variable_map[new_var_name] = var + + # Keep a list of converter variables for sanity checking. + if new_var_name != var_name: + conversion_map[var_name] = new_var_name + + saver = tf.train.Saver(new_variable_map) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + tf.logging.info("Writing checkpoint_to_path %s", checkpoint_to_path) + saver.save(sess, checkpoint_to_path, write_meta_graph=False) + + tf.logging.info("Summary:") + tf.logging.info(" Converted %d variable name(s).", len(new_variable_map)) + tf.logging.info(" Converted: %s", str(conversion_map)) diff --git a/nlp/text_classification/bert/tensorflow2.0/tf2_encoder_checkpoint_converter.py b/nlp/text_classification/bert/tensorflow2.0/tf2_encoder_checkpoint_converter.py new file mode 100644 index 000000000..2198f5e60 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tf2_encoder_checkpoint_converter.py @@ -0,0 +1,160 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""A converter from a V1 BERT encoder checkpoint to a V2 encoder checkpoint. + +The conversion will yield an object-oriented checkpoint that can be used +to restore a BertEncoder or BertPretrainerV2 object (see the `converted_model` +FLAG below). +""" + +import os + +from absl import app +from absl import flags + +import tensorflow as tf +from modeling import tf_utils +import configs +import tf1_checkpoint_converter_lib +from nlp_modeling import models +from nlp_modeling import networks + +FLAGS = flags.FLAGS + +flags.DEFINE_string("bert_config_file", None, + "Bert configuration file to define core bert layers.") +flags.DEFINE_string( + "checkpoint_to_convert", None, + "Initial checkpoint from a pretrained BERT model core (that is, only the " + "BertModel, with no task heads.)") +flags.DEFINE_string("converted_checkpoint_path", None, + "Name for the created object-based V2 checkpoint.") +flags.DEFINE_string("checkpoint_model_name", "encoder", + "The name of the model when saving the checkpoint, i.e., " + "the checkpoint will be saved using: " + "tf.train.Checkpoint(FLAGS.checkpoint_model_name=model).") +flags.DEFINE_enum( + "converted_model", "encoder", ["encoder", "pretrainer"], + "Whether to convert the checkpoint to a `BertEncoder` model or a " + "`BertPretrainerV2` model (with mlm but without classification heads).") + + +def _create_bert_model(cfg): + """Creates a BERT keras core model from BERT configuration. + + Args: + cfg: A `BertConfig` to create the core model. + + Returns: + A BertEncoder network. + """ + bert_encoder = networks.BertEncoder( + vocab_size=cfg.vocab_size, + hidden_size=cfg.hidden_size, + num_layers=cfg.num_hidden_layers, + num_attention_heads=cfg.num_attention_heads, + intermediate_size=cfg.intermediate_size, + activation=tf_utils.get_activation(cfg.hidden_act), + dropout_rate=cfg.hidden_dropout_prob, + attention_dropout_rate=cfg.attention_probs_dropout_prob, + max_sequence_length=cfg.max_position_embeddings, + type_vocab_size=cfg.type_vocab_size, + initializer=tf.keras.initializers.TruncatedNormal( + stddev=cfg.initializer_range), + embedding_width=cfg.embedding_size) + + return bert_encoder + + +def _create_bert_pretrainer_model(cfg): + """Creates a BERT keras core model from BERT configuration. + + Args: + cfg: A `BertConfig` to create the core model. + + Returns: + A BertPretrainerV2 model. + """ + bert_encoder = _create_bert_model(cfg) + pretrainer = models.BertPretrainerV2( + encoder_network=bert_encoder, + mlm_activation=tf_utils.get_activation(cfg.hidden_act), + mlm_initializer=tf.keras.initializers.TruncatedNormal( + stddev=cfg.initializer_range)) + # Makes sure the pretrainer variables are created. 
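+  # A single forward pass on the symbolic inputs builds every layer, so all
+  # variables exist before a checkpoint is restored into the model.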
+ _ = pretrainer(pretrainer.inputs) + return pretrainer + + +def convert_checkpoint(bert_config, + output_path, + v1_checkpoint, + checkpoint_model_name="model", + converted_model="encoder"): + """Converts a V1 checkpoint into an OO V2 checkpoint.""" + output_dir, _ = os.path.split(output_path) + tf.io.gfile.makedirs(output_dir) + + # Create a temporary V1 name-converted checkpoint in the output directory. + temporary_checkpoint_dir = os.path.join(output_dir, "temp_v1") + temporary_checkpoint = os.path.join(temporary_checkpoint_dir, "ckpt") + + tf1_checkpoint_converter_lib.convert( + checkpoint_from_path=v1_checkpoint, + checkpoint_to_path=temporary_checkpoint, + num_heads=bert_config.num_attention_heads, + name_replacements=tf1_checkpoint_converter_lib.BERT_V2_NAME_REPLACEMENTS, + permutations=tf1_checkpoint_converter_lib.BERT_V2_PERMUTATIONS, + exclude_patterns=["adam", "Adam"]) + + if converted_model == "encoder": + model = _create_bert_model(bert_config) + elif converted_model == "pretrainer": + model = _create_bert_pretrainer_model(bert_config) + else: + raise ValueError("Unsupported converted_model: %s" % converted_model) + + # Create a V2 checkpoint from the temporary checkpoint. + tf1_checkpoint_converter_lib.create_v2_checkpoint(model, temporary_checkpoint, + output_path, + checkpoint_model_name) + + # Clean up the temporary checkpoint, if it exists. + try: + tf.io.gfile.rmtree(temporary_checkpoint_dir) + except tf.errors.OpError: + # If it doesn't exist, we don't need to clean it up; continue. + pass + + +def main(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + output_path = FLAGS.converted_checkpoint_path + v1_checkpoint = FLAGS.checkpoint_to_convert + checkpoint_model_name = FLAGS.checkpoint_model_name + converted_model = FLAGS.converted_model + bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file) + convert_checkpoint( + bert_config=bert_config, + output_path=output_path, + v1_checkpoint=v1_checkpoint, + checkpoint_model_name=checkpoint_model_name, + converted_model=converted_model) + + +if __name__ == "__main__": + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/tokenization.py b/nlp/text_classification/bert/tensorflow2.0/tokenization.py new file mode 100644 index 000000000..ea1546e3c --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/tokenization.py @@ -0,0 +1,541 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# coding=utf-8 +"""Tokenization classes implementation. + +The file is forked from: +https://github.com/google-research/bert/blob/master/tokenization.py. 
+""" + +import collections +import re +import unicodedata + +import six +import tensorflow as tf + +import sentencepiece as spm + +SPIECE_UNDERLINE = "▁" + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % + (actual_flag, init_checkpoint, model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.io.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True, split_on_punc=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, split_on_punc=split_on_punc) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, split_on_punc=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + split_on_punc: Whether to apply split on punctuations. By default BERT + starts a new token for punctuations. This makes detokenization difficult + for tasks like seq2seq decoding. + """ + self.do_lower_case = do_lower_case + self.split_on_punc = split_on_punc + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
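+    # _tokenize_chinese_chars pads every CJK codepoint with spaces so that
+    # each Chinese character becomes its own token when the text is
+    # whitespace-split below.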
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + if self.split_on_punc: + split_tokens.extend(self._run_split_on_punc(token)) + else: + split_tokens.append(token) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=400): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def preprocess_text(inputs, remove_space=True, lower=False): + """Preprocesses data by removing extra space and normalize data. + + This method is used together with sentence piece tokenizer and is forked from: + https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py + + Args: + inputs: The input text. + remove_space: Whether to remove the extra space. + lower: Whether to lowercase the text. + + Returns: + The preprocessed text. + + """ + outputs = inputs + if remove_space: + outputs = " ".join(inputs.strip().split()) + + if six.PY2 and isinstance(outputs, str): + try: + outputs = six.ensure_text(outputs, "utf-8") + except UnicodeDecodeError: + outputs = six.ensure_text(outputs, "latin-1") + + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if lower: + outputs = outputs.lower() + + return outputs + + +def encode_pieces(sp_model, text, sample=False): + """Segements text into pieces. + + This method is used together with sentence piece tokenizer and is forked from: + https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py + + + Args: + sp_model: A spm.SentencePieceProcessor object. + text: The input text to be segemented. + sample: Whether to randomly sample a segmentation output or return a + deterministic one. + + Returns: + A list of token pieces. 
+ """ + if six.PY2 and isinstance(text, six.text_type): + text = six.ensure_binary(text, "utf-8") + + if not sample: + pieces = sp_model.EncodeAsPieces(text) + else: + pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) + new_pieces = [] + for piece in pieces: + piece = printable_text(piece) + if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): + cur_pieces = sp_model.EncodeAsPieces(piece[:-1].replace( + SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + +def encode_ids(sp_model, text, sample=False): + """Segments text and return token ids. + + This method is used together with sentence piece tokenizer and is forked from: + https://github.com/google-research/google-research/blob/e1f6fa00/albert/tokenization.py + + Args: + sp_model: A spm.SentencePieceProcessor object. + text: The input text to be segemented. + sample: Whether to randomly sample a segmentation output or return a + deterministic one. + + Returns: + A list of token ids. + """ + pieces = encode_pieces(sp_model, text, sample=sample) + ids = [sp_model.PieceToId(piece) for piece in pieces] + return ids + + +class FullSentencePieceTokenizer(object): + """Runs end-to-end sentence piece tokenization. + + The interface of this class is intended to keep the same as above + `FullTokenizer` class for easier usage. + """ + + def __init__(self, sp_model_file): + """Inits FullSentencePieceTokenizer. + + Args: + sp_model_file: The path to the sentence piece model file. + """ + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(sp_model_file) + self.vocab = { + self.sp_model.IdToPiece(i): i + for i in six.moves.range(self.sp_model.GetPieceSize()) + } + + def tokenize(self, text): + """Tokenizes text into pieces.""" + return encode_pieces(self.sp_model, text) + + def convert_tokens_to_ids(self, tokens): + """Converts a list of tokens to a list of ids.""" + return [self.sp_model.PieceToId(printable_text(token)) for token in tokens] + + def convert_ids_to_tokens(self, ids): + """Converts a list of ids ot a list of tokens.""" + return [self.sp_model.IdToPiece(id_) for id_ in ids] diff --git a/nlp/text_classification/bert/tensorflow2.0/train_mirrored_nv.json b/nlp/text_classification/bert/tensorflow2.0/train_mirrored_nv.json new file mode 100644 index 000000000..b412a2281 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/train_mirrored_nv.json @@ -0,0 +1,8 @@ +{ + "loss: ": 0.3244, + "accuracy: ": 0.8698, + "val_loss: ": 0.4148, + "val_accuracy: ": 0.8309, + "threshold": 0.02, + "percentage": 0.002 +} \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/train_worker_mirrored_nv.json b/nlp/text_classification/bert/tensorflow2.0/train_worker_mirrored_nv.json new file mode 100644 index 000000000..ef7e8e44d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/train_worker_mirrored_nv.json @@ -0,0 +1,8 @@ +{ + "loss: ": 0.4382, + "accuracy: ": 0.801, + "val_loss: ": 0.3895, + "val_accuracy: ": 0.8365, + "threshold": 0.02, + "percentage": 0.002 +} \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/__init__.py b/nlp/text_classification/bert/tensorflow2.0/utils/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ 
b/nlp/text_classification/bert/tensorflow2.0/utils/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_api_docs_lib.py b/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_api_docs_lib.py new file mode 100644 index 000000000..0bff8b011 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_api_docs_lib.py @@ -0,0 +1,54 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Common library for API docs builder.""" + +import tensorflow as tf +from tensorflow_docs.api_generator import doc_controls + + +def hide_module_model_and_layer_methods(): + """Hide methods and properties defined in the base classes of Keras layers. + + We hide all methods and properties of the base classes, except: + - `__init__` is always documented. + - `call` is always documented, as it can carry important information for + complex layers. + """ + module_contents = list(tf.Module.__dict__.items()) + model_contents = list(tf.keras.Model.__dict__.items()) + layer_contents = list(tf.keras.layers.Layer.__dict__.items()) + + for name, obj in module_contents + layer_contents + model_contents: + if name == '__init__': + # Always document __init__. + continue + + if name == 'call': + # Always document `call`. + if hasattr(obj, doc_controls._FOR_SUBCLASS_IMPLEMENTERS): # pylint: disable=protected-access + delattr(obj, doc_controls._FOR_SUBCLASS_IMPLEMENTERS) # pylint: disable=protected-access + continue + + # Otherwise, exclude from documentation. + if isinstance(obj, property): + obj = obj.fget + + if isinstance(obj, (staticmethod, classmethod)): + obj = obj.__func__ + + try: + doc_controls.do_not_doc_in_subclasses(obj) + except AttributeError: + pass diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_nlp_api_docs.py b/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_nlp_api_docs.py new file mode 100644 index 000000000..ee54879a1 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_nlp_api_docs.py @@ -0,0 +1,95 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Tool to generate api_docs for tensorflow_models/official library. + +Example: + +$> pip install -U git+https://github.com/tensorflow/docs +$> python build_nlp_api_docs \ + --output_dir=/tmp/api_docs +""" + +import os + +from absl import app +from absl import flags +from absl import logging +from tensorflow_docs.api_generator import generate_lib +from tensorflow_docs.api_generator import public_api + +from nlp import modeling as tfnlp +import build_api_docs_lib + +FLAGS = flags.FLAGS + +flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.') +flags.DEFINE_string( + 'code_url_prefix', + 'https://github.com/tensorflow/models/blob/master/official/nlp/modeling/', + 'The url prefix for links to code.') + +flags.DEFINE_bool('search_hints', True, + 'Include metadata search hints in the generated files') + +flags.DEFINE_string('site_path', '/api_docs/python', + 'Path prefix in the _toc.yaml') + +flags.DEFINE_bool('gen_report', False, + 'Generate an API report containing the health of the ' + 'docstrings of the public API.') + +PROJECT_SHORT_NAME = 'tfnlp' +PROJECT_FULL_NAME = 'TensorFlow Official Models - NLP Modeling Library' + + +def gen_api_docs(code_url_prefix, site_path, output_dir, gen_report, + project_short_name, project_full_name, search_hints): + """Generates api docs for the tensorflow docs package.""" + build_api_docs_lib.hide_module_model_and_layer_methods() + del tfnlp.layers.MultiHeadAttention + del tfnlp.layers.EinsumDense + + doc_generator = generate_lib.DocGenerator( + root_title=project_full_name, + py_modules=[(project_short_name, tfnlp)], + base_dir=os.path.dirname(tfnlp.__file__), + code_url_prefix=code_url_prefix, + search_hints=search_hints, + site_path=site_path, + gen_report=gen_report, + callbacks=[public_api.explicit_package_contents_filter], + ) + + doc_generator.build(output_dir) + logging.info('Output docs to: %s', output_dir) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + gen_api_docs( + code_url_prefix=FLAGS.code_url_prefix, + site_path=FLAGS.site_path, + output_dir=FLAGS.output_dir, + gen_report=FLAGS.gen_report, + project_short_name=PROJECT_SHORT_NAME, + project_full_name=PROJECT_FULL_NAME, + search_hints=FLAGS.search_hints) + + +if __name__ == '__main__': + flags.mark_flag_as_required('output_dir') + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_vision_api_docs.py b/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_vision_api_docs.py new file mode 100644 index 000000000..8123c6e8c --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/docs/build_vision_api_docs.py @@ -0,0 +1,93 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +r"""Tool to generate api_docs for tensorflow_models/official library. + +Example: + +$> pip install -U git+https://github.com/tensorflow/docs +$> python build_vision_api_docs \ + --output_dir=/tmp/api_docs +""" + +import os + +from absl import app +from absl import flags +from absl import logging +from tensorflow_docs.api_generator import generate_lib +from tensorflow_docs.api_generator import public_api + +import build_api_docs_lib +from vision.beta import modeling as tfvision + +FLAGS = flags.FLAGS + +flags.DEFINE_string('output_dir', None, 'Where to write the resulting docs to.') +flags.DEFINE_string( + 'code_url_prefix', + 'https://github.com/tensorflow/models/blob/master/official/vision/beta/modeling/', + 'The url prefix for links to code.') + +flags.DEFINE_bool('search_hints', True, + 'Include metadata search hints in the generated files') + +flags.DEFINE_string('site_path', 'tfvision/api_docs/python', + 'Path prefix in the _toc.yaml') + +flags.DEFINE_bool('gen_report', False, + 'Generate an API report containing the health of the ' + 'docstrings of the public API.') + +PROJECT_SHORT_NAME = 'tfvision' +PROJECT_FULL_NAME = 'TensorFlow Official Models - Vision Modeling Library' + + +def gen_api_docs(code_url_prefix, site_path, output_dir, gen_report, + project_short_name, project_full_name, search_hints): + """Generates api docs for the tensorflow docs package.""" + build_api_docs_lib.hide_module_model_and_layer_methods() + + doc_generator = generate_lib.DocGenerator( + root_title=project_full_name, + py_modules=[(project_short_name, tfvision)], + base_dir=os.path.dirname(tfvision.__file__), + code_url_prefix=code_url_prefix, + search_hints=search_hints, + site_path=site_path, + gen_report=gen_report, + callbacks=[public_api.explicit_package_contents_filter], + ) + + doc_generator.build(output_dir) + logging.info('Output docs to: %s', output_dir) + + +def main(argv): + if len(argv) > 1: + raise app.UsageError('Too many command-line arguments.') + + gen_api_docs( + code_url_prefix=FLAGS.code_url_prefix, + site_path=FLAGS.site_path, + output_dir=FLAGS.output_dir, + gen_report=FLAGS.gen_report, + project_short_name=PROJECT_SHORT_NAME, + project_full_name=PROJECT_FULL_NAME, + search_hints=FLAGS.search_hints) + + +if __name__ == '__main__': + flags.mark_flag_as_required('output_dir') + app.run(main) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/README.md b/nlp/text_classification/bert/tensorflow2.0/utils/flags/README.md new file mode 100644 index 000000000..beb3b2a1e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/README.md @@ -0,0 +1,102 @@ +# Adding Abseil (absl) flags quickstart + +**WARNING** This module is deprecated. We no long use it in new models and +your projects should not depend on it. We will remove this module when +all models using it are deprecated which may take time. + +## Defining a flag +absl flag definitions are similar to argparse, although they are defined on a global namespace. 
+
+For instance, defining a string flag looks like:
+```python
+from absl import flags
+flags.DEFINE_string(
+    name="my_flag",
+    default="a_sensible_default",
+    help="Here is what this flag does."
+)
+```
+
+All three arguments are required, but `default` may be `None`. A common optional argument is
+`short_name` for defining abbreviations. Certain `DEFINE_*` methods have other required arguments;
+for instance, `DEFINE_enum` requires the `enum_values` argument to be specified.
+
+## Key Flags
+absl has the concept of a key flag. Any flag defined in `__main__` is considered a key flag by
+default. Key flags are displayed in `--help`; others only appear in `--helpfull`. In order to
+handle key flags that are defined outside the module in question, absl provides the
+`flags.adopt_module_key_flags()` method, which adds the key flags of a different module to one's
+own key flags. For example:
+```python
+File: flag_source.py
+---------------------------------------
+
+from absl import flags
+flags.DEFINE_string(name="my_flag", default="abc", help="a flag.")
+```
+
+```python
+File: my_module.py
+---------------------------------------
+
+from absl import app as absl_app
+from absl import flags
+
+import flag_source
+
+flags.adopt_module_key_flags(flag_source)
+
+def main(_):
+  pass
+
+absl_app.run(main, [__file__, "-h"])
+```
+
+When `my_module.py` is run, it will show the help text for `my_flag`. Because not all flags defined
+in a file are equally important, `official/utils/flags/core.py` (generally imported as `flags_core`)
+provides an abstraction for declaring key flags in an easy way through the
+`register_key_flags_in_core()` function, which allows a module to make a single
+`flags.adopt_module_key_flags(flags_core)` call when using the util flag declaration functions.
+
+## Validators
+Often the constraints on a flag are complicated. absl provides the validator decorator to allow
+one to mark a function as a flag validation function. Suppose we want users to provide a flag
+which is a palindrome.
+
+```python
+from absl import flags
+
+flags.DEFINE_string(name="pal_flag", short_name="pf", default="", help="Give me a palindrome")
+
+@flags.validator("pal_flag")
+def _check_pal(provided_pal_flag):
+  return provided_pal_flag == provided_pal_flag[::-1]
+```
+
+A validator passes when it returns True (truthy); anything else (False, None, or a raised
+exception) fails.
+
+## Testing
+To test using absl, simply declare flags in the `setUpClass` method of your `TestCase`.
+
+```python
+import unittest
+
+from absl import flags
+
+from utils.flags import core as flags_core
+
+
+def define_flags():
+  flags.DEFINE_string(name="test_flag", default="abc", help="an example flag")
+
+
+class BaseTester(unittest.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    super(BaseTester, cls).setUpClass()
+    define_flags()
+
+  def test_trivial(self):
+    flags_core.parse_flags([__file__, "--test_flag", "def"])
+    self.assertEqual(flags.FLAGS.test_flag, "def")
+```
diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/__init__.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/__init__.py
new file mode 100644
index 000000000..e419af524
--- /dev/null
+++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/__init__.py
@@ -0,0 +1,14 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_base.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_base.py new file mode 100644 index 000000000..491300e42 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_base.py @@ -0,0 +1,177 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags which will be nearly universal across models.""" + +from absl import flags +import tensorflow as tf +from utils.flags._conventions import help_wrap + + +def define_base(data_dir=True, + model_dir=True, + clean=False, + train_epochs=False, + epochs_between_evals=False, + stop_threshold=False, + batch_size=True, + num_gpu=False, + hooks=False, + export_dir=False, + distribution_strategy=False, + run_eagerly=False): + """Register base flags. + + Args: + data_dir: Create a flag for specifying the input data directory. + model_dir: Create a flag for specifying the model file directory. + clean: Create a flag for removing the model_dir. + train_epochs: Create a flag to specify the number of training epochs. + epochs_between_evals: Create a flag to specify the frequency of testing. + stop_threshold: Create a flag to specify a threshold accuracy or other eval + metric which should trigger the end of training. + batch_size: Create a flag to specify the batch size. + num_gpu: Create a flag to specify the number of GPUs used. + hooks: Create a flag to specify hooks for logging. + export_dir: Create a flag to specify where a SavedModel should be exported. + distribution_strategy: Create a flag to specify which Distribution Strategy + to use. + run_eagerly: Create a flag to specify to run eagerly op by op. + + Returns: + A list of flags for core.py to marks as key flags. 
+ """ + key_flags = [] + + if data_dir: + flags.DEFINE_string( + name="data_dir", + short_name="dd", + default="/tmp", + help=help_wrap("The location of the input data.")) + key_flags.append("data_dir") + + if model_dir: + flags.DEFINE_string( + name="model_dir", + short_name="md", + default="/tmp", + help=help_wrap("The location of the model checkpoint files.")) + key_flags.append("model_dir") + + if clean: + flags.DEFINE_boolean( + name="clean", + default=False, + help=help_wrap("If set, model_dir will be removed if it exists.")) + key_flags.append("clean") + + if train_epochs: + flags.DEFINE_integer( + name="train_epochs", + short_name="te", + default=1, + help=help_wrap("The number of epochs used to train.")) + key_flags.append("train_epochs") + + if epochs_between_evals: + flags.DEFINE_integer( + name="epochs_between_evals", + short_name="ebe", + default=1, + help=help_wrap("The number of training epochs to run between " + "evaluations.")) + key_flags.append("epochs_between_evals") + + if stop_threshold: + flags.DEFINE_float( + name="stop_threshold", + short_name="st", + default=None, + help=help_wrap("If passed, training will stop at the earlier of " + "train_epochs and when the evaluation metric is " + "greater than or equal to stop_threshold.")) + + if batch_size: + flags.DEFINE_integer( + name="batch_size", + short_name="bs", + default=32, + help=help_wrap("Batch size for training and evaluation. When using " + "multiple gpus, this is the global batch size for " + "all devices. For example, if the batch size is 32 " + "and there are 4 GPUs, each GPU will get 8 examples on " + "each step.")) + key_flags.append("batch_size") + + if num_gpu: + flags.DEFINE_integer( + name="num_gpus", + short_name="ng", + default=1, + help=help_wrap("How many GPUs to use at each worker with the " + "DistributionStrategies API. The default is 1.")) + + if run_eagerly: + flags.DEFINE_boolean( + name="run_eagerly", + default=False, + help="Run the model op by op without building a model function.") + + if hooks: + flags.DEFINE_list( + name="hooks", + short_name="hk", + default="LoggingTensorHook", + help=help_wrap( + u"A list of (case insensitive) strings to specify the names of " + u"training hooks. Example: `--hooks ProfilerHook," + u"ExamplesPerSecondHook`\n See hooks_helper " + u"for details.")) + key_flags.append("hooks") + + if export_dir: + flags.DEFINE_string( + name="export_dir", + short_name="ed", + default=None, + help=help_wrap("If set, a SavedModel serialization of the model will " + "be exported to this directory at the end of training. " + "See the README for more details and relevant links.")) + key_flags.append("export_dir") + + if distribution_strategy: + flags.DEFINE_string( + name="distribution_strategy", + short_name="ds", + default="mirrored", + help=help_wrap("The Distribution Strategy to use for training. " + "Accepted values are 'off', 'one_device', " + "'mirrored', 'parameter_server', 'collective', " + "case insensitive. 
'off' means not to use " + "Distribution Strategy; 'default' means to choose " + "from `MirroredStrategy` or `OneDeviceStrategy` " + "according to the number of GPUs.")) + + return key_flags + + +def get_num_gpus(flags_obj): + """Treat num_gpus=-1 as 'use all'.""" + if flags_obj.num_gpus != -1: + return flags_obj.num_gpus + + from tensorflow.python.client import device_lib # pylint: disable=g-import-not-at-top + local_device_protos = device_lib.list_local_devices() + return sum([1 for d in local_device_protos if d.device_type == "GPU"]) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_benchmark.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_benchmark.py new file mode 100644 index 000000000..66ddefc05 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_benchmark.py @@ -0,0 +1,117 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags for benchmarking models.""" + +from absl import flags + +from utils.flags._conventions import help_wrap + + +def define_log_steps(): + flags.DEFINE_integer( + name="log_steps", + default=100, + help="Frequency with which to log timing information with TimeHistory.") + + return [] + + +def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True): + """Register benchmarking flags. + + Args: + benchmark_log_dir: Create a flag to specify location for benchmark logging. + bigquery_uploader: Create flags for uploading results to BigQuery. + + Returns: + A list of flags for core.py to marks as key flags. + """ + + key_flags = [] + + flags.DEFINE_enum( + name="benchmark_logger_type", + default="BaseBenchmarkLogger", + enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger"], + help=help_wrap("The type of benchmark logger to use. Defaults to using " + "BaseBenchmarkLogger which logs to STDOUT. Different " + "loggers will require other flags to be able to work.")) + flags.DEFINE_string( + name="benchmark_test_id", + short_name="bti", + default=None, + help=help_wrap("The unique test ID of the benchmark run. It could be the " + "combination of key parameters. It is hardware " + "independent and could be used compare the performance " + "between different test runs. 
This flag is designed for " + "human consumption, and does not have any impact within " + "the system.")) + + define_log_steps() + + if benchmark_log_dir: + flags.DEFINE_string( + name="benchmark_log_dir", + short_name="bld", + default=None, + help=help_wrap("The location of the benchmark logging.")) + + if bigquery_uploader: + flags.DEFINE_string( + name="gcp_project", + short_name="gp", + default=None, + help=help_wrap( + "The GCP project name where the benchmark will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_data_set", + short_name="bds", + default="test_benchmark", + help=help_wrap( + "The Bigquery dataset name where the benchmark will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_run_table", + short_name="brt", + default="benchmark_run", + help=help_wrap("The Bigquery table name where the benchmark run " + "information will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_run_status_table", + short_name="brst", + default="benchmark_run_status", + help=help_wrap("The Bigquery table name where the benchmark run " + "status information will be uploaded.")) + + flags.DEFINE_string( + name="bigquery_metric_table", + short_name="bmt", + default="benchmark_metric", + help=help_wrap("The Bigquery table name where the benchmark metric " + "information will be uploaded.")) + + @flags.multi_flags_validator( + ["benchmark_logger_type", "benchmark_log_dir"], + message="--benchmark_logger_type=BenchmarkFileLogger will require " + "--benchmark_log_dir being set") + def _check_benchmark_log_dir(flags_dict): + benchmark_logger_type = flags_dict["benchmark_logger_type"] + if benchmark_logger_type == "BenchmarkFileLogger": + return flags_dict["benchmark_log_dir"] + return True + + return key_flags diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_conventions.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_conventions.py new file mode 100644 index 000000000..a42ff42a2 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_conventions.py @@ -0,0 +1,50 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Central location for shared argparse convention definitions.""" + +import sys +import codecs +import functools + +from absl import app as absl_app +from absl import flags + +# This codifies help string conventions and makes it easy to update them if +# necessary. Currently the only major effect is that help bodies start on the +# line after flags are listed. All flag definitions should wrap the text bodies +# with help wrap when calling DEFINE_*. +_help_wrap = functools.partial( + flags.text_wrap, length=80, indent="", firstline_indent="\n") + + +# Pretty formatting causes issues when utf-8 is not installed on a system. 
+def _stdout_utf8(): + try: + codecs.lookup("utf-8") + except LookupError: + return False + return getattr(sys.stdout, "encoding", "") == "UTF-8" + + +if _stdout_utf8(): + help_wrap = _help_wrap +else: + + def help_wrap(text, *args, **kwargs): + return _help_wrap(text, *args, **kwargs).replace(u"\ufeff", u"") + + +# Replace None with h to also allow -h +absl_app.HelpshortFlag.SHORT_NAME = "h" diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_device.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_device.py new file mode 100644 index 000000000..1c9a3ad7d --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_device.py @@ -0,0 +1,90 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags for managing compute devices. Currently only contains TPU flags.""" + +from absl import flags +from absl import logging + +from utils.flags._conventions import help_wrap + + +def require_cloud_storage(flag_names): + """Register a validator to check directory flags. + + Args: + flag_names: An iterable of strings containing the names of flags to be + checked. + """ + msg = "TPU requires GCS path for {}".format(", ".join(flag_names)) + + @flags.multi_flags_validator(["tpu"] + flag_names, message=msg) + def _path_check(flag_values): # pylint: disable=missing-docstring + if flag_values["tpu"] is None: + return True + + valid_flags = True + for key in flag_names: + if not flag_values[key].startswith("gs://"): + logging.error("%s must be a GCS path.", key) + valid_flags = False + + return valid_flags + + +def define_device(tpu=True): + """Register device specific flags. + + Args: + tpu: Create flags to specify TPU operation. + + Returns: + A list of flags for core.py to marks as key flags. + """ + + key_flags = [] + + if tpu: + flags.DEFINE_string( + name="tpu", + default=None, + help=help_wrap( + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a " + "grpc://ip.address.of.tpu:8470 url. Passing `local` will use the" + "CPU of the local instance instead. (Good for debugging.)")) + key_flags.append("tpu") + + flags.DEFINE_string( + name="tpu_zone", + default=None, + help=help_wrap( + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE " + "project from metadata.")) + + flags.DEFINE_string( + name="tpu_gcp_project", + default=None, + help=help_wrap( + "[Optional] Project name for the Cloud TPU-enabled project. 
If not " + "specified, we will attempt to automatically detect the GCE " + "project from metadata.")) + + flags.DEFINE_integer( + name="num_tpu_shards", + default=8, + help=help_wrap("Number of shards (TPU chips).")) + + return key_flags diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_distribution.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_distribution.py new file mode 100644 index 000000000..e7cfcdf69 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_distribution.py @@ -0,0 +1,52 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Flags related to distributed execution.""" + +from absl import flags +import tensorflow as tf + +from utils.flags._conventions import help_wrap + + +def define_distribution(worker_hosts=True, task_index=True): + """Register distributed execution flags. + + Args: + worker_hosts: Create a flag for specifying comma-separated list of workers. + task_index: Create a flag for specifying index of task. + + Returns: + A list of flags for core.py to marks as key flags. + """ + key_flags = [] + + if worker_hosts: + flags.DEFINE_string( + name='worker_hosts', + default=None, + help=help_wrap( + 'Comma-separated list of worker ip:port pairs for running ' + 'multi-worker models with DistributionStrategy. The user would ' + 'start the program on each host with identical value for this ' + 'flag.')) + + if task_index: + flags.DEFINE_integer( + name='task_index', + default=-1, + help=help_wrap('If multi-worker training, the task_index of this ' + 'worker.')) + + return key_flags diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_misc.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_misc.py new file mode 100644 index 000000000..eb248fe47 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_misc.py @@ -0,0 +1,48 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Misc flags.""" + +from absl import flags + +from utils.flags._conventions import help_wrap + + +def define_image(data_format=True): + """Register image specific flags. + + Args: + data_format: Create a flag to specify image axis convention. + + Returns: + A list of flags for core.py to marks as key flags. 
+ """ + + key_flags = [] + + if data_format: + flags.DEFINE_enum( + name="data_format", + short_name="df", + default=None, + enum_values=["channels_first", "channels_last"], + help=help_wrap( + "A flag to override the data format used in the model. " + "channels_first provides a performance boost on GPU but is not " + "always compatible with CPU. If left unspecified, the data format " + "will be chosen automatically based on whether TensorFlow was " + "built for CPU or GPU.")) + key_flags.append("data_format") + + return key_flags diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/_performance.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_performance.py new file mode 100644 index 000000000..e1124fde1 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/_performance.py @@ -0,0 +1,294 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Register flags for optimizing performance.""" + +import multiprocessing + +from absl import flags # pylint: disable=g-bad-import-order +import tensorflow as tf # pylint: disable=g-bad-import-order + +from utils.flags._conventions import help_wrap + +# Map string to TensorFlow dtype +DTYPE_MAP = { + "fp16": tf.float16, + "bf16": tf.bfloat16, + "fp32": tf.float32, +} + + +def get_tf_dtype(flags_obj): + if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite": + # If the graph_rewrite is used, we build the graph with fp32, and let the + # graph rewrite change ops to fp16. + return tf.float32 + return DTYPE_MAP[flags_obj.dtype] + + +def get_loss_scale(flags_obj, default_for_fp16): + dtype = get_tf_dtype(flags_obj) + if flags_obj.loss_scale == "dynamic": + return flags_obj.loss_scale + elif flags_obj.loss_scale is not None: + return float(flags_obj.loss_scale) + elif dtype == tf.float32 or dtype == tf.bfloat16: + return 1 # No loss scaling is needed for fp32 + else: + assert dtype == tf.float16 + return default_for_fp16 + + +def define_performance(num_parallel_calls=False, + inter_op=False, + intra_op=False, + synthetic_data=False, + max_train_steps=False, + dtype=False, + all_reduce_alg=False, + num_packs=False, + tf_gpu_thread_mode=False, + datasets_num_private_threads=False, + datasets_num_parallel_batches=False, + fp16_implementation=False, + loss_scale=False, + tf_data_experimental_slack=False, + enable_xla=False, + training_dataset_cache=False): + """Register flags for specifying performance tuning arguments. + + Args: + num_parallel_calls: Create a flag to specify parallelism of data loading. + inter_op: Create a flag to allow specification of inter op threads. + intra_op: Create a flag to allow specification of intra op threads. + synthetic_data: Create a flag to allow the use of synthetic data. + max_train_steps: Create a flags to allow specification of maximum number of + training steps + dtype: Create flags for specifying dtype. + all_reduce_alg: If set forces a specific algorithm for multi-gpu. 
+ num_packs: If set provides number of packs for MirroredStrategy's cross + device ops. + tf_gpu_thread_mode: gpu_private triggers us of private thread pool. + datasets_num_private_threads: Number of private threads for datasets. + datasets_num_parallel_batches: Determines how many batches to process in + parallel when using map and batch from tf.data. + fp16_implementation: Create fp16_implementation flag. + loss_scale: Controls the loss scaling, normally for mixed-precision + training. Can only be turned on if dtype is also True. + tf_data_experimental_slack: Determines whether to enable tf.data's + `experimental_slack` option. + enable_xla: Determines if XLA (auto clustering) is turned on. + training_dataset_cache: Whether to cache the training dataset on workers. + Typically used to improve training performance when training data is in + remote storage and can fit into worker memory. + + Returns: + A list of flags for core.py to marks as key flags. + """ + + key_flags = [] + if num_parallel_calls: + flags.DEFINE_integer( + name="num_parallel_calls", + short_name="npc", + default=multiprocessing.cpu_count(), + help=help_wrap("The number of records that are processed in parallel " + "during input processing. This can be optimized per " + "data set but for generally homogeneous data sets, " + "should be approximately the number of available CPU " + "cores. (default behavior)")) + + if inter_op: + flags.DEFINE_integer( + name="inter_op_parallelism_threads", + short_name="inter", + default=0, + help=help_wrap("Number of inter_op_parallelism_threads to use for CPU. " + "See TensorFlow config.proto for details.")) + + if intra_op: + flags.DEFINE_integer( + name="intra_op_parallelism_threads", + short_name="intra", + default=0, + help=help_wrap("Number of intra_op_parallelism_threads to use for CPU. " + "See TensorFlow config.proto for details.")) + + if synthetic_data: + flags.DEFINE_bool( + name="use_synthetic_data", + short_name="synth", + default=False, + help=help_wrap( + "If set, use fake data (zeroes) instead of a real dataset. " + "This mode is useful for performance debugging, as it removes " + "input processing steps, but will not learn anything.")) + + if max_train_steps: + flags.DEFINE_integer( + name="max_train_steps", + short_name="mts", + default=None, + help=help_wrap( + "The model will stop training if the global_step reaches this " + "value. If not set, training will run until the specified number " + "of epochs have run as usual. It is generally recommended to set " + "--train_epochs=1 when using this flag.")) + + if dtype: + flags.DEFINE_enum( + name="dtype", + short_name="dt", + default="fp32", + enum_values=DTYPE_MAP.keys(), + help=help_wrap("The TensorFlow datatype used for calculations. " + "For 16-bit dtypes, variables and certain ops will " + "still be float32 for numeric stability.")) + + if loss_scale: + flags.DEFINE_string( + name="loss_scale", + short_name="ls", + default=None, + help=help_wrap( + "The amount to scale the loss by when --dtype=fp16. This can be " + "an int/float or the string 'dynamic'. Before gradients are " + "computed, the loss is multiplied by the loss scale, making all " + "gradients loss_scale times larger. To adjust for this, " + "gradients are divided by the loss scale before being applied to " + "variables. This is mathematically equivalent to training " + "without a loss scale, but the loss scale helps avoid some " + "intermediate gradients from underflowing to zero. 
The default " + "is 'dynamic', which dynamic determines the optimal loss scale " + "during training.")) + + # pylint: disable=unused-variable + @flags.validator( + flag_name="loss_scale", + message="loss_scale should be a positive int/float or the string " + "'dynamic'.") + def _check_loss_scale(loss_scale): + """Validator to check the loss scale flag is valid.""" + if loss_scale is None: + return True # null case is handled in get_loss_scale() + + if loss_scale == "dynamic": + return True + + try: + loss_scale = float(loss_scale) + except ValueError: + return False + + return loss_scale > 0 + # pylint: enable=unused-variable + + if fp16_implementation: + flags.DEFINE_enum( + name="fp16_implementation", + default="keras", + enum_values=("keras", "graph_rewrite"), + help=help_wrap( + "When --dtype=fp16, how fp16 should be implemented. This has no " + "impact on correctness. 'keras' uses the " + "tf.keras.mixed_precision API. 'graph_rewrite' uses the " + "tf.compat.v1.mixed_precision." + "enable_mixed_precision_graph_rewrite API.")) + + @flags.multi_flags_validator( + ["fp16_implementation", "dtype", "loss_scale"]) + def _check_fp16_implementation(flags_dict): + """Validator to check fp16_implementation flag is valid.""" + if (flags_dict["fp16_implementation"] == "graph_rewrite" and + flags_dict["dtype"] != "fp16"): + raise flags.ValidationError("--fp16_implementation should not be " + "specified unless --dtype=fp16") + return True + + if all_reduce_alg: + flags.DEFINE_string( + name="all_reduce_alg", + short_name="ara", + default=None, + help=help_wrap("Defines the algorithm to use for performing all-reduce." + "When specified with MirroredStrategy for single " + "worker, this controls " + "tf.contrib.distribute.AllReduceCrossTowerOps. When " + "specified with MultiWorkerMirroredStrategy, this " + "controls " + "tf.distribute.experimental.CollectiveCommunication; " + "valid options are `ring` and `nccl`.")) + + if num_packs: + flags.DEFINE_integer( + name="num_packs", + default=1, + help=help_wrap("Sets `num_packs` in the cross device ops used in " + "MirroredStrategy. For details, see " + "tf.distribute.NcclAllReduce.")) + + if tf_gpu_thread_mode: + flags.DEFINE_string( + name="tf_gpu_thread_mode", + short_name="gt_mode", + default=None, + help=help_wrap( + "Whether and how the GPU device uses its own threadpool.")) + + flags.DEFINE_integer( + name="per_gpu_thread_count", + short_name="pgtc", + default=0, + help=help_wrap("The number of threads to use for GPU. Only valid when " + "tf_gpu_thread_mode is not global.")) + + if datasets_num_private_threads: + flags.DEFINE_integer( + name="datasets_num_private_threads", + default=None, + help=help_wrap( + "Number of threads for a private threadpool created for all" + "datasets computation..")) + + if datasets_num_parallel_batches: + flags.DEFINE_integer( + name="datasets_num_parallel_batches", + default=None, + help=help_wrap( + "Determines how many batches to process in parallel when using " + "map and batch from tf.data.")) + + if training_dataset_cache: + flags.DEFINE_boolean( + name="training_dataset_cache", + default=False, + help=help_wrap( + "Determines whether to cache the training dataset on workers. 
" + "Typically used to improve training performance when training " + "data is in remote storage and can fit into worker memory.")) + + if tf_data_experimental_slack: + flags.DEFINE_boolean( + name="tf_data_experimental_slack", + default=False, + help=help_wrap( + "Whether to enable tf.data's `experimental_slack` option.")) + + if enable_xla: + flags.DEFINE_boolean( + name="enable_xla", + default=False, + help="Whether to enable XLA auto jit compilation") + + return key_flags diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/core.py b/nlp/text_classification/bert/tensorflow2.0/utils/flags/core.py new file mode 100644 index 000000000..3d894f9cb --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/core.py @@ -0,0 +1,130 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Public interface for flag definition. + +See _example.py for detailed instructions on defining flags. +""" + +import sys + +from six.moves import shlex_quote + +from absl import app as absl_app +from absl import flags + +from utils.flags import _base +from utils.flags import _benchmark +from utils.flags import _conventions +from utils.flags import _device +from utils.flags import _distribution +from utils.flags import _misc +from utils.flags import _performance + + +def set_defaults(**kwargs): + for key, value in kwargs.items(): + flags.FLAGS.set_default(name=key, value=value) + + +def parse_flags(argv=None): + """Reset flags and reparse. Currently only used in testing.""" + flags.FLAGS.unparse_flags() + absl_app.parse_flags_with_usage(argv or sys.argv) + + +def register_key_flags_in_core(f): + """Defines a function in core.py, and registers its key flags. + + absl uses the location of a flags.declare_key_flag() to determine the context + in which a flag is key. By making all declares in core, this allows model + main functions to call flags.adopt_module_key_flags() on core and correctly + chain key flags. + + Args: + f: The function to be wrapped + + Returns: + The "core-defined" version of the input function. + """ + + def core_fn(*args, **kwargs): + key_flags = f(*args, **kwargs) + [flags.declare_key_flag(fl) for fl in key_flags] # pylint: disable=expression-not-assigned + + return core_fn + + +define_base = register_key_flags_in_core(_base.define_base) +# We have define_base_eager for compatibility, since it used to be a separate +# function from define_base. 
+define_base_eager = define_base +define_log_steps = register_key_flags_in_core(_benchmark.define_log_steps) +define_benchmark = register_key_flags_in_core(_benchmark.define_benchmark) +define_device = register_key_flags_in_core(_device.define_device) +define_image = register_key_flags_in_core(_misc.define_image) +define_performance = register_key_flags_in_core(_performance.define_performance) +define_distribution = register_key_flags_in_core( + _distribution.define_distribution) + +help_wrap = _conventions.help_wrap + +get_num_gpus = _base.get_num_gpus +get_tf_dtype = _performance.get_tf_dtype +get_loss_scale = _performance.get_loss_scale +DTYPE_MAP = _performance.DTYPE_MAP +require_cloud_storage = _device.require_cloud_storage + + +def _get_nondefault_flags_as_dict(): + """Returns the nondefault flags as a dict from flag name to value.""" + nondefault_flags = {} + for flag_name in flags.FLAGS: + flag_value = getattr(flags.FLAGS, flag_name) + if (flag_name != flags.FLAGS[flag_name].short_name and + flag_value != flags.FLAGS[flag_name].default): + nondefault_flags[flag_name] = flag_value + return nondefault_flags + + +def get_nondefault_flags_as_str(): + """Returns flags as a string that can be passed as command line arguments. + + E.g., returns: "--batch_size=256 --use_synthetic_data" for the following code + block: + + ``` + flags.FLAGS.batch_size = 256 + flags.FLAGS.use_synthetic_data = True + print(get_nondefault_flags_as_str()) + ``` + + Only flags with nondefault values are returned, as passing default flags as + command line arguments has no effect. + + Returns: + A string with the flags, that can be passed as command line arguments to a + program to use the flags. + """ + nondefault_flags = _get_nondefault_flags_as_dict() + flag_strings = [] + for name, value in sorted(nondefault_flags.items()): + if isinstance(value, bool): + flag_str = '--{}'.format(name) if value else '--no{}'.format(name) + elif isinstance(value, list): + flag_str = '--{}={}'.format(name, ','.join(value)) + else: + flag_str = '--{}={}'.format(name, value) + flag_strings.append(flag_str) + return ' '.join(shlex_quote(flag_str) for flag_str in flag_strings) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/flags/guidelines.md b/nlp/text_classification/bert/tensorflow2.0/utils/flags/guidelines.md new file mode 100644 index 000000000..db79e4f3e --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/flags/guidelines.md @@ -0,0 +1,65 @@ +# Using flags in official models + +1. **All common flags must be incorporated in the models.** + + Common flags (i.e. batch_size, model_dir, etc.) are provided by various flag definition functions, + and channeled through `utils.flags.core`. For instance to define common supervised + learning parameters one could use the following code: + + ```$xslt + from absl import app as absl_app + from absl import flags + + from utils.flags import core as flags_core + + + def define_flags(): + flags_core.define_base() + flags.adopt_key_flags(flags_core) + + + def main(_): + flags_obj = flags.FLAGS + print(flags_obj) + + + if __name__ == "__main__" + absl_app.run(main) + ``` +2. **Validate flag values.** + + See the [Validators](#validators) section for implementation details. + + Validators in the official model repo should not access the file system, such as verifying + that files exist, due to the strict ordering requirements. + +3. **Flag values should not be mutated.** + + Instead of mutating flag values, use getter functions to return the desired values. 
An example + getter function is `get_tf_dtype` function below: + + ``` + # Map string to TensorFlow dtype + DTYPE_MAP = { + "fp16": tf.float16, + "fp32": tf.float32, + } + + def get_tf_dtype(flags_obj): + if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite": + # If the graph_rewrite is used, we build the graph with fp32, and let the + # graph rewrite change ops to fp16. + return tf.float32 + return DTYPE_MAP[flags_obj.dtype] + + + def main(_): + flags_obj = flags.FLAGS() + + # Do not mutate flags_obj + # if flags_obj.fp16_implementation == "graph_rewrite": + # flags_obj.dtype = "float32" # Don't do this + + print(get_tf_dtype(flags_obj)) + ... + ``` \ No newline at end of file diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/hyperparams_flags.py b/nlp/text_classification/bert/tensorflow2.0/utils/hyperparams_flags.py new file mode 100644 index 000000000..08799f7d2 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/hyperparams_flags.py @@ -0,0 +1,123 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Common flags for importing hyperparameters.""" + +from absl import flags +from utils.flags import core as flags_core + +FLAGS = flags.FLAGS + + +def define_gin_flags(): + """Define common gin configurable flags.""" + flags.DEFINE_multi_string('gin_file', None, + 'List of paths to the config files.') + flags.DEFINE_multi_string( + 'gin_param', None, 'Newline separated list of Gin parameter bindings.') + + +def define_common_hparams_flags(): + """Define the common flags across models.""" + + flags.DEFINE_string( + 'model_dir', + default=None, + help=('The directory where the model and training/evaluation summaries' + 'are stored.')) + + flags.DEFINE_integer( + 'train_batch_size', default=None, help='Batch size for training.') + + flags.DEFINE_integer( + 'eval_batch_size', default=None, help='Batch size for evaluation.') + + flags.DEFINE_string( + 'precision', + default=None, + help=('Precision to use; one of: {bfloat16, float32}')) + + flags.DEFINE_string( + 'config_file', + default=None, + help=('A YAML file which specifies overrides. Note that this file can be ' + 'used as an override template to override the default parameters ' + 'specified in Python. If the same parameter is specified in both ' + '`--config_file` and `--params_override`, the one in ' + '`--params_override` will be used finally.')) + + flags.DEFINE_string( + 'params_override', + default=None, + help=('a YAML/JSON string or a YAML file which specifies additional ' + 'overrides over the default parameters and those specified in ' + '`--config_file`. Note that this is supposed to be used only to ' + 'override the model parameters, but not the parameters like TPU ' + 'specific flags. 
One canonical use case of `--config_file` and ' + '`--params_override` is users first define a template config file ' + 'using `--config_file`, then use `--params_override` to adjust the ' + 'minimal set of tuning parameters, for example setting up different' + ' `train_batch_size`. ' + 'The final override order of parameters: default_model_params --> ' + 'params from config_file --> params in params_override.' + 'See also the help message of `--config_file`.')) + flags.DEFINE_integer('save_checkpoint_freq', None, + 'Number of steps to save checkpoint.') + + +def initialize_common_flags(): + """Define the common flags across models.""" + define_common_hparams_flags() + + flags_core.define_device(tpu=True) + flags_core.define_base( + num_gpu=True, model_dir=False, data_dir=False, batch_size=False) + flags_core.define_distribution(worker_hosts=True, task_index=True) + flags_core.define_performance(all_reduce_alg=True, num_packs=True) + + # Reset the default value of num_gpus to zero. + FLAGS.num_gpus = 0 + + flags.DEFINE_string( + 'strategy_type', 'mirrored', 'Type of distribute strategy.' + 'One of mirrored, tpu and multiworker.') + + +def strategy_flags_dict(): + """Returns TPU and/or GPU related flags in a dictionary.""" + return { + 'distribution_strategy': FLAGS.strategy_type, + # TPUStrategy related flags. + 'tpu': FLAGS.tpu, + # MultiWorkerMirroredStrategy related flags. + 'all_reduce_alg': FLAGS.all_reduce_alg, + 'worker_hosts': FLAGS.worker_hosts, + 'task_index': FLAGS.task_index, + # MirroredStrategy and OneDeviceStrategy + 'num_gpus': FLAGS.num_gpus, + 'num_packs': FLAGS.num_packs, + } + + +def hparam_flags_dict(): + """Returns model params related flags in a dictionary.""" + return { + 'data_dir': FLAGS.data_dir, + 'model_dir': FLAGS.model_dir, + 'train_batch_size': FLAGS.train_batch_size, + 'eval_batch_size': FLAGS.eval_batch_size, + 'precision': FLAGS.precision, + 'config_file': FLAGS.config_file, + 'params_override': FLAGS.params_override, + } diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/misc/__init__.py b/nlp/text_classification/bert/tensorflow2.0/utils/misc/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/misc/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/misc/distribution_utils.py b/nlp/text_classification/bert/tensorflow2.0/utils/misc/distribution_utils.py new file mode 100644 index 000000000..3065c0e54 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/misc/distribution_utils.py @@ -0,0 +1,17 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper functions for running models in a distributed setting.""" +# pylint: disable=wildcard-import +from common.distribute_utils import * diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/misc/keras_utils.py b/nlp/text_classification/bert/tensorflow2.0/utils/misc/keras_utils.py new file mode 100644 index 000000000..a5b20c8a3 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/misc/keras_utils.py @@ -0,0 +1,211 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper functions for the Keras implementations of models.""" + +import multiprocessing +import os +import time + +from absl import logging +import tensorflow as tf + +from tensorflow.python.eager import monitoring + +global_batch_size_gauge = monitoring.IntGauge( + '/tensorflow/training/global_batch_size', 'TF training global batch size') + +first_batch_time_gauge = monitoring.IntGauge( + '/tensorflow/training/first_batch', + 'TF training start/end time for first batch (unix epoch time in us.', + 'type') + +first_batch_start_time = first_batch_time_gauge.get_cell('start') +first_batch_end_time = first_batch_time_gauge.get_cell('end') + + +class BatchTimestamp(object): + """A structure to store batch time stamp.""" + + def __init__(self, batch_index, timestamp): + self.batch_index = batch_index + self.timestamp = timestamp + + def __repr__(self): + return "'BatchTimestamp'".format( + self.batch_index, self.timestamp) + + +class TimeHistory(tf.keras.callbacks.Callback): + """Callback for Keras models.""" + + def __init__(self, batch_size, log_steps, initial_step=0, logdir=None): + """Callback for logging performance. + + Args: + batch_size: Total batch size. + log_steps: Interval of steps between logging of batch level stats. + initial_step: Optional, initial step. + logdir: Optional directory to write TensorBoard summaries. + """ + # TODO(wcromar): remove this parameter and rely on `logs` parameter of + # on_train_batch_end() + self.batch_size = batch_size + super(TimeHistory, self).__init__() + self.log_steps = log_steps + self.last_log_step = initial_step + self.steps_before_epoch = initial_step + self.steps_in_epoch = 0 + self.start_time = None + + global_batch_size_gauge.get_cell().set(batch_size) + + if logdir: + self.summary_writer = tf.summary.create_file_writer(logdir) + else: + self.summary_writer = None + + # Logs start of step 1 then end of each step based on log_steps interval. + self.timestamp_log = [] + + # Records the time each epoch takes to run from start to finish of epoch. 
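(Usage aside, not part of the patch: a minimal sketch of attaching this `TimeHistory` callback to an ordinary `model.fit()` run. The toy model, constant dataset, and batch size are illustrative only; the callback is exactly the one defined in this file.)

```python
import tensorflow as tf

from utils.misc.keras_utils import TimeHistory  # module path as added by this patch

BATCH_SIZE = 32

# Tiny stand-in model and constant dataset, just enough to drive the callback.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer="sgd", loss="mse")

dataset = tf.data.Dataset.from_tensors(
    (tf.zeros([4]), tf.zeros([1]))).repeat(1024).batch(BATCH_SIZE)

# log_steps controls how often step/examples-per-second stats are logged;
# pass a logdir to also write them as TensorBoard scalars.
time_callback = TimeHistory(batch_size=BATCH_SIZE, log_steps=10, logdir=None)

model.fit(dataset, epochs=2, callbacks=[time_callback])
print("average examples/sec:", time_callback.average_examples_per_second)
```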
+ self.epoch_runtime_log = [] + + @property + def global_steps(self): + """The current 1-indexed global step.""" + return self.steps_before_epoch + self.steps_in_epoch + + @property + def average_steps_per_second(self): + """The average training steps per second across all epochs.""" + return self.global_steps / sum(self.epoch_runtime_log) + + @property + def average_examples_per_second(self): + """The average number of training examples per second across all epochs.""" + return self.average_steps_per_second * self.batch_size + + def get_examples_per_sec(self, warmup=1): + """Calculates examples/sec through timestamp_log and skip warmup period.""" + # First entry in timestamp_log is the start of the step 1. The rest of the + # entries are the end of each step recorded. + time_log = self.timestamp_log + seconds = time_log[-1].timestamp - time_log[warmup].timestamp + steps = time_log[-1].batch_index - time_log[warmup].batch_index + return self.batch_size * steps / seconds + + def get_startup_time(self, start_time_sec): + return self.timestamp_log[0].timestamp - start_time_sec + + def on_train_end(self, logs=None): + self.train_finish_time = time.time() + + if self.summary_writer: + self.summary_writer.flush() + + def on_epoch_begin(self, epoch, logs=None): + self.epoch_start = time.time() + + def on_batch_begin(self, batch, logs=None): + if not self.start_time: + self.start_time = time.time() + if not first_batch_start_time.value(): + first_batch_start_time.set(int(self.start_time * 1000000)) + + # Record the timestamp of the first global step + if not self.timestamp_log: + self.timestamp_log.append( + BatchTimestamp(self.global_steps, self.start_time)) + + def on_batch_end(self, batch, logs=None): + """Records elapse time of the batch and calculates examples per second.""" + if not first_batch_end_time.value(): + first_batch_end_time.set(int(time.time() * 1000000)) + self.steps_in_epoch = batch + 1 + steps_since_last_log = self.global_steps - self.last_log_step + if steps_since_last_log >= self.log_steps: + now = time.time() + elapsed_time = now - self.start_time + steps_per_second = steps_since_last_log / elapsed_time + examples_per_second = steps_per_second * self.batch_size + + self.timestamp_log.append(BatchTimestamp(self.global_steps, now)) + logging.info( + 'TimeHistory: %.2f seconds, %.2f examples/second between steps %d ' + 'and %d', elapsed_time, examples_per_second, self.last_log_step, + self.global_steps) + + if self.summary_writer: + with self.summary_writer.as_default(): + tf.summary.scalar('steps_per_second', steps_per_second, + self.global_steps) + tf.summary.scalar('examples_per_second', examples_per_second, + self.global_steps) + + self.last_log_step = self.global_steps + self.start_time = None + + def on_epoch_end(self, epoch, logs=None): + epoch_run_time = time.time() - self.epoch_start + self.epoch_runtime_log.append(epoch_run_time) + + self.steps_before_epoch += self.steps_in_epoch + self.steps_in_epoch = 0 + + +class SimpleCheckpoint(tf.keras.callbacks.Callback): + """Keras callback to save tf.train.Checkpoints.""" + + def __init__(self, checkpoint_manager): + super(SimpleCheckpoint, self).__init__() + self.checkpoint_manager = checkpoint_manager + + def on_epoch_end(self, epoch, logs=None): + step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access + self.checkpoint_manager.save(checkpoint_number=step_counter) + + +def set_session_config(enable_xla=False): + """Sets the session config.""" + if enable_xla: + 
tf.config.optimizer.set_jit(True) + + +# TODO(hongkuny): remove set_config_v2 globally. +set_config_v2 = set_session_config + + +def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, + num_gpus, per_gpu_thread_count): + """Set GPU thread mode and count, and adjust dataset threads count.""" + cpu_count = multiprocessing.cpu_count() + logging.info('Logical CPU cores: %s', cpu_count) + + # Allocate private thread pool for each GPU to schedule and launch kernels + per_gpu_thread_count = per_gpu_thread_count or 2 + os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) + logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) + + # Limit data preprocessing threadpool to CPU cores minus number of total GPU + # private threads and memory copy threads. + total_gpu_thread_count = per_gpu_thread_count * num_gpus + num_runtime_threads = num_gpus + if not datasets_num_private_threads: + datasets_num_private_threads = min( + cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) + logging.info('Set datasets_num_private_threads to %s', + datasets_num_private_threads) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/misc/model_helpers.py b/nlp/text_classification/bert/tensorflow2.0/utils/misc/model_helpers.py new file mode 100644 index 000000000..4c310588b --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/misc/model_helpers.py @@ -0,0 +1,94 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Miscellaneous functions that can be called by models.""" + +import numbers + +from absl import logging +import tensorflow as tf + +from tensorflow.python.util import nest +# pylint:disable=logging-format-interpolation + + +def past_stop_threshold(stop_threshold, eval_metric): + """Return a boolean representing whether a model should be stopped. + + Args: + stop_threshold: float, the threshold above which a model should stop + training. + eval_metric: float, the current value of the relevant metric to check. + + Returns: + True if training should stop, False otherwise. 
+ + Raises: + ValueError: if either stop_threshold or eval_metric is not a number + """ + if stop_threshold is None: + return False + + if not isinstance(stop_threshold, numbers.Number): + raise ValueError("Threshold for checking stop conditions must be a number.") + if not isinstance(eval_metric, numbers.Number): + raise ValueError("Eval metric being checked against stop conditions " + "must be a number.") + + if eval_metric >= stop_threshold: + logging.info("Stop threshold of {} was passed with metric value {}.".format( + stop_threshold, eval_metric)) + return True + + return False + + +def generate_synthetic_data(input_shape, + input_value=0, + input_dtype=None, + label_shape=None, + label_value=0, + label_dtype=None): + """Create a repeating dataset with constant values. + + Args: + input_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of + the input data. + input_value: Value of each input element. + input_dtype: Input dtype. If None, will be inferred by the input value. + label_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of + the label data. + label_value: Value of each input element. + label_dtype: Input dtype. If None, will be inferred by the target value. + + Returns: + Dataset of tensors or tuples of tensors (if label_shape is set). + """ + # TODO(kathywu): Replace with SyntheticDataset once it is in contrib. + element = input_element = nest.map_structure( + lambda s: tf.constant(input_value, input_dtype, s), input_shape) + + if label_shape: + label_element = nest.map_structure( + lambda s: tf.constant(label_value, label_dtype, s), label_shape) + element = (input_element, label_element) + + return tf.data.Dataset.from_tensors(element).repeat() + + +def apply_clean(flags_obj): + if flags_obj.clean and tf.io.gfile.exists(flags_obj.model_dir): + logging.info("--clean flag set. Removing existing model dir:" + " {}".format(flags_obj.model_dir)) + tf.io.gfile.rmtree(flags_obj.model_dir) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/__init__.py b/nlp/text_classification/bert/tensorflow2.0/utils/testing/__init__.py new file mode 100644 index 000000000..e419af524 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/integration.py b/nlp/text_classification/bert/tensorflow2.0/utils/testing/integration.py new file mode 100644 index 000000000..ceee7f920 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/integration.py @@ -0,0 +1,70 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helper code to run complete models from within python.""" + +import os +import shutil +import sys +import tempfile + +from absl import flags +from absl.testing import flagsaver + +from utils.flags import core as flags_core + + +@flagsaver.flagsaver +def run_synthetic(main, + tmp_root, + extra_flags=None, + synth=True, + train_epochs=1, + epochs_between_evals=1): + """Performs a minimal run of a model. + + This function is intended to test for syntax errors throughout a model. A + very limited run is performed using synthetic data. + + Args: + main: The primary function used to exercise a code path. Generally this + function is ".main(argv)". + tmp_root: Root path for the temp directory created by the test class. + extra_flags: Additional flags passed by the caller of this function. + synth: Use synthetic data. + train_epochs: Value of the --train_epochs flag. + epochs_between_evals: Value of the --epochs_between_evals flag. + """ + + extra_flags = [] if extra_flags is None else extra_flags + + model_dir = tempfile.mkdtemp(dir=tmp_root) + + args = [sys.argv[0], "--model_dir", model_dir] + extra_flags + + if synth: + args.append("--use_synthetic_data") + + if train_epochs is not None: + args.extend(["--train_epochs", str(train_epochs)]) + + if epochs_between_evals is not None: + args.extend(["--epochs_between_evals", str(epochs_between_evals)]) + + try: + flags_core.parse_flags(argv=args) + main(flags.FLAGS) + finally: + if os.path.exists(model_dir): + shutil.rmtree(model_dir) diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/mock_task.py b/nlp/text_classification/bert/tensorflow2.0/utils/testing/mock_task.py new file mode 100644 index 000000000..fdf7da4d0 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/mock_task.py @@ -0,0 +1,101 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
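(Usage aside, not part of the patch: `run_synthetic()` just above is meant to be called from a model's own test. The sketch below assumes a hypothetical `my_model` module that defines its flags at import time via the `utils.flags` helpers — including `--model_dir`, `--train_epochs`, `--epochs_between_evals`, and `--use_synthetic_data`, which `run_synthetic` passes — and exposes `main(flags_obj)`; the extra `--num_gpus` value is likewise an assumption.)

```python
import tempfile

import tensorflow as tf

from utils.testing import integration

import my_model  # hypothetical: defines its flags at import time and exposes main()


class MyModelSmokeTest(tf.test.TestCase):

  def test_runs_on_synthetic_data(self):
    # One training epoch on synthetic data, written to a throw-away model_dir.
    integration.run_synthetic(
        main=my_model.main,
        tmp_root=tempfile.mkdtemp(),
        extra_flags=["--num_gpus", "0"],  # assumed flag; adjust to the model
    )


if __name__ == "__main__":
  tf.test.main()
```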
+ +"""Mock task for testing.""" + +import dataclasses +import numpy as np +import tensorflow as tf + +from core import base_task +from core import config_definitions as cfg +from core import exp_factory +from core import task_factory + + +class MockModel(tf.keras.Model): + + def __init__(self, network): + super().__init__() + self.network = network + + def call(self, inputs): + outputs = self.network(inputs) + self.add_loss(tf.reduce_mean(outputs)) + return outputs + + +@dataclasses.dataclass +class MockTaskConfig(cfg.TaskConfig): + pass + + +@task_factory.register_task_cls(MockTaskConfig) +class MockTask(base_task.Task): + """Mock task object for testing.""" + + def __init__(self, params=None, logging_dir=None, name=None): + super().__init__(params=params, logging_dir=logging_dir, name=name) + + def build_model(self, *arg, **kwargs): + inputs = tf.keras.layers.Input(shape=(2,), name="random", dtype=tf.float32) + outputs = tf.keras.layers.Dense( + 1, bias_initializer=tf.keras.initializers.Ones(), name="dense_0")( + inputs) + network = tf.keras.Model(inputs=inputs, outputs=outputs) + return MockModel(network) + + def build_metrics(self, training: bool = True): + del training + return [tf.keras.metrics.Accuracy(name="acc")] + + def validation_step(self, inputs, model: tf.keras.Model, metrics=None): + logs = super().validation_step(inputs, model, metrics) + logs["counter"] = tf.constant(1, dtype=tf.float32) + return logs + + def build_inputs(self, params): + + def generate_data(_): + x = tf.zeros(shape=(2,), dtype=tf.float32) + label = tf.zeros([1], dtype=tf.int32) + return x, label + + dataset = tf.data.Dataset.range(1) + dataset = dataset.repeat() + dataset = dataset.map( + generate_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) + return dataset.prefetch(buffer_size=1).batch(2, drop_remainder=True) + + def aggregate_logs(self, state, step_outputs): + if state is None: + state = {} + for key, value in step_outputs.items(): + if key not in state: + state[key] = [] + state[key].append( + np.concatenate([np.expand_dims(v.numpy(), axis=0) for v in value])) + return state + + def reduce_aggregated_logs(self, aggregated_logs, global_step=None): + for k, v in aggregated_logs.items(): + aggregated_logs[k] = np.sum(np.stack(v, axis=0)) + return aggregated_logs + + +@exp_factory.register_config_factory("mock") +def mock_experiment() -> cfg.ExperimentConfig: + config = cfg.ExperimentConfig( + task=MockTaskConfig(), trainer=cfg.TrainerConfig()) + return config diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/pylint.rcfile b/nlp/text_classification/bert/tensorflow2.0/utils/testing/pylint.rcfile new file mode 100644 index 000000000..7dadc68bd --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/pylint.rcfile @@ -0,0 +1,168 @@ +[MESSAGES CONTROL] +disable=R,W,bad-option-value,trailing-newlines,no-name-in-module + +[REPORTS] +# Tells whether to display a full report or only the messages +reports=no + +# Activate the evaluation score. 
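(Brief aside before the lint configuration: the mock task registered above is self-contained enough to be driven directly in a test, without going through the experiment factory. A sketch using only classes defined in `mock_task.py`; whether `base_task.Task.__init__` needs a richer config than a bare `MockTaskConfig()` depends on the `core` package, which is outside this patch.)

```python
from utils.testing.mock_task import MockTask, MockTaskConfig

task = MockTask(params=MockTaskConfig(), logging_dir=None)

model = task.build_model()                 # Dense(1) wrapped in MockModel
dataset = task.build_inputs(params=None)   # infinite zero-valued batches of 2
metrics = task.build_metrics(training=False)

features, labels = next(iter(dataset))
outputs = model(features)                  # also records a mean-of-outputs loss
print(outputs.shape, labels.shape, [m.name for m in metrics])
```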
+score=no + +[BASIC] + +# Regular expression matching correct argument names +argument-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct attribute names +attr-rgx=^_{0,2}[a-z][a-z0-9_]*$ + +# Regular expression matching correct class attribute names +class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Regular expression matching correct class names +class-rgx=^_?[A-Z][a-zA-Z0-9]*$ + +# Regular expression matching correct constant names +const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$ + +# Minimum line length for functions/classes that require docstrings, shorter +# ones are exempt. +docstring-min-length=10 + +# Regular expression matching correct function names +function-rgx=^(?:(?P_?[A-Z][a-zA-Z0-9]*)|(?P_?[a-z][a-z0-9_]*))$ + +# Good variable names which should always be accepted, separated by a comma +good-names=main,_ + +# Regular expression matching correct inline iteration names +inlinevar-rgx=^[a-z][a-z0-9_]*$ + +# Regular expression matching correct method names +method-rgx=^(?:(?P__[a-z0-9_]+__|next)|(?P_{0,2}[A-Z][a-zA-Z0-9]*)|(?P_{0,2}[a-z][a-z0-9_]*)|(setUp|tearDown))$ + +# Regular expression matching correct module names +module-rgx=^(_?[a-z][a-z0-9_]*)|__init__|PRESUBMIT|PRESUBMIT_unittest$ + +# Regular expression which should only match function or class names that do +# not require a docstring. +no-docstring-rgx=(__.*__|main|.*ArgParser) + +# Naming hint for variable names +variable-name-hint=[a-z_][a-z0-9_]{2,30}$ + +# Regular expression matching correct variable names +variable-rgx=^[a-z][a-z0-9_]*$ + +[TYPECHECK] + +# List of module names for which member attributes should not be checked +# (useful for modules/projects where namespaces are manipulated during runtime +# and thus existing member attributes cannot be deduced by static analysis. It +# supports qualified module names, as well as Unix pattern matching. +ignored-modules=absl, absl.*, official, *, tensorflow, tensorflow.*, LazyLoader, google, google.cloud.* + + +[CLASSES] + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods=__init__,__new__,setUp + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected=_asdict,_fields,_replace,_source,_make + +# This is deprecated, because it is not used anymore. +#ignore-iface-methods= + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg=cls,class_ + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg=mcs + + +[DESIGN] + +# Argument names that match this expression will be ignored. Default to name +# with leading underscore +ignored-argument-names=_.* + +# Maximum number of arguments for function / method +max-args=5 + +# Maximum number of attributes for a class (see R0902). +max-attributes=7 + +# Maximum number of branch for function / method body +max-branches=12 + +# Maximum number of locals for function / method body +max-locals=15 + +# Maximum number of parents for a class (see R0901). +max-parents=7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods=20 + +# Maximum number of return / yield for function / method body +max-returns=6 + +# Maximum number of statements in function / method body +max-statements=50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods=2 + + +[EXCEPTIONS] + +# Exceptions that will emit a warning when being caught. 
Defaults to +# "Exception" +overgeneral-exceptions=StandardError,Exception,BaseException + + +[FORMAT] + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren=4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string=' ' + +# Maximum number of characters on a single line. +max-line-length=80 + +# Maximum number of lines in a module +max-module-lines=99999 + +# List of optional constructs for which whitespace checking is disabled +no-space-check= + +# Allow the body of an if to be on the same line as the test if there is no +# else. +single-line-if-stmt=yes + +# Allow URLs and comment type annotations to exceed the max line length as neither can be easily +# split across lines. +ignore-long-lines=^\s*(?:(# )??$|# type:) + + +[VARIABLES] + +# List of additional names supposed to be defined in builtins. Remember that +# you should avoid to define new builtins when possible. +additional-builtins= + +# List of strings which can identify a callback function by name. A callback +# name must start or end with one of those strings. +callbacks=cb_,_cb + +# A regular expression matching the name of dummy variables (i.e. expectedly +# not used). +dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_) + +# Tells whether we should check for unused import in __init__ files. +init-import=no diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/builds_common.sh b/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/builds_common.sh new file mode 100644 index 000000000..3cf08bb51 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/builds_common.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Common Bash functions used by build scripts + +COLOR_NC='\033[0m' +COLOR_BOLD='\033[1m' +COLOR_LIGHT_GRAY='\033[0;37m' +COLOR_GREEN='\033[0;32m' +COLOR_RED='\033[0;31m' + +die() { + # Print a message and exit with code 1. + # + # Usage: die + # e.g., die "Something bad happened." + + echo $@ + exit 1 +} + +num_cpus() { + # Get the number of CPUs + N_CPUS=$(grep -c ^processor /proc/cpuinfo) + if [[ -z ${N_CPUS} ]]; then + die "ERROR: Unable to determine the number of CPUs" + fi + + echo ${N_CPUS} +} + +# List files changed (i.e., added, or revised) from +# the common ancestor of HEAD and the latest master branch. +# Usage: get_changed_files_from_master_branch +get_changed_files_from_master_branch() { + ANCESTOR=$(git merge-base HEAD master origin/master) + git diff ${ANCESTOR} --diff-filter=d --name-only "$@" +} + +# List python files changed that still exist, +# i.e., not removed. 
+# Usage: get_py_files_to_check [--incremental] +get_py_files_to_check() { + if [[ "$1" == "--incremental" ]]; then + get_changed_files_from_master_branch -- '*.py' + elif [[ -z "$1" ]]; then + find official/ -name '*.py' + else + die "Found unsupported args: $@ for get_py_files_to_check." + fi +} diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/ci_sanity.sh b/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/ci_sanity.sh new file mode 100644 index 000000000..0646c87a9 --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/ci_sanity.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Sanity check script that runs tests and lint under local environment. +# Make sure that tensorflow and pylint is installed. +# usage: models >: ./official/utils/testing/scripts/ci_sanity.sh do_pylint --incremental +set +x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "${SCRIPT_DIR}/builds_common.sh" +cd "$SCRIPT_DIR/../../../.." +MODEL_ROOT="$(pwd)" + +export PYTHONPATH="$PYTHONPATH:${MODEL_ROOT}" + +# Run pylint +do_pylint() { + # Usage: do_pylint [--incremental] + # + # Options: + # --incremental Performs check on only the python files changed in the + # last non-merge git commit. + + # Use this list to ALLOWLIST pylint errors + ERROR_ALLOWLIST="" + + echo "ERROR_ALLOWLIST=\"${ERROR_ALLOWLIST}\"" + + PYLINT_BIN="python3 -m pylint" + + PYTHON_SRC_FILES=$(get_py_files_to_check $1) + if [[ -z ${PYTHON_SRC_FILES} ]]; then + echo "do_pylint found no Python files to check. Returning." + return 0 + fi + + PYLINTRC_FILE="official/utils/testing/pylint.rcfile" + + if [[ ! -f "${PYLINTRC_FILE}" ]]; then + die "ERROR: Cannot find pylint rc file at ${PYLINTRC_FILE}" + fi + + NUM_SRC_FILES=$(echo ${PYTHON_SRC_FILES} | wc -w) + NUM_CPUS=$(num_cpus) + + echo "Running pylint on ${NUM_SRC_FILES} files with ${NUM_CPUS} "\ + "parallel jobs..." 
+ echo "" + + PYLINT_START_TIME=$(date +'%s') + OUTPUT_FILE="$(mktemp)_pylint_output.log" + ERRORS_FILE="$(mktemp)_pylint_errors.log" + NONWL_ERRORS_FILE="$(mktemp)_pylint_nonwl_errors.log" + + rm -rf ${OUTPUT_FILE} + rm -rf ${ERRORS_FILE} + rm -rf ${NONWL_ERRORS_FILE} + touch ${NONWL_ERRORS_FILE} + + ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \ + --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} > ${OUTPUT_FILE} 2>&1 + PYLINT_END_TIME=$(date +'%s') + + echo "" + echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s" + echo "" + + # Report only what we care about + # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html + # E: all errors + # W0311 bad-indentation + # W0312 mixed-indentation + # C0330 bad-continuation + # C0301 line-too-long + # C0326 bad-whitespace + # W0611 unused-import + # W0622 redefined-builtin + grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611|\[W0622)' ${OUTPUT_FILE} > ${ERRORS_FILE} + + N_ERRORS=0 + while read -r LINE; do + IS_ALLOWLISTED=0 + for WL_REGEX in ${ERROR_ALLOWLIST}; do + if echo ${LINE} | grep -q "${WL_REGEX}"; then + echo "Found a ALLOWLISTed error:" + echo " ${LINE}" + IS_ALLOWLISTED=1 + fi + done + + if [[ ${IS_ALLOWLISTED} == "0" ]]; then + echo "${LINE}" >> ${NONWL_ERRORS_FILE} + echo "" >> ${NONWL_ERRORS_FILE} + ((N_ERRORS++)) + fi + done <${ERRORS_FILE} + + echo "Raw lint output file: ${OUTPUT_FILE}" + + echo "" + if [[ ${N_ERRORS} != 0 ]]; then + echo "FAIL: Found ${N_ERRORS} non-whitelited pylint errors:" + cat "${NONWL_ERRORS_FILE}" + return 1 + else + echo "PASS: No non-ALLOWLISTed pylint errors were found." + return 0 + fi +} + +test_result=0 + +TESTS="$@" + +for t in "${TESTS}"; do + ${t} || test_result=$? +done + +exit "${test_result}" diff --git a/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/presubmit.sh b/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/presubmit.sh new file mode 100644 index 000000000..33eca3cbb --- /dev/null +++ b/nlp/text_classification/bert/tensorflow2.0/utils/testing/scripts/presubmit.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Presubmit script that runs tests and lint under local environment. +# Make sure that tensorflow and pylint is installed. +# usage: models >: ./official/utils/testing/scripts/presubmit.sh +# usage: models >: ./official/utils/testing/scripts/presubmit.sh lint py2_test py3_test +set +x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR/../../../.." +MODEL_ROOT="$(pwd)" + +export PYTHONPATH="$PYTHONPATH:${MODEL_ROOT}" + +py_test() { + local PY_BINARY="$1" + local exit_code=0 + + echo "===========Running Python test============" + # Skipping Ranking tests, TODO(b/189265753) remove it once the issue is fixed. 
+ for test_file in `find official/ -name '*test.py' -print | grep -v 'official/recommendation/ranking'` + do + echo "####=======Testing ${test_file}=======####" + ${PY_BINARY} "${test_file}" + _exit_code=$? + if [[ $_exit_code != 0 ]]; then + exit_code=$_exit_code + echo "FAIL: ${test_file}" + fi + done + + return "${exit_code}" +} + +py2_test() { + local PY_BINARY=$(which python2) + py_test "$PY_BINARY" + return $? +} + +py3_test() { + local PY_BINARY=$(which python3) + py_test "$PY_BINARY" + return $? +} + +test_result=0 + +if [ "$#" -eq 0 ]; then + TESTS="lint py2_test py3_test" +else + TESTS="$@" +fi + +for t in "${TESTS}"; do + ${t} || test_result=$? +done + +exit "${test_result}" -- Gitee
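(Closing aside, not part of the patch: a small sketch of the helpers in `utils/misc/model_helpers.py` added earlier in this patch, since their intended use is easier to see in isolation. The values below are illustrative only.)

```python
import tensorflow as tf

from utils.misc import model_helpers

# Constant-valued (features, label) dataset, mirroring what --use_synthetic_data
# provides: useful for benchmarking without a real input pipeline.
dataset = model_helpers.generate_synthetic_data(
    input_shape=tf.TensorShape([4]),
    input_dtype=tf.float32,
    label_shape=tf.TensorShape([1]),
    label_dtype=tf.int32,
)
features, label = next(iter(dataset))
print(features.shape, label.shape)  # (4,) and (1,)

# Early stopping against a target metric, e.g. after an evaluation pass.
if model_helpers.past_stop_threshold(stop_threshold=0.90, eval_metric=0.93):
    print("Target metric reached; stopping training.")
```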